diff --git a/.arcconfig b/.arcconfig
index 7540b46..048706a 100644
--- a/.arcconfig
+++ b/.arcconfig
@@ -1,4 +1,4 @@
 {
   "project_id" : "clang",
-  "conduit_uri" : "http://reviews.llvm.org/"
+  "conduit_uri" : "https://reviews.llvm.org/"
 }
diff --git a/.clang-tidy b/.clang-tidy
index 3186da4..d10f688 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1 +1,12 @@
-Checks: '-*,clang-diagnostic-*,llvm-*,misc-*'
+Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-unused-parameters,readability-identifier-naming'
+CheckOptions:
+  - key:             readability-identifier-naming.ClassCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.EnumCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.FunctionCase
+    value:           lowerCase
+  - key:             readability-identifier-naming.UnionCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.VariableCase
+    value:           CamelCase
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bb6b3de..e6dde85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,18 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8)
-
-# FIXME: It may be removed when we use 2.8.12.
-if(CMAKE_VERSION VERSION_LESS 2.8.12)
-  # Invalidate a couple of keywords.
-  set(cmake_2_8_12_INTERFACE)
-  set(cmake_2_8_12_PRIVATE)
-else()
-  # Use ${cmake_2_8_12_KEYWORD} intead of KEYWORD in target_link_libraries().
-  set(cmake_2_8_12_INTERFACE INTERFACE)
-  set(cmake_2_8_12_PRIVATE PRIVATE)
-  if(POLICY CMP0022)
-    cmake_policy(SET CMP0022 NEW) # automatic when 2.8.12 is required
-  endif()
-endif()
+cmake_minimum_required(VERSION 3.4.3)
 
 # If we are not building as a part of LLVM, build Clang as an
 # standalone project, using LLVM as an external library:
@@ -72,7 +58,7 @@
   find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR}
     NO_DEFAULT_PATH)
 
-  set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/share/llvm/cmake")
+  set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm")
   set(LLVMCONFIG_FILE "${LLVM_CMAKE_PATH}/LLVMConfig.cmake")
   if(EXISTS ${LLVMCONFIG_FILE})
     list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_PATH}")
@@ -182,6 +168,10 @@
   set(BACKEND_PACKAGE_STRING "${PACKAGE_STRING}")
 endif()
 
+# Make sure that our source directory is on the current cmake module path so that
+# we can include cmake files from this directory.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
+
 find_package(LibXml2 2.5.3 QUIET)
 if (LIBXML2_FOUND)
   set(CLANG_HAVE_LIBXML 1)
@@ -197,6 +187,31 @@
 set(DEFAULT_SYSROOT "" CACHE PATH
   "Default <path> to all compiler invocations for --sysroot=<path>." )
 
+set(ENABLE_LINKER_BUILD_ID OFF CACHE BOOL "pass --build-id to ld")
+
+set(ENABLE_X86_RELAX_RELOCATIONS OFF CACHE BOOL
+    "enable x86 relax relocations by default")
+
+set(CLANG_DEFAULT_CXX_STDLIB "" CACHE STRING
+  "Default C++ stdlib to use (\"libstdc++\" or \"libc++\", empty for platform default)")
+if (NOT(CLANG_DEFAULT_CXX_STDLIB STREQUAL "" OR
+        CLANG_DEFAULT_CXX_STDLIB STREQUAL "libstdc++" OR
+        CLANG_DEFAULT_CXX_STDLIB STREQUAL "libc++"))
+  message(WARNING "Resetting default C++ stdlib to use platform default")
+  set(CLANG_DEFAULT_CXX_STDLIB "" CACHE STRING
+    "Default C++ stdlib to use (\"libstdc++\" or \"libc++\", empty for platform default)" FORCE)
+endif()
+
+set(CLANG_DEFAULT_RTLIB "" CACHE STRING
+  "Default runtime library to use (\"libgcc\" or \"compiler-rt\", empty for platform default)")
+if (NOT(CLANG_DEFAULT_RTLIB STREQUAL "" OR
+        CLANG_DEFAULT_RTLIB STREQUAL "libgcc" OR
+        CLANG_DEFAULT_RTLIB STREQUAL "compiler-rt"))
+  message(WARNING "Resetting default rtlib to use platform default")
+  set(CLANG_DEFAULT_RTLIB "" CACHE STRING
+    "Default runtime library to use (\"libgcc\" or \"compiler-rt\", empty for platform default)" FORCE)
+endif()
+
 set(CLANG_DEFAULT_OPENMP_RUNTIME "libomp" CACHE STRING
   "Default OpenMP runtime used by -fopenmp.")
 
@@ -214,19 +229,6 @@
   add_definitions(-DCLANG_REPOSITORY_STRING="${CLANG_REPOSITORY_STRING}")
 endif()
 
-option(CLANG_APPEND_VC_REV
-  "Append the version control system revision id to clang version spew" OFF)
-if(CLANG_APPEND_VC_REV)
-  if(NOT SVN_REVISION)
-    # This macro will set SVN_REVISION in the parent scope
-    add_version_info_from_vcs(VERSION_VAR)
-  endif()
-
-  if(SVN_REVISION)
-    add_definitions(-DSVN_REVISION="${SVN_REVISION}")
-  endif()
-endif()
-
 set(CLANG_VENDOR_UTI "org.llvm.clang" CACHE STRING
   "Vendor-specific uti.")
 
@@ -321,149 +323,7 @@
   ${CLANG_BINARY_DIR}/include/clang/Config/config.h)
 
 include(CMakeParseArguments)
-
-function(clang_tablegen)
-  # Syntax:
-  # clang_tablegen output-file [tablegen-arg ...] SOURCE source-file
-  # [[TARGET cmake-target-name] [DEPENDS extra-dependency ...]]
-  #
-  # Generates a custom command for invoking tblgen as
-  #
-  # tblgen source-file -o=output-file tablegen-arg ...
-  #
-  # and, if cmake-target-name is provided, creates a custom target for
-  # executing the custom command depending on output-file. It is
-  # possible to list more files to depend after DEPENDS.
-
-  cmake_parse_arguments(CTG "" "SOURCE;TARGET" "" ${ARGN})
-
-  if( NOT CTG_SOURCE )
-    message(FATAL_ERROR "SOURCE source-file required by clang_tablegen")
-  endif()
-
-  set( LLVM_TARGET_DEFINITIONS ${CTG_SOURCE} )
-  tablegen(CLANG ${CTG_UNPARSED_ARGUMENTS})
-
-  if(CTG_TARGET)
-    add_public_tablegen_target(${CTG_TARGET})
-    set_target_properties( ${CTG_TARGET} PROPERTIES FOLDER "Clang tablegenning")
-    set_property(GLOBAL APPEND PROPERTY CLANG_TABLEGEN_TARGETS ${CTG_TARGET})
-  endif()
-endfunction(clang_tablegen)
-
-macro(set_clang_windows_version_resource_properties name)
-  if(DEFINED windows_resource_file)
-    set_windows_version_resource_properties(${name} ${windows_resource_file}
-      VERSION_MAJOR ${CLANG_VERSION_MAJOR}
-      VERSION_MINOR ${CLANG_VERSION_MINOR}
-      VERSION_PATCHLEVEL ${CLANG_VERSION_PATCHLEVEL}
-      VERSION_STRING "${CLANG_VERSION} (${BACKEND_PACKAGE_STRING})"
-      PRODUCT_NAME "clang")
-  endif()
-endmacro()
-
-macro(add_clang_subdirectory name)
-  add_llvm_subdirectory(CLANG TOOL ${name})
-endmacro()
-
-macro(add_clang_library name)
-  cmake_parse_arguments(ARG
-    "SHARED"
-    ""
-    "ADDITIONAL_HEADERS"
-    ${ARGN})
-  set(srcs)
-  if(MSVC_IDE OR XCODE)
-    # Add public headers
-    file(RELATIVE_PATH lib_path
-      ${CLANG_SOURCE_DIR}/lib/
-      ${CMAKE_CURRENT_SOURCE_DIR}
-    )
-    if(NOT lib_path MATCHES "^[.][.]")
-      file( GLOB_RECURSE headers
-        ${CLANG_SOURCE_DIR}/include/clang/${lib_path}/*.h
-        ${CLANG_SOURCE_DIR}/include/clang/${lib_path}/*.def
-      )
-      set_source_files_properties(${headers} PROPERTIES HEADER_FILE_ONLY ON)
-
-      file( GLOB_RECURSE tds
-        ${CLANG_SOURCE_DIR}/include/clang/${lib_path}/*.td
-      )
-      source_group("TableGen descriptions" FILES ${tds})
-      set_source_files_properties(${tds}} PROPERTIES HEADER_FILE_ONLY ON)
-
-      if(headers OR tds)
-        set(srcs ${headers} ${tds})
-      endif()
-    endif()
-  endif(MSVC_IDE OR XCODE)
-  if(srcs OR ARG_ADDITIONAL_HEADERS)
-    set(srcs
-      ADDITIONAL_HEADERS
-      ${srcs}
-      ${ARG_ADDITIONAL_HEADERS} # It may contain unparsed unknown args.
-      )
-  endif()
-  if(ARG_SHARED)
-    set(ARG_ENABLE_SHARED SHARED)
-  endif()
-  llvm_add_library(${name} ${ARG_ENABLE_SHARED} ${ARG_UNPARSED_ARGUMENTS} ${srcs})
-
-  if(TARGET ${name})
-    target_link_libraries(${name} ${cmake_2_8_12_INTERFACE} ${LLVM_COMMON_LIBS})
-
-    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "libclang")
-      install(TARGETS ${name}
-        COMPONENT ${name}
-        EXPORT ClangTargets
-        LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-        ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-        RUNTIME DESTINATION bin)
-
-      if (${ARG_SHARED} AND NOT CMAKE_CONFIGURATION_TYPES)
-        add_custom_target(install-${name}
-                          DEPENDS ${name}
-                          COMMAND "${CMAKE_COMMAND}"
-                                  -DCMAKE_INSTALL_COMPONENT=${name}
-                                  -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
-      endif()
-    endif()
-    set_property(GLOBAL APPEND PROPERTY CLANG_EXPORTS ${name})
-  else()
-    # Add empty "phony" target
-    add_custom_target(${name})
-  endif()
-
-  set_target_properties(${name} PROPERTIES FOLDER "Clang libraries")
-  set_clang_windows_version_resource_properties(${name})
-endmacro(add_clang_library)
-
-macro(add_clang_executable name)
-  add_llvm_executable( ${name} ${ARGN} )
-  set_target_properties(${name} PROPERTIES FOLDER "Clang executables")
-  set_clang_windows_version_resource_properties(${name})
-endmacro(add_clang_executable)
-
-macro(add_clang_tool name)
-  add_clang_executable(${name} ${ARGN})
-  install(TARGETS ${name}
-    RUNTIME DESTINATION bin
-    COMPONENT ${name})
-
-  if(NOT CMAKE_CONFIGURATION_TYPES)
-    add_custom_target(install-${name}
-      DEPENDS ${name}
-      COMMAND "${CMAKE_COMMAND}"
-              -DCMAKE_INSTALL_COMPONENT=${name}
-              -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
-  endif()
-endmacro()
-
-macro(add_clang_symlink name dest)
-  add_llvm_tool_symlink(${name} ${dest} ALWAYS_GENERATE)
-  # Always generate install targets
-  llvm_install_symlink(${name} ${dest} ALWAYS_GENERATE)
-endmacro()
+include(AddClang)
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
@@ -493,6 +353,9 @@
 
 add_definitions( -D_GNU_SOURCE )
 
+option(CLANG_BUILD_TOOLS
+  "Build the Clang tools. If OFF, just generate build targets." ON)
+
 option(CLANG_ENABLE_ARCMT "Build ARCMT." ON)
 if (CLANG_ENABLE_ARCMT)
   set(ENABLE_CLANG_ARCMT "1")
@@ -586,48 +449,40 @@
   add_subdirectory(docs)
 endif()
 
-# this line is needed as a cleanup to ensure that any CMakeCaches with the old
-# default value get updated to the new default.
-if(CLANG_ORDER_FILE STREQUAL "")
-  unset(CLANG_ORDER_FILE CACHE)
-endif()
 
-set(CLANG_ORDER_FILE ${CMAKE_CURRENT_BINARY_DIR}/clang.order CACHE FILEPATH
-  "Order file to use when compiling clang in order to improve startup time.")
+if(APPLE)
+  # This block is needed as a cleanup to ensure that any CMakeCaches with the
+  # old (empty) default value get updated to the new default.
+  if(CLANG_ORDER_FILE STREQUAL "")
+    unset(CLANG_ORDER_FILE CACHE)
+    unset(CLANG_ORDER_FILE)
+  endif()
 
-if(CLANG_ORDER_FILE AND NOT EXISTS ${CLANG_ORDER_FILE})
-  string(FIND "${CLANG_ORDER_FILE}" "${CMAKE_CURRENT_BINARY_DIR}" PATH_START)
-  if(PATH_START EQUAL 0)
-    file(WRITE ${CLANG_ORDER_FILE} "\n")
-  else()
-    message(FATAL_ERROR "Specified order file '${CLANG_ORDER_FILE}' does not exist.")
+
+  set(CLANG_ORDER_FILE ${CMAKE_CURRENT_BINARY_DIR}/clang.order CACHE FILEPATH
+    "Order file to use when compiling clang in order to improve startup time (Darwin Only - requires ld64).")
+
+  if(CLANG_ORDER_FILE AND NOT EXISTS ${CLANG_ORDER_FILE})
+    string(FIND "${CLANG_ORDER_FILE}" "${CMAKE_CURRENT_BINARY_DIR}" PATH_START)
+    if(PATH_START EQUAL 0)
+      file(WRITE ${CLANG_ORDER_FILE} "\n")
+    else()
+      message(FATAL_ERROR "Specified order file '${CLANG_ORDER_FILE}' does not exist.")
+    endif()
   endif()
 endif()
 
-if (CLANG_BUILT_STANDALONE OR CMAKE_VERSION VERSION_EQUAL 3 OR
-    CMAKE_VERSION VERSION_GREATER 3)
-  add_subdirectory(cmake/modules)
-endif ()
+add_subdirectory(cmake/modules)
+
+if(CLANG_STAGE)
+  message(STATUS "Setting current clang stage to: ${CLANG_STAGE}")
+endif()
 
 if (CLANG_ENABLE_BOOTSTRAP)
   include(ExternalProject)
 
-  if(CMAKE_VERSION VERSION_GREATER 3.1.0)
-    set(cmake_3_1_EXCLUDE_FROM_ALL EXCLUDE_FROM_ALL 1)
-  endif()
-
-  if(CMAKE_VERSION VERSION_GREATER 3.3.20150708)
-    set(cmake_3_4_USES_TERMINAL_OPTIONS
-      USES_TERMINAL_CONFIGURE 1
-      USES_TERMINAL_BUILD 1
-      USES_TERMINAL_INSTALL 1
-      )
-    set(cmake_3_4_USES_TERMINAL USES_TERMINAL 1)
-  endif()
-
   if(NOT CLANG_STAGE)
     set(CLANG_STAGE stage1)
-    message(STATUS "Setting current clang stage to: ${CLANG_STAGE}")
   endif()
 
   string(REGEX MATCH "stage([0-9]*)" MATCHED_STAGE "${CLANG_STAGE}")
@@ -651,13 +506,25 @@
   set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NEXT_CLANG_STAGE}-stamps/)
   set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NEXT_CLANG_STAGE}-bins/)
 
-  # If on Darwin we need to make bootstrap depend on LTO and pass
-  # DARWIN_LTO_LIBRARY so that -flto will work using the just-built compiler
-  if(APPLE)
-    set(LTO_DEP LTO llvm-ar llvm-ranlib)
-    set(LTO_LIBRARY -DDARWIN_LTO_LIBRARY=${LLVM_SHLIB_OUTPUT_INTDIR}/libLTO.dylib)
-    set(LTO_AR -DCMAKE_AR=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-ar)
-    set(LTO_RANLIB -DCMAKE_RANLIB=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-ranlib)
+  # If the next stage is LTO we need to depend on LTO and possibly LLVMgold
+  if(BOOTSTRAP_LLVM_ENABLE_LTO OR LLVM_ENABLE_LTO)
+    set(LTO_DEP LTO)
+    if(APPLE)
+      # On Darwin we need to set DARWIN_LTO_LIBRARY so that -flto will work
+      # using the just-built compiler, and we need to override DYLD_LIBRARY_PATH
+      # so that the host object file tools will use the just-built libLTO.
+      # However, if System Integrity Protection is enabled, the DYLD variables
+      # will be scrubbed from the environment of any base system commands. This
+      # includes /bin/sh, which ninja uses when executing build commands. To
+      # work around the variable being filtered away, we pass it in as a CMake
+      # variable and have LLVM's CMake append it to the archiver calls.
+      set(LTO_LIBRARY -DDARWIN_LTO_LIBRARY=${LLVM_SHLIB_OUTPUT_INTDIR}/libLTO.dylib
+        -DDYLD_LIBRARY_PATH=${LLVM_LIBRARY_OUTPUT_INTDIR})
+    elseif(NOT WIN32)
+      list(APPEND LTO_DEP LLVMgold llvm-ar llvm-ranlib)
+      set(LTO_AR -DCMAKE_AR=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-ar)
+      set(LTO_RANLIB -DCMAKE_RANLIB=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-ranlib)
+    endif()
   endif()
 
   add_custom_target(${NEXT_CLANG_STAGE}-clear
@@ -677,12 +544,13 @@
     set(verbose -DCMAKE_VERBOSE_MAKEFILE=On)
   endif()
 
-  set(BOOTSTRAP_DEFAULT_PASSTHROUGH
+  set(_BOOTSTRAP_DEFAULT_PASSTHROUGH
     PACKAGE_VERSION
     LLVM_VERSION_MAJOR
     LLVM_VERSION_MINOR
     LLVM_VERSION_PATCH
     LLVM_VERSION_SUFFIX
+    LLVM_BINUTILS_INCDIR
     CLANG_REPOSITORY_STRING
     CMAKE_MAKE_PROGRAM)
 
@@ -703,11 +571,22 @@
   if(LLVM_BUILD_INSTRUMENTED)
     set(PGO_DEP generate-profdata)
     set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata)
-    set(COMPILER_OPTIONS
-      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-      -DCMAKE_ASM_COMPILER=${CMAKE_ASM_COMPILER})
+    # Use the current tools for LTO instead of the instrumented ones
+    list(APPEND _BOOTSTRAP_DEFAULT_PASSTHROUGH
+      CMAKE_CXX_COMPILER
+      CMAKE_C_COMPILER
+      CMAKE_ASM_COMPILER
+      CMAKE_AR
+      CMAKE_RANLIB
+      DARWIN_LTO_LIBRARY
+      DYLD_LIBRARY_PATH)
+
+    set(COMPILER_OPTIONS)
+    set(LTO_LIBRARY)
     set(RUNTIME_DEP) # Don't set runtime dependencies
+    set(LTO_DEP)     # Don't need to depend on LTO
+    set(LTO_AR)
+    set(LTO_RANLIB)
   endif()
 
   # Find all variables that start with BOOTSTRAP_ and populate a variable with
@@ -720,10 +599,14 @@
       list(APPEND PASSTHROUGH_VARIABLES
         -D${varName}=${value})
     endif()
+    if(${variableName} AND variableName MATCHES "LLVM_EXTERNAL_.*_SOURCE_DIR")
+      list(APPEND PASSTHROUGH_VARIABLES
+        -D${variableName}=${${variableName}})
+    endif()
   endforeach()
 
   # Populate the passthrough variables
-  foreach(variableName ${CLANG_BOOTSTRAP_PASSTHROUGH} ${BOOTSTRAP_DEFAULT_PASSTHROUGH})
+  foreach(variableName ${CLANG_BOOTSTRAP_PASSTHROUGH} ${_BOOTSTRAP_DEFAULT_PASSTHROUGH})
     if(${variableName})
       string(REPLACE ";" "\;" value ${${variableName}})
       list(APPEND PASSTHROUGH_VARIABLES
@@ -737,7 +620,7 @@
     SOURCE_DIR ${CMAKE_SOURCE_DIR}
     STAMP_DIR ${STAMP_DIR}
     BINARY_DIR ${BINARY_DIR}
-    ${cmake_3_1_EXCLUDE_FROM_ALL}
+    EXCLUDE_FROM_ALL 1
     CMAKE_ARGS
                 # We shouldn't need to set this here, but INSTALL_DIR doesn't
                 # seem to work, so instead I'm passing this through
@@ -749,7 +632,9 @@
                 ${LTO_LIBRARY} ${LTO_AR} ${LTO_RANLIB} ${verbose} ${PGO_OPT}
     INSTALL_COMMAND ""
     STEP_TARGETS configure build
-    ${cmake_3_4_USES_TERMINAL_OPTIONS}
+    USES_TERMINAL_CONFIGURE 1
+    USES_TERMINAL_BUILD 1
+    USES_TERMINAL_INSTALL 1
     )
 
   # exclude really-install from main target
@@ -758,7 +643,7 @@
     COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --target install
     COMMENT "Performing install step for '${NEXT_CLANG_STAGE}'"
     DEPENDEES build
-    ${cmake_3_4_USES_TERMINAL}
+    USES_TERMINAL 1
   )
   ExternalProject_Add_StepTargets(${NEXT_CLANG_STAGE} really-install)
   add_custom_target(${NEXT_CLANG_STAGE}-install DEPENDS ${NEXT_CLANG_STAGE}-really-install)
@@ -774,7 +659,7 @@
       COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --target ${target}
       COMMENT "Performing ${target} for '${NEXT_CLANG_STAGE}'"
       DEPENDEES configure
-      ${cmake_3_4_USES_TERMINAL}
+      USES_TERMINAL 1
     )
 
     if(target MATCHES "^stage[0-9]*")
@@ -784,3 +669,7 @@
     ExternalProject_Add_StepTargets(${NEXT_CLANG_STAGE} ${target})
   endforeach()
 endif()
+
+if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
+  add_subdirectory(utils/ClangVisualizers)
+endif()
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index 971fe9b..0aa156b 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -23,7 +23,7 @@
 
 N: Eric Christopher
 E: echristo@gmail.com
-D: Debug Information, autotools/configure/make build, inline assembly
+D: Debug Information, inline assembly
 
 N: Doug Gregor
 E: dgregor@apple.com
@@ -52,3 +52,7 @@
 N: Richard Smith
 E: richard@metafoo.co.uk
 D: All parts of Clang not covered by someone else
+
+N: Anastasia Stulova
+E: anastasia.stulova@arm.com
+D: OpenCL support
diff --git a/LICENSE.TXT b/LICENSE.TXT
index fc4afae..b452ca2 100644
--- a/LICENSE.TXT
+++ b/LICENSE.TXT
@@ -4,7 +4,7 @@
 University of Illinois/NCSA
 Open Source License
 
-Copyright (c) 2007-2015 University of Illinois at Urbana-Champaign.
+Copyright (c) 2007-2016 University of Illinois at Urbana-Champaign.
 All rights reserved.
 
 Developed by:
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 9497b0a..0000000
--- a/Makefile
+++ /dev/null
@@ -1,124 +0,0 @@
-##===- Makefile --------------------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-# If CLANG_LEVEL is not set, then we are the top-level Makefile. Otherwise, we
-# are being included from a subdirectory makefile.
-
-ifndef CLANG_LEVEL
-
-IS_TOP_LEVEL := 1
-CLANG_LEVEL := .
-DIRS := utils/TableGen include lib tools runtime docs unittests
-
-PARALLEL_DIRS :=
-
-ifeq ($(BUILD_EXAMPLES),1)
-  PARALLEL_DIRS += examples
-endif
-endif
-
-ifeq ($(BUILD_EXAMPLES),1)
-  ENABLE_CLANG_EXAMPLES := 1
-else
-  ENABLE_CLANG_EXAMPLES := 0
-endif
-
-ifeq ($(MAKECMDGOALS),libs-only)
-  DIRS := $(filter-out tools docs, $(DIRS))
-  OPTIONAL_DIRS :=
-endif
-ifeq ($(BUILD_CLANG_ONLY),YES)
-  DIRS := $(filter-out docs unittests, $(DIRS))
-  OPTIONAL_DIRS :=
-endif
-
-###
-# Common Makefile code, shared by all Clang Makefiles.
-
-# Set LLVM source root level.
-LEVEL := $(CLANG_LEVEL)/../..
-
-# Include LLVM common makefile.
-include $(LEVEL)/Makefile.common
-
-ifneq ($(ENABLE_DOCS),1)
-  DIRS := $(filter-out docs, $(DIRS))
-endif
-
-# Set common Clang build flags.
-CPP.Flags += -I$(PROJ_SRC_DIR)/$(CLANG_LEVEL)/include -I$(PROJ_OBJ_DIR)/$(CLANG_LEVEL)/include
-ifdef CLANG_VENDOR
-CPP.Flags += -DCLANG_VENDOR='"$(CLANG_VENDOR) "'
-endif
-ifdef CLANG_REPOSITORY_STRING
-CPP.Flags += -DCLANG_REPOSITORY_STRING='"$(CLANG_REPOSITORY_STRING)"'
-endif
-
-# Disable -fstrict-aliasing. Darwin disables it by default (and LLVM doesn't
-# work with it enabled with GCC), Clang/llvm-gcc don't support it yet, and newer
-# GCC's have false positive warnings with it on Linux (which prove a pain to
-# fix). For example:
-#   http://gcc.gnu.org/PR41874
-#   http://gcc.gnu.org/PR41838
-#
-# We don't need to do this if the host compiler is clang.
-ifneq ($(CXX_COMPILER), "clang")
-CXX.Flags += -fno-strict-aliasing
-endif
-
-
-# Set up Clang's tblgen.
-ifndef CLANG_TBLGEN
-  ifeq ($(LLVM_CROSS_COMPILING),1)
-    CLANG_TBLGEN := $(BuildLLVMToolDir)/clang-tblgen$(BUILD_EXEEXT)
-  else
-    CLANG_TBLGEN := $(LLVMToolDir)/clang-tblgen$(EXEEXT)
-  endif
-endif
-ClangTableGen = $(CLANG_TBLGEN) $(TableGen.Flags)
-
-###
-# Clang Top Level specific stuff.
-
-ifeq ($(IS_TOP_LEVEL),1)
-
-ifneq ($(PROJ_SRC_ROOT),$(PROJ_OBJ_ROOT))
-$(RecursiveTargets)::
-	$(Verb) for dir in test unittests; do \
-	  if [ -f $(PROJ_SRC_DIR)/$${dir}/Makefile ] && [ ! -f $${dir}/Makefile ]; then \
-	    $(MKDIR) $${dir}; \
-	    $(CP) $(PROJ_SRC_DIR)/$${dir}/Makefile $${dir}/Makefile; \
-	  fi \
-	done
-endif
-
-test::
-	@ $(MAKE) -C test
-
-report::
-	@ $(MAKE) -C test report
-
-clean::
-	@ $(MAKE) -C test clean
-
-libs-only: all
-
-tags::
-	$(Verb) etags `find . -type f -name '*.h' -or -name '*.cpp' | \
-	  grep -v /lib/Headers | grep -v /test/`
-
-cscope.files:
-	find tools lib include -name '*.cpp' \
-	                    -or -name '*.def' \
-	                    -or -name '*.td' \
-	                    -or -name '*.h' > cscope.files
-
-.PHONY: test report clean cscope.files
-
-endif
diff --git a/bindings/python/clang/cindex.py b/bindings/python/clang/cindex.py
index e4b3876..cab4944 100644
--- a/bindings/python/clang/cindex.py
+++ b/bindings/python/clang/cindex.py
@@ -305,6 +305,14 @@
     Error   = 3
     Fatal   = 4
 
+    DisplaySourceLocation = 0x01
+    DisplayColumn         = 0x02
+    DisplaySourceRanges   = 0x04
+    DisplayOption         = 0x08
+    DisplayCategoryId     = 0x10
+    DisplayCategoryName   = 0x20
+    _FormatOptionsMask    = 0x3f
+
     def __init__(self, ptr):
         self.ptr = ptr
 
@@ -360,6 +368,23 @@
         return FixItIterator(self)
 
     @property
+    def children(self):
+        """The child diagnostics of this diagnostic (sized and indexable)."""
+        class ChildDiagnosticsIterator:
+            def __init__(self, diag):
+                self.diag_set = conf.lib.clang_getChildDiagnostics(diag)
+
+            def __len__(self):
+                return int(conf.lib.clang_getNumDiagnosticsInSet(self.diag_set))
+
+            def __getitem__(self, key):
+                diag = conf.lib.clang_getDiagnosticInSet(self.diag_set, key)
+                if not diag:
+                    raise IndexError
+                return Diagnostic(diag)
+        return ChildDiagnosticsIterator(self)
+
+    @property
     def category_number(self):
         """The category number for this diagnostic or 0 if unavailable."""
         return conf.lib.clang_getDiagnosticCategory(self)
@@ -382,10 +407,27 @@
 
         return conf.lib.clang_getCString(disable)
 
+    def format(self, options=None):
+        """
+        Format this diagnostic for display. The options argument takes
+        Diagnostic.Display* flags, which can be combined using bitwise OR. If
+        the options argument is not provided, the default display options will
+        be used.
+        """
+        if options is None:
+            options = conf.lib.clang_defaultDiagnosticDisplayOptions()
+        if options & ~Diagnostic._FormatOptionsMask:
+            raise ValueError('Invalid format options')
+        formatted = conf.lib.clang_formatDiagnostic(self, options)
+        return conf.lib.clang_getCString(formatted)
+
     def __repr__(self):
         return "<Diagnostic severity %r, location %r, spelling %r>" % (
             self.severity, self.location, self.spelling)
 
+    def __str__(self):
+        return self.format()
+
     def from_param(self):
       return self.ptr
 
@@ -1120,6 +1162,9 @@
 # A type alias template declaration
 CursorKind.TYPE_ALIAS_TEMPLATE_DECL = CursorKind(601)
 
+# A code completion overload candidate.
+CursorKind.OVERLOAD_CANDIDATE = CursorKind(700)
+
 ### Template Argument Kinds ###
 class TemplateArgumentKind(BaseEnumeration):
     """
@@ -1174,6 +1219,32 @@
         """
         return conf.lib.clang_CXXMethod_isConst(self)
 
+    def is_converting_constructor(self):
+        """Returns True if the cursor refers to a C++ converting constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isConvertingConstructor(self)
+
+    def is_copy_constructor(self):
+        """Returns True if the cursor refers to a C++ copy constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isCopyConstructor(self)
+
+    def is_default_constructor(self):
+        """Returns True if the cursor refers to a C++ default constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isDefaultConstructor(self)
+
+    def is_move_constructor(self):
+        """Returns True if the cursor refers to a C++ move constructor.
+        """
+        return conf.lib.clang_CXXConstructor_isMoveConstructor(self)
+
+    def is_default_method(self):
+        """Returns True if the cursor refers to a C++ member function or member
+        function template that is declared '= default'.
+        """
+        return conf.lib.clang_CXXMethod_isDefaulted(self)
+
     def is_mutable_field(self):
         """Returns True if the cursor refers to a C++ field that is declared
         'mutable'.
@@ -1685,6 +1756,7 @@
 TypeKind.OBJCID = TypeKind(27)
 TypeKind.OBJCCLASS = TypeKind(28)
 TypeKind.OBJCSEL = TypeKind(29)
+TypeKind.FLOAT128 = TypeKind(30)
 TypeKind.COMPLEX = TypeKind(100)
 TypeKind.POINTER = TypeKind(101)
 TypeKind.BLOCKPOINTER = TypeKind(102)
@@ -1704,6 +1776,7 @@
 TypeKind.DEPENDENTSIZEDARRAY = TypeKind(116)
 TypeKind.MEMBERPOINTER = TypeKind(117)
 TypeKind.AUTO = TypeKind(118)
+TypeKind.ELABORATED = TypeKind(119)
 
 class RefQualifierKind(BaseEnumeration):
     """Describes a specific ref-qualifier of a type."""
@@ -1902,6 +1975,12 @@
         """
         return conf.lib.clang_Type_getClassType(self)
 
+    def get_named_type(self):
+        """
+        Retrieve the type named by the qualified-id.
+        """
+        return conf.lib.clang_Type_getNamedType(self)
+
     def get_align(self):
         """
         Retrieve the alignment of the record.
@@ -2383,7 +2462,7 @@
         functions above. __init__ is only called internally.
         """
         assert isinstance(index, Index)
-
+        self.index = index
         ClangObject.__init__(self, ptr)
 
     def __del__(self):
@@ -2703,6 +2782,11 @@
         return conf.lib.clang_CompileCommand_getDirectory(self.cmd)
 
     @property
+    def filename(self):
+        """Get the working filename for this CompileCommand"""
+        return conf.lib.clang_CompileCommand_getFilename(self.cmd)
+
+    @property
     def arguments(self):
         """
         Get an iterable object providing each argument in the
@@ -2884,6 +2968,11 @@
    _CXString,
    _CXString.from_result),
 
+  ("clang_CompileCommand_getFilename",
+   [c_object_p],
+   _CXString,
+   _CXString.from_result),
+
   ("clang_CompileCommand_getNumArgs",
    [c_object_p],
    c_uint),
@@ -2908,6 +2997,22 @@
    [Index, c_char_p],
    c_object_p),
 
+  ("clang_CXXConstructor_isConvertingConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXConstructor_isCopyConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXConstructor_isDefaultConstructor",
+   [Cursor],
+   bool),
+
+  ("clang_CXXConstructor_isMoveConstructor",
+   [Cursor],
+   bool),
+
   ("clang_CXXField_isMutable",
    [Cursor],
    bool),
@@ -2916,6 +3021,10 @@
    [Cursor],
    bool),
 
+  ("clang_CXXMethod_isDefaulted",
+   [Cursor],
+   bool),
+
   ("clang_CXXMethod_isPureVirtual",
    [Cursor],
    bool),
@@ -2928,6 +3037,10 @@
    [Cursor],
    bool),
 
+  ("clang_defaultDiagnosticDisplayOptions",
+   [],
+   c_uint),
+
   ("clang_defaultSaveOptions",
    [TranslationUnit],
    c_uint),
@@ -2969,6 +3082,10 @@
    [Type, Type],
    bool),
 
+  ("clang_formatDiagnostic",
+   [Diagnostic, c_uint],
+   _CXString),
+
   ("clang_getArgType",
    [Type, c_uint],
    Type,
@@ -2997,6 +3114,10 @@
    Type,
    Type.from_result),
 
+  ("clang_getChildDiagnostics",
+   [Diagnostic],
+   c_object_p),
+
   ("clang_getCompletionAvailability",
    [c_void_p],
    c_int),
@@ -3117,6 +3238,10 @@
    _CXString,
    _CXString.from_result),
 
+  ("clang_getDiagnosticInSet",
+   [c_object_p, c_uint],
+   c_object_p),
+
   ("clang_getDiagnosticLocation",
    [Diagnostic],
    SourceLocation),
@@ -3218,6 +3343,10 @@
    [c_object_p],
    c_uint),
 
+  ("clang_getNumDiagnosticsInSet",
+   [c_object_p],
+   c_uint),
+
   ("clang_getNumElements",
    [Type],
    c_longlong),
@@ -3477,6 +3606,11 @@
    [Type],
    c_uint),
 
+  ("clang_Type_getNamedType",
+   [Type],
+   Type,
+   Type.from_result),
+
   ("clang_Type_visitFields",
    [Type, callbacks['fields_visit'], py_object],
    c_uint),
diff --git a/bindings/python/tests/cindex/test_cdb.py b/bindings/python/tests/cindex/test_cdb.py
index e1f824f..35fe3e1 100644
--- a/bindings/python/tests/cindex/test_cdb.py
+++ b/bindings/python/tests/cindex/test_cdb.py
@@ -38,27 +38,34 @@
     cmds = cdb.getAllCompileCommands()
     assert len(cmds) == 3
     expected = [
+        { 'wd': '/home/john.doe/MyProject',
+          'file': '/home/john.doe/MyProject/project.cpp',
+          'line': ['clang++', '-o', 'project.o', '-c',
+                   '/home/john.doe/MyProject/project.cpp']},
         { 'wd': '/home/john.doe/MyProjectA',
+          'file': '/home/john.doe/MyProject/project2.cpp',
           'line': ['clang++', '-o', 'project2.o', '-c',
                    '/home/john.doe/MyProject/project2.cpp']},
         { 'wd': '/home/john.doe/MyProjectB',
+          'file': '/home/john.doe/MyProject/project2.cpp',
           'line': ['clang++', '-DFEATURE=1', '-o', 'project2-feature.o', '-c',
                    '/home/john.doe/MyProject/project2.cpp']},
-        { 'wd': '/home/john.doe/MyProject',
-          'line': ['clang++', '-o', 'project.o', '-c',
-                   '/home/john.doe/MyProject/project.cpp']}
+
         ]
     for i in range(len(cmds)):
         assert cmds[i].directory == expected[i]['wd']
+        assert cmds[i].filename == expected[i]['file']
         for arg, exp in zip(cmds[i].arguments, expected[i]['line']):
             assert arg == exp
 
 def test_1_compilecommand():
     """Check file with single compile command"""
     cdb = CompilationDatabase.fromDirectory(kInputsDir)
-    cmds = cdb.getCompileCommands('/home/john.doe/MyProject/project.cpp')
+    file = '/home/john.doe/MyProject/project.cpp'
+    cmds = cdb.getCompileCommands(file)
     assert len(cmds) == 1
-    assert cmds[0].directory == '/home/john.doe/MyProject'
+    assert cmds[0].directory == os.path.dirname(file)
+    assert cmds[0].filename == file
     expected = [ 'clang++', '-o', 'project.o', '-c',
                  '/home/john.doe/MyProject/project.cpp']
     for arg, exp in zip(cmds[0].arguments, expected):
diff --git a/bindings/python/tests/cindex/test_cursor.py b/bindings/python/tests/cindex/test_cursor.py
index c5ea505..6c8230d 100644
--- a/bindings/python/tests/cindex/test_cursor.py
+++ b/bindings/python/tests/cindex/test_cursor.py
@@ -112,6 +112,88 @@
     assert foo.is_const_method()
     assert not bar.is_const_method()
 
+def test_is_converting_constructor():
+    """Ensure Cursor.is_converting_constructor works."""
+    source = 'class X { explicit X(int); X(double); X(); };'
+    tu = get_tu(source, lang='cpp')
+
+    xs = get_cursors(tu, 'X')
+
+    assert len(xs) == 4
+    assert xs[0].kind == CursorKind.CLASS_DECL
+    cs = xs[1:]
+    assert cs[0].kind == CursorKind.CONSTRUCTOR
+    assert cs[1].kind == CursorKind.CONSTRUCTOR
+    assert cs[2].kind == CursorKind.CONSTRUCTOR
+
+    assert not cs[0].is_converting_constructor()
+    assert cs[1].is_converting_constructor()
+    assert not cs[2].is_converting_constructor()
+
+
+def test_is_copy_constructor():
+    """Ensure Cursor.is_copy_constructor works."""
+    source = 'class X { X(); X(const X&); X(X&&); };'
+    tu = get_tu(source, lang='cpp')
+
+    xs = get_cursors(tu, 'X')
+    assert xs[0].kind == CursorKind.CLASS_DECL
+    cs = xs[1:]
+    assert cs[0].kind == CursorKind.CONSTRUCTOR
+    assert cs[1].kind == CursorKind.CONSTRUCTOR
+    assert cs[2].kind == CursorKind.CONSTRUCTOR
+
+    assert not cs[0].is_copy_constructor()
+    assert cs[1].is_copy_constructor()
+    assert not cs[2].is_copy_constructor()
+
+def test_is_default_constructor():
+    """Ensure Cursor.is_default_constructor works."""
+    source = 'class X { X(); X(int); };'
+    tu = get_tu(source, lang='cpp')
+
+    xs = get_cursors(tu, 'X')
+    assert xs[0].kind == CursorKind.CLASS_DECL
+    cs = xs[1:]
+    assert cs[0].kind == CursorKind.CONSTRUCTOR
+    assert cs[1].kind == CursorKind.CONSTRUCTOR
+
+    assert cs[0].is_default_constructor()
+    assert not cs[1].is_default_constructor()
+
+def test_is_move_constructor():
+    """Ensure Cursor.is_move_constructor works."""
+    source = 'class X { X(); X(const X&); X(X&&); };'
+    tu = get_tu(source, lang='cpp')
+
+    xs = get_cursors(tu, 'X')
+    assert xs[0].kind == CursorKind.CLASS_DECL
+    cs = xs[1:]
+    assert cs[0].kind == CursorKind.CONSTRUCTOR
+    assert cs[1].kind == CursorKind.CONSTRUCTOR
+    assert cs[2].kind == CursorKind.CONSTRUCTOR
+
+    assert not cs[0].is_move_constructor()
+    assert not cs[1].is_move_constructor()
+    assert cs[2].is_move_constructor()
+
+def test_is_default_method():
+    """Ensure Cursor.is_default_method works."""
+    source = 'class X { X() = default; }; class Y { Y(); };'
+    tu = get_tu(source, lang='cpp')
+
+    xs = get_cursors(tu, 'X')
+    ys = get_cursors(tu, 'Y')
+
+    assert len(xs) == 2
+    assert len(ys) == 2
+
+    xc = xs[1]
+    yc = ys[1]
+
+    assert xc.is_default_method()
+    assert not yc.is_default_method()
+
 def test_is_mutable_field():
     """Ensure Cursor.is_mutable_field works."""
     source = 'class X { int x_; mutable int y_; };'
diff --git a/bindings/python/tests/cindex/test_diagnostics.py b/bindings/python/tests/cindex/test_diagnostics.py
index 48ab617..ba6e545 100644
--- a/bindings/python/tests/cindex/test_diagnostics.py
+++ b/bindings/python/tests/cindex/test_diagnostics.py
@@ -80,3 +80,15 @@
 
     assert d.option == '-Wunused-parameter'
     assert d.disable_option == '-Wno-unused-parameter'
+
+def test_diagnostic_children():
+    tu = get_tu('void f(int x) {} void g() { f(); }')
+    assert len(tu.diagnostics) == 1
+    d = tu.diagnostics[0]
+
+    children = d.children
+    assert len(children) == 1
+    assert children[0].severity == Diagnostic.Note
+    assert children[0].spelling.endswith('declared here')
+    assert children[0].location.line == 1
+    assert children[0].location.column == 1
diff --git a/cmake/caches/3-stage-base.cmake b/cmake/caches/3-stage-base.cmake
new file mode 100644
index 0000000..46c747e
--- /dev/null
+++ b/cmake/caches/3-stage-base.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "")
+set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "")
+set(LLVM_BUILD_EXTERNAL_COMPILER_RT ON CACHE BOOL "")
+set(BOOTSTRAP_LLVM_ENABLE_LTO ON CACHE BOOL "")
+
+set(CLANG_BOOTSTRAP_TARGETS
+  clang
+  check-all
+  check-llvm
+  check-clang
+  test-suite CACHE STRING "")
+
+set(CLANG_BOOTSTRAP_CMAKE_ARGS
+  -C ${CMAKE_CURRENT_LIST_DIR}/3-stage-base.cmake
+  CACHE STRING "")
diff --git a/cmake/caches/3-stage.cmake b/cmake/caches/3-stage.cmake
new file mode 100644
index 0000000..49bce39
--- /dev/null
+++ b/cmake/caches/3-stage.cmake
@@ -0,0 +1,16 @@
+set(CLANG_BOOTSTRAP_TARGETS
+  clang
+  check-all
+  check-llvm
+  check-clang
+  test-suite
+  stage3
+  stage3-clang
+  stage3-check-all
+  stage3-check-llvm
+  stage3-check-clang
+  stage3-test-suite CACHE STRING "")
+
+set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
+
+include(${CMAKE_CURRENT_LIST_DIR}/3-stage-base.cmake)
diff --git a/cmake/caches/Apple-stage1.cmake b/cmake/caches/Apple-stage1.cmake
index 814cfdf..3215981 100644
--- a/cmake/caches/Apple-stage1.cmake
+++ b/cmake/caches/Apple-stage1.cmake
@@ -27,11 +27,6 @@
 set(BOOTSTRAP_LLVM_ENABLE_LTO ON CACHE BOOL "")
 set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
 
-# LIBCXX Settings
-set(LIBCXX_INSTALL_LIBRARY OFF CACHE BOOL "")
-set(LIBCXX_INSTALL_HEADERS ON CACHE BOOL "")
-set(LIBCXX_OVERRIDE_DARWIN_INSTALL ON CACHE BOOL "")
-
 set(CLANG_BOOTSTRAP_TARGETS
   generate-order-file
   check-all
diff --git a/cmake/caches/Apple-stage2.cmake b/cmake/caches/Apple-stage2.cmake
index 6ec65dc..1d33435 100644
--- a/cmake/caches/Apple-stage2.cmake
+++ b/cmake/caches/Apple-stage2.cmake
@@ -3,6 +3,7 @@
 
 set(LLVM_TARGETS_TO_BUILD X86 ARM AArch64 CACHE STRING "") 
 set(PACKAGE_VENDOR Apple CACHE STRING "")
+set(CLANG_VENDOR_UTI com.apple.clang CACHE STRING "")
 set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "")
 set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "")
 set(LLVM_TOOL_CLANG_TOOLS_EXTRA_BUILD OFF CACHE BOOL "")
@@ -15,7 +16,9 @@
 set(LLVM_EXTERNALIZE_DEBUGINFO ON CACHE BOOL "")
 set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 set(BUG_REPORT_URL "http://developer.apple.com/bugreporter/" CACHE STRING "")
-set(LLVM_ENABLE_TIMESTAMPS OFF CACHE BOOL "Don't time-stamp shipping builds - this makes builds reproducible")
+
+set(LLVM_BUILD_EXTERNAL_COMPILER_RT ON CACHE BOOL "Build Compiler-RT with just-built clang")
+set(COMPILER_RT_ENABLE_IOS ON CACHE BOOL "Build iOS Compiler-RT libraries")
 
 # Make unit tests (if present) part of the ALL target
 set(LLVM_BUILD_TESTS ON CACHE BOOL "")
@@ -28,7 +31,7 @@
 set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
 
 set(LIBCXX_INSTALL_LIBRARY OFF CACHE BOOL "")
-set(LIBCXX_INSTALL_HEADERS OFF CACHE BOOL "")
+set(LIBCXX_INSTALL_HEADERS ON CACHE BOOL "")
 set(LIBCXX_INCLUDE_TESTS OFF CACHE BOOL "")
 set(LLVM_LTO_VERSION_OFFSET 3000 CACHE STRING "")
 
@@ -48,6 +51,8 @@
   clang
   LTO
   clang-format
+  clang-headers
+  libcxx-headers
   ${LLVM_TOOLCHAIN_TOOLS}
   CACHE STRING "")
 
diff --git a/cmake/caches/PGO-stage2-instrumented.cmake b/cmake/caches/PGO-stage2-instrumented.cmake
index fe5e83d..37b319c 100644
--- a/cmake/caches/PGO-stage2-instrumented.cmake
+++ b/cmake/caches/PGO-stage2-instrumented.cmake
@@ -1,9 +1,21 @@
-set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "")
 set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "")
-set(LLVM_BUILD_EXTERNAL_COMPILER_RT ON CACHE BOOL "")
+set(CLANG_BOOTSTRAP_TARGETS
+  distribution
+  install-distribution
+  check-all
+  check-llvm
+  check-clang
+  test-suite CACHE STRING "")
 
-set(CLANG_BOOTSTRAP_TARGETS check-all check-llvm check-clang test-suite CACHE STRING "")
+if(PGO_BUILD_CONFIGURATION)
+  include(${PGO_BUILD_CONFIGURATION})
+  set(CLANG_BOOTSTRAP_CMAKE_ARGS
+    -C ${PGO_BUILD_CONFIGURATION}
+    CACHE STRING "")
+else()
+  include(${CMAKE_CURRENT_LIST_DIR}/PGO-stage2.cmake)
 
-set(CLANG_BOOTSTRAP_CMAKE_ARGS
-  -C ${CMAKE_CURRENT_LIST_DIR}/PGO-stage2.cmake
-  CACHE STRING "")
+  set(CLANG_BOOTSTRAP_CMAKE_ARGS
+    -C ${CMAKE_CURRENT_LIST_DIR}/PGO-stage2.cmake
+    CACHE STRING "")
+endif()
diff --git a/cmake/caches/PGO.cmake b/cmake/caches/PGO.cmake
index dc11173..bca9ba0 100644
--- a/cmake/caches/PGO.cmake
+++ b/cmake/caches/PGO.cmake
@@ -7,11 +7,23 @@
 set(CLANG_BOOTSTRAP_TARGETS
   generate-profdata
   stage2
+  stage2-distribution
+  stage2-install-distribution
   stage2-check-all
   stage2-check-llvm
   stage2-check-clang
   stage2-test-suite CACHE STRING "")
 
+if(PGO_INSTRUMENT_LTO)
+  set(BOOTSTRAP_LLVM_ENABLE_LTO ON CACHE BOOL "")
+  set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_LTO ON CACHE BOOL "")
+endif()
+
+if(PGO_BUILD_CONFIGURATION)
+  set(EXTRA_ARGS -DPGO_BUILD_CONFIGURATION=${PGO_BUILD_CONFIGURATION})
+endif()
+
 set(CLANG_BOOTSTRAP_CMAKE_ARGS
+  ${EXTRA_ARGS}
   -C ${CMAKE_CURRENT_LIST_DIR}/PGO-stage2-instrumented.cmake
   CACHE STRING "")
diff --git a/cmake/caches/README.txt b/cmake/caches/README.txt
index 55e5e15..db5c85a 100644
--- a/cmake/caches/README.txt
+++ b/cmake/caches/README.txt
@@ -4,15 +4,71 @@
 This directory contains CMake cache scripts that pre-populate the CMakeCache in
 a build directory with commonly used settings.
 
-The first two cache files in the directory are used by Apple to build the clang
-distribution packaged with Xcode. You can use the caches with the following
-CMake invocation:
+You can use the cache files with the following CMake invocation:
 
 cmake -G <build system>
-  -C <path to llvm>/tools/clang/cmake/caches/Apple-stage1.cmake
-  -DCMAKE_BUILD_TYPE=Release
-  [-DCMAKE_INSTALL_PREFIX=<install path>]
+  -C <path to cache file>
+  [additional CMake options (e.g. -DCMAKE_INSTALL_PREFIX=<install path>)]
   <path to llvm>
 
-Building the `bootstrap` target from this generation will build clang, and
-`bootstrap-install` will install it.
+Options specified on the command line will override options in the cache files.
+
+The following cache files exist.
+
+Apple-stage1
+------------
+
+The Apple stage1 cache configures a two stage build similar to how Apple builds
+the clang shipped with Xcode. The build files generated from this invocation have
+a target named "stage2" which performs an LTO build of clang.
+
+The Apple-stage2 cache can be used directly to match the build settings Apple
+uses in shipping builds without doing a full bootstrap build.
+
+PGO
+---
+
+The PGO CMake cache can be used to generate a multi-stage instrumented compiler.
+You can configure your build directory with the following invocation of CMake:
+
+cmake -G <generator> -C <path_to_clang>/cmake/caches/PGO.cmake <source dir>
+
+After configuration the following additional targets will be generated:
+
+stage2-instrumented:
+Builds a stage1 x86 compiler, runtime, and required tools (llvm-config,
+llvm-profdata) then uses that compiler to build an instrumented stage2 compiler.
+
+stage2-instrumented-generate-profdata:
+Depends on "stage2-instrumented" and will use the instrumented compiler to
+generate profdata based on the training files in <clang>/utils/perf-training
+
+stage2:
+Depends on "stage2-instrumented-generate-profdata" and will use the stage1
+compiler with the stage2 profdata to build a PGO-optimized compiler.
+
+stage2-check-llvm:
+Depends on stage2 and runs check-llvm using the stage2 compiler.
+
+stage2-check-clang:
+Depends on stage2 and runs check-clang using the stage2 compiler.
+
+stage2-check-all:
+Depends on stage2 and runs check-all using the stage2 compiler.
+
+stage2-test-suite:
+Depends on stage2 and runs the test-suite using the stage2 compiler (requires
+in-tree test-suite).
+
+3-stage
+-------
+
+This cache file can be used to generate a 3-stage clang build. You can configure
+using the following CMake command:
+
+cmake -C <path to clang>/cmake/caches/3-stage.cmake -G Ninja <path to llvm>
+
+You can then run "ninja stage3-clang" to build stage1, stage2 and stage3 clangs.
+
+This is useful for finding non-determinism in the compiler by verifying that
+stage2 and stage3 are identical.
diff --git a/cmake/modules/AddClang.cmake b/cmake/modules/AddClang.cmake
new file mode 100644
index 0000000..6e063a7
--- /dev/null
+++ b/cmake/modules/AddClang.cmake
@@ -0,0 +1,149 @@
+function(clang_tablegen)
+  # Syntax:
+  # clang_tablegen output-file [tablegen-arg ...] SOURCE source-file
+  # [[TARGET cmake-target-name] [DEPENDS extra-dependency ...]]
+  #
+  # Generates a custom command for invoking tblgen as
+  #
+  # tblgen source-file -o=output-file tablegen-arg ...
+  #
+  # and, if cmake-target-name is provided, creates a custom target for
+  # executing the custom command depending on output-file. It is
+  # possible to list more files to depend after DEPENDS.
+
+  cmake_parse_arguments(CTG "" "SOURCE;TARGET" "" ${ARGN})
+
+  if( NOT CTG_SOURCE )
+    message(FATAL_ERROR "SOURCE source-file required by clang_tablegen")
+  endif()
+
+  set( LLVM_TARGET_DEFINITIONS ${CTG_SOURCE} )
+  tablegen(CLANG ${CTG_UNPARSED_ARGUMENTS})
+
+  if(CTG_TARGET)
+    add_public_tablegen_target(${CTG_TARGET})
+    set_target_properties( ${CTG_TARGET} PROPERTIES FOLDER "Clang tablegenning")
+    set_property(GLOBAL APPEND PROPERTY CLANG_TABLEGEN_TARGETS ${CTG_TARGET})
+  endif()
+endfunction(clang_tablegen)
+
+macro(set_clang_windows_version_resource_properties name)
+  if(DEFINED windows_resource_file)
+    set_windows_version_resource_properties(${name} ${windows_resource_file}
+      VERSION_MAJOR ${CLANG_VERSION_MAJOR}
+      VERSION_MINOR ${CLANG_VERSION_MINOR}
+      VERSION_PATCHLEVEL ${CLANG_VERSION_PATCHLEVEL}
+      VERSION_STRING "${CLANG_VERSION} (${BACKEND_PACKAGE_STRING})"
+      PRODUCT_NAME "clang")
+  endif()
+endmacro()
+
+macro(add_clang_subdirectory name)
+  add_llvm_subdirectory(CLANG TOOL ${name})
+endmacro()
+
+macro(add_clang_library name) # Options: SHARED, ADDITIONAL_HEADERS <files...>
+  cmake_parse_arguments(ARG
+    "SHARED"
+    ""
+    "ADDITIONAL_HEADERS"
+    ${ARGN})
+  set(srcs)
+  if(MSVC_IDE OR XCODE)
+    # For IDE generators, collect the matching public headers and .td files.
+    file(RELATIVE_PATH lib_path
+      ${CLANG_SOURCE_DIR}/lib/
+      ${CMAKE_CURRENT_SOURCE_DIR}
+    )
+    if(NOT lib_path MATCHES "^[.][.]") # skip dirs that are not under lib/
+      file( GLOB_RECURSE headers
+        ${CLANG_SOURCE_DIR}/include/clang/${lib_path}/*.h
+        ${CLANG_SOURCE_DIR}/include/clang/${lib_path}/*.def
+      )
+      set_source_files_properties(${headers} PROPERTIES HEADER_FILE_ONLY ON)
+
+      file( GLOB_RECURSE tds
+        ${CLANG_SOURCE_DIR}/include/clang/${lib_path}/*.td
+      )
+      source_group("TableGen descriptions" FILES ${tds})
+      set_source_files_properties(${tds} PROPERTIES HEADER_FILE_ONLY ON)
+
+      if(headers OR tds)
+        set(srcs ${headers} ${tds})
+      endif()
+    endif()
+  endif()
+  if(srcs OR ARG_ADDITIONAL_HEADERS)
+    set(srcs
+      ADDITIONAL_HEADERS
+      ${srcs}
+      ${ARG_ADDITIONAL_HEADERS} # It may contain unparsed unknown args.
+      )
+  endif()
+  if(ARG_SHARED)
+    set(ARG_ENABLE_SHARED SHARED)
+  endif()
+  llvm_add_library(${name} ${ARG_ENABLE_SHARED} ${ARG_UNPARSED_ARGUMENTS} ${srcs})
+
+  if(TARGET ${name})
+    target_link_libraries(${name} INTERFACE ${LLVM_COMMON_LIBS})
+
+    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "libclang")
+      install(TARGETS ${name}
+        COMPONENT ${name}
+        EXPORT ClangTargets
+        LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+        ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+        RUNTIME DESTINATION bin)
+
+      if (${ARG_SHARED} AND NOT CMAKE_CONFIGURATION_TYPES)
+        add_custom_target(install-${name}
+                          DEPENDS ${name}
+                          COMMAND "${CMAKE_COMMAND}"
+                                  -DCMAKE_INSTALL_COMPONENT=${name}
+                                  -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+      endif()
+    endif()
+    set_property(GLOBAL APPEND PROPERTY CLANG_EXPORTS ${name})
+  else()
+    # llvm_add_library() created no target: add an empty "phony" one instead.
+    add_custom_target(${name})
+  endif()
+
+  set_target_properties(${name} PROPERTIES FOLDER "Clang libraries")
+  set_clang_windows_version_resource_properties(${name})
+endmacro()
+
+macro(add_clang_executable name)
+  add_llvm_executable( ${name} ${ARGN} )
+  set_target_properties(${name} PROPERTIES FOLDER "Clang executables")
+  set_clang_windows_version_resource_properties(${name})
+endmacro(add_clang_executable)
+
+macro(add_clang_tool name)
+  if (NOT CLANG_BUILD_TOOLS)
+    set(EXCLUDE_FROM_ALL ON)
+  endif()
+
+  add_clang_executable(${name} ${ARGN})
+
+  if (CLANG_BUILD_TOOLS)
+    install(TARGETS ${name}
+      RUNTIME DESTINATION bin
+      COMPONENT ${name})
+
+    if(NOT CMAKE_CONFIGURATION_TYPES)
+      add_custom_target(install-${name}
+        DEPENDS ${name}
+        COMMAND "${CMAKE_COMMAND}"
+        -DCMAKE_INSTALL_COMPONENT=${name}
+        -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+    endif()
+  endif()
+endmacro()
+
+macro(add_clang_symlink name dest)
+  add_llvm_tool_symlink(${name} ${dest} ALWAYS_GENERATE)
+  # Always generate install targets
+  llvm_install_symlink(${name} ${dest} ALWAYS_GENERATE)
+endmacro()
diff --git a/docs/AddressSanitizer.rst b/docs/AddressSanitizer.rst
index 93f6314..f64f60d 100644
--- a/docs/AddressSanitizer.rst
+++ b/docs/AddressSanitizer.rst
@@ -14,7 +14,8 @@
 
 * Out-of-bounds accesses to heap, stack and globals
 * Use-after-free
-* Use-after-return (to some extent)
+* Use-after-return (runtime flag `ASAN_OPTIONS=detect_stack_use_after_return=1`)
+* Use-after-scope (clang flag `-fsanitize-address-use-after-scope`)
 * Double-free, invalid free
 * Memory leaks (experimental)
 
@@ -232,6 +233,23 @@
     type:*BadInitClassSubstring*=init
     src:bad/init/files/*=init
 
+Suppressing memory leaks
+------------------------
+
+Memory leak reports produced by :doc:`LeakSanitizer` (if it is run as a part
+of AddressSanitizer) can be suppressed by a separate file passed as
+
+.. code-block:: bash
+
+    LSAN_OPTIONS=suppressions=MyLSan.supp
+
+which contains lines of the form `leak:<pattern>`. A memory leak will be
+suppressed if the pattern matches any function name, source file name, or
+library name in the symbolized stack trace of the leak report. See
+`full documentation
+<https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer#suppressions>`_
+for more details.
+
 Limitations
 ===========
 
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index f42439a..13b79fdf 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -95,6 +95,10 @@
     include(AddSphinxTarget)
     if (${SPHINX_OUTPUT_HTML})
       add_sphinx_target(html clang)
+      add_custom_command(TARGET docs-clang-html POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+        "${CMAKE_CURRENT_SOURCE_DIR}/LibASTMatchersReference.html"
+        "${CMAKE_CURRENT_BINARY_DIR}/html/LibASTMatchersReference.html")
     endif()
     if (${SPHINX_OUTPUT_MAN})
       add_sphinx_target(man clang)
diff --git a/docs/ClangFormat.rst b/docs/ClangFormat.rst
index b4968ef..3f52b76 100644
--- a/docs/ClangFormat.rst
+++ b/docs/ClangFormat.rst
@@ -190,7 +190,7 @@
 
 .. code-block:: console
 
-  svn diff --diff-cmd=diff -x-U0 | clang-format-diff.py -i
+  svn diff --diff-cmd=diff -x -U0 | clang-format-diff.py -i
 
-The :option:`-U0` will create a diff without context lines (the script would format
+The option `-U0` will create a diff without context lines (the script would format
 those as well).
diff --git a/docs/ClangFormatStyleOptions.rst b/docs/ClangFormatStyleOptions.rst
index bfabd59..3f76da6 100644
--- a/docs/ClangFormatStyleOptions.rst
+++ b/docs/ClangFormatStyleOptions.rst
@@ -154,7 +154,7 @@
   If ``true``, horizontally aligns arguments after an open bracket.
 
   This applies to round brackets (parentheses), angle brackets and square
-  brackets. This will result in formattings like
+  brackets.
 
   Possible values:
 
@@ -165,6 +165,7 @@
 
       someLongFunction(argument1,
                        argument2);
+
   * ``BAS_DontAlign`` (in configuration: ``DontAlign``)
     Don't align, instead use ``ContinuationIndentWidth``, e.g.:
 
@@ -172,6 +173,7 @@
 
       someLongFunction(argument1,
           argument2);
+
   * ``BAS_AlwaysBreak`` (in configuration: ``AlwaysBreak``)
     Always break after an open bracket, if the parameters don't fit
     on a single line, e.g.:
@@ -182,6 +184,7 @@
           argument1, argument2);
 
 
+
 **AlignConsecutiveAssignments** (``bool``)
   If ``true``, aligns consecutive assignments.
 
@@ -214,6 +217,14 @@
   If ``true``, horizontally align operands of binary and ternary
   expressions.
 
+  Specifically, this aligns operands of a single expression that needs to be
+  split over multiple lines, e.g.:
+
+  .. code-block:: c++
+
+    int aaa = bbbbbbbbbbbbbbb +
+              ccccccccccccccc;
+
 **AlignTrailingComments** (``bool``)
   If ``true``, aligns trailing comments.
 
@@ -230,28 +241,31 @@
   If ``true``, short case labels will be contracted to a single line.
 
 **AllowShortFunctionsOnASingleLine** (``ShortFunctionStyle``)
-  Dependent on the value, ``int f() { return 0; }`` can be put
-  on a single line.
+  Dependent on the value, ``int f() { return 0; }`` can be put on a
+  single line.
 
   Possible values:
 
   * ``SFS_None`` (in configuration: ``None``)
     Never merge functions into a single line.
+
   * ``SFS_Empty`` (in configuration: ``Empty``)
     Only merge empty functions.
+
   * ``SFS_Inline`` (in configuration: ``Inline``)
     Only merge functions defined inside a class. Implies "empty".
+
   * ``SFS_All`` (in configuration: ``All``)
     Merge all functions fitting on a single line.
 
 
+
 **AllowShortIfStatementsOnASingleLine** (``bool``)
-  If ``true``, ``if (a) return;`` can be put on a single
-  line.
+  If ``true``, ``if (a) return;`` can be put on a single line.
 
 **AllowShortLoopsOnASingleLine** (``bool``)
-  If ``true``, ``while (true) continue;`` can be put on a
-  single line.
+  If ``true``, ``while (true) continue;`` can be put on a single
+  line.
 
 **AlwaysBreakAfterDefinitionReturnType** (``DefinitionReturnTypeBreakingStyle``)
   The function definition return type breaking style to use.  This
@@ -262,12 +276,15 @@
   * ``DRTBS_None`` (in configuration: ``None``)
     Break after return type automatically.
     ``PenaltyReturnTypeOnItsOwnLine`` is taken into account.
+
   * ``DRTBS_All`` (in configuration: ``All``)
     Always break after the return type.
+
   * ``DRTBS_TopLevel`` (in configuration: ``TopLevel``)
     Always break after the return types of top-level functions.
 
 
+
 **AlwaysBreakAfterReturnType** (``ReturnTypeBreakingStyle``)
   The function declaration return type breaking style to use.
 
@@ -276,16 +293,21 @@
   * ``RTBS_None`` (in configuration: ``None``)
     Break after return type automatically.
     ``PenaltyReturnTypeOnItsOwnLine`` is taken into account.
+
   * ``RTBS_All`` (in configuration: ``All``)
     Always break after the return type.
+
   * ``RTBS_TopLevel`` (in configuration: ``TopLevel``)
     Always break after the return types of top-level functions.
+
   * ``RTBS_AllDefinitions`` (in configuration: ``AllDefinitions``)
     Always break after the return type of function definitions.
+
   * ``RTBS_TopLevelDefinitions`` (in configuration: ``TopLevelDefinitions``)
     Always break after the return type of top-level definitions.
 
 
+
 **AlwaysBreakBeforeMultilineStrings** (``bool``)
   If ``true``, always break before multiline string literals.
 
@@ -295,8 +317,8 @@
   ``ContinuationIndentWidth`` spaces from the start of the line.
 
 **AlwaysBreakTemplateDeclarations** (``bool``)
-  If ``true``, always break after the ``template<...>`` of a
-  template declaration.
+  If ``true``, always break after the ``template<...>`` of a template
+  declaration.
 
 **BinPackArguments** (``bool``)
   If ``false``, a function call's arguments will either be all on the
@@ -309,17 +331,17 @@
 **BraceWrapping** (``BraceWrappingFlags``)
   Control of individual brace wrapping cases.
 
-  If ``BreakBeforeBraces`` is set to ``custom``, use this to specify how each
-  individual brace case should be handled. Otherwise, this is ignored.
+  If ``BreakBeforeBraces`` is set to ``BS_Custom``, use this to specify how
+  each individual brace case should be handled. Otherwise, this is ignored.
 
   Nested configuration flags:
 
   * ``bool AfterClass`` Wrap class definitions.
-  * ``bool AfterControlStatement`` Wrap control statements (if/for/while/switch/..).
+  * ``bool AfterControlStatement`` Wrap control statements (``if``/``for``/``while``/``switch``/..).
   * ``bool AfterEnum`` Wrap enum definitions.
   * ``bool AfterFunction`` Wrap function definitions.
   * ``bool AfterNamespace`` Wrap namespace definitions.
-  * ``bool AfterObjCDeclaration`` Wrap ObjC definitions (@autoreleasepool, interfaces, ..).
+  * ``bool AfterObjCDeclaration`` Wrap ObjC definitions (``@autoreleasepool``, interfaces, ..).
   * ``bool AfterStruct`` Wrap struct definitions.
   * ``bool AfterUnion`` Wrap union definitions.
   * ``bool BeforeCatch`` Wrap before ``catch``.
@@ -337,12 +359,15 @@
 
   * ``BOS_None`` (in configuration: ``None``)
     Break after operators.
+
   * ``BOS_NonAssignment`` (in configuration: ``NonAssignment``)
     Break before operators that aren't assignments.
+
   * ``BOS_All`` (in configuration: ``All``)
     Break before operators.
 
 
+
 **BreakBeforeBraces** (``BraceBreakingStyle``)
   The brace breaking style to use.
 
@@ -350,24 +375,33 @@
 
   * ``BS_Attach`` (in configuration: ``Attach``)
     Always attach braces to surrounding context.
+
   * ``BS_Linux`` (in configuration: ``Linux``)
     Like ``Attach``, but break before braces on function, namespace and
     class definitions.
+
   * ``BS_Mozilla`` (in configuration: ``Mozilla``)
     Like ``Attach``, but break before braces on enum, function, and record
     definitions.
+
   * ``BS_Stroustrup`` (in configuration: ``Stroustrup``)
-    Like ``Attach``, but break before function definitions, 'catch', and 'else'.
+    Like ``Attach``, but break before function definitions, ``catch``, and
+    ``else``.
+
   * ``BS_Allman`` (in configuration: ``Allman``)
     Always break before braces.
+
   * ``BS_GNU`` (in configuration: ``GNU``)
     Always break before braces and add an extra level of indentation to
     braces of control statements, not to those of class, function
     or other definitions.
+
   * ``BS_WebKit`` (in configuration: ``WebKit``)
     Like ``Attach``, but break before functions.
+
   * ``BS_Custom`` (in configuration: ``Custom``)
-    Configure each individual brace in ``BraceWrapping``.
+    Configure each individual brace in `BraceWrapping`.
+
 
 
 **BreakBeforeTernaryOperators** (``bool``)
@@ -377,6 +411,9 @@
   Always break constructor initializers before commas and align
   the commas with the colon.
 
+**BreakStringLiterals** (``bool``)
+  Allow breaking string literals when formatting.
+
 **ColumnLimit** (``unsigned``)
   The column limit.
 
@@ -416,7 +453,8 @@
 
 **DerivePointerAlignment** (``bool``)
   If ``true``, analyze the formatted file for the most common
-  alignment of & and \*. ``PointerAlignment`` is then used only as fallback.
+  alignment of ``&`` and ``\*``. ``PointerAlignment`` is then used only as
+  fallback.
 
 **DisableFormat** (``bool``)
   Disables formatting completely.
@@ -446,30 +484,32 @@
 
   In the .clang-format configuration file, this can be configured like:
 
-  .. code-block:: c++
+  .. code-block:: yaml
 
     ForEachMacros: ['RANGES_FOR', 'FOREACH']
 
   For example: BOOST_FOREACH.
 
 **IncludeCategories** (``std::vector<IncludeCategory>``)
-  Regular expressions denoting the different #include categories used
-  for ordering #includes.
+  Regular expressions denoting the different ``#include`` categories
+  used for ordering ``#includes``.
 
   These regular expressions are matched against the filename of an include
   (including the <> or "") in order. The value belonging to the first
-  matching regular expression is assigned and #includes are sorted first
+  matching regular expression is assigned and ``#includes`` are sorted first
   according to increasing category number and then alphabetically within
   each category.
 
-  If none of the regular expressions match, UINT_MAX is assigned as
-  category. The main header for a source file automatically gets category 0,
-  so that it is kept at the beginning of the #includes
-  (http://llvm.org/docs/CodingStandards.html#include-style).
+  If none of the regular expressions match, INT_MAX is assigned as
+  category. The main header for a source file automatically gets category 0,
+  so that it is generally kept at the beginning of the ``#includes``
+  (http://llvm.org/docs/CodingStandards.html#include-style). However, you
+  can also assign negative priorities if you have certain headers that
+  always need to be first.
 
   To configure this in the .clang-format file, use:
 
-  .. code-block:: c++
+  .. code-block:: yaml
 
     IncludeCategories:
       - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
@@ -479,6 +519,19 @@
       - Regex:           '.\*'
         Priority:        1
 
+**IncludeIsMainRegex** (``std::string``)
+  Specify a regular expression of suffixes that are allowed in the
+  file-to-main-include mapping.
+
+  When guessing whether a #include is the "main" include (to assign
+  category 0, see above), use this regex of allowed suffixes to the header
+  stem. A partial match is done, so that:
+  - "" means "arbitrary suffix"
+  - "$" means "no suffix"
+
+  For example, if configured to "(_test)?$", then a header a.h would be seen
+  as the "main" include in both a.cc and a_test.cc.
+
 **IndentCaseLabels** (``bool``)
   Indent case labels one level from the switch statement.
 
@@ -492,6 +545,22 @@
   Indent if a function definition or declaration is wrapped after the
   type.
 
+**JavaScriptQuotes** (``JavaScriptQuoteStyle``)
+  The JavaScriptQuoteStyle to use for JavaScript strings.
+
+  Possible values:
+
+  * ``JSQS_Leave`` (in configuration: ``Leave``)
+    Leave string quotes as they are.
+
+  * ``JSQS_Single`` (in configuration: ``Single``)
+    Always use single quotes.
+
+  * ``JSQS_Double`` (in configuration: ``Double``)
+    Always use double quotes.
+
+
+
 **KeepEmptyLinesAtTheStartOfBlocks** (``bool``)
   If true, empty lines at the start of blocks are kept.
 
@@ -502,16 +571,24 @@
 
   * ``LK_None`` (in configuration: ``None``)
     Do not use.
+
   * ``LK_Cpp`` (in configuration: ``Cpp``)
     Should be used for C, C++, ObjectiveC, ObjectiveC++.
+
   * ``LK_Java`` (in configuration: ``Java``)
     Should be used for Java.
+
   * ``LK_JavaScript`` (in configuration: ``JavaScript``)
     Should be used for JavaScript.
+
   * ``LK_Proto`` (in configuration: ``Proto``)
     Should be used for Protocol Buffers
     (https://developers.google.com/protocol-buffers/).
 
+  * ``LK_TableGen`` (in configuration: ``TableGen``)
+    Should be used for TableGen code.
+
+
 
 **MacroBlockBegin** (``std::string``)
   A regular expression matching macros that start a block.
@@ -529,25 +606,28 @@
 
   * ``NI_None`` (in configuration: ``None``)
     Don't indent in namespaces.
+
   * ``NI_Inner`` (in configuration: ``Inner``)
     Indent only in inner namespaces (nested in other namespaces).
+
   * ``NI_All`` (in configuration: ``All``)
     Indent in all namespaces.
 
 
+
 **ObjCBlockIndentWidth** (``unsigned``)
   The number of characters to use for indentation of ObjC blocks.
 
 **ObjCSpaceAfterProperty** (``bool``)
   Add a space after ``@property`` in Objective-C, i.e. use
-  ``\@property (readonly)`` instead of ``\@property(readonly)``.
+  ``@property (readonly)`` instead of ``@property(readonly)``.
 
 **ObjCSpaceBeforeProtocolList** (``bool``)
   Add a space in front of an Objective-C protocol list, i.e. use
   ``Foo <Protocol>`` instead of ``Foo<Protocol>``.
 
 **PenaltyBreakBeforeFirstCallParameter** (``unsigned``)
-  The penalty for breaking a function call after "call(".
+  The penalty for breaking a function call after ``call(``.
 
 **PenaltyBreakComment** (``unsigned``)
   The penalty for each line break introduced inside a comment.
@@ -572,15 +652,27 @@
 
   * ``PAS_Left`` (in configuration: ``Left``)
     Align pointer to the left.
+
   * ``PAS_Right`` (in configuration: ``Right``)
     Align pointer to the right.
+
   * ``PAS_Middle`` (in configuration: ``Middle``)
     Align pointer in the middle.
 
 
+
+**ReflowComments** (``bool``)
+  If ``true``, clang-format will attempt to re-flow comments.
+
+**SortIncludes** (``bool``)
+  If ``true``, clang-format will sort ``#includes``.
+
 **SpaceAfterCStyleCast** (``bool``)
   If ``true``, a space may be inserted after C style casts.
 
+**SpaceAfterTemplateKeyword** (``bool``)
+  If ``true``, a space will be inserted after the ``template`` keyword.
+
 **SpaceBeforeAssignmentOperators** (``bool``)
   If ``false``, spaces will be removed before assignment operators.
 
@@ -591,9 +683,11 @@
 
   * ``SBPO_Never`` (in configuration: ``Never``)
     Never put a space before opening parentheses.
+
   * ``SBPO_ControlStatements`` (in configuration: ``ControlStatements``)
     Put a space before opening parentheses only after control statement
     keywords (``for/if/while...``).
+
   * ``SBPO_Always`` (in configuration: ``Always``)
     Always put a space before opening parentheses, except when it's
     prohibited by the syntax rules (in function-like macro definitions) or
@@ -601,19 +695,21 @@
     parentheses, etc.)
 
 
+
 **SpaceInEmptyParentheses** (``bool``)
-  If ``true``, spaces may be inserted into '()'.
+  If ``true``, spaces may be inserted into ``()``.
 
 **SpacesBeforeTrailingComments** (``unsigned``)
   The number of spaces before trailing line comments
   (``//`` - comments).
 
-  This does not affect trailing block comments (``/**/`` - comments) as those
-  commonly have different usage patterns and a number of special cases.
+  This does not affect trailing block comments (``/*`` - comments) as
+  those commonly have different usage patterns and a number of special
+  cases.
 
 **SpacesInAngles** (``bool``)
-  If ``true``, spaces will be inserted after '<' and before '>' in
-  template argument lists
+  If ``true``, spaces will be inserted after ``<`` and before ``>``
+  in template argument lists.
 
 **SpacesInCStyleCastParentheses** (``bool``)
   If ``true``, spaces may be inserted into C style casts.
@@ -623,26 +719,28 @@
   ObjC and Javascript array and dict literals).
 
 **SpacesInParentheses** (``bool``)
-  If ``true``, spaces will be inserted after '(' and before ')'.
+  If ``true``, spaces will be inserted after ``(`` and before ``)``.
 
 **SpacesInSquareBrackets** (``bool``)
-  If ``true``, spaces will be inserted after '[' and before ']'.
+  If ``true``, spaces will be inserted after ``[`` and before ``]``.
 
 **Standard** (``LanguageStandard``)
-  Format compatible with this standard, e.g. use
-  ``A<A<int> >`` instead of ``A<A<int>>`` for LS_Cpp03.
+  Format compatible with this standard, e.g. use ``A<A<int> >``
+  instead of ``A<A<int>>`` for ``LS_Cpp03``.
 
   Possible values:
 
   * ``LS_Cpp03`` (in configuration: ``Cpp03``)
     Use C++03-compatible syntax.
+
   * ``LS_Cpp11`` (in configuration: ``Cpp11``)
-    Use features of C++11 (e.g. ``A<A<int>>`` instead of
-    ``A<A<int> >``).
+    Use features of C++11 (e.g. ``A<A<int>>`` instead of ``A<A<int> >``).
+
   * ``LS_Auto`` (in configuration: ``Auto``)
     Automatic detection based on the input.
 
 
+
 **TabWidth** (``unsigned``)
   The number of columns used for tab stops.
 
@@ -653,20 +751,23 @@
 
   * ``UT_Never`` (in configuration: ``Never``)
     Never use tab.
+
   * ``UT_ForIndentation`` (in configuration: ``ForIndentation``)
     Use tabs only for indentation.
+
   * ``UT_Always`` (in configuration: ``Always``)
     Use tabs whenever we need to fill whitespace that spans at least from
     one tab stop to the next one.
 
 
+
 .. END_FORMAT_STYLE_OPTIONS
 
 Adding additional style options
 ===============================
 
 Each additional style option adds costs to the clang-format project. Some of
-these costs affect the clang-format developement itself, as we need to make
+these costs affect the clang-format development itself, as we need to make
 sure that any given combination of options work and that new features don't
 break any of the existing options in any way. There are also costs for end users
 as options become less discoverable and people have to think about and make a
diff --git a/docs/ClangPlugins.rst b/docs/ClangPlugins.rst
index 9a5bc14..833f0dd 100644
--- a/docs/ClangPlugins.rst
+++ b/docs/ClangPlugins.rst
@@ -43,6 +43,26 @@
 
   static FrontendPluginRegistry::Add<MyPlugin> X("my-plugin-name", "my plugin description");
 
+Defining pragmas
+================
+
+Plugins can also define pragmas by declaring a ``PragmaHandler`` and
+registering it using ``PragmaHandlerRegistry::Add<>``:
+
+.. code-block:: c++
+
+  // Define a pragma handler for #pragma example_pragma
+  class ExamplePragmaHandler : public PragmaHandler {
+  public:
+    ExamplePragmaHandler() : PragmaHandler("example_pragma") { }
+    void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
+                      Token &PragmaTok) {
+      // Handle the pragma
+    }
+  };
+
+  static PragmaHandlerRegistry::Add<ExamplePragmaHandler> Y("example_pragma","example pragma description");
+
 Putting it all together
 =======================
 
@@ -54,21 +74,25 @@
 Running the plugin
 ==================
 
+
+Using the cc1 command line
+--------------------------
+
 To run a plugin, the dynamic library containing the plugin registry must be
-loaded via the :option:`-load` command line option. This will load all plugins
+loaded via the `-load` command line option. This will load all plugins
 that are registered, and you can select the plugins to run by specifying the
-:option:`-plugin` option. Additional parameters for the plugins can be passed with
-:option:`-plugin-arg-<plugin-name>`.
+`-plugin` option. Additional parameters for the plugins can be passed with
+`-plugin-arg-<plugin-name>`.
 
 Note that those options must reach clang's cc1 process. There are two
 ways to do so:
 
-* Directly call the parsing process by using the :option:`-cc1` option; this
+* Directly call the parsing process by using the `-cc1` option; this
   has the downside of not configuring the default header search paths, so
   you'll need to specify the full system path configuration on the command
   line.
 * Use clang as usual, but prefix all arguments to the cc1 process with
-  :option:`-Xclang`.
+  `-Xclang`.
 
 For example, to run the ``print-function-names`` plugin over a source file in
 clang, first build the plugin, and then call clang with the plugin from the
@@ -88,3 +112,19 @@
 Also see the print-function-name plugin example's
 `README <http://llvm.org/viewvc/llvm-project/cfe/trunk/examples/PrintFunctionNames/README.txt?view=markup>`_
 
+
+Using the clang command line
+----------------------------
+
+Using `-fplugin=plugin` on the clang command line passes the plugin
+through as an argument to `-load` on the cc1 command line. If the plugin
+class implements the ``getActionType`` method then the plugin is run
+automatically. For example, to run the plugin automatically after the main AST
+action (i.e. the same as using `-add-plugin`):
+
+.. code-block:: c++
+
+  // Automatically run the plugin after the main AST action
+  PluginASTAction::ActionType getActionType() override {
+    return AddAfterMainAction;
+  }
diff --git a/docs/ControlFlowIntegrity.rst b/docs/ControlFlowIntegrity.rst
index 780ff88..eed5ac5 100644
--- a/docs/ControlFlowIntegrity.rst
+++ b/docs/ControlFlowIntegrity.rst
@@ -25,13 +25,25 @@
 so it is required to specify ``-flto``, and the linker used must support LTO,
 for example via the `gold plugin`_.
 
-To allow the checks to be implemented efficiently, the program must be
-structured such that certain object files are compiled with CFI
+To allow the checks to be implemented efficiently, the program must
+be structured such that certain object files are compiled with CFI
 enabled, and are statically linked into the program. This may preclude
-the use of shared libraries in some cases. Experimental support for
-:ref:`cross-DSO control flow integrity <cfi-cross-dso>` exists that
-does not have these requirements. This cross-DSO support has unstable
-ABI at this time.
+the use of shared libraries in some cases.
+
+The compiler will only produce CFI checks for a class if it can infer hidden
+LTO visibility for that class. LTO visibility is a property of a class that
+is inferred from flags and attributes. For more details, see the documentation
+for :doc:`LTO visibility <LTOVisibility>`.
+
+The ``-fsanitize=cfi-{vcall,nvcall,derived-cast,unrelated-cast}`` flags
+require that a ``-fvisibility=`` flag also be specified. This is because the
+default visibility setting is ``-fvisibility=default``, which would disable
+CFI checks for classes without visibility attributes. Most users will want
+to specify ``-fvisibility=hidden``, which enables CFI checks for such classes.
+
+Experimental support for :ref:`cross-DSO control flow integrity
+<cfi-cross-dso>` exists that does not require classes to have hidden LTO
+visibility. This cross-DSO support has unstable ABI at this time.
 
 .. _gold plugin: http://llvm.org/docs/GoldPlugin.html
 
@@ -129,7 +141,8 @@
 The difference between these two types of casts is that the first is defined
 by the C++ standard to produce an undefined value, while the second is not
 in itself undefined behavior (it is well defined to cast the pointer back
-to its original type).
+to its original type) unless the object is uninitialized and the cast is a
+``static_cast`` (see C++14 [basic.life]p5).
 
 If a program as a matter of policy forbids the second type of cast, that
 restriction can normally be enforced. However it may in some cases be necessary
@@ -232,11 +245,6 @@
 source files, functions and types using the ``src``, ``fun`` and ``type``
 entity types.
 
-In addition, if a type has a ``uuid`` attribute and the blacklist contains
-the type entry ``attr:uuid``, CFI checks are suppressed for that type. This
-allows all COM types to be easily blacklisted, which is useful as COM types
-are typically defined outside of the linked program.
-
 .. code-block:: bash
 
     # Suppress checking for code in a file.
@@ -246,8 +254,6 @@
     fun:*MyFooBar*
     # Ignore all types in the standard library.
     type:std::*
-    # Ignore all types with a uuid attribute.
-    type:attr:uuid
 
 .. _cfi-cross-dso:
 
@@ -259,6 +265,11 @@
 apply across DSO boundaries. As in the regular CFI, each DSO must be
 built with ``-flto``.
 
+Normally, CFI checks will only be performed for classes that have hidden LTO
+visibility. With this flag enabled, the compiler will emit cross-DSO CFI
+checks for all classes, except for those which appear in the CFI blacklist
+or which use a ``no_sanitize`` attribute.
+
 Design
 ======
 
diff --git a/docs/ControlFlowIntegrityDesign.rst b/docs/ControlFlowIntegrityDesign.rst
index b4aacd3..38c5e5b 100644
--- a/docs/ControlFlowIntegrityDesign.rst
+++ b/docs/ControlFlowIntegrityDesign.rst
@@ -90,10 +90,10 @@
 
 The compiler relies on co-operation from the linker in order to assemble
 the bit vectors for the whole program. It currently does this using LLVM's
-`bit sets`_ mechanism together with link-time optimization.
+`type metadata`_ mechanism together with link-time optimization.
 
 .. _address point: https://mentorembedded.github.io/cxx-abi/abi.html#vtable-general
-.. _bit sets: http://llvm.org/docs/BitSets.html
+.. _type metadata: http://llvm.org/docs/TypeMetadata.html
 .. _ByteArrayBuilder: http://llvm.org/docs/doxygen/html/structllvm_1_1ByteArrayBuilder.html
 
 Optimizations
@@ -196,7 +196,7 @@
 Vectors" above). The `GlobalLayoutBuilder`_ class is responsible for laying
 out the globals efficiently to minimize the sizes of the underlying bitsets.
 
-.. _GlobalLayoutBuilder: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/IPO/LowerBitSets.h?view=markup
+.. _GlobalLayoutBuilder: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/IPO/LowerTypeTests.h?view=markup
 
 Alignment
 ~~~~~~~~~
@@ -297,8 +297,8 @@
 jump table entries, so that addresses taken outside the module will pass
 any verification done inside the module.
 
-In more concrete terms, suppose we have three functions ``f``, ``g``, ``h``
-which are members of a single bitset, and a function foo that returns their
+In more concrete terms, suppose we have three functions ``f``, ``g``,
+``h`` which are all of the same type, and a function foo that returns their
 addresses:
 
 .. code-block:: none
@@ -439,10 +439,10 @@
 
    void __cfi_check(uint64 CallSiteTypeId, void *TargetAddr)
 
-This function provides external modules with access to CFI checks for
-the targets inside this DSO.  For each known ``CallSiteTypeId``, this
-functions performs an ``llvm.bitset.test`` with the corresponding bit
-set. It aborts if the type is unknown, or if the check fails.
+This function provides external modules with access to CFI checks for the
+targets inside this DSO.  For each known ``CallSiteTypeId``, this function
+performs an ``llvm.type.test`` with the corresponding type identifier. It
+aborts if the type is unknown, or if the check fails.
 
 The basic implementation is a large switch statement over all values
 of CallSiteTypeId supported by this DSO, and each case is similar to
diff --git a/docs/CrossCompilation.rst b/docs/CrossCompilation.rst
index 8a80271..c07bc21 100644
--- a/docs/CrossCompilation.rst
+++ b/docs/CrossCompilation.rst
@@ -32,7 +32,7 @@
 
 On the other hand, Clang/LLVM is natively a cross-compiler, meaning that
 one set of programs can compile to all targets by setting the ``-target``
-option. That makes it a lot easier for programers wishing to compile to
+option. That makes it a lot easier for programmers wishing to compile to
 different platforms and architectures, and for compiler developers that
 only have to maintain one build system, and for OS distributions, that
 need only one set of main packages.
diff --git a/docs/InternalsManual.rst b/docs/InternalsManual.rst
index c4af5b1..dc89d12 100644
--- a/docs/InternalsManual.rst
+++ b/docs/InternalsManual.rst
@@ -57,7 +57,7 @@
 <DiagnosticClient>` depending on how the ``DiagnosticClient`` interface is
 implemented.  A representative example of a diagnostic is:
 
-.. code-block:: c++
+.. code-block:: text
 
   t.c:38:15: error: invalid operands to binary expression ('int *' and '_Complex float')
   P = (P-42) + Gamma*4;
@@ -374,7 +374,7 @@
 example from the C++ front end, where we warn about the right-shift operator
 changing meaning from C++98 to C++11:
 
-.. code-block:: c++
+.. code-block:: text
 
   test.cpp:3:7: warning: use of right-shift operator ('>>') in template argument
                          will require parentheses in C++11
@@ -514,7 +514,7 @@
 each point to the beginning of their respective tokens.  For example consider
 the ``SourceRange`` of the following statement:
 
-.. code-block:: c++
+.. code-block:: text
 
   x = foo + bar;
   ^first    ^last
@@ -837,7 +837,7 @@
 The code above is illegal, and thus we expect there to be diagnostics emitted
 on the annotated lines.  In this example, we expect to get:
 
-.. code-block:: c++
+.. code-block:: text
 
   test.c:6:1: error: indirection requires pointer operand ('foo' invalid)
     *X; // error
@@ -1422,7 +1422,7 @@
 when one is using a debugger such as gdb.  For example, here is the output of
 ``FooCFG->dump()``:
 
-.. code-block:: c++
+.. code-block:: text
 
  [ B5 (ENTRY) ]
     Predecessors (0):
diff --git a/docs/ItaniumMangleAbiTags.rst b/docs/ItaniumMangleAbiTags.rst
new file mode 100644
index 0000000..2d65031
--- /dev/null
+++ b/docs/ItaniumMangleAbiTags.rst
@@ -0,0 +1,107 @@
+========
+ABI tags
+========
+
+Introduction
+============
+
+This text tries to describe GCC's semantics for mangling "abi_tag" attributes
+described in https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Attributes.html
+
+There is no guarantee the following rules are correct, complete or make sense
+in any way as they were determined empirically by experiments with gcc5.
+
+Declaration
+===========
+
+ABI tags are declared in an abi_tag attribute and can be applied to a
+function, variable, class or inline namespace declaration. The attribute takes
+one or more strings (called tags); the order does not matter.
+
+See https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Attributes.html for
+details.
+
+Tags on an inline namespace are called "implicit tags"; all other tags are
+"explicit tags".
+
+Mangling
+========
+
+All tags that are "active" on an <unqualified-name> are emitted after the
+<unqualified-name>, before <template-args> or <discriminator>, and are part of
+the same <substitution> the <unqualified-name> is.
+
+They are mangled as:
+
+.. code-block:: none
+
+    <abi-tags> ::= <abi-tag>*   # sort by name
+    <abi-tag> ::= B <tag source-name>
+
+Example:
+
+.. code-block:: c++
+
+    __attribute__((abi_tag("test")))
+    void Func();
+    // gets mangled as: _Z4FuncB4testv (prettified as `Func[abi:test]()`)
+
+Active tags
+===========
+
+A namespace does not have any active tags. For types (class / struct / union /
+enum), the explicit tags are the active tags.
+
+For variables and functions, the active tags are the explicit tags plus any
+"required tags" which are not in the "available tags" set:
+
+.. code-block:: none
+
+    derived-tags := (required-tags - available-tags)
+    active-tags := explicit-tags + derived-tags
+
+Required tags for a function
+============================
+
+If a function is used as a local scope for another name, and is part of
+another function as local scope, it doesn't have any required tags.
+
+If a function is used as a local scope for a guard variable name, it doesn't
+have any required tags.
+
+Otherwise the function requires any implicit or explicit tag used in the name
+for the return type.
+
+Example:
+
+.. code-block:: c++
+
+    namespace A {
+      inline namespace B __attribute__((abi_tag)) {
+        struct C { int x; };
+      }
+    }
+
+    A::C foo(); // gets mangled as: _Z3fooB1Bv (prettified as `foo[abi:B]()`)
+
+Required tags for a variable
+============================
+
+A variable requires any implicit or explicit tag used in its type.
+
+Available tags
+==============
+
+All tags used in the prefix and in the template arguments for a name are
+available. Also, for functions, all tags from the <bare-function-type>
+(which might include the return type for template functions) are available.
+
+For <local-name>s all active tags used in the local part (<function-encoding>)
+are available, but not implicit tags which were not active.
+
+Implicit and explicit tags used in the <unqualified-name> for a function (as
+in the type of a cast operator) are NOT available.
+
+Example: a cast operator to std::string (which is
+std::__cxx11::basic_string<...>) will use 'cxx11' as an active tag, as it is
+required from the return type `std::string` but not available.
diff --git a/docs/LTOVisibility.rst b/docs/LTOVisibility.rst
new file mode 100644
index 0000000..67367f3
--- /dev/null
+++ b/docs/LTOVisibility.rst
@@ -0,0 +1,113 @@
+==============
+LTO Visibility
+==============
+
+*LTO visibility* is a property of an entity that specifies whether it can be
+referenced from outside the current LTO unit. A *linkage unit* is a set of
+translation units linked together into an executable or DSO, and a linkage
+unit's *LTO unit* is the subset of the linkage unit that is linked together
+using link-time optimization; in the case where LTO is not being used, the
+linkage unit's LTO unit is empty. Each linkage unit has only a single LTO unit.
+
+The LTO visibility of a class is used by the compiler to determine which
+classes the virtual function call optimization and control flow integrity
+features apply to. These features use whole-program information, so they
+require the entire class hierarchy to be visible in order to work correctly.
+
+If any translation unit in the program uses either of the virtual function
+call optimization or control flow integrity features, it is effectively an
+ODR violation to define a class with hidden LTO visibility in multiple linkage
+units. A class with public LTO visibility may be defined in multiple linkage
+units, but the tradeoff is that the virtual function call optimization and
+control flow integrity features can only be applied to classes with hidden LTO
+visibility. A class's LTO visibility is treated as an ODR-relevant property
+of its definition, so it must be consistent between translation units.
+
+In translation units built with LTO, LTO visibility is based on the
+class's symbol visibility as expressed at the source level (i.e. the
+``__attribute__((visibility("...")))`` attribute, or the ``-fvisibility=``
+flag) or, on the Windows platform, the dllimport and dllexport attributes. When
+targeting non-Windows platforms, classes with a visibility other than hidden
+visibility receive public LTO visibility. When targeting Windows, classes
+with dllimport or dllexport attributes receive public LTO visibility. All
+other classes receive hidden LTO visibility. Classes with internal linkage
+(e.g. classes declared in unnamed namespaces) also receive hidden LTO
+visibility.
+
+A class defined in a translation unit built without LTO receives public
+LTO visibility regardless of its object file visibility, linkage or other
+attributes.
+
+This mechanism will produce the correct result in most cases, but there are
+two cases where it may wrongly infer hidden LTO visibility.
+
+1. As a corollary of the above rules, if a linkage unit is produced from a
+   combination of LTO object files and non-LTO object files, any hidden
+   visibility class defined in both a translation unit built with LTO and
+   a translation unit built without LTO must be defined with public LTO
+   visibility in order to avoid an ODR violation.
+
+2. Some ABIs provide the ability to define an abstract base class without
+   visibility attributes in multiple linkage units and have virtual calls
+   to derived classes in other linkage units work correctly. One example of
+   this is COM on Windows platforms. If the ABI allows this, any base class
+   used in this way must be defined with public LTO visibility.
+
+Classes that fall into either of these categories can be marked up with the
+``[[clang::lto_visibility_public]]`` attribute. To specifically handle the
+COM case, classes with the ``__declspec(uuid())`` attribute receive public
+LTO visibility. On Windows platforms, clang-cl's ``/MT`` and ``/MTd``
+flags statically link the program against a prebuilt standard library;
+these flags imply public LTO visibility for every class declared in the
+``std`` and ``stdext`` namespaces.
+
+Example
+=======
+
+The following example shows how LTO visibility works in practice in several
+cases involving two linkage units, ``main`` and ``dso.so``.
+
+.. code-block:: none
+
+    +-----------------------------------------------------------+  +----------------------------------------------------+
+    | main (clang++ -fvisibility=hidden):                       |  | dso.so (clang++ -fvisibility=hidden):              |
+    |                                                           |  |                                                    |
+    |  +-----------------------------------------------------+  |  |  struct __attribute__((visibility("default"))) C { |
+    |  | LTO unit (clang++ -fvisibility=hidden -flto):       |  |  |    virtual void f();                               |
+    |  |                                                     |  |  |  };                                                |
+    |  |  struct A { ... };                                  |  |  |  void C::f() {}                                    |
+    |  |  struct [[clang::lto_visibility_public]] B { ... }; |  |  |  struct D {                                        |
+    |  |  struct __attribute__((visibility("default"))) C {  |  |  |    virtual void g() = 0;                           |
+    |  |    virtual void f();                                |  |  |  };                                                |
+    |  |  };                                                 |  |  |  struct E : D {                                    |
+    |  |  struct [[clang::lto_visibility_public]] D {        |  |  |    virtual void g() { ... }                        |
+    |  |    virtual void g() = 0;                            |  |  |  };                                                |
+    |  |  };                                                 |  |  |  __attribute__((visibility("default"))) D *mkE() { |
+    |  |                                                     |  |  |    return new E;                                   |
+    |  +-----------------------------------------------------+  |  |  }                                                 |
+    |                                                           |  |                                                    |
+    |  struct B { ... };                                        |  +----------------------------------------------------+
+    |                                                           |
+    +-----------------------------------------------------------+
+
+We will now describe the LTO visibility of each of the classes defined in
+these linkage units.
+
+Class ``A`` is not defined outside of ``main``'s LTO unit, so it can have
+hidden LTO visibility. This is inferred from the object file visibility
+specified on the command line.
+
+Class ``B`` is defined in ``main``, both inside and outside its LTO unit. The
+definition outside the LTO unit has public LTO visibility, so the definition
+inside the LTO unit must also have public LTO visibility in order to avoid
+an ODR violation.
+
+Class ``C`` is defined in both ``main`` and ``dso.so`` and therefore must
+have public LTO visibility. This is correctly inferred from the ``visibility``
+attribute.
+
+Class ``D`` is an abstract base class with a derived class ``E`` defined
+in ``dso.so``.  This is an example of the COM scenario; the definition of
+``D`` in ``main``'s LTO unit must have public LTO visibility in order to be
+compatible with the definition of ``D`` in ``dso.so``, which is observable
+by calling the function ``mkE``.
diff --git a/docs/LanguageExtensions.rst b/docs/LanguageExtensions.rst
index 333dee6..51ac3ab 100644
--- a/docs/LanguageExtensions.rst
+++ b/docs/LanguageExtensions.rst
@@ -449,7 +449,7 @@
 If the deprecated or unavailable declaration is used, the message will be
 incorporated into the appropriate diagnostic:
 
-.. code-block:: c++
+.. code-block:: none
 
   harmless.c:4:3: warning: 'explode' is deprecated: extremely unsafe, use 'combust' instead!!!
         [-Wdeprecated-declarations]
@@ -1022,6 +1022,7 @@
 * ``__is_nothrow_assignable`` (MSVC 2013, clang)
 * ``__is_constructible`` (MSVC 2013, clang)
 * ``__is_nothrow_constructible`` (MSVC 2013, clang)
+* ``__is_assignable`` (MSVC 2015, clang)
 
 Blocks
 ======
@@ -1505,6 +1506,35 @@
 
 Query for this feature with ``__has_builtin(__builtin_convertvector)``.
 
+``__builtin_bitreverse``
+------------------------
+
+* ``__builtin_bitreverse8``
+* ``__builtin_bitreverse16``
+* ``__builtin_bitreverse32``
+* ``__builtin_bitreverse64``
+
+**Syntax**:
+
+.. code-block:: c++
+
+     __builtin_bitreverse32(x)
+
+**Examples**:
+
+.. code-block:: c++
+
+      uint8_t rev_x = __builtin_bitreverse8(x);
+      uint16_t rev_x = __builtin_bitreverse16(x);
+      uint32_t rev_y = __builtin_bitreverse32(y);
+      uint64_t rev_z = __builtin_bitreverse64(z);
+
+**Description**:
+
+The '``__builtin_bitreverse``' family of builtins is used to reverse
+the bit pattern of an integer value; for example ``0b10110110`` becomes
+``0b01101101``.
+
 ``__builtin_unreachable``
 -------------------------
 
@@ -1728,6 +1758,24 @@
 
 Query for this feature with ``__has_builtin(__builtin_add_overflow)``, etc.
 
+Floating point builtins
+---------------------------------------
+
+``__builtin_canonicalize``
+--------------------------
+
+.. code-block:: c
+
+   double __builtin_canonicalize(double);
+   float __builtin_canonicalizef(float);
+   long double __builtin_canonicalizel(long double);
+
+Returns the platform specific canonical encoding of a floating point
+number. This canonicalization is useful for implementing certain
+numeric primitives such as frexp. See `LLVM canonicalize intrinsic
+<http://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic>`_ for
+more information on the semantics.
+
 .. _langext-__c11_atomic:
 
 __c11_atomic builtins
@@ -1857,7 +1905,7 @@
 <http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf>`_.
 Note that these intrinsics are implemented as motion barriers that block
 reordering of memory accesses and side effect instructions. Other instructions
-like simple arithmatic may be reordered around the intrinsic. If you expect to
+like simple arithmetic may be reordered around the intrinsic. If you expect to
 have no reordering at all, use inline assembly instead.
 
 X86/X86-64 Language Extensions
@@ -1865,12 +1913,13 @@
 
 The X86 backend has these language extensions:
 
-Memory references off the GS segment
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Memory references to specified segments
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Annotating a pointer with address space #256 causes it to be code generated
-relative to the X86 GS segment register, and address space #257 causes it to be
-relative to the X86 FS segment.  Note that this is a very very low-level
+relative to the X86 GS segment register, address space #257 causes it to be
+relative to the X86 FS segment, and address space #258 causes it to be
+relative to the X86 SS segment.  Note that this is a very very low-level
 feature that should only be used if you know what you're doing (for example in
 an OS kernel).
 
@@ -2001,9 +2050,9 @@
 
 The ``#pragma clang loop`` directive is used to specify hints for optimizing the
 subsequent for, while, do-while, or c++11 range-based for loop. The directive
-provides options for vectorization, interleaving, and unrolling. Loop hints can
-be specified before any loop and will be ignored if the optimization is not safe
-to apply.
+provides options for vectorization, interleaving, unrolling and
+distribution. Loop hints can be specified before any loop and will be ignored if
+the optimization is not safe to apply.
 
 Vectorization and Interleaving
 ------------------------------
@@ -2098,6 +2147,38 @@
 
 Unrolling of a loop can be prevented by specifying ``unroll(disable)``.
 
+Loop Distribution
+-----------------
+
+Loop Distribution allows splitting a loop into multiple loops.  This is
+beneficial for example when the entire loop cannot be vectorized but some of the
+resulting loops can.
+
+If ``distribute(enable)`` is specified and the loop has memory dependencies
+that inhibit vectorization, the compiler will attempt to isolate the offending
+operations into a new loop.  This optimization is not enabled by default; only
+loops marked with the pragma are considered.
+
+.. code-block:: c++
+
+  #pragma clang loop distribute(enable)
+  for (i = 0; i < N; ++i) {
+    S1: A[i + 1] = A[i] + B[i];
+    S2: C[i] = D[i] * E[i];
+  }
+
+This loop will be split into two loops between statements S1 and S2.  The
+second loop containing S2 will be vectorized.
+
+Loop Distribution is currently not enabled by default in the optimizer because
+it can hurt performance in some cases.  For example, instruction-level
+parallelism could be reduced by sequentializing the execution of the
+statements S1 and S2 above.
+
+If Loop Distribution is turned on globally with
+``-mllvm -enable-loop-distribution``, specifying ``distribute(disable)`` can
+be used to disable it on a per-loop basis.
+
 Additional Information
 ----------------------
 
diff --git a/docs/LeakSanitizer.rst b/docs/LeakSanitizer.rst
index 8591808..c3ccecc 100644
--- a/docs/LeakSanitizer.rst
+++ b/docs/LeakSanitizer.rst
@@ -9,21 +9,39 @@
 ============
 
 LeakSanitizer is a run-time memory leak detector. It can be combined with
-:doc:`AddressSanitizer` to get both memory error and leak detection.
-LeakSanitizer does not introduce any additional slowdown when used in this mode.
-The LeakSanitizer runtime can also be linked in separately to get leak detection
-only, at a minimal performance cost.
+:doc:`AddressSanitizer` to get both memory error and leak detection, or
+used in a stand-alone mode. LSan adds almost no performance overhead
+until the very end of the process, at which point there is an extra leak
+detection phase.
 
-Current status
-==============
+Usage
+=====
 
-LeakSanitizer is turned on by default, but it is only supported on x86\_64
-Linux.
+LeakSanitizer is only supported on x86\_64 Linux. In order to use it,
+simply build your program with :doc:`AddressSanitizer`:
 
-The combined mode has been tested on fairly large software projects. The
-stand-alone mode has received much less testing.
+.. code-block:: console
 
-There are plans to support LeakSanitizer in :doc:`MemorySanitizer` builds.
+    $ cat memory-leak.c
+    #include <stdlib.h>
+    void *p;
+    int main() {
+      p = malloc(7);
+      p = 0; // The memory is leaked here.
+      return 0;
+    }
+    $ clang -fsanitize=address -g memory-leak.c ; ./a.out
+    ==23646==ERROR: LeakSanitizer: detected memory leaks
+    Direct leak of 7 byte(s) in 1 object(s) allocated from:
+        #0 0x4af01b in __interceptor_malloc /projects/compiler-rt/lib/asan/asan_malloc_linux.cc:52:3
+        #1 0x4da26a in main memory-leak.c:4:7
+        #2 0x7f076fd9cec4 in __libc_start_main libc-start.c:287
+    SUMMARY: AddressSanitizer: 7 byte(s) leaked in 1 allocation(s).
+
+To use LeakSanitizer in stand-alone mode, link your program with the
+``-fsanitize=leak`` flag. Make sure to use ``clang`` (not ``ld``) for the
+link step, so that it links the proper LeakSanitizer run-time library
+into the final executable.
 
 More Information
 ================
diff --git a/docs/LibASTMatchersReference.html b/docs/LibASTMatchersReference.html
index 7f21262..b87cae2 100644
--- a/docs/LibASTMatchersReference.html
+++ b/docs/LibASTMatchersReference.html
@@ -100,7 +100,7 @@
 <tr style="text-align:left"><th>Return type</th><th>Name</th><th>Parameters</th></tr>
 <!-- START_DECL_MATCHERS -->
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('cxxCtorInitializer0')"><a name="cxxCtorInitializer0Anchor">cxxCtorInitializer</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('cxxCtorInitializer0')"><a name="cxxCtorInitializer0Anchor">cxxCtorInitializer</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxCtorInitializer0"><pre>Matches constructor initializers.
 
 Examples matches i(42).
@@ -111,7 +111,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('accessSpecDecl0')"><a name="accessSpecDecl0Anchor">accessSpecDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AccessSpecDecl.html">AccessSpecDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('accessSpecDecl0')"><a name="accessSpecDecl0Anchor">accessSpecDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AccessSpecDecl.html">AccessSpecDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="accessSpecDecl0"><pre>Matches C++ access specifier declarations.
 
 Given
@@ -124,7 +124,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('classTemplateDecl0')"><a name="classTemplateDecl0Anchor">classTemplateDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateDecl.html">ClassTemplateDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('classTemplateDecl0')"><a name="classTemplateDecl0Anchor">classTemplateDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateDecl.html">ClassTemplateDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="classTemplateDecl0"><pre>Matches C++ class template declarations.
 
 Example matches Z
@@ -132,7 +132,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('classTemplateSpecializationDecl0')"><a name="classTemplateSpecializationDecl0Anchor">classTemplateSpecializationDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('classTemplateSpecializationDecl0')"><a name="classTemplateSpecializationDecl0Anchor">classTemplateSpecializationDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="classTemplateSpecializationDecl0"><pre>Matches C++ class template specializations.
 
 Given
@@ -144,7 +144,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxConstructorDecl0')"><a name="cxxConstructorDecl0Anchor">cxxConstructorDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxConstructorDecl0')"><a name="cxxConstructorDecl0Anchor">cxxConstructorDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxConstructorDecl0"><pre>Matches C++ constructor declarations.
 
 Example matches Foo::Foo() and Foo::Foo(int)
@@ -157,7 +157,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxConversionDecl0')"><a name="cxxConversionDecl0Anchor">cxxConversionDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConversionDecl.html">CXXConversionDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxConversionDecl0')"><a name="cxxConversionDecl0Anchor">cxxConversionDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConversionDecl.html">CXXConversionDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxConversionDecl0"><pre>Matches conversion operator declarations.
 
 Example matches the operator.
@@ -165,7 +165,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxDestructorDecl0')"><a name="cxxDestructorDecl0Anchor">cxxDestructorDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDestructorDecl.html">CXXDestructorDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxDestructorDecl0')"><a name="cxxDestructorDecl0Anchor">cxxDestructorDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDestructorDecl.html">CXXDestructorDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxDestructorDecl0"><pre>Matches explicit C++ destructor declarations.
 
 Example matches Foo::~Foo()
@@ -176,7 +176,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxMethodDecl0')"><a name="cxxMethodDecl0Anchor">cxxMethodDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxMethodDecl0')"><a name="cxxMethodDecl0Anchor">cxxMethodDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxMethodDecl0"><pre>Matches method declarations.
 
 Example matches y
@@ -184,7 +184,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxRecordDecl0')"><a name="cxxRecordDecl0Anchor">cxxRecordDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('cxxRecordDecl0')"><a name="cxxRecordDecl0Anchor">cxxRecordDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxRecordDecl0"><pre>Matches C++ class declarations.
 
 Example matches X, Z
@@ -193,7 +193,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('decl0')"><a name="decl0Anchor">decl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('decl0')"><a name="decl0Anchor">decl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="decl0"><pre>Matches declarations.
 
 Examples matches X, C, and the friend declaration inside C;
@@ -204,7 +204,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('declaratorDecl0')"><a name="declaratorDecl0Anchor">declaratorDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclaratorDecl.html">DeclaratorDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('declaratorDecl0')"><a name="declaratorDecl0Anchor">declaratorDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclaratorDecl.html">DeclaratorDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="declaratorDecl0"><pre>Matches declarator declarations (field, variable, function
 and non-type template parameter declarations).
 
@@ -215,7 +215,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('enumConstantDecl0')"><a name="enumConstantDecl0Anchor">enumConstantDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumConstantDecl.html">EnumConstantDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('enumConstantDecl0')"><a name="enumConstantDecl0Anchor">enumConstantDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumConstantDecl.html">EnumConstantDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="enumConstantDecl0"><pre>Matches enum constants.
 
 Example matches A, B, C
@@ -225,7 +225,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('enumDecl0')"><a name="enumDecl0Anchor">enumDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumDecl.html">EnumDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('enumDecl0')"><a name="enumDecl0Anchor">enumDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumDecl.html">EnumDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="enumDecl0"><pre>Matches enum declarations.
 
 Example matches X
@@ -235,7 +235,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('fieldDecl0')"><a name="fieldDecl0Anchor">fieldDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('fieldDecl0')"><a name="fieldDecl0Anchor">fieldDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="fieldDecl0"><pre>Matches field declarations.
 
 Given
@@ -245,7 +245,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('friendDecl0')"><a name="friendDecl0Anchor">friendDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('friendDecl0')"><a name="friendDecl0Anchor">friendDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="friendDecl0"><pre>Matches friend declarations.
 
 Given
@@ -255,7 +255,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('functionDecl0')"><a name="functionDecl0Anchor">functionDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('functionDecl0')"><a name="functionDecl0Anchor">functionDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="functionDecl0"><pre>Matches function declarations.
 
 Example matches f
@@ -263,7 +263,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('functionTemplateDecl0')"><a name="functionTemplateDecl0Anchor">functionTemplateDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionTemplateDecl.html">FunctionTemplateDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('functionTemplateDecl0')"><a name="functionTemplateDecl0Anchor">functionTemplateDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionTemplateDecl.html">FunctionTemplateDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="functionTemplateDecl0"><pre>Matches C++ function template declarations.
 
 Example matches f
@@ -271,7 +271,18 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('linkageSpecDecl0')"><a name="linkageSpecDecl0Anchor">linkageSpecDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LinkageSpecDecl.html">LinkageSpecDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('labelDecl0')"><a name="labelDecl0Anchor">labelDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelDecl.html">LabelDecl</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="labelDecl0"><pre>Matches a declaration of a label.
+
+Given
+  goto FOO;
+  FOO: bar();
+labelDecl()
+  matches 'FOO:'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('linkageSpecDecl0')"><a name="linkageSpecDecl0Anchor">linkageSpecDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LinkageSpecDecl.html">LinkageSpecDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="linkageSpecDecl0"><pre>Matches a declaration of a linkage specification.
 
 Given
@@ -281,7 +292,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('namedDecl0')"><a name="namedDecl0Anchor">namedDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('namedDecl0')"><a name="namedDecl0Anchor">namedDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="namedDecl0"><pre>Matches a declaration of anything that could have a name.
 
 Example matches X, S, the anonymous union type, i, and U;
@@ -294,7 +305,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('namespaceAliasDecl0')"><a name="namespaceAliasDecl0Anchor">namespaceAliasDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceAliasDecl.html">NamespaceAliasDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('namespaceAliasDecl0')"><a name="namespaceAliasDecl0Anchor">namespaceAliasDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceAliasDecl.html">NamespaceAliasDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="namespaceAliasDecl0"><pre>Matches a declaration of a namespace alias.
 
 Given
@@ -305,7 +316,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('namespaceDecl0')"><a name="namespaceDecl0Anchor">namespaceDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('namespaceDecl0')"><a name="namespaceDecl0Anchor">namespaceDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="namespaceDecl0"><pre>Matches a declaration of a namespace.
 
 Given
@@ -316,7 +327,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('nonTypeTemplateParmDecl0')"><a name="nonTypeTemplateParmDecl0Anchor">nonTypeTemplateParmDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NonTypeTemplateParmDecl.html">NonTypeTemplateParmDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('nonTypeTemplateParmDecl0')"><a name="nonTypeTemplateParmDecl0Anchor">nonTypeTemplateParmDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NonTypeTemplateParmDecl.html">NonTypeTemplateParmDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="nonTypeTemplateParmDecl0"><pre>Matches non-type template parameter declarations.
 
 Given
@@ -326,7 +337,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('objcInterfaceDecl0')"><a name="objcInterfaceDecl0Anchor">objcInterfaceDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCInterfaceDecl.html">ObjCInterfaceDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('objcInterfaceDecl0')"><a name="objcInterfaceDecl0Anchor">objcInterfaceDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCInterfaceDecl.html">ObjCInterfaceDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="objcInterfaceDecl0"><pre>Matches Objective-C interface declarations.
 
 Example matches Foo
@@ -335,7 +346,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('parmVarDecl0')"><a name="parmVarDecl0Anchor">parmVarDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('parmVarDecl0')"><a name="parmVarDecl0Anchor">parmVarDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="parmVarDecl0"><pre>Matches parameter variable declarations.
 
 Given
@@ -345,7 +356,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('recordDecl0')"><a name="recordDecl0Anchor">recordDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('recordDecl0')"><a name="recordDecl0Anchor">recordDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="recordDecl0"><pre>Matches class, struct, and union declarations.
 
 Example matches X, Z, U, and S
@@ -356,7 +367,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('staticAssertDecl0')"><a name="staticAssertDecl0Anchor">staticAssertDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1StaticAssertDecl.html">StaticAssertDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('staticAssertDecl0')"><a name="staticAssertDecl0Anchor">staticAssertDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1StaticAssertDecl.html">StaticAssertDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="staticAssertDecl0"><pre>Matches a C++ static_assert declaration.
 
 Example:
@@ -371,7 +382,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('templateTypeParmDecl0')"><a name="templateTypeParmDecl0Anchor">templateTypeParmDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmDecl.html">TemplateTypeParmDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('templateTypeParmDecl0')"><a name="templateTypeParmDecl0Anchor">templateTypeParmDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmDecl.html">TemplateTypeParmDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="templateTypeParmDecl0"><pre>Matches template type parameter declarations.
 
 Given
@@ -381,7 +392,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('translationUnitDecl0')"><a name="translationUnitDecl0Anchor">translationUnitDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TranslationUnitDecl.html">TranslationUnitDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('translationUnitDecl0')"><a name="translationUnitDecl0Anchor">translationUnitDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TranslationUnitDecl.html">TranslationUnitDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="translationUnitDecl0"><pre>Matches the top declaration context.
 
 Given
@@ -394,17 +405,40 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('typedefDecl0')"><a name="typedefDecl0Anchor">typedefDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefDecl.html">TypedefDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('typeAliasDecl0')"><a name="typeAliasDecl0Anchor">typeAliasDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeAliasDecl.html">TypeAliasDecl</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="typeAliasDecl0"><pre>Matches type alias declarations.
+
+Given
+  typedef int X;
+  using Y = int;
+typeAliasDecl()
+  matches "using Y = int", but not "typedef int X"
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('typedefDecl0')"><a name="typedefDecl0Anchor">typedefDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefDecl.html">TypedefDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="typedefDecl0"><pre>Matches typedef declarations.
 
 Given
   typedef int X;
+  using Y = int;
 typedefDecl()
-  matches "typedef int X"
+  matches "typedef int X", but not "using Y = int"
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('unresolvedUsingTypenameDecl0')"><a name="unresolvedUsingTypenameDecl0Anchor">unresolvedUsingTypenameDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingTypenameDecl.html">UnresolvedUsingTypenameDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('typedefNameDecl0')"><a name="typedefNameDecl0Anchor">typedefNameDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefNameDecl.html">TypedefNameDecl</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="typedefNameDecl0"><pre>Matches typedef name declarations.
+
+Given
+  typedef int X;
+  using Y = int;
+typedefNameDecl()
+  matches "typedef int X" and "using Y = int"
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('unresolvedUsingTypenameDecl0')"><a name="unresolvedUsingTypenameDecl0Anchor">unresolvedUsingTypenameDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingTypenameDecl.html">UnresolvedUsingTypenameDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="unresolvedUsingTypenameDecl0"><pre>Matches unresolved using value declarations that involve the
 typename.
 
@@ -420,7 +454,7 @@
   matches using Base&lt;T&gt;::Foo </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('unresolvedUsingValueDecl0')"><a name="unresolvedUsingValueDecl0Anchor">unresolvedUsingValueDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingValueDecl.html">UnresolvedUsingValueDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('unresolvedUsingValueDecl0')"><a name="unresolvedUsingValueDecl0Anchor">unresolvedUsingValueDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingValueDecl.html">UnresolvedUsingValueDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="unresolvedUsingValueDecl0"><pre>Matches unresolved using value declarations.
 
 Given
@@ -432,7 +466,7 @@
   matches using X::x </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('usingDecl0')"><a name="usingDecl0Anchor">usingDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingDecl.html">UsingDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('usingDecl0')"><a name="usingDecl0Anchor">usingDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingDecl.html">UsingDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="usingDecl0"><pre>Matches using declarations.
 
 Given
@@ -442,7 +476,7 @@
   matches using X::x </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('usingDirectiveDecl0')"><a name="usingDirectiveDecl0Anchor">usingDirectiveDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingDirectiveDecl.html">UsingDirectiveDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('usingDirectiveDecl0')"><a name="usingDirectiveDecl0Anchor">usingDirectiveDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingDirectiveDecl.html">UsingDirectiveDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="usingDirectiveDecl0"><pre>Matches using namespace declarations.
 
 Given
@@ -452,7 +486,7 @@
   matches using namespace X </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('valueDecl0')"><a name="valueDecl0Anchor">valueDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('valueDecl0')"><a name="valueDecl0Anchor">valueDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="valueDecl0"><pre>Matches any value declaration.
 
 Example matches A, B, C and F
@@ -461,7 +495,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('varDecl0')"><a name="varDecl0Anchor">varDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('varDecl0')"><a name="varDecl0Anchor">varDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="varDecl0"><pre>Matches variable declarations.
 
 Note: this does not match declarations of member variables, which are
@@ -472,12 +506,12 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;</td><td class="name" onclick="toggle('nestedNameSpecifierLoc0')"><a name="nestedNameSpecifierLoc0Anchor">nestedNameSpecifierLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;</td><td class="name" onclick="toggle('nestedNameSpecifierLoc0')"><a name="nestedNameSpecifierLoc0Anchor">nestedNameSpecifierLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="nestedNameSpecifierLoc0"><pre>Same as nestedNameSpecifier but matches NestedNameSpecifierLoc.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('nestedNameSpecifier0')"><a name="nestedNameSpecifier0Anchor">nestedNameSpecifier</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('nestedNameSpecifier0')"><a name="nestedNameSpecifier0Anchor">nestedNameSpecifier</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="nestedNameSpecifier0"><pre>Matches nested name specifiers.
 
 Given
@@ -492,12 +526,24 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('qualType0')"><a name="qualType0Anchor">qualType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('qualType0')"><a name="qualType0Anchor">qualType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="qualType0"><pre>Matches QualTypes in the clang AST.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('arraySubscriptExpr0')"><a name="arraySubscriptExpr0Anchor">arraySubscriptExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('addrLabelExpr0')"><a name="addrLabelExpr0Anchor">addrLabelExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="addrLabelExpr0"><pre>Matches address of label statements (GNU extension).
+
+Given
+  FOO: bar();
+  void *ptr = &amp;&amp;FOO;
+  goto *ptr;
+addrLabelExpr()
+  matches '&amp;&amp;FOO'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('arraySubscriptExpr0')"><a name="arraySubscriptExpr0Anchor">arraySubscriptExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="arraySubscriptExpr0"><pre>Matches array subscript expressions.
 
 Given
@@ -507,7 +553,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('asmStmt0')"><a name="asmStmt0Anchor">asmStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AsmStmt.html">AsmStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('asmStmt0')"><a name="asmStmt0Anchor">asmStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AsmStmt.html">AsmStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="asmStmt0"><pre>Matches asm statements.
 
  int i = 100;
@@ -517,7 +563,22 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('binaryOperator0')"><a name="binaryOperator0Anchor">binaryOperator</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('atomicExpr0')"><a name="atomicExpr0Anchor">atomicExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicExpr.html">AtomicExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="atomicExpr0"><pre>Matches atomic builtins.
+Example matches __atomic_load_n(ptr, 1)
+  void foo() { int *ptr; __atomic_load_n(ptr, 1); }
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('binaryConditionalOperator0')"><a name="binaryConditionalOperator0Anchor">binaryConditionalOperator</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryConditionalOperator.html">BinaryConditionalOperator</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="binaryConditionalOperator0"><pre>Matches binary conditional operator expressions (GNU extension).
+
+Example matches a ?: b
+  (a ?: b) + 42;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('binaryOperator0')"><a name="binaryOperator0Anchor">binaryOperator</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="binaryOperator0"><pre>Matches binary operator expressions.
 
 Example matches a || b
@@ -525,7 +586,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('breakStmt0')"><a name="breakStmt0Anchor">breakStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BreakStmt.html">BreakStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('breakStmt0')"><a name="breakStmt0Anchor">breakStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BreakStmt.html">BreakStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="breakStmt0"><pre>Matches break statements.
 
 Given
@@ -535,7 +596,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cStyleCastExpr0')"><a name="cStyleCastExpr0Anchor">cStyleCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CStyleCastExpr.html">CStyleCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cStyleCastExpr0')"><a name="cStyleCastExpr0Anchor">cStyleCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CStyleCastExpr.html">CStyleCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cStyleCastExpr0"><pre>Matches a C-style cast expression.
 
 Example: Matches (int*) 2.2f in
@@ -543,7 +604,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('callExpr0')"><a name="callExpr0Anchor">callExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('callExpr0')"><a name="callExpr0Anchor">callExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="callExpr0"><pre>Matches call expressions.
 
 Example matches x.y() and y()
@@ -553,7 +614,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('caseStmt0')"><a name="caseStmt0Anchor">caseStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CaseStmt.html">CaseStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('caseStmt0')"><a name="caseStmt0Anchor">caseStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CaseStmt.html">CaseStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="caseStmt0"><pre>Matches case statements inside switch statements.
 
 Given
@@ -563,7 +624,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('castExpr0')"><a name="castExpr0Anchor">castExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CastExpr.html">CastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('castExpr0')"><a name="castExpr0Anchor">castExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CastExpr.html">CastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="castExpr0"><pre>Matches any cast nodes of Clang's AST.
 
 Example: castExpr() matches each of the following:
@@ -576,26 +637,28 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('characterLiteral0')"><a name="characterLiteral0Anchor">characterLiteral</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('characterLiteral0')"><a name="characterLiteral0Anchor">characterLiteral</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="characterLiteral0"><pre>Matches character literals (also matches wchar_t).
 
 Not matching Hex-encoded chars (e.g. 0x1234, which is a IntegerLiteral),
 though.
 
 Example matches 'a', L'a'
-  char ch = 'a'; wchar_t chw = L'a';
+  char ch = 'a';
+  wchar_t chw = L'a';
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('compoundLiteralExpr0')"><a name="compoundLiteralExpr0Anchor">compoundLiteralExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundLiteralExpr.html">CompoundLiteralExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('compoundLiteralExpr0')"><a name="compoundLiteralExpr0Anchor">compoundLiteralExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundLiteralExpr.html">CompoundLiteralExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="compoundLiteralExpr0"><pre>Matches compound (i.e. non-scalar) literals
 
 Example match: {1}, (1, 2)
-  int array[4] = {1}; vector int myvec = (vector int)(1, 2);
+  int array[4] = {1};
+  vector int myvec = (vector int)(1, 2);
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('compoundStmt0')"><a name="compoundStmt0Anchor">compoundStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('compoundStmt0')"><a name="compoundStmt0Anchor">compoundStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="compoundStmt0"><pre>Matches compound statements.
 
 Example matches '{}' and '{{}}'in 'for (;;) {{}}'
@@ -603,7 +666,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('conditionalOperator0')"><a name="conditionalOperator0Anchor">conditionalOperator</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ConditionalOperator.html">ConditionalOperator</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('conditionalOperator0')"><a name="conditionalOperator0Anchor">conditionalOperator</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ConditionalOperator.html">ConditionalOperator</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="conditionalOperator0"><pre>Matches conditional operator expressions.
 
 Example matches a ? b : c
@@ -611,7 +674,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('continueStmt0')"><a name="continueStmt0Anchor">continueStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ContinueStmt.html">ContinueStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('continueStmt0')"><a name="continueStmt0Anchor">continueStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ContinueStmt.html">ContinueStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="continueStmt0"><pre>Matches continue statements.
 
 Given
@@ -621,7 +684,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cudaKernelCallExpr0')"><a name="cudaKernelCallExpr0Anchor">cudaKernelCallExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CUDAKernelCallExpr.html">CUDAKernelCallExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cudaKernelCallExpr0')"><a name="cudaKernelCallExpr0Anchor">cudaKernelCallExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CUDAKernelCallExpr.html">CUDAKernelCallExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cudaKernelCallExpr0"><pre>Matches CUDA kernel call expression.
 
 Example matches,
@@ -629,7 +692,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxBindTemporaryExpr0')"><a name="cxxBindTemporaryExpr0Anchor">cxxBindTemporaryExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBindTemporaryExpr.html">CXXBindTemporaryExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxBindTemporaryExpr0')"><a name="cxxBindTemporaryExpr0Anchor">cxxBindTemporaryExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBindTemporaryExpr.html">CXXBindTemporaryExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxBindTemporaryExpr0"><pre>Matches nodes where temporaries are created.
 
 Example matches FunctionTakesString(GetStringByValue())
@@ -639,7 +702,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxBoolLiteral0')"><a name="cxxBoolLiteral0Anchor">cxxBoolLiteral</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxBoolLiteral0')"><a name="cxxBoolLiteral0Anchor">cxxBoolLiteral</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxBoolLiteral0"><pre>Matches bool literals.
 
 Example matches true
@@ -647,7 +710,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxCatchStmt0')"><a name="cxxCatchStmt0Anchor">cxxCatchStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCatchStmt.html">CXXCatchStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxCatchStmt0')"><a name="cxxCatchStmt0Anchor">cxxCatchStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCatchStmt.html">CXXCatchStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxCatchStmt0"><pre>Matches catch statements.
 
   try {} catch(int i) {}
@@ -656,7 +719,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxConstCastExpr0')"><a name="cxxConstCastExpr0Anchor">cxxConstCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstCastExpr.html">CXXConstCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxConstCastExpr0')"><a name="cxxConstCastExpr0Anchor">cxxConstCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstCastExpr.html">CXXConstCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxConstCastExpr0"><pre>Matches a const_cast expression.
 
 Example: Matches const_cast&lt;int*&gt;(&amp;r) in
@@ -666,7 +729,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxConstructExpr0')"><a name="cxxConstructExpr0Anchor">cxxConstructExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxConstructExpr0')"><a name="cxxConstructExpr0Anchor">cxxConstructExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxConstructExpr0"><pre>Matches constructor call expressions (including implicit ones).
 
 Example matches string(ptr, n) and ptr within arguments of f
@@ -678,7 +741,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxDefaultArgExpr0')"><a name="cxxDefaultArgExpr0Anchor">cxxDefaultArgExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDefaultArgExpr.html">CXXDefaultArgExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxDefaultArgExpr0')"><a name="cxxDefaultArgExpr0Anchor">cxxDefaultArgExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDefaultArgExpr.html">CXXDefaultArgExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxDefaultArgExpr0"><pre>Matches the value of a default argument at the call site.
 
 Example matches the CXXDefaultArgExpr placeholder inserted for the
@@ -689,7 +752,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxDeleteExpr0')"><a name="cxxDeleteExpr0Anchor">cxxDeleteExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDeleteExpr.html">CXXDeleteExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxDeleteExpr0')"><a name="cxxDeleteExpr0Anchor">cxxDeleteExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDeleteExpr.html">CXXDeleteExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxDeleteExpr0"><pre>Matches delete expressions.
 
 Given
@@ -699,7 +762,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxDynamicCastExpr0')"><a name="cxxDynamicCastExpr0Anchor">cxxDynamicCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDynamicCastExpr.html">CXXDynamicCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxDynamicCastExpr0')"><a name="cxxDynamicCastExpr0Anchor">cxxDynamicCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXDynamicCastExpr.html">CXXDynamicCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxDynamicCastExpr0"><pre>Matches a dynamic_cast expression.
 
 Example:
@@ -713,7 +776,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxForRangeStmt0')"><a name="cxxForRangeStmt0Anchor">cxxForRangeStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxForRangeStmt0')"><a name="cxxForRangeStmt0Anchor">cxxForRangeStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxForRangeStmt0"><pre>Matches range-based for statements.
 
 cxxForRangeStmt() matches 'for (auto a : i)'
@@ -722,7 +785,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxFunctionalCastExpr0')"><a name="cxxFunctionalCastExpr0Anchor">cxxFunctionalCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXFunctionalCastExpr.html">CXXFunctionalCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxFunctionalCastExpr0')"><a name="cxxFunctionalCastExpr0Anchor">cxxFunctionalCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXFunctionalCastExpr.html">CXXFunctionalCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxFunctionalCastExpr0"><pre>Matches functional cast expressions
 
 Example: Matches Foo(bar);
@@ -732,7 +795,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxMemberCallExpr0')"><a name="cxxMemberCallExpr0Anchor">cxxMemberCallExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxMemberCallExpr0')"><a name="cxxMemberCallExpr0Anchor">cxxMemberCallExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxMemberCallExpr0"><pre>Matches member call expressions.
 
 Example matches x.y()
@@ -741,7 +804,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxNewExpr0')"><a name="cxxNewExpr0Anchor">cxxNewExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxNewExpr0')"><a name="cxxNewExpr0Anchor">cxxNewExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxNewExpr0"><pre>Matches new expressions.
 
 Given
@@ -751,12 +814,12 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxNullPtrLiteralExpr0')"><a name="cxxNullPtrLiteralExpr0Anchor">cxxNullPtrLiteralExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNullPtrLiteralExpr.html">CXXNullPtrLiteralExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxNullPtrLiteralExpr0')"><a name="cxxNullPtrLiteralExpr0Anchor">cxxNullPtrLiteralExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNullPtrLiteralExpr.html">CXXNullPtrLiteralExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxNullPtrLiteralExpr0"><pre>Matches nullptr literal.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxOperatorCallExpr0')"><a name="cxxOperatorCallExpr0Anchor">cxxOperatorCallExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxOperatorCallExpr0')"><a name="cxxOperatorCallExpr0Anchor">cxxOperatorCallExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxOperatorCallExpr0"><pre>Matches overloaded operator calls.
 
 Note that if an operator isn't overloaded, it won't match. Instead, use
@@ -772,7 +835,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxReinterpretCastExpr0')"><a name="cxxReinterpretCastExpr0Anchor">cxxReinterpretCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXReinterpretCastExpr.html">CXXReinterpretCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxReinterpretCastExpr0')"><a name="cxxReinterpretCastExpr0Anchor">cxxReinterpretCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXReinterpretCastExpr.html">CXXReinterpretCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxReinterpretCastExpr0"><pre>Matches a reinterpret_cast expression.
 
 Either the source expression or the destination type can be matched
@@ -784,11 +847,11 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxStaticCastExpr0')"><a name="cxxStaticCastExpr0Anchor">cxxStaticCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXStaticCastExpr.html">CXXStaticCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxStaticCastExpr0')"><a name="cxxStaticCastExpr0Anchor">cxxStaticCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXStaticCastExpr.html">CXXStaticCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxStaticCastExpr0"><pre>Matches a C++ static_cast expression.
 
-hasDestinationType
-reinterpretCast
+See also: hasDestinationType
+See also: cxxReinterpretCastExpr
 
 Example:
   cxxStaticCastExpr()
@@ -799,7 +862,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxTemporaryObjectExpr0')"><a name="cxxTemporaryObjectExpr0Anchor">cxxTemporaryObjectExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXTemporaryObjectExpr.html">CXXTemporaryObjectExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxTemporaryObjectExpr0')"><a name="cxxTemporaryObjectExpr0Anchor">cxxTemporaryObjectExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXTemporaryObjectExpr.html">CXXTemporaryObjectExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxTemporaryObjectExpr0"><pre>Matches functional cast expressions having N != 1 arguments
 
 Example: Matches Foo(bar, bar)
@@ -807,7 +870,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxThisExpr0')"><a name="cxxThisExpr0Anchor">cxxThisExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXThisExpr.html">CXXThisExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxThisExpr0')"><a name="cxxThisExpr0Anchor">cxxThisExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXThisExpr.html">CXXThisExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxThisExpr0"><pre>Matches implicit and explicit this expressions.
 
 Example matches the implicit this expression in "return i".
@@ -819,7 +882,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxThrowExpr0')"><a name="cxxThrowExpr0Anchor">cxxThrowExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXThrowExpr.html">CXXThrowExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxThrowExpr0')"><a name="cxxThrowExpr0Anchor">cxxThrowExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXThrowExpr.html">CXXThrowExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxThrowExpr0"><pre>Matches throw expressions.
 
   try { throw 5; } catch(int i) {}
@@ -828,7 +891,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxTryStmt0')"><a name="cxxTryStmt0Anchor">cxxTryStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXTryStmt.html">CXXTryStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxTryStmt0')"><a name="cxxTryStmt0Anchor">cxxTryStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXTryStmt.html">CXXTryStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxTryStmt0"><pre>Matches try statements.
 
   try {} catch(int i) {}
@@ -837,7 +900,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxUnresolvedConstructExpr0')"><a name="cxxUnresolvedConstructExpr0Anchor">cxxUnresolvedConstructExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXUnresolvedConstructExpr.html">CXXUnresolvedConstructExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxUnresolvedConstructExpr0')"><a name="cxxUnresolvedConstructExpr0Anchor">cxxUnresolvedConstructExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXUnresolvedConstructExpr.html">CXXUnresolvedConstructExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxUnresolvedConstructExpr0"><pre>Matches unresolved constructor call expressions.
 
 Example matches T(t) in return statement of f
@@ -847,7 +910,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('declRefExpr0')"><a name="declRefExpr0Anchor">declRefExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('declRefExpr0')"><a name="declRefExpr0Anchor">declRefExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="declRefExpr0"><pre>Matches expressions that refer to declarations.
 
 Example matches x in if (x)
@@ -856,7 +919,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('declStmt0')"><a name="declStmt0Anchor">declStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('declStmt0')"><a name="declStmt0Anchor">declStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="declStmt0"><pre>Matches declaration statements.
 
 Given
@@ -866,7 +929,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('defaultStmt0')"><a name="defaultStmt0Anchor">defaultStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DefaultStmt.html">DefaultStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('defaultStmt0')"><a name="defaultStmt0Anchor">defaultStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DefaultStmt.html">DefaultStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="defaultStmt0"><pre>Matches default statements inside switch statements.
 
 Given
@@ -876,7 +939,15 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('doStmt0')"><a name="doStmt0Anchor">doStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('designatedInitExpr0')"><a name="designatedInitExpr0Anchor">designatedInitExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DesignatedInitExpr.html">DesignatedInitExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="designatedInitExpr0"><pre>Matches C99 designated initializer expressions [C99 6.7.8].
+
+Example: Matches { [2].y = 1.0, [0].x = 1.0 }
+  point ptarray[10] = { [2].y = 1.0, [0].x = 1.0 };
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('doStmt0')"><a name="doStmt0Anchor">doStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="doStmt0"><pre>Matches do statements.
 
 Given
@@ -886,7 +957,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('explicitCastExpr0')"><a name="explicitCastExpr0Anchor">explicitCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ExplicitCastExpr.html">ExplicitCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('explicitCastExpr0')"><a name="explicitCastExpr0Anchor">explicitCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ExplicitCastExpr.html">ExplicitCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="explicitCastExpr0"><pre>Matches explicit cast expressions.
 
 Matches any cast expression written in user code, whether it be a
@@ -898,7 +969,7 @@
 Clang uses the term "cast" to apply to implicit conversions as well as to
 actual cast expressions.
 
-hasDestinationType.
+See also: hasDestinationType.
 
 Example: matches all five of the casts in
   int((int)(reinterpret_cast&lt;int&gt;(static_cast&lt;int&gt;(const_cast&lt;int&gt;(42)))))
@@ -907,7 +978,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('expr0')"><a name="expr0Anchor">expr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('expr0')"><a name="expr0Anchor">expr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="expr0"><pre>Matches expressions.
 
 Example matches x()
@@ -915,7 +986,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('exprWithCleanups0')"><a name="exprWithCleanups0Anchor">exprWithCleanups</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ExprWithCleanups.html">ExprWithCleanups</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('exprWithCleanups0')"><a name="exprWithCleanups0Anchor">exprWithCleanups</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ExprWithCleanups.html">ExprWithCleanups</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="exprWithCleanups0"><pre>Matches expressions that introduce cleanups to be run at the end
 of the sub-expression's evaluation.
 
@@ -924,7 +995,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('floatLiteral0')"><a name="floatLiteral0Anchor">floatLiteral</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('floatLiteral0')"><a name="floatLiteral0Anchor">floatLiteral</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="floatLiteral0"><pre>Matches float literals of all sizes encodings, e.g.
 1.0, 1.0f, 1.0L and 1e10.
 
@@ -933,7 +1004,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('forStmt0')"><a name="forStmt0Anchor">forStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('forStmt0')"><a name="forStmt0Anchor">forStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="forStmt0"><pre>Matches for statements.
 
 Example matches 'for (;;) {}'
@@ -942,12 +1013,12 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('gnuNullExpr0')"><a name="gnuNullExpr0Anchor">gnuNullExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1GNUNullExpr.html">GNUNullExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('gnuNullExpr0')"><a name="gnuNullExpr0Anchor">gnuNullExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1GNUNullExpr.html">GNUNullExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="gnuNullExpr0"><pre>Matches GNU __null expression.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('gotoStmt0')"><a name="gotoStmt0Anchor">gotoStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1GotoStmt.html">GotoStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('gotoStmt0')"><a name="gotoStmt0Anchor">gotoStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1GotoStmt.html">GotoStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="gotoStmt0"><pre>Matches goto statements.
 
 Given
@@ -958,7 +1029,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('ifStmt0')"><a name="ifStmt0Anchor">ifStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('ifStmt0')"><a name="ifStmt0Anchor">ifStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="ifStmt0"><pre>Matches if statements.
 
 Example matches 'if (x) {}'
@@ -966,7 +1037,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('implicitCastExpr0')"><a name="implicitCastExpr0Anchor">implicitCastExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ImplicitCastExpr.html">ImplicitCastExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('implicitCastExpr0')"><a name="implicitCastExpr0Anchor">implicitCastExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ImplicitCastExpr.html">ImplicitCastExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="implicitCastExpr0"><pre>Matches the implicit cast nodes of Clang's AST.
 
 This matches many different places, including function call return value
@@ -974,7 +1045,17 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('initListExpr0')"><a name="initListExpr0Anchor">initListExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('implicitValueInitExpr0')"><a name="implicitValueInitExpr0Anchor">implicitValueInitExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ImplicitValueInitExpr.html">ImplicitValueInitExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="implicitValueInitExpr0"><pre>Matches implicit initializers of init list expressions.
+
+Given
+  point ptarray[10] = { [2].y = 1.0, [2].x = 2.0, [0].x = 1.0 };
+implicitValueInitExpr()
+  matches "[0].y" (implicitly)
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('initListExpr0')"><a name="initListExpr0Anchor">initListExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="initListExpr0"><pre>Matches init list expressions.
 
 Given
@@ -986,7 +1067,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('integerLiteral0')"><a name="integerLiteral0Anchor">integerLiteral</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('integerLiteral0')"><a name="integerLiteral0Anchor">integerLiteral</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="integerLiteral0"><pre>Matches integer literals of all sizes encodings, e.g.
 1, 1L, 0x1 and 1U.
 
@@ -994,7 +1075,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('labelStmt0')"><a name="labelStmt0Anchor">labelStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('labelStmt0')"><a name="labelStmt0Anchor">labelStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="labelStmt0"><pre>Matches label statements.
 
 Given
@@ -1005,7 +1086,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('lambdaExpr0')"><a name="lambdaExpr0Anchor">lambdaExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LambdaExpr.html">LambdaExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('lambdaExpr0')"><a name="lambdaExpr0Anchor">lambdaExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LambdaExpr.html">LambdaExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="lambdaExpr0"><pre>Matches lambda expressions.
 
 Example matches [&amp;](){return 5;}
@@ -1013,7 +1094,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('materializeTemporaryExpr0')"><a name="materializeTemporaryExpr0Anchor">materializeTemporaryExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MaterializeTemporaryExpr.html">MaterializeTemporaryExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('materializeTemporaryExpr0')"><a name="materializeTemporaryExpr0Anchor">materializeTemporaryExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MaterializeTemporaryExpr.html">MaterializeTemporaryExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="materializeTemporaryExpr0"><pre>Matches nodes where temporaries are materialized.
 
 Example: Given
@@ -1029,7 +1110,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('memberExpr0')"><a name="memberExpr0Anchor">memberExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('memberExpr0')"><a name="memberExpr0Anchor">memberExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="memberExpr0"><pre>Matches member expressions.
 
 Given
@@ -1042,7 +1123,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('nullStmt0')"><a name="nullStmt0Anchor">nullStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NullStmt.html">NullStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('nullStmt0')"><a name="nullStmt0Anchor">nullStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NullStmt.html">NullStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="nullStmt0"><pre>Matches null statements.
 
   foo();;
@@ -1051,7 +1132,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('objcMessageExpr0')"><a name="objcMessageExpr0Anchor">objcMessageExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('objcMessageExpr0')"><a name="objcMessageExpr0Anchor">objcMessageExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="objcMessageExpr0"><pre>Matches ObjectiveC Message invocation expressions.
 
 The innermost message send invokes the "alloc" class method on the
@@ -1062,7 +1143,51 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('returnStmt0')"><a name="returnStmt0Anchor">returnStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReturnStmt.html">ReturnStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('opaqueValueExpr0')"><a name="opaqueValueExpr0Anchor">opaqueValueExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1OpaqueValueExpr.html">OpaqueValueExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="opaqueValueExpr0"><pre>Matches opaque value expressions. They are used as helpers
+to reference other expressions and can be found
+in BinaryConditionalOperators, for example.
+
+Example matches 'a'
+  (a ?: c) + 42;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('parenExpr0')"><a name="parenExpr0Anchor">parenExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenExpr.html">ParenExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="parenExpr0"><pre>Matches parentheses used in expressions.
+
+Example matches (foo() + 1)
+  int foo() { return 1; }
+  int a = (foo() + 1);
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('parenListExpr0')"><a name="parenListExpr0Anchor">parenListExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenListExpr.html">ParenListExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="parenListExpr0"><pre>Matches paren list expressions.
+ParenListExprs don't have a predefined type and are used for late parsing.
+In the final AST, they can be found in template declarations.
+
+Given
+  template&lt;typename T&gt; class X {
+    void f() {
+      X x(*this);
+      int a = 0, b = 1; int i = (a, b);
+    }
+  };
+parenListExpr() matches "*this" but does NOT match (a, b) because (a, b)
+has a predefined type and is a ParenExpr, not a ParenListExpr.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('predefinedExpr0')"><a name="predefinedExpr0Anchor">predefinedExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PredefinedExpr.html">PredefinedExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="predefinedExpr0"><pre>Matches predefined identifier expressions [C99 6.4.2.2].
+
+Example: Matches __func__
+  printf("%s", __func__);
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('returnStmt0')"><a name="returnStmt0Anchor">returnStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReturnStmt.html">ReturnStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="returnStmt0"><pre>Matches return statements.
 
 Given
@@ -1072,7 +1197,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('stmt0')"><a name="stmt0Anchor">stmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('stmt0')"><a name="stmt0Anchor">stmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="stmt0"><pre>Matches statements.
 
 Given
@@ -1082,15 +1207,24 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('stringLiteral0')"><a name="stringLiteral0Anchor">stringLiteral</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1StringLiteral.html">StringLiteral</a>&gt;...</td></tr>
-<tr><td colspan="4" class="doc" id="stringLiteral0"><pre>Matches string literals (also matches wide string literals).
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('stmtExpr0')"><a name="stmtExpr0Anchor">stmtExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1StmtExpr.html">StmtExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="stmtExpr0"><pre>Matches statement expression (GNU extension).
 
-Example matches "abcd", L"abcd"
-  char *s = "abcd"; wchar_t *ws = L"abcd"
+Example match: ({ int X = 4; X; })
+  int C = ({ int X = 4; X; });
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('substNonTypeTemplateParmExpr0')"><a name="substNonTypeTemplateParmExpr0Anchor">substNonTypeTemplateParmExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1SubstNonTypeTemplateParmExpr.html">SubstNonTypeTemplateParmExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('stringLiteral0')"><a name="stringLiteral0Anchor">stringLiteral</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1StringLiteral.html">StringLiteral</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="stringLiteral0"><pre>Matches string literals (also matches wide string literals).
+
+Example matches "abcd", L"abcd"
+  char *s = "abcd";
+  wchar_t *ws = L"abcd";
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('substNonTypeTemplateParmExpr0')"><a name="substNonTypeTemplateParmExpr0Anchor">substNonTypeTemplateParmExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SubstNonTypeTemplateParmExpr.html">SubstNonTypeTemplateParmExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="substNonTypeTemplateParmExpr0"><pre>Matches substitutions of non-type template parameters.
 
 Given
@@ -1102,7 +1236,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('switchCase0')"><a name="switchCase0Anchor">switchCase</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchCase.html">SwitchCase</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('switchCase0')"><a name="switchCase0Anchor">switchCase</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchCase.html">SwitchCase</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="switchCase0"><pre>Matches case and default statements inside switch statements.
 
 Given
@@ -1112,7 +1246,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('switchStmt0')"><a name="switchStmt0Anchor">switchStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('switchStmt0')"><a name="switchStmt0Anchor">switchStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="switchStmt0"><pre>Matches switch statements.
 
 Given
@@ -1122,7 +1256,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('unaryExprOrTypeTraitExpr0')"><a name="unaryExprOrTypeTraitExpr0Anchor">unaryExprOrTypeTraitExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('unaryExprOrTypeTraitExpr0')"><a name="unaryExprOrTypeTraitExpr0Anchor">unaryExprOrTypeTraitExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="unaryExprOrTypeTraitExpr0"><pre>Matches sizeof (C99), alignof (C++11) and vec_step (OpenCL)
 
 Given
@@ -1133,7 +1267,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('unaryOperator0')"><a name="unaryOperator0Anchor">unaryOperator</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryOperator.html">UnaryOperator</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('unaryOperator0')"><a name="unaryOperator0Anchor">unaryOperator</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryOperator.html">UnaryOperator</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="unaryOperator0"><pre>Matches unary operator expressions.
 
 Example matches !a
@@ -1141,14 +1275,29 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('userDefinedLiteral0')"><a name="userDefinedLiteral0Anchor">userDefinedLiteral</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UserDefinedLiteral.html">UserDefinedLiteral</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('unresolvedLookupExpr0')"><a name="unresolvedLookupExpr0Anchor">unresolvedLookupExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedLookupExpr.html">UnresolvedLookupExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="unresolvedLookupExpr0"><pre>Matches reference to a name that can be looked up during parsing
+but could not be resolved to a specific declaration.
+
+Given
+  template&lt;typename T&gt;
+  T foo() { T a; return a; }
+  template&lt;typename T&gt;
+  void bar() {
+    foo&lt;T&gt;();
+  }
+unresolvedLookupExpr()
+  matches foo&lt;T&gt;() </pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('userDefinedLiteral0')"><a name="userDefinedLiteral0Anchor">userDefinedLiteral</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UserDefinedLiteral.html">UserDefinedLiteral</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="userDefinedLiteral0"><pre>Matches user defined literal operator call.
 
 Example match: "foo"_suffix
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('whileStmt0')"><a name="whileStmt0Anchor">whileStmt</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('whileStmt0')"><a name="whileStmt0Anchor">whileStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="whileStmt0"><pre>Matches while statements.
 
 Given
@@ -1158,7 +1307,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('templateArgument0')"><a name="templateArgument0Anchor">templateArgument</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('templateArgument0')"><a name="templateArgument0Anchor">templateArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="templateArgument0"><pre>Matches template arguments.
 
 Given
@@ -1169,12 +1318,23 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('typeLoc0')"><a name="typeLoc0Anchor">typeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateName.html">TemplateName</a>&gt;</td><td class="name" onclick="toggle('templateName0')"><a name="templateName0Anchor">templateName</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateName.html">TemplateName</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="templateName0"><pre>Matches template name.
+
+Given
+  template &lt;typename T&gt; class X { };
+  X&lt;int&gt; xi;
+templateName()
+  matches 'X' in X&lt;int&gt;.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('typeLoc0')"><a name="typeLoc0Anchor">typeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="typeLoc0"><pre>Matches TypeLocs in the clang AST.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('arrayType0')"><a name="arrayType0Anchor">arrayType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('arrayType0')"><a name="arrayType0Anchor">arrayType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="arrayType0"><pre>Matches all kinds of arrays.
 
 Given
@@ -1186,7 +1346,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('atomicType0')"><a name="atomicType0Anchor">atomicType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('atomicType0')"><a name="atomicType0Anchor">atomicType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="atomicType0"><pre>Matches atomic types.
 
 Given
@@ -1196,7 +1356,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('autoType0')"><a name="autoType0Anchor">autoType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('autoType0')"><a name="autoType0Anchor">autoType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="autoType0"><pre>Matches types nodes representing C++11 auto types.
 
 Given:
@@ -1208,7 +1368,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('blockPointerType0')"><a name="blockPointerType0Anchor">blockPointerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('blockPointerType0')"><a name="blockPointerType0Anchor">blockPointerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="blockPointerType0"><pre>Matches block pointer types, i.e. types syntactically represented as
 "void (^)(int)".
 
@@ -1216,7 +1376,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('builtinType0')"><a name="builtinType0Anchor">builtinType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BuiltinType.html">BuiltinType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('builtinType0')"><a name="builtinType0Anchor">builtinType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BuiltinType.html">BuiltinType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="builtinType0"><pre>Matches builtin Types.
 
 Given
@@ -1230,7 +1390,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('complexType0')"><a name="complexType0Anchor">complexType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('complexType0')"><a name="complexType0Anchor">complexType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="complexType0"><pre>Matches C99 complex types.
 
 Given
@@ -1240,7 +1400,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('constantArrayType0')"><a name="constantArrayType0Anchor">constantArrayType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ConstantArrayType.html">ConstantArrayType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('constantArrayType0')"><a name="constantArrayType0Anchor">constantArrayType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ConstantArrayType.html">ConstantArrayType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="constantArrayType0"><pre>Matches C arrays with a specified constant size.
 
 Given
@@ -1254,7 +1414,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('decayedType0')"><a name="decayedType0Anchor">decayedType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DecayedType.html">DecayedType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('decayedType0')"><a name="decayedType0Anchor">decayedType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DecayedType.html">DecayedType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="decayedType0"><pre>Matches decayed type
 Example matches i[] in declaration of f.
     (matcher = valueDecl(hasType(decayedType(hasDecayedType(pointerType())))))
@@ -1266,7 +1426,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentSizedArrayType0')"><a name="dependentSizedArrayType0Anchor">dependentSizedArrayType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DependentSizedArrayType.html">DependentSizedArrayType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentSizedArrayType0')"><a name="dependentSizedArrayType0Anchor">dependentSizedArrayType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DependentSizedArrayType.html">DependentSizedArrayType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="dependentSizedArrayType0"><pre>Matches C++ arrays whose size is a value-dependent expression.
 
 Given
@@ -1279,7 +1439,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('elaboratedType0')"><a name="elaboratedType0Anchor">elaboratedType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('elaboratedType0')"><a name="elaboratedType0Anchor">elaboratedType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="elaboratedType0"><pre>Matches types specified with an elaborated type keyword or with a
 qualified name.
 
@@ -1299,7 +1459,34 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('functionType0')"><a name="functionType0Anchor">functionType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionType.html">FunctionType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('enumType0')"><a name="enumType0Anchor">enumType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="enumType0"><pre>Matches enum types.
+
+Given
+  enum C { Green };
+  enum class S { Red };
+
+  C c;
+  S s;
+
+enumType() matches the type of the variable declarations of both c and
+s.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('functionProtoType0')"><a name="functionProtoType0Anchor">functionProtoType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionProtoType.html">FunctionProtoType</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="functionProtoType0"><pre>Matches FunctionProtoType nodes.
+
+Given
+  int (*f)(int);
+  void g();
+functionProtoType()
+  matches "int (*f)(int)" and the type of "g" in C++ mode.
+  In C mode, "g" is not matched because it does not contain a prototype.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('functionType0')"><a name="functionType0Anchor">functionType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionType.html">FunctionType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="functionType0"><pre>Matches FunctionType nodes.
 
 Given
@@ -1310,7 +1497,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('incompleteArrayType0')"><a name="incompleteArrayType0Anchor">incompleteArrayType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IncompleteArrayType.html">IncompleteArrayType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('incompleteArrayType0')"><a name="incompleteArrayType0Anchor">incompleteArrayType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IncompleteArrayType.html">IncompleteArrayType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="incompleteArrayType0"><pre>Matches C arrays with unspecified size.
 
 Given
@@ -1322,7 +1509,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('injectedClassNameType0')"><a name="injectedClassNameType0Anchor">injectedClassNameType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('injectedClassNameType0')"><a name="injectedClassNameType0Anchor">injectedClassNameType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="injectedClassNameType0"><pre>Matches injected class name types.
 
 Example matches S s, but not S&lt;T&gt; s.
@@ -1334,7 +1521,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('lValueReferenceType0')"><a name="lValueReferenceType0Anchor">lValueReferenceType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LValueReferenceType.html">LValueReferenceType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('lValueReferenceType0')"><a name="lValueReferenceType0Anchor">lValueReferenceType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LValueReferenceType.html">LValueReferenceType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="lValueReferenceType0"><pre>Matches lvalue reference types.
 
 Given:
@@ -1351,7 +1538,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('memberPointerType0')"><a name="memberPointerType0Anchor">memberPointerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('memberPointerType0')"><a name="memberPointerType0Anchor">memberPointerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="memberPointerType0"><pre>Matches member pointer types.
 Given
   struct A { int i; }
@@ -1361,7 +1548,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('objcObjectPointerType0')"><a name="objcObjectPointerType0Anchor">objcObjectPointerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCObjectPointerType.html">ObjCObjectPointerType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('objcObjectPointerType0')"><a name="objcObjectPointerType0Anchor">objcObjectPointerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCObjectPointerType.html">ObjCObjectPointerType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="objcObjectPointerType0"><pre>Matches an Objective-C object pointer type, which is different from
 a pointer type, despite being syntactically similar.
 
@@ -1376,7 +1563,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('parenType0')"><a name="parenType0Anchor">parenType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenType.html">ParenType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('parenType0')"><a name="parenType0Anchor">parenType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenType.html">ParenType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="parenType0"><pre>Matches ParenType nodes.
 
 Given
@@ -1388,7 +1575,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('pointerType0')"><a name="pointerType0Anchor">pointerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('pointerType0')"><a name="pointerType0Anchor">pointerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="pointerType0"><pre>Matches pointer types, but does not match Objective-C object pointer
 types.
 
@@ -1405,7 +1592,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('rValueReferenceType0')"><a name="rValueReferenceType0Anchor">rValueReferenceType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RValueReferenceType.html">RValueReferenceType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('rValueReferenceType0')"><a name="rValueReferenceType0Anchor">rValueReferenceType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RValueReferenceType.html">RValueReferenceType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="rValueReferenceType0"><pre>Matches rvalue reference types.
 
 Given:
@@ -1422,7 +1609,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('recordType0')"><a name="recordType0Anchor">recordType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('recordType0')"><a name="recordType0Anchor">recordType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="recordType0"><pre>Matches record types (e.g. structs, classes).
 
 Given
@@ -1437,7 +1624,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('referenceType0')"><a name="referenceType0Anchor">referenceType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('referenceType0')"><a name="referenceType0Anchor">referenceType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="referenceType0"><pre>Matches both lvalue and rvalue reference types.
 
 Given
@@ -1453,7 +1640,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('substTemplateTypeParmType0')"><a name="substTemplateTypeParmType0Anchor">substTemplateTypeParmType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1SubstTemplateTypeParmType.html">SubstTemplateTypeParmType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('substTemplateTypeParmType0')"><a name="substTemplateTypeParmType0Anchor">substTemplateTypeParmType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SubstTemplateTypeParmType.html">SubstTemplateTypeParmType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="substTemplateTypeParmType0"><pre>Matches types that represent the result of substituting a type for a
 template type parameter.
 
@@ -1467,7 +1654,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateSpecializationType0')"><a name="templateSpecializationType0Anchor">templateSpecializationType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateSpecializationType0')"><a name="templateSpecializationType0Anchor">templateSpecializationType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="templateSpecializationType0"><pre>Matches template specialization types.
 
 Given
@@ -1482,7 +1669,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateTypeParmType0')"><a name="templateTypeParmType0Anchor">templateTypeParmType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateTypeParmType0')"><a name="templateTypeParmType0Anchor">templateTypeParmType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="templateTypeParmType0"><pre>Matches template type parameter types.
 
 Example matches T, but not int.
@@ -1491,12 +1678,12 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('type0')"><a name="type0Anchor">type</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('type0')"><a name="type0Anchor">type</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="type0"><pre>Matches Types in the clang AST.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('typedefType0')"><a name="typedefType0Anchor">typedefType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('typedefType0')"><a name="typedefType0Anchor">typedefType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="typedefType0"><pre>Matches typedef types.
 
 Given
@@ -1506,7 +1693,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('unaryTransformType0')"><a name="unaryTransformType0Anchor">unaryTransformType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryTransformType.html">UnaryTransformType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('unaryTransformType0')"><a name="unaryTransformType0Anchor">unaryTransformType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryTransformType.html">UnaryTransformType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="unaryTransformType0"><pre>Matches types nodes representing unary type transformations.
 
 Given:
@@ -1516,7 +1703,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('variableArrayType0')"><a name="variableArrayType0Anchor">variableArrayType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VariableArrayType.html">VariableArrayType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('variableArrayType0')"><a name="variableArrayType0Anchor">variableArrayType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VariableArrayType.html">VariableArrayType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="variableArrayType0"><pre>Matches C arrays with a specified size that is not an
 integer-constant-expression.
 
@@ -1588,7 +1775,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasOperatorName0')"><a name="hasOperatorName0Anchor">hasOperatorName</a></td><td>std::string Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasOperatorName0')"><a name="hasOperatorName0Anchor">hasOperatorName</a></td><td>std::string Name</td></tr>
 <tr><td colspan="4" class="doc" id="hasOperatorName0"><pre>Matches the operator Name of operator expressions (binary or
 unary).
 
@@ -1603,12 +1790,12 @@
 Example matches true (matcher = cxxBoolLiteral(equals(true)))
   true
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
-           Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
+           Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCatchStmt.html">CXXCatchStmt</a>&gt;</td><td class="name" onclick="toggle('isCatchAll0')"><a name="isCatchAll0Anchor">isCatchAll</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCatchStmt.html">CXXCatchStmt</a>&gt;</td><td class="name" onclick="toggle('isCatchAll0')"><a name="isCatchAll0Anchor">isCatchAll</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isCatchAll0"><pre>Matches a C++ catch statement that has a catch-all handler.
 
 Given
@@ -1624,7 +1811,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('argumentCountIs1')"><a name="argumentCountIs1Anchor">argumentCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('argumentCountIs1')"><a name="argumentCountIs1Anchor">argumentCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="argumentCountIs1"><pre>Checks that a call expression or a constructor call expression has
 a specific number of arguments (including absent default arguments).
 
@@ -1634,12 +1821,26 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('isListInitialization0')"><a name="isListInitialization0Anchor">isListInitialization</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('isListInitialization0')"><a name="isListInitialization0Anchor">isListInitialization</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isListInitialization0"><pre>Matches a constructor call expression which uses list initialization.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isCopyConstructor0')"><a name="isCopyConstructor0Anchor">isCopyConstructor</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('requiresZeroInitialization0')"><a name="requiresZeroInitialization0Anchor">requiresZeroInitialization</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="requiresZeroInitialization0"><pre>Matches a constructor call expression which requires
+zero initialization.
+
+Given
+void foo() {
+  struct point { double x; double y; };
+  point pt[2] = { { 1.0, 2.0 } };
+}
+initListExpr(has(cxxConstructExpr(requiresZeroInitialization())))
+will match the implicit array filler for pt[1].
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isCopyConstructor0')"><a name="isCopyConstructor0Anchor">isCopyConstructor</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isCopyConstructor0"><pre>Matches constructor declarations that are copy constructors.
 
 Given
@@ -1652,7 +1853,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isDefaultConstructor0')"><a name="isDefaultConstructor0Anchor">isDefaultConstructor</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isDefaultConstructor0')"><a name="isDefaultConstructor0Anchor">isDefaultConstructor</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isDefaultConstructor0"><pre>Matches constructor declarations that are default constructors.
 
 Given
@@ -1665,7 +1866,22 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicit0')"><a name="isExplicit0Anchor">isExplicit</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isDelegatingConstructor0')"><a name="isDelegatingConstructor0Anchor">isDelegatingConstructor</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isDelegatingConstructor0"><pre>Matches constructors that delegate to another constructor.
+
+Given
+  struct S {
+    S(); #1
+    S(int) {} #2
+    S(S &amp;&amp;) : S() {} #3
+  };
+  S::S() : S(0) {} #4
+cxxConstructorDecl(isDelegatingConstructor()) will match #3 and #4, but not
+#1 or #2.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicit0')"><a name="isExplicit0Anchor">isExplicit</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExplicit0"><pre>Matches constructor and conversion declarations that are marked with
 the explicit keyword.
 
@@ -1681,7 +1897,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isMoveConstructor0')"><a name="isMoveConstructor0Anchor">isMoveConstructor</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('isMoveConstructor0')"><a name="isMoveConstructor0Anchor">isMoveConstructor</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isMoveConstructor0"><pre>Matches constructor declarations that are move constructors.
 
 Given
@@ -1694,7 +1910,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConversionDecl.html">CXXConversionDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicit1')"><a name="isExplicit1Anchor">isExplicit</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConversionDecl.html">CXXConversionDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicit1')"><a name="isExplicit1Anchor">isExplicit</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExplicit1"><pre>Matches constructor and conversion declarations that are marked with
 the explicit keyword.
 
@@ -1710,7 +1926,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('isBaseInitializer0')"><a name="isBaseInitializer0Anchor">isBaseInitializer</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('isBaseInitializer0')"><a name="isBaseInitializer0Anchor">isBaseInitializer</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isBaseInitializer0"><pre>Matches a constructor initializer if it is initializing a base, as
 opposed to a member.
 
@@ -1728,7 +1944,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('isMemberInitializer0')"><a name="isMemberInitializer0Anchor">isMemberInitializer</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('isMemberInitializer0')"><a name="isMemberInitializer0Anchor">isMemberInitializer</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isMemberInitializer0"><pre>Matches a constructor initializer if it is initializing a member, as
 opposed to a base.
 
@@ -1746,7 +1962,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('isWritten0')"><a name="isWritten0Anchor">isWritten</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('isWritten0')"><a name="isWritten0Anchor">isWritten</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isWritten0"><pre>Matches a constructor initializer if it is explicitly written in
 code (as opposed to implicitly added by the compiler).
 
@@ -1761,7 +1977,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isConst0')"><a name="isConst0Anchor">isConst</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isConst0')"><a name="isConst0Anchor">isConst</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isConst0"><pre>Matches if the given method declaration is const.
 
 Given
@@ -1774,7 +1990,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isCopyAssignmentOperator0')"><a name="isCopyAssignmentOperator0Anchor">isCopyAssignmentOperator</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isCopyAssignmentOperator0')"><a name="isCopyAssignmentOperator0Anchor">isCopyAssignmentOperator</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isCopyAssignmentOperator0"><pre>Matches if the given method declaration declares a copy assignment
 operator.
 
@@ -1789,7 +2005,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isFinal1')"><a name="isFinal1Anchor">isFinal</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isFinal1')"><a name="isFinal1Anchor">isFinal</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isFinal1"><pre>Matches if the given method or class declaration is final.
 
 Given:
@@ -1806,7 +2022,22 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isOverride0')"><a name="isOverride0Anchor">isOverride</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isMoveAssignmentOperator0')"><a name="isMoveAssignmentOperator0Anchor">isMoveAssignmentOperator</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isMoveAssignmentOperator0"><pre>Matches if the given method declaration declares a move assignment
+operator.
+
+Given
+struct A {
+  A &amp;operator=(const A &amp;);
+  A &amp;operator=(A &amp;&amp;);
+};
+
+cxxMethodDecl(isMoveAssignmentOperator()) matches the second method but not
+the first one.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isOverride0')"><a name="isOverride0Anchor">isOverride</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isOverride0"><pre>Matches if the given method declaration overrides another method.
 
 Given
@@ -1822,7 +2053,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isPure0')"><a name="isPure0Anchor">isPure</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isPure0')"><a name="isPure0Anchor">isPure</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isPure0"><pre>Matches if the given method declaration is pure.
 
 Given
@@ -1834,7 +2065,20 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isVirtual0')"><a name="isVirtual0Anchor">isVirtual</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isUserProvided0')"><a name="isUserProvided0Anchor">isUserProvided</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isUserProvided0"><pre>Matches method declarations that are user-provided.
+
+Given
+  struct S {
+    S(); #1
+    S(const S &amp;) = default; #2
+    S(S &amp;&amp;) = delete; #3
+  };
+cxxConstructorDecl(isUserProvided()) will match #1, but not #2 or #3.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isVirtual0')"><a name="isVirtual0Anchor">isVirtual</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isVirtual0"><pre>Matches if the given method declaration is virtual.
 
 Given
@@ -1846,7 +2090,23 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;</td><td class="name" onclick="toggle('hasOverloadedOperatorName1')"><a name="hasOverloadedOperatorName1Anchor">hasOverloadedOperatorName</a></td><td>StringRef Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isVirtualAsWritten0')"><a name="isVirtualAsWritten0Anchor">isVirtualAsWritten</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isVirtualAsWritten0"><pre>Matches if the given method declaration has an explicit "virtual".
+
+Given
+  class A {
+   public:
+    virtual void x();
+  };
+  class B : public A {
+   public:
+    void x();
+  };
+  matches A::x but not B::x
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;</td><td class="name" onclick="toggle('hasOverloadedOperatorName1')"><a name="hasOverloadedOperatorName1Anchor">hasOverloadedOperatorName</a></td><td>StringRef Name</td></tr>
 <tr><td colspan="4" class="doc" id="hasOverloadedOperatorName1"><pre>Matches overloaded operator names.
 
 Matches overloaded operator names specified in strings without the
@@ -1863,16 +2123,16 @@
 cxxRecordDecl(hasMethod(hasOverloadedOperatorName("*")))
 matches the declaration of A.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isDerivedFrom1')"><a name="isDerivedFrom1Anchor">isDerivedFrom</a></td><td>std::string BaseName</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isDerivedFrom1')"><a name="isDerivedFrom1Anchor">isDerivedFrom</a></td><td>std::string BaseName</td></tr>
 <tr><td colspan="4" class="doc" id="isDerivedFrom1"><pre>Overloaded method as shortcut for isDerivedFrom(hasName(...)).
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicitTemplateSpecialization2')"><a name="isExplicitTemplateSpecialization2Anchor">isExplicitTemplateSpecialization</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicitTemplateSpecialization2')"><a name="isExplicitTemplateSpecialization2Anchor">isExplicitTemplateSpecialization</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExplicitTemplateSpecialization2"><pre>Matches explicit template specializations of function, class, or
 static member variable template instantiations.
 
@@ -1882,11 +2142,11 @@
 functionDecl(isExplicitTemplateSpecialization())
   matches the specialization A&lt;int&gt;().
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isFinal0')"><a name="isFinal0Anchor">isFinal</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isFinal0')"><a name="isFinal0Anchor">isFinal</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isFinal0"><pre>Matches if the given method or class declaration is final.
 
 Given:
@@ -1903,13 +2163,24 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isSameOrDerivedFrom1')"><a name="isSameOrDerivedFrom1Anchor">isSameOrDerivedFrom</a></td><td>std::string BaseName</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isLambda0')"><a name="isLambda0Anchor">isLambda</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isLambda0"><pre>Matches the generated class of lambda expressions.
+
+Given:
+  auto x = []{};
+
+cxxRecordDecl(isLambda()) matches the implicit class declaration of
+decltype(x)
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isSameOrDerivedFrom1')"><a name="isSameOrDerivedFrom1Anchor">isSameOrDerivedFrom</a></td><td>std::string BaseName</td></tr>
 <tr><td colspan="4" class="doc" id="isSameOrDerivedFrom1"><pre>Overloaded method as shortcut for
 isSameOrDerivedFrom(hasName(...)).
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isTemplateInstantiation2')"><a name="isTemplateInstantiation2Anchor">isTemplateInstantiation</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isTemplateInstantiation2')"><a name="isTemplateInstantiation2Anchor">isTemplateInstantiation</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isTemplateInstantiation2"><pre>Matches template instantiations of function, class, or static
 member variable template instantiations.
 
@@ -1926,11 +2197,11 @@
 cxxRecordDecl(hasName("::X"), isTemplateInstantiation())
   does not match, as X&lt;A&gt; is an explicit template specialization.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('argumentCountIs0')"><a name="argumentCountIs0Anchor">argumentCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('argumentCountIs0')"><a name="argumentCountIs0Anchor">argumentCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="argumentCountIs0"><pre>Checks that a call expression or a constructor call expression has
 a specific number of arguments (including absent default arguments).
 
@@ -1940,18 +2211,27 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals3')"><a name="equals3Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CastExpr.html">CastExpr</a>&gt;</td><td class="name" onclick="toggle('hasCastKind0')"><a name="hasCastKind0Anchor">hasCastKind</a></td><td>CastKind Kind</td></tr>
+<tr><td colspan="4" class="doc" id="hasCastKind0"><pre>Matches casts that have a given cast kind.
+
+Example: matches the implicit cast around 0
+(matcher = castExpr(hasCastKind(CK_NullToPointer)))
+  int *p = 0;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals3')"><a name="equals3Anchor">equals</a></td><td>ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals3"><pre>Matches literals that are equal to the given value.
 
 Example matches true (matcher = cxxBoolLiteral(equals(true)))
   true
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
-           Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
+           Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('templateArgumentCountIs0')"><a name="templateArgumentCountIs0Anchor">templateArgumentCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('templateArgumentCountIs0')"><a name="templateArgumentCountIs0Anchor">templateArgumentCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="templateArgumentCountIs0"><pre>Matches if the number of template arguments equals N.
 
 Given
@@ -1962,7 +2242,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;</td><td class="name" onclick="toggle('statementCountIs0')"><a name="statementCountIs0Anchor">statementCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;</td><td class="name" onclick="toggle('statementCountIs0')"><a name="statementCountIs0Anchor">statementCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="statementCountIs0"><pre>Checks that a compound statement contains a specific number of
 child statements.
 
@@ -1974,19 +2254,24 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ConstantArrayType.html">ConstantArrayType</a>&gt;</td><td class="name" onclick="toggle('hasSize0')"><a name="hasSize0Anchor">hasSize</a></td><td>unsigned N</td></tr>
-<tr><td colspan="4" class="doc" id="hasSize0"><pre>Matches ConstantArrayType nodes that have the specified size.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ConstantArrayType.html">ConstantArrayType</a>&gt;</td><td class="name" onclick="toggle('hasSize0')"><a name="hasSize0Anchor">hasSize</a></td><td>unsigned N</td></tr>
+<tr><td colspan="4" class="doc" id="hasSize0"><pre>Matches nodes that have the specified size.
 
 Given
   int a[42];
   int b[2 * 21];
   int c[41], d[43];
+  char *s = "abcd";
+  wchar_t *ws = L"abcd";
+  char *w = "a";
 constantArrayType(hasSize(42))
   matches "int a[42]" and "int b[2 * 21]"
+stringLiteral(hasSize(4))
+  matches "abcd", L"abcd"
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;</td><td class="name" onclick="toggle('declCountIs0')"><a name="declCountIs0Anchor">declCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;</td><td class="name" onclick="toggle('declCountIs0')"><a name="declCountIs0Anchor">declCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="declCountIs0"><pre>Matches declaration statements that contain a specific number of
 declarations.
 
@@ -1999,7 +2284,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode1')"><a name="equalsBoundNode1Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode1')"><a name="equalsBoundNode1Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
 <tr><td colspan="4" class="doc" id="equalsBoundNode1"><pre>Matches if a node equals a previously bound node.
 
 Matches a node if it equals the node previously bound to ID.
@@ -2022,7 +2307,14 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('hasAttr0')"><a name="hasAttr0Anchor">hasAttr</a></td><td>attr::Kind AttrKind</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('equalsNode0')"><a name="equalsNode0Anchor">equalsNode</a></td><td>const Decl* Other</td></tr>
+<tr><td colspan="4" class="doc" id="equalsNode0"><pre>Matches if a node equals another node.
+
+Decl has pointer identity in the AST.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('hasAttr0')"><a name="hasAttr0Anchor">hasAttr</a></td><td>attr::Kind AttrKind</td></tr>
 <tr><td colspan="4" class="doc" id="hasAttr0"><pre>Matches declaration that has a given attribute.
 
 Given
@@ -2033,7 +2325,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isExpansionInFileMatching0')"><a name="isExpansionInFileMatching0Anchor">isExpansionInFileMatching</a></td><td>std::string RegExp</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isExpansionInFileMatching0')"><a name="isExpansionInFileMatching0Anchor">isExpansionInFileMatching</a></td><td>std::string RegExp</td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInFileMatching0"><pre>Matches AST nodes that were expanded within files whose name is
 partially matching a given regex.
 
@@ -2044,11 +2336,11 @@
 ASTMatcher.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isExpansionInMainFile0')"><a name="isExpansionInMainFile0Anchor">isExpansionInMainFile</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isExpansionInMainFile0')"><a name="isExpansionInMainFile0Anchor">isExpansionInMainFile</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInMainFile0"><pre>Matches AST nodes that were expanded within the main-file.
 
 Example matches X but not Y
@@ -2058,11 +2350,11 @@
 Y.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isExpansionInSystemHeader0')"><a name="isExpansionInSystemHeader0Anchor">isExpansionInSystemHeader</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isExpansionInSystemHeader0')"><a name="isExpansionInSystemHeader0Anchor">isExpansionInSystemHeader</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInSystemHeader0"><pre>Matches AST nodes that were expanded within system-header-files.
 
 Example matches Y but not X
@@ -2072,17 +2364,17 @@
 SystemHeader.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isImplicit0')"><a name="isImplicit0Anchor">isImplicit</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isImplicit0')"><a name="isImplicit0Anchor">isImplicit</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isImplicit0"><pre>Matches a declaration that has been implicitly added
 by the compiler (eg. implicit defaultcopy constructors).
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPrivate0')"><a name="isPrivate0Anchor">isPrivate</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPrivate0')"><a name="isPrivate0Anchor">isPrivate</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isPrivate0"><pre>Matches private C++ declarations.
 
 Given
@@ -2092,11 +2384,11 @@
   private:   int c;
   };
 fieldDecl(isPrivate())
-  matches 'int c;' 
+  matches 'int c;'
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isProtected0')"><a name="isProtected0Anchor">isProtected</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isProtected0')"><a name="isProtected0Anchor">isProtected</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isProtected0"><pre>Matches protected C++ declarations.
 
 Given
@@ -2106,11 +2398,11 @@
   private:   int c;
   };
 fieldDecl(isProtected())
-  matches 'int b;' 
+  matches 'int b;'
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPublic0')"><a name="isPublic0Anchor">isPublic</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPublic0')"><a name="isPublic0Anchor">isPublic</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isPublic0"><pre>Matches public C++ declarations.
 
 Given
@@ -2120,22 +2412,79 @@
   private:   int c;
   };
 fieldDecl(isPublic())
-  matches 'int a;' 
+  matches 'int a;'
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;</td><td class="name" onclick="toggle('equals1')"><a name="equals1Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DesignatedInitExpr.html">DesignatedInitExpr</a>&gt;</td><td class="name" onclick="toggle('designatorCountIs0')"><a name="designatorCountIs0Anchor">designatorCountIs</a></td><td>unsigned N</td></tr>
+<tr><td colspan="4" class="doc" id="designatorCountIs0"><pre>Matches designated initializer expressions that contain
+a specific number of designators.
+
+Example: Given
+  point ptarray[10] = { [2].y = 1.0, [0].x = 1.0 };
+  point ptarray2[10] = { [2].y = 1.0, [2].x = 0.0, [0].x = 1.0 };
+designatorCountIs(2)
+  matches '{ [2].y = 1.0, [0].x = 1.0 }',
+  but not '{ [2].y = 1.0, [2].x = 0.0, [0].x = 1.0 }'.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt;</td><td class="name" onclick="toggle('hasBitWidth0')"><a name="hasBitWidth0Anchor">hasBitWidth</a></td><td>unsigned Width</td></tr>
+<tr><td colspan="4" class="doc" id="hasBitWidth0"><pre>Matches non-static data members that are bit-fields of the specified bit width.
+
+Given
+  class C {
+    int a : 2;
+    int b : 4;
+    int c : 2;
+  };
+fieldDecl(hasBitWidth(2))
+  matches 'int a;' and 'int c;' but not 'int b;'.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt;</td><td class="name" onclick="toggle('isBitField0')"><a name="isBitField0Anchor">isBitField</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isBitField0"><pre>Matches non-static data members that are bit-fields.
+
+Given
+  class C {
+    int a : 2;
+    int b;
+  };
+fieldDecl(isBitField())
+  matches 'int a;' but not 'int b;'.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;</td><td class="name" onclick="toggle('equals1')"><a name="equals1Anchor">equals</a></td><td>ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals1"><pre>Matches literals that are equal to the given value.
 
 Example matches true (matcher = cxxBoolLiteral(equals(true)))
   true
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
-           Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
+           Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasOverloadedOperatorName0')"><a name="hasOverloadedOperatorName0Anchor">hasOverloadedOperatorName</a></td><td>StringRef Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasDynamicExceptionSpec0')"><a name="hasDynamicExceptionSpec0Anchor">hasDynamicExceptionSpec</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="hasDynamicExceptionSpec0"><pre>Matches functions that have a dynamic exception specification.
+
+Given:
+  void f();
+  void g() noexcept;
+  void h() noexcept(true);
+  void i() noexcept(false);
+  void j() throw();
+  void k() throw(int);
+  void l() throw(...);
+functionDecl(hasDynamicExceptionSpec()) and
+  functionProtoType(hasDynamicExceptionSpec())
+  match the declarations of j, k, and l, but not f, g, h, or i.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasOverloadedOperatorName0')"><a name="hasOverloadedOperatorName0Anchor">hasOverloadedOperatorName</a></td><td>StringRef Name</td></tr>
 <tr><td colspan="4" class="doc" id="hasOverloadedOperatorName0"><pre>Matches overloaded operator names.
 
 Matches overloaded operator names specified in strings without the
@@ -2152,11 +2501,11 @@
 cxxRecordDecl(hasMethod(hasOverloadedOperatorName("*")))
 matches the declaration of A.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXOperatorCallExpr.html">CXXOperatorCallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isConstexpr1')"><a name="isConstexpr1Anchor">isConstexpr</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isConstexpr1')"><a name="isConstexpr1Anchor">isConstexpr</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isConstexpr1"><pre>Matches constexpr variable and function declarations.
 
 Given:
@@ -2169,22 +2518,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isDefinition2')"><a name="isDefinition2Anchor">isDefinition</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isDefinition2"><pre>Matches if a declaration has a body attached.
-
-Example matches A, va, fa
-  class A {};
-  class B;  Doesn't match, as it has no body.
-  int va;
-  extern int vb;  Doesn't match, as it doesn't define the variable.
-  void fa() {}
-  void fb();  Doesn't match, as it has no body.
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isDefaulted0')"><a name="isDefaulted0Anchor">isDefaulted</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isDefaulted0')"><a name="isDefaulted0Anchor">isDefaulted</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isDefaulted0"><pre>Matches defaulted function declarations.
 
 Given:
@@ -2195,7 +2529,22 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isDeleted0')"><a name="isDeleted0Anchor">isDeleted</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isDefinition2')"><a name="isDefinition2Anchor">isDefinition</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isDefinition2"><pre>Matches if a declaration has a body attached.
+
+Example matches A, va, fa
+  class A {};
+  class B;  Doesn't match, as it has no body.
+  int va;
+  extern int vb;  Doesn't match, as it doesn't define the variable.
+  void fa() {}
+  void fb();  Doesn't match, as it has no body.
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isDeleted0')"><a name="isDeleted0Anchor">isDeleted</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isDeleted0"><pre>Matches deleted function declarations.
 
 Given:
@@ -2206,7 +2555,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicitTemplateSpecialization0')"><a name="isExplicitTemplateSpecialization0Anchor">isExplicitTemplateSpecialization</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicitTemplateSpecialization0')"><a name="isExplicitTemplateSpecialization0Anchor">isExplicitTemplateSpecialization</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExplicitTemplateSpecialization0"><pre>Matches explicit template specializations of function, class, or
 static member variable template instantiations.
 
@@ -2216,11 +2565,11 @@
 functionDecl(isExplicitTemplateSpecialization())
   matches the specialization A&lt;int&gt;().
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isExternC0')"><a name="isExternC0Anchor">isExternC</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isExternC0')"><a name="isExternC0Anchor">isExternC</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExternC0"><pre>Matches extern "C" function declarations.
 
 Given:
@@ -2232,7 +2581,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isInline1')"><a name="isInline1Anchor">isInline</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isInline1')"><a name="isInline1Anchor">isInline</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isInline1"><pre>Matches function and namespace declarations that are marked with
 the inline keyword.
 
@@ -2247,7 +2596,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isNoThrow0')"><a name="isNoThrow0Anchor">isNoThrow</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isNoThrow0')"><a name="isNoThrow0Anchor">isNoThrow</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isNoThrow0"><pre>Matches functions that have a non-throwing exception specification.
 
 Given:
@@ -2256,12 +2605,12 @@
   void h() throw();
   void i() throw(int);
   void j() noexcept(false);
-functionDecl(isNoThrow())
-  matches the declarations of g, and h, but not f, i or j.
+functionDecl(isNoThrow()) and functionProtoType(isNoThrow())
+  match the declarations of g and h, but not f, i or j.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isTemplateInstantiation0')"><a name="isTemplateInstantiation0Anchor">isTemplateInstantiation</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isTemplateInstantiation0')"><a name="isTemplateInstantiation0Anchor">isTemplateInstantiation</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isTemplateInstantiation0"><pre>Matches template instantiations of function, class, or static
 member variable template instantiations.
 
@@ -2278,11 +2627,11 @@
 cxxRecordDecl(hasName("::X"), isTemplateInstantiation())
   does not match, as X&lt;A&gt; is an explicit template specialization.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isVariadic0')"><a name="isVariadic0Anchor">isVariadic</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isVariadic0')"><a name="isVariadic0Anchor">isVariadic</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isVariadic0"><pre>Matches if a function declaration is variadic.
 
 Example matches f, but not g or h. The function i will not match, even when
@@ -2294,29 +2643,87 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('parameterCountIs0')"><a name="parameterCountIs0Anchor">parameterCountIs</a></td><td>unsigned N</td></tr>
-<tr><td colspan="4" class="doc" id="parameterCountIs0"><pre>Matches FunctionDecls that have a specific parameter count.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('parameterCountIs0')"><a name="parameterCountIs0Anchor">parameterCountIs</a></td><td>unsigned N</td></tr>
+<tr><td colspan="4" class="doc" id="parameterCountIs0"><pre>Matches FunctionDecls and FunctionProtoTypes that have a
+specific parameter count.
 
 Given
   void f(int i) {}
   void g(int i, int j) {}
+  void h(int i, int j);
+  void j(int i);
+  void k(int x, int y, int z, ...);
 functionDecl(parameterCountIs(2))
-  matches g(int i, int j) {}
+  matches void g(int i, int j) {}
+functionProtoType(parameterCountIs(2))
+  matches void h(int i, int j)
+functionProtoType(parameterCountIs(3))
+  matches void k(int x, int y, int z, ...);
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals0')"><a name="equals0Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionProtoType.html">FunctionProtoType</a>&gt;</td><td class="name" onclick="toggle('hasDynamicExceptionSpec1')"><a name="hasDynamicExceptionSpec1Anchor">hasDynamicExceptionSpec</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="hasDynamicExceptionSpec1"><pre>Matches functions that have a dynamic exception specification.
+
+Given:
+  void f();
+  void g() noexcept;
+  void h() noexcept(true);
+  void i() noexcept(false);
+  void j() throw();
+  void k() throw(int);
+  void l() throw(...);
+functionDecl(hasDynamicExceptionSpec()) and
+  functionProtoType(hasDynamicExceptionSpec())
+  match the declarations of j, k, and l, but not f, g, h, or i.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionProtoType.html">FunctionProtoType</a>&gt;</td><td class="name" onclick="toggle('isNoThrow1')"><a name="isNoThrow1Anchor">isNoThrow</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isNoThrow1"><pre>Matches functions that have a non-throwing exception specification.
+
+Given:
+  void f();
+  void g() noexcept;
+  void h() throw();
+  void i() throw(int);
+  void j() noexcept(false);
+functionDecl(isNoThrow()) and functionProtoType(isNoThrow())
+  match the declarations of g and h, but not f, i or j.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionProtoType.html">FunctionProtoType</a>&gt;</td><td class="name" onclick="toggle('parameterCountIs1')"><a name="parameterCountIs1Anchor">parameterCountIs</a></td><td>unsigned N</td></tr>
+<tr><td colspan="4" class="doc" id="parameterCountIs1"><pre>Matches FunctionDecls and FunctionProtoTypes that have a
+specific parameter count.
+
+Given
+  void f(int i) {}
+  void g(int i, int j) {}
+  void h(int i, int j);
+  void j(int i);
+  void k(int x, int y, int z, ...);
+functionDecl(parameterCountIs(2))
+  matches void g(int i, int j) {}
+functionProtoType(parameterCountIs(2))
+  matches void h(int i, int j)
+functionProtoType(parameterCountIs(3))
+  matches void k(int x, int y, int z, ...);
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals0')"><a name="equals0Anchor">equals</a></td><td>ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals0"><pre>Matches literals that are equal to the given value.
 
 Example matches true (matcher = cxxBoolLiteral(equals(true)))
   true
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
-           Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;, Matcher&lt;CXXBoolLiteral&gt;,
+           Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('isArrow0')"><a name="isArrow0Anchor">isArrow</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('isArrow0')"><a name="isArrow0Anchor">isArrow</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isArrow0"><pre>Matches member expressions that are called with '-&gt;' as opposed
 to '.'.
 
@@ -2333,7 +2740,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('hasName0')"><a name="hasName0Anchor">hasName</a></td><td>std::string  Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('hasName0')"><a name="hasName0Anchor">hasName</a></td><td>std::string  Name</td></tr>
 <tr><td colspan="4" class="doc" id="hasName0"><pre>Matches NamedDecl nodes that have the specified name.
 
 Supports specifying enclosing namespaces or classes by prefixing the name
@@ -2348,7 +2755,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('matchesName0')"><a name="matchesName0Anchor">matchesName</a></td><td>std::string RegExp</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('matchesName0')"><a name="matchesName0Anchor">matchesName</a></td><td>std::string RegExp</td></tr>
 <tr><td colspan="4" class="doc" id="matchesName0"><pre>Matches NamedDecl nodes whose fully qualified names contain
 a substring matched by the given RegExp.
 
@@ -2364,7 +2771,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt;</td><td class="name" onclick="toggle('isAnonymous0')"><a name="isAnonymous0Anchor">isAnonymous</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt;</td><td class="name" onclick="toggle('isAnonymous0')"><a name="isAnonymous0Anchor">isAnonymous</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isAnonymous0"><pre>Matches anonymous namespace declarations.
 
 Given
@@ -2375,7 +2782,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt;</td><td class="name" onclick="toggle('isInline0')"><a name="isInline0Anchor">isInline</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt;</td><td class="name" onclick="toggle('isInline0')"><a name="isInline0Anchor">isInline</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isInline0"><pre>Matches function and namespace declarations that are marked with
 the inline keyword.
 
@@ -2390,7 +2797,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('argumentCountIs2')"><a name="argumentCountIs2Anchor">argumentCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('argumentCountIs2')"><a name="argumentCountIs2Anchor">argumentCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="argumentCountIs2"><pre>Checks that a call expression or a constructor call expression has
 a specific number of arguments (including absent default arguments).
 
@@ -2400,7 +2807,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasKeywordSelector0')"><a name="hasKeywordSelector0Anchor">hasKeywordSelector</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasKeywordSelector0')"><a name="hasKeywordSelector0Anchor">hasKeywordSelector</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasKeywordSelector0"><pre>Matches when the selector is a keyword selector
 
 objCMessageExpr(hasKeywordSelector()) matches the generated setFrame
@@ -2414,7 +2821,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasNullSelector0')"><a name="hasNullSelector0Anchor">hasNullSelector</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasNullSelector0')"><a name="hasNullSelector0Anchor">hasNullSelector</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasNullSelector0"><pre>Matches when the selector is the empty selector
 
 Matches only when the selector of the objCMessageExpr is NULL. This may
@@ -2422,7 +2829,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasSelector0')"><a name="hasSelector0Anchor">hasSelector</a></td><td>std::string BaseName</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasSelector0')"><a name="hasSelector0Anchor">hasSelector</a></td><td>std::string BaseName</td></tr>
 <tr><td colspan="4" class="doc" id="hasSelector0"><pre>Matches when BaseName == Selector.getAsString()
 
  matcher = objCMessageExpr(hasSelector("loadHTMLString:baseURL:"));
@@ -2432,7 +2839,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasUnarySelector0')"><a name="hasUnarySelector0Anchor">hasUnarySelector</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasUnarySelector0')"><a name="hasUnarySelector0Anchor">hasUnarySelector</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasUnarySelector0"><pre>Matches when the selector is a Unary Selector
 
  matcher = objCMessageExpr(matchesSelector(hasUnarySelector());
@@ -2442,7 +2849,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('matchesSelector0')"><a name="matchesSelector0Anchor">matchesSelector</a></td><td>std::string RegExp</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('matchesSelector0')"><a name="matchesSelector0Anchor">matchesSelector</a></td><td>std::string RegExp</td></tr>
 <tr><td colspan="4" class="doc" id="matchesSelector0"><pre>Matches ObjC selectors whose name contains
 a substring matched by the given RegExp.
  matcher = objCMessageExpr(matchesSelector("loadHTMLStringmatches the outer message expr in the code below, but NOT the message
@@ -2451,7 +2858,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('numSelectorArgs0')"><a name="numSelectorArgs0Anchor">numSelectorArgs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('numSelectorArgs0')"><a name="numSelectorArgs0Anchor">numSelectorArgs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="numSelectorArgs0"><pre>Matches when the selector has the specified number of arguments
 
  matcher = objCMessageExpr(numSelectorArgs(0));
@@ -2464,7 +2871,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('asString0')"><a name="asString0Anchor">asString</a></td><td>std::string Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('asString0')"><a name="asString0Anchor">asString</a></td><td>std::string Name</td></tr>
 <tr><td colspan="4" class="doc" id="asString0"><pre>Matches if the matched type is represented by the given string.
 
 Given
@@ -2475,7 +2882,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode3')"><a name="equalsBoundNode3Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode3')"><a name="equalsBoundNode3Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
 <tr><td colspan="4" class="doc" id="equalsBoundNode3"><pre>Matches if a node equals a previously bound node.
 
 Matches a node if it equals the node previously bound to ID.
@@ -2498,7 +2905,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasLocalQualifiers0')"><a name="hasLocalQualifiers0Anchor">hasLocalQualifiers</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasLocalQualifiers0')"><a name="hasLocalQualifiers0Anchor">hasLocalQualifiers</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasLocalQualifiers0"><pre>Matches QualType nodes that have local CV-qualifiers attached to
 the node, not hidden within a typedef.
 
@@ -2513,7 +2920,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isAnyCharacter0')"><a name="isAnyCharacter0Anchor">isAnyCharacter</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isAnyCharacter0')"><a name="isAnyCharacter0Anchor">isAnyCharacter</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isAnyCharacter0"><pre>Matches QualType nodes that are of character type.
 
 Given
@@ -2525,7 +2932,25 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isConstQualified0')"><a name="isConstQualified0Anchor">isConstQualified</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isAnyPointer0')"><a name="isAnyPointer0Anchor">isAnyPointer</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isAnyPointer0"><pre>Matches QualType nodes that are of any pointer type; this includes
+the Objective-C object pointer type, which is different despite being
+syntactically similar.
+
+Given
+  int *i = nullptr;
+
+  @interface Foo
+  @end
+  Foo *f;
+
+  int j;
+varDecl(hasType(isAnyPointer()))
+  matches "int *i" and "Foo *f", but not "int j".
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isConstQualified0')"><a name="isConstQualified0Anchor">isConstQualified</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isConstQualified0"><pre>Matches QualType nodes that are const-qualified, i.e., that
 include "top-level" const.
 
@@ -2542,7 +2967,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isInteger0')"><a name="isInteger0Anchor">isInteger</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isInteger0')"><a name="isInteger0Anchor">isInteger</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isInteger0"><pre>Matches QualType nodes that are of integer type.
 
 Given
@@ -2554,7 +2979,31 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isVolatileQualified0')"><a name="isVolatileQualified0Anchor">isVolatileQualified</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isSignedInteger0')"><a name="isSignedInteger0Anchor">isSignedInteger</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isSignedInteger0"><pre>Matches QualType nodes that are of signed integer type.
+
+Given
+  void a(int);
+  void b(unsigned long);
+  void c(double);
+functionDecl(hasAnyParameter(hasType(isSignedInteger())))
+matches "a(int)", but not "b(unsigned long)" and "c(double)".
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isUnsignedInteger0')"><a name="isUnsignedInteger0Anchor">isUnsignedInteger</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isUnsignedInteger0"><pre>Matches QualType nodes that are of unsigned integer type.
+
+Given
+  void a(int);
+  void b(unsigned long);
+  void c(double);
+functionDecl(hasAnyParameter(hasType(isUnsignedInteger())))
+matches "b(unsigned long)", but not "a(int)" and "c(double)".
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('isVolatileQualified0')"><a name="isVolatileQualified0Anchor">isVolatileQualified</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isVolatileQualified0"><pre>Matches QualType nodes that are volatile-qualified, i.e., that
 include "top-level" volatile.
 
@@ -2571,7 +3020,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;</td><td class="name" onclick="toggle('isClass0')"><a name="isClass0Anchor">isClass</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;</td><td class="name" onclick="toggle('isClass0')"><a name="isClass0Anchor">isClass</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isClass0"><pre>Matches RecordDecl object that are spelled with "class."
 
 Example matches C, but not S or U.
@@ -2581,7 +3030,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;</td><td class="name" onclick="toggle('isStruct0')"><a name="isStruct0Anchor">isStruct</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;</td><td class="name" onclick="toggle('isStruct0')"><a name="isStruct0Anchor">isStruct</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isStruct0"><pre>Matches RecordDecl object that are spelled with "struct."
 
 Example matches S, but not C or U.
@@ -2591,7 +3040,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;</td><td class="name" onclick="toggle('isUnion0')"><a name="isUnion0Anchor">isUnion</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordDecl.html">RecordDecl</a>&gt;</td><td class="name" onclick="toggle('isUnion0')"><a name="isUnion0Anchor">isUnion</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isUnion0"><pre>Matches RecordDecl object that are spelled with "union."
 
 Example matches U, but not C or S.
@@ -2601,7 +3050,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode0')"><a name="equalsBoundNode0Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode0')"><a name="equalsBoundNode0Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
 <tr><td colspan="4" class="doc" id="equalsBoundNode0"><pre>Matches if a node equals a previously bound node.
 
 Matches a node if it equals the node previously bound to ID.
@@ -2624,7 +3073,14 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('isExpansionInFileMatching1')"><a name="isExpansionInFileMatching1Anchor">isExpansionInFileMatching</a></td><td>std::string RegExp</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('equalsNode1')"><a name="equalsNode1Anchor">equalsNode</a></td><td>const Stmt* Other</td></tr>
+<tr><td colspan="4" class="doc" id="equalsNode1"><pre>Matches if a node equals another node.
+
+Stmt has pointer identity in the AST.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('isExpansionInFileMatching1')"><a name="isExpansionInFileMatching1Anchor">isExpansionInFileMatching</a></td><td>std::string RegExp</td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInFileMatching1"><pre>Matches AST nodes that were expanded within files whose name is
 partially matching a given regex.
 
@@ -2635,11 +3091,11 @@
 ASTMatcher.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('isExpansionInMainFile1')"><a name="isExpansionInMainFile1Anchor">isExpansionInMainFile</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('isExpansionInMainFile1')"><a name="isExpansionInMainFile1Anchor">isExpansionInMainFile</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInMainFile1"><pre>Matches AST nodes that were expanded within the main-file.
 
 Example matches X but not Y
@@ -2649,11 +3105,11 @@
 Y.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('isExpansionInSystemHeader1')"><a name="isExpansionInSystemHeader1Anchor">isExpansionInSystemHeader</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('isExpansionInSystemHeader1')"><a name="isExpansionInSystemHeader1Anchor">isExpansionInSystemHeader</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInSystemHeader1"><pre>Matches AST nodes that were expanded within system-header-files.
 
 Example matches Y but not X
@@ -2663,11 +3119,28 @@
 SystemHeader.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;</td><td class="name" onclick="toggle('isDefinition0')"><a name="isDefinition0Anchor">isDefinition</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1StringLiteral.html">StringLiteral</a>&gt;</td><td class="name" onclick="toggle('hasSize1')"><a name="hasSize1Anchor">hasSize</a></td><td>unsigned N</td></tr>
+<tr><td colspan="4" class="doc" id="hasSize1"><pre>Matches nodes that have the specified size.
+
+Given
+  int a[42];
+  int b[2 * 21];
+  int c[41], d[43];
+  char *s = "abcd";
+  wchar_t *ws = L"abcd";
+  char *w = "a";
+constantArrayType(hasSize(42))
+  matches "int a[42]" and "int b[2 * 21]"
+stringLiteral(hasSize(4))
+  matches "abcd", L"abcd"
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;</td><td class="name" onclick="toggle('isDefinition0')"><a name="isDefinition0Anchor">isDefinition</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isDefinition0"><pre>Matches if a declaration has a body attached.
 
 Example matches A, va, fa
@@ -2678,11 +3151,11 @@
   void fa() {}
   void fb();  Doesn't match, as it has no body.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('equalsIntegralValue0')"><a name="equalsIntegralValue0Anchor">equalsIntegralValue</a></td><td>std::string Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('equalsIntegralValue0')"><a name="equalsIntegralValue0Anchor">equalsIntegralValue</a></td><td>std::string Value</td></tr>
 <tr><td colspan="4" class="doc" id="equalsIntegralValue0"><pre>Matches a TemplateArgument of integral type with a given value.
 
 Note that 'Value' is a string as the template argument's value is
@@ -2698,7 +3171,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('isIntegral0')"><a name="isIntegral0Anchor">isIntegral</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('isIntegral0')"><a name="isIntegral0Anchor">isIntegral</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isIntegral0"><pre>Matches a TemplateArgument that is an integral value.
 
 Given
@@ -2711,7 +3184,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('templateArgumentCountIs1')"><a name="templateArgumentCountIs1Anchor">templateArgumentCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('templateArgumentCountIs1')"><a name="templateArgumentCountIs1Anchor">templateArgumentCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="templateArgumentCountIs1"><pre>Matches if the number of template arguments equals N.
 
 Given
@@ -2722,7 +3195,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('isExpansionInFileMatching2')"><a name="isExpansionInFileMatching2Anchor">isExpansionInFileMatching</a></td><td>std::string RegExp</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('isExpansionInFileMatching2')"><a name="isExpansionInFileMatching2Anchor">isExpansionInFileMatching</a></td><td>std::string RegExp</td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInFileMatching2"><pre>Matches AST nodes that were expanded within files whose name is
 partially matching a given regex.
 
@@ -2733,11 +3206,11 @@
 ASTMatcher.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('isExpansionInMainFile2')"><a name="isExpansionInMainFile2Anchor">isExpansionInMainFile</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('isExpansionInMainFile2')"><a name="isExpansionInMainFile2Anchor">isExpansionInMainFile</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInMainFile2"><pre>Matches AST nodes that were expanded within the main-file.
 
 Example matches X but not Y
@@ -2747,11 +3220,11 @@
 Y.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('isExpansionInSystemHeader2')"><a name="isExpansionInSystemHeader2Anchor">isExpansionInSystemHeader</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('isExpansionInSystemHeader2')"><a name="isExpansionInSystemHeader2Anchor">isExpansionInSystemHeader</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExpansionInSystemHeader2"><pre>Matches AST nodes that were expanded within system-header-files.
 
 Example matches Y but not X
@@ -2761,11 +3234,11 @@
 SystemHeader.h:
   class Y {};
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('booleanType0')"><a name="booleanType0Anchor">booleanType</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('booleanType0')"><a name="booleanType0Anchor">booleanType</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="booleanType0"><pre>Matches type bool.
 
 Given
@@ -2775,7 +3248,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode2')"><a name="equalsBoundNode2Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode2')"><a name="equalsBoundNode2Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
 <tr><td colspan="4" class="doc" id="equalsBoundNode2"><pre>Matches if a node equals a previously bound node.
 
 Matches a node if it equals the node previously bound to ID.
@@ -2798,7 +3271,25 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('voidType0')"><a name="voidType0Anchor">voidType</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('equalsNode2')"><a name="equalsNode2Anchor">equalsNode</a></td><td>const Type* Other</td></tr>
+<tr><td colspan="4" class="doc" id="equalsNode2"><pre>Matches if a node equals another node.
+
+Type has pointer identity in the AST.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('realFloatingPointType0')"><a name="realFloatingPointType0Anchor">realFloatingPointType</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="realFloatingPointType0"><pre>Matches any real floating-point type (float, double, long double).
+
+Given
+  int i;
+  float f;
+realFloatingPointType()
+  matches "float f" but not "int i"
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('voidType0')"><a name="voidType0Anchor">voidType</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="voidType0"><pre>Matches type void.
 
 Given
@@ -2808,7 +3299,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;</td><td class="name" onclick="toggle('ofKind0')"><a name="ofKind0Anchor">ofKind</a></td><td>UnaryExprOrTypeTrait Kind</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;</td><td class="name" onclick="toggle('ofKind0')"><a name="ofKind0Anchor">ofKind</a></td><td>UnaryExprOrTypeTrait Kind</td></tr>
 <tr><td colspan="4" class="doc" id="ofKind0"><pre>Matches unary expressions of a certain kind.
 
 Given
@@ -2819,7 +3310,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryOperator.html">UnaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasOperatorName1')"><a name="hasOperatorName1Anchor">hasOperatorName</a></td><td>std::string Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryOperator.html">UnaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasOperatorName1')"><a name="hasOperatorName1Anchor">hasOperatorName</a></td><td>std::string Name</td></tr>
 <tr><td colspan="4" class="doc" id="hasOperatorName1"><pre>Matches the operator Name of operator expressions (binary or
 unary).
 
@@ -2828,7 +3319,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasAutomaticStorageDuration0')"><a name="hasAutomaticStorageDuration0Anchor">hasAutomaticStorageDuration</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasAutomaticStorageDuration0')"><a name="hasAutomaticStorageDuration0Anchor">hasAutomaticStorageDuration</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasAutomaticStorageDuration0"><pre>Matches a variable declaration that has automatic storage duration.
 
 Example matches x, but not y, z, or a.
@@ -2842,7 +3333,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasGlobalStorage0')"><a name="hasGlobalStorage0Anchor">hasGlobalStorage</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasGlobalStorage0')"><a name="hasGlobalStorage0Anchor">hasGlobalStorage</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasGlobalStorage0"><pre>Matches a variable declaration that does not have local storage.
 
 Example matches y and z (matcher = varDecl(hasGlobalStorage())
@@ -2854,7 +3345,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasLocalStorage0')"><a name="hasLocalStorage0Anchor">hasLocalStorage</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasLocalStorage0')"><a name="hasLocalStorage0Anchor">hasLocalStorage</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasLocalStorage0"><pre>Matches a variable declaration that has function scope and is a
 non-static local variable.
 
@@ -2867,7 +3358,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasStaticStorageDuration0')"><a name="hasStaticStorageDuration0Anchor">hasStaticStorageDuration</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasStaticStorageDuration0')"><a name="hasStaticStorageDuration0Anchor">hasStaticStorageDuration</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasStaticStorageDuration0"><pre>Matches a variable declaration that has static storage duration.
 
 Example matches y and a, but not x or z.
@@ -2881,7 +3372,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasThreadStorageDuration0')"><a name="hasThreadStorageDuration0Anchor">hasThreadStorageDuration</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasThreadStorageDuration0')"><a name="hasThreadStorageDuration0Anchor">hasThreadStorageDuration</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="hasThreadStorageDuration0"><pre>Matches a variable declaration that has thread storage duration.
 
 Example matches z, but not x, z, or a.
@@ -2895,7 +3386,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isConstexpr0')"><a name="isConstexpr0Anchor">isConstexpr</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isConstexpr0')"><a name="isConstexpr0Anchor">isConstexpr</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isConstexpr0"><pre>Matches constexpr variable and function declarations.
 
 Given:
@@ -2908,7 +3399,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isDefinition1')"><a name="isDefinition1Anchor">isDefinition</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isDefinition1')"><a name="isDefinition1Anchor">isDefinition</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isDefinition1"><pre>Matches if a declaration has a body attached.
 
 Example matches A, va, fa
@@ -2919,11 +3410,11 @@
   void fa() {}
   void fb();  Doesn't match, as it has no body.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagDecl.html">TagDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isExceptionVariable0')"><a name="isExceptionVariable0Anchor">isExceptionVariable</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isExceptionVariable0')"><a name="isExceptionVariable0Anchor">isExceptionVariable</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExceptionVariable0"><pre>Matches a variable declaration that is an exception variable from
 a C++ catch block, or an Objective-C statement.
 
@@ -2936,7 +3427,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicitTemplateSpecialization1')"><a name="isExplicitTemplateSpecialization1Anchor">isExplicitTemplateSpecialization</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isExplicitTemplateSpecialization1')"><a name="isExplicitTemplateSpecialization1Anchor">isExplicitTemplateSpecialization</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExplicitTemplateSpecialization1"><pre>Matches explicit template specializations of function, class, or
 static member variable template instantiations.
 
@@ -2946,11 +3437,23 @@
 functionDecl(isExplicitTemplateSpecialization())
   matches the specialization A&lt;int&gt;().
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isTemplateInstantiation1')"><a name="isTemplateInstantiation1Anchor">isTemplateInstantiation</a></td><td></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isExternC1')"><a name="isExternC1Anchor">isExternC</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isExternC1"><pre>Matches extern "C" function or variable declarations.
+
+Given:
+  extern "C" void f() {}
+  extern "C" { void g() {} }
+  void h() {}
+functionDecl(isExternC())
+  matches the declaration of f and g, but not the declaration of h
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isTemplateInstantiation1')"><a name="isTemplateInstantiation1Anchor">isTemplateInstantiation</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isTemplateInstantiation1"><pre>Matches template instantiations of function, class, or static
 member variable template instantiations.
 
@@ -2967,11 +3470,11 @@
 cxxRecordDecl(hasName("::X"), isTemplateInstantiation())
   does not match, as X&lt;A&gt; is an explicit template specialization.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;internal::Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;&gt;</td><td class="name" onclick="toggle('isInstantiated0')"><a name="isInstantiated0Anchor">isInstantiated</a></td><td></td></tr>
+<tr><td>Matcher&lt;internal::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;&gt;</td><td class="name" onclick="toggle('isInstantiated0')"><a name="isInstantiated0Anchor">isInstantiated</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isInstantiated0"><pre>Matches declarations that are template instantiations or are inside
 template instantiations.
 
@@ -2984,7 +3487,34 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;internal::Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;&gt;</td><td class="name" onclick="toggle('isInTemplateInstantiation0')"><a name="isInTemplateInstantiation0Anchor">isInTemplateInstantiation</a></td><td></td></tr>
+<tr><td>Matcher&lt;internal::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;&gt;</td><td class="name" onclick="toggle('nullPointerConstant0')"><a name="nullPointerConstant0Anchor">nullPointerConstant</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="nullPointerConstant0"><pre>Matches expressions that resolve to a null pointer constant, such as
+GNU's __null, C++11's nullptr, or C's NULL macro.
+
+Given:
+  void *v1 = NULL;
+  void *v2 = nullptr;
+  void *v3 = __null; // GNU extension
+  char *cp = (char *)0;
+  int *ip = 0;
+  int i = 0;
+expr(nullPointerConstant())
+  matches the initializer for v1, v2, v3, cp, and ip. Does not match the
+  initializer for i.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;internal::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;&gt;</td><td class="name" onclick="toggle('hasAnyName0')"><a name="hasAnyName0Anchor">hasAnyName</a></td><td>StringRef, ..., StringRef</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyName0"><pre>Matches NamedDecl nodes that have any of the specified names.
+
+This matcher is only provided as a performance optimization of hasName.
+    hasAnyName(a, b, c)
+ is equivalent to, but faster than
+    anyOf(hasName(a), hasName(b), hasName(c))
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;internal::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;&gt;</td><td class="name" onclick="toggle('isInTemplateInstantiation0')"><a name="isInTemplateInstantiation0Anchor">isInTemplateInstantiation</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isInTemplateInstantiation0"><pre>Matches statements inside of a template instantiation.
 
 Given
@@ -3123,6 +3653,10 @@
 ChildT must be an AST base type.
 
 Usable as: Any Matcher
+Note that has is a direct matcher, so it also matches things like implicit
+casts and paren casts. If you are matching with expr then you should
+probably consider using ignoringParenImpCasts like:
+has(ignoringParenImpCasts(expr())).
 </pre></td></tr>
 
 
@@ -3138,218 +3672,38 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasBase0')"><a name="hasBase0Anchor">hasBase</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasBase0"><pre>Matches the base expression of an array subscript expression.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AbstractConditionalOperator.html">AbstractConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasCondition5')"><a name="hasCondition5Anchor">hasCondition</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasCondition5"><pre>Matches the condition expression of an if statement, for loop,
+switch statement or conditional operator.
 
-Given
-  int i[5];
-  void f() { i[1] = 42; }
-arraySubscriptExpression(hasBase(implicitCastExpr(
-    hasSourceExpression(declRefExpr()))))
-  matches i[1] with the declRefExpr() matching i
+Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
+  if (true) {}
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasIndex0')"><a name="hasIndex0Anchor">hasIndex</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasIndex0"><pre>Matches the index expression of an array subscript expression.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AbstractConditionalOperator.html">AbstractConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasFalseExpression0')"><a name="hasFalseExpression0Anchor">hasFalseExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasFalseExpression0"><pre>Matches the false branch expression of a conditional operator
+(binary or ternary).
 
-Given
-  int i[5];
-  void f() { i[1] = 42; }
-arraySubscriptExpression(hasIndex(integerLiteral()))
-  matches i[1] with the integerLiteral() matching 1
+Example matches b
+  condition ? a : b
+  condition ?: b
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasLHS1')"><a name="hasLHS1Anchor">hasLHS</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasLHS1"><pre>Matches the left hand side of binary operator expressions.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AbstractConditionalOperator.html">AbstractConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasTrueExpression0')"><a name="hasTrueExpression0Anchor">hasTrueExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasTrueExpression0"><pre>Matches the true branch expression of a conditional operator.
 
-Example matches a (matcher = binaryOperator(hasLHS()))
-  a || b
+Example 1 (conditional ternary operator): matches a
+  condition ? a : b
+
+Example 2 (conditional binary operator): matches opaqueValueExpr(condition)
+  condition ?: b
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasRHS1')"><a name="hasRHS1Anchor">hasRHS</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasRHS1"><pre>Matches the right hand side of binary operator expressions.
-
-Example matches b (matcher = binaryOperator(hasRHS()))
-  a || b
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayTypeLoc.html">ArrayTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasElementTypeLoc0')"><a name="hasElementTypeLoc0Anchor">hasElementTypeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasElementTypeLoc0"><pre>Matches arrays and C99 complex types that have a specific element
-type.
-
-Given
-  struct A {};
-  A a[7];
-  int b[7];
-arrayType(hasElementType(builtinType()))
-  matches "int b[7]"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;</td><td class="name" onclick="toggle('hasElementType0')"><a name="hasElementType0Anchor">hasElementType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasElementType0"><pre>Matches arrays and C99 complex types that have a specific element
-type.
-
-Given
-  struct A {};
-  A a[7];
-  int b[7];
-arrayType(hasElementType(builtinType()))
-  matches "int b[7]"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicTypeLoc.html">AtomicTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasValueTypeLoc0')"><a name="hasValueTypeLoc0Anchor">hasValueTypeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasValueTypeLoc0"><pre>Matches atomic types with a specific value type.
-
-Given
-  _Atomic(int) i;
-  _Atomic(float) f;
-atomicType(hasValueType(isInteger()))
- matches "_Atomic(int) i"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;</td><td class="name" onclick="toggle('hasValueType0')"><a name="hasValueType0Anchor">hasValueType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasValueType0"><pre>Matches atomic types with a specific value type.
-
-Given
-  _Atomic(int) i;
-  _Atomic(float) f;
-atomicType(hasValueType(isInteger()))
- matches "_Atomic(int) i"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;</td><td class="name" onclick="toggle('hasDeducedType0')"><a name="hasDeducedType0Anchor">hasDeducedType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeducedType0"><pre>Matches AutoType nodes where the deduced type is a specific type.
-
-Note: There is no TypeLoc for the deduced type and thus no
-getDeducedLoc() matcher.
-
-Given
-  auto a = 1;
-  auto b = 2.0;
-autoType(hasDeducedType(isInteger()))
-  matches "auto a"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasEitherOperand0')"><a name="hasEitherOperand0Anchor">hasEitherOperand</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasEitherOperand0"><pre>Matches if either the left hand side or the right hand side of a
-binary operator matches.
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasLHS0')"><a name="hasLHS0Anchor">hasLHS</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasLHS0"><pre>Matches the left hand side of binary operator expressions.
-
-Example matches a (matcher = binaryOperator(hasLHS()))
-  a || b
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasRHS0')"><a name="hasRHS0Anchor">hasRHS</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasRHS0"><pre>Matches the right hand side of binary operator expressions.
-
-Example matches b (matcher = binaryOperator(hasRHS()))
-  a || b
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerTypeLoc.html">BlockPointerTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc0')"><a name="pointeeLoc0Anchor">pointeeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="pointeeLoc0"><pre>Narrows PointerType (and similar) matchers to those where the
-pointee matches a given matcher.
-
-Given
-  int *a;
-  int const *b;
-  float const *f;
-pointerType(pointee(isConstQualified(), isInteger()))
-  matches "int const *b"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;</td><td class="name" onclick="toggle('pointee0')"><a name="pointee0Anchor">pointee</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="pointee0"><pre>Narrows PointerType (and similar) matchers to those where the
-pointee matches a given matcher.
-
-Given
-  int *a;
-  int const *b;
-  float const *f;
-pointerType(pointee(isConstQualified(), isInteger()))
-  matches "int const *b"
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('forEachArgumentWithParam1')"><a name="forEachArgumentWithParam1Anchor">forEachArgumentWithParam</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; ArgMatcher, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; ParamMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="forEachArgumentWithParam1"><pre>Matches all arguments and their respective ParmVarDecl.
-
-Given
-  void f(int i);
-  int y;
-  f(y);
-callExpr(declRefExpr(to(varDecl(hasName("y")))),
-parmVarDecl(hasType(isInteger())))
-  matches f(y);
-with declRefExpr(...)
-  matching int y
-and parmVarDecl(...)
-  matching int i
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnyArgument1')"><a name="hasAnyArgument1Anchor">hasAnyArgument</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasAnyArgument1"><pre>Matches any argument of a call expression or a constructor call
-expression.
-
-Given
-  void x(int, int, int) { int y; x(1, y, 42); }
-callExpr(hasAnyArgument(declRefExpr()))
-  matches x(1, y, 42)
-with hasAnyArgument(...)
-  matching y
-
-FIXME: Currently this will ignore parentheses and implicit casts on
-the argument before applying the inner matcher. We'll want to remove
-this to allow for greater control by the user once ignoreImplicit()
-has been implemented.
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgument1')"><a name="hasArgument1Anchor">hasArgument</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasArgument1"><pre>Matches the n'th argument of a call expression or a constructor
-call expression.
-
-Example matches y in x(y)
-    (matcher = callExpr(hasArgument(0, declRefExpr())))
-  void x(int) { int y; x(y); }
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration12')"><a name="hasDeclaration12Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration12"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration8')"><a name="hasDeclaration8Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration8"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -3361,16 +3715,246 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('forEachConstructorInitializer0')"><a name="forEachConstructorInitializer0Anchor">forEachConstructorInitializer</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasBase0')"><a name="hasBase0Anchor">hasBase</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasBase0"><pre>Matches the base expression of an array subscript expression.
+
+Given
+  int i[5];
+  void f() { i[1] = 42; }
+arraySubscriptExpr(hasBase(implicitCastExpr(
+    hasSourceExpression(declRefExpr()))))
+  matches i[1] with the declRefExpr() matching i
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasIndex0')"><a name="hasIndex0Anchor">hasIndex</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasIndex0"><pre>Matches the index expression of an array subscript expression.
+
+Given
+  int i[5];
+  void f() { i[1] = 42; }
+arraySubscriptExpr(hasIndex(integerLiteral()))
+  matches i[1] with the integerLiteral() matching 1
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasLHS1')"><a name="hasLHS1Anchor">hasLHS</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasLHS1"><pre>Matches the left hand side of binary operator expressions.
+
+Example matches a (matcher = binaryOperator(hasLHS()))
+  a || b
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArraySubscriptExpr.html">ArraySubscriptExpr</a>&gt;</td><td class="name" onclick="toggle('hasRHS1')"><a name="hasRHS1Anchor">hasRHS</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasRHS1"><pre>Matches the right hand side of binary operator expressions.
+
+Example matches b (matcher = binaryOperator(hasRHS()))
+  a || b
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayTypeLoc.html">ArrayTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasElementTypeLoc0')"><a name="hasElementTypeLoc0Anchor">hasElementTypeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="hasElementTypeLoc0"><pre>Matches arrays and C99 complex types that have a specific element
+type.
+
+Given
+  struct A {};
+  A a[7];
+  int b[7];
+arrayType(hasElementType(builtinType()))
+  matches "int b[7]"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;</td><td class="name" onclick="toggle('hasElementType0')"><a name="hasElementType0Anchor">hasElementType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="hasElementType0"><pre>Matches arrays and C99 complex types that have a specific element
+type.
+
+Given
+  struct A {};
+  A a[7];
+  int b[7];
+arrayType(hasElementType(builtinType()))
+  matches "int b[7]"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicTypeLoc.html">AtomicTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasValueTypeLoc0')"><a name="hasValueTypeLoc0Anchor">hasValueTypeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="hasValueTypeLoc0"><pre>Matches atomic types with a specific value type.
+
+Given
+  _Atomic(int) i;
+  _Atomic(float) f;
+atomicType(hasValueType(isInteger()))
+ matches "_Atomic(int) i"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;</td><td class="name" onclick="toggle('hasValueType0')"><a name="hasValueType0Anchor">hasValueType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="hasValueType0"><pre>Matches atomic types with a specific value type.
+
+Given
+  _Atomic(int) i;
+  _Atomic(float) f;
+atomicType(hasValueType(isInteger()))
+ matches "_Atomic(int) i"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AtomicType.html">AtomicType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;</td><td class="name" onclick="toggle('hasDeducedType0')"><a name="hasDeducedType0Anchor">hasDeducedType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeducedType0"><pre>Matches AutoType nodes where the deduced type is a specific type.
+
+Note: There is no TypeLoc for the deduced type and thus no
+getDeducedLoc() matcher.
+
+Given
+  auto a = 1;
+  auto b = 2.0;
+autoType(hasDeducedType(isInteger()))
+  matches "auto a"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasEitherOperand0')"><a name="hasEitherOperand0Anchor">hasEitherOperand</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasEitherOperand0"><pre>Matches if either the left hand side or the right hand side of a
+binary operator matches.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasLHS0')"><a name="hasLHS0Anchor">hasLHS</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasLHS0"><pre>Matches the left hand side of binary operator expressions.
+
+Example matches a (matcher = binaryOperator(hasLHS()))
+  a || b
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasRHS0')"><a name="hasRHS0Anchor">hasRHS</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasRHS0"><pre>Matches the right hand side of binary operator expressions.
+
+Example matches b (matcher = binaryOperator(hasRHS()))
+  a || b
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerTypeLoc.html">BlockPointerTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc0')"><a name="pointeeLoc0Anchor">pointeeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="pointeeLoc0"><pre>Narrows PointerType (and similar) matchers to those where the
+pointee matches a given matcher.
+
+Given
+  int *a;
+  int const *b;
+  float const *f;
+pointerType(pointee(isConstQualified(), isInteger()))
+  matches "int const *b"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;</td><td class="name" onclick="toggle('pointee0')"><a name="pointee0Anchor">pointee</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td colspan="4" class="doc" id="pointee0"><pre>Narrows PointerType (and similar) matchers to those where the
+pointee matches a given matcher.
+
+Given
+  int *a;
+  int const *b;
+  float const *f;
+pointerType(pointee(isConstQualified(), isInteger()))
+  matches "int const *b"
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('forEachArgumentWithParam1')"><a name="forEachArgumentWithParam1Anchor">forEachArgumentWithParam</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; ArgMatcher, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; ParamMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="forEachArgumentWithParam1"><pre>Matches all arguments and their respective ParmVarDecl.
+
+Given
+  void f(int i);
+  int y;
+  f(y);
+callExpr(
+  forEachArgumentWithParam(
+    declRefExpr(to(varDecl(hasName("y")))),
+    parmVarDecl(hasType(isInteger()))
+))
+  matches f(y);
+with declRefExpr(...)
+  matching int y
+and parmVarDecl(...)
+  matching int i
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnyArgument1')"><a name="hasAnyArgument1Anchor">hasAnyArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyArgument1"><pre>Matches any argument of a call expression or a constructor call
+expression.
+
+Given
+  void x(int, int, int) { int y; x(1, y, 42); }
+callExpr(hasAnyArgument(declRefExpr()))
+  matches x(1, y, 42)
+with hasAnyArgument(...)
+  matching y
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgument1')"><a name="hasArgument1Anchor">hasArgument</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasArgument1"><pre>Matches the n'th argument of a call expression or a constructor
+call expression.
+
+Example matches y in x(y)
+    (matcher = callExpr(hasArgument(0, declRefExpr())))
+  void x(int) { int y; x(y); }
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration13')"><a name="hasDeclaration13Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration13"><pre>Matches a node if the declaration associated with that node
+matches the given matcher.
+
+The associated declaration is:
+- for type nodes, the declaration of the underlying type
+- for CallExpr, the declaration of the callee
+- for MemberExpr, the declaration of the referenced member
+- for CXXConstructExpr, the declaration of the constructor
+
+Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
+function. e.g. various subtypes of clang::Type and various expressions.
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('forEachConstructorInitializer0')"><a name="forEachConstructorInitializer0Anchor">forEachConstructorInitializer</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachConstructorInitializer0"><pre>Matches each constructor initializer in a constructor definition.
 
 Given
@@ -3382,7 +3966,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyConstructorInitializer0')"><a name="hasAnyConstructorInitializer0Anchor">hasAnyConstructorInitializer</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructorDecl.html">CXXConstructorDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyConstructorInitializer0')"><a name="hasAnyConstructorInitializer0Anchor">hasAnyConstructorInitializer</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyConstructorInitializer0"><pre>Matches a constructor initializer.
 
 Given
@@ -3397,7 +3981,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('forField0')"><a name="forField0Anchor">forField</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('forField0')"><a name="forField0Anchor">forField</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forField0"><pre>Matches the field declaration of a constructor initializer.
 
 Given
@@ -3412,7 +3996,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('withInitializer0')"><a name="withInitializer0Anchor">withInitializer</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXCtorInitializer.html">CXXCtorInitializer</a>&gt;</td><td class="name" onclick="toggle('withInitializer0')"><a name="withInitializer0Anchor">withInitializer</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="withInitializer0"><pre>Matches the initializer expression of a constructor initializer.
 
 Given
@@ -3427,9 +4011,9 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody3')"><a name="hasBody3Anchor">hasBody</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasBody3"><pre>Matches a 'for', 'while', or 'do while' statement that has
-a given body.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody3')"><a name="hasBody3Anchor">hasBody</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasBody3"><pre>Matches a 'for', 'while', 'do while' statement or a function
+definition that has a given body.
 
 Given
   for (;;) {}
@@ -3440,7 +4024,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;</td><td class="name" onclick="toggle('hasLoopVariable0')"><a name="hasLoopVariable0Anchor">hasLoopVariable</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;</td><td class="name" onclick="toggle('hasLoopVariable0')"><a name="hasLoopVariable0Anchor">hasLoopVariable</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasLoopVariable0"><pre>Matches the initialization statement of a for loop.
 
 Example:
@@ -3450,7 +4034,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;</td><td class="name" onclick="toggle('hasRangeInit0')"><a name="hasRangeInit0Anchor">hasRangeInit</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXForRangeStmt.html">CXXForRangeStmt</a>&gt;</td><td class="name" onclick="toggle('hasRangeInit0')"><a name="hasRangeInit0Anchor">hasRangeInit</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasRangeInit0"><pre>Matches the range initialization statement of a for loop.
 
 Example:
@@ -3460,11 +4044,11 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('onImplicitObjectArgument0')"><a name="onImplicitObjectArgument0Anchor">onImplicitObjectArgument</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('onImplicitObjectArgument0')"><a name="onImplicitObjectArgument0Anchor">onImplicitObjectArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="onImplicitObjectArgument0"><pre></pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('on0')"><a name="on0Anchor">on</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('on0')"><a name="on0Anchor">on</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="on0"><pre>Matches on the implicit object argument of a member call expression.
 
 Example matches y.x()
@@ -3476,18 +4060,42 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('thisPointerType1')"><a name="thisPointerType1Anchor">thisPointerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('thisPointerType1')"><a name="thisPointerType1Anchor">thisPointerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="thisPointerType1"><pre>Overloaded to match the type's declaration.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('thisPointerType0')"><a name="thisPointerType0Anchor">thisPointerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMemberCallExpr.html">CXXMemberCallExpr</a>&gt;</td><td class="name" onclick="toggle('thisPointerType0')"><a name="thisPointerType0Anchor">thisPointerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="thisPointerType0"><pre>Matches if the expression's type either matches the specified
 matcher, or is a pointer to a type that matches the InnerMatcher.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('ofClass0')"><a name="ofClass0Anchor">ofClass</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('forEachOverridden0')"><a name="forEachOverridden0Anchor">forEachOverridden</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="forEachOverridden0"><pre>Matches each method overridden by the given method. This matcher may
+produce multiple matches.
+
+Given
+  class A { virtual void f(); };
+  class B : public A { void f(); };
+  class C : public B { void f(); };
+cxxMethodDecl(ofClass(hasName("C")),
+              forEachOverridden(cxxMethodDecl().bind("b"))).bind("d")
+  matches once, with "b" binding "A::f" and "d" binding "C::f" (Note
+  that B::f is not overridden by C::f).
+
+The check can produce multiple matches in case of multiple inheritance, e.g.
+  class A1 { virtual void f(); };
+  class A2 { virtual void f(); };
+  class C : public A1, public A2 { void f(); };
+cxxMethodDecl(ofClass(hasName("C")),
+              forEachOverridden(cxxMethodDecl().bind("b"))).bind("d")
+  matches twice, once with "b" binding "A1::f" and "d" binding "C::f", and
+  once with "b" binding "A2::f" and "d" binding "C::f".
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('ofClass0')"><a name="ofClass0Anchor">ofClass</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="ofClass0"><pre>Matches the class declaration that the given method declaration
 belongs to.
 
@@ -3506,7 +4114,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('hasMethod0')"><a name="hasMethod0Anchor">hasMethod</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('hasMethod0')"><a name="hasMethod0Anchor">hasMethod</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasMethod0"><pre>Matches the first method of a class or struct that satisfies InnerMatcher.
 
 Given:
@@ -3518,7 +4126,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isDerivedFrom0')"><a name="isDerivedFrom0Anchor">isDerivedFrom</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; Base</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isDerivedFrom0')"><a name="isDerivedFrom0Anchor">isDerivedFrom</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; Base</td></tr>
 <tr><td colspan="4" class="doc" id="isDerivedFrom0"><pre>Matches C++ classes that are directly or indirectly derived from
 a class matching Base.
 
@@ -3539,13 +4147,13 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isSameOrDerivedFrom0')"><a name="isSameOrDerivedFrom0Anchor">isSameOrDerivedFrom</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; Base</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('isSameOrDerivedFrom0')"><a name="isSameOrDerivedFrom0Anchor">isSameOrDerivedFrom</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; Base</td></tr>
 <tr><td colspan="4" class="doc" id="isSameOrDerivedFrom0"><pre>Similar to isDerivedFrom(), but also matches classes that directly
 match Base.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('callee1')"><a name="callee1Anchor">callee</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('callee1')"><a name="callee1Anchor">callee</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="callee1"><pre>Matches if the call expression's callee's declaration matches the
 given matcher.
 
@@ -3556,7 +4164,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('callee0')"><a name="callee0Anchor">callee</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('callee0')"><a name="callee0Anchor">callee</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="callee0"><pre>Matches if the call expression's callee expression matches.
 
 Given
@@ -3567,22 +4175,25 @@
 with callee(...)
   matching this-&gt;x, x, y.x, f respectively
 
-Note: Callee cannot take the more general internal::Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;
+Note: Callee cannot take the more general internal::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;
 because this introduces ambiguous overloads with calls to Callee taking a
-internal::Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, as the matcher hierarchy is purely
+internal::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;, as the matcher hierarchy is purely
 implemented in terms of implicit casts.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('forEachArgumentWithParam0')"><a name="forEachArgumentWithParam0Anchor">forEachArgumentWithParam</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; ArgMatcher, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; ParamMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('forEachArgumentWithParam0')"><a name="forEachArgumentWithParam0Anchor">forEachArgumentWithParam</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; ArgMatcher, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; ParamMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachArgumentWithParam0"><pre>Matches all arguments and their respective ParmVarDecl.
 
 Given
   void f(int i);
   int y;
   f(y);
-callExpr(declRefExpr(to(varDecl(hasName("y")))),
-parmVarDecl(hasType(isInteger())))
+callExpr(
+  forEachArgumentWithParam(
+    declRefExpr(to(varDecl(hasName("y")))),
+    parmVarDecl(hasType(isInteger()))
+))
   matches f(y);
 with declRefExpr(...)
   matching int y
@@ -3591,7 +4202,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnyArgument0')"><a name="hasAnyArgument0Anchor">hasAnyArgument</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnyArgument0')"><a name="hasAnyArgument0Anchor">hasAnyArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyArgument0"><pre>Matches any argument of a call expression or a constructor call
 expression.
 
@@ -3601,15 +4212,10 @@
   matches x(1, y, 42)
 with hasAnyArgument(...)
   matching y
-
-FIXME: Currently this will ignore parentheses and implicit casts on
-the argument before applying the inner matcher. We'll want to remove
-this to allow for greater control by the user once ignoreImplicit()
-has been implemented.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgument0')"><a name="hasArgument0Anchor">hasArgument</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgument0')"><a name="hasArgument0Anchor">hasArgument</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasArgument0"><pre>Matches the n'th argument of a call expression or a constructor
 call expression.
 
@@ -3619,8 +4225,8 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration13')"><a name="hasDeclaration13Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration13"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration14')"><a name="hasDeclaration14Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration14"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -3632,16 +4238,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CaseStmt.html">CaseStmt</a>&gt;</td><td class="name" onclick="toggle('hasCaseConstant0')"><a name="hasCaseConstant0Anchor">hasCaseConstant</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CaseStmt.html">CaseStmt</a>&gt;</td><td class="name" onclick="toggle('hasCaseConstant0')"><a name="hasCaseConstant0Anchor">hasCaseConstant</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasCaseConstant0"><pre>If the given case statement does not use the GNU case range
 extension, matches the constant given in the statement.
 
@@ -3652,45 +4258,53 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CastExpr.html">CastExpr</a>&gt;</td><td class="name" onclick="toggle('hasSourceExpression0')"><a name="hasSourceExpression0Anchor">hasSourceExpression</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasSourceExpression0"><pre>Matches if the cast's source expression matches the given matcher.
-
-Example: matches "a string" (matcher =
-                                 hasSourceExpression(cxxConstructExpr()))
-class URL { URL(string); };
-URL url = "a string";
-</pre></td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CastExpr.html">CastExpr</a>&gt;</td><td class="name" onclick="toggle('hasSourceExpression0')"><a name="hasSourceExpression0Anchor">hasSourceExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasSourceExpression0"><pre>Matches if the cast's source expression matches the given matcher.</pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument0')"><a name="hasAnyTemplateArgument0Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasAnyTemplateArgument0"><pre>Matches classTemplateSpecializations that have at least one
-TemplateArgument matching the given InnerMatcher.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument0')"><a name="hasAnyTemplateArgument0Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyTemplateArgument0"><pre>Matches classTemplateSpecializations, templateSpecializationType and
+functionDecl that have at least one TemplateArgument matching the given
+InnerMatcher.
 
 Given
   template&lt;typename T&gt; class A {};
   template&lt;&gt; class A&lt;double&gt; {};
   A&lt;int&gt; a;
+
+  template&lt;typename T&gt; void f() {};
+  void func() { f&lt;int&gt;(); };
+
 classTemplateSpecializationDecl(hasAnyTemplateArgument(
     refersToType(asString("int"))))
   matches the specialization A&lt;int&gt;
+
+functionDecl(hasAnyTemplateArgument(refersToType(asString("int"))))
+  matches the specialization f&lt;int&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('hasTemplateArgument0')"><a name="hasTemplateArgument0Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasTemplateArgument0"><pre>Matches classTemplateSpecializations where the n'th TemplateArgument
-matches the given InnerMatcher.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('hasTemplateArgument0')"><a name="hasTemplateArgument0Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasTemplateArgument0"><pre>Matches classTemplateSpecializations, templateSpecializationType and
+functionDecl where the n'th TemplateArgument matches the given InnerMatcher.
 
 Given
   template&lt;typename T, typename U&gt; class A {};
   A&lt;bool, int&gt; b;
   A&lt;int, bool&gt; c;
+
+  template&lt;typename T&gt; void f() {};
+  void func() { f&lt;int&gt;(); };
 classTemplateSpecializationDecl(hasTemplateArgument(
     1, refersToType(asString("int"))))
   matches the specialization A&lt;bool, int&gt;
+
+functionDecl(hasTemplateArgument(0, refersToType(asString("int"))))
+  matches the specialization f&lt;int&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexTypeLoc.html">ComplexTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasElementTypeLoc1')"><a name="hasElementTypeLoc1Anchor">hasElementTypeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexTypeLoc.html">ComplexTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasElementTypeLoc1')"><a name="hasElementTypeLoc1Anchor">hasElementTypeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="hasElementTypeLoc1"><pre>Matches arrays and C99 complex types that have a specific element
 type.
 
@@ -3701,11 +4315,11 @@
 arrayType(hasElementType(builtinType()))
   matches "int b[7]"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;</td><td class="name" onclick="toggle('hasElementType1')"><a name="hasElementType1Anchor">hasElementType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;</td><td class="name" onclick="toggle('hasElementType1')"><a name="hasElementType1Anchor">hasElementType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="hasElementType1"><pre>Matches arrays and C99 complex types that have a specific element
 type.
 
@@ -3716,13 +4330,13 @@
 arrayType(hasElementType(builtinType()))
   matches "int b[7]"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ArrayType.html">ArrayType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ComplexType.html">ComplexType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;</td><td class="name" onclick="toggle('hasAnySubstatement0')"><a name="hasAnySubstatement0Anchor">hasAnySubstatement</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;</td><td class="name" onclick="toggle('hasAnySubstatement0')"><a name="hasAnySubstatement0Anchor">hasAnySubstatement</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnySubstatement0"><pre>Matches compound statements where at least one substatement matches
-a given matcher.
+a given matcher. Also matches StmtExprs that have CompoundStmt as children.
 
 Given
   { {}; 1+2; }
@@ -3733,38 +4347,13 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ConditionalOperator.html">ConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasCondition4')"><a name="hasCondition4Anchor">hasCondition</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition4"><pre>Matches the condition expression of an if statement, for loop,
-or conditional operator.
-
-Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
-  if (true) {}
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ConditionalOperator.html">ConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasFalseExpression0')"><a name="hasFalseExpression0Anchor">hasFalseExpression</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasFalseExpression0"><pre>Matches the false branch expression of a conditional operator.
-
-Example matches b
-  condition ? a : b
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ConditionalOperator.html">ConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasTrueExpression0')"><a name="hasTrueExpression0Anchor">hasTrueExpression</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasTrueExpression0"><pre>Matches the true branch expression of a conditional operator.
-
-Example matches a
-  condition ? a : b
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DecayedType.html">DecayedType</a>&gt;</td><td class="name" onclick="toggle('hasDecayedType0')"><a name="hasDecayedType0Anchor">hasDecayedType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerType</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DecayedType.html">DecayedType</a>&gt;</td><td class="name" onclick="toggle('hasDecayedType0')"><a name="hasDecayedType0Anchor">hasDecayedType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerType</td></tr>
 <tr><td colspan="4" class="doc" id="hasDecayedType0"><pre>Matches the decayed type, whos decayed type matches InnerMatcher
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration11')"><a name="hasDeclaration11Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration11"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration12')"><a name="hasDeclaration12Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration12"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -3776,16 +4365,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('throughUsingDecl0')"><a name="throughUsingDecl0Anchor">throughUsingDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('throughUsingDecl0')"><a name="throughUsingDecl0Anchor">throughUsingDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="throughUsingDecl0"><pre>Matches a DeclRefExpr that refers to a declaration through a
 specific using shadow declaration.
 
@@ -3801,7 +4390,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('to0')"><a name="to0Anchor">to</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('to0')"><a name="to0Anchor">to</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="to0"><pre>Matches a DeclRefExpr that refers to a declaration that matches the
 specified matcher.
 
@@ -3812,7 +4401,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;</td><td class="name" onclick="toggle('containsDeclaration0')"><a name="containsDeclaration0Anchor">containsDeclaration</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;</td><td class="name" onclick="toggle('containsDeclaration0')"><a name="containsDeclaration0Anchor">containsDeclaration</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="containsDeclaration0"><pre>Matches the n'th declaration of a declaration statement.
 
 Note that this does not work for global declarations because the AST
@@ -3831,7 +4420,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;</td><td class="name" onclick="toggle('hasSingleDecl0')"><a name="hasSingleDecl0Anchor">hasSingleDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;</td><td class="name" onclick="toggle('hasSingleDecl0')"><a name="hasSingleDecl0Anchor">hasSingleDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasSingleDecl0"><pre>Matches the Decl of a DeclStmt which has a single declaration.
 
 Given
@@ -3842,7 +4431,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclaratorDecl.html">DeclaratorDecl</a>&gt;</td><td class="name" onclick="toggle('hasTypeLoc0')"><a name="hasTypeLoc0Anchor">hasTypeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt; Inner</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclaratorDecl.html">DeclaratorDecl</a>&gt;</td><td class="name" onclick="toggle('hasTypeLoc0')"><a name="hasTypeLoc0Anchor">hasTypeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt; Inner</td></tr>
 <tr><td colspan="4" class="doc" id="hasTypeLoc0"><pre>Matches if the type location of the declarator decl's type matches
 the inner matcher.
 
@@ -3853,7 +4442,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('hasDeclContext0')"><a name="hasDeclContext0Anchor">hasDeclContext</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('hasDeclContext0')"><a name="hasDeclContext0Anchor">hasDeclContext</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclContext0"><pre>Matches declarations whose declaration context, interpreted as a
 Decl, matches InnerMatcher.
 
@@ -3869,9 +4458,9 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody0')"><a name="hasBody0Anchor">hasBody</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasBody0"><pre>Matches a 'for', 'while', or 'do while' statement that has
-a given body.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody0')"><a name="hasBody0Anchor">hasBody</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasBody0"><pre>Matches a 'for', 'while', 'do while' statement or a function
+definition that has a given body.
 
 Given
   for (;;) {}
@@ -3882,16 +4471,16 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition3')"><a name="hasCondition3Anchor">hasCondition</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition3')"><a name="hasCondition3Anchor">hasCondition</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasCondition3"><pre>Matches the condition expression of an if statement, for loop,
-or conditional operator.
+switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;</td><td class="name" onclick="toggle('hasQualifier0')"><a name="hasQualifier0Anchor">hasQualifier</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;</td><td class="name" onclick="toggle('hasQualifier0')"><a name="hasQualifier0Anchor">hasQualifier</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasQualifier0"><pre>Matches ElaboratedTypes whose qualifier, a NestedNameSpecifier,
 matches InnerMatcher if the qualifier exists.
 
@@ -3908,7 +4497,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;</td><td class="name" onclick="toggle('namesType0')"><a name="namesType0Anchor">namesType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;</td><td class="name" onclick="toggle('namesType0')"><a name="namesType0Anchor">namesType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="namesType0"><pre>Matches ElaboratedTypes whose named type matches InnerMatcher.
 
 Given
@@ -3925,8 +4514,8 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration10')"><a name="hasDeclaration10Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration10"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration11')"><a name="hasDeclaration11Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration11"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -3938,16 +4527,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ExplicitCastExpr.html">ExplicitCastExpr</a>&gt;</td><td class="name" onclick="toggle('hasDestinationType0')"><a name="hasDestinationType0Anchor">hasDestinationType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ExplicitCastExpr.html">ExplicitCastExpr</a>&gt;</td><td class="name" onclick="toggle('hasDestinationType0')"><a name="hasDestinationType0Anchor">hasDestinationType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDestinationType0"><pre>Matches casts whose destination type matches a given matcher.
 
 (Note: Clang's AST refers to other conversions as "casts" too, and calls
@@ -3955,8 +4544,8 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('hasType2')"><a name="hasType2Anchor">hasType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasType2"><pre>Overloaded to match the declaration of the expression's or value
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('hasType3')"><a name="hasType3Anchor">hasType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasType3"><pre>Overloaded to match the declaration of the expression's or value
 declaration's type.
 
 In case of a value declaration (for example a variable declaration),
@@ -3970,22 +4559,24 @@
  class X {};
  void y(X &amp;x) { x; X z; }
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('hasType0')"><a name="hasType0Anchor">hasType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('hasType0')"><a name="hasType0Anchor">hasType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasType0"><pre>Matches if the expression's or declaration's type matches a type
 matcher.
 
 Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
             and z (matcher = varDecl(hasType(cxxRecordDecl(hasName("X")))))
+            and U (matcher = typedefDecl(hasType(asString("int"))))
  class X {};
  void y(X &amp;x) { x; X z; }
+ typedef int U;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringImpCasts0')"><a name="ignoringImpCasts0Anchor">ignoringImpCasts</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringImpCasts0')"><a name="ignoringImpCasts0Anchor">ignoringImpCasts</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="ignoringImpCasts0"><pre>Matches expressions that match InnerMatcher after any implicit casts
 are stripped off.
 
@@ -4008,7 +4599,26 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringParenCasts0')"><a name="ignoringParenCasts0Anchor">ignoringParenCasts</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringImplicit0')"><a name="ignoringImplicit0Anchor">ignoringImplicit</a></td><td>ast_matchers::Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="ignoringImplicit0"><pre>Matches expressions that match InnerMatcher after any implicit AST
+nodes are stripped off.
+
+Parentheses and explicit casts are not discarded.
+Given
+  class C {};
+  C a = C();
+  C b;
+  C c = b;
+The matcher
+   varDecl(hasInitializer(ignoringImplicit(cxxConstructExpr())))
+would match the declarations for a, b, and c.
+While
+   varDecl(hasInitializer(cxxConstructExpr()))
+would only match the declarations for b and c.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringParenCasts0')"><a name="ignoringParenCasts0Anchor">ignoringParenCasts</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="ignoringParenCasts0"><pre>Matches expressions that match InnerMatcher after parentheses and
 casts are stripped off.
 
@@ -4027,7 +4637,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringParenImpCasts0')"><a name="ignoringParenImpCasts0Anchor">ignoringParenImpCasts</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringParenImpCasts0')"><a name="ignoringParenImpCasts0Anchor">ignoringParenImpCasts</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="ignoringParenImpCasts0"><pre>Matches expressions that match InnerMatcher after implicit casts and
 parentheses are stripped off.
 
@@ -4050,9 +4660,9 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody1')"><a name="hasBody1Anchor">hasBody</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasBody1"><pre>Matches a 'for', 'while', or 'do while' statement that has
-a given body.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody1')"><a name="hasBody1Anchor">hasBody</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasBody1"><pre>Matches a 'for', 'while', 'do while' statement or a function
+definition that has a given body.
 
 Given
   for (;;) {}
@@ -4063,16 +4673,16 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition1')"><a name="hasCondition1Anchor">hasCondition</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition1')"><a name="hasCondition1Anchor">hasCondition</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasCondition1"><pre>Matches the condition expression of an if statement, for loop,
-or conditional operator.
+switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasIncrement0')"><a name="hasIncrement0Anchor">hasIncrement</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasIncrement0')"><a name="hasIncrement0Anchor">hasIncrement</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasIncrement0"><pre>Matches the increment statement of a for loop.
 
 Example:
@@ -4082,7 +4692,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasLoopInit0')"><a name="hasLoopInit0Anchor">hasLoopInit</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasLoopInit0')"><a name="hasLoopInit0Anchor">hasLoopInit</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasLoopInit0"><pre>Matches the initialization statement of a for loop.
 
 Example:
@@ -4092,7 +4702,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyParameter0')"><a name="hasAnyParameter0Anchor">hasAnyParameter</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyParameter0')"><a name="hasAnyParameter0Anchor">hasAnyParameter</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyParameter0"><pre>Matches any parameter of a function declaration.
 
 Does not match the 'this' parameter of a method.
@@ -4106,7 +4716,42 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasParameter0')"><a name="hasParameter0Anchor">hasParameter</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument2')"><a name="hasAnyTemplateArgument2Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyTemplateArgument2"><pre>Matches classTemplateSpecializations, templateSpecializationType and
+functionDecl that have at least one TemplateArgument matching the given
+InnerMatcher.
+
+Given
+  template&lt;typename T&gt; class A {};
+  template&lt;&gt; class A&lt;double&gt; {};
+  A&lt;int&gt; a;
+
+  template&lt;typename T&gt; void f() {};
+  void func() { f&lt;int&gt;(); };
+
+classTemplateSpecializationDecl(hasAnyTemplateArgument(
+    refersToType(asString("int"))))
+  matches the specialization A&lt;int&gt;
+
+functionDecl(hasAnyTemplateArgument(refersToType(asString("int"))))
+  matches the specialization f&lt;int&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasBody4')"><a name="hasBody4Anchor">hasBody</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasBody4"><pre>Matches a 'for', 'while', 'do while' statement or a function
+definition that has a given body.
+
+Given
+  for (;;) {}
+hasBody(compoundStmt())
+  matches 'for (;;) {}'
+with compoundStmt()
+  matching '{}'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasParameter0')"><a name="hasParameter0Anchor">hasParameter</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasParameter0"><pre>Matches the n'th parameter of a function declaration.
 
 Given
@@ -4118,7 +4763,27 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('returns0')"><a name="returns0Anchor">returns</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasTemplateArgument2')"><a name="hasTemplateArgument2Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasTemplateArgument2"><pre>Matches classTemplateSpecializations, templateSpecializationType and
+functionDecl where the n'th TemplateArgument matches the given InnerMatcher.
+
+Given
+  template&lt;typename T, typename U&gt; class A {};
+  A&lt;bool, int&gt; b;
+  A&lt;int, bool&gt; c;
+
+  template&lt;typename T&gt; void f() {};
+  void func() { f&lt;int&gt;(); };
+classTemplateSpecializationDecl(hasTemplateArgument(
+    1, refersToType(asString("int"))))
+  matches the specialization A&lt;bool, int&gt;
+
+functionDecl(hasTemplateArgument(0, refersToType(asString("int"))))
+  matches the specialization f&lt;int&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('returns0')"><a name="returns0Anchor">returns</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="returns0"><pre>Matches the return type of a function declaration.
 
 Given:
@@ -4128,16 +4793,16 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition0')"><a name="hasCondition0Anchor">hasCondition</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition0')"><a name="hasCondition0Anchor">hasCondition</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasCondition0"><pre>Matches the condition expression of an if statement, for loop,
-or conditional operator.
+switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasConditionVariableStatement0')"><a name="hasConditionVariableStatement0Anchor">hasConditionVariableStatement</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasConditionVariableStatement0')"><a name="hasConditionVariableStatement0Anchor">hasConditionVariableStatement</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasConditionVariableStatement0"><pre>Matches the condition variable statement in an if statement.
 
 Given
@@ -4147,7 +4812,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasElse0')"><a name="hasElse0Anchor">hasElse</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasElse0')"><a name="hasElse0Anchor">hasElse</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasElse0"><pre>Matches the else-statement of an if statement.
 
 Examples matches the if statement
@@ -4156,7 +4821,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasThen0')"><a name="hasThen0Anchor">hasThen</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasThen0')"><a name="hasThen0Anchor">hasThen</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasThen0"><pre>Matches the then-statement of an if statement.
 
 Examples matches the if statement
@@ -4165,7 +4830,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ImplicitCastExpr.html">ImplicitCastExpr</a>&gt;</td><td class="name" onclick="toggle('hasImplicitDestinationType0')"><a name="hasImplicitDestinationType0Anchor">hasImplicitDestinationType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ImplicitCastExpr.html">ImplicitCastExpr</a>&gt;</td><td class="name" onclick="toggle('hasImplicitDestinationType0')"><a name="hasImplicitDestinationType0Anchor">hasImplicitDestinationType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasImplicitDestinationType0"><pre>Matches implicit casts whose destination type matches a given
 matcher.
 
@@ -4173,7 +4838,35 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration9')"><a name="hasDeclaration9Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;</td><td class="name" onclick="toggle('hasSyntacticForm0')"><a name="hasSyntacticForm0Anchor">hasSyntacticForm</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasSyntacticForm0"><pre>Matches the syntactic form of init list expressions
+(if the expression has one).
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration10')"><a name="hasDeclaration10Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration10"><pre>Matches a node if the declaration associated with that node
+matches the given matcher.
+
+The associated declaration is:
+- for type nodes, the declaration of the underlying type
+- for CallExpr, the declaration of the callee
+- for MemberExpr, the declaration of the referenced member
+- for CXXConstructExpr, the declaration of the constructor
+
+Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
+function. e.g. various subtypes of clang::Type and various expressions.
+
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration9')"><a name="hasDeclaration9Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration9"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4186,38 +4879,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration8')"><a name="hasDeclaration8Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration8"><pre>Matches a node if the declaration associated with that node
-matches the given matcher.
-
-The associated declaration is:
-- for type nodes, the declaration of the underlying type
-- for CallExpr, the declaration of the callee
-- for MemberExpr, the declaration of the referenced member
-- for CXXConstructExpr, the declaration of the constructor
-
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
-
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration7')"><a name="hasDeclaration7Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration7')"><a name="hasDeclaration7Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration7"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4230,16 +4901,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasObjectExpression0')"><a name="hasObjectExpression0Anchor">hasObjectExpression</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasObjectExpression0')"><a name="hasObjectExpression0Anchor">hasObjectExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasObjectExpression0"><pre>Matches a member expression where the object expression is
 matched by a given matcher.
 
@@ -4253,7 +4924,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('member0')"><a name="member0Anchor">member</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('member0')"><a name="member0Anchor">member</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="member0"><pre>Matches a member expression where the member is matched by a
 given matcher.
 
@@ -4267,7 +4938,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerTypeLoc.html">MemberPointerTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc1')"><a name="pointeeLoc1Anchor">pointeeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerTypeLoc.html">MemberPointerTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc1')"><a name="pointeeLoc1Anchor">pointeeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="pointeeLoc1"><pre>Narrows PointerType (and similar) matchers to those where the
 pointee matches a given matcher.
 
@@ -4278,12 +4949,12 @@
 pointerType(pointee(isConstQualified(), isInteger()))
   matches "int const *b"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;</td><td class="name" onclick="toggle('pointee1')"><a name="pointee1Anchor">pointee</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;</td><td class="name" onclick="toggle('pointee1')"><a name="pointee1Anchor">pointee</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="pointee1"><pre>Narrows PointerType (and similar) matchers to those where the
 pointee matches a given matcher.
 
@@ -4294,12 +4965,25 @@
 pointerType(pointee(isConstQualified(), isInteger()))
   matches "int const *b"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;</td><td class="name" onclick="toggle('hasPrefix1')"><a name="hasPrefix1Anchor">hasPrefix</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('hasUnderlyingDecl0')"><a name="hasUnderlyingDecl0Anchor">hasUnderlyingDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasUnderlyingDecl0"><pre>Matches a NamedDecl whose underlying declaration matches the given
+matcher.
+
+Given
+  namespace N { template&lt;class T&gt; void f(T t); }
+  template &lt;class T&gt; void g() { using N::f; f(T()); }
+unresolvedLookupExpr(hasAnyDeclaration(
+    namedDecl(hasUnderlyingDecl(hasName("::N::f")))))
+  matches the use of f in g().
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;</td><td class="name" onclick="toggle('hasPrefix1')"><a name="hasPrefix1Anchor">hasPrefix</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasPrefix1"><pre>Matches on the prefix of a NestedNameSpecifierLoc.
 
 Given
@@ -4310,7 +4994,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;</td><td class="name" onclick="toggle('specifiesTypeLoc0')"><a name="specifiesTypeLoc0Anchor">specifiesTypeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;</td><td class="name" onclick="toggle('specifiesTypeLoc0')"><a name="specifiesTypeLoc0Anchor">specifiesTypeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="specifiesTypeLoc0"><pre>Matches nested name specifier locs that specify a type matching the
 given TypeLoc.
 
@@ -4323,7 +5007,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('hasPrefix0')"><a name="hasPrefix0Anchor">hasPrefix</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('hasPrefix0')"><a name="hasPrefix0Anchor">hasPrefix</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasPrefix0"><pre>Matches on the prefix of a NestedNameSpecifier.
 
 Given
@@ -4334,7 +5018,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('specifiesNamespace0')"><a name="specifiesNamespace0Anchor">specifiesNamespace</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('specifiesNamespace0')"><a name="specifiesNamespace0Anchor">specifiesNamespace</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamespaceDecl.html">NamespaceDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="specifiesNamespace0"><pre>Matches nested name specifiers that specify a namespace matching the
 given namespace matcher.
 
@@ -4346,7 +5030,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('specifiesType0')"><a name="specifiesType0Anchor">specifiesType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt;</td><td class="name" onclick="toggle('specifiesType0')"><a name="specifiesType0Anchor">specifiesType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="specifiesType0"><pre>Matches nested name specifiers that specify a type matching the
 given QualType matcher without qualifiers.
 
@@ -4360,7 +5044,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgument2')"><a name="hasArgument2Anchor">hasArgument</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgument2')"><a name="hasArgument2Anchor">hasArgument</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasArgument2"><pre>Matches the n'th argument of a call expression or a constructor
 call expression.
 
@@ -4370,7 +5054,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasReceiverType0')"><a name="hasReceiverType0Anchor">hasReceiverType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCMessageExpr.html">ObjCMessageExpr</a>&gt;</td><td class="name" onclick="toggle('hasReceiverType0')"><a name="hasReceiverType0Anchor">hasReceiverType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasReceiverType0"><pre>Matches on the receiver of an ObjectiveC Message expression.
 
 Example
@@ -4382,7 +5066,28 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenType.html">ParenType</a>&gt;</td><td class="name" onclick="toggle('innerType0')"><a name="innerType0Anchor">innerType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1OpaqueValueExpr.html">OpaqueValueExpr</a>&gt;</td><td class="name" onclick="toggle('hasSourceExpression1')"><a name="hasSourceExpression1Anchor">hasSourceExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasSourceExpression1"><pre></pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1OverloadExpr.html">OverloadExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnyDeclaration0')"><a name="hasAnyDeclaration0Anchor">hasAnyDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyDeclaration0"><pre>Matches an OverloadExpr if any of the declarations in the set of
+overloads matches the given matcher.
+
+Given
+  template &lt;typename T&gt; void foo(T);
+  template &lt;typename T&gt; void bar(T);
+  template &lt;typename T&gt; void baz(T t) {
+    foo(t);
+    bar(t);
+  }
+unresolvedLookupExpr(hasAnyDeclaration(
+    functionTemplateDecl(hasName("foo"))))
+  matches foo in foo(t); but not bar in bar(t);
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenType.html">ParenType</a>&gt;</td><td class="name" onclick="toggle('innerType0')"><a name="innerType0Anchor">innerType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="innerType0"><pre>Matches ParenType nodes where the inner type is a specific type.
 
 Given
@@ -4392,11 +5097,11 @@
 varDecl(hasType(pointsTo(parenType(innerType(functionType()))))) matches
 ptr_to_func but not ptr_to_array.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenType.html">ParenType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ParenType.html">ParenType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerTypeLoc.html">PointerTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc2')"><a name="pointeeLoc2Anchor">pointeeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerTypeLoc.html">PointerTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc2')"><a name="pointeeLoc2Anchor">pointeeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="pointeeLoc2"><pre>Narrows PointerType (and similar) matchers to those where the
 pointee matches a given matcher.
 
@@ -4407,12 +5112,12 @@
 pointerType(pointee(isConstQualified(), isInteger()))
   matches "int const *b"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;</td><td class="name" onclick="toggle('pointee2')"><a name="pointee2Anchor">pointee</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;</td><td class="name" onclick="toggle('pointee2')"><a name="pointee2Anchor">pointee</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="pointee2"><pre>Narrows PointerType (and similar) matchers to those where the
 pointee matches a given matcher.
 
@@ -4423,12 +5128,12 @@
 pointerType(pointee(isConstQualified(), isInteger()))
   matches "int const *b"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasCanonicalType0')"><a name="hasCanonicalType0Anchor">hasCanonicalType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasCanonicalType0')"><a name="hasCanonicalType0Anchor">hasCanonicalType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasCanonicalType0"><pre>Matches QualTypes whose canonical type matches InnerMatcher.
 
 Given:
@@ -4441,7 +5146,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration6')"><a name="hasDeclaration6Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration6')"><a name="hasDeclaration6Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration6"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4454,21 +5159,32 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('pointsTo1')"><a name="pointsTo1Anchor">pointsTo</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('ignoringParens0')"><a name="ignoringParens0Anchor">ignoringParens</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="ignoringParens0"><pre>Matches types that match InnerMatcher after any parens are stripped.
+
+Given
+  void (*fp)(void);
+The matcher
+  varDecl(hasType(pointerType(pointee(ignoringParens(functionType())))))
+would match the declaration for fp.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('pointsTo1')"><a name="pointsTo1Anchor">pointsTo</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="pointsTo1"><pre>Overloaded to match the pointee type's declaration.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('pointsTo0')"><a name="pointsTo0Anchor">pointsTo</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('pointsTo0')"><a name="pointsTo0Anchor">pointsTo</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="pointsTo0"><pre>Matches if the matched type is a pointer type and the pointee type
 matches the specified matcher.
 
@@ -4480,12 +5196,12 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('references1')"><a name="references1Anchor">references</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('references1')"><a name="references1Anchor">references</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="references1"><pre>Overloaded to match the referenced type's declaration.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('references0')"><a name="references0Anchor">references</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('references0')"><a name="references0Anchor">references</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="references0"><pre>Matches if the matched type is a reference type and the referenced
 type matches the specified matcher.
 
@@ -4500,7 +5216,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration5')"><a name="hasDeclaration5Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration5')"><a name="hasDeclaration5Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration5"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4513,16 +5229,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceTypeLoc.html">ReferenceTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc3')"><a name="pointeeLoc3Anchor">pointeeLoc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceTypeLoc.html">ReferenceTypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointeeLoc3')"><a name="pointeeLoc3Anchor">pointeeLoc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="pointeeLoc3"><pre>Narrows PointerType (and similar) matchers to those where the
 pointee matches a given matcher.
 
@@ -4533,12 +5249,12 @@
 pointerType(pointee(isConstQualified(), isInteger()))
   matches "int const *b"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;</td><td class="name" onclick="toggle('pointee3')"><a name="pointee3Anchor">pointee</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;</td><td class="name" onclick="toggle('pointee3')"><a name="pointee3Anchor">pointee</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="pointee3"><pre>Narrows PointerType (and similar) matchers to those where the
 pointee matches a given matcher.
 
@@ -4549,24 +5265,63 @@
 pointerType(pointee(isConstQualified(), isInteger()))
   matches "int const *b"
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BlockPointerType.html">BlockPointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberPointerType.html">MemberPointerType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1PointerType.html">PointerType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReferenceType.html">ReferenceType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('alignOfExpr0')"><a name="alignOfExpr0Anchor">alignOfExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ReturnStmt.html">ReturnStmt</a>&gt;</td><td class="name" onclick="toggle('hasReturnValue0')"><a name="hasReturnValue0Anchor">hasReturnValue</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasReturnValue0"><pre>Matches the return value expression of a return statement
+
+Given
+  return a + b;
+hasReturnValue(binaryOperator())
+  matches 'return a + b'
+with binaryOperator()
+  matching 'a + b'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1StmtExpr.html">StmtExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnySubstatement1')"><a name="hasAnySubstatement1Anchor">hasAnySubstatement</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnySubstatement1"><pre>Matches compound statements where at least one substatement matches
+a given matcher. Also matches StmtExprs that have CompoundStmt as children.
+
+Given
+  { {}; 1+2; }
+hasAnySubstatement(compoundStmt())
+  matches '{ {}; 1+2; }'
+with compoundStmt()
+  matching '{}'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('alignOfExpr0')"><a name="alignOfExpr0Anchor">alignOfExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="alignOfExpr0"><pre>Same as unaryExprOrTypeTraitExpr, but only matching
 alignof.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('sizeOfExpr0')"><a name="sizeOfExpr0Anchor">sizeOfExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('forFunction0')"><a name="forFunction0Anchor">forFunction</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="forFunction0"><pre>Matches declaration of the function the statement belongs to
+
+Given:
+F&amp; operator=(const F&amp; o) {
+  std::copy_if(o.begin(), o.end(), begin(), [](V v) { return v &gt; 0; });
+  return *this;
+}
+returnStmt(forFunction(hasName("operator=")))
+  matches 'return *this'
+  but does not match 'return v &gt; 0'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('sizeOfExpr0')"><a name="sizeOfExpr0Anchor">sizeOfExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="sizeOfExpr0"><pre>Same as unaryExprOrTypeTraitExpr, but only matching
 sizeof.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;</td><td class="name" onclick="toggle('forEachSwitchCase0')"><a name="forEachSwitchCase0Anchor">forEachSwitchCase</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchCase.html">SwitchCase</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;</td><td class="name" onclick="toggle('forEachSwitchCase0')"><a name="forEachSwitchCase0Anchor">forEachSwitchCase</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchCase.html">SwitchCase</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachSwitchCase0"><pre>Matches each case or default statement belonging to the given switch
 statement. This matcher may produce multiple matches.
 
@@ -4579,7 +5334,16 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration4')"><a name="hasDeclaration4Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition4')"><a name="hasCondition4Anchor">hasCondition</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasCondition4"><pre>Matches the condition expression of an if statement, for loop,
+switch statement or conditional operator.
+
+Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
+  if (true) {}
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration4')"><a name="hasDeclaration4Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration4"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4592,16 +5356,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('isExpr0')"><a name="isExpr0Anchor">isExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('isExpr0')"><a name="isExpr0Anchor">isExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="isExpr0"><pre>Matches a sugar TemplateArgument that refers to a certain expression.
 
 Given
@@ -4615,7 +5379,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToDeclaration0')"><a name="refersToDeclaration0Anchor">refersToDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToDeclaration0')"><a name="refersToDeclaration0Anchor">refersToDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="refersToDeclaration0"><pre>Matches a canonical TemplateArgument that refers to a certain
 declaration.
 
@@ -4630,7 +5394,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToIntegralType0')"><a name="refersToIntegralType0Anchor">refersToIntegralType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToIntegralType0')"><a name="refersToIntegralType0Anchor">refersToIntegralType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="refersToIntegralType0"><pre>Matches a TemplateArgument that referes to an integral type.
 
 Given
@@ -4642,7 +5406,20 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToType0')"><a name="refersToType0Anchor">refersToType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToTemplate0')"><a name="refersToTemplate0Anchor">refersToTemplate</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateName.html">TemplateName</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="refersToTemplate0"><pre>Matches a TemplateArgument that refers to a certain template.
+
+Given
+  template&lt;template &lt;typename&gt; class S&gt; class X {};
+  template&lt;typename T&gt; class Y {};
+  X&lt;Y&gt; xi;
+classTemplateSpecializationDecl(hasAnyTemplateArgument(
+    refersToTemplate(templateName())))
+  matches the specialization X&lt;Y&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt;</td><td class="name" onclick="toggle('refersToType0')"><a name="refersToType0Anchor">refersToType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="refersToType0"><pre>Matches a TemplateArgument that refers to a certain type.
 
 Given
@@ -4655,21 +5432,29 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument1')"><a name="hasAnyTemplateArgument1Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasAnyTemplateArgument1"><pre>Matches classTemplateSpecializations that have at least one
-TemplateArgument matching the given InnerMatcher.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument1')"><a name="hasAnyTemplateArgument1Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyTemplateArgument1"><pre>Matches classTemplateSpecializations, templateSpecializationType and
+functionDecl that have at least one TemplateArgument matching the given
+InnerMatcher.
 
 Given
   template&lt;typename T&gt; class A {};
   template&lt;&gt; class A&lt;double&gt; {};
   A&lt;int&gt; a;
+
+  template&lt;typename T&gt; void f() {};
+  void func() { f&lt;int&gt;(); };
+
 classTemplateSpecializationDecl(hasAnyTemplateArgument(
     refersToType(asString("int"))))
   matches the specialization A&lt;int&gt;
+
+functionDecl(hasAnyTemplateArgument(refersToType(asString("int"))))
+  matches the specialization f&lt;int&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration3')"><a name="hasDeclaration3Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration3')"><a name="hasDeclaration3Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration3"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4682,30 +5467,36 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasTemplateArgument1')"><a name="hasTemplateArgument1Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasTemplateArgument1"><pre>Matches classTemplateSpecializations where the n'th TemplateArgument
-matches the given InnerMatcher.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasTemplateArgument1')"><a name="hasTemplateArgument1Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasTemplateArgument1"><pre>Matches classTemplateSpecializations, templateSpecializationType and
+functionDecl where the n'th TemplateArgument matches the given InnerMatcher.
 
 Given
   template&lt;typename T, typename U&gt; class A {};
   A&lt;bool, int&gt; b;
   A&lt;int, bool&gt; c;
+
+  template&lt;typename T&gt; void f() {};
+  void func() { f&lt;int&gt;(); };
 classTemplateSpecializationDecl(hasTemplateArgument(
     1, refersToType(asString("int"))))
   matches the specialization A&lt;bool, int&gt;
+
+functionDecl(hasTemplateArgument(0, refersToType(asString("int"))))
+  matches the specialization f&lt;int&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration2')"><a name="hasDeclaration2Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration2')"><a name="hasDeclaration2Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration2"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4718,12 +5509,12 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -4743,7 +5534,20 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration1')"><a name="hasDeclaration1Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefNameDecl.html">TypedefNameDecl</a>&gt;</td><td class="name" onclick="toggle('hasType1')"><a name="hasType1Anchor">hasType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasType1"><pre>Matches if the expression's or declaration's type matches a type
+matcher.
+
+Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
+            and z (matcher = varDecl(hasType(cxxRecordDecl(hasName("X")))))
+            and U (matcher = typedefDecl(hasType(asString("int"))))
+ class X {};
+ void y(X &amp;x) { x; X z; }
+ typedef int U;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration1')"><a name="hasDeclaration1Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration1"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4756,16 +5560,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgumentOfType0')"><a name="hasArgumentOfType0Anchor">hasArgumentOfType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;</td><td class="name" onclick="toggle('hasArgumentOfType0')"><a name="hasArgumentOfType0Anchor">hasArgumentOfType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasArgumentOfType0"><pre>Matches unary expressions that have a specific type of argument.
 
 Given
@@ -4775,7 +5579,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryOperator.html">UnaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasUnaryOperand0')"><a name="hasUnaryOperand0Anchor">hasUnaryOperand</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryOperator.html">UnaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasUnaryOperand0')"><a name="hasUnaryOperand0Anchor">hasUnaryOperand</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasUnaryOperand0"><pre>Matches if the operand of a unary operator matches.
 
 Example matches true (matcher = hasUnaryOperand(
@@ -4784,7 +5588,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration0')"><a name="hasDeclaration0Anchor">hasDeclaration</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration0')"><a name="hasDeclaration0Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration0"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4797,16 +5601,16 @@
 Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
 function. e.g. various subtypes of clang::Type and various expressions.
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
-  Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;,
+  Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingDecl.html">UsingDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyUsingShadowDecl0')"><a name="hasAnyUsingShadowDecl0Anchor">hasAnyUsingShadowDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingDecl.html">UsingDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyUsingShadowDecl0')"><a name="hasAnyUsingShadowDecl0Anchor">hasAnyUsingShadowDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyUsingShadowDecl0"><pre>Matches any using shadow declaration.
 
 Given
@@ -4816,7 +5620,7 @@
   matches using X::b </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt;</td><td class="name" onclick="toggle('hasTargetDecl0')"><a name="hasTargetDecl0Anchor">hasTargetDecl</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt;</td><td class="name" onclick="toggle('hasTargetDecl0')"><a name="hasTargetDecl0Anchor">hasTargetDecl</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasTargetDecl0"><pre>Matches a using shadow declaration where the target declaration is
 matched by the given matcher.
 
@@ -4828,8 +5632,8 @@
   matches using X::b but not using X::a </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;</td><td class="name" onclick="toggle('hasType3')"><a name="hasType3Anchor">hasType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasType3"><pre>Overloaded to match the declaration of the expression's or value
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;</td><td class="name" onclick="toggle('hasType4')"><a name="hasType4Anchor">hasType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasType4"><pre>Overloaded to match the declaration of the expression's or value
 declaration's type.
 
 In case of a value declaration (for example a variable declaration),
@@ -4843,22 +5647,24 @@
  class X {};
  void y(X &amp;x) { x; X z; }
 
-Usable as: Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
+Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;</td><td class="name" onclick="toggle('hasType1')"><a name="hasType1Anchor">hasType</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasType1"><pre>Matches if the expression's or declaration's type matches a type
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;</td><td class="name" onclick="toggle('hasType2')"><a name="hasType2Anchor">hasType</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasType2"><pre>Matches if the expression's or declaration's type matches a type
 matcher.
 
 Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
             and z (matcher = varDecl(hasType(cxxRecordDecl(hasName("X")))))
+            and U (matcher = typedefDecl(hasType(asString("int"))))
  class X {};
  void y(X &amp;x) { x; X z; }
+ typedef int U;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasInitializer0')"><a name="hasInitializer0Anchor">hasInitializer</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('hasInitializer0')"><a name="hasInitializer0Anchor">hasInitializer</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasInitializer0"><pre>Matches a variable declaration that has an initializer expression
 that matches the given matcher.
 
@@ -4868,7 +5674,7 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1VariableArrayType.html">VariableArrayType</a>&gt;</td><td class="name" onclick="toggle('hasSizeExpr0')"><a name="hasSizeExpr0Anchor">hasSizeExpr</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VariableArrayType.html">VariableArrayType</a>&gt;</td><td class="name" onclick="toggle('hasSizeExpr0')"><a name="hasSizeExpr0Anchor">hasSizeExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasSizeExpr0"><pre>Matches VariableArrayType nodes that have a specific size
 expression.
 
@@ -4882,9 +5688,9 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody2')"><a name="hasBody2Anchor">hasBody</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasBody2"><pre>Matches a 'for', 'while', or 'do while' statement that has
-a given body.
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;</td><td class="name" onclick="toggle('hasBody2')"><a name="hasBody2Anchor">hasBody</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasBody2"><pre>Matches a 'for', 'while', 'do while' statement or a function
+definition that has a given body.
 
 Given
   for (;;) {}
@@ -4895,22 +5701,22 @@
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition2')"><a name="hasCondition2Anchor">hasCondition</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition2')"><a name="hasCondition2Anchor">hasCondition</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasCondition2"><pre>Matches the condition expression of an if statement, for loop,
-or conditional operator.
+switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;internal::BindableMatcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;&gt;</td><td class="name" onclick="toggle('loc1')"><a name="loc1Anchor">loc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;internal::BindableMatcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifierLoc.html">NestedNameSpecifierLoc</a>&gt;&gt;</td><td class="name" onclick="toggle('loc1')"><a name="loc1Anchor">loc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="loc1"><pre>Matches NestedNameSpecifierLocs for which the given inner
 NestedNameSpecifier-matcher matches.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;internal::BindableMatcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;&gt;</td><td class="name" onclick="toggle('loc0')"><a name="loc0Anchor">loc</a></td><td>Matcher&lt<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;internal::BindableMatcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;&gt;</td><td class="name" onclick="toggle('loc0')"><a name="loc0Anchor">loc</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="loc0"><pre>Matches TypeLocs for which the given inner
 QualType-matcher matches.
 </pre></td></tr>
diff --git a/docs/MSVCCompatibility.rst b/docs/MSVCCompatibility.rst
index 3794813..ead0ae8 100644
--- a/docs/MSVCCompatibility.rst
+++ b/docs/MSVCCompatibility.rst
@@ -84,18 +84,23 @@
 * RTTI: :good:`Complete`.  Generation of RTTI data structures has been
   finished, along with support for the ``/GR`` flag.
 
-* Exceptions and SEH: :partial:`Partial`.
-  C++ exceptions (``try`` / ``catch`` / ``throw``) and
-  structured exceptions (``__try`` / ``__except`` / ``__finally``) mostly
-  work on x64. 32-bit exception handling support is being worked on.  LLVM does
-  not model asynchronous exceptions, so it is currently impossible to catch an
-  asynchronous exception generated in the same frame as the catching ``__try``.
+* C++ Exceptions: :good:`Mostly complete`.  Support for
+  C++ exceptions (``try`` / ``catch`` / ``throw``) has been implemented for
+  x86 and x64.  Our implementation has been well tested but we still get the
+  odd bug report now and again.
   C++ exception specifications are ignored, but this is `consistent with Visual
   C++`_.
 
 .. _consistent with Visual C++:
   https://msdn.microsoft.com/en-us/library/wfa0edys.aspx
 
+* Asynchronous Exceptions (SEH): :partial:`Partial`.
+  Structured exceptions (``__try`` / ``__except`` / ``__finally``) mostly
+  work on x86 and x64.
+  LLVM does not model asynchronous exceptions, so it is currently impossible to
+  catch an asynchronous exception generated in the same frame as the catching
+  ``__try``.
+
 * Thread-safe initialization of local statics: :good:`Complete`.  MSVC 2015
   added support for thread-safe initialization of such variables by taking an
   ABI break.
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index a6c6e6c..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,110 +0,0 @@
-##===- docs/Makefile ---------------------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ..
-
-ifdef BUILD_FOR_WEBSITE
-PROJ_OBJ_DIR = .
-DOXYGEN = doxygen
-
-$(PROJ_OBJ_DIR)/doxygen.cfg: doxygen.cfg.in
-	cat $< | sed \
-	  -e 's/@DOT@/dot/g' \
-	  -e 's/@PACKAGE_VERSION@/mainline/' \
-	  -e 's/@abs_builddir@/./g' \
-	  -e 's/@abs_srcdir@/./g' \
-	  -e 's/@clang_doxygen_generate_qhp@/NO/g' \
-	  -e 's/@clang_doxygen_qch_filename@//g' \
-	  -e 's/@clang_doxygen_qhelpgenerator_path@//g' \
-	  -e 's/@clang_doxygen_qhp_cust_filter_attrs@//g' \
-	  -e 's/@clang_doxygen_qhp_cust_filter_name@//g' \
-	  -e 's/@clang_doxygen_qhp_namespace@//g' \
-	  -e 's/@enable_external_search@/NO/g' \
-	  -e 's/@enable_searchengine@/NO/g' \
-	  -e 's/@enable_server_based_search@/NO/g' \
-	  -e 's/@extra_search_mappings@//g' \
-	  -e 's/@searchengine_url@//g' \
-	  -e 's/@DOT_IMAGE_FORMAT@/png/g' \
-	  > $@
-endif
-
-include $(CLANG_LEVEL)/Makefile
-
-HTML       := $(wildcard $(PROJ_SRC_DIR)/*.html) \
-              $(wildcard $(PROJ_SRC_DIR)/*.css)
-#IMAGES     := $(wildcard $(PROJ_SRC_DIR)/img/*.*)
-DOXYFILES  := doxygen.cfg.in doxygen.intro
-
-.PHONY: install-html install-doxygen doxygen generated
-
-install_targets :=
-ifndef ONLY_MAN_DOCS
-install_targets += install-html
-endif
-ifeq ($(ENABLE_DOXYGEN),1)
-install_targets += install-doxygen
-endif
-install-local:: $(install_targets)
-
-# Live documentation is generated for the web site using this target:
-# 'make generated BUILD_FOR_WEBSITE=1'
-generated:: doxygen
-
-install-html: $(PROJ_OBJ_DIR)/html.tar.gz
-	$(Echo) Installing HTML documentation
-	$(Verb) $(MKDIR) $(DESTDIR)$(PROJ_docsdir)/html
-	$(Verb) $(MKDIR) $(DESTDIR)$(PROJ_docsdir)/html/img
-	$(Verb) $(DataInstall) $(HTML) $(DESTDIR)$(PROJ_docsdir)/html
-#	$(Verb) $(DataInstall) $(IMAGES) $(DESTDIR)$(PROJ_docsdir)/html/img
-	$(Verb) $(DataInstall) $(PROJ_OBJ_DIR)/html.tar.gz $(DESTDIR)$(PROJ_docsdir)
-
-$(PROJ_OBJ_DIR)/html.tar.gz: $(HTML)
-	$(Echo) Packaging HTML documentation
-	$(Verb) $(RM) -rf $@ $(PROJ_OBJ_DIR)/html.tar
-	$(Verb) cd $(PROJ_SRC_DIR) && \
-	  $(TAR) cf $(PROJ_OBJ_DIR)/html.tar *.html
-	$(Verb) $(GZIPBIN) $(PROJ_OBJ_DIR)/html.tar
-
-install-doxygen: doxygen
-	$(Echo) Installing doxygen documentation
-	$(Verb) $(DataInstall) $(PROJ_OBJ_DIR)/doxygen.tar.gz $(DESTDIR)$(PROJ_docsdir)
-	$(Verb) cd $(PROJ_OBJ_DIR)/doxygen/html && \
-	  for DIR in $$($(FIND) . -type d); do \
-	    DESTSUB="$(DESTDIR)$(PROJ_docsdir)/html/doxygen/$$(echo $$DIR | cut -c 3-)"; \
-	    $(MKDIR) $$DESTSUB && \
-	    $(FIND) $$DIR -maxdepth 1 -type f -exec $(DataInstall) {} $$DESTSUB \; ; \
-	    if [ $$? != 0 ]; then exit 1; fi  \
-	  done
-
-doxygen: regendoc $(PROJ_OBJ_DIR)/doxygen.tar.gz
-
-regendoc:
-	$(Echo) Building doxygen documentation
-	$(Verb) $(RM) -rf $(PROJ_OBJ_DIR)/doxygen
-	$(Verb) $(DOXYGEN) $(PROJ_OBJ_DIR)/doxygen.cfg
-	$(Verb) sed -i "s/[$$]LatestRev[$$]/`svnversion $(PROJ_SRC_DIR)`/g" \
-	 $(PROJ_OBJ_DIR)/doxygen/html/*.html
-
-$(PROJ_OBJ_DIR)/doxygen.tar.gz: $(DOXYFILES) $(PROJ_OBJ_DIR)/doxygen.cfg
-	$(Echo) Packaging doxygen documentation
-	$(Verb) $(RM) -rf $@ $(PROJ_OBJ_DIR)/doxygen.tar
-	$(Verb) $(TAR) cf $(PROJ_OBJ_DIR)/doxygen.tar doxygen
-	$(Verb) $(GZIPBIN) $(PROJ_OBJ_DIR)/doxygen.tar
-	$(Verb) $(CP) $(PROJ_OBJ_DIR)/doxygen.tar.gz $(PROJ_OBJ_DIR)/doxygen/html/
-
-userloc: $(LLVM_SRC_ROOT)/docs/userloc.html
-
-$(LLVM_SRC_ROOT)/docs/userloc.html:
-	$(Echo) Making User LOC Table
-	$(Verb) cd $(LLVM_SRC_ROOT) ; ./utils/userloc.pl -details -recurse \
-	  -html lib include tools runtime utils examples autoconf test > docs/userloc.html
-
-uninstall-local::
-	$(Echo) Uninstalling Documentation
-	$(Verb) $(RM) -rf $(DESTDIR)$(PROJ_docsdir)
diff --git a/docs/MemorySanitizer.rst b/docs/MemorySanitizer.rst
index 62cacce..4e58588 100644
--- a/docs/MemorySanitizer.rst
+++ b/docs/MemorySanitizer.rst
@@ -171,6 +171,8 @@
 MemorySanitizer requires that all program code is instrumented. This
 also includes any libraries that the program depends on, even libc.
 Failing to achieve this may result in false reports.
+For the same reason you may need to replace all inline assembly code that writes to memory
+with pure C/C++ code.
 
 Full MemorySanitizer instrumentation is very difficult to achieve. To
 make it easier, MemorySanitizer runtime library includes 70+
diff --git a/docs/Modules.rst b/docs/Modules.rst
index 4187654..938a5b8 100644
--- a/docs/Modules.rst
+++ b/docs/Modules.rst
@@ -213,9 +213,6 @@
 ``-fmodule-file=<file>``
   Load the given precompiled module file.
 
-``-fprebuilt-module-path=<directory>``
-  Specify the path to the prebuilt modules. If specified, we will look for modules in this directory for a given top-level module name. We don't need a module map for loading prebuilt modules in this directory and the compiler will not try to rebuild these modules. This can be specified multiple times.
-
 Module Semantics
 ================
 
@@ -413,6 +410,9 @@
 cplusplus11
   C++11 support is available.
 
+gnuinlineasm
+  GNU inline ASM is available.
+
 objc
   Objective-C support is available.
 
diff --git a/docs/PCHInternals.rst b/docs/PCHInternals.rst
index 8f66ddf..b0372cb9 100644
--- a/docs/PCHInternals.rst
+++ b/docs/PCHInternals.rst
@@ -15,7 +15,7 @@
 The Clang compiler frontend, ``clang -cc1``, supports two command line options
 for generating and using PCH files.
 
-To generate PCH files using ``clang -cc1``, use the option :option:`-emit-pch`:
+To generate PCH files using ``clang -cc1``, use the option `-emit-pch`:
 
 .. code-block:: bash
 
@@ -24,7 +24,7 @@
 This option is transparently used by ``clang`` when generating PCH files.  The
 resulting PCH file contains the serialized form of the compiler's internal
 representation after it has completed parsing and semantic analysis.  The PCH
-file can then be used as a prefix header with the :option:`-include-pch`
+file can then be used as a prefix header with the `-include-pch`
 option:
 
 .. code-block:: bash
@@ -84,7 +84,7 @@
 proportional to the amount of code actually used from the AST file, rather than
 being proportional to the size of the AST file itself.
 
-When given the :option:`-print-stats` option, Clang produces statistics
+When given the `-print-stats` option, Clang produces statistics
 describing how much of the AST file was actually loaded from disk.  For a
 simple "Hello, World!" program that includes the Apple ``Cocoa.h`` header
 (which is built as a precompiled header), this option illustrates how little of
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index ea3dcf2..51dcefa 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -1,6 +1,6 @@
-=====================================
-Clang 3.9 (In-Progress) Release Notes
-=====================================
+=======================================
+Clang 4.0.0 (In-Progress) Release Notes
+=======================================
 
 .. contents::
    :local:
@@ -10,15 +10,15 @@
 
 .. warning::
 
-   These are in-progress notes for the upcoming Clang 3.9 release. You may
-   prefer the `Clang 3.7 Release Notes
-   <http://llvm.org/releases/3.7.0/tools/clang/docs/ReleaseNotes.html>`_.
+   These are in-progress notes for the upcoming Clang 4.0.0 release. You may
+   prefer the `Clang 3.8 Release Notes
+   <http://llvm.org/releases/3.8.0/tools/clang/docs/ReleaseNotes.html>`_.
 
 Introduction
 ============
 
 This document contains the release notes for the Clang C/C++/Objective-C
-frontend, part of the LLVM Compiler Infrastructure, release 3.9. Here we
+frontend, part of the LLVM Compiler Infrastructure, release 4.0.0. Here we
 describe the status of Clang in some detail, including major
 improvements from the previous release and new feature work. For the
 general LLVM release notes, see `the LLVM
@@ -36,8 +36,8 @@
 the current one. To see the release notes for a specific release, please
 see the `releases page <http://llvm.org/releases/>`_.
 
-What's New in Clang 3.9?
-========================
+What's New in Clang 4.0.0?
+==========================
 
 Some of the major new features and improvements to Clang are listed
 here. Generic improvements to Clang as a whole or to its underlying
@@ -47,15 +47,11 @@
 Major New Features
 ------------------
 
-- Feature1...
+-  ...
 
 Improvements to Clang's diagnostics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Clang's diagnostics are constantly being improved to catch more issues,
-explain them more clearly, and provide more accurate source information
-about them. The improvements since the 3.7 release include:
-
 -  ...
 
 New Compiler Flags
@@ -69,17 +65,23 @@
 
 Clang now supports the ...
 
+
+Attribute Changes in Clang
+--------------------------
+
+-  ...
+
 Windows Support
 ---------------
 
 Clang's support for building native Windows programs ...
 
-TLS is enabled for Cygwin defaults to -femulated-tls.
-
 
 C Language Changes in Clang
 ---------------------------
 
+- ...
+
 ...
 
 C11 Feature Support
@@ -90,9 +92,9 @@
 C++ Language Changes in Clang
 -----------------------------
 
-- ...
+...
 
-C++11 Feature Support
+C++1z Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
 ...
@@ -107,6 +109,11 @@
 
 ...
 
+OpenMP Support in Clang
+----------------------------------
+
+...
+
 Internal API Changes
 --------------------
 
@@ -126,6 +133,10 @@
 
 ...
 
+With the option --show-description, scan-build's list of defects will also
+show the description of the defects.
+
+
 Static Analyzer
 ---------------
 
diff --git a/docs/SafeStack.rst b/docs/SafeStack.rst
index 21e9b6c..f01b75f 100644
--- a/docs/SafeStack.rst
+++ b/docs/SafeStack.rst
@@ -178,6 +178,17 @@
 project page for more information about the design of the SafeStack and its
 related technologies.
 
+setjmp and exception handling
+-----------------------------
+
+The `OSDI'14 paper <http://dslab.epfl.ch/pubs/cpi.pdf>`_ mentions that
+on Linux the instrumentation pass finds calls to setjmp or functions that
+may throw an exception, and inserts required instrumentation at their call
+sites. Specifically, the instrumentation pass saves the shadow stack pointer
+on the safe stack before the call site, and restores it either after the
+call to setjmp or after an exception has been caught. This is implemented
+in the function ``SafeStack::createStackRestorePoints``.
+
 Publications
 ------------
 
diff --git a/docs/SanitizerCoverage.rst b/docs/SanitizerCoverage.rst
index e759b35..f7ec65f 100644
--- a/docs/SanitizerCoverage.rst
+++ b/docs/SanitizerCoverage.rst
@@ -16,8 +16,9 @@
 ====================
 
 SanitizerCoverage can be used with :doc:`AddressSanitizer`,
-:doc:`LeakSanitizer`, :doc:`MemorySanitizer`, and UndefinedBehaviorSanitizer.
-In addition to ``-fsanitize=``, pass one of the following compile-time flags:
+:doc:`LeakSanitizer`, :doc:`MemorySanitizer`,
+UndefinedBehaviorSanitizer, or without any sanitizer.  Pass one of the
+following compile-time flags:
 
 * ``-fsanitize-coverage=func`` for function-level coverage (very fast).
 * ``-fsanitize-coverage=bb`` for basic-block-level coverage (may add up to 30%
@@ -27,8 +28,9 @@
 You may also specify ``-fsanitize-coverage=indirect-calls`` for
 additional `caller-callee coverage`_.
 
-At run time, pass ``coverage=1`` in ``ASAN_OPTIONS``, ``LSAN_OPTIONS``,
-``MSAN_OPTIONS`` or ``UBSAN_OPTIONS``, as appropriate.
+At run time, pass ``coverage=1`` in ``ASAN_OPTIONS``,
+``LSAN_OPTIONS``, ``MSAN_OPTIONS`` or ``UBSAN_OPTIONS``, as
+appropriate. For the standalone coverage mode, use ``UBSAN_OPTIONS``.
 
 To get `Coverage counters`_, add ``-fsanitize-coverage=8bit-counters``
 to one of the above compile-time flags. At runtime, use
@@ -94,6 +96,41 @@
     cov.cc:3
     cov.cc:5
 
+Sancov Tool
+===========
+
+A new experimental ``sancov`` tool is developed to process coverage files.
+The tool is part of the LLVM project and is currently supported only on Linux.
+It can handle symbolization tasks autonomously without any extra support
+from the environment. You need to pass .sancov files (named
+``<module_name>.<pid>.sancov``) and paths to all corresponding binary elf files.
+Sancov matches these files using module names and binary file names.
+
+.. code-block:: console
+
+    USAGE: sancov [options] <action> (<binary file>|<.sancov file>)...
+
+    Action (required)
+      -print                    - Print coverage addresses
+      -covered-functions        - Print all covered functions.
+      -not-covered-functions    - Print all not covered functions.
+      -html-report              - Print HTML coverage report.
+
+    Options
+      -blacklist=<string>         - Blacklist file (sanitizer blacklist format).
+      -demangle                   - Print demangled function name.
+      -strip_path_prefix=<string> - Strip this prefix from file paths in reports
+
+
+Automatic HTML Report Generation
+================================
+
+If ``*SAN_OPTIONS`` contains the ``html_cov_report=1`` option, then an HTML
+coverage report will be automatically generated alongside the coverage files.
+The ``sancov`` binary should be present in ``PATH``, or the
+``sancov_path=<path_to_sancov>`` option can be used to specify the tool location.
+
+
 How good is the coverage?
 =========================
 
@@ -209,7 +246,7 @@
 =================
 
 This experimental feature is inspired by
-`AFL <http://lcamtuf.coredump.cx/afl/technical_details.txt>`_'s coverage
+`AFL <http://lcamtuf.coredump.cx/afl/technical_details.txt>`__'s coverage
 instrumentation. With additional compile-time and run-time flags you can get
 more sensitive coverage information.  In addition to boolean values assigned to
 every basic block (edge) the instrumentation will collect imprecise counters.
@@ -251,10 +288,38 @@
 
 Tracing basic blocks
 ====================
-An *experimental* feature to support basic block (or edge) tracing.
+Experimental support for basic block (or edge) tracing.
 With ``-fsanitize-coverage=trace-bb`` the compiler will insert
 ``__sanitizer_cov_trace_basic_block(s32 *id)`` before every function, basic block, or edge
 (depending on the value of ``-fsanitize-coverage=[func,bb,edge]``).
+Example:
+
+.. code-block:: console
+
+    % clang -g -fsanitize=address -fsanitize-coverage=edge,trace-bb foo.cc
+    % ASAN_OPTIONS=coverage=1 ./a.out
+
+This will produce two files after the process exits:
+`trace-points.PID.sancov` and `trace-events.PID.sancov`.
+The first file will contain a textual description of all the instrumented points in the program
+in the form that you can feed into llvm-symbolizer (e.g. `a.out 0x4dca89`), one per line.
+The second file will contain the actual execution trace as a sequence of 4-byte integers
+-- these integers are the indices into the array of instrumented points (the first file).
+
+Basic block tracing is currently supported only for single-threaded applications.
+
+
+Tracing PCs
+===========
+*Experimental* feature similar to tracing basic blocks, but with a different API.
+With ``-fsanitize-coverage=trace-pc`` the compiler will insert
+``__sanitizer_cov_trace_pc()`` on every edge.
+With an additional ``...=trace-pc,indirect-calls`` flag
+``__sanitizer_cov_trace_pc_indirect(void *callee)`` will be inserted on every indirect call.
+These callbacks are not implemented in the Sanitizer run-time and should be defined
+by the user. So, these flags do not require any other sanitizer to be used.
+This mechanism is used for fuzzing the Linux kernel (https://github.com/google/syzkaller)
+and can be used with `AFL <http://lcamtuf.coredump.cx/afl>`__.
 
 Tracing data flow
 =================
diff --git a/docs/SourceBasedCodeCoverage.rst b/docs/SourceBasedCodeCoverage.rst
new file mode 100644
index 0000000..8c4ab8d
--- /dev/null
+++ b/docs/SourceBasedCodeCoverage.rst
@@ -0,0 +1,237 @@
+==========================
+Source-based Code Coverage
+==========================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This document explains how to use clang's source-based code coverage feature.
+It's called "source-based" because it operates on AST and preprocessor
+information directly. This allows it to generate very precise coverage data.
+
+Clang ships two other code coverage implementations:
+
+* :doc:`SanitizerCoverage` - A low-overhead tool meant for use alongside the
+  various sanitizers. It can provide up to edge-level coverage.
+
+* gcov - A GCC-compatible coverage implementation which operates on DebugInfo.
+
+From this point onwards "code coverage" will refer to the source-based kind.
+
+The code coverage workflow
+==========================
+
+The code coverage workflow consists of three main steps:
+
+* Compiling with coverage enabled.
+
+* Running the instrumented program.
+
+* Creating coverage reports.
+
+The next few sections work through a complete, copy-'n-paste friendly example
+based on this program:
+
+.. code-block:: cpp
+
+    % cat <<EOF > foo.cc
+    #define BAR(x) ((x) || (x))
+    template <typename T> void foo(T x) {
+      for (unsigned I = 0; I < 10; ++I) { BAR(I); }
+    }
+    int main() {
+      foo<int>(0);
+      foo<float>(0);
+      return 0;
+    }
+    EOF
+
+Compiling with coverage enabled
+===============================
+
+To compile code with coverage enabled, pass ``-fprofile-instr-generate
+-fcoverage-mapping`` to the compiler:
+
+.. code-block:: console
+
+    # Step 1: Compile with coverage enabled.
+    % clang++ -fprofile-instr-generate -fcoverage-mapping foo.cc -o foo
+
+Note that linking together code with and without coverage instrumentation is
+supported: any uninstrumented code simply won't be accounted for.
+
+Running the instrumented program
+================================
+
+The next step is to run the instrumented program. When the program exits it
+will write a **raw profile** to the path specified by the ``LLVM_PROFILE_FILE``
+environment variable. If that variable does not exist, the profile is written
+to ``default.profraw`` in the current directory of the program. If
+``LLVM_PROFILE_FILE`` contains a path to a non-existent directory, the missing
+directory structure will be created.  Additionally, the following special
+**pattern strings** are rewritten:
+
+* "%p" expands out to the process ID.
+
+* "%h" expands out to the hostname of the machine running the program.
+
+* "%Nm" expands out to the instrumented binary's signature. When this pattern
+  is specified, the runtime creates a pool of N raw profiles which are used for
+  on-line profile merging. The runtime takes care of selecting a raw profile
+  from the pool, locking it, and updating it before the program exits.  If N is
+  not specified (i.e. the pattern is "%m"), it's assumed that ``N = 1``. N must
+  be between 1 and 9. The merge pool specifier can only occur once per filename
+  pattern.
+
+.. code-block:: console
+
+    # Step 2: Run the program.
+    % LLVM_PROFILE_FILE="foo.profraw" ./foo
+
+Creating coverage reports
+=========================
+
+Raw profiles have to be **indexed** before they can be used to generate
+coverage reports. This is done using the "merge" tool in ``llvm-profdata``, so
+named because it can combine and index profiles at the same time:
+
+.. code-block:: console
+
+    # Step 3(a): Index the raw profile.
+    % llvm-profdata merge -sparse foo.profraw -o foo.profdata
+
+There are multiple different ways to render coverage reports. One option is to
+generate a line-oriented report:
+
+.. code-block:: console
+
+    # Step 3(b): Create a line-oriented coverage report.
+    % llvm-cov show ./foo -instr-profile=foo.profdata
+
+To generate the same report in HTML with demangling turned on, use:
+
+.. code-block:: console
+
+    % llvm-cov show ./foo -instr-profile=foo.profdata -format html -o report.dir -Xdemangler c++filt -Xdemangler -n
+
+This report includes a summary view as well as dedicated sub-views for
+templated functions and their instantiations. For our example program, we get
+distinct views for ``foo<int>(...)`` and ``foo<float>(...)``.  If
+``-show-line-counts-or-regions`` is enabled, ``llvm-cov`` displays sub-line
+region counts (even in macro expansions):
+
+.. code-block:: none
+
+       20|    1|#define BAR(x) ((x) || (x))
+                               ^20     ^2
+        2|    2|template <typename T> void foo(T x) {
+       22|    3|  for (unsigned I = 0; I < 10; ++I) { BAR(I); }
+                                       ^22     ^20  ^20^20
+        2|    4|}
+    ------------------
+    | void foo<int>(int):
+    |      1|    2|template <typename T> void foo(T x) {
+    |     11|    3|  for (unsigned I = 0; I < 10; ++I) { BAR(I); }
+    |                                     ^11     ^10  ^10^10
+    |      1|    4|}
+    ------------------
+    | void foo<float>(float):
+    |      1|    2|template <typename T> void foo(T x) {
+    |     11|    3|  for (unsigned I = 0; I < 10; ++I) { BAR(I); }
+    |                                     ^11     ^10  ^10^10
+    |      1|    4|}
+    ------------------
+
+It's possible to generate a file-level summary of coverage statistics (instead
+of a line-oriented report) with:
+
+.. code-block:: console
+
+    # Step 3(c): Create a coverage summary.
+    % llvm-cov report ./foo -instr-profile=foo.profdata
+    Filename           Regions    Missed Regions     Cover   Functions  Missed Functions  Executed       Lines      Missed Lines     Cover
+    --------------------------------------------------------------------------------------------------------------------------------------
+    /tmp/foo.cc             13                 0   100.00%           3                 0   100.00%          13                 0   100.00%
+    --------------------------------------------------------------------------------------------------------------------------------------
+    TOTAL                   13                 0   100.00%           3                 0   100.00%          13                 0   100.00%
+
+A few final notes:
+
+* The ``-sparse`` flag is optional but can result in dramatically smaller
+  indexed profiles. This option should not be used if the indexed profile will
+  be reused for PGO.
+
+* Raw profiles can be discarded after they are indexed. Advanced use of the
+  profile runtime library allows an instrumented program to merge profiling
+  information directly into an existing raw profile on disk. The details are
+  out of scope.
+
+* The ``llvm-profdata`` tool can be used to merge together multiple raw or
+  indexed profiles. To combine profiling data from multiple runs of a program,
+  try e.g.:
+
+  .. code-block:: console
+
+      % llvm-profdata merge -sparse foo1.profraw foo2.profdata -o foo3.profdata
+
+Format compatibility guarantees
+===============================
+
+* There are no backwards or forwards compatibility guarantees for the raw
+  profile format. Raw profiles may be dependent on the specific compiler
+  revision used to generate them. It's inadvisable to store raw profiles for
+  long periods of time.
+
+* Tools must retain **backwards** compatibility with indexed profile formats.
+  These formats are not forwards-compatible: i.e., a tool which uses format
+  version X will not be able to understand format version (X+k).
+
+* There is a third format in play: the format of the coverage mappings emitted
+  into instrumented binaries. Tools must retain **backwards** compatibility
+  with these formats. These formats are not forwards-compatible.
+
+Using the profiling runtime without static initializers
+=======================================================
+
+By default the compiler runtime uses a static initializer to determine the
+profile output path and to register a writer function. To collect profiles
+without using static initializers, do this manually:
+
+* Export an ``int __llvm_profile_runtime`` symbol from each instrumented shared
+  library and executable. When the linker finds a definition of this symbol, it
+  knows to skip loading the object which contains the profiling runtime's
+  static initializer.
+
+* Forward-declare ``void __llvm_profile_initialize_file(void)`` and call it
+  once from each instrumented executable. This function parses
+  ``LLVM_PROFILE_FILE``, sets the output path, and truncates any existing files
+  at that path. To get the same behavior without truncating existing files,
+  pass a filename pattern string to ``void __llvm_profile_set_filename(char
+  *)``.  These calls can be placed anywhere so long as they precede all calls
+  to ``__llvm_profile_write_file``.
+
+* Forward-declare ``int __llvm_profile_write_file(void)`` and call it to write
+  out a profile. This function returns 0 when it succeeds, and a non-zero value
+  otherwise. Calling this function multiple times appends profile data to an
+  existing on-disk raw profile.
+
+Drawbacks and limitations
+=========================
+
+* Code coverage does not handle unpredictable changes in control flow or stack
+  unwinding in the presence of exceptions precisely. Consider the following
+  function:
+
+  .. code-block:: cpp
+
+      int f() {
+        may_throw();
+        return 0;
+      }
+
+  If the call to ``may_throw()`` propagates an exception into ``f``, the code
+  coverage tool may mark the ``return`` statement as executed even though it is
+  not. A call to ``longjmp()`` can have similar effects.
diff --git a/docs/ThreadSanitizer.rst b/docs/ThreadSanitizer.rst
index 0b9b163..cfb0a95 100644
--- a/docs/ThreadSanitizer.rst
+++ b/docs/ThreadSanitizer.rst
@@ -30,7 +30,7 @@
 
 Example:
 
-.. code-block:: c++
+.. code-block:: console
 
   % cat projects/compiler-rt/lib/tsan/lit_tests/tiny_race.c
   #include <pthread.h>
diff --git a/docs/UndefinedBehaviorSanitizer.rst b/docs/UndefinedBehaviorSanitizer.rst
index 37ff16d..7babb96 100644
--- a/docs/UndefinedBehaviorSanitizer.rst
+++ b/docs/UndefinedBehaviorSanitizer.rst
@@ -92,11 +92,14 @@
      parameter which is declared to never be null.
   -  ``-fsanitize=null``: Use of a null pointer or creation of a null
      reference.
-  -  ``-fsanitize=object-size``: An attempt to use bytes which the
-     optimizer can determine are not part of the object being
-     accessed. The sizes of objects are determined using
-     ``__builtin_object_size``, and consequently may be able to detect
-     more problems at higher optimization levels.
+  -  ``-fsanitize=object-size``: An attempt to potentially use bytes which
+     the optimizer can determine are not part of the object being accessed.
+     This will also detect some types of undefined behavior that may not
+     directly access memory, but are provably incorrect given the size of
+     the objects involved, such as invalid downcasts and calling methods on
+     invalid pointers. These checks are made in terms of
+     ``__builtin_object_size``, and consequently may be able to detect more
+     problems at higher optimization levels.
   -  ``-fsanitize=return``: In C++, reaching the end of a
      value-returning function without returning a value.
   -  ``-fsanitize=returns-nonnull-attribute``: Returning null pointer
@@ -168,6 +171,38 @@
 :doc:`SanitizerSpecialCaseList`, that can be used to suppress error reports
 in the specified source files or functions.
 
+Runtime suppressions
+--------------------
+
+Sometimes you can suppress UBSan error reports for specific files, functions,
+or libraries without recompiling the code. You need to pass the path to a
+suppression file in the ``UBSAN_OPTIONS`` environment variable.
+
+.. code-block:: bash
+
+    UBSAN_OPTIONS=suppressions=MyUBSan.supp
+
+You need to specify a :ref:`check <ubsan-checks>` you are suppressing and the
+bug location. For example:
+
+.. code-block:: bash
+
+  signed-integer-overflow:file-with-known-overflow.cpp
+  alignment:function_doing_unaligned_access
+  vptr:shared_object_with_vptr_failures.so
+
+There are several limitations:
+
+* Sometimes your binary must have enough debug info and/or a symbol table, so
+  that the runtime can figure out the source file or function name to match
+  against the suppression.
+* It is only possible to suppress recoverable checks. For the example above,
+  you can additionally pass
+  ``-fsanitize-recover=signed-integer-overflow,alignment,vptr``, although
+  most UBSan checks are recoverable by default.
+* Check groups (like ``undefined``) can't be used in a suppressions file; only
+  fine-grained checks are supported.
+
 Supported Platforms
 ===================
 
@@ -193,6 +228,27 @@
 3.3. The test suite is integrated into the CMake build and can be run with
 ``check-ubsan`` command.
 
+Additional Configuration
+========================
+
+UndefinedBehaviorSanitizer adds static check data for each check unless it is
+in trap mode. This check data includes the full file name. The option
+``-fsanitize-undefined-strip-path-components=N`` can be used to trim this
+information. If ``N`` is positive, file information emitted by
+UndefinedBehaviorSanitizer will drop the first ``N`` components from the file
+path. If ``N`` is negative, the last ``N`` components will be kept.
+
+Example
+-------
+
+For a file called ``/code/library/file.cpp``, here is what would be emitted:
+
+* Default (No flag, or ``-fsanitize-undefined-strip-path-components=0``): ``/code/library/file.cpp``
+* ``-fsanitize-undefined-strip-path-components=1``: ``code/library/file.cpp``
+* ``-fsanitize-undefined-strip-path-components=2``: ``library/file.cpp``
+* ``-fsanitize-undefined-strip-path-components=-1``: ``file.cpp``
+* ``-fsanitize-undefined-strip-path-components=-2``: ``library/file.cpp``
+
 More Information
 ================
 
diff --git a/docs/UsersManual.rst b/docs/UsersManual.rst
index ea75d1e..be7518b 100644
--- a/docs/UsersManual.rst
+++ b/docs/UsersManual.rst
@@ -2,6 +2,8 @@
 Clang Compiler User's Manual
 ============================
 
+.. include:: <isonum.txt>
+
 .. contents::
    :local:
 
@@ -133,13 +135,13 @@
 .. option:: -ferror-limit=123
 
   Stop emitting diagnostics after 123 errors have been produced. The default is
-  20, and the error limit can be disabled with :option:`-ferror-limit=0`.
+  20, and the error limit can be disabled with `-ferror-limit=0`.
 
 .. option:: -ftemplate-backtrace-limit=123
 
   Only emit up to 123 template instantiation notes within the template
   instantiation backtrace for a single warning or error. The default is 10, and
-  the limit can be disabled with :option:`-ftemplate-backtrace-limit=0`.
+  the limit can be disabled with `-ftemplate-backtrace-limit=0`.
 
 .. _cl_diag_formatting:
 
@@ -543,15 +545,15 @@
 Clang offers a family of flags which the optimizers can use to emit
 a diagnostic in three cases:
 
-1. When the pass makes a transformation (:option:`-Rpass`).
+1. When the pass makes a transformation (`-Rpass`).
 
-2. When the pass fails to make a transformation (:option:`-Rpass-missed`).
+2. When the pass fails to make a transformation (`-Rpass-missed`).
 
 3. When the pass determines whether or not to make a transformation
-   (:option:`-Rpass-analysis`).
+   (`-Rpass-analysis`).
 
-NOTE: Although the discussion below focuses on :option:`-Rpass`, the exact
-same options apply to :option:`-Rpass-missed` and :option:`-Rpass-analysis`.
+NOTE: Although the discussion below focuses on `-Rpass`, the exact
+same options apply to `-Rpass-missed` and `-Rpass-analysis`.
 
 Since there are dozens of passes inside the compiler, each of these flags
 take a regular expression that identifies the name of the pass which should
@@ -567,7 +569,7 @@
 
 Note that remarks from the inliner are identified with `[-Rpass=inline]`.
 To request a report from every optimization pass, you should use
-:option:`-Rpass=.*` (in fact, you can use any valid POSIX regular
+`-Rpass=.*` (in fact, you can use any valid POSIX regular
 expression). However, do not expect a report from every transformation
 made by the compiler. Optimization remarks do not really make sense
 outside of the major transformations (e.g., inlining, vectorization,
@@ -585,7 +587,7 @@
 2. Some source locations are not displayed correctly. The front end has
    a more detailed source location tracking than the locations included
    in the debug info (e.g., the front end can locate code inside macro
-   expansions). However, the locations used by :option:`-Rpass` are
+   expansions). However, the locations used by `-Rpass` are
    translated from debug annotations. That translation can be lossy,
    which results in some remarks having no location information.
 
@@ -711,16 +713,20 @@
 particularly useful when writing a header file that will be compiled by
 other people, because you don't know what warning flags they build with.
 
-In the below example :option:`-Wmultichar` is ignored for only a single line of
-code, after which the diagnostics return to whatever state had previously
+In the below example :option:`-Wextra-tokens` is ignored for only a single line
+of code, after which the diagnostics return to whatever state had previously
 existed.
 
 .. code-block:: c
 
-  #pragma clang diagnostic push
-  #pragma clang diagnostic ignored "-Wmultichar"
+  #if foo
+  #endif foo // warning: extra tokens at end of #endif directive
 
-  char b = 'df'; // no warning.
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wextra-tokens"
+
+  #if foo
+  #endif foo // no warning
 
   #pragma clang diagnostic pop
 
@@ -772,13 +777,15 @@
 
 .. code-block:: c
 
-  char a = 'xy'; // warning
+  #if foo
+  #endif foo // warning: extra tokens at end of #endif directive
 
   #pragma clang system_header
 
-  char b = 'ab'; // no warning
+  #if foo
+  #endif foo // no warning
 
-The :option:`--system-header-prefix=` and :option:`--no-system-header-prefix=`
+The `--system-header-prefix=` and `--no-system-header-prefix=`
 command-line arguments can be used to override whether subsets of an include
 path are treated as system headers. When the name in a ``#include`` directive
 is found within a header search path and starts with a system prefix, the
@@ -847,7 +854,7 @@
 ^^^^^^^^^^^^^^^^^^^^^
 
 To generate a PCH file using Clang, one invokes Clang with the
-:option:`-x <language>-header` option. This mirrors the interface in GCC
+`-x <language>-header` option. This mirrors the interface in GCC
 for generating PCH files:
 
 .. code-block:: console
@@ -910,7 +917,7 @@
 Building a relocatable precompiled header requires two additional
 arguments. First, pass the ``--relocatable-pch`` flag to indicate that
 the resulting PCH file should be relocatable. Second, pass
-:option:`-isysroot /path/to/build`, which makes all includes for your library
+`-isysroot /path/to/build`, which makes all includes for your library
 relative to the build directory. For example:
 
 .. code-block:: console
@@ -920,9 +927,9 @@
 When loading the relocatable PCH file, the various headers used in the
 PCH file are found from the system header root. For example, ``mylib.h``
 can be found in ``/usr/include/mylib.h``. If the headers are installed
-in some other system root, the :option:`-isysroot` option can be used provide
+in some other system root, the `-isysroot` option can be used to provide
 a different system root from which the headers will be based. For
-example, :option:`-isysroot /Developer/SDKs/MacOSX10.4u.sdk` will look for
+example, `-isysroot /Developer/SDKs/MacOSX10.4u.sdk` will look for
 ``mylib.h`` in ``/Developer/SDKs/MacOSX10.4u.sdk/usr/include/mylib.h``.
 
 Relocatable precompiled headers are intended to be used in a limited
@@ -986,6 +993,8 @@
 
 **-f[no-]sanitize-recover=check1,check2,...**
 
+**-f[no-]sanitize-recover=all**
+
    Controls which checks enabled by ``-fsanitize=`` flag are non-fatal.
    If the check is fatal, program will halt after the first error
    of this kind is detected and error report is printed.
@@ -1053,6 +1062,25 @@
    the behavior of sanitizers in the ``cfi`` group to allow checking
    of cross-DSO virtual and indirect calls.
 
+.. option:: -ffast-math
+
+   Enable fast-math mode. This defines the ``__FAST_MATH__`` preprocessor
+   macro, and lets the compiler make aggressive, potentially-lossy assumptions
+   about floating-point math.  These include:
+
+   * Floating-point math obeys regular algebraic rules for real numbers (e.g.
+     ``+`` and ``*`` are associative, ``x/y == x * (1/y)``, and
+     ``(a + b) * c == a * c + b * c``),
+   * operands to floating-point operations are not equal to ``NaN`` and
+     ``Inf``, and
+   * ``+0`` and ``-0`` are interchangeable.
+
+.. option:: -fwhole-program-vtables
+
+   Enable whole-program vtable optimizations, such as single-implementation
+   devirtualization and virtual constant propagation, for classes with
+   :doc:`hidden LTO visibility <LTOVisibility>`. Requires ``-flto``.
+
 .. option:: -fno-assume-sane-operator-new
 
    Don't assume that the C++'s new operator is sane.
@@ -1119,6 +1147,16 @@
    This option restricts the generated code to use general registers
    only. This only applies to the AArch64 architecture.
 
+.. option:: -mcompact-branches=[values]
+
+   Control the usage of compact branches for MIPSR6.
+
+   Valid values are: ``never``, ``optimal`` and ``always``.
+   The default value is ``optimal`` which generates compact branches
+   when a delay slot cannot be filled. ``never`` disables the usage of
+   compact branches and ``always`` generates compact branches whenever
+   possible.
+
 **-f[no-]max-type-align=[number]**
    Instruct the code generator to not enforce a higher alignment than the given
    number (of bytes) when accessing memory via an opaque pointer or reference.
@@ -1434,8 +1472,13 @@
 
 2. Run the instrumented executable with inputs that reflect the typical usage.
    By default, the profile data will be written to a ``default.profraw`` file
-   in the current directory. You can override that default by setting the
-   ``LLVM_PROFILE_FILE`` environment variable to specify an alternate file.
+   in the current directory. You can override that default by using the option
+   ``-fprofile-instr-generate=`` or by setting the ``LLVM_PROFILE_FILE``
+   environment variable to specify an alternate file. If a non-default file name
+   is specified by both the environment variable and the command line option,
+   the environment variable takes precedence. The file name pattern specified
+   can include different modifiers: ``%p``, ``%h``, and ``%m``.
+
    Any instance of ``%p`` in that file name will be replaced by the process
    ID, so that you can easily distinguish the profile output from multiple
    runs.
@@ -1444,6 +1487,33 @@
 
      $ LLVM_PROFILE_FILE="code-%p.profraw" ./code
 
+   The modifier ``%h`` can be used in scenarios where the same instrumented
+   binary is run on multiple different host machines dumping profile data
+   to a shared network based storage. The ``%h`` specifier will be substituted
+   with the hostname so that profiles collected from different hosts do not
+   clobber each other.
+
+   While the use of ``%p`` specifier can reduce the likelihood for the profiles
+   dumped from different processes to clobber each other, such clobbering can still
+   happen because of the ``pid`` re-use by the OS. Another side-effect of using
+   ``%p`` is that the storage requirement for raw profile data files is greatly
+   increased.  To avoid issues like this, the ``%m`` specifier can be used in the profile
+   name.  When this specifier is used, the profiler runtime will substitute ``%m``
+   with a unique integer identifier associated with the instrumented binary. Additionally,
+   multiple raw profiles dumped from different processes that share a file system (can be
+   on different hosts) will be automatically merged by the profiler runtime during the
+   dumping. If the program links in multiple instrumented shared libraries, each library
+   will dump the profile data into its own profile data file (with its unique integer
+   id embedded in the profile name). Note that the merging enabled by ``%m`` is for raw
+   profile data generated by profiler runtime. The resulting merged "raw" profile data
+   file still needs to be converted to a different format expected by the compiler
+   (see step 3 below).
+
+   .. code-block:: console
+
+     $ LLVM_PROFILE_FILE="code-%m.profraw" ./code
+
+
 3. Combine profiles from multiple runs and convert the "raw" profile format to
    the input expected by clang. Use the ``merge`` command of the
    ``llvm-profdata`` tool to do this.
@@ -1466,37 +1536,43 @@
    profile. As you make changes to your code, clang may no longer be able to
    use the profile data. It will warn you when this happens.
 
-Profile generation and use can also be controlled by the GCC-compatible flags
-``-fprofile-generate`` and ``-fprofile-use``. Although these flags are
-semantically equivalent to their GCC counterparts, they *do not* handle
-GCC-compatible profiles. They are only meant to implement GCC's semantics
-with respect to profile creation and use.
+Profile generation using an alternative instrumentation method can be
+controlled by the GCC-compatible flags ``-fprofile-generate`` and
+``-fprofile-use``. Although these flags are semantically equivalent to
+their GCC counterparts, they *do not* handle GCC-compatible profiles.
+They are only meant to implement GCC's semantics with respect to
+profile creation and use.
 
 .. option:: -fprofile-generate[=<dirname>]
 
-  Without any other arguments, ``-fprofile-generate`` behaves identically to
-  ``-fprofile-instr-generate``. When given a directory name, it generates the
-  profile file ``default.profraw`` in the directory named ``dirname``. If
-  ``dirname`` does not exist, it will be created at runtime. The environment
-  variable ``LLVM_PROFILE_FILE`` can be used to override the directory and
-  filename for the profile file at runtime. For example,
+  The ``-fprofile-generate`` and ``-fprofile-generate=`` flags will use
+  an alternative instrumentation method for profile generation. When
+  given a directory name, it generates the profile file
+  ``default_%m.profraw`` in the directory named ``dirname``.
+  If ``dirname`` does not exist, it will be created at runtime. The ``%m`` specifier
+  will be substituted with a unique id documented in step 2 above. In other words,
+  with the ``-fprofile-generate[=<dirname>]`` option, the "raw" profile data automatic
+  merging is turned on by default, so there will no longer be any risk of profile
+  clobbering from different running processes.  For example,
 
   .. code-block:: console
 
     $ clang++ -O2 -fprofile-generate=yyy/zzz code.cc -o code
 
   When ``code`` is executed, the profile will be written to the file
-  ``yyy/zzz/default.profraw``. This can be altered at runtime via the
-  ``LLVM_PROFILE_FILE`` environment variable:
+  ``yyy/zzz/default_xxxx.profraw``.
 
-  .. code-block:: console
+  To generate the profile data file in the compiler-readable format, the
+  ``llvm-profdata`` tool can be used with the profile directory as the input:
 
-    $ LLVM_PROFILE_FILE=/tmp/myprofile/code.profraw ./code
+  .. code-block:: console
 
-  The above invocation will produce the profile file
-  ``/tmp/myprofile/code.profraw`` instead of ``yyy/zzz/default.profraw``.
-  Notice that ``LLVM_PROFILE_FILE`` overrides the directory *and* the file
-  name for the profile file.
+     $ llvm-profdata merge -output=code.profdata yyy/zzz/
+
+  If the user wants to turn off the auto-merging feature, or simply override
+  the profile dumping path specified on the command line, the environment variable
+  ``LLVM_PROFILE_FILE`` can still be used to override
+  the directory and filename for the profile file at runtime.
 
 .. option:: -fprofile-use[=<pathname>]
 
@@ -1576,7 +1652,7 @@
 
 .. option:: -ggdb, -glldb, -gsce
 
-  Tune the debug info for the ``gdb``, ``lldb``, or Sony Computer Entertainment
+  Tune the debug info for the ``gdb``, ``lldb``, or Sony PlayStation\ |reg|
   debugger, respectively. Each of these options implies **-g**. (Therefore, if
   you want both **-gline-tables-only** and debugger tuning, the tuning option
   must come first.)
@@ -1694,10 +1770,6 @@
 clang tries to be compatible with gcc as much as possible, but some gcc
 extensions are not implemented yet:
 
--  clang does not support #pragma weak (`bug
-   3679 <http://llvm.org/bugs/show_bug.cgi?id=3679>`_). Due to the uses
-   described in the bug, this is likely to be implemented at some point,
-   at least partially.
 -  clang does not support decimal floating point types (``_Decimal32`` and
    friends) or fixed-point types (``_Fract`` and friends); nobody has
    expressed interest in these features yet, so it's hard to say when
@@ -1715,9 +1787,6 @@
      ...
      local_function(1);
 
--  clang does not support global register variables; this is unlikely to
-   be implemented soon because it requires additional LLVM backend
-   support.
 -  clang does not support static initialization of flexible array
    members. This appears to be a rarely used extension, but could be
    implemented pending user demand.
@@ -1762,13 +1831,11 @@
 Microsoft extensions
 --------------------
 
-clang has some experimental support for extensions from Microsoft Visual
-C++; to enable it, use the ``-fms-extensions`` command-line option. This is
-the default for Windows targets. Note that the support is incomplete.
-Some constructs such as ``dllexport`` on classes are ignored with a warning,
-and others such as `Microsoft IDL annotations
-<http://msdn.microsoft.com/en-us/library/8tesw2eh.aspx>`_ are silently
-ignored.
+clang has support for many extensions from Microsoft Visual C++. To enable these
+extensions, use the ``-fms-extensions`` command-line option. This is the default
+for Windows targets. Clang does not implement every pragma or declspec provided
+by MSVC, but the popular ones, such as ``__declspec(dllexport)`` and ``#pragma
+comment(lib)``, are well supported.
 
 clang has a ``-fms-compatibility`` flag that makes clang accept enough
 invalid C++ to be able to parse most Microsoft headers. For example, it
@@ -1781,23 +1848,14 @@
 definitions until the end of a translation unit. This flag is enabled by
 default for Windows targets.
 
--  clang allows setting ``_MSC_VER`` with ``-fmsc-version=``. It defaults to
-   1700 which is the same as Visual C/C++ 2012. Any number is supported
-   and can greatly affect what Windows SDK and c++stdlib headers clang
-   can compile.
--  clang does not support the Microsoft extension where anonymous record
-   members can be declared using user defined typedefs.
--  clang supports the Microsoft ``#pragma pack`` feature for controlling
-   record layout. GCC also contains support for this feature, however
-   where MSVC and GCC are incompatible clang follows the MSVC
-   definition.
--  clang supports the Microsoft ``#pragma comment(lib, "foo.lib")`` feature for
-   automatically linking against the specified library.  Currently this feature
-   only works with the Visual C++ linker.
--  clang supports the Microsoft ``#pragma comment(linker, "/flag:foo")`` feature
-   for adding linker flags to COFF object files.  The user is responsible for
-   ensuring that the linker understands the flags.
--  clang defaults to C++11 for Windows targets.
+For compatibility with existing code that compiles with MSVC, clang defines the
+``_MSC_VER`` and ``_MSC_FULL_VER`` macros. These default to the values of 1800
+and 180000000 respectively, making clang look like an early release of Visual
+C++ 2013. The ``-fms-compatibility-version=`` flag overrides these values.  It
+accepts a dotted version tuple, such as 19.00.23506. Changing the MSVC
+compatibility version makes clang behave more like that version of MSVC. For
+example, ``-fms-compatibility-version=19`` will enable C++14 features and define
+``char16_t`` and ``char32_t`` as builtin types.
 
 .. _cxx:
 
@@ -1854,8 +1912,8 @@
 array sections), ``#pragma omp cancel`` and ``#pragma omp cancellation point``
 directives, and ``#pragma omp taskgroup`` directive.
 
-Use :option:`-fopenmp` to enable OpenMP. Support for OpenMP can be disabled with
-:option:`-fno-openmp`.
+Use ``-fopenmp`` to enable OpenMP. Support for OpenMP can be disabled with
+``-fno-openmp``.
 
 Controlling implementation limits
 ---------------------------------
@@ -1864,7 +1922,7 @@
 
  Controls code generation for OpenMP threadprivate variables. In presence of
  this option all threadprivate variables are generated the same way as thread
- local variables, using TLS support. If :option:`-fno-openmp-use-tls`
+ local variables, using TLS support. If ``-fno-openmp-use-tls``
  is provided or target does not support TLS, code generation for threadprivate
  variables relies on OpenMP runtime library.
 
@@ -1888,7 +1946,7 @@
 Microsoft x64 calling convention. You might need to tweak
 ``WinX86_64ABIInfo::classify()`` in lib/CodeGen/TargetInfo.cpp.
 
-For the X86 target, clang supports the :option:`-m16` command line
+For the X86 target, clang supports the ``-m16`` command line
 argument which enables 16-bit code output. This is broadly similar to
 using ``asm(".code16gcc")`` with the GNU toolchain. The generated code
 and the ABI remains 32-bit but the assembler emits instructions
@@ -2024,8 +2082,9 @@
 
 To suppress warnings about unused arguments, use the ``-Qunused-arguments`` option.
 
-Options that are not known to clang-cl will cause errors. If they are spelled with a
-leading ``/``, they will be mistaken for a filename:
+Options that are not known to clang-cl will be ignored by default. Use the
+``-Werror=unknown-argument`` option in order to treat them as errors. If these
+options are spelled with a leading ``/``, they will be mistaken for a filename:
 
   ::
 
@@ -2041,6 +2100,8 @@
     CL.EXE COMPATIBILITY OPTIONS:
       /?                     Display available options
       /arch:<value>          Set architecture for code generation
+      /Brepro-               Emit an object file which cannot be reproduced over time
+      /Brepro                Emit an object file which can be reproduced over time
       /C                     Don't discard comments when preprocessing
       /c                     Compile only
       /D <macro[=value]>     Define macro
@@ -2059,16 +2120,26 @@
       /fp:fast
       /fp:precise
       /fp:strict
+      /Fp<filename>          Set pch filename (with /Yc and /Yu)
       /GA                    Assume thread-local variables are defined in the executable
+      /Gd                    Set __cdecl as a default calling convention
       /GF-                   Disable string pooling
       /GR-                   Disable emission of RTTI data
       /GR                    Enable emission of RTTI data
+      /Gr                    Set __fastcall as a default calling convention
+      /GS-                   Disable buffer security check
+      /GS                    Enable buffer security check
       /Gs<value>             Set stack probe size
+      /Gv                    Set __vectorcall as a default calling convention
       /Gw-                   Don't put each data item in its own section
       /Gw                    Put each data item in its own section
+      /GX-                   Disable exception handling
+      /GX                    Enable exception handling
       /Gy-                   Don't put each function in its own section
       /Gy                    Put each function in its own section
+      /Gz                    Set __stdcall as a default calling convention
       /help                  Display available options
+      /imsvc <dir>           Add directory to system include search path, as if part of %INCLUDE%
       /I <dir>               Add directory to include search path
       /J                     Make char type unsigned
       /LDd                   Create debug DLL
@@ -2078,20 +2149,18 @@
       /MD                    Use DLL run-time
       /MTd                   Use static debug run-time
       /MT                    Use static run-time
-      /Ob0                   Disable inlining
       /Od                    Disable optimization
       /Oi-                   Disable use of builtin functions
       /Oi                    Enable use of builtin functions
       /Os                    Optimize for size
       /Ot                    Optimize for speed
-      /Oy-                   Disable frame pointer omission
-      /Oy                    Enable frame pointer omission
       /O<value>              Optimization level
       /o <file or directory> Set output file or directory (ends in / or \)
       /P                     Preprocess to file
       /Qvec-                 Disable the loop vectorization passes
       /Qvec                  Enable the loop vectorization passes
       /showIncludes          Print info about included files to stderr
+      /std:<value>           Language standard to compile for
       /TC                    Treat all source files as C
       /Tc <filename>         Specify a C source file
       /TP                    Treat all source files as C++
@@ -2110,10 +2179,13 @@
       /W2                    Enable -Wall
       /W3                    Enable -Wall
       /W4                    Enable -Wall and -Wextra
-      /Wall                  Enable -Wall
+      /Wall                  Enable -Wall and -Wextra
       /WX-                   Do not treat warnings as errors
       /WX                    Treat warnings as errors
       /w                     Disable all warnings
+      /Y-                    Disable precompiled headers, overrides /Yc and /Yu
+      /Yc<filename>          Generate a pch file for all code up to and including <filename>
+      /Yu<filename>          Load a pch file and use it instead of all code up to and including <filename>
       /Z7                    Enable CodeView debug information in object files
       /Zc:sizedDealloc-      Disable C++14 sized global deallocation functions
       /Zc:sizedDealloc       Enable C++14 sized global deallocation functions
@@ -2122,6 +2194,7 @@
       /Zc:threadSafeInit     Enable thread-safe initialization of static variables
       /Zc:trigraphs-         Disable trigraphs (default)
       /Zc:trigraphs          Enable trigraphs
+      /Zd                    Emit debug line number tables only
       /Zi                    Alias for /Z7. Does not produce PDBs.
       /Zl                    Don't mention any default libraries in the object file
       /Zp                    Set the default maximum struct packing alignment to 1
@@ -2138,8 +2211,10 @@
       -fms-compatibility-version=<value>
                               Dot-separated value representing the Microsoft compiler version
                               number to report in _MSC_VER (0 = don't define it (default))
-      -fmsc-version=<value>   Microsoft compiler version number to report in _MSC_VER (0 = don't
-                              define it (default))
+      -fms-compatibility      Enable full Microsoft Visual C++ compatibility
+      -fms-extensions         Accept some non-standard constructs supported by the Microsoft compiler
+      -fmsc-version=<value>   Microsoft compiler version number to report in _MSC_VER
+                              (0 = don't define it (default))
       -fno-sanitize-coverage=<value>
                               Disable specified features of coverage instrumentation for Sanitizers
       -fno-sanitize-recover=<value>
@@ -2156,6 +2231,8 @@
       -fsanitize=<check>      Turn on runtime checks for various forms of undefined or suspicious
                               behavior. See user manual for available checks
       -gcodeview              Generate CodeView debug information
+      -gline-tables-only      Emit debug line number tables only
+      -miamcu                 Use Intel MCU ABI
       -mllvm <value>          Additional arguments to forward to LLVM's option processing
       -Qunused-arguments      Don't emit warning for unused driver arguments
       -R<remark>              Enable the specified remark
diff --git a/docs/analyzer/Makefile b/docs/analyzer/Makefile
deleted file mode 100644
index 14f5e60..0000000
--- a/docs/analyzer/Makefile
+++ /dev/null
@@ -1,155 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = _build
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
-
-default: html
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	-rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ClangStaticAnalyzer.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ClangStaticAnalyzer.qhc"
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/ClangStaticAnalyzer"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ClangStaticAnalyzer"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo
-	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
-	@echo "Run \`make' in that directory to run these through makeinfo" \
-	      "(use \`make info' here to do that automatically)."
-
-info:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo "Running Texinfo files through makeinfo..."
-	make -C $(BUILDDIR)/texinfo info
-	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
-	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
-	@echo
-	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/docs/analyzer/conf.py b/docs/analyzer/conf.py
index 1514708..6b54b06 100644
--- a/docs/analyzer/conf.py
+++ b/docs/analyzer/conf.py
@@ -12,6 +12,7 @@
 # serve to show the default.
 
 import sys, os
+from datetime import date
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
@@ -41,16 +42,16 @@
 
 # General information about the project.
 project = u'Clang Static Analyzer'
-copyright = u'2013-2014, Analyzer Team'
+copyright = u'2013-%d, Analyzer Team' % date.today().year
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '3.4'
+version = '4.0'
 # The full version, including alpha/beta/rc tags.
-release = '3.4'
+release = '4.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/conf.py b/docs/conf.py
index 1e8894a..4683ca5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,9 +49,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '3.9'
+version = '4.0'
 # The full version, including alpha/beta/rc tags.
-release = '3.9'
+release = '4.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/doxygen-mainpage.dox b/docs/doxygen-mainpage.dox
new file mode 100644
index 0000000..2fd34f0
--- /dev/null
+++ b/docs/doxygen-mainpage.dox
@@ -0,0 +1,15 @@
+/// \mainpage clang
+///
+/// \section main_intro Introduction
+/// Welcome to the clang project.
+///
+/// This documentation describes the **internal** software that makes
+/// up clang, not the **external** use of clang. There are no instructions
+/// here on how to use clang, only the APIs that make up the software. For
+/// usage instructions, please see the programmer's guide or reference
+/// manual.
+///
+/// \section main_caveat Caveat
+/// This documentation is generated directly from the source code with doxygen.
+/// Since clang is constantly under active development, what you're about to
+/// read is out of date!
diff --git a/docs/doxygen.cfg.in b/docs/doxygen.cfg.in
index f6c7cba..c96ab49 100644
--- a/docs/doxygen.cfg.in
+++ b/docs/doxygen.cfg.in
@@ -745,7 +745,7 @@
 
 INPUT                  = @abs_srcdir@/../include \
                          @abs_srcdir@/../lib \
-                         @abs_srcdir@/doxygen.intro
+                         @abs_srcdir@/doxygen-mainpage.dox
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1791,18 +1791,6 @@
 
 XML_OUTPUT             = xml
 
-# The XML_SCHEMA tag can be used to specify a XML schema, which can be used by a
-# validating XML parser to check the syntax of the XML files.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify a XML DTD, which can be used by a
-# validating XML parser to check the syntax of the XML files.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_DTD                =
-
 # If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
 # listings (including syntax highlighting and cross-referencing information) to
 # the XML output. Note that enabling this will significantly increase the size
@@ -1949,7 +1937,7 @@
 EXPAND_AS_DEFINED      =
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all refrences to function-like macros that are alone on a line, have an
+# remove all references to function-like macros that are alone on a line, have an
 # all uppercase name, and do not end with a semicolon. Such function macros are
 # typically used for boiler-plate code, and will confuse the parser if not
 # removed.
diff --git a/docs/doxygen.intro b/docs/doxygen.intro
deleted file mode 100644
index accab72..0000000
--- a/docs/doxygen.intro
+++ /dev/null
@@ -1,15 +0,0 @@
-/// @mainpage clang
-///
-/// @section main_intro Introduction
-/// Welcome to the clang project.
-///
-/// This documentation describes the @b internal software that makes 
-/// up clang, not the @b external use of clang. There are no instructions
-/// here on how to use clang, only the APIs that make up the software. For 
-/// usage instructions, please see the programmer's guide or reference 
-/// manual.
-///
-/// @section main_caveat Caveat 
-/// This documentation is generated directly from the source code with doxygen. 
-/// Since clang is constantly under active development, what you're about to
-/// read is out of date!
diff --git a/docs/index.rst b/docs/index.rst
index 81a15b8..2960547 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -31,7 +31,9 @@
    SanitizerStats
    SanitizerSpecialCaseList
    ControlFlowIntegrity
+   LTOVisibility
    SafeStack
+   SourceBasedCodeCoverage
    Modules
    MSVCCompatibility
    CommandGuide/index
@@ -76,6 +78,7 @@
    DriverInternals
    PTHInternals
    PCHInternals
+   ItaniumMangleAbiTags
 
 
 Indices and tables
diff --git a/docs/tools/dump_ast_matchers.py b/docs/tools/dump_ast_matchers.py
old mode 100644
new mode 100755
index 22b09e2..4554040
--- a/docs/tools/dump_ast_matchers.py
+++ b/docs/tools/dump_ast_matchers.py
@@ -47,7 +47,7 @@
       except:
         doxygen_probes[url] = False
     if doxygen_probes[url]:
-      return r'Matcher&lt<a href="%s">%s</a>&gt;' % (url, name)
+      return r'Matcher&lt;<a href="%s">%s</a>&gt;' % (url, name)
     else:
       return m.group(0)
   text = re.sub(
@@ -83,6 +83,11 @@
   """Returns the given comment without \-escaped words."""
   # If there is only a doxygen keyword in the line, delete the whole line.
   comment = re.sub(r'^\\[^\s]+\n', r'', comment, flags=re.M)
+  
+  # If there is a doxygen \see command, change the \see prefix into "See also:".
+  # FIXME: it would be better to turn this into a link to the target instead.
+  comment = re.sub(r'\\see', r'See also:', comment)
+  
   # Delete the doxygen command and the following whitespace.
   comment = re.sub(r'\\[^\s]+\s+', r'', comment)
   return comment
@@ -90,7 +95,7 @@
 def unify_arguments(args):
   """Gets rid of anything the user doesn't care about in the argument list."""
   args = re.sub(r'internal::', r'', args)
-  args = re.sub(r'const\s+', r'', args)
+  args = re.sub(r'const\s+(.*)&', r'\1 ', args)
   args = re.sub(r'&', r' ', args)
   args = re.sub(r'(^|\s)M\d?(\s)', r'\1Matcher<*>\2', args)
   return args
@@ -226,7 +231,7 @@
     m = re.match(r"""^\s*AST_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
                        (?:\s*([^\s,]+)\s*,)?
                           \s*([^\s,]+)\s*
-                       (?:,\s*([^\s,]+)\s*
+                       (?:,\s*([^,]+)\s*
                           ,\s*([^\s,]+)\s*)?
                        (?:,\s*([^\s,]+)\s*
                           ,\s*([^\s,]+)\s*)?
@@ -259,6 +264,16 @@
       add_matcher('*', name, 'Matcher<*>', comment)
       return
 
+    # Parse Variadic functions.
+    m = re.match(
+        r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s*
+              ([a-zA-Z]*)\s*=\s*{.*};$""",
+        declaration, flags=re.X)
+    if m:
+      result, arg, name = m.groups()[:3]
+      add_matcher(result, name, '%s, ..., %s' % (arg, arg), comment)
+      return
+
     # Parse Variadic operator matchers.
     m = re.match(
         r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s>]+)\s*>\s*
diff --git a/docs/tools/dump_format_style.py b/docs/tools/dump_format_style.py
index b61d201..6e14939 100755
--- a/docs/tools/dump_format_style.py
+++ b/docs/tools/dump_format_style.py
@@ -4,11 +4,13 @@
 # Run from the directory in which this file is located to update the docs.
 
 import collections
+import os
 import re
 import urllib2
 
-FORMAT_STYLE_FILE = '../../include/clang/Format/Format.h'
-DOC_FILE = '../ClangFormatStyleOptions.rst'
+CLANG_DIR = os.path.join(os.path.dirname(__file__), '../..')
+FORMAT_STYLE_FILE = os.path.join(CLANG_DIR, 'include/clang/Format/Format.h')
+DOC_FILE = os.path.join(CLANG_DIR, 'docs/ClangFormatStyleOptions.rst')
 
 
 def substitute(text, tag, contents):
@@ -77,7 +79,7 @@
 class EnumValue:
   def __init__(self, name, comment):
     self.name = name
-    self.comment = comment.strip()
+    self.comment = comment
 
   def __str__(self):
     return '* ``%s`` (in configuration: ``%s``)\n%s' % (
@@ -86,8 +88,12 @@
         doxygen2rst(indent(self.comment, 2)))
 
 def clean_comment_line(line):
-  if line == '/// \\code':
-    return '\n.. code-block:: c++\n\n'
+  # "/// \code" opens an RST code block; an optional doxygen "{.lang}" suffix
+  # (e.g. "\code{.java}") selects the highlighting language, defaulting to c++.
+  match = re.match(r'^/// \\code(?:\{\.(\w+)\})?$', line)
+  if match:
+    lang = match.group(1) or 'c++'
+    return '\n.. code-block:: %s\n\n' % lang
   if line == '/// \\endcode':
     return ''
   return line[4:] + '\n'
diff --git a/examples/AnnotateFunctions/AnnotateFunctions.cpp b/examples/AnnotateFunctions/AnnotateFunctions.cpp
new file mode 100644
index 0000000..375f18f
--- /dev/null
+++ b/examples/AnnotateFunctions/AnnotateFunctions.cpp
@@ -0,0 +1,88 @@
+//===- AnnotateFunctions.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Example clang plugin which adds an annotation to every function in
+// translation units that start with #pragma enable_annotate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Frontend/FrontendPluginRegistry.h"
+#include "clang/AST/AST.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/LexDiagnostic.h"
+using namespace clang;
+
+namespace {
+
+static bool EnableAnnotate = false;
+static bool HandledDecl = false;
+
+class AnnotateFunctionsConsumer : public ASTConsumer {
+public:
+  bool HandleTopLevelDecl(DeclGroupRef DG) override {
+    HandledDecl = true;
+    if (!EnableAnnotate)
+      return true;
+    for (auto D : DG)
+      if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
+        FD->addAttr(AnnotateAttr::CreateImplicit(FD->getASTContext(),
+                                                 "example_annotation"));
+    return true;
+  }
+};
+
+class AnnotateFunctionsAction : public PluginASTAction {
+public:
+  std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
+                                                 llvm::StringRef) override {
+    return llvm::make_unique<AnnotateFunctionsConsumer>();
+  }
+
+  bool ParseArgs(const CompilerInstance &CI,
+                 const std::vector<std::string> &args) override {
+    return true;
+  }
+
+  PluginASTAction::ActionType getActionType() override {
+    return AddBeforeMainAction;
+  }
+};
+
+class PragmaAnnotateHandler : public PragmaHandler {
+public:
+  PragmaAnnotateHandler() : PragmaHandler("enable_annotate") { }
+
+  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
+                    Token &PragmaTok) override {
+
+    Token Tok;
+    PP.LexUnexpandedToken(Tok);
+    if (Tok.isNot(tok::eod))
+      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";
+
+    if (HandledDecl) {
+      DiagnosticsEngine &D = PP.getDiagnostics();
+      unsigned ID = D.getCustomDiagID(
+        DiagnosticsEngine::Error,
+        "#pragma enable_annotate not allowed after declarations");
+      D.Report(PragmaTok.getLocation(), ID);
+    }
+
+    EnableAnnotate = true;
+  }
+};
+
+}
+
+static FrontendPluginRegistry::Add<AnnotateFunctionsAction>
+X("annotate-fns", "annotate functions");
+
+static PragmaHandlerRegistry::Add<PragmaAnnotateHandler>
+Y("enable_annotate","enable annotation");
diff --git a/examples/AnnotateFunctions/CMakeLists.txt b/examples/AnnotateFunctions/CMakeLists.txt
new file mode 100644
index 0000000..5684abf
--- /dev/null
+++ b/examples/AnnotateFunctions/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_llvm_loadable_module(AnnotateFunctions AnnotateFunctions.cpp PLUGIN_TOOL clang)
+
+if(LLVM_ENABLE_PLUGINS AND (WIN32 OR CYGWIN))
+  target_link_libraries(AnnotateFunctions PRIVATE
+    clangAST
+    clangBasic
+    clangFrontend
+    clangLex
+    LLVMSupport
+    )
+endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5d4b5fc..8c26548 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -8,3 +8,4 @@
 endif()
 add_subdirectory(clang-interpreter)
 add_subdirectory(PrintFunctionNames)
+add_subdirectory(AnnotateFunctions)
diff --git a/examples/Makefile b/examples/Makefile
deleted file mode 100644
index d8d9028..0000000
--- a/examples/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- examples/Makefile -----------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ..
-
-PARALLEL_DIRS := analyzer-plugin clang-interpreter PrintFunctionNames
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/examples/PrintFunctionNames/CMakeLists.txt b/examples/PrintFunctionNames/CMakeLists.txt
index e700281..f5f8188 100644
--- a/examples/PrintFunctionNames/CMakeLists.txt
+++ b/examples/PrintFunctionNames/CMakeLists.txt
@@ -9,10 +9,10 @@
   endif()
 endif()
 
-add_llvm_loadable_module(PrintFunctionNames PrintFunctionNames.cpp)
+add_llvm_loadable_module(PrintFunctionNames PrintFunctionNames.cpp PLUGIN_TOOL clang)
 
 if(LLVM_ENABLE_PLUGINS AND (WIN32 OR CYGWIN))
-  target_link_libraries(PrintFunctionNames ${cmake_2_8_12_PRIVATE}
+  target_link_libraries(PrintFunctionNames PRIVATE
     clangAST
     clangBasic
     clangFrontend
diff --git a/examples/PrintFunctionNames/Makefile b/examples/PrintFunctionNames/Makefile
deleted file mode 100644
index 5865098..0000000
--- a/examples/PrintFunctionNames/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-##===- examples/PrintFunctionNames/Makefile ----------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME = PrintFunctionNames
-
-# If we don't need RTTI or EH, there's no reason to export anything
-# from the plugin.
-ifneq ($(REQUIRES_RTTI), 1)
-ifneq ($(REQUIRES_EH), 1)
-EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/PrintFunctionNames.exports
-endif
-endif
-
-LINK_LIBS_IN_SHARED = 0
-LOADABLE_MODULE = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-ifeq ($(OS),Darwin)
-  LDFLAGS=-Wl,-undefined,dynamic_lookup
-endif
diff --git a/examples/analyzer-plugin/CMakeLists.txt b/examples/analyzer-plugin/CMakeLists.txt
index 1788d6c..0d5b275 100644
--- a/examples/analyzer-plugin/CMakeLists.txt
+++ b/examples/analyzer-plugin/CMakeLists.txt
@@ -1,7 +1,8 @@
-add_llvm_loadable_module(SampleAnalyzerPlugin MainCallChecker.cpp)
+set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/SampleAnalyzerPlugin.exports)
+add_llvm_loadable_module(SampleAnalyzerPlugin MainCallChecker.cpp PLUGIN_TOOL clang)
 
 if(LLVM_ENABLE_PLUGINS AND (WIN32 OR CYGWIN))
-  target_link_libraries(SampleAnalyzerPlugin ${cmake_2_8_12_PRIVATE}
+  target_link_libraries(SampleAnalyzerPlugin PRIVATE
     clangAnalysis
     clangAST
     clangStaticAnalyzerCore
diff --git a/examples/analyzer-plugin/Makefile b/examples/analyzer-plugin/Makefile
deleted file mode 100644
index 8b83bef..0000000
--- a/examples/analyzer-plugin/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- examples/analyzer-plugin/Makefile -------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME = SampleAnalyzerPlugin
-
-LINK_LIBS_IN_SHARED = 0
-LOADABLE_MODULE = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-ifeq ($(OS),Darwin)
-  LDFLAGS=-Wl,-undefined,dynamic_lookup
-endif
diff --git a/examples/analyzer-plugin/SampleAnalyzerPlugin.exports b/examples/analyzer-plugin/SampleAnalyzerPlugin.exports
new file mode 100644
index 0000000..8d9ff88
--- /dev/null
+++ b/examples/analyzer-plugin/SampleAnalyzerPlugin.exports
@@ -0,0 +1,2 @@
+clang_registerCheckers
+clang_analyzerAPIVersionString
diff --git a/examples/clang-interpreter/Makefile b/examples/clang-interpreter/Makefile
deleted file mode 100644
index 2eff90b..0000000
--- a/examples/clang-interpreter/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-##===- examples/clang-interpreter/Makefile -----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-
-TOOLNAME = clang-interpreter
-NO_INSTALL = 1
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-LINK_COMPONENTS := mcjit interpreter nativecodegen bitreader bitwriter irreader \
-	ipo linker selectiondag asmparser instrumentation objcarcopts option
-USEDLIBS = clangFrontend.a clangSerialization.a clangDriver.a clangCodeGen.a \
-           clangParse.a clangSema.a clangStaticAnalyzerFrontend.a \
-           clangStaticAnalyzerCheckers.a clangStaticAnalyzerCore.a \
-           clangAnalysis.a clangRewrite.a clangRewriteFrontend.a \
-           clangEdit.a clangAST.a clangLex.a clangBasic.a LLVMCore.a \
-           LLVMExecutionEngine.a LLVMMC.a LLVMMCJIT.a LLVMRuntimeDyld.a \
-           LLVMObject.a LLVMSupport.a LLVMProfileData.a
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/include/Makefile b/include/Makefile
deleted file mode 100644
index 79b9adf..0000000
--- a/include/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-CLANG_LEVEL := ..
-DIRS := clang clang-c
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h
index 73cf02c..170e4e7 100644
--- a/include/clang-c/Index.h
+++ b/include/clang-c/Index.h
@@ -32,7 +32,7 @@
  * compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable.
  */
 #define CINDEX_VERSION_MAJOR 0
-#define CINDEX_VERSION_MINOR 34
+#define CINDEX_VERSION_MINOR 35
 
 #define CINDEX_VERSION_ENCODE(major, minor) ( \
       ((major) * 10000)                       \
@@ -326,7 +326,7 @@
  *
  * \param tu the translation unit
  *
- * \param file_name the name of the file.
+ * \param file_name the name of the file.
  *
  * \returns the file handle for the named file in the translation unit \p tu,
  * or a NULL file handle if the file was not a part of this translation unit.
@@ -1932,7 +1932,7 @@
    */
   CXCursor_CXXDeleteExpr                 = 135,
 
-  /** \brief A unary expression.
+  /** \brief A unary expression. (noexcept, sizeof, or other traits)
    */
   CXCursor_UnaryExpr                     = 136,
 
@@ -2014,7 +2014,11 @@
    */
   CXCursor_OMPArraySectionExpr           = 147,
 
-  CXCursor_LastExpr                      = CXCursor_OMPArraySectionExpr,
+  /** \brief Represents an @available(...) check.
+   */
+  CXCursor_ObjCAvailabilityCheckExpr     = 148,
+
+  CXCursor_LastExpr                      = CXCursor_ObjCAvailabilityCheckExpr,
 
   /* Statements */
   CXCursor_FirstStmt                     = 200,
@@ -2281,11 +2285,55 @@
    */
   CXCursor_OMPTaskLoopSimdDirective      = 259,
 
-   /** \brief OpenMP distribute directive.
+  /** \brief OpenMP distribute directive.
    */
   CXCursor_OMPDistributeDirective        = 260,
 
-  CXCursor_LastStmt                      = CXCursor_OMPDistributeDirective,
+  /** \brief OpenMP target enter data directive.
+   */
+  CXCursor_OMPTargetEnterDataDirective   = 261,
+
+  /** \brief OpenMP target exit data directive.
+   */
+  CXCursor_OMPTargetExitDataDirective    = 262,
+
+  /** \brief OpenMP target parallel directive.
+   */
+  CXCursor_OMPTargetParallelDirective    = 263,
+
+  /** \brief OpenMP target parallel for directive.
+   */
+  CXCursor_OMPTargetParallelForDirective = 264,
+
+  /** \brief OpenMP target update directive.
+   */
+  CXCursor_OMPTargetUpdateDirective      = 265,
+
+  /** \brief OpenMP distribute parallel for directive.
+   */
+  CXCursor_OMPDistributeParallelForDirective = 266,
+
+  /** \brief OpenMP distribute parallel for simd directive.
+   */
+  CXCursor_OMPDistributeParallelForSimdDirective = 267,
+
+  /** \brief OpenMP distribute simd directive.
+   */
+  CXCursor_OMPDistributeSimdDirective = 268,
+
+  /** \brief OpenMP target parallel for simd directive.
+   */
+  CXCursor_OMPTargetParallelForSimdDirective = 269,
+
+  /** \brief OpenMP target simd directive.
+   */
+  CXCursor_OMPTargetSimdDirective = 270,
+
+  /** \brief OpenMP teams distribute directive.
+   */
+  CXCursor_OMPTeamsDistributeDirective = 271,
+
+  CXCursor_LastStmt = CXCursor_OMPTeamsDistributeDirective,
 
   /**
    * \brief Cursor that represents the translation unit itself.
@@ -2339,8 +2387,12 @@
    */
   CXCursor_ModuleImportDecl              = 600,
   CXCursor_TypeAliasTemplateDecl         = 601,
+  /**
+   * \brief A static_assert or _Static_assert node
+   */
+  CXCursor_StaticAssert                  = 602,
   CXCursor_FirstExtraDecl                = CXCursor_ModuleImportDecl,
-  CXCursor_LastExtraDecl                 = CXCursor_TypeAliasTemplateDecl,
+  CXCursor_LastExtraDecl                 = CXCursor_StaticAssert,
 
   /**
    * \brief A code completion overload candidate.
@@ -2913,6 +2965,7 @@
   CXType_ObjCId = 27,
   CXType_ObjCClass = 28,
   CXType_ObjCSel = 29,
+  CXType_Float128 = 30,
   CXType_FirstBuiltin = CXType_Void,
   CXType_LastBuiltin  = CXType_ObjCSel,
 
@@ -2934,7 +2987,14 @@
   CXType_VariableArray = 115,
   CXType_DependentSizedArray = 116,
   CXType_MemberPointer = 117,
-  CXType_Auto = 118
+  CXType_Auto = 118,
+
+  /**
+   * \brief Represents a type that was referred to using an elaborated type keyword.
+   *
+   * E.g., struct S, or via a qualified name, e.g., N::M::type, or both.
+   */
+  CXType_Elaborated = 119
 };
 
 /**
@@ -3324,6 +3384,13 @@
 CINDEX_LINKAGE long long clang_getArraySize(CXType T);
 
 /**
+ * \brief Retrieve the type named by the qualified-id.
+ *
+ * If a non-elaborated type is passed in, an invalid type is returned.
+ */
+CINDEX_LINKAGE CXType clang_Type_getNamedType(CXType T);
+
+/**
  * \brief List the possible error codes for \c clang_Type_getSizeOf,
  *   \c clang_Type_getAlignOf, \c clang_Type_getOffsetOf and
  *   \c clang_Cursor_getOffsetOf.
@@ -4059,11 +4126,36 @@
  */
 
 /**
+ * \brief Determine if a C++ constructor is a converting constructor.
+ */
+CINDEX_LINKAGE unsigned clang_CXXConstructor_isConvertingConstructor(CXCursor C);
+
+/**
+ * \brief Determine if a C++ constructor is a copy constructor.
+ */
+CINDEX_LINKAGE unsigned clang_CXXConstructor_isCopyConstructor(CXCursor C);
+
+/**
+ * \brief Determine if a C++ constructor is the default constructor.
+ */
+CINDEX_LINKAGE unsigned clang_CXXConstructor_isDefaultConstructor(CXCursor C);
+
+/**
+ * \brief Determine if a C++ constructor is a move constructor.
+ */
+CINDEX_LINKAGE unsigned clang_CXXConstructor_isMoveConstructor(CXCursor C);
+
+/**
  * \brief Determine if a C++ field is declared 'mutable'.
  */
 CINDEX_LINKAGE unsigned clang_CXXField_isMutable(CXCursor C);
 
 /**
+ * \brief Determine if a C++ method is declared '= default'.
+ */
+CINDEX_LINKAGE unsigned clang_CXXMethod_isDefaulted(CXCursor C);
+
+/**
  * \brief Determine if a C++ member function or member function template is
  * pure virtual.
  */
@@ -4943,7 +5035,7 @@
  * Note that the column should point just after the syntactic construct that
  * initiated code completion, and not in the middle of a lexical token.
  *
- * \param unsaved_files the Tiles that have not yet been saved to disk
+ * \param unsaved_files the Files that have not yet been saved to disk
  * but may be required for parsing or code completion, including the
  * contents of those files.  The contents and name of these files (as
  * specified by CXUnsavedFile) are copied when necessary, so the
@@ -5246,7 +5338,7 @@
   CXVisit_Continue
 };
 
-typedef struct {
+typedef struct CXCursorAndRangeVisitor {
   void *context;
   enum CXVisitorResult (*visit)(void *context, CXCursor, CXSourceRange);
 } CXCursorAndRangeVisitor;
diff --git a/include/clang-c/Makefile b/include/clang-c/Makefile
deleted file mode 100644
index b29e29e..0000000
--- a/include/clang-c/Makefile
+++ /dev/null
@@ -1,38 +0,0 @@
-CLANG_LEVEL := ../..
-DIRS :=
-
-include $(CLANG_LEVEL)/Makefile
-
-IntIncludeDir = $(DESTDIR)$(PROJ_internal_prefix)/include
-
-install-local::
-	$(Echo) Installing Clang C API include files
-	$(Verb) $(MKDIR) $(IntIncludeDir)
-	$(Verb) if test -d "$(PROJ_SRC_DIR)" ; then \
-	  cd $(PROJ_SRC_DIR)/.. && \
-	  for  hdr in `find clang-c -type f '!' '(' -name '*~' \
-	      -o -name '.#*' -o -name '*.in' -o -name '*.txt' \
-	      -o -name 'Makefile' -o -name '*.td' ')' -print \
-              | grep -v CVS | grep -v .svn | grep -v .dir` ; do \
-	    instdir=`dirname "$(IntIncludeDir)/$$hdr"` ; \
-	    if test \! -d "$$instdir" ; then \
-	      $(EchoCmd) Making install directory $$instdir ; \
-	      $(MKDIR) $$instdir ;\
-	    fi ; \
-	    $(DataInstall) $$hdr $(IntIncludeDir)/$$hdr ; \
-	  done ; \
-	fi
-ifneq ($(PROJ_SRC_ROOT),$(PROJ_OBJ_ROOT))
-	$(Verb) if test -d "$(PROJ_OBJ_ROOT)/tools/clang/include/clang-c" ; then \
-	  cd $(PROJ_OBJ_ROOT)/tools/clang/include && \
-	  for hdr in `find clang-c -type f '!' '(' -name 'Makefile' ')' -print \
-            | grep -v CVS | grep -v .tmp | grep -v .dir` ; do \
-	    instdir=`dirname "$(IntIncludeDir)/$$hdr"` ; \
-	    if test \! -d "$$instdir" ; then \
-	      $(EchoCmd) Making install directory $$instdir ; \
-	      $(MKDIR) $$instdir ;\
-	    fi ; \
-	    $(DataInstall) $$hdr $(IntIncludeDir)/$$hdr ; \
-	  done ; \
-	fi
-endif
diff --git a/include/clang/AST/ASTConsumer.h b/include/clang/AST/ASTConsumer.h
index 02f64a6..ad368c8 100644
--- a/include/clang/AST/ASTConsumer.h
+++ b/include/clang/AST/ASTConsumer.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_CLANG_AST_ASTCONSUMER_H
 #define LLVM_CLANG_AST_ASTCONSUMER_H
 
-#include "llvm/ADT/StringRef.h"
-
 namespace clang {
   class ASTContext;
   class CXXMethodDecl;
@@ -55,9 +53,9 @@
   /// \returns true to continue parsing, or false to abort parsing.
   virtual bool HandleTopLevelDecl(DeclGroupRef D);
 
-  /// \brief This callback is invoked each time an inline method definition is
-  /// completed.
-  virtual void HandleInlineMethodDefinition(CXXMethodDecl *D) {}
+  /// \brief This callback is invoked each time an inline (method or friend)
+  /// function definition in a class is completed.
+  virtual void HandleInlineFunctionDefinition(FunctionDecl *D) {}
 
   /// HandleInterestingDecl - Handle the specified interesting declaration. This
   /// is called by the AST reader when deserializing things that might interest
@@ -94,22 +92,6 @@
   /// The default implementation passes it to HandleTopLevelDecl.
   virtual void HandleImplicitImportDecl(ImportDecl *D);
 
-  /// \brief Handle a pragma or command line flag that appends to Linker
-  /// Options.  This exists to support Microsoft's
-  /// #pragma comment(linker, "/foo") and the frontend flag --linker-option=.
-  virtual void HandleLinkerOption(llvm::StringRef Opts) {}
-
-  /// \brief Handle a pragma that emits a mismatch identifier and value to the
-  /// object file for the linker to work with.  Currently, this only exists to
-  /// support Microsoft's #pragma detect_mismatch.
-  virtual void HandleDetectMismatch(llvm::StringRef Name,
-                                    llvm::StringRef Value) {}
-
-  /// \brief Handle a dependent library created by a pragma in the source.
-  /// Currently this only exists to support Microsoft's
-  /// #pragma comment(lib, "/foo").
-  virtual void HandleDependentLibrary(llvm::StringRef Lib) {}
-
   /// CompleteTentativeDefinition - Callback invoked at the end of a translation
   /// unit to notify the consumer that the given tentative definition should be
   /// completed.
@@ -121,6 +103,10 @@
   /// modified by the introduction of an implicit zero initializer.
   virtual void CompleteTentativeDefinition(VarDecl *D) {}
 
+  /// \brief Callback invoked when an MSInheritanceAttr has been attached to a
+  /// CXXRecordDecl.
+  virtual void AssignInheritanceModel(CXXRecordDecl *RD) {}
+
   /// HandleCXXStaticMemberVarInstantiation - Tell the consumer that this
   // variable has been instantiated.
   virtual void HandleCXXStaticMemberVarInstantiation(VarDecl *D) {}
diff --git a/include/clang/AST/ASTContext.h b/include/clang/AST/ASTContext.h
index fb5a64f..45127ac 100644
--- a/include/clang/AST/ASTContext.h
+++ b/include/clang/AST/ASTContext.h
@@ -36,6 +36,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Support/Allocator.h"
@@ -128,6 +129,8 @@
   llvm::FoldingSet<PackExpansionType> PackExpansionTypes;
   mutable llvm::FoldingSet<ObjCObjectTypeImpl> ObjCObjectTypes;
   mutable llvm::FoldingSet<ObjCObjectPointerType> ObjCObjectPointerTypes;
+  mutable llvm::FoldingSet<DependentUnaryTransformType>
+    DependentUnaryTransformTypes;
   mutable llvm::FoldingSet<AutoType> AutoTypes;
   mutable llvm::FoldingSet<AtomicType> AtomicTypes;
   llvm::FoldingSet<AttributedType> AttributedTypes;
@@ -212,9 +215,6 @@
   /// \brief The typedef for the __uint128_t type.
   mutable TypedefDecl *UInt128Decl;
 
-  /// \brief The typedef for the __float128 stub type.
-  mutable TypeDecl *Float128StubDecl;
-  
   /// \brief The typedef for the target specific predefined
   /// __builtin_va_list type.
   mutable TypedefDecl *BuiltinVaListDecl;
@@ -243,6 +243,9 @@
   QualType ObjCClassRedefinitionType;
   QualType ObjCSelRedefinitionType;
 
+  /// The identifier 'bool'.
+  mutable IdentifierInfo *BoolName = nullptr;
+
   /// The identifier 'NSObject'.
   IdentifierInfo *NSObjectName = nullptr;
 
@@ -252,6 +255,9 @@
   /// The identifier '__make_integer_seq'.
   mutable IdentifierInfo *MakeIntegerSeqName = nullptr;
 
+  /// The identifier '__type_pack_element'.
+  mutable IdentifierInfo *TypePackElementName = nullptr;
+
   QualType ObjCConstantStringType;
   mutable RecordDecl *CFConstantStringTagDecl;
   mutable TypedefDecl *CFConstantStringTypeDecl;
@@ -306,6 +312,24 @@
   /// definitions of that entity.
   llvm::DenseMap<NamedDecl*, llvm::TinyPtrVector<Module*>> MergedDefModules;
 
+  /// \brief Initializers for a module, in order. Each Decl will be either
+  /// something that has a semantic effect on startup (such as a variable with
+  /// a non-constant initializer), or an ImportDecl (which recursively triggers
+  /// initialization of another module).
+  struct PerModuleInitializers {
+    llvm::SmallVector<Decl*, 4> Initializers;
+    llvm::SmallVector<uint32_t, 4> LazyInitializers;
+
+    void resolve(ASTContext &Ctx);
+  };
+  llvm::DenseMap<Module*, PerModuleInitializers*> ModuleInitializers;
+
+  /// Diagnostics that are emitted if and only if the given function is
+  /// codegen'ed.  Access these through FunctionDecl::addDeferredDiag() and
+  /// FunctionDecl::takeDeferredDiags().
+  llvm::DenseMap<const FunctionDecl *, std::vector<PartialDiagnosticAt>>
+      DeferredDiags;
+
 public:
   /// \brief A type synonym for the TemplateOrInstantiation mapping.
   typedef llvm::PointerUnion<VarTemplateDecl *, MemberSpecializationInfo *>
@@ -393,8 +417,8 @@
 
   /// \brief Side-table of mangling numbers for declarations which rarely
   /// need them (like static local vars).
-  llvm::DenseMap<const NamedDecl *, unsigned> MangleNumbers;
-  llvm::DenseMap<const VarDecl *, unsigned> StaticLocalNumbers;
+  llvm::MapVector<const NamedDecl *, unsigned> MangleNumbers;
+  llvm::MapVector<const VarDecl *, unsigned> StaticLocalNumbers;
 
   /// \brief Mapping that stores parameterIndex values for ParmVarDecls when
   /// that value exceeds the bitfield size of ParmVarDeclBits.ParameterIndex.
@@ -407,6 +431,7 @@
   TranslationUnitDecl *TUDecl;
   mutable ExternCContextDecl *ExternCContext;
   mutable BuiltinTemplateDecl *MakeIntegerSeqDecl;
+  mutable BuiltinTemplateDecl *TypePackElementDecl;
 
   /// \brief The associated SourceManager object.a
   SourceManager &SourceMgr;
@@ -578,6 +603,11 @@
     return DiagAllocator;
   }
 
+  decltype(DeferredDiags) &getDeferredDiags() { return DeferredDiags; }
+  const decltype(DeferredDiags) &getDeferredDiags() const {
+    return DeferredDiags;
+  }
+
   const TargetInfo &getTargetInfo() const { return *Target; }
   const TargetInfo *getAuxTargetInfo() const { return AuxTarget; }
 
@@ -818,6 +848,9 @@
   overridden_methods_end(const CXXMethodDecl *Method) const;
 
   unsigned overridden_methods_size(const CXXMethodDecl *Method) const;
+  typedef llvm::iterator_range<overridden_cxx_method_iterator>
+      overridden_method_range;
+  overridden_method_range overridden_methods(const CXXMethodDecl *Method) const;
 
   /// \brief Note that the given C++ \p Method overrides the given \p
   /// Overridden method.
@@ -873,10 +906,22 @@
     return MergedIt->second;
   }
 
+  /// Add a declaration to the list of declarations that are initialized
+  /// for a module. This will typically be a global variable (with internal
+  /// linkage) that runs module initializers, such as the iostream initializer,
+  /// or an ImportDecl nominating another module that has initializers.
+  void addModuleInitializer(Module *M, Decl *Init);
+
+  void addLazyModuleInitializers(Module *M, ArrayRef<uint32_t> IDs);
+
+  /// Get the initializations to perform when importing a module, if any.
+  ArrayRef<Decl*> getModuleInitializers(Module *M);
+
   TranslationUnitDecl *getTranslationUnitDecl() const { return TUDecl; }
 
   ExternCContextDecl *getExternCContextDecl() const;
   BuiltinTemplateDecl *getMakeIntegerSeqDecl() const;
+  BuiltinTemplateDecl *getTypePackElementDecl() const;
 
   // Builtin Types.
   CanQualType VoidTy;
@@ -890,20 +935,19 @@
   CanQualType SignedCharTy, ShortTy, IntTy, LongTy, LongLongTy, Int128Ty;
   CanQualType UnsignedCharTy, UnsignedShortTy, UnsignedIntTy, UnsignedLongTy;
   CanQualType UnsignedLongLongTy, UnsignedInt128Ty;
-  CanQualType FloatTy, DoubleTy, LongDoubleTy;
+  CanQualType FloatTy, DoubleTy, LongDoubleTy, Float128Ty;
   CanQualType HalfTy; // [OpenCL 6.1.1.1], ARM NEON
   CanQualType FloatComplexTy, DoubleComplexTy, LongDoubleComplexTy;
+  CanQualType Float128ComplexTy;
   CanQualType VoidPtrTy, NullPtrTy;
   CanQualType DependentTy, OverloadTy, BoundMemberTy, UnknownAnyTy;
   CanQualType BuiltinFnTy;
   CanQualType PseudoObjectTy, ARCUnbridgedCastTy;
   CanQualType ObjCBuiltinIdTy, ObjCBuiltinClassTy, ObjCBuiltinSelTy;
   CanQualType ObjCBuiltinBoolTy;
-  CanQualType OCLImage1dTy, OCLImage1dArrayTy, OCLImage1dBufferTy;
-  CanQualType OCLImage2dTy, OCLImage2dArrayTy, OCLImage2dDepthTy;
-  CanQualType OCLImage2dArrayDepthTy, OCLImage2dMSAATy, OCLImage2dArrayMSAATy;
-  CanQualType OCLImage2dMSAADepthTy, OCLImage2dArrayMSAADepthTy;
-  CanQualType OCLImage3dTy;
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  CanQualType SingletonId;
+#include "clang/Basic/OpenCLImageTypes.def"
   CanQualType OCLSamplerTy, OCLEventTy, OCLClkEventTy;
   CanQualType OCLQueueTy, OCLNDRangeTy, OCLReserveIDTy;
   CanQualType OMPArraySectionTy;
@@ -967,9 +1011,6 @@
   /// \brief Retrieve the declaration for the 128-bit unsigned integer type.
   TypedefDecl *getUInt128Decl() const;
 
-  /// \brief Retrieve the declaration for a 128-bit float stub type.
-  TypeDecl *getFloat128StubType() const;
-
   //===--------------------------------------------------------------------===//
   //                           Type Constructors
   //===--------------------------------------------------------------------===//
@@ -1230,13 +1271,12 @@
                           TemplateTypeParmDecl *ParmDecl = nullptr) const;
 
   QualType getTemplateSpecializationType(TemplateName T,
-                                         const TemplateArgument *Args,
-                                         unsigned NumArgs,
+                                         ArrayRef<TemplateArgument> Args,
                                          QualType Canon = QualType()) const;
 
-  QualType getCanonicalTemplateSpecializationType(TemplateName T,
-                                                  const TemplateArgument *Args,
-                                                  unsigned NumArgs) const;
+  QualType
+  getCanonicalTemplateSpecializationType(TemplateName T,
+                                         ArrayRef<TemplateArgument> Args) const;
 
   QualType getTemplateSpecializationType(TemplateName T,
                                          const TemplateArgumentListInfo &Args,
@@ -1261,11 +1301,9 @@
                                                   NestedNameSpecifier *NNS,
                                                   const IdentifierInfo *Name,
                                     const TemplateArgumentListInfo &Args) const;
-  QualType getDependentTemplateSpecializationType(ElaboratedTypeKeyword Keyword,
-                                                  NestedNameSpecifier *NNS,
-                                                  const IdentifierInfo *Name,
-                                                  unsigned NumArgs,
-                                            const TemplateArgument *Args) const;
+  QualType getDependentTemplateSpecializationType(
+      ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS,
+      const IdentifierInfo *Name, ArrayRef<TemplateArgument> Args) const;
 
   QualType getPackExpansionType(QualType Pattern,
                                 Optional<unsigned> NumExpansions);
@@ -1461,12 +1499,25 @@
     return NSCopyingName;
   }
 
+  /// Retrieve the identifier 'bool'.
+  IdentifierInfo *getBoolName() const {
+    if (!BoolName)
+      BoolName = &Idents.get("bool");
+    return BoolName;
+  }
+
   IdentifierInfo *getMakeIntegerSeqName() const {
     if (!MakeIntegerSeqName)
       MakeIntegerSeqName = &Idents.get("__make_integer_seq");
     return MakeIntegerSeqName;
   }
 
+  IdentifierInfo *getTypePackElementName() const {
+    if (!TypePackElementName)
+      TypePackElementName = &Idents.get("__type_pack_element");
+    return TypePackElementName;
+  }
+
   /// \brief Retrieve the Objective-C "instancetype" type, if already known;
   /// otherwise, returns a NULL type;
   QualType getObjCInstanceType() {
@@ -2511,7 +2562,21 @@
   /// \brief Returns true if this is an inline-initialized static data member
   /// which is treated as a definition for MSVC compatibility.
   bool isMSStaticDataMemberInlineDefinition(const VarDecl *VD) const;
-  
+
+  enum class InlineVariableDefinitionKind {
+    None,        ///< Not an inline variable.
+    Weak,        ///< Weak definition of inline variable.
+    WeakUnknown, ///< Weak for now, might become strong later in this TU.
+    Strong       ///< Strong definition.
+  };
+  /// \brief Determine whether a definition of this inline variable should
+  /// be treated as a weak or strong definition. For compatibility with
+  /// C++14 and before, for a constexpr static data member, if there is an
+  /// out-of-line declaration of the member, we may promote it from weak to
+  /// strong.
+  InlineVariableDefinitionKind
+  getInlineVariableDefinitionKind(const VarDecl *VD) const;
+
 private:
   const ASTRecordLayout &
   getObjCLayout(const ObjCInterfaceDecl *D,
diff --git a/include/clang/AST/ASTMutationListener.h b/include/clang/AST/ASTMutationListener.h
index cf3b55d..a8eff1a 100644
--- a/include/clang/AST/ASTMutationListener.h
+++ b/include/clang/AST/ASTMutationListener.h
@@ -17,10 +17,12 @@
   class Attr;
   class ClassTemplateDecl;
   class ClassTemplateSpecializationDecl;
+  class ConstructorUsingShadowDecl;
   class CXXDestructorDecl;
   class CXXRecordDecl;
   class Decl;
   class DeclContext;
+  class FieldDecl;
   class FunctionDecl;
   class FunctionTemplateDecl;
   class Module;
@@ -92,6 +94,9 @@
   /// \brief A default argument was instantiated.
   virtual void DefaultArgumentInstantiated(const ParmVarDecl *D) {}
 
+  /// \brief A default member initializer was instantiated.
+  virtual void DefaultMemberInitializerInstantiated(const FieldDecl *D) {}
+
   /// \brief A new objc category class was added for an interface.
   virtual void AddedObjCCategoryToInterface(const ObjCCategoryDecl *CatD,
                                             const ObjCInterfaceDecl *IFD) {}
@@ -107,6 +112,14 @@
   /// \param D the declaration marked OpenMP threadprivate.
   virtual void DeclarationMarkedOpenMPThreadPrivate(const Decl *D) {}
 
+  /// \brief A declaration is marked as OpenMP declaretarget which was not
+  /// previously marked as declaretarget.
+  ///
+  /// \param D the declaration marked OpenMP declaretarget.
+  /// \param Attr the added attribute.
+  virtual void DeclarationMarkedOpenMPDeclareTarget(const Decl *D,
+                                                    const Attr *Attr) {}
+
   /// \brief A definition has been made visible by being redefined locally.
   ///
   /// \param D The definition that was previously not visible.
diff --git a/include/clang/AST/ASTTypeTraits.h b/include/clang/AST/ASTTypeTraits.h
index dcaac80..51d60a9 100644
--- a/include/clang/AST/ASTTypeTraits.h
+++ b/include/clang/AST/ASTTypeTraits.h
@@ -62,7 +62,9 @@
   /// \}
 
   /// \brief Returns \c true if \c this and \c Other represent the same kind.
-  bool isSame(ASTNodeKind Other) const;
+  bool isSame(ASTNodeKind Other) const {
+    return KindId != NKI_None && KindId == Other.KindId;
+  }
 
   /// \brief Returns \c true only for the default \c ASTNodeKind()
   bool isNone() const { return KindId == NKI_None; }
@@ -119,6 +121,7 @@
   enum NodeKindId {
     NKI_None,
     NKI_TemplateArgument,
+    NKI_TemplateName,
     NKI_NestedNameSpecifierLoc,
     NKI_QualType,
     NKI_TypeLoc,
@@ -173,6 +176,7 @@
   };
 KIND_TO_KIND_ID(CXXCtorInitializer)
 KIND_TO_KIND_ID(TemplateArgument)
+KIND_TO_KIND_ID(TemplateName)
 KIND_TO_KIND_ID(NestedNameSpecifier)
 KIND_TO_KIND_ID(NestedNameSpecifierLoc)
 KIND_TO_KIND_ID(QualType)
@@ -470,6 +474,10 @@
 
 template <>
 struct DynTypedNode::BaseConverter<
+    TemplateName, void> : public ValueConverter<TemplateName> {};
+
+template <>
+struct DynTypedNode::BaseConverter<
     NestedNameSpecifierLoc,
     void> : public ValueConverter<NestedNameSpecifierLoc> {};
 
diff --git a/include/clang/AST/ASTVector.h b/include/clang/AST/ASTVector.h
index 79453bf..dd9e7fe 100644
--- a/include/clang/AST/ASTVector.h
+++ b/include/clang/AST/ASTVector.h
@@ -20,7 +20,6 @@
 
 #include "clang/AST/AttrIterator.h"
 #include "llvm/ADT/PointerIntPair.h"
-#include "llvm/Support/Allocator.h"
 #include "llvm/Support/type_traits.h"
 #include <algorithm>
 #include <cstring>
diff --git a/include/clang/AST/Attr.h b/include/clang/AST/Attr.h
index 4d864ed..85caf9d 100644
--- a/include/clang/AST/Attr.h
+++ b/include/clang/AST/Attr.h
@@ -20,11 +20,10 @@
 #include "clang/AST/Type.h"
 #include "clang/Basic/AttrKinds.h"
 #include "clang/Basic/LLVM.h"
+#include "clang/Basic/OpenMPKinds.h"
 #include "clang/Basic/Sanitizers.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/VersionTuple.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -50,11 +49,11 @@
   /// An index into the spelling list of an
   /// attribute defined in Attr.td file.
   unsigned SpellingListIndex : 4;
-  bool Inherited : 1;
-  bool IsPackExpansion : 1;
-  bool Implicit : 1;
-  bool IsLateParsed : 1;
-  bool DuplicatesAllowed : 1;
+  unsigned Inherited : 1;
+  unsigned IsPackExpansion : 1;
+  unsigned Implicit : 1;
+  unsigned IsLateParsed : 1;
+  unsigned DuplicatesAllowed : 1;
 
   void *operator new(size_t bytes) LLVM_NOEXCEPT {
     llvm_unreachable("Attrs cannot be allocated with regular 'new'.");
@@ -118,6 +117,19 @@
   bool duplicatesAllowed() const { return DuplicatesAllowed; }
 };
 
+class StmtAttr : public Attr {
+protected:
+  StmtAttr(attr::Kind AK, SourceRange R, unsigned SpellingListIndex,
+                  bool IsLateParsed, bool DuplicatesAllowed)
+      : Attr(AK, R, SpellingListIndex, IsLateParsed, DuplicatesAllowed) {}
+
+public:
+  static bool classof(const Attr *A) {
+    return A->getKind() >= attr::FirstStmtAttr &&
+           A->getKind() <= attr::LastStmtAttr;
+  }
+};
+
 class InheritableAttr : public Attr {
 protected:
   InheritableAttr(attr::Kind AK, SourceRange R, unsigned SpellingListIndex,
diff --git a/include/clang/AST/AttrIterator.h b/include/clang/AST/AttrIterator.h
index a0c8030..fb9b049 100644
--- a/include/clang/AST/AttrIterator.h
+++ b/include/clang/AST/AttrIterator.h
@@ -39,8 +39,7 @@
 namespace clang {
 
 /// AttrVec - A vector of Attr, which is how they are stored on the AST.
-typedef SmallVector<Attr*, 2> AttrVec;
-typedef SmallVector<const Attr*, 2> ConstAttrVec;
+typedef SmallVector<Attr *, 4> AttrVec;
 
 /// specific_attr_iterator - Iterates over a subrange of an AttrVec, only
 /// providing attributes that are of a specific type.
diff --git a/include/clang/AST/Availability.h b/include/clang/AST/Availability.h
new file mode 100644
index 0000000..5ed8313
--- /dev/null
+++ b/include/clang/AST/Availability.h
@@ -0,0 +1,63 @@
+//===--- Availability.h - Classes for availability --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some classes that implement availability checking.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_AVAILABILITY_H
+#define LLVM_CLANG_AST_AVAILABILITY_H
+
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/VersionTuple.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace clang {
+
+/// \brief One specifier in an @available expression.
+///
+/// \code
+///   @available(macos 10.10, *)
+/// \endcode
+///
+/// Here, 'macos 10.10' and '*' both map to an instance of this type.
+///
+class AvailabilitySpec {
+  /// Represents the version that this specifier requires. If the host OS
+  /// version is greater than or equal to Version, the @available will evaluate
+  /// to true.
+  VersionTuple Version;
+
+  /// Name of the platform that Version corresponds to.
+  StringRef Platform;
+
+  SourceLocation BeginLoc, EndLoc;
+
+public:
+  AvailabilitySpec(VersionTuple Version, StringRef Platform,
+                   SourceLocation BeginLoc, SourceLocation EndLoc)
+      : Version(Version), Platform(Platform), BeginLoc(BeginLoc),
+        EndLoc(EndLoc) {}
+
+  /// This constructor is used when representing the '*' case.
+  AvailabilitySpec(SourceLocation StarLoc)
+      : BeginLoc(StarLoc), EndLoc(StarLoc) {}
+
+  VersionTuple getVersion() const { return Version; }
+  StringRef getPlatform() const { return Platform; }
+  SourceLocation getBeginLoc() const { return BeginLoc; }
+  SourceLocation getEndLoc() const { return EndLoc; }
+
+  /// Returns true when this represents the '*' case.
+  bool isOtherPlatformSpec() const { return Version.empty(); }
+};
+
+} // end namespace clang
+
+#endif
diff --git a/include/clang/AST/BaseSubobject.h b/include/clang/AST/BaseSubobject.h
index da538e3..66af023 100644
--- a/include/clang/AST/BaseSubobject.h
+++ b/include/clang/AST/BaseSubobject.h
@@ -15,13 +15,12 @@
 #define LLVM_CLANG_AST_BASESUBOBJECT_H
 
 #include "clang/AST/CharUnits.h"
+#include "clang/AST/DeclCXX.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/type_traits.h"
 
 namespace clang {
-  class CXXRecordDecl;
-
 // BaseSubobject - Uniquely identifies a direct or indirect base class. 
 // Stores both the base class decl and the offset from the most derived class to
 // the base class. Used for vtable and VTT generation.
diff --git a/include/clang/AST/BuiltinTypes.def b/include/clang/AST/BuiltinTypes.def
index a08a683..c0c6819 100644
--- a/include/clang/AST/BuiltinTypes.def
+++ b/include/clang/AST/BuiltinTypes.def
@@ -133,6 +133,9 @@
 // 'long double'
 FLOATING_TYPE(LongDouble, LongDoubleTy)
 
+// '__float128'
+FLOATING_TYPE(Float128, Float128Ty)
+
 //===- Language-specific types --------------------------------------------===//
 
 // This is the type of C++0x 'nullptr'.
@@ -154,20 +157,6 @@
 // type is a typedef of a PointerType to this.
 BUILTIN_TYPE(ObjCSel, ObjCBuiltinSelTy)
 
-// OpenCL image types.
-BUILTIN_TYPE(OCLImage1d, OCLImage1dTy)
-BUILTIN_TYPE(OCLImage1dArray, OCLImage1dArrayTy)
-BUILTIN_TYPE(OCLImage1dBuffer, OCLImage1dBufferTy)
-BUILTIN_TYPE(OCLImage2d, OCLImage2dTy)
-BUILTIN_TYPE(OCLImage2dArray, OCLImage2dArrayTy)
-BUILTIN_TYPE(OCLImage2dDepth, OCLImage2dDepthTy)
-BUILTIN_TYPE(OCLImage2dArrayDepth, OCLImage2dArrayDepthTy)
-BUILTIN_TYPE(OCLImage2dMSAA, OCLImage2dMSAATy)
-BUILTIN_TYPE(OCLImage2dArrayMSAA, OCLImage2dArrayMSAATy)
-BUILTIN_TYPE(OCLImage2dMSAADepth, OCLImage2dMSAADepthTy)
-BUILTIN_TYPE(OCLImage2dArrayMSAADepth, OCLImage2dArrayMSAADepthTy)
-BUILTIN_TYPE(OCLImage3d, OCLImage3dTy)
-
 // OpenCL sampler_t.
 BUILTIN_TYPE(OCLSampler, OCLSamplerTy)
 
diff --git a/include/clang/AST/CXXInheritance.h b/include/clang/AST/CXXInheritance.h
index 8587260..3cf058f 100644
--- a/include/clang/AST/CXXInheritance.h
+++ b/include/clang/AST/CXXInheritance.h
@@ -16,7 +16,6 @@
 
 #include "clang/AST/DeclBase.h"
 #include "clang/AST/DeclCXX.h"
-#include "clang/AST/DeclarationName.h"
 #include "clang/AST/Type.h"
 #include "clang/AST/TypeOrdering.h"
 #include "llvm/ADT/MapVector.h"
@@ -24,7 +23,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include <cassert>
 #include <list>
-#include <map>
 
 namespace clang {
   
@@ -174,7 +172,7 @@
   /// paths for a derived-to-base search.
   explicit CXXBasePaths(bool FindAmbiguities = true, bool RecordPaths = true,
                         bool DetectVirtual = true)
-      : FindAmbiguities(FindAmbiguities), RecordPaths(RecordPaths),
+      : Origin(), FindAmbiguities(FindAmbiguities), RecordPaths(RecordPaths),
         DetectVirtual(DetectVirtual), DetectedVirtual(nullptr),
         NumDeclsFound(0) {}
 
diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h
index f190b93..5bb0758 100644
--- a/include/clang/AST/CommentLexer.h
+++ b/include/clang/AST/CommentLexer.h
@@ -17,7 +17,6 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/SourceManager.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/include/clang/AST/Decl.h b/include/clang/AST/Decl.h
index 73e004b..77bbdb2 100644
--- a/include/clang/AST/Decl.h
+++ b/include/clang/AST/Decl.h
@@ -23,6 +23,7 @@
 #include "clang/Basic/Linkage.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/OperatorKinds.h"
+#include "clang/Basic/PragmaKinds.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/Compiler.h"
@@ -103,6 +104,73 @@
   }
 };
 
+/// \brief Represents a `#pragma comment` line. Always a child of
+/// TranslationUnitDecl.
+class PragmaCommentDecl final
+    : public Decl,
+      private llvm::TrailingObjects<PragmaCommentDecl, char> {
+  virtual void anchor();
+
+  PragmaMSCommentKind CommentKind;
+
+  friend TrailingObjects;
+  friend class ASTDeclReader;
+  friend class ASTDeclWriter;
+
+  PragmaCommentDecl(TranslationUnitDecl *TU, SourceLocation CommentLoc,
+                    PragmaMSCommentKind CommentKind)
+      : Decl(PragmaComment, TU, CommentLoc), CommentKind(CommentKind) {}
+
+public:
+  static PragmaCommentDecl *Create(const ASTContext &C, TranslationUnitDecl *DC,
+                                   SourceLocation CommentLoc,
+                                   PragmaMSCommentKind CommentKind,
+                                   StringRef Arg);
+  static PragmaCommentDecl *CreateDeserialized(ASTContext &C, unsigned ID,
+                                               unsigned ArgSize);
+
+  PragmaMSCommentKind getCommentKind() const { return CommentKind; }
+
+  StringRef getArg() const { return getTrailingObjects<char>(); }
+
+  // Implement isa/cast/dyncast/etc.
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == PragmaComment; }
+};
+
+/// \brief Represents a `#pragma detect_mismatch` line. Always a child of
+/// TranslationUnitDecl.
+class PragmaDetectMismatchDecl final
+    : public Decl,
+      private llvm::TrailingObjects<PragmaDetectMismatchDecl, char> {
+  virtual void anchor();
+
+  size_t ValueStart;
+
+  friend TrailingObjects;
+  friend class ASTDeclReader;
+  friend class ASTDeclWriter;
+
+  PragmaDetectMismatchDecl(TranslationUnitDecl *TU, SourceLocation Loc,
+                           size_t ValueStart)
+      : Decl(PragmaDetectMismatch, TU, Loc), ValueStart(ValueStart) {}
+
+public:
+  static PragmaDetectMismatchDecl *Create(const ASTContext &C,
+                                          TranslationUnitDecl *DC,
+                                          SourceLocation Loc, StringRef Name,
+                                          StringRef Value);
+  static PragmaDetectMismatchDecl *
+  CreateDeserialized(ASTContext &C, unsigned ID, unsigned NameValueSize);
+
+  StringRef getName() const { return getTrailingObjects<char>(); }
+  StringRef getValue() const { return getTrailingObjects<char>() + ValueStart; }
+
+  // Implement isa/cast/dyncast/etc.
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == PragmaDetectMismatch; }
+};
+
 /// \brief Declaration context for names declared as extern "C" in C++. This
 /// is neither the semantic nor lexical context for such declarations, but is
 /// used to check for conflicts with other extern "C" declarations. Example:
@@ -183,7 +251,7 @@
   // FIXME: Deprecated, move clients to getName().
   std::string getNameAsString() const { return Name.getAsString(); }
 
-  void printName(raw_ostream &os) const { os << Name; }
+  virtual void printName(raw_ostream &os) const;
 
   /// getDeclName - Get the actual, stored name of the declaration,
   /// which may be a special name.
@@ -319,6 +387,7 @@
   NamedDecl *getUnderlyingDecl() {
     // Fast-path the common case.
     if (this->getKind() != UsingShadow &&
+        this->getKind() != ConstructorUsingShadow &&
         this->getKind() != ObjCCompatibleAlias &&
         this->getKind() != NamespaceAlias)
       return this;
@@ -720,7 +789,7 @@
 
 protected:
   // A pointer union of Stmt * and EvaluatedStmt *. When an EvaluatedStmt, we
-  // have allocated the auxilliary struct of information there.
+  // have allocated the auxiliary struct of information there.
   //
   // TODO: It is a bit unfortunate to use a PointerUnion inside the VarDecl for
   // this as *many* VarDecls are ParmVarDecls that don't have default
@@ -813,12 +882,15 @@
     /// variable;  see isARCPseudoStrong() for details.
     unsigned ARCPseudoStrong : 1;
 
+    /// \brief Whether this variable is (C++1z) inline.
+    unsigned IsInline : 1;
+
+    /// \brief Whether this variable has (C++1z) inline explicitly specified.
+    unsigned IsInlineSpecified : 1;
+
     /// \brief Whether this variable is (C++0x) constexpr.
     unsigned IsConstexpr : 1;
 
-    /// \brief Whether this variable is a (C++ Concepts TS) concept.
-    unsigned IsConcept : 1;
-
     /// \brief Whether this variable is the implicit variable for a lambda
     /// init-capture.
     unsigned IsInitCapture : 1;
@@ -953,7 +1025,7 @@
   ///   void foo() { int x; static int y; extern int z; }
   ///
   bool isLocalVarDecl() const {
-    if (getKind() != Decl::Var)
+    if (getKind() != Decl::Var && getKind() != Decl::Decomposition)
       return false;
     if (const DeclContext *DC = getLexicalDeclContext())
       return DC->getRedeclContext()->isFunctionOrMethod();
@@ -968,7 +1040,7 @@
   /// isFunctionOrMethodVarDecl - Similar to isLocalVarDecl, but
   /// excludes variables declared in blocks.
   bool isFunctionOrMethodVarDecl() const {
-    if (getKind() != Decl::Var)
+    if (getKind() != Decl::Var && getKind() != Decl::Decomposition)
       return false;
     const DeclContext *DC = getLexicalDeclContext()->getRedeclContext();
     return DC->isFunctionOrMethod() && DC->getDeclKind() != Decl::Block;
@@ -1037,9 +1109,6 @@
   /// definition of a static data member.
   bool isOutOfLine() const override;
 
-  /// \brief If this is a static data member, find its out-of-line definition.
-  VarDecl *getOutOfLineDefinition();
-
   /// isFileVarDecl - Returns true for file scoped variable declaration.
   bool isFileVarDecl() const {
     Kind K = getKind();
@@ -1185,6 +1254,24 @@
     NonParmVarDeclBits.ARCPseudoStrong = ps;
   }
 
+  /// Whether this variable is (C++1z) inline.
+  bool isInline() const {
+    return isa<ParmVarDecl>(this) ? false : NonParmVarDeclBits.IsInline;
+  }
+  bool isInlineSpecified() const {
+    return isa<ParmVarDecl>(this) ? false
+                                  : NonParmVarDeclBits.IsInlineSpecified;
+  }
+  void setInlineSpecified() {
+    assert(!isa<ParmVarDecl>(this));
+    NonParmVarDeclBits.IsInline = true;
+    NonParmVarDeclBits.IsInlineSpecified = true;
+  }
+  void setImplicitlyInline() {
+    assert(!isa<ParmVarDecl>(this));
+    NonParmVarDeclBits.IsInline = true;
+  }
+
   /// Whether this variable is (C++11) constexpr.
   bool isConstexpr() const {
     return isa<ParmVarDecl>(this) ? false : NonParmVarDeclBits.IsConstexpr;
@@ -1194,15 +1281,6 @@
     NonParmVarDeclBits.IsConstexpr = IC;
   }
 
-  /// Whether this variable is (C++ Concepts TS) concept.
-  bool isConcept() const {
-    return isa<ParmVarDecl>(this) ? false : NonParmVarDeclBits.IsConcept;
-  }
-  void setConcept(bool IC) {
-    assert(!isa<ParmVarDecl>(this));
-    NonParmVarDeclBits.IsConcept = IC;
-  }
-
   /// Whether this variable is the implicit variable for a lambda init-capture.
   bool isInitCapture() const {
     return isa<ParmVarDecl>(this) ? false : NonParmVarDeclBits.IsInitCapture;
@@ -1702,6 +1780,17 @@
     return isDefined(Definition);
   }
 
+  /// \brief Get the definition for this declaration, or null if it has none.
+  FunctionDecl *getDefinition() {
+    const FunctionDecl *Definition;
+    if (isDefined(Definition))
+      return const_cast<FunctionDecl *>(Definition);
+    return nullptr;
+  }
+  const FunctionDecl *getDefinition() const {
+    return const_cast<FunctionDecl *>(this)->getDefinition();
+  }
+
   /// getBody - Retrieve the body (definition) of the function. The
   /// function body might be in any of the (re-)declarations of this
   /// function. The variant that accepts a FunctionDecl pointer will
@@ -1896,28 +1985,23 @@
 
   unsigned getBuiltinID() const;
 
+  // ArrayRef interface to parameters.
+  ArrayRef<ParmVarDecl *> parameters() const {
+    return {ParamInfo, getNumParams()};
+  }
+  MutableArrayRef<ParmVarDecl *> parameters() {
+    return {ParamInfo, getNumParams()};
+  }
+
   // Iterator access to formal parameters.
-  unsigned param_size() const { return getNumParams(); }
-  typedef ParmVarDecl **param_iterator;
-  typedef ParmVarDecl * const *param_const_iterator;
-  typedef llvm::iterator_range<param_iterator> param_range;
-  typedef llvm::iterator_range<param_const_iterator> param_const_range;
-
-  param_iterator param_begin() { return param_iterator(ParamInfo); }
-  param_iterator param_end() {
-    return param_iterator(ParamInfo + param_size());
-  }
-  param_range params() { return param_range(param_begin(), param_end()); }
-
-  param_const_iterator param_begin() const {
-    return param_const_iterator(ParamInfo);
-  }
-  param_const_iterator param_end() const {
-    return param_const_iterator(ParamInfo + param_size());
-  }
-  param_const_range params() const {
-    return param_const_range(param_begin(), param_end());
-  }
+  typedef MutableArrayRef<ParmVarDecl *>::iterator param_iterator;
+  typedef ArrayRef<ParmVarDecl *>::const_iterator param_const_iterator;
+  bool param_empty() const { return parameters().empty(); }
+  param_iterator param_begin() { return parameters().begin(); }
+  param_iterator param_end() { return parameters().end(); }
+  param_const_iterator param_begin() const { return parameters().begin(); }
+  param_const_iterator param_end() const { return parameters().end(); }
+  size_t param_size() const { return parameters().size(); }
 
   /// getNumParams - Return the number of parameters this function must have
   /// based on its FunctionType.  This is the length of the ParamInfo array
@@ -1936,12 +2020,6 @@
     setParams(getASTContext(), NewParamInfo);
   }
 
-  // ArrayRef iterface to parameters.
-  // FIXME: Should one day replace iterator interface.
-  ArrayRef<ParmVarDecl*> parameters() const {
-    return llvm::makeArrayRef(ParamInfo, getNumParams());
-  }
-
   ArrayRef<NamedDecl *> getDeclsInPrototypeScope() const {
     return DeclsInPrototypeScope;
   }
@@ -1969,12 +2047,16 @@
     return getType()->getAs<FunctionType>()->getCallResultType(getASTContext());
   }
 
+  /// \brief Returns the WarnUnusedResultAttr declared on this function or on
+  /// its return type declaration, or null if no such attribute applies.
+  const Attr *getUnusedResultAttr() const;
+
   /// \brief Returns true if this function or its return type has the
   /// warn_unused_result attribute. If the return type has the attribute and
   /// this function is a method of the return type's class, then false will be
   /// returned to avoid spurious warnings on member methods such as assignment
   /// operators.
-  bool hasUnusedResultAttr() const;
+  bool hasUnusedResultAttr() const { return getUnusedResultAttr() != nullptr; }
 
   /// \brief Returns the storage class as written in the source. For the
   /// computed linkage of symbol, see getLinkage.
@@ -2189,6 +2271,14 @@
   /// returns 0.
   unsigned getMemoryFunctionKind() const;
 
+  /// Add a diagnostic to be emitted if and when this function is codegen'ed.
+  void addDeferredDiag(PartialDiagnosticAt PD);
+
+  /// Gets this object's list of deferred diagnostics, if there are any.
+  ///
+  /// Although this is logically const, it clears our list of deferred diags.
+  std::vector<PartialDiagnosticAt> takeDeferredDiags() const;
+
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
   static bool classofKind(Kind K) {
@@ -2210,7 +2300,7 @@
 /// represent a member of a struct/union/class.
 class FieldDecl : public DeclaratorDecl, public Mergeable<FieldDecl> {
   // FIXME: This can be packed into the bitfields in Decl.
-  bool Mutable : 1;
+  unsigned Mutable : 1;
   mutable unsigned CachedFieldIndex : 31;
 
   /// The kinds of value we can store in InitializerOrBitWidth.
@@ -2444,34 +2534,33 @@
 
   IndirectFieldDecl(ASTContext &C, DeclContext *DC, SourceLocation L,
                     DeclarationName N, QualType T,
-                    NamedDecl **CH, unsigned CHS);
+                    MutableArrayRef<NamedDecl *> CH);
 
 public:
   static IndirectFieldDecl *Create(ASTContext &C, DeclContext *DC,
                                    SourceLocation L, IdentifierInfo *Id,
-                                   QualType T, NamedDecl **CH, unsigned CHS);
+                                   QualType T, llvm::MutableArrayRef<NamedDecl *> CH);
 
   static IndirectFieldDecl *CreateDeserialized(ASTContext &C, unsigned ID);
-  
-  typedef NamedDecl * const *chain_iterator;
-  typedef llvm::iterator_range<chain_iterator> chain_range;
 
-  chain_range chain() const { return chain_range(chain_begin(), chain_end()); }
-  chain_iterator chain_begin() const { return chain_iterator(Chaining); }
-  chain_iterator chain_end() const {
-    return chain_iterator(Chaining + ChainingSize);
+  typedef ArrayRef<NamedDecl *>::const_iterator chain_iterator;
+
+  ArrayRef<NamedDecl *> chain() const {
+    return llvm::makeArrayRef(Chaining, ChainingSize);
   }
+  chain_iterator chain_begin() const { return chain().begin(); }
+  chain_iterator chain_end() const { return chain().end(); }
 
   unsigned getChainingSize() const { return ChainingSize; }
 
   FieldDecl *getAnonField() const {
-    assert(ChainingSize >= 2);
-    return cast<FieldDecl>(Chaining[ChainingSize - 1]);
+    assert(chain().size() >= 2);
+    return cast<FieldDecl>(chain().back());
   }
 
   VarDecl *getVarDecl() const {
-    assert(ChainingSize >= 2);
-    return dyn_cast<VarDecl>(*chain_begin());
+    assert(chain().size() >= 2);
+    return dyn_cast<VarDecl>(chain().front());
   }
 
   IndirectFieldDecl *getCanonicalDecl() override { return getFirstDecl(); }
@@ -2657,20 +2746,20 @@
   /// IsCompleteDefinition - True if this is a definition ("struct foo
   /// {};"), false if it is a declaration ("struct foo;").  It is not
   /// a definition until the definition has been fully processed.
-  bool IsCompleteDefinition : 1;
+  unsigned IsCompleteDefinition : 1;
 
 protected:
   /// IsBeingDefined - True if this is currently being defined.
-  bool IsBeingDefined : 1;
+  unsigned IsBeingDefined : 1;
 
 private:
   /// IsEmbeddedInDeclarator - True if this tag declaration is
   /// "embedded" (i.e., defined or declared for the very first time)
   /// in the syntax of a declarator.
-  bool IsEmbeddedInDeclarator : 1;
+  unsigned IsEmbeddedInDeclarator : 1;
 
   /// \brief True if this tag is free standing, e.g. "struct foo;".
-  bool IsFreeStanding : 1;
+  unsigned IsFreeStanding : 1;
 
 protected:
   // These are used by (and only defined for) EnumDecl.
@@ -2679,26 +2768,26 @@
 
   /// IsScoped - True if this tag declaration is a scoped enumeration. Only
   /// possible in C++11 mode.
-  bool IsScoped : 1;
+  unsigned IsScoped : 1;
   /// IsScopedUsingClassTag - If this tag declaration is a scoped enum,
   /// then this is true if the scoped enum was declared using the class
   /// tag, false if it was declared with the struct tag. No meaning is
   /// associated if this tag declaration is not a scoped enum.
-  bool IsScopedUsingClassTag : 1;
+  unsigned IsScopedUsingClassTag : 1;
 
   /// IsFixed - True if this is an enumeration with fixed underlying type. Only
   /// possible in C++11, Microsoft extensions, or Objective C mode.
-  bool IsFixed : 1;
+  unsigned IsFixed : 1;
 
   /// \brief Indicates whether it is possible for declarations of this kind
   /// to have an out-of-date definition.
   ///
   /// This option is only enabled when modules are enabled.
-  bool MayHaveOutOfDateDef : 1;
+  unsigned MayHaveOutOfDateDef : 1;
 
   /// Has the full definition of this type been required by a use somewhere in
   /// the TU.
-  bool IsCompleteDefinitionRequired : 1;
+  unsigned IsCompleteDefinitionRequired : 1;
 private:
   SourceRange BraceRange;
 
@@ -3124,6 +3213,10 @@
     return isCompleteDefinition() || isFixed();
   }
 
+  /// \brief Retrieve the enum definition from which this enumeration could
+  /// be instantiated, if it is an instantiation (rather than a non-template).
+  EnumDecl *getTemplateInstantiationPattern() const;
+
   /// \brief Returns the enumeration (declared within the template)
   /// from which this enumeration type was instantiated, or NULL if
   /// this enumeration was not instantiated from any template.
@@ -3454,35 +3547,23 @@
   void setSignatureAsWritten(TypeSourceInfo *Sig) { SignatureAsWritten = Sig; }
   TypeSourceInfo *getSignatureAsWritten() const { return SignatureAsWritten; }
 
-  // Iterator access to formal parameters.
-  unsigned param_size() const { return getNumParams(); }
-  typedef ParmVarDecl **param_iterator;
-  typedef ParmVarDecl * const *param_const_iterator;
-  typedef llvm::iterator_range<param_iterator> param_range;
-  typedef llvm::iterator_range<param_const_iterator> param_const_range;
-
   // ArrayRef access to formal parameters.
-  // FIXME: Should eventual replace iterator access.
-  ArrayRef<ParmVarDecl*> parameters() const {
-    return llvm::makeArrayRef(ParamInfo, param_size());
+  ArrayRef<ParmVarDecl *> parameters() const {
+    return {ParamInfo, getNumParams()};
+  }
+  MutableArrayRef<ParmVarDecl *> parameters() {
+    return {ParamInfo, getNumParams()};
   }
 
-  bool param_empty() const { return NumParams == 0; }
-  param_range params() { return param_range(param_begin(), param_end()); }
-  param_iterator param_begin() { return param_iterator(ParamInfo); }
-  param_iterator param_end() {
-    return param_iterator(ParamInfo + param_size());
-  }
-
-  param_const_range params() const {
-    return param_const_range(param_begin(), param_end());
-  }
-  param_const_iterator param_begin() const {
-    return param_const_iterator(ParamInfo);
-  }
-  param_const_iterator param_end() const {
-    return param_const_iterator(ParamInfo + param_size());
-  }
+  // Iterator access to formal parameters.
+  typedef MutableArrayRef<ParmVarDecl *>::iterator param_iterator;
+  typedef ArrayRef<ParmVarDecl *>::const_iterator param_const_iterator;
+  bool param_empty() const { return parameters().empty(); }
+  param_iterator param_begin() { return parameters().begin(); }
+  param_iterator param_end() { return parameters().end(); }
+  param_const_iterator param_begin() const { return parameters().begin(); }
+  param_const_iterator param_end() const { return parameters().end(); }
+  size_t param_size() const { return parameters().size(); }
 
   unsigned getNumParams() const { return NumParams; }
   const ParmVarDecl *getParamDecl(unsigned i) const {
@@ -3503,22 +3584,12 @@
   /// Does not include an entry for 'this'.
   unsigned getNumCaptures() const { return NumCaptures; }
 
-  typedef const Capture *capture_iterator;
-  typedef const Capture *capture_const_iterator;
-  typedef llvm::iterator_range<capture_iterator> capture_range;
-  typedef llvm::iterator_range<capture_const_iterator> capture_const_range;
+  typedef ArrayRef<Capture>::const_iterator capture_const_iterator;
 
-  capture_range captures() {
-    return capture_range(capture_begin(), capture_end());
-  }
-  capture_const_range captures() const {
-    return capture_const_range(capture_begin(), capture_end());
-  }
+  ArrayRef<Capture> captures() const { return {Captures, NumCaptures}; }
 
-  capture_iterator capture_begin() { return Captures; }
-  capture_iterator capture_end() { return Captures + NumCaptures; }
-  capture_const_iterator capture_begin() const { return Captures; }
-  capture_const_iterator capture_end() const { return Captures + NumCaptures; }
+  capture_const_iterator capture_begin() const { return captures().begin(); }
+  capture_const_iterator capture_end() const { return captures().end(); }
 
   bool capturesCXXThis() const { return CapturesCXXThis; }
   bool blockMissingReturnType() const { return BlockMissingReturnType; }
@@ -3609,6 +3680,14 @@
     getParams()[i] = P;
   }
 
+  // ArrayRef interface to parameters.
+  ArrayRef<ImplicitParamDecl *> parameters() const {
+    return {getParams(), getNumParams()};
+  }
+  MutableArrayRef<ImplicitParamDecl *> parameters() {
+    return {getParams(), getNumParams()};
+  }
+
   /// \brief Retrieve the parameter containing captured variables.
   ImplicitParamDecl *getContextParam() const {
     assert(ContextParam < NumParams);
@@ -3629,9 +3708,6 @@
   /// \brief Retrieve an iterator one past the last parameter decl.
   param_iterator param_end() const { return getParams() + NumParams; }
 
-  /// \brief Retrieve an iterator range for the parameter declarations.
-  param_range params() const { return param_range(param_begin(), param_end()); }
-
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
   static bool classofKind(Kind K) { return K == Captured; }
diff --git a/include/clang/AST/DeclBase.h b/include/clang/AST/DeclBase.h
index 184b92e..4f3c9b3 100644
--- a/include/clang/AST/DeclBase.h
+++ b/include/clang/AST/DeclBase.h
@@ -53,6 +53,7 @@
 class RecordDecl;
 class Stmt;
 class StoredDeclsMap;
+class TemplateDecl;
 class TranslationUnitDecl;
 class UsingDirectiveDecl;
 }
@@ -73,13 +74,10 @@
 ///
 /// Note: There are objects tacked on before the *beginning* of Decl
 /// (and its subclasses) in its Decl::operator new(). Proper alignment
-/// of all subclasses (not requiring more than DeclObjAlignment) is
+/// of all subclasses (not requiring more than the alignment of Decl) is
 /// asserted in DeclBase.cpp.
-class Decl {
+class LLVM_ALIGNAS(/*alignof(uint64_t)*/ 8) Decl {
 public:
-  /// \brief Alignment guaranteed when allocating Decl and any subtypes.
-  enum { DeclObjAlignment = llvm::AlignOf<uint64_t>::Alignment };
-
   /// \brief Lists the kind of concrete classes of Decl.
   enum Kind {
 #define DECL(DERIVED, BASE) DERIVED,
@@ -167,7 +165,10 @@
     /// has been declared outside any function. These act mostly like
     /// invisible friend declarations, but are also visible to unqualified
     /// lookup within the scope of the declaring function.
-    IDNS_LocalExtern         = 0x0800
+    IDNS_LocalExtern         = 0x0800,
+
+    /// This declaration is an OpenMP user-defined reduction construction.
+    IDNS_OMPReduction        = 0x1000
   };
 
   /// ObjCDeclQualifier - 'Qualifiers' written next to the return and
@@ -257,7 +258,7 @@
   SourceLocation Loc;
 
   /// DeclKind - This indicates which class this is.
-  unsigned DeclKind : 8;
+  unsigned DeclKind : 7;
 
   /// InvalidDecl - This indicates a semantic error occurred.
   unsigned InvalidDecl :  1;
@@ -297,7 +298,7 @@
   unsigned Hidden : 1;
   
   /// IdentifierNamespace - This specifies what IDNS_* namespace this lives in.
-  unsigned IdentifierNamespace : 12;
+  unsigned IdentifierNamespace : 13;
 
   /// \brief If 0, we have not computed the linkage of this declaration.
   /// Otherwise, it is the linkage + 1.
@@ -515,8 +516,8 @@
   bool isImplicit() const { return Implicit; }
   void setImplicit(bool I = true) { Implicit = I; }
 
-  /// \brief Whether this declaration was used, meaning that a definition
-  /// is required.
+  /// \brief Whether *any* (re-)declaration of the entity was used, meaning that
+  /// a definition is required.
   ///
   /// \param CheckUsedAttr When true, also consider the "used" attribute
   /// (in addition to the "used" bit set by \c setUsed()) when determining
@@ -526,7 +527,8 @@
   /// \brief Set whether the declaration is used, in the sense of odr-use.
   ///
   /// This should only be used immediately after creating a declaration.
-  void setIsUsed() { Used = true; }
+  /// It intentionally doesn't notify any listeners.
+  void setIsUsed() { getCanonicalDecl()->Used = true; }
 
   /// \brief Mark the declaration used, in the sense of odr-use.
   ///
@@ -565,6 +567,13 @@
     return NextInContextAndBits.getInt() & ModulePrivateFlag;
   }
 
+  /// Return true if this declaration has an attribute which acts as
+  /// a definition of the entity, such as 'alias' or 'ifunc'.
+  bool hasDefiningAttr() const;
+
+  /// Return this declaration's defining attribute if it has one.
+  const Attr *getDefiningAttr() const;
+
 protected:
   /// \brief Specify whether this declaration was marked as being private
   /// to the module in which it was defined.
@@ -595,11 +604,12 @@
   /// AR_Available, will be set to a (possibly empty) message
   /// describing why the declaration has not been introduced, is
   /// deprecated, or is unavailable.
-  /// \param Version The version of the target OS to determine availability for.
-  /// If \c None, uses the version specified in the ASTContext's target info.
+  ///
+  /// \param EnclosingVersion The version to compare with. If empty, assume the
+  /// deployment target version.
   AvailabilityResult
   getAvailability(std::string *Message = nullptr,
-                  Optional<VersionTuple> Version = None) const;
+                  VersionTuple EnclosingVersion = VersionTuple()) const;
 
   /// \brief Determine whether this declaration is marked 'deprecated'.
   ///
@@ -900,6 +910,10 @@
            DeclKind == FunctionTemplate;
   }
 
+  /// \brief If this is a declaration that describes some template, this
+  /// method returns that template declaration.
+  TemplateDecl *getDescribedTemplate() const;
+
   /// \brief Returns the function itself, or the templated function if this is a
   /// function template.
   FunctionDecl *getAsFunction() LLVM_READONLY;
@@ -1122,6 +1136,7 @@
 ///   ObjCContainerDecl
 ///   LinkageSpecDecl
 ///   BlockDecl
+///   OMPDeclareReductionDecl
 ///
 class DeclContext {
   /// DeclKind - This indicates which class this is.
diff --git a/include/clang/AST/DeclCXX.h b/include/clang/AST/DeclCXX.h
index 7c54901..2071c1b 100644
--- a/include/clang/AST/DeclCXX.h
+++ b/include/clang/AST/DeclCXX.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_CLANG_AST_DECLCXX_H
 #define LLVM_CLANG_AST_DECLCXX_H
 
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTUnresolvedSet.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
@@ -29,6 +30,7 @@
 
 class ClassTemplateDecl;
 class ClassTemplateSpecializationDecl;
+class ConstructorUsingShadowDecl;
 class CXXBasePath;
 class CXXBasePaths;
 class CXXConstructorDecl;
@@ -137,7 +139,6 @@
   static bool classofKind(Kind K) { return K == AccessSpec; }
 };
 
-
 /// \brief Represents a base class of a C++ class.
 ///
 /// Each CXXBaseSpecifier represents a single, direct base class (or
@@ -165,13 +166,13 @@
   SourceLocation EllipsisLoc;
 
   /// \brief Whether this is a virtual base class or not.
-  bool Virtual : 1;
+  unsigned Virtual : 1;
 
   /// \brief Whether this is the base of a class (true) or of a struct (false).
   ///
   /// This determines the mapping from the access specifier as written in the
   /// source code to the access specifier used for semantic analysis.
-  bool BaseOfClass : 1;
+  unsigned BaseOfClass : 1;
 
   /// \brief Access specifier as written in the source code (may be AS_none).
   ///
@@ -181,7 +182,7 @@
 
   /// \brief Whether the class contains a using declaration
   /// to inherit the named class's constructors.
-  bool InheritConstructors : 1;
+  unsigned InheritConstructors : 1;
 
   /// \brief The type of the base class.
   ///
@@ -257,30 +258,6 @@
   TypeSourceInfo *getTypeSourceInfo() const { return BaseTypeInfo; }
 };
 
-/// \brief A lazy pointer to the definition data for a declaration.
-/// FIXME: This is a little CXXRecordDecl-specific that the moment.
-template<typename Decl, typename T> class LazyDefinitionDataPtr {
-  llvm::PointerUnion<T *, Decl *> DataOrCanonicalDecl;
-
-  LazyDefinitionDataPtr update() {
-    if (Decl *Canon = DataOrCanonicalDecl.template dyn_cast<Decl*>()) {
-      if (Canon->isCanonicalDecl())
-        Canon->getMostRecentDecl();
-      else
-        // Declaration isn't canonical any more;
-        // update it and perform path compression.
-        *this = Canon->getPreviousDecl()->DefinitionData.update();
-    }
-    return *this;
-  }
-
-public:
-  LazyDefinitionDataPtr(Decl *Canon) : DataOrCanonicalDecl(Canon) {}
-  LazyDefinitionDataPtr(T *Data) : DataOrCanonicalDecl(Data) {}
-  T *getNotUpdated() { return DataOrCanonicalDecl.template dyn_cast<T*>(); }
-  T *get() { return update().getNotUpdated(); }
-};
-
 /// \brief Represents a C++ struct/union/class.
 class CXXRecordDecl : public RecordDecl {
 
@@ -301,30 +278,30 @@
     DefinitionData(CXXRecordDecl *D);
 
     /// \brief True if this class has any user-declared constructors.
-    bool UserDeclaredConstructor : 1;
+    unsigned UserDeclaredConstructor : 1;
 
     /// \brief The user-declared special members which this class has.
     unsigned UserDeclaredSpecialMembers : 6;
 
     /// \brief True when this class is an aggregate.
-    bool Aggregate : 1;
+    unsigned Aggregate : 1;
 
     /// \brief True when this class is a POD-type.
-    bool PlainOldData : 1;
+    unsigned PlainOldData : 1;
 
     /// true when this class is empty for traits purposes,
     /// i.e. has no data members other than 0-width bit-fields, has no
     /// virtual function/base, and doesn't inherit from a non-empty
     /// class. Doesn't take union-ness into account.
-    bool Empty : 1;
+    unsigned Empty : 1;
 
     /// \brief True when this class is polymorphic, i.e., has at
     /// least one virtual member or derives from a polymorphic class.
-    bool Polymorphic : 1;
+    unsigned Polymorphic : 1;
 
     /// \brief True when this class is abstract, i.e., has at least
     /// one pure virtual function, (that can come from a base class).
-    bool Abstract : 1;
+    unsigned Abstract : 1;
 
     /// \brief True when this class has standard layout.
     ///
@@ -340,58 +317,70 @@
     ///   classes with non-static data members, and
     /// * has no base classes of the same type as the first non-static data
     ///   member.
-    bool IsStandardLayout : 1;
+    unsigned IsStandardLayout : 1;
 
     /// \brief True when there are no non-empty base classes.
     ///
     /// This is a helper bit of state used to implement IsStandardLayout more
     /// efficiently.
-    bool HasNoNonEmptyBases : 1;
+    unsigned HasNoNonEmptyBases : 1;
 
     /// \brief True when there are private non-static data members.
-    bool HasPrivateFields : 1;
+    unsigned HasPrivateFields : 1;
 
     /// \brief True when there are protected non-static data members.
-    bool HasProtectedFields : 1;
+    unsigned HasProtectedFields : 1;
 
     /// \brief True when there are private non-static data members.
-    bool HasPublicFields : 1;
+    unsigned HasPublicFields : 1;
 
     /// \brief True if this class (or any subobject) has mutable fields.
-    bool HasMutableFields : 1;
+    unsigned HasMutableFields : 1;
 
     /// \brief True if this class (or any nested anonymous struct or union)
     /// has variant members.
-    bool HasVariantMembers : 1;
+    unsigned HasVariantMembers : 1;
 
     /// \brief True if there no non-field members declared by the user.
-    bool HasOnlyCMembers : 1;
+    unsigned HasOnlyCMembers : 1;
 
     /// \brief True if any field has an in-class initializer, including those
     /// within anonymous unions or structs.
-    bool HasInClassInitializer : 1;
+    unsigned HasInClassInitializer : 1;
 
     /// \brief True if any field is of reference type, and does not have an
     /// in-class initializer.
     ///
     /// In this case, value-initialization of this class is illegal in C++98
     /// even if the class has a trivial default constructor.
-    bool HasUninitializedReferenceMember : 1;
+    unsigned HasUninitializedReferenceMember : 1;
+
+    /// \brief True if any non-mutable field whose type doesn't have a user-
+    /// provided default ctor also doesn't have an in-class initializer.
+    unsigned HasUninitializedFields : 1;
+
+    /// \brief True if there are any member using-declarations that inherit
+    /// constructors from a base class.
+    unsigned HasInheritedConstructor : 1;
+
+    /// \brief True if there are any member using-declarations named
+    /// 'operator='.
+    unsigned HasInheritedAssignment : 1;
 
     /// \brief These flags are \c true if a defaulted corresponding special
     /// member can't be fully analyzed without performing overload resolution.
     /// @{
-    bool NeedOverloadResolutionForMoveConstructor : 1;
-    bool NeedOverloadResolutionForMoveAssignment : 1;
-    bool NeedOverloadResolutionForDestructor : 1;
+    unsigned NeedOverloadResolutionForMoveConstructor : 1;
+    unsigned NeedOverloadResolutionForMoveAssignment : 1;
+    unsigned NeedOverloadResolutionForDestructor : 1;
     /// @}
 
     /// \brief These flags are \c true if an implicit defaulted corresponding
     /// special member would be defined as deleted.
     /// @{
-    bool DefaultedMoveConstructorIsDeleted : 1;
-    bool DefaultedMoveAssignmentIsDeleted : 1;
-    bool DefaultedDestructorIsDeleted : 1;
+    unsigned DefaultedMoveConstructorIsDeleted : 1;
+    unsigned DefaultedMoveAssignmentIsDeleted : 1;
+    unsigned DefaultedDestructorIsDeleted : 1;
     /// @}
 
     /// \brief The trivial special members which this class has, per
@@ -411,33 +400,37 @@
     unsigned DeclaredNonTrivialSpecialMembers : 6;
 
     /// \brief True when this class has a destructor with no semantic effect.
-    bool HasIrrelevantDestructor : 1;
+    unsigned HasIrrelevantDestructor : 1;
 
     /// \brief True when this class has at least one user-declared constexpr
     /// constructor which is neither the copy nor move constructor.
-    bool HasConstexprNonCopyMoveConstructor : 1;
+    unsigned HasConstexprNonCopyMoveConstructor : 1;
+
+    /// \brief True if this class has a (possibly implicit) defaulted default
+    /// constructor.
+    unsigned HasDefaultedDefaultConstructor : 1;
 
     /// \brief True if a defaulted default constructor for this class would
     /// be constexpr.
-    bool DefaultedDefaultConstructorIsConstexpr : 1;
+    unsigned DefaultedDefaultConstructorIsConstexpr : 1;
 
     /// \brief True if this class has a constexpr default constructor.
     ///
     /// This is true for either a user-declared constexpr default constructor
     /// or an implicitly declared constexpr default constructor.
-    bool HasConstexprDefaultConstructor : 1;
+    unsigned HasConstexprDefaultConstructor : 1;
 
     /// \brief True when this class contains at least one non-static data
     /// member or base class of non-literal or volatile type.
-    bool HasNonLiteralTypeFieldsOrBases : 1;
+    unsigned HasNonLiteralTypeFieldsOrBases : 1;
 
     /// \brief True when visible conversion functions are already computed
     /// and are available.
-    bool ComputedVisibleConversions : 1;
+    unsigned ComputedVisibleConversions : 1;
 
     /// \brief Whether we have a C++11 user-provided default constructor (not
     /// explicitly deleted or defaulted).
-    bool UserProvidedDefaultConstructor : 1;
+    unsigned UserProvidedDefaultConstructor : 1;
 
     /// \brief The special members which have been declared for this class,
     /// either by the user or implicitly.
@@ -445,25 +438,25 @@
 
     /// \brief Whether an implicit copy constructor would have a const-qualified
     /// parameter.
-    bool ImplicitCopyConstructorHasConstParam : 1;
+    unsigned ImplicitCopyConstructorHasConstParam : 1;
 
     /// \brief Whether an implicit copy assignment operator would have a
     /// const-qualified parameter.
-    bool ImplicitCopyAssignmentHasConstParam : 1;
+    unsigned ImplicitCopyAssignmentHasConstParam : 1;
 
     /// \brief Whether any declared copy constructor has a const-qualified
     /// parameter.
-    bool HasDeclaredCopyConstructorWithConstParam : 1;
+    unsigned HasDeclaredCopyConstructorWithConstParam : 1;
 
     /// \brief Whether any declared copy assignment operator has either a
     /// const-qualified reference parameter or a non-reference parameter.
-    bool HasDeclaredCopyAssignmentWithConstParam : 1;
+    unsigned HasDeclaredCopyAssignmentWithConstParam : 1;
 
     /// \brief Whether this class describes a C++ lambda.
-    bool IsLambda : 1;
+    unsigned IsLambda : 1;
 
     /// \brief Whether we are currently parsing base specifiers.
-    bool IsParsingBaseSpecifiers : 1;
+    unsigned IsParsingBaseSpecifiers : 1;
 
     /// \brief The number of base class specifiers in Bases.
     unsigned NumBases;
@@ -515,16 +508,19 @@
       return getVBasesSlowCase();
     }
 
+    ArrayRef<CXXBaseSpecifier> bases() const {
+      return llvm::makeArrayRef(getBases(), NumBases);
+    }
+    ArrayRef<CXXBaseSpecifier> vbases() const {
+      return llvm::makeArrayRef(getVBases(), NumVBases);
+    }
+
   private:
     CXXBaseSpecifier *getBasesSlowCase() const;
     CXXBaseSpecifier *getVBasesSlowCase() const;
   };
 
-  typedef LazyDefinitionDataPtr<CXXRecordDecl, struct DefinitionData>
-      DefinitionDataPtr;
-  friend class LazyDefinitionDataPtr<CXXRecordDecl, struct DefinitionData>;
-
-  mutable DefinitionDataPtr DefinitionData;
+  struct DefinitionData *DefinitionData;
 
   /// \brief Describes a C++ closure type (generated by a lambda expression).
   struct LambdaDefinitionData : public DefinitionData {
@@ -539,11 +535,10 @@
         MethodTyInfo(Info) {
       IsLambda = true;
 
-      // C++11 [expr.prim.lambda]p3:
-      //   This class type is neither an aggregate nor a literal type.
+      // C++1z [expr.prim.lambda]p4:
+      //   This class type is not an aggregate type.
       Aggregate = false;
       PlainOldData = false;
-      HasNonLiteralTypeFieldsOrBases = true;
     }
 
     /// \brief Whether this lambda is known to be dependent, even if its
@@ -587,8 +582,14 @@
        
   };
 
+  struct DefinitionData *dataPtr() const {
+    // Complete the redecl chain (if necessary).
+    getMostRecentDecl();
+    return DefinitionData;
+  }
+
   struct DefinitionData &data() const {
-    auto *DD = DefinitionData.get();
+    auto *DD = dataPtr();
     assert(DD && "queried property of class with no definition");
     return *DD;
   }
@@ -596,7 +597,7 @@
   struct LambdaDefinitionData &getLambdaData() const {
     // No update required: a merged definition cannot change any lambda
     // properties.
-    auto *DD = DefinitionData.getNotUpdated();
+    auto *DD = DefinitionData;
     assert(DD && DD->IsLambda && "queried lambda property of non-lambda class");
     return static_cast<LambdaDefinitionData&>(*DD);
   }
@@ -673,11 +674,13 @@
   }
 
   CXXRecordDecl *getDefinition() const {
-    auto *DD = DefinitionData.get();
+    // We only need an update if we don't already know which
+    // declaration is the definition.
+    auto *DD = DefinitionData ? DefinitionData : dataPtr();
     return DD ? DD->Definition : nullptr;
   }
 
-  bool hasDefinition() const { return DefinitionData.get(); }
+  bool hasDefinition() const { return DefinitionData || dataPtr(); }
 
   static CXXRecordDecl *Create(const ASTContext &C, TagKind TK, DeclContext *DC,
                                SourceLocation StartLoc, SourceLocation IdLoc,
@@ -1021,7 +1024,7 @@
   /// \brief Determine whether this class describes a lambda function object.
   bool isLambda() const {
     // An update record can't turn a non-lambda into a lambda.
-    auto *DD = DefinitionData.getNotUpdated();
+    auto *DD = DefinitionData;
     return DD && DD->IsLambda;
   }
 
@@ -1136,13 +1139,20 @@
   /// \brief Determine whether this is an empty class in the sense of
   /// (C++11 [meta.unary.prop]).
   ///
-  /// A non-union class is empty iff it has a virtual function, virtual base,
-  /// data member (other than 0-width bit-field) or inherits from a non-empty
-  /// class.
+  /// The CXXRecordDecl is a class type, but not a union type,
+  /// with no non-static data members other than bit-fields of length 0,
+  /// no virtual member functions, no virtual base classes,
+  /// and no base class B for which is_empty<B>::value is false.
   ///
   /// \note This does NOT include a check for union-ness.
   bool isEmpty() const { return data().Empty; }
 
+  /// \brief Determine whether this class has direct non-static data members.
+  bool hasDirectFields() const {
+    auto &D = data();
+    return D.HasPublicFields || D.HasProtectedFields || D.HasPrivateFields;
+  }
+
   /// Whether this class is polymorphic (C++ [class.virtual]),
   /// which means that the class contains or inherits a virtual function.
   bool isPolymorphic() const { return data().Polymorphic; }
@@ -1270,6 +1280,14 @@
     return !(data().HasTrivialSpecialMembers & SMF_Destructor);
   }
 
+  /// \brief Determine whether declaring a const variable with this type is ok
+  /// per core issue 253.
+  bool allowConstDefaultInit() const {
+    return !data().HasUninitializedFields ||
+           !(data().HasDefaultedDefaultConstructor ||
+             needsImplicitDefaultConstructor());
+  }
+
   /// \brief Determine whether this class has a destructor which has no
   /// semantic effect.
   ///
@@ -1285,6 +1303,18 @@
     return data().HasNonLiteralTypeFieldsOrBases;
   }
 
+  /// \brief Determine whether this class has a using-declaration that names
+  /// a user-declared base class constructor.
+  bool hasInheritedConstructor() const {
+    return data().HasInheritedConstructor;
+  }
+
+  /// \brief Determine whether this class has a using-declaration that names
+  /// a base class assignment operator.
+  bool hasInheritedAssignment() const {
+    return data().HasInheritedAssignment;
+  }
+
   /// \brief Determine whether this class is considered trivially copyable per
   /// (C++11 [class]p6).
   bool isTriviallyCopyable() const;
@@ -1313,11 +1343,15 @@
   ///
   /// We resolve DR1361 by ignoring the second bullet. We resolve DR1452 by
   /// treating types with trivial default constructors as literal types.
+  ///
+  /// Lambdas are literal types only in C++1z and beyond.
   bool isLiteral() const {
     return hasTrivialDestructor() &&
-           (isAggregate() || hasConstexprNonCopyMoveConstructor() ||
-            hasTrivialDefaultConstructor()) &&
-           !hasNonLiteralTypeFieldsOrBases();
+           (!isLambda() || getASTContext().getLangOpts().CPlusPlus1z) &&
+           !hasNonLiteralTypeFieldsOrBases() &&
+           (isAggregate() || isLambda() ||
+            hasConstexprNonCopyMoveConstructor() ||
+            hasTrivialDefaultConstructor());
   }
 
   /// \brief If this record is an instantiation of a member class,
@@ -1555,6 +1589,14 @@
                                  CXXBasePath &Path, DeclarationName Name);
 
   /// \brief Base-class lookup callback that determines whether there exists
+  /// an OpenMP declare reduction member with the given name.
+  ///
+  /// This callback can be used with \c lookupInBases() to find members
+  /// of the given name within a C++ class hierarchy.
+  static bool FindOMPReductionMember(const CXXBaseSpecifier *Specifier,
+                                     CXXBasePath &Path, DeclarationName Name);
+
+  /// \brief Base-class lookup callback that determines whether there exists
   /// a member with the given name that can be used in a nested-name-specifier.
   ///
   /// This callback can be used with \c lookupInBases() to find members of
@@ -1690,6 +1732,7 @@
 
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
+  friend class ASTRecordWriter;
   friend class ASTReader;
   friend class ASTWriter;
 };
@@ -1795,6 +1838,8 @@
   method_iterator begin_overridden_methods() const;
   method_iterator end_overridden_methods() const;
   unsigned size_overridden_methods() const;
+  typedef ASTContext::overridden_method_range overridden_method_range;
+  overridden_method_range overridden_methods() const;
 
   /// Returns the parent of this method declaration, which
   /// is the class in which this method is defined.
@@ -1910,15 +1955,15 @@
 
   /// \brief If the initializee is a type, whether that type makes this
   /// a delegating initialization.
-  bool IsDelegating : 1;
+  unsigned IsDelegating : 1;
 
   /// \brief If the initializer is a base initializer, this keeps track
   /// of whether the base is virtual or not.
-  bool IsVirtual : 1;
+  unsigned IsVirtual : 1;
 
   /// \brief Whether or not the initializer is explicitly written
   /// in the sources.
-  bool IsWritten : 1;
+  unsigned IsWritten : 1;
 
   /// If IsWritten is true, then this number keeps track of the textual order
   /// of this initializer in the original sources, counting from 0; otherwise,
@@ -2109,8 +2154,7 @@
     assert(I < getNumArrayIndices() && "Out of bounds member array index");
     getTrailingObjects<VarDecl *>()[I] = Index;
   }
-  ArrayRef<VarDecl *> getArrayIndexes() {
-    assert(getNumArrayIndices() != 0 && "Getting indexes for non-array init");
+  ArrayRef<VarDecl *> getArrayIndices() {
     return llvm::makeArrayRef(getTrailingObjects<VarDecl *>(),
                               getNumArrayIndices());
   }
@@ -2121,6 +2165,23 @@
   friend TrailingObjects;
 };
 
+/// Description of a constructor that was inherited from a base class.
+class InheritedConstructor {
+  ConstructorUsingShadowDecl *Shadow;
+  CXXConstructorDecl *BaseCtor;
+
+public:
+  InheritedConstructor() : Shadow(), BaseCtor() {}
+  InheritedConstructor(ConstructorUsingShadowDecl *Shadow,
+                       CXXConstructorDecl *BaseCtor)
+      : Shadow(Shadow), BaseCtor(BaseCtor) {}
+
+  explicit operator bool() const { return Shadow; }
+
+  ConstructorUsingShadowDecl *getShadowDecl() const { return Shadow; }
+  CXXConstructorDecl *getConstructor() const { return BaseCtor; }
+};
+
 /// \brief Represents a C++ constructor within a class.
 ///
 /// For example:
@@ -2131,40 +2192,51 @@
 ///   explicit X(int); // represented by a CXXConstructorDecl.
 /// };
 /// \endcode
-class CXXConstructorDecl : public CXXMethodDecl {
+class CXXConstructorDecl final
+    : public CXXMethodDecl,
+      private llvm::TrailingObjects<CXXConstructorDecl, InheritedConstructor> {
   void anchor() override;
-  /// \brief Whether this constructor declaration has the \c explicit keyword
-  /// specified.
-  bool IsExplicitSpecified : 1;
 
   /// \name Support for base and member initializers.
   /// \{
   /// \brief The arguments used to initialize the base or member.
   LazyCXXCtorInitializersPtr CtorInitializers;
-  unsigned NumCtorInitializers;
+  unsigned NumCtorInitializers : 30;
   /// \}
 
+  /// \brief Whether this constructor declaration has the \c explicit keyword
+  /// specified.
+  unsigned IsExplicitSpecified : 1;
+
+  /// \brief Whether this constructor declaration is an implicitly-declared
+  /// inheriting constructor.
+  unsigned IsInheritingConstructor : 1;
+
   CXXConstructorDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc,
                      const DeclarationNameInfo &NameInfo,
                      QualType T, TypeSourceInfo *TInfo,
                      bool isExplicitSpecified, bool isInline,
-                     bool isImplicitlyDeclared, bool isConstexpr)
+                     bool isImplicitlyDeclared, bool isConstexpr,
+                     InheritedConstructor Inherited)
     : CXXMethodDecl(CXXConstructor, C, RD, StartLoc, NameInfo, T, TInfo,
                     SC_None, isInline, isConstexpr, SourceLocation()),
-      IsExplicitSpecified(isExplicitSpecified), CtorInitializers(nullptr),
-      NumCtorInitializers(0) {
+      CtorInitializers(nullptr), NumCtorInitializers(0),
+      IsExplicitSpecified(isExplicitSpecified),
+      IsInheritingConstructor((bool)Inherited) {
     setImplicit(isImplicitlyDeclared);
+    if (Inherited)
+      *getTrailingObjects<InheritedConstructor>() = Inherited;
   }
 
 public:
-  static CXXConstructorDecl *CreateDeserialized(ASTContext &C, unsigned ID);
-  static CXXConstructorDecl *Create(ASTContext &C, CXXRecordDecl *RD,
-                                    SourceLocation StartLoc,
-                                    const DeclarationNameInfo &NameInfo,
-                                    QualType T, TypeSourceInfo *TInfo,
-                                    bool isExplicit,
-                                    bool isInline, bool isImplicitlyDeclared,
-                                    bool isConstexpr);
+  static CXXConstructorDecl *CreateDeserialized(ASTContext &C, unsigned ID,
+                                                bool InheritsConstructor);
+  static CXXConstructorDecl *
+  Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc,
+         const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo,
+         bool isExplicit, bool isInline, bool isImplicitlyDeclared,
+         bool isConstexpr,
+         InheritedConstructor Inherited = InheritedConstructor());
 
   /// \brief Determine whether this constructor declaration has the
   /// \c explicit keyword specified.
@@ -2311,11 +2383,15 @@
   /// an object.
   bool isSpecializationCopyingObject() const;
 
-  /// \brief Get the constructor that this inheriting constructor is based on.
-  const CXXConstructorDecl *getInheritedConstructor() const;
+  /// \brief Determine whether this is an implicit constructor synthesized to
+  /// model a call to a constructor inherited from a base class.
+  bool isInheritingConstructor() const { return IsInheritingConstructor; }
 
-  /// \brief Set the constructor that this inheriting constructor is based on.
-  void setInheritedConstructor(const CXXConstructorDecl *BaseCtor);
+  /// \brief Get the constructor that this inheriting constructor is based on.
+  InheritedConstructor getInheritedConstructor() const {
+    return IsInheritingConstructor ? *getTrailingObjects<InheritedConstructor>()
+                                   : InheritedConstructor();
+  }
 
   CXXConstructorDecl *getCanonicalDecl() override {
     return cast<CXXConstructorDecl>(FunctionDecl::getCanonicalDecl());
@@ -2330,6 +2406,7 @@
 
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
+  friend TrailingObjects;
 };
 
 /// \brief Represents a C++ destructor within a class.
@@ -2774,18 +2851,6 @@
   NamedDecl *UsingOrNextShadow;
   friend class UsingDecl;
 
-  UsingShadowDecl(ASTContext &C, DeclContext *DC, SourceLocation Loc,
-                  UsingDecl *Using, NamedDecl *Target)
-    : NamedDecl(UsingShadow, DC, Loc, DeclarationName()),
-      redeclarable_base(C), Underlying(Target),
-      UsingOrNextShadow(reinterpret_cast<NamedDecl *>(Using)) {
-    if (Target) {
-      setDeclName(Target->getDeclName());
-      IdentifierNamespace = Target->getIdentifierNamespace();
-    }
-    setImplicit();
-  }
-
   typedef Redeclarable<UsingShadowDecl> redeclarable_base;
   UsingShadowDecl *getNextRedeclarationImpl() override {
     return getNextRedeclaration();
@@ -2797,11 +2862,16 @@
     return getMostRecentDecl();
   }
 
+protected:
+  UsingShadowDecl(Kind K, ASTContext &C, DeclContext *DC, SourceLocation Loc,
+                  UsingDecl *Using, NamedDecl *Target);
+  UsingShadowDecl(Kind K, ASTContext &C, EmptyShell);
+
 public:
   static UsingShadowDecl *Create(ASTContext &C, DeclContext *DC,
                                  SourceLocation Loc, UsingDecl *Using,
                                  NamedDecl *Target) {
-    return new (C, DC) UsingShadowDecl(C, DC, Loc, Using, Target);
+    return new (C, DC) UsingShadowDecl(UsingShadow, C, DC, Loc, Using, Target);
   }
 
   static UsingShadowDecl *CreateDeserialized(ASTContext &C, unsigned ID);
@@ -2813,6 +2883,7 @@
   using redeclarable_base::redecls;
   using redeclarable_base::getPreviousDecl;
   using redeclarable_base::getMostRecentDecl;
+  using redeclarable_base::isFirstDecl;
 
   UsingShadowDecl *getCanonicalDecl() override {
     return getFirstDecl();
@@ -2843,7 +2914,127 @@
   }
 
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
-  static bool classofKind(Kind K) { return K == Decl::UsingShadow; }
+  static bool classofKind(Kind K) {
+    return K == Decl::UsingShadow || K == Decl::ConstructorUsingShadow;
+  }
+
+  friend class ASTDeclReader;
+  friend class ASTDeclWriter;
+};
+
+/// \brief Represents a shadow constructor declaration introduced into a
+/// class by a C++11 using-declaration that names a constructor.
+///
+/// For example:
+/// \code
+/// struct Base { Base(int); };
+/// struct Derived {
+///    using Base::Base; // creates a UsingDecl and a ConstructorUsingShadowDecl
+/// };
+/// \endcode
+class ConstructorUsingShadowDecl final : public UsingShadowDecl {
+  void anchor() override;
+
+  /// \brief If this constructor using declaration inherited the constructor
+  /// from an indirect base class, this is the ConstructorUsingShadowDecl
+  /// in the named direct base class from which the declaration was inherited.
+  ConstructorUsingShadowDecl *NominatedBaseClassShadowDecl;
+
+  /// \brief If this constructor using declaration inherited the constructor
+  /// from an indirect base class, this is the ConstructorUsingShadowDecl
+  /// that will be used to construct the unique direct or virtual base class
+  /// that receives the constructor arguments.
+  ConstructorUsingShadowDecl *ConstructedBaseClassShadowDecl;
+
+  /// \brief \c true if the constructor ultimately named by this using shadow
+  /// declaration is within a virtual base class subobject of the class that
+  /// contains this declaration.
+  unsigned IsVirtual : 1;
+
+  ConstructorUsingShadowDecl(ASTContext &C, DeclContext *DC, SourceLocation Loc,
+                             UsingDecl *Using, NamedDecl *Target,
+                             bool TargetInVirtualBase)
+      : UsingShadowDecl(ConstructorUsingShadow, C, DC, Loc, Using,
+                        Target->getUnderlyingDecl()),
+        NominatedBaseClassShadowDecl(
+            dyn_cast<ConstructorUsingShadowDecl>(Target)),
+        ConstructedBaseClassShadowDecl(NominatedBaseClassShadowDecl),
+        IsVirtual(TargetInVirtualBase) {
+    // If we found a constructor for a non-virtual base class, but it chains to
+    // a constructor for a virtual base, we should directly call the virtual
+    // base constructor instead.
+    // FIXME: This logic belongs in Sema.
+    if (!TargetInVirtualBase && NominatedBaseClassShadowDecl &&
+        NominatedBaseClassShadowDecl->constructsVirtualBase()) {
+      ConstructedBaseClassShadowDecl =
+          NominatedBaseClassShadowDecl->ConstructedBaseClassShadowDecl;
+      IsVirtual = true;
+    }
+  }
+  ConstructorUsingShadowDecl(ASTContext &C, EmptyShell Empty)
+      : UsingShadowDecl(ConstructorUsingShadow, C, Empty),
+        NominatedBaseClassShadowDecl(), ConstructedBaseClassShadowDecl(),
+        IsVirtual(false) {}
+
+public:
+  static ConstructorUsingShadowDecl *Create(ASTContext &C, DeclContext *DC,
+                                            SourceLocation Loc,
+                                            UsingDecl *Using, NamedDecl *Target,
+                                            bool IsVirtual);
+  static ConstructorUsingShadowDecl *CreateDeserialized(ASTContext &C,
+                                                        unsigned ID);
+
+  /// Returns the parent of this using shadow declaration, which
+  /// is the class in which this is declared.
+  //@{
+  const CXXRecordDecl *getParent() const {
+    return cast<CXXRecordDecl>(getDeclContext());
+  }
+  CXXRecordDecl *getParent() {
+    return cast<CXXRecordDecl>(getDeclContext());
+  }
+  //@}
+
+  /// \brief Get the inheriting constructor declaration for the direct base
+  /// class from which this using shadow declaration was inherited, if there is
+  /// one. This can be different for each redeclaration of the same shadow decl.
+  ConstructorUsingShadowDecl *getNominatedBaseClassShadowDecl() const {
+    return NominatedBaseClassShadowDecl;
+  }
+
+  /// \brief Get the inheriting constructor declaration for the base class
+  /// for which we don't have an explicit initializer, if there is one.
+  ConstructorUsingShadowDecl *getConstructedBaseClassShadowDecl() const {
+    return ConstructedBaseClassShadowDecl;
+  }
+
+  /// \brief Get the base class that was named in the using declaration. This
+  /// can be different for each redeclaration of this same shadow decl.
+  CXXRecordDecl *getNominatedBaseClass() const;
+
+  /// \brief Get the base class whose constructor or constructor shadow
+  /// declaration is passed the constructor arguments.
+  CXXRecordDecl *getConstructedBaseClass() const {
+    return cast<CXXRecordDecl>((ConstructedBaseClassShadowDecl
+                                    ? ConstructedBaseClassShadowDecl
+                                    : getTargetDecl())
+                                   ->getDeclContext());
+  }
+
+  /// \brief Returns \c true if the constructed base class is a virtual base
+  /// class subobject of this declaration's class.
+  bool constructsVirtualBase() const {
+    return IsVirtual;
+  }
+
+  /// \brief Get the constructor or constructor template in the derived class
+  /// corresponding to this using shadow declaration, if it has been implicitly
+  /// declared already.
+  CXXConstructorDecl *getConstructor() const;
+  void setConstructor(NamedDecl *Ctor);
+
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == ConstructorUsingShadow; }
 
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
@@ -3183,6 +3374,104 @@
   friend class ASTDeclReader;
 };
 
+/// A binding in a decomposition declaration. For instance, given:
+///
+///   int n[3];
+///   auto &[a, b, c] = n;
+///
+/// a, b, and c are BindingDecls, whose bindings are the expressions
+/// x[0], x[1], and x[2] respectively, where x is the implicit
+/// DecompositionDecl of type 'int (&)[3]'.
+class BindingDecl : public ValueDecl {
+  void anchor() override;
+
+  /// The binding represented by this declaration. References to this
+  /// declaration are effectively equivalent to this expression (except
+  /// that it is only evaluated once at the point of declaration of the
+  /// binding).
+  Expr *Binding;
+
+  BindingDecl(DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id)
+      : ValueDecl(Decl::Binding, DC, IdLoc, Id, QualType()), Binding(nullptr) {}
+
+public:
+  static BindingDecl *Create(ASTContext &C, DeclContext *DC,
+                             SourceLocation IdLoc, IdentifierInfo *Id);
+  static BindingDecl *CreateDeserialized(ASTContext &C, unsigned ID);
+
+  /// Get the expression to which this declaration is bound. This may be null
+  /// in two different cases: while parsing the initializer for the
+  /// decomposition declaration, and when the initializer is type-dependent.
+  Expr *getBinding() const { return Binding; }
+
+  /// Get the variable (if any) that holds the value of evaluating the binding.
+  /// Only present for user-defined bindings for tuple-like types.
+  VarDecl *getHoldingVar() const;
+
+  /// Set the binding for this BindingDecl, along with its declared type (which
+  /// should be a possibly-cv-qualified form of the type of the binding, or a
+  /// reference to such a type).
+  void setBinding(QualType DeclaredType, Expr *Binding) {
+    setType(DeclaredType);
+    this->Binding = Binding;
+  }
+
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == Decl::Binding; }
+
+  friend class ASTDeclReader;
+};
+
+/// A decomposition declaration. For instance, given:
+///
+///   int n[3];
+///   auto &[a, b, c] = n;
+///
+/// the second line declares a DecompositionDecl of type 'int (&)[3]', and
+/// three BindingDecls (named a, b, and c). An instance of this class is always
+/// unnamed, but behaves in almost all other respects like a VarDecl.
+class DecompositionDecl final
+    : public VarDecl,
+      private llvm::TrailingObjects<DecompositionDecl, BindingDecl *> {
+  void anchor() override;
+
+  /// The number of BindingDecl*s following this object.
+  unsigned NumBindings;
+
+  DecompositionDecl(ASTContext &C, DeclContext *DC, SourceLocation StartLoc,
+                    SourceLocation LSquareLoc, QualType T,
+                    TypeSourceInfo *TInfo, StorageClass SC,
+                    ArrayRef<BindingDecl *> Bindings)
+      : VarDecl(Decomposition, C, DC, StartLoc, LSquareLoc, nullptr, T, TInfo,
+                SC),
+        NumBindings(Bindings.size()) {
+    std::uninitialized_copy(Bindings.begin(), Bindings.end(),
+                            getTrailingObjects<BindingDecl *>());
+  }
+
+public:
+  static DecompositionDecl *Create(ASTContext &C, DeclContext *DC,
+                                   SourceLocation StartLoc,
+                                   SourceLocation LSquareLoc,
+                                   QualType T, TypeSourceInfo *TInfo,
+                                   StorageClass S,
+                                   ArrayRef<BindingDecl *> Bindings);
+  static DecompositionDecl *CreateDeserialized(ASTContext &C, unsigned ID,
+                                               unsigned NumBindings);
+
+  ArrayRef<BindingDecl *> bindings() const {
+    return llvm::makeArrayRef(getTrailingObjects<BindingDecl *>(), NumBindings);
+  }
+
+  void printName(raw_ostream &os) const override;
+
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == Decomposition; }
+
+  friend TrailingObjects;
+  friend class ASTDeclReader;
+};
+
 /// An instance of this class represents the declaration of a property
 /// member.  This is a Microsoft extension to C++, first introduced in
 /// Visual Studio .NET 2003 as a parallel to similar features in C#
diff --git a/include/clang/AST/DeclFriend.h b/include/clang/AST/DeclFriend.h
index 27b0388..5b2e2d9 100644
--- a/include/clang/AST/DeclFriend.h
+++ b/include/clang/AST/DeclFriend.h
@@ -57,7 +57,7 @@
   /// True if this 'friend' declaration is unsupported.  Eventually we
   /// will support every possible friend declaration, but for now we
   /// silently ignore some and set this flag to authorize all access.
-  bool UnsupportedFriend : 1;
+  unsigned UnsupportedFriend : 1;
 
   // The number of "outer" template parameter lists in non-templatic
   // (currently unsupported) friend type declarations, such as
diff --git a/include/clang/AST/DeclObjC.h b/include/clang/AST/DeclObjC.h
index f89717f..ad9b5a2 100644
--- a/include/clang/AST/DeclObjC.h
+++ b/include/clang/AST/DeclObjC.h
@@ -351,11 +351,6 @@
   typedef llvm::iterator_range<param_iterator> param_range;
   typedef llvm::iterator_range<param_const_iterator> param_const_range;
 
-  param_range params() { return param_range(param_begin(), param_end()); }
-  param_const_range params() const {
-    return param_const_range(param_begin(), param_end());
-  }
-
   param_const_iterator param_begin() const {
     return param_const_iterator(getParams());
   }
@@ -1129,15 +1124,15 @@
 
     /// \brief Indicates that the contents of this Objective-C class will be
     /// completed by the external AST source when required.
-    mutable bool ExternallyCompleted : 1;
+    mutable unsigned ExternallyCompleted : 1;
 
     /// \brief Indicates that the ivar cache does not yet include ivars
     /// declared in the implementation.
-    mutable bool IvarListMissingImplementation : 1;
+    mutable unsigned IvarListMissingImplementation : 1;
 
     /// Indicates that this interface decl contains at least one initializer
     /// marked with the 'objc_designated_initializer' attribute.
-    bool HasDesignatedInitializers : 1;
+    unsigned HasDesignatedInitializers : 1;
 
     enum InheritedDesignatedInitializersState {
       /// We didn't calculate whether the designated initializers should be
diff --git a/include/clang/AST/DeclOpenMP.h b/include/clang/AST/DeclOpenMP.h
index 598f418..1975bc5 100644
--- a/include/clang/AST/DeclOpenMP.h
+++ b/include/clang/AST/DeclOpenMP.h
@@ -15,11 +15,14 @@
 #ifndef LLVM_CLANG_AST_DECLOPENMP_H
 #define LLVM_CLANG_AST_DECLOPENMP_H
 
-#include "clang/AST/DeclBase.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExternalASTSource.h"
+#include "clang/AST/Type.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/TrailingObjects.h"
 
 namespace clang {
-class Expr;
 
 /// \brief This represents '#pragma omp threadprivate ...' directive.
 /// For example, in the following, both 'a' and 'A::b' are threadprivate:
@@ -86,6 +89,107 @@
   static bool classofKind(Kind K) { return K == OMPThreadPrivate; }
 };
 
-}  // end namespace clang
+/// \brief This represents the '#pragma omp declare reduction ...' directive.
+/// For example, the following declares a reduction 'foo' for the types 'int'
+/// and 'float':
+///
+/// \code
+/// #pragma omp declare reduction (foo : int,float : omp_out += omp_in) \
+///                     initializer (omp_priv = 0)
+/// \endcode
+///
+/// Here 'omp_out += omp_in' is a combiner and 'omp_priv = 0' is an initializer.
+class OMPDeclareReductionDecl final : public ValueDecl, public DeclContext {
+private:
+  friend class ASTDeclReader;
+  /// \brief Combiner expression of the construct (initially null).
+  Expr *Combiner;
+  /// \brief Initializer expression of the construct (initially null).
+  Expr *Initializer;
+  /// \brief Reference to the previous declare reduction construct in the same
+  /// scope with the same name. Required for proper template instantiation if
+  /// the declare reduction construct is declared inside a compound statement.
+  LazyDeclPtr PrevDeclInScope;
+
+  virtual void anchor();
+
+  OMPDeclareReductionDecl(Kind DK, DeclContext *DC, SourceLocation L,
+                          DeclarationName Name, QualType Ty,
+                          OMPDeclareReductionDecl *PrevDeclInScope)
+      : ValueDecl(DK, DC, L, Name, Ty), DeclContext(DK), Combiner(nullptr),
+        Initializer(nullptr), PrevDeclInScope(PrevDeclInScope) {}
+
+  void setPrevDeclInScope(OMPDeclareReductionDecl *Prev) {
+    PrevDeclInScope = Prev;
+  }
+
+public:
+  /// \brief Create a declare reduction node.
+  static OMPDeclareReductionDecl *
+  Create(ASTContext &C, DeclContext *DC, SourceLocation L, DeclarationName Name,
+         QualType T, OMPDeclareReductionDecl *PrevDeclInScope);
+  /// \brief Create a deserialized declare reduction node.
+  static OMPDeclareReductionDecl *CreateDeserialized(ASTContext &C,
+                                                     unsigned ID);
+
+  /// \brief Get the combiner expression of the declare reduction construct.
+  Expr *getCombiner() { return Combiner; }
+  const Expr *getCombiner() const { return Combiner; }
+  /// \brief Set the combiner expression for the declare reduction construct.
+  void setCombiner(Expr *E) { Combiner = E; }
+
+  /// \brief Get the initializer expression (if specified) of the declare
+  /// reduction construct.
+  Expr *getInitializer() { return Initializer; }
+  const Expr *getInitializer() const { return Initializer; }
+  /// \brief Set the initializer expression for the declare reduction construct.
+  void setInitializer(Expr *E) { Initializer = E; }
+
+  /// \brief Get the previous declare reduction construct in the same scope
+  /// with the same name.
+  OMPDeclareReductionDecl *getPrevDeclInScope();
+  const OMPDeclareReductionDecl *getPrevDeclInScope() const;
+
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == OMPDeclareReduction; }
+  static DeclContext *castToDeclContext(const OMPDeclareReductionDecl *D) {
+    return static_cast<DeclContext *>(const_cast<OMPDeclareReductionDecl *>(D));
+  }
+  static OMPDeclareReductionDecl *castFromDeclContext(const DeclContext *DC) {
+    return static_cast<OMPDeclareReductionDecl *>(
+        const_cast<DeclContext *>(DC));
+  }
+};
+
+/// Pseudo declaration for capturing expressions. It is also used for capturing
+/// non-static data members in non-static member functions.
+///
+/// Clang supports capturing of variables only, but OpenMP 4.5 allows
+/// privatizing non-static members of the current class in non-static member
+/// functions. This pseudo-declaration handles that kind of capture properly
+/// by wrapping the captured expression in a variable-like declaration.
+class OMPCapturedExprDecl final : public VarDecl {
+  friend class ASTDeclReader;
+  void anchor() override;
+
+  OMPCapturedExprDecl(ASTContext &C, DeclContext *DC, IdentifierInfo *Id,
+                      QualType Type)
+      : VarDecl(OMPCapturedExpr, C, DC, SourceLocation(), SourceLocation(), Id,
+                Type, nullptr, SC_None) {
+    setImplicit();
+  }
+
+public:
+  static OMPCapturedExprDecl *Create(ASTContext &C, DeclContext *DC,
+                                     IdentifierInfo *Id, QualType T);
+
+  static OMPCapturedExprDecl *CreateDeserialized(ASTContext &C, unsigned ID);
+
+  // Implement isa/cast/dyncast/etc.
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) { return K == OMPCapturedExpr; }
+};
+
+} // end namespace clang
 
 #endif
diff --git a/include/clang/AST/DeclTemplate.h b/include/clang/AST/DeclTemplate.h
index a9109ef..8671d95 100644
--- a/include/clang/AST/DeclTemplate.h
+++ b/include/clang/AST/DeclTemplate.h
@@ -21,7 +21,7 @@
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/TrailingObjects.h"
-#include <limits>
+#include <utility>
 
 namespace clang {
 
@@ -46,7 +46,8 @@
 /// \brief Stores a list of template parameters for a TemplateDecl and its
 /// derived classes.
 class TemplateParameterList final
-    : private llvm::TrailingObjects<TemplateParameterList, NamedDecl *> {
+    : private llvm::TrailingObjects<TemplateParameterList, NamedDecl *,
+                                    Expr *> {
 
   /// The location of the 'template' keyword.
   SourceLocation TemplateLoc;
@@ -56,26 +57,35 @@
 
   /// The number of template parameters in this template
   /// parameter list.
-  unsigned NumParams : 31;
+  unsigned NumParams : 30;
 
   /// Whether this template parameter list contains an unexpanded parameter
   /// pack.
   unsigned ContainsUnexpandedParameterPack : 1;
 
+  /// Whether this template parameter list has an associated requires-clause.
+  unsigned HasRequiresClause : 1;
+
 protected:
   size_t numTrailingObjects(OverloadToken<NamedDecl *>) const {
     return NumParams;
   }
 
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return HasRequiresClause;
+  }
+
   TemplateParameterList(SourceLocation TemplateLoc, SourceLocation LAngleLoc,
-                        ArrayRef<NamedDecl *> Params, SourceLocation RAngleLoc);
+                        ArrayRef<NamedDecl *> Params, SourceLocation RAngleLoc,
+                        Expr *RequiresClause);
 
 public:
   static TemplateParameterList *Create(const ASTContext &C,
                                        SourceLocation TemplateLoc,
                                        SourceLocation LAngleLoc,
                                        ArrayRef<NamedDecl *> Params,
-                                       SourceLocation RAngleLoc);
+                                       SourceLocation RAngleLoc,
+                                       Expr *RequiresClause);
 
   /// \brief Iterates through the template parameters in this list.
   typedef NamedDecl** iterator;
@@ -127,6 +137,16 @@
     return ContainsUnexpandedParameterPack;
   }
 
+  /// \brief The constraint-expression of the associated requires-clause.
+  Expr *getRequiresClause() {
+    return HasRequiresClause ? *getTrailingObjects<Expr *>() : nullptr;
+  }
+
+  /// \brief The constraint-expression of the associated requires-clause.
+  const Expr *getRequiresClause() const {
+    return HasRequiresClause ? *getTrailingObjects<Expr *>() : nullptr;
+  }
+
   SourceLocation getTemplateLoc() const { return TemplateLoc; }
   SourceLocation getLAngleLoc() const { return LAngleLoc; }
   SourceLocation getRAngleLoc() const { return RAngleLoc; }
@@ -136,36 +156,37 @@
   }
 
   friend TrailingObjects;
-  template <size_t N> friend class FixedSizeTemplateParameterListStorage;
+
+  template <size_t N, bool HasRequiresClause>
+  friend class FixedSizeTemplateParameterListStorage;
+
+public:
+  // FIXME: workaround for MSVC 2013; remove when no longer needed
+  using FixedSizeStorageOwner = TrailingObjects::FixedSizeStorageOwner;
 };
 
-/// \brief Stores a list of template parameters for a TemplateDecl and its
-/// derived classes. Suitable for creating on the stack.
-template <size_t N> class FixedSizeTemplateParameterListStorage {
-  // This is kinda ugly: TemplateParameterList usually gets allocated
-  // in a block of memory with NamedDecls appended to it. Here, to get
-  // it stack allocated, we include the params as a separate
-  // variable. After allocation, the TemplateParameterList object
-  // treats them as part of itself.
-  TemplateParameterList List;
-  NamedDecl *Params[N];
+/// \brief Stores a list of template parameters and the associated
+/// requires-clause (if any) for a TemplateDecl and its derived classes.
+/// Suitable for creating on the stack.
+template <size_t N, bool HasRequiresClause>
+class FixedSizeTemplateParameterListStorage
+    : public TemplateParameterList::FixedSizeStorageOwner {
+  typename TemplateParameterList::FixedSizeStorage<
+      NamedDecl *, Expr *>::with_counts<
+      N, HasRequiresClause ? 1u : 0u
+      >::type storage;
 
 public:
   FixedSizeTemplateParameterListStorage(SourceLocation TemplateLoc,
                                         SourceLocation LAngleLoc,
                                         ArrayRef<NamedDecl *> Params,
-                                        SourceLocation RAngleLoc)
-      : List(TemplateLoc, LAngleLoc, Params, RAngleLoc) {
-    // Because we're doing an evil layout hack above, have some
-    // asserts, just to double-check everything is laid out like
-    // expected.
-    assert(sizeof(*this) ==
-               TemplateParameterList::totalSizeToAlloc<NamedDecl *>(N) &&
-           "Object layout not as expected");
-    assert(this->Params == List.getTrailingObjects<NamedDecl *>() &&
-           "Object layout not as expected");
-  }
-  TemplateParameterList *get() { return &List; }
+                                        SourceLocation RAngleLoc,
+                                        Expr *RequiresClause)
+      : FixedSizeStorageOwner(
+            (assert(N == Params.size()),
+             assert(HasRequiresClause == static_cast<bool>(RequiresClause)),
+             new (static_cast<void *>(&storage)) TemplateParameterList(
+                 TemplateLoc, LAngleLoc, Params, RAngleLoc, RequiresClause))) {}
 };
 
 /// \brief A template argument list.
@@ -183,7 +204,7 @@
 
   // Constructs an instance with an internal Argument list, containing
   // a copy of the Args array. (Called by CreateCopy)
-  TemplateArgumentList(const TemplateArgument *Args, unsigned NumArgs);
+  TemplateArgumentList(ArrayRef<TemplateArgument> Args);
 
 public:
   /// \brief Type used to indicate that the template argument list itself is a
@@ -193,16 +214,14 @@
   /// \brief Create a new template argument list that copies the given set of
   /// template arguments.
   static TemplateArgumentList *CreateCopy(ASTContext &Context,
-                                          const TemplateArgument *Args,
-                                          unsigned NumArgs);
+                                          ArrayRef<TemplateArgument> Args);
 
   /// \brief Construct a new, temporary template argument list on the stack.
   ///
   /// The template argument list does not own the template arguments
   /// provided.
-  explicit TemplateArgumentList(OnStackType, const TemplateArgument *Args,
-                                unsigned NumArgs)
-      : Arguments(Args), NumArguments(NumArgs) {}
+  explicit TemplateArgumentList(OnStackType, ArrayRef<TemplateArgument> Args)
+      : Arguments(Args.data()), NumArguments(Args.size()) {}
 
   /// \brief Produces a shallow copy of the given template argument list.
   ///
@@ -332,32 +351,36 @@
   void anchor() override;
 protected:
   // This is probably never used.
-  TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L,
-               DeclarationName Name)
-    : NamedDecl(DK, DC, L, Name), TemplatedDecl(nullptr),
-      TemplateParams(nullptr) {}
+  TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L, DeclarationName Name)
+      : NamedDecl(DK, DC, L, Name), TemplatedDecl(nullptr, false),
+        TemplateParams(nullptr) {}
 
   // Construct a template decl with the given name and parameters.
   // Used when there is not templated element (tt-params).
-  TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L,
-               DeclarationName Name, TemplateParameterList *Params)
-    : NamedDecl(DK, DC, L, Name), TemplatedDecl(nullptr),
-      TemplateParams(Params) {}
+  TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L, DeclarationName Name,
+               TemplateParameterList *Params)
+      : NamedDecl(DK, DC, L, Name), TemplatedDecl(nullptr, false),
+        TemplateParams(Params) {}
 
   // Construct a template decl with name, parameters, and templated element.
-  TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L,
-               DeclarationName Name, TemplateParameterList *Params,
-               NamedDecl *Decl)
-    : NamedDecl(DK, DC, L, Name), TemplatedDecl(Decl),
-      TemplateParams(Params) { }
+  TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L, DeclarationName Name,
+               TemplateParameterList *Params, NamedDecl *Decl)
+      : NamedDecl(DK, DC, L, Name), TemplatedDecl(Decl, false),
+        TemplateParams(Params) {}
+
 public:
   /// Get the list of template parameters
   TemplateParameterList *getTemplateParameters() const {
     return TemplateParams;
   }
 
+  /// Get the constraint-expression from the associated requires-clause (if any)
+  const Expr *getRequiresClause() const {
+    return TemplateParams ? TemplateParams->getRequiresClause() : nullptr;
+  }
+
   /// Get the underlying, templated declaration.
-  NamedDecl *getTemplatedDecl() const { return TemplatedDecl; }
+  NamedDecl *getTemplatedDecl() const { return TemplatedDecl.getPointer(); }
 
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -367,20 +390,30 @@
 
   SourceRange getSourceRange() const override LLVM_READONLY {
     return SourceRange(TemplateParams->getTemplateLoc(),
-                       TemplatedDecl->getSourceRange().getEnd());
+                       TemplatedDecl.getPointer()->getSourceRange().getEnd());
   }
 
+  /// Whether this is a (C++ Concepts TS) function or variable concept.
+  bool isConcept() const { return TemplatedDecl.getInt(); }
+  void setConcept() { TemplatedDecl.setInt(true); }
+
 protected:
-  NamedDecl *TemplatedDecl;
+  /// \brief The underlying, templated declaration of this template
+  /// (or null).
+  ///
+  /// The boolean value will be true to indicate that this template
+  /// (function or variable) is a concept.
+  llvm::PointerIntPair<NamedDecl *, 1, bool> TemplatedDecl;
+
   TemplateParameterList* TemplateParams;
 
 public:
   /// \brief Initialize the underlying templated declaration and
   /// template parameters.
   void init(NamedDecl *templatedDecl, TemplateParameterList* templateParams) {
-    assert(!TemplatedDecl && "TemplatedDecl already set!");
+    assert(!TemplatedDecl.getPointer() && "TemplatedDecl already set!");
     assert(!TemplateParams && "TemplateParams already set!");
-    TemplatedDecl = templatedDecl;
+    TemplatedDecl.setPointer(templatedDecl);
     TemplateParams = templateParams;
   }
 };
@@ -481,8 +514,8 @@
   Profile(llvm::FoldingSetNodeID &ID, ArrayRef<TemplateArgument> TemplateArgs,
           ASTContext &Context) {
     ID.AddInteger(TemplateArgs.size());
-    for (unsigned Arg = 0; Arg != TemplateArgs.size(); ++Arg)
-      TemplateArgs[Arg].Profile(ID, Context);
+    for (const TemplateArgument &TemplateArg : TemplateArgs)
+      TemplateArg.Profile(ID, Context);
   }
 };
 
@@ -889,7 +922,7 @@
 
   /// Get the underlying function declaration of the template.
   FunctionDecl *getTemplatedDecl() const {
-    return static_cast<FunctionDecl*>(TemplatedDecl);
+    return static_cast<FunctionDecl *>(TemplatedDecl.getPointer());
   }
 
   /// Returns whether this template declaration defines the primary
@@ -1171,9 +1204,8 @@
                           SourceLocation IdLoc, unsigned D, unsigned P,
                           IdentifierInfo *Id, QualType T,
                           TypeSourceInfo *TInfo,
-                          const QualType *ExpandedTypes,
-                          unsigned NumExpandedTypes,
-                          TypeSourceInfo **ExpandedTInfos);
+                          ArrayRef<QualType> ExpandedTypes,
+                          ArrayRef<TypeSourceInfo *> ExpandedTInfos);
 
   friend class ASTDeclReader;
   friend TrailingObjects;
@@ -1187,9 +1219,8 @@
   static NonTypeTemplateParmDecl *
   Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc,
          SourceLocation IdLoc, unsigned D, unsigned P, IdentifierInfo *Id,
-         QualType T, TypeSourceInfo *TInfo,
-         const QualType *ExpandedTypes, unsigned NumExpandedTypes,
-         TypeSourceInfo **ExpandedTInfos);
+         QualType T, TypeSourceInfo *TInfo, ArrayRef<QualType> ExpandedTypes,
+         ArrayRef<TypeSourceInfo *> ExpandedTInfos);
 
   static NonTypeTemplateParmDecl *CreateDeserialized(ASTContext &C, 
                                                      unsigned ID);
@@ -1352,8 +1383,7 @@
   TemplateTemplateParmDecl(DeclContext *DC, SourceLocation L,
                            unsigned D, unsigned P,
                            IdentifierInfo *Id, TemplateParameterList *Params,
-                           unsigned NumExpansions,
-                           TemplateParameterList * const *Expansions);
+                           ArrayRef<TemplateParameterList *> Expansions);
 
 public:
   static TemplateTemplateParmDecl *Create(const ASTContext &C, DeclContext *DC,
@@ -1480,8 +1510,8 @@
 };
 
 /// \brief Represents the builtin template declaration which is used to
-/// implement __make_integer_seq.  It serves no real purpose beyond existing as
-/// a place to hold template parameters.
+/// implement __make_integer_seq and other builtin templates.  It serves
+/// no real purpose beyond existing as a place to hold template parameters.
 class BuiltinTemplateDecl : public TemplateDecl {
   void anchor() override;
 
@@ -1573,8 +1603,7 @@
                                   DeclContext *DC, SourceLocation StartLoc,
                                   SourceLocation IdLoc,
                                   ClassTemplateDecl *SpecializedTemplate,
-                                  const TemplateArgument *Args,
-                                  unsigned NumArgs,
+                                  ArrayRef<TemplateArgument> Args,
                                   ClassTemplateSpecializationDecl *PrevDecl);
 
   explicit ClassTemplateSpecializationDecl(ASTContext &C, Kind DK);
@@ -1584,8 +1613,7 @@
   Create(ASTContext &Context, TagKind TK, DeclContext *DC,
          SourceLocation StartLoc, SourceLocation IdLoc,
          ClassTemplateDecl *SpecializedTemplate,
-         const TemplateArgument *Args,
-         unsigned NumArgs,
+         ArrayRef<TemplateArgument> Args,
          ClassTemplateSpecializationDecl *PrevDecl);
   static ClassTemplateSpecializationDecl *
   CreateDeserialized(ASTContext &C, unsigned ID);
@@ -1762,8 +1790,8 @@
   Profile(llvm::FoldingSetNodeID &ID, ArrayRef<TemplateArgument> TemplateArgs,
           ASTContext &Context) {
     ID.AddInteger(TemplateArgs.size());
-    for (unsigned Arg = 0; Arg != TemplateArgs.size(); ++Arg)
-      TemplateArgs[Arg].Profile(ID, Context);
+    for (const TemplateArgument &TemplateArg : TemplateArgs)
+      TemplateArg.Profile(ID, Context);
   }
 
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -1801,8 +1829,7 @@
                                          SourceLocation IdLoc,
                                          TemplateParameterList *Params,
                                          ClassTemplateDecl *SpecializedTemplate,
-                                         const TemplateArgument *Args,
-                                         unsigned NumArgs,
+                                         ArrayRef<TemplateArgument> Args,
                                const ASTTemplateArgumentListInfo *ArgsAsWritten,
                                ClassTemplatePartialSpecializationDecl *PrevDecl);
 
@@ -1817,8 +1844,7 @@
          SourceLocation StartLoc, SourceLocation IdLoc,
          TemplateParameterList *Params,
          ClassTemplateDecl *SpecializedTemplate,
-         const TemplateArgument *Args,
-         unsigned NumArgs,
+         ArrayRef<TemplateArgument> Args,
          const TemplateArgumentListInfo &ArgInfos,
          QualType CanonInjectedType,
          ClassTemplatePartialSpecializationDecl *PrevDecl);
@@ -1867,6 +1893,10 @@
         cast<ClassTemplatePartialSpecializationDecl>(getFirstDecl());
     return First->InstantiatedFromMember.getPointer();
   }
+  ClassTemplatePartialSpecializationDecl *
+  getInstantiatedFromMemberTemplate() const {
+    return getInstantiatedFromMember();
+  }
 
   void setInstantiatedFromMember(
                           ClassTemplatePartialSpecializationDecl *PartialSpec) {
@@ -1982,7 +2012,7 @@
 
   /// \brief Get the underlying class declarations of the template.
   CXXRecordDecl *getTemplatedDecl() const {
-    return static_cast<CXXRecordDecl *>(TemplatedDecl);
+    return static_cast<CXXRecordDecl *>(TemplatedDecl.getPointer());
   }
 
   /// \brief Returns whether this template declaration defines the primary
@@ -2154,18 +2184,11 @@
   // Location of the 'friend' specifier.
   SourceLocation FriendLoc;
 
-
   FriendTemplateDecl(DeclContext *DC, SourceLocation Loc,
-                     unsigned NParams,
-                     TemplateParameterList **Params,
-                     FriendUnion Friend,
-                     SourceLocation FriendLoc)
-    : Decl(Decl::FriendTemplate, DC, Loc),
-      NumParams(NParams),
-      Params(Params),
-      Friend(Friend),
-      FriendLoc(FriendLoc)
-  {}
+                     MutableArrayRef<TemplateParameterList *> Params,
+                     FriendUnion Friend, SourceLocation FriendLoc)
+      : Decl(Decl::FriendTemplate, DC, Loc), NumParams(Params.size()),
+        Params(Params.data()), Friend(Friend), FriendLoc(FriendLoc) {}
 
   FriendTemplateDecl(EmptyShell Empty)
     : Decl(Decl::FriendTemplate, Empty),
@@ -2174,12 +2197,10 @@
   {}
 
 public:
-  static FriendTemplateDecl *Create(ASTContext &Context,
-                                    DeclContext *DC, SourceLocation Loc,
-                                    unsigned NParams,
-                                    TemplateParameterList **Params,
-                                    FriendUnion Friend,
-                                    SourceLocation FriendLoc);
+  static FriendTemplateDecl *
+  Create(ASTContext &Context, DeclContext *DC, SourceLocation Loc,
+         MutableArrayRef<TemplateParameterList *> Params, FriendUnion Friend,
+         SourceLocation FriendLoc);
 
   static FriendTemplateDecl *CreateDeserialized(ASTContext &C, unsigned ID);
 
@@ -2245,7 +2266,7 @@
 public:
   /// Get the underlying function declaration of the template.
   TypeAliasDecl *getTemplatedDecl() const {
-    return static_cast<TypeAliasDecl*>(TemplatedDecl);
+    return static_cast<TypeAliasDecl *>(TemplatedDecl.getPointer());
   }
 
 
@@ -2319,9 +2340,9 @@
   ClassScopeFunctionSpecializationDecl(DeclContext *DC, SourceLocation Loc,
                                        CXXMethodDecl *FD, bool Args,
                                        TemplateArgumentListInfo TemplArgs)
-    : Decl(Decl::ClassScopeFunctionSpecialization, DC, Loc),
-      Specialization(FD), HasExplicitTemplateArgs(Args),
-      TemplateArgs(TemplArgs) {}
+      : Decl(Decl::ClassScopeFunctionSpecialization, DC, Loc),
+        Specialization(FD), HasExplicitTemplateArgs(Args),
+        TemplateArgs(std::move(TemplArgs)) {}
 
   ClassScopeFunctionSpecializationDecl(EmptyShell Empty)
     : Decl(Decl::ClassScopeFunctionSpecialization, Empty) {}
@@ -2342,7 +2363,7 @@
                                                    bool HasExplicitTemplateArgs,
                                         TemplateArgumentListInfo TemplateArgs) {
     return new (C, DC) ClassScopeFunctionSpecializationDecl(
-        DC, Loc, FD, HasExplicitTemplateArgs, TemplateArgs);
+        DC, Loc, FD, HasExplicitTemplateArgs, std::move(TemplateArgs));
   }
 
   static ClassScopeFunctionSpecializationDecl *
@@ -2428,8 +2449,8 @@
                                 SourceLocation StartLoc, SourceLocation IdLoc,
                                 VarTemplateDecl *SpecializedTemplate,
                                 QualType T, TypeSourceInfo *TInfo,
-                                StorageClass S, const TemplateArgument *Args,
-                                unsigned NumArgs);
+                                StorageClass S,
+                                ArrayRef<TemplateArgument> Args);
 
   explicit VarTemplateSpecializationDecl(Kind DK, ASTContext &Context);
 
@@ -2437,8 +2458,8 @@
   static VarTemplateSpecializationDecl *
   Create(ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
          SourceLocation IdLoc, VarTemplateDecl *SpecializedTemplate, QualType T,
-         TypeSourceInfo *TInfo, StorageClass S, const TemplateArgument *Args,
-         unsigned NumArgs);
+         TypeSourceInfo *TInfo, StorageClass S,
+         ArrayRef<TemplateArgument> Args);
   static VarTemplateSpecializationDecl *CreateDeserialized(ASTContext &C,
                                                            unsigned ID);
 
@@ -2502,17 +2523,11 @@
   /// it was instantiated.
   llvm::PointerUnion<VarTemplateDecl *, VarTemplatePartialSpecializationDecl *>
   getInstantiatedFrom() const {
-    if (getSpecializationKind() != TSK_ImplicitInstantiation &&
-        getSpecializationKind() != TSK_ExplicitInstantiationDefinition &&
-        getSpecializationKind() != TSK_ExplicitInstantiationDeclaration)
+    if (!isTemplateInstantiation(getSpecializationKind()))
       return llvm::PointerUnion<VarTemplateDecl *,
                                 VarTemplatePartialSpecializationDecl *>();
 
-    if (SpecializedPartialSpecialization *PartialSpec =
-            SpecializedTemplate.dyn_cast<SpecializedPartialSpecialization *>())
-      return PartialSpec->PartialSpecialization;
-
-    return SpecializedTemplate.get<VarTemplateDecl *>();
+    return getSpecializedTemplateOrPartial();
   }
 
   /// \brief Retrieve the variable template or variable template partial
@@ -2610,8 +2625,8 @@
                       ArrayRef<TemplateArgument> TemplateArgs,
                       ASTContext &Context) {
     ID.AddInteger(TemplateArgs.size());
-    for (unsigned Arg = 0; Arg != TemplateArgs.size(); ++Arg)
-      TemplateArgs[Arg].Profile(ID, Context);
+    for (const TemplateArgument &TemplateArg : TemplateArgs)
+      TemplateArg.Profile(ID, Context);
   }
 
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -2647,7 +2662,7 @@
       ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
       SourceLocation IdLoc, TemplateParameterList *Params,
       VarTemplateDecl *SpecializedTemplate, QualType T, TypeSourceInfo *TInfo,
-      StorageClass S, const TemplateArgument *Args, unsigned NumArgs,
+      StorageClass S, ArrayRef<TemplateArgument> Args,
       const ASTTemplateArgumentListInfo *ArgInfos);
 
   VarTemplatePartialSpecializationDecl(ASTContext &Context)
@@ -2660,8 +2675,8 @@
   Create(ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
          SourceLocation IdLoc, TemplateParameterList *Params,
          VarTemplateDecl *SpecializedTemplate, QualType T,
-         TypeSourceInfo *TInfo, StorageClass S, const TemplateArgument *Args,
-         unsigned NumArgs, const TemplateArgumentListInfo &ArgInfos);
+         TypeSourceInfo *TInfo, StorageClass S, ArrayRef<TemplateArgument> Args,
+         const TemplateArgumentListInfo &ArgInfos);
 
   static VarTemplatePartialSpecializationDecl *CreateDeserialized(ASTContext &C,
                                                                   unsigned ID);
@@ -2808,7 +2823,7 @@
 
   /// \brief Get the underlying variable declarations of the template.
   VarDecl *getTemplatedDecl() const {
-    return static_cast<VarDecl *>(TemplatedDecl);
+    return static_cast<VarDecl *>(TemplatedDecl.getPointer());
   }
 
   /// \brief Returns whether this template declaration defines the primary
diff --git a/include/clang/AST/Expr.h b/include/clang/AST/Expr.h
index de28669..9179c77 100644
--- a/include/clang/AST/Expr.h
+++ b/include/clang/AST/Expr.h
@@ -29,6 +29,7 @@
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Compiler.h"
 
 namespace clang {
@@ -593,6 +594,13 @@
   bool EvaluateAsInt(llvm::APSInt &Result, const ASTContext &Ctx,
                      SideEffectsKind AllowSideEffects = SE_NoSideEffects) const;
 
+  /// EvaluateAsFloat - Return true if this is a constant which we can fold and
+  /// convert to a floating point value, using any crazy technique that we
+  /// want to.
+  bool
+  EvaluateAsFloat(llvm::APFloat &Result, const ASTContext &Ctx,
+                  SideEffectsKind AllowSideEffects = SE_NoSideEffects) const;
+
   /// isEvaluatable - Call EvaluateAsRValue to see if this expression can be
   /// constant folded without side-effects, but discard the result.
   bool isEvaluatable(const ASTContext &Ctx,
@@ -1112,6 +1120,10 @@
     return getTrailingObjects<ASTTemplateKWAndArgsInfo>()->NumTemplateArgs;
   }
 
+  ArrayRef<TemplateArgumentLoc> template_arguments() const {
+    return {getTemplateArgs(), getNumTemplateArgs()};
+  }
+
   /// \brief Returns true if this expression refers to a function that
   /// was resolved from an overloaded set having size greater than 1.
   bool hadMultipleCandidates() const {
@@ -2483,6 +2495,10 @@
     return getTrailingObjects<ASTTemplateKWAndArgsInfo>()->NumTemplateArgs;
   }
 
+  ArrayRef<TemplateArgumentLoc> template_arguments() const {
+    return {getTemplateArgs(), getNumTemplateArgs()};
+  }
+
   /// \brief Retrieve the member declaration name info.
   DeclarationNameInfo getMemberNameInfo() const {
     return DeclarationNameInfo(MemberDecl->getDeclName(),
@@ -3948,7 +3964,7 @@
 
   /// Whether this designated initializer used the GNU deprecated
   /// syntax rather than the C99 '=' syntax.
-  bool GNUSyntax : 1;
+  unsigned GNUSyntax : 1;
 
   /// The number of designators in this initializer expression.
   unsigned NumDesignators : 15;
@@ -3962,11 +3978,10 @@
   /// expression.
   Designator *Designators;
 
-
-  DesignatedInitExpr(const ASTContext &C, QualType Ty, unsigned NumDesignators,
-                     const Designator *Designators,
+  DesignatedInitExpr(const ASTContext &C, QualType Ty,
+                     llvm::ArrayRef<Designator> Designators,
                      SourceLocation EqualOrColonLoc, bool GNUSyntax,
-                     ArrayRef<Expr*> IndexExprs, Expr *Init);
+                     ArrayRef<Expr *> IndexExprs, Expr *Init);
 
   explicit DesignatedInitExpr(unsigned NumSubExprs)
     : Expr(DesignatedInitExprClass, EmptyShell()),
@@ -4126,8 +4141,7 @@
   };
 
   static DesignatedInitExpr *Create(const ASTContext &C,
-                                    Designator *Designators,
-                                    unsigned NumDesignators,
+                                    llvm::ArrayRef<Designator> Designators,
                                     ArrayRef<Expr*> IndexExprs,
                                     SourceLocation EqualOrColonLoc,
                                     bool GNUSyntax, Expr *Init);
@@ -4139,48 +4153,15 @@
   unsigned size() const { return NumDesignators; }
 
   // Iterator access to the designators.
-  typedef Designator *designators_iterator;
-  designators_iterator designators_begin() { return Designators; }
-  designators_iterator designators_end() {
-    return Designators + NumDesignators;
+  llvm::MutableArrayRef<Designator> designators() {
+    return {Designators, NumDesignators};
   }
 
-  typedef const Designator *const_designators_iterator;
-  const_designators_iterator designators_begin() const { return Designators; }
-  const_designators_iterator designators_end() const {
-    return Designators + NumDesignators;
+  llvm::ArrayRef<Designator> designators() const {
+    return {Designators, NumDesignators};
   }
 
-  typedef llvm::iterator_range<designators_iterator> designators_range;
-  designators_range designators() {
-    return designators_range(designators_begin(), designators_end());
-  }
-
-  typedef llvm::iterator_range<const_designators_iterator>
-          designators_const_range;
-  designators_const_range designators() const {
-    return designators_const_range(designators_begin(), designators_end());
-  }
-
-  typedef std::reverse_iterator<designators_iterator>
-          reverse_designators_iterator;
-  reverse_designators_iterator designators_rbegin() {
-    return reverse_designators_iterator(designators_end());
-  }
-  reverse_designators_iterator designators_rend() {
-    return reverse_designators_iterator(designators_begin());
-  }
-
-  typedef std::reverse_iterator<const_designators_iterator>
-          const_reverse_designators_iterator;
-  const_reverse_designators_iterator designators_rbegin() const {
-    return const_reverse_designators_iterator(designators_end());
-  }
-  const_reverse_designators_iterator designators_rend() const {
-    return const_reverse_designators_iterator(designators_begin());
-  }
-
-  Designator *getDesignator(unsigned Idx) { return &designators_begin()[Idx]; }
+  Designator *getDesignator(unsigned Idx) { return &designators()[Idx]; }
 
   void setDesignators(const ASTContext &C, const Designator *Desigs,
                       unsigned NumDesigs);
@@ -4830,16 +4811,6 @@
     BI_First = 0
   };
 
-  // The ABI values for various atomic memory orderings.
-  enum AtomicOrderingKind {
-    AO_ABI_memory_order_relaxed = 0,
-    AO_ABI_memory_order_consume = 1,
-    AO_ABI_memory_order_acquire = 2,
-    AO_ABI_memory_order_release = 3,
-    AO_ABI_memory_order_acq_rel = 4,
-    AO_ABI_memory_order_seq_cst = 5
-  };
-
 private:
   enum { PTR, ORDER, VAL1, ORDER_FAIL, VAL2, WEAK, END_EXPR };
   Stmt* SubExprs[END_EXPR];
diff --git a/include/clang/AST/ExprCXX.h b/include/clang/AST/ExprCXX.h
index 2b8c0ea..e3e4ca2 100644
--- a/include/clang/AST/ExprCXX.h
+++ b/include/clang/AST/ExprCXX.h
@@ -16,6 +16,7 @@
 #define LLVM_CLANG_AST_EXPRCXX_H
 
 #include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/LambdaCapture.h"
 #include "clang/AST/TemplateBase.h"
@@ -26,9 +27,6 @@
 
 namespace clang {
 
-class CXXConstructorDecl;
-class CXXDestructorDecl;
-class CXXMethodDecl;
 class CXXTemporary;
 class MSPropertyDecl;
 class TemplateArgumentListInfo;
@@ -145,6 +143,14 @@
   /// FIXME: Returns 0 for member pointer call exprs.
   CXXRecordDecl *getRecordDecl() const;
 
+  SourceLocation getExprLoc() const LLVM_READONLY {
+    SourceLocation CLoc = getCallee()->getExprLoc();
+    if (CLoc.isValid())
+      return CLoc;
+
+    return getLocStart();
+  }
+
   static bool classof(const Stmt *T) {
     return T->getStmtClass() == CXXMemberCallExprClass;
   }
@@ -778,22 +784,23 @@
 class CXXUuidofExpr : public Expr {
 private:
   llvm::PointerUnion<Stmt *, TypeSourceInfo *> Operand;
+  StringRef UuidStr;
   SourceRange Range;
 
 public:
-  CXXUuidofExpr(QualType Ty, TypeSourceInfo *Operand, SourceRange R)
-    : Expr(CXXUuidofExprClass, Ty, VK_LValue, OK_Ordinary,
-           false, Operand->getType()->isDependentType(),
-           Operand->getType()->isInstantiationDependentType(),
-           Operand->getType()->containsUnexpandedParameterPack()),
-      Operand(Operand), Range(R) { }
+  CXXUuidofExpr(QualType Ty, TypeSourceInfo *Operand, StringRef UuidStr,
+                SourceRange R)
+      : Expr(CXXUuidofExprClass, Ty, VK_LValue, OK_Ordinary, false,
+             Operand->getType()->isDependentType(),
+             Operand->getType()->isInstantiationDependentType(),
+             Operand->getType()->containsUnexpandedParameterPack()),
+        Operand(Operand), UuidStr(UuidStr), Range(R) {}
 
-  CXXUuidofExpr(QualType Ty, Expr *Operand, SourceRange R)
-    : Expr(CXXUuidofExprClass, Ty, VK_LValue, OK_Ordinary,
-           false, Operand->isTypeDependent(),
-           Operand->isInstantiationDependent(),
-           Operand->containsUnexpandedParameterPack()),
-      Operand(Operand), Range(R) { }
+  CXXUuidofExpr(QualType Ty, Expr *Operand, StringRef UuidStr, SourceRange R)
+      : Expr(CXXUuidofExprClass, Ty, VK_LValue, OK_Ordinary, false,
+             Operand->isTypeDependent(), Operand->isInstantiationDependent(),
+             Operand->containsUnexpandedParameterPack()),
+        Operand(Operand), UuidStr(UuidStr), Range(R) {}
 
   CXXUuidofExpr(EmptyShell Empty, bool isExpr)
     : Expr(CXXUuidofExprClass, Empty) {
@@ -830,7 +837,8 @@
     Operand = E;
   }
 
-  StringRef getUuidAsStringRef(ASTContext &Context) const;
+  void setUuidStr(StringRef US) { UuidStr = US; }
+  StringRef getUuidStr() const { return UuidStr; }
 
   SourceLocation getLocStart() const LLVM_READONLY { return Range.getBegin(); }
   SourceLocation getLocEnd() const LLVM_READONLY { return Range.getEnd(); }
@@ -841,11 +849,6 @@
     return T->getStmtClass() == CXXUuidofExprClass;
   }
 
-  /// Grabs __declspec(uuid()) off a type, or returns 0 if we cannot resolve to
-  /// a single GUID.
-  static const UuidAttr *GetUuidAttrOfType(QualType QT,
-                                           bool *HasMultipleGUIDsPtr = nullptr);
-
   // Iterators
   child_range children() {
     if (isTypeOperand())
@@ -1171,18 +1174,21 @@
   SourceLocation Loc;
   SourceRange ParenOrBraceRange;
   unsigned NumArgs : 16;
-  bool Elidable : 1;
-  bool HadMultipleCandidates : 1;
-  bool ListInitialization : 1;
-  bool StdInitListInitialization : 1;
-  bool ZeroInitialization : 1;
+  unsigned Elidable : 1;
+  unsigned HadMultipleCandidates : 1;
+  unsigned ListInitialization : 1;
+  unsigned StdInitListInitialization : 1;
+  unsigned ZeroInitialization : 1;
   unsigned ConstructKind : 2;
   Stmt **Args;
 
+  void setConstructor(CXXConstructorDecl *C) { Constructor = C; }
+
 protected:
   CXXConstructExpr(const ASTContext &C, StmtClass SC, QualType T,
                    SourceLocation Loc,
-                   CXXConstructorDecl *d, bool elidable,
+                   CXXConstructorDecl *Ctor,
+                   bool Elidable,
                    ArrayRef<Expr *> Args,
                    bool HadMultipleCandidates,
                    bool ListInitialization,
@@ -1201,15 +1207,12 @@
 public:
   /// \brief Construct an empty C++ construction expression.
   explicit CXXConstructExpr(EmptyShell Empty)
-    : Expr(CXXConstructExprClass, Empty), Constructor(nullptr),
-      NumArgs(0), Elidable(false), HadMultipleCandidates(false),
-      ListInitialization(false), ZeroInitialization(false),
-      ConstructKind(0), Args(nullptr)
-  { }
+    : CXXConstructExpr(CXXConstructExprClass, Empty) {}
 
   static CXXConstructExpr *Create(const ASTContext &C, QualType T,
                                   SourceLocation Loc,
-                                  CXXConstructorDecl *D, bool Elidable,
+                                  CXXConstructorDecl *Ctor,
+                                  bool Elidable,
                                   ArrayRef<Expr *> Args,
                                   bool HadMultipleCandidates,
                                   bool ListInitialization,
@@ -1218,8 +1221,8 @@
                                   ConstructionKind ConstructKind,
                                   SourceRange ParenOrBraceRange);
 
+  /// \brief Get the constructor that this expression will (ultimately) call.
   CXXConstructorDecl *getConstructor() const { return Constructor; }
-  void setConstructor(CXXConstructorDecl *C) { Constructor = C; }
 
   SourceLocation getLocation() const { return Loc; }
   void setLocation(SourceLocation Loc) { this->Loc = Loc; }
@@ -1315,6 +1318,73 @@
   friend class ASTStmtReader;
 };
 
+/// \brief Represents a call to an inherited base class constructor from an
+/// inheriting constructor. This call implicitly forwards the arguments from
+/// the enclosing context (an inheriting constructor) to the specified inherited
+/// base class constructor.
+class CXXInheritedCtorInitExpr : public Expr {
+private:
+  CXXConstructorDecl *Constructor;
+
+  /// The location of the using declaration.
+  SourceLocation Loc;
+
+  /// Whether this is the construction of a virtual base.
+  unsigned ConstructsVirtualBase : 1;
+
+  /// Whether the constructor is inherited from a virtual base class of the
+  /// class that we construct.
+  unsigned InheritedFromVirtualBase : 1;
+
+public:
+  /// \brief Construct a C++ inheriting construction expression.
+  CXXInheritedCtorInitExpr(SourceLocation Loc, QualType T,
+                           CXXConstructorDecl *Ctor, bool ConstructsVirtualBase,
+                           bool InheritedFromVirtualBase)
+      : Expr(CXXInheritedCtorInitExprClass, T, VK_RValue, OK_Ordinary, false,
+             false, false, false),
+        Constructor(Ctor), Loc(Loc),
+        ConstructsVirtualBase(ConstructsVirtualBase),
+        InheritedFromVirtualBase(InheritedFromVirtualBase) {
+    assert(!T->isDependentType());
+  }
+
+  /// \brief Construct an empty C++ inheriting construction expression.
+  explicit CXXInheritedCtorInitExpr(EmptyShell Empty)
+      : Expr(CXXInheritedCtorInitExprClass, Empty), Constructor(nullptr),
+        ConstructsVirtualBase(false), InheritedFromVirtualBase(false) {}
+
+  /// \brief Get the constructor that this expression will call.
+  CXXConstructorDecl *getConstructor() const { return Constructor; }
+
+  /// \brief Determine whether this expression constructs a virtual base
+  /// class (rather than a non-virtual base class).
+  bool constructsVBase() const { return ConstructsVirtualBase; }
+  CXXConstructExpr::ConstructionKind getConstructionKind() const {
+    return ConstructsVirtualBase ? CXXConstructExpr::CK_VirtualBase
+                                 : CXXConstructExpr::CK_NonVirtualBase;
+  }
+
+  /// \brief Determine whether the inherited constructor is inherited from a
+  /// virtual base of the object we construct. If so, we are not responsible
+  /// for calling the inherited constructor (the complete object constructor
+  /// does that), and so we don't need to pass any arguments.
+  bool inheritedFromVBase() const { return InheritedFromVirtualBase; }
+
+  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
+  SourceLocation getLocStart() const LLVM_READONLY { return Loc; }
+  SourceLocation getLocEnd() const LLVM_READONLY { return Loc; }
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == CXXInheritedCtorInitExprClass;
+  }
+  child_range children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+
+  friend class ASTStmtReader;
+};
+
 /// \brief Represents an explicit C++ type conversion that uses "functional"
 /// notation (C++ [expr.type.conv]).
 ///
@@ -1385,7 +1455,8 @@
   TypeSourceInfo *Type;
 
 public:
-  CXXTemporaryObjectExpr(const ASTContext &C, CXXConstructorDecl *Cons,
+  CXXTemporaryObjectExpr(const ASTContext &C,
+                         CXXConstructorDecl *Cons,
                          TypeSourceInfo *Type,
                          ArrayRef<Expr *> Args,
                          SourceRange ParenOrBraceRange,
@@ -1754,12 +1825,12 @@
   SourceRange DirectInitRange;
 
   /// Was the usage ::new, i.e. is the global new to be used?
-  bool GlobalNew : 1;
+  unsigned GlobalNew : 1;
   /// Do we allocate an array? If so, the first SubExpr is the size expression.
-  bool Array : 1;
+  unsigned Array : 1;
   /// If this is an array allocation, does the usual deallocation
   /// function for the allocated type want to know the allocated size?
-  bool UsualArrayDeleteWantsSize : 1;
+  unsigned UsualArrayDeleteWantsSize : 1;
   /// The number of placement new arguments.
   unsigned NumPlacementArgs : 13;
   /// What kind of initializer do we have? Could be none, parens, or braces.
@@ -2358,7 +2429,7 @@
   /// \brief The trait. A ExpressionTrait enum in MSVC compatible unsigned.
   unsigned ET : 31;
   /// \brief The value of the type trait. Unspecified if dependent.
-  bool Value : 1;
+  unsigned Value : 1;
 
   /// \brief The location of the type trait keyword.
   SourceLocation Loc;
@@ -2567,6 +2638,10 @@
     return getTrailingASTTemplateKWAndArgsInfo()->NumTemplateArgs;
   }
 
+  ArrayRef<TemplateArgumentLoc> template_arguments() const {
+    return {getTemplateArgs(), getNumTemplateArgs()};
+  }
+
   /// \brief Copies the template arguments into the given structure.
   void copyTemplateArgumentsInto(TemplateArgumentListInfo &List) const {
     if (hasExplicitTemplateArgs())
@@ -2820,6 +2895,10 @@
     return getTrailingObjects<ASTTemplateKWAndArgsInfo>()->NumTemplateArgs;
   }
 
+  ArrayRef<TemplateArgumentLoc> template_arguments() const {
+    return {getTemplateArgs(), getNumTemplateArgs()};
+  }
+
   /// Note: getLocStart() is the start of the whole DependentScopeDeclRefExpr,
   /// and differs from getLocation().getStart().
   SourceLocation getLocStart() const LLVM_READONLY {
@@ -2868,7 +2947,8 @@
   Stmt *SubExpr;
 
   ExprWithCleanups(EmptyShell, unsigned NumObjects);
-  ExprWithCleanups(Expr *SubExpr, ArrayRef<CleanupObject> Objects);
+  ExprWithCleanups(Expr *SubExpr, bool CleanupsHaveSideEffects,
+                   ArrayRef<CleanupObject> Objects);
 
   friend TrailingObjects;
   friend class ASTStmtReader;
@@ -2878,6 +2958,7 @@
                                   unsigned numObjects);
 
   static ExprWithCleanups *Create(const ASTContext &C, Expr *subexpr,
+                                  bool CleanupsHaveSideEffects,
                                   ArrayRef<CleanupObject> objects);
 
   ArrayRef<CleanupObject> getObjects() const {
@@ -2894,6 +2975,9 @@
 
   Expr *getSubExpr() { return cast<Expr>(SubExpr); }
   const Expr *getSubExpr() const { return cast<Expr>(SubExpr); }
+  bool cleanupsHaveSideEffects() const {
+    return ExprWithCleanupsBits.CleanupsHaveSideEffects;
+  }
 
   /// As with any mutator of the AST, be very careful
   /// when modifying an existing AST to preserve its invariants.
@@ -3230,6 +3314,10 @@
     return getTrailingObjects<ASTTemplateKWAndArgsInfo>()->NumTemplateArgs;
   }
 
+  ArrayRef<TemplateArgumentLoc> template_arguments() const {
+    return {getTemplateArgs(), getNumTemplateArgs()};
+  }
+
   SourceLocation getLocStart() const LLVM_READONLY {
     if (!isImplicitAccess())
       return Base->getLocStart();
@@ -3923,6 +4011,12 @@
     // within a default initializer.
     if (isa<FieldDecl>(ExtendingDecl))
       return SD_Automatic;
+    // FIXME: This only works because storage class specifiers are not allowed
+    // on decomposition declarations.
+    if (isa<BindingDecl>(ExtendingDecl))
+      return ExtendingDecl->getDeclContext()->isFunctionOrMethod()
+                 ? SD_Automatic
+                 : SD_Static;
     return cast<VarDecl>(ExtendingDecl)->getStorageDuration();
   }
 
diff --git a/include/clang/AST/ExprObjC.h b/include/clang/AST/ExprObjC.h
index 61e6383..5f9623d 100644
--- a/include/clang/AST/ExprObjC.h
+++ b/include/clang/AST/ExprObjC.h
@@ -1562,7 +1562,52 @@
     return T->getStmtClass() == ObjCBridgedCastExprClass;
   }
 };
-  
+
+/// \brief A runtime availability query.
+///
+/// There are 2 ways to spell this node:
+/// \code
+///   @available(macos 10.10, ios 8, *); // Objective-C
+///   __builtin_available(macos 10.10, ios 8, *); // C, C++, and Objective-C
+/// \endcode
+///
+/// Note that we only need to keep track of one \c VersionTuple here, which is
+/// the one that corresponds to the current deployment target. This is meant to
+/// be used in the condition of an \c if, but it is also usable as a top-level
+/// expression.
+///
+class ObjCAvailabilityCheckExpr : public Expr {
+  VersionTuple VersionToCheck;
+  SourceLocation AtLoc, RParen;
+
+  friend class ASTStmtReader;
+public:
+  ObjCAvailabilityCheckExpr(VersionTuple VersionToCheck, SourceLocation AtLoc,
+                            SourceLocation RParen, QualType Ty)
+      : Expr(ObjCAvailabilityCheckExprClass, Ty, VK_RValue, OK_Ordinary, false,
+             false, false, false),
+        VersionToCheck(VersionToCheck), AtLoc(AtLoc), RParen(RParen) {}
+
+  explicit ObjCAvailabilityCheckExpr(EmptyShell Shell)
+      : Expr(ObjCAvailabilityCheckExprClass, Shell) {}
+
+  SourceLocation getLocStart() const { return AtLoc; }
+  SourceLocation getLocEnd() const { return RParen; }
+  SourceRange getSourceRange() const { return {AtLoc, RParen}; }
+
+  /// \brief This may be '*', in which case this should fold to true.
+  bool hasVersion() const { return !VersionToCheck.empty(); }
+  VersionTuple getVersion() { return VersionToCheck; }
+
+  child_range children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == ObjCAvailabilityCheckExprClass;
+  }
+};
+
 }  // end namespace clang
 
 #endif
diff --git a/include/clang/AST/ExprOpenMP.h b/include/clang/AST/ExprOpenMP.h
index 2d71a3a..a4ef93b 100644
--- a/include/clang/AST/ExprOpenMP.h
+++ b/include/clang/AST/ExprOpenMP.h
@@ -85,7 +85,7 @@
   void setBase(Expr *E) { SubExprs[BASE] = E; }
 
   /// \brief Return original type of the base expression for array section.
-  static QualType getBaseOriginalType(Expr *Base);
+  static QualType getBaseOriginalType(const Expr *Base);
 
   /// \brief Get lower bound of array section.
   Expr *getLowerBound() { return cast_or_null<Expr>(SubExprs[LOWER_BOUND]); }
diff --git a/include/clang/AST/ExternalASTSource.h b/include/clang/AST/ExternalASTSource.h
index 81cf631..2e99f39 100644
--- a/include/clang/AST/ExternalASTSource.h
+++ b/include/clang/AST/ExternalASTSource.h
@@ -503,8 +503,9 @@
   /// We define this as a wrapping iterator around an int. The
   /// iterator_adaptor_base class forwards the iterator methods to basic integer
   /// arithmetic.
-  class iterator : public llvm::iterator_adaptor_base<
-                       iterator, int, std::random_access_iterator_tag, T, int> {
+  class iterator
+      : public llvm::iterator_adaptor_base<
+            iterator, int, std::random_access_iterator_tag, T, int, T *, T &> {
     LazyVector *Self;
 
     iterator(LazyVector *Self, int Position)
diff --git a/include/clang/AST/GlobalDecl.h b/include/clang/AST/GlobalDecl.h
index 54c9d88..adf63a3 100644
--- a/include/clang/AST/GlobalDecl.h
+++ b/include/clang/AST/GlobalDecl.h
@@ -17,6 +17,7 @@
 
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/Basic/ABI.h"
 
 namespace clang {
@@ -43,6 +44,7 @@
   GlobalDecl(const BlockDecl *D) { Init(D); }
   GlobalDecl(const CapturedDecl *D) { Init(D); }
   GlobalDecl(const ObjCMethodDecl *D) { Init(D); }
+  GlobalDecl(const OMPDeclareReductionDecl *D) { Init(D); }
 
   GlobalDecl(const CXXConstructorDecl *D, CXXCtorType Type)
   : Value(D, Type) {}
diff --git a/include/clang/AST/LambdaCapture.h b/include/clang/AST/LambdaCapture.h
index ddefa88..6517d65 100644
--- a/include/clang/AST/LambdaCapture.h
+++ b/include/clang/AST/LambdaCapture.h
@@ -33,10 +33,22 @@
     /// given capture was by-copy.
     ///
     /// This includes the case of a non-reference init-capture.
-    Capture_ByCopy = 0x02
+    Capture_ByCopy = 0x02,
+
+    /// \brief Flag used by the Capture class to distinguish between a capture
+    /// of '*this' and a capture of a VLA type.
+    Capture_This = 0x04
   };
 
-  llvm::PointerIntPair<Decl *, 2> DeclAndBits;
+  // Decl could represent:
+  // - a VarDecl* that represents the variable that was captured or the 
+  //   init-capture.
+  // - or, is a nullptr and Capture_This is set in Bits if this represents a
+  //   capture of '*this' by value or reference.
+  // - or, is a nullptr and Capture_This is not set in Bits if this represents
+  //   a capture of a VLA type.
+  llvm::PointerIntPair<Decl*, 3> DeclAndBits;
+
   SourceLocation Loc;
   SourceLocation EllipsisLoc;
 
@@ -69,8 +81,8 @@
   /// \brief Determine whether this capture handles the C++ \c this
   /// pointer.
   bool capturesThis() const {
-    return (DeclAndBits.getPointer() == nullptr) &&
-           !(DeclAndBits.getInt() & Capture_ByCopy);
+    return DeclAndBits.getPointer() == nullptr &&
+          (DeclAndBits.getInt() & Capture_This);
   }
 
   /// \brief Determine whether this capture handles a variable.
@@ -81,8 +93,8 @@
   /// \brief Determine whether this captures a variable length array bound
   /// expression.
   bool capturesVLAType() const {
-    return (DeclAndBits.getPointer() == nullptr) &&
-           (DeclAndBits.getInt() & Capture_ByCopy);
+    return DeclAndBits.getPointer() == nullptr &&
+           !(DeclAndBits.getInt() & Capture_This);
   }
 
   /// \brief Retrieve the declaration of the local variable being
@@ -91,13 +103,15 @@
   /// This operation is only valid if this capture is a variable capture
   /// (other than a capture of \c this).
   VarDecl *getCapturedVar() const {
-    assert(capturesVariable() && "No variable available for 'this' capture");
-    return cast<VarDecl>(DeclAndBits.getPointer());
+    assert(capturesVariable() && "No variable available for capture");
+    return static_cast<VarDecl *>(DeclAndBits.getPointer());
   }
 
   /// \brief Determine whether this was an implicit capture (not
   /// written between the square brackets introducing the lambda).
-  bool isImplicit() const { return DeclAndBits.getInt() & Capture_Implicit; }
+  bool isImplicit() const {
+    return DeclAndBits.getInt() & Capture_Implicit;
+  }
 
   /// \brief Determine whether this was an explicit capture (written
   /// between the square brackets introducing the lambda).
diff --git a/include/clang/AST/LocInfoType.h b/include/clang/AST/LocInfoType.h
new file mode 100644
index 0000000..7e573bd
--- /dev/null
+++ b/include/clang/AST/LocInfoType.h
@@ -0,0 +1,61 @@
+//===--- LocInfoType.h - Parsed Type with Location Information---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LocInfoType class, which holds a type and its
+// source-location information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_SEMA_LOCINFOTYPE_H
+#define LLVM_CLANG_SEMA_LOCINFOTYPE_H
+
+#include "clang/AST/Type.h"
+
+namespace clang {
+
+class TypeSourceInfo;
+
+/// \brief Holds a QualType and a TypeSourceInfo* that came out of a declarator
+/// parsing.
+///
+/// LocInfoType is a "transient" type, only needed for passing to/from Parser
+/// and Sema, when we want to preserve type source info for a parsed type.
+/// It will not participate in the type system semantics in any way.
+class LocInfoType : public Type {
+  enum {
+    // The last number that can fit in Type's TC.
+    // Avoids conflict with an existing Type class.
+    LocInfo = Type::TypeLast + 1
+  };
+
+  TypeSourceInfo *DeclInfo;
+
+  LocInfoType(QualType ty, TypeSourceInfo *TInfo)
+      : Type((TypeClass)LocInfo, ty, ty->isDependentType(),
+             ty->isInstantiationDependentType(), ty->isVariablyModifiedType(),
+             ty->containsUnexpandedParameterPack()),
+        DeclInfo(TInfo) {
+    assert(getTypeClass() == (TypeClass)LocInfo && "LocInfo didn't fit in TC?");
+  }
+  friend class Sema;
+
+public:
+  QualType getType() const { return getCanonicalTypeInternal(); }
+  TypeSourceInfo *getTypeSourceInfo() const { return DeclInfo; }
+
+  void getAsStringInternal(std::string &Str,
+                           const PrintingPolicy &Policy) const;
+
+  static bool classof(const Type *T) {
+    return T->getTypeClass() == (TypeClass)LocInfo;
+  }
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_SEMA_LOCINFOTYPE_H
diff --git a/include/clang/AST/Makefile b/include/clang/AST/Makefile
deleted file mode 100644
index 85e6449..0000000
--- a/include/clang/AST/Makefile
+++ /dev/null
@@ -1,79 +0,0 @@
-CLANG_LEVEL := ../../..
-TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
-BUILT_SOURCES = Attrs.inc AttrImpl.inc AttrDump.inc AttrVisitor.inc \
-                StmtNodes.inc DeclNodes.inc \
-                CommentNodes.inc CommentHTMLTags.inc \
-                CommentHTMLTagsProperties.inc \
-                CommentHTMLNamedCharacterReferences.inc \
-                CommentCommandInfo.inc \
-                CommentCommandList.inc
-
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-$(ObjDir)/Attrs.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute classes with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-classes -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/AttrImpl.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute implementations with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-impl -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/AttrDump.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute dumper with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-dump -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/AttrVisitor.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                                $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute AST visitor with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-ast-visitor -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/StmtNodes.inc.tmp : $(TD_SRC_DIR)/StmtNodes.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang statement node tables with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-stmt-nodes -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/DeclNodes.inc.tmp : $(TD_SRC_DIR)/DeclNodes.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang declaration node tables with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-decl-nodes -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/CommentNodes.inc.tmp : $(TD_SRC_DIR)/CommentNodes.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang comment node tables with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-comment-nodes -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/CommentHTMLTags.inc.tmp : $(PROJ_SRC_DIR)/CommentHTMLTags.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang comment HTML tag matchers with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-comment-html-tags -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/CommentHTMLTagsProperties.inc.tmp : $(PROJ_SRC_DIR)/CommentHTMLTags.td \
-                                              $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang comment HTML tag properties with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-comment-html-tags-properties -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/CommentHTMLNamedCharacterReferences.inc.tmp : \
-                    $(PROJ_SRC_DIR)/CommentHTMLNamedCharacterReferences.td \
-                    $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang named character reference translation function with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-comment-html-named-character-references -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/CommentCommandInfo.inc.tmp : $(PROJ_SRC_DIR)/CommentCommands.td \
-                                              $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang comment command info with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-comment-command-info -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/CommentCommandList.inc.tmp : $(PROJ_SRC_DIR)/CommentCommands.td \
-                                              $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang list of comment commands with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-comment-command-list -o $(call SYSPATH, $@) $<
-
diff --git a/include/clang/AST/Mangle.h b/include/clang/AST/Mangle.h
index dad9269..7a45d88 100644
--- a/include/clang/AST/Mangle.h
+++ b/include/clang/AST/Mangle.h
@@ -17,10 +17,11 @@
 #include "clang/AST/Type.h"
 #include "clang/Basic/ABI.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  class raw_ostream;
+}
 
 namespace clang {
   class ASTContext;
@@ -207,7 +208,8 @@
                                                raw_ostream &Out) = 0;
 
   virtual void mangleCXXThrowInfo(QualType T, bool IsConst, bool IsVolatile,
-                                  uint32_t NumEntries, raw_ostream &Out) = 0;
+                                  bool IsUnaligned, uint32_t NumEntries,
+                                  raw_ostream &Out) = 0;
 
   virtual void mangleCXXCatchableTypeArray(QualType T, uint32_t NumEntries,
                                            raw_ostream &Out) = 0;
diff --git a/include/clang/AST/MangleNumberingContext.h b/include/clang/AST/MangleNumberingContext.h
index 7a81855..db26008 100644
--- a/include/clang/AST/MangleNumberingContext.h
+++ b/include/clang/AST/MangleNumberingContext.h
@@ -16,7 +16,6 @@
 #define LLVM_CLANG_AST_MANGLENUMBERINGCONTEXT_H
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 
 namespace clang {
diff --git a/include/clang/AST/OpenMPClause.h b/include/clang/AST/OpenMPClause.h
index 1275b7f..3e4c4bc 100644
--- a/include/clang/AST/OpenMPClause.h
+++ b/include/clang/AST/OpenMPClause.h
@@ -70,6 +70,51 @@
   static bool classof(const OMPClause *) { return true; }
 };
 
+/// Class that handles pre-initialization statement for some clauses, like
+/// 'schedule', 'firstprivate' etc.
+class OMPClauseWithPreInit {
+  friend class OMPClauseReader;
+  /// Pre-initialization statement for the clause.
+  Stmt *PreInit;
+protected:
+  /// Set pre-initialization statement for the clause.
+  void setPreInitStmt(Stmt *S) { PreInit = S; }
+  OMPClauseWithPreInit(const OMPClause *This) : PreInit(nullptr) {
+    assert(get(This) && "get is not tuned for pre-init.");
+  }
+
+public:
+  /// Get pre-initialization statement for the clause.
+  const Stmt *getPreInitStmt() const { return PreInit; }
+  /// Get pre-initialization statement for the clause.
+  Stmt *getPreInitStmt() { return PreInit; }
+  static OMPClauseWithPreInit *get(OMPClause *C);
+  static const OMPClauseWithPreInit *get(const OMPClause *C);
+};
+
+/// Class that handles post-update expression for some clauses, like
+/// 'lastprivate', 'reduction' etc.
+class OMPClauseWithPostUpdate : public OMPClauseWithPreInit {
+  friend class OMPClauseReader;
+  /// Post-update expression for the clause.
+  Expr *PostUpdate;
+protected:
+  /// Set post-update expression for the clause.
+  void setPostUpdateExpr(Expr *S) { PostUpdate = S; }
+  OMPClauseWithPostUpdate(const OMPClause *This)
+      : OMPClauseWithPreInit(This), PostUpdate(nullptr) {
+    assert(get(This) && "get is not tuned for post-update.");
+  }
+
+public:
+  /// Get post-update expression for the clause.
+  const Expr *getPostUpdateExpr() const { return PostUpdate; }
+  /// Get post-update expression for the clause.
+  Expr *getPostUpdateExpr() { return PostUpdate; }
+  static OMPClauseWithPostUpdate *get(OMPClause *C);
+  static const OMPClauseWithPostUpdate *get(const OMPClause *C);
+};
+
 /// \brief This represents clauses with the list of variables like 'private',
 /// 'firstprivate', 'copyin', 'shared', or 'reduction' clauses in the
 /// '#pragma omp ...' directives.
@@ -650,7 +695,7 @@
 /// In this example directive '#pragma omp for' has 'schedule' clause with
 /// arguments 'static' and '3'.
 ///
-class OMPScheduleClause : public OMPClause {
+class OMPScheduleClause : public OMPClause, public OMPClauseWithPreInit {
   friend class OMPClauseReader;
   /// \brief Location of '('.
   SourceLocation LParenLoc;
@@ -665,10 +710,8 @@
   SourceLocation KindLoc;
   /// \brief Location of ',' (if any).
   SourceLocation CommaLoc;
-  /// \brief Chunk size and a reference to pseudo variable for combined
-  /// directives.
-  enum { CHUNK_SIZE, HELPER_CHUNK_SIZE, NUM_EXPRS };
-  Stmt *ChunkSizes[NUM_EXPRS];
+  /// \brief Chunk size.
+  Expr *ChunkSize;
 
   /// \brief Set schedule kind.
   ///
@@ -730,12 +773,7 @@
   ///
   /// \param E Chunk size.
   ///
-  void setChunkSize(Expr *E) { ChunkSizes[CHUNK_SIZE] = E; }
-  /// \brief Set helper chunk size.
-  ///
-  /// \param E Helper chunk size.
-  ///
-  void setHelperChunkSize(Expr *E) { ChunkSizes[HELPER_CHUNK_SIZE] = E; }
+  void setChunkSize(Expr *E) { ChunkSize = E; }
 
 public:
   /// \brief Build 'schedule' clause with schedule kind \a Kind and chunk size
@@ -757,13 +795,13 @@
   OMPScheduleClause(SourceLocation StartLoc, SourceLocation LParenLoc,
                     SourceLocation KLoc, SourceLocation CommaLoc,
                     SourceLocation EndLoc, OpenMPScheduleClauseKind Kind,
-                    Expr *ChunkSize, Expr *HelperChunkSize,
+                    Expr *ChunkSize, Stmt *HelperChunkSize,
                     OpenMPScheduleClauseModifier M1, SourceLocation M1Loc,
                     OpenMPScheduleClauseModifier M2, SourceLocation M2Loc)
-      : OMPClause(OMPC_schedule, StartLoc, EndLoc), LParenLoc(LParenLoc),
-        Kind(Kind), KindLoc(KLoc), CommaLoc(CommaLoc) {
-    ChunkSizes[CHUNK_SIZE] = ChunkSize;
-    ChunkSizes[HELPER_CHUNK_SIZE] = HelperChunkSize;
+      : OMPClause(OMPC_schedule, StartLoc, EndLoc), OMPClauseWithPreInit(this),
+        LParenLoc(LParenLoc), Kind(Kind), KindLoc(KLoc), CommaLoc(CommaLoc),
+        ChunkSize(ChunkSize) {
+    setPreInitStmt(HelperChunkSize);
     Modifiers[FIRST] = M1;
     Modifiers[SECOND] = M2;
     ModifiersLoc[FIRST] = M1Loc;
@@ -774,9 +812,8 @@
   ///
   explicit OMPScheduleClause()
       : OMPClause(OMPC_schedule, SourceLocation(), SourceLocation()),
-        Kind(OMPC_SCHEDULE_unknown) {
-    ChunkSizes[CHUNK_SIZE] = nullptr;
-    ChunkSizes[HELPER_CHUNK_SIZE] = nullptr;
+        OMPClauseWithPreInit(this), Kind(OMPC_SCHEDULE_unknown),
+        ChunkSize(nullptr) {
     Modifiers[FIRST] = OMPC_SCHEDULE_MODIFIER_unknown;
     Modifiers[SECOND] = OMPC_SCHEDULE_MODIFIER_unknown;
   }
@@ -815,29 +852,18 @@
   SourceLocation getCommaLoc() { return CommaLoc; }
   /// \brief Get chunk size.
   ///
-  Expr *getChunkSize() { return dyn_cast_or_null<Expr>(ChunkSizes[CHUNK_SIZE]); }
+  Expr *getChunkSize() { return ChunkSize; }
   /// \brief Get chunk size.
   ///
-  Expr *getChunkSize() const {
-    return dyn_cast_or_null<Expr>(ChunkSizes[CHUNK_SIZE]);
-  }
-  /// \brief Get helper chunk size.
-  ///
-  Expr *getHelperChunkSize() {
-    return dyn_cast_or_null<Expr>(ChunkSizes[HELPER_CHUNK_SIZE]);
-  }
-  /// \brief Get helper chunk size.
-  ///
-  Expr *getHelperChunkSize() const {
-    return dyn_cast_or_null<Expr>(ChunkSizes[HELPER_CHUNK_SIZE]);
-  }
+  const Expr *getChunkSize() const { return ChunkSize; }
 
   static bool classof(const OMPClause *T) {
     return T->getClauseKind() == OMPC_schedule;
   }
 
   child_range children() {
-    return child_range(&ChunkSizes[CHUNK_SIZE], &ChunkSizes[CHUNK_SIZE] + 1);
+    return child_range(reinterpret_cast<Stmt **>(&ChunkSize),
+                       reinterpret_cast<Stmt **>(&ChunkSize) + 1);
   }
 };
 
@@ -1250,6 +1276,7 @@
 ///
 class OMPFirstprivateClause final
     : public OMPVarListClause<OMPFirstprivateClause>,
+      public OMPClauseWithPreInit,
       private llvm::TrailingObjects<OMPFirstprivateClause, Expr *> {
   friend TrailingObjects;
   friend OMPVarListClause;
@@ -1265,7 +1292,8 @@
   OMPFirstprivateClause(SourceLocation StartLoc, SourceLocation LParenLoc,
                         SourceLocation EndLoc, unsigned N)
       : OMPVarListClause<OMPFirstprivateClause>(OMPC_firstprivate, StartLoc,
-                                                LParenLoc, EndLoc, N) {}
+                                                LParenLoc, EndLoc, N),
+        OMPClauseWithPreInit(this) {}
 
   /// \brief Build an empty clause.
   ///
@@ -1274,7 +1302,8 @@
   explicit OMPFirstprivateClause(unsigned N)
       : OMPVarListClause<OMPFirstprivateClause>(
             OMPC_firstprivate, SourceLocation(), SourceLocation(),
-            SourceLocation(), N) {}
+            SourceLocation(), N),
+        OMPClauseWithPreInit(this) {}
   /// \brief Sets the list of references to private copies with initializers for
   /// new private variables.
   /// \param VL List of references.
@@ -1315,11 +1344,13 @@
   /// \param InitVL List of references to auto generated variables used for
   /// initialization of a single array element. Used if firstprivate variable is
   /// of array type.
+  /// \param PreInit Statement that must be executed before entering the OpenMP
+  /// region with this clause.
   ///
   static OMPFirstprivateClause *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
          SourceLocation EndLoc, ArrayRef<Expr *> VL, ArrayRef<Expr *> PrivateVL,
-         ArrayRef<Expr *> InitVL);
+         ArrayRef<Expr *> InitVL, Stmt *PreInit);
   /// \brief Creates an empty clause with the place for \a N variables.
   ///
   /// \param C AST context.
@@ -1374,6 +1405,7 @@
 /// with the variables 'a' and 'b'.
 class OMPLastprivateClause final
     : public OMPVarListClause<OMPLastprivateClause>,
+      public OMPClauseWithPostUpdate,
       private llvm::TrailingObjects<OMPLastprivateClause, Expr *> {
   // There are 4 additional tail-allocated arrays at the end of the class:
   // 1. Contains list of pseudo variables with the default initialization for
@@ -1406,7 +1438,8 @@
   OMPLastprivateClause(SourceLocation StartLoc, SourceLocation LParenLoc,
                        SourceLocation EndLoc, unsigned N)
       : OMPVarListClause<OMPLastprivateClause>(OMPC_lastprivate, StartLoc,
-                                               LParenLoc, EndLoc, N) {}
+                                               LParenLoc, EndLoc, N),
+        OMPClauseWithPostUpdate(this) {}
 
   /// \brief Build an empty clause.
   ///
@@ -1415,7 +1448,8 @@
   explicit OMPLastprivateClause(unsigned N)
       : OMPVarListClause<OMPLastprivateClause>(
             OMPC_lastprivate, SourceLocation(), SourceLocation(),
-            SourceLocation(), N) {}
+            SourceLocation(), N),
+        OMPClauseWithPostUpdate(this) {}
 
   /// \brief Get the list of helper expressions for initialization of private
   /// copies for lastprivate variables.
@@ -1488,12 +1522,16 @@
   /// \endcode
   /// Required for proper codegen of final assignment performed by the
   /// lastprivate clause.
-  ///
+  /// \param PreInit Statement that must be executed before entering the OpenMP
+  /// region with this clause.
+  /// \param PostUpdate Expression that must be executed after exit from the
+  /// OpenMP region with this clause.
   ///
   static OMPLastprivateClause *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
          SourceLocation EndLoc, ArrayRef<Expr *> VL, ArrayRef<Expr *> SrcExprs,
-         ArrayRef<Expr *> DstExprs, ArrayRef<Expr *> AssignmentOps);
+         ArrayRef<Expr *> DstExprs, ArrayRef<Expr *> AssignmentOps,
+         Stmt *PreInit, Expr *PostUpdate);
   /// \brief Creates an empty clause with the place for \a N variables.
   ///
   /// \param C AST context.
@@ -1627,6 +1665,7 @@
 ///
 class OMPReductionClause final
     : public OMPVarListClause<OMPReductionClause>,
+      public OMPClauseWithPostUpdate,
       private llvm::TrailingObjects<OMPReductionClause, Expr *> {
   friend TrailingObjects;
   friend OMPVarListClause;
@@ -1654,7 +1693,8 @@
                      const DeclarationNameInfo &NameInfo)
       : OMPVarListClause<OMPReductionClause>(OMPC_reduction, StartLoc,
                                              LParenLoc, EndLoc, N),
-        ColonLoc(ColonLoc), QualifierLoc(QualifierLoc), NameInfo(NameInfo) {}
+        OMPClauseWithPostUpdate(this), ColonLoc(ColonLoc),
+        QualifierLoc(QualifierLoc), NameInfo(NameInfo) {}
 
   /// \brief Build an empty clause.
   ///
@@ -1664,7 +1704,7 @@
       : OMPVarListClause<OMPReductionClause>(OMPC_reduction, SourceLocation(),
                                              SourceLocation(), SourceLocation(),
                                              N),
-        ColonLoc(), QualifierLoc(), NameInfo() {}
+        OMPClauseWithPostUpdate(this), ColonLoc(), QualifierLoc(), NameInfo() {}
 
   /// \brief Sets location of ':' symbol in clause.
   void setColonLoc(SourceLocation CL) { ColonLoc = CL; }
@@ -1757,6 +1797,10 @@
   /// \endcode
   /// Required for proper codegen of final reduction operation performed by the
   /// reduction clause.
+  /// \param PreInit Statement that must be executed before entering the OpenMP
+  /// region with this clause.
+  /// \param PostUpdate Expression that must be executed after exit from the
+  /// OpenMP region with this clause.
   ///
   static OMPReductionClause *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
@@ -1764,7 +1808,7 @@
          NestedNameSpecifierLoc QualifierLoc,
          const DeclarationNameInfo &NameInfo, ArrayRef<Expr *> Privates,
          ArrayRef<Expr *> LHSExprs, ArrayRef<Expr *> RHSExprs,
-         ArrayRef<Expr *> ReductionOps);
+         ArrayRef<Expr *> ReductionOps, Stmt *PreInit, Expr *PostUpdate);
   /// \brief Creates an empty clause with the place for \a N variables.
   ///
   /// \param C AST context.
@@ -1833,6 +1877,7 @@
 ///
 class OMPLinearClause final
     : public OMPVarListClause<OMPLinearClause>,
+      public OMPClauseWithPostUpdate,
       private llvm::TrailingObjects<OMPLinearClause, Expr *> {
   friend TrailingObjects;
   friend OMPVarListClause;
@@ -1864,7 +1909,8 @@
                   unsigned NumVars)
       : OMPVarListClause<OMPLinearClause>(OMPC_linear, StartLoc, LParenLoc,
                                           EndLoc, NumVars),
-        Modifier(Modifier), ModifierLoc(ModifierLoc), ColonLoc(ColonLoc) {}
+        OMPClauseWithPostUpdate(this), Modifier(Modifier),
+        ModifierLoc(ModifierLoc), ColonLoc(ColonLoc) {}
 
   /// \brief Build an empty clause.
   ///
@@ -1874,7 +1920,8 @@
       : OMPVarListClause<OMPLinearClause>(OMPC_linear, SourceLocation(),
                                           SourceLocation(), SourceLocation(),
                                           NumVars),
-        Modifier(OMPC_LINEAR_val), ModifierLoc(), ColonLoc() {}
+        OMPClauseWithPostUpdate(this), Modifier(OMPC_LINEAR_val), ModifierLoc(),
+        ColonLoc() {}
 
   /// \brief Gets the list of initial values for linear variables.
   ///
@@ -1943,11 +1990,16 @@
   /// \param IL List of initial values for the variables.
   /// \param Step Linear step.
   /// \param CalcStep Calculation of the linear step.
+  /// \param PreInit Statement that must be executed before entering the OpenMP
+  /// region with this clause.
+  /// \param PostUpdate Expression that must be executed after exit from the
+  /// OpenMP region with this clause.
   static OMPLinearClause *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
          OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc,
          SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef<Expr *> VL,
-         ArrayRef<Expr *> PL, ArrayRef<Expr *> IL, Expr *Step, Expr *CalcStep);
+         ArrayRef<Expr *> PL, ArrayRef<Expr *> IL, Expr *Step, Expr *CalcStep,
+         Stmt *PreInit, Expr *PostUpdate);
 
   /// \brief Creates an empty clause with the place for \a NumVars variables.
   ///
@@ -2577,7 +2629,6 @@
   /// \param DepLoc Location of the dependency type.
   /// \param ColonLoc Colon location.
   /// \param VL List of references to the variables.
-  ///
   static OMPDependClause *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
          SourceLocation EndLoc, OpenMPDependClauseKind DepKind,
@@ -2596,6 +2647,14 @@
   /// \brief Get colon location.
   SourceLocation getColonLoc() const { return ColonLoc; }
 
+  /// Set the loop counter value for the depend clauses with 'sink|source' kind
+  /// of dependency. Required for codegen.
+  void setCounterValue(Expr *V);
+  /// Get the loop counter value.
+  Expr *getCounterValue();
+  /// Get the loop counter value.
+  const Expr *getCounterValue() const;
+
   child_range children() {
     return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
                        reinterpret_cast<Stmt **>(varlist_end()));
@@ -2722,6 +2781,495 @@
   }
 };
 
+/// \brief Class that defines common infrastructure to handle mappable
+/// expressions used in OpenMP clauses.
+class OMPClauseMappableExprCommon {
+public:
+  // \brief Class that represents a component of a mappable expression. E.g.
+  // for an expression S.a, the first component is a declaration reference
+  // expression associated with 'S' and the second is a member expression
+  // associated with the field declaration 'a'. If the expression is an array
+  // subscript it may not have any associated declaration. In that case the
+  // associated declaration is set to nullptr.
+  class MappableComponent {
+    // \brief Expression associated with the component.
+    Expr *AssociatedExpression = nullptr;
+    // \brief Declaration associated with the component. If the component does
+    // not have a declaration (e.g. array subscripts or section), this is set to
+    // nullptr.
+    ValueDecl *AssociatedDeclaration = nullptr;
+
+  public:
+    explicit MappableComponent() {}
+    explicit MappableComponent(Expr *AssociatedExpression,
+                               ValueDecl *AssociatedDeclaration)
+        : AssociatedExpression(AssociatedExpression),
+          AssociatedDeclaration(
+              AssociatedDeclaration
+                  ? cast<ValueDecl>(AssociatedDeclaration->getCanonicalDecl())
+                  : nullptr) {}
+
+    Expr *getAssociatedExpression() const { return AssociatedExpression; }
+    ValueDecl *getAssociatedDeclaration() const {
+      return AssociatedDeclaration;
+    }
+  };
+
+  // \brief List of components of an expression. The first one is the whole
+  // expression and the last one is the base expression.
+  typedef SmallVector<MappableComponent, 8> MappableExprComponentList;
+  typedef ArrayRef<MappableComponent> MappableExprComponentListRef;
+
+  // \brief List of all component lists associated to the same base declaration.
+  // E.g. if both 'S.a' and 'S.b' are mappable expressions, each will have
+  // their component list but the same base declaration 'S'.
+  typedef SmallVector<MappableExprComponentList, 8> MappableExprComponentLists;
+  typedef ArrayRef<MappableExprComponentList> MappableExprComponentListsRef;
+
+protected:
+  // \brief Return the total number of elements in a list of component lists.
+  static unsigned
+  getComponentsTotalNumber(MappableExprComponentListsRef ComponentLists);
+
+  // \brief Return the total number of elements in a list of declarations. All
+  // declarations are expected to be canonical.
+  static unsigned
+  getUniqueDeclarationsTotalNumber(ArrayRef<ValueDecl *> Declarations);
+};
+
+/// \brief This represents clauses with a list of expressions that are mappable.
+/// Examples of these clauses are 'map' in
+/// '#pragma omp target [enter|exit] [data]...' directives, and 'to' and 'from'
+/// in '#pragma omp target update...' directives.
+template <class T>
+class OMPMappableExprListClause : public OMPVarListClause<T>,
+                                  public OMPClauseMappableExprCommon {
+  friend class OMPClauseReader;
+
+  /// \brief Number of unique declarations in this clause.
+  unsigned NumUniqueDeclarations;
+
+  /// \brief Number of component lists in this clause.
+  unsigned NumComponentLists;
+
+  /// \brief Total number of components in this clause.
+  unsigned NumComponents;
+
+protected:
+  /// \brief Get the unique declarations that are in the trailing objects of the
+  /// class.
+  MutableArrayRef<ValueDecl *> getUniqueDeclsRef() {
+    return MutableArrayRef<ValueDecl *>(
+        static_cast<T *>(this)->template getTrailingObjects<ValueDecl *>(),
+        NumUniqueDeclarations);
+  }
+
+  /// \brief Get the unique declarations that are in the trailing objects of the
+  /// class.
+  ArrayRef<ValueDecl *> getUniqueDeclsRef() const {
+    return ArrayRef<ValueDecl *>(
+        static_cast<const T *>(this)
+            ->template getTrailingObjects<ValueDecl *>(),
+        NumUniqueDeclarations);
+  }
+
+  /// \brief Set the unique declarations that are in the trailing objects of the
+  /// class.
+  void setUniqueDecls(ArrayRef<ValueDecl *> UDs) {
+    assert(UDs.size() == NumUniqueDeclarations &&
+           "Unexpected amount of unique declarations.");
+    std::copy(UDs.begin(), UDs.end(), getUniqueDeclsRef().begin());
+  }
+
+  /// \brief Get the number of lists per declaration that are in the trailing
+  /// objects of the class.
+  MutableArrayRef<unsigned> getDeclNumListsRef() {
+    return MutableArrayRef<unsigned>(
+        static_cast<T *>(this)->template getTrailingObjects<unsigned>(),
+        NumUniqueDeclarations);
+  }
+
+  /// \brief Get the number of lists per declaration that are in the trailing
+  /// objects of the class.
+  ArrayRef<unsigned> getDeclNumListsRef() const {
+    return ArrayRef<unsigned>(
+        static_cast<const T *>(this)->template getTrailingObjects<unsigned>(),
+        NumUniqueDeclarations);
+  }
+
+  /// \brief Set the number of lists per declaration that are in the trailing
+  /// objects of the class.
+  void setDeclNumLists(ArrayRef<unsigned> DNLs) {
+    assert(DNLs.size() == NumUniqueDeclarations &&
+           "Unexpected amount of list numbers.");
+    std::copy(DNLs.begin(), DNLs.end(), getDeclNumListsRef().begin());
+  }
+
+  /// \brief Get the cumulative component lists sizes that are in the trailing
+  /// objects of the class. They are appended after the number of lists.
+  MutableArrayRef<unsigned> getComponentListSizesRef() {
+    return MutableArrayRef<unsigned>(
+        static_cast<T *>(this)->template getTrailingObjects<unsigned>() +
+            NumUniqueDeclarations,
+        NumComponentLists);
+  }
+
+  /// \brief Get the cumulative component lists sizes that are in the trailing
+  /// objects of the class. They are appended after the number of lists.
+  ArrayRef<unsigned> getComponentListSizesRef() const {
+    return ArrayRef<unsigned>(
+        static_cast<const T *>(this)->template getTrailingObjects<unsigned>() +
+            NumUniqueDeclarations,
+        NumComponentLists);
+  }
+
+  /// \brief Set the cumulative component lists sizes that are in the trailing
+  /// objects of the class.
+  void setComponentListSizes(ArrayRef<unsigned> CLSs) {
+    assert(CLSs.size() == NumComponentLists &&
+           "Unexpected amount of component lists.");
+    std::copy(CLSs.begin(), CLSs.end(), getComponentListSizesRef().begin());
+  }
+
+  /// \brief Get the components that are in the trailing objects of the class.
+  MutableArrayRef<MappableComponent> getComponentsRef() {
+    return MutableArrayRef<MappableComponent>(
+        static_cast<T *>(this)
+            ->template getTrailingObjects<MappableComponent>(),
+        NumComponents);
+  }
+
+  /// \brief Get the components that are in the trailing objects of the class.
+  ArrayRef<MappableComponent> getComponentsRef() const {
+    return ArrayRef<MappableComponent>(
+        static_cast<const T *>(this)
+            ->template getTrailingObjects<MappableComponent>(),
+        NumComponents);
+  }
+
+  /// \brief Set the components that are in the trailing objects of the class.
+  /// This requires the list sizes so that it can also fill the original
+  /// expressions, which are the first component of each list.
+  void setComponents(ArrayRef<MappableComponent> Components,
+                     ArrayRef<unsigned> CLSs) {
+    assert(Components.size() == NumComponents &&
+           "Unexpected amount of component lists.");
+    assert(CLSs.size() == NumComponentLists &&
+           "Unexpected amount of list sizes.");
+    std::copy(Components.begin(), Components.end(), getComponentsRef().begin());
+  }
+
+  /// \brief Fill the clause information from the list of declarations and
+  /// associated component lists.
+  void setClauseInfo(ArrayRef<ValueDecl *> Declarations,
+                     MappableExprComponentListsRef ComponentLists) {
+    // Perform some checks to make sure the data sizes are consistent with the
+    // information available when the clause was created.
+    assert(getUniqueDeclarationsTotalNumber(Declarations) ==
+               NumUniqueDeclarations &&
+           "Unexpected number of mappable expression info entries!");
+    assert(getComponentsTotalNumber(ComponentLists) == NumComponents &&
+           "Unexpected total number of components!");
+    assert(Declarations.size() == ComponentLists.size() &&
+           "Declaration and component lists size is not consistent!");
+    assert(Declarations.size() == NumComponentLists &&
+           "Unexpected declaration and component lists size!");
+
+    // Organize the components by declaration and retrieve the original
+    // expression. Original expressions are always the first component of the
+    // mappable component list.
+    llvm::DenseMap<ValueDecl *, SmallVector<MappableExprComponentListRef, 8>>
+        ComponentListMap;
+    {
+      auto CI = ComponentLists.begin();
+      for (auto DI = Declarations.begin(), DE = Declarations.end(); DI != DE;
+           ++DI, ++CI) {
+        assert(!CI->empty() && "Invalid component list!");
+        ComponentListMap[*DI].push_back(*CI);
+      }
+    }
+
+    // Iterators of the target storage.
+    auto UniqueDeclarations = getUniqueDeclsRef();
+    auto UDI = UniqueDeclarations.begin();
+
+    auto DeclNumLists = getDeclNumListsRef();
+    auto DNLI = DeclNumLists.begin();
+
+    auto ComponentListSizes = getComponentListSizesRef();
+    auto CLSI = ComponentListSizes.begin();
+
+    auto Components = getComponentsRef();
+    auto CI = Components.begin();
+
+    // Variable to compute the accumulation of the number of components.
+    unsigned PrevSize = 0u;
+
+    // Scan all the declarations and associated component lists.
+    for (auto &M : ComponentListMap) {
+      // The declaration.
+      auto *D = M.first;
+      // The component lists.
+      auto CL = M.second;
+
+      // Initialize the entry.
+      *UDI = D;
+      ++UDI;
+
+      *DNLI = CL.size();
+      ++DNLI;
+
+      // Obtain the cumulative sizes and concatenate all the components in the
+      // reserved storage.
+      for (auto C : CL) {
+        // Accumulate with the previous size.
+        PrevSize += C.size();
+
+        // Save the size.
+        *CLSI = PrevSize;
+        ++CLSI;
+
+        // Append components after the current components iterator.
+        CI = std::copy(C.begin(), C.end(), CI);
+      }
+    }
+  }
+
+  /// \brief Build a clause for \a NumUniqueDeclarations declarations, \a
+  /// NumComponentLists total component lists, and \a NumComponents total
+  /// components.
+  ///
+  /// \param K Kind of the clause.
+  /// \param StartLoc Starting location of the clause (the clause keyword).
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  /// \param NumVars Number of expressions listed in the clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause - one
+  /// list for each expression in the clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  OMPMappableExprListClause(OpenMPClauseKind K, SourceLocation StartLoc,
+                            SourceLocation LParenLoc, SourceLocation EndLoc,
+                            unsigned NumVars, unsigned NumUniqueDeclarations,
+                            unsigned NumComponentLists, unsigned NumComponents)
+      : OMPVarListClause<T>(K, StartLoc, LParenLoc, EndLoc, NumVars),
+        NumUniqueDeclarations(NumUniqueDeclarations),
+        NumComponentLists(NumComponentLists), NumComponents(NumComponents) {}
+
+public:
+  /// \brief Return the number of unique base declarations in this clause.
+  unsigned getUniqueDeclarationsNum() const { return NumUniqueDeclarations; }
+  /// \brief Return the number of lists derived from the clause expressions.
+  unsigned getTotalComponentListNum() const { return NumComponentLists; }
+  /// \brief Return the total number of components in all lists derived from the
+  /// clause.
+  unsigned getTotalComponentsNum() const { return NumComponents; }
+
+  /// \brief Iterator that browses the components by lists. It also allows
+  /// browsing components of a single declaration.
+  class const_component_lists_iterator
+      : public llvm::iterator_adaptor_base<
+            const_component_lists_iterator,
+            MappableExprComponentListRef::const_iterator,
+            std::forward_iterator_tag, MappableComponent, ptrdiff_t,
+            MappableComponent, MappableComponent> {
+    // The declaration the iterator currently refers to.
+    ArrayRef<ValueDecl *>::iterator DeclCur;
+
+    // The list number associated with the current declaration.
+    ArrayRef<unsigned>::iterator NumListsCur;
+
+    // Remaining lists for the current declaration.
+    unsigned RemainingLists;
+
+    // The cumulative size of the previous list, or zero if there is no previous
+    // list.
+    unsigned PrevListSize;
+
+    // The cumulative sizes of the current list - it will delimit the remaining
+    // range of interest.
+    ArrayRef<unsigned>::const_iterator ListSizeCur;
+    ArrayRef<unsigned>::const_iterator ListSizeEnd;
+
+    // Iterator to the end of the components storage.
+    MappableExprComponentListRef::const_iterator End;
+
+  public:
+    /// \brief Construct an iterator that scans all lists.
+    explicit const_component_lists_iterator(
+        ArrayRef<ValueDecl *> UniqueDecls, ArrayRef<unsigned> DeclsListNum,
+        ArrayRef<unsigned> CumulativeListSizes,
+        MappableExprComponentListRef Components)
+        : const_component_lists_iterator::iterator_adaptor_base(
+              Components.begin()),
+          DeclCur(UniqueDecls.begin()), NumListsCur(DeclsListNum.begin()),
+          RemainingLists(0u), PrevListSize(0u),
+          ListSizeCur(CumulativeListSizes.begin()),
+          ListSizeEnd(CumulativeListSizes.end()), End(Components.end()) {
+      assert(UniqueDecls.size() == DeclsListNum.size() &&
+             "Inconsistent number of declarations and list sizes!");
+      if (!DeclsListNum.empty())
+        RemainingLists = *NumListsCur;
+    }
+
+    /// \brief Construct an iterator that scans lists for a given declaration \a
+    /// Declaration.
+    explicit const_component_lists_iterator(
+        const ValueDecl *Declaration, ArrayRef<ValueDecl *> UniqueDecls,
+        ArrayRef<unsigned> DeclsListNum, ArrayRef<unsigned> CumulativeListSizes,
+        MappableExprComponentListRef Components)
+        : const_component_lists_iterator(UniqueDecls, DeclsListNum,
+                                         CumulativeListSizes, Components) {
+
+      // Look for the desired declaration. While we are looking for it, we
+      // update the state so that we know the component where a given list
+      // starts.
+      for (; DeclCur != UniqueDecls.end(); ++DeclCur, ++NumListsCur) {
+        if (*DeclCur == Declaration)
+          break;
+
+        assert(*NumListsCur > 0 && "No lists associated with declaration??");
+
+        // Skip the lists associated with the current declaration, but save the
+        // last list size that was skipped.
+        std::advance(ListSizeCur, *NumListsCur - 1);
+        PrevListSize = *ListSizeCur;
+        ++ListSizeCur;
+      }
+
+      // If we didn't find any declaration, advance the iterator to after the
+      // last component and set remaining lists to zero.
+      if (ListSizeCur == CumulativeListSizes.end()) {
+        this->I = End;
+        RemainingLists = 0u;
+        return;
+      }
+
+      // Set the remaining lists with the total number of lists of the current
+      // declaration.
+      RemainingLists = *NumListsCur;
+
+      // Adjust the list size end iterator to the end of the relevant range.
+      ListSizeEnd = ListSizeCur;
+      std::advance(ListSizeEnd, RemainingLists);
+
+      // Given that the list sizes are cumulative, the index of the component
+      // that starts the list is the cumulative size of the previous list.
+      std::advance(this->I, PrevListSize);
+    }
+
+    // Return the array with the current list. The sizes are cumulative, so the
+    // array size is the difference between the current size and previous one.
+    std::pair<const ValueDecl *, MappableExprComponentListRef>
+    operator*() const {
+      assert(ListSizeCur != ListSizeEnd && "Invalid iterator!");
+      return std::make_pair(
+          *DeclCur,
+          MappableExprComponentListRef(&*this->I, *ListSizeCur - PrevListSize));
+    }
+    std::pair<const ValueDecl *, MappableExprComponentListRef>
+    operator->() const {
+      return **this;
+    }
+
+    // Skip the components of the current list.
+    const_component_lists_iterator &operator++() {
+      assert(ListSizeCur != ListSizeEnd && RemainingLists &&
+             "Invalid iterator!");
+
+      // If we don't have more lists just skip all the components. Otherwise,
+      // advance the iterator by the number of components in the current list.
+      if (std::next(ListSizeCur) == ListSizeEnd) {
+        this->I = End;
+        RemainingLists = 0;
+      } else {
+        std::advance(this->I, *ListSizeCur - PrevListSize);
+        PrevListSize = *ListSizeCur;
+
+        // We are done with a declaration, move to the next one.
+        if (!(--RemainingLists)) {
+          ++DeclCur;
+          ++NumListsCur;
+          RemainingLists = *NumListsCur;
+          assert(RemainingLists && "No lists in the following declaration??");
+        }
+      }
+
+      ++ListSizeCur;
+      return *this;
+    }
+  };
+
+  typedef llvm::iterator_range<const_component_lists_iterator>
+      const_component_lists_range;
+
+  /// \brief Iterators for all component lists.
+  const_component_lists_iterator component_lists_begin() const {
+    return const_component_lists_iterator(
+        getUniqueDeclsRef(), getDeclNumListsRef(), getComponentListSizesRef(),
+        getComponentsRef());
+  }
+  const_component_lists_iterator component_lists_end() const {
+    return const_component_lists_iterator(
+        ArrayRef<ValueDecl *>(), ArrayRef<unsigned>(), ArrayRef<unsigned>(),
+        MappableExprComponentListRef(getComponentsRef().end(),
+                                     getComponentsRef().end()));
+  }
+  const_component_lists_range component_lists() const {
+    return {component_lists_begin(), component_lists_end()};
+  }
+
+  /// \brief Iterators for component lists associated with the provided
+  /// declaration.
+  const_component_lists_iterator
+  decl_component_lists_begin(const ValueDecl *VD) const {
+    return const_component_lists_iterator(
+        VD, getUniqueDeclsRef(), getDeclNumListsRef(),
+        getComponentListSizesRef(), getComponentsRef());
+  }
+  const_component_lists_iterator decl_component_lists_end() const {
+    return component_lists_end();
+  }
+  const_component_lists_range decl_component_lists(const ValueDecl *VD) const {
+    return {decl_component_lists_begin(VD), decl_component_lists_end()};
+  }
+
+  /// Iterators to access all the declarations, number of lists, list sizes, and
+  /// components.
+  typedef ArrayRef<ValueDecl *>::iterator const_all_decls_iterator;
+  typedef llvm::iterator_range<const_all_decls_iterator> const_all_decls_range;
+  const_all_decls_range all_decls() const {
+    auto A = getUniqueDeclsRef();
+    return const_all_decls_range(A.begin(), A.end());
+  }
+
+  typedef ArrayRef<unsigned>::iterator const_all_num_lists_iterator;
+  typedef llvm::iterator_range<const_all_num_lists_iterator>
+      const_all_num_lists_range;
+  const_all_num_lists_range all_num_lists() const {
+    auto A = getDeclNumListsRef();
+    return const_all_num_lists_range(A.begin(), A.end());
+  }
+
+  typedef ArrayRef<unsigned>::iterator const_all_lists_sizes_iterator;
+  typedef llvm::iterator_range<const_all_lists_sizes_iterator>
+      const_all_lists_sizes_range;
+  const_all_lists_sizes_range all_lists_sizes() const {
+    auto A = getComponentListSizesRef();
+    return const_all_lists_sizes_range(A.begin(), A.end());
+  }
+
+  typedef ArrayRef<MappableComponent>::iterator const_all_components_iterator;
+  typedef llvm::iterator_range<const_all_components_iterator>
+      const_all_components_range;
+  const_all_components_range all_components() const {
+    auto A = getComponentsRef();
+    return const_all_components_range(A.begin(), A.end());
+  }
+};
+
 /// \brief This represents clause 'map' in the '#pragma omp ...'
 /// directives.
 ///
@@ -2731,16 +3279,33 @@
 /// In this example directive '#pragma omp target' has clause 'map'
 /// with the variables 'a' and 'b'.
 ///
-class OMPMapClause final : public OMPVarListClause<OMPMapClause>,
-                           private llvm::TrailingObjects<OMPMapClause, Expr *> {
+class OMPMapClause final : public OMPMappableExprListClause<OMPMapClause>,
+                           private llvm::TrailingObjects<
+                               OMPMapClause, Expr *, ValueDecl *, unsigned,
+                               OMPClauseMappableExprCommon::MappableComponent> {
   friend TrailingObjects;
   friend OMPVarListClause;
+  friend OMPMappableExprListClause;
   friend class OMPClauseReader;
 
+  /// Define the sizes of each trailing object array except the last one. This
+  /// is required for TrailingObjects to work properly.
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return varlist_size();
+  }
+  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
+    return getUniqueDeclarationsNum();
+  }
+  size_t numTrailingObjects(OverloadToken<unsigned>) const {
+    return getUniqueDeclarationsNum() + getTotalComponentListNum();
+  }
+
   /// \brief Map type modifier for the 'map' clause.
   OpenMPMapClauseKind MapTypeModifier;
   /// \brief Map type for the 'map' clause.
   OpenMPMapClauseKind MapType;
+  /// \brief Is this an implicit map type or not.
+  bool MapTypeIsImplicit;
   /// \brief Location of the map type.
   SourceLocation MapLoc;
   /// \brief Colon location.
@@ -2767,30 +3332,49 @@
   /// \brief Set colon location.
   void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; }
 
-  /// \brief Build clause with number of variables \a N.
+  /// \brief Build a clause for \a NumVars listed expressions, \a
+  /// NumUniqueDeclarations declarations, \a NumComponentLists total component
+  /// lists, and \a NumComponents total expression components.
   ///
   /// \param MapTypeModifier Map type modifier.
   /// \param MapType Map type.
+  /// \param MapTypeIsImplicit Map type is inferred implicitly.
   /// \param MapLoc Location of the map type.
   /// \param StartLoc Starting location of the clause.
   /// \param EndLoc Ending location of the clause.
-  /// \param N Number of the variables in the clause.
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
   ///
   explicit OMPMapClause(OpenMPMapClauseKind MapTypeModifier,
-                        OpenMPMapClauseKind MapType, SourceLocation MapLoc,
-                        SourceLocation StartLoc, SourceLocation LParenLoc,
-                        SourceLocation EndLoc, unsigned N)
-    : OMPVarListClause<OMPMapClause>(OMPC_map, StartLoc, LParenLoc, EndLoc, N),
-      MapTypeModifier(MapTypeModifier), MapType(MapType), MapLoc(MapLoc) {}
+                        OpenMPMapClauseKind MapType, bool MapTypeIsImplicit,
+                        SourceLocation MapLoc, SourceLocation StartLoc,
+                        SourceLocation LParenLoc, SourceLocation EndLoc,
+                        unsigned NumVars, unsigned NumUniqueDeclarations,
+                        unsigned NumComponentLists, unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_map, StartLoc, LParenLoc, EndLoc,
+                                  NumVars, NumUniqueDeclarations,
+                                  NumComponentLists, NumComponents),
+        MapTypeModifier(MapTypeModifier), MapType(MapType),
+        MapTypeIsImplicit(MapTypeIsImplicit), MapLoc(MapLoc) {}
 
   /// \brief Build an empty clause.
   ///
-  /// \param N Number of variables.
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
   ///
-  explicit OMPMapClause(unsigned N)
-      : OMPVarListClause<OMPMapClause>(OMPC_map, SourceLocation(),
-                                       SourceLocation(), SourceLocation(), N),
-        MapTypeModifier(OMPC_MAP_unknown), MapType(OMPC_MAP_unknown), MapLoc() {}
+  explicit OMPMapClause(unsigned NumVars, unsigned NumUniqueDeclarations,
+                        unsigned NumComponentLists, unsigned NumComponents)
+      : OMPMappableExprListClause(
+            OMPC_map, SourceLocation(), SourceLocation(), SourceLocation(),
+            NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents),
+        MapTypeModifier(OMPC_MAP_unknown), MapType(OMPC_MAP_unknown),
+        MapTypeIsImplicit(false), MapLoc() {}
 
 public:
   /// \brief Creates clause with a list of variables \a VL.
@@ -2798,26 +3382,49 @@
   /// \param C AST context.
   /// \param StartLoc Starting location of the clause.
   /// \param EndLoc Ending location of the clause.
-  /// \param VL List of references to the variables.
+  /// \param Vars The original expression used in the clause.
+  /// \param Declarations Declarations used in the clause.
+  /// \param ComponentLists Component lists used in the clause.
   /// \param TypeModifier Map type modifier.
   /// \param Type Map type.
+  /// \param TypeIsImplicit Map type is inferred implicitly.
   /// \param TypeLoc Location of the map type.
   ///
   static OMPMapClause *Create(const ASTContext &C, SourceLocation StartLoc,
-                              SourceLocation LParenLoc,
-                              SourceLocation EndLoc, ArrayRef<Expr *> VL,
+                              SourceLocation LParenLoc, SourceLocation EndLoc,
+                              ArrayRef<Expr *> Vars,
+                              ArrayRef<ValueDecl *> Declarations,
+                              MappableExprComponentListsRef ComponentLists,
                               OpenMPMapClauseKind TypeModifier,
-                              OpenMPMapClauseKind Type, SourceLocation TypeLoc);
-  /// \brief Creates an empty clause with the place for \a N variables.
+                              OpenMPMapClauseKind Type, bool TypeIsImplicit,
+                              SourceLocation TypeLoc);
+  /// \brief Creates an empty clause with the place for \a NumVars original
+  /// expressions, \a NumUniqueDeclarations declarations, \a NumComponentLists
+  /// lists, and \a NumComponents expression components.
   ///
   /// \param C AST context.
-  /// \param N The number of variables.
+  /// \param NumVars Number of expressions listed in the clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this
+  /// clause.
+  /// \param NumComponents Total number of expression components in the clause.
   ///
-  static OMPMapClause *CreateEmpty(const ASTContext &C, unsigned N);
+  static OMPMapClause *CreateEmpty(const ASTContext &C, unsigned NumVars,
+                                   unsigned NumUniqueDeclarations,
+                                   unsigned NumComponentLists,
+                                   unsigned NumComponents);
 
   /// \brief Fetches mapping kind for the clause.
   OpenMPMapClauseKind getMapType() const LLVM_READONLY { return MapType; }
 
+  /// \brief Is this an implicit map type?
+  /// We have to capture 'MapTypeIsImplicit' from the parser for more
+  /// informative error messages.  It helps distinguish map(r) from
+  /// map(tofrom: r), which is important to print more helpful error
+  /// messages for some target directives.
+  bool isImplicitMapType() const LLVM_READONLY { return MapTypeIsImplicit; }
+
   /// \brief Fetches the map type modifier for the clause.
   OpenMPMapClauseKind getMapTypeModifier() const LLVM_READONLY {
     return MapTypeModifier;
@@ -3202,7 +3809,7 @@
 /// In this example directive '#pragma omp distribute' has 'dist_schedule'
 /// clause with arguments 'static' and '3'.
 ///
-class OMPDistScheduleClause : public OMPClause {
+class OMPDistScheduleClause : public OMPClause, public OMPClauseWithPreInit {
   friend class OMPClauseReader;
   /// \brief Location of '('.
   SourceLocation LParenLoc;
@@ -3212,10 +3819,8 @@
   SourceLocation KindLoc;
   /// \brief Location of ',' (if any).
   SourceLocation CommaLoc;
-  /// \brief Chunk size and a reference to pseudo variable for combined
-  /// directives.
-  enum { CHUNK_SIZE, HELPER_CHUNK_SIZE, NUM_EXPRS };
-  Stmt *ChunkSizes[NUM_EXPRS];
+  /// \brief Chunk size.
+  Expr *ChunkSize;
 
   /// \brief Set schedule kind.
   ///
@@ -3241,12 +3846,7 @@
   ///
   /// \param E Chunk size.
   ///
-  void setChunkSize(Expr *E) { ChunkSizes[CHUNK_SIZE] = E; }
-  /// \brief Set helper chunk size.
-  ///
-  /// \param E Helper chunk size.
-  ///
-  void setHelperChunkSize(Expr *E) { ChunkSizes[HELPER_CHUNK_SIZE] = E; }
+  void setChunkSize(Expr *E) { ChunkSize = E; }
 
 public:
   /// \brief Build 'dist_schedule' clause with schedule kind \a Kind and chunk
@@ -3265,21 +3865,19 @@
                         SourceLocation KLoc, SourceLocation CommaLoc,
                         SourceLocation EndLoc,
                         OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize,
-                        Expr *HelperChunkSize)
-      : OMPClause(OMPC_dist_schedule, StartLoc, EndLoc), LParenLoc(LParenLoc),
-        Kind(Kind), KindLoc(KLoc), CommaLoc(CommaLoc) {
-    ChunkSizes[CHUNK_SIZE] = ChunkSize;
-    ChunkSizes[HELPER_CHUNK_SIZE] = HelperChunkSize;
+                        Stmt *HelperChunkSize)
+      : OMPClause(OMPC_dist_schedule, StartLoc, EndLoc),
+        OMPClauseWithPreInit(this), LParenLoc(LParenLoc), Kind(Kind),
+        KindLoc(KLoc), CommaLoc(CommaLoc), ChunkSize(ChunkSize) {
+    setPreInitStmt(HelperChunkSize);
   }
 
   /// \brief Build an empty clause.
   ///
   explicit OMPDistScheduleClause()
       : OMPClause(OMPC_dist_schedule, SourceLocation(), SourceLocation()),
-        Kind(OMPC_DIST_SCHEDULE_unknown) {
-    ChunkSizes[CHUNK_SIZE] = nullptr;
-    ChunkSizes[HELPER_CHUNK_SIZE] = nullptr;
-  }
+        OMPClauseWithPreInit(this), Kind(OMPC_DIST_SCHEDULE_unknown),
+        ChunkSize(nullptr) {}
 
   /// \brief Get kind of the clause.
   ///
@@ -3295,31 +3893,605 @@
   SourceLocation getCommaLoc() { return CommaLoc; }
   /// \brief Get chunk size.
   ///
-  Expr *getChunkSize() {
-    return dyn_cast_or_null<Expr>(ChunkSizes[CHUNK_SIZE]);
-  }
+  Expr *getChunkSize() { return ChunkSize; }
   /// \brief Get chunk size.
   ///
-  Expr *getChunkSize() const {
-    return dyn_cast_or_null<Expr>(ChunkSizes[CHUNK_SIZE]);
-  }
-  /// \brief Get helper chunk size.
-  ///
-  Expr *getHelperChunkSize() {
-    return dyn_cast_or_null<Expr>(ChunkSizes[HELPER_CHUNK_SIZE]);
-  }
-  /// \brief Get helper chunk size.
-  ///
-  Expr *getHelperChunkSize() const {
-    return dyn_cast_or_null<Expr>(ChunkSizes[HELPER_CHUNK_SIZE]);
-  }
+  const Expr *getChunkSize() const { return ChunkSize; }
 
   static bool classof(const OMPClause *T) {
     return T->getClauseKind() == OMPC_dist_schedule;
   }
 
   child_range children() {
-    return child_range(&ChunkSizes[CHUNK_SIZE], &ChunkSizes[CHUNK_SIZE] + 1);
+    return child_range(reinterpret_cast<Stmt **>(&ChunkSize),
+                       reinterpret_cast<Stmt **>(&ChunkSize) + 1);
+  }
+};
+
+/// \brief This represents 'defaultmap' clause in the '#pragma omp ...' directive.
+///
+/// \code
+/// #pragma omp target defaultmap(tofrom: scalar)
+/// \endcode
+/// In this example directive '#pragma omp target' has 'defaultmap' clause of kind
+/// 'scalar' with modifier 'tofrom'.
+///
+class OMPDefaultmapClause : public OMPClause {
+  friend class OMPClauseReader;
+  /// \brief Location of '('.
+  SourceLocation LParenLoc;
+  /// \brief Modifiers for 'defaultmap' clause.
+  OpenMPDefaultmapClauseModifier Modifier;
+  /// \brief Locations of modifiers.
+  SourceLocation ModifierLoc;
+  /// \brief A kind of the 'defaultmap' clause.
+  OpenMPDefaultmapClauseKind Kind;
+  /// \brief Start location of the defaultmap kind in source code.
+  SourceLocation KindLoc;
+
+  /// \brief Set defaultmap kind.
+  ///
+  /// \param K Defaultmap kind.
+  ///
+  void setDefaultmapKind(OpenMPDefaultmapClauseKind K) { Kind = K; }
+  /// \brief Set the defaultmap modifier.
+  ///
+  /// \param M Defaultmap modifier.
+  ///
+  void setDefaultmapModifier(OpenMPDefaultmapClauseModifier M) {
+    Modifier = M;
+  }
+  /// \brief Set location of the defaultmap modifier.
+  ///
+  void setDefaultmapModifierLoc(SourceLocation Loc) {
+    ModifierLoc = Loc;
+  }
+  /// \brief Sets the location of '('.
+  ///
+  /// \param Loc Location of '('.
+  ///
+  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+  /// \brief Set defaultmap kind start location.
+  ///
+  /// \param KLoc Defaultmap kind location.
+  ///
+  void setDefaultmapKindLoc(SourceLocation KLoc) { KindLoc = KLoc; }
+
+public:
+  /// \brief Build 'defaultmap' clause with defaultmap kind \a Kind
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param KLoc Starting location of the argument.
+  /// \param EndLoc Ending location of the clause.
+  /// \param Kind Defaultmap kind.
+  /// \param M The modifier applied to 'defaultmap' clause.
+  /// \param MLoc Location of the modifier
+  ///
+  OMPDefaultmapClause(SourceLocation StartLoc, SourceLocation LParenLoc,
+                      SourceLocation MLoc, SourceLocation KLoc,
+                      SourceLocation EndLoc, OpenMPDefaultmapClauseKind Kind,
+                      OpenMPDefaultmapClauseModifier M)
+      : OMPClause(OMPC_defaultmap, StartLoc, EndLoc), LParenLoc(LParenLoc),
+        Modifier(M), ModifierLoc(MLoc), Kind(Kind), KindLoc(KLoc) {}
+
+  /// \brief Build an empty clause.
+  ///
+  explicit OMPDefaultmapClause()
+      : OMPClause(OMPC_defaultmap, SourceLocation(), SourceLocation()),
+        Modifier(OMPC_DEFAULTMAP_MODIFIER_unknown),
+        Kind(OMPC_DEFAULTMAP_unknown) {}
+
+  /// \brief Get kind of the clause.
+  ///
+  OpenMPDefaultmapClauseKind getDefaultmapKind() const { return Kind; }
+  /// \brief Get the modifier of the clause.
+  ///
+  OpenMPDefaultmapClauseModifier getDefaultmapModifier() const {
+    return Modifier;
+  }
+  /// \brief Get location of '('.
+  ///
+  SourceLocation getLParenLoc() { return LParenLoc; }
+  /// \brief Get kind location.
+  ///
+  SourceLocation getDefaultmapKindLoc() { return KindLoc; }
+  /// \brief Get the modifier location.
+  ///
+  SourceLocation getDefaultmapModifierLoc() const {
+    return ModifierLoc;
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == OMPC_defaultmap;
+  }
+
+  child_range children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+};
+
+/// \brief This represents clause 'to' in the '#pragma omp ...'
+/// directives.
+///
+/// \code
+/// #pragma omp target update to(a,b)
+/// \endcode
+/// In this example directive '#pragma omp target update' has clause 'to'
+/// with the variables 'a' and 'b'.
+///
+class OMPToClause final : public OMPMappableExprListClause<OMPToClause>,
+                          private llvm::TrailingObjects<
+                              OMPToClause, Expr *, ValueDecl *, unsigned,
+                              OMPClauseMappableExprCommon::MappableComponent> {
+  friend TrailingObjects;
+  friend OMPVarListClause;
+  friend OMPMappableExprListClause;
+  friend class OMPClauseReader;
+
+  /// Define the sizes of each trailing object array except the last one. This
+  /// is required for TrailingObjects to work properly.
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return varlist_size();
+  }
+  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
+    return getUniqueDeclarationsNum();
+  }
+  size_t numTrailingObjects(OverloadToken<unsigned>) const {
+    return getUniqueDeclarationsNum() + getTotalComponentListNum();
+  }
+
+  /// \brief Build clause with number of variables \a NumVars.
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param EndLoc Ending location of the clause.
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPToClause(SourceLocation StartLoc, SourceLocation LParenLoc,
+                       SourceLocation EndLoc, unsigned NumVars,
+                       unsigned NumUniqueDeclarations,
+                       unsigned NumComponentLists, unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_to, StartLoc, LParenLoc, EndLoc, NumVars,
+                                  NumUniqueDeclarations, NumComponentLists,
+                                  NumComponents) {}
+
+  /// \brief Build an empty clause.
+  ///
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPToClause(unsigned NumVars, unsigned NumUniqueDeclarations,
+                       unsigned NumComponentLists, unsigned NumComponents)
+      : OMPMappableExprListClause(
+            OMPC_to, SourceLocation(), SourceLocation(), SourceLocation(),
+            NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents) {}
+
+public:
+  /// \brief Creates clause with a list of variables \a Vars.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the clause.
+  /// \param EndLoc Ending location of the clause.
+  /// \param Vars The original expression used in the clause.
+  /// \param Declarations Declarations used in the clause.
+  /// \param ComponentLists Component lists used in the clause.
+  ///
+  static OMPToClause *Create(const ASTContext &C, SourceLocation StartLoc,
+                             SourceLocation LParenLoc, SourceLocation EndLoc,
+                             ArrayRef<Expr *> Vars,
+                             ArrayRef<ValueDecl *> Declarations,
+                             MappableExprComponentListsRef ComponentLists);
+
+  /// \brief Creates an empty clause with the place for \a NumVars variables.
+  ///
+  /// \param C AST context.
+  /// \param NumVars Number of expressions listed in the clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this
+  /// clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  static OMPToClause *CreateEmpty(const ASTContext &C, unsigned NumVars,
+                                  unsigned NumUniqueDeclarations,
+                                  unsigned NumComponentLists,
+                                  unsigned NumComponents);
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == OMPC_to;
+  }
+
+  child_range children() {
+    return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
+                       reinterpret_cast<Stmt **>(varlist_end()));
+  }
+};
+
+/// \brief This represents clause 'from' in the '#pragma omp ...'
+/// directives.
+///
+/// \code
+/// #pragma omp target update from(a,b)
+/// \endcode
+/// In this example directive '#pragma omp target update' has clause 'from'
+/// with the variables 'a' and 'b'.
+///
+class OMPFromClause final
+    : public OMPMappableExprListClause<OMPFromClause>,
+      private llvm::TrailingObjects<
+          OMPFromClause, Expr *, ValueDecl *, unsigned,
+          OMPClauseMappableExprCommon::MappableComponent> {
+  friend TrailingObjects;
+  friend OMPVarListClause;
+  friend OMPMappableExprListClause;
+  friend class OMPClauseReader;
+
+  /// Define the sizes of each trailing object array except the last one. This
+  /// is required for TrailingObjects to work properly.
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return varlist_size();
+  }
+  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
+    return getUniqueDeclarationsNum();
+  }
+  size_t numTrailingObjects(OverloadToken<unsigned>) const {
+    return getUniqueDeclarationsNum() + getTotalComponentListNum();
+  }
+
+  /// \brief Build clause with number of variables \a NumVars.
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param EndLoc Ending location of the clause.
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPFromClause(SourceLocation StartLoc, SourceLocation LParenLoc,
+                         SourceLocation EndLoc, unsigned NumVars,
+                         unsigned NumUniqueDeclarations,
+                         unsigned NumComponentLists, unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_from, StartLoc, LParenLoc, EndLoc,
+                                  NumVars, NumUniqueDeclarations,
+                                  NumComponentLists, NumComponents) {}
+
+  /// \brief Build an empty clause.
+  ///
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPFromClause(unsigned NumVars, unsigned NumUniqueDeclarations,
+                         unsigned NumComponentLists, unsigned NumComponents)
+      : OMPMappableExprListClause(
+            OMPC_from, SourceLocation(), SourceLocation(), SourceLocation(),
+            NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents) {}
+
+public:
+  /// \brief Creates clause with a list of variables \a Vars.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the clause.
+  /// \param EndLoc Ending location of the clause.
+  /// \param Vars The original expression used in the clause.
+  /// \param Declarations Declarations used in the clause.
+  /// \param ComponentLists Component lists used in the clause.
+  ///
+  static OMPFromClause *Create(const ASTContext &C, SourceLocation StartLoc,
+                               SourceLocation LParenLoc, SourceLocation EndLoc,
+                               ArrayRef<Expr *> Vars,
+                               ArrayRef<ValueDecl *> Declarations,
+                               MappableExprComponentListsRef ComponentLists);
+
+  /// \brief Creates an empty clause with the place for \a NumVars variables.
+  ///
+  /// \param C AST context.
+  /// \param NumVars Number of expressions listed in the clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this
+  /// clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  static OMPFromClause *CreateEmpty(const ASTContext &C, unsigned NumVars,
+                                    unsigned NumUniqueDeclarations,
+                                    unsigned NumComponentLists,
+                                    unsigned NumComponents);
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == OMPC_from;
+  }
+
+  child_range children() {
+    return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
+                       reinterpret_cast<Stmt **>(varlist_end()));
+  }
+};
+
+/// This represents clause 'use_device_ptr' in the '#pragma omp ...'
+/// directives.
+///
+/// \code
+/// #pragma omp target data use_device_ptr(a,b)
+/// \endcode
+/// In this example directive '#pragma omp target data' has clause
+/// 'use_device_ptr' with the variables 'a' and 'b'.
+///
+class OMPUseDevicePtrClause final
+    : public OMPMappableExprListClause<OMPUseDevicePtrClause>,
+      private llvm::TrailingObjects<
+          OMPUseDevicePtrClause, Expr *, ValueDecl *, unsigned,
+          OMPClauseMappableExprCommon::MappableComponent> {
+  friend TrailingObjects;
+  friend OMPVarListClause;
+  friend OMPMappableExprListClause;
+  friend class OMPClauseReader;
+
+  /// Define the sizes of each trailing object array except the last one. This
+  /// is required for TrailingObjects to work properly.
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return 3 * varlist_size();
+  }
+  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
+    return getUniqueDeclarationsNum();
+  }
+  size_t numTrailingObjects(OverloadToken<unsigned>) const {
+    return getUniqueDeclarationsNum() + getTotalComponentListNum();
+  }
+
+  /// Build clause with number of variables \a NumVars.
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param EndLoc Ending location of the clause.
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPUseDevicePtrClause(SourceLocation StartLoc,
+                                 SourceLocation LParenLoc,
+                                 SourceLocation EndLoc, unsigned NumVars,
+                                 unsigned NumUniqueDeclarations,
+                                 unsigned NumComponentLists,
+                                 unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_use_device_ptr, StartLoc, LParenLoc,
+                                  EndLoc, NumVars, NumUniqueDeclarations,
+                                  NumComponentLists, NumComponents) {}
+
+  /// Build an empty clause; all source locations are default-constructed.
+  ///
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPUseDevicePtrClause(unsigned NumVars,
+                                 unsigned NumUniqueDeclarations,
+                                 unsigned NumComponentLists,
+                                 unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_use_device_ptr, SourceLocation(),
+                                  SourceLocation(), SourceLocation(), NumVars,
+                                  NumUniqueDeclarations, NumComponentLists,
+                                  NumComponents) {}
+
+  /// Sets the list of references to private copies with initializers for new
+  /// private variables.
+  /// \param VL List of references.
+  void setPrivateCopies(ArrayRef<Expr *> VL);
+
+  /// Gets the private copies: varlist_size() expressions stored right after
+  /// the variable list, i.e. starting at varlist_end().
+  MutableArrayRef<Expr *> getPrivateCopies() {
+    return MutableArrayRef<Expr *>(varlist_end(), varlist_size());
+  }
+  ArrayRef<const Expr *> getPrivateCopies() const {
+    return llvm::makeArrayRef(varlist_end(), varlist_size());
+  }
+
+  /// Sets the list of references to initializer variables for new private
+  /// variables.
+  /// \param VL List of references.
+  void setInits(ArrayRef<Expr *> VL);
+
+  /// Gets the initializers: varlist_size() expressions stored right after the
+  /// private copies, i.e. starting at getPrivateCopies().end().
+  MutableArrayRef<Expr *> getInits() {
+    return MutableArrayRef<Expr *>(getPrivateCopies().end(), varlist_size());
+  }
+  ArrayRef<const Expr *> getInits() const {
+    return llvm::makeArrayRef(getPrivateCopies().end(), varlist_size());
+  }
+
+public:
+  /// Creates a clause with a list of variables \a Vars.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  /// \param Vars The original expressions used in the clause.
+  /// \param PrivateVars Expressions referring to private copies.
+  /// \param Inits Expressions referring to private copy initializers.
+  /// \param Declarations Declarations used in the clause.
+  /// \param ComponentLists Component lists used in the clause.
+  static OMPUseDevicePtrClause *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+         SourceLocation EndLoc, ArrayRef<Expr *> Vars,
+         ArrayRef<Expr *> PrivateVars, ArrayRef<Expr *> Inits,
+         ArrayRef<ValueDecl *> Declarations,
+         MappableExprComponentListsRef ComponentLists);
+
+  /// Creates an empty clause with the place for \a NumVars variables.
+  ///
+  /// \param C AST context.
+  /// \param NumVars Number of expressions listed in the clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this
+  /// clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  static OMPUseDevicePtrClause *CreateEmpty(const ASTContext &C,
+                                            unsigned NumVars,
+                                            unsigned NumUniqueDeclarations,
+                                            unsigned NumComponentLists,
+                                            unsigned NumComponents);
+
+  using private_copies_iterator = MutableArrayRef<Expr *>::iterator;
+  using private_copies_const_iterator = ArrayRef<const Expr *>::iterator;
+  using private_copies_range = llvm::iterator_range<private_copies_iterator>;
+  using private_copies_const_range =
+      llvm::iterator_range<private_copies_const_iterator>;
+  /// Range over the private copies returned by getPrivateCopies().
+  private_copies_range private_copies() {
+    MutableArrayRef<Expr *> Copies = getPrivateCopies();
+    return private_copies_range(Copies.begin(), Copies.end());
+  }
+  private_copies_const_range private_copies() const {
+    ArrayRef<const Expr *> Copies = getPrivateCopies();
+    return private_copies_const_range(Copies.begin(), Copies.end());
+  }
+
+  using inits_iterator = MutableArrayRef<Expr *>::iterator;
+  using inits_const_iterator = ArrayRef<const Expr *>::iterator;
+  using inits_range = llvm::iterator_range<inits_iterator>;
+  using inits_const_range = llvm::iterator_range<inits_const_iterator>;
+  /// Range over the initializers returned by getInits().
+  inits_range inits() {
+    return {getInits().begin(), getInits().end()};
+  }
+  inits_const_range inits() const {
+    return {getInits().begin(), getInits().end()};
+  }
+
+  child_range children() { // Iterates the variable list as Stmt * entries.
+    Stmt **First = reinterpret_cast<Stmt **>(varlist_begin());
+    return child_range(First, reinterpret_cast<Stmt **>(varlist_end()));
+  }
+
+  static bool classof(const OMPClause *T) { // Matches by clause kind only.
+    return T->getClauseKind() == OMPC_use_device_ptr;
+  }
+};
+
+/// This represents clause 'is_device_ptr' in the '#pragma omp ...'
+/// directives.
+///
+/// \code
+/// #pragma omp target is_device_ptr(a,b)
+/// \endcode
+/// In this example directive '#pragma omp target' has clause
+/// 'is_device_ptr' with the variables 'a' and 'b'.
+///
+class OMPIsDevicePtrClause final
+    : public OMPMappableExprListClause<OMPIsDevicePtrClause>,
+      private llvm::TrailingObjects<
+          OMPIsDevicePtrClause, Expr *, ValueDecl *, unsigned,
+          OMPClauseMappableExprCommon::MappableComponent> {
+  friend TrailingObjects;
+  friend OMPVarListClause;
+  friend OMPMappableExprListClause;
+  friend class OMPClauseReader;
+
+  /// Define the size of each trailing object array except the last one. This
+  /// is required for TrailingObjects to work properly.
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return varlist_size(); // Size of the Expr * array.
+  }
+  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
+    return getUniqueDeclarationsNum(); // Size of the ValueDecl * array.
+  }
+  size_t numTrailingObjects(OverloadToken<unsigned>) const {
+    return getUniqueDeclarationsNum() + getTotalComponentListNum();
+  }
+  /// Build clause with number of variables \a NumVars.
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPIsDevicePtrClause(SourceLocation StartLoc,
+                                SourceLocation LParenLoc, SourceLocation EndLoc,
+                                unsigned NumVars,
+                                unsigned NumUniqueDeclarations,
+                                unsigned NumComponentLists,
+                                unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_is_device_ptr, StartLoc, LParenLoc,
+                                  EndLoc, NumVars, NumUniqueDeclarations,
+                                  NumComponentLists, NumComponents) {}
+
+  /// Build an empty clause; all source locations are default-constructed.
+  ///
+  /// \param NumVars Number of expressions listed in this clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  explicit OMPIsDevicePtrClause(unsigned NumVars,
+                                unsigned NumUniqueDeclarations,
+                                unsigned NumComponentLists,
+                                unsigned NumComponents)
+      : OMPMappableExprListClause(OMPC_is_device_ptr, SourceLocation(),
+                                  SourceLocation(), SourceLocation(), NumVars,
+                                  NumUniqueDeclarations, NumComponentLists,
+                                  NumComponents) {}
+
+public:
+  /// Creates a clause with a list of variables \a Vars.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  /// \param Vars The original expressions used in the clause.
+  /// \param Declarations Declarations used in the clause.
+  /// \param ComponentLists Component lists used in the clause.
+  static OMPIsDevicePtrClause *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+         SourceLocation EndLoc, ArrayRef<Expr *> Vars,
+         ArrayRef<ValueDecl *> Declarations,
+         MappableExprComponentListsRef ComponentLists);
+
+  /// Creates an empty clause with the place for \a NumVars variables.
+  ///
+  /// \param C AST context.
+  /// \param NumVars Number of expressions listed in the clause.
+  /// \param NumUniqueDeclarations Number of unique base declarations in this
+  /// clause.
+  /// \param NumComponentLists Number of component lists in this
+  /// clause.
+  /// \param NumComponents Total number of expression components in the clause.
+  ///
+  static OMPIsDevicePtrClause *CreateEmpty(const ASTContext &C,
+                                           unsigned NumVars,
+                                           unsigned NumUniqueDeclarations,
+                                           unsigned NumComponentLists,
+                                           unsigned NumComponents);
+
+  child_range children() { // Iterates the variable list as Stmt * entries.
+    return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
+                       reinterpret_cast<Stmt **>(varlist_end()));
+  }
+
+  static bool classof(const OMPClause *T) { // Matches by clause kind only.
+    return T->getClauseKind() == OMPC_is_device_ptr;
   }
 };
 } // end namespace clang
diff --git a/include/clang/AST/OperationKinds.def b/include/clang/AST/OperationKinds.def
new file mode 100644
index 0000000..03a61e9
--- /dev/null
+++ b/include/clang/AST/OperationKinds.def
@@ -0,0 +1,408 @@
+//===--- OperationKinds.def - Operations Database ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file enumerates the different kinds of operations that can be
+// performed by various expressions.
+//
+//===----------------------------------------------------------------------===//
+//
+/// @file OperationKinds.def
+///
+/// In this file, each of the C/C++ operations is enumerated using the
+/// CAST_OPERATION, BINARY_OPERATION or UNARY_OPERATION macro, each of which can
+/// be specified by the code including this file.
+///
+/// Macros have one or two arguments:
+///
+/// Name: The name of the operation. Name (prefixed with CK_, UO_ or BO_) will
+/// be the name of the corresponding enumerator (see OperationKinds.h).
+///
+/// Spelling: A string that provides a canonical spelling for the operation.
+
+#ifndef CAST_OPERATION
+#  define CAST_OPERATION(Name)
+#endif
+
+#ifndef BINARY_OPERATION
+#  define BINARY_OPERATION(Name, Spelling)
+#endif
+
+#ifndef UNARY_OPERATION
+#  define UNARY_OPERATION(Name, Spelling)
+#endif
+
+//===- Cast Operations  ---------------------------------------------------===//
+
+/// CK_Dependent - A conversion which cannot yet be analyzed because
+/// either the expression or target type is dependent.  These are
+/// created only for explicit casts; dependent ASTs aren't required
+/// to even approximately type-check.
+///   (T*) malloc(sizeof(T))
+///   reinterpret_cast<intptr_t>(A<T>::alloc());
+CAST_OPERATION(Dependent)
+
+/// CK_BitCast - A conversion which causes a bit pattern of one type
+/// to be reinterpreted as a bit pattern of another type.  Generally
+/// the operands must have equivalent size and unrelated types.
+///
+/// The pointer conversion char* -> int* is a bitcast.  A conversion
+/// from any pointer type to a C pointer type is a bitcast unless
+/// it's actually BaseToDerived or DerivedToBase.  A conversion to a
+/// block pointer or ObjC pointer type is a bitcast only if the
+/// operand has the same type kind; otherwise, it's one of the
+/// specialized casts below.
+///
+/// Vector coercions are bitcasts.
+CAST_OPERATION(BitCast)
+
+/// CK_LValueBitCast - A conversion which reinterprets the address of
+/// an l-value as an l-value of a different kind.  Used for
+/// reinterpret_casts of l-value expressions to reference types.
+///    bool b; reinterpret_cast<char&>(b) = 'a';
+CAST_OPERATION(LValueBitCast)
+
+/// CK_LValueToRValue - A conversion which causes the extraction of
+/// an r-value from the operand gl-value.  The result of an r-value
+/// conversion is always unqualified.
+CAST_OPERATION(LValueToRValue)
+
+/// CK_NoOp - A conversion which does not affect the type other than
+/// (possibly) adding qualifiers.
+///   int    -> int
+///   char** -> const char * const *
+CAST_OPERATION(NoOp)
+
+/// CK_BaseToDerived - A conversion from a C++ class pointer/reference
+/// to a derived class pointer/reference.
+///   B *b = static_cast<B*>(a);
+CAST_OPERATION(BaseToDerived)
+
+/// CK_DerivedToBase - A conversion from a C++ class pointer
+/// to a base class pointer.
+///   A *a = new B();
+CAST_OPERATION(DerivedToBase)
+
+/// CK_UncheckedDerivedToBase - A conversion from a C++ class
+/// pointer/reference to a base class that can assume that the
+/// derived pointer is not null.
+///   const A &a = B();
+///   b->method_from_a();
+CAST_OPERATION(UncheckedDerivedToBase)
+
+/// CK_Dynamic - A C++ dynamic_cast.
+CAST_OPERATION(Dynamic)
+
+/// CK_ToUnion - The GCC cast-to-union extension.
+///   int   -> union { int x; float y; }
+///   float -> union { int x; float y; }
+CAST_OPERATION(ToUnion)
+
+/// CK_ArrayToPointerDecay - Array to pointer decay.
+///   int[10] -> int*
+///   char[5][6] -> char(*)[6]
+CAST_OPERATION(ArrayToPointerDecay)
+
+/// CK_FunctionToPointerDecay - Function to pointer decay.
+///   void(int) -> void(*)(int)
+CAST_OPERATION(FunctionToPointerDecay)
+
+/// CK_NullToPointer - Null pointer constant to pointer, ObjC
+/// pointer, or block pointer.
+///   (void*) 0
+///   void (^block)() = 0;
+CAST_OPERATION(NullToPointer)
+
+/// CK_NullToMemberPointer - Null pointer constant to member pointer.
+///   int A::*mptr = 0;
+///   int (A::*fptr)(int) = nullptr;
+CAST_OPERATION(NullToMemberPointer)
+
+/// CK_BaseToDerivedMemberPointer - Member pointer in base class to
+/// member pointer in derived class.
+///   int B::*mptr = &A::member;
+CAST_OPERATION(BaseToDerivedMemberPointer)
+
+/// CK_DerivedToBaseMemberPointer - Member pointer in derived class to
+/// member pointer in base class.
+///   int A::*mptr = static_cast<int A::*>(&B::member);
+CAST_OPERATION(DerivedToBaseMemberPointer)
+
+/// CK_MemberPointerToBoolean - Member pointer to boolean.  A check
+/// against the null member pointer.
+CAST_OPERATION(MemberPointerToBoolean)
+
+/// CK_ReinterpretMemberPointer - Reinterpret a member pointer as a
+/// different kind of member pointer.  C++ forbids this from
+/// crossing between function and object types, but otherwise does
+/// not restrict it.  However, the only operation that is permitted
+/// on a "punned" member pointer is casting it back to the original
+/// type, which is required to be a lossless operation (although
+/// many ABIs do not guarantee this on all possible intermediate types).
+CAST_OPERATION(ReinterpretMemberPointer)
+
+/// CK_UserDefinedConversion - Conversion using a user defined type
+/// conversion function.
+///    struct A { operator int(); }; int i = int(A());
+CAST_OPERATION(UserDefinedConversion)
+
+/// CK_ConstructorConversion - Conversion by constructor.
+///    struct A { A(int); }; A a = A(10);
+CAST_OPERATION(ConstructorConversion)
+
+/// CK_IntegralToPointer - Integral to pointer.  A special kind of
+/// reinterpreting conversion.  Applies to normal, ObjC, and block
+/// pointers.
+///    (char*) 0x1001aab0
+///    reinterpret_cast<int*>(0)
+CAST_OPERATION(IntegralToPointer)
+
+/// CK_PointerToIntegral - Pointer to integral.  A special kind of
+/// reinterpreting conversion.  Applies to normal, ObjC, and block
+/// pointers.
+///    (intptr_t) "help!"
+CAST_OPERATION(PointerToIntegral)
+
+/// CK_PointerToBoolean - Pointer to boolean conversion.  A check
+/// against null.  Applies to normal, ObjC, and block pointers.
+CAST_OPERATION(PointerToBoolean)
+
+/// CK_ToVoid - Cast to void, discarding the computed value.
+///    (void) malloc(2048)
+CAST_OPERATION(ToVoid)
+
+/// CK_VectorSplat - A conversion from an arithmetic type to a
+/// vector of that element type.  Fills all elements ("splats") with
+/// the source value.
+///    __attribute__((ext_vector_type(4))) int v = 5;
+CAST_OPERATION(VectorSplat)
+
+/// CK_IntegralCast - A cast between integral types (other than to
+/// boolean).  Variously a bitcast, a truncation, a sign-extension,
+/// or a zero-extension.
+///    long l = 5;
+///    (unsigned) i
+CAST_OPERATION(IntegralCast)
+
+/// CK_IntegralToBoolean - Integral to boolean.  A check against zero.
+///    (bool) i
+CAST_OPERATION(IntegralToBoolean)
+
+/// CK_IntegralToFloating - Integral to floating point.
+///    float f = i;
+CAST_OPERATION(IntegralToFloating)
+
+/// CK_FloatingToIntegral - Floating point to integral.  Rounds
+/// towards zero, discarding any fractional component.
+///    (int) f
+CAST_OPERATION(FloatingToIntegral)
+
+/// CK_FloatingToBoolean - Floating point to boolean.
+///    (bool) f
+CAST_OPERATION(FloatingToBoolean)
+
+/// CK_BooleanToSignedIntegral - Convert a boolean to -1 or 0 for true and
+/// false, respectively.
+CAST_OPERATION(BooleanToSignedIntegral)
+
+/// CK_FloatingCast - Casting between floating types of different size.
+///    (double) f
+///    (float) ld
+CAST_OPERATION(FloatingCast)
+
+/// CK_CPointerToObjCPointerCast - Casting a C pointer kind to an
+/// Objective-C pointer.
+CAST_OPERATION(CPointerToObjCPointerCast)
+
+/// CK_BlockPointerToObjCPointerCast - Casting a block pointer to an
+/// ObjC pointer.
+CAST_OPERATION(BlockPointerToObjCPointerCast)
+
+/// CK_AnyPointerToBlockPointerCast - Casting any non-block pointer
+/// to a block pointer.  Block-to-block casts are bitcasts.
+CAST_OPERATION(AnyPointerToBlockPointerCast)
+
+/// \brief Converting between two Objective-C object types, which
+/// can occur when performing reference binding to an Objective-C
+/// object.
+CAST_OPERATION(ObjCObjectLValueCast)
+
+/// \brief A conversion of a floating point real to a floating point
+/// complex of the original type.  Injects the value as the real
+/// component with a zero imaginary component.
+///   float -> _Complex float
+CAST_OPERATION(FloatingRealToComplex)
+
+/// \brief Converts a floating point complex to floating point real
+/// of the source's element type.  Just discards the imaginary
+/// component.
+///   _Complex long double -> long double
+CAST_OPERATION(FloatingComplexToReal)
+
+/// \brief Converts a floating point complex to bool by comparing
+/// against 0+0i.
+CAST_OPERATION(FloatingComplexToBoolean)
+
+/// \brief Converts between different floating point complex types.
+///   _Complex float -> _Complex double
+CAST_OPERATION(FloatingComplexCast)
+
+/// \brief Converts from a floating complex to an integral complex.
+///   _Complex float -> _Complex int
+CAST_OPERATION(FloatingComplexToIntegralComplex)
+
+/// \brief Converts from an integral real to an integral complex
+/// whose element type matches the source.  Injects the value as
+/// the real component with a zero imaginary component.
+///   long -> _Complex long
+CAST_OPERATION(IntegralRealToComplex)
+
+/// \brief Converts an integral complex to an integral real of the
+/// source's element type by discarding the imaginary component.
+///   _Complex short -> short
+CAST_OPERATION(IntegralComplexToReal)
+
+/// \brief Converts an integral complex to bool by comparing against
+/// 0+0i.
+CAST_OPERATION(IntegralComplexToBoolean)
+
+/// \brief Converts between different integral complex types.
+///   _Complex char -> _Complex long long
+///   _Complex unsigned int -> _Complex signed int
+CAST_OPERATION(IntegralComplexCast)
+
+/// \brief Converts from an integral complex to a floating complex.
+///   _Complex unsigned -> _Complex float
+CAST_OPERATION(IntegralComplexToFloatingComplex)
+
+/// \brief [ARC] Produces a retainable object pointer so that it may
+/// be consumed, e.g. by being passed to a consuming parameter.
+/// Calls objc_retain.
+CAST_OPERATION(ARCProduceObject)
+
+/// \brief [ARC] Consumes a retainable object pointer that has just
+/// been produced, e.g. as the return value of a retaining call.
+/// Enters a cleanup to call objc_release at some indefinite time.
+CAST_OPERATION(ARCConsumeObject)
+
+/// \brief [ARC] Reclaim a retainable object pointer object that may
+/// have been produced and autoreleased as part of a function return
+/// sequence.
+CAST_OPERATION(ARCReclaimReturnedObject)
+
+/// \brief [ARC] Causes a value of block type to be copied to the
+/// heap, if it is not already there.  A number of other operations
+/// in ARC cause blocks to be copied; this is for cases where that
+/// would not otherwise be guaranteed, such as when casting to a
+/// non-block pointer type.
+CAST_OPERATION(ARCExtendBlockObject)
+
+/// \brief Converts from _Atomic(T) to T.
+CAST_OPERATION(AtomicToNonAtomic)
+/// \brief Converts from T to _Atomic(T).
+CAST_OPERATION(NonAtomicToAtomic)
+
+/// \brief Causes a block literal to be copied to the heap and then
+/// autoreleased.
+///
+/// This particular cast kind is used for the conversion from a C++11
+/// lambda expression to a block pointer.
+CAST_OPERATION(CopyAndAutoreleaseBlockObject)
+
+// Convert a builtin function to a function pointer; only allowed in the
+// callee of a call expression.
+CAST_OPERATION(BuiltinFnToFnPtr)
+
+// Convert a zero value for OpenCL event_t initialization.
+CAST_OPERATION(ZeroToOCLEvent)
+
+// Convert a pointer to a different address space.
+CAST_OPERATION(AddressSpaceConversion)
+
+// Convert an integer initializer to an OpenCL sampler.
+CAST_OPERATION(IntToOCLSampler)
+
+//===- Binary Operations  -------------------------------------------------===//
+// Operators listed in order of precedence.
+// Note that additions to this should also update the StmtVisitor class.
+
+// [C++ 5.5] Pointer-to-member operators.
+BINARY_OPERATION(PtrMemD, ".*")
+BINARY_OPERATION(PtrMemI, "->*")
+// [C99 6.5.5] Multiplicative operators.
+BINARY_OPERATION(Mul, "*")
+BINARY_OPERATION(Div, "/")
+BINARY_OPERATION(Rem, "%")
+// [C99 6.5.6] Additive operators.
+BINARY_OPERATION(Add, "+")
+BINARY_OPERATION(Sub, "-")
+// [C99 6.5.7] Bitwise shift operators.
+BINARY_OPERATION(Shl, "<<")
+BINARY_OPERATION(Shr, ">>")
+// [C99 6.5.8] Relational operators.
+BINARY_OPERATION(LT, "<")
+BINARY_OPERATION(GT, ">")
+BINARY_OPERATION(LE, "<=")
+BINARY_OPERATION(GE, ">=")
+// [C99 6.5.9] Equality operators.
+BINARY_OPERATION(EQ, "==")
+BINARY_OPERATION(NE, "!=")
+// [C99 6.5.10] Bitwise AND operator.
+BINARY_OPERATION(And, "&")
+// [C99 6.5.11] Bitwise XOR operator.
+BINARY_OPERATION(Xor, "^")
+// [C99 6.5.12] Bitwise OR operator.
+BINARY_OPERATION(Or, "|")
+// [C99 6.5.13] Logical AND operator.
+BINARY_OPERATION(LAnd, "&&")
+// [C99 6.5.14] Logical OR operator.
+BINARY_OPERATION(LOr, "||")
+// [C99 6.5.16] Assignment operators.
+BINARY_OPERATION(Assign, "=")
+BINARY_OPERATION(MulAssign, "*=")
+BINARY_OPERATION(DivAssign, "/=")
+BINARY_OPERATION(RemAssign, "%=")
+BINARY_OPERATION(AddAssign, "+=")
+BINARY_OPERATION(SubAssign, "-=")
+BINARY_OPERATION(ShlAssign, "<<=")
+BINARY_OPERATION(ShrAssign, ">>=")
+BINARY_OPERATION(AndAssign, "&=")
+BINARY_OPERATION(XorAssign, "^=")
+BINARY_OPERATION(OrAssign, "|=")
+// [C99 6.5.17] Comma operator.
+BINARY_OPERATION(Comma, ",")
+
+
+//===- Unary Operations ---------------------------------------------------===//
+// Note that additions to this should also update the StmtVisitor class.
+
+// [C99 6.5.2.4] Postfix increment and decrement
+UNARY_OPERATION(PostInc, "++")
+UNARY_OPERATION(PostDec, "--")
+// [C99 6.5.3.1] Prefix increment and decrement 
+UNARY_OPERATION(PreInc, "++")
+UNARY_OPERATION(PreDec, "--")
+// [C99 6.5.3.2] Address and indirection
+UNARY_OPERATION(AddrOf, "&")
+UNARY_OPERATION(Deref, "*")
+// [C99 6.5.3.3] Unary arithmetic 
+UNARY_OPERATION(Plus, "+")
+UNARY_OPERATION(Minus, "-")
+UNARY_OPERATION(Not, "~")
+UNARY_OPERATION(LNot, "!")
+// "__real expr"/"__imag expr" Extension.
+UNARY_OPERATION(Real, "__real")
+UNARY_OPERATION(Imag, "__imag")
+// __extension__ marker.
+UNARY_OPERATION(Extension, "__extension__")
+// [C++ Coroutines] co_await operator
+UNARY_OPERATION(Coawait, "co_await")
+
+#undef CAST_OPERATION
+#undef BINARY_OPERATION
+#undef UNARY_OPERATION
diff --git a/include/clang/AST/OperationKinds.h b/include/clang/AST/OperationKinds.h
index 102bbc2..00f060f 100644
--- a/include/clang/AST/OperationKinds.h
+++ b/include/clang/AST/OperationKinds.h
@@ -19,327 +19,20 @@
   
 /// CastKind - The kind of operation required for a conversion.
 enum CastKind {
-  /// CK_Dependent - A conversion which cannot yet be analyzed because
-  /// either the expression or target type is dependent.  These are
-  /// created only for explicit casts; dependent ASTs aren't required
-  /// to even approximately type-check.
-  ///   (T*) malloc(sizeof(T))
-  ///   reinterpret_cast<intptr_t>(A<T>::alloc());
-  CK_Dependent,
-
-  /// CK_BitCast - A conversion which causes a bit pattern of one type
-  /// to be reinterpreted as a bit pattern of another type.  Generally
-  /// the operands must have equivalent size and unrelated types.
-  ///
-  /// The pointer conversion char* -> int* is a bitcast.  A conversion
-  /// from any pointer type to a C pointer type is a bitcast unless
-  /// it's actually BaseToDerived or DerivedToBase.  A conversion to a
-  /// block pointer or ObjC pointer type is a bitcast only if the
-  /// operand has the same type kind; otherwise, it's one of the
-  /// specialized casts below.
-  ///
-  /// Vector coercions are bitcasts.
-  CK_BitCast,
-
-  /// CK_LValueBitCast - A conversion which reinterprets the address of
-  /// an l-value as an l-value of a different kind.  Used for
-  /// reinterpret_casts of l-value expressions to reference types.
-  ///    bool b; reinterpret_cast<char&>(b) = 'a';
-  CK_LValueBitCast,
-  
-  /// CK_LValueToRValue - A conversion which causes the extraction of
-  /// an r-value from the operand gl-value.  The result of an r-value
-  /// conversion is always unqualified.
-  CK_LValueToRValue,
-
-  /// CK_NoOp - A conversion which does not affect the type other than
-  /// (possibly) adding qualifiers.
-  ///   int    -> int
-  ///   char** -> const char * const *
-  CK_NoOp,
-
-  /// CK_BaseToDerived - A conversion from a C++ class pointer/reference
-  /// to a derived class pointer/reference.
-  ///   B *b = static_cast<B*>(a);
-  CK_BaseToDerived,
-
-  /// CK_DerivedToBase - A conversion from a C++ class pointer
-  /// to a base class pointer.
-  ///   A *a = new B();
-  CK_DerivedToBase,
-
-  /// CK_UncheckedDerivedToBase - A conversion from a C++ class
-  /// pointer/reference to a base class that can assume that the
-  /// derived pointer is not null.
-  ///   const A &a = B();
-  ///   b->method_from_a();
-  CK_UncheckedDerivedToBase,
-
-  /// CK_Dynamic - A C++ dynamic_cast.
-  CK_Dynamic,
-
-  /// CK_ToUnion - The GCC cast-to-union extension.
-  ///   int   -> union { int x; float y; }
-  ///   float -> union { int x; float y; }
-  CK_ToUnion,
-
-  /// CK_ArrayToPointerDecay - Array to pointer decay.
-  ///   int[10] -> int*
-  ///   char[5][6] -> char(*)[6]
-  CK_ArrayToPointerDecay,
-
-  /// CK_FunctionToPointerDecay - Function to pointer decay.
-  ///   void(int) -> void(*)(int)
-  CK_FunctionToPointerDecay,
-
-  /// CK_NullToPointer - Null pointer constant to pointer, ObjC
-  /// pointer, or block pointer.
-  ///   (void*) 0
-  ///   void (^block)() = 0;
-  CK_NullToPointer,
-
-  /// CK_NullToMemberPointer - Null pointer constant to member pointer.
-  ///   int A::*mptr = 0;
-  ///   int (A::*fptr)(int) = nullptr;
-  CK_NullToMemberPointer,
-
-  /// CK_BaseToDerivedMemberPointer - Member pointer in base class to
-  /// member pointer in derived class.
-  ///   int B::*mptr = &A::member;
-  CK_BaseToDerivedMemberPointer,
-
-  /// CK_DerivedToBaseMemberPointer - Member pointer in derived class to
-  /// member pointer in base class.
-  ///   int A::*mptr = static_cast<int A::*>(&B::member);
-  CK_DerivedToBaseMemberPointer,
-    
-  /// CK_MemberPointerToBoolean - Member pointer to boolean.  A check
-  /// against the null member pointer.
-  CK_MemberPointerToBoolean,
-
-  /// CK_ReinterpretMemberPointer - Reinterpret a member pointer as a
-  /// different kind of member pointer.  C++ forbids this from
-  /// crossing between function and object types, but otherwise does
-  /// not restrict it.  However, the only operation that is permitted
-  /// on a "punned" member pointer is casting it back to the original
-  /// type, which is required to be a lossless operation (although
-  /// many ABIs do not guarantee this on all possible intermediate types).
-  CK_ReinterpretMemberPointer,
-
-  /// CK_UserDefinedConversion - Conversion using a user defined type
-  /// conversion function.
-  ///    struct A { operator int(); }; int i = int(A());
-  CK_UserDefinedConversion,
-
-  /// CK_ConstructorConversion - Conversion by constructor.
-  ///    struct A { A(int); }; A a = A(10);
-  CK_ConstructorConversion,
-    
-  /// CK_IntegralToPointer - Integral to pointer.  A special kind of
-  /// reinterpreting conversion.  Applies to normal, ObjC, and block
-  /// pointers.
-  ///    (char*) 0x1001aab0
-  ///    reinterpret_cast<int*>(0)
-  CK_IntegralToPointer,
-    
-  /// CK_PointerToIntegral - Pointer to integral.  A special kind of
-  /// reinterpreting conversion.  Applies to normal, ObjC, and block
-  /// pointers.
-  ///    (intptr_t) "help!"
-  CK_PointerToIntegral,
-
-  /// CK_PointerToBoolean - Pointer to boolean conversion.  A check
-  /// against null.  Applies to normal, ObjC, and block pointers.
-  CK_PointerToBoolean,
-    
-  /// CK_ToVoid - Cast to void, discarding the computed value.
-  ///    (void) malloc(2048)
-  CK_ToVoid,
-    
-  /// CK_VectorSplat - A conversion from an arithmetic type to a
-  /// vector of that element type.  Fills all elements ("splats") with
-  /// the source value.
-  ///    __attribute__((ext_vector_type(4))) int v = 5;
-  CK_VectorSplat,
-    
-  /// CK_IntegralCast - A cast between integral types (other than to
-  /// boolean).  Variously a bitcast, a truncation, a sign-extension,
-  /// or a zero-extension.
-  ///    long l = 5;
-  ///    (unsigned) i
-  CK_IntegralCast,
-
-  /// CK_IntegralToBoolean - Integral to boolean.  A check against zero.
-  ///    (bool) i
-  CK_IntegralToBoolean,
-
-  /// CK_IntegralToFloating - Integral to floating point.
-  ///    float f = i;
-  CK_IntegralToFloating,
-    
-  /// CK_FloatingToIntegral - Floating point to integral.  Rounds
-  /// towards zero, discarding any fractional component.
-  ///    (int) f
-  CK_FloatingToIntegral,
-
-  /// CK_FloatingToBoolean - Floating point to boolean.
-  ///    (bool) f
-  CK_FloatingToBoolean,
-
-  // CK_BooleanToSignedIntegral - Convert a boolean to -1 or 0 for true and
-  // false, respectively.
-  CK_BooleanToSignedIntegral,
-
-  /// CK_FloatingCast - Casting between floating types of different size.
-  ///    (double) f
-  ///    (float) ld
-  CK_FloatingCast,
-    
-  /// CK_CPointerToObjCPointerCast - Casting a C pointer kind to an
-  /// Objective-C pointer.
-  CK_CPointerToObjCPointerCast,
-
-  /// CK_BlockPointerToObjCPointerCast - Casting a block pointer to an
-  /// ObjC pointer.
-  CK_BlockPointerToObjCPointerCast,
-
-  /// CK_AnyPointerToBlockPointerCast - Casting any non-block pointer
-  /// to a block pointer.  Block-to-block casts are bitcasts.
-  CK_AnyPointerToBlockPointerCast,
-
-  /// \brief Converting between two Objective-C object types, which
-  /// can occur when performing reference binding to an Objective-C
-  /// object.
-  CK_ObjCObjectLValueCast,
-
-  /// \brief A conversion of a floating point real to a floating point
-  /// complex of the original type.  Injects the value as the real
-  /// component with a zero imaginary component.
-  ///   float -> _Complex float
-  CK_FloatingRealToComplex,
-
-  /// \brief Converts a floating point complex to floating point real
-  /// of the source's element type.  Just discards the imaginary
-  /// component.
-  ///   _Complex long double -> long double
-  CK_FloatingComplexToReal,
-
-  /// \brief Converts a floating point complex to bool by comparing
-  /// against 0+0i.
-  CK_FloatingComplexToBoolean,
-
-  /// \brief Converts between different floating point complex types.
-  ///   _Complex float -> _Complex double
-  CK_FloatingComplexCast,
-
-  /// \brief Converts from a floating complex to an integral complex.
-  ///   _Complex float -> _Complex int
-  CK_FloatingComplexToIntegralComplex,
-
-  /// \brief Converts from an integral real to an integral complex
-  /// whose element type matches the source.  Injects the value as
-  /// the real component with a zero imaginary component.
-  ///   long -> _Complex long
-  CK_IntegralRealToComplex,
-
-  /// \brief Converts an integral complex to an integral real of the
-  /// source's element type by discarding the imaginary component.
-  ///   _Complex short -> short
-  CK_IntegralComplexToReal,
-
-  /// \brief Converts an integral complex to bool by comparing against
-  /// 0+0i.
-  CK_IntegralComplexToBoolean,
-
-  /// \brief Converts between different integral complex types.
-  ///   _Complex char -> _Complex long long
-  ///   _Complex unsigned int -> _Complex signed int
-  CK_IntegralComplexCast,
-
-  /// \brief Converts from an integral complex to a floating complex.
-  ///   _Complex unsigned -> _Complex float
-  CK_IntegralComplexToFloatingComplex,
-
-  /// \brief [ARC] Produces a retainable object pointer so that it may
-  /// be consumed, e.g. by being passed to a consuming parameter.
-  /// Calls objc_retain.
-  CK_ARCProduceObject,
-
-  /// \brief [ARC] Consumes a retainable object pointer that has just
-  /// been produced, e.g. as the return value of a retaining call.
-  /// Enters a cleanup to call objc_release at some indefinite time.
-  CK_ARCConsumeObject,
-
-  /// \brief [ARC] Reclaim a retainable object pointer object that may
-  /// have been produced and autoreleased as part of a function return
-  /// sequence.
-  CK_ARCReclaimReturnedObject,
-
-  /// \brief [ARC] Causes a value of block type to be copied to the
-  /// heap, if it is not already there.  A number of other operations
-  /// in ARC cause blocks to be copied; this is for cases where that
-  /// would not otherwise be guaranteed, such as when casting to a
-  /// non-block pointer type.
-  CK_ARCExtendBlockObject,
-
-  /// \brief Converts from _Atomic(T) to T.
-  CK_AtomicToNonAtomic,
-  /// \brief Converts from T to _Atomic(T).
-  CK_NonAtomicToAtomic,
-  
-  /// \brief Causes a block literal to by copied to the heap and then 
-  /// autoreleased.
-  ///
-  /// This particular cast kind is used for the conversion from a C++11
-  /// lambda expression to a block pointer.
-  CK_CopyAndAutoreleaseBlockObject,
-
-  // Convert a builtin function to a function pointer; only allowed in the
-  // callee of a call expression.
-  CK_BuiltinFnToFnPtr,
-
-  // Convert a zero value for OpenCL event_t initialization.
-  CK_ZeroToOCLEvent,
-
-  // Convert a pointer to a different address space.
-  CK_AddressSpaceConversion
+#define CAST_OPERATION(Name) CK_##Name,
+#include "clang/AST/OperationKinds.def"
 };
 
 static const CastKind CK_Invalid = static_cast<CastKind>(-1);
 
 enum BinaryOperatorKind {
-  // Operators listed in order of precedence.
-  // Note that additions to this should also update the StmtVisitor class.
-  BO_PtrMemD, BO_PtrMemI,       // [C++ 5.5] Pointer-to-member operators.
-  BO_Mul, BO_Div, BO_Rem,       // [C99 6.5.5] Multiplicative operators.
-  BO_Add, BO_Sub,               // [C99 6.5.6] Additive operators.
-  BO_Shl, BO_Shr,               // [C99 6.5.7] Bitwise shift operators.
-  BO_LT, BO_GT, BO_LE, BO_GE,   // [C99 6.5.8] Relational operators.
-  BO_EQ, BO_NE,                 // [C99 6.5.9] Equality operators.
-  BO_And,                       // [C99 6.5.10] Bitwise AND operator.
-  BO_Xor,                       // [C99 6.5.11] Bitwise XOR operator.
-  BO_Or,                        // [C99 6.5.12] Bitwise OR operator.
-  BO_LAnd,                      // [C99 6.5.13] Logical AND operator.
-  BO_LOr,                       // [C99 6.5.14] Logical OR operator.
-  BO_Assign, BO_MulAssign,      // [C99 6.5.16] Assignment operators.
-  BO_DivAssign, BO_RemAssign,
-  BO_AddAssign, BO_SubAssign,
-  BO_ShlAssign, BO_ShrAssign,
-  BO_AndAssign, BO_XorAssign,
-  BO_OrAssign,
-  BO_Comma                      // [C99 6.5.17] Comma operator.
+#define BINARY_OPERATION(Name, Spelling) BO_##Name,
+#include "clang/AST/OperationKinds.def"
 };
 
 enum UnaryOperatorKind {
-  // Note that additions to this should also update the StmtVisitor class.
-  UO_PostInc, UO_PostDec, // [C99 6.5.2.4] Postfix increment and decrement
-  UO_PreInc, UO_PreDec,   // [C99 6.5.3.1] Prefix increment and decrement
-  UO_AddrOf, UO_Deref,    // [C99 6.5.3.2] Address and indirection
-  UO_Plus, UO_Minus,      // [C99 6.5.3.3] Unary arithmetic
-  UO_Not, UO_LNot,        // [C99 6.5.3.3] Unary arithmetic
-  UO_Real, UO_Imag,       // "__real expr"/"__imag expr" Extension.
-  UO_Extension,           // __extension__ marker.
-  UO_Coawait              // [C++ Coroutines] co_await operator
+#define UNARY_OPERATION(Name, Spelling) UO_##Name,
+#include "clang/AST/OperationKinds.def"
 };
 
 /// \brief The kind of bridging performed by the Objective-C bridge cast.
@@ -355,6 +48,6 @@
   OBC_BridgeRetained
 };
 
-}
+}  // end namespace clang
 
 #endif
diff --git a/include/clang/AST/PrettyPrinter.h b/include/clang/AST/PrettyPrinter.h
index 57495ef..274df22 100644
--- a/include/clang/AST/PrettyPrinter.h
+++ b/include/clang/AST/PrettyPrinter.h
@@ -32,21 +32,35 @@
 
 /// \brief Describes how types, statements, expressions, and
 /// declarations should be printed.
+///
+/// This type is intended to be small and suitable for passing by value.
+/// It is very frequently copied.
 struct PrintingPolicy {
-  /// \brief Create a default printing policy for C.
+  /// \brief Create a default printing policy for the specified language.
   PrintingPolicy(const LangOptions &LO)
-    : LangOpts(LO), Indentation(2), SuppressSpecifiers(false),
-      SuppressTagKeyword(false), SuppressTag(false), SuppressScope(false),
+    : Indentation(2), SuppressSpecifiers(false),
+      SuppressTagKeyword(LO.CPlusPlus),
+      IncludeTagDefinition(false), SuppressScope(false),
       SuppressUnwrittenScope(false), SuppressInitializers(false),
       ConstantArraySizeAsWritten(false), AnonymousTagLocations(true),
       SuppressStrongLifetime(false), SuppressLifetimeQualifiers(false),
       SuppressTemplateArgsInCXXConstructors(false),
-      Bool(LO.Bool), TerseOutput(false), PolishForDeclaration(false),
+      Bool(LO.Bool), Restrict(LO.C99),
+      Alignof(LO.CPlusPlus11), UnderscoreAlignof(LO.C11),
+      UseVoidForZeroParams(!LO.CPlusPlus),
+      TerseOutput(false), PolishForDeclaration(false),
       Half(LO.Half), MSWChar(LO.MicrosoftExt && !LO.WChar),
       IncludeNewlines(true), MSVCFormatting(false) { }
 
-  /// \brief What language we're printing.
-  LangOptions LangOpts;
+  /// \brief Adjust this printing policy for cases where it's known that
+  /// we're printing C++ code (for instance, if AST dumping reaches a
+  /// C++-only construct). This should not be used if a real LangOptions
+  /// object is available.
+  void adjustForCPlusPlus() {
+    SuppressTagKeyword = true;
+    Bool = true;
+    UseVoidForZeroParams = false;
+  }
 
   /// \brief The number of spaces to use to indent each line.
   unsigned Indentation : 8;
@@ -77,15 +91,15 @@
   /// \endcode
   bool SuppressTagKeyword : 1;
 
-  /// \brief Whether type printing should skip printing the actual tag type.
+  /// \brief When true, include the body of a tag definition.
   ///
-  /// This is used when the caller needs to print a tag definition in front
-  /// of the type, as in constructs like the following:
+  /// This is used to place the definition of a struct
+  /// in the middle of another declaration as with:
   ///
   /// \code
   /// typedef struct { int x, y; } Point;
   /// \endcode
-  bool SuppressTag : 1;
+  bool IncludeTagDefinition : 1;
 
   /// \brief Suppresses printing of scope specifiers.
   bool SuppressScope : 1;
@@ -142,10 +156,23 @@
   /// constructors.
   unsigned SuppressTemplateArgsInCXXConstructors : 1;
 
-  /// \brief Whether we can use 'bool' rather than '_Bool', even if the language
-  /// doesn't actually have 'bool' (because, e.g., it is defined as a macro).
+  /// \brief Whether we can use 'bool' rather than '_Bool' (even if the language
+  /// doesn't actually have 'bool', because, e.g., it is defined as a macro).
   unsigned Bool : 1;
 
+  /// \brief Whether we can use 'restrict' rather than '__restrict'.
+  unsigned Restrict : 1;
+
+  /// \brief Whether we can use 'alignof' rather than '__alignof'.
+  unsigned Alignof : 1;
+
+  /// \brief Whether we can use '_Alignof' rather than '__alignof'.
+  unsigned UnderscoreAlignof : 1;
+
+  /// \brief Whether we should use '(void)' rather than '()' for a function
+  /// prototype with zero parameters.
+  unsigned UseVoidForZeroParams : 1;
+
   /// \brief Provide a 'terse' output.
   ///
   /// For example, in this mode we don't print function bodies, class members,
diff --git a/include/clang/AST/RecordLayout.h b/include/clang/AST/RecordLayout.h
index 667f235..7a39c3b 100644
--- a/include/clang/AST/RecordLayout.h
+++ b/include/clang/AST/RecordLayout.h
@@ -71,10 +71,7 @@
   CharUnits RequiredAlignment;
 
   /// FieldOffsets - Array of field offsets in bits.
-  uint64_t *FieldOffsets;
-
-  // FieldCount - Number of fields.
-  unsigned FieldCount;
+  ASTVector<uint64_t> FieldOffsets;
 
   /// CXXRecordLayoutInfo - Contains C++ specific layout information.
   struct CXXRecordLayoutInfo {
@@ -104,10 +101,10 @@
     /// a primary base class.
     bool HasExtendableVFPtr : 1;
 
-    /// HasZeroSizedSubObject - True if this class contains a zero sized member
-    /// or base or a base with a zero sized member or base.  Only used for
-    /// MS-ABI.
-    bool HasZeroSizedSubObject : 1;
+    /// EndsWithZeroSizedObject - True if this class contains a zero sized
+    /// member or base or a base with a zero sized member or base.
+    /// Only used for MS-ABI.
+    bool EndsWithZeroSizedObject : 1;
 
     /// \brief True if this class is zero sized or first base is zero sized or
     /// has this property.  Only used for MS-ABI.
@@ -136,9 +133,8 @@
   friend class ASTContext;
 
   ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment,
-                  CharUnits requiredAlignment,
-                  CharUnits datasize, const uint64_t *fieldoffsets,
-                  unsigned fieldcount);
+                  CharUnits requiredAlignment, CharUnits datasize,
+                  ArrayRef<uint64_t> fieldoffsets);
 
   // Constructor for C++ records.
   typedef CXXRecordLayoutInfo::BaseOffsetsMapTy BaseOffsetsMapTy;
@@ -148,13 +144,13 @@
                   bool hasOwnVFPtr, bool hasExtendableVFPtr,
                   CharUnits vbptroffset,
                   CharUnits datasize,
-                  const uint64_t *fieldoffsets, unsigned fieldcount,
+                  ArrayRef<uint64_t> fieldoffsets,
                   CharUnits nonvirtualsize, CharUnits nonvirtualalignment,
                   CharUnits SizeOfLargestEmptySubobject,
                   const CXXRecordDecl *PrimaryBase,
                   bool IsPrimaryBaseVirtual,
                   const CXXRecordDecl *BaseSharingVBPtr,
-                  bool HasZeroSizedSubObject,
+                  bool EndsWithZeroSizedObject,
                   bool LeadsWithZeroSizedBase,
                   const BaseOffsetsMapTy& BaseOffsets,
                   const VBaseOffsetsMapTy& VBaseOffsets);
@@ -174,12 +170,11 @@
   CharUnits getSize() const { return Size; }
 
   /// getFieldCount - Get the number of fields in the layout.
-  unsigned getFieldCount() const { return FieldCount; }
+  unsigned getFieldCount() const { return FieldOffsets.size(); }
 
   /// getFieldOffset - Get the offset of the given field index, in
   /// bits.
   uint64_t getFieldOffset(unsigned FieldNo) const {
-    assert (FieldNo < FieldCount && "Invalid Field No");
     return FieldOffsets[FieldNo];
   }
 
@@ -283,8 +278,8 @@
     return RequiredAlignment;
   }
 
-  bool hasZeroSizedSubObject() const {
-    return CXXInfo && CXXInfo->HasZeroSizedSubObject;
+  bool endsWithZeroSizedObject() const {
+    return CXXInfo && CXXInfo->EndsWithZeroSizedObject;
   }
 
   bool leadsWithZeroSizedBase() const {
diff --git a/include/clang/AST/RecursiveASTVisitor.h b/include/clang/AST/RecursiveASTVisitor.h
index 27cb7bd..1812b55 100644
--- a/include/clang/AST/RecursiveASTVisitor.h
+++ b/include/clang/AST/RecursiveASTVisitor.h
@@ -72,8 +72,8 @@
       return false;                                                            \
   } while (0)
 
-/// \brief A class that does preorder depth-first traversal on the
-/// entire Clang AST and visits each node.
+/// \brief A class that does preorder or postorder
+/// depth-first traversal on the entire Clang AST and visits each node.
 ///
 /// This class performs three distinct tasks:
 ///   1. traverse the AST (i.e. go to each node);
@@ -133,6 +133,10 @@
 /// to return true, in which case all known implicit and explicit
 /// instantiations will be visited at the same time as the pattern
 /// from which they were produced.
+///
+/// By default, this visitor preorder traverses the AST. If postorder traversal
+/// is needed, the \c shouldTraversePostOrder method needs to be overridden
+/// to return \c true.
 template <typename Derived> class RecursiveASTVisitor {
 public:
   /// A queue used for performing data recursion over statements.
@@ -158,6 +162,9 @@
   /// code, e.g., implicit constructors and destructors.
   bool shouldVisitImplicitCode() const { return false; }
 
+  /// \brief Return whether this visitor should traverse post-order.
+  bool shouldTraversePostOrder() const { return false; }
+
   /// \brief Recursively visit a statement or expression, by
   /// dispatching to Traverse*() based on the argument's dynamic type.
   ///
@@ -349,7 +356,7 @@
   bool TraverseUnary##NAME(UnaryOperator *S,                                   \
                            DataRecursionQueue *Queue = nullptr) {              \
     TRY_TO(WalkUpFromUnary##NAME(S));                                          \
-    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getSubExpr());                                    \
+    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getSubExpr());                          \
     return true;                                                               \
   }                                                                            \
   bool WalkUpFromUnary##NAME(UnaryOperator *S) {                               \
@@ -367,9 +374,10 @@
 // (they're all opcodes in BinaryOperator) but do have visitors.
 #define GENERAL_BINOP_FALLBACK(NAME, BINOP_TYPE)                               \
   bool TraverseBin##NAME(BINOP_TYPE *S, DataRecursionQueue *Queue = nullptr) { \
-    TRY_TO(WalkUpFromBin##NAME(S));                                            \
-    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getLHS());                                        \
-    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getRHS());                                        \
+    if (!getDerived().shouldTraversePostOrder())                               \
+      TRY_TO(WalkUpFromBin##NAME(S));                                          \
+    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getLHS());                              \
+    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getRHS());                              \
     return true;                                                               \
   }                                                                            \
   bool WalkUpFromBin##NAME(BINOP_TYPE *S) {                                    \
@@ -494,8 +502,12 @@
 #include "clang/Basic/OpenMPKinds.def"
   /// \brief Process clauses with list of variables.
   template <typename T> bool VisitOMPClauseList(T *Node);
+  /// Process clauses with pre-inits.
+  bool VisitOMPClauseWithPreInit(OMPClauseWithPreInit *Node);
+  bool VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *Node);
 
   bool dataTraverseNode(Stmt *S, DataRecursionQueue *Queue);
+  bool PostVisitStmt(Stmt *S);
 };
 
 template <typename Derived>
@@ -553,6 +565,24 @@
 
 #undef DISPATCH_STMT
 
+/// \brief Dispatch to WalkUpFrom*() for \p S based on its statement class.
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::PostVisitStmt(Stmt *S) {
+  switch (S->getStmtClass()) {
+  case Stmt::NoStmtClass:
+    break;
+#define ABSTRACT_STMT(STMT)
+#define STMT(CLASS, PARENT)                                                    \
+  case Stmt::CLASS##Class:                                                     \
+    TRY_TO(WalkUpFrom##CLASS(static_cast<CLASS *>(S))); break;
+#include "clang/AST/StmtNodes.inc"
+  }
+
+  return true;
+}
+
+#undef DISPATCH_STMT
+
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::TraverseStmt(Stmt *S,
                                                 DataRecursionQueue *Queue) {
@@ -574,6 +604,9 @@
     if (Visited) {
       LocalQueue.pop_back();
       TRY_TO(dataTraverseStmtPost(CurrS));
+      if (getDerived().shouldTraversePostOrder()) {
+        TRY_TO(PostVisitStmt(CurrS));
+      }
       continue;
     }
 
@@ -835,6 +868,17 @@
 
   if (Init->isWritten() || getDerived().shouldVisitImplicitCode())
     TRY_TO(TraverseStmt(Init->getInit()));
+
+  if (getDerived().shouldVisitImplicitCode())
+    // The braces for this one-line loop are required for MSVC2013.  It
+    // refuses to compile
+    //     for (int i : int_vec)
+    //       do {} while(false);
+    // without braces on the for loop.
+    for (VarDecl *VD : Init->getArrayIndices()) {
+      TRY_TO(TraverseDecl(VD));
+    }
+
   return true;
 }
 
@@ -860,8 +904,11 @@
 #define DEF_TRAVERSE_TYPE(TYPE, CODE)                                          \
   template <typename Derived>                                                  \
   bool RecursiveASTVisitor<Derived>::Traverse##TYPE(TYPE *T) {                 \
-    TRY_TO(WalkUpFrom##TYPE(T));                                               \
+    if (!getDerived().shouldTraversePostOrder())                               \
+      TRY_TO(WalkUpFrom##TYPE(T));                                             \
     { CODE; }                                                                  \
+    if (getDerived().shouldTraversePostOrder())                                \
+      TRY_TO(WalkUpFrom##TYPE(T));                                             \
     return true;                                                               \
   }
 
@@ -1264,10 +1311,16 @@
 #define DEF_TRAVERSE_DECL(DECL, CODE)                                          \
   template <typename Derived>                                                  \
   bool RecursiveASTVisitor<Derived>::Traverse##DECL(DECL *D) {                 \
-    TRY_TO(WalkUpFrom##DECL(D));                                               \
+    bool ShouldVisitChildren = true;                                           \
+    bool ReturnValue = true;                                                   \
+    if (!getDerived().shouldTraversePostOrder())                               \
+      TRY_TO(WalkUpFrom##DECL(D));                                             \
     { CODE; }                                                                  \
-    TRY_TO(TraverseDeclContextHelper(dyn_cast<DeclContext>(D)));               \
-    return true;                                                               \
+    if (ReturnValue && ShouldVisitChildren)                                    \
+      TRY_TO(TraverseDeclContextHelper(dyn_cast<DeclContext>(D)));             \
+    if (ReturnValue && getDerived().shouldTraversePostOrder())                 \
+      TRY_TO(WalkUpFrom##DECL(D));                                             \
+    return ReturnValue;                                                        \
   }
 
 DEF_TRAVERSE_DECL(AccessSpecDecl, {})
@@ -1281,18 +1334,12 @@
       TRY_TO(TraverseStmt(I.getCopyExpr()));
     }
   }
-  // This return statement makes sure the traversal of nodes in
-  // decls_begin()/decls_end() (done in the DEF_TRAVERSE_DECL macro)
-  // is skipped - don't remove it.
-  return true;
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_DECL(CapturedDecl, {
   TRY_TO(TraverseStmt(D->getBody()));
-  // This return statement makes sure the traversal of nodes in
-  // decls_begin()/decls_end() (done in the DEF_TRAVERSE_DECL macro)
-  // is skipped - don't remove it.
-  return true;
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_DECL(EmptyDecl, {})
@@ -1351,6 +1398,10 @@
      // D->getAnonymousNamespace().
     })
 
+DEF_TRAVERSE_DECL(PragmaCommentDecl, {})
+
+DEF_TRAVERSE_DECL(PragmaDetectMismatchDecl, {})
+
 DEF_TRAVERSE_DECL(ExternCContextDecl, {})
 
 DEF_TRAVERSE_DECL(NamespaceAliasDecl, {
@@ -1358,11 +1409,7 @@
 
   // We shouldn't traverse an aliased namespace, since it will be
   // defined (and, therefore, traversed) somewhere else.
-  //
-  // This return statement makes sure the traversal of nodes in
-  // decls_begin()/decls_end() (done in the DEF_TRAVERSE_DECL macro)
-  // is skipped - don't remove it.
-  return true;
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_DECL(LabelDecl, {// There is no code in a LabelDecl.
@@ -1411,14 +1458,13 @@
   if (D->getReturnTypeSourceInfo()) {
     TRY_TO(TraverseTypeLoc(D->getReturnTypeSourceInfo()->getTypeLoc()));
   }
-  for (ObjCMethodDecl::param_iterator I = D->param_begin(), E = D->param_end();
-       I != E; ++I) {
-    TRY_TO(TraverseDecl(*I));
+  for (ParmVarDecl *Parameter : D->parameters()) {
+    TRY_TO(TraverseDecl(Parameter));
   }
   if (D->isThisDeclarationADefinition()) {
     TRY_TO(TraverseStmt(D->getBody()));
   }
-  return true;
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_DECL(ObjCTypeParamDecl, {
@@ -1435,7 +1481,7 @@
     TRY_TO(TraverseTypeLoc(D->getTypeSourceInfo()->getTypeLoc()));
   else
     TRY_TO(TraverseType(D->getType()));
-  return true;
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_DECL(UsingDecl, {
@@ -1449,12 +1495,24 @@
 
 DEF_TRAVERSE_DECL(UsingShadowDecl, {})
 
+DEF_TRAVERSE_DECL(ConstructorUsingShadowDecl, {})
+
 DEF_TRAVERSE_DECL(OMPThreadPrivateDecl, {
   for (auto *I : D->varlists()) {
     TRY_TO(TraverseStmt(I));
   }
 })
 
+DEF_TRAVERSE_DECL(OMPDeclareReductionDecl, {
+  TRY_TO(TraverseStmt(D->getCombiner()));
+  if (auto *Initializer = D->getInitializer())
+    TRY_TO(TraverseStmt(Initializer));
+  TRY_TO(TraverseType(D->getType()));
+  ShouldVisitChildren = false; // Not 'return': post-order visit must run.
+})
+
+DEF_TRAVERSE_DECL(OMPCapturedExprDecl, { TRY_TO(TraverseVarHelper(D)); })
+
 // A helper method for TemplateDecl's children.
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::TraverseTemplateParameterListHelper(
@@ -1745,6 +1803,18 @@
   return true;
 }
 
+DEF_TRAVERSE_DECL(DecompositionDecl, {
+  TRY_TO(TraverseVarHelper(D));
+  for (auto *Binding : D->bindings()) {
+    TRY_TO(TraverseDecl(Binding));
+  }
+})
+
+DEF_TRAVERSE_DECL(BindingDecl, {
+  if (getDerived().shouldVisitImplicitCode())
+    TRY_TO(TraverseStmt(D->getBinding()));
+})
+
 DEF_TRAVERSE_DECL(MSPropertyDecl, { TRY_TO(TraverseDeclaratorHelper(D)); })
 
 DEF_TRAVERSE_DECL(FieldDecl, {
@@ -1804,10 +1874,9 @@
     // if the traverser is visiting implicit code. Parameter variable
     // declarations do not have valid TypeSourceInfo, so to visit them
     // we need to traverse the declarations explicitly.
-    for (FunctionDecl::param_const_iterator I = D->param_begin(),
-                                            E = D->param_end();
-         I != E; ++I)
-      TRY_TO(TraverseDecl(*I));
+    for (ParmVarDecl *Parameter : D->parameters()) {
+      TRY_TO(TraverseDecl(Parameter));
+    }
   }
 
   if (CXXConstructorDecl *Ctor = dyn_cast<CXXConstructorDecl>(D)) {
@@ -1826,19 +1895,22 @@
 DEF_TRAVERSE_DECL(FunctionDecl, {
   // We skip decls_begin/decls_end, which are already covered by
   // TraverseFunctionHelper().
-  return TraverseFunctionHelper(D);
+  ShouldVisitChildren = false;
+  ReturnValue = TraverseFunctionHelper(D);
 })
 
 DEF_TRAVERSE_DECL(CXXMethodDecl, {
   // We skip decls_begin/decls_end, which are already covered by
   // TraverseFunctionHelper().
-  return TraverseFunctionHelper(D);
+  ShouldVisitChildren = false;
+  ReturnValue = TraverseFunctionHelper(D);
 })
 
 DEF_TRAVERSE_DECL(CXXConstructorDecl, {
   // We skip decls_begin/decls_end, which are already covered by
   // TraverseFunctionHelper().
-  return TraverseFunctionHelper(D);
+  ShouldVisitChildren = false;
+  ReturnValue = TraverseFunctionHelper(D);
 })
 
 // CXXConversionDecl is the declaration of a type conversion operator.
@@ -1846,13 +1918,15 @@
 DEF_TRAVERSE_DECL(CXXConversionDecl, {
   // We skip decls_begin/decls_end, which are already covered by
   // TraverseFunctionHelper().
-  return TraverseFunctionHelper(D);
+  ShouldVisitChildren = false;
+  ReturnValue = TraverseFunctionHelper(D);
 })
 
 DEF_TRAVERSE_DECL(CXXDestructorDecl, {
   // We skip decls_begin/decls_end, which are already covered by
   // TraverseFunctionHelper().
-  return TraverseFunctionHelper(D);
+  ShouldVisitChildren = false;
+  ReturnValue = TraverseFunctionHelper(D);
 })
 
 template <typename Derived>
@@ -1904,12 +1978,19 @@
   template <typename Derived>                                                  \
   bool RecursiveASTVisitor<Derived>::Traverse##STMT(                           \
       STMT *S, DataRecursionQueue *Queue) {                                    \
-    TRY_TO(WalkUpFrom##STMT(S));                                               \
+    bool ShouldVisitChildren = true;                                           \
+    bool ReturnValue = true;                                                   \
+    if (!getDerived().shouldTraversePostOrder())                               \
+      TRY_TO(WalkUpFrom##STMT(S));                                             \
     { CODE; }                                                                  \
-    for (Stmt *SubStmt : S->children()) {                                      \
-      TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(SubStmt);                                          \
+    if (ShouldVisitChildren) {                                                 \
+      for (Stmt *SubStmt : S->children()) {                                    \
+        TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(SubStmt);                              \
+      }                                                                        \
     }                                                                          \
-    return true;                                                               \
+    if (!Queue && ReturnValue && getDerived().shouldTraversePostOrder())       \
+      TRY_TO(WalkUpFrom##STMT(S));                                             \
+    return ReturnValue;                                                        \
   }
 
 DEF_TRAVERSE_STMT(GCCAsmStmt, {
@@ -1946,7 +2027,7 @@
   // initializer]'.  The decls above already traverse over the
   // initializers, so we don't have to do it again (which
   // children() would do).
-  return true;
+  ShouldVisitChildren = false;
 })
 
 // These non-expr stmts (most of them), do not need any action except
@@ -1978,7 +2059,7 @@
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getRangeInit());
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getBody());
     // Visit everything else only if shouldVisitImplicitCode().
-    return true;
+    ShouldVisitChildren = false;
   }
 })
 DEF_TRAVERSE_STMT(MSDependentExistsStmt, {
@@ -2055,7 +2136,11 @@
 bool RecursiveASTVisitor<Derived>::TraverseSynOrSemInitListExpr(
     InitListExpr *S, DataRecursionQueue *Queue) {
   if (S) {
-    TRY_TO(WalkUpFromInitListExpr(S));
+    // Skip this if we traverse postorder. We will visit it later
+    // in PostVisitStmt.
+    if (!getDerived().shouldTraversePostOrder())
+      TRY_TO(WalkUpFromInitListExpr(S));
+
     // All we need are the default actions.  FIXME: use a helper function.
     for (Stmt *SubStmt : S->children()) {
       TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(SubStmt);
@@ -2075,7 +2160,7 @@
       S->isSemanticForm() ? S->getSyntacticForm() : S, Queue));
   TRY_TO(TraverseSynOrSemInitListExpr(
       S->isSemanticForm() ? S : S->getSemanticForm(), Queue));
-  return true;
+  ShouldVisitChildren = false;
 })
 
 // GenericSelectionExpr is a special case because the types and expressions
@@ -2088,7 +2173,7 @@
       TRY_TO(TraverseTypeLoc(TS->getTypeLoc()));
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getAssocExpr(i));
   }
-  return true;
+  ShouldVisitChildren = false;
 })
 
 // PseudoObjectExpr is a special case because of the weirdness with
@@ -2103,7 +2188,7 @@
       sub = OVE->getSourceExpr();
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(sub);
   }
-  return true;
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_STMT(CXXScalarValueInitExpr, {
@@ -2181,6 +2266,9 @@
        C != CEnd; ++C) {
     TRY_TO(TraverseLambdaCapture(S, C));
   }
+  for (Expr *Init : S->capture_inits()) {
+    TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(Init);
+  }
 
   TypeLoc TL = S->getCallOperator()->getTypeSourceInfo()->getTypeLoc();
   FunctionProtoTypeLoc Proto = TL.castAs<FunctionProtoTypeLoc>();
@@ -2207,7 +2295,8 @@
       TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(NE);
   }
 
-  return TRAVERSE_STMT_BASE(LambdaBody, LambdaExpr, S, Queue);
+  ReturnValue = TRAVERSE_STMT_BASE(LambdaBody, LambdaExpr, S, Queue);
+  ShouldVisitChildren = false;
 })
 
 DEF_TRAVERSE_STMT(CXXUnresolvedConstructExpr, {
@@ -2240,6 +2329,7 @@
 DEF_TRAVERSE_STMT(CXXDefaultInitExpr, {})
 DEF_TRAVERSE_STMT(CXXDeleteExpr, {})
 DEF_TRAVERSE_STMT(ExprWithCleanups, {})
+DEF_TRAVERSE_STMT(CXXInheritedCtorInitExpr, {})
 DEF_TRAVERSE_STMT(CXXNullPtrLiteralExpr, {})
 DEF_TRAVERSE_STMT(CXXStdInitializerListExpr, {})
 DEF_TRAVERSE_STMT(CXXPseudoDestructorExpr, {
@@ -2277,6 +2367,7 @@
 DEF_TRAVERSE_STMT(ObjCBridgedCastExpr, {
   TRY_TO(TraverseTypeLoc(S->getTypeInfoAsWritten()->getTypeLoc()));
 })
+DEF_TRAVERSE_STMT(ObjCAvailabilityCheckExpr, {})
 DEF_TRAVERSE_STMT(ParenExpr, {})
 DEF_TRAVERSE_STMT(ParenListExpr, {})
 DEF_TRAVERSE_STMT(PredefinedExpr, {})
@@ -2333,25 +2424,25 @@
 DEF_TRAVERSE_STMT(CoroutineBodyStmt, {
   if (!getDerived().shouldVisitImplicitCode()) {
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getBody());
-    return true;
+    ShouldVisitChildren = false;
   }
 })
 DEF_TRAVERSE_STMT(CoreturnStmt, {
   if (!getDerived().shouldVisitImplicitCode()) {
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getOperand());
-    return true;
+    ShouldVisitChildren = false;
   }
 })
 DEF_TRAVERSE_STMT(CoawaitExpr, {
   if (!getDerived().shouldVisitImplicitCode()) {
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getOperand());
-    return true;
+    ShouldVisitChildren = false;
   }
 })
 DEF_TRAVERSE_STMT(CoyieldExpr, {
   if (!getDerived().shouldVisitImplicitCode()) {
     TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getOperand());
-    return true;
+    ShouldVisitChildren = false;
   }
 })
 
@@ -2459,9 +2550,24 @@
 DEF_TRAVERSE_STMT(OMPTargetDataDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
+DEF_TRAVERSE_STMT(OMPTargetEnterDataDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPTargetExitDataDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPTargetParallelDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPTargetParallelForDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
 DEF_TRAVERSE_STMT(OMPTeamsDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
+DEF_TRAVERSE_STMT(OMPTargetUpdateDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
 DEF_TRAVERSE_STMT(OMPTaskLoopDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
@@ -2471,6 +2577,24 @@
 DEF_TRAVERSE_STMT(OMPDistributeDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
+DEF_TRAVERSE_STMT(OMPDistributeParallelForDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPDistributeParallelForSimdDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPDistributeSimdDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPTargetParallelForSimdDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPTargetSimdDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
+DEF_TRAVERSE_STMT(OMPTeamsDistributeDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
 // OpenMP clauses.
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::TraverseOMPClause(OMPClause *C) {
@@ -2483,6 +2607,7 @@
     break;
 #include "clang/Basic/OpenMPKinds.def"
   case OMPC_threadprivate:
+  case OMPC_uniform:
   case OMPC_unknown:
     break;
   }
@@ -2490,6 +2615,21 @@
 }
 
 template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPClauseWithPreInit(
+    OMPClauseWithPreInit *Node) {
+  TRY_TO(TraverseStmt(Node->getPreInitStmt()));
+  return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPClauseWithPostUpdate(
+    OMPClauseWithPostUpdate *Node) {
+  TRY_TO(VisitOMPClauseWithPreInit(Node));
+  TRY_TO(TraverseStmt(Node->getPostUpdateExpr()));
+  return true;
+}
+
+template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPIfClause(OMPIfClause *C) {
   TRY_TO(TraverseStmt(C->getCondition()));
   return true;
@@ -2540,8 +2680,8 @@
 template <typename Derived>
 bool
 RecursiveASTVisitor<Derived>::VisitOMPScheduleClause(OMPScheduleClause *C) {
+  TRY_TO(VisitOMPClauseWithPreInit(C));
   TRY_TO(TraverseStmt(C->getChunkSize()));
-  TRY_TO(TraverseStmt(C->getHelperChunkSize()));
   return true;
 }
 
@@ -2629,6 +2769,7 @@
 bool RecursiveASTVisitor<Derived>::VisitOMPFirstprivateClause(
     OMPFirstprivateClause *C) {
   TRY_TO(VisitOMPClauseList(C));
+  TRY_TO(VisitOMPClauseWithPreInit(C));
   for (auto *E : C->private_copies()) {
     TRY_TO(TraverseStmt(E));
   }
@@ -2642,6 +2783,7 @@
 bool RecursiveASTVisitor<Derived>::VisitOMPLastprivateClause(
     OMPLastprivateClause *C) {
   TRY_TO(VisitOMPClauseList(C));
+  TRY_TO(VisitOMPClauseWithPostUpdate(C));
   for (auto *E : C->private_copies()) {
     TRY_TO(TraverseStmt(E));
   }
@@ -2668,6 +2810,7 @@
   TRY_TO(TraverseStmt(C->getStep()));
   TRY_TO(TraverseStmt(C->getCalcStep()));
   TRY_TO(VisitOMPClauseList(C));
+  TRY_TO(VisitOMPClauseWithPostUpdate(C));
   for (auto *E : C->privates()) {
     TRY_TO(TraverseStmt(E));
   }
@@ -2727,6 +2870,7 @@
   TRY_TO(TraverseNestedNameSpecifierLoc(C->getQualifierLoc()));
   TRY_TO(TraverseDeclarationNameInfo(C->getNameInfo()));
   TRY_TO(VisitOMPClauseList(C));
+  TRY_TO(VisitOMPClauseWithPostUpdate(C));
   for (auto *E : C->privates()) {
     TRY_TO(TraverseStmt(E));
   }
@@ -2810,8 +2954,40 @@
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPDistScheduleClause(
     OMPDistScheduleClause *C) {
+  TRY_TO(VisitOMPClauseWithPreInit(C));
   TRY_TO(TraverseStmt(C->getChunkSize()));
-  TRY_TO(TraverseStmt(C->getHelperChunkSize()));
+  return true;
+}
+
+template <typename Derived>
+bool
+RecursiveASTVisitor<Derived>::VisitOMPDefaultmapClause(OMPDefaultmapClause *C) {
+  return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPToClause(OMPToClause *C) {
+  TRY_TO(VisitOMPClauseList(C));
+  return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPFromClause(OMPFromClause *C) {
+  TRY_TO(VisitOMPClauseList(C));
+  return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPUseDevicePtrClause(
+    OMPUseDevicePtrClause *C) {
+  TRY_TO(VisitOMPClauseList(C));
+  return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPIsDevicePtrClause(
+    OMPIsDevicePtrClause *C) {
+  TRY_TO(VisitOMPClauseList(C));
   return true;
 }
 
diff --git a/include/clang/AST/Redeclarable.h b/include/clang/AST/Redeclarable.h
index eaa22f8..dd82546 100644
--- a/include/clang/AST/Redeclarable.h
+++ b/include/clang/AST/Redeclarable.h
@@ -15,7 +15,6 @@
 #define LLVM_CLANG_AST_REDECLARABLE_H
 
 #include "clang/AST/ExternalASTSource.h"
-#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/Support/Casting.h"
 #include <iterator>
 
diff --git a/include/clang/AST/Stmt.h b/include/clang/AST/Stmt.h
index b6ed6c5..3bce2ad 100644
--- a/include/clang/AST/Stmt.h
+++ b/include/clang/AST/Stmt.h
@@ -93,6 +93,13 @@
     unsigned NumStmts : 32 - NumStmtBits;
   };
 
+  class IfStmtBitfields {
+    friend class IfStmt;
+    unsigned : NumStmtBits;
+
+    unsigned IsConstexpr : 1;
+  };
+
   class ExprBitfields {
     friend class Expr;
     friend class DeclRefExpr; // computeDependence
@@ -192,7 +199,10 @@
 
     unsigned : NumExprBits;
 
-    unsigned NumObjects : 32 - NumExprBits;
+    // When false, the cleanups must not have side effects.
+    unsigned CleanupsHaveSideEffects : 1;
+
+    unsigned NumObjects : 32 - 1 - NumExprBits;
   };
 
   class PseudoObjectExprBitfields {
@@ -245,6 +255,7 @@
   union {
     StmtBitfields StmtBits;
     CompoundStmtBitfields CompoundStmtBits;
+    IfStmtBitfields IfStmtBits;
     ExprBitfields ExprBits;
     CharacterLiteralBitfields CharacterLiteralBits;
     FloatingLiteralBitfields FloatingLiteralBits;
@@ -868,14 +879,15 @@
 /// IfStmt - This represents an if/then/else.
 ///
 class IfStmt : public Stmt {
-  enum { VAR, COND, THEN, ELSE, END_EXPR };
+  enum { INIT, VAR, COND, THEN, ELSE, END_EXPR };
   Stmt* SubExprs[END_EXPR];
 
   SourceLocation IfLoc;
   SourceLocation ElseLoc;
 
 public:
-  IfStmt(const ASTContext &C, SourceLocation IL, VarDecl *var, Expr *cond,
+  IfStmt(const ASTContext &C, SourceLocation IL,
+         bool IsConstexpr, Stmt *init, VarDecl *var, Expr *cond,
          Stmt *then, SourceLocation EL = SourceLocation(),
          Stmt *elsev = nullptr);
 
@@ -899,6 +911,9 @@
     return reinterpret_cast<DeclStmt*>(SubExprs[VAR]);
   }
 
+  Stmt *getInit() { return SubExprs[INIT]; }
+  const Stmt *getInit() const { return SubExprs[INIT]; }
+  void setInit(Stmt *S) { SubExprs[INIT] = S; }
   const Expr *getCond() const { return reinterpret_cast<Expr*>(SubExprs[COND]);}
   void setCond(Expr *E) { SubExprs[COND] = reinterpret_cast<Stmt *>(E); }
   const Stmt *getThen() const { return SubExprs[THEN]; }
@@ -915,6 +930,11 @@
   SourceLocation getElseLoc() const { return ElseLoc; }
   void setElseLoc(SourceLocation L) { ElseLoc = L; }
 
+  bool isConstexpr() const { return IfStmtBits.IsConstexpr; }
+  void setConstexpr(bool C) { IfStmtBits.IsConstexpr = C; }
+
+  bool isObjCAvailabilityCheck() const;
+
   SourceLocation getLocStart() const LLVM_READONLY { return IfLoc; }
   SourceLocation getLocEnd() const LLVM_READONLY {
     if (SubExprs[ELSE])
@@ -938,7 +958,7 @@
 ///
 class SwitchStmt : public Stmt {
   SourceLocation SwitchLoc;
-  enum { VAR, COND, BODY, END_EXPR };
+  enum { INIT, VAR, COND, BODY, END_EXPR };
   Stmt* SubExprs[END_EXPR];
   // This points to a linked list of case and default statements and, if the
   // SwitchStmt is a switch on an enum value, records whether all the enum
@@ -947,7 +967,7 @@
   llvm::PointerIntPair<SwitchCase *, 1, bool> FirstCase;
 
 public:
-  SwitchStmt(const ASTContext &C, VarDecl *Var, Expr *cond);
+  SwitchStmt(const ASTContext &C, Stmt *Init, VarDecl *Var, Expr *cond);
 
   /// \brief Build a empty switch statement.
   explicit SwitchStmt(EmptyShell Empty) : Stmt(SwitchStmtClass, Empty) { }
@@ -970,6 +990,9 @@
     return reinterpret_cast<DeclStmt*>(SubExprs[VAR]);
   }
 
+  Stmt *getInit() { return SubExprs[INIT]; }
+  const Stmt *getInit() const { return SubExprs[INIT]; }
+  void setInit(Stmt *S) { SubExprs[INIT] = S; }
   const Expr *getCond() const { return reinterpret_cast<Expr*>(SubExprs[COND]);}
   const Stmt *getBody() const { return SubExprs[BODY]; }
   const SwitchCase *getSwitchCaseList() const { return FirstCase.getPointer(); }
diff --git a/include/clang/AST/StmtCXX.h b/include/clang/AST/StmtCXX.h
index 1ca73e2..1d29c22 100644
--- a/include/clang/AST/StmtCXX.h
+++ b/include/clang/AST/StmtCXX.h
@@ -127,7 +127,7 @@
 /// can be extracted using getLoopVariable and getRangeInit.
 class CXXForRangeStmt : public Stmt {
   SourceLocation ForLoc;
-  enum { RANGE, BEGINEND, COND, INC, LOOPVAR, BODY, END };
+  enum { RANGE, BEGINSTMT, ENDSTMT, COND, INC, LOOPVAR, BODY, END };
   // SubExprs[RANGE] is an expression or declstmt.
   // SubExprs[COND] and SubExprs[INC] are expressions.
   Stmt *SubExprs[END];
@@ -137,7 +137,7 @@
 
   friend class ASTStmtReader;
 public:
-  CXXForRangeStmt(DeclStmt *Range, DeclStmt *BeginEnd,
+  CXXForRangeStmt(DeclStmt *Range, DeclStmt *Begin, DeclStmt *End,
                   Expr *Cond, Expr *Inc, DeclStmt *LoopVar, Stmt *Body,
                   SourceLocation FL, SourceLocation CAL, SourceLocation CL,
                   SourceLocation RPL);
@@ -152,9 +152,10 @@
 
 
   DeclStmt *getRangeStmt() { return cast<DeclStmt>(SubExprs[RANGE]); }
-  DeclStmt *getBeginEndStmt() {
-    return cast_or_null<DeclStmt>(SubExprs[BEGINEND]);
+  DeclStmt *getBeginStmt() {
+    return cast_or_null<DeclStmt>(SubExprs[BEGINSTMT]);
   }
+  DeclStmt *getEndStmt() { return cast_or_null<DeclStmt>(SubExprs[ENDSTMT]); }
   Expr *getCond() { return cast_or_null<Expr>(SubExprs[COND]); }
   Expr *getInc() { return cast_or_null<Expr>(SubExprs[INC]); }
   DeclStmt *getLoopVarStmt() { return cast<DeclStmt>(SubExprs[LOOPVAR]); }
@@ -163,8 +164,11 @@
   const DeclStmt *getRangeStmt() const {
     return cast<DeclStmt>(SubExprs[RANGE]);
   }
-  const DeclStmt *getBeginEndStmt() const {
-    return cast_or_null<DeclStmt>(SubExprs[BEGINEND]);
+  const DeclStmt *getBeginStmt() const {
+    return cast_or_null<DeclStmt>(SubExprs[BEGINSTMT]);
+  }
+  const DeclStmt *getEndStmt() const {
+    return cast_or_null<DeclStmt>(SubExprs[ENDSTMT]);
   }
   const Expr *getCond() const {
     return cast_or_null<Expr>(SubExprs[COND]);
@@ -179,7 +183,8 @@
 
   void setRangeInit(Expr *E) { SubExprs[RANGE] = reinterpret_cast<Stmt*>(E); }
   void setRangeStmt(Stmt *S) { SubExprs[RANGE] = S; }
-  void setBeginEndStmt(Stmt *S) { SubExprs[BEGINEND] = S; }
+  void setBeginStmt(Stmt *S) { SubExprs[BEGINSTMT] = S; }
+  void setEndStmt(Stmt *S) { SubExprs[ENDSTMT] = S; }
   void setCond(Expr *E) { SubExprs[COND] = reinterpret_cast<Stmt*>(E); }
   void setInc(Expr *E) { SubExprs[INC] = reinterpret_cast<Stmt*>(E); }
   void setLoopVarStmt(Stmt *S) { SubExprs[LOOPVAR] = S; }
diff --git a/include/clang/AST/StmtGraphTraits.h b/include/clang/AST/StmtGraphTraits.h
index ab636a5..dac4495 100644
--- a/include/clang/AST/StmtGraphTraits.h
+++ b/include/clang/AST/StmtGraphTraits.h
@@ -26,6 +26,7 @@
 
 template <> struct GraphTraits<clang::Stmt*> {
   typedef clang::Stmt                       NodeType;
+  typedef clang::Stmt *                     NodeRef;
   typedef clang::Stmt::child_iterator       ChildIteratorType;
   typedef llvm::df_iterator<clang::Stmt*>   nodes_iterator;
 
@@ -53,6 +54,7 @@
 
 template <> struct GraphTraits<const clang::Stmt*> {
   typedef const clang::Stmt                       NodeType;
+  typedef const clang::Stmt *                     NodeRef;
   typedef clang::Stmt::const_child_iterator       ChildIteratorType;
   typedef llvm::df_iterator<const clang::Stmt*>   nodes_iterator;
 
diff --git a/include/clang/AST/StmtObjC.h b/include/clang/AST/StmtObjC.h
index 68fe3ef..5260b69 100644
--- a/include/clang/AST/StmtObjC.h
+++ b/include/clang/AST/StmtObjC.h
@@ -326,7 +326,7 @@
   Expr *getThrowExpr() { return reinterpret_cast<Expr*>(Throw); }
   void setThrowExpr(Stmt *S) { Throw = S; }
 
-  SourceLocation getThrowLoc() { return AtThrowLoc; }
+  SourceLocation getThrowLoc() const LLVM_READONLY { return AtThrowLoc; }
   void setThrowLoc(SourceLocation Loc) { AtThrowLoc = Loc; }
 
   SourceLocation getLocStart() const LLVM_READONLY { return AtThrowLoc; }
diff --git a/include/clang/AST/StmtOpenMP.h b/include/clang/AST/StmtOpenMP.h
index c82aeda..de603f4 100644
--- a/include/clang/AST/StmtOpenMP.h
+++ b/include/clang/AST/StmtOpenMP.h
@@ -299,9 +299,11 @@
   /// This enumeration contains offsets to all the pointers to children
   /// expressions stored in OMPLoopDirective.
   /// The first 9 children are nesessary for all the loop directives, and
-  /// the next 7 are specific to the worksharing ones.
+  /// the next 10 are specific to the worksharing ones.
   /// After the fixed children, three arrays of length CollapsedNum are
   /// allocated: loop counters, their updates and final values.
+  /// PrevLowerBound and PrevUpperBound are used to communicate blocking
+  /// information in composite constructs that require loop blocking.
   ///
   enum {
     AssociatedStmtOffset = 0,
@@ -312,21 +314,25 @@
     CondOffset = 5,
     InitOffset = 6,
     IncOffset = 7,
+    PreInitsOffset = 8,
     // The '...End' enumerators do not correspond to child expressions - they
     // specify the offset to the end (and start of the following counters/
     // updates/finals arrays).
-    DefaultEnd = 8,
+    DefaultEnd = 9,
     // The following 7 exprs are used by worksharing loops only.
-    IsLastIterVariableOffset = 8,
-    LowerBoundVariableOffset = 9,
-    UpperBoundVariableOffset = 10,
-    StrideVariableOffset = 11,
-    EnsureUpperBoundOffset = 12,
-    NextLowerBoundOffset = 13,
-    NextUpperBoundOffset = 14,
+    IsLastIterVariableOffset = 9,
+    LowerBoundVariableOffset = 10,
+    UpperBoundVariableOffset = 11,
+    StrideVariableOffset = 12,
+    EnsureUpperBoundOffset = 13,
+    NextLowerBoundOffset = 14,
+    NextUpperBoundOffset = 15,
+    NumIterationsOffset = 16,
+    PrevLowerBoundVariableOffset = 17,
+    PrevUpperBoundVariableOffset = 18,
     // Offset to the end (and start of the following counters/updates/finals
     // arrays) for worksharing loop directives.
-    WorksharingEnd = 15,
+    WorksharingEnd = 19,
   };
 
   /// \brief Get the counters storage.
@@ -422,6 +428,9 @@
   }
   void setInit(Expr *Init) { *std::next(child_begin(), InitOffset) = Init; }
   void setInc(Expr *Inc) { *std::next(child_begin(), IncOffset) = Inc; }
+  void setPreInits(Stmt *PreInits) {
+    *std::next(child_begin(), PreInitsOffset) = PreInits;
+  }
   void setIsLastIterVariable(Expr *IL) {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
             isOpenMPTaskLoopDirective(getDirectiveKind()) ||
@@ -471,6 +480,27 @@
            "expected worksharing loop directive");
     *std::next(child_begin(), NextUpperBoundOffset) = NUB;
   }
+  void setNumIterations(Expr *NI) {
+    assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
+           "expected worksharing loop directive");
+    *std::next(child_begin(), NumIterationsOffset) = NI;
+  }
+  void setPrevLowerBoundVariable(Expr *PrevLB) {
+    assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
+           "expected worksharing loop directive");
+    *std::next(child_begin(), PrevLowerBoundVariableOffset) = PrevLB;
+  }
+  void setPrevUpperBoundVariable(Expr *PrevUB) {
+    assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
+           "expected worksharing loop directive");
+    *std::next(child_begin(), PrevUpperBoundVariableOffset) = PrevUB;
+  }
   void setCounters(ArrayRef<Expr *> A);
   void setPrivateCounters(ArrayRef<Expr *> A);
   void setInits(ArrayRef<Expr *> A);
@@ -511,6 +541,12 @@
     Expr *NLB;
     /// \brief Update of UpperBound for statically sheduled 'omp for' loops.
     Expr *NUB;
+    /// \brief PreviousLowerBound - local variable passed to runtime in the
+    /// enclosing schedule or null if that does not apply.
+    Expr *PrevLB;
+    /// \brief PreviousUpperBound - local variable passed to runtime in the
+    /// enclosing schedule or null if that does not apply.
+    Expr *PrevUB;
     /// \brief Counters Loop counters.
     SmallVector<Expr *, 4> Counters;
     /// \brief PrivateCounters Loop counters.
@@ -521,6 +557,8 @@
     SmallVector<Expr *, 4> Updates;
     /// \brief Final loop counter values for GodeGen.
     SmallVector<Expr *, 4> Finals;
+    /// Init statement for all captured expressions.
+    Stmt *PreInits;
 
     /// \brief Check if all the expressions are built (does not check the
     /// worksharing ones).
@@ -547,6 +585,9 @@
       EUB = nullptr;
       NLB = nullptr;
       NUB = nullptr;
+      NumIterations = nullptr;
+      PrevLB = nullptr;
+      PrevUB = nullptr;
       Counters.resize(Size);
       PrivateCounters.resize(Size);
       Inits.resize(Size);
@@ -559,6 +600,7 @@
         Updates[i] = nullptr;
         Finals[i] = nullptr;
       }
+      PreInits = nullptr;
     }
   };
 
@@ -593,55 +635,90 @@
     return const_cast<Expr *>(
         reinterpret_cast<const Expr *>(*std::next(child_begin(), IncOffset)));
   }
+  const Stmt *getPreInits() const {
+    return *std::next(child_begin(), PreInitsOffset);
+  }
+  Stmt *getPreInits() { return *std::next(child_begin(), PreInitsOffset); }
   Expr *getIsLastIterVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), IsLastIterVariableOffset)));
   }
   Expr *getLowerBoundVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), LowerBoundVariableOffset)));
   }
   Expr *getUpperBoundVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), UpperBoundVariableOffset)));
   }
   Expr *getStrideVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), StrideVariableOffset)));
   }
   Expr *getEnsureUpperBound() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), EnsureUpperBoundOffset)));
   }
   Expr *getNextLowerBound() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), NextLowerBoundOffset)));
   }
   Expr *getNextUpperBound() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), NextUpperBoundOffset)));
   }
+  Expr *getNumIterations() const {
+    assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
+           "expected worksharing loop directive");
+    return const_cast<Expr *>(reinterpret_cast<const Expr *>(
+        *std::next(child_begin(), NumIterationsOffset)));
+  }
+  Expr *getPrevLowerBoundVariable() const {
+    assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
+           "expected worksharing loop directive");
+    return const_cast<Expr *>(reinterpret_cast<const Expr *>(
+        *std::next(child_begin(), PrevLowerBoundVariableOffset)));
+  }
+  Expr *getPrevUpperBoundVariable() const {
+    assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
+           "expected worksharing loop directive");
+    return const_cast<Expr *>(reinterpret_cast<const Expr *>(
+        *std::next(child_begin(), PrevUpperBoundVariableOffset)));
+  }
   const Stmt *getBody() const {
     // This relies on the loop form is already checked by Sema.
     Stmt *Body = getAssociatedStmt()->IgnoreContainers(true);
@@ -691,7 +768,14 @@
            T->getStmtClass() == OMPParallelForSimdDirectiveClass ||
            T->getStmtClass() == OMPTaskLoopDirectiveClass ||
            T->getStmtClass() == OMPTaskLoopSimdDirectiveClass ||
-           T->getStmtClass() == OMPDistributeDirectiveClass;
+           T->getStmtClass() == OMPDistributeDirectiveClass ||
+           T->getStmtClass() == OMPTargetParallelForDirectiveClass ||
+           T->getStmtClass() == OMPDistributeParallelForDirectiveClass ||
+           T->getStmtClass() == OMPDistributeParallelForSimdDirectiveClass ||
+           T->getStmtClass() == OMPDistributeSimdDirectiveClass ||
+           T->getStmtClass() == OMPTargetParallelForSimdDirectiveClass ||
+           T->getStmtClass() == OMPTargetSimdDirectiveClass ||
+           T->getStmtClass() == OMPTeamsDistributeDirectiveClass;
   }
 };
 
@@ -2038,6 +2122,264 @@
   }
 };
 
+/// \brief This represents '#pragma omp target enter data' directive.
+///
+/// \code
+/// #pragma omp target enter data device(0) if(a) map(b[:])
+/// \endcode
+/// In this example directive '#pragma omp target enter data' has clauses
+/// 'device' with the value '0', 'if' with condition 'a' and 'map' with array
+/// section 'b[:]'.
+///
+class OMPTargetEnterDataDirective : public OMPExecutableDirective {
+  friend class ASTStmtReader;
+  /// \brief Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param NumClauses The number of clauses.
+  ///
+  OMPTargetEnterDataDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                              unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetEnterDataDirectiveClass,
+                               OMPD_target_enter_data, StartLoc, EndLoc,
+                               NumClauses, /*NumChildren=*/0) {}
+
+  /// \brief Build an empty directive.
+  ///
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetEnterDataDirective(unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetEnterDataDirectiveClass,
+                               OMPD_target_enter_data, SourceLocation(),
+                               SourceLocation(), NumClauses,
+                               /*NumChildren=*/0) {}
+
+public:
+  /// \brief Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param Clauses List of clauses.
+  ///
+  static OMPTargetEnterDataDirective *Create(const ASTContext &C,
+                                             SourceLocation StartLoc,
+                                             SourceLocation EndLoc,
+                                             ArrayRef<OMPClause *> Clauses);
+
+  /// \brief Creates an empty directive with the place for \a N clauses.
+  ///
+  /// \param C AST context.
+  /// \param N The number of clauses.
+  ///
+  static OMPTargetEnterDataDirective *CreateEmpty(const ASTContext &C,
+                                                  unsigned N, EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetEnterDataDirectiveClass;
+  }
+};
+
+/// \brief This represents '#pragma omp target exit data' directive.
+///
+/// \code
+/// #pragma omp target exit data device(0) if(a) map(b[:])
+/// \endcode
+/// In this example directive '#pragma omp target exit data' has clauses
+/// 'device' with the value '0', 'if' with condition 'a' and 'map' with array
+/// section 'b[:]'.
+///
+class OMPTargetExitDataDirective : public OMPExecutableDirective {
+  friend class ASTStmtReader;
+  /// \brief Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param NumClauses The number of clauses.
+  ///
+  OMPTargetExitDataDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                             unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetExitDataDirectiveClass,
+                               OMPD_target_exit_data, StartLoc, EndLoc,
+                               NumClauses, /*NumChildren=*/0) {}
+
+  /// \brief Build an empty directive.
+  ///
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetExitDataDirective(unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetExitDataDirectiveClass,
+                               OMPD_target_exit_data, SourceLocation(),
+                               SourceLocation(), NumClauses,
+                               /*NumChildren=*/0) {}
+
+public:
+  /// \brief Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param Clauses List of clauses.
+  ///
+  static OMPTargetExitDataDirective *Create(const ASTContext &C,
+                                            SourceLocation StartLoc,
+                                            SourceLocation EndLoc,
+                                            ArrayRef<OMPClause *> Clauses);
+
+  /// \brief Creates an empty directive with the place for \a N clauses.
+  ///
+  /// \param C AST context.
+  /// \param N The number of clauses.
+  ///
+  static OMPTargetExitDataDirective *CreateEmpty(const ASTContext &C,
+                                                 unsigned N, EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetExitDataDirectiveClass;
+  }
+};
+
+/// \brief This represents '#pragma omp target parallel' directive.
+///
+/// \code
+/// #pragma omp target parallel if(a)
+/// \endcode
+/// In this example directive '#pragma omp target parallel' has clause 'if' with
+/// condition 'a'.
+///
+class OMPTargetParallelDirective : public OMPExecutableDirective {
+  friend class ASTStmtReader;
+  /// \brief Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPTargetParallelDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                             unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetParallelDirectiveClass,
+                               OMPD_target_parallel, StartLoc, EndLoc,
+                               NumClauses, /*NumChildren=*/1) {}
+
+  /// \brief Build an empty directive.
+  ///
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetParallelDirective(unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetParallelDirectiveClass,
+                               OMPD_target_parallel, SourceLocation(),
+                               SourceLocation(), NumClauses,
+                               /*NumChildren=*/1) {}
+
+public:
+  /// \brief Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  ///
+  static OMPTargetParallelDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt);
+
+  /// \brief Creates an empty directive with the place for \a NumClauses
+  /// clauses.
+  ///
+  /// \param C AST context.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPTargetParallelDirective *
+  CreateEmpty(const ASTContext &C, unsigned NumClauses, EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetParallelDirectiveClass;
+  }
+};
+
+/// \brief This represents '#pragma omp target parallel for' directive.
+///
+/// \code
+/// #pragma omp target parallel for private(a,b) reduction(+:c,d)
+/// \endcode
+/// In this example directive '#pragma omp target parallel for' has clauses
+/// 'private' with the variables 'a' and 'b' and 'reduction' with operator '+'
+/// and variables 'c' and 'd'.
+///
+class OMPTargetParallelForDirective : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// \brief true if current region has inner cancel directive.
+  bool HasCancel;
+
+  /// \brief Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPTargetParallelForDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                                unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTargetParallelForDirectiveClass,
+                         OMPD_target_parallel_for, StartLoc, EndLoc,
+                         CollapsedNum, NumClauses),
+        HasCancel(false) {}
+
+  /// \brief Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetParallelForDirective(unsigned CollapsedNum,
+                                         unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTargetParallelForDirectiveClass,
+                         OMPD_target_parallel_for, SourceLocation(),
+                         SourceLocation(), CollapsedNum, NumClauses),
+        HasCancel(false) {}
+
+  /// \brief Set cancel state.
+  void setHasCancel(bool Has) { HasCancel = Has; }
+
+public:
+  /// \brief Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  /// \param HasCancel true if current directive has inner cancel directive.
+  ///
+  static OMPTargetParallelForDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+         Stmt *AssociatedStmt, const HelperExprs &Exprs, bool HasCancel);
+
+  /// \brief Creates an empty directive with the place
+  /// for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param NumClauses Number of clauses.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  ///
+  static OMPTargetParallelForDirective *CreateEmpty(const ASTContext &C,
+                                                    unsigned NumClauses,
+                                                    unsigned CollapsedNum,
+                                                    EmptyShell);
+
+  /// \brief Return true if current directive has inner cancel directive.
+  bool hasCancel() const { return HasCancel; }
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetParallelForDirectiveClass;
+  }
+};
+
 /// \brief This represents '#pragma omp teams' directive.
 ///
 /// \code
@@ -2394,7 +2736,7 @@
   /// \param Clauses List of clauses.
   /// \param AssociatedStmt Statement, associated with the directive.
   /// \param Exprs Helper expressions for CodeGen.
-    ///
+  ///
   static OMPDistributeDirective *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
          unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
@@ -2416,6 +2758,474 @@
   }
 };
 
+/// \brief This represents '#pragma omp target update' directive.
+///
+/// \code
+/// #pragma omp target update to(a) from(b) device(1)
+/// \endcode
+/// In this example directive '#pragma omp target update' has clause 'to' with
+/// argument 'a', clause 'from' with argument 'b' and clause 'device' with
+/// argument '1'.
+///
+class OMPTargetUpdateDirective : public OMPExecutableDirective {
+  friend class ASTStmtReader;
+  /// \brief Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param NumClauses The number of clauses.
+  ///
+  OMPTargetUpdateDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                           unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetUpdateDirectiveClass,
+                               OMPD_target_update, StartLoc, EndLoc, NumClauses,
+                               0) {}
+
+  /// \brief Build an empty directive.
+  ///
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetUpdateDirective(unsigned NumClauses)
+      : OMPExecutableDirective(this, OMPTargetUpdateDirectiveClass,
+                               OMPD_target_update, SourceLocation(),
+                               SourceLocation(), NumClauses, 0) {}
+
+public:
+  /// \brief Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param Clauses List of clauses.
+  ///
+  static OMPTargetUpdateDirective *Create(const ASTContext &C,
+                                          SourceLocation StartLoc,
+                                          SourceLocation EndLoc,
+                                          ArrayRef<OMPClause *> Clauses);
+
+  /// \brief Creates an empty directive with the place for \a NumClauses
+  /// clauses.
+  ///
+  /// \param C AST context.
+  /// \param NumClauses The number of clauses.
+  ///
+  static OMPTargetUpdateDirective *CreateEmpty(const ASTContext &C,
+                                               unsigned NumClauses, EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetUpdateDirectiveClass;
+  }
+};
+
+/// \brief This represents '#pragma omp distribute parallel for' composite
+/// directive.
+///
+/// \code
+/// #pragma omp distribute parallel for private(a,b)
+/// \endcode
+/// In this example directive '#pragma omp distribute parallel for' has clause
+/// 'private' with the variables 'a' and 'b'.
+///
+class OMPDistributeParallelForDirective : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// \brief Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPDistributeParallelForDirective(SourceLocation StartLoc,
+                                    SourceLocation EndLoc,
+                                    unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPDistributeParallelForDirectiveClass,
+                         OMPD_distribute_parallel_for, StartLoc, EndLoc,
+                         CollapsedNum, NumClauses) {}
+
+  /// \brief Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPDistributeParallelForDirective(unsigned CollapsedNum,
+                                             unsigned NumClauses)
+      : OMPLoopDirective(this, OMPDistributeParallelForDirectiveClass,
+                         OMPD_distribute_parallel_for, SourceLocation(),
+                         SourceLocation(), CollapsedNum, NumClauses) {}
+
+public:
+  /// \brief Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  ///
+  static OMPDistributeParallelForDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+         Stmt *AssociatedStmt, const HelperExprs &Exprs);
+
+  /// \brief Creates an empty directive with the place
+  /// for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPDistributeParallelForDirective *CreateEmpty(const ASTContext &C,
+                                                        unsigned NumClauses,
+                                                        unsigned CollapsedNum,
+                                                        EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPDistributeParallelForDirectiveClass;
+  }
+};
+
+/// This represents '#pragma omp distribute parallel for simd' composite
+/// directive.
+///
+/// \code
+/// #pragma omp distribute parallel for simd private(x)
+/// \endcode
+/// In this example directive '#pragma omp distribute parallel for simd' has
+/// clause 'private' with the variable 'x'.
+///
+class OMPDistributeParallelForSimdDirective final : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPDistributeParallelForSimdDirective(SourceLocation StartLoc,
+                                        SourceLocation EndLoc,
+                                        unsigned CollapsedNum,
+                                        unsigned NumClauses)
+      : OMPLoopDirective(this, OMPDistributeParallelForSimdDirectiveClass,
+                         OMPD_distribute_parallel_for_simd, StartLoc, 
+                         EndLoc, CollapsedNum, NumClauses) {}
+
+  /// Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPDistributeParallelForSimdDirective(unsigned CollapsedNum,
+                                                 unsigned NumClauses)
+      : OMPLoopDirective(this, OMPDistributeParallelForSimdDirectiveClass,
+                         OMPD_distribute_parallel_for_simd, 
+                         SourceLocation(), SourceLocation(), CollapsedNum,
+                         NumClauses) {}
+
+public:
+  /// Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  ///
+  static OMPDistributeParallelForSimdDirective *Create(
+      const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+      unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+      Stmt *AssociatedStmt, const HelperExprs &Exprs);
+
+  /// Creates an empty directive with the place for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPDistributeParallelForSimdDirective *CreateEmpty(
+      const ASTContext &C, unsigned NumClauses, unsigned CollapsedNum,
+      EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPDistributeParallelForSimdDirectiveClass;
+  }
+};
+
+/// This represents '#pragma omp distribute simd' composite directive.
+///
+/// \code
+/// #pragma omp distribute simd private(x)
+/// \endcode
+/// In this example directive '#pragma omp distribute simd' has clause
+/// 'private' with the variable 'x'.
+///
+class OMPDistributeSimdDirective final : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPDistributeSimdDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                             unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPDistributeSimdDirectiveClass,
+                         OMPD_distribute_simd, StartLoc, EndLoc, CollapsedNum,
+                         NumClauses) {}
+
+  /// Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPDistributeSimdDirective(unsigned CollapsedNum, 
+                                      unsigned NumClauses)
+      : OMPLoopDirective(this, OMPDistributeSimdDirectiveClass,
+                         OMPD_distribute_simd, SourceLocation(),
+                         SourceLocation(), CollapsedNum, NumClauses) {}
+
+public:
+  /// Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  ///
+  static OMPDistributeSimdDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+         Stmt *AssociatedStmt, const HelperExprs &Exprs);
+
+  /// Creates an empty directive with the place for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPDistributeSimdDirective *CreateEmpty(const ASTContext &C,
+                                                 unsigned NumClauses,
+                                                 unsigned CollapsedNum,
+                                                 EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPDistributeSimdDirectiveClass;
+  }
+};
+
+/// This represents '#pragma omp target parallel for simd' directive.
+///
+/// \code
+/// #pragma omp target parallel for simd private(a) map(b) safelen(c)
+/// \endcode
+/// In this example directive '#pragma omp target parallel for simd' has clauses
+/// 'private' with the variable 'a', 'map' with the variable 'b' and 'safelen'
+/// with the argument 'c'.
+///
+class OMPTargetParallelForSimdDirective final : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPTargetParallelForSimdDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                                unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTargetParallelForSimdDirectiveClass,
+                         OMPD_target_parallel_for_simd, StartLoc, EndLoc,
+                         CollapsedNum, NumClauses) {}
+
+  /// Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetParallelForSimdDirective(unsigned CollapsedNum,
+                                             unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTargetParallelForSimdDirectiveClass,
+                         OMPD_target_parallel_for_simd, SourceLocation(),
+                         SourceLocation(), CollapsedNum, NumClauses) {}
+
+public:
+  /// Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  ///
+  static OMPTargetParallelForSimdDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+         Stmt *AssociatedStmt, const HelperExprs &Exprs);
+
+  /// Creates an empty directive with the place for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPTargetParallelForSimdDirective *CreateEmpty(const ASTContext &C,
+                                                        unsigned NumClauses,
+                                                        unsigned CollapsedNum,
+                                                        EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetParallelForSimdDirectiveClass;
+  }
+};
+
+/// This represents '#pragma omp target simd' directive.
+///
+/// \code
+/// #pragma omp target simd private(a) map(b) safelen(c)
+/// \endcode
+/// In this example directive '#pragma omp target simd' has clauses 'private'
+/// with the variable 'a', 'map' with the variable 'b' and 'safelen' with
+/// the argument 'c'.
+///
+class OMPTargetSimdDirective final : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPTargetSimdDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                         unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTargetSimdDirectiveClass,
+                         OMPD_target_simd, StartLoc, EndLoc, CollapsedNum,
+                         NumClauses) {}
+
+  /// Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTargetSimdDirective(unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTargetSimdDirectiveClass, OMPD_target_simd, 
+                         SourceLocation(),SourceLocation(), CollapsedNum,
+                         NumClauses) {}
+
+public:
+  /// Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  ///
+  static OMPTargetSimdDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+         Stmt *AssociatedStmt, const HelperExprs &Exprs);
+
+  /// Creates an empty directive with the place for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPTargetSimdDirective *CreateEmpty(const ASTContext &C,
+                                             unsigned NumClauses,
+                                             unsigned CollapsedNum,
+                                             EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTargetSimdDirectiveClass;
+  }
+};
+
+/// This represents '#pragma omp teams distribute' directive.
+///
+/// \code
+/// #pragma omp teams distribute private(a,b)
+/// \endcode
+/// In this example directive '#pragma omp teams distribute' has clauses
+/// 'private' with the variables 'a' and 'b'.
+///
+class OMPTeamsDistributeDirective final : public OMPLoopDirective {
+  friend class ASTStmtReader;
+
+  /// Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  OMPTeamsDistributeDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                              unsigned CollapsedNum, unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTeamsDistributeDirectiveClass, 
+                         OMPD_teams_distribute, StartLoc, EndLoc, 
+                         CollapsedNum, NumClauses) {}
+
+  /// Build an empty directive.
+  ///
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  explicit OMPTeamsDistributeDirective(unsigned CollapsedNum,
+                                       unsigned NumClauses)
+      : OMPLoopDirective(this, OMPTeamsDistributeDirectiveClass,
+                         OMPD_teams_distribute, SourceLocation(),
+                         SourceLocation(), CollapsedNum, NumClauses) {}
+
+public:
+  /// Creates directive with a list of \a Clauses.
+  ///
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending Location of the directive.
+  /// \param CollapsedNum Number of collapsed loops.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement, associated with the directive.
+  /// \param Exprs Helper expressions for CodeGen.
+  ///
+  static OMPTeamsDistributeDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses,
+         Stmt *AssociatedStmt, const HelperExprs &Exprs);
+
+  /// Creates an empty directive with the place for \a NumClauses clauses.
+  ///
+  /// \param C AST context.
+  /// \param CollapsedNum Number of collapsed nested loops.
+  /// \param NumClauses Number of clauses.
+  ///
+  static OMPTeamsDistributeDirective *CreateEmpty(const ASTContext &C,
+                                                  unsigned NumClauses,
+                                                  unsigned CollapsedNum,
+                                                  EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTeamsDistributeDirectiveClass;
+  }
+};
+
 } // end namespace clang
 
 #endif
diff --git a/include/clang/AST/TemplateName.h b/include/clang/AST/TemplateName.h
index 3e10d2f..bf4d008 100644
--- a/include/clang/AST/TemplateName.h
+++ b/include/clang/AST/TemplateName.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_AST_TEMPLATENAME_H
 #define LLVM_CLANG_AST_TEMPLATENAME_H
 
+#include "clang/AST/NestedNameSpecifier.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/PointerUnion.h"
diff --git a/include/clang/AST/Type.h b/include/clang/AST/Type.h
index e339716..8bb7a62 100644
--- a/include/clang/AST/Type.h
+++ b/include/clang/AST/Type.h
@@ -111,6 +111,7 @@
 /// The collection of all-type qualifiers we support.
 /// Clang supports five independent qualifiers:
 /// * C99: const, volatile, and restrict
+/// * MS: __unaligned
 /// * Embedded C (TR18037): address spaces
 /// * Objective C: the GC attributes (none, weak, or strong)
 class Qualifiers {
@@ -152,8 +153,8 @@
 
   enum {
     /// The maximum supported address space number.
-    /// 24 bits should be enough for anyone.
-    MaxAddressSpace = 0xffffffu,
+    /// 23 bits should be enough for anyone.
+    MaxAddressSpace = 0x7fffffu,
 
     /// The width of the "fast" qualifier mask.
     FastWidth = 3,
@@ -214,6 +215,12 @@
     return Qs;
   }
 
+  static Qualifiers fromCVRUMask(unsigned CVRU) {
+    Qualifiers Qs;
+    Qs.addCVRUQualifiers(CVRU);
+    return Qs;
+  }
+
   // Deserialize qualifiers from an opaque representation.
   static Qualifiers fromOpaqueValue(unsigned opaque) {
     Qualifiers Qs;
@@ -264,6 +271,17 @@
     assert(!(mask & ~CVRMask) && "bitmask contains non-CVR bits");
     Mask |= mask;
   }
+  void addCVRUQualifiers(unsigned mask) {
+    assert(!(mask & ~CVRMask & ~UMask) && "bitmask contains non-CVRU bits");
+    Mask |= mask;
+  }
+
+  bool hasUnaligned() const { return Mask & UMask; }
+  void setUnaligned(bool flag) {
+    Mask = (Mask & ~UMask) | (flag ? UMask : 0);
+  }
+  void removeUnaligned() { Mask &= ~UMask; }
+  void addUnaligned() { Mask |= UMask; }
 
   bool hasObjCGCAttr() const { return Mask & GCAttrMask; }
   GC getObjCGCAttr() const { return GC((Mask & GCAttrMask) >> GCAttrShift); }
@@ -433,7 +451,9 @@
            // ObjC lifetime qualifiers must match exactly.
            getObjCLifetime() == other.getObjCLifetime() &&
            // CVR qualifiers may subset.
-           (((Mask & CVRMask) | (other.Mask & CVRMask)) == (Mask & CVRMask));
+           (((Mask & CVRMask) | (other.Mask & CVRMask)) == (Mask & CVRMask)) &&
+           // U qualifier may subset ('other' has it only if we do).
+           (!other.hasUnaligned() || hasUnaligned());
   }
 
   /// \brief Determines if these qualifiers compatibly include another set of
@@ -501,16 +521,19 @@
 
 private:
 
-  // bits:     |0 1 2|3 .. 4|5  ..  7|8   ...   31|
-  //           |C R V|GCAttr|Lifetime|AddressSpace|
+  // bits:     |0 1 2|3|4 .. 5|6  ..  8|9   ...   31|
+  //           |C R V|U|GCAttr|Lifetime|AddressSpace|
   uint32_t Mask;
 
-  static const uint32_t GCAttrMask = 0x18;
-  static const uint32_t GCAttrShift = 3;
-  static const uint32_t LifetimeMask = 0xE0;
-  static const uint32_t LifetimeShift = 5;
-  static const uint32_t AddressSpaceMask = ~(CVRMask|GCAttrMask|LifetimeMask);
-  static const uint32_t AddressSpaceShift = 8;
+  static const uint32_t UMask = 0x8;
+  static const uint32_t UShift = 3;
+  static const uint32_t GCAttrMask = 0x30;
+  static const uint32_t GCAttrShift = 4;
+  static const uint32_t LifetimeMask = 0x1C0;
+  static const uint32_t LifetimeShift = 6;
+  static const uint32_t AddressSpaceMask =
+      ~(CVRMask | UMask | GCAttrMask | LifetimeMask);
+  static const uint32_t AddressSpaceShift = 9;
 };
 
 /// A std::pair-like structure for storing a qualified type split
@@ -709,27 +732,27 @@
   /// applied to this type.
   unsigned getCVRQualifiers() const;
 
-  bool isConstant(ASTContext& Ctx) const {
+  bool isConstant(const ASTContext& Ctx) const {
     return QualType::isConstant(*this, Ctx);
   }
 
   /// \brief Determine whether this is a Plain Old Data (POD) type (C++ 3.9p10).
-  bool isPODType(ASTContext &Context) const;
+  bool isPODType(const ASTContext &Context) const;
 
   /// Return true if this is a POD type according to the rules of the C++98
   /// standard, regardless of the current compilation's language.
-  bool isCXX98PODType(ASTContext &Context) const;
+  bool isCXX98PODType(const ASTContext &Context) const;
 
   /// Return true if this is a POD type according to the more relaxed rules
   /// of the C++11 standard, regardless of the current compilation's language.
   /// (C++0x [basic.types]p9)
-  bool isCXX11PODType(ASTContext &Context) const;
+  bool isCXX11PODType(const ASTContext &Context) const;
 
   /// Return true if this is a trivial type per (C++0x [basic.types]p9)
-  bool isTrivialType(ASTContext &Context) const;
+  bool isTrivialType(const ASTContext &Context) const;
 
   /// Return true if this is a trivially copyable type (C++0x [basic.types]p9)
-  bool isTriviallyCopyableType(ASTContext &Context) const;
+  bool isTriviallyCopyableType(const ASTContext &Context) const;
 
   // Don't promise in the API that anything besides 'const' can be
   // easily added.
@@ -909,16 +932,19 @@
   std::string getAsString(const PrintingPolicy &Policy) const;
 
   void print(raw_ostream &OS, const PrintingPolicy &Policy,
-             const Twine &PlaceHolder = Twine()) const {
-    print(split(), OS, Policy, PlaceHolder);
+             const Twine &PlaceHolder = Twine(),
+             unsigned Indentation = 0) const {
+    print(split(), OS, Policy, PlaceHolder, Indentation);
   }
   static void print(SplitQualType split, raw_ostream &OS,
-                    const PrintingPolicy &policy, const Twine &PlaceHolder) {
-    return print(split.Ty, split.Quals, OS, policy, PlaceHolder);
+                    const PrintingPolicy &policy, const Twine &PlaceHolder,
+                    unsigned Indentation = 0) {
+    return print(split.Ty, split.Quals, OS, policy, PlaceHolder, Indentation);
   }
   static void print(const Type *ty, Qualifiers qs,
                     raw_ostream &OS, const PrintingPolicy &policy,
-                    const Twine &PlaceHolder);
+                    const Twine &PlaceHolder,
+                    unsigned Indentation = 0);
 
   void getAsStringInternal(std::string &Str,
                            const PrintingPolicy &Policy) const {
@@ -936,21 +962,24 @@
     const QualType &T;
     const PrintingPolicy &Policy;
     const Twine &PlaceHolder;
+    unsigned Indentation;
   public:
     StreamedQualTypeHelper(const QualType &T, const PrintingPolicy &Policy,
-                           const Twine &PlaceHolder)
-      : T(T), Policy(Policy), PlaceHolder(PlaceHolder) { }
+                           const Twine &PlaceHolder, unsigned Indentation)
+      : T(T), Policy(Policy), PlaceHolder(PlaceHolder),
+        Indentation(Indentation) { }
 
     friend raw_ostream &operator<<(raw_ostream &OS,
                                    const StreamedQualTypeHelper &SQT) {
-      SQT.T.print(OS, SQT.Policy, SQT.PlaceHolder);
+      SQT.T.print(OS, SQT.Policy, SQT.PlaceHolder, SQT.Indentation);
       return OS;
     }
   };
 
   StreamedQualTypeHelper stream(const PrintingPolicy &Policy,
-                                const Twine &PlaceHolder = Twine()) const {
-    return StreamedQualTypeHelper(*this, Policy, PlaceHolder);
+                                const Twine &PlaceHolder = Twine(),
+                                unsigned Indentation = 0) const {
+    return StreamedQualTypeHelper(*this, Policy, PlaceHolder, Indentation);
   }
 
   void dump(const char *s) const;
@@ -1068,7 +1097,7 @@
   // These methods are implemented in a separate translation unit;
   // "static"-ize them to avoid creating temporary QualTypes in the
   // caller.
-  static bool isConstant(QualType T, ASTContext& Ctx);
+  static bool isConstant(QualType T, const ASTContext& Ctx);
   static QualType getDesugaredType(QualType T, const ASTContext &Context);
   static SplitQualType getSplitDesugaredType(QualType T);
   static SplitQualType getSplitUnqualifiedTypeImpl(QualType type);
@@ -1356,7 +1385,7 @@
     ///
     /// C++ 8.3.5p4: The return type, the parameter type list and the
     /// cv-qualifier-seq, [...], are part of the function type.
-    unsigned TypeQuals : 3;
+    unsigned TypeQuals : 4;
 
     /// \brief The ref-qualifier associated with a \c FunctionProtoType.
     ///
@@ -1603,7 +1632,7 @@
   bool isChar16Type() const;
   bool isChar32Type() const;
   bool isAnyCharacterType() const;
-  bool isIntegralType(ASTContext &Ctx) const;
+  bool isIntegralType(const ASTContext &Ctx) const;
 
   /// Determine whether this type is an integral or enumeration type.
   bool isIntegralOrEnumerationType() const;
@@ -1702,18 +1731,9 @@
   bool isNullPtrType() const;                   // C++0x nullptr_t
   bool isAtomicType() const;                    // C11 _Atomic()
 
-  bool isImage1dT() const;               // OpenCL image1d_t
-  bool isImage1dArrayT() const;          // OpenCL image1d_array_t
-  bool isImage1dBufferT() const;         // OpenCL image1d_buffer_t
-  bool isImage2dT() const;               // OpenCL image2d_t
-  bool isImage2dArrayT() const;          // OpenCL image2d_array_t
-  bool isImage2dDepthT() const;          // OpenCL image_2d_depth_t
-  bool isImage2dArrayDepthT() const;     // OpenCL image_2d_array_depth_t
-  bool isImage2dMSAAT() const;           // OpenCL image_2d_msaa_t
-  bool isImage2dArrayMSAAT() const;      // OpenCL image_2d_array_msaa_t
-  bool isImage2dMSAATDepth() const;      // OpenCL image_2d_msaa_depth_t
-  bool isImage2dArrayMSAATDepth() const; // OpenCL image_2d_array_msaa_depth_t
-  bool isImage3dT() const;               // OpenCL image3d_t
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  bool is##Id##Type() const;
+#include "clang/Basic/OpenCLImageTypes.def"
 
   bool isImageType() const;                     // Any OpenCL image type
 
@@ -1878,6 +1898,11 @@
   /// This should never be used when type qualifiers are meaningful.
   const Type *getArrayElementTypeNoTypeQual() const;
 
+  /// If this is a pointer type, return the pointee type.
+  /// If this is an array type, return the array element type.
+  /// This should never be used when type qualifiers are meaningful.
+  const Type *getPointeeOrArrayElementType() const;
+
   /// If this is a pointer, ObjC object pointer, or block
   /// pointer, this returns the respective pointee.
   QualType getPointeeType() const;
@@ -2014,6 +2039,10 @@
 class BuiltinType : public Type {
 public:
   enum Kind {
+// OpenCL image types
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) Id,
+#include "clang/Basic/OpenCLImageTypes.def"
+// All other builtin types
 #define BUILTIN_TYPE(Id, SingletonId) Id,
 #define LAST_BUILTIN_TYPE(Id) LastKind = Id
 #include "clang/AST/BuiltinTypes.def"
@@ -2053,7 +2082,7 @@
   }
 
   bool isFloatingPoint() const {
-    return getKind() >= Half && getKind() <= LongDouble;
+    return getKind() >= Half && getKind() <= Float128;
   }
 
   /// Determines whether the given kind corresponds to a placeholder type.
@@ -2502,13 +2531,13 @@
 
   /// \brief Determine the number of bits required to address a member of
   // an array with the given element type and number of elements.
-  static unsigned getNumAddressingBits(ASTContext &Context,
+  static unsigned getNumAddressingBits(const ASTContext &Context,
                                        QualType ElementType,
                                        const llvm::APInt &NumElements);
 
   /// \brief Determine the maximum number of active bits that an array's size
   /// can require, which limits the maximum size of the array.
-  static unsigned getMaxSizeBits(ASTContext &Context);
+  static unsigned getMaxSizeBits(const ASTContext &Context);
 
   void Profile(llvm::FoldingSetNodeID &ID) {
     Profile(ID, getElementType(), getSize(),
@@ -2826,13 +2855,15 @@
     }
   }
 
-  static int getAccessorIdx(char c) {
-    if (int idx = getPointAccessorIdx(c)+1) return idx-1;
-    return getNumericAccessorIdx(c);
+  static int getAccessorIdx(char c, bool isNumericAccessor) {
+    if (isNumericAccessor)
+      return getNumericAccessorIdx(c);
+    else
+      return getPointAccessorIdx(c);
   }
 
-  bool isAccessorWithinNumElements(char c) const {
-    if (int idx = getAccessorIdx(c)+1)
+  bool isAccessorWithinNumElements(char c, bool isNumericAccessor) const {
+    if (int idx = getAccessorIdx(c, isNumericAccessor)+1)
       return unsigned(idx-1) < getNumElements();
     return false;
   }
@@ -2993,7 +3024,7 @@
 
   /// \brief Determine the type of an expression that calls a function of
   /// this type.
-  QualType getCallResultType(ASTContext &Context) const {
+  QualType getCallResultType(const ASTContext &Context) const {
     return getReturnType().getNonLValueExprType(Context);
   }
 
@@ -3630,6 +3661,28 @@
   }
 };
 
+/// \brief Internal representation of canonical, dependent
+/// __underlying_type(type) types.
+///
+/// This class is used internally by the ASTContext to manage
+/// canonical, dependent types, only. Clients will only see instances
+/// of this class via UnaryTransformType nodes.
+class DependentUnaryTransformType : public UnaryTransformType,
+                                    public llvm::FoldingSetNode {
+public:
+  DependentUnaryTransformType(const ASTContext &C, QualType BaseType,
+                              UTTKind UKind);
+  void Profile(llvm::FoldingSetNodeID &ID) {
+    Profile(ID, getBaseType(), getUTTKind());
+  }
+
+  static void Profile(llvm::FoldingSetNodeID &ID, QualType BaseType,
+                      UTTKind UKind) {
+    ID.AddPointer(BaseType.getAsOpaquePtr());
+    ID.AddInteger((unsigned)UKind);
+  }
+};
+
 class TagType : public Type {
   /// Stores the TagDecl associated with this type. The decl may point to any
   /// TagDecl that declares the entity.
@@ -4117,19 +4170,18 @@
   unsigned NumArgs : 31;
 
   /// Whether this template specialization type is a substituted type alias.
-  bool TypeAlias : 1;
+  unsigned TypeAlias : 1;
 
   TemplateSpecializationType(TemplateName T,
-                             const TemplateArgument *Args,
-                             unsigned NumArgs, QualType Canon,
+                             ArrayRef<TemplateArgument> Args,
+                             QualType Canon,
                              QualType Aliased);
 
   friend class ASTContext;  // ASTContext creates these
 
 public:
   /// Determine whether any of the given template arguments are dependent.
-  static bool anyDependentTemplateArguments(const TemplateArgumentLoc *Args,
-                                            unsigned NumArgs,
+  static bool anyDependentTemplateArguments(ArrayRef<TemplateArgumentLoc> Args,
                                             bool &InstantiationDependent);
 
   static bool anyDependentTemplateArguments(const TemplateArgumentListInfo &,
@@ -4138,14 +4190,12 @@
   /// \brief Print a template argument list, including the '<' and '>'
   /// enclosing the template arguments.
   static void PrintTemplateArgumentList(raw_ostream &OS,
-                                        const TemplateArgument *Args,
-                                        unsigned NumArgs,
+                                        ArrayRef<TemplateArgument> Args,
                                         const PrintingPolicy &Policy,
                                         bool SkipBrackets = false);
 
   static void PrintTemplateArgumentList(raw_ostream &OS,
-                                        const TemplateArgumentLoc *Args,
-                                        unsigned NumArgs,
+                                        ArrayRef<TemplateArgumentLoc> Args,
                                         const PrintingPolicy &Policy);
 
   static void PrintTemplateArgumentList(raw_ostream &OS,
@@ -4202,20 +4252,23 @@
   /// \pre \c isArgType(Arg)
   const TemplateArgument &getArg(unsigned Idx) const; // in TemplateBase.h
 
+  ArrayRef<TemplateArgument> template_arguments() const {
+    return {getArgs(), NumArgs};
+  }
+
   bool isSugared() const {
     return !isDependentType() || isCurrentInstantiation() || isTypeAlias();
   }
   QualType desugar() const { return getCanonicalTypeInternal(); }
 
   void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Ctx) {
-    Profile(ID, Template, getArgs(), NumArgs, Ctx);
+    Profile(ID, Template, template_arguments(), Ctx);
     if (isTypeAlias())
       getAliasedType().Profile(ID);
   }
 
   static void Profile(llvm::FoldingSetNodeID &ID, TemplateName T,
-                      const TemplateArgument *Args,
-                      unsigned NumArgs,
+                      ArrayRef<TemplateArgument> Args,
                       const ASTContext &Context);
 
   static bool classof(const Type *T) {
@@ -4258,6 +4311,8 @@
   friend class ASTReader; // FIXME: ASTContext::getInjectedClassNameType is not
                           // currently suitable for AST reading, too much
                           // interdependencies.
+  friend class ASTNodeImporter;
+
   InjectedClassNameType(CXXRecordDecl *D, QualType TST)
     : Type(InjectedClassName, QualType(), /*Dependent=*/true,
            /*InstantiationDependent=*/true,
@@ -4517,8 +4572,7 @@
   DependentTemplateSpecializationType(ElaboratedTypeKeyword Keyword,
                                       NestedNameSpecifier *NNS,
                                       const IdentifierInfo *Name,
-                                      unsigned NumArgs,
-                                      const TemplateArgument *Args,
+                                      ArrayRef<TemplateArgument> Args,
                                       QualType Canon);
 
   friend class ASTContext;  // ASTContext creates these
@@ -4537,6 +4591,10 @@
 
   const TemplateArgument &getArg(unsigned Idx) const; // in TemplateBase.h
 
+  ArrayRef<TemplateArgument> template_arguments() const {
+    return {getArgs(), NumArgs};
+  }
+
   typedef const TemplateArgument * iterator;
   iterator begin() const { return getArgs(); }
   iterator end() const; // inline in TemplateBase.h
@@ -4545,7 +4603,7 @@
   QualType desugar() const { return QualType(this, 0); }
 
   void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
-    Profile(ID, Context, getKeyword(), NNS, Name, NumArgs, getArgs());
+    Profile(ID, Context, getKeyword(), NNS, Name, {getArgs(), NumArgs});
   }
 
   static void Profile(llvm::FoldingSetNodeID &ID,
@@ -4553,8 +4611,7 @@
                       ElaboratedTypeKeyword Keyword,
                       NestedNameSpecifier *Qualifier,
                       const IdentifierInfo *Name,
-                      unsigned NumArgs,
-                      const TemplateArgument *Args);
+                      ArrayRef<TemplateArgument> Args);
 
   static bool classof(const Type *T) {
     return T->getTypeClass() == DependentTemplateSpecialization;
@@ -5309,7 +5366,8 @@
 
 inline void QualType::removeLocalCVRQualifiers(unsigned Mask) {
   assert(!(Mask & ~Qualifiers::CVRMask) && "mask has non-CVR bits");
-  assert((int)Qualifiers::CVRMask == (int)Qualifiers::FastMask);
+  static_assert((int)Qualifiers::CVRMask == (int)Qualifiers::FastMask,
+                "Fast bits differ from CVR bits!");
 
   // Fast path: we don't need to touch the slow qualifiers.
   removeLocalFastQualifiers(Mask);
@@ -5345,9 +5403,9 @@
 /// "int". However, it is not more qualified than "const volatile
 /// int".
 inline bool QualType::isMoreQualifiedThan(QualType other) const {
-  Qualifiers myQuals = getQualifiers();
-  Qualifiers otherQuals = other.getQualifiers();
-  return (myQuals != otherQuals && myQuals.compatiblyIncludes(otherQuals));
+  Qualifiers MyQuals = getQualifiers();
+  Qualifiers OtherQuals = other.getQualifiers();
+  return (MyQuals != OtherQuals && MyQuals.compatiblyIncludes(OtherQuals));
 }
 
 /// Determine whether this type is at last
@@ -5355,7 +5413,13 @@
 /// int" is at least as qualified as "const int", "volatile int",
 /// "int", and "const volatile int".
 inline bool QualType::isAtLeastAsQualifiedAs(QualType other) const {
-  return getQualifiers().compatiblyIncludes(other.getQualifiers());
+  Qualifiers OtherQuals = other.getQualifiers();
+
+  // Ignore __unaligned qualifier if this type is a void.
+  if (getUnqualifiedType()->isVoidType())
+    OtherQuals.removeUnaligned();
+
+  return getQualifiers().compatiblyIncludes(OtherQuals);
 }
 
 /// If Type is a reference type (e.g., const
@@ -5532,53 +5596,11 @@
   return isObjCIdType() || isObjCClassType() || isObjCSelType();
 }
 
-inline bool Type::isImage1dT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage1d);
-}
-
-inline bool Type::isImage1dArrayT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage1dArray);
-}
-
-inline bool Type::isImage1dBufferT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage1dBuffer);
-}
-
-inline bool Type::isImage2dT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2d);
-}
-
-inline bool Type::isImage2dArrayT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dArray);
-}
-
-inline bool Type::isImage2dDepthT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dDepth);
-}
-
-inline bool Type::isImage2dArrayDepthT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dArrayDepth);
-}
-
-inline bool Type::isImage2dMSAAT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dMSAA);
-}
-
-inline bool Type::isImage2dArrayMSAAT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dArrayMSAA);
-}
-
-inline bool Type::isImage2dMSAATDepth() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dMSAADepth);
-}
-
-inline bool Type::isImage2dArrayMSAATDepth() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage2dArrayMSAADepth);
-}
-
-inline bool Type::isImage3dT() const {
-  return isSpecificBuiltinType(BuiltinType::OCLImage3d);
-}
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  inline bool Type::is##Id##Type() const { \
+    return isSpecificBuiltinType(BuiltinType::Id); \
+  }
+#include "clang/Basic/OpenCLImageTypes.def"
 
 inline bool Type::isSamplerT() const {
   return isSpecificBuiltinType(BuiltinType::OCLSampler);
@@ -5605,11 +5627,10 @@
 }
 
 inline bool Type::isImageType() const {
-  return isImage3dT() || isImage2dT() || isImage2dArrayT() ||
-         isImage2dDepthT() || isImage2dArrayDepthT() || isImage2dMSAAT() ||
-         isImage2dArrayMSAAT() || isImage2dMSAATDepth() ||
-         isImage2dArrayMSAATDepth() || isImage1dT() || isImage1dArrayT() ||
-         isImage1dBufferT();
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) is##Id##Type() ||
+  return
+#include "clang/Basic/OpenCLImageTypes.def"
+      0; // end boolean or operation
 }
 
 inline bool Type::isPipeType() const {
@@ -5759,6 +5780,15 @@
   return type;
 }
 
+inline const Type *Type::getPointeeOrArrayElementType() const {
+  const Type *type = this;
+  if (type->isAnyPointerType())
+    return type->getPointeeType().getTypePtr();
+  else if (type->isArrayType())
+    return type->getBaseElementTypeUnsafe();
+  return type;
+}
+
 /// Insertion operator for diagnostics.  This allows sending QualType's into a
 /// diagnostic with <<.
 inline const DiagnosticBuilder &operator<<(const DiagnosticBuilder &DB,
diff --git a/include/clang/AST/TypeLoc.h b/include/clang/AST/TypeLoc.h
index ba3f848..67adf4a 100644
--- a/include/clang/AST/TypeLoc.h
+++ b/include/clang/AST/TypeLoc.h
@@ -538,7 +538,7 @@
   bool needsExtraLocalData() const {
     BuiltinType::Kind bk = getTypePtr()->getKind();
     return (bk >= BuiltinType::UShort && bk <= BuiltinType::UInt128)
-      || (bk >= BuiltinType::Short && bk <= BuiltinType::LongDouble)
+      || (bk >= BuiltinType::Short && bk <= BuiltinType::Float128)
       || bk == BuiltinType::UChar
       || bk == BuiltinType::SChar;
   }
diff --git a/include/clang/AST/UnresolvedSet.h b/include/clang/AST/UnresolvedSet.h
index 26ee1cf..b63c6eb 100644
--- a/include/clang/AST/UnresolvedSet.h
+++ b/include/clang/AST/UnresolvedSet.h
@@ -17,7 +17,6 @@
 
 #include "clang/AST/DeclAccessPair.h"
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator.h"
 
@@ -39,7 +38,9 @@
       : iterator_adaptor_base(const_cast<DeclAccessPair *>(Iter)) {}
 
 public:
-  UnresolvedSetIterator() {}
+  // Work around a bug in MSVC 2013 where explicitly default constructed
+  // temporaries with defaulted ctors are not zero initialized.
+  UnresolvedSetIterator() : iterator_adaptor_base(nullptr) {}
 
   NamedDecl *getDecl() const { return I->getDecl(); }
   void setDecl(NamedDecl *ND) const { return I->setDecl(ND); }
@@ -59,8 +60,13 @@
   // UnresolvedSet.
 private:
   template <unsigned N> friend class UnresolvedSet;
-  UnresolvedSetImpl() {}
-  UnresolvedSetImpl(const UnresolvedSetImpl &) {}
+  UnresolvedSetImpl() = default;
+  UnresolvedSetImpl(const UnresolvedSetImpl &) = default;
+  UnresolvedSetImpl &operator=(const UnresolvedSetImpl &) = default;
+
+  // FIXME: Switch these to "= default" once MSVC supports generating move ops
+  UnresolvedSetImpl(UnresolvedSetImpl &&) {}
+  UnresolvedSetImpl &operator=(UnresolvedSetImpl &&) { return *this; }
 
 public:
   // We don't currently support assignment through this iterator, so we might
diff --git a/include/clang/AST/VTTBuilder.h b/include/clang/AST/VTTBuilder.h
index 727bf51..b4a6fe3 100644
--- a/include/clang/AST/VTTBuilder.h
+++ b/include/clang/AST/VTTBuilder.h
@@ -20,7 +20,6 @@
 #include "clang/AST/GlobalDecl.h"
 #include "clang/AST/RecordLayout.h"
 #include "clang/Basic/ABI.h"
-#include "llvm/ADT/SetVector.h"
 #include <utility>
 
 namespace clang {
diff --git a/include/clang/AST/VTableBuilder.h b/include/clang/AST/VTableBuilder.h
index 481fd11..28ec4b8 100644
--- a/include/clang/AST/VTableBuilder.h
+++ b/include/clang/AST/VTableBuilder.h
@@ -20,7 +20,6 @@
 #include "clang/AST/RecordLayout.h"
 #include "clang/Basic/ABI.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SetVector.h"
 #include <memory>
 #include <utility>
 
@@ -399,21 +398,21 @@
   typedef SmallVector<const CXXRecordDecl *, 1> BasePath;
 
   VPtrInfo(const CXXRecordDecl *RD)
-      : ReusingBase(RD), BaseWithVPtr(RD), NextBaseToMangle(RD) {}
+      : ObjectWithVPtr(RD), IntroducingObject(RD), NextBaseToMangle(RD) {}
 
-  /// The vtable will hold all of the virtual bases or virtual methods of
-  /// ReusingBase.  This may or may not be the same class as VPtrSubobject.Base.
-  /// A derived class will reuse the vptr of the first non-virtual base
-  /// subobject that has one.
-  const CXXRecordDecl *ReusingBase;
+  /// This is the most derived class that has this vptr at offset zero. When
+  /// single inheritance is used, this is always the most derived class. If
+  /// multiple inheritance is used, it may be any direct or indirect base.
+  const CXXRecordDecl *ObjectWithVPtr;
 
-  /// BaseWithVPtr is at this offset from its containing complete object or
+  /// This is the class that introduced the vptr by declaring new virtual
+  /// methods or virtual bases.
+  const CXXRecordDecl *IntroducingObject;
+
+  /// IntroducingObject is at this offset from its containing complete object or
   /// virtual base.
   CharUnits NonVirtualOffset;
 
-  /// The vptr is stored inside this subobject.
-  const CXXRecordDecl *BaseWithVPtr;
-
   /// The bases from the inheritance path that got used to mangle the vbtable
   /// name.  This is not really a full path like a CXXBasePath.  It holds the
   /// subset of records that need to be mangled into the vbtable symbol name in
@@ -432,7 +431,7 @@
   /// This holds the base classes path from the complete type to the first base
   /// with the given vfptr offset, in the base-to-derived order.  Only used for
   /// vftables.
-  BasePath PathToBaseWithVPtr;
+  BasePath PathToIntroducingObject;
 
   /// Static offset from the top of the most derived class to this vfptr,
   /// including any virtual base offset.  Only used for vftables.
diff --git a/include/clang/ASTMatchers/ASTMatchFinder.h b/include/clang/ASTMatchers/ASTMatchFinder.h
index 92ec92c..2c9a377 100644
--- a/include/clang/ASTMatchers/ASTMatchFinder.h
+++ b/include/clang/ASTMatchers/ASTMatchFinder.h
@@ -229,6 +229,10 @@
 /// Multiple results occur when using matchers like \c forEachDescendant,
 /// which generate a result for each sub-match.
 ///
+/// If you want to find all matches on the sub-tree rooted at \c Node (rather
+/// than only the matches on \c Node itself), surround the \c Matcher with a
+/// \c findAll().
+///
 /// \see selectFirst
 /// @{
 template <typename MatcherT, typename NodeT>
@@ -241,6 +245,11 @@
       ASTContext &Context);
 /// @}
 
+/// \brief Returns the results of matching \p Matcher on the translation unit of
+/// \p Context and collects the \c BoundNodes of all callback invocations.
+template <typename MatcherT>
+SmallVector<BoundNodes, 1> match(MatcherT Matcher, ASTContext &Context);
+
 /// \brief Returns the first result of type \c NodeT bound to \p BoundTo.
 ///
 /// Returns \c NULL if there is no match, or if the matching node cannot be
@@ -288,6 +297,16 @@
   return match(Matcher, ast_type_traits::DynTypedNode::create(Node), Context);
 }
 
+template <typename MatcherT>
+SmallVector<BoundNodes, 1>
+match(MatcherT Matcher, ASTContext &Context) {
+  internal::CollectMatchesCallback Callback;
+  MatchFinder Finder;
+  Finder.addMatcher(Matcher, &Callback);
+  Finder.matchAST(Context);
+  return std::move(Callback.Nodes);
+}
+
 } // end namespace ast_matchers
 } // end namespace clang
 
diff --git a/include/clang/ASTMatchers/ASTMatchers.h b/include/clang/ASTMatchers/ASTMatchers.h
index f0d8227..6f194e2 100644
--- a/include/clang/ASTMatchers/ASTMatchers.h
+++ b/include/clang/ASTMatchers/ASTMatchers.h
@@ -51,7 +51,6 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/ASTMatchers/ASTMatchersInternal.h"
 #include "clang/ASTMatchers/ASTMatchersMacros.h"
-#include "llvm/ADT/Twine.h"
 #include "llvm/Support/Regex.h"
 #include <iterator>
 
@@ -163,11 +162,35 @@
 /// Given
 /// \code
 ///   typedef int X;
+///   using Y = int;
 /// \endcode
 /// typedefDecl()
-///   matches "typedef int X"
+///   matches "typedef int X", but not "using Y = int"
 const internal::VariadicDynCastAllOfMatcher<Decl, TypedefDecl> typedefDecl;
 
+/// \brief Matches typedef name declarations.
+///
+/// Given
+/// \code
+///   typedef int X;
+///   using Y = int;
+/// \endcode
+/// typedefNameDecl()
+///   matches "typedef int X" and "using Y = int"
+const internal::VariadicDynCastAllOfMatcher<Decl, TypedefNameDecl>
+    typedefNameDecl;
+
+/// \brief Matches type alias declarations.
+///
+/// Given
+/// \code
+///   typedef int X;
+///   using Y = int;
+/// \endcode
+/// typeAliasDecl()
+///   matches "using Y = int", but not "typedef int X"
+const internal::VariadicDynCastAllOfMatcher<Decl, TypeAliasDecl> typeAliasDecl;
+
 /// \brief Matches AST nodes that were expanded within the main-file.
 ///
 /// Example matches X but not Y
@@ -282,6 +305,17 @@
 /// \endcode
 const internal::VariadicDynCastAllOfMatcher<Decl, NamedDecl> namedDecl;
 
+/// \brief Matches a declaration of a label.
+///
+/// Given
+/// \code
+///   goto FOO;
+///   FOO: bar();
+/// \endcode
+/// labelDecl()
+///   matches 'FOO:'
+const internal::VariadicDynCastAllOfMatcher<Decl, LabelDecl> labelDecl;
+
 /// \brief Matches a declaration of a namespace.
 ///
 /// Given
@@ -412,6 +446,17 @@
 ///   matches 'int' in C<int>.
 const internal::VariadicAllOfMatcher<TemplateArgument> templateArgument;
 
+/// \brief Matches template name.
+///
+/// Given
+/// \code
+///   template <typename T> class X { };
+///   X<int> xi;
+/// \endcode
+/// templateName()
+///   matches 'X' in X<int>.
+const internal::VariadicAllOfMatcher<TemplateName> templateName;
+
 /// \brief Matches non-type template parameter declarations.
 ///
 /// Given
@@ -447,7 +492,7 @@
 ///   };
 /// \endcode
 /// fieldDecl(isPublic())
-///   matches 'int a;' 
+///   matches 'int a;'
 AST_MATCHER(Decl, isPublic) {
   return Node.getAccess() == AS_public;
 }
@@ -463,7 +508,7 @@
 ///   };
 /// \endcode
 /// fieldDecl(isProtected())
-///   matches 'int b;' 
+///   matches 'int b;'
 AST_MATCHER(Decl, isProtected) {
   return Node.getAccess() == AS_protected;
 }
@@ -479,33 +524,75 @@
 ///   };
 /// \endcode
 /// fieldDecl(isPrivate())
-///   matches 'int c;' 
+///   matches 'int c;'
 AST_MATCHER(Decl, isPrivate) {
   return Node.getAccess() == AS_private;
 }
 
+/// \brief Matches non-static data members that are bit-fields.
+///
+/// Given
+/// \code
+///   class C {
+///     int a : 2;
+///     int b;
+///   };
+/// \endcode
+/// fieldDecl(isBitField())
+///   matches 'int a;' but not 'int b;'.
+AST_MATCHER(FieldDecl, isBitField) {
+  return Node.isBitField();
+}
+
+/// \brief Matches non-static bit-field data members of the specified width.
+///
+/// Given
+/// \code
+///   class C {
+///     int a : 2;
+///     int b : 4;
+///     int c : 2;
+///   };
+/// \endcode
+/// fieldDecl(hasBitWidth(2))
+///   matches 'int a;' and 'int c;' but not 'int b;'.
+AST_MATCHER_P(FieldDecl, hasBitWidth, unsigned, Width) {
+  return Node.isBitField() &&
+         Node.getBitWidthValue(Finder->getASTContext()) == Width;
+}
+
 /// \brief Matches a declaration that has been implicitly added
 /// by the compiler (eg. implicit default/copy constructors).
 AST_MATCHER(Decl, isImplicit) {
   return Node.isImplicit();
 }
 
-/// \brief Matches classTemplateSpecializations that have at least one
-/// TemplateArgument matching the given InnerMatcher.
+/// \brief Matches classTemplateSpecializations, templateSpecializationType and
+/// functionDecl that have at least one TemplateArgument matching the given
+/// InnerMatcher.
 ///
 /// Given
 /// \code
 ///   template<typename T> class A {};
 ///   template<> class A<double> {};
 ///   A<int> a;
+///
+///   template<typename T> void f() {};
+///   void func() { f<int>(); };
+///
+///
 /// \endcode
 /// classTemplateSpecializationDecl(hasAnyTemplateArgument(
 ///     refersToType(asString("int"))))
 ///   matches the specialization \c A<int>
+///
+/// functionDecl(hasAnyTemplateArgument(refersToType(asString("int"))))
+///   matches the specialization \c f<int>
 AST_POLYMORPHIC_MATCHER_P(
     hasAnyTemplateArgument,
     AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl,
-                                    TemplateSpecializationType),
+                                    TemplateSpecializationType,
+                                    FunctionDecl),
     internal::Matcher<TemplateArgument>, InnerMatcher) {
   ArrayRef<TemplateArgument> List =
       internal::getTemplateSpecializationArgs(Node);
@@ -513,6 +600,32 @@
                              Builder);
 }
 
+/// \brief Matches expressions that match InnerMatcher after any implicit AST
+/// nodes are stripped off.
+///
+/// Parentheses and explicit casts are not discarded.
+/// Given
+/// \code
+///   class C {};
+///   C a = C();
+///   C b;
+///   C c = b;
+/// \endcode
+/// The matchers
+/// \code
+///    varDecl(hasInitializer(ignoringImplicit(cxxConstructExpr())))
+/// \endcode
+/// would match the declarations for a, b, and c.
+/// While
+/// \code
+///    varDecl(hasInitializer(cxxConstructExpr()))
+/// \endcode
+/// only match the declarations for b and c.
+AST_MATCHER_P(Expr, ignoringImplicit, ast_matchers::internal::Matcher<Expr>,
+              InnerMatcher) {
+  return InnerMatcher.matches(*Node.IgnoreImplicit(), Finder, Builder);
+}
+
 /// \brief Matches expressions that match InnerMatcher after any implicit casts
 /// are stripped off.
 ///
@@ -590,22 +703,45 @@
   return InnerMatcher.matches(*Node.IgnoreParenImpCasts(), Finder, Builder);
 }
 
-/// \brief Matches classTemplateSpecializations where the n'th TemplateArgument
-/// matches the given InnerMatcher.
+/// \brief Matches types that match InnerMatcher after any parens are stripped.
+///
+/// Given
+/// \code
+///   void (*fp)(void);
+/// \endcode
+/// The matcher
+/// \code
+///   varDecl(hasType(pointerType(pointee(ignoringParens(functionType())))))
+/// \endcode
+/// would match the declaration for fp.
+AST_MATCHER_P(QualType, ignoringParens,
+              internal::Matcher<QualType>, InnerMatcher) {
+  return InnerMatcher.matches(Node.IgnoreParens(), Finder, Builder);
+}
+
+/// \brief Matches classTemplateSpecializations, templateSpecializationType and
+/// functionDecl where the n'th TemplateArgument matches the given InnerMatcher.
 ///
 /// Given
 /// \code
 ///   template<typename T, typename U> class A {};
 ///   A<bool, int> b;
 ///   A<int, bool> c;
+///
+///   template<typename T> void f() {};
+///   void func() { f<int>(); };
 /// \endcode
 /// classTemplateSpecializationDecl(hasTemplateArgument(
 ///     1, refersToType(asString("int"))))
 ///   matches the specialization \c A<bool, int>
+///
+/// functionDecl(hasTemplateArgument(0, refersToType(asString("int"))))
+///   matches the specialization \c f<int>
 AST_POLYMORPHIC_MATCHER_P2(
     hasTemplateArgument,
     AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl,
-                                    TemplateSpecializationType),
+                                    TemplateSpecializationType,
+                                    FunctionDecl),
     unsigned, N, internal::Matcher<TemplateArgument>, InnerMatcher) {
   ArrayRef<TemplateArgument> List =
       internal::getTemplateSpecializationArgs(Node);
@@ -649,6 +785,24 @@
   return InnerMatcher.matches(Node.getAsType(), Finder, Builder);
 }
 
+/// \brief Matches a TemplateArgument that refers to a certain template.
+///
+/// Given
+/// \code
+///   template<template <typename> class S> class X {};
+///   template<typename T> class Y {};
+///   X<Y> xi;
+/// \endcode
+/// classTemplateSpecializationDecl(hasAnyTemplateArgument(
+///     refersToTemplate(templateName())))
+///   matches the specialization \c X<Y>
+AST_MATCHER_P(TemplateArgument, refersToTemplate,
+              internal::Matcher<TemplateName>, InnerMatcher) {
+  if (Node.getKind() != TemplateArgument::Template)
+    return false;
+  return InnerMatcher.matches(Node.getAsTemplate(), Finder, Builder);
+}
+
 /// \brief Matches a canonical TemplateArgument that refers to a certain
 /// declaration.
 ///
@@ -976,6 +1130,43 @@
 ///   matches "{ 1, 2 }" and "{ 5, 6 }"
 const internal::VariadicDynCastAllOfMatcher<Stmt, InitListExpr> initListExpr;
 
+/// \brief Matches the syntactic form of init list expressions
+/// (if the expression has one).
+AST_MATCHER_P(InitListExpr, hasSyntacticForm,
+              internal::Matcher<Expr>, InnerMatcher) {
+  const Expr *SyntForm = Node.getSyntacticForm();
+  return (SyntForm != nullptr &&
+          InnerMatcher.matches(*SyntForm, Finder, Builder));
+}
+
+/// \brief Matches implicit initializers of init list expressions.
+///
+/// Given
+/// \code
+///   point ptarray[10] = { [2].y = 1.0, [2].x = 2.0, [0].x = 1.0 };
+/// \endcode
+/// implicitValueInitExpr()
+///   matches "[0].y" (implicitly)
+const internal::VariadicDynCastAllOfMatcher<Stmt, ImplicitValueInitExpr>
+implicitValueInitExpr;
+
+/// \brief Matches paren list expressions.
+/// ParenListExprs don't have a predefined type and are used for late parsing.
+/// In the final AST, they can be met in template declarations.
+///
+/// Given
+/// \code
+///   template<typename T> class X {
+///     void f() {
+///       X x(*this);
+///       int a = 0, b = 1; int i = (a, b);
+///     }
+///   };
+/// \endcode
+/// parenListExpr() matches "*this" but does NOT match (a, b) because (a, b)
+/// has a predefined type and is a ParenExpr, not a ParenListExpr.
+const internal::VariadicDynCastAllOfMatcher<Stmt, ParenListExpr> parenListExpr;
+
 /// \brief Matches substitutions of non-type template parameters.
 ///
 /// Given
@@ -1014,6 +1205,24 @@
   Decl,
   UsingDirectiveDecl> usingDirectiveDecl;
 
+/// \brief Matches reference to a name that can be looked up during parsing
+/// but could not be resolved to a specific declaration.
+///
+/// Given
+/// \code
+///   template<typename T>
+///   T foo() { T a; return a; }
+///   template<typename T>
+///   void bar() {
+///     foo<T>();
+///   }
+/// \endcode
+/// unresolvedLookupExpr()
+///   matches \code foo<T>() \endcode
+const internal::VariadicDynCastAllOfMatcher<
+   Stmt,
+   UnresolvedLookupExpr> unresolvedLookupExpr;
+
 /// \brief Matches unresolved using value declarations.
 ///
 /// Given
@@ -1048,6 +1257,17 @@
   Decl,
   UnresolvedUsingTypenameDecl> unresolvedUsingTypenameDecl;
 
+/// \brief Matches parentheses used in expressions.
+///
+/// Example matches (foo() + 1)
+/// \code
+///   int foo() { return 1; }
+///   int a = (foo() + 1);
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<
+  Stmt,
+  ParenExpr> parenExpr;
+
 /// \brief Matches constructor call expressions (including implicit ones).
 ///
 /// Example matches string(ptr, n) and ptr within arguments of f
@@ -1357,6 +1577,18 @@
 ///   matches 'FOO:'
 const internal::VariadicDynCastAllOfMatcher<Stmt, LabelStmt> labelStmt;
 
+/// \brief Matches address of label statements (GNU extension).
+///
+/// Given
+/// \code
+///   FOO: bar();
+///   void *ptr = &&FOO;
+///   goto *ptr;
+/// \endcode
+/// addrLabelExpr()
+///   matches '&&FOO'
+const internal::VariadicDynCastAllOfMatcher<Stmt, AddrLabelExpr> addrLabelExpr;
+
 /// \brief Matches switch statements.
 ///
 /// Given
@@ -1465,7 +1697,8 @@
 ///
 /// Example matches "abcd", L"abcd"
 /// \code
-///   char *s = "abcd"; wchar_t *ws = L"abcd"
+///   char *s = "abcd";
+///   wchar_t *ws = L"abcd";
 /// \endcode
 const internal::VariadicDynCastAllOfMatcher<
   Stmt,
@@ -1478,7 +1711,8 @@
 ///
 /// Example matches 'a', L'a'
 /// \code
-///   char ch = 'a'; wchar_t chw = L'a';
+///   char ch = 'a';
+///   wchar_t chw = L'a';
 /// \endcode
 const internal::VariadicDynCastAllOfMatcher<
   Stmt,
@@ -1514,7 +1748,8 @@
 ///
 /// Example match: {1}, (1, 2)
 /// \code
-///   int array[4] = {1}; vector int myvec = (vector int)(1, 2);
+///   int array[4] = {1};
+///   vector int myvec = (vector int)(1, 2);
 /// \endcode
 const internal::VariadicDynCastAllOfMatcher<
   Stmt,
@@ -1526,9 +1761,22 @@
   CXXNullPtrLiteralExpr> cxxNullPtrLiteralExpr;
 
 /// \brief Matches GNU __null expression.
-const internal::VariadicDynCastAllOfMatcher<
-  Stmt,
-  GNUNullExpr> gnuNullExpr;
+const internal::VariadicDynCastAllOfMatcher<Stmt, GNUNullExpr> gnuNullExpr;
+
+/// \brief Matches atomic builtins.
+/// Example matches __atomic_load_n(ptr, 1)
+/// \code
+///   void foo() { int *ptr; __atomic_load_n(ptr, 1); }
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<Stmt, AtomicExpr> atomicExpr;
+
+/// \brief Matches statement expression (GNU extension).
+///
+/// Example match: ({ int X = 4; X; })
+/// \code
+///   int C = ({ int X = 4; X; });
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<Stmt, StmtExpr> stmtExpr;
 
 /// \brief Matches binary operator expressions.
 ///
@@ -1560,6 +1808,28 @@
   Stmt,
   ConditionalOperator> conditionalOperator;
 
+/// \brief Matches binary conditional operator expressions (GNU extension).
+///
+/// Example matches a ?: b
+/// \code
+///   (a ?: b) + 42;
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<
+  Stmt,
+  BinaryConditionalOperator> binaryConditionalOperator;
+
+/// \brief Matches opaque value expressions. They are used as helpers
+/// to reference other expressions and can be found
+/// in BinaryConditionalOperators, for example.
+///
+/// Example matches 'a'
+/// \code
+///   (a ?: c) + 42;
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<
+  Stmt,
+  OpaqueValueExpr> opaqueValueExpr;
+
 /// \brief Matches a C++ static_assert declaration.
 ///
 /// Example:
@@ -1716,6 +1986,41 @@
   Stmt,
   CXXTemporaryObjectExpr> cxxTemporaryObjectExpr;
 
+/// \brief Matches predefined identifier expressions [C99 6.4.2.2].
+///
+/// Example: Matches __func__
+/// \code
+///   printf("%s", __func__);
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<
+  Stmt,
+  PredefinedExpr> predefinedExpr;
+
+/// \brief Matches C99 designated initializer expressions [C99 6.7.8].
+///
+/// Example: Matches { [2].y = 1.0, [0].x = 1.0 }
+/// \code
+///   point ptarray[10] = { [2].y = 1.0, [0].x = 1.0 };
+/// \endcode
+const internal::VariadicDynCastAllOfMatcher<
+  Stmt,
+  DesignatedInitExpr> designatedInitExpr;
+
+/// \brief Matches designated initializer expressions that contain
+/// a specific number of designators.
+///
+/// Example: Given
+/// \code
+///   point ptarray[10] = { [2].y = 1.0, [0].x = 1.0 };
+///   point ptarray2[10] = { [2].y = 1.0, [2].x = 0.0, [0].x = 1.0 };
+/// \endcode
+/// designatorCountIs(2)
+///   matches '{ [2].y = 1.0, [0].x = 1.0 }',
+///   but not '{ [2].y = 1.0, [2].x = 0.0, [0].x = 1.0 }'.
+AST_MATCHER_P(DesignatedInitExpr, designatorCountIs, unsigned, N) {
+  return Node.size() == N;
+}
+
 /// \brief Matches \c QualTypes in the clang AST.
 const internal::VariadicAllOfMatcher<QualType> qualType;
 
@@ -1834,9 +2139,25 @@
 ///   namespace a { namespace b { class X; } }
 /// \endcode
 inline internal::Matcher<NamedDecl> hasName(const std::string &Name) {
-  return internal::Matcher<NamedDecl>(new internal::HasNameMatcher(Name));
+  std::vector<std::string> Names;
+  Names.push_back(Name);
+  return internal::Matcher<NamedDecl>(new internal::HasNameMatcher(Names));
 }
 
+/// \brief Matches NamedDecl nodes that have any of the specified names.
+///
+/// This matcher is only provided as a performance optimization of hasName.
+/// \code
+///     hasAnyName(a, b, c)
+/// \endcode
+///  is equivalent to, but faster than
+/// \code
+///     anyOf(hasName(a), hasName(b), hasName(c))
+/// \endcode
+const internal::VariadicFunction<internal::Matcher<NamedDecl>, StringRef,
+                                 internal::hasAnyNameFunc>
+    hasAnyName = {};
+
 /// \brief Matches NamedDecl nodes whose fully qualified names contain
 /// a substring matched by the given RegExp.
 ///
@@ -1953,6 +2274,19 @@
                                     Node.method_end(), Finder, Builder);
 }
 
+/// \brief Matches the generated class of lambda expressions.
+///
+/// Given:
+/// \code
+///   auto x = []{};
+/// \endcode
+///
+/// \c cxxRecordDecl(isLambda()) matches the implicit class declaration of
+/// \c decltype(x)
+AST_MATCHER(CXXRecordDecl, isLambda) {
+  return Node.isLambda();
+}
+
 /// \brief Matches AST nodes that have child AST nodes that match the
 /// provided matcher.
 ///
@@ -1967,6 +2301,10 @@
 /// ChildT must be an AST base type.
 ///
 /// Usable as: Any Matcher
+/// Note that has is a direct matcher, so it also matches things like implicit
+/// casts and paren casts. If you are matching with expr then you should
+/// probably consider using ignoringParenImpCasts like:
+/// has(ignoringParenImpCasts(expr())).
 const internal::ArgumentAdaptingMatcherFunc<internal::HasMatcher>
 LLVM_ATTRIBUTE_UNUSED has = {};
 
@@ -2117,8 +2455,8 @@
 ///
 /// Usable as: Matcher<CallExpr>, Matcher<CXXConstructExpr>,
 ///   Matcher<DeclRefExpr>, Matcher<EnumType>, Matcher<InjectedClassNameType>,
-///   Matcher<LabelStmt>, Matcher<MemberExpr>, Matcher<QualType>,
-///   Matcher<RecordType>, Matcher<TagType>,
+///   Matcher<LabelStmt>, Matcher<AddrLabelExpr>, Matcher<MemberExpr>,
+///   Matcher<QualType>, Matcher<RecordType>, Matcher<TagType>,
 ///   Matcher<TemplateSpecializationType>, Matcher<TemplateTypeParmType>,
 ///   Matcher<TypedefType>, Matcher<UnresolvedUsingType>
 inline internal::PolymorphicMatcherWithParam1<
@@ -2130,6 +2468,25 @@
       void(internal::HasDeclarationSupportedTypes)>(InnerMatcher);
 }
 
+/// \brief Matches a \c NamedDecl whose underlying declaration matches the given
+/// matcher.
+///
+/// Given
+/// \code
+///   namespace N { template<class T> void f(T t); }
+///   template <class T> void g() { using N::f; f(T()); }
+/// \endcode
+/// \c unresolvedLookupExpr(hasAnyDeclaration(
+///     namedDecl(hasUnderlyingDecl(hasName("::N::f")))))
+///   matches the use of \c f in \c g() .
+AST_MATCHER_P(NamedDecl, hasUnderlyingDecl, internal::Matcher<NamedDecl>,
+              InnerMatcher) {
+  const NamedDecl *UnderlyingDecl = Node.getUnderlyingDecl();
+
+  return UnderlyingDecl != nullptr &&
+         InnerMatcher.matches(*UnderlyingDecl, Finder, Builder);
+}
+
 /// \brief Matches on the implicit object argument of a member call expression.
 ///
 /// Example matches y.x()
@@ -2287,14 +2644,17 @@
 ///
 /// Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
 ///             and z (matcher = varDecl(hasType(cxxRecordDecl(hasName("X")))))
+///             and U (matcher = typedefDecl(hasType(asString("int"))))
 /// \code
 ///  class X {};
 ///  void y(X &x) { x; X z; }
+///  typedef int U;
 /// \endcode
 AST_POLYMORPHIC_MATCHER_P_OVERLOAD(
-    hasType, AST_POLYMORPHIC_SUPPORTED_TYPES(Expr, ValueDecl),
+    hasType, AST_POLYMORPHIC_SUPPORTED_TYPES(Expr, TypedefNameDecl, ValueDecl),
     internal::Matcher<QualType>, InnerMatcher, 0) {
-  return InnerMatcher.matches(Node.getType(), Finder, Builder);
+  return InnerMatcher.matches(internal::getUnderlyingType(Node),
+                              Finder, Builder);
 }
 
 /// \brief Overloaded to match the declaration of the expression's or value
@@ -2482,6 +2842,27 @@
   return false;
 }
 
+/// \brief Matches an \c OverloadExpr if any of the declarations in the set of
+/// overloads matches the given matcher.
+///
+/// Given
+/// \code
+///   template <typename T> void foo(T);
+///   template <typename T> void bar(T);
+///   template <typename T> void baz(T t) {
+///     foo(t);
+///     bar(t);
+///   }
+/// \endcode
+/// unresolvedLookupExpr(hasAnyDeclaration(
+///     functionTemplateDecl(hasName("foo"))))
+///   matches \c foo in \c foo(t); but not \c bar in \c bar(t);
+AST_MATCHER_P(OverloadExpr, hasAnyDeclaration, internal::Matcher<Decl>,
+              InnerMatcher) {
+  return matchesFirstInPointerRange(InnerMatcher, Node.decls_begin(),
+                                    Node.decls_end(), Finder, Builder);
+}
+
 /// \brief Matches the Decl of a DeclStmt which has a single declaration.
 ///
 /// Given
@@ -2829,18 +3210,13 @@
 ///   matches x(1, y, 42)
 /// with hasAnyArgument(...)
 ///   matching y
-///
-/// FIXME: Currently this will ignore parentheses and implicit casts on
-/// the argument before applying the inner matcher. We'll want to remove
-/// this to allow for greater control by the user once \c ignoreImplicit()
-/// has been implemented.
 AST_POLYMORPHIC_MATCHER_P(hasAnyArgument,
                           AST_POLYMORPHIC_SUPPORTED_TYPES(CallExpr,
                                                           CXXConstructExpr),
                           internal::Matcher<Expr>, InnerMatcher) {
   for (const Expr *Arg : Node.arguments()) {
     BoundNodesTreeBuilder Result(*Builder);
-    if (InnerMatcher.matches(*Arg->IgnoreParenImpCasts(), Finder, &Result)) {
+    if (InnerMatcher.matches(*Arg, Finder, &Result)) {
       *Builder = std::move(Result);
       return true;
     }
@@ -2853,6 +3229,22 @@
   return Node.isListInitialization();
 }
 
+/// \brief Matches a constructor call expression which requires
+/// zero initialization.
+///
+/// Given
+/// \code
+/// void foo() {
+///   struct point { double x; double y; };
+///   point pt[2] = { { 1.0, 2.0 } };
+/// }
+/// \endcode
+/// initListExpr(has(cxxConstructExpr(requiresZeroInitialization())))
+/// will match the implicit array filler for pt[1].
+AST_MATCHER(CXXConstructExpr, requiresZeroInitialization) {
+  return Node.requiresZeroInitialization();
+}
+
 /// \brief Matches the n'th parameter of a function declaration.
 ///
 /// Given
@@ -2879,8 +3271,11 @@
 ///   int y;
 ///   f(y);
 /// \endcode
-/// callExpr(declRefExpr(to(varDecl(hasName("y")))),
-/// parmVarDecl(hasType(isInteger())))
+/// callExpr(
+///   forEachArgumentWithParam(
+///     declRefExpr(to(varDecl(hasName("y")))),
+///     parmVarDecl(hasType(isInteger()))
+/// ))
 ///   matches f(y);
 /// with declRefExpr(...)
 ///   matching int y
@@ -2940,16 +3335,27 @@
                                     Node.param_end(), Finder, Builder);
 }
 
-/// \brief Matches \c FunctionDecls that have a specific parameter count.
+/// \brief Matches \c FunctionDecls and \c FunctionProtoTypes that have a
+/// specific parameter count.
 ///
 /// Given
 /// \code
 ///   void f(int i) {}
 ///   void g(int i, int j) {}
+///   void h(int i, int j);
+///   void j(int i);
+///   void k(int x, int y, int z, ...);
 /// \endcode
 /// functionDecl(parameterCountIs(2))
-///   matches g(int i, int j) {}
-AST_MATCHER_P(FunctionDecl, parameterCountIs, unsigned, N) {
+///   matches void g(int i, int j) {}
+/// functionProtoType(parameterCountIs(2))
+///   matches void h(int i, int j)
+/// functionProtoType(parameterCountIs(3))
+///   matches void k(int x, int y, int z, ...);
+AST_POLYMORPHIC_MATCHER_P(parameterCountIs,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
+                                                          FunctionProtoType),
+                          unsigned, N) {
   return Node.getNumParams() == N;
 }
 
@@ -2976,7 +3382,8 @@
 /// \endcode
 /// functionDecl(isExternC())
 ///   matches the declaration of f and g, but not the declaration h
-AST_MATCHER(FunctionDecl, isExternC) {
+AST_POLYMORPHIC_MATCHER(isExternC, AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
+                                                                   VarDecl)) {
   return Node.isExternC();
 }
 
@@ -3006,6 +3413,29 @@
   return Node.isDefaulted();
 }
 
+/// \brief Matches functions that have a dynamic exception specification.
+///
+/// Given:
+/// \code
+///   void f();
+///   void g() noexcept;
+///   void h() noexcept(true);
+///   void i() noexcept(false);
+///   void j() throw();
+///   void k() throw(int);
+///   void l() throw(...);
+/// \endcode
+/// functionDecl(hasDynamicExceptionSpec()) and
+///   functionProtoType(hasDynamicExceptionSpec())
+///   match the declarations of j, k, and l, but not f, g, h, or i.
+AST_POLYMORPHIC_MATCHER(hasDynamicExceptionSpec,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
+                                                        FunctionProtoType)) {
+  if (const FunctionProtoType *FnTy = internal::getFunctionProtoType(Node))
+    return FnTy->hasDynamicExceptionSpec();
+  return false;
+}
+
 /// \brief Matches functions that have a non-throwing exception specification.
 ///
 /// Given:
@@ -3016,10 +3446,12 @@
 ///   void i() throw(int);
 ///   void j() noexcept(false);
 /// \endcode
-/// functionDecl(isNoThrow())
-///   matches the declarations of g, and h, but not f, i or j.
-AST_MATCHER(FunctionDecl, isNoThrow) {
-  const auto *FnTy = Node.getType()->getAs<FunctionProtoType>();
+/// functionDecl(isNoThrow()) and functionProtoType(isNoThrow())
+///   match the declarations of g, and h, but not f, i or j.
+AST_POLYMORPHIC_MATCHER(isNoThrow,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
+                                                        FunctionProtoType)) {
+  const FunctionProtoType *FnTy = internal::getFunctionProtoType(Node);
 
   // If the function does not have a prototype, then it is assumed to be a
   // throwing function (as it would if the function did not have any exception
@@ -3031,7 +3463,7 @@
   if (isUnresolvedExceptionSpec(FnTy->getExceptionSpecType()))
     return true;
 
-  return FnTy->isNothrow(Node.getASTContext());
+  return FnTy->isNothrow(Finder->getASTContext());
 }
 
 /// \brief Matches constexpr variable and function declarations.
@@ -3052,17 +3484,17 @@
 }
 
 /// \brief Matches the condition expression of an if statement, for loop,
-/// or conditional operator.
+/// switch statement or conditional operator.
 ///
 /// Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
 /// \code
 ///   if (true) {}
 /// \endcode
-AST_POLYMORPHIC_MATCHER_P(hasCondition,
-                          AST_POLYMORPHIC_SUPPORTED_TYPES(IfStmt, ForStmt,
-                                                          WhileStmt, DoStmt,
-                                                          ConditionalOperator),
-                          internal::Matcher<Expr>, InnerMatcher) {
+AST_POLYMORPHIC_MATCHER_P(
+    hasCondition,
+    AST_POLYMORPHIC_SUPPORTED_TYPES(IfStmt, ForStmt, WhileStmt, DoStmt,
+                                    SwitchStmt, AbstractConditionalOperator),
+    internal::Matcher<Expr>, InnerMatcher) {
   const Expr *const Condition = Node.getCond();
   return (Condition != nullptr &&
           InnerMatcher.matches(*Condition, Finder, Builder));
@@ -3178,8 +3610,8 @@
   return false;
 }
 
-/// \brief Matches a 'for', 'while', or 'do while' statement that has
-/// a given body.
+/// \brief Matches a 'for', 'while', 'do while' statement or a function
+/// definition that has a given body.
 ///
 /// Given
 /// \code
@@ -3192,15 +3624,16 @@
 AST_POLYMORPHIC_MATCHER_P(hasBody,
                           AST_POLYMORPHIC_SUPPORTED_TYPES(DoStmt, ForStmt,
                                                           WhileStmt,
-                                                          CXXForRangeStmt),
+                                                          CXXForRangeStmt,
+                                                          FunctionDecl),
                           internal::Matcher<Stmt>, InnerMatcher) {
-  const Stmt *const Statement = Node.getBody();
+  const Stmt *const Statement = internal::GetBodyMatcher<NodeType>::get(Node);
   return (Statement != nullptr &&
           InnerMatcher.matches(*Statement, Finder, Builder));
 }
 
 /// \brief Matches compound statements where at least one substatement matches
-/// a given matcher.
+/// a given matcher. Also matches StmtExprs that have CompoundStmt as children.
 ///
 /// Given
 /// \code
@@ -3210,10 +3643,13 @@
 ///   matches '{ {}; 1+2; }'
 /// with compoundStmt()
 ///   matching '{}'
-AST_MATCHER_P(CompoundStmt, hasAnySubstatement,
-              internal::Matcher<Stmt>, InnerMatcher) {
-  return matchesFirstInPointerRange(InnerMatcher, Node.body_begin(),
-                                    Node.body_end(), Finder, Builder);
+AST_POLYMORPHIC_MATCHER_P(hasAnySubstatement,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(CompoundStmt,
+                                                          StmtExpr),
+                          internal::Matcher<Stmt>, InnerMatcher) {
+  const CompoundStmt *CS = CompoundStmtMatcher<NodeType>::get(Node);
+  return CS && matchesFirstInPointerRange(InnerMatcher, CS->body_begin(),
+                                          CS->body_end(), Finder, Builder);
 }
 
 /// \brief Checks that a compound statement contains a specific number of
@@ -3312,21 +3748,43 @@
           InnerMatcher.matches(*Operand, Finder, Builder));
 }
 
-/// \brief Matches if the cast's source expression matches the given matcher.
+/// \brief Matches if the cast's source expression
+/// or opaque value's source expression matches the given matcher.
 ///
-/// Example: matches "a string" (matcher =
-///                                  hasSourceExpression(cxxConstructExpr()))
+/// Example 1: matches "a string"
+/// (matcher = castExpr(hasSourceExpression(cxxConstructExpr())))
 /// \code
 /// class URL { URL(string); };
 /// URL url = "a string";
 /// \endcode
-AST_MATCHER_P(CastExpr, hasSourceExpression,
-              internal::Matcher<Expr>, InnerMatcher) {
-  const Expr* const SubExpression = Node.getSubExpr();
+///
+/// Example 2: matches 'b' (matcher =
+/// opaqueValueExpr(hasSourceExpression(implicitCastExpr(declRefExpr()))))
+/// \code
+/// int a = b ?: 1;
+/// \endcode
+
+AST_POLYMORPHIC_MATCHER_P(hasSourceExpression,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(CastExpr,
+                                                          OpaqueValueExpr),
+                          internal::Matcher<Expr>, InnerMatcher) {
+  const Expr *const SubExpression =
+      internal::GetSourceExpressionMatcher<NodeType>::get(Node);
   return (SubExpression != nullptr &&
           InnerMatcher.matches(*SubExpression, Finder, Builder));
 }
 
+/// \brief Matches casts that have a given cast kind.
+///
+/// Example: matches the implicit cast around \c 0
+/// (matcher = castExpr(hasCastKind(CK_NullToPointer)))
+/// \code
+///   int *p = 0;
+/// \endcode
+AST_MATCHER_P(CastExpr, hasCastKind, CastKind, Kind) {
+  return Node.getCastKind() == Kind;
+}
+
 /// \brief Matches casts whose destination type matches a given matcher.
 ///
 /// (Note: Clang's AST refers to other conversions as "casts" too, and calls
@@ -3384,24 +3842,31 @@
 
 /// \brief Matches the true branch expression of a conditional operator.
 ///
-/// Example matches a
+/// Example 1 (conditional ternary operator): matches a
 /// \code
 ///   condition ? a : b
 /// \endcode
-AST_MATCHER_P(ConditionalOperator, hasTrueExpression,
+///
+/// Example 2 (conditional binary operator): matches opaqueValueExpr(condition)
+/// \code
+///   condition ?: b
+/// \endcode
+AST_MATCHER_P(AbstractConditionalOperator, hasTrueExpression,
               internal::Matcher<Expr>, InnerMatcher) {
   const Expr *Expression = Node.getTrueExpr();
   return (Expression != nullptr &&
           InnerMatcher.matches(*Expression, Finder, Builder));
 }
 
-/// \brief Matches the false branch expression of a conditional operator.
+/// \brief Matches the false branch expression of a conditional operator
+/// (binary or ternary).
 ///
 /// Example matches b
 /// \code
 ///   condition ? a : b
+///   condition ?: b
 /// \endcode
-AST_MATCHER_P(ConditionalOperator, hasFalseExpression,
+AST_MATCHER_P(AbstractConditionalOperator, hasFalseExpression,
               internal::Matcher<Expr>, InnerMatcher) {
   const Expr *Expression = Node.getFalseExpr();
   return (Expression != nullptr &&
@@ -3465,6 +3930,47 @@
           InnerMatcher.matches(*Parent, Finder, Builder));
 }
 
+/// \brief Matches each method overridden by the given method. This matcher may
+/// produce multiple matches.
+///
+/// Given
+/// \code
+///   class A { virtual void f(); };
+///   class B : public A { void f(); };
+///   class C : public B { void f(); };
+/// \endcode
+/// cxxMethodDecl(ofClass(hasName("C")),
+///               forEachOverridden(cxxMethodDecl().bind("b"))).bind("d")
+///   matches once, with "b" binding "A::f" and "d" binding "C::f" (Note
+///   that B::f is not overridden by C::f).
+///
+/// The check can produce multiple matches in case of multiple inheritance, e.g.
+/// \code
+///   class A1 { virtual void f(); };
+///   class A2 { virtual void f(); };
+///   class C : public A1, public A2 { void f(); };
+/// \endcode
+/// cxxMethodDecl(ofClass(hasName("C")),
+///               forEachOverridden(cxxMethodDecl().bind("b"))).bind("d")
+///   matches twice, once with "b" binding "A1::f" and "d" binding "C::f", and
+///   once with "b" binding "A2::f" and "d" binding "C::f".
+AST_MATCHER_P(CXXMethodDecl, forEachOverridden,
+              internal::Matcher<CXXMethodDecl>, InnerMatcher) {
+  BoundNodesTreeBuilder Result;
+  bool Matched = false;
+  for (const auto *Overridden : Node.overridden_methods()) {
+    BoundNodesTreeBuilder OverriddenBuilder(*Builder);
+    const bool OverriddenMatched =
+        InnerMatcher.matches(*Overridden, Finder, &OverriddenBuilder);
+    if (OverriddenMatched) {
+      Matched = true;
+      Result.addMatch(OverriddenBuilder);
+    }
+  }
+  *Builder = std::move(Result);
+  return Matched;
+}
+
 /// \brief Matches if the given method declaration is virtual.
 ///
 /// Given
@@ -3479,6 +3985,24 @@
   return Node.isVirtual();
 }
 
+/// \brief Matches if the given method declaration has an explicit "virtual".
+///
+/// Given
+/// \code
+///   class A {
+///    public:
+///     virtual void x();
+///   };
+///   class B : public A {
+///    public:
+///     void x();
+///   };
+/// \endcode
+///   matches A::x but not B::x
+AST_MATCHER(CXXMethodDecl, isVirtualAsWritten) {
+  return Node.isVirtualAsWritten();
+}
+
 /// \brief Matches if the given method or class declaration is final.
 ///
 /// Given:
@@ -3546,6 +4070,23 @@
   return Node.isCopyAssignmentOperator();
 }
 
+/// \brief Matches if the given method declaration declares a move assignment
+/// operator.
+///
+/// Given
+/// \code
+/// struct A {
+///   A &operator=(const A &);
+///   A &operator=(A &&);
+/// };
+/// \endcode
+///
+/// cxxMethodDecl(isMoveAssignmentOperator()) matches the second method but not
+/// the first one.
+AST_MATCHER(CXXMethodDecl, isMoveAssignmentOperator) {
+  return Node.isMoveAssignmentOperator();
+}
+
 /// \brief Matches if the given method declaration overrides another method.
 ///
 /// Given
@@ -3564,6 +4105,21 @@
   return Node.size_overridden_methods() > 0 || Node.hasAttr<OverrideAttr>();
 }
 
+/// \brief Matches method declarations that are user-provided.
+///
+/// Given
+/// \code
+///   struct S {
+///     S(); // #1
+///     S(const S &) = default; // #2
+///     S(S &&) = delete; // #3
+///   };
+/// \endcode
+/// cxxConstructorDecl(isUserProvided()) will match #1, but not #2 or #3.
+AST_MATCHER(CXXMethodDecl, isUserProvided) {
+  return Node.isUserProvided();
+}
+
 /// \brief Matches member expressions that are called with '->' as opposed
 /// to '.'.
 ///
@@ -3597,6 +4153,34 @@
     return Node->isIntegerType();
 }
 
+/// \brief Matches QualType nodes that are of unsigned integer type.
+///
+/// Given
+/// \code
+///   void a(int);
+///   void b(unsigned long);
+///   void c(double);
+/// \endcode
+/// functionDecl(hasAnyParameter(hasType(isUnsignedInteger())))
+/// matches "b(unsigned long)", but not "a(int)" and "c(double)".
+AST_MATCHER(QualType, isUnsignedInteger) {
+    return Node->isUnsignedIntegerType();
+}
+
+/// \brief Matches QualType nodes that are of signed integer type.
+///
+/// Given
+/// \code
+///   void a(int);
+///   void b(unsigned long);
+///   void c(double);
+/// \endcode
+/// functionDecl(hasAnyParameter(hasType(isSignedInteger())))
+/// matches "a(int)", but not "b(unsigned long)" and "c(double)".
+AST_MATCHER(QualType, isSignedInteger) {
+    return Node->isSignedIntegerType();
+}
+
 /// \brief Matches QualType nodes that are of character type.
 ///
 /// Given
@@ -3611,6 +4195,26 @@
     return Node->isAnyCharacterType();
 }
 
+/// \brief Matches QualType nodes that are of any pointer type; this includes
+/// the Objective-C object pointer type, which is different despite being
+/// syntactically similar.
+///
+/// Given
+/// \code
+///   int *i = nullptr;
+///
+///   @interface Foo
+///   @end
+///   Foo *f;
+///
+///   int j;
+/// \endcode
+/// varDecl(hasType(isAnyPointer()))
+///   matches "int *i" and "Foo *f", but not "int j".
+AST_MATCHER(QualType, isAnyPointer) {
+  return Node->isAnyPointerType();
+}
+
 /// \brief Matches QualType nodes that are const-qualified, i.e., that
 /// include "top-level" const.
 ///
@@ -3886,6 +4490,19 @@
 ///   matches "_Complex float f"
 AST_TYPE_MATCHER(ComplexType, complexType);
 
+/// \brief Matches any real floating-point type (float, double, long double).
+///
+/// Given
+/// \code
+///   int i;
+///   float f;
+/// \endcode
+/// realFloatingPointType()
+///   matches "float f" but not "int i"
+AST_MATCHER(Type, realFloatingPointType) {
+  return Node.isRealFloatingType();
+}
+
 /// \brief Matches arrays and C99 complex types that have a specific element
 /// type.
 ///
@@ -3917,18 +4534,26 @@
 ///   matches "int a[2]"
 AST_TYPE_MATCHER(ConstantArrayType, constantArrayType);
 
-/// \brief Matches \c ConstantArrayType nodes that have the specified size.
+/// \brief Matches nodes that have the specified size.
 ///
 /// Given
 /// \code
 ///   int a[42];
 ///   int b[2 * 21];
 ///   int c[41], d[43];
+///   char *s = "abcd";
+///   wchar_t *ws = L"abcd";
+///   char *w = "a";
 /// \endcode
 /// constantArrayType(hasSize(42))
 ///   matches "int a[42]" and "int b[2 * 21]"
-AST_MATCHER_P(ConstantArrayType, hasSize, unsigned, N) {
-  return Node.getSize() == N;
+/// stringLiteral(hasSize(4))
+///   matches "abcd", L"abcd"
+AST_POLYMORPHIC_MATCHER_P(hasSize,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(ConstantArrayType,
+                                                          StringLiteral),
+                          unsigned, N) {
+  return internal::HasSizeMatcher<NodeType>::hasSize(Node, N);
 }
 
 /// \brief Matches C++ arrays whose size is a value-dependent expression.
@@ -4052,6 +4677,18 @@
 ///   matches "int (*f)(int)" and the type of "g".
 AST_TYPE_MATCHER(FunctionType, functionType);
 
+/// \brief Matches \c FunctionProtoType nodes.
+///
+/// Given
+/// \code
+///   int (*f)(int);
+///   void g();
+/// \endcode
+/// functionProtoType()
+///   matches "int (*f)(int)" and the type of "g" in C++ mode.
+///   In C mode, "g" is not matched because it does not contain a prototype.
+AST_TYPE_MATCHER(FunctionProtoType, functionProtoType);
+
 /// \brief Matches \c ParenType nodes.
 ///
 /// Given
@@ -4207,6 +4844,21 @@
 ///   matches "typedef int X"
 AST_TYPE_MATCHER(TypedefType, typedefType);
 
+/// \brief Matches enum types.
+///
+/// Given
+/// \code
+///   enum C { Green };
+///   enum class S { Red };
+///
+///   C c;
+///   S s;
+/// \endcode
+///
+/// \c enumType() matches the type of the variable declarations of both \c c and
+/// \c s.
+AST_TYPE_MATCHER(EnumType, enumType);
+
 /// \brief Matches template specialization types.
 ///
 /// Given
@@ -4627,6 +5279,23 @@
   return Node.isDefaultConstructor();
 }
 
+/// \brief Matches constructors that delegate to another constructor.
+///
+/// Given
+/// \code
+///   struct S {
+///     S(); // #1
+///     S(int) {} // #2
+///     S(S &&) : S() {} // #3
+///   };
+///   S::S() : S(0) {} // #4
+/// \endcode
+/// cxxConstructorDecl(isDelegatingConstructor()) will match #3 and #4, but not
+/// #1 or #2.
+AST_MATCHER(CXXConstructorDecl, isDelegatingConstructor) {
+  return Node.isDelegatingConstructor();
+}
+
 /// \brief Matches constructor and conversion declarations that are marked with
 /// the explicit keyword.
 ///
@@ -4719,6 +5388,24 @@
   return false;
 }
 
+/// \brief Matches the return value expression of a return statement
+///
+/// Given
+/// \code
+///   return a + b;
+/// \endcode
+/// hasReturnValue(binaryOperator())
+///   matches 'return a + b'
+/// with binaryOperator()
+///   matching 'a + b'
+AST_MATCHER_P(ReturnStmt, hasReturnValue, internal::Matcher<Expr>,
+              InnerMatcher) {
+  if (const auto *RetValue = Node.getRetValue())
+    return InnerMatcher.matches(*RetValue, Finder, Builder);
+  return false;
+}
+
+
 /// \brief Matches CUDA kernel call expression.
 ///
 /// Example matches,
@@ -4729,6 +5416,66 @@
   Stmt,
   CUDAKernelCallExpr> cudaKernelCallExpr;
 
+
+/// \brief Matches expressions that resolve to a null pointer constant, such as
+/// GNU's __null, C++11's nullptr, or C's NULL macro.
+///
+/// Given:
+/// \code
+///   void *v1 = NULL;
+///   void *v2 = nullptr;
+///   void *v3 = __null; // GNU extension
+///   char *cp = (char *)0;
+///   int *ip = 0;
+///   int i = 0;
+/// \endcode
+/// expr(nullPointerConstant())
+///   matches the initializer for v1, v2, v3, cp, and ip. Does not match the
+///   initializer for i.
+AST_MATCHER_FUNCTION(internal::Matcher<Expr>, nullPointerConstant) {
+  return anyOf(
+      gnuNullExpr(), cxxNullPtrLiteralExpr(),
+      integerLiteral(equals(0), hasParent(expr(hasType(pointerType())))));
+}
+
+/// \brief Matches the declaration of the function the statement belongs to.
+///
+/// Given:
+/// \code
+/// F& operator=(const F& o) {
+///   std::copy_if(o.begin(), o.end(), begin(), [](V v) { return v > 0; });
+///   return *this;
+/// }
+/// \endcode
+/// returnStmt(forFunction(hasName("operator=")))
+///   matches 'return *this'
+///   but does not match 'return v > 0'
+AST_MATCHER_P(Stmt, forFunction, internal::Matcher<FunctionDecl>,
+              InnerMatcher) {
+  const auto &Parents = Finder->getASTContext().getParents(Node);
+
+  llvm::SmallVector<ast_type_traits::DynTypedNode, 8> Stack(Parents.begin(),
+                                                            Parents.end());
+  while(!Stack.empty()) {
+    // Copy, not reference: pop_back() destroys the element in the vector.
+    const auto CurNode = Stack.back();
+    Stack.pop_back();
+    if(const auto *FuncDeclNode = CurNode.get<FunctionDecl>()) {
+      if(InnerMatcher.matches(*FuncDeclNode, Finder, Builder))
+        return true;
+    } else if(const auto *LambdaExprNode = CurNode.get<LambdaExpr>()) {
+      // Statements in a lambda body belong to the lambda's call operator.
+      if(InnerMatcher.matches(*LambdaExprNode->getCallOperator(),
+                              Finder, Builder))
+        return true;
+    } else {
+      for(const auto &Parent: Finder->getASTContext().getParents(CurNode))
+        Stack.push_back(Parent);
+    }
+  }
+  return false;
+}
+
 } // end namespace ast_matchers
 } // end namespace clang
 
diff --git a/include/clang/ASTMatchers/ASTMatchersInternal.h b/include/clang/ASTMatchers/ASTMatchersInternal.h
index 1d1d795..b9e3157 100644
--- a/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -46,8 +46,9 @@
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtObjC.h"
 #include "clang/AST/Type.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/VariadicFunction.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ManagedStatic.h"
 #include <map>
 #include <string>
@@ -60,6 +61,62 @@
 
 namespace internal {
 
+/// \brief Variadic function object.
+///
+/// Most of the functions below that use VariadicFunction could be implemented
+/// using plain C++11 variadic functions, but the function object allows us to
+/// capture it on the dynamic matcher registry.
+template <typename ResultT, typename ArgT,
+          ResultT (*Func)(ArrayRef<const ArgT *>)>
+struct VariadicFunction {
+  ResultT operator()() const { return Func(None); }
+
+  template <typename... ArgsT>
+  ResultT operator()(const ArgT &Arg1, const ArgsT &... Args) const {
+    return Execute(Arg1, static_cast<const ArgT &>(Args)...);
+  }
+
+  // We also allow calls with an already created array, in case the caller
+  // already had it.
+  ResultT operator()(ArrayRef<ArgT> Args) const {
+    SmallVector<const ArgT*, 8> InnerArgs;
+    for (const ArgT &Arg : Args)
+      InnerArgs.push_back(&Arg);
+    return Func(InnerArgs);
+  }
+
+private:
+  // Trampoline function to allow for implicit conversions to take place
+  // before we make the array.
+  template <typename... ArgsT> ResultT Execute(const ArgsT &... Args) const {
+    const ArgT *const ArgsArray[] = {&Args...};
+    return Func(ArrayRef<const ArgT *>(ArgsArray, sizeof...(ArgsT)));
+  }
+};
+
+/// \brief Unifies obtaining the underlying type of a regular node through
+/// `getType` and a TypedefNameDecl node through `getUnderlyingType`.
+inline QualType getUnderlyingType(const Expr &Node) { return Node.getType(); }
+
+inline QualType getUnderlyingType(const ValueDecl &Node) {
+  return Node.getType();
+}
+
+inline QualType getUnderlyingType(const TypedefNameDecl &Node) {
+  return Node.getUnderlyingType();
+}
+
+/// \brief Unifies obtaining the FunctionProtoType pointer from both
+/// FunctionProtoType and FunctionDecl nodes.
+inline const FunctionProtoType *
+getFunctionProtoType(const FunctionProtoType &Node) {
+  return &Node;
+}
+
+inline const FunctionProtoType *getFunctionProtoType(const FunctionDecl &Node) {
+  return Node.getType()->getAs<FunctionProtoType>();
+}
+
 /// \brief Internal version of BoundNodes. Holds all the bound nodes.
 class BoundNodesMap {
 public:
@@ -420,7 +477,7 @@
   template <typename From>
   Matcher(const Matcher<From> &Other,
           typename std::enable_if<std::is_base_of<From, T>::value &&
-                                  !std::is_same<From, T>::value>::type * = 0)
+                               !std::is_same<From, T>::value>::type * = nullptr)
       : Implementation(restrictMatcher(Other.Implementation)) {
     assert(Implementation.getSupportedKind().isSame(
         ast_type_traits::ASTNodeKind::getFromNodeKind<T>()));
@@ -433,7 +490,7 @@
   Matcher(const Matcher<TypeT> &Other,
           typename std::enable_if<
             std::is_same<T, QualType>::value &&
-            std::is_same<TypeT, Type>::value>::type* = 0)
+            std::is_same<TypeT, Type>::value>::type* = nullptr)
       : Implementation(new TypeToQualType<TypeT>(Other)) {}
 
   /// \brief Convert \c this into a \c Matcher<T> by applying dyn_cast<> to the
@@ -558,32 +615,21 @@
   return false;
 }
 
-// Metafunction to determine if type T has a member called
-// getDecl.
-#if defined(_MSC_VER) && !defined(__clang__)
-// For MSVC, we use a weird nonstandard __if_exists statement, as it
-// is not standards-conformant enough to properly compile the standard
-// code below. (At least up through MSVC 2015 require this workaround)
-template <typename T> struct has_getDecl {
-  __if_exists(T::getDecl) {
-    enum { value = 1 };
-  }
-  __if_not_exists(T::getDecl) {
-    enum { value = 0 };
-  }
+// Metafunction to determine if type Ty has a member called getDecl.
+template <typename Ty>
+class has_getDecl {
+  typedef char yes[1];
+  typedef char no[2];
+
+  template <typename Inner>
+  static yes& test(Inner *I, decltype(I->getDecl()) * = nullptr);
+
+  template <typename>
+  static no& test(...);
+
+public:
+  static const bool value = sizeof(test<Ty>(nullptr)) == sizeof(yes);
 };
-#else
-// There is a default template inheriting from "false_type". Then, a
-// partial specialization inherits from "true_type". However, this
-// specialization will only exist when the call to getDecl() isn't an
-// error -- it vanishes by SFINAE when the member doesn't exist.
-template <typename> struct type_sink_to_void { typedef void type; };
-template <typename T, typename = void> struct has_getDecl : std::false_type {};
-template <typename T>
-struct has_getDecl<
-    T, typename type_sink_to_void<decltype(std::declval<T>().getDecl())>::type>
-    : std::true_type {};
-#endif
 
 /// \brief Matches overloaded operators with a specific name.
 ///
@@ -626,10 +672,10 @@
 
 /// \brief Matches named declarations with a specific name.
 ///
-/// See \c hasName() in ASTMatchers.h for details.
+/// See \c hasName() and \c hasAnyName() in ASTMatchers.h for details.
 class HasNameMatcher : public SingleNodeMatcherInterface<NamedDecl> {
  public:
-  explicit HasNameMatcher(StringRef Name);
+  explicit HasNameMatcher(std::vector<std::string> Names);
 
   bool matchesNode(const NamedDecl &Node) const override;
 
@@ -642,15 +688,27 @@
 
   /// \brief Full match routine
   ///
+  /// Fast implementation for the simple case of a named declaration at
+  /// namespace or RecordDecl scope.
+  /// It is slower than matchesNodeUnqualified, but faster than
+  /// matchesNodeFullSlow.
+  bool matchesNodeFullFast(const NamedDecl &Node) const;
+
+  /// \brief Full match routine
+  ///
   /// It generates the fully qualified name of the declaration (which is
   /// expensive) before trying to match.
   /// It is slower but simple and works on all cases.
-  bool matchesNodeFull(const NamedDecl &Node) const;
+  bool matchesNodeFullSlow(const NamedDecl &Node) const;
 
   const bool UseUnqualifiedMatch;
-  const std::string Name;
+  const std::vector<std::string> Names;
 };
 
+/// \brief Trampoline function to use VariadicFunction<> to construct a
+///        HasNameMatcher.
+Matcher<NamedDecl> hasAnyNameFunc(ArrayRef<const StringRef *> NameRefs);
+
 /// \brief Matches declarations for QualType and CallExpr.
 ///
 /// Type argument DeclMatcherT is required by PolymorphicMatcherWithParam1 but
@@ -737,6 +795,14 @@
     return matchesDecl(Node.getMemberDecl(), Finder, Builder);
   }
 
+  /// \brief Extracts the \c LabelDecl an \c AddrLabelExpr refers to and returns
+  /// whether the inner matcher matches on it.
+  bool matchesSpecialized(const AddrLabelExpr &Node,
+                          ASTMatchFinder *Finder,
+                          BoundNodesTreeBuilder *Builder) const {
+    return matchesDecl(Node.getLabel(), Finder, Builder);
+  }
+
   /// \brief Returns whether the inner matcher \c Node. Returns false if \c Node
   /// is \c NULL.
   bool matchesDecl(const Decl *Node, ASTMatchFinder *Finder,
@@ -942,8 +1008,8 @@
 
 /// \brief All types that are supported by HasDeclarationMatcher above.
 typedef TypeList<CallExpr, CXXConstructExpr, DeclRefExpr, EnumType,
-                 InjectedClassNameType, LabelStmt, MemberExpr, QualType,
-                 RecordType, TagType, TemplateSpecializationType,
+                 InjectedClassNameType, LabelStmt, AddrLabelExpr, MemberExpr,
+                 QualType, RecordType, TagType, TemplateSpecializationType,
                  TemplateTypeParmType, TypedefType,
                  UnresolvedUsingType> HasDeclarationSupportedTypes;
 
@@ -1110,8 +1176,6 @@
 /// ChildT must be an AST base type.
 template <typename T, typename ChildT>
 class HasMatcher : public WrapperMatcherInterface<T> {
-  static_assert(IsBaseType<ChildT>::value,
-                "has only accepts base type matcher");
 
 public:
   explicit HasMatcher(const Matcher<ChildT> &ChildMatcher)
@@ -1119,10 +1183,9 @@
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
-    return Finder->matchesChildOf(
-        Node, this->InnerMatcher, Builder,
-        ASTMatchFinder::TK_IgnoreImplicitCastsAndParentheses,
-        ASTMatchFinder::BK_First);
+    return Finder->matchesChildOf(Node, this->InnerMatcher, Builder,
+                                  ASTMatchFinder::TK_AsIs,
+                                  ASTMatchFinder::BK_First);
   }
 };
 
@@ -1385,9 +1448,8 @@
 /// casted to CXXRecordDecl and all given matchers match.
 template <typename SourceT, typename TargetT>
 class VariadicDynCastAllOfMatcher
-    : public llvm::VariadicFunction<
-        BindableMatcher<SourceT>, Matcher<TargetT>,
-        makeDynCastAllOfComposite<SourceT, TargetT> > {
+    : public VariadicFunction<BindableMatcher<SourceT>, Matcher<TargetT>,
+                              makeDynCastAllOfComposite<SourceT, TargetT>> {
 public:
   VariadicDynCastAllOfMatcher() {}
 };
@@ -1403,9 +1465,9 @@
 /// \c Matcher<NestedNameSpecifier>.
 /// The returned matcher matches if all given matchers match.
 template <typename T>
-class VariadicAllOfMatcher : public llvm::VariadicFunction<
-                               BindableMatcher<T>, Matcher<T>,
-                               makeAllOfComposite<T> > {
+class VariadicAllOfMatcher
+    : public VariadicFunction<BindableMatcher<T>, Matcher<T>,
+                              makeAllOfComposite<T>> {
 public:
   VariadicAllOfMatcher() {}
 };
@@ -1526,8 +1588,8 @@
         new MatcherImpl<OuterT>(InnerMatcher, Getter<OuterT>::value()));
   }
 
-  struct Func : public llvm::VariadicFunction<Self, Matcher<InnerTBase>,
-                                              &Self::create> {
+  struct Func
+      : public VariadicFunction<Self, Matcher<InnerTBase>, &Self::create> {
     Func() {}
   };
 
@@ -1576,6 +1638,13 @@
   return llvm::makeArrayRef(T.getArgs(), T.getNumArgs());
 }
 
+inline ArrayRef<TemplateArgument>
+getTemplateSpecializationArgs(const FunctionDecl &FD) {
+  if (const auto* TemplateArgs = FD.getTemplateSpecializationArgs())
+    return TemplateArgs->asArray();
+  return ArrayRef<TemplateArgument>();
+}
+
 struct NotEqualsBoundNodePredicate {
   bool operator()(const internal::BoundNodesMap &Nodes) const {
     return Nodes.getNode(ID) != Node;
@@ -1584,8 +1653,60 @@
   ast_type_traits::DynTypedNode Node;
 };
 
+template <typename Ty>
+struct GetBodyMatcher {
+  static const Stmt *get(const Ty &Node) {
+    return Node.getBody();
+  }
+};
+
+template <>
+inline const Stmt *GetBodyMatcher<FunctionDecl>::get(const FunctionDecl &Node) {
+  return Node.doesThisDeclarationHaveABody() ? Node.getBody() : nullptr;
+}
+
+template <typename Ty>
+struct HasSizeMatcher {
+  static bool hasSize(const Ty &Node, unsigned int N) {
+    return Node.getSize() == N;
+  }
+};
+
+template <>
+inline bool HasSizeMatcher<StringLiteral>::hasSize(
+    const StringLiteral &Node, unsigned int N) {
+  return Node.getLength() == N;
+}
+
+template <typename Ty>
+struct GetSourceExpressionMatcher {
+  static const Expr *get(const Ty &Node) {
+    return Node.getSubExpr();
+  }
+};
+
+template <>
+inline const Expr *GetSourceExpressionMatcher<OpaqueValueExpr>::get(
+    const OpaqueValueExpr &Node) {
+  return Node.getSourceExpr();
+}
+
+template <typename Ty>
+struct CompoundStmtMatcher {
+  static const CompoundStmt *get(const Ty &Node) {
+    return &Node;
+  }
+};
+
+template <>
+inline const CompoundStmt *
+CompoundStmtMatcher<StmtExpr>::get(const StmtExpr &Node) {
+  return Node.getSubStmt();
+}
+
+
 } // end namespace internal
 } // end namespace ast_matchers
 } // end namespace clang
 
-#endif
+#endif // LLVM_CLANG_ASTMATCHERS_ASTMATCHERSINTERNAL_H
diff --git a/include/clang/ASTMatchers/Dynamic/VariantValue.h b/include/clang/ASTMatchers/Dynamic/VariantValue.h
index c391b24..5296edd 100644
--- a/include/clang/ASTMatchers/Dynamic/VariantValue.h
+++ b/include/clang/ASTMatchers/Dynamic/VariantValue.h
@@ -21,7 +21,6 @@
 #include "clang/ASTMatchers/ASTMatchersInternal.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/Twine.h"
 #include <memory>
 #include <vector>
 
diff --git a/include/clang/Analysis/Analyses/Dominators.h b/include/clang/Analysis/Analyses/Dominators.h
index 4524aeb..c64a3ca 100644
--- a/include/clang/Analysis/Analyses/Dominators.h
+++ b/include/clang/Analysis/Analyses/Dominators.h
@@ -168,6 +168,7 @@
 namespace llvm {
 template <> struct GraphTraits< ::clang::DomTreeNode* > {
   typedef ::clang::DomTreeNode NodeType;
+  typedef ::clang::DomTreeNode *NodeRef;
   typedef NodeType::iterator  ChildIteratorType;
 
   static NodeType *getEntryNode(NodeType *N) {
diff --git a/include/clang/Analysis/Analyses/FormatString.h b/include/clang/Analysis/Analyses/FormatString.h
index ada3fb0..170cfad 100644
--- a/include/clang/Analysis/Analyses/FormatString.h
+++ b/include/clang/Analysis/Analyses/FormatString.h
@@ -221,6 +221,10 @@
     kind == FreeBSDrArg || kind == FreeBSDyArg; }
   bool isUIntArg() const { return kind >= UIntArgBeg && kind <= UIntArgEnd; }
   bool isAnyIntArg() const { return kind >= IntArgBeg && kind <= UIntArgEnd; }
+  bool isDoubleArg() const {
+    return kind >= DoubleArgBeg && kind <= DoubleArgEnd;
+  }
+
   const char *toString() const;
 
   bool isPrintfKind() const { return IsPrintf; }
diff --git a/include/clang/Analysis/Analyses/LiveVariables.h b/include/clang/Analysis/Analyses/LiveVariables.h
index e17f73a..8db4b0a 100644
--- a/include/clang/Analysis/Analyses/LiveVariables.h
+++ b/include/clang/Analysis/Analyses/LiveVariables.h
@@ -16,7 +16,6 @@
 
 #include "clang/AST/Decl.h"
 #include "clang/Analysis/AnalysisContext.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/ImmutableSet.h"
 
 namespace clang {
diff --git a/include/clang/Analysis/Analyses/ThreadSafetyUtil.h b/include/clang/Analysis/Analyses/ThreadSafetyUtil.h
index 4d3402f..6ea9365 100644
--- a/include/clang/Analysis/Analyses/ThreadSafetyUtil.h
+++ b/include/clang/Analysis/Analyses/ThreadSafetyUtil.h
@@ -58,18 +58,15 @@
   llvm::BumpPtrAllocator *Allocator;
 };
 
-
 } // end namespace til
 } // end namespace threadSafety
 } // end namespace clang
 
-
 inline void *operator new(size_t Sz,
                           clang::threadSafety::til::MemRegionRef &R) {
   return R.allocate(Sz);
 }
 
-
 namespace clang {
 namespace threadSafety {
 
@@ -80,7 +77,6 @@
 
 namespace til {
 
-
 // A simple fixed size array class that does not manage its own memory,
 // suitable for use with bump pointer allocation.
 template <class T> class SimpleArray {
@@ -117,7 +113,6 @@
     Data = A.allocateT<T>(Ncp);
     Capacity = Ncp;
     memcpy(Data, Odata, sizeof(T) * Size);
-    return;
   }
 
   // Reserve space for at least N more items.
@@ -221,10 +216,8 @@
   size_t Capacity;
 };
 
-
 }  // end namespace til
 
-
 // A copy on write vector.
 // The vector can be in one of three states:
 // * invalid -- no operations are permitted.
@@ -346,13 +339,11 @@
   VectorData *Data;
 };
 
-
 inline std::ostream& operator<<(std::ostream& ss, const StringRef str) {
   return ss.write(str.data(), str.size());
 }
 
-
 } // end namespace threadSafety
 } // end namespace clang
 
-#endif  // LLVM_CLANG_THREAD_SAFETY_UTIL_H
+#endif // LLVM_CLANG_THREAD_SAFETY_UTIL_H
diff --git a/include/clang/Analysis/CFG.h b/include/clang/Analysis/CFG.h
index 293990c..02fbf37 100644
--- a/include/clang/Analysis/CFG.h
+++ b/include/clang/Analysis/CFG.h
@@ -999,6 +999,7 @@
 
 template <> struct GraphTraits< ::clang::CFGBlock *> {
   typedef ::clang::CFGBlock NodeType;
+  typedef ::clang::CFGBlock *NodeRef;
   typedef ::clang::CFGBlock::succ_iterator ChildIteratorType;
 
   static NodeType* getEntryNode(::clang::CFGBlock *BB)
@@ -1013,6 +1014,7 @@
 
 template <> struct GraphTraits< const ::clang::CFGBlock *> {
   typedef const ::clang::CFGBlock NodeType;
+  typedef const ::clang::CFGBlock *NodeRef;
   typedef ::clang::CFGBlock::const_succ_iterator ChildIteratorType;
 
   static NodeType* getEntryNode(const clang::CFGBlock *BB)
@@ -1027,6 +1029,7 @@
 
 template <> struct GraphTraits<Inverse< ::clang::CFGBlock*> > {
   typedef ::clang::CFGBlock NodeType;
+  typedef ::clang::CFGBlock *NodeRef;
   typedef ::clang::CFGBlock::const_pred_iterator ChildIteratorType;
 
   static NodeType *getEntryNode(Inverse< ::clang::CFGBlock*> G)
@@ -1041,6 +1044,7 @@
 
 template <> struct GraphTraits<Inverse<const ::clang::CFGBlock*> > {
   typedef const ::clang::CFGBlock NodeType;
+  typedef const ::clang::CFGBlock *NodeRef;
   typedef ::clang::CFGBlock::const_pred_iterator ChildIteratorType;
 
   static NodeType *getEntryNode(Inverse<const ::clang::CFGBlock*> G)
diff --git a/include/clang/Analysis/CallGraph.h b/include/clang/Analysis/CallGraph.h
index eda22a5..241ecd5 100644
--- a/include/clang/Analysis/CallGraph.h
+++ b/include/clang/Analysis/CallGraph.h
@@ -172,6 +172,7 @@
 namespace llvm {
 template <> struct GraphTraits<clang::CallGraphNode*> {
   typedef clang::CallGraphNode NodeType;
+  typedef clang::CallGraphNode *NodeRef;
   typedef clang::CallGraphNode::CallRecord CallRecordTy;
   typedef std::pointer_to_unary_function<CallRecordTy,
                                          clang::CallGraphNode*> CGNDerefFun;
@@ -190,6 +191,7 @@
 
 template <> struct GraphTraits<const clang::CallGraphNode*> {
   typedef const clang::CallGraphNode NodeType;
+  typedef const clang::CallGraphNode *NodeRef;
   typedef NodeType::const_iterator ChildIteratorType;
   static NodeType *getEntryNode(const clang::CallGraphNode *CGN) { return CGN; }
   static inline ChildIteratorType child_begin(NodeType *N) { return N->begin();}
diff --git a/include/clang/Analysis/CloneDetection.h b/include/clang/Analysis/CloneDetection.h
new file mode 100644
index 0000000..9bb4022c
--- /dev/null
+++ b/include/clang/Analysis/CloneDetection.h
@@ -0,0 +1,237 @@
+//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines classes for searching and analyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
+#define LLVM_CLANG_AST_CLONEDETECTION_H
+
+#include "clang/Basic/SourceLocation.h"
+#include "llvm/ADT/StringMap.h"
+
+#include <vector>
+
+namespace clang {
+
+class Stmt;
+class Decl;
+class ASTContext;
+class CompoundStmt;
+
+/// \brief Identifies a list of statements.
+///
+/// Can either identify a single arbitrary Stmt object, a continuous sequence of
+/// child statements inside a CompoundStmt or no statements at all.
+class StmtSequence {
+  /// If this object identifies a sequence of statements inside a CompoundStmt,
+  /// S points to this CompoundStmt. If this object only identifies a single
+  /// Stmt, then S is a pointer to this Stmt.
+  const Stmt *S;
+
+  /// The related ASTContext for S.
+  ASTContext *Context;
+
+  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
+  /// instance is representing the CompoundStmt children inside the array
+  /// [StartIndex, EndIndex).
+  unsigned StartIndex;
+  unsigned EndIndex;
+
+public:
+  /// \brief Constructs a StmtSequence holding multiple statements.
+  ///
+  /// The resulting StmtSequence identifies a continuous sequence of statements
+  /// in the body of the given CompoundStmt. Which statements of the body should
+  /// be identified needs to be specified by providing a start and end index
+  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
+  ///
+  /// \param Stmt A CompoundStmt that contains all statements in its body.
+  /// \param Context The ASTContext for the given CompoundStmt.
+  /// \param StartIndex The inclusive start index in the children array of
+  ///                   \p Stmt
+  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
+  StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
+               unsigned StartIndex, unsigned EndIndex);
+
+  /// \brief Constructs a StmtSequence holding a single statement.
+  ///
+  /// \param Stmt An arbitrary Stmt.
+  /// \param Context The ASTContext for the given Stmt.
+  StmtSequence(const Stmt *Stmt, ASTContext &Context);
+
+  /// \brief Constructs an empty StmtSequence.
+  StmtSequence();
+
+  typedef const Stmt *const *iterator;
+
+  /// Returns an iterator pointing to the first statement in this sequence.
+  iterator begin() const;
+
+  /// Returns an iterator pointing behind the last statement in this sequence.
+  iterator end() const;
+
+  /// Returns the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  const Stmt *front() const {
+    assert(!empty());
+    return begin()[0];
+  }
+
+  /// Returns the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  const Stmt *back() const {
+    assert(!empty());
+    return begin()[size() - 1];
+  }
+
+  /// Returns the number of statements this object holds.
+  unsigned size() const {
+    // A sequence inside a CompoundStmt covers the range [StartIndex, EndIndex).
+    if (holdsSequence())
+      return EndIndex - StartIndex;
+    // Otherwise this holds either exactly one statement or nothing at all.
+    return S != nullptr ? 1 : 0;
+  }
+
+  /// Returns true if and only if this StmtSequence contains no statements.
+  bool empty() const { return size() == 0; }
+
+  /// Returns the related ASTContext for the stored Stmts.
+  ASTContext &getASTContext() const {
+    assert(Context);
+    return *Context;
+  }
+
+  /// Returns true if this object holds a list of statements.
+  bool holdsSequence() const { return EndIndex != 0; }
+
+  /// Returns the start sourcelocation of the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getStartLoc() const;
+
+  /// Returns the end sourcelocation of the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getEndLoc() const;
+
+  bool operator==(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) ==
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  bool operator!=(const StmtSequence &Other) const {
+    // Defined as the negation of operator== so the two can never disagree.
+    return !(*this == Other);
+  }
+
+  /// Returns true if and only if this sequence covers a source range that
+  /// contains the source range of the given sequence \p Other.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object
+  /// and passed a non-empty StmtSequence object.
+  bool contains(const StmtSequence &Other) const;
+};
+
+/// \brief Searches for clones in source code.
+///
+/// First, this class needs the declarations to search, which are passed one
+/// at a time via \p analyzeCodeBody . It will then generate and store search
+/// data for all statements inside the body of each given declaration.
+/// Afterwards the generated data can be used to find code clones by calling
+/// \p findClones .
+///
+/// This class only searches for clones in executable source code
+/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
+/// are not supported.
+class CloneDetector {
+public:
+  typedef unsigned DataPiece;
+
+  /// Holds the data about a StmtSequence that is needed during the search for
+  /// code clones.
+  struct CloneSignature {
+    /// \brief Holds all relevant data of a StmtSequence.
+    ///
+    /// StmtSequences with equal Data can be considered clones of each other.
+    std::vector<DataPiece> Data;
+
+    /// \brief The complexity of the StmtSequence.
+    ///
+    /// This scalar value serves as a simple way of filtering clones that are
+    /// too small to be reported. A greater value indicates that the related
+    /// StmtSequence is probably more interesting to the user.
+    unsigned Complexity;
+
+    /// \brief Creates an empty CloneSignature without any data.
+    CloneSignature() : Complexity(1) {}
+
+    /// \brief Creates a CloneSignature from the given data and complexity.
+    CloneSignature(const std::vector<DataPiece> &Data, unsigned Complexity)
+        : Data(Data), Complexity(Complexity) {}
+
+    /// \brief Adds the data from the given CloneSignature to this one.
+    void add(const CloneSignature &Other) {
+      Data.insert(Data.end(), Other.Data.begin(), Other.Data.end());
+      Complexity += Other.Complexity;
+    }
+  };
+
+  /// Holds a group of StmtSequences that are clones of each other and the
+  /// complexity value (see CloneSignature::Complexity) that all stored
+  /// StmtSequences have in common.
+  struct CloneGroup {
+    /// The StmtSequences that belong to this group.
+    std::vector<StmtSequence> Sequences;
+    unsigned Complexity;
+
+    /// \brief Creates a group that initially contains only \p Seq.
+    CloneGroup(const StmtSequence &Seq, unsigned Complexity)
+        : Sequences(1, Seq), Complexity(Complexity) {}
+
+    /// \brief Returns false if and only if this group should be skipped when
+    ///        searching for clones.
+    bool isValid() const {
+      // Groups with fewer than two members cannot contain a clone pair.
+      return Sequences.size() >= 2;
+    }
+  };
+
+  /// \brief Generates and stores search data for all statements in the body of
+  ///        the given Decl.
+  void analyzeCodeBody(const Decl *D);
+
+  /// \brief Stores the CloneSignature to allow future querying.
+  void add(const StmtSequence &S, const CloneSignature &Signature);
+
+  /// \brief Searches the provided statements for clones.
+  ///
+  /// \param Result Output parameter that is filled with a list of found
+  ///               clone groups. Each group contains multiple StmtSequences
+  ///               that were identified to be clones of each other.
+  /// \param MinGroupComplexity Only return clones which have at least this
+  ///                           complexity value.
+  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity);
+
+private:
+  /// Stores all found clone groups including invalid groups with only a single
+  /// statement.
+  std::vector<CloneGroup> CloneGroups;
+  /// Maps search data to its related index in the \p CloneGroups vector.
+  llvm::StringMap<std::size_t> CloneGroupIndexes;
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_AST_CLONEDETECTION_H
diff --git a/include/clang/Basic/AddressSpaces.h b/include/clang/Basic/AddressSpaces.h
index 8dd7566..63df61b 100644
--- a/include/clang/Basic/AddressSpaces.h
+++ b/include/clang/Basic/AddressSpaces.h
@@ -25,7 +25,7 @@
 /// This uses a high starting offset so as not to conflict with any address
 /// space used by a target.
 enum ID {
-  Offset = 0xFFFF00,
+  Offset = 0x7FFF00,
 
   opencl_global = Offset,
   opencl_local,
diff --git a/include/clang/Basic/Attr.td b/include/clang/Basic/Attr.td
index 038afc3..e12450b 100644
--- a/include/clang/Basic/Attr.td
+++ b/include/clang/Basic/Attr.td
@@ -82,6 +82,8 @@
                                 S->getKind() != Decl::ImplicitParam &&
                                 S->getKind() != Decl::ParmVar &&
                                 S->getKind() != Decl::NonTypeTemplateParm}]>;
+def NonParmVar : SubsetSubject<Var,
+                               [{S->getKind() != Decl::ParmVar}]>;
 def NonBitField : SubsetSubject<Field,
                                 [{!S->isBitField()}]>;
 
@@ -242,6 +244,8 @@
 def Borland : LangOpt<"Borland">;
 def CUDA : LangOpt<"CUDA">;
 def COnly : LangOpt<"CPlusPlus", 1>;
+def OpenCL : LangOpt<"OpenCL">;
+def RenderScript : LangOpt<"RenderScript">;
 
 // Defines targets for target-specific attributes. The list of strings should
 // specify architectures for which the target applies, based off the ArchType
@@ -314,6 +318,9 @@
   let ASTNode = 0;
 }
 
+/// A stmt attribute is not processed on a declaration or a type.
+class StmtAttr : Attr;
+
 /// An inheritable attribute is inherited by later redeclarations.
 class InheritableAttr : Attr;
 
@@ -358,6 +365,14 @@
 // Attributes begin here
 //
 
+def AbiTag : Attr {
+  let Spellings = [GCC<"abi_tag">];
+  let Args = [VariadicStringArgument<"Tags">];
+  let Subjects = SubjectList<[Struct, Var, Function, Namespace], ErrorDiag,
+      "ExpectedStructClassVariableFunctionOrInlineNamespace">;
+  let Documentation = [AbiTagsDocs];
+}
+
 def AddressSpace : TypeAttr {
   let Spellings = [GNU<"address_space">];
   let Args = [IntArgument<"AddressSpace">];
@@ -416,6 +431,22 @@
   let Documentation = [Undocumented];
 }
 
+def XRayInstrument : InheritableAttr {
+  let Spellings = [GNU<"xray_always_instrument">,
+                   CXX11<"clang", "xray_always_instrument">,
+                   GNU<"xray_never_instrument">,
+                   CXX11<"clang", "xray_never_instrument">];
+  let Subjects = SubjectList<[CXXMethod, ObjCMethod, Function], WarnDiag,
+                              "ExpectedFunctionOrMethod">;
+  let Accessors = [Accessor<"alwaysXRayInstrument",
+                     [GNU<"xray_always_instrument">,
+                      CXX11<"clang", "xray_always_instrument">]>,
+                   Accessor<"neverXRayInstrument",
+                     [GNU<"xray_never_instrument">,
+                      CXX11<"clang", "xray_never_instrument">]>];
+  let Documentation = [XRayDocs];
+}
+
 def TLSModel : InheritableAttr {
   let Spellings = [GCC<"tls_model">];
   let Subjects = SubjectList<[TLSVar], ErrorDiag, "ExpectedTLSVar">;
@@ -665,20 +696,27 @@
   let Documentation = [Undocumented];
 }
 
+def OpenCLUnrollHint : InheritableAttr {
+  let Spellings = [GNU<"opencl_unroll_hint">];
+  let Args = [UnsignedArgument<"UnrollHint">];
+  let Documentation = [OpenCLUnrollHintDocs];
+}
+
 // This attribute is both a type attribute, and a declaration attribute (for
 // parameter variables).
-def OpenCLImageAccess : Attr {
+def OpenCLAccess : Attr {
   let Spellings = [Keyword<"__read_only">, Keyword<"read_only">,
                    Keyword<"__write_only">, Keyword<"write_only">,
                    Keyword<"__read_write">, Keyword<"read_write">];
-  let Subjects = SubjectList<[ParmVar], ErrorDiag>;
+  let Subjects = SubjectList<[ParmVar, TypedefName], ErrorDiag,
+                             "ExpectedParameterOrTypedef">;
   let Accessors = [Accessor<"isReadOnly", [Keyword<"__read_only">,
                                            Keyword<"read_only">]>,
                    Accessor<"isReadWrite", [Keyword<"__read_write">,
                                             Keyword<"read_write">]>,
                    Accessor<"isWriteOnly", [Keyword<"__write_only">,
                                             Keyword<"write_only">]>];
-  let Documentation = [Undocumented];
+  let Documentation = [OpenCLAccessDocs];
 }
 
 def OpenCLPrivateAddressSpace : TypeAttr {
@@ -706,6 +744,21 @@
   let Documentation = [OpenCLAddressSpaceGenericDocs];
 }
 
+def OpenCLNoSVM : Attr {
+  let Spellings = [GNU<"nosvm">];
+  let Subjects = SubjectList<[Var]>;
+  let Documentation = [OpenCLNoSVMDocs];
+  let LangOpts = [OpenCL];
+  let ASTNode = 0;
+}
+
+def RenderScriptKernel : Attr {
+  let Spellings = [GNU<"kernel">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [RenderScriptKernelAttributeDocs];
+  let LangOpts = [RenderScript];
+}
+
 def Deprecated : InheritableAttr {
   let Spellings = [GCC<"deprecated">, Declspec<"deprecated">,
                    CXX11<"","deprecated", 201309>];
@@ -723,6 +776,12 @@
   let Documentation = [Undocumented];
 }
 
+def EmptyBases : InheritableAttr, TargetSpecificAttr<TargetMicrosoftCXXABI> {
+  let Spellings = [Declspec<"empty_bases">];
+  let Subjects = SubjectList<[CXXRecord]>;
+  let Documentation = [EmptyBasesDocs];
+}
+
 def EnableIf : InheritableAttr {
   let Spellings = [GNU<"enable_if">];
   let Subjects = SubjectList<[Function]>;
@@ -739,8 +798,9 @@
   let Documentation = [Undocumented];
 }
 
-def FallThrough : Attr {
-  let Spellings = [CXX11<"clang", "fallthrough">];
+def FallThrough : StmtAttr {
+  let Spellings = [CXX11<"", "fallthrough", 201603>,
+                   CXX11<"clang", "fallthrough">];
 //  let Subjects = [NullStmt];
   let Documentation = [FallthroughDocs];
 }
@@ -832,12 +892,26 @@
   let Documentation = [Undocumented];
 }
 
+def IFunc : Attr {
+  let Spellings = [GCC<"ifunc">];
+  let Args = [StringArgument<"Resolver">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [IFuncDocs];
+}
+
 def Restrict : InheritableAttr {
   let Spellings = [Declspec<"restrict">, GCC<"malloc">];
   let Subjects = SubjectList<[Function]>;
   let Documentation = [Undocumented];
 }
 
+def LayoutVersion : InheritableAttr, TargetSpecificAttr<TargetMicrosoftCXXABI> {
+  let Spellings = [Declspec<"layout_version">];
+  let Args = [UnsignedArgument<"Version">];
+  let Subjects = SubjectList<[CXXRecord]>;
+  let Documentation = [LayoutVersionDocs];
+}
+
 def MaxFieldAlignment : InheritableAttr {
   // This attribute has no spellings as it is only ever created implicitly.
   let Spellings = [];
@@ -892,8 +966,8 @@
 
 def Mode : Attr {
   let Spellings = [GCC<"mode">];
-  let Subjects = SubjectList<[Var, TypedefName, Field], ErrorDiag,
-                             "ExpectedVariableFieldOrTypedef">;
+  let Subjects = SubjectList<[Var, Enum, TypedefName, Field], ErrorDiag,
+                             "ExpectedVariableEnumFieldOrTypedef">;
   let Args = [IdentifierArgument<"Mode">];
   let Documentation = [Undocumented];
 }
@@ -943,7 +1017,9 @@
 
 def NoDebug : InheritableAttr {
   let Spellings = [GCC<"nodebug">];
-  let Documentation = [Undocumented];
+  let Subjects = SubjectList<[FunctionLike, ObjCMethod, NonParmVar], WarnDiag,
+                              "ExpectedVariableOrFunction">;
+  let Documentation = [NoDebugDocs];
 }
 
 def NoDuplicate : InheritableAttr {
@@ -1562,11 +1638,11 @@
 }
 
 def Unused : InheritableAttr {
-  let Spellings = [GCC<"unused">];
-  let Subjects = SubjectList<[Var, ObjCIvar, Type, Label, Field, ObjCMethod,
-                              FunctionLike], WarnDiag,
-                             "ExpectedVariableFunctionOrLabel">;
-  let Documentation = [Undocumented];
+  let Spellings = [CXX11<"", "maybe_unused", 201603>, GCC<"unused">];
+  let Subjects = SubjectList<[Var, ObjCIvar, Type, Enum, EnumConstant, Label,
+                              Field, ObjCMethod, FunctionLike], WarnDiag,
+                             "ExpectedForMaybeUnused">;
+  let Documentation = [WarnMaybeUnusedDocs];
 }
 
 def Used : InheritableAttr {
@@ -1627,11 +1703,12 @@
 }
 
 def WarnUnusedResult : InheritableAttr {
-  let Spellings = [GCC<"warn_unused_result">,
-                   CXX11<"clang", "warn_unused_result">];
-  let Subjects = SubjectList<[ObjCMethod, CXXRecord, FunctionLike], WarnDiag,
-                             "ExpectedFunctionMethodOrClass">;
-  let Documentation = [Undocumented];
+  let Spellings = [CXX11<"", "nodiscard", 201603>,
+                   CXX11<"clang", "warn_unused_result">,
+                   GCC<"warn_unused_result">];
+  let Subjects = SubjectList<[ObjCMethod, Enum, CXXRecord, FunctionLike],
+                             WarnDiag, "ExpectedFunctionMethodEnumOrClass">;
+  let Documentation = [WarnUnusedResultsDocs];
 }
 
 def Weak : InheritableAttr {
@@ -1653,6 +1730,12 @@
   let Documentation = [Undocumented];
 }
 
+def LTOVisibilityPublic : InheritableAttr {
+  let Spellings = [CXX11<"clang", "lto_visibility_public">];
+  let Subjects = SubjectList<[Record]>;
+  let Documentation = [LTOVisibilityDocs];
+}
+
 def AnyX86Interrupt : InheritableAttr, TargetSpecificAttr<TargetAnyX86> {
   // NOTE: If you add any additional spellings, ARMInterrupt's,
   // MSP430Interrupt's and MipsInterrupt's spellings must match.
@@ -2081,14 +2164,14 @@
 
 def DLLExport : InheritableAttr, TargetSpecificAttr<TargetWindows> {
   let Spellings = [Declspec<"dllexport">, GCC<"dllexport">];
-  let Subjects = SubjectList<[Function, Var, CXXRecord]>;
-  let Documentation = [Undocumented];
+  let Subjects = SubjectList<[Function, Var, CXXRecord, ObjCInterface]>;
+  let Documentation = [DLLExportDocs];
 }
 
 def DLLImport : InheritableAttr, TargetSpecificAttr<TargetWindows> {
   let Spellings = [Declspec<"dllimport">, GCC<"dllimport">];
-  let Subjects = SubjectList<[Function, Var, CXXRecord]>;
-  let Documentation = [Undocumented];
+  let Subjects = SubjectList<[Function, Var, CXXRecord, ObjCInterface]>;
+  let Documentation = [DLLImportDocs];
 }
 
 def SelectAny : InheritableAttr {
@@ -2191,10 +2274,6 @@
   }];
 }
 
-def Unaligned : IgnoredAttr {
-  let Spellings = [Keyword<"__unaligned">];
-}
-
 def LoopHint : Attr {
   /// #pragma clang loop <option> directive
   /// vectorize: vectorizes loop operations if State == Enable.
@@ -2203,6 +2282,7 @@
   /// interleave_count: interleaves 'Value' loop interations.
   /// unroll: fully unroll loop if State == Enable.
   /// unroll_count: unrolls loop 'Value' times.
+  /// distribute: attempt to distribute loop if State == Enable
 
   /// #pragma unroll <argument> directive
   /// <no arg>: fully unrolls loop.
@@ -2215,9 +2295,9 @@
   /// State of the loop optimization specified by the spelling.
   let Args = [EnumArgument<"Option", "OptionType",
                           ["vectorize", "vectorize_width", "interleave", "interleave_count",
-                           "unroll", "unroll_count"],
+                           "unroll", "unroll_count", "distribute"],
                           ["Vectorize", "VectorizeWidth", "Interleave", "InterleaveCount",
-                           "Unroll", "UnrollCount"]>,
+                           "Unroll", "UnrollCount", "Distribute"]>,
               EnumArgument<"State", "LoopHintState",
                            ["enable", "disable", "numeric", "assume_safety", "full"],
                            ["Enable", "Disable", "Numeric", "AssumeSafety", "Full"]>,
@@ -2232,6 +2312,7 @@
     case InterleaveCount: return "interleave_count";
     case Unroll: return "unroll";
     case UnrollCount: return "unroll_count";
+    case Distribute: return "distribute";
     }
     llvm_unreachable("Unhandled LoopHint option.");
   }
@@ -2301,6 +2382,98 @@
   let Documentation = [Undocumented];
 }
 
+def OMPCaptureNoInit : InheritableAttr {
+  // This attribute has no spellings as it is only ever created implicitly.
+  let Spellings = [];
+  let SemaHandler = 0;
+  let Documentation = [Undocumented];
+}
+
+def OMPDeclareSimdDecl : Attr {
+  let Spellings = [Pragma<"omp", "declare simd">];
+  let Subjects = SubjectList<[Function]>;
+  let SemaHandler = 0;
+  let HasCustomParsing = 1;
+  let Documentation = [OMPDeclareSimdDocs];
+  let Args = [
+    EnumArgument<"BranchState", "BranchStateTy",
+                 [ "", "inbranch", "notinbranch" ],
+                 [ "BS_Undefined", "BS_Inbranch", "BS_Notinbranch" ]>,
+    ExprArgument<"Simdlen">, VariadicExprArgument<"Uniforms">,
+    VariadicExprArgument<"Aligneds">, VariadicExprArgument<"Alignments">,
+    VariadicExprArgument<"Linears">, VariadicUnsignedArgument<"Modifiers">,
+    VariadicExprArgument<"Steps">
+  ];
+  let AdditionalMembers = [{
+    void printPrettyPragma(raw_ostream & OS, const PrintingPolicy &Policy)
+        const {
+      if (getBranchState() != BS_Undefined)
+        OS << ConvertBranchStateTyToStr(getBranchState()) << " ";
+      if (auto *E = getSimdlen()) {
+        OS << "simdlen(";
+        E->printPretty(OS, nullptr, Policy);
+        OS << ") ";
+      }
+      if (uniforms_size() > 0) {
+        OS << "uniform";
+        StringRef Sep = "(";
+        for (auto *E : uniforms()) {
+          OS << Sep;
+          E->printPretty(OS, nullptr, Policy);
+          Sep = ", ";
+        }
+        OS << ") ";
+      }
+      alignments_iterator NI = alignments_begin();
+      for (auto *E : aligneds()) {
+        OS << "aligned(";
+        E->printPretty(OS, nullptr, Policy);
+        if (*NI) {
+          OS << ": ";
+          (*NI)->printPretty(OS, nullptr, Policy);
+        }
+        OS << ") ";
+        ++NI;
+      }
+      steps_iterator I = steps_begin();
+      modifiers_iterator MI = modifiers_begin();
+      for (auto *E : linears()) {
+        OS << "linear(";
+        if (*MI != OMPC_LINEAR_unknown)
+          OS << getOpenMPSimpleClauseTypeName(OMPC_linear, *MI) << "(";
+        E->printPretty(OS, nullptr, Policy);
+        if (*MI != OMPC_LINEAR_unknown)
+          OS << ")";
+        if (*I) {
+          OS << ": ";
+          (*I)->printPretty(OS, nullptr, Policy);
+        }
+        OS << ") ";
+        ++I;
+        ++MI;
+      }
+    }
+  }];
+}
+
+def OMPDeclareTargetDecl : Attr {
+  let Spellings = [Pragma<"omp", "declare target">];
+  let SemaHandler = 0;
+  let Documentation = [OMPDeclareTargetDocs];
+  let Args = [
+    EnumArgument<"MapType", "MapTypeTy",
+                 [ "to", "link" ],
+                 [ "MT_To", "MT_Link" ]>
+  ];
+  let AdditionalMembers = [{
+    void printPrettyPragma(raw_ostream &OS, const PrintingPolicy &Policy) const {
+      // Use fake syntax because it is for testing and debugging purpose only.
+      if (getMapType() != MT_To)
+        OS << ConvertMapTypeTyToStr(getMapType()) << " ";
+    }
+  }];
+}
+
 def InternalLinkage : InheritableAttr {
   let Spellings = [GNU<"internal_linkage">, CXX11<"clang", "internal_linkage">];
   let Subjects = SubjectList<[Var, Function, CXXRecord]>;
diff --git a/include/clang/Basic/AttrDocs.td b/include/clang/Basic/AttrDocs.td
index a971827..9145d19 100644
--- a/include/clang/Basic/AttrDocs.td
+++ b/include/clang/Basic/AttrDocs.td
@@ -67,6 +67,34 @@
   }];
 }
 
+def DLLExportDocs : Documentation {
+  let Category = DocCatVariable;
+  let Content = [{
+The ``__declspec(dllexport)`` attribute declares a variable, function, or
+Objective-C interface to be exported from the module.  It is available under the
+``-fdeclspec`` flag for compatibility with various compilers.  The primary use
+is for COFF object files which explicitly specify what interfaces are available
+for external use.  See the dllexport_ documentation on MSDN for more
+information.
+
+.. _dllexport: https://msdn.microsoft.com/en-us/library/3y1sfaz2.aspx
+  }];
+}
+
+def DLLImportDocs : Documentation {
+  let Category = DocCatVariable;
+  let Content = [{
+The ``__declspec(dllimport)`` attribute declares a variable, function, or
+Objective-C interface to be imported from an external module.  It is available
+under the ``-fdeclspec`` flag for compatibility with various compilers.  The
+primary use is for COFF object files which explicitly specify what interfaces
+are imported from external modules.  See the dllimport_ documentation on MSDN
+for more information.
+
+.. _dllimport: https://msdn.microsoft.com/en-us/library/3y1sfaz2.aspx
+  }];
+}
+
 def ThreadDocs : Documentation {
   let Category = DocCatVariable;
   let Content = [{
@@ -260,6 +288,55 @@
 not ODR-equivalent.
 
 Query for this feature with ``__has_attribute(enable_if)``.
+
+Note that functions with one or more ``enable_if`` attributes may not have
+their address taken, unless all of the conditions specified by said
+``enable_if`` are constants that evaluate to ``true``. For example:
+
+.. code-block:: c
+
+  const int TrueConstant = 1;
+  const int FalseConstant = 0;
+  int f(int a) __attribute__((enable_if(a > 0, "")));
+  int g(int a) __attribute__((enable_if(a == 0 || a != 0, "")));
+  int h(int a) __attribute__((enable_if(1, "")));
+  int i(int a) __attribute__((enable_if(TrueConstant, "")));
+  int j(int a) __attribute__((enable_if(FalseConstant, "")));
+
+  void fn() {
+    int (*ptr)(int);
+    ptr = &f; // error: 'a > 0' is not always true
+    ptr = &g; // error: 'a == 0 || a != 0' is not a truthy constant
+    ptr = &h; // OK: 1 is a truthy constant
+    ptr = &i; // OK: 'TrueConstant' is a truthy constant
+    ptr = &j; // error: 'FalseConstant' is a constant, but not truthy
+  }
+
+Because ``enable_if`` evaluation happens during overload resolution,
+``enable_if`` may give unintuitive results when used with templates, depending
+on when overloads are resolved. In the example below, clang will emit a
+diagnostic about no viable overloads for ``foo`` in ``bar``, but not in ``baz``:
+
+.. code-block:: c++
+
+  double foo(int i) __attribute__((enable_if(i > 0, "")));
+  void *foo(int i) __attribute__((enable_if(i <= 0, "")));
+  template <int I>
+  auto bar() { return foo(I); }
+
+  template <typename T>
+  auto baz() { return foo(T::number); }
+
+  struct WithNumber { constexpr static int number = 1; };
+  void callThem() {
+    bar<sizeof(WithNumber)>();
+    baz<WithNumber>();
+  }
+
+This is because, in ``bar``, ``foo`` is resolved prior to template
+instantiation, so the value for ``I`` isn't known (thus, both ``enable_if``
+conditions for ``foo`` fail). However, in ``baz``, ``foo`` is resolved during
+template instantiation, so the value for ``T::number`` is known.
   }];
 }
 
@@ -471,6 +548,15 @@
   }];
 }
 
+def NoDebugDocs : Documentation {
+  let Category = DocCatVariable;
+  let Content = [{
+The ``nodebug`` attribute allows you to suppress debugging information for a
+function or method, or for a variable that is neither a parameter nor a
+non-static data member.
+  }];
+}
+
 def NoDuplicateDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
@@ -743,11 +829,63 @@
   }];
 }
 
+def WarnMaybeUnusedDocs : Documentation {
+  let Category = DocCatVariable;
+  let Heading = "maybe_unused, unused, gnu::unused";
+  let Content = [{
+When passing the ``-Wunused`` flag to Clang, entities that are unused by the
+program may be diagnosed. The ``[[maybe_unused]]`` (or
+``__attribute__((unused))``) attribute can be used to silence such diagnostics
+when the entity cannot be removed. For instance, a local variable may exist
+solely for use in an ``assert()`` statement, which makes the local variable
+unused when ``NDEBUG`` is defined.
+
+The attribute may be applied to the declaration of a class, a typedef, a
+variable, a function or method, a function parameter, an enumeration, an
+enumerator, a non-static data member, or a label.
+
+.. code-block:: c++
+
+  #include <cassert>
+  [[maybe_unused]] void f([[maybe_unused]] bool thing1,
+                          [[maybe_unused]] bool thing2) {
+    [[maybe_unused]] bool b = thing1 && thing2;
+    assert(b);
+  }
+  }];
+}
+
+def WarnUnusedResultsDocs : Documentation {
+  let Category = DocCatFunction;
+  let Heading = "nodiscard, warn_unused_result, clang::warn_unused_result, gnu::warn_unused_result";
+  let Content  = [{
+Clang supports the ability to diagnose when the results of a function call
+expression are discarded under suspicious circumstances. A diagnostic is
+generated when a function or its return type is marked with ``[[nodiscard]]``
+(or ``__attribute__((warn_unused_result))``) and the function call appears as a
+potentially-evaluated discarded-value expression that is not explicitly cast to
+``void``.
+
+.. code-block:: c++
+
+  struct [[nodiscard]] error_info { /*...*/ };
+  error_info enable_missile_safety_mode();
+  void launch_missiles();
+  void test_missiles() {
+    enable_missile_safety_mode(); // diagnoses
+    launch_missiles();
+  }
+  error_info &foo();
+  void f() { foo(); } // Does not diagnose, error_info is a reference.  
+  }];
+}
+
 def FallthroughDocs : Documentation {
   let Category = DocCatStmt;
+  let Heading = "fallthrough, clang::fallthrough";
   let Content = [{
-The ``clang::fallthrough`` attribute is used along with the
-``-Wimplicit-fallthrough`` argument to annotate intentional fall-through
+The ``fallthrough`` (or ``clang::fallthrough``) attribute is used
+to annotate intentional fall-through
 between switch labels.  It can only be applied to a null statement placed at a
 point of execution between any statement and the next switch label.  It is
 common to mark these places with a specific comment, but this attribute is
@@ -758,6 +896,10 @@
 where ``break;`` can, but only if there are no statements on the execution path
 between it and the next switch label.
 
+By default, Clang does not warn on unannotated fallthrough from one ``switch``
+case to another. Diagnostics on fallthrough without a corresponding annotation
+can be enabled with the ``-Wimplicit-fallthrough`` argument.
+
 Here is an example:
 
 .. code-block:: c++
@@ -1196,7 +1338,8 @@
 def DocCatTypeSafety : DocumentationCategory<"Type Safety Checking"> {
   let Content = [{
 Clang supports additional attributes to enable checking type safety properties
-that can't be enforced by the C type system.  Use cases include:
+that can't be enforced by the C type system. To see warnings produced by these
+checks, ensure that -Wtype-safety is enabled. Use cases include:
 
 * MPI library implementations, where these attributes enable checking that
   the buffer type matches the passed ``MPI_Datatype``;
@@ -1234,18 +1377,31 @@
 Use ``__attribute__((argument_with_type_tag(arg_kind, arg_idx,
 type_tag_idx)))`` on a function declaration to specify that the function
 accepts a type tag that determines the type of some other argument.
-``arg_kind`` is an identifier that should be used when annotating all
-applicable type tags.
 
 This attribute is primarily useful for checking arguments of variadic functions
 (``pointer_with_type_tag`` can be used in most non-variadic cases).
 
+In the attribute prototype above:
+  * ``arg_kind`` is an identifier that should be used when annotating all
+    applicable type tags.
+  * ``arg_idx`` provides the position of a function argument. The expected type of
+    this function argument will be determined by the function argument specified
+    by ``type_tag_idx``. In the code example below, "3" means that the type of the
+    function's third argument will be determined by ``type_tag_idx``.
+  * ``type_tag_idx`` provides the position of a function argument. This function
+    argument will be a type tag. The type tag will determine the expected type of
+    the argument specified by ``arg_idx``. In the code example below, "2" means
+    that the type tag associated with the function's second argument should agree
+    with the type of the argument specified by ``arg_idx``.
+
 For example:
 
 .. code-block:: c++
 
   int fcntl(int fd, int cmd, ...)
       __attribute__(( argument_with_type_tag(fcntl,3,2) ));
+  // The function's second argument will be a type tag; this type tag will
+  // determine the expected type of the function's third argument.
   }];
 }
 
@@ -1257,85 +1413,140 @@
 on a function declaration to specify that the function accepts a type tag that
 determines the pointee type of some other pointer argument.
 
+In the attribute prototype above:
+  * ``ptr_kind`` is an identifier that should be used when annotating all
+    applicable type tags.
+  * ``ptr_idx`` provides the position of a function argument; this function
+    argument will have a pointer type. The expected pointee type of this pointer
+    type will be determined by the function argument specified by
+    ``type_tag_idx``. In the code example below, "1" means that the pointee type
+    of the function's first argument will be determined by ``type_tag_idx``.
+  * ``type_tag_idx`` provides the position of a function argument; this function
+    argument will be a type tag. The type tag will determine the expected pointee
+    type of the pointer argument specified by ``ptr_idx``. In the code example
+    below, "3" means that the type tag associated with the function's third
+    argument should agree with the pointee type of the pointer argument specified
+    by ``ptr_idx``.
+
 For example:
 
 .. code-block:: c++
 
+  typedef int MPI_Datatype;
   int MPI_Send(void *buf, int count, MPI_Datatype datatype /*, other args omitted */)
       __attribute__(( pointer_with_type_tag(mpi,1,3) ));
+  // The function's 3rd argument will be a type tag; this type tag will
+  // determine the expected pointee type of the function's 1st argument.
   }];
 }
 
 def TypeTagForDatatypeDocs : Documentation {
   let Category = DocCatTypeSafety;
   let Content = [{
+When declaring a variable, use
+``__attribute__((type_tag_for_datatype(kind, type)))`` to create a type tag that
+is tied to the ``type`` argument given to the attribute.
+
+In the attribute prototype above:
+  * ``kind`` is an identifier that should be used when annotating all applicable
+    type tags.
+  * ``type`` indicates the name of the type.
+
 Clang supports annotating type tags of two forms.
 
-* **Type tag that is an expression containing a reference to some declared
-  identifier.** Use ``__attribute__((type_tag_for_datatype(kind, type)))`` on a
-  declaration with that identifier:
+  * **Type tag that is a reference to a declared identifier.**
+    Use ``__attribute__((type_tag_for_datatype(kind, type)))`` when declaring that
+    identifier:
 
-  .. code-block:: c++
+    .. code-block:: c++
 
-    extern struct mpi_datatype mpi_datatype_int
-        __attribute__(( type_tag_for_datatype(mpi,int) ));
-    #define MPI_INT ((MPI_Datatype) &mpi_datatype_int)
+      typedef int MPI_Datatype;
+      extern struct mpi_datatype mpi_datatype_int
+          __attribute__(( type_tag_for_datatype(mpi,int) ));
+      #define MPI_INT ((MPI_Datatype) &mpi_datatype_int)
+      // &mpi_datatype_int is a type tag. It is tied to type "int".
 
-* **Type tag that is an integral literal.** Introduce a ``static const``
-  variable with a corresponding initializer value and attach
-  ``__attribute__((type_tag_for_datatype(kind, type)))`` on that declaration,
-  for example:
+  * **Type tag that is an integral literal.**
+    Declare a ``static const`` variable with an initializer value and attach
+    ``__attribute__((type_tag_for_datatype(kind, type)))`` on that declaration:
 
-  .. code-block:: c++
+    .. code-block:: c++
 
-    #define MPI_INT ((MPI_Datatype) 42)
-    static const MPI_Datatype mpi_datatype_int
-        __attribute__(( type_tag_for_datatype(mpi,int) )) = 42
+      typedef int MPI_Datatype;
+      static const MPI_Datatype mpi_datatype_int
+          __attribute__(( type_tag_for_datatype(mpi,int) )) = 42;
+      #define MPI_INT ((MPI_Datatype) 42)
+      // The number 42 is a type tag. It is tied to type "int".
 
-The attribute also accepts an optional third argument that determines how the
-expression is compared to the type tag.  There are two supported flags:
 
-* ``layout_compatible`` will cause types to be compared according to
-  layout-compatibility rules (C++11 [class.mem] p 17, 18).  This is
-  implemented to support annotating types like ``MPI_DOUBLE_INT``.
+The ``type_tag_for_datatype`` attribute also accepts an optional third argument
+that determines how the type of the function argument specified by either
+``arg_idx`` or ``ptr_idx`` is compared against the type associated with the type
+tag. (Recall that for the ``argument_with_type_tag`` attribute, the type of the
+function argument specified by ``arg_idx`` is compared against the type
+associated with the type tag. Also recall that for the ``pointer_with_type_tag``
+attribute, the pointee type of the function argument specified by ``ptr_idx`` is
+compared against the type associated with the type tag.) There are two supported
+values for this optional third argument:
 
-  For example:
+  * ``layout_compatible`` will cause types to be compared according to
+    layout-compatibility rules (In C++11 [class.mem] p 17, 18, see the
+    layout-compatibility rules for two standard-layout struct types and for two
+    standard-layout union types). This is useful when creating a type tag
+    associated with a struct or union type. For example:
 
-  .. code-block:: c++
+    .. code-block:: c++
 
-    /* In mpi.h */
-    struct internal_mpi_double_int { double d; int i; };
-    extern struct mpi_datatype mpi_datatype_double_int
-        __attribute__(( type_tag_for_datatype(mpi, struct internal_mpi_double_int, layout_compatible) ));
+      /* In mpi.h */
+      typedef int MPI_Datatype;
+      struct internal_mpi_double_int { double d; int i; };
+      extern struct mpi_datatype mpi_datatype_double_int
+          __attribute__(( type_tag_for_datatype(mpi,
+                          struct internal_mpi_double_int, layout_compatible) ));
 
-    #define MPI_DOUBLE_INT ((MPI_Datatype) &mpi_datatype_double_int)
+      #define MPI_DOUBLE_INT ((MPI_Datatype) &mpi_datatype_double_int)
 
-    /* In user code */
-    struct my_pair { double a; int b; };
-    struct my_pair *buffer;
-    MPI_Send(buffer, 1, MPI_DOUBLE_INT /*, ...  */); // no warning
+      int MPI_Send(void *buf, int count, MPI_Datatype datatype, ...)
+          __attribute__(( pointer_with_type_tag(mpi,1,3) ));
 
-    struct my_int_pair { int a; int b; }
-    struct my_int_pair *buffer2;
-    MPI_Send(buffer2, 1, MPI_DOUBLE_INT /*, ...  */); // warning: actual buffer element
-                                                      // type 'struct my_int_pair'
-                                                      // doesn't match specified MPI_Datatype
+      /* In user code */
+      struct my_pair { double a; int b; };
+      struct my_pair *buffer;
+      MPI_Send(buffer, 1, MPI_DOUBLE_INT /*, ...  */); // no warning because the
+                                                       // layout of my_pair is
+                                                       // compatible with that of
+                                                       // internal_mpi_double_int
 
-* ``must_be_null`` specifies that the expression should be a null pointer
-  constant, for example:
+      struct my_int_pair { int a; int b; };
+      struct my_int_pair *buffer2;
+      MPI_Send(buffer2, 1, MPI_DOUBLE_INT /*, ...  */); // warning because the
+                                                        // layout of my_int_pair
+                                                        // does not match that of
+                                                        // internal_mpi_double_int
 
-  .. code-block:: c++
+  * ``must_be_null`` specifies that the function argument specified by either
+    ``arg_idx`` (for the ``argument_with_type_tag`` attribute) or ``ptr_idx`` (for
+    the ``pointer_with_type_tag`` attribute) should be a null pointer constant.
+    The second argument to the ``type_tag_for_datatype`` attribute is ignored. For
+    example:
 
-    /* In mpi.h */
-    extern struct mpi_datatype mpi_datatype_null
-        __attribute__(( type_tag_for_datatype(mpi, void, must_be_null) ));
+    .. code-block:: c++
 
-    #define MPI_DATATYPE_NULL ((MPI_Datatype) &mpi_datatype_null)
+      /* In mpi.h */
+      typedef int MPI_Datatype;
+      extern struct mpi_datatype mpi_datatype_null
+          __attribute__(( type_tag_for_datatype(mpi, void, must_be_null) ));
 
-    /* In user code */
-    MPI_Send(buffer, 1, MPI_DATATYPE_NULL /*, ...  */); // warning: MPI_DATATYPE_NULL
-                                                        // was specified but buffer
-                                                        // is not a null pointer
+      #define MPI_DATATYPE_NULL ((MPI_Datatype) &mpi_datatype_null)
+      int MPI_Send(void *buf, int count, MPI_Datatype datatype, ...)
+          __attribute__(( pointer_with_type_tag(mpi,1,3) ));
+
+      /* In user code */
+      struct my_pair { double a; int b; };
+      struct my_pair *buffer;
+      MPI_Send(buffer, 1, MPI_DATATYPE_NULL /*, ...  */); // warning: MPI_DATATYPE_NULL
+                                                          // was specified but buffer
+                                                          // is not a null pointer
   }];
 }
 
@@ -1441,6 +1652,26 @@
   }];
 }
 
+def EmptyBasesDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The empty_bases attribute permits the compiler to utilize the
+empty-base-optimization more frequently.
+This attribute only applies to struct, class, and union types.
+It is only supported when using the Microsoft C++ ABI.
+  }];
+}
+
+def LayoutVersionDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The layout_version attribute requests that the compiler utilize the class
+layout rules of a particular compiler version.
+This attribute only applies to struct, class, and union types.
+It is only supported when using the Microsoft C++ ABI.
+  }];
+}
+
 def MSInheritanceDocs : Documentation {
   let Category = DocCatType;
   let Heading = "__single_inhertiance, __multiple_inheritance, __virtual_inheritance";
@@ -1582,6 +1813,46 @@
   }];
 }
 
+def OpenCLUnrollHintDocs : Documentation {
+  let Category = DocCatStmt;
+  let Heading = "__attribute__((opencl_unroll_hint))";
+  let Content = [{
+The opencl_unroll_hint attribute qualifier can be used to specify that a loop
+(for, while and do loops) can be unrolled. This attribute qualifier can be
+used to specify full unrolling or partial unrolling by a specified amount.
+This is a compiler hint and the compiler may ignore this directive. See
+`OpenCL v2.0 <https://www.khronos.org/registry/cl/specs/opencl-2.0.pdf>`_
+s6.11.5 for details.
+  }];
+}
+
+def OpenCLAccessDocs : Documentation {
+  let Category = DocCatStmt;
+  let Heading = "__read_only, __write_only, __read_write (read_only, write_only, read_write)";
+  let Content = [{
+The access qualifiers must be used with image object arguments or pipe arguments
+to declare if they are being read or written by a kernel or function.
+
+The read_only/__read_only, write_only/__write_only and read_write/__read_write
+names are reserved for use as access qualifiers and shall not be used otherwise.
+
+.. code-block:: c
+
+  kernel void
+  foo (read_only image2d_t imageA,
+       write_only image2d_t imageB) {
+    ...
+  }
+
+In the above example imageA is a read-only 2D image object, and imageB is a
+write-only 2D image object.
+
+The read_write (or __read_write) qualifier can not be used with pipe.
+
+More details can be found in the OpenCL C language Spec v2.0, Section 6.6.
+    }];
+}
+
 def DocOpenCLAddressSpaces : DocumentationCategory<"OpenCL Address Spaces"> {
   let Content = [{
 The address space qualifier may be used to specify the region of memory that is
@@ -1663,6 +1934,17 @@
   }];
 }
 
+def OpenCLNoSVMDocs : Documentation {
+  let Category = DocCatVariable;
+  let Content = [{
+OpenCL 2.0 supports the optional ``__attribute__((nosvm))`` qualifier for
+pointer variables. It informs the compiler that the pointer does not refer
+to a shared virtual memory region. See OpenCL v2.0 s6.7.2 for details.
+
+Since it is not widely used and has been removed from OpenCL 2.1, it is ignored
+by Clang.
+  }];
+}
 def NullabilityDocs : DocumentationCategory<"Nullability Attributes"> {
   let Content = [{
 Whether a particular pointer may be "null" is an important concern when working with pointers in the C family of languages. The various nullability attributes indicate whether a particular pointer can be null or not, which makes APIs more expressive and can help static analysis tools identify bugs involving null pointers. Clang supports several kinds of nullability attributes: the ``nonnull`` and ``returns_nonnull`` attributes indicate which function or method parameters and result types can never be null, while nullability type qualifiers indicate which pointer types can be null (``_Nullable``) or cannot be null (``_Nonnull``). 
@@ -1778,6 +2060,41 @@
   }];
 }
 
+def OMPDeclareSimdDocs : Documentation {
+  let Category = DocCatFunction;
+  let Heading = "#pragma omp declare simd";
+  let Content = [{
+The `declare simd` construct can be applied to a function to enable the creation
+of one or more versions that can process multiple arguments using SIMD
+instructions from a single invocation in a SIMD loop. The `declare simd`
+directive is a declarative directive. There may be multiple `declare simd`
+directives for a function. The use of a `declare simd` construct on a function
+enables the creation of SIMD versions of the associated function that can be
+used to process multiple arguments from a single invocation from a SIMD loop
+concurrently.
+The syntax of the `declare simd` construct is as follows:
+
+  .. code-block:: c
+
+    #pragma omp declare simd [clause[[,] clause] ...] new-line
+    [#pragma omp declare simd [clause[[,] clause] ...] new-line]
+    [...]
+    function definition or declaration
+
+where clause is one of the following:
+
+  .. code-block:: c
+
+    simdlen(length)
+    linear(argument-list[:constant-linear-step])
+    aligned(argument-list[:alignment])
+    uniform(argument-list)
+    inbranch
+    notinbranch
+
+  }];
+}
+
 def SwiftDocs : DocumentationCategory<"Controlling Swift Import"> {
   let Content = [{
 Clang supports additional attributes for controlling how APIs are imported into Swift.
@@ -1791,7 +2108,6 @@
   }];
 }
 
-
 def SwiftBridgeDocs : Documentation {
   let Category = SwiftDocs;
   let Content = [{
@@ -1831,6 +2147,23 @@
 }
 
 
+def OMPDeclareTargetDocs : Documentation {
+  let Category = DocCatFunction;
+  let Heading = "#pragma omp declare target";
+  let Content = [{
+The `declare target` directive specifies that variables and functions are mapped
+to a device for the OpenMP offload mechanism.
+
+The syntax of the declare target directive is as follows:
+
+  .. code-block:: c
+
+    #pragma omp declare target new-line
+    declarations-definition-seq
+    #pragma omp end declare target new-line
+  }];
+}
+
 def NotTailCalledDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
@@ -1838,7 +2171,7 @@
 
 For example, it prevents tail-call optimization in the following case:
 
-  .. code-block: c
+  .. code-block:: c
 
     int __attribute__((not_tail_called)) foo1(int);
 
@@ -1848,7 +2181,7 @@
 
 However, it doesn't prevent tail-call optimization in this case:
 
-  .. code-block: c
+  .. code-block:: c
 
     int __attribute__((not_tail_called)) foo1(int);
 
@@ -1862,7 +2195,7 @@
 
 Marking virtual functions as ``not_tail_called`` is an error:
 
-  .. code-block: c++
+  .. code-block:: c++
 
     class Base {
     public:
@@ -1912,7 +2245,7 @@
 
 Marking virtual functions as ``disable_tail_calls`` is legal.
 
-  .. code-block: c++
+  .. code-block:: c++
 
     int callee(int);
 
@@ -1986,94 +2319,6 @@
   }];
 }
 
-def DeprecatedDocs : Documentation {
-  let Category = DocCatFunction;
-  let Content = [{
-The ``deprecated`` attribute can be applied to a function, a variable, or a
-type. This is useful when identifying functions, variables, or types that are
-expected to be removed in a future version of a program.
-
-Consider the function declaration for a hypothetical function ``f``:
-
-.. code-block:: c++
-
-  void f(void) __attribute__((deprecated("message", "replacement")));
-
-When spelled as `__attribute__((deprecated))`, the deprecated attribute can have
-two optional string arguments. The first one is the message to display when
-emitting the warning; the second one enables the compiler to provide a Fix-It
-to replace the deprecated name with a new name. Otherwise, when spelled as
-`[[gnu::deprecated]] or [[deprecated]]`, the attribute can have one optional
-string argument which is the message to display when emitting the warning.
-  }];
-}
-
-def PreserveMostDocs : Documentation {
-  let Category = DocCatCallingConvs;
-  let Content = [{
-On X86-64 and AArch64 targets, this attribute changes the calling convention of
-a function. The ``preserve_most`` calling convention attempts to make the code
-in the caller as unintrusive as possible. This convention behaves identically
-to the ``C`` calling convention on how arguments and return values are passed,
-but it uses a different set of caller/callee-saved registers. This alleviates
-the burden of saving and recovering a large register set before and after the
-call in the caller. If the arguments are passed in callee-saved registers,
-then they will be preserved by the callee across the call. This doesn't
-apply for values returned in callee-saved registers.
-
-- On X86-64 the callee preserves all general purpose registers, except for
-  R11. R11 can be used as a scratch register. Floating-point registers
-  (XMMs/YMMs) are not preserved and need to be saved by the caller.
-
-The idea behind this convention is to support calls to runtime functions
-that have a hot path and a cold path. The hot path is usually a small piece
-of code that doesn't use many registers. The cold path might need to call out to
-another function and therefore only needs to preserve the caller-saved
-registers, which haven't already been saved by the caller. The
-`preserve_most` calling convention is very similar to the ``cold`` calling
-convention in terms of caller/callee-saved registers, but they are used for
-different types of function calls. ``coldcc`` is for function calls that are
-rarely executed, whereas `preserve_most` function calls are intended to be
-on the hot path and definitely executed a lot. Furthermore ``preserve_most``
-doesn't prevent the inliner from inlining the function call.
-
-This calling convention will be used by a future version of the Objective-C
-runtime and should therefore still be considered experimental at this time.
-Although this convention was created to optimize certain runtime calls to
-the Objective-C runtime, it is not limited to this runtime and might be used
-by other runtimes in the future too. The current implementation only
-supports X86-64 and AArch64, but the intention is to support more architectures
-in the future.
-  }];
-}
-
-def PreserveAllDocs : Documentation {
-  let Category = DocCatCallingConvs;
-  let Content = [{
-On X86-64 and AArch64 targets, this attribute changes the calling convention of
-a function. The ``preserve_all`` calling convention attempts to make the code
-in the caller even less intrusive than the ``preserve_most`` calling convention.
-This calling convention also behaves identical to the ``C`` calling convention
-on how arguments and return values are passed, but it uses a different set of
-caller/callee-saved registers. This removes the burden of saving and
-recovering a large register set before and after the call in the caller. If
-the arguments are passed in callee-saved registers, then they will be
-preserved by the callee across the call. This doesn't apply for values
-returned in callee-saved registers.
-
-- On X86-64 the callee preserves all general purpose registers, except for
-  R11. R11 can be used as a scratch register. Furthermore it also preserves
-  all floating-point registers (XMMs/YMMs).
-
-The idea behind this convention is to support calls to runtime functions
-that don't need to call out to any other functions.
-
-This calling convention, like the ``preserve_most`` calling convention, will be
-used by a future version of the Objective-C runtime and should be considered
-experimental at this time.
-  }];
-}
-
 def SwiftCallDocs : Documentation {
   let Category = DocCatVariable;
   let Content = [{
@@ -2217,3 +2462,152 @@
 optimizations like C++'s named return value optimization (NRVO).
   }];
 }
+
+def AbiTagsDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``abi_tag`` attribute can be applied to a function, variable, class or
+inline namespace declaration to modify the mangled name of the entity. It gives
+the ability to distinguish between different versions of the same entity but
+with different ABI versions supported. For example, a newer version of a class
+could have a different set of data members and thus have a different size. Using
+the ``abi_tag`` attribute, it is possible to have different mangled names for
+a global variable of the class type. Therefore, the old code could keep using
+the old mangled name and the new code will use the new mangled name with tags.
+  }];
+}
+
+def PreserveMostDocs : Documentation {
+  let Category = DocCatCallingConvs;
+  let Content = [{
+On X86-64 and AArch64 targets, this attribute changes the calling convention of
+a function. The ``preserve_most`` calling convention attempts to make the code
+in the caller as unintrusive as possible. This convention behaves identically
+to the ``C`` calling convention on how arguments and return values are passed,
+but it uses a different set of caller/callee-saved registers. This alleviates
+the burden of saving and recovering a large register set before and after the
+call in the caller. If the arguments are passed in callee-saved registers,
+then they will be preserved by the callee across the call. This doesn't
+apply for values returned in callee-saved registers.
+
+- On X86-64 the callee preserves all general purpose registers, except for
+  R11. R11 can be used as a scratch register. Floating-point registers
+  (XMMs/YMMs) are not preserved and need to be saved by the caller.
+
+The idea behind this convention is to support calls to runtime functions
+that have a hot path and a cold path. The hot path is usually a small piece
+of code that doesn't use many registers. The cold path might need to call out to
+another function and therefore only needs to preserve the caller-saved
+registers, which haven't already been saved by the caller. The
+``preserve_most`` calling convention is very similar to the ``cold`` calling
+convention in terms of caller/callee-saved registers, but they are used for
+different types of function calls. ``coldcc`` is for function calls that are
+rarely executed, whereas ``preserve_most`` function calls are intended to be
+on the hot path and definitely executed a lot. Furthermore ``preserve_most``
+doesn't prevent the inliner from inlining the function call.
+
+This calling convention will be used by a future version of the Objective-C
+runtime and should therefore still be considered experimental at this time.
+Although this convention was created to optimize certain runtime calls to
+the Objective-C runtime, it is not limited to this runtime and might be used
+by other runtimes in the future too. The current implementation only
+supports X86-64 and AArch64, but the intention is to support more architectures
+in the future.
+  }];
+}
+
+def PreserveAllDocs : Documentation {
+  let Category = DocCatCallingConvs;
+  let Content = [{
+On X86-64 and AArch64 targets, this attribute changes the calling convention of
+a function. The ``preserve_all`` calling convention attempts to make the code
+in the caller even less intrusive than the ``preserve_most`` calling convention.
+This calling convention also behaves identically to the ``C`` calling convention
+on how arguments and return values are passed, but it uses a different set of
+caller/callee-saved registers. This removes the burden of saving and
+recovering a large register set before and after the call in the caller. If
+the arguments are passed in callee-saved registers, then they will be
+preserved by the callee across the call. This doesn't apply for values
+returned in callee-saved registers.
+
+- On X86-64 the callee preserves all general purpose registers, except for
+  R11. R11 can be used as a scratch register. Furthermore it also preserves
+  all floating-point registers (XMMs/YMMs).
+
+The idea behind this convention is to support calls to runtime functions
+that don't need to call out to any other functions.
+
+This calling convention, like the ``preserve_most`` calling convention, will be
+used by a future version of the Objective-C runtime and should be considered
+experimental at this time.
+  }];
+}
+
+def DeprecatedDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``deprecated`` attribute can be applied to a function, a variable, or a
+type. This is useful when identifying functions, variables, or types that are
+expected to be removed in a future version of a program.
+
+Consider the function declaration for a hypothetical function ``f``:
+
+.. code-block:: c++
+
+  void f(void) __attribute__((deprecated("message", "replacement")));
+
+When spelled as ``__attribute__((deprecated))``, the deprecated attribute can have
+two optional string arguments. The first one is the message to display when
+emitting the warning; the second one enables the compiler to provide a Fix-It
+to replace the deprecated name with a new name. Otherwise, when spelled as
+``[[gnu::deprecated]]`` or ``[[deprecated]]``, the attribute can have one optional
+string argument which is the message to display when emitting the warning.
+  }];
+}
+
+def IFuncDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+``__attribute__((ifunc("resolver")))`` is used to mark that the address of a declaration should be resolved at runtime by calling a resolver function.
+
+The symbol name of the resolver function is given in quotes.  A function with this name (after mangling) must be defined in the current translation unit; it may be ``static``.  The resolver function should take no arguments and return a pointer.
+
+The ``ifunc`` attribute may only be used on a function declaration.  A function declaration with an ``ifunc`` attribute is considered to be a definition of the declared entity.  The entity must not have weak linkage; for example, in C++, it cannot be applied to a declaration if a definition at that location would be considered inline.
+
+Not all targets support this attribute.  ELF targets support this attribute when using binutils v2.20.1 or higher and glibc v2.11.1 or higher.  Non-ELF targets currently do not support this attribute.
+  }];
+}
+
+def LTOVisibilityDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+See :doc:`LTOVisibility`.
+  }];
+}
+
+def RenderScriptKernelAttributeDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+``__attribute__((kernel))`` is used to mark a ``kernel`` function in
+RenderScript.
+
+In RenderScript, ``kernel`` functions are used to express data-parallel
+computations.  The RenderScript runtime efficiently parallelizes ``kernel``
+functions to run on computational resources such as multi-core CPUs and GPUs.
+See the RenderScript_ documentation for more information.
+
+.. _RenderScript: https://developer.android.com/guide/topics/renderscript/compute.html
+  }];
+}
+
+def XRayDocs : Documentation {
+  let Category = DocCatFunction;
+  let Heading = "xray_always_instrument (clang::xray_always_instrument), xray_never_instrument (clang::xray_never_instrument)";
+  let Content = [{
+``__attribute__((xray_always_instrument))`` or ``[[clang::xray_always_instrument]]`` is used to mark member functions (in C++), methods (in Objective C), and free functions (in C, C++, and Objective C) to be instrumented with XRay. This will cause the function to always have space at the beginning and exit points to allow for runtime patching.
+
+Conversely, ``__attribute__((xray_never_instrument))`` or ``[[clang::xray_never_instrument]]`` will inhibit the insertion of these instrumentation points.
+
+If a function has neither of these attributes, it becomes subject to the XRay heuristics used to determine whether a function should be instrumented or otherwise.
+  }];
+}
diff --git a/include/clang/Basic/Builtins.def b/include/clang/Basic/Builtins.def
index 1da3f02..2a37541 100644
--- a/include/clang/Basic/Builtins.def
+++ b/include/clang/Basic/Builtins.def
@@ -67,6 +67,7 @@
 // Builtin::Context class.  Currently we have:
 //  n -> nothrow
 //  r -> noreturn
+//  U -> pure
 //  c -> const
 //  t -> signature is meaningless, use custom typechecking
 //  F -> this is a libc/libm function with a '__builtin_' prefix added.
@@ -377,6 +378,11 @@
 BUILTIN(__builtin_signbitf, "if", "Fnc")
 BUILTIN(__builtin_signbitl, "iLd", "Fnc")
 
+// Special FP builtins.
+BUILTIN(__builtin_canonicalize, "dd", "nc")
+BUILTIN(__builtin_canonicalizef, "ff", "nc")
+BUILTIN(__builtin_canonicalizel, "LdLd", "nc")
+
 // Builtins for arithmetic.
 BUILTIN(__builtin_clzs , "iUs"  , "nc")
 BUILTIN(__builtin_clz  , "iUi"  , "nc")
@@ -404,6 +410,11 @@
 BUILTIN(__builtin_bswap32, "UiUi", "nc")
 BUILTIN(__builtin_bswap64, "ULLiULLi", "nc")
 
+BUILTIN(__builtin_bitreverse8, "UcUc", "nc")
+BUILTIN(__builtin_bitreverse16, "UsUs", "nc")
+BUILTIN(__builtin_bitreverse32, "UiUi", "nc")
+BUILTIN(__builtin_bitreverse64, "ULLiULLi", "nc")
+
 // Random GCC builtins
 BUILTIN(__builtin_constant_p, "i.", "nctu")
 BUILTIN(__builtin_classify_type, "i.", "nctu")
@@ -456,6 +467,7 @@
 BUILTIN(__builtin_snprintf, "ic*zcC*.", "nFp:2:")
 BUILTIN(__builtin_vsprintf, "ic*cC*a", "nFP:1:")
 BUILTIN(__builtin_vsnprintf, "ic*zcC*a", "nFP:2:")
+BUILTIN(__builtin_thread_pointer, "v*", "nc")
 
 // GCC exception builtins
 BUILTIN(__builtin_eh_return, "vzv*", "r") // FIXME: Takes intptr_t, not size_t!
@@ -763,6 +775,22 @@
 LIBBUILTIN(vscanf, "icC*Ra",      "fS:0:", "stdio.h", ALL_LANGUAGES)
 LIBBUILTIN(vfscanf, "iP*RcC*Ra",  "fS:1:", "stdio.h", ALL_LANGUAGES)
 LIBBUILTIN(vsscanf, "icC*RcC*Ra", "fS:1:", "stdio.h", ALL_LANGUAGES)
+// C99 ctype.h
+LIBBUILTIN(isalnum, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isalpha, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isblank, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(iscntrl, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isdigit, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isgraph, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(islower, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isprint, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(ispunct, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isspace, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isupper, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(isxdigit, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(tolower, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+LIBBUILTIN(toupper, "ii", "fnU", "ctype.h", ALL_LANGUAGES)
+
 // C99
 // In some systems setjmp is a macro that expands to _setjmp. We undefine
 // it here to avoid having two identical LIBBUILTIN entries.
@@ -1255,6 +1283,42 @@
 // Builtins for os_log/os_trace
 BUILTIN(__builtin_os_log_format_buffer_size, "zcC*.", "p:0:nut")
 BUILTIN(__builtin_os_log_format, "v*v*cC*.", "p:0:nt")
+// OpenCL v2.0 s6.13.16, s9.17.3.5 - Pipe functions.
+// We need the generic prototype, since the packet type could be anything.
+LANGBUILTIN(read_pipe, "i.", "tn", OCLC20_LANG)
+LANGBUILTIN(write_pipe, "i.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(reserve_read_pipe, "i.", "tn", OCLC20_LANG)
+LANGBUILTIN(reserve_write_pipe, "i.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(commit_write_pipe, "v.", "tn", OCLC20_LANG)
+LANGBUILTIN(commit_read_pipe, "v.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(sub_group_reserve_read_pipe, "i.", "tn", OCLC20_LANG)
+LANGBUILTIN(sub_group_reserve_write_pipe, "i.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(sub_group_commit_read_pipe, "v.", "tn", OCLC20_LANG)
+LANGBUILTIN(sub_group_commit_write_pipe, "v.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(work_group_reserve_read_pipe, "i.", "tn", OCLC20_LANG)
+LANGBUILTIN(work_group_reserve_write_pipe, "i.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(work_group_commit_read_pipe, "v.", "tn", OCLC20_LANG)
+LANGBUILTIN(work_group_commit_write_pipe, "v.", "tn", OCLC20_LANG)
+
+LANGBUILTIN(get_pipe_num_packets, "Ui.", "tn", OCLC20_LANG)
+LANGBUILTIN(get_pipe_max_packets, "Ui.", "tn", OCLC20_LANG)
+
+// OpenCL v2.0 s6.13.17 - Enqueue kernel functions.
+// A custom builtin check allows performing special checks of the passed block arguments.
+LANGBUILTIN(enqueue_kernel, "i.", "tn", OCLC20_LANG)
+LANGBUILTIN(get_kernel_work_group_size, "i.", "tn", OCLC20_LANG)
+LANGBUILTIN(get_kernel_preferred_work_group_size_multiple, "i.", "tn", OCLC20_LANG)
+
+// OpenCL v2.0 s6.13.9 - Address space qualifier functions.
+LANGBUILTIN(to_global, "v*v*", "tn", OCLC20_LANG)
+LANGBUILTIN(to_local, "v*v*", "tn", OCLC20_LANG)
+LANGBUILTIN(to_private, "v*v*", "tn", OCLC20_LANG)
 
 #undef BUILTIN
 #undef LIBBUILTIN
diff --git a/include/clang/Basic/Builtins.h b/include/clang/Basic/Builtins.h
index c0a6af9..15e9a41 100644
--- a/include/clang/Basic/Builtins.h
+++ b/include/clang/Basic/Builtins.h
@@ -31,11 +31,12 @@
 class LangOptions;
 
 enum LanguageID {
-  GNU_LANG = 0x1,  // builtin requires GNU mode.
-  C_LANG = 0x2,    // builtin for c only.
-  CXX_LANG = 0x4,  // builtin for cplusplus only.
-  OBJC_LANG = 0x8, // builtin for objective-c and objective-c++
-  MS_LANG = 0x10,  // builtin requires MS mode.
+  GNU_LANG = 0x1,     // builtin requires GNU mode.
+  C_LANG = 0x2,       // builtin for c only.
+  CXX_LANG = 0x4,     // builtin for cplusplus only.
+  OBJC_LANG = 0x8,    // builtin for objective-c and objective-c++
+  MS_LANG = 0x10,     // builtin requires MS mode.
+  OCLC20_LANG = 0x20, // builtin for OpenCL C 2.0 only.
   ALL_LANGUAGES = C_LANG | CXX_LANG | OBJC_LANG, // builtin for all languages.
   ALL_GNU_LANGUAGES = ALL_LANGUAGES | GNU_LANG,  // builtin requires GNU mode.
   ALL_MS_LANGUAGES = ALL_LANGUAGES | MS_LANG     // builtin requires MS mode.
@@ -88,11 +89,16 @@
     return getRecord(ID).Type;
   }
 
-  /// \brief Return true if this function is a target-specific builtin
+  /// \brief Return true if this function is a target-specific builtin.
   bool isTSBuiltin(unsigned ID) const {
     return ID >= Builtin::FirstTSBuiltin;
   }
 
+  /// \brief Return true if this function has no side effects.
+  bool isPure(unsigned ID) const {
+    return strchr(getRecord(ID).Attributes, 'U') != nullptr;
+  }
+
   /// \brief Return true if this function has no side effects and doesn't
   /// read memory.
   bool isConst(unsigned ID) const {
@@ -154,7 +160,7 @@
   /// \brief Completely forget that the given ID was ever considered a builtin,
   /// e.g., because the user provided a conflicting signature.
   void forgetBuiltin(unsigned ID, IdentifierTable &Table);
-  
+
   /// \brief If this is a library function that comes from a specific
   /// header, retrieve that header name.
   const char *getHeaderName(unsigned ID) const {
@@ -213,7 +219,10 @@
 /// \brief Kinds of BuiltinTemplateDecl.
 enum BuiltinTemplateKind : int {
   /// \brief This names the __make_integer_seq BuiltinTemplateDecl.
-  BTK__make_integer_seq
+  BTK__make_integer_seq,
+
+  /// \brief This names the __type_pack_element BuiltinTemplateDecl.
+  BTK__type_pack_element
 };
 
 } // end namespace clang
diff --git a/include/clang/Basic/BuiltinsAArch64.def b/include/clang/Basic/BuiltinsAArch64.def
index b440443..1db4c14 100644
--- a/include/clang/Basic/BuiltinsAArch64.def
+++ b/include/clang/Basic/BuiltinsAArch64.def
@@ -60,6 +60,5 @@
 BUILTIN(__builtin_arm_wsr, "vcC*Ui", "nc")
 BUILTIN(__builtin_arm_wsr64, "vcC*LUi", "nc")
 BUILTIN(__builtin_arm_wsrp, "vcC*vC*", "nc")
-BUILTIN(__builtin_thread_pointer, "v*", "nc")
 
 #undef BUILTIN
diff --git a/include/clang/Basic/BuiltinsAMDGPU.def b/include/clang/Basic/BuiltinsAMDGPU.def
index bb9931f..b4314e6 100644
--- a/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/include/clang/Basic/BuiltinsAMDGPU.def
@@ -7,30 +7,103 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the R600-specific builtin function database. Users of this
-// file must define the BUILTIN macro to make use of this information.
+// This file defines the AMDGPU-specific builtin function database. Users of
+// this file must define the BUILTIN macro to make use of this information.
 //
 //===----------------------------------------------------------------------===//
 
 // The format of this database matches clang/Basic/Builtins.def.
 
-BUILTIN(__builtin_amdgpu_div_scale, "dddbb*", "n")
-BUILTIN(__builtin_amdgpu_div_scalef, "fffbb*", "n")
-BUILTIN(__builtin_amdgpu_div_fmas, "ddddb", "nc")
-BUILTIN(__builtin_amdgpu_div_fmasf, "ffffb", "nc")
-BUILTIN(__builtin_amdgpu_div_fixup, "dddd", "nc")
-BUILTIN(__builtin_amdgpu_div_fixupf, "ffff", "nc")
-BUILTIN(__builtin_amdgpu_trig_preop, "ddi", "nc")
-BUILTIN(__builtin_amdgpu_trig_preopf, "ffi", "nc")
-BUILTIN(__builtin_amdgpu_rcp, "dd", "nc")
-BUILTIN(__builtin_amdgpu_rcpf, "ff", "nc")
-BUILTIN(__builtin_amdgpu_rsq, "dd", "nc")
-BUILTIN(__builtin_amdgpu_rsqf, "ff", "nc")
-BUILTIN(__builtin_amdgpu_rsq_clamped, "dd", "nc")
-BUILTIN(__builtin_amdgpu_rsq_clampedf, "ff", "nc")
-BUILTIN(__builtin_amdgpu_ldexp, "ddi", "nc")
-BUILTIN(__builtin_amdgpu_ldexpf, "ffi", "nc")
-BUILTIN(__builtin_amdgpu_class, "bdi", "nc")
-BUILTIN(__builtin_amdgpu_classf, "bfi", "nc")
+#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
+#   define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
+#endif
+//===----------------------------------------------------------------------===//
+// SI+ only builtins.
+//===----------------------------------------------------------------------===//
+
+BUILTIN(__builtin_amdgcn_kernarg_segment_ptr, "Uc*2", "nc")
+BUILTIN(__builtin_amdgcn_implicitarg_ptr, "Uc*2", "nc")
+
+BUILTIN(__builtin_amdgcn_workgroup_id_x, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workgroup_id_y, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workgroup_id_z, "Ui", "nc")
+
+BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")
+
+//===----------------------------------------------------------------------===//
+// Instruction builtins.
+//===----------------------------------------------------------------------===//
+BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
+BUILTIN(__builtin_amdgcn_div_scale, "dddbb*", "n")
+BUILTIN(__builtin_amdgcn_div_scalef, "fffbb*", "n")
+BUILTIN(__builtin_amdgcn_div_fmas, "ddddb", "nc")
+BUILTIN(__builtin_amdgcn_div_fmasf, "ffffb", "nc")
+BUILTIN(__builtin_amdgcn_div_fixup, "dddd", "nc")
+BUILTIN(__builtin_amdgcn_div_fixupf, "ffff", "nc")
+BUILTIN(__builtin_amdgcn_trig_preop, "ddi", "nc")
+BUILTIN(__builtin_amdgcn_trig_preopf, "ffi", "nc")
+BUILTIN(__builtin_amdgcn_rcp, "dd", "nc")
+BUILTIN(__builtin_amdgcn_rcpf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_rsq, "dd", "nc")
+BUILTIN(__builtin_amdgcn_rsqf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_rsq_clamp, "dd", "nc")
+BUILTIN(__builtin_amdgcn_rsq_clampf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_sinf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_cosf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc")
+BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc")
+BUILTIN(__builtin_amdgcn_frexp_mant, "dd", "nc")
+BUILTIN(__builtin_amdgcn_frexp_mantf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_frexp_exp, "id", "nc")
+BUILTIN(__builtin_amdgcn_frexp_expf, "if", "nc")
+BUILTIN(__builtin_amdgcn_fract, "dd", "nc")
+BUILTIN(__builtin_amdgcn_fractf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_lerp, "UiUiUiUi", "nc")
+BUILTIN(__builtin_amdgcn_class, "bdi", "nc")
+BUILTIN(__builtin_amdgcn_classf, "bfi", "nc")
+BUILTIN(__builtin_amdgcn_cubeid, "ffff", "nc")
+BUILTIN(__builtin_amdgcn_cubesc, "ffff", "nc")
+BUILTIN(__builtin_amdgcn_cubetc, "ffff", "nc")
+BUILTIN(__builtin_amdgcn_cubema, "ffff", "nc")
+BUILTIN(__builtin_amdgcn_s_memtime, "LUi", "n")
+BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n")
+BUILTIN(__builtin_amdgcn_uicmp, "LUiUiUiIi", "nc")
+BUILTIN(__builtin_amdgcn_uicmpl, "LUiLUiLUiIi", "nc")
+BUILTIN(__builtin_amdgcn_sicmp, "LUiiiIi", "nc")
+BUILTIN(__builtin_amdgcn_sicmpl, "LUiLiLiIi", "nc")
+BUILTIN(__builtin_amdgcn_fcmp, "LUiddIi", "nc")
+BUILTIN(__builtin_amdgcn_fcmpf, "LUiffIi", "nc")
+
+//===----------------------------------------------------------------------===//
+// VI+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "LUi", "n", "s-memrealtime")
+
+//===----------------------------------------------------------------------===//
+// Special builtins.
+//===----------------------------------------------------------------------===//
+BUILTIN(__builtin_amdgcn_read_exec, "LUi", "nc")
+
+//===----------------------------------------------------------------------===//
+// R600-NI only builtins.
+//===----------------------------------------------------------------------===//
+
+BUILTIN(__builtin_r600_implicitarg_ptr, "Uc*7", "nc")
+
+BUILTIN(__builtin_r600_read_tgid_x, "Ui", "nc")
+BUILTIN(__builtin_r600_read_tgid_y, "Ui", "nc")
+BUILTIN(__builtin_r600_read_tgid_z, "Ui", "nc")
+
+BUILTIN(__builtin_r600_read_tidig_x, "Ui", "nc")
+BUILTIN(__builtin_r600_read_tidig_y, "Ui", "nc")
+BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
+
+BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
+BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
 
 #undef BUILTIN
+#undef TARGET_BUILTIN
diff --git a/include/clang/Basic/BuiltinsARM.def b/include/clang/Basic/BuiltinsARM.def
index 3e8e2bf..93b6458 100644
--- a/include/clang/Basic/BuiltinsARM.def
+++ b/include/clang/Basic/BuiltinsARM.def
@@ -20,7 +20,6 @@
 
 // In libgcc
 BUILTIN(__clear_cache, "vv*v*", "i")
-BUILTIN(__builtin_thread_pointer, "v*", "")
 
 // Saturating arithmetic
 BUILTIN(__builtin_arm_qadd, "iii", "nc")
@@ -48,14 +47,26 @@
 BUILTIN(__builtin_arm_vcvtr_d, "fdi", "nc")
 
 // Coprocessor
+BUILTIN(__builtin_arm_ldc, "vUIiUIivC*", "")
+BUILTIN(__builtin_arm_ldcl, "vUIiUIivC*", "")
+BUILTIN(__builtin_arm_ldc2, "vUIiUIivC*", "")
+BUILTIN(__builtin_arm_ldc2l, "vUIiUIivC*", "")
+
+BUILTIN(__builtin_arm_stc, "vUIiUIiv*", "")
+BUILTIN(__builtin_arm_stcl, "vUIiUIiv*", "")
+BUILTIN(__builtin_arm_stc2, "vUIiUIiv*", "")
+BUILTIN(__builtin_arm_stc2l, "vUIiUIiv*", "")
+
+BUILTIN(__builtin_arm_cdp, "vUIiUIiUIiUIiUIiUIi", "")
+BUILTIN(__builtin_arm_cdp2, "vUIiUIiUIiUIiUIiUIi", "")
 BUILTIN(__builtin_arm_mcr, "vUIiUIiUiUIiUIiUIi", "")
 BUILTIN(__builtin_arm_mcr2, "vUIiUIiUiUIiUIiUIi", "")
 BUILTIN(__builtin_arm_mrc, "UiUIiUIiUIiUIiUIi", "")
 BUILTIN(__builtin_arm_mrc2, "UiUIiUIiUIiUIiUIi", "")
-BUILTIN(__builtin_arm_cdp, "vUiUiUiUiUiUi", "")
-BUILTIN(__builtin_arm_cdp2, "vUiUiUiUiUiUi", "")
-BUILTIN(__builtin_arm_mcrr, "vUIiUIiUiUiUIi", "")
-BUILTIN(__builtin_arm_mcrr2, "vUIiUIiUiUiUIi", "")
+BUILTIN(__builtin_arm_mcrr, "vUIiUIiLLUiUIi", "")
+BUILTIN(__builtin_arm_mcrr2, "vUIiUIiLLUiUIi", "")
+BUILTIN(__builtin_arm_mrrc, "LLUiUIiUIiUIi", "")
+BUILTIN(__builtin_arm_mrrc2, "LLUiUIiUIiUIi", "")
 
 // CRC32
 BUILTIN(__builtin_arm_crc32b, "UiUiUc", "nc")
diff --git a/include/clang/Basic/BuiltinsHexagon.def b/include/clang/Basic/BuiltinsHexagon.def
index c4f0324..85936cb 100644
--- a/include/clang/Basic/BuiltinsHexagon.def
+++ b/include/clang/Basic/BuiltinsHexagon.def
@@ -18,7 +18,28 @@
 // Make sure you do not overwrite these.
 
 BUILTIN(__builtin_SI_to_SXTHI_asrh, "ii", "")
-BUILTIN(__builtin_circ_ldd, "LLi*LLi*LLi*ii", "")
+BUILTIN(__builtin_brev_ldd,   "LLi*LLi*LLi*i", "")
+BUILTIN(__builtin_brev_ldw,   "i*i*i*i", "")
+BUILTIN(__builtin_brev_ldh,   "s*s*s*i", "")
+BUILTIN(__builtin_brev_lduh,  "Us*Us*Us*i", "")
+BUILTIN(__builtin_brev_ldb,   "c*c*c*i", "")
+BUILTIN(__builtin_brev_ldub,  "Uc*Uc*Uc*i", "")
+BUILTIN(__builtin_circ_ldd,   "LLi*LLi*LLi*iIi", "")
+BUILTIN(__builtin_circ_ldw,   "i*i*i*iIi", "")
+BUILTIN(__builtin_circ_ldh,   "s*s*s*iIi", "")
+BUILTIN(__builtin_circ_lduh,  "Us*Us*Us*iIi", "")
+BUILTIN(__builtin_circ_ldb,   "c*c*c*iIi", "")
+BUILTIN(__builtin_circ_ldub,  "Uc*Uc*Uc*iIi", "")
+BUILTIN(__builtin_brev_std,   "LLi*LLi*LLii", "")
+BUILTIN(__builtin_brev_stw,   "i*i*ii", "")
+BUILTIN(__builtin_brev_sth,   "s*s*ii", "")
+BUILTIN(__builtin_brev_sthhi, "s*s*ii", "")
+BUILTIN(__builtin_brev_stb,   "c*c*ii", "")
+BUILTIN(__builtin_circ_std,   "LLi*LLi*LLiiIi", "")
+BUILTIN(__builtin_circ_stw,   "i*i*iiIi", "")
+BUILTIN(__builtin_circ_sth,   "s*s*iiIi", "")
+BUILTIN(__builtin_circ_sthhi, "s*s*iiIi", "")
+BUILTIN(__builtin_circ_stb,   "c*c*iiIi", "")
 
 // The builtins above are not autogenerated from iset.py.
 // Make sure you do not overwrite these.
@@ -632,16 +653,6 @@
 BUILTIN(__builtin_HEXAGON_F2_sffixupn,"fff","")
 BUILTIN(__builtin_HEXAGON_F2_sffixupd,"fff","")
 BUILTIN(__builtin_HEXAGON_F2_sffixupr,"ff","")
-BUILTIN(__builtin_HEXAGON_F2_dfadd,"ddd","")
-BUILTIN(__builtin_HEXAGON_F2_dfsub,"ddd","")
-BUILTIN(__builtin_HEXAGON_F2_dfmpy,"ddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffma,"dddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffms,"dddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffma_lib,"dddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffms_lib,"dddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffma_sc,"ddddi","")
-BUILTIN(__builtin_HEXAGON_F2_dfmax,"ddd","")
-BUILTIN(__builtin_HEXAGON_F2_dfmin,"ddd","")
 BUILTIN(__builtin_HEXAGON_F2_dfcmpeq,"idd","")
 BUILTIN(__builtin_HEXAGON_F2_dfcmpgt,"idd","")
 BUILTIN(__builtin_HEXAGON_F2_dfcmpge,"idd","")
@@ -649,9 +660,6 @@
 BUILTIN(__builtin_HEXAGON_F2_dfclass,"idi","")
 BUILTIN(__builtin_HEXAGON_F2_dfimm_p,"di","")
 BUILTIN(__builtin_HEXAGON_F2_dfimm_n,"di","")
-BUILTIN(__builtin_HEXAGON_F2_dffixupn,"ddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffixupd,"ddd","")
-BUILTIN(__builtin_HEXAGON_F2_dffixupr,"dd","")
 BUILTIN(__builtin_HEXAGON_F2_conv_sf2df,"df","")
 BUILTIN(__builtin_HEXAGON_F2_conv_df2sf,"fd","")
 BUILTIN(__builtin_HEXAGON_F2_conv_uw2sf,"fi","")
@@ -875,4 +883,623 @@
 BUILTIN(__builtin_HEXAGON_S2_interleave,"LLiLLi","")
 BUILTIN(__builtin_HEXAGON_S2_deinterleave,"LLiLLi","")
 
+BUILTIN(__builtin_HEXAGON_S6_rol_i_r,"iii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_p,"LLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_r_acc,"iiii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_p_acc,"LLiLLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_r_nac,"iiii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_p_nac,"LLiLLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_r_xacc,"iiii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_p_xacc,"LLiLLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_r_and,"iiii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_r_or,"iiii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_p_and,"LLiLLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_S6_rol_i_p_or,"LLiLLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_S2_cabacencbin,"LLiLLiLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_valignb,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_valignb_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlalignb,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlalignb_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_valignbi,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_valignbi_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlalignbi,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlalignbi_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vror,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vror_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackub,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackub_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackb,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackb_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackuh,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackuh_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackh,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackh_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackob,"V32iV32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackob_128B,"V64iV64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackoh,"V32iV32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vunpackoh_128B,"V64iV64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackeb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackeb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackeh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackeh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackob,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackob_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackoh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackoh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackhub_sat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackhub_sat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackhb_sat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackhb_sat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackwuh_sat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackwuh_sat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackwh_sat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpackwh_sat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vzb,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vzb_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsb,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsb_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vzh,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vzh_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsh,"V32iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsh_128B,"V64iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_dv,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_dv_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_dv_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpybus_dv_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_dv,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_dv_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_dv_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhb_dv_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhvsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhvsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhvsat_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhvsat_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsat,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsat_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsat_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsat_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhisat,"V16iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhisat_128B,"V32iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhisat_acc,"V16iV16iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhisat_acc_128B,"V32iV32iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsusat,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsusat_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsusat_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsusat_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsuisat,"V16iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsuisat_128B,"V32iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsuisat_acc,"V16iV16iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdmpyhsuisat_acc_128B,"V32iV32iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyb,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyb_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyb_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyb_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpybus,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpybus_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpybus_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpybus_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyhb,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyhb_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyhb_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vtmpyhb_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyub,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyub_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyub_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyub_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubv_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubv_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybv_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybv_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubi,"V32iV32iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubi_128B,"V64iV64iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubi_acc,"V32iV32iV32iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpyubi_acc_128B,"V64iV64iV64iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybus,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybus_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybus_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybus_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusi,"V32iV32iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusi_128B,"V64iV64iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusi_acc,"V32iV32iV32iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusi_acc_128B,"V64iV64iV64iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusv_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrmpybusv_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdsaduh,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdsaduh_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdsaduh_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdsaduh_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrsadubi,"V32iV32iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrsadubi_128B,"V64iV64iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrsadubi_acc,"V32iV32iV32iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrsadubi_acc_128B,"V64iV64iV64iii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrw,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrw_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslw,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslw_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrw,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrw_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslwv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslwv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrwv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrwv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrh,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrh_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslh,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslh_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrh,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrh_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslhv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslhv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrhv,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlsrhv_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwh,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwh_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwhsat,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwhsat_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwhrndsat,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwhrndsat_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwuhsat,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrwuhsat_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundwh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundwh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundwuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundwuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhubsat,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhubsat_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhubrndsat,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhubrndsat_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhbrndsat,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrhbrndsat_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundhb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundhb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundhub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vroundhub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslw_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaslw_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrw_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vasrw_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddb_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddb_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubb_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubb_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddh_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddh_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubh_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubh_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddw_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddw_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubw_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubw_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddubsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddubsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddubsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddubsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsububsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsububsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsububsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsububsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vadduhsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vadduhsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vadduhsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vadduhsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubuhsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubuhsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubuhsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubuhsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwsat,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwsat_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwsat_dv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwsat_dv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgubrnd,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgubrnd_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavguh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavguh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavguhrnd,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavguhrnd_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavghrnd,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavghrnd_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnavgh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnavgh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgwrnd,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vavgwrnd_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnavgw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnavgw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsdiffw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnavgub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnavgub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddubh,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddubh_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsububh,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsububh_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhw,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhw_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhw,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhw_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vadduhw,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vadduhw_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubuhw,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubuhw_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vd0,"V16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vd0_128B,"V32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddbq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddbq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubbq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubbq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddbnq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddbnq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubbnq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubbnq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhnq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddhnq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhnq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubhnq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwnq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vaddwnq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwnq,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsubwnq_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsh,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsh_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsh_sat,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsh_sat_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsw,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsw_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsw_sat,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vabsw_sat_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybv,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybv_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybv_acc,"V32iV32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybv_acc_128B,"V64iV64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyubv,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyubv_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyubv_acc,"V32iV32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyubv_acc_128B,"V64iV64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybusv,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybusv_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybusv_acc,"V32iV32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybusv_acc_128B,"V64iV64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabusv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabusv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabuuv,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabuuv_128B,"V64iV64iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhv,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhv_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhv_acc,"V32iV32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhv_acc_128B,"V64iV64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuhv,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuhv_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuhv_acc,"V32iV32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuhv_acc_128B,"V64iV64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhvsrs,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhvsrs_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhus,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhus_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhus_acc,"V32iV32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhus_acc_128B,"V64iV64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyih,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyih_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyih_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyih_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyewuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyewuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_rnd,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_rnd_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_sacc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_sacc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_rnd_sacc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyowh_rnd_sacc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyieoh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyieoh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiewuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiewuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiowh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiowh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiewh_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiewh_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiewuh_acc,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiewuh_acc_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyub,"V32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyub_128B,"V64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyub_acc,"V32iV32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyub_acc_128B,"V64iV64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybus,"V32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybus_128B,"V64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybus_acc,"V32iV32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpybus_acc_128B,"V64iV64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabus,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabus_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabus_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpabus_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpahb,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpahb_128B,"V64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpahb_acc,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpahb_acc_128B,"V64iV64iV64ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyh,"V32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyh_128B,"V64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhsat_acc,"V32iV32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhsat_acc_128B,"V64iV64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhss,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhss_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhsrs,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyhsrs_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuh,"V32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuh_128B,"V64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuh_acc,"V32iV32iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyuh_acc_128B,"V64iV64iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyihb,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyihb_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyihb_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyihb_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwb,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwb_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwb_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwb_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwh,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwh_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwh_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmpyiwh_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vand,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vand_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vor,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vor_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vxor,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vxor_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnot,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnot_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandqrt,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandqrt_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandqrt_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandqrt_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandvrt,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandvrt_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandvrt_acc,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vandvrt_acc_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtw_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqw_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgth_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqh_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtb_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_veqb_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuw_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtuh_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_and,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_and_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_or,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_or_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_xor,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vgtub_xor_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_or,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_or_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_and,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_and_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_not,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_not_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_xor,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_xor_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_and_n,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_and_n_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_or_n,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_or_n_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_scalar2,"V16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_pred_scalar2_128B,"V32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmux,"V16iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmux_128B,"V32iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vswap,"V32iV16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vswap_128B,"V64iV32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminuh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminuh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vmaxw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminw,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vminw_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsathub,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsathub_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsatwh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vsatwh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffeb,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffeb_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffob,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffob_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufeh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufeh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufoh,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufoh_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffvdd,"V32iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffvdd_128B,"V64iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealvdd,"V32iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealvdd_128B,"V64iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufoeh,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufoeh_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufoeb,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshufoeb_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealh,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealh_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealb,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealb_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealb4w,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdealb4w_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffh,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffh_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffb,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vshuffb_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_extractw,"iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_extractw_128B,"iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vinsertwr,"V16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vinsertwr_128B,"V32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_lvsplatw,"V16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_lvsplatw_128B,"V32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vassign,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vassign_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vcombine,"V32iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vcombine_128B,"V64iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb,"V16iV16iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_128B,"V32iV32iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_acc,"V16iV16iV16iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_acc_128B,"V32iV32iV32iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_dv,"V32iV32iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_dv_128B,"V64iV64iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_dv_acc,"V32iV32iV32iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutb_dv_acc_128B,"V64iV64iV64iLLii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdelta,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vdelta_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrdelta,"V16iV16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vrdelta_128B,"V32iV32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vcl0w,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vcl0w_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vcl0h,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vcl0h_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnormamtw,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnormamtw_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnormamth,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vnormamth_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpopcounth,"V16iV16i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vpopcounth_128B,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvvb,"V16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvvb_128B,"V32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvvb_oracc,"V16iV16iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvvb_oracc_128B,"V32iV32iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvwh,"V32iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvwh_128B,"V64iV32iV32ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvwh_oracc,"V32iV32iV16iV16ii","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vlutvwh_oracc_128B,"V64iV64iV32iV32ii","v:60:")
+
+BUILTIN(__builtin_HEXAGON_V6_hi,"V16iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_lo,"V16iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_hi_128B,"V32iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_lo_128B,"V32iV64i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vassignp,"V32iV32i","v:60:")
+BUILTIN(__builtin_HEXAGON_V6_vassignp_128B,"V64iV64i","v:60:")
+
 #undef BUILTIN
diff --git a/include/clang/Basic/BuiltinsNVPTX.def b/include/clang/Basic/BuiltinsNVPTX.def
index 3ab6413..456d000 100644
--- a/include/clang/Basic/BuiltinsNVPTX.def
+++ b/include/clang/Basic/BuiltinsNVPTX.def
@@ -14,53 +14,50 @@
 
 // The format of this database matches clang/Basic/Builtins.def.
 
-// Builtins retained from previous PTX back-end
-BUILTIN(__builtin_ptx_read_tid_x, "i", "nc")
-BUILTIN(__builtin_ptx_read_tid_y, "i", "nc")
-BUILTIN(__builtin_ptx_read_tid_z, "i", "nc")
-BUILTIN(__builtin_ptx_read_tid_w, "i", "nc")
+// Special Registers
 
-BUILTIN(__builtin_ptx_read_ntid_x, "i", "nc")
-BUILTIN(__builtin_ptx_read_ntid_y, "i", "nc")
-BUILTIN(__builtin_ptx_read_ntid_z, "i", "nc")
-BUILTIN(__builtin_ptx_read_ntid_w, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_tid_x, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_tid_y, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_tid_z, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_tid_w, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_ctaid_x, "i", "nc")
-BUILTIN(__builtin_ptx_read_ctaid_y, "i", "nc")
-BUILTIN(__builtin_ptx_read_ctaid_z, "i", "nc")
-BUILTIN(__builtin_ptx_read_ctaid_w, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ntid_x, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ntid_y, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ntid_z, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ntid_w, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_nctaid_x, "i", "nc")
-BUILTIN(__builtin_ptx_read_nctaid_y, "i", "nc")
-BUILTIN(__builtin_ptx_read_nctaid_z, "i", "nc")
-BUILTIN(__builtin_ptx_read_nctaid_w, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ctaid_x, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ctaid_y, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ctaid_z, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_ctaid_w, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_laneid, "i", "nc")
-BUILTIN(__builtin_ptx_read_warpid, "i", "nc")
-BUILTIN(__builtin_ptx_read_nwarpid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_nctaid_x, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_nctaid_y, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_nctaid_z, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_nctaid_w, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_smid, "i", "nc")
-BUILTIN(__builtin_ptx_read_nsmid, "i", "nc")
-BUILTIN(__builtin_ptx_read_gridid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_laneid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_warpid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_nwarpid, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_lanemask_eq, "i", "nc")
-BUILTIN(__builtin_ptx_read_lanemask_le, "i", "nc")
-BUILTIN(__builtin_ptx_read_lanemask_lt, "i", "nc")
-BUILTIN(__builtin_ptx_read_lanemask_ge, "i", "nc")
-BUILTIN(__builtin_ptx_read_lanemask_gt, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_smid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_nsmid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_gridid, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_clock, "i", "n")
-BUILTIN(__builtin_ptx_read_clock64, "LLi", "n")
+BUILTIN(__nvvm_read_ptx_sreg_lanemask_eq, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_lanemask_le, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_lanemask_lt, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_lanemask_ge, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_lanemask_gt, "i", "nc")
 
-BUILTIN(__builtin_ptx_read_pm0, "i", "n")
-BUILTIN(__builtin_ptx_read_pm1, "i", "n")
-BUILTIN(__builtin_ptx_read_pm2, "i", "n")
-BUILTIN(__builtin_ptx_read_pm3, "i", "n")
+BUILTIN(__nvvm_read_ptx_sreg_clock, "i", "n")
+BUILTIN(__nvvm_read_ptx_sreg_clock64, "LLi", "n")
 
-BUILTIN(__builtin_ptx_bar_sync, "vi", "n")
+BUILTIN(__nvvm_read_ptx_sreg_pm0, "i", "n")
+BUILTIN(__nvvm_read_ptx_sreg_pm1, "i", "n")
+BUILTIN(__nvvm_read_ptx_sreg_pm2, "i", "n")
+BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n")
 
-
-// Builtins exposed as part of NVVM
 // MISC
 
 BUILTIN(__nvvm_clz_i, "ii", "")
@@ -397,10 +394,21 @@
 // Sync
 
 BUILTIN(__syncthreads, "v", "")
-BUILTIN(__nvvm_bar0, "v", "")
 BUILTIN(__nvvm_bar0_popc, "ii", "")
 BUILTIN(__nvvm_bar0_and, "ii", "")
 BUILTIN(__nvvm_bar0_or, "ii", "")
+BUILTIN(__nvvm_bar_sync, "vi", "n")
+
+// Shuffle
+
+BUILTIN(__nvvm_shfl_down_i32, "iiii", "")
+BUILTIN(__nvvm_shfl_down_f32, "ffii", "")
+BUILTIN(__nvvm_shfl_up_i32, "iiii", "")
+BUILTIN(__nvvm_shfl_up_f32, "ffii", "")
+BUILTIN(__nvvm_shfl_bfly_i32, "iiii", "")
+BUILTIN(__nvvm_shfl_bfly_f32, "ffii", "")
+BUILTIN(__nvvm_shfl_idx_i32, "iiii", "")
+BUILTIN(__nvvm_shfl_idx_f32, "ffii", "")
 
 // Membar
 
@@ -566,4 +574,40 @@
 BUILTIN(__nvvm_compiler_error, "vcC*4", "n")
 BUILTIN(__nvvm_compiler_warn, "vcC*4", "n")
 
+// __ldg.  This is not implemented as a builtin by nvcc.
+BUILTIN(__nvvm_ldg_c, "ccC*", "")
+BUILTIN(__nvvm_ldg_s, "ssC*", "")
+BUILTIN(__nvvm_ldg_i, "iiC*", "")
+BUILTIN(__nvvm_ldg_l, "LiLiC*", "")
+BUILTIN(__nvvm_ldg_ll, "LLiLLiC*", "")
+
+BUILTIN(__nvvm_ldg_uc, "UcUcC*", "")
+BUILTIN(__nvvm_ldg_us, "UsUsC*", "")
+BUILTIN(__nvvm_ldg_ui, "UiUiC*", "")
+BUILTIN(__nvvm_ldg_ul, "ULiULiC*", "")
+BUILTIN(__nvvm_ldg_ull, "ULLiULLiC*", "")
+
+BUILTIN(__nvvm_ldg_f, "ffC*", "")
+BUILTIN(__nvvm_ldg_d, "ddC*", "")
+
+BUILTIN(__nvvm_ldg_c2, "E2cE2cC*", "")
+BUILTIN(__nvvm_ldg_c4, "E4cE4cC*", "")
+BUILTIN(__nvvm_ldg_s2, "E2sE2sC*", "")
+BUILTIN(__nvvm_ldg_s4, "E4sE4sC*", "")
+BUILTIN(__nvvm_ldg_i2, "E2iE2iC*", "")
+BUILTIN(__nvvm_ldg_i4, "E4iE4iC*", "")
+BUILTIN(__nvvm_ldg_ll2, "E2LLiE2LLiC*", "")
+
+BUILTIN(__nvvm_ldg_uc2, "E2UcE2UcC*", "")
+BUILTIN(__nvvm_ldg_uc4, "E4UcE4UcC*", "")
+BUILTIN(__nvvm_ldg_us2, "E2UsE2UsC*", "")
+BUILTIN(__nvvm_ldg_us4, "E4UsE4UsC*", "")
+BUILTIN(__nvvm_ldg_ui2, "E2UiE2UiC*", "")
+BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "")
+BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "")
+
+BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "")
+BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "")
+BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "")
+
 #undef BUILTIN
diff --git a/include/clang/Basic/BuiltinsPPC.def b/include/clang/Basic/BuiltinsPPC.def
index 5681c1f..365dcc0 100644
--- a/include/clang/Basic/BuiltinsPPC.def
+++ b/include/clang/Basic/BuiltinsPPC.def
@@ -336,6 +336,9 @@
 BUILTIN(__builtin_vsx_xvcpsgndp, "V2dV2dV2d", "")
 BUILTIN(__builtin_vsx_xvcpsgnsp, "V4fV4fV4f", "")
 
+BUILTIN(__builtin_vsx_xvabssp, "V4fV4f", "")
+BUILTIN(__builtin_vsx_xvabsdp, "V2dV2d", "")
+
 // HTM builtins
 BUILTIN(__builtin_tbegin, "UiUIi", "")
 BUILTIN(__builtin_tend, "UiUIi", "")
diff --git a/include/clang/Basic/BuiltinsSystemZ.def b/include/clang/Basic/BuiltinsSystemZ.def
index 68d5a1c..fa96e10 100644
--- a/include/clang/Basic/BuiltinsSystemZ.def
+++ b/include/clang/Basic/BuiltinsSystemZ.def
@@ -14,239 +14,244 @@
 
 // The format of this database matches clang/Basic/Builtins.def.
 
+#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
+#   define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
+#endif
+
 // Transactional-memory intrinsics
-BUILTIN(__builtin_tbegin, "iv*", "j")
-BUILTIN(__builtin_tbegin_nofloat, "iv*", "j")
-BUILTIN(__builtin_tbeginc, "v", "nj")
-BUILTIN(__builtin_tabort, "vi", "r")
-BUILTIN(__builtin_tend, "i", "n")
-BUILTIN(__builtin_tx_nesting_depth, "i", "nc")
-BUILTIN(__builtin_tx_assist, "vi", "n")
-BUILTIN(__builtin_non_tx_store, "vULi*ULi", "")
+TARGET_BUILTIN(__builtin_tbegin, "iv*", "j", "transactional-execution")
+TARGET_BUILTIN(__builtin_tbegin_nofloat, "iv*", "j", "transactional-execution")
+TARGET_BUILTIN(__builtin_tbeginc, "v", "nj", "transactional-execution")
+TARGET_BUILTIN(__builtin_tabort, "vi", "r", "transactional-execution")
+TARGET_BUILTIN(__builtin_tend, "i", "n", "transactional-execution")
+TARGET_BUILTIN(__builtin_tx_nesting_depth, "i", "nc", "transactional-execution")
+TARGET_BUILTIN(__builtin_tx_assist, "vi", "n", "transactional-execution")
+TARGET_BUILTIN(__builtin_non_tx_store, "vULi*ULi", "", "transactional-execution")
 
 // Vector intrinsics.
 // These all map directly to z instructions, except that some variants ending
 // in "s" have a final "int *" that receives the post-instruction CC value.
 
 // Vector support instructions (chapter 21 of the PoP)
-BUILTIN(__builtin_s390_lcbb, "UivC*Ii", "nc")
-BUILTIN(__builtin_s390_vlbb, "V16ScvC*Ii", "")
-BUILTIN(__builtin_s390_vll, "V16ScUivC*", "")
-BUILTIN(__builtin_s390_vstl, "vV16ScUiv*", "")
-BUILTIN(__builtin_s390_vperm, "V16UcV16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vpdi, "V2ULLiV2ULLiV2ULLiIi", "nc")
-BUILTIN(__builtin_s390_vpksh, "V16ScV8SsV8Ss", "nc")
-BUILTIN(__builtin_s390_vpkshs, "V16ScV8SsV8Ssi*", "nc")
-BUILTIN(__builtin_s390_vpksf, "V8SsV4SiV4Si", "nc")
-BUILTIN(__builtin_s390_vpksfs, "V8SsV4SiV4Sii*", "nc")
-BUILTIN(__builtin_s390_vpksg, "V4SiV2SLLiV2SLLi", "nc")
-BUILTIN(__builtin_s390_vpksgs, "V4SiV2SLLiV2SLLii*", "nc")
-BUILTIN(__builtin_s390_vpklsh, "V16UcV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vpklshs, "V16UcV8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vpklsf, "V8UsV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vpklsfs, "V8UsV4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vpklsg, "V4UiV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vpklsgs, "V4UiV2ULLiV2ULLii*", "nc")
-BUILTIN(__builtin_s390_vuphb, "V8SsV16Sc", "nc")
-BUILTIN(__builtin_s390_vuphh, "V4SiV8Ss", "nc")
-BUILTIN(__builtin_s390_vuphf, "V2SLLiV4Si", "nc")
-BUILTIN(__builtin_s390_vuplb, "V8SsV16Sc", "nc")
-BUILTIN(__builtin_s390_vuplhw, "V4SiV8Ss", "nc")
-BUILTIN(__builtin_s390_vuplf, "V2SLLiV4Si", "nc")
-BUILTIN(__builtin_s390_vuplhb, "V8UsV16Uc", "nc")
-BUILTIN(__builtin_s390_vuplhh, "V4UiV8Us", "nc")
-BUILTIN(__builtin_s390_vuplhf, "V2ULLiV4Ui", "nc")
-BUILTIN(__builtin_s390_vupllb, "V8UsV16Uc", "nc")
-BUILTIN(__builtin_s390_vupllh, "V4UiV8Us", "nc")
-BUILTIN(__builtin_s390_vupllf, "V2ULLiV4Ui", "nc")
+TARGET_BUILTIN(__builtin_s390_lcbb, "UivC*Ii", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vlbb, "V16ScvC*Ii", "", "vector")
+TARGET_BUILTIN(__builtin_s390_vll, "V16ScUivC*", "", "vector")
+TARGET_BUILTIN(__builtin_s390_vstl, "vV16ScUiv*", "", "vector")
+TARGET_BUILTIN(__builtin_s390_vperm, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpdi, "V2ULLiV2ULLiV2ULLiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpksh, "V16ScV8SsV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpkshs, "V16ScV8SsV8Ssi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpksf, "V8SsV4SiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpksfs, "V8SsV4SiV4Sii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpksg, "V4SiV2SLLiV2SLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpksgs, "V4SiV2SLLiV2SLLii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpklsh, "V16UcV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpklshs, "V16UcV8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpklsf, "V8UsV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpklsfs, "V8UsV4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpklsg, "V4UiV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpklsgs, "V4UiV2ULLiV2ULLii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuphb, "V8SsV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuphh, "V4SiV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuphf, "V2SLLiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuplb, "V8SsV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuplhw, "V4SiV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuplf, "V2SLLiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuplhb, "V8UsV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuplhh, "V4UiV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vuplhf, "V2ULLiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vupllb, "V8UsV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vupllh, "V4UiV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vupllf, "V2ULLiV4Ui", "nc", "vector")
 
 // Vector integer instructions (chapter 22 of the PoP)
-BUILTIN(__builtin_s390_vaq, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vacq, "V16UcV16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vaccb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vacch, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vaccf, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vaccg, "V2ULLiV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vaccq, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vacccq, "V16UcV16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vavgb, "V16ScV16ScV16Sc", "nc")
-BUILTIN(__builtin_s390_vavgh, "V8SsV8SsV8Ss", "nc")
-BUILTIN(__builtin_s390_vavgf, "V4SiV4SiV4Si", "nc")
-BUILTIN(__builtin_s390_vavgg, "V2SLLiV2SLLiV2SLLi", "nc")
-BUILTIN(__builtin_s390_vavglb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vavglh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vavglf, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vavglg, "V2ULLiV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vceqbs, "V16ScV16ScV16Sci*", "nc")
-BUILTIN(__builtin_s390_vceqhs, "V8SsV8SsV8Ssi*", "nc")
-BUILTIN(__builtin_s390_vceqfs, "V4SiV4SiV4Sii*", "nc")
-BUILTIN(__builtin_s390_vceqgs, "V2SLLiV2SLLiV2SLLii*", "nc")
-BUILTIN(__builtin_s390_vchbs, "V16ScV16ScV16Sci*", "nc")
-BUILTIN(__builtin_s390_vchhs, "V8SsV8SsV8Ssi*", "nc")
-BUILTIN(__builtin_s390_vchfs, "V4SiV4SiV4Sii*", "nc")
-BUILTIN(__builtin_s390_vchgs, "V2SLLiV2SLLiV2SLLii*", "nc")
-BUILTIN(__builtin_s390_vchlbs, "V16ScV16UcV16Uci*", "nc")
-BUILTIN(__builtin_s390_vchlhs, "V8SsV8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vchlfs, "V4SiV4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vchlgs, "V2SLLiV2ULLiV2ULLii*", "nc")
-BUILTIN(__builtin_s390_vcksm, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vclzb, "V16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vclzh, "V8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vclzf, "V4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vclzg, "V2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vctzb, "V16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vctzh, "V8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vctzf, "V4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vctzg, "V2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_verimb, "V16UcV16UcV16UcV16UcIi", "nc")
-BUILTIN(__builtin_s390_verimh, "V8UsV8UsV8UsV8UsIi", "nc")
-BUILTIN(__builtin_s390_verimf, "V4UiV4UiV4UiV4UiIi", "nc")
-BUILTIN(__builtin_s390_verimg, "V2ULLiV2ULLiV2ULLiV2ULLiIi", "nc")
-BUILTIN(__builtin_s390_verllb, "V16UcV16UcUi", "nc")
-BUILTIN(__builtin_s390_verllh, "V8UsV8UsUi", "nc")
-BUILTIN(__builtin_s390_verllf, "V4UiV4UiUi", "nc")
-BUILTIN(__builtin_s390_verllg, "V2ULLiV2ULLiUi", "nc")
-BUILTIN(__builtin_s390_verllvb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_verllvh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_verllvf, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_verllvg, "V2ULLiV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vgfmb, "V8UsV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vgfmh, "V4UiV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vgfmf, "V2ULLiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vgfmg, "V16UcV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vgfmab, "V8UsV16UcV16UcV8Us", "nc")
-BUILTIN(__builtin_s390_vgfmah, "V4UiV8UsV8UsV4Ui", "nc")
-BUILTIN(__builtin_s390_vgfmaf, "V2ULLiV4UiV4UiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vgfmag, "V16UcV2ULLiV2ULLiV16Uc", "nc")
-BUILTIN(__builtin_s390_vmahb, "V16ScV16ScV16ScV16Sc", "nc")
-BUILTIN(__builtin_s390_vmahh, "V8SsV8SsV8SsV8Ss", "nc")
-BUILTIN(__builtin_s390_vmahf, "V4SiV4SiV4SiV4Si", "nc")
-BUILTIN(__builtin_s390_vmalhb, "V16UcV16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vmalhh, "V8UsV8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vmalhf, "V4UiV4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vmaeb, "V8SsV16ScV16ScV8Ss", "nc")
-BUILTIN(__builtin_s390_vmaeh, "V4SiV8SsV8SsV4Si", "nc")
-BUILTIN(__builtin_s390_vmaef, "V2SLLiV4SiV4SiV2SLLi", "nc")
-BUILTIN(__builtin_s390_vmaleb, "V8UsV16UcV16UcV8Us", "nc")
-BUILTIN(__builtin_s390_vmaleh, "V4UiV8UsV8UsV4Ui", "nc")
-BUILTIN(__builtin_s390_vmalef, "V2ULLiV4UiV4UiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vmaob, "V8SsV16ScV16ScV8Ss", "nc")
-BUILTIN(__builtin_s390_vmaoh, "V4SiV8SsV8SsV4Si", "nc")
-BUILTIN(__builtin_s390_vmaof, "V2SLLiV4SiV4SiV2SLLi", "nc")
-BUILTIN(__builtin_s390_vmalob, "V8UsV16UcV16UcV8Us", "nc")
-BUILTIN(__builtin_s390_vmaloh, "V4UiV8UsV8UsV4Ui", "nc")
-BUILTIN(__builtin_s390_vmalof, "V2ULLiV4UiV4UiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vmhb, "V16ScV16ScV16Sc", "nc")
-BUILTIN(__builtin_s390_vmhh, "V8SsV8SsV8Ss", "nc")
-BUILTIN(__builtin_s390_vmhf, "V4SiV4SiV4Si", "nc")
-BUILTIN(__builtin_s390_vmlhb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vmlhh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vmlhf, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vmeb, "V8SsV16ScV16Sc", "nc")
-BUILTIN(__builtin_s390_vmeh, "V4SiV8SsV8Ss", "nc")
-BUILTIN(__builtin_s390_vmef, "V2SLLiV4SiV4Si", "nc")
-BUILTIN(__builtin_s390_vmleb, "V8UsV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vmleh, "V4UiV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vmlef, "V2ULLiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vmob, "V8SsV16ScV16Sc", "nc")
-BUILTIN(__builtin_s390_vmoh, "V4SiV8SsV8Ss", "nc")
-BUILTIN(__builtin_s390_vmof, "V2SLLiV4SiV4Si", "nc")
-BUILTIN(__builtin_s390_vmlob, "V8UsV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vmloh, "V4UiV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vmlof, "V2ULLiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vpopctb, "V16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vpopcth, "V8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vpopctf, "V4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vpopctg, "V2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vsq, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsbcbiq, "V16UcV16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsbiq, "V16UcV16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vscbib, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vscbih, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vscbif, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vscbig, "V2ULLiV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vscbiq, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsl, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vslb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsldb, "V16UcV16UcV16UcIi", "nc")
-BUILTIN(__builtin_s390_vsra, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsrab, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsrl, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsrlb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsumb, "V4UiV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vsumh, "V4UiV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vsumgh, "V2ULLiV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vsumgf, "V2ULLiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vsumqf, "V16UcV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vsumqg, "V16UcV2ULLiV2ULLi", "nc")
-BUILTIN(__builtin_s390_vtm, "iV16UcV16Uc", "nc")
+TARGET_BUILTIN(__builtin_s390_vaq, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vacq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vaccb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vacch, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vaccf, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vaccg, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vaccq, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vacccq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavgb, "V16ScV16ScV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavgh, "V8SsV8SsV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavgf, "V4SiV4SiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavgg, "V2SLLiV2SLLiV2SLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavglb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavglh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavglf, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vavglg, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vceqbs, "V16ScV16ScV16Sci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vceqhs, "V8SsV8SsV8Ssi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vceqfs, "V4SiV4SiV4Sii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vceqgs, "V2SLLiV2SLLiV2SLLii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchbs, "V16ScV16ScV16Sci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchhs, "V8SsV8SsV8Ssi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchfs, "V4SiV4SiV4Sii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchgs, "V2SLLiV2SLLiV2SLLii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchlbs, "V16ScV16UcV16Uci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchlhs, "V8SsV8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchlfs, "V4SiV4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vchlgs, "V2SLLiV2ULLiV2ULLii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vcksm, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vclzb, "V16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vclzh, "V8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vclzf, "V4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vclzg, "V2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vctzb, "V16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vctzh, "V8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vctzf, "V4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vctzg, "V2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verimb, "V16UcV16UcV16UcV16UcIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verimh, "V8UsV8UsV8UsV8UsIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verimf, "V4UiV4UiV4UiV4UiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verimg, "V2ULLiV2ULLiV2ULLiV2ULLiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllb, "V16UcV16UcUi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllh, "V8UsV8UsUi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllf, "V4UiV4UiUi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllg, "V2ULLiV2ULLiUi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllvb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllvh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllvf, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllvg, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmb, "V8UsV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmh, "V4UiV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmf, "V2ULLiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmg, "V16UcV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmab, "V8UsV16UcV16UcV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmah, "V4UiV8UsV8UsV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmaf, "V2ULLiV4UiV4UiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmag, "V16UcV2ULLiV2ULLiV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmahb, "V16ScV16ScV16ScV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmahh, "V8SsV8SsV8SsV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmahf, "V4SiV4SiV4SiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmalhb, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmalhh, "V8UsV8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmalhf, "V4UiV4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaeb, "V8SsV16ScV16ScV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaeh, "V4SiV8SsV8SsV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaef, "V2SLLiV4SiV4SiV2SLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaleb, "V8UsV16UcV16UcV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaleh, "V4UiV8UsV8UsV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmalef, "V2ULLiV4UiV4UiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaob, "V8SsV16ScV16ScV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaoh, "V4SiV8SsV8SsV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaof, "V2SLLiV4SiV4SiV2SLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmalob, "V8UsV16UcV16UcV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmaloh, "V4UiV8UsV8UsV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmalof, "V2ULLiV4UiV4UiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmhb, "V16ScV16ScV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmhh, "V8SsV8SsV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmhf, "V4SiV4SiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmlhb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmlhh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmlhf, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmeb, "V8SsV16ScV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmeh, "V4SiV8SsV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmef, "V2SLLiV4SiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmleb, "V8UsV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmleh, "V4UiV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmlef, "V2ULLiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmob, "V8SsV16ScV16Sc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmoh, "V4SiV8SsV8Ss", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmof, "V2SLLiV4SiV4Si", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmlob, "V8UsV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmloh, "V4UiV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vmlof, "V2ULLiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpopctb, "V16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpopcth, "V8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpopctf, "V4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vpopctg, "V2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsq, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsbcbiq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsbiq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vscbib, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vscbih, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vscbif, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vscbig, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vscbiq, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsl, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vslb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsldb, "V16UcV16UcV16UcIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsra, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsrab, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsrl, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsrlb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumb, "V4UiV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumh, "V4UiV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumgh, "V2ULLiV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumgf, "V2ULLiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumqf, "V16UcV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumqg, "V16UcV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vtm, "iV16UcV16Uc", "nc", "vector")
 
 // Vector string instructions (chapter 23 of the PoP)
-BUILTIN(__builtin_s390_vfaeb, "V16UcV16UcV16UcIi", "nc")
-BUILTIN(__builtin_s390_vfaebs, "V16UcV16UcV16UcIii*", "nc")
-BUILTIN(__builtin_s390_vfaeh, "V8UsV8UsV8UsIi", "nc")
-BUILTIN(__builtin_s390_vfaehs, "V8UsV8UsV8UsIii*", "nc")
-BUILTIN(__builtin_s390_vfaef, "V4UiV4UiV4UiIi", "nc")
-BUILTIN(__builtin_s390_vfaefs, "V4UiV4UiV4UiIii*", "nc")
-BUILTIN(__builtin_s390_vfaezb, "V16UcV16UcV16UcIi", "nc")
-BUILTIN(__builtin_s390_vfaezbs, "V16UcV16UcV16UcIii*", "nc")
-BUILTIN(__builtin_s390_vfaezh, "V8UsV8UsV8UsIi", "nc")
-BUILTIN(__builtin_s390_vfaezhs, "V8UsV8UsV8UsIii*", "nc")
-BUILTIN(__builtin_s390_vfaezf, "V4UiV4UiV4UiIi", "nc")
-BUILTIN(__builtin_s390_vfaezfs, "V4UiV4UiV4UiIii*", "nc")
-BUILTIN(__builtin_s390_vfeeb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vfeebs, "V16UcV16UcV16Uci*", "nc")
-BUILTIN(__builtin_s390_vfeeh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vfeehs, "V8UsV8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vfeef, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vfeefs, "V4UiV4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vfeezb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vfeezbs, "V16UcV16UcV16Uci*", "nc")
-BUILTIN(__builtin_s390_vfeezh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vfeezhs, "V8UsV8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vfeezf, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vfeezfs, "V4UiV4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vfeneb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vfenebs, "V16UcV16UcV16Uci*", "nc")
-BUILTIN(__builtin_s390_vfeneh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vfenehs, "V8UsV8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vfenef, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vfenefs, "V4UiV4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vfenezb, "V16UcV16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vfenezbs, "V16UcV16UcV16Uci*", "nc")
-BUILTIN(__builtin_s390_vfenezh, "V8UsV8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vfenezhs, "V8UsV8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vfenezf, "V4UiV4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vfenezfs, "V4UiV4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vistrb, "V16UcV16Uc", "nc")
-BUILTIN(__builtin_s390_vistrbs, "V16UcV16Uci*", "nc")
-BUILTIN(__builtin_s390_vistrh, "V8UsV8Us", "nc")
-BUILTIN(__builtin_s390_vistrhs, "V8UsV8Usi*", "nc")
-BUILTIN(__builtin_s390_vistrf, "V4UiV4Ui", "nc")
-BUILTIN(__builtin_s390_vistrfs, "V4UiV4Uii*", "nc")
-BUILTIN(__builtin_s390_vstrcb, "V16UcV16UcV16UcV16UcIi", "nc")
-BUILTIN(__builtin_s390_vstrcbs, "V16UcV16UcV16UcV16UcIii*", "nc")
-BUILTIN(__builtin_s390_vstrch, "V8UsV8UsV8UsV8UsIi", "nc")
-BUILTIN(__builtin_s390_vstrchs, "V8UsV8UsV8UsV8UsIii*", "nc")
-BUILTIN(__builtin_s390_vstrcf, "V4UiV4UiV4UiV4UiIi", "nc")
-BUILTIN(__builtin_s390_vstrcfs, "V4UiV4UiV4UiV4UiIii*", "nc")
-BUILTIN(__builtin_s390_vstrczb, "V16UcV16UcV16UcV16UcIi", "nc")
-BUILTIN(__builtin_s390_vstrczbs, "V16UcV16UcV16UcV16UcIii*", "nc")
-BUILTIN(__builtin_s390_vstrczh, "V8UsV8UsV8UsV8UsIi", "nc")
-BUILTIN(__builtin_s390_vstrczhs, "V8UsV8UsV8UsV8UsIii*", "nc")
-BUILTIN(__builtin_s390_vstrczf, "V4UiV4UiV4UiV4UiIi", "nc")
-BUILTIN(__builtin_s390_vstrczfs, "V4UiV4UiV4UiV4UiIii*", "nc")
+TARGET_BUILTIN(__builtin_s390_vfaeb, "V16UcV16UcV16UcIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaebs, "V16UcV16UcV16UcIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaeh, "V8UsV8UsV8UsIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaehs, "V8UsV8UsV8UsIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaef, "V4UiV4UiV4UiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaefs, "V4UiV4UiV4UiIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaezb, "V16UcV16UcV16UcIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaezbs, "V16UcV16UcV16UcIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaezh, "V8UsV8UsV8UsIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaezhs, "V8UsV8UsV8UsIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaezf, "V4UiV4UiV4UiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfaezfs, "V4UiV4UiV4UiIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeeb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeebs, "V16UcV16UcV16Uci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeeh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeehs, "V8UsV8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeef, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeefs, "V4UiV4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeezb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeezbs, "V16UcV16UcV16Uci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeezh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeezhs, "V8UsV8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeezf, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeezfs, "V4UiV4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeneb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenebs, "V16UcV16UcV16Uci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfeneh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenehs, "V8UsV8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenef, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenefs, "V4UiV4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenezb, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenezbs, "V16UcV16UcV16Uci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenezh, "V8UsV8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenezhs, "V8UsV8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenezf, "V4UiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfenezfs, "V4UiV4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vistrb, "V16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vistrbs, "V16UcV16Uci*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vistrh, "V8UsV8Us", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vistrhs, "V8UsV8Usi*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vistrf, "V4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vistrfs, "V4UiV4Uii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrcb, "V16UcV16UcV16UcV16UcIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrcbs, "V16UcV16UcV16UcV16UcIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrch, "V8UsV8UsV8UsV8UsIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrchs, "V8UsV8UsV8UsV8UsIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrcf, "V4UiV4UiV4UiV4UiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrcfs, "V4UiV4UiV4UiV4UiIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrczb, "V16UcV16UcV16UcV16UcIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrczbs, "V16UcV16UcV16UcV16UcIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrczh, "V8UsV8UsV8UsV8UsIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrczhs, "V8UsV8UsV8UsV8UsIii*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrczf, "V4UiV4UiV4UiV4UiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vstrczfs, "V4UiV4UiV4UiV4UiIii*", "nc", "vector")
 
 // Vector floating-point instructions (chapter 24 of the PoP)
-BUILTIN(__builtin_s390_vfcedbs, "V2SLLiV2dV2di*", "nc")
-BUILTIN(__builtin_s390_vfchdbs, "V2SLLiV2dV2di*", "nc")
-BUILTIN(__builtin_s390_vfchedbs, "V2SLLiV2dV2di*", "nc")
-BUILTIN(__builtin_s390_vfidb, "V2dV2dIiIi", "nc")
-BUILTIN(__builtin_s390_vflndb, "V2dV2d", "nc")
-BUILTIN(__builtin_s390_vflpdb, "V2dV2d", "nc")
-BUILTIN(__builtin_s390_vfmadb, "V2dV2dV2dV2d", "nc")
-BUILTIN(__builtin_s390_vfmsdb, "V2dV2dV2dV2d", "nc")
-BUILTIN(__builtin_s390_vfsqdb, "V2dV2d", "nc")
-BUILTIN(__builtin_s390_vftcidb, "V2SLLiV2dIii*", "nc")
+TARGET_BUILTIN(__builtin_s390_vfcedbs, "V2SLLiV2dV2di*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfchdbs, "V2SLLiV2dV2di*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfchedbs, "V2SLLiV2dV2di*", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfidb, "V2dV2dIiIi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vflndb, "V2dV2d", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vflpdb, "V2dV2d", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfmadb, "V2dV2dV2dV2d", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfmsdb, "V2dV2dV2dV2d", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vfsqdb, "V2dV2d", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vftcidb, "V2SLLiV2dIii*", "nc", "vector")
 
 #undef BUILTIN
+#undef TARGET_BUILTIN
diff --git a/include/clang/Basic/BuiltinsWebAssembly.def b/include/clang/Basic/BuiltinsWebAssembly.def
index 9754335..97b59a1 100644
--- a/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/include/clang/Basic/BuiltinsWebAssembly.def
@@ -16,9 +16,9 @@
 
 // The format of this database matches clang/Basic/Builtins.def.
 
-// Note that memory_size is not "c" (readnone) because it must be sequenced with
+// Note that current_memory is not "c" (readnone) because it must be sequenced with
 // respect to grow_memory calls.
-BUILTIN(__builtin_wasm_memory_size, "z", "n")
+BUILTIN(__builtin_wasm_current_memory, "z", "n")
 BUILTIN(__builtin_wasm_grow_memory, "vz", "n")
 
 #undef BUILTIN
diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index f738cc1..0accba4 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -161,6 +161,8 @@
 TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "", "sse")
 
 // MMX+SSE2
 TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "", "sse2")
@@ -215,7 +217,6 @@
 TARGET_BUILTIN(__builtin_ia32_ucomisdge, "iV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_ucomisdneq, "iV2dV2d", "", "sse2")
 
-TARGET_BUILTIN(__builtin_ia32_cmpps, "V4fV4fV4fIc", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpeqps, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpltps, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpleps, "V4fV4fV4f", "", "sse")
@@ -224,7 +225,6 @@
 TARGET_BUILTIN(__builtin_ia32_cmpnltps, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpnleps, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpordps, "V4fV4fV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_cmpss, "V4fV4fV4fIc", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpeqss, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpltss, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cmpless, "V4fV4fV4f", "", "sse")
@@ -238,7 +238,6 @@
 TARGET_BUILTIN(__builtin_ia32_minss, "V4fV4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_maxss, "V4fV4fV4f", "", "sse")
 
-TARGET_BUILTIN(__builtin_ia32_cmppd, "V2dV2dV2dIc", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpeqpd, "V2dV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpltpd, "V2dV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmplepd, "V2dV2dV2d", "", "sse2")
@@ -247,7 +246,6 @@
 TARGET_BUILTIN(__builtin_ia32_cmpnltpd, "V2dV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpnlepd, "V2dV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpordpd, "V2dV2dV2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cmpsd, "V2dV2dV2dIc", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpeqsd, "V2dV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpltsd, "V2dV2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmplesd, "V2dV2dV2d", "", "sse2")
@@ -305,12 +303,12 @@
 TARGET_BUILTIN(__builtin_ia32_ldmxcsr, "vUi", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_stmxcsr, "Ui", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "", "sse")
+TARGET_BUILTIN(__builtin_ia32_cvttss2si, "iV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_cvtss2si64, "LLiV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_storeups, "vf*V4f", "", "sse")
+TARGET_BUILTIN(__builtin_ia32_cvttss2si64, "LLiV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_movntps, "vf*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_sfence, "v", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "", "sse")
@@ -320,31 +318,28 @@
 TARGET_BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "", "sse")
 
 TARGET_BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_storeupd, "vd*V2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movmskpd, "iV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntpd, "vd*V2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntdq, "vV2LLi*V2LLi", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2LLiV16cV16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtdq2pd, "V2dV4i", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvtdq2ps, "V4fV4i", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2dq, "V2LLiV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2ps, "V4fV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvttpd2dq, "V4iV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvtsd2si, "iV2d", "", "sse2")
+TARGET_BUILTIN(__builtin_ia32_cvttsd2si, "iV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvtsd2si64, "LLiV2d", "", "sse2")
+TARGET_BUILTIN(__builtin_ia32_cvttsd2si64, "LLiV2d", "", "sse2")
+TARGET_BUILTIN(__builtin_ia32_cvtsd2ss, "V4fV4fV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvtps2dq, "V4iV4f", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pd, "V2dV4f", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cvttps2dq, "V4iV4f", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_clflush, "vvC*", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_lfence, "v", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_mfence, "v", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pause, "v", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_storedqu, "vc*V16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pmuludq128, "V2LLiV4iV4i", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psraw128, "V8sV8sV8s", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psrad128, "V4iV4iV4i", "", "sse2")
@@ -368,7 +363,7 @@
 TARGET_BUILTIN(__builtin_ia32_mwait, "vUiUi", "", "sse3")
 TARGET_BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "", "sse3")
 
-TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIc", "", "ssse3")
+TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "", "ssse3")
 
 TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "", "sse4.1")
@@ -384,14 +379,7 @@
 TARGET_BUILTIN(__builtin_ia32_pminsd128, "V4iV4iV4i", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pminud128, "V4iV4iV4i", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pminuw128, "V8sV8sV8s", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovzxbd128, "V4iV16c", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovzxbq128, "V2LLiV16c", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovzxbw128, "V8sV16c", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovzxdq128, "V2LLiV4i", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovzxwd128, "V4iV8s", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovzxwq128, "V2LLiV8s", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pmuldq128, "V2LLiV4iV4i", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmulld128, "V4iV4iV4i", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_roundps, "V4fV4fIi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_roundss, "V4fV4fV4fIi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_roundsd, "V2dV2dV2dIi", "", "sse4.1")
@@ -464,13 +452,15 @@
 TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmppd, "V2dV2dV2dIc", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmpps, "V4fV4fV4fIc", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtdq2pd256, "V4dV4i", "", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmpsd, "V2dV2dV2dIc", "", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmpss, "V4fV4fV4fIc", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvtdq2ps256, "V8fV8i", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvtps2dq256, "V8iV8f", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pd256, "V4dV4f", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256, "V4iV4d", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2dq256, "V4iV4d", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvttps2dq256, "V8iV8f", "", "avx")
@@ -504,13 +494,7 @@
 TARGET_BUILTIN(__builtin_ia32_vzeroupper, "v", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_storeupd256, "vd*V4d", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_storeups256, "vf*V8f", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_storedqu256, "vc*V32c", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntdq256, "vV4LLi*V4LLi", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntpd256, "vd*V4d", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntps256, "vf*V8f", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2LLi", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4LLi", "", "avx")
@@ -537,7 +521,7 @@
 TARGET_BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIc", "", "avx2")
+TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "", "avx2")
@@ -562,18 +546,6 @@
 TARGET_BUILTIN(__builtin_ia32_pminsw256, "V16sV16sV16s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pminsd256, "V8iV8iV8i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pmovmskb256, "iV32c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovsxbw256, "V16sV16c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovsxbd256, "V8iV16c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovsxbq256, "V4LLiV16c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovsxwd256, "V8iV8s", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovsxwq256, "V4LLiV8s", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovsxdq256, "V4LLiV4i", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovzxbw256, "V16sV16c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovzxbd256, "V8iV16c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovzxbq256, "V4LLiV16c", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovzxwd256, "V8iV8s", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovzxwq256, "V4LLiV8s", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovzxdq256, "V4LLiV4i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pmuldq256, "V4LLiV8iV8i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pmulhrsw256, "V16sV16sV16s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pmulhuw256, "V16sV16sV16s", "", "avx2")
@@ -584,7 +556,6 @@
 TARGET_BUILTIN(__builtin_ia32_psignb256, "V32cV32cV32c", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psignw256, "V16sV16sV16s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psignd256, "V8iV8iV8i", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi256, "V4LLiV4LLiIi", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psllwi256, "V16sV16si", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psllw256, "V16sV16sV8s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pslldi256, "V8iV8ii", "", "avx2")
@@ -595,7 +566,6 @@
 TARGET_BUILTIN(__builtin_ia32_psraw256, "V16sV16sV8s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psradi256, "V8iV8ii", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrad256, "V8iV8iV4i", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi256, "V4LLiV4LLiIi", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlwi256, "V16sV16si", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlw256, "V16sV16sV8s", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "", "avx2")
@@ -647,10 +617,8 @@
 // F16C
 TARGET_BUILTIN(__builtin_ia32_vcvtps2ph, "V8sV4fIi", "", "f16c")
 TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256, "V8sV8fIi", "", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph512, "V16sV16fIi", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vcvtph2ps, "V4fV8s", "", "f16c")
 TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256, "V8fV8s", "", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps512, "V16fV16s", "", "avx512f")
 
 // RDRAND
 TARGET_BUILTIN(__builtin_ia32_rdrand16_step, "UiUs*", "", "rdrnd")
@@ -687,13 +655,16 @@
 TARGET_BUILTIN(__builtin_ia32_xsaves, "vv*ULLi", "", "xsaves")
 TARGET_BUILTIN(__builtin_ia32_xsaves64, "vv*ULLi", "", "xsaves")
 
+// CLFLUSHOPT
+TARGET_BUILTIN(__builtin_ia32_clflushopt, "vc*", "", "clflushopt")
+
 // ADX
 TARGET_BUILTIN(__builtin_ia32_addcarryx_u32, "UcUcUiUiUi*", "", "adx")
 TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcULLiULLiULLi*", "", "adx")
-TARGET_BUILTIN(__builtin_ia32_addcarry_u32, "UcUcUiUiUi*", "", "adx")
-TARGET_BUILTIN(__builtin_ia32_addcarry_u64, "UcUcULLiULLiULLi*", "", "adx")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u32, "UcUcUiUiUi*", "", "adx")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcULLiULLiULLi*", "", "adx")
+TARGET_BUILTIN(__builtin_ia32_addcarry_u32, "UcUcUiUiUi*", "", "")
+TARGET_BUILTIN(__builtin_ia32_addcarry_u64, "UcUcULLiULLiULLi*", "", "")
+TARGET_BUILTIN(__builtin_ia32_subborrow_u32, "UcUcUiUiUi*", "", "")
+TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcULLiULLiULLi*", "", "")
 
 // RDSEED
 TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "", "rdseed")
@@ -931,23 +902,23 @@
 // AVX-512
 TARGET_BUILTIN(__builtin_ia32_sqrtpd512_mask, "V8dV8dV8dUcIi", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_sqrtps512_mask, "V16fV16fV16fUsIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14sd, "V2dV2dV2dV2dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ss, "V4fV4fV4fV4fUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_rsqrt14sd_mask, "V2dV2dV2dV2dUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "", "avx512f")
 
-TARGET_BUILTIN(__builtin_ia32_rsqrt28sd_round, "V2dV2dV2dV2dUcIi", "", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28ss_round, "V4fV4fV4fV4fUcIi", "", "avx512er")
+TARGET_BUILTIN(__builtin_ia32_rsqrt28sd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512er")
+TARGET_BUILTIN(__builtin_ia32_rsqrt28ss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512er")
 TARGET_BUILTIN(__builtin_ia32_rsqrt28pd_mask, "V8dV8dV8dUcIi", "", "avx512er")
 TARGET_BUILTIN(__builtin_ia32_rsqrt28ps_mask, "V16fV16fV16fUsIi", "", "avx512er")
 
-TARGET_BUILTIN(__builtin_ia32_rcp14sd, "V2dV2dV2dV2dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14ss, "V4fV4fV4fV4fUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "", "avx512f")
 
-TARGET_BUILTIN(__builtin_ia32_rcp28sd_round, "V2dV2dV2dV2dUcIi", "", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rcp28ss_round, "V4fV4fV4fV4fUcIi", "", "avx512er")
+TARGET_BUILTIN(__builtin_ia32_rcp28sd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512er")
+TARGET_BUILTIN(__builtin_ia32_rcp28ss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512er")
 TARGET_BUILTIN(__builtin_ia32_rcp28pd_mask, "V8dV8dV8dUcIi", "", "avx512er")
 TARGET_BUILTIN(__builtin_ia32_rcp28ps_mask, "V16fV16fV16fUsIi", "", "avx512er")
 TARGET_BUILTIN(__builtin_ia32_exp2pd_mask, "V8dV8dV8dUcIi", "", "avx512er")
@@ -968,12 +939,12 @@
 TARGET_BUILTIN(__builtin_ia32_pcmpeqw512_mask, "iV32sV32si", "", "avx512bw")
 
 TARGET_BUILTIN(__builtin_ia32_pcmpeqb256_mask, "iV32cV32ci", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqd256_mask, "cV8iV8ic", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqq256_mask, "cV4LLiV4LLic", "", "avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pcmpeqd256_mask, "cV8iV8ic", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pcmpeqq256_mask, "cV4LLiV4LLic", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pcmpeqw256_mask, "sV16sV16ss", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pcmpeqb128_mask, "sV16cV16cs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqd128_mask, "cV4iV4ic", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqq128_mask, "cV2LLiV2LLic", "", "avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pcmpeqd128_mask, "cV4iV4ic", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pcmpeqq128_mask, "cV2LLiV2LLic", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pcmpeqw128_mask, "cV8sV8sc", "", "avx512vl,avx512bw")
 
 TARGET_BUILTIN(__builtin_ia32_pcmpgtb512_mask, "LLiV64cV64cLLi", "", "avx512bw")
@@ -982,12 +953,12 @@
 TARGET_BUILTIN(__builtin_ia32_pcmpgtw512_mask, "iV32sV32si", "", "avx512bw")
 
 TARGET_BUILTIN(__builtin_ia32_pcmpgtb256_mask, "iV32cV32ci", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtd256_mask, "cV8iV8ic", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtq256_mask, "cV4LLiV4LLic", "", "avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pcmpgtd256_mask, "cV8iV8ic", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pcmpgtq256_mask, "cV4LLiV4LLic", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pcmpgtw256_mask, "sV16sV16ss", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pcmpgtb128_mask, "sV16cV16cs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtd128_mask, "cV4iV4ic", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtq128_mask, "cV2LLiV2LLic", "", "avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pcmpgtd128_mask, "cV4iV4ic", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pcmpgtq128_mask, "cV2LLiV2LLic", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pcmpgtw128_mask, "cV8sV8sc", "", "avx512vl,avx512bw")
 
 TARGET_BUILTIN(__builtin_ia32_cmppd512_mask, "UcV8dV8dIiUcIi", "", "avx512f")
@@ -1011,12 +982,6 @@
 TARGET_BUILTIN(__builtin_ia32_cvtpd2ps512_mask, "V8fV8dV8fUcIi", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vcvtps2ph512_mask, "V16sV16fIiV16sUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vcvtph2ps512_mask, "V16fV16sV16fUsIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pandd512_mask,  "V16iV16iV16iV16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pandq512_mask,  "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pord512_mask,  "V16iV16iV16iV16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_porq512_mask,  "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pxord512_mask,  "V16iV16iV16iV16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pxorq512_mask,  "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pabsd512_mask, "V16iV16iV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pabsq512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pmaxsd512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
@@ -1029,61 +994,77 @@
 TARGET_BUILTIN(__builtin_ia32_pminuq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pmuldq512_mask, "V8LLiV16iV16iV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pmuludq512_mask, "V8LLiV16iV16iV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_blendmd_512_mask, "V16iV16iV16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_blendmq_512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_blendmps_512_mask, "V16fV16fV16fUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_blendmpd_512_mask, "V8dV8dV8dUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_ptestmd512, "UsV16iV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_ptestmq512, "UcV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pbroadcastd512_gpr_mask, "V16iiV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_gpr_mask, "V8LLiLLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_mem_mask, "V8LLiLLiV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16ivC*V16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8LLivC*V8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadups512_mask, "V16fvC*V16fUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadaps512_mask, "V16fvC*V16fUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadupd512_mask, "V8dvC*V8dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadapd512_mask, "V8dvC*V8dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storedqudi512_mask, "vv*V8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storedqusi512_mask, "vv*V16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeupd512_mask, "vv*V8dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vv*V8dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vv*V16fUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vv*V16fUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8LLiLLiC*V8LLiUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadups512_mask, "V16ffC*V16fUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadaps512_mask, "V16fV16fC*V16fUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadupd512_mask, "V8ddC*V8dUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadapd512_mask, "V8dV8dC*V8dUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storedqudi512_mask, "vLLi*V8LLiUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storedqusi512_mask, "vi*V16iUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storeupd512_mask, "vd*V8dUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vV8d*V8dUc", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vf*V16fUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vV16f*V16fUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2vard512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varps512_mask, "V16fV16iV16fV16fUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varpd512_mask, "V8dV8LLiV8dV8dUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_alignq512_mask, "V8LLiV8LLiV8LLiIiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_alignd512_mask, "V16iV16iV16iIiV16iUs", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_alignd128_mask, "V4iV4iV4iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_alignd256_mask, "V8iV8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_alignq128_mask, "V2LLiV2LLiV2LLiIiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_alignq256_mask, "V4LLiV4LLiV4LLiIiV4LLiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "", "avx512f")
 
-TARGET_BUILTIN(__builtin_ia32_gathersiv8df, "V8dV8dvC*V8iUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16sf, "V16fV16fvC*UsIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8df, "V8dV8dvC*V8LLiUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16sf, "V8fV8fvC*V8LLiUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8di, "V8LLiV8LLivC*V8iUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16si, "V16iV16ivC*UsIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8di, "V8LLiV8LLivC*V8LLiUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16si, "V8iV8ivC*V8LLiUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8df, "vv*UcV8iV8dIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16sf, "vv*UsV16iV16fIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8df,  "vv*UcV8LLiV8dIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16sf, "vv*UcV8LLiV8fIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8di,  "vv*UcV8iV8LLiIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8di,  "vv*UcV8LLiV8LLiIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8LLiV8iIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2ddC*V2LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V4iV2LLiLLiC*V2LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4ddC*V4LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div4di, "V8iV4LLiLLiC*V4LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div4sf, "V4fV4ffC*V2LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div4si, "V4iV4iiC*V2LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div8sf, "V4fV4ffC*V4LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3div8si, "V4iV4iiC*V4LLiUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv2df, "V2dV2ddC*V4iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv2di, "V4iV2LLiLLiC*V4iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv4df, "V4dV4ddC*V4iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv4di, "V8iV4LLiLLiC*V4iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv4sf, "V4fV4ffC*V4iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv4si, "V4iV4iiC*V4iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv8sf, "V8fV8ffC*V8iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gather3siv8si, "V8iV8iiC*V8iUci","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_gathersiv8df, "V8dV8ddC*V8iUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gathersiv16sf, "V16fV16ffC*V16fUsIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gatherdiv8df, "V8dV8ddC*V8LLiUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gatherdiv16sf, "V8fV8ffC*V8LLiUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gathersiv8di, "V8LLiV8LLiLLiC*V8iUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gathersiv16si, "V16iV16iiC*V16iUsIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gatherdiv8di, "V8LLiV8LLiLLiC*V8LLiUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_gatherdiv16si, "V8iV8iiC*V8LLiUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scattersiv8df, "vd*UcV8iV8dIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scattersiv16sf, "vf*UsV16iV16fIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv8df,  "vd*UcV8LLiV8dIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv16sf, "vf*UcV8LLiV8fIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scattersiv8di,  "vLLi*UcV8iV8LLiIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vi*UsV16iV16iIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv8di,  "vLLi*UcV8LLiV8LLiIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vi*UcV8LLiV8iIi", "", "avx512f")
 
-TARGET_BUILTIN(__builtin_ia32_gatherpfdpd,  "vUcV8ivC*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_gatherpfdps,  "vUsV16ivC*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_gatherpfqpd,  "vUcV8LLivC*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_gatherpfqps,  "vUcV8LLivC*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_scatterpfdpd, "vUcV8iv*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_scatterpfdps, "vUsV16iv*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_scatterpfqpd, "vUcV8LLiv*IiIi", "", "avx512pf")
-TARGET_BUILTIN(__builtin_ia32_scatterpfqps, "vUcV8LLiv*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_gatherpfdpd,  "vUcV8iLLiC*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_gatherpfdps,  "vUsV16iiC*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_gatherpfqpd,  "vUcV8LLiLLiC*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_gatherpfqps,  "vUcV8LLiiC*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_scatterpfdpd, "vUcV8iLLi*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_scatterpfdps, "vUsV16ii*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_scatterpfqpd, "vUcV8LLiLLi*IiIi", "", "avx512pf")
+TARGET_BUILTIN(__builtin_ia32_scatterpfqps, "vUcV8LLii*IiIi", "", "avx512pf")
 
 TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "", "avx512f")
 
@@ -1126,22 +1107,6 @@
 TARGET_BUILTIN(__builtin_ia32_pmuludq128_mask, "V2LLiV4iV4iV2LLiUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pmulld256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pmulld128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandd256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandd128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandnd256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandnd128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pord256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pord128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pxord256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pxord128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandnq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pandnq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_porq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_porq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pxorq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pxorq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
 
 TARGET_BUILTIN(__builtin_ia32_paddb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_psubb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
@@ -1159,8 +1124,6 @@
 TARGET_BUILTIN(__builtin_ia32_pmullw256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmullw128_mask, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
 
-TARGET_BUILTIN(__builtin_ia32_pandnd512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pandnq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_paddq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_psubq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_paddd512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
@@ -1195,8 +1158,6 @@
 TARGET_BUILTIN(__builtin_ia32_orps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_orps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl,avx512dq")
 
-TARGET_BUILTIN(__builtin_ia32_blendmb_512_mask, "V64cV64cV64cULLi", "", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_blendmw_512_mask, "V32sV32sV32sUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pabsb512_mask, "V64cV64cV64cULLi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pabsw512_mask, "V32sV32sV32sUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_packssdw512_mask, "V32sV16iV16iV32sUi", "", "avx512bw")
@@ -1227,15 +1188,15 @@
 TARGET_BUILTIN(__builtin_ia32_vpermt2varhi512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varhi512_maskz, "V32sV32sV32sV32sUi", "", "avx512bw")
 
+TARGET_BUILTIN(__builtin_ia32_vpconflictdi_128_mask, "V2LLiV2LLiV2LLiUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpconflictdi_256_mask, "V4LLiV4LLiV4LLiUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpconflictsi_128_mask, "V4iV4iV4iUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpconflictsi_256_mask, "V8iV8iV8iUc","","avx512cd,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vpconflictdi_512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512cd")
 TARGET_BUILTIN(__builtin_ia32_vpconflictsi_512_mask, "V16iV16iV16iUs", "", "avx512cd")
 TARGET_BUILTIN(__builtin_ia32_vplzcntd_512_mask, "V16iV16iV16iUs", "", "avx512cd")
 TARGET_BUILTIN(__builtin_ia32_vplzcntq_512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512cd")
 
-TARGET_BUILTIN(__builtin_ia32_blendmb_128_mask, "V16cV16cV16cUs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_blendmb_256_mask, "V32cV32cV32cUi", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_blendmw_128_mask, "V8sV8sV8sUc", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_blendmw_256_mask, "V16sV16sV16sUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pabsb128_mask, "V16cV16cV16cUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pabsb256_mask, "V32cV32cV32cUi", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pabsw128_mask, "V8sV8sV8sUc", "", "avx512vl,avx512bw")
@@ -1310,31 +1271,23 @@
 TARGET_BUILTIN(__builtin_ia32_pmaddubsw512_mask, "V32sV64cV64cV32sUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmaddwd512_mask, "V16iV32sV32sV16iUs", "", "avx512bw")
 
-TARGET_BUILTIN(__builtin_ia32_addss_round, "V4fV4fV4fV4fUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divss_round, "V4fV4fV4fV4fUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulss_round, "V4fV4fV4fV4fUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subss_round, "V4fV4fV4fV4fUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxss_round, "V4fV4fV4fV4fUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minss_round, "V4fV4fV4fV4fUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_addsd_round, "V2dV2dV2dV2dUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divsd_round, "V2dV2dV2dV2dUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulsd_round, "V2dV2dV2dV2dUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subsd_round, "V2dV2dV2dV2dUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxsd_round, "V2dV2dV2dV2dUcIi", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minsd_round, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_addss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_divss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_mulss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_subss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_maxss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_minss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_addsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_divsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_mulsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_subsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_maxsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_minsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
 
 TARGET_BUILTIN(__builtin_ia32_addpd128_mask, "V2dV2dV2dV2dUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_addpd256_mask, "V4dV4dV4dV4dUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_addps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_addps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmd_128_mask, "V4iV4iV4iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmd_256_mask, "V8iV8iV8iUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmpd_128_mask, "V2dV2dV2dUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmpd_256_mask, "V4dV4dV4dUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmps_128_mask, "V4fV4fV4fUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmps_256_mask, "V8fV8fV8fUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmq_128_mask, "V2LLiV2LLiV2LLiUc", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_blendmq_256_mask, "V4LLiV4LLiV4LLiUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_compressdf128_mask, "V2dV2dV2dUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_compressdf256_mask, "V4dV4dV4dUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_compressdi128_mask, "V2LLiV2LLiV2LLiUc", "", "avx512vl")
@@ -1444,22 +1397,22 @@
 TARGET_BUILTIN(__builtin_ia32_scalefps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_scalefps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")
 
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2df, "vv*UcV2LLiV2dIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2di, "vv*UcV2LLiV2LLiIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4df, "vv*UcV4LLiV4dIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4di, "vv*UcV4LLiV4LLiIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4sf, "vv*UcV2LLiV4fIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4si, "vv*UcV2LLiV4iIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8sf, "vv*UcV4LLiV4fIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8si, "vv*UcV4LLiV4iIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2df, "vv*UcV4iV2dIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2di, "vv*UcV4iV2LLiIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4df, "vv*UcV4iV4dIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4di, "vv*UcV4iV4LLiIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4sf, "vv*UcV4iV4fIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4si, "vv*UcV4iV4iIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8sf, "vv*UcV8iV8fIi", "", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8si, "vv*UcV8iV8iIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv2df, "vd*UcV2LLiV2dIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv2di, "vLLi*UcV2LLiV2LLiIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv4df, "vd*UcV4LLiV4dIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv4di, "vLLi*UcV4LLiV4LLiIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv4sf, "vf*UcV2LLiV4fIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv4si, "vi*UcV2LLiV4iIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv8sf, "vf*UcV4LLiV4fIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scatterdiv8si, "vi*UcV4LLiV4iIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv2df, "vd*UcV4iV2dIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv2di, "vLLi*UcV4iV2LLiIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv4df, "vd*UcV4iV4dIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv4di, "vLLi*UcV4iV4LLiIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv4sf, "vf*UcV4iV4fIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv4si, "vi*UcV4iV4iIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv8sf, "vf*UcV8iV8fIi", "", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_scattersiv8si, "vi*UcV8iV8iIi", "", "avx512vl")
 
 TARGET_BUILTIN(__builtin_ia32_sqrtpd128_mask, "V2dV2dV2dUc", "", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_sqrtpd256_mask, "V4dV4dV4dUc", "", "avx512vl")
@@ -1496,10 +1449,6 @@
 TARGET_BUILTIN(__builtin_ia32_pmovswb512_mask, "V32cV32sV32cUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmovuswb512_mask, "V32cV32sV32cUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmovwb512_mask, "V32cV32sV32cUi", "", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpckhbw512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpckhwd512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpcklbw512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpcklwd512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2qq128_mask, "V2LLiV2dV2LLiUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2qq256_mask, "V4LLiV4dV4LLiUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq128_mask, "V2LLiV2dV2LLiUc", "", "avx512vl,avx512dq")
@@ -1528,10 +1477,14 @@
 TARGET_BUILTIN(__builtin_ia32_rangepd256_mask, "V4dV4dV4dIiV4dUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_rangeps128_mask, "V4fV4fV4fIiV4fUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_rangeps256_mask, "V8fV8fV8fIiV8fUc", "", "avx512vl,avx512dq")
+TARGET_BUILTIN(__builtin_ia32_rangesd128_round_mask, "V2dV2dV2dV2dUcIiIi", "", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_rangess128_round_mask, "V4fV4fV4fV4fUcIiIi", "", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reducepd128_mask, "V2dV2dIiV2dUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reducepd256_mask, "V4dV4dIiV4dUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reduceps128_mask, "V4fV4fIiV4fUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reduceps256_mask, "V8fV8fIiV8fUc", "", "avx512vl,avx512dq")
+TARGET_BUILTIN(__builtin_ia32_reducesd_mask, "V2dV2dV2dV2dUcIiIi", "", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_reducess_mask, "V4fV4fV4fV4fUcIiIi", "", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_pmaddubsw128_mask, "V8sV16cV16cV8sUc", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmaddubsw256_mask, "V16sV32cV32cV16sUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmaddwd128_mask, "V4iV8sV8sV4iUc", "", "avx512vl,avx512bw")
@@ -1548,14 +1501,6 @@
 TARGET_BUILTIN(__builtin_ia32_pmulhuw256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmulhw128_mask, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmulhw256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpckhbw128_mask, "V16cV16cV16cV16cUs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpckhbw256_mask, "V32cV32cV32cV32cUi", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpckhwd128_mask, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpckhwd256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpcklbw128_mask, "V16cV16cV16cV16cUs", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpcklbw256_mask, "V32cV32cV32cV32cUi", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpcklwd128_mask, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_punpcklwd256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2qq512_mask, "V8LLiV8dV8LLiUcIi", "", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq512_mask, "V8LLiV8dV8LLiUcIi", "", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_cvtps2qq512_mask, "V8LLiV8fV8LLiUcIi", "", "avx512dq")
@@ -1572,6 +1517,621 @@
 TARGET_BUILTIN(__builtin_ia32_rangeps512_mask, "V16fV16fV16fIiV16fUsIi", "", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reducepd512_mask, "V8dV8dIiV8dUcIi", "", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reduceps512_mask, "V16fV16fIiV16fUsIi", "", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbw512_mask, "V32sV32cV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbd512_mask, "V16iV16cV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbq512_mask, "V8LLiV16cV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsxdq512_mask, "V8LLiV8iV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsxwd512_mask, "V16iV16sV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsxwq512_mask, "V8LLiV8sV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbw128_mask, "V8sV16cV8sUc","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbw256_mask, "V16sV16cV16sUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbd128_mask, "V4iV16cV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbd256_mask, "V8iV16cV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbq128_mask, "V2LLiV16cV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxbq256_mask, "V4LLiV16cV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxdq128_mask, "V2LLiV4iV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxdq256_mask, "V4LLiV4iV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxwd128_mask, "V4iV8sV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxwd256_mask, "V8iV8sV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxwq128_mask, "V2LLiV8sV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsxwq256_mask, "V4LLiV8sV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbw512_mask, "V32sV32cV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbd512_mask, "V16iV16cV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbq512_mask, "V8LLiV16cV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovzxdq512_mask, "V8LLiV8iV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovzxwd512_mask, "V16iV16sV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovzxwq512_mask, "V8LLiV8sV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbw128_mask, "V8sV16cV8sUc","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbw256_mask, "V16sV16cV16sUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbd128_mask, "V4iV16cV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbd256_mask, "V8iV16cV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbq128_mask, "V2LLiV16cV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxbq256_mask, "V4LLiV16cV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxdq128_mask, "V2LLiV4iV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxdq256_mask, "V4LLiV4iV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxwd128_mask, "V4iV8sV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxwd256_mask, "V8iV8sV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxwq128_mask, "V2LLiV8sV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovzxwq256_mask, "V4LLiV8sV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prold512_mask, "V16iV16iIiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prolq512_mask, "V8LLiV8LLiIiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prold128_mask, "V4iV4iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prold256_mask, "V8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prolq128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prolq256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prolvd512_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prolvq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prord512_mask, "V16iV16iiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prorq512_mask, "V8LLiV8LLiiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prolvd128_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prolvd256_mask, "V8iV8iV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prolvq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prolvq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prord128_mask, "V4iV4iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prord256_mask, "V8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prorq128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prorq256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prorvd512_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prorvq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_prorvd128_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prorvd256_mask, "V8iV8iV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prorvq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_prorvq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllv32hi_mask, "V32sV32sV32sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psllw512_mask, "V32sV32sV8sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psllwi512_mask, "V32sV32sIiV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psllv16hi_mask, "V16sV16sV16sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllv8hi_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllw128_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllw256_mask, "V16sV16sV8sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllwi128_mask, "V8sV8sIiV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllwi256_mask, "V16sV16sIiV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllv2di_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllv4di_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllv4si_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllv8si_mask, "V8iV8iV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pslldi512_mask, "V16iV16iIiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psllqi512_mask, "V8LLiV8LLiIiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pslld128_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pslld256_mask, "V8iV8iV4iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pslldi128_mask, "V4iV4iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pslldi256_mask, "V8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllq256_mask, "V4LLiV4LLiV2LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllqi128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psllqi256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlv32hi_mask, "V32sV32sV32sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psrlv16hi_mask, "V16sV16sV16sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlv8hi_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlv2di_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlv4di_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlv4si_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlv8si_mask, "V8iV8iV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrldi512_mask, "V16iV16iIiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrlqi512_mask, "V8LLiV8LLiIiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrld128_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrld256_mask, "V8iV8iV4iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrldi128_mask, "V4iV4iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrldi256_mask, "V8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlq256_mask, "V4LLiV4LLiV2LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlqi128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlqi256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrav32hi_mask, "V32sV32sV32sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psrav16hi_mask, "V16sV16sV16sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrav8hi_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrav4si_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrav8si_mask, "V8iV8iV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psravq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psravq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psraw512_mask, "V32sV32sV8sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psrawi512_mask, "V32sV32sIiV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psraw128_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psraw256_mask, "V16sV16sV8sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrawi128_mask, "V8sV8sIiV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrawi256_mask, "V16sV16sIiV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlw512_mask, "V32sV32sV8sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psrlwi512_mask, "V32sV32sIiV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psrlw128_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlw256_mask, "V16sV16sV8sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlwi128_mask, "V8sV8sIiV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrlwi256_mask, "V16sV16sIiV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_movdqa32load128_mask, "V4iV4i*V4iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa32load256_mask, "V8iV8i*V8iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa32load512_mask, "V16iV16iC*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa32store512_mask, "vV16i*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa64load512_mask, "V8LLiV8LLiC*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa64store512_mask, "vV8LLi*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa32store128_mask, "vV4i*V4iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa32store256_mask, "vV8i*V8iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa64load128_mask, "V2LLiV2LLiC*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, "V4LLiV4LLiC*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2LLi*V2LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4LLi*V4LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastb512_gpr_mask, "V64ccV64cULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastb128_gpr_mask, "V16ccV16cUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastb256_gpr_mask, "V32ccV32cUi","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastd128_gpr_mask, "V4iiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastd256_gpr_mask, "V8iiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastq128_gpr_mask, "V2LLiULLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastq256_gpr_mask, "V4LLiULLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128_maskz, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256_maskz, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128_maskz, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256_maskz, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermi2varqi512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varqi512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varqi512_maskz, "V64cV64cV64cV64cULLi","","avx512vbmi")
+TARGET_BUILTIN(__builtin_ia32_vpermi2varqi128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermi2varqi256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varqi128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varqi128_maskz, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varqi256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varqi256_maskz, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kunpckdi, "ULLiULLiULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_kunpcksi, "UiUiUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_loaddquhi512_mask, "V32sV32s*V32sUi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_loaddquqi512_mask, "V64cV64c*V64cULLi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_mask, "V8dV8dV8dV8LLiIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_maskz, "V8dV8dV8dV8LLiIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmps512_mask, "V16fV16fV16fV16iIiUsIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmps512_maskz, "V16fV16fV16fV16iIiUsIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmsd_mask, "V2dV2dV2dV2LLiIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmsd_maskz, "V2dV2dV2dV2LLiIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmss_mask, "V4fV4fV4fV4iIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_fixupimmss_maskz, "V4fV4fV4fV4iIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getexpsd128_round_mask, "V2dV2dV2dV2dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getexpss128_round_mask, "V4fV4fV4fV4fUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getmantsd_round_mask, "V2dV2dV2dIiV2dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getmantss_round_mask, "V4fV4fV4fIiV4fUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_loaddquhi128_mask, "V8sV8s*V8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loaddquhi256_mask, "V16sV16s*V16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loaddquqi128_mask, "V16cV16c*V16cUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loaddquqi256_mask, "V32cV32c*V32cUi","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_mask, "V2dV2dV2dV2LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_maskz, "V2dV2dV2dV2LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_mask, "V4dV4dV4dV4LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_maskz, "V4dV4dV4dV4LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmps128_mask, "V4fV4fV4fV4iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmps128_maskz, "V4fV4fV4fV4iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2d*V2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4d*V4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4f*V4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8f*V8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2LLiV2LLi*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4LLiV4LLi*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loaddqusi128_mask, "V4iV4i*V4iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_loaddqusi256_mask, "V8iV8i*V8iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadupd128_mask, "V2dV2d*V2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadupd256_mask, "V4dV4d*V4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadups128_mask, "V4fV4f*V4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadups256_mask, "V8fV8f*V8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storedquhi512_mask, "vV32s*V32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_storedquqi512_mask, "vV64c*V64cULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_storedquhi128_mask, "vV8s*V8sUc","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_storedquhi256_mask, "vV16s*V16sUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2LLi*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4LLi*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storedqusi128_mask, "vV4i*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storedqusi256_mask, "vV8i*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeupd128_mask, "vV2d*V2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeupd256_mask, "vV4d*V4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeups128_mask, "vV4f*V4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storeups256_mask, "vV8f*V8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rcp14pd128_mask, "V2dV2dV2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rcp14pd256_mask, "V4dV4dV4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rcp14ps128_mask, "V4fV4fV4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rcp14ps256_mask, "V8fV8fV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vplzcntd_128_mask, "V4iV4iV4iUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vplzcntd_256_mask, "V8iV8iV8iUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vplzcntq_128_mask, "V2LLiV2LLiV2LLiUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vplzcntq_256_mask, "V4LLiV4LLiV4LLiUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "LLiV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtsd2si32, "iV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi32, "UiV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "ULLiV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtss2si32, "iV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "LLiV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtss2usi32, "UiV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtss2usi64, "ULLiV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttsd2si32, "iV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttsd2si64, "LLiV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi32, "UiV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi64, "ULLiV2dIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttss2si32, "iV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttss2si64, "LLiV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttss2usi32, "UiV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttss2usi64, "ULLiV4fIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermi2vard512_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermi2varpd512_mask, "V8dV8dV8LLiV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermi2varps512_mask, "V16fV16fV16iV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermi2varq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermilvarpd512_mask, "V8dV8dV8LLiV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermilvarps512_mask, "V16fV16fV16iV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermt2vard512_maskz, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varpd512_maskz, "V8dV8LLiV8dV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varps512_maskz, "V16fV16iV16fV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermt2varq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermilvarpd_mask, "V2dV2dV2LLiV2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256_mask, "V4dV4dV4LLiV4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermilvarps_mask, "V4fV4fV4iV4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpermilvarps256_mask, "V8fV8fV8iV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmb512, "ULLiV64cV64cULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_ptestmw512, "UiV32sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_ptestnmb512, "ULLiV64cV64cULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_ptestnmw512, "UiV32sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_ptestmb128, "UsV16cV16cUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmb256, "UiV32cV32cUi","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmw128, "UcV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmw256, "UsV16sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmb128, "UsV16cV16cUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmb256, "UiV32cV32cUi","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmw128, "UcV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmw256, "UsV16sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmd128, "UcV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmd256, "UcV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmq128, "UcV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestmq256, "UcV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmd128, "UcV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmd256, "UcV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmq128, "UcV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmq256, "UcV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_ptestnmd512, "UsV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_ptestnmq512, "UcV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_rndscalesd_round_mask, "V2dV2dV2dV2dUcIiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_rndscaless_round_mask, "V4fV4fV4fV4fUcIiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_scalefpd512_mask, "V8dV8dV8dV8dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_scalefps512_mask, "V16fV16fV16fV16fUsIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_scalefsd_round_mask, "V2dV2dV2dV2dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_scalefss_round_mask, "V4fV4fV4fV4fUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psradi512_mask, "V16iV16iIiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psraqi512_mask, "V8LLiV8LLiIiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrad128_mask, "V4iV4iV4iV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psrad256_mask, "V8iV8iV4iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psradi128_mask, "V4iV4iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psradi256_mask, "V8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psraq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psraq256_mask, "V4LLiV4LLiV2LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psraqi128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_psraqi256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pslld512_mask, "V16iV16iV4iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psllq512_mask, "V8LLiV8LLiV2LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psllv16si_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psllv8di_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrad512_mask, "V16iV16iV4iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psraq512_mask, "V8LLiV8LLiV2LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrav16si_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrav8di_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrld512_mask, "V16iV16iV4iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrlq512_mask, "V8LLiV8LLiV2LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrlv16si_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_psrlv8di_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pternlogd512_mask, "V16iV16iV16iV16iIiUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pternlogd512_maskz, "V16iV16iV16iV16iIiUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pternlogq512_mask, "V8LLiV8LLiV8LLiV8LLiIiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pternlogq512_maskz, "V8LLiV8LLiV8LLiV8LLiIiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pternlogd128_mask, "V4iV4iV4iV4iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogd128_maskz, "V4iV4iV4iV4iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogd256_mask, "V8iV8iV8iV8iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogd256_maskz, "V8iV8iV8iV8iIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogq128_mask, "V2LLiV2LLiV2LLiV2LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogq128_maskz, "V2LLiV2LLiV2LLiV2LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogq256_mask, "V4LLiV4LLiV4LLiV4LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pternlogq256_maskz, "V4LLiV4LLiV4LLiV4LLiIiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_mask, "V16fV16fV16fIiV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_mask, "V8dV8dV8dIiV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_mask, "V16iV16iV16iIiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_mask, "V8LLiV8LLiV8LLiIiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_256_mask, "V8fV8fV8fIiV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_256_mask, "V4dV4dV4dIiV4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_256_mask, "V8iV8iV8iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_256_mask, "V4LLiV4LLiV4LLiIiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_sqrtsd_round_mask, "V2dV2dV2dV2dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_sqrtss_round_mask, "V4fV4fV4fV4fUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_rsqrt14pd128_mask, "V2dV2dV2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rsqrt14pd256_mask, "V4dV4dV4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rsqrt14ps128_mask, "V4fV4fV4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_rsqrt14ps256_mask, "V8fV8fV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtb2mask512, "ULLiV64c","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2b512, "V64cULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2w512, "V32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_cvtd2mask512, "UsV16i","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2d512, "V16iUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2q512, "V8LLiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_cvtq2mask512, "UcV8LLi","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_cvtb2mask128, "UsV16c","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtb2mask256, "UiV32c","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2b128, "V16cUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2b256, "V32cUi","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2w128, "V8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2w256, "V16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtd2mask128, "UcV4i","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtd2mask256, "UcV8i","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2d128, "V4iUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2d256, "V8iUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2q128, "V2LLiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtmask2q256, "V4LLiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtq2mask128, "UcV2LLi","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtq2mask256, "UcV4LLi","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastmb512, "V8LLiUc","","avx512cd")
+TARGET_BUILTIN(__builtin_ia32_broadcastmw512, "V16iUs","","avx512cd")
+TARGET_BUILTIN(__builtin_ia32_broadcastf32x4_512, "V16fV4fV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_broadcastf64x4_512, "V8dV4dV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_broadcasti32x4_512, "V16iV4iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_broadcasti64x4_512, "V8LLiV4LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_broadcastmb128, "V2LLiUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastmb256, "V4LLiUc","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastmw128, "V4iUs","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastmw256, "V8iUs","","avx512cd,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastf32x2_512_mask, "V16fV4fV16fUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_broadcastf32x8_512_mask, "V16fV8fV16fUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_broadcastf64x2_512_mask, "V8dV2dV8dUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_broadcasti32x2_512_mask, "V16iV4iV16iUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_broadcasti32x8_512_mask, "V16iV8iV16iUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_broadcasti64x2_512_mask, "V8LLiV2LLiV8LLiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_broadcastf32x2_256_mask, "V8fV4fV8fUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastf64x2_256_mask, "V4dV2dV4dUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcasti32x2_128_mask, "V4iV4iV4iUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcasti32x2_256_mask, "V8iV4iV8iUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcasti64x2_256_mask, "V4LLiV2LLiV4LLiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcastf32x4_256_mask, "V8fV4fV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_broadcasti32x4_256_mask, "V8iV4iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastw512_gpr_mask, "V32shV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastw256_gpr_mask, "V16shV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pbroadcastw128_gpr_mask, "V8ssV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovsdw512_mask, "V16sV16iV16sUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsdw512mem_mask, "vV16s*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsqb512_mask, "V16cV8LLiV16cUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsqb512mem_mask, "vV16c*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsqd512_mask, "V8iV8LLiV8iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsqd512mem_mask, "vV8i*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsqw512_mask, "V8sV8LLiV8sUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsqw512mem_mask, "vV8s*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovsdb128_mask, "V16cV4iV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsdb128mem_mask, "vV16c*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovswb128mem_mask, "vV16c*V8sUc","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovsdb256_mask, "V16cV8iV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsdb256mem_mask, "vV16c*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovswb256mem_mask, "vV16c*V16sUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovsdw128_mask, "V8sV4iV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsdw128mem_mask, "vV8s*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsdw256_mask, "V8sV8iV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsdw256mem_mask, "vV8s*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqb128_mask, "V16cV2LLiV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqb128mem_mask, "vV16c*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqb256_mask, "V16cV4LLiV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqb256mem_mask, "vV16c*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqd128_mask, "V4iV2LLiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqd128mem_mask, "vV4i*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqd256_mask, "V4iV4LLiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqd256mem_mask, "vV4i*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqw128_mask, "V8sV2LLiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqw128mem_mask, "vV8s*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqw256_mask, "V8sV4LLiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovsqw256mem_mask, "vV8s*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusdb512_mask, "V16cV16iV16cUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusdb512mem_mask, "vV16c*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovuswb512mem_mask, "vV32c*V32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovusdw512_mask, "V16sV16iV16sUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusdw512mem_mask, "vV16s*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusqb512_mask, "V16cV8LLiV16cUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusqb512mem_mask, "vV16c*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusqd512_mask, "V8iV8LLiV8iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusqd512mem_mask, "vV8i*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusqw512_mask, "V8sV8LLiV8sUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusqw512mem_mask, "vV8s*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovusdb128_mask, "V16cV4iV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusdb128mem_mask, "vV16c*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovuswb128mem_mask, "vV16c*V8sUc","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovusdb256_mask, "V16cV8iV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusdb256mem_mask, "vV16c*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovuswb256mem_mask, "vV16c*V16sUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovusdw128_mask, "V8sV4iV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusdw128mem_mask, "vV8s*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusdw256_mask, "V8sV8iV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusdw256mem_mask, "vV8s*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqb128_mask, "V16cV2LLiV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqb128mem_mask, "vV16c*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqb256_mask, "V16cV4LLiV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqb256mem_mask, "vV16c*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqd128_mask, "V4iV2LLiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqd128mem_mask, "vV4i*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqd256_mask, "V4iV4LLiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqd256mem_mask, "vV4i*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqw128_mask, "V8sV2LLiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqw128mem_mask, "vV8s*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqw256_mask, "V8sV4LLiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovusqw256mem_mask, "vV8s*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovdb512_mask, "V16cV16iV16cUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovdb512mem_mask, "vV16c*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovwb512mem_mask, "vV32c*V32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovdw512_mask, "V16sV16iV16sUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovdw512mem_mask, "vV16s*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovqb512_mask, "V16cV8LLiV16cUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovqb512mem_mask, "vV16c*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovqd512_mask, "V8iV8LLiV8iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovqd512mem_mask, "vV8i*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovqw512_mask, "V8sV8LLiV8sUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovqw512mem_mask, "vV8s*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_pmovdb128_mask, "V16cV4iV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovwb128mem_mask, "vV16c*V8sUc","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovdb128mem_mask, "vV16c*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovdb256_mask, "V16cV8iV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovdb256mem_mask, "vV16c*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovwb256mem_mask, "vV16c*V16sUs","","avx512vl,avx512bw")
+TARGET_BUILTIN(__builtin_ia32_pmovdw128_mask, "V8sV4iV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovdw128mem_mask, "vV8s*V4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovdw256_mask, "V8sV8iV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovdw256mem_mask, "vV8s*V8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqb128_mask, "V16cV2LLiV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqb128mem_mask, "vV16c*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqb256_mask, "V16cV4LLiV16cUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqb256mem_mask, "vV16c*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqd128_mask, "V4iV2LLiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqd128mem_mask, "vV4i*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqd256_mask, "V4iV4LLiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqd256mem_mask, "vV4i*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqw128_mask, "V8sV2LLiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqw128mem_mask, "vV8s*V2LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqw256_mask, "V8sV4LLiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_pmovqw256mem_mask, "vV8s*V4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extractf32x8_mask, "V8fV16fIiV8fUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extractf64x2_512_mask, "V2dV8dIiV2dUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extracti32x8_mask, "V8iV16iIiV8iUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extracti64x2_512_mask, "V2LLiV8LLiIiV2LLiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extracti32x4_mask, "V4iV16iIiV4iUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_extracti64x4_mask, "V4LLiV8LLiIiV4LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_extractf64x2_256_mask, "V2dV4dIiV2dUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extracti64x2_256_mask, "V2LLiV4LLiIiV2LLiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extractf32x4_256_mask, "V4fV8fIiV4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extracti32x4_256_mask, "V4iV8iIiV4iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_insertf32x8_mask, "V16fV16fV8fIiV16fUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_insertf64x2_512_mask, "V8dV8dV2dIiV8dUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_inserti32x8_mask, "V16iV16iV8iIiV16iUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_inserti64x2_512_mask, "V8LLiV8LLiV2LLiIiV8LLiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_insertf64x4_mask, "V8dV8dV4dIiV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_inserti64x4_mask, "V8LLiV8LLiV4LLiIiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_insertf64x2_256_mask, "V4dV4dV2dIiV4dUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_inserti64x2_256_mask, "V4LLiV4LLiV2LLiIiV4LLiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_insertf32x4_256_mask, "V8fV8fV4fIiV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_inserti32x4_256_mask, "V8iV8iV4iIiV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_insertf32x4_mask, "V16fV16fV4fIiV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_inserti32x4_mask, "V16iV16iV4iIiV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getmantpd128_mask, "V2dV2diV2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getmantpd256_mask, "V4dV4diV4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getmantps128_mask, "V4fV4fiV4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getmantps256_mask, "V8fV8fiV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_getmantpd512_mask, "V8dV8diV8dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getmantps512_mask, "V16fV16fiV16fUsIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getexppd512_mask, "V8dV8dV8dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_getexpps512_mask, "V16fV16fV16fUsIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask,  "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddss3_maskz, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask3, "V4fV4fV4fV4fUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask,  "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_maskz, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask3, "V2dV2dV2dV2dUcIi", "", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_permvarhi512_mask, "V32sV32sV32sV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_permvardf512_mask, "V8dV8dV8LLiV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_permvardi512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_permvarsf512_mask, "V16fV16fV16iV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_permvarsi512_mask, "V16iV16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_permvarqi512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
+TARGET_BUILTIN(__builtin_ia32_permvarqi128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvarqi256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvarhi128_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvarhi256_mask, "V16sV16sV16sV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvardf256_mask, "V4dV4dV4LLiV4dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvardi256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvarsf256_mask, "V8fV8fV8iV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_permvarsi256_mask, "V8iV8iV8iV8iUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclasspd128_mask, "UcV2dIiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclasspd256_mask, "UcV4dIiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclassps128_mask, "UcV4fIiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclassps256_mask, "UcV8fIiUc","","avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_fpclassps512_mask, "UsV16fIiUs","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_fpclasspd512_mask, "UcV8dIiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_fpclasssd_mask, "UcV2dIiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_fpclassss_mask, "UcV4fIiUc","","avx512dq")
+TARGET_BUILTIN(__builtin_ia32_kandhi, "UsUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kandnhi, "UsUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_korhi, "UsUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kortestchi, "iUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kortestzhi, "iUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_movntdqa512, "V8LLiV8LLi*","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_palignr128_mask, "V16cV16cV16cIiV16cUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_palignr256_mask, "V32cV32cV32cIiV32cUi","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_dbpsadbw128_mask, "V8sV16cV16cIiV8sUc","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_dbpsadbw256_mask, "V16sV32cV32cIiV16sUs","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_dbpsadbw512_mask, "V32sV64cV64cIiV32sUi","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_psadbw512, "V8LLiV64cV64c","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_compressdf512_mask, "V8dV8dV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compressdi512_mask, "V8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compresssf512_mask, "V16fV16fV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compresssi512_mask, "V16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cmpsd_mask, "UcV2dV2dIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cmpss_mask, "UcV4fV4fIiUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expanddf512_mask, "V8dV8dV8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expanddi512_mask, "V8LLiV8LLiV8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expandloaddf512_mask, "V8dV8dC*V8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expandloaddi512_mask, "V8LLiV8LLiC*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expandloadsf512_mask, "V16fV16fC*V16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expandloadsi512_mask, "V16iV16iC*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expandsf512_mask, "V16fV16fV16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_expandsi512_mask, "V16iV16iV16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtps2pd512_mask, "V8dV8fV8dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compressstoredf512_mask, "vV8d*V8dUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compressstoredi512_mask, "vV8LLi*V8LLiUc","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compressstoresf512_mask, "vV16f*V16fUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_compressstoresi512_mask, "vV16i*V16iUs","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2ps_mask, "V4fV8sV4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256_mask, "V8fV8sV8fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2ph_mask, "V8sV4fIiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256_mask, "V8sV8fIiV8sUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtw2mask512, "UiV32s","","avx512bw")
+TARGET_BUILTIN(__builtin_ia32_cvtw2mask128, "UcV8s","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtw2mask256, "UsV16s","","avx512bw,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_cvtsd2ss_round_mask, "V4fV4fV2dV4fUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtsi2sd64, "V2dV2dLLiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtsi2ss32, "V4fV4fiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fLLiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtss2sd_round_mask, "V2dV2dV4fV2dUcIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtusi2sd32, "V2dV2dUi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dULLiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtusi2ss32, "V4fV4fUiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fULLiIi","","avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
+TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
+
+// generic select intrinsics
+TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectb_512, "V64cULLiV64cV64c", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectw_128, "V8sUcV8sV8s", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectw_256, "V16sUsV16sV16s", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectw_512, "V32sUiV32sV32s", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2LLiUcV2LLiV2LLi", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4LLiUcV4LLiV4LLi", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8LLiUcV8LLiV8LLi", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectps_128, "V4fUcV4fV4f", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "", "")
+TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "", "")
+
+// MONITORX/MWAITX
+TARGET_BUILTIN(__builtin_ia32_monitorx, "vv*UiUi", "", "mwaitx")
+TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "", "mwaitx")
 
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/include/clang/Basic/Cuda.h b/include/clang/Basic/Cuda.h
new file mode 100644
index 0000000..ad1139b
--- /dev/null
+++ b/include/clang/Basic/Cuda.h
@@ -0,0 +1,77 @@
+//===--- Cuda.h - Utilities for compiling CUDA code  ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_CUDA_H
+#define LLVM_CLANG_BASIC_CUDA_H
+
+namespace llvm {
+class StringRef;
+} // namespace llvm
+
+namespace clang {
+
+enum class CudaVersion {
+  UNKNOWN,
+  CUDA_70,
+  CUDA_75,
+  CUDA_80,
+};
+const char *CudaVersionToString(CudaVersion V);
+
+// No string -> CudaVersion conversion function because there's no canonical
+// spelling of the various CUDA versions.
+
+enum class CudaArch {
+  UNKNOWN,
+  SM_20,
+  SM_21,
+  SM_30,
+  SM_32,
+  SM_35,
+  SM_37,
+  SM_50,
+  SM_52,
+  SM_53,
+  SM_60,
+  SM_61,
+  SM_62,
+};
+const char *CudaArchToString(CudaArch A);
+
+// The input should have the form "sm_20".
+CudaArch StringToCudaArch(llvm::StringRef S);
+
+enum class CudaVirtualArch {
+  UNKNOWN,
+  COMPUTE_20,
+  COMPUTE_30,
+  COMPUTE_32,
+  COMPUTE_35,
+  COMPUTE_37,
+  COMPUTE_50,
+  COMPUTE_52,
+  COMPUTE_53,
+  COMPUTE_60,
+  COMPUTE_61,
+  COMPUTE_62,
+};
+const char *CudaVirtualArchToString(CudaVirtualArch A);
+
+// The input should have the form "compute_20".
+CudaVirtualArch StringToCudaVirtualArch(llvm::StringRef S);
+
+/// Get the compute_xx corresponding to an sm_yy.
+CudaVirtualArch VirtualArchForCudaArch(CudaArch A);
+
+/// Get the earliest CudaVersion that supports the given CudaArch.
+CudaVersion MinVersionForCudaArch(CudaArch A);
+
+} // namespace clang
+
+#endif
diff --git a/include/clang/Basic/DeclNodes.td b/include/clang/Basic/DeclNodes.td
index 723ea54..f29c399 100644
--- a/include/clang/Basic/DeclNodes.td
+++ b/include/clang/Basic/DeclNodes.td
@@ -11,6 +11,8 @@
 class DeclContext { }
 
 def TranslationUnit : Decl, DeclContext;
+def PragmaComment : Decl;
+def PragmaDetectMismatch : Decl;
 def ExternCContext : Decl, DeclContext;
 def Named : Decl<1>;
   def Namespace : DDecl<Named>, DeclContext;
@@ -35,6 +37,8 @@
     def EnumConstant : DDecl<Value>;
     def UnresolvedUsingValue : DDecl<Value>;
     def IndirectField : DDecl<Value>;
+    def Binding : DDecl<Value>;
+    def OMPDeclareReduction : DDecl<Value>, DeclContext;
     def Declarator : DDecl<Value, 1>;
       def Field : DDecl<Declarator>;
         def ObjCIvar : DDecl<Field>;
@@ -51,6 +55,8 @@
             : DDecl<VarTemplateSpecialization>;
         def ImplicitParam : DDecl<Var>;
         def ParmVar : DDecl<Var>;
+        def Decomposition : DDecl<Var>;
+        def OMPCapturedExpr : DDecl<Var>;
       def NonTypeTemplateParm : DDecl<Declarator>;
   def Template : DDecl<Named, 1>;
     def RedeclarableTemplate : DDecl<Template, 1>;
@@ -62,6 +68,7 @@
     def BuiltinTemplate : DDecl<Template>;
   def Using : DDecl<Named>;
   def UsingShadow : DDecl<Named>;
+    def ConstructorUsingShadow : DDecl<UsingShadow>;
   def ObjCMethod : DDecl<Named>, DeclContext;
   def ObjCContainer : DDecl<Named, 1>, DeclContext;
     def ObjCCategory : DDecl<ObjCContainer>;
diff --git a/include/clang/Basic/Diagnostic.h b/include/clang/Basic/Diagnostic.h
index dd4443e..49470d2 100644
--- a/include/clang/Basic/Diagnostic.h
+++ b/include/clang/Basic/Diagnostic.h
@@ -344,11 +344,10 @@
   std::string FlagValue;
 
 public:
-  explicit DiagnosticsEngine(
-                      const IntrusiveRefCntPtr<DiagnosticIDs> &Diags,
-                      DiagnosticOptions *DiagOpts,
-                      DiagnosticConsumer *client = nullptr,
-                      bool ShouldOwnClient = true);
+  explicit DiagnosticsEngine(IntrusiveRefCntPtr<DiagnosticIDs> Diags,
+                             DiagnosticOptions *DiagOpts,
+                             DiagnosticConsumer *client = nullptr,
+                             bool ShouldOwnClient = true);
   ~DiagnosticsEngine();
 
   const IntrusiveRefCntPtr<DiagnosticIDs> &getDiagnosticIDs() const {
@@ -1072,10 +1071,10 @@
 // so that we only match those arguments that are (statically) DeclContexts;
 // other arguments that derive from DeclContext (e.g., RecordDecls) will not
 // match.
-template<typename T>
-inline
-typename std::enable_if<std::is_same<T, DeclContext>::value,
-                        const DiagnosticBuilder &>::type
+template <typename T>
+inline typename std::enable_if<
+    std::is_same<typename std::remove_const<T>::type, DeclContext>::value,
+    const DiagnosticBuilder &>::type
 operator<<(const DiagnosticBuilder &DB, T *DC) {
   DB.AddTaggedVal(reinterpret_cast<intptr_t>(DC),
                   DiagnosticsEngine::ak_declcontext);
diff --git a/include/clang/Basic/DiagnosticASTKinds.td b/include/clang/Basic/DiagnosticASTKinds.td
index 0b37030..03ed8aa 100644
--- a/include/clang/Basic/DiagnosticASTKinds.td
+++ b/include/clang/Basic/DiagnosticASTKinds.td
@@ -26,6 +26,9 @@
 def note_constexpr_invalid_function : Note<
   "%select{non-constexpr|undefined}0 %select{function|constructor}1 %2 cannot "
   "be used in a constant expression">;
+def note_constexpr_invalid_inhctor : Note<
+  "constructor inherited from base class %0 cannot be used in a "
+  "constant expression; derived class cannot be implicitly initialized">;
 def note_constexpr_no_return : Note<
   "control reached end of constexpr function">;
 def note_constexpr_virtual_call : Note<
@@ -141,6 +144,8 @@
   "(skipping %0 call%s0 in backtrace; use -fconstexpr-backtrace-limit=0 to "
   "see all)">;
 def note_constexpr_call_here : Note<"in call to '%0'">;
+def note_constexpr_inherited_ctor_call_here : Note<
+  "in implicit initialization for inherited constructor of %0">;
 def note_constexpr_baa_insufficient_alignment : Note<
   "%select{alignment of|offset of the aligned pointer from}0 the base pointee "
   "object (%1 %plural{1:byte|:bytes}1) is %select{less than|not a multiple of}0 the "
@@ -153,6 +158,12 @@
   "overflow in expression; result is %0 with type %1">,
   InGroup<DiagGroup<"integer-overflow">>;
 
+// This is a temporary diagnostic and shall be removed once our implementation
+// is complete. Like the preceding constexpr notes, it belongs in Sema rather
+// than in the AST diagnostics.
+def note_unimplemented_constexpr_lambda_feature_ast : Note<
+    "unimplemented constexpr lambda feature: %0 (coming soon!)">;
+
 // inline asm related.
 let CategoryName = "Inline Assembly Issue" in {
   def err_asm_invalid_escape : Error<
diff --git a/include/clang/Basic/DiagnosticCommonKinds.td b/include/clang/Basic/DiagnosticCommonKinds.td
index 0be704c..ec06026 100644
--- a/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/include/clang/Basic/DiagnosticCommonKinds.td
@@ -96,8 +96,6 @@
   "timed out waiting to acquire lock file for module '%0'">, DefaultFatal;
 def err_module_cycle : Error<"cyclic dependency in module '%0': %1">, 
   DefaultFatal;
-def err_module_prebuilt : Error<
-  "error in loading module '%0' from prebuilt module path">, DefaultFatal;
 def note_pragma_entered_here : Note<"#pragma entered here">;  
 def note_decl_hiding_tag_type : Note<
   "%1 %0 is hidden by a non-type declaration of %0 here">;
@@ -161,6 +159,8 @@
   "this literal will %select{have type 'long long'|be ill-formed}0 "
   "in C++11 onwards">,
   InGroup<CXX11Compat>;
+def ext_clang_enable_if : Extension<"'enable_if' is a clang extension">,
+                          InGroup<GccCompat>;
 
 // SEH
 def err_seh_expected_handler : Error<
@@ -182,6 +182,9 @@
   "unknown target triple '%0', please use -triple or -arch">;
 def err_target_unknown_cpu : Error<"unknown target CPU '%0'">;
 def err_target_unknown_abi : Error<"unknown target ABI '%0'">;
+def err_target_unsupported_abi : Error<"ABI '%0' is not supported on CPU '%1'">;
+def err_target_unsupported_abi_for_triple : Error<
+  "ABI '%0' is not supported for '%1'">;
 def err_target_unknown_fpmath : Error<"unknown FP unit '%0'">;
 def err_target_unsupported_fpmath : Error<
     "the '%0' unit is not supported with this instruction set">;
diff --git a/include/clang/Basic/DiagnosticDriverKinds.td b/include/clang/Basic/DiagnosticDriverKinds.td
index 5479c35..27bcd77 100644
--- a/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/include/clang/Basic/DiagnosticDriverKinds.td
@@ -23,10 +23,23 @@
 def err_drv_invalid_arch_name : Error<
   "invalid arch name '%0'">;
 def err_drv_cuda_bad_gpu_arch : Error<"Unsupported CUDA gpu architecture: %0">;
+def err_drv_no_cuda_installation : Error<
+  "cannot find CUDA installation.  Provide its path via --cuda-path, or pass "
+  "-nocudainc to build without CUDA includes.">;
+def err_drv_no_cuda_libdevice : Error<
+  "cannot find libdevice for %0. Provide path to different CUDA installation "
+  "via --cuda-path, or pass -nocudalib to build without linking with libdevice.">;
+def err_drv_cuda_version_too_low : Error<
+  "GPU arch %1 requires CUDA version at least %3, but installation at %0 is %2. "
+  "Use --cuda-path to specify a different CUDA install, or pass "
+  "--no-cuda-version-check.">;
+def err_drv_cuda_nvptx_host : Error<"unsupported use of NVPTX for host compilation.">;
 def err_drv_invalid_thread_model_for_target : Error<
   "invalid thread model '%0' in '%1' for this target">;
 def err_drv_invalid_linker_name : Error<
   "invalid linker name in argument '%0'">;
+def err_drv_invalid_pgo_instrumentor : Error<
+  "invalid PGO instrumentor in argument '%0'">;
 def err_drv_invalid_rtlib_name : Error<
   "invalid runtime library name in argument '%0'">;
 def err_drv_unsupported_rtlib_for_platform : Error<
@@ -93,6 +106,23 @@
 def err_drv_I_dash_not_supported : Error<
   "'%0' not supported, please use -iquote instead">;
 def err_drv_unknown_argument : Error<"unknown argument: '%0'">;
+def warn_drv_unknown_argument_clang_cl : Warning<
+  "unknown argument ignored in clang-cl: '%0'">,
+  InGroup<UnknownArgument>;
+
+def warn_drv_ycyu_no_arg_clang_cl : Warning<
+  "support for '%0' without a filename not implemented yet; flag ignored">,
+  InGroup<ClangClPch>;
+def warn_drv_ycyu_different_arg_clang_cl : Warning<
+  "support for '/Yc' and '/Yu' with different filenames not implemented yet; flags ignored">,
+  InGroup<ClangClPch>;
+def warn_drv_ycyu_no_fi_arg_clang_cl : Warning<
+  "support for '%0' without a corresponding /FI flag not implemented yet; flag ignored">,
+  InGroup<ClangClPch>;
+def warn_drv_yc_multiple_inputs_clang_cl : Warning<
+  "support for '/Yc' with more than one source file not implemented yet; flag ignored">,
+  InGroup<ClangClPch>;
+
 def err_drv_invalid_value : Error<"invalid value '%1' in '%0'">;
 def err_drv_invalid_int_value : Error<"invalid integral value '%1' in '%0'">;
 def err_drv_invalid_remap_file : Error<
@@ -127,6 +157,10 @@
 def err_drv_invalid_omp_target : Error<"OpenMP target is invalid: '%0'">;
 def err_drv_omp_host_ir_file_not_found : Error<
   "The provided host compiler IR file '%0' is required to generate code for OpenMP target regions but cannot be found.">;
+def err_drv_omp_host_target_not_supported : Error<
+  "The target '%0' is not a supported OpenMP host target.">;
+def err_drv_bitcode_unsupported_on_toolchain : Error<
+  "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 
 def warn_O4_is_O3 : Warning<"-O4 is equivalent to -O3">, InGroup<Deprecated>;
 def warn_drv_lto_libpath : Warning<"libLTO.dylib relative to clang installed dir not found; using 'ld' default search path instead">,
@@ -206,12 +240,20 @@
 def warn_drv_invoking_fallback : Warning<"falling back to %0">,
   InGroup<Fallback>;
 
+def err_drv_ropi_rwpi_incompatible_with_pic : Error<
+  "embedded and GOT-based position independence are incompatible">;
+def err_drv_ropi_incompatible_with_cxx : Error<
+  "ROPI is not compatible with c++">;
+
 def warn_target_unsupported_nan2008 : Warning<
   "ignoring '-mnan=2008' option because the '%0' architecture does not support it">,
   InGroup<UnsupportedNan>;
 def warn_target_unsupported_nanlegacy : Warning<
   "ignoring '-mnan=legacy' option because the '%0' architecture does not support it">,
   InGroup<UnsupportedNan>;
+def warn_target_unsupported_compact_branches : Warning<
+  "ignoring '-mcompact-branches=' option because the '%0' architecture does not"
+  " support it">, InGroup<UnsupportedCB>;
 
 def warn_drv_unable_to_find_directory_expected : Warning<
   "unable to find %0 directory, expected to be in '%1'">,
@@ -222,7 +264,7 @@
   InGroup<OptionIgnored>;
 
 def warn_drv_ps4_sdk_dir : Warning<
-  "environment variable SCE_PS4_SDK_DIR is set, but points to invalid or nonexistent directory '%0'">,
+  "environment variable SCE_ORBIS_SDK_DIR is set, but points to invalid or nonexistent directory '%0'">,
   InGroup<InvalidOrNonExistentDirectory>;
 
 def err_drv_unsupported_linker : Error<"unsupported value '%0' for -linker option">;
diff --git a/include/clang/Basic/DiagnosticFrontendKinds.td b/include/clang/Basic/DiagnosticFrontendKinds.td
index 7dcf697..757c329 100644
--- a/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -58,8 +58,10 @@
     BackendInfo, InGroup<BackendOptimizationRemarkAnalysis>;
 def warn_fe_backend_optimization_failure : Warning<"%0">, BackendInfo,
     InGroup<BackendOptimizationFailure>, DefaultWarn;
-def note_fe_backend_optimization_remark_invalid_loc : Note<"could "
-  "not determine the original source location for %0:%1:%2">;
+def note_fe_backend_invalid_loc : Note<"could "
+  "not determine the original source location for %0:%1:%2">, BackendInfo;
+
+def err_fe_backend_unsupported : Error<"%0">, BackendInfo;
 
 def remark_sanitize_address_insert_extra_padding_accepted : Remark<
     "-fsanitize-address-field-padding applied to %0">, ShowInSystemHeader,
@@ -208,10 +210,6 @@
   "test module file extension '%0' has different version (%1.%2) than expected "
   "(%3.%4)">;
 
-def err_conflicting_module_names : Error<
-  "conflicting module names specified: '-fmodule-name=%0' and "
-  "'-fmodule-implementation-of %1'">;
-
 def err_missing_vfs_overlay_file : Error<
   "virtual filesystem overlay file '%0' not found">, DefaultFatal;
 def err_invalid_vfs_overlay : Error<
@@ -220,4 +218,7 @@
 def err_no_apinotes_cache_path : Error<
   "-fapinotes was provided without -fapinotes-cache-path=<directory>">,
   DefaultFatal;
+
+def warn_option_invalid_ocl_version : Warning<
+  "OpenCL version %0 does not support the option '%1'">, InGroup<Deprecated>;
 }
diff --git a/include/clang/Basic/DiagnosticGroups.td b/include/clang/Basic/DiagnosticGroups.td
index 6084a1e..d9ad81d 100644
--- a/include/clang/Basic/DiagnosticGroups.td
+++ b/include/clang/Basic/DiagnosticGroups.td
@@ -46,10 +46,17 @@
                                                    UndefinedBoolConversion]>;
 def IntConversion : DiagGroup<"int-conversion">;
 def EnumConversion : DiagGroup<"enum-conversion">;
-def FloatConversion : DiagGroup<"float-conversion">;
+
+def FloatOverflowConversion : DiagGroup<"float-overflow-conversion">;
+def FloatZeroConversion : DiagGroup<"float-zero-conversion">;
+def FloatConversion :
+  DiagGroup<"float-conversion", [FloatOverflowConversion,
+                                 FloatZeroConversion]>;
+
 def DoublePromotion : DiagGroup<"double-promotion">;
 def EnumTooLarge : DiagGroup<"enum-too-large">;
 def UnsupportedNan : DiagGroup<"unsupported-nan">;
+def UnsupportedCB : DiagGroup<"unsupported-cb">;
 def NonLiteralNullConversion : DiagGroup<"non-literal-null-conversion">;
 def NullConversion : DiagGroup<"null-conversion">;
 def ImplicitConversionFloatingPointToBool :
@@ -75,6 +82,8 @@
 def GNUDesignator : DiagGroup<"gnu-designator">;
 def GNUStringLiteralOperatorTemplate :
   DiagGroup<"gnu-string-literal-operator-template">;
+def UndefinedVarTemplate : DiagGroup<"undefined-var-template">;
+def UndefinedFuncTemplate : DiagGroup<"undefined-func-template">;
 
 def DeleteIncomplete : DiagGroup<"delete-incomplete">;
 def DeleteNonVirtualDtor : DiagGroup<"delete-non-virtual-dtor">;
@@ -86,7 +95,9 @@
 def DeprecatedAttributes : DiagGroup<"deprecated-attributes">;
 def DeprecatedDeclarations : DiagGroup<"deprecated-declarations">;
 def UnavailableDeclarations : DiagGroup<"unavailable-declarations">;
-def PartialAvailability : DiagGroup<"partial-availability">;
+def UnguardedAvailability : DiagGroup<"unguarded-availability">;
+// partial-availability is an alias of unguarded-availability.
+def : DiagGroup<"partial-availability", [UnguardedAvailability]>;
 def DeprecatedImplementations :DiagGroup<"deprecated-implementations">;
 def DeprecatedIncrementBool : DiagGroup<"deprecated-increment-bool">;
 def DeprecatedRegister : DiagGroup<"deprecated-register">;
@@ -204,6 +215,7 @@
 def DanglingElse: DiagGroup<"dangling-else">;
 def DanglingField : DiagGroup<"dangling-field">;
 def DistributedObjectModifiers : DiagGroup<"distributed-object-modifiers">;
+def ExpansionToDefined : DiagGroup<"expansion-to-defined">;
 def FlagEnum : DiagGroup<"flag-enum">;
 def IncrementBool : DiagGroup<"increment-bool", [DeprecatedIncrementBool]>;
 def InfiniteRecursion : DiagGroup<"infinite-recursion">;
@@ -214,9 +226,12 @@
 def IncompatibleMSStruct : DiagGroup<"incompatible-ms-struct">;
 def IncompatiblePointerTypesDiscardsQualifiers 
   : DiagGroup<"incompatible-pointer-types-discards-qualifiers">;
+def IncompatibleFunctionPointerTypes
+  : DiagGroup<"incompatible-function-pointer-types">;
 def IncompatiblePointerTypes
   : DiagGroup<"incompatible-pointer-types",
-    [IncompatiblePointerTypesDiscardsQualifiers]>;
+    [IncompatiblePointerTypesDiscardsQualifiers,
+     IncompatibleFunctionPointerTypes]>;
 def IncompleteUmbrella : DiagGroup<"incomplete-umbrella">;
 def NonModularIncludeInFrameworkModule
   : DiagGroup<"non-modular-include-in-framework-module">;
@@ -328,7 +343,16 @@
 def SemiBeforeMethodBody : DiagGroup<"semicolon-before-method-body">;
 def Sentinel : DiagGroup<"sentinel">;
 def MissingMethodReturnType : DiagGroup<"missing-method-return-type">;
-def Shadow : DiagGroup<"shadow">;
+
+def ShadowFieldInConstructorModified : DiagGroup<"shadow-field-in-constructor-modified">;
+def ShadowFieldInConstructor : DiagGroup<"shadow-field-in-constructor",
+                                         [ShadowFieldInConstructorModified]>;
+
+// -Wshadow-all is a catch-all for all shadowing. -Wshadow is just the
+// shadowing that we think is unsafe.
+def Shadow : DiagGroup<"shadow", [ShadowFieldInConstructorModified]>;
+def ShadowAll : DiagGroup<"shadow-all", [Shadow, ShadowFieldInConstructor]>;
+
 def Shorten64To32 : DiagGroup<"shorten-64-to-32">;
 def : DiagGroup<"sign-promo">;
 def SignCompare : DiagGroup<"sign-compare">;
@@ -617,6 +641,7 @@
     CharSubscript,
     Comment,
     DeleteNonVirtualDtor,
+    ForLoopAnalysis,
     Format,
     Implicit,
     InfiniteRecursion,
@@ -756,6 +781,7 @@
 def MicrosoftDefaultArgRedefinition :
     DiagGroup<"microsoft-default-arg-redefinition">;
 def MicrosoftTemplate : DiagGroup<"microsoft-template">;
+def MicrosoftInconsistentDllImport : DiagGroup<"inconsistent-dllimport">;
 def MicrosoftRedeclareStatic : DiagGroup<"microsoft-redeclare-static">;
 def MicrosoftEnumForwardReference :
     DiagGroup<"microsoft-enum-forward-reference">;
@@ -782,7 +808,10 @@
      MicrosoftRedeclareStatic, MicrosoftEnumForwardReference, MicrosoftGoto,
      MicrosoftFlexibleArray, MicrosoftExtraQualification, MicrosoftCast,
      MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag,
-     MicrosoftCommentPaste, MicrosoftEndOfFile]>;
+     MicrosoftCommentPaste, MicrosoftEndOfFile,
+     MicrosoftInconsistentDllImport]>;
+
+def ClangClPch : DiagGroup<"clang-cl-pch">;
 
 def ObjCNonUnifiedException : DiagGroup<"objc-nonunified-exceptions">;
 
@@ -816,6 +845,7 @@
 def SourceUsesOpenMP : DiagGroup<"source-uses-openmp">;
 def OpenMPClauses : DiagGroup<"openmp-clauses">;
 def OpenMPLoopForm : DiagGroup<"openmp-loop-form">;
+def OpenMPTarget : DiagGroup<"openmp-target">;
 
 // Backend warnings.
 def BackendInlineAsm : DiagGroup<"inline-asm">;
@@ -847,3 +877,9 @@
 def InvalidOrNonExistentDirectory : DiagGroup<"invalid-or-nonexistent-directory">;
 
 def OptionIgnored : DiagGroup<"option-ignored">;
+
+def UnknownArgument : DiagGroup<"unknown-argument">;
+
+// A warning group for warnings about code that clang accepts when
+// compiling OpenCL C/C++ but which is not compatible with the SPIR spec.
+def SpirCompat : DiagGroup<"spir-compat">;
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index 2fc9664..604d51d 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -175,10 +175,17 @@
 def err_exponent_has_no_digits : Error<"exponent has no digits">;
 def ext_imaginary_constant : Extension<
   "imaginary constants are a GNU extension">, InGroup<GNUImaginaryConstant>;
-def err_hexconstant_requires: Error<
-  "hexadecimal floating constants require %select{an exponent|a significand}0">;
-def ext_hexconstant_invalid : Extension<
+def err_hex_constant_requires : Error<
+  "hexadecimal floating %select{constant|literal}0 requires "
+  "%select{an exponent|a significand}1">;
+def ext_hex_constant_invalid : Extension<
   "hexadecimal floating constants are a C99 feature">, InGroup<C99>;
+def ext_hex_literal_invalid : Extension<
+  "hexadecimal floating literals are a C++1z feature">, InGroup<CXX1z>;
+def warn_cxx1z_hex_literal : Warning<
+  "hexadecimal floating literals are incompatible with "
+  "C++ standards before C++1z">,
+  InGroup<CXXPre1zCompatPedantic>, DefaultIgnore;
 def ext_binary_literal : Extension<
   "binary integer literals are a GNU extension">, InGroup<GNUBinaryLiteral>;
 def ext_binary_literal_cxx14 : Extension<
@@ -267,6 +274,14 @@
   "whitespace required after macro name">;
 def warn_missing_whitespace_after_macro_name : Warning<
   "whitespace recommended after macro name">;
+
+class NonportablePath  : Warning<
+  "non-portable path to file '%0'; specified path differs in case from file"
+  " name on disk">;
+def pp_nonportable_path : NonportablePath,
+  InGroup<DiagGroup<"nonportable-include-path">>;
+def pp_nonportable_system_path : NonportablePath, DefaultIgnore,
+  InGroup<DiagGroup<"nonportable-system-include-path">>;
   
 def pp_pragma_once_in_main_file : Warning<"#pragma once in main file">,
   InGroup<DiagGroup<"pragma-once-outside-header">>;
@@ -380,7 +395,7 @@
 def err_pp_duplicate_name_in_arg_list : Error<
   "duplicate macro parameter name %0">;
 def err_pp_stringize_not_parameter : Error<
-  "'#' is not followed by a macro parameter">;
+  "'%select{#|#@}0' is not followed by a macro parameter">;
 def err_pp_malformed_ident : Error<"invalid #ident directive">;
 def err_pp_unterminated_conditional : Error<
   "unterminated conditional directive">;
@@ -394,6 +409,7 @@
 def err_pp_expected_eol : Error<
   "expected end of line in preprocessor expression">;
 def err_pp_expected_after : Error<"missing %1 after %0">;
+def err_pp_nested_paren : Error<"nested parentheses not permitted in %0">;
 def err_pp_colon_without_question : Error<"':' without preceding '?'">;
 def err_pp_division_by_zero : Error<
   "division by zero in preprocessor expression">;
@@ -401,6 +417,8 @@
   "remainder by zero in preprocessor expression">;
 def err_pp_expr_bad_token_binop : Error<
   "token is not a valid binary operator in a preprocessor subexpression">;
+def err_pp_expr_bad_token_lparen : Error<
+  "function-like macro %0 is not defined">;
 def err_pp_expr_bad_token_start_expr : Error<
   "invalid token at start of a preprocessor expression">;
 def err_pp_invalid_poison : Error<"can only poison identifier tokens">;
@@ -409,8 +427,6 @@
 def err_feature_check_malformed : Error<
   "builtin feature check macro requires a parenthesized identifier">;
 
-def err_warning_check_malformed : Error<
-  "builtin warning check macro requires a parenthesized string">;
 def warn_has_warning_invalid_option :
    ExtWarn<"__has_warning expected option name (e.g. \"-Wundef\")">,
    InGroup<MalformedWarningCheck>;
@@ -658,6 +674,13 @@
 def note_header_guard : Note<
   "%0 is defined here; did you mean %1?">;
 
+def warn_defined_in_object_type_macro : Warning<
+  "macro expansion producing 'defined' has undefined behavior">,
+  InGroup<ExpansionToDefined>;
+def warn_defined_in_function_type_macro : Extension<
+  "macro expansion producing 'defined' has undefined behavior">,
+  InGroup<ExpansionToDefined>;
+
 let CategoryName = "Nullability Issue" in {
 
 def err_pp_assume_nonnull_syntax : Error<"expected 'begin' or 'end'">;
diff --git a/include/clang/Basic/DiagnosticParseKinds.td b/include/clang/Basic/DiagnosticParseKinds.td
index 4e0cc79..fb3db57 100644
--- a/include/clang/Basic/DiagnosticParseKinds.td
+++ b/include/clang/Basic/DiagnosticParseKinds.td
@@ -27,6 +27,8 @@
   "MS-style inline assembly is not available: %0">;
 def err_gnu_inline_asm_disabled : Error<
   "GNU-style inline assembly is disabled">;
+def err_asm_goto_not_supported_yet : Error<
+  "'asm goto' constructs are not supported yet">;
 }
 
 let CategoryName = "Parse Issue" in {
@@ -353,6 +355,10 @@
 def err_expected_coloncolon_after_super : Error<
   "expected '::' after '__super'">;
 
+def ext_decomp_decl_empty : ExtWarn<
+  "ISO C++1z does not allow a decomposition group to be empty">,
+  InGroup<DiagGroup<"empty-decomposition">>;
+
 /// Objective-C parser diagnostics
 def err_expected_minus_or_plus : Error<
   "method type specifier must start with '-' or '+'">;
@@ -510,6 +516,11 @@
   "unexpected %0 in function call; perhaps remove the %0?">;
 def err_super_in_using_declaration : Error<
   "'__super' cannot be used with a using declaration">;
+def ext_constexpr_if : ExtWarn<
+  "constexpr if is a C++1z extension">, InGroup<CXX1z>;
+def warn_cxx14_compat_constexpr_if : Warning<
+  "constexpr if is incompatible with C++ standards before C++1z">,
+  DefaultIgnore, InGroup<CXXPre1zCompat>;
 
 // C++ derived classes
 def err_dup_virtual : Error<"duplicate 'virtual' in base specifier">;
@@ -553,6 +564,14 @@
   "attribute '%0' cannot be used as an attribute pack">;
 def err_cxx11_attribute_repeated : Error<
   "attribute %0 cannot appear multiple times in an attribute specifier">;
+def warn_cxx14_compat_using_attribute_ns : Warning<
+  "default scope specifier for attributes is incompatible with C++ standards "
+  "before C++1z">, InGroup<CXXPre1zCompat>, DefaultIgnore;
+def ext_using_attribute_ns : ExtWarn<
+  "default scope specifier for attributes is a C++1z extension">,
+  InGroup<CXX1z>;
+def err_using_attribute_ns_conflict : Error<
+  "attribute with scope specifier cannot follow default scope specifier">;
 def err_attributes_not_allowed : Error<"an attribute list cannot appear here">;
 def err_l_square_l_square_not_attribute : Error<
   "C++11 only allows consecutive left square brackets when "
@@ -765,7 +784,19 @@
   InGroup<CXX98Compat>, DefaultIgnore;
 def err_lambda_missing_parens : Error<
   "lambda requires '()' before %select{'mutable'|return type|"
-  "attribute specifier}0">;
+  "attribute specifier|'constexpr'}0">;
+def err_lambda_decl_specifier_repeated : Error<
+  "%select{'mutable'|'constexpr'}0 cannot appear multiple times in a lambda declarator">;
+// C++1z lambda expressions
+def err_expected_star_this_capture : Error<
+  "expected 'this' following '*' in lambda capture list">;
+
+// C++1z constexpr lambda expressions
+def warn_cxx14_compat_constexpr_on_lambda : Warning<
+  "constexpr on lambda expressions is incompatible with C++ standards before C++1z">,
+  InGroup<CXXPre1zCompat>, DefaultIgnore;
+def ext_constexpr_on_lambda_cxx1z : ExtWarn<
+  "'constexpr' on lambda expressions is a C++1z extension">, InGroup<CXX1z>;
 
 // Availability attribute
 def err_expected_version : Error<
@@ -796,6 +827,19 @@
   "'unavailable' availability overrides all other availability information">,
   InGroup<Availability>;
 
+// @available(...)
+def err_avail_query_expected_platform_name : Error<
+  "expected a platform name here">;
+
+def err_avail_query_unrecognized_platform_name : Error<
+  "unrecognized platform name %0">;
+def err_availability_query_wildcard_required: Error<
+  "must handle potential future platforms with '*'">;
+def err_availability_query_repeated_platform: Error<
+  "version for '%0' already specified">;
+def err_availability_query_repeated_star : Error<
+  "'*' query has already been specified">;
+
 // Type safety attributes
 def err_type_safety_unknown_flag : Error<
   "invalid comparison flag %0; use 'layout_compatible' or 'must_be_null'">;
@@ -902,6 +946,9 @@
 def err_pragma_optimize_extra_argument : Error<
   "unexpected extra argument '%0' to '#pragma clang optimize'">;
 
+def err_opencl_unroll_hint_on_non_loop : Error<
+  "OpenCL only supports 'opencl_unroll_hint' attribute on for, while, and do statements">;
+
 // OpenCL EXTENSION pragma (OpenCL 1.1 [9.1])
 def warn_pragma_expected_colon : Warning<
   "missing ':' after %0 - ignoring">, InGroup<IgnoredPragmas>;
@@ -909,10 +956,16 @@
   "expected 'enable' or 'disable' - ignoring">, InGroup<IgnoredPragmas>;
 def warn_pragma_unknown_extension : Warning<
   "unknown OpenCL extension %0 - ignoring">, InGroup<IgnoredPragmas>;
+def warn_pragma_unsupported_extension : Warning<
+  "unsupported OpenCL extension %0 - ignoring">, InGroup<IgnoredPragmas>;
+def warn_pragma_extension_is_core : Warning<
+  "OpenCL extension %0 is core feature or supported optional core feature - ignoring">, InGroup<DiagGroup<"pedantic-core-features">>, DefaultIgnore;
 
-// OpenCL error
+// OpenCL errors.
 def err_opencl_taking_function_address_parser : Error<
   "taking address of function is not allowed">;
+def err_opencl_logical_exclusive_or : Error<
+  "^^ is a reserved operator in OpenCL">;
 
 // OpenMP support.
 def warn_pragma_omp_ignored : Warning<
@@ -934,22 +987,32 @@
   "'#pragma omp %0' %select{|with '%2' clause }1cannot be an immediate substatement">;
 def err_omp_expected_identifier_for_critical : Error<
   "expected identifier specifying the name of the 'omp critical' directive">;
+def err_omp_expected_reduction_identifier : Error<
+  "expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'">;
+def err_omp_decl_in_declare_simd : Error<
+  "function declaration is expected after 'declare simd' directive">;
 def err_omp_unknown_map_type : Error<
   "incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'">;
 def err_omp_unknown_map_type_modifier : Error<
   "incorrect map type modifier, expected 'always'">;
 def err_omp_map_type_missing : Error<
   "missing map type">;
+def err_omp_declare_simd_inbranch_notinbranch : Error<
+  "unexpected '%0' clause, '%1' is specified already">;
+def err_expected_end_declare_target : Error<
+  "expected '#pragma omp end declare target'">;
+def err_omp_declare_target_unexpected_clause: Error<
+  "unexpected '%0' clause, only 'to' or 'link' clauses expected">;
 
 // Pragma loop support.
 def err_pragma_loop_missing_argument : Error<
   "missing argument; expected %select{an integer value|"
-  "'enable', %select{'assume_safety'|'full'}1 or 'disable'}0">;
+  "'enable'%select{|, 'full'}1%select{|, 'assume_safety'}2 or 'disable'}0">;
 def err_pragma_loop_invalid_option : Error<
   "%select{invalid|missing}0 option%select{ %1|}0; expected vectorize, "
-  "vectorize_width, interleave, interleave_count, unroll, or unroll_count">;
+  "vectorize_width, interleave, interleave_count, unroll, unroll_count, or distribute">;
 def err_pragma_invalid_keyword : Error<
-  "invalid argument; expected 'enable', %select{'assume_safety'|'full'}0 or 'disable'">;
+  "invalid argument; expected 'enable'%select{|, 'full'}0%select{|, 'assume_safety'}1 or 'disable'">;
 
 // Pragma unroll support.
 def warn_pragma_unroll_cuda_value_in_parens : Warning<
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 2129bcc..110d9fa 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -30,23 +30,6 @@
   InGroup<ForLoopAnalysis>, DefaultIgnore;
 def note_loop_iteration_here : Note<"%select{decremented|incremented}0 here">;
 
-def warn_for_range_const_reference_copy : Warning<
-  "loop variable %0 "
-  "%diff{has type $ but is initialized with type $"
-  "| is initialized with a value of a different type}1,2 resulting in a copy">,
-  InGroup<RangeLoopAnalysis>, DefaultIgnore;
-def note_use_type_or_non_reference : Note<
-  "use non-reference type %0 to keep the copy or type %1 to prevent copying">;
-def warn_for_range_variable_always_copy : Warning<
-  "loop variable %0 is always a copy because the range of type %1 does not "
-  "return a reference">,
-  InGroup<RangeLoopAnalysis>, DefaultIgnore;
-def note_use_non_reference_type : Note<"use non-reference type %0">;
-def warn_for_range_copy : Warning<
-  "loop variable %0 of type %1 creates a copy from type %2">,
-  InGroup<RangeLoopAnalysis>, DefaultIgnore;
-def note_use_reference_type : Note<"use reference type %0 to prevent copying">;
-
 def warn_duplicate_enum_values : Warning<
   "element %0 has been implicitly assigned %1 which another element has "
   "been assigned">, InGroup<DiagGroup<"duplicate-enum">>, DefaultIgnore;
@@ -74,6 +57,10 @@
   "all paths through this function will call itself">,
   InGroup<InfiniteRecursion>, DefaultIgnore;
 
+def warn_comma_operator : Warning<"possible misuse of comma operator here">,
+  InGroup<DiagGroup<"comma">>, DefaultIgnore;
+def note_cast_to_void : Note<"cast expression to void to silence warning">;
+
 // Constant expressions
 def err_expr_not_ice : Error<
   "expression is not an %select{integer|integral}0 constant expression">;
@@ -88,10 +75,12 @@
   "conversion from %0 to %1 in converted constant expression would "
   "bind reference to a temporary">;
 def err_expr_not_cce : Error<
-  "%select{case value|enumerator value|non-type template argument|array size}0 "
+  "%select{case value|enumerator value|non-type template argument|"
+  "array size|constexpr if condition}0 "
   "is not a constant expression">;
 def ext_cce_narrowing : ExtWarn<
-  "%select{case value|enumerator value|non-type template argument|array size}0 "
+  "%select{case value|enumerator value|non-type template argument|"
+  "array size|constexpr if condition}0 "
   "%select{cannot be narrowed from type %2 to %3|"
   "evaluates to %2, which cannot be narrowed to type %3}1">,
   InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure;
@@ -124,13 +113,14 @@
   InGroup<LiteralRange>;
 def warn_double_const_requires_fp64 : Warning<
   "double precision constant requires cl_khr_fp64, casting to single precision">;
+def err_half_const_requires_fp16 : Error<
+  "half precision constant requires cl_khr_fp16">;
 
 // C99 variable-length arrays
 def ext_vla : Extension<"variable length arrays are a C99 feature">,
   InGroup<VLAExtension>;
 def warn_vla_used : Warning<"variable length array used">,
   InGroup<VLA>, DefaultIgnore;
-def err_vla_non_pod : Error<"variable length array of non-POD element type %0">;
 def err_vla_in_sfinae : Error<
   "variable length array cannot be formed during template argument deduction">;
 def err_array_star_in_function_definition : Error<
@@ -349,12 +339,16 @@
 def warn_use_out_of_scope_declaration : Warning<
   "use of out-of-scope declaration of %0">;
 def err_inline_non_function : Error<
-  "'inline' can only appear on functions">;
+  "'inline' can only appear on functions%select{| and non-local variables}0">;
 def err_noreturn_non_function : Error<
   "'_Noreturn' can only appear on functions">;
 def warn_qual_return_type : Warning< 
   "'%0' type qualifier%s1 on return type %plural{1:has|:have}1 no effect">,
   InGroup<IgnoredQualifiers>, DefaultIgnore;
+def warn_deprecated_redundant_constexpr_static_def : Warning<
+  "out-of-line definition of constexpr static data member is redundant "
+  "in C++17 and is deprecated">,
+  InGroup<Deprecated>, DefaultIgnore;
 
 def warn_decl_shadow :
   Warning<"declaration shadows a %select{"
@@ -363,6 +357,68 @@
           "static data member of %2|"
           "field of %2}1">,
   InGroup<Shadow>, DefaultIgnore;
+def warn_ctor_parm_shadows_field:
+  Warning<"constructor parameter %0 shadows the field %1 of %2">,
+  InGroup<ShadowFieldInConstructor>, DefaultIgnore;
+def warn_modifying_shadowing_decl :
+  Warning<"modifying constructor parameter %0 that shadows a "
+          "field of %1">,
+  InGroup<ShadowFieldInConstructorModified>, DefaultIgnore;
+
+// C++ decomposition declarations
+def err_decomp_decl_context : Error<
+  "decomposition declaration not permitted in this context">;
+def warn_cxx14_compat_decomp_decl : Warning<
+  "decomposition declarations are incompatible with "
+  "C++ standards before C++1z">, DefaultIgnore, InGroup<CXXPre1zCompat>;
+def ext_decomp_decl : ExtWarn<
+  "decomposition declarations are a C++1z extension">, InGroup<CXX1z>;
+def err_decomp_decl_spec : Error<
+  "decomposition declaration cannot be declared "
+  "%plural{1:'%1'|:with '%1' specifiers}0">;
+def err_decomp_decl_type : Error<
+  "decomposition declaration cannot be declared with type %0; "
+  "declared type must be 'auto' or reference to 'auto'">;
+def err_decomp_decl_parens : Error<
+  "decomposition declaration cannot be declared with parentheses">;
+def err_decomp_decl_template : Error<
+  "decomposition declaration template not supported">;
+def err_decomp_decl_not_alone : Error<
+  "decomposition declaration must be the only declaration in its group">;
+def err_decomp_decl_requires_init : Error<
+  "decomposition declaration %0 requires an initializer">;
+def err_decomp_decl_paren_init : Error<
+  "decomposition declaration %0 cannot have a parenthesized initializer">;
+def err_decomp_decl_wrong_number_bindings : Error<
+  "type %0 decomposes into %2 elements, but %select{only |}3%1 "
+  "names were provided">;
+def err_decomp_decl_unbindable_type : Error<
+  "cannot decompose %select{union|non-class, non-array}1 type %2">;
+def err_decomp_decl_multiple_bases_with_members : Error<
+  "cannot decompose class type %1: "
+  "%select{its base classes %2 and|both it and its base class}0 %3 "
+  "have non-static data members">;
+def err_decomp_decl_ambiguous_base : Error<
+  "cannot decompose members of ambiguous base class %1 of %0:%2">;
+def err_decomp_decl_non_public_base : Error<
+  "cannot decompose members of non-public base class %1 of %0">;
+def err_decomp_decl_non_public_member : Error<
+  "cannot decompose non-public member %0 of %1">;
+def err_decomp_decl_anon_union_member : Error<
+  "cannot decompose class type %0 because it has an anonymous "
+  "%select{struct|union}1 member">;
+def err_decomp_decl_std_tuple_element_not_specialized : Error<
+  "cannot decompose this type; 'std::tuple_element<%0>::type' "
+  "does not name a type">;
+def err_decomp_decl_std_tuple_size_not_constant : Error<
+  "cannot decompose this type; 'std::tuple_size<%0>::value' "
+  "is not a valid integral constant expression">;
+def note_in_binding_decl_init : Note<
+  "in implicit initialization of binding declaration %0">;
+
+def err_std_type_trait_not_class_template : Error<
+  "unsupported standard library implementation: "
+  "'std::%0' is not a class template">;
 
 // C++ using declarations
 def err_using_requires_qualname : Error<
@@ -379,27 +435,19 @@
   "using declaration refers into '%0', which is not a base class of %1">;
 def err_using_decl_constructor_not_in_direct_base : Error<
   "%0 is not a direct base of %1, cannot inherit constructors">;
-def err_using_decl_constructor_conflict : Error<
-  "cannot inherit constructor, already inherited constructor with "
-  "the same signature">;
-def note_using_decl_constructor_conflict_current_ctor : Note<
-  "conflicting constructor">;
-def note_using_decl_constructor_conflict_previous_ctor : Note<
-  "previous constructor">;
-def note_using_decl_constructor_conflict_previous_using : Note<
-  "previously inherited here">;
-def warn_using_decl_constructor_ellipsis : Warning<
-  "inheriting constructor does not inherit ellipsis">,
-  InGroup<DiagGroup<"inherited-variadic-ctor">>;
-def note_using_decl_constructor_ellipsis : Note<
-  "constructor declared with ellipsis here">;
 def err_using_decl_can_not_refer_to_class_member : Error<
   "using declaration cannot refer to class member">;
+def err_ambiguous_inherited_constructor : Error<
+  "constructor of %0 inherited from multiple base class subobjects">;
+def note_ambiguous_inherited_constructor_using : Note<
+  "inherited from base class %0 here">;
 def note_using_decl_class_member_workaround : Note<
-  "use %select{an alias declaration|a typedef declaration|a reference}0 "
-  "instead">;
+  "use %select{an alias declaration|a typedef declaration|a reference|"
+  "a const variable|a constexpr variable}0 instead">;
 def err_using_decl_can_not_refer_to_namespace : Error<
-  "using declaration cannot refer to namespace">;
+  "using declaration cannot refer to a namespace">;
+def err_using_decl_can_not_refer_to_scoped_enum : Error<
+  "using declaration cannot refer to a scoped enumerator">;
 def err_using_decl_constructor : Error<
   "using declaration cannot refer to a constructor">;
 def warn_cxx98_compat_using_decl_constructor : Warning<
@@ -411,7 +459,7 @@
   "using declaration cannot refer to a template specialization">;
 def note_using_decl_target : Note<"target of using declaration">;
 def note_using_decl_conflict : Note<"conflicting declaration">;
-def err_using_decl_redeclaration : Error<"redeclaration of using decl">;
+def err_using_decl_redeclaration : Error<"redeclaration of using declaration">;
 def err_using_decl_conflict : Error<
   "target of using declaration conflicts with declaration already in scope">;
 def err_using_decl_conflict_reverse : Error<
@@ -617,8 +665,8 @@
   "declaring variable of type %0 is not allowed">;
 def err_opencl_half_param : Error<
   "declaring function parameter of type %0 is not allowed; did you forget * ?">;
-def err_opencl_half_return : Error<
-  "declaring function return value of type %0 is not allowed; did you forget * ?">;
+def err_opencl_invalid_return : Error<
+  "declaring function return value of type %0 is not allowed%select{; did you forget * ?|}1">;
 def warn_enum_value_overflow : Warning<"overflow in enumeration value">;
 def warn_pragma_options_align_reset_failed : Warning<
   "#pragma options align=reset failed: %0">,
@@ -1088,6 +1136,12 @@
   "static_assert with no message is incompatible with C++ standards before C++1z">,
   DefaultIgnore, InGroup<CXXPre1zCompat>;
 
+def ext_inline_variable : ExtWarn<
+  "inline variables are a C++1z extension">, InGroup<CXX1z>;
+def warn_cxx14_compat_inline_variable : Warning<
+  "inline variables are incompatible with C++ standards before C++1z">,
+  DefaultIgnore, InGroup<CXXPre1zCompat>;
+
 def warn_inline_namespace_reopened_noninline : Warning<
   "inline namespace cannot be reopened as a non-inline namespace">;
 def err_inline_namespace_mismatch : Error<
@@ -1227,6 +1281,8 @@
 def err_incomplete_in_exception_spec : Error<
   "%select{|pointer to |reference to }0incomplete type %1 is not allowed "
   "in exception specification">;
+def ext_incomplete_in_exception_spec : ExtWarn<err_incomplete_in_exception_spec.Text>,
+  InGroup<MicrosoftExceptionSpec>;
 def err_rref_in_exception_spec : Error<
   "rvalue reference type %0 is not allowed in exception specification">;
 def err_mismatched_exception_spec : Error<
@@ -1426,11 +1482,13 @@
   "assignment operator|move assignment operator|destructor}0 for %1 first "
   "required here">;
 def note_inhctor_synthesized_at : Note<
-  "inheriting constructor for %0 first required here">;
+  "inherited constructor for %0 first required here">;
 def err_missing_default_ctor : Error<
-  "%select{|implicit default |inheriting }0constructor for %1 must explicitly "
-  "initialize the %select{base class|member}2 %3 which does not have a default "
-  "constructor">;
+  "%select{constructor for %1 must explicitly initialize the|"
+  "implicit default constructor for %1 must explicitly initialize the|"
+  "cannot use constructor inherited from base class %4;}0 "
+  "%select{base class|member}2 %3 %select{which|which|of %1}0 "
+  "does not have a default constructor">;
 def note_due_to_dllexported_class : Note<
   "due to '%0' being dllexported%select{|; try compiling in C++11 mode}1">;
 
@@ -1680,7 +1738,7 @@
   "variable %0 may be uninitialized when "
   "%select{used here|captured by block}1">,
   InGroup<UninitializedMaybe>, DefaultIgnore;
-def note_uninit_var_def : Note<"variable %0 is declared here">;
+def note_var_declared_here : Note<"variable %0 is declared here">;
 def note_uninit_var_use : Note<
   "%select{uninitialized use occurs|variable is captured by block}0 here">;
 def warn_uninit_byref_blockvar_captured_by_block : Warning<
@@ -1754,6 +1812,9 @@
 def err_auto_variable_cannot_appear_in_own_initializer : Error<
   "variable %0 declared with %select{'auto'|'decltype(auto)'|'__auto_type'}1 "
   "type cannot appear in its own initializer">;
+def err_binding_cannot_appear_in_own_initializer : Error<
+  "binding %0 cannot appear in the initializer of its own "
+  "decomposition declaration">;
 def err_illegal_decl_array_of_auto : Error<
   "'%0' declared as array of %1">;
 def err_new_array_of_auto : Error<
@@ -1922,8 +1983,12 @@
   "cannot use type %0 as an iterator">;
 def err_for_range_member_begin_end_mismatch : Error<
   "range type %0 has '%select{begin|end}1' member but no '%select{end|begin}1' member">;
-def err_for_range_begin_end_types_differ : Error<
-  "'begin' and 'end' must return the same type (got %0 and %1)">;
+def ext_for_range_begin_end_types_differ : ExtWarn<
+  "'begin' and 'end' returning different types (%0 and %1) is a C++1z extension">,
+  InGroup<CXX1z>;
+def warn_for_range_begin_end_types_differ : Warning<
+  "'begin' and 'end' returning different types (%0 and %1) is incompatible "
+  "with C++ standards before C++1z">, InGroup<CXXPre1zCompat>, DefaultIgnore;
 def note_in_for_range: Note<
   "when looking up '%select{begin|end}0' function for range expression "
   "of type %1">;
@@ -1940,6 +2005,22 @@
   "in implicit call to 'operator%select{!=|*|++}0' for iterator of type %1">;
 def note_for_range_begin_end : Note<
   "selected '%select{begin|end}0' %select{function|template }1%2 with iterator type %3">;
+def warn_for_range_const_reference_copy : Warning<
+  "loop variable %0 "
+  "%diff{has type $ but is initialized with type $"
+  "| is initialized with a value of a different type}1,2 resulting in a copy">,
+  InGroup<RangeLoopAnalysis>, DefaultIgnore;
+def note_use_type_or_non_reference : Note<
+  "use non-reference type %0 to keep the copy or type %1 to prevent copying">;
+def warn_for_range_variable_always_copy : Warning<
+  "loop variable %0 is always a copy because the range of type %1 does not "
+  "return a reference">,
+  InGroup<RangeLoopAnalysis>, DefaultIgnore;
+def note_use_non_reference_type : Note<"use non-reference type %0">;
+def warn_for_range_copy : Warning<
+  "loop variable %0 of type %1 creates a copy from type %2">,
+  InGroup<RangeLoopAnalysis>, DefaultIgnore;
+def note_use_reference_type : Note<"use reference type %0 to prevent copying">;
 
 // C++11 constexpr
 def warn_cxx98_compat_constexpr : Warning<
@@ -2084,6 +2165,16 @@
   "'%select{thread_local|inline|friend|constexpr}1'">;
 def err_function_concept_with_params : Error<
   "function concept cannot have any parameters">;
+def err_function_concept_bool_ret : Error<
+  "declared return type of function concept must be 'bool'">;
+def err_variable_concept_bool_decl : Error<
+  "declared type of variable concept must be 'bool'">;
+def err_concept_specified_specialization : Error<
+  "'concept' cannot be applied on an "
+  "%select{explicit instantiation|explicit specialization|partial specialization}0">;
+def err_concept_specialized : Error<
+  "%select{function|variable}0 concept cannot be "
+  "%select{explicitly instantiated|explicitly specialized|partially specialized}1">;
 
 // C++11 char16_t/char32_t
 def warn_cxx98_compat_unicode_type : Warning<
@@ -2096,6 +2187,10 @@
 def err_integer_sequence_integral_element_type : Error<
   "integer sequences must have integral element type">;
 
+// __type_pack_element
+def err_type_pack_element_out_of_bounds : Error<
+  "a parameter pack may not be accessed at an out of bounds index">;
+
 // Objective-C++
 def err_objc_decls_may_only_appear_in_global_scope : Error<
   "Objective-C declarations may only appear in global scope">;
@@ -2118,6 +2213,10 @@
 def err_attribute_invalid_vector_type : Error<"invalid vector element type %0">;
 def err_attribute_bad_neon_vector_size : Error<
   "Neon vector size must be 64 or 128 bits">;
+def err_attribute_requires_positive_integer : Error<
+  "%0 attribute requires a positive integral compile time constant expression">;
+def err_attribute_requires_opencl_version : Error<
+  "%0 attribute requires OpenCL version %1%select{| or above}2">;
 def warn_unsupported_target_attribute
     : Warning<"Ignoring unsupported '%0' in the target attribute string">,
     InGroup<IgnoredAttributes>;
@@ -2329,7 +2428,10 @@
   "requested alignment must be %0 bytes or smaller">;
 def warn_redeclaration_without_attribute_prev_attribute_ignored : Warning<
   "%q0 redeclared without %1 attribute: previous %1 ignored">,
-  InGroup<DiagGroup<"inconsistent-dllimport">>;
+  InGroup<MicrosoftInconsistentDllImport>;
+def warn_redeclaration_without_import_attribute : Warning<
+  "%q0 redeclared without 'dllimport' attribute: 'dllexport' attribute added">,
+  InGroup<MicrosoftInconsistentDllImport>;
 def warn_dllimport_dropped_from_inline_function : Warning<
   "%q0 redeclared inline; %1 attribute ignored">,
   InGroup<IgnoredAttributes>;
@@ -2349,8 +2451,10 @@
 def warn_unhandled_ms_attribute_ignored : Warning<
   "__declspec attribute %0 is not supported">, 
   InGroup<IgnoredAttributes>;
-def err_attribute_invalid_on_stmt : Error<
+def err_decl_attribute_invalid_on_stmt : Error<
   "%0 attribute cannot be applied to a statement">;
+def err_stmt_attribute_invalid_on_decl : Error<
+  "%0 attribute cannot be applied to a declaration">;
 def warn_declspec_attribute_ignored : Warning<
   "attribute %0 is ignored, place it after "
   "\"%select{class|struct|interface|union|enum}1\" to apply attribute to "
@@ -2441,32 +2545,41 @@
 def err_alias_not_supported_on_darwin : Error <
   "only weak aliases are supported on darwin">;
 def err_alias_to_undefined : Error<
-  "alias must point to a defined variable or function">;
+  "%select{alias|ifunc}0 must point to a defined %select{variable or |}1function">;
 def warn_alias_to_weak_alias : Warning<
-  "alias will always resolve to %0 even if weak definition of alias %1 is overridden">,
+  "%select{alias|ifunc}2 will always resolve to %0 even if weak definition of %1 is overridden">,
   InGroup<IgnoredAttributes>;
 def warn_alias_with_section : Warning<
-  "alias will not be in section '%0' but in the same section as the aliasee">,
+  "%select{alias|ifunc}1 will not be in section '%0' but in the same section as the %select{aliasee|resolver}2">,
   InGroup<IgnoredAttributes>;
 def err_duplicate_mangled_name : Error<
   "definition with same mangled name as another definition">;
 def err_cyclic_alias : Error<
-  "alias definition is part of a cycle">;
+  "%select{alias|ifunc}0 definition is part of a cycle">;
+def err_ifunc_resolver_return : Error<
+  "ifunc resolver function must return a pointer">;
+def err_ifunc_resolver_params : Error<
+  "ifunc resolver function must have no parameters">;
 def warn_attribute_wrong_decl_type : Warning<
   "%0 attribute only applies to %select{functions|unions|"
-  "variables and functions|functions and methods|parameters|"
+  "variables and functions|"
+  "functions, variables, and Objective-C interfaces|"
+  "functions and methods|parameters|"
   "functions, methods and blocks|functions, methods, and classes|"
   "functions, methods, and parameters|classes|enums|variables|methods|"
-  "variables, functions and labels|fields and global variables|structs|"
-  "variables and typedefs|thread-local variables|"
-  "variables and fields|variables, data members and tag types|"
+  "fields and global variables|structs|parameters and typedefs|variables and typedefs|"
+  "thread-local variables|variables and fields|variables, data members and tag types|"
   "types and namespaces|Objective-C interfaces|methods and properties|"
   "struct or union|struct, union or class|types|"
   "Objective-C instance methods|init methods of interface or class extension declarations|"
-  "variables, functions and classes|Objective-C protocols|"
+  "variables, functions and classes|"
+  "functions, variables, classes, and Objective-C interfaces|"
+  "Objective-C protocols|"
   "functions and global variables|structs, unions, and typedefs|structs and typedefs|"
   "interface or protocol declarations|kernel functions|non-K&R-style functions|"
-  "variables, fields and typedefs}1">,
+  "variables, enums, fields and typedefs|functions, methods, enums, and classes|"
+  "structs, classes, variables, functions, and inline namespaces|"
+  "variables, functions, methods, types, enumerations, enumerators, labels, and non-static data members}1">,
   InGroup<IgnoredAttributes>;
 def err_attribute_wrong_decl_type : Error<warn_attribute_wrong_decl_type.Text>;
 def warn_type_attribute_wrong_type : Warning<
@@ -2476,9 +2589,6 @@
 def warn_incomplete_encoded_type : Warning<
   "encoding of %0 type is incomplete because %1 component has unknown encoding">,
   InGroup<DiagGroup<"encode-type">>;
-def warn_attribute_requires_functions_or_static_globals : Warning<
-  "%0 only applies to variables with static storage duration and functions">,
-  InGroup<IgnoredAttributes>;
 def warn_gnu_inline_attribute_requires_inline : Warning<
   "'gnu_inline' attribute requires function to be marked 'inline',"
   " attribute ignored">,
@@ -2545,12 +2655,27 @@
   InGroup<Availability>;
 def note_overridden_method : Note<
   "overridden method is here">;
-def warn_availability_swift_unavailable_only : Warning<
-  "only 'unavailable' is supported for Swift availability">,
+def warn_availability_swift_unavailable_deprecated_only : Warning<
+  "only 'unavailable' and 'deprecated' are supported for Swift availability">,
   InGroup<Availability>;
 def note_protocol_method : Note<
   "protocol method is here">;
 
+def warn_unguarded_availability :
+  Warning<"%0 is only available on %1 %2 or newer">,
+  InGroup<UnguardedAvailability>, DefaultIgnore;
+def warn_partial_availability : Warning<"%0 is only available conditionally">,
+    InGroup<UnguardedAvailability>, DefaultIgnore;
+def note_partial_availability_silence : Note<
+  "explicitly redeclare %0 to silence this warning">;
+def note_unguarded_available_silence : Note<
+  "enclose %0 in an @available check to silence this warning">;
+def warn_partial_message : Warning<"%0 is partial: %1">,
+    InGroup<UnguardedAvailability>, DefaultIgnore;
+def warn_partial_fwdclass_message : Warning<
+    "%0 may be partial because the receiver type is unknown">,
+    InGroup<UnguardedAvailability>, DefaultIgnore;
+
 // Thread Safety Attributes
 def warn_invalid_capability_name : Warning<
   "invalid capability name '%0'; capability name must be 'mutex' or 'role'">,
@@ -2717,9 +2842,6 @@
 def warn_impcast_double_promotion : Warning<
   "implicit conversion increases floating-point precision: %0 to %1">,
   InGroup<DoublePromotion>, DefaultIgnore;
-def warn_impcast_float_integer : Warning<
-  "implicit conversion turns floating-point number into integer: %0 to %1">,
-  InGroup<FloatConversion>, DefaultIgnore;
 def warn_impcast_integer_sign : Warning<
   "implicit conversion changes signedness: %0 to %1">,
   InGroup<SignConversion>, DefaultIgnore;
@@ -2738,9 +2860,22 @@
 def warn_impcast_bitfield_precision_constant : Warning<
   "implicit truncation from %2 to bitfield changes value from %0 to %1">,
   InGroup<BitFieldConstantConversion>;
+
 def warn_impcast_literal_float_to_integer : Warning<
   "implicit conversion from %0 to %1 changes value from %2 to %3">,
   InGroup<LiteralConversion>;
+def warn_impcast_float_integer : Warning<
+  "implicit conversion turns floating-point number into integer: %0 to %1">,
+  InGroup<FloatConversion>, DefaultIgnore;
+
+def warn_impcast_float_to_integer : Warning<
+  "implicit conversion of out of range value from %0 to %1 changes value "
+  "from %2 to %3">,
+  InGroup<FloatOverflowConversion>, DefaultIgnore;
+def warn_impcast_float_to_integer_zero : Warning<
+  "implicit conversion from %0 to %1 changes non-zero value from %2 to %3">,
+  InGroup<FloatZeroConversion>, DefaultIgnore;
+
 def warn_impcast_string_literal_to_bool : Warning<
   "implicit conversion turns string literal into bool: %0 to %1">,
   InGroup<StringConversion>, DefaultIgnore;
@@ -2879,6 +3014,8 @@
   InGroup<DeprecatedAttributes>;
 def err_complex_mode_vector_type : Error<
   "type of machine mode does not support base vector types">;
+def err_enum_mode_vector_type : Error<
+  "mode %0 is not supported for enumeration types">;
 def warn_attribute_nonnull_no_pointers : Warning<
   "'nonnull' attribute applied to function with no pointer arguments">,
   InGroup<IgnoredAttributes>;
@@ -2888,6 +3025,8 @@
 def warn_attribute_noescape_non_pointer : Warning<
   "'noescape' attribute ignored on parameter of non-pointer type %0">,
   InGroup<IgnoredAttributes>;
+def note_declared_nonnull : Note<
+  "declared %select{'returns_nonnull'|'nonnull'}0 here">;
 def warn_attribute_sentinel_named_arguments : Warning<
   "'sentinel' attribute requires named arguments">,
   InGroup<IgnoredAttributes>;
@@ -3114,7 +3253,9 @@
   "non-static %select{reference|const}1 member %2 cannot use copy "
   "assignment operator">;
 def err_uninitialized_member_in_ctor : Error<
-  "%select{|implicit default |inheriting }0constructor for %1 must explicitly "
+  "%select{constructor for %1|"
+  "implicit default constructor for %1|"
+  "cannot use constructor inherited from %1:}0 must explicitly "
   "initialize the %select{reference|const}2 member %3">;
 def err_default_arg_makes_ctor_special : Error<
   "addition of default argument on redeclaration makes this constructor a "
@@ -3124,6 +3265,7 @@
   "use of default argument to function %0 that is declared later in class %1">;
 def note_default_argument_declared_here : Note<
   "default argument declared here">;
+def err_recursive_default_argument : Error<"recursive evaluation of default argument">;
 
 def ext_param_promoted_not_compatible_with_prototype : ExtWarn<
   "%diff{promoted type $ of K&R function parameter is not compatible with the "
@@ -3162,7 +3304,8 @@
     "is the implicit move constructor|"
     "is the implicit copy assignment operator|"
     "is the implicit move assignment operator|"
-    "is an inherited constructor}0%1"
+    "inherited constructor|"
+    "inherited constructor }0%1"
     "%select{| has different class%diff{ (expected $ but has $)|}3,4"
     "| has different number of parameters (expected %3 but has %4)"
     "| has type mismatch at %ordinal3 parameter"
@@ -3174,7 +3317,8 @@
     "%select{none|const|restrict|const and restrict|volatile|const and volatile"
     "|volatile and restrict|const, volatile, and restrict}4)}2">;
 
-def note_ovl_candidate_inherited_constructor : Note<"inherited from here">;
+def note_ovl_candidate_inherited_constructor : Note<
+    "constructor from base class %0 inherited here">;
 def note_ovl_candidate_illegal_constructor : Note<
     "candidate %select{constructor|template}0 ignored: "
     "instantiation %select{takes|would take}0 its own class type by value">;
@@ -3234,7 +3378,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0 %select{|template }1"
+    "inherited constructor|"
+    "inherited constructor}0 %select{|template }1"
     "not viable: requires%select{ at least| at most|}2 %3 argument%s3, but %4 "
     "%plural{1:was|:were}4 provided">;
 
@@ -3245,7 +3390,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0 %select{|template }1not viable: "
+    "inherited constructor|"
+    "inherited constructor}0 %select{|template }1not viable: "
     "%select{requires at least|allows at most single|requires single}2 "
     "argument %3, but %plural{0:no|:%4}4 arguments were provided">;
 
@@ -3257,7 +3403,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 has been "
+    "inherited constructor|"
+    "inherited constructor }0%1 has been "
     "%select{explicitly made unavailable|explicitly deleted|"
     "implicitly deleted}2">;
 
@@ -3274,9 +3421,15 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 "
+    "inherited constructor|"
+    "inherited constructor }0%1 "
     "not viable: cannot convert argument of incomplete type "
-    "%diff{$ to $|to parameter type}2,3">;
+    "%diff{$ to $|to parameter type}2,3 for "
+    "%select{%ordinal5 argument|object argument}4"
+    "%select{|; dereference the argument with *|"
+    "; take the address of the argument with &|"
+    "; remove *|"
+    "; remove &}6">;
 def note_ovl_candidate_bad_list_argument : Note<"candidate "
     "%select{function|function|constructor|"
     "function |function |constructor |"
@@ -3285,7 +3438,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 "
+    "inherited constructor|"
+    "inherited constructor }0%1 "
     "not viable: cannot convert initializer list argument to %3">;
 def note_ovl_candidate_bad_overload : Note<"candidate "
     "%select{function|function|constructor|"
@@ -3295,7 +3449,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1"
+    "inherited constructor|"
+    "inherited constructor }0%1"
     " not viable: no overload of %3 matching %2 for %ordinal4 argument">;
 def note_ovl_candidate_bad_conv : Note<"candidate "
     "%select{function|function|constructor|"
@@ -3305,7 +3460,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1"
+    "inherited constructor|"
+    "inherited constructor }0%1"
     " not viable: no known conversion "
     "%diff{from $ to $|from argument type to parameter type}2,3 for "
     "%select{%ordinal5 argument|object argument}4"
@@ -3321,7 +3477,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1"
+    "inherited constructor|"
+    "inherited constructor }0%1"
     " not viable: cannot implicitly convert argument "
     "%diff{of type $ to $|type to parameter type}2,3 for "
     "%select{%ordinal5 argument|object argument}4 under ARC">;
@@ -3333,7 +3490,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1"
+    "inherited constructor|"
+    "inherited constructor }0%1"
     " not viable: expects an l-value for "
     "%select{%ordinal3 argument|object argument}2">;
 def note_ovl_candidate_bad_addrspace : Note<"candidate "
@@ -3344,7 +3502,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 not viable: "
+    "inherited constructor|"
+    "inherited constructor }0%1 not viable: "
     "%select{%ordinal6|'this'}5 argument (%2) is in "
     "address space %3, but parameter must be in address space %4">;
 def note_ovl_candidate_bad_gc : Note<"candidate "
@@ -3355,7 +3514,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 not viable: "
+    "inherited constructor|"
+    "inherited constructor }0%1 not viable: "
     "%select{%ordinal6|'this'}5 argument (%2) has %select{no|__weak|__strong}3 "
     "ownership, but parameter has %select{no|__weak|__strong}4 ownership">;
 def note_ovl_candidate_bad_ownership : Note<"candidate "
@@ -3366,7 +3526,8 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 not viable: "
+    "inherited constructor|"
+    "inherited constructor }0%1 not viable: "
     "%select{%ordinal6|'this'}5 argument (%2) has "
     "%select{no|__unsafe_unretained|__strong|__weak|__autoreleasing}3 ownership,"
     " but parameter has %select{no|__unsafe_unretained|__strong|__weak|"
@@ -3374,7 +3535,7 @@
 def note_ovl_candidate_bad_cvr_this : Note<"candidate "
     "%select{|function|||function|||||"
     "function (the implicit copy assignment operator)|"
-    "function (the implicit move assignment operator)|}0 not viable: "
+    "function (the implicit move assignment operator)||}0 not viable: "
     "'this' argument has type %2, but method is not marked "
     "%select{const|restrict|const or restrict|volatile|const or volatile|"
     "volatile or restrict|const, volatile, or restrict}3">;
@@ -3386,11 +3547,23 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1 not viable: "
+    "inherited constructor|"
+    "inherited constructor }0%1 not viable: "
     "%ordinal4 argument (%2) would lose "
     "%select{const|restrict|const and restrict|volatile|const and volatile|"
     "volatile and restrict|const, volatile, and restrict}3 qualifier"
     "%select{||s||s|s|s}3">;
+def note_ovl_candidate_bad_unaligned : Note<"candidate "
+    "%select{function|function|constructor|"
+    "function |function |constructor |"
+    "constructor (the implicit default constructor)|"
+    "constructor (the implicit copy constructor)|"
+    "constructor (the implicit move constructor)|"
+    "function (the implicit copy assignment operator)|"
+    "function (the implicit move assignment operator)|"
+    "inherited constructor|"
+    "inherited constructor }0%1 not viable: "
+    "%ordinal4 argument (%2) would lose __unaligned qualifier">;
 def note_ovl_candidate_bad_base_to_derived_conv : Note<"candidate "
     "%select{function|function|constructor|"
     "function |function |constructor |"
@@ -3399,20 +3572,23 @@
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0%1"
-    " not viable: cannot %select{convert from|convert from|bind}2 "
+    "inherited constructor|"
+    "inherited constructor }0%1 not viable: "
+    "cannot %select{convert from|convert from|bind}2 "
     "%select{base class pointer|superclass|base class object of type}2 %3 to "
     "%select{derived class pointer|subclass|derived class reference}2 %4 for "
     "%ordinal5 argument">;
 def note_ovl_candidate_bad_target : Note<
     "candidate %select{function|function|constructor|"
-    "function |function |constructor |"
+    "function|function|constructor|"
     "constructor (the implicit default constructor)|"
     "constructor (the implicit copy constructor)|"
     "constructor (the implicit move constructor)|"
     "function (the implicit copy assignment operator)|"
     "function (the implicit move assignment operator)|"
-    "constructor (inherited)}0 not viable: call to "
+    "inherited constructor|"
+    "inherited constructor}0 not viable: "
+    "call to "
     "%select{__device__|__global__|__host__|__host__ __device__|invalid}1 function from"
     " %select{__device__|__global__|__host__|__host__ __device__|invalid}2 function">;
 def note_implicit_member_target_infer_collision : Note<
@@ -3616,8 +3792,8 @@
   "unnamed type used in template argument was declared here">;
 def err_template_arg_overload_type : Error<
   "template argument is the type of an unresolved overloaded function">;
-def err_template_arg_not_class_template : Error<
-  "template argument does not refer to a class template or template "
+def err_template_arg_not_valid_template : Error<
+  "template argument does not refer to a class or alias template, or template "
   "template parameter">;
 def note_template_arg_refers_here_func : Note<
   "template argument refers to function template %0, here">;
@@ -3728,6 +3904,8 @@
   "class template">;
 def note_specialized_entity : Note<
   "explicitly specialized declaration is here">;
+def note_explicit_specialization_declared_here : Note<
+  "explicit specialization declared here">;
 def err_template_spec_decl_function_scope : Error<
   "explicit specialization of %0 in function scope">;
 def err_template_spec_decl_class_scope : Error<
@@ -3850,6 +4028,8 @@
 def note_partial_spec_match : Note<"partial specialization matches %0">;
 def err_partial_spec_redeclared : Error<
   "class template partial specialization %0 cannot be redeclared">;
+def note_partial_specialization_declared_here : Note<
+  "explicit specialization declared here">;
 def note_prev_partial_spec_here : Note<
   "previous declaration of class template partial specialization %0 is here">;
 def err_partial_spec_fully_specialized : Error<
@@ -3916,7 +4096,18 @@
   "in instantiation of template type alias %0 requested here">;
 def note_template_exception_spec_instantiation_here : Note<
   "in instantiation of exception specification for %0 requested here">;
-  
+def warn_var_template_missing : Warning<"instantiation of variable %q0 "
+  "required here, but no definition is available">,
+  InGroup<UndefinedVarTemplate>;
+def warn_func_template_missing : Warning<"instantiation of function %q0 "
+  "required here, but no definition is available">,
+  InGroup<UndefinedFuncTemplate>, DefaultIgnore;
+def note_forward_template_decl : Note<
+  "forward declaration of template entity is here">;
+def note_inst_declaration_hint : Note<"add an explicit instantiation "
+  "declaration to suppress this warning if %q0 is explicitly instantiated in "
+  "another translation unit">;
+
 def note_default_arg_instantiation_here : Note<
   "in instantiation of default argument for '%0' required here">;
 def note_default_function_arg_instantiation_here : Note<
@@ -4170,15 +4361,6 @@
 def note_not_found_by_two_phase_lookup : Note<"%0 should be declared prior to the "
     "call site%select{| or in %2| or in an associated namespace of one of its arguments}1">;
 def err_undeclared_use : Error<"use of undeclared %0">;
-def warn_partial_availability : Warning<"%0 is only available conditionally">,
-    InGroup<PartialAvailability>, DefaultIgnore;
-def note_partial_availability_silence : Note<
-  "explicitly redeclare %0 to silence this warning">;
-def warn_partial_message : Warning<"%0 is partial: %1">,
-    InGroup<PartialAvailability>, DefaultIgnore;
-def warn_partial_fwdclass_message : Warning<
-    "%0 may be partial because the receiver type is unknown">,
-    InGroup<PartialAvailability>, DefaultIgnore;
 def warn_deprecated : Warning<"%0 is deprecated">,
     InGroup<DeprecatedDeclarations>;
 def warn_property_method_deprecated :
@@ -4207,10 +4389,6 @@
   "%select{unavailable|deleted|deprecated|partial}1 here">;
 def note_implicitly_deleted : Note<
   "explicitly defaulted function was implicitly deleted here">;
-def note_inherited_deleted_here : Note<
-  "deleted constructor was inherited here">;
-def note_cannot_inherit : Note<
-  "constructor cannot be inherited">;
 def warn_not_enough_argument : Warning<
   "not enough variable arguments in %0 declaration to fit a sentinel">,
   InGroup<Sentinel>;
@@ -4233,7 +4411,7 @@
 def err_alias_after_tentative :
   Error<"alias definition of %0 after tentative definition">;
 def err_alias_is_definition :
-  Error<"definition %0 cannot also be an alias">;
+  Error<"definition %0 cannot also be an %select{alias|ifunc}1">;
 def err_definition_of_implicitly_declared_member : Error<
   "definition of implicitly declared %select{default constructor|copy "
   "constructor|move constructor|copy assignment operator|move assignment "
@@ -4245,24 +4423,36 @@
 def err_redefinition_extern_inline : Error<
   "redefinition of a 'extern inline' function %0 is not supported in "
   "%select{C99 mode|C++}1">;
+def warn_attr_abi_tag_namespace : Warning<
+  "'abi_tag' attribute on %select{non-inline|anonymous}0 namespace ignored">,
+  InGroup<IgnoredAttributes>;
+def err_abi_tag_on_redeclaration : Error<
+  "cannot add 'abi_tag' attribute in a redeclaration">;
+def err_new_abi_tag_on_redeclaration : Error<
+  "'abi_tag' %0 missing in original declaration">;
 
 def note_deleted_dtor_no_operator_delete : Note<
   "virtual destructor requires an unambiguous, accessible 'operator delete'">;
 def note_deleted_special_member_class_subobject : Note<
-  "%select{default constructor|copy constructor|move constructor|"
-  "copy assignment operator|move assignment operator|destructor}0 of "
+  "%select{default constructor of|copy constructor of|move constructor of|"
+  "copy assignment operator of|move assignment operator of|destructor of|"
+  "constructor inherited by}0 "
   "%1 is implicitly deleted because "
   "%select{base class %3|%select{||||variant }4field %3}2 has "
   "%select{no|a deleted|multiple|an inaccessible|a non-trivial}4 "
   "%select{%select{default constructor|copy constructor|move constructor|copy "
-  "assignment operator|move assignment operator|destructor}0|destructor}5"
+  "assignment operator|move assignment operator|destructor|"
+  "%select{default|corresponding|default|default|default}4 constructor}0|"
+  "destructor}5"
   "%select{||s||}4">;
 def note_deleted_default_ctor_uninit_field : Note<
-  "default constructor of %0 is implicitly deleted because field %1 of "
-  "%select{reference|const-qualified}3 type %2 would not be initialized">;
+  "%select{default constructor of|constructor inherited by}0 "
+  "%1 is implicitly deleted because field %2 of "
+  "%select{reference|const-qualified}4 type %3 would not be initialized">;
 def note_deleted_default_ctor_all_const : Note<
-  "default constructor of %0 is implicitly deleted because all "
-  "%select{data members|data members of an anonymous union member}1"
+  "%select{default constructor of|constructor inherited by}0 "
+  "%1 is implicitly deleted because all "
+  "%select{data members|data members of an anonymous union member}2"
   " are const-qualified">;
 def note_deleted_copy_ctor_rvalue_reference : Note<
   "copy constructor of %0 is implicitly deleted because field %1 is of "
@@ -4280,6 +4470,7 @@
   InGroup<DiagGroup<"undefined-internal">>;
 def warn_undefined_inline : Warning<"inline function %q0 is not defined">,
   InGroup<DiagGroup<"undefined-inline">>;
+def err_undefined_inline_var : Error<"inline variable %q0 is not defined">;
 def note_used_here : Note<"used here">;
 
 def err_internal_linkage_redeclaration : Error<
@@ -4618,6 +4809,10 @@
   "jump bypasses initialization of VLA typedef">;
 def note_protected_by_vla_type_alias : Note<
   "jump bypasses initialization of VLA type alias">;
+def note_protected_by_constexpr_if : Note<
+  "jump enters controlled statement of constexpr if">;
+def note_protected_by_if_available : Note<
+  "jump enters controlled statement of if available">;
 def note_protected_by_vla : Note<
   "jump bypasses initialization of variable length array">;
 def note_protected_by_objc_try : Note<
@@ -5272,8 +5467,6 @@
   "arithmetic on%select{ a|}0 pointer%select{|s}0 to void">;
 def err_typecheck_decl_incomplete_type : Error<
   "variable has incomplete type %0">;
-def err_typecheck_decl_incomplete_type___float128 : Error<
-  "support for type '__float128' is not yet implemented">;
 def ext_typecheck_decl_incomplete_type : ExtWarn<
   "tentative definition of variable with internal linkage has incomplete non-array type %0">,
   InGroup<DiagGroup<"tentative-definition-incomplete-type">>;
@@ -5358,13 +5551,20 @@
   "ISO C++ does not allow indirection on operand of type %0">,
   InGroup<DiagGroup<"void-ptr-dereference">>;
 def warn_indirection_through_null : Warning<
-  "indirection of non-volatile null pointer will be deleted, not trap">, InGroup<NullDereference>;
+  "indirection of non-volatile null pointer will be deleted, not trap">,
+  InGroup<NullDereference>;
+def warn_binding_null_to_reference : Warning<
+  "binding dereferenced null pointer to reference has undefined behavior">,
+  InGroup<NullDereference>;
 def note_indirection_through_null : Note<
   "consider using __builtin_trap() or qualifying pointer with 'volatile'">;
 def warn_pointer_indirection_from_incompatible_type : Warning<
   "dereference of type %1 that was reinterpret_cast from type %0 has undefined "
   "behavior">,
   InGroup<UndefinedReinterpretCast>, DefaultIgnore;
+def warn_taking_address_of_packed_member : Warning<
+  "taking address of packed member %0 of class or structure %q1 may result in an unaligned pointer value">,
+  InGroup<DiagGroup<"address-of-packed-member">>;
 
 def err_objc_object_assignment : Error<
   "cannot assign to class object (%0 invalid)">;
@@ -5402,7 +5602,9 @@
   "composite pointer type %2">, InGroup<CompareDistinctPointerType>;
 def err_typecheck_op_on_nonoverlapping_address_space_pointers : Error<
   "%select{comparison between %diff{ ($ and $)|}0,1"
-  "|arithmetic operation with operands of type %diff{ ($ and $)|}0,1}2"
+  "|arithmetic operation with operands of type %diff{ ($ and $)|}0,1"
+  "|conditional operator with the second and third operands of type "
+  "%diff{ ($ and $)|}0,1}2"
   " which are pointers to non-overlapping address spaces">;
 
 def err_typecheck_assign_const : Error<
@@ -5882,6 +6084,8 @@
   "cannot catch reference to incomplete type %0">;
 def err_catch_incomplete : Error<"cannot catch incomplete type %0">;
 def err_catch_rvalue_ref : Error<"cannot catch exceptions by rvalue reference">;
+def err_catch_variably_modified : Error<
+  "cannot catch variably modified type %0">;
 def err_qualified_catch_declarator : Error<
   "exception declarator cannot be qualified">;
 def err_early_catch_all : Error<"catch-all handler must come last">;
@@ -6040,6 +6244,13 @@
     "cannot deduce type for lambda capture %0 from initializer of type %2">;
   def err_init_capture_deduction_failure_from_init_list : Error<
     "cannot deduce type for lambda capture %0 from initializer list">;
+
+  // C++1z '*this' captures.
+  def warn_cxx14_compat_star_this_lambda_capture : Warning<
+    "by value capture of '*this' is incompatible with C++ standards before C++1z">,
+    InGroup<CXXPre1zCompat>, DefaultIgnore;
+  def ext_star_this_lambda_capture_cxx1z : ExtWarn<
+    "capture of '*this' by copy is a C++1z extension">, InGroup<CXX1z>;
 }
 
 def err_return_in_captured_stmt : Error<
@@ -6273,6 +6484,24 @@
   "; remove *|"
   "; remove &}3">,
   InGroup<IncompatiblePointerTypes>;
+def ext_typecheck_convert_incompatible_function_pointer : ExtWarn<
+  "incompatible function pointer types "
+  "%select{%diff{assigning to $ from $|assigning to different types}0,1"
+  "|%diff{passing $ to parameter of type $|"
+  "passing to parameter of different type}0,1"
+  "|%diff{returning $ from a function with result type $|"
+  "returning from function with different return type}0,1"
+  "|%diff{converting $ to type $|converting between types}0,1"
+  "|%diff{initializing $ with an expression of type $|"
+  "initializing with expression of different type}0,1"
+  "|%diff{sending $ to parameter of type $|"
+  "sending to parameter of different type}0,1"
+  "|%diff{casting $ to type $|casting between types}0,1}2"
+  "%select{|; dereference with *|"
+  "; take the address with &|"
+  "; remove *|"
+  "; remove &}3">,
+  InGroup<IncompatibleFunctionPointerTypes>;
 def ext_typecheck_convert_discards_qualifiers : ExtWarn<
   "%select{%diff{assigning to $ from $|assigning to different types}0,1"
   "|%diff{passing $ to parameter of type $|"
@@ -6505,9 +6734,13 @@
   "pointer, or a vector of such types (%0 invalid)">;
 
 def err_deleted_function_use : Error<"attempt to use a deleted function">;
+def err_deleted_inherited_ctor_use : Error<
+  "constructor inherited by %0 from base class %1 is implicitly deleted">;
 
 def err_kern_type_not_void_return : Error<
   "kernel function type %0 must have void return type">;
+def err_kern_is_nonstatic_method : Error<
+  "kernel function %0 must be a free function or static member function">;
 def err_config_scalar_return : Error<
   "CUDA special function 'cudaConfigureCall' must have scalar return type">;
 def err_kern_call_not_global_function : Error<
@@ -6517,10 +6750,34 @@
 def err_ref_bad_target : Error<
   "reference to %select{__device__|__global__|__host__|__host__ __device__}0 "
   "function %1 in %select{__device__|__global__|__host__|__host__ __device__}2 function">;
-def warn_host_calls_from_host_device : Warning<
-  "calling __host__ function %0 from __host__ __device__ function %1 can lead to runtime errors">,
+def err_ref_bad_target_global_initializer : Error<
+  "reference to %select{__device__|__global__|__host__|__host__ __device__}0 "
+  "function %1 in global initializer">;
+def warn_kern_is_method : Extension<
+  "kernel function %0 is a member function; this may not be accepted by nvcc">,
   InGroup<CudaCompat>;
-
+def warn_kern_is_inline : Warning<
+  "ignored 'inline' attribute on kernel function %0">,
+  InGroup<CudaCompat>;
+def err_variadic_device_fn : Error<
+  "CUDA device code does not support variadic functions">;
+def err_va_arg_in_device : Error<
+  "CUDA device code does not support va_arg">;
+def err_alias_not_supported_on_nvptx : Error<"CUDA does not support aliases">;
+def err_cuda_unattributed_constexpr_cannot_overload_device : Error<
+  "constexpr function '%0' without __host__ or __device__ attributes cannot "
+  "overload __device__ function with same signature.  Add a __host__ "
+  "attribute, or build with -fno-cuda-host-device-constexpr.">;
+def note_cuda_conflicting_device_function_declared_here : Note<
+  "conflicting __device__ function declared here">;
+def err_dynamic_var_init : Error<
+    "dynamic initialization is not supported for "
+    "__device__, __constant__, and __shared__ variables.">;
+def err_shared_var_init : Error<
+    "initialization is not supported for __shared__ variables.">;
+def err_device_static_local_var : Error<
+    "Within a __device__/__global__ function, "
+    "only __shared__ variables may be marked \"static\"">;
 def warn_non_pod_vararg_with_format_string : Warning<
   "cannot pass %select{non-POD|non-trivial}0 object of type %1 to variadic "
   "%select{function|block|method|constructor}2; expected type from format "
@@ -6586,7 +6843,13 @@
 def warn_function_def_in_objc_container : Warning<
   "function definition inside an Objective-C container is deprecated">,
   InGroup<FunctionDefInObjCContainer>;
-  
+
+def warn_cast_calling_conv : Warning<
+  "cast between incompatible calling conventions '%0' and '%1'; "
+  "calls through this pointer may abort at runtime">,
+  InGroup<DiagGroup<"cast-calling-convention">>;
+def note_change_calling_conv_fixit : Note<
+  "consider defining %0 with the '%1' calling convention">;
 def warn_bad_function_cast : Warning<
   "cast from function call of type %0 to non-matching type %1">,
   InGroup<BadFunctionCast>, DefaultIgnore;
@@ -6631,12 +6894,17 @@
   "expression with side effects will be evaluated despite being used as an "
   "operand to 'typeid'">, InGroup<PotentiallyEvaluatedExpression>;
 def warn_unused_result : Warning<
-  "ignoring return value of function declared with warn_unused_result "
-  "attribute">, InGroup<DiagGroup<"unused-result">>;
+  "ignoring return value of function declared with %0 attribute">,
+  InGroup<DiagGroup<"unused-result">>;
 def warn_unused_volatile : Warning<
   "expression result unused; assign into a variable to force a volatile load">,
   InGroup<DiagGroup<"unused-volatile-lvalue">>;
 
+def ext_cxx14_attr : Extension<
+  "use of the %0 attribute is a C++14 extension">, InGroup<CXX14>;
+def ext_cxx1z_attr : Extension<
+  "use of the %0 attribute is a C++1z extension">, InGroup<CXX1z>;
+
 def warn_unused_comparison : Warning<
   "%select{%select{|in}1equality|relational}0 comparison result unused">,
   InGroup<UnusedComparison>;
@@ -6853,14 +7121,9 @@
   InGroup<MicrosoftAnonTag>;
 
 // C++ local classes
-def err_reference_to_local_var_in_enclosing_function : Error<
-  "reference to local variable %0 declared in enclosing function %1">;
-def err_reference_to_local_var_in_enclosing_block : Error<
-  "reference to local variable %0 declared in enclosing block literal">;
-def err_reference_to_local_var_in_enclosing_lambda : Error<
-  "reference to local variable %0 declared in enclosing lambda expression">;
-def err_reference_to_local_var_in_enclosing_context : Error<
-  "reference to local variable %0 declared in enclosing context">;
+def err_reference_to_local_in_enclosing_context : Error<
+  "reference to local %select{variable|binding}1 %0 declared in enclosing "
+  "%select{%3|block literal|lambda expression|context}2">;
 
 def err_static_data_member_not_allowed_in_local_class : Error<
   "static data member %0 not allowed in local class %1">; 
@@ -6962,9 +7225,16 @@
   "non-namespace scope '%0' cannot have a literal operator member">;
 def err_literal_operator_default_argument : Error<
   "literal operator cannot have a default argument">;
-// FIXME: This diagnostic sucks
-def err_literal_operator_params : Error<
-  "parameter declaration for literal operator %0 is not valid">;
+def err_literal_operator_bad_param_count : Error<
+  "non-template literal operator must have one or two parameters">;
+def err_literal_operator_invalid_param : Error<
+  "parameter of literal operator must have type 'unsigned long long', 'long double', 'char', 'wchar_t', 'char16_t', 'char32_t', or 'const char *'">;
+def err_literal_operator_param : Error<
+  "invalid literal operator parameter type %0, did you mean %1?">;
+def err_literal_operator_template_with_params : Error<
+  "literal operator template cannot have any parameters">;
+def err_literal_operator_template : Error<
+  "template parameter list for literal operator must be either 'char...' or 'typename T, T...'">;
 def err_literal_operator_extern_c : Error<
   "literal operator must have C++ linkage">;
 def ext_string_literal_operator_template : ExtWarn<
@@ -7280,7 +7550,7 @@
 
 // Blocks
 def err_blocks_disable : Error<"blocks support disabled - compile with -fblocks"
-  " or pick a deployment target that supports them">;
+  " or %select{pick a deployment target that supports them|for OpenCL 2.0 or above}0">;
 def err_block_returning_array_function : Error<
   "block cannot return %select{array|function}0 type %1">;
 
@@ -7356,16 +7626,15 @@
 def note_insert_break_fixit : Note<
   "insert 'break;' to avoid fall-through">;
 def err_fallthrough_attr_wrong_target : Error<
-  "clang::fallthrough attribute is only allowed on empty statements">;
+  "%0 attribute is only allowed on empty statements">;
 def note_fallthrough_insert_semi_fixit : Note<"did you forget ';'?">;
 def err_fallthrough_attr_outside_switch : Error<
   "fallthrough annotation is outside switch statement">;
-def warn_fallthrough_attr_invalid_placement : Warning<
-  "fallthrough annotation does not directly precede switch label">,
-  InGroup<ImplicitFallthrough>;
+def err_fallthrough_attr_invalid_placement : Error<
+  "fallthrough annotation does not directly precede switch label">;
 def warn_fallthrough_attr_unreachable : Warning<
   "fallthrough annotation in unreachable code">,
-  InGroup<ImplicitFallthrough>;
+  InGroup<ImplicitFallthrough>, DefaultIgnore;
 
 def warn_unreachable_default : Warning<
   "default label in switch which covers all enumeration values">,
@@ -7600,8 +7869,8 @@
   "feature, not permitted in C++">;
  def err_type_requires_extension : Error<
   "use of type %0 requires %1 extension to be enabled">;
-def err_int128_unsupported : Error<
-  "__int128 is not supported on this target">;
+def err_type_unsupported : Error<
+  "%0 is not supported on this target">;
 def err_nsconsumed_attribute_mismatch : Error<
   "overriding method has mismatched ns_consumed attribute on its"
   " parameter">;
@@ -7610,9 +7879,10 @@
   " attributes">;
 
 def err_nserrordomain_not_tagdecl : Error<
-  "ns_error_domain attribute only valid on enum/struct/union/class">;
+  "ns_error_domain attribute only valid on "
+  "%select{enums, structs, and unions|enums, structs, unions, and classes}0">;
 def err_nserrordomain_invalid_decl : Error<
-  "domain argument %0 not valid top-level declaration">;
+  "domain argument %0 does not refer to global constant">;
 def err_nserrordomain_requires_identifier : Error<
   "domain argument must be an identifier">;
   
@@ -7677,6 +7947,8 @@
   "%0 does not have a member named %1; did you mean %2?">;
 def err_property_not_found_suggest : Error<
   "property %0 not found on object of type %1; did you mean %2?">;
+def err_class_property_found : Error<
+  "property %0 is a class property; did you mean to access it with class '%1'?">;
 def err_ivar_access_using_property_syntax_suggest : Error<
   "property %0 not found on object of type %1; did you mean to access instance variable %2?">;
 def warn_property_access_suggest : Warning<
@@ -7736,9 +8008,6 @@
 def err_asm_naked_parm_ref : Error<
   "parameter references not allowed in naked functions">;
 
-def ext_deprecated_attr_is_a_cxx14_extension : ExtWarn<
-  "use of the 'deprecated' attribute is a C++14 extension">, InGroup<CXX14>;
-
 // OpenCL warnings and errors.
 def err_invalid_astype_of_different_size : Error<
   "invalid reinterpretation: sizes of %0 and %1 must match">;
@@ -7748,8 +8017,8 @@
   "kernel parameter cannot be declared as a pointer to a pointer">;
 def err_opencl_private_ptr_kernel_param : Error<
   "kernel parameter cannot be declared as a pointer to the __private address space">;
-def err_opencl_non_kernel_variable : Error<
-  "non-kernel function variable cannot be declared in %0 address space">;
+def err_opencl_function_variable : Error<
+  "%select{non-kernel function|function scope}0 variable cannot be declared in %1 address space">;
 def err_static_function_scope : Error<
   "variables in function scope cannot be declared static">;
 def err_opencl_bitfields : Error<
@@ -7766,18 +8035,24 @@
   "field of illegal %select{type|pointer type}0 %1 declared here">;
 def err_event_t_global_var : Error<
   "the event_t type cannot be used to declare a program scope variable">;
-def err_event_t_struct_field : Error<
-  "the event_t type cannot be used to declare a structure or union field">;
+def err_opencl_type_struct_or_union_field : Error<
+  "the %0 type cannot be used to declare a structure or union field">;
 def err_event_t_addr_space_qual : Error<
   "the event_t type can only be used with __private address space qualifier">;
 def err_expected_kernel_void_return_type : Error<
   "kernel must have void return type">;
+def err_sampler_initializer_not_integer : Error<
+  "sampler_t initialization requires 32-bit integer, not %0">;
+def warn_sampler_initializer_invalid_bits : Warning<
+  "sampler initializer has invalid %0 bits">, InGroup<SpirCompat>, DefaultIgnore;
 def err_sampler_argument_required : Error<
   "sampler_t variable required - got %0">;
 def err_wrong_sampler_addressspace: Error<
   "sampler type cannot be used with the __local and __global address space qualifiers">;
+def error_opencl_cast_non_zero_to_event_t : Error< // NOTE(review): 'error_' prefix differs from the 'err_' prefix of neighbouring defs; a rename must update its users too.
+  "cannot cast non-zero value '%0' to 'event_t'">;
 def err_opencl_global_invalid_addr_space : Error<
-  "program scope variable must reside in %0 address space">;
+  "%select{program scope|static local|extern}0 variable must reside in %1 address space">;
 def err_missing_actual_pipe_type : Error<
   "missing actual type specifier for pipe">;
 def err_reference_pipe_type : Error <
@@ -7794,11 +8069,71 @@
   " in the declaration statement in the program scope">;
 def err_opencl_implicit_vector_conversion : Error<
   "implicit conversions between vector types (%0 and %1) are not permitted">;
+def err_opencl_block_proto_variadic : Error<
+  "invalid block prototype, variadic arguments are not allowed in OpenCL">;
+def err_opencl_invalid_type_array : Error<
+  "array of %0 type is invalid in OpenCL">;
+def err_opencl_ternary_with_block : Error<
+  "block type cannot be used as expression in ternary expression in OpenCL">;
+def err_opencl_pointer_to_type : Error<
+  "pointer to type %0 is invalid in OpenCL">;
+def err_opencl_type_can_only_be_used_as_function_parameter : Error <
+  "type %0 can only be used as a function parameter in OpenCL">;
+def warn_opencl_attr_deprecated_ignored : Warning <
+  "%0 attribute is deprecated and ignored in OpenCL version %1">,
+  InGroup<IgnoredAttributes>;
+
+// OpenCL v2.0 s6.13.6 -- Builtin Pipe Functions
+def err_opencl_builtin_pipe_first_arg : Error<
+  "first argument to %0 must be a pipe type">;
+def err_opencl_builtin_pipe_arg_num : Error<
+  "invalid number of arguments to function: %0">;
+def err_opencl_builtin_pipe_invalid_arg : Error<
+  "invalid argument type to function %0 (expecting %1 having %2)">;
+def err_opencl_builtin_pipe_invalid_access_modifier : Error<
+  "invalid pipe access modifier (expecting %0)">;
+
+// OpenCL access qualifier
+def err_opencl_invalid_access_qualifier : Error<
+  "access qualifier can only be used for pipe and image type">;
+def err_opencl_invalid_read_write : Error< // space lives inside %select so the empty alternative leaves no trailing blank
+  "access qualifier %0 can not be used for %1%select{| prior to OpenCL version 2.0}2">;
+def err_opencl_multiple_access_qualifiers : Error<
+  "multiple access qualifiers">;
+def note_opencl_typedef_access_qualifier : Note<
+  "previously declared '%0' here">;
 
 // OpenCL Section 6.8.g
 def err_opencl_unknown_type_specifier : Error<
-  "OpenCL does not support the '%0' %select{type qualifier|storage class specifier}1">;
+  "OpenCL version %0 does not support the '%1' %select{type qualifier|storage class specifier}2">;
 
+// OpenCL v2.0 s6.12.5 Blocks restrictions
+def err_opencl_block_storage_type : Error<
+  "the __block storage type is not permitted">;
+def err_opencl_invalid_block_declaration : Error<
+  "invalid block variable declaration - must be %select{const qualified|initialized}0">;
+def err_opencl_extern_block_declaration : Error<
+  "invalid block variable declaration - using 'extern' storage class is disallowed">;
+
+// OpenCL v2.0 s6.13.9 - Address space qualifier functions. 
+def err_opencl_builtin_to_addr_arg_num : Error<
+  "invalid number of arguments to function: %0">;
+def err_opencl_builtin_to_addr_invalid_arg : Error<
+  "invalid argument %0 to function: %1, expecting a generic pointer argument">;
+
+// OpenCL v2.0 s6.13.17 Enqueue kernel restrictions.
+def err_opencl_enqueue_kernel_incorrect_args : Error<
+  "illegal call to enqueue_kernel, incorrect argument types">;
+def err_opencl_enqueue_kernel_expected_type : Error<
+  "illegal call to enqueue_kernel, expected %0 argument type">;
+def err_opencl_enqueue_kernel_local_size_args : Error<
+  "mismatch in number of block parameters and local size arguments passed">;
+def err_opencl_enqueue_kernel_invalid_local_size_type : Error<
+  "local memory sizes need to be specified as uint">;
+def err_opencl_enqueue_kernel_blocks_non_local_void_args : Error<
+  "blocks used in device side enqueue are expected to have parameters of type 'local void*'">;
+def err_opencl_enqueue_kernel_blocks_no_args : Error<
+  "blocks in this form of device side enqueue call are expected to have no parameters">;
 } // end of sema category
 
 let CategoryName = "OpenMP Issue" in {
@@ -7812,6 +8147,8 @@
   "arguments of '#pragma omp %0' must have %select{global storage|static storage duration}1">;
 def err_omp_ref_type_arg : Error<
   "arguments of '#pragma omp %0' cannot be of reference type %1">;
+def err_omp_region_not_file_context : Error<
+  "directive must be at file or namespace scope">;
 def err_omp_var_scope : Error<
   "'#pragma omp %0' must appear in the scope of the %q1 variable declaration">;
 def err_omp_var_used : Error<
@@ -7828,10 +8165,28 @@
   "a reduction list item with incomplete type %0">;
 def err_omp_unexpected_clause_value : Error<
   "expected %0 in OpenMP clause '%1'">;
-def err_omp_expected_var_name : Error<
-  "expected variable name">;
-def err_omp_expected_var_name_or_array_item : Error<
-  "expected variable name, array element or array section">;
+def err_omp_expected_var_name_member_expr : Error<
+  "expected variable name%select{| or data member of current class}0">;
+def err_omp_expected_var_name_member_expr_or_array_item : Error<
+  "expected variable name%select{|, data member of current class}0, array element or array section">;
+def err_omp_expected_named_var_member_or_array_expression: Error<
+  "expected expression containing only member accesses and/or array sections based on named variables">;
+def err_omp_bit_fields_forbidden_in_clause : Error<
+  "bit fields cannot be used to specify storage in a '%0' clause">;
+def err_array_section_does_not_specify_contiguous_storage : Error<
+  "array section does not specify contiguous storage">;
+def err_omp_union_type_not_allowed : Error<
+  "mapped storage cannot be derived from a union">;
+def err_omp_expected_access_to_data_field : Error<
+  "expected access to data field">;
+def err_omp_multiple_array_items_in_map_clause : Error<
+  "multiple array elements associated with the same variable are not allowed in map clauses of the same construct">;
+def err_omp_pointer_mapped_along_with_derived_section : Error<
+  "pointer cannot be mapped along with a section derived from itself">;
+def err_omp_original_storage_is_shared_and_does_not_contain : Error<
+  "original storage of expression in data environment is shared but data environment does not fully contain mapped expression storage">;
+def err_omp_same_pointer_derreferenced : Error< // identifier spelling kept as-is so existing users still compile
+  "same pointer dereferenced in multiple different ways in map clause expressions">;
 def note_omp_task_predetermined_firstprivate_here : Note<
   "predetermined as a firstprivate in a task construct here">;
 def err_omp_threadprivate_incomplete_type : Error<
@@ -7869,6 +8224,8 @@
 def err_omp_not_integral : Error<
   "expression must have integral or unscoped enumeration "
   "type, not %0">;
+def err_omp_threadprivate_in_target : Error<
+  "threadprivate variables cannot be used in target constructs">;
 def err_omp_incomplete_type : Error<
   "expression has incomplete class type %0">;
 def err_omp_explicit_conversion : Error<
@@ -7895,12 +8252,23 @@
 def warn_omp_alignment_not_power_of_two : Warning<
   "aligned clause will be ignored because the requested alignment is not a power of 2">,
   InGroup<OpenMPClauses>;
+def err_omp_enclosed_declare_target : Error<
+  "declare target region may not be enclosed within another declare target region">;
+def err_omp_invalid_target_decl : Error<
+  "%0 used in declare target directive is not a variable or a function name">;
+def err_omp_declare_target_multiple : Error<
+  "%0 appears multiple times in clauses on the same declare target directive">;
+def err_omp_declare_target_to_and_link : Error<
+  "%0 must not appear in both clauses 'to' and 'link'">;
+def warn_omp_not_in_target_context : Warning<
+  "declaration is not declared in any declare target region">,
+  InGroup<OpenMPTarget>;
 def err_omp_aligned_expected_array_or_ptr : Error<
   "argument of aligned clause should be array"
   "%select{ or pointer|, pointer, reference to array or reference to pointer}1"
   ", not %0">;
 def err_omp_aligned_twice : Error<
-  "a variable cannot appear in more than one aligned clause">;
+  "%select{a variable|a parameter|'this'}0 cannot appear in more than one aligned clause">;
 def err_omp_local_var_in_threadprivate_init : Error<
   "variable with local storage in initial value of threadprivate variable">;
 def err_omp_loop_not_canonical_init : Error<
@@ -7933,9 +8301,10 @@
   "OpenMP loop iteration variable cannot have more than 64 bits size and will be narrowed">,
   InGroup<OpenMPLoopForm>;
 def err_omp_unknown_reduction_identifier : Error<
-  "incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'">;
-def err_omp_reduction_type_array : Error<
-  "a reduction list item with array type %0">;
+  "incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', "
+  "'&&', '||', 'min' or 'max' or declare reduction for type %0">;
+def err_omp_not_resolved_reduction_identifier : Error<
+  "unable to resolve declare reduction construct for type %0">;
 def err_omp_reduction_ref_type_arg : Error<
   "argument of OpenMP clause 'reduction' must reference the same object in all threads">;
 def err_omp_clause_not_arithmetic_type_arg : Error<
@@ -7944,6 +8313,8 @@
   "arguments of OpenMP clause 'reduction' with bitwise operators cannot be of floating type">;
 def err_omp_once_referenced : Error<
   "variable can appear only once in OpenMP '%0' clause">;
+def err_omp_once_referenced_in_target_update : Error<
+  "variable can appear only once in OpenMP 'target update' construct">;
 def note_omp_referenced : Note<
   "previously referenced here">;
 def err_omp_reduction_in_task : Error<
@@ -8019,12 +8390,19 @@
   "the 'copyprivate' clause must not be used with the 'nowait' clause">;
 def note_omp_nowait_clause_here : Note<
   "'nowait' clause is here">;
+def err_omp_single_decl_in_declare_simd : Error<
+  "single declaration is expected after 'declare simd' directive">;
+def err_omp_function_expected : Error<
+  "'#pragma omp declare simd' can only be applied to functions">;
 def err_omp_wrong_cancel_region : Error<
   "one of 'for', 'parallel', 'sections' or 'taskgroup' is expected">;
 def err_omp_parent_cancel_region_nowait : Error<
   "parent region for 'omp %select{cancellation point/cancel}0' construct cannot be nowait">;
 def err_omp_parent_cancel_region_ordered : Error<
   "parent region for 'omp %select{cancellation point/cancel}0' construct cannot be ordered">;
+def err_omp_reduction_wrong_type : Error<"reduction type cannot be %select{qualified with 'const', 'volatile' or 'restrict'|a function|a reference|an array}0 type">;
+def err_omp_wrong_var_in_declare_reduction : Error<"only %select{'omp_priv' or 'omp_orig'|'omp_in' or 'omp_out'}0 variables are allowed in %select{initializer|combiner}0 expression">;
+def err_omp_declare_reduction_redefinition : Error<"redefinition of user-defined reduction for type %0">;
 def err_omp_array_section_use : Error<"OpenMP array section is not allowed here">;
 def err_omp_typecheck_section_value : Error<
   "subscripted value is not an array or pointer">;
@@ -8036,8 +8414,10 @@
   InGroup<CharSubscript>, DefaultIgnore;
 def err_omp_section_incomplete_type : Error<
   "section of pointer to incomplete type %0">;
-def err_omp_section_negative : Error<
-  "section %select{lower bound|length}0 is evaluated to a negative value %1">;
+def err_omp_section_not_subset_of_array : Error<
+  "array section must be a subset of the original array">;
+def err_omp_section_length_negative : Error<
+  "section length is evaluated to a negative value %0">;
 def err_omp_section_length_undefined : Error<
   "section length is unspecified and cannot be inferred because subscripted value is %select{not an array|an array of unknown bound}0">;
 def err_omp_wrong_linear_modifier : Error<
@@ -8066,12 +8446,16 @@
   "variable already marked as mapped in current construct">;
 def err_omp_not_mappable_type : Error<
   "type %0 is not mappable to target">;
+def err_omp_invalid_map_type_for_directive : Error<
+  "%select{map type '%1' is not allowed|map type must be specified}0 for '#pragma omp %2'">;
+def err_omp_no_map_for_directive : Error<
+  "expected at least one map clause for '#pragma omp %0'">;
 def note_omp_polymorphic_in_target : Note<
   "mappable type cannot be polymorphic">;
 def note_omp_static_member_in_target : Note<
   "mappable type cannot contain static members">;
-def err_omp_threadprivate_in_map : Error<
-  "threadprivate variables are not allowed in map clause">;
+def err_omp_threadprivate_in_clause : Error<
+  "threadprivate variables are not allowed in '%0' clause">;
 def err_omp_wrong_ordered_loop_count : Error<
   "the parameter of the 'ordered' clause must be greater than or equal to the parameter of the 'collapse' clause">;
 def note_collapse_loop_count : Note<
@@ -8096,8 +8480,6 @@
   "reduction variable in '#pragma omp teams' cannot be firstprivate in '#pragma omp distribute'">;
 def err_omp_depend_clause_thread_simd : Error<
   "'depend' clauses cannot be mixed with '%0' clause">;
-def err_omp_depend_sink_wrong_expr : Error<
-  "expected expression form x[+-d], where x is the loop iteration variable and d is a constant non-negative integer">;
 def err_omp_depend_sink_expected_loop_iteration : Error<
   "expected %0 loop iteration variable">;
 def err_omp_depend_sink_unexpected_expr : Error<
@@ -8116,6 +8498,26 @@
   "'schedule' clause with 'nonmonotonic' modifier cannot be specified if an 'ordered' clause is specified">;
 def err_omp_ordered_simd : Error<
   "'ordered' clause with a parameter can not be specified in '#pragma omp %0' directive">;
+def err_omp_variable_in_given_clause_and_dsa : Error<
+  "%0 variable cannot be in a %1 clause in '#pragma omp %2' directive">;
+def err_omp_param_or_this_in_clause : Error<
+  "expected reference to one of the parameters of function %0%select{| or 'this'}1">;
+def err_omp_expected_uniform_param : Error<
+  "expected a reference to a parameter specified in a 'uniform' clause">;
+def err_omp_expected_int_param : Error<
+  "expected a reference to an integer-typed parameter">;
+def err_omp_at_least_one_motion_clause_required : Error<
+  "expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'">;
+def err_omp_usedeviceptr_not_a_pointer : Error<
+  "expected pointer or reference to pointer in 'use_device_ptr' clause">;
+def err_omp_argument_type_isdeviceptr : Error<
+  "expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr' clause">;
+def warn_omp_nesting_simd : Warning<
+  "OpenMP only allows an ordered construct with the simd clause nested in a simd construct">,
+  InGroup<SourceUsesOpenMP>;
+def err_omp_orphaned_device_directive : Error<
+  "orphaned 'omp %0' directives are prohibited"
+  "; perhaps you forgot to enclose the directive in a %select{|||target |teams }1region?">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
@@ -8154,10 +8556,17 @@
   "local %select{struct|interface|union|class|enum}0 cannot be declared "
   "__module_private__">;
 def err_module_unimported_use : Error<
-  "%select{declaration|definition|default argument}0 of %1 must be imported "
+  "%select{declaration|definition|default argument|"
+  "explicit specialization|partial specialization}0 of %1 must be imported "
+  "from module '%2' before it is required">;
+def err_module_unimported_use_header : Error<
+  "missing '#include %3'; "
+  "%select{declaration|definition|default argument|"
+  "explicit specialization|partial specialization}0 of %1 must be imported "
   "from module '%2' before it is required">;
 def err_module_unimported_use_multiple : Error<
-  "%select{declaration|definition|default argument}0 of %1 must be imported "
+  "%select{declaration|definition|default argument|"
+  "explicit specialization|partial specialization}0 of %1 must be imported "
   "from one of the following modules before it is required:%2">;
 def ext_module_import_in_extern_c : ExtWarn<
   "import of C++ module '%0' appears within extern \"C\" language linkage "
@@ -8365,4 +8774,8 @@
   "'%0' qualifier on omitted return type %1 has no effect">,
   InGroup<IgnoredQualifiers>;
 
+def ext_warn_gnu_final : ExtWarn<
+  "__final is a GNU extension, consider using C++11 final">,
+  InGroup<GccCompat>;
+
 } // end of sema component.
diff --git a/include/clang/Basic/FileManager.h b/include/clang/Basic/FileManager.h
index 17758ec..b6a9ca7 100644
--- a/include/clang/Basic/FileManager.h
+++ b/include/clang/Basic/FileManager.h
@@ -52,6 +52,7 @@
 /// descriptor for the file.
 class FileEntry {
   const char *Name;           // Name of the file.
+  std::string RealPathName;   // Real path to the file; could be empty.
   off_t Size;                 // File size in bytes.
   time_t ModTime;             // Modification time of file.
   const DirectoryEntry *Dir;  // Directory file lives in.
@@ -82,6 +83,7 @@
   }
 
   const char *getName() const { return Name; }
+  StringRef tryGetRealPathName() const { return RealPathName; }
   bool isValid() const { return IsValid; }
   off_t getSize() const { return Size; }
   unsigned getUID() const { return UID; }
diff --git a/include/clang/Basic/IdentifierTable.h b/include/clang/Basic/IdentifierTable.h
index d672314..fffb504 100644
--- a/include/clang/Basic/IdentifierTable.h
+++ b/include/clang/Basic/IdentifierTable.h
@@ -62,6 +62,9 @@
                                    // partially) from an AST file.
   bool ChangedAfterLoad       : 1; // True if identifier has changed from the
                                    // definition loaded from an AST file.
+  bool FEChangedAfterLoad     : 1; // True if identifier's frontend information
+                                   // has changed from the definition loaded
+                                   // from an AST file.
   bool RevertedTokenID        : 1; // True if revertTokenIDToIdentifier was
                                    // called.
   bool OutOfDate              : 1; // True if there may be additional
@@ -69,7 +72,7 @@
                                    // stored externally.
   bool IsModulesImport        : 1; // True if this is the 'import' contextual
                                    // keyword.
-  // 30 bit left in 64-bit word.
+  // 29 bits left in 64-bit word.
 
   void *FETokenInfo;               // Managed by the language front-end.
   llvm::StringMapEntry<IdentifierInfo*> *Entry;
@@ -303,6 +306,18 @@
     ChangedAfterLoad = true;
   }
 
+  /// \brief Determine whether the frontend token information for this
+  /// identifier has changed since it was loaded from an AST file.
+  bool hasFETokenInfoChangedSinceDeserialization() const {
+    return FEChangedAfterLoad;
+  }
+  
+  /// \brief Note that the frontend token information for this identifier has
+  /// changed since it was loaded from an AST file.
+  void setFETokenInfoChangedSinceDeserialization() {
+    FEChangedAfterLoad = true;
+  }
+
   /// \brief Determine whether the information for this identifier is out of
   /// date with respect to the external source.
   bool isOutOfDate() const { return OutOfDate; }
diff --git a/include/clang/Basic/Lambda.h b/include/clang/Basic/Lambda.h
index e676e72..1c19f1d 100644
--- a/include/clang/Basic/Lambda.h
+++ b/include/clang/Basic/Lambda.h
@@ -32,7 +32,8 @@
 /// by reference.  C++1y also allows "init-capture", where the initializer
 /// is an expression.
 enum LambdaCaptureKind {
-  LCK_This,   ///< Capturing the \c this pointer
+  LCK_This,   ///< Capturing the \c *this object by reference
+  LCK_StarThis, ///< Capturing the \c *this object by copy
   LCK_ByCopy, ///< Capturing by copy (a.k.a., by value)
   LCK_ByRef,  ///< Capturing by reference
   LCK_VLAType ///< Capturing variable-length array type
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def
index 594c8c7..4eeaf41 100644
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -24,11 +24,15 @@
 //
 // VALUE_LANGOPT: for options that describe a value rather than a flag.
 //
-// BENIGN_ENUM_LANGOPT, COMPATIBLE_ENUM_LANGOPT: combinations of the above.
+// BENIGN_ENUM_LANGOPT, COMPATIBLE_ENUM_LANGOPT,
+// BENIGN_VALUE_LANGOPT, COMPATIBLE_VALUE_LANGOPT: combinations of the above.
 //
 // FIXME: Clients should be able to more easily select whether they want
 // different levels of compatibility versus how to handle different kinds
 // of option.
+//
+// The Description field should be a noun phrase, for instance "frobbing all
+// widgets" or "C's implicit blintz feature".
 //===----------------------------------------------------------------------===//
 
 #ifndef LANGOPT
@@ -65,6 +69,16 @@
      LANGOPT(Name, Bits, Default, Description)
 #endif
 
+#ifndef COMPATIBLE_VALUE_LANGOPT
+#  define COMPATIBLE_VALUE_LANGOPT(Name, Bits, Default, Description) \
+     VALUE_LANGOPT(Name, Bits, Default, Description)
+#endif
+
+#ifndef BENIGN_VALUE_LANGOPT
+#  define BENIGN_VALUE_LANGOPT(Name, Bits, Default, Description) \
+     COMPATIBLE_VALUE_LANGOPT(Name, Bits, Default, Description)
+#endif
+
 // FIXME: A lot of the BENIGN_ options should be COMPATIBLE_ instead.
 LANGOPT(C99               , 1, 0, "C99")
 LANGOPT(C11               , 1, 0, "C11")
@@ -110,6 +124,7 @@
 LANGOPT(ObjCExceptions    , 1, 0, "Objective-C exceptions")
 LANGOPT(CXXExceptions     , 1, 0, "C++ exceptions")
 LANGOPT(SjLjExceptions    , 1, 0, "setjmp-longjump exception handling")
+LANGOPT(ExternCNoUnwind   , 1, 0, "Assume extern C functions don't unwind")
 LANGOPT(TraditionalCPP    , 1, 0, "traditional CPP emulation")
 LANGOPT(RTTI              , 1, 1, "run-time type information")
 LANGOPT(RTTIData          , 1, 1, "emit run-time type information data")
@@ -123,31 +138,34 @@
 BENIGN_LANGOPT(ThreadsafeStatics , 1, 1, "thread-safe static initializers")
 LANGOPT(POSIXThreads      , 1, 0, "POSIX thread support")
 LANGOPT(Blocks            , 1, 0, "blocks extension to C")
-BENIGN_LANGOPT(EmitAllDecls      , 1, 0, "support for emitting all declarations")
-LANGOPT(MathErrno         , 1, 1, "errno support for math functions")
-BENIGN_LANGOPT(HeinousExtensions , 1, 0, "Extensions that we really don't like and may be ripped out at any time")
+BENIGN_LANGOPT(EmitAllDecls      , 1, 0, "emitting all declarations")
+LANGOPT(MathErrno         , 1, 1, "errno in math functions")
+BENIGN_LANGOPT(HeinousExtensions , 1, 0, "extensions that we really don't like and may be ripped out at any time")
 LANGOPT(Modules           , 1, 0, "modules extension to C")
+LANGOPT(ModulesTS         , 1, 0, "C++ Modules TS")
+BENIGN_LANGOPT(CompilingModule, 1, 0, "compiling a module interface")
 COMPATIBLE_LANGOPT(ModulesDeclUse    , 1, 0, "require declaration of module uses")
-LANGOPT(ModulesSearchAll  , 1, 1, "search even non-imported modules to find unresolved references")
-COMPATIBLE_LANGOPT(ModulesStrictDeclUse, 1, 0, "require declaration of module uses and all headers to be in modules")
-BENIGN_LANGOPT(ModulesErrorRecovery, 1, 1, "automatically import modules as needed when performing error recovery")
-BENIGN_LANGOPT(ImplicitModules, 1, 1, "build modules that are not specified via -fmodule-file")
+BENIGN_LANGOPT(ModulesSearchAll  , 1, 1, "searching even non-imported modules to find unresolved references")
+COMPATIBLE_LANGOPT(ModulesStrictDeclUse, 1, 0, "requiring declaration of module uses and all headers to be in modules")
+BENIGN_LANGOPT(ModulesErrorRecovery, 1, 1, "automatically importing modules as needed when performing error recovery")
+BENIGN_LANGOPT(ImplicitModules, 1, 1, "building modules that are not specified via -fmodule-file")
 COMPATIBLE_LANGOPT(ModulesLocalVisibility, 1, 0, "local submodule visibility")
 COMPATIBLE_LANGOPT(Optimize          , 1, 0, "__OPTIMIZE__ predefined macro")
 COMPATIBLE_LANGOPT(OptimizeSize      , 1, 0, "__OPTIMIZE_SIZE__ predefined macro")
-LANGOPT(Static            , 1, 0, "__STATIC__ predefined macro (as opposed to __DYNAMIC__)")
+COMPATIBLE_LANGOPT(Static            , 1, 0, "__STATIC__ predefined macro (as opposed to __DYNAMIC__)")
 VALUE_LANGOPT(PackStruct  , 32, 0,
               "default struct packing maximum alignment")
 VALUE_LANGOPT(MaxTypeAlign  , 32, 0,
               "default maximum alignment for types")
-VALUE_LANGOPT(PICLevel    , 2, 0, "__PIC__ level")
-VALUE_LANGOPT(PIELevel    , 2, 0, "__PIE__ level")
-LANGOPT(GNUInline         , 1, 0, "GNU inline semantics")
+VALUE_LANGOPT(AlignDouble            , 1, 0, "Controls if doubles should be aligned to 8 bytes (x86 only)")
+COMPATIBLE_VALUE_LANGOPT(PICLevel    , 2, 0, "__PIC__ level")
+COMPATIBLE_VALUE_LANGOPT(PIE         , 1, 0, "is pie")
+COMPATIBLE_LANGOPT(GNUInline         , 1, 0, "GNU inline semantics")
 COMPATIBLE_LANGOPT(NoInlineDefine    , 1, 0, "__NO_INLINE__ predefined macro")
 COMPATIBLE_LANGOPT(Deprecated        , 1, 0, "__DEPRECATED predefined macro")
-LANGOPT(FastMath          , 1, 0, "__FAST_MATH__ predefined macro")
-LANGOPT(FiniteMathOnly    , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
-LANGOPT(UnsafeFPMath      , 1, 0, "Unsafe Floating Point Math")
+COMPATIBLE_LANGOPT(FastMath          , 1, 0, "fast FP math optimizations, and __FAST_MATH__ predefined macro")
+COMPATIBLE_LANGOPT(FiniteMathOnly    , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
+COMPATIBLE_LANGOPT(UnsafeFPMath      , 1, 0, "Unsafe Floating Point Math")
 
 BENIGN_LANGOPT(ObjCGCBitmapPrint , 1, 0, "printing of GC's bitmap layout for __weak/__strong ivars")
 
@@ -155,24 +173,27 @@
 LANGOPT(CharIsSigned      , 1, 1, "signed char")
 LANGOPT(ShortWChar        , 1, 0, "unsigned short wchar_t")
 ENUM_LANGOPT(MSPointerToMemberRepresentationMethod, PragmaMSPointersToMembersKind, 2, PPTMK_BestCase, "member-pointer representation method")
+ENUM_LANGOPT(DefaultCallingConv, DefaultCallingConvention, 3, DCC_None, "default calling convention")
 
 LANGOPT(ShortEnums        , 1, 0, "short enum types")
 
 LANGOPT(OpenCL            , 1, 0, "OpenCL")
 LANGOPT(OpenCLVersion     , 32, 0, "OpenCL version")
 LANGOPT(NativeHalfType    , 1, 0, "Native half type support")
+LANGOPT(NativeHalfArgsAndReturns, 1, 0, "Native half args and returns")
 LANGOPT(HalfArgsAndReturns, 1, 0, "half args and returns")
 LANGOPT(CUDA              , 1, 0, "CUDA")
-LANGOPT(OpenMP            , 1, 0, "OpenMP support")
+LANGOPT(OpenMP            , 32, 0, "OpenMP support and version of OpenMP (31, 40 or 45)")
 LANGOPT(OpenMPUseTLS      , 1, 0, "Use TLS for threadprivates or runtime calls")
 LANGOPT(OpenMPIsDevice    , 1, 0, "Generate code only for OpenMP target device")
+LANGOPT(RenderScript      , 1, 0, "RenderScript")
 
-LANGOPT(CUDAIsDevice      , 1, 0, "Compiling for CUDA device")
-LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions")
-LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)")
-LANGOPT(CUDATargetOverloads, 1, 0, "Enable function overloads based on CUDA target attributes")
+LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
+LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
+LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
+LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
+LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 
-LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
 LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
 LANGOPT(ConceptsTS , 1, 0, "enable C++ Extensions for Concepts")
 BENIGN_LANGOPT(ElideConstructors , 1, 1, "C++ copy constructor elision")
@@ -198,8 +219,7 @@
 LANGOPT(ObjCSubscriptingLegacyRuntime         , 1, 0, "Subscripting support in legacy ObjectiveC runtime")
 LANGOPT(FakeAddressSpaceMap , 1, 0, "OpenCL fake address space map")
 ENUM_LANGOPT(AddressSpaceMapMangling , AddrSpaceMapMangling, 2, ASMM_Target, "OpenCL address space map mangling mode")
-
-LANGOPT(MRTD , 1, 0, "-mrtd calling convention")
+LANGOPT(IncludeDefaultHeader, 1, 0, "Include default header file for OpenCL")
 BENIGN_LANGOPT(DelayedTemplateParsing , 1, 0, "delayed template parsing")
 LANGOPT(BlocksRuntimeOptional , 1, 0, "optional blocks runtime")
 
@@ -244,4 +264,6 @@
 #undef COMPATIBLE_ENUM_LANGOPT
 #undef BENIGN_ENUM_LANGOPT
 #undef VALUE_LANGOPT
+#undef COMPATIBLE_VALUE_LANGOPT
+#undef BENIGN_VALUE_LANGOPT
 
diff --git a/include/clang/Basic/LangOptions.h b/include/clang/Basic/LangOptions.h
index 736d4e0..6ec499f 100644
--- a/include/clang/Basic/LangOptions.h
+++ b/include/clang/Basic/LangOptions.h
@@ -65,6 +65,14 @@
     PPTMK_FullGeneralityVirtualInheritance
   };
 
+  enum DefaultCallingConvention {
+    DCC_None,
+    DCC_CDecl,
+    DCC_FastCall,
+    DCC_StdCall,
+    DCC_VectorCall
+  };
+
   enum AddrSpaceMapMangling { ASMM_Target, ASMM_On, ASMM_Off };
 
   enum MSVCMajorVersion {
@@ -92,14 +100,12 @@
   /// If none is specified, abort (GCC-compatible behaviour).
   std::string OverflowHandler;
 
-  /// \brief The name of the current module.
+  /// \brief The name of the current module, of which the main source file
+  /// is a part. If CompilingModule is set, we are compiling the interface
+  /// of this module, otherwise we are compiling an implementation file of
+  /// it.
   std::string CurrentModule;
 
-  /// \brief The name of the module that the translation unit is an
-  /// implementation of. Prevents semantic imports, but does not otherwise
-  /// treat this as the CurrentModule.
-  std::string ImplementationOfModule;
-
   /// \brief The names of any features to enable in module 'requires' decls
   /// in addition to the hard-coded list in Module.cpp and the target features.
   ///
@@ -162,18 +168,6 @@
     fp_contract(LangOpts.DefaultFPContract) {}
 };
 
-/// \brief OpenCL volatile options
-class OpenCLOptions {
-public:
-#define OPENCLEXT(nm)  unsigned nm : 1;
-#include "clang/Basic/OpenCLExtensions.def"
-
-  OpenCLOptions() {
-#define OPENCLEXT(nm)   nm = 0;
-#include "clang/Basic/OpenCLExtensions.def"
-  }
-};
-
 /// \brief Describes the kind of translation unit being processed.
 enum TranslationUnitKind {
   /// \brief The translation unit is a complete translation unit.
diff --git a/include/clang/Basic/Makefile b/include/clang/Basic/Makefile
deleted file mode 100644
index 5579a99..0000000
--- a/include/clang/Basic/Makefile
+++ /dev/null
@@ -1,70 +0,0 @@
-CLANG_LEVEL := ../../..
-BUILT_SOURCES = \
-	DiagnosticAnalysisKinds.inc DiagnosticASTKinds.inc \
-	DiagnosticCommentKinds.inc \
-	DiagnosticCommonKinds.inc DiagnosticDriverKinds.inc \
-	DiagnosticFrontendKinds.inc DiagnosticLexKinds.inc \
-	DiagnosticParseKinds.inc DiagnosticSemaKinds.inc \
-	DiagnosticSerializationKinds.inc \
-	AttrHasAttributeImpl.inc \
-	DiagnosticIndexName.inc DiagnosticGroups.inc AttrList.inc arm_neon.inc \
-	Version.inc
-
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-INPUT_TDS = $(wildcard $(PROJ_SRC_DIR)/Diagnostic*.td)
-
-# Compute the Clang version from the LLVM version, unless specified explicitly.
-ifndef CLANG_VERSION
-CLANG_VERSION := $(subst svn,,$(LLVMVersion))
-CLANG_VERSION := $(subst rc,,$(CLANG_VERSION))
-endif
-
-CLANG_VERSION_COMPONENTS := $(subst ., ,$(CLANG_VERSION))
-CLANG_VERSION_MAJOR := $(word 1,$(CLANG_VERSION_COMPONENTS))
-CLANG_VERSION_MINOR := $(word 2,$(CLANG_VERSION_COMPONENTS))
-CLANG_VERSION_PATCHLEVEL := $(word 3,$(CLANG_VERSION_COMPONENTS))
-ifeq ($(CLANG_VERSION_PATCHLEVEL),)
-CLANG_HAS_VERSION_PATCHLEVEL := 0
-else
-CLANG_HAS_VERSION_PATCHLEVEL := 1
-endif
-
-$(ObjDir)/Diagnostic%Kinds.inc.tmp : Diagnostic.td $(INPUT_TDS) $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang $(patsubst Diagnostic%Kinds.inc.tmp,%,$(@F)) diagnostic tables with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-diags-defs -clang-component=$(patsubst Diagnostic%Kinds.inc.tmp,%,$(@F)) -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/DiagnosticIndexName.inc.tmp : Diagnostic.td $(INPUT_TDS) $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang diagnostic name index with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-diags-index-name -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/DiagnosticGroups.inc.tmp : Diagnostic.td DiagnosticGroups.td $(INPUT_TDS) $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang diagnostic groups with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-diag-groups -o $(call SYSPATH, $@) $<
-
-$(ObjDir)/AttrList.inc.tmp : Attr.td $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute list with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-list -o $(call SYSPATH, $@) \
-	  -I $(PROJ_SRC_DIR)/../.. $<
-
-$(ObjDir)/AttrHasAttributeImpl.inc.tmp : Attr.td $(CLANG_TBLGEN) \
-                                  $(ObjDir)/.dir
-	$(Echo) "Building Clang __has_attribute implementation with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-has-attribute-impl -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/arm_neon.inc.tmp : arm_neon.td $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang arm_neon.inc with tblgen"
-	$(Verb) $(ClangTableGen) -gen-arm-neon-sema -o $(call SYSPATH, $@) \
-	  -I $(PROJ_SRC_DIR)/../.. $<
-
-$(ObjDir)/Version.inc.tmp : Version.inc.in Makefile $(LLVM_OBJ_ROOT)/Makefile.config $(ObjDir)/.dir
-	$(Echo) "Updating Clang version info."
-	$(Verb)sed -e "s#@CLANG_VERSION@#$(CLANG_VERSION)#g" \
-	           -e "s#@CLANG_VERSION_MAJOR@#$(CLANG_VERSION_MAJOR)#g" \
-	           -e "s#@CLANG_VERSION_MINOR@#$(CLANG_VERSION_MINOR)#g" \
-	           -e "s#@CLANG_VERSION_PATCHLEVEL@#$(CLANG_VERSION_PATCHLEVEL)#g" \
-	           -e "s#@CLANG_HAS_VERSION_PATCHLEVEL@#$(CLANG_HAS_VERSION_PATCHLEVEL)#g" \
-	           $< > $@
diff --git a/include/clang/Basic/OpenCLExtensions.def b/include/clang/Basic/OpenCLExtensions.def
index 91fd919..360fec4 100644
--- a/include/clang/Basic/OpenCLExtensions.def
+++ b/include/clang/Basic/OpenCLExtensions.def
@@ -11,25 +11,76 @@
 //
 //===----------------------------------------------------------------------===//
 
-// OpenCL 1.1.
-OPENCLEXT(cl_khr_fp64)
-OPENCLEXT(cl_khr_int64_base_atomics)
-OPENCLEXT(cl_khr_int64_extended_atomics)
-OPENCLEXT(cl_khr_fp16)
-OPENCLEXT(cl_khr_gl_sharing)
-OPENCLEXT(cl_khr_gl_event)
-OPENCLEXT(cl_khr_d3d10_sharing)
-OPENCLEXT(cl_khr_global_int32_base_atomics)
-OPENCLEXT(cl_khr_global_int32_extended_atomics)
-OPENCLEXT(cl_khr_local_int32_base_atomics)
-OPENCLEXT(cl_khr_local_int32_extended_atomics)
-OPENCLEXT(cl_khr_byte_addressable_store)
-OPENCLEXT(cl_khr_3d_image_writes)
+// Macro OPENCLEXT or OPENCLEXT_INTERNAL can be defined to enumerate the
+// OpenCL extensions listed in this file.
+//
+// If the extensions are to be enumerated without the supported OpenCL version,
+// define OPENCLEXT(ext) where ext is the name of the extension.
+//
+// If the extensions are to be enumerated with supported OpenCL version,
+// define OPENCLEXT_INTERNAL(ext, avail, core) where
+//   ext - name of the extension or optional core feature.
+//   avail - minimum OpenCL version supporting it.
+//   core - minimum OpenCL version in which the extension becomes a core
+//          feature or an optional core feature. ~0U indicates that it is
+//          neither a core feature nor an optional core feature.
 
-// OpenCL 2.0
-OPENCLEXT(cl_khr_gl_msaa_sharing)
+#ifndef OPENCLEXT_INTERNAL
+#ifndef OPENCLEXT
+#error "macro OPENCLEXT or OPENCLEXT_INTERNAL is required"
+#else // Only OPENCLEXT given: forward the name, drop avail/core.
+#define OPENCLEXT_INTERNAL(ext, ...) OPENCLEXT(ext)
+#endif // OPENCLEXT
+#endif // OPENCLEXT_INTERNAL
+
+// OpenCL 1.0.
+OPENCLEXT_INTERNAL(cl_khr_3d_image_writes, 100, 200)
+// fprounding mode is special since it is not mentioned beyond 1.0
+OPENCLEXT_INTERNAL(cl_khr_select_fprounding_mode, 100, 110)
+OPENCLEXT_INTERNAL(cl_khr_byte_addressable_store, 100, 110)
+OPENCLEXT_INTERNAL(cl_khr_fp16, 100, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_fp64, 100, 120)
+OPENCLEXT_INTERNAL(cl_khr_global_int32_base_atomics, 100, 110)
+OPENCLEXT_INTERNAL(cl_khr_global_int32_extended_atomics, 100, 110)
+OPENCLEXT_INTERNAL(cl_khr_local_int32_base_atomics, 100, 110)
+OPENCLEXT_INTERNAL(cl_khr_local_int32_extended_atomics, 100, 110)
+OPENCLEXT_INTERNAL(cl_khr_int64_base_atomics, 100, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_int64_extended_atomics, 100, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_gl_sharing, 100, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_icd, 100, ~0U)
+
+// OpenCL 1.1.
+OPENCLEXT_INTERNAL(cl_khr_gl_event, 110, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_d3d10_sharing, 110, ~0U)
+
+// OpenCL 1.2.
+OPENCLEXT_INTERNAL(cl_khr_context_abort, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_d3d11_sharing, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_depth_images, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_dx9_media_sharing, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_image2d_from_buffer, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_initialize_memory, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_gl_depth_images, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_gl_msaa_sharing, 120, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_spir, 120, ~0U)
+
+// OpenCL 2.0.
+OPENCLEXT_INTERNAL(cl_khr_egl_event, 200, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_egl_image, 200, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_mipmap_image, 200, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_srgb_image_writes, 200, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_subgroups, 200, ~0U)
+OPENCLEXT_INTERNAL(cl_khr_terminate_context, 200, ~0U)
 
 // Clang Extensions.
-OPENCLEXT(cl_clang_storage_class_specifiers)
+OPENCLEXT_INTERNAL(cl_clang_storage_class_specifiers, 100, ~0U)
 
+// AMD OpenCL extensions
+OPENCLEXT_INTERNAL(cl_amd_media_ops, 100, ~0U)
+OPENCLEXT_INTERNAL(cl_amd_media_ops2, 100, ~0U)
+
+#undef OPENCLEXT_INTERNAL
+
+#ifdef OPENCLEXT
 #undef OPENCLEXT
+#endif
diff --git a/include/clang/Basic/OpenCLImageTypes.def b/include/clang/Basic/OpenCLImageTypes.def
new file mode 100644
index 0000000..9b92992
--- /dev/null
+++ b/include/clang/Basic/OpenCLImageTypes.def
@@ -0,0 +1,82 @@
+//===-- OpenCLImageTypes.def - Metadata about BuiltinTypes ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//  This file extends builtin types database with OpenCL image singleton types.
+//  Custom code should define one of those two macros:
+//    GENERIC_IMAGE_TYPE(Type, Id) - a generic image with its Id without an 
+//      access type
+//    IMAGE_TYPE(Type, Id, SingletonId, AccessType, CGSuffix) - an image type
+//      with given ID, singleton ID, access type and a codegen suffix
+
+#ifdef GENERIC_IMAGE_TYPE
+
+#define IMAGE_READ_TYPE(Type, Id) GENERIC_IMAGE_TYPE(Type, Id)
+#define IMAGE_WRITE_TYPE(Type, Id) 
+#define IMAGE_READ_WRITE_TYPE(Type, Id) 
+
+#else
+
+#ifndef IMAGE_READ_TYPE
+#define IMAGE_READ_TYPE(Type, Id) \
+          IMAGE_TYPE(Type, Id##RO, Id##ROTy,  read_only, ro)
+#endif
+#ifndef IMAGE_WRITE_TYPE
+#define IMAGE_WRITE_TYPE(Type, Id) \
+          IMAGE_TYPE(Type, Id##WO, Id##WOTy, write_only, wo)
+#endif
+#ifndef IMAGE_READ_WRITE_TYPE
+#define IMAGE_READ_WRITE_TYPE(Type, Id) \
+          IMAGE_TYPE(Type, Id##RW, Id##RWTy, read_write, rw)
+#endif
+
+#endif
+
+IMAGE_READ_TYPE(image1d, OCLImage1d)
+IMAGE_READ_TYPE(image1d_array, OCLImage1dArray)
+IMAGE_READ_TYPE(image1d_buffer, OCLImage1dBuffer)
+IMAGE_READ_TYPE(image2d, OCLImage2d)
+IMAGE_READ_TYPE(image2d_array, OCLImage2dArray)
+IMAGE_READ_TYPE(image2d_depth, OCLImage2dDepth)
+IMAGE_READ_TYPE(image2d_array_depth, OCLImage2dArrayDepth)
+IMAGE_READ_TYPE(image2d_msaa, OCLImage2dMSAA)
+IMAGE_READ_TYPE(image2d_array_msaa, OCLImage2dArrayMSAA)
+IMAGE_READ_TYPE(image2d_msaa_depth, OCLImage2dMSAADepth)
+IMAGE_READ_TYPE(image2d_array_msaa_depth, OCLImage2dArrayMSAADepth)
+IMAGE_READ_TYPE(image3d, OCLImage3d)
+
+IMAGE_WRITE_TYPE(image1d, OCLImage1d)
+IMAGE_WRITE_TYPE(image1d_array, OCLImage1dArray)
+IMAGE_WRITE_TYPE(image1d_buffer, OCLImage1dBuffer)
+IMAGE_WRITE_TYPE(image2d, OCLImage2d)
+IMAGE_WRITE_TYPE(image2d_array, OCLImage2dArray)
+IMAGE_WRITE_TYPE(image2d_depth, OCLImage2dDepth)
+IMAGE_WRITE_TYPE(image2d_array_depth, OCLImage2dArrayDepth)
+IMAGE_WRITE_TYPE(image2d_msaa, OCLImage2dMSAA)
+IMAGE_WRITE_TYPE(image2d_array_msaa, OCLImage2dArrayMSAA)
+IMAGE_WRITE_TYPE(image2d_msaa_depth, OCLImage2dMSAADepth)
+IMAGE_WRITE_TYPE(image2d_array_msaa_depth, OCLImage2dArrayMSAADepth)
+IMAGE_WRITE_TYPE(image3d, OCLImage3d)
+
+IMAGE_READ_WRITE_TYPE(image1d, OCLImage1d)
+IMAGE_READ_WRITE_TYPE(image1d_array, OCLImage1dArray)
+IMAGE_READ_WRITE_TYPE(image1d_buffer, OCLImage1dBuffer)
+IMAGE_READ_WRITE_TYPE(image2d, OCLImage2d)
+IMAGE_READ_WRITE_TYPE(image2d_array, OCLImage2dArray)
+IMAGE_READ_WRITE_TYPE(image2d_depth, OCLImage2dDepth)
+IMAGE_READ_WRITE_TYPE(image2d_array_depth, OCLImage2dArrayDepth)
+IMAGE_READ_WRITE_TYPE(image2d_msaa, OCLImage2dMSAA)
+IMAGE_READ_WRITE_TYPE(image2d_array_msaa, OCLImage2dArrayMSAA)
+IMAGE_READ_WRITE_TYPE(image2d_msaa_depth, OCLImage2dMSAADepth)
+IMAGE_READ_WRITE_TYPE(image2d_array_msaa_depth, OCLImage2dArrayMSAADepth)
+IMAGE_READ_WRITE_TYPE(image3d, OCLImage3d)
+
+#undef IMAGE_TYPE
+#undef GENERIC_IMAGE_TYPE
+#undef IMAGE_READ_TYPE
+#undef IMAGE_WRITE_TYPE
+#undef IMAGE_READ_WRITE_TYPE
\ No newline at end of file
diff --git a/include/clang/Basic/OpenCLOptions.h b/include/clang/Basic/OpenCLOptions.h
new file mode 100644
index 0000000..4a629c9
--- /dev/null
+++ b/include/clang/Basic/OpenCLOptions.h
@@ -0,0 +1,65 @@
+//===--- OpenCLOptions.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Defines the clang::OpenCLOptions class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_OPENCLOPTIONS_H
+#define LLVM_CLANG_BASIC_OPENCLOPTIONS_H
+
+namespace clang {
+
+/// \brief OpenCL supported extensions and optional core features
+class OpenCLOptions {
+public:
+#define OPENCLEXT(nm) unsigned nm : 1;
+#include "clang/Basic/OpenCLExtensions.def"
+
+  OpenCLOptions() {
+#define OPENCLEXT(nm)   nm = 0;
+#include "clang/Basic/OpenCLExtensions.def"
+  }
+
+  // Enable all options.
+  void setAll() {
+#define OPENCLEXT(nm)   nm = 1;
+#include "clang/Basic/OpenCLExtensions.def"
+  }
+
+  // Is supported with OpenCL version \p OCLVer.
+#define OPENCLEXT_INTERNAL(Ext, Avail, ...) \
+  bool is_##Ext##_supported(unsigned OCLVer) const { \
+    return Ext && OCLVer >= Avail; \
+  }
+#include "clang/Basic/OpenCLExtensions.def"
+
+
+  // Is a supported OpenCL extension with OpenCL version \p CLVer.
+  // For a supported core feature or optional core feature, return false.
+#define OPENCLEXT_INTERNAL(Ext, Avail, Core) \
+  bool is_##Ext##_supported_extension(unsigned CLVer) const { \
+    return is_##Ext##_supported(CLVer) && (Core == ~0U || CLVer < Core); \
+  }
+#include "clang/Basic/OpenCLExtensions.def"
+
+  // Is a supported OpenCL (optional) core feature with OpenCL version
+  // \p CLVer. For a supported extension, return false.
+#define OPENCLEXT_INTERNAL(Ext, Avail, Core) \
+  bool is_##Ext##_supported_core(unsigned CLVer) const { \
+    return is_##Ext##_supported(CLVer) && Core != ~0U && CLVer >= Core; \
+  }
+#include "clang/Basic/OpenCLExtensions.def"
+
+};
+
+}  // end namespace clang
+
+#endif
diff --git a/include/clang/Basic/OpenMPKinds.def b/include/clang/Basic/OpenMPKinds.def
index a32e310..0d0cc1c 100644
--- a/include/clang/Basic/OpenMPKinds.def
+++ b/include/clang/Basic/OpenMPKinds.def
@@ -60,6 +60,21 @@
 #ifndef OPENMP_TARGET_DATA_CLAUSE
 #  define OPENMP_TARGET_DATA_CLAUSE(Name)
 #endif
+#ifndef OPENMP_TARGET_ENTER_DATA_CLAUSE
+#define OPENMP_TARGET_ENTER_DATA_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TARGET_EXIT_DATA_CLAUSE
+#define OPENMP_TARGET_EXIT_DATA_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TARGET_PARALLEL_CLAUSE
+#  define OPENMP_TARGET_PARALLEL_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TARGET_PARALLEL_FOR_CLAUSE
+#  define OPENMP_TARGET_PARALLEL_FOR_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TARGET_UPDATE_CLAUSE
+#  define OPENMP_TARGET_UPDATE_CLAUSE(Name)
+#endif
 #ifndef OPENMP_TEAMS_CLAUSE
 #  define OPENMP_TEAMS_CLAUSE(Name)
 #endif
@@ -105,6 +120,30 @@
 #ifndef OPENMP_DIST_SCHEDULE_KIND
 #define OPENMP_DIST_SCHEDULE_KIND(Name)
 #endif
+#ifndef OPENMP_DEFAULTMAP_KIND
+#define OPENMP_DEFAULTMAP_KIND(Name)
+#endif
+#ifndef OPENMP_DEFAULTMAP_MODIFIER
+#define OPENMP_DEFAULTMAP_MODIFIER(Name)
+#endif
+#ifndef OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE
+#define OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(Name)
+#endif
+#ifndef OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE
+#define OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(Name)
+#endif
+#ifndef OPENMP_DISTRIBUTE_SIMD_CLAUSE
+#define OPENMP_DISTRIBUTE_SIMD_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE
+#define OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TARGET_SIMD_CLAUSE
+#define OPENMP_TARGET_SIMD_CLAUSE(Name)
+#endif
+#ifndef OPENMP_TEAMS_DISTRIBUTE_CLAUSE
+#define OPENMP_TEAMS_DISTRIBUTE_CLAUSE(Name)
+#endif
 
 // OpenMP directives.
 OPENMP_DIRECTIVE(threadprivate)
@@ -128,14 +167,29 @@
 OPENMP_DIRECTIVE(teams)
 OPENMP_DIRECTIVE(cancel)
 OPENMP_DIRECTIVE_EXT(target_data, "target data")
+OPENMP_DIRECTIVE_EXT(target_enter_data, "target enter data")
+OPENMP_DIRECTIVE_EXT(target_exit_data, "target exit data")
+OPENMP_DIRECTIVE_EXT(target_parallel, "target parallel")
+OPENMP_DIRECTIVE_EXT(target_parallel_for, "target parallel for")
+OPENMP_DIRECTIVE_EXT(target_update, "target update")
 OPENMP_DIRECTIVE_EXT(parallel_for, "parallel for")
 OPENMP_DIRECTIVE_EXT(parallel_for_simd, "parallel for simd")
 OPENMP_DIRECTIVE_EXT(parallel_sections, "parallel sections")
 OPENMP_DIRECTIVE_EXT(for_simd, "for simd")
 OPENMP_DIRECTIVE_EXT(cancellation_point, "cancellation point")
+OPENMP_DIRECTIVE_EXT(declare_reduction, "declare reduction")
+OPENMP_DIRECTIVE_EXT(declare_simd, "declare simd")
 OPENMP_DIRECTIVE(taskloop)
 OPENMP_DIRECTIVE_EXT(taskloop_simd, "taskloop simd")
 OPENMP_DIRECTIVE(distribute)
+OPENMP_DIRECTIVE_EXT(declare_target, "declare target")
+OPENMP_DIRECTIVE_EXT(end_declare_target, "end declare target")
+OPENMP_DIRECTIVE_EXT(distribute_parallel_for, "distribute parallel for")
+OPENMP_DIRECTIVE_EXT(distribute_parallel_for_simd, "distribute parallel for simd")
+OPENMP_DIRECTIVE_EXT(distribute_simd, "distribute simd")
+OPENMP_DIRECTIVE_EXT(target_parallel_for_simd, "target parallel for simd")
+OPENMP_DIRECTIVE_EXT(target_simd, "target simd")
+OPENMP_DIRECTIVE_EXT(teams_distribute, "teams distribute")
 
 // OpenMP clauses.
 OPENMP_CLAUSE(if, OMPIfClause)
@@ -179,6 +233,11 @@
 OPENMP_CLAUSE(num_tasks, OMPNumTasksClause)
 OPENMP_CLAUSE(hint, OMPHintClause)
 OPENMP_CLAUSE(dist_schedule, OMPDistScheduleClause)
+OPENMP_CLAUSE(defaultmap, OMPDefaultmapClause)
+OPENMP_CLAUSE(to, OMPToClause)
+OPENMP_CLAUSE(from, OMPFromClause)
+OPENMP_CLAUSE(use_device_ptr, OMPUseDevicePtrClause)
+OPENMP_CLAUSE(is_device_ptr, OMPIsDevicePtrClause)
 
 // Clauses allowed for OpenMP directive 'parallel'.
 OPENMP_PARALLEL_CLAUSE(if)
@@ -263,6 +322,12 @@
 OPENMP_SCHEDULE_MODIFIER(nonmonotonic)
 OPENMP_SCHEDULE_MODIFIER(simd)
 
+// Static attributes for 'defaultmap' clause.
+OPENMP_DEFAULTMAP_KIND(scalar)
+
+// Modifiers for 'defaultmap' clause.
+OPENMP_DEFAULTMAP_MODIFIER(tofrom)
+
 // Static attributes for 'depend' clause.
 OPENMP_DEPEND_KIND(in)
 OPENMP_DEPEND_KIND(out)
@@ -342,16 +407,82 @@
 OPENMP_ATOMIC_CLAUSE(seq_cst)
 
 // Clauses allowed for OpenMP directive 'target'.
-// TODO More clauses for 'target' directive.
 OPENMP_TARGET_CLAUSE(if)
 OPENMP_TARGET_CLAUSE(device)
 OPENMP_TARGET_CLAUSE(map)
+OPENMP_TARGET_CLAUSE(private)
+OPENMP_TARGET_CLAUSE(nowait)
+OPENMP_TARGET_CLAUSE(depend)
+OPENMP_TARGET_CLAUSE(defaultmap)
+OPENMP_TARGET_CLAUSE(firstprivate)
+OPENMP_TARGET_CLAUSE(is_device_ptr)
 
 // Clauses allowed for OpenMP directive 'target data'.
 // TODO More clauses for 'target data' directive.
 OPENMP_TARGET_DATA_CLAUSE(if)
 OPENMP_TARGET_DATA_CLAUSE(device)
 OPENMP_TARGET_DATA_CLAUSE(map)
+OPENMP_TARGET_DATA_CLAUSE(use_device_ptr)
+
+// Clauses allowed for OpenMP directive 'target enter data'.
+OPENMP_TARGET_ENTER_DATA_CLAUSE(if)
+OPENMP_TARGET_ENTER_DATA_CLAUSE(device)
+OPENMP_TARGET_ENTER_DATA_CLAUSE(map)
+OPENMP_TARGET_ENTER_DATA_CLAUSE(nowait)
+OPENMP_TARGET_ENTER_DATA_CLAUSE(depend)
+
+// Clauses allowed for OpenMP directive 'target exit data'.
+OPENMP_TARGET_EXIT_DATA_CLAUSE(if)
+OPENMP_TARGET_EXIT_DATA_CLAUSE(device)
+OPENMP_TARGET_EXIT_DATA_CLAUSE(map)
+OPENMP_TARGET_EXIT_DATA_CLAUSE(nowait)
+OPENMP_TARGET_EXIT_DATA_CLAUSE(depend)
+
+// Clauses allowed for OpenMP directive 'target parallel'.
+// TODO: add target clauses 'is_device_ptr'
+OPENMP_TARGET_PARALLEL_CLAUSE(if)
+OPENMP_TARGET_PARALLEL_CLAUSE(device)
+OPENMP_TARGET_PARALLEL_CLAUSE(map)
+OPENMP_TARGET_PARALLEL_CLAUSE(private)
+OPENMP_TARGET_PARALLEL_CLAUSE(firstprivate)
+OPENMP_TARGET_PARALLEL_CLAUSE(nowait)
+OPENMP_TARGET_PARALLEL_CLAUSE(depend)
+OPENMP_TARGET_PARALLEL_CLAUSE(defaultmap)
+OPENMP_TARGET_PARALLEL_CLAUSE(num_threads)
+OPENMP_TARGET_PARALLEL_CLAUSE(default)
+OPENMP_TARGET_PARALLEL_CLAUSE(proc_bind)
+OPENMP_TARGET_PARALLEL_CLAUSE(shared)
+OPENMP_TARGET_PARALLEL_CLAUSE(reduction)
+
+// Clauses allowed for OpenMP directive 'target parallel for'.
+// TODO: add target clauses 'is_device_ptr'
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(if)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(device)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(map)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(private)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(firstprivate)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(lastprivate)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(nowait)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(depend)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(defaultmap)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(num_threads)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(default)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(proc_bind)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(shared)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(reduction)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(collapse)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(schedule)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(ordered)
+OPENMP_TARGET_PARALLEL_FOR_CLAUSE(linear)
+
+// Clauses allowed for OpenMP directive 'target update'.
+// TODO More clauses for 'target update' directive.
+OPENMP_TARGET_UPDATE_CLAUSE(if)
+OPENMP_TARGET_UPDATE_CLAUSE(device)
+OPENMP_TARGET_UPDATE_CLAUSE(to)
+OPENMP_TARGET_UPDATE_CLAUSE(from)
+OPENMP_TARGET_UPDATE_CLAUSE(nowait)
+OPENMP_TARGET_UPDATE_CLAUSE(depend)
 
 // Clauses allowed for OpenMP directive 'teams'.
 // TODO More clauses for 'teams' directive.
@@ -427,6 +558,106 @@
 // Static attributes for 'dist_schedule' clause.
 OPENMP_DIST_SCHEDULE_KIND(static)
 
+// Clauses allowed for OpenMP directive 'distribute parallel for'
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(firstprivate)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(lastprivate)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(collapse)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(dist_schedule)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(if)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(num_threads)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(default)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(proc_bind)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(private)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(shared)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(reduction)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(copyin)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(schedule)
+
+// Clauses allowed for OpenMP directive 'distribute parallel for simd'
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(firstprivate)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(lastprivate)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(collapse)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(dist_schedule)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(if)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(num_threads)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(default)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(proc_bind)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(private)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(shared)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(reduction)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(copyin)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(schedule)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(linear)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(aligned)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(safelen)
+OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(simdlen)
+
+// Clauses allowed for OpenMP directive 'distribute simd'
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(private)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(firstprivate)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(lastprivate)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(collapse)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(dist_schedule)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(linear)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(aligned)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(safelen)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(simdlen)
+OPENMP_DISTRIBUTE_SIMD_CLAUSE(reduction)
+
+// Clauses allowed for OpenMP directive 'target parallel for simd'.
+// TODO: add target clauses 'is_device_ptr'
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(if)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(device)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(map)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(private)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(firstprivate)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(lastprivate)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(nowait)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(depend)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(defaultmap)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(num_threads)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(default)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(proc_bind)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(shared)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(reduction)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(collapse)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(schedule)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(ordered)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(linear)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(safelen)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(simdlen)
+OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(aligned)
+
+// Clauses allowed for OpenMP directive 'target simd'.
+OPENMP_TARGET_SIMD_CLAUSE(if)
+OPENMP_TARGET_SIMD_CLAUSE(device)
+OPENMP_TARGET_SIMD_CLAUSE(map)
+OPENMP_TARGET_SIMD_CLAUSE(private)
+OPENMP_TARGET_SIMD_CLAUSE(nowait)
+OPENMP_TARGET_SIMD_CLAUSE(depend)
+OPENMP_TARGET_SIMD_CLAUSE(defaultmap)
+OPENMP_TARGET_SIMD_CLAUSE(firstprivate)
+OPENMP_TARGET_SIMD_CLAUSE(is_device_ptr)
+OPENMP_TARGET_SIMD_CLAUSE(lastprivate)
+OPENMP_TARGET_SIMD_CLAUSE(linear)
+OPENMP_TARGET_SIMD_CLAUSE(aligned)
+OPENMP_TARGET_SIMD_CLAUSE(safelen)
+OPENMP_TARGET_SIMD_CLAUSE(simdlen)
+OPENMP_TARGET_SIMD_CLAUSE(collapse)
+OPENMP_TARGET_SIMD_CLAUSE(reduction)
+
+// Clauses allowed for OpenMP directive 'teams distribute'.
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(default)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(private)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(firstprivate)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(shared)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(reduction)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(num_teams)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(thread_limit)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(lastprivate)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(collapse)
+OPENMP_TEAMS_DISTRIBUTE_CLAUSE(dist_schedule)
+
 #undef OPENMP_TASKLOOP_SIMD_CLAUSE
 #undef OPENMP_TASKLOOP_CLAUSE
 #undef OPENMP_LINEAR_KIND
@@ -451,6 +682,10 @@
 #undef OPENMP_ATOMIC_CLAUSE
 #undef OPENMP_TARGET_CLAUSE
 #undef OPENMP_TARGET_DATA_CLAUSE
+#undef OPENMP_TARGET_ENTER_DATA_CLAUSE
+#undef OPENMP_TARGET_EXIT_DATA_CLAUSE
+#undef OPENMP_TARGET_PARALLEL_CLAUSE
+#undef OPENMP_TARGET_PARALLEL_FOR_CLAUSE
 #undef OPENMP_TEAMS_CLAUSE
 #undef OPENMP_SIMD_CLAUSE
 #undef OPENMP_FOR_CLAUSE
@@ -458,3 +693,12 @@
 #undef OPENMP_MAP_KIND
 #undef OPENMP_DISTRIBUTE_CLAUSE
 #undef OPENMP_DIST_SCHEDULE_KIND
+#undef OPENMP_DEFAULTMAP_KIND
+#undef OPENMP_DEFAULTMAP_MODIFIER
+#undef OPENMP_TARGET_UPDATE_CLAUSE
+#undef OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE
+#undef OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE
+#undef OPENMP_DISTRIBUTE_SIMD_CLAUSE
+#undef OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE
+#undef OPENMP_TARGET_SIMD_CLAUSE
+#undef OPENMP_TEAMS_DISTRIBUTE_CLAUSE
diff --git a/include/clang/Basic/OpenMPKinds.h b/include/clang/Basic/OpenMPKinds.h
index e7e87e7..105a156 100644
--- a/include/clang/Basic/OpenMPKinds.h
+++ b/include/clang/Basic/OpenMPKinds.h
@@ -35,6 +35,7 @@
   OMPC_##Name,
 #include "clang/Basic/OpenMPKinds.def"
   OMPC_threadprivate,
+  OMPC_uniform,
   OMPC_unknown
 };
 
@@ -102,6 +103,30 @@
   OMPC_DIST_SCHEDULE_unknown
 };
 
+/// \brief OpenMP attributes for 'defaultmap' clause.
+enum OpenMPDefaultmapClauseKind {
+#define OPENMP_DEFAULTMAP_KIND(Name) \
+  OMPC_DEFAULTMAP_##Name,
+#include "clang/Basic/OpenMPKinds.def"
+  OMPC_DEFAULTMAP_unknown
+};
+
+/// \brief OpenMP modifiers for 'defaultmap' clause.
+enum OpenMPDefaultmapClauseModifier {
+  OMPC_DEFAULTMAP_MODIFIER_unknown = OMPC_DEFAULTMAP_unknown,
+#define OPENMP_DEFAULTMAP_MODIFIER(Name) \
+  OMPC_DEFAULTMAP_MODIFIER_##Name,
+#include "clang/Basic/OpenMPKinds.def"
+  OMPC_DEFAULTMAP_MODIFIER_last
+};
+
+/// Scheduling data for loop-based OpenMP directives.
+struct OpenMPScheduleTy final {
+  OpenMPScheduleClauseKind Schedule = OMPC_SCHEDULE_unknown;
+  OpenMPScheduleClauseModifier M1 = OMPC_SCHEDULE_MODIFIER_unknown;
+  OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown;
+};
+
 OpenMPDirectiveKind getOpenMPDirectiveKind(llvm::StringRef Str);
 const char *getOpenMPDirectiveName(OpenMPDirectiveKind Kind);
 
@@ -139,11 +164,20 @@
 /// parallel', otherwise - false.
 bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind);
 
-/// \brief Checks if the specified directive is a target-kind directive.
+/// \brief Checks if the specified directive is a target code offload directive.
 /// \param DKind Specified directive.
-/// \return true - the directive is a target-like directive like 'omp target',
+/// \return true - the directive is a target code offload directive like
+/// 'omp target', 'omp target parallel', 'omp target xxx',
 /// otherwise - false.
-bool isOpenMPTargetDirective(OpenMPDirectiveKind DKind);
+bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind);
+
+/// \brief Checks if the specified directive is a target data offload directive.
+/// \param DKind Specified directive.
+/// \return true - the directive is a target data offload directive like
+/// 'omp target data', 'omp target update', 'omp target enter data',
+/// 'omp target exit data',
+/// otherwise - false.
+bool isOpenMPTargetDataManagementDirective(OpenMPDirectiveKind DKind);
 
 /// \brief Checks if the specified directive is a teams-kind directive.
 /// \param DKind Specified directive.
@@ -164,6 +198,14 @@
 /// otherwise - false.
 bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind);
 
+/// Checks if the specified composite/combined directive constitutes a
+/// distribute directive in the outermost nest.  For example,
+/// 'omp distribute parallel for' or 'omp distribute'.
+/// \param DKind Specified directive.
+/// \return true - the directive has distribute on the outermost nest,
+/// otherwise - false.
+bool isOpenMPNestingDistributeDirective(OpenMPDirectiveKind DKind);
+
 /// \brief Checks if the specified clause is one of private clauses like
 /// 'private', 'firstprivate', 'reduction' etc..
 /// \param Kind Clause kind.
@@ -176,6 +218,14 @@
 /// \return true - the clause is a threadprivate clause, otherwise - false.
 bool isOpenMPThreadPrivate(OpenMPClauseKind Kind);
 
+/// Checks if the specified directive kind is one of tasking directives - task,
+/// taskloop or taskloop simd.
+bool isOpenMPTaskingDirective(OpenMPDirectiveKind Kind);
+
+/// Checks if the specified directive kind is one of the composite or combined
+/// directives that need loop bound sharing across loops outlined in nested
+/// functions.
+bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind);
 }
 
 #endif
diff --git a/include/clang/Basic/PlistSupport.h b/include/clang/Basic/PlistSupport.h
index 84dd291..61de824 100644
--- a/include/clang/Basic/PlistSupport.h
+++ b/include/clang/Basic/PlistSupport.h
@@ -10,7 +10,6 @@
 #ifndef LLVM_CLANG_BASIC_PLISTSUPPORT_H
 #define LLVM_CLANG_BASIC_PLISTSUPPORT_H
 
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "llvm/Support/raw_ostream.h"
 
diff --git a/include/clang/Basic/PragmaKinds.h b/include/clang/Basic/PragmaKinds.h
new file mode 100644
index 0000000..b373a9e
--- /dev/null
+++ b/include/clang/Basic/PragmaKinds.h
@@ -0,0 +1,31 @@
+//===--- PragmaKinds.h - #pragma comment() kinds  ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_PRAGMA_KINDS_H
+#define LLVM_CLANG_BASIC_PRAGMA_KINDS_H
+
+namespace clang {
+
+enum PragmaMSCommentKind {
+  PCK_Unknown,
+  PCK_Linker,   // #pragma comment(linker, ...)
+  PCK_Lib,      // #pragma comment(lib, ...)
+  PCK_Compiler, // #pragma comment(compiler, ...)
+  PCK_ExeStr,   // #pragma comment(exestr, ...)
+  PCK_User      // #pragma comment(user, ...)
+};
+
+enum PragmaMSStructKind {
+  PMSST_OFF, // #pragma ms_struct off
+  PMSST_ON   // #pragma ms_struct on
+};
+
+}
+
+#endif
diff --git a/include/clang/Basic/Sanitizers.def b/include/clang/Basic/Sanitizers.def
index 4b68593..c81273e 100644
--- a/include/clang/Basic/Sanitizers.def
+++ b/include/clang/Basic/Sanitizers.def
@@ -114,6 +114,13 @@
 SANITIZER("local-bounds", LocalBounds)
 SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds)
 
+// EfficiencySanitizer
+SANITIZER("efficiency-cache-frag", EfficiencyCacheFrag)
+SANITIZER("efficiency-working-set", EfficiencyWorkingSet)
+// Meta-group only used internally.
+SANITIZER_GROUP("efficiency-all", Efficiency,
+                EfficiencyCacheFrag | EfficiencyWorkingSet)
+
 // Magic group, containing all sanitizers. For example, "-fno-sanitize=all"
 // can be used to disable all the sanitizers.
 SANITIZER_GROUP("all", All, ~0ULL)
diff --git a/include/clang/Basic/SourceLocation.h b/include/clang/Basic/SourceLocation.h
index 0aeba5e..006cf3d 100644
--- a/include/clang/Basic/SourceLocation.h
+++ b/include/clang/Basic/SourceLocation.h
@@ -373,22 +373,22 @@
   /// \brief Return the presumed filename of this location.
   ///
   /// This can be affected by \#line etc.
-  const char *getFilename() const { return Filename; }
+  const char *getFilename() const { assert(isValid()); return Filename; }
 
   /// \brief Return the presumed line number of this location.
   ///
   /// This can be affected by \#line etc.
-  unsigned getLine() const { return Line; }
+  unsigned getLine() const { assert(isValid()); return Line; }
 
   /// \brief Return the presumed column number of this location.
   ///
   /// This cannot be affected by \#line, but is packaged here for convenience.
-  unsigned getColumn() const { return Col; }
+  unsigned getColumn() const { assert(isValid()); return Col; }
 
   /// \brief Return the presumed include location of this location.
   ///
   /// This can be affected by GNU linemarker directives.
-  SourceLocation getIncludeLoc() const { return IncludeLoc; }
+  SourceLocation getIncludeLoc() const { assert(isValid()); return IncludeLoc; }
 };
 
 
diff --git a/include/clang/Basic/SourceManager.h b/include/clang/Basic/SourceManager.h
index 99392a0..6610c56 100644
--- a/include/clang/Basic/SourceManager.h
+++ b/include/clang/Basic/SourceManager.h
@@ -44,7 +44,6 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/PointerUnion.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DataTypes.h"
@@ -61,7 +60,6 @@
 class FileManager;
 class FileEntry;
 class LineTableInfo;
-class LangOptions;
 class ASTWriter;
 class ASTReader;
 
@@ -798,6 +796,15 @@
                         IncludeLoc, FileCharacter, LoadedID, LoadedOffset);
   }
 
+  /// \brief Get the FileID for \p SourceFile if it exists. Otherwise, create a
+  /// new FileID for the \p SourceFile.
+  FileID getOrCreateFileID(const FileEntry *SourceFile,
+                           SrcMgr::CharacteristicKind FileCharacter) {
+    FileID ID = translateFile(SourceFile);
+    return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(),
+                                            FileCharacter);
+  }
+
   /// \brief Return a new SourceLocation that encodes the
   /// fact that a token from SpellingLoc should actually be referenced from
   /// ExpansionLoc, and that it represents the expansion of a macro argument
@@ -1350,7 +1357,7 @@
   }
 
   /// \brief Returns whether \p Loc is expanded from a macro in a system header.
-  bool isInSystemMacro(SourceLocation loc) {
+  bool isInSystemMacro(SourceLocation loc) const {
     return loc.isMacroID() && isInSystemHeader(getSpellingLoc(loc));
   }
 
diff --git a/include/clang/Basic/SourceMgrAdapter.h b/include/clang/Basic/SourceMgrAdapter.h
index 6782aeb..dd7b83f 100644
--- a/include/clang/Basic/SourceMgrAdapter.h
+++ b/include/clang/Basic/SourceMgrAdapter.h
@@ -70,7 +70,9 @@
   void handleDiag(const llvm::SMDiagnostic &diag);
 
   /// Retrieve the diagnostic handler to use with the underlying SourceMgr.
-  llvm::SourceMgr::DiagHandlerTy getDiagHandler() { return &handleDiag; }
+  llvm::SourceMgr::DiagHandlerTy getDiagHandler() {
+    return &SourceMgrAdapter::handleDiag;
+  }
 
   /// Retrieve the context to use with the diagnostic handler produced by
   /// \c getDiagHandler().
diff --git a/include/clang/Basic/Specifiers.h b/include/clang/Basic/Specifiers.h
index 8857e0d..fffd4b1 100644
--- a/include/clang/Basic/Specifiers.h
+++ b/include/clang/Basic/Specifiers.h
@@ -54,6 +54,7 @@
     TST_half,         // OpenCL half, ARM NEON __fp16
     TST_float,
     TST_double,
+    TST_float128,
     TST_bool,         // _Bool
     TST_decimal32,    // _Decimal32
     TST_decimal64,    // _Decimal64
@@ -73,16 +74,18 @@
     TST_auto_type,        // __auto_type extension
     TST_unknown_anytype,  // __unknown_anytype extension
     TST_atomic,           // C11 _Atomic
-    TST_error         // erroneous type
+#define GENERIC_IMAGE_TYPE(ImgType, Id) TST_##ImgType##_t, // OpenCL image types
+#include "clang/Basic/OpenCLImageTypes.def"
+    TST_error // erroneous type
   };
-  
+
   /// \brief Structure that packs information about the type specifiers that
   /// were written in a particular type specifier sequence.
   struct WrittenBuiltinSpecs {
     /*DeclSpec::TST*/ unsigned Type  : 5;
     /*DeclSpec::TSS*/ unsigned Sign  : 2;
     /*DeclSpec::TSW*/ unsigned Width : 2;
-    bool ModeAttr : 1;
+    unsigned ModeAttr : 1;
   };  
 
   /// \brief A C++ access specifier (public, private, protected), plus the
@@ -238,7 +241,7 @@
     CC_AAPCS_VFP,   // __attribute__((pcs("aapcs-vfp")))
     CC_IntelOclBicc, // __attribute__((intel_ocl_bicc))
     CC_SpirFunction, // default for OpenCL functions on SPIR target
-    CC_SpirKernel,   // inferred for OpenCL kernels on SPIR target
+    CC_OpenCLKernel, // inferred for OpenCL kernels
     CC_Swift,        // __attribute__((swiftcall))
     CC_PreserveMost, // __attribute__((preserve_most))
     CC_PreserveAll,  // __attribute__((preserve_all))
@@ -254,7 +257,7 @@
     case CC_X86Pascal:
     case CC_X86VectorCall:
     case CC_SpirFunction:
-    case CC_SpirKernel:
+    case CC_OpenCLKernel:
     case CC_Swift:
       return false;
     default:
diff --git a/include/clang/Basic/StmtNodes.td b/include/clang/Basic/StmtNodes.td
index 36519ea..554f19f 100644
--- a/include/clang/Basic/StmtNodes.td
+++ b/include/clang/Basic/StmtNodes.td
@@ -126,6 +126,7 @@
 def ExpressionTraitExpr : DStmt<Expr>;
 def DependentScopeDeclRefExpr : DStmt<Expr>;
 def CXXConstructExpr : DStmt<Expr>;
+def CXXInheritedCtorInitExpr : DStmt<Expr>;
 def CXXBindTemporaryExpr : DStmt<Expr>;
 def ExprWithCleanups : DStmt<Expr>;
 def CXXTemporaryObjectExpr : DStmt<CXXConstructExpr>;
@@ -164,6 +165,7 @@
 def ObjCIndirectCopyRestoreExpr : DStmt<Expr>;
 def ObjCBoolLiteralExpr : DStmt<Expr>;
 def ObjCSubscriptRefExpr : DStmt<Expr>;
+def ObjCAvailabilityCheckExpr : DStmt<Expr>;
 
 // Obj-C ARC Expressions.
 def ObjCBridgedCastExpr : DStmt<ExplicitCastExpr>;
@@ -216,9 +218,20 @@
 def OMPAtomicDirective : DStmt<OMPExecutableDirective>;
 def OMPTargetDirective : DStmt<OMPExecutableDirective>;
 def OMPTargetDataDirective : DStmt<OMPExecutableDirective>;
+def OMPTargetEnterDataDirective : DStmt<OMPExecutableDirective>;
+def OMPTargetExitDataDirective : DStmt<OMPExecutableDirective>;
+def OMPTargetParallelDirective : DStmt<OMPExecutableDirective>;
+def OMPTargetParallelForDirective : DStmt<OMPExecutableDirective>;
+def OMPTargetUpdateDirective : DStmt<OMPExecutableDirective>;
 def OMPTeamsDirective : DStmt<OMPExecutableDirective>;
 def OMPCancellationPointDirective : DStmt<OMPExecutableDirective>;
 def OMPCancelDirective : DStmt<OMPExecutableDirective>;
 def OMPTaskLoopDirective : DStmt<OMPLoopDirective>;
 def OMPTaskLoopSimdDirective : DStmt<OMPLoopDirective>;
 def OMPDistributeDirective : DStmt<OMPLoopDirective>;
+def OMPDistributeParallelForDirective : DStmt<OMPLoopDirective>;
+def OMPDistributeParallelForSimdDirective : DStmt<OMPLoopDirective>;
+def OMPDistributeSimdDirective : DStmt<OMPLoopDirective>;
+def OMPTargetParallelForSimdDirective : DStmt<OMPLoopDirective>;
+def OMPTargetSimdDirective : DStmt<OMPLoopDirective>;
+def OMPTeamsDistributeDirective : DStmt<OMPLoopDirective>;
diff --git a/include/clang/Basic/TargetCXXABI.h b/include/clang/Basic/TargetCXXABI.h
index f7d4b92..7fb1f82 100644
--- a/include/clang/Basic/TargetCXXABI.h
+++ b/include/clang/Basic/TargetCXXABI.h
@@ -16,7 +16,6 @@
 #ifndef LLVM_CLANG_BASIC_TARGETCXXABI_H
 #define LLVM_CLANG_BASIC_TARGETCXXABI_H
 
-#include "llvm/ADT/Triple.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace clang {
diff --git a/include/clang/Basic/TargetInfo.h b/include/clang/Basic/TargetInfo.h
index 7f1bf34..a9378f8 100644
--- a/include/clang/Basic/TargetInfo.h
+++ b/include/clang/Basic/TargetInfo.h
@@ -21,13 +21,13 @@
 #include "clang/Basic/TargetCXXABI.h"
 #include "clang/Basic/TargetOptions.h"
 #include "clang/Basic/VersionTuple.h"
-#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/Support/DataTypes.h"
 #include <cassert>
 #include <string>
@@ -40,6 +40,7 @@
 namespace clang {
 class DiagnosticsEngine;
 class LangOptions;
+class CodeGenOptions;
 class MacroBuilder;
 class SourceLocation;
 class SourceManager;
@@ -57,13 +58,14 @@
   bool BigEndian;
   bool TLSSupported;
   bool NoAsmVariants;  // True if {|} are normal characters.
+  bool HasFloat128;
   unsigned char PointerWidth, PointerAlign;
   unsigned char BoolWidth, BoolAlign;
   unsigned char IntWidth, IntAlign;
   unsigned char HalfWidth, HalfAlign;
   unsigned char FloatWidth, FloatAlign;
   unsigned char DoubleWidth, DoubleAlign;
-  unsigned char LongDoubleWidth, LongDoubleAlign;
+  unsigned char LongDoubleWidth, LongDoubleAlign, Float128Align;
   unsigned char LargeArrayMinWidth, LargeArrayAlign;
   unsigned char LongWidth, LongAlign;
   unsigned char LongLongWidth, LongLongAlign;
@@ -74,11 +76,10 @@
   unsigned short MaxVectorAlign;
   unsigned short MaxTLSAlign;
   unsigned short SimdDefaultAlign;
-  const char *DataLayoutString;
-  const char *UserLabelPrefix;
+  std::unique_ptr<llvm::DataLayout> DataLayout;
   const char *MCountName;
   const llvm::fltSemantics *HalfFormat, *FloatFormat, *DoubleFormat,
-    *LongDoubleFormat;
+    *LongDoubleFormat, *Float128Format;
   unsigned char RegParmMax, SSERegParmMax;
   TargetCXXABI TheCXXABI;
   const LangAS::Map *AddrSpaceMap;
@@ -92,9 +93,15 @@
 
   unsigned HasBuiltinMSVaList : 1;
 
+  unsigned IsRenderScriptTarget : 1;
+
   // TargetInfo Constructor.  Default initializes all fields.
   TargetInfo(const llvm::Triple &T);
 
+  void resetDataLayout(StringRef DL) {
+    DataLayout.reset(new llvm::DataLayout(DL));
+  }
+
 public:
   /// \brief Construct a target for the given options.
   ///
@@ -132,7 +139,8 @@
     NoFloat = 255,
     Float = 0,
     Double,
-    LongDouble
+    LongDouble,
+    Float128
   };
 
   /// \brief The different kinds of __builtin_va_list types defined by
@@ -202,6 +210,9 @@
   /// zero-length bitfield.
   unsigned UseZeroLengthBitfieldAlignment : 1;
 
+  /// \brief Whether explicit bit field alignment attributes are honored.
+  unsigned UseExplicitBitFieldAlignment : 1;
+
   /// If non-zero, specifies a fixed alignment value for bitfields that follow
   /// zero length bitfield, regardless of the zero length bitfield type.
   unsigned ZeroLengthBitfieldBoundary;
@@ -320,6 +331,9 @@
     return getPointerWidth(0) >= 64;
   } // FIXME
 
+  /// \brief Determine whether the __float128 type is supported on this target.
+  virtual bool hasFloat128Type() const { return HasFloat128; }
+
   /// \brief Return the alignment that is suitable for storing any
   /// object with a fundamental alignment requirement.
   unsigned getSuitableAlign() const { return SuitableAlign; }
@@ -372,6 +386,14 @@
     return *LongDoubleFormat;
   }
 
+  /// getFloat128Width/Align/Format - Return the size/align/format of
+  /// '__float128'.
+  unsigned getFloat128Width() const { return 128; }
+  unsigned getFloat128Align() const { return Float128Align; }
+  const llvm::fltSemantics &getFloat128Format() const {
+    return *Float128Format;
+  }
+
   /// \brief Return true if the 'long double' type should be mangled like
   /// __float128.
   virtual bool useFloat128ManglingForLongDouble() const { return false; }
@@ -428,24 +450,16 @@
   }
 
   // Return the size of unwind_word for this target.
-  unsigned getUnwindWordWidth() const { return getPointerWidth(0); }
+  virtual unsigned getUnwindWordWidth() const { return getPointerWidth(0); }
 
   /// \brief Return the "preferred" register width on this target.
-  unsigned getRegisterWidth() const {
+  virtual unsigned getRegisterWidth() const {
     // Currently we assume the register width on the target matches the pointer
     // width, we can introduce a new variable for this if/when some target wants
     // it.
     return PointerWidth;
   }
 
-  /// \brief Returns the default value of the __USER_LABEL_PREFIX__ macro,
-  /// which is the prefix given to user symbols by default.
-  ///
-  /// On most platforms this is "_", but it is "" on some, and "." on others.
-  const char *getUserLabelPrefix() const {
-    return UserLabelPrefix;
-  }
-
   /// \brief Returns the name of the mcount instrumentation function.
   const char *getMCountName() const {
     return MCountName;
@@ -481,6 +495,12 @@
     return ZeroLengthBitfieldBoundary;
   }
 
+  /// \brief Check whether explicit bitfield alignment attributes should be
+  /// honored, as in "__attribute__((aligned(2))) int b : 1;".
+  bool useExplicitBitFieldAlignment() const {
+    return UseExplicitBitFieldAlignment;
+  }
+
   /// \brief Check whether this target support '\#pragma options align=mac68k'.
   bool hasAlignMac68kSupport() const {
     return HasAlignMac68kSupport;
@@ -548,6 +568,9 @@
   /// available on this target.
   bool hasBuiltinMSVaList() const { return HasBuiltinMSVaList; }
 
+  /// Returns true for RenderScript.
+  bool isRenderScriptTarget() const { return IsRenderScriptTarget; }
+
   /// \brief Returns whether the passed in string is a valid clobber in an
   /// inline asm statement.
   ///
@@ -727,9 +750,9 @@
     return Triple;
   }
 
-  const char *getDataLayoutString() const {
-    assert(DataLayoutString && "Uninitialized DataLayoutString!");
-    return DataLayoutString;
+  const llvm::DataLayout &getDataLayout() const {
+    assert(DataLayout && "Uninitialized DataLayout!");
+    return *DataLayout;
   }
 
   struct GCCRegAlias {
@@ -775,6 +798,10 @@
   /// language options which change the target configuration.
   virtual void adjust(const LangOptions &Opts);
 
+  /// \brief Adjust target options based on codegen options.
+  virtual void adjustTargetOptions(const CodeGenOptions &CGOpts,
+                                   TargetOptions &TargetOpts) const {}
+
   /// \brief Initialize the map with the default set of target features for the
   /// CPU this should include all legal feature strings on the target.
   ///
@@ -883,6 +910,8 @@
 
   /// \brief Return the register number that __builtin_eh_return_regno would
   /// return with the specified argument.
+  /// This corresponds with TargetLowering's getExceptionPointerRegister
+  /// and getExceptionSelectorRegister in the backend.
   virtual int getEHDataRegisterNumber(unsigned RegNo) const {
     return -1;
   }
@@ -946,6 +975,32 @@
     return false;
   }
 
+  /// \brief Whether target allows to overalign ABI-specified preferred alignment.
+  virtual bool allowsLargerPreferedTypeAlignment() const { return true; }
+
+  /// \brief Set supported OpenCL extensions and optional core features.
+  virtual void setSupportedOpenCLOpts() {}
+
+  /// \brief Get supported OpenCL extensions and optional core features.
+  OpenCLOptions &getSupportedOpenCLOpts() {
+    return getTargetOpts().SupportedOpenCLOptions;
+  }
+
+  /// \brief Get const supported OpenCL extensions and optional core features.
+  const OpenCLOptions &getSupportedOpenCLOpts() const {
+      return getTargetOpts().SupportedOpenCLOptions;
+  }
+
+  /// \brief Get OpenCL image type address space.
+  virtual LangAS::ID getOpenCLImageAddrSpace() const {
+    return LangAS::opencl_global;
+  }
+
+  /// \brief Check the target is valid after it is fully initialized.
+  virtual bool validateTarget(DiagnosticsEngine &Diags) const {
+    return true;
+  }
+
 protected:
   virtual uint64_t getPointerWidthV(unsigned AddrSpace) const {
     return PointerWidth;
diff --git a/include/clang/Basic/TargetOptions.h b/include/clang/Basic/TargetOptions.h
index ca0cca7..fde294c 100644
--- a/include/clang/Basic/TargetOptions.h
+++ b/include/clang/Basic/TargetOptions.h
@@ -17,6 +17,7 @@
 
 #include <string>
 #include <vector>
+#include "clang/Basic/OpenCLOptions.h"
 
 namespace clang {
 
@@ -27,6 +28,10 @@
   /// target will be selected to match the host.
   std::string Triple;
 
+  /// When compiling for the device side, contains the triple used to compile
+  /// for the host.
+  std::string HostTriple;
+
   /// If given, the name of the target CPU to generate code for.
   std::string CPU;
 
@@ -36,6 +41,9 @@
   /// If given, the name of the target ABI to use.
   std::string ABI;
 
+  /// The EABI version to use.
+  std::string EABIVersion;
+
   /// If given, the version string of the linker in use.
   std::string LinkerVersion;
 
@@ -45,8 +53,11 @@
   /// The list of target specific features to enable or disable -- this should
   /// be a list of strings starting with by '+' or '-'.
   std::vector<std::string> Features;
-  
+
   std::vector<std::string> Reciprocals;
+
+  /// Supported OpenCL extensions and optional core features.
+  OpenCLOptions SupportedOpenCLOptions;
 };
 
 }  // end namespace clang
diff --git a/include/clang/Basic/TokenKinds.def b/include/clang/Basic/TokenKinds.def
index 0269451..82cb6c2 100644
--- a/include/clang/Basic/TokenKinds.def
+++ b/include/clang/Basic/TokenKinds.def
@@ -30,6 +30,9 @@
 #ifndef CONCEPTS_KEYWORD
 #define CONCEPTS_KEYWORD(X) KEYWORD(X,KEYCONCEPTS)
 #endif
+#ifndef MODULES_KEYWORD
+#define MODULES_KEYWORD(X) KEYWORD(X,KEYMODULES)
+#endif
 #ifndef TYPE_TRAIT
 #define TYPE_TRAIT(N,I,K) KEYWORD(I,K)
 #endif
@@ -219,6 +222,9 @@
 PUNCTUATOR(lesslessless,          "<<<")
 PUNCTUATOR(greatergreatergreater, ">>>")
 
+// CL support
+PUNCTUATOR(caretcaret,            "^^")
+
 // C99 6.4.1: Keywords.  These turn into kw_* tokens.
 // Flags allowed:
 //   KEYALL   - This is a keyword in all variants of C and C++, or it
@@ -232,6 +238,8 @@
 //   KEYCXX11 - This is a C++ keyword introduced to C++ in C++11
 //   KEYCONCEPTS - This is a keyword if the C++ extensions for concepts
 //                 are enabled.
+//   KEYMODULES - This is a keyword if the C++ extensions for modules
+//                are enabled.
 //   KEYGNU   - This is a keyword if GNU extensions are enabled
 //   KEYMS    - This is a keyword if Microsoft extensions are enabled
 //   KEYNOMS18 - This is a keyword that must never be enabled under
@@ -363,6 +371,10 @@
 KEYWORD(co_return                   , KEYCOROUTINES)
 KEYWORD(co_yield                    , KEYCOROUTINES)
 
+// C++ modules TS keywords
+MODULES_KEYWORD(module)
+MODULES_KEYWORD(import)
+
 // GNU Extensions (in impl-reserved namespace)
 KEYWORD(_Decimal32                  , KEYALL)
 KEYWORD(_Decimal64                  , KEYALL)
@@ -377,6 +389,7 @@
 TYPE_TRAIT_2(__builtin_types_compatible_p, TypeCompatible, KEYNOCXX)
 KEYWORD(__builtin_va_arg            , KEYALL)
 KEYWORD(__extension__               , KEYALL)
+KEYWORD(__float128                  , KEYALL)
 KEYWORD(__imag                      , KEYALL)
 KEYWORD(__int128                    , KEYALL)
 KEYWORD(__label__                   , KEYALL)
@@ -403,6 +416,9 @@
 TYPE_TRAIT_N(__is_constructible, IsConstructible, KEYCXX)
 TYPE_TRAIT_N(__is_nothrow_constructible, IsNothrowConstructible, KEYCXX)
 
+// MSVC14.0 / VS2015 Type Traits
+TYPE_TRAIT_2(__is_assignable, IsAssignable, KEYCXX)
+
 // GNU and MS Type Traits
 TYPE_TRAIT_1(__has_nothrow_assign, HasNothrowAssign, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_move_assign, HasNothrowMoveAssign, KEYCXX)
@@ -515,6 +531,8 @@
 // OpenCL builtins
 KEYWORD(__builtin_astype            , KEYOPENCL)
 KEYWORD(vec_step                    , KEYOPENCL|KEYALTIVEC|KEYZVECTOR)
+#define GENERIC_IMAGE_TYPE(ImgType, Id) KEYWORD(ImgType##_t, KEYOPENCL)
+#include "clang/Basic/OpenCLImageTypes.def"
 
 // OpenMP Type Traits
 KEYWORD(__builtin_omp_required_simd_align, KEYALL)
@@ -621,6 +639,8 @@
 ALIAS("__char16_t"   , char16_t   , KEYCXX)
 ALIAS("__char32_t"   , char32_t   , KEYCXX)
 
+KEYWORD(__builtin_available       , KEYALL)
+
 // Clang-specific keywords enabled only in testing.
 TESTING_KEYWORD(__unknown_anytype , KEYALL)
 
@@ -659,6 +679,7 @@
 OBJC2_AT_KEYWORD(synthesize)
 OBJC2_AT_KEYWORD(dynamic)
 OBJC2_AT_KEYWORD(import)
+OBJC2_AT_KEYWORD(available)
 
 // TODO: What to do about context-sensitive keywords like:
 //       bycopy/byref/in/inout/oneway/out?
diff --git a/include/clang/Basic/TypeTraits.h b/include/clang/Basic/TypeTraits.h
index 765246b..730ecba 100644
--- a/include/clang/Basic/TypeTraits.h
+++ b/include/clang/Basic/TypeTraits.h
@@ -74,6 +74,7 @@
     BTT_IsConvertibleTo,
     BTT_IsSame,
     BTT_TypeCompatible,
+    BTT_IsAssignable,
     BTT_IsNothrowAssignable,
     BTT_IsTriviallyAssignable,
     BTT_Last = BTT_IsTriviallyAssignable,
diff --git a/include/clang/Basic/VersionTuple.h b/include/clang/Basic/VersionTuple.h
index 784f3f3..da3b019 100644
--- a/include/clang/Basic/VersionTuple.h
+++ b/include/clang/Basic/VersionTuple.h
@@ -25,39 +25,44 @@
 /// \brief Represents a version number in the form major[.minor[.subminor[.build]]].
 class VersionTuple {
   unsigned Major : 31;
-  unsigned Minor : 31;
-  unsigned Subminor : 31;
-  unsigned Build : 31;
-  unsigned HasMinor : 1;
-  unsigned HasSubminor : 1;
-  unsigned HasBuild : 1;
+
   unsigned UsesUnderscores : 1;
 
+  unsigned Minor : 31;
+  unsigned HasMinor : 1;
+
+  unsigned Subminor : 31;
+  unsigned HasSubminor : 1;
+
+  unsigned Build : 31;
+  unsigned HasBuild : 1;
+
 public:
   VersionTuple()
-      : Major(0), Minor(0), Subminor(0), Build(0), HasMinor(false),
-        HasSubminor(false), HasBuild(false), UsesUnderscores(false) {}
+      : Major(0), UsesUnderscores(false), Minor(0), HasMinor(false),
+        Subminor(0), HasSubminor(false), Build(0), HasBuild(false) {}
 
   explicit VersionTuple(unsigned Major)
-      : Major(Major), Minor(0), Subminor(0), Build(0), HasMinor(false),
-        HasSubminor(false), HasBuild(false), UsesUnderscores(false) {}
+      : Major(Major), UsesUnderscores(false), Minor(0), HasMinor(false),
+        Subminor(0), HasSubminor(false), Build(0), HasBuild(false) {}
 
   explicit VersionTuple(unsigned Major, unsigned Minor,
                         bool UsesUnderscores = false)
-      : Major(Major), Minor(Minor), Subminor(0), Build(0), HasMinor(true),
-        HasSubminor(false), HasBuild(false), UsesUnderscores(UsesUnderscores) {}
+      : Major(Major), UsesUnderscores(UsesUnderscores), Minor(Minor),
+        HasMinor(true), Subminor(0), HasSubminor(false), Build(0),
+        HasBuild(false) {}
 
   explicit VersionTuple(unsigned Major, unsigned Minor, unsigned Subminor,
                         bool UsesUnderscores = false)
-      : Major(Major), Minor(Minor), Subminor(Subminor), Build(0),
-        HasMinor(true), HasSubminor(true), HasBuild(false),
-        UsesUnderscores(UsesUnderscores) {}
+      : Major(Major), UsesUnderscores(UsesUnderscores), Minor(Minor),
+        HasMinor(true), Subminor(Subminor), HasSubminor(true), Build(0),
+        HasBuild(false) {}
 
   explicit VersionTuple(unsigned Major, unsigned Minor, unsigned Subminor,
                         unsigned Build, bool UsesUnderscores = false)
-      : Major(Major), Minor(Minor), Subminor(Subminor), Build(Build),
-        HasMinor(true), HasSubminor(true), HasBuild(true),
-        UsesUnderscores(UsesUnderscores) {}
+      : Major(Major), UsesUnderscores(UsesUnderscores), Minor(Minor),
+        HasMinor(true), Subminor(Subminor), HasSubminor(true), Build(Build),
+        HasBuild(true) {}
 
   /// \brief Determine whether this version information is empty
   /// (e.g., all version components are zero).
diff --git a/include/clang/Basic/VirtualFileSystem.h b/include/clang/Basic/VirtualFileSystem.h
index 6c7127f..e977aa6 100644
--- a/include/clang/Basic/VirtualFileSystem.h
+++ b/include/clang/Basic/VirtualFileSystem.h
@@ -20,6 +20,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
+#include <utility>
 
 namespace llvm {
 class MemoryBuffer;
@@ -90,6 +91,13 @@
   virtual ~File();
   /// \brief Get the status of the file.
   virtual llvm::ErrorOr<Status> status() = 0;
+  /// \brief Get the name of the file
+  virtual llvm::ErrorOr<std::string> getName() {
+    if (auto Status = status())
+      return Status->getName().str();
+    else
+      return Status.getError();
+  }
   /// \brief Get the contents of the file as a \p MemoryBuffer.
   virtual llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>>
   getBuffer(const Twine &Name, int64_t FileSize = -1,
@@ -116,7 +124,8 @@
   std::shared_ptr<detail::DirIterImpl> Impl; // Input iterator semantics on copy
 
 public:
-  directory_iterator(std::shared_ptr<detail::DirIterImpl> I) : Impl(I) {
+  directory_iterator(std::shared_ptr<detail::DirIterImpl> I)
+      : Impl(std::move(I)) {
     assert(Impl.get() != nullptr && "requires non-null implementation");
     if (!Impl->CurrentEntry.isStatusKnown())
       Impl.reset(); // Normalize the end iterator to Impl == nullptr.
@@ -331,6 +340,7 @@
   Optional<bool> IsCaseSensitive;
   Optional<bool> IsOverlayRelative;
   Optional<bool> UseExternalNames;
+  Optional<bool> IgnoreNonExistentContents;
   std::string OverlayDir;
 
 public:
@@ -342,6 +352,9 @@
   void setUseExternalNames(bool UseExtNames) {
     UseExternalNames = UseExtNames;
   }
+  void setIgnoreNonExistentContents(bool IgnoreContents) {
+    IgnoreNonExistentContents = IgnoreContents;
+  }
   void setOverlayDir(StringRef OverlayDirectory) {
     IsOverlayRelative = true;
     OverlayDir.assign(OverlayDirectory.str());
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index 6641ed2..5605fc6 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -704,8 +704,10 @@
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.22 Converting vectors
 
-def VCVT_F16_F32 : SInst<"vcvt_f16_f32", "md", "Hf">;
-def VCVT_F32_F16 : SInst<"vcvt_f32_f16", "wd", "h">;
+let ArchGuard = "(__ARM_FP & 2)" in {
+  def VCVT_F16_F32 : SInst<"vcvt_f16_f32", "md", "Hf">;
+  def VCVT_F32_F16 : SInst<"vcvt_f32_f16", "wd", "h">;
+}
 
 def VCVT_S32     : SInst<"vcvt_s32", "xd",  "fQf">;
 def VCVT_U32     : SInst<"vcvt_u32", "ud",  "fQf">;
@@ -965,7 +967,7 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // Signed integer saturating extract and unsigned narrow to high
-def SQXTUN2 : SOpInst<"vqmovun_high", "qhk", "sil", OP_SQXTUN>;
+def SQXTUN2 : SOpInst<"vqmovun_high", "emd", "HsHiHl", OP_SQXTUN>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Integer saturating extract and narrow to high
diff --git a/include/clang/CMakeLists.txt b/include/clang/CMakeLists.txt
index 1d8aecd..feb81f0 100644
--- a/include/clang/CMakeLists.txt
+++ b/include/clang/CMakeLists.txt
@@ -4,3 +4,4 @@
 add_subdirectory(Parse)
 add_subdirectory(Sema)
 add_subdirectory(Serialization)
+add_subdirectory(StaticAnalyzer/Checkers)
diff --git a/include/clang/CodeGen/BackendUtil.h b/include/clang/CodeGen/BackendUtil.h
index 7df1c04..01721d3 100644
--- a/include/clang/CodeGen/BackendUtil.h
+++ b/include/clang/CodeGen/BackendUtil.h
@@ -16,6 +16,7 @@
 
 namespace llvm {
   class Module;
+  class MemoryBufferRef;
 }
 
 namespace clang {
@@ -35,8 +36,12 @@
 
   void EmitBackendOutput(DiagnosticsEngine &Diags, const CodeGenOptions &CGOpts,
                          const TargetOptions &TOpts, const LangOptions &LOpts,
-                         StringRef TDesc, llvm::Module *M, BackendAction Action,
-                         raw_pwrite_stream *OS);
+                         const llvm::DataLayout &TDesc, llvm::Module *M,
+                         BackendAction Action,
+                         std::unique_ptr<raw_pwrite_stream> OS);
+
+  void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
+                    llvm::MemoryBufferRef Buf);
 }
 
 #endif
diff --git a/include/clang/CodeGen/CGFunctionInfo.h b/include/clang/CodeGen/CGFunctionInfo.h
index 5066c5d..8dd6ad1 100644
--- a/include/clang/CodeGen/CGFunctionInfo.h
+++ b/include/clang/CodeGen/CGFunctionInfo.h
@@ -16,16 +16,17 @@
 #ifndef LLVM_CLANG_CODEGEN_CGFUNCTIONINFO_H
 #define LLVM_CLANG_CODEGEN_CGFUNCTIONINFO_H
 
+#include "clang/AST/Attr.h"
 #include "clang/AST/CanonicalType.h"
 #include "clang/AST/CharUnits.h"
+#include "clang/AST/Decl.h"
 #include "clang/AST/Type.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/Support/TrailingObjects.h"
 #include <cassert>
 
 namespace clang {
-class Decl;
-
 namespace CodeGen {
 
 /// ABIArgInfo - Helper class to encapsulate information about how a
@@ -392,23 +393,34 @@
   /// Compute the arguments required by the given formal prototype,
   /// given that there may be some additional, non-formal arguments
   /// in play.
+  ///
+  /// If FD is not null, this will consider pass_object_size params in FD.
   static RequiredArgs forPrototypePlus(const FunctionProtoType *prototype,
-                                       unsigned additional) {
+                                       unsigned additional,
+                                       const FunctionDecl *FD) {
     if (!prototype->isVariadic()) return All;
+    if (FD)
+      additional +=
+          llvm::count_if(FD->parameters(), [](const ParmVarDecl *PVD) {
+            return PVD->hasAttr<PassObjectSizeAttr>();
+          });
     return RequiredArgs(prototype->getNumParams() + additional);
   }
 
-  static RequiredArgs forPrototype(const FunctionProtoType *prototype) {
-    return forPrototypePlus(prototype, 0);
+  static RequiredArgs forPrototype(const FunctionProtoType *prototype,
+                                   const FunctionDecl *FD) {
+    return forPrototypePlus(prototype, 0, FD);
   }
 
-  static RequiredArgs forPrototype(CanQual<FunctionProtoType> prototype) {
-    return forPrototype(prototype.getTypePtr());
+  static RequiredArgs forPrototype(CanQual<FunctionProtoType> prototype,
+                                   const FunctionDecl *FD) {
+    return forPrototype(prototype.getTypePtr(), FD);
   }
 
   static RequiredArgs forPrototypePlus(CanQual<FunctionProtoType> prototype,
-                                       unsigned additional) {
-    return forPrototypePlus(prototype.getTypePtr(), additional);
+                                       unsigned additional,
+                                       const FunctionDecl *FD) {
+    return forPrototypePlus(prototype.getTypePtr(), additional, FD);
   }
 
   bool allowsOptionalArgs() const { return NumRequired != ~0U; }
@@ -424,13 +436,20 @@
   }
 };
 
+// Implementation detail of CGFunctionInfo, factored out so it can be named
+// in the TrailingObjects base class of CGFunctionInfo.
+struct CGFunctionInfoArgInfo {
+  CanQualType type;
+  ABIArgInfo info;
+};
+
 /// CGFunctionInfo - Class to encapsulate the information about a
 /// function definition.
-class CGFunctionInfo : public llvm::FoldingSetNode {
-  struct ArgInfo {
-    CanQualType type;
-    ABIArgInfo info;
-  };
+class CGFunctionInfo final
+    : public llvm::FoldingSetNode,
+      private llvm::TrailingObjects<CGFunctionInfo, CGFunctionInfoArgInfo,
+                                    FunctionProtoType::ExtParameterInfo> {
+  typedef CGFunctionInfoArgInfo ArgInfo;
   typedef FunctionProtoType::ExtParameterInfo ExtParameterInfo;
 
   /// The LLVM::CallingConv to use for this function (as specified by the
@@ -469,19 +488,19 @@
   unsigned HasExtParameterInfos : 1;
 
   unsigned NumArgs;
+
   ArgInfo *getArgsBuffer() {
-    return reinterpret_cast<ArgInfo*>(this+1);
+    return getTrailingObjects<ArgInfo>();
   }
   const ArgInfo *getArgsBuffer() const {
-    return reinterpret_cast<const ArgInfo*>(this + 1);
+    return getTrailingObjects<ArgInfo>();
   }
 
   ExtParameterInfo *getExtParameterInfosBuffer() {
-    return reinterpret_cast<ExtParameterInfo*>(getArgsBuffer() + NumArgs + 1);
+    return getTrailingObjects<ExtParameterInfo>();
   }
   const ExtParameterInfo *getExtParameterInfosBuffer() const{
-    return reinterpret_cast<const ExtParameterInfo*>(
-                                               getArgsBuffer() + NumArgs + 1);
+    return getTrailingObjects<ExtParameterInfo>();
   }
 
   CGFunctionInfo() : Required(RequiredArgs::All) {}
@@ -495,6 +514,17 @@
                                 CanQualType resultType,
                                 ArrayRef<CanQualType> argTypes,
                                 RequiredArgs required);
+  void operator delete(void *p) { ::operator delete(p); }
+
+  // Friending class TrailingObjects is apparently not good enough for MSVC,
+  // so these have to be public.
+  friend class TrailingObjects;
+  size_t numTrailingObjects(OverloadToken<ArgInfo>) const {
+    return NumArgs + 1;
+  }
+  size_t numTrailingObjects(OverloadToken<ExtParameterInfo>) const {
+    return (HasExtParameterInfos ? NumArgs : 0);
+  }
 
   typedef const ArgInfo *const_arg_iterator;
   typedef ArgInfo *arg_iterator;
diff --git a/include/clang/CodeGen/ModuleBuilder.h b/include/clang/CodeGen/ModuleBuilder.h
index ce7696d..5863834 100644
--- a/include/clang/CodeGen/ModuleBuilder.h
+++ b/include/clang/CodeGen/ModuleBuilder.h
@@ -20,6 +20,7 @@
   class Constant;
   class LLVMContext;
   class Module;
+  class StringRef;
 }
 
 namespace clang {
diff --git a/include/clang/CodeGen/ObjectFilePCHContainerOperations.h b/include/clang/CodeGen/ObjectFilePCHContainerOperations.h
index 15132ac..6437f4f 100644
--- a/include/clang/CodeGen/ObjectFilePCHContainerOperations.h
+++ b/include/clang/CodeGen/ObjectFilePCHContainerOperations.h
@@ -22,10 +22,12 @@
   /// Return an ASTConsumer that can be chained with a
   /// PCHGenerator that produces a wrapper file format
   /// that also contains full debug info for the module.
-  std::unique_ptr<ASTConsumer> CreatePCHContainerGenerator(
-      CompilerInstance &CI, const std::string &MainFileName,
-      const std::string &OutputFileName, llvm::raw_pwrite_stream *OS,
-      std::shared_ptr<PCHBuffer> Buffer) const override;
+  std::unique_ptr<ASTConsumer>
+  CreatePCHContainerGenerator(CompilerInstance &CI,
+                              const std::string &MainFileName,
+                              const std::string &OutputFileName,
+                              std::unique_ptr<llvm::raw_pwrite_stream> OS,
+                              std::shared_ptr<PCHBuffer> Buffer) const override;
 };
 
 /// A PCHContainerReader implementation that uses LLVM to
diff --git a/include/clang/CodeGen/SwiftCallingConv.h b/include/clang/CodeGen/SwiftCallingConv.h
index f9c2fd9..b6b8c47 100644
--- a/include/clang/CodeGen/SwiftCallingConv.h
+++ b/include/clang/CodeGen/SwiftCallingConv.h
@@ -17,7 +17,6 @@
 #include "clang/AST/CanonicalType.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Type.h"
-#include "llvm/ADT/FoldingSet.h"
 #include "llvm/Support/TrailingObjects.h"
 #include <cassert>
 
diff --git a/include/clang/Config/config.h.cmake b/include/clang/Config/config.h.cmake
index b7486f3..9200ed9 100644
--- a/include/clang/Config/config.h.cmake
+++ b/include/clang/Config/config.h.cmake
@@ -8,6 +8,12 @@
 /* Bug report URL. */
 #define BUG_REPORT_URL "${BUG_REPORT_URL}"
 
+/* Default C++ stdlib to use. */
+#define CLANG_DEFAULT_CXX_STDLIB "${CLANG_DEFAULT_CXX_STDLIB}"
+
+/* Default runtime library to use. */
+#define CLANG_DEFAULT_RTLIB "${CLANG_DEFAULT_RTLIB}"
+
 /* Default OpenMP runtime used by -fopenmp. */
 #define CLANG_DEFAULT_OPENMP_RUNTIME "${CLANG_DEFAULT_OPENMP_RUNTIME}"
 
@@ -35,4 +41,10 @@
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
+/* pass --build-id to ld */
+#cmakedefine ENABLE_LINKER_BUILD_ID
+
+/* enable x86 relax relocations by default */
+#cmakedefine01 ENABLE_X86_RELAX_RELOCATIONS
+
 #endif
diff --git a/include/clang/Config/config.h.in b/include/clang/Config/config.h.in
deleted file mode 100644
index 91983f6..0000000
--- a/include/clang/Config/config.h.in
+++ /dev/null
@@ -1,40 +0,0 @@
-/* This generated file is for internal use. Do not include it from headers. */
-
-#ifdef CLANG_CONFIG_H
-#error config.h can only be included once
-#else
-#define CLANG_CONFIG_H
-
-/* Bug report URL. */
-#undef BUG_REPORT_URL
-
-/* Default OpenMP runtime used by -fopenmp. */
-#undef CLANG_DEFAULT_OPENMP_RUNTIME
-
-/* Multilib suffix for libdir. */
-#undef CLANG_LIBDIR_SUFFIX
-
-/* Relative directory for resource files */
-#undef CLANG_RESOURCE_DIR
-
-/* Directories clang will search for headers */
-#undef C_INCLUDE_DIRS
-
-/* Default <path> to all compiler invocations for --sysroot=<path>. */
-#undef DEFAULT_SYSROOT
-
-/* Directory where gcc is installed. */
-#undef GCC_INSTALL_PREFIX
-
-/* Define if we have libxml2 */
-#undef CLANG_HAVE_LIBXML
-
-#undef PACKAGE_STRING
-
-/* The LLVM product name and version */
-#define BACKEND_PACKAGE_STRING PACKAGE_STRING
-
-/* Linker version detected at compile time. */
-#undef HOST_LINK_VERSION
-
-#endif
diff --git a/include/clang/Driver/Action.h b/include/clang/Driver/Action.h
index 2cf53bc..3fe6510 100644
--- a/include/clang/Driver/Action.h
+++ b/include/clang/Driver/Action.h
@@ -10,8 +10,10 @@
 #ifndef LLVM_CLANG_DRIVER_ACTION_H
 #define LLVM_CLANG_DRIVER_ACTION_H
 
+#include "clang/Basic/Cuda.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 
 namespace llvm {
@@ -26,6 +28,8 @@
 namespace clang {
 namespace driver {
 
+class ToolChain;
+
 /// Action - Represent an abstract compilation step to perform.
 ///
 /// An action represents an edge in the compilation graph; typically
@@ -41,14 +45,15 @@
 class Action {
 public:
   typedef ActionList::size_type size_type;
-  typedef ActionList::iterator iterator;
-  typedef ActionList::const_iterator const_iterator;
+  typedef ActionList::iterator input_iterator;
+  typedef ActionList::const_iterator input_const_iterator;
+  typedef llvm::iterator_range<input_iterator> input_range;
+  typedef llvm::iterator_range<input_const_iterator> input_const_range;
 
   enum ActionClass {
     InputClass = 0,
     BindArchClass,
-    CudaDeviceClass,
-    CudaHostClass,
+    OffloadClass,
     PreprocessJobClass,
     PrecompileJobClass,
     AnalyzeJobClass,
@@ -62,8 +67,19 @@
     VerifyDebugInfoJobClass,
     VerifyPCHJobClass,
 
-    JobClassFirst=PreprocessJobClass,
-    JobClassLast=VerifyPCHJobClass
+    JobClassFirst = PreprocessJobClass,
+    JobClassLast = VerifyPCHJobClass
+  };
+
+  // The offloading kind determines if this action is bound to a particular
+  // programming model. Each entry reserves one bit. We also have a special kind
+  // to designate the host offloading tool chain.
+  enum OffloadKind {
+    OFK_None = 0x00,
+    // The host offloading tool chain.
+    OFK_Host = 0x01,
+    // The device offloading tool chains - one bit for each programming model.
+    OFK_Cuda = 0x02,
   };
 
   static const char *getClassName(ActionClass AC);
@@ -77,6 +93,19 @@
   ActionList Inputs;
 
 protected:
+  ///
+  /// Offload information.
+  ///
+
+  /// The host offloading kind - a combination of kinds encoded in a mask.
+  /// Multiple programming models may be supported simultaneously by the same
+  /// host.
+  unsigned ActiveOffloadKindMask = 0u;
+  /// Offloading kind of the device.
+  OffloadKind OffloadingDeviceKind = OFK_None;
+  /// The Offloading architecture associated with this action.
+  const char *OffloadingArch = nullptr;
+
   Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {}
   Action(ActionClass Kind, Action *Input, types::ID Type)
       : Action(Kind, ActionList({Input}), Type) {}
@@ -98,10 +127,49 @@
 
   size_type size() const { return Inputs.size(); }
 
-  iterator begin() { return Inputs.begin(); }
-  iterator end() { return Inputs.end(); }
-  const_iterator begin() const { return Inputs.begin(); }
-  const_iterator end() const { return Inputs.end(); }
+  input_iterator input_begin() { return Inputs.begin(); }
+  input_iterator input_end() { return Inputs.end(); }
+  input_range inputs() { return input_range(input_begin(), input_end()); }
+  input_const_iterator input_begin() const { return Inputs.begin(); }
+  input_const_iterator input_end() const { return Inputs.end(); }
+  input_const_range inputs() const {
+    return input_const_range(input_begin(), input_end());
+  }
+
+  /// Return a string containing the offload kind of the action.
+  std::string getOffloadingKindPrefix() const;
+  /// Return a string that can be used as prefix in order to generate unique
+  /// files for each offloading kind.
+  std::string
+  getOffloadingFileNamePrefix(llvm::StringRef NormalizedTriple) const;
+
+  /// Set the device offload info of this action and propagate it to its
+  /// dependences.
+  void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch);
+  /// Append the host offload info of this action and propagate it to its
+  /// dependences.
+  void propagateHostOffloadInfo(unsigned OKinds, const char *OArch);
+  /// Set the offload info of this action to be the same as the provided action,
+  /// and propagate it to its dependences.
+  void propagateOffloadInfo(const Action *A);
+
+  unsigned getOffloadingHostActiveKinds() const {
+    return ActiveOffloadKindMask;
+  }
+  OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; }
+  const char *getOffloadingArch() const { return OffloadingArch; }
+
+  /// Check if this action has any offload kinds. Note that host offload kinds
+  /// are only set if the action is a dependence to a host offload action.
+  bool isHostOffloading(OffloadKind OKind) const {
+    return ActiveOffloadKindMask & OKind;
+  }
+  bool isDeviceOffloading(OffloadKind OKind) const {
+    return OffloadingDeviceKind == OKind;
+  }
+  bool isOffloading(OffloadKind OKind) const {
+    return isHostOffloading(OKind) || isDeviceOffloading(OKind);
+  }
 };
 
 class InputAction : public Action {
@@ -134,43 +202,126 @@
   }
 };
 
-class CudaDeviceAction : public Action {
+/// An offload action combines host and/or device actions according to the
+/// programming model implementation needs and propagates the offloading kind to
+/// its dependences.
+class OffloadAction final : public Action {
   virtual void anchor();
-  /// GPU architecture to bind.  Always of the form /sm_\d+/ or null (when the
-  /// action applies to multiple architectures).
-  const char *GpuArchName;
-  /// True when action results are not consumed by the host action (e.g when
-  /// -fsyntax-only or --cuda-device-only options are used).
-  bool AtTopLevel;
 
 public:
-  CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel);
+  /// Type used to communicate device actions. It associates bound architecture,
+  /// toolchain, and offload kind to each action.
+  class DeviceDependences final {
+  public:
+    typedef SmallVector<const ToolChain *, 3> ToolChainList;
+    typedef SmallVector<const char *, 3> BoundArchList;
+    typedef SmallVector<OffloadKind, 3> OffloadKindList;
 
-  const char *getGpuArchName() const { return GpuArchName; }
+  private:
+    // Lists that keep the information for each dependency. All the lists are
+    // meant to be updated in sync. We are adopting separate lists instead of a
+    // list of structs, because that simplifies forwarding the actions list to
+    // initialize the inputs of the base Action class.
 
-  /// Gets the compute_XX that corresponds to getGpuArchName().  Returns null
-  /// when getGpuArchName() is null.
-  const char *getComputeArchName() const;
+    /// The dependence actions.
+    ActionList DeviceActions;
+    /// The offloading toolchains that should be used with the action.
+    ToolChainList DeviceToolChains;
+    /// The architectures that should be used with this action.
+    BoundArchList DeviceBoundArchs;
+    /// The offload kind of each dependence.
+    OffloadKindList DeviceOffloadKinds;
 
-  bool isAtTopLevel() const { return AtTopLevel; }
+  public:
+    /// Add an action along with the associated toolchain, bound arch, and
+    /// offload kind.
+    void add(Action &A, const ToolChain &TC, const char *BoundArch,
+             OffloadKind OKind);
 
-  static bool IsValidGpuArchName(llvm::StringRef ArchName);
+    /// Get each of the individual arrays.
+    const ActionList &getActions() const { return DeviceActions; };
+    const ToolChainList &getToolChains() const { return DeviceToolChains; };
+    const BoundArchList &getBoundArchs() const { return DeviceBoundArchs; };
+    const OffloadKindList &getOffloadKinds() const {
+      return DeviceOffloadKinds;
+    };
+  };
 
-  static bool classof(const Action *A) {
-    return A->getKind() == CudaDeviceClass;
-  }
-};
+  /// Type used to communicate host actions. It associates bound architecture,
+  /// toolchain, and offload kinds to the host action.
+  class HostDependence final {
+    /// The dependence action.
+    Action &HostAction;
+    /// The offloading toolchain that should be used with the action.
+    const ToolChain &HostToolChain;
+    /// The architecture that should be used with this action.
+    const char *HostBoundArch = nullptr;
+    /// The offload kinds associated with the host dependence.
+    unsigned HostOffloadKinds = 0u;
 
-class CudaHostAction : public Action {
-  virtual void anchor();
-  ActionList DeviceActions;
+  public:
+    HostDependence(Action &A, const ToolChain &TC, const char *BoundArch,
+                   const unsigned OffloadKinds)
+        : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch),
+          HostOffloadKinds(OffloadKinds){};
+    /// Constructor version that obtains the offload kinds from the device
+    /// dependencies.
+    HostDependence(Action &A, const ToolChain &TC, const char *BoundArch,
+                   const DeviceDependences &DDeps);
+    Action *getAction() const { return &HostAction; };
+    const ToolChain *getToolChain() const { return &HostToolChain; };
+    const char *getBoundArch() const { return HostBoundArch; };
+    unsigned getOffloadKinds() const { return HostOffloadKinds; };
+  };
+
+  typedef llvm::function_ref<void(Action *, const ToolChain *, const char *)>
+      OffloadActionWorkTy;
+
+private:
+  /// The host offloading toolchain that should be used with the action.
+  const ToolChain *HostTC = nullptr;
+
+  /// The tool chains associated with the list of actions.
+  DeviceDependences::ToolChainList DevToolChains;
 
 public:
-  CudaHostAction(Action *Input, const ActionList &DeviceActions);
+  OffloadAction(const HostDependence &HDep);
+  OffloadAction(const DeviceDependences &DDeps, types::ID Ty);
+  OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps);
 
-  const ActionList &getDeviceActions() const { return DeviceActions; }
+  /// Execute the work specified in \a Work on the host dependence.
+  void doOnHostDependence(const OffloadActionWorkTy &Work) const;
 
-  static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
+  /// Execute the work specified in \a Work on each device dependence.
+  void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const;
+
+  /// Execute the work specified in \a Work on each dependence.
+  void doOnEachDependence(const OffloadActionWorkTy &Work) const;
+
+  /// Execute the work specified in \a Work on each host or device dependence if
+  /// \a IsHostDependence is true or false, respectively.
+  void doOnEachDependence(bool IsHostDependence,
+                          const OffloadActionWorkTy &Work) const;
+
+  /// Return true if the action has a host dependence.
+  bool hasHostDependence() const;
+
+  /// Return the host dependence of this action. This function is only expected
+  /// to be called if the host dependence exists.
+  Action *getHostDependence() const;
+
+  /// Return true if the action has a single device dependence. If \a
+  /// DoNotConsiderHostActions is set, ignore the host dependence, if any, while
+  /// accounting for the number of dependences.
+  bool hasSingleDeviceDependence(bool DoNotConsiderHostActions = false) const;
+
+  /// Return the single device dependence of this action. This function is only
+  /// expected to be called if a single device dependence exists. If \a
+  /// DoNotConsiderHostActions is set, a host dependence is allowed.
+  Action *
+  getSingleDeviceDependence(bool DoNotConsiderHostActions = false) const;
+
+  static bool classof(const Action *A) { return A->getKind() == OffloadClass; }
 };
 
 class JobAction : public Action {
diff --git a/include/clang/Driver/CC1Options.td b/include/clang/Driver/CC1Options.td
index be333c3..f2cfc11 100644
--- a/include/clang/Driver/CC1Options.td
+++ b/include/clang/Driver/CC1Options.td
@@ -114,6 +114,9 @@
 def analyzer_checker_help : Flag<["-"], "analyzer-checker-help">,
   HelpText<"Display the list of analyzer checkers that are available">;
 
+def analyzer_list_enabled_checkers : Flag<["-"], "analyzer-list-enabled-checkers">,
+  HelpText<"Display the list of enabled analyzer checkers">;
+
 def analyzer_config : Separate<["-"], "analyzer-config">,
   HelpText<"Choose analyzer options to enable">;
 
@@ -143,6 +146,8 @@
   HelpText<"Mark the file as not needing an executable stack">;
 def massembler_fatal_warnings : Flag<["-"], "massembler-fatal-warnings">,
   HelpText<"Make assembler warnings fatal">;
+def mrelax_relocations : Flag<["--"], "mrelax-relocations">,
+    HelpText<"Use relaxable elf relocations">;
 def compress_debug_sections : Flag<["-"], "compress-debug-sections">,
     HelpText<"Compress DWARF debug sections using zlib">;
 def msave_temp_labels : Flag<["-"], "msave-temp-labels">,
@@ -151,6 +156,8 @@
            "on compiler-generated code.">;
 def mrelocation_model : Separate<["-"], "mrelocation-model">,
   HelpText<"The relocation model to use">;
+def fno_math_builtin : Flag<["-"], "fno-math-builtin">,
+  HelpText<"Disable implicit builtin knowledge of math functions">;
 }
 
 def disable_llvm_optzns : Flag<["-"], "disable-llvm-optzns">,
@@ -268,6 +275,21 @@
 def fsanitize_coverage_8bit_counters
     : Flag<["-"], "fsanitize-coverage-8bit-counters">,
       HelpText<"Enable frequency counters in sanitizer coverage">;
+def fsanitize_coverage_trace_pc
+    : Flag<["-"], "fsanitize-coverage-trace-pc">,
+      HelpText<"Enable PC tracing in sanitizer coverage">;
+def fprofile_instrument_EQ : Joined<["-"], "fprofile-instrument=">,
+    HelpText<"Enable PGO instrumentation. The accepted value is clang, llvm, "
+             "or none">;
+def fprofile_instrument_path_EQ : Joined<["-"], "fprofile-instrument-path=">,
+    HelpText<"Generate instrumented code to collect execution counts into "
+             "<file> (overridden by LLVM_PROFILE_FILE env var)">;
+def fprofile_instrument_use_path_EQ :
+    Joined<["-"], "fprofile-instrument-use-path=">,
+    HelpText<"Specify the profile path in PGO use compilation">;
+def flto_visibility_public_std:
+    Flag<["-"], "flto-visibility-public-std">,
+    HelpText<"Use public LTO visibility for classes in std and stdext namespaces">;
 
 //===----------------------------------------------------------------------===//
 // Dependency Output Options
@@ -369,13 +391,12 @@
   HelpText<"Use with -ast-dump or -ast-print to dump/print only AST declaration"
            " nodes having a certain substring in a qualified name. Use"
            " -ast-list to list all filterable declaration node names.">;
+def fmodules_ts : Flag <["-"], "fmodules-ts">, Group<f_Group>,
+  HelpText<"Enable support for the C++ Modules TS">;
 def fno_modules_global_index : Flag<["-"], "fno-modules-global-index">,
   HelpText<"Do not automatically generate or update the global module index">;
 def fno_modules_error_recovery : Flag<["-"], "fno-modules-error-recovery">,
   HelpText<"Do not automatically import modules for error recovery">;
-def fmodule_implementation_of : Separate<["-"], "fmodule-implementation-of">,
-  MetaVarName<"<name>">,
-  HelpText<"Specify the name of the module whose implementation file this is">;
 def fmodule_map_file_home_is_cwd : Flag<["-"], "fmodule-map-file-home-is-cwd">,
   HelpText<"Use the current working directory as the home directory of "
            "module maps specified by -fmodule-map-file=<FILE>">;
@@ -489,6 +510,12 @@
 
 def foverride_record_layout_EQ : Joined<["-"], "foverride-record-layout=">,
   HelpText<"Override record layouts with those in the given file">;
+def find_pch_source_EQ : Joined<["-"], "find-pch-source=">,
+  HelpText<"When building a pch, try to find the input file in include "
+           "directories, as if it had been included by the argument passed "
+           "to this flag.">;
+def fno_pch_timestamp : Flag<["-"], "fno-pch-timestamp">,
+  HelpText<"Disable inclusion of timestamp in precompiled headers">;
   
 //===----------------------------------------------------------------------===//
 // Language Options
@@ -505,10 +532,8 @@
 
 def fblocks_runtime_optional : Flag<["-"], "fblocks-runtime-optional">,
   HelpText<"Weakly link in the blocks runtime">;
-def fsjlj_exceptions : Flag<["-"], "fsjlj-exceptions">,
-  HelpText<"Use SjLj style exceptions">;
-def fnew_ms_eh: Flag<["-"], "fnew-ms-eh">,
-  HelpText<"Use the new IR representation for MS exceptions">;
+def fexternc_nounwind : Flag<["-"], "fexternc-nounwind">,
+  HelpText<"Assume all functions with C linkage do not unwind">;
 def split_dwarf_file : Separate<["-"], "split-dwarf-file">,
   HelpText<"File name to use for split dwarf debug info output">;
 def fno_wchar : Flag<["-"], "fno-wchar">,
@@ -528,8 +553,8 @@
   HelpText<"enable extended encoding of block type signature">;
 def pic_level : Separate<["-"], "pic-level">,
   HelpText<"Value for __PIC__">;
-def pie_level : Separate<["-"], "pie-level">,
-  HelpText<"Value for __PIE__">;
+def pic_is_pie : Flag<["-"], "pic-is-pie">,
+  HelpText<"File is for a position independent executable">;
 def fno_validate_pch : Flag<["-"], "fno-validate-pch">,
   HelpText<"Disable validation of precompiled headers">;
 def dump_deserialized_pch_decls : Flag<["-"], "dump-deserialized-decls">,
@@ -588,8 +613,14 @@
   HelpText<"Control emission of RTTI data">;
 def fnative_half_type: Flag<["-"], "fnative-half-type">,
   HelpText<"Use the native half type for __fp16 instead of promoting to float">;
+def fnative_half_arguments_and_returns : Flag<["-"], "fnative-half-arguments-and-returns">,
+  HelpText<"Use the native __fp16 type for arguments and returns (and skip ABI-specific lowering)">;
 def fallow_half_arguments_and_returns : Flag<["-"], "fallow-half-arguments-and-returns">,
   HelpText<"Allow function arguments and returns of type half">;
+def fdefault_calling_conv_EQ : Joined<["-"], "fdefault-calling-conv=">,
+  HelpText<"Set default MS calling convention">;
+def finclude_default_header : Flag<["-"], "finclude-default-header">,
+  HelpText<"Include the default header file for OpenCL">;
 
 // C++ TSes.
 def fcoroutines : Flag<["-"], "fcoroutines">,
@@ -640,46 +671,17 @@
   HelpText<"include a detailed record of preprocessing actions">;
 
 //===----------------------------------------------------------------------===//
-// OpenCL Options
-//===----------------------------------------------------------------------===//
-
-def cl_opt_disable : Flag<["-"], "cl-opt-disable">,
-  HelpText<"OpenCL only. This option disables all optimizations. The default is optimizations are enabled.">;
-def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">,
-  HelpText<"OpenCL only. This option does nothing and is for compatibility with OpenCL 1.0">;
-def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">,
-  HelpText<"OpenCL only. Treat double precision floating-point constant as single precision constant.">;
-def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">,
-  HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
-def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">,
-  HelpText<"OpenCL only. Generate kernel argument metadata.">;
-def cl_unsafe_math_optimizations : Flag<["-"], "cl-unsafe-math-optimizations">,
-  HelpText<"OpenCL only. Allow unsafe floating-point optimizations.  Also implies -cl-no-signed-zeros and -cl-mad-enable">;
-def cl_fast_relaxed_math : Flag<["-"], "cl-fast-relaxed-math">,
-  HelpText<"OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__">;
-def cl_mad_enable : Flag<["-"], "cl-mad-enable">,
-  HelpText<"OpenCL only. Enable less precise MAD instructions to be generated.">;
-def cl_std_EQ : Joined<["-"], "cl-std=">,
-  HelpText<"OpenCL language standard to compile for">;
-def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">,
-  HelpText<"OpenCL only. Allow denormals to be flushed to zero">;
-
-//===----------------------------------------------------------------------===//
 // CUDA Options
 //===----------------------------------------------------------------------===//
 
 def fcuda_is_device : Flag<["-"], "fcuda-is-device">,
   HelpText<"Generate code for CUDA device">;
-def fcuda_allow_host_calls_from_host_device : Flag<["-"],
-    "fcuda-allow-host-calls-from-host-device">,
-  HelpText<"Allow host device functions to call host functions">;
-def fcuda_disable_target_call_checks : Flag<["-"],
-    "fcuda-disable-target-call-checks">,
-  HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
 def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
   HelpText<"Incorporate CUDA device-side binary into host object file.">;
-def fcuda_target_overloads : Flag<["-"], "fcuda-target-overloads">,
-  HelpText<"Enable function overloads based on CUDA target attributes.">;
+def fcuda_allow_variadic_functions : Flag<["-"], "fcuda-allow-variadic-functions">,
+  HelpText<"Allow variadic functions in CUDA device code.">;
+def fno_cuda_host_device_constexpr : Flag<["-"], "fno-cuda-host-device-constexpr">,
+  HelpText<"Don't treat unattributed constexpr functions as __host__ __device__.">;
 
 //===----------------------------------------------------------------------===//
 // OpenMP Options
@@ -687,7 +689,7 @@
 
 def fopenmp_is_device : Flag<["-"], "fopenmp-is-device">,
   HelpText<"Generate code only for an OpenMP target device.">;
-def omp_host_ir_file_path : Separate<["-"], "omp-host-ir-file-path">,
+def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
   HelpText<"Path to the IR file produced by the frontend for the host.">;
   
 } // let Flags = [CC1Option]
diff --git a/include/clang/Driver/CLCompatOptions.td b/include/clang/Driver/CLCompatOptions.td
index 16a5b72..b1d2459 100644
--- a/include/clang/Driver/CLCompatOptions.td
+++ b/include/clang/Driver/CLCompatOptions.td
@@ -45,8 +45,8 @@
   KIND_JOINED_OR_SEPARATE>, Group<cl_compile_Group>,
   Flags<[CLOption, DriverOption]>;
 
-class CLRemainingArgs<string name> : Option<["/", "-"], name,
-  KIND_REMAINING_ARGS>, Group<cl_Group>, Flags<[CLOption, DriverOption]>;
+class CLRemainingArgsJoined<string name> : Option<["/", "-"], name,
+  KIND_REMAINING_ARGS_JOINED>, Group<cl_Group>, Flags<[CLOption, DriverOption]>;
 
 // Aliases:
 // (We don't put any of these in cl_compile_Group as the options they alias are
@@ -77,6 +77,8 @@
 def _SLASH_GR_ : CLFlag<"GR-">, HelpText<"Disable emission of RTTI data">;
 def _SLASH_GF_ : CLFlag<"GF-">, HelpText<"Disable string pooling">,
   Alias<fwritable_strings>;
+def _SLASH_GS : CLFlag<"GS">, HelpText<"Enable buffer security check">;
+def _SLASH_GS_ : CLFlag<"GS-">, HelpText<"Disable buffer security check">;
 def _SLASH_Gs : CLJoined<"Gs">, HelpText<"Set stack probe size">,
   Alias<mstack_probe_size>;
 def _SLASH_Gy : CLFlag<"Gy">, HelpText<"Put each function in its own section">,
@@ -98,9 +100,8 @@
 def _SLASH_J : CLFlag<"J">, HelpText<"Make char type unsigned">,
   Alias<funsigned_char>;
 def _SLASH_O0 : CLFlag<"O0">, Alias<O0>;
+// /Oy- is handled by the /O option because /Oy- only has an effect on 32-bit.
 def _SLASH_O : CLJoined<"O">, HelpText<"Optimization level">;
-def _SLASH_Ob0 : CLFlag<"Ob0">, HelpText<"Disable inlining">,
-  Alias<fno_inline>;
 def _SLASH_Od : CLFlag<"Od">, HelpText<"Disable optimization">, Alias<O0>;
 def _SLASH_Oi : CLFlag<"Oi">, HelpText<"Enable use of builtin functions">,
   Alias<fbuiltin>;
@@ -119,6 +120,8 @@
 def _SLASH_showIncludes : CLFlag<"showIncludes">,
   HelpText<"Print info about included files to stderr">,
   Alias<show_includes>;
+def _SLASH_std : CLCompileJoined<"std:">,
+  HelpText<"Language standard to compile for">;
 def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">,
   MetaVarName<"<macro>">, Alias<U>;
 def _SLASH_W0 : CLFlag<"W0">, HelpText<"Disable all warnings">, Alias<w>;
@@ -163,6 +166,8 @@
   HelpText<"Disable trigraphs (default)">, Alias<fno_trigraphs>;
 def _SLASH_Z7 : CLFlag<"Z7">,
   HelpText<"Enable CodeView debug information in object files">;
+def _SLASH_Zd : CLFlag<"Zd">,
+  HelpText<"Emit debug line number tables only">;
 def _SLASH_Zi : CLFlag<"Zi">, Alias<_SLASH_Z7>,
   HelpText<"Alias for /Z7. Does not produce PDBs.">;
 def _SLASH_Zp : CLJoined<"Zp">,
@@ -205,9 +210,16 @@
 def _SLASH_Fo : CLCompileJoined<"Fo">,
   HelpText<"Set output object file, or directory (ends in / or \\) (with /c)">,
   MetaVarName<"<file or directory>">;
+def _SLASH_GX : CLFlag<"GX">,
+  HelpText<"Enable exception handling">;
+def _SLASH_GX_ : CLFlag<"GX-">,
+  HelpText<"Disable exception handling">;
+def _SLASH_imsvc : CLJoinedOrSeparate<"imsvc">,
+  HelpText<"Add directory to system include search path, as if part of %INCLUDE%">,
+  MetaVarName<"<dir>">;
 def _SLASH_LD : CLFlag<"LD">, HelpText<"Create DLL">;
 def _SLASH_LDd : CLFlag<"LDd">, HelpText<"Create debug DLL">;
-def _SLASH_link : CLRemainingArgs<"link">,
+def _SLASH_link : CLRemainingArgsJoined<"link">,
   HelpText<"Forward options to the linker">, MetaVarName<"<options>">;
 def _SLASH_MD : Option<["/", "-"], "MD", KIND_FLAG>, Group<_SLASH_M_Group>,
   Flags<[CLOption, DriverOption]>, HelpText<"Use DLL run-time">;
@@ -248,22 +260,41 @@
 def _SLASH_Zl : CLFlag<"Zl">,
   HelpText<"Don't mention any default libraries in the object file">;
 
+def _SLASH_Yc : CLJoined<"Yc">,
+  HelpText<"Generate a pch file for all code up to and including <filename>">,
+  MetaVarName<"<filename>">;
+def _SLASH_Yu : CLJoined<"Yu">,
+  HelpText<"Load a pch file and use it instead of all code up to "
+           "and including <filename>">,
+  MetaVarName<"<filename>">;
+def _SLASH_Y_ : CLFlag<"Y-">,
+  HelpText<"Disable precompiled headers, overrides /Yc and /Yu">;
+def _SLASH_Fp : CLJoined<"Fp">,
+  HelpText<"Set pch filename (with /Yc and /Yu)">, MetaVarName<"<filename>">;
+
+def _SLASH_Gd : CLFlag<"Gd">,
+  HelpText<"Set __cdecl as a default calling convention">;
+def _SLASH_Gr : CLFlag<"Gr">,
+  HelpText<"Set __fastcall as a default calling convention">;
+def _SLASH_Gz : CLFlag<"Gz">,
+  HelpText<"Set __stdcall as a default calling convention">;
+def _SLASH_Gv : CLFlag<"Gv">,
+  HelpText<"Set __vectorcall as a default calling convention">;
+
 // Ignored:
 
 def _SLASH_analyze_ : CLIgnoredFlag<"analyze-">;
 def _SLASH_bigobj : CLIgnoredFlag<"bigobj">;
 def _SLASH_cgthreads : CLIgnoredJoined<"cgthreads">;
+def _SLASH_d2FastFail : CLIgnoredFlag<"d2FastFail">;
 def _SLASH_d2Zi_PLUS : CLIgnoredFlag<"d2Zi+">;
 def _SLASH_errorReport : CLIgnoredJoined<"errorReport">;
 def _SLASH_Fd : CLIgnoredJoined<"Fd">;
+def _SLASH_FC : CLIgnoredFlag<"FC">;
 def _SLASH_FS : CLIgnoredFlag<"FS">, HelpText<"Force synchronous PDB writes">;
-def _SLASH_Gd : CLIgnoredFlag<"Gd">;
 def _SLASH_GF : CLIgnoredFlag<"GF">;
-def _SLASH_GS_ : CLIgnoredFlag<"GS-">;
 def _SLASH_kernel_ : CLIgnoredFlag<"kernel-">;
 def _SLASH_nologo : CLIgnoredFlag<"nologo">;
-def _SLASH_Ob1 : CLIgnoredFlag<"Ob1">;
-def _SLASH_Ob2 : CLIgnoredFlag<"Ob2">;
 def _SLASH_Og : CLIgnoredFlag<"Og">;
 def _SLASH_openmp_ : CLIgnoredFlag<"openmp-">;
 def _SLASH_RTC : CLIgnoredJoined<"RTC">;
@@ -287,10 +318,8 @@
 def _SLASH_doc : CLJoined<"doc">;
 def _SLASH_FA_joined : CLJoined<"FA">;
 def _SLASH_favor : CLJoined<"favor">;
-def _SLASH_FC : CLFlag<"FC">;
 def _SLASH_F : CLFlag<"F">;
 def _SLASH_Fm : CLJoined<"Fm">;
-def _SLASH_Fp : CLJoined<"Fp">;
 def _SLASH_Fr : CLJoined<"Fr">;
 def _SLASH_FR : CLJoined<"FR">;
 def _SLASH_FU : CLJoinedOrSeparate<"FU">;
@@ -304,13 +333,8 @@
 def _SLASH_GL_ : CLFlag<"GL-">;
 def _SLASH_Gm : CLFlag<"Gm">;
 def _SLASH_Gm_ : CLFlag<"Gm-">;
-def _SLASH_Gr : CLFlag<"Gr">;
-def _SLASH_GS : CLFlag<"GS">;
 def _SLASH_GT : CLFlag<"GT">;
 def _SLASH_Guard : CLJoined<"guard:">;
-def _SLASH_GX : CLFlag<"GX">;
-def _SLASH_Gv : CLFlag<"Gv">;
-def _SLASH_Gz : CLFlag<"Gz">;
 def _SLASH_GZ : CLFlag<"GZ">;
 def _SLASH_H : CLFlag<"H">;
 def _SLASH_homeparams : CLFlag<"homeparams">;
@@ -329,11 +353,8 @@
 def _SLASH_WL : CLFlag<"WL">;
 def _SLASH_Wp64 : CLFlag<"Wp64">;
 def _SLASH_X : CLFlag<"X">;
-def _SLASH_Yc : CLJoined<"Yc">;
-def _SLASH_Y_ : CLFlag<"Y-">;
 def _SLASH_Yd : CLFlag<"Yd">;
 def _SLASH_Yl : CLJoined<"Yl">;
-def _SLASH_Yu : CLJoined<"Yu">;
 def _SLASH_Za : CLFlag<"Za">;
 def _SLASH_Zc : CLJoined<"Zc:">;
 def _SLASH_Ze : CLFlag<"Ze">;
diff --git a/include/clang/Driver/Compilation.h b/include/clang/Driver/Compilation.h
index 3ed1913..8846b6f 100644
--- a/include/clang/Driver/Compilation.h
+++ b/include/clang/Driver/Compilation.h
@@ -14,7 +14,7 @@
 #include "clang/Driver/Job.h"
 #include "clang/Driver/Util.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Path.h"
+#include <map>
 
 namespace llvm {
 namespace opt {
@@ -38,8 +38,16 @@
   /// The default tool chain.
   const ToolChain &DefaultToolChain;
 
-  const ToolChain *CudaHostToolChain;
-  const ToolChain *CudaDeviceToolChain;
+  /// A mask of all the programming models the host has to support in the
+  /// current compilation.
+  unsigned ActiveOffloadMask;
+
+  /// Array with the toolchains of offloading host and devices in the order they
+  /// were requested by the user. We are preserving that order in case the code
+  /// generation needs to derive a programming-model-specific semantic out of
+  /// it.
+  std::multimap<Action::OffloadKind, const ToolChain *>
+      OrderedOffloadingToolchains;
 
   /// The original (untranslated) input argument list.
   llvm::opt::InputArgList *Args;
@@ -89,16 +97,46 @@
   const Driver &getDriver() const { return TheDriver; }
 
   const ToolChain &getDefaultToolChain() const { return DefaultToolChain; }
-  const ToolChain *getCudaHostToolChain() const { return CudaHostToolChain; }
-  const ToolChain *getCudaDeviceToolChain() const {
-    return CudaDeviceToolChain;
+
+  unsigned isOffloadingHostKind(Action::OffloadKind Kind) const {
+    return ActiveOffloadMask & Kind;
   }
 
-  void setCudaHostToolChain(const ToolChain *HostToolChain) {
-    CudaHostToolChain = HostToolChain;
+  /// Iterator that visits device toolchains of a given kind.
+  typedef const std::multimap<Action::OffloadKind,
+                              const ToolChain *>::const_iterator
+      const_offload_toolchains_iterator;
+  typedef std::pair<const_offload_toolchains_iterator,
+                    const_offload_toolchains_iterator>
+      const_offload_toolchains_range;
+
+  template <Action::OffloadKind Kind>
+  const_offload_toolchains_range getOffloadToolChains() const {
+    return OrderedOffloadingToolchains.equal_range(Kind);
   }
-  void setCudaDeviceToolChain(const ToolChain *DeviceToolChain) {
-    CudaDeviceToolChain = DeviceToolChain;
+
+  /// Return the offload toolchain of kind \a Kind. Exactly one toolchain of
+  /// that kind is expected to be registered; this is checked by assertions.
+  template <Action::OffloadKind Kind>
+  const ToolChain *getSingleOffloadToolChain() const {
+    auto TCs = getOffloadToolChains<Kind>();
+
+    assert(TCs.first != TCs.second &&
+           "No tool chains of the selected kind exist!");
+    assert(std::next(TCs.first) == TCs.second &&
+           "More than one tool chain of this kind exists.");
+    return TCs.first->second;
+  }
+
+  void addOffloadDeviceToolChain(const ToolChain *DeviceToolChain,
+                                 Action::OffloadKind OffloadKind) {
+    assert(OffloadKind != Action::OFK_Host && OffloadKind != Action::OFK_None &&
+           "This is not a device tool chain!");
+
+    // Update the host offload kind to also contain this kind.
+    ActiveOffloadMask |= OffloadKind;
+    OrderedOffloadingToolchains.insert(
+        std::make_pair(OffloadKind, DeviceToolChain));
   }
 
   const llvm::opt::InputArgList &getInputArgs() const { return *Args; }
@@ -208,6 +246,15 @@
 
   /// Return true if we're compiling for diagnostics.
   bool isForDiagnostics() const { return ForDiagnostics; }
+
+  /// Redirect - Redirect output of this compilation. Can only be done once.
+  ///
+  /// \param Redirects - array of pointers to paths. The array
+  /// should have a size of three. The inferior process's
+  /// stdin(0), stdout(1), and stderr(2) will be redirected to the
+  /// corresponding paths. This compilation instance becomes
+  /// the owner of Redirects and will delete the array and StringRefs.
+  void Redirect(const StringRef** Redirects);
 };
 
 } // end namespace driver
diff --git a/include/clang/Driver/Driver.h b/include/clang/Driver/Driver.h
index 3f6e006..2a0bb28 100644
--- a/include/clang/Driver/Driver.h
+++ b/include/clang/Driver/Driver.h
@@ -17,16 +17,14 @@
 #include "clang/Driver/Util.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/Path.h" // FIXME: Kill when CompilationInfo lands.
 
 #include <list>
 #include <map>
-#include <memory>
-#include <set>
 #include <string>
 
 namespace llvm {
+class Triple;
+
 namespace opt {
   class Arg;
   class ArgList;
@@ -83,6 +81,12 @@
     SaveTempsObj
   } SaveTemps;
 
+  enum BitcodeEmbedMode {
+    EmbedNone,
+    EmbedMarker,
+    EmbedBitcode
+  } BitcodeEmbed;
+
   /// LTO mode selected via -f(no-)?lto(=.*)? options.
   LTOKind LTOMode;
 
@@ -126,9 +130,6 @@
   /// If the standard library is used
   bool UseStdLib;
 
-  /// Default target triple.
-  std::string DefaultTargetTriple;
-
   /// Driver title to use with help.
   std::string DriverTitle;
 
@@ -154,6 +155,9 @@
   /// Whether the driver is just the preprocessor.
   bool CCCIsCPP() const { return Mode == CPPMode; }
 
+  /// Whether the driver should follow gcc like behavior.
+  bool CCCIsCC() const { return Mode == GCCMode; }
+
   /// Whether the driver should follow cl.exe like behavior.
   bool IsCLMode() const { return Mode == CLMode; }
 
@@ -177,6 +181,9 @@
   unsigned CCGenDiagnostics : 1;
 
 private:
+  /// Default target triple.
+  std::string DefaultTargetTriple;
+
   /// Name to use when invoking gcc/g++.
   std::string CCCGenericGCCName;
 
@@ -190,7 +197,7 @@
 
 private:
   /// Certain options suppress the 'no input files' warning.
-  bool SuppressMissingInputWarning : 1;
+  unsigned SuppressMissingInputWarning : 1;
 
   std::list<std::string> TempFiles;
   std::list<std::string> ResultFiles;
@@ -242,7 +249,7 @@
   void setCheckInputsExist(bool Value) { CheckInputsExist = Value; }
 
   const std::string &getTitle() { return DriverTitle; }
-  void setTitle(std::string Value) { DriverTitle = Value; }
+  void setTitle(std::string Value) { DriverTitle = std::move(Value); }
 
   /// \brief Get the path to the main clang executable.
   const char *getClangProgramPath() const {
@@ -262,10 +269,18 @@
   bool isSaveTempsEnabled() const { return SaveTemps != SaveTempsNone; }
   bool isSaveTempsObj() const { return SaveTemps == SaveTempsObj; }
 
+  bool embedBitcodeEnabled() const { return BitcodeEmbed == EmbedBitcode; }
+  bool embedBitcodeMarkerOnly() const { return BitcodeEmbed == EmbedMarker; }
+
   /// @}
   /// @name Primary Functionality
   /// @{
 
+  /// CreateOffloadingDeviceToolChains - create all the toolchains required to
+  /// support offloading devices given the programming models specified in the
+  /// current compilation. Also, update the host tool chain kind accordingly.
+  void CreateOffloadingDeviceToolChains(Compilation &C, InputList &Inputs);
+
   /// BuildCompilation - Construct a compilation object for a command
   /// line argument vector.
   ///
@@ -279,7 +294,7 @@
   /// @{
 
   /// ParseDriverMode - Look for and handle the driver mode option in Args.
-  void ParseDriverMode(ArrayRef<const char *> Args);
+  void ParseDriverMode(StringRef ProgramName, ArrayRef<const char *> Args);
 
   /// ParseArgStrings - Parse the given list of strings into an
   /// ArgList.
@@ -299,12 +314,10 @@
   /// given arguments, which are only done for a single architecture.
   ///
   /// \param C - The compilation that is being built.
-  /// \param TC - The default host tool chain.
   /// \param Args - The input arguments.
   /// \param Actions - The list to store the resulting actions onto.
-  void BuildActions(Compilation &C, const ToolChain &TC,
-                    llvm::opt::DerivedArgList &Args, const InputList &Inputs,
-                    ActionList &Actions) const;
+  void BuildActions(Compilation &C, llvm::opt::DerivedArgList &Args,
+                    const InputList &Inputs, ActionList &Actions) const;
 
   /// BuildUniversalActions - Construct the list of actions to perform
   /// for the given arguments, which may require a universal build.
@@ -376,19 +389,19 @@
   /// ConstructAction - Construct the appropriate action to do for
   /// \p Phase on the \p Input, taking in to account arguments
   /// like -fsyntax-only or --analyze.
-  Action *ConstructPhaseAction(Compilation &C, const ToolChain &TC,
-                               const llvm::opt::ArgList &Args, phases::ID Phase,
-                               Action *Input) const;
+  Action *ConstructPhaseAction(Compilation &C, const llvm::opt::ArgList &Args,
+                               phases::ID Phase, Action *Input) const;
 
   /// BuildJobsForAction - Construct the jobs to perform for the action \p A and
   /// return an InputInfo for the result of running \p A.  Will only construct
   /// jobs for a given (Action, ToolChain, BoundArch) tuple once.
-  InputInfo BuildJobsForAction(Compilation &C, const Action *A,
-                               const ToolChain *TC, const char *BoundArch,
-                               bool AtTopLevel, bool MultipleArchs,
-                               const char *LinkingOutput,
-                               std::map<std::pair<const Action *, std::string>,
-                                        InputInfo> &CachedResults) const;
+  InputInfo
+  BuildJobsForAction(Compilation &C, const Action *A, const ToolChain *TC,
+                     const char *BoundArch, bool AtTopLevel, bool MultipleArchs,
+                     const char *LinkingOutput,
+                     std::map<std::pair<const Action *, std::string>, InputInfo>
+                         &CachedResults,
+                     bool BuildForOffloadDevice) const;
 
   /// Returns the default name for linked images (e.g., "a.out").
   const char *getDefaultImageName() const;
@@ -404,12 +417,11 @@
   /// \param BoundArch - The bound architecture. 
   /// \param AtTopLevel - Whether this is a "top-level" action.
   /// \param MultipleArchs - Whether multiple -arch options were supplied.
-  const char *GetNamedOutputPath(Compilation &C,
-                                 const JobAction &JA,
-                                 const char *BaseInput,
-                                 const char *BoundArch,
-                                 bool AtTopLevel,
-                                 bool MultipleArchs) const;
+  /// \param NormalizedTriple - The normalized triple of the relevant target.
+  const char *GetNamedOutputPath(Compilation &C, const JobAction &JA,
+                                 const char *BaseInput, const char *BoundArch,
+                                 bool AtTopLevel, bool MultipleArchs,
+                                 StringRef NormalizedTriple) const;
 
   /// GetTemporaryPath - Return the pathname of a temporary file to use 
   /// as part of compilation; the file will have the given prefix and suffix.
@@ -417,6 +429,9 @@
   /// GCC goes to extra lengths here to be a bit more robust.
   std::string GetTemporaryPath(StringRef Prefix, const char *Suffix) const;
 
+  /// Return the pathname of the pch file in clang-cl mode.
+  std::string GetClPchPath(Compilation &C, StringRef BaseName) const;
+
   /// ShouldUseClangCompiler - Should the clang compiler be used to
   /// handle this action.
   bool ShouldUseClangCompiler(const JobAction &JA) const;
@@ -428,6 +443,10 @@
   LTOKind getLTOMode() const { return LTOMode; }
 
 private:
+  /// Set the driver mode (cl, gcc, etc) from an option string of the form
+  /// --driver-mode=<mode>.
+  void setDriverModeFromOption(StringRef Opt);
+
   /// Parse the \p Args list for LTO options and record the type of LTO
   /// compilation based on which -f(no-)?lto(=.*)? option occurs last.
   void setLTOMode(const llvm::opt::ArgList &Args);
@@ -453,7 +472,8 @@
       const char *BoundArch, bool AtTopLevel, bool MultipleArchs,
       const char *LinkingOutput,
       std::map<std::pair<const Action *, std::string>, InputInfo>
-          &CachedResults) const;
+          &CachedResults,
+      bool BuildForOffloadDevice) const;
 
 public:
   /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and
diff --git a/include/clang/Driver/Job.h b/include/clang/Driver/Job.h
index 263356f..3366fc4 100644
--- a/include/clang/Driver/Job.h
+++ b/include/clang/Driver/Job.h
@@ -138,6 +138,20 @@
   std::unique_ptr<Command> Fallback;
 };
 
+/// Like Command, but always pretends that the wrapped command succeeded.
+class ForceSuccessCommand : public Command {
+public:
+  ForceSuccessCommand(const Action &Source_, const Tool &Creator_,
+                      const char *Executable_, const ArgStringList &Arguments_,
+                      ArrayRef<InputInfo> Inputs);
+
+  void Print(llvm::raw_ostream &OS, const char *Terminator, bool Quote,
+             CrashReportInfo *CrashInfo = nullptr) const override;
+
+  int Execute(const StringRef **Redirects, std::string *ErrMsg,
+              bool *ExecutionFailed) const override;
+};
+
 /// JobList - A sequence of jobs to perform.
 class JobList {
 public:
diff --git a/include/clang/Driver/Makefile b/include/clang/Driver/Makefile
deleted file mode 100644
index 8309330..0000000
--- a/include/clang/Driver/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-CLANG_LEVEL := ../../..
-BUILT_SOURCES = Options.inc
-
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-$(ObjDir)/Options.inc.tmp : Options.td CC1Options.td CLCompatOptions.td $(LLVM_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang Driver Option tables with tblgen"
-	$(Verb) $(LLVMTableGen) -gen-opt-parser-defs -o $(call SYSPATH, $@) $<
diff --git a/include/clang/Driver/Multilib.h b/include/clang/Driver/Multilib.h
index 20bb80d..0419186 100644
--- a/include/clang/Driver/Multilib.h
+++ b/include/clang/Driver/Multilib.h
@@ -12,7 +12,6 @@
 
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/Option/Option.h"
 #include <functional>
 #include <string>
@@ -99,15 +98,15 @@
   typedef multilib_list::iterator iterator;
   typedef multilib_list::const_iterator const_iterator;
 
-  typedef std::function<std::vector<std::string>(
-      StringRef InstallDir, StringRef Triple, const Multilib &M)>
-  IncludeDirsFunc;
+  typedef std::function<std::vector<std::string>(const Multilib &M)>
+      IncludeDirsFunc;
 
   typedef llvm::function_ref<bool(const Multilib &)> FilterCallback;
 
 private:
   multilib_list Multilibs;
   IncludeDirsFunc IncludeCallback;
+  IncludeDirsFunc FilePathsCallback;
 
 public:
   MultilibSet() {}
@@ -159,6 +158,12 @@
   }
   const IncludeDirsFunc &includeDirsCallback() const { return IncludeCallback; }
 
+  MultilibSet &setFilePathsCallback(IncludeDirsFunc F) {
+    FilePathsCallback = std::move(F);
+    return *this;
+  }
+  const IncludeDirsFunc &filePathsCallback() const { return FilePathsCallback; }
+
 private:
   /// Apply the filter to Multilibs and return the subset that remains
   static multilib_list filterCopy(FilterCallback F, const multilib_list &Ms);
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index 735eca2..56fa04c 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -78,6 +78,7 @@
 def i_Group               : OptionGroup<"<i group>">, Group<CompileOnly_Group>;
 def clang_i_Group         : OptionGroup<"<clang i group>">, Group<i_Group>;
 def m_Group               : OptionGroup<"<m group>">, Group<CompileOnly_Group>;
+def opencl_Group          : OptionGroup<"<opencl group>">, Group<CompileOnly_Group>;
 
 // Feature groups - these take command line options that correspond directly to
 // target specific features and can be translated directly from command line
@@ -95,6 +96,8 @@
                            Group<m_Group>;
 def m_wasm_Features_Group : OptionGroup<"<wasm features group>">,
                             Group<m_Group>;
+def m_amdgpu_Features_Group : OptionGroup<"<amdgpu features group>">,
+                              Group<m_Group>;
 
 def m_libc_Group          : OptionGroup<"<m libc group>">, Group<m_Group>;
 def u_Group               : OptionGroup<"<u group>">;
@@ -150,6 +153,9 @@
 def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
   Flags<[CoreOption, DriverOption, HelpHidden]>,
   HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
+def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
+  Flags<[CoreOption, DriverOption, HelpHidden]>,
+  HelpText<"Set the rsp quoting to either 'posix' or 'windows'">;
 def ccc_gcc_name : Separate<["-"], "ccc-gcc-name">, InternalDriverOpt,
   HelpText<"Name for native GCC compiler">,
   MetaVarName<"<gcc-path>">;
@@ -361,6 +367,30 @@
 def bundle__loader : Separate<["-"], "bundle_loader">;
 def bundle : Flag<["-"], "bundle">;
 def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
+def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
+def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. This option is added for compatibility with OpenCL 1.0.">;
+def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Treat double precision floating-point constant as single precision constant.">;
+def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
+def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Generate kernel argument metadata.">;
+def cl_unsafe_math_optimizations : Flag<["-"], "cl-unsafe-math-optimizations">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Allow unsafe floating-point optimizations.  Also implies -cl-no-signed-zeros and -cl-mad-enable.">;
+def cl_fast_relaxed_math : Flag<["-"], "cl-fast-relaxed-math">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.">;
+def cl_mad_enable : Flag<["-"], "cl-mad-enable">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Allow use of less precise MAD computations in the generated binary.">;
+def cl_no_signed_zeros : Flag<["-"], "cl-no-signed-zeros">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Allow floating-point optimizations that ignore the signedness of zero.">;
+def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL language standard to compile for.">;
+def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
+def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
 def client__name : JoinedOrSeparate<["-"], "client_name">;
 def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
 def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
@@ -373,13 +403,29 @@
 def c : Flag<["-"], "c">, Flags<[DriverOption]>,
   HelpText<"Only run preprocess, compile, and assemble steps">;
 def cuda_device_only : Flag<["--"], "cuda-device-only">,
-  HelpText<"Do device-side CUDA compilation only">;
-def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">,
-  Flags<[DriverOption, HelpHidden]>, HelpText<"CUDA GPU architecture">;
+  HelpText<"Compile CUDA code for device only">;
 def cuda_host_only : Flag<["--"], "cuda-host-only">,
-  HelpText<"Do host-side CUDA compilation only">;
+  HelpText<"Compile CUDA code for host only.  Has no effect on non-CUDA "
+           "compilations.">;
+def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
+  HelpText<"Compile CUDA code for both host and device (default).  Has no "
+           "effect on non-CUDA compilations.">;
+def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
+  HelpText<"CUDA GPU architecture (e.g. sm_35).  May be specified more than once.">;
+def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,
+  HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
+def no_cuda_version_check : Flag<["--"], "no-cuda-version-check">,
+  HelpText<"Don't error out if the detected version of the CUDA install is "
+           "too low for the requested CUDA gpu architecture.">;
+def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
 def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
   HelpText<"CUDA installation path">;
+def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
+  Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
+def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
+def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
+  Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
+def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
 def dA : Flag<["-"], "dA">, Group<d_Group>;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
   HelpText<"Print macro definitions in -E mode in addition to normal output">;
@@ -435,6 +481,15 @@
   Flags<[DriverOption, CC1Option]>,
   HelpText<"Disable generation of linker directives for automatic library linking">;
 
+def fembed_bitcode_EQ : Joined<["-"], "fembed-bitcode=">,
+    Group<f_Group>, Flags<[DriverOption, CC1Option]>, MetaVarName<"<option>">,
+    HelpText<"Embed LLVM bitcode (option: off, all, bitcode, marker)">;
+def fembed_bitcode : Flag<["-"], "fembed-bitcode">, Group<f_Group>,
+  Alias<fembed_bitcode_EQ>, AliasArgs<["all"]>,
+  HelpText<"Embed LLVM IR bitcode as data">;
+def fembed_bitcode_marker : Flag<["-"], "fembed-bitcode-marker">,
+  Alias<fembed_bitcode_EQ>, AliasArgs<["marker"]>,
+  HelpText<"Embed placeholder LLVM IR data as a marker">;
 def fgnu_inline_asm : Flag<["-"], "fgnu-inline-asm">, Group<f_Group>, Flags<[DriverOption]>;
 def fno_gnu_inline_asm : Flag<["-"], "fno-gnu-inline-asm">, Group<f_Group>,
   Flags<[DriverOption, CC1Option]>,
@@ -446,15 +501,15 @@
 def fauto_profile_EQ : Joined<["-"], "fauto-profile=">,
     Alias<fprofile_sample_use_EQ>;
 def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
-    Group<f_Group>, Flags<[CC1Option]>,
+    Group<f_Group>, Flags<[DriverOption]>,
     HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overriden by '=' form of option or LLVM_PROFILE_FILE env var)">;
 def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
-    Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<file>">,
+    Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<file>">,
     HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
 def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
     Flags<[DriverOption]>;
 def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
-    Group<f_Group>, Flags<[CC1Option]>,
+    Group<f_Group>, Flags<[DriverOption]>,
     HelpText<"Use instrumentation data for profile-guided optimization">;
 def fcoverage_mapping : Flag<["-"], "fcoverage-mapping">,
     Group<f_Group>, Flags<[CC1Option]>,
@@ -463,7 +518,8 @@
     Group<f_Group>, Flags<[DriverOption]>,
     HelpText<"Disable code coverage analysis">;
 def fprofile_generate : Flag<["-"], "fprofile-generate">,
-    Alias<fprofile_instr_generate>;
+    Group<f_Group>, Flags<[DriverOption]>,
+    HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
 def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
     Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
     HelpText<"Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
@@ -476,7 +532,8 @@
     Group<f_Group>, Flags<[DriverOption]>,
     HelpText<"Disable generation of profile instrumentation.">;
 def fno_profile_generate : Flag<["-"], "fno-profile-generate">,
-    Alias<fno_profile_instr_generate>;
+    Group<f_Group>, Flags<[DriverOption]>,
+    HelpText<"Disable generation of profile instrumentation.">;
 def fno_profile_instr_use : Flag<["-"], "fno-profile-instr-use">,
     Group<f_Group>, Flags<[DriverOption]>,
     HelpText<"Disable using instrumentation data for profile-guided optimization">;
@@ -565,6 +622,8 @@
 def ferror_limit_EQ : Joined<["-"], "ferror-limit=">, Group<f_Group>, Flags<[CoreOption]>;
 def fexceptions : Flag<["-"], "fexceptions">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Enable support for exception handling">;
+def fsjlj_exceptions : Flag<["-"], "fsjlj-exceptions">, Group<f_Group>,
+  Flags<[CC1Option]>, HelpText<"Use SjLj style exceptions">;
 def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">,
     Group<clang_ignored_gcc_optimization_f_Group>;
 def : Flag<["-"], "fexpensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
@@ -576,9 +635,7 @@
 def : Flag<["-"], "fno-extended-identifiers">, Group<f_Group>, Flags<[Unsupported]>;
 def fhosted : Flag<["-"], "fhosted">, Group<f_Group>;
 def ffast_math : Flag<["-"], "ffast-math">, Group<f_Group>, Flags<[CC1Option]>,
-  HelpText<"Enable the *frontend*'s 'fast-math' mode. This has no effect on "
-           "optimizations, but provides a preprocessor macro __FAST_MATH__ the "
-           "same as GCC's -ffast-math flag">;
+  HelpText<"Allow aggressive, lossy floating-point optimizations">;
 def fno_fast_math : Flag<["-"], "fno-fast-math">, Group<f_Group>;
 def fmath_errno : Flag<["-"], "fmath-errno">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Require math functions to indicate errors by setting errno">;
@@ -586,6 +643,9 @@
 def fbracket_depth_EQ : Joined<["-"], "fbracket-depth=">, Group<f_Group>;
 def fsignaling_math : Flag<["-"], "fsignaling-math">, Group<f_Group>;
 def fno_signaling_math : Flag<["-"], "fno-signaling-math">, Group<f_Group>;
+def fjump_tables : Flag<["-"], "fjump-tables">, Group<f_Group>;
+def fno_jump_tables : Flag<["-"], "fno-jump-tables">, Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Do not use jump tables for lowering switches">;
 def fsanitize_EQ : CommaJoined<["-"], "fsanitize=">, Group<f_clang_Group>,
                    Flags<[CC1Option, CoreOption]>, MetaVarName<"<check>">,
                    HelpText<"Turn on runtime checks for various forms of undefined "
@@ -622,6 +682,9 @@
 def fsanitize_address_field_padding : Joined<["-"], "fsanitize-address-field-padding=">,
                                         Group<f_clang_Group>, Flags<[CC1Option]>,
                                         HelpText<"Level of field padding for AddressSanitizer">;
+def fsanitize_address_use_after_scope : Flag<["-"], "fsanitize-address-use-after-scope">,
+                                        Group<f_clang_Group>, Flags<[CC1Option]>,
+                                        HelpText<"Enable use-after-scope detection in AddressSanitizer">;
 def fsanitize_recover : Flag<["-"], "fsanitize-recover">, Group<f_clang_Group>,
                         Flags<[CoreOption]>;
 def fno_sanitize_recover : Flag<["-"], "fno-sanitize-recover">,
@@ -658,6 +721,10 @@
 def fno_sanitize_stats : Flag<["-"], "fno-sanitize-stats">,
                                  Group<f_clang_Group>, Flags<[CC1Option]>,
                                  HelpText<"Disable sanitizer statistics gathering.">;
+def fsanitize_undefined_strip_path_components_EQ : Joined<["-"], "fsanitize-undefined-strip-path-components=">,
+  Group<f_clang_Group>, Flags<[CC1Option]>, MetaVarName<"<number>">,
+  HelpText<"Strip (or keep only, if negative) a given number of path components "
+           "when emitting check metadata.">;
 def funsafe_math_optimizations : Flag<["-"], "funsafe-math-optimizations">,
   Group<f_Group>;
 def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">,
@@ -717,12 +784,30 @@
 def fheinous_gnu_extensions : Flag<["-"], "fheinous-gnu-extensions">, Flags<[CC1Option]>;
 def filelist : Separate<["-"], "filelist">, Flags<[LinkerInput]>;
 def : Flag<["-"], "findirect-virtual-calls">, Alias<fapple_kext>;
-def finline_functions : Flag<["-"], "finline-functions">, Group<clang_ignored_gcc_optimization_f_Group>;
+def finline_functions : Flag<["-"], "finline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
+  HelpText<"Inline suitable functions">;
+def finline_hint_functions: Flag<["-"], "finline-hint-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
+  HelpText<"Inline functions which are (explicitly or implicitly) marked inline">;
 def finline : Flag<["-"], "finline">, Group<clang_ignored_f_Group>;
 def finput_charset_EQ : Joined<["-"], "finput-charset=">, Group<f_Group>;
 def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>;
 def finstrument_functions : Flag<["-"], "finstrument-functions">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Generate calls to instrument function entry and exit">;
+
+def fxray_instrument : Flag<["-"], "fxray-instrument">, Group<f_Group>,
+  Flags<[CC1Option]>,
+  HelpText<"Generate XRay instrumentation sleds on function entry and exit">;
+def fnoxray_instrument : Flag<["-"], "fno-xray-instrument">, Group<f_Group>,
+  Flags<[CC1Option]>;
+
+def fxray_instruction_threshold_EQ :
+  JoinedOrSeparate<["-"], "fxray-instruction-threshold=">,
+  Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Sets the minimum function size to instrument with XRay">;
+def fxray_instruction_threshold_ :
+  JoinedOrSeparate<["-"], "fxray-instruction-threshold">,
+  Group<f_Group>, Flags<[CC1Option]>;
+
 def flat__namespace : Flag<["-"], "flat_namespace">;
 def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group<f_Group>;
 def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<f_Group>;
@@ -763,9 +848,6 @@
 def fmodules_user_build_path : Separate<["-"], "fmodules-user-build-path">, Group<i_Group>,
   Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
   HelpText<"Specify the module user build path">;
-def fprebuilt_module_path : Joined<["-"], "fprebuilt-module-path=">, Group<i_Group>,
-  Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
-  HelpText<"Specify the prebuilt module path">;
 def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">, Group<i_Group>,
   Flags<[CC1Option]>, MetaVarName<"<seconds>">,
   HelpText<"Specify the interval (in seconds) between attempts to prune the module cache">;
@@ -798,9 +880,12 @@
   Flags<[DriverOption, CC1Option]>,
   HelpText<"Implicitly search the file system for module map files.">;
 def fmodule_maps : Flag <["-"], "fmodule-maps">, Alias<fimplicit_module_maps>;
-def fmodule_name : JoinedOrSeparate<["-"], "fmodule-name=">, Group<f_Group>,
+def fmodule_name_EQ : Joined<["-"], "fmodule-name=">, Group<f_Group>,
   Flags<[DriverOption,CC1Option]>, MetaVarName<"<name>">,
   HelpText<"Specify the name of the module to build">;
+def fmodule_name : Separate<["-"], "fmodule-name">, Alias<fmodule_name_EQ>;
+def fmodule_implementation_of : Separate<["-"], "fmodule-implementation-of">,
+  Flags<[CC1Option]>, Alias<fmodule_name_EQ>;
 def fmodule_map_file : Joined<["-"], "fmodule-map-file=">,
   Group<f_Group>, Flags<[DriverOption,CC1Option]>, MetaVarName<"<file>">,
   HelpText<"Load this module map file">;
@@ -840,8 +925,6 @@
   HelpText<"Disable implicit builtin knowledge of functions">;
 def fno_builtin_ : Joined<["-"], "fno-builtin-">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Disable implicit builtin knowledge of a specific function">;
-def fno_math_builtin : Flag<["-"], "fno-math-builtin">, Group<f_Group>, Flags<[CC1Option]>,
-  HelpText<"Disable implicit builtin knowledge of math functions">;
 def fno_caret_diagnostics : Flag<["-"], "fno-caret-diagnostics">, Group<f_Group>,
  Flags<[CC1Option]>;
 def fno_color_diagnostics : Flag<["-"], "fno-color-diagnostics">, Group<f_Group>,
@@ -922,7 +1005,7 @@
 def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
 def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
 def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
-def fno_strict_vtable_pointers: Flag<["-"], "fno-strict-vtable-pointers">, 
+def fno_strict_vtable_pointers: Flag<["-"], "fno-strict-vtable-pointers">,
   Group<f_Group>;
 def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
 def fno_threadsafe_statics : Flag<["-"], "fno-threadsafe-statics">, Group<f_Group>,
@@ -962,7 +1045,7 @@
   HelpText<"Enable Objective-C garbage collection">;
 def fobjc_legacy_dispatch : Flag<["-"], "fobjc-legacy-dispatch">, Group<f_Group>;
 def fobjc_new_property : Flag<["-"], "fobjc-new-property">, Group<clang_ignored_f_Group>;
-def fobjc_infer_related_result_type : Flag<["-"], "fobjc-infer-related-result-type">, 
+def fobjc_infer_related_result_type : Flag<["-"], "fobjc-infer-related-result-type">,
                                       Group<f_Group>;
 def fno_objc_infer_related_result_type : Flag<["-"],
   "fno-objc-infer-related-result-type">, Group<f_Group>,
@@ -985,9 +1068,12 @@
 def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>;
 def fopenmp : Flag<["-"], "fopenmp">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
 def fno_openmp : Flag<["-"], "fno-openmp">, Group<f_Group>, Flags<[NoArgumentUnused]>;
+def fopenmp_version_EQ : Joined<["-"], "fopenmp-version=">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
 def fopenmp_EQ : Joined<["-"], "fopenmp=">, Group<f_Group>;
 def fopenmp_use_tls : Flag<["-"], "fopenmp-use-tls">, Group<f_Group>, Flags<[NoArgumentUnused]>;
 def fnoopenmp_use_tls : Flag<["-"], "fnoopenmp-use-tls">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
+def fopenmp_targets_EQ : CommaJoined<["-"], "fopenmp-targets=">, Flags<[DriverOption, CC1Option]>,
+  HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
 def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
 def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
 def force__cpusubtype__ALL : Flag<["-"], "force_cpusubtype_ALL">;
@@ -1011,8 +1097,15 @@
 def fno_pic : Flag<["-"], "fno-pic">, Group<f_Group>;
 def fpie : Flag<["-"], "fpie">, Group<f_Group>;
 def fno_pie : Flag<["-"], "fno-pie">, Group<f_Group>;
+def fropi : Flag<["-"], "fropi">, Group<f_Group>;
+def fno_ropi : Flag<["-"], "fno-ropi">, Group<f_Group>;
+def frwpi : Flag<["-"], "frwpi">, Group<f_Group>;
+def fno_rwpi : Flag<["-"], "fno-rwpi">, Group<f_Group>;
 def fplugin_EQ : Joined<["-"], "fplugin=">, Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<dsopath>">,
   HelpText<"Load the named plugin (dynamic shared object)">;
+def fpreserve_as_comments : Flag<["-"], "fpreserve-as-comments">, Group<f_Group>;
+def fno_preserve_as_comments : Flag<["-"], "fno-preserve-as-comments">, Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Do not preserve comments in inline assembly">;
 def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group<f_Group>;
 def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group<f_Group>;
 def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>;
@@ -1056,7 +1149,7 @@
 def fstrict_enums : Flag<["-"], "fstrict-enums">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Enable optimizations based on the strict definition of an enum's "
            "value range">;
-def fstrict_vtable_pointers: Flag<["-"], "fstrict-vtable-pointers">, 
+def fstrict_vtable_pointers: Flag<["-"], "fstrict-vtable-pointers">,
   Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Enable optimizations based on the strict rules for overwriting "
              "polymorphic C++ objects">;
@@ -1137,6 +1230,10 @@
 def fvisibility_ms_compat : Flag<["-"], "fvisibility-ms-compat">, Group<f_Group>,
   HelpText<"Give global types 'default' visibility and global functions and "
            "variables 'hidden' visibility by default">;
+def fwhole_program_vtables : Flag<["-"], "fwhole-program-vtables">, Group<f_Group>,
+  Flags<[CC1Option]>,
+  HelpText<"Enables whole-program vtable optimization. Requires -flto">;
+def fno_whole_program_vtables : Flag<["-"], "fno-whole-program-vtables">, Group<f_Group>;
 def fwrapv : Flag<["-"], "fwrapv">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Treat signed integer overflow as two's complement">;
 def fwritable_strings : Flag<["-"], "fwritable-strings">, Group<f_Group>, Flags<[CC1Option]>,
@@ -1169,7 +1266,7 @@
 def g_Flag : Flag<["-"], "g">, Group<g_Group>,
   HelpText<"Generate source-level debug information">;
 def gline_tables_only : Flag<["-"], "gline-tables-only">, Group<gN_Group>,
-  HelpText<"Emit debug line number tables only">;
+  Flags<[CoreOption]>, HelpText<"Emit debug line number tables only">;
 def gmlt : Flag<["-"], "gmlt">, Alias<gline_tables_only>;
 def g0 : Flag<["-"], "g0">, Group<gN_Group>;
 def g1 : Flag<["-"], "g1">, Group<gN_Group>, Alias<gline_tables_only>;
@@ -1247,8 +1344,12 @@
   HelpText<"Add directory to QUOTE include search path">, MetaVarName<"<directory>">;
 def isysroot : JoinedOrSeparate<["-"], "isysroot">, Group<clang_i_Group>, Flags<[CC1Option]>,
   HelpText<"Set the system root directory (usually /)">, MetaVarName<"<dir>">;
-def isystem : JoinedOrSeparate<["-"], "isystem">, Group<clang_i_Group>, Flags<[CC1Option]>,
+def isystem : JoinedOrSeparate<["-"], "isystem">, Group<clang_i_Group>,
+  Flags<[CC1Option]>,
   HelpText<"Add directory to SYSTEM include search path">, MetaVarName<"<directory>">;
+def isystem_after : JoinedOrSeparate<["-"], "isystem-after">,
+  Group<clang_i_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
+  HelpText<"Add directory to end of the SYSTEM include search path">;
 def iwithprefixbefore : JoinedOrSeparate<["-"], "iwithprefixbefore">, Group<clang_i_Group>,
   HelpText<"Set directory to include search path with prefix">, MetaVarName<"<dir>">,
   Flags<[CC1Option]>;
@@ -1278,6 +1379,9 @@
 def m64 : Flag<["-"], "m64">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
 def mx32 : Flag<["-"], "mx32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
 def mabi_EQ : Joined<["-"], "mabi=">, Group<m_Group>;
+def miamcu : Flag<["-"], "miamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>,
+  HelpText<"Use Intel MCU ABI">;
+def mno_iamcu : Flag<["-"], "mno-iamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
 def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group<clang_ignored_m_Group>;
 def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group<clang_ignored_m_Group>;
 def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group<clang_ignored_m_Group>;
@@ -1292,6 +1396,7 @@
 def march_EQ : Joined<["-"], "march=">, Group<m_Group>;
 def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Flags<[DriverOption]>;
 def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>;
+def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group<m_Group>;
 def mconstant_cfstrings : Flag<["-"], "mconstant-cfstrings">, Group<clang_ignored_m_Group>;
 def mconsole : Joined<["-"], "mconsole">, Group<m_Group>, Flags<[DriverOption]>;
 def mwindows : Joined<["-"], "mwindows">, Group<m_Group>, Flags<[DriverOption]>;
@@ -1304,6 +1409,8 @@
 def mieee_fp : Flag<["-"], "mieee-fp">, Group<clang_ignored_m_Group>;
 def minline_all_stringops : Flag<["-"], "minline-all-stringops">, Group<clang_ignored_m_Group>;
 def mno_inline_all_stringops : Flag<["-"], "mno-inline-all-stringops">, Group<clang_ignored_m_Group>;
+def malign_double : Flag<["-"], "malign-double">, Group<m_Group>, Flags<[CC1Option]>,
+  HelpText<"Align doubles to two words in structs (x86 only)">;
 def mfloat_abi_EQ : Joined<["-"], "mfloat-abi=">, Group<m_Group>;
 def mfpmath_EQ : Joined<["-"], "mfpmath=">, Group<m_Group>;
 def mfpu_EQ : Joined<["-"], "mfpu=">, Group<m_Group>;
@@ -1352,6 +1459,8 @@
 def mno_rtd: Flag<["-"], "mno-rtd">, Group<m_Group>;
 def mno_soft_float : Flag<["-"], "mno-soft-float">, Group<m_Group>;
 def mno_stackrealign : Flag<["-"], "mno-stackrealign">, Group<m_Group>;
+def mno_x87 : Flag<["-"], "mno-x87">, Group<m_x86_Features_Group>;
+def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
 def mno_sse2 : Flag<["-"], "mno-sse2">, Group<m_x86_Features_Group>;
 def mno_sse3 : Flag<["-"], "mno-sse3">, Group<m_x86_Features_Group>;
 def mno_sse4a : Flag<["-"], "mno-sse4a">, Group<m_x86_Features_Group>;
@@ -1373,6 +1482,8 @@
 def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
 def mno_avx512bw : Flag<["-"], "mno-avx512bw">, Group<m_x86_Features_Group>;
 def mno_avx512vl : Flag<["-"], "mno-avx512vl">, Group<m_x86_Features_Group>;
+def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
+def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
 def mno_pclmul : Flag<["-"], "mno-pclmul">, Group<m_x86_Features_Group>;
 def mno_lzcnt : Flag<["-"], "mno-lzcnt">, Group<m_x86_Features_Group>;
 def mno_rdrnd : Flag<["-"], "mno-rdrnd">, Group<m_x86_Features_Group>;
@@ -1390,11 +1501,13 @@
 def mno_rdseed : Flag<["-"], "mno-rdseed">, Group<m_x86_Features_Group>;
 def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
 def mno_sha : Flag<["-"], "mno-sha">, Group<m_x86_Features_Group>;
+def mno_cx16 : Flag<["-"], "mno-cx16">, Group<m_x86_Features_Group>;
 def mno_fxsr : Flag<["-"], "mno-fxsr">, Group<m_x86_Features_Group>;
 def mno_xsave : Flag<["-"], "mno-xsave">, Group<m_x86_Features_Group>;
 def mno_xsaveopt : Flag<["-"], "mno-xsaveopt">, Group<m_x86_Features_Group>;
 def mno_xsavec : Flag<["-"], "mno-xsavec">, Group<m_x86_Features_Group>;
 def mno_xsaves : Flag<["-"], "mno-xsaves">, Group<m_x86_Features_Group>;
+def mno_mwaitx : Flag<["-"], "mno-mwaitx">, Group<m_x86_Features_Group>;
 def mno_pku : Flag<["-"], "mno-pku">, Group<m_x86_Features_Group>;
 
 def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_arm_Features_Group>,
@@ -1437,6 +1550,12 @@
 def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
 def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
 
+def mamdgpu_debugger_abi : Joined<["-"], "mamdgpu-debugger-abi=">,
+  Flags<[HelpHidden]>,
+  Group<m_Group>,
+  HelpText<"Generate additional code for specified <version> of debugger ABI (AMDGPU only)">,
+  MetaVarName<"<version>">;
+
 def mvsx : Flag<["-"], "mvsx">, Group<m_ppc_Features_Group>;
 def mno_vsx : Flag<["-"], "mno-vsx">, Group<m_ppc_Features_Group>;
 def mpower8_vector : Flag<["-"], "mpower8-vector">,
@@ -1474,6 +1593,10 @@
 def mno_invariant_function_descriptors :
   Flag<["-"], "mno-invariant-function-descriptors">,
   Group<m_ppc_Features_Group>;
+def mfloat128: Flag<["-"], "mfloat128">,
+    Group<m_ppc_Features_Group>;
+def mno_float128 : Flag<["-"], "mno-float128">,
+    Group<m_ppc_Features_Group>;
 
 def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Enable AltiVec vector initializer syntax">;
@@ -1491,6 +1614,10 @@
 def mzvector : Flag<["-"], "mzvector">, Alias<fzvector>;
 def mno_zvector : Flag<["-"], "mno-zvector">, Alias<fno_zvector>;
 
+def mbackchain : Flag<["-"], "mbackchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
+  HelpText<"Link stack frames through backchain on System Z">;
+def mno_backchain : Flag<["-"], "mno-backchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>;
+
 def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings">, Group<m_Group>;
 def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
 def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>,
@@ -1516,6 +1643,8 @@
 def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>;
 def mrecip : Flag<["-"], "mrecip">, Group<m_Group>;
 def mrecip_EQ : CommaJoined<["-"], "mrecip=">, Group<m_Group>, Flags<[CC1Option]>;
+def mx87 : Flag<["-"], "mx87">, Group<m_x86_Features_Group>;
+def m80387 : Flag<["-"], "m80387">, Alias<mx87>;
 def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;
 def msse3 : Flag<["-"], "msse3">, Group<m_x86_Features_Group>;
 def msse4a : Flag<["-"], "msse4a">, Group<m_x86_Features_Group>;
@@ -1534,6 +1663,8 @@
 def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
 def mavx512bw : Flag<["-"], "mavx512bw">, Group<m_x86_Features_Group>;
 def mavx512vl : Flag<["-"], "mavx512vl">, Group<m_x86_Features_Group>;
+def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
+def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
 def mpclmul : Flag<["-"], "mpclmul">, Group<m_x86_Features_Group>;
 def mlzcnt : Flag<["-"], "mlzcnt">, Group<m_x86_Features_Group>;
 def mrdrnd : Flag<["-"], "mrdrnd">, Group<m_x86_Features_Group>;
@@ -1558,6 +1689,7 @@
 def mxsaveopt : Flag<["-"], "mxsaveopt">, Group<m_x86_Features_Group>;
 def mxsavec : Flag<["-"], "mxsavec">, Group<m_x86_Features_Group>;
 def mxsaves : Flag<["-"], "mxsaves">, Group<m_x86_Features_Group>;
+def mmwaitx : Flag<["-"], "mmwaitx">, Group<m_x86_Features_Group>;
 def mips16 : Flag<["-"], "mips16">, Group<m_Group>;
 def mno_mips16 : Flag<["-"], "mno-mips16">, Group<m_Group>;
 def mmicromips : Flag<["-"], "mmicromips">, Group<m_Group>;
@@ -1569,6 +1701,7 @@
 def mcheck_zero_division : Flag<["-"], "mcheck-zero-division">, Group<m_Group>;
 def mno_check_zero_division : Flag<["-"], "mno-check-zero-division">,
                               Group<m_Group>;
+def mcompact_branches_EQ : Joined<["-"], "mcompact-branches=">, Group<m_Group>;
 def mdsp : Flag<["-"], "mdsp">, Group<m_Group>;
 def mno_dsp : Flag<["-"], "mno-dsp">, Group<m_Group>;
 def mdspr2 : Flag<["-"], "mdspr2">, Group<m_Group>;
@@ -1677,8 +1810,6 @@
 def object : Flag<["-"], "object">;
 def o : JoinedOrSeparate<["-"], "o">, Flags<[DriverOption, RenderAsInput, CC1Option, CC1AsOption]>,
   HelpText<"Write output to <file>">, MetaVarName<"<file>">;
-def omptargets_EQ : CommaJoined<["-"], "omptargets=">, Flags<[DriverOption, CC1Option]>,
-  HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
 def pagezero__size : JoinedOrSeparate<["-"], "pagezero_size">;
 def pass_exit_codes : Flag<["-", "--"], "pass-exit-codes">, Flags<[Unsupported]>;
 def pedantic_errors : Flag<["-", "--"], "pedantic-errors">, Group<pedantic_Group>, Flags<[CC1Option]>;
@@ -1718,7 +1849,7 @@
   HelpText<"Rewrite Legacy Objective-C source to C++">;
 def rdynamic : Flag<["-"], "rdynamic">;
 def resource_dir : Separate<["-"], "resource-dir">,
-  Flags<[DriverOption, CC1Option, HelpHidden]>,
+  Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>,
   HelpText<"The directory which holds the compiler resource files">;
 def resource_dir_EQ : Joined<["-"], "resource-dir=">, Flags<[DriverOption]>,
   Alias<resource_dir>;
diff --git a/include/clang/Driver/SanitizerArgs.h b/include/clang/Driver/SanitizerArgs.h
index 072ddee..7b293e0 100644
--- a/include/clang/Driver/SanitizerArgs.h
+++ b/include/clang/Driver/SanitizerArgs.h
@@ -34,6 +34,7 @@
   bool CfiCrossDso = false;
   int AsanFieldPadding = 0;
   bool AsanSharedRuntime = false;
+  bool AsanUseAfterScope = false;
   bool LinkCXXRuntimes = false;
   bool NeedPIE = false;
   bool Stats = false;
@@ -58,6 +59,9 @@
   bool needsCfiRt() const;
   bool needsCfiDiagRt() const;
   bool needsStatsRt() const { return Stats; }
+  bool needsEsanRt() const {
+    return Sanitizers.hasOneOf(SanitizerKind::Efficiency);
+  }
 
   bool requiresPIE() const;
   bool needsUnwindTables() const;
diff --git a/include/clang/Driver/ToolChain.h b/include/clang/Driver/ToolChain.h
index 1778680..4ccea9d 100644
--- a/include/clang/Driver/ToolChain.h
+++ b/include/clang/Driver/ToolChain.h
@@ -11,13 +11,13 @@
 #define LLVM_CLANG_DRIVER_TOOLCHAIN_H
 
 #include "clang/Basic/Sanitizers.h"
+#include "clang/Basic/VersionTuple.h"
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Multilib.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Target/TargetOptions.h"
 #include <memory>
 #include <string>
@@ -40,6 +40,7 @@
   class Compilation;
   class Driver;
   class JobAction;
+  class RegisterEffectiveTriple;
   class SanitizerArgs;
   class Tool;
 
@@ -91,6 +92,14 @@
 
   mutable std::unique_ptr<SanitizerArgs> SanitizerArguments;
 
+  /// The effective clang triple for the current Job.
+  mutable llvm::Triple EffectiveTriple;
+
+  /// Set the toolchain's effective clang triple.
+  void setEffectiveTriple(llvm::Triple ET) const { EffectiveTriple = ET; }
+
+  friend class RegisterEffectiveTriple;
+
 protected:
   MultilibSet Multilibs;
   const char *DefaultLinker = "ld";
@@ -141,6 +150,12 @@
     return Triple.getTriple();
   }
 
+  /// Get the toolchain's effective clang triple.
+  const llvm::Triple &getEffectiveTriple() const {
+    assert(!EffectiveTriple.getTriple().empty() && "No effective triple");
+    return EffectiveTriple;
+  }
+
   path_list &getFilePaths() { return FilePaths; }
   const path_list &getFilePaths() const { return FilePaths; }
 
@@ -256,6 +271,10 @@
     return ToolChain::RLT_Libgcc;
   }
 
+  virtual CXXStdlibType GetDefaultCXXStdlibType() const {
+    return ToolChain::CST_Libstdcxx;
+  }
+
   virtual std::string getCompilerRT(const llvm::opt::ArgList &Args,
                                     StringRef Component,
                                     bool Shared = false) const;
@@ -315,6 +334,11 @@
     return false;
   }
 
+  /// SupportsEmbeddedBitcode - Does this tool chain support embedded bitcode.
+  virtual bool SupportsEmbeddedBitcode() const {
+    return false;
+  }
+
   /// getThreadModel() - Which thread model does this target use?
   virtual std::string getThreadModel() const { return "posix"; }
 
@@ -408,8 +432,32 @@
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
 
+  /// \brief Add arguments to use MCU GCC toolchain includes.
+  virtual void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                                   llvm::opt::ArgStringList &CC1Args) const;
+
   /// \brief Return sanitizers which are available in this toolchain.
   virtual SanitizerMask getSupportedSanitizers() const;
+
+  /// \brief Return sanitizers which are enabled by default.
+  virtual SanitizerMask getDefaultSanitizers() const { return 0; }
+
+  /// \brief On Windows, returns the version of cl.exe.  On other platforms,
+  /// returns an empty VersionTuple.
+  virtual VersionTuple getMSVCVersionFromExe() const { return VersionTuple(); }
+};
+
+/// Set a ToolChain's effective triple. Reset it when the registration object
+/// is destroyed.
+class RegisterEffectiveTriple {
+  const ToolChain &TC;
+
+public:
+  RegisterEffectiveTriple(const ToolChain &TC, llvm::Triple T) : TC(TC) {
+    TC.setEffectiveTriple(T);
+  }
+
+  ~RegisterEffectiveTriple() { TC.setEffectiveTriple(llvm::Triple()); }
 };
 
 } // end namespace driver
diff --git a/include/clang/Driver/Types.def b/include/clang/Driver/Types.def
index baaa411..f2ff194 100644
--- a/include/clang/Driver/Types.def
+++ b/include/clang/Driver/Types.def
@@ -53,6 +53,7 @@
 TYPE("objective-c++-cpp-output", PP_ObjCXX,    INVALID,         "mii",   "u")
 TYPE("objc++-cpp-output",        PP_ObjCXX_Alias, INVALID,      "mii",   "u")
 TYPE("objective-c++",            ObjCXX,       PP_ObjCXX,       "mm",    "u")
+TYPE("renderscript",             RenderScript, PP_C,            "rs",    "u")
 
 // C family input files to precompile.
 TYPE("c-header-cpp-output",      PP_CHeader,   INVALID,         "i",     "p")
diff --git a/include/clang/Edit/Rewriters.h b/include/clang/Edit/Rewriters.h
index 5e3425f..980ed1d 100644
--- a/include/clang/Edit/Rewriters.h
+++ b/include/clang/Edit/Rewriters.h
@@ -9,7 +9,6 @@
 
 #ifndef LLVM_CLANG_EDIT_REWRITERS_H
 #define LLVM_CLANG_EDIT_REWRITERS_H
-#include "llvm/ADT/SmallVector.h"
 
 namespace clang {
   class ObjCMessageExpr;
diff --git a/include/clang/Format/Format.h b/include/clang/Format/Format.h
index 9f3f032..8589891 100644
--- a/include/clang/Format/Format.h
+++ b/include/clang/Format/Format.h
@@ -26,6 +26,10 @@
 class SourceManager;
 class DiagnosticConsumer;
 
+namespace vfs {
+class FileSystem;
+}
+
 namespace format {
 
 enum class ParseError { Success = 0, Error, Unsuitable };
@@ -37,10 +41,10 @@
 const std::error_category &getParseCategory();
 std::error_code make_error_code(ParseError e);
 
-/// \brief The \c FormatStyle is used to configure the formatting to follow
+/// \brief The ``FormatStyle`` is used to configure the formatting to follow
 /// specific guidelines.
 struct FormatStyle {
-  /// \brief The extra indent or outdent of access modifiers, e.g. \c public:.
+  /// \brief The extra indent or outdent of access modifiers, e.g. ``public:``.
   int AccessModifierOffset;
 
   /// \brief Different styles for aligning after open brackets.
@@ -51,7 +55,7 @@
     ///                    argument2);
     /// \endcode
     BAS_Align,
-    /// \brief Don't align, instead use \c ContinuationIndentWidth, e.g.:
+    /// \brief Don't align, instead use ``ContinuationIndentWidth``, e.g.:
     /// \code
     ///   someLongFunction(argument1,
     ///       argument2);
@@ -66,13 +70,13 @@
     BAS_AlwaysBreak,
   };
 
-  /// \brief If \c true, horizontally aligns arguments after an open bracket.
+  /// \brief If ``true``, horizontally aligns arguments after an open bracket.
   ///
   /// This applies to round brackets (parentheses), angle brackets and square
   /// brackets.
   BracketAlignmentStyle AlignAfterOpenBracket;
 
-  /// \brief If \c true, aligns consecutive assignments.
+  /// \brief If ``true``, aligns consecutive assignments.
   ///
   /// This will align the assignment operators of consecutive lines. This
   /// will result in formattings like
@@ -83,7 +87,7 @@
   /// \endcode
   bool AlignConsecutiveAssignments;
 
-  /// \brief If \c true, aligns consecutive declarations.
+  /// \brief If ``true``, aligns consecutive declarations.
   ///
   /// This will align the declaration names of consecutive lines. This
   /// will result in formattings like
@@ -94,11 +98,11 @@
   /// \endcode
   bool AlignConsecutiveDeclarations;
 
-  /// \brief If \c true, aligns escaped newlines as far left as possible.
+  /// \brief If ``true``, aligns escaped newlines as far left as possible.
   /// Otherwise puts them into the right-most column.
   bool AlignEscapedNewlinesLeft;
 
-  /// \brief If \c true, horizontally align operands of binary and ternary
+  /// \brief If ``true``, horizontally align operands of binary and ternary
   /// expressions.
   ///
   /// Specifically, this aligns operands of a single expression that needs to be
@@ -109,19 +113,19 @@
   /// \endcode
   bool AlignOperands;
 
-  /// \brief If \c true, aligns trailing comments.
+  /// \brief If ``true``, aligns trailing comments.
   bool AlignTrailingComments;
 
   /// \brief Allow putting all parameters of a function declaration onto
-  /// the next line even if \c BinPackParameters is \c false.
+  /// the next line even if ``BinPackParameters`` is ``false``.
   bool AllowAllParametersOfDeclarationOnNextLine;
 
   /// \brief Allows contracting simple braced statements to a single line.
   ///
-  /// E.g., this allows <tt>if (a) { return; }</tt> to be put on a single line.
+  /// E.g., this allows ``if (a) { return; }`` to be put on a single line.
   bool AllowShortBlocksOnASingleLine;
 
-  /// \brief If \c true, short case labels will be contracted to a single line.
+  /// \brief If ``true``, short case labels will be contracted to a single line.
   bool AllowShortCaseLabelsOnASingleLine;
 
   /// \brief Different styles for merging short functions containing at most one
@@ -137,22 +141,21 @@
     SFS_All,
   };
 
-  /// \brief Dependent on the value, <tt>int f() { return 0; }</tt> can be put
-  /// on a single line.
+  /// \brief Dependent on the value, ``int f() { return 0; }`` can be put on a
+  /// single line.
   ShortFunctionStyle AllowShortFunctionsOnASingleLine;
 
-  /// \brief If \c true, <tt>if (a) return;</tt> can be put on a single
-  /// line.
+  /// \brief If ``true``, ``if (a) return;`` can be put on a single line.
   bool AllowShortIfStatementsOnASingleLine;
 
-  /// \brief If \c true, <tt>while (true) continue;</tt> can be put on a
-  /// single line.
+  /// \brief If ``true``, ``while (true) continue;`` can be put on a single
+  /// line.
   bool AllowShortLoopsOnASingleLine;
 
   /// \brief Different ways to break after the function definition return type.
   enum DefinitionReturnTypeBreakingStyle {
     /// Break after return type automatically.
-    /// \c PenaltyReturnTypeOnItsOwnLine is taken into account.
+    /// ``PenaltyReturnTypeOnItsOwnLine`` is taken into account.
     DRTBS_None,
     /// Always break after the return type.
     DRTBS_All,
@@ -164,7 +167,7 @@
   /// declaration return type.
   enum ReturnTypeBreakingStyle {
     /// Break after return type automatically.
-    /// \c PenaltyReturnTypeOnItsOwnLine is taken into account.
+    /// ``PenaltyReturnTypeOnItsOwnLine`` is taken into account.
     RTBS_None,
     /// Always break after the return type.
     RTBS_All,
@@ -183,23 +186,23 @@
   /// \brief The function declaration return type breaking style to use.
   ReturnTypeBreakingStyle AlwaysBreakAfterReturnType;
 
-  /// \brief If \c true, always break before multiline string literals.
+  /// \brief If ``true``, always break before multiline string literals.
   ///
   /// This flag is mean to make cases where there are multiple multiline strings
   /// in a file look more consistent. Thus, it will only take effect if wrapping
   /// the string at that point leads to it being indented
-  /// \c ContinuationIndentWidth spaces from the start of the line.
+  /// ``ContinuationIndentWidth`` spaces from the start of the line.
   bool AlwaysBreakBeforeMultilineStrings;
 
-  /// \brief If \c true, always break after the <tt>template<...></tt> of a
-  /// template declaration.
+  /// \brief If ``true``, always break after the ``template<...>`` of a template
+  /// declaration.
   bool AlwaysBreakTemplateDeclarations;
 
-  /// \brief If \c false, a function call's arguments will either be all on the
+  /// \brief If ``false``, a function call's arguments will either be all on the
   /// same line or will have one line each.
   bool BinPackArguments;
 
-  /// \brief If \c false, a function declaration's or function definition's
+  /// \brief If ``false``, a function declaration's or function definition's
   /// parameters will either all be on the same line or will have one line each.
   bool BinPackParameters;
 
@@ -220,13 +223,14 @@
   enum BraceBreakingStyle {
     /// Always attach braces to surrounding context.
     BS_Attach,
-    /// Like \c Attach, but break before braces on function, namespace and
+    /// Like ``Attach``, but break before braces on function, namespace and
     /// class definitions.
     BS_Linux,
     /// Like ``Attach``, but break before braces on enum, function, and record
     /// definitions.
     BS_Mozilla,
-    /// Like \c Attach, but break before function definitions, 'catch', and 'else'.
+    /// Like ``Attach``, but break before function definitions, ``catch``, and
+    /// ``else``.
     BS_Stroustrup,
     /// Always break before braces.
     BS_Allman,
@@ -236,7 +240,7 @@
     BS_GNU,
     /// Like ``Attach``, but break before functions.
     BS_WebKit,
-    /// Configure each individual brace in \c BraceWrapping.
+    /// Configure each individual brace in ``BraceWrapping``.
     BS_Custom
   };
 
@@ -247,7 +251,7 @@
   struct BraceWrappingFlags {
     /// \brief Wrap class definitions.
     bool AfterClass;
-    /// \brief Wrap control statements (if/for/while/switch/..).
+    /// \brief Wrap control statements (``if``/``for``/``while``/``switch``/..).
     bool AfterControlStatement;
     /// \brief Wrap enum definitions.
     bool AfterEnum;
@@ -255,15 +259,15 @@
     bool AfterFunction;
     /// \brief Wrap namespace definitions.
     bool AfterNamespace;
-    /// \brief Wrap ObjC definitions (@autoreleasepool, interfaces, ..).
+    /// \brief Wrap ObjC definitions (``@autoreleasepool``, interfaces, ..).
     bool AfterObjCDeclaration;
     /// \brief Wrap struct definitions.
     bool AfterStruct;
     /// \brief Wrap union definitions.
     bool AfterUnion;
-    /// \brief Wrap before \c catch.
+    /// \brief Wrap before ``catch``.
     bool BeforeCatch;
-    /// \brief Wrap before \c else.
+    /// \brief Wrap before ``else``.
     bool BeforeElse;
     /// \brief Indent the wrapped braces themselves.
     bool IndentBraces;
@@ -271,11 +275,11 @@
 
   /// \brief Control of individual brace wrapping cases.
   ///
-  /// If \c BreakBeforeBraces is set to \c custom, use this to specify how each
-  /// individual brace case should be handled. Otherwise, this is ignored.
+  /// If ``BreakBeforeBraces`` is set to ``BS_Custom``, use this to specify how
+  /// each individual brace case should be handled. Otherwise, this is ignored.
   BraceWrappingFlags BraceWrapping;
 
-  /// \brief If \c true, ternary operators will be placed after line breaks.
+  /// \brief If ``true``, ternary operators will be placed after line breaks.
   bool BreakBeforeTernaryOperators;
 
   /// \brief Always break constructor initializers before commas and align
@@ -290,7 +294,7 @@
 
   /// \brief The column limit.
   ///
-  /// A column limit of \c 0 means that there is no column limit. In this case,
+  /// A column limit of ``0`` means that there is no column limit. In this case,
   /// clang-format will respect the input's line breaking decisions within
   /// statements unless they contradict other rules.
   unsigned ColumnLimit;
@@ -310,7 +314,7 @@
   /// \brief Indent width for line continuations.
   unsigned ContinuationIndentWidth;
 
-  /// \brief If \c true, format braced lists as best suited for C++11 braced
+  /// \brief If ``true``, format braced lists as best suited for C++11 braced
   /// lists.
   ///
   /// Important differences:
@@ -320,19 +324,20 @@
   ///
   /// Fundamentally, C++11 braced lists are formatted exactly like function
   /// calls would be formatted in their place. If the braced list follows a name
-  /// (e.g. a type or variable name), clang-format formats as if the \c {} were
+  /// (e.g. a type or variable name), clang-format formats as if the ``{}`` were
   /// the parentheses of a function call with that name. If there is no name,
   /// a zero-length name is assumed.
   bool Cpp11BracedListStyle;
 
-  /// \brief If \c true, analyze the formatted file for the most common
-  /// alignment of & and *. \c PointerAlignment is then used only as fallback.
+  /// \brief If ``true``, analyze the formatted file for the most common
+  /// alignment of ``&`` and ``*``. ``PointerAlignment`` is then used only as
+  /// fallback.
   bool DerivePointerAlignment;
 
   /// \brief Disables formatting completely.
   bool DisableFormat;
 
-  /// \brief If \c true, clang-format detects whether function calls and
+  /// \brief If ``true``, clang-format detects whether function calls and
   /// definitions are formatted with one parameter per line.
   ///
   /// Each call can be bin-packed, one-per-line or inconclusive. If it is
@@ -354,14 +359,14 @@
   /// \endcode
   ///
   /// In the .clang-format configuration file, this can be configured like:
-  /// \code
+  /// \code{.yaml}
   ///   ForEachMacros: ['RANGES_FOR', 'FOREACH']
   /// \endcode
   ///
   /// For example: BOOST_FOREACH.
   std::vector<std::string> ForEachMacros;
 
-  /// \brief See documentation of \c IncludeCategories.
+  /// \brief See documentation of ``IncludeCategories``.
   struct IncludeCategory {
     /// \brief The regular expression that this category matches.
     std::string Regex;
@@ -372,24 +377,24 @@
     }
   };
 
-  /// \brief Regular expressions denoting the different #include categories used
-  /// for ordering #includes.
+  /// \brief Regular expressions denoting the different ``#include`` categories
+  /// used for ordering ``#includes``.
   ///
   /// These regular expressions are matched against the filename of an include
   /// (including the <> or "") in order. The value belonging to the first
-  /// matching regular expression is assigned and #includes are sorted first
+  /// matching regular expression is assigned and ``#includes`` are sorted first
   /// according to increasing category number and then alphabetically within
   /// each category.
   ///
   /// If none of the regular expressions match, INT_MAX is assigned as
   /// category. The main header for a source file automatically gets category 0.
-  /// so that it is generally kept at the beginning of the #includes
+  /// so that it is generally kept at the beginning of the ``#includes``
   /// (http://llvm.org/docs/CodingStandards.html#include-style). However, you
   /// can also assign negative priorities if you have certain headers that
   /// always need to be first.
   ///
   /// To configure this in the .clang-format file, use:
-  /// \code
+  /// \code{.yaml}
   ///   IncludeCategories:
   ///     - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
   ///       Priority:        2
@@ -400,9 +405,22 @@
   /// \endcode
   std::vector<IncludeCategory> IncludeCategories;
 
+  /// \brief Specify a regular expression of suffixes that are allowed in the
+  /// file-to-main-include mapping.
+  ///
+  /// When guessing whether a #include is the "main" include (to assign
+  /// category 0, see above), use this regex of allowed suffixes to the header
+  /// stem. A partial match is done, so that:
+  /// - "" means "arbitrary suffix"
+  /// - "$" means "no suffix"
+  ///
+  /// For example, if configured to "(_test)?$", then a header a.h would be seen
+  /// as the "main" include in both a.cc and a_test.cc.
+  std::string IncludeIsMainRegex;
+
   /// \brief Indent case labels one level from the switch statement.
   ///
-  /// When \c false, use the same indentation level as for the switch statement.
+  /// When ``false``, use the same indentation level as for the switch statement.
   /// Switch statement body is always indented one level more than case labels.
   bool IndentCaseLabels;
 
@@ -413,12 +431,31 @@
   /// type.
   bool IndentWrappedFunctionNames;
 
+  /// \brief Quotation styles for JavaScript strings. Does not affect template
+  /// strings.
+  enum JavaScriptQuoteStyle {
+    /// Leave string quotes as they are.
+    JSQS_Leave,
+    /// Always use single quotes.
+    JSQS_Single,
+    /// Always use double quotes.
+    JSQS_Double
+  };
+
+  /// \brief The JavaScriptQuoteStyle to use for JavaScript strings.
+  JavaScriptQuoteStyle JavaScriptQuotes;
+
+  /// \brief Whether to wrap JavaScript import/export statements.
+  bool JavaScriptWrapImports;
+
   /// \brief If true, empty lines at the start of blocks are kept.
   bool KeepEmptyLinesAtTheStartOfBlocks;
 
-  /// \brief Supported languages. When stored in a configuration file, specifies
-  /// the language, that the configuration targets. When passed to the
-  /// reformat() function, enables syntax features specific to the language.
+  /// \brief Supported languages.
+  ///
+  /// When stored in a configuration file, specifies the language that the
+  /// configuration targets. When passed to the ``reformat()`` function, enables
+  /// syntax features specific to the language.
   enum LanguageKind {
     /// Do not use.
     LK_None,
@@ -463,21 +500,21 @@
   /// \brief The number of characters to use for indentation of ObjC blocks.
   unsigned ObjCBlockIndentWidth;
 
-  /// \brief Add a space after \c @property in Objective-C, i.e. use
-  /// <tt>\@property (readonly)</tt> instead of <tt>\@property(readonly)</tt>.
+  /// \brief Add a space after ``@property`` in Objective-C, i.e. use
+  /// ``@property (readonly)`` instead of ``@property(readonly)``.
   bool ObjCSpaceAfterProperty;
 
   /// \brief Add a space in front of an Objective-C protocol list, i.e. use
-  /// <tt>Foo <Protocol></tt> instead of \c Foo<Protocol>.
+  /// ``Foo <Protocol>`` instead of ``Foo<Protocol>``.
   bool ObjCSpaceBeforeProtocolList;
 
-  /// \brief The penalty for breaking a function call after "call(".
+  /// \brief The penalty for breaking a function call after ``call(``.
   unsigned PenaltyBreakBeforeFirstCallParameter;
 
   /// \brief The penalty for each line break introduced inside a comment.
   unsigned PenaltyBreakComment;
 
-  /// \brief The penalty for breaking before the first \c <<.
+  /// \brief The penalty for breaking before the first ``<<``.
   unsigned PenaltyBreakFirstLessLess;
 
   /// \brief The penalty for each line break introduced inside a string literal.
@@ -490,7 +527,7 @@
   /// line.
   unsigned PenaltyReturnTypeOnItsOwnLine;
 
-  /// \brief The & and * alignment style.
+  /// \brief The ``&`` and ``*`` alignment style.
   enum PointerAlignmentStyle {
     /// Align pointer to the left.
     PAS_Left,
@@ -503,16 +540,19 @@
   /// \brief Pointer and reference alignment style.
   PointerAlignmentStyle PointerAlignment;
 
-  /// \brief If true, clang-format will attempt to re-flow comments.
+  /// \brief If ``true``, clang-format will attempt to re-flow comments.
   bool ReflowComments;
 
-  /// \brief If true, clang-format will sort #includes.
+  /// \brief If ``true``, clang-format will sort ``#includes``.
   bool SortIncludes;
 
-  /// \brief If \c true, a space may be inserted after C style casts.
+  /// \brief If ``true``, a space may be inserted after C style casts.
   bool SpaceAfterCStyleCast;
 
-  /// \brief If \c false, spaces will be removed before assignment operators.
+  /// \brief If ``true``, a space will be inserted after the 'template' keyword.
+  bool SpaceAfterTemplateKeyword;
+
+  /// \brief If ``false``, spaces will be removed before assignment operators.
   bool SpaceBeforeAssignmentOperators;
 
   /// \brief Different ways to put a space before opening parentheses.
@@ -520,7 +560,7 @@
     /// Never put a space before opening parentheses.
     SBPO_Never,
     /// Put a space before opening parentheses only after control statement
-    /// keywords (<tt>for/if/while...</tt>).
+    /// keywords (``for/if/while...``).
     SBPO_ControlStatements,
     /// Always put a space before opening parentheses, except when it's
     /// prohibited by the syntax rules (in function-like macro definitions) or
@@ -532,46 +572,46 @@
   /// \brief Defines in which cases to put a space before opening parentheses.
   SpaceBeforeParensOptions SpaceBeforeParens;
 
-  /// \brief If \c true, spaces may be inserted into '()'.
+  /// \brief If ``true``, spaces may be inserted into ``()``.
   bool SpaceInEmptyParentheses;
 
   /// \brief The number of spaces before trailing line comments
-  /// (\c // - comments).
+  /// (``//`` - comments).
   ///
-  /// This does not affect trailing block comments (\c /**/ - comments) as those
-  /// commonly have different usage patterns and a number of special cases.
+  /// This does not affect trailing block comments (``/*`` - comments) as
+  /// those commonly have different usage patterns and a number of special
+  /// cases.
   unsigned SpacesBeforeTrailingComments;
 
-  /// \brief If \c true, spaces will be inserted after '<' and before '>' in
-  /// template argument lists
+  /// \brief If ``true``, spaces will be inserted after ``<`` and before ``>``
+  /// in template argument lists.
   bool SpacesInAngles;
 
-  /// \brief If \c true, spaces are inserted inside container literals (e.g.
+  /// \brief If ``true``, spaces are inserted inside container literals (e.g.
   /// ObjC and Javascript array and dict literals).
   bool SpacesInContainerLiterals;
 
-  /// \brief If \c true, spaces may be inserted into C style casts.
+  /// \brief If ``true``, spaces may be inserted into C style casts.
   bool SpacesInCStyleCastParentheses;
 
-  /// \brief If \c true, spaces will be inserted after '(' and before ')'.
+  /// \brief If ``true``, spaces will be inserted after ``(`` and before ``)``.
   bool SpacesInParentheses;
 
-  /// \brief If \c true, spaces will be inserted after '[' and before ']'.
+  /// \brief If ``true``, spaces will be inserted after ``[`` and before ``]``.
   bool SpacesInSquareBrackets;
 
   /// \brief Supported language standards.
   enum LanguageStandard {
     /// Use C++03-compatible syntax.
     LS_Cpp03,
-    /// Use features of C++11 (e.g. \c A<A<int>> instead of
-    /// <tt>A<A<int> ></tt>).
+    /// Use features of C++11 (e.g. ``A<A<int>>`` instead of ``A<A<int> >``).
     LS_Cpp11,
     /// Automatic detection based on the input.
     LS_Auto
   };
 
-  /// \brief Format compatible with this standard, e.g. use
-  /// <tt>A<A<int> ></tt> instead of \c A<A<int>> for LS_Cpp03.
+  /// \brief Format compatible with this standard, e.g. use ``A<A<int> >``
+  /// instead of ``A<A<int>>`` for ``LS_Cpp03``.
   LanguageStandard Standard;
 
   /// \brief The number of columns used for tab stops.
@@ -583,6 +623,8 @@
     UT_Never,
     /// Use tabs only for indentation.
     UT_ForIndentation,
+    /// Use tabs only for line continuation and indentation.
+    UT_ForContinuationAndIndentation,
     /// Use tabs whenever we need to fill whitespace that spans at least from
     /// one tab stop to the next one.
     UT_Always
@@ -639,6 +681,8 @@
            IndentCaseLabels == R.IndentCaseLabels &&
            IndentWidth == R.IndentWidth && Language == R.Language &&
            IndentWrappedFunctionNames == R.IndentWrappedFunctionNames &&
+           JavaScriptQuotes == R.JavaScriptQuotes &&
+           JavaScriptWrapImports == R.JavaScriptWrapImports &&
            KeepEmptyLinesAtTheStartOfBlocks ==
                R.KeepEmptyLinesAtTheStartOfBlocks &&
            MacroBlockBegin == R.MacroBlockBegin &&
@@ -657,6 +701,7 @@
            PenaltyReturnTypeOnItsOwnLine == R.PenaltyReturnTypeOnItsOwnLine &&
            PointerAlignment == R.PointerAlignment &&
            SpaceAfterCStyleCast == R.SpaceAfterCStyleCast &&
+           SpaceAfterTemplateKeyword == R.SpaceAfterTemplateKeyword &&
            SpaceBeforeAssignmentOperators == R.SpaceBeforeAssignmentOperators &&
            SpaceBeforeParens == R.SpaceBeforeParens &&
            SpaceInEmptyParentheses == R.SpaceInEmptyParentheses &&
@@ -705,39 +750,55 @@
 /// Currently supported names: LLVM, Google, Chromium, Mozilla. Names are
 /// compared case-insensitively.
 ///
-/// Returns \c true if the Style has been set.
+/// Returns ``true`` if the Style has been set.
 bool getPredefinedStyle(StringRef Name, FormatStyle::LanguageKind Language,
                         FormatStyle *Style);
 
 /// \brief Parse configuration from YAML-formatted text.
 ///
-/// Style->Language is used to get the base style, if the \c BasedOnStyle
+/// Style->Language is used to get the base style, if the ``BasedOnStyle``
 /// option is present.
 ///
-/// When \c BasedOnStyle is not present, options not present in the YAML
+/// When ``BasedOnStyle`` is not present, options not present in the YAML
 /// document, are retained in \p Style.
 std::error_code parseConfiguration(StringRef Text, FormatStyle *Style);
 
 /// \brief Gets configuration in a YAML string.
 std::string configurationAsText(const FormatStyle &Style);
 
-/// \brief Returns the replacements necessary to sort all #include blocks that
-/// are affected by 'Ranges'.
+/// \brief Returns the replacements necessary to sort all ``#include`` blocks
+/// that are affected by ``Ranges``.
 tooling::Replacements sortIncludes(const FormatStyle &Style, StringRef Code,
                                    ArrayRef<tooling::Range> Ranges,
                                    StringRef FileName,
                                    unsigned *Cursor = nullptr);
 
+/// \brief Returns the replacements corresponding to applying and formatting
+/// \p Replaces on success; otherwise, return an llvm::Error carrying
+/// llvm::StringError.
+llvm::Expected<tooling::Replacements>
+formatReplacements(StringRef Code, const tooling::Replacements &Replaces,
+                   const FormatStyle &Style);
+
+/// \brief Returns the replacements corresponding to applying \p Replaces and
+/// cleaning up the code after that on success; otherwise, return an llvm::Error
+/// carrying llvm::StringError.
+/// This also inserts a C++ #include directive into the correct block if the
+/// replacement corresponding to the header insertion has offset UINT_MAX.
+llvm::Expected<tooling::Replacements>
+cleanupAroundReplacements(StringRef Code, const tooling::Replacements &Replaces,
+                          const FormatStyle &Style);
+
 /// \brief Reformats the given \p Ranges in the file \p ID.
 ///
 /// Each range is extended on either end to its next bigger logic unit, i.e.
 /// everything that might influence its formatting or might be influenced by its
 /// formatting.
 ///
-/// Returns the \c Replacements necessary to make all \p Ranges comply with
+/// Returns the ``Replacements`` necessary to make all \p Ranges comply with
 /// \p Style.
 ///
-/// If \c IncompleteFormat is non-null, its value will be set to true if any
+/// If ``IncompleteFormat`` is non-null, its value will be set to true if any
 /// of the affected ranges were not formatted due to a non-recoverable syntax
 /// error.
 tooling::Replacements reformat(const FormatStyle &Style,
@@ -753,37 +814,71 @@
                                StringRef FileName = "<stdin>",
                                bool *IncompleteFormat = nullptr);
 
-/// \brief Returns the \c LangOpts that the formatter expects you to set.
+/// \brief Clean up any erroneous/redundant code in the given \p Ranges in the
+/// file \p ID.
+///
+/// Returns the ``Replacements`` that clean up all \p Ranges in the file \p ID.
+tooling::Replacements cleanup(const FormatStyle &Style,
+                              SourceManager &SourceMgr, FileID ID,
+                              ArrayRef<CharSourceRange> Ranges);
+
+/// \brief Clean up any erroneous/redundant code in the given \p Ranges in \p
+/// Code.
+///
+/// Otherwise identical to the cleanup() function using a file ID.
+tooling::Replacements cleanup(const FormatStyle &Style, StringRef Code,
+                              ArrayRef<tooling::Range> Ranges,
+                              StringRef FileName = "<stdin>");
+
+/// \brief Returns the ``LangOpts`` that the formatter expects you to set.
 ///
 /// \param Style determines specific settings for lexing mode.
 LangOptions getFormattingLangOpts(const FormatStyle &Style = getLLVMStyle());
 
-/// \brief Description to be used for help text for a llvm::cl option for
+/// \brief Description to be used for help text for a ``llvm::cl`` option for
 /// specifying format style. The description is closely related to the operation
-/// of getStyle().
+/// of ``getStyle()``.
 extern const char *StyleOptionHelpDescription;
 
-/// \brief Construct a FormatStyle based on \c StyleName.
+/// \brief Construct a FormatStyle based on ``StyleName``.
 ///
-/// \c StyleName can take several forms:
-/// \li "{<key>: <value>, ...}" - Set specic style parameters.
-/// \li "<style name>" - One of the style names supported by
+/// ``StyleName`` can take several forms:
+/// * "{<key>: <value>, ...}" - Set specific style parameters.
+/// * "<style name>" - One of the style names supported by
 /// getPredefinedStyle().
-/// \li "file" - Load style configuration from a file called '.clang-format'
-/// located in one of the parent directories of \c FileName or the current
-/// directory if \c FileName is empty.
+/// * "file" - Load style configuration from a file called ``.clang-format``
+/// located in one of the parent directories of ``FileName`` or the current
+/// directory if ``FileName`` is empty.
 ///
 /// \param[in] StyleName Style name to interpret according to the description
 /// above.
-/// \param[in] FileName Path to start search for .clang-format if \c StyleName
+/// \param[in] FileName Path to start search for .clang-format if ``StyleName``
 /// == "file".
 /// \param[in] FallbackStyle The name of a predefined style used to fallback to
 /// in case the style can't be determined from \p StyleName.
+/// \param[in] FS The underlying file system, in which the file resides. By
+/// default, the file system is the real file system.
 ///
-/// \returns FormatStyle as specified by \c StyleName. If no style could be
-/// determined, the default is LLVM Style (see getLLVMStyle()).
+/// \returns FormatStyle as specified by ``StyleName``. If no style could be
+/// determined, the default is LLVM Style (see ``getLLVMStyle()``).
 FormatStyle getStyle(StringRef StyleName, StringRef FileName,
-                     StringRef FallbackStyle);
+                     StringRef FallbackStyle, vfs::FileSystem *FS = nullptr);
+
+/// \brief Returns a string representation of ``Language``.
+inline StringRef getLanguageName(FormatStyle::LanguageKind Language) {
+  switch (Language) {
+  case FormatStyle::LK_Cpp:
+    return "C++";
+  case FormatStyle::LK_Java:
+    return "Java";
+  case FormatStyle::LK_JavaScript:
+    return "JavaScript";
+  case FormatStyle::LK_Proto:
+    return "Proto";
+  default:
+    return "Unknown";
+  }
+}
 
 } // end namespace format
 } // end namespace clang
diff --git a/include/clang/Frontend/ASTConsumers.h b/include/clang/Frontend/ASTConsumers.h
index 757fcae..b76bfcb 100644
--- a/include/clang/Frontend/ASTConsumers.h
+++ b/include/clang/Frontend/ASTConsumers.h
@@ -31,7 +31,7 @@
 // original C code.  The output is intended to be in a format such that
 // clang could re-parse the output back into the same AST, but the
 // implementation is still incomplete.
-std::unique_ptr<ASTConsumer> CreateASTPrinter(raw_ostream *OS,
+std::unique_ptr<ASTConsumer> CreateASTPrinter(std::unique_ptr<raw_ostream> OS,
                                               StringRef FilterString);
 
 // AST dumper: dumps the raw AST in human-readable form to stderr; this is
diff --git a/include/clang/Frontend/ASTUnit.h b/include/clang/Frontend/ASTUnit.h
index 04e6dce..2791227 100644
--- a/include/clang/Frontend/ASTUnit.h
+++ b/include/clang/Frontend/ASTUnit.h
@@ -16,7 +16,6 @@
 
 #include "clang-c/Index.h"
 #include "clang/AST/ASTContext.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/FileSystemOptions.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceManager.h"
@@ -30,9 +29,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/MD5.h"
-#include "llvm/Support/Path.h"
 #include <cassert>
-#include <map>
 #include <memory>
 #include <string>
 #include <sys/types.h>
@@ -47,7 +44,6 @@
 class Sema;
 class ASTContext;
 class ASTReader;
-class CodeCompleteConsumer;
 class CompilerInvocation;
 class CompilerInstance;
 class Decl;
@@ -58,7 +54,6 @@
 class Preprocessor;
 class PCHContainerOperations;
 class PCHContainerReader;
-class SourceManager;
 class TargetInfo;
 class FrontendAction;
 class ASTDeserializationListener;
diff --git a/include/clang/Frontend/CodeGenOptions.def b/include/clang/Frontend/CodeGenOptions.def
index 6d2bec5..51b0cf6 100644
--- a/include/clang/Frontend/CodeGenOptions.def
+++ b/include/clang/Frontend/CodeGenOptions.def
@@ -30,9 +30,13 @@
 
 CODEGENOPT(DisableIntegratedAS, 1, 0) ///< -no-integrated-as
 CODEGENOPT(CompressDebugSections, 1, 0) ///< -Wa,-compress-debug-sections
-CODEGENOPT(Autolink          , 1, 1) ///< -fno-autolink
+CODEGENOPT(RelaxELFRelocations, 1, 0) ///< -Wa,--mrelax-relocations
 CODEGENOPT(AsmVerbose        , 1, 0) ///< -dA, -fverbose-asm.
+CODEGENOPT(PreserveAsmComments, 1, 1) ///< -dA, -fno-preserve-as-comments.
+CODEGENOPT(AssumeSaneOperatorNew , 1, 1) ///< implicit __attribute__((malloc)) operator new
+CODEGENOPT(Autolink          , 1, 1) ///< -fno-autolink
 CODEGENOPT(ObjCAutoRefCountExceptions , 1, 0) ///< Whether ARC should be EH-safe.
+CODEGENOPT(Backchain         , 1, 0) ///< -mbackchain
 CODEGENOPT(CoverageExtraChecksum, 1, 0) ///< Whether we need a second checksum for functions in GCNO files.
 CODEGENOPT(CoverageNoFunctionNamesInData, 1, 0) ///< Do not include function names in GCDA files.
 CODEGENOPT(CoverageExitBlockBeforeBody, 1, 0) ///< Whether to emit the exit block before the body blocks in GCNO files.
@@ -55,7 +59,7 @@
 CODEGENOPT(DisableRedZone    , 1, 0) ///< Set when -mno-red-zone is enabled.
 CODEGENOPT(DisableTailCalls  , 1, 0) ///< Do not emit tail calls.
 CODEGENOPT(EmitDeclMetadata  , 1, 0) ///< Emit special metadata indicating what
-                                     ///< Decl* various IR entities came from. 
+                                     ///< Decl* various IR entities came from.
                                      ///< Only useful when running CodeGen as a
                                      ///< subroutine.
 CODEGENOPT(EmitGcovArcs      , 1, 0) ///< Emit coverage data files, aka. GCDA.
@@ -64,11 +68,21 @@
 CODEGENOPT(EmulatedTLS       , 1, 0) ///< Set when -femulated-tls is enabled.
 /// \brief FP_CONTRACT mode (on/off/fast).
 ENUM_CODEGENOPT(FPContractMode, FPContractModeKind, 2, FPC_On)
+/// \brief Embed Bitcode mode (off/all/bitcode/marker).
+ENUM_CODEGENOPT(EmbedBitcode, EmbedBitcodeKind, 2, Embed_Off)
 CODEGENOPT(ForbidGuardVariables , 1, 0) ///< Issue errors if C++ guard variables
                                         ///< are required.
 CODEGENOPT(FunctionSections  , 1, 0) ///< Set when -ffunction-sections is enabled.
 CODEGENOPT(InstrumentFunctions , 1, 0) ///< Set when -finstrument-functions is
                                        ///< enabled.
+
+CODEGENOPT(XRayInstrumentFunctions , 1, 0) ///< Set when -fxray-instrument is
+                                           ///< enabled.
+
+/// Set the minimum number of instructions in a function to determine selective
+/// XRay instrumentation.
+VALUE_CODEGENOPT(XRayInstructionThreshold , 32, 200)
+
 CODEGENOPT(InstrumentForProfiling , 1, 0) ///< Set when -pg is enabled.
 CODEGENOPT(LessPreciseFPMAD  , 1, 0) ///< Enable less precise MAD instructions to
                                      ///< be generated.
@@ -96,16 +110,20 @@
 CODEGENOPT(NoInline          , 1, 0) ///< Set when -fno-inline is enabled.
                                      ///< Disables use of the inline keyword.
 CODEGENOPT(NoNaNsFPMath      , 1, 0) ///< Assume FP arguments, results not NaN.
+CODEGENOPT(FlushDenorm       , 1, 0) ///< Allow FP denorm numbers to be flushed to zero
+CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
 CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss.
 /// \brief Method of Objective-C dispatch to use.
-ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy) 
+ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy)
 CODEGENOPT(OmitLeafFramePointer , 1, 0) ///< Set when -momit-leaf-frame-pointer is
                                         ///< enabled.
 VALUE_CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified.
 VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is specified.
 
-CODEGENOPT(ProfileInstrGenerate , 1, 0) ///< Instrument code to generate
-                                        ///< execution counts to use with PGO.
+/// \brief Choose profile instrumentation kind or no instrumentation.
+ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 2, ProfileNone)
+/// \brief Choose profile kind for PGO use compilation.
+ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
 CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to
                                    ///< enable code coverage analysis.
 CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping
@@ -118,8 +136,8 @@
 CODEGENOPT(RelaxedAliasing   , 1, 0) ///< Set when -fno-strict-aliasing is enabled.
 CODEGENOPT(StructPathTBAA    , 1, 0) ///< Whether or not to use struct-path TBAA.
 CODEGENOPT(SaveTempLabels    , 1, 0) ///< Save temporary labels.
-CODEGENOPT(SanitizeAddressZeroBaseShadow , 1, 0) ///< Map shadow memory at zero
-                                                 ///< offset in AddressSanitizer.
+CODEGENOPT(SanitizeAddressUseAfterScope , 1, 0) ///< Enable use-after-scope detection
+                                                ///< in AddressSanitizer
 CODEGENOPT(SanitizeMemoryTrackOrigins, 2, 0) ///< Enable tracking origins in
                                              ///< MemorySanitizer
 CODEGENOPT(SanitizeMemoryUseAfterDtor, 1, 0) ///< Enable use-after-delete detection
@@ -135,16 +153,17 @@
                                            ///< in sanitizer coverage.
 CODEGENOPT(SanitizeCoverage8bitCounters, 1, 0) ///< Use 8-bit frequency counters
                                                ///< in sanitizer coverage.
+CODEGENOPT(SanitizeCoverageTracePC, 1, 0) ///< Enable PC tracing
+                                          ///< in sanitizer coverage.
 CODEGENOPT(SanitizeStats     , 1, 0) ///< Collect statistics for sanitizers.
 CODEGENOPT(SimplifyLibCalls  , 1, 1) ///< Set when -fbuiltin is enabled.
 CODEGENOPT(SoftFloat         , 1, 0) ///< -soft-float.
 CODEGENOPT(StrictEnums       , 1, 0) ///< Optimize based on strict enum definition.
 CODEGENOPT(StrictVTablePointers, 1, 0) ///< Optimize based on the strict vtable pointers
 CODEGENOPT(TimePasses        , 1, 0) ///< Set when -ftime-report is enabled.
-CODEGENOPT(UnitAtATime       , 1, 1) ///< Unused. For mirroring GCC optimization
-                                     ///< selection.
 CODEGENOPT(UnrollLoops       , 1, 0) ///< Control whether loops are unrolled.
 CODEGENOPT(RerollLoops       , 1, 0) ///< Control whether loops are rerolled.
+CODEGENOPT(NoUseJumpTables   , 1, 0) ///< Set when -fno-jump-tables is enabled.
 CODEGENOPT(UnsafeFPMath      , 1, 0) ///< Allow unsafe floating point optzns.
 CODEGENOPT(UnwindTables      , 1, 0) ///< Emit unwind tables.
 CODEGENOPT(VectorizeBB       , 1, 0) ///< Run basic block vectorizer.
@@ -162,7 +181,7 @@
                                      ///< realignment.
 CODEGENOPT(UseInitArray      , 1, 0) ///< Control whether to use .init_array or
                                      ///< .ctors.
-VALUE_CODEGENOPT(StackAlignment    , 32, 0) ///< Overrides default stack 
+VALUE_CODEGENOPT(StackAlignment    , 32, 0) ///< Overrides default stack
                                             ///< alignment, if not 0.
 VALUE_CODEGENOPT(StackProbeSize    , 32, 4096) ///< Overrides default stack
                                                ///< probe size, even if 0.
@@ -172,12 +191,19 @@
 CODEGENOPT(DebugTypeExtRefs, 1, 0) ///< Whether or not debug info should contain
                                    ///< external references to a PCH or module.
 
-CODEGENOPT(DebugExplicitImport, 1, 0)  ///< Whether or not debug info should 
-                                       ///< contain explicit imports for 
+CODEGENOPT(DebugExplicitImport, 1, 0)  ///< Whether or not debug info should
+                                       ///< contain explicit imports for
                                        ///< anonymous namespaces
 
 CODEGENOPT(EmitLLVMUseLists, 1, 0) ///< Control whether to serialize use-lists.
 
+CODEGENOPT(WholeProgramVTables, 1, 0) ///< Whether to apply whole-program
+                                      ///< vtable optimization.
+
+/// Whether to use public LTO visibility for entities in std and stdext
+/// namespaces. This is enabled by clang-cl's /MT and /MTd flags.
+CODEGENOPT(LTOVisibilityPublicStd, 1, 0)
+
 /// The user specified number of registers to be used for integral arguments,
 /// or 0 if unspecified.
 VALUE_CODEGENOPT(NumRegisterParameters, 32, 0)
@@ -189,7 +215,8 @@
 ENUM_CODEGENOPT(DebugInfo, codegenoptions::DebugInfoKind, 3, codegenoptions::NoDebugInfo)
 
 /// Tune the debug info for this debugger.
-ENUM_CODEGENOPT(DebuggerTuning, DebuggerKind, 2, DebuggerKindDefault)
+ENUM_CODEGENOPT(DebuggerTuning, llvm::DebuggerKind, 2,
+                llvm::DebuggerKind::Default)
 
 /// Dwarf version. Version zero indicates to LLVM that no DWARF should be
 /// emitted.
@@ -203,11 +230,15 @@
 ENUM_CODEGENOPT(Inlining, InliningMethod, 2, NoInlining)
 
 // Vector functions library to use.
-ENUM_CODEGENOPT(VecLib, VectorLibrary, 1, NoLibrary)
+ENUM_CODEGENOPT(VecLib, VectorLibrary, 2, NoLibrary)
 
 /// The default TLS model to use.
 ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel)
 
+/// Number of path components to strip when emitting checks. (0 == full
+/// filename)
+VALUE_CODEGENOPT(EmitCheckPathComponentsToStrip, 32, 0)
+
 #undef CODEGENOPT
 #undef ENUM_CODEGENOPT
 #undef VALUE_CODEGENOPT
diff --git a/include/clang/Frontend/CodeGenOptions.h b/include/clang/Frontend/CodeGenOptions.h
index ee6c51b..0bdc1ef 100644
--- a/include/clang/Frontend/CodeGenOptions.h
+++ b/include/clang/Frontend/CodeGenOptions.h
@@ -17,6 +17,7 @@
 #include "clang/Basic/DebugInfoOptions.h"
 #include "clang/Basic/Sanitizers.h"
 #include "llvm/Support/Regex.h"
+#include "llvm/Target/TargetOptions.h"
 #include <map>
 #include <memory>
 #include <string>
@@ -45,27 +46,23 @@
   enum InliningMethod {
     NoInlining,         // Perform no inlining whatsoever.
     NormalInlining,     // Use the standard function inlining pass.
+    OnlyHintInlining,   // Inline only (implicitly) hinted functions.
     OnlyAlwaysInlining  // Only run the always inlining pass.
   };
 
   enum VectorLibrary {
-    NoLibrary, // Don't use any vector library.
-    Accelerate // Use the Accelerate framework.
+    NoLibrary,  // Don't use any vector library.
+    Accelerate, // Use the Accelerate framework.
+    SVML        // Intel short vector math library.
   };
 
+
   enum ObjCDispatchMethodKind {
     Legacy = 0,
     NonLegacy = 1,
     Mixed = 2
   };
 
-  enum DebuggerKind {
-    DebuggerKindDefault,
-    DebuggerKindGDB,
-    DebuggerKindLLDB,
-    DebuggerKindSCE
-  };
-
   enum TLSModel {
     GeneralDynamicTLSModel,
     LocalDynamicTLSModel,
@@ -85,6 +82,20 @@
     SRCK_InRegs    // Small structs in registers (-freg-struct-return).
   };
 
+  enum ProfileInstrKind {
+    ProfileNone,       // Profile instrumentation is turned off.
+    ProfileClangInstr, // Clang instrumentation to generate execution counts
+                       // to use with PGO.
+    ProfileIRInstr,    // IR level PGO instrumentation in LLVM.
+  };
+
+  enum EmbedBitcodeKind {
+    Embed_Off,      // No embedded bitcode.
+    Embed_All,      // Embed both bitcode and commandline in the output.
+    Embed_Bitcode,  // Embed just the bitcode in the output.
+    Embed_Marker    // Embed a marker as a placeholder for bitcode.
+  };
+
   /// The code model to use (-mcmodel).
   std::string CodeModel;
 
@@ -152,15 +163,12 @@
   std::string SampleProfileFile;
 
   /// Name of the profile file to use as input for -fprofile-instr-use
-  std::string InstrProfileInput;
+  std::string ProfileInstrumentUsePath;
 
   /// Name of the function summary index file to use for ThinLTO function
   /// importing.
   std::string ThinLTOIndexFile;
 
-  /// The EABI version to use
-  std::string EABIVersion;
-
   /// A list of file names passed with -fcuda-include-gpubinary options to
   /// forward to CUDA runtime back-end for incorporating them into host-side
   /// object file.
@@ -198,6 +206,9 @@
   /// Set of sanitizer checks that trap rather than diagnose.
   SanitizerSet SanitizeTrap;
 
+  /// List of backend command-line options for -fembed-bitcode.
+  std::vector<uint8_t> CmdArgs;
+
   /// \brief A list of all -fno-builtin-* function names (e.g., memset).
   std::vector<std::string> NoBuiltinFuncs;
 
@@ -218,6 +229,27 @@
   const std::vector<std::string> &getNoBuiltinFuncs() const {
     return NoBuiltinFuncs;
   }
+
+  /// \brief Check if Clang profile instrumentation is on.
+  bool hasProfileClangInstr() const {
+    return getProfileInstr() == ProfileClangInstr;
+  }
+
+  /// \brief Check if IR level profile instrumentation is on.
+  bool hasProfileIRInstr() const {
+    return getProfileInstr() == ProfileIRInstr;
+  }
+
+  /// \brief Check if Clang profile use is on.
+  bool hasProfileClangUse() const {
+    return getProfileUse() == ProfileClangInstr;
+  }
+
+  /// \brief Check if IR level profile use is on.
+  bool hasProfileIRUse() const {
+    return getProfileUse() == ProfileIRInstr;
+  }
+
 };
 
 }  // end namespace clang
diff --git a/include/clang/Frontend/CompilerInstance.h b/include/clang/Frontend/CompilerInstance.h
index 798cc2d..01db211 100644
--- a/include/clang/Frontend/CompilerInstance.h
+++ b/include/clang/Frontend/CompilerInstance.h
@@ -11,11 +11,12 @@
 #define LLVM_CLANG_FRONTEND_COMPILERINSTANCE_H_
 
 #include "clang/AST/ASTConsumer.h"
-#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/Frontend/Utils.h"
+#include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -35,7 +36,6 @@
 
 namespace clang {
 class ASTContext;
-class ASTConsumer;
 class ASTReader;
 class CodeCompleteConsumer;
 class DiagnosticsEngine;
@@ -155,15 +155,10 @@
   struct OutputFile {
     std::string Filename;
     std::string TempFilename;
-    std::unique_ptr<raw_ostream> OS;
 
-    OutputFile(std::string filename, std::string tempFilename,
-               std::unique_ptr<raw_ostream> OS)
-        : Filename(std::move(filename)), TempFilename(std::move(tempFilename)),
-          OS(std::move(OS)) {}
-    OutputFile(OutputFile &&O)
-        : Filename(std::move(O.Filename)),
-          TempFilename(std::move(O.TempFilename)), OS(std::move(O.OS)) {}
+    OutputFile(std::string filename, std::string tempFilename)
+        : Filename(std::move(filename)), TempFilename(std::move(tempFilename)) {
+    }
   };
 
   /// If the output doesn't support seeking (terminal, pipe). we switch
@@ -387,7 +382,7 @@
   /// \note Most clients should use setFileManager, which will implicitly reset
   /// the virtual file system to the one contained in the file manager.
   void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS) {
-    VirtualFileSystem = FS;
+    VirtualFileSystem = std::move(FS);
   }
 
   /// }
@@ -584,8 +579,8 @@
   /// \param OutFile - The output file info.
   void addOutputFile(OutputFile &&OutFile);
 
-  /// clearOutputFiles - Clear the output file list, destroying the contained
-  /// output streams.
+  /// clearOutputFiles - Clear the output file list. The underlying output
+  /// streams must have been closed beforehand.
   ///
   /// \param EraseFiles - If true, attempt to erase the files from disk.
   void clearOutputFiles(bool EraseFiles);
@@ -692,19 +687,18 @@
   /// atomically replace the target output on success).
   ///
   /// \return - Null on error.
-  raw_pwrite_stream *createDefaultOutputFile(bool Binary = true,
-                                             StringRef BaseInput = "",
-                                             StringRef Extension = "");
+  std::unique_ptr<raw_pwrite_stream>
+  createDefaultOutputFile(bool Binary = true, StringRef BaseInput = "",
+                          StringRef Extension = "");
 
   /// Create a new output file and add it to the list of tracked output files,
   /// optionally deriving the output path name.
   ///
   /// \return - Null on error.
-  raw_pwrite_stream *createOutputFile(StringRef OutputPath, bool Binary,
-                                      bool RemoveFileOnSignal,
-                                      StringRef BaseInput, StringRef Extension,
-                                      bool UseTemporary,
-                                      bool CreateMissingDirectories = false);
+  std::unique_ptr<raw_pwrite_stream>
+  createOutputFile(StringRef OutputPath, bool Binary, bool RemoveFileOnSignal,
+                   StringRef BaseInput, StringRef Extension, bool UseTemporary,
+                   bool CreateMissingDirectories = false);
 
   /// Create a new output file, optionally deriving the output path name.
   ///
@@ -738,7 +732,7 @@
                    bool CreateMissingDirectories, std::string *ResultPathName,
                    std::string *TempPathName);
 
-  llvm::raw_null_ostream *createNullOutputFile();
+  std::unique_ptr<raw_pwrite_stream> createNullOutputFile();
 
   /// }
   /// @name Initialization Utility Methods
@@ -755,10 +749,12 @@
   ///
   /// \return True on success.
   static bool InitializeSourceManager(const FrontendInputFile &Input,
-                DiagnosticsEngine &Diags,
-                FileManager &FileMgr,
-                SourceManager &SourceMgr,
-                const FrontendOptions &Opts);
+                                      DiagnosticsEngine &Diags,
+                                      FileManager &FileMgr,
+                                      SourceManager &SourceMgr,
+                                      HeaderSearch *HS,
+                                      DependencyOutputOptions &DepOpts,
+                                      const FrontendOptions &Opts);
 
   /// }
 
diff --git a/include/clang/Frontend/CompilerInvocation.h b/include/clang/Frontend/CompilerInvocation.h
index 9188552..283240c 100644
--- a/include/clang/Frontend/CompilerInvocation.h
+++ b/include/clang/Frontend/CompilerInvocation.h
@@ -14,29 +14,29 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/FileSystemOptions.h"
 #include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TargetOptions.h"
 #include "clang/Frontend/CodeGenOptions.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendOptions.h"
 #include "clang/Frontend/LangStandard.h"
 #include "clang/Frontend/MigratorOptions.h"
 #include "clang/Frontend/PreprocessorOutputOptions.h"
-#include "clang/Lex/HeaderSearchOptions.h"
-#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
 #include <string>
-#include <vector>
 
 namespace llvm {
+class Triple;
+
 namespace opt {
 class ArgList;
 }
 }
 
 namespace clang {
+class PreprocessorOptions;
+class HeaderSearchOptions;
+class TargetOptions;
+class LangOptions;
 class CompilerInvocation;
 class DiagnosticsEngine;
 
@@ -158,8 +158,11 @@
   ///
   /// \param Opts - The LangOptions object to set up.
   /// \param IK - The input language.
+  /// \param T - The target triple.
+  /// \param PPOpts - The PreprocessorOptions affected.
   /// \param LangStd - The input language standard.
   static void setLangDefaults(LangOptions &Opts, InputKind IK,
+                   const llvm::Triple &T, PreprocessorOptions &PPOpts,
                    LangStandard::Kind LangStd = LangStandard::lang_unspecified);
   
   /// \brief Retrieve a module hash string that is suitable for uniquely 
diff --git a/include/clang/Frontend/DependencyOutputOptions.h b/include/clang/Frontend/DependencyOutputOptions.h
index 129b534..0be36cd 100644
--- a/include/clang/Frontend/DependencyOutputOptions.h
+++ b/include/clang/Frontend/DependencyOutputOptions.h
@@ -50,6 +50,9 @@
   /// A list of filenames to be used as extra dependencies for every target.
   std::vector<std::string> ExtraDeps;
 
+  /// In /showIncludes mode, pretend the main TU is a header with this name.
+  std::string ShowIncludesPretendHeader;
+
   /// \brief The file to write GraphViz-formatted header dependencies to.
   std::string DOTOutputFile;
 
diff --git a/include/clang/Frontend/DiagnosticRenderer.h b/include/clang/Frontend/DiagnosticRenderer.h
index c372fdd..2588feb 100644
--- a/include/clang/Frontend/DiagnosticRenderer.h
+++ b/include/clang/Frontend/DiagnosticRenderer.h
@@ -19,7 +19,6 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/SourceLocation.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PointerUnion.h"
 
 namespace clang {
diff --git a/include/clang/Frontend/FrontendAction.h b/include/clang/Frontend/FrontendAction.h
index 1b021ef..384499a 100644
--- a/include/clang/Frontend/FrontendAction.h
+++ b/include/clang/Frontend/FrontendAction.h
@@ -130,7 +130,7 @@
   const FrontendInputFile &getCurrentInput() const {
     return CurrentInput;
   }
-  
+
   const StringRef getCurrentFile() const {
     assert(!CurrentInput.isEmpty() && "No current file!");
     return CurrentInput.getFile();
@@ -157,7 +157,7 @@
   /// @name Supported Modes
   /// @{
 
-  /// \brief Is this action invoked on a model file? 
+  /// \brief Is this action invoked on a model file?
   ///
   /// Model files are incomplete translation units that relies on type
   /// information from another translation unit. Check ParseModelFileAction for
@@ -249,6 +249,19 @@
   /// CompilerInstance's Diagnostic object to report errors.
   virtual bool ParseArgs(const CompilerInstance &CI,
                          const std::vector<std::string> &arg) = 0;
+
+  enum ActionType {
+    Cmdline,             ///< Action is determined by the cc1 command-line
+    ReplaceAction,       ///< Replace the main action
+    AddBeforeMainAction, ///< Execute the action before the main action
+    AddAfterMainAction   ///< Execute the action after the main action
+  };
+  /// \brief Get the action type for this plugin
+  ///
+  /// \return The action type. If the type is Cmdline then by default the
+  /// plugin does nothing and what it does is determined by the cc1
+  /// command-line.
+  virtual ActionType getActionType() { return Cmdline; }
 };
 
 /// \brief Abstract base class to use for preprocessor-based frontend actions.
diff --git a/include/clang/Frontend/FrontendActions.h b/include/clang/Frontend/FrontendActions.h
index 025955d..b56a04a 100644
--- a/include/clang/Frontend/FrontendActions.h
+++ b/include/clang/Frontend/FrontendActions.h
@@ -85,7 +85,7 @@
   /// create the PCHGenerator instance returned by CreateASTConsumer.
   ///
   /// \returns true if an error occurred, false otherwise.
-  static raw_pwrite_stream *
+  static std::unique_ptr<raw_pwrite_stream>
   ComputeASTConsumerArguments(CompilerInstance &CI, StringRef InFile,
                               std::string &Sysroot, std::string &OutputFile);
 };
@@ -117,10 +117,9 @@
   /// create the PCHGenerator instance returned by CreateASTConsumer.
   ///
   /// \returns true if an error occurred, false otherwise.
-  raw_pwrite_stream *ComputeASTConsumerArguments(CompilerInstance &CI,
-                                                 StringRef InFile,
-                                                 std::string &Sysroot,
-                                                 std::string &OutputFile);
+  std::unique_ptr<raw_pwrite_stream>
+  ComputeASTConsumerArguments(CompilerInstance &CI, StringRef InFile,
+                              std::string &Sysroot, std::string &OutputFile);
 };
 
 class SyntaxOnlyAction : public ASTFrontendAction {
@@ -129,6 +128,7 @@
                                                  StringRef InFile) override;
 
 public:
+  ~SyntaxOnlyAction() override;
   bool hasCodeCompletionSupport() const override { return true; }
 };
 
@@ -138,6 +138,7 @@
 protected:
   std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
                                                  StringRef InFile) override;
+  bool BeginInvocation(CompilerInstance &CI) override;
   void ExecuteAction() override;
 
 public:
diff --git a/include/clang/Frontend/FrontendOptions.h b/include/clang/Frontend/FrontendOptions.h
index c800a51..a75523f 100644
--- a/include/clang/Frontend/FrontendOptions.h
+++ b/include/clang/Frontend/FrontendOptions.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/StringRef.h"
 #include <string>
 #include <vector>
+#include <unordered_map>
 
 namespace llvm {
 class MemoryBuffer;
@@ -73,6 +74,7 @@
   IK_OpenCL,
   IK_CUDA,
   IK_PreprocessedCuda,
+  IK_RenderScript,
   IK_AST,
   IK_LLVM_IR
 };
@@ -152,6 +154,8 @@
                                            ///< implicit module build.
   unsigned ModulesEmbedAllFiles : 1;       ///< Whether we should embed all used
                                            ///< files into the PCM file.
+  unsigned IncludeTimestamps : 1;          ///< Whether timestamps should be
+                                           ///< written to the produced PCH file.
 
   CodeCompleteOptions CodeCompleteOpts;
 
@@ -227,15 +231,12 @@
   /// The name of the action to run when using a plugin action.
   std::string ActionName;
 
-  /// Args to pass to the plugin
-  std::vector<std::string> PluginArgs;
+  /// Args to pass to the plugins
+  std::unordered_map<std::string,std::vector<std::string>> PluginArgs;
 
   /// The list of plugin actions to run in addition to the normal action.
   std::vector<std::string> AddPluginActions;
 
-  /// Args to pass to the additional plugins
-  std::vector<std::vector<std::string> > AddPluginArgs;
-
   /// The list of plugins to load.
   std::vector<std::string> Plugins;
 
@@ -266,6 +267,10 @@
   /// \brief Auxiliary triple for CUDA compilation.
   std::string AuxTriple;
 
+  /// \brief If non-empty, search the pch input file as if it were a header
+  /// included by this file.
+  std::string FindPchSource;
+
 public:
   FrontendOptions() :
     DisableFree(false), RelocatablePCH(false), ShowHelp(false),
@@ -275,8 +280,8 @@
     SkipFunctionBodies(false), UseGlobalModuleIndex(true),
     GenerateGlobalModuleIndex(true), ASTDumpDecls(false), ASTDumpLookups(false),
     BuildingImplicitModule(false), ModulesEmbedAllFiles(false),
-    ARCMTAction(ARCMT_None), ObjCMTAction(ObjCMT_None),
-    ProgramAction(frontend::ParseSyntaxOnly)
+    IncludeTimestamps(true), ARCMTAction(ARCMT_None),
+    ObjCMTAction(ObjCMT_None), ProgramAction(frontend::ParseSyntaxOnly)
   {}
 
   /// getInputKindForExtension - Return the appropriate input kind for a file
diff --git a/include/clang/Frontend/FrontendPluginRegistry.h b/include/clang/Frontend/FrontendPluginRegistry.h
index ecab630..9d7ee08 100644
--- a/include/clang/Frontend/FrontendPluginRegistry.h
+++ b/include/clang/Frontend/FrontendPluginRegistry.h
@@ -13,9 +13,6 @@
 #include "clang/Frontend/FrontendAction.h"
 #include "llvm/Support/Registry.h"
 
-// Instantiated in FrontendAction.cpp.
-extern template class llvm::Registry<clang::PluginASTAction>;
-
 namespace clang {
 
 /// The frontend plugin registry.
diff --git a/include/clang/Frontend/LangStandards.def b/include/clang/Frontend/LangStandards.def
index cac9c3c..a303693 100644
--- a/include/clang/Frontend/LangStandards.def
+++ b/include/clang/Frontend/LangStandards.def
@@ -19,6 +19,14 @@
 /// \param FEATURES - The standard features as flags, these are enums from the
 /// clang::frontend namespace, which is assumed to be be available.
 
+/// LANGSTANDARD_ALIAS(IDENT, ALIAS)
+/// \param IDENT - The name of the standard as a C++ identifier.
+/// \param ALIAS - The alias of the standard.
+
+#ifndef LANGSTANDARD_ALIAS
+#define LANGSTANDARD_ALIAS(IDENT, ALIAS)
+#endif
+
 // C89-ish modes.
 LANGSTANDARD(c89, "c89",
              "ISO C 1990",
@@ -125,29 +133,36 @@
 LANGSTANDARD(cxx1z, "c++1z",
              "Working draft for ISO C++ 2017",
              LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus1z |
-             Digraphs)
+             Digraphs | HexFloat)
 LANGSTANDARD(gnucxx1z, "gnu++1z",
              "Working draft for ISO C++ 2017 with GNU extensions",
              LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus1z |
-             Digraphs | GNUMode)
+             Digraphs | HexFloat | GNUMode)
 
 // OpenCL
 LANGSTANDARD(opencl, "cl",
              "OpenCL 1.0",
              LineComment | C99 | Digraphs | HexFloat)
-LANGSTANDARD(opencl11, "CL1.1",
+LANGSTANDARD(opencl11, "cl1.1",
              "OpenCL 1.1",
              LineComment | C99 | Digraphs | HexFloat)
-LANGSTANDARD(opencl12, "CL1.2",
+LANGSTANDARD(opencl12, "cl1.2",
              "OpenCL 1.2",
              LineComment | C99 | Digraphs | HexFloat)
-LANGSTANDARD(opencl20, "CL2.0",
+LANGSTANDARD(opencl20, "cl2.0",
              "OpenCL 2.0",
              LineComment | C99 | Digraphs | HexFloat)
 
+LANGSTANDARD_ALIAS(opencl, "CL")
+LANGSTANDARD_ALIAS(opencl11, "CL1.1")
+LANGSTANDARD_ALIAS(opencl12, "CL1.2")
+LANGSTANDARD_ALIAS(opencl20, "CL2.0")
+
 // CUDA
 LANGSTANDARD(cuda, "cuda",
              "NVIDIA CUDA(tm)",
              LineComment | CPlusPlus | Digraphs)
 
 #undef LANGSTANDARD
+#undef LANGSTANDARD_ALIAS
+
diff --git a/include/clang/Frontend/MultiplexConsumer.h b/include/clang/Frontend/MultiplexConsumer.h
index ae6db29..d13565c 100644
--- a/include/clang/Frontend/MultiplexConsumer.h
+++ b/include/clang/Frontend/MultiplexConsumer.h
@@ -36,7 +36,7 @@
   void Initialize(ASTContext &Context) override;
   void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) override;
   bool HandleTopLevelDecl(DeclGroupRef D) override;
-  void HandleInlineMethodDefinition(CXXMethodDecl *D) override;
+  void HandleInlineFunctionDefinition(FunctionDecl *D) override;
   void HandleInterestingDecl(DeclGroupRef D) override;
   void HandleTranslationUnit(ASTContext &Ctx) override;
   void HandleTagDeclDefinition(TagDecl *D) override;
@@ -44,11 +44,8 @@
   void HandleCXXImplicitFunctionInstantiation(FunctionDecl *D) override;
   void HandleTopLevelDeclInObjCContainer(DeclGroupRef D) override;
   void HandleImplicitImportDecl(ImportDecl *D) override;
-  void HandleLinkerOption(llvm::StringRef Opts) override;
-  void HandleDetectMismatch(llvm::StringRef Name,
-                            llvm::StringRef Value) override;
-  void HandleDependentLibrary(llvm::StringRef Lib) override;
   void CompleteTentativeDefinition(VarDecl *D) override;
+  void AssignInheritanceModel(CXXRecordDecl *RD) override;
   void HandleVTable(CXXRecordDecl *RD) override;
   ASTMutationListener *GetASTMutationListener() override;
   ASTDeserializationListener *GetASTDeserializationListener() override;
diff --git a/include/clang/Frontend/PCHContainerOperations.h b/include/clang/Frontend/PCHContainerOperations.h
index 67c36cf..0c1b28e 100644
--- a/include/clang/Frontend/PCHContainerOperations.h
+++ b/include/clang/Frontend/PCHContainerOperations.h
@@ -46,10 +46,12 @@
   /// Return an ASTConsumer that can be chained with a
   /// PCHGenerator that produces a wrapper file format containing a
   /// serialized AST bitstream.
-  virtual std::unique_ptr<ASTConsumer> CreatePCHContainerGenerator(
-      CompilerInstance &CI, const std::string &MainFileName,
-      const std::string &OutputFileName, llvm::raw_pwrite_stream *OS,
-      std::shared_ptr<PCHBuffer> Buffer) const = 0;
+  virtual std::unique_ptr<ASTConsumer>
+  CreatePCHContainerGenerator(CompilerInstance &CI,
+                              const std::string &MainFileName,
+                              const std::string &OutputFileName,
+                              std::unique_ptr<llvm::raw_pwrite_stream> OS,
+                              std::shared_ptr<PCHBuffer> Buffer) const = 0;
 };
 
 /// This abstract interface provides operations for unwrapping
@@ -73,10 +75,12 @@
 
   /// Return an ASTConsumer that can be chained with a
   /// PCHGenerator that writes the module to a flat file.
-  std::unique_ptr<ASTConsumer> CreatePCHContainerGenerator(
-      CompilerInstance &CI, const std::string &MainFileName,
-      const std::string &OutputFileName, llvm::raw_pwrite_stream *OS,
-      std::shared_ptr<PCHBuffer> Buffer) const override;
+  std::unique_ptr<ASTConsumer>
+  CreatePCHContainerGenerator(CompilerInstance &CI,
+                              const std::string &MainFileName,
+                              const std::string &OutputFileName,
+                              std::unique_ptr<llvm::raw_pwrite_stream> OS,
+                              std::shared_ptr<PCHBuffer> Buffer) const override;
 };
 
 /// Implements read operations for a raw pass-through PCH container.
diff --git a/include/clang/Frontend/SerializedDiagnosticReader.h b/include/clang/Frontend/SerializedDiagnosticReader.h
index 3db362bf..0747984 100644
--- a/include/clang/Frontend/SerializedDiagnosticReader.h
+++ b/include/clang/Frontend/SerializedDiagnosticReader.h
@@ -11,7 +11,6 @@
 #define LLVM_CLANG_FRONTEND_SERIALIZED_DIAGNOSTIC_READER_H_
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Bitcode/BitstreamReader.h"
 #include "llvm/Support/ErrorOr.h"
 
diff --git a/include/clang/Frontend/TextDiagnosticPrinter.h b/include/clang/Frontend/TextDiagnosticPrinter.h
index 04a5705..07cee9f 100644
--- a/include/clang/Frontend/TextDiagnosticPrinter.h
+++ b/include/clang/Frontend/TextDiagnosticPrinter.h
@@ -45,7 +45,7 @@
   /// setPrefix - Set the diagnostic printer prefix string, which will be
   /// printed at the start of any diagnostics. If empty, no prefix string is
   /// used.
-  void setPrefix(std::string Value) { Prefix = Value; }
+  void setPrefix(std::string Value) { Prefix = std::move(Value); }
 
   void BeginSourceFile(const LangOptions &LO, const Preprocessor *PP) override;
   void EndSourceFile() override;
diff --git a/include/clang/Frontend/Utils.h b/include/clang/Frontend/Utils.h
index 933468d..cf943a5 100644
--- a/include/clang/Frontend/Utils.h
+++ b/include/clang/Frontend/Utils.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Option/OptSpecifier.h"
+#include <utility>
 
 namespace llvm {
 class raw_fd_ostream;
@@ -141,21 +142,20 @@
 
   void writeFileMap();
   bool hasErrors() { return HasErrors; }
-  ModuleDependencyCollector(std::string DestDir) : DestDir(DestDir) {}
+  ModuleDependencyCollector(std::string DestDir)
+      : DestDir(std::move(DestDir)) {}
   ~ModuleDependencyCollector() { writeFileMap(); }
 };
 
 /// AttachDependencyGraphGen - Create a dependency graph generator, and attach
 /// it to the given preprocessor.
-  void AttachDependencyGraphGen(Preprocessor &PP, StringRef OutputFile,
-                                StringRef SysRoot);
+void AttachDependencyGraphGen(Preprocessor &PP, StringRef OutputFile,
+                              StringRef SysRoot);
 
 /// AttachHeaderIncludeGen - Create a header include list generator, and attach
 /// it to the given preprocessor.
 ///
-/// \param ExtraHeaders - If not empty, will write the header filenames, just
-/// like they were included during a regular preprocessing. Useful for
-/// implicit include dependencies, like sanitizer blacklists.
+/// \param DepOpts - Options controlling the output.
 /// \param ShowAllHeaders - If true, show all header information instead of just
 /// headers following the predefines buffer. This is useful for making sure
 /// includes mentioned on the command line are also reported, but differs from
@@ -165,7 +165,7 @@
 /// \param ShowDepth - Whether to indent to show the nesting of the includes.
 /// \param MSStyle - Whether to print in cl.exe /showIncludes style.
 void AttachHeaderIncludeGen(Preprocessor &PP,
-                            const std::vector<std::string> &ExtraHeaders,
+                            const DependencyOutputOptions &DepOpts,
                             bool ShowAllHeaders = false,
                             StringRef OutputPath = "",
                             bool ShowDepth = true, bool MSStyle = false);
diff --git a/include/clang/Lex/DirectoryLookup.h b/include/clang/Lex/DirectoryLookup.h
index 20c4bb0..ee0af29 100644
--- a/include/clang/Lex/DirectoryLookup.h
+++ b/include/clang/Lex/DirectoryLookup.h
@@ -151,6 +151,9 @@
   ///
   /// \param HS The header search instance to search with.
   ///
+  /// \param IncludeLoc The source location of the #include or #import
+  /// directive.
+  ///
   /// \param SearchPath If not NULL, will be set to the search path relative
   /// to which the file was found.
   ///
@@ -172,6 +175,7 @@
   /// a framework include ("Foo.h" -> "Foo/Foo.h"), set the new name to this
   /// vector and point Filename to it.
   const FileEntry *LookupFile(StringRef &Filename, HeaderSearch &HS,
+                              SourceLocation IncludeLoc,
                               SmallVectorImpl<char> *SearchPath,
                               SmallVectorImpl<char> *RelativePath,
                               Module *RequestingModule,
diff --git a/include/clang/Lex/HeaderMap.h b/include/clang/Lex/HeaderMap.h
index 58cd549..8466f1a 100644
--- a/include/clang/Lex/HeaderMap.h
+++ b/include/clang/Lex/HeaderMap.h
@@ -17,11 +17,9 @@
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include <memory>
 
-namespace llvm {
-  class MemoryBuffer;
-}
 namespace clang {
 
 class FileEntry;
diff --git a/include/clang/Lex/HeaderSearch.h b/include/clang/Lex/HeaderSearch.h
index 6ebeb8b..7bac01e 100644
--- a/include/clang/Lex/HeaderSearch.h
+++ b/include/clang/Lex/HeaderSearch.h
@@ -241,8 +241,6 @@
   unsigned NumMultiIncludeFileOptzn;
   unsigned NumFrameworkLookups, NumSubFrameworkLookups;
 
-  const LangOptions &LangOpts;
-
   // HeaderSearch doesn't support default or copy construction.
   HeaderSearch(const HeaderSearch&) = delete;
   void operator=(const HeaderSearch&) = delete;
@@ -483,12 +481,9 @@
   /// \param ModuleMapPath A path that when combined with \c ModuleName
   /// uniquely identifies this module. See Module::ModuleMap.
   ///
-  /// \param UsePrebuiltPath Whether we should use the prebuilt module path.
-  ///
   /// \returns The name of the module file that corresponds to this module,
   /// or an empty string if this module does not correspond to any module file.
-  std::string getModuleFileName(StringRef ModuleName, StringRef ModuleMapPath,
-                                bool UsePrebuiltPath);
+  std::string getModuleFileName(StringRef ModuleName, StringRef ModuleMapPath);
 
   /// \brief Lookup a module Search for a module with the given name.
   ///
@@ -585,8 +580,9 @@
   /// \brief Look up the file with the specified name and determine its owning
   /// module.
   const FileEntry *
-  getFileAndSuggestModule(StringRef FileName, const DirectoryEntry *Dir,
-                          bool IsSystemHeaderDir, Module *RequestingModule,
+  getFileAndSuggestModule(StringRef FileName, SourceLocation IncludeLoc,
+                          const DirectoryEntry *Dir, bool IsSystemHeaderDir,
+                          Module *RequestingModule,
                           ModuleMap::KnownHeader *SuggestedModule);
 
 public:
@@ -637,13 +633,18 @@
   /// \brief Retrieve a uniqued framework name.
   StringRef getUniqueFrameworkName(StringRef Framework);
   
+  /// \brief Suggest a path by which the specified file could be found, for
+  /// use in diagnostics to suggest a #include.
+  ///
+  /// \param IsSystem If non-null, filled in to indicate whether the suggested
+  ///        path is relative to a system header directory.
+  std::string suggestPathToFileForDiagnostics(const FileEntry *File,
+                                              bool *IsSystem = nullptr);
+
   void PrintStats();
   
   size_t getTotalMemory() const;
 
-  static std::string NormalizeDashIncludePath(StringRef File,
-                                              FileManager &FileMgr);
-
 private:
   /// \brief Describes what happened when we tried to load a module map file.
   enum LoadModuleMapResult {
diff --git a/include/clang/Lex/HeaderSearchOptions.h b/include/clang/Lex/HeaderSearchOptions.h
index 53909e6..9727f56 100644
--- a/include/clang/Lex/HeaderSearchOptions.h
+++ b/include/clang/Lex/HeaderSearchOptions.h
@@ -93,9 +93,6 @@
   /// \brief The directory used for a user build.
   std::string ModuleUserBuildPath;
 
-  /// \brief The directories used to load prebuilt module files.
-  std::vector<std::string> PrebuiltModulePaths;
-
   /// The module/pch container format.
   std::string ModuleFormat;
 
@@ -204,10 +201,6 @@
   void AddVFSOverlayFile(StringRef Name) {
     VFSOverlayFiles.push_back(Name);
   }
-
-  void AddPrebuiltModulePath(StringRef Name) {
-    PrebuiltModulePaths.push_back(Name);
-  }
 };
 
 } // end namespace clang
diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h
index d568614..5f946fc 100644
--- a/include/clang/Lex/LiteralSupport.h
+++ b/include/clang/Lex/LiteralSupport.h
@@ -19,6 +19,7 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
@@ -61,8 +62,10 @@
   bool isUnsigned : 1;
   bool isLong : 1;          // This is *not* set for long long.
   bool isLongLong : 1;
+  bool isHalf : 1;          // 1.0h
   bool isFloat : 1;         // 1.0f
   bool isImaginary : 1;     // 1.0i
+  bool isFloat128 : 1;      // 1.0q
   uint8_t MicrosoftInteger; // Microsoft suffix extension i8, i16, i32, or i64.
 
   bool isIntegerLiteral() const {
@@ -104,9 +107,16 @@
 private:
 
   void ParseNumberStartingWithZero(SourceLocation TokLoc);
+  void ParseDecimalOrOctalCommon(SourceLocation TokLoc);
 
   static bool isDigitSeparator(char C) { return C == '\''; }
 
+  /// \brief Determine whether the sequence of characters [Start, End) contains
+  /// any real digits (not digit separators).
+  bool containsDigits(const char *Start, const char *End) {
+    return Start != End && (Start + 1 != End || !isDigitSeparator(Start[0]));
+  }
+
   enum CheckSeparatorKind { CSK_BeforeDigits, CSK_AfterDigits };
 
   /// \brief Ensure that we don't have a digit separator here.
diff --git a/include/clang/Lex/MacroArgs.h b/include/clang/Lex/MacroArgs.h
index 243b143..7b2a485 100644
--- a/include/clang/Lex/MacroArgs.h
+++ b/include/clang/Lex/MacroArgs.h
@@ -15,13 +15,13 @@
 #define LLVM_CLANG_LEX_MACROARGS_H
 
 #include "clang/Basic/LLVM.h"
+#include "clang/Lex/Token.h"
 #include "llvm/ADT/ArrayRef.h"
 #include <vector>
 
 namespace clang {
   class MacroInfo;
   class Preprocessor;
-  class Token;
   class SourceLocation;
 
 /// MacroArgs - An instance of this class captures information about
diff --git a/include/clang/Lex/MacroInfo.h b/include/clang/Lex/MacroInfo.h
index 320645e..6cc3b0b 100644
--- a/include/clang/Lex/MacroInfo.h
+++ b/include/clang/Lex/MacroInfo.h
@@ -106,7 +106,7 @@
   bool IsWarnIfUnused : 1;
 
   /// \brief Whether this macro info was loaded from an AST file.
-  unsigned FromASTFile : 1;
+  bool FromASTFile : 1;
 
   /// \brief Whether this macro was used as header guard.
   bool UsedForHeaderGuard : 1;
@@ -318,13 +318,13 @@
   unsigned MDKind : 2;
 
   /// \brief True if the macro directive was loaded from a PCH file.
-  bool IsFromPCH : 1;
+  unsigned IsFromPCH : 1;
 
   // Used by VisibilityMacroDirective ----------------------------------------//
 
   /// \brief Whether the macro has public visibility (when described in a
   /// module).
-  bool IsPublic : 1;
+  unsigned IsPublic : 1;
 
   MacroDirective(Kind K, SourceLocation Loc)
       : Previous(nullptr), Loc(Loc), MDKind(K), IsFromPCH(false),
diff --git a/include/clang/Lex/ModuleMap.h b/include/clang/Lex/ModuleMap.h
index 8973a71..7fab030 100644
--- a/include/clang/Lex/ModuleMap.h
+++ b/include/clang/Lex/ModuleMap.h
@@ -59,7 +59,7 @@
   /// \brief Called when an umbrella header is added during module map parsing.
   ///
   /// \param FileMgr FileManager instance
-  /// \param Header The umbreall header to collect.
+  /// \param Header The umbrella header to collect.
   virtual void moduleMapAddUmbrellaHeader(FileManager *FileMgr,
                                           const FileEntry *Header) {}
 };
@@ -82,15 +82,10 @@
   /// These are always simple C language options.
   LangOptions MMapLangOpts;
 
-  // The module that we are building; related to \c LangOptions::CurrentModule.
-  Module *CompilingModule;
-
-public:
-  // The module that the .cc source file is associated with.
+  // The module that the main source file is associated with (the module
+  // named LangOpts::CurrentModule, if we've loaded it).
   Module *SourceModule;
-  std::string SourceModuleName;
 
-private:
   /// \brief The unshadowed top-level modules that are known.
   llvm::StringMap<Module *> Modules;
 
@@ -142,6 +137,12 @@
       return getModule()->isAvailable();
     }
 
+    /// \brief Whether this header is accessible from the specified module.
+    bool isAccessibleFrom(Module *M) const {
+      return !(getRole() & PrivateHeader) ||
+             (M && M->getTopLevelModule() == getModule()->getTopLevelModule());
+    }
+
     // \brief Whether this known header is valid (i.e., it has an
     // associated module).
     explicit operator bool() const {
@@ -342,12 +343,18 @@
   ///
   /// \param RequestingModule The module including a file.
   ///
+  /// \param RequestingModuleIsModuleInterface \c true if the inclusion is in
+  ///        the interface of RequestingModule, \c false if it's in the
+  ///        implementation of RequestingModule. Value is ignored and
+  ///        meaningless if RequestingModule is nullptr.
+  ///
   /// \param FilenameLoc The location of the inclusion's filename.
   ///
   /// \param Filename The included filename as written.
   ///
   /// \param File The included file.
   void diagnoseHeaderInclusion(Module *RequestingModule,
+                               bool RequestingModuleIsModuleInterface,
                                SourceLocation FilenameLoc, StringRef Filename,
                                const FileEntry *File);
 
diff --git a/include/clang/Lex/PPCallbacks.h b/include/clang/Lex/PPCallbacks.h
index 68b8f1c..2d027f3 100644
--- a/include/clang/Lex/PPCallbacks.h
+++ b/include/clang/Lex/PPCallbacks.h
@@ -17,14 +17,12 @@
 
 #include "clang/Basic/DiagnosticIDs.h"
 #include "clang/Basic/SourceLocation.h"
-#include "clang/Lex/DirectoryLookup.h"
+#include "clang/Basic/SourceManager.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/Pragma.h"
 #include "llvm/ADT/StringRef.h"
-#include <string>
 
 namespace clang {
-  class SourceLocation;
   class Token;
   class IdentifierInfo;
   class MacroDefinition;
diff --git a/include/clang/Lex/PTHManager.h b/include/clang/Lex/PTHManager.h
index 26178ed..f4e4774 100644
--- a/include/clang/Lex/PTHManager.h
+++ b/include/clang/Lex/PTHManager.h
@@ -14,15 +14,11 @@
 #ifndef LLVM_CLANG_LEX_PTHMANAGER_H
 #define LLVM_CLANG_LEX_PTHMANAGER_H
 
-#include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/IdentifierTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Lex/PTHLexer.h"
-#include "llvm/ADT/DenseMap.h"
+#include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/OnDiskHashTable.h"
-#include <string>
 
 namespace llvm {
   class MemoryBuffer;
@@ -31,6 +27,7 @@
 namespace clang {
 
 class FileEntry;
+class Preprocessor;
 class PTHLexer;
 class DiagnosticsEngine;
 class FileSystemStatCache;
diff --git a/include/clang/Lex/PreprocessingRecord.h b/include/clang/Lex/PreprocessingRecord.h
index 87b8ce1..3ddf450 100644
--- a/include/clang/Lex/PreprocessingRecord.h
+++ b/include/clang/Lex/PreprocessingRecord.h
@@ -19,7 +19,6 @@
 #include "clang/Lex/PPCallbacks.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Compiler.h"
diff --git a/include/clang/Lex/Preprocessor.h b/include/clang/Lex/Preprocessor.h
index e4f8557..000df66 100644
--- a/include/clang/Lex/Preprocessor.h
+++ b/include/clang/Lex/Preprocessor.h
@@ -32,6 +32,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Registry.h"
 #include <memory>
 #include <vector>
 
@@ -264,6 +265,10 @@
   /// \brief True if we hit the code-completion point.
   bool CodeCompletionReached;
 
+  /// \brief The code completion token containing the information
+  /// on the stem that is to be code completed.
+  IdentifierInfo *CodeCompletionII;
+
   /// \brief The directory that the main file should be considered to occupy,
   /// if it does not correspond to a real file (as happens when building a
   /// module).
@@ -983,6 +988,18 @@
   /// completion point.
   void CodeCompleteNaturalLanguage();
 
+  /// \brief Set the code completion token for filtering purposes.
+  void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter) {
+    CodeCompletionII = Filter;
+  }
+
+  /// \brief Get the code completion token for filtering purposes.
+  StringRef getCodeCompletionFilter() {
+    if (CodeCompletionII)
+      return CodeCompletionII->getName();
+    return {};
+  }
+
   /// \brief Retrieve the preprocessing record, or NULL if there is no
   /// preprocessing record.
   PreprocessingRecord *getPreprocessingRecord() const { return Record; }
@@ -1024,10 +1041,20 @@
   /// If \p OwnsTokens is false, this method assumes that the specified stream
   /// of tokens has a permanent owner somewhere, so they do not need to be
   /// copied. If it is true, it assumes the array of tokens is allocated with
-  /// \c new[] and must be freed.
+  /// \c new[] and the Preprocessor will delete[] it.
+private:
   void EnterTokenStream(const Token *Toks, unsigned NumToks,
                         bool DisableMacroExpansion, bool OwnsTokens);
 
+public:
+  void EnterTokenStream(std::unique_ptr<Token[]> Toks, unsigned NumToks,
+                        bool DisableMacroExpansion) {
+    EnterTokenStream(Toks.release(), NumToks, DisableMacroExpansion, true);
+  }
+  void EnterTokenStream(ArrayRef<Token> Toks, bool DisableMacroExpansion) {
+    EnterTokenStream(Toks.data(), Toks.size(), DisableMacroExpansion, false);
+  }
+
   /// \brief Pop the current lexer/macro exp off the top of the lexer stack.
   ///
   /// This should only be used in situations where the current state of the
@@ -1880,6 +1907,19 @@
   /// directly or indirectly.
   Module *getModuleContainingLocation(SourceLocation Loc);
 
+  /// \brief We want to produce a diagnostic at location IncLoc concerning a
+  /// missing module import.
+  ///
+  /// \param IncLoc The location at which the missing import was detected.
+  /// \param MLoc A location within the desired module at which some desired
+  ///        effect occurred (e.g., where a desired entity was declared).
+  ///
+  /// \return A file that can be #included to import a module containing MLoc.
+  ///         Null if no such file could be determined or if a #include is not
+  ///         appropriate.
+  const FileEntry *getModuleHeaderToIncludeForDiagnostics(SourceLocation IncLoc,
+                                                          SourceLocation MLoc);
+
 private:
   // Macro handling.
   void HandleDefineDirective(Token &Tok, bool ImmediatelyAfterTopLevelIfndef);
@@ -1927,6 +1967,9 @@
   virtual bool HandleComment(Preprocessor &PP, SourceRange Comment) = 0;
 };
 
+/// \brief Registry of pragma handlers added by plugins
+typedef llvm::Registry<PragmaHandler> PragmaHandlerRegistry;
+
 }  // end namespace clang
 
 #endif
diff --git a/include/clang/Lex/PreprocessorOptions.h b/include/clang/Lex/PreprocessorOptions.h
index 963d95d..de652cc 100644
--- a/include/clang/Lex/PreprocessorOptions.h
+++ b/include/clang/Lex/PreprocessorOptions.h
@@ -12,7 +12,6 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include <cassert>
diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h
index 7ba22b2..4393e20 100644
--- a/include/clang/Lex/Token.h
+++ b/include/clang/Lex/Token.h
@@ -14,12 +14,10 @@
 #ifndef LLVM_CLANG_LEX_TOKEN_H
 #define LLVM_CLANG_LEX_TOKEN_H
 
-#include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/TemplateKinds.h"
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/StringRef.h"
-#include <cstdlib>
+#include <cassert>
 
 namespace clang {
 
@@ -69,8 +67,8 @@
 
   /// Flags - Bits we track about this token, members of the TokenFlags enum.
   unsigned short Flags;
-public:
 
+public:
   // Various flags set per token:
   enum TokenFlags {
     StartOfLine   = 0x01,  // At start of line or only after whitespace
@@ -85,6 +83,7 @@
     IgnoredComma = 0x80,   // This comma is not a macro argument separator (MS).
     StringifiedInMacro = 0x100, // This string or character literal is formed by
                                 // macro stringizing or charizing operator.
+    CommaAfterElided = 0x200, // The comma following this token was elided (MS).
   };
 
   tok::TokenKind getKind() const { return Kind; }
@@ -235,6 +234,11 @@
     Flags |= Flag;
   }
 
+  /// \brief Get the specified flag.
+  bool getFlag(TokenFlags Flag) const {
+    return (Flags & Flag) != 0;
+  }
+
   /// \brief Unset the specified flag.
   void clearFlag(TokenFlags Flag) {
     Flags &= ~Flag;
@@ -258,17 +262,15 @@
 
   /// isAtStartOfLine - Return true if this token is at the start of a line.
   ///
-  bool isAtStartOfLine() const { return (Flags & StartOfLine) ? true : false; }
+  bool isAtStartOfLine() const { return getFlag(StartOfLine); }
 
   /// \brief Return true if this token has whitespace before it.
   ///
-  bool hasLeadingSpace() const { return (Flags & LeadingSpace) ? true : false; }
+  bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
 
   /// \brief Return true if this identifier token should never
   /// be expanded in the future, due to C99 6.10.3.4p2.
-  bool isExpandDisabled() const {
-    return (Flags & DisableExpand) ? true : false;
-  }
+  bool isExpandDisabled() const { return getFlag(DisableExpand); }
 
   /// \brief Return true if we have an ObjC keyword identifier.
   bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
@@ -277,26 +279,25 @@
   tok::ObjCKeywordKind getObjCKeywordID() const;
 
   /// \brief Return true if this token has trigraphs or escaped newlines in it.
-  bool needsCleaning() const { return (Flags & NeedsCleaning) ? true : false; }
+  bool needsCleaning() const { return getFlag(NeedsCleaning); }
 
   /// \brief Return true if this token has an empty macro before it.
   ///
-  bool hasLeadingEmptyMacro() const {
-    return (Flags & LeadingEmptyMacro) ? true : false;
-  }
+  bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
 
   /// \brief Return true if this token is a string or character literal which
   /// has a ud-suffix.
-  bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
+  bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
 
   /// Returns true if this token contains a universal character name.
-  bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
+  bool hasUCN() const { return getFlag(HasUCN); }
 
   /// Returns true if this token is formed by macro by stringizing or charizing
   /// operator.
-  bool stringifiedInMacro() const {
-    return (Flags & StringifiedInMacro) ? true : false;
-  }
+  bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
+
+  /// Returns true if the comma after this token was elided.
+  bool commaAfterElided() const { return getFlag(CommaAfterElided); }
 };
 
 /// \brief Information about the conditional stack (\#if directives)
@@ -318,11 +319,11 @@
   bool FoundElse;
 };
 
-}  // end namespace clang
+} // end namespace clang
 
 namespace llvm {
   template <>
   struct isPodLike<clang::Token> { static const bool value = true; };
-}  // end namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CLANG_LEX_TOKEN_H
diff --git a/include/clang/Makefile b/include/clang/Makefile
deleted file mode 100644
index 5ba2dd2..0000000
--- a/include/clang/Makefile
+++ /dev/null
@@ -1,44 +0,0 @@
-CLANG_LEVEL := ../..
-DIRS := AST Basic Driver Parse Sema Serialization
-
-include $(CLANG_LEVEL)/Makefile
-
-install-local::
-	$(Echo) Installing Clang include files
-	$(Verb) $(MKDIR) $(DESTDIR)$(PROJ_includedir)
-	$(Verb) if test -d "$(PROJ_SRC_DIR)" ; then \
-	  cd $(PROJ_SRC_DIR)/.. && \
-	  for  hdr in `find clang -type f \
-	      '(' -name LICENSE.TXT \
-	       -o -name '*.def' \
-	       -o -name '*.h' \
-	       -o -name '*.inc' \
-	      ')' -print \
-              | grep -v CVS | grep -v .svn | grep -v .dir` ; do \
-	    instdir=$(DESTDIR)`dirname "$(PROJ_includedir)/$$hdr"` ; \
-	    if test \! -d "$$instdir" ; then \
-	      $(EchoCmd) Making install directory $$instdir ; \
-	      $(MKDIR) $$instdir ;\
-	    fi ; \
-	    $(DataInstall) $$hdr $(DESTDIR)$(PROJ_includedir)/$$hdr ; \
-	  done ; \
-	fi
-ifneq ($(PROJ_SRC_ROOT),$(PROJ_OBJ_ROOT))
-	$(Verb) if test -d "$(PROJ_OBJ_ROOT)/tools/clang/include/clang" ; then \
-	  cd $(PROJ_OBJ_ROOT)/tools/clang/include && \
-	  for hdr in `find clang -type f \
-	      '(' -name LICENSE.TXT \
-	       -o -name '*.def' \
-	       -o -name '*.h' \
-	       -o -name '*.inc' \
-	      ')' -print \
-            | grep -v CVS | grep -v .tmp | grep -v .dir` ; do \
-	    instdir=$(DESTDIR)`dirname "$(PROJ_includedir)/$$hdr"` ; \
-	    if test \! -d "$$instdir" ; then \
-	      $(EchoCmd) Making install directory $$instdir ; \
-	      $(MKDIR) $$instdir ;\
-	    fi ; \
-	    $(DataInstall) $$hdr $(DESTDIR)$(PROJ_includedir)/$$hdr ; \
-	  done ; \
-	fi
-endif
diff --git a/include/clang/Parse/Makefile b/include/clang/Parse/Makefile
deleted file mode 100644
index c477019..0000000
--- a/include/clang/Parse/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-CLANG_LEVEL := ../../..
-TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
-BUILT_SOURCES = AttrParserStringSwitches.inc
-
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-$(ObjDir)/AttrParserStringSwitches.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                                   $(ObjDir)/.dir
-	$(Echo) "Building Clang parser-related attribute string switches"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-parser-string-switches -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
diff --git a/include/clang/Parse/Parser.h b/include/clang/Parse/Parser.h
index fafe7cb..9e9e1c0 100644
--- a/include/clang/Parse/Parser.h
+++ b/include/clang/Parse/Parser.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_PARSE_PARSER_H
 #define LLVM_CLANG_PARSE_PARSER_H
 
+#include "clang/AST/Availability.h"
 #include "clang/Basic/OpenMPKinds.h"
 #include "clang/Basic/OperatorPrecedence.h"
 #include "clang/Basic/Specifiers.h"
@@ -142,6 +143,7 @@
 
   /// C++0x contextual keywords.
   mutable IdentifierInfo *Ident_final;
+  mutable IdentifierInfo *Ident_GNU_final;
   mutable IdentifierInfo *Ident_override;
 
   // C++ type trait keywords that can be reverted to identifiers and still be
@@ -313,6 +315,10 @@
     return true;
   }
 
+  SourceLocation getEndOfPreviousToken() {
+    return PP.getLocForEndOfToken(PrevTokLocation);
+  }
+
   /// Retrieve the underscored keyword (_Nonnull, _Nullable) that corresponds
   /// to the given nullability kind.
   IdentifierInfo *getNullabilityKeyword(NullabilityKind nullability) {
@@ -647,6 +653,8 @@
   /// Should only be used in Objective-C language modes.
   bool isObjCInstancetype() {
     assert(getLangOpts().ObjC1);
+    if (Tok.isAnnotation())
+      return false;
     if (!Ident_instancetype)
       Ident_instancetype = PP.getIdentifierInfo("instancetype");
     return Tok.getIdentifierInfo() == Ident_instancetype;
@@ -713,6 +721,16 @@
       assert(!isActive && "Forgot to call Commit or Revert!");
     }
   };
+  /// A TentativeParsingAction that automatically reverts in its destructor.
+  /// Useful for disambiguation parses that will always be reverted.
+  class RevertingTentativeParsingAction
+      : private Parser::TentativeParsingAction {
+  public:
+    RevertingTentativeParsingAction(Parser &P)
+        : Parser::TentativeParsingAction(P) {}
+    ~RevertingTentativeParsingAction() { Revert(); }
+  };
+
   class UnannotatedTentativeParsingAction;
 
   /// ObjCDeclContextSwitch - An object used to switch context from
@@ -1586,8 +1604,9 @@
 
   //===--------------------------------------------------------------------===//
   // C++ if/switch/while condition expression.
-  bool ParseCXXCondition(ExprResult &ExprResult, Decl *&DeclResult,
-                         SourceLocation Loc, bool ConvertToBoolean);
+  Sema::ConditionResult ParseCXXCondition(StmtResult *InitStmt,
+                                          SourceLocation Loc,
+                                          Sema::ConditionKind CK);
 
   //===--------------------------------------------------------------------===//
   // C++ Coroutines
@@ -1678,10 +1697,10 @@
                                     unsigned ScopeFlags);
   void ParseCompoundStatementLeadingPragmas();
   StmtResult ParseCompoundStatementBody(bool isStmtExpr = false);
-  bool ParseParenExprOrCondition(ExprResult &ExprResult,
-                                 Decl *&DeclResult,
+  bool ParseParenExprOrCondition(StmtResult *InitStmt,
+                                 Sema::ConditionResult &CondResult,
                                  SourceLocation Loc,
-                                 bool ConvertToBoolean);
+                                 Sema::ConditionKind CK);
   StmtResult ParseIfStatement(SourceLocation *TrailingElseLoc);
   StmtResult ParseSwitchStatement(SourceLocation *TrailingElseLoc);
   StmtResult ParseWhileStatement(SourceLocation *TrailingElseLoc);
@@ -1873,7 +1892,6 @@
 
   bool isDeclarationSpecifier(bool DisambiguatingWithExpression = false);
   bool isTypeSpecifierQualifier();
-  bool isTypeQualifier() const;
 
   /// isKnownToBeTypeSpecifier - Return true if we know that the specified token
   /// is definitely a type-specifier.  Return false if it isn't part of a type
@@ -1974,11 +1992,18 @@
   /// the function returns true to let the declaration parsing code handle it.
   bool isCXXFunctionDeclarator(bool *IsAmbiguous = nullptr);
 
-  /// isCXXConditionDeclaration - Disambiguates between a declaration or an
-  /// expression for a condition of a if/switch/while/for statement.
-  /// If during the disambiguation process a parsing error is encountered,
-  /// the function returns true to let the declaration parsing code handle it.
-  bool isCXXConditionDeclaration();
+  struct ConditionDeclarationOrInitStatementState;
+  enum class ConditionOrInitStatement {
+    Expression,    ///< Disambiguated as an expression (either kind).
+    ConditionDecl, ///< Disambiguated as the declaration form of condition.
+    InitStmtDecl,  ///< Disambiguated as a simple-declaration init-statement.
+    Error          ///< Can't be any of the above!
+  };
+  /// \brief Disambiguates between the different kinds of things that can happen
+  /// after 'if (' or 'switch ('. This could be one of two different kinds of
+  /// declaration (depending on whether there is a ';' later) or an expression.
+  ConditionOrInitStatement
+  isCXXConditionDeclarationOrInitStatement(bool CanBeInitStmt);
 
   bool isCXXTypeId(TentativeCXXTypeIdContext Context, bool &isAmbiguous);
   bool isCXXTypeId(TentativeCXXTypeIdContext Context) {
@@ -2201,8 +2226,19 @@
   SourceLocation SkipExtendedMicrosoftTypeAttributes();
   void ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs);
   void ParseBorlandTypeAttributes(ParsedAttributes &attrs);
-  void ParseOpenCLAttributes(ParsedAttributes &attrs);
+  void ParseOpenCLKernelAttributes(ParsedAttributes &attrs);
   void ParseOpenCLQualifiers(ParsedAttributes &Attrs);
+  /// \brief Parses opencl_unroll_hint attribute if the language is OpenCL.
+  /// This wrapper tests only LangOpts.OpenCL, not the OpenCL version.
+  /// \return false if an error happens; always true when not in OpenCL mode.
+  bool MaybeParseOpenCLUnrollHintAttribute(ParsedAttributes &Attrs) {
+    if (getLangOpts().OpenCL)
+      return ParseOpenCLUnrollHintAttribute(Attrs);
+    return true;
+  }
+  /// \brief Parses opencl_unroll_hint attribute.
+  /// \return false if error happens.
+  bool ParseOpenCLUnrollHintAttribute(ParsedAttributes &Attrs);
   void ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs);
 
   VersionTuple ParseVersionTuple(SourceRange &Range);
@@ -2214,6 +2250,9 @@
                                   SourceLocation ScopeLoc,
                                   AttributeList::Syntax Syntax);
 
+  Optional<AvailabilitySpec> ParseAvailabilitySpec();
+  ExprResult ParseAvailabilityCheckExpr(SourceLocation StartLoc);
+
   void ParseObjCBridgeRelatedAttribute(IdentifierInfo &ObjCBridgeRelated,
                                        SourceLocation ObjCBridgeRelatedLoc,
                                        ParsedAttributes &attrs,
@@ -2326,6 +2365,7 @@
                                  bool AtomicAllowed = true,
                                  bool IdentifierRequired = false);
   void ParseDirectDeclarator(Declarator &D);
+  void ParseDecompositionDeclarator(Declarator &D);
   void ParseParenDeclarator(Declarator &D);
   void ParseFunctionDeclarator(Declarator &D,
                                ParsedAttributes &attrs,
@@ -2425,7 +2465,7 @@
       ParsingDeclRAIIObject *DiagsFromTParams = nullptr);
   DeclGroupPtrTy ParseCXXClassMemberDeclarationWithPragmas(
       AccessSpecifier &AS, ParsedAttributesWithRange &AccessAttrs,
-      DeclSpec::TST TagType, Decl *TagDecl);
+      DeclSpec::TST TagType, Decl *Tag);
   void ParseConstructorInitializer(Decl *ConstructorDecl);
   MemInitResult ParseMemInitializer(Decl *ConstructorDecl);
   void HandleMemberFunctionDeclDelays(Declarator& DeclaratorInfo,
@@ -2453,18 +2493,30 @@
 
   //===--------------------------------------------------------------------===//
   // OpenMP: Directives and clauses.
+  /// Parse clauses for '#pragma omp declare simd'.
+  DeclGroupPtrTy ParseOMPDeclareSimdClauses(DeclGroupPtrTy Ptr,
+                                            CachedTokens &Toks,
+                                            SourceLocation Loc);
   /// \brief Parses declarative OpenMP directives.
-  DeclGroupPtrTy ParseOpenMPDeclarativeDirective();
+  DeclGroupPtrTy ParseOpenMPDeclarativeDirectiveWithExtDecl(
+      AccessSpecifier &AS, ParsedAttributesWithRange &Attrs,
+      DeclSpec::TST TagType = DeclSpec::TST_unspecified,
+      Decl *TagDecl = nullptr);
+  /// \brief Parse 'omp declare reduction' construct.
+  DeclGroupPtrTy ParseOpenMPDeclareReductionDirective(AccessSpecifier AS);
+
   /// \brief Parses simple list of variables.
   ///
   /// \param Kind Kind of the directive.
-  /// \param [out] VarList List of referenced variables.
+  /// \param Callback Callback function to be called for the list elements.
   /// \param AllowScopeSpecifier true, if the variables can have fully
   /// qualified names.
   ///
-  bool ParseOpenMPSimpleVarList(OpenMPDirectiveKind Kind,
-                                SmallVectorImpl<Expr *> &VarList,
-                                bool AllowScopeSpecifier);
+  bool ParseOpenMPSimpleVarList(
+      OpenMPDirectiveKind Kind,
+      const llvm::function_ref<void(CXXScopeSpec &, DeclarationNameInfo)> &
+          Callback,
+      bool AllowScopeSpecifier);
   /// \brief Parses declarative or executable directive.
   ///
   /// \param Allowed ACK_Any, if any directives are allowed,
@@ -2512,6 +2564,29 @@
                                       OpenMPClauseKind Kind);
 
 public:
+  /// Parses simple expression in parens for single-expression clauses of OpenMP
+  /// constructs.
+  /// \param RLoc Returned location of right paren.
+  ExprResult ParseOpenMPParensExpr(StringRef ClauseName, SourceLocation &RLoc);
+
+  /// Data used for parsing list of variables in OpenMP clauses.
+  struct OpenMPVarListDataTy {
+    Expr *TailExpr = nullptr;
+    SourceLocation ColonLoc;
+    CXXScopeSpec ReductionIdScopeSpec;
+    DeclarationNameInfo ReductionId;
+    OpenMPDependClauseKind DepKind = OMPC_DEPEND_unknown;
+    OpenMPLinearClauseKind LinKind = OMPC_LINEAR_val;
+    OpenMPMapClauseKind MapTypeModifier = OMPC_MAP_unknown;
+    OpenMPMapClauseKind MapType = OMPC_MAP_unknown;
+    bool IsMapTypeImplicit = false;
+    SourceLocation DepLinMapLoc;
+  };
+
+  /// Parses clauses with list.
+  bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind,
+                          SmallVectorImpl<Expr *> &Vars,
+                          OpenMPVarListDataTy &Data);
   bool ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext,
                           bool AllowDestructorName,
                           bool AllowConstructorName,
diff --git a/include/clang/Rewrite/Frontend/ASTConsumers.h b/include/clang/Rewrite/Frontend/ASTConsumers.h
index c9df889..e054e75 100644
--- a/include/clang/Rewrite/Frontend/ASTConsumers.h
+++ b/include/clang/Rewrite/Frontend/ASTConsumers.h
@@ -28,17 +28,18 @@
 // ObjC rewriter: attempts to rewrite ObjC constructs into pure C code.
 // This is considered experimental, and only works with Apple's ObjC runtime.
 std::unique_ptr<ASTConsumer>
-CreateObjCRewriter(const std::string &InFile, raw_ostream *OS,
+CreateObjCRewriter(const std::string &InFile, std::unique_ptr<raw_ostream> OS,
                    DiagnosticsEngine &Diags, const LangOptions &LOpts,
                    bool SilenceRewriteMacroWarning);
 std::unique_ptr<ASTConsumer>
-CreateModernObjCRewriter(const std::string &InFile, raw_ostream *OS,
+CreateModernObjCRewriter(const std::string &InFile,
+                         std::unique_ptr<raw_ostream> OS,
                          DiagnosticsEngine &Diags, const LangOptions &LOpts,
                          bool SilenceRewriteMacroWarning, bool LineInfo);
 
 /// CreateHTMLPrinter - Create an AST consumer which rewrites source code to
 /// HTML with syntax highlighting suitable for viewing in a web-browser.
-std::unique_ptr<ASTConsumer> CreateHTMLPrinter(raw_ostream *OS,
+std::unique_ptr<ASTConsumer> CreateHTMLPrinter(std::unique_ptr<raw_ostream> OS,
                                                Preprocessor &PP,
                                                bool SyntaxHighlight = true,
                                                bool HighlightMacros = true);
diff --git a/include/clang/Sema/AttributeList.h b/include/clang/Sema/AttributeList.h
index 7a161ba..fcddbec 100644
--- a/include/clang/Sema/AttributeList.h
+++ b/include/clang/Sema/AttributeList.h
@@ -283,11 +283,10 @@
     Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
     IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
     HasProcessingCache(false), NextInPosition(nullptr), NextInPool(nullptr) {
-    ArgsVector Args;
-    Args.push_back(Parm1);
-    Args.push_back(Parm2);
-    Args.push_back(Parm3);
-    memcpy(getArgsBuffer(), &Args[0], 3 * sizeof(ArgsUnion));
+    ArgsUnion *Args = getArgsBuffer();
+    Args[0] = Parm1;
+    Args[1] = Parm2;
+    Args[2] = Parm3;
     AttrKind = getKind(getName(), getScopeName(), syntaxUsed);
   }
   
@@ -500,6 +499,7 @@
 
   bool isTargetSpecificAttr() const;
   bool isTypeAttr() const;
+  bool isStmtAttr() const;
 
   bool hasCustomParsing() const;
   unsigned getMinArgs() const;
@@ -869,6 +869,7 @@
   ExpectedFunction,
   ExpectedUnion,
   ExpectedVariableOrFunction,
+  ExpectedFunctionVariableOrObjCInterface,
   ExpectedFunctionOrMethod,
   ExpectedParameter,
   ExpectedFunctionMethodOrBlock,
@@ -878,9 +879,9 @@
   ExpectedEnum,
   ExpectedVariable,
   ExpectedMethod,
-  ExpectedVariableFunctionOrLabel,
   ExpectedFieldOrGlobalVar,
   ExpectedStruct,
+  ExpectedParameterOrTypedef,
   ExpectedVariableOrTypedef,
   ExpectedTLSVar,
   ExpectedVariableOrField,
@@ -894,6 +895,7 @@
   ExpectedObjCInstanceMethod,
   ExpectedObjCInterfaceDeclInitMethod,
   ExpectedFunctionVariableOrClass,
+  ExpectedFunctionVariableClassOrObjCInterface,
   ExpectedObjectiveCProtocol,
   ExpectedFunctionGlobalVarMethodOrProperty,
   ExpectedStructOrUnionOrTypedef,
@@ -901,7 +903,10 @@
   ExpectedObjectiveCInterfaceOrProtocol,
   ExpectedKernelFunction,
   ExpectedFunctionWithProtoType,
-  ExpectedVariableFieldOrTypedef
+  ExpectedVariableEnumFieldOrTypedef,
+  ExpectedFunctionMethodEnumOrClass,
+  ExpectedStructClassVariableFunctionOrInlineNamespace,
+  ExpectedForMaybeUnused
 };
 
 }  // end namespace clang
diff --git a/include/clang/Sema/CleanupInfo.h b/include/clang/Sema/CleanupInfo.h
new file mode 100644
index 0000000..751bfb6
--- /dev/null
+++ b/include/clang/Sema/CleanupInfo.h
@@ -0,0 +1,47 @@
+//===--- CleanupInfo.h - Cleanup Control in Sema --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements a set of operations that track whether an
+//  ExprWithCleanups needs to be generated for a full expression.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_CLEANUP_INFO_H
+#define LLVM_CLANG_SEMA_CLEANUP_INFO_H
+
+namespace clang {
+/// Tracks whether cleanups are needed and whether any has side effects.
+class CleanupInfo {
+  bool ExprNeedsCleanups = false; // Set once any cleanup is recorded.
+  bool CleanupsHaveSideEffects = false; // OR of all SideEffects seen.
+
+public:
+  bool exprNeedsCleanups() const { return ExprNeedsCleanups; }
+  /// True if any recorded or merged cleanup was flagged as side-effecting.
+  bool cleanupsHaveSideEffects() const { return CleanupsHaveSideEffects; }
+  /// Mark cleanups as needed; \p SideEffects is OR'ed in, never cleared here.
+  void setExprNeedsCleanups(bool SideEffects) {
+    ExprNeedsCleanups = true;
+    CleanupsHaveSideEffects |= SideEffects;
+  }
+  /// Clear both flags, returning to the default-constructed state.
+  void reset() {
+    ExprNeedsCleanups = false;
+    CleanupsHaveSideEffects = false;
+  }
+  /// OR both flags of \p Rhs into this object; \p Rhs is taken by value.
+  void mergeFrom(CleanupInfo Rhs) {
+    ExprNeedsCleanups |= Rhs.ExprNeedsCleanups;
+    CleanupsHaveSideEffects |= Rhs.CleanupsHaveSideEffects;
+  }
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_SEMA_CLEANUP_INFO_H
diff --git a/include/clang/Sema/CodeCompleteConsumer.h b/include/clang/Sema/CodeCompleteConsumer.h
index 9702273..4ad1b01 100644
--- a/include/clang/Sema/CodeCompleteConsumer.h
+++ b/include/clang/Sema/CodeCompleteConsumer.h
@@ -15,6 +15,7 @@
 
 #include "clang-c/Index.h"
 #include "clang/AST/CanonicalType.h"
+#include "clang/AST/DeclBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Sema/CodeCompleteOptions.h"
 #include "llvm/ADT/DenseMap.h"
@@ -22,6 +23,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include <string>
+#include <utility>
 
 namespace clang {
 
@@ -516,8 +518,8 @@
 
 public:
   explicit CodeCompletionTUInfo(
-                    IntrusiveRefCntPtr<GlobalCodeCompletionAllocator> Allocator)
-    : AllocatorRef(Allocator) { }
+      IntrusiveRefCntPtr<GlobalCodeCompletionAllocator> Allocator)
+      : AllocatorRef(std::move(Allocator)) {}
 
   IntrusiveRefCntPtr<GlobalCodeCompletionAllocator> getAllocatorRef() const {
     return AllocatorRef;
@@ -911,6 +913,13 @@
   /// \brief Deregisters and destroys this code-completion consumer.
   virtual ~CodeCompleteConsumer();
 
+  /// \name Code-completion filtering
+  /// \brief Check if the result should be filtered out.
+  virtual bool isResultFilteredOut(StringRef Filter,
+                                   CodeCompletionResult Results) {
+    return false;
+  }
+
   /// \name Code-completion callbacks
   //@{
   /// \brief Process the finalized code-completion results.
@@ -964,6 +973,8 @@
                                  OverloadCandidate *Candidates,
                                  unsigned NumCandidates) override;
 
+  bool isResultFilteredOut(StringRef Filter, CodeCompletionResult Results) override;
+
   CodeCompletionAllocator &getAllocator() override {
     return CCTUInfo.getAllocator();
   }
diff --git a/include/clang/Sema/DeclSpec.h b/include/clang/Sema/DeclSpec.h
index 8224992..df434ec 100644
--- a/include/clang/Sema/DeclSpec.h
+++ b/include/clang/Sema/DeclSpec.h
@@ -280,6 +280,7 @@
   static const TST TST_half = clang::TST_half;
   static const TST TST_float = clang::TST_float;
   static const TST TST_double = clang::TST_double;
+  static const TST TST_float128 = clang::TST_float128;
   static const TST TST_bool = clang::TST_bool;
   static const TST TST_decimal32 = clang::TST_decimal32;
   static const TST TST_decimal64 = clang::TST_decimal64;
@@ -299,6 +300,9 @@
   static const TST TST_auto_type = clang::TST_auto_type;
   static const TST TST_unknown_anytype = clang::TST_unknown_anytype;
   static const TST TST_atomic = clang::TST_atomic;
+#define GENERIC_IMAGE_TYPE(ImgType, Id) \
+  static const TST TST_##ImgType##_t = clang::TST_##ImgType##_t;
+#include "clang/Basic/OpenCLImageTypes.def"
   static const TST TST_error = clang::TST_error;
 
   // type-qualifiers
@@ -307,9 +311,10 @@
     TQ_const       = 1,
     TQ_restrict    = 2,
     TQ_volatile    = 4,
+    TQ_unaligned   = 8,
     // This has no corresponding Qualifiers::TQ value, because it's not treated
     // as a qualifier in our type system.
-    TQ_atomic      = 8
+    TQ_atomic      = 16
   };
 
   /// ParsedSpecifiers - Flags to query which specifiers were applied.  This is
@@ -340,7 +345,7 @@
   unsigned TypeSpecPipe : 1;
 
   // type-qualifiers
-  unsigned TypeQualifiers : 4;  // Bitwise OR of TQ.
+  unsigned TypeQualifiers : 5;  // Bitwise OR of TQ.
 
   // function-specifier
   unsigned FS_inline_specified : 1;
@@ -382,7 +387,8 @@
   /// TSTNameLoc provides source range info for tag types.
   SourceLocation TSTNameLoc;
   SourceRange TypeofParensRange;
-  SourceLocation TQ_constLoc, TQ_restrictLoc, TQ_volatileLoc, TQ_atomicLoc;
+  SourceLocation TQ_constLoc, TQ_restrictLoc, TQ_volatileLoc, TQ_atomicLoc,
+      TQ_unalignedLoc;
   SourceLocation FS_inlineLoc, FS_virtualLoc, FS_explicitLoc, FS_noreturnLoc;
   SourceLocation FS_forceinlineLoc;
   SourceLocation FriendLoc, ModulePrivateLoc, ConstexprLoc, ConceptLoc;
@@ -536,6 +542,7 @@
   SourceLocation getRestrictSpecLoc() const { return TQ_restrictLoc; }
   SourceLocation getVolatileSpecLoc() const { return TQ_volatileLoc; }
   SourceLocation getAtomicSpecLoc() const { return TQ_atomicLoc; }
+  SourceLocation getUnalignedSpecLoc() const { return TQ_unalignedLoc; }
   SourceLocation getPipeLoc() const { return TQ_pipeLoc; }
 
   /// \brief Clear out all of the type qualifiers.
@@ -545,6 +552,7 @@
     TQ_restrictLoc = SourceLocation();
     TQ_volatileLoc = SourceLocation();
     TQ_atomicLoc = SourceLocation();
+    TQ_unalignedLoc = SourceLocation();
     TQ_pipeLoc = SourceLocation();
   }
 
@@ -1111,8 +1119,8 @@
   };
 
   struct PointerTypeInfo : TypeInfoCommon {
-    /// The type qualifiers: const/volatile/restrict/atomic.
-    unsigned TypeQuals : 4;
+    /// The type qualifiers: const/volatile/restrict/unaligned/atomic.
+    unsigned TypeQuals : 5;
 
     /// The location of the const-qualifier, if any.
     unsigned ConstQualLoc;
@@ -1126,6 +1134,9 @@
     /// The location of the _Atomic-qualifier, if any.
     unsigned AtomicQualLoc;
 
+    /// The location of the __unaligned-qualifier, if any.
+    unsigned UnalignedQualLoc;
+
     void destroy() {
     }
   };
@@ -1140,14 +1151,15 @@
   };
 
   struct ArrayTypeInfo : TypeInfoCommon {
-    /// The type qualifiers for the array: const/volatile/restrict/_Atomic.
-    unsigned TypeQuals : 4;
+    /// The type qualifiers for the array:
+    /// const/volatile/restrict/__unaligned/_Atomic.
+    unsigned TypeQuals : 5;
 
     /// True if this dimension included the 'static' keyword.
-    bool hasStatic : 1;
+    unsigned hasStatic : 1;
 
     /// True if this dimension was [*].  In this case, NumElts is null.
-    bool isStar : 1;
+    unsigned isStar : 1;
 
     /// This is the size of the array, or null if [] or [*] was specified.
     /// Since the parser is multi-purpose, and we don't want to impose a root
@@ -1176,7 +1188,7 @@
     /// complete. Non-NULL indicates that there is a default argument.
     CachedTokens *DefaultArgTokens;
 
-    ParamInfo() {}
+    ParamInfo() = default;
     ParamInfo(IdentifierInfo *ident, SourceLocation iloc,
               Decl *param,
               CachedTokens *DefArgTokens = nullptr)
@@ -1207,9 +1219,9 @@
     /// Otherwise, it's an rvalue reference.
     unsigned RefQualifierIsLValueRef : 1;
 
-    /// The type qualifiers: const/volatile/restrict.
+    /// The type qualifiers: const/volatile/restrict/__unaligned.
     /// The qualifier bitmask values are the same as in QualType.
-    unsigned TypeQuals : 3;
+    unsigned TypeQuals : 4;
 
     /// ExceptionSpecType - An ExceptionSpecificationType value.
     unsigned ExceptionSpecType : 4;
@@ -1393,16 +1405,16 @@
 
   struct BlockPointerTypeInfo : TypeInfoCommon {
     /// For now, sema will catch these as invalid.
-    /// The type qualifiers: const/volatile/restrict/_Atomic.
-    unsigned TypeQuals : 4;
+    /// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic.
+    unsigned TypeQuals : 5;
 
     void destroy() {
     }
   };
 
   struct MemberPointerTypeInfo : TypeInfoCommon {
-    /// The type qualifiers: const/volatile/restrict/_Atomic.
-    unsigned TypeQuals : 4;
+    /// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic.
+    unsigned TypeQuals : 5;
     // CXXScopeSpec has a constructor, so it can't be a direct member.
     // So we need some pointer-aligned storage and a bit of trickery.
     union {
@@ -1466,7 +1478,8 @@
                                     SourceLocation ConstQualLoc,
                                     SourceLocation VolatileQualLoc,
                                     SourceLocation RestrictQualLoc,
-                                    SourceLocation AtomicQualLoc) {
+                                    SourceLocation AtomicQualLoc,
+                                    SourceLocation UnalignedQualLoc) {
     DeclaratorChunk I;
     I.Kind                = Pointer;
     I.Loc                 = Loc;
@@ -1475,6 +1488,7 @@
     I.Ptr.VolatileQualLoc = VolatileQualLoc.getRawEncoding();
     I.Ptr.RestrictQualLoc = RestrictQualLoc.getRawEncoding();
     I.Ptr.AtomicQualLoc   = AtomicQualLoc.getRawEncoding();
+    I.Ptr.UnalignedQualLoc = UnalignedQualLoc.getRawEncoding();
     I.Ptr.AttrList        = nullptr;
     return I;
   }
@@ -1553,7 +1567,7 @@
     I.Kind          = Pipe;
     I.Loc           = Loc;
     I.Cls.TypeQuals = TypeQuals;
-    I.Cls.AttrList  = 0;
+    I.Cls.AttrList  = nullptr;
     return I;
   }
 
@@ -1586,6 +1600,58 @@
   }
 };
 
+/// A parsed C++17 decomposition declarator of the form
+///   '[' identifier-list ']'
+class DecompositionDeclarator {
+public:
+  struct Binding {
+    IdentifierInfo *Name;
+    SourceLocation NameLoc;
+  };
+
+private:
+  /// The locations of the '[' and ']' tokens.
+  SourceLocation LSquareLoc, RSquareLoc;
+
+  /// The bindings: an array of NumBindings entries.
+  Binding *Bindings;
+  unsigned NumBindings : 31;
+  unsigned DeleteBindings : 1; // Nonzero if Bindings must be delete[]'d here.
+
+  friend class Declarator;
+
+public:
+  DecompositionDeclarator()
+      : Bindings(nullptr), NumBindings(0), DeleteBindings(false) {}
+  DecompositionDeclarator(const DecompositionDeclarator &G) = delete;
+  DecompositionDeclarator &operator=(const DecompositionDeclarator &G) = delete;
+  ~DecompositionDeclarator() {
+    if (DeleteBindings)
+      delete[] Bindings;
+  }
+  /// Reset to the empty state, freeing Bindings if DeleteBindings is set.
+  void clear() {
+    LSquareLoc = RSquareLoc = SourceLocation();
+    if (DeleteBindings)
+      delete[] Bindings;
+    Bindings = nullptr;
+    NumBindings = 0;
+    DeleteBindings = false;
+  }
+  /// The parsed bindings, viewed as an array of NumBindings elements.
+  ArrayRef<Binding> bindings() const {
+    return llvm::makeArrayRef(Bindings, NumBindings);
+  }
+  /// True if a valid '[' location has been recorded for this declarator.
+  bool isSet() const { return LSquareLoc.isValid(); }
+
+  SourceLocation getLSquareLoc() const { return LSquareLoc; }
+  SourceLocation getRSquareLoc() const { return RSquareLoc; }
+  SourceRange getSourceRange() const {
+    return SourceRange(LSquareLoc, RSquareLoc);
+  }
+};
+
 /// \brief Described the kind of function definition (if any) provided for
 /// a function.
 enum FunctionDefinitionKind {
@@ -1619,6 +1685,7 @@
     MemberContext,       // Struct/Union field.
     BlockContext,        // Declaration within a block in a function.
     ForContext,          // Declaration within first part of a for loop.
+    InitStmtContext,     // Declaration within optional init stmt of if/switch.
     ConditionContext,    // Condition declaration in a C++ if/switch/while/for.
     TemplateParamContext,// Within a template parameter list.
     CXXNewContext,       // C++ new-expression.
@@ -1643,6 +1710,9 @@
   /// \brief Where we are parsing this declarator.
   TheContext Context;
 
+  /// The C++17 structured binding, if any. This is an alternative to a Name.
+  DecompositionDeclarator BindingGroup;
+
   /// DeclTypeInfo - This holds each type that the declarator includes as it is
   /// parsed.  This is pushed from the identifier out, which means that element
   /// #0 will be the most closely bound to the identifier, and
@@ -1650,10 +1720,10 @@
   SmallVector<DeclaratorChunk, 8> DeclTypeInfo;
 
   /// InvalidType - Set by Sema::GetTypeForDeclarator().
-  bool InvalidType : 1;
+  unsigned InvalidType : 1;
 
   /// GroupingParens - Set by Parser::ParseParenDeclarator().
-  bool GroupingParens : 1;
+  unsigned GroupingParens : 1;
 
   /// FunctionDefinition - Is this Declarator for a function or member 
   /// definition and, if so, what kind?
@@ -1662,19 +1732,7 @@
   unsigned FunctionDefinition : 2;
 
   /// \brief Is this Declarator a redeclaration?
-  bool Redeclaration : 1;
-
-  /// Attrs - Attributes.
-  ParsedAttributes Attrs;
-
-  /// \brief The asm label, if specified.
-  Expr *AsmLabel;
-
-  /// InlineParams - This is a local array used for the first function decl
-  /// chunk to avoid going to the heap for the common case when we have one
-  /// function chunk in the declarator.
-  DeclaratorChunk::ParamInfo InlineParams[16];
-  bool InlineParamsUsed;
+  unsigned Redeclaration : 1;
 
   /// \brief true if the declaration is preceded by \c __extension__.
   unsigned Extension : 1;
@@ -1685,6 +1743,27 @@
   /// Indicates whether this is an Objective-C 'weak' property.
   unsigned ObjCWeakProperty : 1;
 
+  /// Indicates whether the InlineParams / InlineBindings storage has been used.
+  unsigned InlineStorageUsed : 1;
+
+  /// Attrs - Attributes.
+  ParsedAttributes Attrs;
+
+  /// \brief The asm label, if specified.
+  Expr *AsmLabel;
+
+#ifndef _MSC_VER
+  union {
+#endif
+    /// InlineParams - This is a local array used for the first function decl
+    /// chunk to avoid going to the heap for the common case when we have one
+    /// function chunk in the declarator.
+    DeclaratorChunk::ParamInfo InlineParams[16];
+    DecompositionDeclarator::Binding InlineBindings[16];
+#ifndef _MSC_VER
+  };
+#endif
+
   /// \brief If this is the second or subsequent declarator in this declaration,
   /// the location of the comma before this declarator.
   SourceLocation CommaLoc;
@@ -1697,14 +1776,12 @@
 
 public:
   Declarator(const DeclSpec &ds, TheContext C)
-    : DS(ds), Range(ds.getSourceRange()), Context(C),
-      InvalidType(DS.getTypeSpecType() == DeclSpec::TST_error),
-      GroupingParens(false), FunctionDefinition(FDK_Declaration), 
-      Redeclaration(false),
-      Attrs(ds.getAttributePool().getFactory()), AsmLabel(nullptr),
-      InlineParamsUsed(false), Extension(false), ObjCIvar(false),
-      ObjCWeakProperty(false) {
-  }
+      : DS(ds), Range(ds.getSourceRange()), Context(C),
+        InvalidType(DS.getTypeSpecType() == DeclSpec::TST_error),
+        GroupingParens(false), FunctionDefinition(FDK_Declaration),
+        Redeclaration(false), Extension(false), ObjCIvar(false),
+        ObjCWeakProperty(false), InlineStorageUsed(false),
+        Attrs(ds.getAttributePool().getFactory()), AsmLabel(nullptr) {}
 
   ~Declarator() {
     clear();
@@ -1731,6 +1808,10 @@
 
   /// \brief Retrieve the name specified by this declarator.
   UnqualifiedId &getName() { return Name; }
+
+  const DecompositionDeclarator &getDecompositionDeclarator() const {
+    return BindingGroup;
+  }
   
   TheContext getContext() const { return Context; }
 
@@ -1774,13 +1855,14 @@
     SS.clear();
     Name.clear();
     Range = DS.getSourceRange();
-    
+    BindingGroup.clear();
+
     for (unsigned i = 0, e = DeclTypeInfo.size(); i != e; ++i)
       DeclTypeInfo[i].destroy();
     DeclTypeInfo.clear();
     Attrs.clear();
     AsmLabel = nullptr;
-    InlineParamsUsed = false;
+    InlineStorageUsed = false;
     ObjCIvar = false;
     ObjCWeakProperty = false;
     CommaLoc = SourceLocation();
@@ -1797,6 +1879,7 @@
     case MemberContext:
     case BlockContext:
     case ForContext:
+    case InitStmtContext:
     case ConditionContext:
       return false;
 
@@ -1831,6 +1914,7 @@
     case MemberContext:
     case BlockContext:
     case ForContext:
+    case InitStmtContext:
     case ConditionContext:
     case PrototypeContext:
     case LambdaExprParameterContext:
@@ -1864,6 +1948,7 @@
     case MemberContext:
     case BlockContext:
     case ForContext:
+    case InitStmtContext:
     case ConditionContext:
     case PrototypeContext:
     case LambdaExprParameterContext:
@@ -1888,6 +1973,45 @@
     llvm_unreachable("unknown context kind!");
   }
 
+  /// Return true if the context permits a C++17 decomposition declarator.
+  bool mayHaveDecompositionDeclarator() const {
+    switch (Context) {
+    case FileContext:
+      // FIXME: It's not clear that the proposal meant to allow file-scope
+      // structured bindings, but it does.
+    case BlockContext:
+    case ForContext:
+    case InitStmtContext:
+      return true;
+
+    case ConditionContext:
+    case MemberContext:
+    case PrototypeContext:
+    case TemplateParamContext:
+      // Maybe one day...
+      return false;
+
+    // These contexts don't allow any kind of non-abstract declarator.
+    case KNRTypeListContext:
+    case TypeNameContext:
+    case AliasDeclContext:
+    case AliasTemplateContext:
+    case LambdaExprParameterContext:
+    case ObjCParameterContext:
+    case ObjCResultContext:
+    case CXXNewContext:
+    case CXXCatchContext:
+    case ObjCCatchContext:
+    case BlockLiteralContext:
+    case LambdaExprContext:
+    case ConversionIdContext:
+    case TemplateTypeArgContext:
+    case TrailingReturnContext:
+      return false;
+    }
+    llvm_unreachable("unknown context kind!");
+  }
+
   /// mayBeFollowedByCXXDirectInit - Return true if the declarator can be
   /// followed by a C++ direct initializer, e.g. "int x(1);".
   bool mayBeFollowedByCXXDirectInit() const {
@@ -1908,6 +2032,7 @@
     case FileContext:
     case BlockContext:
     case ForContext:
+    case InitStmtContext:
       return true;
 
     case ConditionContext:
@@ -1940,14 +2065,22 @@
   }
 
   /// isPastIdentifier - Return true if we have parsed beyond the point where
-  /// the
+  /// the name would appear. (This may happen even if we haven't actually parsed
+  /// a name, perhaps because this context doesn't require one.)
   bool isPastIdentifier() const { return Name.isValid(); }
 
   /// hasName - Whether this declarator has a name, which might be an
   /// identifier (accessible via getIdentifier()) or some kind of
-  /// special C++ name (constructor, destructor, etc.).
-  bool hasName() const { 
-    return Name.getKind() != UnqualifiedId::IK_Identifier || Name.Identifier;
+  /// special C++ name (constructor, destructor, etc.), or a structured
+  /// binding (which is not exactly a name, but occupies the same position).
+  bool hasName() const {
+    return Name.getKind() != UnqualifiedId::IK_Identifier || Name.Identifier ||
+           isDecompositionDeclarator();
+  }
+
+  /// Return whether this declarator is a decomposition declarator.
+  bool isDecompositionDeclarator() const {
+    return BindingGroup.isSet();
   }
 
   IdentifierInfo *getIdentifier() const { 
@@ -1962,7 +2095,13 @@
   void SetIdentifier(IdentifierInfo *Id, SourceLocation IdLoc) {
     Name.setIdentifier(Id, IdLoc);
   }
-  
+
+  /// Set the decomposition bindings for this declarator.
+  void
+  setDecompositionBindings(SourceLocation LSquareLoc,
+                           ArrayRef<DecompositionDeclarator::Binding> Bindings,
+                           SourceLocation RSquareLoc);
+
   /// AddTypeInfo - Add a chunk to this declarator. Also extend the range to
   /// EndLoc, which should be the last token of the chunk.
   void AddTypeInfo(const DeclaratorChunk &TI,
@@ -2106,9 +2245,10 @@
     case FileContext:
     case MemberContext:
     case BlockContext:
+    case ForContext:
+    case InitStmtContext:
       return true;
 
-    case ForContext:
     case ConditionContext:
     case KNRTypeListContext:
     case TypeNameContext:
@@ -2259,7 +2399,9 @@
     VS_None = 0,
     VS_Override = 1,
     VS_Final = 2,
-    VS_Sealed = 4
+    VS_Sealed = 4,
+    // Represents the __final keyword, which is legal for gcc in pre-C++11 mode.
+    VS_GNU_Final = 8
   };
 
   VirtSpecifiers() : Specifiers(0), LastSpecifier(VS_None) { }
@@ -2272,7 +2414,7 @@
   bool isOverrideSpecified() const { return Specifiers & VS_Override; }
   SourceLocation getOverrideLoc() const { return VS_overrideLoc; }
 
-  bool isFinalSpecified() const { return Specifiers & (VS_Final | VS_Sealed); }
+  bool isFinalSpecified() const { return Specifiers & (VS_Final | VS_Sealed | VS_GNU_Final); }
   bool isFinalSpelledSealed() const { return Specifiers & VS_Sealed; }
   SourceLocation getFinalLoc() const { return VS_finalLoc; }
 
@@ -2342,4 +2484,4 @@
 
 } // end namespace clang
 
-#endif
+#endif // LLVM_CLANG_SEMA_DECLSPEC_H
diff --git a/include/clang/Sema/DelayedDiagnostic.h b/include/clang/Sema/DelayedDiagnostic.h
index 155b3aa..1d184fb 100644
--- a/include/clang/Sema/DelayedDiagnostic.h
+++ b/include/clang/Sema/DelayedDiagnostic.h
@@ -122,7 +122,7 @@
 
   void Destroy();
 
-  static DelayedDiagnostic makeAvailability(Sema::AvailabilityDiagnostic AD,
+  static DelayedDiagnostic makeAvailability(AvailabilityResult AR,
                                             SourceLocation Loc,
                                             const NamedDecl *D,
                                             const ObjCInterfaceDecl *UnknownObjCClass,
diff --git a/include/clang/Sema/Initialization.h b/include/clang/Sema/Initialization.h
index 1022cc0..09f24cc 100644
--- a/include/clang/Sema/Initialization.h
+++ b/include/clang/Sema/Initialization.h
@@ -20,7 +20,6 @@
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Sema/Overload.h"
 #include "clang/Sema/Ownership.h"
-#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallVector.h"
 #include <cassert>
 
@@ -85,7 +84,10 @@
     EK_RelatedResult,
     /// \brief The entity being initialized is a function parameter; function
     /// is member of group of audited CF APIs.
-    EK_Parameter_CF_Audited
+    EK_Parameter_CF_Audited,
+    /// \brief The entity being initialized is a structured binding of a
+    /// decomposition declaration.
+    EK_Binding,
 
     // Note: err_init_conversion_failed in DiagnosticSemaKinds.td uses this
     // enum as an index for its first %select.  When modifying this list,
@@ -127,9 +129,9 @@
   };
 
   union {
-    /// \brief When Kind == EK_Variable, or EK_Member, the VarDecl or
-    /// FieldDecl, respectively.
-    DeclaratorDecl *VariableOrMember;
+    /// \brief When Kind == EK_Variable, EK_Member or EK_Binding, the VarDecl,
+    /// FieldDecl or BindingDecl, respectively.
+    ValueDecl *VariableOrMember;
     
     /// \brief When Kind == EK_RelatedResult, the ObjectiveC method where
     /// result type was implicitly changed to accommodate ARC semantics.
@@ -161,8 +163,8 @@
   InitializedEntity() : ManglingNumber(0) {}
 
   /// \brief Create the initialization entity for a variable.
-  InitializedEntity(VarDecl *Var)
-    : Kind(EK_Variable), Parent(nullptr), Type(Var->getType()),
+  InitializedEntity(VarDecl *Var, EntityKind EK = EK_Variable)
+    : Kind(EK), Parent(nullptr), Type(Var->getType()),
       ManglingNumber(0), VariableOrMember(Var) { }
   
   /// \brief Create the initialization entity for the result of a
@@ -284,9 +286,10 @@
 
 
   /// \brief Create the initialization entity for a base class subobject.
-  static InitializedEntity InitializeBase(ASTContext &Context,
-                                          const CXXBaseSpecifier *Base,
-                                          bool IsInheritedVirtualBase);
+  static InitializedEntity
+  InitializeBase(ASTContext &Context, const CXXBaseSpecifier *Base,
+                 bool IsInheritedVirtualBase,
+                 const InitializedEntity *Parent = nullptr);
 
   /// \brief Create the initialization entity for a delegated constructor.
   static InitializedEntity InitializeDelegation(QualType Type) {
@@ -314,6 +317,11 @@
     return InitializedEntity(Context, Index, Parent);
   }
 
+  /// \brief Create the initialization entity for a structured binding.
+  static InitializedEntity InitializeBinding(VarDecl *Binding) {
+    return InitializedEntity(Binding, EK_Binding);
+  }
+
   /// \brief Create the initialization entity for a lambda capture.
   static InitializedEntity InitializeLambdaCapture(IdentifierInfo *VarID,
                                                    QualType FieldType,
@@ -355,7 +363,7 @@
 
   /// \brief Retrieve the variable, parameter, or field being
   /// initialized.
-  DeclaratorDecl *getDecl() const;
+  ValueDecl *getDecl() const;
   
   /// \brief Retrieve the ObjectiveC method being initialized.
   ObjCMethodDecl *getMethodDecl() const { return MethodDecl; }
@@ -1045,8 +1053,8 @@
   /// \param FromInitList The constructor call is syntactically an initializer
   /// list.
   /// \param AsInitList The constructor is called as an init list constructor.
-  void AddConstructorInitializationStep(CXXConstructorDecl *Constructor,
-                                        AccessSpecifier Access,
+  void AddConstructorInitializationStep(DeclAccessPair FoundDecl,
+                                        CXXConstructorDecl *Constructor,
                                         QualType T,
                                         bool HadMultipleCandidates,
                                         bool FromInitList, bool AsInitList);
diff --git a/include/clang/Sema/LocInfoType.h b/include/clang/Sema/LocInfoType.h
deleted file mode 100644
index 63dfa72..0000000
--- a/include/clang/Sema/LocInfoType.h
+++ /dev/null
@@ -1,62 +0,0 @@
-//===--- LocInfoType.h - Parsed Type with Location Information---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the LocInfoType class, which holds a type and its
-// source-location information.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_SEMA_LOCINFOTYPE_H
-#define LLVM_CLANG_SEMA_LOCINFOTYPE_H
-
-#include "clang/AST/Type.h"
-
-namespace clang {
-
-class TypeSourceInfo;
-
-/// \brief Holds a QualType and a TypeSourceInfo* that came out of a declarator
-/// parsing.
-///
-/// LocInfoType is a "transient" type, only needed for passing to/from Parser
-/// and Sema, when we want to preserve type source info for a parsed type.
-/// It will not participate in the type system semantics in any way.
-class LocInfoType : public Type {
-  enum {
-    // The last number that can fit in Type's TC.
-    // Avoids conflict with an existing Type class.
-    LocInfo = Type::TypeLast + 1
-  };
-
-  TypeSourceInfo *DeclInfo;
-
-  LocInfoType(QualType ty, TypeSourceInfo *TInfo)
-    : Type((TypeClass)LocInfo, ty, ty->isDependentType(),
-           ty->isInstantiationDependentType(),
-           ty->isVariablyModifiedType(),
-           ty->containsUnexpandedParameterPack()),
-      DeclInfo(TInfo) {
-    assert(getTypeClass() == (TypeClass)LocInfo && "LocInfo didn't fit in TC?");
-  }
-  friend class Sema;
-
-public:
-  QualType getType() const { return getCanonicalTypeInternal(); }
-  TypeSourceInfo *getTypeSourceInfo() const { return DeclInfo; }
-
-  void getAsStringInternal(std::string &Str,
-                           const PrintingPolicy &Policy) const;
-
-  static bool classof(const Type *T) {
-    return T->getTypeClass() == (TypeClass)LocInfo;
-  }
-};
-
-} // end namespace clang
-
-#endif // LLVM_CLANG_SEMA_LOCINFOTYPE_H
diff --git a/include/clang/Sema/Lookup.h b/include/clang/Sema/Lookup.h
index d18dfcb..2ed9548 100644
--- a/include/clang/Sema/Lookup.h
+++ b/include/clang/Sema/Lookup.h
@@ -185,6 +185,49 @@
       Shadowed(false)
   {}
 
+  // FIXME: Remove these deleted methods once the default build includes
+  // -Wdeprecated.
+  LookupResult(const LookupResult &) = delete;
+  LookupResult &operator=(const LookupResult &) = delete;
+
+  LookupResult(LookupResult &&Other)
+      : ResultKind(std::move(Other.ResultKind)),
+        Ambiguity(std::move(Other.Ambiguity)), Decls(std::move(Other.Decls)),
+        Paths(std::move(Other.Paths)),
+        NamingClass(std::move(Other.NamingClass)),
+        BaseObjectType(std::move(Other.BaseObjectType)),
+        SemaPtr(std::move(Other.SemaPtr)), NameInfo(std::move(Other.NameInfo)),
+        NameContextRange(std::move(Other.NameContextRange)),
+        LookupKind(std::move(Other.LookupKind)), IDNS(std::move(Other.IDNS)),
+        Redecl(std::move(Other.Redecl)), HideTags(std::move(Other.HideTags)),
+        Diagnose(std::move(Other.Diagnose)),
+        AllowHidden(std::move(Other.AllowHidden)),
+        Shadowed(std::move(Other.Shadowed)) {
+    Other.Paths = nullptr;
+    Other.Diagnose = false;
+  }
+  LookupResult &operator=(LookupResult &&Other) {
+    ResultKind = std::move(Other.ResultKind);
+    Ambiguity = std::move(Other.Ambiguity);
+    Decls = std::move(Other.Decls);
+    Paths = std::move(Other.Paths);
+    NamingClass = std::move(Other.NamingClass);
+    BaseObjectType = std::move(Other.BaseObjectType);
+    SemaPtr = std::move(Other.SemaPtr);
+    NameInfo = std::move(Other.NameInfo);
+    NameContextRange = std::move(Other.NameContextRange);
+    LookupKind = std::move(Other.LookupKind);
+    IDNS = std::move(Other.IDNS);
+    Redecl = std::move(Other.Redecl);
+    HideTags = std::move(Other.HideTags);
+    Diagnose = std::move(Other.Diagnose);
+    AllowHidden = std::move(Other.AllowHidden);
+    Shadowed = std::move(Other.Shadowed);
+    Other.Paths = nullptr;
+    Other.Diagnose = false;
+    return *this;
+  }
+
   ~LookupResult() {
     if (Diagnose) diagnose();
     if (Paths) deletePaths(Paths);
diff --git a/include/clang/Sema/Makefile b/include/clang/Sema/Makefile
deleted file mode 100644
index 799f789..0000000
--- a/include/clang/Sema/Makefile
+++ /dev/null
@@ -1,39 +0,0 @@
-CLANG_LEVEL := ../../..
-TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
-BUILT_SOURCES = AttrTemplateInstantiate.inc AttrParsedAttrList.inc AttrParsedAttrKinds.inc \
-        AttrSpellingListIndex.inc AttrParsedAttrImpl.inc
-
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-$(ObjDir)/AttrTemplateInstantiate.inc.tmp : $(TD_SRC_DIR)/Attr.td \
-                                            $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute template instantiate code with tablegen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-template-instantiate -o \
-	  $(call SYSPATH, $@) -I $(PROJ_SRC_DIR)/../../ $<
-	  
-$(ObjDir)/AttrParsedAttrList.inc.tmp : $(TD_SRC_DIR)/Attr.td \
-                                       $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang parsed attribute list with tablegen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-parsed-attr-list -o \
-	  $(call SYSPATH, $@) -I $(PROJ_SRC_DIR)/../../ $<
-	  
-$(ObjDir)/AttrParsedAttrKinds.inc.tmp : $(TD_SRC_DIR)/Attr.td \
-                                       $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang parsed attribute kinds with tablegen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-parsed-attr-kinds -o \
-	  $(call SYSPATH, $@) -I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/AttrSpellingListIndex.inc.tmp : $(TD_SRC_DIR)/Attr.td \
-                                       $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang attribute spelling list index with tablegen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-spelling-index -o \
-	  $(call SYSPATH, $@) -I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/AttrParsedAttrImpl.inc.tmp : $(TD_SRC_DIR)/Attr.td \
-                                       $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang parsed attribute list impl with tablegen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-parsed-attr-impl -o \
-	  $(call SYSPATH, $@) -I $(PROJ_SRC_DIR)/../../ $<
-
diff --git a/include/clang/Sema/ObjCMethodList.h b/include/clang/Sema/ObjCMethodList.h
index da59176..80ccd36 100644
--- a/include/clang/Sema/ObjCMethodList.h
+++ b/include/clang/Sema/ObjCMethodList.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_SEMA_OBJCMETHODLIST_H
 #define LLVM_CLANG_SEMA_OBJCMETHODLIST_H
 
+#include "clang/AST/DeclObjC.h"
 #include "llvm/ADT/PointerIntPair.h"
 
 namespace clang {
diff --git a/include/clang/Sema/Overload.h b/include/clang/Sema/Overload.h
index 6243795..ce01ff9 100644
--- a/include/clang/Sema/Overload.h
+++ b/include/clang/Sema/Overload.h
@@ -199,6 +199,7 @@
     /// conversions are either identity conversions or derived-to-base
     /// conversions.
     CXXConstructorDecl *CopyConstructor;
+    DeclAccessPair FoundCopyConstructor;
 
     void setFromType(QualType T) { FromTypePtr = T.getAsOpaquePtr(); }
     void setToType(unsigned Idx, QualType T) { 
@@ -282,7 +283,7 @@
 
   /// Represents an ambiguous user-defined conversion sequence.
   struct AmbiguousConversionSequence {
-    typedef SmallVector<FunctionDecl*, 4> ConversionSet;
+    typedef SmallVector<std::pair<NamedDecl*, FunctionDecl*>, 4> ConversionSet;
 
     void *FromTypePtr;
     void *ToTypePtr;
@@ -305,8 +306,8 @@
       return *reinterpret_cast<const ConversionSet*>(Buffer);
     }
 
-    void addConversion(FunctionDecl *D) {
-      conversions().push_back(D);
+    void addConversion(NamedDecl *Found, FunctionDecl *D) {
+      conversions().push_back(std::make_pair(Found, D));
     }
 
     typedef ConversionSet::iterator iterator;
@@ -396,7 +397,7 @@
 
     /// \brief Whether the target is really a std::initializer_list, and the
     /// sequence only represents the worst element conversion.
-    bool StdInitializerListElement : 1;
+    unsigned StdInitializerListElement : 1;
 
     void setKind(Kind K) {
       destruct();
@@ -427,8 +428,9 @@
     };
 
     ImplicitConversionSequence()
-      : ConversionKind(Uninitialized), StdInitializerListElement(false)
-    {}
+        : ConversionKind(Uninitialized), StdInitializerListElement(false) {
+      Standard.setAsIdentityConversion();
+    }
     ~ImplicitConversionSequence() {
       destruct();
     }
@@ -797,6 +799,30 @@
                                  const OverloadCandidate& Cand2,
                                  SourceLocation Loc,
                                  bool UserDefinedConversion = false);
+
+  struct ConstructorInfo {
+    DeclAccessPair FoundDecl;
+    CXXConstructorDecl *Constructor;
+    FunctionTemplateDecl *ConstructorTmpl;
+    explicit operator bool() const { return Constructor; }
+  };
+  // FIXME: Add an AddOverloadCandidate / AddTemplateOverloadCandidate overload
+  // that takes one of these.
+  inline ConstructorInfo getConstructorInfo(NamedDecl *ND) {
+    if (isa<UsingDecl>(ND))
+      return ConstructorInfo{};
+
+    // For constructors, the access check is performed against the underlying
+    // declaration, not the found declaration.
+    auto *D = ND->getUnderlyingDecl();
+    ConstructorInfo Info = {DeclAccessPair::make(ND, D->getAccess()), nullptr,
+                            nullptr};
+    Info.ConstructorTmpl = dyn_cast<FunctionTemplateDecl>(D);
+    if (Info.ConstructorTmpl)
+      D = Info.ConstructorTmpl->getTemplatedDecl();
+    Info.Constructor = dyn_cast<CXXConstructorDecl>(D);
+    return Info;
+  }
 } // end namespace clang
 
 #endif // LLVM_CLANG_SEMA_OVERLOAD_H
diff --git a/include/clang/Sema/Ownership.h b/include/clang/Sema/Ownership.h
index dfde374..92ea529 100644
--- a/include/clang/Sema/Ownership.h
+++ b/include/clang/Sema/Ownership.h
@@ -14,9 +14,9 @@
 #ifndef LLVM_CLANG_SEMA_OWNERSHIP_H
 #define LLVM_CLANG_SEMA_OWNERSHIP_H
 
+#include "clang/AST/Expr.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/PointerIntPair.h"
 
 //===----------------------------------------------------------------------===//
 // OpaquePtr
diff --git a/include/clang/Sema/ParsedTemplate.h b/include/clang/Sema/ParsedTemplate.h
index b36425f..03de9ff 100644
--- a/include/clang/Sema/ParsedTemplate.h
+++ b/include/clang/Sema/ParsedTemplate.h
@@ -1,4 +1,4 @@
-//===--- ParsedTemplate.h - Template Parsing Data Types -------------------===//
+//===--- ParsedTemplate.h - Template Parsing Data Types ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,12 +11,19 @@
 //  templates.
 //
 //===----------------------------------------------------------------------===//
+
 #ifndef LLVM_CLANG_SEMA_PARSEDTEMPLATE_H
 #define LLVM_CLANG_SEMA_PARSEDTEMPLATE_H
 
+#include "clang/Basic/OperatorKinds.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TemplateKinds.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/Ownership.h"
+#include "llvm/ADT/SmallVector.h"
 #include <cassert>
+#include <cstdlib>
+#include <new>
 
 namespace clang {  
   /// \brief Represents the parsed form of a C++ template argument.
@@ -114,8 +121,8 @@
     KindType Kind;
     
     /// \brief The actual template argument representation, which may be
-    /// an \c ActionBase::TypeTy* (for a type), an Expr* (for an
-    /// expression), or an ActionBase::TemplateTy (for a template).
+    /// a \c Sema::TypeTy* (for a type), an Expr* (for an
+    /// expression), or a Sema::TemplateTy (for a template).
     void *Arg;
 
     /// \brief The nested-name-specifier that can accompany a template template
@@ -209,6 +216,6 @@
   /// Retrieves the range of the given template parameter lists.
   SourceRange getTemplateParamsRange(TemplateParameterList const *const *Params,
                                      unsigned NumParams);  
-}
+} // end namespace clang
 
-#endif
+#endif // LLVM_CLANG_SEMA_PARSEDTEMPLATE_H
diff --git a/include/clang/Sema/Scope.h b/include/clang/Sema/Scope.h
index 03793f1..d0b006b 100644
--- a/include/clang/Sema/Scope.h
+++ b/include/clang/Sema/Scope.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_SEMA_SCOPE_H
 #define LLVM_CLANG_SEMA_SCOPE_H
 
+#include "clang/AST/Decl.h"
 #include "clang/Basic/Diagnostic.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
diff --git a/include/clang/Sema/ScopeInfo.h b/include/clang/Sema/ScopeInfo.h
index ca740ae..b4e09fa 100644
--- a/include/clang/Sema/ScopeInfo.h
+++ b/include/clang/Sema/ScopeInfo.h
@@ -19,6 +19,7 @@
 #include "clang/AST/Type.h"
 #include "clang/Basic/CapturedStmt.h"
 #include "clang/Basic/PartialDiagnostic.h"
+#include "clang/Sema/CleanupInfo.h"
 #include "clang/Sema/Ownership.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
@@ -104,6 +105,16 @@
   /// \brief Whether a statement was dropped because it was invalid.
   bool HasDroppedStmt : 1;
 
+  /// \brief True if current scope is for OpenMP declare reduction combiner.
+  bool HasOMPDeclareReductionCombiner : 1;
+
+  /// \brief Whether there is a fallthrough statement in this function.
+  bool HasFallthroughStmt : 1;
+
+  /// \brief Whether we make reference to a declaration that could be
+  /// unavailable.
+  bool HasPotentialAvailabilityViolations : 1;
+
   /// A flag that is set when parsing a method that must call super's
   /// implementation, such as \c -dealloc, \c -finalize, or any method marked
   /// with \c __attribute__((objc_requires_super)).
@@ -342,6 +353,14 @@
     HasDroppedStmt = true;
   }
 
+  void setHasOMPDeclareReductionCombiner() {
+    HasOMPDeclareReductionCombiner = true;
+  }
+
+  void setHasFallthroughStmt() {
+    HasFallthroughStmt = true;
+  }
+
   void setHasCXXTry(SourceLocation TryLoc) {
     setHasBranchProtectedScope();
     FirstCXXTryLoc = TryLoc;
@@ -364,6 +383,9 @@
       HasBranchIntoScope(false),
       HasIndirectGoto(false),
       HasDroppedStmt(false),
+      HasOMPDeclareReductionCombiner(false),
+      HasFallthroughStmt(false),
+      HasPotentialAvailabilityViolations(false),
       ObjCShouldCallSuper(false),
       ObjCIsDesignatedInit(false),
       ObjCWarnForNoDesignatedInitChain(false),
@@ -405,19 +427,21 @@
     // variables of reference type are captured by reference, and other
     // variables are captured by copy.
     enum CaptureKind {
-      Cap_ByCopy, Cap_ByRef, Cap_Block, Cap_This
+      Cap_ByCopy, Cap_ByRef, Cap_Block, Cap_VLA
     };
-
+    enum {
+      IsNestedCapture = 0x1,
+      IsThisCaptured = 0x2
+    };
     /// The variable being captured (if we are not capturing 'this') and whether
-    /// this is a nested capture.
-    llvm::PointerIntPair<VarDecl*, 1, bool> VarAndNested;
-
+    /// this is a nested capture, and whether we are capturing 'this'.
+    llvm::PointerIntPair<VarDecl*, 2> VarAndNestedAndThis;
     /// Expression to initialize a field of the given type, and the kind of
     /// capture (if this is a capture and not an init-capture). The expression
     /// is only required if we are capturing ByVal and the variable's type has
     /// a non-trivial copy constructor.
     llvm::PointerIntPair<void *, 2, CaptureKind> InitExprAndCaptureKind;
-
+    
     /// \brief The source location at which the first capture occurred.
     SourceLocation Loc;
 
@@ -432,27 +456,28 @@
     Capture(VarDecl *Var, bool Block, bool ByRef, bool IsNested,
             SourceLocation Loc, SourceLocation EllipsisLoc,
             QualType CaptureType, Expr *Cpy)
-        : VarAndNested(Var, IsNested),
-          InitExprAndCaptureKind(Cpy, Block ? Cap_Block :
-                                      ByRef ? Cap_ByRef : Cap_ByCopy),
+        : VarAndNestedAndThis(Var, IsNested ? IsNestedCapture : 0),
+          InitExprAndCaptureKind(
+              Cpy, !Var ? Cap_VLA : Block ? Cap_Block : ByRef ? Cap_ByRef
+                                                              : Cap_ByCopy),
           Loc(Loc), EllipsisLoc(EllipsisLoc), CaptureType(CaptureType) {}
 
     enum IsThisCapture { ThisCapture };
     Capture(IsThisCapture, bool IsNested, SourceLocation Loc,
-            QualType CaptureType, Expr *Cpy)
-        : VarAndNested(nullptr, IsNested),
-          InitExprAndCaptureKind(Cpy, Cap_This),
+            QualType CaptureType, Expr *Cpy, const bool ByCopy)
+        : VarAndNestedAndThis(
+              nullptr, (IsThisCaptured | (IsNested ? IsNestedCapture : 0))),
+          InitExprAndCaptureKind(Cpy, ByCopy ? Cap_ByCopy : Cap_ByRef),
           Loc(Loc), EllipsisLoc(), CaptureType(CaptureType) {}
 
     bool isThisCapture() const {
-      return InitExprAndCaptureKind.getInt() == Cap_This;
+      return VarAndNestedAndThis.getInt() & IsThisCaptured;
     }
     bool isVariableCapture() const {
-      return InitExprAndCaptureKind.getInt() != Cap_This && !isVLATypeCapture();
+      return !isThisCapture() && !isVLATypeCapture();
     }
     bool isCopyCapture() const {
-      return InitExprAndCaptureKind.getInt() == Cap_ByCopy &&
-             !isVLATypeCapture();
+      return InitExprAndCaptureKind.getInt() == Cap_ByCopy;
     }
     bool isReferenceCapture() const {
       return InitExprAndCaptureKind.getInt() == Cap_ByRef;
@@ -461,13 +486,14 @@
       return InitExprAndCaptureKind.getInt() == Cap_Block;
     }
     bool isVLATypeCapture() const {
-      return InitExprAndCaptureKind.getInt() == Cap_ByCopy &&
-             getVariable() == nullptr;
+      return InitExprAndCaptureKind.getInt() == Cap_VLA;
     }
-    bool isNested() const { return VarAndNested.getInt(); }
+    bool isNested() const {
+      return VarAndNestedAndThis.getInt() & IsNestedCapture;
+    }
 
     VarDecl *getVariable() const {
-      return VarAndNested.getPointer();
+      return VarAndNestedAndThis.getPointer();
     }
     
     /// \brief Retrieve the location at which this variable was captured.
@@ -480,8 +506,11 @@
     /// \brief Retrieve the capture type for this capture, which is effectively
     /// the type of the non-static data member in the lambda/block structure
     /// that would store this capture.
-    QualType getCaptureType() const { return CaptureType; }
-    
+    QualType getCaptureType() const {
+      assert(!isThisCapture());
+      return CaptureType;
+    }
+
     Expr *getInitExpr() const {
       assert(!isVLATypeCapture() && "no init expression for type capture");
       return static_cast<Expr *>(InitExprAndCaptureKind.getPointer());
@@ -526,8 +555,11 @@
                                /*Cpy*/ nullptr));
   }
 
-  void addThisCapture(bool isNested, SourceLocation Loc, QualType CaptureType,
-                      Expr *Cpy);
+  // Note, we do not need to add the type of 'this' since that is always
+  // retrievable from Sema::getCurrentThisType - and is also encoded within the
+  // type of the corresponding FieldDecl.
+  void addThisCapture(bool isNested, SourceLocation Loc,
+                      Expr *Cpy, bool ByCopy);
 
   /// \brief Determine whether the C++ 'this' is captured.
   bool isCXXThisCaptured() const { return CXXThisCaptureIndex != 0; }
@@ -605,14 +637,15 @@
   /// \brief The implicit parameter for the captured variables.
   ImplicitParamDecl *ContextParam;
   /// \brief The kind of captured region.
-  CapturedRegionKind CapRegionKind;
+  unsigned short CapRegionKind;
+  unsigned short OpenMPLevel;
 
   CapturedRegionScopeInfo(DiagnosticsEngine &Diag, Scope *S, CapturedDecl *CD,
                           RecordDecl *RD, ImplicitParamDecl *Context,
-                          CapturedRegionKind K)
+                          CapturedRegionKind K, unsigned OpenMPLevel)
     : CapturingScopeInfo(Diag, ImpCap_CapturedRegion),
       TheCapturedDecl(CD), TheRecordDecl(RD), TheScope(S),
-      ContextParam(Context), CapRegionKind(K)
+      ContextParam(Context), CapRegionKind(K), OpenMPLevel(OpenMPLevel)
   {
     Kind = SK_CapturedRegion;
   }
@@ -661,7 +694,7 @@
   bool ExplicitParams;
 
   /// \brief Whether any of the capture expressions requires cleanups.
-  bool ExprNeedsCleanups;
+  CleanupInfo Cleanup;
 
   /// \brief Whether the lambda contains an unexpanded parameter pack.
   bool ContainsUnexpandedParameterPack;
@@ -709,7 +742,7 @@
   LambdaScopeInfo(DiagnosticsEngine &Diag)
     : CapturingScopeInfo(Diag, ImpCap_None), Lambda(nullptr),
       CallOperator(nullptr), NumExplicitCaptures(0), Mutable(false),
-      ExplicitParams(false), ExprNeedsCleanups(false),
+      ExplicitParams(false), Cleanup{},
       ContainsUnexpandedParameterPack(false), AutoTemplateParameterDepth(0),
       GLTemplateParameterList(nullptr) {
     Kind = SK_Lambda;
@@ -847,9 +880,10 @@
 
 inline void
 CapturingScopeInfo::addThisCapture(bool isNested, SourceLocation Loc,
-                                   QualType CaptureType, Expr *Cpy) {
-  Captures.push_back(Capture(Capture::ThisCapture, isNested, Loc, CaptureType,
-                             Cpy));
+                                   Expr *Cpy,
+                                   const bool ByCopy) {
+  Captures.push_back(Capture(Capture::ThisCapture, isNested, Loc, QualType(),
+                             Cpy, ByCopy));
   CXXThisCaptureIndex = Captures.size();
 }
 
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index ecbc305..4943d40 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -16,10 +16,12 @@
 #define LLVM_CLANG_SEMA_SEMA_H
 
 #include "clang/AST/Attr.h"
+#include "clang/AST/Availability.h"
 #include "clang/AST/DeclarationName.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprObjC.h"
 #include "clang/AST/ExternalASTSource.h"
+#include "clang/AST/LocInfoType.h"
 #include "clang/AST/MangleNumberingContext.h"
 #include "clang/AST/NSAPI.h"
 #include "clang/AST/PrettyPrinter.h"
@@ -29,14 +31,15 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/OpenMPKinds.h"
+#include "clang/Basic/PragmaKinds.h"
 #include "clang/Basic/Specifiers.h"
 #include "clang/Basic/TemplateKinds.h"
 #include "clang/Basic/TypeTraits.h"
 #include "clang/Sema/AnalysisBasedWarnings.h"
+#include "clang/Sema/CleanupInfo.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/ExternalSemaSource.h"
 #include "clang/Sema/IdentifierResolver.h"
-#include "clang/Sema/LocInfoType.h"
 #include "clang/Sema/ObjCMethodList.h"
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Scope.h"
@@ -71,6 +74,7 @@
   class ASTWriter;
   class ArrayType;
   class AttributeList;
+  class BindingDecl;
   class BlockDecl;
   class CapturedDecl;
   class CXXBasePath;
@@ -109,7 +113,6 @@
   class EnumConstantDecl;
   class Expr;
   class ExtVectorType;
-  class ExternalSemaSource;
   class FormatAttr;
   class FriendDecl;
   class FunctionDecl;
@@ -145,6 +148,8 @@
   class ObjCPropertyDecl;
   class ObjCProtocolDecl;
   class OMPThreadPrivateDecl;
+  class OMPDeclareReductionDecl;
+  class OMPDeclareSimdDecl;
   class OMPClause;
   struct OverloadCandidate;
   class OverloadCandidateSet;
@@ -316,50 +321,28 @@
   /// This is used as part of a hack to omit that class from ADL results.
   DeclarationName VAListTagName;
 
-  /// PackContext - Manages the stack for \#pragma pack. An alignment
-  /// of 0 indicates default alignment.
-  void *PackContext; // Really a "PragmaPackStack*"
-
   bool MSStructPragmaOn; // True when \#pragma ms_struct on
 
   /// \brief Controls member pointer representation format under the MS ABI.
   LangOptions::PragmaMSPointersToMembersKind
       MSPointerToMemberRepresentationMethod;
 
-  enum PragmaVtorDispKind {
-    PVDK_Push,          ///< #pragma vtordisp(push, mode)
-    PVDK_Set,           ///< #pragma vtordisp(mode)
-    PVDK_Pop,           ///< #pragma vtordisp(pop)
-    PVDK_Reset          ///< #pragma vtordisp()
-  };
-
-  enum PragmaMsStackAction {
-    PSK_Reset,    // #pragma ()
-    PSK_Set,      // #pragma ("name")
-    PSK_Push,     // #pragma (push[, id])
-    PSK_Push_Set, // #pragma (push[, id], "name")
-    PSK_Pop,      // #pragma (pop[, id])
-    PSK_Pop_Set,  // #pragma (pop[, id], "name")
-  };
-
-  /// \brief Whether to insert vtordisps prior to virtual bases in the Microsoft
-  /// C++ ABI.  Possible values are 0, 1, and 2, which mean:
-  ///
-  /// 0: Suppress all vtordisps
-  /// 1: Insert vtordisps in the presence of vbase overrides and non-trivial
-  ///    structors
-  /// 2: Always insert vtordisps to support RTTI on partially constructed
-  ///    objects
-  ///
-  /// The stack always has at least one element in it.
-  SmallVector<MSVtorDispAttr::Mode, 2> VtorDispModeStack;
-
   /// Stack of active SEH __finally scopes.  Can be empty.
   SmallVector<Scope*, 2> CurrentSEHFinally;
 
   /// \brief Source location for newly created implicit MSInheritanceAttrs
   SourceLocation ImplicitMSInheritanceAttrLoc;
 
+  enum PragmaMsStackAction {
+    PSK_Reset     = 0x0,                // #pragma ()
+    PSK_Set       = 0x1,                // #pragma (value)
+    PSK_Push      = 0x2,                // #pragma (push[, id])
+    PSK_Pop       = 0x4,                // #pragma (pop[, id])
+    PSK_Show      = 0x8,                // #pragma (show) -- only for "pack"!
+    PSK_Push_Set  = PSK_Push | PSK_Set, // #pragma (push[, id], value)
+    PSK_Pop_Set   = PSK_Pop | PSK_Set,  // #pragma (pop[, id], value)
+  };
+
   template<typename ValueType>
   struct PragmaStack {
     struct Slot {
@@ -376,19 +359,71 @@
              PragmaMsStackAction Action,
              llvm::StringRef StackSlotLabel,
              ValueType Value);
-    explicit PragmaStack(const ValueType &Value)
-      : CurrentValue(Value) {}
+
+    // MSVC seems to add artificial slots to #pragma stacks on entering a C++
+    // method body to restore the stacks on exit, so it works like this:
+    //
+    //   struct S {
+    //     #pragma <name>(push, InternalPragmaSlot, <current_pragma_value>)
+    //     void Method() {}
+    //     #pragma <name>(pop, InternalPragmaSlot)
+    //   };
+    //
+    // It works even with #pragma vtordisp, although MSVC doesn't support
+    //   #pragma vtordisp(push [, id], n)
+    // syntax.
+    //
+    // Push / pop a named sentinel slot.
+    void SentinelAction(PragmaMsStackAction Action, StringRef Label) {
+      assert((Action == PSK_Push || Action == PSK_Pop) &&
+             "Can only push / pop #pragma stack sentinels!");
+      Act(CurrentPragmaLocation, Action, Label, CurrentValue);
+    }
+
+    // Constructors.
+    explicit PragmaStack(const ValueType &Default)
+        : DefaultValue(Default), CurrentValue(Default) {}
+
     SmallVector<Slot, 2> Stack;
+    ValueType DefaultValue; // Value used for PSK_Reset action.
     ValueType CurrentValue;
     SourceLocation CurrentPragmaLocation;
   };
   // FIXME: We should serialize / deserialize these if they occur in a PCH (but
   // we shouldn't do so if they're in a module).
+
+  /// \brief Whether to insert vtordisps prior to virtual bases in the Microsoft
+  /// C++ ABI.  Possible values are 0, 1, and 2, which mean:
+  ///
+  /// 0: Suppress all vtordisps
+  /// 1: Insert vtordisps in the presence of vbase overrides and non-trivial
+  ///    structors
+  /// 2: Always insert vtordisps to support RTTI on partially constructed
+  ///    objects
+  PragmaStack<MSVtorDispAttr::Mode> VtorDispStack;
+  // #pragma pack.
+  // Sentinel to represent when the stack is set to mac68k alignment.
+  static const unsigned kMac68kAlignmentSentinel = ~0U;
+  PragmaStack<unsigned> PackStack;
+  // Segment #pragmas.
   PragmaStack<StringLiteral *> DataSegStack;
   PragmaStack<StringLiteral *> BSSSegStack;
   PragmaStack<StringLiteral *> ConstSegStack;
   PragmaStack<StringLiteral *> CodeSegStack;
 
+  // RAII object to push / pop sentinel slots for all MS #pragma stacks.
+  // Actions should be performed only if we enter / exit a C++ method body.
+  class PragmaStackSentinelRAII {
+  public:
+    PragmaStackSentinelRAII(Sema &S, StringRef SlotLabel, bool ShouldAct);
+    ~PragmaStackSentinelRAII();
+
+  private:
+    Sema &S;
+    StringRef SlotLabel;
+    bool ShouldAct;
+  };
+
   /// A mapping that describes the nullability we've seen in each header file.
   FileNullabilityMap NullabilityMap;
 
@@ -410,9 +445,8 @@
   /// if Sema is already doing so, which would cause infinite recursions.
   bool IsBuildingRecoveryCallExpr;
 
-  /// ExprNeedsCleanups - True if the current evaluation context
-  /// requires cleanups to be run at its conclusion.
-  bool ExprNeedsCleanups;
+  /// Used to control the generation of ExprWithCleanups.
+  CleanupInfo Cleanup;
 
   /// ExprCleanupObjects - This is the stack of objects requiring
   /// cleanup that are created by the current full expression.  The
@@ -767,6 +801,11 @@
     /// run time.
     Unevaluated,
 
+    /// \brief The current expression occurs within a discarded statement.
+    /// This behaves largely similarly to an unevaluated operand in preventing
+    /// definitions from being required, but not in other ways.
+    DiscardedStatement,
+
     /// \brief The current expression occurs within an unevaluated
     /// operand that unconditionally permits abstract references to
     /// fields, such as a SIZE operator in MS-style inline assembly.
@@ -800,7 +839,7 @@
     ExpressionEvaluationContext Context;
 
     /// \brief Whether the enclosing context needed a cleanup.
-    bool ParentNeedsCleanups;
+    CleanupInfo ParentCleanup;
 
     /// \brief Whether we are in a decltype expression.
     bool IsDecltype;
@@ -841,10 +880,10 @@
 
     ExpressionEvaluationContextRecord(ExpressionEvaluationContext Context,
                                       unsigned NumCleanupObjects,
-                                      bool ParentNeedsCleanups,
+                                      CleanupInfo ParentCleanup,
                                       Decl *ManglingContextDecl,
                                       bool IsDecltype)
-      : Context(Context), ParentNeedsCleanups(ParentNeedsCleanups),
+      : Context(Context), ParentCleanup(ParentCleanup),
         IsDecltype(IsDecltype), NumCleanupObjects(NumCleanupObjects),
         NumTypos(0),
         ManglingContextDecl(ManglingContextDecl), MangleNumbering() { }
@@ -1011,24 +1050,6 @@
     bool OldFPContractState : 1;
   };
 
-  /// Records and restores the vtordisp state on entry/exit of C++ method body.
-  class VtorDispStackRAII {
-  public:
-    VtorDispStackRAII(Sema &S, bool ShouldSaveAndRestore)
-      : S(S), ShouldSaveAndRestore(ShouldSaveAndRestore), OldVtorDispStack() {
-      if (ShouldSaveAndRestore)
-        OldVtorDispStack = S.VtorDispModeStack;
-    }
-    ~VtorDispStackRAII() {
-      if (ShouldSaveAndRestore)
-        S.VtorDispModeStack = OldVtorDispStack;
-    }
-  private:
-    Sema &S;
-    bool ShouldSaveAndRestore;
-    SmallVector<MSVtorDispAttr::Mode, 2> OldVtorDispStack;
-  };
-
   void addImplicitTypedef(StringRef Name, QualType T);
 
 public:
@@ -1390,8 +1411,14 @@
   bool RequireCompleteTypeImpl(SourceLocation Loc, QualType T,
                                TypeDiagnoser *Diagnoser);
 
+  struct ModuleScope {
+    clang::Module *Module;
+    VisibleModuleSet OuterVisibleModules;
+  };
+  /// The modules we're currently parsing.
+  llvm::SmallVector<ModuleScope, 16> ModuleScopes;
+
   VisibleModuleSet VisibleModules;
-  llvm::SmallVector<VisibleModuleSet, 16> VisibleModulesStack;
 
   Module *CachedFakeTopLevelModule;
 
@@ -1409,6 +1436,16 @@
   bool isVisible(const NamedDecl *D) {
     return !D->isHidden() || isVisibleSlow(D);
   }
+
+  /// Determine whether any declaration of an entity is visible.
+  bool
+  hasVisibleDeclaration(const NamedDecl *D,
+                        llvm::SmallVectorImpl<Module *> *Modules = nullptr) {
+    return isVisible(D) || hasVisibleDeclarationSlow(D, Modules);
+  }
+  bool hasVisibleDeclarationSlow(const NamedDecl *D,
+                                 llvm::SmallVectorImpl<Module *> *Modules);
+
   bool hasVisibleMergedDefinition(NamedDecl *Def);
 
   /// Determine if \p D has a visible definition. If not, suggest a declaration
@@ -1425,6 +1462,11 @@
   hasVisibleDefaultArgument(const NamedDecl *D,
                             llvm::SmallVectorImpl<Module *> *Modules = nullptr);
 
+  /// Determine if there is a visible declaration of \p D that is a member
+  /// specialization declaration (as opposed to an instantiated declaration).
+  bool hasVisibleMemberSpecialization(
+      const NamedDecl *D, llvm::SmallVectorImpl<Module *> *Modules = nullptr);
+
   /// Determine if \p A and \p B are equivalent internal linkage declarations
   /// from different modules, and thus an ambiguity error can be downgraded to
   /// an extension warning.
@@ -1520,12 +1562,13 @@
                                ParsedType &SuggestedType,
                                bool AllowClassTemplates = false);
 
-  /// \brief For compatibility with MSVC, we delay parsing of some default
-  /// template type arguments until instantiation time.  Emits a warning and
-  /// returns a synthesized DependentNameType that isn't really dependent on any
-  /// other template arguments.
-  ParsedType ActOnDelayedDefaultTemplateArg(const IdentifierInfo &II,
-                                            SourceLocation NameLoc);
+  /// Attempt to behave like MSVC in situations where lookup of an unqualified
+  /// type name has failed in a dependent context. In these situations, we
+  /// automatically form a DependentNameType that will retry lookup in a related
+  /// scope during instantiation.
+  ParsedType ActOnMSVCUnknownTypeName(const IdentifierInfo &II,
+                                      SourceLocation NameLoc,
+                                      bool IsTemplateTypeArg);
 
   /// \brief Describes the result of the name lookup and resolution performed
   /// by \c ClassifyName().
@@ -1665,12 +1708,24 @@
                             SourceLocation ConstQualLoc = SourceLocation(),
                             SourceLocation VolatileQualLoc = SourceLocation(),
                             SourceLocation RestrictQualLoc = SourceLocation(),
-                            SourceLocation AtomicQualLoc = SourceLocation());
+                            SourceLocation AtomicQualLoc = SourceLocation(),
+                            SourceLocation UnalignedQualLoc = SourceLocation());
 
   static bool adjustContextForLocalExternDecl(DeclContext *&DC);
   void DiagnoseFunctionSpecifiers(const DeclSpec &DS);
   void CheckShadow(Scope *S, VarDecl *D, const LookupResult& R);
   void CheckShadow(Scope *S, VarDecl *D);
+
+  /// Warn if 'E', which is an expression that is about to be modified, refers
+  /// to a shadowing declaration.
+  void CheckShadowingDeclModification(Expr *E, SourceLocation Loc);
+
+private:
+  /// Map of current shadowing declarations to shadowed declarations. Warn if
+  /// it looks like the user is trying to modify the shadowing declaration.
+  llvm::DenseMap<const NamedDecl *, const NamedDecl *> ShadowingDecls;
+
+public:
   void CheckCastAlign(Expr *Op, QualType T, SourceRange TRange);
   void handleTagNumbering(const TagDecl *Tag, Scope *TagScope);
   void setTagNameForLinkagePurposes(TagDecl *TagFromDeclSpec,
@@ -1685,11 +1740,16 @@
                                      TypeSourceInfo *TInfo,
                                      LookupResult &Previous,
                                      MultiTemplateParamsArg TemplateParamLists,
-                                     bool &AddToScope);
+                                     bool &AddToScope,
+                                     ArrayRef<BindingDecl *> Bindings = None);
+  NamedDecl *
+  ActOnDecompositionDeclarator(Scope *S, Declarator &D,
+                               MultiTemplateParamsArg TemplateParamLists);
   // Returns true if the variable declaration is a redeclaration
   bool CheckVariableDeclaration(VarDecl *NewVD, LookupResult &Previous);
   void CheckVariableDeclarationType(VarDecl *NewVD);
-  void CheckCompleteVariableDeclaration(VarDecl *var);
+  void CheckCompleteVariableDeclaration(VarDecl *VD);
+  void CheckCompleteDecompositionDeclaration(DecompositionDecl *DD);
   void MaybeSuggestAddingStaticToDecl(const FunctionDecl *D);
 
   NamedDecl* ActOnFunctionDeclarator(Scope* S, Declarator& D, DeclContext* DC,
@@ -1792,7 +1852,7 @@
   Decl *ActOnFinishFunctionBody(Decl *Decl, Stmt *Body);
   Decl *ActOnFinishFunctionBody(Decl *Decl, Stmt *Body, bool IsInstantiation);
   Decl *ActOnSkippedFunctionBody(Decl *Decl);
-  void ActOnFinishInlineMethodDef(CXXMethodDecl *D);
+  void ActOnFinishInlineFunctionDef(FunctionDecl *D);
 
   /// ActOnFinishDelayedAttribute - Invoked when we have finished parsing an
   /// attribute for which parsing is delayed.
@@ -1800,16 +1860,14 @@
 
   /// \brief Diagnose any unused parameters in the given sequence of
   /// ParmVarDecl pointers.
-  void DiagnoseUnusedParameters(ParmVarDecl * const *Begin,
-                                ParmVarDecl * const *End);
+  void DiagnoseUnusedParameters(ArrayRef<ParmVarDecl *> Parameters);
 
   /// \brief Diagnose whether the size of parameters or return value of a
   /// function or obj-c method definition is pass-by-value and larger than a
   /// specified threshold.
-  void DiagnoseSizeOfParametersAndReturnValue(ParmVarDecl * const *Begin,
-                                              ParmVarDecl * const *End,
-                                              QualType ReturnTy,
-                                              NamedDecl *D);
+  void
+  DiagnoseSizeOfParametersAndReturnValue(ArrayRef<ParmVarDecl *> Parameters,
+                                         QualType ReturnTy, NamedDecl *D);
 
   void DiagnoseInvalidJumps(Stmt *Body);
   Decl *ActOnFileScopeAsmDecl(Expr *expr,
@@ -1858,17 +1916,30 @@
   enum class MissingImportKind {
     Declaration,
     Definition,
-    DefaultArgument
+    DefaultArgument,
+    ExplicitSpecialization,
+    PartialSpecialization
   };
 
   /// \brief Diagnose that the specified declaration needs to be visible but
   /// isn't, and suggest a module import that would resolve the problem.
   void diagnoseMissingImport(SourceLocation Loc, NamedDecl *Decl,
-                             bool NeedDefinition, bool Recover = true);
+                             MissingImportKind MIK, bool Recover = true);
   void diagnoseMissingImport(SourceLocation Loc, NamedDecl *Decl,
                              SourceLocation DeclLoc, ArrayRef<Module *> Modules,
                              MissingImportKind MIK, bool Recover);
 
+  /// \brief We've found a use of a templated declaration that would trigger an
+  /// implicit instantiation. Check that any relevant explicit specializations
+  /// and partial specializations are visible, and diagnose if not.
+  void checkSpecializationVisibility(SourceLocation Loc, NamedDecl *Spec);
+
+  /// \brief We've found a use of a template specialization that would select a
+  /// partial specialization. Check that the partial specialization is visible,
+  /// and diagnose if not.
+  void checkPartialSpecializationVisibility(SourceLocation Loc,
+                                            NamedDecl *Spec);
+
   /// \brief Retrieve a suitable printing policy.
   PrintingPolicy getPrintingPolicy() const {
     return getPrintingPolicy(Context, PP);
@@ -1882,12 +1953,12 @@
   void ActOnPopScope(SourceLocation Loc, Scope *S);
   void ActOnTranslationUnitScope(Scope *S);
 
-  Decl *ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS,
-                                   DeclSpec &DS);
-  Decl *ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS,
-                                   DeclSpec &DS,
+  Decl *ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS, DeclSpec &DS,
+                                   RecordDecl *&AnonRecord);
+  Decl *ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS, DeclSpec &DS,
                                    MultiTemplateParamsArg TemplateParams,
-                                   bool IsExplicitInstantiation = false);
+                                   bool IsExplicitInstantiation,
+                                   RecordDecl *&AnonRecord);
 
   Decl *BuildAnonymousStructOrUnion(Scope *S, DeclSpec &DS,
                                     AccessSpecifier AS,
@@ -2214,7 +2285,8 @@
                              const LookupResult &OldDecls,
                              NamedDecl *&OldDecl,
                              bool IsForUsingDecl);
-  bool IsOverload(FunctionDecl *New, FunctionDecl *Old, bool IsForUsingDecl);
+  bool IsOverload(FunctionDecl *New, FunctionDecl *Old, bool IsForUsingDecl,
+                  bool ConsiderCudaAttrs = true);
 
   /// \brief Checks availability of the function depending on the current
   /// function context.Inside an unavailable function,unavailability is ignored.
@@ -2296,7 +2368,8 @@
     CCEK_CaseValue,   ///< Expression in a case label.
     CCEK_Enumerator,  ///< Enumerator value with fixed underlying type.
     CCEK_TemplateArg, ///< Value of a non-type template parameter.
-    CCEK_NewExpr      ///< Constant expression in a noptr-new-declarator.
+    CCEK_NewExpr,     ///< Constant expression in a noptr-new-declarator.
+    CCEK_ConstexprIf  ///< Condition in a constexpr if statement.
   };
   ExprResult CheckConvertedConstantExpression(Expr *From, QualType T,
                                               llvm::APSInt &Value, CCEKind CCE);
@@ -2493,7 +2566,8 @@
                                             bool PartialOverloading = false);
 
   // Emit as a 'note' the specific overload candidate
-  void NoteOverloadCandidate(FunctionDecl *Fn, QualType DestType = QualType(),
+  void NoteOverloadCandidate(NamedDecl *Found, FunctionDecl *Fn,
+                             QualType DestType = QualType(),
                              bool TakingAddress = false);
 
   // Emit as a series of 'note's all template and non-templates identified by
@@ -2533,6 +2607,8 @@
   resolveAddressOfOnlyViableOverloadCandidate(Expr *E,
                                               DeclAccessPair &FoundResult);
 
+  bool resolveAndFixAddressOfOnlyViableOverloadCandidate(ExprResult &SrcExpr);
+
   FunctionDecl *
   ResolveSingleFunctionTemplateSpecialization(OverloadExpr *ovl,
                                               bool Complain = false,
@@ -2623,8 +2699,7 @@
                            CallExpr *CE, FunctionDecl *FD);
 
   /// Helpers for dealing with blocks and functions.
-  bool CheckParmsForFunctionDef(ParmVarDecl *const *Param,
-                                ParmVarDecl *const *ParamEnd,
+  bool CheckParmsForFunctionDef(ArrayRef<ParmVarDecl *> Parameters,
                                 bool CheckParameterNames);
   void CheckCXXDefaultArguments(FunctionDecl *FD);
   void CheckExtraCXXDefaultArguments(Declarator &D);
@@ -2699,6 +2774,8 @@
     LookupObjCProtocolName,
     /// Look up implicit 'self' parameter of an objective-c method.
     LookupObjCImplicitSelfParam,
+    /// \brief Look up the name of an OpenMP user-defined reduction operation.
+    LookupOMPReductionName,
     /// \brief Look up any declaration with any name.
     LookupAnyName
   };
@@ -3267,6 +3344,7 @@
 public:
   class FullExprArg {
   public:
+    FullExprArg() : E(nullptr) { }
     FullExprArg(Sema &actions) : E(nullptr) { }
 
     ExprResult release() {
@@ -3360,27 +3438,30 @@
                                  ArrayRef<const Attr*> Attrs,
                                  Stmt *SubStmt);
 
-  StmtResult ActOnIfStmt(SourceLocation IfLoc,
-                         FullExprArg CondVal, Decl *CondVar,
-                         Stmt *ThenVal,
+  class ConditionResult;
+  StmtResult ActOnIfStmt(SourceLocation IfLoc, bool IsConstexpr,
+                         Stmt *InitStmt,
+                         ConditionResult Cond, Stmt *ThenVal,
+                         SourceLocation ElseLoc, Stmt *ElseVal);
+  StmtResult BuildIfStmt(SourceLocation IfLoc, bool IsConstexpr,
+                         Stmt *InitStmt,
+                         ConditionResult Cond, Stmt *ThenVal,
                          SourceLocation ElseLoc, Stmt *ElseVal);
   StmtResult ActOnStartOfSwitchStmt(SourceLocation SwitchLoc,
-                                            Expr *Cond,
-                                            Decl *CondVar);
+                                    Stmt *InitStmt,
+                                    ConditionResult Cond);
   StmtResult ActOnFinishSwitchStmt(SourceLocation SwitchLoc,
                                            Stmt *Switch, Stmt *Body);
-  StmtResult ActOnWhileStmt(SourceLocation WhileLoc,
-                            FullExprArg Cond,
-                            Decl *CondVar, Stmt *Body);
+  StmtResult ActOnWhileStmt(SourceLocation WhileLoc, ConditionResult Cond,
+                            Stmt *Body);
   StmtResult ActOnDoStmt(SourceLocation DoLoc, Stmt *Body,
-                                 SourceLocation WhileLoc,
-                                 SourceLocation CondLParen, Expr *Cond,
-                                 SourceLocation CondRParen);
+                         SourceLocation WhileLoc, SourceLocation CondLParen,
+                         Expr *Cond, SourceLocation CondRParen);
 
   StmtResult ActOnForStmt(SourceLocation ForLoc,
                           SourceLocation LParenLoc,
-                          Stmt *First, FullExprArg Second,
-                          Decl *SecondVar,
+                          Stmt *First,
+                          ConditionResult Second,
                           FullExprArg Third,
                           SourceLocation RParenLoc,
                           Stmt *Body);
@@ -3411,7 +3492,7 @@
   StmtResult BuildCXXForRangeStmt(SourceLocation ForLoc,
                                   SourceLocation CoawaitLoc,
                                   SourceLocation ColonLoc,
-                                  Stmt *RangeDecl, Stmt *BeginEndDecl,
+                                  Stmt *RangeDecl, Stmt *Begin, Stmt *End,
                                   Expr *Cond, Expr *Inc,
                                   Stmt *LoopVarDecl,
                                   SourceLocation RParenLoc,
@@ -3579,18 +3660,18 @@
 
   void redelayDiagnostics(sema::DelayedDiagnosticPool &pool);
 
-  enum AvailabilityDiagnostic { AD_Deprecation, AD_Unavailable, AD_Partial };
-
-  void EmitAvailabilityWarning(AvailabilityDiagnostic AD,
-                               NamedDecl *D, StringRef Message,
-                               SourceLocation Loc,
+  void EmitAvailabilityWarning(AvailabilityResult AR, NamedDecl *D,
+                               StringRef Message, SourceLocation Loc,
                                const ObjCInterfaceDecl *UnknownObjCClass,
-                               const ObjCPropertyDecl  *ObjCProperty,
+                               const ObjCPropertyDecl *ObjCProperty,
                                bool ObjCPropertyAccess);
 
   bool makeUnavailableInSystemHeader(SourceLocation loc,
                                      UnavailableAttr::ImplicitReason reason);
 
+  /// \brief Issue any -Wunguarded-availability warnings in \c FD
+  void DiagnoseUnguardedAvailabilityViolations(Decl *FD);
+
   //===--------------------------------------------------------------------===//
   // Expression Parsing Callbacks: SemaExpr.cpp.
 
@@ -3599,6 +3680,7 @@
                          const ObjCInterfaceDecl *UnknownObjCClass=nullptr,
                          bool ObjCPropertyAccess=false);
   void NoteDeletedFunction(FunctionDecl *FD);
+  void NoteDeletedInheritingConstructor(CXXConstructorDecl *CD);
   std::string getDeletedOrUnavailableSuffix(const FunctionDecl *FD);
   bool DiagnosePropertyAccessorMismatch(ObjCPropertyDecl *PD,
                                         ObjCMethodDecl *Getter,
@@ -3922,6 +4004,12 @@
                            bool SuppressQualifierCheck = false,
                            ActOnMemberAccessExtraArgs *ExtraArgs = nullptr);
 
+  ExprResult BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow,
+                                     SourceLocation OpLoc,
+                                     const CXXScopeSpec &SS, FieldDecl *Field,
+                                     DeclAccessPair FoundDecl,
+                                     const DeclarationNameInfo &MemberNameInfo);
+
   ExprResult PerformMemberExprBaseConversion(Expr *Base, bool IsArrow);
 
   bool CheckQualifiedMemberReference(Expr *BaseExpr, QualType BaseType,
@@ -4019,6 +4107,8 @@
   ExprResult CreateBuiltinBinOp(SourceLocation OpLoc, BinaryOperatorKind Opc,
                                 Expr *LHSExpr, Expr *RHSExpr);
 
+  void DiagnoseCommaOperator(const Expr *LHS, SourceLocation Loc);
+
   /// ActOnConditionalOp - Parse a ?: operation.  Note that 'LHS' may be null
   /// in the case of a the GNU conditional expr extension.
   ExprResult ActOnConditionalOp(SourceLocation QuestionLoc,
@@ -4219,6 +4309,13 @@
 
   bool CheckInheritingConstructorUsingDecl(UsingDecl *UD);
 
+  /// Given a derived-class using shadow declaration for a constructor and the
+  /// corresponding base class constructor, find or create the implicit
+  /// synthesized derived class constructor to use for this initialization.
+  CXXConstructorDecl *
+  findInheritingConstructor(SourceLocation Loc, CXXConstructorDecl *BaseCtor,
+                            ConstructorUsingShadowDecl *DerivedShadow);
+
   Decl *ActOnUsingDeclaration(Scope *CurScope,
                               AccessSpecifier AS,
                               bool HasUsingKeyword,
@@ -4243,16 +4340,29 @@
   /// \param ConstructKind - a CXXConstructExpr::ConstructionKind
   ExprResult
   BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
+                        NamedDecl *FoundDecl,
                         CXXConstructorDecl *Constructor, MultiExprArg Exprs,
                         bool HadMultipleCandidates, bool IsListInitialization,
                         bool IsStdInitListInitialization,
                         bool RequiresZeroInit, unsigned ConstructKind,
                         SourceRange ParenRange);
 
+  /// Build a CXXConstructExpr whose constructor has already been resolved if
+  /// it denotes an inherited constructor.
+  ExprResult
+  BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
+                        CXXConstructorDecl *Constructor, bool Elidable,
+                        MultiExprArg Exprs,
+                        bool HadMultipleCandidates, bool IsListInitialization,
+                        bool IsStdInitListInitialization,
+                        bool RequiresZeroInit, unsigned ConstructKind,
+                        SourceRange ParenRange);
+
   // FIXME: Can we remove this and have the above BuildCXXConstructExpr check if
   // the constructor can be elidable?
   ExprResult
   BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
+                        NamedDecl *FoundDecl,
                         CXXConstructorDecl *Constructor, bool Elidable,
                         MultiExprArg Exprs, bool HadMultipleCandidates,
                         bool IsListInitialization,
@@ -4372,7 +4482,8 @@
   /// \brief Determine what sort of exception specification an inheriting
   /// constructor of a class will have.
   ImplicitExceptionSpecification
-  ComputeInheritingCtorExceptionSpec(CXXConstructorDecl *CD);
+  ComputeInheritingCtorExceptionSpec(SourceLocation Loc,
+                                     CXXConstructorDecl *CD);
 
   /// \brief Evaluate the implicit exception specification for a defaulted
   /// special member function.
@@ -4402,9 +4513,12 @@
          ArrayRef<SourceRange> DynamicExceptionRanges,
          Expr *NoexceptExpr);
 
+  class InheritedConstructorInfo;
+
   /// \brief Determine if a special member function should have a deleted
   /// definition when it is defaulted.
   bool ShouldDeleteSpecialMember(CXXMethodDecl *MD, CXXSpecialMember CSM,
+                                 InheritedConstructorInfo *ICI = nullptr,
                                  bool Diagnose = false);
 
   /// \brief Declare the implicit default constructor for the given class.
@@ -4441,12 +4555,6 @@
   void AdjustDestructorExceptionSpec(CXXRecordDecl *ClassDecl,
                                      CXXDestructorDecl *Destructor);
 
-  /// \brief Declare all inheriting constructors for the given class.
-  ///
-  /// \param ClassDecl The class declaration into which the inheriting
-  /// constructors will be added.
-  void DeclareInheritingConstructors(CXXRecordDecl *ClassDecl);
-
   /// \brief Define the specified inheriting constructor.
   void DefineInheritingConstructor(SourceLocation UseLoc,
                                    CXXConstructorDecl *Constructor);
@@ -4507,6 +4615,9 @@
   /// class.
   void ForceDeclarationOfImplicitMembers(CXXRecordDecl *Class);
 
+  /// \brief Check a completed declaration of an implicit special member.
+  void CheckImplicitSpecialMemberDeclaration(Scope *S, FunctionDecl *FD);
+
   /// \brief Determine whether the given function is an implicitly-deleted
   /// special member function.
   bool isImplicitlyDeleted(FunctionDecl *FD);
@@ -4662,7 +4773,8 @@
   /// \return returns 'true' if failed, 'false' if success.
   bool CheckCXXThisCapture(SourceLocation Loc, bool Explicit = false, 
       bool BuildAndDiagnose = true,
-      const unsigned *const FunctionScopeIndexToStopAt = nullptr);
+      const unsigned *const FunctionScopeIndexToStopAt = nullptr,
+      bool ByCopy = false);
 
   /// \brief Determine whether the given type is the type of *this that is used
   /// outside of the body of a member function for a type that is currently
@@ -4676,6 +4788,10 @@
   /// ActOnObjCBoolLiteral - Parse {__objc_yes,__objc_no} literals.
   ExprResult ActOnObjCBoolLiteral(SourceLocation OpLoc, tok::TokenKind Kind);
 
+  ExprResult
+  ActOnObjCAvailabilityCheckExpr(llvm::ArrayRef<AvailabilitySpec> AvailSpecs,
+                                 SourceLocation AtLoc, SourceLocation RParen);
+
   /// ActOnCXXNullPtrLiteral - Parse 'nullptr'.
   ExprResult ActOnCXXNullPtrLiteral(SourceLocation Loc);
 
@@ -4733,8 +4849,7 @@
   void DeclareGlobalNewDelete();
   void DeclareGlobalAllocationFunction(DeclarationName Name, QualType Return,
                                        QualType Param1,
-                                       QualType Param2 = QualType(),
-                                       bool addRestrictAttr = false);
+                                       QualType Param2 = QualType());
 
   bool FindDeallocationFunction(SourceLocation StartLoc, CXXRecordDecl *RD,
                                 DeclarationName Name, FunctionDecl* &Operator,
@@ -4752,11 +4867,6 @@
                             bool WarnOnNonAbstractTypes,
                             SourceLocation DtorLoc);
 
-  DeclResult ActOnCXXConditionDeclaration(Scope *S, Declarator &D);
-  ExprResult CheckConditionVariable(VarDecl *ConditionVar,
-                                    SourceLocation StmtLoc,
-                                    bool ConvertToBoolean);
-
   ExprResult ActOnNoexceptExpr(SourceLocation KeyLoc, SourceLocation LParen,
                                Expr *Operand, SourceLocation RParen);
   ExprResult BuildCXXNoexceptExpr(SourceLocation KeyLoc, Expr *Operand,
@@ -4834,6 +4944,10 @@
   Stmt *MaybeCreateStmtWithCleanups(Stmt *SubStmt);
   ExprResult MaybeCreateExprWithCleanups(ExprResult SubExpr);
 
+  MaterializeTemporaryExpr *
+  CreateMaterializeTemporaryExpr(QualType T, Expr *Temporary,
+                                 bool BoundToLvalueReference);
+
   ExprResult ActOnFinishFullExpr(Expr *Expr) {
     return ActOnFinishFullExpr(Expr, Expr ? Expr->getExprLoc()
                                           : SourceLocation());
@@ -4880,16 +4994,41 @@
                                        bool *CanCorrect = nullptr);
   NamedDecl *FindFirstQualifierInScope(Scope *S, NestedNameSpecifier *NNS);
 
+  /// \brief Keeps information about an identifier in a nested-name-spec.
+  ///
+  struct NestedNameSpecInfo {
+    /// \brief The type of the object, if we're parsing nested-name-specifier in
+    /// a member access expression.
+    ParsedType ObjectType;
+
+    /// \brief The identifier preceding the '::'.
+    IdentifierInfo *Identifier;
+
+    /// \brief The location of the identifier.
+    SourceLocation IdentifierLoc;
+
+    /// \brief The location of the '::'.
+    SourceLocation CCLoc;
+
+    /// \brief Creates info object for the most typical case.
+    NestedNameSpecInfo(IdentifierInfo *II, SourceLocation IdLoc,
+             SourceLocation ColonColonLoc, ParsedType ObjectType = ParsedType())
+      : ObjectType(ObjectType), Identifier(II), IdentifierLoc(IdLoc),
+        CCLoc(ColonColonLoc) {
+    }
+
+    NestedNameSpecInfo(IdentifierInfo *II, SourceLocation IdLoc,
+                       SourceLocation ColonColonLoc, QualType ObjectType)
+      : ObjectType(ParsedType::make(ObjectType)), Identifier(II),
+        IdentifierLoc(IdLoc), CCLoc(ColonColonLoc) {
+    }
+  };
+
   bool isNonTypeNestedNameSpecifier(Scope *S, CXXScopeSpec &SS,
-                                    SourceLocation IdLoc,
-                                    IdentifierInfo &II,
-                                    ParsedType ObjectType);
+                                    NestedNameSpecInfo &IdInfo);
 
   bool BuildCXXNestedNameSpecifier(Scope *S,
-                                   IdentifierInfo &Identifier,
-                                   SourceLocation IdentifierLoc,
-                                   SourceLocation CCLoc,
-                                   QualType ObjectType,
+                                   NestedNameSpecInfo &IdInfo,
                                    bool EnteringContext,
                                    CXXScopeSpec &SS,
                                    NamedDecl *ScopeLookupResult,
@@ -4900,14 +5039,8 @@
   ///
   /// \param S The scope in which this nested-name-specifier occurs.
   ///
-  /// \param Identifier The identifier preceding the '::'.
-  ///
-  /// \param IdentifierLoc The location of the identifier.
-  ///
-  /// \param CCLoc The location of the '::'.
-  ///
-  /// \param ObjectType The type of the object, if we're parsing
-  /// nested-name-specifier in a member access expression.
+  /// \param IdInfo Parser information about an identifier in the
+  /// nested-name-spec.
   ///
   /// \param EnteringContext Whether we're entering the context nominated by
   /// this nested-name-specifier.
@@ -4926,10 +5059,7 @@
   ///
   /// \returns true if an error occurred, false otherwise.
   bool ActOnCXXNestedNameSpecifier(Scope *S,
-                                   IdentifierInfo &Identifier,
-                                   SourceLocation IdentifierLoc,
-                                   SourceLocation CCLoc,
-                                   ParsedType ObjectType,
+                                   NestedNameSpecInfo &IdInfo,
                                    bool EnteringContext,
                                    CXXScopeSpec &SS,
                                    bool ErrorRecoveryLookup = false,
@@ -4942,10 +5072,7 @@
                                            SourceLocation ColonColonLoc);
 
   bool IsInvalidUnlessNestedName(Scope *S, CXXScopeSpec &SS,
-                                 IdentifierInfo &Identifier,
-                                 SourceLocation IdentifierLoc,
-                                 SourceLocation ColonLoc,
-                                 ParsedType ObjectType,
+                                 NestedNameSpecInfo &IdInfo,
                                  bool EnteringContext);
 
   /// \brief The parser has parsed a nested-name-specifier
@@ -5045,7 +5172,8 @@
                                        SourceRange IntroducerRange,
                                        TypeSourceInfo *MethodType,
                                        SourceLocation EndLoc,
-                                       ArrayRef<ParmVarDecl *> Params);
+                                       ArrayRef<ParmVarDecl *> Params, 
+                                       bool IsConstexprSpecified);
 
   /// \brief Endow the lambda scope info with the relevant properties.
   void buildLambdaScope(sema::LambdaScopeInfo *LSI, 
@@ -5342,11 +5470,18 @@
                             ArrayRef<CXXCtorInitializer*> MemInits,
                             bool AnyErrors);
 
+  /// \brief Check class-level dllimport/dllexport attribute. The caller must
+  /// ensure that referenceDLLExportedClassMethods is called at some later
+  /// point, when all outer classes of Class are complete.
   void checkClassLevelDLLAttribute(CXXRecordDecl *Class);
+
+  void referenceDLLExportedClassMethods();
+
   void propagateDLLAttrToBaseClassTemplate(
       CXXRecordDecl *Class, Attr *ClassAttr,
       ClassTemplateSpecializationDecl *BaseTemplateSpec,
       SourceLocation BaseLoc);
+
   void CheckCompletedCXXClass(CXXRecordDecl *Record);
   void ActOnFinishCXXMemberSpecification(Scope* S, SourceLocation RLoc,
                                          Decl *TagDecl,
@@ -5502,13 +5637,13 @@
                                      bool Diagnose = true);
   AccessResult CheckConstructorAccess(SourceLocation Loc,
                                       CXXConstructorDecl *D,
+                                      DeclAccessPair FoundDecl,
                                       const InitializedEntity &Entity,
-                                      AccessSpecifier Access,
                                       bool IsCopyBindingRefToTemp = false);
   AccessResult CheckConstructorAccess(SourceLocation Loc,
                                       CXXConstructorDecl *D,
+                                      DeclAccessPair FoundDecl,
                                       const InitializedEntity &Entity,
-                                      AccessSpecifier Access,
                                       const PartialDiagnostic &PDiag);
   AccessResult CheckDestructorAccess(SourceLocation Loc,
                                      CXXDestructorDecl *Dtor,
@@ -5641,7 +5776,8 @@
                              SourceLocation TemplateLoc,
                              SourceLocation LAngleLoc,
                              ArrayRef<Decl *> Params,
-                             SourceLocation RAngleLoc);
+                             SourceLocation RAngleLoc,
+                             Expr *RequiresClause);
 
   /// \brief The context in which we are checking a template parameter list.
   enum TemplateParamListContext {
@@ -5676,6 +5812,10 @@
                             TemplateParameterList **OuterTemplateParamLists,
                                 SkipBodyInfo *SkipBody = nullptr);
 
+  TemplateArgumentLoc getTrivialTemplateArgumentLoc(const TemplateArgument &Arg,
+                                                    QualType NTTPType,
+                                                    SourceLocation Loc);
+
   void translateTemplateArguments(const ASTTemplateArgsPtr &In,
                                   TemplateArgumentListInfo &Out);
 
@@ -6593,6 +6733,10 @@
     /// \brief The number of template arguments in TemplateArgs.
     unsigned NumTemplateArgs;
 
+    ArrayRef<TemplateArgument> template_arguments() const {
+      return {TemplateArgs, NumTemplateArgs};
+    }
+
     /// \brief The template deduction info object associated with the
     /// substitution or checking of explicit or deduced template arguments.
     sema::TemplateDeductionInfo *DeductionInfo;
@@ -6667,6 +6811,10 @@
   /// template defined within it.
   llvm::DenseSet<Module*> &getLookupModules();
 
+  /// \brief Map from the most recent declaration of a namespace to the most
+  /// recent visible declaration of that namespace.
+  llvm::DenseMap<NamedDecl*, NamedDecl*> VisibleNamespaceCache;
+
   /// \brief Whether we are in a SFINAE context that is not associated with
   /// template instantiation.
   ///
@@ -7072,8 +7220,7 @@
                                 int indexAdjustment,
                                 Optional<unsigned> NumExpansions,
                                 bool ExpectParameterPack);
-  bool SubstParmTypes(SourceLocation Loc,
-                      ParmVarDecl **Params, unsigned NumParams,
+  bool SubstParmTypes(SourceLocation Loc, ArrayRef<ParmVarDecl *> Params,
                       const FunctionProtoType::ExtParameterInfo *ExtParamInfos,
                       const MultiLevelTemplateArgumentList &TemplateArgs,
                       SmallVectorImpl<QualType> &ParamTypes,
@@ -7183,7 +7330,8 @@
   void InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
                                      FunctionDecl *Function,
                                      bool Recursive = false,
-                                     bool DefinitionRequired = false);
+                                     bool DefinitionRequired = false,
+                                     bool AtEndOfTU = false);
   VarTemplateSpecializationDecl *BuildVarTemplateInstantiation(
       VarTemplateDecl *VarTemplate, VarDecl *FromVar,
       const TemplateArgumentList &TemplateArgList,
@@ -7207,7 +7355,8 @@
       const MultiLevelTemplateArgumentList &TemplateArgs);
   void InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
                                      VarDecl *Var, bool Recursive = false,
-                                     bool DefinitionRequired = false);
+                                     bool DefinitionRequired = false,
+                                     bool AtEndOfTU = false);
   void InstantiateStaticDataMemberDefinition(
                                      SourceLocation PointOfInstantiation,
                                      VarDecl *Var,
@@ -7651,41 +7800,17 @@
   void ActOnPragmaOptionsAlign(PragmaOptionsAlignKind Kind,
                                SourceLocation PragmaLoc);
 
-  enum PragmaPackKind {
-    PPK_Default, // #pragma pack([n])
-    PPK_Show,    // #pragma pack(show), only supported by MSVC.
-    PPK_Push,    // #pragma pack(push, [identifier], [n])
-    PPK_Pop      // #pragma pack(pop, [identifier], [n])
-  };
-
-  enum PragmaMSStructKind {
-    PMSST_OFF,  // #pragms ms_struct off
-    PMSST_ON    // #pragms ms_struct on
-  };
-
-  enum PragmaMSCommentKind {
-    PCK_Unknown,
-    PCK_Linker,   // #pragma comment(linker, ...)
-    PCK_Lib,      // #pragma comment(lib, ...)
-    PCK_Compiler, // #pragma comment(compiler, ...)
-    PCK_ExeStr,   // #pragma comment(exestr, ...)
-    PCK_User      // #pragma comment(user, ...)
-  };
-
   /// ActOnPragmaPack - Called on well formed \#pragma pack(...).
-  void ActOnPragmaPack(PragmaPackKind Kind,
-                       IdentifierInfo *Name,
-                       Expr *Alignment,
-                       SourceLocation PragmaLoc,
-                       SourceLocation LParenLoc,
-                       SourceLocation RParenLoc);
+  void ActOnPragmaPack(SourceLocation PragmaLoc, PragmaMsStackAction Action,
+                       StringRef SlotLabel, Expr *Alignment);
 
   /// ActOnPragmaMSStruct - Called on well formed \#pragma ms_struct [on|off].
   void ActOnPragmaMSStruct(PragmaMSStructKind Kind);
 
   /// ActOnPragmaMSComment - Called on well formed
   /// \#pragma comment(kind, "arg").
-  void ActOnPragmaMSComment(PragmaMSCommentKind Kind, StringRef Arg);
+  void ActOnPragmaMSComment(SourceLocation CommentLoc, PragmaMSCommentKind Kind,
+                            StringRef Arg);
 
   /// ActOnPragmaMSPointersToMembers - called on well formed \#pragma
   /// pointers_to_members(representation method[, general purpose
@@ -7695,7 +7820,8 @@
       SourceLocation PragmaLoc);
 
   /// \brief Called on well formed \#pragma vtordisp().
-  void ActOnPragmaMSVtorDisp(PragmaVtorDispKind Kind, SourceLocation PragmaLoc,
+  void ActOnPragmaMSVtorDisp(PragmaMsStackAction Action,
+                             SourceLocation PragmaLoc,
                              MSVtorDispAttr::Mode Value);
 
   enum PragmaSectionKind {
@@ -7731,7 +7857,8 @@
   void ActOnPragmaDump(Scope *S, SourceLocation Loc, IdentifierInfo *II);
 
   /// ActOnPragmaDetectMismatch - Call on well-formed \#pragma detect_mismatch
-  void ActOnPragmaDetectMismatch(StringRef Name, StringRef Value);
+  void ActOnPragmaDetectMismatch(SourceLocation Loc, StringRef Name,
+                                 StringRef Value);
 
   /// ActOnPragmaUnused - Called on well-formed '\#pragma unused'.
   void ActOnPragmaUnused(const Token &Identifier,
@@ -7841,6 +7968,10 @@
   void AddLaunchBoundsAttr(SourceRange AttrRange, Decl *D, Expr *MaxThreads,
                            Expr *MinBlocks, unsigned SpellingListIndex);
 
+  /// AddModeAttr - Adds a mode attribute to a particular declaration.
+  void AddModeAttr(SourceRange AttrRange, Decl *D, IdentifierInfo *Name,
+                   unsigned SpellingListIndex, bool InInstantiation = false);
+
   void AddParameterABIAttr(SourceRange AttrRange, Decl *D,
                            ParameterABI ABI, unsigned SpellingListIndex);
 
@@ -7866,34 +7997,40 @@
   //
 private:
   void *VarDataSharingAttributesStack;
+  /// Set to true inside '#pragma omp declare target' region.
+  bool IsInOpenMPDeclareTargetContext = false;
   /// \brief Initialization of data-sharing attributes stack.
   void InitDataSharingAttributesStack();
   void DestroyDataSharingAttributesStack();
   ExprResult
   VerifyPositiveIntegerConstantInClause(Expr *Op, OpenMPClauseKind CKind,
                                         bool StrictlyPositive = true);
+  /// Returns OpenMP nesting level for current directive.
+  unsigned getOpenMPNestingLevel() const;
 
 public:
   /// \brief Return true if the provided declaration \a VD should be captured by
-  /// reference in the provided scope \a RSI. This will take into account the
-  /// semantics of the directive and associated clauses.
-  bool IsOpenMPCapturedByRef(VarDecl *VD,
-                             const sema::CapturedRegionScopeInfo *RSI);
+  /// reference.
+  /// \param Level Relative level of nested OpenMP construct for which the check
+  /// is performed.
+  bool IsOpenMPCapturedByRef(ValueDecl *D, unsigned Level);
 
   /// \brief Check if the specified variable is used in one of the private
   /// clauses (private, firstprivate, lastprivate, reduction etc.) in OpenMP
   /// constructs.
-  bool IsOpenMPCapturedVar(VarDecl *VD);
+  VarDecl *IsOpenMPCapturedDecl(ValueDecl *D);
+  ExprResult getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
+                                   ExprObjectKind OK, SourceLocation Loc);
 
   /// \brief Check if the specified variable is used in 'private' clause.
   /// \param Level Relative level of nested OpenMP construct for that the check
   /// is performed.
-  bool isOpenMPPrivateVar(VarDecl *VD, unsigned Level);
+  bool isOpenMPPrivateDecl(ValueDecl *D, unsigned Level);
 
   /// \brief Check if the specified variable is captured  by 'target' directive.
   /// \param Level Relative level of nested OpenMP construct for that the check
   /// is performed.
-  bool isOpenMPTargetCapturedVar(VarDecl *VD, unsigned Level);
+  bool isOpenMPTargetCapturedDecl(ValueDecl *D, unsigned Level);
 
   ExprResult PerformOpenMPImplicitIntegerConversion(SourceLocation OpLoc,
                                                     Expr *Op);
@@ -7928,6 +8065,42 @@
   OMPThreadPrivateDecl *CheckOMPThreadPrivateDecl(
                                      SourceLocation Loc,
                                      ArrayRef<Expr *> VarList);
+  /// \brief Check if the specified type is allowed to be used in 'omp declare
+  /// reduction' construct.
+  QualType ActOnOpenMPDeclareReductionType(SourceLocation TyLoc,
+                                           TypeResult ParsedType);
+  /// \brief Called on start of '#pragma omp declare reduction'.
+  DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveStart(
+      Scope *S, DeclContext *DC, DeclarationName Name,
+      ArrayRef<std::pair<QualType, SourceLocation>> ReductionTypes,
+      AccessSpecifier AS, Decl *PrevDeclInScope = nullptr);
+  /// \brief Initialize declare reduction construct combiner.
+  void ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D);
+  /// \brief Finish current declare reduction construct combiner.
+  void ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner);
+  /// \brief Initialize declare reduction construct initializer.
+  void ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D);
+  /// \brief Finish current declare reduction construct initializer.
+  void ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer);
+  /// \brief Called at the end of '#pragma omp declare reduction'.
+  DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveEnd(
+      Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid);
+
+  /// Called on the start of target region i.e. '#pragma omp declare target'.
+  bool ActOnStartOpenMPDeclareTargetDirective(SourceLocation Loc);
+  /// Called at the end of target region i.e. '#pragma omp end declare target'.
+  void ActOnFinishOpenMPDeclareTargetDirective();
+  /// Called on correct id-expression from the '#pragma omp declare target'.
+  void ActOnOpenMPDeclareTargetName(Scope *CurScope, CXXScopeSpec &ScopeSpec,
+                                    const DeclarationNameInfo &Id,
+                                    OMPDeclareTargetDeclAttr::MapTypeTy MT,
+                                    NamedDeclSetType &SameDirectiveDecls);
+  /// Check declaration inside target region.
+  void checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D);
+  /// Return true inside OpenMP target region.
+  bool isInOpenMPDeclareTargetContext() const {
+    return IsInOpenMPDeclareTargetContext;
+  }
 
   /// \brief Initialization of captured region for OpenMP region.
   void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope);
@@ -7953,19 +8126,19 @@
   StmtResult ActOnOpenMPSimdDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp for' after parsing
   /// of the associated statement.
   StmtResult ActOnOpenMPForDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp for simd' after parsing
   /// of the associated statement.
   StmtResult ActOnOpenMPForSimdDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp sections' after parsing
   /// of the associated statement.
   StmtResult ActOnOpenMPSectionsDirective(ArrayRef<OMPClause *> Clauses,
@@ -7995,13 +8168,13 @@
   StmtResult ActOnOpenMPParallelForDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp parallel for simd' after
   /// parsing of the  associated statement.
   StmtResult ActOnOpenMPParallelForSimdDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp parallel sections' after
   /// parsing of the  associated statement.
   StmtResult ActOnOpenMPParallelSectionsDirective(ArrayRef<OMPClause *> Clauses,
@@ -8049,6 +8222,28 @@
   StmtResult ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
                                             Stmt *AStmt, SourceLocation StartLoc,
                                             SourceLocation EndLoc);
+  /// \brief Called on well-formed '\#pragma omp target enter data' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation EndLoc);
+  /// \brief Called on well-formed '\#pragma omp target exit data' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// \brief Called on well-formed '\#pragma omp target parallel' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// \brief Called on well-formed '\#pragma omp target parallel for' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp teams' after parsing of the
   /// associated statement.
   StmtResult ActOnOpenMPTeamsDirective(ArrayRef<OMPClause *> Clauses,
@@ -8069,19 +8264,75 @@
   StmtResult ActOnOpenMPTaskLoopDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp taskloop simd' after parsing of
   /// the associated statement.
   StmtResult ActOnOpenMPTaskLoopSimdDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
   /// \brief Called on well-formed '\#pragma omp distribute' after parsing
   /// of the associated statement.
   StmtResult ActOnOpenMPDistributeDirective(
       ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
       SourceLocation EndLoc,
-      llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA);
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+  /// \brief Called on well-formed '\#pragma omp target update'.
+  StmtResult ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
+                                              SourceLocation StartLoc,
+                                              SourceLocation EndLoc);
+  /// \brief Called on well-formed '\#pragma omp distribute parallel for' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPDistributeParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+  /// \brief Called on well-formed '\#pragma omp distribute parallel for simd'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPDistributeParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+  /// \brief Called on well-formed '\#pragma omp distribute simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPDistributeSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+  /// \brief Called on well-formed '\#pragma omp target parallel for simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+  /// \brief Called on well-formed '\#pragma omp target simd' after parsing of
+  /// the associated statement.
+  StmtResult ActOnOpenMPTargetSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute' after parsing of
+  /// the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc,
+      llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA);
+
+  /// Checks correctness of linear modifiers.
+  bool CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
+                                 SourceLocation LinLoc);
+  /// Checks that the specified declaration matches requirements for the linear
+  /// decls.
+  bool CheckOpenMPLinearDecl(ValueDecl *D, SourceLocation ELoc,
+                             OpenMPLinearClauseKind LinKind, QualType Type);
+
+  /// \brief Called on well-formed '\#pragma omp declare simd' after parsing of
+  /// the associated method/function.
+  DeclGroupPtrTy ActOnOpenMPDeclareSimdDirective(
+      DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS,
+      Expr *Simdlen, ArrayRef<Expr *> Uniforms, ArrayRef<Expr *> Aligneds,
+      ArrayRef<Expr *> Alignments, ArrayRef<Expr *> Linears,
+      ArrayRef<unsigned> LinModifiers, ArrayRef<Expr *> Steps, SourceRange SR);
 
   OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
                                          Expr *Expr,
@@ -8210,7 +8461,8 @@
       CXXScopeSpec &ReductionIdScopeSpec,
       const DeclarationNameInfo &ReductionId, OpenMPDependClauseKind DepKind,
       OpenMPLinearClauseKind LinKind, OpenMPMapClauseKind MapTypeModifier,
-      OpenMPMapClauseKind MapType, SourceLocation DepLinMapLoc);
+      OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
+      SourceLocation DepLinMapLoc);
   /// \brief Called on well-formed 'private' clause.
   OMPClause *ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
                                       SourceLocation StartLoc,
@@ -8232,12 +8484,12 @@
                                      SourceLocation LParenLoc,
                                      SourceLocation EndLoc);
   /// \brief Called on well-formed 'reduction' clause.
-  OMPClause *
-  ActOnOpenMPReductionClause(ArrayRef<Expr *> VarList, SourceLocation StartLoc,
-                             SourceLocation LParenLoc, SourceLocation ColonLoc,
-                             SourceLocation EndLoc,
-                             CXXScopeSpec &ReductionIdScopeSpec,
-                             const DeclarationNameInfo &ReductionId);
+  OMPClause *ActOnOpenMPReductionClause(
+      ArrayRef<Expr *> VarList, SourceLocation StartLoc,
+      SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc,
+      CXXScopeSpec &ReductionIdScopeSpec,
+      const DeclarationNameInfo &ReductionId,
+      ArrayRef<Expr *> UnresolvedReductions = llvm::None);
   /// \brief Called on well-formed 'linear' clause.
   OMPClause *
   ActOnOpenMPLinearClause(ArrayRef<Expr *> VarList, Expr *Step,
@@ -8277,10 +8529,12 @@
                                      SourceLocation LParenLoc,
                                      SourceLocation EndLoc);
   /// \brief Called on well-formed 'map' clause.
-  OMPClause *ActOnOpenMPMapClause(
-      OpenMPMapClauseKind MapTypeModifier, OpenMPMapClauseKind MapType,
-      SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VarList,
-      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc);
+  OMPClause *
+  ActOnOpenMPMapClause(OpenMPMapClauseKind MapTypeModifier,
+                       OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
+                       SourceLocation MapLoc, SourceLocation ColonLoc,
+                       ArrayRef<Expr *> VarList, SourceLocation StartLoc,
+                       SourceLocation LParenLoc, SourceLocation EndLoc);
   /// \brief Called on well-formed 'num_teams' clause.
   OMPClause *ActOnOpenMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc,
                                        SourceLocation LParenLoc,
@@ -8299,6 +8553,31 @@
       OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize,
       SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc,
       SourceLocation CommaLoc, SourceLocation EndLoc);
+  /// \brief Called on well-formed 'defaultmap' clause.
+  OMPClause *ActOnOpenMPDefaultmapClause(
+      OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind,
+      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc,
+      SourceLocation KindLoc, SourceLocation EndLoc);
+  /// \brief Called on well-formed 'to' clause.
+  OMPClause *ActOnOpenMPToClause(ArrayRef<Expr *> VarList,
+                                 SourceLocation StartLoc,
+                                 SourceLocation LParenLoc,
+                                 SourceLocation EndLoc);
+  /// \brief Called on well-formed 'from' clause.
+  OMPClause *ActOnOpenMPFromClause(ArrayRef<Expr *> VarList,
+                                   SourceLocation StartLoc,
+                                   SourceLocation LParenLoc,
+                                   SourceLocation EndLoc);
+  /// Called on well-formed 'use_device_ptr' clause.
+  OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
+                                           SourceLocation StartLoc,
+                                           SourceLocation LParenLoc,
+                                           SourceLocation EndLoc);
+  /// Called on well-formed 'is_device_ptr' clause.
+  OMPClause *ActOnOpenMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
+                                          SourceLocation StartLoc,
+                                          SourceLocation LParenLoc,
+                                          SourceLocation EndLoc);
 
   /// \brief The kind of conversion being performed.
   enum CheckedConversionKind {
@@ -8777,6 +9056,60 @@
   /// type, and if so, emit a note describing what happened.
   void EmitRelatedResultTypeNoteForReturn(QualType destType);
 
+  class ConditionResult {
+    Decl *ConditionVar;
+    FullExprArg Condition;
+    bool Invalid;
+    bool HasKnownValue;
+    bool KnownValue;
+
+    friend class Sema;
+    ConditionResult(Sema &S, Decl *ConditionVar, FullExprArg Condition,
+                    bool IsConstexpr)
+        : ConditionVar(ConditionVar), Condition(Condition), Invalid(false),
+          HasKnownValue(IsConstexpr && Condition.get() &&
+                        !Condition.get()->isValueDependent()),
+          KnownValue(HasKnownValue &&
+                     !!Condition.get()->EvaluateKnownConstInt(S.Context)) {}
+    explicit ConditionResult(bool Invalid)
+        : ConditionVar(nullptr), Condition(nullptr), Invalid(Invalid),
+          HasKnownValue(false), KnownValue(false) {}
+
+  public:
+    ConditionResult() : ConditionResult(false) {}
+    bool isInvalid() const { return Invalid; }
+    std::pair<VarDecl *, Expr *> get() const {
+      return std::make_pair(cast_or_null<VarDecl>(ConditionVar),
+                            Condition.get());
+    }
+    llvm::Optional<bool> getKnownValue() const {
+      if (!HasKnownValue)
+        return None;
+      return KnownValue;
+    }
+  };
+  static ConditionResult ConditionError() { return ConditionResult(true); }
+
+  enum class ConditionKind {
+    Boolean,     ///< A boolean condition, from 'if', 'while', 'for', or 'do'.
+    ConstexprIf, ///< A constant boolean condition from 'if constexpr'.
+    Switch       ///< An integral condition for a 'switch' statement.
+  };
+
+  ConditionResult ActOnCondition(Scope *S, SourceLocation Loc,
+                                 Expr *SubExpr, ConditionKind CK);
+
+  ConditionResult ActOnConditionVariable(Decl *ConditionVar,
+                                         SourceLocation StmtLoc,
+                                         ConditionKind CK);
+
+  DeclResult ActOnCXXConditionDeclaration(Scope *S, Declarator &D);
+
+  ExprResult CheckConditionVariable(VarDecl *ConditionVar,
+                                    SourceLocation StmtLoc,
+                                    ConditionKind CK);
+  ExprResult CheckSwitchCondition(SourceLocation SwitchLoc, Expr *Cond);
+
   /// CheckBooleanCondition - Diagnose problems involving the use of
   /// the given expression as a boolean condition (e.g. in an if
   /// statement).  Also performs the standard function and array
@@ -8785,10 +9118,8 @@
   /// \param Loc - A location associated with the condition, e.g. the
   /// 'if' keyword.
   /// \return true iff there were any errors
-  ExprResult CheckBooleanCondition(Expr *E, SourceLocation Loc);
-
-  ExprResult ActOnBooleanCondition(Scope *S, SourceLocation Loc,
-                                   Expr *SubExpr);
+  ExprResult CheckBooleanCondition(SourceLocation Loc, Expr *E,
+                                   bool IsConstexpr = false);
 
   /// DiagnoseAssignmentAsCondition - Given that an expression is
   /// being used as a boolean condition, warn if it's an assignment.
@@ -8799,7 +9130,7 @@
   void DiagnoseEqualityWithExtraParens(ParenExpr *ParenE);
 
   /// CheckCXXBooleanCondition - Returns true if conversion to bool is invalid.
-  ExprResult CheckCXXBooleanCondition(Expr *CondExpr);
+  ExprResult CheckCXXBooleanCondition(Expr *CondExpr, bool IsConstexpr = false);
 
   /// ConvertIntegerToTypeWarnOnOverflow - Convert the specified APInt to have
   /// the specified width and sign.  If an overflow occurs, detect it and emit
@@ -8856,12 +9187,17 @@
 
   CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D);
 
+  // CUDA function call preference. Must be ordered numerically from
+  // worst to best.
   enum CUDAFunctionPreference {
     CFP_Never,      // Invalid caller/callee combination.
-    CFP_LastResort, // Lowest priority. Only in effect if
-                    // LangOpts.CUDADisableTargetCallChecks is true.
-    CFP_Fallback,   // Low priority caller/callee combination
-    CFP_Best,       // Preferred caller/callee combination
+    CFP_WrongSide,  // Calls from host-device to host or device
+                    // function that do not match current compilation
+                    // mode.
+    CFP_HostDevice, // Any calls to host/device functions.
+    CFP_SameSide,   // Calls from host-device to host or device
+                    // function matching current compilation mode.
+    CFP_Native,     // host-to-host or device-to-device calls.
   };
 
   /// Identifies relative preference of a given Caller/Callee
@@ -8874,7 +9210,32 @@
   CUDAFunctionPreference IdentifyCUDAPreference(const FunctionDecl *Caller,
                                                 const FunctionDecl *Callee);
 
-  bool CheckCUDATarget(const FunctionDecl *Caller, const FunctionDecl *Callee);
+  /// Determines whether Caller may invoke Callee, based on their CUDA
+  /// host/device attributes.  Returns false if the call is not allowed.
+  ///
+  /// Note: Will return true for CFP_WrongSide calls.  These may appear in
+  /// semantically correct CUDA programs, but only if they're never codegen'ed.
+  bool IsAllowedCUDACall(const FunctionDecl *Caller,
+                         const FunctionDecl *Callee) {
+    return IdentifyCUDAPreference(Caller, Callee) != CFP_Never;
+  }
+
+  /// May add implicit CUDAHostAttr and CUDADeviceAttr attributes to FD,
+  /// depending on FD and the current compilation settings.
+  void maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *FD,
+                                   const LookupResult &Previous);
+
+  /// Check whether we're allowed to call Callee from the current context.
+  ///
+  /// If the call is never allowed in a semantically-correct program
+  /// (CFP_Never), emits an error and returns false.
+  ///
+  /// If the call is allowed in semantically-correct programs, but only if it's
+  /// never codegen'ed (CFP_WrongSide), creates a deferred diagnostic to be
+  /// emitted if and when the caller is codegen'ed, and returns true.
+  ///
+  /// Otherwise, returns true without emitting any diagnostics.
+  bool CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee);
 
   /// Finds a function in \p Matches with highest calling priority
   /// from \p Caller context and erases all functions with lower
@@ -8904,6 +9265,11 @@
                                                bool ConstRHS,
                                                bool Diagnose);
 
+  /// \return true if \p CD can be considered empty according to CUDA
+  /// (E.2.3.1 in CUDA 7.5 Programming guide).
+  bool isEmptyCudaConstructor(SourceLocation Loc, CXXConstructorDecl *CD);
+  bool isEmptyCudaDestructor(SourceLocation Loc, CXXDestructorDecl *CD);
+
   /// \name Code completion
   //@{
   /// \brief Describes the context in which code completion occurs.
@@ -8968,6 +9334,7 @@
   void CodeCompletePostfixExpression(Scope *S, ExprResult LHS);
   void CodeCompleteTag(Scope *S, unsigned TagSpec);
   void CodeCompleteTypeQualifiers(DeclSpec &DS);
+  void CodeCompleteBracketDeclarator(Scope *S);
   void CodeCompleteCase(Scope *S);
   void CodeCompleteCall(Scope *S, Expr *Fn, ArrayRef<Expr *> Args);
   void CodeCompleteConstructor(Scope *S, QualType Type, SourceLocation Loc,
@@ -9158,13 +9525,6 @@
   };
   static FormatStringType GetFormatStringType(const FormatAttr *Format);
 
-  void CheckFormatString(const StringLiteral *FExpr, const Expr *OrigFormatExpr,
-                         ArrayRef<const Expr *> Args, bool HasVAListArg,
-                         unsigned format_idx, unsigned firstDataArg,
-                         FormatStringType Type, bool inFunctionCall,
-                         VariadicCallType CallType,
-                         llvm::SmallBitVector &CheckedVarArgs);
-  
   bool FormatStringHasSArg(const StringLiteral *FExpr);
   
   static bool GetFormatNSStringIdx(const FormatAttr *Format, unsigned &Idx);
@@ -9264,6 +9624,10 @@
   void CheckArgumentWithTypeTag(const ArgumentWithTypeTagAttr *Attr,
                                 const Expr * const *ExprArgs);
 
+  /// \brief Check if we are taking the address of a packed field
+  /// as this may be a problem if the pointer value is dereferenced.
+  void CheckAddressOfPackedMember(Expr *rhs);
+
   /// \brief The parser's current scope.
   ///
   /// The parser maintains this state here.
@@ -9321,7 +9685,23 @@
   }
 
   AvailabilityResult getCurContextAvailability() const;
-  
+
+  /// \brief Get the version that this context implies.
+  /// For instance, a method in an interface that is annotated with an
+  /// availability attribute effectively has the availability of the interface.
+  VersionTuple getVersionForDecl(const Decl *Ctx) const;
+
+  /// \brief The diagnostic we should emit for \c D, or \c AR_Available.
+  ///
+  /// \param D The declaration to check. Note that this may be altered to point
+  /// to another declaration that \c D gets its availability from. i.e., we
+  /// walk the list of typedefs to find an availability attribute.
+  ///
+  /// \param ContextVersion The version to compare availability against.
+  AvailabilityResult
+  ShouldDiagnoseAvailabilityOfDecl(NamedDecl *&D, VersionTuple ContextVersion,
+                                   std::string *Message);
+
   const DeclContext *getCurObjCLexicalContext() const {
     const DeclContext *DC = getCurLexicalContext();
     // A category implicitly has the attribute of the interface.
@@ -9343,20 +9723,68 @@
   // Emitting members of dllexported classes is delayed until the class
   // (including field initializers) is fully parsed.
   SmallVector<CXXRecordDecl*, 4> DelayedDllExportClasses;
+
+private:
+  /// \brief Helper class that collects misaligned member designations and
+  /// their location info for delayed diagnostics.
+  struct MisalignedMember {
+    Expr *E; // Identity key: operator== compares only this field.
+    RecordDecl *RD;
+    ValueDecl *MD;
+    CharUnits Alignment;
+
+    MisalignedMember() : E(), RD(), MD(), Alignment() {}
+    MisalignedMember(Expr *E, RecordDecl *RD, ValueDecl *MD,
+                     CharUnits Alignment)
+        : E(E), RD(RD), MD(MD), Alignment(Alignment) {}
+    explicit MisalignedMember(Expr *E)
+        : MisalignedMember(E, nullptr, nullptr, CharUnits()) {}
+
+    bool operator==(const MisalignedMember &m) const { return this->E == m.E; }
+  };
+  /// \brief Small set of gathered accesses to potentially misaligned members
+  /// due to the packed attribute.
+  SmallVector<MisalignedMember, 4> MisalignedMembers;
+
+  /// \brief Adds an expression to the set of gathered misaligned members.
+  void AddPotentialMisalignedMembers(Expr *E, RecordDecl *RD, ValueDecl *MD,
+                                     CharUnits Alignment);
+
+public:
+  /// \brief Diagnoses the current set of gathered accesses. This typically
+  /// happens at full expression level. The set is cleared after emitting the
+  /// diagnostics.
+  void DiagnoseMisalignedMembers();
+
+  /// \brief This function checks if the expression is in the set of potentially
+  /// misaligned members and it is converted to some pointer type T with lower
+  /// or equal alignment requirements.  If so it removes it. This is used when
+  /// we do not want to diagnose such misaligned access (e.g. in conversions to void*).
+  void DiscardMisalignedMemberAddress(const Type *T, Expr *E);
+
+  /// \brief This function calls Action when it determines that E designates a
+  /// misaligned member due to the packed attribute. This is used to emit
+  /// local diagnostics like in reference binding.
+  void RefersToMemberWithReducedAlignment(
+      Expr *E,
+      std::function<void(Expr *, RecordDecl *, ValueDecl *, CharUnits)> Action);
 };
 
 /// \brief RAII object that enters a new expression evaluation context.
 class EnterExpressionEvaluationContext {
   Sema &Actions;
+  bool Entered = true;
 
 public:
   EnterExpressionEvaluationContext(Sema &Actions,
                                    Sema::ExpressionEvaluationContext NewContext,
                                    Decl *LambdaContextDecl = nullptr,
-                                   bool IsDecltype = false)
-    : Actions(Actions) {
-    Actions.PushExpressionEvaluationContext(NewContext, LambdaContextDecl,
-                                            IsDecltype);
+                                   bool IsDecltype = false,
+                                   bool ShouldEnter = true)
+      : Actions(Actions), Entered(ShouldEnter) {
+    if (Entered)
+      Actions.PushExpressionEvaluationContext(NewContext, LambdaContextDecl,
+                                              IsDecltype);
   }
   EnterExpressionEvaluationContext(Sema &Actions,
                                    Sema::ExpressionEvaluationContext NewContext,
@@ -9369,7 +9797,8 @@
   }
 
   ~EnterExpressionEvaluationContext() {
-    Actions.PopExpressionEvaluationContext();
+    if (Entered)
+      Actions.PopExpressionEvaluationContext();
   }
 };
 
diff --git a/include/clang/Sema/SemaInternal.h b/include/clang/Sema/SemaInternal.h
index 60c6598..76567f3 100644
--- a/include/clang/Sema/SemaInternal.h
+++ b/include/clang/Sema/SemaInternal.h
@@ -73,10 +73,11 @@
   // Keep track of used but undefined variables.
   // FIXME: We shouldn't suppress this warning for static data members.
   if (Var->hasDefinition(SemaRef.Context) == VarDecl::DeclarationOnly &&
-    !Var->isExternallyVisible() &&
-    !(Var->isStaticDataMember() && Var->hasInit())) {
-      SourceLocation &old = SemaRef.UndefinedButUsed[Var->getCanonicalDecl()];
-      if (old.isInvalid()) old = Loc;
+      (!Var->isExternallyVisible() || Var->isInline()) &&
+      !(Var->isStaticDataMember() && Var->hasInit())) {
+    SourceLocation &old = SemaRef.UndefinedButUsed[Var->getCanonicalDecl()];
+    if (old.isInvalid())
+      old = Loc;
   }
   QualType CaptureType, DeclRefType;
   SemaRef.tryCaptureVariable(Var, Loc, Sema::TryCapture_Implicit, 
@@ -216,6 +217,9 @@
   bool isAddressOfOperand() const { return CorrectionValidator->IsAddressOfOperand; }
   const CXXScopeSpec *getSS() const { return SS.get(); }
   Scope *getScope() const { return S; }
+  CorrectionCandidateCallback *getCorrectionValidator() const {
+    return CorrectionValidator.get();
+  }
 
 private:
   class NamespaceSpecifierSet {
diff --git a/include/clang/Sema/SemaLambda.h b/include/clang/Sema/SemaLambda.h
index d043e2c..df40b13 100644
--- a/include/clang/Sema/SemaLambda.h
+++ b/include/clang/Sema/SemaLambda.h
@@ -18,7 +18,7 @@
 #include "clang/AST/ASTLambda.h"
 #include "clang/Sema/ScopeInfo.h"
 namespace clang {
- 
+class Sema;
 
 /// \brief Examines the FunctionScopeInfo stack to determine the nearest
 /// enclosing lambda (to the current lambda) that is 'capture-capable' for 
diff --git a/include/clang/Sema/Template.h b/include/clang/Sema/Template.h
index c092630..cda82a6 100644
--- a/include/clang/Sema/Template.h
+++ b/include/clang/Sema/Template.h
@@ -433,7 +433,8 @@
     Decl *VisitFunctionDecl(FunctionDecl *D,
                             TemplateParameterList *TemplateParams);
     Decl *VisitDecl(Decl *D);
-    Decl *VisitVarDecl(VarDecl *D, bool InstantiatingVarTemplate);
+    Decl *VisitVarDecl(VarDecl *D, bool InstantiatingVarTemplate,
+                       ArrayRef<BindingDecl *> *Bindings = nullptr);
 
     // Enable late instantiation of attributes.  Late instantiated attributes
     // will be stored in LA.
diff --git a/include/clang/Sema/TemplateDeduction.h b/include/clang/Sema/TemplateDeduction.h
index c22c703..ed1e768 100644
--- a/include/clang/Sema/TemplateDeduction.h
+++ b/include/clang/Sema/TemplateDeduction.h
@@ -244,6 +244,10 @@
 /// TODO: In the future, we may need to unify/generalize this with
 /// OverloadCandidate.
 struct TemplateSpecCandidate {
+  /// \brief The declaration that was looked up, together with its access.
+  /// Might be a UsingShadowDecl, but usually a FunctionTemplateDecl.
+  DeclAccessPair FoundDecl;
+
   /// Specialization - The actual specialization that this candidate
   /// represents. When NULL, this may be a built-in candidate.
   Decl *Specialization;
@@ -251,7 +255,8 @@
   /// Template argument deduction info
   DeductionFailureInfo DeductionFailure;
 
-  void set(Decl *Spec, DeductionFailureInfo Info) {
+  void set(DeclAccessPair Found, Decl *Spec, DeductionFailureInfo Info) {
+    FoundDecl = Found;
     Specialization = Spec;
     DeductionFailure = Info;
   }
diff --git a/include/clang/Serialization/ASTBitCodes.h b/include/clang/Serialization/ASTBitCodes.h
index 910c577..47678a1 100644
--- a/include/clang/Serialization/ASTBitCodes.h
+++ b/include/clang/Serialization/ASTBitCodes.h
@@ -175,6 +175,12 @@
         : Begin(R.getBegin().getRawEncoding()),
           End(R.getEnd().getRawEncoding()),
           BitOffset(BitOffset) { }
+      SourceLocation getBegin() const {
+        return SourceLocation::getFromRawEncoding(Begin);
+      }
+      SourceLocation getEnd() const {
+        return SourceLocation::getFromRawEncoding(End);
+      }
     };
 
     /// \brief Source range/offset of a preprocessed entity.
@@ -191,6 +197,9 @@
       void setLocation(SourceLocation L) {
         Loc = L.getRawEncoding();
       }
+      SourceLocation getLocation() const {
+        return SourceLocation::getFromRawEncoding(Loc);
+      }
     };
 
     /// \brief The number of predefined preprocessed entity IDs.
@@ -468,12 +477,7 @@
       /// \brief Record code for pending implicit instantiations.
       PENDING_IMPLICIT_INSTANTIATIONS = 26,
 
-      /// \brief Record code for a decl replacement block.
-      ///
-      /// If a declaration is modified after having been deserialized, and then
-      /// written to a dependent AST file, its ID and offset must be added to
-      /// the replacement block.
-      DECL_REPLACEMENTS = 27,
+      // ID 27 used to be for a list of replacement decls.
 
       /// \brief Record code for an update to a decl context's lookup table.
       ///
@@ -484,13 +488,10 @@
       /// that were modified after being deserialized and need updates.
       DECL_UPDATE_OFFSETS = 29,
 
-      /// \brief Record of updates for a declaration that was modified after
-      /// being deserialized.
-      DECL_UPDATES = 30,
+      // ID 30 used to be a decl update record. These are now in the DECLTYPES
+      // block.
       
-      /// \brief Record code for the table of offsets to CXXBaseSpecifier
-      /// sets.
-      CXX_BASE_SPECIFIER_OFFSETS = 31,
+      // ID 31 used to be a list of offsets to DECL_CXX_BASE_SPECIFIERS records.
 
       /// \brief Record code for \#pragma diagnostic mappings.
       DIAG_PRAGMA_MAPPINGS = 32,
@@ -570,12 +571,16 @@
       /// \brief Record code for potentially unused local typedef names.
       UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES = 52,
 
-      /// \brief Record code for the table of offsets to CXXCtorInitializers
-      /// lists.
-      CXX_CTOR_INITIALIZERS_OFFSETS = 53,
+      // ID 53 used to be a table of constructor initializer records.
 
       /// \brief Delete expressions that will be analyzed later.
-      DELETE_EXPRS_TO_ANALYZE = 54
+      DELETE_EXPRS_TO_ANALYZE = 54,
+
+      /// \brief Record code for \#pragma ms_struct options.
+      MSSTRUCT_PRAGMA_OPTIONS = 55,
+
+      /// \brief Record code for \#pragma pointers_to_members options.
+      POINTERS_TO_MEMBERS_PRAGMA_OPTIONS = 56
     };
 
     /// \brief Record types used within a source manager block.
@@ -591,9 +596,12 @@
       /// SM_SLOC_BUFFER_ENTRY record or a SM_SLOC_FILE_ENTRY with an
       /// overridden buffer.
       SM_SLOC_BUFFER_BLOB = 3,
+      /// \brief Describes a zlib-compressed blob that contains the data for
+      /// a buffer entry.
+      SM_SLOC_BUFFER_BLOB_COMPRESSED = 4,
       /// \brief Describes a source location entry (SLocEntry) for a
       /// macro expansion.
-      SM_SLOC_EXPANSION_ENTRY = 4
+      SM_SLOC_EXPANSION_ENTRY = 5
     };
 
     /// \brief Record types used within a preprocessor block.
@@ -676,6 +684,9 @@
       /// \brief Specifies a header that is private to this submodule but
       /// must be textually included.
       SUBMODULE_PRIVATE_TEXTUAL_HEADER = 15,
+      /// \brief Specifies some declarations with initializers that must be
+      /// emitted to initialize the module.
+      SUBMODULE_INITIALIZERS = 16,
     };
 
     /// \brief Record types used within a comments block.
@@ -772,44 +783,26 @@
       PREDEF_TYPE_PSEUDO_OBJECT = 35,
       /// \brief The placeholder type for builtin functions.
       PREDEF_TYPE_BUILTIN_FN = 36,
-      /// \brief OpenCL 1d image type.
-      PREDEF_TYPE_IMAGE1D_ID    = 37,
-      /// \brief OpenCL 1d image array type.
-      PREDEF_TYPE_IMAGE1D_ARR_ID = 38,
-      /// \brief OpenCL 1d image buffer type.
-      PREDEF_TYPE_IMAGE1D_BUFF_ID = 39,
-      /// \brief OpenCL 2d image type.
-      PREDEF_TYPE_IMAGE2D_ID    = 40,
-      /// \brief OpenCL 2d image array type.
-      PREDEF_TYPE_IMAGE2D_ARR_ID = 41,
-      /// \brief OpenCL 2d image depth type.
-      PREDEF_TYPE_IMAGE2D_DEP_ID = 42,
-      /// \brief OpenCL 2d image array depth type.
-      PREDEF_TYPE_IMAGE2D_ARR_DEP_ID = 43,
-      /// \brief OpenCL 2d image MSAA type.
-      PREDEF_TYPE_IMAGE2D_MSAA_ID = 44,
-      /// \brief OpenCL 2d image array MSAA type.
-      PREDEF_TYPE_IMAGE2D_ARR_MSAA_ID = 45,
-      /// \brief OpenCL 2d image MSAA depth type.
-      PREDEF_TYPE_IMAGE2D_MSAA_DEP_ID = 46,
-      /// \brief OpenCL 2d image array MSAA depth type.
-      PREDEF_TYPE_IMAGE2D_ARR_MSAA_DEPTH_ID = 47,
-      /// \brief OpenCL 3d image type.
-      PREDEF_TYPE_IMAGE3D_ID    = 48,
       /// \brief OpenCL event type.
-      PREDEF_TYPE_EVENT_ID      = 49,
+      PREDEF_TYPE_EVENT_ID      = 37,
       /// \brief OpenCL clk event type.
-      PREDEF_TYPE_CLK_EVENT_ID  = 50,
+      PREDEF_TYPE_CLK_EVENT_ID  = 38,
       /// \brief OpenCL sampler type.
-      PREDEF_TYPE_SAMPLER_ID    = 51,
+      PREDEF_TYPE_SAMPLER_ID    = 39,
       /// \brief OpenCL queue type.
-      PREDEF_TYPE_QUEUE_ID      = 52,
+      PREDEF_TYPE_QUEUE_ID      = 40,
       /// \brief OpenCL ndrange type.
-      PREDEF_TYPE_NDRANGE_ID    = 53,
+      PREDEF_TYPE_NDRANGE_ID    = 41,
       /// \brief OpenCL reserve_id type.
-      PREDEF_TYPE_RESERVE_ID_ID = 54,
+      PREDEF_TYPE_RESERVE_ID_ID = 42,
       /// \brief The placeholder type for OpenMP array section.
-      PREDEF_TYPE_OMP_ARRAY_SECTION = 55
+      PREDEF_TYPE_OMP_ARRAY_SECTION = 43,
+      /// \brief The '__float128' type
+      PREDEF_TYPE_FLOAT128_ID = 44,
+      /// \brief OpenCL image types with auto numeration
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+      PREDEF_TYPE_##Id##_ID,
+#include "clang/Basic/OpenCLImageTypes.def"
     };
 
     /// \brief The number of predefined type IDs that are reserved for
@@ -824,7 +817,7 @@
     /// These constants describe the type records that can occur within a
     /// block identified by DECLTYPES_BLOCK_ID in the AST file. Each
     /// constant describes a record for a specific type class in the
-    /// AST.
+    /// AST. Note that DeclCode values share this code space.
     enum TypeCode {
       /// \brief An ExtQualType record.
       TYPE_EXT_QUAL                 = 1,
@@ -993,23 +986,31 @@
 
       /// \brief The internal '__NSConstantString' tag type.
       PREDEF_DECL_CF_CONSTANT_STRING_TAG_ID = 15,
+
+      /// \brief The internal '__type_pack_element' template.
+      PREDEF_DECL_TYPE_PACK_ELEMENT_ID = 16,
     };
 
     /// \brief The number of declaration IDs that are predefined.
     ///
     /// For more information about predefined declarations, see the
     /// \c PredefinedDeclIDs type and the PREDEF_DECL_*_ID constants.
-    const unsigned int NUM_PREDEF_DECL_IDS = 16;
+    const unsigned int NUM_PREDEF_DECL_IDS = 17;
+
+    /// \brief Record of updates for a declaration that was modified after
+    /// being deserialized. This can occur within DECLTYPES_BLOCK_ID.
+    const unsigned int DECL_UPDATES = 49;
 
     /// \brief Record code for a list of local redeclarations of a declaration.
+    /// This can occur within DECLTYPES_BLOCK_ID.
     const unsigned int LOCAL_REDECLARATIONS = 50;
     
     /// \brief Record codes for each kind of declaration.
     ///
     /// These constants describe the declaration records that can occur within
-    /// a declarations block (identified by DECLS_BLOCK_ID). Each
+    /// a declarations block (identified by DECLTYPES_BLOCK_ID). Each
     /// constant describes a record for a specific declaration class
-    /// in the AST.
+    /// in the AST. Note that TypeCode values share this code space.
     enum DeclCode {
       /// \brief A TypedefDecl record.
       DECL_TYPEDEF = 51,
@@ -1055,6 +1056,10 @@
       DECL_IMPLICIT_PARAM,
       /// \brief A ParmVarDecl record.
       DECL_PARM_VAR,
+      /// \brief A DecompositionDecl record.
+      DECL_DECOMPOSITION,
+      /// \brief A BindingDecl record.
+      DECL_BINDING,
       /// \brief A FileScopeAsmDecl record.
       DECL_FILE_SCOPE_ASM,
       /// \brief A BlockDecl record.
@@ -1088,6 +1093,8 @@
       DECL_USING,
       /// \brief A UsingShadowDecl record.
       DECL_USING_SHADOW,
+      /// \brief A ConstructorUsingShadowDecl record.
+      DECL_CONSTRUCTOR_USING_SHADOW,
       /// \brief A UsingDirecitveDecl record.
       DECL_USING_DIRECTIVE,
       /// \brief An UnresolvedUsingValueDecl record.
@@ -1102,6 +1109,8 @@
       DECL_CXX_METHOD,
       /// \brief A CXXConstructorDecl record.
       DECL_CXX_CONSTRUCTOR,
+      /// \brief A CXXConstructorDecl record for an inherited constructor.
+      DECL_CXX_INHERITED_CONSTRUCTOR,
       /// \brief A CXXDestructorDecl record.
       DECL_CXX_DESTRUCTOR,
       /// \brief A CXXConversionDecl record.
@@ -1160,6 +1169,14 @@
       DECL_EMPTY,
       /// \brief An ObjCTypeParamDecl record.
       DECL_OBJC_TYPE_PARAM,
+      /// \brief An OMPCapturedExprDecl record.
+      DECL_OMP_CAPTUREDEXPR,
+      /// \brief A PragmaCommentDecl record.
+      DECL_PRAGMA_COMMENT,
+      /// \brief A PragmaDetectMismatchDecl record.
+      DECL_PRAGMA_DETECT_MISMATCH,
+      /// \brief An OMPDeclareReductionDecl record.
+      DECL_OMP_DECLARE_REDUCTION,
     };
 
     /// \brief Record codes for each kind of statement or expression.
@@ -1339,8 +1356,10 @@
       STMT_OBJC_AT_THROW,
       /// \brief An ObjCAutoreleasePoolStmt record.
       STMT_OBJC_AUTORELEASE_POOL,
-      /// \brief A ObjCBoolLiteralExpr record.
+      /// \brief An ObjCBoolLiteralExpr record.
       EXPR_OBJC_BOOL_LITERAL,
+      /// \brief An ObjCAvailabilityCheckExpr record.
+      EXPR_OBJC_AVAILABILITY_CHECK,
 
       // C++
       
@@ -1357,6 +1376,8 @@
       EXPR_CXX_MEMBER_CALL,
       /// \brief A CXXConstructExpr record.
       EXPR_CXX_CONSTRUCT,
+      /// \brief A CXXInheritedCtorInitExpr record.
+      EXPR_CXX_INHERITED_CTOR_INIT,
       /// \brief A CXXTemporaryObjectExpr record.
       EXPR_CXX_TEMPORARY_OBJECT,
       /// \brief A CXXStaticCastExpr record.
@@ -1451,6 +1472,10 @@
       STMT_OMP_ATOMIC_DIRECTIVE,
       STMT_OMP_TARGET_DIRECTIVE,
       STMT_OMP_TARGET_DATA_DIRECTIVE,
+      STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE,
+      STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE,
+      STMT_OMP_TARGET_PARALLEL_DIRECTIVE,
+      STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE,
       STMT_OMP_TEAMS_DIRECTIVE,
       STMT_OMP_TASKGROUP_DIRECTIVE,
       STMT_OMP_CANCELLATION_POINT_DIRECTIVE,
@@ -1458,6 +1483,13 @@
       STMT_OMP_TASKLOOP_DIRECTIVE,
       STMT_OMP_TASKLOOP_SIMD_DIRECTIVE,
       STMT_OMP_DISTRIBUTE_DIRECTIVE,
+      STMT_OMP_TARGET_UPDATE_DIRECTIVE,
+      STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE,
+      STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE,
+      STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE,
+      STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE,
+      STMT_OMP_TARGET_SIMD_DIRECTIVE,
+      STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE,
       EXPR_OMP_ARRAY_SECTION,
 
       // ARC
diff --git a/include/clang/Serialization/ASTReader.h b/include/clang/Serialization/ASTReader.h
index a4c8cae..943765d 100644
--- a/include/clang/Serialization/ASTReader.h
+++ b/include/clang/Serialization/ASTReader.h
@@ -18,10 +18,8 @@
 #include "clang/AST/DeclarationName.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/FileSystemOptions.h"
 #include "clang/Basic/IdentifierTable.h"
-#include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Version.h"
 #include "clang/Lex/ExternalPreprocessorSource.h"
 #include "clang/Lex/HeaderSearch.h"
@@ -33,9 +31,6 @@
 #include "clang/Serialization/Module.h"
 #include "clang/Serialization/ModuleFileExtension.h"
 #include "clang/Serialization/ModuleManager.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
@@ -43,11 +38,9 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Bitcode/BitstreamReader.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Timer.h"
 #include <deque>
-#include <map>
 #include <memory>
 #include <string>
 #include <utility>
@@ -55,10 +48,16 @@
 
 namespace llvm {
   class MemoryBuffer;
+  class APInt;
+  class APSInt;
+  class APFloat;
 }
 
 namespace clang {
 
+class SourceManager;
+class HeaderSearchOptions;
+class FileManager;
 class AddrLabelExpr;
 class ASTConsumer;
 class ASTContext;
@@ -475,21 +474,6 @@
   /// declaration that has an exception specification.
   llvm::SmallMapVector<Decl *, FunctionDecl *, 4> PendingExceptionSpecUpdates;
 
-  struct ReplacedDeclInfo {
-    ModuleFile *Mod;
-    uint64_t Offset;
-    unsigned RawLoc;
-
-    ReplacedDeclInfo() : Mod(nullptr), Offset(0), RawLoc(0) {}
-    ReplacedDeclInfo(ModuleFile *Mod, uint64_t Offset, unsigned RawLoc)
-      : Mod(Mod), Offset(Offset), RawLoc(RawLoc) {}
-  };
-
-  typedef llvm::DenseMap<serialization::DeclID, ReplacedDeclInfo>
-      DeclReplacementMap;
-  /// \brief Declarations that have been replaced in a later file in the chain.
-  DeclReplacementMap ReplacedDecls;
-
   /// \brief Declarations that have been imported and have typedef names for
   /// linkage purposes.
   llvm::DenseMap<std::pair<DeclContext*, IdentifierInfo*>, NamedDecl*>
@@ -795,6 +779,13 @@
   /// \brief The pragma clang optimize location (if the pragma state is "off").
   SourceLocation OptimizeOffPragmaLocation;
 
+  /// \brief The PragmaMSStructKind pragma ms_struct state if set, or -1.
+  int PragmaMSStructState;
+
+  /// \brief The PragmaMSPointersToMembersKind pragma pointers_to_members state.
+  int PragmaMSPointersToMembersState;
+  SourceLocation PointersToMembersPragmaLocation;
+
   /// \brief The OpenCL extension settings.
   SmallVector<uint64_t, 1> OpenCLExtensions;
 
@@ -851,6 +842,9 @@
   /// \brief Whether we have tried loading the global module index yet.
   bool TriedLoadingGlobalIndex;
 
+  ///\brief Whether we are currently processing update records.
+  bool ProcessingUpdateRecords;
+
   typedef llvm::DenseMap<unsigned, SwitchCase *> SwitchCaseMapTy;
   /// \brief Mapping from switch-case IDs in the chain to switch-case statements
   ///
@@ -1050,6 +1044,23 @@
     ~ReadingKindTracker() { Reader.ReadingKind = PrevKind; }
   };
 
+  /// \brief RAII object to mark the start of processing updates.
+  class ProcessingUpdatesRAIIObj {
+    ASTReader &Reader;
+    bool PrevState;
+
+    ProcessingUpdatesRAIIObj(const ProcessingUpdatesRAIIObj &) = delete;
+    void operator=(const ProcessingUpdatesRAIIObj &) = delete;
+
+  public:
+    ProcessingUpdatesRAIIObj(ASTReader &reader)
+      : Reader(reader), PrevState(Reader.ProcessingUpdateRecords) {
+      Reader.ProcessingUpdateRecords = true;
+    }
+
+    ~ProcessingUpdatesRAIIObj() { Reader.ProcessingUpdateRecords = PrevState; }
+  };
+
   /// \brief Suggested contents of the predefines buffer, after this
   /// PCH file has been processed.
   ///
@@ -1193,7 +1204,7 @@
   Decl *getMostRecentExistingDecl(Decl *D);
 
   RecordLocation DeclCursorForID(serialization::DeclID ID,
-                                 unsigned &RawLocation);
+                                 SourceLocation &Location);
   void loadDeclUpdateRecords(serialization::DeclID ID, Decl *D);
   void loadPendingDeclChain(Decl *D, uint64_t LocalOffset);
   void loadObjCCategories(serialization::GlobalDeclID ID, ObjCInterfaceDecl *D,
@@ -1374,7 +1385,7 @@
   /// \param ClientLoadCapabilities The set of client load-failure
   /// capabilities, represented as a bitset of the enumerators of
   /// LoadFailureCapabilities.
-  ASTReadResult ReadAST(const std::string &FileName, ModuleKind Type,
+  ASTReadResult ReadAST(StringRef FileName, ModuleKind Type,
                         SourceLocation ImportLoc,
                         unsigned ClientLoadCapabilities);
 
@@ -1706,11 +1717,6 @@
   /// redeclaration chain for \p D.
   void CompleteRedeclChain(const Decl *D) override;
 
-  /// \brief Read a CXXBaseSpecifiers ID form the given record and
-  /// return its global bit offset.
-  uint64_t readCXXBaseSpecifiers(ModuleFile &M, const RecordData &Record,
-                                 unsigned &Idx);
-
   CXXBaseSpecifier *GetExternalCXXBaseSpecifiers(uint64_t Offset) override;
 
   /// \brief Resolve the offset of a statement into a statement.
@@ -1994,18 +2000,27 @@
   ReadCXXCtorInitializers(ModuleFile &F, const RecordData &Record,
                           unsigned &Idx);
 
-  /// \brief Read a CXXCtorInitializers ID from the given record and
-  /// return its global bit offset.
-  uint64_t ReadCXXCtorInitializersRef(ModuleFile &M, const RecordData &Record,
-                                      unsigned &Idx);
-
   /// \brief Read the contents of a CXXCtorInitializer array.
   CXXCtorInitializer **GetExternalCXXCtorInitializers(uint64_t Offset) override;
 
+  /// \brief Read a source location from raw form and return it in its
+  /// originating module file's source location space.
+  SourceLocation ReadUntranslatedSourceLocation(uint32_t Raw) const {
+    return SourceLocation::getFromRawEncoding((Raw >> 1) | (Raw << 31));
+  }
+
   /// \brief Read a source location from raw form.
-  SourceLocation ReadSourceLocation(ModuleFile &ModuleFile, unsigned Raw) const {
-    SourceLocation Loc = SourceLocation::getFromRawEncoding(Raw);
-    assert(ModuleFile.SLocRemap.find(Loc.getOffset()) != ModuleFile.SLocRemap.end() &&
+  SourceLocation ReadSourceLocation(ModuleFile &ModuleFile, uint32_t Raw) const {
+    SourceLocation Loc = ReadUntranslatedSourceLocation(Raw);
+    return TranslateSourceLocation(ModuleFile, Loc);
+  }
+
+  /// \brief Translate a source location from another module file's source
+  /// location space into ours.
+  SourceLocation TranslateSourceLocation(ModuleFile &ModuleFile,
+                                         SourceLocation Loc) const {
+    assert(ModuleFile.SLocRemap.find(Loc.getOffset()) !=
+               ModuleFile.SLocRemap.end() &&
            "Cannot find offset to remap.");
     int Remap = ModuleFile.SLocRemap.find(Loc.getOffset())->second;
     return Loc.getLocWithOffset(Remap);
@@ -2135,6 +2150,8 @@
 
   /// \brief Loads comments ranges.
   void ReadComments() override;
+
+  bool isProcessingUpdateRecords() const { return ProcessingUpdateRecords; }
 };
 
 /// \brief Helper class that saves the current stream position and
diff --git a/include/clang/Serialization/ASTWriter.h b/include/clang/Serialization/ASTWriter.h
index ef8c653..b07b36c 100644
--- a/include/clang/Serialization/ASTWriter.h
+++ b/include/clang/Serialization/ASTWriter.h
@@ -16,9 +16,8 @@
 
 #include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/Decl.h"
-#include "clang/AST/DeclarationName.h"
-#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/AST/TemplateBase.h"
+#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/Sema/SemaConsumer.h"
 #include "clang/Serialization/ASTBitCodes.h"
 #include "clang/Serialization/ASTDeserializationListener.h"
@@ -26,21 +25,19 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Bitcode/BitstreamWriter.h"
-#include <map>
 #include <queue>
 #include <vector>
 
 namespace llvm {
   class APFloat;
   class APInt;
-  class BitstreamWriter;
 }
 
 namespace clang {
 
+class DeclarationName;
 class ASTContext;
 class Attr;
 class NestedNameSpecifier;
@@ -90,6 +87,8 @@
 
   friend class ASTDeclWriter;
   friend class ASTStmtWriter;
+  friend class ASTTypeWriter;
+  friend class ASTRecordWriter;
 private:
   /// \brief Map that provides the ID numbers of each type within the
   /// output stream, plus those deserialized from a chained PCH.
@@ -382,25 +381,6 @@
   /// should serialize.
   llvm::SetVector<ObjCInterfaceDecl *> ObjCClassesWithCategories;
                     
-  struct ReplacedDeclInfo {
-    serialization::DeclID ID;
-    uint64_t Offset;
-    unsigned Loc;
-
-    ReplacedDeclInfo() : ID(0), Offset(0), Loc(0) {}
-    ReplacedDeclInfo(serialization::DeclID ID, uint64_t Offset,
-                     SourceLocation Loc)
-      : ID(ID), Offset(Offset), Loc(Loc.getRawEncoding()) {}
-  };
-
-  /// \brief Decls that have been replaced in the current dependent AST file.
-  ///
-  /// When a decl changes fundamentally after being deserialized (this shouldn't
-  /// happen, but the ObjC AST nodes are designed this way), it will be
-  /// serialized again. In this case, it is registered here, so that the reader
-  /// knows to read the updated version.
-  SmallVector<ReplacedDeclInfo, 16> ReplacedDecls;
-                 
   /// \brief The set of declarations that may have redeclaration chains that
   /// need to be serialized.
   llvm::SmallVector<const Decl *, 16> Redeclarations;
@@ -409,14 +389,6 @@
   /// redeclaration chains.
   llvm::DenseMap<const Decl *, const Decl *> FirstLocalDeclCache;
                                       
-  /// \brief Statements that we've encountered while serializing a
-  /// declaration or type.
-  SmallVector<Stmt *, 16> StmtsToEmit;
-
-  /// \brief Statements collection to use for ASTWriter::AddStmt().
-  /// It will point to StmtsToEmit unless it is overriden.
-  SmallVector<Stmt *, 16> *CollectedStmts;
-
   /// \brief Mapping from SwitchCase statements to IDs.
   llvm::DenseMap<SwitchCase *, unsigned> SwitchCaseIDs;
 
@@ -434,62 +406,6 @@
   /// file.
   unsigned NumVisibleDeclContexts;
 
-  /// \brief The offset of each CXXBaseSpecifier set within the AST.
-  SmallVector<uint32_t, 16> CXXBaseSpecifiersOffsets;
-
-  /// \brief The first ID number we can use for our own base specifiers.
-  serialization::CXXBaseSpecifiersID FirstCXXBaseSpecifiersID;
-
-  /// \brief The base specifiers ID that will be assigned to the next new
-  /// set of C++ base specifiers.
-  serialization::CXXBaseSpecifiersID NextCXXBaseSpecifiersID;
-
-  /// \brief A set of C++ base specifiers that is queued to be written into the
-  /// AST file.
-  struct QueuedCXXBaseSpecifiers {
-    QueuedCXXBaseSpecifiers() : ID(), Bases(), BasesEnd() { }
-
-    QueuedCXXBaseSpecifiers(serialization::CXXBaseSpecifiersID ID,
-                            CXXBaseSpecifier const *Bases,
-                            CXXBaseSpecifier const *BasesEnd)
-      : ID(ID), Bases(Bases), BasesEnd(BasesEnd) { }
-
-    serialization::CXXBaseSpecifiersID ID;
-    CXXBaseSpecifier const * Bases;
-    CXXBaseSpecifier const * BasesEnd;
-  };
-
-  /// \brief Queue of C++ base specifiers to be written to the AST file,
-  /// in the order they should be written.
-  SmallVector<QueuedCXXBaseSpecifiers, 2> CXXBaseSpecifiersToWrite;
-
-  /// \brief The offset of each CXXCtorInitializer list within the AST.
-  SmallVector<uint32_t, 16> CXXCtorInitializersOffsets;
-
-  /// \brief The first ID number we can use for our own ctor initializers.
-  serialization::CXXCtorInitializersID FirstCXXCtorInitializersID;
-
-  /// \brief The ctor initializers ID that will be assigned to the next new
-  /// list of C++ ctor initializers.
-  serialization::CXXCtorInitializersID NextCXXCtorInitializersID;
-
-  /// \brief A set of C++ ctor initializers that is queued to be written
-  /// into the AST file.
-  struct QueuedCXXCtorInitializers {
-    QueuedCXXCtorInitializers() : ID() {}
-
-    QueuedCXXCtorInitializers(serialization::CXXCtorInitializersID ID,
-                              ArrayRef<CXXCtorInitializer*> Inits)
-        : ID(ID), Inits(Inits) {}
-
-    serialization::CXXCtorInitializersID ID;
-    ArrayRef<CXXCtorInitializer*> Inits;
-  };
-
-  /// \brief Queue of C++ ctor initializers to be written to the AST file,
-  /// in the order they should be written.
-  SmallVector<QueuedCXXCtorInitializers, 2> CXXCtorInitializersToWrite;
-
   /// \brief A mapping from each known submodule to its ID number, which will
   /// be a positive integer.
   llvm::DenseMap<Module *, unsigned> SubmoduleIDs;
@@ -502,9 +418,7 @@
   unsigned getSubmoduleID(Module *Mod);
 
   /// \brief Write the given subexpression to the bitstream.
-  void WriteSubStmt(Stmt *S,
-                    llvm::DenseMap<Stmt *, uint64_t> &SubStmtEntries,
-                    llvm::DenseSet<Stmt *> &ParentStmts);
+  void WriteSubStmt(Stmt *S);
 
   void WriteBlockInfoBlock();
   uint64_t WriteControlBlock(Preprocessor &PP, ASTContext &Context,
@@ -520,8 +434,6 @@
                                         
   void WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag,
                                      bool isModule);
-  void WriteCXXBaseSpecifiersOffsets();
-  void WriteCXXCtorInitializersOffsets();
 
   unsigned TypeExtQualAbbrev;
   unsigned TypeFunctionProtoAbbrev;
@@ -542,15 +454,15 @@
   void WriteReferencedSelectorsPool(Sema &SemaRef);
   void WriteIdentifierTable(Preprocessor &PP, IdentifierResolver &IdResolver,
                             bool IsModule);
-  void WriteAttributes(ArrayRef<const Attr*> Attrs, RecordDataImpl &Record);
   void WriteDeclUpdatesBlocks(RecordDataImpl &OffsetsRecord);
-  void WriteDeclReplacementsBlock();
   void WriteDeclContextVisibleUpdate(const DeclContext *DC);
   void WriteFPPragmaOptions(const FPOptions &Opts);
   void WriteOpenCLExtensions(Sema &SemaRef);
   void WriteObjCCategories();
   void WriteLateParsedTemplates(Sema &SemaRef);
   void WriteOptimizePragmaOptions(Sema &SemaRef);
+  void WriteMSStructPragmaOptions(Sema &SemaRef);
+  void WriteMSPointersToMembersPragmaOptions(Sema &SemaRef);
   void WriteModuleFileExtension(Sema &SemaRef,
                                 ModuleFileExtensionWriter &Writer);
 
@@ -573,7 +485,6 @@
 
   void WriteDeclAbbrevs();
   void WriteDecl(ASTContext &Context, Decl *D);
-  void AddFunctionDefinition(const FunctionDecl *FD, RecordData &Record);
 
   uint64_t WriteASTCore(Sema &SemaRef,
                         StringRef isysroot, const std::string &OutputFile,
@@ -621,29 +532,9 @@
   /// \brief Emit a source range.
   void AddSourceRange(SourceRange Range, RecordDataImpl &Record);
 
-  /// \brief Emit an integral value.
-  void AddAPInt(const llvm::APInt &Value, RecordDataImpl &Record);
-
-  /// \brief Emit a signed integral value.
-  void AddAPSInt(const llvm::APSInt &Value, RecordDataImpl &Record);
-
-  /// \brief Emit a floating-point value.
-  void AddAPFloat(const llvm::APFloat &Value, RecordDataImpl &Record);
-
   /// \brief Emit a reference to an identifier.
   void AddIdentifierRef(const IdentifierInfo *II, RecordDataImpl &Record);
 
-  /// \brief Emit a Selector (which is a smart pointer reference).
-  void AddSelectorRef(Selector, RecordDataImpl &Record);
-
-  /// \brief Emit a CXXTemporary.
-  void AddCXXTemporary(const CXXTemporary *Temp, RecordDataImpl &Record);
-
-  /// \brief Emit a set of C++ base specifiers to the record.
-  void AddCXXBaseSpecifiersRef(CXXBaseSpecifier const *Bases,
-                               CXXBaseSpecifier const *BasesEnd,
-                               RecordDataImpl &Record);
-
   /// \brief Get the unique number used to refer to the given selector.
   serialization::SelectorID getSelectorRef(Selector Sel);
 
@@ -667,30 +558,21 @@
   /// \brief Determine the type ID of an already-emitted type.
   serialization::TypeID getTypeID(QualType T) const;
 
-  /// \brief Emits a reference to a declarator info.
-  void AddTypeSourceInfo(TypeSourceInfo *TInfo, RecordDataImpl &Record);
-
-  /// \brief Emits a type with source-location information.
-  void AddTypeLoc(TypeLoc TL, RecordDataImpl &Record);
-
-  /// \brief Emits a template argument location info.
-  void AddTemplateArgumentLocInfo(TemplateArgument::ArgKind Kind,
-                                  const TemplateArgumentLocInfo &Arg,
-                                  RecordDataImpl &Record);
-
-  /// \brief Emits a template argument location.
-  void AddTemplateArgumentLoc(const TemplateArgumentLoc &Arg,
-                              RecordDataImpl &Record);
-
-  /// \brief Emits an AST template argument list info.
-  void AddASTTemplateArgumentListInfo(
-                          const ASTTemplateArgumentListInfo *ASTTemplArgList,
-                          RecordDataImpl &Record);
-
   /// \brief Find the first local declaration of a given local redeclarable
   /// decl.
   const Decl *getFirstLocalDecl(const Decl *D);
 
+  /// \brief Is this a local declaration (that is, one that will be written to
+  /// our AST file)? This is the case for declarations that are neither imported
+  /// from another AST file nor predefined.
+  bool IsLocalDecl(const Decl *D) {
+    if (D->isFromASTFile())
+      return false;
+    auto I = DeclIDs.find(D);
+    return (I == DeclIDs.end() ||
+            I->second >= serialization::NUM_PREDEF_DECL_IDS);
+  }
+
   /// \brief Emit a reference to a declaration.
   void AddDeclRef(const Decl *D, RecordDataImpl &Record);
 
@@ -702,57 +584,8 @@
   /// declaration.
   serialization::DeclID getDeclID(const Decl *D);
 
-  /// \brief Emit a declaration name.
-  void AddDeclarationName(DeclarationName Name, RecordDataImpl &Record);
-  void AddDeclarationNameLoc(const DeclarationNameLoc &DNLoc,
-                             DeclarationName Name, RecordDataImpl &Record);
-  void AddDeclarationNameInfo(const DeclarationNameInfo &NameInfo,
-                              RecordDataImpl &Record);
   unsigned getAnonymousDeclarationNumber(const NamedDecl *D);
 
-  void AddQualifierInfo(const QualifierInfo &Info, RecordDataImpl &Record);
-
-  /// \brief Emit a nested name specifier.
-  void AddNestedNameSpecifier(NestedNameSpecifier *NNS, RecordDataImpl &Record);
-
-  /// \brief Emit a nested name specifier with source-location information.
-  void AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS,
-                                 RecordDataImpl &Record);
-
-  /// \brief Emit a template name.
-  void AddTemplateName(TemplateName Name, RecordDataImpl &Record);
-
-  /// \brief Emit a template argument.
-  void AddTemplateArgument(const TemplateArgument &Arg, RecordDataImpl &Record);
-
-  /// \brief Emit a template parameter list.
-  void AddTemplateParameterList(const TemplateParameterList *TemplateParams,
-                                RecordDataImpl &Record);
-
-  /// \brief Emit a template argument list.
-  void AddTemplateArgumentList(const TemplateArgumentList *TemplateArgs,
-                                RecordDataImpl &Record);
-
-  /// \brief Emit a UnresolvedSet structure.
-  void AddUnresolvedSet(const ASTUnresolvedSet &Set, RecordDataImpl &Record);
-
-  /// \brief Emit a C++ base specifier.
-  void AddCXXBaseSpecifier(const CXXBaseSpecifier &Base,
-                           RecordDataImpl &Record);
-
-  /// \brief Emit the ID for a CXXCtorInitializer array and register the array
-  /// for later serialization.
-  void AddCXXCtorInitializersRef(ArrayRef<CXXCtorInitializer *> Inits,
-                                 RecordDataImpl &Record);
-
-  /// \brief Emit a CXXCtorInitializer array.
-  void AddCXXCtorInitializers(
-                             const CXXCtorInitializer * const *CtorInitializers,
-                             unsigned NumCtorInitializers,
-                             RecordDataImpl &Record);
-
-  void AddCXXDefinitionData(const CXXRecordDecl *D, RecordDataImpl &Record);
-
   /// \brief Add a string to the given record.
   void AddString(StringRef Str, RecordDataImpl &Record);
 
@@ -787,38 +620,6 @@
   /// within the method pool/selector table.
   void SetSelectorOffset(Selector Sel, uint32_t Offset);
 
-  /// \brief Add the given statement or expression to the queue of
-  /// statements to emit.
-  ///
-  /// This routine should be used when emitting types and declarations
-  /// that have expressions as part of their formulation. Once the
-  /// type or declaration has been written, call FlushStmts() to write
-  /// the corresponding statements just after the type or
-  /// declaration.
-  void AddStmt(Stmt *S) {
-      CollectedStmts->push_back(S);
-  }
-
-  /// \brief Flush all of the statements and expressions that have
-  /// been added to the queue via AddStmt().
-  void FlushStmts();
-
-  /// \brief Flush all of the C++ base specifier sets that have been added
-  /// via \c AddCXXBaseSpecifiersRef().
-  void FlushCXXBaseSpecifiers();
-
-  /// \brief Flush all of the C++ constructor initializer lists that have been
-  /// added via \c AddCXXCtorInitializersRef().
-  void FlushCXXCtorInitializers();
-
-  /// \brief Flush all pending records that are tacked onto the end of
-  /// decl and decl update records.
-  void FlushPendingAfterDecl() {
-    FlushStmts();
-    FlushCXXBaseSpecifiers();
-    FlushCXXCtorInitializers();
-  }
-
   /// \brief Record an ID for the given switch-case statement.
   unsigned RecordSwitchCaseID(SwitchCase *S);
 
@@ -851,6 +652,7 @@
   bool hasChain() const { return Chain; }
   ASTReader *getChain() const { return Chain; }
 
+private:
   // ASTDeserializationListener implementation
   void ReaderInitialized(ASTReader *Reader) override;
   void IdentifierRead(serialization::IdentID ID, IdentifierInfo *II) override;
@@ -872,16 +674,236 @@
   void CompletedImplicitDefinition(const FunctionDecl *D) override;
   void StaticDataMemberInstantiated(const VarDecl *D) override;
   void DefaultArgumentInstantiated(const ParmVarDecl *D) override;
+  void DefaultMemberInitializerInstantiated(const FieldDecl *D) override;
   void FunctionDefinitionInstantiated(const FunctionDecl *D) override;
   void AddedObjCCategoryToInterface(const ObjCCategoryDecl *CatD,
                                     const ObjCInterfaceDecl *IFD) override;
   void DeclarationMarkedUsed(const Decl *D) override;
   void DeclarationMarkedOpenMPThreadPrivate(const Decl *D) override;
+  void DeclarationMarkedOpenMPDeclareTarget(const Decl *D,
+                                            const Attr *Attr) override;
   void RedefinedHiddenDefinition(const NamedDecl *D, Module *M) override;
   void AddedAttributeToRecord(const Attr *Attr,
                               const RecordDecl *Record) override;
 };
 
+/// \brief An object for streaming information to a record.
+class ASTRecordWriter {
+  ASTWriter *Writer;
+  ASTWriter::RecordDataImpl *Record;
+
+  /// \brief Statements that we've encountered while serializing a
+  /// declaration or type.
+  SmallVector<Stmt *, 16> StmtsToEmit;
+
+  /// \brief Indices of record elements that describe offsets within the
+  /// bitcode. These will be converted to offsets relative to the current
+  /// record when emitted.
+  SmallVector<unsigned, 8> OffsetIndices;
+
+  /// \brief Flush all of the statements and expressions that have
+  /// been added to the queue via AddStmt().
+  void FlushStmts();
+  void FlushSubStmts();
+
+  void PrepareToEmit(uint64_t MyOffset) {
+    // Convert offsets into relative form.
+    for (unsigned I : OffsetIndices) {
+      auto &StoredOffset = (*Record)[I];
+      assert(StoredOffset < MyOffset && "invalid offset");
+      if (StoredOffset)
+        StoredOffset = MyOffset - StoredOffset;
+    }
+    OffsetIndices.clear();
+  }
+
+public:
+  /// Construct an ASTRecordWriter that uses the default encoding scheme.
+  ASTRecordWriter(ASTWriter &Writer, ASTWriter::RecordDataImpl &Record)
+      : Writer(&Writer), Record(&Record) {}
+
+  /// Construct an ASTRecordWriter that uses the same encoding scheme as another
+  /// ASTRecordWriter.
+  ASTRecordWriter(ASTRecordWriter &Parent, ASTWriter::RecordDataImpl &Record)
+      : Writer(Parent.Writer), Record(&Record) {}
+
+  /// Copying an ASTRecordWriter is almost certainly a bug.
+  ASTRecordWriter(const ASTRecordWriter&) = delete;
+  void operator=(const ASTRecordWriter&) = delete;
+
+  /// \brief Extract the underlying record storage.
+  ASTWriter::RecordDataImpl &getRecordData() const { return *Record; }
+
+  /// \brief Minimal vector-like interface.
+  /// @{
+  void push_back(uint64_t N) { Record->push_back(N); }
+  template<typename InputIterator>
+  void append(InputIterator begin, InputIterator end) {
+    Record->append(begin, end);
+  }
+  bool empty() const { return Record->empty(); }
+  size_t size() const { return Record->size(); }
+  uint64_t &operator[](size_t N) { return (*Record)[N]; }
+  /// @}
+
+  /// \brief Emit the record to the stream, followed by its substatements, and
+  /// return its offset.
+  // FIXME: Allow record producers to suggest Abbrevs.
+  uint64_t Emit(unsigned Code, unsigned Abbrev = 0) {
+    uint64_t Offset = Writer->Stream.GetCurrentBitNo();
+    PrepareToEmit(Offset);
+    Writer->Stream.EmitRecord(Code, *Record, Abbrev);
+    FlushStmts();
+    return Offset;
+  }
+
+  /// \brief Emit the record to the stream, preceded by its substatements.
+  uint64_t EmitStmt(unsigned Code, unsigned Abbrev = 0) {
+    FlushSubStmts();
+    PrepareToEmit(Writer->Stream.GetCurrentBitNo());
+    Writer->Stream.EmitRecord(Code, *Record, Abbrev);
+    return Writer->Stream.GetCurrentBitNo();
+  }
+
+  /// \brief Add a bit offset into the record. This will be converted into an
+  /// offset relative to the current record when emitted.
+  void AddOffset(uint64_t BitOffset) {
+    OffsetIndices.push_back(Record->size());
+    Record->push_back(BitOffset);
+  }
+
+  /// \brief Add the given statement or expression to the queue of
+  /// statements to emit.
+  ///
+  /// This routine should be used when emitting types and declarations
+  /// that have expressions as part of their formulation. Once the
+  /// type or declaration has been written, Emit() will write
+  /// the corresponding statements just after the record.
+  void AddStmt(Stmt *S) {
+    StmtsToEmit.push_back(S);
+  }
+
+  /// \brief Add a definition for the given function to the queue of statements
+  /// to emit.
+  void AddFunctionDefinition(const FunctionDecl *FD);
+
+  /// \brief Emit a source location.
+  void AddSourceLocation(SourceLocation Loc) {
+    return Writer->AddSourceLocation(Loc, *Record);
+  }
+
+  /// \brief Emit a source range.
+  void AddSourceRange(SourceRange Range) {
+    return Writer->AddSourceRange(Range, *Record);
+  }
+
+  /// \brief Emit an integral value.
+  void AddAPInt(const llvm::APInt &Value);
+
+  /// \brief Emit a signed integral value.
+  void AddAPSInt(const llvm::APSInt &Value);
+
+  /// \brief Emit a floating-point value.
+  void AddAPFloat(const llvm::APFloat &Value);
+
+  /// \brief Emit a reference to an identifier.
+  void AddIdentifierRef(const IdentifierInfo *II) {
+    return Writer->AddIdentifierRef(II, *Record);
+  }
+
+  /// \brief Emit a Selector (which is a smart pointer reference).
+  void AddSelectorRef(Selector S);
+
+  /// \brief Emit a CXXTemporary.
+  void AddCXXTemporary(const CXXTemporary *Temp);
+
+  /// \brief Emit a C++ base specifier.
+  void AddCXXBaseSpecifier(const CXXBaseSpecifier &Base);
+
+  /// \brief Emit a set of C++ base specifiers.
+  void AddCXXBaseSpecifiers(ArrayRef<CXXBaseSpecifier> Bases);
+
+  /// \brief Emit a reference to a type.
+  void AddTypeRef(QualType T) {
+    return Writer->AddTypeRef(T, *Record);
+  }
+
+  /// \brief Emits a reference to a declarator info.
+  void AddTypeSourceInfo(TypeSourceInfo *TInfo);
+
+  /// \brief Emits a type with source-location information.
+  void AddTypeLoc(TypeLoc TL);
+
+  /// \brief Emits a template argument location info.
+  void AddTemplateArgumentLocInfo(TemplateArgument::ArgKind Kind,
+                                  const TemplateArgumentLocInfo &Arg);
+
+  /// \brief Emits a template argument location.
+  void AddTemplateArgumentLoc(const TemplateArgumentLoc &Arg);
+
+  /// \brief Emits an AST template argument list info.
+  void AddASTTemplateArgumentListInfo(
+      const ASTTemplateArgumentListInfo *ASTTemplArgList);
+
+  /// \brief Emit a reference to a declaration.
+  void AddDeclRef(const Decl *D) {
+    return Writer->AddDeclRef(D, *Record);
+  }
+
+  /// \brief Emit a declaration name.
+  void AddDeclarationName(DeclarationName Name);
+
+  void AddDeclarationNameLoc(const DeclarationNameLoc &DNLoc,
+                             DeclarationName Name);
+  void AddDeclarationNameInfo(const DeclarationNameInfo &NameInfo);
+
+  void AddQualifierInfo(const QualifierInfo &Info);
+
+  /// \brief Emit a nested name specifier.
+  void AddNestedNameSpecifier(NestedNameSpecifier *NNS);
+
+  /// \brief Emit a nested name specifier with source-location information.
+  void AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS);
+
+  /// \brief Emit a template name.
+  void AddTemplateName(TemplateName Name);
+
+  /// \brief Emit a template argument.
+  void AddTemplateArgument(const TemplateArgument &Arg);
+
+  /// \brief Emit a template parameter list.
+  void AddTemplateParameterList(const TemplateParameterList *TemplateParams);
+
+  /// \brief Emit a template argument list.
+  void AddTemplateArgumentList(const TemplateArgumentList *TemplateArgs);
+
+  /// \brief Emit a UnresolvedSet structure.
+  void AddUnresolvedSet(const ASTUnresolvedSet &Set);
+
+  /// \brief Emit a CXXCtorInitializer array.
+  void AddCXXCtorInitializers(ArrayRef<CXXCtorInitializer*> CtorInits);
+
+  void AddCXXDefinitionData(const CXXRecordDecl *D);
+
+  /// \brief Emit a string.
+  void AddString(StringRef Str) {
+    return Writer->AddString(Str, *Record);
+  }
+
+  /// \brief Emit a path.
+  void AddPath(StringRef Path) {
+    return Writer->AddPath(Path, *Record);
+  }
+
+  /// \brief Emit a version tuple.
+  void AddVersionTuple(const VersionTuple &Version) {
+    return Writer->AddVersionTuple(Version, *Record);
+  }
+
+  /// \brief Emit a list of attributes.
+  void AddAttributes(ArrayRef<const Attr*> Attrs);
+};
+
 /// \brief AST and semantic-analysis consumer that generates a
 /// precompiled header from the parsed source code.
 class PCHGenerator : public SemaConsumer {
diff --git a/include/clang/Serialization/Makefile b/include/clang/Serialization/Makefile
deleted file mode 100644
index 386f453..0000000
--- a/include/clang/Serialization/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-CLANG_LEVEL := ../../..
-TD_SRC_DIR = $(PROJ_SRC_DIR)/../Basic
-BUILT_SOURCES = AttrPCHRead.inc AttrPCHWrite.inc
-
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-$(ObjDir)/AttrPCHRead.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang PCH reader with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-pch-read -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
-
-$(ObjDir)/AttrPCHWrite.inc.tmp : $(TD_SRC_DIR)/Attr.td $(CLANG_TBLGEN) \
-                              $(ObjDir)/.dir
-	$(Echo) "Building Clang PCH writer with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-attr-pch-write -o $(call SYSPATH, $@) \
-		-I $(PROJ_SRC_DIR)/../../ $<
diff --git a/include/clang/Serialization/Module.h b/include/clang/Serialization/Module.h
index 0e56dfba..aa0392f 100644
--- a/include/clang/Serialization/Module.h
+++ b/include/clang/Serialization/Module.h
@@ -48,8 +48,7 @@
   MK_ExplicitModule, ///< File is an explicitly-loaded module.
   MK_PCH,            ///< File is a PCH file treated as such.
   MK_Preamble,       ///< File is a PCH file treated as the preamble.
-  MK_MainFile,       ///< File is a PCH file treated as the actual main file.
-  MK_PrebuiltModule  ///< File is from a prebuilt module path.
+  MK_MainFile        ///< File is a PCH file treated as the actual main file.
 };
 
 /// \brief The input file that has been loaded from this AST file, along with
@@ -400,20 +399,6 @@
   /// as a local ID (for this module file).
   llvm::DenseMap<ModuleFile *, serialization::DeclID> GlobalToLocalDeclIDs;
 
-  /// \brief The number of C++ base specifier sets in this AST file.
-  unsigned LocalNumCXXBaseSpecifiers;
-
-  /// \brief Offset of each C++ base specifier set within the bitstream,
-  /// indexed by the C++ base specifier set ID (-1).
-  const uint32_t *CXXBaseSpecifiersOffsets;
-
-  /// \brief The number of C++ ctor initializer lists in this AST file.
-  unsigned LocalNumCXXCtorInitializers;
-
-  /// \brief Offset of each C++ ctor initializer list within the bitstream,
-  /// indexed by the C++ ctor initializer list ID minus 1.
-  const uint32_t *CXXCtorInitializersOffsets;
-
   /// \brief Array of file-level DeclIDs sorted by file.
   const serialization::DeclID *FileSortedDecls;
   unsigned NumFileSortedDecls;
@@ -462,8 +447,7 @@
 
   /// \brief Is this a module file for a module (rather than a PCH or similar).
   bool isModule() const {
-    return Kind == MK_ImplicitModule || Kind == MK_ExplicitModule ||
-           Kind == MK_PrebuiltModule;
+    return Kind == MK_ImplicitModule || Kind == MK_ExplicitModule;
   }
 
   /// \brief Dump debugging output for this module.
diff --git a/include/clang/StaticAnalyzer/Checkers/CMakeLists.txt b/include/clang/StaticAnalyzer/Checkers/CMakeLists.txt
new file mode 100644
index 0000000..37dd9e8
--- /dev/null
+++ b/include/clang/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -0,0 +1,4 @@
+clang_tablegen(Checkers.inc -gen-clang-sa-checkers
+  -I ${CMAKE_CURRENT_SOURCE_DIR}/../../../
+  SOURCE Checkers.td
+  TARGET ClangSACheckers)
diff --git a/include/clang/StaticAnalyzer/Checkers/Checkers.td b/include/clang/StaticAnalyzer/Checkers/Checkers.td
new file mode 100644
index 0000000..e3ce6f3
--- /dev/null
+++ b/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -0,0 +1,679 @@
+//===--- Checkers.td - Static Analyzer Checkers ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "clang/StaticAnalyzer/Checkers/CheckerBase.td"
+
+//===----------------------------------------------------------------------===//
+// Packages.
+//===----------------------------------------------------------------------===//
+
+// The Alpha package is for checkers that have too many false positives to be
+// turned on by default. The hierarchy under Alpha should be organized in the
+// hierarchy checkers would have had if they were truly at the top level.
+// (For example, a Cocoa-specific checker that is alpha should be in
+// alpha.osx.cocoa).
+def Alpha : Package<"alpha">;
+
+def Core : Package<"core">;
+def CoreBuiltin : Package<"builtin">, InPackage<Core>;
+def CoreUninitialized  : Package<"uninitialized">, InPackage<Core>;
+def CoreAlpha : Package<"core">, InPackage<Alpha>, Hidden;
+
+// The OptIn package is for checkers that are not alpha and that would normally
+// be on by default but where the driver does not have enough information to
+// determine when they are applicable. For example, localizability checkers fit
+// this criterion because the driver cannot determine whether a project is
+// localized or not -- this is best determined at the IDE or build-system level.
+//
+// The checker hierarchy under OptIn should mirror that in Alpha: checkers
+// should be organized as if they were at the top level.
+//
+// Note: OptIn is *not* intended for checkers that are too noisy to be on by
+// default. Such checkers belong in the alpha package.
+def OptIn : Package<"optin">;
+
+def Nullability : Package<"nullability">;
+
+def Cplusplus : Package<"cplusplus">;
+def CplusplusAlpha : Package<"cplusplus">, InPackage<Alpha>, Hidden;
+
+def DeadCode : Package<"deadcode">;
+def DeadCodeAlpha : Package<"deadcode">, InPackage<Alpha>, Hidden;
+
+def Performance : Package<"performance">, InPackage<OptIn>;
+
+def Security : Package <"security">;
+def InsecureAPI : Package<"insecureAPI">, InPackage<Security>;
+def SecurityAlpha : Package<"security">, InPackage<Alpha>, Hidden;
+def Taint : Package<"taint">, InPackage<SecurityAlpha>, Hidden;
+
+def Unix : Package<"unix">;
+def UnixAlpha : Package<"unix">, InPackage<Alpha>, Hidden;
+def CString : Package<"cstring">, InPackage<Unix>, Hidden;
+def CStringAlpha : Package<"cstring">, InPackage<UnixAlpha>, Hidden;
+
+def OSX : Package<"osx">;
+def OSXAlpha : Package<"osx">, InPackage<Alpha>, Hidden;
+def OSXOptIn : Package<"osx">, InPackage<OptIn>;
+
+def Cocoa : Package<"cocoa">, InPackage<OSX>;
+def CocoaAlpha : Package<"cocoa">, InPackage<OSXAlpha>, Hidden;
+def CocoaOptIn : Package<"cocoa">, InPackage<OSXOptIn>;
+
+def CoreFoundation : Package<"coreFoundation">, InPackage<OSX>;
+def Containers : Package<"containers">, InPackage<CoreFoundation>;
+
+def LocalizabilityAlpha : Package<"localizability">, InPackage<CocoaAlpha>;
+def LocalizabilityOptIn : Package<"localizability">, InPackage<CocoaOptIn>;
+
+def MPI : Package<"mpi">, InPackage<OptIn>;
+
+def LLVM : Package<"llvm">;
+def Debug : Package<"debug">;
+
+def CloneDetectionAlpha : Package<"clone">, InPackage<Alpha>, Hidden;
+
+//===----------------------------------------------------------------------===//
+// Core Checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = Core in {
+
+def DereferenceChecker : Checker<"NullDereference">,
+  HelpText<"Check for dereferences of null pointers">,
+  DescFile<"DereferenceChecker.cpp">;
+
+def CallAndMessageChecker : Checker<"CallAndMessage">,
+  HelpText<"Check for logical errors for function calls and Objective-C message expressions (e.g., uninitialized arguments, null function pointers)">,
+  DescFile<"CallAndMessageChecker.cpp">;
+
+def NonNullParamChecker : Checker<"NonNullParamChecker">,
+  HelpText<"Check for null pointers passed as arguments to a function whose arguments are references or marked with the 'nonnull' attribute">,
+  DescFile<"NonNullParamChecker.cpp">;
+
+def VLASizeChecker : Checker<"VLASize">,
+  HelpText<"Check for declarations of VLA of undefined or zero size">,
+  DescFile<"VLASizeChecker.cpp">;
+
+def DivZeroChecker : Checker<"DivideZero">,
+  HelpText<"Check for division by zero">,
+  DescFile<"DivZeroChecker.cpp">;
+
+def UndefResultChecker : Checker<"UndefinedBinaryOperatorResult">,
+  HelpText<"Check for undefined results of binary operators">,
+  DescFile<"UndefResultChecker.cpp">;
+
+def StackAddrEscapeChecker : Checker<"StackAddressEscape">,
+  HelpText<"Check that addresses to stack memory do not escape the function">,
+  DescFile<"StackAddrEscapeChecker.cpp">;
+
+def DynamicTypePropagation : Checker<"DynamicTypePropagation">,
+  HelpText<"Generate dynamic type information">,
+  DescFile<"DynamicTypePropagation.cpp">;
+
+} // end "core"
+
+let ParentPackage = CoreAlpha in {
+
+def BoolAssignmentChecker : Checker<"BoolAssignment">,
+  HelpText<"Warn about assigning non-{0,1} values to Boolean variables">,
+  DescFile<"BoolAssignmentChecker.cpp">;
+
+def CastSizeChecker : Checker<"CastSize">,
+  HelpText<"Check when casting a malloc'ed type T, whether the size is a multiple of the size of T">,
+  DescFile<"CastSizeChecker.cpp">;
+
+def CastToStructChecker : Checker<"CastToStruct">,
+  HelpText<"Check for cast from non-struct pointer to struct pointer">,
+  DescFile<"CastToStructChecker.cpp">;
+
+def IdenticalExprChecker : Checker<"IdenticalExpr">,
+  HelpText<"Warn about unintended use of identical expressions in operators">,
+  DescFile<"IdenticalExprChecker.cpp">;
+
+def FixedAddressChecker : Checker<"FixedAddr">,
+  HelpText<"Check for assignment of a fixed address to a pointer">,
+  DescFile<"FixedAddressChecker.cpp">;
+
+def PointerArithChecker : Checker<"PointerArithm">,
+  HelpText<"Check for pointer arithmetic on locations other than array elements">,
+  DescFile<"PointerArithChecker.cpp">;
+
+def PointerSubChecker : Checker<"PointerSub">,
+  HelpText<"Check for pointer subtractions on two pointers pointing to different memory chunks">,
+  DescFile<"PointerSubChecker.cpp">;
+
+def SizeofPointerChecker : Checker<"SizeofPtr">,
+  HelpText<"Warn about unintended use of sizeof() on pointer expressions">,
+  DescFile<"CheckSizeofPointer.cpp">;
+
+def CallAndMessageUnInitRefArg : Checker<"CallAndMessageUnInitRefArg">,
+  HelpText<"Check for logical errors for function calls and Objective-C message expressions (e.g., uninitialized arguments, null function pointers, and pointer to undefined variables)">,
+  DescFile<"CallAndMessageChecker.cpp">;
+
+def TestAfterDivZeroChecker : Checker<"TestAfterDivZero">,
+  HelpText<"Check for division by variable that is later compared against 0. Either the comparison is useless or there is division by zero.">,
+  DescFile<"TestAfterDivZeroChecker.cpp">;
+
+def DynamicTypeChecker : Checker<"DynamicTypeChecker">,
+  HelpText<"Check for cases where the dynamic and the static type of an object are unrelated.">,
+  DescFile<"DynamicTypeChecker.cpp">;
+
+} // end "alpha.core"
+
+let ParentPackage = Nullability in {
+
+def NullPassedToNonnullChecker : Checker<"NullPassedToNonnull">,
+  HelpText<"Warns when a null pointer is passed to a pointer which has a _Nonnull type.">,
+  DescFile<"NullabilityChecker.cpp">;
+
+def NullReturnedFromNonnullChecker : Checker<"NullReturnedFromNonnull">,
+  HelpText<"Warns when a null pointer is returned from a function that has _Nonnull return type.">,
+  DescFile<"NullabilityChecker.cpp">;
+
+def NullableDereferencedChecker : Checker<"NullableDereferenced">,
+  HelpText<"Warns when a nullable pointer is dereferenced.">,
+  DescFile<"NullabilityChecker.cpp">;
+
+def NullablePassedToNonnullChecker : Checker<"NullablePassedToNonnull">,
+  HelpText<"Warns when a nullable pointer is passed to a pointer which has a _Nonnull type.">,
+  DescFile<"NullabilityChecker.cpp">;
+
+def NullableReturnedFromNonnullChecker : Checker<"NullableReturnedFromNonnull">,
+  HelpText<"Warns when a nullable pointer is returned from a function that has _Nonnull return type.">,
+  DescFile<"NullabilityChecker.cpp">;
+
+} // end "nullability"
+
+//===----------------------------------------------------------------------===//
+// Evaluate "builtin" functions.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CoreBuiltin in {
+
+def NoReturnFunctionChecker : Checker<"NoReturnFunctions">,
+  HelpText<"Evaluate \"panic\" functions that are known to not return to the caller">,
+  DescFile<"NoReturnFunctionChecker.cpp">;
+
+def BuiltinFunctionChecker : Checker<"BuiltinFunctions">,
+  HelpText<"Evaluate compiler builtin functions (e.g., alloca())">,
+  DescFile<"BuiltinFunctionChecker.cpp">;
+
+} // end "core.builtin"
+
+//===----------------------------------------------------------------------===//
+// Uninitialized values checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CoreUninitialized in {
+
+def UndefinedArraySubscriptChecker : Checker<"ArraySubscript">,
+  HelpText<"Check for uninitialized values used as array subscripts">,
+  DescFile<"UndefinedArraySubscriptChecker.cpp">;
+
+def UndefinedAssignmentChecker : Checker<"Assign">,
+  HelpText<"Check for assigning uninitialized values">,
+  DescFile<"UndefinedAssignmentChecker.cpp">;
+
+def UndefBranchChecker : Checker<"Branch">,
+  HelpText<"Check for uninitialized values used as branch conditions">,
+  DescFile<"UndefBranchChecker.cpp">;
+
+def UndefCapturedBlockVarChecker : Checker<"CapturedBlockVariable">,
+  HelpText<"Check for blocks that capture uninitialized values">,
+  DescFile<"UndefCapturedBlockVarChecker.cpp">;
+
+def ReturnUndefChecker : Checker<"UndefReturn">,
+  HelpText<"Check for uninitialized values being returned to the caller">,
+  DescFile<"ReturnUndefChecker.cpp">;
+
+} // end "core.uninitialized"
+
+//===----------------------------------------------------------------------===//
+// C++ checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = Cplusplus in {
+
+def NewDeleteChecker : Checker<"NewDelete">,
+  HelpText<"Check for double-free and use-after-free problems. Traces memory managed by new/delete.">,
+  DescFile<"MallocChecker.cpp">;
+
+def NewDeleteLeaksChecker : Checker<"NewDeleteLeaks">,
+  HelpText<"Check for memory leaks. Traces memory managed by new/delete.">,
+  DescFile<"MallocChecker.cpp">;
+
+def CXXSelfAssignmentChecker : Checker<"SelfAssignment">,
+  HelpText<"Checks C++ copy and move assignment operators for self assignment">,
+  DescFile<"CXXSelfAssignmentChecker.cpp">;
+
+} // end: "cplusplus"
+
+let ParentPackage = CplusplusAlpha in {
+
+def VirtualCallChecker : Checker<"VirtualCall">,
+  HelpText<"Check virtual function calls during construction or destruction">,
+  DescFile<"VirtualCallChecker.cpp">;
+
+} // end: "alpha.cplusplus"
+
+//===----------------------------------------------------------------------===//
+// Deadcode checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = DeadCode in {
+
+def DeadStoresChecker : Checker<"DeadStores">,
+  HelpText<"Check for values stored to variables that are never read afterwards">,
+  DescFile<"DeadStoresChecker.cpp">;
+} // end "deadcode"
+
+let ParentPackage = DeadCodeAlpha in {
+
+def UnreachableCodeChecker : Checker<"UnreachableCode">,
+  HelpText<"Check unreachable code">,
+  DescFile<"UnreachableCodeChecker.cpp">;
+
+} // end "alpha.deadcode"
+
+//===----------------------------------------------------------------------===//
+// Performance checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = Performance in {
+
+def PaddingChecker : Checker<"Padding">,
+  HelpText<"Check for excessively padded structs.">,
+  DescFile<"PaddingChecker.cpp">;
+
+} // end: "optin.performance"
+
+//===----------------------------------------------------------------------===//
+// Security checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = InsecureAPI in {
+  def gets : Checker<"gets">,
+    HelpText<"Warn on uses of the 'gets' function">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def getpw : Checker<"getpw">,
+    HelpText<"Warn on uses of the 'getpw' function">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def mktemp : Checker<"mktemp">,
+    HelpText<"Warn on uses of the 'mktemp' function">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def mkstemp : Checker<"mkstemp">,
+    HelpText<"Warn when 'mkstemp' is passed fewer than 6 X's in the format string">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def rand : Checker<"rand">,
+    HelpText<"Warn on uses of the 'rand', 'random', and related functions">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def strcpy : Checker<"strcpy">,
+    HelpText<"Warn on uses of the 'strcpy' and 'strcat' functions">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def vfork : Checker<"vfork">,
+    HelpText<"Warn on uses of the 'vfork' function">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+  def UncheckedReturn : Checker<"UncheckedReturn">,
+    HelpText<"Warn on uses of functions whose return values must be always checked">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+}
+let ParentPackage = Security in {
+  def FloatLoopCounter : Checker<"FloatLoopCounter">,
+    HelpText<"Warn on using a floating point value as a loop counter (CERT: FLP30-C, FLP30-CPP)">,
+    DescFile<"CheckSecuritySyntaxOnly.cpp">;
+}
+
+let ParentPackage = SecurityAlpha in {
+
+def ArrayBoundChecker : Checker<"ArrayBound">,
+  HelpText<"Warn about buffer overflows (older checker)">,
+  DescFile<"ArrayBoundChecker.cpp">;
+
+def ArrayBoundCheckerV2 : Checker<"ArrayBoundV2">,
+  HelpText<"Warn about buffer overflows (newer checker)">,
+  DescFile<"ArrayBoundCheckerV2.cpp">;
+
+def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
+  HelpText<"Check for an out-of-bound pointer being returned to callers">,
+  DescFile<"ReturnPointerRangeChecker.cpp">;
+
+def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
+  HelpText<"Check for overflows in the arguments to malloc()">,
+  DescFile<"MallocOverflowSecurityChecker.cpp">;
+
+} // end "alpha.security"
+
+//===----------------------------------------------------------------------===//
+// Taint checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = Taint in {
+
+def GenericTaintChecker : Checker<"TaintPropagation">,
+  HelpText<"Generate taint information used by other checkers">,
+  DescFile<"GenericTaintChecker.cpp">;
+
+} // end "alpha.security.taint"
+
+//===----------------------------------------------------------------------===//
+// Unix API checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = Unix in {
+
+def UnixAPIChecker : Checker<"API">,
+  HelpText<"Check calls to various UNIX/Posix functions">,
+  DescFile<"UnixAPIChecker.cpp">;
+
+def MallocChecker: Checker<"Malloc">,
+  HelpText<"Check for memory leaks, double free, and use-after-free problems. Traces memory managed by malloc()/free().">,
+  DescFile<"MallocChecker.cpp">;
+
+def MallocSizeofChecker : Checker<"MallocSizeof">,
+  HelpText<"Check for dubious malloc arguments involving sizeof">,
+  DescFile<"MallocSizeofChecker.cpp">;
+
+def MismatchedDeallocatorChecker : Checker<"MismatchedDeallocator">,
+  HelpText<"Check for mismatched deallocators.">,
+  DescFile<"MallocChecker.cpp">;
+
+def VforkChecker : Checker<"Vfork">,
+  HelpText<"Check for proper usage of vfork">,
+  DescFile<"VforkChecker.cpp">;
+
+} // end "unix"
+
+let ParentPackage = UnixAlpha in {
+
+def ChrootChecker : Checker<"Chroot">,
+  HelpText<"Check improper use of chroot">,
+  DescFile<"ChrootChecker.cpp">;
+
+def PthreadLockChecker : Checker<"PthreadLock">,
+  HelpText<"Simple lock -> unlock checker">,
+  DescFile<"PthreadLockChecker.cpp">;
+
+def StreamChecker : Checker<"Stream">,
+  HelpText<"Check stream handling functions">,
+  DescFile<"StreamChecker.cpp">;
+
+def SimpleStreamChecker : Checker<"SimpleStream">,
+  HelpText<"Check for misuses of stream APIs">,
+  DescFile<"SimpleStreamChecker.cpp">;
+
+} // end "alpha.unix"
+
+let ParentPackage = CString in {
+
+def CStringNullArg : Checker<"NullArg">,
+  HelpText<"Check for null pointers being passed as arguments to C string functions">,
+  DescFile<"CStringChecker.cpp">;
+
+def CStringSyntaxChecker : Checker<"BadSizeArg">,
+  HelpText<"Check the size argument passed into C string functions for common erroneous patterns">,
+  DescFile<"CStringSyntaxChecker.cpp">;
+}
+
+let ParentPackage = CStringAlpha in {
+
+def CStringOutOfBounds : Checker<"OutOfBounds">,
+  HelpText<"Check for out-of-bounds access in string functions">,
+  DescFile<"CStringChecker.cpp">;
+
+def CStringBufferOverlap : Checker<"BufferOverlap">,
+  HelpText<"Checks for overlap in two buffer arguments">,
+  DescFile<"CStringChecker.cpp">;
+
+def CStringNotNullTerm : Checker<"NotNullTerminated">,
+  HelpText<"Check for arguments which are not null-terminating strings">,
+  DescFile<"CStringChecker.cpp">;
+}
+
+//===----------------------------------------------------------------------===//
+// Mac OS X, Cocoa, and Core Foundation checkers.
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = OSX in {
+
+def MacOSXAPIChecker : Checker<"API">,
+  InPackage<OSX>,
+  HelpText<"Check for proper uses of various Apple APIs">,
+  DescFile<"MacOSXAPIChecker.cpp">;
+
+def MacOSKeychainAPIChecker : Checker<"SecKeychainAPI">,
+  InPackage<OSX>,
+  HelpText<"Check for proper uses of Secure Keychain APIs">,
+  DescFile<"MacOSKeychainAPIChecker.cpp">;
+
+} // end "osx"
+
+let ParentPackage = Cocoa in {
+
+def ObjCAtSyncChecker : Checker<"AtSync">,
+  HelpText<"Check for nil pointers used as mutexes for @synchronized">,
+  DescFile<"ObjCAtSyncChecker.cpp">;
+
+def NilArgChecker : Checker<"NilArg">,
+  HelpText<"Check for prohibited nil arguments to ObjC method calls">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def ClassReleaseChecker : Checker<"ClassRelease">,
+  HelpText<"Check for sending 'retain', 'release', or 'autorelease' directly to a Class">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def VariadicMethodTypeChecker : Checker<"VariadicMethodTypes">,
+  HelpText<"Check for passing non-Objective-C types to variadic collection "
+           "initialization methods that expect only Objective-C types">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def NSAutoreleasePoolChecker : Checker<"NSAutoreleasePool">,
+  HelpText<"Warn for suboptimal uses of NSAutoreleasePool in Objective-C GC mode">,
+  DescFile<"NSAutoreleasePoolChecker.cpp">;
+
+def ObjCMethSigsChecker : Checker<"IncompatibleMethodTypes">,
+  HelpText<"Warn about Objective-C method signatures with type incompatibilities">,
+  DescFile<"CheckObjCInstMethSignature.cpp">;
+
+def ObjCUnusedIvarsChecker : Checker<"UnusedIvars">,
+  HelpText<"Warn about private ivars that are never used">,
+  DescFile<"ObjCUnusedIVarsChecker.cpp">;
+
+def ObjCSelfInitChecker : Checker<"SelfInit">,
+  HelpText<"Check that 'self' is properly initialized inside an initializer method">,
+  DescFile<"ObjCSelfInitChecker.cpp">;
+
+def ObjCLoopChecker : Checker<"Loops">,
+  HelpText<"Improved modeling of loops using Cocoa collection types">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def ObjCNonNilReturnValueChecker : Checker<"NonNilReturnValue">,
+  HelpText<"Model the APIs that are guaranteed to return a non-nil value">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def ObjCSuperCallChecker : Checker<"MissingSuperCall">,
+  HelpText<"Warn about Objective-C methods that lack a necessary call to super">,
+  DescFile<"ObjCMissingSuperCallChecker.cpp">;
+
+def NSErrorChecker : Checker<"NSError">,
+  HelpText<"Check usage of NSError** parameters">,
+  DescFile<"NSErrorChecker.cpp">;
+
+def RetainCountChecker : Checker<"RetainCount">,
+  HelpText<"Check for leaks and improper reference count management">,
+  DescFile<"RetainCountChecker.cpp">;
+
+def ObjCGenericsChecker : Checker<"ObjCGenerics">,
+  HelpText<"Check for type errors when using Objective-C generics">,
+  DescFile<"DynamicTypePropagation.cpp">;
+
+def ObjCDeallocChecker : Checker<"Dealloc">,
+  HelpText<"Warn about Objective-C classes that lack a correct implementation of -dealloc">,
+  DescFile<"CheckObjCDealloc.cpp">;
+
+def ObjCSuperDeallocChecker : Checker<"SuperDealloc">,
+  HelpText<"Warn about improper use of '[super dealloc]' in Objective-C">,
+  DescFile<"ObjCSuperDeallocChecker.cpp">;
+
+} // end "osx.cocoa"
+
+let ParentPackage = CocoaAlpha in {
+
+def InstanceVariableInvalidation : Checker<"InstanceVariableInvalidation">,
+  HelpText<"Check that the invalidatable instance variables are invalidated in the methods annotated with objc_instance_variable_invalidator">,
+  DescFile<"IvarInvalidationChecker.cpp">;
+
+def MissingInvalidationMethod : Checker<"MissingInvalidationMethod">,
+  HelpText<"Check that the invalidation methods are present in classes that contain invalidatable instance variables">,
+  DescFile<"IvarInvalidationChecker.cpp">;
+
+def DirectIvarAssignment : Checker<"DirectIvarAssignment">,
+  HelpText<"Check for direct assignments to instance variables">,
+  DescFile<"DirectIvarAssignment.cpp">;
+
+def DirectIvarAssignmentForAnnotatedFunctions : Checker<"DirectIvarAssignmentForAnnotatedFunctions">,
+  HelpText<"Check for direct assignments to instance variables in the methods annotated with objc_no_direct_instance_variable_assignment">,
+  DescFile<"DirectIvarAssignment.cpp">;
+
+} // end "alpha.osx.cocoa"
+
+let ParentPackage = CoreFoundation in {
+
+def CFNumberCreateChecker : Checker<"CFNumber">,
+  HelpText<"Check for proper uses of CFNumberCreate">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def CFRetainReleaseChecker : Checker<"CFRetainRelease">,
+  HelpText<"Check for null arguments to CFRetain/CFRelease/CFMakeCollectable">,
+  DescFile<"BasicObjCFoundationChecks.cpp">;
+
+def CFErrorChecker : Checker<"CFError">,
+  HelpText<"Check usage of CFErrorRef* parameters">,
+  DescFile<"NSErrorChecker.cpp">;
+}
+
+let ParentPackage = Containers in {
+def ObjCContainersASTChecker : Checker<"PointerSizedValues">,
+  HelpText<"Warns if 'CFArray', 'CFDictionary', 'CFSet' are created with non-pointer-size values">,
+  DescFile<"ObjCContainersASTChecker.cpp">;
+
+def ObjCContainersChecker : Checker<"OutOfBounds">,
+  HelpText<"Checks for index out-of-bounds when using 'CFArray' API">,
+  DescFile<"ObjCContainersChecker.cpp">;
+
+}
+
+let ParentPackage = LocalizabilityOptIn in {
+def NonLocalizedStringChecker : Checker<"NonLocalizedStringChecker">,
+  HelpText<"Warns about uses of non-localized NSStrings passed to UI methods expecting localized NSStrings">,
+  DescFile<"LocalizationChecker.cpp">;
+
+def EmptyLocalizationContextChecker : Checker<"EmptyLocalizationContextChecker">,
+  HelpText<"Check that NSLocalizedString macros include a comment for context">,
+  DescFile<"LocalizationChecker.cpp">;
+}
+
+let ParentPackage = LocalizabilityAlpha in {
+def PluralMisuseChecker : Checker<"PluralMisuseChecker">,
+  HelpText<"Warns against using one vs. many plural pattern in code when generating localized strings.">,
+  DescFile<"LocalizationChecker.cpp">;
+}
+
+let ParentPackage = MPI in {
+  def MPIChecker : Checker<"MPI-Checker">,
+  HelpText<"Checks MPI code">,
+  DescFile<"MPIChecker.cpp">;
+}
+
+//===----------------------------------------------------------------------===//
+// Checkers for LLVM development.
+//===----------------------------------------------------------------------===//
+
+def LLVMConventionsChecker : Checker<"Conventions">,
+  InPackage<LLVM>,
+  HelpText<"Check code for LLVM codebase conventions">,
+  DescFile<"LLVMConventionsChecker.cpp">;
+
+//===----------------------------------------------------------------------===//
+// Debugging checkers (for analyzer development).
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = Debug in {
+
+def DominatorsTreeDumper : Checker<"DumpDominators">,
+  HelpText<"Print the dominance tree for a given CFG">,
+  DescFile<"DebugCheckers.cpp">;
+
+def LiveVariablesDumper : Checker<"DumpLiveVars">,
+  HelpText<"Print results of live variable analysis">,
+  DescFile<"DebugCheckers.cpp">;
+
+def CFGViewer : Checker<"ViewCFG">,
+  HelpText<"View Control-Flow Graphs using GraphViz">,
+  DescFile<"DebugCheckers.cpp">;
+
+def CFGDumper : Checker<"DumpCFG">,
+  HelpText<"Display Control-Flow Graphs">,
+  DescFile<"DebugCheckers.cpp">;
+
+def CallGraphViewer : Checker<"ViewCallGraph">,
+  HelpText<"View Call Graph using GraphViz">,
+  DescFile<"DebugCheckers.cpp">;
+
+def CallGraphDumper : Checker<"DumpCallGraph">,
+  HelpText<"Display Call Graph">,
+  DescFile<"DebugCheckers.cpp">;
+
+def ConfigDumper : Checker<"ConfigDumper">,
+  HelpText<"Dump config table">,
+  DescFile<"DebugCheckers.cpp">;
+
+def TraversalDumper : Checker<"DumpTraversal">,
+  HelpText<"Print branch conditions as they are traversed by the engine">,
+  DescFile<"TraversalChecker.cpp">;
+
+def CallDumper : Checker<"DumpCalls">,
+  HelpText<"Print calls as they are traversed by the engine">,
+  DescFile<"TraversalChecker.cpp">;
+
+def AnalyzerStatsChecker : Checker<"Stats">,
+  HelpText<"Emit warnings with analyzer statistics">,
+  DescFile<"AnalyzerStatsChecker.cpp">;
+
+def TaintTesterChecker : Checker<"TaintTest">,
+  HelpText<"Mark tainted symbols as such.">,
+  DescFile<"TaintTesterChecker.cpp">;
+
+def ExprInspectionChecker : Checker<"ExprInspection">,
+  HelpText<"Check the analyzer's understanding of expressions">,
+  DescFile<"ExprInspectionChecker.cpp">;
+
+def ExplodedGraphViewer : Checker<"ViewExplodedGraph">,
+  HelpText<"View Exploded Graphs using GraphViz">,
+  DescFile<"DebugCheckers.cpp">;
+
+def BugHashDumper : Checker<"DumpBugHash">,
+  HelpText<"Dump the bug hash for all statements.">,
+  DescFile<"DebugCheckers.cpp">;
+
+} // end "debug"
+
+
+//===----------------------------------------------------------------------===//
+// Clone Detection
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CloneDetectionAlpha in {
+
+def CloneChecker : Checker<"CloneChecker">,
+  HelpText<"Reports similar pieces of code.">,
+  DescFile<"CloneChecker.cpp">;
+
+} // end "alpha.clone"
+
diff --git a/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h b/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h
new file mode 100644
index 0000000..65e9089
--- /dev/null
+++ b/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h
@@ -0,0 +1,97 @@
+//===-- MPIFunctionClassifier.h - classifies MPI functions ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functionality to identify and classify MPI functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPIFUNCTIONCLASSIFIER_H
+#define LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPIFUNCTIONCLASSIFIER_H
+
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+class MPIFunctionClassifier {
+public:
+  MPIFunctionClassifier(ASTContext &ASTCtx) { identifierInit(ASTCtx); }
+
+  // general identifiers
+  bool isMPIType(const IdentifierInfo *const IdentInfo) const;
+  bool isNonBlockingType(const IdentifierInfo *const IdentInfo) const;
+
+  // point-to-point identifiers
+  bool isPointToPointType(const IdentifierInfo *const IdentInfo) const;
+
+  // collective identifiers
+  bool isCollectiveType(const IdentifierInfo *const IdentInfo) const;
+  bool isCollToColl(const IdentifierInfo *const IdentInfo) const;
+  bool isScatterType(const IdentifierInfo *const IdentInfo) const;
+  bool isGatherType(const IdentifierInfo *const IdentInfo) const;
+  bool isAllgatherType(const IdentifierInfo *const IdentInfo) const;
+  bool isAlltoallType(const IdentifierInfo *const IdentInfo) const;
+  bool isReduceType(const IdentifierInfo *const IdentInfo) const;
+  bool isBcastType(const IdentifierInfo *const IdentInfo) const;
+
+  // additional identifiers
+  bool isMPI_Wait(const IdentifierInfo *const IdentInfo) const;
+  bool isMPI_Waitall(const IdentifierInfo *const IdentInfo) const;
+  bool isWaitType(const IdentifierInfo *const IdentInfo) const;
+
+private:
+  // Initializes function identifiers so they are recognized during analysis.
+  void identifierInit(ASTContext &ASTCtx);
+  void initPointToPointIdentifiers(ASTContext &ASTCtx);
+  void initCollectiveIdentifiers(ASTContext &ASTCtx);
+  void initAdditionalIdentifiers(ASTContext &ASTCtx);
+
+  // These containers enable classification of MPI functions during
+  // analysis.
+  llvm::SmallVector<IdentifierInfo *, 12> MPINonBlockingTypes;
+
+  llvm::SmallVector<IdentifierInfo *, 10> MPIPointToPointTypes;
+  llvm::SmallVector<IdentifierInfo *, 16> MPICollectiveTypes;
+
+  llvm::SmallVector<IdentifierInfo *, 4> MPIPointToCollTypes;
+  llvm::SmallVector<IdentifierInfo *, 4> MPICollToPointTypes;
+  llvm::SmallVector<IdentifierInfo *, 6> MPICollToCollTypes;
+
+  llvm::SmallVector<IdentifierInfo *, 32> MPIType;
+
+  // point-to-point functions
+  IdentifierInfo *IdentInfo_MPI_Send = nullptr, *IdentInfo_MPI_Isend = nullptr,
+      *IdentInfo_MPI_Ssend = nullptr, *IdentInfo_MPI_Issend = nullptr,
+      *IdentInfo_MPI_Bsend = nullptr, *IdentInfo_MPI_Ibsend = nullptr,
+      *IdentInfo_MPI_Rsend = nullptr, *IdentInfo_MPI_Irsend = nullptr,
+      *IdentInfo_MPI_Recv = nullptr, *IdentInfo_MPI_Irecv = nullptr;
+
+  // collective functions
+  IdentifierInfo *IdentInfo_MPI_Scatter = nullptr,
+      *IdentInfo_MPI_Iscatter = nullptr, *IdentInfo_MPI_Gather = nullptr,
+      *IdentInfo_MPI_Igather = nullptr, *IdentInfo_MPI_Allgather = nullptr,
+      *IdentInfo_MPI_Iallgather = nullptr, *IdentInfo_MPI_Bcast = nullptr,
+      *IdentInfo_MPI_Ibcast = nullptr, *IdentInfo_MPI_Reduce = nullptr,
+      *IdentInfo_MPI_Ireduce = nullptr, *IdentInfo_MPI_Allreduce = nullptr,
+      *IdentInfo_MPI_Iallreduce = nullptr, *IdentInfo_MPI_Alltoall = nullptr,
+      *IdentInfo_MPI_Ialltoall = nullptr, *IdentInfo_MPI_Barrier = nullptr;
+
+  // additional functions
+  IdentifierInfo *IdentInfo_MPI_Comm_rank = nullptr,
+      *IdentInfo_MPI_Comm_size = nullptr, *IdentInfo_MPI_Wait = nullptr,
+      *IdentInfo_MPI_Waitall = nullptr;
+};
+
+} // end of namespace: mpi
+} // end of namespace: ento
+} // end of namespace: clang
+
+#endif
diff --git a/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index 3959de2..fe8aea5 100644
--- a/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -149,6 +149,7 @@
   unsigned DisableAllChecks : 1;
 
   unsigned ShowCheckerHelp : 1;
+  unsigned ShowEnabledCheckerList : 1;
   unsigned AnalyzeAll : 1;
   unsigned AnalyzerDisplayProgress : 1;
   unsigned AnalyzeNestedBlocks : 1;
@@ -541,6 +542,7 @@
     AnalysisPurgeOpt(PurgeStmt),
     DisableAllChecks(0),
     ShowCheckerHelp(0),
+    ShowEnabledCheckerList(0),
     AnalyzeAll(0),
     AnalyzerDisplayProgress(0),
     AnalyzeNestedBlocks(0),
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
index 57c73fd..02f56fc 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
@@ -315,22 +315,9 @@
 } // end clang namespace
 
 namespace llvm {
-  template<> struct ilist_traits<clang::ento::BugReport>
-    : public ilist_default_traits<clang::ento::BugReport> {
-    clang::ento::BugReport *createSentinel() const {
-      return static_cast<clang::ento::BugReport *>(&Sentinel);
-    }
-    void destroySentinel(clang::ento::BugReport *) const {}
-
-    clang::ento::BugReport *provideInitialHead() const {
-      return createSentinel();
-    }
-    clang::ento::BugReport *ensureHead(clang::ento::BugReport *) const {
-      return createSentinel();
-    }
-  private:
-    mutable ilist_half_node<clang::ento::BugReport> Sentinel;
-  };
+template <>
+struct ilist_sentinel_traits<clang::ento::BugReport>
+    : public ilist_half_embedded_sentinel_traits<clang::ento::BugReport> {};
 }
 
 namespace clang {
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h
index c954bbf..cb785f3 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h
@@ -331,6 +331,22 @@
                                  BugReport &BR) override;
 };
 
+class CXXSelfAssignmentBRVisitor final
+  : public BugReporterVisitorImpl<CXXSelfAssignmentBRVisitor> {
+  
+  bool Satisfied;
+
+public:
+  CXXSelfAssignmentBRVisitor() : Satisfied(false) {}
+
+  void Profile(llvm::FoldingSetNodeID &ID) const override {}
+
+  PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ,
+                                 const ExplodedNode *Pred,
+                                 BugReporterContext &BRC,
+                                 BugReport &BR) override;
+};
+
 namespace bugreporter {
 
 /// Attempts to add visitors to trace a null or undefined value back to its
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h
index 16226e9..18fa85c 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h
@@ -17,7 +17,6 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
-#include "llvm/ADT/FoldingSet.h"
 #include <string>
 
 namespace clang {
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h b/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h
index 35421f9..c34b14c 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h
@@ -774,8 +774,8 @@
 
   void appendToDesc(StringRef S) {
     if (!ShortDesc.empty())
-      ShortDesc.append(S);
-    VerboseDesc.append(S);
+      ShortDesc += S;
+    VerboseDesc += S;
   }
 
   void resetPath() {
diff --git a/include/clang/StaticAnalyzer/Core/CheckerManager.h b/include/clang/StaticAnalyzer/Core/CheckerManager.h
index 612e105..b06b74d 100644
--- a/include/clang/StaticAnalyzer/Core/CheckerManager.h
+++ b/include/clang/StaticAnalyzer/Core/CheckerManager.h
@@ -20,6 +20,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/Store.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include <utility>
 #include <vector>
 
 namespace clang {
@@ -105,10 +106,8 @@
   CheckName CurrentCheckName;
 
 public:
-  CheckerManager(const LangOptions &langOpts,
-                 AnalyzerOptionsRef AOptions)
-    : LangOpts(langOpts),
-      AOptions(AOptions) {}
+  CheckerManager(const LangOptions &langOpts, AnalyzerOptionsRef AOptions)
+      : LangOpts(langOpts), AOptions(std::move(AOptions)) {}
 
   ~CheckerManager();
 
diff --git a/include/clang/StaticAnalyzer/Core/CheckerRegistry.h b/include/clang/StaticAnalyzer/Core/CheckerRegistry.h
index c9724c0..3b26ed3 100644
--- a/include/clang/StaticAnalyzer/Core/CheckerRegistry.h
+++ b/include/clang/StaticAnalyzer/Core/CheckerRegistry.h
@@ -127,7 +127,9 @@
 
   /// Prints the name and description of all checkers in this registry.
   /// This output is not intended to be machine-parseable.
-  void printHelp(raw_ostream &out, size_t maxNameChars = 30) const ;
+  void printHelp(raw_ostream &out, size_t maxNameChars = 30) const;
+  void printList(raw_ostream &out,
+                 SmallVectorImpl<CheckerOptInfo> &opts) const;
 
 private:
   mutable CheckerInfoList Checkers;
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
index 0bd53e6..89610ef 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
@@ -24,6 +24,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
 #include "llvm/ADT/PointerIntPair.h"
+#include <utility>
 
 namespace clang {
 class ProgramPoint;
@@ -165,10 +166,10 @@
   friend class CallEventManager;
 
   CallEvent(const Expr *E, ProgramStateRef state, const LocationContext *lctx)
-    : State(state), LCtx(lctx), Origin(E), RefCount(0) {}
+      : State(std::move(state)), LCtx(lctx), Origin(E), RefCount(0) {}
 
   CallEvent(const Decl *D, ProgramStateRef state, const LocationContext *lctx)
-    : State(state), LCtx(lctx), Origin(D), RefCount(0) {}
+      : State(std::move(state)), LCtx(lctx), Origin(D), RefCount(0) {}
 
   // DO NOT MAKE PUBLIC
   CallEvent(const CallEvent &Original)
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h
index cfb1b92..03a0f11 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h
@@ -28,10 +28,10 @@
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
 #include <memory>
+#include <utility>
 #include <vector>
 
 namespace clang {
@@ -121,10 +121,9 @@
   NodeGroup Succs;
 
 public:
-
   explicit ExplodedNode(const ProgramPoint &loc, ProgramStateRef state,
                         bool IsSink)
-    : Location(loc), State(state), Succs(IsSink) {
+      : Location(loc), State(std::move(state)), Succs(IsSink) {
     assert(isSink() == IsSink);
   }
 
@@ -295,6 +294,14 @@
                         bool IsSink = false,
                         bool* IsNew = nullptr);
 
+  /// \brief Create a node for a (Location, State) pair,
+  ///  but don't store it for deduplication later.  This
+  ///  is useful when copying an already completed
+  ///  ExplodedGraph for further processing.
+  ExplodedNode *createUncachedNode(const ProgramPoint &L,
+    ProgramStateRef State,
+    bool IsSink = false);
+
   std::unique_ptr<ExplodedGraph> MakeEmptyGraph() const {
     return llvm::make_unique<ExplodedGraph>();
   }
@@ -321,6 +328,8 @@
   bool empty() const { return NumNodes == 0; }
   unsigned size() const { return NumNodes; }
 
+  void reserve(unsigned NodeCount) { Nodes.reserve(NodeCount); }
+
   // Iterators.
   typedef ExplodedNode                        NodeTy;
   typedef llvm::FoldingSet<ExplodedNode>      AllNodesTy;
@@ -442,6 +451,7 @@
 namespace llvm {
   template<> struct GraphTraits<clang::ento::ExplodedNode*> {
     typedef clang::ento::ExplodedNode NodeType;
+    typedef clang::ento::ExplodedNode *NodeRef;
     typedef NodeType::succ_iterator  ChildIteratorType;
     typedef llvm::df_iterator<NodeType*>      nodes_iterator;
 
@@ -468,6 +478,7 @@
 
   template<> struct GraphTraits<const clang::ento::ExplodedNode*> {
     typedef const clang::ento::ExplodedNode NodeType;
+    typedef const clang::ento::ExplodedNode *NodeRef;
     typedef NodeType::const_succ_iterator   ChildIteratorType;
     typedef llvm::df_iterator<NodeType*>       nodes_iterator;
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h b/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
index 1d33301..da4b964 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
@@ -150,6 +150,28 @@
   template<typename RegionTy> const RegionTy* getAs() const;
 
   virtual bool isBoundable() const { return false; }
+
+
+  /// Get descriptive name for memory region. The name is obtained from
+  /// the variable/field declaration retrieved from the memory region.
+  /// Regions that point to an element of an array are returned as: "arr[0]".
+  /// Regions that point to a struct are returned as: "st.var".
+  ///
+  /// \param UseQuotes Set if the name should be quoted.
+  ///
+  /// \returns variable name for memory region
+  std::string getDescriptiveName(bool UseQuotes = true) const;
+
+
+  /// Retrieve source range from memory region. The range retrieval
+  /// is based on the decl obtained from the memory region.
+  /// For a VarRegion the range of the base region is returned.
+  /// For a FieldRegion the range of the field is returned.
+  /// If no declaration is found, an empty source range is returned.
+  /// The client is responsible for checking if the returned range is valid.
+  ///
+  /// \returns source range for declaration retrieved from memory region
+  clang::SourceRange sourceRange() const;
 };
 
 /// MemSpaceRegion - A memory region that represents a "memory space";
@@ -1255,15 +1277,9 @@
 
 private:
   template <typename RegionTy, typename A1>
-  RegionTy* getRegion(const A1 a1);
-
-  template <typename RegionTy, typename A1>
   RegionTy* getSubRegion(const A1 a1, const MemRegion* superRegion);
 
   template <typename RegionTy, typename A1, typename A2>
-  RegionTy* getRegion(const A1 a1, const A2 a2);
-
-  template <typename RegionTy, typename A1, typename A2>
   RegionTy* getSubRegion(const A1 a1, const A2 a2,
                          const MemRegion* superRegion);
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
index c4a62ec..9872aff 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
@@ -24,8 +24,8 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/TaintTag.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ImmutableMap.h"
-#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/Support/Allocator.h"
+#include <utility>
 
 namespace llvm {
 class APSInt;
@@ -836,9 +836,8 @@
   ProgramStateRef state;
   SymbolVisitor &visitor;
 public:
-
-  ScanReachableSymbols(ProgramStateRef st, SymbolVisitor& v)
-    : state(st), visitor(v) {}
+  ScanReachableSymbols(ProgramStateRef st, SymbolVisitor &v)
+      : state(std::move(st)), visitor(v) {}
 
   bool scan(nonloc::LazyCompoundVal val);
   bool scan(nonloc::CompoundVal val);
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h b/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
index a03b630..fa7d3f7 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
@@ -18,7 +18,6 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/StoreRef.h"
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
 
 namespace clang {
 
@@ -124,15 +123,18 @@
   SVal evalDerivedToBase(SVal Derived, QualType DerivedPtrType,
                          bool IsVirtual);
 
-  /// \brief Evaluates C++ dynamic_cast cast.
+  /// \brief Attempts to do a down cast. Used to model BaseToDerived and C++
+  ///        dynamic_cast.
   /// The callback may result in the following 3 scenarios:
   ///  - Successful cast (ex: derived is subclass of base).
   ///  - Failed cast (ex: derived is definitely not a subclass of base).
+  ///    The distinction of this case from the next one is necessary to model
+  ///    dynamic_cast. 
   ///  - We don't know (base is a symbolic region and we don't have 
   ///    enough info to determine if the cast will succeed at run time).
   /// The function returns an SVal representing the derived class; it's
   /// valid only if Failed flag is set to false.
-  SVal evalDynamicCast(SVal Base, QualType DerivedPtrType, bool &Failed);
+  SVal attemptDownCast(SVal Base, QualType DerivedPtrType, bool &Failed);
 
   const ElementRegion *GetElementZeroRegion(const MemRegion *R, QualType T);
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/SummaryManager.h b/include/clang/StaticAnalyzer/Core/PathSensitive/SummaryManager.h
index ed87851..52d78b6 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/SummaryManager.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/SummaryManager.h
@@ -15,9 +15,6 @@
 #ifndef LLVM_CLANG_GR_SUMMARY
 #define LLVM_CLANG_GR_SUMMARY
 
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/Support/Allocator.h"
-
 namespace clang {
 
 namespace ento {
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
index be9646b..18bc607 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
@@ -22,6 +22,8 @@
 namespace clang {
 namespace ento {
 
+class MemRegion;
+
 /// \brief Symbolic value. These values used to capture symbolic execution of
 /// the program.
 class SymExpr : public llvm::FoldingSetNode {
@@ -76,6 +78,18 @@
   static symbol_iterator symbol_end() { return symbol_iterator(); }
 
   unsigned computeComplexity() const;
+
+  /// \brief Find the region from which this symbol originates.
+  ///
+  /// Whenever the symbol was constructed to denote an unknown value of
+  /// a certain memory region, return this region. This method
+  /// allows checkers to make decisions depending on the origin of the symbol.
+  /// Symbol classes for which the origin region is known include
+  /// SymbolRegionValue which denotes the value of the region before
+  /// the beginning of the analysis, and SymbolDerived which denotes the value
+  /// of a certain memory region after its super region (a memory space or
+  /// a larger record region) is default-bound with a certain symbol.
+  virtual const MemRegion *getOriginRegion() const { return nullptr; }
 };
 
 typedef const SymExpr *SymbolRef;
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
index 30481ea..0874305 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
@@ -58,6 +58,7 @@
   }
 
   void dumpToStream(raw_ostream &os) const override;
+  const MemRegion *getOriginRegion() const override { return getRegion(); }
 
   QualType getType() const override;
 
@@ -127,6 +128,7 @@
   QualType getType() const override;
 
   void dumpToStream(raw_ostream &os) const override;
+  const MemRegion *getOriginRegion() const override { return getRegion(); }
 
   static void Profile(llvm::FoldingSetNodeID& profile, SymbolRef parent,
                       const TypedValueRegion *r) {
diff --git a/include/clang/StaticAnalyzer/Frontend/AnalysisConsumer.h b/include/clang/StaticAnalyzer/Frontend/AnalysisConsumer.h
index 37ea05f..a9dad6c 100644
--- a/include/clang/StaticAnalyzer/Frontend/AnalysisConsumer.h
+++ b/include/clang/StaticAnalyzer/Frontend/AnalysisConsumer.h
@@ -17,9 +17,7 @@
 
 #include "clang/AST/ASTConsumer.h"
 #include "clang/Basic/LLVM.h"
-#include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
-#include "clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h"
-#include <string>
+#include <memory>
 
 namespace clang {
 
@@ -29,6 +27,7 @@
 class CompilerInstance;
 
 namespace ento {
+class PathDiagnosticConsumer;
 class CheckerManager;
 
 class AnalysisASTConsumer : public ASTConsumer {
diff --git a/include/clang/StaticAnalyzer/Frontend/FrontendActions.h b/include/clang/StaticAnalyzer/Frontend/FrontendActions.h
index 36afb4b..e66d48b 100644
--- a/include/clang/StaticAnalyzer/Frontend/FrontendActions.h
+++ b/include/clang/StaticAnalyzer/Frontend/FrontendActions.h
@@ -17,6 +17,7 @@
 namespace clang {
 
 class Stmt;
+class AnalyzerOptions;
 
 namespace ento {
 
@@ -52,6 +53,8 @@
 };
 
 void printCheckerHelp(raw_ostream &OS, ArrayRef<std::string> plugins);
+void printEnabledCheckerList(raw_ostream &OS, ArrayRef<std::string> plugins,
+                             const AnalyzerOptions &opts);
 
 } // end GR namespace
 
diff --git a/include/clang/Tooling/CommonOptionsParser.h b/include/clang/Tooling/CommonOptionsParser.h
index 1e8462c..3d630c5 100644
--- a/include/clang/Tooling/CommonOptionsParser.h
+++ b/include/clang/Tooling/CommonOptionsParser.h
@@ -98,7 +98,7 @@
   }
 
   /// Returns a list of source file paths to process.
-  std::vector<std::string> getSourcePathList() {
+  const std::vector<std::string> &getSourcePathList() const {
     return SourcePathList;
   }
 
diff --git a/include/clang/Tooling/Core/QualTypeNames.h b/include/clang/Tooling/Core/QualTypeNames.h
new file mode 100644
index 0000000..7248356
--- /dev/null
+++ b/include/clang/Tooling/Core/QualTypeNames.h
@@ -0,0 +1,79 @@
+//===--- QualTypeNames.h - Generate Complete QualType Names ----*- C++ -*-===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// Functionality to generate the fully-qualified names of QualTypes,
+// including recursively expanding any subtypes and template
+// parameters.
+//
+// More precisely: Generates a name that can be used to name the same
+// type if used at the end of the current translation unit--with
+// certain limitations. See below.
+//
+// This code desugars names only very minimally, so in this code:
+//
+// namespace A {
+//   struct X {};
+// }
+// using A::X;
+// namespace B {
+//   using std::tuple;
+//   typedef tuple<X> TX;
+//   TX t;
+// }
+//
+// B::t's type is reported as "B::TX", rather than std::tuple<A::X>.
+//
+// Also, this code replaces types found via using declarations with
+// their more qualified name, so for the code:
+//
+// using std::tuple;
+// tuple<int> TInt;
+//
+// TInt's type will be named, "std::tuple<int>".
+//
+// Limitations:
+//
+// Some types have ambiguous names at the end of a translation unit,
+// are not namable at all there, or are special cases in other ways.
+//
+// 1) Types with only local scope will have their local names:
+//
+// void foo() {
+//   struct LocalType {} LocalVar;
+// }
+//
+// LocalVar's type will be named, "struct LocalType", without any
+// qualification.
+//
+// 2) Types that have been shadowed are reported normally, but a
+// client using that name at the end of the translation unit will be
+// referring to a different type.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_CORE_QUALTYPENAMES_H
+#define LLVM_CLANG_TOOLING_CORE_QUALTYPENAMES_H
+
+#include "clang/AST/ASTContext.h"
+
+namespace clang {
+namespace TypeName {
+/// \brief Get the fully qualified name for a type. This includes full
+/// qualification of all template parameters etc.
+///
+/// \param[in] QT - the type for which the fully qualified name will be
+/// returned.
+/// \param[in] Ctx - the ASTContext to be used.
+/// \param[in] WithGlobalNsPrefix - If true, then the global namespace
+/// specifier "::" will be prepended to the fully qualified name.
+std::string getFullyQualifiedName(QualType QT,
+                                  const ASTContext &Ctx,
+                                  bool WithGlobalNsPrefix = false);
+}  // end namespace TypeName
+}  // end namespace clang
+#endif  // LLVM_CLANG_TOOLING_CORE_QUALTYPENAMES_H
diff --git a/include/clang/Tooling/Core/Replacement.h b/include/clang/Tooling/Core/Replacement.h
index 37389ac..22fc2ae 100644
--- a/include/clang/Tooling/Core/Replacement.h
+++ b/include/clang/Tooling/Core/Replacement.h
@@ -22,6 +22,8 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <map>
 #include <set>
 #include <string>
 #include <vector>
@@ -56,6 +58,11 @@
     return RHS.Offset >= Offset &&
            (RHS.Offset + RHS.Length) <= (Offset + Length);
   }
+
+  /// \brief Whether this range is equal to \p RHS.
+  bool operator==(const Range &RHS) const {
+    return Offset == RHS.getOffset() && Length == RHS.getLength();
+  }
   /// @}
 
 private:
@@ -116,14 +123,13 @@
   /// \brief Returns a human readable string representation.
   std::string toString() const;
 
- private:
-   void setFromSourceLocation(const SourceManager &Sources,
-                              SourceLocation Start, unsigned Length,
-                              StringRef ReplacementText);
-   void setFromSourceRange(const SourceManager &Sources,
-                           const CharSourceRange &Range,
-                           StringRef ReplacementText,
-                           const LangOptions &LangOpts);
+private:
+  void setFromSourceLocation(const SourceManager &Sources, SourceLocation Start,
+                             unsigned Length, StringRef ReplacementText);
+  void setFromSourceRange(const SourceManager &Sources,
+                          const CharSourceRange &Range,
+                          StringRef ReplacementText,
+                          const LangOptions &LangOpts);
 
   std::string FilePath;
   Range ReplacementRange;
@@ -136,9 +142,70 @@
 /// \brief Equal-to operator between two Replacements.
 bool operator==(const Replacement &LHS, const Replacement &RHS);
 
-/// \brief A set of Replacements.
-/// FIXME: Change to a vector and deduplicate in the RefactoringTool.
-typedef std::set<Replacement> Replacements;
+/// \brief Maintains a set of replacements that are conflict-free.
+/// Two replacements are considered conflicts if they overlap or have the same
+/// offset (i.e. order-dependent).
+class Replacements {
+ private:
+   typedef std::set<Replacement> ReplacementsImpl;
+
+ public:
+  typedef ReplacementsImpl::const_iterator const_iterator;
+
+  Replacements() = default;
+
+  explicit Replacements(const Replacement &R) { Replaces.insert(R); }
+
+  /// \brief Adds a new replacement \p R to the current set of replacements.
+  /// \p R must have the same file path as all existing replacements.
+  /// Returns llvm::Error::success() if the replacement is successfully
+  /// inserted; otherwise, it returns a failing llvm::Error, i.e. there is a
+  /// conflict between R and the existing replacements or R's file path differs
+  /// from that of existing replacements. Callers must check the Error returned.
+  /// This prevents users from adding order-dependent replacements. To control
+  /// the order in which order-dependent replacements are applied, use
+  /// merge({R}) with R referring to the changed code after applying all
+  /// existing replacements.
+  /// Replacements with offset UINT_MAX are special - we do not detect conflicts
+  /// for such replacements since users may add them intentionally as a special
+  /// category of replacements.
+  llvm::Error add(const Replacement &R);
+
+  /// \brief Merges \p Replaces into the current replacements. \p Replaces
+  /// refers to code after applying the current replacements.
+  Replacements merge(const Replacements &Replaces) const;
+
+  /// \brief Returns the affected ranges in the changed code.
+  std::vector<Range> getAffectedRanges() const;
+
+  /// Returns the new offset in the code after the replacements are applied.
+  /// Note that if there is an insertion at \p Position in the current
+  /// replacements, it is shifted to Position + Length of the inserted text.
+  unsigned getShiftedCodePosition(unsigned Position) const;
+
+  unsigned size() const { return Replaces.size(); }
+
+  void clear() { Replaces.clear(); }
+
+  bool empty() const { return Replaces.empty(); }
+
+  const_iterator begin() const { return Replaces.begin(); }
+
+  const_iterator end() const { return Replaces.end(); }
+
+  bool operator==(const Replacements &RHS) const {
+    return Replaces == RHS.Replaces;
+  }
+
+
+private:
+  Replacements(const_iterator Begin, const_iterator End)
+      : Replaces(Begin, End) {}
+
+  Replacements mergeReplacements(const ReplacementsImpl &Second) const;
+
+  ReplacementsImpl Replaces;
+};
 
 /// \brief Apply all replacements in \p Replaces to the Rewriter \p Rewrite.
 ///
@@ -148,41 +215,15 @@
 /// \returns true if all replacements apply. false otherwise.
 bool applyAllReplacements(const Replacements &Replaces, Rewriter &Rewrite);
 
-/// \brief Apply all replacements in \p Replaces to the Rewriter \p Rewrite.
-///
-/// Replacement applications happen independently of the success of
-/// other applications.
-///
-/// \returns true if all replacements apply. false otherwise.
-bool applyAllReplacements(const std::vector<Replacement> &Replaces,
-                          Rewriter &Rewrite);
-
 /// \brief Applies all replacements in \p Replaces to \p Code.
 ///
-/// This completely ignores the path stored in each replacement. If one or more
-/// replacements cannot be applied, this returns an empty \c string.
-std::string applyAllReplacements(StringRef Code, const Replacements &Replaces);
-
-/// \brief Calculates how a code \p Position is shifted when \p Replaces are
-/// applied.
-unsigned shiftedCodePosition(const Replacements& Replaces, unsigned Position);
-
-/// \brief Calculates how a code \p Position is shifted when \p Replaces are
-/// applied.
-///
-/// \pre Replaces[i].getOffset() <= Replaces[i+1].getOffset().
-unsigned shiftedCodePosition(const std::vector<Replacement> &Replaces,
-                             unsigned Position);
-
-/// \brief Removes duplicate Replacements and reports if Replacements conflict
-/// with one another. All Replacements are assumed to be in the same file.
-///
-/// \post Replaces[i].getOffset() <= Replaces[i+1].getOffset().
-///
-/// This function sorts \p Replaces so that conflicts can be reported simply by
-/// offset into \p Replaces and number of elements in the conflict.
-void deduplicate(std::vector<Replacement> &Replaces,
-                 std::vector<Range> &Conflicts);
+/// This completely ignores the path stored in each replacement. If all
+/// replacements are applied successfully, this returns the code with
+/// replacements applied; otherwise, an llvm::Error carrying llvm::StringError
+/// is returned (the Error message can be converted to string using
+/// `llvm::toString()` and `std::error_code` in the `Error` should be ignored).
+llvm::Expected<std::string> applyAllReplacements(StringRef Code,
+                                                 const Replacements &Replaces);
 
 /// \brief Collection of Replacements generated from a single translation unit.
 struct TranslationUnitReplacements {
@@ -197,34 +238,22 @@
   std::vector<Replacement> Replacements;
 };
 
-/// \brief Apply all replacements in \p Replaces to the Rewriter \p Rewrite.
+/// \brief Calculates the new ranges after \p Replaces are applied. These
+/// include both the original \p Ranges and the affected ranges of \p Replaces
+/// in the new code.
 ///
-/// Replacement applications happen independently of the success of
-/// other applications.
+/// \pre Replacements must be for the same file.
 ///
-/// \returns true if all replacements apply. false otherwise.
-bool applyAllReplacements(const Replacements &Replaces, Rewriter &Rewrite);
+/// \return The new ranges after \p Replaces are applied. The new ranges will be
+/// sorted and non-overlapping.
+std::vector<Range>
+calculateRangesAfterReplacements(const Replacements &Replaces,
+                                 const std::vector<Range> &Ranges);
 
-/// \brief Apply all replacements in \p Replaces to the Rewriter \p Rewrite.
-///
-/// Replacement applications happen independently of the success of
-/// other applications.
-///
-/// \returns true if all replacements apply. false otherwise.
-bool applyAllReplacements(const std::vector<Replacement> &Replaces,
-                          Rewriter &Rewrite);
-
-/// \brief Applies all replacements in \p Replaces to \p Code.
-///
-/// This completely ignores the path stored in each replacement. If one or more
-/// replacements cannot be applied, this returns an empty \c string.
-std::string applyAllReplacements(StringRef Code, const Replacements &Replaces);
-
-/// \brief Merges two sets of replacements with the second set referring to the
-/// code after applying the first set. Within both 'First' and 'Second',
-/// replacements must not overlap.
-Replacements mergeReplacements(const Replacements &First,
-                               const Replacements &Second);
+/// \brief Groups an arbitrary set of replacements by file path. Replacements
+/// related to the same file entry are put into the same Replacements object.
+std::map<std::string, Replacements>
+groupReplacementsByFile(const Replacements &Replaces);
 
 template <typename Node>
 Replacement::Replacement(const SourceManager &Sources,
diff --git a/include/clang/Tooling/FileMatchTrie.h b/include/clang/Tooling/FileMatchTrie.h
index 745c164..882979e 100644
--- a/include/clang/Tooling/FileMatchTrie.h
+++ b/include/clang/Tooling/FileMatchTrie.h
@@ -16,10 +16,11 @@
 #define LLVM_CLANG_TOOLING_FILEMATCHTRIE_H
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/StringRef.h"
 #include <memory>
-#include <string>
-#include <vector>
+
+namespace llvm {
+class StringRef;
+}
 
 namespace clang {
 namespace tooling {
diff --git a/include/clang/Tooling/FixIt.h b/include/clang/Tooling/FixIt.h
new file mode 100644
index 0000000..e2259d4
--- /dev/null
+++ b/include/clang/Tooling/FixIt.h
@@ -0,0 +1,72 @@
+//===--- FixIt.h - FixIt Hint utilities -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements functions to ease source rewriting from AST-nodes.
+//
+//  Example swapping A and B expressions:
+//
+//    Expr *A, *B;
+//    tooling::fixit::createReplacement(*A, *B, Context);
+//    tooling::fixit::createReplacement(*B, *A, Context);
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_FIXIT_H
+#define LLVM_CLANG_TOOLING_FIXIT_H
+
+#include "clang/AST/ASTContext.h"
+
+namespace clang {
+namespace tooling {
+namespace fixit {
+
+namespace internal {
+StringRef getText(SourceRange Range, const ASTContext &Context);
+
+/// \brief Returns the SourceRange of a SourceRange. This identity function is
+///        used by the following template abstractions.
+inline SourceRange getSourceRange(const SourceRange &Range) { return Range; }
+
+/// \brief Returns the SourceRange of the token at Location \p Loc.
+inline SourceRange getSourceRange(const SourceLocation &Loc) {
+  return SourceRange(Loc);
+}
+
+/// \brief Returns the SourceRange of a given Node. \p Node is typically a
+///        'Stmt', 'Expr' or a 'Decl'.
+template <typename T> SourceRange getSourceRange(const T &Node) {
+  return Node.getSourceRange();
+}
+} // end namespace internal
+
+/// \brief Returns a textual representation of \p Node.
+template <typename T>
+StringRef getText(const T &Node, const ASTContext &Context) {
+  return internal::getText(internal::getSourceRange(Node), Context);
+}
+
+/// \brief Returns a FixItHint to remove \p Node.
+/// TODO: Add support for related syntactical elements (e.g. comments, ...).
+template <typename T> FixItHint createRemoval(const T &Node) {
+  return FixItHint::CreateRemoval(internal::getSourceRange(Node));
+}
+
+/// \brief Returns a FixItHint to replace \p Destination by \p Source.
+template <typename D, typename S>
+FixItHint createReplacement(const D &Destination, const S &Source,
+                                   const ASTContext &Context) {
+  return FixItHint::CreateReplacement(internal::getSourceRange(Destination),
+                                      getText(Source, Context));
+}
+
+} // end namespace fixit
+} // end namespace tooling
+} // end namespace clang
+
+#endif // LLVM_CLANG_TOOLING_FIXIT_H
diff --git a/include/clang/Tooling/Refactoring.h b/include/clang/Tooling/Refactoring.h
index 54deff6..7a5f9dd 100644
--- a/include/clang/Tooling/Refactoring.h
+++ b/include/clang/Tooling/Refactoring.h
@@ -21,6 +21,7 @@
 
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/Tooling/Tooling.h"
+#include <map>
 #include <string>
 
 namespace clang {
@@ -42,9 +43,9 @@
                   std::shared_ptr<PCHContainerOperations> PCHContainerOps =
                       std::make_shared<PCHContainerOperations>());
 
-  /// \brief Returns the set of replacements to which replacements should
-  /// be added during the run of the tool.
-  Replacements &getReplacements();
+  /// \brief Returns the file path to replacements map to which replacements
+  /// should be added during the run of the tool.
+  std::map<std::string, Replacements> &getReplacements();
 
   /// \brief Call run(), apply all generated replacements, and immediately save
   /// the results to disk.
@@ -65,9 +66,28 @@
   int saveRewrittenFiles(Rewriter &Rewrite);
 
 private:
-  Replacements Replace;
+  std::map<std::string, Replacements> FileToReplaces;
 };
 
+/// \brief Applies each group of Replacements in \p FileToReplaces (keyed by
+/// file path) to the related file in \p Rewrite. In addition to applying the
+/// given Replacements, this function also formats the changed code.
+///
+/// \pre Replacements must be conflict-free.
+///
+/// Replacement applications happen independently of the success of other
+/// applications.
+///
+/// \param[in] FileToReplaces Replacements (grouped by files) to apply.
+/// \param[in] Rewrite The `Rewriter` to apply replacements on.
+/// \param[in] Style The style name used for reformatting. See ```getStyle``` in
+/// "include/clang/Format/Format.h" for all possible style forms.
+///
+/// \returns true if all replacements were applied and formatted; false otherwise.
+bool formatAndApplyAllReplacements(
+    const std::map<std::string, Replacements> &FileToReplaces,
+    Rewriter &Rewrite, StringRef Style = "file");
+
 } // end namespace tooling
 } // end namespace clang
 
diff --git a/include/clang/Tooling/ReplacementsYaml.h b/include/clang/Tooling/ReplacementsYaml.h
index 4a7666d..47b7f3f 100644
--- a/include/clang/Tooling/ReplacementsYaml.h
+++ b/include/clang/Tooling/ReplacementsYaml.h
@@ -19,7 +19,6 @@
 #include "clang/Tooling/Refactoring.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <string>
-#include <vector>
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(clang::tooling::Replacement)
 
diff --git a/include/clang/Tooling/Tooling.h b/include/clang/Tooling/Tooling.h
index b7a9b25..ca232f4 100644
--- a/include/clang/Tooling/Tooling.h
+++ b/include/clang/Tooling/Tooling.h
@@ -163,6 +163,8 @@
 /// \param Code C++ code.
 /// \param Args Additional flags to pass on.
 /// \param FileName The file name which 'Code' will be mapped as.
+/// \param ToolName The name of the binary running the tool. Standard library
+///                 header paths will be resolved relative to this.
 /// \param PCHContainerOps   The PCHContainerOperations for loading and creating
 ///                          clang modules.
 ///
@@ -170,6 +172,7 @@
 bool runToolOnCodeWithArgs(
     clang::FrontendAction *ToolAction, const Twine &Code,
     const std::vector<std::string> &Args, const Twine &FileName = "input.cc",
+    const Twine &ToolName = "clang-tool",
     std::shared_ptr<PCHContainerOperations> PCHContainerOps =
         std::make_shared<PCHContainerOperations>(),
     const FileContentMappings &VirtualMappedFiles = FileContentMappings());
@@ -192,13 +195,15 @@
 /// \param Code C++ code.
 /// \param Args Additional flags to pass on.
 /// \param FileName The file name which 'Code' will be mapped as.
+/// \param ToolName The name of the binary running the tool. Standard library
+///                 header paths will be resolved relative to this.
 /// \param PCHContainerOps The PCHContainerOperations for loading and creating
 /// clang modules.
 ///
 /// \return The resulting AST or null if an error occurred.
 std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs(
     const Twine &Code, const std::vector<std::string> &Args,
-    const Twine &FileName = "input.cc",
+    const Twine &FileName = "input.cc", const Twine &ToolName = "clang-tool",
     std::shared_ptr<PCHContainerOperations> PCHContainerOps =
         std::make_shared<PCHContainerOperations>());
 
diff --git a/include/clang/module.modulemap b/include/clang/module.modulemap
index a3e18ff..7fa8b82 100644
--- a/include/clang/module.modulemap
+++ b/include/clang/module.modulemap
@@ -12,6 +12,7 @@
   umbrella "AST"
 
   textual header "AST/BuiltinTypes.def"
+  textual header "AST/OperationKinds.def"
   textual header "AST/TypeLocNodes.def"
   textual header "AST/TypeNodes.def"
 
@@ -41,6 +42,7 @@
   textual header "Basic/DiagnosticOptions.def"
   textual header "Basic/LangOptions.def"
   textual header "Basic/OpenCLExtensions.def"
+  textual header "Basic/OpenCLImageTypes.def"
   textual header "Basic/OpenMPKinds.def"
   textual header "Basic/OperatorKinds.def"
   textual header "Basic/Sanitizers.def"
diff --git a/lib/APINotes/APINotesFormat.h b/lib/APINotes/APINotesFormat.h
index 34db384..aad13f7 100644
--- a/lib/APINotes/APINotesFormat.h
+++ b/lib/APINotes/APINotesFormat.h
@@ -1,4 +1,4 @@
-//===--- APINotesFormat.h - The internals of API notes files ------*- C++ -*-===//
+//===--- APINotesFormat.h - The internals of API notes files ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,6 +17,7 @@
 
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerEmbeddedInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Bitcode/RecordLayout.h"
 
@@ -37,13 +38,13 @@
 /// When the format changes IN ANY WAY, this number should be incremented.
 const uint16_t VERSION_MINOR = 13;  // Function/method parameters
 
-using IdentifierID = Fixnum<31>;
+using IdentifierID = PointerEmbeddedInt<unsigned, 31>;
 using IdentifierIDField = BCVBR<16>;
 
-using SelectorID = Fixnum<31>;
+using SelectorID = PointerEmbeddedInt<unsigned, 31>;
 using SelectorIDField = BCVBR<16>;
 
-using StoredContextID = Fixnum<31>;
+using StoredContextID = PointerEmbeddedInt<unsigned, 31>;
 
 /// The various types of blocks that can occur within a API notes file.
 ///
diff --git a/lib/APINotes/APINotesReader.cpp b/lib/APINotes/APINotesReader.cpp
index bc2f548..56c782a 100644
--- a/lib/APINotes/APINotesReader.cpp
+++ b/lib/APINotes/APINotesReader.cpp
@@ -143,7 +143,7 @@
     
     static internal_key_type ReadKey(const uint8_t *data, unsigned length) {
       auto nameID
-        = endian::readNext<IdentifierID, little, unaligned>(data);
+        = endian::readNext<uint32_t, little, unaligned>(data);
       auto isProtocol = endian::readNext<uint8_t, little, unaligned>(data);
       return { nameID, isProtocol };
     }
@@ -151,7 +151,7 @@
     static data_type ReadData(internal_key_type key, const uint8_t *data,
                               unsigned length) {
       data_type result;
-      result.first = endian::readNext<StoredContextID, little, unaligned>(data);
+      result.first = endian::readNext<uint32_t, little, unaligned>(data);
       readCommonTypeInfo(data, result.second);
       if (*data++) {
         result.second.setDefaultNullability(static_cast<NullabilityKind>(*data));
@@ -206,8 +206,8 @@
     }
     
     static internal_key_type ReadKey(const uint8_t *data, unsigned length) {
-      auto classID = endian::readNext<IdentifierID, little, unaligned>(data);
-      auto nameID = endian::readNext<IdentifierID, little, unaligned>(data);
+      auto classID = endian::readNext<uint32_t, little, unaligned>(data);
+      auto nameID = endian::readNext<uint32_t, little, unaligned>(data);
       return { classID, nameID };
     }
     
@@ -282,8 +282,8 @@
     }
     
     static internal_key_type ReadKey(const uint8_t *data, unsigned length) {
-      auto classID = endian::readNext<IdentifierID, little, unaligned>(data);
-      auto selectorID = endian::readNext<SelectorID, little, unaligned>(data);
+      auto classID = endian::readNext<uint32_t, little, unaligned>(data);
+      auto selectorID = endian::readNext<uint32_t, little, unaligned>(data);
       auto isInstance = endian::readNext<uint8_t, little, unaligned>(data);
       return internal_key_type{ classID, selectorID, isInstance };
     }
@@ -334,17 +334,17 @@
     static internal_key_type ReadKey(const uint8_t *data, unsigned length) {
       internal_key_type key;
       key.NumPieces = endian::readNext<uint16_t, little, unaligned>(data);
-      unsigned numIdents = (length - sizeof(uint16_t)) / sizeof(IdentifierID);
+      unsigned numIdents = (length - sizeof(uint16_t)) / sizeof(uint32_t);
       for (unsigned i = 0; i != numIdents; ++i) {
         key.Identifiers.push_back(
-          endian::readNext<IdentifierID, little, unaligned>(data));
+          endian::readNext<uint32_t, little, unaligned>(data));
       }
       return key;
     }
     
     static data_type ReadData(internal_key_type key, const uint8_t *data,
                               unsigned length) {
-      return endian::readNext<SelectorID, little, unaligned>(data);
+      return endian::readNext<uint32_t, little, unaligned>(data);
     }
   };
 
@@ -381,7 +381,7 @@
     }
     
     static internal_key_type ReadKey(const uint8_t *data, unsigned length) {
-      auto nameID = endian::readNext<IdentifierID, little, unaligned>(data);
+      auto nameID = endian::readNext<uint32_t, little, unaligned>(data);
       return nameID;
     }
     
@@ -426,7 +426,7 @@
     }
     
     static internal_key_type ReadKey(const uint8_t *data, unsigned length) {
-      auto nameID = endian::readNext<IdentifierID, little, unaligned>(data);
+      auto nameID = endian::readNext<uint32_t, little, unaligned>(data);
       return nameID;
     }
     
diff --git a/lib/APINotes/APINotesWriter.cpp b/lib/APINotes/APINotesWriter.cpp
index 7178abb..0c178be 100644
--- a/lib/APINotes/APINotesWriter.cpp
+++ b/lib/APINotes/APINotesWriter.cpp
@@ -255,7 +255,6 @@
 
     void EmitData(raw_ostream &out, key_type_ref key, data_type_ref data,
                   unsigned len) {
-      static_assert(sizeof(IdentifierID) <= 4, "DeclID too large");
       endian::Writer<little> writer(out);
       writer.write<uint32_t>(data);
     }
@@ -344,8 +343,8 @@
     std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &out,
                                                     key_type_ref key,
                                                     data_type_ref data) {
-      uint32_t keyLength = sizeof(IdentifierID) + 1;
-      uint32_t dataLength = sizeof(ContextID)
+      uint32_t keyLength = sizeof(uint32_t) + 1;
+      uint32_t dataLength = sizeof(uint32_t)
                           + getCommonTypeInfoSize(data.second)
                           + dataBytes;
       endian::Writer<little> writer(out);
@@ -356,14 +355,14 @@
 
     void EmitKey(raw_ostream &out, key_type_ref key, unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<IdentifierID>(key.first);
+      writer.write<uint32_t>(key.first);
       writer.write<uint8_t>(key.second);
     }
 
     void EmitData(raw_ostream &out, key_type_ref key, data_type_ref data,
                   unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<StoredContextID >(data.first);
+      writer.write<uint32_t>(data.first);
 
       emitCommonTypeInfo(out, data.second);
 
@@ -445,7 +444,7 @@
     std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &out,
                                                     key_type_ref key,
                                                     data_type_ref data) {
-      uint32_t keyLength = sizeof(IdentifierID) + sizeof(IdentifierID);
+      uint32_t keyLength = sizeof(uint32_t) + sizeof(uint32_t);
       uint32_t dataLength = getVariableInfoSize(data);
       endian::Writer<little> writer(out);
       writer.write<uint16_t>(keyLength);
@@ -455,8 +454,8 @@
 
     void EmitKey(raw_ostream &out, key_type_ref key, unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<IdentifierID>(key.first);
-      writer.write<IdentifierID>(key.second);
+      writer.write<uint32_t>(key.first);
+      writer.write<uint32_t>(key.second);
     }
 
     void EmitData(raw_ostream &out, key_type_ref key, data_type_ref data,
@@ -542,7 +541,7 @@
     std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &out,
                                                     key_type_ref key,
                                                     data_type_ref data) {
-      uint32_t keyLength = sizeof(IdentifierID) + sizeof(SelectorID) + 1;
+      uint32_t keyLength = sizeof(uint32_t) + sizeof(uint32_t) + 1;
       uint32_t dataLength = getFunctionInfoSize(data) + 3;
       endian::Writer<little> writer(out);
       writer.write<uint16_t>(keyLength);
@@ -552,8 +551,8 @@
 
     void EmitKey(raw_ostream &out, key_type_ref key, unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<IdentifierID>(std::get<0>(key));
-      writer.write<SelectorID>(std::get<1>(key));
+      writer.write<uint32_t>(std::get<0>(key));
+      writer.write<uint32_t>(std::get<1>(key));
       writer.write<uint8_t>(std::get<2>(key));
     }
 
@@ -615,8 +614,8 @@
                                                     key_type_ref key,
                                                     data_type_ref data) {
       uint32_t keyLength = sizeof(uint16_t) 
-                         + sizeof(IdentifierID) * key.Identifiers.size();
-      uint32_t dataLength = sizeof(SelectorID);
+                         + sizeof(uint32_t) * key.Identifiers.size();
+      uint32_t dataLength = sizeof(uint32_t);
       endian::Writer<little> writer(out);
       writer.write<uint16_t>(keyLength);
       writer.write<uint16_t>(dataLength);
@@ -627,14 +626,14 @@
       endian::Writer<little> writer(out);
       writer.write<uint16_t>(key.NumPieces);
       for (auto piece : key.Identifiers) {
-        writer.write<IdentifierID>(piece);
+        writer.write<uint32_t>(piece);
       }
     }
 
     void EmitData(raw_ostream &out, key_type_ref key, data_type_ref data,
                   unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<SelectorID>(data);
+      writer.write<uint32_t>(data);
     }
   };
 } // end anonymous namespace
@@ -681,7 +680,7 @@
     std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &out,
                                                     key_type_ref key,
                                                     data_type_ref data) {
-      uint32_t keyLength = sizeof(IdentifierID);
+      uint32_t keyLength = sizeof(uint32_t);
       uint32_t dataLength = getVariableInfoSize(data);
       endian::Writer<little> writer(out);
       writer.write<uint16_t>(keyLength);
@@ -691,7 +690,7 @@
 
     void EmitKey(raw_ostream &out, key_type_ref key, unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<IdentifierID>(key);
+      writer.write<uint32_t>(key);
     }
 
     void EmitData(raw_ostream &out, key_type_ref key, data_type_ref data,
@@ -743,7 +742,7 @@
     std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &out,
                                                     key_type_ref key,
                                                     data_type_ref data) {
-      uint32_t keyLength = sizeof(IdentifierID);
+      uint32_t keyLength = sizeof(uint32_t);
       uint32_t dataLength = getFunctionInfoSize(data);
       endian::Writer<little> writer(out);
       writer.write<uint16_t>(keyLength);
@@ -753,7 +752,7 @@
 
     void EmitKey(raw_ostream &out, key_type_ref key, unsigned len) {
       endian::Writer<little> writer(out);
-      writer.write<IdentifierID>(key);
+      writer.write<uint32_t>(key);
     }
 
     void EmitData(raw_ostream &out, key_type_ref key, data_type_ref data,
diff --git a/lib/APINotes/APINotesYAMLCompiler.cpp b/lib/APINotes/APINotesYAMLCompiler.cpp
index ad2dec6..a218859 100644
--- a/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -57,6 +57,8 @@
  ...
  Globals:                 # List of globals
  ...
+ Enumerators:             # List of enumerators
+ ...
  Tags:                    # List of tags (struct/union/enum/C++ class)
  ...
  Typedefs:                # List of typedef-names and C++11 type aliases
@@ -830,7 +832,7 @@
       llvm::StringSet<> knownTypedefs;
       for (const auto &t : TheModule.Typedefs) {
         // Check for duplicate typedef definitions.
-        if (!knownTags.insert(t.Name).second) {
+        if (!knownTypedefs.insert(t.Name).second) {
           emitError("multiple definitions of typedef '" + t.Name + "'");
           continue;
         }
diff --git a/lib/APINotes/Makefile b/lib/APINotes/Makefile
deleted file mode 100644
index 69ddcd4..0000000
--- a/lib/APINotes/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/APINotes/Makefile -------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-#  This implements the APINotes library for the C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangAPINotes
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/ARCMigrate/ARCMT.cpp b/lib/ARCMigrate/ARCMT.cpp
index 8c04c83..680aa3e 100644
--- a/lib/ARCMigrate/ARCMT.cpp
+++ b/lib/ARCMigrate/ARCMT.cpp
@@ -16,11 +16,13 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Rewrite/Core/Rewriter.h"
 #include "clang/Sema/SemaDiagnostic.h"
 #include "clang/Serialization/ASTReader.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include <utility>
 using namespace clang;
 using namespace arcmt;
 
@@ -508,8 +510,8 @@
     const CompilerInvocation &CI,
     std::shared_ptr<PCHContainerOperations> PCHContainerOps,
     DiagnosticConsumer *diagClient, StringRef outputDir)
-    : OrigCI(CI), PCHContainerOps(PCHContainerOps), DiagClient(diagClient),
-      HadARCErrors(false) {
+    : OrigCI(CI), PCHContainerOps(std::move(PCHContainerOps)),
+      DiagClient(diagClient), HadARCErrors(false) {
   if (!outputDir.empty()) {
     IntrusiveRefCntPtr<DiagnosticIDs> DiagID(new DiagnosticIDs());
     IntrusiveRefCntPtr<DiagnosticsEngine> Diags(
diff --git a/lib/ARCMigrate/Makefile b/lib/ARCMigrate/Makefile
deleted file mode 100644
index 5232c5e..0000000
--- a/lib/ARCMigrate/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/ARCMigrate/Makefile --------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-# This implements code transformation to ARC mode.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangARCMigrate
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/ARCMigrate/ObjCMT.cpp b/lib/ARCMigrate/ObjCMT.cpp
index a233a4c..241a724 100644
--- a/lib/ARCMigrate/ObjCMT.cpp
+++ b/lib/ARCMigrate/ObjCMT.cpp
@@ -177,7 +177,7 @@
   }
 };
 
-}
+} // end anonymous namespace
 
 ObjCMigrateAction::ObjCMigrateAction(
                                   std::unique_ptr<FrontendAction> WrappedAction,
@@ -307,7 +307,6 @@
     }
     return true;
   }
-  
 
 class ObjCMigrator : public RecursiveASTVisitor<ObjCMigrator> {
   ObjCMigrateASTConsumer &Consumer;
@@ -370,7 +369,7 @@
     return true;
   }
 };
-}
+} // end anonymous namespace
 
 void ObjCMigrateASTConsumer::migrateDecl(Decl *D) {
   if (!D)
@@ -1094,7 +1093,6 @@
           versionsMatch(Deprecated1, Deprecated2) &&
           versionsMatch(Obsoleted1, Obsoleted2) &&
           IsUnavailable1 == IsUnavailable2);
-  
 }
 
 static bool MatchTwoAttributeLists(const AttrVec &Attrs1, const AttrVec &Attrs2,
@@ -1499,7 +1497,6 @@
   }
 }
 
-
 ObjCMigrateASTConsumer::CF_BRIDGING_KIND
   ObjCMigrateASTConsumer::migrateAddFunctionAnnotation(
                                                   ASTContext &Ctx,
@@ -1673,7 +1670,6 @@
       return;
     }
   }
-  return;
 }
 
 namespace {
@@ -1690,7 +1686,7 @@
     return true;
   }
 };
-} // anonymous namespace
+} // end anonymous namespace
 
 static bool hasSuperInitCall(const ObjCMethodDecl *MD) {
   return !SuperInitChecker().TraverseStmt(MD->getBody());
@@ -1836,7 +1832,7 @@
   }
 };
 
-}
+} // end anonymous namespace
 
 void ObjCMigrateASTConsumer::HandleTranslationUnit(ASTContext &Ctx) {
   
@@ -2039,7 +2035,7 @@
 
   EditEntry() : File(), Offset(), RemoveLen() {}
 };
-}
+} // end anonymous namespace
 
 namespace llvm {
 template<> struct DenseMapInfo<EditEntry> {
@@ -2068,7 +2064,7 @@
         LHS.Text == RHS.Text;
   }
 };
-}
+} // end namespace llvm
 
 namespace {
 class RemapFileParser {
@@ -2150,7 +2146,7 @@
       Entries.push_back(Entry);
   }
 };
-}
+} // end anonymous namespace
 
 static bool reportDiag(const Twine &Err, DiagnosticsEngine &Diag) {
   Diag.Report(Diag.getCustomDiagID(DiagnosticsEngine::Error, "%0"))
diff --git a/lib/ARCMigrate/Transforms.cpp b/lib/ARCMigrate/Transforms.cpp
index 3fd36ff..cb96a54 100644
--- a/lib/ARCMigrate/Transforms.cpp
+++ b/lib/ARCMigrate/Transforms.cpp
@@ -11,17 +11,12 @@
 #include "Internals.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
-#include "clang/AST/StmtVisitor.h"
 #include "clang/Analysis/DomainSpecific/CocoaConventions.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Sema.h"
-#include "clang/Sema/SemaDiagnostic.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringSwitch.h"
-#include <map>
 
 using namespace clang;
 using namespace arcmt;
diff --git a/lib/AST/APValue.cpp b/lib/AST/APValue.cpp
index 3c58733..5e1938c 100644
--- a/lib/AST/APValue.cpp
+++ b/lib/AST/APValue.cpp
@@ -17,8 +17,6 @@
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/Type.h"
-#include "clang/Basic/Diagnostic.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace clang;
diff --git a/lib/AST/ASTConsumer.cpp b/lib/AST/ASTConsumer.cpp
index cff82e9..55033b2 100644
--- a/lib/AST/ASTConsumer.cpp
+++ b/lib/AST/ASTConsumer.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/ASTConsumer.h"
-#include "llvm/Bitcode/BitstreamReader.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclGroup.h"
 using namespace clang;
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index e0bef99..49948a5 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -34,7 +34,6 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Capacity.h"
@@ -58,7 +57,7 @@
 unsigned ASTContext::NumImplicitDestructorsDeclared;
 
 enum FloatingRank {
-  HalfRank, FloatRank, DoubleRank, LongDoubleRank
+  HalfRank, FloatRank, DoubleRank, LongDoubleRank, Float128Rank
 };
 
 RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
@@ -633,9 +632,8 @@
                                                 NTTP->getPosition(), nullptr,
                                                 T,
                                                 TInfo,
-                                                ExpandedTypes.data(),
-                                                ExpandedTypes.size(),
-                                                ExpandedTInfos.data());
+                                                ExpandedTypes,
+                                                ExpandedTInfos);
       } else {
         Param = NonTypeTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
                                                 SourceLocation(),
@@ -653,6 +651,10 @@
                                            cast<TemplateTemplateParmDecl>(*P)));
   }
 
+  assert(!TTP->getRequiresClause() &&
+         "Unexpected requires-clause on template template-parameter");
+  LLVM_CONSTEXPR Expr *const CanonRequiresClause = nullptr;
+
   TemplateTemplateParmDecl *CanonTTP
     = TemplateTemplateParmDecl::Create(*this, getTranslationUnitDecl(), 
                                        SourceLocation(), TTP->getDepth(),
@@ -662,7 +664,8 @@
                          TemplateParameterList::Create(*this, SourceLocation(),
                                                        SourceLocation(),
                                                        CanonParams,
-                                                       SourceLocation()));
+                                                       SourceLocation(),
+                                                       CanonRequiresClause));
 
   // Get the new insert position for the node we care about.
   Canonical = CanonTemplateTemplateParms.FindNodeOrInsertPos(ID, InsertPos);
@@ -734,17 +737,16 @@
       DependentTemplateSpecializationTypes(this_()),
       SubstTemplateTemplateParmPacks(this_()),
       GlobalNestedNameSpecifier(nullptr), Int128Decl(nullptr),
-      UInt128Decl(nullptr), Float128StubDecl(nullptr),
-      BuiltinVaListDecl(nullptr), BuiltinMSVaListDecl(nullptr),
-      ObjCIdDecl(nullptr), ObjCSelDecl(nullptr), ObjCClassDecl(nullptr),
-      ObjCProtocolClassDecl(nullptr), BOOLDecl(nullptr),
+      UInt128Decl(nullptr), BuiltinVaListDecl(nullptr),
+      BuiltinMSVaListDecl(nullptr), ObjCIdDecl(nullptr), ObjCSelDecl(nullptr),
+      ObjCClassDecl(nullptr), ObjCProtocolClassDecl(nullptr), BOOLDecl(nullptr),
       CFConstantStringTagDecl(nullptr), CFConstantStringTypeDecl(nullptr),
       ObjCInstanceTypeDecl(nullptr), FILEDecl(nullptr), jmp_bufDecl(nullptr),
       sigjmp_bufDecl(nullptr), ucontext_tDecl(nullptr),
       BlockDescriptorType(nullptr), BlockDescriptorExtendedType(nullptr),
       cudaConfigureCallDecl(nullptr), FirstLocalImport(), LastLocalImport(),
-      ExternCContext(nullptr), MakeIntegerSeqDecl(nullptr), SourceMgr(SM),
-      LangOpts(LOpts),
+      ExternCContext(nullptr), MakeIntegerSeqDecl(nullptr),
+      TypePackElementDecl(nullptr), SourceMgr(SM), LangOpts(LOpts),
       SanitizerBL(new SanitizerBlacklist(LangOpts.SanitizerBlacklistFiles, SM)),
       AddrSpaceMap(nullptr), Target(nullptr), AuxTarget(nullptr),
       PrintingPolicy(LOpts), Idents(idents), Selectors(sels),
@@ -790,6 +792,9 @@
        MaterializedTemporaryValues)
     MTVPair.second->~APValue();
 
+  for (const auto &Value : ModuleInitializers)
+    Value.second->~PerModuleInitializers();
+
   llvm::DeleteContainerSeconds(MangleNumberingContexts);
 }
 
@@ -817,7 +822,7 @@
 
 void
 ASTContext::setExternalSource(IntrusiveRefCntPtr<ExternalASTSource> Source) {
-  ExternalSource = Source;
+  ExternalSource = std::move(Source);
 }
 
 void ASTContext::PrintStats() const {
@@ -904,6 +909,67 @@
   Merged.erase(std::remove(Merged.begin(), Merged.end(), nullptr), Merged.end());
 }
 
+void ASTContext::PerModuleInitializers::resolve(ASTContext &Ctx) {
+  if (LazyInitializers.empty())
+    return;
+
+  auto *Source = Ctx.getExternalSource();
+  assert(Source && "lazy initializers but no external source");
+
+  auto LazyInits = std::move(LazyInitializers);
+  LazyInitializers.clear();
+
+  for (auto ID : LazyInits)
+    Initializers.push_back(Source->GetExternalDecl(ID));
+
+  assert(LazyInitializers.empty() &&
+         "GetExternalDecl for lazy module initializer added more inits");
+}
+
+void ASTContext::addModuleInitializer(Module *M, Decl *D) {
+  // One special case: if we add a module initializer that imports another
+  // module, and that module's only initializer is an ImportDecl, simplify.
+  if (auto *ID = dyn_cast<ImportDecl>(D)) {
+    auto It = ModuleInitializers.find(ID->getImportedModule());
+
+    // Maybe the ImportDecl does nothing at all. (Common case.)
+    if (It == ModuleInitializers.end())
+      return;
+
+    // Maybe the ImportDecl only imports another ImportDecl.
+    auto &Imported = *It->second;
+    if (Imported.Initializers.size() + Imported.LazyInitializers.size() == 1) {
+      Imported.resolve(*this);
+      auto *OnlyDecl = Imported.Initializers.front();
+      if (isa<ImportDecl>(OnlyDecl))
+        D = OnlyDecl;
+    }
+  }
+
+  auto *&Inits = ModuleInitializers[M];
+  if (!Inits)
+    Inits = new (*this) PerModuleInitializers;
+  Inits->Initializers.push_back(D);
+}
+
+void ASTContext::addLazyModuleInitializers(Module *M, ArrayRef<uint32_t> IDs) {
+  auto *&Inits = ModuleInitializers[M];
+  if (!Inits)
+    Inits = new (*this) PerModuleInitializers;
+  Inits->LazyInitializers.insert(Inits->LazyInitializers.end(),
+                                 IDs.begin(), IDs.end());
+}
+
+ArrayRef<Decl*> ASTContext::getModuleInitializers(Module *M) {
+  auto It = ModuleInitializers.find(M);
+  if (It == ModuleInitializers.end()) 
+    return None;
+
+  auto *Inits = It->second;
+  Inits->resolve(*this);
+  return Inits->Initializers;
+}
+
 ExternCContextDecl *ASTContext::getExternCContextDecl() const {
   if (!ExternCContext)
     ExternCContext = ExternCContextDecl::Create(*this, getTranslationUnitDecl());
@@ -929,6 +995,14 @@
   return MakeIntegerSeqDecl;
 }
 
+BuiltinTemplateDecl *
+ASTContext::getTypePackElementDecl() const {
+  if (!TypePackElementDecl)
+    TypePackElementDecl = buildBuiltinTemplateDecl(BTK__type_pack_element,
+                                                   getTypePackElementName());
+  return TypePackElementDecl;
+}
+
 RecordDecl *ASTContext::buildImplicitRecord(StringRef Name,
                                             RecordDecl::TagKind TK) const {
   SourceLocation Loc;
@@ -967,14 +1041,6 @@
   return UInt128Decl;
 }
 
-TypeDecl *ASTContext::getFloat128StubType() const {
-  assert(LangOpts.CPlusPlus && "should only be called for c++");
-  if (!Float128StubDecl)
-    Float128StubDecl = buildImplicitRecord("__float128");
-
-  return Float128StubDecl;
-}
-
 void ASTContext::InitBuiltinType(CanQualType &R, BuiltinType::Kind K) {
   BuiltinType *Ty = new (*this, TypeAlignment) BuiltinType(K);
   R = CanQualType::CreateUnsafe(QualType(Ty, 0));
@@ -1023,6 +1089,9 @@
   InitBuiltinType(DoubleTy,            BuiltinType::Double);
   InitBuiltinType(LongDoubleTy,        BuiltinType::LongDouble);
 
+  // GNU extension, __float128 for IEEE quadruple precision
+  InitBuiltinType(Float128Ty,          BuiltinType::Float128);
+
   // GNU extension, 128-bit integers.
   InitBuiltinType(Int128Ty,            BuiltinType::Int128);
   InitBuiltinType(UnsignedInt128Ty,    BuiltinType::UInt128);
@@ -1084,26 +1153,17 @@
   FloatComplexTy      = getComplexType(FloatTy);
   DoubleComplexTy     = getComplexType(DoubleTy);
   LongDoubleComplexTy = getComplexType(LongDoubleTy);
+  Float128ComplexTy   = getComplexType(Float128Ty);
 
   // Builtin types for 'id', 'Class', and 'SEL'.
   InitBuiltinType(ObjCBuiltinIdTy, BuiltinType::ObjCId);
   InitBuiltinType(ObjCBuiltinClassTy, BuiltinType::ObjCClass);
   InitBuiltinType(ObjCBuiltinSelTy, BuiltinType::ObjCSel);
 
-  if (LangOpts.OpenCL) { 
-    InitBuiltinType(OCLImage1dTy, BuiltinType::OCLImage1d);
-    InitBuiltinType(OCLImage1dArrayTy, BuiltinType::OCLImage1dArray);
-    InitBuiltinType(OCLImage1dBufferTy, BuiltinType::OCLImage1dBuffer);
-    InitBuiltinType(OCLImage2dTy, BuiltinType::OCLImage2d);
-    InitBuiltinType(OCLImage2dArrayTy, BuiltinType::OCLImage2dArray);
-    InitBuiltinType(OCLImage2dDepthTy, BuiltinType::OCLImage2dDepth);
-    InitBuiltinType(OCLImage2dArrayDepthTy, BuiltinType::OCLImage2dArrayDepth);
-    InitBuiltinType(OCLImage2dMSAATy, BuiltinType::OCLImage2dMSAA);
-    InitBuiltinType(OCLImage2dArrayMSAATy, BuiltinType::OCLImage2dArrayMSAA);
-    InitBuiltinType(OCLImage2dMSAADepthTy, BuiltinType::OCLImage2dMSAADepth);
-    InitBuiltinType(OCLImage2dArrayMSAADepthTy,
-                    BuiltinType::OCLImage2dArrayMSAADepth);
-    InitBuiltinType(OCLImage3dTy, BuiltinType::OCLImage3d);
+  if (LangOpts.OpenCL) {
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    InitBuiltinType(SingletonId, BuiltinType::Id);
+#include "clang/Basic/OpenCLImageTypes.def"
 
     InitBuiltinType(OCLSamplerTy, BuiltinType::OCLSampler);
     InitBuiltinType(OCLEventTy, BuiltinType::OCLEvent);
@@ -1269,34 +1329,37 @@
 
 ASTContext::overridden_cxx_method_iterator
 ASTContext::overridden_methods_begin(const CXXMethodDecl *Method) const {
-  llvm::DenseMap<const CXXMethodDecl *, CXXMethodVector>::const_iterator Pos
-    = OverriddenMethods.find(Method->getCanonicalDecl());
+  llvm::DenseMap<const CXXMethodDecl *, CXXMethodVector>::const_iterator Pos =
+      OverriddenMethods.find(Method->getCanonicalDecl());
   if (Pos == OverriddenMethods.end())
     return nullptr;
-
   return Pos->second.begin();
 }
 
 ASTContext::overridden_cxx_method_iterator
 ASTContext::overridden_methods_end(const CXXMethodDecl *Method) const {
-  llvm::DenseMap<const CXXMethodDecl *, CXXMethodVector>::const_iterator Pos
-    = OverriddenMethods.find(Method->getCanonicalDecl());
+  llvm::DenseMap<const CXXMethodDecl *, CXXMethodVector>::const_iterator Pos =
+      OverriddenMethods.find(Method->getCanonicalDecl());
   if (Pos == OverriddenMethods.end())
     return nullptr;
-
   return Pos->second.end();
 }
 
 unsigned
 ASTContext::overridden_methods_size(const CXXMethodDecl *Method) const {
-  llvm::DenseMap<const CXXMethodDecl *, CXXMethodVector>::const_iterator Pos
-    = OverriddenMethods.find(Method->getCanonicalDecl());
+  llvm::DenseMap<const CXXMethodDecl *, CXXMethodVector>::const_iterator Pos =
+      OverriddenMethods.find(Method->getCanonicalDecl());
   if (Pos == OverriddenMethods.end())
     return 0;
-
   return Pos->second.size();
 }
 
+ASTContext::overridden_method_range
+ASTContext::overridden_methods(const CXXMethodDecl *Method) const {
+  return overridden_method_range(overridden_methods_begin(Method),
+                                 overridden_methods_end(Method));
+}
+
 void ASTContext::addOverriddenMethod(const CXXMethodDecl *Method, 
                                      const CXXMethodDecl *Overridden) {
   assert(Method->isCanonicalDecl() && Overridden->isCanonicalDecl());
@@ -1351,6 +1414,7 @@
   case BuiltinType::Float:      return Target->getFloatFormat();
   case BuiltinType::Double:     return Target->getDoubleFormat();
   case BuiltinType::LongDouble: return Target->getLongDoubleFormat();
+  case BuiltinType::Float128:   return Target->getFloat128Format();
   }
 }
 
@@ -1661,6 +1725,10 @@
       Width = Target->getLongDoubleWidth();
       Align = Target->getLongDoubleAlign();
       break;
+    case BuiltinType::Float128:
+      Width = Target->getFloat128Width();
+      Align = Target->getFloat128Align();
+      break;
     case BuiltinType::NullPtr:
       Width = Target->getPointerWidth(0); // C++ 3.9.1p11: sizeof(nullptr_t)
       Align = Target->getPointerAlign(0); //   == sizeof(void*)
@@ -1671,32 +1739,29 @@
       Width = Target->getPointerWidth(0); 
       Align = Target->getPointerAlign(0);
       break;
-    case BuiltinType::OCLSampler:
-      // Samplers are modeled as integers.
-      Width = Target->getIntWidth();
-      Align = Target->getIntAlign();
+    case BuiltinType::OCLSampler: {
+      auto AS = getTargetAddressSpace(LangAS::opencl_constant);
+      Width = Target->getPointerWidth(AS);
+      Align = Target->getPointerAlign(AS);
       break;
+    }
     case BuiltinType::OCLEvent:
     case BuiltinType::OCLClkEvent:
     case BuiltinType::OCLQueue:
     case BuiltinType::OCLNDRange:
     case BuiltinType::OCLReserveID:
-    case BuiltinType::OCLImage1d:
-    case BuiltinType::OCLImage1dArray:
-    case BuiltinType::OCLImage1dBuffer:
-    case BuiltinType::OCLImage2d:
-    case BuiltinType::OCLImage2dArray:
-    case BuiltinType::OCLImage2dDepth:
-    case BuiltinType::OCLImage2dArrayDepth:
-    case BuiltinType::OCLImage2dMSAA:
-    case BuiltinType::OCLImage2dArrayMSAA:
-    case BuiltinType::OCLImage2dMSAADepth:
-    case BuiltinType::OCLImage2dArrayMSAADepth:
-    case BuiltinType::OCLImage3d:
       // Currently these types are pointers to opaque types.
       Width = Target->getPointerWidth(0);
       Align = Target->getPointerAlign(0);
       break;
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
+      {
+        auto AS = getTargetAddressSpace(Target->getOpenCLImageAddrSpace());
+        Width = Target->getPointerWidth(AS);
+        Align = Target->getPointerAlign(AS);
+      }
     }
     break;
   case Type::ObjCObjectPointer:
@@ -1904,8 +1969,8 @@
   if (T->isMemberPointerType())
     return getPreferredTypeAlign(getPointerDiffType().getTypePtr());
 
-  if (Target->getTriple().getArch() == llvm::Triple::xcore)
-    return ABIAlign;  // Never overalign on XCore.
+  if (!Target->allowsLargerPreferedTypeAlignment())
+    return ABIAlign;
 
   // Double and long long should be naturally aligned if possible.
   if (const ComplexType *CT = T->getAs<ComplexType>())
@@ -3401,23 +3466,19 @@
                                           QualType Underlying) const {
   assert(!Template.getAsDependentTemplateName() && 
          "No dependent template names here!");
-  
-  unsigned NumArgs = Args.size();
 
   SmallVector<TemplateArgument, 4> ArgVec;
-  ArgVec.reserve(NumArgs);
-  for (unsigned i = 0; i != NumArgs; ++i)
-    ArgVec.push_back(Args[i].getArgument());
+  ArgVec.reserve(Args.size());
+  for (const TemplateArgumentLoc &Arg : Args.arguments())
+    ArgVec.push_back(Arg.getArgument());
 
-  return getTemplateSpecializationType(Template, ArgVec.data(), NumArgs,
-                                       Underlying);
+  return getTemplateSpecializationType(Template, ArgVec, Underlying);
 }
 
 #ifndef NDEBUG
-static bool hasAnyPackExpansions(const TemplateArgument *Args,
-                                 unsigned NumArgs) {
-  for (unsigned I = 0; I != NumArgs; ++I)
-    if (Args[I].isPackExpansion())
+static bool hasAnyPackExpansions(ArrayRef<TemplateArgument> Args) {
+  for (const TemplateArgument &Arg : Args)
+    if (Arg.isPackExpansion())
       return true;
   
   return true;
@@ -3426,8 +3487,7 @@
 
 QualType
 ASTContext::getTemplateSpecializationType(TemplateName Template,
-                                          const TemplateArgument *Args,
-                                          unsigned NumArgs,
+                                          ArrayRef<TemplateArgument> Args,
                                           QualType Underlying) const {
   assert(!Template.getAsDependentTemplateName() && 
          "No dependent template names here!");
@@ -3444,32 +3504,29 @@
   else {
     // We can get here with an alias template when the specialization contains
     // a pack expansion that does not match up with a parameter pack.
-    assert((!IsTypeAlias || hasAnyPackExpansions(Args, NumArgs)) &&
+    assert((!IsTypeAlias || hasAnyPackExpansions(Args)) &&
            "Caller must compute aliased type");
     IsTypeAlias = false;
-    CanonType = getCanonicalTemplateSpecializationType(Template, Args,
-                                                       NumArgs);
+    CanonType = getCanonicalTemplateSpecializationType(Template, Args);
   }
 
   // Allocate the (non-canonical) template specialization type, but don't
   // try to unique it: these types typically have location information that
   // we don't unique and don't want to lose.
   void *Mem = Allocate(sizeof(TemplateSpecializationType) +
-                       sizeof(TemplateArgument) * NumArgs +
+                       sizeof(TemplateArgument) * Args.size() +
                        (IsTypeAlias? sizeof(QualType) : 0),
                        TypeAlignment);
   TemplateSpecializationType *Spec
-    = new (Mem) TemplateSpecializationType(Template, Args, NumArgs, CanonType,
+    = new (Mem) TemplateSpecializationType(Template, Args, CanonType,
                                          IsTypeAlias ? Underlying : QualType());
 
   Types.push_back(Spec);
   return QualType(Spec, 0);
 }
 
-QualType
-ASTContext::getCanonicalTemplateSpecializationType(TemplateName Template,
-                                                   const TemplateArgument *Args,
-                                                   unsigned NumArgs) const {
+QualType ASTContext::getCanonicalTemplateSpecializationType(
+    TemplateName Template, ArrayRef<TemplateArgument> Args) const {
   assert(!Template.getAsDependentTemplateName() && 
          "No dependent template names here!");
 
@@ -3480,15 +3537,16 @@
   // Build the canonical template specialization type.
   TemplateName CanonTemplate = getCanonicalTemplateName(Template);
   SmallVector<TemplateArgument, 4> CanonArgs;
+  unsigned NumArgs = Args.size();
   CanonArgs.reserve(NumArgs);
-  for (unsigned I = 0; I != NumArgs; ++I)
-    CanonArgs.push_back(getCanonicalTemplateArgument(Args[I]));
+  for (const TemplateArgument &Arg : Args)
+    CanonArgs.push_back(getCanonicalTemplateArgument(Arg));
 
   // Determine whether this canonical template specialization type already
   // exists.
   llvm::FoldingSetNodeID ID;
   TemplateSpecializationType::Profile(ID, CanonTemplate,
-                                      CanonArgs.data(), NumArgs, *this);
+                                      CanonArgs, *this);
 
   void *InsertPos = nullptr;
   TemplateSpecializationType *Spec
@@ -3500,7 +3558,7 @@
                           sizeof(TemplateArgument) * NumArgs),
                          TypeAlignment);
     Spec = new (Mem) TemplateSpecializationType(CanonTemplate,
-                                                CanonArgs.data(), NumArgs,
+                                                CanonArgs,
                                                 QualType(), QualType());
     Types.push_back(Spec);
     TemplateSpecializationTypes.InsertNode(Spec, InsertPos);
@@ -3600,9 +3658,7 @@
   SmallVector<TemplateArgument, 16> ArgCopy;
   for (unsigned I = 0, E = Args.size(); I != E; ++I)
     ArgCopy.push_back(Args[I].getArgument());
-  return getDependentTemplateSpecializationType(Keyword, NNS, Name,
-                                                ArgCopy.size(),
-                                                ArgCopy.data());
+  return getDependentTemplateSpecializationType(Keyword, NNS, Name, ArgCopy);
 }
 
 QualType
@@ -3610,14 +3666,13 @@
                                  ElaboratedTypeKeyword Keyword,
                                  NestedNameSpecifier *NNS,
                                  const IdentifierInfo *Name,
-                                 unsigned NumArgs,
-                                 const TemplateArgument *Args) const {
+                                 ArrayRef<TemplateArgument> Args) const {
   assert((!NNS || NNS->isDependent()) && 
          "nested-name-specifier must be dependent");
 
   llvm::FoldingSetNodeID ID;
   DependentTemplateSpecializationType::Profile(ID, *this, Keyword, NNS,
-                                               Name, NumArgs, Args);
+                                               Name, Args);
 
   void *InsertPos = nullptr;
   DependentTemplateSpecializationType *T
@@ -3631,6 +3686,7 @@
   if (Keyword == ETK_None) CanonKeyword = ETK_Typename;
 
   bool AnyNonCanonArgs = false;
+  unsigned NumArgs = Args.size();
   SmallVector<TemplateArgument, 16> CanonArgs(NumArgs);
   for (unsigned I = 0; I != NumArgs; ++I) {
     CanonArgs[I] = getCanonicalTemplateArgument(Args[I]);
@@ -3641,8 +3697,8 @@
   QualType Canon;
   if (AnyNonCanonArgs || CanonNNS != NNS || CanonKeyword != Keyword) {
     Canon = getDependentTemplateSpecializationType(CanonKeyword, CanonNNS,
-                                                   Name, NumArgs,
-                                                   CanonArgs.data());
+                                                   Name,
+                                                   CanonArgs);
 
     // Find the insert position again.
     DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos);
@@ -3652,7 +3708,7 @@
                         sizeof(TemplateArgument) * NumArgs),
                        TypeAlignment);
   T = new (Mem) DependentTemplateSpecializationType(Keyword, NNS,
-                                                    Name, NumArgs, Args, Canon);
+                                                    Name, Args, Canon);
   Types.push_back(T);
   DependentTemplateSpecializationTypes.InsertNode(T, InsertPos);
   return QualType(T, 0);
@@ -4020,13 +4076,35 @@
                                            QualType UnderlyingType,
                                            UnaryTransformType::UTTKind Kind)
     const {
-  UnaryTransformType *Ty =
-    new (*this, TypeAlignment) UnaryTransformType (BaseType, UnderlyingType, 
-                                                   Kind,
-                                 UnderlyingType->isDependentType() ?
-                                 QualType() : getCanonicalType(UnderlyingType));
-  Types.push_back(Ty);
-  return QualType(Ty, 0);
+  UnaryTransformType *ut = nullptr;
+
+  if (BaseType->isDependentType()) {
+    // Look in the folding set for an existing type.
+    llvm::FoldingSetNodeID ID;
+    DependentUnaryTransformType::Profile(ID, getCanonicalType(BaseType), Kind);
+
+    void *InsertPos = nullptr;
+    DependentUnaryTransformType *Canon
+      = DependentUnaryTransformTypes.FindNodeOrInsertPos(ID, InsertPos);
+
+    if (!Canon) {
+      // Build a new, canonical __underlying_type(type) type.
+      Canon = new (*this, TypeAlignment)
+             DependentUnaryTransformType(*this, getCanonicalType(BaseType),
+                                         Kind);
+      DependentUnaryTransformTypes.InsertNode(Canon, InsertPos);
+    }
+    ut = new (*this, TypeAlignment) UnaryTransformType (BaseType,
+                                                        QualType(), Kind,
+                                                        QualType(Canon, 0));
+  } else {
+    QualType CanonType = getCanonicalType(UnderlyingType);
+    ut = new (*this, TypeAlignment) UnaryTransformType (BaseType,
+                                                        UnderlyingType, Kind,
+                                                        CanonType);
+  }
+  Types.push_back(ut);
+  return QualType(ut, 0);
 }
 
 /// getAutoType - Return the uniqued reference to the 'auto' type which has been
@@ -4631,6 +4709,7 @@
   case BuiltinType::Float:      return FloatRank;
   case BuiltinType::Double:     return DoubleRank;
   case BuiltinType::LongDouble: return LongDoubleRank;
+  case BuiltinType::Float128:   return Float128Rank;
   }
 }
 
@@ -4647,6 +4726,7 @@
     case FloatRank:      return FloatComplexTy;
     case DoubleRank:     return DoubleComplexTy;
     case LongDoubleRank: return LongDoubleComplexTy;
+    case Float128Rank:   return Float128ComplexTy;
     }
   }
 
@@ -4656,6 +4736,7 @@
   case FloatRank:      return FloatTy;
   case DoubleRank:     return DoubleTy;
   case LongDoubleRank: return LongDoubleTy;
+  case Float128Rank:   return Float128Ty;
   }
   llvm_unreachable("getFloatingRank(): illegal value for rank");
 }
@@ -4884,21 +4965,27 @@
     CFConstantStringTagDecl->startDefinition();
 
     QualType FieldTypes[4];
+    const char *FieldNames[4];
 
     // const int *isa;
     FieldTypes[0] = getPointerType(IntTy.withConst());
+    FieldNames[0] = "isa";
     // int flags;
     FieldTypes[1] = IntTy;
+    FieldNames[1] = "flags";
     // const char *str;
     FieldTypes[2] = getPointerType(CharTy.withConst());
+    FieldNames[2] = "str";
     // long length;
     FieldTypes[3] = LongTy;
+    FieldNames[3] = "length";
 
     // Create fields
     for (unsigned i = 0; i < 4; ++i) {
       FieldDecl *Field = FieldDecl::Create(*this, CFConstantStringTagDecl,
                                            SourceLocation(),
-                                           SourceLocation(), nullptr,
+                                           SourceLocation(),
+                                           &Idents.get(FieldNames[i]),
                                            FieldTypes[i], /*TInfo=*/nullptr,
                                            /*BitWidth=*/nullptr,
                                            /*Mutable=*/false,
@@ -5125,6 +5212,27 @@
          !VD->getFirstDecl()->isOutOfLine() && VD->getFirstDecl()->hasInit();
 }
 
+ASTContext::InlineVariableDefinitionKind
+ASTContext::getInlineVariableDefinitionKind(const VarDecl *VD) const {
+  if (!VD->isInline())
+    return InlineVariableDefinitionKind::None;
+
+  // In almost all cases, it's a weak definition.
+  auto *First = VD->getFirstDecl();
+  if (!First->isConstexpr() || First->isInlineSpecified() ||
+      !VD->isStaticDataMember())
+    return InlineVariableDefinitionKind::Weak;
+
+  // If there's a file-context declaration in this translation unit, it's a
+  // non-discardable definition.
+  for (auto *D : VD->redecls())
+    if (D->getLexicalDeclContext()->isFileContext())
+      return InlineVariableDefinitionKind::Strong;
+
+  // If we've not seen one yet, we don't know.
+  return InlineVariableDefinitionKind::WeakUnknown;
+}
+
 static inline 
 std::string charUnitsToString(const CharUnits &CU) {
   return llvm::itostr(CU.getQuantity());
@@ -5151,7 +5259,7 @@
   SourceLocation Loc;
   CharUnits PtrSize = getTypeSizeInChars(VoidPtrTy);
   CharUnits ParmOffset = PtrSize;
-  for (auto PI : Decl->params()) {
+  for (auto PI : Decl->parameters()) {
     QualType PType = PI->getType();
     CharUnits sz = getObjCEncodingTypeSize(PType);
     if (sz.isZero())
@@ -5166,7 +5274,7 @@
   
   // Argument types.
   ParmOffset = PtrSize;
-  for (auto PVDecl : Decl->params()) {
+  for (auto PVDecl : Decl->parameters()) {
     QualType PType = PVDecl->getOriginalType(); 
     if (const ArrayType *AT =
           dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
@@ -5194,7 +5302,7 @@
   getObjCEncodingForType(Decl->getReturnType(), S);
   CharUnits ParmOffset;
   // Compute size of all parameters.
-  for (auto PI : Decl->params()) {
+  for (auto PI : Decl->parameters()) {
     QualType PType = PI->getType();
     CharUnits sz = getObjCEncodingTypeSize(PType);
     if (sz.isZero())
@@ -5208,7 +5316,7 @@
   ParmOffset = CharUnits::Zero();
 
   // Argument types.
-  for (auto PVDecl : Decl->params()) {
+  for (auto PVDecl : Decl->parameters()) {
     QualType PType = PVDecl->getOriginalType();
     if (const ArrayType *AT =
           dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
@@ -5479,6 +5587,7 @@
     case BuiltinType::LongDouble: return 'D';
     case BuiltinType::NullPtr:    return '*'; // like char*
 
+    case BuiltinType::Float128:
     case BuiltinType::Half:
       // FIXME: potentially need @encodes for these!
       return ' ';
@@ -5489,18 +5598,9 @@
       llvm_unreachable("@encoding ObjC primitive type");
 
     // OpenCL and placeholder types don't need @encodings.
-    case BuiltinType::OCLImage1d:
-    case BuiltinType::OCLImage1dArray:
-    case BuiltinType::OCLImage1dBuffer:
-    case BuiltinType::OCLImage2d:
-    case BuiltinType::OCLImage2dArray:
-    case BuiltinType::OCLImage2dDepth:
-    case BuiltinType::OCLImage2dArrayDepth:
-    case BuiltinType::OCLImage2dMSAA:
-    case BuiltinType::OCLImage2dArrayMSAA:
-    case BuiltinType::OCLImage2dMSAADepth:
-    case BuiltinType::OCLImage2dArrayMSAADepth:
-    case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
     case BuiltinType::OCLEvent:
     case BuiltinType::OCLClkEvent:
     case BuiltinType::OCLQueue:
@@ -5720,8 +5820,7 @@
         const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
         llvm::raw_string_ostream OS(S);
         TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                            TemplateArgs.data(),
-                                            TemplateArgs.size(),
+                                            TemplateArgs.asArray(),
                                             (*this).getPrintingPolicy());
       }
     } else {
@@ -6383,6 +6482,7 @@
 
   // };
   VaListDecl->completeDefinition();
+  Context->VaListTagDecl = VaListDecl;
 
   // typedef struct __va_list __builtin_va_list;
   QualType T = Context->getRecordType(VaListDecl);
@@ -7624,6 +7724,15 @@
   Qualifiers LQuals = LHSCan.getLocalQualifiers();
   Qualifiers RQuals = RHSCan.getLocalQualifiers();
   if (LQuals != RQuals) {
+    if (getLangOpts().OpenCL) {
+      if (LHSCan.getUnqualifiedType() != RHSCan.getUnqualifiedType() ||
+          LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers())
+        return QualType();
+      if (LQuals.isAddressSpaceSupersetOf(RQuals))
+        return LHS;
+      if (RQuals.isAddressSpaceSupersetOf(LQuals))
+        return RHS;
+    }
     // If any of these qualifiers are different, we have a type
     // mismatch.
     if (LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers() ||
@@ -8416,22 +8525,29 @@
   return GVA_DiscardableODR;
 }
 
-static GVALinkage adjustGVALinkageForAttributes(GVALinkage L, const Decl *D) {
+static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
+                                                GVALinkage L, const Decl *D) {
   // See http://msdn.microsoft.com/en-us/library/xa0d9ste.aspx
   // dllexport/dllimport on inline functions.
   if (D->hasAttr<DLLImportAttr>()) {
     if (L == GVA_DiscardableODR || L == GVA_StrongODR)
       return GVA_AvailableExternally;
-  } else if (D->hasAttr<DLLExportAttr>() || D->hasAttr<CUDAGlobalAttr>()) {
+  } else if (D->hasAttr<DLLExportAttr>()) {
     if (L == GVA_DiscardableODR)
       return GVA_StrongODR;
+  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice &&
+             D->hasAttr<CUDAGlobalAttr>()) {
+    // Device-side functions with __global__ attribute must always be
+    // visible externally so they can be launched from host.
+    if (L == GVA_DiscardableODR || L == GVA_Internal)
+      return GVA_StrongODR;
   }
   return L;
 }
 
 GVALinkage ASTContext::GetGVALinkageForFunction(const FunctionDecl *FD) const {
-  return adjustGVALinkageForAttributes(basicGVALinkageForFunction(*this, FD),
-                                       FD);
+  return adjustGVALinkageForAttributes(
+      *this, basicGVALinkageForFunction(*this, FD), FD);
 }
 
 static GVALinkage basicGVALinkageForVariable(const ASTContext &Context,
@@ -8464,15 +8580,31 @@
   if (Context.isMSStaticDataMemberInlineDefinition(VD))
     return GVA_DiscardableODR;
 
+  // Most non-template variables have strong linkage; inline variables are
+  // linkonce_odr or (occasionally, for compatibility) weak_odr.
+  GVALinkage StrongLinkage;
+  switch (Context.getInlineVariableDefinitionKind(VD)) {
+  case ASTContext::InlineVariableDefinitionKind::None:
+    StrongLinkage = GVA_StrongExternal;
+    break;
+  case ASTContext::InlineVariableDefinitionKind::Weak:
+  case ASTContext::InlineVariableDefinitionKind::WeakUnknown:
+    StrongLinkage = GVA_DiscardableODR;
+    break;
+  case ASTContext::InlineVariableDefinitionKind::Strong:
+    StrongLinkage = GVA_StrongODR;
+    break;
+  }
+
   switch (VD->getTemplateSpecializationKind()) {
   case TSK_Undeclared:
-    return GVA_StrongExternal;
+    return StrongLinkage;
 
   case TSK_ExplicitSpecialization:
     return Context.getTargetInfo().getCXXABI().isMicrosoft() &&
                    VD->isStaticDataMember()
                ? GVA_StrongODR
-               : GVA_StrongExternal;
+               : StrongLinkage;
 
   case TSK_ExplicitInstantiationDefinition:
     return GVA_StrongODR;
@@ -8488,8 +8620,8 @@
 }
 
 GVALinkage ASTContext::GetGVALinkageForVariable(const VarDecl *VD) {
-  return adjustGVALinkageForAttributes(basicGVALinkageForVariable(*this, VD),
-                                       VD);
+  return adjustGVALinkageForAttributes(
+      *this, basicGVALinkageForVariable(*this, VD), VD);
 }
 
 bool ASTContext::DeclMustBeEmitted(const Decl *D) {
@@ -8506,7 +8638,18 @@
     // We never need to emit an uninstantiated function template.
     if (FD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate)
       return false;
-  } else if (isa<OMPThreadPrivateDecl>(D))
+  } else if (isa<PragmaCommentDecl>(D))
+    return true;
+  else if (isa<OMPThreadPrivateDecl>(D) ||
+           D->hasAttr<OMPDeclareTargetDeclAttr>())
+    return true;
+  else if (isa<PragmaDetectMismatchDecl>(D))
+    return true;
+  else if (isa<OMPThreadPrivateDecl>(D))
+    return !D->getDeclContext()->isDependentContext();
+  else if (isa<OMPDeclareReductionDecl>(D))
+    return !D->getDeclContext()->isDependentContext();
+  else if (isa<ImportDecl>(D))
     return true;
   else
     return false;
@@ -8578,6 +8721,14 @@
       !VD->evaluateValue())
     return true;
 
+  // Likewise, variables with tuple-like bindings are required if their
+  // bindings have side-effects.
+  if (auto *DD = dyn_cast<DecompositionDecl>(VD))
+    for (auto *BD : DD->bindings())
+      if (auto *BindingVD = BD->getHoldingVar())
+        if (DeclMustBeEmitted(BindingVD))
+          return true;
+
   return false;
 }
 
@@ -8587,8 +8738,25 @@
   if (IsCXXMethod)
     return ABI->getDefaultMethodCallConv(IsVariadic);
 
-  if (LangOpts.MRTD && !IsVariadic) return CC_X86StdCall;
-
+  switch (LangOpts.getDefaultCallingConv()) {
+  case LangOptions::DCC_None:
+    break;
+  case LangOptions::DCC_CDecl:
+    return CC_C;
+  case LangOptions::DCC_FastCall:
+    if (getTargetInfo().hasFeature("sse2"))
+      return CC_X86FastCall;
+    break;
+  case LangOptions::DCC_StdCall:
+    if (!IsVariadic)
+      return CC_X86StdCall;
+    break;
+  case LangOptions::DCC_VectorCall:
+    // __vectorcall cannot be applied to variadic functions.
+    if (!IsVariadic)
+      return CC_X86VectorCall;
+    break;
+  }
   return Target->getDefaultCallingConv(TargetInfo::CCMT_Unknown);
 }
 
@@ -8668,6 +8836,8 @@
     return DoubleTy;
   case TargetInfo::LongDouble:
     return LongDoubleTy;
+  case TargetInfo::Float128:
+    return Float128Ty;
   case TargetInfo::NoFloat:
     return QualType();
   }
@@ -8681,8 +8851,7 @@
 }
 
 unsigned ASTContext::getManglingNumber(const NamedDecl *ND) const {
-  llvm::DenseMap<const NamedDecl *, unsigned>::const_iterator I =
-    MangleNumbers.find(ND);
+  auto I = MangleNumbers.find(ND);
   return I != MangleNumbers.end() ? I->second : 1;
 }
 
@@ -8692,8 +8861,7 @@
 }
 
 unsigned ASTContext::getStaticLocalNumber(const VarDecl *VD) const {
-  llvm::DenseMap<const VarDecl *, unsigned>::const_iterator I =
-      StaticLocalNumbers.find(VD);
+  auto I = StaticLocalNumbers.find(VD);
   return I != StaticLocalNumbers.end() ? I->second : 1;
 }
 
diff --git a/lib/AST/ASTDiagnostic.cpp b/lib/AST/ASTDiagnostic.cpp
index a48b1d7..590defb 100644
--- a/lib/AST/ASTDiagnostic.cpp
+++ b/lib/AST/ASTDiagnostic.cpp
@@ -10,6 +10,7 @@
 // This file implements a diagnostic formatting hook for AST elements.
 //
 //===----------------------------------------------------------------------===//
+
 #include "clang/AST/ASTDiagnostic.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTLambda.h"
@@ -19,7 +20,6 @@
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -118,7 +118,7 @@
         if (DesugarArgument) {
           ShouldAKA = true;
           QT = Context.getTemplateSpecializationType(
-              TST->getTemplateName(), Args.data(), Args.size(), QT);
+              TST->getTemplateName(), Args, QT);
         }
         break;
       }
@@ -443,7 +443,6 @@
       NeedQuotes = false;
       break;
     }
-
   }
 
   if (NeedQuotes) {
@@ -497,7 +496,7 @@
     enum DiffKind {
       /// Incomplete or invalid node.
       Invalid,
-      /// Another level of templates, requires that
+      /// Another level of templates
       Template,
       /// Type difference, all type differences except those falling under
       /// the Template difference.
@@ -616,7 +615,7 @@
       SetDefault(FromDefault, ToDefault);
     }
 
-    void SetIntegerDiff(llvm::APSInt FromInt, llvm::APSInt ToInt,
+    void SetIntegerDiff(const llvm::APSInt &FromInt, const llvm::APSInt &ToInt,
                         bool IsValidFromInt, bool IsValidToInt,
                         QualType FromIntType, QualType ToIntType,
                         Expr *FromExpr, Expr *ToExpr, bool FromDefault,
@@ -653,7 +652,7 @@
 
     void SetFromDeclarationAndToIntegerDiff(
         ValueDecl *FromValueDecl, bool FromAddressOf, bool FromNullPtr,
-        Expr *FromExpr, llvm::APSInt ToInt, bool IsValidToInt,
+        Expr *FromExpr, const llvm::APSInt &ToInt, bool IsValidToInt,
         QualType ToIntType, Expr *ToExpr, bool FromDefault, bool ToDefault) {
       assert(FlatTree[CurrentNode].Kind == Invalid && "Node is not empty.");
       FlatTree[CurrentNode].Kind = FromDeclarationAndToInteger;
@@ -669,7 +668,7 @@
     }
 
     void SetFromIntegerAndToDeclarationDiff(
-        llvm::APSInt FromInt, bool IsValidFromInt, QualType FromIntType,
+        const llvm::APSInt &FromInt, bool IsValidFromInt, QualType FromIntType,
         Expr *FromExpr, ValueDecl *ToValueDecl, bool ToAddressOf,
         bool ToNullPtr, Expr *ToExpr, bool FromDefault, bool ToDefault) {
       assert(FlatTree[CurrentNode].Kind == Invalid && "Node is not empty.");
@@ -917,6 +916,8 @@
       /// template argument.
       InternalIterator(const TemplateSpecializationType *TST)
           : TST(TST), Index(0), CurrentTA(nullptr), EndTA(nullptr) {
+        if (!TST) return;
+
         if (isEnd()) return;
 
         // Set to first template argument.  If not a parameter pack, done.
@@ -937,11 +938,13 @@
 
       /// isEnd - Returns true if the iterator is one past the end.
       bool isEnd() const {
+        assert(TST && "InternalIterator is invalid with a null TST.");
         return Index >= TST->getNumArgs();
       }
 
       /// &operator++ - Increment the iterator to the next template argument.
       InternalIterator &operator++() {
+        assert(TST && "InternalIterator is invalid with a null TST.");
         if (isEnd()) {
           return *this;
         }
@@ -977,6 +980,7 @@
 
       /// operator* - Returns the appropriate TemplateArgument.
       reference operator*() const {
+        assert(TST && "InternalIterator is invalid with a null TST.");
         assert(!isEnd() && "Index exceeds number of arguments.");
         if (CurrentTA == EndTA)
           return TST->getArg(Index);
@@ -986,6 +990,7 @@
 
       /// operator-> - Allow access to the underlying TemplateArgument.
       pointer operator->() const {
+        assert(TST && "InternalIterator is invalid with a null TST.");
         return &operator*();
       }
     };
@@ -1060,8 +1065,7 @@
 
     Ty = Context.getTemplateSpecializationType(
              TemplateName(CTSD->getSpecializedTemplate()),
-             CTSD->getTemplateArgs().data(),
-             CTSD->getTemplateArgs().size(),
+             CTSD->getTemplateArgs().asArray(),
              Ty.getLocalUnqualifiedType().getCanonicalType());
 
     return Ty->getAs<TemplateSpecializationType>();
@@ -1528,12 +1532,14 @@
         OS << FromTD->getNameAsString() << '<';
         Tree.MoveToChild();
         unsigned NumElideArgs = 0;
+        bool AllArgsElided = true;
         do {
           if (ElideType) {
             if (Tree.NodeIsSame()) {
               ++NumElideArgs;
               continue;
             }
+            AllArgsElided = false;
             if (NumElideArgs > 0) {
               PrintElideArgs(NumElideArgs, Indent);
               NumElideArgs = 0;
@@ -1544,8 +1550,12 @@
           if (Tree.HasNextSibling())
             OS << ", ";
         } while (Tree.AdvanceSibling());
-        if (NumElideArgs > 0)
-          PrintElideArgs(NumElideArgs, Indent);
+        if (NumElideArgs > 0) {
+          if (AllArgsElided)
+            OS << "...";
+          else
+            PrintElideArgs(NumElideArgs, Indent);
+        }
 
         Tree.Parent();
         OS << ">";
@@ -1627,7 +1637,6 @@
       Unbold();
       OS << "]";
     }
-    return;
   }
 
   /// PrintExpr - Prints out the expr template arguments, highlighting argument
@@ -1700,7 +1709,7 @@
 
   /// PrintAPSInt - Handles printing of integral arguments, highlighting
   /// argument differences.
-  void PrintAPSInt(llvm::APSInt FromInt, llvm::APSInt ToInt,
+  void PrintAPSInt(const llvm::APSInt &FromInt, const llvm::APSInt &ToInt,
                    bool IsValidFromInt, bool IsValidToInt, QualType FromIntType,
                    QualType ToIntType, Expr *FromExpr, Expr *ToExpr,
                    bool FromDefault, bool ToDefault, bool Same) {
@@ -1733,8 +1742,8 @@
 
   /// PrintAPSInt - If valid, print the APSInt.  If the expression is
   /// gives more information, print it too.
-  void PrintAPSInt(llvm::APSInt Val, Expr *E, bool Valid, QualType IntType,
-                   bool PrintType) {
+  void PrintAPSInt(const llvm::APSInt &Val, Expr *E, bool Valid,
+                   QualType IntType, bool PrintType) {
     Bold();
     if (Valid) {
       if (HasExtraInfo(E)) {
@@ -1839,14 +1848,13 @@
       Unbold();
       OS << ']';
     }
-
   }
 
   /// PrintValueDeclAndInteger - Uses the print functions for ValueDecl and
   /// APSInt to print a mixed difference.
   void PrintValueDeclAndInteger(ValueDecl *VD, bool NeedAddressOf,
                                 bool IsNullPtr, Expr *VDExpr, bool DefaultDecl,
-                                llvm::APSInt Val, QualType IntType,
+                                const llvm::APSInt &Val, QualType IntType,
                                 Expr *IntExpr, bool DefaultInt) {
     if (!PrintTree) {
       OS << (DefaultDecl ? "(default) " : "");
@@ -1866,7 +1874,7 @@
 
   /// PrintIntegerAndValueDecl - Uses the print functions for APSInt and
   /// ValueDecl to print a mixed difference.
-  void PrintIntegerAndValueDecl(llvm::APSInt Val, QualType IntType,
+  void PrintIntegerAndValueDecl(const llvm::APSInt &Val, QualType IntType,
                                 Expr *IntExpr, bool DefaultInt, ValueDecl *VD,
                                 bool NeedAddressOf, bool IsNullPtr,
                                 Expr *VDExpr, bool DefaultDecl) {
@@ -2021,7 +2029,7 @@
     return true;
   }
 }; // end class TemplateDiff
-}  // end namespace
+}  // end anonymous namespace
 
 /// FormatTemplateTypeDiff - A helper static function to start the template
 /// diff and return the properly formatted string.  Returns true if the diff
diff --git a/lib/AST/ASTDumper.cpp b/lib/AST/ASTDumper.cpp
index 76401f5..55f9309 100644
--- a/lib/AST/ASTDumper.cpp
+++ b/lib/AST/ASTDumper.cpp
@@ -18,13 +18,14 @@
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclLookups.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/DeclVisitor.h"
+#include "clang/AST/LocInfoType.h"
 #include "clang/AST/StmtVisitor.h"
 #include "clang/AST/TypeVisitor.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Sema/LocInfoType.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace clang;
 using namespace clang::comments;
@@ -403,6 +404,9 @@
     void VisitAtomicType(const AtomicType *T) {
       dumpTypeAsChild(T->getValueType());
     }
+    void VisitPipeType(const PipeType *T) {
+      dumpTypeAsChild(T->getElementType());
+    }
     void VisitAdjustedType(const AdjustedType *T) {
       dumpTypeAsChild(T->getOriginalType());
     }
@@ -424,8 +428,18 @@
     void VisitFunctionDecl(const FunctionDecl *D);
     void VisitFieldDecl(const FieldDecl *D);
     void VisitVarDecl(const VarDecl *D);
+    void VisitDecompositionDecl(const DecompositionDecl *D);
+    void VisitBindingDecl(const BindingDecl *D);
     void VisitFileScopeAsmDecl(const FileScopeAsmDecl *D);
     void VisitImportDecl(const ImportDecl *D);
+    void VisitPragmaCommentDecl(const PragmaCommentDecl *D);
+    void VisitPragmaDetectMismatchDecl(const PragmaDetectMismatchDecl *D);
+    void VisitCapturedDecl(const CapturedDecl *D);
+
+    // OpenMP decls
+    void VisitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D);
+    void VisitOMPDeclareReductionDecl(const OMPDeclareReductionDecl *D);
+    void VisitOMPCapturedExprDecl(const OMPCapturedExprDecl *D);
 
     // C++ Decls
     void VisitNamespaceDecl(const NamespaceDecl *D);
@@ -462,6 +476,7 @@
     void VisitUnresolvedUsingTypenameDecl(const UnresolvedUsingTypenameDecl *D);
     void VisitUnresolvedUsingValueDecl(const UnresolvedUsingValueDecl *D);
     void VisitUsingShadowDecl(const UsingShadowDecl *D);
+    void VisitConstructorUsingShadowDecl(const ConstructorUsingShadowDecl *D);
     void VisitLinkageSpecDecl(const LinkageSpecDecl *D);
     void VisitAccessSpecDecl(const AccessSpecDecl *D);
     void VisitFriendDecl(const FriendDecl *D);
@@ -487,6 +502,10 @@
     void VisitLabelStmt(const LabelStmt *Node);
     void VisitGotoStmt(const GotoStmt *Node);
     void VisitCXXCatchStmt(const CXXCatchStmt *Node);
+    void VisitCapturedStmt(const CapturedStmt *Node);
+
+    // OpenMP
+    void VisitOMPExecutableDirective(const OMPExecutableDirective *Node);
 
     // Exprs
     void VisitExpr(const Expr *Node);
@@ -697,6 +716,12 @@
 }
 
 void ASTDumper::dumpBareDeclRef(const Decl *D) {
+  if (!D) {
+    ColorScope Color(*this, NullColor);
+    OS << "<<<NULL>>>";
+    return;
+  }
+
   {
     ColorScope Color(*this, DeclKindNameColor);
     OS << D->getDeclKindName();
@@ -1138,10 +1163,8 @@
   if (!D->param_begin() && D->getNumParams())
     dumpChild([=] { OS << "<<NULL params x " << D->getNumParams() << ">>"; });
   else
-    for (FunctionDecl::param_const_iterator I = D->param_begin(),
-                                            E = D->param_end();
-         I != E; ++I)
-      dumpDecl(*I);
+    for (const ParmVarDecl *Parameter : D->parameters())
+      dumpDecl(Parameter);
 
   if (const CXXConstructorDecl *C = dyn_cast<CXXConstructorDecl>(D))
     for (CXXConstructorDecl::init_const_iterator I = C->init_begin(),
@@ -1182,6 +1205,10 @@
     OS << " __module_private__";
   if (D->isNRVOVariable())
     OS << " nrvo";
+  if (D->isInline())
+    OS << " inline";
+  if (D->isConstexpr())
+    OS << " constexpr";
   if (D->hasInit()) {
     switch (D->getInitStyle()) {
     case VarDecl::CInit: OS << " cinit"; break;
@@ -1192,6 +1219,19 @@
   }
 }
 
+void ASTDumper::VisitDecompositionDecl(const DecompositionDecl *D) {
+  // Everything VisitVarDecl prints comes first, then one dump per binding.
+  VisitVarDecl(D);
+  for (auto *Binding : D->bindings()) dumpDecl(Binding);
+}
+
+void ASTDumper::VisitBindingDecl(const BindingDecl *D) {
+  // Name and type are always dumped; the binding expression only when set.
+  dumpName(D);
+  dumpType(D->getType());
+  if (auto *Binding = D->getBinding()) dumpStmt(Binding);
+}
+
 void ASTDumper::VisitFileScopeAsmDecl(const FileScopeAsmDecl *D) {
   dumpStmt(D->getAsmString());
 }
@@ -1200,6 +1240,56 @@
   OS << ' ' << D->getImportedModule()->getFullModuleName();
 }
 
+void ASTDumper::VisitPragmaCommentDecl(const PragmaCommentDecl *D) {
+  // Output is " <kind>", plus the quoted argument when it is non-empty.
+  OS << ' ';
+  switch (D->getCommentKind()) {
+  case PCK_Unknown:  llvm_unreachable("unexpected pragma comment kind");
+  case PCK_Compiler: OS << "compiler"; break;
+  case PCK_ExeStr:   OS << "exestr"; break;
+  case PCK_Lib:      OS << "lib"; break;
+  case PCK_Linker:   OS << "linker"; break;
+  case PCK_User:     OS << "user"; break;
+  }
+  const StringRef Argument = D->getArg();
+  if (!Argument.empty()) OS << " \"" << Argument << "\"";
+}
+
+void ASTDumper::VisitPragmaDetectMismatchDecl(
+    const PragmaDetectMismatchDecl *D) { // Prints name, then value, each quoted.
+  OS << " \"" << D->getName() << "\" \"" << D->getValue() << "\"";
+}
+
+void ASTDumper::VisitCapturedDecl(const CapturedDecl *D) {
+  dumpStmt(D->getBody()); // The body is all this visitor dumps.
+}
+
+//===----------------------------------------------------------------------===//
+// OpenMP Declarations
+//===----------------------------------------------------------------------===//
+
+void ASTDumper::VisitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
+  // Every entry of the directive's variable list is dumped as a statement.
+  for (auto *VarRef : D->varlists()) dumpStmt(VarRef);
+}
+
+void ASTDumper::VisitOMPDeclareReductionDecl(const OMPDeclareReductionDecl *D) {
+  dumpName(D);
+  dumpType(D->getType());
+  OS << " combiner";
+  dumpStmt(D->getCombiner());
+  auto *Init = D->getInitializer();
+  if (!Init) return; // Nothing more to print without an initializer.
+  OS << " initializer";
+  dumpStmt(Init);
+}
+
+void ASTDumper::VisitOMPCapturedExprDecl(const OMPCapturedExprDecl *D) {
+  dumpName(D);
+  dumpType(D->getType());
+  dumpStmt(D->getInit()); // The initializer goes last, after name and type.
+}
+
 //===----------------------------------------------------------------------===//
 // C++ Declarations
 //===----------------------------------------------------------------------===//
@@ -1423,6 +1513,31 @@
     dumpTypeAsChild(TD->getTypeForDecl());
 }
 
+void ASTDumper::VisitConstructorUsingShadowDecl(
+    const ConstructorUsingShadowDecl *D) {
+  if (D->constructsVirtualBase())
+    OS << " virtual";
+  // First child: the declaration this shadow declaration targets.
+  dumpChild([=] {
+    OS << "target ";
+    dumpBareDeclRef(D->getTargetDecl());
+  });
+  // Second child: the nominated base class and its shadow declaration.
+  dumpChild([=] {
+    OS << "nominated ";
+    dumpBareDeclRef(D->getNominatedBaseClass());
+    OS << ' ';
+    dumpBareDeclRef(D->getNominatedBaseClassShadowDecl());
+  });
+  // Third child: the constructed base class and its shadow declaration.
+  dumpChild([=] {
+    OS << "constructed ";
+    dumpBareDeclRef(D->getConstructedBaseClass());
+    OS << ' ';
+    dumpBareDeclRef(D->getConstructedBaseClassShadowDecl());
+  });
+}
+
 void ASTDumper::VisitLinkageSpecDecl(const LinkageSpecDecl *D) {
   switch (D->getLanguage()) {
   case LinkageSpecDecl::lang_c: OS << " C"; break;
@@ -1482,10 +1597,8 @@
   if (D->isThisDeclarationADefinition()) {
     dumpDeclContext(D);
   } else {
-    for (ObjCMethodDecl::param_const_iterator I = D->param_begin(),
-                                              E = D->param_end();
-         I != E; ++I)
-      dumpDecl(*I);
+    for (const ParmVarDecl *Parameter : D->parameters())
+      dumpDecl(Parameter);
   }
 
   if (D->isVariadic())
@@ -1615,7 +1728,7 @@
 }
 
 void ASTDumper::VisitBlockDecl(const BlockDecl *D) {
-  for (auto I : D->params())
+  for (auto I : D->parameters())
     dumpDecl(I);
 
   if (D->isVariadic())
@@ -1707,6 +1820,41 @@
   dumpDecl(Node->getExceptionDecl());
 }
 
+void ASTDumper::VisitCapturedStmt(const CapturedStmt *Node) {
+  VisitStmt(Node);
+  dumpDecl(Node->getCapturedDecl()); // Also dump the statement's CapturedDecl.
+}
+
+//===----------------------------------------------------------------------===//
+//  OpenMP dumping methods.
+//===----------------------------------------------------------------------===//
+
+void ASTDumper::VisitOMPExecutableDirective(
+    const OMPExecutableDirective *Node) {
+  VisitStmt(Node);
+  // Each clause becomes a child labelled "OMP<Name>Clause".
+  for (auto *Clause : Node->clauses()) {
+    dumpChild([=] {
+      if (!Clause) {
+        ColorScope Color(*this, NullColor);
+        OS << "<<<NULL>>> OMPClause";
+        return;
+      }
+      {
+        ColorScope Color(*this, AttrColor);
+        StringRef Name(getOpenMPClauseName(Clause->getClauseKind()));
+        OS << "OMP" << Name.substr(/*Start=*/0, /*N=*/1).upper()
+           << Name.drop_front() << "Clause";
+      }
+      // Pointer, source range and the implicit marker follow the label.
+      dumpPointer(Clause);
+      dumpSourceRange(SourceRange(Clause->getLocStart(), Clause->getLocEnd()));
+      if (Clause->isImplicit()) OS << " <implicit>";
+      for (auto *Child : Clause->children()) dumpStmt(Child);
+    });
+  }
+}
+
 //===----------------------------------------------------------------------===//
 //  Expr dumping methods.
 //===----------------------------------------------------------------------===//
@@ -2076,8 +2224,10 @@
 
 void ASTDumper::VisitObjCBoxedExpr(const ObjCBoxedExpr *Node) {
   VisitExpr(Node);
-  OS << " selector=";
-  Node->getBoxingMethod()->getSelector().print(OS);
+  if (auto *BoxingMethod = Node->getBoxingMethod()) {
+    OS << " selector=";
+    BoxingMethod->getSelector().print(OS);
+  }
 }
 
 void ASTDumper::VisitObjCAtCatchStmt(const ObjCAtCatchStmt *Node) {
diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp
index dd8a06c..76b495f8 100644
--- a/lib/AST/ASTImporter.cpp
+++ b/lib/AST/ASTImporter.cpp
@@ -29,7 +29,7 @@
                           public DeclVisitor<ASTNodeImporter, Decl *>,
                           public StmtVisitor<ASTNodeImporter, Stmt *> {
     ASTImporter &Importer;
-    
+
   public:
     explicit ASTNodeImporter(ASTImporter &Importer) : Importer(Importer) { }
     
@@ -64,11 +64,12 @@
     QualType VisitDecltypeType(const DecltypeType *T);
     QualType VisitUnaryTransformType(const UnaryTransformType *T);
     QualType VisitAutoType(const AutoType *T);
+    QualType VisitInjectedClassNameType(const InjectedClassNameType *T);
     // FIXME: DependentDecltypeType
     QualType VisitRecordType(const RecordType *T);
     QualType VisitEnumType(const EnumType *T);
     QualType VisitAttributedType(const AttributedType *T);
-    // FIXME: TemplateTypeParmType
+    QualType VisitTemplateTypeParmType(const TemplateTypeParmType *T);
     // FIXME: SubstTemplateTypeParmType
     QualType VisitTemplateSpecializationType(const TemplateSpecializationType *T);
     QualType VisitElaboratedType(const ElaboratedType *T);
@@ -86,6 +87,10 @@
     void ImportDeclarationNameLoc(const DeclarationNameInfo &From,
                                   DeclarationNameInfo& To);
     void ImportDeclContext(DeclContext *FromDC, bool ForceImport = false);
+
+    typedef DesignatedInitExpr::Designator Designator;
+    Designator ImportDesignator(const Designator &D);
+
                         
     /// \brief What we should import from the definition.
     enum ImportDefinitionKind { 
@@ -130,11 +135,13 @@
     bool IsStructuralMatch(ClassTemplateDecl *From, ClassTemplateDecl *To);
     bool IsStructuralMatch(VarTemplateDecl *From, VarTemplateDecl *To);
     Decl *VisitDecl(Decl *D);
+    Decl *VisitAccessSpecDecl(AccessSpecDecl *D);
     Decl *VisitTranslationUnitDecl(TranslationUnitDecl *D);
     Decl *VisitNamespaceDecl(NamespaceDecl *D);
     Decl *VisitTypedefNameDecl(TypedefNameDecl *D, bool IsAlias);
     Decl *VisitTypedefDecl(TypedefDecl *D);
     Decl *VisitTypeAliasDecl(TypeAliasDecl *D);
+    Decl *VisitLabelDecl(LabelDecl *D);
     Decl *VisitEnumDecl(EnumDecl *D);
     Decl *VisitRecordDecl(RecordDecl *D);
     Decl *VisitEnumConstantDecl(EnumConstantDecl *D);
@@ -174,6 +181,7 @@
     DeclGroupRef ImportDeclGroup(DeclGroupRef DG);
 
     Stmt *VisitStmt(Stmt *S);
+    Stmt *VisitGCCAsmStmt(GCCAsmStmt *S);
     Stmt *VisitDeclStmt(DeclStmt *S);
     Stmt *VisitNullStmt(NullStmt *S);
     Stmt *VisitCompoundStmt(CompoundStmt *S);
@@ -191,7 +199,6 @@
     Stmt *VisitContinueStmt(ContinueStmt *S);
     Stmt *VisitBreakStmt(BreakStmt *S);
     Stmt *VisitReturnStmt(ReturnStmt *S);
-    // FIXME: GCCAsmStmt
     // FIXME: MSAsmStmt
     // FIXME: SEHExceptStmt
     // FIXME: SEHFinallyStmt
@@ -212,13 +219,29 @@
 
     // Importing expressions
     Expr *VisitExpr(Expr *E);
+    Expr *VisitVAArgExpr(VAArgExpr *E);
+    Expr *VisitGNUNullExpr(GNUNullExpr *E);
+    Expr *VisitPredefinedExpr(PredefinedExpr *E);
     Expr *VisitDeclRefExpr(DeclRefExpr *E);
+    Expr *VisitImplicitValueInitExpr(ImplicitValueInitExpr *ILE);
+    Expr *VisitDesignatedInitExpr(DesignatedInitExpr *E);
+    Expr *VisitCXXNullPtrLiteralExpr(CXXNullPtrLiteralExpr *E);
     Expr *VisitIntegerLiteral(IntegerLiteral *E);
+    Expr *VisitFloatingLiteral(FloatingLiteral *E);
     Expr *VisitCharacterLiteral(CharacterLiteral *E);
+    Expr *VisitStringLiteral(StringLiteral *E);
+    Expr *VisitCompoundLiteralExpr(CompoundLiteralExpr *E);
+    Expr *VisitAtomicExpr(AtomicExpr *E);
+    Expr *VisitAddrLabelExpr(AddrLabelExpr *E);
     Expr *VisitParenExpr(ParenExpr *E);
+    Expr *VisitParenListExpr(ParenListExpr *E);
+    Expr *VisitStmtExpr(StmtExpr *E);
     Expr *VisitUnaryOperator(UnaryOperator *E);
     Expr *VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E);
     Expr *VisitBinaryOperator(BinaryOperator *E);
+    Expr *VisitConditionalOperator(ConditionalOperator *E);
+    Expr *VisitBinaryConditionalOperator(BinaryConditionalOperator *E);
+    Expr *VisitOpaqueValueExpr(OpaqueValueExpr *E);
     Expr *VisitCompoundAssignOperator(CompoundAssignOperator *E);
     Expr *VisitImplicitCastExpr(ImplicitCastExpr *E);
     Expr *VisitCStyleCastExpr(CStyleCastExpr *E);
@@ -232,33 +255,33 @@
     Expr *VisitCXXDefaultInitExpr(CXXDefaultInitExpr *E);
     Expr *VisitCXXNamedCastExpr(CXXNamedCastExpr *E);
 
-    template <typename T, typename Iter> bool ImportArray(
-        Iter B, Iter E, llvm::ArrayRef<T*> &ToArray) {
-      size_t NumElements = E - B;
-      SmallVector<T *, 1> ImportedElements(NumElements);
-      ASTImporter &_Importer = Importer;
-
-      bool Failed = false;
-      std::transform(B, E, ImportedElements.begin(),
-                     [&_Importer, &Failed](T *Element) -> T* {
-                       T *ToElement = _Importer.Import(Element);
-                       if (Element && !ToElement)
-                         Failed = true;
-                       return ToElement;
+    template<typename IIter, typename OIter>
+    void ImportArray(IIter Ibegin, IIter Iend, OIter Obegin) {
+      typedef typename std::remove_reference<decltype(*Obegin)>::type ItemT;
+      ASTImporter &ImporterRef = Importer;
+      std::transform(Ibegin, Iend, Obegin,
+                     [&ImporterRef](ItemT From) -> ItemT {
+                       return ImporterRef.Import(From);
                      });
-      
-      if (Failed)
-        return false;
-      
-      T **CopiedElements = new (Importer.getToContext()) T*[NumElements];
-      std::copy(ImportedElements.begin(), ImportedElements.end(),
-          &CopiedElements[0]);
-      ToArray = llvm::ArrayRef<T*>(CopiedElements, NumElements);
-      
-      return true;
+    }
+
+    template<typename IIter, typename OIter>
+    bool ImportArrayChecked(IIter Ibegin, IIter Iend, OIter Obegin) {
+      typedef typename std::remove_reference<decltype(**Obegin)>::type ItemT;
+      ASTImporter &ImporterRef = Importer;
+      bool Failed = false;
+      std::transform(Ibegin, Iend, Obegin,
+                     [&ImporterRef, &Failed](ItemT *From) -> ItemT * {
+                       ItemT *To = ImporterRef.Import(From);
+                       if (!To && From)
+                         Failed = true;
+                       return To;
+                     });
+      return Failed;
     }
   };
 }
+
 using namespace clang;
 
 //----------------------------------------------------------------------------
@@ -650,8 +673,8 @@
     if (!IsStructurallyEquivalent(Context, Function1->getReturnType(),
                                   Function2->getReturnType()))
       return false;
-      if (Function1->getExtInfo() != Function2->getExtInfo())
-        return false;
+    if (Function1->getExtInfo() != Function2->getExtInfo())
+      return false;
     break;
   }
    
@@ -1528,6 +1551,10 @@
 
 QualType ASTNodeImporter::VisitBuiltinType(const BuiltinType *T) {
   switch (T->getKind()) {
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id: \
+    return Importer.getToContext().SingletonId;
+#include "clang/Basic/OpenCLImageTypes.def"
 #define SHARED_SINGLETON_TYPE(Expansion)
 #define BUILTIN_TYPE(Id, SingletonId) \
   case BuiltinType::Id: return Importer.getToContext().SingletonId;
@@ -1806,6 +1833,28 @@
                                              /*IsDependent*/false);
 }
 
+QualType ASTNodeImporter::VisitInjectedClassNameType(
+    const InjectedClassNameType *T) {
+  CXXRecordDecl *D = cast_or_null<CXXRecordDecl>(Importer.Import(T->getDecl()));
+  if (!D)
+    return QualType();
+  // Failure of either import is reported as a null QualType.
+  QualType InjType = Importer.Import(T->getInjectedSpecializationType());
+  if (InjType.isNull())
+    return QualType();
+
+  // FIXME: ASTContext::getInjectedClassNameType is not suitable for AST reading
+  // See comments in InjectedClassNameType definition for details
+  // return Importer.getToContext().getInjectedClassNameType(D, InjType);
+  enum {
+    TypeAlignmentInBits = 4,
+    TypeAlignment = 1 << TypeAlignmentInBits
+  };
+  // Allocate in the "to" context, passing TypeAlignment (1 << 4 = 16).
+  return QualType(new (Importer.getToContext(), TypeAlignment)
+                  InjectedClassNameType(D, InjType), 0);
+}
+
 QualType ASTNodeImporter::VisitRecordType(const RecordType *T) {
   RecordDecl *ToDecl
     = dyn_cast_or_null<RecordDecl>(Importer.Import(T->getDecl()));
@@ -1845,6 +1894,18 @@
     ToModifiedType, ToEquivalentType);
 }
 
+
+QualType ASTNodeImporter::VisitTemplateTypeParmType(
+    const TemplateTypeParmType *T) {
+  // A null source declaration is tolerated; only a failed import is an error.
+  auto *FromParm = T->getDecl();
+  auto *ToParm = cast_or_null<TemplateTypeParmDecl>(Importer.Import(FromParm));
+  if (FromParm && !ToParm)
+    return QualType();
+  return Importer.getToContext().getTemplateTypeParmType(
+      T->getDepth(), T->getIndex(), T->isParameterPack(), ToParm);
+}
+
 QualType ASTNodeImporter::VisitTemplateSpecializationType(
                                        const TemplateSpecializationType *T) {
   TemplateName ToTemplate = Importer.Import(T->getTemplateName());
@@ -1864,8 +1925,7 @@
       return QualType();
   }
   return Importer.getToContext().getTemplateSpecializationType(ToTemplate, 
-                                                         ToTemplateArgs.data(), 
-                                                         ToTemplateArgs.size(),
+                                                               ToTemplateArgs,
                                                                ToCanonType);
 }
 
@@ -2071,6 +2131,9 @@
     ToData.HasInClassInitializer = FromData.HasInClassInitializer;
     ToData.HasUninitializedReferenceMember
       = FromData.HasUninitializedReferenceMember;
+    ToData.HasUninitializedFields = FromData.HasUninitializedFields;
+    ToData.HasInheritedConstructor = FromData.HasInheritedConstructor;
+    ToData.HasInheritedAssignment = FromData.HasInheritedAssignment;
     ToData.NeedOverloadResolutionForMoveConstructor
       = FromData.NeedOverloadResolutionForMoveConstructor;
     ToData.NeedOverloadResolutionForMoveAssignment
@@ -2086,6 +2149,8 @@
     ToData.HasIrrelevantDestructor = FromData.HasIrrelevantDestructor;
     ToData.HasConstexprNonCopyMoveConstructor
       = FromData.HasConstexprNonCopyMoveConstructor;
+    ToData.HasDefaultedDefaultConstructor
+      = FromData.HasDefaultedDefaultConstructor;
     ToData.DefaultedDefaultConstructorIsConstexpr
       = FromData.DefaultedDefaultConstructorIsConstexpr;
     ToData.HasConstexprDefaultConstructor
@@ -2197,11 +2262,21 @@
     ToParams.push_back(cast<NamedDecl>(To));
   }
   
+  Expr *ToRequiresClause;
+  if (Expr *const R = Params->getRequiresClause()) {
+    ToRequiresClause = Importer.Import(R);
+    if (!ToRequiresClause)
+      return nullptr;
+  } else {
+    ToRequiresClause = nullptr;
+  }
+
   return TemplateParameterList::Create(Importer.getToContext(),
                                        Importer.Import(Params->getTemplateLoc()),
                                        Importer.Import(Params->getLAngleLoc()),
                                        ToParams,
-                                       Importer.Import(Params->getRAngleLoc()));
+                                       Importer.Import(Params->getRAngleLoc()),
+                                       ToRequiresClause);
 }
 
 TemplateArgument 
@@ -2364,6 +2439,31 @@
   return ToD;
 }
 
+Decl *ASTNodeImporter::VisitAccessSpecDecl(AccessSpecDecl *D) {
+  // Imports an access-specifier declaration and adds it to its imported context.
+  SourceLocation Loc = Importer.Import(D->getLocation());
+  SourceLocation ColonLoc = Importer.Import(D->getColonLoc());
+
+  // Import the context of this declaration.
+  DeclContext *DC = Importer.ImportContext(D->getDeclContext());
+  if (!DC)
+    return nullptr;
+
+  AccessSpecDecl *ToAccessSpec
+    = AccessSpecDecl::Create(Importer.getToContext(), D->getAccess(),
+                             DC, Loc, ColonLoc);
+
+  if (!ToAccessSpec)
+    return nullptr;
+
+  // The lexical and the semantic DeclContext are always the same for an
+  // access specifier, so DC is used for both.
+  ToAccessSpec->setLexicalDeclContext(DC);
+  DC->addDeclInternal(ToAccessSpec);
+
+  return ToAccessSpec;
+}
+
 Decl *ASTNodeImporter::VisitNamespaceDecl(NamespaceDecl *D) {
   // Import the major distinguishing characteristics of this namespace.
   DeclContext *DC, *LexicalDC;
@@ -2512,6 +2612,39 @@
   return VisitTypedefNameDecl(D, /*IsAlias=*/true);
 }
 
+Decl *ASTNodeImporter::VisitLabelDecl(LabelDecl *D) {
+  // Import the major distinguishing characteristics of this label.
+  DeclContext *DC, *LexicalDC;
+  DeclarationName Name;
+  SourceLocation Loc;
+  NamedDecl *ToD;
+  if (ImportDeclParts(D, DC, LexicalDC, Name, ToD, Loc))
+    return nullptr;
+  if (ToD)
+    return ToD;
+
+  assert(LexicalDC->isFunctionOrMethod());
+
+  // A GNU local label passes one extra location, D->getLocStart(), to Create.
+  ASTContext &ToCtx = Importer.getToContext();
+  SourceLocation IdentLoc = Importer.Import(D->getLocation());
+  IdentifierInfo *LabelII = Name.getAsIdentifierInfo();
+  LabelDecl *ToLabel =
+      D->isGnuLocal()
+          ? LabelDecl::Create(ToCtx, DC, IdentLoc, LabelII,
+                              Importer.Import(D->getLocStart()))
+          : LabelDecl::Create(ToCtx, DC, IdentLoc, LabelII);
+  // Map D -> ToLabel before importing its statement.
+  Importer.Imported(D, ToLabel);
+
+  auto *ToStmt = cast_or_null<LabelStmt>(Importer.Import(D->getStmt()));
+  if (!ToStmt)
+    return nullptr;
+  ToLabel->setStmt(ToStmt);
+  ToLabel->setLexicalDeclContext(LexicalDC);
+  LexicalDC->addDeclInternal(ToLabel);
+  return ToLabel;
+}
+
 Decl *ASTNodeImporter::VisitEnumDecl(EnumDecl *D) {
   // Import the major distinguishing characteristics of this enum.
   DeclContext *DC, *LexicalDC;
@@ -2902,7 +3035,7 @@
 
   // Import the function parameters.
   SmallVector<ParmVarDecl *, 8> Parameters;
-  for (auto P : D->params()) {
+  for (auto P : D->parameters()) {
     ParmVarDecl *ToP = cast_or_null<ParmVarDecl>(Importer.Import(P));
     if (!ToP)
       return nullptr;
@@ -3168,7 +3301,7 @@
 
   IndirectFieldDecl *ToIndirectField = IndirectFieldDecl::Create(
       Importer.getToContext(), DC, Loc, Name.getAsIdentifierInfo(), T,
-      NamedChain, D->getChainingSize());
+      {NamedChain, D->getChainingSize()});
 
   for (const auto *Attr : D->attrs())
     ToIndirectField->addAttr(Attr->clone(Importer.getToContext()));
@@ -3511,7 +3644,7 @@
 
   // Import the parameters
   SmallVector<ParmVarDecl *, 5> ToParams;
-  for (auto *FromP : D->params()) {
+  for (auto *FromP : D->parameters()) {
     ParmVarDecl *ToP = cast_or_null<ParmVarDecl>(Importer.Import(FromP));
     if (!ToP)
       return nullptr;
@@ -4340,16 +4473,16 @@
   CXXRecordDecl *DTemplated = D->getTemplatedDecl();
   
   // Create the declaration that is being templated.
-  SourceLocation StartLoc = Importer.Import(DTemplated->getLocStart());
-  SourceLocation IdLoc = Importer.Import(DTemplated->getLocation());
-  CXXRecordDecl *D2Templated = CXXRecordDecl::Create(Importer.getToContext(),
-                                                     DTemplated->getTagKind(),
-                                                     DC, StartLoc, IdLoc,
-                                                   Name.getAsIdentifierInfo());
-  D2Templated->setAccess(DTemplated->getAccess());
-  D2Templated->setQualifierInfo(Importer.Import(DTemplated->getQualifierLoc()));
-  D2Templated->setLexicalDeclContext(LexicalDC);
-  
+  // Import the templated record; a null result means that import failed.
+  CXXRecordDecl *D2Templated = cast_or_null<CXXRecordDecl>(
+        Importer.Import(DTemplated));
+  if (!D2Templated)
+    return nullptr;
+
+  // Resolve possible cyclic import.
+  if (Decl *AlreadyImported = Importer.GetAlreadyImportedOrNull(D))
+    return AlreadyImported;
+
   // Create the class template declaration itself.
   TemplateParameterList *TemplateParams
     = ImportTemplateParameterList(D->getTemplateParameters());
@@ -4445,8 +4578,7 @@
                                                  D->getTagKind(), DC, 
                                                  StartLoc, IdLoc,
                                                  ClassTemplate,
-                                                 TemplateArgs.data(), 
-                                                 TemplateArgs.size(), 
+                                                 TemplateArgs,
                                                  /*PrevDecl=*/nullptr);
     D2->setSpecializationKind(D->getSpecializationKind());
 
@@ -4647,7 +4779,7 @@
     // Create a new specialization.
     D2 = VarTemplateSpecializationDecl::Create(
         Importer.getToContext(), DC, StartLoc, IdLoc, VarTemplate, T, TInfo,
-        D->getStorageClass(), TemplateArgs.data(), TemplateArgs.size());
+        D->getStorageClass(), TemplateArgs);
     D2->setSpecializationKind(D->getSpecializationKind());
     D2->setTemplateArgsInfo(D->getTemplateArgsInfo());
 
@@ -4693,7 +4825,78 @@
      << S->getStmtClassName();
    return nullptr;
  }
- 
+
+
+Stmt *ASTNodeImporter::VisitGCCAsmStmt(GCCAsmStmt *S) {
+  // Imports a GCC-style asm statement piece by piece (operand names, clobbers,
+  // constraints, operand expressions, asm string); returns nullptr as soon as
+  // a required piece fails to import.
+
+  // Operand names, outputs first. An operand written without a symbolic name
+  // has a null identifier, so null is an error only when the source is not.
+  SmallVector<IdentifierInfo *, 4> Names;
+  for (unsigned I = 0, E = S->getNumOutputs(); I != E; I++) {
+    IdentifierInfo *ToII = Importer.Import(S->getOutputIdentifier(I));
+    if (!ToII && S->getOutputIdentifier(I))
+      return nullptr;
+    Names.push_back(ToII);
+  }
+  for (unsigned I = 0, E = S->getNumInputs(); I != E; I++) {
+    IdentifierInfo *ToII = Importer.Import(S->getInputIdentifier(I));
+    if (!ToII && S->getInputIdentifier(I))
+      return nullptr;
+    Names.push_back(ToII);
+  }
+
+  SmallVector<StringLiteral *, 4> Clobbers;
+  for (unsigned I = 0, E = S->getNumClobbers(); I != E; I++) {
+    StringLiteral *Clobber = cast_or_null<StringLiteral>(
+          Importer.Import(S->getClobberStringLiteral(I)));
+    if (!Clobber)
+      return nullptr;
+    Clobbers.push_back(Clobber);
+  }
+
+  // Constraint literals use the same outputs-then-inputs order as Names.
+  SmallVector<StringLiteral *, 4> Constraints;
+  for (unsigned I = 0, E = S->getNumOutputs(); I != E; I++) {
+    StringLiteral *Output = cast_or_null<StringLiteral>(
+          Importer.Import(S->getOutputConstraintLiteral(I)));
+    if (!Output)
+      return nullptr;
+    Constraints.push_back(Output);
+  }
+
+  for (unsigned I = 0, E = S->getNumInputs(); I != E; I++) {
+    StringLiteral *Input = cast_or_null<StringLiteral>(
+          Importer.Import(S->getInputConstraintLiteral(I)));
+    if (!Input)
+      return nullptr;
+    Constraints.push_back(Input);
+  }
+
+  // All operand expressions share one array: outputs, then inputs.
+  // ImportArrayChecked returns true when an element failed to import.
+  SmallVector<Expr *, 4> Exprs(S->getNumOutputs() + S->getNumInputs());
+  if (ImportArrayChecked(S->begin_outputs(), S->end_outputs(), Exprs.begin()))
+    return nullptr;
+  if (ImportArrayChecked(S->begin_inputs(), S->end_inputs(),
+                         Exprs.begin() + S->getNumOutputs()))
+    return nullptr;
+
+  StringLiteral *AsmStr = cast_or_null<StringLiteral>(
+        Importer.Import(S->getAsmString()));
+  if (!AsmStr)
+    return nullptr;
+
+  return new (Importer.getToContext()) GCCAsmStmt(
+        Importer.getToContext(), Importer.Import(S->getAsmLoc()),
+        S->isSimple(), S->isVolatile(), S->getNumOutputs(), S->getNumInputs(),
+        Names.data(), Constraints.data(), Exprs.data(), AsmStr,
+        S->getNumClobbers(), Clobbers.data(),
+        Importer.Import(S->getRParenLoc()));
+}
+
 Stmt *ASTNodeImporter::VisitDeclStmt(DeclStmt *S) {
   DeclGroupRef ToDG = ImportDeclGroup(S->getDeclGroup());
   for (Decl *ToD : ToDG) {
@@ -4712,9 +4915,9 @@
 }
 
 Stmt *ASTNodeImporter::VisitCompoundStmt(CompoundStmt *S) {
-  llvm::ArrayRef<Stmt *> ToStmts;
+  llvm::SmallVector<Stmt *, 8> ToStmts(S->size());
     
-  if (!ImportArray(S->body_begin(), S->body_end(), ToStmts))
+  if (ImportArrayChecked(S->body_begin(), S->body_end(), ToStmts.begin()))
     return nullptr;
 
   SourceLocation ToLBraceLoc = Importer.Import(S->getLBracLoc());
@@ -4784,6 +4987,9 @@
 
 Stmt *ASTNodeImporter::VisitIfStmt(IfStmt *S) {
   SourceLocation ToIfLoc = Importer.Import(S->getIfLoc());
+  Stmt *ToInit = Importer.Import(S->getInit());
+  if (!ToInit && S->getInit())
+    return nullptr;
   VarDecl *ToConditionVariable = nullptr;
   if (VarDecl *FromConditionVariable = S->getConditionVariable()) {
     ToConditionVariable =
@@ -4802,12 +5008,17 @@
   if (!ToElseStmt && S->getElse())
     return nullptr;
   return new (Importer.getToContext()) IfStmt(Importer.getToContext(),
-                                              ToIfLoc, ToConditionVariable,
+                                              ToIfLoc, S->isConstexpr(),
+                                              ToInit,
+                                              ToConditionVariable,
                                               ToCondition, ToThenStmt,
                                               ToElseLoc, ToElseStmt);
 }
 
 Stmt *ASTNodeImporter::VisitSwitchStmt(SwitchStmt *S) {
+  Stmt *ToInit = Importer.Import(S->getInit());
+  if (!ToInit && S->getInit())
+    return nullptr;
   VarDecl *ToConditionVariable = nullptr;
   if (VarDecl *FromConditionVariable = S->getConditionVariable()) {
     ToConditionVariable =
@@ -4819,8 +5030,8 @@
   if (!ToCondition && S->getCond())
     return nullptr;
   SwitchStmt *ToStmt = new (Importer.getToContext()) SwitchStmt(
-                         Importer.getToContext(), ToConditionVariable,
-                         ToCondition);
+                         Importer.getToContext(), ToInit,
+                         ToConditionVariable, ToCondition);
   Stmt *ToBody = Importer.Import(S->getBody());
   if (!ToBody && S->getBody())
     return nullptr;
@@ -4994,9 +5205,13 @@
     dyn_cast_or_null<DeclStmt>(Importer.Import(S->getRangeStmt()));
   if (!ToRange && S->getRangeStmt())
     return nullptr;
-  DeclStmt *ToBeginEnd =
-    dyn_cast_or_null<DeclStmt>(Importer.Import(S->getBeginEndStmt()));
-  if (!ToBeginEnd && S->getBeginEndStmt())
+  DeclStmt *ToBegin =
+    dyn_cast_or_null<DeclStmt>(Importer.Import(S->getBeginStmt()));
+  if (!ToBegin && S->getBeginStmt())
+    return nullptr;
+  DeclStmt *ToEnd =
+    dyn_cast_or_null<DeclStmt>(Importer.Import(S->getEndStmt()));
+  if (!ToEnd && S->getEndStmt())
     return nullptr;
   Expr *ToCond = Importer.Import(S->getCond());
   if (!ToCond && S->getCond())
@@ -5015,7 +5230,7 @@
   SourceLocation ToCoawaitLoc = Importer.Import(S->getCoawaitLoc());
   SourceLocation ToColonLoc = Importer.Import(S->getColonLoc());
   SourceLocation ToRParenLoc = Importer.Import(S->getRParenLoc());
-  return new (Importer.getToContext()) CXXForRangeStmt(ToRange, ToBeginEnd,
+  return new (Importer.getToContext()) CXXForRangeStmt(ToRange, ToBegin, ToEnd,
                                                        ToCond, ToInc,
                                                        ToLoopVar, ToBody,
                                                        ToForLoc, ToCoawaitLoc,
@@ -5131,6 +5346,48 @@
   return nullptr;
 }
 
+Expr *ASTNodeImporter::VisitVAArgExpr(VAArgExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // A null result is only treated as failure when the source had an operand.
+  Expr *SubExpr = Importer.Import(E->getSubExpr());
+  if (!SubExpr && E->getSubExpr())
+    return nullptr;
+  // The written type info is mandatory: a null import aborts the visit.
+  TypeSourceInfo *TInfo = Importer.Import(E->getWrittenTypeInfo());
+  if (!TInfo)
+    return nullptr;
+  // Allocate the copy in the "to" context using imported locations.
+  return new (Importer.getToContext()) VAArgExpr(
+        Importer.Import(E->getBuiltinLoc()), SubExpr, TInfo,
+        Importer.Import(E->getRParenLoc()), T, E->isMicrosoftABI());
+}
+
+
+Expr *ASTNodeImporter::VisitGNUNullExpr(GNUNullExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // Only the type and the location are passed on; both are imported first.
+  return new (Importer.getToContext()) GNUNullExpr(
+        T, Importer.Import(E->getExprLoc()));
+}
+
+Expr *ASTNodeImporter::VisitPredefinedExpr(PredefinedExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // The function-name literal may be absent; fail only if one was lost.
+  StringLiteral *SL = cast_or_null<StringLiteral>(
+        Importer.Import(E->getFunctionName()));
+  if (!SL && E->getFunctionName())
+    return nullptr;
+  // The identifier kind is passed through unchanged from the source node.
+  return new (Importer.getToContext()) PredefinedExpr(
+        Importer.Import(E->getExprLoc()), T, E->getIdentType(), SL);
+}
+
 Expr *ASTNodeImporter::VisitDeclRefExpr(DeclRefExpr *E) {
   ValueDecl *ToD = cast_or_null<ValueDecl>(Importer.Import(E->getDecl()));
   if (!ToD)
@@ -5161,6 +5418,74 @@
   return DRE;
 }
 
+Expr *ASTNodeImporter::VisitImplicitValueInitExpr(ImplicitValueInitExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // The new node is built from the imported type alone.
+  return new (Importer.getToContext()) ImplicitValueInitExpr(T);
+}
+
+ASTNodeImporter::Designator
+ASTNodeImporter::ImportDesignator(const Designator &D) {
+  if (D.isFieldDesignator()) {
+    IdentifierInfo *ToFieldName = Importer.Import(D.getFieldName());
+    // Not validated here; the caller checks for an import error.
+    return Designator(ToFieldName, Importer.Import(D.getDotLoc()),
+                      Importer.Import(D.getFieldLoc()));
+  }
+  if (D.isArrayDesignator())
+    return Designator(D.getFirstExprIndex(),
+                      Importer.Import(D.getLBracketLoc()),
+                      Importer.Import(D.getRBracketLoc()));
+  // Remaining case: array range. Array forms keep the source's expr index.
+  assert(D.isArrayRangeDesignator());
+  return Designator(D.getFirstExprIndex(),
+                    Importer.Import(D.getLBracketLoc()),
+                    Importer.Import(D.getEllipsisLoc()),
+                    Importer.Import(D.getRBracketLoc()));
+}
+
+
+Expr *ASTNodeImporter::VisitDesignatedInitExpr(DesignatedInitExpr *DIE) {
+  Expr *Init = cast_or_null<Expr>(Importer.Import(DIE->getInit()));
+  if (!Init)
+    return nullptr;
+  // One slot per index sub-expression (every sub-expression except the init).
+  SmallVector<Expr *, 4> IndexExprs(DIE->getNumSubExprs() - 1);
+  // Start at 1: the first sub-expression is Init itself, imported above.
+  for (unsigned I = 1, E = DIE->getNumSubExprs(); I < E; I++) {
+    if (Expr *Arg = cast_or_null<Expr>(Importer.Import(DIE->getSubExpr(I))))
+      IndexExprs[I - 1] = Arg;
+    else
+      return nullptr;
+  }
+  // Import each designator into its slot of a pre-sized vector.
+  SmallVector<Designator, 4> Designators(DIE->size());
+  llvm::transform(DIE->designators(), Designators.begin(),
+                  [this](const Designator &D) -> Designator {
+                    return ImportDesignator(D);
+                  });
+  // NOTE(review): this inspects the source designators, not `Designators`.
+  for (const Designator &D : DIE->designators())
+    if (D.isFieldDesignator() && !D.getFieldName())
+      return nullptr;
+  // Build the result from the imported pieces in the "to" context.
+  return DesignatedInitExpr::Create(
+        Importer.getToContext(), Designators,
+        IndexExprs, Importer.Import(DIE->getEqualOrColonLoc()),
+        DIE->usesGNUSyntax(), Init);
+}
+
+Expr *ASTNodeImporter::VisitCXXNullPtrLiteralExpr(CXXNullPtrLiteralExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // Recreate the literal from the imported type and imported location.
+  return new (Importer.getToContext())
+      CXXNullPtrLiteralExpr(T, Importer.Import(E->getLocation()));
+}
+
 Expr *ASTNodeImporter::VisitIntegerLiteral(IntegerLiteral *E) {
   QualType T = Importer.Import(E->getType());
   if (T.isNull())
@@ -5171,6 +5496,16 @@
                                 Importer.Import(E->getLocation()));
 }
 
+Expr *ASTNodeImporter::VisitFloatingLiteral(FloatingLiteral *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // Value and exactness are passed as-is; type and location are imported.
+  return FloatingLiteral::Create(Importer.getToContext(),
+                                E->getValue(), E->isExact(), T,
+                                Importer.Import(E->getLocation()));
+}
+
 Expr *ASTNodeImporter::VisitCharacterLiteral(CharacterLiteral *E) {
   QualType T = Importer.Import(E->getType());
   if (T.isNull())
@@ -5181,6 +5516,67 @@
                                           Importer.Import(E->getLocation()));
 }
 
+Expr *ASTNodeImporter::VisitStringLiteral(StringLiteral *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // One location per concatenated token; no failure check is performed here.
+  SmallVector<SourceLocation, 4> Locations(E->getNumConcatenated());
+  ImportArray(E->tokloc_begin(), E->tokloc_end(), Locations.begin());
+  // Bytes, kind and Pascal flag are taken directly from the source literal.
+  return StringLiteral::Create(Importer.getToContext(), E->getBytes(),
+                               E->getKind(), E->isPascal(), T,
+                               Locations.data(), Locations.size());
+}
+
+Expr *ASTNodeImporter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // Type source info is required; a null import aborts the visit.
+  TypeSourceInfo *TInfo = Importer.Import(E->getTypeSourceInfo());
+  if (!TInfo)
+    return nullptr;
+  // The initializer is required as well.
+  Expr *Init = Importer.Import(E->getInitializer());
+  if (!Init)
+    return nullptr;
+  // Value kind and file-scope flag are passed through from the source node.
+  return new (Importer.getToContext()) CompoundLiteralExpr(
+        Importer.Import(E->getLParenLoc()), TInfo, T, E->getValueKind(),
+        Init, E->isFileScope());
+}
+
+Expr *ASTNodeImporter::VisitAtomicExpr(AtomicExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // Import all sub-expressions; a truthy helper result aborts the visit.
+  SmallVector<Expr *, 6> Exprs(E->getNumSubExprs());
+  if (ImportArrayChecked(
+        E->getSubExprs(), E->getSubExprs() + E->getNumSubExprs(),
+        Exprs.begin()))
+    return nullptr;
+  // The atomic operation is passed through unchanged from the source node.
+  return new (Importer.getToContext()) AtomicExpr(
+        Importer.Import(E->getBuiltinLoc()), Exprs, T, E->getOp(),
+        Importer.Import(E->getRParenLoc()));
+}
+
+Expr *ASTNodeImporter::VisitAddrLabelExpr(AddrLabelExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // The referenced label declaration is required; null aborts the visit.
+  LabelDecl *ToLabel = cast_or_null<LabelDecl>(Importer.Import(E->getLabel()));
+  if (!ToLabel)
+    return nullptr;
+  // Rebuild the node with the imported '&&' and label locations.
+  return new (Importer.getToContext()) AddrLabelExpr(
+        Importer.Import(E->getAmpAmpLoc()), Importer.Import(E->getLabelLoc()),
+        ToLabel, T);
+}
+
 Expr *ASTNodeImporter::VisitParenExpr(ParenExpr *E) {
   Expr *SubExpr = Importer.Import(E->getSubExpr());
   if (!SubExpr)
@@ -5192,6 +5588,31 @@
                                             SubExpr);
 }
 
+Expr *ASTNodeImporter::VisitParenListExpr(ParenListExpr *E) {
+  SmallVector<Expr *, 4> Exprs(E->getNumExprs());
+  if (ImportArrayChecked(
+        E->getExprs(), E->getExprs() + E->getNumExprs(), Exprs.begin()))
+    return nullptr;
+  // Each paren gets its own imported location (the closing one from RParenLoc).
+  return new (Importer.getToContext()) ParenListExpr(
+        Importer.getToContext(), Importer.Import(E->getLParenLoc()),
+        Exprs, Importer.Import(E->getRParenLoc()));
+}
+
+Expr *ASTNodeImporter::VisitStmtExpr(StmtExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // A null body is only a failure when the source actually had one.
+  CompoundStmt *ToSubStmt = cast_or_null<CompoundStmt>(
+        Importer.Import(E->getSubStmt()));
+  if (!ToSubStmt && E->getSubStmt())
+    return nullptr;
+  // Rebuild the node around the imported body and paren locations.
+  return new (Importer.getToContext()) StmtExpr(ToSubStmt, T,
+        Importer.Import(E->getLParenLoc()), Importer.Import(E->getRParenLoc()));
+}
+
 Expr *ASTNodeImporter::VisitUnaryOperator(UnaryOperator *E) {
   QualType T = Importer.Import(E->getType());
   if (T.isNull())
@@ -5252,6 +5673,76 @@
                                                       E->isFPContractable());
 }
 
+Expr *ASTNodeImporter::VisitConditionalOperator(ConditionalOperator *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // All three operands are required; any null import aborts the visit.
+  Expr *ToLHS = Importer.Import(E->getLHS());
+  if (!ToLHS)
+    return nullptr;
+
+  Expr *ToRHS = Importer.Import(E->getRHS());
+  if (!ToRHS)
+    return nullptr;
+
+  Expr *ToCond = Importer.Import(E->getCond());
+  if (!ToCond)
+    return nullptr;
+  // Value kind and object kind are passed through from the source node.
+  return new (Importer.getToContext()) ConditionalOperator(
+        ToCond, Importer.Import(E->getQuestionLoc()),
+        ToLHS, Importer.Import(E->getColonLoc()),
+        ToRHS, T, E->getValueKind(), E->getObjectKind());
+}
+
+Expr *ASTNodeImporter::VisitBinaryConditionalOperator(
+    BinaryConditionalOperator *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // Every component below is required; any null import aborts the visit.
+  Expr *Common = Importer.Import(E->getCommon());
+  if (!Common)
+    return nullptr;
+
+  Expr *Cond = Importer.Import(E->getCond());
+  if (!Cond)
+    return nullptr;
+
+  OpaqueValueExpr *OpaqueValue = cast_or_null<OpaqueValueExpr>(
+        Importer.Import(E->getOpaqueValue()));
+  if (!OpaqueValue)
+    return nullptr;
+
+  Expr *TrueExpr = Importer.Import(E->getTrueExpr());
+  if (!TrueExpr)
+    return nullptr;
+
+  Expr *FalseExpr = Importer.Import(E->getFalseExpr());
+  if (!FalseExpr)
+    return nullptr;
+  // Value kind and object kind are passed through from the source node.
+  return new (Importer.getToContext()) BinaryConditionalOperator(
+        Common, OpaqueValue, Cond, TrueExpr, FalseExpr,
+        Importer.Import(E->getQuestionLoc()), Importer.Import(E->getColonLoc()),
+        T, E->getValueKind(), E->getObjectKind());
+}
+
+Expr *ASTNodeImporter::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
+  QualType T = Importer.Import(E->getType());
+  if (T.isNull())
+    return nullptr;
+  // The source expression is optional; fail only if one existed and was lost.
+  Expr *SourceExpr = Importer.Import(E->getSourceExpr());
+  if (!SourceExpr && E->getSourceExpr())
+    return nullptr;
+
+  return new (Importer.getToContext()) OpaqueValueExpr(
+        Importer.Import(E->getExprLoc()), T, E->getValueKind(),
+        E->getObjectKind(), SourceExpr);
+}
+
 Expr *ASTNodeImporter::VisitCompoundAssignOperator(CompoundAssignOperator *E) {
   QualType T = Importer.Import(E->getType());
   if (T.isNull())
@@ -5340,9 +5831,9 @@
   if (!ToCCD)
     return nullptr;
 
-  ArrayRef<Expr *> ToArgs;
-  
-  if (!ImportArray(E->arg_begin(), E->arg_end(), ToArgs))
+  SmallVector<Expr *, 6> ToArgs(E->getNumArgs());
+  if (ImportArrayChecked(E->getArgs(), E->getArgs() + E->getNumArgs(),
+                         ToArgs.begin()))
     return nullptr;
 
   return CXXConstructExpr::Create(Importer.getToContext(), T,
@@ -5365,14 +5856,14 @@
   if (!ToFn)
     return nullptr;
   
-  ArrayRef<Expr *> ToArgs;
+  SmallVector<Expr *, 4> ToArgs(E->getNumArgs());
   
-  if (!ImportArray(E->arg_begin(), E->arg_end(), ToArgs))
+  if (ImportArrayChecked(E->arg_begin(), E->arg_end(), ToArgs.begin()))
     return nullptr;
 
-  return new (Importer.getToContext()) CXXMemberCallExpr(Importer.getToContext(), ToFn,
-                                                         ToArgs, T, E->getValueKind(),
-                                                         Importer.Import(E->getRParenLoc()));
+  return new (Importer.getToContext()) CXXMemberCallExpr(
+        Importer.getToContext(), ToFn, ToArgs, T, E->getValueKind(),
+        Importer.Import(E->getRParenLoc()));
 }
 
 Expr *ASTNodeImporter::VisitCXXThisExpr(CXXThisExpr *E) {
@@ -5462,26 +5953,49 @@
              Importer.Import(E->getRParenLoc()));
 }
 
-Expr *ASTNodeImporter::VisitInitListExpr(InitListExpr *E) {
-  QualType T = Importer.Import(E->getType());
+Expr *ASTNodeImporter::VisitInitListExpr(InitListExpr *ILE) {
+  QualType T = Importer.Import(ILE->getType());
   if (T.isNull())
     return nullptr;
-    
-  ArrayRef<Expr *> ToInits;
 
-  if (!ImportArray(E->inits().begin(), E->inits().end(), ToInits))
+  llvm::SmallVector<Expr *, 4> Exprs(ILE->getNumInits());
+  if (ImportArrayChecked(
+        ILE->getInits(), ILE->getInits() + ILE->getNumInits(), Exprs.begin()))
     return nullptr;
-    
-  InitListExpr *ToE = new (Importer.getToContext())
-    InitListExpr(Importer.getToContext(),
-                 Importer.Import(E->getLBraceLoc()),
-                 ToInits,
-                 Importer.Import(E->getRBraceLoc()));
-  
-  if (ToE)
-    ToE->setType(T);
 
-  return ToE;
+  ASTContext &ToCtx = Importer.getToContext();
+  InitListExpr *To = new (ToCtx) InitListExpr(
+        ToCtx, Importer.Import(ILE->getLBraceLoc()),
+        Exprs, Importer.Import(ILE->getRBraceLoc()));
+  To->setType(T);
+  // An array filler, when present, must import successfully.
+  if (ILE->hasArrayFiller()) {
+    Expr *Filler = Importer.Import(ILE->getArrayFiller());
+    if (!Filler)
+      return nullptr;
+    To->setArrayFiller(Filler);
+  }
+  // Likewise for the field recorded as initialized in a union.
+  if (FieldDecl *FromFD = ILE->getInitializedFieldInUnion()) {
+    FieldDecl *ToFD = cast_or_null<FieldDecl>(Importer.Import(FromFD));
+    if (!ToFD)
+      return nullptr;
+    To->setInitializedFieldInUnion(ToFD);
+  }
+  // Likewise for the syntactic form attached to this list.
+  if (InitListExpr *SyntForm = ILE->getSyntacticForm()) {
+    InitListExpr *ToSyntForm = cast_or_null<InitListExpr>(
+          Importer.Import(SyntForm));
+    if (!ToSyntForm)
+      return nullptr;
+    To->setSyntacticForm(ToSyntForm);
+  }
+  // Carry the remaining flags over directly from the source list.
+  To->sawArrayRangeDesignator(ILE->hadArrayRangeDesignator());
+  To->setValueDependent(ILE->isValueDependent());
+  To->setInstantiationDependent(ILE->isInstantiationDependent());
+
+  return To;
 }
 
 Expr *ASTNodeImporter::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *DIE) {
diff --git a/lib/AST/ASTTypeTraits.cpp b/lib/AST/ASTTypeTraits.cpp
index 56bd61f..680f526 100644
--- a/lib/AST/ASTTypeTraits.cpp
+++ b/lib/AST/ASTTypeTraits.cpp
@@ -23,6 +23,7 @@
 const ASTNodeKind::KindInfo ASTNodeKind::AllKindInfo[] = {
   { NKI_None, "<None>" },
   { NKI_None, "TemplateArgument" },
+  { NKI_None, "TemplateName" },
   { NKI_None, "NestedNameSpecifierLoc" },
   { NKI_None, "QualType" },
   { NKI_None, "TypeLoc" },
@@ -43,10 +44,6 @@
   return isBaseOf(KindId, Other.KindId, Distance);
 }
 
-bool ASTNodeKind::isSame(ASTNodeKind Other) const {
-  return KindId != NKI_None && KindId == Other.KindId;
-}
-
 bool ASTNodeKind::isBaseOf(NodeKindId Base, NodeKindId Derived,
                            unsigned *Distance) {
   if (Base == NKI_None || Derived == NKI_None) return false;
@@ -113,6 +110,8 @@
                          const PrintingPolicy &PP) const {
   if (const TemplateArgument *TA = get<TemplateArgument>())
     TA->print(PP, OS);
+  else if (const TemplateName *TN = get<TemplateName>())
+    TN->print(OS, PP);
   else if (const NestedNameSpecifier *NNS = get<NestedNameSpecifier>())
     NNS->print(OS, PP);
   else if (const NestedNameSpecifierLoc *NNSL = get<NestedNameSpecifierLoc>())
diff --git a/lib/AST/AttrImpl.cpp b/lib/AST/AttrImpl.cpp
index cb60870..b06b50c 100644
--- a/lib/AST/AttrImpl.cpp
+++ b/lib/AST/AttrImpl.cpp
@@ -11,11 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/AST/Attr.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/Attr.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/Type.h"
-#include "llvm/ADT/StringSwitch.h"
 using namespace clang;
 
 #include "clang/AST/AttrImpl.inc"
diff --git a/lib/AST/CXXInheritance.cpp b/lib/AST/CXXInheritance.cpp
index 6785a0c..a97d6a2 100644
--- a/lib/AST/CXXInheritance.cpp
+++ b/lib/AST/CXXInheritance.cpp
@@ -16,7 +16,6 @@
 #include "clang/AST/RecordLayout.h"
 #include "llvm/ADT/SetVector.h"
 #include <algorithm>
-#include <set>
 
 using namespace clang;
 
@@ -405,6 +404,21 @@
   return false;
 }
 
+bool CXXRecordDecl::FindOMPReductionMember(const CXXBaseSpecifier *Specifier,
+                                           CXXBasePath &Path,
+                                           DeclarationName Name) {
+  RecordDecl *BaseRecord =
+      Specifier->getType()->castAs<RecordType>()->getDecl();
+  // Drop leading lookup results until the front one is in IDNS_OMPReduction.
+  for (Path.Decls = BaseRecord->lookup(Name); !Path.Decls.empty();
+       Path.Decls = Path.Decls.slice(1)) {
+    if (Path.Decls.front()->isInIdentifierNamespace(IDNS_OMPReduction))
+      return true;
+  }
+  // No match: Path.Decls has been sliced down to empty at this point.
+  return false;
+}
+
 bool CXXRecordDecl::
 FindNestedNameSpecifierMember(const CXXBaseSpecifier *Specifier, 
                               CXXBasePath &Path,
diff --git a/lib/AST/Comment.cpp b/lib/AST/Comment.cpp
index d05c5de..7a7d3dd 100644
--- a/lib/AST/Comment.cpp
+++ b/lib/AST/Comment.cpp
@@ -7,14 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/AST/ASTContext.h"
 #include "clang/AST/Comment.h"
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/Basic/CharInfo.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace clang {
 namespace comments {
@@ -114,6 +113,65 @@
   return true;
 }
 
+static TypeLoc lookThroughTypedefOrTypeAliasLocs(TypeLoc &SrcTL) {
+  TypeLoc TL = SrcTL.IgnoreParens();
+  // Peel at most one wrapper per call; the first matching kind below wins.
+  // Look through qualified types.
+  if (QualifiedTypeLoc QualifiedTL = TL.getAs<QualifiedTypeLoc>())
+    return QualifiedTL.getUnqualifiedLoc();
+  // Look through pointer types.
+  if (PointerTypeLoc PointerTL = TL.getAs<PointerTypeLoc>())
+    return PointerTL.getPointeeLoc().getUnqualifiedLoc();
+  // Look through reference types.
+  if (ReferenceTypeLoc ReferenceTL = TL.getAs<ReferenceTypeLoc>())
+    return ReferenceTL.getPointeeLoc().getUnqualifiedLoc();
+  // Look through adjusted types.
+  if (AdjustedTypeLoc ATL = TL.getAs<AdjustedTypeLoc>())
+    return ATL.getOriginalLoc();
+  if (BlockPointerTypeLoc BlockPointerTL = TL.getAs<BlockPointerTypeLoc>())
+    return BlockPointerTL.getPointeeLoc().getUnqualifiedLoc();
+  if (MemberPointerTypeLoc MemberPointerTL = TL.getAs<MemberPointerTypeLoc>())
+    return MemberPointerTL.getPointeeLoc().getUnqualifiedLoc();
+  if (ElaboratedTypeLoc ETL = TL.getAs<ElaboratedTypeLoc>())
+    return ETL.getNamedTypeLoc();
+  // No wrapper matched: return the IgnoreParens() result unchanged.
+  return TL;
+}
+
+static bool getFunctionTypeLoc(TypeLoc TL, FunctionTypeLoc &ResFTL) {
+  TypeLoc PrevTL;
+  while (PrevTL != TL) {
+    PrevTL = TL;
+    TL = lookThroughTypedefOrTypeAliasLocs(TL);
+  }
+  // TL no longer changes under the stripping helper; test it directly first.
+  if (FunctionTypeLoc FTL = TL.getAs<FunctionTypeLoc>()) {
+    ResFTL = FTL;
+    return true;
+  }
+
+  if (TemplateSpecializationTypeLoc STL =
+          TL.getAs<TemplateSpecializationTypeLoc>()) {
+    // If we have a typedef to a template specialization with exactly one
+    // template argument of a function type, this looks like std::function,
+    // boost::function, or other function wrapper.  Treat these typedefs as
+    // functions.
+    if (STL.getNumArgs() != 1)
+      return false;
+    TemplateArgumentLoc MaybeFunction = STL.getArgLoc(0);
+    if (MaybeFunction.getArgument().getKind() != TemplateArgument::Type)
+      return false;
+    TypeSourceInfo *MaybeFunctionTSI = MaybeFunction.getTypeSourceInfo();
+    TypeLoc ArgTL = MaybeFunctionTSI->getTypeLoc().getUnqualifiedLoc();
+    if (FunctionTypeLoc FTL = ArgTL.getAs<FunctionTypeLoc>()) {
+      ResFTL = FTL;
+      return true;
+    }
+  }
+  // Neither a function type nor a matching wrapper; ResFTL is left untouched.
+  return false;
+}
+
 const char *ParamCommandComment::getDirectionAsString(PassDirection D) {
   switch (D) {
   case ParamCommandComment::In:
@@ -157,7 +215,7 @@
   case Decl::CXXConversion: {
     const FunctionDecl *FD = cast<FunctionDecl>(CommentDecl);
     Kind = FunctionKind;
-    ParamVars = llvm::makeArrayRef(FD->param_begin(), FD->getNumParams());
+    ParamVars = FD->parameters();
     ReturnType = FD->getReturnType();
     unsigned NumLists = FD->getNumTemplateParameterLists();
     if (NumLists != 0) {
@@ -177,7 +235,7 @@
   case Decl::ObjCMethod: {
     const ObjCMethodDecl *MD = cast<ObjCMethodDecl>(CommentDecl);
     Kind = FunctionKind;
-    ParamVars = llvm::makeArrayRef(MD->param_begin(), MD->param_size());
+    ParamVars = MD->parameters();
     ReturnType = MD->getReturnType();
     IsObjCMethod = true;
     IsInstanceMethod = MD->isInstanceMethod();
@@ -189,7 +247,7 @@
     Kind = FunctionKind;
     TemplateKind = Template;
     const FunctionDecl *FD = FTD->getTemplatedDecl();
-    ParamVars = llvm::makeArrayRef(FD->param_begin(), FD->getNumParams());
+    ParamVars = FD->parameters();
     ReturnType = FD->getReturnType();
     TemplateParameters = FTD->getTemplateParameters();
     break;
@@ -227,90 +285,45 @@
   case Decl::Namespace:
     Kind = NamespaceKind;
     break;
+  case Decl::TypeAlias:
   case Decl::Typedef: {
     Kind = TypedefKind;
-    // If this is a typedef to something we consider a function, extract
+    // If this is a typedef / using to something we consider a function, extract
     // arguments and return type.
-    const TypedefDecl *TD = cast<TypedefDecl>(CommentDecl);
-    const TypeSourceInfo *TSI = TD->getTypeSourceInfo();
+    const TypeSourceInfo *TSI =
+        K == Decl::Typedef
+            ? cast<TypedefDecl>(CommentDecl)->getTypeSourceInfo()
+            : cast<TypeAliasDecl>(CommentDecl)->getTypeSourceInfo();
     if (!TSI)
       break;
     TypeLoc TL = TSI->getTypeLoc().getUnqualifiedLoc();
-    while (true) {
-      TL = TL.IgnoreParens();
-      // Look through qualified types.
-      if (QualifiedTypeLoc QualifiedTL = TL.getAs<QualifiedTypeLoc>()) {
-        TL = QualifiedTL.getUnqualifiedLoc();
-        continue;
-      }
-      // Look through pointer types.
-      if (PointerTypeLoc PointerTL = TL.getAs<PointerTypeLoc>()) {
-        TL = PointerTL.getPointeeLoc().getUnqualifiedLoc();
-        continue;
-      }
-      // Look through reference types.
-      if (ReferenceTypeLoc ReferenceTL = TL.getAs<ReferenceTypeLoc>()) {
-        TL = ReferenceTL.getPointeeLoc().getUnqualifiedLoc();
-        continue;
-      }
-      // Look through adjusted types.
-      if (AdjustedTypeLoc ATL = TL.getAs<AdjustedTypeLoc>()) {
-        TL = ATL.getOriginalLoc();
-        continue;
-      }
-      if (BlockPointerTypeLoc BlockPointerTL =
-              TL.getAs<BlockPointerTypeLoc>()) {
-        TL = BlockPointerTL.getPointeeLoc().getUnqualifiedLoc();
-        continue;
-      }
-      if (MemberPointerTypeLoc MemberPointerTL =
-              TL.getAs<MemberPointerTypeLoc>()) {
-        TL = MemberPointerTL.getPointeeLoc().getUnqualifiedLoc();
-        continue;
-      }
-      if (ElaboratedTypeLoc ETL = TL.getAs<ElaboratedTypeLoc>()) {
-        TL = ETL.getNamedTypeLoc();
-        continue;
-      }
-      // Is this a typedef for a function type?
-      if (FunctionTypeLoc FTL = TL.getAs<FunctionTypeLoc>()) {
-        Kind = FunctionKind;
-        ParamVars = FTL.getParams();
-        ReturnType = FTL.getReturnLoc().getType();
-        break;
-      }
-      if (TemplateSpecializationTypeLoc STL =
-              TL.getAs<TemplateSpecializationTypeLoc>()) {
-        // If we have a typedef to a template specialization with exactly one
-        // template argument of a function type, this looks like std::function,
-        // boost::function, or other function wrapper.  Treat these typedefs as
-        // functions.
-        if (STL.getNumArgs() != 1)
-          break;
-        TemplateArgumentLoc MaybeFunction = STL.getArgLoc(0);
-        if (MaybeFunction.getArgument().getKind() != TemplateArgument::Type)
-          break;
-        TypeSourceInfo *MaybeFunctionTSI = MaybeFunction.getTypeSourceInfo();
-        TypeLoc TL = MaybeFunctionTSI->getTypeLoc().getUnqualifiedLoc();
-        if (FunctionTypeLoc FTL = TL.getAs<FunctionTypeLoc>()) {
-          Kind = FunctionKind;
-          ParamVars = FTL.getParams();
-          ReturnType = FTL.getReturnLoc().getType();
-        }
-        break;
-      }
-      break;
+    FunctionTypeLoc FTL;
+    if (getFunctionTypeLoc(TL, FTL)) {
+      Kind = FunctionKind;
+      ParamVars = FTL.getParams();
+      ReturnType = FTL.getReturnLoc().getType();
     }
     break;
   }
-  case Decl::TypeAlias:
-    Kind = TypedefKind;
-    break;
   case Decl::TypeAliasTemplate: {
     const TypeAliasTemplateDecl *TAT = cast<TypeAliasTemplateDecl>(CommentDecl);
     Kind = TypedefKind;
     TemplateKind = Template;
     TemplateParameters = TAT->getTemplateParameters();
+    TypeAliasDecl *TAD = TAT->getTemplatedDecl();
+    if (!TAD)
+      break;
+
+    const TypeSourceInfo *TSI = TAD->getTypeSourceInfo();
+    if (!TSI)
+      break;
+    TypeLoc TL = TSI->getTypeLoc().getUnqualifiedLoc();
+    FunctionTypeLoc FTL;
+    if (getFunctionTypeLoc(TL, FTL)) {
+      Kind = FunctionKind;
+      ParamVars = FTL.getParams();
+      ReturnType = FTL.getReturnLoc().getType();
+    }
     break;
   }
   case Decl::Enum:
diff --git a/lib/AST/CommentBriefParser.cpp b/lib/AST/CommentBriefParser.cpp
index 090b921..eecea8f 100644
--- a/lib/AST/CommentBriefParser.cpp
+++ b/lib/AST/CommentBriefParser.cpp
@@ -9,7 +9,6 @@
 
 #include "clang/AST/CommentBriefParser.h"
 #include "clang/AST/CommentCommandTraits.h"
-#include "llvm/ADT/StringSwitch.h"
 
 namespace clang {
 namespace comments {
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
index 98b7e36..57bfef0 100644
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -1,3 +1,12 @@
+//===--- CommentLexer.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
 #include "clang/AST/CommentLexer.h"
 #include "clang/AST/CommentCommandTraits.h"
 #include "clang/AST/CommentDiagnostic.h"
@@ -44,7 +53,7 @@
 #include "clang/AST/CommentHTMLTags.inc"
 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
 
-} // unnamed namespace
+} // end anonymous namespace
 
 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
   // Fast path, first check a few most widely used named character references.
@@ -266,7 +275,7 @@
   llvm_unreachable("buffer end hit before '*/' was seen");
 }
     
-} // unnamed namespace
+} // end anonymous namespace
 
 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
                                tok::TokenKind Kind) {
@@ -411,7 +420,6 @@
           setupAndLexHTMLEndTag(T);
         else
           formTextToken(T, TokenPtr);
-
         return;
       }
 
@@ -604,7 +612,6 @@
   }
   formTokenWithChars(T, TokenPtr, tok::text);
   T.setText(Resolved);
-  return;
 }
 
 void Lexer::setupAndLexHTMLStartTag(Token &T) {
@@ -848,4 +855,3 @@
 
 } // end namespace comments
 } // end namespace clang
-
diff --git a/lib/AST/CommentSema.cpp b/lib/AST/CommentSema.cpp
index 12823c3..f5f4f70 100644
--- a/lib/AST/CommentSema.cpp
+++ b/lib/AST/CommentSema.cpp
@@ -23,7 +23,7 @@
 
 namespace {
 #include "clang/AST/CommentHTMLTagsProperties.inc"
-} // unnamed namespace
+} // end anonymous namespace
 
 Sema::Sema(llvm::BumpPtrAllocator &Allocator, const SourceManager &SourceMgr,
            DiagnosticsEngine &Diags, CommandTraits &Traits,
@@ -353,8 +353,6 @@
       << CorrectedName
       << FixItHint::CreateReplacement(ArgRange, CorrectedName);
   }
-
-  return;
 }
 
 void Sema::actOnTParamCommandFinish(TParamCommandComment *Command,
@@ -1002,7 +1000,7 @@
     BestIndex = CurrIndex;
   }
 }
-} // unnamed namespace
+} // end anonymous namespace
 
 unsigned Sema::correctTypoInParmVarReference(
                                     StringRef Typo,
@@ -1040,7 +1038,7 @@
   }
   return false;
 }
-} // unnamed namespace
+} // end anonymous namespace
 
 bool Sema::resolveTParamReference(
                             StringRef Name,
@@ -1067,7 +1065,7 @@
                                          Corrector);
   }
 }
-} // unnamed namespace
+} // end anonymous namespace
 
 StringRef Sema::correctTypoInTParamReference(
                             StringRef Typo,
@@ -1095,4 +1093,3 @@
 
 } // end namespace comments
 } // end namespace clang
-
diff --git a/lib/AST/Decl.cpp b/lib/AST/Decl.cpp
index 3a513e5..cfdd557 100644
--- a/lib/AST/Decl.cpp
+++ b/lib/AST/Decl.cpp
@@ -18,6 +18,7 @@
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
@@ -591,12 +592,14 @@
     if (Var->getStorageClass() == SC_Static)
       return LinkageInfo::internal();
 
-    // - a non-volatile object or reference that is explicitly declared const
-    //   or constexpr and neither explicitly declared extern nor previously
-    //   declared to have external linkage; or (there is no equivalent in C99)
+    // - a non-inline, non-volatile object or reference that is explicitly
+    //   declared const or constexpr and neither explicitly declared extern
+    //   nor previously declared to have external linkage; or (there is no
+    //   equivalent in C99)
     if (Context.getLangOpts().CPlusPlus &&
         Var->getType().isConstQualified() && 
-        !Var->getType().isVolatileQualified()) {
+        !Var->getType().isVolatileQualified() &&
+        !Var->isInline()) {
       const VarDecl *PrevVar = Var->getPreviousDecl();
       if (PrevVar)
         return getLVForDecl(PrevVar, computation);
@@ -1392,6 +1395,10 @@
   return clang::LinkageComputer::getLVForDecl(D, computation);
 }
 
+void NamedDecl::printName(raw_ostream &os) const {
+  os << Name;
+}
+
 std::string NamedDecl::getQualifiedNameAsString() const {
   std::string QualName;
   llvm::raw_string_ostream OS(QualName);
@@ -1421,16 +1428,13 @@
     Ctx = Ctx->getParent();
   }
 
-  for (ContextsTy::reverse_iterator I = Contexts.rbegin(), E = Contexts.rend();
-       I != E; ++I) {
-    if (const auto *Spec = dyn_cast<ClassTemplateSpecializationDecl>(*I)) {
+  for (const DeclContext *DC : reverse(Contexts)) {
+    if (const auto *Spec = dyn_cast<ClassTemplateSpecializationDecl>(DC)) {
       OS << Spec->getName();
       const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
-      TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                                            TemplateArgs.data(),
-                                                            TemplateArgs.size(),
-                                                            P);
-    } else if (const auto *ND = dyn_cast<NamespaceDecl>(*I)) {
+      TemplateSpecializationType::PrintTemplateArgumentList(
+          OS, TemplateArgs.asArray(), P);
+    } else if (const auto *ND = dyn_cast<NamespaceDecl>(DC)) {
       if (P.SuppressUnwrittenScope &&
           (ND->isAnonymousNamespace() || ND->isInline()))
         continue;
@@ -1440,12 +1444,12 @@
       }
       else
         OS << *ND;
-    } else if (const auto *RD = dyn_cast<RecordDecl>(*I)) {
+    } else if (const auto *RD = dyn_cast<RecordDecl>(DC)) {
       if (!RD->getIdentifier())
         OS << "(anonymous " << RD->getKindName() << ')';
       else
         OS << *RD;
-    } else if (const auto *FD = dyn_cast<FunctionDecl>(*I)) {
+    } else if (const auto *FD = dyn_cast<FunctionDecl>(DC)) {
       const FunctionProtoType *FT = nullptr;
       if (FD->hasWrittenPrototype())
         FT = dyn_cast<FunctionProtoType>(FD->getType()->castAs<FunctionType>());
@@ -1466,7 +1470,7 @@
         }
       }
       OS << ')';
-    } else if (const auto *ED = dyn_cast<EnumDecl>(*I)) {
+    } else if (const auto *ED = dyn_cast<EnumDecl>(DC)) {
       // C++ [dcl.enum]p10: Each enum-name and each unscoped
       // enumerator is declared in the scope that immediately contains
       // the enum-specifier. Each scoped enumerator is declared in the
@@ -1476,12 +1480,12 @@
       else
         continue;
     } else {
-      OS << *cast<NamedDecl>(*I);
+      OS << *cast<NamedDecl>(DC);
     }
     OS << "::";
   }
 
-  if (getDeclName())
+  if (getDeclName() || isa<DecompositionDecl>(this))
     OS << *this;
   else
     OS << "(anonymous)";
@@ -1912,7 +1916,9 @@
   // C++ [basic.def]p2:
   //   A declaration is a definition unless [...] it contains the 'extern'
   //   specifier or a linkage-specification and neither an initializer [...],
-  //   it declares a static data member in a class declaration [...].
+  //   it declares a non-inline static data member in a class declaration [...],
+  //   it declares a static data member outside a class definition and the variable
+  //   was defined within the class with the constexpr specifier [...],
   // C++1y [temp.expl.spec]p15:
   //   An explicit specialization of a static data member or an explicit
   //   specialization of a static data member template is a definition if the
@@ -1922,6 +1928,8 @@
   // a static data member template outside the containing class?
   if (isStaticDataMember()) {
     if (isOutOfLine() &&
+        !(getCanonicalDecl()->isInline() &&
+          getCanonicalDecl()->isConstexpr()) &&
         (hasInit() ||
          // If the first declaration is out-of-line, this may be an
          // instantiation of an out-of-line partial specialization of a variable
@@ -1932,6 +1940,8 @@
                     TSK_ExplicitSpecialization) ||
          isa<VarTemplatePartialSpecializationDecl>(this)))
       return Definition;
+    else if (!isOutOfLine() && isInline())
+      return Definition;
     else
       return DeclarationOnly;
   }
@@ -1945,7 +1955,7 @@
   if (hasInit())
     return Definition;
 
-  if (hasAttr<AliasAttr>())
+  if (hasDefiningAttr())
     return Definition;
 
   if (const auto *SAA = getAttr<SelectAnyAttr>())
@@ -2072,18 +2082,6 @@
   return false;
 }
 
-VarDecl *VarDecl::getOutOfLineDefinition() {
-  if (!isStaticDataMember())
-    return nullptr;
-
-  for (auto RD : redecls()) {
-    if (RD->getLexicalDeclContext()->isFileContext())
-      return RD;
-  }
-
-  return nullptr;
-}
-
 void VarDecl::setInit(Expr *I) {
   if (auto *Eval = Init.dyn_cast<EvaluatedStmt *>()) {
     Eval->~EvaluatedStmt();
@@ -2448,7 +2446,7 @@
   const TemplateArgumentList *TemplateArgs = getTemplateSpecializationArgs();
   if (TemplateArgs)
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, TemplateArgs->data(), TemplateArgs->size(), Policy);
+        OS, TemplateArgs->asArray(), Policy);
 }
 
 bool FunctionDecl::isVariadic() const {
@@ -2485,7 +2483,7 @@
 bool FunctionDecl::isDefined(const FunctionDecl *&Definition) const {
   for (auto I : redecls()) {
     if (I->IsDeleted || I->IsDefaulted || I->Body || I->IsLateTemplateParsed ||
-        I->hasAttr<AliasAttr>()) {
+        I->hasDefiningAttr()) {
       Definition = I->IsDeleted ? I->getCanonicalDecl() : I;
       return true;
     }
@@ -2659,9 +2657,14 @@
 }
 
 bool FunctionDecl::isNoReturn() const {
-  return hasAttr<NoReturnAttr>() || hasAttr<CXX11NoReturnAttr>() ||
-         hasAttr<C11NoReturnAttr>() ||
-         getType()->getAs<FunctionType>()->getNoReturnAttr();
+  if (hasAttr<NoReturnAttr>() || hasAttr<CXX11NoReturnAttr>() ||
+      hasAttr<C11NoReturnAttr>())
+    return true;
+
+  if (auto *FnTy = getType()->getAs<FunctionType>())
+    return FnTy->getNoReturnAttr();
+
+  return false;
 }
 
 void
@@ -2708,8 +2711,7 @@
     // declaration, for instance "extern "C" { namespace std { decl } }".
     if (!LinkageDecl) {
       if (BuiltinID == Builtin::BI__GetExceptionInfo &&
-          Context.getTargetInfo().getCXXABI().isMicrosoft() &&
-          isInStdNamespace())
+          Context.getTargetInfo().getCXXABI().isMicrosoft())
         return Builtin::BI__GetExceptionInfo;
       return 0;
     }
@@ -2733,6 +2735,12 @@
   if (getStorageClass() == SC_Static)
     return 0;
 
+  // OpenCL v1.2 s6.9.f - The library functions defined in
+  // the C99 standard headers are not available.
+  if (Context.getLangOpts().OpenCL &&
+      Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
+    return 0;
+
   return BuiltinID;
 }
 
@@ -2788,7 +2796,7 @@
     return getNumParams();
 
   unsigned NumRequiredArgs = 0;
-  for (auto *Param : params())
+  for (auto *Param : parameters())
     if (!Param->isParameterPack() && !Param->hasDefaultArg())
       ++NumRequiredArgs;
   return NumRequiredArgs;
@@ -2929,16 +2937,22 @@
   return RTRange;
 }
 
-bool FunctionDecl::hasUnusedResultAttr() const {
+const Attr *FunctionDecl::getUnusedResultAttr() const {
   QualType RetType = getReturnType();
   if (RetType->isRecordType()) {
     const CXXRecordDecl *Ret = RetType->getAsCXXRecordDecl();
     const auto *MD = dyn_cast<CXXMethodDecl>(this);
-    if (Ret && Ret->hasAttr<WarnUnusedResultAttr>() &&
-        !(MD && MD->getCorrespondingMethodInClass(Ret, true)))
-      return true;
+    if (Ret && !(MD && MD->getCorrespondingMethodInClass(Ret, true))) {
+      if (const auto *R = Ret->getAttr<WarnUnusedResultAttr>())
+        return R;
+    }
+  } else if (const auto *ET = RetType->getAs<EnumType>()) {
+    if (const EnumDecl *ED = ET->getDecl()) {
+      if (const auto *R = ED->getAttr<WarnUnusedResultAttr>())
+        return R;
+    }
   }
-  return hasAttr<WarnUnusedResultAttr>();
+  return getAttr<WarnUnusedResultAttr>();
 }
 
 /// \brief For an inline function definition in C, or for a gnu_inline function
@@ -3403,6 +3417,10 @@
   case Builtin::BIstrlen:
     return Builtin::BIstrlen;
 
+  case Builtin::BI__builtin_bzero:
+  case Builtin::BIbzero:
+    return Builtin::BIbzero;
+
   default:
     if (isExternC()) {
       if (FnInfo->isStr("memset"))
@@ -3425,12 +3443,28 @@
         return Builtin::BIstrndup;
       else if (FnInfo->isStr("strlen"))
         return Builtin::BIstrlen;
+      else if (FnInfo->isStr("bzero"))
+        return Builtin::BIbzero;
     }
     break;
   }
   return 0;
 }
 
+void FunctionDecl::addDeferredDiag(PartialDiagnosticAt PD) {
+  getASTContext().getDeferredDiags()[this].push_back(std::move(PD));
+}
+
+std::vector<PartialDiagnosticAt> FunctionDecl::takeDeferredDiags() const {
+  auto &DD = getASTContext().getDeferredDiags();
+  auto It = DD.find(this);
+  if (It == DD.end())
+    return {};
+  auto Ret = std::move(It->second);
+  DD.erase(It);
+  return Ret;
+}
+
 //===----------------------------------------------------------------------===//
 // FieldDecl Implementation
 //===----------------------------------------------------------------------===//
@@ -3676,6 +3710,21 @@
     MSI->setPointOfInstantiation(PointOfInstantiation);
 }
 
+EnumDecl *EnumDecl::getTemplateInstantiationPattern() const {
+  if (MemberSpecializationInfo *MSInfo = getMemberSpecializationInfo()) {
+    if (isTemplateInstantiation(MSInfo->getTemplateSpecializationKind())) {
+      EnumDecl *ED = getInstantiatedFromMemberEnum();
+      while (auto *NewED = ED->getInstantiatedFromMemberEnum())
+        ED = NewED;
+      return ED;
+    }
+  }
+
+  assert(!isTemplateInstantiation(getTemplateSpecializationKind()) &&
+         "couldn't find pattern for enum instantiation");
+  return nullptr;
+}
+
 EnumDecl *EnumDecl::getInstantiatedFromMemberEnum() const {
   if (SpecializationInfo)
     return cast<EnumDecl>(SpecializationInfo->getInstantiatedFrom());
@@ -3901,6 +3950,53 @@
   return new (C, (DeclContext *)nullptr) TranslationUnitDecl(C);
 }
 
+void PragmaCommentDecl::anchor() { }
+
+PragmaCommentDecl *PragmaCommentDecl::Create(const ASTContext &C,
+                                             TranslationUnitDecl *DC,
+                                             SourceLocation CommentLoc,
+                                             PragmaMSCommentKind CommentKind,
+                                             StringRef Arg) {
+  PragmaCommentDecl *PCD =
+      new (C, DC, additionalSizeToAlloc<char>(Arg.size() + 1))
+          PragmaCommentDecl(DC, CommentLoc, CommentKind);
+  memcpy(PCD->getTrailingObjects<char>(), Arg.data(), Arg.size());
+  PCD->getTrailingObjects<char>()[Arg.size()] = '\0';
+  return PCD;
+}
+
+PragmaCommentDecl *PragmaCommentDecl::CreateDeserialized(ASTContext &C,
+                                                         unsigned ID,
+                                                         unsigned ArgSize) {
+  return new (C, ID, additionalSizeToAlloc<char>(ArgSize + 1))
+      PragmaCommentDecl(nullptr, SourceLocation(), PCK_Unknown);
+}
+
+void PragmaDetectMismatchDecl::anchor() { }
+
+PragmaDetectMismatchDecl *
+PragmaDetectMismatchDecl::Create(const ASTContext &C, TranslationUnitDecl *DC,
+                                 SourceLocation Loc, StringRef Name,
+                                 StringRef Value) {
+  size_t ValueStart = Name.size() + 1;
+  PragmaDetectMismatchDecl *PDMD =
+      new (C, DC, additionalSizeToAlloc<char>(ValueStart + Value.size() + 1))
+          PragmaDetectMismatchDecl(DC, Loc, ValueStart);
+  memcpy(PDMD->getTrailingObjects<char>(), Name.data(), Name.size());
+  PDMD->getTrailingObjects<char>()[Name.size()] = '\0';
+  memcpy(PDMD->getTrailingObjects<char>() + ValueStart, Value.data(),
+         Value.size());
+  PDMD->getTrailingObjects<char>()[ValueStart + Value.size()] = '\0';
+  return PDMD;
+}
+
+PragmaDetectMismatchDecl *
+PragmaDetectMismatchDecl::CreateDeserialized(ASTContext &C, unsigned ID,
+                                             unsigned NameValueSize) {
+  return new (C, ID, additionalSizeToAlloc<char>(NameValueSize + 1))
+      PragmaDetectMismatchDecl(nullptr, SourceLocation(), 0);
+}
+
 void ExternCContextDecl::anchor() { }
 
 ExternCContextDecl *ExternCContextDecl::Create(const ASTContext &C,
@@ -4027,8 +4123,10 @@
 
 IndirectFieldDecl::IndirectFieldDecl(ASTContext &C, DeclContext *DC,
                                      SourceLocation L, DeclarationName N,
-                                     QualType T, NamedDecl **CH, unsigned CHS)
-    : ValueDecl(IndirectField, DC, L, N, T), Chaining(CH), ChainingSize(CHS) {
+                                     QualType T,
+                                     MutableArrayRef<NamedDecl *> CH)
+    : ValueDecl(IndirectField, DC, L, N, T), Chaining(CH.data()),
+      ChainingSize(CH.size()) {
   // In C++, indirect field declarations conflict with tag declarations in the
   // same scope, so add them to IDNS_Tag so that tag redeclaration finds them.
   if (C.getLangOpts().CPlusPlus)
@@ -4037,16 +4135,15 @@
 
 IndirectFieldDecl *
 IndirectFieldDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L,
-                          IdentifierInfo *Id, QualType T, NamedDecl **CH,
-                          unsigned CHS) {
-  return new (C, DC) IndirectFieldDecl(C, DC, L, Id, T, CH, CHS);
+                          IdentifierInfo *Id, QualType T,
+                          llvm::MutableArrayRef<NamedDecl *> CH) {
+  return new (C, DC) IndirectFieldDecl(C, DC, L, Id, T, CH);
 }
 
 IndirectFieldDecl *IndirectFieldDecl::CreateDeserialized(ASTContext &C,
                                                          unsigned ID) {
   return new (C, ID) IndirectFieldDecl(C, nullptr, SourceLocation(),
-                                       DeclarationName(), QualType(), nullptr,
-                                       0);
+                                       DeclarationName(), QualType(), None);
 }
 
 SourceRange EnumConstantDecl::getSourceRange() const {
diff --git a/lib/AST/DeclBase.cpp b/lib/AST/DeclBase.cpp
index e9fc0f0..8342c0f 100644
--- a/lib/AST/DeclBase.cpp
+++ b/lib/AST/DeclBase.cpp
@@ -28,7 +28,6 @@
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/TargetInfo.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 using namespace clang;
@@ -46,7 +45,7 @@
 }
 
 #define DECL(DERIVED, BASE)                                                    \
-  static_assert(Decl::DeclObjAlignment >=                                      \
+  static_assert(llvm::AlignOf<Decl>::Alignment >=                              \
                     llvm::AlignOf<DERIVED##Decl>::Alignment,                   \
                 "Alignment sufficient after objects prepended to " #DERIVED);
 #define ABSTRACT_DECL(DECL)
@@ -56,7 +55,7 @@
                          unsigned ID, std::size_t Extra) {
   // Allocate an extra 8 bytes worth of storage, which ensures that the
   // resulting pointer will still be 8-byte aligned.
-  static_assert(sizeof(unsigned) * 2 >= DeclObjAlignment,
+  static_assert(sizeof(unsigned) * 2 >= llvm::AlignOf<Decl>::Alignment,
                 "Decl won't be misaligned");
   void *Start = Context.Allocate(Size + Extra + 8);
   void *Result = (char*)Start + 8;
@@ -81,7 +80,8 @@
     // Ensure required alignment of the resulting object by adding extra
     // padding at the start if required.
     size_t ExtraAlign =
-        llvm::OffsetToAlignment(sizeof(Module *), DeclObjAlignment);
+        llvm::OffsetToAlignment(sizeof(Module *),
+                                llvm::AlignOf<Decl>::Alignment);
     char *Buffer = reinterpret_cast<char *>(
         ::operator new(ExtraAlign + sizeof(Module *) + Size + Extra, Ctx));
     Buffer += ExtraAlign;
@@ -196,6 +196,17 @@
   return isa<TemplateDecl>(this);
 }
 
+TemplateDecl *Decl::getDescribedTemplate() const {
+  if (auto *FD = dyn_cast<FunctionDecl>(this))
+    return FD->getDescribedFunctionTemplate();
+  else if (auto *RD = dyn_cast<CXXRecordDecl>(this))
+    return RD->getDescribedClassTemplate();
+  else if (auto *VD = dyn_cast<VarDecl>(this))
+    return VD->getDescribedVarTemplate();
+
+  return nullptr;
+}
+
 const DeclContext *Decl::getParentFunctionOrMethod() const {
   for (const DeclContext *DC = getDeclContext();
        DC && !DC->isTranslationUnit() && !DC->isNamespace(); 
@@ -329,25 +340,29 @@
   return Align;
 }
 
-bool Decl::isUsed(bool CheckUsedAttr) const { 
-  if (Used)
-    return true;
-  
-  // Check for used attribute.
-  if (CheckUsedAttr && hasAttr<UsedAttr>())
+bool Decl::isUsed(bool CheckUsedAttr) const {
+  const Decl *CanonD = getCanonicalDecl();
+  if (CanonD->Used)
     return true;
 
-  return false; 
+  // Check for used attribute.
+  // Ask the most recent decl, since attributes accumulate in the redecl chain.
+  if (CheckUsedAttr && getMostRecentDecl()->hasAttr<UsedAttr>())
+    return true;
+
+  // The information may not have been deserialized yet. Force deserialization
+  // to complete the needed information.
+  return getMostRecentDecl()->getCanonicalDecl()->Used;
 }
 
 void Decl::markUsed(ASTContext &C) {
-  if (Used)
+  if (isUsed(false))
     return;
 
   if (C.getASTMutationListener())
     C.getASTMutationListener()->DeclarationMarkedUsed(this);
 
-  Used = true;
+  setIsUsed();
 }
 
 bool Decl::isReferenced() const { 
@@ -362,6 +377,18 @@
   return false; 
 }
 
+bool Decl::hasDefiningAttr() const {
+  return hasAttr<AliasAttr>() || hasAttr<IFuncAttr>();
+}
+
+const Attr *Decl::getDefiningAttr() const {
+  if (AliasAttr *AA = getAttr<AliasAttr>())
+    return AA;
+  if (IFuncAttr *IFA = getAttr<IFuncAttr>())
+    return IFA;
+  return nullptr;
+}
+
 /// \brief Determine the availability of the given declaration based on
 /// the target platform.
 ///
@@ -371,16 +398,14 @@
 ///
 /// FIXME: Make these strings localizable, since they end up in
 /// diagnostics.
-static AvailabilityResult
-checkAvailability(ASTContext &Context, const AvailabilityAttr *A,
-                  Optional<VersionTuple> Version, std::string *Message) {
-  VersionTuple TargetMinVersion;
-  if (Version.hasValue())
-    TargetMinVersion = Version.getValue();
-  else
-    TargetMinVersion = Context.getTargetInfo().getPlatformMinVersion();
+static AvailabilityResult CheckAvailability(ASTContext &Context,
+                                            const AvailabilityAttr *A,
+                                            std::string *Message,
+                                            VersionTuple EnclosingVersion) {
+  if (EnclosingVersion.empty())
+    EnclosingVersion = Context.getTargetInfo().getPlatformMinVersion();
 
-  if (TargetMinVersion.empty())
+  if (EnclosingVersion.empty())
     return AR_Available;
 
   // Check if this is an App Extension "platform", and if so chop off
@@ -425,7 +450,7 @@
 
   // Make sure that this declaration has already been introduced.
   if (!A->getIntroduced().empty() && 
-      TargetMinVersion < A->getIntroduced()) {
+      EnclosingVersion < A->getIntroduced()) {
     if (Message) {
       Message->clear();
       llvm::raw_string_ostream Out(*Message);
@@ -439,7 +464,7 @@
   }
 
   // Make sure that this declaration hasn't been obsoleted.
-  if (!A->getObsoleted().empty() && TargetMinVersion >= A->getObsoleted()) {
+  if (!A->getObsoleted().empty() && EnclosingVersion >= A->getObsoleted()) {
     if (Message) {
       Message->clear();
       llvm::raw_string_ostream Out(*Message);
@@ -453,7 +478,7 @@
   }
 
   // Make sure that this declaration hasn't been deprecated.
-  if (!A->getDeprecated().empty() && TargetMinVersion >= A->getDeprecated()) {
+  if (!A->getDeprecated().empty() && EnclosingVersion >= A->getDeprecated()) {
     if (Message) {
       Message->clear();
       llvm::raw_string_ostream Out(*Message);
@@ -470,9 +495,9 @@
 }
 
 AvailabilityResult Decl::getAvailability(std::string *Message,
-                                         Optional<VersionTuple> Version) const {
+                                         VersionTuple EnclosingVersion) const {
   if (auto *FTD = dyn_cast<FunctionTemplateDecl>(this))
-    return FTD->getTemplatedDecl()->getAvailability(Message, Version);
+    return FTD->getTemplatedDecl()->getAvailability(Message, EnclosingVersion);
 
   AvailabilityResult Result = AR_Available;
   std::string ResultMessage;
@@ -496,8 +521,8 @@
     }
 
     if (const auto *Availability = dyn_cast<AvailabilityAttr>(A)) {
-      AvailabilityResult AR = checkAvailability(getASTContext(), Availability,
-                                                Version, Message);
+      AvailabilityResult AR = CheckAvailability(getASTContext(), Availability,
+                                                Message, EnclosingVersion);
 
       if (AR == AR_Unavailable)
         return AR_Unavailable;
@@ -556,8 +581,8 @@
       return true;
 
     if (const auto *Availability = dyn_cast<AvailabilityAttr>(A)) {
-      if (checkAvailability(getASTContext(), Availability, None,
-                            nullptr) == AR_NotYetIntroduced)
+      if (CheckAvailability(getASTContext(), Availability, nullptr,
+                            VersionTuple()) == AR_NotYetIntroduced)
         return true;
     }
   }
@@ -570,10 +595,12 @@
     case Function:
     case CXXMethod:
     case CXXConstructor:
+    case ConstructorUsingShadow:
     case CXXDestructor:
     case CXXConversion:
     case EnumConstant:
     case Var:
+    case Binding:
     case ImplicitParam:
     case ParmVar:
     case ObjCMethod:
@@ -637,6 +664,9 @@
     case TemplateTemplateParm:
       return IDNS_Ordinary | IDNS_Tag | IDNS_Type;
 
+    case OMPDeclareReduction:
+      return IDNS_OMPReduction;
+
     // Never have names.
     case Friend:
     case FriendTemplate:
@@ -645,10 +675,13 @@
     case FileScopeAsm:
     case StaticAssert:
     case ObjCPropertyImpl:
+    case PragmaComment:
+    case PragmaDetectMismatch:
     case Block:
     case Captured:
     case TranslationUnit:
     case ExternCContext:
+    case Decomposition:
 
     case UsingDirective:
     case BuiltinTemplate:
@@ -662,6 +695,7 @@
     case ObjCCategoryImpl:
     case Import:
     case OMPThreadPrivate:
+    case OMPCapturedExpr:
     case Empty:
       // Never looked up by name.
       return 0;
@@ -964,6 +998,7 @@
   case Decl::LinkageSpec:
   case Decl::Block:
   case Decl::Captured:
+  case Decl::OMPDeclareReduction:
     // There is only one DeclContext for these entities.
     return this;
 
diff --git a/lib/AST/DeclCXX.cpp b/lib/AST/DeclCXX.cpp
index 4f24fdc..7395db5 100644
--- a/lib/AST/DeclCXX.cpp
+++ b/lib/AST/DeclCXX.cpp
@@ -46,34 +46,33 @@
 }
 
 CXXRecordDecl::DefinitionData::DefinitionData(CXXRecordDecl *D)
-  : UserDeclaredConstructor(false), UserDeclaredSpecialMembers(0),
-    Aggregate(true), PlainOldData(true), Empty(true), Polymorphic(false),
-    Abstract(false), IsStandardLayout(true), HasNoNonEmptyBases(true),
-    HasPrivateFields(false), HasProtectedFields(false), HasPublicFields(false),
-    HasMutableFields(false), HasVariantMembers(false), HasOnlyCMembers(true),
-    HasInClassInitializer(false), HasUninitializedReferenceMember(false),
-    NeedOverloadResolutionForMoveConstructor(false),
-    NeedOverloadResolutionForMoveAssignment(false),
-    NeedOverloadResolutionForDestructor(false),
-    DefaultedMoveConstructorIsDeleted(false),
-    DefaultedMoveAssignmentIsDeleted(false),
-    DefaultedDestructorIsDeleted(false),
-    HasTrivialSpecialMembers(SMF_All),
-    DeclaredNonTrivialSpecialMembers(0),
-    HasIrrelevantDestructor(true),
-    HasConstexprNonCopyMoveConstructor(false),
-    DefaultedDefaultConstructorIsConstexpr(true),
-    HasConstexprDefaultConstructor(false),
-    HasNonLiteralTypeFieldsOrBases(false), ComputedVisibleConversions(false),
-    UserProvidedDefaultConstructor(false), DeclaredSpecialMembers(0),
-    ImplicitCopyConstructorHasConstParam(true),
-    ImplicitCopyAssignmentHasConstParam(true),
-    HasDeclaredCopyConstructorWithConstParam(false),
-    HasDeclaredCopyAssignmentWithConstParam(false),
-    IsLambda(false), IsParsingBaseSpecifiers(false), NumBases(0), NumVBases(0),
-    Bases(), VBases(),
-    Definition(D), FirstFriend() {
-}
+    : UserDeclaredConstructor(false), UserDeclaredSpecialMembers(0),
+      Aggregate(true), PlainOldData(true), Empty(true), Polymorphic(false),
+      Abstract(false), IsStandardLayout(true), HasNoNonEmptyBases(true),
+      HasPrivateFields(false), HasProtectedFields(false),
+      HasPublicFields(false), HasMutableFields(false), HasVariantMembers(false),
+      HasOnlyCMembers(true), HasInClassInitializer(false),
+      HasUninitializedReferenceMember(false), HasUninitializedFields(false),
+      HasInheritedConstructor(false), HasInheritedAssignment(false),
+      NeedOverloadResolutionForMoveConstructor(false),
+      NeedOverloadResolutionForMoveAssignment(false),
+      NeedOverloadResolutionForDestructor(false),
+      DefaultedMoveConstructorIsDeleted(false),
+      DefaultedMoveAssignmentIsDeleted(false),
+      DefaultedDestructorIsDeleted(false), HasTrivialSpecialMembers(SMF_All),
+      DeclaredNonTrivialSpecialMembers(0), HasIrrelevantDestructor(true),
+      HasConstexprNonCopyMoveConstructor(false),
+      HasDefaultedDefaultConstructor(false),
+      DefaultedDefaultConstructorIsConstexpr(true),
+      HasConstexprDefaultConstructor(false),
+      HasNonLiteralTypeFieldsOrBases(false), ComputedVisibleConversions(false),
+      UserProvidedDefaultConstructor(false), DeclaredSpecialMembers(0),
+      ImplicitCopyConstructorHasConstParam(true),
+      ImplicitCopyAssignmentHasConstParam(true),
+      HasDeclaredCopyConstructorWithConstParam(false),
+      HasDeclaredCopyAssignmentWithConstParam(false), IsLambda(false),
+      IsParsingBaseSpecifiers(false), NumBases(0), NumVBases(0), Bases(),
+      VBases(), Definition(D), FirstFriend() {}
 
 CXXBaseSpecifier *CXXRecordDecl::DefinitionData::getBasesSlowCase() const {
   return Bases.get(Definition->getASTContext().getExternalSource());
@@ -89,7 +88,7 @@
                              CXXRecordDecl *PrevDecl)
     : RecordDecl(K, TK, C, DC, StartLoc, IdLoc, Id, PrevDecl),
       DefinitionData(PrevDecl ? PrevDecl->DefinitionData
-                              : DefinitionDataPtr(this)),
+                              : nullptr),
       TemplateOrInstantiation() {}
 
 CXXRecordDecl *CXXRecordDecl::Create(const ASTContext &C, TagKind TK,
@@ -143,9 +142,11 @@
     C.Deallocate(data().getBases());
 
   if (NumBases) {
-    // C++ [dcl.init.aggr]p1:
-    //   An aggregate is [...] a class with [...] no base classes [...].
-    data().Aggregate = false;
+    if (!C.getLangOpts().CPlusPlus1z) {
+      // C++ [dcl.init.aggr]p1:
+      //   An aggregate is [...] a class with [...] no base classes [...].
+      data().Aggregate = false;
+    }
 
     // C++ [class]p4:
     //   A POD-struct is an aggregate class...
@@ -171,8 +172,6 @@
     CXXRecordDecl *BaseClassDecl
       = cast<CXXRecordDecl>(BaseType->getAs<RecordType>()->getDecl());
 
-    // A class with a non-empty base class is not empty.
-    // FIXME: Standard ref?
     if (!BaseClassDecl->isEmpty()) {
       if (!data().Empty) {
         // C++0x [class]p7:
@@ -186,10 +185,18 @@
         data().IsStandardLayout = false;
       }
 
+      // C++14 [meta.unary.prop]p4:
+      //   T is a class type [...] with [...] no base class B for which
+      //   is_empty<B>::value is false.
       data().Empty = false;
       data().HasNoNonEmptyBases = false;
     }
     
+    // C++1z [dcl.init.aggr]p1:
+    //   An aggregate is a class with [...] no private or protected base classes
+    if (Base->getAccessSpecifier() != AS_public)
+      data().Aggregate = false;
+
     // C++ [class.virtual]p1:
     //   A class that declares or inherits a virtual function is called a 
     //   polymorphic class.
@@ -220,6 +227,10 @@
         if (CXXRecordDecl *VBaseDecl = VBase.getType()->getAsCXXRecordDecl())
           if (!VBaseDecl->hasCopyConstructorWithConstParam())
             data().ImplicitCopyConstructorHasConstParam = false;
+
+        // C++1z [dcl.init.aggr]p1:
+        //   An aggregate is a class with [...] no virtual base classes
+        data().Aggregate = false;
       }
     }
 
@@ -228,11 +239,15 @@
       if (SeenVBaseTypes.insert(C.getCanonicalType(BaseType)).second)
         VBases.push_back(Base);
 
-      // C++0x [meta.unary.prop] is_empty:
-      //    T is a class type, but not a union type, with ... no virtual base
-      //    classes
+      // C++14 [meta.unary.prop] is_empty:
+      //   T is a class type, but not a union type, with ... no virtual base
+      //   classes
       data().Empty = false;
 
+      // C++1z [dcl.init.aggr]p1:
+      //   An aggregate is a class with [...] no virtual base classes
+      data().Aggregate = false;
+
       // C++11 [class.ctor]p5, C++11 [class.copy]p12, C++11 [class.copy]p25:
       //   A [default constructor, copy/move constructor, or copy/move assignment
       //   operator for a class X] is trivial [...] if:
@@ -332,6 +347,9 @@
     if (BaseClassDecl->hasUninitializedReferenceMember())
       data().HasUninitializedReferenceMember = true;
 
+    if (!BaseClassDecl->allowConstDefaultInit())
+      data().HasUninitializedFields = true;
+
     addedClassSubobject(BaseClassDecl);
   }
   
@@ -430,6 +448,15 @@
   FunctionTemplateDecl *FunTmpl = dyn_cast<FunctionTemplateDecl>(D);
   if (FunTmpl)
     D = FunTmpl->getTemplatedDecl();
+
+  // FIXME: Pass NamedDecl* to addedMember?
+  Decl *DUnderlying = D;
+  if (auto *ND = dyn_cast<NamedDecl>(DUnderlying)) {
+    DUnderlying = ND->getUnderlyingDecl();
+    if (FunctionTemplateDecl *UnderlyingFunTmpl =
+            dyn_cast<FunctionTemplateDecl>(DUnderlying))
+      DUnderlying = UnderlyingFunTmpl->getTemplatedDecl();
+  }
   
   if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
     if (Method->isVirtual()) {
@@ -441,8 +468,8 @@
       //   A POD-struct is an aggregate class...
       data().PlainOldData = false;
       
-      // Virtual functions make the class non-empty.
-      // FIXME: Standard ref?
+      // C++14 [meta.unary.prop]p4:
+      //   T is a class type [...] with [...] no virtual member functions...
       data().Empty = false;
 
       // C++ [class.virtual]p1:
@@ -485,18 +512,15 @@
       data().PlainOldData = false;
     }
 
-    // Technically, "user-provided" is only defined for special member
-    // functions, but the intent of the standard is clearly that it should apply
-    // to all functions.
-    bool UserProvided = Constructor->isUserProvided();
-
     if (Constructor->isDefaultConstructor()) {
       SMKind |= SMF_DefaultConstructor;
 
-      if (UserProvided)
+      if (Constructor->isUserProvided())
         data().UserProvidedDefaultConstructor = true;
       if (Constructor->isConstexpr())
         data().HasConstexprDefaultConstructor = true;
+      if (Constructor->isDefaulted())
+        data().HasDefaultedDefaultConstructor = true;
     }
 
     if (!FunTmpl) {
@@ -509,9 +533,17 @@
       } else if (Constructor->isMoveConstructor())
         SMKind |= SMF_MoveConstructor;
     }
+  }
 
+  // Handle constructors, including those inherited from base classes.
+  if (CXXConstructorDecl *Constructor =
+          dyn_cast<CXXConstructorDecl>(DUnderlying)) {
     // Record if we see any constexpr constructors which are neither copy
     // nor move constructors.
+    // C++1z [basic.types]p10:
+    //   [...] has at least one constexpr constructor or constructor template
+    //   (possibly inherited from a base class) that is not a copy or move
+    //   constructor [...]
     if (Constructor->isConstexpr() && !Constructor->isCopyOrMoveConstructor())
       data().HasConstexprNonCopyMoveConstructor = true;
 
@@ -521,8 +553,12 @@
     // C++11 [dcl.init.aggr]p1:
     //   An aggregate is an array or a class with no user-provided
     //   constructors [...].
+    // C++11 [dcl.init.aggr]p1:
+    //   An aggregate is an array or a class with no user-provided
+    //   constructors (including those inherited from a base class) [...].
     if (getASTContext().getLangOpts().CPlusPlus11
-          ? UserProvided : !Constructor->isImplicit())
+            ? Constructor->isUserProvided()
+            : !Constructor->isImplicit())
       data().Aggregate = false;
   }
 
@@ -702,6 +738,15 @@
       data().IsStandardLayout = false;
     }
 
+    if (!Field->hasInClassInitializer() && !Field->isMutable()) {
+      if (CXXRecordDecl *FieldType = Field->getType()->getAsCXXRecordDecl()) {
+        if (FieldType->hasDefinition() && !FieldType->allowConstDefaultInit())
+          data().HasUninitializedFields = true;
+      } else {
+        data().HasUninitializedFields = true;
+      }
+    }
+
     // Record if this field is the first non-literal or volatile field or base.
     if (!T->isLiteralType(Context) || T.isVolatileQualified())
       data().HasNonLiteralTypeFieldsOrBases = true;
@@ -720,7 +765,7 @@
       //   An aggregate is a [...] class with [...] no
       //   brace-or-equal-initializers for non-static data members.
       //
-      // This rule was removed in C++1y.
+      // This rule was removed in C++14.
       if (!getASTContext().getLangOpts().CPlusPlus14)
         data().Aggregate = false;
 
@@ -762,6 +807,17 @@
             data().DefaultedDestructorIsDeleted = true;
         }
 
+        // For an anonymous union member, our overload resolution will perform
+        // overload resolution for its members.
+        if (Field->isAnonymousStructOrUnion()) {
+          data().NeedOverloadResolutionForMoveConstructor |=
+              FieldRec->data().NeedOverloadResolutionForMoveConstructor;
+          data().NeedOverloadResolutionForMoveAssignment |=
+              FieldRec->data().NeedOverloadResolutionForMoveAssignment;
+          data().NeedOverloadResolutionForDestructor |=
+              FieldRec->data().NeedOverloadResolutionForDestructor;
+        }
+
         // C++0x [class.ctor]p5:
         //   A default constructor is trivial [...] if:
         //    -- for all the non-static data members of its class that are of
@@ -910,7 +966,9 @@
     if (!data().HasNoNonEmptyBases)
       data().IsStandardLayout = false;
 
-    // If this is not a zero-length bit-field, then the class is not empty.
+    // C++14 [meta.unary.prop]p4:
+    //   T is a class type [...] with [...] no non-static data members other
+    //   than bit-fields of length 0...
     if (data().Empty) {
       if (!Field->isBitField() ||
           (!Field->getBitWidth()->isTypeDependent() &&
@@ -928,6 +986,15 @@
       data().Conversions.get(Ctx).addDecl(Ctx, Shadow, Shadow->getAccess());
     }
   }
+
+  if (UsingDecl *Using = dyn_cast<UsingDecl>(D)) {
+    if (Using->getDeclName().getNameKind() ==
+        DeclarationName::CXXConstructorName)
+      data().HasInheritedConstructor = true;
+
+    if (Using->getDeclName().getCXXOverloadedOperator() == OO_Equal)
+      data().HasInheritedAssignment = true;
+  }
 }
 
 void CXXRecordDecl::finishedDefaultedOrDeletedMember(CXXMethodDecl *D) {
@@ -1595,6 +1662,13 @@
   return getASTContext().overridden_methods_size(this);
 }
 
+CXXMethodDecl::overridden_method_range
+CXXMethodDecl::overridden_methods() const {
+  if (!isa<CXXConstructorDecl>(this))
+    return getASTContext().overridden_methods(this);
+  return overridden_method_range(nullptr, nullptr); // constructor: null range
+}
+
 QualType CXXMethodDecl::getThisType(ASTContext &C) const {
   // C++ 9.3.2p1: The type of this in a member function of a class X is X*.
   // If the member function is declared const, the type of this is const X*,
@@ -1606,7 +1680,7 @@
 
   QualType ClassTy = C.getTypeDeclType(getParent());
   ClassTy = C.getQualifiedType(ClassTy,
-                               Qualifiers::fromCVRMask(getTypeQualifiers()));
+                               Qualifiers::fromCVRUMask(getTypeQualifiers()));
   return C.getPointerType(ClassTy);
 }
 
@@ -1744,11 +1818,15 @@
 
 void CXXConstructorDecl::anchor() { }
 
-CXXConstructorDecl *
-CXXConstructorDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
-  return new (C, ID) CXXConstructorDecl(C, nullptr, SourceLocation(),
-                                        DeclarationNameInfo(), QualType(),
-                                        nullptr, false, false, false, false);
+CXXConstructorDecl *CXXConstructorDecl::CreateDeserialized(ASTContext &C,
+                                                           unsigned ID,
+                                                           bool Inherited) {
+  unsigned Extra = additionalSizeToAlloc<InheritedConstructor>(Inherited);
+  auto *Result = new (C, ID, Extra) CXXConstructorDecl(
+      C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr,
+      false, false, false, false, InheritedConstructor());
+  Result->IsInheritingConstructor = Inherited;
+  return Result;
 }
 
 CXXConstructorDecl *
@@ -1757,13 +1835,16 @@
                            const DeclarationNameInfo &NameInfo,
                            QualType T, TypeSourceInfo *TInfo,
                            bool isExplicit, bool isInline,
-                           bool isImplicitlyDeclared, bool isConstexpr) {
+                           bool isImplicitlyDeclared, bool isConstexpr,
+                           InheritedConstructor Inherited) {
   assert(NameInfo.getName().getNameKind()
          == DeclarationName::CXXConstructorName &&
          "Name must refer to a constructor");
-  return new (C, RD) CXXConstructorDecl(C, RD, StartLoc, NameInfo, T, TInfo,
-                                        isExplicit, isInline,
-                                        isImplicitlyDeclared, isConstexpr);
+  unsigned Extra =
+      additionalSizeToAlloc<InheritedConstructor>(Inherited ? 1 : 0);
+  return new (C, RD, Extra) CXXConstructorDecl(
+      C, RD, StartLoc, NameInfo, T, TInfo, isExplicit, isInline,
+      isImplicitlyDeclared, isConstexpr, Inherited);
 }
 
 CXXConstructorDecl::init_const_iterator CXXConstructorDecl::init_begin() const {
@@ -1878,23 +1959,6 @@
   return true;  
 }
 
-const CXXConstructorDecl *CXXConstructorDecl::getInheritedConstructor() const {
-  // Hack: we store the inherited constructor in the overridden method table
-  method_iterator It = getASTContext().overridden_methods_begin(this);
-  if (It == getASTContext().overridden_methods_end(this))
-    return nullptr;
-
-  return cast<CXXConstructorDecl>(*It);
-}
-
-void
-CXXConstructorDecl::setInheritedConstructor(const CXXConstructorDecl *BaseCtor){
-  // Hack: we store the inherited constructor in the overridden method table
-  assert(getASTContext().overridden_methods_size(this) == 0 &&
-         "Base ctor already set.");
-  getASTContext().addOverriddenMethod(this, BaseCtor);
-}
-
 void CXXDestructorDecl::anchor() { }
 
 CXXDestructorDecl *
@@ -2090,10 +2154,24 @@
 
 void UsingShadowDecl::anchor() { }
 
+UsingShadowDecl::UsingShadowDecl(Kind K, ASTContext &C, DeclContext *DC,
+                                 SourceLocation Loc, UsingDecl *Using,
+                                 NamedDecl *Target)
+    : NamedDecl(K, DC, Loc, Using ? Using->getDeclName() : DeclarationName()),
+      redeclarable_base(C), Underlying(Target),
+      UsingOrNextShadow(cast_or_null<NamedDecl>(Using)) { // Using may be null
+  if (Target)
+    IdentifierNamespace = Target->getIdentifierNamespace();
+  setImplicit();
+}
+
+UsingShadowDecl::UsingShadowDecl(Kind K, ASTContext &C, EmptyShell Empty)
+    : NamedDecl(K, nullptr, SourceLocation(), DeclarationName()),
+      redeclarable_base(C), Underlying(), UsingOrNextShadow() {}
+
 UsingShadowDecl *
 UsingShadowDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
-  return new (C, ID) UsingShadowDecl(C, nullptr, SourceLocation(),
-                                     nullptr, nullptr);
+  return new (C, ID) UsingShadowDecl(UsingShadow, C, EmptyShell());
 }
 
 UsingDecl *UsingShadowDecl::getUsingDecl() const {
@@ -2104,6 +2182,25 @@
   return cast<UsingDecl>(Shadow->UsingOrNextShadow);
 }
 
+void ConstructorUsingShadowDecl::anchor() { }
+
+ConstructorUsingShadowDecl *
+ConstructorUsingShadowDecl::Create(ASTContext &C, DeclContext *DC,
+                                   SourceLocation Loc, UsingDecl *Using,
+                                   NamedDecl *Target, bool IsVirtual) {
+  return new (C, DC) ConstructorUsingShadowDecl(C, DC, Loc, Using, Target,
+                                                IsVirtual);
+}
+
+ConstructorUsingShadowDecl *
+ConstructorUsingShadowDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
+  return new (C, ID) ConstructorUsingShadowDecl(C, EmptyShell());
+}
+
+CXXRecordDecl *ConstructorUsingShadowDecl::getNominatedBaseClass() const {
+  return getUsingDecl()->getQualifier()->getAsRecordDecl();
+}
+
 void UsingDecl::anchor() { }
 
 void UsingDecl::addShadowDecl(UsingShadowDecl *S) {
@@ -2220,6 +2317,70 @@
                                       nullptr, SourceLocation(), false);
 }
 
+void BindingDecl::anchor() {}
+
+BindingDecl *BindingDecl::Create(ASTContext &C, DeclContext *DC,
+                                 SourceLocation IdLoc, IdentifierInfo *Id) {
+  return new (C, DC) BindingDecl(DC, IdLoc, Id);
+}
+
+BindingDecl *BindingDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
+  return new (C, ID) BindingDecl(nullptr, SourceLocation(), nullptr);
+}
+
+VarDecl *BindingDecl::getHoldingVar() const {
+  Expr *B = getBinding();
+  if (!B)
+    return nullptr;
+  auto *DRE = dyn_cast<DeclRefExpr>(B->IgnoreImplicit());
+  if (!DRE)
+    return nullptr;
+  auto *VD = dyn_cast<VarDecl>(DRE->getDecl()); // null if not a VarDecl
+  assert((!VD || VD->isImplicit()) &&
+         "holding var for binding decl not implicit");
+  return VD;
+}
+
+void DecompositionDecl::anchor() {}
+
+DecompositionDecl *DecompositionDecl::Create(ASTContext &C, DeclContext *DC,
+                                             SourceLocation StartLoc,
+                                             SourceLocation LSquareLoc,
+                                             QualType T, TypeSourceInfo *TInfo,
+                                             StorageClass SC,
+                                             ArrayRef<BindingDecl *> Bindings) {
+  size_t Extra = additionalSizeToAlloc<BindingDecl *>(Bindings.size());
+  return new (C, DC, Extra)
+      DecompositionDecl(C, DC, StartLoc, LSquareLoc, T, TInfo, SC, Bindings);
+}
+
+DecompositionDecl *DecompositionDecl::CreateDeserialized(ASTContext &C,
+                                                         unsigned ID,
+                                                         unsigned NumBindings) {
+  size_t Extra = additionalSizeToAlloc<BindingDecl *>(NumBindings);
+  auto *Result = new (C, ID, Extra)
+      DecompositionDecl(C, nullptr, SourceLocation(), SourceLocation(),
+                        QualType(), nullptr, StorageClass(), None);
+  // Set up and clean out the bindings array.
+  Result->NumBindings = NumBindings;
+  auto *Trail = Result->getTrailingObjects<BindingDecl *>();
+  for (unsigned I = 0; I != NumBindings; ++I)
+    new (Trail + I) BindingDecl*(nullptr);
+  return Result;
+}
+
+void DecompositionDecl::printName(llvm::raw_ostream &os) const {
+  // Emits the bindings as "[a, b, c]".
+  const char *Separator = "";
+  os << '[';
+  for (auto *Binding : bindings()) {
+    os << Separator;
+    Binding->printName(os);
+    Separator = ", ";
+  }
+  os << ']';
+}
+
 MSPropertyDecl *MSPropertyDecl::Create(ASTContext &C, DeclContext *DC,
                                        SourceLocation L, DeclarationName N,
                                        QualType T, TypeSourceInfo *TInfo,
diff --git a/lib/AST/DeclGroup.cpp b/lib/AST/DeclGroup.cpp
index f162e6d..8bcf7f2 100644
--- a/lib/AST/DeclGroup.cpp
+++ b/lib/AST/DeclGroup.cpp
@@ -14,7 +14,6 @@
 #include "clang/AST/DeclGroup.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
-#include "llvm/Support/Allocator.h"
 using namespace clang;
 
 DeclGroup* DeclGroup::Create(ASTContext &C, Decl **Decls, unsigned NumDecls) {
diff --git a/lib/AST/DeclOpenMP.cpp b/lib/AST/DeclOpenMP.cpp
index 493e2cd..5b06ce0 100644
--- a/lib/AST/DeclOpenMP.cpp
+++ b/lib/AST/DeclOpenMP.cpp
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 /// \file
-/// \brief This file implements OMPThreadPrivateDecl class.
+/// \brief This file implements OMPThreadPrivateDecl, OMPDeclareReductionDecl
+/// and OMPCapturedExprDecl classes.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -52,3 +53,50 @@
   std::uninitialized_copy(VL.begin(), VL.end(), getTrailingObjects<Expr *>());
 }
 
+//===----------------------------------------------------------------------===//
+// OMPDeclareReductionDecl Implementation.
+//===----------------------------------------------------------------------===//
+
+void OMPDeclareReductionDecl::anchor() {}
+
+OMPDeclareReductionDecl *OMPDeclareReductionDecl::Create(
+    ASTContext &C, DeclContext *DC, SourceLocation L, DeclarationName Name,
+    QualType T, OMPDeclareReductionDecl *PrevDeclInScope) {
+  return new (C, DC) OMPDeclareReductionDecl(OMPDeclareReduction, DC, L, Name,
+                                             T, PrevDeclInScope);
+}
+
+OMPDeclareReductionDecl *
+OMPDeclareReductionDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
+  return new (C, ID) OMPDeclareReductionDecl(
+      OMPDeclareReduction, /*DC=*/nullptr, SourceLocation(), DeclarationName(),
+      QualType(), /*PrevDeclInScope=*/nullptr);
+}
+
+OMPDeclareReductionDecl *OMPDeclareReductionDecl::getPrevDeclInScope() {
+  return cast_or_null<OMPDeclareReductionDecl>(
+      PrevDeclInScope.get(getASTContext().getExternalSource()));
+}
+const OMPDeclareReductionDecl *
+OMPDeclareReductionDecl::getPrevDeclInScope() const {
+  return cast_or_null<OMPDeclareReductionDecl>(
+      PrevDeclInScope.get(getASTContext().getExternalSource()));
+}
+
+//===----------------------------------------------------------------------===//
+// OMPCapturedExprDecl Implementation.
+//===----------------------------------------------------------------------===//
+
+void OMPCapturedExprDecl::anchor() {}
+
+OMPCapturedExprDecl *OMPCapturedExprDecl::Create(ASTContext &C, DeclContext *DC,
+                                                 IdentifierInfo *Id,
+                                                 QualType T) {
+  return new (C, DC) OMPCapturedExprDecl(C, DC, Id, T);
+}
+
+OMPCapturedExprDecl *OMPCapturedExprDecl::CreateDeserialized(ASTContext &C,
+                                                             unsigned ID) {
+  return new (C, ID) OMPCapturedExprDecl(C, nullptr, nullptr, QualType());
+}
+
diff --git a/lib/AST/DeclPrinter.cpp b/lib/AST/DeclPrinter.cpp
index e20c7d8..7e78699 100644
--- a/lib/AST/DeclPrinter.cpp
+++ b/lib/AST/DeclPrinter.cpp
@@ -92,6 +92,8 @@
     void VisitUsingDecl(UsingDecl *D);
     void VisitUsingShadowDecl(UsingShadowDecl *D);
     void VisitOMPThreadPrivateDecl(OMPThreadPrivateDecl *D);
+    void VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D);
+    void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
 
     void PrintTemplateParameters(const TemplateParameterList *Params,
                                  const TemplateArgumentList *Args = nullptr);
@@ -130,6 +132,8 @@
       BaseType = VTy->getElementType();
     else if (const ReferenceType *RTy = BaseType->getAs<ReferenceType>())
       BaseType = RTy->getPointeeType();
+    else if (const AutoType *ATy = BaseType->getAs<AutoType>())
+      BaseType = ATy->getDeducedType();
     else
       llvm_unreachable("Unknown declarator!");
   }
@@ -158,19 +162,17 @@
     ++Begin;
 
   PrintingPolicy SubPolicy(Policy);
-  if (TD && TD->isCompleteDefinition()) {
-    TD->print(Out, Policy, Indentation);
-    Out << " ";
-    SubPolicy.SuppressTag = true;
-  }
 
   bool isFirst = true;
   for ( ; Begin != End; ++Begin) {
     if (isFirst) {
+      if(TD)
+        SubPolicy.IncludeTagDefinition = true;
       SubPolicy.SuppressSpecifiers = false;
       isFirst = false;
     } else {
       if (!isFirst) Out << ", ";
+      SubPolicy.IncludeTagDefinition = false;
       SubPolicy.SuppressSpecifiers = true;
     }
 
@@ -244,7 +246,7 @@
     Pack = true;
     T = PET->getPattern();
   }
-  T.print(Out, Policy, (Pack ? "..." : "") + DeclName);
+  T.print(Out, Policy, (Pack ? "..." : "") + DeclName, Indentation);
 }
 
 void DeclPrinter::ProcessDeclGroup(SmallVectorImpl<Decl*>& Decls) {
@@ -333,7 +335,7 @@
 
     // FIXME: Need to be able to tell the DeclPrinter when
     const char *Terminator = nullptr;
-    if (isa<OMPThreadPrivateDecl>(*D))
+    if (isa<OMPThreadPrivateDecl>(*D) || isa<OMPDeclareReductionDecl>(*D))
       Terminator = nullptr;
     else if (isa<FunctionDecl>(*D) &&
              cast<FunctionDecl>(*D)->isThisDeclarationADefinition())
@@ -358,6 +360,11 @@
     if (Terminator)
       Out << Terminator;
     Out << "\n";
+
+    // The declare target attribute is special: the natural spelling of the
+    // pragma requires a closing "end" construct, so print it here.
+    if (D->hasAttr<OMPDeclareTargetDeclAttr>())
+      Out << "#pragma omp end declare target\n";
   }
 
   if (!Decls.empty())
@@ -378,7 +385,8 @@
     if (D->isModulePrivate())
       Out << "__module_private__ ";
   }
-  D->getTypeSourceInfo()->getType().print(Out, Policy, D->getName());
+  QualType Ty = D->getTypeSourceInfo()->getType();
+  Ty.print(Out, Policy, D->getName(), Indentation);
   prettyPrintAttributes(D);
 }
 
@@ -683,7 +691,7 @@
     Out << "__module_private__ ";
 
   Out << D->getASTContext().getUnqualifiedObjCPointerType(D->getType()).
-            stream(Policy, D->getName());
+         stream(Policy, D->getName(), Indentation);
 
   if (D->isBitField()) {
     Out << " : ";
@@ -707,6 +715,11 @@
 
 void DeclPrinter::VisitVarDecl(VarDecl *D) {
   prettyPrintPragmas(D);
+
+  QualType T = D->getTypeSourceInfo()
+    ? D->getTypeSourceInfo()->getType()
+    : D->getASTContext().getUnqualifiedObjCPointerType(D->getType());
+
   if (!Policy.SuppressSpecifiers) {
     StorageClass SC = D->getStorageClass();
     if (SC != SC_None)
@@ -728,11 +741,13 @@
 
     if (D->isModulePrivate())
       Out << "__module_private__ ";
+
+    if (D->isConstexpr()) {
+      Out << "constexpr ";
+      T.removeLocalConst();
+    }
   }
 
-  QualType T = D->getTypeSourceInfo()
-    ? D->getTypeSourceInfo()->getType()
-    : D->getASTContext().getUnqualifiedObjCPointerType(D->getType());
   printDeclType(T, D->getName());
   Expr *Init = D->getInit();
   if (!Policy.SuppressInitializers && Init) {
@@ -751,7 +766,10 @@
       else if (D->getInitStyle() == VarDecl::CInit) {
         Out << " = ";
       }
-      Init->printPretty(Out, nullptr, Policy, Indentation);
+      PrintingPolicy SubPolicy(Policy);
+      SubPolicy.SuppressSpecifiers = false;
+      SubPolicy.IncludeTagDefinition = false;
+      Init->printPretty(Out, nullptr, SubPolicy, Indentation);
       if ((D->getInitStyle() == VarDecl::CallInit) && !isa<ParenListExpr>(Init))
         Out << ")";
     }
@@ -1046,7 +1064,7 @@
 
   std::string name = OMD->getSelector().getAsString();
   std::string::size_type pos, lastPos = 0;
-  for (const auto *PI : OMD->params()) {
+  for (const auto *PI : OMD->parameters()) {
     // FIXME: selector is missing here!
     pos = name.find_first_of(':', lastPos);
     Out << " " << name.substr(lastPos, pos - lastPos) << ':';
@@ -1363,3 +1381,38 @@
   }
 }
 
+void DeclPrinter::VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D) {
+  if (D->isInvalidDecl()) // nothing is printed for invalid declarations
+    return;
+  Out << "#pragma omp declare reduction (";
+  if (D->getDeclName().getNameKind() == DeclarationName::CXXOperatorName) {
+    static const char *const OperatorNames[NUM_OVERLOADED_OPERATORS] = {
+        nullptr,
+#define OVERLOADED_OPERATOR(Name, Spelling, Token, Unary, Binary, MemberOnly)  \
+        Spelling,
+#include "clang/Basic/OperatorKinds.def"
+    };
+    const char *OpName =
+        OperatorNames[D->getDeclName().getCXXOverloadedOperator()];
+    assert(OpName && "not an overloaded operator");
+    Out << OpName;
+  } else {
+    assert(D->getDeclName().isIdentifier());
+    D->printName(Out);
+  }
+  Out << " : ";
+  D->getType().print(Out, Policy);
+  Out << " : ";
+  D->getCombiner()->printPretty(Out, nullptr, Policy, 0);
+  Out << ")";
+  if (auto *Init = D->getInitializer()) {
+    Out << " initializer(";
+    Init->printPretty(Out, nullptr, Policy, 0);
+    Out << ")";
+  }
+}
+
+void DeclPrinter::VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D) {
+  D->getInit()->printPretty(Out, nullptr, Policy, Indentation);
+}
+
diff --git a/lib/AST/DeclTemplate.cpp b/lib/AST/DeclTemplate.cpp
index f68bf4b..bcc8878 100644
--- a/lib/AST/DeclTemplate.cpp
+++ b/lib/AST/DeclTemplate.cpp
@@ -31,9 +31,11 @@
 TemplateParameterList::TemplateParameterList(SourceLocation TemplateLoc,
                                              SourceLocation LAngleLoc,
                                              ArrayRef<NamedDecl *> Params,
-                                             SourceLocation RAngleLoc)
+                                             SourceLocation RAngleLoc,
+                                             Expr *RequiresClause)
   : TemplateLoc(TemplateLoc), LAngleLoc(LAngleLoc), RAngleLoc(RAngleLoc),
-    NumParams(Params.size()), ContainsUnexpandedParameterPack(false) {
+    NumParams(Params.size()), ContainsUnexpandedParameterPack(false),
+    HasRequiresClause(static_cast<bool>(RequiresClause)) {
   assert(this->NumParams == NumParams && "Too many template parameters");
   for (unsigned Idx = 0; Idx < NumParams; ++Idx) {
     NamedDecl *P = Params[Idx];
@@ -52,45 +54,48 @@
       // template parameter list does too.
     }
   }
+  if (RequiresClause) {
+    *getTrailingObjects<Expr *>() = RequiresClause;
+  }
 }
 
-TemplateParameterList *TemplateParameterList::Create(
-    const ASTContext &C, SourceLocation TemplateLoc, SourceLocation LAngleLoc,
-    ArrayRef<NamedDecl *> Params, SourceLocation RAngleLoc) {
-  void *Mem = C.Allocate(totalSizeToAlloc<NamedDecl *>(Params.size()),
+TemplateParameterList *
+TemplateParameterList::Create(const ASTContext &C, SourceLocation TemplateLoc,
+                              SourceLocation LAngleLoc,
+                              ArrayRef<NamedDecl *> Params,
+                              SourceLocation RAngleLoc, Expr *RequiresClause) {
+  void *Mem = C.Allocate(totalSizeToAlloc<NamedDecl *, Expr *>(
+                             Params.size(), RequiresClause ? 1u : 0u),
                          llvm::alignOf<TemplateParameterList>());
   return new (Mem) TemplateParameterList(TemplateLoc, LAngleLoc, Params,
-                                         RAngleLoc);
+                                         RAngleLoc, RequiresClause);
 }
 
 unsigned TemplateParameterList::getMinRequiredArguments() const {
   unsigned NumRequiredArgs = 0;
-  for (iterator P = const_cast<TemplateParameterList *>(this)->begin(), 
-             PEnd = const_cast<TemplateParameterList *>(this)->end(); 
-       P != PEnd; ++P) {
-    if ((*P)->isTemplateParameterPack()) {
-      if (NonTypeTemplateParmDecl *NTTP = dyn_cast<NonTypeTemplateParmDecl>(*P))
+  for (const NamedDecl *P : asArray()) {
+    if (P->isTemplateParameterPack()) {
+      if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(P))
         if (NTTP->isExpandedParameterPack()) {
           NumRequiredArgs += NTTP->getNumExpansionTypes();
           continue;
         }
-      
+
       break;
     }
-  
-    if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(*P)) {
+
+    if (const auto *TTP = dyn_cast<TemplateTypeParmDecl>(P)) {
       if (TTP->hasDefaultArgument())
         break;
-    } else if (NonTypeTemplateParmDecl *NTTP 
-                                    = dyn_cast<NonTypeTemplateParmDecl>(*P)) {
+    } else if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(P)) {
       if (NTTP->hasDefaultArgument())
         break;
-    } else if (cast<TemplateTemplateParmDecl>(*P)->hasDefaultArgument())
+    } else if (cast<TemplateTemplateParmDecl>(P)->hasDefaultArgument())
       break;
-    
+
     ++NumRequiredArgs;
   }
-  
+
   return NumRequiredArgs;
 }
 
@@ -111,12 +116,10 @@
 
 static void AdoptTemplateParameterList(TemplateParameterList *Params,
                                        DeclContext *Owner) {
-  for (TemplateParameterList::iterator P = Params->begin(), 
-                                    PEnd = Params->end();
-       P != PEnd; ++P) {
-    (*P)->setDeclContext(Owner);
-    
-    if (TemplateTemplateParmDecl *TTP = dyn_cast<TemplateTemplateParmDecl>(*P))
+  for (NamedDecl *P : *Params) {
+    P->setDeclContext(Owner);
+
+    if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(P))
       AdoptTemplateParameterList(TTP->getTemplateParameters(), Owner);
   }
 }
@@ -157,8 +160,8 @@
   }
 
   // Update any previous declarations we saw with the common pointer.
-  for (unsigned I = 0, N = PrevDecls.size(); I != N; ++I)
-    PrevDecls[I]->Common = Common;
+  for (const RedeclarableTemplateDecl *Prev : PrevDecls)
+    Prev->Common = Common;
 
   return Common;
 }
@@ -205,44 +208,41 @@
 /// \brief Generate the injected template arguments for the given template
 /// parameter list, e.g., for the injected-class-name of a class template.
 static void GenerateInjectedTemplateArgs(ASTContext &Context,
-                                        TemplateParameterList *Params,
+                                         TemplateParameterList *Params,
                                          TemplateArgument *Args) {
-  for (TemplateParameterList::iterator Param = Params->begin(),
-                                    ParamEnd = Params->end();
-       Param != ParamEnd; ++Param) {
+  for (NamedDecl *Param : *Params) {
     TemplateArgument Arg;
-    if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(*Param)) {
+    if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(Param)) {
       QualType ArgType = Context.getTypeDeclType(TTP);
       if (TTP->isParameterPack())
         ArgType = Context.getPackExpansionType(ArgType, None);
 
       Arg = TemplateArgument(ArgType);
-    } else if (NonTypeTemplateParmDecl *NTTP =
-               dyn_cast<NonTypeTemplateParmDecl>(*Param)) {
+    } else if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
       Expr *E = new (Context) DeclRefExpr(NTTP, /*enclosing*/ false,
                                   NTTP->getType().getNonLValueExprType(Context),
                                   Expr::getValueKindForType(NTTP->getType()),
                                           NTTP->getLocation());
-      
+
       if (NTTP->isParameterPack())
         E = new (Context) PackExpansionExpr(Context.DependentTy, E,
                                             NTTP->getLocation(), None);
       Arg = TemplateArgument(E);
     } else {
-      TemplateTemplateParmDecl *TTP = cast<TemplateTemplateParmDecl>(*Param);
+      auto *TTP = cast<TemplateTemplateParmDecl>(Param);
       if (TTP->isParameterPack())
         Arg = TemplateArgument(TemplateName(TTP), Optional<unsigned>());
       else
         Arg = TemplateArgument(TemplateName(TTP));
     }
-    
-    if ((*Param)->isTemplateParameterPack())
+
+    if (Param->isTemplateParameterPack())
       Arg = TemplateArgument::CreatePackCopy(Context, Arg);
 
     *Args++ = Arg;
   }
 }
-                                      
+
 //===----------------------------------------------------------------------===//
 // FunctionTemplateDecl Implementation
 //===----------------------------------------------------------------------===//
@@ -421,23 +421,17 @@
     = getPartialSpecializations();
   PS.clear();
   PS.reserve(PartialSpecs.size());
-  for (llvm::FoldingSetVector<ClassTemplatePartialSpecializationDecl>::iterator
-       P = PartialSpecs.begin(), PEnd = PartialSpecs.end();
-       P != PEnd; ++P)
-    PS.push_back(P->getMostRecentDecl());
+  for (ClassTemplatePartialSpecializationDecl &P : PartialSpecs)
+    PS.push_back(P.getMostRecentDecl());
 }
 
 ClassTemplatePartialSpecializationDecl *
 ClassTemplateDecl::findPartialSpecialization(QualType T) {
   ASTContext &Context = getASTContext();
-  using llvm::FoldingSetVector;
-  typedef FoldingSetVector<ClassTemplatePartialSpecializationDecl>::iterator
-    partial_spec_iterator;
-  for (partial_spec_iterator P = getPartialSpecializations().begin(),
-                          PEnd = getPartialSpecializations().end();
-       P != PEnd; ++P) {
-    if (Context.hasSameType(P->getInjectedSpecializationType(), T))
-      return P->getMostRecentDecl();
+  for (ClassTemplatePartialSpecializationDecl &P :
+       getPartialSpecializations()) {
+    if (Context.hasSameType(P.getInjectedSpecializationType(), T))
+      return P.getMostRecentDecl();
   }
 
   return nullptr;
@@ -447,12 +441,9 @@
 ClassTemplateDecl::findPartialSpecInstantiatedFromMember(
                                     ClassTemplatePartialSpecializationDecl *D) {
   Decl *DCanon = D->getCanonicalDecl();
-  for (llvm::FoldingSetVector<ClassTemplatePartialSpecializationDecl>::iterator
-            P = getPartialSpecializations().begin(),
-         PEnd = getPartialSpecializations().end();
-       P != PEnd; ++P) {
-    if (P->getInstantiatedFromMember()->getCanonicalDecl() == DCanon)
-      return P->getMostRecentDecl();
+  for (ClassTemplatePartialSpecializationDecl &P : getPartialSpecializations()) {
+    if (P.getInstantiatedFromMember()->getCanonicalDecl() == DCanon)
+      return P.getMostRecentDecl();
   }
 
   return nullptr;
@@ -478,8 +469,7 @@
   GenerateInjectedTemplateArgs(getASTContext(), Params, TemplateArgs.data());
   CommonPtr->InjectedClassNameType
     = Context.getTemplateSpecializationType(TemplateName(this),
-                                            &TemplateArgs[0],
-                                            TemplateArgs.size());
+                                            TemplateArgs);
   return CommonPtr->InjectedClassNameType;
 }
 
@@ -535,20 +525,14 @@
 // NonTypeTemplateParmDecl Method Implementations
 //===----------------------------------------------------------------------===//
 
-NonTypeTemplateParmDecl::NonTypeTemplateParmDecl(DeclContext *DC, 
-                                                 SourceLocation StartLoc,
-                                                 SourceLocation IdLoc,
-                                                 unsigned D, unsigned P,
-                                                 IdentifierInfo *Id, 
-                                                 QualType T, 
-                                                 TypeSourceInfo *TInfo,
-                                                 const QualType *ExpandedTypes,
-                                                 unsigned NumExpandedTypes,
-                                                TypeSourceInfo **ExpandedTInfos)
-  : DeclaratorDecl(NonTypeTemplateParm, DC, IdLoc, Id, T, TInfo, StartLoc),
-    TemplateParmPosition(D, P), ParameterPack(true),
-    ExpandedParameterPack(true), NumExpandedTypes(NumExpandedTypes) {
-  if (ExpandedTypes && ExpandedTInfos) {
+NonTypeTemplateParmDecl::NonTypeTemplateParmDecl(
+    DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, unsigned D,
+    unsigned P, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo,
+    ArrayRef<QualType> ExpandedTypes, ArrayRef<TypeSourceInfo *> ExpandedTInfos)
+    : DeclaratorDecl(NonTypeTemplateParm, DC, IdLoc, Id, T, TInfo, StartLoc),
+      TemplateParmPosition(D, P), ParameterPack(true),
+      ExpandedParameterPack(true), NumExpandedTypes(ExpandedTypes.size()) {
+  if (!ExpandedTypes.empty() && !ExpandedTInfos.empty()) {
     auto TypesAndInfos =
         getTrailingObjects<std::pair<QualType, TypeSourceInfo *>>();
     for (unsigned I = 0; I != NumExpandedTypes; ++I) {
@@ -568,20 +552,16 @@
                                              T, ParameterPack, TInfo);
 }
 
-NonTypeTemplateParmDecl *
-NonTypeTemplateParmDecl::Create(const ASTContext &C, DeclContext *DC, 
-                                SourceLocation StartLoc, SourceLocation IdLoc,
-                                unsigned D, unsigned P, 
-                                IdentifierInfo *Id, QualType T, 
-                                TypeSourceInfo *TInfo,
-                                const QualType *ExpandedTypes, 
-                                unsigned NumExpandedTypes,
-                                TypeSourceInfo **ExpandedTInfos) {
+NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create(
+    const ASTContext &C, DeclContext *DC, SourceLocation StartLoc,
+    SourceLocation IdLoc, unsigned D, unsigned P, IdentifierInfo *Id,
+    QualType T, TypeSourceInfo *TInfo, ArrayRef<QualType> ExpandedTypes,
+    ArrayRef<TypeSourceInfo *> ExpandedTInfos) {
   return new (C, DC,
               additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>>(
-                  NumExpandedTypes))
+                  ExpandedTypes.size()))
       NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo,
-                              ExpandedTypes, NumExpandedTypes, ExpandedTInfos);
+                              ExpandedTypes, ExpandedTInfos);
 }
 
 NonTypeTemplateParmDecl *
@@ -594,12 +574,14 @@
 NonTypeTemplateParmDecl *
 NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID,
                                             unsigned NumExpandedTypes) {
-  return new (C, ID,
-              additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>>(
-                  NumExpandedTypes))
-      NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0,
-                              nullptr, QualType(), nullptr, nullptr,
-                              NumExpandedTypes, nullptr);
+  auto *NTTP =
+      new (C, ID, additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>>(
+                      NumExpandedTypes))
+          NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(),
+                                  0, 0, nullptr, QualType(), nullptr, None,
+                                  None);
+  NTTP->NumExpandedTypes = NumExpandedTypes;
+  return NTTP;
 }
 
 SourceRange NonTypeTemplateParmDecl::getSourceRange() const {
@@ -624,12 +606,12 @@
 TemplateTemplateParmDecl::TemplateTemplateParmDecl(
     DeclContext *DC, SourceLocation L, unsigned D, unsigned P,
     IdentifierInfo *Id, TemplateParameterList *Params,
-    unsigned NumExpansions, TemplateParameterList * const *Expansions)
-  : TemplateDecl(TemplateTemplateParm, DC, L, Id, Params),
-    TemplateParmPosition(D, P), ParameterPack(true),
-    ExpandedParameterPack(true), NumExpandedParams(NumExpansions) {
-  if (Expansions)
-    std::uninitialized_copy(Expansions, Expansions + NumExpandedParams,
+    ArrayRef<TemplateParameterList *> Expansions)
+    : TemplateDecl(TemplateTemplateParm, DC, L, Id, Params),
+      TemplateParmPosition(D, P), ParameterPack(true),
+      ExpandedParameterPack(true), NumExpandedParams(Expansions.size()) {
+  if (!Expansions.empty())
+    std::uninitialized_copy(Expansions.begin(), Expansions.end(),
                             getTrailingObjects<TemplateParameterList *>());
 }
 
@@ -650,8 +632,7 @@
                                  ArrayRef<TemplateParameterList *> Expansions) {
   return new (C, DC,
               additionalSizeToAlloc<TemplateParameterList *>(Expansions.size()))
-      TemplateTemplateParmDecl(DC, L, D, P, Id, Params, Expansions.size(),
-                               Expansions.data());
+      TemplateTemplateParmDecl(DC, L, D, P, Id, Params, Expansions);
 }
 
 TemplateTemplateParmDecl *
@@ -663,10 +644,12 @@
 TemplateTemplateParmDecl *
 TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID,
                                              unsigned NumExpansions) {
-  return new (C, ID,
-              additionalSizeToAlloc<TemplateParameterList *>(NumExpansions))
-      TemplateTemplateParmDecl(nullptr, SourceLocation(), 0, 0, nullptr,
-                               nullptr, NumExpansions, nullptr);
+  auto *TTP =
+      new (C, ID, additionalSizeToAlloc<TemplateParameterList *>(NumExpansions))
+          TemplateTemplateParmDecl(nullptr, SourceLocation(), 0, 0, nullptr,
+                                   nullptr, None);
+  TTP->NumExpandedParams = NumExpansions;
+  return TTP;
 }
 
 SourceLocation TemplateTemplateParmDecl::getDefaultArgumentLoc() const {
@@ -685,19 +668,18 @@
 //===----------------------------------------------------------------------===//
 // TemplateArgumentList Implementation
 //===----------------------------------------------------------------------===//
-TemplateArgumentList::TemplateArgumentList(const TemplateArgument *Args,
-                                           unsigned NumArgs)
-    : Arguments(getTrailingObjects<TemplateArgument>()), NumArguments(NumArgs) {
-  std::uninitialized_copy(Args, Args + NumArgs,
+TemplateArgumentList::TemplateArgumentList(ArrayRef<TemplateArgument> Args)
+    : Arguments(getTrailingObjects<TemplateArgument>()),
+      NumArguments(Args.size()) {
+  std::uninitialized_copy(Args.begin(), Args.end(),
                           getTrailingObjects<TemplateArgument>());
 }
 
 TemplateArgumentList *
 TemplateArgumentList::CreateCopy(ASTContext &Context,
-                                 const TemplateArgument *Args,
-                                 unsigned NumArgs) {
-  void *Mem = Context.Allocate(totalSizeToAlloc<TemplateArgument>(NumArgs));
-  return new (Mem) TemplateArgumentList(Args, NumArgs);
+                                 ArrayRef<TemplateArgument> Args) {
+  void *Mem = Context.Allocate(totalSizeToAlloc<TemplateArgument>(Args.size()));
+  return new (Mem) TemplateArgumentList(Args);
 }
 
 FunctionTemplateSpecializationInfo *
@@ -732,15 +714,14 @@
                                 DeclContext *DC, SourceLocation StartLoc,
                                 SourceLocation IdLoc,
                                 ClassTemplateDecl *SpecializedTemplate,
-                                const TemplateArgument *Args,
-                                unsigned NumArgs,
+                                ArrayRef<TemplateArgument> Args,
                                 ClassTemplateSpecializationDecl *PrevDecl)
   : CXXRecordDecl(DK, TK, Context, DC, StartLoc, IdLoc,
                   SpecializedTemplate->getIdentifier(),
                   PrevDecl),
     SpecializedTemplate(SpecializedTemplate),
     ExplicitInfo(nullptr),
-    TemplateArgs(TemplateArgumentList::CreateCopy(Context, Args, NumArgs)),
+    TemplateArgs(TemplateArgumentList::CreateCopy(Context, Args)),
     SpecializationKind(TSK_Undeclared) {
 }
 
@@ -756,13 +737,12 @@
                                         SourceLocation StartLoc,
                                         SourceLocation IdLoc,
                                         ClassTemplateDecl *SpecializedTemplate,
-                                        const TemplateArgument *Args,
-                                        unsigned NumArgs,
+                                        ArrayRef<TemplateArgument> Args,
                                    ClassTemplateSpecializationDecl *PrevDecl) {
   ClassTemplateSpecializationDecl *Result =
       new (Context, DC) ClassTemplateSpecializationDecl(
           Context, ClassTemplateSpecialization, TK, DC, StartLoc, IdLoc,
-          SpecializedTemplate, Args, NumArgs, PrevDecl);
+          SpecializedTemplate, Args, PrevDecl);
   Result->MayHaveOutOfDateDef = false;
 
   Context.getTypeDeclType(Result, PrevDecl);
@@ -784,7 +764,7 @@
 
   const TemplateArgumentList &TemplateArgs = getTemplateArgs();
   TemplateSpecializationType::PrintTemplateArgumentList(
-      OS, TemplateArgs.data(), TemplateArgs.size(), Policy);
+      OS, TemplateArgs.asArray(), Policy);
 }
 
 ClassTemplateDecl *
@@ -846,15 +826,14 @@
                                        SourceLocation IdLoc,
                                        TemplateParameterList *Params,
                                        ClassTemplateDecl *SpecializedTemplate,
-                                       const TemplateArgument *Args,
-                                       unsigned NumArgs,
+                                       ArrayRef<TemplateArgument> Args,
                                const ASTTemplateArgumentListInfo *ArgInfos,
                                ClassTemplatePartialSpecializationDecl *PrevDecl)
   : ClassTemplateSpecializationDecl(Context,
                                     ClassTemplatePartialSpecialization,
                                     TK, DC, StartLoc, IdLoc,
                                     SpecializedTemplate,
-                                    Args, NumArgs, PrevDecl),
+                                    Args, PrevDecl),
     TemplateParams(Params), ArgsAsWritten(ArgInfos),
     InstantiatedFromMember(nullptr, false)
 {
@@ -867,8 +846,7 @@
        SourceLocation StartLoc, SourceLocation IdLoc,
        TemplateParameterList *Params,
        ClassTemplateDecl *SpecializedTemplate,
-       const TemplateArgument *Args,
-       unsigned NumArgs,
+       ArrayRef<TemplateArgument> Args,
        const TemplateArgumentListInfo &ArgInfos,
        QualType CanonInjectedType,
        ClassTemplatePartialSpecializationDecl *PrevDecl) {
@@ -878,7 +856,7 @@
   ClassTemplatePartialSpecializationDecl *Result = new (Context, DC)
       ClassTemplatePartialSpecializationDecl(Context, TK, DC, StartLoc, IdLoc,
                                              Params, SpecializedTemplate, Args,
-                                             NumArgs, ASTArgInfos, PrevDecl);
+                                             ASTArgInfos, PrevDecl);
   Result->setSpecializationKind(TSK_ExplicitSpecialization);
   Result->MayHaveOutOfDateDef = false;
 
@@ -901,15 +879,12 @@
 
 void FriendTemplateDecl::anchor() { }
 
-FriendTemplateDecl *FriendTemplateDecl::Create(ASTContext &Context,
-                                               DeclContext *DC,
-                                               SourceLocation L,
-                                               unsigned NParams,
-                                               TemplateParameterList **Params,
-                                               FriendUnion Friend,
-                                               SourceLocation FLoc) {
-  return new (Context, DC) FriendTemplateDecl(DC, L, NParams, Params,
-                                              Friend, FLoc);
+FriendTemplateDecl *
+FriendTemplateDecl::Create(ASTContext &Context, DeclContext *DC,
+                           SourceLocation L,
+                           MutableArrayRef<TemplateParameterList *> Params,
+                           FriendUnion Friend, SourceLocation FLoc) {
+  return new (Context, DC) FriendTemplateDecl(DC, L, Params, Friend, FLoc);
 }
 
 FriendTemplateDecl *FriendTemplateDecl::CreateDeserialized(ASTContext &C,
@@ -1065,23 +1040,17 @@
       getPartialSpecializations();
   PS.clear();
   PS.reserve(PartialSpecs.size());
-  for (llvm::FoldingSetVector<VarTemplatePartialSpecializationDecl>::iterator
-           P = PartialSpecs.begin(),
-           PEnd = PartialSpecs.end();
-       P != PEnd; ++P)
-    PS.push_back(P->getMostRecentDecl());
+  for (VarTemplatePartialSpecializationDecl &P : PartialSpecs)
+    PS.push_back(P.getMostRecentDecl());
 }
 
 VarTemplatePartialSpecializationDecl *
 VarTemplateDecl::findPartialSpecInstantiatedFromMember(
     VarTemplatePartialSpecializationDecl *D) {
   Decl *DCanon = D->getCanonicalDecl();
-  for (llvm::FoldingSetVector<VarTemplatePartialSpecializationDecl>::iterator
-           P = getPartialSpecializations().begin(),
-           PEnd = getPartialSpecializations().end();
-       P != PEnd; ++P) {
-    if (P->getInstantiatedFromMember()->getCanonicalDecl() == DCanon)
-      return P->getMostRecentDecl();
+  for (VarTemplatePartialSpecializationDecl &P : getPartialSpecializations()) {
+    if (P.getInstantiatedFromMember()->getCanonicalDecl() == DCanon)
+      return P.getMostRecentDecl();
   }
 
   return nullptr;
@@ -1093,12 +1062,11 @@
 VarTemplateSpecializationDecl::VarTemplateSpecializationDecl(
     Kind DK, ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
     SourceLocation IdLoc, VarTemplateDecl *SpecializedTemplate, QualType T,
-    TypeSourceInfo *TInfo, StorageClass S, const TemplateArgument *Args,
-    unsigned NumArgs)
+    TypeSourceInfo *TInfo, StorageClass S, ArrayRef<TemplateArgument> Args)
     : VarDecl(DK, Context, DC, StartLoc, IdLoc,
               SpecializedTemplate->getIdentifier(), T, TInfo, S),
       SpecializedTemplate(SpecializedTemplate), ExplicitInfo(nullptr),
-      TemplateArgs(TemplateArgumentList::CreateCopy(Context, Args, NumArgs)),
+      TemplateArgs(TemplateArgumentList::CreateCopy(Context, Args)),
       SpecializationKind(TSK_Undeclared) {}
 
 VarTemplateSpecializationDecl::VarTemplateSpecializationDecl(Kind DK,
@@ -1110,11 +1078,10 @@
 VarTemplateSpecializationDecl *VarTemplateSpecializationDecl::Create(
     ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
     SourceLocation IdLoc, VarTemplateDecl *SpecializedTemplate, QualType T,
-    TypeSourceInfo *TInfo, StorageClass S, const TemplateArgument *Args,
-    unsigned NumArgs) {
+    TypeSourceInfo *TInfo, StorageClass S, ArrayRef<TemplateArgument> Args) {
   return new (Context, DC) VarTemplateSpecializationDecl(
       VarTemplateSpecialization, Context, DC, StartLoc, IdLoc,
-      SpecializedTemplate, T, TInfo, S, Args, NumArgs);
+      SpecializedTemplate, T, TInfo, S, Args);
 }
 
 VarTemplateSpecializationDecl *
@@ -1129,7 +1096,7 @@
 
   const TemplateArgumentList &TemplateArgs = getTemplateArgs();
   TemplateSpecializationType::PrintTemplateArgumentList(
-      OS, TemplateArgs.data(), TemplateArgs.size(), Policy);
+      OS, TemplateArgs.asArray(), Policy);
 }
 
 VarTemplateDecl *VarTemplateSpecializationDecl::getSpecializedTemplate() const {
@@ -1141,11 +1108,10 @@
 
 void VarTemplateSpecializationDecl::setTemplateArgsInfo(
     const TemplateArgumentListInfo &ArgsInfo) {
-  unsigned N = ArgsInfo.size();
   TemplateArgsInfo.setLAngleLoc(ArgsInfo.getLAngleLoc());
   TemplateArgsInfo.setRAngleLoc(ArgsInfo.getRAngleLoc());
-  for (unsigned I = 0; I != N; ++I)
-    TemplateArgsInfo.addArgument(ArgsInfo[I]);
+  for (const TemplateArgumentLoc &Loc : ArgsInfo.arguments())
+    TemplateArgsInfo.addArgument(Loc);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1157,11 +1123,11 @@
     ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
     SourceLocation IdLoc, TemplateParameterList *Params,
     VarTemplateDecl *SpecializedTemplate, QualType T, TypeSourceInfo *TInfo,
-    StorageClass S, const TemplateArgument *Args, unsigned NumArgs,
+    StorageClass S, ArrayRef<TemplateArgument> Args,
     const ASTTemplateArgumentListInfo *ArgInfos)
     : VarTemplateSpecializationDecl(VarTemplatePartialSpecialization, Context,
                                     DC, StartLoc, IdLoc, SpecializedTemplate, T,
-                                    TInfo, S, Args, NumArgs),
+                                    TInfo, S, Args),
       TemplateParams(Params), ArgsAsWritten(ArgInfos),
       InstantiatedFromMember(nullptr, false) {
   // TODO: The template parameters should be in DC by now. Verify.
@@ -1173,7 +1139,7 @@
     ASTContext &Context, DeclContext *DC, SourceLocation StartLoc,
     SourceLocation IdLoc, TemplateParameterList *Params,
     VarTemplateDecl *SpecializedTemplate, QualType T, TypeSourceInfo *TInfo,
-    StorageClass S, const TemplateArgument *Args, unsigned NumArgs,
+    StorageClass S, ArrayRef<TemplateArgument> Args,
     const TemplateArgumentListInfo &ArgInfos) {
   const ASTTemplateArgumentListInfo *ASTArgInfos
     = ASTTemplateArgumentListInfo::Create(Context, ArgInfos);
@@ -1181,7 +1147,7 @@
   VarTemplatePartialSpecializationDecl *Result =
       new (Context, DC) VarTemplatePartialSpecializationDecl(
           Context, DC, StartLoc, IdLoc, Params, SpecializedTemplate, T, TInfo,
-          S, Args, NumArgs, ASTArgInfos);
+          S, Args, ASTArgInfos);
   Result->setSpecializationKind(TSK_ExplicitSpecialization);
   return Result;
 }
@@ -1211,7 +1177,7 @@
   // <typename T, T ...Ints>
   NamedDecl *P[2] = {T, N};
   auto *TPL = TemplateParameterList::Create(
-      C, SourceLocation(), SourceLocation(), P, SourceLocation());
+      C, SourceLocation(), SourceLocation(), P, SourceLocation(), nullptr);
 
   // template <typename T, ...Ints> class IntSeq
   auto *TemplateTemplateParm = TemplateTemplateParmDecl::Create(
@@ -1236,7 +1202,28 @@
 
   // template <template <typename T, T ...Ints> class IntSeq, typename T, T N>
   return TemplateParameterList::Create(C, SourceLocation(), SourceLocation(),
-                                       Params, SourceLocation());
+                                       Params, SourceLocation(), nullptr);
+}
+
+static TemplateParameterList *
+createTypePackElementParameterList(const ASTContext &C, DeclContext *DC) {
+  // std::size_t Index
+  TypeSourceInfo *TInfo = C.getTrivialTypeSourceInfo(C.getSizeType());
+  auto *Index = NonTypeTemplateParmDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Depth=*/0, /*Position=*/0,
+      /*Id=*/nullptr, TInfo->getType(), /*ParameterPack=*/false, TInfo);
+
+  // typename ...T
+  auto *Ts = TemplateTypeParmDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Depth=*/0, /*Position=*/1,
+      /*Id=*/nullptr, /*Typename=*/true, /*ParameterPack=*/true);
+  Ts->setImplicit(true);
+
+  // template <std::size_t Index, typename ...T>
+  NamedDecl *Params[] = {Index, Ts};
+  return TemplateParameterList::Create(C, SourceLocation(), SourceLocation(),
+                                       llvm::makeArrayRef(Params),
+                                       SourceLocation(), nullptr);
 }
 
 static TemplateParameterList *createBuiltinTemplateParameterList(
@@ -1244,6 +1231,8 @@
   switch (BTK) {
   case BTK__make_integer_seq:
     return createMakeIntegerSeqParameterList(C, DC);
+  case BTK__type_pack_element:
+    return createTypePackElementParameterList(C, DC);
   }
 
   llvm_unreachable("unhandled BuiltinTemplateKind!");
diff --git a/lib/AST/DeclarationName.cpp b/lib/AST/DeclarationName.cpp
index 344a238..fea887e 100644
--- a/lib/AST/DeclarationName.cpp
+++ b/lib/AST/DeclarationName.cpp
@@ -11,14 +11,13 @@
 // classes.
 //
 //===----------------------------------------------------------------------===//
+#include "clang/AST/DeclarationName.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclCXX.h"
-#include "clang/AST/DeclarationName.h"
 #include "clang/AST/Type.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/AST/TypeOrdering.h"
 #include "clang/Basic/IdentifierTable.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -135,7 +134,10 @@
 
 static void printCXXConstructorDestructorName(QualType ClassType,
                                               raw_ostream &OS,
-                                              const PrintingPolicy &Policy) {
+                                              PrintingPolicy Policy) {
+  // We know we're printing C++ here. Ensure we print types properly.
+  Policy.adjustForCPlusPlus();
+
   if (const RecordType *ClassRec = ClassType->getAs<RecordType>()) {
     OS << *ClassRec->getDecl();
     return;
@@ -146,14 +148,7 @@
       return;
     }
   }
-  if (!Policy.LangOpts.CPlusPlus) {
-    // Passed policy is the default one from operator <<, use a C++ policy.
-    LangOptions LO;
-    LO.CPlusPlus = true;
-    ClassType.print(OS, PrintingPolicy(LO));
-  } else {
-    ClassType.print(OS, Policy);
-  }
+  ClassType.print(OS, Policy);
 }
 
 void DeclarationName::print(raw_ostream &OS, const PrintingPolicy &Policy) {
@@ -206,15 +201,10 @@
       OS << *Rec->getDecl();
       return;
     }
-    if (!Policy.LangOpts.CPlusPlus) {
-      // Passed policy is the default one from operator <<, use a C++ policy.
-      LangOptions LO;
-      LO.CPlusPlus = true;
-      LO.Bool = true;
-      Type.print(OS, PrintingPolicy(LO));
-    } else {
-      Type.print(OS, Policy);
-    }
+    // We know we're printing C++ here, ensure we print 'bool' properly.
+    PrintingPolicy CXXPolicy = Policy;
+    CXXPolicy.adjustForCPlusPlus();
+    Type.print(OS, CXXPolicy);
     return;
   }
   case DeclarationName::CXXUsingDirective:
diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp
index 62b7999..15386ae 100644
--- a/lib/AST/Expr.cpp
+++ b/lib/AST/Expr.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/AST/APValue.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
@@ -985,7 +984,7 @@
       break;
     }
     default:
-      assert(false && "unsupported CharByteWidth");
+      llvm_unreachable("unsupported CharByteWidth");
   }
 }
 
@@ -1084,20 +1083,8 @@
 /// corresponds to, e.g. "sizeof" or "[pre]++".
 StringRef UnaryOperator::getOpcodeStr(Opcode Op) {
   switch (Op) {
-  case UO_PostInc: return "++";
-  case UO_PostDec: return "--";
-  case UO_PreInc:  return "++";
-  case UO_PreDec:  return "--";
-  case UO_AddrOf:  return "&";
-  case UO_Deref:   return "*";
-  case UO_Plus:    return "+";
-  case UO_Minus:   return "-";
-  case UO_Not:     return "~";
-  case UO_LNot:    return "!";
-  case UO_Real:    return "__real";
-  case UO_Imag:    return "__imag";
-  case UO_Extension: return "__extension__";
-  case UO_Coawait: return "co_await";
+#define UNARY_OPERATION(Name, Spelling) case UO_##Name: return Spelling;
+#include "clang/AST/OperationKinds.def"
   }
   llvm_unreachable("Unknown unary operator");
 }
@@ -1582,6 +1569,7 @@
   case CK_ARCReclaimReturnedObject:
   case CK_ARCExtendBlockObject:
   case CK_ZeroToOCLEvent:
+  case CK_IntToOCLSampler:
     assert(!getType()->isBooleanType() && "unheralded conversion to bool");
     goto CheckNoBasePath;
 
@@ -1608,120 +1596,9 @@
 
 const char *CastExpr::getCastKindName() const {
   switch (getCastKind()) {
-  case CK_Dependent:
-    return "Dependent";
-  case CK_BitCast:
-    return "BitCast";
-  case CK_LValueBitCast:
-    return "LValueBitCast";
-  case CK_LValueToRValue:
-    return "LValueToRValue";
-  case CK_NoOp:
-    return "NoOp";
-  case CK_BaseToDerived:
-    return "BaseToDerived";
-  case CK_DerivedToBase:
-    return "DerivedToBase";
-  case CK_UncheckedDerivedToBase:
-    return "UncheckedDerivedToBase";
-  case CK_Dynamic:
-    return "Dynamic";
-  case CK_ToUnion:
-    return "ToUnion";
-  case CK_ArrayToPointerDecay:
-    return "ArrayToPointerDecay";
-  case CK_FunctionToPointerDecay:
-    return "FunctionToPointerDecay";
-  case CK_NullToMemberPointer:
-    return "NullToMemberPointer";
-  case CK_NullToPointer:
-    return "NullToPointer";
-  case CK_BaseToDerivedMemberPointer:
-    return "BaseToDerivedMemberPointer";
-  case CK_DerivedToBaseMemberPointer:
-    return "DerivedToBaseMemberPointer";
-  case CK_ReinterpretMemberPointer:
-    return "ReinterpretMemberPointer";
-  case CK_UserDefinedConversion:
-    return "UserDefinedConversion";
-  case CK_ConstructorConversion:
-    return "ConstructorConversion";
-  case CK_IntegralToPointer:
-    return "IntegralToPointer";
-  case CK_PointerToIntegral:
-    return "PointerToIntegral";
-  case CK_PointerToBoolean:
-    return "PointerToBoolean";
-  case CK_ToVoid:
-    return "ToVoid";
-  case CK_VectorSplat:
-    return "VectorSplat";
-  case CK_IntegralCast:
-    return "IntegralCast";
-  case CK_BooleanToSignedIntegral:
-    return "BooleanToSignedIntegral";
-  case CK_IntegralToBoolean:
-    return "IntegralToBoolean";
-  case CK_IntegralToFloating:
-    return "IntegralToFloating";
-  case CK_FloatingToIntegral:
-    return "FloatingToIntegral";
-  case CK_FloatingCast:
-    return "FloatingCast";
-  case CK_FloatingToBoolean:
-    return "FloatingToBoolean";
-  case CK_MemberPointerToBoolean:
-    return "MemberPointerToBoolean";
-  case CK_CPointerToObjCPointerCast:
-    return "CPointerToObjCPointerCast";
-  case CK_BlockPointerToObjCPointerCast:
-    return "BlockPointerToObjCPointerCast";
-  case CK_AnyPointerToBlockPointerCast:
-    return "AnyPointerToBlockPointerCast";
-  case CK_ObjCObjectLValueCast:
-    return "ObjCObjectLValueCast";
-  case CK_FloatingRealToComplex:
-    return "FloatingRealToComplex";
-  case CK_FloatingComplexToReal:
-    return "FloatingComplexToReal";
-  case CK_FloatingComplexToBoolean:
-    return "FloatingComplexToBoolean";
-  case CK_FloatingComplexCast:
-    return "FloatingComplexCast";
-  case CK_FloatingComplexToIntegralComplex:
-    return "FloatingComplexToIntegralComplex";
-  case CK_IntegralRealToComplex:
-    return "IntegralRealToComplex";
-  case CK_IntegralComplexToReal:
-    return "IntegralComplexToReal";
-  case CK_IntegralComplexToBoolean:
-    return "IntegralComplexToBoolean";
-  case CK_IntegralComplexCast:
-    return "IntegralComplexCast";
-  case CK_IntegralComplexToFloatingComplex:
-    return "IntegralComplexToFloatingComplex";
-  case CK_ARCConsumeObject:
-    return "ARCConsumeObject";
-  case CK_ARCProduceObject:
-    return "ARCProduceObject";
-  case CK_ARCReclaimReturnedObject:
-    return "ARCReclaimReturnedObject";
-  case CK_ARCExtendBlockObject:
-    return "ARCExtendBlockObject";
-  case CK_AtomicToNonAtomic:
-    return "AtomicToNonAtomic";
-  case CK_NonAtomicToAtomic:
-    return "NonAtomicToAtomic";
-  case CK_CopyAndAutoreleaseBlockObject:
-    return "CopyAndAutoreleaseBlockObject";
-  case CK_BuiltinFnToFnPtr:
-    return "BuiltinFnToFnPtr";
-  case CK_ZeroToOCLEvent:
-    return "ZeroToOCLEvent";
-  case CK_AddressSpaceConversion:
-    return "AddressSpaceConversion";
+#define CAST_OPERATION(Name) case CK_##Name: return #Name;
+#include "clang/AST/OperationKinds.def"
   }
-
   llvm_unreachable("Unhandled cast kind!");
 }
 
@@ -1818,40 +1695,9 @@
 /// corresponds to, e.g. "<<=".
 StringRef BinaryOperator::getOpcodeStr(Opcode Op) {
   switch (Op) {
-  case BO_PtrMemD:   return ".*";
-  case BO_PtrMemI:   return "->*";
-  case BO_Mul:       return "*";
-  case BO_Div:       return "/";
-  case BO_Rem:       return "%";
-  case BO_Add:       return "+";
-  case BO_Sub:       return "-";
-  case BO_Shl:       return "<<";
-  case BO_Shr:       return ">>";
-  case BO_LT:        return "<";
-  case BO_GT:        return ">";
-  case BO_LE:        return "<=";
-  case BO_GE:        return ">=";
-  case BO_EQ:        return "==";
-  case BO_NE:        return "!=";
-  case BO_And:       return "&";
-  case BO_Xor:       return "^";
-  case BO_Or:        return "|";
-  case BO_LAnd:      return "&&";
-  case BO_LOr:       return "||";
-  case BO_Assign:    return "=";
-  case BO_MulAssign: return "*=";
-  case BO_DivAssign: return "/=";
-  case BO_RemAssign: return "%=";
-  case BO_AddAssign: return "+=";
-  case BO_SubAssign: return "-=";
-  case BO_ShlAssign: return "<<=";
-  case BO_ShrAssign: return ">>=";
-  case BO_AndAssign: return "&=";
-  case BO_XorAssign: return "^=";
-  case BO_OrAssign:  return "|=";
-  case BO_Comma:     return ",";
+#define BINARY_OPERATION(Name, Spelling) case BO_##Name: return Spelling;
+#include "clang/AST/OperationKinds.def"
   }
-
   llvm_unreachable("Invalid OpCode!");
 }
 
@@ -2902,7 +2748,8 @@
         CE->getCastKind() == CK_ToUnion ||
         CE->getCastKind() == CK_ConstructorConversion ||
         CE->getCastKind() == CK_NonAtomicToAtomic ||
-        CE->getCastKind() == CK_AtomicToNonAtomic)
+        CE->getCastKind() == CK_AtomicToNonAtomic ||
+        CE->getCastKind() == CK_IntToOCLSampler)
       return CE->getSubExpr()->isConstantInitializer(Ctx, false, Culprit);
 
     break;
@@ -3010,6 +2857,7 @@
   case ObjCStringLiteralClass:
   case ObjCEncodeExprClass:
   case ObjCBoolLiteralExprClass:
+  case ObjCAvailabilityCheckExprClass:
   case CXXUuidofExprClass:
   case OpaqueValueExprClass:
     // These never have a side-effect.
@@ -3044,7 +2892,6 @@
   case CXXThrowExprClass:
   case CXXNewExprClass:
   case CXXDeleteExprClass:
-  case ExprWithCleanupsClass:
   case CoawaitExprClass:
   case CoyieldExprClass:
     // These always have a side-effect.
@@ -3057,6 +2904,12 @@
     return Finder.hasSideEffects();
   }
 
+  case ExprWithCleanupsClass:
+    if (IncludePossibleEffects)
+      if (cast<ExprWithCleanups>(this)->cleanupsHaveSideEffects())
+        return true;
+    break;
+
   case ParenExprClass:
   case ArraySubscriptExprClass:
   case OMPArraySectionExprClass:
@@ -3157,6 +3010,13 @@
     break;
   }
 
+  case CXXInheritedCtorInitExprClass: {
+    const auto *ICIE = cast<CXXInheritedCtorInitExpr>(this);
+    if (!ICIE->getConstructor()->isTrivial() && IncludePossibleEffects)
+      return true;
+    break;
+  }
+
   case LambdaExprClass: {
     const LambdaExpr *LE = cast<LambdaExpr>(this);
     for (LambdaExpr::capture_iterator I = LE->capture_begin(),
@@ -3448,11 +3308,16 @@
       if (Ivar->isBitField())
         return Ivar;
 
-  if (DeclRefExpr *DeclRef = dyn_cast<DeclRefExpr>(E))
+  if (DeclRefExpr *DeclRef = dyn_cast<DeclRefExpr>(E)) {
     if (FieldDecl *Field = dyn_cast<FieldDecl>(DeclRef->getDecl()))
       if (Field->isBitField())
         return Field;
 
+    if (BindingDecl *BD = dyn_cast<BindingDecl>(DeclRef->getDecl()))
+      if (Expr *E = BD->getBinding())
+        return E->getSourceBitField();
+  }
+
   if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(E)) {
     if (BinOp->isAssignmentOp() && BinOp->getLHS())
       return BinOp->getLHS()->getSourceBitField();
@@ -3469,6 +3334,7 @@
 }
 
 bool Expr::refersToVectorElement() const {
+  // FIXME: Why do we not just look at the ObjectKind here?
   const Expr *E = this->IgnoreParens();
   
   while (const ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(E)) {
@@ -3485,6 +3351,11 @@
   if (isa<ExtVectorElementExpr>(E))
     return true;
 
+  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
+    if (auto *BD = dyn_cast<BindingDecl>(DRE->getDecl()))
+      if (auto *E = BD->getBinding())
+        return E->refersToVectorElement();
+
   return false;
 }
 
@@ -3537,8 +3408,11 @@
 void ExtVectorElementExpr::getEncodedElementAccess(
     SmallVectorImpl<uint32_t> &Elts) const {
   StringRef Comp = Accessor->getName();
-  if (Comp[0] == 's' || Comp[0] == 'S')
+  bool isNumericAccessor = false;
+  if (Comp[0] == 's' || Comp[0] == 'S') {
     Comp = Comp.substr(1);
+    isNumericAccessor = true;
+  }
 
   bool isHi =   Comp == "hi";
   bool isLo =   Comp == "lo";
@@ -3557,7 +3431,7 @@
     else if (isOdd)
       Index = 2 * i + 1;
     else
-      Index = ExtVectorType::getAccessorIdx(Comp[i]);
+      Index = ExtVectorType::getAccessorIdx(Comp[i], isNumericAccessor);
 
     Elts.push_back(Index);
   }
@@ -3659,8 +3533,7 @@
 }
 
 DesignatedInitExpr::DesignatedInitExpr(const ASTContext &C, QualType Ty,
-                                       unsigned NumDesignators,
-                                       const Designator *Designators,
+                                       llvm::ArrayRef<Designator> Designators,
                                        SourceLocation EqualOrColonLoc,
                                        bool GNUSyntax,
                                        ArrayRef<Expr*> IndexExprs,
@@ -3671,7 +3544,7 @@
          Init->isInstantiationDependent(),
          Init->containsUnexpandedParameterPack()),
     EqualOrColonLoc(EqualOrColonLoc), GNUSyntax(GNUSyntax),
-    NumDesignators(NumDesignators), NumSubExprs(IndexExprs.size() + 1) {
+    NumDesignators(Designators.size()), NumSubExprs(IndexExprs.size() + 1) {
   this->Designators = new (C) Designator[NumDesignators];
 
   // Record the initializer itself.
@@ -3725,14 +3598,14 @@
 }
 
 DesignatedInitExpr *
-DesignatedInitExpr::Create(const ASTContext &C, Designator *Designators,
-                           unsigned NumDesignators,
+DesignatedInitExpr::Create(const ASTContext &C,
+                           llvm::ArrayRef<Designator> Designators,
                            ArrayRef<Expr*> IndexExprs,
                            SourceLocation ColonOrEqualLoc,
                            bool UsesColonSyntax, Expr *Init) {
   void *Mem = C.Allocate(totalSizeToAlloc<Stmt *>(IndexExprs.size() + 1),
                          llvm::alignOf<DesignatedInitExpr>());
-  return new (Mem) DesignatedInitExpr(C, C.VoidTy, NumDesignators, Designators,
+  return new (Mem) DesignatedInitExpr(C, C.VoidTy, Designators,
                                       ColonOrEqualLoc, UsesColonSyntax,
                                       IndexExprs, Init);
 }
@@ -3763,8 +3636,8 @@
 
 SourceLocation DesignatedInitExpr::getLocStart() const {
   SourceLocation StartLoc;
-  Designator &First =
-    *const_cast<DesignatedInitExpr*>(this)->designators_begin();
+  auto *DIE = const_cast<DesignatedInitExpr *>(this);
+  Designator &First = *DIE->getDesignator(0);
   if (First.isFieldDesignator()) {
     if (GNUSyntax)
       StartLoc = SourceLocation::getFromRawEncoding(First.Field.FieldLoc);
@@ -4026,16 +3899,18 @@
   llvm_unreachable("unknown atomic op");
 }
 
-QualType OMPArraySectionExpr::getBaseOriginalType(Expr *Base) {
+QualType OMPArraySectionExpr::getBaseOriginalType(const Expr *Base) {
   unsigned ArraySectionCount = 0;
   while (auto *OASE = dyn_cast<OMPArraySectionExpr>(Base->IgnoreParens())) {
     Base = OASE->getBase();
     ++ArraySectionCount;
   }
-  while (auto *ASE = dyn_cast<ArraySubscriptExpr>(Base->IgnoreParens())) {
+  while (auto *ASE =
+             dyn_cast<ArraySubscriptExpr>(Base->IgnoreParenImpCasts())) {
     Base = ASE->getBase();
     ++ArraySectionCount;
   }
+  Base = Base->IgnoreParenImpCasts();
   auto OriginalTy = Base->getType();
   if (auto *DRE = dyn_cast<DeclRefExpr>(Base))
     if (auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl()))
diff --git a/lib/AST/ExprCXX.cpp b/lib/AST/ExprCXX.cpp
index ea98334..a13033d 100644
--- a/lib/AST/ExprCXX.cpp
+++ b/lib/AST/ExprCXX.cpp
@@ -54,79 +54,6 @@
       Operand.get<TypeSourceInfo *>()->getType().getNonReferenceType(), Quals);
 }
 
-// static
-const UuidAttr *CXXUuidofExpr::GetUuidAttrOfType(QualType QT,
-                                                 bool *RDHasMultipleGUIDsPtr) {
-  // Optionally remove one level of pointer, reference or array indirection.
-  const Type *Ty = QT.getTypePtr();
-  if (QT->isPointerType() || QT->isReferenceType())
-    Ty = QT->getPointeeType().getTypePtr();
-  else if (QT->isArrayType())
-    Ty = Ty->getBaseElementTypeUnsafe();
-
-  const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl();
-  if (!RD)
-    return nullptr;
-
-  if (const UuidAttr *Uuid = RD->getMostRecentDecl()->getAttr<UuidAttr>())
-    return Uuid;
-
-  // __uuidof can grab UUIDs from template arguments.
-  if (const ClassTemplateSpecializationDecl *CTSD =
-          dyn_cast<ClassTemplateSpecializationDecl>(RD)) {
-    const TemplateArgumentList &TAL = CTSD->getTemplateArgs();
-    const UuidAttr *UuidForRD = nullptr;
-
-    for (const TemplateArgument &TA : TAL.asArray()) {
-      bool SeenMultipleGUIDs = false;
-
-      const UuidAttr *UuidForTA = nullptr;
-      if (TA.getKind() == TemplateArgument::Type)
-        UuidForTA = GetUuidAttrOfType(TA.getAsType(), &SeenMultipleGUIDs);
-      else if (TA.getKind() == TemplateArgument::Declaration)
-        UuidForTA =
-            GetUuidAttrOfType(TA.getAsDecl()->getType(), &SeenMultipleGUIDs);
-
-      // If the template argument has a UUID, there are three cases:
-      //  - This is the first UUID seen for this RecordDecl.
-      //  - This is a different UUID than previously seen for this RecordDecl.
-      //  - This is the same UUID than previously seen for this RecordDecl.
-      if (UuidForTA) {
-        if (!UuidForRD)
-          UuidForRD = UuidForTA;
-        else if (UuidForRD != UuidForTA)
-          SeenMultipleGUIDs = true;
-      }
-
-      // Seeing multiple UUIDs means that we couldn't find a UUID
-      if (SeenMultipleGUIDs) {
-        if (RDHasMultipleGUIDsPtr)
-          *RDHasMultipleGUIDsPtr = true;
-        return nullptr;
-      }
-    }
-
-    return UuidForRD;
-  }
-
-  return nullptr;
-}
-
-StringRef CXXUuidofExpr::getUuidAsStringRef(ASTContext &Context) const {
-  StringRef Uuid;
-  if (isTypeOperand())
-    Uuid = CXXUuidofExpr::GetUuidAttrOfType(getTypeOperand(Context))->getGuid();
-  else {
-    // Special case: __uuidof(0) means an all-zero GUID.
-    Expr *Op = getExprOperand();
-    if (!Op->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull))
-      Uuid = CXXUuidofExpr::GetUuidAttrOfType(Op->getType())->getGuid();
-    else
-      Uuid = "00000000-0000-0000-0000-000000000000";
-  }
-  return Uuid;
-}
-
 // CXXScalarValueInitExpr
 SourceLocation CXXScalarValueInitExpr::getLocStart() const {
   return TypeInfo ? TypeInfo->getTypeLoc().getBeginLoc() : RParenLoc;
@@ -823,7 +750,8 @@
 
 CXXConstructExpr *CXXConstructExpr::Create(const ASTContext &C, QualType T,
                                            SourceLocation Loc,
-                                           CXXConstructorDecl *D, bool Elidable,
+                                           CXXConstructorDecl *Ctor,
+                                           bool Elidable,
                                            ArrayRef<Expr*> Args,
                                            bool HadMultipleCandidates,
                                            bool ListInitialization,
@@ -831,8 +759,8 @@
                                            bool ZeroInitialization,
                                            ConstructionKind ConstructKind,
                                            SourceRange ParenOrBraceRange) {
-  return new (C) CXXConstructExpr(C, CXXConstructExprClass, T, Loc, D, 
-                                  Elidable, Args,
+  return new (C) CXXConstructExpr(C, CXXConstructExprClass, T, Loc,
+                                  Ctor, Elidable, Args,
                                   HadMultipleCandidates, ListInitialization,
                                   StdInitListInitialization,
                                   ZeroInitialization, ConstructKind,
@@ -841,8 +769,9 @@
 
 CXXConstructExpr::CXXConstructExpr(const ASTContext &C, StmtClass SC,
                                    QualType T, SourceLocation Loc,
-                                   CXXConstructorDecl *D, bool elidable,
-                                   ArrayRef<Expr*> args,
+                                   CXXConstructorDecl *Ctor,
+                                   bool Elidable,
+                                   ArrayRef<Expr*> Args,
                                    bool HadMultipleCandidates,
                                    bool ListInitialization,
                                    bool StdInitListInitialization,
@@ -853,28 +782,28 @@
          T->isDependentType(), T->isDependentType(),
          T->isInstantiationDependentType(),
          T->containsUnexpandedParameterPack()),
-    Constructor(D), Loc(Loc), ParenOrBraceRange(ParenOrBraceRange),
-    NumArgs(args.size()),
-    Elidable(elidable), HadMultipleCandidates(HadMultipleCandidates),
+    Constructor(Ctor), Loc(Loc), ParenOrBraceRange(ParenOrBraceRange),
+    NumArgs(Args.size()),
+    Elidable(Elidable), HadMultipleCandidates(HadMultipleCandidates),
     ListInitialization(ListInitialization),
     StdInitListInitialization(StdInitListInitialization),
     ZeroInitialization(ZeroInitialization),
     ConstructKind(ConstructKind), Args(nullptr)
 {
   if (NumArgs) {
-    Args = new (C) Stmt*[args.size()];
+    this->Args = new (C) Stmt*[Args.size()];
     
-    for (unsigned i = 0; i != args.size(); ++i) {
-      assert(args[i] && "NULL argument in CXXConstructExpr");
+    for (unsigned i = 0; i != Args.size(); ++i) {
+      assert(Args[i] && "NULL argument in CXXConstructExpr");
 
-      if (args[i]->isValueDependent())
+      if (Args[i]->isValueDependent())
         ExprBits.ValueDependent = true;
-      if (args[i]->isInstantiationDependent())
+      if (Args[i]->isInstantiationDependent())
         ExprBits.InstantiationDependent = true;
-      if (args[i]->containsUnexpandedParameterPack())
+      if (Args[i]->containsUnexpandedParameterPack())
         ExprBits.ContainsUnexpandedParameterPack = true;
   
-      Args[i] = args[i];
+      this->Args[i] = Args[i];
     }
   }
 }
@@ -889,8 +818,12 @@
     Bits |= Capture_Implicit;
   
   switch (Kind) {
+  case LCK_StarThis:
+    Bits |= Capture_ByCopy;
+    // Fall through
   case LCK_This:
     assert(!Var && "'this' capture cannot have a variable!");
+    Bits |= Capture_This;
     break;
 
   case LCK_ByCopy:
@@ -901,18 +834,17 @@
     break;
   case LCK_VLAType:
     assert(!Var && "VLA type capture cannot have a variable!");
-    Bits |= Capture_ByCopy;
     break;
   }
   DeclAndBits.setInt(Bits);
 }
 
 LambdaCaptureKind LambdaCapture::getCaptureKind() const {
-  Decl *D = DeclAndBits.getPointer();
+  if (capturesVLAType())
+    return LCK_VLAType;
   bool CapByCopy = DeclAndBits.getInt() & Capture_ByCopy;
-  if (!D)
-    return CapByCopy ? LCK_VLAType : LCK_This;
-
+  if (capturesThis())
+    return CapByCopy ? LCK_StarThis : LCK_This;
   return CapByCopy ? LCK_ByCopy : LCK_ByRef;
 }
 
@@ -1091,6 +1023,7 @@
 }
 
 ExprWithCleanups::ExprWithCleanups(Expr *subexpr,
+                                   bool CleanupsHaveSideEffects,
                                    ArrayRef<CleanupObject> objects)
   : Expr(ExprWithCleanupsClass, subexpr->getType(),
          subexpr->getValueKind(), subexpr->getObjectKind(),
@@ -1098,16 +1031,19 @@
          subexpr->isInstantiationDependent(),
          subexpr->containsUnexpandedParameterPack()),
     SubExpr(subexpr) {
+  ExprWithCleanupsBits.CleanupsHaveSideEffects = CleanupsHaveSideEffects;
   ExprWithCleanupsBits.NumObjects = objects.size();
   for (unsigned i = 0, e = objects.size(); i != e; ++i)
     getTrailingObjects<CleanupObject>()[i] = objects[i];
 }
 
 ExprWithCleanups *ExprWithCleanups::Create(const ASTContext &C, Expr *subexpr,
+                                           bool CleanupsHaveSideEffects,
                                            ArrayRef<CleanupObject> objects) {
   void *buffer = C.Allocate(totalSizeToAlloc<CleanupObject>(objects.size()),
                             llvm::alignOf<ExprWithCleanups>());
-  return new (buffer) ExprWithCleanups(subexpr, objects);
+  return new (buffer)
+      ExprWithCleanups(subexpr, CleanupsHaveSideEffects, objects);
 }
 
 ExprWithCleanups::ExprWithCleanups(EmptyShell empty, unsigned numObjects)
diff --git a/lib/AST/ExprClassification.cpp b/lib/AST/ExprClassification.cpp
index a47b03c..8388013 100644
--- a/lib/AST/ExprClassification.cpp
+++ b/lib/AST/ExprClassification.cpp
@@ -178,6 +178,7 @@
   case Expr::ObjCArrayLiteralClass:
   case Expr::ObjCDictionaryLiteralClass:
   case Expr::ObjCBoolLiteralExprClass:
+  case Expr::ObjCAvailabilityCheckExprClass:
   case Expr::ParenListExprClass:
   case Expr::SizeOfPackExprClass:
   case Expr::SubstNonTypeTemplateParmPackExprClass:
@@ -360,6 +361,7 @@
       
     // Some C++ expressions are always class temporaries.
   case Expr::CXXConstructExprClass:
+  case Expr::CXXInheritedCtorInitExprClass:
   case Expr::CXXTemporaryObjectExprClass:
   case Expr::LambdaExprClass:
   case Expr::CXXStdInitializerListExprClass:
@@ -427,6 +429,7 @@
   else
     islvalue = isa<VarDecl>(D) || isa<FieldDecl>(D) ||
                isa<IndirectFieldDecl>(D) ||
+               isa<BindingDecl>(D) ||
                (Ctx.getLangOpts().CPlusPlus &&
                 (isa<FunctionDecl>(D) || isa<MSPropertyDecl>(D) ||
                  isa<FunctionTemplateDecl>(D)));
diff --git a/lib/AST/ExprConstant.cpp b/lib/AST/ExprConstant.cpp
index 020a86d..107913e 100644
--- a/lib/AST/ExprConstant.cpp
+++ b/lib/AST/ExprConstant.cpp
@@ -36,6 +36,7 @@
 #include "clang/AST/APValue.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
+#include "clang/AST/ASTLambda.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/RecordLayout.h"
@@ -43,7 +44,6 @@
 #include "clang/AST/TypeLoc.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetInfo.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstring>
 #include <functional>
@@ -157,13 +157,13 @@
     /// True if the subobject was named in a manner not supported by C++11. Such
     /// lvalues can still be folded, but they are not core constant expressions
     /// and we cannot perform lvalue-to-rvalue conversions on them.
-    bool Invalid : 1;
+    unsigned Invalid : 1;
 
     /// Is this a pointer one past the end of an object?
-    bool IsOnePastTheEnd : 1;
+    unsigned IsOnePastTheEnd : 1;
 
     /// Indicator of whether the most-derived object is an array element.
-    bool MostDerivedIsArrayElement : 1;
+    unsigned MostDerivedIsArrayElement : 1;
 
     /// The length of the path to the most-derived object of which this is a
     /// subobject.
@@ -477,6 +477,9 @@
     /// fold (not just why it's not strictly a constant expression)?
     bool HasFoldFailureDiagnostic;
 
+    /// \brief Whether or not we're currently speculatively evaluating.
+    bool IsSpeculativelyEvaluating;
+
     enum EvaluationMode {
       /// Evaluate as a constant expression. Stop if we find that the expression
       /// is not a constant expression.
@@ -541,7 +544,8 @@
         BottomFrame(*this, SourceLocation(), nullptr, nullptr, nullptr),
         EvaluatingDecl((const ValueDecl *)nullptr),
         EvaluatingDeclValue(nullptr), HasActiveDiagnostic(false),
-        HasFoldFailureDiagnostic(false), EvalMode(Mode) {}
+        HasFoldFailureDiagnostic(false), IsSpeculativelyEvaluating(false),
+        EvalMode(Mode) {}
 
     void setEvaluatingDecl(APValue::LValueBase Base, APValue &Value) {
       EvaluatingDecl = Base;
@@ -557,12 +561,12 @@
         return false;
       if (NextCallIndex == 0) {
         // NextCallIndex has wrapped around.
-        Diag(Loc, diag::note_constexpr_call_limit_exceeded);
+        FFDiag(Loc, diag::note_constexpr_call_limit_exceeded);
         return false;
       }
       if (CallStackDepth <= getLangOpts().ConstexprCallDepth)
         return true;
-      Diag(Loc, diag::note_constexpr_depth_limit_exceeded)
+      FFDiag(Loc, diag::note_constexpr_depth_limit_exceeded)
         << getLangOpts().ConstexprCallDepth;
       return false;
     }
@@ -579,7 +583,7 @@
 
     bool nextStep(const Stmt *S) {
       if (!StepsLeft) {
-        Diag(S->getLocStart(), diag::note_constexpr_step_limit_exceeded);
+        FFDiag(S->getLocStart(), diag::note_constexpr_step_limit_exceeded);
         return false;
       }
       --StepsLeft;
@@ -597,11 +601,10 @@
     /// Add notes containing a call stack to the current point of evaluation.
     void addCallStack(unsigned Limit);
 
-  public:
-    /// Diagnose that the evaluation cannot be folded.
-    OptionalDiagnostic Diag(SourceLocation Loc, diag::kind DiagId
-                              = diag::note_invalid_subexpr_in_const_expr,
-                            unsigned ExtraNotes = 0, bool IsCCEDiag = false) {
+  private:
+    OptionalDiagnostic Diag(SourceLocation Loc, diag::kind DiagId,
+                            unsigned ExtraNotes, bool IsCCEDiag) {
+    
       if (EvalStatus.Diag) {
         // If we have a prior diagnostic, it will be noting that the expression
         // isn't a constant expression. This diagnostic is more important,
@@ -646,12 +649,20 @@
       HasActiveDiagnostic = false;
       return OptionalDiagnostic();
     }
-
-    OptionalDiagnostic Diag(const Expr *E, diag::kind DiagId
+  public:
+    // Diagnose that the evaluation could not be folded (FF => FoldFailure)
+    OptionalDiagnostic
+    FFDiag(SourceLocation Loc,
+          diag::kind DiagId = diag::note_invalid_subexpr_in_const_expr,
+          unsigned ExtraNotes = 0) {
+      return Diag(Loc, DiagId, ExtraNotes, false);
+    }
+    
+    OptionalDiagnostic FFDiag(const Expr *E, diag::kind DiagId
                               = diag::note_invalid_subexpr_in_const_expr,
-                            unsigned ExtraNotes = 0, bool IsCCEDiag = false) {
+                            unsigned ExtraNotes = 0) {
       if (EvalStatus.Diag)
-        return Diag(E->getExprLoc(), DiagId, ExtraNotes, IsCCEDiag);
+        return Diag(E->getExprLoc(), DiagId, ExtraNotes, /*IsCCEDiag*/false);
       HasActiveDiagnostic = false;
       return OptionalDiagnostic();
     }
@@ -661,8 +672,7 @@
     ///
     /// FIXME: Stop evaluating if we're in EM_ConstantExpression or
     /// EM_PotentialConstantExpression mode and we produce one of these.
-    template<typename LocArg>
-    OptionalDiagnostic CCEDiag(LocArg Loc, diag::kind DiagId
+    OptionalDiagnostic CCEDiag(SourceLocation Loc, diag::kind DiagId
                                  = diag::note_invalid_subexpr_in_const_expr,
                                unsigned ExtraNotes = 0) {
       // Don't override a previous diagnostic. Don't bother collecting
@@ -673,7 +683,11 @@
       }
       return Diag(Loc, DiagId, ExtraNotes, true);
     }
-
+    OptionalDiagnostic CCEDiag(const Expr *E, diag::kind DiagId
+                                 = diag::note_invalid_subexpr_in_const_expr,
+                               unsigned ExtraNotes = 0) {
+      return CCEDiag(E->getExprLoc(), DiagId, ExtraNotes);
+    }
     /// Add a note to a prior diagnostic.
     OptionalDiagnostic Note(SourceLocation Loc, diag::kind DiagId) {
       if (!HasActiveDiagnostic)
@@ -763,6 +777,29 @@
       llvm_unreachable("Missed EvalMode case");
     }
 
+    /// Notes that we failed to evaluate an expression that other expressions
+    /// directly depend on, and determines whether we should keep evaluating.
+    /// Only call this if we actually intend to keep evaluating. Returns the
+    /// result of keepEvaluatingAfterFailure(); when that is true, it also sets
+    /// EvalStatus.HasSideEffects. If the value we failed to evaluate may be
+    /// ignorable, call noteSideEffect() instead, e.g. for a failed Foo() in:
+    ///
+    /// (Foo(), 1)      // use noteSideEffect
+    /// (Foo() || true) // use noteSideEffect
+    /// Foo() + 1       // use noteFailure
+    LLVM_ATTRIBUTE_UNUSED_RESULT bool noteFailure() {
+      // Failure when evaluating some expression often means there is some
+      // subexpression whose evaluation was skipped. Because we don't track
+      // whether we skipped an expression when unwinding after an evaluation
+      // failure, every evaluation failure that bubbles up from a subexpression
+      // implies that a side-effect has potentially happened. We defer setting
+      // the HasSideEffects flag to true until we decide to continue evaluating
+      // after that point, which happens here.
+      bool KeepGoing = keepEvaluatingAfterFailure();
+      EvalStatus.HasSideEffects |= KeepGoing;
+      return KeepGoing;
+    }
+
     bool allowInvalidBaseExpr() const {
       return EvalMode == EM_DesignatorFold;
     }
@@ -811,24 +848,52 @@
     ~FoldOffsetRAII() { Info.EvalMode = OldMode; }
   };
 
-  /// RAII object used to suppress diagnostics and side-effects from a
-  /// speculative evaluation.
+  /// RAII object used to optionally suppress diagnostics and side-effects from
+  /// a speculative evaluation.
   class SpeculativeEvaluationRAII {
-    EvalInfo &Info;
+    /// Pair of EvalInfo, and a bit that stores whether or not we were
+    /// speculatively evaluating when we created this RAII.
+    llvm::PointerIntPair<EvalInfo *, 1, bool> InfoAndOldSpecEval;
     Expr::EvalStatus Old;
 
+    void moveFromAndCancel(SpeculativeEvaluationRAII &&Other) {
+      InfoAndOldSpecEval = Other.InfoAndOldSpecEval; // Take Other's state.
+      Old = Other.Old;
+      Other.InfoAndOldSpecEval.setPointer(nullptr); // Cancel Other's restore.
+    }
+
+    void maybeRestoreState() {
+      EvalInfo *Info = InfoAndOldSpecEval.getPointer();
+      if (!Info)
+        return; // Cancelled (pointer cleared): nothing to restore.
+
+      Info->EvalStatus = Old; // Put back the saved evaluation status.
+      Info->IsSpeculativelyEvaluating = InfoAndOldSpecEval.getInt();
+    }
+
   public:
-    SpeculativeEvaluationRAII(EvalInfo &Info,
-                        SmallVectorImpl<PartialDiagnosticAt> *NewDiag = nullptr)
-      : Info(Info), Old(Info.EvalStatus) {
+    SpeculativeEvaluationRAII() = default;
+
+    SpeculativeEvaluationRAII(
+        EvalInfo &Info, SmallVectorImpl<PartialDiagnosticAt> *NewDiag = nullptr)
+        : InfoAndOldSpecEval(&Info, Info.IsSpeculativelyEvaluating),
+          Old(Info.EvalStatus) {
       Info.EvalStatus.Diag = NewDiag;
-      // If we're speculatively evaluating, we may have skipped over some
-      // evaluations and missed out a side effect.
-      Info.EvalStatus.HasSideEffects = true;
+      Info.IsSpeculativelyEvaluating = true;
     }
-    ~SpeculativeEvaluationRAII() {
-      Info.EvalStatus = Old;
+
+    SpeculativeEvaluationRAII(const SpeculativeEvaluationRAII &Other) = delete;
+    SpeculativeEvaluationRAII(SpeculativeEvaluationRAII &&Other) {
+      moveFromAndCancel(std::move(Other));
     }
+
+    SpeculativeEvaluationRAII &operator=(SpeculativeEvaluationRAII &&Other) {
+      maybeRestoreState();
+      moveFromAndCancel(std::move(Other));
+      return *this;
+    }
+
+    ~SpeculativeEvaluationRAII() { maybeRestoreState(); }
   };
 
   /// RAII object wrapping a full-expression or block scope, and handling
@@ -941,6 +1006,16 @@
       continue;
     }
 
+    // Use a different note for an inheriting constructor, because from the
+    // user's perspective it's not really a function at all.
+    if (auto *CD = dyn_cast_or_null<CXXConstructorDecl>(Frame->Callee)) {
+      if (CD->isInheritingConstructor()) {
+        addDiag(Frame->CallLoc, diag::note_constexpr_inherited_ctor_call_here)
+          << CD->getParent();
+        continue;
+      }
+    }
+
     SmallVector<char, 128> Buffer;
     llvm::raw_svector_ostream Out(Buffer);
     describeCall(Frame, Out);
@@ -992,7 +1067,7 @@
   struct LValue {
     APValue::LValueBase Base;
     CharUnits Offset;
-    bool InvalidBase : 1;
+    unsigned InvalidBase : 1;
     unsigned CallIndex : 31;
     SubobjectDesignator Designator;
 
@@ -1335,12 +1410,12 @@
   if (!IsGlobalLValue(Base)) {
     if (Info.getLangOpts().CPlusPlus11) {
       const ValueDecl *VD = Base.dyn_cast<const ValueDecl*>();
-      Info.Diag(Loc, diag::note_constexpr_non_global, 1)
+      Info.FFDiag(Loc, diag::note_constexpr_non_global, 1)
         << IsReferenceType << !Designator.Entries.empty()
         << !!VD << VD;
       NoteLValueLocation(Info, Base);
     } else {
-      Info.Diag(Loc);
+      Info.FFDiag(Loc);
     }
     // Don't allow references to temporaries to escape.
     return false;
@@ -1390,7 +1465,7 @@
   // Does this refer one past the end of some object?
   if (!Designator.Invalid && Designator.isOnePastTheEnd()) {
     const ValueDecl *VD = Base.dyn_cast<const ValueDecl*>();
-    Info.Diag(Loc, diag::note_constexpr_past_end, 1)
+    Info.FFDiag(Loc, diag::note_constexpr_past_end, 1)
       << !Designator.Entries.empty() << !!VD << VD;
     NoteLValueLocation(Info, Base);
   }
@@ -1414,10 +1489,10 @@
 
   // Prvalue constant expressions must be of literal types.
   if (Info.getLangOpts().CPlusPlus11)
-    Info.Diag(E, diag::note_constexpr_nonliteral)
+    Info.FFDiag(E, diag::note_constexpr_nonliteral)
       << E->getType();
   else
-    Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
   return false;
 }
 
@@ -1427,7 +1502,7 @@
 static bool CheckConstantExpression(EvalInfo &Info, SourceLocation DiagLoc,
                                     QualType Type, const APValue &Value) {
   if (Value.isUninit()) {
-    Info.Diag(DiagLoc, diag::note_constexpr_uninitialized)
+    Info.FFDiag(DiagLoc, diag::note_constexpr_uninitialized)
       << true << Type;
     return false;
   }
@@ -1637,7 +1712,7 @@
     // FIXME: In this case, we should provide the diagnostic for casting
     // a pointer to an integer.
     assert(Value.isLValue() && "integral value neither int nor lvalue?");
-    Info.Diag(E);
+    Info.FFDiag(E);
     return false;
   }
 
@@ -1679,7 +1754,7 @@
       } else {
         // Don't try to handle vectors of anything other than int or float
         // (not sure if it's possible to hit this case).
-        Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+        Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
         return false;
       }
       unsigned BaseEltSize = EltAsInt.getBitWidth();
@@ -1692,7 +1767,7 @@
   }
   // Give up if the input isn't an int, float, or vector.  For example, we
   // reject "(v4i16)(intptr_t)&a".
-  Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+  Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
   return false;
 }
 
@@ -1728,7 +1803,7 @@
                               APSInt &Result) {
   switch (Opcode) {
   default:
-    Info.Diag(E);
+    Info.FFDiag(E);
     return false;
   case BO_Mul:
     return CheckedIntArithmetic(Info, E, LHS, RHS, LHS.getBitWidth() * 2,
@@ -1745,7 +1820,7 @@
   case BO_Div:
   case BO_Rem:
     if (RHS == 0) {
-      Info.Diag(E, diag::note_expr_divide_by_zero);
+      Info.FFDiag(E, diag::note_expr_divide_by_zero);
       return false;
     }
     Result = (Opcode == BO_Rem ? LHS % RHS : LHS / RHS);
@@ -1826,7 +1901,7 @@
                                   const APFloat &RHS) {
   switch (Opcode) {
   default:
-    Info.Diag(E);
+    Info.FFDiag(E);
     return false;
   case BO_Mul:
     LHS.multiply(RHS, APFloat::rmNearestTiesToEven);
@@ -1968,10 +2043,15 @@
     return true;
   }
 
+  if (Type->isDependentType()) {
+    Info.FFDiag(Loc);
+    return false;
+  }
+
   if (!Type->isConstantSizeType()) {
     // sizeof(vla) is not a constantexpr: C99 6.5.3.4p2.
     // FIXME: Better diagnostic.
-    Info.Diag(Loc);
+    Info.FFDiag(Loc);
     return false;
   }
 
@@ -2035,7 +2115,7 @@
     if (Info.checkingPotentialConstantExpression())
       return false;
     if (!Frame || !Frame->Arguments) {
-      Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+      Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
       return false;
     }
     Result = &Frame->Arguments[PVD->getFunctionScopeIndex()];
@@ -2045,7 +2125,22 @@
   // If this is a local variable, dig out its value.
   if (Frame) {
     Result = Frame->getTemporary(VD);
-    assert(Result && "missing value for local variable");
+    if (!Result) {
+      // Assume variables referenced within a lambda's call operator that were
+      // not declared within the call operator are captures and during checking
+      // of a potential constant expression, assume they are unknown constant
+      // expressions.
+      assert(isLambdaCallOperator(Frame->Callee) &&
+             (VD->getDeclContext() != Frame->Callee || VD->isInitCapture()) &&
+             "missing value for local variable");
+      if (Info.checkingPotentialConstantExpression())
+        return false;
+      // FIXME: implement capture evaluation during constant expr evaluation.
+      Info.FFDiag(E->getLocStart(),
+           diag::note_unimplemented_constexpr_lambda_feature_ast)
+          << "captures not currently allowed";
+      return false;
+    }
     return true;
   }
 
@@ -2055,7 +2150,7 @@
     // If we're checking a potential constant expression, the variable could be
     // initialized later.
     if (!Info.checkingPotentialConstantExpression())
-      Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+      Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
     return false;
   }
 
@@ -2069,7 +2164,7 @@
   // Never evaluate the initializer of a weak variable. We can't be sure that
   // this is the definition which will be used.
   if (VD->isWeak()) {
-    Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
     return false;
   }
 
@@ -2077,7 +2172,7 @@
   // this in the cases where it matters for conformance.
   SmallVector<PartialDiagnosticAt, 8> Notes;
   if (!VD->evaluateValue(Notes)) {
-    Info.Diag(E, diag::note_constexpr_var_init_non_constant,
+    Info.FFDiag(E, diag::note_constexpr_var_init_non_constant,
               Notes.size() + 1) << VD;
     Info.Note(VD->getLocation(), diag::note_declared_at);
     Info.addNotes(Notes);
@@ -2218,7 +2313,7 @@
     // FIXME: Add core issue number for the union case.
     if (Field->isMutable() &&
         (RD->isUnion() || isReadByLvalueToRvalueConversion(Field->getType()))) {
-      Info.Diag(E, diag::note_constexpr_ltor_mutable, 1) << Field;
+      Info.FFDiag(E, diag::note_constexpr_ltor_mutable, 1) << Field;
       Info.Note(Field->getLocation(), diag::note_declared_at);
       return true;
     }
@@ -2272,10 +2367,10 @@
     return handler.failed();
   if (Sub.isOnePastTheEnd()) {
     if (Info.getLangOpts().CPlusPlus11)
-      Info.Diag(E, diag::note_constexpr_access_past_end)
+      Info.FFDiag(E, diag::note_constexpr_access_past_end)
         << handler.AccessKind;
     else
-      Info.Diag(E);
+      Info.FFDiag(E);
     return handler.failed();
   }
 
@@ -2287,7 +2382,7 @@
   for (unsigned I = 0, N = Sub.Entries.size(); /**/; ++I) {
     if (O->isUninit()) {
       if (!Info.checkingPotentialConstantExpression())
-        Info.Diag(E, diag::note_constexpr_access_uninit) << handler.AccessKind;
+        Info.FFDiag(E, diag::note_constexpr_access_uninit) << handler.AccessKind;
       return handler.failed();
     }
 
@@ -2322,10 +2417,10 @@
         // Note, it should not be possible to form a pointer with a valid
         // designator which points more than one past the end of the array.
         if (Info.getLangOpts().CPlusPlus11)
-          Info.Diag(E, diag::note_constexpr_access_past_end)
+          Info.FFDiag(E, diag::note_constexpr_access_past_end)
             << handler.AccessKind;
         else
-          Info.Diag(E);
+          Info.FFDiag(E);
         return handler.failed();
       }
 
@@ -2355,10 +2450,10 @@
       uint64_t Index = Sub.Entries[I].ArrayIndex;
       if (Index > 1) {
         if (Info.getLangOpts().CPlusPlus11)
-          Info.Diag(E, diag::note_constexpr_access_past_end)
+          Info.FFDiag(E, diag::note_constexpr_access_past_end)
             << handler.AccessKind;
         else
-          Info.Diag(E);
+          Info.FFDiag(E);
         return handler.failed();
       }
 
@@ -2378,7 +2473,7 @@
       }
     } else if (const FieldDecl *Field = getAsField(Sub.Entries[I])) {
       if (Field->isMutable() && handler.AccessKind == AK_Read) {
-        Info.Diag(E, diag::note_constexpr_ltor_mutable, 1)
+        Info.FFDiag(E, diag::note_constexpr_ltor_mutable, 1)
           << Field;
         Info.Note(Field->getLocation(), diag::note_declared_at);
         return handler.failed();
@@ -2390,7 +2485,7 @@
         const FieldDecl *UnionField = O->getUnionField();
         if (!UnionField ||
             UnionField->getCanonicalDecl() != Field->getCanonicalDecl()) {
-          Info.Diag(E, diag::note_constexpr_access_inactive_union_member)
+          Info.FFDiag(E, diag::note_constexpr_access_inactive_union_member)
             << handler.AccessKind << Field << !UnionField << UnionField;
           return handler.failed();
         }
@@ -2406,11 +2501,11 @@
       if (ObjType.isVolatileQualified()) {
         if (Info.getLangOpts().CPlusPlus) {
           // FIXME: Include a description of the path to the volatile subobject.
-          Info.Diag(E, diag::note_constexpr_access_volatile_obj, 1)
+          Info.FFDiag(E, diag::note_constexpr_access_volatile_obj, 1)
             << handler.AccessKind << 2 << Field;
           Info.Note(Field->getLocation(), diag::note_declared_at);
         } else {
-          Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+          Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
         }
         return handler.failed();
       }
@@ -2482,7 +2577,7 @@
   bool checkConst(QualType QT) {
     // Assigning to a const object has undefined behavior.
     if (QT.isConstQualified()) {
-      Info.Diag(E, diag::note_constexpr_modify_const_type) << QT;
+      Info.FFDiag(E, diag::note_constexpr_modify_const_type) << QT;
       return false;
     }
     return true;
@@ -2501,7 +2596,7 @@
       return false;
     if (!NewVal.isInt()) {
       // Maybe trying to write a cast pointer value into a complex?
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
     Value = NewVal.getInt();
@@ -2592,7 +2687,7 @@
                                          AccessKinds AK, const LValue &LVal,
                                          QualType LValType) {
   if (!LVal.Base) {
-    Info.Diag(E, diag::note_constexpr_access_null) << AK;
+    Info.FFDiag(E, diag::note_constexpr_access_null) << AK;
     return CompleteObject();
   }
 
@@ -2600,7 +2695,7 @@
   if (LVal.CallIndex) {
     Frame = Info.getCallFrame(LVal.CallIndex);
     if (!Frame) {
-      Info.Diag(E, diag::note_constexpr_lifetime_ended, 1)
+      Info.FFDiag(E, diag::note_constexpr_lifetime_ended, 1)
         << AK << LVal.Base.is<const ValueDecl*>();
       NoteLValueLocation(Info, LVal.Base);
       return CompleteObject();
@@ -2613,10 +2708,10 @@
   // semantics.
   if (LValType.isVolatileQualified()) {
     if (Info.getLangOpts().CPlusPlus)
-      Info.Diag(E, diag::note_constexpr_access_volatile_type)
+      Info.FFDiag(E, diag::note_constexpr_access_volatile_type)
         << AK << LValType;
     else
-      Info.Diag(E);
+      Info.FFDiag(E);
     return CompleteObject();
   }
 
@@ -2638,18 +2733,18 @@
         VD = VDef;
     }
     if (!VD || VD->isInvalidDecl()) {
-      Info.Diag(E);
+      Info.FFDiag(E);
       return CompleteObject();
     }
 
     // Accesses of volatile-qualified objects are not allowed.
     if (BaseType.isVolatileQualified()) {
       if (Info.getLangOpts().CPlusPlus) {
-        Info.Diag(E, diag::note_constexpr_access_volatile_obj, 1)
+        Info.FFDiag(E, diag::note_constexpr_access_volatile_obj, 1)
           << AK << 1 << VD;
         Info.Note(VD->getLocation(), diag::note_declared_at);
       } else {
-        Info.Diag(E);
+        Info.FFDiag(E);
       }
       return CompleteObject();
     }
@@ -2664,17 +2759,20 @@
         // evaluation.
       } else if (AK != AK_Read) {
         // All the remaining cases only permit reading.
-        Info.Diag(E, diag::note_constexpr_modify_global);
+        Info.FFDiag(E, diag::note_constexpr_modify_global);
         return CompleteObject();
       } else if (VD->isConstexpr()) {
         // OK, we can read this variable.
       } else if (BaseType->isIntegralOrEnumerationType()) {
-        if (!BaseType.isConstQualified()) {
+        // In OpenCL if a variable is in constant address space it is a const value.
+        if (!(BaseType.isConstQualified() ||
+              (Info.getLangOpts().OpenCL &&
+               BaseType.getAddressSpace() == LangAS::opencl_constant))) {
           if (Info.getLangOpts().CPlusPlus) {
-            Info.Diag(E, diag::note_constexpr_ltor_non_const_int, 1) << VD;
+            Info.FFDiag(E, diag::note_constexpr_ltor_non_const_int, 1) << VD;
             Info.Note(VD->getLocation(), diag::note_declared_at);
           } else {
-            Info.Diag(E);
+            Info.FFDiag(E);
           }
           return CompleteObject();
         }
@@ -2690,11 +2788,15 @@
         }
       } else {
         // FIXME: Allow folding of values of any literal type in all languages.
-        if (Info.getLangOpts().CPlusPlus11) {
-          Info.Diag(E, diag::note_constexpr_ltor_non_constexpr, 1) << VD;
+        if (Info.checkingPotentialConstantExpression() &&
+            VD->getType().isConstQualified() && !VD->hasDefinition(Info.Ctx)) {
+          // The definition of this variable could be constexpr. We can't
+          // access it right now, but may be able to in future.
+        } else if (Info.getLangOpts().CPlusPlus11) {
+          Info.FFDiag(E, diag::note_constexpr_ltor_non_constexpr, 1) << VD;
           Info.Note(VD->getLocation(), diag::note_declared_at);
         } else {
-          Info.Diag(E);
+          Info.FFDiag(E);
         }
         return CompleteObject();
       }
@@ -2730,7 +2832,7 @@
         if (!(BaseType.isConstQualified() &&
               BaseType->isIntegralOrEnumerationType()) &&
             !(VD && VD->getCanonicalDecl() == ED->getCanonicalDecl())) {
-          Info.Diag(E, diag::note_constexpr_access_static_temporary, 1) << AK;
+          Info.FFDiag(E, diag::note_constexpr_access_static_temporary, 1) << AK;
           Info.Note(MTE->getExprLoc(), diag::note_constexpr_temporary_here);
           return CompleteObject();
         }
@@ -2738,7 +2840,7 @@
         BaseVal = Info.Ctx.getMaterializedTemporaryValue(MTE, false);
         assert(BaseVal && "got reference to unevaluated temporary");
       } else {
-        Info.Diag(E);
+        Info.FFDiag(E);
         return CompleteObject();
       }
     } else {
@@ -2749,11 +2851,11 @@
     // Volatile temporary objects cannot be accessed in constant expressions.
     if (BaseType.isVolatileQualified()) {
       if (Info.getLangOpts().CPlusPlus) {
-        Info.Diag(E, diag::note_constexpr_access_volatile_obj, 1)
+        Info.FFDiag(E, diag::note_constexpr_access_volatile_obj, 1)
           << AK << 0;
         Info.Note(Base->getExprLoc(), diag::note_constexpr_temporary_here);
       } else {
-        Info.Diag(E);
+        Info.FFDiag(E);
       }
       return CompleteObject();
     }
@@ -2769,12 +2871,13 @@
   }
 
   // In C++1y, we can't safely access any mutable state when we might be
-  // evaluating after an unmodeled side effect or an evaluation failure.
+  // evaluating after an unmodeled side effect.
   //
   // FIXME: Not all local state is mutable. Allow local constant subobjects
   // to be read here (but take care with 'mutable' fields).
-  if (Frame && Info.getLangOpts().CPlusPlus14 &&
-      (Info.EvalStatus.HasSideEffects || Info.keepEvaluatingAfterFailure()))
+  if ((Frame && Info.getLangOpts().CPlusPlus14 &&
+       Info.EvalStatus.HasSideEffects) ||
+      (AK != AK_Read && Info.IsSpeculativelyEvaluating))
     return CompleteObject();
 
   return CompleteObject(BaseVal, BaseType);
@@ -2806,7 +2909,7 @@
       // an ICE in C, so this only matters for fold.
       assert(!Info.getLangOpts().CPlusPlus && "lvalue compound literal in c++?");
       if (Type.isVolatileQualified()) {
-        Info.Diag(Conv);
+        Info.FFDiag(Conv);
         return false;
       }
       APValue Lit;
@@ -2835,7 +2938,7 @@
     return false;
 
   if (!Info.getLangOpts().CPlusPlus14) {
-    Info.Diag(E);
+    Info.FFDiag(E);
     return false;
   }
 
@@ -2863,7 +2966,7 @@
   bool checkConst(QualType QT) {
     // Assigning to a const object has undefined behavior.
     if (QT.isConstQualified()) {
-      Info.Diag(E, diag::note_constexpr_modify_const_type) << QT;
+      Info.FFDiag(E, diag::note_constexpr_modify_const_type) << QT;
       return false;
     }
     return true;
@@ -2879,13 +2982,13 @@
     case APValue::ComplexInt:
     case APValue::ComplexFloat:
       // FIXME: Implement complex compound assignment.
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     case APValue::LValue:
       return foundPointer(Subobj, SubobjType);
     default:
       // FIXME: can this happen?
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
   }
@@ -2896,7 +2999,7 @@
     if (!SubobjType->isIntegerType() || !RHS.isInt()) {
       // We don't support compound assignment on integer-cast-to-pointer
       // values.
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
 
@@ -2924,7 +3027,7 @@
 
     if (PointeeType.isNull() || !RHS.isInt() ||
         (Opcode != BO_Add && Opcode != BO_Sub)) {
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
 
@@ -2956,7 +3059,7 @@
     return false;
 
   if (!Info.getLangOpts().CPlusPlus14) {
-    Info.Diag(E);
+    Info.FFDiag(E);
     return false;
   }
 
@@ -2978,7 +3081,7 @@
   bool checkConst(QualType QT) {
     // Assigning to a const object has undefined behavior.
     if (QT.isConstQualified()) {
-      Info.Diag(E, diag::note_constexpr_modify_const_type) << QT;
+      Info.FFDiag(E, diag::note_constexpr_modify_const_type) << QT;
       return false;
     }
     return true;
@@ -3010,7 +3113,7 @@
       return foundPointer(Subobj, SubobjType);
     default:
       // FIXME: can this happen?
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
   }
@@ -3021,7 +3124,7 @@
     if (!SubobjType->isIntegerType()) {
       // We don't support increment / decrement on integer-cast-to-pointer
       // values.
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
 
@@ -3080,7 +3183,7 @@
     if (const PointerType *PT = SubobjType->getAs<PointerType>())
       PointeeType = PT->getPointeeType();
     else {
-      Info.Diag(E);
+      Info.FFDiag(E);
       return false;
     }
 
@@ -3105,7 +3208,7 @@
     return false;
 
   if (!Info.getLangOpts().CPlusPlus14) {
-    Info.Diag(E);
+    Info.FFDiag(E);
     return false;
   }
 
@@ -3127,7 +3230,7 @@
   if (Object->getType()->isLiteralType(Info.Ctx))
     return EvaluateTemporary(Object, This, Info);
 
-  Info.Diag(Object, diag::note_constexpr_nonliteral) << Object->getType();
+  Info.FFDiag(Object, diag::note_constexpr_nonliteral) << Object->getType();
   return false;
 }
 
@@ -3155,7 +3258,7 @@
   // member value, the behavior is undefined.
   if (!MemPtr.getDecl()) {
     // FIXME: Specific diagnostic.
-    Info.Diag(RHS);
+    Info.FFDiag(RHS);
     return nullptr;
   }
 
@@ -3165,7 +3268,7 @@
     // derived-to-base path for the member pointer.
     if (LV.Designator.MostDerivedPathLength + MemPtr.Path.size() >
         LV.Designator.Entries.size()) {
-      Info.Diag(RHS);
+      Info.FFDiag(RHS);
       return nullptr;
     }
     unsigned PathLengthToMember =
@@ -3175,7 +3278,7 @@
           LV.Designator.Entries[PathLengthToMember + I]);
       const CXXRecordDecl *MPDecl = MemPtr.Path[I];
       if (LVDecl->getCanonicalDecl() != MPDecl->getCanonicalDecl()) {
-        Info.Diag(RHS);
+        Info.FFDiag(RHS);
         return nullptr;
       }
     }
@@ -3231,7 +3334,7 @@
   assert(BO->getOpcode() == BO_PtrMemD || BO->getOpcode() == BO_PtrMemI);
 
   if (!EvaluateObjectArgument(Info, BO->getLHS(), LV)) {
-    if (Info.keepEvaluatingAfterFailure()) {
+    if (Info.noteFailure()) {
       MemberPtr MemPtr;
       EvaluateMemberPointer(BO->getRHS(), MemPtr, Info);
     }
@@ -3297,38 +3400,51 @@
 };
 }
 
-static bool EvaluateDecl(EvalInfo &Info, const Decl *D) {
-  if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
-    // We don't need to evaluate the initializer for a static local.
-    if (!VD->hasLocalStorage())
-      return true;
+static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) {
+  // We don't need to evaluate the initializer for a static local.
+  if (!VD->hasLocalStorage())
+    return true;
 
-    LValue Result;
-    Result.set(VD, Info.CurrentCall->Index);
-    APValue &Val = Info.CurrentCall->createTemporary(VD, true);
+  LValue Result;
+  Result.set(VD, Info.CurrentCall->Index);
+  APValue &Val = Info.CurrentCall->createTemporary(VD, true);
 
-    const Expr *InitE = VD->getInit();
-    if (!InitE) {
-      Info.Diag(D->getLocStart(), diag::note_constexpr_uninitialized)
-        << false << VD->getType();
-      Val = APValue();
-      return false;
-    }
+  const Expr *InitE = VD->getInit();
+  if (!InitE) {
+    Info.FFDiag(VD->getLocStart(), diag::note_constexpr_uninitialized)
+      << false << VD->getType();
+    Val = APValue();
+    return false;
+  }
 
-    if (InitE->isValueDependent())
-      return false;
+  if (InitE->isValueDependent())
+    return false;
 
-    if (!EvaluateInPlace(Val, Info, Result, InitE)) {
-      // Wipe out any partially-computed value, to allow tracking that this
-      // evaluation failed.
-      Val = APValue();
-      return false;
-    }
+  if (!EvaluateInPlace(Val, Info, Result, InitE)) {
+    // Wipe out any partially-computed value, to allow tracking that this
+    // evaluation failed.
+    Val = APValue();
+    return false;
   }
 
   return true;
 }
 
+static bool EvaluateDecl(EvalInfo &Info, const Decl *D) {
+  bool OK = true;
+
+  if (const VarDecl *VD = dyn_cast<VarDecl>(D))
+    OK &= EvaluateVarDecl(Info, VD);
+
+  if (const DecompositionDecl *DD = dyn_cast<DecompositionDecl>(D))
+    for (auto *BD : DD->bindings())
+      if (auto *VD = BD->getHoldingVar())
+        OK &= EvaluateDecl(Info, VD);
+
+  return OK;
+}
+
+
 /// Evaluate a condition (either a variable declaration or an expression).
 static bool EvaluateCond(EvalInfo &Info, const VarDecl *CondDecl,
                          const Expr *Cond, bool &Result) {
@@ -3338,6 +3454,7 @@
   return EvaluateAsBooleanCondition(Cond, Result, Info);
 }
 
+namespace {
 /// \brief A location where the result (returned value) of evaluating a
 /// statement should be stored.
 struct StmtResult {
@@ -3346,6 +3463,7 @@
   /// The location containing the result, if any (used to support RVO).
   const LValue *Slot;
 };
+}
 
 static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info,
                                    const Stmt *S,
@@ -3379,6 +3497,11 @@
   APSInt Value;
   {
     FullExpressionRAII Scope(Info);
+    if (const Stmt *Init = SS->getInit()) {
+      EvalStmtResult ESR = EvaluateStmt(Result, Info, Init);
+      if (ESR != ESR_Succeeded)
+        return ESR;
+    }
     if (SS->getConditionVariable() &&
         !EvaluateDecl(Info, SS->getConditionVariable()))
       return ESR_Failed;
@@ -3421,7 +3544,7 @@
   case ESR_CaseNotFound:
     // This can only happen if the switch case is nested within a statement
     // expression. We have no intention of supporting that.
-    Info.Diag(Found->getLocStart(), diag::note_constexpr_stmt_expr_unsupported);
+    Info.FFDiag(Found->getLocStart(), diag::note_constexpr_stmt_expr_unsupported);
     return ESR_Failed;
   }
   llvm_unreachable("Invalid EvalStmtResult!");
@@ -3512,7 +3635,7 @@
       return ESR_Succeeded;
     }
 
-    Info.Diag(S->getLocStart());
+    Info.FFDiag(S->getLocStart());
     return ESR_Failed;
 
   case Stmt::NullStmtClass:
@@ -3525,7 +3648,7 @@
       // FIXME: This isn't quite right; if we're performing aggregate
       // initialization, each braced subexpression is its own full-expression.
       FullExpressionRAII Scope(Info);
-      if (!EvaluateDecl(Info, DclIt) && !Info.keepEvaluatingAfterFailure())
+      if (!EvaluateDecl(Info, DclIt) && !Info.noteFailure())
         return ESR_Failed;
     }
     return ESR_Succeeded;
@@ -3561,6 +3684,11 @@
 
     // Evaluate the condition, as either a var decl or as an expression.
     BlockScopeRAII Scope(Info);
+    if (const Stmt *Init = IS->getInit()) {
+      EvalStmtResult ESR = EvaluateStmt(Result, Info, Init);
+      if (ESR != ESR_Succeeded)
+        return ESR;
+    }
     bool Cond;
     if (!EvaluateCond(Info, IS->getConditionVariable(), IS->getCond(), Cond))
       return ESR_Failed;
@@ -3647,7 +3775,10 @@
       return ESR;
 
     // Create the __begin and __end iterators.
-    ESR = EvaluateStmt(Result, Info, FS->getBeginEndStmt());
+    ESR = EvaluateStmt(Result, Info, FS->getBeginStmt());
+    if (ESR != ESR_Succeeded)
+      return ESR;
+    ESR = EvaluateStmt(Result, Info, FS->getEndStmt());
     if (ESR != ESR_Succeeded)
       return ESR;
 
@@ -3736,7 +3867,8 @@
 /// expression.
 static bool CheckConstexprFunction(EvalInfo &Info, SourceLocation CallLoc,
                                    const FunctionDecl *Declaration,
-                                   const FunctionDecl *Definition) {
+                                   const FunctionDecl *Definition,
+                                   const Stmt *Body) {
   // Potential constant expressions can contain calls to declared, but not yet
   // defined, constexpr functions.
   if (Info.checkingPotentialConstantExpression() && !Definition &&
@@ -3749,19 +3881,34 @@
     return false;
 
   // Can we evaluate this function call?
-  if (Definition && Definition->isConstexpr() && !Definition->isInvalidDecl())
+  if (Definition && Definition->isConstexpr() &&
+      !Definition->isInvalidDecl() && Body)
     return true;
 
   if (Info.getLangOpts().CPlusPlus11) {
     const FunctionDecl *DiagDecl = Definition ? Definition : Declaration;
-    // FIXME: If DiagDecl is an implicitly-declared special member function, we
-    // should be much more explicit about why it's not constexpr.
-    Info.Diag(CallLoc, diag::note_constexpr_invalid_function, 1)
-      << DiagDecl->isConstexpr() << isa<CXXConstructorDecl>(DiagDecl)
-      << DiagDecl;
+    
+    // If this function is not constexpr because it is an inherited
+    // non-constexpr constructor, diagnose that directly.
+    auto *CD = dyn_cast<CXXConstructorDecl>(DiagDecl);
+    if (CD && CD->isInheritingConstructor()) {
+      auto *Inherited = CD->getInheritedConstructor().getConstructor();
+      if (!Inherited->isConstexpr()) 
+        DiagDecl = CD = Inherited;
+    }
+
+    // FIXME: If DiagDecl is an implicitly-declared special member function
+    // or an inheriting constructor, we should be much more explicit about why
+    // it's not constexpr.
+    if (CD && CD->isInheritingConstructor())
+      Info.FFDiag(CallLoc, diag::note_constexpr_invalid_inhctor, 1)
+        << CD->getInheritedConstructor().getConstructor()->getParent();
+    else
+      Info.FFDiag(CallLoc, diag::note_constexpr_invalid_function, 1)
+        << DiagDecl->isConstexpr() << (bool)CD << DiagDecl;
     Info.Note(DiagDecl->getLocation(), diag::note_declared_at);
   } else {
-    Info.Diag(CallLoc, diag::note_invalid_subexpr_in_const_expr);
+    Info.FFDiag(CallLoc, diag::note_invalid_subexpr_in_const_expr);
   }
   return false;
 }
@@ -3795,7 +3942,7 @@
     if (!Evaluate(ArgValues[I - Args.begin()], Info, *I)) {
       // If we're checking for a potential constant expression, evaluate all
       // initializers even if some of them fail.
-      if (!Info.keepEvaluatingAfterFailure())
+      if (!Info.noteFailure())
         return false;
       Success = false;
     }
@@ -3848,37 +3995,34 @@
   if (ESR == ESR_Succeeded) {
     if (Callee->getReturnType()->isVoidType())
       return true;
-    Info.Diag(Callee->getLocEnd(), diag::note_constexpr_no_return);
+    Info.FFDiag(Callee->getLocEnd(), diag::note_constexpr_no_return);
   }
   return ESR == ESR_Returned;
 }
 
 /// Evaluate a constructor call.
-static bool HandleConstructorCall(SourceLocation CallLoc, const LValue &This,
-                                  ArrayRef<const Expr*> Args,
+static bool HandleConstructorCall(const Expr *E, const LValue &This,
+                                  APValue *ArgValues,
                                   const CXXConstructorDecl *Definition,
                                   EvalInfo &Info, APValue &Result) {
-  ArgVector ArgValues(Args.size());
-  if (!EvaluateArgs(Args, ArgValues, Info))
-    return false;
-
+  SourceLocation CallLoc = E->getExprLoc();
   if (!Info.CheckCallLimit(CallLoc))
     return false;
 
   const CXXRecordDecl *RD = Definition->getParent();
   if (RD->getNumVBases()) {
-    Info.Diag(CallLoc, diag::note_constexpr_virtual_base) << RD;
+    Info.FFDiag(CallLoc, diag::note_constexpr_virtual_base) << RD;
     return false;
   }
 
-  CallStackFrame Frame(Info, CallLoc, Definition, &This, ArgValues.data());
+  CallStackFrame Frame(Info, CallLoc, Definition, &This, ArgValues);
 
   // FIXME: Creating an APValue just to hold a nonexistent return value is
   // wasteful.
   APValue RetVal;
   StmtResult Ret = {RetVal, nullptr};
 
-  // If it's a delegating constructor, just delegate.
+  // If it's a delegating constructor, delegate.
   if (Definition->isDelegatingConstructor()) {
     CXXConstructorDecl::init_const_iterator I = Definition->init_begin();
     {
@@ -3902,8 +4046,9 @@
        (Definition->isTrivial() && hasFields(Definition->getParent())))) {
     LValue RHS;
     RHS.setFrom(Info.Ctx, ArgValues[0]);
-    return handleLValueToRValueConversion(Info, Args[0], Args[0]->getType(),
-                                          RHS, Result);
+    return handleLValueToRValueConversion(
+        Info, E, Definition->getParamDecl(0)->getType().getNonReferenceType(),
+        RHS, Result);
   }
 
   // Reserve space for the struct members.
@@ -3987,7 +4132,7 @@
                                                           *Value, FD))) {
       // If we're checking for a potential constant expression, evaluate all
       // initializers even if some of them fail.
-      if (!Info.keepEvaluatingAfterFailure())
+      if (!Info.noteFailure())
         return false;
       Success = false;
     }
@@ -3997,6 +4142,18 @@
          EvaluateStmt(Ret, Info, Definition->getBody()) != ESR_Failed;
 }
 
+static bool HandleConstructorCall(const Expr *E, const LValue &This,
+                                  ArrayRef<const Expr*> Args,
+                                  const CXXConstructorDecl *Definition,
+                                  EvalInfo &Info, APValue &Result) {
+  ArgVector ArgValues(Args.size());
+  if (!EvaluateArgs(Args, ArgValues, Info))
+    return false;
+
+  return HandleConstructorCall(E, This, ArgValues.data(), Definition,
+                               Info, Result);
+}
+
 //===----------------------------------------------------------------------===//
 // Generic Evaluation
 //===----------------------------------------------------------------------===//
@@ -4022,14 +4179,16 @@
     assert(Info.checkingPotentialConstantExpression());
 
     // Speculatively evaluate both arms.
+    SmallVector<PartialDiagnosticAt, 8> Diag;
     {
-      SmallVector<PartialDiagnosticAt, 8> Diag;
       SpeculativeEvaluationRAII Speculate(Info, &Diag);
-
       StmtVisitorTy::Visit(E->getFalseExpr());
       if (Diag.empty())
         return;
+    }
 
+    {
+      SpeculativeEvaluationRAII Speculate(Info, &Diag);
       Diag.clear();
       StmtVisitorTy::Visit(E->getTrueExpr());
       if (Diag.empty())
@@ -4044,7 +4203,7 @@
   bool HandleConditionalOperator(const ConditionalOperator *E) {
     bool BoolResult;
     if (!EvaluateAsBooleanCondition(E->getCond(), BoolResult, Info)) {
-      if (Info.checkingPotentialConstantExpression())
+      if (Info.checkingPotentialConstantExpression() && Info.noteFailure())
         CheckPotentialConstantConditional(E);
       return false;
     }
@@ -4072,7 +4231,7 @@
   /// Report an evaluation error. This should only be called when an error is
   /// first discovered. When propagating an error, just return false.
   bool Error(const Expr *E, diag::kind D) {
-    Info.Diag(E, D);
+    Info.FFDiag(E, D);
     return false;
   }
   bool Error(const Expr *E) {
@@ -4275,7 +4434,7 @@
     const FunctionDecl *Definition = nullptr;
     Stmt *Body = FD->getBody(Definition);
 
-    if (!CheckConstexprFunction(Info, E->getExprLoc(), FD, Definition) ||
+    if (!CheckConstexprFunction(Info, E->getExprLoc(), FD, Definition, Body) ||
         !HandleFunctionCall(E->getExprLoc(), Definition, This, Args, Body, Info,
                             Result, ResultSlot))
       return false;
@@ -4397,7 +4556,7 @@
       if (BI + 1 == BE) {
         const Expr *FinalExpr = dyn_cast<Expr>(*BI);
         if (!FinalExpr) {
-          Info.Diag((*BI)->getLocStart(),
+          Info.FFDiag((*BI)->getLocStart(),
                     diag::note_constexpr_stmt_expr_unsupported);
           return false;
         }
@@ -4412,7 +4571,7 @@
         // 'break', or 'continue', it would be nice to propagate that to
         // the outer statement evaluation rather than bailing out.
         if (ESR != ESR_Failed)
-          Info.Diag((*BI)->getLocStart(),
+          Info.FFDiag((*BI)->getLocStart(),
                     diag::note_constexpr_stmt_expr_unsupported);
         return false;
       }
@@ -4425,6 +4584,15 @@
   void VisitIgnoredValue(const Expr *E) {
     EvaluateIgnoredValue(Info, E);
   }
+
+  /// Potentially visit a MemberExpr's base expression.
+  void VisitIgnoredBaseExpression(const Expr *E) {
+    // While MSVC doesn't evaluate the base expression, it does diagnose the
+    // presence of side-effecting behavior.
+    if (Info.getLangOpts().MSVCCompat && !E->HasSideEffects(Info.Ctx))
+      return;
+    VisitIgnoredValue(E);
+  }
 };
 
 }
@@ -4630,6 +4798,8 @@
     return Success(FD);
   if (const VarDecl *VD = dyn_cast<VarDecl>(E->getDecl()))
     return VisitVarDecl(E, VD);
+  if (const BindingDecl *BD = dyn_cast<BindingDecl>(E->getDecl()))
+    return Visit(BD->getBinding());
   return Error(E);
 }
 
@@ -4651,7 +4821,7 @@
     return false;
   if (V->isUninit()) {
     if (!Info.checkingPotentialConstantExpression())
-      Info.Diag(E, diag::note_constexpr_use_uninit_reference);
+      Info.FFDiag(E, diag::note_constexpr_use_uninit_reference);
     return false;
   }
   return Success(*V, E);
@@ -4735,7 +4905,7 @@
   if (!E->isPotentiallyEvaluated())
     return Success(E);
 
-  Info.Diag(E, diag::note_constexpr_typeid_polymorphic)
+  Info.FFDiag(E, diag::note_constexpr_typeid_polymorphic)
     << E->getExprOperand()->getType()
     << E->getExprOperand()->getSourceRange();
   return false;
@@ -4748,14 +4918,14 @@
 bool LValueExprEvaluator::VisitMemberExpr(const MemberExpr *E) {
   // Handle static data members.
   if (const VarDecl *VD = dyn_cast<VarDecl>(E->getMemberDecl())) {
-    VisitIgnoredValue(E->getBase());
+    VisitIgnoredBaseExpression(E->getBase());
     return VisitVarDecl(E, VD);
   }
 
   // Handle static member functions.
   if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(E->getMemberDecl())) {
     if (MD->isStatic()) {
-      VisitIgnoredValue(E->getBase());
+      VisitIgnoredBaseExpression(E->getBase());
       return Success(MD);
     }
   }
@@ -4823,7 +4993,7 @@
 
   // The overall lvalue result is the result of evaluating the LHS.
   if (!this->Visit(CAO->getLHS())) {
-    if (Info.keepEvaluatingAfterFailure())
+    if (Info.noteFailure())
       Evaluate(RHS, this->Info, CAO->getRHS());
     return false;
   }
@@ -4844,7 +5014,7 @@
   APValue NewVal;
 
   if (!this->Visit(E->getLHS())) {
-    if (Info.keepEvaluatingAfterFailure())
+    if (Info.noteFailure())
       Evaluate(NewVal, this->Info, E->getRHS());
     return false;
   }
@@ -4903,9 +5073,9 @@
       return false;
     if (!Info.CurrentCall->This) {
       if (Info.getLangOpts().CPlusPlus11)
-        Info.Diag(E, diag::note_constexpr_this) << E->isImplicit();
+        Info.FFDiag(E, diag::note_constexpr_this) << E->isImplicit();
       else
-        Info.Diag(E);
+        Info.FFDiag(E);
       return false;
     }
     Result = *Info.CurrentCall->This;
@@ -4932,7 +5102,7 @@
     std::swap(PExp, IExp);
 
   bool EvalPtrOK = EvaluatePointer(PExp, Result, Info);
-  if (!EvalPtrOK && !Info.keepEvaluatingAfterFailure())
+  if (!EvalPtrOK && !Info.noteFailure())
     return false;
 
   llvm::APSInt Offset;
@@ -5265,14 +5435,21 @@
       Result = V;
       return true;
     }
-    bool ZeroInitialization(const Expr *E);
+    bool ZeroInitialization(const Expr *E) {
+      return ZeroInitialization(E, E->getType());
+    }
+    bool ZeroInitialization(const Expr *E, QualType T);
 
     bool VisitCallExpr(const CallExpr *E) {
       return handleCallExpr(E, Result, &This);
     }
     bool VisitCastExpr(const CastExpr *E);
     bool VisitInitListExpr(const InitListExpr *E);
-    bool VisitCXXConstructExpr(const CXXConstructExpr *E);
+    bool VisitCXXConstructExpr(const CXXConstructExpr *E) {
+      return VisitCXXConstructExpr(E, E->getType());
+    }
+    bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E);
+    bool VisitCXXConstructExpr(const CXXConstructExpr *E, QualType T);
     bool VisitCXXStdInitializerListExpr(const CXXStdInitializerListExpr *E);
   };
 }
@@ -5327,8 +5504,8 @@
   return true;
 }
 
-bool RecordExprEvaluator::ZeroInitialization(const Expr *E) {
-  const RecordDecl *RD = E->getType()->castAs<RecordType>()->getDecl();
+bool RecordExprEvaluator::ZeroInitialization(const Expr *E, QualType T) {
+  const RecordDecl *RD = T->castAs<RecordType>()->getDecl();
   if (RD->isInvalidDecl()) return false;
   if (RD->isUnion()) {
     // C++11 [dcl.init]p5: If T is a (possibly cv-qualified) union type, the
@@ -5348,7 +5525,7 @@
   }
 
   if (isa<CXXRecordDecl>(RD) && cast<CXXRecordDecl>(RD)->getNumVBases()) {
-    Info.Diag(E, diag::note_constexpr_virtual_base) << RD;
+    Info.FFDiag(E, diag::note_constexpr_virtual_base) << RD;
     return false;
   }
 
@@ -5417,12 +5594,34 @@
     return EvaluateInPlace(Result.getUnionValue(), Info, Subobject, InitExpr);
   }
 
-  assert((!isa<CXXRecordDecl>(RD) || !cast<CXXRecordDecl>(RD)->getNumBases()) &&
-         "initializer list for class with base classes");
-  Result = APValue(APValue::UninitStruct(), 0,
-                   std::distance(RD->field_begin(), RD->field_end()));
+  auto *CXXRD = dyn_cast<CXXRecordDecl>(RD);
+  if (Result.isUninit())
+    Result = APValue(APValue::UninitStruct(), CXXRD ? CXXRD->getNumBases() : 0,
+                     std::distance(RD->field_begin(), RD->field_end()));
   unsigned ElementNo = 0;
   bool Success = true;
+
+  // Initialize base classes.
+  if (CXXRD) {
+    for (const auto &Base : CXXRD->bases()) {
+      assert(ElementNo < E->getNumInits() && "missing init for base class");
+      const Expr *Init = E->getInit(ElementNo);
+
+      LValue Subobject = This;
+      if (!HandleLValueBase(Info, Init, Subobject, CXXRD, &Base))
+        return false;
+
+      APValue &FieldVal = Result.getStructBase(ElementNo);
+      if (!EvaluateInPlace(FieldVal, Info, Subobject, Init)) {
+        if (!Info.noteFailure())
+          return false;
+        Success = false;
+      }
+      ++ElementNo;
+    }
+  }
+
+  // Initialize members.
   for (const auto *Field : RD->fields()) {
     // Anonymous bit-fields are not considered members of the class for
     // purposes of aggregate initialization.
@@ -5452,7 +5651,7 @@
     if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) ||
         (Field->isBitField() && !truncateBitfieldValue(Info, Init,
                                                        FieldVal, Field))) {
-      if (!Info.keepEvaluatingAfterFailure())
+      if (!Info.noteFailure())
         return false;
       Success = false;
     }
@@ -5461,7 +5660,10 @@
   return Success;
 }
 
-bool RecordExprEvaluator::VisitCXXConstructExpr(const CXXConstructExpr *E) {
+bool RecordExprEvaluator::VisitCXXConstructExpr(const CXXConstructExpr *E,
+                                                QualType T) {
+  // Note that E's type is not necessarily the type of our class here; we might
+  // be initializing an array element instead.
   const CXXConstructorDecl *FD = E->getConstructor();
   if (FD->isInvalidDecl() || FD->getParent()->isInvalidDecl()) return false;
 
@@ -5479,13 +5681,13 @@
     //     lifetimes of all the base subobjects (there can be no data member
     //     subobjects in this case) per [basic.life]p1.
     // Either way, ZeroInitialization is appropriate.
-    return ZeroInitialization(E);
+    return ZeroInitialization(E, T);
   }
 
   const FunctionDecl *Definition = nullptr;
-  FD->getBody(Definition);
+  auto Body = FD->getBody(Definition);
 
-  if (!CheckConstexprFunction(Info, E->getExprLoc(), FD, Definition))
+  if (!CheckConstexprFunction(Info, E->getExprLoc(), FD, Definition, Body))
     return false;
 
   // Avoid materializing a temporary for an elidable copy/move constructor.
@@ -5494,11 +5696,33 @@
           = dyn_cast<MaterializeTemporaryExpr>(E->getArg(0)))
       return Visit(ME->GetTemporaryExpr());
 
-  if (ZeroInit && !ZeroInitialization(E))
+  if (ZeroInit && !ZeroInitialization(E, T))
     return false;
 
   auto Args = llvm::makeArrayRef(E->getArgs(), E->getNumArgs());
-  return HandleConstructorCall(E->getExprLoc(), This, Args,
+  return HandleConstructorCall(E, This, Args,
+                               cast<CXXConstructorDecl>(Definition), Info,
+                               Result);
+}
+
+bool RecordExprEvaluator::VisitCXXInheritedCtorInitExpr(
+    const CXXInheritedCtorInitExpr *E) {
+  if (!Info.CurrentCall) {
+    assert(Info.checkingPotentialConstantExpression());
+    return false;
+  }
+
+  const CXXConstructorDecl *FD = E->getConstructor();
+  if (FD->isInvalidDecl() || FD->getParent()->isInvalidDecl())
+    return false;
+
+  const FunctionDecl *Definition = nullptr;
+  auto Body = FD->getBody(Definition);
+
+  if (!CheckConstexprFunction(Info, E->getExprLoc(), FD, Definition, Body))
+    return false;
+
+  return HandleConstructorCall(E, This, Info.CurrentCall->Arguments,
                                cast<CXXConstructorDecl>(Definition), Info,
                                Result);
 }
@@ -5902,7 +6126,7 @@
                          Info, Subobject, Init) ||
         !HandleLValueArrayAdjustment(Info, Init, Subobject,
                                      CAT->getElementType(), 1)) {
-      if (!Info.keepEvaluatingAfterFailure())
+      if (!Info.noteFailure())
         return false;
       Success = false;
     }
@@ -5958,34 +6182,8 @@
   if (!Type->isRecordType())
     return Error(E);
 
-  const CXXConstructorDecl *FD = E->getConstructor();
-
-  bool ZeroInit = E->requiresZeroInitialization();
-  if (CheckTrivialDefaultConstructor(Info, E->getExprLoc(), FD, ZeroInit)) {
-    if (HadZeroInit)
-      return true;
-
-    // See RecordExprEvaluator::VisitCXXConstructExpr for explanation.
-    ImplicitValueInitExpr VIE(Type);
-    return EvaluateInPlace(*Value, Info, Subobject, &VIE);
-  }
-
-  const FunctionDecl *Definition = nullptr;
-  FD->getBody(Definition);
-
-  if (!CheckConstexprFunction(Info, E->getExprLoc(), FD, Definition))
-    return false;
-
-  if (ZeroInit && !HadZeroInit) {
-    ImplicitValueInitExpr VIE(Type);
-    if (!EvaluateInPlace(*Value, Info, Subobject, &VIE))
-      return false;
-  }
-
-  auto Args = llvm::makeArrayRef(E->getArgs(), E->getNumArgs());
-  return HandleConstructorCall(E->getExprLoc(), Subobject, Args,
-                               cast<CXXConstructorDecl>(Definition),
-                               Info, *Value);
+  return RecordExprEvaluator(Info, Subobject, *Value)
+             .VisitCXXConstructExpr(E, Type);
 }
 
 //===----------------------------------------------------------------------===//
@@ -6076,7 +6274,7 @@
   }
   bool VisitMemberExpr(const MemberExpr *E) {
     if (CheckReferencedDecl(E, E->getMemberDecl())) {
-      VisitIgnoredValue(E->getBase());
+      VisitIgnoredBaseExpression(E->getBase());
       return true;
     }
 
@@ -6149,7 +6347,7 @@
   if (!Val.isInt()) {
     // FIXME: It would be better to produce the diagnostic for casting
     //        a pointer to an integer.
-    Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
     return false;
   }
   Result = Val.getInt();
@@ -6185,7 +6383,8 @@
 
 /// EvaluateBuiltinClassifyType - Evaluate __builtin_classify_type the same way
 /// as GCC.
-static int EvaluateBuiltinClassifyType(const CallExpr *E) {
+static int EvaluateBuiltinClassifyType(const CallExpr *E,
+                                       const LangOptions &LangOpts) {
   // The following enum mimics the values returned by GCC.
   // FIXME: Does GCC differ between lvalue and rvalue references here?
   enum gcc_type_class {
@@ -6205,37 +6404,123 @@
   if (E->getNumArgs() == 0)
     return no_type_class;
 
-  QualType ArgTy = E->getArg(0)->getType();
-  if (ArgTy->isVoidType())
-    return void_type_class;
-  else if (ArgTy->isEnumeralType())
-    return enumeral_type_class;
-  else if (ArgTy->isBooleanType())
-    return boolean_type_class;
-  else if (ArgTy->isCharType())
-    return string_type_class; // gcc doesn't appear to use char_type_class
-  else if (ArgTy->isIntegerType())
-    return integer_type_class;
-  else if (ArgTy->isPointerType())
+  QualType CanTy = E->getArg(0)->getType().getCanonicalType();
+  const BuiltinType *BT = dyn_cast<BuiltinType>(CanTy);
+
+  switch (CanTy->getTypeClass()) {
+#define TYPE(ID, BASE)
+#define DEPENDENT_TYPE(ID, BASE) case Type::ID:
+#define NON_CANONICAL_TYPE(ID, BASE) case Type::ID:
+#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(ID, BASE) case Type::ID:
+#include "clang/AST/TypeNodes.def"
+      llvm_unreachable("CallExpr::isBuiltinClassifyType(): unimplemented type");
+
+  case Type::Builtin:
+    switch (BT->getKind()) {
+#define BUILTIN_TYPE(ID, SINGLETON_ID)
+#define SIGNED_TYPE(ID, SINGLETON_ID) case BuiltinType::ID: return integer_type_class;
+#define FLOATING_TYPE(ID, SINGLETON_ID) case BuiltinType::ID: return real_type_class;
+#define PLACEHOLDER_TYPE(ID, SINGLETON_ID) case BuiltinType::ID: break;
+#include "clang/AST/BuiltinTypes.def"
+    case BuiltinType::Void:
+      return void_type_class;
+
+    case BuiltinType::Bool:
+      return boolean_type_class;
+
+    case BuiltinType::Char_U: // gcc doesn't appear to use char_type_class
+    case BuiltinType::UChar:
+    case BuiltinType::UShort:
+    case BuiltinType::UInt:
+    case BuiltinType::ULong:
+    case BuiltinType::ULongLong:
+    case BuiltinType::UInt128:
+      return integer_type_class;
+
+    case BuiltinType::NullPtr:
+      return pointer_type_class;
+
+    case BuiltinType::WChar_U:
+    case BuiltinType::Char16:
+    case BuiltinType::Char32:
+    case BuiltinType::ObjCId:
+    case BuiltinType::ObjCClass:
+    case BuiltinType::ObjCSel:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
+    case BuiltinType::OCLSampler:
+    case BuiltinType::OCLEvent:
+    case BuiltinType::OCLClkEvent:
+    case BuiltinType::OCLQueue:
+    case BuiltinType::OCLNDRange:
+    case BuiltinType::OCLReserveID:
+    case BuiltinType::Dependent:
+      llvm_unreachable("CallExpr::isBuiltinClassifyType(): unimplemented type");
+    }
+    // Placeholder kinds break out of the switch above; never fall into Enum.
+    llvm_unreachable("unexpected placeholder type");
+
+  case Type::Enum:
+    return LangOpts.CPlusPlus ? enumeral_type_class : integer_type_class;
+
+  case Type::Pointer:
     return pointer_type_class;
-  else if (ArgTy->isReferenceType())
-    return reference_type_class;
-  else if (ArgTy->isRealType())
-    return real_type_class;
-  else if (ArgTy->isComplexType())
+
+  case Type::MemberPointer:
+    if (CanTy->isMemberDataPointerType())
+      return offset_type_class;
+    else {
+      // We expect member pointers to be either data or function pointers,
+      // nothing else.
+      assert(CanTy->isMemberFunctionPointerType());
+      return method_type_class;
+    }
+
+  case Type::Complex:
     return complex_type_class;
-  else if (ArgTy->isFunctionType())
-    return function_type_class;
-  else if (ArgTy->isStructureOrClassType())
-    return record_type_class;
-  else if (ArgTy->isUnionType())
-    return union_type_class;
-  else if (ArgTy->isArrayType())
-    return array_type_class;
-  else if (ArgTy->isUnionType())
-    return union_type_class;
-  else  // FIXME: offset_type_class, method_type_class, & lang_type_class?
+
+  case Type::FunctionNoProto:
+  case Type::FunctionProto:
+    return LangOpts.CPlusPlus ? function_type_class : pointer_type_class;
+
+  case Type::Record:
+    if (const RecordType *RT = CanTy->getAs<RecordType>()) {
+      switch (RT->getDecl()->getTagKind()) {
+      case TagTypeKind::TTK_Struct:
+      case TagTypeKind::TTK_Class:
+      case TagTypeKind::TTK_Interface:
+        return record_type_class;
+
+      case TagTypeKind::TTK_Enum:
+        return LangOpts.CPlusPlus ? enumeral_type_class : integer_type_class;
+
+      case TagTypeKind::TTK_Union:
+        return union_type_class;
+      }
+    }
     llvm_unreachable("CallExpr::isBuiltinClassifyType(): unimplemented type");
+
+  case Type::ConstantArray:
+  case Type::VariableArray:
+  case Type::IncompleteArray:
+    return LangOpts.CPlusPlus ? array_type_class : pointer_type_class;
+
+  case Type::BlockPointer:
+  case Type::LValueReference:
+  case Type::RValueReference:
+  case Type::Vector:
+  case Type::ExtVector:
+  case Type::Auto:
+  case Type::ObjCObject:
+  case Type::ObjCInterface:
+  case Type::ObjCObjectPointer:
+  case Type::Pipe:
+  case Type::Atomic:
+    llvm_unreachable("CallExpr::isBuiltinClassifyType(): unimplemented type");
+  }
+
+  llvm_unreachable("CallExpr::isBuiltinClassifyType(): unimplemented type");
 }
 
 
 /// EvaluateBuiltinConstantPForLValue - Determine the result of
@@ -6602,6 +6887,8 @@
       // Reduce it to a constant now.
       return Success((Type & 2) ? 0 : -1, E);
     }
+
+    llvm_unreachable("unexpected EvalMode");
   }
 
   case Builtin::BI__builtin_bswap16:
@@ -6615,7 +6902,7 @@
   }
 
   case Builtin::BI__builtin_classify_type:
-    return Success(EvaluateBuiltinClassifyType(E), E);
+    return Success(EvaluateBuiltinClassifyType(E, Info.getLangOpts()), E);
 
   // FIXME: BI__builtin_clrsb
   // FIXME: BI__builtin_clrsbl
@@ -6907,23 +7194,14 @@
     Job() = default;
     Job(Job &&J)
         : E(J.E), LHSResult(J.LHSResult), Kind(J.Kind),
-          StoredInfo(J.StoredInfo), OldEvalStatus(J.OldEvalStatus) {
-      J.StoredInfo = nullptr;
-    }
+          SpecEvalRAII(std::move(J.SpecEvalRAII)) {}
 
     void startSpeculativeEval(EvalInfo &Info) {
-      OldEvalStatus = Info.EvalStatus;
-      Info.EvalStatus.Diag = nullptr;
-      StoredInfo = &Info;
+      SpecEvalRAII = SpeculativeEvaluationRAII(Info);
     }
-    ~Job() {
-      if (StoredInfo) {
-        StoredInfo->EvalStatus = OldEvalStatus;
-      }
-    }
+
   private:
-    EvalInfo *StoredInfo = nullptr; // non-null if status changed.
-    Expr::EvalStatus OldEvalStatus;
+    SpeculativeEvaluationRAII SpecEvalRAII;
   };
 
   SmallVector<Job, 16> Queue;
@@ -6943,7 +7221,9 @@
   static bool shouldEnqueue(const BinaryOperator *E) {
     return E->getOpcode() == BO_Comma ||
            E->isLogicalOp() ||
-           (E->getLHS()->getType()->isIntegralOrEnumerationType() &&
+           (E->isRValue() &&
+            E->getType()->isIntegralOrEnumerationType() &&
+            E->getLHS()->getType()->isIntegralOrEnumerationType() &&
             E->getRHS()->getType()->isIntegralOrEnumerationType());
   }
 
@@ -7025,7 +7305,7 @@
       LHSResult.Failed = true;
 
       // Since we weren't able to evaluate the left hand side, it
-      // must have had side effects.
+      // might have had side effects.
       if (!Info.noteSideEffect())
         return false;
 
@@ -7041,7 +7321,7 @@
   assert(E->getLHS()->getType()->isIntegralOrEnumerationType() &&
          E->getRHS()->getType()->isIntegralOrEnumerationType());
 
-  if (LHSResult.Failed && !Info.keepEvaluatingAfterFailure())
+  if (LHSResult.Failed && !Info.noteFailure())
     return false; // Ignore RHS;
 
   return true;
@@ -7193,10 +7473,34 @@
   llvm_unreachable("Invalid Job::Kind!");
 }
 
+namespace {
+/// Used when we determine that we should fail, but can keep evaluating prior to
+/// noting that we had a failure.
+class DelayedNoteFailureRAII {
+  EvalInfo &Info;
+  bool NoteFailure;
+
+public:
+  DelayedNoteFailureRAII(EvalInfo &Info, bool NoteFailure = true)
+      : Info(Info), NoteFailure(NoteFailure) {}
+  ~DelayedNoteFailureRAII() {
+    if (NoteFailure) {
+      bool ContinueAfterFailure = Info.noteFailure();
+      (void)ContinueAfterFailure;
+      assert(ContinueAfterFailure &&
+             "Shouldn't have kept evaluating on failure.");
+    }
+  }
+};
+}
+
 bool IntExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) {
+  // We don't call noteFailure immediately because the assignment happens after
+  // we evaluate LHS and RHS.
   if (!Info.keepEvaluatingAfterFailure() && E->isAssignmentOp())
     return Error(E);
 
+  DelayedNoteFailureRAII MaybeNoteFailureLater(Info, E->isAssignmentOp());
   if (DataRecursiveIntBinOpEvaluator::shouldEnqueue(E))
     return DataRecursiveIntBinOpEvaluator(*this, Result).Traverse(E);
 
@@ -7219,7 +7523,7 @@
     } else {
       LHSOK = EvaluateComplex(E->getLHS(), LHS, Info);
     }
-    if (!LHSOK && !Info.keepEvaluatingAfterFailure())
+    if (!LHSOK && !Info.noteFailure())
       return false;
 
     if (E->getRHS()->getType()->isRealFloatingType()) {
@@ -7267,7 +7571,7 @@
     APFloat RHS(0.0), LHS(0.0);
 
     bool LHSOK = EvaluateFloat(E->getRHS(), RHS, Info);
-    if (!LHSOK && !Info.keepEvaluatingAfterFailure())
+    if (!LHSOK && !Info.noteFailure())
       return false;
 
     if (!EvaluateFloat(E->getLHS(), LHS, Info) || !LHSOK)
@@ -7301,7 +7605,7 @@
       LValue LHSValue, RHSValue;
 
       bool LHSOK = EvaluatePointer(E->getLHS(), LHSValue, Info);
-      if (!LHSOK && !Info.keepEvaluatingAfterFailure())
+      if (!LHSOK && !Info.noteFailure())
         return false;
 
       if (!EvaluatePointer(E->getRHS(), RHSValue, Info) || !LHSOK)
@@ -7395,7 +7699,7 @@
         // C, array of zero length). Pointer subtraction in such cases has
         // undefined behavior, so is not constant.
         if (ElementSize.isZero()) {
-          Info.Diag(E, diag::note_constexpr_pointer_subtraction_zero_size)
+          Info.FFDiag(E, diag::note_constexpr_pointer_subtraction_zero_size)
             << ElementType;
           return false;
         }
@@ -7518,7 +7822,7 @@
     MemberPtr LHSValue, RHSValue;
 
     bool LHSOK = EvaluateMemberPointer(E->getLHS(), LHSValue, Info);
-    if (!LHSOK && Info.keepEvaluatingAfterFailure())
+    if (!LHSOK && !Info.noteFailure())
       return false;
 
     if (!EvaluateMemberPointer(E->getRHS(), RHSValue, Info) || !LHSOK)
@@ -7767,6 +8071,7 @@
   case CK_ZeroToOCLEvent:
   case CK_NonAtomicToAtomic:
   case CK_AddressSpaceConversion:
+  case CK_IntToOCLSampler:
     llvm_unreachable("invalid cast kind for integral value");
 
   case CK_BitCast:
@@ -8090,7 +8395,7 @@
 
   APFloat RHS(0.0);
   bool LHSOK = EvaluateFloat(E->getLHS(), Result, Info);
-  if (!LHSOK && !Info.keepEvaluatingAfterFailure())
+  if (!LHSOK && !Info.noteFailure())
     return false;
   return EvaluateFloat(E->getRHS(), RHS, Info) && LHSOK &&
          handleFloatFloatBinOp(Info, E, Result, E->getOpcode(), RHS);
@@ -8258,6 +8563,7 @@
   case CK_ZeroToOCLEvent:
   case CK_NonAtomicToAtomic:
   case CK_AddressSpaceConversion:
+  case CK_IntToOCLSampler:
     llvm_unreachable("invalid cast kind for complex value");
 
   case CK_LValueToRValue:
@@ -8367,7 +8673,7 @@
   } else {
     LHSOK = Visit(E->getLHS());
   }
-  if (!LHSOK && !Info.keepEvaluatingAfterFailure())
+  if (!LHSOK && !Info.noteFailure())
     return false;
 
   ComplexValue RHS;
@@ -8768,10 +9074,10 @@
     if (!EvaluateAtomic(E, Result, Info))
       return false;
   } else if (Info.getLangOpts().CPlusPlus11) {
-    Info.Diag(E, diag::note_constexpr_nonliteral) << E->getType();
+    Info.FFDiag(E, diag::note_constexpr_nonliteral) << E->getType();
     return false;
   } else {
-    Info.Diag(E, diag::note_invalid_subexpr_in_const_expr);
+    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
     return false;
   }
 
@@ -8895,6 +9201,20 @@
   return true;
 }
 
+bool Expr::EvaluateAsFloat(APFloat &Result, const ASTContext &Ctx,
+                           SideEffectsKind AllowSideEffects) const {
+  if (!getType()->isRealFloatingType())
+    return false;
+
+  EvalResult ExprResult;
+  if (!EvaluateAsRValue(ExprResult, Ctx) || !ExprResult.Val.isFloat() ||
+      hasUnacceptableSideEffect(ExprResult, AllowSideEffects))
+    return false;
+
+  Result = ExprResult.Val.getFloat();
+  return true;
+}
+
 bool Expr::EvaluateAsLValue(EvalResult &Result, const ASTContext &Ctx) const {
   EvalInfo Info(Ctx, Result, EvalInfo::EM_ConstantFold);
 
@@ -9080,6 +9400,7 @@
   case Expr::TypoExprClass:
   case Expr::DependentScopeDeclRefExprClass:
   case Expr::CXXConstructExprClass:
+  case Expr::CXXInheritedCtorInitExprClass:
   case Expr::CXXStdInitializerListExprClass:
   case Expr::CXXBindTemporaryExprClass:
   case Expr::ExprWithCleanupsClass:
@@ -9099,6 +9420,7 @@
   case Expr::ObjCPropertyRefExprClass:
   case Expr::ObjCSubscriptRefExprClass:
   case Expr::ObjCIsaExprClass:
+  case Expr::ObjCAvailabilityCheckExprClass:
   case Expr::ShuffleVectorExprClass:
   case Expr::ConvertVectorExprClass:
   case Expr::BlockExprClass:
@@ -9543,17 +9865,17 @@
 
   ArrayRef<const Expr*> Args;
 
-  SourceLocation Loc = FD->getLocation();
-
   APValue Scratch;
   if (const CXXConstructorDecl *CD = dyn_cast<CXXConstructorDecl>(FD)) {
     // Evaluate the call as a constant initializer, to allow the construction
     // of objects of non-literal types.
     Info.setEvaluatingDecl(This.getLValueBase(), Scratch);
-    HandleConstructorCall(Loc, This, Args, CD, Info, Scratch);
-  } else
+    HandleConstructorCall(&VIE, This, Args, CD, Info, Scratch);
+  } else {
+    SourceLocation Loc = FD->getLocation();
     HandleFunctionCall(Loc, FD, (MD && MD->isInstance()) ? &This : nullptr,
                        Args, FD->getBody(), Info, Scratch, nullptr);
+  }
 
   return Diags.empty();
 }
diff --git a/lib/AST/ItaniumMangle.cpp b/lib/AST/ItaniumMangle.cpp
index b8022a4..67d217e 100644
--- a/lib/AST/ItaniumMangle.cpp
+++ b/lib/AST/ItaniumMangle.cpp
@@ -20,6 +20,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
@@ -66,8 +67,9 @@
   }
   
   const DeclContext *DC = D->getDeclContext();
-  if (const CapturedDecl *CD = dyn_cast<CapturedDecl>(DC))
-    return getEffectiveDeclContext(CD);
+  if (isa<CapturedDecl>(DC) || isa<OMPDeclareReductionDecl>(DC)) {
+    return getEffectiveDeclContext(cast<Decl>(DC));
+  }
 
   if (const auto *VD = dyn_cast<VarDecl>(D))
     if (VD->isExternC())
@@ -77,7 +79,7 @@
     if (FD->isExternC())
       return FD->getASTContext().getTranslationUnitDecl();
 
-  return DC;
+  return DC->getRedeclContext();
 }
 
 static const DeclContext *getEffectiveParentContext(const DeclContext *DC) {
@@ -212,6 +214,12 @@
 class CXXNameMangler {
   ItaniumMangleContextImpl &Context;
   raw_ostream &Out;
+  bool NullOut = false;
+  /// In the "DisableDerivedAbiTags" mode, derived ABI tags are not calculated.
+  /// This mode is used when the mangler creates another mangler recursively to
+  /// calculate ABI tags for the function return value or the variable type.
+  /// It is also required to avoid infinite recursion in some cases.
+  bool DisableDerivedAbiTags = false;
 
   /// The "structor" is the top-level declaration being mangled, if
   /// that's not a template specialization; otherwise it's the pattern
@@ -261,15 +269,126 @@
 
   } FunctionTypeDepth;
 
+  // abi_tag is a gcc attribute, taking one or more strings called "tags".
+  // The goal is to annotate against which version of a library an object was
+  // built and to be able to provide backwards compatibility ("dual abi").
+  // For more information see docs/ItaniumMangleAbiTags.rst.
+  typedef SmallVector<StringRef, 4> AbiTagList;
+
+  // State to gather all implicit and explicit tags used in a mangled name.
+  // Must always have an instance of this while emitting any name to keep
+  // track.
+  class AbiTagState final {
+  public:
+    explicit AbiTagState(AbiTagState *&Head) : LinkHead(Head) {
+      Parent = LinkHead;
+      LinkHead = this;
+    }
+
+    // No copy, no move.
+    AbiTagState(const AbiTagState &) = delete;
+    AbiTagState &operator=(const AbiTagState &) = delete;
+
+    ~AbiTagState() { pop(); }
+
+    void write(raw_ostream &Out, const NamedDecl *ND,
+               const AbiTagList *AdditionalAbiTags) {
+      ND = cast<NamedDecl>(ND->getCanonicalDecl());
+      if (!isa<FunctionDecl>(ND) && !isa<VarDecl>(ND)) {
+        assert(
+            !AdditionalAbiTags &&
+            "only function and variables need a list of additional abi tags");
+        if (const auto *NS = dyn_cast<NamespaceDecl>(ND)) {
+          if (const auto *AbiTag = NS->getAttr<AbiTagAttr>()) {
+            UsedAbiTags.insert(UsedAbiTags.end(), AbiTag->tags().begin(),
+                               AbiTag->tags().end());
+          }
+          // Don't emit abi tags for namespaces.
+          return;
+        }
+      }
+
+      AbiTagList TagList;
+      if (const auto *AbiTag = ND->getAttr<AbiTagAttr>()) {
+        UsedAbiTags.insert(UsedAbiTags.end(), AbiTag->tags().begin(),
+                           AbiTag->tags().end());
+        TagList.insert(TagList.end(), AbiTag->tags().begin(),
+                       AbiTag->tags().end());
+      }
+
+      if (AdditionalAbiTags) {
+        UsedAbiTags.insert(UsedAbiTags.end(), AdditionalAbiTags->begin(),
+                           AdditionalAbiTags->end());
+        TagList.insert(TagList.end(), AdditionalAbiTags->begin(),
+                       AdditionalAbiTags->end());
+      }
+
+      std::sort(TagList.begin(), TagList.end());
+      TagList.erase(std::unique(TagList.begin(), TagList.end()), TagList.end());
+
+      writeSortedUniqueAbiTags(Out, TagList);
+    }
+
+    const AbiTagList &getUsedAbiTags() const { return UsedAbiTags; }
+    void setUsedAbiTags(const AbiTagList &AbiTags) {
+      UsedAbiTags = AbiTags;
+    }
+
+    const AbiTagList &getEmittedAbiTags() const {
+      return EmittedAbiTags;
+    }
+
+    const AbiTagList &getSortedUniqueUsedAbiTags() {
+      std::sort(UsedAbiTags.begin(), UsedAbiTags.end());
+      UsedAbiTags.erase(std::unique(UsedAbiTags.begin(), UsedAbiTags.end()),
+                        UsedAbiTags.end());
+      return UsedAbiTags;
+    }
+
+  private:
+    //! All abi tags used implicitly or explicitly.
+    AbiTagList UsedAbiTags;
+    //! All explicit abi tags (i.e. not from namespace).
+    AbiTagList EmittedAbiTags;
+
+    AbiTagState *&LinkHead;
+    AbiTagState *Parent = nullptr;
+
+    void pop() {
+      assert(LinkHead == this &&
+             "abi tag link head must point to us on destruction");
+      if (Parent) {
+        Parent->UsedAbiTags.insert(Parent->UsedAbiTags.end(),
+                                   UsedAbiTags.begin(), UsedAbiTags.end());
+        Parent->EmittedAbiTags.insert(Parent->EmittedAbiTags.end(),
+                                      EmittedAbiTags.begin(),
+                                      EmittedAbiTags.end());
+      }
+      LinkHead = Parent;
+    }
+
+    void writeSortedUniqueAbiTags(raw_ostream &Out, const AbiTagList &AbiTags) {
+      for (const auto &Tag : AbiTags) {
+        EmittedAbiTags.push_back(Tag);
+        Out << "B";
+        Out << Tag.size();
+        Out << Tag;
+      }
+    }
+  };
+
+  AbiTagState *AbiTags = nullptr;
+  AbiTagState AbiTagsRoot;
+
   llvm::DenseMap<uintptr_t, unsigned> Substitutions;
 
   ASTContext &getASTContext() const { return Context.getASTContext(); }
 
 public:
   CXXNameMangler(ItaniumMangleContextImpl &C, raw_ostream &Out_,
-                 const NamedDecl *D = nullptr)
-    : Context(C), Out(Out_), Structor(getStructor(D)), StructorType(0),
-      SeqID(0) {
+                 const NamedDecl *D = nullptr, bool NullOut_ = false)
+    : Context(C), Out(Out_), NullOut(NullOut_),  Structor(getStructor(D)),
+      StructorType(0), SeqID(0), AbiTagsRoot(AbiTags) {
     // These can't be mangled without a ctor type or dtor type.
     assert(!D || (!isa<CXXDestructorDecl>(D) &&
                   !isa<CXXConstructorDecl>(D)));
@@ -277,11 +396,21 @@
   CXXNameMangler(ItaniumMangleContextImpl &C, raw_ostream &Out_,
                  const CXXConstructorDecl *D, CXXCtorType Type)
     : Context(C), Out(Out_), Structor(getStructor(D)), StructorType(Type),
-      SeqID(0) { }
+      SeqID(0), AbiTagsRoot(AbiTags) { }
   CXXNameMangler(ItaniumMangleContextImpl &C, raw_ostream &Out_,
                  const CXXDestructorDecl *D, CXXDtorType Type)
     : Context(C), Out(Out_), Structor(getStructor(D)), StructorType(Type),
-      SeqID(0) { }
+      SeqID(0), AbiTagsRoot(AbiTags) { }
+
+  CXXNameMangler(CXXNameMangler &Outer, raw_ostream &Out_)
+      : Context(Outer.Context), Out(Out_), NullOut(false),
+        Structor(Outer.Structor), StructorType(Outer.StructorType),
+        SeqID(Outer.SeqID), AbiTagsRoot(AbiTags) {}
+
+  CXXNameMangler(CXXNameMangler &Outer, llvm::raw_null_ostream &Out_)
+      : Context(Outer.Context), Out(Out_), NullOut(true),
+        Structor(Outer.Structor), StructorType(Outer.StructorType),
+        SeqID(Outer.SeqID), AbiTagsRoot(AbiTags) {}
 
 #if MANGLE_CHECKER
   ~CXXNameMangler() {
@@ -296,6 +425,9 @@
 #endif
   raw_ostream &getStream() { return Out; }
 
+  void disableDerivedAbiTags() { DisableDerivedAbiTags = true; }
+  static bool shouldHaveAbiTags(ItaniumMangleContextImpl &C, const VarDecl *VD);
+
   void mangle(const NamedDecl *D);
   void mangleCallOffset(int64_t NonVirtual, int64_t Virtual);
   void mangleNumber(const llvm::APSInt &I);
@@ -314,7 +446,6 @@
   bool mangleSubstitution(TemplateName Template);
   bool mangleSubstitution(uintptr_t Ptr);
 
-  void mangleExistingSubstitution(QualType type);
   void mangleExistingSubstitution(TemplateName name);
 
   bool mangleStandardSubstitution(const NamedDecl *ND);
@@ -334,23 +465,37 @@
                             DeclarationName name,
                             unsigned KnownArity = UnknownArity);
 
-  void mangleName(const TemplateDecl *TD,
-                  const TemplateArgument *TemplateArgs,
-                  unsigned NumTemplateArgs);
-  void mangleUnqualifiedName(const NamedDecl *ND) {
-    mangleUnqualifiedName(ND, ND->getDeclName(), UnknownArity);
+  void mangleFunctionEncodingBareType(const FunctionDecl *FD);
+
+  void mangleNameWithAbiTags(const NamedDecl *ND,
+                             const AbiTagList *AdditionalAbiTags);
+  void mangleTemplateName(const TemplateDecl *TD,
+                          const TemplateArgument *TemplateArgs,
+                          unsigned NumTemplateArgs);
+  void mangleUnqualifiedName(const NamedDecl *ND,
+                             const AbiTagList *AdditionalAbiTags) {
+    mangleUnqualifiedName(ND, ND->getDeclName(), UnknownArity,
+                          AdditionalAbiTags);
   }
   void mangleUnqualifiedName(const NamedDecl *ND, DeclarationName Name,
-                             unsigned KnownArity);
-  void mangleUnscopedName(const NamedDecl *ND);
-  void mangleUnscopedTemplateName(const TemplateDecl *ND);
-  void mangleUnscopedTemplateName(TemplateName);
+                             unsigned KnownArity,
+                             const AbiTagList *AdditionalAbiTags);
+  void mangleUnscopedName(const NamedDecl *ND,
+                          const AbiTagList *AdditionalAbiTags);
+  void mangleUnscopedTemplateName(const TemplateDecl *ND,
+                                  const AbiTagList *AdditionalAbiTags);
+  void mangleUnscopedTemplateName(TemplateName,
+                                  const AbiTagList *AdditionalAbiTags);
   void mangleSourceName(const IdentifierInfo *II);
-  void mangleLocalName(const Decl *D);
+  void mangleSourceNameWithAbiTags(
+      const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr);
+  void mangleLocalName(const Decl *D,
+                       const AbiTagList *AdditionalAbiTags);
   void mangleBlockForPrefix(const BlockDecl *Block);
   void mangleUnqualifiedBlock(const BlockDecl *Block);
   void mangleLambda(const CXXRecordDecl *Lambda);
   void mangleNestedName(const NamedDecl *ND, const DeclContext *DC,
+                        const AbiTagList *AdditionalAbiTags,
                         bool NoFunction=false);
   void mangleNestedName(const TemplateDecl *TD,
                         const TemplateArgument *TemplateArgs,
@@ -396,7 +541,7 @@
   void mangleCastExpression(const Expr *E, StringRef CastEncoding);
   void mangleInitListElements(const InitListExpr *InitList);
   void mangleExpression(const Expr *E, unsigned Arity = UnknownArity);
-  void mangleCXXCtorType(CXXCtorType T);
+  void mangleCXXCtorType(CXXCtorType T, const CXXRecordDecl *InheritedFrom);
   void mangleCXXDtorType(CXXDtorType T);
 
   void mangleTemplateArgs(const TemplateArgumentLoc *TemplateArgs,
@@ -409,6 +554,14 @@
   void mangleTemplateParameter(unsigned Index);
 
   void mangleFunctionParam(const ParmVarDecl *parm);
+
+  void writeAbiTags(const NamedDecl *ND,
+                    const AbiTagList *AdditionalAbiTags);
+
+  // Returns sorted unique list of ABI tags.
+  AbiTagList makeFunctionReturnTypeTags(const FunctionDecl *FD);
+  // Returns sorted unique list of ABI tags.
+  AbiTagList makeVariableTypeTags(const VarDecl *VD);
 };
 
 }
@@ -440,7 +593,7 @@
     return false;
 
   const VarDecl *VD = dyn_cast<VarDecl>(D);
-  if (VD) {
+  if (VD && !isa<DecompositionDecl>(D)) {
     // C variables are not mangled.
     if (VD->isExternC())
       return false;
@@ -452,6 +605,7 @@
       while (!DC->isNamespace() && !DC->isTranslationUnit())
         DC = getEffectiveParentContext(DC);
     if (DC->isTranslationUnit() && D->getFormalLinkage() != InternalLinkage &&
+        !CXXNameMangler::shouldHaveAbiTags(*this, VD) &&
         !isa<VarTemplateSpecializationDecl>(D))
       return false;
   }
@@ -459,6 +613,18 @@
   return true;
 }
 
+void CXXNameMangler::writeAbiTags(const NamedDecl *ND,
+                                  const AbiTagList *AdditionalAbiTags) {
+  assert(AbiTags && "require AbiTagState");
+  AbiTags->write(Out, ND, DisableDerivedAbiTags ? nullptr : AdditionalAbiTags);
+}
+
+void CXXNameMangler::mangleSourceNameWithAbiTags(
+    const NamedDecl *ND, const AbiTagList *AdditionalAbiTags) {
+  mangleSourceName(ND->getIdentifier());
+  writeAbiTags(ND, AdditionalAbiTags);
+}
+
 void CXXNameMangler::mangle(const NamedDecl *D) {
   // <mangled-name> ::= _Z <encoding>
   //            ::= <data name>
@@ -476,12 +642,52 @@
 
 void CXXNameMangler::mangleFunctionEncoding(const FunctionDecl *FD) {
   // <encoding> ::= <function name> <bare-function-type>
-  mangleName(FD);
 
   // Don't mangle in the type if this isn't a decl we should typically mangle.
-  if (!Context.shouldMangleDeclName(FD))
+  if (!Context.shouldMangleDeclName(FD)) {
+    mangleName(FD);
     return;
+  }
 
+  AbiTagList ReturnTypeAbiTags = makeFunctionReturnTypeTags(FD);
+  if (ReturnTypeAbiTags.empty()) {
+    // There are no tags for return type, the simplest case.
+    mangleName(FD);
+    mangleFunctionEncodingBareType(FD);
+    return;
+  }
+
+  // Mangle function name and encoding to temporary buffer.
+  // We have to output name and encoding to the same mangler to get the same
+  // substitution as it will be in final mangling.
+  SmallString<256> FunctionEncodingBuf;
+  llvm::raw_svector_ostream FunctionEncodingStream(FunctionEncodingBuf);
+  CXXNameMangler FunctionEncodingMangler(*this, FunctionEncodingStream);
+  // Output name of the function.
+  FunctionEncodingMangler.disableDerivedAbiTags();
+  FunctionEncodingMangler.mangleNameWithAbiTags(FD, nullptr);
+
+  // Remember length of the function name in the buffer.
+  size_t EncodingPositionStart = FunctionEncodingStream.str().size();
+  FunctionEncodingMangler.mangleFunctionEncodingBareType(FD);
+
+  // Get tags from return type that are not present in function name or
+  // encoding.
+  const AbiTagList &UsedAbiTags =
+      FunctionEncodingMangler.AbiTagsRoot.getSortedUniqueUsedAbiTags();
+  AbiTagList AdditionalAbiTags(ReturnTypeAbiTags.size());
+  AdditionalAbiTags.erase(
+      std::set_difference(ReturnTypeAbiTags.begin(), ReturnTypeAbiTags.end(),
+                          UsedAbiTags.begin(), UsedAbiTags.end(),
+                          AdditionalAbiTags.begin()),
+      AdditionalAbiTags.end());
+
+  // Output name with implicit tags and function encoding from temporary buffer.
+  mangleNameWithAbiTags(FD, &AdditionalAbiTags);
+  Out << FunctionEncodingStream.str().substr(EncodingPositionStart);
+}
+
+void CXXNameMangler::mangleFunctionEncodingBareType(const FunctionDecl *FD) {
   if (FD->hasAttr<EnableIfAttr>()) {
     FunctionTypeDepthState Saved = FunctionTypeDepth.push();
     Out << "Ua9enable_ifI";
@@ -501,6 +707,12 @@
     FunctionTypeDepth.pop(Saved);
   }
 
+  // When mangling an inheriting constructor, the bare function type used is
+  // that of the inherited constructor.
+  if (auto *CD = dyn_cast<CXXConstructorDecl>(FD))
+    if (auto Inherited = CD->getInheritedConstructor())
+      FD = Inherited.getConstructor();
+
   // Whether the mangling of a function type includes the return type depends on
   // the context and the nature of the function. The rules for deciding whether
   // the return type is included are:
@@ -561,7 +773,7 @@
 static const TemplateDecl *
 isTemplate(const NamedDecl *ND, const TemplateArgumentList *&TemplateArgs) {
   // Check if we have a function template.
-  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)){
+  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
     if (const TemplateDecl *TD = FD->getPrimaryTemplate()) {
       TemplateArgs = FD->getTemplateSpecializationArgs();
       return TD;
@@ -586,6 +798,40 @@
 }
 
 void CXXNameMangler::mangleName(const NamedDecl *ND) {
+  if (const VarDecl *VD = dyn_cast<VarDecl>(ND)) {
+    // A variable should have implicit tags from its type.
+    AbiTagList VariableTypeAbiTags = makeVariableTypeTags(VD);
+    if (VariableTypeAbiTags.empty()) {
+      // Simple case: the variable's type carries no tags.
+      mangleNameWithAbiTags(VD, nullptr);
+      return;
+    }
+
+    // Mangle variable name to null stream to collect tags.
+    llvm::raw_null_ostream NullOutStream;
+    CXXNameMangler VariableNameMangler(*this, NullOutStream);
+    VariableNameMangler.disableDerivedAbiTags();
+    VariableNameMangler.mangleNameWithAbiTags(VD, nullptr);
+
+    // Get tags from variable type that are not present in its name.
+    const AbiTagList &UsedAbiTags =
+        VariableNameMangler.AbiTagsRoot.getSortedUniqueUsedAbiTags();
+    AbiTagList AdditionalAbiTags(VariableTypeAbiTags.size());
+    AdditionalAbiTags.erase(
+        std::set_difference(VariableTypeAbiTags.begin(),
+                            VariableTypeAbiTags.end(), UsedAbiTags.begin(),
+                            UsedAbiTags.end(), AdditionalAbiTags.begin()),
+        AdditionalAbiTags.end());
+
+    // Output name with implicit tags.
+    mangleNameWithAbiTags(VD, &AdditionalAbiTags);
+  } else {
+    mangleNameWithAbiTags(ND, nullptr);
+  }
+}
+
+void CXXNameMangler::mangleNameWithAbiTags(const NamedDecl *ND,
+                                           const AbiTagList *AdditionalAbiTags) {
   //  <name> ::= <nested-name>
   //         ::= <unscoped-name>
   //         ::= <unscoped-template-name> <template-args>
@@ -601,7 +847,7 @@
     while (!DC->isNamespace() && !DC->isTranslationUnit())
       DC = getEffectiveParentContext(DC);
   else if (GetLocalClassDecl(ND)) {
-    mangleLocalName(ND);
+    mangleLocalName(ND, AdditionalAbiTags);
     return;
   }
 
@@ -611,76 +857,88 @@
     // Check if we have a template.
     const TemplateArgumentList *TemplateArgs = nullptr;
     if (const TemplateDecl *TD = isTemplate(ND, TemplateArgs)) {
-      mangleUnscopedTemplateName(TD);
+      mangleUnscopedTemplateName(TD, AdditionalAbiTags);
       mangleTemplateArgs(*TemplateArgs);
       return;
     }
 
-    mangleUnscopedName(ND);
+    mangleUnscopedName(ND, AdditionalAbiTags);
     return;
   }
 
   if (isLocalContainerContext(DC)) {
-    mangleLocalName(ND);
+    mangleLocalName(ND, AdditionalAbiTags);
     return;
   }
 
-  mangleNestedName(ND, DC);
+  mangleNestedName(ND, DC, AdditionalAbiTags);
 }
-void CXXNameMangler::mangleName(const TemplateDecl *TD,
-                                const TemplateArgument *TemplateArgs,
-                                unsigned NumTemplateArgs) {
+
+void CXXNameMangler::mangleTemplateName(const TemplateDecl *TD,
+                                        const TemplateArgument *TemplateArgs,
+                                        unsigned NumTemplateArgs) {
   const DeclContext *DC = IgnoreLinkageSpecDecls(getEffectiveDeclContext(TD));
 
   if (DC->isTranslationUnit() || isStdNamespace(DC)) {
-    mangleUnscopedTemplateName(TD);
+    mangleUnscopedTemplateName(TD, nullptr);
     mangleTemplateArgs(TemplateArgs, NumTemplateArgs);
   } else {
     mangleNestedName(TD, TemplateArgs, NumTemplateArgs);
   }
 }
 
-void CXXNameMangler::mangleUnscopedName(const NamedDecl *ND) {
+void CXXNameMangler::mangleUnscopedName(const NamedDecl *ND,
+                                        const AbiTagList *AdditionalAbiTags) {
   //  <unscoped-name> ::= <unqualified-name>
   //                  ::= St <unqualified-name>   # ::std::
 
   if (isStdNamespace(IgnoreLinkageSpecDecls(getEffectiveDeclContext(ND))))
     Out << "St";
 
-  mangleUnqualifiedName(ND);
+  mangleUnqualifiedName(ND, AdditionalAbiTags);
 }
 
-void CXXNameMangler::mangleUnscopedTemplateName(const TemplateDecl *ND) {
+void CXXNameMangler::mangleUnscopedTemplateName(
+    const TemplateDecl *ND, const AbiTagList *AdditionalAbiTags) {
   //     <unscoped-template-name> ::= <unscoped-name>
   //                              ::= <substitution>
   if (mangleSubstitution(ND))
     return;
 
   // <template-template-param> ::= <template-param>
-  if (const auto *TTP = dyn_cast<TemplateTemplateParmDecl>(ND))
+  if (const auto *TTP = dyn_cast<TemplateTemplateParmDecl>(ND)) {
+    assert(!AdditionalAbiTags &&
+           "template template param cannot have abi tags");
     mangleTemplateParameter(TTP->getIndex());
-  else
-    mangleUnscopedName(ND->getTemplatedDecl());
+  } else if (isa<BuiltinTemplateDecl>(ND)) {
+    mangleUnscopedName(ND, AdditionalAbiTags);
+  } else {
+    mangleUnscopedName(ND->getTemplatedDecl(), AdditionalAbiTags);
+  }
 
   addSubstitution(ND);
 }
 
-void CXXNameMangler::mangleUnscopedTemplateName(TemplateName Template) {
+void CXXNameMangler::mangleUnscopedTemplateName(
+    TemplateName Template, const AbiTagList *AdditionalAbiTags) {
   //     <unscoped-template-name> ::= <unscoped-name>
   //                              ::= <substitution>
   if (TemplateDecl *TD = Template.getAsTemplateDecl())
-    return mangleUnscopedTemplateName(TD);
+    return mangleUnscopedTemplateName(TD, AdditionalAbiTags);
   
   if (mangleSubstitution(Template))
     return;
 
+  assert(!AdditionalAbiTags &&
+         "dependent template name cannot have abi tags");
+
   DependentTemplateName *Dependent = Template.getAsDependentTemplateName();
   assert(Dependent && "Not a dependent template name?");
   if (const IdentifierInfo *Id = Dependent->getIdentifier())
     mangleSourceName(Id);
   else
     mangleOperatorName(Dependent->getOperator(), UnknownArity);
-  
+
   addSubstitution(Template);
 }
 
@@ -838,7 +1096,7 @@
                              /*recursive*/ true);
     else
       Out << "sr";
-    mangleSourceName(qualifier->getAsNamespace()->getIdentifier());
+    mangleSourceNameWithAbiTags(qualifier->getAsNamespace());
     break;
   case NestedNameSpecifier::NamespaceAlias:
     if (qualifier->getPrefix())
@@ -846,7 +1104,7 @@
                              /*recursive*/ true);
     else
       Out << "sr";
-    mangleSourceName(qualifier->getAsNamespaceAlias()->getIdentifier());
+    mangleSourceNameWithAbiTags(qualifier->getAsNamespaceAlias());
     break;
 
   case NestedNameSpecifier::TypeSpec:
@@ -881,6 +1139,7 @@
       Out << "sr";
 
     mangleSourceName(qualifier->getAsIdentifier());
+    // An Identifier has no type information, so we can't emit abi tags for it.
     break;
   }
 
@@ -926,14 +1185,34 @@
 
 void CXXNameMangler::mangleUnqualifiedName(const NamedDecl *ND,
                                            DeclarationName Name,
-                                           unsigned KnownArity) {
+                                           unsigned KnownArity,
+                                           const AbiTagList *AdditionalAbiTags) {
   unsigned Arity = KnownArity;
   //  <unqualified-name> ::= <operator-name>
   //                     ::= <ctor-dtor-name>
   //                     ::= <source-name>
   switch (Name.getNameKind()) {
   case DeclarationName::Identifier: {
-    if (const IdentifierInfo *II = Name.getAsIdentifierInfo()) {
+    const IdentifierInfo *II = Name.getAsIdentifierInfo();
+
+    // We mangle decomposition declarations as the names of their bindings.
+    if (auto *DD = dyn_cast<DecompositionDecl>(ND)) {
+      // FIXME: Non-standard mangling for decomposition declarations:
+      //
+      //  <unqualified-name> ::= DC <source-name>* E
+      //
+      // These can never be referenced across translation units, so we do
+      // not need a cross-vendor mangling for anything other than demanglers.
+      // Proposed on cxx-abi-dev on 2016-08-12
+      Out << "DC";
+      for (auto *BD : DD->bindings())
+        mangleSourceName(BD->getDeclName().getAsIdentifierInfo());
+      Out << 'E';
+      writeAbiTags(ND, AdditionalAbiTags);
+      break;
+    }
+
+    if (II) {
       // We must avoid conflicts between internally- and externally-
       // linked variable and function declaration names in the same TU:
       //   void test() { extern void foo(); }
@@ -945,6 +1224,7 @@
         Out << 'L';
 
       mangleSourceName(II);
+      writeAbiTags(ND, AdditionalAbiTags);
       break;
     }
 
@@ -984,6 +1264,7 @@
       assert(FD->getIdentifier() && "Data member name isn't an identifier!");
 
       mangleSourceName(FD->getIdentifier());
+      // Not emitting abi tags: internal name anyway.
       break;
     }
 
@@ -1004,6 +1285,10 @@
       assert(D->getDeclName().getAsIdentifierInfo() &&
              "Typedef was not named!");
       mangleSourceName(D->getDeclName().getAsIdentifierInfo());
+      assert(!AdditionalAbiTags && "Type cannot have additional abi tags");
+      // Explicit abi tags are still possible; take from underlying type, not
+      // from typedef.
+      writeAbiTags(TD, nullptr);
       break;
     }
 
@@ -1013,6 +1298,8 @@
     // <lambda-sig> ::= <parameter-type>+   # Parameter types or 'v' for 'void'.
     if (const CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(TD)) {
       if (Record->isLambda() && Record->getLambdaManglingNumber()) {
+        assert(!AdditionalAbiTags &&
+               "Lambda type cannot have additional abi tags");
         mangleLambda(Record);
         break;
       }
@@ -1024,11 +1311,13 @@
       if (UnnamedMangle > 1)
         Out << UnnamedMangle - 2;
       Out << '_';
+      writeAbiTags(TD, AdditionalAbiTags);
       break;
     }
 
-    // Get a unique id for the anonymous struct.
-    unsigned AnonStructId = Context.getAnonymousStructId(TD);
+    // Get a unique id for the anonymous struct. If this is not real output
+    // (NullOut is set), the ID doesn't matter, so use a fake one.
+    unsigned AnonStructId = NullOut ? 0 : Context.getAnonymousStructId(TD);
 
     // Mangle it as a source name in the form
     // [n] $_<id>
@@ -1047,16 +1336,33 @@
   case DeclarationName::ObjCMultiArgSelector:
     llvm_unreachable("Can't mangle Objective-C selector names here!");
 
-  case DeclarationName::CXXConstructorName:
+  case DeclarationName::CXXConstructorName: {
+    const CXXRecordDecl *InheritedFrom = nullptr;
+    const TemplateArgumentList *InheritedTemplateArgs = nullptr;
+    if (auto Inherited =
+            cast<CXXConstructorDecl>(ND)->getInheritedConstructor()) {
+      InheritedFrom = Inherited.getConstructor()->getParent();
+      InheritedTemplateArgs =
+          Inherited.getConstructor()->getTemplateSpecializationArgs();
+    }
+
     if (ND == Structor)
       // If the named decl is the C++ constructor we're mangling, use the type
       // we were given.
-      mangleCXXCtorType(static_cast<CXXCtorType>(StructorType));
+      mangleCXXCtorType(static_cast<CXXCtorType>(StructorType), InheritedFrom);
     else
       // Otherwise, use the complete constructor name. This is relevant if a
       // class with a constructor is declared within a constructor.
-      mangleCXXCtorType(Ctor_Complete);
+      mangleCXXCtorType(Ctor_Complete, InheritedFrom);
+
+    // FIXME: The template arguments are part of the enclosing prefix or
+    // nested-name, but it's more convenient to mangle them here.
+    if (InheritedTemplateArgs)
+      mangleTemplateArgs(*InheritedTemplateArgs);
+
+    writeAbiTags(ND, AdditionalAbiTags);
     break;
+  }
 
   case DeclarationName::CXXDestructorName:
     if (ND == Structor)
@@ -1067,6 +1373,7 @@
       // Otherwise, use the complete destructor name. This is relevant if a
       // class with a destructor is declared within a destructor.
       mangleCXXDtorType(Dtor_Complete);
+    writeAbiTags(ND, AdditionalAbiTags);
     break;
 
   case DeclarationName::CXXOperatorName:
@@ -1082,6 +1389,7 @@
   case DeclarationName::CXXConversionFunctionName:
   case DeclarationName::CXXLiteralOperatorName:
     mangleOperatorName(Name, Arity);
+    writeAbiTags(ND, AdditionalAbiTags);
     break;
 
   case DeclarationName::CXXUsingDirective:
@@ -1098,6 +1406,7 @@
 
 void CXXNameMangler::mangleNestedName(const NamedDecl *ND,
                                       const DeclContext *DC,
+                                      const AbiTagList *AdditionalAbiTags,
                                       bool NoFunction) {
   // <nested-name> 
   //   ::= N [<CV-qualifiers>] [<ref-qualifier>] <prefix> <unqualified-name> E
@@ -1123,7 +1432,7 @@
   }
   else {
     manglePrefix(DC, NoFunction);
-    mangleUnqualifiedName(ND);
+    mangleUnqualifiedName(ND, AdditionalAbiTags);
   }
 
   Out << 'E';
@@ -1141,7 +1450,8 @@
   Out << 'E';
 }
 
-void CXXNameMangler::mangleLocalName(const Decl *D) {
+void CXXNameMangler::mangleLocalName(const Decl *D,
+                                     const AbiTagList *AdditionalAbiTags) {
   // <local-name> := Z <function encoding> E <entity name> [<discriminator>]
   //              := Z <function encoding> E s [<discriminator>]
   // <local-name> := Z <function encoding> E d [ <parameter number> ] 
@@ -1153,15 +1463,26 @@
 
   Out << 'Z';
 
-  if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(DC))
-    mangleObjCMethodName(MD);
-  else if (const BlockDecl *BD = dyn_cast<BlockDecl>(DC))
-    mangleBlockForPrefix(BD);
-  else
-    mangleFunctionEncoding(cast<FunctionDecl>(DC));
+  {
+    AbiTagState LocalAbiTags(AbiTags);
+
+    if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(DC))
+      mangleObjCMethodName(MD);
+    else if (const BlockDecl *BD = dyn_cast<BlockDecl>(DC))
+      mangleBlockForPrefix(BD);
+    else
+      mangleFunctionEncoding(cast<FunctionDecl>(DC));
+
+    // Implicit ABI tags (from namespace) are not available in the following
+    // entity; reset to actually emitted tags, which are available.
+    LocalAbiTags.setUsedAbiTags(LocalAbiTags.getEmittedAbiTags());
+  }
 
   Out << 'E';
 
+  // GCC 5.3.0 doesn't emit derived ABI tags for local names but that seems to
+  // be a bug that is fixed in trunk.
+
   if (RD) {
     // The parameter number is omitted for the last parameter, 0 for the 
     // second-to-last parameter, 1 for the third-to-last parameter, etc. The 
@@ -1169,7 +1490,7 @@
     // numbering will be local to the particular argument in which it appears
     // -- other default arguments do not affect its encoding.
     const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD);
-    if (CXXRD->isLambda()) {
+    if (CXXRD && CXXRD->isLambda()) {
       if (const ParmVarDecl *Parm
               = dyn_cast_or_null<ParmVarDecl>(CXXRD->getLambdaContextDecl())) {
         if (const FunctionDecl *Func
@@ -1186,13 +1507,15 @@
     // Mangle the name relative to the closest enclosing function.
     // equality ok because RD derived from ND above
     if (D == RD)  {
-      mangleUnqualifiedName(RD);
+      mangleUnqualifiedName(RD, AdditionalAbiTags);
     } else if (const BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
       manglePrefix(getEffectiveDeclContext(BD), true /*NoFunction*/);
+      assert(!AdditionalAbiTags && "Block cannot have additional abi tags");
       mangleUnqualifiedBlock(BD);
     } else {
       const NamedDecl *ND = cast<NamedDecl>(D);
-      mangleNestedName(ND, getEffectiveDeclContext(ND), true /*NoFunction*/);
+      mangleNestedName(ND, getEffectiveDeclContext(ND), AdditionalAbiTags,
+                       true /*NoFunction*/);
     }
   } else if (const BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
     // Mangle a block in a default parameter; see above explanation for
@@ -1209,9 +1532,10 @@
       }
     }
 
+    assert(!AdditionalAbiTags && "Block cannot have additional abi tags");
     mangleUnqualifiedBlock(BD);
   } else {
-    mangleUnqualifiedName(cast<NamedDecl>(D));
+    mangleUnqualifiedName(cast<NamedDecl>(D), AdditionalAbiTags);
   }
 
   if (const NamedDecl *ND = dyn_cast<NamedDecl>(RD ? RD : D)) {
@@ -1227,12 +1551,12 @@
 
 void CXXNameMangler::mangleBlockForPrefix(const BlockDecl *Block) {
   if (GetLocalClassDecl(Block)) {
-    mangleLocalName(Block);
+    mangleLocalName(Block, /* AdditionalAbiTags */ nullptr);
     return;
   }
   const DeclContext *DC = getEffectiveDeclContext(Block);
   if (isLocalContainerContext(DC)) {
-    mangleLocalName(Block);
+    mangleLocalName(Block, /* AdditionalAbiTags */ nullptr);
     return;
   }
   manglePrefix(getEffectiveDeclContext(Block));
@@ -1243,10 +1567,10 @@
   if (Decl *Context = Block->getBlockManglingContextDecl()) {
     if ((isa<VarDecl>(Context) || isa<FieldDecl>(Context)) &&
         Context->getDeclContext()->isRecord()) {
-      if (const IdentifierInfo *Name
-            = cast<NamedDecl>(Context)->getIdentifier()) {
-        mangleSourceName(Name);
-        Out << 'M';            
+      const auto *ND = cast<NamedDecl>(Context);
+      if (ND->getIdentifier()) {
+        mangleSourceNameWithAbiTags(ND);
+        Out << 'M';
       }
     }
   }
@@ -1279,7 +1603,7 @@
       if (const IdentifierInfo *Name
             = cast<NamedDecl>(Context)->getIdentifier()) {
         mangleSourceName(Name);
-        Out << 'M';            
+        Out << 'M';
       }
     }
   }
@@ -1366,7 +1690,7 @@
     mangleTemplateArgs(*TemplateArgs);
   } else {
     manglePrefix(getEffectiveDeclContext(ND), NoFunction);
-    mangleUnqualifiedName(ND);
+    mangleUnqualifiedName(ND, nullptr);
   }
 
   addSubstitution(ND);
@@ -1381,19 +1705,19 @@
 
   if (QualifiedTemplateName *Qualified = Template.getAsQualifiedTemplateName())
     manglePrefix(Qualified->getQualifier());
-  
+
   if (OverloadedTemplateStorage *Overloaded
                                       = Template.getAsOverloadedTemplate()) {
     mangleUnqualifiedName(nullptr, (*Overloaded->begin())->getDeclName(),
-                          UnknownArity);
+                          UnknownArity, nullptr);
     return;
   }
-   
+
   DependentTemplateName *Dependent = Template.getAsDependentTemplateName();
   assert(Dependent && "Unknown template name kind?");
   if (NestedNameSpecifier *Qualifier = Dependent->getQualifier())
     manglePrefix(Qualifier);
-  mangleUnscopedTemplateName(Template);
+  mangleUnscopedTemplateName(Template, /* AdditionalAbiTags */ nullptr);
 }
 
 void CXXNameMangler::mangleTemplatePrefix(const TemplateDecl *ND,
@@ -1412,7 +1736,10 @@
     mangleTemplateParameter(TTP->getIndex());
   } else {
     manglePrefix(getEffectiveDeclContext(ND), NoFunction);
-    mangleUnqualifiedName(ND->getTemplatedDecl());
+    if (isa<BuiltinTemplateDecl>(ND))
+      mangleUnqualifiedName(ND, nullptr);
+    else
+      mangleUnqualifiedName(ND->getTemplatedDecl(), nullptr);
   }
 
   addSubstitution(ND);
@@ -1547,17 +1874,17 @@
     return true;
 
   case Type::Typedef:
-    mangleSourceName(cast<TypedefType>(Ty)->getDecl()->getIdentifier());
+    mangleSourceNameWithAbiTags(cast<TypedefType>(Ty)->getDecl());
     break;
 
   case Type::UnresolvedUsing:
-    mangleSourceName(
-        cast<UnresolvedUsingType>(Ty)->getDecl()->getIdentifier());
+    mangleSourceNameWithAbiTags(
+        cast<UnresolvedUsingType>(Ty)->getDecl());
     break;
 
   case Type::Enum:
   case Type::Record:
-    mangleSourceName(cast<TagType>(Ty)->getDecl()->getIdentifier());
+    mangleSourceNameWithAbiTags(cast<TagType>(Ty)->getDecl());
     break;
 
   case Type::TemplateSpecialization: {
@@ -1575,7 +1902,7 @@
       if (isa<TemplateTemplateParmDecl>(TD))
         goto unresolvedType;
 
-      mangleSourceName(TD->getIdentifier());
+      mangleSourceNameWithAbiTags(TD);
       break;
     }
 
@@ -1605,8 +1932,8 @@
   }
 
   case Type::InjectedClassName:
-    mangleSourceName(
-        cast<InjectedClassNameType>(Ty)->getDecl()->getIdentifier());
+    mangleSourceNameWithAbiTags(
+        cast<InjectedClassNameType>(Ty)->getDecl());
     break;
 
   case Type::DependentName:
@@ -1657,8 +1984,6 @@
   }
 }
 
-
-
 void
 CXXNameMangler::mangleOperatorName(OverloadedOperatorKind OO, unsigned Arity) {
   switch (OO) {
@@ -1787,7 +2112,7 @@
     if (Context.getASTContext().addressSpaceMapManglingFor(AS)) {
       //  <target-addrspace> ::= "AS" <address-space-number>
       unsigned TargetAS = Context.getASTContext().getTargetAddressSpace(AS);
-      ASString = "AS" + llvm::utostr_32(TargetAS);
+      ASString = "AS" + llvm::utostr(TargetAS);
     } else {
       switch (AS) {
       default: llvm_unreachable("Not a language specific address space");
@@ -1990,7 +2315,7 @@
   //                 ::= f  # float
   //                 ::= d  # double
   //                 ::= e  # long double, __float80
-  // UNSUPPORTED:    ::= g  # __float128
+  //                 ::= g  # __float128
   // UNSUPPORTED:    ::= Dd # IEEE 754r decimal floating point (64 bits)
   // UNSUPPORTED:    ::= De # IEEE 754r decimal floating point (128 bits)
   // UNSUPPORTED:    ::= Df # IEEE 754r decimal floating point (32 bits)
@@ -1999,6 +2324,7 @@
   //                 ::= Ds # char16_t
   //                 ::= Dn # std::nullptr_t (i.e., decltype(nullptr))
   //                 ::= u <source-name>    # vendor extended type
+  std::string type_name;
   switch (T->getKind()) {
   case BuiltinType::Void:
     Out << 'v';
@@ -2070,6 +2396,12 @@
                 ? 'g'
                 : 'e');
     break;
+  case BuiltinType::Float128:
+    if (getASTContext().getTargetInfo().useFloat128ManglingForLongDouble())
+      Out << "U10__float128"; // Match the GCC mangling
+    else
+      Out << 'g';
+    break;
   case BuiltinType::NullPtr:
     Out << "Dn";
     break;
@@ -2079,7 +2411,9 @@
   case BuiltinType::Id:
 #include "clang/AST/BuiltinTypes.def"
   case BuiltinType::Dependent:
-    llvm_unreachable("mangling a placeholder type");
+    if (!NullOut)
+      llvm_unreachable("mangling a placeholder type");
+    break;
   case BuiltinType::ObjCId:
     Out << "11objc_object";
     break;
@@ -2089,42 +2423,12 @@
   case BuiltinType::ObjCSel:
     Out << "13objc_selector";
     break;
-  case BuiltinType::OCLImage1d:
-    Out << "11ocl_image1d";
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id: \
+    type_name = "ocl_" #ImgType "_" #Suffix; \
+    Out << type_name.size() << type_name; \
     break;
-  case BuiltinType::OCLImage1dArray:
-    Out << "16ocl_image1darray";
-    break;
-  case BuiltinType::OCLImage1dBuffer:
-    Out << "17ocl_image1dbuffer";
-    break;
-  case BuiltinType::OCLImage2d:
-    Out << "11ocl_image2d";
-    break;
-  case BuiltinType::OCLImage2dArray:
-    Out << "16ocl_image2darray";
-    break;
-  case BuiltinType::OCLImage2dDepth:
-    Out << "16ocl_image2ddepth";
-    break;
-  case BuiltinType::OCLImage2dArrayDepth:
-    Out << "21ocl_image2darraydepth";
-    break;
-  case BuiltinType::OCLImage2dMSAA:
-    Out << "15ocl_image2dmsaa";
-    break;
-  case BuiltinType::OCLImage2dArrayMSAA:
-    Out << "20ocl_image2darraymsaa";
-    break;
-  case BuiltinType::OCLImage2dMSAADepth:
-    Out << "20ocl_image2dmsaadepth";
-    break;
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-    Out << "35ocl_image2darraymsaadepth";
-    break;
-  case BuiltinType::OCLImage3d:
-    Out << "11ocl_image3d";
-    break;
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
     Out << "11ocl_sampler";
     break;
@@ -2151,11 +2455,6 @@
   case CC_C:
     return "";
 
-  case CC_PreserveMost:
-    return "perservemost";
-  case CC_PreserveAll:
-    return "perserveall";
-
   case CC_X86StdCall:
   case CC_X86FastCall:
   case CC_X86ThisCall:
@@ -2167,7 +2466,9 @@
   case CC_AAPCS_VFP:
   case CC_IntelOclBicc:
   case CC_SpirFunction:
-  case CC_SpirKernel:
+  case CC_OpenCLKernel:
+  case CC_PreserveMost:
+  case CC_PreserveAll:
     // FIXME: we should be mangling all of the above.
     return "";
 
@@ -2267,7 +2568,7 @@
     FunctionTypeDepth.enterResultType();
 
     // Mangle ns_returns_retained as an order-sensitive qualifier here.
-    if (Proto->getExtInfo().getProducesResult())
+    if (Proto->getExtInfo().getProducesResult() && FD == nullptr)
       mangleVendorQualifier("ns_returns_retained");
 
     // Mangle the return type without any direct ARC ownership qualifiers.
@@ -2293,7 +2594,7 @@
   assert(!FD || FD->getNumParams() == Proto->getNumParams());
   for (unsigned I = 0, E = Proto->getNumParams(); I != E; ++I) {
     // Mangle extended parameter info as order-sensitive qualifiers here.
-    if (Proto->hasExtParameterInfos()) {
+    if (Proto->hasExtParameterInfos() && FD == nullptr) {
       mangleExtParameterInfo(Proto->getExtParameterInfo(I));
     }
 
@@ -2644,7 +2945,7 @@
 
 void CXXNameMangler::mangleType(const TemplateSpecializationType *T) {
   if (TemplateDecl *TD = T->getTemplateName().getAsTemplateDecl()) {
-    mangleName(TD, T->getArgs(), T->getNumArgs());
+    mangleTemplateName(TD, T->getArgs(), T->getNumArgs());
   } else {
     if (mangleSubstitution(QualType(T, 0)))
       return;
@@ -2760,7 +3061,7 @@
     }
   }
 
-  mangleType(T->getUnderlyingType());
+  mangleType(T->getBaseType());
 }
 
 void CXXNameMangler::mangleType(const AutoType *T) {
@@ -2934,6 +3235,7 @@
   case Expr::MSPropertySubscriptExprClass:
   case Expr::TypoExprClass:  // This should no longer exist in the AST by now.
   case Expr::OMPArraySectionExprClass:
+  case Expr::CXXInheritedCtorInitExprClass:
     llvm_unreachable("unexpected statement kind");
 
   // FIXME: invent manglings for all these.
@@ -2956,6 +3258,7 @@
   case Expr::ObjCDictionaryLiteralClass:
   case Expr::ObjCSubscriptRefExprClass:
   case Expr::ObjCIndirectCopyRestoreExprClass:
+  case Expr::ObjCAvailabilityCheckExprClass:
   case Expr::OffsetOfExprClass:
   case Expr::PredefinedExprClass:
   case Expr::ShuffleVectorExprClass:
@@ -2970,12 +3273,14 @@
   case Expr::PseudoObjectExprClass:
   case Expr::AtomicExprClass:
   {
-    // As bad as this diagnostic is, it's better than crashing.
-    DiagnosticsEngine &Diags = Context.getDiags();
-    unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-                                     "cannot yet mangle expression type %0");
-    Diags.Report(E->getExprLoc(), DiagID)
-      << E->getStmtClassName() << E->getSourceRange();
+    if (!NullOut) {
+      // As bad as this diagnostic is, it's better than crashing.
+      DiagnosticsEngine &Diags = Context.getDiags();
+      unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                       "cannot yet mangle expression type %0");
+      Diags.Report(E->getExprLoc(), DiagID)
+        << E->getStmtClassName() << E->getSourceRange();
+    }
     break;
   }
 
@@ -3713,25 +4018,33 @@
   Out << '_';
 }
 
-void CXXNameMangler::mangleCXXCtorType(CXXCtorType T) {
+void CXXNameMangler::mangleCXXCtorType(CXXCtorType T,
+                                       const CXXRecordDecl *InheritedFrom) {
   // <ctor-dtor-name> ::= C1  # complete object constructor
   //                  ::= C2  # base object constructor
+  //                  ::= CI1 <type> # complete inheriting constructor
+  //                  ::= CI2 <type> # base inheriting constructor
   //
   // In addition, C5 is a comdat name with C1 and C2 in it.
+  Out << 'C';
+  if (InheritedFrom)
+    Out << 'I';
   switch (T) {
   case Ctor_Complete:
-    Out << "C1";
+    Out << '1';
     break;
   case Ctor_Base:
-    Out << "C2";
+    Out << '2';
     break;
   case Ctor_Comdat:
-    Out << "C5";
+    Out << '5';
     break;
   case Ctor_DefaultClosure:
   case Ctor_CopyingClosure:
     llvm_unreachable("closure constructors don't exist for the Itanium ABI!");
   }
+  if (InheritedFrom)
+    mangleName(InheritedFrom);
 }
 
 void CXXNameMangler::mangleCXXDtorType(CXXDtorType T) {
@@ -3843,7 +4156,7 @@
 
     Out << 'L';
     // References to external entities use the mangled name; if the name would
-    // not normally be manged then mangle it as unqualified.
+    // not normally be mangled then mangle it as unqualified.
     mangle(D);
     Out << 'E';
 
@@ -3899,12 +4212,6 @@
   Out << '_';
 }
 
-void CXXNameMangler::mangleExistingSubstitution(QualType type) {
-  bool result = mangleSubstitution(type);
-  assert(result && "no existing substitution for type");
-  (void) result;
-}
-
 void CXXNameMangler::mangleExistingSubstitution(TemplateName tname) {
   bool result = mangleSubstitution(tname);
   assert(result && "no existing substitution for template name");
@@ -4118,6 +4425,48 @@
   Substitutions[Ptr] = SeqID++;
 }
 
+CXXNameMangler::AbiTagList
+CXXNameMangler::makeFunctionReturnTypeTags(const FunctionDecl *FD) {
+  // When derived abi tags are disabled there is no need to make any list.
+  if (DisableDerivedAbiTags)
+    return AbiTagList();
+
+  llvm::raw_null_ostream NullOutStream;
+  CXXNameMangler TrackReturnTypeTags(*this, NullOutStream);
+  TrackReturnTypeTags.disableDerivedAbiTags();
+
+  const FunctionProtoType *Proto =
+      cast<FunctionProtoType>(FD->getType()->getAs<FunctionType>());
+  TrackReturnTypeTags.FunctionTypeDepth.enterResultType();
+  TrackReturnTypeTags.mangleType(Proto->getReturnType());
+  TrackReturnTypeTags.FunctionTypeDepth.leaveResultType();
+
+  return TrackReturnTypeTags.AbiTagsRoot.getSortedUniqueUsedAbiTags();
+}
+
+CXXNameMangler::AbiTagList
+CXXNameMangler::makeVariableTypeTags(const VarDecl *VD) {
+  // When derived abi tags are disabled there is no need to make any list.
+  if (DisableDerivedAbiTags)
+    return AbiTagList();
+
+  llvm::raw_null_ostream NullOutStream;
+  CXXNameMangler TrackVariableType(*this, NullOutStream);
+  TrackVariableType.disableDerivedAbiTags();
+
+  TrackVariableType.mangleType(VD->getType());
+
+  return TrackVariableType.AbiTagsRoot.getSortedUniqueUsedAbiTags();
+}
+
+bool CXXNameMangler::shouldHaveAbiTags(ItaniumMangleContextImpl &C,
+                                       const VarDecl *VD) {
+  llvm::raw_null_ostream NullOutStream;
+  CXXNameMangler TrackAbiTags(C, NullOutStream, nullptr, true);
+  TrackAbiTags.mangle(VD);
+  return TrackAbiTags.AbiTagsRoot.getUsedAbiTags().size();
+}
+
 //
 
 /// Mangles the name of the declaration D and emits that name to the given
@@ -4219,6 +4568,8 @@
   //  <special-name> ::= GV <object name>       # Guard variable for one-time
   //                                            # initialization
   CXXNameMangler Mangler(*this, Out);
+  // GCC 5.3.0 doesn't emit derived ABI tags for local names but that seems to
+  // be a bug that is fixed in trunk.
   Mangler.getStream() << "_ZGV";
   Mangler.mangleName(D);
 }
diff --git a/lib/AST/Makefile b/lib/AST/Makefile
deleted file mode 100644
index 65383c5..0000000
--- a/lib/AST/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/AST/Makefile ------------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-#  This implements the AST library for the C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangAST
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/AST/Mangle.cpp b/lib/AST/Mangle.cpp
index 7a9cf18..ee24173 100644
--- a/lib/AST/Mangle.cpp
+++ b/lib/AST/Mangle.cpp
@@ -126,9 +126,9 @@
     // llvm mangler on ELF is a nop, so we can just avoid adding the \01
     // marker.  We also avoid adding the marker if this is an alias for an
     // LLVM intrinsic.
-    StringRef UserLabelPrefix =
-        getASTContext().getTargetInfo().getUserLabelPrefix();
-    if (!UserLabelPrefix.empty() && !ALA->getLabel().startswith("llvm."))
+    char GlobalPrefix =
+        getASTContext().getTargetInfo().getDataLayout().getGlobalPrefix();
+    if (GlobalPrefix && !ALA->getLabel().startswith("llvm."))
       Out << '\01'; // LLVM IR Marker for __asm("foo")
 
     Out << ALA->getLabel();
diff --git a/lib/AST/MicrosoftMangle.cpp b/lib/AST/MicrosoftMangle.cpp
index 0634319..479ac44 100644
--- a/lib/AST/MicrosoftMangle.cpp
+++ b/lib/AST/MicrosoftMangle.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
@@ -27,13 +28,44 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/JamCRC.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
 
 using namespace clang;
 
 namespace {
 
+struct msvc_hashing_ostream : public llvm::raw_svector_ostream {
+  raw_ostream &OS; // Stream that receives the final name on destruction.
+  llvm::SmallString<64> Buffer; // Backing storage for the base stream.
+
+  msvc_hashing_ostream(raw_ostream &OS)
+      : llvm::raw_svector_ostream(Buffer), OS(OS) {}
+  ~msvc_hashing_ostream() override { // Forwards the buffered name to OS.
+    StringRef MangledName = str();
+    bool StartsWithEscape = MangledName.startswith("\01");
+    if (StartsWithEscape)
+      MangledName = MangledName.drop_front(1); // Measure and hash without \01.
+    if (MangledName.size() <= 4096) { // Short name: emit it unchanged.
+      OS << str(); // Full text, including any leading \01.
+      return;
+    }
+
+    llvm::MD5 Hasher;
+    llvm::MD5::MD5Result Hash;
+    Hasher.update(MangledName);
+    Hasher.final(Hash);
+
+    SmallString<32> HexString;
+    llvm::MD5::stringifyResult(Hash, HexString);
+
+    if (StartsWithEscape)
+      OS << '\01'; // Keep the escape in front of the hashed form.
+    OS << "??@" << HexString << '@'; // Long name becomes ??@<MD5 hex>@.
+  }
+};
+
 /// \brief Retrieve the declaration context that should be used when mangling
 /// the given declaration.
 static const DeclContext *getEffectiveDeclContext(const Decl *D) {
@@ -58,10 +90,11 @@
   }
 
   const DeclContext *DC = D->getDeclContext();
-  if (const CapturedDecl *CD = dyn_cast<CapturedDecl>(DC))
-    return getEffectiveDeclContext(CD);
+  if (isa<CapturedDecl>(DC) || isa<OMPDeclareReductionDecl>(DC)) {
+    return getEffectiveDeclContext(cast<Decl>(DC));
+  }
 
-  return DC;
+  return DC->getRedeclContext();
 }
 
 static const DeclContext *getEffectiveParentContext(const DeclContext *DC) {
@@ -120,7 +153,8 @@
                                        const CXXRecordDecl *DstRD,
                                        raw_ostream &Out) override;
   void mangleCXXThrowInfo(QualType T, bool IsConst, bool IsVolatile,
-                          uint32_t NumEntries, raw_ostream &Out) override;
+                          bool IsUnaligned, uint32_t NumEntries,
+                          raw_ostream &Out) override;
   void mangleCXXCatchableTypeArray(QualType T, uint32_t NumEntries,
                                    raw_ostream &Out) override;
   void mangleCXXCatchableType(QualType T, const CXXConstructorDecl *CD,
@@ -204,7 +238,7 @@
   }
 
 private:
-  void mangleInitFiniStub(const VarDecl *D, raw_ostream &Out, char CharCode);
+  void mangleInitFiniStub(const VarDecl *D, char CharCode, raw_ostream &Out);
 };
 
 /// MicrosoftCXXNameMangler - Manage the mangling of a single name for the
@@ -360,7 +394,8 @@
   if (!getASTContext().getLangOpts().CPlusPlus)
     return false;
 
-  if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
+  const VarDecl *VD = dyn_cast<VarDecl>(D);
+  if (VD && !isa<DecompositionDecl>(D)) {
     // C variables are not mangled.
     if (VD->isExternC())
       return false;
@@ -746,6 +781,21 @@
         }
       }
 
+      if (const DecompositionDecl *DD = dyn_cast<DecompositionDecl>(ND)) {
+        // FIXME: Invented mangling for decomposition declarations:
+        //   [X,Y,Z]
+        // where X,Y,Z are the names of the bindings.
+        llvm::SmallString<128> Name("[");
+        for (auto *BD : DD->bindings()) {
+          if (Name.size() > 1)
+            Name += ',';
+          Name += BD->getDeclName().getAsIdentifierInfo()->getName();
+        }
+        Name += ']';
+        mangleSourceName(Name);
+        break;
+      }
+
       if (const VarDecl *VD = dyn_cast<VarDecl>(ND)) {
         // We must have an anonymous union or struct declaration.
         const CXXRecordDecl *RD = VD->getType()->getAsCXXRecordDecl();
@@ -1153,7 +1203,7 @@
 
     // This CXXUuidofExpr is mangled as-if it were actually a VarDecl from
     // const __s_GUID _GUID_{lower case UUID with underscores}
-    StringRef Uuid = UE->getUuidAsStringRef(Context.getASTContext());
+    StringRef Uuid = UE->getUuidStr();
     std::string Name = "_GUID_" + Uuid.lower();
     std::replace(Name.begin(), Name.end(), '-', '_');
 
@@ -1413,6 +1463,10 @@
 
   if (HasRestrict)
     Out << 'I';
+
+  if (Quals.hasUnaligned() ||
+      (!PointeeType.isNull() && PointeeType.getLocalQualifiers().hasUnaligned()))
+    Out << 'F';
 }
 
 void MicrosoftCXXNameMangler::manglePointerCVQualifiers(Qualifiers Quals) {
@@ -1544,6 +1598,8 @@
     }
     break;
   case QMM_Result:
+    // Presence of __unaligned qualifier shouldn't affect mangling here.
+    Quals.removeUnaligned();
     if ((!IsPointer && Quals) || isa<TagType>(T)) {
       Out << '?';
       mangleQualifiers(Quals, false);
@@ -1684,54 +1740,11 @@
     mangleArtificalTagType(TTK_Struct, "objc_selector");
     break;
 
-  case BuiltinType::OCLImage1d:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image1d");
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id: \
+    Out << "PAUocl_" #ImgType "_" #Suffix "@@"; \
     break;
-  case BuiltinType::OCLImage1dArray:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image1darray");
-    break;
-  case BuiltinType::OCLImage1dBuffer:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image1dbuffer");
-    break;
-  case BuiltinType::OCLImage2d:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2d");
-    break;
-  case BuiltinType::OCLImage2dArray:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2darray");
-    break;
-  case BuiltinType::OCLImage2dDepth:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2ddepth");
-    break;
-  case BuiltinType::OCLImage2dArrayDepth:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2darraydepth");
-    break;
-  case BuiltinType::OCLImage2dMSAA:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2dmsaa");
-    break;
-  case BuiltinType::OCLImage2dArrayMSAA:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2darraymsaa");
-    break;
-  case BuiltinType::OCLImage2dMSAADepth:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2dmsaadepth");
-    break;
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image2darraymsaadepth");
-    break;
-  case BuiltinType::OCLImage3d:
-    Out << "PA";
-    mangleArtificalTagType(TTK_Struct, "ocl_image3d");
-    break;
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
     Out << "PA";
     mangleArtificalTagType(TTK_Struct, "ocl_sampler");
@@ -1761,6 +1774,7 @@
     Out << "$$T";
     break;
 
+  case BuiltinType::Float128:
   case BuiltinType::Half: {
     DiagnosticsEngine &Diags = Context.getDiags();
     unsigned DiagID = Diags.getCustomDiagID(
@@ -1826,7 +1840,7 @@
   // If this is a C++ instance method, mangle the CVR qualifiers for the
   // this pointer.
   if (HasThisQuals) {
-    Qualifiers Quals = Qualifiers::fromCVRMask(Proto->getTypeQuals());
+    Qualifiers Quals = Qualifiers::fromCVRUMask(Proto->getTypeQuals());
     manglePointerExtQualifiers(Quals, /*PointeeType=*/QualType());
     mangleRefQualifier(Proto->getRefQualifier());
     mangleQualifiers(Quals, /*IsMember=*/false);
@@ -2456,7 +2470,8 @@
                                  getASTContext().getSourceManager(),
                                  "Mangling declaration");
 
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   return Mangler.mangle(D);
 }
 
@@ -2556,7 +2571,8 @@
   const MicrosoftVTableContext::MethodVFTableLocation &ML =
       VTContext->getMethodVFTableLocation(GlobalDecl(MD));
 
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01?";
   Mangler.mangleVirtualMemPtrThunk(MD, ML);
 }
@@ -2564,10 +2580,11 @@
 void MicrosoftMangleContextImpl::mangleThunk(const CXXMethodDecl *MD,
                                              const ThunkInfo &Thunk,
                                              raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
-  Out << "\01?";
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
+  Mangler.getStream() << "\01?";
   Mangler.mangleName(MD);
-  mangleThunkThisAdjustment(MD, Thunk.This, Mangler, Out);
+  mangleThunkThisAdjustment(MD, Thunk.This, Mangler, MHO);
   if (!Thunk.Return.isEmpty())
     assert(Thunk.Method != nullptr &&
            "Thunk info should hold the overridee decl");
@@ -2584,10 +2601,11 @@
   // dtors rather than scalar deleting dtors. Just use the vector deleting dtor
   // mangling manually until we support both deleting dtor types.
   assert(Type == Dtor_Deleting);
-  MicrosoftCXXNameMangler Mangler(*this, Out, DD, Type);
-  Out << "\01??_E";
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO, DD, Type);
+  Mangler.getStream() << "\01??_E";
   Mangler.mangleName(DD->getParent());
-  mangleThunkThisAdjustment(DD, Adjustment, Mangler, Out);
+  mangleThunkThisAdjustment(DD, Adjustment, Mangler, MHO);
   Mangler.mangleFunctionType(DD->getType()->castAs<FunctionProtoType>(), DD);
 }
 
@@ -2598,8 +2616,12 @@
   //                    <cvr-qualifiers> [<name>] @
   // NOTE: <cvr-qualifiers> here is always 'B' (const). <storage-class>
   // is always '6' for vftables.
-  MicrosoftCXXNameMangler Mangler(*this, Out);
-  Mangler.getStream() << "\01??_7";
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
+  if (Derived->hasAttr<DLLImportAttr>())
+    Mangler.getStream() << "\01??_S";
+  else
+    Mangler.getStream() << "\01??_7";
   Mangler.mangleName(Derived);
   Mangler.getStream() << "6B"; // '6' for vftable, 'B' for const.
   for (const CXXRecordDecl *RD : BasePath)
@@ -2614,7 +2636,8 @@
   //                    <cvr-qualifiers> [<name>] @
   // NOTE: <cvr-qualifiers> here is always 'B' (const). <storage-class>
   // is always '7' for vbtables.
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??_8";
   Mangler.mangleName(Derived);
   Mangler.getStream() << "7B";  // '7' for vbtable, 'B' for const.
@@ -2624,7 +2647,8 @@
 }
 
 void MicrosoftMangleContextImpl::mangleCXXRTTI(QualType T, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??_R0";
   Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
   Mangler.getStream() << "@8";
@@ -2639,31 +2663,36 @@
 
 void MicrosoftMangleContextImpl::mangleCXXVirtualDisplacementMap(
     const CXXRecordDecl *SrcRD, const CXXRecordDecl *DstRD, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??_K";
   Mangler.mangleName(SrcRD);
   Mangler.getStream() << "$C";
   Mangler.mangleName(DstRD);
 }
 
-void MicrosoftMangleContextImpl::mangleCXXThrowInfo(QualType T,
-                                                    bool IsConst,
+void MicrosoftMangleContextImpl::mangleCXXThrowInfo(QualType T, bool IsConst,
                                                     bool IsVolatile,
+                                                    bool IsUnaligned,
                                                     uint32_t NumEntries,
                                                     raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "_TI";
   if (IsConst)
     Mangler.getStream() << 'C';
   if (IsVolatile)
     Mangler.getStream() << 'V';
+  if (IsUnaligned)
+    Mangler.getStream() << 'U';
   Mangler.getStream() << NumEntries;
   Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
 }
 
 void MicrosoftMangleContextImpl::mangleCXXCatchableTypeArray(
     QualType T, uint32_t NumEntries, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "_CTA";
   Mangler.getStream() << NumEntries;
   Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
@@ -2679,17 +2708,20 @@
   llvm::SmallString<64> RTTIMangling;
   {
     llvm::raw_svector_ostream Stream(RTTIMangling);
-    mangleCXXRTTI(T, Stream);
+    msvc_hashing_ostream MHO(Stream);
+    mangleCXXRTTI(T, MHO);
   }
   Mangler.getStream() << RTTIMangling.substr(1);
 
   // VS2015 CTP6 omits the copy-constructor in the mangled name.  This name is,
   // in fact, superfluous but I'm not sure the change was made consciously.
-  // TODO: Revisit this when VS2015 gets released.
   llvm::SmallString<64> CopyCtorMangling;
-  if (CD) {
+  if (!getASTContext().getLangOpts().isCompatibleWithMSVC(
+          LangOptions::MSVC2015) &&
+      CD) {
     llvm::raw_svector_ostream Stream(CopyCtorMangling);
-    mangleCXXCtor(CD, CT, Stream);
+    msvc_hashing_ostream MHO(Stream);
+    mangleCXXCtor(CD, CT, MHO);
   }
   Mangler.getStream() << CopyCtorMangling.substr(1);
 
@@ -2708,7 +2740,8 @@
 void MicrosoftMangleContextImpl::mangleCXXRTTIBaseClassDescriptor(
     const CXXRecordDecl *Derived, uint32_t NVOffset, int32_t VBPtrOffset,
     uint32_t VBTableOffset, uint32_t Flags, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??_R1";
   Mangler.mangleNumber(NVOffset);
   Mangler.mangleNumber(VBPtrOffset);
@@ -2720,7 +2753,8 @@
 
 void MicrosoftMangleContextImpl::mangleCXXRTTIBaseClassArray(
     const CXXRecordDecl *Derived, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??_R2";
   Mangler.mangleName(Derived);
   Mangler.getStream() << "8";
@@ -2728,7 +2762,8 @@
 
 void MicrosoftMangleContextImpl::mangleCXXRTTIClassHierarchyDescriptor(
     const CXXRecordDecl *Derived, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??_R3";
   Mangler.mangleName(Derived);
   Mangler.getStream() << "8";
@@ -2741,18 +2776,26 @@
   //                    <cvr-qualifiers> [<name>] @
   // NOTE: <cvr-qualifiers> here is always 'B' (const). <storage-class>
   // is always '6' for vftables.
-  MicrosoftCXXNameMangler Mangler(*this, Out);
-  Mangler.getStream() << "\01??_R4";
-  Mangler.mangleName(Derived);
-  Mangler.getStream() << "6B"; // '6' for vftable, 'B' for const.
-  for (const CXXRecordDecl *RD : BasePath)
-    Mangler.mangleName(RD);
-  Mangler.getStream() << '@';
+  llvm::SmallString<64> VFTableMangling;
+  llvm::raw_svector_ostream Stream(VFTableMangling);
+  mangleCXXVFTable(Derived, BasePath, Stream);
+
+  if (VFTableMangling.startswith("\01??@")) {
+    assert(VFTableMangling.endswith("@"));
+    Out << VFTableMangling << "??_R4@";
+    return;
+  }
+
+  assert(VFTableMangling.startswith("\01??_7") ||
+         VFTableMangling.startswith("\01??_S"));
+
+  Out << "\01??_R4" << StringRef(VFTableMangling).drop_front(5);
 }
 
 void MicrosoftMangleContextImpl::mangleSEHFilterExpression(
     const NamedDecl *EnclosingDecl, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   // The function body is in the same comdat as the function with the handler,
   // so the numbering here doesn't have to be the same across TUs.
   //
@@ -2763,7 +2806,8 @@
 
 void MicrosoftMangleContextImpl::mangleSEHFinallyBlock(
     const NamedDecl *EnclosingDecl, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   // The function body is in the same comdat as the function with the handler,
   // so the numbering here doesn't have to be the same across TUs.
   //
@@ -2783,20 +2827,23 @@
 void MicrosoftMangleContextImpl::mangleCXXCtor(const CXXConstructorDecl *D,
                                                CXXCtorType Type,
                                                raw_ostream &Out) {
-  MicrosoftCXXNameMangler mangler(*this, Out, D, Type);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler mangler(*this, MHO, D, Type);
   mangler.mangle(D);
 }
 
 void MicrosoftMangleContextImpl::mangleCXXDtor(const CXXDestructorDecl *D,
                                                CXXDtorType Type,
                                                raw_ostream &Out) {
-  MicrosoftCXXNameMangler mangler(*this, Out, D, Type);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler mangler(*this, MHO, D, Type);
   mangler.mangle(D);
 }
 
 void MicrosoftMangleContextImpl::mangleReferenceTemporary(
     const VarDecl *VD, unsigned ManglingNumber, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
 
   Mangler.getStream() << "\01?$RT" << ManglingNumber << '@';
   Mangler.mangle(VD, "");
@@ -2804,10 +2851,12 @@
 
 void MicrosoftMangleContextImpl::mangleThreadSafeStaticGuardVariable(
     const VarDecl *VD, unsigned GuardNum, raw_ostream &Out) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
 
   Mangler.getStream() << "\01?$TSS" << GuardNum << '@';
   Mangler.mangleNestedName(VD);
+  Mangler.getStream() << "@4HA";
 }
 
 void MicrosoftMangleContextImpl::mangleStaticGuardVariable(const VarDecl *VD,
@@ -2822,7 +2871,8 @@
   // than 32 static locals.  We don't fully implement the second mangling
   // because those guards are not externally visible, and instead use LLVM's
   // default renaming when creating a new guard variable.
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
 
   bool Visible = VD->isExternallyVisible();
   if (Visible) {
@@ -2844,9 +2894,10 @@
 }
 
 void MicrosoftMangleContextImpl::mangleInitFiniStub(const VarDecl *D,
-                                                    raw_ostream &Out,
-                                                    char CharCode) {
-  MicrosoftCXXNameMangler Mangler(*this, Out);
+                                                    char CharCode,
+                                                    raw_ostream &Out) {
+  msvc_hashing_ostream MHO(Out);
+  MicrosoftCXXNameMangler Mangler(*this, MHO);
   Mangler.getStream() << "\01??__" << CharCode;
   Mangler.mangleName(D);
   if (D->isStaticDataMember()) {
@@ -2861,14 +2912,14 @@
 void MicrosoftMangleContextImpl::mangleDynamicInitializer(const VarDecl *D,
                                                           raw_ostream &Out) {
   // <initializer-name> ::= ?__E <name> YAXXZ
-  mangleInitFiniStub(D, Out, 'E');
+  mangleInitFiniStub(D, 'E', Out);
 }
 
 void
 MicrosoftMangleContextImpl::mangleDynamicAtExitDestructor(const VarDecl *D,
                                                           raw_ostream &Out) {
   // <destructor-name> ::= ?__F <name> YAXXZ
-  mangleInitFiniStub(D, Out, 'F');
+  mangleInitFiniStub(D, 'F', Out);
 }
 
 void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL,
diff --git a/lib/AST/NSAPI.cpp b/lib/AST/NSAPI.cpp
index c562dae..ac2a8d3 100644
--- a/lib/AST/NSAPI.cpp
+++ b/lib/AST/NSAPI.cpp
@@ -441,22 +441,14 @@
   case BuiltinType::Int128:
   case BuiltinType::LongDouble:
   case BuiltinType::UInt128:
+  case BuiltinType::Float128:
   case BuiltinType::NullPtr:
   case BuiltinType::ObjCClass:
   case BuiltinType::ObjCId:
   case BuiltinType::ObjCSel:
-  case BuiltinType::OCLImage1d:
-  case BuiltinType::OCLImage1dArray:
-  case BuiltinType::OCLImage1dBuffer:
-  case BuiltinType::OCLImage2d:
-  case BuiltinType::OCLImage2dArray:
-  case BuiltinType::OCLImage2dDepth:
-  case BuiltinType::OCLImage2dArrayDepth:
-  case BuiltinType::OCLImage2dMSAA:
-  case BuiltinType::OCLImage2dArrayMSAA:
-  case BuiltinType::OCLImage2dMSAADepth:
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-  case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
   case BuiltinType::OCLEvent:
   case BuiltinType::OCLClkEvent:
diff --git a/lib/AST/NestedNameSpecifier.cpp b/lib/AST/NestedNameSpecifier.cpp
index ede3862..82809d7 100644
--- a/lib/AST/NestedNameSpecifier.cpp
+++ b/lib/AST/NestedNameSpecifier.cpp
@@ -171,10 +171,19 @@
 
 /// \brief Retrieve the record declaration stored in this nested name specifier.
 CXXRecordDecl *NestedNameSpecifier::getAsRecordDecl() const {
-  if (Prefix.getInt() == StoredDecl)
+  switch (Prefix.getInt()) {
+  case StoredIdentifier:
+    return nullptr;
+
+  case StoredDecl:
     return dyn_cast<CXXRecordDecl>(static_cast<NamedDecl *>(Specifier));
 
-  return nullptr;
+  case StoredTypeSpec:
+  case StoredTypeSpecWithTemplate:
+    return getAsType()->getAsCXXRecordDecl();
+  }
+
+  llvm_unreachable("Invalid NNS Kind!");
 }
 
 /// \brief Whether this nested name specifier refers to a dependent
@@ -306,7 +315,7 @@
 
       // Print the template argument list.
       TemplateSpecializationType::PrintTemplateArgumentList(
-          OS, SpecType->getArgs(), SpecType->getNumArgs(), InnerPolicy);
+          OS, SpecType->template_arguments(), InnerPolicy);
     } else {
       // Print the type normally
       QualType(T, 0).print(OS, InnerPolicy);
diff --git a/lib/AST/OpenMPClause.cpp b/lib/AST/OpenMPClause.cpp
index 1ef43f7..a28b9f3 100644
--- a/lib/AST/OpenMPClause.cpp
+++ b/lib/AST/OpenMPClause.cpp
@@ -29,6 +29,139 @@
   llvm_unreachable("unknown OMPClause");
 }
 
+OMPClauseWithPreInit *OMPClauseWithPreInit::get(OMPClause *C) {
+  auto *Res = OMPClauseWithPreInit::get(const_cast<const OMPClause *>(C));
+  return Res ? const_cast<OMPClauseWithPreInit *>(Res) : nullptr;
+} // Non-const wrapper around the const overload.
+
+const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
+  switch (C->getClauseKind()) { // No default label: kinds are listed explicitly.
+  case OMPC_schedule: // The kinds that return convert C to the result type.
+    return static_cast<const OMPScheduleClause *>(C);
+  case OMPC_dist_schedule:
+    return static_cast<const OMPDistScheduleClause *>(C);
+  case OMPC_firstprivate:
+    return static_cast<const OMPFirstprivateClause *>(C);
+  case OMPC_lastprivate:
+    return static_cast<const OMPLastprivateClause *>(C);
+  case OMPC_reduction:
+    return static_cast<const OMPReductionClause *>(C);
+  case OMPC_linear:
+    return static_cast<const OMPLinearClause *>(C);
+  case OMPC_default: // All remaining kinds leave the switch.
+  case OMPC_proc_bind:
+  case OMPC_if:
+  case OMPC_final:
+  case OMPC_num_threads:
+  case OMPC_safelen:
+  case OMPC_simdlen:
+  case OMPC_collapse:
+  case OMPC_private:
+  case OMPC_shared:
+  case OMPC_aligned:
+  case OMPC_copyin:
+  case OMPC_copyprivate:
+  case OMPC_ordered:
+  case OMPC_nowait:
+  case OMPC_untied:
+  case OMPC_mergeable:
+  case OMPC_threadprivate:
+  case OMPC_flush:
+  case OMPC_read:
+  case OMPC_write:
+  case OMPC_update:
+  case OMPC_capture:
+  case OMPC_seq_cst:
+  case OMPC_depend:
+  case OMPC_device:
+  case OMPC_threads:
+  case OMPC_simd:
+  case OMPC_map:
+  case OMPC_num_teams:
+  case OMPC_thread_limit:
+  case OMPC_priority:
+  case OMPC_grainsize:
+  case OMPC_nogroup:
+  case OMPC_num_tasks:
+  case OMPC_hint:
+  case OMPC_defaultmap:
+  case OMPC_unknown:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
+    break;
+  }
+
+  return nullptr; // Reached through the break above.
+}
+
+OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(OMPClause *C) {
+  auto *Res = OMPClauseWithPostUpdate::get(const_cast<const OMPClause *>(C));
+  return Res ? const_cast<OMPClauseWithPostUpdate *>(Res) : nullptr;
+} // Non-const wrapper around the const overload.
+
+const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C) {
+  switch (C->getClauseKind()) { // No default label: kinds are listed explicitly.
+  case OMPC_lastprivate: // The kinds that return convert C to the result type.
+    return static_cast<const OMPLastprivateClause *>(C);
+  case OMPC_reduction:
+    return static_cast<const OMPReductionClause *>(C);
+  case OMPC_linear:
+    return static_cast<const OMPLinearClause *>(C);
+  case OMPC_schedule: // All remaining kinds leave the switch.
+  case OMPC_dist_schedule:
+  case OMPC_firstprivate:
+  case OMPC_default:
+  case OMPC_proc_bind:
+  case OMPC_if:
+  case OMPC_final:
+  case OMPC_num_threads:
+  case OMPC_safelen:
+  case OMPC_simdlen:
+  case OMPC_collapse:
+  case OMPC_private:
+  case OMPC_shared:
+  case OMPC_aligned:
+  case OMPC_copyin:
+  case OMPC_copyprivate:
+  case OMPC_ordered:
+  case OMPC_nowait:
+  case OMPC_untied:
+  case OMPC_mergeable:
+  case OMPC_threadprivate:
+  case OMPC_flush:
+  case OMPC_read:
+  case OMPC_write:
+  case OMPC_update:
+  case OMPC_capture:
+  case OMPC_seq_cst:
+  case OMPC_depend:
+  case OMPC_device:
+  case OMPC_threads:
+  case OMPC_simd:
+  case OMPC_map:
+  case OMPC_num_teams:
+  case OMPC_thread_limit:
+  case OMPC_priority:
+  case OMPC_grainsize:
+  case OMPC_nogroup:
+  case OMPC_num_tasks:
+  case OMPC_hint:
+  case OMPC_defaultmap:
+  case OMPC_unknown:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
+    break;
+  }
+
+  return nullptr; // Reached through the break above.
+}
+
 void OMPPrivateClause::setPrivateCopies(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
@@ -70,13 +203,14 @@
 OMPFirstprivateClause::Create(const ASTContext &C, SourceLocation StartLoc,
                               SourceLocation LParenLoc, SourceLocation EndLoc,
                               ArrayRef<Expr *> VL, ArrayRef<Expr *> PrivateVL,
-                              ArrayRef<Expr *> InitVL) {
+                              ArrayRef<Expr *> InitVL, Stmt *PreInit) {
   void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(3 * VL.size()));
   OMPFirstprivateClause *Clause =
       new (Mem) OMPFirstprivateClause(StartLoc, LParenLoc, EndLoc, VL.size());
   Clause->setVarRefs(VL);
   Clause->setPrivateCopies(PrivateVL);
   Clause->setInits(InitVL);
+  Clause->setPreInitStmt(PreInit);
   return Clause;
 }
 
@@ -117,7 +251,8 @@
 OMPLastprivateClause *OMPLastprivateClause::Create(
     const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
     SourceLocation EndLoc, ArrayRef<Expr *> VL, ArrayRef<Expr *> SrcExprs,
-    ArrayRef<Expr *> DstExprs, ArrayRef<Expr *> AssignmentOps) {
+    ArrayRef<Expr *> DstExprs, ArrayRef<Expr *> AssignmentOps, Stmt *PreInit,
+    Expr *PostUpdate) {
   void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * VL.size()));
   OMPLastprivateClause *Clause =
       new (Mem) OMPLastprivateClause(StartLoc, LParenLoc, EndLoc, VL.size());
@@ -125,6 +260,8 @@
   Clause->setSourceExprs(SrcExprs);
   Clause->setDestinationExprs(DstExprs);
   Clause->setAssignmentOps(AssignmentOps);
+  Clause->setPreInitStmt(PreInit);
+  Clause->setPostUpdateExpr(PostUpdate);
   return Clause;
 }
 
@@ -179,7 +316,8 @@
     const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
     OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc,
     SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef<Expr *> VL,
-    ArrayRef<Expr *> PL, ArrayRef<Expr *> IL, Expr *Step, Expr *CalcStep) {
+    ArrayRef<Expr *> PL, ArrayRef<Expr *> IL, Expr *Step, Expr *CalcStep,
+    Stmt *PreInit, Expr *PostUpdate) {
   // Allocate space for 4 lists (Vars, Inits, Updates, Finals) and 2 expressions
   // (Step and CalcStep).
   void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * VL.size() + 2));
@@ -196,6 +334,8 @@
             nullptr);
   Clause->setStep(Step);
   Clause->setCalcStep(CalcStep);
+  Clause->setPreInitStmt(PreInit);
+  Clause->setPostUpdateExpr(PostUpdate);
   return Clause;
 }
 
@@ -340,7 +480,8 @@
     SourceLocation EndLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VL,
     NestedNameSpecifierLoc QualifierLoc, const DeclarationNameInfo &NameInfo,
     ArrayRef<Expr *> Privates, ArrayRef<Expr *> LHSExprs,
-    ArrayRef<Expr *> RHSExprs, ArrayRef<Expr *> ReductionOps) {
+    ArrayRef<Expr *> RHSExprs, ArrayRef<Expr *> ReductionOps, Stmt *PreInit,
+    Expr *PostUpdate) {
   void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * VL.size()));
   OMPReductionClause *Clause = new (Mem) OMPReductionClause(
       StartLoc, LParenLoc, EndLoc, ColonLoc, VL.size(), QualifierLoc, NameInfo);
@@ -349,6 +490,8 @@
   Clause->setLHSExprs(LHSExprs);
   Clause->setRHSExprs(RHSExprs);
   Clause->setReductionOps(ReductionOps);
+  Clause->setPreInitStmt(PreInit);
+  Clause->setPostUpdateExpr(PostUpdate);
   return Clause;
 }
 
@@ -363,7 +506,7 @@
                                        SourceLocation LParenLoc,
                                        SourceLocation EndLoc,
                                        ArrayRef<Expr *> VL) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size()));
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size() + 1));
   OMPFlushClause *Clause =
       new (Mem) OMPFlushClause(StartLoc, LParenLoc, EndLoc, VL.size());
   Clause->setVarRefs(VL);
@@ -375,43 +518,327 @@
   return new (Mem) OMPFlushClause(N);
 }
 
-OMPDependClause *
-OMPDependClause::Create(const ASTContext &C, SourceLocation StartLoc,
-                        SourceLocation LParenLoc, SourceLocation EndLoc,
-                        OpenMPDependClauseKind DepKind, SourceLocation DepLoc,
-                        SourceLocation ColonLoc, ArrayRef<Expr *> VL) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size()));
+OMPDependClause *OMPDependClause::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+    SourceLocation EndLoc, OpenMPDependClauseKind DepKind,
+    SourceLocation DepLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VL) {
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size() + 1));
   OMPDependClause *Clause =
       new (Mem) OMPDependClause(StartLoc, LParenLoc, EndLoc, VL.size());
   Clause->setVarRefs(VL);
   Clause->setDependencyKind(DepKind);
   Clause->setDependencyLoc(DepLoc);
   Clause->setColonLoc(ColonLoc);
+  Clause->setCounterValue(nullptr);
   return Clause;
 }
 
 OMPDependClause *OMPDependClause::CreateEmpty(const ASTContext &C, unsigned N) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(N));
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(N + 1));
   return new (Mem) OMPDependClause(N);
 }
 
-OMPMapClause *OMPMapClause::Create(const ASTContext &C, SourceLocation StartLoc,
-                                   SourceLocation LParenLoc,
-                                   SourceLocation EndLoc, ArrayRef<Expr *> VL,
-                                   OpenMPMapClauseKind TypeModifier,
-                                   OpenMPMapClauseKind Type,
-                                   SourceLocation TypeLoc) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size()));
+void OMPDependClause::setCounterValue(Expr *V) {
+  assert(!V || getDependencyKind() == OMPC_DEPEND_sink ||
+         getDependencyKind() == OMPC_DEPEND_source);
+  getVarRefs().end()[0] = V; // The counter slot trails the variable list.
+}
+
+const Expr *OMPDependClause::getCounterValue() const {
+  const Expr *Counter = getVarRefs().end()[0]; // One past the var list.
+  assert(!Counter || getDependencyKind() == OMPC_DEPEND_sink ||
+         getDependencyKind() == OMPC_DEPEND_source);
+  return Counter;
+}
+
+Expr *OMPDependClause::getCounterValue() {
+  Expr *Counter = getVarRefs().end()[0]; // One past the var list.
+  assert(!Counter || getDependencyKind() == OMPC_DEPEND_sink ||
+         getDependencyKind() == OMPC_DEPEND_source);
+  return Counter;
+}
+
+unsigned OMPClauseMappableExprCommon::getComponentsTotalNumber(
+    MappableExprComponentListsRef ComponentLists) {
+  unsigned Count = 0; // Running sum of the sizes of all component lists.
+  for (auto It = ComponentLists.begin(); It != ComponentLists.end(); ++It)
+    Count += It->size();
+  return Count;
+}
+
+unsigned OMPClauseMappableExprCommon::getUniqueDeclarationsTotalNumber(
+    ArrayRef<ValueDecl *> Declarations) {
+  // Returns the number of distinct entries in Declarations. Each entry is
+  // keyed by the result of getCanonicalDecl(); a null entry is keyed as
+  // nullptr, so all null entries together count once.
+  llvm::SmallPtrSet<const ValueDecl *, 8> Seen;
+  for (auto *D : Declarations) {
+    const ValueDecl *VD = D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
+    // insert() ignores keys already present, so no prior count() is needed.
+    Seen.insert(VD);
+  }
+  return Seen.size();
+}
+
+OMPMapClause *
+OMPMapClause::Create(const ASTContext &C, SourceLocation StartLoc,
+                     SourceLocation LParenLoc, SourceLocation EndLoc,
+                     ArrayRef<Expr *> Vars, ArrayRef<ValueDecl *> Declarations,
+                     MappableExprComponentListsRef ComponentLists,
+                     OpenMPMapClauseKind TypeModifier, OpenMPMapClauseKind Type,
+                     bool TypeIsImplicit, SourceLocation TypeLoc) {
+
+  unsigned NumVars = Vars.size();
+  unsigned NumUniqueDeclarations =
+      getUniqueDeclarationsTotalNumber(Declarations);
+  unsigned NumComponentLists = ComponentLists.size();
+  unsigned NumComponents = getComponentsTotalNumber(ComponentLists);
+
+  // We need to allocate:
+  // NumVars x Expr* - we have an original list expression for each clause list
+  // entry.
+  // NumUniqueDeclarations x ValueDecl* - unique base declarations associated
+  // with each component list.
+  // (NumUniqueDeclarations + NumComponentLists) x unsigned - we specify the
+  // number of lists for each unique declaration and the size of each component
+  // list.
+  // NumComponents x MappableComponent - the total of all the components in all
+  // the lists.
+  void *Mem = C.Allocate(
+      totalSizeToAlloc<Expr *, ValueDecl *, unsigned,
+                       OMPClauseMappableExprCommon::MappableComponent>(
+          NumVars, NumUniqueDeclarations,
+          NumUniqueDeclarations + NumComponentLists, NumComponents));
   OMPMapClause *Clause = new (Mem) OMPMapClause(
-      TypeModifier, Type, TypeLoc, StartLoc, LParenLoc, EndLoc, VL.size());
-  Clause->setVarRefs(VL);
+      TypeModifier, Type, TypeIsImplicit, TypeLoc, StartLoc, LParenLoc, EndLoc,
+      NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents);
+
+  Clause->setVarRefs(Vars);
+  Clause->setClauseInfo(Declarations, ComponentLists);
   Clause->setMapTypeModifier(TypeModifier);
   Clause->setMapType(Type);
   Clause->setMapLoc(TypeLoc);
   return Clause;
 }
 
-OMPMapClause *OMPMapClause::CreateEmpty(const ASTContext &C, unsigned N) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(N));
-  return new (Mem) OMPMapClause(N);
+OMPMapClause *OMPMapClause::CreateEmpty(const ASTContext &C, unsigned NumVars,
+                                        unsigned NumUniqueDeclarations,
+                                        unsigned NumComponentLists,
+                                        unsigned NumComponents) {
+  void *Mem = C.Allocate(
+      totalSizeToAlloc<Expr *, ValueDecl *, unsigned,
+                       OMPClauseMappableExprCommon::MappableComponent>(
+          NumVars, NumUniqueDeclarations,
+          NumUniqueDeclarations + NumComponentLists, NumComponents));
+  return new (Mem) OMPMapClause(NumVars, NumUniqueDeclarations,
+                                NumComponentLists, NumComponents);
+}
+
+OMPToClause *OMPToClause::Create(const ASTContext &C, SourceLocation StartLoc,
+                                 SourceLocation LParenLoc,
+                                 SourceLocation EndLoc, ArrayRef<Expr *> Vars,
+                                 ArrayRef<ValueDecl *> Declarations,
+                                 MappableExprComponentListsRef ComponentLists) {
+  // Element counts that determine how much storage the clause needs.
+  const unsigned VarCount = Vars.size();
+  const unsigned DeclCount = getUniqueDeclarationsTotalNumber(Declarations);
+  const unsigned ListCount = ComponentLists.size();
+  const unsigned ComponentCount = getComponentsTotalNumber(ComponentLists);
+
+  // Shorthand for the element type of the component array.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+
+  // Room is requested for four arrays:
+  //  - VarCount x Expr *: the original list expression of each clause list
+  //    entry;
+  //  - DeclCount x ValueDecl *: the unique base declarations associated with
+  //    the component lists;
+  //  - (DeclCount + ListCount) x unsigned: the number of lists for each unique
+  //    declaration and the size of each component list;
+  //  - ComponentCount x MappableComponent: every component of every list.
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      VarCount, DeclCount, DeclCount + ListCount, ComponentCount);
+  void *Mem = C.Allocate(Bytes);
+
+  // Construct the clause in the new storage.
+  auto *Clause = new (Mem) OMPToClause(StartLoc, LParenLoc, EndLoc, VarCount,
+                                       DeclCount, ListCount, ComponentCount);
+
+  // Record the list expressions, then the declarations and component lists.
+  Clause->setVarRefs(Vars);
+  Clause->setClauseInfo(Declarations, ComponentLists);
+  return Clause;
+}
+
+OMPToClause *OMPToClause::CreateEmpty(const ASTContext &C, unsigned NumVars,
+                                      unsigned NumUniqueDeclarations,
+                                      unsigned NumComponentLists,
+                                      unsigned NumComponents) {
+  // Request storage sized from the four element counts, then construct there.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+  const unsigned NumSizes = NumUniqueDeclarations + NumComponentLists;
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      NumVars, NumUniqueDeclarations, NumSizes, NumComponents);
+  return new (C.Allocate(Bytes)) OMPToClause(
+      NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents);
+}
+
+OMPFromClause *
+OMPFromClause::Create(const ASTContext &C, SourceLocation StartLoc,
+                      SourceLocation LParenLoc, SourceLocation EndLoc,
+                      ArrayRef<Expr *> Vars, ArrayRef<ValueDecl *> Declarations,
+                      MappableExprComponentListsRef ComponentLists) {
+  // Element counts that determine how much storage the clause needs.
+  const unsigned VarCount = Vars.size();
+  const unsigned DeclCount = getUniqueDeclarationsTotalNumber(Declarations);
+  const unsigned ListCount = ComponentLists.size();
+  const unsigned ComponentCount = getComponentsTotalNumber(ComponentLists);
+
+  // Shorthand for the element type of the component array.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+
+  // Room is requested for four arrays:
+  //  - VarCount x Expr *: the original list expression of each clause list
+  //    entry;
+  //  - DeclCount x ValueDecl *: the unique base declarations associated with
+  //    the component lists;
+  //  - (DeclCount + ListCount) x unsigned: the number of lists for each unique
+  //    declaration and the size of each component list;
+  //  - ComponentCount x MappableComponent: every component of every list.
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      VarCount, DeclCount, DeclCount + ListCount, ComponentCount);
+  void *Mem = C.Allocate(Bytes);
+
+  // Construct the clause in the new storage.
+  auto *Clause = new (Mem) OMPFromClause(StartLoc, LParenLoc, EndLoc, VarCount,
+                                         DeclCount, ListCount, ComponentCount);
+
+  // Record the list expressions, then the declarations and component lists.
+  Clause->setVarRefs(Vars);
+  Clause->setClauseInfo(Declarations, ComponentLists);
+  return Clause;
+}
+
+OMPFromClause *OMPFromClause::CreateEmpty(const ASTContext &C, unsigned NumVars,
+                                          unsigned NumUniqueDeclarations,
+                                          unsigned NumComponentLists,
+                                          unsigned NumComponents) {
+  // Request storage sized from the four element counts, then construct there.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+  const unsigned NumSizes = NumUniqueDeclarations + NumComponentLists;
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      NumVars, NumUniqueDeclarations, NumSizes, NumComponents);
+  return new (C.Allocate(Bytes)) OMPFromClause(
+      NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents);
+}
+
+void OMPUseDevicePtrClause::setPrivateCopies(ArrayRef<Expr *> VL) {
+  assert(VL.size() == varlist_size() &&
+         "Number of private copies is not the same as the preallocated buffer");
+  std::copy(VL.begin(), VL.end(), varlist_end()); // Stored after the var list.
+}
+
+void OMPUseDevicePtrClause::setInits(ArrayRef<Expr *> VL) {
+  assert(VL.size() == varlist_size() &&
+         "Number of inits is not the same as the preallocated buffer");
+  std::copy(VL.begin(), VL.end(), getPrivateCopies().end()); // After privates.
+}
+
+OMPUseDevicePtrClause *OMPUseDevicePtrClause::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+    SourceLocation EndLoc, ArrayRef<Expr *> Vars, ArrayRef<Expr *> PrivateVars,
+    ArrayRef<Expr *> Inits, ArrayRef<ValueDecl *> Declarations,
+    MappableExprComponentListsRef ComponentLists) {
+  // Element counts that determine how much storage the clause needs.
+  const unsigned VarCount = Vars.size();
+  const unsigned DeclCount = getUniqueDeclarationsTotalNumber(Declarations);
+  const unsigned ListCount = ComponentLists.size();
+  const unsigned ComponentCount = getComponentsTotalNumber(ComponentLists);
+
+  // Shorthand for the element type of the component array.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+
+  // Room is requested for four arrays:
+  //  - 3 * VarCount x Expr *: the original list expression of each clause
+  //    list entry, plus an equal number of private copies and inits;
+  //  - DeclCount x ValueDecl *: the unique base declarations associated with
+  //    the component lists;
+  //  - (DeclCount + ListCount) x unsigned: the number of lists for each unique
+  //    declaration and the size of each component list;
+  //  - ComponentCount x MappableComponent: every component of every list.
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      3 * VarCount, DeclCount, DeclCount + ListCount, ComponentCount);
+  void *Mem = C.Allocate(Bytes);
+
+  auto *Clause =
+      new (Mem) OMPUseDevicePtrClause(StartLoc, LParenLoc, EndLoc, VarCount,
+                                      DeclCount, ListCount, ComponentCount);
+
+  // Record the three expression lists, then the declarations and components.
+  Clause->setVarRefs(Vars);
+  Clause->setPrivateCopies(PrivateVars);
+  Clause->setInits(Inits);
+  Clause->setClauseInfo(Declarations, ComponentLists);
+  return Clause;
+}
+
+OMPUseDevicePtrClause *OMPUseDevicePtrClause::CreateEmpty(
+    const ASTContext &C, unsigned NumVars, unsigned NumUniqueDeclarations,
+    unsigned NumComponentLists, unsigned NumComponents) {
+  // Request storage sized from the four element counts, then construct there.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+  const unsigned NumSizes = NumUniqueDeclarations + NumComponentLists;
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      3 * NumVars, NumUniqueDeclarations, NumSizes, NumComponents);
+  return new (C.Allocate(Bytes)) OMPUseDevicePtrClause(
+      NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents);
+}
+
+OMPIsDevicePtrClause *
+OMPIsDevicePtrClause::Create(const ASTContext &C, SourceLocation StartLoc,
+                             SourceLocation LParenLoc, SourceLocation EndLoc,
+                             ArrayRef<Expr *> Vars,
+                             ArrayRef<ValueDecl *> Declarations,
+                             MappableExprComponentListsRef ComponentLists) {
+  // Element counts that determine how much storage the clause needs.
+  const unsigned VarCount = Vars.size();
+  const unsigned DeclCount = getUniqueDeclarationsTotalNumber(Declarations);
+  const unsigned ListCount = ComponentLists.size();
+  const unsigned ComponentCount = getComponentsTotalNumber(ComponentLists);
+
+  // Shorthand for the element type of the component array.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+
+  // Room is requested for four arrays:
+  //  - VarCount x Expr *: the original list expression of each clause list
+  //    entry;
+  //  - DeclCount x ValueDecl *: the unique base declarations associated with
+  //    the component lists;
+  //  - (DeclCount + ListCount) x unsigned: the number of lists for each unique
+  //    declaration and the size of each component list;
+  //  - ComponentCount x MappableComponent: every component of every list.
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      VarCount, DeclCount, DeclCount + ListCount, ComponentCount);
+  void *Mem = C.Allocate(Bytes);
+
+  auto *Clause =
+      new (Mem) OMPIsDevicePtrClause(StartLoc, LParenLoc, EndLoc, VarCount,
+                                     DeclCount, ListCount, ComponentCount);
+
+  // Record the list expressions, then the declarations and component lists.
+  Clause->setVarRefs(Vars);
+  Clause->setClauseInfo(Declarations, ComponentLists);
+  return Clause;
+}
+
+OMPIsDevicePtrClause *OMPIsDevicePtrClause::CreateEmpty(
+    const ASTContext &C, unsigned NumVars, unsigned NumUniqueDeclarations,
+    unsigned NumComponentLists, unsigned NumComponents) {
+  // Request storage sized from the four element counts, then construct there.
+  using MC = OMPClauseMappableExprCommon::MappableComponent;
+  const unsigned NumSizes = NumUniqueDeclarations + NumComponentLists;
+  const size_t Bytes = totalSizeToAlloc<Expr *, ValueDecl *, unsigned, MC>(
+      NumVars, NumUniqueDeclarations, NumSizes, NumComponents);
+  return new (C.Allocate(Bytes)) OMPIsDevicePtrClause(
+      NumVars, NumUniqueDeclarations, NumComponentLists, NumComponents);
 }
diff --git a/lib/AST/RecordLayout.cpp b/lib/AST/RecordLayout.cpp
index b2c244e..299fd11 100644
--- a/lib/AST/RecordLayout.cpp
+++ b/lib/AST/RecordLayout.cpp
@@ -18,8 +18,6 @@
 using namespace clang;
 
 void ASTRecordLayout::Destroy(ASTContext &Ctx) {
-  if (FieldOffsets)
-    Ctx.Deallocate(FieldOffsets);
   if (CXXInfo) {
     CXXInfo->~CXXRecordLayoutInfo();
     Ctx.Deallocate(CXXInfo);
@@ -29,18 +27,13 @@
 }
 
 ASTRecordLayout::ASTRecordLayout(const ASTContext &Ctx, CharUnits size,
-                                 CharUnits alignment, 
+                                 CharUnits alignment,
                                  CharUnits requiredAlignment,
                                  CharUnits datasize,
-                                 const uint64_t *fieldoffsets,
-                                 unsigned fieldcount)
-  : Size(size), DataSize(datasize), Alignment(alignment),
-    RequiredAlignment(requiredAlignment), FieldOffsets(nullptr),
-    FieldCount(fieldcount), CXXInfo(nullptr) {
-  if (FieldCount > 0)  {
-    FieldOffsets = new (Ctx) uint64_t[FieldCount];
-    memcpy(FieldOffsets, fieldoffsets, FieldCount * sizeof(*FieldOffsets));
-  }
+                                 ArrayRef<uint64_t> fieldoffsets)
+    : Size(size), DataSize(datasize), Alignment(alignment),
+      RequiredAlignment(requiredAlignment), CXXInfo(nullptr) {
+  FieldOffsets.append(Ctx, fieldoffsets.begin(), fieldoffsets.end());
 }
 
 // Constructor for C++ records.
@@ -50,26 +43,21 @@
                                  bool hasOwnVFPtr, bool hasExtendableVFPtr,
                                  CharUnits vbptroffset,
                                  CharUnits datasize,
-                                 const uint64_t *fieldoffsets,
-                                 unsigned fieldcount,
+                                 ArrayRef<uint64_t> fieldoffsets,
                                  CharUnits nonvirtualsize,
                                  CharUnits nonvirtualalignment,
                                  CharUnits SizeOfLargestEmptySubobject,
                                  const CXXRecordDecl *PrimaryBase,
                                  bool IsPrimaryBaseVirtual,
                                  const CXXRecordDecl *BaseSharingVBPtr,
-                                 bool HasZeroSizedSubObject,
+                                 bool EndsWithZeroSizedObject,
                                  bool LeadsWithZeroSizedBase,
                                  const BaseOffsetsMapTy& BaseOffsets,
                                  const VBaseOffsetsMapTy& VBaseOffsets)
   : Size(size), DataSize(datasize), Alignment(alignment),
-    RequiredAlignment(requiredAlignment), FieldOffsets(nullptr),
-    FieldCount(fieldcount), CXXInfo(new (Ctx) CXXRecordLayoutInfo)
+    RequiredAlignment(requiredAlignment), CXXInfo(new (Ctx) CXXRecordLayoutInfo)
 {
-  if (FieldCount > 0)  {
-    FieldOffsets = new (Ctx) uint64_t[FieldCount];
-    memcpy(FieldOffsets, fieldoffsets, FieldCount * sizeof(*FieldOffsets));
-  }
+  FieldOffsets.append(Ctx, fieldoffsets.begin(), fieldoffsets.end());
 
   CXXInfo->PrimaryBase.setPointer(PrimaryBase);
   CXXInfo->PrimaryBase.setInt(IsPrimaryBaseVirtual);
@@ -82,7 +70,7 @@
   CXXInfo->VBPtrOffset = vbptroffset;
   CXXInfo->HasExtendableVFPtr = hasExtendableVFPtr;
   CXXInfo->BaseSharingVBPtr = BaseSharingVBPtr;
-  CXXInfo->HasZeroSizedSubObject = HasZeroSizedSubObject;
+  CXXInfo->EndsWithZeroSizedObject = EndsWithZeroSizedObject;
   CXXInfo->LeadsWithZeroSizedBase = LeadsWithZeroSizedBase;
 
 
diff --git a/lib/AST/RecordLayoutBuilder.cpp b/lib/AST/RecordLayoutBuilder.cpp
index 0572ff6..cf981be 100644
--- a/lib/AST/RecordLayoutBuilder.cpp
+++ b/lib/AST/RecordLayoutBuilder.cpp
@@ -1558,10 +1558,13 @@
 
   // But, if there's a #pragma pack in play, that takes precedent over
   // even the 'aligned' attribute, for non-zero-width bitfields.
+  unsigned MaxFieldAlignmentInBits = Context.toBits(MaxFieldAlignment);
   if (!MaxFieldAlignment.isZero() && FieldSize) {
-    unsigned MaxFieldAlignmentInBits = Context.toBits(MaxFieldAlignment);
-    FieldAlign = std::min(FieldAlign, MaxFieldAlignmentInBits);
     UnpackedFieldAlign = std::min(UnpackedFieldAlign, MaxFieldAlignmentInBits);
+    if (FieldPacked)
+      FieldAlign = UnpackedFieldAlign;
+    else
+      FieldAlign = std::min(FieldAlign, MaxFieldAlignmentInBits);
   }
 
   // But, ms_struct just ignores all of that in unions, even explicit
@@ -1600,7 +1603,10 @@
         (AllowPadding &&
          (FieldOffset & (FieldAlign-1)) + FieldSize > TypeSize)) {
       FieldOffset = llvm::alignTo(FieldOffset, FieldAlign);
-    } else if (ExplicitFieldAlign) {
+    } else if (ExplicitFieldAlign &&
+               (MaxFieldAlignmentInBits == 0 ||
+                ExplicitFieldAlign <= MaxFieldAlignmentInBits) &&
+               Context.getTargetInfo().useExplicitBitFieldAlignment()) {
       // TODO: figure it out what needs to be done on targets that don't honor
       // bit-field type alignment like ARM APCS ABI.
       FieldOffset = llvm::alignTo(FieldOffset, ExplicitFieldAlign);
@@ -1612,7 +1618,10 @@
          (UnpackedFieldOffset & (UnpackedFieldAlign-1)) + FieldSize > TypeSize))
       UnpackedFieldOffset =
           llvm::alignTo(UnpackedFieldOffset, UnpackedFieldAlign);
-    else if (ExplicitFieldAlign)
+    else if (ExplicitFieldAlign &&
+             (MaxFieldAlignmentInBits == 0 ||
+              ExplicitFieldAlign <= MaxFieldAlignmentInBits) &&
+             Context.getTargetInfo().useExplicitBitFieldAlignment())
       UnpackedFieldOffset =
           llvm::alignTo(UnpackedFieldOffset, ExplicitFieldAlign);
   }
@@ -2123,7 +2132,7 @@
 //   function pointer) and a vbptr (virtual base pointer).  They can each be
 //   shared with a, non-virtual bases. These bases need not be the same.  vfptrs
 //   always occur at offset 0.  vbptrs can occur at an arbitrary offset and are
-//   placed after the lexiographically last non-virtual base.  This placement
+//   placed after the lexicographically last non-virtual base.  This placement
 //   is always before fields but can be in the middle of the non-virtual bases
 //   due to the two-pass layout scheme for non-virtual-bases.
 // * Virtual bases sometimes require a 'vtordisp' field that is laid out before
@@ -2144,7 +2153,7 @@
 //   pushes all bases and fields back by the alignment imposed by those bases
 //   and fields.  This can potentially add a significant amount of padding.
 //   vbptrs are injected immediately after the last non-virtual base as
-//   lexiographically ordered in the code.  If this site isn't pointer aligned
+//   lexicographically ordered in the code.  If this site isn't pointer aligned
 //   the vbptr is placed at the next properly aligned location.  Enough padding
 //   is added to guarantee a fit.
 // * The last zero sized non-virtual base can be placed at the end of the
@@ -2219,7 +2228,8 @@
   /// laid out.
   void initializeCXXLayout(const CXXRecordDecl *RD);
   void layoutNonVirtualBases(const CXXRecordDecl *RD);
-  void layoutNonVirtualBase(const CXXRecordDecl *BaseDecl,
+  void layoutNonVirtualBase(const CXXRecordDecl *RD,
+                            const CXXRecordDecl *BaseDecl,
                             const ASTRecordLayout &BaseLayout,
                             const ASTRecordLayout *&PreviousBaseLayout);
   void injectVFPtr(const CXXRecordDecl *RD);
@@ -2325,7 +2335,7 @@
   if (!MaxFieldAlignment.isZero())
     Info.Alignment = std::min(Info.Alignment, MaxFieldAlignment);
   // Track zero-sized subobjects here where it's already available.
-  EndsWithZeroSizedObject = Layout.hasZeroSizedSubObject();
+  EndsWithZeroSizedObject = Layout.endsWithZeroSizedObject();
   // Respect required alignment, this is necessary because we may have adjusted
   // the alignment in the case of pragam pack.  Note that the required alignment
   // doesn't actually apply to the struct alignment at this point.
@@ -2360,7 +2370,7 @@
     if (auto RT =
             FD->getType()->getBaseElementTypeUnsafe()->getAs<RecordType>()) {
       auto const &Layout = Context.getASTRecordLayout(RT->getDecl());
-      EndsWithZeroSizedObject = Layout.hasZeroSizedSubObject();
+      EndsWithZeroSizedObject = Layout.endsWithZeroSizedObject();
       FieldRequiredAlignment = std::max(FieldRequiredAlignment,
                                         Layout.getRequiredAlignment());
     }
@@ -2467,7 +2477,7 @@
   // out any bases that do not contain vfptrs.  We implement this as two passes
   // over the bases.  This approach guarantees that the primary base is laid out
   // first.  We use these passes to calculate some additional aggregated
-  // information about the bases, such as reqruied alignment and the presence of
+  // information about the bases, such as required alignment and the presence of
   // zero sized members.
   const ASTRecordLayout *PreviousBaseLayout = nullptr;
   // Iterate through the bases and lay out the non-virtual ones.
@@ -2479,7 +2489,7 @@
       HasVBPtr = true;
       continue;
     }
-    // Check fo a base to share a VBPtr with.
+    // Check for a base to share a VBPtr with.
     if (!SharedVBPtrBase && BaseLayout.hasVBPtr()) {
       SharedVBPtrBase = BaseDecl;
       HasVBPtr = true;
@@ -2493,7 +2503,7 @@
       LeadsWithZeroSizedBase = BaseLayout.leadsWithZeroSizedBase();
     }
     // Lay out the base.
-    layoutNonVirtualBase(BaseDecl, BaseLayout, PreviousBaseLayout);
+    layoutNonVirtualBase(RD, BaseDecl, BaseLayout, PreviousBaseLayout);
   }
   // Figure out if we need a fresh VFPtr for this class.
   if (!PrimaryBase && RD->isDynamicClass())
@@ -2522,7 +2532,7 @@
       LeadsWithZeroSizedBase = BaseLayout.leadsWithZeroSizedBase();
     }
     // Lay out the base.
-    layoutNonVirtualBase(BaseDecl, BaseLayout, PreviousBaseLayout);
+    layoutNonVirtualBase(RD, BaseDecl, BaseLayout, PreviousBaseLayout);
     VBPtrOffset = Bases[BaseDecl] + BaseLayout.getNonVirtualSize();
   }
   // Set our VBPtroffset if we know it at this point.
@@ -2534,15 +2544,32 @@
   }
 }
 
+static bool recordUsesEBO(const RecordDecl *RD) {
+  // EBO applies only to C++ records, and EmptyBasesAttr always enables it.
+  if (!isa<CXXRecordDecl>(RD))
+    return false;
+  if (RD->hasAttr<EmptyBasesAttr>())
+    return true;
+  // TODO: Double check the layout version cutoff with the next MSVC release.
+  const auto *LVA = RD->getAttr<LayoutVersionAttr>();
+  if (LVA && LVA->getVersion() <= LangOptions::MSVC2015)
+    return false;
+  // TODO: Some later version of MSVC will enable EBO by default.  When that
+  // happens, an additional isCompatibleWithMSVC check will be needed here.
+  return false;
+}
+
 void MicrosoftRecordLayoutBuilder::layoutNonVirtualBase(
+    const CXXRecordDecl *RD,
     const CXXRecordDecl *BaseDecl,
     const ASTRecordLayout &BaseLayout,
     const ASTRecordLayout *&PreviousBaseLayout) {
   // Insert padding between two bases if the left first one is zero sized or
   // contains a zero sized subobject and the right is zero sized or one leads
   // with a zero sized base.
-  if (PreviousBaseLayout && PreviousBaseLayout->hasZeroSizedSubObject() &&
-      BaseLayout.leadsWithZeroSizedBase())
+  bool MDCUsesEBO = recordUsesEBO(RD);
+  if (PreviousBaseLayout && PreviousBaseLayout->endsWithZeroSizedObject() &&
+      BaseLayout.leadsWithZeroSizedBase() && !MDCUsesEBO)
     Size++;
   ElementInfo Info = getAdjustedElementInfo(BaseLayout);
   CharUnits BaseOffset;
@@ -2551,14 +2578,23 @@
   bool FoundBase = false;
   if (UseExternalLayout) {
     FoundBase = External.getExternalNVBaseOffset(BaseDecl, BaseOffset);
-    if (FoundBase)
+    if (FoundBase) {
       assert(BaseOffset >= Size && "base offset already allocated");
+      Size = BaseOffset;
+    }
   }
 
-  if (!FoundBase)
-    BaseOffset = Size.alignTo(Info.Alignment);
+  if (!FoundBase) {
+    if (MDCUsesEBO && BaseDecl->isEmpty() &&
+        BaseLayout.getNonVirtualSize() == CharUnits::Zero()) {
+      BaseOffset = CharUnits::Zero();
+    } else {
+      // Otherwise, lay the base out at the end of the MDC.
+      BaseOffset = Size = Size.alignTo(Info.Alignment);
+    }
+  }
   Bases.insert(std::make_pair(BaseDecl, BaseOffset));
-  Size = BaseOffset + BaseLayout.getNonVirtualSize();
+  Size += BaseLayout.getNonVirtualSize();
   PreviousBaseLayout = &BaseLayout;
 }
 
@@ -2737,8 +2773,9 @@
     // with a zero sized base.  The padding between virtual bases is 4
     // bytes (in both 32 and 64 bits modes) and always involves rounding up to
     // the required alignment, we don't know why.
-    if ((PreviousBaseLayout && PreviousBaseLayout->hasZeroSizedSubObject() &&
-        BaseLayout.leadsWithZeroSizedBase()) || HasVtordisp) {
+    if ((PreviousBaseLayout && PreviousBaseLayout->endsWithZeroSizedObject() &&
+         BaseLayout.leadsWithZeroSizedBase() && !recordUsesEBO(RD)) ||
+        HasVtordisp) {
       Size = Size.alignTo(VtorDispAlignment) + VtorDispSize;
       Alignment = std::max(VtorDispAlignment, Alignment);
     }
@@ -2776,8 +2813,10 @@
     Size = Size.alignTo(RoundingAlignment);
   }
   if (Size.isZero()) {
-    EndsWithZeroSizedObject = true;
-    LeadsWithZeroSizedBase = true;
+    if (!recordUsesEBO(RD) || !cast<CXXRecordDecl>(RD)->isEmpty()) {
+      EndsWithZeroSizedObject = true;
+      LeadsWithZeroSizedBase = true;
+    }
     // Zero-sized structures have size equal to their alignment if a
     // __declspec(align) came into play.
     if (RequiredAlignment >= MinEmptyStructSize)
@@ -2910,8 +2949,7 @@
       NewEntry = new (*this) ASTRecordLayout(
           *this, Builder.Size, Builder.Alignment, Builder.RequiredAlignment,
           Builder.HasOwnVFPtr, Builder.HasOwnVFPtr || Builder.PrimaryBase,
-          Builder.VBPtrOffset, Builder.NonVirtualSize,
-          Builder.FieldOffsets.data(), Builder.FieldOffsets.size(),
+          Builder.VBPtrOffset, Builder.DataSize, Builder.FieldOffsets,
           Builder.NonVirtualSize, Builder.Alignment, CharUnits::Zero(),
           Builder.PrimaryBase, false, Builder.SharedVBPtrBase,
           Builder.EndsWithZeroSizedObject, Builder.LeadsWithZeroSizedBase,
@@ -2920,8 +2958,7 @@
       Builder.layout(D);
       NewEntry = new (*this) ASTRecordLayout(
           *this, Builder.Size, Builder.Alignment, Builder.RequiredAlignment,
-          Builder.Size, Builder.FieldOffsets.data(),
-          Builder.FieldOffsets.size());
+          Builder.Size, Builder.FieldOffsets);
     }
   } else {
     if (const auto *RD = dyn_cast<CXXRecordDecl>(D)) {
@@ -2944,9 +2981,8 @@
           *this, Builder.getSize(), Builder.Alignment,
           /*RequiredAlignment : used by MS-ABI)*/
           Builder.Alignment, Builder.HasOwnVFPtr, RD->isDynamicClass(),
-          CharUnits::fromQuantity(-1), DataSize, Builder.FieldOffsets.data(),
-          Builder.FieldOffsets.size(), NonVirtualSize,
-          Builder.NonVirtualAlignment,
+          CharUnits::fromQuantity(-1), DataSize, Builder.FieldOffsets,
+          NonVirtualSize, Builder.NonVirtualAlignment,
           EmptySubobjects.SizeOfLargestEmptySubobject, Builder.PrimaryBase,
           Builder.PrimaryBaseIsVirtual, nullptr, false, false, Builder.Bases,
           Builder.VBases);
@@ -2957,8 +2993,7 @@
       NewEntry = new (*this) ASTRecordLayout(
           *this, Builder.getSize(), Builder.Alignment,
           /*RequiredAlignment : used by MS-ABI)*/
-          Builder.Alignment, Builder.getSize(), Builder.FieldOffsets.data(),
-          Builder.FieldOffsets.size());
+          Builder.Alignment, Builder.getSize(), Builder.FieldOffsets);
     }
   }
 
@@ -3061,7 +3096,7 @@
   // Add in synthesized ivar count if laying out an implementation.
   if (Impl) {
     unsigned SynthCount = CountNonClassIvars(D);
-    // If there aren't any sythesized ivars then reuse the interface
+    // If there aren't any synthesized ivars then reuse the interface
     // entry. Note we can't cache this because we simply free all
     // entries later; however we shouldn't look up implementations
     // frequently.
@@ -3073,13 +3108,12 @@
   Builder.Layout(D);
 
   const ASTRecordLayout *NewEntry =
-    new (*this) ASTRecordLayout(*this, Builder.getSize(), 
+    new (*this) ASTRecordLayout(*this, Builder.getSize(),
                                 Builder.Alignment,
                                 /*RequiredAlignment : used by MS-ABI)*/
                                 Builder.Alignment,
                                 Builder.getDataSize(),
-                                Builder.FieldOffsets.data(),
-                                Builder.FieldOffsets.size());
+                                Builder.FieldOffsets);
 
   ObjCLayouts[Key] = NewEntry;
 
diff --git a/lib/AST/Stmt.cpp b/lib/AST/Stmt.cpp
index 7dfa3a9..194e077 100644
--- a/lib/AST/Stmt.cpp
+++ b/lib/AST/Stmt.cpp
@@ -503,6 +503,9 @@
 
   bool HasVariants = !C.getTargetInfo().hasNoAsmVariants();
 
+  unsigned LastAsmStringToken = 0;
+  unsigned LastAsmStringOffset = 0;
+
   while (1) {
     // Done with the string?
     if (CurPtr == StrEnd) {
@@ -589,10 +592,12 @@
 
       // (BeginLoc, EndLoc) represents the range of the operand we are currently
       // processing. Unlike Str, the range includes the leading '%'.
-      SourceLocation BeginLoc =
-          getAsmString()->getLocationOfByte(Percent - StrStart, SM, LO, TI);
-      SourceLocation EndLoc =
-          getAsmString()->getLocationOfByte(CurPtr - StrStart, SM, LO, TI);
+      SourceLocation BeginLoc = getAsmString()->getLocationOfByte(
+          Percent - StrStart, SM, LO, TI, &LastAsmStringToken,
+          &LastAsmStringOffset);
+      SourceLocation EndLoc = getAsmString()->getLocationOfByte(
+          CurPtr - StrStart, SM, LO, TI, &LastAsmStringToken,
+          &LastAsmStringOffset);
 
       Pieces.emplace_back(N, std::move(Str), BeginLoc, EndLoc);
       continue;
@@ -623,10 +628,12 @@
 
       // (BeginLoc, EndLoc) represents the range of the operand we are currently
       // processing. Unlike Str, the range includes the leading '%'.
-      SourceLocation BeginLoc =
-          getAsmString()->getLocationOfByte(Percent - StrStart, SM, LO, TI);
-      SourceLocation EndLoc =
-          getAsmString()->getLocationOfByte(NameEnd + 1 - StrStart, SM, LO, TI);
+      SourceLocation BeginLoc = getAsmString()->getLocationOfByte(
+          Percent - StrStart, SM, LO, TI, &LastAsmStringToken,
+          &LastAsmStringOffset);
+      SourceLocation EndLoc = getAsmString()->getLocationOfByte(
+          NameEnd + 1 - StrStart, SM, LO, TI, &LastAsmStringToken,
+          &LastAsmStringOffset);
 
       Pieces.emplace_back(N, std::move(Str), BeginLoc, EndLoc);
 
@@ -756,11 +763,13 @@
                  });
 }
 
-IfStmt::IfStmt(const ASTContext &C, SourceLocation IL, VarDecl *var, Expr *cond,
-               Stmt *then, SourceLocation EL, Stmt *elsev)
-  : Stmt(IfStmtClass), IfLoc(IL), ElseLoc(EL)
-{
+IfStmt::IfStmt(const ASTContext &C, SourceLocation IL, bool IsConstexpr,
+               Stmt *init, VarDecl *var, Expr *cond, Stmt *then,
+               SourceLocation EL, Stmt *elsev)
+    : Stmt(IfStmtClass), IfLoc(IL), ElseLoc(EL) {
+  setConstexpr(IsConstexpr);
   setConditionVariable(C, var);
+  SubExprs[INIT] = init;
   SubExprs[COND] = cond;
   SubExprs[THEN] = then;
   SubExprs[ELSE] = elsev;
@@ -785,6 +794,10 @@
                                    VarRange.getEnd());
 }
 
+bool IfStmt::isObjCAvailabilityCheck() const {
+  return isa<ObjCAvailabilityCheckExpr>(SubExprs[COND]);
+}
+
 ForStmt::ForStmt(const ASTContext &C, Stmt *Init, Expr *Cond, VarDecl *condVar,
                  Expr *Inc, Stmt *Body, SourceLocation FL, SourceLocation LP,
                  SourceLocation RP)
@@ -816,9 +829,11 @@
                                        VarRange.getEnd());
 }
 
-SwitchStmt::SwitchStmt(const ASTContext &C, VarDecl *Var, Expr *cond)
+SwitchStmt::SwitchStmt(const ASTContext &C, Stmt *init, VarDecl *Var,
+                       Expr *cond)
     : Stmt(SwitchStmtClass), FirstCase(nullptr, false) {
   setConditionVariable(C, Var);
+  SubExprs[INIT] = init;
   SubExprs[COND] = cond;
   SubExprs[BODY] = nullptr;
 }
diff --git a/lib/AST/StmtCXX.cpp b/lib/AST/StmtCXX.cpp
index e39a01d..4692db8 100644
--- a/lib/AST/StmtCXX.cpp
+++ b/lib/AST/StmtCXX.cpp
@@ -49,7 +49,8 @@
   std::copy(handlers.begin(), handlers.end(), Stmts + 1);
 }
 
-CXXForRangeStmt::CXXForRangeStmt(DeclStmt *Range, DeclStmt *BeginEndStmt,
+CXXForRangeStmt::CXXForRangeStmt(DeclStmt *Range,
+                                 DeclStmt *BeginStmt, DeclStmt *EndStmt,
                                  Expr *Cond, Expr *Inc, DeclStmt *LoopVar,
                                  Stmt *Body, SourceLocation FL,
                                  SourceLocation CAL, SourceLocation CL,
@@ -57,7 +58,8 @@
     : Stmt(CXXForRangeStmtClass), ForLoc(FL), CoawaitLoc(CAL), ColonLoc(CL),
       RParenLoc(RPL) {
   SubExprs[RANGE] = Range;
-  SubExprs[BEGINEND] = BeginEndStmt;
+  SubExprs[BEGINSTMT] = BeginStmt;
+  SubExprs[ENDSTMT] = EndStmt;
   SubExprs[COND] = Cond;
   SubExprs[INC] = Inc;
   SubExprs[LOOPVAR] = LoopVar;
diff --git a/lib/AST/StmtOpenMP.cpp b/lib/AST/StmtOpenMP.cpp
index 72e62b7..7197586 100644
--- a/lib/AST/StmtOpenMP.cpp
+++ b/lib/AST/StmtOpenMP.cpp
@@ -105,6 +105,7 @@
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   return Dir;
 }
 
@@ -148,11 +149,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   Dir->setHasCancel(HasCancel);
   return Dir;
 }
@@ -197,11 +202,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   return Dir;
 }
 
@@ -362,11 +371,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   Dir->setHasCancel(HasCancel);
   return Dir;
 }
@@ -409,11 +422,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   return Dir;
 }
 
@@ -694,6 +711,82 @@
   return new (Mem) OMPTargetDirective(NumClauses);
 }
 
+OMPTargetParallelDirective *OMPTargetParallelDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetParallelDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem =
+      C.Allocate(Size + sizeof(OMPClause *) * Clauses.size() + sizeof(Stmt *));
+  OMPTargetParallelDirective *Dir =
+      new (Mem) OMPTargetParallelDirective(StartLoc, EndLoc, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  return Dir;
+}
+
+OMPTargetParallelDirective *
+OMPTargetParallelDirective::CreateEmpty(const ASTContext &C,
+                                        unsigned NumClauses, EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetParallelDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem =
+      C.Allocate(Size + sizeof(OMPClause *) * NumClauses + sizeof(Stmt *));
+  return new (Mem) OMPTargetParallelDirective(NumClauses);
+}
+
+OMPTargetParallelForDirective *OMPTargetParallelForDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+    const HelperExprs &Exprs, bool HasCancel) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetParallelForDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * Clauses.size() +
+      sizeof(Stmt *) * numLoopChildren(CollapsedNum, OMPD_target_parallel_for));
+  OMPTargetParallelForDirective *Dir = new (Mem) OMPTargetParallelForDirective(
+      StartLoc, EndLoc, CollapsedNum, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setIsLastIterVariable(Exprs.IL);
+  Dir->setLowerBoundVariable(Exprs.LB);
+  Dir->setUpperBoundVariable(Exprs.UB);
+  Dir->setStrideVariable(Exprs.ST);
+  Dir->setEnsureUpperBound(Exprs.EUB);
+  Dir->setNextLowerBound(Exprs.NLB);
+  Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  Dir->setHasCancel(HasCancel);
+  return Dir;
+}
+
+OMPTargetParallelForDirective *
+OMPTargetParallelForDirective::CreateEmpty(const ASTContext &C,
+                                           unsigned NumClauses,
+                                           unsigned CollapsedNum, EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetParallelForDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * NumClauses +
+      sizeof(Stmt *) * numLoopChildren(CollapsedNum, OMPD_target_parallel_for));
+  return new (Mem) OMPTargetParallelForDirective(CollapsedNum, NumClauses);
+}
+
 OMPTargetDataDirective *OMPTargetDataDirective::Create(
     const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
     ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt) {
@@ -716,6 +809,49 @@
   return new (Mem) OMPTargetDataDirective(N);
 }
 
+OMPTargetEnterDataDirective *OMPTargetEnterDataDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    ArrayRef<OMPClause *> Clauses) {
+  void *Mem = C.Allocate(llvm::alignTo(sizeof(OMPTargetEnterDataDirective),
+                                       llvm::alignOf<OMPClause *>()) +
+                         sizeof(OMPClause *) * Clauses.size());
+  OMPTargetEnterDataDirective *Dir =
+      new (Mem) OMPTargetEnterDataDirective(StartLoc, EndLoc, Clauses.size());
+  Dir->setClauses(Clauses);
+  return Dir;
+}
+
+OMPTargetEnterDataDirective *
+OMPTargetEnterDataDirective::CreateEmpty(const ASTContext &C, unsigned N,
+                                         EmptyShell) {
+  void *Mem = C.Allocate(llvm::alignTo(sizeof(OMPTargetEnterDataDirective),
+                                       llvm::alignOf<OMPClause *>()) +
+                         sizeof(OMPClause *) * N);
+  return new (Mem) OMPTargetEnterDataDirective(N);
+}
+
+OMPTargetExitDataDirective *
+OMPTargetExitDataDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+                                   SourceLocation EndLoc,
+                                   ArrayRef<OMPClause *> Clauses) {
+  void *Mem = C.Allocate(llvm::alignTo(sizeof(OMPTargetExitDataDirective),
+                                       llvm::alignOf<OMPClause *>()) +
+                         sizeof(OMPClause *) * Clauses.size());
+  OMPTargetExitDataDirective *Dir =
+      new (Mem) OMPTargetExitDataDirective(StartLoc, EndLoc, Clauses.size());
+  Dir->setClauses(Clauses);
+  return Dir;
+}
+
+OMPTargetExitDataDirective *
+OMPTargetExitDataDirective::CreateEmpty(const ASTContext &C, unsigned N,
+                                        EmptyShell) {
+  void *Mem = C.Allocate(llvm::alignTo(sizeof(OMPTargetExitDataDirective),
+                                       llvm::alignOf<OMPClause *>()) +
+                         sizeof(OMPClause *) * N);
+  return new (Mem) OMPTargetExitDataDirective(N);
+}
+
 OMPTeamsDirective *OMPTeamsDirective::Create(const ASTContext &C,
                                              SourceLocation StartLoc,
                                              SourceLocation EndLoc,
@@ -769,11 +905,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   return Dir;
 }
 
@@ -816,11 +956,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   return Dir;
 }
 
@@ -862,11 +1006,15 @@
   Dir->setEnsureUpperBound(Exprs.EUB);
   Dir->setNextLowerBound(Exprs.NLB);
   Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
   Dir->setCounters(Exprs.Counters);
   Dir->setPrivateCounters(Exprs.PrivateCounters);
   Dir->setInits(Exprs.Inits);
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
   return Dir;
 }
 
@@ -880,3 +1028,337 @@
                              numLoopChildren(CollapsedNum, OMPD_distribute));
   return new (Mem) OMPDistributeDirective(CollapsedNum, NumClauses);
 }
+
+OMPTargetUpdateDirective *
+OMPTargetUpdateDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+                                 SourceLocation EndLoc,
+                                 ArrayRef<OMPClause *> Clauses) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetUpdateDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(Size + sizeof(OMPClause *) * Clauses.size());
+  OMPTargetUpdateDirective *Dir =
+      new (Mem) OMPTargetUpdateDirective(StartLoc, EndLoc, Clauses.size());
+  Dir->setClauses(Clauses);
+  return Dir;
+}
+
+OMPTargetUpdateDirective *
+OMPTargetUpdateDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses,
+                                      EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetUpdateDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(Size + sizeof(OMPClause *) * NumClauses);
+  return new (Mem) OMPTargetUpdateDirective(NumClauses);
+}
+
+OMPDistributeParallelForDirective *OMPDistributeParallelForDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+    const HelperExprs &Exprs) {
+  unsigned Size = llvm::alignTo(sizeof(OMPDistributeParallelForDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * Clauses.size() +
+      sizeof(Stmt *) *
+          numLoopChildren(CollapsedNum, OMPD_distribute_parallel_for));
+  OMPDistributeParallelForDirective *Dir =
+      new (Mem) OMPDistributeParallelForDirective(StartLoc, EndLoc,
+                                                  CollapsedNum, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setIsLastIterVariable(Exprs.IL);
+  Dir->setLowerBoundVariable(Exprs.LB);
+  Dir->setUpperBoundVariable(Exprs.UB);
+  Dir->setStrideVariable(Exprs.ST);
+  Dir->setEnsureUpperBound(Exprs.EUB);
+  Dir->setNextLowerBound(Exprs.NLB);
+  Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  return Dir;
+}
+
+OMPDistributeParallelForDirective *
+OMPDistributeParallelForDirective::CreateEmpty(const ASTContext &C,
+                                               unsigned NumClauses,
+                                               unsigned CollapsedNum,
+                                               EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPDistributeParallelForDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * NumClauses +
+      sizeof(Stmt *) *
+          numLoopChildren(CollapsedNum, OMPD_distribute_parallel_for));
+  return new (Mem) OMPDistributeParallelForDirective(CollapsedNum, NumClauses);
+}
+
+OMPDistributeParallelForSimdDirective *
+OMPDistributeParallelForSimdDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+    const HelperExprs &Exprs) {
+  unsigned Size = llvm::alignTo(sizeof(OMPDistributeParallelForSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * Clauses.size() +
+      sizeof(Stmt *) *
+          numLoopChildren(CollapsedNum, OMPD_distribute_parallel_for_simd));
+  OMPDistributeParallelForSimdDirective *Dir = new (Mem)
+      OMPDistributeParallelForSimdDirective(StartLoc, EndLoc, CollapsedNum,
+                                            Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setIsLastIterVariable(Exprs.IL);
+  Dir->setLowerBoundVariable(Exprs.LB);
+  Dir->setUpperBoundVariable(Exprs.UB);
+  Dir->setStrideVariable(Exprs.ST);
+  Dir->setEnsureUpperBound(Exprs.EUB);
+  Dir->setNextLowerBound(Exprs.NLB);
+  Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  return Dir;
+}
+
+OMPDistributeParallelForSimdDirective *
+OMPDistributeParallelForSimdDirective::CreateEmpty(const ASTContext &C,
+                                                   unsigned NumClauses,
+                                                   unsigned CollapsedNum,
+                                                   EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPDistributeParallelForSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * NumClauses +
+      sizeof(Stmt *) *
+          numLoopChildren(CollapsedNum, OMPD_distribute_parallel_for_simd));
+  return new (Mem)
+      OMPDistributeParallelForSimdDirective(CollapsedNum, NumClauses);
+}
+
+OMPDistributeSimdDirective *OMPDistributeSimdDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+    const HelperExprs &Exprs) {
+  unsigned Size = llvm::alignTo(sizeof(OMPDistributeSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * Clauses.size() +
+      sizeof(Stmt *) *
+          numLoopChildren(CollapsedNum, OMPD_distribute_simd));
+  OMPDistributeSimdDirective *Dir = new (Mem) OMPDistributeSimdDirective(
+      StartLoc, EndLoc, CollapsedNum, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setIsLastIterVariable(Exprs.IL);
+  Dir->setLowerBoundVariable(Exprs.LB);
+  Dir->setUpperBoundVariable(Exprs.UB);
+  Dir->setStrideVariable(Exprs.ST);
+  Dir->setEnsureUpperBound(Exprs.EUB);
+  Dir->setNextLowerBound(Exprs.NLB);
+  Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  return Dir;
+}
+
+OMPDistributeSimdDirective *
+OMPDistributeSimdDirective::CreateEmpty(const ASTContext &C,
+                                        unsigned NumClauses,
+                                        unsigned CollapsedNum, EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPDistributeSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * NumClauses +
+      sizeof(Stmt *) *
+          numLoopChildren(CollapsedNum, OMPD_distribute_simd));
+  return new (Mem) OMPDistributeSimdDirective(CollapsedNum, NumClauses);
+}
+
+OMPTargetParallelForSimdDirective *OMPTargetParallelForSimdDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+    const HelperExprs &Exprs) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetParallelForSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * Clauses.size() +
+      sizeof(Stmt *) * 
+          numLoopChildren(CollapsedNum, OMPD_target_parallel_for_simd));
+  OMPTargetParallelForSimdDirective *Dir = 
+      new (Mem) OMPTargetParallelForSimdDirective(StartLoc, EndLoc,
+                                                  CollapsedNum, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setIsLastIterVariable(Exprs.IL);
+  Dir->setLowerBoundVariable(Exprs.LB);
+  Dir->setUpperBoundVariable(Exprs.UB);
+  Dir->setStrideVariable(Exprs.ST);
+  Dir->setEnsureUpperBound(Exprs.EUB);
+  Dir->setNextLowerBound(Exprs.NLB);
+  Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  return Dir;
+}
+
+OMPTargetParallelForSimdDirective *
+OMPTargetParallelForSimdDirective::CreateEmpty(const ASTContext &C,
+                                               unsigned NumClauses,
+                                               unsigned CollapsedNum,
+                                               EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetParallelForSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * NumClauses +
+      sizeof(Stmt *) * 
+          numLoopChildren(CollapsedNum, OMPD_target_parallel_for_simd));
+  return new (Mem) OMPTargetParallelForSimdDirective(CollapsedNum, NumClauses);
+}
+
+OMPTargetSimdDirective *
+OMPTargetSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc, 
+                               SourceLocation EndLoc, unsigned CollapsedNum,
+                               ArrayRef<OMPClause *> Clauses,
+                               Stmt *AssociatedStmt, const HelperExprs &Exprs) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(Size + sizeof(OMPClause *) * Clauses.size() +
+                         sizeof(Stmt *) * 
+                             numLoopChildren(CollapsedNum, OMPD_target_simd));
+  OMPTargetSimdDirective *Dir = new (Mem)
+      OMPTargetSimdDirective(StartLoc, EndLoc, CollapsedNum, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  return Dir;
+}
+
+OMPTargetSimdDirective *
+OMPTargetSimdDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses,
+                                    unsigned CollapsedNum, EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTargetSimdDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(Size + sizeof(OMPClause *) * NumClauses +
+                         sizeof(Stmt *) * 
+                             numLoopChildren(CollapsedNum, OMPD_target_simd));
+  return new (Mem) OMPTargetSimdDirective(CollapsedNum, NumClauses);
+}
+
+OMPTeamsDistributeDirective *OMPTeamsDistributeDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    unsigned CollapsedNum, ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+    const HelperExprs &Exprs) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTeamsDistributeDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * Clauses.size() +
+      sizeof(Stmt *) * numLoopChildren(CollapsedNum, OMPD_teams_distribute));
+  OMPTeamsDistributeDirective *Dir = new (Mem) OMPTeamsDistributeDirective(
+      StartLoc, EndLoc, CollapsedNum, Clauses.size());
+  Dir->setClauses(Clauses);
+  Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setIterationVariable(Exprs.IterationVarRef);
+  Dir->setLastIteration(Exprs.LastIteration);
+  Dir->setCalcLastIteration(Exprs.CalcLastIteration);
+  Dir->setPreCond(Exprs.PreCond);
+  Dir->setCond(Exprs.Cond);
+  Dir->setInit(Exprs.Init);
+  Dir->setInc(Exprs.Inc);
+  Dir->setIsLastIterVariable(Exprs.IL);
+  Dir->setLowerBoundVariable(Exprs.LB);
+  Dir->setUpperBoundVariable(Exprs.UB);
+  Dir->setStrideVariable(Exprs.ST);
+  Dir->setEnsureUpperBound(Exprs.EUB);
+  Dir->setNextLowerBound(Exprs.NLB);
+  Dir->setNextUpperBound(Exprs.NUB);
+  Dir->setNumIterations(Exprs.NumIterations);
+  Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+  Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+  Dir->setCounters(Exprs.Counters);
+  Dir->setPrivateCounters(Exprs.PrivateCounters);
+  Dir->setInits(Exprs.Inits);
+  Dir->setUpdates(Exprs.Updates);
+  Dir->setFinals(Exprs.Finals);
+  Dir->setPreInits(Exprs.PreInits);
+  return Dir;
+}
+
+OMPTeamsDistributeDirective *
+OMPTeamsDistributeDirective::CreateEmpty(const ASTContext &C,
+                                         unsigned NumClauses,
+                                         unsigned CollapsedNum, EmptyShell) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTeamsDistributeDirective),
+                                llvm::alignOf<OMPClause *>());
+  void *Mem = C.Allocate(
+      Size + sizeof(OMPClause *) * NumClauses +
+      sizeof(Stmt *) * numLoopChildren(CollapsedNum, OMPD_teams_distribute));
+  return new (Mem) OMPTeamsDistributeDirective(CollapsedNum, NumClauses);
+}
diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp
index 0d7063e..0b103f9 100644
--- a/lib/AST/StmtPrinter.cpp
+++ b/lib/AST/StmtPrinter.cpp
@@ -16,6 +16,7 @@
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
@@ -496,6 +497,11 @@
   OS << ";\n";
 }
 
+void StmtPrinter::VisitObjCAvailabilityCheckExpr(
+    ObjCAvailabilityCheckExpr *Node) {
+  OS << "@available(...)";
+}
+
 void StmtPrinter::VisitObjCAtSynchronizedStmt(ObjCAtSynchronizedStmt *Node) {
   Indent() << "@synchronized (";
   PrintExpr(Node->getSynchExpr());
@@ -663,9 +669,9 @@
     OS << ": ";
   }
   OS << getOpenMPSimpleClauseTypeName(OMPC_schedule, Node->getScheduleKind());
-  if (Node->getChunkSize()) {
+  if (auto *E = Node->getChunkSize()) {
     OS << ", ";
-    Node->getChunkSize()->printPretty(OS, nullptr, Policy);
+    E->printPretty(OS, nullptr, Policy);
   }
   OS << ")";
 }
@@ -763,15 +769,16 @@
 void OMPClausePrinter::VisitOMPClauseList(T *Node, char StartSym) {
   for (typename T::varlist_iterator I = Node->varlist_begin(),
                                     E = Node->varlist_end();
-         I != E; ++I) {
+       I != E; ++I) {
     assert(*I && "Expected non-null Stmt");
+    OS << (I == Node->varlist_begin() ? StartSym : ',');
     if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(*I)) {
-      OS << (I == Node->varlist_begin() ? StartSym : ',');
-      cast<NamedDecl>(DRE->getDecl())->printQualifiedName(OS);
-    } else {
-      OS << (I == Node->varlist_begin() ? StartSym : ',');
+      if (isa<OMPCapturedExprDecl>(DRE->getDecl()))
+        DRE->printPretty(OS, nullptr, Policy, 0);
+      else
+        DRE->getDecl()->printQualifiedName(OS);
+    } else
       (*I)->printPretty(OS, nullptr, Policy, 0);
-    }
   }
 }
 
@@ -910,15 +917,57 @@
   }
 }
 
+void OMPClausePrinter::VisitOMPToClause(OMPToClause *Node) {
+  if (!Node->varlist_empty()) {
+    OS << "to";
+    VisitOMPClauseList(Node, '(');
+    OS << ")";
+  }
+}
+
+void OMPClausePrinter::VisitOMPFromClause(OMPFromClause *Node) {
+  if (!Node->varlist_empty()) {
+    OS << "from";
+    VisitOMPClauseList(Node, '(');
+    OS << ")";
+  }
+}
+
 void OMPClausePrinter::VisitOMPDistScheduleClause(OMPDistScheduleClause *Node) {
   OS << "dist_schedule(" << getOpenMPSimpleClauseTypeName(
                            OMPC_dist_schedule, Node->getDistScheduleKind());
-  if (Node->getChunkSize()) {
+  if (auto *E = Node->getChunkSize()) {
     OS << ", ";
-    Node->getChunkSize()->printPretty(OS, nullptr, Policy);
+    E->printPretty(OS, nullptr, Policy);
   }
   OS << ")";
 }
+
+void OMPClausePrinter::VisitOMPDefaultmapClause(OMPDefaultmapClause *Node) {
+  OS << "defaultmap(";
+  OS << getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
+                                      Node->getDefaultmapModifier());
+  OS << ": ";
+  OS << getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
+    Node->getDefaultmapKind());
+  OS << ")";
+}
+
+void OMPClausePrinter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *Node) {
+  if (!Node->varlist_empty()) {
+    OS << "use_device_ptr";
+    VisitOMPClauseList(Node, '(');
+    OS << ")";
+  }
+}
+
+void OMPClausePrinter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *Node) {
+  if (!Node->varlist_empty()) {
+    OS << "is_device_ptr";
+    VisitOMPClauseList(Node, '(');
+    OS << ")";
+  }
+}
 }
 
 //===----------------------------------------------------------------------===//
@@ -1061,6 +1110,30 @@
   PrintOMPExecutableDirective(Node);
 }
 
+void StmtPrinter::VisitOMPTargetEnterDataDirective(
+    OMPTargetEnterDataDirective *Node) {
+  Indent() << "#pragma omp target enter data ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPTargetExitDataDirective(
+    OMPTargetExitDataDirective *Node) {
+  Indent() << "#pragma omp target exit data ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPTargetParallelDirective(
+    OMPTargetParallelDirective *Node) {
+  Indent() << "#pragma omp target parallel ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPTargetParallelForDirective(
+    OMPTargetParallelForDirective *Node) {
+  Indent() << "#pragma omp target parallel for ";
+  PrintOMPExecutableDirective(Node);
+}
+
 void StmtPrinter::VisitOMPTeamsDirective(OMPTeamsDirective *Node) {
   Indent() << "#pragma omp teams ";
   PrintOMPExecutableDirective(Node);
@@ -1095,11 +1168,56 @@
   PrintOMPExecutableDirective(Node);
 }
 
+void StmtPrinter::VisitOMPTargetUpdateDirective(
+    OMPTargetUpdateDirective *Node) {
+  Indent() << "#pragma omp target update ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPDistributeParallelForDirective(
+    OMPDistributeParallelForDirective *Node) {
+  Indent() << "#pragma omp distribute parallel for ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPDistributeParallelForSimdDirective(
+    OMPDistributeParallelForSimdDirective *Node) {
+  Indent() << "#pragma omp distribute parallel for simd ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPDistributeSimdDirective(
+    OMPDistributeSimdDirective *Node) {
+  Indent() << "#pragma omp distribute simd ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPTargetParallelForSimdDirective(
+    OMPTargetParallelForSimdDirective *Node) {
+  Indent() << "#pragma omp target parallel for simd ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPTargetSimdDirective(OMPTargetSimdDirective *Node) {
+  Indent() << "#pragma omp target simd ";
+  PrintOMPExecutableDirective(Node);
+}
+
+void StmtPrinter::VisitOMPTeamsDistributeDirective(
+    OMPTeamsDistributeDirective *Node) {
+  Indent() << "#pragma omp teams distribute ";
+  PrintOMPExecutableDirective(Node);
+}
+
 //===----------------------------------------------------------------------===//
 //  Expr printing methods.
 //===----------------------------------------------------------------------===//
 
 void StmtPrinter::VisitDeclRefExpr(DeclRefExpr *Node) {
+  if (auto *OCED = dyn_cast<OMPCapturedExprDecl>(Node->getDecl())) {
+    OCED->getInit()->IgnoreImpCasts()->printPretty(OS, nullptr, Policy);
+    return;
+  }
   if (NestedNameSpecifier *Qualifier = Node->getQualifier())
     Qualifier->print(OS, Policy);
   if (Node->hasTemplateKeyword())
@@ -1107,7 +1225,7 @@
   OS << Node->getNameInfo();
   if (Node->hasExplicitTemplateArgs())
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, Node->getTemplateArgs(), Node->getNumTemplateArgs(), Policy);
+        OS, Node->template_arguments(), Policy);
 }
 
 void StmtPrinter::VisitDependentScopeDeclRefExpr(
@@ -1119,7 +1237,7 @@
   OS << Node->getNameInfo();
   if (Node->hasExplicitTemplateArgs())
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, Node->getTemplateArgs(), Node->getNumTemplateArgs(), Policy);
+        OS, Node->template_arguments(), Policy);
 }
 
 void StmtPrinter::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *Node) {
@@ -1130,7 +1248,7 @@
   OS << Node->getNameInfo();
   if (Node->hasExplicitTemplateArgs())
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, Node->getTemplateArgs(), Node->getNumTemplateArgs(), Policy);
+        OS, Node->template_arguments(), Policy);
 }
 
 void StmtPrinter::VisitObjCIvarRefExpr(ObjCIvarRefExpr *Node) {
@@ -1214,6 +1332,12 @@
     OS << "'\\v'";
     break;
   default:
+    // A character literal might be sign-extended, which
+    // would result in an invalid \U escape sequence.
+    // FIXME: multicharacter literals such as '\xFF\xFF\xFF\xFF'
+    // are not correctly handled.
+    if ((value & ~0xFFu) == ~0xFFu && Node->getKind() == CharacterLiteral::Ascii)
+      value &= 0xFFu;
     if (value < 256 && isPrintable((unsigned char)value))
       OS << "'" << (char)value << "'";
     else if (value < 256)
@@ -1264,6 +1388,7 @@
   case BuiltinType::Double:     break; // no suffix.
   case BuiltinType::Float:      OS << 'F'; break;
   case BuiltinType::LongDouble: OS << 'L'; break;
+  case BuiltinType::Float128:   OS << 'Q'; break;
   }
 }
 
@@ -1350,9 +1475,9 @@
     OS << "sizeof";
     break;
   case UETT_AlignOf:
-    if (Policy.LangOpts.CPlusPlus)
+    if (Policy.Alignof)
       OS << "alignof";
-    else if (Policy.LangOpts.C11)
+    else if (Policy.UnderscoreAlignof)
       OS << "_Alignof";
     else
       OS << "__alignof";
@@ -1450,7 +1575,7 @@
   OS << Node->getMemberNameInfo();
   if (Node->hasExplicitTemplateArgs())
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, Node->getTemplateArgs(), Node->getNumTemplateArgs(), Policy);
+        OS, Node->template_arguments(), Policy);
 }
 void StmtPrinter::VisitObjCIsaExpr(ObjCIsaExpr *Node) {
   PrintExpr(Node->getBase());
@@ -1573,26 +1698,24 @@
 
 void StmtPrinter::VisitDesignatedInitExpr(DesignatedInitExpr *Node) {
   bool NeedsEquals = true;
-  for (DesignatedInitExpr::designators_iterator D = Node->designators_begin(),
-                      DEnd = Node->designators_end();
-       D != DEnd; ++D) {
-    if (D->isFieldDesignator()) {
-      if (D->getDotLoc().isInvalid()) {
-        if (IdentifierInfo *II = D->getFieldName()) {
+  for (const DesignatedInitExpr::Designator &D : Node->designators()) {
+    if (D.isFieldDesignator()) {
+      if (D.getDotLoc().isInvalid()) {
+        if (IdentifierInfo *II = D.getFieldName()) {
           OS << II->getName() << ":";
           NeedsEquals = false;
         }
       } else {
-        OS << "." << D->getFieldName()->getName();
+        OS << "." << D.getFieldName()->getName();
       }
     } else {
       OS << "[";
-      if (D->isArrayDesignator()) {
-        PrintExpr(Node->getArrayIndex(*D));
+      if (D.isArrayDesignator()) {
+        PrintExpr(Node->getArrayIndex(D));
       } else {
-        PrintExpr(Node->getArrayRangeStart(*D));
+        PrintExpr(Node->getArrayRangeStart(D));
         OS << " ... ";
-        PrintExpr(Node->getArrayRangeEnd(*D));
+        PrintExpr(Node->getArrayRangeEnd(D));
       }
       OS << "]";
     }
@@ -1622,7 +1745,7 @@
 }
 
 void StmtPrinter::VisitImplicitValueInitExpr(ImplicitValueInitExpr *Node) {
-  if (Policy.LangOpts.CPlusPlus) {
+  if (Node->getType()->getAsCXXRecordDecl()) {
     OS << "/*implicit*/";
     Node->getType().print(OS, Policy);
     OS << "()";
@@ -1832,7 +1955,7 @@
     if (Args->size() != 1) {
       OS << "operator\"\"" << Node->getUDSuffix()->getName();
       TemplateSpecializationType::PrintTemplateArgumentList(
-          OS, Args->data(), Args->size(), Policy);
+          OS, Args->asArray(), Policy);
       OS << "()";
       return;
     }
@@ -1962,7 +2085,9 @@
     case LCK_This:
       OS << "this";
       break;
-
+    case LCK_StarThis:
+      OS << "*this";
+      break;
     case LCK_ByRef:
       if (Node->getCaptureDefault() != LCD_ByRef || Node->isInitCapture(C))
         OS << '&';
@@ -1985,7 +2110,7 @@
     OS << " (";
     CXXMethodDecl *Method = Node->getCallOperator();
     NeedComma = false;
-    for (auto P : Method->params()) {
+    for (auto P : Method->parameters()) {
       if (NeedComma) {
         OS << ", ";
       } else {
@@ -2113,6 +2238,11 @@
     OS << "}";
 }
 
+void StmtPrinter::VisitCXXInheritedCtorInitExpr(CXXInheritedCtorInitExpr *E) {
+  // Parens are printed by the surrounding context.
+  OS << "<forwarded>";
+}
+
 void StmtPrinter::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E) {
   PrintExpr(E->getSubExpr());
 }
@@ -2150,7 +2280,7 @@
   OS << Node->getMemberNameInfo();
   if (Node->hasExplicitTemplateArgs())
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, Node->getTemplateArgs(), Node->getNumTemplateArgs(), Policy);
+        OS, Node->template_arguments(), Policy);
 }
 
 void StmtPrinter::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *Node) {
@@ -2165,7 +2295,7 @@
   OS << Node->getMemberNameInfo();
   if (Node->hasExplicitTemplateArgs())
     TemplateSpecializationType::PrintTemplateArgumentList(
-        OS, Node->getTemplateArgs(), Node->getNumTemplateArgs(), Policy);
+        OS, Node->template_arguments(), Policy);
 }
 
 static const char *getTypeTraitName(TypeTrait TT) {
@@ -2434,7 +2564,7 @@
 
 void StmtPrinter::VisitTypoExpr(TypoExpr *Node) {
   // TODO: Print something reasonable for a TypoExpr, if necessary.
-  assert(false && "Cannot print TypoExpr nodes");
+  llvm_unreachable("Cannot print TypoExpr nodes");
 }
 
 void StmtPrinter::VisitAsTypeExpr(AsTypeExpr *Node) {
diff --git a/lib/AST/StmtProfile.cpp b/lib/AST/StmtProfile.cpp
index f8aa4db..cff8369 100644
--- a/lib/AST/StmtProfile.cpp
+++ b/lib/AST/StmtProfile.cpp
@@ -69,6 +69,7 @@
 }
 
 void StmtProfiler::VisitStmt(const Stmt *S) {
+  assert(S && "Requires non-null Stmt pointer");
   ID.AddInteger(S->getStmtClass());
   for (const Stmt *SubStmt : S->children()) {
     if (SubStmt)
@@ -268,8 +269,23 @@
 #define OPENMP_CLAUSE(Name, Class)                                             \
   void Visit##Class(const Class *C);
 #include "clang/Basic/OpenMPKinds.def"
+  void VistOMPClauseWithPreInit(const OMPClauseWithPreInit *C);
+  void VistOMPClauseWithPostUpdate(const OMPClauseWithPostUpdate *C);
 };
 
+void OMPClauseProfiler::VistOMPClauseWithPreInit(
+    const OMPClauseWithPreInit *C) {
+  if (auto *S = C->getPreInitStmt())
+    Profiler->VisitStmt(S);
+}
+
+void OMPClauseProfiler::VistOMPClauseWithPostUpdate(
+    const OMPClauseWithPostUpdate *C) {
+  VistOMPClauseWithPreInit(C);
+  if (auto *E = C->getPostUpdateExpr())
+    Profiler->VisitStmt(E);
+}
+
 void OMPClauseProfiler::VisitOMPIfClause(const OMPIfClause *C) {
   if (C->getCondition())
     Profiler->VisitStmt(C->getCondition());
@@ -305,12 +321,9 @@
 void OMPClauseProfiler::VisitOMPProcBindClause(const OMPProcBindClause *C) { }
 
 void OMPClauseProfiler::VisitOMPScheduleClause(const OMPScheduleClause *C) {
-  if (C->getChunkSize()) {
-    Profiler->VisitStmt(C->getChunkSize());
-    if (C->getHelperChunkSize()) {
-      Profiler->VisitStmt(C->getChunkSize());
-    }
-  }
+  VistOMPClauseWithPreInit(C);
+  if (auto *S = C->getChunkSize())
+    Profiler->VisitStmt(S);
 }
 
 void OMPClauseProfiler::VisitOMPOrderedClause(const OMPOrderedClause *C) {
@@ -343,37 +356,46 @@
 template<typename T>
 void OMPClauseProfiler::VisitOMPClauseList(T *Node) {
   for (auto *E : Node->varlists()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 
 void OMPClauseProfiler::VisitOMPPrivateClause(const OMPPrivateClause *C) {
   VisitOMPClauseList(C);
   for (auto *E : C->private_copies()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 void
 OMPClauseProfiler::VisitOMPFirstprivateClause(const OMPFirstprivateClause *C) {
   VisitOMPClauseList(C);
+  VistOMPClauseWithPreInit(C);
   for (auto *E : C->private_copies()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->inits()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 void
 OMPClauseProfiler::VisitOMPLastprivateClause(const OMPLastprivateClause *C) {
   VisitOMPClauseList(C);
+  VistOMPClauseWithPostUpdate(C);
   for (auto *E : C->source_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->destination_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->assignment_ops()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 void OMPClauseProfiler::VisitOMPSharedClause(const OMPSharedClause *C) {
@@ -385,63 +407,82 @@
       C->getQualifierLoc().getNestedNameSpecifier());
   Profiler->VisitName(C->getNameInfo().getName());
   VisitOMPClauseList(C);
+  VistOMPClauseWithPostUpdate(C);
   for (auto *E : C->privates()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->lhs_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->rhs_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->reduction_ops()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 void OMPClauseProfiler::VisitOMPLinearClause(const OMPLinearClause *C) {
   VisitOMPClauseList(C);
+  VistOMPClauseWithPostUpdate(C);
   for (auto *E : C->privates()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->inits()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->updates()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->finals()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
-  Profiler->VisitStmt(C->getStep());
-  Profiler->VisitStmt(C->getCalcStep());
+  if (C->getStep())
+    Profiler->VisitStmt(C->getStep());
+  if (C->getCalcStep())
+    Profiler->VisitStmt(C->getCalcStep());
 }
 void OMPClauseProfiler::VisitOMPAlignedClause(const OMPAlignedClause *C) {
   VisitOMPClauseList(C);
-  Profiler->VisitStmt(C->getAlignment());
+  if (C->getAlignment())
+    Profiler->VisitStmt(C->getAlignment());
 }
 void OMPClauseProfiler::VisitOMPCopyinClause(const OMPCopyinClause *C) {
   VisitOMPClauseList(C);
   for (auto *E : C->source_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->destination_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->assignment_ops()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 void
 OMPClauseProfiler::VisitOMPCopyprivateClause(const OMPCopyprivateClause *C) {
   VisitOMPClauseList(C);
   for (auto *E : C->source_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->destination_exprs()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
   for (auto *E : C->assignment_ops()) {
-    Profiler->VisitStmt(E);
+    if (E)
+      Profiler->VisitStmt(E);
   }
 }
 void OMPClauseProfiler::VisitOMPFlushClause(const OMPFlushClause *C) {
@@ -451,29 +492,50 @@
   VisitOMPClauseList(C);
 }
 void OMPClauseProfiler::VisitOMPDeviceClause(const OMPDeviceClause *C) {
-  Profiler->VisitStmt(C->getDevice());
+  if (C->getDevice())
+    Profiler->VisitStmt(C->getDevice());
 }
 void OMPClauseProfiler::VisitOMPMapClause(const OMPMapClause *C) {
   VisitOMPClauseList(C);
 }
 void OMPClauseProfiler::VisitOMPNumTeamsClause(const OMPNumTeamsClause *C) {
-  Profiler->VisitStmt(C->getNumTeams());
+  if (C->getNumTeams())
+    Profiler->VisitStmt(C->getNumTeams());
 }
 void OMPClauseProfiler::VisitOMPThreadLimitClause(
     const OMPThreadLimitClause *C) {
-  Profiler->VisitStmt(C->getThreadLimit());
+  if (C->getThreadLimit())
+    Profiler->VisitStmt(C->getThreadLimit());
 }
 void OMPClauseProfiler::VisitOMPPriorityClause(const OMPPriorityClause *C) {
-  Profiler->VisitStmt(C->getPriority());
+  if (C->getPriority())
+    Profiler->VisitStmt(C->getPriority());
 }
 void OMPClauseProfiler::VisitOMPGrainsizeClause(const OMPGrainsizeClause *C) {
-  Profiler->VisitStmt(C->getGrainsize());
+  if (C->getGrainsize())
+    Profiler->VisitStmt(C->getGrainsize());
 }
 void OMPClauseProfiler::VisitOMPNumTasksClause(const OMPNumTasksClause *C) {
-  Profiler->VisitStmt(C->getNumTasks());
+  if (C->getNumTasks())
+    Profiler->VisitStmt(C->getNumTasks());
 }
 void OMPClauseProfiler::VisitOMPHintClause(const OMPHintClause *C) {
-  Profiler->VisitStmt(C->getHint());
+  if (C->getHint())
+    Profiler->VisitStmt(C->getHint());
+}
+void OMPClauseProfiler::VisitOMPToClause(const OMPToClause *C) {
+  VisitOMPClauseList(C);
+}
+void OMPClauseProfiler::VisitOMPFromClause(const OMPFromClause *C) {
+  VisitOMPClauseList(C);
+}
+void OMPClauseProfiler::VisitOMPUseDevicePtrClause(
+    const OMPUseDevicePtrClause *C) {
+  VisitOMPClauseList(C);
+}
+void OMPClauseProfiler::VisitOMPIsDevicePtrClause(
+    const OMPIsDevicePtrClause *C) {
+  VisitOMPClauseList(C);
 }
 }
 
@@ -584,6 +646,26 @@
   VisitOMPExecutableDirective(S);
 }
 
+void StmtProfiler::VisitOMPTargetEnterDataDirective(
+    const OMPTargetEnterDataDirective *S) {
+  VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPTargetExitDataDirective(
+    const OMPTargetExitDataDirective *S) {
+  VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPTargetParallelDirective(
+    const OMPTargetParallelDirective *S) {
+  VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPTargetParallelForDirective(
+    const OMPTargetParallelForDirective *S) {
+  VisitOMPExecutableDirective(S);
+}
+
 void StmtProfiler::VisitOMPTeamsDirective(const OMPTeamsDirective *S) {
   VisitOMPExecutableDirective(S);
 }
@@ -613,12 +695,46 @@
 
 void OMPClauseProfiler::VisitOMPDistScheduleClause(
     const OMPDistScheduleClause *C) {
-  if (C->getChunkSize()) {
-    Profiler->VisitStmt(C->getChunkSize());
-    if (C->getHelperChunkSize()) {
-      Profiler->VisitStmt(C->getChunkSize());
-    }
-  }
+  VistOMPClauseWithPreInit(C);
+  if (auto *S = C->getChunkSize())
+    Profiler->VisitStmt(S);
+}
+
+void OMPClauseProfiler::VisitOMPDefaultmapClause(const OMPDefaultmapClause *) {}
+
+void StmtProfiler::VisitOMPTargetUpdateDirective(
+    const OMPTargetUpdateDirective *S) {
+  VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPDistributeParallelForDirective(
+    const OMPDistributeParallelForDirective *S) {
+  VisitOMPLoopDirective(S);
+}
+
+void StmtProfiler::VisitOMPDistributeParallelForSimdDirective(
+    const OMPDistributeParallelForSimdDirective *S) {
+  VisitOMPLoopDirective(S);
+}
+
+void StmtProfiler::VisitOMPDistributeSimdDirective(
+    const OMPDistributeSimdDirective *S) {
+  VisitOMPLoopDirective(S);
+}
+
+void StmtProfiler::VisitOMPTargetParallelForSimdDirective(
+    const OMPTargetParallelForSimdDirective *S) {
+  VisitOMPLoopDirective(S);
+}
+
+void StmtProfiler::VisitOMPTargetSimdDirective(
+    const OMPTargetSimdDirective *S) {
+  VisitOMPLoopDirective(S);
+}
+
+void StmtProfiler::VisitOMPTeamsDistributeDirective(
+    const OMPTeamsDistributeDirective *S) {
+  VisitOMPLoopDirective(S);
 }
 
 void StmtProfiler::VisitExpr(const Expr *S) {
@@ -820,22 +936,20 @@
 void StmtProfiler::VisitDesignatedInitExpr(const DesignatedInitExpr *S) {
   VisitExpr(S);
   ID.AddBoolean(S->usesGNUSyntax());
-  for (DesignatedInitExpr::const_designators_iterator D =
-         S->designators_begin(), DEnd = S->designators_end();
-       D != DEnd; ++D) {
-    if (D->isFieldDesignator()) {
+  for (const DesignatedInitExpr::Designator &D : S->designators()) {
+    if (D.isFieldDesignator()) {
       ID.AddInteger(0);
-      VisitName(D->getFieldName());
+      VisitName(D.getFieldName());
       continue;
     }
 
-    if (D->isArrayDesignator()) {
+    if (D.isArrayDesignator()) {
       ID.AddInteger(1);
     } else {
-      assert(D->isArrayRangeDesignator());
+      assert(D.isArrayRangeDesignator());
       ID.AddInteger(2);
     }
-    ID.AddInteger(D->getFirstExprIndex());
+    ID.AddInteger(D.getFirstExprIndex());
   }
 }
 
@@ -1206,6 +1320,12 @@
   ID.AddBoolean(S->isElidable());
 }
 
+void StmtProfiler::VisitCXXInheritedCtorInitExpr(
+    const CXXInheritedCtorInitExpr *S) {
+  VisitExpr(S);
+  VisitDecl(S->getConstructor());
+}
+
 void StmtProfiler::VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *S) {
   VisitExplicitCastExpr(S);
 }
@@ -1223,6 +1343,7 @@
        C != CEnd; ++C) {
     ID.AddInteger(C->getCaptureKind());
     switch (C->getCaptureKind()) {
+    case LCK_StarThis:
     case LCK_This:
       break;
     case LCK_ByRef:
@@ -1521,6 +1642,11 @@
   ID.AddBoolean(S->getBridgeKind());
 }
 
+void StmtProfiler::VisitObjCAvailabilityCheckExpr(
+    const ObjCAvailabilityCheckExpr *S) {
+  VisitExpr(S);
+}
+
 void StmtProfiler::VisitDecl(const Decl *D) {
   ID.AddInteger(D? D->getKind() : 0);
 
diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp
index 4db275e..4c1d4ec 100644
--- a/lib/AST/Type.cpp
+++ b/lib/AST/Type.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/AST/Type.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/CharUnits.h"
@@ -19,13 +20,11 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/PrettyPrinter.h"
-#include "clang/AST/Type.h"
 #include "clang/AST/TypeVisitor.h"
 #include "clang/Basic/Specifiers.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 using namespace clang;
 
@@ -64,7 +63,7 @@
   return nullptr;
 }
 
-bool QualType::isConstant(QualType T, ASTContext &Ctx) {
+bool QualType::isConstant(QualType T, const ASTContext &Ctx) {
   if (T.isConstQualified())
     return true;
 
@@ -74,7 +73,7 @@
   return T.getAddressSpace() == LangAS::opencl_constant;
 }
 
-unsigned ConstantArrayType::getNumAddressingBits(ASTContext &Context,
+unsigned ConstantArrayType::getNumAddressingBits(const ASTContext &Context,
                                                  QualType ElementType,
                                                const llvm::APInt &NumElements) {
   uint64_t ElementSize = Context.getTypeSizeInChars(ElementType).getQuantity();
@@ -109,7 +108,7 @@
   return TotalSize.getActiveBits();
 }
 
-unsigned ConstantArrayType::getMaxSizeBits(ASTContext &Context) {
+unsigned ConstantArrayType::getMaxSizeBits(const ASTContext &Context) {
   unsigned Bits = Context.getTypeSize(Context.getSizeType());
   
   // Limit the number of bits in size_t so that maximal bit size fits 64 bit
@@ -1622,7 +1621,7 @@
 /// \param Ctx The context in which this type occurs.
 ///
 /// \returns true if the type is considered an integral type, false otherwise.
-bool Type::isIntegralType(ASTContext &Ctx) const {
+bool Type::isIntegralType(const ASTContext &Ctx) const {
   if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
     return BT->getKind() >= BuiltinType::Bool &&
            BT->getKind() <= BuiltinType::Int128;
@@ -1783,7 +1782,7 @@
 bool Type::isFloatingType() const {
   if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
     return BT->getKind() >= BuiltinType::Half &&
-           BT->getKind() <= BuiltinType::LongDouble;
+           BT->getKind() <= BuiltinType::Float128;
   if (const ComplexType *CT = dyn_cast<ComplexType>(CanonicalType))
     return CT->getElementType()->isFloatingType();
   return false;
@@ -1805,7 +1804,7 @@
 bool Type::isRealType() const {
   if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
     return BT->getKind() >= BuiltinType::Bool &&
-           BT->getKind() <= BuiltinType::LongDouble;
+           BT->getKind() <= BuiltinType::Float128;
   if (const EnumType *ET = dyn_cast<EnumType>(CanonicalType))
       return ET->getDecl()->isComplete() && !ET->getDecl()->isScoped();
   return false;
@@ -1814,7 +1813,7 @@
 bool Type::isArithmeticType() const {
   if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
     return BT->getKind() >= BuiltinType::Bool &&
-           BT->getKind() <= BuiltinType::LongDouble;
+           BT->getKind() <= BuiltinType::Float128;
   if (const EnumType *ET = dyn_cast<EnumType>(CanonicalType))
     // GCC allows forward declaration of enum types (forbid by C99 6.7.2.3p2).
     // If a body isn't seen by the time we get here, return false.
@@ -1964,7 +1963,7 @@
   }
 }
 
-bool QualType::isPODType(ASTContext &Context) const {
+bool QualType::isPODType(const ASTContext &Context) const {
   // C++11 has a more relaxed definition of POD.
   if (Context.getLangOpts().CPlusPlus11)
     return isCXX11PODType(Context);
@@ -1972,7 +1971,7 @@
   return isCXX98PODType(Context);
 }
 
-bool QualType::isCXX98PODType(ASTContext &Context) const {
+bool QualType::isCXX98PODType(const ASTContext &Context) const {
   // The compiler shouldn't query this for incomplete types, but the user might.
   // We return false for that case. Except for incomplete arrays of PODs, which
   // are PODs according to the standard.
@@ -2032,7 +2031,7 @@
   }
 }
 
-bool QualType::isTrivialType(ASTContext &Context) const {
+bool QualType::isTrivialType(const ASTContext &Context) const {
   // The compiler shouldn't query this for incomplete types, but the user might.
   // We return false for that case. Except for incomplete arrays of PODs, which
   // are PODs according to the standard.
@@ -2095,7 +2094,7 @@
   return false;
 }
 
-bool QualType::isTriviallyCopyableType(ASTContext &Context) const {
+bool QualType::isTriviallyCopyableType(const ASTContext &Context) const {
   if ((*this)->isArrayType())
     return Context.getBaseElementType(*this).isTriviallyCopyableType(Context);
 
@@ -2255,7 +2254,7 @@
 // This is effectively the intersection of isTrivialType and
 // isStandardLayoutType. We implement it directly to avoid redundant
 // conversions from a type to a CXXRecordDecl.
-bool QualType::isCXX11PODType(ASTContext &Context) const {
+bool QualType::isCXX11PODType(const ASTContext &Context) const {
   const Type *ty = getTypePtr();
   if (ty->isDependentType())
     return false;
@@ -2460,19 +2459,20 @@
 DependentTemplateSpecializationType::DependentTemplateSpecializationType(
                          ElaboratedTypeKeyword Keyword,
                          NestedNameSpecifier *NNS, const IdentifierInfo *Name,
-                         unsigned NumArgs, const TemplateArgument *Args,
+                         ArrayRef<TemplateArgument> Args,
                          QualType Canon)
   : TypeWithKeyword(Keyword, DependentTemplateSpecialization, Canon, true, true,
                     /*VariablyModified=*/false,
                     NNS && NNS->containsUnexpandedParameterPack()),
-    NNS(NNS), Name(Name), NumArgs(NumArgs) {
+    NNS(NNS), Name(Name), NumArgs(Args.size()) {
   assert((!NNS || NNS->isDependent()) &&
          "DependentTemplateSpecializatonType requires dependent qualifier");
-  for (unsigned I = 0; I != NumArgs; ++I) {
-    if (Args[I].containsUnexpandedParameterPack())
+  TemplateArgument *ArgBuffer = getArgBuffer();
+  for (const TemplateArgument &Arg : Args) {
+    if (Arg.containsUnexpandedParameterPack())
       setContainsUnexpandedParameterPack();
 
-    new (&getArgBuffer()[I]) TemplateArgument(Args[I]);
+    new (ArgBuffer++) TemplateArgument(Arg);
   }
 }
 
@@ -2482,13 +2482,12 @@
                                              ElaboratedTypeKeyword Keyword,
                                              NestedNameSpecifier *Qualifier,
                                              const IdentifierInfo *Name,
-                                             unsigned NumArgs,
-                                             const TemplateArgument *Args) {
+                                             ArrayRef<TemplateArgument> Args) {
   ID.AddInteger(Keyword);
   ID.AddPointer(Qualifier);
   ID.AddPointer(Name);
-  for (unsigned Idx = 0; Idx < NumArgs; ++Idx)
-    Args[Idx].Profile(ID, Context);
+  for (const TemplateArgument &Arg : Args)
+    Arg.Profile(ID, Context);
 }
 
 bool Type::isElaboratedTypeSpecifier() const {
@@ -2558,6 +2557,8 @@
     return "double";
   case LongDouble:
     return "long double";
+  case Float128:
+    return "__float128";
   case WChar_S:
   case WChar_U:
     return Policy.MSWChar ? "__wchar_t" : "wchar_t";
@@ -2587,30 +2588,10 @@
     return "Class";
   case ObjCSel:
     return "SEL";
-  case OCLImage1d:
-    return "image1d_t";
-  case OCLImage1dArray:
-    return "image1d_array_t";
-  case OCLImage1dBuffer:
-    return "image1d_buffer_t";
-  case OCLImage2d:
-    return "image2d_t";
-  case OCLImage2dArray:
-    return "image2d_array_t";
-  case OCLImage2dDepth:
-    return "image2d_depth_t";
-  case OCLImage2dArrayDepth:
-    return "image2d_array_depth_t";
-  case OCLImage2dMSAA:
-    return "image2d_msaa_t";
-  case OCLImage2dArrayMSAA:
-    return "image2d_array_msaa_t";
-  case OCLImage2dMSAADepth:
-    return "image2d_msaa_depth_t";
-  case OCLImage2dArrayMSAADepth:
-    return "image2d_array_msaa_depth_t";
-  case OCLImage3d:
-    return "image3d_t";
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case Id: \
+    return "__" #Access " " #ImgType "_t";
+#include "clang/Basic/OpenCLImageTypes.def"
   case OCLSampler:
     return "sampler_t";
   case OCLEvent:
@@ -2660,7 +2641,7 @@
   case CC_AAPCS_VFP: return "aapcs-vfp";
   case CC_IntelOclBicc: return "intel_ocl_bicc";
   case CC_SpirFunction: return "spir_function";
-  case CC_SpirKernel: return "spir_kernel";
+  case CC_OpenCLKernel: return "opencl_kernel";
   case CC_Swift: return "swiftcall";
   case CC_PreserveMost: return "preserve_most";
   case CC_PreserveAll: return "preserve_all";
@@ -2941,6 +2922,24 @@
   E->Profile(ID, Context, true);
 }
 
+UnaryTransformType::UnaryTransformType(QualType BaseType,
+                                       QualType UnderlyingType,
+                                       UTTKind UKind,
+                                       QualType CanonicalType)
+  : Type(UnaryTransform, CanonicalType, BaseType->isDependentType(),
+         BaseType->isInstantiationDependentType(),
+         BaseType->isVariablyModifiedType(),
+         BaseType->containsUnexpandedParameterPack())
+  , BaseType(BaseType), UnderlyingType(UnderlyingType), UKind(UKind)
+{}
+
+DependentUnaryTransformType::DependentUnaryTransformType(const ASTContext &C,
+                                                         QualType BaseType,
+                                                         UTTKind UKind)
+   : UnaryTransformType(BaseType, C.DependentTy, UKind, QualType())
+{}
+
+
 TagType::TagType(TypeClass TC, const TagDecl *D, QualType can)
   : Type(TC, can, D->isDependentType(), 
          /*InstantiationDependent=*/D->isDependentType(),
@@ -2957,17 +2956,6 @@
   return decl;
 }
 
-UnaryTransformType::UnaryTransformType(QualType BaseType,
-                                       QualType UnderlyingType,
-                                       UTTKind UKind,
-                                       QualType CanonicalType)
-  : Type(UnaryTransform, CanonicalType, UnderlyingType->isDependentType(),
-         UnderlyingType->isInstantiationDependentType(),
-         UnderlyingType->isVariablyModifiedType(),
-         BaseType->containsUnexpandedParameterPack())
-  , BaseType(BaseType), UnderlyingType(UnderlyingType), UKind(UKind)
-{}
-
 TagDecl *TagType::getDecl() const {
   return getInterestingTagDecl(decl);
 }
@@ -3111,20 +3099,20 @@
 bool TemplateSpecializationType::
 anyDependentTemplateArguments(const TemplateArgumentListInfo &Args,
                               bool &InstantiationDependent) {
-  return anyDependentTemplateArguments(Args.getArgumentArray(), Args.size(),
+  return anyDependentTemplateArguments(Args.arguments(),
                                        InstantiationDependent);
 }
 
 bool TemplateSpecializationType::
-anyDependentTemplateArguments(const TemplateArgumentLoc *Args, unsigned N,
+anyDependentTemplateArguments(ArrayRef<TemplateArgumentLoc> Args,
                               bool &InstantiationDependent) {
-  for (unsigned i = 0; i != N; ++i) {
-    if (Args[i].getArgument().isDependent()) {
+  for (const TemplateArgumentLoc &ArgLoc : Args) {
+    if (ArgLoc.getArgument().isDependent()) {
       InstantiationDependent = true;
       return true;
     }
-    
-    if (Args[i].getArgument().isInstantiationDependent())
+
+    if (ArgLoc.getArgument().isInstantiationDependent())
       InstantiationDependent = true;
   }
   return false;
@@ -3132,7 +3120,7 @@
 
 TemplateSpecializationType::
 TemplateSpecializationType(TemplateName T,
-                           const TemplateArgument *Args, unsigned NumArgs,
+                           ArrayRef<TemplateArgument> Args,
                            QualType Canon, QualType AliasedType)
   : Type(TemplateSpecialization,
          Canon.isNull()? QualType(this, 0) : Canon,
@@ -3140,7 +3128,7 @@
          Canon.isNull()? true : Canon->isInstantiationDependentType(),
          false,
          T.containsUnexpandedParameterPack()),
-    Template(T), NumArgs(NumArgs), TypeAlias(!AliasedType.isNull()) {
+    Template(T), NumArgs(Args.size()), TypeAlias(!AliasedType.isNull()) {
   assert(!T.getAsDependentTemplateName() && 
          "Use DependentTemplateSpecializationType for dependent template-name");
   assert((T.getKind() == TemplateName::Template ||
@@ -3150,7 +3138,7 @@
 
   TemplateArgument *TemplateArgs
     = reinterpret_cast<TemplateArgument *>(this + 1);
-  for (unsigned Arg = 0; Arg < NumArgs; ++Arg) {
+  for (const TemplateArgument &Arg : Args) {
     // Update instantiation-dependent and variably-modified bits.
     // If the canonical type exists and is non-dependent, the template
     // specialization type can be non-dependent even if one of the type
@@ -3159,14 +3147,14 @@
     // U<T> is always non-dependent, irrespective of the type T.
     // However, U<Ts> contains an unexpanded parameter pack, even though
     // its expansion (and thus its desugared type) doesn't.
-    if (Args[Arg].isInstantiationDependent())
+    if (Arg.isInstantiationDependent())
       setInstantiationDependent();
-    if (Args[Arg].getKind() == TemplateArgument::Type &&
-        Args[Arg].getAsType()->isVariablyModifiedType())
+    if (Arg.getKind() == TemplateArgument::Type &&
+        Arg.getAsType()->isVariablyModifiedType())
       setVariablyModified();
-    if (Args[Arg].containsUnexpandedParameterPack())
+    if (Arg.containsUnexpandedParameterPack())
       setContainsUnexpandedParameterPack();
-    new (&TemplateArgs[Arg]) TemplateArgument(Args[Arg]);
+    new (TemplateArgs++) TemplateArgument(Arg);
   }
 
   // Store the aliased type if this is a type alias template specialization.
@@ -3179,12 +3167,11 @@
 void
 TemplateSpecializationType::Profile(llvm::FoldingSetNodeID &ID,
                                     TemplateName T,
-                                    const TemplateArgument *Args,
-                                    unsigned NumArgs,
+                                    ArrayRef<TemplateArgument> Args,
                                     const ASTContext &Context) {
   T.Profile(ID);
-  for (unsigned Idx = 0; Idx < NumArgs; ++Idx)
-    Args[Idx].Profile(ID, Context);
+  for (const TemplateArgument &Arg : Args)
+    Arg.Profile(ID, Context);
 }
 
 QualType
@@ -3577,18 +3564,9 @@
     case BuiltinType::ObjCId:
     case BuiltinType::ObjCClass:
     case BuiltinType::ObjCSel:
-    case BuiltinType::OCLImage1d:
-    case BuiltinType::OCLImage1dArray:
-    case BuiltinType::OCLImage1dBuffer:
-    case BuiltinType::OCLImage2d:
-    case BuiltinType::OCLImage2dArray:
-    case BuiltinType::OCLImage2dDepth:
-    case BuiltinType::OCLImage2dArrayDepth:
-    case BuiltinType::OCLImage2dMSAA:
-    case BuiltinType::OCLImage2dArrayMSAA:
-    case BuiltinType::OCLImage2dMSAADepth:
-    case BuiltinType::OCLImage2dArrayMSAADepth:
-    case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
     case BuiltinType::OCLSampler:
     case BuiltinType::OCLEvent:
     case BuiltinType::OCLClkEvent:
@@ -3708,10 +3686,18 @@
 }
 
 bool Type::isObjCNSObjectType() const {
-  if (const TypedefType *typedefType = dyn_cast<TypedefType>(this))
-    return typedefType->getDecl()->hasAttr<ObjCNSObjectAttr>();
-  return false;
+  const Type *cur = this;
+  while (true) {
+    if (const TypedefType *typedefType = dyn_cast<TypedefType>(cur))
+      return typedefType->getDecl()->hasAttr<ObjCNSObjectAttr>();
+
+    // Single-step desugar until we run out of sugar.
+    QualType next = cur->getLocallyUnqualifiedSingleStepDesugaredType();
+    if (next.getTypePtr() == cur) return false;
+    cur = next.getTypePtr();
+  }
 }
+
 bool Type::isObjCIndependentClassType() const {
   if (const TypedefType *typedefType = dyn_cast<TypedefType>(this))
     return typedefType->getDecl()->hasAttr<ObjCIndependentClassAttr>();
diff --git a/lib/AST/TypeLoc.cpp b/lib/AST/TypeLoc.cpp
index 565fa5d..b5c7271 100644
--- a/lib/AST/TypeLoc.cpp
+++ b/lib/AST/TypeLoc.cpp
@@ -16,7 +16,6 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/TypeLocVisitor.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
 static const unsigned TypeLocMaxDataAlign = llvm::alignOf<void *>();
@@ -320,6 +319,7 @@
   case BuiltinType::Float:
   case BuiltinType::Double:
   case BuiltinType::LongDouble:
+  case BuiltinType::Float128:
     llvm_unreachable("Builtin type needs extra local data!");
     // Fall through, if the impossible happens.
       
@@ -333,18 +333,9 @@
   case BuiltinType::ObjCId:
   case BuiltinType::ObjCClass:
   case BuiltinType::ObjCSel:
-  case BuiltinType::OCLImage1d:
-  case BuiltinType::OCLImage1dArray:
-  case BuiltinType::OCLImage1dBuffer:
-  case BuiltinType::OCLImage2d:
-  case BuiltinType::OCLImage2dArray:
-  case BuiltinType::OCLImage2dDepth:
-  case BuiltinType::OCLImage2dArrayDepth:
-  case BuiltinType::OCLImage2dMSAA:
-  case BuiltinType::OCLImage2dArrayMSAA:
-  case BuiltinType::OCLImage2dMSAADepth:
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-  case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
   case BuiltinType::OCLEvent:
   case BuiltinType::OCLClkEvent:
diff --git a/lib/AST/TypePrinter.cpp b/lib/AST/TypePrinter.cpp
index 7c519fc..065a2db 100644
--- a/lib/AST/TypePrinter.cpp
+++ b/lib/AST/TypePrinter.cpp
@@ -81,12 +81,14 @@
   
   class TypePrinter {
     PrintingPolicy Policy;
+    unsigned Indentation;
     bool HasEmptyPlaceHolder;
     bool InsideCCAttribute;
 
   public:
-    explicit TypePrinter(const PrintingPolicy &Policy)
-      : Policy(Policy), HasEmptyPlaceHolder(false), InsideCCAttribute(false) { }
+    explicit TypePrinter(const PrintingPolicy &Policy, unsigned Indentation = 0)
+      : Policy(Policy), Indentation(Indentation),
+        HasEmptyPlaceHolder(false), InsideCCAttribute(false) { }
 
     void print(const Type *ty, Qualifiers qs, raw_ostream &OS,
                StringRef PlaceHolder);
@@ -110,7 +112,8 @@
   };
 }
 
-static void AppendTypeQualList(raw_ostream &OS, unsigned TypeQuals, bool C99) {
+static void AppendTypeQualList(raw_ostream &OS, unsigned TypeQuals,
+                               bool HasRestrictKeyword) {
   bool appendSpace = false;
   if (TypeQuals & Qualifiers::Const) {
     OS << "const";
@@ -123,7 +126,7 @@
   }
   if (TypeQuals & Qualifiers::Restrict) {
     if (appendSpace) OS << ' ';
-    if (C99) {
+    if (HasRestrictKeyword) {
       OS << "restrict";
     } else {
       OS << "__restrict";
@@ -411,7 +414,7 @@
     OS << '(';
 
   PrintingPolicy InnerPolicy(Policy);
-  InnerPolicy.SuppressTag = false;
+  InnerPolicy.IncludeTagDefinition = false;
   TypePrinter(InnerPolicy).print(QualType(T->getClass(), 0), OS, StringRef());
 
   OS << "::*";
@@ -437,7 +440,8 @@
                                           raw_ostream &OS) {
   OS << '[';
   if (T->getIndexTypeQualifiers().hasQualifiers()) {
-    AppendTypeQualList(OS, T->getIndexTypeCVRQualifiers(), Policy.LangOpts.C99);
+    AppendTypeQualList(OS, T->getIndexTypeCVRQualifiers(),
+                       Policy.Restrict);
     OS << ' ';
   }
 
@@ -470,7 +474,7 @@
                                           raw_ostream &OS) {
   OS << '[';
   if (T->getIndexTypeQualifiers().hasQualifiers()) {
-    AppendTypeQualList(OS, T->getIndexTypeCVRQualifiers(), Policy.LangOpts.C99);
+    AppendTypeQualList(OS, T->getIndexTypeCVRQualifiers(), Policy.Restrict);
     OS << ' ';
   }
 
@@ -670,7 +674,7 @@
     if (T->getNumParams())
       OS << ", ";
     OS << "...";
-  } else if (T->getNumParams() == 0 && !Policy.LangOpts.CPlusPlus) {
+  } else if (T->getNumParams() == 0 && Policy.UseVoidForZeroParams) {
     // Do not emit int() if we have a proto, emit 'int(void)'.
     OS << "void";
   }
@@ -721,7 +725,7 @@
       OS << " __attribute__((sysv_abi))";
       break;
     case CC_SpirFunction:
-    case CC_SpirKernel:
+    case CC_OpenCLKernel:
       // Do nothing. These CCs are not available as attributes.
       break;
     case CC_Swift:
@@ -744,7 +748,7 @@
 
   if (unsigned quals = T->getTypeQuals()) {
     OS << ' ';
-    AppendTypeQualList(OS, quals, Policy.LangOpts.C99);
+    AppendTypeQualList(OS, quals, Policy.Restrict);
   }
 
   switch (T->getRefQualifier()) {
@@ -893,7 +897,8 @@
 void TypePrinter::printPipeBefore(const PipeType *T, raw_ostream &OS) {
   IncludeStrongLifetimeRAII Strong(Policy);
 
-  OS << "pipe";
+  OS << "pipe ";
+  print(T->getElementType(), OS, StringRef());
   spaceBeforePlaceHolder(OS);
 }
 
@@ -918,10 +923,8 @@
     IncludeStrongLifetimeRAII Strong(Policy);
     OS << Spec->getIdentifier()->getName();
     const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
-    TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                            TemplateArgs.data(),
-                                            TemplateArgs.size(),
-                                            Policy);
+    TemplateSpecializationType::PrintTemplateArgumentList(
+        OS, TemplateArgs.asArray(), Policy);
     OS << "::";
   } else if (TagDecl *Tag = dyn_cast<TagDecl>(DC)) {
     if (TypedefNameDecl *Typedef = Tag->getTypedefNameForAnonDecl())
@@ -934,18 +937,19 @@
 }
 
 void TypePrinter::printTag(TagDecl *D, raw_ostream &OS) {
-  if (Policy.SuppressTag)
+  if (Policy.IncludeTagDefinition) {
+    PrintingPolicy SubPolicy = Policy;
+    SubPolicy.IncludeTagDefinition = false;
+    D->print(OS, SubPolicy, Indentation);
+    spaceBeforePlaceHolder(OS);
     return;
+  }
 
   bool HasKindDecoration = false;
 
-  // bool SuppressTagKeyword
-  //   = Policy.LangOpts.CPlusPlus || Policy.SuppressTagKeyword;
-
   // We don't print tags unless this is an elaborated type.
   // In C, we just assume every RecordType is an elaborated type.
-  if (!(Policy.LangOpts.CPlusPlus || Policy.SuppressTagKeyword ||
-        D->getTypedefNameForAnonDecl())) {
+  if (!Policy.SuppressTagKeyword && !D->getTypedefNameForAnonDecl()) {
     HasKindDecoration = true;
     OS << D->getKindName();
     OS << ' ';
@@ -997,22 +1001,17 @@
   // arguments.
   if (ClassTemplateSpecializationDecl *Spec
         = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
-    const TemplateArgument *Args;
-    unsigned NumArgs;
+    ArrayRef<TemplateArgument> Args;
     if (TypeSourceInfo *TAW = Spec->getTypeAsWritten()) {
       const TemplateSpecializationType *TST =
         cast<TemplateSpecializationType>(TAW->getType());
-      Args = TST->getArgs();
-      NumArgs = TST->getNumArgs();
+      Args = TST->template_arguments();
     } else {
       const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
-      Args = TemplateArgs.data();
-      NumArgs = TemplateArgs.size();
+      Args = TemplateArgs.asArray();
     }
     IncludeStrongLifetimeRAII Strong(Policy);
-    TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                                          Args, NumArgs,
-                                                          Policy);
+    TemplateSpecializationType::PrintTemplateArgumentList(OS, Args, Policy);
   }
 
   spaceBeforePlaceHolder(OS);
@@ -1070,11 +1069,9 @@
                                             raw_ostream &OS) { 
   IncludeStrongLifetimeRAII Strong(Policy);
   T->getTemplateName().print(OS, Policy);
-  
-  TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                                        T->getArgs(), 
-                                                        T->getNumArgs(), 
-                                                        Policy);
+
+  TemplateSpecializationType::PrintTemplateArgumentList(
+      OS, T->template_arguments(), Policy);
   spaceBeforePlaceHolder(OS);
 }
 void TypePrinter::printTemplateSpecializationAfter(
@@ -1090,14 +1087,16 @@
 
 void TypePrinter::printElaboratedBefore(const ElaboratedType *T,
                                         raw_ostream &OS) {
-  if (Policy.SuppressTag && isa<TagType>(T->getNamedType()))
-    return;
-  OS << TypeWithKeyword::getKeywordName(T->getKeyword());
-  if (T->getKeyword() != ETK_None)
-    OS << " ";
-  NestedNameSpecifier* Qualifier = T->getQualifier();
-  if (Qualifier)
-    Qualifier->print(OS, Policy);
+  // The tag definition will take care of these.
+  if (!Policy.IncludeTagDefinition)
+  {
+    OS << TypeWithKeyword::getKeywordName(T->getKeyword());
+    if (T->getKeyword() != ETK_None)
+      OS << " ";
+    NestedNameSpecifier* Qualifier = T->getQualifier();
+    if (Qualifier)
+      Qualifier->print(OS, Policy);
+  }
   
   ElaboratedTypePolicyRAII PolicyRAII(Policy);
   printBefore(T->getNamedType(), OS);
@@ -1149,8 +1148,7 @@
     T->getQualifier()->print(OS, Policy);    
   OS << T->getIdentifier()->getName();
   TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                                        T->getArgs(),
-                                                        T->getNumArgs(),
+                                                        T->template_arguments(),
                                                         Policy);
   spaceBeforePlaceHolder(OS);
 }
@@ -1437,50 +1435,46 @@
                             const TemplateArgumentListInfo &Args,
                             const PrintingPolicy &Policy) {
   return PrintTemplateArgumentList(OS,
-                                   Args.getArgumentArray(),
-                                   Args.size(),
+                                   Args.arguments(),
                                    Policy);
 }
 
-void
-TemplateSpecializationType::PrintTemplateArgumentList(
-                                                raw_ostream &OS,
-                                                const TemplateArgument *Args,
-                                                unsigned NumArgs,
-                                                  const PrintingPolicy &Policy,
-                                                      bool SkipBrackets) {
+void TemplateSpecializationType::PrintTemplateArgumentList(
+    raw_ostream &OS, ArrayRef<TemplateArgument> Args,
+    const PrintingPolicy &Policy, bool SkipBrackets) {
   const char *Comma = Policy.MSVCFormatting ? "," : ", ";
   if (!SkipBrackets)
     OS << '<';
-  
+
   bool needSpace = false;
-  for (unsigned Arg = 0; Arg < NumArgs; ++Arg) {
+  bool FirstArg = true;
+  for (const TemplateArgument &Arg : Args) {
     // Print the argument into a string.
     SmallString<128> Buf;
     llvm::raw_svector_ostream ArgOS(Buf);
-    if (Args[Arg].getKind() == TemplateArgument::Pack) {
-      if (Args[Arg].pack_size() && Arg > 0)
+    if (Arg.getKind() == TemplateArgument::Pack) {
+      if (Arg.pack_size() && !FirstArg)
         OS << Comma;
       PrintTemplateArgumentList(ArgOS,
-                                Args[Arg].pack_begin(), 
-                                Args[Arg].pack_size(), 
+                                Arg.getPackAsArray(),
                                 Policy, true);
     } else {
-      if (Arg > 0)
+      if (!FirstArg)
         OS << Comma;
-      Args[Arg].print(Policy, ArgOS);
+      Arg.print(Policy, ArgOS);
     }
     StringRef ArgString = ArgOS.str();
 
     // If this is the first argument and its string representation
     // begins with the global scope specifier ('::foo'), add a space
     // to avoid printing the diagraph '<:'.
-    if (!Arg && !ArgString.empty() && ArgString[0] == ':')
+    if (FirstArg && !ArgString.empty() && ArgString[0] == ':')
       OS << ' ';
 
     OS << ArgString;
 
     needSpace = (!ArgString.empty() && ArgString.back() == '>');
+    FirstArg = false;
   }
 
   // If the last character of our string is '>', add another space to
@@ -1496,40 +1490,41 @@
 // Sadly, repeat all that with TemplateArgLoc.
 void TemplateSpecializationType::
 PrintTemplateArgumentList(raw_ostream &OS,
-                          const TemplateArgumentLoc *Args, unsigned NumArgs,
+                          ArrayRef<TemplateArgumentLoc> Args,
                           const PrintingPolicy &Policy) {
   OS << '<';
   const char *Comma = Policy.MSVCFormatting ? "," : ", ";
 
   bool needSpace = false;
-  for (unsigned Arg = 0; Arg < NumArgs; ++Arg) {
-    if (Arg > 0)
+  bool FirstArg = true;
+  for (const TemplateArgumentLoc &Arg : Args) {
+    if (!FirstArg)
       OS << Comma;
-    
+
     // Print the argument into a string.
     SmallString<128> Buf;
     llvm::raw_svector_ostream ArgOS(Buf);
-    if (Args[Arg].getArgument().getKind() == TemplateArgument::Pack) {
+    if (Arg.getArgument().getKind() == TemplateArgument::Pack) {
       PrintTemplateArgumentList(ArgOS,
-                                Args[Arg].getArgument().pack_begin(), 
-                                Args[Arg].getArgument().pack_size(), 
+                                Arg.getArgument().getPackAsArray(),
                                 Policy, true);
     } else {
-      Args[Arg].getArgument().print(Policy, ArgOS);
+      Arg.getArgument().print(Policy, ArgOS);
     }
     StringRef ArgString = ArgOS.str();
-    
+
     // If this is the first argument and its string representation
     // begins with the global scope specifier ('::foo'), add a space
     // to avoid printing the diagraph '<:'.
-    if (!Arg && !ArgString.empty() && ArgString[0] == ':')
+    if (FirstArg && !ArgString.empty() && ArgString[0] == ':')
       OS << ' ';
 
     OS << ArgString;
 
     needSpace = (!ArgString.empty() && ArgString.back() == '>');
+    FirstArg = false;
   }
-  
+
   // If the last character of our string is '>', add another space to
   // keep the two '>''s separate tokens. We don't *have* to do this in
   // C++0x, but it's still good hygiene.
@@ -1580,7 +1575,13 @@
 
   unsigned quals = getCVRQualifiers();
   if (quals) {
-    AppendTypeQualList(OS, quals, Policy.LangOpts.C99);
+    AppendTypeQualList(OS, quals, Policy.Restrict);
+    addSpace = true;
+  }
+  if (hasUnaligned()) {
+    if (addSpace)
+      OS << ' ';
+    OS << "__unaligned";
     addSpace = true;
   }
   if (unsigned addrspace = getAddressSpace()) {
@@ -1654,11 +1655,11 @@
 
 void QualType::print(const Type *ty, Qualifiers qs,
                      raw_ostream &OS, const PrintingPolicy &policy,
-                     const Twine &PlaceHolder) {
+                     const Twine &PlaceHolder, unsigned Indentation) {
   SmallString<128> PHBuf;
   StringRef PH = PlaceHolder.toStringRef(PHBuf);
 
-  TypePrinter(policy).print(ty, qs, OS, PH);
+  TypePrinter(policy, Indentation).print(ty, qs, OS, PH);
 }
 
 void QualType::getAsStringInternal(const Type *ty, Qualifiers qs,
diff --git a/lib/AST/VTableBuilder.cpp b/lib/AST/VTableBuilder.cpp
index bae0186..5c5fefd 100644
--- a/lib/AST/VTableBuilder.cpp
+++ b/lib/AST/VTableBuilder.cpp
@@ -2416,7 +2416,7 @@
   MethodVFTableLocationsTy MethodVFTableLocations;
 
   /// \brief Does this class have an RTTI component?
-  bool HasRTTIComponent;
+  bool HasRTTIComponent = false;
 
   /// MethodInfo - Contains information about a method in a vtable.
   /// (Used for computing 'this' pointer adjustment thunks.
@@ -2545,12 +2545,13 @@
         MostDerivedClassLayout(Context.getASTRecordLayout(MostDerivedClass)),
         WhichVFPtr(*Which),
         Overriders(MostDerivedClass, CharUnits(), MostDerivedClass) {
-    // Only include the RTTI component if we know that we will provide a
-    // definition of the vftable.
-    HasRTTIComponent = Context.getLangOpts().RTTIData &&
-                       !MostDerivedClass->hasAttr<DLLImportAttr>() &&
-                       MostDerivedClass->getTemplateSpecializationKind() !=
-                           TSK_ExplicitInstantiationDeclaration;
+    // Provide the RTTI component if RTTIData is enabled. If the vftable would
+    // be available externally, we should not provide the RTTI componenent. It
+    // is currently impossible to get available externally vftables with either
+    // dllimport or extern template instantiations, but eventually we may add a
+    // flag to support additional devirtualization that needs this.
+    if (Context.getLangOpts().RTTIData)
+      HasRTTIComponent = true;
 
     LayoutVFTable();
 
@@ -2930,8 +2931,8 @@
   // class.
   const CXXRecordDecl *NextBase = nullptr, *NextLastVBase = LastVBase;
   CharUnits NextBaseOffset;
-  if (BaseDepth < WhichVFPtr.PathToBaseWithVPtr.size()) {
-    NextBase = WhichVFPtr.PathToBaseWithVPtr[BaseDepth];
+  if (BaseDepth < WhichVFPtr.PathToIntroducingObject.size()) {
+    NextBase = WhichVFPtr.PathToIntroducingObject[BaseDepth];
     if (isDirectVBase(NextBase, RD)) {
       NextLastVBase = NextBase;
       NextBaseOffset = MostDerivedClassLayout.getVBaseClassOffset(NextBase);
@@ -3123,7 +3124,7 @@
 
 void VFTableBuilder::dumpLayout(raw_ostream &Out) {
   Out << "VFTable for ";
-  PrintBasePath(WhichVFPtr.PathToBaseWithVPtr, Out);
+  PrintBasePath(WhichVFPtr.PathToIntroducingObject, Out);
   Out << "'";
   MostDerivedClass->printQualifiedName(Out);
   Out << "' (" << Components.size()
@@ -3310,10 +3311,10 @@
       // Keep track of which vtable the derived class is going to extend with
       // new methods or bases.  We append to either the vftable of our primary
       // base, or the first non-virtual base that has a vbtable.
-      if (P->ReusingBase == Base &&
+      if (P->ObjectWithVPtr == Base &&
           Base == (ForVBTables ? Layout.getBaseSharingVBPtr()
                                : Layout.getPrimaryBase()))
-        P->ReusingBase = RD;
+        P->ObjectWithVPtr = RD;
 
       // Keep track of the full adjustment from the MDC to this vtable.  The
       // adjustment is captured by an optional vbase and a non-virtual offset.
@@ -3400,14 +3401,14 @@
 }
 
 // This recursive function finds all paths from a subobject centered at
-// (RD, Offset) to the subobject located at BaseWithVPtr.
+// (RD, Offset) to the subobject located at IntroducingObject.
 static void findPathsToSubobject(ASTContext &Context,
                                  const ASTRecordLayout &MostDerivedLayout,
                                  const CXXRecordDecl *RD, CharUnits Offset,
-                                 BaseSubobject BaseWithVPtr,
+                                 BaseSubobject IntroducingObject,
                                  FullPathTy &FullPath,
                                  std::list<FullPathTy> &Paths) {
-  if (BaseSubobject(RD, Offset) == BaseWithVPtr) {
+  if (BaseSubobject(RD, Offset) == IntroducingObject) {
     Paths.push_back(FullPath);
     return;
   }
@@ -3421,7 +3422,7 @@
                               : Offset + Layout.getBaseClassOffset(Base);
     FullPath.insert(BaseSubobject(Base, NewOffset));
     findPathsToSubobject(Context, MostDerivedLayout, Base, NewOffset,
-                         BaseWithVPtr, FullPath, Paths);
+                         IntroducingObject, FullPath, Paths);
     FullPath.pop_back();
   }
 }
@@ -3496,7 +3497,7 @@
     CharUnits BaseOffset =
         getOffsetOfFullPath(Context, TopLevelRD, SpecificPath);
     FinalOverriders Overriders(TopLevelRD, CharUnits::Zero(), TopLevelRD);
-    for (const CXXMethodDecl *MD : Info->BaseWithVPtr->methods()) {
+    for (const CXXMethodDecl *MD : Info->IntroducingObject->methods()) {
       if (!MD->isVirtual())
         continue;
       FinalOverriders::OverriderInfo OI =
@@ -3554,15 +3555,15 @@
   for (VPtrInfo *Info : Paths) {
     findPathsToSubobject(
         Context, MostDerivedLayout, RD, CharUnits::Zero(),
-        BaseSubobject(Info->BaseWithVPtr, Info->FullOffsetInMDC), FullPath,
+        BaseSubobject(Info->IntroducingObject, Info->FullOffsetInMDC), FullPath,
         FullPaths);
     FullPath.clear();
     removeRedundantPaths(FullPaths);
-    Info->PathToBaseWithVPtr.clear();
+    Info->PathToIntroducingObject.clear();
     if (const FullPathTy *BestPath =
             selectBestPath(Context, RD, Info, FullPaths))
       for (const BaseSubobject &BSO : *BestPath)
-        Info->PathToBaseWithVPtr.push_back(BSO.getBase());
+        Info->PathToIntroducingObject.push_back(BSO.getBase());
     FullPaths.clear();
   }
 }
diff --git a/lib/ASTMatchers/ASTMatchFinder.cpp b/lib/ASTMatchers/ASTMatchFinder.cpp
index 847398c..19e5743 100644
--- a/lib/ASTMatchers/ASTMatchFinder.cpp
+++ b/lib/ASTMatchers/ASTMatchFinder.cpp
@@ -616,6 +616,10 @@
         ActiveASTContext->getTranslationUnitDecl())
       return false;
 
+    // For AST-nodes that don't have an identity, we can't memoize.
+    if (!Builder->isComparable())
+      return matchesAncestorOfRecursively(Node, Matcher, Builder, MatchMode);
+
     MatchKey Key;
     Key.MatcherID = Matcher.getID();
     Key.Node = Node;
@@ -630,22 +634,34 @@
     }
 
     MemoizedMatchResult Result;
-    Result.ResultOfMatch = false;
     Result.Nodes = *Builder;
+    Result.ResultOfMatch =
+        matchesAncestorOfRecursively(Node, Matcher, &Result.Nodes, MatchMode);
 
+    MemoizedMatchResult &CachedResult = ResultCache[Key];
+    CachedResult = std::move(Result);
+
+    *Builder = CachedResult.Nodes;
+    return CachedResult.ResultOfMatch;
+  }
+
+  bool matchesAncestorOfRecursively(const ast_type_traits::DynTypedNode &Node,
+                                    const DynTypedMatcher &Matcher,
+                                    BoundNodesTreeBuilder *Builder,
+                                    AncestorMatchMode MatchMode) {
     const auto &Parents = ActiveASTContext->getParents(Node);
     assert(!Parents.empty() && "Found node that is not in the parent map.");
     if (Parents.size() == 1) {
       // Only one parent - do recursive memoization.
       const ast_type_traits::DynTypedNode Parent = Parents[0];
-      if (Matcher.matches(Parent, this, &Result.Nodes)) {
-        Result.ResultOfMatch = true;
-      } else if (MatchMode != ASTMatchFinder::AMM_ParentOnly) {
-        // Reset the results to not include the bound nodes from the failed
-        // match above.
-        Result.Nodes = *Builder;
-        Result.ResultOfMatch = memoizedMatchesAncestorOfRecursively(
-            Parent, Matcher, &Result.Nodes, MatchMode);
+      BoundNodesTreeBuilder BuilderCopy = *Builder;
+      if (Matcher.matches(Parent, this, &BuilderCopy)) {
+        *Builder = std::move(BuilderCopy);
+        return true;
+      }
+      if (MatchMode != ASTMatchFinder::AMM_ParentOnly) {
+        return memoizedMatchesAncestorOfRecursively(Parent, Matcher, Builder,
+                                                    MatchMode);
         // Once we get back from the recursive call, the result will be the
         // same as the parent's result.
       }
@@ -655,10 +671,10 @@
       std::deque<ast_type_traits::DynTypedNode> Queue(Parents.begin(),
                                                       Parents.end());
       while (!Queue.empty()) {
-        Result.Nodes = *Builder;
-        if (Matcher.matches(Queue.front(), this, &Result.Nodes)) {
-          Result.ResultOfMatch = true;
-          break;
+        BoundNodesTreeBuilder BuilderCopy = *Builder;
+        if (Matcher.matches(Queue.front(), this, &BuilderCopy)) {
+          *Builder = std::move(BuilderCopy);
+          return true;
         }
         if (MatchMode != ASTMatchFinder::AMM_ParentOnly) {
           for (const auto &Parent :
@@ -673,12 +689,7 @@
         Queue.pop_front();
       }
     }
-
-    MemoizedMatchResult &CachedResult = ResultCache[Key];
-    CachedResult = std::move(Result);
-
-    *Builder = CachedResult.Nodes;
-    return CachedResult.ResultOfMatch;
+    return false;
   }
 
   // Implements a BoundNodesTree::Visitor that calls a MatchCallback with
@@ -701,7 +712,7 @@
 
   // Returns true if 'TypeNode' has an alias that matches the given matcher.
   bool typeHasMatchingAlias(const Type *TypeNode,
-                            const Matcher<NamedDecl> Matcher,
+                            const Matcher<NamedDecl> &Matcher,
                             BoundNodesTreeBuilder *Builder) {
     const Type *const CanonicalType =
       ActiveASTContext->getCanonicalType(TypeNode);
@@ -744,46 +755,25 @@
   MemoizationMap ResultCache;
 };
 
-static CXXRecordDecl *getAsCXXRecordDecl(const Type *TypeNode) {
-  // Type::getAs<...>() drills through typedefs.
-  if (TypeNode->getAs<DependentNameType>() != nullptr ||
-      TypeNode->getAs<DependentTemplateSpecializationType>() != nullptr ||
-      TypeNode->getAs<TemplateTypeParmType>() != nullptr)
-    // Dependent names and template TypeNode parameters will be matched when
-    // the template is instantiated.
-    return nullptr;
-  TemplateSpecializationType const *TemplateType =
-      TypeNode->getAs<TemplateSpecializationType>();
-  if (!TemplateType) {
-    return TypeNode->getAsCXXRecordDecl();
-  }
-  if (TemplateType->getTemplateName().isDependent())
-    // Dependent template specializations will be matched when the
-    // template is instantiated.
-    return nullptr;
+static CXXRecordDecl *
+getAsCXXRecordDeclOrPrimaryTemplate(const Type *TypeNode) {
+  if (auto *RD = TypeNode->getAsCXXRecordDecl())
+    return RD;
 
-  // For template specialization types which are specializing a template
-  // declaration which is an explicit or partial specialization of another
-  // template declaration, getAsCXXRecordDecl() returns the corresponding
-  // ClassTemplateSpecializationDecl.
-  //
-  // For template specialization types which are specializing a template
-  // declaration which is neither an explicit nor partial specialization of
-  // another template declaration, getAsCXXRecordDecl() returns NULL and
-  // we get the CXXRecordDecl of the templated declaration.
-  CXXRecordDecl *SpecializationDecl = TemplateType->getAsCXXRecordDecl();
-  if (SpecializationDecl) {
-    return SpecializationDecl;
-  }
-  NamedDecl *Templated =
-      TemplateType->getTemplateName().getAsTemplateDecl()->getTemplatedDecl();
-  if (CXXRecordDecl *TemplatedRecord = dyn_cast<CXXRecordDecl>(Templated)) {
-    return TemplatedRecord;
-  }
-  // Now it can still be that we have an alias template.
-  TypeAliasDecl *AliasDecl = dyn_cast<TypeAliasDecl>(Templated);
-  assert(AliasDecl);
-  return getAsCXXRecordDecl(AliasDecl->getUnderlyingType().getTypePtr());
+  // Find the innermost TemplateSpecializationType that isn't an alias template.
+  auto *TemplateType = TypeNode->getAs<TemplateSpecializationType>();
+  while (TemplateType && TemplateType->isTypeAlias())
+    TemplateType =
+        TemplateType->getAliasedType()->getAs<TemplateSpecializationType>();
+
+  // If this is the name of a (dependent) template specialization, use the
+  // definition of the template, even though it might be specialized later.
+  if (TemplateType)
+    if (auto *ClassTemplate = dyn_cast_or_null<ClassTemplateDecl>(
+          TemplateType->getTemplateName().getAsTemplateDecl()))
+      return ClassTemplate->getTemplatedDecl();
+
+  return nullptr;
 }
 
 // Returns true if the given class is directly or indirectly derived
@@ -800,7 +790,10 @@
     if (typeHasMatchingAlias(TypeNode, Base, Builder))
       return true;
 
-    CXXRecordDecl *ClassDecl = getAsCXXRecordDecl(TypeNode);
+    // FIXME: Going to the primary template here isn't really correct, but
+    // unfortunately we accept a Decl matcher for the base class not a Type
+    // matcher, so it's the best thing we can do with our current interface.
+    CXXRecordDecl *ClassDecl = getAsCXXRecordDeclOrPrimaryTemplate(TypeNode);
     if (!ClassDecl)
       continue;
     if (ClassDecl == Declaration) {
diff --git a/lib/ASTMatchers/ASTMatchersInternal.cpp b/lib/ASTMatchers/ASTMatchersInternal.cpp
index 463cf0b..f0bfbf9 100644
--- a/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -14,6 +14,7 @@
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/ASTMatchers/ASTMatchersInternal.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ManagedStatic.h"
 
 namespace clang {
@@ -71,10 +72,10 @@
 };
 
 class IdDynMatcher : public DynMatcherInterface {
- public:
+public:
   IdDynMatcher(StringRef ID,
-               const IntrusiveRefCntPtr<DynMatcherInterface> &InnerMatcher)
-      : ID(ID), InnerMatcher(InnerMatcher) {}
+               IntrusiveRefCntPtr<DynMatcherInterface> InnerMatcher)
+      : ID(ID), InnerMatcher(std::move(InnerMatcher)) {}
 
   bool dynMatches(const ast_type_traits::DynTypedNode &DynNode,
                   ASTMatchFinder *Finder,
@@ -84,7 +85,7 @@
     return Result;
   }
 
- private:
+private:
   const std::string ID;
   const IntrusiveRefCntPtr<DynMatcherInterface> InnerMatcher;
 };
@@ -209,8 +210,9 @@
 llvm::Optional<DynTypedMatcher> DynTypedMatcher::tryBind(StringRef ID) const {
   if (!AllowBind) return llvm::None;
   auto Result = *this;
-  Result.Implementation = new IdDynMatcher(ID, Result.Implementation);
-  return Result;
+  Result.Implementation =
+      new IdDynMatcher(ID, std::move(Result.Implementation));
+  return std::move(Result);
 }
 
 bool DynTypedMatcher::canConvertTo(ast_type_traits::ASTNodeKind To) const {
@@ -293,50 +295,212 @@
   return false;
 }
 
-HasNameMatcher::HasNameMatcher(StringRef NameRef)
-    : UseUnqualifiedMatch(NameRef.find("::") == NameRef.npos), Name(NameRef) {
-  assert(!Name.empty());
+Matcher<NamedDecl> hasAnyNameFunc(ArrayRef<const StringRef *> NameRefs) {
+  std::vector<std::string> Names; // Owning copies of the requested names.
+  for (auto *Name : NameRefs)
+    Names.emplace_back(*Name);
+  return internal::Matcher<NamedDecl>( // One matcher covering every name.
+      new internal::HasNameMatcher(std::move(Names)));
 }
 
+HasNameMatcher::HasNameMatcher(std::vector<std::string> N)
+    : UseUnqualifiedMatch(std::all_of( // True only if no name contains "::".
+          N.begin(), N.end(),
+          [](StringRef Name) { return Name.find("::") == Name.npos; })),
+      Names(std::move(N)) { // N is moved last - TODO confirm member order.
+#ifndef NDEBUG // The loop below exists only to feed assert().
+  for (StringRef Name : Names)
+    assert(!Name.empty()); // Precondition: no empty names.
+#endif
+}
+
+namespace {
+
+bool consumeNameSuffix(StringRef &FullName, StringRef Suffix) {
+  StringRef Name = FullName; // Work on a copy; FullName changes on success only.
+  if (!Name.endswith(Suffix))
+    return false;
+  Name = Name.drop_back(Suffix.size());
+  if (!Name.empty()) { // Something precedes Suffix: it must end in "::".
+    if (!Name.endswith("::"))
+      return false;
+    Name = Name.drop_back(2); // Strip the "::" separator as well.
+  }
+  FullName = Name; // Commit: FullName is now the unconsumed prefix.
+  return true;
+}
+
+StringRef getNodeName(const NamedDecl &Node, llvm::SmallString<128> &Scratch) {
+  // Simple identifier: return it directly; Scratch is left untouched.
+  if (Node.getIdentifier())
+    return Node.getName();
+
+  if (Node.getDeclName()) {
+    // Non-identifier name: print it into Scratch and return what was printed.
+    Scratch.clear();
+    llvm::raw_svector_ostream OS(Scratch);
+    Node.printName(OS);
+    return OS.str(); // Presumably a view into Scratch; keep Scratch alive.
+  }
+
+  return "(anonymous)"; // Neither an identifier nor a declaration name.
+}
+
+StringRef getNodeName(const RecordDecl &Node, llvm::SmallString<128> &Scratch) {
+  if (Node.getIdentifier()) { // Named record: use its identifier as-is.
+    return Node.getName();
+  }
+  Scratch.clear(); // Unnamed record: build "(anonymous <kind>)" via Scratch.
+  return ("(anonymous " + Node.getKindName() + ")").toStringRef(Scratch);
+}
+
+StringRef getNodeName(const NamespaceDecl &Node,
+                      llvm::SmallString<128> &Scratch) { // Scratch is unused.
+  return Node.isAnonymousNamespace() ? "(anonymous namespace)" : Node.getName();
+}
+
+
+class PatternSet { // Name patterns narrowed by consuming suffixes one by one.
+public:
+  PatternSet(ArrayRef<std::string> Names) { // Stores views into Names only.
+    for (StringRef Name : Names)
+      Patterns.push_back({Name, Name.startswith("::")});
+  }
+
+  /// Consumes the name suffix from each pattern in the set and removes the ones
+  /// that didn't match (a non-matching pattern is kept when CanSkip is true).
+  /// Returns true if there are still any patterns left.
+  bool consumeNameSuffix(StringRef NodeName, bool CanSkip) {
+    for (size_t I = 0; I < Patterns.size();) {
+      if (internal::consumeNameSuffix(Patterns[I].P, NodeName) ||
+          CanSkip) {
+        ++I; // Keep this pattern and move on.
+      } else {
+        Patterns.erase(Patterns.begin() + I); // I now indexes the next one.
+      }
+    }
+    return !Patterns.empty();
+  }
+
+  /// Checks if any of the patterns are a match.
+  /// A match is a fully consumed pattern; one written with a leading "::"
+  /// additionally requires AllowFullyQualified to be true.
+  bool foundMatch(bool AllowFullyQualified) const {
+    for (auto& P: Patterns)
+      if (P.P.empty() && (AllowFullyQualified || !P.IsFullyQualified))
+        return true;
+    return false;
+  }
+
+private:
+  struct Pattern {
+    StringRef P; // Unconsumed part; refers to storage owned by the caller.
+    bool IsFullyQualified; // True if the pattern was written with leading "::".
+  };
+  llvm::SmallVector<Pattern, 8> Patterns;
+};
+
+}  // namespace
+
 bool HasNameMatcher::matchesNodeUnqualified(const NamedDecl &Node) const {
   assert(UseUnqualifiedMatch);
-  if (Node.getIdentifier()) {
-    // Simple name.
-    return Name == Node.getName();
+  llvm::SmallString<128> Scratch;
+  StringRef NodeName = getNodeName(Node, Scratch);
+  return std::any_of(Names.begin(), Names.end(), [&](StringRef Name) {
+    return consumeNameSuffix(Name, NodeName) && Name.empty();
+  });
+}
+
+bool HasNameMatcher::matchesNodeFullFast(const NamedDecl &Node) const {
+  PatternSet Patterns(Names);
+  llvm::SmallString<128> Scratch;
+
+  // This function is copied and adapted from NamedDecl::printQualifiedName()
+  // By matching each part individually we optimize in a couple of ways:
+  //  - We can exit early on the first failure.
+  //  - We can skip inline/anonymous namespaces without another pass.
+  //  - We print one name at a time, reducing the chance of overflowing the
+  //    inlined space of the SmallString.
+
+  // First, match the name.
+  if (!Patterns.consumeNameSuffix(getNodeName(Node, Scratch),
+                                  /*CanSkip=*/false))
+    return false;
+
+  // Try to match each declaration context.
+  // We are allowed to skip anonymous and inline namespaces if they don't match.
+  const DeclContext *Ctx = Node.getDeclContext();
+
+  if (Ctx->isFunctionOrMethod())
+    return Patterns.foundMatch(/*AllowFullyQualified=*/false);
+
+  for (; Ctx && isa<NamedDecl>(Ctx); Ctx = Ctx->getParent()) {
+    if (Patterns.foundMatch(/*AllowFullyQualified=*/false))
+      return true;
+
+    if (const auto *ND = dyn_cast<NamespaceDecl>(Ctx)) {
+      // If it matches (or we can skip it), continue.
+      if (Patterns.consumeNameSuffix(getNodeName(*ND, Scratch),
+                                     /*CanSkip=*/ND->isAnonymousNamespace() ||
+                                         ND->isInline()))
+        continue;
+      return false;
+    }
+    if (const auto *RD = dyn_cast<RecordDecl>(Ctx)) {
+      if (!isa<ClassTemplateSpecializationDecl>(Ctx)) {
+        if (Patterns.consumeNameSuffix(getNodeName(*RD, Scratch),
+                                       /*CanSkip=*/false))
+          continue;
+
+        return false;
+      }
+    }
+
+    // We don't know how to deal with this DeclContext.
+    // Fallback to the slow version of the code.
+    return matchesNodeFullSlow(Node);
   }
-  if (Node.getDeclName()) {
-    // Name needs to be constructed.
-    llvm::SmallString<128> NodeName;
+
+  return Patterns.foundMatch(/*AllowFullyQualified=*/true);
+}
+
+bool HasNameMatcher::matchesNodeFullSlow(const NamedDecl &Node) const {
+  const bool SkipUnwrittenCases[] = {false, true};
+  for (bool SkipUnwritten : SkipUnwrittenCases) {
+    llvm::SmallString<128> NodeName = StringRef("::");
     llvm::raw_svector_ostream OS(NodeName);
-    Node.printName(OS);
-    return Name == OS.str();
+
+    if (SkipUnwritten) {
+      PrintingPolicy Policy = Node.getASTContext().getPrintingPolicy();
+      Policy.SuppressUnwrittenScope = true;
+      Node.printQualifiedName(OS, Policy);
+    } else {
+      Node.printQualifiedName(OS);
+    }
+
+    const StringRef FullName = OS.str();
+
+    for (const StringRef Pattern : Names) {
+      if (Pattern.startswith("::")) {
+        if (FullName == Pattern)
+          return true;
+      } else if (FullName.endswith(Pattern) &&
+                 FullName.drop_back(Pattern.size()).endswith("::")) {
+        return true;
+      }
+    }
   }
+
   return false;
 }
 
-bool HasNameMatcher::matchesNodeFull(const NamedDecl &Node) const {
-  llvm::SmallString<128> NodeName = StringRef("::");
-  llvm::raw_svector_ostream OS(NodeName);
-  Node.printQualifiedName(OS);
-  const StringRef FullName = OS.str();
-  const StringRef Pattern = Name;
-
-  if (Pattern.startswith("::"))
-    return FullName == Pattern;
-
-  return FullName.endswith(Pattern) &&
-         FullName.drop_back(Pattern.size()).endswith("::");
-}
-
 bool HasNameMatcher::matchesNode(const NamedDecl &Node) const {
-  // FIXME: There is still room for improvement, but it would require copying a
-  // lot of the logic from NamedDecl::printQualifiedName(). The benchmarks do
-  // not show like that extra complexity is needed right now.
+  assert(matchesNodeFullFast(Node) == matchesNodeFullSlow(Node));
   if (UseUnqualifiedMatch) {
-    assert(matchesNodeUnqualified(Node) == matchesNodeFull(Node));
+    assert(matchesNodeUnqualified(Node) == matchesNodeFullFast(Node));
     return matchesNodeUnqualified(Node);
   }
-  return matchesNodeFull(Node);
+  return matchesNodeFullFast(Node);
 }
 
 } // end namespace internal
diff --git a/lib/ASTMatchers/Dynamic/CMakeLists.txt b/lib/ASTMatchers/Dynamic/CMakeLists.txt
index c0d80ad..82c12a4 100644
--- a/lib/ASTMatchers/Dynamic/CMakeLists.txt
+++ b/lib/ASTMatchers/Dynamic/CMakeLists.txt
@@ -1,5 +1,14 @@
 set(LLVM_LINK_COMPONENTS support)
 
+# The registry source file ends up generating a lot of sections for each
+# matcher. Each matcher appears to get a vtable and several methods. Each
+# method needs .text, .pdata, .xdata, and .debug sections, adding to the
+# section multiplier. By default MSVC has a 2^16 limit on the number of
+# sections in an object file; Registry.cpp exceeds it, so build it with /bigobj.
+if (MSVC)
+  set_source_files_properties(Registry.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+endif()
+
 add_clang_library(clangDynamicASTMatchers
   Diagnostics.cpp
   VariantValue.cpp
diff --git a/lib/ASTMatchers/Dynamic/Makefile b/lib/ASTMatchers/Dynamic/Makefile
deleted file mode 100644
index a57d752..0000000
--- a/lib/ASTMatchers/Dynamic/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- clang/lib/ASTMatchers/Dynamic/Makefile --------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../../..
-LIBRARYNAME := clangDynamicASTMatchers
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/ASTMatchers/Dynamic/Marshallers.h b/lib/ASTMatchers/Dynamic/Marshallers.h
index 64d6b78..7b1a307 100644
--- a/lib/ASTMatchers/Dynamic/Marshallers.h
+++ b/lib/ASTMatchers/Dynamic/Marshallers.h
@@ -1,4 +1,4 @@
-//===--- Marshallers.h - Generic matcher function marshallers -*- C++ -*-===//
+//===--- Marshallers.h - Generic matcher function marshallers ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -32,7 +32,6 @@
 namespace dynamic {
 namespace internal {
 
-
 /// \brief Helper template class to just from argument type to the right is/get
 ///   functions in VariantValue.
 /// Used to verify and extract the matcher arguments below.
@@ -97,6 +96,28 @@
   }
 };
 
+template <> struct ArgTypeTraits<clang::CastKind> { // CastKind given by name.
+private:
+  static clang::CastKind getCastKind(llvm::StringRef AttrKind) {
+    return llvm::StringSwitch<clang::CastKind>(AttrKind) // "Name" -> CK_Name.
+#define CAST_OPERATION(Name) .Case( #Name, CK_##Name)
+#include "clang/AST/OperationKinds.def"
+        .Default(CK_Invalid); // Unrecognized names map to CK_Invalid.
+  }
+
+public:
+  static bool is(const VariantValue &Value) {
+    return Value.isString() &&  // Must be a string naming a known cast kind.
+        getCastKind(Value.getString()) != CK_Invalid;
+  }
+  static clang::CastKind get(const VariantValue &Value) {
+    return getCastKind(Value.getString()); // CK_Invalid if is() would fail.
+  }
+  static ArgKind getKind() { // The argument is supplied as a string.
+    return ArgKind(ArgKind::AK_String);
+  }
+};
+
 /// \brief Matcher descriptor interface.
 ///
 /// Provides a \c create() method that constructs the matcher from the provided
@@ -234,7 +255,7 @@
 template <typename T>
 static VariantMatcher outvalueToVariantMatcher(const T &PolyMatcher,
                                                typename T::ReturnTypes * =
-                                                   NULL) {
+                                                   nullptr) {
   std::vector<DynTypedMatcher> Matchers;
   mergePolyMatchers(PolyMatcher, Matchers, typename T::ReturnTypes());
   VariantMatcher Out = VariantMatcher::PolymorphicMatcher(std::move(Matchers));
@@ -326,8 +347,9 @@
 
   template <typename ResultT, typename ArgT,
             ResultT (*F)(ArrayRef<const ArgT *>)>
-  VariadicFuncMatcherDescriptor(llvm::VariadicFunction<ResultT, ArgT, F> Func,
-                          StringRef MatcherName)
+  VariadicFuncMatcherDescriptor(
+      ast_matchers::internal::VariadicFunction<ResultT, ArgT, F> Func,
+      StringRef MatcherName)
       : Func(&variadicMatcherDescriptor<ResultT, ArgT, F>),
         MatcherName(MatcherName.str()),
         ArgsKind(ArgTypeTraits<ArgT>::getKind()) {
@@ -410,7 +432,6 @@
     return VariantMatcher();                                                   \
   }
 
-
 /// \brief 0-arg marshaller function.
 template <typename ReturnType>
 static VariantMatcher matcherMarshall0(void (*Func)(), StringRef MatcherName,
@@ -657,9 +678,9 @@
 /// \brief Variadic overload.
 template <typename ResultT, typename ArgT,
           ResultT (*Func)(ArrayRef<const ArgT *>)>
-MatcherDescriptor *
-makeMatcherAutoMarshall(llvm::VariadicFunction<ResultT, ArgT, Func> VarFunc,
-                        StringRef MatcherName) {
+MatcherDescriptor *makeMatcherAutoMarshall(
+    ast_matchers::internal::VariadicFunction<ResultT, ArgT, Func> VarFunc,
+    StringRef MatcherName) {
   return new VariadicFuncMatcherDescriptor(VarFunc, MatcherName);
 }
 
@@ -708,9 +729,9 @@
                                                MatcherName);
 }
 
-}  // namespace internal
-}  // namespace dynamic
-}  // namespace ast_matchers
-}  // namespace clang
+} // namespace internal
+} // namespace dynamic
+} // namespace ast_matchers
+} // namespace clang
 
-#endif  // LLVM_CLANG_AST_MATCHERS_DYNAMIC_MARSHALLERS_H
+#endif // LLVM_CLANG_AST_MATCHERS_DYNAMIC_MARSHALLERS_H
diff --git a/lib/ASTMatchers/Dynamic/Parser.cpp b/lib/ASTMatchers/Dynamic/Parser.cpp
index cf9dab6..ce8d0a9 100644
--- a/lib/ASTMatchers/Dynamic/Parser.cpp
+++ b/lib/ASTMatchers/Dynamic/Parser.cpp
@@ -16,7 +16,6 @@
 #include "clang/ASTMatchers/Dynamic/Registry.h"
 #include "clang/Basic/CharInfo.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/Twine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include <string>
 #include <vector>
diff --git a/lib/ASTMatchers/Dynamic/Registry.cpp b/lib/ASTMatchers/Dynamic/Registry.cpp
index e2c9a47..192b4b6 100644
--- a/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -95,6 +95,7 @@
   REGISTER_OVERLOADED_2(thisPointerType);
 
   REGISTER_MATCHER(accessSpecDecl);
+  REGISTER_MATCHER(addrLabelExpr);
   REGISTER_MATCHER(alignOfExpr);
   REGISTER_MATCHER(allOf);
   REGISTER_MATCHER(anyOf);
@@ -104,9 +105,11 @@
   REGISTER_MATCHER(arrayType);
   REGISTER_MATCHER(asmStmt);
   REGISTER_MATCHER(asString);
+  REGISTER_MATCHER(atomicExpr);
   REGISTER_MATCHER(atomicType);
   REGISTER_MATCHER(autoType);
   REGISTER_MATCHER(binaryOperator);
+  REGISTER_MATCHER(binaryConditionalOperator);
   REGISTER_MATCHER(blockPointerType);
   REGISTER_MATCHER(booleanType);
   REGISTER_MATCHER(breakStmt);
@@ -161,11 +164,14 @@
   REGISTER_MATCHER(declStmt);
   REGISTER_MATCHER(defaultStmt);
   REGISTER_MATCHER(dependentSizedArrayType);
+  REGISTER_MATCHER(designatedInitExpr);
+  REGISTER_MATCHER(designatorCountIs);
   REGISTER_MATCHER(doStmt);
   REGISTER_MATCHER(eachOf);
   REGISTER_MATCHER(elaboratedType);
   REGISTER_MATCHER(enumConstantDecl);
   REGISTER_MATCHER(enumDecl);
+  REGISTER_MATCHER(enumType);
   REGISTER_MATCHER(equalsBoundNode);
   REGISTER_MATCHER(equalsIntegralValue);
   REGISTER_MATCHER(explicitCastExpr);
@@ -179,16 +185,21 @@
   REGISTER_MATCHER(forEachDescendant);
   REGISTER_MATCHER(forEachSwitchCase);
   REGISTER_MATCHER(forField);
+  REGISTER_MATCHER(forFunction);
   REGISTER_MATCHER(forStmt);
   REGISTER_MATCHER(friendDecl);
   REGISTER_MATCHER(functionDecl);
+  REGISTER_MATCHER(functionProtoType);
   REGISTER_MATCHER(functionTemplateDecl);
   REGISTER_MATCHER(functionType);
+  REGISTER_MATCHER(gnuNullExpr);
   REGISTER_MATCHER(gotoStmt);
   REGISTER_MATCHER(has);
   REGISTER_MATCHER(hasAncestor);
   REGISTER_MATCHER(hasAnyArgument);
   REGISTER_MATCHER(hasAnyConstructorInitializer);
+  REGISTER_MATCHER(hasAnyDeclaration);
+  REGISTER_MATCHER(hasAnyName);
   REGISTER_MATCHER(hasAnyParameter);
   REGISTER_MATCHER(hasAnySubstatement);
   REGISTER_MATCHER(hasAnyTemplateArgument);
@@ -198,9 +209,11 @@
   REGISTER_MATCHER(hasAttr);
   REGISTER_MATCHER(hasAutomaticStorageDuration);
   REGISTER_MATCHER(hasBase);
+  REGISTER_MATCHER(hasBitWidth);
   REGISTER_MATCHER(hasBody);
   REGISTER_MATCHER(hasCanonicalType);
   REGISTER_MATCHER(hasCaseConstant);
+  REGISTER_MATCHER(hasCastKind);
   REGISTER_MATCHER(hasCondition);
   REGISTER_MATCHER(hasConditionVariableStatement);
   REGISTER_MATCHER(hasDecayedType);
@@ -209,6 +222,7 @@
   REGISTER_MATCHER(hasDeducedType);
   REGISTER_MATCHER(hasDescendant);
   REGISTER_MATCHER(hasDestinationType);
+  REGISTER_MATCHER(hasDynamicExceptionSpec);
   REGISTER_MATCHER(hasEitherOperand);
   REGISTER_MATCHER(hasElementType);
   REGISTER_MATCHER(hasElse);
@@ -235,6 +249,7 @@
   REGISTER_MATCHER(hasQualifier);
   REGISTER_MATCHER(hasRangeInit);
   REGISTER_MATCHER(hasReceiverType);
+  REGISTER_MATCHER(hasReturnValue);
   REGISTER_MATCHER(hasRHS);
   REGISTER_MATCHER(hasSelector);
   REGISTER_MATCHER(hasSingleDecl);
@@ -242,6 +257,7 @@
   REGISTER_MATCHER(hasSizeExpr);
   REGISTER_MATCHER(hasSourceExpression);
   REGISTER_MATCHER(hasStaticStorageDuration);
+  REGISTER_MATCHER(hasSyntacticForm);
   REGISTER_MATCHER(hasTargetDecl);
   REGISTER_MATCHER(hasTemplateArgument);
   REGISTER_MATCHER(hasThen);
@@ -250,24 +266,32 @@
   REGISTER_MATCHER(hasTypeLoc);
   REGISTER_MATCHER(hasUnaryOperand);
   REGISTER_MATCHER(hasUnarySelector);
+  REGISTER_MATCHER(hasUnderlyingDecl);
   REGISTER_MATCHER(hasValueType);
   REGISTER_MATCHER(ifStmt);
+  REGISTER_MATCHER(ignoringImplicit);
   REGISTER_MATCHER(ignoringImpCasts);
   REGISTER_MATCHER(ignoringParenCasts);
   REGISTER_MATCHER(ignoringParenImpCasts);
+  REGISTER_MATCHER(ignoringParens);
   REGISTER_MATCHER(implicitCastExpr);
+  REGISTER_MATCHER(implicitValueInitExpr);
   REGISTER_MATCHER(incompleteArrayType);
   REGISTER_MATCHER(initListExpr);
   REGISTER_MATCHER(injectedClassNameType);
   REGISTER_MATCHER(innerType);
   REGISTER_MATCHER(integerLiteral);
   REGISTER_MATCHER(isAnonymous);
+  REGISTER_MATCHER(isAnyCharacter);
+  REGISTER_MATCHER(isAnyPointer);
   REGISTER_MATCHER(isArrow);
   REGISTER_MATCHER(isBaseInitializer);
+  REGISTER_MATCHER(isBitField);
   REGISTER_MATCHER(isCatchAll);
   REGISTER_MATCHER(isClass);
   REGISTER_MATCHER(isConst);
   REGISTER_MATCHER(isConstQualified);
+  REGISTER_MATCHER(isCopyAssignmentOperator);
   REGISTER_MATCHER(isCopyConstructor);
   REGISTER_MATCHER(isDefaultConstructor);
   REGISTER_MATCHER(isDefaulted);
@@ -288,8 +312,10 @@
   REGISTER_MATCHER(isInteger);
   REGISTER_MATCHER(isIntegral);
   REGISTER_MATCHER(isInTemplateInstantiation);
+  REGISTER_MATCHER(isLambda);
   REGISTER_MATCHER(isListInitialization);
   REGISTER_MATCHER(isMemberInitializer);
+  REGISTER_MATCHER(isMoveAssignmentOperator);
   REGISTER_MATCHER(isMoveConstructor);
   REGISTER_MATCHER(isNoThrow);
   REGISTER_MATCHER(isOverride);
@@ -297,13 +323,17 @@
   REGISTER_MATCHER(isProtected);
   REGISTER_MATCHER(isPublic);
   REGISTER_MATCHER(isPure);
+  REGISTER_MATCHER(isSignedInteger);
   REGISTER_MATCHER(isStruct);
   REGISTER_MATCHER(isTemplateInstantiation);
   REGISTER_MATCHER(isUnion);
+  REGISTER_MATCHER(isUnsignedInteger);
   REGISTER_MATCHER(isVariadic);
   REGISTER_MATCHER(isVirtual);
+  REGISTER_MATCHER(isVirtualAsWritten);
   REGISTER_MATCHER(isVolatileQualified);
   REGISTER_MATCHER(isWritten);
+  REGISTER_MATCHER(labelDecl);
   REGISTER_MATCHER(labelStmt);
   REGISTER_MATCHER(lambdaExpr);
   REGISTER_MATCHER(lValueReferenceType);
@@ -319,6 +349,7 @@
   REGISTER_MATCHER(namesType);
   REGISTER_MATCHER(nestedNameSpecifier);
   REGISTER_MATCHER(nestedNameSpecifierLoc);
+  REGISTER_MATCHER(nullPointerConstant);
   REGISTER_MATCHER(nullStmt);
   REGISTER_MATCHER(numSelectorArgs);
   REGISTER_MATCHER(ofClass);
@@ -327,18 +358,24 @@
   REGISTER_MATCHER(objcObjectPointerType);
   REGISTER_MATCHER(on);
   REGISTER_MATCHER(onImplicitObjectArgument);
+  REGISTER_MATCHER(opaqueValueExpr);
   REGISTER_MATCHER(parameterCountIs);
+  REGISTER_MATCHER(parenExpr);
+  REGISTER_MATCHER(parenListExpr);
   REGISTER_MATCHER(parenType);
   REGISTER_MATCHER(parmVarDecl);
   REGISTER_MATCHER(pointee);
   REGISTER_MATCHER(pointerType);
+  REGISTER_MATCHER(predefinedExpr);
   REGISTER_MATCHER(qualType);
+  REGISTER_MATCHER(realFloatingPointType);
   REGISTER_MATCHER(recordDecl);
   REGISTER_MATCHER(recordType);
   REGISTER_MATCHER(referenceType);
   REGISTER_MATCHER(refersToDeclaration);
   REGISTER_MATCHER(refersToIntegralType);
   REGISTER_MATCHER(refersToType);
+  REGISTER_MATCHER(requiresZeroInitialization);
   REGISTER_MATCHER(returns);
   REGISTER_MATCHER(returnStmt);
   REGISTER_MATCHER(rValueReferenceType);
@@ -349,26 +386,32 @@
   REGISTER_MATCHER(statementCountIs);
   REGISTER_MATCHER(staticAssertDecl);
   REGISTER_MATCHER(stmt);
+  REGISTER_MATCHER(stmtExpr);
   REGISTER_MATCHER(stringLiteral);
   REGISTER_MATCHER(substNonTypeTemplateParmExpr);
   REGISTER_MATCHER(substTemplateTypeParmType);
   REGISTER_MATCHER(switchCase);
   REGISTER_MATCHER(switchStmt);
   REGISTER_MATCHER(templateArgument);
+  REGISTER_MATCHER(templateName);
   REGISTER_MATCHER(templateArgumentCountIs);
   REGISTER_MATCHER(templateSpecializationType);
+  REGISTER_MATCHER(templateTypeParmDecl);
   REGISTER_MATCHER(templateTypeParmType);
   REGISTER_MATCHER(throughUsingDecl);
   REGISTER_MATCHER(to);
   REGISTER_MATCHER(translationUnitDecl);
   REGISTER_MATCHER(type);
   REGISTER_MATCHER(typedefDecl);
+  REGISTER_MATCHER(typedefNameDecl);
   REGISTER_MATCHER(typedefType);
+  REGISTER_MATCHER(typeAliasDecl);
   REGISTER_MATCHER(typeLoc);
   REGISTER_MATCHER(unaryExprOrTypeTraitExpr);
   REGISTER_MATCHER(unaryOperator);
   REGISTER_MATCHER(unaryTransformType);
   REGISTER_MATCHER(unless);
+  REGISTER_MATCHER(unresolvedLookupExpr);
   REGISTER_MATCHER(unresolvedUsingTypenameDecl);
   REGISTER_MATCHER(unresolvedUsingValueDecl);
   REGISTER_MATCHER(userDefinedLiteral);
diff --git a/lib/ASTMatchers/Makefile b/lib/ASTMatchers/Makefile
deleted file mode 100644
index 3ee9ccb..0000000
--- a/lib/ASTMatchers/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- clang/lib/ASTMatchers/Makefile ----------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangASTMatchers
-
-PARALLEL_DIRS = Dynamic
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Analysis/BodyFarm.cpp b/lib/Analysis/BodyFarm.cpp
index d8b652f..d202a04 100644
--- a/lib/Analysis/BodyFarm.cpp
+++ b/lib/Analysis/BodyFarm.cpp
@@ -239,7 +239,8 @@
                                            SourceLocation());
   
   // (5) Create the 'if' statement.
-  IfStmt *If = new (C) IfStmt(C, SourceLocation(), nullptr, UO, CS);
+  IfStmt *If = new (C) IfStmt(C, SourceLocation(), false, nullptr, nullptr,
+                              UO, CS);
   return If;
 }
 
@@ -342,9 +343,8 @@
   Stmt *Else = M.makeReturn(RetVal);
   
   /// Construct the If.
-  Stmt *If =
-    new (C) IfStmt(C, SourceLocation(), nullptr, Comparison, Body,
-                   SourceLocation(), Else);
+  Stmt *If = new (C) IfStmt(C, SourceLocation(), false, nullptr, nullptr,
+                            Comparison, Body, SourceLocation(), Else);
 
   return If;  
 }
diff --git a/lib/Analysis/BodyFarm.h b/lib/Analysis/BodyFarm.h
index 9137943..edbe996 100644
--- a/lib/Analysis/BodyFarm.h
+++ b/lib/Analysis/BodyFarm.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CLANG_LIB_ANALYSIS_BODYFARM_H
 #define LLVM_CLANG_LIB_ANALYSIS_BODYFARM_H
 
+#include "clang/AST/DeclBase.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
@@ -22,7 +23,6 @@
 namespace clang {
 
 class ASTContext;
-class Decl;
 class FunctionDecl;
 class ObjCMethodDecl;
 class ObjCPropertyDecl;
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index dc58b92..a67f091 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -1,4 +1,4 @@
-  //===--- CFG.cpp - Classes for representing and building CFGs----*- C++ -*-===//
+//===--- CFG.cpp - Classes for representing and building CFGs----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -825,7 +825,7 @@
     // * Variable x is equal to the largest literal.
     // * Variable x is greater than largest literal.
     bool AlwaysTrue = true, AlwaysFalse = true;
-    for (llvm::APSInt Value : Values) {
+    for (const llvm::APSInt &Value : Values) {
       TryResult Res1, Res2;
       Res1 = analyzeLogicOperatorCondition(BO1, Value, L1);
       Res2 = analyzeLogicOperatorCondition(BO2, Value, L2);
@@ -1945,7 +1945,8 @@
     addLocalScopeForStmt(C);
   }
   if (!C->body_empty() && !isa<ReturnStmt>(*C->body_rbegin())) {
-    // If the body ends with a ReturnStmt, the dtors will be added in VisitReturnStmt
+    // If the body ends with a ReturnStmt, the dtors will be added in
+    // VisitReturnStmt.
     addAutomaticObjDtors(ScopePos, scopeBeginPos, C);
   }
 
@@ -2168,6 +2169,13 @@
   // won't be restored when traversing AST.
   SaveAndRestore<LocalScope::const_iterator> save_scope_pos(ScopePos);
 
+  // Create local scope for C++17 if init-stmt if one exists.
+  if (Stmt *Init = I->getInit()) {
+    LocalScope::const_iterator BeginScopePos = ScopePos;
+    addLocalScopeForStmt(Init);
+    addAutomaticObjDtors(ScopePos, BeginScopePos, I);
+  }
+
   // Create local scope for possible condition variable.
   // Store scope position. Add implicit destructor.
   if (VarDecl *VD = I->getConditionVariable()) {
@@ -2268,13 +2276,19 @@
   // blocks will be pointed to be "Block".
   CFGBlock *LastBlock = addStmt(I->getCond());
 
-  // Finally, if the IfStmt contains a condition variable, add it and its
+  // If the IfStmt contains a condition variable, add it and its
   // initializer to the CFG.
   if (const DeclStmt* DS = I->getConditionVariableDeclStmt()) {
     autoCreateBlock();
     LastBlock = addStmt(const_cast<DeclStmt *>(DS));
   }
 
+  // Finally, if the IfStmt contains a C++17 init-stmt, add it to the CFG.
+  if (Stmt *Init = I->getInit()) {
+    autoCreateBlock();
+    LastBlock = addStmt(Init);
+  }
+
   return LastBlock;
 }
 
@@ -3059,6 +3073,13 @@
   // won't be restored when traversing AST.
   SaveAndRestore<LocalScope::const_iterator> save_scope_pos(ScopePos);
 
+  // Create local scope for C++17 switch init-stmt if one exists.
+  if (Stmt *Init = Terminator->getInit()) {
+    LocalScope::const_iterator BeginScopePos = ScopePos;
+    addLocalScopeForStmt(Init);
+    addAutomaticObjDtors(ScopePos, BeginScopePos, Terminator);
+  }
+
   // Create local scope for possible condition variable.
   // Store scope position. Add implicit destructor.
   if (VarDecl *VD = Terminator->getConditionVariable()) {
@@ -3138,7 +3159,7 @@
   Block = SwitchTerminatedBlock;
   CFGBlock *LastBlock = addStmt(Terminator->getCond());
 
-  // Finally, if the SwitchStmt contains a condition variable, add both the
+  // If the SwitchStmt contains a condition variable, add both the
   // SwitchStmt and the condition variable initialization to the CFG.
   if (VarDecl *VD = Terminator->getConditionVariable()) {
     if (Expr *Init = VD->getInit()) {
@@ -3148,6 +3169,12 @@
     }
   }
 
+  // Finally, if the SwitchStmt contains a C++17 init-stmt, add it to the CFG.
+  if (Stmt *Init = Terminator->getInit()) {
+    autoCreateBlock();
+    LastBlock = addStmt(Init);
+  }
+
   return LastBlock;
 }
   
@@ -3397,8 +3424,10 @@
   // Create local scopes and destructors for range, begin and end variables.
   if (Stmt *Range = S->getRangeStmt())
     addLocalScopeForStmt(Range);
-  if (Stmt *BeginEnd = S->getBeginEndStmt())
-    addLocalScopeForStmt(BeginEnd);
+  if (Stmt *Begin = S->getBeginStmt())
+    addLocalScopeForStmt(Begin);
+  if (Stmt *End = S->getEndStmt())
+    addLocalScopeForStmt(End);
   addAutomaticObjDtors(ScopePos, save_scope_pos.get(), S);
 
   LocalScope::const_iterator ContinueScopePos = ScopePos;
@@ -3455,6 +3484,8 @@
     // continue statements.
     Block = nullptr;
     Succ = addStmt(S->getInc());
+    if (badCFG)
+      return nullptr;
     ContinueJumpTarget = JumpTarget(Succ, ContinueScopePos);
 
     // The starting block for the loop increment is the block that should
@@ -3489,7 +3520,8 @@
 
   // Add the initialization statements.
   Block = createBlock();
-  addStmt(S->getBeginEndStmt());
+  addStmt(S->getBeginStmt());
+  addStmt(S->getEndStmt());
   return addStmt(S->getRangeStmt());
 }
 
@@ -3870,7 +3902,17 @@
     case CFGElement::AutomaticObjectDtor: {
       const VarDecl *var = castAs<CFGAutomaticObjDtor>().getVarDecl();
       QualType ty = var->getType();
-      ty = ty.getNonReferenceType();
+
+      // FIXME: See CFGBuilder::addLocalScopeForVarDecl.
+      //
+      // Lifetime-extending constructs are handled here. This works for a single
+      // temporary in an initializer expression.
+      if (ty->isReferenceType()) {
+        if (const Expr *Init = var->getInit()) {
+          ty = getReferenceInitTemporaryType(astContext, Init);
+        }
+      }
+
       while (const ArrayType *arrayType = astContext.getAsArrayType(ty)) {
         ty = arrayType->getElementType();
       }
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index a3990d6..fdc9e6c 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -9,6 +9,7 @@
   CFGReachabilityAnalysis.cpp
   CFGStmtMap.cpp
   CallGraph.cpp
+  CloneDetection.cpp
   CocoaConventions.cpp
   Consumed.cpp
   CodeInjector.cpp
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp
new file mode 100644
index 0000000..27815f3
--- /dev/null
+++ b/lib/Analysis/CloneDetection.cpp
@@ -0,0 +1,582 @@
+//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file implements classes for searching and analyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+#include "clang/AST/StmtVisitor.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
+                           unsigned StartIndex, unsigned EndIndex)
+    : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+  assert(Stmt && "Stmt must not be a nullptr");
+  assert(StartIndex < EndIndex && "Given array should not be empty");
+  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
+    : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+    : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+  // Sequences that belong to different ASTContexts reside in different
+  // translation units and therefore can never enclose one another.
+  if (Context != Other.Context)
+    return false;
+
+  const SourceManager &SrcMgr = Context->getSourceManager();
+
+  // This sequence has to start at or before the first location of Other ...
+  const SourceLocation Begin = getStartLoc();
+  const SourceLocation OtherBegin = Other.getStartLoc();
+  if (!SrcMgr.isBeforeInTranslationUnit(Begin, OtherBegin) &&
+      Begin != OtherBegin)
+    return false;
+
+  // ... and it has to end at or after the last location of Other.
+  const SourceLocation Finish = getEndLoc();
+  const SourceLocation OtherFinish = Other.getEndLoc();
+  return SrcMgr.isBeforeInTranslationUnit(OtherFinish, Finish) ||
+         OtherFinish == Finish;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+  // A lone statement is iterated through the address of the S member itself.
+  if (!holdsSequence())
+    return &S;
+  // Otherwise start at the selected offset in the CompoundStmt's body.
+  return cast<CompoundStmt>(S)->body_begin() + StartIndex;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+  // One past the lone statement; &S needs no cast, exactly as in begin().
+  if (!holdsSequence())
+    return &S + 1;
+  // Otherwise stop one past the last selected child of the CompoundStmt.
+  return cast<CompoundStmt>(S)->body_begin() + EndIndex;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+  return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+
+/// \brief Analyzes the pattern of the referenced variables in a statement.
+class VariablePattern {
+  /// \brief Describes an occurrence of a variable reference in a statement.
+  struct VariableOccurence {
+    /// The index of the associated VarDecl in the Variables vector.
+    size_t KindID;
+
+    VariableOccurence(size_t KindID) : KindID(KindID) {}
+  };
+
+  /// All occurrences of referenced variables in the order of appearance.
+  std::vector<VariableOccurence> Occurences;
+  /// List of referenced variables in the order of appearance.
+  /// Every item in this list is unique.
+  std::vector<const VarDecl *> Variables;
+
+  /// \brief Adds a new variable reference to this pattern.
+  /// \param VarDecl The declaration of the variable that is referenced.
+  void addVariableOccurence(const VarDecl *VarDecl) {
+    // First check if we already reference this variable.
+    for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
+      if (Variables[KindIndex] == VarDecl) {
+        // If yes, add a new occurrence that points to the existing entry in
+        // the Variables vector.
+        Occurences.emplace_back(KindIndex);
+        return;
+      }
+    }
+    // If this variable wasn't already referenced, add it to the list of
+    // referenced variables and add an occurrence that points to this new entry.
+    Occurences.emplace_back(Variables.size());
+    Variables.push_back(VarDecl);
+  }
+
+  /// \brief Adds each referenced variable from the given statement.
+  void addVariables(const Stmt *S) {
+    // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
+    // children). We skip such statements as they don't reference any
+    // variables.
+    if (!S)
+      return;
+
+    // Check if S is a reference to a variable. If yes, add it to the pattern.
+    if (auto D = dyn_cast<DeclRefExpr>(S)) {
+      if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
+        addVariableOccurence(VD);
+    }
+
+    // Recursively check all children of the given statement.
+    for (const Stmt *Child : S->children()) {
+      addVariables(Child);
+    }
+  }
+
+public:
+  /// \brief Creates a VariablePattern object with information about the given
+  ///        StmtSequence.
+  explicit VariablePattern(const StmtSequence &Sequence) {
+    for (const Stmt *S : Sequence)
+      addVariables(S);
+  }
+
+  /// \brief Compares this pattern with the given one.
+  /// \param Other The given VariablePattern to compare with.
+  /// \return Returns true if and only if the referenced variables in this
+  ///         object follow the same pattern as the ones in the given
+  ///         VariablePattern.
+  ///
+  /// For example, the following statements all have the same pattern:
+  ///
+  ///   if (a < b) return a; return b;
+  ///   if (x < y) return x; return y;
+  ///   if (u2 < u1) return u2; return u1;
+  ///
+  /// but the following statement has a different pattern (note the changed
+  /// variables in the return statements).
+  ///
+  ///   if (a < b) return b; return a;
+  ///
+  /// This function should only be called if the related statements of the given
+  /// pattern and the statements of this object are clones of each other.
+  bool comparePattern(const VariablePattern &Other) const {
+    assert(Other.Occurences.size() == Occurences.size());
+    // Bail out instead of reading out of bounds when asserts are disabled.
+    if (Other.Occurences.size() != Occurences.size())
+      return false;
+    for (size_t I = 0; I < Occurences.size(); ++I)
+      if (Occurences[I].KindID != Other.Occurences[I].KindID)
+        return false;
+    return true;
+  }
+};
+}
+
+namespace {
+/// \brief Collects the data of a single Stmt.
+///
+/// This class defines what a code clone is: If it collects for two statements
+/// the same data, then those two statements are considered to be clones of each
+/// other.
+class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
+  // Context of the visited Stmt and the output vector all data is appended to.
+  ASTContext &Context;
+  std::vector<CloneDetector::DataPiece> &CollectedData;
+
+public:
+  /// \brief Collects data of the given Stmt.
+  /// \param S The given statement.
+  /// \param Context The ASTContext of S.
+  /// \param D The given data vector to which all collected data is appended.
+  StmtDataCollector(const Stmt *S, ASTContext &Context,
+                    std::vector<CloneDetector::DataPiece> &D)
+      : Context(Context), CollectedData(D) {
+    Visit(S);
+  }
+
+  // Below are utility methods for appending different data to the vector.
+  /// Appends one integer piece to the collected data.
+  void addData(CloneDetector::DataPiece Integer) {
+    CollectedData.push_back(Integer);
+  }
+
+  // FIXME: The functions below add long strings to the data vector which are
+  // probably not good for performance. Replace the strings with pointer values
+  // or some other unique integer.
+  /// Adds the bytes of \p Str in whole DataPieces; empty strings add nothing.
+  void addData(llvm::StringRef Str) {
+    if (Str.empty())
+      return;
+
+    const size_t OldSize = CollectedData.size();
+
+    const size_t PieceSize = sizeof(CloneDetector::DataPiece);
+    // Calculate how many vector units we need to accommodate all string bytes.
+    size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize;
+    // Allocate space for the string in the data vector.
+    CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber);
+
+    // Copy the string to the allocated space at the end of the vector.
+    std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size());
+  }
+  /// Appends the string representation of \p QT.
+  void addData(const QualType &QT) { addData(QT.getAsString()); }
+
+// The functions below collect the class specific data of each Stmt subclass.
+
+// Utility macro for defining a visit method for a given class. This method
+// calls back to the ConstStmtVisitor to visit all parent classes.
+#define DEF_ADD_DATA(CLASS, CODE)                                              \
+  void Visit##CLASS(const CLASS *S) {                                          \
+    CODE;                                                                      \
+    ConstStmtVisitor<StmtDataCollector>::Visit##CLASS(S);                      \
+  }
+
+  DEF_ADD_DATA(Stmt, { addData(S->getStmtClass()); })
+  DEF_ADD_DATA(Expr, { addData(S->getType()); })
+
+  //--- Builtin functionality ----------------------------------------------//
+  DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); })
+  DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); })
+  DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); })
+  DEF_ADD_DATA(TypeTraitExpr, {
+    addData(S->getTrait());
+    for (unsigned i = 0; i < S->getNumArgs(); ++i)
+      addData(S->getArg(i)->getType());
+  })
+
+  //--- Calls --------------------------------------------------------------//
+  DEF_ADD_DATA(CallExpr, {
+    // Calls without a direct callee (e.g. through a function pointer) have no
+    // callee name to add, so the name is skipped for them.
+    if (S->getDirectCallee())
+      addData(S->getDirectCallee()->getQualifiedNameAsString());
+  })
+  //--- Exceptions ---------------------------------------------------------//
+  DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); })
+
+  //--- C++ OOP Stmts ------------------------------------------------------//
+  DEF_ADD_DATA(CXXDeleteExpr, {
+    addData(S->isArrayFormAsWritten());
+    addData(S->isGlobalDelete());
+  })
+
+  //--- Casts --------------------------------------------------------------//
+  DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); })
+
+  //--- Miscellaneous Exprs ------------------------------------------------//
+  DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); })
+  DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); })
+
+  //--- Control flow -------------------------------------------------------//
+  DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); })
+  DEF_ADD_DATA(IndirectGotoStmt, {
+    if (S->getConstantTarget())
+      addData(S->getConstantTarget()->getName());
+  })
+  DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); })
+  DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); })
+  DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); })
+
+  //--- Objective-C --------------------------------------------------------//
+  DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); })
+  DEF_ADD_DATA(ObjCPropertyRefExpr, {
+    addData(S->isSuperReceiver());
+    addData(S->isImplicitProperty());
+  })
+  DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); })
+
+  //--- Miscellaneous Stmts ------------------------------------------------//
+  DEF_ADD_DATA(CXXFoldExpr, {
+    addData(S->isRightFold());
+    addData(S->getOperator());
+  })
+  DEF_ADD_DATA(GenericSelectionExpr, {
+    for (unsigned i = 0; i < S->getNumAssocs(); ++i) {
+      addData(S->getAssocType(i));
+    }
+  })
+  DEF_ADD_DATA(LambdaExpr, {
+    for (const LambdaCapture &C : S->captures()) {
+      addData(C.isPackExpansion());
+      addData(C.getCaptureKind());
+      if (C.capturesVariable())
+        addData(C.getCapturedVar()->getType());
+    }
+    addData(S->isGenericLambda());
+    addData(S->isMutable());
+  })
+  DEF_ADD_DATA(DeclStmt, {
+    auto numDecls = std::distance(S->decl_begin(), S->decl_end());
+    addData(static_cast<CloneDetector::DataPiece>(numDecls));
+    for (const Decl *D : S->decls()) {
+      if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
+        addData(VD->getType());
+      }
+    }
+  })
+  DEF_ADD_DATA(AsmStmt, {
+    addData(S->isSimple());
+    addData(S->isVolatile());
+    addData(S->generateAsmString(Context));
+    for (unsigned i = 0; i < S->getNumInputs(); ++i) {
+      addData(S->getInputConstraint(i));
+    }
+    for (unsigned i = 0; i < S->getNumOutputs(); ++i) {
+      addData(S->getOutputConstraint(i));
+    }
+    for (unsigned i = 0; i < S->getNumClobbers(); ++i) {
+      addData(S->getClobber(i));
+    }
+  })
+  DEF_ADD_DATA(AttributedStmt, {
+    for (const Attr *A : S->getAttrs()) {
+      addData(std::string(A->getSpelling()));
+    }
+  })
+};
+} // end anonymous namespace
+
+namespace {
+/// Generates CloneSignatures for a set of statements and stores the results in
+/// a CloneDetector object.
+class CloneSignatureGenerator {
+  // Receiver of all generated signatures, and the ASTContext of the statements.
+  CloneDetector &CD;
+  ASTContext &Context;
+
+  /// \brief Generates CloneSignatures for all statements in the given statement
+  /// tree and stores them in the CloneDetector.
+  ///
+  /// \param S The root of the given statement tree.
+  /// \return The CloneSignature of the root statement.
+  CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
+    // Create an empty signature that will be filled in this method.
+    CloneDetector::CloneSignature Signature;
+
+    // Collect all relevant data from S and put it into the empty signature.
+    StmtDataCollector(S, Context, Signature.Data);
+
+    // Storage for the signatures of the direct child statements. This is only
+    // needed if the current statement is a CompoundStmt.
+    std::vector<CloneDetector::CloneSignature> ChildSignatures;
+    const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
+
+    // The signature of a statement includes the signatures of its children.
+    // Therefore we create the signatures for every child and add them to the
+    // current signature.
+    for (const Stmt *Child : S->children()) {
+      // Some statements like 'if' can have nullptr children that we will skip.
+      if (!Child)
+        continue;
+
+      // Recursive call to create the signature of the child statement. This
+      // will also create and store all clone groups in this child statement.
+      auto ChildSignature = generateSignatures(Child);
+
+      // Add the collected data to the signature of the current statement.
+      Signature.add(ChildSignature);
+
+      // If the current statement is a CompoundStmt, we need to store the
+      // signature for the generation of the sub-sequences.
+      if (CS)
+        ChildSignatures.push_back(ChildSignature);
+    }
+
+    // If the current statement is a CompoundStmt, we also need to create the
+    // clone groups from the sub-sequences inside the children.
+    if (CS)
+      handleSubSequences(CS, ChildSignatures);
+
+    // Save the signature for the current statement in the CloneDetector object.
+    CD.add(StmtSequence(S, Context), Signature);
+
+    return Signature;
+  }
+
+  /// \brief Adds all possible sub-sequences in the child array of the given
+  ///        CompoundStmt to the CloneDetector.
+  /// \param CS The given CompoundStmt.
+  /// \param ChildSignatures A list of calculated signatures for each child in
+  ///                        the given CompoundStmt.
+  void handleSubSequences(
+      const CompoundStmt *CS,
+      const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {
+    // FIXME: This function enumerates a quadratic number of sub-sequences and
+    // rebuilds the signature of each one from scratch. Check if skipping this
+    // function for too long CompoundStmts is an option.
+
+    // The length of the sub-sequence. We don't need to handle sequences with
+    // the length 1 as they are already handled in generateSignatures().
+    for (unsigned Length = 2; Length <= CS->size(); ++Length) {
+      // The start index in the body of the CompoundStmt. We increase the
+      // position until the end of the sub-sequence reaches the end of the
+      // CompoundStmt body.
+      for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+        // Create an empty signature and add the signatures of all selected
+        // child statements to it.
+        CloneDetector::CloneSignature SubSignature;
+
+        for (unsigned i = Pos; i < Pos + Length; ++i) {
+          SubSignature.add(ChildSignatures[i]);
+        }
+
+        // Save the signature together with the information about what children
+        // sequence we selected.
+        CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
+      }
+    }
+  }
+
+public:
+  explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
+      : CD(CD), Context(Context) {}
+
+  /// \brief Generates signatures for all statements in the given function body.
+  void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
+};
+} // end anonymous namespace
+
+void CloneDetector::analyzeCodeBody(const Decl *D) {
+  assert(D);
+  assert(D->hasBody());
+  CloneSignatureGenerator(*this, D->getASTContext())
+      .consumeCodeBody(D->getBody());
+}
+
+void CloneDetector::add(const StmtSequence &S,
+                        const CloneSignature &Signature) {
+  // StringMap only works with StringRefs, so view the bytes of the data vector
+  // as one. Its length is derived from DataPiece, the vector's element type.
+  auto &Data = Signature.Data;
+  StringRef DataRef(reinterpret_cast<const char *>(Data.data()),
+                    Data.size() * sizeof(DataPiece));
+
+  // Search with the help of the signature if we already have encountered a
+  // clone of the given StmtSequence.
+  auto I = CloneGroupIndexes.find(DataRef);
+  if (I == CloneGroupIndexes.end()) {
+    // We haven't found an existing clone group, so we create a new clone group
+    // for this StmtSequence and store the index of it in our search map.
+    CloneGroupIndexes[DataRef] = CloneGroups.size();
+    CloneGroups.emplace_back(S, Signature.Complexity);
+    return;
+  }
+
+  // Otherwise expand the existing clone group with the given StmtSequence.
+  CloneGroups[I->getValue()].Sequences.push_back(S);
+}
+
+namespace {
+/// \brief Checks whether \p Stmt encloses any sequence of \p Group.
+/// \return true as soon as one sequence in \p Group is contained in \p Stmt.
+bool containsAnyInGroup(StmtSequence &Stmt, CloneDetector::CloneGroup &Group) {
+  auto Candidate = Group.Sequences.begin(), End = Group.Sequences.end();
+  while (Candidate != End && !Stmt.contains(*Candidate))
+    ++Candidate;
+  // Stopping before the end means the loop found a contained sequence.
+  return Candidate != End;
+}
+
+/// \brief Checks whether \p Group subsumes \p OtherGroup.
+/// \return true if and only if \p Group has at least as many sequences as
+/// \p OtherGroup and each of them contains a sequence of \p OtherGroup.
+bool containsGroup(CloneDetector::CloneGroup &Group,
+                   CloneDetector::CloneGroup &OtherGroup) {
+  // A smaller group can never cover a bigger one. This shortcut relies on the
+  // fact that a sequence in Group contains at most one sequence in OtherGroup.
+  const bool GroupIsSmaller =
+      Group.Sequences.size() < OtherGroup.Sequences.size();
+  if (GroupIsSmaller)
+    return false;
+
+  for (StmtSequence &Candidate : Group.Sequences)
+    if (!containsAnyInGroup(Candidate, OtherGroup))
+      return false;
+  return true;
+}
+} // end anonymous namespace
+
+/// \brief Splits a group of presumed clones into groups of actual clones.
+/// \param Result Output parameter to which all found groups are appended. All
+///               clones within one appended group follow the same variable
+///               pattern.
+/// \param Group A group of clones. The clones are allowed to have a different
+///              variable pattern.
+static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
+                              const CloneDetector::CloneGroup &Group) {
+  // Sequences get removed one by one from arbitrary positions, so they are
+  // copied into a list first.
+  std::list<StmtSequence> Pending(Group.Sequences.begin(),
+                                  Group.Sequences.end());
+
+  // A single leftover sequence has no partner left to form a clone group
+  // with, so the search stops as soon as fewer than two sequences are pending.
+  while (Pending.size() > 1) {
+    // The first pending sequence becomes the prototype of a new clone group.
+    StmtSequence Prototype = Pending.front();
+    Pending.pop_front();
+
+    CloneDetector::CloneGroup Candidates(Prototype, Group.Complexity);
+
+    // Only sequences whose variable pattern equals the one of the prototype
+    // are allowed to join the new clone group.
+    VariablePattern PrototypePattern(Prototype);
+
+    // Move every pending sequence with a matching pattern into the new group
+    // and leave all others pending for a later prototype.
+    for (auto It = Pending.begin(); It != Pending.end();) {
+      if (!VariablePattern(*It).comparePattern(PrototypePattern)) {
+        ++It;
+        continue;
+      }
+      Candidates.Sequences.push_back(*It);
+      // erase() hands back the position following the removed sequence.
+      It = Pending.erase(It);
+    }
+
+    // Report the new group only if it turned out to be a valid clone group.
+    if (!Candidates.isValid())
+      continue;
+    Result.push_back(Candidates);
+  }
+}
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               unsigned MinGroupComplexity) {
+  // Split every valid and sufficiently complex group of presumed clones and
+  // append the resulting groups to Result.
+  for (const CloneGroup &Group : CloneGroups) {
+    if (!Group.isValid() || Group.Complexity < MinGroupComplexity)
+      continue;
+    createCloneGroups(Result, Group);
+  }
+
+  // Positions of all groups in Result that are contained in another group.
+  std::vector<unsigned> Redundant;
+
+  // Compare every group in the result with the rest. If one group contains
+  // another group, only the bigger group needs to be returned.
+  // Note: This doesn't scale well, so if possible avoid calling any heavy
+  // function from this loop to minimize the performance impact.
+  for (unsigned Inner = 0; Inner < Result.size(); ++Inner) {
+    for (unsigned Outer = 0; Outer < Result.size(); ++Outer) {
+      // A group is never compared with itself.
+      if (Inner == Outer || !containsGroup(Result[Outer], Result[Inner]))
+        continue;
+
+      // One enclosing group is enough to drop the inner one.
+      Redundant.push_back(Inner);
+      break;
+    }
+  }
+
+  // Redundant holds increasing positions. Erasing from the highest position
+  // downwards keeps the positions that are still to be erased valid, so the
+  // vector is walked in reverse.
+  for (auto Pos = Redundant.rbegin(); Pos != Redundant.rend(); ++Pos)
+    Result.erase(Result.begin() + *Pos);
+}
diff --git a/lib/Analysis/Consumed.cpp b/lib/Analysis/Consumed.cpp
index 9df2392..f6fe78a 100644
--- a/lib/Analysis/Consumed.cpp
+++ b/lib/Analysis/Consumed.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Analysis/Analyses/Consumed.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
@@ -20,16 +21,12 @@
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtVisitor.h"
 #include "clang/AST/Type.h"
-#include "clang/Analysis/Analyses/Consumed.h"
 #include "clang/Analysis/Analyses/PostOrderCFGView.h"
 #include "clang/Analysis/AnalysisContext.h"
 #include "clang/Analysis/CFG.h"
 #include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/raw_ostream.h"
 #include <memory>
 
 // TODO: Adjust states of args to constructors in the same way that arguments to
@@ -466,9 +463,15 @@
   MapType PropagationMap;
 
   InfoEntry findInfo(const Expr *E) {
+    if (auto Cleanups = dyn_cast<ExprWithCleanups>(E))
+      if (!Cleanups->cleanupsHaveSideEffects())
+        E = Cleanups->getSubExpr();
     return PropagationMap.find(E->IgnoreParens());
   }
   ConstInfoEntry findInfo(const Expr *E) const {
+    if (auto Cleanups = dyn_cast<ExprWithCleanups>(E))
+      if (!Cleanups->cleanupsHaveSideEffects())
+        E = Cleanups->getSubExpr();
     return PropagationMap.find(E->IgnoreParens());
   }
   void insertInfo(const Expr *E, const PropagationInfo &PI) {
@@ -1356,7 +1359,7 @@
   ConsumedStmtVisitor Visitor(AC, *this, CurrStates.get());
 
   // Add all trackable parameters to the state map.
-  for (const auto *PI : D->params())
+  for (const auto *PI : D->parameters())
     Visitor.VisitParmVarDecl(PI);
   
   // Visit all of the function's basic blocks.
diff --git a/lib/Analysis/FormatString.cpp b/lib/Analysis/FormatString.cpp
index b1c868a..a0a357d 100644
--- a/lib/Analysis/FormatString.cpp
+++ b/lib/Analysis/FormatString.cpp
@@ -191,13 +191,21 @@
       return false;
     case 'h':
       ++I;
-      lmKind = (I != E && *I == 'h') ? (++I, LengthModifier::AsChar)
-                                     : LengthModifier::AsShort;
+      if (I != E && *I == 'h') {
+        ++I;
+        lmKind = LengthModifier::AsChar;
+      } else {
+        lmKind = LengthModifier::AsShort;
+      }
       break;
     case 'l':
       ++I;
-      lmKind = (I != E && *I == 'l') ? (++I, LengthModifier::AsLongLong)
-                                     : LengthModifier::AsLong;
+      if (I != E && *I == 'l') {
+        ++I;
+        lmKind = LengthModifier::AsLongLong;
+      } else {
+        lmKind = LengthModifier::AsLong;
+      }
       break;
     case 'j': lmKind = LengthModifier::AsIntMax;     ++I; break;
     case 'z': lmKind = LengthModifier::AsSizeT;      ++I; break;
@@ -687,7 +695,7 @@
           return true;
         case ConversionSpecifier::FreeBSDrArg:
         case ConversionSpecifier::FreeBSDyArg:
-          return Target.getTriple().isOSFreeBSD();
+          return Target.getTriple().isOSFreeBSD() || Target.getTriple().isPS4();
         default:
           return false;
       }
@@ -720,7 +728,7 @@
           return true;
         case ConversionSpecifier::FreeBSDrArg:
         case ConversionSpecifier::FreeBSDyArg:
-          return Target.getTriple().isOSFreeBSD();
+          return Target.getTriple().isOSFreeBSD() || Target.getTriple().isPS4();
         default:
           return false;
       }
diff --git a/lib/Analysis/FormatStringParsing.h b/lib/Analysis/FormatStringParsing.h
index 8463fce..17fd2f6 100644
--- a/lib/Analysis/FormatStringParsing.h
+++ b/lib/Analysis/FormatStringParsing.h
@@ -4,7 +4,6 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Type.h"
 #include "clang/Analysis/Analyses/FormatString.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace clang {
 
diff --git a/lib/Analysis/Makefile b/lib/Analysis/Makefile
deleted file mode 100644
index fbbb83d..0000000
--- a/lib/Analysis/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/Analysis/Makefile -------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-# This implements analyses built on top of source-level CFGs. 
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangAnalysis
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Analysis/OSLog.cpp b/lib/Analysis/OSLog.cpp
index 70e45e9..b6e9c49 100644
--- a/lib/Analysis/OSLog.cpp
+++ b/lib/Analysis/OSLog.cpp
@@ -55,9 +55,6 @@
     //  * "%.16P" len (non-arg), pointer to data
     //  * "%@" pointer to objc object
 
-    if (!FS.consumesDataArgument())
-      return false;
-
     unsigned argIndex = FS.getArgIndex();
     if (argIndex >= Args.size()) {
       return false;
diff --git a/lib/Analysis/PrintfFormatString.cpp b/lib/Analysis/PrintfFormatString.cpp
index cb39c05..1966344 100644
--- a/lib/Analysis/PrintfFormatString.cpp
+++ b/lib/Analysis/PrintfFormatString.cpp
@@ -648,9 +648,13 @@
   case BuiltinType::UInt128:
   case BuiltinType::Int128:
   case BuiltinType::Half:
+  case BuiltinType::Float128:
     // Various types which are non-trivial to correct.
     return false;
 
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
 #define SIGNED_TYPE(Id, SingletonId)
 #define UNSIGNED_TYPE(Id, SingletonId)
 #define FLOATING_TYPE(Id, SingletonId)
diff --git a/lib/Analysis/ThreadSafety.cpp b/lib/Analysis/ThreadSafety.cpp
index b282a5b..1417a5a 100644
--- a/lib/Analysis/ThreadSafety.cpp
+++ b/lib/Analysis/ThreadSafety.cpp
@@ -15,13 +15,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Analysis/Analyses/ThreadSafety.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtVisitor.h"
 #include "clang/Analysis/Analyses/PostOrderCFGView.h"
-#include "clang/Analysis/Analyses/ThreadSafety.h"
 #include "clang/Analysis/Analyses/ThreadSafetyCommon.h"
 #include "clang/Analysis/Analyses/ThreadSafetyLogical.h"
 #include "clang/Analysis/Analyses/ThreadSafetyTIL.h"
@@ -32,8 +32,6 @@
 #include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ImmutableMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/lib/Analysis/ThreadSafetyCommon.cpp b/lib/Analysis/ThreadSafetyCommon.cpp
index ffe95ea..96b317f 100644
--- a/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/lib/Analysis/ThreadSafetyCommon.cpp
@@ -17,20 +17,14 @@
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/StmtCXX.h"
-#include "clang/Analysis/Analyses/PostOrderCFGView.h"
 #include "clang/Analysis/Analyses/ThreadSafetyTIL.h"
 #include "clang/Analysis/Analyses/ThreadSafetyTraverse.h"
 #include "clang/Analysis/AnalysisContext.h"
 #include "clang/Analysis/CFG.h"
 #include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include <algorithm>
-#include <climits>
-#include <vector>
 
 using namespace clang;
 using namespace threadSafety;
diff --git a/lib/Basic/Builtins.cpp b/lib/Basic/Builtins.cpp
index fb6a645..28695d6 100644
--- a/lib/Basic/Builtins.cpp
+++ b/lib/Basic/Builtins.cpp
@@ -69,7 +69,9 @@
   bool MSModeUnsupported =
       !LangOpts.MicrosoftExt && (BuiltinInfo.Langs & MS_LANG);
   bool ObjCUnsupported = !LangOpts.ObjC1 && BuiltinInfo.Langs == OBJC_LANG;
-  return !BuiltinsUnsupported && !MathBuiltinsUnsupported &&
+  bool OclCUnsupported = LangOpts.OpenCLVersion != 200 &&
+                         BuiltinInfo.Langs == OCLC20_LANG;
+  return !BuiltinsUnsupported && !MathBuiltinsUnsupported && !OclCUnsupported &&
          !GnuModeUnsupported && !MSModeUnsupported && !ObjCUnsupported;
 }
 
diff --git a/lib/Basic/CMakeLists.txt b/lib/Basic/CMakeLists.txt
index f5e7f74..a34bf2a 100644
--- a/lib/Basic/CMakeLists.txt
+++ b/lib/Basic/CMakeLists.txt
@@ -53,12 +53,20 @@
 else()
   # Not producing a VC revision include.
   set(version_inc)
+
+  # Being able to force-set the SVN revision in cases where it isn't available
+  # is useful for performance tracking, and matches what the autoconf build did.
+  if(SVN_REVISION)
+    set_source_files_properties(Version.cpp
+      PROPERTIES COMPILE_DEFINITIONS "SVN_REVISION=\"${SVN_REVISION}\"")
+  endif()
 endif()
 
 add_clang_library(clangBasic
   Attributes.cpp
   Builtins.cpp
   CharInfo.cpp
+  Cuda.cpp
   Diagnostic.cpp
   DiagnosticIDs.cpp
   DiagnosticOptions.cpp
diff --git a/lib/Basic/Cuda.cpp b/lib/Basic/Cuda.cpp
new file mode 100644
index 0000000..3264078
--- /dev/null
+++ b/lib/Basic/Cuda.cpp
@@ -0,0 +1,185 @@
+#include "clang/Basic/Cuda.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace clang {
+
+// Returns the release number of V as text (e.g. "7.5"), or "unknown" for
+// CudaVersion::UNKNOWN.
+const char *CudaVersionToString(CudaVersion V) {
+  switch (V) {
+  case CudaVersion::UNKNOWN:
+    return "unknown";
+  case CudaVersion::CUDA_70:
+    return "7.0";
+  case CudaVersion::CUDA_75:
+    return "7.5";
+  case CudaVersion::CUDA_80:
+    return "8.0";
+  }
+  llvm_unreachable("invalid enum");
+}
+
+// Returns the "sm_NN" spelling of A, or "unknown" for CudaArch::UNKNOWN.
+const char *CudaArchToString(CudaArch A) {
+  switch (A) {
+  case CudaArch::UNKNOWN:
+    return "unknown";
+  case CudaArch::SM_20:
+    return "sm_20";
+  case CudaArch::SM_21:
+    return "sm_21";
+  case CudaArch::SM_30:
+    return "sm_30";
+  case CudaArch::SM_32:
+    return "sm_32";
+  case CudaArch::SM_35:
+    return "sm_35";
+  case CudaArch::SM_37:
+    return "sm_37";
+  case CudaArch::SM_50:
+    return "sm_50";
+  case CudaArch::SM_52:
+    return "sm_52";
+  case CudaArch::SM_53:
+    return "sm_53";
+  case CudaArch::SM_60:
+    return "sm_60";
+  case CudaArch::SM_61:
+    return "sm_61";
+  case CudaArch::SM_62:
+    return "sm_62";
+  }
+  llvm_unreachable("invalid enum");
+}
+
+// Inverse of CudaArchToString: maps an "sm_NN" spelling to its CudaArch.
+// Any string not listed below yields CudaArch::UNKNOWN.
+CudaArch StringToCudaArch(llvm::StringRef S) {
+  return llvm::StringSwitch<CudaArch>(S)
+      .Case("sm_20", CudaArch::SM_20)
+      .Case("sm_21", CudaArch::SM_21)
+      .Case("sm_30", CudaArch::SM_30)
+      .Case("sm_32", CudaArch::SM_32)
+      .Case("sm_35", CudaArch::SM_35)
+      .Case("sm_37", CudaArch::SM_37)
+      .Case("sm_50", CudaArch::SM_50)
+      .Case("sm_52", CudaArch::SM_52)
+      .Case("sm_53", CudaArch::SM_53)
+      .Case("sm_60", CudaArch::SM_60)
+      .Case("sm_61", CudaArch::SM_61)
+      .Case("sm_62", CudaArch::SM_62)
+      .Default(CudaArch::UNKNOWN);
+}
+
+// Returns the "compute_NN" spelling of A, or "unknown" for
+// CudaVirtualArch::UNKNOWN.
+const char *CudaVirtualArchToString(CudaVirtualArch A) {
+  switch (A) {
+  case CudaVirtualArch::UNKNOWN:
+    return "unknown";
+  case CudaVirtualArch::COMPUTE_20:
+    return "compute_20";
+  case CudaVirtualArch::COMPUTE_30:
+    return "compute_30";
+  case CudaVirtualArch::COMPUTE_32:
+    return "compute_32";
+  case CudaVirtualArch::COMPUTE_35:
+    return "compute_35";
+  case CudaVirtualArch::COMPUTE_37:
+    return "compute_37";
+  case CudaVirtualArch::COMPUTE_50:
+    return "compute_50";
+  case CudaVirtualArch::COMPUTE_52:
+    return "compute_52";
+  case CudaVirtualArch::COMPUTE_53:
+    return "compute_53";
+  case CudaVirtualArch::COMPUTE_60:
+    return "compute_60";
+  case CudaVirtualArch::COMPUTE_61:
+    return "compute_61";
+  case CudaVirtualArch::COMPUTE_62:
+    return "compute_62";
+  }
+  llvm_unreachable("invalid enum");
+}
+
+// Maps a "compute_NN" spelling to its CudaVirtualArch. Any string not listed
+// below yields CudaVirtualArch::UNKNOWN.
+CudaVirtualArch StringToCudaVirtualArch(llvm::StringRef S) {
+  return llvm::StringSwitch<CudaVirtualArch>(S)
+      .Case("compute_20", CudaVirtualArch::COMPUTE_20)
+      .Case("compute_30", CudaVirtualArch::COMPUTE_30)
+      .Case("compute_32", CudaVirtualArch::COMPUTE_32)
+      .Case("compute_35", CudaVirtualArch::COMPUTE_35)
+      .Case("compute_37", CudaVirtualArch::COMPUTE_37)
+      .Case("compute_50", CudaVirtualArch::COMPUTE_50)
+      .Case("compute_52", CudaVirtualArch::COMPUTE_52)
+      .Case("compute_53", CudaVirtualArch::COMPUTE_53)
+      .Case("compute_60", CudaVirtualArch::COMPUTE_60)
+      .Case("compute_61", CudaVirtualArch::COMPUTE_61)
+      .Case("compute_62", CudaVirtualArch::COMPUTE_62)
+      .Default(CudaVirtualArch::UNKNOWN);
+}
+
+// Returns the virtual architecture paired with GPU architecture A. SM_20 and
+// SM_21 both map to COMPUTE_20; every other arch maps to the same-numbered
+// COMPUTE_NN value.
+CudaVirtualArch VirtualArchForCudaArch(CudaArch A) {
+  switch (A) {
+  case CudaArch::UNKNOWN:
+    return CudaVirtualArch::UNKNOWN;
+  case CudaArch::SM_20:
+  case CudaArch::SM_21:
+    return CudaVirtualArch::COMPUTE_20;
+  case CudaArch::SM_30:
+    return CudaVirtualArch::COMPUTE_30;
+  case CudaArch::SM_32:
+    return CudaVirtualArch::COMPUTE_32;
+  case CudaArch::SM_35:
+    return CudaVirtualArch::COMPUTE_35;
+  case CudaArch::SM_37:
+    return CudaVirtualArch::COMPUTE_37;
+  case CudaArch::SM_50:
+    return CudaVirtualArch::COMPUTE_50;
+  case CudaArch::SM_52:
+    return CudaVirtualArch::COMPUTE_52;
+  case CudaArch::SM_53:
+    return CudaVirtualArch::COMPUTE_53;
+  case CudaArch::SM_60:
+    return CudaVirtualArch::COMPUTE_60;
+  case CudaArch::SM_61:
+    return CudaVirtualArch::COMPUTE_61;
+  case CudaArch::SM_62:
+    return CudaVirtualArch::COMPUTE_62;
+  }
+  llvm_unreachable("invalid enum");
+}
+
+// Returns the CudaVersion this table associates with A: CUDA_80 for the
+// SM_6x archs, CUDA_70 for every other known arch, UNKNOWN for UNKNOWN.
+CudaVersion MinVersionForCudaArch(CudaArch A) {
+  switch (A) {
+  case CudaArch::UNKNOWN:
+    return CudaVersion::UNKNOWN;
+  case CudaArch::SM_20:
+  case CudaArch::SM_21:
+  case CudaArch::SM_30:
+  case CudaArch::SM_32:
+  case CudaArch::SM_35:
+  case CudaArch::SM_37:
+  case CudaArch::SM_50:
+  case CudaArch::SM_52:
+  case CudaArch::SM_53:
+    return CudaVersion::CUDA_70;
+  case CudaArch::SM_60:
+  case CudaArch::SM_61:
+  case CudaArch::SM_62:
+    return CudaVersion::CUDA_80;
+  }
+  llvm_unreachable("invalid enum");
+}
+
+} // namespace clang
diff --git a/lib/Basic/Diagnostic.cpp b/lib/Basic/Diagnostic.cpp
index f10d156..1f4316a 100644
--- a/lib/Basic/Diagnostic.cpp
+++ b/lib/Basic/Diagnostic.cpp
@@ -55,10 +55,12 @@
   Output.append(Str.begin(), Str.end());
 }
 
-DiagnosticsEngine::DiagnosticsEngine(
-    const IntrusiveRefCntPtr<DiagnosticIDs> &diags, DiagnosticOptions *DiagOpts,
-    DiagnosticConsumer *client, bool ShouldOwnClient)
-    : Diags(diags), DiagOpts(DiagOpts), Client(nullptr), SourceMgr(nullptr) {
+DiagnosticsEngine::DiagnosticsEngine(IntrusiveRefCntPtr<DiagnosticIDs> diags,
+                                     DiagnosticOptions *DiagOpts,
+                                     DiagnosticConsumer *client,
+                                     bool ShouldOwnClient)
+    : Diags(std::move(diags)), DiagOpts(DiagOpts), Client(nullptr),
+      SourceMgr(nullptr) {
   setClient(client, ShouldOwnClient);
   ArgToStringFn = DummyArgToStringFn;
   ArgToStringCookie = nullptr;
@@ -1008,7 +1010,7 @@
 PartialDiagnostic::StorageAllocator::~StorageAllocator() {
   // Don't assert if we are in a CrashRecovery context, as this invariant may
   // be invalidated during a crash.
-  assert((NumFreeListEntries == NumCached || 
-          llvm::CrashRecoveryContext::isRecoveringFromCrash()) && 
-         "A partial is on the lamb");
+  assert((NumFreeListEntries == NumCached ||
+          llvm::CrashRecoveryContext::isRecoveringFromCrash()) &&
+         "A partial is on the lam");
 }
diff --git a/lib/Basic/DiagnosticOptions.cpp b/lib/Basic/DiagnosticOptions.cpp
index f54a0ef..93c2196 100644
--- a/lib/Basic/DiagnosticOptions.cpp
+++ b/lib/Basic/DiagnosticOptions.cpp
@@ -16,7 +16,7 @@
 
 namespace clang {
 
-raw_ostream& operator<<(raw_ostream& Out, DiagnosticLevelMask M) {
+raw_ostream &operator<<(raw_ostream &Out, DiagnosticLevelMask M) {
   using UT = std::underlying_type<DiagnosticLevelMask>::type;
   return Out << static_cast<UT>(M);
 }
diff --git a/lib/Basic/FileManager.cpp b/lib/Basic/FileManager.cpp
index c4cc8dc..94286e2 100644
--- a/lib/Basic/FileManager.cpp
+++ b/lib/Basic/FileManager.cpp
@@ -26,10 +26,13 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
-#include <map>
-#include <set>
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <cstdlib>
 #include <string>
-#include <system_error>
+#include <utility>
 
 using namespace clang;
 
@@ -312,6 +315,9 @@
   UFE.InPCH = Data.InPCH;
   UFE.File = std::move(F);
   UFE.IsValid = true;
+  if (UFE.File)
+    if (auto RealPathName = UFE.File->getName())
+      UFE.RealPathName = *RealPathName;
   return &UFE;
 }
 
@@ -494,7 +500,6 @@
   UniqueRealFiles.erase(Entry->getUniqueID());
 }
 
-
 void FileManager::GetUniqueIDMapping(
                    SmallVectorImpl<const FileEntry *> &UIDToFiles) const {
   UIDToFiles.clear();
diff --git a/lib/Basic/IdentifierTable.cpp b/lib/Basic/IdentifierTable.cpp
index 67de1cb..537a2b7 100644
--- a/lib/Basic/IdentifierTable.cpp
+++ b/lib/Basic/IdentifierTable.cpp
@@ -42,6 +42,7 @@
   NeedsHandleIdentifier = false;
   IsFromAST = false;
   ChangedAfterLoad = false;
+  FEChangedAfterLoad = false;
   RevertedTokenID = false;
   OutOfDate = false;
   IsModulesImport = false;
@@ -112,7 +113,8 @@
     KEYOBJC2    = 0x20000,
     KEYZVECTOR  = 0x40000,
     KEYCOROUTINES = 0x80000,
-    KEYALL = (0xfffff & ~KEYNOMS18 &
+    KEYMODULES = 0x100000,
+    KEYALL = (0x1fffff & ~KEYNOMS18 &
               ~KEYNOOPENCL) // KEYNOMS18 and KEYNOOPENCL are used to exclude.
   };
 
@@ -146,9 +148,10 @@
   // We treat bridge casts as objective-C keywords so we can warn on them
   // in non-arc mode.
   if (LangOpts.ObjC2 && (Flags & KEYARC)) return KS_Enabled;
-  if (LangOpts.ConceptsTS && (Flags & KEYCONCEPTS)) return KS_Enabled;
   if (LangOpts.ObjC2 && (Flags & KEYOBJC2)) return KS_Enabled;
+  if (LangOpts.ConceptsTS && (Flags & KEYCONCEPTS)) return KS_Enabled;
   if (LangOpts.Coroutines && (Flags & KEYCOROUTINES)) return KS_Enabled;
+  if (LangOpts.ModulesTS && (Flags & KEYMODULES)) return KS_Enabled;
   if (LangOpts.CPlusPlus && (Flags & KEYCXX11)) return KS_Future;
   return KS_Disabled;
 }
diff --git a/lib/Basic/LangOptions.cpp b/lib/Basic/LangOptions.cpp
index 1b08b06..8c0ecd4 100644
--- a/lib/Basic/LangOptions.cpp
+++ b/lib/Basic/LangOptions.cpp
@@ -34,7 +34,6 @@
   SanitizerBlacklistFiles.clear();
 
   CurrentModule.clear();
-  ImplementationOfModule.clear();
 }
 
 bool LangOptions::isNoBuiltinFunc(const char *Name) const {
diff --git a/lib/Basic/Makefile b/lib/Basic/Makefile
deleted file mode 100644
index fe2c515..0000000
--- a/lib/Basic/Makefile
+++ /dev/null
@@ -1,40 +0,0 @@
-##===- clang/lib/Basic/Makefile ----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-#  This implements the Basic library for the C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangBasic
-
-include $(CLANG_LEVEL)/Makefile
-
-SVN_REVISION := $(strip \
-        $(shell $(LLVM_SRC_ROOT)/utils/GetSourceVersion $(PROJ_SRC_DIR)/../..))
-
-SVN_REPOSITORY := $(strip \
-        $(shell $(LLVM_SRC_ROOT)/utils/GetRepositoryPath $(PROJ_SRC_DIR)/../..))
-
-LLVM_REVISION := $(strip \
-        $(shell $(LLVM_SRC_ROOT)/utils/GetSourceVersion $(LLVM_SRC_ROOT)))
-
-LLVM_REPOSITORY := $(strip \
-        $(shell $(LLVM_SRC_ROOT)/utils/GetRepositoryPath $(LLVM_SRC_ROOT)))
-
-CPP.Defines += -I$(PROJ_SRC_DIR)/../../include -I$(PROJ_OBJ_DIR)/../../include \
-         -DSVN_REVISION='"$(SVN_REVISION)"' -DSVN_REPOSITORY='"$(SVN_REPOSITORY)"' \
-         -DLLVM_REVISION='"$(LLVM_REVISION)"' -DLLVM_REPOSITORY='"$(LLVM_REPOSITORY)"'
-
-$(ObjDir)/.ver-svn .ver: $(ObjDir)/.dir
-	@if [ '$(SVN_REVISION) $(LLVM_REVISION)' != '$(shell cat $(ObjDir)/.ver-svn 2>/dev/null)' ]; then\
-		echo '$(SVN_REVISION) $(LLVM_REVISION)' > $(ObjDir)/.ver-svn;                    \
-	fi
-$(ObjDir)/.ver-svn: .ver
-$(ObjDir)/Version.o: $(ObjDir)/.ver-svn
diff --git a/lib/Basic/Module.cpp b/lib/Basic/Module.cpp
index 9c39ecb..7f140e8 100644
--- a/lib/Basic/Module.cpp
+++ b/lib/Basic/Module.cpp
@@ -65,6 +65,7 @@
                         .Case("blocks", LangOpts.Blocks)
                         .Case("cplusplus", LangOpts.CPlusPlus)
                         .Case("cplusplus11", LangOpts.CPlusPlus11)
+                        .Case("gnuinlineasm", LangOpts.GNUAsm)
                         .Case("objc", LangOpts.ObjC1)
                         .Case("objc_arc", LangOpts.ObjCAutoRefCount)
                         .Case("opencl", LangOpts.OpenCL)
@@ -426,12 +427,8 @@
     OS.indent(Indent + 2);
     OS << "export ";
     printModuleId(OS, UnresolvedExports[I].Id);
-    if (UnresolvedExports[I].Wildcard) {
-      if (UnresolvedExports[I].Id.empty())
-        OS << "*";
-      else
-        OS << ".*";
-    }
+    if (UnresolvedExports[I].Wildcard)
+      OS << (UnresolvedExports[I].Id.empty() ? "*" : ".*");
     OS << "\n";
   }
 
diff --git a/lib/Basic/OpenMPKinds.cpp b/lib/Basic/OpenMPKinds.cpp
index 687bf4c..4b41ab1 100644
--- a/lib/Basic/OpenMPKinds.cpp
+++ b/lib/Basic/OpenMPKinds.cpp
@@ -55,6 +55,7 @@
   return llvm::StringSwitch<OpenMPClauseKind>(Str)
 #define OPENMP_CLAUSE(Name, Class) .Case(#Name, OMPC_##Name)
 #include "clang/Basic/OpenMPKinds.def"
+      .Case("uniform", OMPC_uniform)
       .Default(OMPC_unknown);
 }
 
@@ -67,6 +68,8 @@
   case OMPC_##Name:                                                            \
     return #Name;
 #include "clang/Basic/OpenMPKinds.def"
+  case OMPC_uniform:
+    return "uniform";
   case OMPC_threadprivate:
     return "threadprivate or thread local";
   }
@@ -114,6 +117,14 @@
 #define OPENMP_DIST_SCHEDULE_KIND(Name) .Case(#Name, OMPC_DIST_SCHEDULE_##Name)
 #include "clang/Basic/OpenMPKinds.def"
         .Default(OMPC_DIST_SCHEDULE_unknown);
+  case OMPC_defaultmap:
+    return llvm::StringSwitch<unsigned>(Str)
+#define OPENMP_DEFAULTMAP_KIND(Name)                                           \
+  .Case(#Name, static_cast<unsigned>(OMPC_DEFAULTMAP_##Name))
+#define OPENMP_DEFAULTMAP_MODIFIER(Name)                                       \
+  .Case(#Name, static_cast<unsigned>(OMPC_DEFAULTMAP_MODIFIER_##Name))
+#include "clang/Basic/OpenMPKinds.def"
+        .Default(OMPC_DEFAULTMAP_unknown);
   case OMPC_unknown:
   case OMPC_threadprivate:
   case OMPC_if:
@@ -150,6 +161,11 @@
   case OMPC_nogroup:
   case OMPC_num_tasks:
   case OMPC_hint:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     break;
   }
   llvm_unreachable("Invalid OpenMP simple clause kind");
@@ -234,6 +250,20 @@
 #include "clang/Basic/OpenMPKinds.def"
     }
     llvm_unreachable("Invalid OpenMP 'dist_schedule' clause type");
+  case OMPC_defaultmap:
+    switch (Type) {
+    case OMPC_DEFAULTMAP_unknown:
+    case OMPC_DEFAULTMAP_MODIFIER_last:
+      return "unknown";
+#define OPENMP_DEFAULTMAP_KIND(Name)                                         \
+    case OMPC_DEFAULTMAP_##Name:                                             \
+      return #Name;
+#define OPENMP_DEFAULTMAP_MODIFIER(Name)                                     \
+    case OMPC_DEFAULTMAP_MODIFIER_##Name:                                    \
+      return #Name;
+#include "clang/Basic/OpenMPKinds.def"
+    }
+    llvm_unreachable("Invalid OpenMP 'defaultmap' clause type");
   case OMPC_unknown:
   case OMPC_threadprivate:
   case OMPC_if:
@@ -270,6 +300,11 @@
   case OMPC_nogroup:
   case OMPC_num_tasks:
   case OMPC_hint:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     break;
   }
   llvm_unreachable("Invalid OpenMP simple clause kind");
@@ -413,6 +448,56 @@
       break;
     }
     break;
+  case OMPD_target_enter_data:
+    switch (CKind) {
+#define OPENMP_TARGET_ENTER_DATA_CLAUSE(Name)                                  \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_target_exit_data:
+    switch (CKind) {
+#define OPENMP_TARGET_EXIT_DATA_CLAUSE(Name)                                   \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_target_parallel:
+    switch (CKind) {
+#define OPENMP_TARGET_PARALLEL_CLAUSE(Name)                                    \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_target_parallel_for:
+    switch (CKind) {
+#define OPENMP_TARGET_PARALLEL_FOR_CLAUSE(Name)                                \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_target_update:
+    switch (CKind) {
+#define OPENMP_TARGET_UPDATE_CLAUSE(Name)                                      \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
   case OMPD_teams:
     switch (CKind) {
 #define OPENMP_TEAMS_CLAUSE(Name)                                              \
@@ -423,6 +508,8 @@
       break;
     }
     break;
+  case OMPD_declare_simd:
+    break;
   case OMPD_cancel:
     switch (CKind) {
 #define OPENMP_CANCEL_CLAUSE(Name)                                             \
@@ -483,6 +570,68 @@
       break;
     }
     break;
+  case OMPD_distribute_parallel_for:
+    switch (CKind) {
+#define OPENMP_DISTRIBUTE_PARALLEL_FOR_CLAUSE(Name)                            \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_distribute_parallel_for_simd:
+    switch (CKind) {
+#define OPENMP_DISTRIBUTE_PARALLEL_FOR_SIMD_CLAUSE(Name)                       \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_distribute_simd:
+    switch (CKind) {
+#define OPENMP_DISTRIBUTE_SIMD_CLAUSE(Name)                                    \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_target_parallel_for_simd:
+    switch (CKind) {
+#define OPENMP_TARGET_PARALLEL_FOR_SIMD_CLAUSE(Name)                           \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_target_simd:
+    switch (CKind) {
+#define OPENMP_TARGET_SIMD_CLAUSE(Name)                                        \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_teams_distribute:
+    switch (CKind) {
+#define OPENMP_TEAMS_DISTRIBUTE_CLAUSE(Name)                                   \
+  case OMPC_##Name:                                                            \
+    return true;
+#include "clang/Basic/OpenMPKinds.def"
+    default:
+      break;
+    }
+    break;
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
   case OMPD_unknown:
   case OMPD_threadprivate:
   case OMPD_section:
@@ -492,6 +641,7 @@
   case OMPD_taskwait:
   case OMPD_taskgroup:
   case OMPD_cancellation_point:
+  case OMPD_declare_reduction:
     break;
   }
   return false;
@@ -500,17 +650,26 @@
 bool clang::isOpenMPLoopDirective(OpenMPDirectiveKind DKind) {
   return DKind == OMPD_simd || DKind == OMPD_for || DKind == OMPD_for_simd ||
          DKind == OMPD_parallel_for || DKind == OMPD_parallel_for_simd ||
-         DKind == OMPD_taskloop ||
-         DKind == OMPD_taskloop_simd ||
-         DKind == OMPD_distribute; // TODO add next directives.
+         DKind == OMPD_taskloop || DKind == OMPD_taskloop_simd ||
+         DKind == OMPD_distribute || DKind == OMPD_target_parallel_for ||
+         DKind == OMPD_distribute_parallel_for ||
+         DKind == OMPD_distribute_parallel_for_simd ||
+         DKind == OMPD_distribute_simd ||
+         DKind == OMPD_target_parallel_for_simd || DKind == OMPD_target_simd ||
+         DKind == OMPD_teams_distribute;
+  // TODO add next directives.
 }
 
 bool clang::isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind) {
   return DKind == OMPD_for || DKind == OMPD_for_simd ||
          DKind == OMPD_sections || DKind == OMPD_section ||
          DKind == OMPD_single || DKind == OMPD_parallel_for ||
-         DKind == OMPD_parallel_for_simd ||
-         DKind == OMPD_parallel_sections; // TODO add next directives.
+         DKind == OMPD_parallel_for_simd || DKind == OMPD_parallel_sections ||
+         DKind == OMPD_target_parallel_for ||
+         DKind == OMPD_distribute_parallel_for ||
+         DKind == OMPD_distribute_parallel_for_simd ||
+         DKind == OMPD_target_parallel_for_simd;
+  // TODO add next directives.
 }
 
 bool clang::isOpenMPTaskLoopDirective(OpenMPDirectiveKind DKind) {
@@ -519,26 +678,50 @@
 
 bool clang::isOpenMPParallelDirective(OpenMPDirectiveKind DKind) {
   return DKind == OMPD_parallel || DKind == OMPD_parallel_for ||
-         DKind == OMPD_parallel_for_simd ||
-         DKind == OMPD_parallel_sections; // TODO add next directives.
+         DKind == OMPD_parallel_for_simd || DKind == OMPD_parallel_sections ||
+         DKind == OMPD_target_parallel || DKind == OMPD_target_parallel_for ||
+         DKind == OMPD_distribute_parallel_for ||
+         DKind == OMPD_distribute_parallel_for_simd ||
+         DKind == OMPD_target_parallel_for_simd;
+  // TODO add next directives.
 }
 
-bool clang::isOpenMPTargetDirective(OpenMPDirectiveKind DKind) {
-  return DKind == OMPD_target; // TODO add next directives.
+bool clang::isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind) {
+  // TODO add next directives.
+  return DKind == OMPD_target || DKind == OMPD_target_parallel ||
+         DKind == OMPD_target_parallel_for ||
+         DKind == OMPD_target_parallel_for_simd || DKind == OMPD_target_simd;
+}
+
+bool clang::isOpenMPTargetDataManagementDirective(OpenMPDirectiveKind DKind) {
+  return DKind == OMPD_target_data || DKind == OMPD_target_enter_data ||
+         DKind == OMPD_target_exit_data || DKind == OMPD_target_update;
 }
 
 bool clang::isOpenMPTeamsDirective(OpenMPDirectiveKind DKind) {
-  return DKind == OMPD_teams; // TODO add next directives.
+  return DKind == OMPD_teams || DKind == OMPD_teams_distribute;
+  // TODO add next directives.
 }
 
 bool clang::isOpenMPSimdDirective(OpenMPDirectiveKind DKind) {
   return DKind == OMPD_simd || DKind == OMPD_for_simd ||
-         DKind == OMPD_parallel_for_simd ||
-         DKind == OMPD_taskloop_simd; // TODO add next directives.
+         DKind == OMPD_parallel_for_simd || DKind == OMPD_taskloop_simd ||
+         DKind == OMPD_distribute_parallel_for_simd ||
+         DKind == OMPD_distribute_simd || DKind == OMPD_target_simd;
+  // TODO add next directives.
+}
+
+bool clang::isOpenMPNestingDistributeDirective(OpenMPDirectiveKind Kind) {
+  return Kind == OMPD_distribute || Kind == OMPD_distribute_parallel_for ||
+         Kind == OMPD_distribute_parallel_for_simd ||
+         Kind == OMPD_distribute_simd;
+  // TODO add next directives.
 }
 
 bool clang::isOpenMPDistributeDirective(OpenMPDirectiveKind Kind) {
-  return Kind == OMPD_distribute; // TODO add next directives.
+  return isOpenMPNestingDistributeDirective(Kind) ||
+         Kind == OMPD_teams_distribute;
+  // TODO add next directives.
 }
 
 bool clang::isOpenMPPrivate(OpenMPClauseKind Kind) {
@@ -551,3 +734,12 @@
   return Kind == OMPC_threadprivate || Kind == OMPC_copyin;
 }
 
+bool clang::isOpenMPTaskingDirective(OpenMPDirectiveKind Kind) {
+  return Kind == OMPD_task || isOpenMPTaskLoopDirective(Kind);
+}
+
+bool clang::isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind) {
+  return Kind == OMPD_distribute_parallel_for ||
+         Kind == OMPD_distribute_parallel_for_simd ||
+         Kind == OMPD_distribute_simd || Kind == OMPD_teams_distribute;
+}
diff --git a/lib/Basic/OperatorPrecedence.cpp b/lib/Basic/OperatorPrecedence.cpp
index ade8d6d..384d23c 100644
--- a/lib/Basic/OperatorPrecedence.cpp
+++ b/lib/Basic/OperatorPrecedence.cpp
@@ -53,6 +53,7 @@
   case tok::pipeequal:            return prec::Assignment;
   case tok::question:             return prec::Conditional;
   case tok::pipepipe:             return prec::LogicalOr;
+  case tok::caretcaret:
   case tok::ampamp:               return prec::LogicalAnd;
   case tok::pipe:                 return prec::InclusiveOr;
   case tok::caret:                return prec::ExclusiveOr;
diff --git a/lib/Basic/SourceLocation.cpp b/lib/Basic/SourceLocation.cpp
index d254e86..a58d046 100644
--- a/lib/Basic/SourceLocation.cpp
+++ b/lib/Basic/SourceLocation.cpp
@@ -14,7 +14,6 @@
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/PrettyStackTrace.h"
 #include "clang/Basic/SourceManager.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdio>
 using namespace clang;
diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp
index 4c50161..605f990 100644
--- a/lib/Basic/SourceManager.cpp
+++ b/lib/Basic/SourceManager.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cstring>
-#include <string>
 
 using namespace clang;
 using namespace SrcMgr;
@@ -1160,7 +1159,8 @@
 
 // isInvalid - Return the result of calling loc.isInvalid(), and
 // if Invalid is not null, set its value to same.
-static bool isInvalid(SourceLocation Loc, bool *Invalid) {
+template<typename LocType>
+static bool isInvalid(LocType Loc, bool *Invalid) {
   bool MyInvalid = Loc.isInvalid();
   if (Invalid)
     *Invalid = MyInvalid;
@@ -1183,8 +1183,9 @@
 
 unsigned SourceManager::getPresumedColumnNumber(SourceLocation Loc,
                                                 bool *Invalid) const {
-  if (isInvalid(Loc, Invalid)) return 0;
-  return getPresumedLoc(Loc).getColumn();
+  PresumedLoc PLoc = getPresumedLoc(Loc);
+  if (isInvalid(PLoc, Invalid)) return 0;
+  return PLoc.getColumn();
 }
 
 #ifdef __SSE2__
@@ -1258,15 +1259,19 @@
 
     if (Buf[0] == '\n' || Buf[0] == '\r') {
       // If this is \n\r or \r\n, skip both characters.
-      if ((Buf[1] == '\n' || Buf[1] == '\r') && Buf[0] != Buf[1])
-        ++Offs, ++Buf;
-      ++Offs, ++Buf;
+      if ((Buf[1] == '\n' || Buf[1] == '\r') && Buf[0] != Buf[1]) {
+        ++Offs;
+        ++Buf;
+      }
+      ++Offs;
+      ++Buf;
       LineOffsets.push_back(Offs);
     } else {
       // Otherwise, this is a null.  If end of file, exit.
       if (Buf == End) break;
       // Otherwise, skip the null.
-      ++Offs, ++Buf;
+      ++Offs;
+      ++Buf;
     }
   }
 
@@ -1388,8 +1393,9 @@
 }
 unsigned SourceManager::getPresumedLineNumber(SourceLocation Loc,
                                               bool *Invalid) const {
-  if (isInvalid(Loc, Invalid)) return 0;
-  return getPresumedLoc(Loc).getLine();
+  PresumedLoc PLoc = getPresumedLoc(Loc);
+  if (isInvalid(PLoc, Invalid)) return 0;
+  return PLoc.getLine();
 }
 
 /// getFileCharacteristic - return the file characteristic of the specified
@@ -2089,10 +2095,10 @@
 
   // Clear the lookup cache, it depends on a common location.
   IsBeforeInTUCache.clear();
-  llvm::MemoryBuffer *LBuf = getBuffer(LOffs.first);
-  llvm::MemoryBuffer *RBuf = getBuffer(ROffs.first);
-  bool LIsBuiltins = strcmp("<built-in>", LBuf->getBufferIdentifier()) == 0;
-  bool RIsBuiltins = strcmp("<built-in>", RBuf->getBufferIdentifier()) == 0;
+  const char *LB = getBuffer(LOffs.first)->getBufferIdentifier();
+  const char *RB = getBuffer(ROffs.first)->getBufferIdentifier();
+  bool LIsBuiltins = strcmp("<built-in>", LB) == 0;
+  bool RIsBuiltins = strcmp("<built-in>", RB) == 0;
   // Sort built-in before non-built-in.
   if (LIsBuiltins || RIsBuiltins) {
     if (LIsBuiltins != RIsBuiltins)
@@ -2101,8 +2107,8 @@
     // lower IDs come first.
     return LOffs.first < ROffs.first;
   }
-  bool LIsAsm = strcmp("<inline asm>", LBuf->getBufferIdentifier()) == 0;
-  bool RIsAsm = strcmp("<inline asm>", RBuf->getBufferIdentifier()) == 0;
+  bool LIsAsm = strcmp("<inline asm>", LB) == 0;
+  bool RIsAsm = strcmp("<inline asm>", RB) == 0;
   // Sort assembler after built-ins, but before the rest.
   if (LIsAsm || RIsAsm) {
     if (LIsAsm != RIsAsm)
@@ -2110,6 +2116,14 @@
     assert(LOffs.first == ROffs.first);
     return false;
   }
+  bool LIsScratch = strcmp("<scratch space>", LB) == 0;
+  bool RIsScratch = strcmp("<scratch space>", RB) == 0;
+  // Sort scratch after inline asm, but before the rest.
+  if (LIsScratch || RIsScratch) {
+    if (LIsScratch != RIsScratch)
+      return LIsScratch;
+    return LOffs.second < ROffs.second;
+  }
   llvm_unreachable("Unsortable locations found");
 }
 
diff --git a/lib/Basic/TargetInfo.cpp b/lib/Basic/TargetInfo.cpp
index 1648a27..dec8b7c 100644
--- a/lib/Basic/TargetInfo.cpp
+++ b/lib/Basic/TargetInfo.cpp
@@ -30,6 +30,7 @@
   BigEndian = true;
   TLSSupported = true;
   NoAsmVariants = false;
+  HasFloat128 = false;
   PointerWidth = PointerAlign = 32;
   BoolWidth = BoolAlign = 8;
   IntWidth = IntAlign = 32;
@@ -46,6 +47,7 @@
   DoubleAlign = 64;
   LongDoubleWidth = 64;
   LongDoubleAlign = 64;
+  Float128Align = 128;
   LargeArrayMinWidth = 0;
   LargeArrayAlign = 0;
   MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 0;
@@ -66,18 +68,19 @@
   UseSignedCharForObjCBool = true;
   UseBitFieldTypeAlignment = true;
   UseZeroLengthBitfieldAlignment = false;
+  UseExplicitBitFieldAlignment = true;
   ZeroLengthBitfieldBoundary = 0;
   HalfFormat = &llvm::APFloat::IEEEhalf;
   FloatFormat = &llvm::APFloat::IEEEsingle;
   DoubleFormat = &llvm::APFloat::IEEEdouble;
   LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-  DataLayoutString = nullptr;
-  UserLabelPrefix = "_";
+  Float128Format = &llvm::APFloat::IEEEquad;
   MCountName = "mcount";
   RegParmMax = 0;
   SSERegParmMax = 0;
   HasAlignMac68kSupport = false;
   HasBuiltinMSVaList = false;
+  IsRenderScriptTarget = false;
 
   // Default to no types using fpret.
   RealTypeUsesObjCFPRet = 0;
@@ -224,6 +227,8 @@
     if (&getLongDoubleFormat() == &llvm::APFloat::PPCDoubleDouble ||
         &getLongDoubleFormat() == &llvm::APFloat::IEEEquad)
       return LongDouble;
+    if (hasFloat128Type())
+      return Float128;
     break;
   }
 
@@ -276,6 +281,10 @@
     UseBitFieldTypeAlignment = false;
   if (Opts.ShortWChar)
     WCharType = UnsignedShort;
+  if (Opts.AlignDouble) {
+    DoubleAlign = LongLongAlign = 64;
+    LongDoubleAlign = 64;
+  }
 
   if (Opts.OpenCL) {
     // OpenCL C requires specific widths for types, irrespective of
diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp
index 3b66b30..0fecdce 100644
--- a/lib/Basic/Targets.cpp
+++ b/lib/Basic/Targets.cpp
@@ -12,14 +12,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
 #include "clang/Basic/Version.h"
+#include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
@@ -65,6 +67,9 @@
     Builder.defineMacro("__tune_" + CPUName + "__");
 }
 
+static TargetInfo *AllocateTarget(const llvm::Triple &Triple,
+                                  const TargetOptions &Opts);
+
 //===----------------------------------------------------------------------===//
 // Defines specific to certain operating systems.
 //===----------------------------------------------------------------------===//
@@ -76,7 +81,8 @@
   virtual void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
                             MacroBuilder &Builder) const=0;
 public:
-  OSTargetInfo(const llvm::Triple &Triple) : TgtInfo(Triple) {}
+  OSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : TgtInfo(Triple, Opts) {}
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
     TgtInfo::getTargetDefines(Opts, Builder);
@@ -101,10 +107,8 @@
   }
 
 public:
-  CloudABITargetInfo(const llvm::Triple &Triple)
-      : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
-  }
+  CloudABITargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {}
 };
 
 static void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts,
@@ -237,7 +241,8 @@
   }
 
 public:
-  DarwinTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
+  DarwinTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     // By default, no TLS, and we whitelist permitted architecture/OS
     // combinations.
     this->TLSSupported = false;
@@ -304,10 +309,8 @@
     DefineStd(Builder, "unix", Opts);
   }
 public:
-  DragonFlyBSDTargetInfo(const llvm::Triple &Triple)
-      : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
-
+  DragonFlyBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     switch (Triple.getArch()) {
     default:
     case llvm::Triple::x86:
@@ -318,6 +321,10 @@
   }
 };
 
+#ifndef FREEBSD_CC_VERSION
+#define FREEBSD_CC_VERSION 0U
+#endif
+
 // FreeBSD Target
 template<typename Target>
 class FreeBSDTargetInfo : public OSTargetInfo<Target> {
@@ -328,10 +335,13 @@
 
     unsigned Release = Triple.getOSMajorVersion();
     if (Release == 0U)
-      Release = 8;
+      Release = 8U;
+    unsigned CCVersion = FREEBSD_CC_VERSION;
+    if (CCVersion == 0U)
+      CCVersion = Release * 100000U + 1U;
 
     Builder.defineMacro("__FreeBSD__", Twine(Release));
-    Builder.defineMacro("__FreeBSD_cc_version", Twine(Release * 100000U + 1U));
+    Builder.defineMacro("__FreeBSD_cc_version", Twine(CCVersion));
     Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
     DefineStd(Builder, "unix", Opts);
     Builder.defineMacro("__ELF__");
@@ -348,9 +358,8 @@
     Builder.defineMacro("__STDC_MB_MIGHT_NEQ_WC__", "1");
   }
 public:
-  FreeBSDTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
-
+  FreeBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     switch (Triple.getArch()) {
     default:
     case llvm::Triple::x86:
@@ -389,9 +398,30 @@
       Builder.defineMacro("_GNU_SOURCE");
   }
 public:
-  KFreeBSDTargetInfo(const llvm::Triple &Triple)
-      : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  KFreeBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {}
+};
+
+// Haiku Target
+template<typename Target>
+class HaikuTargetInfo : public OSTargetInfo<Target> {
+protected:
+  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
+                    MacroBuilder &Builder) const override {
+    // Haiku defines; list based off of gcc output
+    Builder.defineMacro("__HAIKU__");
+    Builder.defineMacro("__ELF__");
+    DefineStd(Builder, "unix", Opts);
+  }
+public:
+  HaikuTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
+    this->SizeType = TargetInfo::UnsignedLong;
+    this->IntPtrType = TargetInfo::SignedLong;
+    this->PtrDiffType = TargetInfo::SignedLong;
+    this->ProcessIDType = TargetInfo::SignedLong;
+    this->TLSSupported = false;
+
   }
 };
 
@@ -414,9 +444,8 @@
     DefineStd(Builder, "unix", Opts);
   }
 public:
-  MinixTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
-  }
+  MinixTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {}
 };
 
 // Linux target
@@ -441,10 +470,12 @@
       Builder.defineMacro("_REENTRANT");
     if (Opts.CPlusPlus)
       Builder.defineMacro("_GNU_SOURCE");
+    if (this->HasFloat128)
+      Builder.defineMacro("__FLOAT128__");
   }
 public:
-  LinuxTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  LinuxTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->WIntType = TargetInfo::UnsignedInt;
 
     switch (Triple.getArch()) {
@@ -455,6 +486,11 @@
     case llvm::Triple::ppc64le:
       this->MCountName = "_mcount";
       break;
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+    case llvm::Triple::systemz:
+      this->HasFloat128 = true;
+      break;
     }
   }
 
@@ -488,8 +524,8 @@
     }
   }
 public:
-  NetBSDTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  NetBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->MCountName = "_mcount";
   }
 };
@@ -509,8 +545,8 @@
       Builder.defineMacro("_REENTRANT");
   }
 public:
-  OpenBSDTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  OpenBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->TLSSupported = false;
 
       switch (Triple.getArch()) {
@@ -557,8 +593,8 @@
     }
   }
 public:
-  BitrigTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  BitrigTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->MCountName = "__mcount";
   }
 };
@@ -576,9 +612,9 @@
     Builder.defineMacro("__ELF__");
   }
 public:
-  PSPTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
-  }
+  // OSTargetInfo's constructor requires both the triple and the options.
+  PSPTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {}
 };
 
 // PS3 PPU Target
@@ -597,14 +631,14 @@
     Builder.defineMacro("__powerpc64__");
   }
 public:
-  PS3PPUTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  PS3PPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->LongWidth = this->LongAlign = 32;
     this->PointerWidth = this->PointerAlign = 32;
     this->IntMaxType = TargetInfo::SignedLongLong;
     this->Int64Type = TargetInfo::SignedLongLong;
     this->SizeType = TargetInfo::UnsignedInt;
-    this->DataLayoutString = "E-m:e-p:32:32-i64:64-n32:64";
+    this->resetDataLayout("E-m:e-p:32:32-i64:64-n32:64");
   }
 };
 
@@ -618,15 +652,19 @@
     Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
     DefineStd(Builder, "unix", Opts);
     Builder.defineMacro("__ELF__");
-    Builder.defineMacro("__PS4__");
+    Builder.defineMacro("__ORBIS__");
   }
 public:
-  PS4OSTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
+  PS4OSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->WCharType = this->UnsignedShort;
 
     // On PS4, TLS variable cannot be aligned to more than 32 bytes (256 bits).
     this->MaxTLSAlign = 256;
-    this->UserLabelPrefix = "";
+
+    // On PS4, do not honor explicit bit field alignment,
+    // as in "__attribute__((aligned(2))) int b : 1;".
+    this->UseExplicitBitFieldAlignment = false;
 
     switch (Triple.getArch()) {
     default:
@@ -664,8 +702,8 @@
     Builder.defineMacro("_REENTRANT");
   }
 public:
-  SolarisTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  SolarisTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->WCharType = this->SignedInt;
     // FIXME: WIntType should be SignedLong
   }
@@ -709,6 +747,13 @@
 
       if (Opts.CPlusPlus11 && Opts.isCompatibleWithMSVC(LangOptions::MSVC2015))
         Builder.defineMacro("_HAS_CHAR16_T_LANGUAGE_SUPPORT", Twine(1));
+
+      if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) {
+        if (Opts.CPlusPlus1z)
+          Builder.defineMacro("_MSVC_LANG", "201403L");
+        else if (Opts.CPlusPlus14)
+          Builder.defineMacro("_MSVC_LANG", "201402L");
+      }
     }
 
     if (Opts.MicrosoftExt) {
@@ -725,8 +770,8 @@
   }
 
 public:
-  WindowsTargetInfo(const llvm::Triple &Triple)
-      : OSTargetInfo<Target>(Triple) {}
+  WindowsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {}
 };
 
 template <typename Target>
@@ -745,8 +790,8 @@
   }
 
 public:
-  NaClTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
+  NaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->LongAlign = 32;
     this->LongWidth = 32;
     this->PointerAlign = 32;
@@ -766,14 +811,14 @@
     if (Triple.getArch() == llvm::Triple::arm) {
       // Handled in ARM's setABI().
     } else if (Triple.getArch() == llvm::Triple::x86) {
-      this->DataLayoutString = "e-m:e-p:32:32-i64:64-n8:16:32-S128";
+      this->resetDataLayout("e-m:e-p:32:32-i64:64-n8:16:32-S128");
     } else if (Triple.getArch() == llvm::Triple::x86_64) {
-      this->DataLayoutString = "e-m:e-p:32:32-i64:64-n8:16:32:64-S128";
+      this->resetDataLayout("e-m:e-p:32:32-i64:64-n8:16:32:64-S128");
     } else if (Triple.getArch() == llvm::Triple::mipsel) {
-      // Handled on mips' setDataLayoutString.
+      // Handled on mips' setDataLayout.
     } else {
       assert(Triple.getArch() == llvm::Triple::le32);
-      this->DataLayoutString = "e-p:32:32-i64:64";
+      this->resetDataLayout("e-p:32:32-i64:64");
     }
   }
 };
@@ -797,10 +842,10 @@
   }
 
 public:
-  explicit WebAssemblyOSTargetInfo(const llvm::Triple &Triple)
-      : OSTargetInfo<Target>(Triple) {
+  explicit WebAssemblyOSTargetInfo(const llvm::Triple &Triple,
+                                   const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     this->MCountName = "__mcount";
-    this->UserLabelPrefix = "";
     this->TheCXXABI.set(TargetCXXABI::WebAssembly);
   }
 };
@@ -830,7 +875,7 @@
   std::string ABI;
 
 public:
-  PPCTargetInfo(const llvm::Triple &Triple)
+  PPCTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
     : TargetInfo(Triple), HasVSX(false), HasP8Vector(false),
       HasP8Crypto(false), HasDirectMove(false), HasQPX(false), HasHTM(false),
       HasBPERMD(false), HasExtDiv(false) {
@@ -856,8 +901,9 @@
     ArchDefinePwr6x = 1 << 10,
     ArchDefinePwr7  = 1 << 11,
     ArchDefinePwr8  = 1 << 12,
-    ArchDefineA2    = 1 << 13,
-    ArchDefineA2q   = 1 << 14
+    ArchDefinePwr9  = 1 << 13,
+    ArchDefineA2    = 1 << 14,
+    ArchDefineA2q   = 1 << 15
   } ArchDefineTypes;
 
   // Note: GCC recognizes the following additional cpus:
@@ -906,6 +952,8 @@
       .Case("pwr7", true)
       .Case("power8", true)
       .Case("pwr8", true)
+      .Case("power9", true)
+      .Case("pwr9", true)
       .Case("powerpc", true)
       .Case("ppc", true)
       .Case("powerpc64", true)
@@ -1108,6 +1156,8 @@
       HasQPX = true;
     } else if (Feature == "+htm") {
       HasHTM = true;
+    } else if (Feature == "+float128") {
+      HasFloat128 = true;
     }
     // TODO: Finish this list and add an assert that we've handled them
     // all.
@@ -1198,6 +1248,10 @@
     .Case("pwr8",  ArchDefineName | ArchDefinePwr7 | ArchDefinePwr6x
                      | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5
                      | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+    .Case("pwr9",  ArchDefineName | ArchDefinePwr8 | ArchDefinePwr7
+                     | ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x
+                     | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr
+                     | ArchDefinePpcsq)
     .Case("power3",  ArchDefinePpcgr)
     .Case("power4",  ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
     .Case("power5",  ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr
@@ -1215,6 +1269,10 @@
     .Case("power8",  ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6x
                        | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5
                        | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+    .Case("power9",  ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7
+                       | ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x
+                       | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr
+                       | ArchDefinePpcsq)
     .Default(ArchDefineNone);
 
   if (defs & ArchDefineName)
@@ -1243,6 +1301,8 @@
     Builder.defineMacro("_ARCH_PWR7");
   if (defs & ArchDefinePwr8)
     Builder.defineMacro("_ARCH_PWR8");
+  if (defs & ArchDefinePwr9)
+    Builder.defineMacro("_ARCH_PWR9");
   if (defs & ArchDefineA2)
     Builder.defineMacro("_ARCH_A2");
   if (defs & ArchDefineA2q) {
@@ -1265,6 +1325,8 @@
     Builder.defineMacro("__CRYPTO__");
   if (HasHTM)
     Builder.defineMacro("__HTM__");
+  if (HasFloat128)
+    Builder.defineMacro("__FLOAT128__");
 
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
@@ -1315,6 +1377,13 @@
                                                      << "-mno-vsx";
       return false;
     }
+
+    if (std::find(FeaturesVec.begin(), FeaturesVec.end(), "+float128") !=
+        FeaturesVec.end()) {
+      Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfloat128"
+                                                     << "-mno-vsx";
+      return false;
+    }
   }
 
   return true;
@@ -1333,6 +1402,7 @@
     .Case("pwr6", true)
     .Case("pwr7", true)
     .Case("pwr8", true)
+    .Case("pwr9", true)
     .Case("ppc64", true)
     .Case("ppc64le", true)
     .Default(false);
@@ -1340,28 +1410,34 @@
   Features["qpx"] = (CPU == "a2q");
   Features["crypto"] = llvm::StringSwitch<bool>(CPU)
     .Case("ppc64le", true)
+    .Case("pwr9", true)
     .Case("pwr8", true)
     .Default(false);
   Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
     .Case("ppc64le", true)
+    .Case("pwr9", true)
     .Case("pwr8", true)
     .Default(false);
   Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
     .Case("ppc64le", true)
+    .Case("pwr9", true)
     .Case("pwr8", true)
     .Case("pwr7", true)
     .Default(false);
   Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
     .Case("ppc64le", true)
+    .Case("pwr9", true)
     .Case("pwr8", true)
     .Case("pwr7", true)
     .Default(false);
   Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
     .Case("ppc64le", true)
+    .Case("pwr9", true)
     .Case("pwr8", true)
     .Default(false);
   Features["vsx"] = llvm::StringSwitch<bool>(CPU)
     .Case("ppc64le", true)
+    .Case("pwr9", true)
     .Case("pwr8", true)
     .Case("pwr7", true)
     .Default(false);
@@ -1383,6 +1459,7 @@
     .Case("htm", HasHTM)
     .Case("bpermd", HasBPERMD)
     .Case("extdiv", HasExtDiv)
+    .Case("float128", HasFloat128)
     .Default(false);
 }
 
@@ -1392,19 +1469,19 @@
   // as well. Do the inverse if we're disabling vsx. We'll diagnose any user
   // incompatible options.
   if (Enabled) {
-    if (Name == "vsx") {
-     Features[Name] = true;
-    } else if (Name == "direct-move") {
+    if (Name == "direct-move") {
       Features[Name] = Features["vsx"] = true;
     } else if (Name == "power8-vector") {
       Features[Name] = Features["vsx"] = true;
+    } else if (Name == "float128") {
+      Features[Name] = Features["vsx"] = true;
     } else {
       Features[Name] = true;
     }
   } else {
     if (Name == "vsx") {
       Features[Name] = Features["direct-move"] = Features["power8-vector"] =
-          false;
+          Features["float128"] = false;
     } else {
       Features[Name] = false;
     }
@@ -1512,8 +1589,9 @@
 
 class PPC32TargetInfo : public PPCTargetInfo {
 public:
-  PPC32TargetInfo(const llvm::Triple &Triple) : PPCTargetInfo(Triple) {
-    DataLayoutString = "E-m:e-p:32:32-i64:64-n32";
+  PPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : PPCTargetInfo(Triple, Opts) {
+    resetDataLayout("E-m:e-p:32:32-i64:64-n32");
 
     switch (getTriple().getOS()) {
     case llvm::Triple::Linux:
@@ -1546,16 +1624,17 @@
 // TargetInfo for little endian.
 class PPC64TargetInfo : public PPCTargetInfo {
 public:
-  PPC64TargetInfo(const llvm::Triple &Triple) : PPCTargetInfo(Triple) {
+  PPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : PPCTargetInfo(Triple, Opts) {
     LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
     IntMaxType = SignedLong;
     Int64Type = SignedLong;
 
     if ((Triple.getArch() == llvm::Triple::ppc64le)) {
-      DataLayoutString = "e-m:e-i64:64-n32:64";
+      resetDataLayout("e-m:e-i64:64-n32:64");
       ABI = "elfv2";
     } else {
-      DataLayoutString = "E-m:e-i64:64-n32:64";
+      resetDataLayout("E-m:e-i64:64-n32:64");
       ABI = "elfv1";
     }
 
@@ -1588,31 +1667,29 @@
   }
 };
 
-class DarwinPPC32TargetInfo :
-  public DarwinTargetInfo<PPC32TargetInfo> {
+class DarwinPPC32TargetInfo : public DarwinTargetInfo<PPC32TargetInfo> {
 public:
-  DarwinPPC32TargetInfo(const llvm::Triple &Triple)
-      : DarwinTargetInfo<PPC32TargetInfo>(Triple) {
+  DarwinPPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : DarwinTargetInfo<PPC32TargetInfo>(Triple, Opts) {
     HasAlignMac68kSupport = true;
     BoolWidth = BoolAlign = 32; //XXX support -mone-byte-bool?
     PtrDiffType = SignedInt; // for http://llvm.org/bugs/show_bug.cgi?id=15726
     LongLongAlign = 32;
     SuitableAlign = 128;
-    DataLayoutString = "E-m:o-p:32:32-f64:32:64-n32";
+    resetDataLayout("E-m:o-p:32:32-f64:32:64-n32");
   }
   BuiltinVaListKind getBuiltinVaListKind() const override {
     return TargetInfo::CharPtrBuiltinVaList;
   }
 };
 
-class DarwinPPC64TargetInfo :
-  public DarwinTargetInfo<PPC64TargetInfo> {
+class DarwinPPC64TargetInfo : public DarwinTargetInfo<PPC64TargetInfo> {
 public:
-  DarwinPPC64TargetInfo(const llvm::Triple &Triple)
-      : DarwinTargetInfo<PPC64TargetInfo>(Triple) {
+  DarwinPPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : DarwinTargetInfo<PPC64TargetInfo>(Triple, Opts) {
     HasAlignMac68kSupport = true;
     SuitableAlign = 128;
-    DataLayoutString = "E-m:o-i64:64-n32:64";
+    resetDataLayout("E-m:o-i64:64-n32:64");
   }
 };
 
@@ -1630,19 +1707,11 @@
 class NVPTXTargetInfo : public TargetInfo {
   static const char *const GCCRegNames[];
   static const Builtin::Info BuiltinInfo[];
-
-  // The GPU profiles supported by the NVPTX backend
-  enum GPUKind {
-    GK_NONE,
-    GK_SM20,
-    GK_SM21,
-    GK_SM30,
-    GK_SM35,
-    GK_SM37,
-  } GPU;
+  CudaArch GPU;
 
 public:
-  NVPTXTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  NVPTXTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : TargetInfo(Triple) {
     BigEndian = false;
     TLSSupported = false;
     LongWidth = LongAlign = 64;
@@ -1651,8 +1720,66 @@
     // Define available target features
     // These must be defined in sorted order!
     NoAsmVariants = true;
-    // Set the default GPU to sm20
-    GPU = GK_SM20;
+    GPU = CudaArch::SM_20;
+
+    // If possible, get a TargetInfo for our host triple, so we can match its
+    // types.
+    llvm::Triple HostTriple(Opts.HostTriple);
+    // Host is NVPTX too; presumably guards against recursing here.
+    if (HostTriple.isNVPTX())
+      return;
+    std::unique_ptr<TargetInfo> HostTarget(AllocateTarget(HostTriple, Opts));
+    // No TargetInfo could be allocated for the host; keep the defaults.
+    if (!HostTarget)
+      return;
+
+    PointerWidth = HostTarget->getPointerWidth(/* AddrSpace = */ 0);
+    PointerAlign = HostTarget->getPointerAlign(/* AddrSpace = */ 0);
+    BoolWidth = HostTarget->getBoolWidth();
+    BoolAlign = HostTarget->getBoolAlign();
+    IntWidth = HostTarget->getIntWidth();
+    IntAlign = HostTarget->getIntAlign();
+    HalfWidth = HostTarget->getHalfWidth();
+    HalfAlign = HostTarget->getHalfAlign();
+    FloatWidth = HostTarget->getFloatWidth();
+    FloatAlign = HostTarget->getFloatAlign();
+    DoubleWidth = HostTarget->getDoubleWidth();
+    DoubleAlign = HostTarget->getDoubleAlign();
+    LongWidth = HostTarget->getLongWidth();
+    LongAlign = HostTarget->getLongAlign();
+    LongLongWidth = HostTarget->getLongLongWidth();
+    LongLongAlign = HostTarget->getLongLongAlign();
+    MinGlobalAlign = HostTarget->getMinGlobalAlign();
+    DefaultAlignForAttributeAligned =
+        HostTarget->getDefaultAlignForAttributeAligned();
+    SizeType = HostTarget->getSizeType();
+    IntMaxType = HostTarget->getIntMaxType();
+    PtrDiffType = HostTarget->getPtrDiffType(/* AddrSpace = */ 0);
+    IntPtrType = HostTarget->getIntPtrType();
+    WCharType = HostTarget->getWCharType();
+    WIntType = HostTarget->getWIntType();
+    Char16Type = HostTarget->getChar16Type();
+    Char32Type = HostTarget->getChar32Type();
+    Int64Type = HostTarget->getInt64Type();
+    SigAtomicType = HostTarget->getSigAtomicType();
+    ProcessIDType = HostTarget->getProcessIDType();
+
+    UseBitFieldTypeAlignment = HostTarget->useBitFieldTypeAlignment();
+    UseZeroLengthBitfieldAlignment =
+        HostTarget->useZeroLengthBitfieldAlignment();
+    UseExplicitBitFieldAlignment = HostTarget->useExplicitBitFieldAlignment();
+    ZeroLengthBitfieldBoundary = HostTarget->getZeroLengthBitfieldBoundary();
+
+    // Properties intentionally not copied from host:
+    // - LargeArrayMinWidth, LargeArrayAlign: Not visible across the
+    //   host/device boundary.
+    // - SuitableAlign: Not visible across the host/device boundary, and may
+    //   correctly be different on host/device, e.g. if host has wider vector
+    //   types than device.
+    // - LongDoubleWidth, LongDoubleAlign: nvptx's long double type is the same
+    //   as its double type, but that's not necessarily true on the host.
+    //   TODO: nvcc emits a warning when using long double on device; we should
+    //   do the same.
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -1660,26 +1787,38 @@
     Builder.defineMacro("__NVPTX__");
     if (Opts.CUDAIsDevice) {
       // Set __CUDA_ARCH__ for the GPU specified.
-      std::string CUDAArchCode;
-      switch (GPU) {
-      case GK_SM20:
-        CUDAArchCode = "200";
-        break;
-      case GK_SM21:
-        CUDAArchCode = "210";
-        break;
-      case GK_SM30:
-        CUDAArchCode = "300";
-        break;
-      case GK_SM35:
-        CUDAArchCode = "350";
-        break;
-      case GK_SM37:
-        CUDAArchCode = "370";
-        break;
-      default:
-        llvm_unreachable("Unhandled target CPU");
-      }
+      std::string CUDAArchCode = [this] {
+        switch (GPU) {
+        case CudaArch::UNKNOWN:
+          assert(false && "No GPU arch when compiling CUDA device code.");
+          return "";
+        case CudaArch::SM_20:
+          return "200";
+        case CudaArch::SM_21:
+          return "210";
+        case CudaArch::SM_30:
+          return "300";
+        case CudaArch::SM_32:
+          return "320";
+        case CudaArch::SM_35:
+          return "350";
+        case CudaArch::SM_37:
+          return "370";
+        case CudaArch::SM_50:
+          return "500";
+        case CudaArch::SM_52:
+          return "520";
+        case CudaArch::SM_53:
+          return "530";
+        case CudaArch::SM_60:
+          return "600";
+        case CudaArch::SM_61:
+          return "610";
+        case CudaArch::SM_62:
+          return "620";
+        }
+        llvm_unreachable("unhandled CudaArch");
+      }();
       Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
     }
   }
@@ -1720,15 +1859,21 @@
     return TargetInfo::CharPtrBuiltinVaList;
   }
   bool setCPU(const std::string &Name) override {
-    GPU = llvm::StringSwitch<GPUKind>(Name)
-              .Case("sm_20", GK_SM20)
-              .Case("sm_21", GK_SM21)
-              .Case("sm_30", GK_SM30)
-              .Case("sm_35", GK_SM35)
-              .Case("sm_37", GK_SM37)
-              .Default(GK_NONE);
+    GPU = StringToCudaArch(Name);
+    return GPU != CudaArch::UNKNOWN;
+  }
+  void setSupportedOpenCLOpts() override {
+    auto &Opts = getSupportedOpenCLOpts();
+    Opts.cl_clang_storage_class_specifiers = 1;
+    Opts.cl_khr_gl_sharing = 1;
+    Opts.cl_khr_icd = 1;
 
-    return GPU != GK_NONE;
+    Opts.cl_khr_fp64 = 1;
+    Opts.cl_khr_byte_addressable_store = 1;
+    Opts.cl_khr_global_int32_base_atomics = 1;
+    Opts.cl_khr_global_int32_extended_atomics = 1;
+    Opts.cl_khr_local_int32_base_atomics = 1;
+    Opts.cl_khr_local_int32_extended_atomics = 1;
   }
 };
 
@@ -1748,24 +1893,26 @@
 
 class NVPTX32TargetInfo : public NVPTXTargetInfo {
 public:
-  NVPTX32TargetInfo(const llvm::Triple &Triple) : NVPTXTargetInfo(Triple) {
+  NVPTX32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : NVPTXTargetInfo(Triple, Opts) {
     LongWidth = LongAlign = 32;
     PointerWidth = PointerAlign = 32;
     SizeType = TargetInfo::UnsignedInt;
     PtrDiffType = TargetInfo::SignedInt;
     IntPtrType = TargetInfo::SignedInt;
-    DataLayoutString = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+    resetDataLayout("e-p:32:32-i64:64-v16:16-v32:32-n16:32:64");
   }
 };
 
 class NVPTX64TargetInfo : public NVPTXTargetInfo {
 public:
-  NVPTX64TargetInfo(const llvm::Triple &Triple) : NVPTXTargetInfo(Triple) {
+  NVPTX64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : NVPTXTargetInfo(Triple, Opts) {
     PointerWidth = PointerAlign = 64;
     SizeType = TargetInfo::UnsignedLong;
     PtrDiffType = TargetInfo::SignedLong;
     IntPtrType = TargetInfo::SignedLong;
-    DataLayoutString = "e-i64:64-v16:16-v32:32-n16:32:64";
+    resetDataLayout("e-i64:64-v16:16-v32:32-n16:32:64");
   }
 };
 
@@ -1786,16 +1933,12 @@
   "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
   "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 
-static const char *const DataLayoutStringR600DoubleOps =
-  "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-  "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
-
 static const char *const DataLayoutStringSI =
-  "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"
+  "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
   "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
   "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 
-class AMDGPUTargetInfo : public TargetInfo {
+class AMDGPUTargetInfo final : public TargetInfo {
   static const Builtin::Info BuiltinInfo[];
   static const char * const GCCRegNames[];
 
@@ -1818,24 +1961,31 @@
   bool hasFP64:1;
   bool hasFMAF:1;
   bool hasLDEXPF:1;
+  bool hasDenormSupport:1;
+
+  static bool isAMDGCN(const llvm::Triple &TT) {
+    return TT.getArch() == llvm::Triple::amdgcn;
+  }
 
 public:
-  AMDGPUTargetInfo(const llvm::Triple &Triple)
-    : TargetInfo(Triple) {
-
-    if (Triple.getArch() == llvm::Triple::amdgcn) {
-      DataLayoutString = DataLayoutStringSI;
-      GPU = GK_SOUTHERN_ISLANDS;
+  AMDGPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+    : TargetInfo(Triple) ,
+      GPU(isAMDGCN(Triple) ? GK_SOUTHERN_ISLANDS : GK_R600),
+      hasFP64(false),
+      hasFMAF(false),
+      hasLDEXPF(false),
+      hasDenormSupport(false){
+    if (getTriple().getArch() == llvm::Triple::amdgcn) {
       hasFP64 = true;
       hasFMAF = true;
       hasLDEXPF = true;
-    } else {
-      DataLayoutString = DataLayoutStringR600;
-      GPU = GK_R600;
-      hasFP64 = false;
-      hasFMAF = false;
-      hasLDEXPF = false;
     }
+    if (Opts.CPU == "fiji")
+      hasDenormSupport = true;
+
+    resetDataLayout(getTriple().getArch() == llvm::Triple::amdgcn ?
+                    DataLayoutStringSI : DataLayoutStringR600);
+
     AddrSpaceMap = &AMDGPUAddrSpaceMap;
     UseAddrSpaceMapMangling = true;
   }
@@ -1876,6 +2026,30 @@
     return false;
   }
 
+  bool initFeatureMap(llvm::StringMap<bool> &Features,
+                      DiagnosticsEngine &Diags, StringRef CPU,
+                      const std::vector<std::string> &FeatureVec) const override;
+
+  void adjustTargetOptions(const CodeGenOptions &CGOpts,
+                           TargetOptions &TargetOpts) const override {
+    if (!hasDenormSupport)
+      return;
+    bool hasFP32Denormals = false;
+    bool hasFP64Denormals = false;
+    for (auto &I : TargetOpts.FeaturesAsWritten) {
+      if (I == "+fp32-denormals" || I == "-fp32-denormals")
+        hasFP32Denormals = true;
+      if (I == "+fp64-denormals" || I == "-fp64-denormals")
+        hasFP64Denormals = true;
+    }
+    if (!hasFP32Denormals)
+      TargetOpts.Features.push_back((Twine(CGOpts.FlushDenorm ? '-' : '+') +
+                                     Twine("fp32-denormals")).str());
+    if (!hasFP64Denormals && hasFP64)
+      TargetOpts.Features.push_back((Twine(CGOpts.FlushDenorm ? '-' : '+') +
+                                     Twine("fp64-denormals")).str());
+  }
+
   ArrayRef<Builtin::Info> getTargetBuiltins() const override {
     return llvm::makeArrayRef(BuiltinInfo,
                         clang::AMDGPU::LastTSBuiltin - Builtin::FirstTSBuiltin);
@@ -1883,30 +2057,25 @@
 
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
-    Builder.defineMacro("__R600__");
+    if (getTriple().getArch() == llvm::Triple::amdgcn)
+      Builder.defineMacro("__AMDGCN__");
+    else
+      Builder.defineMacro("__R600__");
+
     if (hasFMAF)
       Builder.defineMacro("__HAS_FMAF__");
     if (hasLDEXPF)
       Builder.defineMacro("__HAS_LDEXPF__");
-    if (hasFP64 && Opts.OpenCL)
-      Builder.defineMacro("cl_khr_fp64");
-    if (Opts.OpenCL) {
-      if (GPU >= GK_NORTHERN_ISLANDS) {
-        Builder.defineMacro("cl_khr_byte_addressable_store");
-        Builder.defineMacro("cl_khr_global_int32_base_atomics");
-        Builder.defineMacro("cl_khr_global_int32_extended_atomics");
-        Builder.defineMacro("cl_khr_local_int32_base_atomics");
-        Builder.defineMacro("cl_khr_local_int32_extended_atomics");
-      }
-    }
+    if (hasFP64)
+      Builder.defineMacro("__HAS_FP64__");
   }
 
   BuiltinVaListKind getBuiltinVaListKind() const override {
     return TargetInfo::CharPtrBuiltinVaList;
   }
 
-  bool setCPU(const std::string &Name) override {
-    GPU = llvm::StringSwitch<GPUKind>(Name)
+  static GPUKind parseR600Name(StringRef Name) {
+    return llvm::StringSwitch<GPUKind>(Name)
       .Case("r600" ,    GK_R600)
       .Case("rv610",    GK_R600)
       .Case("rv620",    GK_R600)
@@ -1932,6 +2101,11 @@
       .Case("caicos",   GK_NORTHERN_ISLANDS)
       .Case("cayman",   GK_CAYMAN)
       .Case("aruba",    GK_CAYMAN)
+      .Default(GK_NONE);
+  }
+
+  static GPUKind parseAMDGCNName(StringRef Name) {
+    return llvm::StringSwitch<GPUKind>(Name)
       .Case("tahiti",   GK_SOUTHERN_ISLANDS)
       .Case("pitcairn", GK_SOUTHERN_ISLANDS)
       .Case("verde",    GK_SOUTHERN_ISLANDS)
@@ -1945,50 +2119,65 @@
       .Case("tonga",    GK_VOLCANIC_ISLANDS)
       .Case("iceland",  GK_VOLCANIC_ISLANDS)
       .Case("carrizo",  GK_VOLCANIC_ISLANDS)
+      .Case("fiji",     GK_VOLCANIC_ISLANDS)
+      .Case("stoney",   GK_VOLCANIC_ISLANDS)
       .Default(GK_NONE);
+  }
 
-    if (GPU == GK_NONE) {
-      return false;
+  bool setCPU(const std::string &Name) override {
+    if (getTriple().getArch() == llvm::Triple::amdgcn)
+      GPU = parseAMDGCNName(Name);
+    else
+      GPU = parseR600Name(Name);
+
+    return GPU != GK_NONE;
+  }
+
+  void setSupportedOpenCLOpts() override {
+    auto &Opts = getSupportedOpenCLOpts();
+    Opts.cl_clang_storage_class_specifiers = 1;
+    Opts.cl_khr_icd = 1;
+
+    if (hasFP64)
+      Opts.cl_khr_fp64 = 1;
+    if (GPU >= GK_EVERGREEN) {
+      Opts.cl_khr_byte_addressable_store = 1;
+      Opts.cl_khr_global_int32_base_atomics = 1;
+      Opts.cl_khr_global_int32_extended_atomics = 1;
+      Opts.cl_khr_local_int32_base_atomics = 1;
+      Opts.cl_khr_local_int32_extended_atomics = 1;
     }
-
-    // Set the correct data layout
-    switch (GPU) {
-    case GK_NONE:
-    case GK_R600:
-    case GK_R700:
-    case GK_EVERGREEN:
-    case GK_NORTHERN_ISLANDS:
-      DataLayoutString = DataLayoutStringR600;
-      hasFP64 = false;
-      hasFMAF = false;
-      hasLDEXPF = false;
-      break;
-    case GK_R600_DOUBLE_OPS:
-    case GK_R700_DOUBLE_OPS:
-    case GK_EVERGREEN_DOUBLE_OPS:
-    case GK_CAYMAN:
-      DataLayoutString = DataLayoutStringR600DoubleOps;
-      hasFP64 = true;
-      hasFMAF = true;
-      hasLDEXPF = false;
-      break;
-    case GK_SOUTHERN_ISLANDS:
-    case GK_SEA_ISLANDS:
-    case GK_VOLCANIC_ISLANDS:
-      DataLayoutString = DataLayoutStringSI;
-      hasFP64 = true;
-      hasFMAF = true;
-      hasLDEXPF = true;
-      break;
+    if (GPU >= GK_SOUTHERN_ISLANDS) {
+      Opts.cl_khr_fp16 = 1;
+      Opts.cl_khr_int64_base_atomics = 1;
+      Opts.cl_khr_int64_extended_atomics = 1;
+      Opts.cl_khr_mipmap_image = 1;
+      Opts.cl_khr_3d_image_writes = 1;
+      Opts.cl_amd_media_ops = 1;
+      Opts.cl_amd_media_ops2 = 1;
     }
+  }
 
-    return true;
+  LangAS::ID getOpenCLImageAddrSpace() const override {
+    return LangAS::opencl_constant;
+  }
+
+  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
+    switch (CC) {
+      default:
+        return CCCR_Warning;
+      case CC_C:
+      case CC_OpenCLKernel:
+        return CCCR_OK;
+    }
   }
 };
 
 const Builtin::Info AMDGPUTargetInfo::BuiltinInfo[] = {
 #define BUILTIN(ID, TYPE, ATTRS)                \
   { #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
+#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
+  { #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
 #include "clang/Basic/BuiltinsAMDGPU.def"
 };
 const char * const AMDGPUTargetInfo::GCCRegNames[] = {
@@ -2039,15 +2228,66 @@
   "s96", "s97", "s98", "s99", "s100", "s101", "s102", "s103",
   "s104", "s105", "s106", "s107", "s108", "s109", "s110", "s111",
   "s112", "s113", "s114", "s115", "s116", "s117", "s118", "s119",
-  "s120", "s121", "s122", "s123", "s124", "s125", "s126", "s127"
-  "exec", "vcc", "scc", "m0", "flat_scr", "exec_lo", "exec_hi",
-  "vcc_lo", "vcc_hi", "flat_scr_lo", "flat_scr_hi"
+  "s120", "s121", "s122", "s123", "s124", "s125", "s126", "s127",
+  "exec", "vcc", "scc", "m0", "flat_scratch", "exec_lo", "exec_hi",
+  "vcc_lo", "vcc_hi", "flat_scratch_lo", "flat_scratch_hi"
 };
 
 ArrayRef<const char *> AMDGPUTargetInfo::getGCCRegNames() const {
   return llvm::makeArrayRef(GCCRegNames);
 }
 
+bool AMDGPUTargetInfo::initFeatureMap(
+  llvm::StringMap<bool> &Features,
+  DiagnosticsEngine &Diags, StringRef CPU,
+  const std::vector<std::string> &FeatureVec) const {
+
+  // XXX - What does the member GPU mean if device name string passed here?
+  if (getTriple().getArch() == llvm::Triple::amdgcn) {
+    if (CPU.empty())
+      CPU = "tahiti";
+
+    switch (parseAMDGCNName(CPU)) {
+    case GK_SOUTHERN_ISLANDS:
+    case GK_SEA_ISLANDS:
+      break;
+
+    case GK_VOLCANIC_ISLANDS:
+      Features["s-memrealtime"] = true;
+      Features["16-bit-insts"] = true;
+      break;
+
+    case GK_NONE:
+      return false;
+    default:
+      llvm_unreachable("unhandled subtarget");
+    }
+  } else {
+    if (CPU.empty())
+      CPU = "r600";
+
+    switch (parseR600Name(CPU)) {
+    case GK_R600:
+    case GK_R700:
+    case GK_EVERGREEN:
+    case GK_NORTHERN_ISLANDS:
+      break;
+    case GK_R600_DOUBLE_OPS:
+    case GK_R700_DOUBLE_OPS:
+    case GK_EVERGREEN_DOUBLE_OPS:
+    case GK_CAYMAN:
+      Features["fp64"] = true;
+      break;
+    case GK_NONE:
+      return false;
+    default:
+      llvm_unreachable("unhandled subtarget");
+    }
+  }
+
+  return TargetInfo::initFeatureMap(Features, Diags, CPU, FeatureVec);
+}
+
 // Namespace for x86 abstract base class
 const Builtin::Info BuiltinInfo[] = {
 #define BUILTIN(ID, TYPE, ATTRS)                                               \
@@ -2069,6 +2309,14 @@
   "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
   "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
   "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
+  "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23",
+  "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31",
+  "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
+  "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
+  "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7",
+  "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+  "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
+  "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
 };
 
 const TargetInfo::AddlRegName AddlRegNames[] = {
@@ -2127,14 +2375,25 @@
   bool HasAVX512DQ = false;
   bool HasAVX512BW = false;
   bool HasAVX512VL = false;
+  bool HasAVX512VBMI = false;
+  bool HasAVX512IFMA = false;
   bool HasSHA = false;
+  bool HasMPX = false;
+  bool HasSGX = false;
   bool HasCX16 = false;
   bool HasFXSR = false;
   bool HasXSAVE = false;
   bool HasXSAVEOPT = false;
   bool HasXSAVEC = false;
   bool HasXSAVES = false;
+  bool HasMWAITX = false;
   bool HasPKU = false;
+  bool HasCLFLUSHOPT = false;
+  bool HasPCOMMIT = false;
+  bool HasCLWB = false;
+  bool HasUMIP = false;
+  bool HasMOVBE = false;
+  bool HasPREFETCHWT1 = false;
 
   /// \brief Enumeration of all of the X86 CPUs supported by Clang.
   ///
@@ -2235,14 +2494,26 @@
     /// Broadwell microarchitecture based processors.
     CK_Broadwell,
 
-    /// \name Skylake
-    /// Skylake microarchitecture based processors.
-    CK_Skylake,
+    /// \name Skylake Client
+    /// Skylake client microarchitecture based processors.
+    CK_SkylakeClient,
+
+    /// \name Skylake Server
+    /// Skylake server microarchitecture based processors.
+    CK_SkylakeServer,
+
+    /// \name Cannonlake Client
+    /// Cannonlake client microarchitecture based processors.
+    CK_Cannonlake,
 
     /// \name Knights Landing
     /// Knights Landing processor.
     CK_KNL,
 
+    /// \name Lakemont
+    /// Lakemont microarchitecture based processors.
+    CK_Lakemont,
+
     /// \name K6
     /// K6 architecture processors.
     //@{
@@ -2342,9 +2613,12 @@
         .Case("haswell", CK_Haswell)
         .Case("core-avx2", CK_Haswell) // Legacy name.
         .Case("broadwell", CK_Broadwell)
-        .Case("skylake", CK_Skylake)
-        .Case("skx", CK_Skylake) // Legacy name.
+        .Case("skylake", CK_SkylakeClient)
+        .Case("skylake-avx512", CK_SkylakeServer)
+        .Case("skx", CK_SkylakeServer) // Legacy name.
+        .Case("cannonlake", CK_Cannonlake)
         .Case("knl", CK_KNL)
+        .Case("lakemont", CK_Lakemont)
         .Case("k6", CK_K6)
         .Case("k6-2", CK_K6_2)
         .Case("k6-3", CK_K6_3)
@@ -2380,7 +2654,8 @@
   } FPMath = FP_Default;
 
 public:
-  X86TargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  X86TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     BigEndian = false;
     LongDoubleFormat = &llvm::APFloat::x87DoubleExtended;
   }
@@ -2492,6 +2767,7 @@
     case CK_C3_2:
     case CK_Pentium4:
     case CK_Pentium4M:
+    case CK_Lakemont:
     case CK_Prescott:
     case CK_K6:
     case CK_K6_2:
@@ -2518,7 +2794,9 @@
     case CK_IvyBridge:
     case CK_Haswell:
     case CK_Broadwell:
-    case CK_Skylake:
+    case CK_SkylakeClient:
+    case CK_SkylakeServer:
+    case CK_Cannonlake:
     case CK_KNL:
     case CK_Athlon64:
     case CK_Athlon64SSE3:
@@ -2566,6 +2844,10 @@
   bool hasSjLjLowering() const override {
     return true;
   }
+
+  void setSupportedOpenCLOpts() override {
+    getSupportedOpenCLOpts().setAll();
+  }
 };
 
 bool X86TargetInfo::setFPMath(StringRef Name) {
@@ -2588,7 +2870,13 @@
   if (getTriple().getArch() == llvm::Triple::x86_64)
     setFeatureEnabledImpl(Features, "sse2", true);
 
-  switch (getCPUKind(CPU)) {
+  const CPUKind Kind = getCPUKind(CPU);
+
+  // Enable X87 for all X86 processors but Lakemont.
+  if (Kind != CK_Lakemont)
+    setFeatureEnabledImpl(Features, "x87", true);
+
+  switch (Kind) {
   case CK_Generic:
   case CK_i386:
   case CK_i486:
@@ -2596,6 +2884,7 @@
   case CK_Pentium:
   case CK_i686:
   case CK_PentiumPro:
+  case CK_Lakemont:
     break;
   case CK_PentiumMMX:
   case CK_Pentium2:
@@ -2634,15 +2923,28 @@
     setFeatureEnabledImpl(Features, "fxsr", true);
     setFeatureEnabledImpl(Features, "cx16", true);
     break;
-  case CK_Skylake:
+  case CK_Cannonlake:
+    setFeatureEnabledImpl(Features, "avx512ifma", true);
+    setFeatureEnabledImpl(Features, "avx512vbmi", true);
+    setFeatureEnabledImpl(Features, "sha", true);
+    setFeatureEnabledImpl(Features, "umip", true);
+    // FALLTHROUGH
+  case CK_SkylakeServer:
     setFeatureEnabledImpl(Features, "avx512f", true);
     setFeatureEnabledImpl(Features, "avx512cd", true);
     setFeatureEnabledImpl(Features, "avx512dq", true);
     setFeatureEnabledImpl(Features, "avx512bw", true);
     setFeatureEnabledImpl(Features, "avx512vl", true);
+    setFeatureEnabledImpl(Features, "pku", true);
+    setFeatureEnabledImpl(Features, "pcommit", true);
+    setFeatureEnabledImpl(Features, "clwb", true);
+    // FALLTHROUGH
+  case CK_SkylakeClient:
     setFeatureEnabledImpl(Features, "xsavec", true);
     setFeatureEnabledImpl(Features, "xsaves", true);
-    setFeatureEnabledImpl(Features, "pku", true);
+    setFeatureEnabledImpl(Features, "mpx", true);
+    setFeatureEnabledImpl(Features, "sgx", true);
+    setFeatureEnabledImpl(Features, "clflushopt", true);
     // FALLTHROUGH
   case CK_Broadwell:
     setFeatureEnabledImpl(Features, "rdseed", true);
@@ -2655,6 +2957,7 @@
     setFeatureEnabledImpl(Features, "bmi2", true);
     setFeatureEnabledImpl(Features, "rtm", true);
     setFeatureEnabledImpl(Features, "fma", true);
+    setFeatureEnabledImpl(Features, "movbe", true);
     // FALLTHROUGH
   case CK_IvyBridge:
     setFeatureEnabledImpl(Features, "rdrnd", true);
@@ -2681,6 +2984,7 @@
     setFeatureEnabledImpl(Features, "avx512cd", true);
     setFeatureEnabledImpl(Features, "avx512er", true);
     setFeatureEnabledImpl(Features, "avx512pf", true);
+    setFeatureEnabledImpl(Features, "prefetchwt1", true);
     setFeatureEnabledImpl(Features, "fxsr", true);
     setFeatureEnabledImpl(Features, "rdseed", true);
     setFeatureEnabledImpl(Features, "adx", true);
@@ -2697,6 +3001,7 @@
     setFeatureEnabledImpl(Features, "cx16", true);
     setFeatureEnabledImpl(Features, "xsaveopt", true);
     setFeatureEnabledImpl(Features, "xsave", true);
+    setFeatureEnabledImpl(Features, "movbe", true);
     break;
   case CK_K6_2:
   case CK_K6_3:
@@ -2752,11 +3057,11 @@
     setFeatureEnabledImpl(Features, "prfchw", true);
     setFeatureEnabledImpl(Features, "cx16", true);
     setFeatureEnabledImpl(Features, "fxsr", true);
-    setFeatureEnabledImpl(Features, "xsave", true);
     break;
   case CK_BDVER4:
     setFeatureEnabledImpl(Features, "avx2", true);
     setFeatureEnabledImpl(Features, "bmi2", true);
+    setFeatureEnabledImpl(Features, "mwaitx", true);
     // FALLTHROUGH
   case CK_BDVER3:
     setFeatureEnabledImpl(Features, "fsgsbase", true);
@@ -2865,7 +3170,8 @@
   case AVX512F:
     Features["avx512f"] = Features["avx512cd"] = Features["avx512er"] =
       Features["avx512pf"] = Features["avx512dq"] = Features["avx512bw"] =
-      Features["avx512vl"] = false;
+      Features["avx512vl"] = Features["avx512vbmi"] =
+      Features["avx512ifma"] = false;
   }
 }
 
@@ -2963,8 +3269,9 @@
     setSSELevel(Features, AVX2, Enabled);
   } else if (Name == "avx512f") {
     setSSELevel(Features, AVX512F, Enabled);
-  } else if (Name == "avx512cd" || Name == "avx512er" || Name == "avx512pf"
-          || Name == "avx512dq" || Name == "avx512bw" || Name == "avx512vl") {
+  } else if (Name == "avx512cd" || Name == "avx512er" || Name == "avx512pf" ||
+             Name == "avx512dq" || Name == "avx512bw" || Name == "avx512vl" ||
+             Name == "avx512vbmi" || Name == "avx512ifma") {
     if (Enabled)
       setSSELevel(Features, AVX512F, Enabled);
   } else if (Name == "fma") {
@@ -2992,15 +3299,11 @@
     else
       setSSELevel(Features, SSE41, Enabled);
   } else if (Name == "xsave") {
-    if (Enabled)
-      setSSELevel(Features, AVX, Enabled);
-    else
+    if (!Enabled)
       Features["xsaveopt"] = false;
   } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") {
-    if (Enabled) {
+    if (Enabled)
       Features["xsave"] = true;
-      setSSELevel(Features, AVX, Enabled);
-    }
   }
 }
 
@@ -3054,8 +3357,18 @@
       HasAVX512BW = true;
     } else if (Feature == "+avx512vl") {
       HasAVX512VL = true;
+    } else if (Feature == "+avx512vbmi") {
+      HasAVX512VBMI = true;
+    } else if (Feature == "+avx512ifma") {
+      HasAVX512IFMA = true;
     } else if (Feature == "+sha") {
       HasSHA = true;
+    } else if (Feature == "+mpx") {
+      HasMPX = true;
+    } else if (Feature == "+movbe") {
+      HasMOVBE = true;
+    } else if (Feature == "+sgx") {
+      HasSGX = true;
     } else if (Feature == "+cx16") {
       HasCX16 = true;
     } else if (Feature == "+fxsr") {
@@ -3068,8 +3381,20 @@
       HasXSAVEC = true;
     } else if (Feature == "+xsaves") {
       HasXSAVES = true;
+    } else if (Feature == "+mwaitx") {
+      HasMWAITX = true;
     } else if (Feature == "+pku") {
       HasPKU = true;
+    } else if (Feature == "+clflushopt") {
+      HasCLFLUSHOPT = true;
+    } else if (Feature == "+pcommit") {
+      HasPCOMMIT = true;
+    } else if (Feature == "+clwb") {
+      HasCLWB = true;
+    } else if (Feature == "+umip") {
+      HasUMIP = true;
+    } else if (Feature == "+prefetchwt1") {
+      HasPREFETCHWT1 = true;
     }
 
     X86SSEEnum Level = llvm::StringSwitch<X86SSEEnum>(Feature)
@@ -3203,21 +3528,23 @@
   case CK_IvyBridge:
   case CK_Haswell:
   case CK_Broadwell:
+  case CK_SkylakeClient:
     // FIXME: Historically, we defined this legacy name, it would be nice to
     // remove it at some point. We've never exposed fine-grained names for
     // recent primary x86 CPUs, and we should keep it that way.
     defineCPUMacros(Builder, "corei7");
     break;
-  case CK_Skylake:
-    // FIXME: Historically, we defined this legacy name, it would be nice to
-    // remove it at some point. This is the only fine-grained CPU macro in the
-    // main intel CPU line, and it would be better to not have these and force
-    // people to use ISA macros.
+  case CK_SkylakeServer:
     defineCPUMacros(Builder, "skx");
     break;
+  case CK_Cannonlake:
+    break;
   case CK_KNL:
     defineCPUMacros(Builder, "knl");
     break;
+  case CK_Lakemont:
+    Builder.defineMacro("__tune_lakemont__");
+    break;
   case CK_K6_2:
     Builder.defineMacro("__k6_2__");
     Builder.defineMacro("__tune_k6_2__");
@@ -3328,6 +3655,9 @@
   if (HasTBM)
     Builder.defineMacro("__TBM__");
 
+  if (HasMWAITX)
+    Builder.defineMacro("__MWAITX__");
+
   switch (XOPLevel) {
   case XOP:
     Builder.defineMacro("__XOP__");
@@ -3357,6 +3687,10 @@
     Builder.defineMacro("__AVX512BW__");
   if (HasAVX512VL)
     Builder.defineMacro("__AVX512VL__");
+  if (HasAVX512VBMI)
+    Builder.defineMacro("__AVX512VBMI__");
+  if (HasAVX512IFMA)
+    Builder.defineMacro("__AVX512IFMA__");
 
   if (HasSHA)
     Builder.defineMacro("__SHA__");
@@ -3455,8 +3789,12 @@
       .Case("avx512dq", HasAVX512DQ)
       .Case("avx512bw", HasAVX512BW)
       .Case("avx512vl", HasAVX512VL)
+      .Case("avx512vbmi", HasAVX512VBMI)
+      .Case("avx512ifma", HasAVX512IFMA)
       .Case("bmi", HasBMI)
       .Case("bmi2", HasBMI2)
+      .Case("clflushopt", HasCLFLUSHOPT)
+      .Case("clwb", HasCLWB)
       .Case("cx16", HasCX16)
       .Case("f16c", HasF16C)
       .Case("fma", HasFMA)
@@ -3467,12 +3805,18 @@
       .Case("mm3dnow", MMX3DNowLevel >= AMD3DNow)
       .Case("mm3dnowa", MMX3DNowLevel >= AMD3DNowAthlon)
       .Case("mmx", MMX3DNowLevel >= MMX)
+      .Case("movbe", HasMOVBE)
+      .Case("mpx", HasMPX)
       .Case("pclmul", HasPCLMUL)
+      .Case("pcommit", HasPCOMMIT)
+      .Case("pku", HasPKU)
       .Case("popcnt", HasPOPCNT)
+      .Case("prefetchwt1", HasPREFETCHWT1)
       .Case("prfchw", HasPRFCHW)
       .Case("rdrnd", HasRDRND)
       .Case("rdseed", HasRDSEED)
       .Case("rtm", HasRTM)
+      .Case("sgx", HasSGX)
       .Case("sha", HasSHA)
       .Case("sse", SSELevel >= SSE1)
       .Case("sse2", SSELevel >= SSE2)
@@ -3482,6 +3826,7 @@
       .Case("sse4.2", SSELevel >= SSE42)
       .Case("sse4a", XOPLevel >= SSE4A)
       .Case("tbm", HasTBM)
+      .Case("umip", HasUMIP)
       .Case("x86", true)
       .Case("x86_32", getTriple().getArch() == llvm::Triple::x86)
       .Case("x86_64", getTriple().getArch() == llvm::Triple::x86_64)
@@ -3490,7 +3835,6 @@
       .Case("xsavec", HasXSAVEC)
       .Case("xsaves", HasXSAVES)
       .Case("xsaveopt", HasXSAVEOPT)
-      .Case("pku", HasPKU)
       .Default(false);
 }
 
@@ -3507,6 +3851,7 @@
       .Case("sse", true)
       .Case("sse2", true)
       .Case("sse3", true)
+      .Case("ssse3", true)
       .Case("sse4.1", true)
       .Case("sse4.2", true)
       .Case("avx", true)
@@ -3518,6 +3863,16 @@
       .Case("avx512f", true)
       .Case("bmi", true)
       .Case("bmi2", true)
+      .Case("aes", true)
+      .Case("pclmul", true)
+      .Case("avx512vl", true)
+      .Case("avx512bw", true)
+      .Case("avx512dq", true)
+      .Case("avx512cd", true)
+      .Case("avx512er", true)
+      .Case("avx512pf", true)
+      .Case("avx512vbmi", true)
+      .Case("avx512ifma", true)
       .Default(false);
 }
 
@@ -3679,12 +4034,13 @@
 // X86-32 generic target
 class X86_32TargetInfo : public X86TargetInfo {
 public:
-  X86_32TargetInfo(const llvm::Triple &Triple) : X86TargetInfo(Triple) {
+  X86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : X86TargetInfo(Triple, Opts) {
     DoubleAlign = LongLongAlign = 32;
     LongDoubleWidth = 96;
     LongDoubleAlign = 32;
     SuitableAlign = 128;
-    DataLayoutString = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128";
+    resetDataLayout("e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128");
     SizeType = UnsignedInt;
     PtrDiffType = SignedInt;
     IntPtrType = SignedInt;
@@ -3733,8 +4089,8 @@
 
 class NetBSDI386TargetInfo : public NetBSDTargetInfo<X86_32TargetInfo> {
 public:
-  NetBSDI386TargetInfo(const llvm::Triple &Triple)
-      : NetBSDTargetInfo<X86_32TargetInfo>(Triple) {}
+  NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : NetBSDTargetInfo<X86_32TargetInfo>(Triple, Opts) {}
 
   unsigned getFloatEvalMethod() const override {
     unsigned Major, Minor, Micro;
@@ -3749,8 +4105,8 @@
 
 class OpenBSDI386TargetInfo : public OpenBSDTargetInfo<X86_32TargetInfo> {
 public:
-  OpenBSDI386TargetInfo(const llvm::Triple &Triple)
-      : OpenBSDTargetInfo<X86_32TargetInfo>(Triple) {
+  OpenBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OpenBSDTargetInfo<X86_32TargetInfo>(Triple, Opts) {
     SizeType = UnsignedLong;
     IntPtrType = SignedLong;
     PtrDiffType = SignedLong;
@@ -3759,8 +4115,8 @@
 
 class BitrigI386TargetInfo : public BitrigTargetInfo<X86_32TargetInfo> {
 public:
-  BitrigI386TargetInfo(const llvm::Triple &Triple)
-      : BitrigTargetInfo<X86_32TargetInfo>(Triple) {
+  BitrigI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : BitrigTargetInfo<X86_32TargetInfo>(Triple, Opts) {
     SizeType = UnsignedLong;
     IntPtrType = SignedLong;
     PtrDiffType = SignedLong;
@@ -3769,8 +4125,8 @@
 
 class DarwinI386TargetInfo : public DarwinTargetInfo<X86_32TargetInfo> {
 public:
-  DarwinI386TargetInfo(const llvm::Triple &Triple)
-      : DarwinTargetInfo<X86_32TargetInfo>(Triple) {
+  DarwinI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : DarwinTargetInfo<X86_32TargetInfo>(Triple, Opts) {
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     SuitableAlign = 128;
@@ -3781,7 +4137,7 @@
       UseSignedCharForObjCBool = false;
     SizeType = UnsignedLong;
     IntPtrType = SignedLong;
-    DataLayoutString = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128";
+    resetDataLayout("e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128");
     HasAlignMac68kSupport = true;
   }
 
@@ -3800,15 +4156,15 @@
 // x86-32 Windows target
 class WindowsX86_32TargetInfo : public WindowsTargetInfo<X86_32TargetInfo> {
 public:
-  WindowsX86_32TargetInfo(const llvm::Triple &Triple)
-      : WindowsTargetInfo<X86_32TargetInfo>(Triple) {
+  WindowsX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : WindowsTargetInfo<X86_32TargetInfo>(Triple, Opts) {
     WCharType = UnsignedShort;
     DoubleAlign = LongLongAlign = 64;
     bool IsWinCOFF =
         getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
-    DataLayoutString = IsWinCOFF
-                           ? "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
-                           : "e-m:e-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32";
+    resetDataLayout(IsWinCOFF
+                        ? "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+                        : "e-m:e-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -3819,8 +4175,9 @@
 // x86-32 Windows Visual Studio target
 class MicrosoftX86_32TargetInfo : public WindowsX86_32TargetInfo {
 public:
-  MicrosoftX86_32TargetInfo(const llvm::Triple &Triple)
-      : WindowsX86_32TargetInfo(Triple) {
+  MicrosoftX86_32TargetInfo(const llvm::Triple &Triple,
+                            const TargetOptions &Opts)
+      : WindowsX86_32TargetInfo(Triple, Opts) {
     LongDoubleWidth = LongDoubleAlign = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
   }
@@ -3868,8 +4225,8 @@
 // x86-32 MinGW target
 class MinGWX86_32TargetInfo : public WindowsX86_32TargetInfo {
 public:
-  MinGWX86_32TargetInfo(const llvm::Triple &Triple)
-      : WindowsX86_32TargetInfo(Triple) {}
+  MinGWX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : WindowsX86_32TargetInfo(Triple, Opts) {}
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
     WindowsX86_32TargetInfo::getTargetDefines(Opts, Builder);
@@ -3883,11 +4240,11 @@
 // x86-32 Cygwin target
 class CygwinX86_32TargetInfo : public X86_32TargetInfo {
 public:
-  CygwinX86_32TargetInfo(const llvm::Triple &Triple)
-      : X86_32TargetInfo(Triple) {
+  CygwinX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : X86_32TargetInfo(Triple, Opts) {
     WCharType = UnsignedShort;
     DoubleAlign = LongLongAlign = 64;
-    DataLayoutString = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32";
+    resetDataLayout("e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -3903,31 +4260,27 @@
 };
 
 // x86-32 Haiku target
-class HaikuX86_32TargetInfo : public X86_32TargetInfo {
+class HaikuX86_32TargetInfo : public HaikuTargetInfo<X86_32TargetInfo> {
 public:
-  HaikuX86_32TargetInfo(const llvm::Triple &Triple) : X86_32TargetInfo(Triple) {
-    SizeType = UnsignedLong;
-    IntPtrType = SignedLong;
-    PtrDiffType = SignedLong;
-    ProcessIDType = SignedLong;
-    this->UserLabelPrefix = "";
-    this->TLSSupported = false;
+  HaikuX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+    : HaikuTargetInfo<X86_32TargetInfo>(Triple, Opts) {
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
-    X86_32TargetInfo::getTargetDefines(Opts, Builder);
+    HaikuTargetInfo<X86_32TargetInfo>::getTargetDefines(Opts, Builder);
     Builder.defineMacro("__INTEL__");
-    Builder.defineMacro("__HAIKU__");
   }
 };
 
 // X86-32 MCU target
 class MCUX86_32TargetInfo : public X86_32TargetInfo {
 public:
-  MCUX86_32TargetInfo(const llvm::Triple &Triple) : X86_32TargetInfo(Triple) {
+  MCUX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : X86_32TargetInfo(Triple, Opts) {
     LongDoubleWidth = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-    UserLabelPrefix = "";
+    resetDataLayout("e-m:e-p:32:32-i64:32-f64:32-f128:32-n8:16:32-a:0:32-S32");
+    WIntType = UnsignedInt;
   }
 
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
@@ -3941,6 +4294,10 @@
     Builder.defineMacro("__iamcu");
     Builder.defineMacro("__iamcu__");
   }
+
+  bool allowsLargerPreferedTypeAlignment() const override {
+    return false;
+  }
 };
 
 // RTEMS Target
@@ -3956,9 +4313,8 @@
   }
 
 public:
-  RTEMSTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
-    this->UserLabelPrefix = "";
-
+  RTEMSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
     switch (Triple.getArch()) {
     default:
     case llvm::Triple::x86:
@@ -3981,11 +4337,11 @@
 // x86-32 RTEMS target
 class RTEMSX86_32TargetInfo : public X86_32TargetInfo {
 public:
-  RTEMSX86_32TargetInfo(const llvm::Triple &Triple) : X86_32TargetInfo(Triple) {
+  RTEMSX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : X86_32TargetInfo(Triple, Opts) {
     SizeType = UnsignedLong;
     IntPtrType = SignedLong;
     PtrDiffType = SignedLong;
-    this->UserLabelPrefix = "";
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -3998,7 +4354,8 @@
 // x86-64 generic target
 class X86_64TargetInfo : public X86TargetInfo {
 public:
-  X86_64TargetInfo(const llvm::Triple &Triple) : X86TargetInfo(Triple) {
+  X86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : X86TargetInfo(Triple, Opts) {
     const bool IsX32 = getTriple().getEnvironment() == llvm::Triple::GNUX32;
     bool IsWinCOFF =
         getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
@@ -4016,10 +4373,10 @@
     RegParmMax = 6;
 
     // Pointers are 32-bit in x32.
-    DataLayoutString = IsX32 ? "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128"
-                             : IsWinCOFF
-                                   ? "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-                                   : "e-m:e-i64:64-f80:128-n8:16:32:64-S128";
+    resetDataLayout(IsX32
+                        ? "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128"
+                        : IsWinCOFF ? "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+                                    : "e-m:e-i64:64-f80:128-n8:16:32:64-S128");
 
     // Use fpret only for long double.
     RealTypeUsesObjCFPRet = (1 << TargetInfo::LongDouble);
@@ -4065,6 +4422,8 @@
 
   // for x32 we need it here explicitly
   bool hasInt128Type() const override { return true; }
+  unsigned getUnwindWordWidth() const override { return 64; }
+  unsigned getRegisterWidth() const override { return 64; }
 
   bool validateGlobalRegisterVariable(StringRef RegName,
                                       unsigned RegSize,
@@ -4086,8 +4445,8 @@
 // x86-64 Windows target
 class WindowsX86_64TargetInfo : public WindowsTargetInfo<X86_64TargetInfo> {
 public:
-  WindowsX86_64TargetInfo(const llvm::Triple &Triple)
-      : WindowsTargetInfo<X86_64TargetInfo>(Triple) {
+  WindowsX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : WindowsTargetInfo<X86_64TargetInfo>(Triple, Opts) {
     WCharType = UnsignedShort;
     LongWidth = LongAlign = 32;
     DoubleAlign = LongLongAlign = 64;
@@ -4096,7 +4455,6 @@
     SizeType = UnsignedLongLong;
     PtrDiffType = SignedLongLong;
     IntPtrType = SignedLongLong;
-    this->UserLabelPrefix = "";
   }
 
   void getTargetDefines(const LangOptions &Opts,
@@ -4129,8 +4487,9 @@
 // x86-64 Windows Visual Studio target
 class MicrosoftX86_64TargetInfo : public WindowsX86_64TargetInfo {
 public:
-  MicrosoftX86_64TargetInfo(const llvm::Triple &Triple)
-      : WindowsX86_64TargetInfo(Triple) {
+  MicrosoftX86_64TargetInfo(const llvm::Triple &Triple,
+                            const TargetOptions &Opts)
+      : WindowsX86_64TargetInfo(Triple, Opts) {
     LongDoubleWidth = LongDoubleAlign = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
   }
@@ -4146,8 +4505,8 @@
 // x86-64 MinGW target
 class MinGWX86_64TargetInfo : public WindowsX86_64TargetInfo {
 public:
-  MinGWX86_64TargetInfo(const llvm::Triple &Triple)
-      : WindowsX86_64TargetInfo(Triple) {
+  MinGWX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : WindowsX86_64TargetInfo(Triple, Opts) {
     // Mingw64 rounds long double size and alignment up to 16 bytes, but sticks
     // with x86 FP ops. Weird.
     LongDoubleWidth = LongDoubleAlign = 128;
@@ -4170,8 +4529,8 @@
 // x86-64 Cygwin target
 class CygwinX86_64TargetInfo : public X86_64TargetInfo {
 public:
-  CygwinX86_64TargetInfo(const llvm::Triple &Triple)
-      : X86_64TargetInfo(Triple) {
+  CygwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : X86_64TargetInfo(Triple, Opts) {
     TLSSupported = false;
     WCharType = UnsignedShort;
   }
@@ -4194,14 +4553,14 @@
 
 class DarwinX86_64TargetInfo : public DarwinTargetInfo<X86_64TargetInfo> {
 public:
-  DarwinX86_64TargetInfo(const llvm::Triple &Triple)
-      : DarwinTargetInfo<X86_64TargetInfo>(Triple) {
+  DarwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : DarwinTargetInfo<X86_64TargetInfo>(Triple, Opts) {
     Int64Type = SignedLongLong;
     // The 64-bit iOS simulator uses the builtin bool type for Objective-C.
     llvm::Triple T = llvm::Triple(Triple);
     if (T.isiOS())
       UseSignedCharForObjCBool = false;
-    DataLayoutString = "e-m:o-i64:64-f80:128-n8:16:32:64-S128";
+    resetDataLayout("e-m:o-i64:64-f80:128-n8:16:32:64-S128");
   }
 
   bool handleTargetFeatures(std::vector<std::string> &Features,
@@ -4218,8 +4577,8 @@
 
 class OpenBSDX86_64TargetInfo : public OpenBSDTargetInfo<X86_64TargetInfo> {
 public:
-  OpenBSDX86_64TargetInfo(const llvm::Triple &Triple)
-      : OpenBSDTargetInfo<X86_64TargetInfo>(Triple) {
+  OpenBSDX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OpenBSDTargetInfo<X86_64TargetInfo>(Triple, Opts) {
     IntMaxType = SignedLongLong;
     Int64Type = SignedLongLong;
   }
@@ -4227,8 +4586,8 @@
 
 class BitrigX86_64TargetInfo : public BitrigTargetInfo<X86_64TargetInfo> {
 public:
-  BitrigX86_64TargetInfo(const llvm::Triple &Triple)
-      : BitrigTargetInfo<X86_64TargetInfo>(Triple) {
+  BitrigX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : BitrigTargetInfo<X86_64TargetInfo>(Triple, Opts) {
     IntMaxType = SignedLongLong;
     Int64Type = SignedLongLong;
   }
@@ -4340,26 +4699,26 @@
     // Thumb1 add sp, #imm requires the immediate value be multiple of 4,
     // so set preferred for small types to 32.
     if (T.isOSBinFormatMachO()) {
-      DataLayoutString =
-          BigEndian ? "E-m:o-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-                    : "e-m:o-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64";
+      resetDataLayout(BigEndian
+                          ? "E-m:o-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+                          : "e-m:o-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
     } else if (T.isOSWindows()) {
       assert(!BigEndian && "Windows on ARM does not support big endian");
-      DataLayoutString = "e"
-                         "-m:w"
-                         "-p:32:32"
-                         "-i64:64"
-                         "-v128:64:128"
-                         "-a:0:32"
-                         "-n32"
-                         "-S64";
+      resetDataLayout("e"
+                      "-m:w"
+                      "-p:32:32"
+                      "-i64:64"
+                      "-v128:64:128"
+                      "-a:0:32"
+                      "-n32"
+                      "-S64");
     } else if (T.isOSNaCl()) {
       assert(!BigEndian && "NaCl on ARM does not support big endian");
-      DataLayoutString = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S128";
+      resetDataLayout("e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S128");
     } else {
-      DataLayoutString =
-          BigEndian ? "E-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-                    : "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64";
+      resetDataLayout(BigEndian
+                          ? "E-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+                          : "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
     }
 
     // FIXME: Enumerated types are variable width in straight AAPCS.
@@ -4395,17 +4754,17 @@
 
     if (T.isOSBinFormatMachO() && IsAAPCS16) {
       assert(!BigEndian && "AAPCS16 does not support big-endian");
-      DataLayoutString = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128";
+      resetDataLayout("e-m:o-p:32:32-i64:64-a:0:32-n32-S128");
     } else if (T.isOSBinFormatMachO())
-      DataLayoutString =
+      resetDataLayout(
           BigEndian
               ? "E-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
-              : "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32";
+              : "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32");
     else
-      DataLayoutString =
+      resetDataLayout(
           BigEndian
               ? "E-m:e-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
-              : "e-m:e-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32";
+              : "e-m:e-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32");
 
     // FIXME: Override "preferred align" for double and long long.
   }
@@ -4463,7 +4822,8 @@
   }
 
   bool supportsThumb2() const {
-    return CPUAttr.equals("6T2") || ArchVersion >= 7;
+    return CPUAttr.equals("6T2") ||
+           (ArchVersion >= 7 && !CPUAttr.equals("8M_BASE"));
   }
 
   StringRef getCPUAttr() const {
@@ -4488,6 +4848,12 @@
       return "8A";
     case llvm::ARM::AK_ARMV8_1A:
       return "8_1A";
+    case llvm::ARM::AK_ARMV8_2A:
+      return "8_2A";
+    case llvm::ARM::AK_ARMV8MBaseline:
+      return "8M_BASE";
+    case llvm::ARM::AK_ARMV8MMainline:
+      return "8M_MAIN";
     }
   }
 
@@ -4505,9 +4871,10 @@
   }
 
 public:
-  ARMTargetInfo(const llvm::Triple &Triple, bool IsBigEndian)
-      : TargetInfo(Triple), FPMath(FP_Default),
-        IsAAPCS(true), LDREX(0), HW_FP(0) {
+  ARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts,
+                bool IsBigEndian)
+      : TargetInfo(Triple), FPMath(FP_Default), IsAAPCS(true), LDREX(0),
+        HW_FP(0) {
     BigEndian = IsBigEndian;
 
     switch (getTriple().getOS()) {
@@ -4550,6 +4917,8 @@
       case llvm::Triple::Android:
       case llvm::Triple::GNUEABI:
       case llvm::Triple::GNUEABIHF:
+      case llvm::Triple::MuslEABI:
+      case llvm::Triple::MuslEABIHF:
         setABI("aapcs-linux");
         break;
       case llvm::Triple::EABIHF:
@@ -4579,6 +4948,11 @@
     // that follows it, `bar', `bar' will be aligned as the  type of the
     // zero length bitfield.
     UseZeroLengthBitfieldAlignment = true;
+
+    if (Triple.getOS() == llvm::Triple::Linux ||
+        Triple.getOS() == llvm::Triple::UnknownOS)
+      this->MCountName =
+          Opts.EABIVersion == "gnu" ? "\01__gnu_mcount_nc" : "\01mcount";
   }
 
   StringRef getABI() const override { return ABI; }
@@ -4669,7 +5043,7 @@
       } else if (Feature == "+dsp") {
         DSP = 1;
       } else if (Feature == "+fp-only-sp") {
-        HW_FP_remove |= HW_FP_DP; 
+        HW_FP_remove |= HW_FP_DP;
       } else if (Feature == "+strict-align") {
         Unaligned = 0;
       } else if (Feature == "+fp16") {
@@ -4746,6 +5120,10 @@
     // Target identification.
     Builder.defineMacro("__arm");
     Builder.defineMacro("__arm__");
+    // For bare-metal none-eabi.
+    if (getTriple().getOS() == llvm::Triple::UnknownOS &&
+        getTriple().getEnvironment() == llvm::Triple::EABI)
+      Builder.defineMacro("__ELF__");
 
     // Target properties.
     Builder.defineMacro("__REGISTER_PREFIX__", "");
@@ -4777,13 +5155,14 @@
 
     // __ARM_ARCH_ISA_ARM is defined to 1 if the core supports the ARM ISA.  It
     // is not defined for the M-profile.
-    // NOTE that the deffault profile is assumed to be 'A'
-    if (CPUProfile.empty() || CPUProfile != "M")
+    // NOTE that the default profile is assumed to be 'A'
+    if (CPUProfile.empty() || ArchProfile != llvm::ARM::PK_M)
       Builder.defineMacro("__ARM_ARCH_ISA_ARM", "1");
 
-    // __ARM_ARCH_ISA_THUMB is defined to 1 if the core supporst the original
-    // Thumb ISA (including v6-M).  It is set to 2 if the core supports the
-    // Thumb-2 ISA as found in the v6T2 architecture and all v7 architecture.
+    // __ARM_ARCH_ISA_THUMB is defined to 1 if the core supports the original
+    // Thumb ISA (including v6-M and v8-M Baseline).  It is set to 2 if the
+    // core supports the Thumb-2 ISA as found in the v6T2 architecture and all
+    // v7 and v8 architectures excluding v8-M Baseline.
     if (supportsThumb2())
       Builder.defineMacro("__ARM_ARCH_ISA_THUMB", "2");
     else if (supportsThumb())
@@ -4825,7 +5204,7 @@
     Builder.defineMacro("__ARM_FP16_ARGS", "1");
 
     // ACLE 6.5.3 Fused multiply-accumulate (FMA)
-    if (ArchVersion >= 7 && (CPUProfile != "M" || CPUAttr == "7EM"))
+    if (ArchVersion >= 7 && (FPU & VFP4FPU))
       Builder.defineMacro("__ARM_FEATURE_FMA", "1");
 
     // Subtarget options.
@@ -4904,7 +5283,7 @@
     Builder.defineMacro("__ARM_SIZEOF_MINIMAL_ENUM",
                         Opts.ShortEnums ? "1" : "4");
 
-    if (ArchVersion >= 6 && CPUAttr != "6M") {
+    if (ArchVersion >= 6 && CPUAttr != "6M" && CPUAttr != "8M_BASE") {
       Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
       Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
       Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
@@ -4953,8 +5332,8 @@
     default: break;
     case 'l': // r0-r7
     case 'h': // r8-r15
-    case 'w': // VFP Floating point register single precision
-    case 'P': // VFP Floating point register double precision
+    case 't': // VFP Floating point register single precision
+    case 'w': // VFP Floating point register double precision
       Info.setAllowsRegister();
       return true;
     case 'I':
@@ -5135,8 +5514,8 @@
 
 class ARMleTargetInfo : public ARMTargetInfo {
 public:
-  ARMleTargetInfo(const llvm::Triple &Triple)
-    : ARMTargetInfo(Triple, false) { }
+  ARMleTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : ARMTargetInfo(Triple, Opts, /*IsBigEndian=*/false) {}
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
     Builder.defineMacro("__ARMEL__");
@@ -5146,8 +5525,8 @@
 
 class ARMbeTargetInfo : public ARMTargetInfo {
 public:
-  ARMbeTargetInfo(const llvm::Triple &Triple)
-    : ARMTargetInfo(Triple, true) { }
+  ARMbeTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : ARMTargetInfo(Triple, Opts, /*IsBigEndian=*/true) {}
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
     Builder.defineMacro("__ARMEB__");
@@ -5159,12 +5538,10 @@
 class WindowsARMTargetInfo : public WindowsTargetInfo<ARMleTargetInfo> {
   const llvm::Triple Triple;
 public:
-  WindowsARMTargetInfo(const llvm::Triple &Triple)
-    : WindowsTargetInfo<ARMleTargetInfo>(Triple), Triple(Triple) {
-    TLSSupported = false;
+  WindowsARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : WindowsTargetInfo<ARMleTargetInfo>(Triple, Opts), Triple(Triple) {
     WCharType = UnsignedShort;
     SizeType = UnsignedInt;
-    UserLabelPrefix = "";
   }
   void getVisualStudioDefines(const LangOptions &Opts,
                               MacroBuilder &Builder) const {
@@ -5206,8 +5583,9 @@
 // Windows ARM + Itanium C++ ABI Target
 class ItaniumWindowsARMleTargetInfo : public WindowsARMTargetInfo {
 public:
-  ItaniumWindowsARMleTargetInfo(const llvm::Triple &Triple)
-    : WindowsARMTargetInfo(Triple) {
+  ItaniumWindowsARMleTargetInfo(const llvm::Triple &Triple,
+                                const TargetOptions &Opts)
+      : WindowsARMTargetInfo(Triple, Opts) {
     TheCXXABI.set(TargetCXXABI::GenericARM);
   }
 
@@ -5223,8 +5601,9 @@
 // Windows ARM, MS (C++) ABI
 class MicrosoftARMleTargetInfo : public WindowsARMTargetInfo {
 public:
-  MicrosoftARMleTargetInfo(const llvm::Triple &Triple)
-    : WindowsARMTargetInfo(Triple) {
+  MicrosoftARMleTargetInfo(const llvm::Triple &Triple,
+                           const TargetOptions &Opts)
+      : WindowsARMTargetInfo(Triple, Opts) {
     TheCXXABI.set(TargetCXXABI::Microsoft);
   }
 
@@ -5238,8 +5617,8 @@
 // ARM MinGW target
 class MinGWARMTargetInfo : public WindowsARMTargetInfo {
 public:
-  MinGWARMTargetInfo(const llvm::Triple &Triple)
-      : WindowsARMTargetInfo(Triple) {
+  MinGWARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : WindowsARMTargetInfo(Triple, Opts) {
     TheCXXABI.set(TargetCXXABI::GenericARM);
   }
 
@@ -5256,11 +5635,12 @@
 // ARM Cygwin target
 class CygwinARMTargetInfo : public ARMleTargetInfo {
 public:
-  CygwinARMTargetInfo(const llvm::Triple &Triple) : ARMleTargetInfo(Triple) {
+  CygwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : ARMleTargetInfo(Triple, Opts) {
     TLSSupported = false;
     WCharType = UnsignedShort;
     DoubleAlign = LongLongAlign = 64;
-    DataLayoutString = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64";
+    resetDataLayout("e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -5274,8 +5654,7 @@
   }
 };
 
-class DarwinARMTargetInfo :
-  public DarwinTargetInfo<ARMleTargetInfo> {
+class DarwinARMTargetInfo : public DarwinTargetInfo<ARMleTargetInfo> {
 protected:
   void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
                     MacroBuilder &Builder) const override {
@@ -5283,8 +5662,8 @@
   }
 
 public:
-  DarwinARMTargetInfo(const llvm::Triple &Triple)
-      : DarwinTargetInfo<ARMleTargetInfo>(Triple) {
+  DarwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : DarwinTargetInfo<ARMleTargetInfo>(Triple, Opts) {
     HasAlignMac68kSupport = true;
     // iOS always has 64-bit atomic instructions.
     // FIXME: This should be based off of the target features in
@@ -5307,7 +5686,7 @@
 };
 
 class AArch64TargetInfo : public TargetInfo {
-  virtual void setDataLayoutString() = 0;
+  virtual void setDataLayout() = 0;
   static const TargetInfo::GCCRegAlias GCCRegAliases[];
   static const char *const GCCRegNames[];
 
@@ -5327,9 +5706,8 @@
   std::string ABI;
 
 public:
-  AArch64TargetInfo(const llvm::Triple &Triple)
+  AArch64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
       : TargetInfo(Triple), ABI("aapcs") {
-
     if (getTriple().getOS() == llvm::Triple::NetBSD) {
       WCharType = SignedInt;
 
@@ -5364,6 +5742,10 @@
 
     // AArch64 targets default to using the ARM C++ ABI.
     TheCXXABI.set(TargetCXXABI::GenericAArch64);
+
+    if (Triple.getOS() == llvm::Triple::Linux ||
+        Triple.getOS() == llvm::Triple::UnknownOS)
+      this->MCountName = Opts.EABIVersion == "gnu" ? "\01_mcount" : "mcount";
   }
 
   StringRef getABI() const override { return ABI; }
@@ -5376,13 +5758,9 @@
   }
 
   bool setCPU(const std::string &Name) override {
-    bool CPUKnown = llvm::StringSwitch<bool>(Name)
-                        .Case("generic", true)
-                        .Cases("cortex-a53", "cortex-a57", "cortex-a72",
-                               "cortex-a35", "exynos-m1", true)
-                        .Case("cyclone", true)
-                        .Default(false);
-    return CPUKnown;
+    return Name == "generic" ||
+           llvm::AArch64::parseCPUArch(Name) !=
+           static_cast<unsigned>(llvm::AArch64::ArchKind::AK_INVALID);
   }
 
   void getTargetDefines(const LangOptions &Opts,
@@ -5487,7 +5865,7 @@
         V8_1A = 1;
     }
 
-    setDataLayoutString();
+    setDataLayout();
 
     return true;
   }
@@ -5649,18 +6027,18 @@
 };
 
 class AArch64leTargetInfo : public AArch64TargetInfo {
-  void setDataLayoutString() override {
+  void setDataLayout() override {
     if (getTriple().isOSBinFormatMachO())
-      DataLayoutString = "e-m:o-i64:64-i128:128-n32:64-S128";
+      resetDataLayout("e-m:o-i64:64-i128:128-n32:64-S128");
     else
-      DataLayoutString = "e-m:e-i64:64-i128:128-n32:64-S128";
+      resetDataLayout("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
   }
 
 public:
-  AArch64leTargetInfo(const llvm::Triple &Triple)
-    : AArch64TargetInfo(Triple) {
+  AArch64leTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : AArch64TargetInfo(Triple, Opts) {
     BigEndian = false;
-    }
+  }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
     Builder.defineMacro("__AARCH64EL__");
@@ -5669,14 +6047,14 @@
 };
 
 class AArch64beTargetInfo : public AArch64TargetInfo {
-  void setDataLayoutString() override {
+  void setDataLayout() override {
     assert(!getTriple().isOSBinFormatMachO());
-    DataLayoutString = "E-m:e-i64:64-i128:128-n32:64-S128";
+    resetDataLayout("E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
   }
 
 public:
-  AArch64beTargetInfo(const llvm::Triple &Triple)
-    : AArch64TargetInfo(Triple) { }
+  AArch64beTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : AArch64TargetInfo(Triple, Opts) {}
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
     Builder.defineMacro("__AARCH64EB__");
@@ -5702,8 +6080,8 @@
   }
 
 public:
-  DarwinAArch64TargetInfo(const llvm::Triple &Triple)
-      : DarwinTargetInfo<AArch64leTargetInfo>(Triple) {
+  DarwinAArch64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : DarwinTargetInfo<AArch64leTargetInfo>(Triple, Opts) {
     Int64Type = SignedLongLong;
     WCharType = SignedInt;
     UseSignedCharForObjCBool = false;
@@ -5728,11 +6106,15 @@
   bool HasHVX, HasHVXDouble;
 
 public:
-  HexagonTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  HexagonTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     BigEndian = false;
-    DataLayoutString = "e-m:e-p:32:32:32-"
-                       "i64:64:64-i32:32:32-i16:16:16-i1:8:8-"
-                       "f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32";
+    // Specify the vector alignment explicitly. For v512x1, the calculated
+    // alignment would be 512*alignment(i1), which is 512 bytes, instead of
+    // the required minimum of 64 bytes.
+    resetDataLayout("e-m:e-p:32:32:32-a:0-n16:32-"
+        "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
+        "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048");
     SizeType    = UnsignedInt;
     PtrDiffType = SignedInt;
     IntPtrType  = SignedInt;
@@ -5755,7 +6137,19 @@
 
   bool validateAsmConstraint(const char *&Name,
                              TargetInfo::ConstraintInfo &Info) const override {
-    return true;
+    switch (*Name) {
+      case 'v':
+      case 'q':
+        if (HasHVX) {
+          Info.setAllowsRegister();
+          return true;
+        }
+        break;
+      case 's':
+        // Relocatable constant.
+        return true;
+    }
+    return false;
   }
 
   void getTargetDefines(const LangOptions &Opts,
@@ -5827,12 +6221,23 @@
       Builder.defineMacro("__QDSP6_V5__");
       Builder.defineMacro("__QDSP6_ARCH__", "5");
     }
+  } else if (CPU == "hexagonv55") {
+    Builder.defineMacro("__HEXAGON_V55__");
+    Builder.defineMacro("__HEXAGON_ARCH__", "55");
+    Builder.defineMacro("__QDSP6_V55__");
+    Builder.defineMacro("__QDSP6_ARCH__", "55");
   } else if (CPU == "hexagonv60") {
     Builder.defineMacro("__HEXAGON_V60__");
     Builder.defineMacro("__HEXAGON_ARCH__", "60");
     Builder.defineMacro("__QDSP6_V60__");
     Builder.defineMacro("__QDSP6_ARCH__", "60");
   }
+
+  if (hasFeature("hvx")) {
+    Builder.defineMacro("__HVX__");
+    if (hasFeature("hvx-double"))
+      Builder.defineMacro("__HVXDBL__");
+  }
 }
 
 bool HexagonTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
@@ -5893,23 +6298,133 @@
 #include "clang/Basic/BuiltinsHexagon.def"
 };
 
+class LanaiTargetInfo : public TargetInfo {
+  // Class for Lanai (32-bit).
+  // The CPU profiles supported by the Lanai backend
+  enum CPUKind {
+    CK_NONE,
+    CK_V11,
+  } CPU;
+
+  static const TargetInfo::GCCRegAlias GCCRegAliases[];
+  static const char *const GCCRegNames[];
+
+public:
+  LanaiTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
+    // Description string has to be kept in sync with backend.
+    resetDataLayout("E"        // Big endian
+                    "-m:e"     // ELF name mangling
+                    "-p:32:32" // 32 bit pointers, 32 bit aligned
+                    "-i64:64"  // 64 bit integers, 64 bit aligned
+                    "-a:0:32"  // 32 bit alignment of objects of aggregate type
+                    "-n32"     // 32 bit native integer width
+                    "-S64"     // 64 bit natural stack alignment
+                    );
+
+    // Setting RegParmMax equal to what mregparm was set to in the old
+    // toolchain
+    RegParmMax = 4;
+
+    // Set the default CPU to V11
+    CPU = CK_V11;
+
+    // Temporary approach to make everything at least word-aligned and allow for
+    // safely casting between pointers with different alignment requirements.
+    // TODO: Remove this when there are no more cast align warnings on the
+    // firmware.
+    MinGlobalAlign = 32;
+  }
+
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override {
+    // Define __lanai__ when building for target lanai.
+    Builder.defineMacro("__lanai__");
+
+    // Set define for the CPU specified.
+    switch (CPU) {
+    case CK_V11:
+      Builder.defineMacro("__LANAI_V11__");
+      break;
+    case CK_NONE:
+      llvm_unreachable("Unhandled target CPU");
+    }
+  }
+
+  bool setCPU(const std::string &Name) override {
+    CPU = llvm::StringSwitch<CPUKind>(Name)
+              .Case("v11", CK_V11)
+              .Default(CK_NONE);
+
+    return CPU != CK_NONE;
+  }
+
+  bool hasFeature(StringRef Feature) const override {
+    return llvm::StringSwitch<bool>(Feature).Case("lanai", true).Default(false);
+  }
+
+  ArrayRef<const char *> getGCCRegNames() const override;
+
+  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
+
+  BuiltinVaListKind getBuiltinVaListKind() const override {
+    return TargetInfo::VoidPtrBuiltinVaList;
+  }
+
+  ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }
+
+  bool validateAsmConstraint(const char *&Name,
+                             TargetInfo::ConstraintInfo &info) const override {
+    return false;
+  }
+
+  const char *getClobbers() const override { return ""; }
+};
+
+const char *const LanaiTargetInfo::GCCRegNames[] = {
+    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",  "r8",  "r9",  "r10",
+    "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21",
+    "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"};
+
+ArrayRef<const char *> LanaiTargetInfo::getGCCRegNames() const {
+  return llvm::makeArrayRef(GCCRegNames);
+}
+
+const TargetInfo::GCCRegAlias LanaiTargetInfo::GCCRegAliases[] = {
+    {{"pc"}, "r2"},
+    {{"sp"}, "r4"},
+    {{"fp"}, "r5"},
+    {{"rv"}, "r8"},
+    {{"rr1"}, "r10"},
+    {{"rr2"}, "r11"},
+    {{"rca"}, "r15"},
+};
+
+ArrayRef<TargetInfo::GCCRegAlias> LanaiTargetInfo::getGCCRegAliases() const {
+  return llvm::makeArrayRef(GCCRegAliases);
+}
+
 // Shared base class for SPARC v8 (32-bit) and SPARC v9 (64-bit).
 class SparcTargetInfo : public TargetInfo {
   static const TargetInfo::GCCRegAlias GCCRegAliases[];
   static const char * const GCCRegNames[];
   bool SoftFloat;
 public:
-  SparcTargetInfo(const llvm::Triple &Triple)
+  SparcTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
       : TargetInfo(Triple), SoftFloat(false) {}
 
+  int getEHDataRegisterNumber(unsigned RegNo) const override {
+    if (RegNo == 0) return 24;
+    if (RegNo == 1) return 25;
+    return -1;
+  }
+
   bool handleTargetFeatures(std::vector<std::string> &Features,
                             DiagnosticsEngine &Diags) override {
-    // The backend doesn't actually handle soft float yet, but in case someone
-    // is using the support for the front end continue to support it.
+    // Check if software floating point is enabled
     auto Feature = std::find(Features.begin(), Features.end(), "+soft-float");
     if (Feature != Features.end()) {
       SoftFloat = true;
-      Features.erase(Feature);
     }
     return true;
   }
@@ -5929,6 +6444,10 @@
              .Default(false);
   }
 
+  bool hasSjLjLowering() const override {
+    return true;
+  }
+
   ArrayRef<Builtin::Info> getTargetBuiltins() const override {
     // FIXME: Implement!
     return None;
@@ -5975,7 +6494,18 @@
     CK_NIAGARA,
     CK_NIAGARA2,
     CK_NIAGARA3,
-    CK_NIAGARA4
+    CK_NIAGARA4,
+    CK_MYRIAD2100,
+    CK_MYRIAD2150,
+    CK_MYRIAD2450,
+    CK_LEON2,
+    CK_LEON2_AT697E,
+    CK_LEON2_AT697F,
+    CK_LEON3,
+    CK_LEON3_UT699,
+    CK_LEON3_GR712RC,
+    CK_LEON4,
+    CK_LEON4_GR740
   } CPU = CK_GENERIC;
 
   enum CPUGeneration {
@@ -5994,6 +6524,17 @@
     case CK_SPARCLITE86X:
     case CK_SPARCLET:
     case CK_TSC701:
+    case CK_MYRIAD2100:
+    case CK_MYRIAD2150:
+    case CK_MYRIAD2450:
+    case CK_LEON2:
+    case CK_LEON2_AT697E:
+    case CK_LEON2_AT697F:
+    case CK_LEON3:
+    case CK_LEON3_UT699:
+    case CK_LEON3_GR712RC:
+    case CK_LEON4:
+    case CK_LEON4_GR740:
       return CG_V8;
     case CK_V9:
     case CK_ULTRASPARC:
@@ -6024,6 +6565,22 @@
         .Case("niagara2", CK_NIAGARA2)
         .Case("niagara3", CK_NIAGARA3)
         .Case("niagara4", CK_NIAGARA4)
+        .Case("ma2100", CK_MYRIAD2100)
+        .Case("ma2150", CK_MYRIAD2150)
+        .Case("ma2450", CK_MYRIAD2450)
+        // FIXME: the myriad2[.n] spellings are obsolete,
+        // but a grace period is needed to allow updating dependent builds.
+        .Case("myriad2", CK_MYRIAD2100)
+        .Case("myriad2.1", CK_MYRIAD2100)
+        .Case("myriad2.2", CK_MYRIAD2150)
+        .Case("leon2", CK_LEON2)
+        .Case("at697e", CK_LEON2_AT697E)
+        .Case("at697f", CK_LEON2_AT697F)
+        .Case("leon3", CK_LEON3)
+        .Case("ut699", CK_LEON3_UT699)
+        .Case("gr712rc", CK_LEON3_GR712RC)
+        .Case("leon4", CK_LEON4)
+        .Case("gr740", CK_LEON4_GR740)
         .Default(CK_GENERIC);
   }
 
@@ -6086,8 +6643,9 @@
 // SPARC v8 is the 32-bit mode selected by Triple::sparc.
 class SparcV8TargetInfo : public SparcTargetInfo {
 public:
-  SparcV8TargetInfo(const llvm::Triple &Triple) : SparcTargetInfo(Triple) {
-    DataLayoutString = "E-m:e-p:32:32-i64:64-f128:64-n32-S64";
+  SparcV8TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : SparcTargetInfo(Triple, Opts) {
+    resetDataLayout("E-m:e-p:32:32-i64:64-f128:64-n32-S64");
     // NetBSD / OpenBSD use long (same as llvm default); everyone else uses int.
     switch (getTriple().getOS()) {
     default:
@@ -6102,6 +6660,7 @@
       PtrDiffType = SignedLong;
       break;
     }
+    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
   }
 
   void getTargetDefines(const LangOptions &Opts,
@@ -6121,24 +6680,53 @@
       }
       break;
     }
+    if (getTriple().getVendor() == llvm::Triple::Myriad) {
+      std::string MyriadArchValue, Myriad2Value;
+      Builder.defineMacro("__sparc_v8__");
+      Builder.defineMacro("__leon__");
+      switch (CPU) {
+      case CK_MYRIAD2150:
+        MyriadArchValue = "__ma2150";
+        Myriad2Value = "2";
+        break;
+      case CK_MYRIAD2450:
+        MyriadArchValue = "__ma2450";
+        Myriad2Value = "2";
+        break;
+      default:
+        MyriadArchValue = "__ma2100";
+        Myriad2Value = "1";
+        break;
+      }
+      Builder.defineMacro(MyriadArchValue, "1");
+      Builder.defineMacro(MyriadArchValue+"__", "1");
+      Builder.defineMacro("__myriad2__", Myriad2Value);
+      Builder.defineMacro("__myriad2", Myriad2Value);
+    }
+  }
+
+  bool hasSjLjLowering() const override {
+    return true;
   }
 };
 
 // SPARCV8el is the 32-bit little-endian mode selected by Triple::sparcel.
 class SparcV8elTargetInfo : public SparcV8TargetInfo {
  public:
-  SparcV8elTargetInfo(const llvm::Triple &Triple) : SparcV8TargetInfo(Triple) {
-    DataLayoutString = "e-m:e-p:32:32-i64:64-f128:64-n32-S64";
-    BigEndian = false;
+   SparcV8elTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+       : SparcV8TargetInfo(Triple, Opts) {
+     resetDataLayout("e-m:e-p:32:32-i64:64-f128:64-n32-S64");
+     BigEndian = false;
   }
 };
 
 // SPARC v9 is the 64-bit mode selected by Triple::sparcv9.
 class SparcV9TargetInfo : public SparcTargetInfo {
 public:
-  SparcV9TargetInfo(const llvm::Triple &Triple) : SparcTargetInfo(Triple) {
+  SparcV9TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : SparcTargetInfo(Triple, Opts) {
     // FIXME: Support Sparc quad-precision long double?
-    DataLayoutString = "E-m:e-i64:64-n32:64-S128";
+    resetDataLayout("E-m:e-i64:64-n32:64-S128");
     // This is an LP64 platform.
     LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
 
@@ -6185,7 +6773,7 @@
   bool HasVector;
 
 public:
-  SystemZTargetInfo(const llvm::Triple &Triple)
+  SystemZTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
       : TargetInfo(Triple), CPU("z10"), HasTransactionalExecution(false),
         HasVector(false) {
     IntMaxType = SignedLong;
@@ -6199,7 +6787,7 @@
     LongDoubleFormat = &llvm::APFloat::IEEEquad;
     DefaultAlignForAttributeAligned = 64;
     MinGlobalAlign = 16;
-    DataLayoutString = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64";
+    resetDataLayout("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64");
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
   }
   void getTargetDefines(const LangOptions &Opts,
@@ -6208,6 +6796,12 @@
     Builder.defineMacro("__s390x__");
     Builder.defineMacro("__zarch__");
     Builder.defineMacro("__LONG_DOUBLE_128__");
+
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
+
     if (HasTransactionalExecution)
       Builder.defineMacro("__HTM__");
     if (Opts.ZVector)
@@ -6268,8 +6862,8 @@
     // If we use the vector ABI, vector types are 64-bit aligned.
     if (HasVector) {
       MaxVectorAlign = 64;
-      DataLayoutString = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64"
-                         "-v128:64-a:8:16-n32:64";
+      resetDataLayout("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64"
+                      "-v128:64-a:8:16-n32:64");
     }
     return true;
   }
@@ -6306,6 +6900,8 @@
 const Builtin::Info SystemZTargetInfo::BuiltinInfo[] = {
 #define BUILTIN(ID, TYPE, ATTRS)                                               \
   { #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
+#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
+  { #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
 #include "clang/Basic/BuiltinsSystemZ.def"
 };
 
@@ -6353,7 +6949,8 @@
   static const char *const GCCRegNames[];
 
 public:
-  MSP430TargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  MSP430TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     BigEndian = false;
     TLSSupported = false;
     IntWidth = 16;
@@ -6369,7 +6966,7 @@
     IntPtrType = SignedInt;
     PtrDiffType = SignedInt;
     SigAtomicType = SignedLong;
-    DataLayoutString = "e-m:e-p:16:16-i32:16:32-a:16-n8:16";
+    resetDataLayout("e-m:e-p:16:16-i32:16:32-a:16-n8:16");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -6440,7 +7037,8 @@
 
 class TCETargetInfo : public TargetInfo {
 public:
-  TCETargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  TCETargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     TLSSupported = false;
     IntWidth = 32;
     LongWidth = LongLongWidth = 32;
@@ -6462,8 +7060,8 @@
     FloatFormat = &llvm::APFloat::IEEEsingle;
     DoubleFormat = &llvm::APFloat::IEEEsingle;
     LongDoubleFormat = &llvm::APFloat::IEEEsingle;
-    DataLayoutString = "E-p:32:32-i8:8:32-i16:16:32-i64:32"
-                       "-f64:32-v64:32-v128:32-a:0:32-n32";
+    resetDataLayout("E-p:32:32-i8:8:32-i16:16:32-i64:32"
+                    "-f64:32-v64:32-v128:32-a:0:32-n32");
     AddrSpaceMap = &TCEOpenCLAddrSpaceMap;
     UseAddrSpaceMapMangling = true;
   }
@@ -6493,7 +7091,8 @@
 
 class BPFTargetInfo : public TargetInfo {
 public:
-  BPFTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  BPFTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
     SizeType    = UnsignedLong;
     PtrDiffType = SignedLong;
@@ -6503,10 +7102,10 @@
     RegParmMax = 5;
     if (Triple.getArch() == llvm::Triple::bpfeb) {
       BigEndian = true;
-      DataLayoutString = "E-m:e-p:64:64-i64:64-n32:64-S128";
+      resetDataLayout("E-m:e-p:64:64-i64:64-n32:64-S128");
     } else {
       BigEndian = false;
-      DataLayoutString = "e-m:e-p:64:64-i64:64-n32:64-S128";
+      resetDataLayout("e-m:e-p:64:64-i64:64-n32:64-S128");
     }
     MaxAtomicPromoteWidth = 64;
     MaxAtomicInlineWidth = 64;
@@ -6540,8 +7139,25 @@
   }
 };
 
-class MipsTargetInfoBase : public TargetInfo {
-  virtual void setDataLayoutString() = 0;
+class MipsTargetInfo : public TargetInfo {
+  // Recompute the data layout string for the currently selected ABI and
+  // byte order, and install it via resetDataLayout().
+  void setDataLayout() {
+    StringRef ABILayout;
+    if (ABI == "o32")
+      ABILayout = "m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64";
+    else if (ABI == "n32")
+      ABILayout = "m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128";
+    else if (ABI == "n64")
+      ABILayout = "m:e-i8:8:32-i16:16:32-i64:64-n32:64-S128";
+    else
+      llvm_unreachable("Invalid ABI");
+
+    // Only the leading endianness marker depends on the byte order.
+    const char *EndianPrefix = BigEndian ? "E-" : "e-";
+    resetDataLayout((EndianPrefix + ABILayout).str());
+  }
+
 
   static const Builtin::Info BuiltinInfo[];
   std::string CPU;
@@ -6562,12 +7178,20 @@
   std::string ABI;
 
 public:
-  MipsTargetInfoBase(const llvm::Triple &Triple, const std::string &ABIStr,
-                     const std::string &CPUStr)
-      : TargetInfo(Triple), CPU(CPUStr), IsMips16(false), IsMicromips(false),
+  MipsTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple), IsMips16(false), IsMicromips(false),
         IsNan2008(false), IsSingleFloat(false), FloatABI(HardFloat),
-        DspRev(NoDSP), HasMSA(false), HasFP64(false), ABI(ABIStr) {
+        DspRev(NoDSP), HasMSA(false), HasFP64(false) {
     TheCXXABI.set(TargetCXXABI::GenericMIPS);
+    BigEndian = getTriple().getArch() == llvm::Triple::mips ||
+                getTriple().getArch() == llvm::Triple::mips64;
+
+    setABI((getTriple().getArch() == llvm::Triple::mips ||
+            getTriple().getArch() == llvm::Triple::mipsel)
+               ? "o32"
+               : "n64");
+
+    CPU = ABI == "o32" ? "mips32r2" : "mips64r2";
   }
 
   bool isNaN2008Default() const {
@@ -6582,22 +7206,99 @@
     return IsNan2008;
   }
 
-  StringRef getABI() const override { return ABI; }
-  bool setCPU(const std::string &Name) override {
-    bool IsMips32 = getTriple().getArch() == llvm::Triple::mips ||
-                    getTriple().getArch() == llvm::Triple::mipsel;
-    CPU = Name;
-    return llvm::StringSwitch<bool>(Name)
-        .Case("mips1", IsMips32)
-        .Case("mips2", IsMips32)
+  // Returns true if the selected CPU is one of the 64-bit processors below.
+  bool processorSupportsGPR64() const {
+    return llvm::StringSwitch<bool>(CPU)
         .Case("mips3", true)
         .Case("mips4", true)
         .Case("mips5", true)
-        .Case("mips32", IsMips32)
-        .Case("mips32r2", IsMips32)
-        .Case("mips32r3", IsMips32)
-        .Case("mips32r5", IsMips32)
-        .Case("mips32r6", IsMips32)
+        .Case("mips64", true)
+        .Case("mips64r2", true)
+        .Case("mips64r3", true)
+        .Case("mips64r5", true)
+        .Case("mips64r6", true)
+        .Case("octeon", true)
+        .Default(false);
+  }
+
+  StringRef getABI() const override { return ABI; }
+  // Select the MIPS ABI named by Name ("o32", "n32" or "n64").
+  //
+  // A recognised name first installs that ABI's type widths and alignments
+  // via the matching set*ABITypes() helper and is then recorded in ABI.
+  // Returns false, changing nothing, for any other name.
+  bool setABI(const std::string &Name) override {
+    if (Name == "o32")
+      setO32ABITypes();
+    else if (Name == "n32")
+      setN32ABITypes();
+    else if (Name == "n64")
+      setN64ABITypes();
+    else
+      return false;
+
+    // The type model is in place; remember which ABI it belongs to.
+    ABI = Name;
+    return true;
+  }
+
+  // O32: 32-bit long and pointers, 64-bit long double, 32-bit atomics.
+  void setO32ABITypes() {
+    PointerWidth = PointerAlign = 32;
+    LongWidth = LongAlign = 32;
+    SizeType = UnsignedInt;
+    PtrDiffType = SignedInt;
+    IntMaxType = Int64Type = SignedLongLong;
+    LongDoubleWidth = LongDoubleAlign = 64;
+    LongDoubleFormat = &llvm::APFloat::IEEEdouble;
+    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
+    SuitableAlign = 64;
+  }
+
+  // Settings shared by N32 and N64: 128-bit IEEE quad long double (64-bit
+  // IEEE double when the OS is FreeBSD), 64-bit atomics, 128-bit alignment.
+  void setN32N64ABITypes() {
+    const bool IsFreeBSD = getTriple().getOS() == llvm::Triple::FreeBSD;
+    LongDoubleWidth = LongDoubleAlign = IsFreeBSD ? 64 : 128;
+    LongDoubleFormat =
+        IsFreeBSD ? &llvm::APFloat::IEEEdouble : &llvm::APFloat::IEEEquad;
+    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
+    SuitableAlign = 128;
+  }
+
+  // N64: the shared N32/N64 settings plus 64-bit long and pointers.
+  void setN64ABITypes() {
+    setN32N64ABITypes();
+    PointerWidth = PointerAlign = 64;
+    LongWidth = LongAlign = 64;
+    SizeType = UnsignedLong;
+    PtrDiffType = SignedLong;
+    IntMaxType = Int64Type = SignedLong;
+  }
+
+  // N32: the shared N32/N64 settings, but long and pointers stay 32-bit.
+  void setN32ABITypes() {
+    setN32N64ABITypes();
+    PointerWidth = PointerAlign = 32;
+    LongWidth = LongAlign = 32;
+    SizeType = UnsignedInt;
+    PtrDiffType = SignedInt;
+    IntMaxType = Int64Type = SignedLongLong;
+  }
+
+  bool setCPU(const std::string &Name) override {
+    CPU = Name;
+    return llvm::StringSwitch<bool>(Name)
+        .Case("mips1", true)
+        .Case("mips2", true)
+        .Case("mips3", true)
+        .Case("mips4", true)
+        .Case("mips5", true)
+        .Case("mips32", true)
+        .Case("mips32r2", true)
+        .Case("mips32r3", true)
+        .Case("mips32r5", true)
+        .Case("mips32r6", true)
         .Case("mips64", true)
         .Case("mips64r2", true)
         .Case("mips64r3", true)
@@ -6612,6 +7313,8 @@
   initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
                  StringRef CPU,
                  const std::vector<std::string> &FeaturesVec) const override {
+    if (CPU.empty())
+      CPU = getCPU();
     if (CPU == "octeon")
       Features["mips64r2"] = Features["cnmips"] = true;
     else
@@ -6621,11 +7324,54 @@
 
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
+    if (BigEndian) {
+      DefineStd(Builder, "MIPSEB", Opts);
+      Builder.defineMacro("_MIPSEB");
+    } else {
+      DefineStd(Builder, "MIPSEL", Opts);
+      Builder.defineMacro("_MIPSEL");
+    }
+
     Builder.defineMacro("__mips__");
     Builder.defineMacro("_mips");
     if (Opts.GNUMode)
       Builder.defineMacro("mips");
 
+    if (ABI == "o32") {
+      Builder.defineMacro("__mips", "32");
+      Builder.defineMacro("_MIPS_ISA", "_MIPS_ISA_MIPS32");
+    } else {
+      Builder.defineMacro("__mips", "64");
+      Builder.defineMacro("__mips64");
+      Builder.defineMacro("__mips64__");
+      Builder.defineMacro("_MIPS_ISA", "_MIPS_ISA_MIPS64");
+    }
+
+    const std::string ISARev = llvm::StringSwitch<std::string>(getCPU())
+                                   .Cases("mips32", "mips64", "1")
+                                   .Cases("mips32r2", "mips64r2", "2")
+                                   .Cases("mips32r3", "mips64r3", "3")
+                                   .Cases("mips32r5", "mips64r5", "5")
+                                   .Cases("mips32r6", "mips64r6", "6")
+                                   .Default("");
+    if (!ISARev.empty())
+      Builder.defineMacro("__mips_isa_rev", ISARev);
+
+    if (ABI == "o32") {
+      Builder.defineMacro("__mips_o32");
+      Builder.defineMacro("_ABIO32", "1");
+      Builder.defineMacro("_MIPS_SIM", "_ABIO32");
+    } else if (ABI == "n32") {
+      Builder.defineMacro("__mips_n32");
+      Builder.defineMacro("_ABIN32", "2");
+      Builder.defineMacro("_MIPS_SIM", "_ABIN32");
+    } else if (ABI == "n64") {
+      Builder.defineMacro("__mips_n64");
+      Builder.defineMacro("_ABI64", "3");
+      Builder.defineMacro("_MIPS_SIM", "_ABI64");
+    } else
+      llvm_unreachable("Invalid ABI.");
+
     Builder.defineMacro("__REGISTER_PREFIX__", "");
 
     switch (FloatABI) {
@@ -6682,6 +7428,13 @@
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
+
+    // 32-bit MIPS processors don't have the necessary lld/scd instructions
+    // found in 64-bit processors. In the case of O32 on a 64-bit processor,
+    // the instructions exist but using them violates the ABI since they
+    // require 64-bit GPRs and O32 only supports 32-bit GPRs.
+    if (ABI == "n32" || ABI == "n64")
+      Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
   }
 
   ArrayRef<Builtin::Info> getTargetBuiltins() const override {
@@ -6712,7 +7465,8 @@
       "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31",
       // Hi/lo and condition register names
       "hi",   "lo",   "",     "$fcc0","$fcc1","$fcc2","$fcc3","$fcc4",
-      "$fcc5","$fcc6","$fcc7",
+      "$fcc5","$fcc6","$fcc7","$ac1hi","$ac1lo","$ac2hi","$ac2lo",
+      "$ac3hi","$ac3lo",
       // MSA register names
       "$w0",  "$w1",  "$w2",  "$w3",  "$w4",  "$w5",  "$w6",  "$w7",
       "$w8",  "$w9",  "$w10", "$w11", "$w12", "$w13", "$w14", "$w15",
@@ -6724,7 +7478,6 @@
     };
     return llvm::makeArrayRef(GCCRegNames);
   }
-  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override = 0;
   bool validateAsmConstraint(const char *&Name,
                              TargetInfo::ConstraintInfo &Info) const override {
     switch (*Name) {
@@ -6835,7 +7588,7 @@
         IsNan2008 = false;
     }
 
-    setDataLayoutString();
+    setDataLayout();
 
     return true;
   }
@@ -6847,9 +7600,82 @@
   }
 
   bool isCLZForZeroUndef() const override { return false; }
+
+  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
+    static const TargetInfo::GCCRegAlias O32RegAliases[] = {
+        {{"at"}, "$1"},  {{"v0"}, "$2"},         {{"v1"}, "$3"},
+        {{"a0"}, "$4"},  {{"a1"}, "$5"},         {{"a2"}, "$6"},
+        {{"a3"}, "$7"},  {{"t0"}, "$8"},         {{"t1"}, "$9"},
+        {{"t2"}, "$10"}, {{"t3"}, "$11"},        {{"t4"}, "$12"},
+        {{"t5"}, "$13"}, {{"t6"}, "$14"},        {{"t7"}, "$15"},
+        {{"s0"}, "$16"}, {{"s1"}, "$17"},        {{"s2"}, "$18"},
+        {{"s3"}, "$19"}, {{"s4"}, "$20"},        {{"s5"}, "$21"},
+        {{"s6"}, "$22"}, {{"s7"}, "$23"},        {{"t8"}, "$24"},
+        {{"t9"}, "$25"}, {{"k0"}, "$26"},        {{"k1"}, "$27"},
+        {{"gp"}, "$28"}, {{"sp", "$sp"}, "$29"}, {{"fp", "$fp"}, "$30"},
+        {{"ra"}, "$31"}};
+    static const TargetInfo::GCCRegAlias NewABIRegAliases[] = {
+        {{"at"}, "$1"},  {{"v0"}, "$2"},         {{"v1"}, "$3"},
+        {{"a0"}, "$4"},  {{"a1"}, "$5"},         {{"a2"}, "$6"},
+        {{"a3"}, "$7"},  {{"a4"}, "$8"},         {{"a5"}, "$9"},
+        {{"a6"}, "$10"}, {{"a7"}, "$11"},        {{"t0"}, "$12"},
+        {{"t1"}, "$13"}, {{"t2"}, "$14"},        {{"t3"}, "$15"},
+        {{"s0"}, "$16"}, {{"s1"}, "$17"},        {{"s2"}, "$18"},
+        {{"s3"}, "$19"}, {{"s4"}, "$20"},        {{"s5"}, "$21"},
+        {{"s6"}, "$22"}, {{"s7"}, "$23"},        {{"t8"}, "$24"},
+        {{"t9"}, "$25"}, {{"k0"}, "$26"},        {{"k1"}, "$27"},
+        {{"gp"}, "$28"}, {{"sp", "$sp"}, "$29"}, {{"fp", "$fp"}, "$30"},
+        {{"ra"}, "$31"}};
+    if (ABI == "o32")
+      return llvm::makeArrayRef(O32RegAliases);
+    return llvm::makeArrayRef(NewABIRegAliases);
+  }
+
+  bool hasInt128Type() const override {
+    return ABI == "n32" || ABI == "n64";
+  }
+
+  // Reject ABI/CPU and ABI/triple combinations that are not supported.
+  //
+  // Emits a diagnostic through Diags and returns false for the first
+  // unsupported combination found; returns true when none is found.
+  bool validateTarget(DiagnosticsEngine &Diags) const override {
+    const bool IsO32 = ABI == "o32";
+    const bool IsNewABI = ABI == "n32" || ABI == "n64";
+    const bool HasGPR64 = processorSupportsGPR64();
+    const llvm::Triple::ArchType Arch = getTriple().getArch();
+    const bool Is64BitArch =
+        Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el;
+    const bool Is32BitArch =
+        Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel;
+
+    // FIXME: It's valid to use O32 on a 64-bit CPU but the backend can't handle
+    //        this yet. It's better to fail here than on the backend assertion.
+    if (HasGPR64 && IsO32) {
+      Diags.Report(diag::err_target_unsupported_abi) << ABI << CPU;
+      return false;
+    }
+
+    // 64-bit ABIs require 64-bit CPUs.
+    if (!HasGPR64 && IsNewABI) {
+      Diags.Report(diag::err_target_unsupported_abi) << ABI << CPU;
+      return false;
+    }
+
+    // FIXME: It's valid to use O32 on a mips64/mips64el triple, and N32/N64 on
+    //        a mips/mipsel triple, but the backend can't handle either yet.
+    //        It's better to fail here than on the backend assertion.
+    if ((Is64BitArch && IsO32) || (Is32BitArch && IsNewABI)) {
+      Diags.Report(diag::err_target_unsupported_abi_for_triple)
+          << ABI << getTriple().str();
+      return false;
+    }
+
+    return true;
+  }
 };
 
-const Builtin::Info MipsTargetInfoBase::BuiltinInfo[] = {
+const Builtin::Info MipsTargetInfo::BuiltinInfo[] = {
 #define BUILTIN(ID, TYPE, ATTRS) \
   { #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
 #define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
@@ -6857,294 +7683,11 @@
 #include "clang/Basic/BuiltinsMips.def"
 };
 
-class Mips32TargetInfoBase : public MipsTargetInfoBase {
-public:
-  Mips32TargetInfoBase(const llvm::Triple &Triple)
-      : MipsTargetInfoBase(Triple, "o32", "mips32r2") {
-    SizeType = UnsignedInt;
-    PtrDiffType = SignedInt;
-    Int64Type = SignedLongLong;
-    IntMaxType = Int64Type;
-    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
-  }
-  bool setABI(const std::string &Name) override {
-    if (Name == "o32" || Name == "eabi") {
-      ABI = Name;
-      return true;
-    }
-    return false;
-  }
-  void getTargetDefines(const LangOptions &Opts,
-                        MacroBuilder &Builder) const override {
-    MipsTargetInfoBase::getTargetDefines(Opts, Builder);
-
-    Builder.defineMacro("__mips", "32");
-    Builder.defineMacro("_MIPS_ISA", "_MIPS_ISA_MIPS32");
-
-    const std::string& CPUStr = getCPU();
-    if (CPUStr == "mips32")
-      Builder.defineMacro("__mips_isa_rev", "1");
-    else if (CPUStr == "mips32r2")
-      Builder.defineMacro("__mips_isa_rev", "2");
-    else if (CPUStr == "mips32r3")
-      Builder.defineMacro("__mips_isa_rev", "3");
-    else if (CPUStr == "mips32r5")
-      Builder.defineMacro("__mips_isa_rev", "5");
-    else if (CPUStr == "mips32r6")
-      Builder.defineMacro("__mips_isa_rev", "6");
-
-    if (ABI == "o32") {
-      Builder.defineMacro("__mips_o32");
-      Builder.defineMacro("_ABIO32", "1");
-      Builder.defineMacro("_MIPS_SIM", "_ABIO32");
-    }
-    else if (ABI == "eabi")
-      Builder.defineMacro("__mips_eabi");
-    else
-      llvm_unreachable("Invalid ABI for Mips32.");
-  }
-  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
-    static const TargetInfo::GCCRegAlias GCCRegAliases[] = {
-      { { "at" },  "$1" },
-      { { "v0" },  "$2" },
-      { { "v1" },  "$3" },
-      { { "a0" },  "$4" },
-      { { "a1" },  "$5" },
-      { { "a2" },  "$6" },
-      { { "a3" },  "$7" },
-      { { "t0" },  "$8" },
-      { { "t1" },  "$9" },
-      { { "t2" }, "$10" },
-      { { "t3" }, "$11" },
-      { { "t4" }, "$12" },
-      { { "t5" }, "$13" },
-      { { "t6" }, "$14" },
-      { { "t7" }, "$15" },
-      { { "s0" }, "$16" },
-      { { "s1" }, "$17" },
-      { { "s2" }, "$18" },
-      { { "s3" }, "$19" },
-      { { "s4" }, "$20" },
-      { { "s5" }, "$21" },
-      { { "s6" }, "$22" },
-      { { "s7" }, "$23" },
-      { { "t8" }, "$24" },
-      { { "t9" }, "$25" },
-      { { "k0" }, "$26" },
-      { { "k1" }, "$27" },
-      { { "gp" }, "$28" },
-      { { "sp","$sp" }, "$29" },
-      { { "fp","$fp" }, "$30" },
-      { { "ra" }, "$31" }
-    };
-    return llvm::makeArrayRef(GCCRegAliases);
-  }
-};
-
-class Mips32EBTargetInfo : public Mips32TargetInfoBase {
-  void setDataLayoutString() override {
-    DataLayoutString = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64";
-  }
-
-public:
-  Mips32EBTargetInfo(const llvm::Triple &Triple)
-      : Mips32TargetInfoBase(Triple) {
-  }
-  void getTargetDefines(const LangOptions &Opts,
-                        MacroBuilder &Builder) const override {
-    DefineStd(Builder, "MIPSEB", Opts);
-    Builder.defineMacro("_MIPSEB");
-    Mips32TargetInfoBase::getTargetDefines(Opts, Builder);
-  }
-};
-
-class Mips32ELTargetInfo : public Mips32TargetInfoBase {
-  void setDataLayoutString() override {
-    DataLayoutString = "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64";
-  }
-
-public:
-  Mips32ELTargetInfo(const llvm::Triple &Triple)
-      : Mips32TargetInfoBase(Triple) {
-    BigEndian = false;
-  }
-  void getTargetDefines(const LangOptions &Opts,
-                        MacroBuilder &Builder) const override {
-    DefineStd(Builder, "MIPSEL", Opts);
-    Builder.defineMacro("_MIPSEL");
-    Mips32TargetInfoBase::getTargetDefines(Opts, Builder);
-  }
-};
-
-class Mips64TargetInfoBase : public MipsTargetInfoBase {
-public:
-  Mips64TargetInfoBase(const llvm::Triple &Triple)
-      : MipsTargetInfoBase(Triple, "n64", "mips64r2") {
-    LongDoubleWidth = LongDoubleAlign = 128;
-    LongDoubleFormat = &llvm::APFloat::IEEEquad;
-    if (getTriple().getOS() == llvm::Triple::FreeBSD) {
-      LongDoubleWidth = LongDoubleAlign = 64;
-      LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-    }
-    setN64ABITypes();
-    SuitableAlign = 128;
-    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
-  }
-
-  void setN64ABITypes() {
-    LongWidth = LongAlign = 64;
-    PointerWidth = PointerAlign = 64;
-    SizeType = UnsignedLong;
-    PtrDiffType = SignedLong;
-    Int64Type = SignedLong;
-    IntMaxType = Int64Type;
-  }
-
-  void setN32ABITypes() {
-    LongWidth = LongAlign = 32;
-    PointerWidth = PointerAlign = 32;
-    SizeType = UnsignedInt;
-    PtrDiffType = SignedInt;
-    Int64Type = SignedLongLong;
-    IntMaxType = Int64Type;
-  }
-
-  bool setABI(const std::string &Name) override {
-    if (Name == "n32") {
-      setN32ABITypes();
-      ABI = Name;
-      return true;
-    }
-    if (Name == "n64") {
-      setN64ABITypes();
-      ABI = Name;
-      return true;
-    }
-    return false;
-  }
-
-  void getTargetDefines(const LangOptions &Opts,
-                        MacroBuilder &Builder) const override {
-    MipsTargetInfoBase::getTargetDefines(Opts, Builder);
-
-    Builder.defineMacro("__mips", "64");
-    Builder.defineMacro("__mips64");
-    Builder.defineMacro("__mips64__");
-    Builder.defineMacro("_MIPS_ISA", "_MIPS_ISA_MIPS64");
-
-    const std::string& CPUStr = getCPU();
-    if (CPUStr == "mips64")
-      Builder.defineMacro("__mips_isa_rev", "1");
-    else if (CPUStr == "mips64r2")
-      Builder.defineMacro("__mips_isa_rev", "2");
-    else if (CPUStr == "mips64r3")
-      Builder.defineMacro("__mips_isa_rev", "3");
-    else if (CPUStr == "mips64r5")
-      Builder.defineMacro("__mips_isa_rev", "5");
-    else if (CPUStr == "mips64r6")
-      Builder.defineMacro("__mips_isa_rev", "6");
-
-    if (ABI == "n32") {
-      Builder.defineMacro("__mips_n32");
-      Builder.defineMacro("_ABIN32", "2");
-      Builder.defineMacro("_MIPS_SIM", "_ABIN32");
-    }
-    else if (ABI == "n64") {
-      Builder.defineMacro("__mips_n64");
-      Builder.defineMacro("_ABI64", "3");
-      Builder.defineMacro("_MIPS_SIM", "_ABI64");
-    }
-    else
-      llvm_unreachable("Invalid ABI for Mips64.");
-
-    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
-  }
-  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
-    static const TargetInfo::GCCRegAlias GCCRegAliases[] = {
-      { { "at" },  "$1" },
-      { { "v0" },  "$2" },
-      { { "v1" },  "$3" },
-      { { "a0" },  "$4" },
-      { { "a1" },  "$5" },
-      { { "a2" },  "$6" },
-      { { "a3" },  "$7" },
-      { { "a4" },  "$8" },
-      { { "a5" },  "$9" },
-      { { "a6" }, "$10" },
-      { { "a7" }, "$11" },
-      { { "t0" }, "$12" },
-      { { "t1" }, "$13" },
-      { { "t2" }, "$14" },
-      { { "t3" }, "$15" },
-      { { "s0" }, "$16" },
-      { { "s1" }, "$17" },
-      { { "s2" }, "$18" },
-      { { "s3" }, "$19" },
-      { { "s4" }, "$20" },
-      { { "s5" }, "$21" },
-      { { "s6" }, "$22" },
-      { { "s7" }, "$23" },
-      { { "t8" }, "$24" },
-      { { "t9" }, "$25" },
-      { { "k0" }, "$26" },
-      { { "k1" }, "$27" },
-      { { "gp" }, "$28" },
-      { { "sp","$sp" }, "$29" },
-      { { "fp","$fp" }, "$30" },
-      { { "ra" }, "$31" }
-    };
-    return llvm::makeArrayRef(GCCRegAliases);
-  }
-
-  bool hasInt128Type() const override { return true; }
-};
-
-class Mips64EBTargetInfo : public Mips64TargetInfoBase {
-  void setDataLayoutString() override {
-    if (ABI == "n32")
-      DataLayoutString = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128";
-    else
-      DataLayoutString = "E-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128";
-
-  }
-
-public:
-  Mips64EBTargetInfo(const llvm::Triple &Triple)
-      : Mips64TargetInfoBase(Triple) {}
-  void getTargetDefines(const LangOptions &Opts,
-                        MacroBuilder &Builder) const override {
-    DefineStd(Builder, "MIPSEB", Opts);
-    Builder.defineMacro("_MIPSEB");
-    Mips64TargetInfoBase::getTargetDefines(Opts, Builder);
-  }
-};
-
-class Mips64ELTargetInfo : public Mips64TargetInfoBase {
-  void setDataLayoutString() override {
-    if (ABI == "n32")
-      DataLayoutString = "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128";
-    else
-      DataLayoutString = "e-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128";
-  }
-public:
-  Mips64ELTargetInfo(const llvm::Triple &Triple)
-      : Mips64TargetInfoBase(Triple) {
-    // Default ABI is n64.
-    BigEndian = false;
-  }
-  void getTargetDefines(const LangOptions &Opts,
-                        MacroBuilder &Builder) const override {
-    DefineStd(Builder, "MIPSEL", Opts);
-    Builder.defineMacro("_MIPSEL");
-    Mips64TargetInfoBase::getTargetDefines(Opts, Builder);
-  }
-};
-
 class PNaClTargetInfo : public TargetInfo {
 public:
-  PNaClTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  PNaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : TargetInfo(Triple) {
     BigEndian = false;
-    this->UserLabelPrefix = "";
     this->LongAlign = 32;
     this->LongWidth = 32;
     this->PointerAlign = 32;
@@ -7196,11 +7739,10 @@
 }
 
 // We attempt to use PNaCl (le32) frontend and Mips32EL backend.
-class NaClMips32ELTargetInfo : public Mips32ELTargetInfo {
+class NaClMips32TargetInfo : public MipsTargetInfo {
 public:
-  NaClMips32ELTargetInfo(const llvm::Triple &Triple) :
-    Mips32ELTargetInfo(Triple) {
-  }
+  NaClMips32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : MipsTargetInfo(Triple, Opts) {}
 
   BuiltinVaListKind getBuiltinVaListKind() const override {
     return TargetInfo::PNaClABIBuiltinVaList;
@@ -7211,12 +7753,13 @@
   static const Builtin::Info BuiltinInfo[];
 
 public:
-  Le64TargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  Le64TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     BigEndian = false;
     NoAsmVariants = true;
     LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
-    DataLayoutString = "e-m:e-v128:32-v16:16-v32:32-v96:32-n8:16:32:64-S128";
+    resetDataLayout("e-m:e-v128:32-v16:16-v32:32-v96:32-n8:16:32:64-S128");
   }
 
   void getTargetDefines(const LangOptions &Opts,
@@ -7256,7 +7799,7 @@
   } SIMDLevel;
 
 public:
-  explicit WebAssemblyTargetInfo(const llvm::Triple &T)
+  explicit WebAssemblyTargetInfo(const llvm::Triple &T, const TargetOptions &)
       : TargetInfo(T), SIMDLevel(NoSIMD) {
     BigEndian = false;
     NoAsmVariants = true;
@@ -7362,10 +7905,11 @@
 
 class WebAssembly32TargetInfo : public WebAssemblyTargetInfo {
 public:
-  explicit WebAssembly32TargetInfo(const llvm::Triple &T)
-      : WebAssemblyTargetInfo(T) {
+  explicit WebAssembly32TargetInfo(const llvm::Triple &T,
+                                   const TargetOptions &Opts)
+      : WebAssemblyTargetInfo(T, Opts) {
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
-    DataLayoutString = "e-m:e-p:32:32-i64:64-n32:64-S128";
+    resetDataLayout("e-m:e-p:32:32-i64:64-n32:64-S128");
   }
 
 protected:
@@ -7378,12 +7922,13 @@
 
 class WebAssembly64TargetInfo : public WebAssemblyTargetInfo {
 public:
-  explicit WebAssembly64TargetInfo(const llvm::Triple &T)
-      : WebAssemblyTargetInfo(T) {
+  explicit WebAssembly64TargetInfo(const llvm::Triple &T,
+                                   const TargetOptions &Opts)
+      : WebAssemblyTargetInfo(T, Opts) {
     LongAlign = LongWidth = 64;
     PointerAlign = PointerWidth = 64;
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
-    DataLayoutString = "e-m:e-p:64:64-i64:64-n32:64-S128";
+    resetDataLayout("e-m:e-p:64:64-i64:64-n32:64-S128");
   }
 
 protected:
@@ -7411,7 +7956,8 @@
 };
 class SPIRTargetInfo : public TargetInfo {
 public:
-  SPIRTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  SPIRTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     assert(getTriple().getOS() == llvm::Triple::UnknownOS &&
            "SPIR target must use unknown OS");
     assert(getTriple().getEnvironment() == llvm::Triple::UnknownEnvironment &&
@@ -7448,23 +7994,30 @@
   }
 
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
-    return (CC == CC_SpirFunction || CC == CC_SpirKernel) ? CCCR_OK
-                                                          : CCCR_Warning;
+    return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK
+                                                            : CCCR_Warning;
   }
 
   CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
     return CC_SpirFunction;
   }
+
+  void setSupportedOpenCLOpts() override {
+    // Assume all OpenCL extensions and optional core features are supported
+    // for SPIR since it is a generic target.
+    getSupportedOpenCLOpts().setAll();
+  }
 };
 
 class SPIR32TargetInfo : public SPIRTargetInfo {
 public:
-  SPIR32TargetInfo(const llvm::Triple &Triple) : SPIRTargetInfo(Triple) {
+  SPIR32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : SPIRTargetInfo(Triple, Opts) {
     PointerWidth = PointerAlign = 32;
     SizeType = TargetInfo::UnsignedInt;
     PtrDiffType = IntPtrType = TargetInfo::SignedInt;
-    DataLayoutString = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
-                       "v96:128-v192:256-v256:256-v512:512-v1024:1024";
+    resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
+                    "v96:128-v192:256-v256:256-v512:512-v1024:1024");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -7474,12 +8027,13 @@
 
 class SPIR64TargetInfo : public SPIRTargetInfo {
 public:
-  SPIR64TargetInfo(const llvm::Triple &Triple) : SPIRTargetInfo(Triple) {
+  SPIR64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : SPIRTargetInfo(Triple, Opts) {
     PointerWidth = PointerAlign = 64;
     SizeType = TargetInfo::UnsignedLong;
     PtrDiffType = IntPtrType = TargetInfo::SignedLong;
-    DataLayoutString = "e-i64:64-v16:16-v24:32-v32:32-v48:64-"
-                       "v96:128-v192:256-v256:256-v512:512-v1024:1024";
+    resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
+                    "v96:128-v192:256-v256:256-v512:512-v1024:1024");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -7490,7 +8044,8 @@
 class XCoreTargetInfo : public TargetInfo {
   static const Builtin::Info BuiltinInfo[];
 public:
-  XCoreTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
+  XCoreTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
     BigEndian = false;
     NoAsmVariants = true;
     LongLongAlign = 32;
@@ -7502,8 +8057,8 @@
     WCharType = UnsignedChar;
     WIntType = UnsignedInt;
     UseZeroLengthBitfieldAlignment = true;
-    DataLayoutString = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32"
-                       "-f64:32-a:0:32-n32";
+    resetDataLayout("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32"
+                    "-f64:32-a:0:32-n32");
   }
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override {
@@ -7537,6 +8092,9 @@
     // R0=ExceptionPointerRegister R1=ExceptionSelectorRegister
     return (RegNo < 2)? RegNo : -1;
   }
+  bool allowsLargerPreferedTypeAlignment() const override {
+    return false;
+  }
 };
 
 const Builtin::Info XCoreTargetInfo::BuiltinInfo[] = {
@@ -7550,8 +8108,8 @@
 // x86_32 Android target
 class AndroidX86_32TargetInfo : public LinuxTargetInfo<X86_32TargetInfo> {
 public:
-  AndroidX86_32TargetInfo(const llvm::Triple &Triple)
-      : LinuxTargetInfo<X86_32TargetInfo>(Triple) {
+  AndroidX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : LinuxTargetInfo<X86_32TargetInfo>(Triple, Opts) {
     SuitableAlign = 32;
     LongDoubleWidth = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
@@ -7561,8 +8119,8 @@
 // x86_64 Android target
 class AndroidX86_64TargetInfo : public LinuxTargetInfo<X86_64TargetInfo> {
 public:
-  AndroidX86_64TargetInfo(const llvm::Triple &Triple)
-      : LinuxTargetInfo<X86_64TargetInfo>(Triple) {
+  AndroidX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : LinuxTargetInfo<X86_64TargetInfo>(Triple, Opts) {
     LongDoubleFormat = &llvm::APFloat::IEEEquad;
   }
 
@@ -7570,13 +8128,53 @@
     return true;
   }
 };
+
+// 32-bit RenderScript is armv7 with 'long' width and alignment set to 8 bytes
+class RenderScript32TargetInfo : public ARMleTargetInfo {
+public:
+  RenderScript32TargetInfo(const llvm::Triple &Triple,
+                           const TargetOptions &Opts)
+      : ARMleTargetInfo(llvm::Triple("armv7", Triple.getVendorName(),
+                                     Triple.getOSName(),
+                                     Triple.getEnvironmentName()),
+                        Opts) {
+    IsRenderScriptTarget = true;
+    LongWidth = LongAlign = 64;
+  }
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override {
+    Builder.defineMacro("__RENDERSCRIPT__");
+    ARMleTargetInfo::getTargetDefines(Opts, Builder);
+  }
+};
+
+// 64-bit RenderScript is aarch64
+class RenderScript64TargetInfo : public AArch64leTargetInfo {
+public:
+  RenderScript64TargetInfo(const llvm::Triple &Triple,
+                           const TargetOptions &Opts)
+      : AArch64leTargetInfo(llvm::Triple("aarch64", Triple.getVendorName(),
+                                         Triple.getOSName(),
+                                         Triple.getEnvironmentName()),
+                            Opts) {
+    IsRenderScriptTarget = true;
+  }
+
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override {
+    Builder.defineMacro("__RENDERSCRIPT__");
+    AArch64leTargetInfo::getTargetDefines(Opts, Builder);
+  }
+};
+
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Driver code
 //===----------------------------------------------------------------------===//
 
-static TargetInfo *AllocateTarget(const llvm::Triple &Triple) {
+static TargetInfo *AllocateTarget(const llvm::Triple &Triple,
+                                  const TargetOptions &Opts) {
   llvm::Triple::OSType os = Triple.getOS();
 
   switch (Triple.getArch()) {
@@ -7584,414 +8182,424 @@
     return nullptr;
 
   case llvm::Triple::xcore:
-    return new XCoreTargetInfo(Triple);
+    return new XCoreTargetInfo(Triple, Opts);
 
   case llvm::Triple::hexagon:
-    return new HexagonTargetInfo(Triple);
+    return new HexagonTargetInfo(Triple, Opts);
+
+  case llvm::Triple::lanai:
+    return new LanaiTargetInfo(Triple, Opts);
 
   case llvm::Triple::aarch64:
     if (Triple.isOSDarwin())
-      return new DarwinAArch64TargetInfo(Triple);
+      return new DarwinAArch64TargetInfo(Triple, Opts);
 
     switch (os) {
     case llvm::Triple::CloudABI:
-      return new CloudABITargetInfo<AArch64leTargetInfo>(Triple);
+      return new CloudABITargetInfo<AArch64leTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<AArch64leTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<AArch64leTargetInfo>(Triple);
+      return new LinuxTargetInfo<AArch64leTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<AArch64leTargetInfo>(Triple);
+      return new NetBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
     default:
-      return new AArch64leTargetInfo(Triple);
+      return new AArch64leTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::aarch64_be:
     switch (os) {
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<AArch64beTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<AArch64beTargetInfo>(Triple, Opts);
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<AArch64beTargetInfo>(Triple);
+      return new LinuxTargetInfo<AArch64beTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<AArch64beTargetInfo>(Triple);
+      return new NetBSDTargetInfo<AArch64beTargetInfo>(Triple, Opts);
     default:
-      return new AArch64beTargetInfo(Triple);
+      return new AArch64beTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::arm:
   case llvm::Triple::thumb:
     if (Triple.isOSBinFormatMachO())
-      return new DarwinARMTargetInfo(Triple);
+      return new DarwinARMTargetInfo(Triple, Opts);
 
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<ARMleTargetInfo>(Triple);
+      return new LinuxTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<ARMleTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<ARMleTargetInfo>(Triple);
+      return new NetBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<ARMleTargetInfo>(Triple);
+      return new OpenBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::Bitrig:
-      return new BitrigTargetInfo<ARMleTargetInfo>(Triple);
+      return new BitrigTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<ARMleTargetInfo>(Triple);
+      return new RTEMSTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::NaCl:
-      return new NaClTargetInfo<ARMleTargetInfo>(Triple);
+      return new NaClTargetInfo<ARMleTargetInfo>(Triple, Opts);
     case llvm::Triple::Win32:
       switch (Triple.getEnvironment()) {
       case llvm::Triple::Cygnus:
-        return new CygwinARMTargetInfo(Triple);
+        return new CygwinARMTargetInfo(Triple, Opts);
       case llvm::Triple::GNU:
-        return new MinGWARMTargetInfo(Triple);
+        return new MinGWARMTargetInfo(Triple, Opts);
       case llvm::Triple::Itanium:
-        return new ItaniumWindowsARMleTargetInfo(Triple);
+        return new ItaniumWindowsARMleTargetInfo(Triple, Opts);
       case llvm::Triple::MSVC:
       default: // Assume MSVC for unknown environments
-        return new MicrosoftARMleTargetInfo(Triple);
+        return new MicrosoftARMleTargetInfo(Triple, Opts);
       }
     default:
-      return new ARMleTargetInfo(Triple);
+      return new ARMleTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::armeb:
   case llvm::Triple::thumbeb:
     if (Triple.isOSDarwin())
-      return new DarwinARMTargetInfo(Triple);
+      return new DarwinARMTargetInfo(Triple, Opts);
 
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<ARMbeTargetInfo>(Triple);
+      return new LinuxTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<ARMbeTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<ARMbeTargetInfo>(Triple);
+      return new NetBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<ARMbeTargetInfo>(Triple);
+      return new OpenBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     case llvm::Triple::Bitrig:
-      return new BitrigTargetInfo<ARMbeTargetInfo>(Triple);
+      return new BitrigTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<ARMbeTargetInfo>(Triple);
+      return new RTEMSTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     case llvm::Triple::NaCl:
-      return new NaClTargetInfo<ARMbeTargetInfo>(Triple);
+      return new NaClTargetInfo<ARMbeTargetInfo>(Triple, Opts);
     default:
-      return new ARMbeTargetInfo(Triple);
+      return new ARMbeTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::bpfeb:
   case llvm::Triple::bpfel:
-    return new BPFTargetInfo(Triple);
+    return new BPFTargetInfo(Triple, Opts);
 
   case llvm::Triple::msp430:
-    return new MSP430TargetInfo(Triple);
+    return new MSP430TargetInfo(Triple, Opts);
 
   case llvm::Triple::mips:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<Mips32EBTargetInfo>(Triple);
+      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<Mips32EBTargetInfo>(Triple);
+      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<Mips32EBTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<Mips32EBTargetInfo>(Triple);
+      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     default:
-      return new Mips32EBTargetInfo(Triple);
+      return new MipsTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::mipsel:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<Mips32ELTargetInfo>(Triple);
+      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<Mips32ELTargetInfo>(Triple);
+      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<Mips32ELTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<Mips32ELTargetInfo>(Triple);
+      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::NaCl:
-      return new NaClTargetInfo<NaClMips32ELTargetInfo>(Triple);
+      return new NaClTargetInfo<NaClMips32TargetInfo>(Triple, Opts);
     default:
-      return new Mips32ELTargetInfo(Triple);
+      return new MipsTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::mips64:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<Mips64EBTargetInfo>(Triple);
+      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<Mips64EBTargetInfo>(Triple);
+      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<Mips64EBTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<Mips64EBTargetInfo>(Triple);
+      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<Mips64EBTargetInfo>(Triple);
+      return new OpenBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     default:
-      return new Mips64EBTargetInfo(Triple);
+      return new MipsTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::mips64el:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<Mips64ELTargetInfo>(Triple);
+      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<Mips64ELTargetInfo>(Triple);
+      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<Mips64ELTargetInfo>(Triple);
+      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<Mips64ELTargetInfo>(Triple);
+      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<Mips64ELTargetInfo>(Triple);
+      return new OpenBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
     default:
-      return new Mips64ELTargetInfo(Triple);
+      return new MipsTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::le32:
     switch (os) {
     case llvm::Triple::NaCl:
-      return new NaClTargetInfo<PNaClTargetInfo>(Triple);
+      return new NaClTargetInfo<PNaClTargetInfo>(Triple, Opts);
     default:
       return nullptr;
     }
 
   case llvm::Triple::le64:
-    return new Le64TargetInfo(Triple);
+    return new Le64TargetInfo(Triple, Opts);
 
   case llvm::Triple::ppc:
     if (Triple.isOSDarwin())
-      return new DarwinPPC32TargetInfo(Triple);
+      return new DarwinPPC32TargetInfo(Triple, Opts);
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<PPC32TargetInfo>(Triple);
+      return new LinuxTargetInfo<PPC32TargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<PPC32TargetInfo>(Triple);
+      return new FreeBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<PPC32TargetInfo>(Triple);
+      return new NetBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<PPC32TargetInfo>(Triple);
+      return new OpenBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<PPC32TargetInfo>(Triple);
+      return new RTEMSTargetInfo<PPC32TargetInfo>(Triple, Opts);
     default:
-      return new PPC32TargetInfo(Triple);
+      return new PPC32TargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::ppc64:
     if (Triple.isOSDarwin())
-      return new DarwinPPC64TargetInfo(Triple);
+      return new DarwinPPC64TargetInfo(Triple, Opts);
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<PPC64TargetInfo>(Triple);
+      return new LinuxTargetInfo<PPC64TargetInfo>(Triple, Opts);
     case llvm::Triple::Lv2:
-      return new PS3PPUTargetInfo<PPC64TargetInfo>(Triple);
+      return new PS3PPUTargetInfo<PPC64TargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<PPC64TargetInfo>(Triple);
+      return new FreeBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<PPC64TargetInfo>(Triple);
+      return new NetBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
     default:
-      return new PPC64TargetInfo(Triple);
+      return new PPC64TargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::ppc64le:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<PPC64TargetInfo>(Triple);
+      return new LinuxTargetInfo<PPC64TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<PPC64TargetInfo>(Triple);
+      return new NetBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
     default:
-      return new PPC64TargetInfo(Triple);
+      return new PPC64TargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::nvptx:
-    return new NVPTX32TargetInfo(Triple);
+    return new NVPTX32TargetInfo(Triple, Opts);
   case llvm::Triple::nvptx64:
-    return new NVPTX64TargetInfo(Triple);
+    return new NVPTX64TargetInfo(Triple, Opts);
 
   case llvm::Triple::amdgcn:
   case llvm::Triple::r600:
-    return new AMDGPUTargetInfo(Triple);
+    return new AMDGPUTargetInfo(Triple, Opts);
 
   case llvm::Triple::sparc:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<SparcV8TargetInfo>(Triple);
+      return new LinuxTargetInfo<SparcV8TargetInfo>(Triple, Opts);
     case llvm::Triple::Solaris:
-      return new SolarisTargetInfo<SparcV8TargetInfo>(Triple);
+      return new SolarisTargetInfo<SparcV8TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<SparcV8TargetInfo>(Triple);
+      return new NetBSDTargetInfo<SparcV8TargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<SparcV8TargetInfo>(Triple);
+      return new OpenBSDTargetInfo<SparcV8TargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<SparcV8TargetInfo>(Triple);
+      return new RTEMSTargetInfo<SparcV8TargetInfo>(Triple, Opts);
     default:
-      return new SparcV8TargetInfo(Triple);
+      return new SparcV8TargetInfo(Triple, Opts);
     }
 
   // The 'sparcel' architecture copies all the above cases except for Solaris.
   case llvm::Triple::sparcel:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<SparcV8elTargetInfo>(Triple);
+      return new LinuxTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<SparcV8elTargetInfo>(Triple);
+      return new NetBSDTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<SparcV8elTargetInfo>(Triple);
+      return new OpenBSDTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSTargetInfo<SparcV8elTargetInfo>(Triple);
+      return new RTEMSTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
     default:
-      return new SparcV8elTargetInfo(Triple);
+      return new SparcV8elTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::sparcv9:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<SparcV9TargetInfo>(Triple);
+      return new LinuxTargetInfo<SparcV9TargetInfo>(Triple, Opts);
     case llvm::Triple::Solaris:
-      return new SolarisTargetInfo<SparcV9TargetInfo>(Triple);
+      return new SolarisTargetInfo<SparcV9TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<SparcV9TargetInfo>(Triple);
+      return new NetBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDTargetInfo<SparcV9TargetInfo>(Triple);
+      return new OpenBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<SparcV9TargetInfo>(Triple);
+      return new FreeBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
     default:
-      return new SparcV9TargetInfo(Triple);
+      return new SparcV9TargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::systemz:
     switch (os) {
     case llvm::Triple::Linux:
-      return new LinuxTargetInfo<SystemZTargetInfo>(Triple);
+      return new LinuxTargetInfo<SystemZTargetInfo>(Triple, Opts);
     default:
-      return new SystemZTargetInfo(Triple);
+      return new SystemZTargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::tce:
-    return new TCETargetInfo(Triple);
+    return new TCETargetInfo(Triple, Opts);
 
   case llvm::Triple::x86:
     if (Triple.isOSDarwin())
-      return new DarwinI386TargetInfo(Triple);
+      return new DarwinI386TargetInfo(Triple, Opts);
 
     switch (os) {
     case llvm::Triple::CloudABI:
-      return new CloudABITargetInfo<X86_32TargetInfo>(Triple);
+      return new CloudABITargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::Linux: {
       switch (Triple.getEnvironment()) {
       default:
-        return new LinuxTargetInfo<X86_32TargetInfo>(Triple);
+        return new LinuxTargetInfo<X86_32TargetInfo>(Triple, Opts);
       case llvm::Triple::Android:
-        return new AndroidX86_32TargetInfo(Triple);
+        return new AndroidX86_32TargetInfo(Triple, Opts);
       }
     }
     case llvm::Triple::DragonFly:
-      return new DragonFlyBSDTargetInfo<X86_32TargetInfo>(Triple);
+      return new DragonFlyBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDI386TargetInfo(Triple);
+      return new NetBSDI386TargetInfo(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDI386TargetInfo(Triple);
+      return new OpenBSDI386TargetInfo(Triple, Opts);
     case llvm::Triple::Bitrig:
-      return new BitrigI386TargetInfo(Triple);
+      return new BitrigI386TargetInfo(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<X86_32TargetInfo>(Triple);
+      return new FreeBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::KFreeBSD:
-      return new KFreeBSDTargetInfo<X86_32TargetInfo>(Triple);
+      return new KFreeBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::Minix:
-      return new MinixTargetInfo<X86_32TargetInfo>(Triple);
+      return new MinixTargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::Solaris:
-      return new SolarisTargetInfo<X86_32TargetInfo>(Triple);
+      return new SolarisTargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::Win32: {
       switch (Triple.getEnvironment()) {
       case llvm::Triple::Cygnus:
-        return new CygwinX86_32TargetInfo(Triple);
+        return new CygwinX86_32TargetInfo(Triple, Opts);
       case llvm::Triple::GNU:
-        return new MinGWX86_32TargetInfo(Triple);
+        return new MinGWX86_32TargetInfo(Triple, Opts);
       case llvm::Triple::Itanium:
       case llvm::Triple::MSVC:
       default: // Assume MSVC for unknown environments
-        return new MicrosoftX86_32TargetInfo(Triple);
+        return new MicrosoftX86_32TargetInfo(Triple, Opts);
       }
     }
     case llvm::Triple::Haiku:
-      return new HaikuX86_32TargetInfo(Triple);
+      return new HaikuX86_32TargetInfo(Triple, Opts);
     case llvm::Triple::RTEMS:
-      return new RTEMSX86_32TargetInfo(Triple);
+      return new RTEMSX86_32TargetInfo(Triple, Opts);
     case llvm::Triple::NaCl:
-      return new NaClTargetInfo<X86_32TargetInfo>(Triple);
+      return new NaClTargetInfo<X86_32TargetInfo>(Triple, Opts);
     case llvm::Triple::ELFIAMCU:
-      return new MCUX86_32TargetInfo(Triple);
+      return new MCUX86_32TargetInfo(Triple, Opts);
     default:
-      return new X86_32TargetInfo(Triple);
+      return new X86_32TargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::x86_64:
     if (Triple.isOSDarwin() || Triple.isOSBinFormatMachO())
-      return new DarwinX86_64TargetInfo(Triple);
+      return new DarwinX86_64TargetInfo(Triple, Opts);
 
     switch (os) {
     case llvm::Triple::CloudABI:
-      return new CloudABITargetInfo<X86_64TargetInfo>(Triple);
+      return new CloudABITargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::Linux: {
       switch (Triple.getEnvironment()) {
       default:
-        return new LinuxTargetInfo<X86_64TargetInfo>(Triple);
+        return new LinuxTargetInfo<X86_64TargetInfo>(Triple, Opts);
       case llvm::Triple::Android:
-        return new AndroidX86_64TargetInfo(Triple);
+        return new AndroidX86_64TargetInfo(Triple, Opts);
       }
     }
     case llvm::Triple::DragonFly:
-      return new DragonFlyBSDTargetInfo<X86_64TargetInfo>(Triple);
+      return new DragonFlyBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::NetBSD:
-      return new NetBSDTargetInfo<X86_64TargetInfo>(Triple);
+      return new NetBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::OpenBSD:
-      return new OpenBSDX86_64TargetInfo(Triple);
+      return new OpenBSDX86_64TargetInfo(Triple, Opts);
     case llvm::Triple::Bitrig:
-      return new BitrigX86_64TargetInfo(Triple);
+      return new BitrigX86_64TargetInfo(Triple, Opts);
     case llvm::Triple::FreeBSD:
-      return new FreeBSDTargetInfo<X86_64TargetInfo>(Triple);
+      return new FreeBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::KFreeBSD:
-      return new KFreeBSDTargetInfo<X86_64TargetInfo>(Triple);
+      return new KFreeBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::Solaris:
-      return new SolarisTargetInfo<X86_64TargetInfo>(Triple);
+      return new SolarisTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::Win32: {
       switch (Triple.getEnvironment()) {
       case llvm::Triple::Cygnus:
-        return new CygwinX86_64TargetInfo(Triple);
+        return new CygwinX86_64TargetInfo(Triple, Opts);
       case llvm::Triple::GNU:
-        return new MinGWX86_64TargetInfo(Triple);
+        return new MinGWX86_64TargetInfo(Triple, Opts);
       case llvm::Triple::MSVC:
       default: // Assume MSVC for unknown environments
-        return new MicrosoftX86_64TargetInfo(Triple);
+        return new MicrosoftX86_64TargetInfo(Triple, Opts);
       }
     }
+    case llvm::Triple::Haiku:
+      return new HaikuTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::NaCl:
-      return new NaClTargetInfo<X86_64TargetInfo>(Triple);
+      return new NaClTargetInfo<X86_64TargetInfo>(Triple, Opts);
     case llvm::Triple::PS4:
-      return new PS4OSTargetInfo<X86_64TargetInfo>(Triple);
+      return new PS4OSTargetInfo<X86_64TargetInfo>(Triple, Opts);
     default:
-      return new X86_64TargetInfo(Triple);
+      return new X86_64TargetInfo(Triple, Opts);
     }
 
   case llvm::Triple::spir: {
     if (Triple.getOS() != llvm::Triple::UnknownOS ||
         Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
       return nullptr;
-    return new SPIR32TargetInfo(Triple);
+    return new SPIR32TargetInfo(Triple, Opts);
   }
   case llvm::Triple::spir64: {
     if (Triple.getOS() != llvm::Triple::UnknownOS ||
         Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
       return nullptr;
-    return new SPIR64TargetInfo(Triple);
+    return new SPIR64TargetInfo(Triple, Opts);
   }
   case llvm::Triple::wasm32:
     if (!(Triple == llvm::Triple("wasm32-unknown-unknown")))
       return nullptr;
-    return new WebAssemblyOSTargetInfo<WebAssembly32TargetInfo>(Triple);
+    return new WebAssemblyOSTargetInfo<WebAssembly32TargetInfo>(Triple, Opts);
   case llvm::Triple::wasm64:
     if (!(Triple == llvm::Triple("wasm64-unknown-unknown")))
       return nullptr;
-    return new WebAssemblyOSTargetInfo<WebAssembly64TargetInfo>(Triple);
+    return new WebAssemblyOSTargetInfo<WebAssembly64TargetInfo>(Triple, Opts);
+
+  case llvm::Triple::renderscript32:
+    return new LinuxTargetInfo<RenderScript32TargetInfo>(Triple, Opts);
+  case llvm::Triple::renderscript64:
+    return new LinuxTargetInfo<RenderScript64TargetInfo>(Triple, Opts);
   }
 }
 
@@ -8003,7 +8611,7 @@
   llvm::Triple Triple(Opts->Triple);
 
   // Construct the target
-  std::unique_ptr<TargetInfo> Target(AllocateTarget(Triple));
+  std::unique_ptr<TargetInfo> Target(AllocateTarget(Triple, *Opts));
   if (!Target) {
     Diags.Report(diag::err_target_unknown_triple) << Triple.str();
     return nullptr;
@@ -8043,5 +8651,10 @@
   if (!Target->handleTargetFeatures(Opts->Features, Diags))
     return nullptr;
 
+  Target->setSupportedOpenCLOpts();
+
+  if (!Target->validateTarget(Diags))
+    return nullptr;
+
   return Target.release();
 }
diff --git a/lib/Basic/VirtualFileSystem.cpp b/lib/Basic/VirtualFileSystem.cpp
index ba10186..a56c911 100644
--- a/lib/Basic/VirtualFileSystem.cpp
+++ b/lib/Basic/VirtualFileSystem.cpp
@@ -16,14 +16,16 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/YAMLParser.h"
-#include "llvm/Config/llvm-config.h"
 #include <atomic>
 #include <memory>
+#include <utility>
 
 // For chdir.
 #ifdef LLVM_ON_WIN32
@@ -138,16 +140,19 @@
 class RealFile : public File {
   int FD;
   Status S;
+  std::string RealName;
   friend class RealFileSystem;
-  RealFile(int FD, StringRef NewName)
+  RealFile(int FD, StringRef NewName, StringRef NewRealPathName)
       : FD(FD), S(NewName, {}, {}, {}, {}, {},
-                  llvm::sys::fs::file_type::status_error, {}) {
+                  llvm::sys::fs::file_type::status_error, {}),
+        RealName(NewRealPathName.str()) {
     assert(FD >= 0 && "Invalid or inactive file descriptor");
   }
 
 public:
   ~RealFile() override;
   ErrorOr<Status> status() override;
+  ErrorOr<std::string> getName() override;
   ErrorOr<std::unique_ptr<MemoryBuffer>> getBuffer(const Twine &Name,
                                                    int64_t FileSize,
                                                    bool RequiresNullTerminator,
@@ -168,6 +173,10 @@
   return S;
 }
 
+ErrorOr<std::string> RealFile::getName() {
+  return RealName.empty() ? S.getName().str() : RealName;
+}
+
 ErrorOr<std::unique_ptr<MemoryBuffer>>
 RealFile::getBuffer(const Twine &Name, int64_t FileSize,
                     bool RequiresNullTerminator, bool IsVolatile) {
@@ -176,21 +185,10 @@
                                    IsVolatile);
 }
 
-// FIXME: This is terrible, we need this for ::close.
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#include <sys/uio.h>
-#else
-#include <io.h>
-#ifndef S_ISFIFO
-#define S_ISFIFO(x) (0)
-#endif
-#endif
 std::error_code RealFile::close() {
-  if (::close(FD))
-    return std::error_code(errno, std::generic_category());
+  std::error_code EC = sys::Process::SafelyCloseFileDescriptor(FD);
   FD = -1;
-  return std::error_code();
+  return EC;
 }
 
 namespace {
@@ -216,9 +214,10 @@
 ErrorOr<std::unique_ptr<File>>
 RealFileSystem::openFileForRead(const Twine &Name) {
   int FD;
-  if (std::error_code EC = sys::fs::openFileForRead(Name, FD))
+  SmallString<256> RealName;
+  if (std::error_code EC = sys::fs::openFileForRead(Name, FD, &RealName))
     return EC;
-  return std::unique_ptr<File>(new RealFile(FD, Name.str()));
+  return std::unique_ptr<File>(new RealFile(FD, Name.str(), RealName.str()));
 }
 
 llvm::ErrorOr<std::string> RealFileSystem::getCurrentWorkingDirectory() const {
@@ -289,7 +288,7 @@
 // OverlayFileSystem implementation
 //===-----------------------------------------------------------------------===/
 OverlayFileSystem::OverlayFileSystem(IntrusiveRefCntPtr<FileSystem> BaseFS) {
-  FSList.push_back(BaseFS);
+  FSList.push_back(std::move(BaseFS));
 }
 
 void OverlayFileSystem::pushOverlay(IntrusiveRefCntPtr<FileSystem> FS) {
@@ -802,6 +801,7 @@
 ///   'case-sensitive': <boolean, default=true>
 ///   'use-external-names': <boolean, default=true>
 ///   'overlay-relative': <boolean, default=false>
+///   'ignore-non-existent-contents': <boolean, default=true>
 ///
 /// Virtual directories are represented as
 /// \verbatim
@@ -861,6 +861,14 @@
   /// \brief Whether to use to use the value of 'external-contents' for the
   /// names of files.  This global value is overridable on a per-file basis.
   bool UseExternalNames = true;
+
+  /// \brief Whether a path in 'external-contents' that does not exist should
+  /// be skipped instead of stopping iteration on the VFS. If 'true', the VFS
+  /// ignores the entry and continues with the next one. This allows YAML
+  /// files to be shared across multiple compiler invocations even if paths
+  /// listed in 'external-contents' no longer exist. This global value is
+  /// overridable on a per-file basis.
+  bool IgnoreNonExistentContents = true;
   /// @}
 
   /// Virtual file paths and external files could be canonicalized without "..",
@@ -877,7 +885,7 @@
 
 private:
   RedirectingFileSystem(IntrusiveRefCntPtr<FileSystem> ExternalFS)
-      : ExternalFS(ExternalFS) {}
+      : ExternalFS(std::move(ExternalFS)) {}
 
   /// \brief Looks up \p Path in \c Roots.
   ErrorOr<Entry *> lookupPath(const Twine &Path);
@@ -938,6 +946,10 @@
     return ExternalContentsPrefixDir;
   }
 
+  bool ignoreNonExistentContents() const {
+    return IgnoreNonExistentContents;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void dump() const {
     for (const std::unique_ptr<Entry> &Root : Roots)
@@ -1302,6 +1314,7 @@
       KeyStatusPair("case-sensitive", false),
       KeyStatusPair("use-external-names", false),
       KeyStatusPair("overlay-relative", false),
+      KeyStatusPair("ignore-non-existent-contents", false),
       KeyStatusPair("roots", true),
     };
 
@@ -1360,6 +1373,9 @@
       } else if (Key == "use-external-names") {
         if (!parseScalarBool(I->getValue(), FS->UseExternalNames))
           return false;
+      } else if (Key == "ignore-non-existent-contents") {
+        if (!parseScalarBool(I->getValue(), FS->IgnoreNonExistentContents))
+          return false;
       } else {
         llvm_unreachable("key missing from Keys");
       }
@@ -1404,7 +1420,7 @@
   RedirectingFileSystemParser P(Stream);
 
   std::unique_ptr<RedirectingFileSystem> FS(
-      new RedirectingFileSystem(ExternalFS));
+      new RedirectingFileSystem(std::move(ExternalFS)));
 
   if (!YAMLFilePath.empty()) {
     // Use the YAML path from -ivfsoverlay to compute the dir to be prefixed
@@ -1540,7 +1556,7 @@
 
 public:
   FileWithFixedStatus(std::unique_ptr<File> InnerFile, Status S)
-      : InnerFile(std::move(InnerFile)), S(S) {}
+      : InnerFile(std::move(InnerFile)), S(std::move(S)) {}
 
   ErrorOr<Status> status() override { return S; }
   ErrorOr<std::unique_ptr<llvm::MemoryBuffer>>
@@ -1585,7 +1601,8 @@
                     void *DiagContext,
                     IntrusiveRefCntPtr<FileSystem> ExternalFS) {
   return RedirectingFileSystem::create(std::move(Buffer), DiagHandler,
-                                       YAMLFilePath, DiagContext, ExternalFS);
+                                       YAMLFilePath, DiagContext,
+                                       std::move(ExternalFS));
 }
 
 UniqueID vfs::getNextVirtualUniqueID() {
@@ -1619,7 +1636,7 @@
   JSONWriter(llvm::raw_ostream &OS) : OS(OS) {}
   void write(ArrayRef<YAMLVFSEntry> Entries, Optional<bool> UseExternalNames,
              Optional<bool> IsCaseSensitive, Optional<bool> IsOverlayRelative,
-             StringRef OverlayDir);
+             Optional<bool> IgnoreNonExistentContents, StringRef OverlayDir);
 };
 }
 
@@ -1675,6 +1692,7 @@
                        Optional<bool> UseExternalNames,
                        Optional<bool> IsCaseSensitive,
                        Optional<bool> IsOverlayRelative,
+                       Optional<bool> IgnoreNonExistentContents,
                        StringRef OverlayDir) {
   using namespace llvm::sys;
 
@@ -1692,6 +1710,9 @@
     OS << "  'overlay-relative': '"
        << (UseOverlayRelative ? "true" : "false") << "',\n";
   }
+  if (IgnoreNonExistentContents.hasValue())
+    OS << "  'ignore-non-existent-contents': '"
+       << (IgnoreNonExistentContents.getValue() ? "true" : "false") << "',\n";
   OS << "  'roots': [\n";
 
   if (!Entries.empty()) {
@@ -1748,7 +1769,8 @@
   });
 
   JSONWriter(OS).write(Mappings, UseExternalNames, IsCaseSensitive,
-                       IsOverlayRelative, OverlayDir);
+                       IsOverlayRelative, IgnoreNonExistentContents,
+                       OverlayDir);
 }
 
 VFSFromYamlDirIterImpl::VFSFromYamlDirIterImpl(
@@ -1765,7 +1787,8 @@
       return;
     }
     // Skip entries which do not map to a reliable external content.
-    if (S.getError() == llvm::errc::no_such_file_or_directory) {
+    if (FS.ignoreNonExistentContents() &&
+        S.getError() == llvm::errc::no_such_file_or_directory) {
       ++Current;
       continue;
     } else {
@@ -1783,7 +1806,8 @@
     llvm::ErrorOr<vfs::Status> S = FS.status(PathStr);
     if (!S) {
       // Skip entries which do not map to a reliable external content.
-      if (S.getError() == llvm::errc::no_such_file_or_directory) {
+      if (FS.ignoreNonExistentContents() &&
+          S.getError() == llvm::errc::no_such_file_or_directory) {
         continue;
       } else {
         return S.getError();
diff --git a/lib/CodeGen/ABIInfo.h b/lib/CodeGen/ABIInfo.h
index bf46290..530a7ef 100644
--- a/lib/CodeGen/ABIInfo.h
+++ b/lib/CodeGen/ABIInfo.h
@@ -92,6 +92,8 @@
                                        CodeGen::Address VAListAddr,
                                        QualType Ty) const = 0;
 
+    bool isAndroid() const;
+
     /// Emit the target dependent code to load a value of
     /// \arg Ty from the \c __builtin_ms_va_list pointed to by \arg VAListAddr.
     virtual CodeGen::Address EmitMSVAArg(CodeGen::CodeGenFunction &CGF,
diff --git a/lib/CodeGen/Address.h b/lib/CodeGen/Address.h
index 9d145fa..3343080 100644
--- a/lib/CodeGen/Address.h
+++ b/lib/CodeGen/Address.h
@@ -104,23 +104,15 @@
 };
 
 }
+
+// Present a minimal LLVM-like casting interface.
+template <class U> inline U cast(CodeGen::Address addr) {
+  return U::castImpl(addr);
+}
+template <class U> inline bool isa(CodeGen::Address addr) {
+  return U::isaImpl(addr);
 }
 
-namespace llvm {
-  // Present a minimal LLVM-like casting interface.
-  template <class U> inline U cast(clang::CodeGen::Address addr) {
-    return U::castImpl(addr);
-  }
-  template <class U> inline bool isa(clang::CodeGen::Address addr) {
-    return U::isaImpl(addr);
-  }
-}
-
-namespace clang {
-  // Make our custom isa and cast available in namespace clang, to mirror
-  // what we do for LLVM's versions in Basic/LLVM.h.
-  using llvm::isa;
-  using llvm::cast;
 }
 
 #endif
diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp
index 8f4a812..80d043a 100644
--- a/lib/CodeGen/BackendUtil.cpp
+++ b/lib/CodeGen/BackendUtil.cpp
@@ -16,9 +16,11 @@
 #include "clang/Frontend/Utils.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/IR/DataLayout.h"
@@ -27,9 +29,11 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/LTO/LTOBackend.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Object/ModuleSummaryIndexObjectFile.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Timer.h"
@@ -42,6 +46,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include <memory>
 using namespace clang;
@@ -58,9 +63,7 @@
 
   Timer CodeGenerationTime;
 
-  mutable legacy::PassManager *CodeGenPasses;
-  mutable legacy::PassManager *PerModulePasses;
-  mutable legacy::FunctionPassManager *PerFunctionPasses;
+  std::unique_ptr<raw_pwrite_stream> OS;
 
 private:
   TargetIRAnalysis getTargetIRAnalysis() const {
@@ -70,70 +73,43 @@
     return TargetIRAnalysis();
   }
 
-  legacy::PassManager *getCodeGenPasses() const {
-    if (!CodeGenPasses) {
-      CodeGenPasses = new legacy::PassManager();
-      CodeGenPasses->add(
-          createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
-    }
-    return CodeGenPasses;
-  }
+  /// Set LLVM command line options passed through -backend-option.
+  void setCommandLineOpts();
 
-  legacy::PassManager *getPerModulePasses() const {
-    if (!PerModulePasses) {
-      PerModulePasses = new legacy::PassManager();
-      PerModulePasses->add(
-          createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
-    }
-    return PerModulePasses;
-  }
-
-  legacy::FunctionPassManager *getPerFunctionPasses() const {
-    if (!PerFunctionPasses) {
-      PerFunctionPasses = new legacy::FunctionPassManager(TheModule);
-      PerFunctionPasses->add(
-          createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
-    }
-    return PerFunctionPasses;
-  }
-
-  void CreatePasses(ModuleSummaryIndex *ModuleSummary);
+  void CreatePasses(legacy::PassManager &MPM, legacy::FunctionPassManager &FPM);
 
   /// Generates the TargetMachine.
-  /// Returns Null if it is unable to create the target machine.
+  /// Leaves TM unchanged if it is unable to create the target machine.
   /// Some of our clang tests specify triples which are not built
   /// into clang. This is okay because these tests check the generated
   /// IR, and they require DataLayout which depends on the triple.
   /// In this case, we allow this method to fail and not report an error.
   /// When MustCreateTM is used, we print an error if we are unable to load
   /// the requested target.
-  TargetMachine *CreateTargetMachine(bool MustCreateTM);
+  void CreateTargetMachine(bool MustCreateTM);
 
   /// Add passes necessary to emit assembly or LLVM IR.
   ///
   /// \return True on success.
-  bool AddEmitPasses(BackendAction Action, raw_pwrite_stream &OS);
+  bool AddEmitPasses(legacy::PassManager &CodeGenPasses, BackendAction Action,
+                     raw_pwrite_stream &OS);
 
 public:
   EmitAssemblyHelper(DiagnosticsEngine &_Diags, const CodeGenOptions &CGOpts,
                      const clang::TargetOptions &TOpts,
                      const LangOptions &LOpts, Module *M)
       : Diags(_Diags), CodeGenOpts(CGOpts), TargetOpts(TOpts), LangOpts(LOpts),
-        TheModule(M), CodeGenerationTime("Code Generation Time"),
-        CodeGenPasses(nullptr), PerModulePasses(nullptr),
-        PerFunctionPasses(nullptr) {}
+        TheModule(M), CodeGenerationTime("Code Generation Time") {}
 
   ~EmitAssemblyHelper() {
-    delete CodeGenPasses;
-    delete PerModulePasses;
-    delete PerFunctionPasses;
     if (CodeGenOpts.DisableFree)
       BuryPointer(std::move(TM));
   }
 
   std::unique_ptr<TargetMachine> TM;
 
-  void EmitAssembly(BackendAction Action, raw_pwrite_stream *OS);
+  void EmitAssembly(BackendAction Action,
+                    std::unique_ptr<raw_pwrite_stream> OS);
 };
 
 // We need this wrapper to access LangOpts and CGOpts from extension functions
@@ -172,8 +148,19 @@
   PM.add(createAddDiscriminatorsPass());
 }
 
+static void addCleanupPassesForSampleProfiler(
+    const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) {
+  // instcombine is needed before sample profile annotation because it converts
+  // certain function calls to be inlinable. simplifycfg and sroa are needed
+  // before instcombine for necessary preparation. E.g. load store is eliminated
+  // properly so that instcombine will not introduce unnecessary live ranges.
+  PM.add(createCFGSimplificationPass());
+  PM.add(createSROAPass());
+  PM.add(createInstructionCombiningPass());
+}
+
 static void addBoundsCheckingPass(const PassManagerBuilder &Builder,
-                                    legacy::PassManagerBase &PM) {
+                                  legacy::PassManagerBase &PM) {
   PM.add(createBoundsCheckingPass());
 }
 
@@ -189,6 +176,7 @@
   Opts.TraceBB = CGOpts.SanitizeCoverageTraceBB;
   Opts.TraceCmp = CGOpts.SanitizeCoverageTraceCmp;
   Opts.Use8bitCounters = CGOpts.SanitizeCoverage8bitCounters;
+  Opts.TracePC = CGOpts.SanitizeCoverageTracePC;
   PM.add(createSanitizerCoverageModulePass(Opts));
 }
 
@@ -198,14 +186,17 @@
       static_cast<const PassManagerBuilderWrapper&>(Builder);
   const CodeGenOptions &CGOpts = BuilderWrapper.getCGOpts();
   bool Recover = CGOpts.SanitizeRecover.has(SanitizerKind::Address);
-  PM.add(createAddressSanitizerFunctionPass(/*CompileKernel*/false, Recover));
+  bool UseAfterScope = CGOpts.SanitizeAddressUseAfterScope;
+  PM.add(createAddressSanitizerFunctionPass(/*CompileKernel*/ false, Recover,
+                                            UseAfterScope));
   PM.add(createAddressSanitizerModulePass(/*CompileKernel*/false, Recover));
 }
 
 static void addKernelAddressSanitizerPasses(const PassManagerBuilder &Builder,
                                             legacy::PassManagerBase &PM) {
-  PM.add(createAddressSanitizerFunctionPass(/*CompileKernel*/true,
-                                            /*Recover*/true));
+  PM.add(createAddressSanitizerFunctionPass(
+      /*CompileKernel*/ true,
+      /*Recover*/ true, /*UseAfterScope*/ false));
   PM.add(createAddressSanitizerModulePass(/*CompileKernel*/true,
                                           /*Recover*/true));
 }
@@ -243,6 +234,19 @@
   PM.add(createDataFlowSanitizerPass(LangOpts.SanitizerBlacklistFiles));
 }
 
+static void addEfficiencySanitizerPass(const PassManagerBuilder &Builder,
+                                       legacy::PassManagerBase &PM) {
+  const PassManagerBuilderWrapper &BuilderWrapper =
+      static_cast<const PassManagerBuilderWrapper&>(Builder);
+  const LangOptions &LangOpts = BuilderWrapper.getLangOpts();
+  EfficiencySanitizerOptions Opts;
+  if (LangOpts.Sanitize.has(SanitizerKind::EfficiencyCacheFrag))
+    Opts.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
+  else if (LangOpts.Sanitize.has(SanitizerKind::EfficiencyWorkingSet))
+    Opts.ToolType = EfficiencySanitizerOptions::ESAN_WorkingSet;
+  PM.add(createEfficiencySanitizerPass(Opts));
+}
+
 static TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple,
                                          const CodeGenOptions &CodeGenOpts) {
   TargetLibraryInfoImpl *TLII = new TargetLibraryInfoImpl(TargetTriple);
@@ -260,6 +264,9 @@
   case CodeGenOptions::Accelerate:
     TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::Accelerate);
     break;
+  case CodeGenOptions::SVML:
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML);
+    break;
   default:
     break;
   }
@@ -277,7 +284,8 @@
   MPM->add(createRewriteSymbolsPass(DL));
 }
 
-void EmitAssemblyHelper::CreatePasses(ModuleSummaryIndex *ModuleSummary) {
+void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
+                                      legacy::FunctionPassManager &FPM) {
   if (CodeGenOpts.DisableLLVMPasses)
     return;
 
@@ -300,7 +308,8 @@
   switch (Inlining) {
   case CodeGenOptions::NoInlining:
     break;
-  case CodeGenOptions::NormalInlining: {
+  case CodeGenOptions::NormalInlining:
+  case CodeGenOptions::OnlyHintInlining: {
     PMBuilder.Inliner =
         createFunctionInliningPass(OptLevel, CodeGenOpts.OptimizeSize);
     break;
@@ -321,22 +330,19 @@
   PMBuilder.SLPVectorize = CodeGenOpts.VectorizeSLP;
   PMBuilder.LoopVectorize = CodeGenOpts.VectorizeLoop;
 
-  PMBuilder.DisableUnitAtATime = !CodeGenOpts.UnitAtATime;
   PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops;
   PMBuilder.MergeFunctions = CodeGenOpts.MergeFunctions;
   PMBuilder.PrepareForThinLTO = CodeGenOpts.EmitSummaryIndex;
   PMBuilder.PrepareForLTO = CodeGenOpts.PrepareForLTO;
   PMBuilder.RerollLoops = CodeGenOpts.RerollLoops;
 
-  legacy::PassManager *MPM = getPerModulePasses();
-
-  // If we are performing a ThinLTO importing compile, invoke the LTO
-  // pipeline and pass down the in-memory module summary index.
-  if (ModuleSummary) {
-    PMBuilder.ModuleSummary = ModuleSummary;
-    PMBuilder.populateThinLTOPassManager(*MPM);
-    return;
-  }
+  // Add target-specific passes that need to run as early as possible.
+  if (TM)
+    PMBuilder.addExtension(
+        PassManagerBuilder::EP_EarlyAsPossible,
+        [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+          TM->addEarlyAsPossiblePasses(PM);
+        });
 
   PMBuilder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
                          addAddDiscriminatorsPass);
@@ -402,15 +408,20 @@
                            addDataFlowSanitizerPass);
   }
 
+  if (LangOpts.Sanitize.hasOneOf(SanitizerKind::Efficiency)) {
+    PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
+                           addEfficiencySanitizerPass);
+    PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
+                           addEfficiencySanitizerPass);
+  }
+
   // Set up the per-function pass manager.
-  legacy::FunctionPassManager *FPM = getPerFunctionPasses();
   if (CodeGenOpts.VerifyModule)
-    FPM->add(createVerifierPass());
-  PMBuilder.populateFunctionPassManager(*FPM);
+    FPM.add(createVerifierPass());
 
   // Set up the per-module pass manager.
   if (!CodeGenOpts.RewriteMapFiles.empty())
-    addSymbolRewriterPass(CodeGenOpts, MPM);
+    addSymbolRewriterPass(CodeGenOpts, &MPM);
 
   if (!CodeGenOpts.DisableGCov &&
       (CodeGenOpts.EmitGcovArcs || CodeGenOpts.EmitGcovNotes)) {
@@ -425,46 +436,39 @@
     Options.FunctionNamesInData =
         !CodeGenOpts.CoverageNoFunctionNamesInData;
     Options.ExitBlockBeforeBody = CodeGenOpts.CoverageExitBlockBeforeBody;
-    MPM->add(createGCOVProfilerPass(Options));
+    MPM.add(createGCOVProfilerPass(Options));
     if (CodeGenOpts.getDebugInfo() == codegenoptions::NoDebugInfo)
-      MPM->add(createStripSymbolsPass(true));
+      MPM.add(createStripSymbolsPass(true));
   }
 
-  if (CodeGenOpts.ProfileInstrGenerate) {
+  if (CodeGenOpts.hasProfileClangInstr()) {
     InstrProfOptions Options;
     Options.NoRedZone = CodeGenOpts.DisableRedZone;
     Options.InstrProfileOutput = CodeGenOpts.InstrProfileOutput;
-    MPM->add(createInstrProfilingPass(Options));
+    MPM.add(createInstrProfilingLegacyPass(Options));
+  }
+  if (CodeGenOpts.hasProfileIRInstr()) {
+    PMBuilder.EnablePGOInstrGen = true;
+    if (!CodeGenOpts.InstrProfileOutput.empty())
+      PMBuilder.PGOInstrGen = CodeGenOpts.InstrProfileOutput;
+    else
+      PMBuilder.PGOInstrGen = "default_%m.profraw";
+  }
+  if (CodeGenOpts.hasProfileIRUse())
+    PMBuilder.PGOInstrUse = CodeGenOpts.ProfileInstrumentUsePath;
+
+  if (!CodeGenOpts.SampleProfileFile.empty()) {
+    MPM.add(createPruneEHPass());
+    MPM.add(createSampleProfileLoaderPass(CodeGenOpts.SampleProfileFile));
+    PMBuilder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
+                           addCleanupPassesForSampleProfiler);
   }
 
-  if (!CodeGenOpts.SampleProfileFile.empty())
-    MPM->add(createSampleProfileLoaderPass(CodeGenOpts.SampleProfileFile));
-
-  PMBuilder.populateModulePassManager(*MPM);
+  PMBuilder.populateFunctionPassManager(FPM);
+  PMBuilder.populateModulePassManager(MPM);
 }
 
-TargetMachine *EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) {
-  // Create the TargetMachine for generating code.
-  std::string Error;
-  std::string Triple = TheModule->getTargetTriple();
-  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
-  if (!TheTarget) {
-    if (MustCreateTM)
-      Diags.Report(diag::err_fe_unable_to_create_target) << Error;
-    return nullptr;
-  }
-
-  unsigned CodeModel =
-    llvm::StringSwitch<unsigned>(CodeGenOpts.CodeModel)
-      .Case("small", llvm::CodeModel::Small)
-      .Case("kernel", llvm::CodeModel::Kernel)
-      .Case("medium", llvm::CodeModel::Medium)
-      .Case("large", llvm::CodeModel::Large)
-      .Case("default", llvm::CodeModel::Default)
-      .Default(~0u);
-  assert(CodeModel != ~0u && "invalid code model!");
-  llvm::CodeModel::Model CM = static_cast<llvm::CodeModel::Model>(CodeModel);
-
+void EmitAssemblyHelper::setCommandLineOpts() {
   SmallVector<const char *, 16> BackendArgs;
   BackendArgs.push_back("clang"); // Fake program name.
   if (!CodeGenOpts.DebugPass.empty()) {
@@ -480,16 +484,45 @@
   BackendArgs.push_back(nullptr);
   llvm::cl::ParseCommandLineOptions(BackendArgs.size() - 1,
                                     BackendArgs.data());
+}
+
+void EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) {
+  // Create the TargetMachine for generating code.
+  std::string Error;
+  std::string Triple = TheModule->getTargetTriple();
+  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
+  if (!TheTarget) {
+    if (MustCreateTM)
+      Diags.Report(diag::err_fe_unable_to_create_target) << Error;
+    return;
+  }
+
+  unsigned CodeModel =
+    llvm::StringSwitch<unsigned>(CodeGenOpts.CodeModel)
+      .Case("small", llvm::CodeModel::Small)
+      .Case("kernel", llvm::CodeModel::Kernel)
+      .Case("medium", llvm::CodeModel::Medium)
+      .Case("large", llvm::CodeModel::Large)
+      .Case("default", llvm::CodeModel::Default)
+      .Default(~0u);
+  assert(CodeModel != ~0u && "invalid code model!");
+  llvm::CodeModel::Model CM = static_cast<llvm::CodeModel::Model>(CodeModel);
 
   std::string FeaturesStr =
       llvm::join(TargetOpts.Features.begin(), TargetOpts.Features.end(), ",");
 
   // Keep this synced with the equivalent code in tools/driver/cc1as_main.cpp.
-  llvm::Reloc::Model RM = llvm::Reloc::Default;
+  llvm::Optional<llvm::Reloc::Model> RM;
   if (CodeGenOpts.RelocationModel == "static") {
     RM = llvm::Reloc::Static;
   } else if (CodeGenOpts.RelocationModel == "pic") {
     RM = llvm::Reloc::PIC_;
+  } else if (CodeGenOpts.RelocationModel == "ropi") {
+    RM = llvm::Reloc::ROPI;
+  } else if (CodeGenOpts.RelocationModel == "rwpi") {
+    RM = llvm::Reloc::RWPI;
+  } else if (CodeGenOpts.RelocationModel == "ropi-rwpi") {
+    RM = llvm::Reloc::ROPI_RWPI;
   } else {
     assert(CodeGenOpts.RelocationModel == "dynamic-no-pic" &&
            "Invalid PIC model!");
@@ -540,38 +573,29 @@
   Options.UseInitArray = CodeGenOpts.UseInitArray;
   Options.DisableIntegratedAS = CodeGenOpts.DisableIntegratedAS;
   Options.CompressDebugSections = CodeGenOpts.CompressDebugSections;
+  Options.RelaxELFRelocations = CodeGenOpts.RelaxELFRelocations;
 
   // Set EABI version.
-  Options.EABIVersion = llvm::StringSwitch<llvm::EABI>(CodeGenOpts.EABIVersion)
+  Options.EABIVersion = llvm::StringSwitch<llvm::EABI>(TargetOpts.EABIVersion)
                             .Case("4", llvm::EABI::EABI4)
                             .Case("5", llvm::EABI::EABI5)
                             .Case("gnu", llvm::EABI::GNU)
                             .Default(llvm::EABI::Default);
 
+  if (LangOpts.SjLjExceptions)
+    Options.ExceptionModel = llvm::ExceptionHandling::SjLj;
+
   Options.LessPreciseFPMADOption = CodeGenOpts.LessPreciseFPMAD;
   Options.NoInfsFPMath = CodeGenOpts.NoInfsFPMath;
   Options.NoNaNsFPMath = CodeGenOpts.NoNaNsFPMath;
   Options.NoZerosInBSS = CodeGenOpts.NoZeroInitializedInBSS;
   Options.UnsafeFPMath = CodeGenOpts.UnsafeFPMath;
   Options.StackAlignmentOverride = CodeGenOpts.StackAlignment;
-  Options.PositionIndependentExecutable = LangOpts.PIELevel != 0;
   Options.FunctionSections = CodeGenOpts.FunctionSections;
   Options.DataSections = CodeGenOpts.DataSections;
   Options.UniqueSectionNames = CodeGenOpts.UniqueSectionNames;
   Options.EmulatedTLS = CodeGenOpts.EmulatedTLS;
-  switch (CodeGenOpts.getDebuggerTuning()) {
-  case CodeGenOptions::DebuggerKindGDB:
-    Options.DebuggerTuning = llvm::DebuggerKind::GDB;
-    break;
-  case CodeGenOptions::DebuggerKindLLDB:
-    Options.DebuggerTuning = llvm::DebuggerKind::LLDB;
-    break;
-  case CodeGenOptions::DebuggerKindSCE:
-    Options.DebuggerTuning = llvm::DebuggerKind::SCE;
-    break;
-  default:
-    break;
-  }
+  Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning();
 
   Options.MCOptions.MCRelaxAll = CodeGenOpts.RelaxAll;
   Options.MCOptions.MCSaveTempLabels = CodeGenOpts.SaveTempLabels;
@@ -581,26 +605,21 @@
       CodeGenOpts.IncrementalLinkerCompatible;
   Options.MCOptions.MCFatalWarnings = CodeGenOpts.FatalWarnings;
   Options.MCOptions.AsmVerbose = CodeGenOpts.AsmVerbose;
+  Options.MCOptions.PreserveAsmComments = CodeGenOpts.PreserveAsmComments;
   Options.MCOptions.ABIName = TargetOpts.ABI;
 
-  TargetMachine *TM = TheTarget->createTargetMachine(Triple, TargetOpts.CPU,
-                                                     FeaturesStr, Options,
-                                                     RM, CM, OptLevel);
-
-  return TM;
+  TM.reset(TheTarget->createTargetMachine(Triple, TargetOpts.CPU, FeaturesStr,
+                                          Options, RM, CM, OptLevel));
 }
 
-bool EmitAssemblyHelper::AddEmitPasses(BackendAction Action,
+bool EmitAssemblyHelper::AddEmitPasses(legacy::PassManager &CodeGenPasses,
+                                       BackendAction Action,
                                        raw_pwrite_stream &OS) {
-
-  // Create the code generator passes.
-  legacy::PassManager *PM = getCodeGenPasses();
-
   // Add LibraryInfo.
   llvm::Triple TargetTriple(TheModule->getTargetTriple());
   std::unique_ptr<TargetLibraryInfoImpl> TLII(
       createTLII(TargetTriple, CodeGenOpts));
-  PM->add(new TargetLibraryInfoWrapperPass(*TLII));
+  CodeGenPasses.add(new TargetLibraryInfoWrapperPass(*TLII));
 
   // Normal mode, emit a .s or .o file by running the code generator. Note,
   // this also adds codegenerator level optimization passes.
@@ -616,9 +635,9 @@
   // "codegen" passes so that it isn't run multiple times when there is
   // inlining happening.
   if (CodeGenOpts.OptimizationLevel > 0)
-    PM->add(createObjCARCContractPass());
+    CodeGenPasses.add(createObjCARCContractPass());
 
-  if (TM->addPassesToEmitFile(*PM, OS, CGFT,
+  if (TM->addPassesToEmitFile(CodeGenPasses, OS, CGFT,
                               /*DisableVerify=*/!CodeGenOpts.VerifyModule)) {
     Diags.Report(diag::err_fe_unable_to_interface_with_target);
     return false;
@@ -628,59 +647,52 @@
 }
 
 void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
-                                      raw_pwrite_stream *OS) {
+                                      std::unique_ptr<raw_pwrite_stream> OS) {
   TimeRegion Region(llvm::TimePassesIsEnabled ? &CodeGenerationTime : nullptr);
 
+  setCommandLineOpts();
+
   bool UsesCodeGen = (Action != Backend_EmitNothing &&
                       Action != Backend_EmitBC &&
                       Action != Backend_EmitLL);
-  if (!TM)
-    TM.reset(CreateTargetMachine(UsesCodeGen));
+  CreateTargetMachine(UsesCodeGen);
 
   if (UsesCodeGen && !TM)
     return;
   if (TM)
     TheModule->setDataLayout(TM->createDataLayout());
 
-  // If we are performing a ThinLTO importing compile, load the function
-  // index into memory and pass it into CreatePasses, which will add it
-  // to the PassManagerBuilder and invoke LTO passes.
-  std::unique_ptr<ModuleSummaryIndex> ModuleSummary;
-  if (!CodeGenOpts.ThinLTOIndexFile.empty()) {
-    ErrorOr<std::unique_ptr<ModuleSummaryIndex>> IndexOrErr =
-        llvm::getModuleSummaryIndexForFile(
-            CodeGenOpts.ThinLTOIndexFile, [&](const DiagnosticInfo &DI) {
-              TheModule->getContext().diagnose(DI);
-            });
-    if (std::error_code EC = IndexOrErr.getError()) {
-      std::string Error = EC.message();
-      errs() << "Error loading index file '" << CodeGenOpts.ThinLTOIndexFile
-             << "': " << Error << "\n";
-      return;
-    }
-    ModuleSummary = std::move(IndexOrErr.get());
-    assert(ModuleSummary && "Expected non-empty module summary index");
-  }
+  legacy::PassManager PerModulePasses;
+  PerModulePasses.add(
+      createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
 
-  CreatePasses(ModuleSummary.get());
+  legacy::FunctionPassManager PerFunctionPasses(TheModule);
+  PerFunctionPasses.add(
+      createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
+
+  CreatePasses(PerModulePasses, PerFunctionPasses);
+
+  legacy::PassManager CodeGenPasses;
+  CodeGenPasses.add(
+      createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
 
   switch (Action) {
   case Backend_EmitNothing:
     break;
 
   case Backend_EmitBC:
-    getPerModulePasses()->add(createBitcodeWriterPass(
+    PerModulePasses.add(createBitcodeWriterPass(
         *OS, CodeGenOpts.EmitLLVMUseLists, CodeGenOpts.EmitSummaryIndex,
         CodeGenOpts.EmitSummaryIndex));
     break;
 
   case Backend_EmitLL:
-    getPerModulePasses()->add(
+    PerModulePasses.add(
         createPrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists));
     break;
 
   default:
-    if (!AddEmitPasses(Action, *OS))
+    if (!AddEmitPasses(CodeGenPasses, Action, *OS))
       return;
   }
 
@@ -690,46 +702,224 @@
   // Run passes. For now we do all passes at once, but eventually we
   // would like to have the option of streaming code generation.
 
-  if (PerFunctionPasses) {
+  {
     PrettyStackTraceString CrashInfo("Per-function optimization");
 
-    PerFunctionPasses->doInitialization();
+    PerFunctionPasses.doInitialization();
     for (Function &F : *TheModule)
       if (!F.isDeclaration())
-        PerFunctionPasses->run(F);
-    PerFunctionPasses->doFinalization();
+        PerFunctionPasses.run(F);
+    PerFunctionPasses.doFinalization();
   }
 
-  if (PerModulePasses) {
+  {
     PrettyStackTraceString CrashInfo("Per-module optimization passes");
-    PerModulePasses->run(*TheModule);
+    PerModulePasses.run(*TheModule);
   }
 
-  if (CodeGenPasses) {
+  {
     PrettyStackTraceString CrashInfo("Code generation");
-    CodeGenPasses->run(*TheModule);
+    CodeGenPasses.run(*TheModule);
+  }
+}
+
+static void runThinLTOBackend(const CodeGenOptions &CGOpts, Module *M,
+                              std::unique_ptr<raw_pwrite_stream> OS) {
+  // If we are performing a ThinLTO importing compile, load the function index
+  // into memory and pass it into thinBackend, which will run the function
+  // importer and invoke LTO passes.
+  ErrorOr<std::unique_ptr<ModuleSummaryIndex>> IndexOrErr =
+      llvm::getModuleSummaryIndexForFile(
+          CGOpts.ThinLTOIndexFile,
+          [&](const DiagnosticInfo &DI) { M->getContext().diagnose(DI); });
+  if (std::error_code EC = IndexOrErr.getError()) {
+    std::string Error = EC.message();
+    errs() << "Error loading index file '" << CGOpts.ThinLTOIndexFile
+           << "': " << Error << "\n";
+    return;
+  }
+  std::unique_ptr<ModuleSummaryIndex> CombinedIndex = std::move(*IndexOrErr);
+
+  auto AddStream = [&](size_t Task) { return std::move(OS); };
+
+  StringMap<std::map<GlobalValue::GUID, GlobalValueSummary *>>
+      ModuleToDefinedGVSummaries;
+  CombinedIndex->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+  // FIXME: We could simply import the modules mentioned in the combined index
+  // here.
+  FunctionImporter::ImportMapTy ImportList;
+  ComputeCrossModuleImportForModule(M->getModuleIdentifier(), *CombinedIndex,
+                                    ImportList);
+
+  std::vector<std::unique_ptr<llvm::MemoryBuffer>> OwnedImports;
+  MapVector<llvm::StringRef, llvm::MemoryBufferRef> ModuleMap;
+
+  for (auto &I : ImportList) {
+    ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MBOrErr =
+        llvm::MemoryBuffer::getFile(I.first());
+    if (!MBOrErr) {
+      errs() << "Error loading imported file '" << I.first()
+             << "': " << MBOrErr.getError().message() << "\n";
+      return;
+    }
+    ModuleMap[I.first()] = (*MBOrErr)->getMemBufferRef();
+    OwnedImports.push_back(std::move(*MBOrErr));
+  }
+
+  lto::Config Conf;
+  if (Error E = thinBackend(
+          Conf, 0, AddStream, *M, *CombinedIndex, ImportList,
+          ModuleToDefinedGVSummaries[M->getModuleIdentifier()], ModuleMap)) {
+    handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
+      errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
+    });
   }
 }
 
 void clang::EmitBackendOutput(DiagnosticsEngine &Diags,
                               const CodeGenOptions &CGOpts,
                               const clang::TargetOptions &TOpts,
-                              const LangOptions &LOpts, StringRef TDesc,
+                              const LangOptions &LOpts, const llvm::DataLayout &TDesc,
                               Module *M, BackendAction Action,
-                              raw_pwrite_stream *OS) {
+                              std::unique_ptr<raw_pwrite_stream> OS) {
+  if (!CGOpts.ThinLTOIndexFile.empty()) {
+    runThinLTOBackend(CGOpts, M, std::move(OS));
+    return;
+  }
+
   EmitAssemblyHelper AsmHelper(Diags, CGOpts, TOpts, LOpts, M);
 
-  AsmHelper.EmitAssembly(Action, OS);
+  AsmHelper.EmitAssembly(Action, std::move(OS));
 
-  // If an optional clang TargetInfo description string was passed in, use it to
-  // verify the LLVM TargetMachine's DataLayout.
-  if (AsmHelper.TM && !TDesc.empty()) {
+  // Verify clang's TargetInfo DataLayout against the LLVM TargetMachine's
+  // DataLayout.
+  if (AsmHelper.TM) {
     std::string DLDesc = M->getDataLayout().getStringRepresentation();
-    if (DLDesc != TDesc) {
+    if (DLDesc != TDesc.getStringRepresentation()) {
       unsigned DiagID = Diags.getCustomDiagID(
           DiagnosticsEngine::Error, "backend data layout '%0' does not match "
                                     "expected target description '%1'");
-      Diags.Report(DiagID) << DLDesc << TDesc;
+      Diags.Report(DiagID) << DLDesc << TDesc.getStringRepresentation();
     }
   }
 }
+
+static const char* getSectionNameForBitcode(const Triple &T) {
+  switch (T.getObjectFormat()) {
+  case Triple::MachO:
+    return "__LLVM,__bitcode";
+  case Triple::COFF:
+  case Triple::ELF:
+  case Triple::UnknownObjectFormat:
+    return ".llvmbc";
+  }
+  llvm_unreachable("Unimplemented ObjectFormatType");
+}
+
+static const char* getSectionNameForCommandline(const Triple &T) {
+  switch (T.getObjectFormat()) {
+  case Triple::MachO:
+    return "__LLVM,__cmdline";
+  case Triple::COFF:
+  case Triple::ELF:
+  case Triple::UnknownObjectFormat:
+    return ".llvmcmd";
+  }
+  llvm_unreachable("Unimplemented ObjectFormatType");
+}
+
+// With -fembed-bitcode, save a copy of the llvm IR as data in the
+// __LLVM,__bitcode section (.llvmbc for non-Mach-O object formats).
+void clang::EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
+                         llvm::MemoryBufferRef Buf) {
+  if (CGOpts.getEmbedBitcode() == CodeGenOptions::Embed_Off)
+    return;
+
+  // Save llvm.compiler.used and remove it.
+  SmallVector<Constant*, 2> UsedArray;
+  SmallSet<GlobalValue*, 4> UsedGlobals;
+  Type *UsedElementType = Type::getInt8Ty(M->getContext())->getPointerTo(0);
+  GlobalVariable *Used = collectUsedGlobalVariables(*M, UsedGlobals, true);
+  for (auto *GV : UsedGlobals) {
+    if (GV->getName() != "llvm.embedded.module" &&
+        GV->getName() != "llvm.cmdline")
+      UsedArray.push_back(
+          ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType));
+  }
+  if (Used)
+    Used->eraseFromParent();
+
+  // Embed the bitcode for the llvm module.
+  std::string Data;
+  ArrayRef<uint8_t> ModuleData;
+  Triple T(M->getTargetTriple());
+  // Create a constant that contains the bitcode.
+  // In case of embedding a marker, ignore the input Buf and use the empty
+  // ArrayRef. It is also legal to create a bitcode marker even if Buf is empty.
+  if (CGOpts.getEmbedBitcode() != CodeGenOptions::Embed_Marker) {
+    if (!isBitcode((const unsigned char *)Buf.getBufferStart(),
+                   (const unsigned char *)Buf.getBufferEnd())) {
+      // If the input is LLVM Assembly, bitcode is produced by serializing
+      // the module. Use-list order needs to be preserved in this case.
+      llvm::raw_string_ostream OS(Data);
+      llvm::WriteBitcodeToFile(M, OS, /* ShouldPreserveUseListOrder */ true);
+      ModuleData =
+          ArrayRef<uint8_t>((const uint8_t *)OS.str().data(), OS.str().size());
+    } else
+      // If the input is LLVM bitcode, write the input byte stream directly.
+      ModuleData = ArrayRef<uint8_t>((const uint8_t *)Buf.getBufferStart(),
+                                     Buf.getBufferSize());
+  }
+  llvm::Constant *ModuleConstant =
+      llvm::ConstantDataArray::get(M->getContext(), ModuleData);
+  llvm::GlobalVariable *GV = new llvm::GlobalVariable(
+      *M, ModuleConstant->getType(), true, llvm::GlobalValue::PrivateLinkage,
+      ModuleConstant);
+  GV->setSection(getSectionNameForBitcode(T));
+  UsedArray.push_back(
+      ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType));
+  if (llvm::GlobalVariable *Old =
+          M->getGlobalVariable("llvm.embedded.module", true)) {
+    assert(Old->hasOneUse() &&
+           "llvm.embedded.module can only be used once in llvm.compiler.used");
+    GV->takeName(Old);
+    Old->eraseFromParent();
+  } else {
+    GV->setName("llvm.embedded.module");
+  }
+
+  // Skip if only bitcode needs to be embedded.
+  if (CGOpts.getEmbedBitcode() != CodeGenOptions::Embed_Bitcode) {
+    // Embed command-line options.
+    ArrayRef<uint8_t> CmdData(const_cast<uint8_t *>(CGOpts.CmdArgs.data()),
+                              CGOpts.CmdArgs.size());
+    llvm::Constant *CmdConstant =
+      llvm::ConstantDataArray::get(M->getContext(), CmdData);
+    GV = new llvm::GlobalVariable(*M, CmdConstant->getType(), true,
+                                  llvm::GlobalValue::PrivateLinkage,
+                                  CmdConstant);
+    GV->setSection(getSectionNameForCommandline(T));
+    UsedArray.push_back(
+        ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType));
+    if (llvm::GlobalVariable *Old =
+            M->getGlobalVariable("llvm.cmdline", true)) {
+      assert(Old->hasOneUse() &&
+             "llvm.cmdline can only be used once in llvm.compiler.used");
+      GV->takeName(Old);
+      Old->eraseFromParent();
+    } else {
+      GV->setName("llvm.cmdline");
+    }
+  }
+
+  if (UsedArray.empty())
+    return;
+
+  // Recreate llvm.compiler.used.
+  ArrayType *ATy = ArrayType::get(UsedElementType, UsedArray.size());
+  auto *NewUsed = new GlobalVariable(
+      *M, ATy, false, llvm::GlobalValue::AppendingLinkage,
+      llvm::ConstantArray::get(ATy, UsedArray), "llvm.compiler.used");
+  NewUsed->setSection("llvm.metadata");
+}
diff --git a/lib/CodeGen/CGAtomic.cpp b/lib/CodeGen/CGAtomic.cpp
index f655295..ed5a724 100644
--- a/lib/CodeGen/CGAtomic.cpp
+++ b/lib/CodeGen/CGAtomic.cpp
@@ -11,13 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CodeGenFunction.h"
 #include "CGCall.h"
 #include "CGRecordLayout.h"
+#include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Operator.h"
@@ -221,11 +220,13 @@
     /// \param IsWeak true if atomic operation is weak, false otherwise.
     /// \returns Pair of values: previous value from storage (value type) and
     /// boolean flag (i1 type) with true if success and false otherwise.
-    std::pair<RValue, llvm::Value *> EmitAtomicCompareExchange(
-        RValue Expected, RValue Desired,
-        llvm::AtomicOrdering Success = llvm::SequentiallyConsistent,
-        llvm::AtomicOrdering Failure = llvm::SequentiallyConsistent,
-        bool IsWeak = false);
+    std::pair<RValue, llvm::Value *>
+    EmitAtomicCompareExchange(RValue Expected, RValue Desired,
+                              llvm::AtomicOrdering Success =
+                                  llvm::AtomicOrdering::SequentiallyConsistent,
+                              llvm::AtomicOrdering Failure =
+                                  llvm::AtomicOrdering::SequentiallyConsistent,
+                              bool IsWeak = false);
 
     /// \brief Emits atomic update.
     /// \param AO Atomic ordering.
@@ -241,11 +242,6 @@
     /// Materialize an atomic r-value in atomic-layout memory.
     Address materializeRValue(RValue rvalue) const;
 
-    /// \brief Translates LLVM atomic ordering to GNU atomic ordering for
-    /// libcalls.
-    static AtomicExpr::AtomicOrderingKind
-    translateAtomicOrdering(const llvm::AtomicOrdering AO);
-
     /// \brief Creates temp alloca for intermediate operations on atomic value.
     Address CreateTempAlloca() const;
   private:
@@ -260,13 +256,17 @@
     /// \brief Emits atomic compare-and-exchange op as a libcall.
     llvm::Value *EmitAtomicCompareExchangeLibcall(
         llvm::Value *ExpectedAddr, llvm::Value *DesiredAddr,
-        llvm::AtomicOrdering Success = llvm::SequentiallyConsistent,
-        llvm::AtomicOrdering Failure = llvm::SequentiallyConsistent);
+        llvm::AtomicOrdering Success =
+            llvm::AtomicOrdering::SequentiallyConsistent,
+        llvm::AtomicOrdering Failure =
+            llvm::AtomicOrdering::SequentiallyConsistent);
     /// \brief Emits atomic compare-and-exchange op as LLVM instruction.
     std::pair<llvm::Value *, llvm::Value *> EmitAtomicCompareExchangeOp(
         llvm::Value *ExpectedVal, llvm::Value *DesiredVal,
-        llvm::AtomicOrdering Success = llvm::SequentiallyConsistent,
-        llvm::AtomicOrdering Failure = llvm::SequentiallyConsistent,
+        llvm::AtomicOrdering Success =
+            llvm::AtomicOrdering::SequentiallyConsistent,
+        llvm::AtomicOrdering Failure =
+            llvm::AtomicOrdering::SequentiallyConsistent,
         bool IsWeak = false);
     /// \brief Emit atomic update as libcalls.
     void
@@ -286,25 +286,6 @@
   };
 }
 
-AtomicExpr::AtomicOrderingKind
-AtomicInfo::translateAtomicOrdering(const llvm::AtomicOrdering AO) {
-  switch (AO) {
-  case llvm::Unordered:
-  case llvm::NotAtomic:
-  case llvm::Monotonic:
-    return AtomicExpr::AO_ABI_memory_order_relaxed;
-  case llvm::Acquire:
-    return AtomicExpr::AO_ABI_memory_order_acquire;
-  case llvm::Release:
-    return AtomicExpr::AO_ABI_memory_order_release;
-  case llvm::AcquireRelease:
-    return AtomicExpr::AO_ABI_memory_order_acq_rel;
-  case llvm::SequentiallyConsistent:
-    return AtomicExpr::AO_ABI_memory_order_seq_cst;
-  }
-  llvm_unreachable("Unhandled AtomicOrdering");
-}
-
 Address AtomicInfo::CreateTempAlloca() const {
   Address TempAlloca = CGF.CreateMemTemp(
       (LVal.isBitField() && ValueSizeInBits > AtomicSizeInBits) ? ValueTy
@@ -421,33 +402,39 @@
 /// instructions to cope with the provided (but possibly only dynamically known)
 /// FailureOrder.
 static void emitAtomicCmpXchgFailureSet(CodeGenFunction &CGF, AtomicExpr *E,
-                                        bool IsWeak, Address Dest,
-                                        Address Ptr, Address Val1,
-                                        Address Val2,
+                                        bool IsWeak, Address Dest, Address Ptr,
+                                        Address Val1, Address Val2,
                                         llvm::Value *FailureOrderVal,
                                         uint64_t Size,
                                         llvm::AtomicOrdering SuccessOrder) {
   llvm::AtomicOrdering FailureOrder;
   if (llvm::ConstantInt *FO = dyn_cast<llvm::ConstantInt>(FailureOrderVal)) {
-    switch (FO->getSExtValue()) {
-    default:
-      FailureOrder = llvm::Monotonic;
-      break;
-    case AtomicExpr::AO_ABI_memory_order_consume:
-    case AtomicExpr::AO_ABI_memory_order_acquire:
-      FailureOrder = llvm::Acquire;
-      break;
-    case AtomicExpr::AO_ABI_memory_order_seq_cst:
-      FailureOrder = llvm::SequentiallyConsistent;
-      break;
-    }
-    if (FailureOrder >= SuccessOrder) {
-      // Don't assert on undefined behaviour.
+    auto FOS = FO->getSExtValue();
+    if (!llvm::isValidAtomicOrderingCABI(FOS))
+      FailureOrder = llvm::AtomicOrdering::Monotonic;
+    else
+      switch ((llvm::AtomicOrderingCABI)FOS) {
+      case llvm::AtomicOrderingCABI::relaxed:
+      case llvm::AtomicOrderingCABI::release:
+      case llvm::AtomicOrderingCABI::acq_rel:
+        FailureOrder = llvm::AtomicOrdering::Monotonic;
+        break;
+      case llvm::AtomicOrderingCABI::consume:
+      case llvm::AtomicOrderingCABI::acquire:
+        FailureOrder = llvm::AtomicOrdering::Acquire;
+        break;
+      case llvm::AtomicOrderingCABI::seq_cst:
+        FailureOrder = llvm::AtomicOrdering::SequentiallyConsistent;
+        break;
+      }
+    if (isStrongerThan(FailureOrder, SuccessOrder)) {
+      // Don't assert on undefined behavior "failure argument shall be no
+      // stronger than the success argument".
       FailureOrder =
-        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrder);
+          llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrder);
     }
-    emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, Size,
-                      SuccessOrder, FailureOrder);
+    emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, Size, SuccessOrder,
+                      FailureOrder);
     return;
   }
 
@@ -455,9 +442,10 @@
   llvm::BasicBlock *MonotonicBB = nullptr, *AcquireBB = nullptr,
                    *SeqCstBB = nullptr;
   MonotonicBB = CGF.createBasicBlock("monotonic_fail", CGF.CurFn);
-  if (SuccessOrder != llvm::Monotonic && SuccessOrder != llvm::Release)
+  if (SuccessOrder != llvm::AtomicOrdering::Monotonic &&
+      SuccessOrder != llvm::AtomicOrdering::Release)
     AcquireBB = CGF.createBasicBlock("acquire_fail", CGF.CurFn);
-  if (SuccessOrder == llvm::SequentiallyConsistent)
+  if (SuccessOrder == llvm::AtomicOrdering::SequentiallyConsistent)
     SeqCstBB = CGF.createBasicBlock("seqcst_fail", CGF.CurFn);
 
   llvm::BasicBlock *ContBB = CGF.createBasicBlock("atomic.continue", CGF.CurFn);
@@ -471,25 +459,25 @@
   // doesn't fold to a constant for the ordering.
   CGF.Builder.SetInsertPoint(MonotonicBB);
   emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2,
-                    Size, SuccessOrder, llvm::Monotonic);
+                    Size, SuccessOrder, llvm::AtomicOrdering::Monotonic);
   CGF.Builder.CreateBr(ContBB);
 
   if (AcquireBB) {
     CGF.Builder.SetInsertPoint(AcquireBB);
     emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2,
-                      Size, SuccessOrder, llvm::Acquire);
+                      Size, SuccessOrder, llvm::AtomicOrdering::Acquire);
     CGF.Builder.CreateBr(ContBB);
-    SI->addCase(CGF.Builder.getInt32(AtomicExpr::AO_ABI_memory_order_consume),
+    SI->addCase(CGF.Builder.getInt32((int)llvm::AtomicOrderingCABI::consume),
                 AcquireBB);
-    SI->addCase(CGF.Builder.getInt32(AtomicExpr::AO_ABI_memory_order_acquire),
+    SI->addCase(CGF.Builder.getInt32((int)llvm::AtomicOrderingCABI::acquire),
                 AcquireBB);
   }
   if (SeqCstBB) {
     CGF.Builder.SetInsertPoint(SeqCstBB);
-    emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2,
-                      Size, SuccessOrder, llvm::SequentiallyConsistent);
+    emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, Size, SuccessOrder,
+                      llvm::AtomicOrdering::SequentiallyConsistent);
     CGF.Builder.CreateBr(ContBB);
-    SI->addCase(CGF.Builder.getInt32(AtomicExpr::AO_ABI_memory_order_seq_cst),
+    SI->addCase(CGF.Builder.getInt32((int)llvm::AtomicOrderingCABI::seq_cst),
                 SeqCstBB);
   }
 
@@ -1036,40 +1024,39 @@
                 E->getOp() == AtomicExpr::AO__atomic_load_n;
 
   if (isa<llvm::ConstantInt>(Order)) {
-    int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
-    switch (ord) {
-    case AtomicExpr::AO_ABI_memory_order_relaxed:
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                   Size, llvm::Monotonic);
-      break;
-    case AtomicExpr::AO_ABI_memory_order_consume:
-    case AtomicExpr::AO_ABI_memory_order_acquire:
-      if (IsStore)
-        break; // Avoid crashing on code with undefined behavior
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                   Size, llvm::Acquire);
-      break;
-    case AtomicExpr::AO_ABI_memory_order_release:
-      if (IsLoad)
-        break; // Avoid crashing on code with undefined behavior
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                   Size, llvm::Release);
-      break;
-    case AtomicExpr::AO_ABI_memory_order_acq_rel:
-      if (IsLoad || IsStore)
-        break; // Avoid crashing on code with undefined behavior
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                   Size, llvm::AcquireRelease);
-      break;
-    case AtomicExpr::AO_ABI_memory_order_seq_cst:
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                   Size, llvm::SequentiallyConsistent);
-      break;
-    default: // invalid order
-      // We should not ever get here normally, but it's hard to
-      // enforce that in general.
-      break;
-    }
+    auto ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
+    // We should not ever get to a case where the ordering isn't a valid C ABI
+    // value, but it's hard to enforce that in general.
+    if (llvm::isValidAtomicOrderingCABI(ord))
+      switch ((llvm::AtomicOrderingCABI)ord) {
+      case llvm::AtomicOrderingCABI::relaxed:
+        EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size,
+                     llvm::AtomicOrdering::Monotonic);
+        break;
+      case llvm::AtomicOrderingCABI::consume:
+      case llvm::AtomicOrderingCABI::acquire:
+        if (IsStore)
+          break; // Avoid crashing on code with undefined behavior
+        EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size,
+                     llvm::AtomicOrdering::Acquire);
+        break;
+      case llvm::AtomicOrderingCABI::release:
+        if (IsLoad)
+          break; // Avoid crashing on code with undefined behavior
+        EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size,
+                     llvm::AtomicOrdering::Release);
+        break;
+      case llvm::AtomicOrderingCABI::acq_rel:
+        if (IsLoad || IsStore)
+          break; // Avoid crashing on code with undefined behavior
+        EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size,
+                     llvm::AtomicOrdering::AcquireRelease);
+        break;
+      case llvm::AtomicOrderingCABI::seq_cst:
+        EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size,
+                     llvm::AtomicOrdering::SequentiallyConsistent);
+        break;
+      }
     if (RValTy->isVoidType())
       return RValue::get(nullptr);
 
@@ -1104,39 +1091,39 @@
   // Emit all the different atomics
   Builder.SetInsertPoint(MonotonicBB);
   EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-               Size, llvm::Monotonic);
+               Size, llvm::AtomicOrdering::Monotonic);
   Builder.CreateBr(ContBB);
   if (!IsStore) {
     Builder.SetInsertPoint(AcquireBB);
     EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                 Size, llvm::Acquire);
+                 Size, llvm::AtomicOrdering::Acquire);
     Builder.CreateBr(ContBB);
-    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_consume),
+    SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::consume),
                 AcquireBB);
-    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_acquire),
+    SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::acquire),
                 AcquireBB);
   }
   if (!IsLoad) {
     Builder.SetInsertPoint(ReleaseBB);
     EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                 Size, llvm::Release);
+                 Size, llvm::AtomicOrdering::Release);
     Builder.CreateBr(ContBB);
-    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_release),
+    SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::release),
                 ReleaseBB);
   }
   if (!IsLoad && !IsStore) {
     Builder.SetInsertPoint(AcqRelBB);
     EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-                 Size, llvm::AcquireRelease);
+                 Size, llvm::AtomicOrdering::AcquireRelease);
     Builder.CreateBr(ContBB);
-    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_acq_rel),
+    SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::acq_rel),
                 AcqRelBB);
   }
   Builder.SetInsertPoint(SeqCstBB);
   EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail,
-               Size, llvm::SequentiallyConsistent);
+               Size, llvm::AtomicOrdering::SequentiallyConsistent);
   Builder.CreateBr(ContBB);
-  SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_seq_cst),
+  SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::seq_cst),
               SeqCstBB);
 
   // Cleanup and return
@@ -1256,9 +1243,9 @@
            CGF.getContext().VoidPtrTy);
   Args.add(RValue::get(CGF.EmitCastToVoidPtr(AddForLoaded)),
            CGF.getContext().VoidPtrTy);
-  Args.add(RValue::get(
-               llvm::ConstantInt::get(CGF.IntTy, translateAtomicOrdering(AO))),
-           CGF.getContext().IntTy);
+  Args.add(
+      RValue::get(llvm::ConstantInt::get(CGF.IntTy, (int)llvm::toCABI(AO))),
+      CGF.getContext().IntTy);
   emitAtomicLibcall(CGF, "__atomic_load", CGF.getContext().VoidTy, Args);
 }
 
@@ -1286,28 +1273,21 @@
   bool IsVolatile = LV.isVolatile() || hasVolatileMember(LV.getType());
   // An atomic is inline if we don't need to use a libcall.
   bool AtomicIsInline = !AI.shouldUseLibcall();
+  // MSVC doesn't seem to do this for types wider than a pointer.
+  if (getContext().getTypeSize(LV.getType()) >
+      getContext().getTypeSize(getContext().getIntPtrType()))
+    return false;
   return IsVolatile && AtomicIsInline;
 }
 
-/// An type is a candidate for having its loads and stores be made atomic if
-/// we are operating under /volatile:ms *and* we know the access is volatile and
-/// performing such an operation can be performed without a libcall.
-bool CodeGenFunction::typeIsSuitableForInlineAtomic(QualType Ty,
-                                                    bool IsVolatile) const {
-  // An atomic is inline if we don't need to use a libcall (e.g. it is builtin).
-  bool AtomicIsInline = getContext().getTargetInfo().hasBuiltinAtomic(
-      getContext().getTypeSize(Ty), getContext().getTypeAlign(Ty));
-  return CGM.getCodeGenOpts().MSVolatile && IsVolatile && AtomicIsInline;
-}
-
 RValue CodeGenFunction::EmitAtomicLoad(LValue LV, SourceLocation SL,
                                        AggValueSlot Slot) {
   llvm::AtomicOrdering AO;
   bool IsVolatile = LV.isVolatileQualified();
   if (LV.getType()->isAtomicType()) {
-    AO = llvm::SequentiallyConsistent;
+    AO = llvm::AtomicOrdering::SequentiallyConsistent;
   } else {
-    AO = llvm::Acquire;
+    AO = llvm::AtomicOrdering::Acquire;
     IsVolatile = true;
   }
   return EmitAtomicLoad(LV, SL, AO, IsVolatile, Slot);
@@ -1461,11 +1441,11 @@
            CGF.getContext().VoidPtrTy);
   Args.add(RValue::get(CGF.EmitCastToVoidPtr(DesiredAddr)),
            CGF.getContext().VoidPtrTy);
-  Args.add(RValue::get(llvm::ConstantInt::get(
-               CGF.IntTy, translateAtomicOrdering(Success))),
+  Args.add(RValue::get(
+               llvm::ConstantInt::get(CGF.IntTy, (int)llvm::toCABI(Success))),
            CGF.getContext().IntTy);
-  Args.add(RValue::get(llvm::ConstantInt::get(
-               CGF.IntTy, translateAtomicOrdering(Failure))),
+  Args.add(RValue::get(
+               llvm::ConstantInt::get(CGF.IntTy, (int)llvm::toCABI(Failure))),
            CGF.getContext().IntTy);
   auto SuccessFailureRVal = emitAtomicLibcall(CGF, "__atomic_compare_exchange",
                                               CGF.getContext().BoolTy, Args);
@@ -1476,8 +1456,9 @@
 std::pair<RValue, llvm::Value *> AtomicInfo::EmitAtomicCompareExchange(
     RValue Expected, RValue Desired, llvm::AtomicOrdering Success,
     llvm::AtomicOrdering Failure, bool IsWeak) {
-  if (Failure >= Success)
-    // Don't assert on undefined behavior.
+  if (isStrongerThan(Failure, Success))
+    // Don't assert on undefined behavior "failure argument shall be no stronger
+    // than the success argument".
     Failure = llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(Success);
 
   // Check whether we should use a library call.
@@ -1726,9 +1707,9 @@
   bool IsVolatile = lvalue.isVolatileQualified();
   llvm::AtomicOrdering AO;
   if (lvalue.getType()->isAtomicType()) {
-    AO = llvm::SequentiallyConsistent;
+    AO = llvm::AtomicOrdering::SequentiallyConsistent;
   } else {
-    AO = llvm::Release;
+    AO = llvm::AtomicOrdering::Release;
     IsVolatile = true;
   }
   return EmitAtomicStore(rvalue, lvalue, AO, IsVolatile, isInit);
@@ -1771,9 +1752,9 @@
                getContext().VoidPtrTy);
       args.add(RValue::get(EmitCastToVoidPtr(srcAddr.getPointer())),
                getContext().VoidPtrTy);
-      args.add(RValue::get(llvm::ConstantInt::get(
-                   IntTy, AtomicInfo::translateAtomicOrdering(AO))),
-               getContext().IntTy);
+      args.add(
+          RValue::get(llvm::ConstantInt::get(IntTy, (int)llvm::toCABI(AO))),
+          getContext().IntTy);
       emitAtomicLibcall(*this, "__atomic_store", getContext().VoidTy, args);
       return;
     }
diff --git a/lib/CodeGen/CGBlocks.cpp b/lib/CodeGen/CGBlocks.cpp
index 1389749..e3658ab 100644
--- a/lib/CodeGen/CGBlocks.cpp
+++ b/lib/CodeGen/CGBlocks.cpp
@@ -125,10 +125,15 @@
 
   llvm::Constant *init = llvm::ConstantStruct::getAnon(elements);
 
+  unsigned AddrSpace = 0;
+  if (C.getLangOpts().OpenCL)
+    AddrSpace = C.getTargetAddressSpace(LangAS::opencl_constant);
   llvm::GlobalVariable *global =
     new llvm::GlobalVariable(CGM.getModule(), init->getType(), true,
                              llvm::GlobalValue::InternalLinkage,
-                             init, "__block_descriptor_tmp");
+                             init, "__block_descriptor_tmp", nullptr,
+                             llvm::GlobalValue::NotThreadLocal,
+                             AddrSpace);
 
   return llvm::ConstantExpr::getBitCast(global, CGM.getBlockDescriptorType());
 }
@@ -927,7 +932,10 @@
                              UnsignedLongTy, UnsignedLongTy, nullptr);
 
   // Now form a pointer to that.
-  BlockDescriptorType = llvm::PointerType::getUnqual(BlockDescriptorType);
+  unsigned AddrSpace = 0;
+  if (getLangOpts().OpenCL)
+    AddrSpace = getContext().getTargetAddressSpace(LangAS::opencl_constant);
+  BlockDescriptorType = llvm::PointerType::get(BlockDescriptorType, AddrSpace);
   return BlockDescriptorType;
 }
 
@@ -2287,9 +2295,36 @@
 /// Adjust the declaration of something from the blocks API.
 static void configureBlocksRuntimeObject(CodeGenModule &CGM,
                                          llvm::Constant *C) {
-  if (!CGM.getLangOpts().BlocksRuntimeOptional) return;
-
   auto *GV = cast<llvm::GlobalValue>(C->stripPointerCasts());
+
+  if (CGM.getTarget().getTriple().isOSBinFormatCOFF()) {
+    IdentifierInfo &II = CGM.getContext().Idents.get(C->getName());
+    TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+    DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+
+    assert((isa<llvm::Function>(C->stripPointerCasts()) ||
+            isa<llvm::GlobalVariable>(C->stripPointerCasts())) &&
+           "expected Function or GlobalVariable");
+
+    const NamedDecl *ND = nullptr;
+    for (const auto &Result : DC->lookup(&II))
+      if ((ND = dyn_cast<FunctionDecl>(Result)) ||
+          (ND = dyn_cast<VarDecl>(Result)))
+        break;
+
+    // TODO: support static blocks runtime
+    if (GV->isDeclaration() && (!ND || !ND->hasAttr<DLLExportAttr>())) {
+      GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+      GV->setLinkage(llvm::GlobalValue::ExternalLinkage);
+    } else {
+      GV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+      GV->setLinkage(llvm::GlobalValue::ExternalLinkage);
+    }
+  }
+
+  if (!CGM.getLangOpts().BlocksRuntimeOptional)
+    return;
+
   if (GV->isDeclaration() && GV->hasExternalLinkage())
     GV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
 }
@@ -2337,5 +2372,5 @@
                                                Int8PtrTy->getPointerTo(),
                                                nullptr);
   configureBlocksRuntimeObject(*this, NSConcreteStackBlock);
-  return NSConcreteStackBlock;  
+  return NSConcreteStackBlock;
 }
diff --git a/lib/CodeGen/CGBlocks.h b/lib/CodeGen/CGBlocks.h
index 1edabef..aaf0679 100644
--- a/lib/CodeGen/CGBlocks.h
+++ b/lib/CodeGen/CGBlocks.h
@@ -25,10 +25,8 @@
 #include "clang/AST/ExprObjC.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/TargetInfo.h"
-#include "llvm/IR/Module.h"
 
 namespace llvm {
-class Module;
 class Constant;
 class Function;
 class GlobalValue;
@@ -40,10 +38,8 @@
 }
 
 namespace clang {
-
 namespace CodeGen {
 
-class CodeGenModule;
 class CGBlockInfo;
 
 // Flags stored in __block variables.
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index bf9f2b4..fb87771 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include <sstream>
 
 using namespace clang;
@@ -106,9 +107,8 @@
   llvm::Type *ValueType = Args[1]->getType();
   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
 
-  llvm::Value *Result =
-      CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
-                                  llvm::SequentiallyConsistent);
+  llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
+      Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
   return EmitFromInt(CGF, Result, T, ValueType);
 }
 
@@ -168,9 +168,8 @@
   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
 
-  llvm::Value *Result =
-      CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
-                                  llvm::SequentiallyConsistent);
+  llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
+      Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
   Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
   if (Invert)
     Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
@@ -207,9 +206,9 @@
   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
   Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
 
-  Value *Pair = CGF.Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
-                                                llvm::SequentiallyConsistent,
-                                                llvm::SequentiallyConsistent);
+  Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
+      Args[0], Args[1], Args[2], llvm::AtomicOrdering::SequentiallyConsistent,
+      llvm::AtomicOrdering::SequentiallyConsistent);
   if (ReturnBool)
     // Extract boolean success flag and zext it to int.
     return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
@@ -220,6 +219,51 @@
                        ValueType);
 }
 
+// Emit a simple mangled intrinsic that has 1 argument and a return type
+// matching the argument type.
+static Value *emitUnaryBuiltin(CodeGenFunction &CGF,
+                               const CallExpr *E,
+                               unsigned IntrinsicID) {
+  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+
+  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
+  return CGF.Builder.CreateCall(F, Src0);
+}
+
+// Emit an intrinsic that has 2 operands of the same type as its result.
+static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
+                                const CallExpr *E,
+                                unsigned IntrinsicID) {
+  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
+
+  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
+  return CGF.Builder.CreateCall(F, { Src0, Src1 });
+}
+
+// Emit an intrinsic that has 3 operands of the same type as its result.
+static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
+                                 const CallExpr *E,
+                                 unsigned IntrinsicID) {
+  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
+  llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
+
+  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
+  return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
+}
+
+// Emit an intrinsic that has 1 float or double operand, and 1 integer.
+static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
+                               const CallExpr *E,
+                               unsigned IntrinsicID) {
+  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
+
+  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
+  return CGF.Builder.CreateCall(F, {Src0, Src1});
+}
+
 /// EmitFAbs - Emit a call to @llvm.fabs().
 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
   Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
@@ -249,8 +293,8 @@
     if (CGF.getTarget().isBigEndian()) {
       Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
       V = CGF.Builder.CreateLShr(V, ShiftCst);
-    } 
-    // We are truncating value in order to extract the higher-order 
+    }
+    // We are truncating value in order to extract the higher-order
     // double, which we will be using to extract the sign from.
     IntTy = llvm::IntegerType::get(C, Width);
     V = CGF.Builder.CreateTrunc(V, IntTy);
@@ -289,6 +333,17 @@
   return CGF.Builder.CreateExtractValue(Tmp, 0);
 }
 
+static Value *emitRangedBuiltin(CodeGenFunction &CGF,
+                                unsigned IntrinsicID,
+                                int low, int high) {
+    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
+    llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
+    Value *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
+    llvm::Instruction *Call = CGF.Builder.CreateCall(F);
+    Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
+    return Call;
+}
+
 namespace {
   struct WidthAndSignedness {
     unsigned Width;
@@ -477,9 +532,7 @@
   case Builtin::BI__builtin_fabs:
   case Builtin::BI__builtin_fabsf:
   case Builtin::BI__builtin_fabsl: {
-    Value *Arg1 = EmitScalarExpr(E->getArg(0));
-    Value *Result = EmitFAbs(*this, Arg1);
-    return RValue::get(Result);
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
   }
   case Builtin::BI__builtin_fmod:
   case Builtin::BI__builtin_fmodf:
@@ -489,7 +542,51 @@
     Value *Result = Builder.CreateFRem(Arg1, Arg2, "fmod");
     return RValue::get(Result);
   }
-
+  case Builtin::BI__builtin_copysign:
+  case Builtin::BI__builtin_copysignf:
+  case Builtin::BI__builtin_copysignl: {
+    return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
+  }
+  case Builtin::BI__builtin_ceil:
+  case Builtin::BI__builtin_ceilf:
+  case Builtin::BI__builtin_ceill: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::ceil));
+  }
+  case Builtin::BI__builtin_floor:
+  case Builtin::BI__builtin_floorf:
+  case Builtin::BI__builtin_floorl: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::floor));
+  }
+  case Builtin::BI__builtin_trunc:
+  case Builtin::BI__builtin_truncf:
+  case Builtin::BI__builtin_truncl: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::trunc));
+  }
+  case Builtin::BI__builtin_rint:
+  case Builtin::BI__builtin_rintf:
+  case Builtin::BI__builtin_rintl: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::rint));
+  }
+  case Builtin::BI__builtin_nearbyint:
+  case Builtin::BI__builtin_nearbyintf:
+  case Builtin::BI__builtin_nearbyintl: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::nearbyint));
+  }
+  case Builtin::BI__builtin_round:
+  case Builtin::BI__builtin_roundf:
+  case Builtin::BI__builtin_roundl: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::round));
+  }
+  case Builtin::BI__builtin_fmin:
+  case Builtin::BI__builtin_fminf:
+  case Builtin::BI__builtin_fminl: {
+    return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::minnum));
+  }
+  case Builtin::BI__builtin_fmax:
+  case Builtin::BI__builtin_fmaxf:
+  case Builtin::BI__builtin_fmaxl: {
+    return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::maxnum));
+  }
   case Builtin::BI__builtin_conj:
   case Builtin::BI__builtin_conjf:
   case Builtin::BI__builtin_conjl: {
@@ -657,10 +754,13 @@
   case Builtin::BI__builtin_bswap16:
   case Builtin::BI__builtin_bswap32:
   case Builtin::BI__builtin_bswap64: {
-    Value *ArgValue = EmitScalarExpr(E->getArg(0));
-    llvm::Type *ArgType = ArgValue->getType();
-    Value *F = CGM.getIntrinsic(Intrinsic::bswap, ArgType);
-    return RValue::get(Builder.CreateCall(F, ArgValue));
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
+  }
+  case Builtin::BI__builtin_bitreverse8:
+  case Builtin::BI__builtin_bitreverse16:
+  case Builtin::BI__builtin_bitreverse32:
+  case Builtin::BI__builtin_bitreverse64: {
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
   }
   case Builtin::BI__builtin_object_size: {
     unsigned Type =
@@ -763,13 +863,19 @@
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
 
-  case Builtin::BI__builtin_isinf: {
-    // isinf(x) --> fabs(x) == infinity
+  case Builtin::BI__builtin_isinf:
+  case Builtin::BI__builtin_isfinite: {
+    // isinf(x)    --> fabs(x) == infinity
+    // isfinite(x) --> fabs(x) != infinity
+    // A NaN operand yields 0 in either case because both compares are ordered.
     Value *V = EmitScalarExpr(E->getArg(0));
-    V = EmitFAbs(*this, V);
-
-    V = Builder.CreateFCmpOEQ(V, ConstantFP::getInfinity(V->getType()),"isinf");
-    return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
+    Value *Fabs = EmitFAbs(*this, V);
+    Constant *Infinity = ConstantFP::getInfinity(V->getType());
+    CmpInst::Predicate Pred = (BuiltinID == Builtin::BI__builtin_isinf)
+                                  ? CmpInst::FCMP_OEQ
+                                  : CmpInst::FCMP_ONE;
+    Value *FCmp = Builder.CreateFCmp(Pred, Fabs, Infinity, "cmpinf");
+    return RValue::get(Builder.CreateZExt(FCmp, ConvertType(E->getType())));
   }
 
   case Builtin::BI__builtin_isinf_sign: {
@@ -807,19 +913,6 @@
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
 
-  case Builtin::BI__builtin_isfinite: {
-    // isfinite(x) --> x == x && fabs(x) != infinity;
-    Value *V = EmitScalarExpr(E->getArg(0));
-    Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq");
-
-    Value *Abs = EmitFAbs(*this, V);
-    Value *IsNotInf =
-      Builder.CreateFCmpUNE(Abs, ConstantFP::getInfinity(V->getType()),"isinf");
-
-    V = Builder.CreateAnd(Eq, IsNotInf, "and");
-    return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
-  }
-
   case Builtin::BI__builtin_fpclassify: {
     Value *V = EmitScalarExpr(E->getArg(5));
     llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
@@ -1270,7 +1363,7 @@
     llvm::StoreInst *Store =
       Builder.CreateAlignedStore(llvm::Constant::getNullValue(ITy), Ptr,
                                  StoreSize);
-    Store->setAtomic(llvm::Release);
+    Store->setAtomic(llvm::AtomicOrdering::Release);
     return RValue::get(nullptr);
   }
 
@@ -1282,7 +1375,7 @@
     // any way to safely use it... but in practice, it mostly works
     // to use it with non-atomic loads and stores to get acquire/release
     // semantics.
-    Builder.CreateFence(llvm::SequentiallyConsistent);
+    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
     return RValue::get(nullptr);
   }
 
@@ -1330,30 +1423,27 @@
       switch (ord) {
       case 0:  // memory_order_relaxed
       default: // invalid order
-        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
-                                         Ptr, NewVal,
-                                         llvm::Monotonic);
+        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
+                                         llvm::AtomicOrdering::Monotonic);
         break;
-      case 1:  // memory_order_consume
-      case 2:  // memory_order_acquire
-        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
-                                         Ptr, NewVal,
-                                         llvm::Acquire);
+      case 1: // memory_order_consume
+      case 2: // memory_order_acquire
+        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
+                                         llvm::AtomicOrdering::Acquire);
         break;
-      case 3:  // memory_order_release
-        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
-                                         Ptr, NewVal,
-                                         llvm::Release);
+      case 3: // memory_order_release
+        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
+                                         llvm::AtomicOrdering::Release);
         break;
-      case 4:  // memory_order_acq_rel
-        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
-                                         Ptr, NewVal,
-                                         llvm::AcquireRelease);
+      case 4: // memory_order_acq_rel
+
+        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
+                                         llvm::AtomicOrdering::AcquireRelease);
         break;
-      case 5:  // memory_order_seq_cst
-        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
-                                         Ptr, NewVal,
-                                         llvm::SequentiallyConsistent);
+      case 5: // memory_order_seq_cst
+        Result = Builder.CreateAtomicRMW(
+            llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
+            llvm::AtomicOrdering::SequentiallyConsistent);
         break;
       }
       Result->setVolatile(Volatile);
@@ -1370,9 +1460,9 @@
       createBasicBlock("seqcst", CurFn)
     };
     llvm::AtomicOrdering Orders[5] = {
-      llvm::Monotonic, llvm::Acquire, llvm::Release,
-      llvm::AcquireRelease, llvm::SequentiallyConsistent
-    };
+        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
+        llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
+        llvm::AtomicOrdering::SequentiallyConsistent};
 
     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
@@ -1416,13 +1506,13 @@
       switch (ord) {
       case 0:  // memory_order_relaxed
       default: // invalid order
-        Store->setOrdering(llvm::Monotonic);
+        Store->setOrdering(llvm::AtomicOrdering::Monotonic);
         break;
       case 3:  // memory_order_release
-        Store->setOrdering(llvm::Release);
+        Store->setOrdering(llvm::AtomicOrdering::Release);
         break;
       case 5:  // memory_order_seq_cst
-        Store->setOrdering(llvm::SequentiallyConsistent);
+        Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
         break;
       }
       return RValue::get(nullptr);
@@ -1436,8 +1526,8 @@
       createBasicBlock("seqcst", CurFn)
     };
     llvm::AtomicOrdering Orders[3] = {
-      llvm::Monotonic, llvm::Release, llvm::SequentiallyConsistent
-    };
+        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
+        llvm::AtomicOrdering::SequentiallyConsistent};
 
     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
@@ -1476,16 +1566,17 @@
         break;
       case 1:  // memory_order_consume
       case 2:  // memory_order_acquire
-        Builder.CreateFence(llvm::Acquire, Scope);
+        Builder.CreateFence(llvm::AtomicOrdering::Acquire, Scope);
         break;
       case 3:  // memory_order_release
-        Builder.CreateFence(llvm::Release, Scope);
+        Builder.CreateFence(llvm::AtomicOrdering::Release, Scope);
         break;
       case 4:  // memory_order_acq_rel
-        Builder.CreateFence(llvm::AcquireRelease, Scope);
+        Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, Scope);
         break;
       case 5:  // memory_order_seq_cst
-        Builder.CreateFence(llvm::SequentiallyConsistent, Scope);
+        Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
+                            Scope);
         break;
       }
       return RValue::get(nullptr);
@@ -1502,23 +1593,23 @@
     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
 
     Builder.SetInsertPoint(AcquireBB);
-    Builder.CreateFence(llvm::Acquire, Scope);
+    Builder.CreateFence(llvm::AtomicOrdering::Acquire, Scope);
     Builder.CreateBr(ContBB);
     SI->addCase(Builder.getInt32(1), AcquireBB);
     SI->addCase(Builder.getInt32(2), AcquireBB);
 
     Builder.SetInsertPoint(ReleaseBB);
-    Builder.CreateFence(llvm::Release, Scope);
+    Builder.CreateFence(llvm::AtomicOrdering::Release, Scope);
     Builder.CreateBr(ContBB);
     SI->addCase(Builder.getInt32(3), ReleaseBB);
 
     Builder.SetInsertPoint(AcqRelBB);
-    Builder.CreateFence(llvm::AcquireRelease, Scope);
+    Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, Scope);
     Builder.CreateBr(ContBB);
     SI->addCase(Builder.getInt32(4), AcqRelBB);
 
     Builder.SetInsertPoint(SeqCstBB);
-    Builder.CreateFence(llvm::SequentiallyConsistent, Scope);
+    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, Scope);
     Builder.CreateBr(ContBB);
     SI->addCase(Builder.getInt32(5), SeqCstBB);
 
@@ -1804,7 +1895,7 @@
       break;
     }
 
-    
+
     llvm::Value *Carry;
     llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
     Builder.CreateStore(Sum, SumOutPtr);
@@ -1849,9 +1940,10 @@
     llvm::Value *Comparand =
       Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
 
-    auto Result = Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
-                                              SequentiallyConsistent,
-                                              SequentiallyConsistent);
+    auto Result =
+        Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
+                                    AtomicOrdering::SequentiallyConsistent,
+                                    AtomicOrdering::SequentiallyConsistent);
     Result->setVolatile(true);
 
     return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
@@ -1863,44 +1955,47 @@
         EmitScalarExpr(E->getArg(0)),
         EmitScalarExpr(E->getArg(2)),
         EmitScalarExpr(E->getArg(1)),
-        SequentiallyConsistent,
-        SequentiallyConsistent);
+        AtomicOrdering::SequentiallyConsistent,
+        AtomicOrdering::SequentiallyConsistent);
       CXI->setVolatile(true);
       return RValue::get(Builder.CreateExtractValue(CXI, 0));
   }
   case Builtin::BI_InterlockedIncrement: {
+    llvm::Type *IntTy = ConvertType(E->getType());
     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
       AtomicRMWInst::Add,
       EmitScalarExpr(E->getArg(0)),
-      ConstantInt::get(Int32Ty, 1),
-      llvm::SequentiallyConsistent);
+      ConstantInt::get(IntTy, 1),
+      llvm::AtomicOrdering::SequentiallyConsistent);
     RMWI->setVolatile(true);
-    return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(Int32Ty, 1)));
+    return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(IntTy, 1)));
   }
   case Builtin::BI_InterlockedDecrement: {
+    llvm::Type *IntTy = ConvertType(E->getType());
     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
       AtomicRMWInst::Sub,
       EmitScalarExpr(E->getArg(0)),
-      ConstantInt::get(Int32Ty, 1),
-      llvm::SequentiallyConsistent);
+      ConstantInt::get(IntTy, 1),
+      llvm::AtomicOrdering::SequentiallyConsistent);
     RMWI->setVolatile(true);
-    return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(Int32Ty, 1)));
+    return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(IntTy, 1)));
   }
   case Builtin::BI_InterlockedExchangeAdd: {
     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
       AtomicRMWInst::Add,
       EmitScalarExpr(E->getArg(0)),
       EmitScalarExpr(E->getArg(1)),
-      llvm::SequentiallyConsistent);
+      llvm::AtomicOrdering::SequentiallyConsistent);
     RMWI->setVolatile(true);
     return RValue::get(RMWI);
   }
   case Builtin::BI__readfsdword: {
+    llvm::Type *IntTy = ConvertType(E->getType());
     Value *IntToPtr =
       Builder.CreateIntToPtr(EmitScalarExpr(E->getArg(0)),
-                             llvm::PointerType::get(CGM.Int32Ty, 257));
+                             llvm::PointerType::get(IntTy, 257));
     LoadInst *Load =
-        Builder.CreateAlignedLoad(IntToPtr, /*Align=*/4, /*isVolatile=*/true);
+        Builder.CreateDefaultAlignedLoad(IntToPtr, /*isVolatile=*/true);
     return RValue::get(Load);
   }
 
@@ -1973,7 +2068,6 @@
       return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
     break;
   }
-
   case Builtin::BI__builtin_os_log_format: {
     assert(E->getNumArgs() >= 2 &&
            "__builtin_os_log_format takes at least 2 arguments");
@@ -2044,6 +2138,323 @@
     return RValue::get(ConstantInt::get(ConvertType(E->getType()),
                                         Layout.getSize().getQuantity()));
   }
+
+  // OpenCL v2.0 s6.13.16.2 - Built-in pipe read and write functions
+  case Builtin::BIread_pipe:
+  case Builtin::BIwrite_pipe: {
+    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
+          *Arg1 = EmitScalarExpr(E->getArg(1));
+
+    // Type of the generic packet parameter.
+    unsigned GenericAS =
+        getContext().getTargetAddressSpace(LangAS::opencl_generic);
+    llvm::Type *I8PTy = llvm::PointerType::get(
+        llvm::Type::getInt8Ty(getLLVMContext()), GenericAS);
+
+    // Testing which overloaded version we should generate the call for.
+    if (2U == E->getNumArgs()) {
+      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
+                                                             : "__write_pipe_2";
+      // Creating a generic function type to be able to call with any builtin or
+      // user-defined type.
+      llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy};
+      llvm::FunctionType *FTy = llvm::FunctionType::get(
+          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+      Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
+      return RValue::get(Builder.CreateCall(
+          CGM.CreateRuntimeFunction(FTy, Name), {Arg0, BCast}));
+    } else {
+      assert(4 == E->getNumArgs() &&
+             "Illegal number of parameters to pipe function");
+      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
+                                                             : "__write_pipe_4";
+
+      llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy};
+      Value *Arg2 = EmitScalarExpr(E->getArg(2)),
+            *Arg3 = EmitScalarExpr(E->getArg(3));
+      llvm::FunctionType *FTy = llvm::FunctionType::get(
+          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+      Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
+      // We know the third argument is an integer type, but we may need to cast
+      // it to i32.
+      if (Arg2->getType() != Int32Ty)
+        Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
+      return RValue::get(Builder.CreateCall(
+          CGM.CreateRuntimeFunction(FTy, Name), {Arg0, Arg1, Arg2, BCast}));
+    }
+  }
+  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe reserve read and write
+  // functions
+  case Builtin::BIreserve_read_pipe:
+  case Builtin::BIreserve_write_pipe:
+  case Builtin::BIwork_group_reserve_read_pipe:
+  case Builtin::BIwork_group_reserve_write_pipe:
+  case Builtin::BIsub_group_reserve_read_pipe:
+  case Builtin::BIsub_group_reserve_write_pipe: {
+    // Composing the mangled name for the function.
+    const char *Name;
+    if (BuiltinID == Builtin::BIreserve_read_pipe)
+      Name = "__reserve_read_pipe";
+    else if (BuiltinID == Builtin::BIreserve_write_pipe)
+      Name = "__reserve_write_pipe";
+    else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
+      Name = "__work_group_reserve_read_pipe";
+    else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
+      Name = "__work_group_reserve_write_pipe";
+    else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
+      Name = "__sub_group_reserve_read_pipe";
+    else
+      Name = "__sub_group_reserve_write_pipe";
+
+    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
+          *Arg1 = EmitScalarExpr(E->getArg(1));
+    llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
+
+    // Building the generic function prototype.
+    llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty};
+    llvm::FunctionType *FTy = llvm::FunctionType::get(
+        ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+    // We know the second argument is an integer type, but we may need to cast
+    // it to i32.
+    if (Arg1->getType() != Int32Ty)
+      Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
+    return RValue::get(
+        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), {Arg0, Arg1}));
+  }
+  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
+  // functions
+  case Builtin::BIcommit_read_pipe:
+  case Builtin::BIcommit_write_pipe:
+  case Builtin::BIwork_group_commit_read_pipe:
+  case Builtin::BIwork_group_commit_write_pipe:
+  case Builtin::BIsub_group_commit_read_pipe:
+  case Builtin::BIsub_group_commit_write_pipe: {
+    const char *Name;
+    if (BuiltinID == Builtin::BIcommit_read_pipe)
+      Name = "__commit_read_pipe";
+    else if (BuiltinID == Builtin::BIcommit_write_pipe)
+      Name = "__commit_write_pipe";
+    else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
+      Name = "__work_group_commit_read_pipe";
+    else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
+      Name = "__work_group_commit_write_pipe";
+    else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
+      Name = "__sub_group_commit_read_pipe";
+    else
+      Name = "__sub_group_commit_write_pipe";
+
+    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
+          *Arg1 = EmitScalarExpr(E->getArg(1));
+
+    // Building the generic function prototype.
+    llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType()};
+    llvm::FunctionType *FTy =
+        llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
+                                llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+
+    return RValue::get(
+        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), {Arg0, Arg1}));
+  }
+  // OpenCL v2.0 s6.13.16.4 - Built-in pipe query functions
+  case Builtin::BIget_pipe_num_packets:
+  case Builtin::BIget_pipe_max_packets: {
+    const char *Name;
+    if (BuiltinID == Builtin::BIget_pipe_num_packets)
+      Name = "__get_pipe_num_packets";
+    else
+      Name = "__get_pipe_max_packets";
+
+    // Building the generic function prototype.
+    Value *Arg0 = EmitScalarExpr(E->getArg(0));
+    llvm::Type *ArgTys[] = {Arg0->getType()};
+    llvm::FunctionType *FTy = llvm::FunctionType::get(
+        Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+
+    return RValue::get(
+        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), {Arg0}));
+  }
+
+  // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
+  case Builtin::BIto_global:
+  case Builtin::BIto_local:
+  case Builtin::BIto_private: {
+    auto Arg0 = EmitScalarExpr(E->getArg(0));
+    auto NewArgT = llvm::PointerType::get(Int8Ty,
+      CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
+    auto NewRetT = llvm::PointerType::get(Int8Ty,
+      CGM.getContext().getTargetAddressSpace(
+        E->getType()->getPointeeType().getAddressSpace()));
+    auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
+    llvm::Value *NewArg;
+    if (Arg0->getType()->getPointerAddressSpace() !=
+        NewArgT->getPointerAddressSpace())
+      NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
+    else
+      NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
+    auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
+    auto NewCall =
+        Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
+    return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
+      ConvertType(E->getType())));
+  }
+
+  // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
+  // It contains four different overload formats specified in Table 6.13.17.1.
+  case Builtin::BIenqueue_kernel: {
+    StringRef Name; // Generated function call name
+    unsigned NumArgs = E->getNumArgs();
+
+    llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
+    llvm::Type *RangeTy = ConvertType(getContext().OCLNDRangeTy);
+
+    llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Range = EmitScalarExpr(E->getArg(2));
+
+    if (NumArgs == 4) {
+      // The most basic form of the call with parameters:
+      // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
+      Name = "__enqueue_kernel_basic";
+      llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, Int8PtrTy};
+      llvm::FunctionType *FTy = llvm::FunctionType::get(
+          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys, 4), false);
+
+      llvm::Value *Block =
+          Builder.CreateBitCast(EmitScalarExpr(E->getArg(3)), Int8PtrTy);
+
+      return RValue::get(Builder.CreateCall(
+          CGM.CreateRuntimeFunction(FTy, Name), {Queue, Flags, Range, Block}));
+    }
+    assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
+
+    // Could have events and/or variadic arguments.
+    if (E->getArg(3)->getType()->isBlockPointerType()) {
+      // No events passed, but has variadic arguments.
+      Name = "__enqueue_kernel_vaargs";
+      llvm::Value *Block =
+          Builder.CreateBitCast(EmitScalarExpr(E->getArg(3)), Int8PtrTy);
+      // Create a vector of the arguments, as well as a constant value to
+      // express to the runtime the number of variadic arguments.
+      std::vector<llvm::Value *> Args = {Queue, Flags, Range, Block,
+                                         ConstantInt::get(IntTy, NumArgs - 4)};
+      std::vector<llvm::Type *> ArgTys = {QueueTy, IntTy, RangeTy, Int8PtrTy,
+                                          IntTy};
+
+      // Add the variadics.
+      for (unsigned I = 4; I < NumArgs; ++I) {
+        llvm::Value *ArgSize = EmitScalarExpr(E->getArg(I));
+        unsigned TypeSizeInBytes =
+            getContext()
+                .getTypeSizeInChars(E->getArg(I)->getType())
+                .getQuantity();
+        Args.push_back(TypeSizeInBytes < 4
+                           ? Builder.CreateZExt(ArgSize, Int32Ty)
+                           : ArgSize);
+      }
+
+      llvm::FunctionType *FTy = llvm::FunctionType::get(
+          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), true);
+      return RValue::get(
+          Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
+                             llvm::ArrayRef<llvm::Value *>(Args)));
+    }
+    // Any calls now have event arguments passed.
+    if (NumArgs >= 7) {
+      llvm::Type *EventTy = ConvertType(getContext().OCLClkEventTy);
+      unsigned AS4 =
+          E->getArg(4)->getType()->isArrayType()
+              ? E->getArg(4)->getType().getAddressSpace()
+              : E->getArg(4)->getType()->getPointeeType().getAddressSpace();
+      llvm::Type *EventPtrAS4Ty =
+          EventTy->getPointerTo(CGM.getContext().getTargetAddressSpace(AS4));
+      unsigned AS5 =
+          E->getArg(5)->getType()->getPointeeType().getAddressSpace();
+      llvm::Type *EventPtrAS5Ty =
+          EventTy->getPointerTo(CGM.getContext().getTargetAddressSpace(AS5));
+
+      llvm::Value *NumEvents = EmitScalarExpr(E->getArg(3));
+      llvm::Value *EventList =
+          E->getArg(4)->getType()->isArrayType()
+              ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
+              : EmitScalarExpr(E->getArg(4));
+      llvm::Value *ClkEvent = EmitScalarExpr(E->getArg(5));
+      llvm::Value *Block =
+          Builder.CreateBitCast(EmitScalarExpr(E->getArg(6)), Int8PtrTy);
+
+      std::vector<llvm::Type *> ArgTys = {
+          QueueTy,       Int32Ty,       RangeTy,  Int32Ty,
+          EventPtrAS4Ty, EventPtrAS5Ty, Int8PtrTy};
+      std::vector<llvm::Value *> Args = {Queue,     Flags,    Range, NumEvents,
+                                         EventList, ClkEvent, Block};
+
+      if (NumArgs == 7) {
+        // Has events but no variadics.
+        Name = "__enqueue_kernel_basic_events";
+        llvm::FunctionType *FTy = llvm::FunctionType::get(
+            Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+        return RValue::get(
+            Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
+                               llvm::ArrayRef<llvm::Value *>(Args)));
+      }
+      // Has event info and variadics
+      // Pass the number of variadics to the runtime function too.
+      Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
+      ArgTys.push_back(Int32Ty);
+      Name = "__enqueue_kernel_events_vaargs";
+
+      // Add the variadics.
+      for (unsigned I = 7; I < NumArgs; ++I) {
+        llvm::Value *ArgSize = EmitScalarExpr(E->getArg(I));
+        unsigned TypeSizeInBytes =
+            getContext()
+                .getTypeSizeInChars(E->getArg(I)->getType())
+                .getQuantity();
+        Args.push_back(TypeSizeInBytes < 4
+                           ? Builder.CreateZExt(ArgSize, Int32Ty)
+                           : ArgSize);
+      }
+      llvm::FunctionType *FTy = llvm::FunctionType::get(
+          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), true);
+      return RValue::get(
+          Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
+                             llvm::ArrayRef<llvm::Value *>(Args)));
+    }
+  }
+  // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
+  // parameter.
+  case Builtin::BIget_kernel_work_group_size: {
+    Value *Arg = EmitScalarExpr(E->getArg(0));
+    Arg = Builder.CreateBitCast(Arg, Int8PtrTy);
+    return RValue::get(
+        Builder.CreateCall(CGM.CreateRuntimeFunction(
+                               llvm::FunctionType::get(IntTy, Int8PtrTy, false),
+                               "__get_kernel_work_group_size_impl"),
+                           Arg));
+  }
+  case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
+    Value *Arg = EmitScalarExpr(E->getArg(0));
+    Arg = Builder.CreateBitCast(Arg, Int8PtrTy);
+    return RValue::get(Builder.CreateCall(
+        CGM.CreateRuntimeFunction(
+            llvm::FunctionType::get(IntTy, Int8PtrTy, false),
+            "__get_kernel_preferred_work_group_multiple_impl"),
+        Arg));
+  }
+  case Builtin::BIprintf:
+    if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice)
+      return EmitCUDADevicePrintfCallExpr(E, ReturnValue);
+    break;
+  case Builtin::BI__builtin_canonicalize:
+  case Builtin::BI__builtin_canonicalizef:
+  case Builtin::BI__builtin_canonicalizel:
+    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
+
+  case Builtin::BI__builtin_thread_pointer: {
+    if (!getContext().getTargetInfo().isTLSSupported())
+      CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
+    // Fall through - it's already mapped to the intrinsic by GCCBuiltin.
+    break;
+  }
   }
 
   // If this is an alias for a lib function (e.g. __builtin_sin), emit
@@ -2236,7 +2647,7 @@
 }
 
 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
-  unsigned nElts = cast<llvm::VectorType>(V->getType())->getNumElements();
+  unsigned nElts = V->getType()->getVectorNumElements();
   Value* SV = llvm::ConstantVector::getSplat(nElts, C);
   return Builder.CreateShuffleVector(V, V, SV, "lane");
 }
@@ -3154,14 +3565,13 @@
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v: {
     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
-    SmallVector<Constant*, 16> Indices;
+    SmallVector<uint32_t, 16> Indices;
     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
-      Indices.push_back(ConstantInt::get(Int32Ty, i+CV));
+      Indices.push_back(i+CV);
 
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
-    Value *SV = llvm::ConstantVector::get(Indices);
-    return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext");
+    return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
   }
   case NEON::BI__builtin_neon_vfma_v:
   case NEON::BI__builtin_neon_vfmaq_v: {
@@ -3359,14 +3769,13 @@
     Value *SV = nullptr;
 
     for (unsigned vi = 0; vi != 2; ++vi) {
-      SmallVector<Constant*, 16> Indices;
+      SmallVector<uint32_t, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
-        Indices.push_back(Builder.getInt32(i+vi));
-        Indices.push_back(Builder.getInt32(i+e+vi));
+        Indices.push_back(i+vi);
+        Indices.push_back(i+e+vi);
       }
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
-      SV = llvm::ConstantVector::get(Indices);
-      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn");
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
     }
     return SV;
@@ -3388,13 +3797,12 @@
     Value *SV = nullptr;
 
     for (unsigned vi = 0; vi != 2; ++vi) {
-      SmallVector<Constant*, 16> Indices;
+      SmallVector<uint32_t, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
-        Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi));
+        Indices.push_back(2*i+vi);
 
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
-      SV = llvm::ConstantVector::get(Indices);
-      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp");
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
     }
     return SV;
@@ -3407,14 +3815,13 @@
     Value *SV = nullptr;
 
     for (unsigned vi = 0; vi != 2; ++vi) {
-      SmallVector<Constant*, 16> Indices;
+      SmallVector<uint32_t, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
-        Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1));
-        Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e));
+        Indices.push_back((i + vi*e) >> 1);
+        Indices.push_back(((i + vi*e) >> 1)+e);
       }
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
-      SV = llvm::ConstantVector::get(Indices);
-      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip");
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
     }
     return SV;
@@ -3462,19 +3869,19 @@
   if (ExtOp)
     TblOps.push_back(ExtOp);
 
-  // Build a vector containing sequential number like (0, 1, 2, ..., 15)  
-  SmallVector<Constant*, 16> Indices;
+  // Build a vector containing sequential number like (0, 1, 2, ..., 15)
+  SmallVector<uint32_t, 16> Indices;
   llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
-    Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i));
-    Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i+1));
+    Indices.push_back(2*i);
+    Indices.push_back(2*i+1);
   }
-  Value *SV = llvm::ConstantVector::get(Indices);
 
   int PairPos = 0, End = Ops.size() - 1;
   while (PairPos < End) {
     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
-                                                     Ops[PairPos+1], SV, Name));
+                                                     Ops[PairPos+1], Indices,
+                                                     Name));
     PairPos += 2;
   }
 
@@ -3483,13 +3890,13 @@
   if (PairPos == End) {
     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
-                                                     ZeroTbl, SV, Name));
+                                                     ZeroTbl, Indices, Name));
   }
 
   Function *TblF;
   TblOps.push_back(IndexOp);
   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
-  
+
   return CGF.EmitNeonCall(TblF, TblOps, Name);
 }
 
@@ -3533,7 +3940,9 @@
 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
                                          const CallExpr *E,
                                          llvm::Type *RegisterType,
-                                         llvm::Type *ValueType, bool IsRead) {
+                                         llvm::Type *ValueType,
+                                         bool IsRead,
+                                         StringRef SysReg = "") {
   // write and register intrinsics only support 32 and 64 bit operations.
   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64))
           && "Unsupported size for register.");
@@ -3542,8 +3951,10 @@
   CodeGen::CodeGenModule &CGM = CGF.CGM;
   LLVMContext &Context = CGM.getLLVMContext();
 
-  const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
-  StringRef SysReg = cast<StringLiteral>(SysRegStrExpr)->getString();
+  if (SysReg.empty()) {
+    const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
+    SysReg = cast<StringLiteral>(SysRegStrExpr)->getString();
+  }
 
   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
@@ -3683,6 +4094,74 @@
     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
   }
 
+  if (BuiltinID == ARM::BI__builtin_arm_mcrr ||
+      BuiltinID == ARM::BI__builtin_arm_mcrr2) {
+    Function *F;
+
+    switch (BuiltinID) {
+    default: llvm_unreachable("unexpected builtin");
+    case ARM::BI__builtin_arm_mcrr:
+      F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
+      break;
+    case ARM::BI__builtin_arm_mcrr2:
+      F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
+      break;
+    }
+
+    // MCRR{2} instruction has 5 operands but
+    // the builtin has 4 because Rt and Rt2
+    // are passed as a single unsigned 64-bit
+    // integer in the builtin's arguments; it
+    // is split here into two 32-bit integers
+    // for the LLVM intrinsic call.
+
+    Value *Coproc = EmitScalarExpr(E->getArg(0));
+    Value *Opc1 = EmitScalarExpr(E->getArg(1));
+    Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
+    Value *CRm = EmitScalarExpr(E->getArg(3));
+
+    Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
+    Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
+    Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
+    Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
+
+    return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
+  }
+
+  if (BuiltinID == ARM::BI__builtin_arm_mrrc ||
+      BuiltinID == ARM::BI__builtin_arm_mrrc2) {
+    Function *F;
+
+    switch (BuiltinID) {
+    default: llvm_unreachable("unexpected builtin");
+    case ARM::BI__builtin_arm_mrrc:
+      F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
+      break;
+    case ARM::BI__builtin_arm_mrrc2:
+      F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
+      break;
+    }
+
+    Value *Coproc = EmitScalarExpr(E->getArg(0));
+    Value *Opc1 = EmitScalarExpr(E->getArg(1));
+    Value *CRm  = EmitScalarExpr(E->getArg(2));
+    Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
+
+    // The intrinsic returns the 64-bit result as
+    // two 32-bit integers; recombine them below.
+
+    Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
+    Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
+    Rt = Builder.CreateZExt(Rt, Int64Ty);
+    Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
+
+    Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
+    RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
+    RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
+
+    return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
+  }
+
   if (BuiltinID == ARM::BI__builtin_arm_ldrexd ||
       ((BuiltinID == ARM::BI__builtin_arm_ldrex ||
         BuiltinID == ARM::BI__builtin_arm_ldaex) &&
@@ -3995,7 +4474,7 @@
   // the first argument, but the LLVM intrinsic expects it as the third one.
   case ARM::BI_MoveToCoprocessor:
   case ARM::BI_MoveToCoprocessor2: {
-    Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ? 
+    Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
                                    Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
                                   Ops[3], Ops[4], Ops[5]});
@@ -4559,11 +5038,6 @@
     return Builder.CreateCall(F);
   }
 
-  if (BuiltinID == AArch64::BI__builtin_thread_pointer) {
-    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_thread_pointer);
-    return Builder.CreateCall(F);
-  }
-
   // CRC32
   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
   switch (BuiltinID) {
@@ -6053,14 +6527,13 @@
     Value *SV = nullptr;
 
     for (unsigned vi = 0; vi != 2; ++vi) {
-      SmallVector<Constant*, 16> Indices;
+      SmallVector<uint32_t, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
-        Indices.push_back(ConstantInt::get(Int32Ty, i+vi));
-        Indices.push_back(ConstantInt::get(Int32Ty, i+e+vi));
+        Indices.push_back(i+vi);
+        Indices.push_back(i+e+vi);
       }
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
-      SV = llvm::ConstantVector::get(Indices);
-      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn");
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
     }
     return SV;
@@ -6073,13 +6546,12 @@
     Value *SV = nullptr;
 
     for (unsigned vi = 0; vi != 2; ++vi) {
-      SmallVector<Constant*, 16> Indices;
+      SmallVector<uint32_t, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
-        Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi));
+        Indices.push_back(2*i+vi);
 
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
-      SV = llvm::ConstantVector::get(Indices);
-      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp");
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
     }
     return SV;
@@ -6092,14 +6564,13 @@
     Value *SV = nullptr;
 
     for (unsigned vi = 0; vi != 2; ++vi) {
-      SmallVector<Constant*, 16> Indices;
+      SmallVector<uint32_t, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
-        Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1));
-        Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e));
+        Indices.push_back((i + vi*e) >> 1);
+        Indices.push_back(((i + vi*e) >> 1)+e);
       }
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
-      SV = llvm::ConstantVector::get(Indices);
-      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip");
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
     }
     return SV;
@@ -6175,6 +6646,138 @@
   return Result;
 }
 
+// Convert the mask from an integer type to a vector of i1.
+static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
+                              unsigned NumElts) {
+
+  llvm::VectorType *MaskTy = llvm::VectorType::get(CGF.Builder.getInt1Ty(),
+                         cast<IntegerType>(Mask->getType())->getBitWidth());
+  Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
+
+  // If we have fewer than 8 elements, then the starting mask was an i8 and
+  // we need to extract down to the right number of elements.
+  if (NumElts < 8) {
+    uint32_t Indices[4];
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = i;
+    MaskVec = CGF.Builder.CreateShuffleVector(MaskVec, MaskVec,
+                                             makeArrayRef(Indices, NumElts),
+                                             "extract");
+  }
+  return MaskVec;
+}
+
+static Value *EmitX86MaskedStore(CodeGenFunction &CGF,
+                                 SmallVectorImpl<Value *> &Ops,
+                                 unsigned Align) {
+  // Cast the pointer to the right type.
+  Ops[0] = CGF.Builder.CreateBitCast(Ops[0],
+                               llvm::PointerType::getUnqual(Ops[1]->getType()));
+
+  // If the mask is all ones just emit a regular store.
+  if (const auto *C = dyn_cast<Constant>(Ops[2]))
+    if (C->isAllOnesValue())
+      return CGF.Builder.CreateAlignedStore(Ops[1], Ops[0], Align);
+
+  Value *MaskVec = getMaskVecValue(CGF, Ops[2],
+                                   Ops[1]->getType()->getVectorNumElements());
+
+  return CGF.Builder.CreateMaskedStore(Ops[1], Ops[0], Align, MaskVec);
+}
+
+static Value *EmitX86MaskedLoad(CodeGenFunction &CGF,
+                                SmallVectorImpl<Value *> &Ops, unsigned Align) {
+  // Cast the pointer to the right type.
+  Ops[0] = CGF.Builder.CreateBitCast(Ops[0],
+                               llvm::PointerType::getUnqual(Ops[1]->getType()));
+
+  // If the mask is all ones just emit a regular load.
+  if (const auto *C = dyn_cast<Constant>(Ops[2]))
+    if (C->isAllOnesValue())
+      return CGF.Builder.CreateAlignedLoad(Ops[0], Align);
+
+  Value *MaskVec = getMaskVecValue(CGF, Ops[2],
+                                   Ops[1]->getType()->getVectorNumElements());
+
+  return CGF.Builder.CreateMaskedLoad(Ops[0], Align, MaskVec, Ops[1]);
+}
+
+static Value *EmitX86SubVectorBroadcast(CodeGenFunction &CGF,
+                                        SmallVectorImpl<Value *> &Ops,
+                                        llvm::Type *DstTy,
+                                        unsigned SrcSizeInBits,
+                                        unsigned Align) {
+  // Load the subvector.
+  Ops[0] = CGF.Builder.CreateAlignedLoad(Ops[0], Align);
+
+  // Create broadcast mask.
+  unsigned NumDstElts = DstTy->getVectorNumElements();
+  unsigned NumSrcElts = SrcSizeInBits / DstTy->getScalarSizeInBits();
+
+  SmallVector<uint32_t, 8> Mask;
+  for (unsigned i = 0; i != NumDstElts; i += NumSrcElts)
+    for (unsigned j = 0; j != NumSrcElts; ++j)
+      Mask.push_back(j);
+
+  return CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], Mask, "subvecbcst");
+}
+
+static Value *EmitX86Select(CodeGenFunction &CGF,
+                            Value *Mask, Value *Op0, Value *Op1) {
+
+  // If the mask is all ones just return first argument.
+  if (const auto *C = dyn_cast<Constant>(Mask))
+    if (C->isAllOnesValue())
+      return Op0;
+
+  Mask = getMaskVecValue(CGF, Mask, Op0->getType()->getVectorNumElements());
+
+  return CGF.Builder.CreateSelect(Mask, Op0, Op1);
+}
+
+static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
+                                   bool Signed, SmallVectorImpl<Value *> &Ops) {
+  unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+  Value *Cmp;
+
+  if (CC == 3) {
+    Cmp = Constant::getNullValue(
+                       llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
+  } else if (CC == 7) {
+    Cmp = Constant::getAllOnesValue(
+                       llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
+  } else {
+    ICmpInst::Predicate Pred;
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case 0: Pred = ICmpInst::ICMP_EQ;  break;
+    case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
+    case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
+    case 4: Pred = ICmpInst::ICMP_NE;  break;
+    case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
+    case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
+    }
+    Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
+  }
+
+  const auto *C = dyn_cast<Constant>(Ops.back());
+  if (!C || !C->isAllOnesValue())
+    Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, Ops.back(), NumElts));
+
+  if (NumElts < 8) {
+    uint32_t Indices[8];
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = i;
+    for (unsigned i = NumElts; i != 8; ++i)
+      Indices[i] = i % NumElts + NumElts;
+    Cmp = CGF.Builder.CreateShuffleVector(
+        Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
+  }
+  return CGF.Builder.CreateBitCast(Cmp,
+                                   IntegerType::get(CGF.getLLVMContext(),
+                                                    std::max(NumElts, 8U)));
+}
+
 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
                                            const CallExpr *E) {
   if (BuiltinID == X86::BI__builtin_ms_va_start ||
@@ -6225,6 +6828,31 @@
     Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
   }
 
+  // These exist so that the builtin that takes an immediate can be bounds
+  // checked by clang to avoid passing bad immediates to the backend. Since
+  // AVX has a larger immediate than SSE we would need separate builtins to
+  // do the different bounds checking. Rather than create a clang specific
+  // SSE only builtin, this implements eight separate builtins to match gcc
+  // implementation.
+  auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
+    Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
+    llvm::Function *F = CGM.getIntrinsic(ID);
+    return Builder.CreateCall(F, Ops);
+  };
+
+  // For the vector forms of FP comparisons, translate the builtins directly to
+  // IR.
+  // TODO: The builtins could be removed if the SSE header files used vector
+  // extension comparisons directly (vector ordered/unordered may need
+  // additional support via __builtin_isnan()).
+  auto getVectorFCmpIR = [this, &Ops](CmpInst::Predicate Pred) {
+    Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
+    llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
+    llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
+    Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
+    return Builder.CreateBitCast(Sext, FPVecTy);
+  };
+
   switch (BuiltinID) {
   default: return nullptr;
   case X86::BI__builtin_cpu_supports: {
@@ -6253,6 +6881,16 @@
       AVX512F,
       BMI,
       BMI2,
+      AES,
+      PCLMUL,
+      AVX512VL,
+      AVX512BW,
+      AVX512DQ,
+      AVX512CD,
+      AVX512ER,
+      AVX512PF,
+      AVX512VBMI,
+      AVX512IFMA,
       MAX
     };
 
@@ -6263,6 +6901,7 @@
                               .Case("sse", X86Features::SSE)
                               .Case("sse2", X86Features::SSE2)
                               .Case("sse3", X86Features::SSE3)
+                              .Case("ssse3", X86Features::SSSE3)
                               .Case("sse4.1", X86Features::SSE4_1)
                               .Case("sse4.2", X86Features::SSE4_2)
                               .Case("avx", X86Features::AVX)
@@ -6274,6 +6913,16 @@
                               .Case("avx512f", X86Features::AVX512F)
                               .Case("bmi", X86Features::BMI)
                               .Case("bmi2", X86Features::BMI2)
+                              .Case("aes", X86Features::AES)
+                              .Case("pclmul", X86Features::PCLMUL)
+                              .Case("avx512vl", X86Features::AVX512VL)
+                              .Case("avx512bw", X86Features::AVX512BW)
+                              .Case("avx512dq", X86Features::AVX512DQ)
+                              .Case("avx512cd", X86Features::AVX512CD)
+                              .Case("avx512er", X86Features::AVX512ER)
+                              .Case("avx512pf", X86Features::AVX512PF)
+                              .Case("avx512vbmi", X86Features::AVX512VBMI)
+                              .Case("avx512ifma", X86Features::AVX512IFMA)
                               .Default(X86Features::MAX);
     assert(Feature != X86Features::MAX && "Invalid feature!");
 
@@ -6302,7 +6951,7 @@
 
     // Check the value of the bit corresponding to the feature requested.
     Value *Bitset = Builder.CreateAnd(
-        Features, llvm::ConstantInt::get(Int32Ty, 1 << Feature));
+        Features, llvm::ConstantInt::get(Int32Ty, 1ULL << Feature));
     return Builder.CreateICmpNE(Bitset, llvm::ConstantInt::get(Int32Ty, 0));
   }
   case X86::BI_mm_prefetch: {
@@ -6377,6 +7026,85 @@
     Ops.push_back(Mlo);
     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
   }
+  case X86::BI__builtin_ia32_storedqudi128_mask:
+  case X86::BI__builtin_ia32_storedqusi128_mask:
+  case X86::BI__builtin_ia32_storedquhi128_mask:
+  case X86::BI__builtin_ia32_storedquqi128_mask:
+  case X86::BI__builtin_ia32_storeupd128_mask:
+  case X86::BI__builtin_ia32_storeups128_mask:
+  case X86::BI__builtin_ia32_storedqudi256_mask:
+  case X86::BI__builtin_ia32_storedqusi256_mask:
+  case X86::BI__builtin_ia32_storedquhi256_mask:
+  case X86::BI__builtin_ia32_storedquqi256_mask:
+  case X86::BI__builtin_ia32_storeupd256_mask:
+  case X86::BI__builtin_ia32_storeups256_mask:
+  case X86::BI__builtin_ia32_storedqudi512_mask:
+  case X86::BI__builtin_ia32_storedqusi512_mask:
+  case X86::BI__builtin_ia32_storedquhi512_mask:
+  case X86::BI__builtin_ia32_storedquqi512_mask:
+  case X86::BI__builtin_ia32_storeupd512_mask:
+  case X86::BI__builtin_ia32_storeups512_mask:
+    return EmitX86MaskedStore(*this, Ops, 1);
+
+  case X86::BI__builtin_ia32_movdqa32store128_mask:
+  case X86::BI__builtin_ia32_movdqa64store128_mask:
+  case X86::BI__builtin_ia32_storeaps128_mask:
+  case X86::BI__builtin_ia32_storeapd128_mask:
+  case X86::BI__builtin_ia32_movdqa32store256_mask:
+  case X86::BI__builtin_ia32_movdqa64store256_mask:
+  case X86::BI__builtin_ia32_storeaps256_mask:
+  case X86::BI__builtin_ia32_storeapd256_mask:
+  case X86::BI__builtin_ia32_movdqa32store512_mask:
+  case X86::BI__builtin_ia32_movdqa64store512_mask:
+  case X86::BI__builtin_ia32_storeaps512_mask:
+  case X86::BI__builtin_ia32_storeapd512_mask: {
+    unsigned Align =
+      getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity();
+    return EmitX86MaskedStore(*this, Ops, Align);
+  }
+  case X86::BI__builtin_ia32_loadups128_mask:
+  case X86::BI__builtin_ia32_loadups256_mask:
+  case X86::BI__builtin_ia32_loadups512_mask:
+  case X86::BI__builtin_ia32_loadupd128_mask:
+  case X86::BI__builtin_ia32_loadupd256_mask:
+  case X86::BI__builtin_ia32_loadupd512_mask:
+  case X86::BI__builtin_ia32_loaddquqi128_mask:
+  case X86::BI__builtin_ia32_loaddquqi256_mask:
+  case X86::BI__builtin_ia32_loaddquqi512_mask:
+  case X86::BI__builtin_ia32_loaddquhi128_mask:
+  case X86::BI__builtin_ia32_loaddquhi256_mask:
+  case X86::BI__builtin_ia32_loaddquhi512_mask:
+  case X86::BI__builtin_ia32_loaddqusi128_mask:
+  case X86::BI__builtin_ia32_loaddqusi256_mask:
+  case X86::BI__builtin_ia32_loaddqusi512_mask:
+  case X86::BI__builtin_ia32_loaddqudi128_mask:
+  case X86::BI__builtin_ia32_loaddqudi256_mask:
+  case X86::BI__builtin_ia32_loaddqudi512_mask:
+    return EmitX86MaskedLoad(*this, Ops, 1);
+
+  case X86::BI__builtin_ia32_loadaps128_mask:
+  case X86::BI__builtin_ia32_loadaps256_mask:
+  case X86::BI__builtin_ia32_loadaps512_mask:
+  case X86::BI__builtin_ia32_loadapd128_mask:
+  case X86::BI__builtin_ia32_loadapd256_mask:
+  case X86::BI__builtin_ia32_loadapd512_mask:
+  case X86::BI__builtin_ia32_movdqa32load128_mask:
+  case X86::BI__builtin_ia32_movdqa32load256_mask:
+  case X86::BI__builtin_ia32_movdqa32load512_mask:
+  case X86::BI__builtin_ia32_movdqa64load128_mask:
+  case X86::BI__builtin_ia32_movdqa64load256_mask:
+  case X86::BI__builtin_ia32_movdqa64load512_mask: {
+    unsigned Align =
+      getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity();
+    return EmitX86MaskedLoad(*this, Ops, Align);
+  }
+
+  case X86::BI__builtin_ia32_vbroadcastf128_pd256:
+  case X86::BI__builtin_ia32_vbroadcastf128_ps256: {
+    llvm::Type *DstTy = ConvertType(E->getType());
+    return EmitX86SubVectorBroadcast(*this, Ops, DstTy, 128, 1);
+  }
+
   case X86::BI__builtin_ia32_storehps:
   case X86::BI__builtin_ia32_storelps: {
     llvm::Type *PtrTy = llvm::PointerType::getUnqual(Int64Ty);
@@ -6395,103 +7123,50 @@
     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
   }
   case X86::BI__builtin_ia32_palignr128:
-  case X86::BI__builtin_ia32_palignr256: {
+  case X86::BI__builtin_ia32_palignr256:
+  case X86::BI__builtin_ia32_palignr128_mask:
+  case X86::BI__builtin_ia32_palignr256_mask:
+  case X86::BI__builtin_ia32_palignr512_mask: {
     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
 
-    unsigned NumElts =
-      cast<llvm::VectorType>(Ops[0]->getType())->getNumElements();
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
     assert(NumElts % 16 == 0);
-    unsigned NumLanes = NumElts / 16;
-    unsigned NumLaneElts = NumElts / NumLanes;
 
     // If palignr is shifting the pair of vectors more than the size of two
     // lanes, emit zero.
-    if (ShiftVal >= (2 * NumLaneElts))
+    if (ShiftVal >= 32)
       return llvm::Constant::getNullValue(ConvertType(E->getType()));
 
     // If palignr is shifting the pair of input vectors more than one lane,
     // but less than two lanes, convert to shifting in zeroes.
-    if (ShiftVal > NumLaneElts) {
-      ShiftVal -= NumLaneElts;
+    if (ShiftVal > 16) {
+      ShiftVal -= 16;
       Ops[1] = Ops[0];
       Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
     }
 
-    uint32_t Indices[32];
+    uint32_t Indices[64];
     // 256-bit palignr operates on 128-bit lanes so we need to handle that
-    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-      for (unsigned i = 0; i != NumLaneElts; ++i) {
+    for (unsigned l = 0; l != NumElts; l += 16) {
+      for (unsigned i = 0; i != 16; ++i) {
         unsigned Idx = ShiftVal + i;
-        if (Idx >= NumLaneElts)
-          Idx += NumElts - NumLaneElts; // End of lane, switch operand.
+        if (Idx >= 16)
+          Idx += NumElts - 16; // End of lane, switch operand.
         Indices[l + i] = Idx + l;
       }
     }
 
-    Value *SV = llvm::ConstantDataVector::get(getLLVMContext(),
-                                              makeArrayRef(Indices, NumElts));
-    return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
+    Value *Align = Builder.CreateShuffleVector(Ops[1], Ops[0],
+                                               makeArrayRef(Indices, NumElts),
+                                               "palignr");
+
+    // If this isn't a masked builtin, just return the align operation.
+    if (Ops.size() == 3)
+      return Align;
+
+    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
   }
-  case X86::BI__builtin_ia32_pslldqi256: {
-    // Shift value is in bits so divide by 8.
-    unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> 3;
 
-    // If pslldq is shifting the vector more than 15 bytes, emit zero.
-    if (shiftVal >= 16)
-      return llvm::Constant::getNullValue(ConvertType(E->getType()));
-
-    uint32_t Indices[32];
-    // 256-bit pslldq operates on 128-bit lanes so we need to handle that
-    for (unsigned l = 0; l != 32; l += 16) {
-      for (unsigned i = 0; i != 16; ++i) {
-        unsigned Idx = 32 + i - shiftVal;
-        if (Idx < 32) Idx -= 16; // end of lane, switch operand.
-        Indices[l + i] = Idx + l;
-      }
-    }
-
-    llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32);
-    Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
-    Value *Zero = llvm::Constant::getNullValue(VecTy);
-
-    Value *SV = llvm::ConstantDataVector::get(getLLVMContext(), Indices);
-    SV = Builder.CreateShuffleVector(Zero, Ops[0], SV, "pslldq");
-    llvm::Type *ResultType = ConvertType(E->getType());
-    return Builder.CreateBitCast(SV, ResultType, "cast");
-  }
-  case X86::BI__builtin_ia32_psrldqi256: {
-    // Shift value is in bits so divide by 8.
-    unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> 3;
-
-    // If psrldq is shifting the vector more than 15 bytes, emit zero.
-    if (shiftVal >= 16)
-      return llvm::Constant::getNullValue(ConvertType(E->getType()));
-
-    uint32_t Indices[32];
-    // 256-bit psrldq operates on 128-bit lanes so we need to handle that
-    for (unsigned l = 0; l != 32; l += 16) {
-      for (unsigned i = 0; i != 16; ++i) {
-        unsigned Idx = i + shiftVal;
-        if (Idx >= 16) Idx += 16; // end of lane, switch operand.
-        Indices[l + i] = Idx + l;
-      }
-    }
-
-    llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32);
-    Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
-    Value *Zero = llvm::Constant::getNullValue(VecTy);
-
-    Value *SV = llvm::ConstantDataVector::get(getLLVMContext(), Indices);
-    SV = Builder.CreateShuffleVector(Ops[0], Zero, SV, "psrldq");
-    llvm::Type *ResultType = ConvertType(E->getType());
-    return Builder.CreateBitCast(SV, ResultType, "cast");
-  }
-  case X86::BI__builtin_ia32_movntps:
-  case X86::BI__builtin_ia32_movntps256:
-  case X86::BI__builtin_ia32_movntpd:
-  case X86::BI__builtin_ia32_movntpd256:
-  case X86::BI__builtin_ia32_movntdq:
-  case X86::BI__builtin_ia32_movntdq256:
   case X86::BI__builtin_ia32_movnti:
   case X86::BI__builtin_ia32_movnti64: {
     llvm::MDNode *Node = llvm::MDNode::get(
@@ -6504,17 +7179,156 @@
     StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
     SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
 
-    // If the operand is an integer, we can't assume alignment. Otherwise,
-    // assume natural alignment.
-    QualType ArgTy = E->getArg(1)->getType();
-    unsigned Align;
-    if (ArgTy->isIntegerType())
-      Align = 1;
-    else
-      Align = getContext().getTypeSizeInChars(ArgTy).getQuantity();
-    SI->setAlignment(Align);
+    // No alignment for scalar intrinsic store.
+    SI->setAlignment(1);
     return SI;
   }
+  case X86::BI__builtin_ia32_movntsd:
+  case X86::BI__builtin_ia32_movntss: {
+    llvm::MDNode *Node = llvm::MDNode::get(
+        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+    // Extract the 0'th element of the source vector.
+    Value *Scl = Builder.CreateExtractElement(Ops[1], (uint64_t)0, "extract");
+
+    // Convert the type of the pointer to a pointer to the stored type.
+    Value *BC = Builder.CreateBitCast(Ops[0],
+                                llvm::PointerType::getUnqual(Scl->getType()),
+                                      "cast");
+
+    // Unaligned nontemporal store of the scalar value.
+    StoreInst *SI = Builder.CreateDefaultAlignedStore(Scl, BC);
+    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
+    SI->setAlignment(1);
+    return SI;
+  }
+
+  case X86::BI__builtin_ia32_selectb_128:
+  case X86::BI__builtin_ia32_selectb_256:
+  case X86::BI__builtin_ia32_selectb_512:
+  case X86::BI__builtin_ia32_selectw_128:
+  case X86::BI__builtin_ia32_selectw_256:
+  case X86::BI__builtin_ia32_selectw_512:
+  case X86::BI__builtin_ia32_selectd_128:
+  case X86::BI__builtin_ia32_selectd_256:
+  case X86::BI__builtin_ia32_selectd_512:
+  case X86::BI__builtin_ia32_selectq_128:
+  case X86::BI__builtin_ia32_selectq_256:
+  case X86::BI__builtin_ia32_selectq_512:
+  case X86::BI__builtin_ia32_selectps_128:
+  case X86::BI__builtin_ia32_selectps_256:
+  case X86::BI__builtin_ia32_selectps_512:
+  case X86::BI__builtin_ia32_selectpd_128:
+  case X86::BI__builtin_ia32_selectpd_256:
+  case X86::BI__builtin_ia32_selectpd_512:
+    return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
+  case X86::BI__builtin_ia32_pcmpeqb128_mask:
+  case X86::BI__builtin_ia32_pcmpeqb256_mask:
+  case X86::BI__builtin_ia32_pcmpeqb512_mask:
+  case X86::BI__builtin_ia32_pcmpeqw128_mask:
+  case X86::BI__builtin_ia32_pcmpeqw256_mask:
+  case X86::BI__builtin_ia32_pcmpeqw512_mask:
+  case X86::BI__builtin_ia32_pcmpeqd128_mask:
+  case X86::BI__builtin_ia32_pcmpeqd256_mask:
+  case X86::BI__builtin_ia32_pcmpeqd512_mask:
+  case X86::BI__builtin_ia32_pcmpeqq128_mask:
+  case X86::BI__builtin_ia32_pcmpeqq256_mask:
+  case X86::BI__builtin_ia32_pcmpeqq512_mask:
+    return EmitX86MaskedCompare(*this, 0, false, Ops);
+  case X86::BI__builtin_ia32_pcmpgtb128_mask:
+  case X86::BI__builtin_ia32_pcmpgtb256_mask:
+  case X86::BI__builtin_ia32_pcmpgtb512_mask:
+  case X86::BI__builtin_ia32_pcmpgtw128_mask:
+  case X86::BI__builtin_ia32_pcmpgtw256_mask:
+  case X86::BI__builtin_ia32_pcmpgtw512_mask:
+  case X86::BI__builtin_ia32_pcmpgtd128_mask:
+  case X86::BI__builtin_ia32_pcmpgtd256_mask:
+  case X86::BI__builtin_ia32_pcmpgtd512_mask:
+  case X86::BI__builtin_ia32_pcmpgtq128_mask:
+  case X86::BI__builtin_ia32_pcmpgtq256_mask:
+  case X86::BI__builtin_ia32_pcmpgtq512_mask:
+    return EmitX86MaskedCompare(*this, 6, true, Ops);
+  case X86::BI__builtin_ia32_cmpb128_mask:
+  case X86::BI__builtin_ia32_cmpb256_mask:
+  case X86::BI__builtin_ia32_cmpb512_mask:
+  case X86::BI__builtin_ia32_cmpw128_mask:
+  case X86::BI__builtin_ia32_cmpw256_mask:
+  case X86::BI__builtin_ia32_cmpw512_mask:
+  case X86::BI__builtin_ia32_cmpd128_mask:
+  case X86::BI__builtin_ia32_cmpd256_mask:
+  case X86::BI__builtin_ia32_cmpd512_mask:
+  case X86::BI__builtin_ia32_cmpq128_mask:
+  case X86::BI__builtin_ia32_cmpq256_mask:
+  case X86::BI__builtin_ia32_cmpq512_mask: {
+    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
+    return EmitX86MaskedCompare(*this, CC, true, Ops);
+  }
+  case X86::BI__builtin_ia32_ucmpb128_mask:
+  case X86::BI__builtin_ia32_ucmpb256_mask:
+  case X86::BI__builtin_ia32_ucmpb512_mask:
+  case X86::BI__builtin_ia32_ucmpw128_mask:
+  case X86::BI__builtin_ia32_ucmpw256_mask:
+  case X86::BI__builtin_ia32_ucmpw512_mask:
+  case X86::BI__builtin_ia32_ucmpd128_mask:
+  case X86::BI__builtin_ia32_ucmpd256_mask:
+  case X86::BI__builtin_ia32_ucmpd512_mask:
+  case X86::BI__builtin_ia32_ucmpq128_mask:
+  case X86::BI__builtin_ia32_ucmpq256_mask:
+  case X86::BI__builtin_ia32_ucmpq512_mask: {
+    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
+    return EmitX86MaskedCompare(*this, CC, false, Ops);
+  }
+
+  case X86::BI__builtin_ia32_vplzcntd_128_mask:
+  case X86::BI__builtin_ia32_vplzcntd_256_mask:
+  case X86::BI__builtin_ia32_vplzcntd_512_mask:
+  case X86::BI__builtin_ia32_vplzcntq_128_mask:
+  case X86::BI__builtin_ia32_vplzcntq_256_mask:
+  case X86::BI__builtin_ia32_vplzcntq_512_mask: {
+    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
+    return EmitX86Select(*this, Ops[2],
+                         Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}),
+                         Ops[1]);
+  }
+
+  // TODO: Handle 64/512-bit vector widths of min/max.
+  case X86::BI__builtin_ia32_pmaxsb128:
+  case X86::BI__builtin_ia32_pmaxsw128:
+  case X86::BI__builtin_ia32_pmaxsd128:
+  case X86::BI__builtin_ia32_pmaxsb256:
+  case X86::BI__builtin_ia32_pmaxsw256:
+  case X86::BI__builtin_ia32_pmaxsd256: {
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Ops[1]);
+    return Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
+  }
+  case X86::BI__builtin_ia32_pmaxub128:
+  case X86::BI__builtin_ia32_pmaxuw128:
+  case X86::BI__builtin_ia32_pmaxud128:
+  case X86::BI__builtin_ia32_pmaxub256:
+  case X86::BI__builtin_ia32_pmaxuw256:
+  case X86::BI__builtin_ia32_pmaxud256: {
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Ops[1]);
+    return Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
+  }
+  case X86::BI__builtin_ia32_pminsb128:
+  case X86::BI__builtin_ia32_pminsw128:
+  case X86::BI__builtin_ia32_pminsd128:
+  case X86::BI__builtin_ia32_pminsb256:
+  case X86::BI__builtin_ia32_pminsw256:
+  case X86::BI__builtin_ia32_pminsd256: {
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SLT, Ops[0], Ops[1]);
+    return Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
+  }
+  case X86::BI__builtin_ia32_pminub128:
+  case X86::BI__builtin_ia32_pminuw128:
+  case X86::BI__builtin_ia32_pminud128:
+  case X86::BI__builtin_ia32_pminub256:
+  case X86::BI__builtin_ia32_pminuw256:
+  case X86::BI__builtin_ia32_pminud256: {
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, Ops[0], Ops[1]);
+    return Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
+  }
+
   // 3DNow!
   case X86::BI__builtin_ia32_pswapdsf:
   case X86::BI__builtin_ia32_pswapdsi: {
@@ -6557,154 +7371,107 @@
                                       Ops[0]);
     return Builder.CreateExtractValue(Call, 1);
   }
-  // SSE comparison intrisics
-  case X86::BI__builtin_ia32_cmpeqps:
-  case X86::BI__builtin_ia32_cmpltps:
-  case X86::BI__builtin_ia32_cmpleps:
-  case X86::BI__builtin_ia32_cmpunordps:
-  case X86::BI__builtin_ia32_cmpneqps:
-  case X86::BI__builtin_ia32_cmpnltps:
-  case X86::BI__builtin_ia32_cmpnleps:
-  case X86::BI__builtin_ia32_cmpordps:
-  case X86::BI__builtin_ia32_cmpeqss:
-  case X86::BI__builtin_ia32_cmpltss:
-  case X86::BI__builtin_ia32_cmpless:
-  case X86::BI__builtin_ia32_cmpunordss:
-  case X86::BI__builtin_ia32_cmpneqss:
-  case X86::BI__builtin_ia32_cmpnltss:
-  case X86::BI__builtin_ia32_cmpnless:
-  case X86::BI__builtin_ia32_cmpordss:
-  case X86::BI__builtin_ia32_cmpeqpd:
-  case X86::BI__builtin_ia32_cmpltpd:
-  case X86::BI__builtin_ia32_cmplepd:
-  case X86::BI__builtin_ia32_cmpunordpd:
-  case X86::BI__builtin_ia32_cmpneqpd:
-  case X86::BI__builtin_ia32_cmpnltpd:
-  case X86::BI__builtin_ia32_cmpnlepd:
-  case X86::BI__builtin_ia32_cmpordpd:
-  case X86::BI__builtin_ia32_cmpeqsd:
-  case X86::BI__builtin_ia32_cmpltsd:
-  case X86::BI__builtin_ia32_cmplesd:
-  case X86::BI__builtin_ia32_cmpunordsd:
-  case X86::BI__builtin_ia32_cmpneqsd:
-  case X86::BI__builtin_ia32_cmpnltsd:
-  case X86::BI__builtin_ia32_cmpnlesd:
-  case X86::BI__builtin_ia32_cmpordsd:
-    // These exist so that the builtin that takes an immediate can be bounds
-    // checked by clang to avoid passing bad immediates to the backend. Since
-    // AVX has a larger immediate than SSE we would need separate builtins to
-    // do the different bounds checking. Rather than create a clang specific
-    // SSE only builtin, this implements eight separate builtins to match gcc
-    // implementation.
 
-    // Choose the immediate.
-    unsigned Imm;
-    switch (BuiltinID) {
-    default: llvm_unreachable("Unsupported intrinsic!");
-    case X86::BI__builtin_ia32_cmpeqps:
-    case X86::BI__builtin_ia32_cmpeqss:
-    case X86::BI__builtin_ia32_cmpeqpd:
-    case X86::BI__builtin_ia32_cmpeqsd:
-      Imm = 0;
-      break;
-    case X86::BI__builtin_ia32_cmpltps:
-    case X86::BI__builtin_ia32_cmpltss:
-    case X86::BI__builtin_ia32_cmpltpd:
-    case X86::BI__builtin_ia32_cmpltsd:
-      Imm = 1;
-      break;
-    case X86::BI__builtin_ia32_cmpleps:
-    case X86::BI__builtin_ia32_cmpless:
-    case X86::BI__builtin_ia32_cmplepd:
-    case X86::BI__builtin_ia32_cmplesd:
-      Imm = 2;
-      break;
-    case X86::BI__builtin_ia32_cmpunordps:
-    case X86::BI__builtin_ia32_cmpunordss:
-    case X86::BI__builtin_ia32_cmpunordpd:
-    case X86::BI__builtin_ia32_cmpunordsd:
-      Imm = 3;
-      break;
-    case X86::BI__builtin_ia32_cmpneqps:
-    case X86::BI__builtin_ia32_cmpneqss:
-    case X86::BI__builtin_ia32_cmpneqpd:
-    case X86::BI__builtin_ia32_cmpneqsd:
-      Imm = 4;
-      break;
-    case X86::BI__builtin_ia32_cmpnltps:
-    case X86::BI__builtin_ia32_cmpnltss:
-    case X86::BI__builtin_ia32_cmpnltpd:
-    case X86::BI__builtin_ia32_cmpnltsd:
-      Imm = 5;
-      break;
-    case X86::BI__builtin_ia32_cmpnleps:
-    case X86::BI__builtin_ia32_cmpnless:
-    case X86::BI__builtin_ia32_cmpnlepd:
-    case X86::BI__builtin_ia32_cmpnlesd:
-      Imm = 6;
-      break;
-    case X86::BI__builtin_ia32_cmpordps:
-    case X86::BI__builtin_ia32_cmpordss:
-    case X86::BI__builtin_ia32_cmpordpd:
-    case X86::BI__builtin_ia32_cmpordsd:
-      Imm = 7;
-      break;
+  // SSE packed comparison intrinsics
+  case X86::BI__builtin_ia32_cmpeqps:
+  case X86::BI__builtin_ia32_cmpeqpd:
+    return getVectorFCmpIR(CmpInst::FCMP_OEQ);
+  case X86::BI__builtin_ia32_cmpltps:
+  case X86::BI__builtin_ia32_cmpltpd:
+    return getVectorFCmpIR(CmpInst::FCMP_OLT);
+  case X86::BI__builtin_ia32_cmpleps:
+  case X86::BI__builtin_ia32_cmplepd:
+    return getVectorFCmpIR(CmpInst::FCMP_OLE);
+  case X86::BI__builtin_ia32_cmpunordps:
+  case X86::BI__builtin_ia32_cmpunordpd:
+    return getVectorFCmpIR(CmpInst::FCMP_UNO);
+  case X86::BI__builtin_ia32_cmpneqps:
+  case X86::BI__builtin_ia32_cmpneqpd:
+    return getVectorFCmpIR(CmpInst::FCMP_UNE);
+  case X86::BI__builtin_ia32_cmpnltps:
+  case X86::BI__builtin_ia32_cmpnltpd:
+    return getVectorFCmpIR(CmpInst::FCMP_UGE);
+  case X86::BI__builtin_ia32_cmpnleps:
+  case X86::BI__builtin_ia32_cmpnlepd:
+    return getVectorFCmpIR(CmpInst::FCMP_UGT);
+  case X86::BI__builtin_ia32_cmpordps:
+  case X86::BI__builtin_ia32_cmpordpd:
+    return getVectorFCmpIR(CmpInst::FCMP_ORD);
+  case X86::BI__builtin_ia32_cmpps:
+  case X86::BI__builtin_ia32_cmpps256:
+  case X86::BI__builtin_ia32_cmppd:
+  case X86::BI__builtin_ia32_cmppd256: {
+    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    // If this is one of the SSE immediates, we can use native IR.
+    if (CC < 8) {
+      FCmpInst::Predicate Pred;
+      switch (CC) {
+      case 0: Pred = FCmpInst::FCMP_OEQ; break;
+      case 1: Pred = FCmpInst::FCMP_OLT; break;
+      case 2: Pred = FCmpInst::FCMP_OLE; break;
+      case 3: Pred = FCmpInst::FCMP_UNO; break;
+      case 4: Pred = FCmpInst::FCMP_UNE; break;
+      case 5: Pred = FCmpInst::FCMP_UGE; break;
+      case 6: Pred = FCmpInst::FCMP_UGT; break;
+      case 7: Pred = FCmpInst::FCMP_ORD; break;
+      }
+      return getVectorFCmpIR(Pred);
     }
 
-    // Choose the intrinsic ID.
-    const char *name;
+    // We can't handle 8-31 immediates with native IR, use the intrinsic.
     Intrinsic::ID ID;
     switch (BuiltinID) {
     default: llvm_unreachable("Unsupported intrinsic!");
-    case X86::BI__builtin_ia32_cmpeqps:
-    case X86::BI__builtin_ia32_cmpltps:
-    case X86::BI__builtin_ia32_cmpleps:
-    case X86::BI__builtin_ia32_cmpunordps:
-    case X86::BI__builtin_ia32_cmpneqps:
-    case X86::BI__builtin_ia32_cmpnltps:
-    case X86::BI__builtin_ia32_cmpnleps:
-    case X86::BI__builtin_ia32_cmpordps:
-      name = "cmpps";
+    case X86::BI__builtin_ia32_cmpps:
       ID = Intrinsic::x86_sse_cmp_ps;
       break;
-    case X86::BI__builtin_ia32_cmpeqss:
-    case X86::BI__builtin_ia32_cmpltss:
-    case X86::BI__builtin_ia32_cmpless:
-    case X86::BI__builtin_ia32_cmpunordss:
-    case X86::BI__builtin_ia32_cmpneqss:
-    case X86::BI__builtin_ia32_cmpnltss:
-    case X86::BI__builtin_ia32_cmpnless:
-    case X86::BI__builtin_ia32_cmpordss:
-      name = "cmpss";
-      ID = Intrinsic::x86_sse_cmp_ss;
+    case X86::BI__builtin_ia32_cmpps256:
+      ID = Intrinsic::x86_avx_cmp_ps_256;
       break;
-    case X86::BI__builtin_ia32_cmpeqpd:
-    case X86::BI__builtin_ia32_cmpltpd:
-    case X86::BI__builtin_ia32_cmplepd:
-    case X86::BI__builtin_ia32_cmpunordpd:
-    case X86::BI__builtin_ia32_cmpneqpd:
-    case X86::BI__builtin_ia32_cmpnltpd:
-    case X86::BI__builtin_ia32_cmpnlepd:
-    case X86::BI__builtin_ia32_cmpordpd:
-      name = "cmppd";
+    case X86::BI__builtin_ia32_cmppd:
       ID = Intrinsic::x86_sse2_cmp_pd;
       break;
-    case X86::BI__builtin_ia32_cmpeqsd:
-    case X86::BI__builtin_ia32_cmpltsd:
-    case X86::BI__builtin_ia32_cmplesd:
-    case X86::BI__builtin_ia32_cmpunordsd:
-    case X86::BI__builtin_ia32_cmpneqsd:
-    case X86::BI__builtin_ia32_cmpnltsd:
-    case X86::BI__builtin_ia32_cmpnlesd:
-    case X86::BI__builtin_ia32_cmpordsd:
-      name = "cmpsd";
-      ID = Intrinsic::x86_sse2_cmp_sd;
+    case X86::BI__builtin_ia32_cmppd256:
+      ID = Intrinsic::x86_avx_cmp_pd_256;
       break;
     }
 
-    Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
-    llvm::Function *F = CGM.getIntrinsic(ID);
-    return Builder.CreateCall(F, Ops, name);
+    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
+  }
+
+  // SSE scalar comparison intrinsics
+  case X86::BI__builtin_ia32_cmpeqss:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
+  case X86::BI__builtin_ia32_cmpltss:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
+  case X86::BI__builtin_ia32_cmpless:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
+  case X86::BI__builtin_ia32_cmpunordss:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
+  case X86::BI__builtin_ia32_cmpneqss:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
+  case X86::BI__builtin_ia32_cmpnltss:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
+  case X86::BI__builtin_ia32_cmpnless:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
+  case X86::BI__builtin_ia32_cmpordss:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
+  case X86::BI__builtin_ia32_cmpeqsd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
+  case X86::BI__builtin_ia32_cmpltsd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
+  case X86::BI__builtin_ia32_cmplesd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
+  case X86::BI__builtin_ia32_cmpunordsd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
+  case X86::BI__builtin_ia32_cmpneqsd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
+  case X86::BI__builtin_ia32_cmpnltsd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
+  case X86::BI__builtin_ia32_cmpnlesd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
+  case X86::BI__builtin_ia32_cmpordsd:
+    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
   }
 }
 
@@ -6877,6 +7644,16 @@
     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
     return Builder.CreateCall(F, X);
   }
+
+  // Absolute value
+  case PPC::BI__builtin_vsx_xvabsdp:
+  case PPC::BI__builtin_vsx_xvabssp: {
+    llvm::Type *ResultType = ConvertType(E->getType());
+    Value *X = EmitScalarExpr(E->getArg(0));
+    llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
+    return Builder.CreateCall(F, X);
+  }
+
   // FMA variations
   case PPC::BI__builtin_vsx_xvmaddadp:
   case PPC::BI__builtin_vsx_xvmaddasp:
@@ -6916,44 +7693,11 @@
   }
 }
 
-// Emit an intrinsic that has 1 float or double.
-static Value *emitUnaryFPBuiltin(CodeGenFunction &CGF,
-                                 const CallExpr *E,
-                                 unsigned IntrinsicID) {
-  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
-
-  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
-  return CGF.Builder.CreateCall(F, Src0);
-}
-
-// Emit an intrinsic that has 3 float or double operands.
-static Value *emitTernaryFPBuiltin(CodeGenFunction &CGF,
-                                   const CallExpr *E,
-                                   unsigned IntrinsicID) {
-  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
-  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
-  llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
-
-  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
-  return CGF.Builder.CreateCall(F, {Src0, Src1, Src2});
-}
-
-// Emit an intrinsic that has 1 float or double operand, and 1 integer.
-static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
-                               const CallExpr *E,
-                               unsigned IntrinsicID) {
-  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
-  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
-
-  Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
-  return CGF.Builder.CreateCall(F, {Src0, Src1});
-}
-
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   switch (BuiltinID) {
-  case AMDGPU::BI__builtin_amdgpu_div_scale:
-  case AMDGPU::BI__builtin_amdgpu_div_scalef: {
+  case AMDGPU::BI__builtin_amdgcn_div_scale:
+  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
     // Translate from the intrinsics's struct return to the builtin's out
     // argument.
 
@@ -6963,7 +7707,7 @@
     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
     llvm::Value *Z = EmitScalarExpr(E->getArg(2));
 
-    llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::AMDGPU_div_scale,
+    llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
                                            X->getType());
 
     llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
@@ -6978,40 +7722,93 @@
     Builder.CreateStore(FlagExt, FlagOutPtr);
     return Result;
   }
-  case AMDGPU::BI__builtin_amdgpu_div_fmas:
-  case AMDGPU::BI__builtin_amdgpu_div_fmasf: {
+  case AMDGPU::BI__builtin_amdgcn_div_fmas:
+  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
 
-    llvm::Value *F = CGM.getIntrinsic(Intrinsic::AMDGPU_div_fmas,
+    llvm::Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
                                       Src0->getType());
     llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
     return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
   }
-  case AMDGPU::BI__builtin_amdgpu_div_fixup:
-  case AMDGPU::BI__builtin_amdgpu_div_fixupf:
-    return emitTernaryFPBuiltin(*this, E, Intrinsic::AMDGPU_div_fixup);
-  case AMDGPU::BI__builtin_amdgpu_trig_preop:
-  case AMDGPU::BI__builtin_amdgpu_trig_preopf:
-    return emitFPIntBuiltin(*this, E, Intrinsic::AMDGPU_trig_preop);
-  case AMDGPU::BI__builtin_amdgpu_rcp:
-  case AMDGPU::BI__builtin_amdgpu_rcpf:
-    return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rcp);
-  case AMDGPU::BI__builtin_amdgpu_rsq:
-  case AMDGPU::BI__builtin_amdgpu_rsqf:
-    return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rsq);
-  case AMDGPU::BI__builtin_amdgpu_rsq_clamped:
-  case AMDGPU::BI__builtin_amdgpu_rsq_clampedf:
-    return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rsq_clamped);
-  case AMDGPU::BI__builtin_amdgpu_ldexp:
-  case AMDGPU::BI__builtin_amdgpu_ldexpf:
-    return emitFPIntBuiltin(*this, E, Intrinsic::AMDGPU_ldexp);
-  case AMDGPU::BI__builtin_amdgpu_class:
-  case AMDGPU::BI__builtin_amdgpu_classf:
-    return emitFPIntBuiltin(*this, E, Intrinsic::AMDGPU_class);
-   default:
+  case AMDGPU::BI__builtin_amdgcn_div_fixup:
+  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
+    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
+  case AMDGPU::BI__builtin_amdgcn_trig_preop:
+  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
+    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
+  case AMDGPU::BI__builtin_amdgcn_rcp:
+  case AMDGPU::BI__builtin_amdgcn_rcpf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
+  case AMDGPU::BI__builtin_amdgcn_rsq:
+  case AMDGPU::BI__builtin_amdgcn_rsqf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
+  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
+  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
+  case AMDGPU::BI__builtin_amdgcn_sinf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
+  case AMDGPU::BI__builtin_amdgcn_cosf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
+  case AMDGPU::BI__builtin_amdgcn_log_clampf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
+  case AMDGPU::BI__builtin_amdgcn_ldexp:
+  case AMDGPU::BI__builtin_amdgcn_ldexpf:
+    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);
+  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
+  case AMDGPU::BI__builtin_amdgcn_frexp_mantf: {
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
+  }
+  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
+  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_exp);
+  }
+  case AMDGPU::BI__builtin_amdgcn_fract:
+  case AMDGPU::BI__builtin_amdgcn_fractf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
+  case AMDGPU::BI__builtin_amdgcn_lerp:
+    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
+  case AMDGPU::BI__builtin_amdgcn_uicmp:
+  case AMDGPU::BI__builtin_amdgcn_uicmpl:
+  case AMDGPU::BI__builtin_amdgcn_sicmp:
+  case AMDGPU::BI__builtin_amdgcn_sicmpl:
+    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_icmp);
+  case AMDGPU::BI__builtin_amdgcn_fcmp:
+  case AMDGPU::BI__builtin_amdgcn_fcmpf:
+    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fcmp);
+  case AMDGPU::BI__builtin_amdgcn_class:
+  case AMDGPU::BI__builtin_amdgcn_classf:
+    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
+
+  case AMDGPU::BI__builtin_amdgcn_read_exec: {
+    CallInst *CI = cast<CallInst>(
+      EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, true, "exec"));
+    CI->setConvergent();
+    return CI;
+  }
+
+  // amdgcn workitem
+  case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
+    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
+  case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
+    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
+  case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
+    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
+
+  // r600 intrinsics
+  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
+  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
+    return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
+  case AMDGPU::BI__builtin_r600_read_tidig_x:
+    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
+  case AMDGPU::BI__builtin_r600_read_tidig_y:
+    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
+  case AMDGPU::BI__builtin_r600_read_tidig_z:
+    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
+  default:
     return nullptr;
   }
 }
@@ -7261,6 +8058,17 @@
 
 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
+  auto MakeLdg = [&](unsigned IntrinsicID) {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    AlignmentSource AlignSource;
+    clang::CharUnits Align =
+        getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource);
+    return Builder.CreateCall(
+        CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
+                                       Ptr->getType()}),
+        {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
+  };
+
   switch (BuiltinID) {
   case NVPTX::BI__nvvm_atom_add_gen_i:
   case NVPTX::BI__nvvm_atom_add_gen_l:
@@ -7329,6 +8137,56 @@
     return Builder.CreateCall(FnALAF32, {Ptr, Val});
   }
 
+  case NVPTX::BI__nvvm_atom_inc_gen_ui: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Value *Val = EmitScalarExpr(E->getArg(1));
+    Value *FnALI32 =
+        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
+    return Builder.CreateCall(FnALI32, {Ptr, Val});
+  }
+
+  case NVPTX::BI__nvvm_atom_dec_gen_ui: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Value *Val = EmitScalarExpr(E->getArg(1));
+    Value *FnALD32 =
+        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
+    return Builder.CreateCall(FnALD32, {Ptr, Val});
+  }
+
+  case NVPTX::BI__nvvm_ldg_c:
+  case NVPTX::BI__nvvm_ldg_c2:
+  case NVPTX::BI__nvvm_ldg_c4:
+  case NVPTX::BI__nvvm_ldg_s:
+  case NVPTX::BI__nvvm_ldg_s2:
+  case NVPTX::BI__nvvm_ldg_s4:
+  case NVPTX::BI__nvvm_ldg_i:
+  case NVPTX::BI__nvvm_ldg_i2:
+  case NVPTX::BI__nvvm_ldg_i4:
+  case NVPTX::BI__nvvm_ldg_l:
+  case NVPTX::BI__nvvm_ldg_ll:
+  case NVPTX::BI__nvvm_ldg_ll2:
+  case NVPTX::BI__nvvm_ldg_uc:
+  case NVPTX::BI__nvvm_ldg_uc2:
+  case NVPTX::BI__nvvm_ldg_uc4:
+  case NVPTX::BI__nvvm_ldg_us:
+  case NVPTX::BI__nvvm_ldg_us2:
+  case NVPTX::BI__nvvm_ldg_us4:
+  case NVPTX::BI__nvvm_ldg_ui:
+  case NVPTX::BI__nvvm_ldg_ui2:
+  case NVPTX::BI__nvvm_ldg_ui4:
+  case NVPTX::BI__nvvm_ldg_ul:
+  case NVPTX::BI__nvvm_ldg_ull:
+  case NVPTX::BI__nvvm_ldg_ull2:
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_f2:
+  case NVPTX::BI__nvvm_ldg_f4:
+  case NVPTX::BI__nvvm_ldg_d:
+  case NVPTX::BI__nvvm_ldg_d2:
+    return MakeLdg(Intrinsic::nvvm_ldg_global_f);
   default:
     return nullptr;
   }
@@ -7337,9 +8195,9 @@
 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
                                                    const CallExpr *E) {
   switch (BuiltinID) {
-  case WebAssembly::BI__builtin_wasm_memory_size: {
+  case WebAssembly::BI__builtin_wasm_current_memory: {
     llvm::Type *ResultType = ConvertType(E->getType());
-    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
+    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType);
     return Builder.CreateCall(Callee);
   }
   case WebAssembly::BI__builtin_wasm_grow_memory: {
diff --git a/lib/CodeGen/CGCUDABuiltin.cpp b/lib/CodeGen/CGCUDABuiltin.cpp
new file mode 100644
index 0000000..44dd003
--- /dev/null
+++ b/lib/CodeGen/CGCUDABuiltin.cpp
@@ -0,0 +1,123 @@
+//===----- CGCUDABuiltin.cpp - Codegen for CUDA builtins ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Generates code for built-in CUDA calls which are not runtime-specific.
+// (Runtime-specific codegen lives in CGCUDARuntime.)
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+  llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
+                            llvm::Type::getInt8PtrTy(M.getContext())};
+  llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
+      llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
+
+  if (auto* F = M.getFunction("vprintf")) {
+    // Our CUDA system header declares vprintf with the right signature, so
+    // nobody else should have been able to declare vprintf with a bogus
+    // signature.
+    assert(F->getFunctionType() == VprintfFuncType);
+    return F;
+  }
+
+  // vprintf doesn't already exist; create a declaration and insert it into the
+  // module.
+  return llvm::Function::Create(
+      VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
+}
+
+// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
+// isn't particularly special; it's invoked just like a regular function).
+// vprintf takes two args: A format string, and a pointer to a buffer containing
+// the varargs.
+//
+// For example, the call
+//
+//   printf("format string", arg1, arg2, arg3);
+//
+// is converted into something resembling
+//
+//   struct Tmp {
+//     Arg1 a1;
+//     Arg2 a2;
+//     Arg3 a3;
+//   };
+//   char* buf = alloca(sizeof(Tmp));
+//   *(Tmp*)buf = {a1, a2, a3};
+//   vprintf("format string", buf);
+//
+// buf is aligned to the max of {alignof(Arg1), ...}.  Furthermore, each of the
+// args is itself aligned to its preferred alignment.
+//
+// Note that by the time this function runs, E's args have already undergone the
+// standard C vararg promotion (short -> int, float -> double, etc.).
+RValue
+CodeGenFunction::EmitCUDADevicePrintfCallExpr(const CallExpr *E,
+                                              ReturnValueSlot ReturnValue) {
+  assert(getLangOpts().CUDA);
+  assert(getLangOpts().CUDAIsDevice);
+  assert(E->getBuiltinCallee() == Builtin::BIprintf);
+  assert(E->getNumArgs() >= 1); // printf always has at least one arg.
+
+  const llvm::DataLayout &DL = CGM.getDataLayout();
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  CallArgList Args;
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // We don't know how to emit non-scalar varargs.
+  if (std::any_of(Args.begin() + 1, Args.end(),
+                  [](const CallArg &A) { return !A.RV.isScalar(); })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg to printf");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+  }
+
+  // Construct and fill the args buffer that we'll pass to vprintf.
+  llvm::Value *BufferPtr;
+  if (Args.size() <= 1) {
+    // If there are no args, pass a null pointer to vprintf.
+    BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
+  } else {
+    llvm::SmallVector<llvm::Type *, 8> ArgTypes;
+    for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
+      ArgTypes.push_back(Args[I].RV.getScalarVal()->getType());
+
+    // Using llvm::StructType is correct only because printf doesn't accept
+    // aggregates.  If we had to handle aggregates here, we'd have to manually
+    // compute the offsets within the alloca -- we wouldn't be able to assume
+    // that the alignment of the llvm type was the same as the alignment of the
+    // clang type.
+    llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args");
+    llvm::Value *Alloca = CreateTempAlloca(AllocaTy);
+
+    for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
+      llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1);
+      llvm::Value *Arg = Args[I].RV.getScalarVal();
+      Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType()));
+    }
+    BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
+  }
+
+  // Invoke vprintf and return.
+  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  return RValue::get(
+      Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr}));
+}
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 9dd7928..dacc53b 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -38,6 +38,7 @@
   llvm::Module &TheModule;
   /// Keeps track of kernel launch stubs emitted in this module
   llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+  llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
   /// Keeps track of variables containing handles of GPU binaries. Populated by
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
@@ -47,17 +48,25 @@
   llvm::Constant *getLaunchFn() const;
 
   /// Creates a function to register all kernel stubs generated in this module.
-  llvm::Function *makeRegisterKernelsFn();
+  llvm::Function *makeRegisterGlobalsFn();
 
   /// Helper function that generates a constant string and returns a pointer to
   /// the start of the string.  The result of this function can be used anywhere
   /// where the C code specifies const char*.
   llvm::Constant *makeConstantString(const std::string &Str,
                                      const std::string &Name = "",
+                                     const std::string &SectionName = "",
                                      unsigned Alignment = 0) {
     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                                llvm::ConstantInt::get(SizeTy, 0)};
     auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
+    llvm::GlobalVariable *GV =
+        cast<llvm::GlobalVariable>(ConstStr.getPointer());
+    if (!SectionName.empty())
+      GV->setSection(SectionName);
+    if (Alignment)
+      GV->setAlignment(Alignment);
+
     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                 ConstStr.getPointer(), Zeros);
  }
@@ -68,6 +77,10 @@
   CGNVCUDARuntime(CodeGenModule &CGM);
 
   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
+  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
+    DeviceVars.push_back(std::make_pair(&Var, Flags));
+  }
+
   /// Creates module constructor function
   llvm::Function *makeModuleCtorFunction() override;
   /// Creates module destructor function
@@ -93,10 +106,7 @@
 
 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
-  std::vector<llvm::Type*> Params;
-  Params.push_back(VoidPtrTy);
-  Params.push_back(SizeTy);
-  Params.push_back(SizeTy);
+  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
   return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
                                                            Params, false),
                                    "cudaSetupArgument");
@@ -116,37 +126,28 @@
 
 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
                                          FunctionArgList &Args) {
-  // Build the argument value list and the argument stack struct type.
-  SmallVector<llvm::Value *, 16> ArgValues;
-  std::vector<llvm::Type *> ArgTypes;
-  for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end();
-       I != E; ++I) {
-    llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer();
-    ArgValues.push_back(V);
-    assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
-    ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
-  }
-  llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes);
-
-  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
-
-  // Emit the calls to cudaSetupArgument
+  // Emit a call to cudaSetupArgument for each arg in Args.
   llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
-  for (unsigned I = 0, E = Args.size(); I != E; ++I) {
-    llvm::Value *Args[3];
-    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
-    Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy);
-    Args[1] = CGF.Builder.CreateIntCast(
-        llvm::ConstantExpr::getSizeOf(ArgTypes[I]),
-        SizeTy, false);
-    Args[2] = CGF.Builder.CreateIntCast(
-        llvm::ConstantExpr::getOffsetOf(ArgStackTy, I),
-        SizeTy, false);
+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+  CharUnits Offset = CharUnits::Zero();
+  for (const VarDecl *A : Args) {
+    CharUnits TyWidth, TyAlign;
+    std::tie(TyWidth, TyAlign) =
+        CGM.getContext().getTypeInfoInChars(A->getType());
+    Offset = Offset.alignTo(TyAlign);
+    llvm::Value *Args[] = {
+        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
+                                      VoidPtrTy),
+        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
+        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
+    };
     llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
     llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
+    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
     CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
     CGF.EmitBlock(NextBlock);
+    Offset += TyWidth;
   }
 
   // Emit the call to cudaLaunch
@@ -158,19 +159,28 @@
   CGF.EmitBlock(EndBlock);
 }
 
-/// Creates internal function to register all kernel stubs generated in this
-/// module with the CUDA runtime.
+/// Creates a function that sets up state on the host side for CUDA objects that
+/// have a presence on both the host and device sides. Specifically, registers
+/// the host side of kernel functions and device global variables with the CUDA
+/// runtime.
 /// \code
-/// void __cuda_register_kernels(void** GpuBinaryHandle) {
+/// void __cuda_register_globals(void** GpuBinaryHandle) {
 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
 ///    ...
 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
+///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
+///    ...
+///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
 /// }
 /// \endcode
-llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
+  // No need to register anything
+  if (EmittedKernels.empty() && DeviceVars.empty())
+    return nullptr;
+
   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
+      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
   llvm::BasicBlock *EntryBB =
       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
   CGBuilderTy Builder(CGM, Context);
@@ -178,7 +188,7 @@
 
   // void __cudaRegisterFunction(void **, const char *, char *, const char *,
   //                             int, uint3*, uint3*, dim3*, dim3*, int*)
-  std::vector<llvm::Type *> RegisterFuncParams = {
+  llvm::Type *RegisterFuncParams[] = {
       VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
       VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
   llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
@@ -186,18 +196,44 @@
       "__cudaRegisterFunction");
 
   // Extract GpuBinaryHandle passed as the first argument passed to
-  // __cuda_register_kernels() and generate __cudaRegisterFunction() call for
+  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
   // each emitted kernel.
   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
   for (llvm::Function *Kernel : EmittedKernels) {
     llvm::Constant *KernelName = makeConstantString(Kernel->getName());
     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
-    llvm::Value *args[] = {
+    llvm::Value *Args[] = {
         &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
         KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
         NullPtr, NullPtr, NullPtr,
         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
-    Builder.CreateCall(RegisterFunc, args);
+    Builder.CreateCall(RegisterFunc, Args);
+  }
+
+  // void __cudaRegisterVar(void **, char *, char *, const char *,
+  //                        int, int, int, int)
+  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
+                                     CharPtrTy,    IntTy,     IntTy,
+                                     IntTy,        IntTy};
+  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
+      "__cudaRegisterVar");
+  for (auto &Pair : DeviceVars) {
+    llvm::GlobalVariable *Var = Pair.first;
+    unsigned Flags = Pair.second;
+    llvm::Constant *VarName = makeConstantString(Var->getName());
+    uint64_t VarSize =
+        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
+    llvm::Value *Args[] = {
+        &GpuBinaryHandlePtr,
+        Builder.CreateBitCast(Var, VoidPtrTy),
+        VarName,
+        VarName,
+        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
+        llvm::ConstantInt::get(IntTy, VarSize),
+        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
+        llvm::ConstantInt::get(IntTy, 0)};
+    Builder.CreateCall(RegisterVar, Args);
   }
 
   Builder.CreateRetVoid();
@@ -208,15 +244,19 @@
 /// \code
 /// void __cuda_module_ctor(void*) {
 ///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-///     __cuda_register_kernels(Handle0);
+///     __cuda_register_globals(Handle0);
 ///     ...
 ///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-///     __cuda_register_kernels(HandleN);
+///     __cuda_register_globals(HandleN);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
-  // void __cuda_register_kernels(void* handle);
-  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+  // No need to generate ctors/dtors if there are no GPU binaries.
+  if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
+    return nullptr;
+
+  // void __cuda_register_globals(void **handle);
+  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
   // void ** __cudaRegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@@ -253,7 +293,8 @@
     llvm::Constant *Values[] = {
         llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
         llvm::ConstantInt::get(IntTy, 1),          // Fatbin version.
-        makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data.
+        makeConstantString(GpuBinaryOrErr.get()->getBuffer(), // Data.
+                           "", ".nv_fatbin", 8),              //
         llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
     llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
         TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
@@ -272,8 +313,9 @@
     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                    CGM.getPointerAlign());
 
-    // Call __cuda_register_kernels(GpuBinaryHandle);
-    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+    // Call __cuda_register_globals(GpuBinaryHandle);
+    if (RegisterGlobalsFunc)
+      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
 
     // Save GpuBinaryHandle so we can unregister it in destructor.
     GpuBinaryHandles.push_back(GpuBinaryHandle);
@@ -293,6 +335,10 @@
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
+  // No need for destructor if we don't have handles to unregister.
+  if (GpuBinaryHandles.empty())
+    return nullptr;
+
   // void __cudaUnregisterFatBinary(void ** handle);
   llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
diff --git a/lib/CodeGen/CGCUDARuntime.h b/lib/CodeGen/CGCUDARuntime.h
index dcacf97..0168f4f 100644
--- a/lib/CodeGen/CGCUDARuntime.h
+++ b/lib/CodeGen/CGCUDARuntime.h
@@ -18,6 +18,7 @@
 
 namespace llvm {
 class Function;
+class GlobalVariable;
 }
 
 namespace clang {
@@ -37,6 +38,12 @@
   CodeGenModule &CGM;
 
 public:
+  // Global variable properties that must be passed to CUDA runtime.
+  enum DeviceVarFlags {
+    ExternDeviceVar = 0x01,   // extern
+    ConstantDeviceVar = 0x02, // __constant__
+  };
+
   CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
   virtual ~CGCUDARuntime();
 
@@ -46,6 +53,7 @@
 
   /// Emits a kernel launch stub.
   virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
+  virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;
 
   /// Constructs and returns a module initialization function or nullptr if it's
   /// not needed. Must be called after all kernels have been emitted.
diff --git a/lib/CodeGen/CGCXX.cpp b/lib/CodeGen/CGCXX.cpp
index 6847df9..40f1bc4 100644
--- a/lib/CodeGen/CGCXX.cpp
+++ b/lib/CodeGen/CGCXX.cpp
@@ -164,7 +164,7 @@
     // members with attribute "AlwaysInline" and expect no reference to
     // be generated. It is desirable to reenable this optimisation after
     // corresponding LLVM changes.
-    Replacements[MangledName] = Aliasee;
+    addReplacement(MangledName, Aliasee);
     return false;
   }
 
diff --git a/lib/CodeGen/CGCXXABI.h b/lib/CodeGen/CGCXXABI.h
index 3f240b1..9e10ec0 100644
--- a/lib/CodeGen/CGCXXABI.h
+++ b/lib/CodeGen/CGCXXABI.h
@@ -106,6 +106,16 @@
 
   virtual bool hasMostDerivedReturn(GlobalDecl GD) const { return false; }
 
+  /// Returns true if the target allows calling a function through a pointer
+  /// with a different signature than the actual function (or equivalently,
+  /// bitcasting a function or function pointer to a different function type).
+  /// In principle in the most general case this could depend on the target, the
+  /// calling convention, and the actual types of the arguments and return
+  /// value. Here it just means whether the signature mismatch could *ever* be
+  /// allowed; in other words, it returns false only if the target does strict
+  /// checking of signatures for all calls.
+  virtual bool canCallMismatchedFunctionType() const { return true; }
+
   /// If the C++ ABI requires the given type be returned in a particular way,
   /// this method sets RetAI and returns true.
   virtual bool classifyReturnType(CGFunctionInfo &FI) const = 0;
@@ -326,6 +336,12 @@
   virtual void addImplicitStructorParams(CodeGenFunction &CGF, QualType &ResTy,
                                          FunctionArgList &Params) = 0;
 
+  /// Get the ABI-specific "this" parameter adjustment to apply in the prologue
+  /// of a virtual function.
+  virtual CharUnits getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD) {
+    return CharUnits::Zero();
+  }
+
   /// Perform ABI-specific "this" parameter adjustment in a virtual function
   /// prologue.
   virtual llvm::Value *adjustThisParameterInVirtualFunctionPrologue(
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index 827171c..fdd83ea 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -30,6 +30,7 @@
 #include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
@@ -41,7 +42,7 @@
 
 /***/
 
-static unsigned ClangCallConvToLLVMCallConv(CallingConv CC) {
+unsigned CodeGenTypes::ClangCallConvToLLVMCallConv(CallingConv CC) {
   switch (CC) {
   default: return llvm::CallingConv::C;
   case CC_X86StdCall: return llvm::CallingConv::X86_StdCall;
@@ -57,7 +58,7 @@
   // TODO: Add support for __vectorcall to LLVM.
   case CC_X86VectorCall: return llvm::CallingConv::X86_VectorCall;
   case CC_SpirFunction: return llvm::CallingConv::SPIR_FUNC;
-  case CC_SpirKernel: return llvm::CallingConv::SPIR_KERNEL;
+  case CC_OpenCLKernel: return CGM.getTargetCodeGenInfo().getOpenCLKernelCallingConv();
   case CC_PreserveMost: return llvm::CallingConv::PreserveMost;
   case CC_PreserveAll: return llvm::CallingConv::PreserveAll;
   case CC_Swift: return llvm::CallingConv::Swift;
@@ -141,7 +142,8 @@
                         CanQual<FunctionProtoType> FTP,
                         const FunctionDecl *FD) {
   SmallVector<FunctionProtoType::ExtParameterInfo, 16> paramInfos;
-  RequiredArgs required = RequiredArgs::forPrototypePlus(FTP, prefix.size());
+  RequiredArgs Required =
+      RequiredArgs::forPrototypePlus(FTP, prefix.size(), FD);
   // FIXME: Kill copy.
   appendParameterTypes(CGT, prefix, paramInfos, FTP, FD);
   CanQualType resultType = FTP->getReturnType().getUnqualifiedType();
@@ -149,7 +151,7 @@
   return CGT.arrangeLLVMFunctionInfo(resultType, instanceMethod,
                                      /*chainCall=*/false, prefix,
                                      FTP->getExtInfo(), paramInfos,
-                                     required);
+                                     Required);
 }
 
 /// Arrange the argument and result information for a value of the
@@ -243,6 +245,15 @@
   return arrangeFreeFunctionType(prototype, MD);
 }
 
+bool CodeGenTypes::inheritingCtorHasParams(
+    const InheritedConstructor &Inherited, CXXCtorType Type) {
+  // Parameters are unnecessary if we're constructing a base class subobject
+  // and the inherited constructor lives in a virtual base.
+  return Type == Ctor_Complete ||
+         !Inherited.getShadowDecl()->constructsVirtualBase() ||
+         !Target.getCXXABI().hasConstructorVariants();
+  }
+
 const CGFunctionInfo &
 CodeGenTypes::arrangeCXXStructorDeclaration(const CXXMethodDecl *MD,
                                             StructorType Type) {
@@ -251,9 +262,16 @@
   SmallVector<FunctionProtoType::ExtParameterInfo, 16> paramInfos;
   argTypes.push_back(GetThisType(Context, MD->getParent()));
 
+  bool PassParams = true;
+
   GlobalDecl GD;
   if (auto *CD = dyn_cast<CXXConstructorDecl>(MD)) {
     GD = GlobalDecl(CD, toCXXCtorType(Type));
+
+    // A base class inheriting constructor doesn't get forwarded arguments
+    // needed to construct a virtual base (or base class thereof).
+    if (auto Inherited = CD->getInheritedConstructor())
+      PassParams = inheritingCtorHasParams(Inherited, toCXXCtorType(Type));
   } else {
     auto *DD = dyn_cast<CXXDestructorDecl>(MD);
     GD = GlobalDecl(DD, toCXXDtorType(Type));
@@ -262,12 +280,14 @@
   CanQual<FunctionProtoType> FTP = GetFormalType(MD);
 
   // Add the formal parameters.
-  appendParameterTypes(*this, argTypes, paramInfos, FTP, MD);
+  if (PassParams)
+    appendParameterTypes(*this, argTypes, paramInfos, FTP, MD);
 
   TheCXXABI.buildStructorSignature(MD, Type, argTypes);
 
   RequiredArgs required =
-      (MD->isVariadic() ? RequiredArgs(argTypes.size()) : RequiredArgs::All);
+      (PassParams && MD->isVariadic() ? RequiredArgs(argTypes.size())
+                                      : RequiredArgs::All);
 
   FunctionType::ExtInfo extInfo = FTP->getExtInfo();
   CanQualType resultType = TheCXXABI.HasThisReturn(GD)
@@ -338,7 +358,7 @@
     ArgTypes.push_back(Context.getCanonicalParamType(Arg.Ty));
 
   CanQual<FunctionProtoType> FPT = GetFormalType(D);
-  RequiredArgs Required = RequiredArgs::forPrototypePlus(FPT, 1 + ExtraArgs);
+  RequiredArgs Required = RequiredArgs::forPrototypePlus(FPT, 1 + ExtraArgs, D);
   GlobalDecl GD(D, CtorKind);
   CanQualType ResultType = TheCXXABI.HasThisReturn(GD)
                                ? ArgTypes.front()
@@ -401,7 +421,7 @@
   argTys.push_back(Context.getCanonicalParamType(receiverType));
   argTys.push_back(Context.getCanonicalParamType(Context.getObjCSelType()));
   // FIXME: Kill copy?
-  for (const auto *I : MD->params()) {
+  for (const auto *I : MD->parameters()) {
     argTys.push_back(Context.getCanonicalParamType(I->getType()));
   }
 
@@ -555,10 +575,11 @@
   auto paramInfos = getExtParameterInfosForCall(proto, 1, params.size());
   auto argTypes = getArgTypesForDeclaration(Context, params);
 
-  return arrangeLLVMFunctionInfo(GetReturnType(proto->getReturnType()),
-                                 /*instanceMethod*/ false, /*chainCall*/ false,
-                                 argTypes, proto->getExtInfo(), paramInfos,
-                                 RequiredArgs::forPrototypePlus(proto, 1));
+  return arrangeLLVMFunctionInfo(
+      GetReturnType(proto->getReturnType()),
+      /*instanceMethod*/ false, /*chainCall*/ false, argTypes,
+      proto->getExtInfo(), paramInfos,
+      RequiredArgs::forPrototypePlus(proto, 1, nullptr));
 }
 
 const CGFunctionInfo &
@@ -713,9 +734,9 @@
                                        RequiredArgs required) {
   assert(paramInfos.empty() || paramInfos.size() == argTypes.size());
 
-  void *buffer = operator new(sizeof(CGFunctionInfo) +
-                              sizeof(ArgInfo) * (argTypes.size() + 1) +
-                              sizeof(ExtParameterInfo) * paramInfos.size());
+  void *buffer =
+    operator new(totalSizeToAlloc<ArgInfo,             ExtParameterInfo>(
+                                  argTypes.size() + 1, paramInfos.size()));
 
   CGFunctionInfo *FI = new(buffer) CGFunctionInfo();
   FI->CallingConvention = llvmCC;
@@ -783,7 +804,8 @@
 
   RecordExpansion(SmallVector<const CXXBaseSpecifier *, 1> &&Bases,
                   SmallVector<const FieldDecl *, 1> &&Fields)
-      : TypeExpansion(TEK_Record), Bases(Bases), Fields(Fields) {}
+      : TypeExpansion(TEK_Record), Bases(std::move(Bases)),
+        Fields(std::move(Fields)) {}
   static bool classof(const TypeExpansion *TE) {
     return TE->Kind == TEK_Record;
   }
@@ -947,7 +969,7 @@
     }
     for (auto FD : RExp->Fields) {
       // FIXME: What are the right qualifiers here?
-      LValue SubLV = EmitLValueForField(LV, FD);
+      LValue SubLV = EmitLValueForFieldInitialization(LV, FD);
       ExpandTypeFromArgs(FD->getType(), SubLV, AI);
     }
   } else if (isa<ComplexExpansion>(Exp.get())) {
@@ -1614,6 +1636,7 @@
 
   const Decl *TargetDecl = CalleeInfo.getCalleeDecl();
 
+  bool HasAnyX86InterruptAttr = false;
   // FIXME: handle sseregparm someday...
   if (TargetDecl) {
     if (TargetDecl->hasAttr<ReturnsTwiceAttr>())
@@ -1651,6 +1674,7 @@
     if (TargetDecl->hasAttr<ReturnsNonNullAttr>())
       RetAttrs.addAttribute(llvm::Attribute::NonNull);
 
+    HasAnyX86InterruptAttr = TargetDecl->hasAttr<AnyX86InterruptAttr>();
     HasOptnone = TargetDecl->hasAttr<OptimizeNoneAttr>();
   }
 
@@ -1690,10 +1714,11 @@
     }
 
     bool DisableTailCalls =
-        CodeGenOpts.DisableTailCalls ||
+        CodeGenOpts.DisableTailCalls || HasAnyX86InterruptAttr ||
         (TargetDecl && TargetDecl->hasAttr<DisableTailCallsAttr>());
-    FuncAttrs.addAttribute("disable-tail-calls",
-                           llvm::toStringRef(DisableTailCalls));
+    FuncAttrs.addAttribute(
+        "disable-tail-calls",
+        llvm::toStringRef(DisableTailCalls));
 
     FuncAttrs.addAttribute("less-precise-fpmad",
                            llvm::toStringRef(CodeGenOpts.LessPreciseFPMAD));
@@ -1707,9 +1732,16 @@
                            llvm::toStringRef(CodeGenOpts.SoftFloat));
     FuncAttrs.addAttribute("stack-protector-buffer-size",
                            llvm::utostr(CodeGenOpts.SSPBufferSize));
+    FuncAttrs.addAttribute("no-signed-zeros-fp-math",
+                           llvm::toStringRef(CodeGenOpts.NoSignedZeros));
+    FuncAttrs.addAttribute(
+        "correctly-rounded-divide-sqrt-fp-math",
+        llvm::toStringRef(CodeGenOpts.CorrectlyRoundedDivSqrt));
 
     if (CodeGenOpts.StackRealignment)
       FuncAttrs.addAttribute("stackrealign");
+    if (CodeGenOpts.Backchain)
+      FuncAttrs.addAttribute("backchain");
 
     // Add target-cpu and target-features attributes to functions. If
     // we have a decl for the function and it has a target attribute then
@@ -1758,6 +1790,18 @@
     }
   }
 
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
+    // Conservatively, mark all functions and calls in CUDA as convergent
+    // (meaning, they may call an intrinsically convergent op, such as
+    // __syncthreads(), and so can't have certain optimizations applied around
+    // them).  LLVM will remove this attribute where it safely can.
+    FuncAttrs.addAttribute(llvm::Attribute::Convergent);
+
+    // Respect -fcuda-flush-denormals-to-zero.
+    if (getLangOpts().CUDADeviceFlushDenormalsToZero)
+      FuncAttrs.addAttribute("nvptx-f32ftz", "true");
+  }
+
   ClangToLLVMArgMapping IRFunctionArgs(getContext(), FI);
 
   QualType RetTy = FI.getReturnType();
@@ -2847,23 +2891,15 @@
 
   QualType type = param->getType();
 
-  // For the most part, we just need to load the alloca, except:
-  // 1) aggregate r-values are actually pointers to temporaries, and
-  // 2) references to non-scalars are pointers directly to the aggregate.
-  // I don't know why references to scalars are different here.
-  if (const ReferenceType *ref = type->getAs<ReferenceType>()) {
-    if (!hasScalarEvaluationKind(ref->getPointeeType()))
-      return args.add(RValue::getAggregate(local), type);
-
-    // Locals which are references to scalars are represented
-    // with allocas holding the pointer.
-    return args.add(RValue::get(Builder.CreateLoad(local)), type);
-  }
-
   assert(!isInAllocaArgument(CGM.getCXXABI(), type) &&
          "cannot emit delegate call arguments for inalloca arguments!");
 
-  args.add(convertTempToRValue(local, type, loc), type);
+  // For the most part, we just need to load the alloca, except that
+  // aggregate r-values are actually pointers to temporaries.
+  if (type->isReferenceType())
+    args.add(RValue::get(Builder.CreateLoad(local)), type);
+  else
+    args.add(convertTempToRValue(local, type, loc), type);
 }
 
 static bool isProvablyNull(llvm::Value *addr) {
@@ -3174,10 +3210,10 @@
     size_t CallArgsStart = Args.size();
     for (int I = ArgTypes.size() - 1; I >= 0; --I) {
       CallExpr::const_arg_iterator Arg = ArgRange.begin() + I;
+      MaybeEmitImplicitObjectSize(I, *Arg);
       EmitCallArg(Args, *Arg, ArgTypes[I]);
       EmitNonNullArgCheck(Args.back().RV, ArgTypes[I], (*Arg)->getExprLoc(),
                           CalleeDecl, ParamsToSkip + I);
-      MaybeEmitImplicitObjectSize(I, *Arg);
     }
 
     // Un-reverse the arguments we just evaluated so they match up with the LLVM
@@ -3357,16 +3393,6 @@
   return EmitRuntimeCall(callee, None, name);
 }
 
-/// Emits a simple call (never an invoke) to the given runtime function.
-llvm::CallInst *
-CodeGenFunction::EmitRuntimeCall(llvm::Value *callee,
-                                 ArrayRef<llvm::Value*> args,
-                                 const llvm::Twine &name) {
-  llvm::CallInst *call = Builder.CreateCall(callee, args, name);
-  call->setCallingConv(getRuntimeCC());
-  return call;
-}
-
 // Calls which may throw must have operand bundles indicating which funclet
 // they are nested within.
 static void
@@ -3385,6 +3411,19 @@
   BundleList.emplace_back("funclet", CurrentFuncletPad);
 }
 
+/// Emits a simple call (never an invoke) to the given runtime function.
+llvm::CallInst *
+CodeGenFunction::EmitRuntimeCall(llvm::Value *callee,
+                                 ArrayRef<llvm::Value*> args,
+                                 const llvm::Twine &name) {
+  SmallVector<llvm::OperandBundleDef, 1> BundleList;
+  getBundlesForFunclet(callee, CurrentFuncletPad, BundleList);
+
+  llvm::CallInst *call = Builder.CreateCall(callee, args, BundleList, name);
+  call->setCallingConv(getRuntimeCC());
+  return call;
+}
+
 /// Emits a call or invoke to the given noreturn runtime function.
 void CodeGenFunction::EmitNoreturnRuntimeCallOrInvoke(llvm::Value *callee,
                                                ArrayRef<llvm::Value*> args) {
@@ -3432,13 +3471,16 @@
                                   ArrayRef<llvm::Value *> Args,
                                   const Twine &Name) {
   llvm::BasicBlock *InvokeDest = getInvokeDest();
+  SmallVector<llvm::OperandBundleDef, 1> BundleList;
+  getBundlesForFunclet(Callee, CurrentFuncletPad, BundleList);
 
   llvm::Instruction *Inst;
   if (!InvokeDest)
-    Inst = Builder.CreateCall(Callee, Args, Name);
+    Inst = Builder.CreateCall(Callee, Args, BundleList, Name);
   else {
     llvm::BasicBlock *ContBB = createBasicBlock("invoke.cont");
-    Inst = Builder.CreateInvoke(Callee, ContBB, InvokeDest, Args, Name);
+    Inst = Builder.CreateInvoke(Callee, ContBB, InvokeDest, Args, BundleList,
+                                Name);
     EmitBlock(ContBB);
   }
 
@@ -3918,7 +3960,9 @@
   CS.setAttributes(Attrs);
   CS.setCallingConv(static_cast<llvm::CallingConv::ID>(CallingConv));
 
-  // Insert instrumentation or attach profile metadata at indirect call sites
+  // Insert instrumentation or attach profile metadata at indirect call sites.
+  // For more details, see the comment before the definition of
+  // IPVK_IndirectCallTarget in InstrProfData.inc.
   if (!CS.getCalledFunction())
     PGO.valueProfile(Builder, llvm::IPVK_IndirectCallTarget,
                      CS.getInstruction(), Callee);
diff --git a/lib/CodeGen/CGCall.h b/lib/CodeGen/CGCall.h
index 2ebd09b..aee2957 100644
--- a/lib/CodeGen/CGCall.h
+++ b/lib/CodeGen/CGCall.h
@@ -19,7 +19,6 @@
 #include "EHScopeStack.h"
 #include "clang/AST/CanonicalType.h"
 #include "clang/AST/Type.h"
-#include "llvm/ADT/FoldingSet.h"
 #include "llvm/IR/Value.h"
 
 // FIXME: Restructure so we don't have to expose so much stuff.
diff --git a/lib/CodeGen/CGClass.cpp b/lib/CodeGen/CGClass.cpp
index 41f93f4..4d69c3f 100644
--- a/lib/CodeGen/CGClass.cpp
+++ b/lib/CodeGen/CGClass.cpp
@@ -562,6 +562,20 @@
                                           isBaseVirtual);
 }
 
+/// Initialize a member of aggregate type using the given expression
+/// as an initializer.
+///
+/// The member may be an array.  If so:
+/// - the destination l-value will be a pointer to the *base* element type,
+/// - ArrayIndexVar will be a pointer to a variable containing the current
+///   index within the destination array, and
+/// - ArrayIndexes will be an array of index variables, one for each level
+///   of array nesting, which will need to be updated as appropriate for the
+///   array structure.
+///
+/// On an array, this function will invoke itself recursively.  Each time,
+/// it drills into one nesting level of the member type and sets up a
+/// loop updating the appropriate array index variable.
 static void EmitAggMemberInitializer(CodeGenFunction &CGF,
                                      LValue LHS,
                                      Expr *Init,
@@ -569,10 +583,18 @@
                                      QualType T,
                                      ArrayRef<VarDecl *> ArrayIndexes,
                                      unsigned Index) {
+  assert(ArrayIndexVar.isValid() == (ArrayIndexes.size() != 0));
+
   if (Index == ArrayIndexes.size()) {
     LValue LV = LHS;
 
+    Optional<CodeGenFunction::RunCleanupsScope> Scope;
+
     if (ArrayIndexVar.isValid()) {
+      // When we're processing an array, the temporaries from each
+      // element's construction are destroyed immediately.
+      Scope.emplace(CGF);
+
       // If we have an array index variable, load it and use it as an offset.
       // Then, increment the value.
       llvm::Value *Dest = LHS.getPointer();
@@ -586,6 +608,19 @@
       CharUnits EltSize = CGF.getContext().getTypeSizeInChars(T);
       CharUnits Align = LV.getAlignment().alignmentOfArrayElement(EltSize);
       LV.setAddress(Address(Dest, Align));
+
+      // Enter a partial-array EH cleanup to destroy previous members
+      // of the array if this initialization throws.
+      if (CGF.CGM.getLangOpts().Exceptions) {
+        if (auto DtorKind = T.isDestructedType()) {
+          if (CGF.needsEHCleanup(DtorKind)) {
+            CGF.pushRegularPartialArrayCleanup(LHS.getPointer(),
+                                               LV.getPointer(), T,
+                                               LV.getAlignment(),
+                                               CGF.getDestroyer(DtorKind));
+          }
+        }
+      }
     }
 
     switch (CGF.getEvaluationKind(T)) {
@@ -746,7 +781,7 @@
 
   ArrayRef<VarDecl *> ArrayIndexes;
   if (MemberInit->getNumArrayIndices())
-    ArrayIndexes = MemberInit->getArrayIndexes();
+    ArrayIndexes = MemberInit->getArrayIndices();
   CGF.EmitInitializerForField(Field, LHS, MemberInit->getInit(), ArrayIndexes);
 }
 
@@ -987,7 +1022,7 @@
     CodeGenFunction &CGF;
     SanitizerSet OldSanOpts;
   };
-}
+} // end anonymous namespace
  
 namespace {
   class FieldMemcpyizer {
@@ -1072,7 +1107,6 @@
     const CXXRecordDecl *ClassDecl;
 
   private:
-
     void emitMemcpyIR(Address DestPtr, Address SrcPtr, CharUnits Size) {
       llvm::PointerType *DPT = DestPtr.getType();
       llvm::Type *DBP =
@@ -1088,13 +1122,12 @@
     }
 
     void addInitialField(FieldDecl *F) {
-        FirstField = F;
-        LastField = F;
-        FirstFieldOffset = RecLayout.getFieldOffset(F->getFieldIndex());
-        LastFieldOffset = FirstFieldOffset;
-        LastAddedFieldIndex = F->getFieldIndex();
-        return;
-      }
+      FirstField = F;
+      LastField = F;
+      FirstFieldOffset = RecLayout.getFieldOffset(F->getFieldIndex());
+      LastFieldOffset = FirstFieldOffset;
+      LastAddedFieldIndex = F->getFieldIndex();
+    }
 
     void addNextField(FieldDecl *F) {
       // For the most part, the following invariant will hold:
@@ -1128,7 +1161,6 @@
 
   class ConstructorMemcpyizer : public FieldMemcpyizer {
   private:
-
     /// Get source argument for copy constructor. Returns null if not a copy
     /// constructor.
     static const VarDecl *getTrivialCopySource(CodeGenFunction &CGF,
@@ -1233,7 +1265,6 @@
 
   class AssignmentMemcpyizer : public FieldMemcpyizer {
   private:
-
     // Returns the memcpyable field copied by the given statement, if one
     // exists. Otherwise returns null.
     FieldDecl *getMemcpyableField(Stmt *S) {
@@ -1307,7 +1338,6 @@
     SmallVector<Stmt*, 16> AggregatedStmts;
 
   public:
-
     AssignmentMemcpyizer(CodeGenFunction &CGF, const CXXMethodDecl *AD,
                          FunctionArgList &Args)
       : FieldMemcpyizer(CGF, AD->getParent(), Args[Args.size() - 1]),
@@ -1630,6 +1660,7 @@
 
   struct CallDtorDeleteConditional final : EHScopeStack::Cleanup {
     llvm::Value *ShouldDeleteCondition;
+
   public:
     CallDtorDeleteConditional(llvm::Value *ShouldDeleteCondition)
         : ShouldDeleteCondition(ShouldDeleteCondition) {
@@ -1919,7 +1950,7 @@
 /// \param zeroInitialize true if each element should be
 ///   zero-initialized before it is constructed
 void CodeGenFunction::EmitCXXAggrConstructorCall(
-    const CXXConstructorDecl *ctor, const ConstantArrayType *arrayType,
+    const CXXConstructorDecl *ctor, const ArrayType *arrayType,
     Address arrayBegin, const CXXConstructExpr *E, bool zeroInitialize) {
   QualType elementType;
   llvm::Value *numElements =
@@ -2052,6 +2083,62 @@
                                              bool ForVirtualBase,
                                              bool Delegating, Address This,
                                              const CXXConstructExpr *E) {
+  CallArgList Args;
+
+  // Push the this ptr.
+  Args.add(RValue::get(This.getPointer()), D->getThisType(getContext()));
+
+  // If this is a trivial constructor, emit a memcpy now before we lose
+  // the alignment information on the argument.
+  // FIXME: It would be better to preserve alignment information into CallArg.
+  if (isMemcpyEquivalentSpecialMember(D)) {
+    assert(E->getNumArgs() == 1 && "unexpected argcount for trivial ctor");
+
+    const Expr *Arg = E->getArg(0);
+    QualType SrcTy = Arg->getType();
+    Address Src = EmitLValue(Arg).getAddress();
+    QualType DestTy = getContext().getTypeDeclType(D->getParent());
+    EmitAggregateCopyCtor(This, Src, DestTy, SrcTy);
+    return;
+  }
+
+  // Add the rest of the user-supplied arguments.
+  const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
+  EmitCallArgs(Args, FPT, E->arguments(), E->getConstructor());
+
+  EmitCXXConstructorCall(D, Type, ForVirtualBase, Delegating, This, Args);
+}
+
+static bool canEmitDelegateCallArgs(CodeGenFunction &CGF,
+                                    const CXXConstructorDecl *Ctor,
+                                    CXXCtorType Type, CallArgList &Args) {
+  // We can't forward a variadic call.
+  if (Ctor->isVariadic())
+    return false;
+
+  if (CGF.getTarget().getCXXABI().areArgsDestroyedLeftToRightInCallee()) {
+    // If the parameters are callee-cleanup, it's not safe to forward.
+    for (auto *P : Ctor->parameters())
+      if (P->getType().isDestructedType())
+        return false;
+
+    // Likewise if they're inalloca.
+    const CGFunctionInfo &Info =
+        CGF.CGM.getTypes().arrangeCXXConstructorCall(Args, Ctor, Type, 0);
+    if (Info.usesInAlloca())
+      return false;
+  }
+
+  // Anything else should be OK.
+  return true;
+}
+
+void CodeGenFunction::EmitCXXConstructorCall(const CXXConstructorDecl *D,
+                                             CXXCtorType Type,
+                                             bool ForVirtualBase,
+                                             bool Delegating,
+                                             Address This,
+                                             CallArgList &Args) {
   const CXXRecordDecl *ClassDecl = D->getParent();
 
   // C++11 [class.mfct.non-static]p2:
@@ -2062,7 +2149,7 @@
                 This.getPointer(), getContext().getRecordType(ClassDecl));
 
   if (D->isTrivial() && D->isDefaultConstructor()) {
-    assert(E->getNumArgs() == 0 && "trivial default ctor with args");
+    assert(Args.size() == 1 && "trivial default ctor with args");
     return;
   }
 
@@ -2070,24 +2157,24 @@
   // union copy constructor, we must emit a memcpy, because the AST does not
   // model that copy.
   if (isMemcpyEquivalentSpecialMember(D)) {
-    assert(E->getNumArgs() == 1 && "unexpected argcount for trivial ctor");
+    assert(Args.size() == 2 && "unexpected argcount for trivial ctor");
 
-    const Expr *Arg = E->getArg(0);
-    QualType SrcTy = Arg->getType();
-    Address Src = EmitLValue(Arg).getAddress();
+    QualType SrcTy = D->getParamDecl(0)->getType().getNonReferenceType();
+    Address Src(Args[1].RV.getScalarVal(), getNaturalTypeAlignment(SrcTy));
     QualType DestTy = getContext().getTypeDeclType(ClassDecl);
     EmitAggregateCopyCtor(This, Src, DestTy, SrcTy);
     return;
   }
 
-  CallArgList Args;
-
-  // Push the this ptr.
-  Args.add(RValue::get(This.getPointer()), D->getThisType(getContext()));
-
-  // Add the rest of the user-supplied arguments.
-  const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
-  EmitCallArgs(Args, FPT, E->arguments(), E->getConstructor());
+  // Check whether we can actually emit the constructor before trying to do so.
+  if (auto Inherited = D->getInheritedConstructor()) {
+    if (getTypes().inheritingCtorHasParams(Inherited, Type) &&
+        !canEmitDelegateCallArgs(*this, D, Type, Args)) {
+      EmitInlinedInheritingCXXConstructorCall(D, Type, ForVirtualBase,
+                                              Delegating, Args);
+      return;
+    }
+  }
 
   // Insert any ABI-specific implicit constructor arguments.
   unsigned ExtraArgs = CGM.getCXXABI().addImplicitConstructorArgs(
@@ -2117,6 +2204,95 @@
     EmitVTableAssumptionLoads(ClassDecl, This);
 }
 
+void CodeGenFunction::EmitInheritedCXXConstructorCall(
+    const CXXConstructorDecl *D, bool ForVirtualBase, Address This,
+    bool InheritedFromVBase, const CXXInheritedCtorInitExpr *E) {
+  CallArgList Args;
+  CallArg ThisArg(RValue::get(This.getPointer()), D->getThisType(getContext()),
+                  /*NeedsCopy=*/false);
+
+  // Forward the parameters.
+  if (InheritedFromVBase &&
+      CGM.getTarget().getCXXABI().hasConstructorVariants()) {
+    // Nothing to do; this construction is not responsible for constructing
+    // the base class containing the inherited constructor.
+    // FIXME: Can we just pass undefs for the remaining arguments if we don't
+    // have constructor variants?
+    Args.push_back(ThisArg);
+  } else if (!CXXInheritedCtorInitExprArgs.empty()) {
+    // The inheriting constructor was inlined; just inject its arguments.
+    assert(CXXInheritedCtorInitExprArgs.size() >= D->getNumParams() &&
+           "wrong number of parameters for inherited constructor call");
+    Args = CXXInheritedCtorInitExprArgs;
+    Args[0] = ThisArg;
+  } else {
+    // The inheriting constructor was not inlined. Emit delegating arguments.
+    Args.push_back(ThisArg);
+    const auto *OuterCtor = cast<CXXConstructorDecl>(CurCodeDecl);
+    assert(OuterCtor->getNumParams() == D->getNumParams());
+    assert(!OuterCtor->isVariadic() && "should have been inlined");
+
+    for (const auto *Param : OuterCtor->parameters()) {
+      assert(getContext().hasSameUnqualifiedType(
+          OuterCtor->getParamDecl(Param->getFunctionScopeIndex())->getType(),
+          Param->getType()));
+      EmitDelegateCallArg(Args, Param, E->getLocation());
+
+      // Forward __attribute__(pass_object_size).
+      if (Param->hasAttr<PassObjectSizeAttr>()) {
+        auto *POSParam = SizeArguments[Param];
+        assert(POSParam && "missing pass_object_size value for forwarding");
+        EmitDelegateCallArg(Args, POSParam, E->getLocation());
+      }
+    }
+  }
+
+  EmitCXXConstructorCall(D, Ctor_Base, ForVirtualBase, /*Delegating*/false,
+                         This, Args);
+}
+
+void CodeGenFunction::EmitInlinedInheritingCXXConstructorCall(
+    const CXXConstructorDecl *Ctor, CXXCtorType CtorType, bool ForVirtualBase,
+    bool Delegating, CallArgList &Args) {
+  InlinedInheritingConstructorScope Scope(*this, GlobalDecl(Ctor, CtorType));
+
+  // Save the arguments to be passed to the inherited constructor.
+  CXXInheritedCtorInitExprArgs = Args;
+
+  FunctionArgList Params;
+  QualType RetType = BuildFunctionArgList(CurGD, Params);
+  FnRetTy = RetType;
+
+  // Insert any ABI-specific implicit constructor arguments.
+  CGM.getCXXABI().addImplicitConstructorArgs(*this, Ctor, CtorType,
+                                             ForVirtualBase, Delegating, Args);
+
+  // Emit a simplified prolog. We only need to emit the implicit params.
+  assert(Args.size() >= Params.size() && "too few arguments for call");
+  for (unsigned I = 0, N = Args.size(); I != N; ++I) {
+    if (I < Params.size() && isa<ImplicitParamDecl>(Params[I])) {
+      const RValue &RV = Args[I].RV;
+      assert(!RV.isComplex() && "complex indirect params not supported");
+      ParamValue Val = RV.isScalar()
+                           ? ParamValue::forDirect(RV.getScalarVal())
+                           : ParamValue::forIndirect(RV.getAggregateAddress());
+      EmitParmDecl(*Params[I], Val, I + 1);
+    }
+  }
+
+  // Create a return value slot if the ABI implementation wants one.
+  // FIXME: This is dumb, we should ask the ABI not to try to set the return
+  // value instead.
+  if (!RetType->isVoidType())
+    ReturnValue = CreateIRTemp(RetType, "retval.inhctor");
+
+  CGM.getCXXABI().EmitInstanceFunctionProlog(*this);
+  CXXThisValue = CXXABIThisValue;
+
+  // Directly emit the constructor initializers.
+  EmitCtorPrologue(Ctor, CtorType, Params);
+}
+
 void CodeGenFunction::EmitVTableAssumptionLoad(const VPtr &Vptr, Address This) {
   llvm::Value *VTableGlobal =
       CGM.getCXXABI().getVTableAddressPoint(Vptr.Base, Vptr.VTableClass);
@@ -2149,19 +2325,6 @@
 CodeGenFunction::EmitSynthesizedCXXCopyCtorCall(const CXXConstructorDecl *D,
                                                 Address This, Address Src,
                                                 const CXXConstructExpr *E) {
-  if (isMemcpyEquivalentSpecialMember(D)) {
-    assert(E->getNumArgs() == 1 && "unexpected argcount for trivial ctor");
-    assert(D->isCopyOrMoveConstructor() &&
-           "trivial 1-arg ctor not a copy/move ctor");
-    EmitAggregateCopyCtor(This, Src,
-                          getContext().getTypeDeclType(D->getParent()),
-                          (*E->arg_begin())->getType());
-    return;
-  }
-  llvm::Value *Callee = CGM.getAddrOfCXXStructor(D, StructorType::Complete);
-  assert(D->isInstance() &&
-         "Trying to emit a member call expr on a static method!");
-
   const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
 
   CallArgList Args;
@@ -2179,8 +2342,7 @@
   EmitCallArgs(Args, FPT, drop_begin(E->arguments(), 1), E->getConstructor(),
                /*ParamsToSkip*/ 1);
 
-  EmitCall(CGM.getTypes().arrangeCXXMethodCall(Args, FPT, RequiredArgs::All),
-           Callee, ReturnValueSlot(), Args, D);
+  EmitCXXConstructorCall(D, Ctor_Complete, false, false, This, Args);
 }
 
 void
@@ -2194,21 +2356,17 @@
   assert(I != E && "no parameters to constructor");
 
   // this
-  DelegateArgs.add(RValue::get(LoadCXXThis()), (*I)->getType());
+  Address This = LoadCXXThisAddress();
+  DelegateArgs.add(RValue::get(This.getPointer()), (*I)->getType());
   ++I;
 
-  // vtt
-  if (llvm::Value *VTT = GetVTTParameter(GlobalDecl(Ctor, CtorType),
-                                         /*ForVirtualBase=*/false,
-                                         /*Delegating=*/true)) {
-    QualType VoidPP = getContext().getPointerType(getContext().VoidPtrTy);
-    DelegateArgs.add(RValue::get(VTT), VoidPP);
-
-    if (CGM.getCXXABI().NeedsVTTParameter(CurGD)) {
-      assert(I != E && "cannot skip vtt parameter, already done with args");
-      assert((*I)->getType() == VoidPP && "skipping parameter not of vtt type");
-      ++I;
-    }
+  // FIXME: The location of the VTT parameter in the parameter list is
+  // specific to the Itanium ABI and shouldn't be hardcoded here.
+  if (CGM.getCXXABI().NeedsVTTParameter(CurGD)) {
+    assert(I != E && "cannot skip vtt parameter, already done with args");
+    assert((*I)->getType()->isPointerType() &&
+           "skipping parameter not of vtt type");
+    ++I;
   }
 
   // Explicit arguments.
@@ -2218,11 +2376,8 @@
     EmitDelegateCallArg(DelegateArgs, param, Loc);
   }
 
-  llvm::Value *Callee =
-      CGM.getAddrOfCXXStructor(Ctor, getFromCtorType(CtorType));
-  EmitCall(CGM.getTypes()
-               .arrangeCXXStructorDeclaration(Ctor, getFromCtorType(CtorType)),
-           Callee, ReturnValueSlot(), DelegateArgs, Ctor);
+  EmitCXXConstructorCall(Ctor, CtorType, /*ForVirtualBase=*/false,
+                         /*Delegating=*/true, This, DelegateArgs);
 }
 
 namespace {
@@ -2291,7 +2446,7 @@
                                 /*Delegating=*/false, Addr);
     }
   };
-}
+} // end anonymous namespace
 
 void CodeGenFunction::PushDestructorCleanup(const CXXDestructorDecl *D,
                                             Address Addr) {
@@ -2489,15 +2644,35 @@
       RD->bases_begin()->getType()->getAsCXXRecordDecl());
 }
 
-void CodeGenFunction::EmitVTablePtrCheckForCall(const CXXMethodDecl *MD,
+void CodeGenFunction::EmitTypeMetadataCodeForVCall(const CXXRecordDecl *RD,
+                                                   llvm::Value *VTable,
+                                                   SourceLocation Loc) {
+  if (CGM.getCodeGenOpts().WholeProgramVTables &&
+      CGM.HasHiddenLTOVisibility(RD)) {
+    llvm::Metadata *MD =
+        CGM.CreateMetadataIdentifierForType(QualType(RD->getTypeForDecl(), 0));
+    llvm::Value *TypeId =
+        llvm::MetadataAsValue::get(CGM.getLLVMContext(), MD);
+
+    llvm::Value *CastedVTable = Builder.CreateBitCast(VTable, Int8PtrTy);
+    llvm::Value *TypeTest =
+        Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::type_test),
+                           {CastedVTable, TypeId});
+    Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::assume), TypeTest);
+  }
+
+  if (SanOpts.has(SanitizerKind::CFIVCall))
+    EmitVTablePtrCheckForCall(RD, VTable, CodeGenFunction::CFITCK_VCall, Loc);
+}
+
+void CodeGenFunction::EmitVTablePtrCheckForCall(const CXXRecordDecl *RD,
                                                 llvm::Value *VTable,
                                                 CFITypeCheckKind TCK,
                                                 SourceLocation Loc) {
-  const CXXRecordDecl *ClassDecl = MD->getParent();
   if (!SanOpts.has(SanitizerKind::CFICastStrict))
-    ClassDecl = LeastDerivedClassWithSameLayout(ClassDecl);
+    RD = LeastDerivedClassWithSameLayout(RD);
 
-  EmitVTablePtrCheck(ClassDecl, VTable, TCK, Loc);
+  EmitVTablePtrCheck(RD, VTable, TCK, Loc);
 }
 
 void CodeGenFunction::EmitVTablePtrCheckForCast(QualType T,
@@ -2549,7 +2724,12 @@
                                          llvm::Value *VTable,
                                          CFITypeCheckKind TCK,
                                          SourceLocation Loc) {
-  if (CGM.IsCFIBlacklistedRecord(RD))
+  if (!CGM.getCodeGenOpts().SanitizeCfiCrossDso &&
+      !CGM.HasHiddenLTOVisibility(RD))
+    return;
+
+  std::string TypeName = RD->getQualifiedNameAsString();
+  if (getContext().getSanitizerBlacklist().isBlacklistedType(TypeName))
     return;
 
   SanitizerScope SanScope(this);
@@ -2567,24 +2747,18 @@
   case CFITCK_UnrelatedCast:
     SSK = llvm::SanStat_CFI_UnrelatedCast;
     break;
+  case CFITCK_ICall:
+    llvm_unreachable("not expecting CFITCK_ICall");
   }
   EmitSanitizerStatReport(SSK);
 
   llvm::Metadata *MD =
       CGM.CreateMetadataIdentifierForType(QualType(RD->getTypeForDecl(), 0));
-  llvm::Value *BitSetName = llvm::MetadataAsValue::get(getLLVMContext(), MD);
+  llvm::Value *TypeId = llvm::MetadataAsValue::get(getLLVMContext(), MD);
 
   llvm::Value *CastedVTable = Builder.CreateBitCast(VTable, Int8PtrTy);
-  llvm::Value *BitSetTest =
-      Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::bitset_test),
-                         {CastedVTable, BitSetName});
-
-  if (CGM.getCodeGenOpts().SanitizeCfiCrossDso) {
-    if (auto TypeId = CGM.CreateCfiIdForTypeMetadata(MD)) {
-      EmitCfiSlowPathCheck(BitSetTest, TypeId, CastedVTable);
-      return;
-    }
-  }
+  llvm::Value *TypeTest = Builder.CreateCall(
+      CGM.getIntrinsic(llvm::Intrinsic::type_test), {CastedVTable, TypeId});
 
   SanitizerMask M;
   switch (TCK) {
@@ -2600,15 +2774,70 @@
   case CFITCK_UnrelatedCast:
     M = SanitizerKind::CFIUnrelatedCast;
     break;
+  case CFITCK_ICall:
+    llvm_unreachable("not expecting CFITCK_ICall");
   }
 
   llvm::Constant *StaticData[] = {
+      llvm::ConstantInt::get(Int8Ty, TCK),
       EmitCheckSourceLocation(Loc),
       EmitCheckTypeDescriptor(QualType(RD->getTypeForDecl(), 0)),
-      llvm::ConstantInt::get(Int8Ty, TCK),
   };
-  EmitCheck(std::make_pair(BitSetTest, M), "cfi_bad_type", StaticData,
-            CastedVTable);
+
+  auto CrossDsoTypeId = CGM.CreateCrossDsoCfiTypeId(MD);
+  if (CGM.getCodeGenOpts().SanitizeCfiCrossDso && CrossDsoTypeId) {
+    EmitCfiSlowPathCheck(M, TypeTest, CrossDsoTypeId, CastedVTable, StaticData);
+    return;
+  }
+
+  if (CGM.getCodeGenOpts().SanitizeTrap.has(M)) {
+    EmitTrapCheck(TypeTest);
+    return;
+  }
+
+  llvm::Value *AllVtables = llvm::MetadataAsValue::get(
+      CGM.getLLVMContext(),
+      llvm::MDString::get(CGM.getLLVMContext(), "all-vtables"));
+  llvm::Value *ValidVtable = Builder.CreateCall(
+      CGM.getIntrinsic(llvm::Intrinsic::type_test), {CastedVTable, AllVtables});
+  EmitCheck(std::make_pair(TypeTest, M), "cfi_check_fail", StaticData,
+            {CastedVTable, ValidVtable});
+}
+
+bool CodeGenFunction::ShouldEmitVTableTypeCheckedLoad(const CXXRecordDecl *RD) {
+  if (!CGM.getCodeGenOpts().WholeProgramVTables ||
+      !SanOpts.has(SanitizerKind::CFIVCall) ||
+      !CGM.getCodeGenOpts().SanitizeTrap.has(SanitizerKind::CFIVCall) ||
+      !CGM.HasHiddenLTOVisibility(RD))
+    return false;
+
+  std::string TypeName = RD->getQualifiedNameAsString();
+  return !getContext().getSanitizerBlacklist().isBlacklistedType(TypeName);
+}
+
+llvm::Value *CodeGenFunction::EmitVTableTypeCheckedLoad(
+    const CXXRecordDecl *RD, llvm::Value *VTable, uint64_t VTableByteOffset) {
+  SanitizerScope SanScope(this);
+
+  EmitSanitizerStatReport(llvm::SanStat_CFI_VCall);
+
+  llvm::Metadata *MD =
+      CGM.CreateMetadataIdentifierForType(QualType(RD->getTypeForDecl(), 0));
+  llvm::Value *TypeId = llvm::MetadataAsValue::get(CGM.getLLVMContext(), MD);
+
+  llvm::Value *CastedVTable = Builder.CreateBitCast(VTable, Int8PtrTy);
+  llvm::Value *CheckedLoad = Builder.CreateCall(
+      CGM.getIntrinsic(llvm::Intrinsic::type_checked_load),
+      {CastedVTable, llvm::ConstantInt::get(Int32Ty, VTableByteOffset),
+       TypeId});
+  llvm::Value *CheckResult = Builder.CreateExtractValue(CheckedLoad, 1);
+
+  EmitCheck(std::make_pair(CheckResult, SanitizerKind::CFIVCall),
+            "cfi_check_fail", nullptr, nullptr);
+
+  return Builder.CreateBitCast(
+      Builder.CreateExtractValue(CheckedLoad, 0),
+      cast<llvm::PointerType>(VTable->getType())->getElementType());
 }
 
 // FIXME: Ideally Expr::IgnoreParenNoopCasts should do this, but it doesn't do
@@ -2749,7 +2978,7 @@
   CallArgs.add(RValue::get(ThisPtr.getPointer()), ThisType);
 
   // Add the rest of the parameters.
-  for (auto param : BD->params())
+  for (auto param : BD->parameters())
     EmitDelegateCallArg(CallArgs, param, param->getLocStart());
 
   assert(!Lambda->isGenericLambda() &&
@@ -2779,7 +3008,7 @@
   CallArgs.add(RValue::get(ThisPtr), ThisType);
 
   // Add the rest of the parameters.
-  for (auto Param : MD->params())
+  for (auto Param : MD->parameters())
     EmitDelegateCallArg(CallArgs, Param, Param->getLocStart());
 
   const CXXMethodDecl *CallOp = Lambda->getLambdaCallOperator();
diff --git a/lib/CodeGen/CGCleanup.cpp b/lib/CodeGen/CGCleanup.cpp
index 95333d0..b3278b3 100644
--- a/lib/CodeGen/CGCleanup.cpp
+++ b/lib/CodeGen/CGCleanup.cpp
@@ -188,6 +188,7 @@
   bool IsNormalCleanup = Kind & NormalCleanup;
   bool IsEHCleanup = Kind & EHCleanup;
   bool IsActive = !(Kind & InactiveCleanup);
+  bool IsLifetimeMarker = Kind & LifetimeMarker;
   EHCleanupScope *Scope =
     new (Buffer) EHCleanupScope(IsNormalCleanup,
                                 IsEHCleanup,
@@ -200,6 +201,8 @@
     InnermostNormalCleanup = stable_begin();
   if (IsEHCleanup)
     InnermostEHScope = stable_begin();
+  if (IsLifetimeMarker)
+    Scope->setLifetimeMarker();
 
   return Scope->getCleanupBuffer();
 }
diff --git a/lib/CodeGen/CGCleanup.h b/lib/CodeGen/CGCleanup.h
index 4823773..98d01b1 100644
--- a/lib/CodeGen/CGCleanup.h
+++ b/lib/CodeGen/CGCleanup.h
@@ -86,11 +86,6 @@
     /// The amount of extra storage needed by the Cleanup.
     /// Always a multiple of the scope-stack alignment.
     unsigned CleanupSize : 12;
-
-    /// The number of fixups required by enclosing scopes (not including
-    /// this one).  If this is the top cleanup scope, all the fixups
-    /// from this index onwards belong to this scope.
-    unsigned FixupDepth : 32 - 18 - NumCommonBits; // currently 12
   };
 
   class FilterBitFields {
@@ -188,6 +183,7 @@
                EHScopeStack::stable_iterator enclosingEHScope)
     : EHScope(Catch, enclosingEHScope) {
     CatchBits.NumHandlers = numHandlers;
+    assert(CatchBits.NumHandlers == numHandlers && "NumHandlers overflow?");
   }
 
   unsigned getNumHandlers() const {
@@ -263,6 +259,11 @@
   };
   mutable struct ExtInfo *ExtInfo;
 
+  /// The number of fixups required by enclosing scopes (not including
+  /// this one).  If this is the top cleanup scope, all the fixups
+  /// from this index onwards belong to this scope.
+  unsigned FixupDepth;
+
   struct ExtInfo &getExtInfo() {
     if (!ExtInfo) ExtInfo = new struct ExtInfo();
     return *ExtInfo;
@@ -288,8 +289,9 @@
                  unsigned cleanupSize, unsigned fixupDepth,
                  EHScopeStack::stable_iterator enclosingNormal,
                  EHScopeStack::stable_iterator enclosingEH)
-    : EHScope(EHScope::Cleanup, enclosingEH), EnclosingNormal(enclosingNormal),
-      NormalBlock(nullptr), ActiveFlag(nullptr), ExtInfo(nullptr) {
+      : EHScope(EHScope::Cleanup, enclosingEH),
+        EnclosingNormal(enclosingNormal), NormalBlock(nullptr),
+        ActiveFlag(nullptr), ExtInfo(nullptr), FixupDepth(fixupDepth) {
     CleanupBits.IsNormalCleanup = isNormal;
     CleanupBits.IsEHCleanup = isEH;
     CleanupBits.IsActive = isActive;
@@ -297,7 +299,6 @@
     CleanupBits.TestFlagInNormalCleanup = false;
     CleanupBits.TestFlagInEHCleanup = false;
     CleanupBits.CleanupSize = cleanupSize;
-    CleanupBits.FixupDepth = fixupDepth;
 
     assert(CleanupBits.CleanupSize == cleanupSize && "cleanup size overflow");
   }
@@ -343,7 +344,7 @@
     return CleanupBits.TestFlagInEHCleanup;
   }
 
-  unsigned getFixupDepth() const { return CleanupBits.FixupDepth; }
+  unsigned getFixupDepth() const { return FixupDepth; }
   EHScopeStack::stable_iterator getEnclosingNormalCleanup() const {
     return EnclosingNormal;
   }
@@ -451,6 +452,7 @@
   EHFilterScope(unsigned numFilters)
     : EHScope(Filter, EHScopeStack::stable_end()) {
     FilterBits.NumFilters = numFilters;
+    assert(FilterBits.NumFilters == numFilters && "NumFilters overflow");
   }
 
   static size_t getSizeForNumFilters(unsigned numFilters) {
diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp
index 51c175e..ec57017 100644
--- a/lib/CodeGen/CGDebugInfo.cpp
+++ b/lib/CodeGen/CGDebugInfo.cpp
@@ -13,6 +13,7 @@
 
 #include "CGDebugInfo.h"
 #include "CGBlocks.h"
+#include "CGRecordLayout.h"
 #include "CGCXXABI.h"
 #include "CGObjCRuntime.h"
 #include "CodeGenFunction.h"
@@ -168,10 +169,10 @@
   }
 
   // Check namespace.
-  if (const NamespaceDecl *NSDecl = dyn_cast<NamespaceDecl>(Context))
+  if (const auto *NSDecl = dyn_cast<NamespaceDecl>(Context))
     return getOrCreateNameSpace(NSDecl);
 
-  if (const RecordDecl *RDecl = dyn_cast<RecordDecl>(Context))
+  if (const auto *RDecl = dyn_cast<RecordDecl>(Context))
     if (!RDecl->isDependentType())
       return getOrCreateType(CGM.getContext().getTypeDeclType(RDecl),
                              getOrCreateMainFile());
@@ -184,30 +185,32 @@
   FunctionTemplateSpecializationInfo *Info =
       FD->getTemplateSpecializationInfo();
 
-  if (!Info && FII && !CGM.getCodeGenOpts().EmitCodeView)
+  // Emit the unqualified name in normal operation. LLVM and the debugger can
+  // compute the fully qualified name from the scope chain. If we're only
+  // emitting line table info, there won't be any scope chains, so emit the
+  // fully qualified name here so that stack traces are more accurate.
+  // FIXME: Do this when emitting DWARF as well as when emitting CodeView after
+  // evaluating the size impact.
+  bool UseQualifiedName = DebugKind == codegenoptions::DebugLineTablesOnly &&
+                          CGM.getCodeGenOpts().EmitCodeView;
+
+  if (!Info && FII && !UseQualifiedName)
     return FII->getName();
 
-  // Otherwise construct human readable name for debug info.
   SmallString<128> NS;
   llvm::raw_svector_ostream OS(NS);
   PrintingPolicy Policy(CGM.getLangOpts());
-
-  if (CGM.getCodeGenOpts().EmitCodeView) {
-    // Print a fully qualified name like MSVC would.
-    Policy.MSVCFormatting = true;
-    FD->printQualifiedName(OS, Policy);
-  } else {
-    // Print the unqualified name with some template arguments. This is what
-    // DWARF-based debuggers expect.
+  Policy.MSVCFormatting = CGM.getCodeGenOpts().EmitCodeView;
+  if (!UseQualifiedName)
     FD->printName(OS);
-    // Add any template specialization args.
-    if (Info) {
-      const TemplateArgumentList *TArgs = Info->TemplateArguments;
-      const TemplateArgument *Args = TArgs->data();
-      unsigned NumArgs = TArgs->size();
-      TemplateSpecializationType::PrintTemplateArgumentList(OS, Args, NumArgs,
-                                                            Policy);
-    }
+  else
+    FD->printQualifiedName(OS, Policy);
+
+  // Add any template specialization args.
+  if (Info) {
+    const TemplateArgumentList *TArgs = Info->TemplateArguments;
+    TemplateSpecializationType::PrintTemplateArgumentList(OS, TArgs->asArray(),
+                                                          Policy);
   }
 
   // Copy this name on the side and use its reference.
@@ -219,21 +222,18 @@
   llvm::raw_svector_ostream OS(MethodName);
   OS << (OMD->isInstanceMethod() ? '-' : '+') << '[';
   const DeclContext *DC = OMD->getDeclContext();
-  if (const ObjCImplementationDecl *OID =
-          dyn_cast<const ObjCImplementationDecl>(DC)) {
+  if (const auto *OID = dyn_cast<ObjCImplementationDecl>(DC)) {
     OS << OID->getName();
-  } else if (const ObjCInterfaceDecl *OID =
-                 dyn_cast<const ObjCInterfaceDecl>(DC)) {
+  } else if (const auto *OID = dyn_cast<ObjCInterfaceDecl>(DC)) {
     OS << OID->getName();
-  } else if (const ObjCCategoryDecl *OC = dyn_cast<ObjCCategoryDecl>(DC)) {
+  } else if (const auto *OC = dyn_cast<ObjCCategoryDecl>(DC)) {
     if (OC->IsClassExtension()) {
       OS << OC->getClassInterface()->getName();
     } else {
-      OS << ((const NamedDecl *)OC)->getIdentifier()->getNameStart() << '('
+      OS << OC->getIdentifier()->getNameStart() << '('
          << OC->getIdentifier()->getNameStart() << ')';
     }
-  } else if (const ObjCCategoryImplDecl *OCD =
-                 dyn_cast<const ObjCCategoryImplDecl>(DC)) {
+  } else if (const auto *OCD = dyn_cast<ObjCCategoryImplDecl>(DC)) {
     OS << ((const NamedDecl *)OCD)->getIdentifier()->getNameStart() << '('
        << OCD->getIdentifier()->getNameStart() << ')';
   } else if (isa<ObjCProtocolDecl>(DC)) {
@@ -254,20 +254,56 @@
 }
 
 StringRef CGDebugInfo::getClassName(const RecordDecl *RD) {
-  // quick optimization to avoid having to intern strings that are already
-  // stored reliably elsewhere
-  if (!isa<ClassTemplateSpecializationDecl>(RD))
-    return RD->getName();
-
-  SmallString<128> Name;
-  {
+  if (isa<ClassTemplateSpecializationDecl>(RD)) {
+    SmallString<128> Name;
     llvm::raw_svector_ostream OS(Name);
     RD->getNameForDiagnostic(OS, CGM.getContext().getPrintingPolicy(),
                              /*Qualified*/ false);
+
+    // Copy this name on the side and use its reference.
+    return internString(Name);
   }
 
-  // Copy this name on the side and use its reference.
-  return internString(Name);
+  // Quick optimization to avoid having to intern strings that are already
+  // stored reliably elsewhere.
+  if (const IdentifierInfo *II = RD->getIdentifier())
+    return II->getName();
+
+  // The CodeView printer in LLVM wants to see the names of unnamed types: it is
+  // used to reconstruct the fully qualified type names.
+  if (CGM.getCodeGenOpts().EmitCodeView) {
+    if (const TypedefNameDecl *D = RD->getTypedefNameForAnonDecl()) {
+      assert(RD->getDeclContext() == D->getDeclContext() &&
+             "Typedef should not be in another decl context!");
+      assert(D->getDeclName().getAsIdentifierInfo() &&
+             "Typedef was not named!");
+      return D->getDeclName().getAsIdentifierInfo()->getName();
+    }
+
+    if (CGM.getLangOpts().CPlusPlus) {
+      StringRef Name;
+
+      ASTContext &Context = CGM.getContext();
+      if (const DeclaratorDecl *DD = Context.getDeclaratorForUnnamedTagDecl(RD))
+        // Anonymous types without a name for linkage purposes have their
+        // declarator mangled in if they have one.
+        Name = DD->getName();
+      else if (const TypedefNameDecl *TND =
+                   Context.getTypedefNameForUnnamedTagDecl(RD))
+        // Anonymous types without a name for linkage purposes have their
+        // associated typedef mangled in if they have one.
+        Name = TND->getName();
+
+      if (!Name.empty()) {
+        SmallString<256> UnnamedType("<unnamed-type-");
+        UnnamedType += Name;
+        UnnamedType += '>';
+        return internString(UnnamedType);
+      }
+    }
+  }
+
+  return StringRef();
 }
 
 llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) {
@@ -383,6 +419,8 @@
       LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
   } else if (LO.ObjC1) {
     LangTag = llvm::dwarf::DW_LANG_ObjC;
+  } else if (LO.RenderScript) {
+    LangTag = llvm::dwarf::DW_LANG_GOOGLE_RenderScript;
   } else if (LO.C99) {
     LangTag = llvm::dwarf::DW_LANG_C99;
   } else {
@@ -474,43 +512,14 @@
     return SelTy;
   }
 
-  case BuiltinType::OCLImage1d:
-    return getOrCreateStructPtrType("opencl_image1d_t", OCLImage1dDITy);
-  case BuiltinType::OCLImage1dArray:
-    return getOrCreateStructPtrType("opencl_image1d_array_t",
-                                    OCLImage1dArrayDITy);
-  case BuiltinType::OCLImage1dBuffer:
-    return getOrCreateStructPtrType("opencl_image1d_buffer_t",
-                                    OCLImage1dBufferDITy);
-  case BuiltinType::OCLImage2d:
-    return getOrCreateStructPtrType("opencl_image2d_t", OCLImage2dDITy);
-  case BuiltinType::OCLImage2dArray:
-    return getOrCreateStructPtrType("opencl_image2d_array_t",
-                                    OCLImage2dArrayDITy);
-  case BuiltinType::OCLImage2dDepth:
-    return getOrCreateStructPtrType("opencl_image2d_depth_t",
-                                    OCLImage2dDepthDITy);
-  case BuiltinType::OCLImage2dArrayDepth:
-    return getOrCreateStructPtrType("opencl_image2d_array_depth_t",
-                                    OCLImage2dArrayDepthDITy);
-  case BuiltinType::OCLImage2dMSAA:
-    return getOrCreateStructPtrType("opencl_image2d_msaa_t",
-                                    OCLImage2dMSAADITy);
-  case BuiltinType::OCLImage2dArrayMSAA:
-    return getOrCreateStructPtrType("opencl_image2d_array_msaa_t",
-                                    OCLImage2dArrayMSAADITy);
-  case BuiltinType::OCLImage2dMSAADepth:
-    return getOrCreateStructPtrType("opencl_image2d_msaa_depth_t",
-                                    OCLImage2dMSAADepthDITy);
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-    return getOrCreateStructPtrType("opencl_image2d_array_msaa_depth_t",
-                                    OCLImage2dArrayMSAADepthDITy);
-  case BuiltinType::OCLImage3d:
-    return getOrCreateStructPtrType("opencl_image3d_t", OCLImage3dDITy);
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id: \
+    return getOrCreateStructPtrType("opencl_" #ImgType "_" #Suffix "_t", \
+                                    SingletonId);
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
-    return DBuilder.createBasicType(
-        "opencl_sampler_t", CGM.getContext().getTypeSize(BT),
-        CGM.getContext().getTypeAlign(BT), llvm::dwarf::DW_ATE_unsigned);
+    return getOrCreateStructPtrType("opencl_sampler_t",
+                                    OCLSamplerDITy);
   case BuiltinType::OCLEvent:
     return getOrCreateStructPtrType("opencl_event_t", OCLEventDITy);
   case BuiltinType::OCLClkEvent:
@@ -556,7 +565,13 @@
   case BuiltinType::Half:
   case BuiltinType::Float:
   case BuiltinType::LongDouble:
+  case BuiltinType::Float128:
   case BuiltinType::Double:
+    // FIXME: For targets where long double and __float128 have the same size,
+    // they are currently indistinguishable in the debugger without some
+    // special treatment. However, there is currently no consensus on encoding
+    // and this should be updated once a DWARF encoding exists for distinct
+    // floating point types of the same size.
     Encoding = llvm::dwarf::DW_ATE_float;
     break;
   }
@@ -671,10 +686,6 @@
   if (!hasCXXMangling(TD, TheCU) || !TD->isExternallyVisible())
     return FullName;
 
-  // Microsoft Mangler does not have support for mangleCXXRTTIName yet.
-  if (CGM.getTarget().getCXXABI().isMicrosoft())
-    return FullName;
-
   // TODO: This is using the RTTI name. Is there a better way to get
   // a unique string for a type?
   llvm::raw_svector_ostream Out(FullName);
@@ -828,10 +839,10 @@
                               /*qualified*/ false);
 
   TemplateSpecializationType::PrintTemplateArgumentList(
-      OS, Ty->getArgs(), Ty->getNumArgs(),
+      OS, Ty->template_arguments(),
       CGM.getContext().getPrintingPolicy());
 
-  TypeAliasDecl *AliasDecl = cast<TypeAliasTemplateDecl>(
+  auto *AliasDecl = cast<TypeAliasTemplateDecl>(
       Ty->getTemplateName().getAsTemplateDecl())->getTemplatedDecl();
 
   SourceLocation Loc = AliasDecl->getLocation();
@@ -853,6 +864,39 @@
       getDeclContextDescriptor(Ty->getDecl()));
 }
 
+static unsigned getDwarfCC(CallingConv CC) {
+  switch (CC) {
+  case CC_C:
+    // Avoid emitting DW_AT_calling_convention if the C convention was used.
+    return 0;
+
+  case CC_X86StdCall:
+    return llvm::dwarf::DW_CC_BORLAND_stdcall;
+  case CC_X86FastCall:
+    return llvm::dwarf::DW_CC_BORLAND_msfastcall;
+  case CC_X86ThisCall:
+    return llvm::dwarf::DW_CC_BORLAND_thiscall;
+  case CC_X86VectorCall:
+    return llvm::dwarf::DW_CC_LLVM_vectorcall;
+  case CC_X86Pascal:
+    return llvm::dwarf::DW_CC_BORLAND_pascal;
+
+  // FIXME: Create new DW_CC_ codes for these calling conventions.
+  case CC_X86_64Win64:
+  case CC_X86_64SysV:
+  case CC_AAPCS:
+  case CC_AAPCS_VFP:
+  case CC_IntelOclBicc:
+  case CC_SpirFunction:
+  case CC_OpenCLKernel:
+  case CC_Swift:
+  case CC_PreserveMost:
+  case CC_PreserveAll:
+    return 0;
+  }
+  return 0;
+}
+
 llvm::DIType *CGDebugInfo::CreateType(const FunctionType *Ty,
                                       llvm::DIFile *Unit) {
   SmallVector<llvm::Metadata *, 16> EltTys;
@@ -864,15 +908,16 @@
   // otherwise emit it as a variadic function.
   if (isa<FunctionNoProtoType>(Ty))
     EltTys.push_back(DBuilder.createUnspecifiedParameter());
-  else if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(Ty)) {
-    for (unsigned i = 0, e = FPT->getNumParams(); i != e; ++i)
-      EltTys.push_back(getOrCreateType(FPT->getParamType(i), Unit));
+  else if (const auto *FPT = dyn_cast<FunctionProtoType>(Ty)) {
+    for (const QualType &ParamType : FPT->param_types())
+      EltTys.push_back(getOrCreateType(ParamType, Unit));
     if (FPT->isVariadic())
       EltTys.push_back(DBuilder.createUnspecifiedParameter());
   }
 
   llvm::DITypeRefArray EltTypeArray = DBuilder.getOrCreateTypeArray(EltTys);
-  return DBuilder.createSubroutineType(EltTypeArray);
+  return DBuilder.createSubroutineType(EltTypeArray, 0,
+                                       getDwarfCC(Ty->getCallConv()));
 }
 
 /// Convert an AccessSpecifier into the corresponding DINode flag.
@@ -901,10 +946,38 @@
   llvm_unreachable("unexpected access enumerator");
 }
 
-llvm::DIType *CGDebugInfo::createFieldType(
-    StringRef name, QualType type, uint64_t sizeInBitsOverride,
-    SourceLocation loc, AccessSpecifier AS, uint64_t offsetInBits,
-    llvm::DIFile *tunit, llvm::DIScope *scope, const RecordDecl *RD) {
+llvm::DIType *CGDebugInfo::createBitFieldType(const FieldDecl *BitFieldDecl,
+                                              llvm::DIScope *RecordTy,
+                                              const RecordDecl *RD) {
+  StringRef Name = BitFieldDecl->getName();
+  QualType Ty = BitFieldDecl->getType();
+  SourceLocation Loc = BitFieldDecl->getLocation();
+  llvm::DIFile *VUnit = getOrCreateFile(Loc);
+  llvm::DIType *DebugType = getOrCreateType(Ty, VUnit);
+
+  // Get the location for the field.
+  llvm::DIFile *File = getOrCreateFile(Loc);
+  unsigned Line = getLineNumber(Loc);
+
+  const CGBitFieldInfo &BitFieldInfo =
+      CGM.getTypes().getCGRecordLayout(RD).getBitFieldInfo(BitFieldDecl);
+  uint64_t SizeInBits = BitFieldInfo.Size;
+  assert(SizeInBits > 0 && "found named 0-width bitfield");
+  unsigned AlignInBits = CGM.getContext().getTypeAlign(Ty);
+  uint64_t StorageOffsetInBits =
+      CGM.getContext().toBits(BitFieldInfo.StorageOffset);
+  uint64_t OffsetInBits = StorageOffsetInBits + BitFieldInfo.Offset;
+  unsigned Flags = getAccessFlag(BitFieldDecl->getAccess(), RD);
+  return DBuilder.createBitFieldMemberType(
+      RecordTy, Name, File, Line, SizeInBits, AlignInBits, OffsetInBits,
+      StorageOffsetInBits, Flags, DebugType);
+}
+
+llvm::DIType *
+CGDebugInfo::createFieldType(StringRef name, QualType type, SourceLocation loc,
+                             AccessSpecifier AS, uint64_t offsetInBits,
+                             llvm::DIFile *tunit, llvm::DIScope *scope,
+                             const RecordDecl *RD) {
   llvm::DIType *debugType = getOrCreateType(type, tunit);
 
   // Get the location for the field.
@@ -917,9 +990,6 @@
     TypeInfo TI = CGM.getContext().getTypeInfo(type);
     SizeInBits = TI.Width;
     AlignInBits = TI.Align;
-
-    if (sizeInBitsOverride)
-      SizeInBits = sizeInBitsOverride;
   }
 
   unsigned flags = getAccessFlag(AS, RD);
@@ -941,19 +1011,15 @@
        I != E; ++I, ++Field, ++fieldno) {
     const LambdaCapture &C = *I;
     if (C.capturesVariable()) {
+      SourceLocation Loc = C.getLocation();
+      assert(!Field->isBitField() && "lambdas don't have bitfield members!");
       VarDecl *V = C.getCapturedVar();
-      llvm::DIFile *VUnit = getOrCreateFile(C.getLocation());
       StringRef VName = V->getName();
-      uint64_t SizeInBitsOverride = 0;
-      if (Field->isBitField()) {
-        SizeInBitsOverride = Field->getBitWidthValue(CGM.getContext());
-        assert(SizeInBitsOverride && "found named 0-width bitfield");
-      }
-      llvm::DIType *fieldType = createFieldType(
-          VName, Field->getType(), SizeInBitsOverride, C.getLocation(),
-          Field->getAccess(), layout.getFieldOffset(fieldno), VUnit, RecordTy,
-          CXXDecl);
-      elements.push_back(fieldType);
+      llvm::DIFile *VUnit = getOrCreateFile(Loc);
+      llvm::DIType *FieldType = createFieldType(
+          VName, Field->getType(), Loc, Field->getAccess(),
+          layout.getFieldOffset(fieldno), VUnit, RecordTy, CXXDecl);
+      elements.push_back(FieldType);
     } else if (C.capturesThis()) {
       // TODO: Need to handle 'this' in some way by probably renaming the
       // this of the lambda class and having a field member of 'this' or
@@ -963,7 +1029,7 @@
       llvm::DIFile *VUnit = getOrCreateFile(f->getLocation());
       QualType type = f->getType();
       llvm::DIType *fieldType = createFieldType(
-          "this", type, 0, f->getLocation(), f->getAccess(),
+          "this", type, f->getLocation(), f->getAccess(),
           layout.getFieldOffset(fieldno), VUnit, RecordTy, CXXDecl);
 
       elements.push_back(fieldType);
@@ -1011,30 +1077,44 @@
   if (name.empty() && !type->isRecordType())
     return;
 
-  uint64_t SizeInBitsOverride = 0;
+  llvm::DIType *FieldType;
   if (field->isBitField()) {
-    SizeInBitsOverride = field->getBitWidthValue(CGM.getContext());
-    assert(SizeInBitsOverride && "found named 0-width bitfield");
+    FieldType = createBitFieldType(field, RecordTy, RD);
+  } else {
+    FieldType =
+        createFieldType(name, type, field->getLocation(), field->getAccess(),
+                        OffsetInBits, tunit, RecordTy, RD);
   }
 
-  llvm::DIType *fieldType =
-      createFieldType(name, type, SizeInBitsOverride, field->getLocation(),
-                      field->getAccess(), OffsetInBits, tunit, RecordTy, RD);
+  elements.push_back(FieldType);
+}
 
-  elements.push_back(fieldType);
+void CGDebugInfo::CollectRecordNestedRecord(
+    const RecordDecl *RD, SmallVectorImpl<llvm::Metadata *> &elements) {
+  QualType Ty = CGM.getContext().getTypeDeclType(RD);
+  // Injected class names are not considered nested records.
+  if (isa<InjectedClassNameType>(Ty))
+    return;
+  SourceLocation Loc = RD->getLocation();
+  llvm::DIType *nestedType = getOrCreateType(Ty, getOrCreateFile(Loc));
+  elements.push_back(nestedType);
 }
 
 void CGDebugInfo::CollectRecordFields(
     const RecordDecl *record, llvm::DIFile *tunit,
     SmallVectorImpl<llvm::Metadata *> &elements,
     llvm::DICompositeType *RecordTy) {
-  const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(record);
+  const auto *CXXDecl = dyn_cast<CXXRecordDecl>(record);
 
   if (CXXDecl && CXXDecl->isLambda())
     CollectRecordLambdaFields(CXXDecl, elements, RecordTy);
   else {
     const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(record);
 
+    // Debug info for nested records is included in the member list only for
+    // CodeView.
+    bool IncludeNestedRecords = CGM.getCodeGenOpts().EmitCodeView;
+
     // Field number for non-static fields.
     unsigned fieldNo = 0;
 
@@ -1042,6 +1122,8 @@
     // the corresponding declarations in the source program.
     for (const auto *I : record->decls())
       if (const auto *V = dyn_cast<VarDecl>(I)) {
+        if (V->hasAttr<NoDebugAttr>())
+          continue;
         // Reuse the existing static member declaration if one exists
         auto MI = StaticDataMemberCache.find(V->getCanonicalDecl());
         if (MI != StaticDataMemberCache.end()) {
@@ -1058,7 +1140,10 @@
 
         // Bump field number for next field.
         ++fieldNo;
-      }
+      } else if (const auto *nestedRec = dyn_cast<CXXRecordDecl>(I))
+        if (IncludeNestedRecords && !nestedRec->isImplicit() &&
+            nestedRec->getDeclContext() == record)
+          CollectRecordNestedRecord(nestedRec, elements);
   }
 }
 
@@ -1123,13 +1208,14 @@
   if (Func->getExtProtoInfo().RefQualifier == RQ_RValue)
     Flags |= llvm::DINode::FlagRValueReference;
 
-  return DBuilder.createSubroutineType(EltTypeArray, Flags);
+  return DBuilder.createSubroutineType(EltTypeArray, Flags,
+                                       getDwarfCC(Func->getCallConv()));
 }
 
 /// isFunctionLocalClass - Return true if CXXRecordDecl is defined
 /// inside a function.
 static bool isFunctionLocalClass(const CXXRecordDecl *RD) {
-  if (const CXXRecordDecl *NRD = dyn_cast<CXXRecordDecl>(RD->getDeclContext()))
+  if (const auto *NRD = dyn_cast<CXXRecordDecl>(RD->getDeclContext()))
     return isFunctionLocalClass(NRD);
   if (isa<FunctionDecl>(RD->getDeclContext()))
     return true;
@@ -1147,6 +1233,11 @@
   // Since a single ctor/dtor corresponds to multiple functions, it doesn't
   // make sense to give a single ctor/dtor a linkage name.
   StringRef MethodLinkageName;
+  // FIXME: 'isFunctionLocalClass' seems like an arbitrary/unintentional
+  // property to use here. It may've been intended to model "is non-external
+  // type" but misses cases of non-function-local but non-external classes such
+  // as those in anonymous namespaces as well as the reverse - external types
+  // that are function local, such as those in (non-local) inline functions.
   if (!IsCtorOrDtor && !isFunctionLocalClass(Method->getParent()))
     MethodLinkageName = CGM.getMangledName(Method);
 
@@ -1162,6 +1253,8 @@
   llvm::DIType *ContainingType = nullptr;
   unsigned Virtuality = 0;
   unsigned VIndex = 0;
+  unsigned Flags = 0;
+  int ThisAdjustment = 0;
 
   if (Method->isVirtual()) {
     if (Method->isPure())
@@ -1169,26 +1262,45 @@
     else
       Virtuality = llvm::dwarf::DW_VIRTUALITY_virtual;
 
-    // It doesn't make sense to give a virtual destructor a vtable index,
-    // since a single destructor has two entries in the vtable.
-    // FIXME: Add proper support for debug info for virtual calls in
-    // the Microsoft ABI, where we may use multiple vptrs to make a vftable
-    // lookup if we have multiple or virtual inheritance.
-    if (!isa<CXXDestructorDecl>(Method) &&
-        !CGM.getTarget().getCXXABI().isMicrosoft())
-      VIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(Method);
+    if (CGM.getTarget().getCXXABI().isItaniumFamily()) {
+      // It doesn't make sense to give a virtual destructor a vtable index,
+      // since a single destructor has two entries in the vtable.
+      if (!isa<CXXDestructorDecl>(Method))
+        VIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(Method);
+    } else {
+      // Emit MS ABI vftable information.  There is only one entry for the
+      // deleting dtor.
+      const auto *DD = dyn_cast<CXXDestructorDecl>(Method);
+      GlobalDecl GD = DD ? GlobalDecl(DD, Dtor_Deleting) : GlobalDecl(Method);
+      MicrosoftVTableContext::MethodVFTableLocation ML =
+          CGM.getMicrosoftVTableContext().getMethodVFTableLocation(GD);
+      VIndex = ML.Index;
+
+      // CodeView only records the vftable offset in the class that introduces
+      // the virtual method. This is possible because, unlike Itanium, the MS
+      // C++ ABI does not include all virtual methods from non-primary bases in
+      // the vtable for the most derived class. For example, if C inherits from
+      // A and B, C's primary vftable will not include B's virtual methods.
+      if (Method->begin_overridden_methods() == Method->end_overridden_methods())
+        Flags |= llvm::DINode::FlagIntroducedVirtual;
+
+      // The 'this' adjustment accounts for both the virtual and non-virtual
+      // portions of the adjustment. Presumably the debugger only uses it when
+      // it knows the dynamic type of an object.
+      ThisAdjustment = CGM.getCXXABI()
+                           .getVirtualFunctionPrologueThisAdjustment(GD)
+                           .getQuantity();
+    }
     ContainingType = RecordTy;
   }
 
-  unsigned Flags = 0;
   if (Method->isImplicit())
     Flags |= llvm::DINode::FlagArtificial;
   Flags |= getAccessFlag(Method->getAccess(), Method->getParent());
-  if (const CXXConstructorDecl *CXXC = dyn_cast<CXXConstructorDecl>(Method)) {
+  if (const auto *CXXC = dyn_cast<CXXConstructorDecl>(Method)) {
     if (CXXC->isExplicit())
       Flags |= llvm::DINode::FlagExplicit;
-  } else if (const CXXConversionDecl *CXXC =
-                 dyn_cast<CXXConversionDecl>(Method)) {
+  } else if (const auto *CXXC = dyn_cast<CXXConversionDecl>(Method)) {
     if (CXXC->isExplicit())
       Flags |= llvm::DINode::FlagExplicit;
   }
@@ -1202,9 +1314,9 @@
   llvm::DINodeArray TParamsArray = CollectFunctionTemplateParams(Method, Unit);
   llvm::DISubprogram *SP = DBuilder.createMethod(
       RecordTy, MethodName, MethodLinkageName, MethodDefUnit, MethodLine,
-      MethodTy, /*isLocalToUnit=*/false,
-      /* isDefinition=*/false, Virtuality, VIndex, ContainingType, Flags,
-      CGM.getLangOpts().Optimize, TParamsArray.get());
+      MethodTy, /*isLocalToUnit=*/false, /*isDefinition=*/false, Virtuality,
+      VIndex, ThisAdjustment, ContainingType, Flags, CGM.getLangOpts().Optimize,
+      TParamsArray.get());
 
   SPCache[Method->getCanonicalDecl()].reset(SP);
 
@@ -1257,7 +1369,7 @@
     unsigned BFlags = 0;
     uint64_t BaseOffset;
 
-    const CXXRecordDecl *Base =
+    const auto *Base =
         cast<CXXRecordDecl>(BI.getType()->getAs<RecordType>()->getDecl());
 
     if (BI.isVirtual()) {
@@ -1345,8 +1457,7 @@
       llvm::Constant *V = nullptr;
       // Special case member data pointer null values since they're actually -1
       // instead of zero.
-      if (const MemberPointerType *MPT =
-              dyn_cast<MemberPointerType>(T.getTypePtr()))
+      if (const auto *MPT = dyn_cast<MemberPointerType>(T.getTypePtr()))
         // But treat member function pointers as simple zero integers because
         // it's easier than having a special case in LLVM's CodeGen. If LLVM
         // CodeGen grows handling for values of non-null member function
@@ -1357,7 +1468,7 @@
       if (!V)
         V = llvm::ConstantInt::get(CGM.Int8Ty, 0);
       TemplateParams.push_back(DBuilder.createTemplateValueParameter(
-          TheCU, Name, TTy, cast<llvm::Constant>(V)));
+          TheCU, Name, TTy, V));
     } break;
     case TemplateArgument::Template:
       TemplateParams.push_back(DBuilder.createTemplateTemplateParameter(
@@ -1378,7 +1489,7 @@
       assert(V && "Expression in template argument isn't constant");
       llvm::DIType *TTy = getOrCreateType(T, Unit);
       TemplateParams.push_back(DBuilder.createTemplateValueParameter(
-          TheCU, Name, TTy, cast<llvm::Constant>(V->stripPointerCasts())));
+          TheCU, Name, TTy, V->stripPointerCasts()));
     } break;
     // And the following should never occur:
     case TemplateArgument::TemplateExpansion:
@@ -1501,7 +1612,7 @@
   if (DebugKind <= codegenoptions::DebugLineTablesOnly)
     return;
 
-  if (const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
+  if (const auto *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
     if (CXXDecl->isDynamicClass())
       return;
 
@@ -1529,22 +1640,25 @@
 
 static bool hasExplicitMemberDefinition(CXXRecordDecl::method_iterator I,
                                         CXXRecordDecl::method_iterator End) {
-  for (; I != End; ++I)
-    if (FunctionDecl *Tmpl = I->getInstantiatedFromMemberFunction())
+  for (CXXMethodDecl *MD : llvm::make_range(I, End))
+    if (FunctionDecl *Tmpl = MD->getInstantiatedFromMemberFunction())
       if (!Tmpl->isImplicit() && Tmpl->isThisDeclarationADefinition() &&
-          !I->getMemberSpecializationInfo()->isExplicitSpecialization())
+          !MD->getMemberSpecializationInfo()->isExplicitSpecialization())
         return true;
   return false;
 }
 
 /// Does a type definition exist in an imported clang module?
 static bool isDefinedInClangModule(const RecordDecl *RD) {
+  // Only definitions that were imported from an AST file come from a module.
   if (!RD || !RD->isFromASTFile())
     return false;
+  // Anonymous entities cannot be addressed. Treat them as not from a module.
   if (!RD->isExternallyVisible() && RD->getName().empty())
     return false;
   if (auto *CXXDecl = dyn_cast<CXXRecordDecl>(RD)) {
-    assert(CXXDecl->isCompleteDefinition() && "incomplete record definition");
+    if (!CXXDecl->isCompleteDefinition())
+      return false;
     auto TemplateKind = CXXDecl->getTemplateSpecializationKind();
     if (TemplateKind != TSK_Undeclared) {
       // This is a template, check the origin of the first member.
@@ -1572,17 +1686,21 @@
   if (!RD->isCompleteDefinitionRequired())
     return true;
 
-  const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
+  const auto *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
 
   if (!CXXDecl)
     return false;
 
-  if (CXXDecl->hasDefinition() && CXXDecl->isDynamicClass())
+  // Only emit complete debug info for a dynamic class when its vtable is
+  // emitted.  However, Microsoft debuggers don't resolve type information
+  // across DLL boundaries, so skip this optimization if the class is marked
+  // dllimport.
+  if (CXXDecl->hasDefinition() && CXXDecl->isDynamicClass() &&
+      !CXXDecl->hasAttr<DLLImportAttr>())
     return true;
 
   TemplateSpecializationKind Spec = TSK_Undeclared;
-  if (const ClassTemplateSpecializationDecl *SD =
-          dyn_cast<ClassTemplateSpecializationDecl>(RD))
+  if (const auto *SD = dyn_cast<ClassTemplateSpecializationDecl>(RD))
     Spec = SD->getSpecializationKind();
 
   if (Spec == TSK_ExplicitInstantiationDeclaration &&
@@ -1624,7 +1742,7 @@
   if (!D || !D->isCompleteDefinition())
     return FwdDecl;
 
-  if (const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
+  if (const auto *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
     CollectContainingType(CXXDecl, FwdDecl);
 
   // Push the struct on region stack.
@@ -1639,7 +1757,7 @@
   // gdb tests will depend on a certain ordering at printout. The debug
   // information offsets are still correct if we merge them all together
   // though.
-  const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
+  const auto *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
   if (CXXDecl) {
     CollectCXXBases(CXXDecl, DefUnit, EltTys, FwdDecl);
     CollectVTableInfo(CXXDecl, DefUnit, EltTys);
@@ -1972,7 +2090,7 @@
   uint64_t Align;
 
   // FIXME: make getTypeAlign() aware of VLAs and incomplete array types
-  if (const VariableArrayType *VAT = dyn_cast<VariableArrayType>(Ty)) {
+  if (const auto *VAT = dyn_cast<VariableArrayType>(Ty)) {
     Size = 0;
     Align =
         CGM.getContext().getTypeAlign(CGM.getContext().getBaseElementType(VAT));
@@ -2005,7 +2123,7 @@
     //     int x[0];
     //   };
     int64_t Count = -1; // Count == -1 is an unbounded array.
-    if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Ty))
+    if (const auto *CAT = dyn_cast<ConstantArrayType>(Ty))
       Count = CAT->getSize().getZExtValue();
 
     // FIXME: Verify this is right for VLAs.
@@ -2033,12 +2151,35 @@
 
 llvm::DIType *CGDebugInfo::CreateType(const MemberPointerType *Ty,
                                       llvm::DIFile *U) {
-  uint64_t Size =
-      !Ty->isIncompleteType() ? CGM.getContext().getTypeSize(Ty) : 0;
+  unsigned Flags = 0;
+  uint64_t Size = 0;
+
+  if (!Ty->isIncompleteType()) {
+    Size = CGM.getContext().getTypeSize(Ty);
+
+    // Set the MS inheritance model. There is no flag for the unspecified model.
+    if (CGM.getTarget().getCXXABI().isMicrosoft()) {
+      switch (Ty->getMostRecentCXXRecordDecl()->getMSInheritanceModel()) {
+      case MSInheritanceAttr::Keyword_single_inheritance:
+        Flags |= llvm::DINode::FlagSingleInheritance;
+        break;
+      case MSInheritanceAttr::Keyword_multiple_inheritance:
+        Flags |= llvm::DINode::FlagMultipleInheritance;
+        break;
+      case MSInheritanceAttr::Keyword_virtual_inheritance:
+        Flags |= llvm::DINode::FlagVirtualInheritance;
+        break;
+      case MSInheritanceAttr::Keyword_unspecified_inheritance:
+        break;
+      }
+    }
+  }
+
   llvm::DIType *ClassType = getOrCreateType(QualType(Ty->getClass(), 0), U);
   if (Ty->isMemberDataPointerType())
     return DBuilder.createMemberPointerType(
-        getOrCreateType(Ty->getPointeeType(), U), ClassType, Size);
+        getOrCreateType(Ty->getPointeeType(), U), ClassType, Size, /*Align=*/0,
+        Flags);
 
   const FunctionProtoType *FPT =
       Ty->getPointeeType()->getAs<FunctionProtoType>();
@@ -2046,7 +2187,7 @@
       getOrCreateInstanceMethodType(CGM.getContext().getPointerType(QualType(
                                         Ty->getClass(), FPT->getTypeQuals())),
                                     FPT, U),
-      ClassType, Size);
+      ClassType, Size, /*Align=*/0, Flags);
 }
 
 llvm::DIType *CGDebugInfo::CreateType(const AtomicType *Ty, llvm::DIFile *U) {
@@ -2440,8 +2581,7 @@
   RegionMap[Ty->getDecl()].reset(RealDecl);
   TypeCache[QualType(Ty, 0).getAsOpaquePtr()].reset(RealDecl);
 
-  if (const ClassTemplateSpecializationDecl *TSpecial =
-          dyn_cast<ClassTemplateSpecializationDecl>(RD))
+  if (const auto *TSpecial = dyn_cast<ClassTemplateSpecializationDecl>(RD))
     DBuilder.replaceArrays(RealDecl, llvm::DINodeArray(),
                            CollectCXXTemplateParams(TSpecial, DefUnit));
   return RealDecl;
@@ -2488,7 +2628,7 @@
                                            llvm::DIScope *&FDContext,
                                            llvm::DINodeArray &TParamsArray,
                                            unsigned &Flags) {
-  const FunctionDecl *FD = cast<FunctionDecl>(GD.getDecl());
+  const auto *FD = cast<FunctionDecl>(GD.getDecl());
   Name = getFunctionName(FD);
   // Use mangled name as linkage name for C/C++ functions.
   if (FD->hasPrototype()) {
@@ -2512,6 +2652,9 @@
       llvm::DIScope *Mod = getParentModuleOrNull(RDecl);
       FDContext = getContextDescriptor(RDecl, Mod ? Mod : TheCU);
     }
+    // Check if it is a noreturn-marked function.
+    if (FD->isNoReturn())
+      Flags |= llvm::DINode::FlagNoReturn;
     // Collect template parameters.
     TParamsArray = CollectFunctionTemplateParams(FD, Unit);
   }
@@ -2580,15 +2723,15 @@
   SmallVector<QualType, 16> ArgTypes;
   for (const ParmVarDecl *Parm: FD->parameters())
     ArgTypes.push_back(Parm->getType());
-  QualType FnType =
-    CGM.getContext().getFunctionType(FD->getReturnType(), ArgTypes,
-                                     FunctionProtoType::ExtProtoInfo());
+  CallingConv CC = FD->getType()->castAs<FunctionType>()->getCallConv();
+  QualType FnType = CGM.getContext().getFunctionType(
+      FD->getReturnType(), ArgTypes, FunctionProtoType::ExtProtoInfo(CC));
   llvm::DISubprogram *SP = DBuilder.createTempFunctionFwdDecl(
       DContext, Name, LinkageName, Unit, Line,
       getOrCreateFunctionType(FD, FnType, Unit), !FD->isExternallyVisible(),
       /* isDefinition = */ false, 0, Flags, CGM.getLangOpts().Optimize,
       TParamsArray.get(), getFunctionDeclaration(FD));
-  const FunctionDecl *CanonDecl = cast<FunctionDecl>(FD->getCanonicalDecl());
+  const auto *CanonDecl = cast<FunctionDecl>(FD->getCanonicalDecl());
   FwdDeclReplaceMap.emplace_back(std::piecewise_construct,
                                  std::make_tuple(CanonDecl),
                                  std::make_tuple(SP));
@@ -2620,7 +2763,7 @@
   // we would otherwise do to get a type for a pointee. (forward declarations in
   // limited debug info, full definitions (if the type definition is available)
   // in unlimited debug info)
-  if (const TypeDecl *TD = dyn_cast<TypeDecl>(D))
+  if (const auto *TD = dyn_cast<TypeDecl>(D))
     return getOrCreateType(CGM.getContext().getTypeDeclType(TD),
                            getOrCreateFile(TD->getLocation()));
   auto I = DeclCache.find(D->getCanonicalDecl());
@@ -2630,7 +2773,7 @@
 
   // No definition for now. Emit a forward definition that might be
   // merged with a potential upcoming definition.
-  if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D))
+  if (const auto *FD = dyn_cast<FunctionDecl>(D))
     return getFunctionForwardDeclaration(FD);
   else if (const auto *VD = dyn_cast<VarDecl>(D))
     return getGlobalVariableForwardDeclaration(VD);
@@ -2642,7 +2785,7 @@
   if (!D || DebugKind <= codegenoptions::DebugLineTablesOnly)
     return nullptr;
 
-  const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
+  const auto *FD = dyn_cast<FunctionDecl>(D);
   if (!FD)
     return nullptr;
 
@@ -2651,8 +2794,7 @@
 
   auto MI = SPCache.find(FD->getCanonicalDecl());
   if (MI == SPCache.end()) {
-    if (const CXXMethodDecl *MD =
-            dyn_cast<CXXMethodDecl>(FD->getCanonicalDecl())) {
+    if (const auto *MD = dyn_cast<CXXMethodDecl>(FD->getCanonicalDecl())) {
       return CreateCXXMemberFunction(MD, getOrCreateFile(MD->getLocation()),
                                      cast<llvm::DICompositeType>(S));
     }
@@ -2684,9 +2826,13 @@
     // subprogram DIE will miss DW_AT_decl_file and DW_AT_decl_line fields.
     return DBuilder.createSubroutineType(DBuilder.getOrCreateTypeArray(None));
 
-  if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D))
+  if (const auto *Method = dyn_cast<CXXMethodDecl>(D))
     return getOrCreateMethodType(Method, F);
-  if (const ObjCMethodDecl *OMethod = dyn_cast<ObjCMethodDecl>(D)) {
+
+  const auto *FTy = FnType->getAs<FunctionType>();
+  CallingConv CC = FTy ? FTy->getCallConv() : CallingConv::CC_C;
+
+  if (const auto *OMethod = dyn_cast<ObjCMethodDecl>(D)) {
     // Add "self" and "_cmd"
     SmallVector<llvm::Metadata *, 16> Elts;
 
@@ -2712,28 +2858,28 @@
     Elts.push_back(DBuilder.createArtificialType(
         getOrCreateType(CGM.getContext().getObjCSelType(), F)));
     // Get rest of the arguments.
-    for (const auto *PI : OMethod->params())
+    for (const auto *PI : OMethod->parameters())
       Elts.push_back(getOrCreateType(PI->getType(), F));
     // Variadic methods need a special marker at the end of the type list.
     if (OMethod->isVariadic())
       Elts.push_back(DBuilder.createUnspecifiedParameter());
 
     llvm::DITypeRefArray EltTypeArray = DBuilder.getOrCreateTypeArray(Elts);
-    return DBuilder.createSubroutineType(EltTypeArray);
+    return DBuilder.createSubroutineType(EltTypeArray, 0, getDwarfCC(CC));
   }
 
   // Handle variadic function types; they need an additional
   // unspecified parameter.
-  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
+  if (const auto *FD = dyn_cast<FunctionDecl>(D))
     if (FD->isVariadic()) {
       SmallVector<llvm::Metadata *, 16> EltTys;
       EltTys.push_back(getOrCreateType(FD->getReturnType(), F));
-      if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(FnType))
-        for (unsigned i = 0, e = FPT->getNumParams(); i != e; ++i)
-          EltTys.push_back(getOrCreateType(FPT->getParamType(i), F));
+      if (const auto *FPT = dyn_cast<FunctionProtoType>(FnType))
+        for (QualType ParamType : FPT->param_types())
+          EltTys.push_back(getOrCreateType(ParamType, F));
       EltTys.push_back(DBuilder.createUnspecifiedParameter());
       llvm::DITypeRefArray EltTypeArray = DBuilder.getOrCreateTypeArray(EltTys);
-      return DBuilder.createSubroutineType(EltTypeArray);
+      return DBuilder.createSubroutineType(EltTypeArray, 0, getDwarfCC(CC));
     }
 
   return cast<llvm::DISubroutineType>(getOrCreateType(FnType, F));
@@ -2758,7 +2904,7 @@
   if (!HasDecl) {
     // Use llvm function name.
     LinkageName = Fn->getName();
-  } else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
+  } else if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
     // If there is a subprogram for this function available then use it.
     auto FI = SPCache.find(FD->getCanonicalDecl());
     if (FI != SPCache.end()) {
@@ -2771,7 +2917,7 @@
     }
     collectFunctionDeclProps(GD, Unit, Name, LinkageName, FDContext,
                              TParamsArray, Flags);
-  } else if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D)) {
+  } else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D)) {
     Name = getObjCMethodName(OMD);
     Flags |= llvm::DINode::FlagPrototyped;
   } else {
@@ -2779,7 +2925,7 @@
     Name = Fn->getName();
     Flags |= llvm::DINode::FlagPrototyped;
   }
-  if (!Name.empty() && Name[0] == '\01')
+  if (Name.startswith("\01"))
     Name = Name.substr(1);
 
   if (!HasDecl || D->isImplicit()) {
@@ -2798,7 +2944,7 @@
   // are emitted as CU level entities by the backend.
   llvm::DISubprogram *SP = DBuilder.createFunction(
       FDContext, Name, LinkageName, Unit, LineNo,
-      getOrCreateFunctionType(D, FnType, Unit), Fn->hasInternalLinkage(),
+      getOrCreateFunctionType(D, FnType, Unit), Fn->hasLocalLinkage(),
       true /*definition*/, ScopeLine, Flags, CGM.getLangOpts().Optimize,
       TParamsArray.get(), getFunctionDeclaration(D));
   Fn->setSubprogram(SP);
@@ -2806,7 +2952,7 @@
   // code for the initialization of globals. Do not record these decls
   // as they will overwrite the actual VarDecl Decl in the cache.
   if (HasDecl && isa<FunctionDecl>(D))
-    DeclCache[D->getCanonicalDecl()].reset(static_cast<llvm::Metadata *>(SP));
+    DeclCache[D->getCanonicalDecl()].reset(SP);
 
   // Push the function onto the lexical block stack.
   LexicalBlockStack.emplace_back(SP);
@@ -2832,7 +2978,7 @@
     // If there is a DISubprogram for this function available then use it.
     collectFunctionDeclProps(GD, Unit, Name, LinkageName, FDContext,
                              TParamsArray, Flags);
-  } else if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D)) {
+  } else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D)) {
     Name = getObjCMethodName(OMD);
     Flags |= llvm::DINode::FlagPrototyped;
   } else {
@@ -2998,6 +3144,8 @@
                               CGBuilderTy &Builder) {
   assert(DebugKind >= codegenoptions::LimitedDebugInfo);
   assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!");
+  if (VD->hasAttr<NoDebugAttr>())
+    return;
 
   bool Unwritten =
       VD->isImplicit() || (isa<Decl>(VD->getDeclContext()) &&
@@ -3035,7 +3183,7 @@
   // otherwise it is 'self' or 'this'.
   if (isa<ImplicitParamDecl>(VD) && ArgNo && *ArgNo == 1)
     Flags |= llvm::DINode::FlagObjectPointer;
-  if (llvm::Argument *Arg = dyn_cast<llvm::Argument>(Storage))
+  if (auto *Arg = dyn_cast<llvm::Argument>(Storage))
     if (Arg->getType()->isPointerTy() && !Arg->hasByValAttr() &&
         !VD->getType()->isPointerType())
       Expr.push_back(llvm::dwarf::DW_OP_deref);
@@ -3071,10 +3219,10 @@
       return;
     } else if (isa<VariableArrayType>(VD->getType()))
       Expr.push_back(llvm::dwarf::DW_OP_deref);
-  } else if (const RecordType *RT = dyn_cast<RecordType>(VD->getType())) {
+  } else if (const auto *RT = dyn_cast<RecordType>(VD->getType())) {
     // If VD is an anonymous union then Storage represents value for
     // all union fields.
-    const RecordDecl *RD = cast<RecordDecl>(RT->getDecl());
+    const auto *RD = cast<RecordDecl>(RT->getDecl());
     if (RD->isUnion() && RD->isAnonymousStructOrUnion()) {
       // GDB has trouble finding local variables in anonymous unions, so we emit
       // artifical local variables for each of the members.
@@ -3142,6 +3290,8 @@
 
   if (Builder.GetInsertBlock() == nullptr)
     return;
+  if (VD->hasAttr<NoDebugAttr>())
+    return;
 
   bool isByRef = VD->hasAttr<BlocksAttr>();
 
@@ -3241,25 +3391,25 @@
       CGM.getDataLayout().getStructLayout(block.StructureType);
 
   SmallVector<llvm::Metadata *, 16> fields;
-  fields.push_back(createFieldType("__isa", C.VoidPtrTy, 0, loc, AS_public,
+  fields.push_back(createFieldType("__isa", C.VoidPtrTy, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(0),
                                    tunit, tunit));
-  fields.push_back(createFieldType("__flags", C.IntTy, 0, loc, AS_public,
+  fields.push_back(createFieldType("__flags", C.IntTy, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(1),
                                    tunit, tunit));
-  fields.push_back(createFieldType("__reserved", C.IntTy, 0, loc, AS_public,
+  fields.push_back(createFieldType("__reserved", C.IntTy, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(2),
                                    tunit, tunit));
   auto *FnTy = block.getBlockExpr()->getFunctionType();
   auto FnPtrType = CGM.getContext().getPointerType(FnTy->desugar());
-  fields.push_back(createFieldType("__FuncPtr", FnPtrType, 0, loc, AS_public,
+  fields.push_back(createFieldType("__FuncPtr", FnPtrType, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(3),
                                    tunit, tunit));
   fields.push_back(createFieldType(
       "__descriptor", C.getPointerType(block.NeedsCopyDispose
                                            ? C.getBlockDescriptorExtendedType()
                                            : C.getBlockDescriptorType()),
-      0, loc, AS_public, blockLayout->getElementOffsetInBits(4), tunit, tunit));
+      loc, AS_public, blockLayout->getElementOffsetInBits(4), tunit, tunit));
 
   // We want to sort the captures by offset, not because DWARF
   // requires this, but because we're paranoid about debuggers.
@@ -3293,11 +3443,9 @@
   // Sort by offset.
   llvm::array_pod_sort(chunks.begin(), chunks.end());
 
-  for (SmallVectorImpl<BlockLayoutChunk>::iterator i = chunks.begin(),
-                                                   e = chunks.end();
-       i != e; ++i) {
-    uint64_t offsetInBits = i->OffsetInBits;
-    const BlockDecl::Capture *capture = i->Capture;
+  for (const BlockLayoutChunk &Chunk : chunks) {
+    uint64_t offsetInBits = Chunk.OffsetInBits;
+    const BlockDecl::Capture *capture = Chunk.Capture;
 
     // If we have a null capture, this must be the C++ 'this' capture.
     if (!capture) {
@@ -3310,7 +3458,7 @@
       else
         llvm_unreachable("unexpected block declcontext");
 
-      fields.push_back(createFieldType("this", type, 0, loc, AS_public,
+      fields.push_back(createFieldType("this", type, loc, AS_public,
                                        offsetInBits, tunit, tunit));
       continue;
     }
@@ -3330,7 +3478,7 @@
           DBuilder.createMemberType(tunit, name, tunit, line, PtrInfo.Width,
                                     PtrInfo.Align, offsetInBits, 0, fieldType);
     } else {
-      fieldType = createFieldType(name, variable->getType(), 0, loc, AS_public,
+      fieldType = createFieldType(name, variable->getType(), loc, AS_public,
                                   offsetInBits, tunit, tunit);
     }
     fields.push_back(fieldType);
@@ -3399,8 +3547,7 @@
 
     // Ignore unnamed fields, but recurse into anonymous records.
     if (FieldName.empty()) {
-      const RecordType *RT = dyn_cast<RecordType>(Field->getType());
-      if (RT)
+      if (const auto *RT = dyn_cast<RecordType>(Field->getType()))
         GV = CollectAnonRecordDecls(RT->getDecl(), Unit, LineNo, LinkageName,
                                     Var, DContext);
       continue;
@@ -3408,7 +3555,7 @@
     // Use VarDecl's Tag, Scope and Line number.
     GV = DBuilder.createGlobalVariable(DContext, FieldName, LinkageName, Unit,
                                        LineNo, FieldTy,
-                                       Var->hasInternalLinkage(), Var, nullptr);
+                                       Var->hasLocalLinkage(), Var, nullptr);
   }
   return GV;
 }
@@ -3416,6 +3563,8 @@
 void CGDebugInfo::EmitGlobalVariable(llvm::GlobalVariable *Var,
                                      const VarDecl *D) {
   assert(DebugKind >= codegenoptions::LimitedDebugInfo);
+  if (D->hasAttr<NoDebugAttr>())
+    return;
   // Create global variable debug descriptor.
   llvm::DIFile *Unit = nullptr;
   llvm::DIScope *DContext = nullptr;
@@ -3439,21 +3588,23 @@
   } else {
     GV = DBuilder.createGlobalVariable(
         DContext, DeclName, LinkageName, Unit, LineNo, getOrCreateType(T, Unit),
-        Var->hasInternalLinkage(), Var,
+        Var->hasLocalLinkage(), Var,
         getOrCreateStaticDataMemberDeclarationOrNull(D));
   }
-  DeclCache[D->getCanonicalDecl()].reset(static_cast<llvm::Metadata *>(GV));
+  DeclCache[D->getCanonicalDecl()].reset(GV);
 }
 
 void CGDebugInfo::EmitGlobalVariable(const ValueDecl *VD,
                                      llvm::Constant *Init) {
   assert(DebugKind >= codegenoptions::LimitedDebugInfo);
+  if (VD->hasAttr<NoDebugAttr>())
+    return;
   // Create the descriptor for the variable.
   llvm::DIFile *Unit = getOrCreateFile(VD->getLocation());
   StringRef Name = VD->getName();
   llvm::DIType *Ty = getOrCreateType(VD->getType(), Unit);
-  if (const EnumConstantDecl *ECD = dyn_cast<EnumConstantDecl>(VD)) {
-    const EnumDecl *ED = cast<EnumDecl>(ECD->getDeclContext());
+  if (const auto *ECD = dyn_cast<EnumConstantDecl>(VD)) {
+    const auto *ED = cast<EnumDecl>(ECD->getDeclContext());
     assert(isa<EnumType>(ED->getTypeForDecl()) && "Enum without EnumType?");
     Ty = getOrCreateType(QualType(ED->getTypeForDecl(), 0), Unit);
   }
@@ -3500,8 +3651,8 @@
   if (CGM.getCodeGenOpts().getDebugInfo() < codegenoptions::LimitedDebugInfo)
     return;
   const NamespaceDecl *NSDecl = UD.getNominatedNamespace();
-  if (!NSDecl->isAnonymousNamespace() || 
-      CGM.getCodeGenOpts().DebugExplicitImport) { 
+  if (!NSDecl->isAnonymousNamespace() ||
+      CGM.getCodeGenOpts().DebugExplicitImport) {
     DBuilder.createImportedModule(
         getCurrentContextDescriptor(cast<Decl>(UD.getDeclContext())),
         getOrCreateNameSpace(NSDecl),
@@ -3517,6 +3668,16 @@
   // Emitting one decl is sufficient - debuggers can detect that this is an
   // overloaded name & provide lookup for all the overloads.
   const UsingShadowDecl &USD = **UD.shadow_begin();
+
+  // FIXME: Skip functions with undeduced auto return type for now since we
+  // don't currently have the plumbing for separate declarations & definitions
+  // of free functions and mismatched types (auto in the declaration, concrete
+  // return type in the definition)
+  if (const auto *FD = dyn_cast<FunctionDecl>(USD.getUnderlyingDecl()))
+    if (const auto *AT =
+            FD->getType()->getAs<FunctionProtoType>()->getContainedAutoType())
+      if (AT->getDeducedType().isNull())
+        return;
   if (llvm::DINode *Target =
           getDeclarationOrDefinition(USD.getUnderlyingDecl()))
     DBuilder.createImportedDeclaration(
@@ -3525,6 +3686,8 @@
 }
 
 void CGDebugInfo::EmitImportDecl(const ImportDecl &ID) {
+  if (CGM.getCodeGenOpts().getDebuggerTuning() != llvm::DebuggerKind::LLDB)
+    return;
   if (Module *M = ID.getImportedModule()) {
     auto Info = ExternalASTSource::ASTSourceDescriptor(*M);
     DBuilder.createImportedDeclaration(
@@ -3542,7 +3705,7 @@
   if (VH)
     return cast<llvm::DIImportedEntity>(VH);
   llvm::DIImportedEntity *R;
-  if (const NamespaceAliasDecl *Underlying =
+  if (const auto *Underlying =
           dyn_cast<NamespaceAliasDecl>(NA.getAliasedNamespace()))
     // This could cache & dedup here rather than relying on metadata deduping.
     R = DBuilder.createImportedDeclaration(
diff --git a/lib/CodeGen/CGDebugInfo.h b/lib/CodeGen/CGDebugInfo.h
index 9e05461..71c0df4 100644
--- a/lib/CodeGen/CGDebugInfo.h
+++ b/lib/CodeGen/CGDebugInfo.h
@@ -64,18 +64,10 @@
   llvm::DIType *ClassTy = nullptr;
   llvm::DICompositeType *ObjTy = nullptr;
   llvm::DIType *SelTy = nullptr;
-  llvm::DIType *OCLImage1dDITy = nullptr;
-  llvm::DIType *OCLImage1dArrayDITy = nullptr;
-  llvm::DIType *OCLImage1dBufferDITy = nullptr;
-  llvm::DIType *OCLImage2dDITy = nullptr;
-  llvm::DIType *OCLImage2dArrayDITy = nullptr;
-  llvm::DIType *OCLImage2dDepthDITy = nullptr;
-  llvm::DIType *OCLImage2dArrayDepthDITy = nullptr;
-  llvm::DIType *OCLImage2dMSAADITy = nullptr;
-  llvm::DIType *OCLImage2dArrayMSAADITy = nullptr;
-  llvm::DIType *OCLImage2dMSAADepthDITy = nullptr;
-  llvm::DIType *OCLImage2dArrayMSAADepthDITy = nullptr;
-  llvm::DIType *OCLImage3dDITy = nullptr;
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  llvm::DIType *SingletonId = nullptr;
+#include "clang/Basic/OpenCLImageTypes.def"
+  llvm::DIType *OCLSamplerDITy = nullptr;
   llvm::DIType *OCLEventDITy = nullptr;
   llvm::DIType *OCLClkEventDITy = nullptr;
   llvm::DIType *OCLQueueDITy = nullptr;
@@ -109,7 +101,7 @@
   /// compilation.
   std::vector<std::pair<const TagType *, llvm::TrackingMDRef>> ReplaceMap;
 
-  /// Cache of replaceable forward declarartions (functions and
+  /// Cache of replaceable forward declarations (functions and
   /// variables) to RAUW at the end of compilation.
   std::vector<std::pair<const DeclaratorDecl *, llvm::TrackingMDRef>>
       FwdDeclReplaceMap;
@@ -241,11 +233,16 @@
                            llvm::DIFile *F);
 
   llvm::DIType *createFieldType(StringRef name, QualType type,
-                                uint64_t sizeInBitsOverride, SourceLocation loc,
-                                AccessSpecifier AS, uint64_t offsetInBits,
-                                llvm::DIFile *tunit, llvm::DIScope *scope,
+                                SourceLocation loc, AccessSpecifier AS,
+                                uint64_t offsetInBits, llvm::DIFile *tunit,
+                                llvm::DIScope *scope,
                                 const RecordDecl *RD = nullptr);
 
+  /// Create new bit field member.
+  llvm::DIType *createBitFieldType(const FieldDecl *BitFieldDecl,
+                                   llvm::DIScope *RecordTy,
+                                   const RecordDecl *RD);
+
   /// Helpers for collecting fields of a record.
   /// @{
   void CollectRecordLambdaFields(const CXXRecordDecl *CXXDecl,
@@ -258,6 +255,8 @@
                                 llvm::DIFile *F,
                                 SmallVectorImpl<llvm::Metadata *> &E,
                                 llvm::DIType *RecordTy, const RecordDecl *RD);
+  void CollectRecordNestedRecord(const RecordDecl *RD,
+                                 SmallVectorImpl<llvm::Metadata *> &E);
   void CollectRecordFields(const RecordDecl *Decl, llvm::DIFile *F,
                            SmallVectorImpl<llvm::Metadata *> &E,
                            llvm::DICompositeType *RecordTy);
diff --git a/lib/CodeGen/CGDecl.cpp b/lib/CodeGen/CGDecl.cpp
index 098be61..037b135 100644
--- a/lib/CodeGen/CGDecl.cpp
+++ b/lib/CodeGen/CGDecl.cpp
@@ -16,11 +16,13 @@
 #include "CGCleanup.h"
 #include "CGDebugInfo.h"
 #include "CGOpenCLRuntime.h"
+#include "CGOpenMPRuntime.h"
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
@@ -29,10 +31,10 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
+
 using namespace clang;
 using namespace CodeGen;
 
-
 void CodeGenFunction::EmitDecl(const Decl &D) {
   switch (D.getKind()) {
   case Decl::BuiltinTemplate:
@@ -71,6 +73,8 @@
   case Decl::ObjCImplementation:
   case Decl::ObjCProperty:
   case Decl::ObjCCompatibleAlias:
+  case Decl::PragmaComment:
+  case Decl::PragmaDetectMismatch:
   case Decl::AccessSpec:
   case Decl::LinkageSpec:
   case Decl::ObjCPropertyImpl:
@@ -81,7 +85,9 @@
   case Decl::Captured:
   case Decl::ClassScopeFunctionSpecialization:
   case Decl::UsingShadow:
+  case Decl::ConstructorUsingShadow:
   case Decl::ObjCTypeParam:
+  case Decl::Binding:
     llvm_unreachable("Declaration should not be in declstmts!");
   case Decl::Function:  // void X();
   case Decl::Record:    // struct/union/class X;
@@ -92,6 +98,7 @@
   case Decl::Label:        // __label__ x;
   case Decl::Import:
   case Decl::OMPThreadPrivate:
+  case Decl::OMPCapturedExpr:
   case Decl::Empty:
     // None of these decls require codegen support.
     return;
@@ -108,13 +115,22 @@
     if (CGDebugInfo *DI = getDebugInfo())
       DI->EmitUsingDirective(cast<UsingDirectiveDecl>(D));
     return;
-  case Decl::Var: {
+  case Decl::Var:
+  case Decl::Decomposition: {
     const VarDecl &VD = cast<VarDecl>(D);
     assert(VD.isLocalVarDecl() &&
            "Should not see file-scope variables inside a function!");
-    return EmitVarDecl(VD);
+    EmitVarDecl(VD);
+    if (auto *DD = dyn_cast<DecompositionDecl>(&VD))
+      for (auto *B : DD->bindings())
+        if (auto *HD = B->getHoldingVar())
+          EmitVarDecl(*HD);
+    return;
   }
 
+  case Decl::OMPDeclareReduction:
+    return CGM.EmitOMPDeclareReduction(cast<OMPDeclareReductionDecl>(&D), this);
+
   case Decl::Typedef:      // typedef int X;
   case Decl::TypeAlias: {  // using X = int; [C++0x]
     const TypedefNameDecl &TD = cast<TypedefNameDecl>(D);
@@ -363,8 +379,15 @@
 
   llvm::GlobalVariable *var =
     cast<llvm::GlobalVariable>(addr->stripPointerCasts());
+
+  // CUDA's local and local static __shared__ variables should not
+  // have any non-empty initializers. This is ensured by Sema.
+  // Whatever initializer such a variable may have when it gets here is
+  // a no-op and should not be emitted.
+  bool isCudaSharedVar = getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+                         D.hasAttr<CUDASharedAttr>();
   // If this value has an initializer, emit it.
-  if (D.getInit())
+  if (D.getInit() && !isCudaSharedVar)
     var = AddInitializerToStaticVarDecl(D, var);
 
   var->setAlignment(alignment.getQuantity());
@@ -513,20 +536,7 @@
       CGF.EmitCall(FnInfo, CleanupFn, ReturnValueSlot(), Args);
     }
   };
-
-  /// A cleanup to call @llvm.lifetime.end.
-  class CallLifetimeEnd final : public EHScopeStack::Cleanup {
-    llvm::Value *Addr;
-    llvm::Value *Size;
-  public:
-    CallLifetimeEnd(Address addr, llvm::Value *size)
-      : Addr(addr.getPointer()), Size(size) {}
-
-    void Emit(CodeGenFunction &CGF, Flags flags) override {
-      CGF.EmitLifetimeEnd(Size, Addr);
-    }
-  };
-}
+} // end anonymous namespace
 
 /// EmitAutoVarWithLifetime - Does the setup required for an automatic
 /// variable with lifetime.
@@ -644,7 +654,6 @@
     }
 
     init = castExpr->getSubExpr();
-    continue;
   }
   return false;
 }
@@ -665,10 +674,10 @@
     EmitStoreThroughLValue(RValue::get(value), lvalue, true);
     return;
   }
-  
+
   if (const CXXDefaultInitExpr *DIE = dyn_cast<CXXDefaultInitExpr>(init))
     init = DIE->getExpr();
-    
+
   // If we're emitting a value with lifetime, we have to do the
   // initialization *before* we leave the cleanup scopes.
   if (const ExprWithCleanups *ewc = dyn_cast<ExprWithCleanups>(init)) {
@@ -818,7 +827,7 @@
     }
     return true;
   }
-  
+
   if (llvm::ConstantDataSequential *CDS =
         dyn_cast<llvm::ConstantDataSequential>(Init)) {
     for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
@@ -847,9 +856,9 @@
     Builder.CreateDefaultAlignedStore(Init, Loc, isVolatile);
     return;
   }
-  
-  if (llvm::ConstantDataSequential *CDS = 
-        dyn_cast<llvm::ConstantDataSequential>(Init)) {
+
+  if (llvm::ConstantDataSequential *CDS =
+          dyn_cast<llvm::ConstantDataSequential>(Init)) {
     for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
       llvm::Constant *Elt = CDS->getElementAsConstant(i);
 
@@ -876,7 +885,6 @@
   }
 }
 
-
 /// shouldUseMemSetPlusStoresToInitialize - Decide whether we should use memset
 /// plus some stores to initialize a local variable instead of using a memcpy
 /// from a constant global.  It is beneficial to use memset if the global is all
@@ -906,18 +914,29 @@
   EmitAutoVarCleanups(emission);
 }
 
+/// shouldEmitLifetimeMarkers - Decide whether we need to emit lifetime
+/// markers, based on the given code generation and language options.
+static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts,
+                                      const LangOptions &LangOpts) {
+  // Asan uses markers for use-after-scope checks.
+  if (CGOpts.SanitizeAddressUseAfterScope)
+    return true;
+
+  // Disable lifetime markers in msan builds.
+  // FIXME: Remove this when msan works with lifetime markers.
+  if (LangOpts.Sanitize.has(SanitizerKind::Memory))
+    return false;
+
+  // Otherwise, for now, emit markers only in optimized builds.
+  return CGOpts.OptimizationLevel != 0;
+}
+
 /// Emit a lifetime.begin marker if some criteria are satisfied.
 /// \return a pointer to the temporary size Value if a marker was emitted, null
 /// otherwise
 llvm::Value *CodeGenFunction::EmitLifetimeStart(uint64_t Size,
                                                 llvm::Value *Addr) {
-  // For now, only in optimized builds.
-  if (CGM.getCodeGenOpts().OptimizationLevel == 0)
-    return nullptr;
-
-  // Disable lifetime markers in msan builds.
-  // FIXME: Remove this when msan works with lifetime markers.
-  if (getLangOpts().Sanitize.has(SanitizerKind::Memory))
+  if (!shouldEmitLifetimeMarkers(CGM.getCodeGenOpts(), getLangOpts()))
     return nullptr;
 
   llvm::Value *SizeV = llvm::ConstantInt::get(Int64Ty, Size);
@@ -1162,6 +1181,7 @@
 
   return false;
 }
+
 void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) {
   assert(emission.Variable && "emission was not valid!");
 
@@ -1249,7 +1269,7 @@
                                llvm::GlobalValue::PrivateLinkage,
                                constant, Name);
     GV->setAlignment(Loc.getAlignment().getQuantity());
-    GV->setUnnamedAddr(true);
+    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
 
     Address SrcPtr = Address(GV, Loc.getAlignment());
     if (SrcPtr.getType() != BP)
@@ -1380,13 +1400,10 @@
 
   // Make sure we call @llvm.lifetime.end.  This needs to happen
   // *last*, so the cleanup needs to be pushed *first*.
-  if (emission.useLifetimeMarkers()) {
-    EHStack.pushCleanup<CallLifetimeEnd>(NormalAndEHCleanup,
+  if (emission.useLifetimeMarkers())
+    EHStack.pushCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker,
                                          emission.getAllocatedAddress(),
                                          emission.getSizeForLifetimeMarkers());
-    EHCleanupScope &cleanup = cast<EHCleanupScope>(*EHStack.begin());
-    cleanup.setLifetimeMarker();
-  }
 
   // Check the type for a cleanup.
   if (QualType::DestructionKind dtorKind = D.getType().isDestructedType())
@@ -1661,7 +1678,7 @@
                               ElementType, ElementAlign, Destroyer);
     }
   };
-}
+} // end anonymous namespace
 
 /// pushIrregularPartialArrayCleanup - Push an EH cleanup to destroy
 /// already-constructed elements of the given array.  The cleanup
@@ -1730,7 +1747,7 @@
       CGF.EmitARCRelease(Param, Precise);
     }
   };
-}
+} // end anonymous namespace
 
 /// Emit an alloca (or GlobalValue depending on target)
 /// for the specified parameter and set up LocalDeclMap.
@@ -1860,3 +1877,10 @@
   if (D.hasAttr<AnnotateAttr>())
     EmitVarAnnotations(&D, DeclPtr.getPointer());
 }
+
+void CodeGenModule::EmitOMPDeclareReduction(const OMPDeclareReductionDecl *D,
+                                            CodeGenFunction *CGF) {
+  if (!LangOpts.OpenMP || (!LangOpts.EmitAllDecls && !D->isUsed()))
+    return; // OpenMP is off, or D is unused and EmitAllDecls is not set.
+  getOpenMPRuntime().emitUserDefinedReduction(CGF, D);
+}
diff --git a/lib/CodeGen/CGDeclCXX.cpp b/lib/CodeGen/CGDeclCXX.cpp
index 50454ca..eb5d481 100644
--- a/lib/CodeGen/CGDeclCXX.cpp
+++ b/lib/CodeGen/CGDeclCXX.cpp
@@ -86,13 +86,21 @@
   llvm::Constant *function;
   llvm::Constant *argument;
 
-  // Special-case non-array C++ destructors, where there's a function
-  // with the right signature that we can just call.
-  const CXXRecordDecl *record = nullptr;
-  if (dtorKind == QualType::DK_cxx_destructor &&
-      (record = type->getAsCXXRecordDecl())) {
-    assert(!record->hasTrivialDestructor());
-    CXXDestructorDecl *dtor = record->getDestructor();
+  // Special-case non-array C++ destructors, if they have the right signature.
+  // Under some ABIs, destructors return this instead of void, and cannot be
+  // passed directly to __cxa_atexit if the target does not allow this mismatch.
+  const CXXRecordDecl *Record = type->getAsCXXRecordDecl();
+  bool CanRegisterDestructor =
+      Record && (!CGM.getCXXABI().HasThisReturn(
+                     GlobalDecl(Record->getDestructor(), Dtor_Complete)) ||
+                 CGM.getCXXABI().canCallMismatchedFunctionType());
+  // If __cxa_atexit is disabled via a flag, a different helper function is
+  // generated elsewhere which uses atexit instead, and it takes the destructor
+  // directly.
+  bool UsingExternalHelper = !CGM.getCodeGenOpts().CXAAtExit;
+  if (Record && (CanRegisterDestructor || UsingExternalHelper)) {
+    assert(!Record->hasTrivialDestructor());
+    CXXDestructorDecl *dtor = Record->getDestructor();
 
     function = CGM.getAddrOfCXXStructor(dtor, StructorType::Complete);
     argument = llvm::ConstantExpr::getBitCast(
@@ -113,13 +121,15 @@
 /// constant from this point onwards.
 static void EmitDeclInvariant(CodeGenFunction &CGF, const VarDecl &D,
                               llvm::Constant *Addr) {
-  // Don't emit the intrinsic if we're not optimizing.
+  // Do not emit the intrinsic if we're not optimizing.
   if (!CGF.CGM.getCodeGenOpts().OptimizationLevel)
     return;
 
   // Grab the llvm.invariant.start intrinsic.
   llvm::Intrinsic::ID InvStartID = llvm::Intrinsic::invariant_start;
-  llvm::Constant *InvariantStart = CGF.CGM.getIntrinsic(InvStartID);
+  // Overloaded address space type.
+  llvm::Type *ObjectPtr[1] = {CGF.Int8PtrTy};
+  llvm::Constant *InvariantStart = CGF.CGM.getIntrinsic(InvStartID, ObjectPtr);
 
   // Emit a call with the size in bytes of the object.
   CharUnits WidthChars = CGF.getContext().getTypeSizeInChars(D.getType());
@@ -304,6 +314,17 @@
 CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D,
                                             llvm::GlobalVariable *Addr,
                                             bool PerformInit) {
+
+  // According to E.2.3.1 in CUDA-7.5 Programming guide: __device__,
+  // __constant__ and __shared__ variables defined in namespace scope,
+  // that are of class type, cannot have a non-empty constructor. All
+  // the checks have been done in Sema by now. Whatever initializers
+  // are allowed are empty and we just need to ignore them here.
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+      (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
+       D->hasAttr<CUDASharedAttr>()))
+    return;
+
   // Check if we've already initialized this decl.
   auto I = DelayedCXXInitPosition.find(D);
   if (I != DelayedCXXInitPosition.end() && I->second == ~0U)
diff --git a/lib/CodeGen/CGException.cpp b/lib/CodeGen/CGException.cpp
index fb51262..4a7dc42 100644
--- a/lib/CodeGen/CGException.cpp
+++ b/lib/CodeGen/CGException.cpp
@@ -686,8 +686,10 @@
   assert(EHStack.requiresLandingPad());
   assert(!EHStack.empty());
 
-  // If exceptions are disabled, there are usually no landingpads. However, when
-  // SEH is enabled, functions using SEH still get landingpads.
+  // If exceptions are disabled and SEH is not in use, then there is no invoke
+  // destination. SEH "works" even if exceptions are off. In practice, this
+  // means that C++ destructors and other EH cleanups don't run, which is
+  // consistent with MSVC's behavior.
   const LangOptions &LO = CGM.getLangOpts();
   if (!LO.Exceptions) {
     if (!LO.Borland && !LO.MicrosoftExt)
@@ -1326,11 +1328,13 @@
   TerminateHandler = createBasicBlock("terminate.handler");
   Builder.SetInsertPoint(TerminateHandler);
   llvm::Value *Exn = nullptr;
+  SaveAndRestore<llvm::Instruction *> RestoreCurrentFuncletPad(
+      CurrentFuncletPad);
   if (EHPersonality::get(*this).usesFuncletPads()) {
     llvm::Value *ParentPad = CurrentFuncletPad;
     if (!ParentPad)
       ParentPad = llvm::ConstantTokenNone::get(CGM.getLLVMContext());
-    Builder.CreateCleanupPad(ParentPad);
+    CurrentFuncletPad = Builder.CreateCleanupPad(ParentPad);
   } else {
     if (getLangOpts().CPlusPlus)
       Exn = getExceptionFromSlot();
@@ -1619,14 +1623,13 @@
   SmallString<128> Name;
   {
     llvm::raw_svector_ostream OS(Name);
-    const Decl *ParentCodeDecl = ParentCGF.CurCodeDecl;
-    const NamedDecl *Parent = dyn_cast_or_null<NamedDecl>(ParentCodeDecl);
-    assert(Parent && "FIXME: handle unnamed decls (lambdas, blocks) with SEH");
+    const FunctionDecl *ParentSEHFn = ParentCGF.CurSEHParent;
+    assert(ParentSEHFn && "No CurSEHParent!");
     MangleContext &Mangler = CGM.getCXXABI().getMangleContext();
     if (IsFilter)
-      Mangler.mangleSEHFilterExpression(Parent, OS);
+      Mangler.mangleSEHFilterExpression(ParentSEHFn, OS);
     else
-      Mangler.mangleSEHFinallyBlock(Parent, OS);
+      Mangler.mangleSEHFinallyBlock(ParentSEHFn, OS);
   }
 
   FunctionArgList Args;
@@ -1673,6 +1676,7 @@
 
   StartFunction(GlobalDecl(), RetTy, Fn, FnInfo, Args,
                 OutlinedStmt->getLocStart(), OutlinedStmt->getLocStart());
+  CurSEHParent = ParentCGF.CurSEHParent;
 
   CGM.SetLLVMFunctionAttributes(nullptr, FnInfo, CurFn);
   EmitCapturedLocals(ParentCGF, OutlinedStmt, IsFilter);
@@ -1704,12 +1708,6 @@
   const Stmt *FinallyBlock = Finally.getBlock();
   startOutlinedSEHHelper(ParentCGF, false, FinallyBlock);
 
-  // Mark finally block calls as nounwind and noinline to make LLVM's job a
-  // little easier.
-  // FIXME: Remove these restrictions in the future.
-  CurFn->addFnAttr(llvm::Attribute::NoUnwind);
-  CurFn->addFnAttr(llvm::Attribute::NoInline);
-
   // Emit the original filter expression, convert to i32, and return.
   EmitStmt(FinallyBlock);
 
diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp
index e723ef9..89df63d 100644
--- a/lib/CodeGen/CGExpr.cpp
+++ b/lib/CodeGen/CGExpr.cpp
@@ -11,13 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CodeGenFunction.h"
 #include "CGCXXABI.h"
 #include "CGCall.h"
+#include "CGCleanup.h"
 #include "CGDebugInfo.h"
 #include "CGObjCRuntime.h"
 #include "CGOpenMPRuntime.h"
 #include "CGRecordLayout.h"
+#include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
@@ -32,6 +33,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Transforms/Utils/SanitizerStats.h"
 
 using namespace clang;
@@ -422,6 +424,23 @@
       EmitAnyExprToMem(E, Object, Qualifiers(), /*IsInit*/true);
     }
   } else {
+    switch (M->getStorageDuration()) {
+    case SD_Automatic:
+    case SD_FullExpression:
+      if (auto *Size = EmitLifetimeStart(
+              CGM.getDataLayout().getTypeAllocSize(Object.getElementType()),
+              Object.getPointer())) {
+        if (M->getStorageDuration() == SD_Automatic)
+          pushCleanupAfterFullExpr<CallLifetimeEnd>(NormalEHLifetimeMarker,
+                                                    Object, Size);
+        else
+          pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, Object,
+                                               Size);
+      }
+      break;
+    default:
+      break;
+    }
     EmitAnyExprToMem(E, Object, Qualifiers(), /*IsInit*/true);
   }
   pushTemporaryCleanup(*this, M, E, Object);
@@ -583,7 +602,7 @@
 
   if (Checks.size() > 0) {
     llvm::Constant *StaticData[] = {
-      EmitCheckSourceLocation(Loc),
+     EmitCheckSourceLocation(Loc),
       EmitCheckTypeDescriptor(Ty),
       llvm::ConstantInt::get(SizeTy, AlignVal),
       llvm::ConstantInt::get(Int8Ty, TCK)
@@ -1272,10 +1291,10 @@
   }
 
   // Atomic operations have to be done on integral types.
-  if (Ty->isAtomicType() || typeIsSuitableForInlineAtomic(Ty, Volatile)) {
-    LValue lvalue =
+  LValue AtomicLValue =
       LValue::MakeAddr(Addr, Ty, getContext(), AlignSource, TBAAInfo);
-    return EmitAtomicLoad(lvalue, Loc).getScalarVal();
+  if (Ty->isAtomicType() || LValueIsSuitableForInlineAtomic(AtomicLValue)) {
+    return EmitAtomicLoad(AtomicLValue, Loc).getScalarVal();
   }
 
   llvm::LoadInst *Load = Builder.CreateLoad(Addr, Volatile);
@@ -1383,12 +1402,11 @@
 
   Value = EmitToMemory(Value, Ty);
 
+  LValue AtomicLValue =
+      LValue::MakeAddr(Addr, Ty, getContext(), AlignSource, TBAAInfo);
   if (Ty->isAtomicType() ||
-      (!isInit && typeIsSuitableForInlineAtomic(Ty, Volatile))) {
-    EmitAtomicStore(RValue::get(Value),
-                    LValue::MakeAddr(Addr, Ty, getContext(),
-                                     AlignSource, TBAAInfo),
-                    isInit);
+      (!isInit && LValueIsSuitableForInlineAtomic(AtomicLValue))) {
+    EmitAtomicStore(RValue::get(Value), AtomicLValue, isInit);
     return;
   }
 
@@ -1740,8 +1758,7 @@
 
   if (const VectorType *VTy = Dst.getType()->getAs<VectorType>()) {
     unsigned NumSrcElts = VTy->getNumElements();
-    unsigned NumDstElts =
-       cast<llvm::VectorType>(Vec->getType())->getNumElements();
+    unsigned NumDstElts = Vec->getType()->getVectorNumElements();
     if (NumDstElts == NumSrcElts) {
       // Use shuffle vector is the src and destination are the same number of
       // elements and restore the vector mask since it is on the side it will be
@@ -1954,6 +1971,21 @@
   return MakeAddrLValue(Addr, RefTy->getPointeeType(), Source);
 }
 
+Address CodeGenFunction::EmitLoadOfPointer(Address Ptr,
+                                           const PointerType *PtrTy,
+                                           AlignmentSource *Source) {
+  llvm::Value *Addr = Builder.CreateLoad(Ptr);
+  return Address(Addr, getNaturalTypeAlignment(PtrTy->getPointeeType(), Source,
+                                               /*forPointeeType=*/true));
+}
+
+LValue CodeGenFunction::EmitLoadOfPointerLValue(Address PtrAddr,
+                                                const PointerType *PtrTy) {
+  AlignmentSource Source;
+  Address Addr = EmitLoadOfPointer(PtrAddr, PtrTy, &Source);
+  return MakeAddrLValue(Addr, PtrTy->getPointeeType(), Source);
+}
+
 static LValue EmitGlobalVarDeclLValue(CodeGenFunction &CGF,
                                       const Expr *E, const VarDecl *VD) {
   QualType T = E->getType();
@@ -2174,6 +2206,12 @@
   if (const auto *FD = dyn_cast<FunctionDecl>(ND))
     return EmitFunctionDeclLValue(*this, E, FD);
 
+  // FIXME: While we're emitting a binding from an enclosing scope, all other
+  // DeclRefExprs we see should be implicitly treated as if they also refer to
+  // an enclosing scope.
+  if (const auto *BD = dyn_cast<BindingDecl>(ND))
+    return EmitLValue(BD->getBinding());
+
   llvm_unreachable("Unhandled DeclRefExpr");
 }
 
@@ -2309,7 +2347,7 @@
   auto *GV = new llvm::GlobalVariable(
       CGM.getModule(), Descriptor->getType(),
       /*isConstant=*/true, llvm::GlobalVariable::PrivateLinkage, Descriptor);
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   CGM.getSanitizerMetadata()->disableSanitizerForGlobal(GV);
 
   // Remember the descriptor for this type.
@@ -2359,7 +2397,33 @@
 
   PresumedLoc PLoc = getContext().getSourceManager().getPresumedLoc(Loc);
   if (PLoc.isValid()) {
-    auto FilenameGV = CGM.GetAddrOfConstantCString(PLoc.getFilename(), ".src");
+    StringRef FilenameString = PLoc.getFilename();
+
+    int PathComponentsToStrip =
+        CGM.getCodeGenOpts().EmitCheckPathComponentsToStrip;
+    if (PathComponentsToStrip < 0) {
+      assert(PathComponentsToStrip != INT_MIN);
+      int PathComponentsToKeep = -PathComponentsToStrip;
+      auto I = llvm::sys::path::rbegin(FilenameString);
+      auto E = llvm::sys::path::rend(FilenameString);
+      while (I != E && --PathComponentsToKeep)
+        ++I;
+
+      FilenameString = FilenameString.substr(I - E);
+    } else if (PathComponentsToStrip > 0) {
+      auto I = llvm::sys::path::begin(FilenameString);
+      auto E = llvm::sys::path::end(FilenameString);
+      while (I != E && PathComponentsToStrip--)
+        ++I;
+
+      if (I != E)
+        FilenameString =
+            FilenameString.substr(I - llvm::sys::path::begin(FilenameString));
+      else
+        FilenameString = llvm::sys::path::filename(FilenameString);
+    }
+
+    auto FilenameGV = CGM.GetAddrOfConstantCString(FilenameString, ".src");
     CGM.getSanitizerMetadata()->disableSanitizerForGlobal(
                           cast<llvm::GlobalVariable>(FilenameGV.getPointer()));
     Filename = FilenameGV.getPointer();
@@ -2490,24 +2554,26 @@
   Branch->setMetadata(llvm::LLVMContext::MD_prof, Node);
   EmitBlock(Handlers);
 
-  // Emit handler arguments and create handler function type.
-  llvm::Constant *Info = llvm::ConstantStruct::getAnon(StaticArgs);
-  auto *InfoPtr =
-      new llvm::GlobalVariable(CGM.getModule(), Info->getType(), false,
-                               llvm::GlobalVariable::PrivateLinkage, Info);
-  InfoPtr->setUnnamedAddr(true);
-  CGM.getSanitizerMetadata()->disableSanitizerForGlobal(InfoPtr);
-
+  // Handler functions take an i8* pointing to the (handler-specific) static
+  // information block, followed by a sequence of intptr_t arguments
+  // representing operand values.
   SmallVector<llvm::Value *, 4> Args;
   SmallVector<llvm::Type *, 4> ArgTypes;
   Args.reserve(DynamicArgs.size() + 1);
   ArgTypes.reserve(DynamicArgs.size() + 1);
 
-  // Handler functions take an i8* pointing to the (handler-specific) static
-  // information block, followed by a sequence of intptr_t arguments
-  // representing operand values.
-  Args.push_back(Builder.CreateBitCast(InfoPtr, Int8PtrTy));
-  ArgTypes.push_back(Int8PtrTy);
+  // Emit handler arguments and create handler function type.
+  if (!StaticArgs.empty()) {
+    llvm::Constant *Info = llvm::ConstantStruct::getAnon(StaticArgs);
+    auto *InfoPtr =
+        new llvm::GlobalVariable(CGM.getModule(), Info->getType(), false,
+                                 llvm::GlobalVariable::PrivateLinkage, Info);
+    InfoPtr->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+    CGM.getSanitizerMetadata()->disableSanitizerForGlobal(InfoPtr);
+    Args.push_back(Builder.CreateBitCast(InfoPtr, Int8PtrTy));
+    ArgTypes.push_back(Int8PtrTy);
+  }
+
   for (size_t i = 0, n = DynamicArgs.size(); i != n; ++i) {
     Args.push_back(EmitCheckValue(DynamicArgs[i]));
     ArgTypes.push_back(IntPtrTy);
@@ -2539,10 +2605,9 @@
   EmitBlock(Cont);
 }
 
-void CodeGenFunction::EmitCfiSlowPathCheck(llvm::Value *Cond,
-                                           llvm::ConstantInt *TypeId,
-                                           llvm::Value *Ptr) {
-  auto &Ctx = getLLVMContext();
+void CodeGenFunction::EmitCfiSlowPathCheck(
+    SanitizerMask Kind, llvm::Value *Cond, llvm::ConstantInt *TypeId,
+    llvm::Value *Ptr, ArrayRef<llvm::Constant *> StaticArgs) {
   llvm::BasicBlock *Cont = createBasicBlock("cfi.cont");
 
   llvm::BasicBlock *CheckBB = createBasicBlock("cfi.slowpath");
@@ -2554,19 +2619,122 @@
 
   EmitBlock(CheckBB);
 
-  llvm::Constant *SlowPathFn = CGM.getModule().getOrInsertFunction(
-      "__cfi_slowpath",
-      llvm::FunctionType::get(
-          llvm::Type::getVoidTy(Ctx),
-          {llvm::Type::getInt64Ty(Ctx),
-           llvm::PointerType::getUnqual(llvm::Type::getInt8Ty(Ctx))},
-          false));
-  llvm::CallInst *CheckCall = Builder.CreateCall(SlowPathFn, {TypeId, Ptr});
+  bool WithDiag = !CGM.getCodeGenOpts().SanitizeTrap.has(Kind);
+
+  llvm::CallInst *CheckCall;
+  if (WithDiag) {
+    llvm::Constant *Info = llvm::ConstantStruct::getAnon(StaticArgs);
+    auto *InfoPtr =
+        new llvm::GlobalVariable(CGM.getModule(), Info->getType(), false,
+                                 llvm::GlobalVariable::PrivateLinkage, Info);
+    InfoPtr->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+    CGM.getSanitizerMetadata()->disableSanitizerForGlobal(InfoPtr);
+
+    llvm::Constant *SlowPathDiagFn = CGM.getModule().getOrInsertFunction(
+        "__cfi_slowpath_diag",
+        llvm::FunctionType::get(VoidTy, {Int64Ty, Int8PtrTy, Int8PtrTy},
+                                false));
+    CheckCall = Builder.CreateCall(
+        SlowPathDiagFn,
+        {TypeId, Ptr, Builder.CreateBitCast(InfoPtr, Int8PtrTy)});
+  } else {
+    llvm::Constant *SlowPathFn = CGM.getModule().getOrInsertFunction(
+        "__cfi_slowpath",
+        llvm::FunctionType::get(VoidTy, {Int64Ty, Int8PtrTy}, false));
+    CheckCall = Builder.CreateCall(SlowPathFn, {TypeId, Ptr});
+  }
+
   CheckCall->setDoesNotThrow();
 
   EmitBlock(Cont);
 }
 
+// This function is basically a switch over the CFI failure kind, which is
+// extracted from CFICheckFailData (1st function argument). Each case is either
+// llvm.trap or a call to one of the two runtime handlers, based on
+// -fsanitize-trap and -fsanitize-recover settings.  Default case (invalid
+// failure kind) traps, but this should really never happen.  CFICheckFailData
+// can be nullptr if the calling module has -fsanitize-trap behavior for this
+// check kind; in this case __cfi_check_fail traps as well.
+void CodeGenFunction::EmitCfiCheckFail() {
+  SanitizerScope SanScope(this);
+  FunctionArgList Args;
+  ImplicitParamDecl ArgData(getContext(), nullptr, SourceLocation(), nullptr,
+                            getContext().VoidPtrTy);
+  ImplicitParamDecl ArgAddr(getContext(), nullptr, SourceLocation(), nullptr,
+                            getContext().VoidPtrTy);
+  Args.push_back(&ArgData);
+  Args.push_back(&ArgAddr);
+
+  const CGFunctionInfo &FI =
+    CGM.getTypes().arrangeBuiltinFunctionDeclaration(getContext().VoidTy, Args);
+
+  llvm::Function *F = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy}, false),
+      llvm::GlobalValue::WeakODRLinkage, "__cfi_check_fail", &CGM.getModule());
+  F->setVisibility(llvm::GlobalValue::HiddenVisibility);
+
+  StartFunction(GlobalDecl(), CGM.getContext().VoidTy, F, FI, Args,
+                SourceLocation());
+
+  llvm::Value *Data =
+      EmitLoadOfScalar(GetAddrOfLocalVar(&ArgData), /*Volatile=*/false,
+                       CGM.getContext().VoidPtrTy, ArgData.getLocation());
+  llvm::Value *Addr =
+      EmitLoadOfScalar(GetAddrOfLocalVar(&ArgAddr), /*Volatile=*/false,
+                       CGM.getContext().VoidPtrTy, ArgAddr.getLocation());
+
+  // Data == nullptr means the calling module has trap behaviour for this check.
+  llvm::Value *DataIsNotNullPtr =
+      Builder.CreateICmpNE(Data, llvm::ConstantPointerNull::get(Int8PtrTy));
+  EmitTrapCheck(DataIsNotNullPtr);
+
+  llvm::StructType *SourceLocationTy =
+      llvm::StructType::get(VoidPtrTy, Int32Ty, Int32Ty, nullptr);
+  llvm::StructType *CfiCheckFailDataTy =
+      llvm::StructType::get(Int8Ty, SourceLocationTy, VoidPtrTy, nullptr);
+
+  llvm::Value *V = Builder.CreateConstGEP2_32(
+      CfiCheckFailDataTy,
+      Builder.CreatePointerCast(Data, CfiCheckFailDataTy->getPointerTo(0)), 0,
+      0);
+  Address CheckKindAddr(V, getIntAlign());
+  llvm::Value *CheckKind = Builder.CreateLoad(CheckKindAddr);
+
+  llvm::Value *AllVtables = llvm::MetadataAsValue::get(
+      CGM.getLLVMContext(),
+      llvm::MDString::get(CGM.getLLVMContext(), "all-vtables"));
+  llvm::Value *ValidVtable = Builder.CreateZExt(
+      Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::type_test),
+                         {Addr, AllVtables}),
+      IntPtrTy);
+
+  const std::pair<int, SanitizerMask> CheckKinds[] = {
+      {CFITCK_VCall, SanitizerKind::CFIVCall},
+      {CFITCK_NVCall, SanitizerKind::CFINVCall},
+      {CFITCK_DerivedCast, SanitizerKind::CFIDerivedCast},
+      {CFITCK_UnrelatedCast, SanitizerKind::CFIUnrelatedCast},
+      {CFITCK_ICall, SanitizerKind::CFIICall}};
+
+  // Per kind: EmitCheck if that sanitizer is enabled, otherwise a plain trap.
+  for (auto CheckKindMaskPair : CheckKinds) {
+    int Kind = CheckKindMaskPair.first;
+    SanitizerMask Mask = CheckKindMaskPair.second;
+    llvm::Value *Cond =
+        Builder.CreateICmpNE(CheckKind, llvm::ConstantInt::get(Int8Ty, Kind));
+    if (CGM.getLangOpts().Sanitize.has(Mask))
+      EmitCheck(std::make_pair(Cond, Mask), "cfi_check_fail", {},
+                {Data, Addr, ValidVtable});
+    else
+      EmitTrapCheck(Cond);
+  }
+
+  FinishFunction();
+  // The only reference to this function will be created during LTO link.
+  // Make sure it survives until then.
+  CGM.addUsedGlobal(F);
+}
+
 void CodeGenFunction::EmitTrapCheck(llvm::Value *Checked) {
   llvm::BasicBlock *Cont = createBasicBlock("cont");
 
@@ -2834,22 +3002,55 @@
   return LV;
 }
 
+static Address emitOMPArraySectionBase(CodeGenFunction &CGF, const Expr *Base,
+                                       AlignmentSource &AlignSource,
+                                       QualType BaseTy, QualType ElTy,
+                                       bool IsLowerBound) {
+  LValue BaseLVal;
+  if (auto *ASE = dyn_cast<OMPArraySectionExpr>(Base->IgnoreParenImpCasts())) {
+    BaseLVal = CGF.EmitOMPArraySectionExpr(ASE, IsLowerBound);
+    if (BaseTy->isArrayType()) {
+      Address Addr = BaseLVal.getAddress();
+      AlignSource = BaseLVal.getAlignmentSource();
+
+      // If the array type was an incomplete type, we need to make sure
+      // the decay ends up being the right type.
+      llvm::Type *NewTy = CGF.ConvertType(BaseTy);
+      Addr = CGF.Builder.CreateElementBitCast(Addr, NewTy);
+
+      // Note that VLA pointers are always decayed, so we don't need to do
+      // anything here.
+      if (!BaseTy->isVariableArrayType()) {
+        assert(isa<llvm::ArrayType>(Addr.getElementType()) &&
+               "Expected pointer to array");
+        Addr = CGF.Builder.CreateStructGEP(Addr, 0, CharUnits::Zero(),
+                                           "arraydecay");
+      }
+
+      return CGF.Builder.CreateElementBitCast(Addr,
+                                              CGF.ConvertTypeForMem(ElTy));
+    }
+    CharUnits Align = CGF.getNaturalTypeAlignment(ElTy, &AlignSource);
+    return Address(CGF.Builder.CreateLoad(BaseLVal.getAddress()), Align);
+  }
+  return CGF.EmitPointerWithAlignment(Base, &AlignSource);
+}
+
 LValue CodeGenFunction::EmitOMPArraySectionExpr(const OMPArraySectionExpr *E,
                                                 bool IsLowerBound) {
-  LValue Base;
+  QualType BaseTy;
   if (auto *ASE =
           dyn_cast<OMPArraySectionExpr>(E->getBase()->IgnoreParenImpCasts()))
-    Base = EmitOMPArraySectionExpr(ASE, IsLowerBound);
+    BaseTy = OMPArraySectionExpr::getBaseOriginalType(ASE);
   else
-    Base = EmitLValue(E->getBase());
-  QualType BaseTy = Base.getType();
-  llvm::Value *Idx = nullptr;
+    BaseTy = E->getBase()->getType();
   QualType ResultExprTy;
   if (auto *AT = getContext().getAsArrayType(BaseTy))
     ResultExprTy = AT->getElementType();
   else
     ResultExprTy = BaseTy->getPointeeType();
-  if (IsLowerBound || (!IsLowerBound && E->getColonLoc().isInvalid())) {
+  llvm::Value *Idx = nullptr;
+  if (IsLowerBound || E->getColonLoc().isInvalid()) {
     // Requesting lower bound or upper bound, but without provided length and
     // without ':' symbol for the default length -> length = 1.
     // Idx = LowerBound ?: 0;
@@ -2860,9 +3061,9 @@
     } else
       Idx = llvm::ConstantInt::getNullValue(IntPtrTy);
   } else {
-    // Try to emit length or lower bound as constant. If this is possible, 1 is
-    // subtracted from constant length or lower bound. Otherwise, emit LLVM IR
-    // (LB + Len) - 1.
+    // Try to emit length or lower bound as constant. If this is possible, 1
+    // is subtracted from constant length or lower bound. Otherwise, emit LLVM
+    // IR (LB + Len) - 1.
     auto &C = CGM.getContext();
     auto *Length = E->getLength();
     llvm::APSInt ConstLength;
@@ -2908,12 +3109,15 @@
         Idx = llvm::ConstantInt::get(IntPtrTy, ConstLength + ConstLowerBound);
     } else {
       // Idx = ArraySize - 1;
-      if (auto *VAT = C.getAsVariableArrayType(BaseTy)) {
+      QualType ArrayTy = BaseTy->isPointerType()
+                             ? E->getBase()->IgnoreParenImpCasts()->getType()
+                             : BaseTy;
+      if (auto *VAT = C.getAsVariableArrayType(ArrayTy)) {
         Length = VAT->getSizeExpr();
         if (Length->isIntegerConstantExpr(ConstLength, C))
           Length = nullptr;
       } else {
-        auto *CAT = C.getAsConstantArrayType(BaseTy);
+        auto *CAT = C.getAsConstantArrayType(ArrayTy);
         ConstLength = CAT->getSize();
       }
       if (Length) {
@@ -2932,52 +3136,56 @@
   }
   assert(Idx);
 
-  llvm::Value *EltPtr;
-  QualType FixedSizeEltType = ResultExprTy;
+  Address EltPtr = Address::invalid();
+  AlignmentSource AlignSource;
   if (auto *VLA = getContext().getAsVariableArrayType(ResultExprTy)) {
+    // The base must be a pointer, which is not an aggregate.  Emit
+    // it.  It needs to be emitted first in case it's what captures
+    // the VLA bounds.
+    Address Base =
+        emitOMPArraySectionBase(*this, E->getBase(), AlignSource, BaseTy,
+                                VLA->getElementType(), IsLowerBound);
     // The element count here is the total number of non-VLA elements.
-    llvm::Value *numElements = getVLASize(VLA).first;
-    FixedSizeEltType = getFixedSizeElementType(getContext(), VLA);
+    llvm::Value *NumElements = getVLASize(VLA).first;
 
     // Effectively, the multiply by the VLA size is part of the GEP.
     // GEP indexes are signed, and scaling an index isn't permitted to
     // signed-overflow, so we use the same semantics for our explicit
     // multiply.  We suppress this if overflow is not undefined behavior.
-    if (getLangOpts().isSignedOverflowDefined()) {
-      Idx = Builder.CreateMul(Idx, numElements);
-      EltPtr = Builder.CreateGEP(Base.getPointer(), Idx, "arrayidx");
-    } else {
-      Idx = Builder.CreateNSWMul(Idx, numElements);
-      EltPtr = Builder.CreateInBoundsGEP(Base.getPointer(), Idx, "arrayidx");
-    }
-  } else if (BaseTy->isConstantArrayType()) {
-    llvm::Value *ArrayPtr = Base.getPointer();
-    llvm::Value *Zero = llvm::ConstantInt::getNullValue(IntPtrTy);
-    llvm::Value *Args[] = {Zero, Idx};
+    if (getLangOpts().isSignedOverflowDefined())
+      Idx = Builder.CreateMul(Idx, NumElements);
+    else
+      Idx = Builder.CreateNSWMul(Idx, NumElements);
+    EltPtr = emitArraySubscriptGEP(*this, Base, Idx, VLA->getElementType(),
+                                   !getLangOpts().isSignedOverflowDefined());
+  } else if (const Expr *Array = isSimpleArrayDecayOperand(E->getBase())) {
+    // If this is A[i] where A is an array, the frontend will have decayed the
+    // base to be an ArrayToPointerDecay implicit cast.  While correct, it is
+    // inefficient at -O0 to emit a "gep A, 0, 0" when codegen'ing it, then a
+    // "gep x, i" here.  Emit one "gep A, 0, i".
+    assert(Array->getType()->isArrayType() &&
+           "Array to pointer decay must have array source type!");
+    LValue ArrayLV;
+    // For simple multidimensional array indexing, set the 'accessed' flag for
+    // better bounds-checking of the base expression.
+    if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Array))
+      ArrayLV = EmitArraySubscriptExpr(ASE, /*Accessed*/ true);
+    else
+      ArrayLV = EmitLValue(Array);
 
-    if (getLangOpts().isSignedOverflowDefined())
-      EltPtr = Builder.CreateGEP(ArrayPtr, Args, "arrayidx");
-    else
-      EltPtr = Builder.CreateInBoundsGEP(ArrayPtr, Args, "arrayidx");
+    // Propagate the alignment from the array itself to the result.
+    EltPtr = emitArraySubscriptGEP(
+        *this, ArrayLV.getAddress(), {CGM.getSize(CharUnits::Zero()), Idx},
+        ResultExprTy, !getLangOpts().isSignedOverflowDefined());
+    AlignSource = ArrayLV.getAlignmentSource();
   } else {
-    // The base must be a pointer, which is not an aggregate.  Emit it.
-    if (getLangOpts().isSignedOverflowDefined())
-      EltPtr = Builder.CreateGEP(Base.getPointer(), Idx, "arrayidx");
-    else
-      EltPtr = Builder.CreateInBoundsGEP(Base.getPointer(), Idx, "arrayidx");
+    Address Base = emitOMPArraySectionBase(*this, E->getBase(), AlignSource,
+                                           BaseTy, ResultExprTy, IsLowerBound);
+    EltPtr = emitArraySubscriptGEP(*this, Base, Idx, ResultExprTy,
+                                   !getLangOpts().isSignedOverflowDefined());
   }
 
-  CharUnits EltAlign =
-    Base.getAlignment().alignmentOfArrayElement(
-                          getContext().getTypeSizeInChars(FixedSizeEltType));
-
-  // Limit the alignment to that of the result type.
-  LValue LV = MakeAddrLValue(Address(EltPtr, EltAlign), ResultExprTy,
-                             Base.getAlignmentSource());
-
-  LV.getQuals().setAddressSpace(BaseTy.getAddressSpace());
-
-  return LV;
+  return MakeAddrLValue(EltPtr, ResultExprTy, AlignSource);
 }
 
 LValue CodeGenFunction::
@@ -3399,6 +3607,7 @@
   case CK_ARCExtendBlockObject:
   case CK_CopyAndAutoreleaseBlockObject:
   case CK_AddressSpaceConversion:
+  case CK_IntToOCLSampler:
     return EmitUnsupportedLValue(E, "unexpected cast lvalue");
 
   case CK_Dependent:
@@ -3515,6 +3724,10 @@
   case TEK_Aggregate:
     return FieldLV.asAggregateRValue();
   case TEK_Scalar:
+    // This routine is used to load fields one-by-one to perform a copy, so
+    // don't load reference fields.
+    if (FD->getType()->isReferenceType())
+      return RValue::get(FieldLV.getPointer());
     return EmitLoadOfLValue(FieldLV, Loc);
   }
   llvm_unreachable("bad evaluation kind");
@@ -3861,23 +4074,25 @@
     EmitSanitizerStatReport(llvm::SanStat_CFI_ICall);
 
     llvm::Metadata *MD = CGM.CreateMetadataIdentifierForType(QualType(FnType, 0));
-    llvm::Value *BitSetName = llvm::MetadataAsValue::get(getLLVMContext(), MD);
+    llvm::Value *TypeId = llvm::MetadataAsValue::get(getLLVMContext(), MD);
 
     llvm::Value *CastedCallee = Builder.CreateBitCast(Callee, Int8PtrTy);
-    llvm::Value *BitSetTest =
-        Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::bitset_test),
-                           {CastedCallee, BitSetName});
+    llvm::Value *TypeTest = Builder.CreateCall(
+        CGM.getIntrinsic(llvm::Intrinsic::type_test), {CastedCallee, TypeId});
 
-    auto TypeId = CGM.CreateCfiIdForTypeMetadata(MD);
-    if (CGM.getCodeGenOpts().SanitizeCfiCrossDso && TypeId) {
-      EmitCfiSlowPathCheck(BitSetTest, TypeId, CastedCallee);
+    auto CrossDsoTypeId = CGM.CreateCrossDsoCfiTypeId(MD);
+    llvm::Constant *StaticData[] = {
+        llvm::ConstantInt::get(Int8Ty, CFITCK_ICall),
+        EmitCheckSourceLocation(E->getLocStart()),
+        EmitCheckTypeDescriptor(QualType(FnType, 0)),
+    };
+    if (CGM.getCodeGenOpts().SanitizeCfiCrossDso && CrossDsoTypeId) {
+      EmitCfiSlowPathCheck(SanitizerKind::CFIICall, TypeTest, CrossDsoTypeId,
+                           CastedCallee, StaticData);
     } else {
-      llvm::Constant *StaticData[] = {
-          EmitCheckSourceLocation(E->getLocStart()),
-          EmitCheckTypeDescriptor(QualType(FnType, 0)),
-      };
-      EmitCheck(std::make_pair(BitSetTest, SanitizerKind::CFIICall),
-                "cfi_bad_icall", StaticData, CastedCallee);
+      EmitCheck(std::make_pair(TypeTest, SanitizerKind::CFIICall),
+                "cfi_check_fail", StaticData,
+                {CastedCallee, llvm::UndefValue::get(IntPtrTy)});
     }
   }
 
diff --git a/lib/CodeGen/CGExprAgg.cpp b/lib/CodeGen/CGExprAgg.cpp
index a4547a9..f51330c 100644
--- a/lib/CodeGen/CGExprAgg.cpp
+++ b/lib/CodeGen/CGExprAgg.cpp
@@ -175,6 +175,7 @@
   }
   void VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E);
   void VisitCXXConstructExpr(const CXXConstructExpr *E);
+  void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E);
   void VisitLambdaExpr(LambdaExpr *E);
   void VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E);
   void VisitExprWithCleanups(ExprWithCleanups *E);
@@ -749,6 +750,7 @@
   case CK_BuiltinFnToFnPtr:
   case CK_ZeroToOCLEvent:
   case CK_AddressSpaceConversion:
+  case CK_IntToOCLSampler:
     llvm_unreachable("cast kind invalid for aggregate types");
   }
 }
@@ -967,12 +969,9 @@
   Address ArgValue = Address::invalid();
   Address ArgPtr = CGF.EmitVAArg(VE, ArgValue);
 
+  // If EmitVAArg fails, emit an error.
   if (!ArgPtr.isValid()) {
-    // If EmitVAArg fails, we fall back to the LLVM instruction.
-    llvm::Value *Val = Builder.CreateVAArg(ArgValue.getPointer(),
-                                           CGF.ConvertType(VE->getType()));
-    if (!Dest.isIgnored())
-      Builder.CreateStore(Val, Dest.getAddress());
+    CGF.ErrorUnsupported(VE, "aggregate va_arg expression");
     return;
   }
 
@@ -1001,6 +1000,14 @@
   CGF.EmitCXXConstructExpr(E, Slot);
 }
 
+void AggExprEmitter::VisitCXXInheritedCtorInitExpr(
+    const CXXInheritedCtorInitExpr *E) {
+  AggValueSlot Slot = EnsureSlot(E->getType());
+  CGF.EmitInheritedCXXConstructorCall(
+      E->getConstructor(), E->constructsVBase(), Slot.getAddress(),
+      E->inheritedFromVBase(), E);
+}
+
 void
 AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
   AggValueSlot Slot = EnsureSlot(E->getType());
@@ -1174,6 +1181,38 @@
   unsigned NumInitElements = E->getNumInits();
   RecordDecl *record = E->getType()->castAs<RecordType>()->getDecl();
 
+  // We'll need to enter cleanup scopes in case any of the element
+  // initializers throws an exception.
+  SmallVector<EHScopeStack::stable_iterator, 16> cleanups;
+  llvm::Instruction *cleanupDominator = nullptr;
+
+  unsigned curInitIndex = 0;
+
+  // Emit initialization of base classes.
+  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(record)) {
+    assert(E->getNumInits() >= CXXRD->getNumBases() &&
+           "missing initializer for base class");
+    for (auto &Base : CXXRD->bases()) {
+      assert(!Base.isVirtual() && "should not see vbases here");
+      auto *BaseRD = Base.getType()->getAsCXXRecordDecl();
+      Address V = CGF.GetAddressOfDirectBaseInCompleteClass(
+          Dest.getAddress(), CXXRD, BaseRD,
+          /*isBaseVirtual*/ false);
+      AggValueSlot AggSlot =
+        AggValueSlot::forAddr(V, Qualifiers(),
+                              AggValueSlot::IsDestructed,
+                              AggValueSlot::DoesNotNeedGCBarriers,
+                              AggValueSlot::IsNotAliased);
+      CGF.EmitAggExpr(E->getInit(curInitIndex++), AggSlot);
+
+      if (QualType::DestructionKind dtorKind =
+              Base.getType().isDestructedType()) {
+        CGF.pushDestroy(dtorKind, V, Base.getType());
+        cleanups.push_back(CGF.EHStack.stable_begin());
+      }
+    }
+  }
+
   // Prepare a 'this' for CXXDefaultInitExprs.
   CodeGenFunction::FieldConstructionScope FCS(CGF, Dest.getAddress());
 
@@ -1207,14 +1246,8 @@
     return;
   }
 
-  // We'll need to enter cleanup scopes in case any of the member
-  // initializers throw an exception.
-  SmallVector<EHScopeStack::stable_iterator, 16> cleanups;
-  llvm::Instruction *cleanupDominator = nullptr;
-
   // Here we iterate over the fields; this makes it simpler to both
   // default-initialize fields and skip over unnamed fields.
-  unsigned curInitIndex = 0;
   for (const auto *field : record->fields()) {
     // We're done once we hit the flexible array member.
     if (field->getType()->isIncompleteArrayType())
@@ -1320,6 +1353,10 @@
       CharUnits NumNonZeroBytes = CharUnits::Zero();
       
       unsigned ILEElement = 0;
+      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(SD))
+        while (ILEElement != CXXRD->getNumBases())
+          NumNonZeroBytes +=
+              GetNumNonZeroBytesInInit(ILE->getInit(ILEElement++), CGF);
       for (const auto *Field : SD->fields()) {
         // We're done once we hit the flexible array member or run out of
         // InitListExpr elements.
diff --git a/lib/CodeGen/CGExprCXX.cpp b/lib/CodeGen/CGExprCXX.cpp
index 604cde7..eec2ace 100644
--- a/lib/CodeGen/CGExprCXX.cpp
+++ b/lib/CodeGen/CGExprCXX.cpp
@@ -24,10 +24,11 @@
 using namespace clang;
 using namespace CodeGen;
 
-static RequiredArgs commonEmitCXXMemberOrOperatorCall(
-    CodeGenFunction &CGF, const CXXMethodDecl *MD, llvm::Value *Callee,
-    ReturnValueSlot ReturnValue, llvm::Value *This, llvm::Value *ImplicitParam,
-    QualType ImplicitParamTy, const CallExpr *CE, CallArgList &Args) {
+static RequiredArgs
+commonEmitCXXMemberOrOperatorCall(CodeGenFunction &CGF, const CXXMethodDecl *MD,
+                                  llvm::Value *This, llvm::Value *ImplicitParam,
+                                  QualType ImplicitParamTy, const CallExpr *CE,
+                                  CallArgList &Args) {
   assert(CE == nullptr || isa<CXXMemberCallExpr>(CE) ||
          isa<CXXOperatorCallExpr>(CE));
   assert(MD->isInstance() &&
@@ -53,7 +54,7 @@
   }
 
   const FunctionProtoType *FPT = MD->getType()->castAs<FunctionProtoType>();
-  RequiredArgs required = RequiredArgs::forPrototypePlus(FPT, Args.size());
+  RequiredArgs required = RequiredArgs::forPrototypePlus(FPT, Args.size(), MD);
 
   // And the rest of the call args.
   if (CE) {
@@ -76,21 +77,20 @@
   const FunctionProtoType *FPT = MD->getType()->castAs<FunctionProtoType>();
   CallArgList Args;
   RequiredArgs required = commonEmitCXXMemberOrOperatorCall(
-      *this, MD, Callee, ReturnValue, This, ImplicitParam, ImplicitParamTy, CE,
-      Args);
+      *this, MD, This, ImplicitParam, ImplicitParamTy, CE, Args);
   return EmitCall(CGM.getTypes().arrangeCXXMethodCall(Args, FPT, required),
                   Callee, ReturnValue, Args, MD);
 }
 
-RValue CodeGenFunction::EmitCXXStructorCall(
-    const CXXMethodDecl *MD, llvm::Value *Callee, ReturnValueSlot ReturnValue,
-    llvm::Value *This, llvm::Value *ImplicitParam, QualType ImplicitParamTy,
-    const CallExpr *CE, StructorType Type) {
+RValue CodeGenFunction::EmitCXXDestructorCall(
+    const CXXDestructorDecl *DD, llvm::Value *Callee, llvm::Value *This,
+    llvm::Value *ImplicitParam, QualType ImplicitParamTy, const CallExpr *CE,
+    StructorType Type) {
   CallArgList Args;
-  commonEmitCXXMemberOrOperatorCall(*this, MD, Callee, ReturnValue, This,
-                                    ImplicitParam, ImplicitParamTy, CE, Args);
-  return EmitCall(CGM.getTypes().arrangeCXXStructorDeclaration(MD, Type),
-                  Callee, ReturnValue, Args, MD);
+  commonEmitCXXMemberOrOperatorCall(*this, DD, This, ImplicitParam,
+                                    ImplicitParamTy, CE, Args);
+  return EmitCall(CGM.getTypes().arrangeCXXStructorDeclaration(DD, Type),
+                  Callee, ReturnValueSlot(), Args, DD);
 }
 
 static CXXRecordDecl *getCXXRecord(const Expr *E) {
@@ -259,7 +259,8 @@
     if (SanOpts.has(SanitizerKind::CFINVCall) &&
         MD->getParent()->isDynamicClass()) {
       llvm::Value *VTable = GetVTablePtr(This, Int8PtrTy, MD->getParent());
-      EmitVTablePtrCheckForCall(MD, VTable, CFITCK_NVCall, CE->getLocStart());
+      EmitVTablePtrCheckForCall(MD->getParent(), VTable, CFITCK_NVCall,
+                                CE->getLocStart());
     }
 
     if (getLangOpts().AppleKext && MD->isVirtual() && HasQualifier)
@@ -273,7 +274,7 @@
 
   if (MD->isVirtual()) {
     This = CGM.getCXXABI().adjustThisArgumentForVirtualFunctionCall(
-        *this, MD, This, UseVirtualCall);
+        *this, CalleeDecl, This, UseVirtualCall);
   }
 
   return EmitCXXMemberOrOperatorCall(MD, Callee, ReturnValue, This.getPointer(),
@@ -323,10 +324,11 @@
   // Push the this ptr.
   Args.add(RValue::get(ThisPtrForCall), ThisType);
 
-  RequiredArgs required = RequiredArgs::forPrototypePlus(FPT, 1);
-  
+  RequiredArgs required =
+      RequiredArgs::forPrototypePlus(FPT, 1, /*FD=*/nullptr);
+
   // And the rest of the call args
-  EmitCallArgs(Args, FPT, E->arguments(), E->getDirectCallee());
+  EmitCallArgs(Args, FPT, E->arguments());
   return EmitCall(CGM.getTypes().arrangeCXXMethodCall(Args, FPT, required),
                   Callee, ReturnValue, Args);
 }
@@ -369,6 +371,9 @@
   std::vector<CharUnits> VBPtrOffsets =
       CGF.CGM.getCXXABI().getVBPtrOffsets(Base);
   for (CharUnits VBPtrOffset : VBPtrOffsets) {
+    // Stop before we hit any virtual base pointers located in virtual bases.
+    if (VBPtrOffset >= NVSize)
+      break;
     std::pair<CharUnits, CharUnits> LastStore = Stores.pop_back_val();
     CharUnits LastStoreOffset = LastStore.first;
     CharUnits LastStoreSize = LastStore.second;
@@ -471,8 +476,8 @@
     }
   }
   
-  if (const ConstantArrayType *arrayType 
-        = getContext().getAsConstantArrayType(E->getType())) {
+  if (const ArrayType *arrayType
+        = getContext().getAsArrayType(E->getType())) {
     EmitCXXAggrConstructorCall(CD, arrayType, Dest.getAddress(), E);
   } else {
     CXXCtorType Type = Ctor_Complete;
@@ -1010,15 +1015,18 @@
   if (auto *ILE = dyn_cast<InitListExpr>(Init)) {
     if (const RecordType *RType = ILE->getType()->getAs<RecordType>()) {
       if (RType->getDecl()->isStruct()) {
-        unsigned NumFields = 0;
+        unsigned NumElements = 0;
+        if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RType->getDecl()))
+          NumElements = CXXRD->getNumBases();
         for (auto *Field : RType->getDecl()->fields())
           if (!Field->isUnnamedBitfield())
-            ++NumFields;
-        if (ILE->getNumInits() == NumFields)
+            ++NumElements;
+        // FIXME: Recurse into nested InitListExprs.
+        if (ILE->getNumInits() == NumElements)
           for (unsigned i = 0, e = ILE->getNumInits(); i != e; ++i)
             if (!isa<ImplicitValueInitExpr>(ILE->getInit(i)))
-              --NumFields;
-        if (ILE->getNumInits() == NumFields && TryMemsetInitialization())
+              --NumElements;
+        if (ILE->getNumInits() == NumElements && TryMemsetInitialization())
           return;
       }
     }
diff --git a/lib/CodeGen/CGExprComplex.cpp b/lib/CodeGen/CGExprComplex.cpp
index 22910d9..af7f190 100644
--- a/lib/CodeGen/CGExprComplex.cpp
+++ b/lib/CodeGen/CGExprComplex.cpp
@@ -13,12 +13,9 @@
 
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/AST/ASTContext.h"
 #include "clang/AST/StmtVisitor.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
@@ -484,6 +481,7 @@
   case CK_BuiltinFnToFnPtr:
   case CK_ZeroToOCLEvent:
   case CK_AddressSpaceConversion:
+  case CK_IntToOCLSampler:
     llvm_unreachable("invalid cast kind for complex value");
 
   case CK_FloatingRealToComplex:
diff --git a/lib/CodeGen/CGExprConstant.cpp b/lib/CodeGen/CGExprConstant.cpp
index 2aed3bb..0e818e9 100644
--- a/lib/CodeGen/CGExprConstant.cpp
+++ b/lib/CodeGen/CGExprConstant.cpp
@@ -368,7 +368,14 @@
 
   unsigned FieldNo = 0;
   unsigned ElementNo = 0;
-  
+
+  // Bail out if we have base classes. We could support these, but they only
+  // arise in C++1z where we will have already constant folded most interesting
+  // cases. FIXME: There are still a few more cases we can handle this way.
+  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
+    if (CXXRD->getNumBases())
+      return false;
+
   for (RecordDecl::field_iterator Field = RD->field_begin(),
        FieldEnd = RD->field_end(); Field != FieldEnd; ++Field, ++FieldNo) {
     // If this is a union, skip all the fields that aren't being initialized.
@@ -683,6 +690,9 @@
     case CK_ConstructorConversion:
       return C;
 
+    case CK_IntToOCLSampler:
+      llvm_unreachable("global sampler variables are not generated");
+
     case CK_Dependent: llvm_unreachable("saw dependent cast!");
 
     case CK_BuiltinFnToFnPtr:
@@ -757,6 +767,12 @@
     return Visit(DIE->getExpr());
   }
 
+  // Look through the wrapper only when its cleanups have no side effects;
+  // otherwise return null.
+  llvm::Constant *VisitExprWithCleanups(ExprWithCleanups *E) {
+    return E->cleanupsHaveSideEffects() ? nullptr : Visit(E->getSubExpr());
+  }
+
   llvm::Constant *VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) {
     return Visit(E->GetTemporaryExpr());
   }
@@ -1124,6 +1140,13 @@
   unsigned FieldNo = -1;
   unsigned ElementNo = 0;
 
+  // Bail out if we have base classes. We could support these, but they only
+  // arise in C++1z where we will have already constant folded most interesting
+  // cases. FIXME: There are still a few more cases we can handle this way.
+  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
+    if (CXXRD->getNumBases())
+      return false;
+
   for (FieldDecl *Field : RD->fields()) {
     ++FieldNo;
 
@@ -1300,8 +1323,14 @@
 
       // Convert to the appropriate type; this could be an lvalue for
       // an integer.
-      if (isa<llvm::PointerType>(DestTy))
+      if (isa<llvm::PointerType>(DestTy)) {
+        // Convert the integer to a pointer-sized integer before converting it
+        // to a pointer.
+        C = llvm::ConstantExpr::getIntegerCast(
+            C, getDataLayout().getIntPtrType(DestTy),
+            /*isSigned=*/false);
         return llvm::ConstantExpr::getIntToPtr(C, DestTy);
+      }
 
       // If the types don't match this should only be a truncate.
       if (C->getType() != DestTy)
diff --git a/lib/CodeGen/CGExprScalar.cpp b/lib/CodeGen/CGExprScalar.cpp
index 5b39b5d4..700b537 100644
--- a/lib/CodeGen/CGExprScalar.cpp
+++ b/lib/CodeGen/CGExprScalar.cpp
@@ -818,7 +818,7 @@
            "Splatted expr doesn't match with vector element type?");
 
     // Splat the element across to all elements
-    unsigned NumElements = cast<llvm::VectorType>(DstTy)->getNumElements();
+    unsigned NumElements = DstTy->getVectorNumElements();
     return Builder.CreateVectorSplat(NumElements, Src, "splat");
   }
 
@@ -984,8 +984,7 @@
 
 Value *ScalarExprEmitter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) {
   // Vector Mask Case
-  if (E->getNumSubExprs() == 2 ||
-      (E->getNumSubExprs() == 3 && E->getExpr(2)->getType()->isVectorType())) {
+  if (E->getNumSubExprs() == 2) {
     Value *LHS = CGF.EmitScalarExpr(E->getExpr(0));
     Value *RHS = CGF.EmitScalarExpr(E->getExpr(1));
     Value *Mask;
@@ -993,22 +992,7 @@
     llvm::VectorType *LTy = cast<llvm::VectorType>(LHS->getType());
     unsigned LHSElts = LTy->getNumElements();
 
-    if (E->getNumSubExprs() == 3) {
-      Mask = CGF.EmitScalarExpr(E->getExpr(2));
-
-      // Shuffle LHS & RHS into one input vector.
-      SmallVector<llvm::Constant*, 32> concat;
-      for (unsigned i = 0; i != LHSElts; ++i) {
-        concat.push_back(Builder.getInt32(2*i));
-        concat.push_back(Builder.getInt32(2*i+1));
-      }
-
-      Value* CV = llvm::ConstantVector::get(concat);
-      LHS = Builder.CreateShuffleVector(LHS, RHS, CV, "concat");
-      LHSElts *= 2;
-    } else {
-      Mask = RHS;
-    }
+    Mask = RHS;
 
     llvm::VectorType *MTy = cast<llvm::VectorType>(Mask->getType());
 
@@ -1411,7 +1395,10 @@
   }
   case CK_AddressSpaceConversion: {
     Value *Src = Visit(const_cast<Expr*>(E));
-    return Builder.CreateAddrSpaceCast(Src, ConvertType(DestTy));
+    // Since target may map different address spaces in AST to the same address
+    // space, an address space conversion may end up as a bitcast.
+    return Builder.CreatePointerBitCastOrAddrSpaceCast(Src,
+                                                       ConvertType(DestTy));
   }
   case CK_AtomicToNonAtomic:
   case CK_NonAtomicToAtomic:
@@ -1542,7 +1529,7 @@
     llvm::Type *DstTy = ConvertType(DestTy);
     Value *Elt = Visit(const_cast<Expr*>(E));
     // Splat the element across to all elements
-    unsigned NumElements = cast<llvm::VectorType>(DstTy)->getNumElements();
+    unsigned NumElements = DstTy->getVectorNumElements();
     return Builder.CreateVectorSplat(NumElements, Elt, "splat");
   }
 
@@ -1586,7 +1573,10 @@
     return llvm::Constant::getNullValue(ConvertType(DestTy));
   }
 
-  }
+  case CK_IntToOCLSampler:
+    return CGF.CGM.createOpenCLIntToSamplerConversion(E, CGF);
+
+  } // end of switch
 
   llvm_unreachable("unknown scalar cast");
 }
@@ -1652,13 +1642,14 @@
       llvm::Value *True = CGF.EmitToMemory(Builder.getTrue(), type);
       if (isPre) {
         Builder.CreateStore(True, LV.getAddress(), LV.isVolatileQualified())
-          ->setAtomic(llvm::SequentiallyConsistent);
+          ->setAtomic(llvm::AtomicOrdering::SequentiallyConsistent);
         return Builder.getTrue();
       }
       // For atomic bool increment, we just store true and return it for
       // preincrement, do an atomic swap with true for postincrement
-        return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
-            LV.getPointer(), True, llvm::SequentiallyConsistent);
+      return Builder.CreateAtomicRMW(
+          llvm::AtomicRMWInst::Xchg, LV.getPointer(), True,
+          llvm::AtomicOrdering::SequentiallyConsistent);
     }
     // Special case for atomic increment / decrement on integers, emit
     // atomicrmw instructions.  We skip this if we want to be doing overflow
@@ -1675,7 +1666,7 @@
       llvm::Value *amt = CGF.EmitToMemory(
           llvm::ConstantInt::get(ConvertType(type), 1, true), type);
       llvm::Value *old = Builder.CreateAtomicRMW(aop,
-          LV.getPointer(), amt, llvm::SequentiallyConsistent);
+          LV.getPointer(), amt, llvm::AtomicOrdering::SequentiallyConsistent);
       return isPre ? Builder.CreateBinOp(op, old, amt) : old;
     }
     value = EmitLoadOfLValue(LV, E->getExprLoc());
@@ -1792,15 +1783,19 @@
       amt = llvm::ConstantFP::get(VMContext,
                                   llvm::APFloat(static_cast<double>(amount)));
     else {
-      // Remaining types are either Half or LongDouble.  Convert from float.
+      // Remaining types are Half, LongDouble or __float128. Convert from float.
       llvm::APFloat F(static_cast<float>(amount));
       bool ignored;
+      const llvm::fltSemantics *FS;
       // Don't use getFloatTypeSemantics because Half isn't
       // necessarily represented using the "half" LLVM type.
-      F.convert(value->getType()->isHalfTy()
-                    ? CGF.getTarget().getHalfFormat()
-                    : CGF.getTarget().getLongDoubleFormat(),
-                llvm::APFloat::rmTowardZero, &ignored);
+      if (value->getType()->isFP128Ty())
+        FS = &CGF.getTarget().getFloat128Format();
+      else if (value->getType()->isHalfTy())
+        FS = &CGF.getTarget().getHalfFormat();
+      else
+        FS = &CGF.getTarget().getLongDoubleFormat();
+      F.convert(*FS, llvm::APFloat::rmTowardZero, &ignored);
       amt = llvm::ConstantFP::get(VMContext, F);
     }
     value = Builder.CreateFAdd(value, amt, isInc ? "inc" : "dec");
@@ -2157,7 +2152,7 @@
                                  E->getExprLoc()),
             LHSTy);
         Builder.CreateAtomicRMW(aop, LHSLV.getPointer(), amt,
-            llvm::SequentiallyConsistent);
+            llvm::AtomicOrdering::SequentiallyConsistent);
         return LHSLV;
       }
     }
@@ -2281,8 +2276,13 @@
 
   if (Ops.LHS->getType()->isFPOrFPVectorTy()) {
     llvm::Value *Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");
-    if (CGF.getLangOpts().OpenCL) {
-      // OpenCL 1.1 7.4: minimum accuracy of single precision / is 2.5ulp
+    if (CGF.getLangOpts().OpenCL &&
+        !CGF.CGM.getCodeGenOpts().CorrectlyRoundedDivSqrt) {
+      // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
+      // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+      // build option allows an application to specify that single precision
+      // floating-point divide (x/y and 1/x) and sqrt used in the program
+      // source are correctly rounded.
       llvm::Type *ValTy = Val->getType();
       if (ValTy->isFloatTy() ||
           (isa<llvm::VectorType>(ValTy) &&
@@ -2714,7 +2714,8 @@
     RHS = Builder.CreateIntCast(RHS, Ops.LHS->getType(), false, "sh_prom");
 
   bool SanitizeBase = CGF.SanOpts.has(SanitizerKind::ShiftBase) &&
-                      Ops.Ty->hasSignedIntegerRepresentation();
+                      Ops.Ty->hasSignedIntegerRepresentation() &&
+                      !CGF.getLangOpts().isSignedOverflowDefined();
   bool SanitizeExponent = CGF.SanOpts.has(SanitizerKind::ShiftExponent);
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
   if (CGF.getLangOpts().OpenCL)
@@ -3366,9 +3367,11 @@
 
   llvm::Type *ArgTy = ConvertType(VE->getType());
 
-  // If EmitVAArg fails, we fall back to the LLVM instruction.
-  if (!ArgPtr.isValid())
-    return Builder.CreateVAArg(ArgValue.getPointer(), ArgTy);
+  // If EmitVAArg fails, emit an error.
+  if (!ArgPtr.isValid()) {
+    CGF.ErrorUnsupported(VE, "va_arg expression");
+    return llvm::UndefValue::get(ArgTy);
+  }
 
   // FIXME Volatility.
   llvm::Value *Val = Builder.CreateLoad(ArgPtr);
@@ -3388,50 +3391,48 @@
   return CGF.EmitBlockLiteral(block);
 }
 
+// Shuffle lanes 0-2 of Src into a vec3, or a vec4 whose last lane is undef.
+static Value *ConvertVec3AndVec4(CGBuilderTy &Builder, CodeGenFunction &CGF,
+                                 Value *Src, unsigned NumElementsDst) {
+  llvm::Value *UnV = llvm::UndefValue::get(Src->getType());
+  SmallVector<llvm::Constant*, 4> Args;
+  Args.push_back(Builder.getInt32(0));
+  Args.push_back(Builder.getInt32(1));
+  Args.push_back(Builder.getInt32(2));
+  if (NumElementsDst == 4)
+    Args.push_back(llvm::UndefValue::get(CGF.Int32Ty));
+  llvm::Constant *Mask = llvm::ConstantVector::get(Args);
+  return Builder.CreateShuffleVector(Src, UnV, Mask);
+}
+
 Value *ScalarExprEmitter::VisitAsTypeExpr(AsTypeExpr *E) {
   Value *Src  = CGF.EmitScalarExpr(E->getSrcExpr());
   llvm::Type *DstTy = ConvertType(E->getType());
 
-  // Going from vec4->vec3 or vec3->vec4 is a special case and requires
-  // a shuffle vector instead of a bitcast.
   llvm::Type *SrcTy = Src->getType();
-  if (isa<llvm::VectorType>(DstTy) && isa<llvm::VectorType>(SrcTy)) {
-    unsigned numElementsDst = cast<llvm::VectorType>(DstTy)->getNumElements();
-    unsigned numElementsSrc = cast<llvm::VectorType>(SrcTy)->getNumElements();
-    if ((numElementsDst == 3 && numElementsSrc == 4)
-        || (numElementsDst == 4 && numElementsSrc == 3)) {
+  unsigned NumElementsSrc = isa<llvm::VectorType>(SrcTy) ?
+    cast<llvm::VectorType>(SrcTy)->getNumElements() : 0;
+  unsigned NumElementsDst = isa<llvm::VectorType>(DstTy) ?
+    cast<llvm::VectorType>(DstTy)->getNumElements() : 0;
 
+  // Going from vec3 to non-vec3 is a special case and requires a shuffle
+  // vector to get a vec4, then a bitcast if the target type is different.
+  if (NumElementsSrc == 3 && NumElementsDst != 3) {
+    Src = ConvertVec3AndVec4(Builder, CGF, Src, 4);
+    Src = Builder.CreateBitCast(Src, DstTy);
+    Src->setName("astype");
+    return Src;
+  }
 
-      // In the case of going from int4->float3, a bitcast is needed before
-      // doing a shuffle.
-      llvm::Type *srcElemTy =
-      cast<llvm::VectorType>(SrcTy)->getElementType();
-      llvm::Type *dstElemTy =
-      cast<llvm::VectorType>(DstTy)->getElementType();
-
-      if ((srcElemTy->isIntegerTy() && dstElemTy->isFloatTy())
-          || (srcElemTy->isFloatTy() && dstElemTy->isIntegerTy())) {
-        // Create a float type of the same size as the source or destination.
-        llvm::VectorType *newSrcTy = llvm::VectorType::get(dstElemTy,
-                                                                 numElementsSrc);
-
-        Src = Builder.CreateBitCast(Src, newSrcTy, "astypeCast");
-      }
-
-      llvm::Value *UnV = llvm::UndefValue::get(Src->getType());
-
-      SmallVector<llvm::Constant*, 3> Args;
-      Args.push_back(Builder.getInt32(0));
-      Args.push_back(Builder.getInt32(1));
-      Args.push_back(Builder.getInt32(2));
-
-      if (numElementsDst == 4)
-        Args.push_back(llvm::UndefValue::get(CGF.Int32Ty));
-
-      llvm::Constant *Mask = llvm::ConstantVector::get(Args);
-
-      return Builder.CreateShuffleVector(Src, UnV, Mask, "astype");
-    }
+  // Going from non-vec3 to vec3 is a special case and requires a bitcast
+  // to vec4 if the original type is not vec4, then a shuffle vector to
+  // get a vec3.
+  if (NumElementsSrc != 3 && NumElementsDst == 3) {
+    auto Vec4Ty = llvm::VectorType::get(DstTy->getVectorElementType(), 4);
+    Src = Builder.CreateBitCast(Src, Vec4Ty);
+    Src = ConvertVec3AndVec4(Builder, CGF, Src, 3);
+    Src->setName("astype");
+    return Src;
   }
 
   return Builder.CreateBitCast(Src, DstTy, "astype");
diff --git a/lib/CodeGen/CGLoopInfo.cpp b/lib/CodeGen/CGLoopInfo.cpp
index 25f1e2e..51474f1 100644
--- a/lib/CodeGen/CGLoopInfo.cpp
+++ b/lib/CodeGen/CGLoopInfo.cpp
@@ -19,12 +19,15 @@
 using namespace clang::CodeGen;
 using namespace llvm;
 
-static MDNode *createMetadata(LLVMContext &Ctx, const LoopAttributes &Attrs) {
+static MDNode *createMetadata(LLVMContext &Ctx, const LoopAttributes &Attrs,
+                              llvm::DebugLoc Location) {
 
   if (!Attrs.IsParallel && Attrs.VectorizeWidth == 0 &&
       Attrs.InterleaveCount == 0 && Attrs.UnrollCount == 0 &&
       Attrs.VectorizeEnable == LoopAttributes::Unspecified &&
-      Attrs.UnrollEnable == LoopAttributes::Unspecified)
+      Attrs.UnrollEnable == LoopAttributes::Unspecified &&
+      Attrs.DistributeEnable == LoopAttributes::Unspecified &&
+      !Location)
     return nullptr;
 
   SmallVector<Metadata *, 4> Args;
@@ -32,6 +35,10 @@
   auto TempNode = MDNode::getTemporary(Ctx, None);
   Args.push_back(TempNode.get());
 
+  // If we have a valid debug location for the loop, add it.
+  if (Location)
+    Args.push_back(Location.getAsMDNode());
+
   // Setting vectorize.width
   if (Attrs.VectorizeWidth > 0) {
     Metadata *Vals[] = {MDString::get(Ctx, "llvm.loop.vectorize.width"),
@@ -78,6 +85,14 @@
     Args.push_back(MDNode::get(Ctx, Vals));
   }
 
+  if (Attrs.DistributeEnable != LoopAttributes::Unspecified) {
+    Metadata *Vals[] = {MDString::get(Ctx, "llvm.loop.distribute.enable"),
+                        ConstantAsMetadata::get(ConstantInt::get(
+                            Type::getInt1Ty(Ctx), (Attrs.DistributeEnable ==
+                                                   LoopAttributes::Enable)))};
+    Args.push_back(MDNode::get(Ctx, Vals));
+  }
+
   // Set the first operand to itself.
   MDNode *LoopID = MDNode::get(Ctx, Args);
   LoopID->replaceOperandWith(0, LoopID);
@@ -87,7 +102,8 @@
 LoopAttributes::LoopAttributes(bool IsParallel)
     : IsParallel(IsParallel), VectorizeEnable(LoopAttributes::Unspecified),
       UnrollEnable(LoopAttributes::Unspecified), VectorizeWidth(0),
-      InterleaveCount(0), UnrollCount(0) {}
+      InterleaveCount(0), UnrollCount(0),
+      DistributeEnable(LoopAttributes::Unspecified) {}
 
 void LoopAttributes::clear() {
   IsParallel = false;
@@ -98,37 +114,62 @@
   UnrollEnable = LoopAttributes::Unspecified;
+  // Reset the distribute hint too, so that nested loops do not inherit it.
+  DistributeEnable = LoopAttributes::Unspecified;
 }
 
-LoopInfo::LoopInfo(BasicBlock *Header, const LoopAttributes &Attrs)
+LoopInfo::LoopInfo(BasicBlock *Header, const LoopAttributes &Attrs,
+                   llvm::DebugLoc Location)
     : LoopID(nullptr), Header(Header), Attrs(Attrs) {
-  LoopID = createMetadata(Header->getContext(), Attrs);
+  LoopID = createMetadata(Header->getContext(), Attrs, Location);
 }
 
-void LoopInfoStack::push(BasicBlock *Header) {
-  Active.push_back(LoopInfo(Header, StagedAttrs));
+void LoopInfoStack::push(BasicBlock *Header, llvm::DebugLoc Location) {
+  Active.push_back(LoopInfo(Header, StagedAttrs, Location));
   // Clear the attributes so nested loops do not inherit them.
   StagedAttrs.clear();
 }
 
 void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx,
-                         ArrayRef<const clang::Attr *> Attrs) {
+                         ArrayRef<const clang::Attr *> Attrs,
+                         llvm::DebugLoc Location) {
 
   // Identify loop hint attributes from Attrs.
   for (const auto *Attr : Attrs) {
     const LoopHintAttr *LH = dyn_cast<LoopHintAttr>(Attr);
+    const OpenCLUnrollHintAttr *OpenCLHint =
+        dyn_cast<OpenCLUnrollHintAttr>(Attr);
 
     // Skip non loop hint attributes
-    if (!LH)
+    if (!LH && !OpenCLHint) {
       continue;
-
-    auto *ValueExpr = LH->getValue();
-    unsigned ValueInt = 1;
-    if (ValueExpr) {
-      llvm::APSInt ValueAPS = ValueExpr->EvaluateKnownConstInt(Ctx);
-      ValueInt = ValueAPS.getSExtValue();
     }
 
-    LoopHintAttr::OptionType Option = LH->getOption();
-    LoopHintAttr::LoopHintState State = LH->getState();
+    LoopHintAttr::OptionType Option = LoopHintAttr::Unroll;
+    LoopHintAttr::LoopHintState State = LoopHintAttr::Disable;
+    unsigned ValueInt = 1;
+    // Translate opencl_unroll_hint attribute argument to
+    // equivalent LoopHintAttr enums.
+    // OpenCL v2.0 s6.11.5:
+    // 0 - full unroll (no argument).
+    // 1 - disable unroll.
+    // other positive integer n - unroll by n.
+    if (OpenCLHint) {
+      ValueInt = OpenCLHint->getUnrollHint();
+      if (ValueInt == 0) {
+        State = LoopHintAttr::Full;
+      } else if (ValueInt != 1) {
+        Option = LoopHintAttr::UnrollCount;
+        State = LoopHintAttr::Numeric;
+      }
+    } else if (LH) {
+      auto *ValueExpr = LH->getValue();
+      if (ValueExpr) {
+        llvm::APSInt ValueAPS = ValueExpr->EvaluateKnownConstInt(Ctx);
+        ValueInt = ValueAPS.getSExtValue();
+      }
+
+      Option = LH->getOption();
+      State = LH->getState();
+    }
     switch (State) {
     case LoopHintAttr::Disable:
       switch (Option) {
@@ -143,6 +182,9 @@
       case LoopHintAttr::Unroll:
         setUnrollState(LoopAttributes::Disable);
         break;
+      case LoopHintAttr::Distribute:
+        setDistributeState(false);
+        break;
       case LoopHintAttr::UnrollCount:
       case LoopHintAttr::VectorizeWidth:
       case LoopHintAttr::InterleaveCount:
@@ -159,6 +201,9 @@
       case LoopHintAttr::Unroll:
         setUnrollState(LoopAttributes::Enable);
         break;
+      case LoopHintAttr::Distribute:
+        setDistributeState(true);
+        break;
       case LoopHintAttr::UnrollCount:
       case LoopHintAttr::VectorizeWidth:
       case LoopHintAttr::InterleaveCount:
@@ -178,6 +223,7 @@
       case LoopHintAttr::UnrollCount:
       case LoopHintAttr::VectorizeWidth:
       case LoopHintAttr::InterleaveCount:
+      case LoopHintAttr::Distribute:
         llvm_unreachable("Options cannot be used to assume mem safety.");
         break;
       }
@@ -192,6 +238,7 @@
       case LoopHintAttr::UnrollCount:
       case LoopHintAttr::VectorizeWidth:
       case LoopHintAttr::InterleaveCount:
+      case LoopHintAttr::Distribute:
         llvm_unreachable("Options cannot be used with 'full' hint.");
         break;
       }
@@ -210,6 +257,7 @@
       case LoopHintAttr::Unroll:
       case LoopHintAttr::Vectorize:
       case LoopHintAttr::Interleave:
+      case LoopHintAttr::Distribute:
         llvm_unreachable("Options cannot be assigned a value.");
         break;
       }
@@ -218,7 +266,7 @@
   }
 
   /// Stage the attributes.
-  push(Header);
+  push(Header, Location);
 }
 
 void LoopInfoStack::pop() {
diff --git a/lib/CodeGen/CGLoopInfo.h b/lib/CodeGen/CGLoopInfo.h
index ec33906..76a039d 100644
--- a/lib/CodeGen/CGLoopInfo.h
+++ b/lib/CodeGen/CGLoopInfo.h
@@ -16,8 +16,8 @@
 #define LLVM_CLANG_LIB_CODEGEN_CGLOOPINFO_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Compiler.h"
 
@@ -57,13 +57,17 @@
 
   /// \brief llvm.unroll.
   unsigned UnrollCount;
+
+  /// \brief Value for llvm.loop.distribute.enable metadata.
+  LVEnableState DistributeEnable;
 };
 
 /// \brief Information used when generating a structured loop.
 class LoopInfo {
 public:
   /// \brief Construct a new LoopInfo for the loop with entry Header.
-  LoopInfo(llvm::BasicBlock *Header, const LoopAttributes &Attrs);
+  LoopInfo(llvm::BasicBlock *Header, const LoopAttributes &Attrs,
+           llvm::DebugLoc Location);
 
   /// \brief Get the loop id metadata for this loop.
   llvm::MDNode *getLoopID() const { return LoopID; }
@@ -95,12 +99,14 @@
 
   /// \brief Begin a new structured loop. The set of staged attributes will be
   /// applied to the loop and then cleared.
-  void push(llvm::BasicBlock *Header);
+  void push(llvm::BasicBlock *Header,
+            llvm::DebugLoc Location = llvm::DebugLoc());
 
   /// \brief Begin a new structured loop. Stage attributes from the Attrs list.
   /// The staged attributes are applied to the loop and then cleared.
   void push(llvm::BasicBlock *Header, clang::ASTContext &Ctx,
-            llvm::ArrayRef<const Attr *> Attrs);
+            llvm::ArrayRef<const Attr *> Attrs,
+            llvm::DebugLoc Location = llvm::DebugLoc());
 
   /// \brief End the current loop.
   void pop();
@@ -126,6 +132,12 @@
         Enable ? LoopAttributes::Enable : LoopAttributes::Disable;
   }
 
+  /// \brief Enable or disable loop distribution for the next pushed loop.
+  void setDistributeState(bool Enable = true) {
+    StagedAttrs.DistributeEnable =
+        Enable ? LoopAttributes::Enable : LoopAttributes::Disable;
+  }
+
   /// \brief Set the next pushed loop unroll state.
   void setUnrollState(const LoopAttributes::LVEnableState &State) {
     StagedAttrs.UnrollEnable = State;
diff --git a/lib/CodeGen/CGObjC.cpp b/lib/CodeGen/CGObjC.cpp
index 41f45d2..db894ce 100644
--- a/lib/CodeGen/CGObjC.cpp
+++ b/lib/CodeGen/CGObjC.cpp
@@ -905,7 +905,7 @@
     Address ivarAddr = LV.getAddress();
     ivarAddr = Builder.CreateBitCast(ivarAddr, bitcastType);
     llvm::LoadInst *load = Builder.CreateLoad(ivarAddr, "load");
-    load->setAtomic(llvm::Unordered);
+    load->setAtomic(llvm::AtomicOrdering::Unordered);
 
     // Store that value into the return address.  Doing this with a
     // bitcast is likely to produce some pretty ugly IR, but it's not
@@ -1190,7 +1190,7 @@
 
     // Perform an atomic store.  There are no memory ordering requirements.
     llvm::StoreInst *store = Builder.CreateStore(load, ivarAddr);
-    store->setAtomic(llvm::Unordered);
+    store->setAtomic(llvm::AtomicOrdering::Unordered);
     return;
   }
 
diff --git a/lib/CodeGen/CGObjCGNU.cpp b/lib/CodeGen/CGObjCGNU.cpp
index e4074a7..0a6c60a 100644
--- a/lib/CodeGen/CGObjCGNU.cpp
+++ b/lib/CodeGen/CGObjCGNU.cpp
@@ -35,11 +35,9 @@
 #include "llvm/Support/Compiler.h"
 #include <cstdarg>
 
-
 using namespace clang;
 using namespace CodeGen;
 
-
 namespace {
 /// Class that lazily initialises the runtime function.  Avoids inserting the
 /// types and the function declaration into a module if they're not used, and
@@ -161,6 +159,7 @@
   /// runtime provides some LLVM passes that can use this to do things like
   /// automatic IMP caching and speculative inlining.
   unsigned msgSendMDKind;
+
   /// Helper function that generates a constant string and returns a pointer to
   /// the start of the string.  The result of this function can be used anywhere
   /// where the C code specifies const char*.  
@@ -170,6 +169,7 @@
     return llvm::ConstantExpr::getGetElementPtr(Array.getElementType(),
                                                 Array.getPointer(), Zeros);
   }
+
   /// Emits a linkonce_odr string, whose name is the prefix followed by the
   /// string value.  This allows the linker to combine the strings between
   /// different modules.  Used for EH typeinfo names, selector strings, and a
@@ -186,6 +186,7 @@
     return llvm::ConstantExpr::getGetElementPtr(ConstStr->getValueType(),
                                                 ConstStr, Zeros);
   }
+
   /// Generates a global structure, initialized by the elements in the vector.
   /// The element types must match the types of the structure elements in the
   /// first argument.
@@ -201,6 +202,7 @@
     GV->setAlignment(Align.getQuantity());
     return GV;
   }
+
   /// Generates a global array.  The vector must contain the same number of
   /// elements that the array type declares, of the type specified as the array
   /// element type.
@@ -216,6 +218,7 @@
     GV->setAlignment(Align.getQuantity());
     return GV;
   }
+
   /// Generates a global array, inferring the array type from the specified
   /// element type and the size of the initialiser.  
   llvm::GlobalVariable *MakeGlobalArray(llvm::Type *Ty,
@@ -227,6 +230,7 @@
     llvm::ArrayType *ArrayTy = llvm::ArrayType::get(Ty, V.size());
     return MakeGlobal(ArrayTy, V, Align, Name, linkage);
   }
+
   /// Returns a property name and encoding string.
   llvm::Constant *MakePropertyEncodingString(const ObjCPropertyDecl *PD,
                                              const Decl *Container) {
@@ -245,6 +249,7 @@
     }
     return MakeConstantString(PD->getNameAsString());
   }
+
   /// Push the property attributes into two structure fields. 
   void PushPropertyAttributes(std::vector<llvm::Constant*> &Fields,
       ObjCPropertyDecl *property, bool isSynthesized=true, bool
@@ -273,6 +278,7 @@
     Fields.push_back(llvm::ConstantInt::get(Int8Ty, 0));
     Fields.push_back(llvm::ConstantInt::get(Int8Ty, 0));
   }
+
   /// Ensures that the value has the required type, by inserting a bitcast if
   /// required.  This function lets us avoid inserting bitcasts that are
   /// redundant.
@@ -284,12 +290,14 @@
     if (V.getType() == Ty) return V;
     return B.CreateBitCast(V, Ty);
   }
+
   // Some zeros used for GEPs in lots of places.
   llvm::Constant *Zeros[2];
   /// Null pointer value.  Mainly used as a terminator in various arrays.
   llvm::Constant *NULLPtr;
   /// LLVM context.
   llvm::LLVMContext &VMContext;
+
 private:
   /// Placeholder for the class.  Lots of things refer to the class before we've
   /// actually emitted it.  We use this alias as a placeholder, and then replace
@@ -360,7 +368,6 @@
   LazyRuntimeFunction SyncExitFn;
 
 private:
-
   /// Function called if fast enumeration detects that the collection is
   /// modified during the update.
   LazyRuntimeFunction EnumerationMutationFn;
@@ -385,7 +392,7 @@
   /// Objective-C 1 property structures when targeting the GCC runtime or it
   /// will abort.
   const int ProtocolVersion;
-private:
+
   /// Generates an instance variable list structure.  This is a structure
   /// containing a size and an array of structures containing instance variable
   /// metadata.  This is used purely for introspection in the fragile ABI.  In
@@ -393,6 +400,7 @@
   llvm::Constant *GenerateIvarList(ArrayRef<llvm::Constant *> IvarNames,
                                    ArrayRef<llvm::Constant *> IvarTypes,
                                    ArrayRef<llvm::Constant *> IvarOffsets);
+
   /// Generates a method list structure.  This is a structure containing a size
   /// and an array of structures containing method metadata.
   ///
@@ -403,23 +411,28 @@
       ArrayRef<Selector> MethodSels,
       ArrayRef<llvm::Constant *> MethodTypes,
       bool isClassMethodList);
+
   /// Emits an empty protocol.  This is used for \@protocol() where no protocol
   /// is found.  The runtime will (hopefully) fix up the pointer to refer to the
   /// real protocol.
   llvm::Constant *GenerateEmptyProtocol(const std::string &ProtocolName);
+
   /// Generates a list of property metadata structures.  This follows the same
   /// pattern as method and instance variable metadata lists.
   llvm::Constant *GeneratePropertyList(const ObjCImplementationDecl *OID,
         SmallVectorImpl<Selector> &InstanceMethodSels,
         SmallVectorImpl<llvm::Constant*> &InstanceMethodTypes);
+
   /// Generates a list of referenced protocols.  Classes, categories, and
   /// protocols all use this structure.
   llvm::Constant *GenerateProtocolList(ArrayRef<std::string> Protocols);
+
   /// To ensure that all protocols are seen by the runtime, we add a category on
   /// a class defined in the runtime, declaring no methods, but adopting the
   /// protocols.  This is a horribly ugly hack, but it allows us to collect all
   /// of the protocols without changing the ABI.
   void GenerateProtocolHolderCategory();
+
   /// Generates a class structure.
   llvm::Constant *GenerateClassStructure(
       llvm::Constant *MetaClass,
@@ -436,25 +449,31 @@
       llvm::Constant *StrongIvarBitmap,
       llvm::Constant *WeakIvarBitmap,
       bool isMeta=false);
+
   /// Generates a method list.  This is used by protocols to define the required
   /// and optional methods.
   llvm::Constant *GenerateProtocolMethodList(
       ArrayRef<llvm::Constant *> MethodNames,
       ArrayRef<llvm::Constant *> MethodTypes);
+
   /// Returns a selector with the specified type encoding.  An empty string is
   /// used to return an untyped selector (with the types field set to NULL).
   llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel,
                            const std::string &TypeEncoding);
+
   /// Returns the variable used to store the offset of an instance variable.
   llvm::GlobalVariable *ObjCIvarOffsetVariable(const ObjCInterfaceDecl *ID,
       const ObjCIvarDecl *Ivar);
   /// Emits a reference to a class.  This allows the linker to object if there
   /// is no class of the matching name.
+
 protected:
   void EmitClassRef(const std::string &className);
+
   /// Emits a pointer to the named class
   virtual llvm::Value *GetClassNamed(CodeGenFunction &CGF,
                                      const std::string &Name, bool isWeak);
+
   /// Looks up the method for sending a message to the specified object.  This
   /// mechanism differs between the GCC and GNU runtimes, so this method must be
   /// overridden in subclasses.
@@ -463,6 +482,7 @@
                                  llvm::Value *cmd,
                                  llvm::MDNode *node,
                                  MessageSendInfo &MSI) = 0;
+
   /// Looks up the method for sending a message to a superclass.  This
   /// mechanism differs between the GCC and GNU runtimes, so this method must
   /// be overridden in subclasses.
@@ -470,6 +490,7 @@
                                       Address ObjCSuper,
                                       llvm::Value *cmd,
                                       MessageSendInfo &MSI) = 0;
+
   /// Libobjc2 uses a bitfield representation where small(ish) bitfields are
   /// stored in a 64-bit value with the low bit set to 1 and the remaining 63
   /// bits set to their values, LSB first, while larger ones are stored in a
@@ -482,6 +503,7 @@
   /// a bitfield with the 64th bit set will be (int64_t)&{ 2, [0, 1<<31] },
   /// while a bitfield / with the 63rd bit set will be 1<<64.
   llvm::Constant *MakeBitField(ArrayRef<bool> bits);
+
 public:
   CGObjCGNU(CodeGenModule &cgm, unsigned runtimeABIVersion,
       unsigned protocolClassVersion);
@@ -569,12 +591,12 @@
     return NULLPtr;
   }
 
-  llvm::Constant *GetClassGlobal(const std::string &Name,
-                                 bool ForDefinition,
-                                 bool Weak) override {
+  llvm::Constant *GetClassGlobal(StringRef Name, bool ForDefinition,
+                                 bool Weak, bool DLLImport) override {
     return nullptr;
   }
 };
+
 /// Class representing the legacy GCC Objective-C ABI.  This is the default when
 /// -fobjc-nonfragile-abi is not specified.
 ///
@@ -591,6 +613,7 @@
   /// structure describing the receiver and the class, and a selector as
   /// arguments.  Returns the IMP for the corresponding method.
   LazyRuntimeFunction MsgLookupSuperFn;
+
 protected:
   llvm::Value *LookupIMP(CodeGenFunction &CGF, llvm::Value *&Receiver,
                          llvm::Value *cmd, llvm::MDNode *node,
@@ -603,23 +626,26 @@
     imp->setMetadata(msgSendMDKind, node);
     return imp.getInstruction();
   }
+
   llvm::Value *LookupIMPSuper(CodeGenFunction &CGF, Address ObjCSuper,
                               llvm::Value *cmd, MessageSendInfo &MSI) override {
-      CGBuilderTy &Builder = CGF.Builder;
-      llvm::Value *lookupArgs[] = {EnforceType(Builder, ObjCSuper,
-          PtrToObjCSuperTy).getPointer(), cmd};
-      return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFn, lookupArgs);
-    }
-  public:
-    CGObjCGCC(CodeGenModule &Mod) : CGObjCGNU(Mod, 8, 2) {
-      // IMP objc_msg_lookup(id, SEL);
-      MsgLookupFn.init(&CGM, "objc_msg_lookup", IMPTy, IdTy, SelectorTy,
-                       nullptr);
-      // IMP objc_msg_lookup_super(struct objc_super*, SEL);
-      MsgLookupSuperFn.init(&CGM, "objc_msg_lookup_super", IMPTy,
-              PtrToObjCSuperTy, SelectorTy, nullptr);
-    }
+    CGBuilderTy &Builder = CGF.Builder;
+    llvm::Value *lookupArgs[] = {EnforceType(Builder, ObjCSuper,
+        PtrToObjCSuperTy).getPointer(), cmd};
+    return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFn, lookupArgs);
+  }
+
+public:
+  CGObjCGCC(CodeGenModule &Mod) : CGObjCGNU(Mod, 8, 2) {
+    // IMP objc_msg_lookup(id, SEL);
+    MsgLookupFn.init(&CGM, "objc_msg_lookup", IMPTy, IdTy, SelectorTy,
+                     nullptr);
+    // IMP objc_msg_lookup_super(struct objc_super*, SEL);
+    MsgLookupSuperFn.init(&CGM, "objc_msg_lookup_super", IMPTy,
+                          PtrToObjCSuperTy, SelectorTy, nullptr);
+  }
 };
+
 /// Class used when targeting the new GNUstep runtime ABI.
 class CGObjCGNUstep : public CGObjCGNU {
     /// The slot lookup function.  Returns a pointer to a cacheable structure
@@ -647,8 +673,10 @@
     /// Type of an slot structure pointer.  This is returned by the various
     /// lookup functions.
     llvm::Type *SlotTy;
+
   public:
     llvm::Constant *GetEHType(QualType T) override;
+
   protected:
     llvm::Value *LookupIMP(CodeGenFunction &CGF, llvm::Value *&Receiver,
                            llvm::Value *cmd, llvm::MDNode *node,
@@ -690,6 +718,7 @@
       Receiver = Builder.CreateLoad(ReceiverPtr, true);
       return imp;
     }
+
     llvm::Value *LookupIMPSuper(CodeGenFunction &CGF, Address ObjCSuper,
                                 llvm::Value *cmd,
                                 MessageSendInfo &MSI) override {
@@ -703,6 +732,7 @@
       return Builder.CreateAlignedLoad(Builder.CreateStructGEP(nullptr, slot, 4),
                                        CGF.getPointerAlign());
     }
+
   public:
     CGObjCGNUstep(CodeGenModule &Mod) : CGObjCGNU(Mod, 9, 3) {
       const ObjCRuntime &R = CGM.getLangOpts().ObjCRuntime;
@@ -754,6 +784,7 @@
       CxxAtomicObjectGetFn.init(&CGM, "objc_getCppObjectAtomic", VoidTy, PtrTy,
           PtrTy, PtrTy, nullptr);
     }
+
     llvm::Constant *GetCppAtomicObjectGetFunction() override {
       // The optimised functions were added in version 1.7 of the GNUstep
       // runtime.
@@ -761,6 +792,7 @@
           VersionTuple(1, 7));
       return CxxAtomicObjectGetFn;
     }
+
     llvm::Constant *GetCppAtomicObjectSetFunction() override {
       // The optimised functions were added in version 1.7 of the GNUstep
       // runtime.
@@ -768,6 +800,7 @@
           VersionTuple(1, 7));
       return CxxAtomicObjectSetFn;
     }
+
     llvm::Constant *GetOptimizedPropertySetFunction(bool atomic,
                                                     bool copy) override {
       // The optimised property functions omit the GC check, and so are not
@@ -822,32 +855,29 @@
 
   llvm::Value *LookupIMPSuper(CodeGenFunction &CGF, Address ObjCSuper,
                               llvm::Value *cmd, MessageSendInfo &MSI) override {
-      CGBuilderTy &Builder = CGF.Builder;
-      llvm::Value *lookupArgs[] = {EnforceType(Builder, ObjCSuper.getPointer(),
-          PtrToObjCSuperTy), cmd};
+    CGBuilderTy &Builder = CGF.Builder;
+    llvm::Value *lookupArgs[] = {
+        EnforceType(Builder, ObjCSuper.getPointer(), PtrToObjCSuperTy), cmd,
+    };
 
-      if (CGM.ReturnTypeUsesSRet(MSI.CallInfo))
-        return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFnSRet, lookupArgs);
-      else
-        return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFn, lookupArgs);
-    }
+    if (CGM.ReturnTypeUsesSRet(MSI.CallInfo))
+      return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFnSRet, lookupArgs);
+    else
+      return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFn, lookupArgs);
+  }
 
-  llvm::Value *GetClassNamed(CodeGenFunction &CGF,
-                             const std::string &Name, bool isWeak) override {
+  llvm::Value *GetClassNamed(CodeGenFunction &CGF, const std::string &Name,
+                             bool isWeak) override {
     if (isWeak)
       return CGObjCGNU::GetClassNamed(CGF, Name, isWeak);
 
     EmitClassRef(Name);
-
     std::string SymbolName = "_OBJC_CLASS_" + Name;
-
     llvm::GlobalVariable *ClassSymbol = TheModule.getGlobalVariable(SymbolName);
-
     if (!ClassSymbol)
       ClassSymbol = new llvm::GlobalVariable(TheModule, LongTy, false,
                                              llvm::GlobalValue::ExternalLinkage,
                                              nullptr, SymbolName);
-
     return ClassSymbol;
   }
 
@@ -866,7 +896,6 @@
 };
 } // end anonymous namespace
 
-
 /// Emits a reference to a dummy variable which is emitted with each class.
 /// This ensures that a linker error will be generated when trying to link
 /// together modules where a referenced class is not defined.
@@ -1022,8 +1051,7 @@
 }
 
 llvm::Value *CGObjCGNU::GetClassNamed(CodeGenFunction &CGF,
-                                      const std::string &Name,
-                                      bool isWeak) {
+                                      const std::string &Name, bool isWeak) {
   llvm::Constant *ClassName = MakeConstantString(Name);
   // With the incompatible ABI, this will need to be replaced with a direct
   // reference to the class symbol.  For the compatible nonfragile ABI we are
@@ -1045,15 +1073,48 @@
 // techniques can modify the name -> class mapping.
 llvm::Value *CGObjCGNU::GetClass(CodeGenFunction &CGF,
                                  const ObjCInterfaceDecl *OID) {
-  return GetClassNamed(CGF, OID->getNameAsString(), OID->isWeakImported());
+  auto *Value =
+      GetClassNamed(CGF, OID->getNameAsString(), OID->isWeakImported());
+  if (CGM.getTriple().isOSBinFormatCOFF()) {
+    if (auto *ClassSymbol = dyn_cast<llvm::GlobalVariable>(Value)) {
+      auto DLLStorage = llvm::GlobalValue::DefaultStorageClass;
+      if (OID->hasAttr<DLLExportAttr>())
+        DLLStorage = llvm::GlobalValue::DLLExportStorageClass;
+      else if (OID->hasAttr<DLLImportAttr>())
+        DLLStorage = llvm::GlobalValue::DLLImportStorageClass;
+      ClassSymbol->setDLLStorageClass(DLLStorage);
+    }
+  }
+  return Value;
 }
+
 llvm::Value *CGObjCGNU::EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) {
-  return GetClassNamed(CGF, "NSAutoreleasePool", false);
+  auto *Value  = GetClassNamed(CGF, "NSAutoreleasePool", false);
+  if (CGM.getTriple().isOSBinFormatCOFF()) {
+    if (auto *ClassSymbol = dyn_cast<llvm::GlobalVariable>(Value)) {
+      IdentifierInfo &II = CGF.CGM.getContext().Idents.get("NSAutoreleasePool");
+      TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+      DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+
+      const VarDecl *VD = nullptr;
+      for (const auto &Result : DC->lookup(&II))
+        if ((VD = dyn_cast<VarDecl>(Result)))
+          break;
+
+      auto DLLStorage = llvm::GlobalValue::DefaultStorageClass;
+      if (!VD || VD->hasAttr<DLLImportAttr>())
+        DLLStorage = llvm::GlobalValue::DLLImportStorageClass;
+      else if (VD->hasAttr<DLLExportAttr>())
+        DLLStorage = llvm::GlobalValue::DLLExportStorageClass;
+
+      ClassSymbol->setDLLStorageClass(DLLStorage);
+    }
+  }
+  return Value;
 }
 
 llvm::Value *CGObjCGNU::GetSelector(CodeGenFunction &CGF, Selector Sel,
                                     const std::string &TypeEncoding) {
-
   SmallVectorImpl<TypedSelector> &Types = SelectorTable[Sel];
   llvm::GlobalAlias *SelValue = nullptr;
 
@@ -1248,8 +1309,6 @@
   }
 
   llvm::Value *cmd = GetSelector(CGF, Sel);
-
-
   CallArgList ActualArgs;
 
   ActualArgs.add(RValue::get(EnforceType(Builder, Receiver, IdTy)), ASTIdTy);
@@ -1498,21 +1557,17 @@
     IMPTy, //Method pointer
     nullptr);
   std::vector<llvm::Constant*> Methods;
-  std::vector<llvm::Constant*> Elements;
   for (unsigned int i = 0, e = MethodTypes.size(); i < e; ++i) {
-    Elements.clear();
     llvm::Constant *Method =
       TheModule.getFunction(SymbolNameForMethod(ClassName, CategoryName,
                                                 MethodSels[i],
                                                 isClassMethodList));
     assert(Method && "Can't generate metadata for method that doesn't exist");
     llvm::Constant *C = MakeConstantString(MethodSels[i].getAsString());
-    Elements.push_back(C);
-    Elements.push_back(MethodTypes[i]);
     Method = llvm::ConstantExpr::getBitCast(Method,
         IMPTy);
-    Elements.push_back(Method);
-    Methods.push_back(llvm::ConstantStruct::get(ObjCMethodTy, Elements));
+    Methods.push_back(
+        llvm::ConstantStruct::get(ObjCMethodTy, {C, MethodTypes[i], Method}));
   }
 
   // Array of method structures
@@ -1555,23 +1610,18 @@
     IntTy,
     nullptr);
   std::vector<llvm::Constant*> Ivars;
-  std::vector<llvm::Constant*> Elements;
   for (unsigned int i = 0, e = IvarNames.size() ; i < e ; i++) {
-    Elements.clear();
-    Elements.push_back(IvarNames[i]);
-    Elements.push_back(IvarTypes[i]);
-    Elements.push_back(IvarOffsets[i]);
-    Ivars.push_back(llvm::ConstantStruct::get(ObjCIvarTy, Elements));
+    Ivars.push_back(llvm::ConstantStruct::get(
+        ObjCIvarTy, {IvarNames[i], IvarTypes[i], IvarOffsets[i]}));
   }
 
   // Array of method structures
   llvm::ArrayType *ObjCIvarArrayTy = llvm::ArrayType::get(ObjCIvarTy,
       IvarNames.size());
 
-
-  Elements.clear();
-  Elements.push_back(llvm::ConstantInt::get(IntTy, (int)IvarNames.size()));
-  Elements.push_back(llvm::ConstantArray::get(ObjCIvarArrayTy, Ivars));
+  llvm::Constant *Elements[] = {
+      llvm::ConstantInt::get(IntTy, (int)IvarNames.size()),
+      llvm::ConstantArray::get(ObjCIvarArrayTy, Ivars)};
   // Structure containing array and array count
   llvm::StructType *ObjCIvarListTy = llvm::StructType::get(IntTy,
     ObjCIvarArrayTy,
@@ -1683,12 +1733,9 @@
     PtrToInt8Ty,
     nullptr);
   std::vector<llvm::Constant*> Methods;
-  std::vector<llvm::Constant*> Elements;
   for (unsigned int i = 0, e = MethodTypes.size() ; i < e ; i++) {
-    Elements.clear();
-    Elements.push_back(MethodNames[i]);
-    Elements.push_back(MethodTypes[i]);
-    Methods.push_back(llvm::ConstantStruct::get(ObjCMethodDescTy, Elements));
+    Methods.push_back(llvm::ConstantStruct::get(
+        ObjCMethodDescTy, {MethodNames[i], MethodTypes[i]}));
   }
   llvm::ArrayType *ObjCMethodArrayTy = llvm::ArrayType::get(ObjCMethodDescTy,
       MethodNames.size());
@@ -1763,17 +1810,13 @@
       MethodList->getType(),
       MethodList->getType(),
       nullptr);
-  std::vector<llvm::Constant*> Elements;
   // The isa pointer must be set to a magic number so the runtime knows it's
   // the correct layout.
-  Elements.push_back(llvm::ConstantExpr::getIntToPtr(
-        llvm::ConstantInt::get(Int32Ty, ProtocolVersion), IdTy));
-  Elements.push_back(MakeConstantString(ProtocolName, ".objc_protocol_name"));
-  Elements.push_back(ProtocolList);
-  Elements.push_back(MethodList);
-  Elements.push_back(MethodList);
-  Elements.push_back(MethodList);
-  Elements.push_back(MethodList);
+  llvm::Constant *Elements[] = {
+      llvm::ConstantExpr::getIntToPtr(
+          llvm::ConstantInt::get(Int32Ty, ProtocolVersion), IdTy),
+      MakeConstantString(ProtocolName, ".objc_protocol_name"), ProtocolList,
+      MethodList, MethodList, MethodList, MethodList};
   return MakeGlobal(ProtocolTy, Elements, CGM.getPointerAlign(),
                     ".objc_protocol");
 }
@@ -1921,19 +1964,14 @@
       PropertyList->getType(),
       OptionalPropertyList->getType(),
       nullptr);
-  std::vector<llvm::Constant*> Elements;
   // The isa pointer must be set to a magic number so the runtime knows it's
   // the correct layout.
-  Elements.push_back(llvm::ConstantExpr::getIntToPtr(
-        llvm::ConstantInt::get(Int32Ty, ProtocolVersion), IdTy));
-  Elements.push_back(MakeConstantString(ProtocolName, ".objc_protocol_name"));
-  Elements.push_back(ProtocolList);
-  Elements.push_back(InstanceMethodList);
-  Elements.push_back(ClassMethodList);
-  Elements.push_back(OptionalInstanceMethodList);
-  Elements.push_back(OptionalClassMethodList);
-  Elements.push_back(PropertyList);
-  Elements.push_back(OptionalPropertyList);
+  llvm::Constant *Elements[] = {
+      llvm::ConstantExpr::getIntToPtr(
+          llvm::ConstantInt::get(Int32Ty, ProtocolVersion), IdTy),
+      MakeConstantString(ProtocolName, ".objc_protocol_name"), ProtocolList,
+      InstanceMethodList, ClassMethodList, OptionalInstanceMethodList,
+      OptionalClassMethodList, PropertyList, OptionalPropertyList};
   ExistingProtocols[ProtocolName] =
     llvm::ConstantExpr::getBitCast(MakeGlobal(ProtocolTy, Elements,
           CGM.getPointerAlign(), ".objc_protocol"), IdTy);
@@ -2059,20 +2097,20 @@
        E = Protos.end(); I != E; ++I)
     Protocols.push_back((*I)->getNameAsString());
 
-  std::vector<llvm::Constant*> Elements;
-  Elements.push_back(MakeConstantString(CategoryName));
-  Elements.push_back(MakeConstantString(ClassName));
-  // Instance method list
-  Elements.push_back(llvm::ConstantExpr::getBitCast(GenerateMethodList(
-          ClassName, CategoryName, InstanceMethodSels, InstanceMethodTypes,
-          false), PtrTy));
-  // Class method list
-  Elements.push_back(llvm::ConstantExpr::getBitCast(GenerateMethodList(
-          ClassName, CategoryName, ClassMethodSels, ClassMethodTypes, true),
-        PtrTy));
-  // Protocol list
-  Elements.push_back(llvm::ConstantExpr::getBitCast(
-        GenerateProtocolList(Protocols), PtrTy));
+  llvm::Constant *Elements[] = {
+      MakeConstantString(CategoryName), MakeConstantString(ClassName),
+      // Instance method list
+      llvm::ConstantExpr::getBitCast(
+          GenerateMethodList(ClassName, CategoryName, InstanceMethodSels,
+                             InstanceMethodTypes, false),
+          PtrTy),
+      // Class method list
+      llvm::ConstantExpr::getBitCast(GenerateMethodList(ClassName, CategoryName,
+                                                        ClassMethodSels,
+                                                        ClassMethodTypes, true),
+                                     PtrTy),
+      // Protocol list
+      llvm::ConstantExpr::getBitCast(GenerateProtocolList(Protocols), PtrTy)};
   Categories.push_back(llvm::ConstantExpr::getBitCast(
         MakeGlobal(llvm::StructType::get(PtrToInt8Ty, PtrToInt8Ty,
             PtrTy, PtrTy, PtrTy, nullptr), Elements, CGM.getPointerAlign()),
@@ -2168,18 +2206,19 @@
 
   // Get the class name
   ObjCInterfaceDecl *ClassDecl =
-    const_cast<ObjCInterfaceDecl *>(OID->getClassInterface());
+      const_cast<ObjCInterfaceDecl *>(OID->getClassInterface());
   std::string ClassName = ClassDecl->getNameAsString();
+
   // Emit the symbol that is used to generate linker errors if this class is
   // referenced in other modules but not declared.
   std::string classSymbolName = "__objc_class_name_" + ClassName;
-  if (llvm::GlobalVariable *symbol =
-      TheModule.getGlobalVariable(classSymbolName)) {
+  if (auto *symbol = TheModule.getGlobalVariable(classSymbolName)) {
     symbol->setInitializer(llvm::ConstantInt::get(LongTy, 0));
   } else {
     new llvm::GlobalVariable(TheModule, LongTy, false,
-    llvm::GlobalValue::ExternalLinkage, llvm::ConstantInt::get(LongTy, 0),
-    classSymbolName);
+                             llvm::GlobalValue::ExternalLinkage,
+                             llvm::ConstantInt::get(LongTy, 0),
+                             classSymbolName);
   }
 
   // Get the size of instances.
@@ -2257,7 +2296,6 @@
     MakeGlobalArray(PtrToIntTy, IvarOffsetValues, CGM.getPointerAlign(),
                     ".ivar.offsets");
 
-
   // Collect information about instance methods
   SmallVector<Selector, 16> InstanceMethodSels;
   SmallVector<llvm::Constant*, 16> InstanceMethodTypes;
@@ -2271,7 +2309,6 @@
   llvm::Constant *Properties = GeneratePropertyList(OID, InstanceMethodSels,
           InstanceMethodTypes);
 
-
   // Collect information about class methods
   SmallVector<Selector, 16> ClassMethodSels;
   SmallVector<llvm::Constant*, 16> ClassMethodTypes;
@@ -2344,19 +2381,35 @@
       ++ivarIndex;
   }
   llvm::Constant *ZeroPtr = llvm::ConstantInt::get(IntPtrTy, 0);
+
   //Generate metaclass for class methods
-  llvm::Constant *MetaClassStruct = GenerateClassStructure(NULLPtr,
-      NULLPtr, 0x12L, ClassName.c_str(), nullptr, Zeros[0], GenerateIvarList(
-        empty, empty, empty), ClassMethodList, NULLPtr,
-      NULLPtr, NULLPtr, ZeroPtr, ZeroPtr, true);
+  llvm::Constant *MetaClassStruct = GenerateClassStructure(
+      NULLPtr, NULLPtr, 0x12L, ClassName.c_str(), nullptr, Zeros[0],
+      GenerateIvarList(empty, empty, empty), ClassMethodList, NULLPtr, NULLPtr,
+      NULLPtr, ZeroPtr, ZeroPtr, true);
+  if (CGM.getTriple().isOSBinFormatCOFF()) {
+    auto Storage = llvm::GlobalValue::DefaultStorageClass;
+    if (OID->getClassInterface()->hasAttr<DLLImportAttr>())
+      Storage = llvm::GlobalValue::DLLImportStorageClass;
+    else if (OID->getClassInterface()->hasAttr<DLLExportAttr>())
+      Storage = llvm::GlobalValue::DLLExportStorageClass;
+    cast<llvm::GlobalValue>(MetaClassStruct)->setDLLStorageClass(Storage);
+  }
 
   // Generate the class structure
-  llvm::Constant *ClassStruct =
-    GenerateClassStructure(MetaClassStruct, SuperClass, 0x11L,
-                           ClassName.c_str(), nullptr,
-      llvm::ConstantInt::get(LongTy, instanceSize), IvarList,
-      MethodList, GenerateProtocolList(Protocols), IvarOffsetArray,
-      Properties, StrongIvarBitmap, WeakIvarBitmap);
+  llvm::Constant *ClassStruct = GenerateClassStructure(
+      MetaClassStruct, SuperClass, 0x11L, ClassName.c_str(), nullptr,
+      llvm::ConstantInt::get(LongTy, instanceSize), IvarList, MethodList,
+      GenerateProtocolList(Protocols), IvarOffsetArray, Properties,
+      StrongIvarBitmap, WeakIvarBitmap);
+  if (CGM.getTriple().isOSBinFormatCOFF()) {
+    auto Storage = llvm::GlobalValue::DefaultStorageClass;
+    if (OID->getClassInterface()->hasAttr<DLLImportAttr>())
+      Storage = llvm::GlobalValue::DLLImportStorageClass;
+    else if (OID->getClassInterface()->hasAttr<DLLExportAttr>())
+      Storage = llvm::GlobalValue::DLLExportStorageClass;
+    cast<llvm::GlobalValue>(ClassStruct)->setDLLStorageClass(Storage);
+  }
 
   // Resolve the class aliases, if they exist.
   if (ClassPtrAlias) {
@@ -2377,7 +2430,6 @@
   Classes.push_back(ClassStruct);
 }
 
-
 llvm::Function *CGObjCGNU::ModuleInitFunction() {
   // Only emit an ObjC load function if no Objective-C stuff has been called
   if (Classes.empty() && Categories.empty() && ConstantStrings.empty() &&
@@ -2652,12 +2704,15 @@
 llvm::Constant *CGObjCGNU::GetGetStructFunction() {
   return GetStructPropertyFn;
 }
+
 llvm::Constant *CGObjCGNU::GetSetStructFunction() {
   return SetStructPropertyFn;
 }
+
 llvm::Constant *CGObjCGNU::GetCppAtomicObjectGetFunction() {
   return nullptr;
 }
+
 llvm::Constant *CGObjCGNU::GetCppAtomicObjectSetFunction() {
   return nullptr;
 }
@@ -2686,7 +2741,6 @@
   // In Objective-C++ mode, we actually emit something equivalent to the C++
   // exception handler. 
   EmitTryCatchStmt(CGF, S, EnterCatchFn, ExitCatchFn, ExceptionReThrowFn);
-  return ;
 }
 
 void CGObjCGNU::EmitThrowStmt(CodeGenFunction &CGF,
@@ -2801,7 +2855,7 @@
     // to replace it with the real version for a library.  In non-PIC code you
     // must compile with the fragile ABI if you want to use ivars from a
     // GCC-compiled class.
-    if (CGM.getLangOpts().PICLevel || CGM.getLangOpts().PIELevel) {
+    if (CGM.getLangOpts().PICLevel) {
       llvm::GlobalVariable *IvarOffsetGV = new llvm::GlobalVariable(TheModule,
             Int32Ty, false,
             llvm::GlobalValue::PrivateLinkage, OffsetGuess, Name+".guess");
@@ -2849,7 +2903,12 @@
                          const ObjCIvarDecl *Ivar) {
   if (CGM.getLangOpts().ObjCRuntime.isNonFragile()) {
     Interface = FindIvarInterface(CGM.getContext(), Interface, Ivar);
-    if (RuntimeVersion < 10)
+
+    // The MSVC linker cannot have a single global defined as LinkOnceAnyLinkage
+    // and ExternalLinkage, so create a reference to the ivar global and rely on
+    // the definition being created as part of GenerateClass.
+    if (RuntimeVersion < 10 ||
+        CGF.CGM.getTarget().getTriple().isKnownWindowsMSVCEnvironment())
       return CGF.Builder.CreateZExtOrBitCast(
           CGF.Builder.CreateDefaultAlignedLoad(CGF.Builder.CreateAlignedLoad(
                   ObjCIvarOffsetVariable(Interface, Ivar),
diff --git a/lib/CodeGen/CGObjCMac.cpp b/lib/CodeGen/CGObjCMac.cpp
index 29eecf5..8b038b3 100644
--- a/lib/CodeGen/CGObjCMac.cpp
+++ b/lib/CodeGen/CGObjCMac.cpp
@@ -236,13 +236,11 @@
     CodeGen::CodeGenTypes &Types = CGM.getTypes();
     ASTContext &Ctx = CGM.getContext();
     // id objc_getProperty (id, SEL, ptrdiff_t, bool)
-    SmallVector<CanQualType,4> Params;
     CanQualType IdType = Ctx.getCanonicalParamType(Ctx.getObjCIdType());
     CanQualType SelType = Ctx.getCanonicalParamType(Ctx.getObjCSelType());
-    Params.push_back(IdType);
-    Params.push_back(SelType);
-    Params.push_back(Ctx.getPointerDiffType()->getCanonicalTypeUnqualified());
-    Params.push_back(Ctx.BoolTy);
+    CanQualType Params[] = {
+        IdType, SelType,
+        Ctx.getPointerDiffType()->getCanonicalTypeUnqualified(), Ctx.BoolTy};
     llvm::FunctionType *FTy =
         Types.GetFunctionType(
           Types.arrangeBuiltinFunctionDeclaration(IdType, Params));
@@ -253,15 +251,15 @@
     CodeGen::CodeGenTypes &Types = CGM.getTypes();
     ASTContext &Ctx = CGM.getContext();
     // void objc_setProperty (id, SEL, ptrdiff_t, id, bool, bool)
-    SmallVector<CanQualType,6> Params;
     CanQualType IdType = Ctx.getCanonicalParamType(Ctx.getObjCIdType());
     CanQualType SelType = Ctx.getCanonicalParamType(Ctx.getObjCSelType());
-    Params.push_back(IdType);
-    Params.push_back(SelType);
-    Params.push_back(Ctx.getPointerDiffType()->getCanonicalTypeUnqualified());
-    Params.push_back(IdType);
-    Params.push_back(Ctx.BoolTy);
-    Params.push_back(Ctx.BoolTy);
+    CanQualType Params[] = {
+        IdType,
+        SelType,
+        Ctx.getPointerDiffType()->getCanonicalTypeUnqualified(),
+        IdType,
+        Ctx.BoolTy,
+        Ctx.BoolTy};
     llvm::FunctionType *FTy =
         Types.GetFunctionType(
           Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params));
@@ -357,8 +355,7 @@
     Params.push_back(
       Ctx.getCanonicalType(Ctx.getPointerType(Ctx.CharTy.withConst())));
     llvm::FunctionType *FTy =
-        Types.GetFunctionType(
-          Types.arrangeBuiltinFunctionDeclaration(
+        Types.GetFunctionType(Types.arrangeBuiltinFunctionDeclaration(
                                 Ctx.getCanonicalType(Ctx.getObjCClassType()),
                                 Params));
     return CGM.CreateRuntimeFunction(FTy, "objc_lookUpClass");
@@ -584,7 +581,6 @@
     return CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(CGM.Int32Ty, params, false),
       "objc_exception_match");
-
   }
 
   /// SetJmpFn - LLVM _setjmp function.
@@ -608,7 +604,6 @@
 /// modern abi
 class ObjCNonFragileABITypesHelper : public ObjCCommonTypesHelper {
 public:
-
   // MethodListnfABITy - LLVM for struct _method_list_t
   llvm::StructType *MethodListnfABITy;
 
@@ -1136,9 +1131,8 @@
 
   /// EmitMethodList - Emit the method list for the given
   /// implementation. The return value has type MethodListPtrTy.
-  llvm::Constant *EmitMethodList(Twine Name,
-                                 const char *Section,
-                                 ArrayRef<llvm::Constant*> Methods);
+  llvm::Constant *EmitMethodList(Twine Name, StringRef Section,
+                                 ArrayRef<llvm::Constant *> Methods);
 
   /// EmitMethodDescList - Emit a method description list for a list of
   /// method declarations.
@@ -1151,9 +1145,8 @@
   ///  - begin, end: The method list to output.
   ///
   /// The return value has type MethodDescriptionListPtrTy.
-  llvm::Constant *EmitMethodDescList(Twine Name,
-                                     const char *Section,
-                                     ArrayRef<llvm::Constant*> Methods);
+  llvm::Constant *EmitMethodDescList(Twine Name, StringRef Section,
+                                     ArrayRef<llvm::Constant *> Methods);
 
   /// GetOrEmitProtocol - Get the protocol object for the given
   /// declaration, emitting it if necessary. The return value has type
@@ -1272,9 +1265,10 @@
 
   /// GetClassGlobal - Return the global variable for the Objective-C
   /// class of the given name.
-  llvm::Constant *GetClassGlobal(const std::string &Name,
+  llvm::Constant *GetClassGlobal(StringRef Name,
                                  bool ForDefinition,
-                                 bool Weak) override {
+                                 bool Weak = false,
+                                 bool DLLImport = false) override {
     llvm_unreachable("CGObjCMac::GetClassGlobal");
   }
 };
@@ -1311,9 +1305,8 @@
 
   /// AddModuleClassList - Add the given list of class pointers to the
   /// module with the provided symbol and section names.
-  void AddModuleClassList(ArrayRef<llvm::GlobalValue*> Container,
-                          const char *SymbolName,
-                          const char *SectionName);
+  void AddModuleClassList(ArrayRef<llvm::GlobalValue *> Container,
+                          StringRef SymbolName, StringRef SectionName);
 
   llvm::GlobalVariable * BuildClassRoTInitializer(unsigned flags,
                                               unsigned InstanceStart,
@@ -1332,9 +1325,8 @@
 
   /// EmitMethodList - Emit the method list for the given
   /// implementation. The return value has type MethodListnfABITy.
-  llvm::Constant *EmitMethodList(Twine Name,
-                                 const char *Section,
-                                 ArrayRef<llvm::Constant*> Methods);
+  llvm::Constant *EmitMethodList(Twine Name, StringRef Section,
+                                 ArrayRef<llvm::Constant *> Methods);
   /// EmitIvarList - Emit the ivar list for the given
   /// implementation. If ForClass is true the list of class ivars
   /// (i.e. metaclass ivars) is emitted, otherwise the list of
@@ -1375,9 +1367,8 @@
   
   /// GetClassGlobal - Return the global variable for the Objective-C
   /// class of the given name.
-  llvm::Constant *GetClassGlobal(const std::string &Name,
-                                 bool ForDefinition,
-                                 bool Weak) override;
+  llvm::Constant *GetClassGlobal(StringRef Name, bool ForDefinition,
+                                 bool Weak, bool DLLImport) override;
 
   /// EmitClassRef - Return a Value*, of type ObjCTypes.ClassPtrTy,
   /// for the given class reference.
@@ -1417,13 +1408,9 @@
   llvm::Constant *GetInterfaceEHType(const ObjCInterfaceDecl *ID,
                                   bool ForDefinition);
 
-  const char *getMetaclassSymbolPrefix() const {
-    return "OBJC_METACLASS_$_";
-  }
+  StringRef getMetaclassSymbolPrefix() const { return "OBJC_METACLASS_$_"; }
 
-  const char *getClassSymbolPrefix() const {
-    return "OBJC_CLASS_$_";
-  }
+  StringRef getClassSymbolPrefix() const { return "OBJC_CLASS_$_"; }
 
   void GetClassSizeInfo(const ObjCImplementationDecl *OID,
                         uint32_t &InstanceStart,
@@ -1525,12 +1512,15 @@
   llvm::Constant *GetSetStructFunction() override {
     return ObjCTypes.getCopyStructFn();
   }
+
   llvm::Constant *GetGetStructFunction() override {
     return ObjCTypes.getCopyStructFn();
   }
+
   llvm::Constant *GetCppAtomicObjectSetFunction() override {
     return ObjCTypes.getCppAtomicObjectFunction();
   }
+
   llvm::Constant *GetCppAtomicObjectGetFunction() override {
     return ObjCTypes.getCppAtomicObjectFunction();
   }
@@ -1953,7 +1943,7 @@
   // Emit a null-check if there's a consumed argument other than the receiver.
   bool RequiresNullCheck = false;
   if (ReceiverCanBeNull && CGM.getLangOpts().ObjCAutoRefCount && Method) {
-    for (const auto *ParamDecl : Method->params()) {
+    for (const auto *ParamDecl : Method->parameters()) {
       if (ParamDecl->hasAttr<NSConsumedAttr>()) {
         if (!nullReturn.NullBB)
           nullReturn.init(CGF, Arg0);
@@ -2046,6 +2036,7 @@
     bool IsDisordered = false;
 
     llvm::SmallVector<IvarInfo, 8> IvarsInfo;
+
   public:
     IvarLayoutBuilder(CodeGenModule &CGM, CharUnits instanceBegin,
                       CharUnits instanceEnd, bool forStrongLayout)
@@ -2081,7 +2072,7 @@
       printf("\n");
     }
   };
-}
+} // end anonymous namespace
 
 llvm::Constant *CGObjCCommonMac::BuildGCBlockLayout(CodeGenModule &CGM,
                                                 const CGBlockInfo &blockInfo) {
@@ -2160,7 +2151,6 @@
   }
 }
 
-
 /// getBlockCaptureLifetime - This routine returns life time of the captured
 /// block variable for the purpose of block layout meta-data generation. FQT is
 /// the type of the variable captured in the block.
@@ -2648,7 +2638,6 @@
   return getBitmapBlockLayout(false);
 }
 
-
 llvm::Constant *CGObjCCommonMac::BuildByrefLayout(CodeGen::CodeGenModule &CGM,
                                                   QualType T) {
   assert(CGM.getLangOpts().getGC() == LangOptions::NonGC);
@@ -3065,8 +3054,8 @@
 }
 
 llvm::Constant *
-CGObjCMac::EmitMethodDescList(Twine Name, const char *Section,
-                              ArrayRef<llvm::Constant*> Methods) {
+CGObjCMac::EmitMethodDescList(Twine Name, StringRef Section,
+                              ArrayRef<llvm::Constant *> Methods) {
   // Return null for empty list.
   if (Methods.empty())
     return llvm::Constant::getNullValue(ObjCTypes.MethodDescriptionListPtrTy);
@@ -3609,9 +3598,8 @@
   return llvm::ConstantStruct::get(ObjCTypes.MethodTy, Method);
 }
 
-llvm::Constant *CGObjCMac::EmitMethodList(Twine Name,
-                                          const char *Section,
-                                          ArrayRef<llvm::Constant*> Methods) {
+llvm::Constant *CGObjCMac::EmitMethodList(Twine Name, StringRef Section,
+                                          ArrayRef<llvm::Constant *> Methods) {
   // Return null for empty list.
   if (Methods.empty())
     return llvm::Constant::getNullValue(ObjCTypes.MethodListPtrTy);
@@ -3686,6 +3674,7 @@
 llvm::Constant *CGObjCMac::GetGetStructFunction() {
   return ObjCTypes.getCopyStructFn();
 }
+
 llvm::Constant *CGObjCMac::GetSetStructFunction() {
   return ObjCTypes.getCopyStructFn();
 }
@@ -3693,6 +3682,7 @@
 llvm::Constant *CGObjCMac::GetCppAtomicObjectGetFunction() {
   return ObjCTypes.getCppAtomicObjectFunction();
 }
+
 llvm::Constant *CGObjCMac::GetCppAtomicObjectSetFunction() {
   return ObjCTypes.getCppAtomicObjectFunction();
 }
@@ -3790,7 +3780,7 @@
     void emitWriteHazard();
     void emitHazardsInNewBlocks();
   };
-}
+} // end anonymous namespace
 
 /// Create the fragile-ABI read and write hazards based on the current
 /// state of the function, which is presumed to be immediately prior
@@ -4411,7 +4401,6 @@
   llvm::Value *args[] = { src, dst.getPointer() };
   CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignWeakFn(),
                               args, "weakassign");
-  return;
 }
 
 /// EmitObjCGlobalAssign - Code gen for assigning to a __strong object.
@@ -4437,7 +4426,6 @@
   else
     CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignThreadLocalFn(),
                                 args, "threadlocalassign");
-  return;
 }
 
 /// EmitObjCIvarAssign - Code gen for assigning to a __strong object.
@@ -4459,7 +4447,6 @@
   dst = CGF.Builder.CreateBitCast(dst, ObjCTypes.PtrObjectPtrTy);
   llvm::Value *args[] = { src, dst.getPointer(), ivarOffset };
   CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignIvarFn(), args);
-  return;
 }
 
 /// EmitObjCStrongCastAssign - Code gen for assigning to a __strong cast object.
@@ -4480,7 +4467,6 @@
   llvm::Value *args[] = { src, dst.getPointer() };
   CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignStrongCastFn(),
                               args, "strongassign");
-  return;
 }
 
 void CGObjCMac::EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF,
@@ -5220,9 +5206,8 @@
 }
 
 CGObjCNonFragileABIMac::CGObjCNonFragileABIMac(CodeGen::CodeGenModule &cgm)
-  : CGObjCCommonMac(cgm),
-    ObjCTypes(cgm) {
-  ObjCEmptyCacheVar = ObjCEmptyVtableVar = nullptr;
+    : CGObjCCommonMac(cgm), ObjCTypes(cgm), ObjCEmptyCacheVar(nullptr),
+      ObjCEmptyVtableVar(nullptr) {
   ObjCABI = 2;
 }
 
@@ -5312,7 +5297,6 @@
   // struct _objc_cache *
   CacheTy = llvm::StructType::create(VMContext, "struct._objc_cache");
   CachePtrTy = llvm::PointerType::getUnqual(CacheTy);
-    
 }
 
 ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm)
@@ -5498,7 +5482,6 @@
     llvm::StructType::create("struct._objc_exception_data",
                              llvm::ArrayType::get(CGM.Int32Ty,SetJmpBufferSize),
                              StackPtrTy, nullptr);
-
 }
 
 ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModule &cgm)
@@ -5707,10 +5690,9 @@
   return nullptr;
 }
 
-void CGObjCNonFragileABIMac::
-AddModuleClassList(ArrayRef<llvm::GlobalValue*> Container,
-                   const char *SymbolName,
-                   const char *SectionName) {
+void CGObjCNonFragileABIMac::AddModuleClassList(
+    ArrayRef<llvm::GlobalValue *> Container, StringRef SymbolName,
+    StringRef SectionName) {
   unsigned NumClasses = Container.size();
 
   if (!NumClasses)
@@ -5969,15 +5951,17 @@
   llvm::GlobalVariable *GV = cast<llvm::GlobalVariable>(
                                GetClassGlobal(ClassName,
                                               /*ForDefinition=*/true,
-                                              Weak));
+                                              Weak,
+                                              /*DLLImport (ignored on def)*/ false));
   if (Init->getType() != GV->getValueType())
     Init = llvm::ConstantExpr::getBitCast(Init, GV->getValueType());
   GV->setInitializer(Init);
   GV->setSection("__DATA, __objc_data");
   GV->setAlignment(
     CGM.getDataLayout().getABITypeAlignment(ObjCTypes.ClassnfABITy));
-  if (HiddenVisibility)
-    GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+  if (!CGM.getTriple().isOSBinFormatCOFF())
+    if (HiddenVisibility)
+      GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
   return GV;
 }
 
@@ -6002,49 +5986,60 @@
     InstanceStart = RL.getFieldOffset(0) / CGM.getContext().getCharWidth();
 }
 
-void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) {
-  std::string ClassName = ID->getObjCRuntimeNameAsString();
-  if (!ObjCEmptyCacheVar) {
-    ObjCEmptyCacheVar = new llvm::GlobalVariable(
-      CGM.getModule(),
-      ObjCTypes.CacheTy,
-      false,
-      llvm::GlobalValue::ExternalLinkage,
-      nullptr,
-      "_objc_empty_cache");
+static llvm::GlobalValue::DLLStorageClassTypes getStorage(CodeGenModule &CGM,
+                                                          StringRef Name) {
+  IdentifierInfo &II = CGM.getContext().Idents.get(Name);
+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
 
-    // Make this entry NULL for any iOS device target, any iOS simulator target,
-    // OS X with deployment target 10.9 or later.
+  const VarDecl *VD = nullptr;
+  for (const auto &Result : DC->lookup(&II))
+    if ((VD = dyn_cast<VarDecl>(Result)))
+      break;
+  // No VarDecl of this name at translation-unit scope: treat it as dllimport.
+  if (!VD)
+    return llvm::GlobalValue::DLLImportStorageClass;
+  if (VD->hasAttr<DLLExportAttr>())
+    return llvm::GlobalValue::DLLExportStorageClass;
+  if (VD->hasAttr<DLLImportAttr>())
+    return llvm::GlobalValue::DLLImportStorageClass;
+  return llvm::GlobalValue::DefaultStorageClass;
+}
+
+void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) {
+  if (!ObjCEmptyCacheVar) {
+    ObjCEmptyCacheVar =
+        new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.CacheTy, false,
+                                 llvm::GlobalValue::ExternalLinkage, nullptr,
+                                 "_objc_empty_cache");
+    if (CGM.getTriple().isOSBinFormatCOFF())
+      ObjCEmptyCacheVar->setDLLStorageClass(getStorage(CGM, "_objc_empty_cache"));
+
+    // Only OS X with deployment version <10.9 uses the empty vtable symbol.
     const llvm::Triple &Triple = CGM.getTarget().getTriple();
-    if (Triple.isiOS() || Triple.isWatchOS() ||
-        (Triple.isMacOSX() && !Triple.isMacOSXVersionLT(10, 9)))
-      // This entry will be null.
-      ObjCEmptyVtableVar = nullptr;
-    else
-      ObjCEmptyVtableVar = new llvm::GlobalVariable(
-                                                    CGM.getModule(),
-                                                    ObjCTypes.ImpnfABITy,
-                                                    false,
-                                                    llvm::GlobalValue::ExternalLinkage,
-                                                    nullptr,
-                                                    "_objc_empty_vtable");
+    if (Triple.isMacOSX() && Triple.isMacOSXVersionLT(10, 9))
+      ObjCEmptyVtableVar =
+          new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ImpnfABITy, false,
+                                   llvm::GlobalValue::ExternalLinkage, nullptr,
+                                   "_objc_empty_vtable");
   }
-  assert(ID->getClassInterface() &&
-         "CGObjCNonFragileABIMac::GenerateClass - class is 0");
+
   // FIXME: Is this correct (that meta class size is never computed)?
   uint32_t InstanceStart =
     CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ClassnfABITy);
   uint32_t InstanceSize = InstanceStart;
   uint32_t flags = NonFragileABI_Class_Meta;
-  llvm::SmallString<64> ObjCMetaClassName(getMetaclassSymbolPrefix());
-  llvm::SmallString<64> ObjCClassName(getClassSymbolPrefix());
-  llvm::SmallString<64> TClassName;
 
   llvm::Constant *SuperClassGV, *IsAGV;
 
+  StringRef ClassName = ID->getObjCRuntimeNameAsString();
+  const auto *CI = ID->getClassInterface();
+  assert(CI && "CGObjCNonFragileABIMac::GenerateClass - class is 0");
+
   // Build the flags for the metaclass.
-  bool classIsHidden =
-    ID->getClassInterface()->getVisibility() == HiddenVisibility;
+  bool classIsHidden = (CGM.getTriple().isOSBinFormatCOFF())
+                           ? !CI->hasAttr<DLLExportAttr>()
+                           : CI->getVisibility() == HiddenVisibility;
   if (classIsHidden)
     flags |= NonFragileABI_Class_Hidden;
 
@@ -6053,49 +6048,59 @@
   if (ID->hasNonZeroConstructors() || ID->hasDestructors()) {
     flags |= NonFragileABI_Class_HasCXXStructors;
     if (!ID->hasNonZeroConstructors())
-      flags |= NonFragileABI_Class_HasCXXDestructorOnly;  
+      flags |= NonFragileABI_Class_HasCXXDestructorOnly;
   }
 
-  if (!ID->getClassInterface()->getSuperClass()) {
+  if (!CI->getSuperClass()) {
     // class is root
     flags |= NonFragileABI_Class_Root;
-    TClassName = ObjCClassName;
-    TClassName += ClassName;
-    SuperClassGV = GetClassGlobal(TClassName.str(),
+
+    SuperClassGV = GetClassGlobal((getClassSymbolPrefix() + ClassName).str(),
                                   /*ForDefinition=*/false,
-                                  ID->getClassInterface()->isWeakImported());
-    TClassName = ObjCMetaClassName;
-    TClassName += ClassName;
-    IsAGV = GetClassGlobal(TClassName.str(),
+                                  CI->isWeakImported(),
+                                  CGM.getTriple().isOSBinFormatCOFF()
+                                    && CI->hasAttr<DLLImportAttr>());
+
+    IsAGV = GetClassGlobal((getMetaclassSymbolPrefix() + ClassName).str(),
                            /*ForDefinition=*/false,
-                           ID->getClassInterface()->isWeakImported());
+                           CI->isWeakImported(),
+                           CGM.getTriple().isOSBinFormatCOFF()
+                             && CI->hasAttr<DLLImportAttr>());
   } else {
     // Has a root. Current class is not a root.
     const ObjCInterfaceDecl *Root = ID->getClassInterface();
     while (const ObjCInterfaceDecl *Super = Root->getSuperClass())
       Root = Super;
-    TClassName = ObjCMetaClassName ;
-    TClassName += Root->getObjCRuntimeNameAsString();
-    IsAGV = GetClassGlobal(TClassName.str(),
+
+    const auto *Super = CI->getSuperClass();
+    StringRef RootClassName = Root->getObjCRuntimeNameAsString();
+    StringRef SuperClassName = Super->getObjCRuntimeNameAsString();
+
+    // IsAGV names the root class's metaclass symbol, so both the weak-import
+    // and the dllimport decisions for it are taken from Root, not from Super.
+    IsAGV = GetClassGlobal((getMetaclassSymbolPrefix() + RootClassName).str(),
+                           /*ForDefinition=*/false, Root->isWeakImported(),
+                           CGM.getTriple().isOSBinFormatCOFF() &&
+                               Root->hasAttr<DLLImportAttr>());
 
     // work on super class metadata symbol.
-    TClassName = ObjCMetaClassName;
-    TClassName += ID->getClassInterface()->getSuperClass()->getObjCRuntimeNameAsString();
-    SuperClassGV = GetClassGlobal(
-                                  TClassName.str(),
-                                  /*ForDefinition=*/false,
-                                  ID->getClassInterface()->getSuperClass()->isWeakImported());
+    SuperClassGV =
+        GetClassGlobal((getMetaclassSymbolPrefix() + SuperClassName).str(),
+                       /*ForDefinition=*/false,
+                       Super->isWeakImported(),
+                       CGM.getTriple().isOSBinFormatCOFF()
+                         && Super->hasAttr<DLLImportAttr>());
   }
-  llvm::GlobalVariable *CLASS_RO_GV = BuildClassRoTInitializer(flags,
-                                                               InstanceStart,
-                                                               InstanceSize,ID);
-  TClassName = ObjCMetaClassName;
-  TClassName += ClassName;
-  llvm::GlobalVariable *MetaTClass = BuildClassMetaData(
-      TClassName.str(), IsAGV, SuperClassGV, CLASS_RO_GV, classIsHidden,
-      ID->getClassInterface()->isWeakImported());
+
+  llvm::GlobalVariable *CLASS_RO_GV =
+      BuildClassRoTInitializer(flags, InstanceStart, InstanceSize, ID);
+
+  llvm::GlobalVariable *MetaTClass =
+      BuildClassMetaData((getMetaclassSymbolPrefix() + ClassName).str(), IsAGV,
+                         SuperClassGV, CLASS_RO_GV, classIsHidden,
+                         CI->isWeakImported());
+  if (CGM.getTriple().isOSBinFormatCOFF())
+    if (CI->hasAttr<DLLExportAttr>())
+      MetaTClass->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
   DefinedMetaClasses.push_back(MetaTClass);
 
   // Metadata for the class
@@ -6116,35 +6121,38 @@
       flags |= NonFragileABI_Class_HasCXXDestructorOnly;
   }
 
-  if (hasObjCExceptionAttribute(CGM.getContext(), ID->getClassInterface()))
+  if (hasObjCExceptionAttribute(CGM.getContext(), CI))
     flags |= NonFragileABI_Class_Exception;
 
-  if (!ID->getClassInterface()->getSuperClass()) {
+  if (!CI->getSuperClass()) {
     flags |= NonFragileABI_Class_Root;
     SuperClassGV = nullptr;
   } else {
     // Has a root. Current class is not a root.
-    TClassName = ObjCClassName;
-    TClassName += ID->getClassInterface()->getSuperClass()->getObjCRuntimeNameAsString();
-    SuperClassGV = GetClassGlobal(
-                                  TClassName.str(),
-                                  /*ForDefinition=*/false,
-                                  ID->getClassInterface()->getSuperClass()->isWeakImported());
-  }
-  GetClassSizeInfo(ID, InstanceStart, InstanceSize);
-  CLASS_RO_GV = BuildClassRoTInitializer(flags,
-                                         InstanceStart,
-                                         InstanceSize,
-                                         ID);
+    const auto *Super = CI->getSuperClass();
+    StringRef SuperClassName = Super->getObjCRuntimeNameAsString();
 
-  TClassName = ObjCClassName;
-  TClassName += ClassName;
+    SuperClassGV =
+        GetClassGlobal((getClassSymbolPrefix() + SuperClassName).str(),
+                       /*ForDefinition=*/false,
+                       Super->isWeakImported(),
+                       CGM.getTriple().isOSBinFormatCOFF()
+                         && Super->hasAttr<DLLImportAttr>());
+  }
+
+  GetClassSizeInfo(ID, InstanceStart, InstanceSize);
+  CLASS_RO_GV =
+      BuildClassRoTInitializer(flags, InstanceStart, InstanceSize, ID);
+
   llvm::GlobalVariable *ClassMD =
-    BuildClassMetaData(TClassName.str(), MetaTClass, SuperClassGV, CLASS_RO_GV,
-                       classIsHidden,
-                       ID->getClassInterface()->isWeakImported());
+    BuildClassMetaData((getClassSymbolPrefix() + ClassName).str(), MetaTClass,
+                       SuperClassGV, CLASS_RO_GV, classIsHidden,
+                       CI->isWeakImported());
+  if (CGM.getTriple().isOSBinFormatCOFF())
+    if (CI->hasAttr<DLLExportAttr>())
+      ClassMD->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
   DefinedClasses.push_back(ClassMD);
-  ImplementedClasses.push_back(ID->getClassInterface());
+  ImplementedClasses.push_back(CI);
 
   // Determine if this class is also "non-lazy".
   if (ImplementationIsNonLazy(ID))
@@ -6152,7 +6160,7 @@
 
   // Force the definition of the EHType if necessary.
   if (flags & NonFragileABI_Class_Exception)
-    GetInterfaceEHType(ID->getClassInterface(), true);
+    GetInterfaceEHType(CI, true);
   // Make sure method definition entries are all clear for next implementation.
   MethodDefinitions.clear();
 }
@@ -6225,7 +6233,9 @@
   // meta-class entry symbol
   llvm::Constant *ClassGV = GetClassGlobal(ExtClassName.str(),
                                            /*ForDefinition=*/false,
-                                           Interface->isWeakImported());
+                                           Interface->isWeakImported(),
+                                           CGM.getTriple().isOSBinFormatCOFF()
+                                             && Interface->hasAttr<DLLImportAttr>());
 
   Values[1] = ClassGV;
   std::vector<llvm::Constant*> Methods;
@@ -6330,9 +6340,8 @@
 /// }
 ///
 llvm::Constant *
-CGObjCNonFragileABIMac::EmitMethodList(Twine Name,
-                                       const char *Section,
-                                       ArrayRef<llvm::Constant*> Methods) {
+CGObjCNonFragileABIMac::EmitMethodList(Twine Name, StringRef Section,
+                                       ArrayRef<llvm::Constant *> Methods) {
   // Return null for empty list.
   if (Methods.empty())
     return llvm::Constant::getNullValue(ObjCTypes.MethodListnfABIPtrTy);
@@ -6362,18 +6371,28 @@
 llvm::GlobalVariable *
 CGObjCNonFragileABIMac::ObjCIvarOffsetVariable(const ObjCInterfaceDecl *ID,
                                                const ObjCIvarDecl *Ivar) {
-    
   const ObjCInterfaceDecl *Container = Ivar->getContainingInterface();
   llvm::SmallString<64> Name("OBJC_IVAR_$_");
   Name += Container->getObjCRuntimeNameAsString();
   Name += ".";
   Name += Ivar->getName();
-  llvm::GlobalVariable *IvarOffsetGV =
-    CGM.getModule().getGlobalVariable(Name);
-  if (!IvarOffsetGV)
-    IvarOffsetGV = new llvm::GlobalVariable(
-      CGM.getModule(), ObjCTypes.IvarOffsetVarTy, false,
-      llvm::GlobalValue::ExternalLinkage, nullptr, Name.str());
+  llvm::GlobalVariable *IvarOffsetGV = CGM.getModule().getGlobalVariable(Name);
+  if (!IvarOffsetGV) {
+    IvarOffsetGV =
+        new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.IvarOffsetVarTy,
+                                 false, llvm::GlobalValue::ExternalLinkage,
+                                 nullptr, Name.str());
+    if (CGM.getTriple().isOSBinFormatCOFF()) {
+      bool IsPrivateOrPackage =
+          Ivar->getAccessControl() == ObjCIvarDecl::Private ||
+          Ivar->getAccessControl() == ObjCIvarDecl::Package;
+
+      if (ID->hasAttr<DLLExportAttr>() && !IsPrivateOrPackage)
+        IvarOffsetGV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+      else if (ID->hasAttr<DLLImportAttr>())
+        IvarOffsetGV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+    }
+  }
   return IvarOffsetGV;
 }
 
@@ -6387,14 +6406,17 @@
   IvarOffsetGV->setAlignment(
       CGM.getDataLayout().getABITypeAlignment(ObjCTypes.IvarOffsetVarTy));
 
-  // FIXME: This matches gcc, but shouldn't the visibility be set on the use as
-  // well (i.e., in ObjCIvarOffsetVariable).
-  if (Ivar->getAccessControl() == ObjCIvarDecl::Private ||
-      Ivar->getAccessControl() == ObjCIvarDecl::Package ||
-      ID->getVisibility() == HiddenVisibility)
-    IvarOffsetGV->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  else
-    IvarOffsetGV->setVisibility(llvm::GlobalValue::DefaultVisibility);
+  if (!CGM.getTriple().isOSBinFormatCOFF()) {
+    // FIXME: This matches gcc, but shouldn't the visibility be set on the use
+    // as well (i.e., in ObjCIvarOffsetVariable).
+    if (Ivar->getAccessControl() == ObjCIvarDecl::Private ||
+        Ivar->getAccessControl() == ObjCIvarDecl::Package ||
+        ID->getVisibility() == HiddenVisibility)
+      IvarOffsetGV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    else
+      IvarOffsetGV->setVisibility(llvm::GlobalValue::DefaultVisibility);
+  }
+
   IvarOffsetGV->setSection("__DATA, __objc_ivar");
   return IvarOffsetGV;
 }
@@ -6857,7 +6879,7 @@
   
   bool requiresnullCheck = false;
   if (CGM.getLangOpts().ObjCAutoRefCount && method)
-    for (const auto *ParamDecl : method->params()) {
+    for (const auto *ParamDecl : method->parameters()) {
       if (ParamDecl->hasAttr<NSConsumedAttr>()) {
         if (!nullReturn.NullBB)
           nullReturn.init(CGF, arg0);
@@ -6905,10 +6927,9 @@
                       false, CallArgs, Method, Class, ObjCTypes);
 }
 
-llvm::Constant *
-CGObjCNonFragileABIMac::GetClassGlobal(const std::string &Name,
-                                       bool ForDefinition,
-                                       bool Weak) {
+llvm::Constant *CGObjCNonFragileABIMac::GetClassGlobal(StringRef Name,
+                                                       bool ForDefinition,
+                                                       bool Weak, bool DLLImport) {
   llvm::GlobalValue::LinkageTypes L =
       Weak ? llvm::GlobalValue::ExternalWeakLinkage
            : llvm::GlobalValue::ExternalLinkage;
@@ -6919,6 +6940,9 @@
     GV = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABITy,
                                   false, L, nullptr, Name);
 
+  if (DLLImport)
+    GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+
   assert(GV->getLinkage() == L);
 
   if (ForDefinition ||
@@ -6936,12 +6960,12 @@
   llvm::GlobalVariable *&Entry = ClassReferences[II];
   
   if (!Entry) {
-    std::string ClassName(
-      getClassSymbolPrefix() +
-      (ID ? ID->getObjCRuntimeNameAsString() : II->getName()).str());
+    StringRef Name = ID ? ID->getObjCRuntimeNameAsString() : II->getName();
+    std::string ClassName = (getClassSymbolPrefix() + Name).str();
     llvm::Constant *ClassGV = GetClassGlobal(ClassName,
                                              /*ForDefinition=*/false,
-                                             Weak);
+                                             Weak,
+                                             /*FIXME:DLLImport*/ false);
     Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy,
                                      false, llvm::GlobalValue::PrivateLinkage,
                                      ClassGV, "OBJC_CLASSLIST_REFERENCES_$_");
@@ -6979,7 +7003,10 @@
     ClassName += ID->getObjCRuntimeNameAsString();
     llvm::Constant *ClassGV = GetClassGlobal(ClassName.str(),
                                              /*ForDefinition=*/false,
-                                             ID->isWeakImported());
+                                             ID->isWeakImported(),
+                                             CGM.getTriple().isOSBinFormatCOFF()
+                                               && ID->hasAttr<DLLImportAttr>());
+
     Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy,
                                      false, llvm::GlobalValue::PrivateLinkage,
                                      ClassGV, "OBJC_CLASSLIST_SUP_REFS_$_");
@@ -7003,7 +7030,9 @@
     MetaClassName += ID->getObjCRuntimeNameAsString();
     llvm::Constant *MetaClassGV = GetClassGlobal(MetaClassName.str(),
                                                  /*ForDefinition=*/false,
-                                                 Weak);
+                                                 Weak,
+                                                 CGM.getTriple().isOSBinFormatCOFF()
+                                                   && ID->hasAttr<DLLImportAttr>());
 
     Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy,
                                      false, llvm::GlobalValue::PrivateLinkage,
@@ -7027,7 +7056,8 @@
     llvm::GlobalVariable *ClassGV = cast<llvm::GlobalVariable>(
                                       GetClassGlobal(ClassName.str(),
                                                      /*ForDefinition=*/true,
-                                                     /*Weak=*/true));
+                                                     /*Weak=*/true,
+                                                     /*DLLImport*/false));
     (void)ClassGV;
     assert(ClassGV->hasExternalWeakLinkage());
   }
@@ -7242,27 +7272,28 @@
 llvm::Constant *
 CGObjCNonFragileABIMac::GetEHType(QualType T) {
   // There's a particular fixed type info for 'id'.
-  if (T->isObjCIdType() ||
-      T->isObjCQualifiedIdType()) {
-    llvm::Constant *IDEHType =
-      CGM.getModule().getGlobalVariable("OBJC_EHTYPE_id");
-    if (!IDEHType)
+  if (T->isObjCIdType() || T->isObjCQualifiedIdType()) {
+    auto *IDEHType = CGM.getModule().getGlobalVariable("OBJC_EHTYPE_id");
+    if (!IDEHType) {
       IDEHType =
-        new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy,
-                                 false,
-                                 llvm::GlobalValue::ExternalLinkage,
-                                 nullptr, "OBJC_EHTYPE_id");
+          new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, false,
+                                   llvm::GlobalValue::ExternalLinkage, nullptr,
+                                   "OBJC_EHTYPE_id");
+      if (CGM.getTriple().isOSBinFormatCOFF())
+        IDEHType->setDLLStorageClass(getStorage(CGM, "OBJC_EHTYPE_id"));
+    }
     return IDEHType;
   }
 
   // All other types should be Objective-C interface pointer types.
-  const ObjCObjectPointerType *PT =
-    T->getAs<ObjCObjectPointerType>();
+  const ObjCObjectPointerType *PT = T->getAs<ObjCObjectPointerType>();
   assert(PT && "Invalid @catch type.");
+
   const ObjCInterfaceType *IT = PT->getInterfaceType();
   assert(IT && "Invalid @catch type.");
+
   return GetInterfaceEHType(IT->getDecl(), false);
-}                                                  
+}
 
 void CGObjCNonFragileABIMac::EmitTryStmt(CodeGen::CodeGenFunction &CGF,
                                          const ObjCAtTryStmt &S) {
@@ -7295,6 +7326,7 @@
 CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID,
                                            bool ForDefinition) {
   llvm::GlobalVariable * &Entry = EHTypeReferences[ID->getIdentifier()];
+  StringRef ClassName = ID->getObjCRuntimeNameAsString();
 
   // If we don't need a definition, return the entry if found or check
   // if we use an external reference.
@@ -7304,42 +7336,47 @@
 
     // If this type (or a super class) has the __objc_exception__
     // attribute, emit an external reference.
-    if (hasObjCExceptionAttribute(CGM.getContext(), ID))
-      return Entry =
-          new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, false,
-                                   llvm::GlobalValue::ExternalLinkage,
-                                   nullptr,
-                                   ("OBJC_EHTYPE_$_" +
-                                    ID->getObjCRuntimeNameAsString()));
+    if (hasObjCExceptionAttribute(CGM.getContext(), ID)) {
+      std::string EHTypeName = ("OBJC_EHTYPE_$_" + ClassName).str();
+      Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy,
+                                       false, llvm::GlobalValue::ExternalLinkage,
+                                       nullptr, EHTypeName);
+      if (CGM.getTriple().isOSBinFormatCOFF()) {
+        if (ID->hasAttr<DLLExportAttr>())
+          Entry->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+        else if (ID->hasAttr<DLLImportAttr>())
+          Entry->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+      }
+      return Entry;
+    }
   }
 
-  // Otherwise we need to either make a new entry or fill in the
-  // initializer.
+  // Otherwise we need to either make a new entry or fill in the initializer.
   assert((!Entry || !Entry->hasInitializer()) && "Duplicate EHType definition");
-  llvm::SmallString<64> ClassName(getClassSymbolPrefix());
-  ClassName += ID->getObjCRuntimeNameAsString();
+
   std::string VTableName = "objc_ehtype_vtable";
-  llvm::GlobalVariable *VTableGV =
-    CGM.getModule().getGlobalVariable(VTableName);
-  if (!VTableGV)
-    VTableGV = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.Int8PtrTy,
-                                        false,
-                                        llvm::GlobalValue::ExternalLinkage,
-                                        nullptr, VTableName);
+  auto *VTableGV = CGM.getModule().getGlobalVariable(VTableName);
+  if (!VTableGV) {
+    VTableGV =
+        new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.Int8PtrTy, false,
+                                 llvm::GlobalValue::ExternalLinkage, nullptr,
+                                 VTableName);
+    if (CGM.getTriple().isOSBinFormatCOFF())
+      VTableGV->setDLLStorageClass(getStorage(CGM, VTableName));
+  }
 
   llvm::Value *VTableIdx = llvm::ConstantInt::get(CGM.Int32Ty, 2);
-
-  llvm::Constant *ClassGV = GetClassGlobal(ClassName.str(),
-                                           /*ForDefinition=*/false,
-                                           /*Weak=*/false);
-
   llvm::Constant *Values[] = {
       llvm::ConstantExpr::getGetElementPtr(VTableGV->getValueType(), VTableGV,
                                            VTableIdx),
       GetClassName(ID->getObjCRuntimeNameAsString()),
-      ClassGV};
-  llvm::Constant *Init =
-    llvm::ConstantStruct::get(ObjCTypes.EHTypeTy, Values);
+      GetClassGlobal((getClassSymbolPrefix() + ClassName).str(),
+                     /*ForDefinition*/false,
+                     /*Weak*/ID->isWeakImported(),
+                     /*DLLImport*/CGM.getTriple().isOSBinFormatCOFF()
+                       && ID->hasAttr<DLLImportAttr>()),
+  };
+  llvm::Constant *Init = llvm::ConstantStruct::get(ObjCTypes.EHTypeTy, Values);
 
   llvm::GlobalValue::LinkageTypes L = ForDefinition
                                           ? llvm::GlobalValue::ExternalLinkage
@@ -7347,19 +7384,22 @@
   if (Entry) {
     Entry->setInitializer(Init);
   } else {
-    llvm::SmallString<64> EHTYPEName("OBJC_EHTYPE_$_");
-    EHTYPEName += ID->getObjCRuntimeNameAsString();
-    Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, false,
-                                     L,
-                                     Init,
-                                     EHTYPEName.str());
+    Entry =
+        new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, false, L,
+                                 Init, ("OBJC_EHTYPE_$_" + ClassName).str());
+    if (CGM.getTriple().isOSBinFormatCOFF())
+      if (hasObjCExceptionAttribute(CGM.getContext(), ID))
+        if (ID->hasAttr<DLLExportAttr>())
+          Entry->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
   }
   assert(Entry->getLinkage() == L);
 
-  if (ID->getVisibility() == HiddenVisibility)
-    Entry->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  Entry->setAlignment(CGM.getDataLayout().getABITypeAlignment(
-      ObjCTypes.EHTypeTy));
+  if (!CGM.getTriple().isOSBinFormatCOFF())
+    if (ID->getVisibility() == HiddenVisibility)
+      Entry->setVisibility(llvm::GlobalValue::HiddenVisibility);
+
+  const auto &DL = CGM.getDataLayout();
+  Entry->setAlignment(DL.getABITypeAlignment(ObjCTypes.EHTypeTy));
 
   if (ForDefinition)
     Entry->setSection("__DATA,__objc_const");
diff --git a/lib/CodeGen/CGObjCRuntime.h b/lib/CodeGen/CGObjCRuntime.h
index 40a167f..dee1e9c 100644
--- a/lib/CodeGen/CGObjCRuntime.h
+++ b/lib/CodeGen/CGObjCRuntime.h
@@ -280,9 +280,8 @@
   virtual llvm::Constant *BuildByrefLayout(CodeGen::CodeGenModule &CGM,
                                            QualType T) = 0;
 
-  virtual llvm::Constant *GetClassGlobal(const std::string &Name,
-                                         bool ForDefinition,
-                                         bool Weak) = 0;
+  virtual llvm::Constant *GetClassGlobal(StringRef Name, bool ForDefinition,
+                                         bool Weak = false, bool DLLImport = false) = 0;
 
   struct MessageSendInfo {
     const CGFunctionInfo &CallInfo;
diff --git a/lib/CodeGen/CGOpenCLRuntime.cpp b/lib/CodeGen/CGOpenCLRuntime.cpp
index 6866789..8983fde 100644
--- a/lib/CodeGen/CGOpenCLRuntime.cpp
+++ b/lib/CodeGen/CGOpenCLRuntime.cpp
@@ -15,6 +15,7 @@
 
 #include "CGOpenCLRuntime.h"
 #include "CodeGenFunction.h"
+#include "TargetInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalValue.h"
 #include <assert.h>
@@ -34,54 +35,20 @@
          "Not an OpenCL specific type!");
 
   llvm::LLVMContext& Ctx = CGM.getLLVMContext();
-  uint32_t ImgAddrSpc =
-    CGM.getContext().getTargetAddressSpace(LangAS::opencl_global);
+  uint32_t ImgAddrSpc = CGM.getContext().getTargetAddressSpace(
+    CGM.getTarget().getOpenCLImageAddrSpace());
   switch (cast<BuiltinType>(T)->getKind()) {
   default: 
     llvm_unreachable("Unexpected opencl builtin type!");
     return nullptr;
-  case BuiltinType::OCLImage1d:
-    return llvm::PointerType::get(llvm::StructType::create(
-                           Ctx, "opencl.image1d_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage1dArray:
-    return llvm::PointerType::get(llvm::StructType::create(
-                           Ctx, "opencl.image1d_array_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage1dBuffer:
-    return llvm::PointerType::get(llvm::StructType::create(
-                           Ctx, "opencl.image1d_buffer_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage2d:
-    return llvm::PointerType::get(llvm::StructType::create(
-                           Ctx, "opencl.image2d_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage2dArray:
-    return llvm::PointerType::get(llvm::StructType::create(
-                           Ctx, "opencl.image2d_array_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage2dDepth:
-    return llvm::PointerType::get(
-        llvm::StructType::create(Ctx, "opencl.image2d_depth_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage2dArrayDepth:
-    return llvm::PointerType::get(
-        llvm::StructType::create(Ctx, "opencl.image2d_array_depth_t"),
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id: \
+    return llvm::PointerType::get( \
+        llvm::StructType::create(Ctx, "opencl." #ImgType "_" #Suffix "_t"), \
         ImgAddrSpc);
-  case BuiltinType::OCLImage2dMSAA:
-    return llvm::PointerType::get(
-        llvm::StructType::create(Ctx, "opencl.image2d_msaa_t"), ImgAddrSpc);
-  case BuiltinType::OCLImage2dArrayMSAA:
-    return llvm::PointerType::get(
-        llvm::StructType::create(Ctx, "opencl.image2d_array_msaa_t"),
-        ImgAddrSpc);
-  case BuiltinType::OCLImage2dMSAADepth:
-    return llvm::PointerType::get(
-        llvm::StructType::create(Ctx, "opencl.image2d_msaa_depth_t"),
-        ImgAddrSpc);
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-    return llvm::PointerType::get(
-        llvm::StructType::create(Ctx, "opencl.image2d_array_msaa_depth_t"),
-        ImgAddrSpc);
-  case BuiltinType::OCLImage3d:
-    return llvm::PointerType::get(llvm::StructType::create(
-                           Ctx, "opencl.image3d_t"), ImgAddrSpc);
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
-    return llvm::IntegerType::get(Ctx, 32);
+    return getSamplerType();
   case BuiltinType::OCLEvent:
     return llvm::PointerType::get(llvm::StructType::create(
                            Ctx, "opencl.event_t"), 0);
@@ -110,3 +77,12 @@
 
   return PipeTy;
 }
+
+llvm::PointerType *CGOpenCLRuntime::getSamplerType() {
+  if (!SamplerTy)
+    SamplerTy = llvm::PointerType::get(llvm::StructType::create(
+      CGM.getLLVMContext(), "opencl.sampler_t"),
+      CGM.getContext().getTargetAddressSpace(
+      LangAS::opencl_constant));
+  return SamplerTy;
+}
diff --git a/lib/CodeGen/CGOpenCLRuntime.h b/lib/CodeGen/CGOpenCLRuntime.h
index f1a7a31..41ead10 100644
--- a/lib/CodeGen/CGOpenCLRuntime.h
+++ b/lib/CodeGen/CGOpenCLRuntime.h
@@ -33,9 +33,11 @@
 protected:
   CodeGenModule &CGM;
   llvm::Type *PipeTy;
+  llvm::PointerType *SamplerTy;
 
 public:
-  CGOpenCLRuntime(CodeGenModule &CGM) : CGM(CGM), PipeTy(nullptr) {}
+  CGOpenCLRuntime(CodeGenModule &CGM) : CGM(CGM), PipeTy(nullptr),
+    SamplerTy(nullptr) {}
   virtual ~CGOpenCLRuntime();
 
   /// Emit the IR required for a work-group-local variable declaration, and add
@@ -47,6 +49,8 @@
   virtual llvm::Type *convertOpenCLSpecificType(const Type *T);
 
   virtual llvm::Type *getPipeType();
+
+  llvm::PointerType *getSamplerType();
 };
 
 }
diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 6c796c7..afa823c 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -72,6 +72,8 @@
   /// \return LValue for thread id variable. This LValue always has type int32*.
   virtual LValue getThreadIDVariableLValue(CodeGenFunction &CGF);
 
+  virtual void emitUntiedSwitch(CodeGenFunction & /*CGF*/) {}
+
   CGOpenMPRegionKind getRegionKind() const { return RegionKind; }
 
   OpenMPDirectiveKind getDirectiveKind() const { return Kind; }
@@ -82,6 +84,8 @@
     return Info->getKind() == CR_OpenMP;
   }
 
+  ~CGOpenMPRegionInfo() override = default;
+
 protected:
   CGOpenMPRegionKind RegionKind;
   RegionCodeGenTy CodeGen;
@@ -90,7 +94,7 @@
 };
 
 /// \brief API for captured statement code generation in OpenMP constructs.
-class CGOpenMPOutlinedRegionInfo : public CGOpenMPRegionInfo {
+class CGOpenMPOutlinedRegionInfo final : public CGOpenMPRegionInfo {
 public:
   CGOpenMPOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar,
                              const RegionCodeGenTy &CodeGen,
@@ -100,6 +104,7 @@
         ThreadIDVar(ThreadIDVar) {
     assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
   }
+
   /// \brief Get a variable or parameter for storing global thread id
   /// inside OpenMP construct.
   const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
@@ -120,16 +125,65 @@
 };
 
 /// \brief API for captured statement code generation in OpenMP constructs.
-class CGOpenMPTaskOutlinedRegionInfo : public CGOpenMPRegionInfo {
+class CGOpenMPTaskOutlinedRegionInfo final : public CGOpenMPRegionInfo {
 public:
+  class UntiedTaskActionTy final : public PrePostActionTy {
+    bool Untied;
+    const VarDecl *PartIDVar;
+    const RegionCodeGenTy UntiedCodeGen;
+    llvm::SwitchInst *UntiedSwitch = nullptr;
+
+  public:
+    UntiedTaskActionTy(bool Tied, const VarDecl *PartIDVar,
+                       const RegionCodeGenTy &UntiedCodeGen)
+        : Untied(!Tied), PartIDVar(PartIDVar), UntiedCodeGen(UntiedCodeGen) {}
+    void Enter(CodeGenFunction &CGF) override {
+      if (Untied) {
+        // Emit task switching point.
+        auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
+            CGF.GetAddrOfLocalVar(PartIDVar),
+            PartIDVar->getType()->castAs<PointerType>());
+        auto *Res = CGF.EmitLoadOfScalar(PartIdLVal, SourceLocation());
+        auto *DoneBB = CGF.createBasicBlock(".untied.done.");
+        UntiedSwitch = CGF.Builder.CreateSwitch(Res, DoneBB);
+        CGF.EmitBlock(DoneBB);
+        CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
+        CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
+        UntiedSwitch->addCase(CGF.Builder.getInt32(0),
+                              CGF.Builder.GetInsertBlock());
+        emitUntiedSwitch(CGF);
+      }
+    }
+    void emitUntiedSwitch(CodeGenFunction &CGF) const {
+      if (Untied) {
+        auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
+            CGF.GetAddrOfLocalVar(PartIDVar),
+            PartIDVar->getType()->castAs<PointerType>());
+        CGF.EmitStoreOfScalar(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
+                              PartIdLVal);
+        UntiedCodeGen(CGF);
+        CodeGenFunction::JumpDest CurPoint =
+            CGF.getJumpDestInCurrentScope(".untied.next.");
+        CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
+        CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
+        UntiedSwitch->addCase(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
+                              CGF.Builder.GetInsertBlock());
+        CGF.EmitBranchThroughCleanup(CurPoint);
+        CGF.EmitBlock(CurPoint.getBlock());
+      }
+    }
+    unsigned getNumberOfParts() const { return UntiedSwitch->getNumCases(); }
+  };
   CGOpenMPTaskOutlinedRegionInfo(const CapturedStmt &CS,
                                  const VarDecl *ThreadIDVar,
                                  const RegionCodeGenTy &CodeGen,
-                                 OpenMPDirectiveKind Kind, bool HasCancel)
+                                 OpenMPDirectiveKind Kind, bool HasCancel,
+                                 const UntiedTaskActionTy &Action)
       : CGOpenMPRegionInfo(CS, TaskOutlinedRegion, CodeGen, Kind, HasCancel),
-        ThreadIDVar(ThreadIDVar) {
+        ThreadIDVar(ThreadIDVar), Action(Action) {
     assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
   }
+
   /// \brief Get a variable or parameter for storing global thread id
   /// inside OpenMP construct.
   const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
@@ -140,6 +194,10 @@
   /// \brief Get the name of the capture helper.
   StringRef getHelperName() const override { return ".omp_outlined."; }
 
+  void emitUntiedSwitch(CodeGenFunction &CGF) override {
+    Action.emitUntiedSwitch(CGF);
+  }
+
   static bool classof(const CGCapturedStmtInfo *Info) {
     return CGOpenMPRegionInfo::classof(Info) &&
            cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
@@ -150,6 +208,8 @@
   /// \brief A variable or parameter storing global thread id for OpenMP
   /// constructs.
   const VarDecl *ThreadIDVar;
+  /// Action for emitting code for untied tasks.
+  const UntiedTaskActionTy &Action;
 };
 
 /// \brief API for inlined captured statement code generation in OpenMP
@@ -162,12 +222,14 @@
       : CGOpenMPRegionInfo(InlinedRegion, CodeGen, Kind, HasCancel),
         OldCSI(OldCSI),
         OuterRegionInfo(dyn_cast_or_null<CGOpenMPRegionInfo>(OldCSI)) {}
+
   // \brief Retrieve the value of the context parameter.
   llvm::Value *getContextValue() const override {
     if (OuterRegionInfo)
       return OuterRegionInfo->getContextValue();
     llvm_unreachable("No context value for inlined OpenMP region");
   }
+
   void setContextValue(llvm::Value *V) override {
     if (OuterRegionInfo) {
       OuterRegionInfo->setContextValue(V);
@@ -175,6 +237,7 @@
     }
     llvm_unreachable("No context value for inlined OpenMP region");
   }
+
   /// \brief Lookup the captured field decl for a variable.
   const FieldDecl *lookup(const VarDecl *VD) const override {
     if (OuterRegionInfo)
@@ -183,11 +246,13 @@
     // captured variables, we can use the original one.
     return nullptr;
   }
+
   FieldDecl *getThisFieldDecl() const override {
     if (OuterRegionInfo)
       return OuterRegionInfo->getThisFieldDecl();
     return nullptr;
   }
+
   /// \brief Get a variable or parameter for storing global thread id
   /// inside OpenMP construct.
   const VarDecl *getThreadIDVariable() const override {
@@ -203,6 +268,11 @@
     llvm_unreachable("No helper name for inlined OpenMP construct");
   }
 
+  void emitUntiedSwitch(CodeGenFunction &CGF) override {
+    if (OuterRegionInfo)
+      OuterRegionInfo->emitUntiedSwitch(CGF);
+  }
+
   CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; }
 
   static bool classof(const CGCapturedStmtInfo *Info) {
@@ -210,6 +280,8 @@
            cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == InlinedRegion;
   }
 
+  ~CGOpenMPInlinedRegionInfo() override = default;
+
 private:
   /// \brief CodeGen info about outer OpenMP region.
   CodeGenFunction::CGCapturedStmtInfo *OldCSI;
@@ -221,7 +293,7 @@
 /// captured fields. The name of the target region has to be unique in a given
 /// application so it is provided by the client, because only the client has
 /// the information to generate that.
-class CGOpenMPTargetRegionInfo : public CGOpenMPRegionInfo {
+class CGOpenMPTargetRegionInfo final : public CGOpenMPRegionInfo {
 public:
   CGOpenMPTargetRegionInfo(const CapturedStmt &CS,
                            const RegionCodeGenTy &CodeGen, StringRef HelperName)
@@ -245,9 +317,75 @@
   StringRef HelperName;
 };
 
+static void EmptyCodeGen(CodeGenFunction &, PrePostActionTy &) {
+  llvm_unreachable("No codegen for expressions");
+}
+/// \brief API for generation of expressions captured in an innermost OpenMP
+/// region.
+class CGOpenMPInnerExprInfo final : public CGOpenMPInlinedRegionInfo {
+public:
+  CGOpenMPInnerExprInfo(CodeGenFunction &CGF, const CapturedStmt &CS)
+      : CGOpenMPInlinedRegionInfo(CGF.CapturedStmtInfo, EmptyCodeGen,
+                                  OMPD_unknown,
+                                  /*HasCancel=*/false),
+        PrivScope(CGF) {
+    // Make sure the globals captured in the provided statement are local by
+    // using the privatization logic. We assume the same variable is not
+    // captured more than once.
+    for (auto &C : CS.captures()) {
+      if (!C.capturesVariable() && !C.capturesVariableByCopy())
+        continue;
+
+      const VarDecl *VD = C.getCapturedVar();
+      if (VD->isLocalVarDeclOrParm())
+        continue;
+
+      DeclRefExpr DRE(const_cast<VarDecl *>(VD),
+                      /*RefersToEnclosingVariableOrCapture=*/false,
+                      VD->getType().getNonReferenceType(), VK_LValue,
+                      SourceLocation());
+      PrivScope.addPrivate(VD, [&CGF, &DRE]() -> Address {
+        return CGF.EmitLValue(&DRE).getAddress();
+      });
+    }
+    (void)PrivScope.Privatize();
+  }
+
+  /// \brief Lookup the captured field decl for a variable.
+  const FieldDecl *lookup(const VarDecl *VD) const override {
+    if (auto *FD = CGOpenMPInlinedRegionInfo::lookup(VD))
+      return FD;
+    return nullptr;
+  }
+
+  /// \brief Emit the captured statement body.
+  void EmitBody(CodeGenFunction &CGF, const Stmt *S) override {
+    llvm_unreachable("No body for expressions");
+  }
+
+  /// \brief Get a variable or parameter for storing global thread id
+  /// inside OpenMP construct.
+  const VarDecl *getThreadIDVariable() const override {
+    llvm_unreachable("No thread id for expressions");
+  }
+
+  /// \brief Get the name of the capture helper.
+  StringRef getHelperName() const override {
+    llvm_unreachable("No helper name for expressions");
+  }
+
+  static bool classof(const CGCapturedStmtInfo *Info) { return false; }
+
+private:
+  /// Private scope to capture global variables.
+  CodeGenFunction::OMPPrivateScope PrivScope;
+};
+
 /// \brief RAII for emitting code of OpenMP constructs.
 class InlinedOpenMPRegionRAII {
   CodeGenFunction &CGF;
+  llvm::DenseMap<const VarDecl *, FieldDecl *> LambdaCaptureFields;
+  FieldDecl *LambdaThisCaptureField = nullptr;
 
 public:
   /// \brief Constructs region for combined constructs.
@@ -260,30 +398,306 @@
     // Start emission for the construct.
     CGF.CapturedStmtInfo = new CGOpenMPInlinedRegionInfo(
         CGF.CapturedStmtInfo, CodeGen, Kind, HasCancel);
+    std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
+    LambdaThisCaptureField = CGF.LambdaThisCaptureField;
+    CGF.LambdaThisCaptureField = nullptr;
   }
+
   ~InlinedOpenMPRegionRAII() {
     // Restore original CapturedStmtInfo only if we're done with code emission.
     auto *OldCSI =
         cast<CGOpenMPInlinedRegionInfo>(CGF.CapturedStmtInfo)->getOldCSI();
     delete CGF.CapturedStmtInfo;
     CGF.CapturedStmtInfo = OldCSI;
+    std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
+    CGF.LambdaThisCaptureField = LambdaThisCaptureField;
+  }
+};
+
+/// \brief Values for bit flags used in the ident_t to describe the fields.
+/// All enumerators are named and described in accordance with the code
+/// from http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
+enum OpenMPLocationFlags {
+  /// \brief Use trampoline for internal microtask.
+  OMP_IDENT_IMD = 0x01,
+  /// \brief Use c-style ident structure.
+  OMP_IDENT_KMPC = 0x02,
+  /// \brief Atomic reduction option for kmpc_reduce.
+  OMP_ATOMIC_REDUCE = 0x10,
+  /// \brief Explicit 'barrier' directive.
+  OMP_IDENT_BARRIER_EXPL = 0x20,
+  /// \brief Implicit barrier in code.
+  OMP_IDENT_BARRIER_IMPL = 0x40,
+  /// \brief Implicit barrier in 'for' directive.
+  OMP_IDENT_BARRIER_IMPL_FOR = 0x40,
+  /// \brief Implicit barrier in 'sections' directive.
+  OMP_IDENT_BARRIER_IMPL_SECTIONS = 0xC0,
+  /// \brief Implicit barrier in 'single' directive.
+  OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
+};
+
+/// \brief Describes ident structure that describes a source location.
+/// All descriptions are taken from
+/// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
+/// Original structure:
+/// typedef struct ident {
+///    kmp_int32 reserved_1;   /**<  might be used in Fortran;
+///                                  see above  */
+///    kmp_int32 flags;        /**<  also f.flags; KMP_IDENT_xxx flags;
+///                                  KMP_IDENT_KMPC identifies this union
+///                                  member  */
+///    kmp_int32 reserved_2;   /**<  not really used in Fortran any more;
+///                                  see above */
+///#if USE_ITT_BUILD
+///                            /*  but currently used for storing
+///                                region-specific ITT */
+///                            /*  contextual information. */
+///#endif /* USE_ITT_BUILD */
+///    kmp_int32 reserved_3;   /**< source[4] in Fortran, do not use for
+///                                 C++  */
+///    char const *psource;    /**< String describing the source location.
+///                            The string is composed of semi-colon separated
+///                            fields which describe the source file,
+///                            the function and a pair of line numbers that
+///                            delimit the construct.
+///                             */
+/// } ident_t;
+enum IdentFieldIndex {
+  /// \brief might be used in Fortran
+  IdentField_Reserved_1,
+  /// \brief OMP_IDENT_xxx flags; OMP_IDENT_KMPC identifies this union member.
+  IdentField_Flags,
+  /// \brief Not really used in Fortran any more
+  IdentField_Reserved_2,
+  /// \brief Source[4] in Fortran, do not use for C++
+  IdentField_Reserved_3,
+  /// \brief String describing the source location. The string is composed of
+  /// semi-colon separated fields which describe the source file, the function
+  /// and a pair of line numbers that delimit the construct.
+  IdentField_PSource
+};
+
+/// \brief Schedule types for 'omp for' loops (these enumerators are taken from
+/// the enum sched_type in kmp.h).
+enum OpenMPSchedType {
+  /// \brief Lower bound for default (unordered) versions.
+  OMP_sch_lower = 32,
+  OMP_sch_static_chunked = 33,
+  OMP_sch_static = 34,
+  OMP_sch_dynamic_chunked = 35,
+  OMP_sch_guided_chunked = 36,
+  OMP_sch_runtime = 37,
+  OMP_sch_auto = 38,
+  /// static with chunk adjustment (e.g., simd)
+  OMP_sch_static_balanced_chunked   = 45,
+  /// \brief Lower bound for 'ordered' versions.
+  OMP_ord_lower = 64,
+  OMP_ord_static_chunked = 65,
+  OMP_ord_static = 66,
+  OMP_ord_dynamic_chunked = 67,
+  OMP_ord_guided_chunked = 68,
+  OMP_ord_runtime = 69,
+  OMP_ord_auto = 70,
+  OMP_sch_default = OMP_sch_static,
+  /// \brief dist_schedule types
+  OMP_dist_sch_static_chunked = 91,
+  OMP_dist_sch_static = 92,
+  /// Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
+  /// Set if the monotonic schedule modifier was present.
+  OMP_sch_modifier_monotonic = (1 << 29),
+  /// Set if the nonmonotonic schedule modifier was present.
+  OMP_sch_modifier_nonmonotonic = (1 << 30),
+};
+
+enum OpenMPRTLFunction {
+  /// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
+  /// kmpc_micro microtask, ...);
+  OMPRTL__kmpc_fork_call,
+  /// \brief Call to void *__kmpc_threadprivate_cached(ident_t *loc,
+  /// kmp_int32 global_tid, void *data, size_t size, void ***cache);
+  OMPRTL__kmpc_threadprivate_cached,
+  /// \brief Call to void __kmpc_threadprivate_register( ident_t *,
+  /// void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
+  OMPRTL__kmpc_threadprivate_register,
+  // Call to kmp_int32 __kmpc_global_thread_num(ident_t *loc);
+  OMPRTL__kmpc_global_thread_num,
+  // Call to void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
+  // kmp_critical_name *crit);
+  OMPRTL__kmpc_critical,
+  // Call to void __kmpc_critical_with_hint(ident_t *loc, kmp_int32
+  // global_tid, kmp_critical_name *crit, uintptr_t hint);
+  OMPRTL__kmpc_critical_with_hint,
+  // Call to void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
+  // kmp_critical_name *crit);
+  OMPRTL__kmpc_end_critical,
+  // Call to kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
+  // global_tid);
+  OMPRTL__kmpc_cancel_barrier,
+  // Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
+  OMPRTL__kmpc_barrier,
+  // Call to void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
+  OMPRTL__kmpc_for_static_fini,
+  // Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
+  // global_tid);
+  OMPRTL__kmpc_serialized_parallel,
+  // Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
+  // global_tid);
+  OMPRTL__kmpc_end_serialized_parallel,
+  // Call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
+  // kmp_int32 num_threads);
+  OMPRTL__kmpc_push_num_threads,
+  // Call to void __kmpc_flush(ident_t *loc);
+  OMPRTL__kmpc_flush,
+  // Call to kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
+  OMPRTL__kmpc_master,
+  // Call to void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
+  OMPRTL__kmpc_end_master,
+  // Call to kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
+  // int end_part);
+  OMPRTL__kmpc_omp_taskyield,
+  // Call to kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
+  OMPRTL__kmpc_single,
+  // Call to void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
+  OMPRTL__kmpc_end_single,
+  // Call to kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
+  // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+  // kmp_routine_entry_t *task_entry);
+  OMPRTL__kmpc_omp_task_alloc,
+  // Call to kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t *
+  // new_task);
+  OMPRTL__kmpc_omp_task,
+  // Call to void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
+  // size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
+  // kmp_int32 didit);
+  OMPRTL__kmpc_copyprivate,
+  // Call to kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
+  // kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
+  // (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
+  OMPRTL__kmpc_reduce,
+  // Call to kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
+  // global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
+  // void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
+  // *lck);
+  OMPRTL__kmpc_reduce_nowait,
+  // Call to void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
+  // kmp_critical_name *lck);
+  OMPRTL__kmpc_end_reduce,
+  // Call to void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
+  // kmp_critical_name *lck);
+  OMPRTL__kmpc_end_reduce_nowait,
+  // Call to void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
+  // kmp_task_t * new_task);
+  OMPRTL__kmpc_omp_task_begin_if0,
+  // Call to void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
+  // kmp_task_t * new_task);
+  OMPRTL__kmpc_omp_task_complete_if0,
+  // Call to void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
+  OMPRTL__kmpc_ordered,
+  // Call to void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
+  OMPRTL__kmpc_end_ordered,
+  // Call to kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
+  // global_tid);
+  OMPRTL__kmpc_omp_taskwait,
+  // Call to void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
+  OMPRTL__kmpc_taskgroup,
+  // Call to void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
+  OMPRTL__kmpc_end_taskgroup,
+  // Call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
+  // int proc_bind);
+  OMPRTL__kmpc_push_proc_bind,
+  // Call to kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32
+  // gtid, kmp_task_t * new_task, kmp_int32 ndeps, kmp_depend_info_t
+  // *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
+  OMPRTL__kmpc_omp_task_with_deps,
+  // Call to void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32
+  // gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
+  // ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
+  OMPRTL__kmpc_omp_wait_deps,
+  // Call to kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
+  // global_tid, kmp_int32 cncl_kind);
+  OMPRTL__kmpc_cancellationpoint,
+  // Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
+  // kmp_int32 cncl_kind);
+  OMPRTL__kmpc_cancel,
+  // Call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
+  // kmp_int32 num_teams, kmp_int32 thread_limit);
+  OMPRTL__kmpc_push_num_teams,
+  // Call to void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
+  // microtask, ...);
+  OMPRTL__kmpc_fork_teams,
+  // Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
+  // if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
+  // sched, kmp_uint64 grainsize, void *task_dup);
+  OMPRTL__kmpc_taskloop,
+  // Call to void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
+  // num_dims, struct kmp_dim *dims);
+  OMPRTL__kmpc_doacross_init,
+  // Call to void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
+  OMPRTL__kmpc_doacross_fini,
+  // Call to void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
+  // *vec);
+  OMPRTL__kmpc_doacross_post,
+  // Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
+  // *vec);
+  OMPRTL__kmpc_doacross_wait,
+
+  //
+  // Offloading related calls
+  //
+  // Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
+  // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
+  // *arg_types);
+  OMPRTL__tgt_target,
+  // Call to int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
+  // int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
+  // int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
+  OMPRTL__tgt_target_teams,
+  // Call to void __tgt_register_lib(__tgt_bin_desc *desc);
+  OMPRTL__tgt_register_lib,
+  // Call to void __tgt_unregister_lib(__tgt_bin_desc *desc);
+  OMPRTL__tgt_unregister_lib,
+  // Call to void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
+  // void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
+  OMPRTL__tgt_target_data_begin,
+  // Call to void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
+  // void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
+  OMPRTL__tgt_target_data_end,
+  // Call to void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
+  // void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
+  OMPRTL__tgt_target_data_update,
+};
+
+/// Cleanup that runs the post-action (Exit) of a pre|post-action for advanced
+/// codegen sequence for OpenMP region.
+class CleanupTy final : public EHScopeStack::Cleanup {
+  PrePostActionTy *Action;
+
+public:
+  explicit CleanupTy(PrePostActionTy *Action) : Action(Action) {}
+  void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
+    if (!CGF.HaveInsertPoint())
+      return;
+    Action->Exit(CGF);
   }
 };
 
 } // anonymous namespace
 
-static LValue emitLoadOfPointerLValue(CodeGenFunction &CGF, Address PtrAddr,
-                                      QualType Ty) {
-  AlignmentSource Source;
-  CharUnits Align = CGF.getNaturalPointeeTypeAlignment(Ty, &Source);
-  return CGF.MakeAddrLValue(Address(CGF.Builder.CreateLoad(PtrAddr), Align),
-                            Ty->getPointeeType(), Source);
+void RegionCodeGenTy::operator()(CodeGenFunction &CGF) const {
+  CodeGenFunction::RunCleanupsScope Scope(CGF);
+  if (PrePostAction) {
+    CGF.EHStack.pushCleanup<CleanupTy>(NormalAndEHCleanup, PrePostAction);
+    Callback(CodeGen, CGF, *PrePostAction);
+  } else {
+    PrePostActionTy Action;
+    Callback(CodeGen, CGF, Action);
+  }
 }
 
 LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) {
-  return emitLoadOfPointerLValue(CGF,
-                                 CGF.GetAddrOfLocalVar(getThreadIDVariable()),
-                                 getThreadIDVariable()->getType());
+  return CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(getThreadIDVariable()),
+      getThreadIDVariable()->getType()->castAs<PointerType>());
 }
 
 void CGOpenMPRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt * /*S*/) {
@@ -295,10 +709,7 @@
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CGF.EHStack.pushTerminate();
-  {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    CodeGen(CGF);
-  }
+  CodeGen(CGF);
   CGF.EHStack.popTerminate();
 }
 
@@ -310,16 +721,11 @@
 }
 
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
-    : CGM(CGM), DefaultOpenMPPSource(nullptr), KmpRoutineEntryPtrTy(nullptr),
-      OffloadEntriesInfoManager(CGM) {
+    : CGM(CGM), OffloadEntriesInfoManager(CGM) {
   IdentTy = llvm::StructType::create(
       "ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */,
       CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */,
       CGM.Int8PtrTy /* psource */, nullptr);
-  // Build void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid,...)
-  llvm::Type *MicroParams[] = {llvm::PointerType::getUnqual(CGM.Int32Ty),
-                               llvm::PointerType::getUnqual(CGM.Int32Ty)};
-  Kmpc_MicroTy = llvm::FunctionType::get(CGM.VoidTy, MicroParams, true);
   KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
 
   loadOffloadInfoMetadata();
@@ -329,6 +735,89 @@
   InternalVars.clear();
 }
 
+static llvm::Function *
+emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty,
+                          const Expr *CombinerInitializer, const VarDecl *In,
+                          const VarDecl *Out, bool IsCombiner) {
+  // void .omp_combiner.(Ty *in, Ty *out);
+  auto &C = CGM.getContext();
+  QualType PtrTy = C.getPointerType(Ty).withRestrict();
+  FunctionArgList Args;
+  ImplicitParamDecl OmpOutParm(C, /*DC=*/nullptr, Out->getLocation(),
+                               /*Id=*/nullptr, PtrTy);
+  ImplicitParamDecl OmpInParm(C, /*DC=*/nullptr, In->getLocation(),
+                              /*Id=*/nullptr, PtrTy);
+  Args.push_back(&OmpOutParm);
+  Args.push_back(&OmpInParm);
+  auto &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
+  auto *Fn = llvm::Function::Create(
+      FnTy, llvm::GlobalValue::InternalLinkage,
+      IsCombiner ? ".omp_combiner." : ".omp_initializer.", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
+  Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+  CodeGenFunction CGF(CGM);
+  // Map "T omp_in;" variable to "*omp_in_parm" value in all expressions.
+  // Map "T omp_out;" variable to "*omp_out_parm" value in all expressions.
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  CodeGenFunction::OMPPrivateScope Scope(CGF);
+  Address AddrIn = CGF.GetAddrOfLocalVar(&OmpInParm);
+  Scope.addPrivate(In, [&CGF, AddrIn, PtrTy]() -> Address {
+    return CGF.EmitLoadOfPointerLValue(AddrIn, PtrTy->castAs<PointerType>())
+        .getAddress();
+  });
+  Address AddrOut = CGF.GetAddrOfLocalVar(&OmpOutParm);
+  Scope.addPrivate(Out, [&CGF, AddrOut, PtrTy]() -> Address {
+    return CGF.EmitLoadOfPointerLValue(AddrOut, PtrTy->castAs<PointerType>())
+        .getAddress();
+  });
+  (void)Scope.Privatize();
+  CGF.EmitIgnoredExpr(CombinerInitializer);
+  Scope.ForceCleanup();
+  CGF.FinishFunction();
+  return Fn;
+}
+
+void CGOpenMPRuntime::emitUserDefinedReduction(
+    CodeGenFunction *CGF, const OMPDeclareReductionDecl *D) {
+  if (UDRMap.count(D) > 0)
+    return;
+  auto &C = CGM.getContext();
+  if (!In || !Out) {
+    In = &C.Idents.get("omp_in");
+    Out = &C.Idents.get("omp_out");
+  }
+  llvm::Function *Combiner = emitCombinerOrInitializer(
+      CGM, D->getType(), D->getCombiner(), cast<VarDecl>(D->lookup(In).front()),
+      cast<VarDecl>(D->lookup(Out).front()),
+      /*IsCombiner=*/true);
+  llvm::Function *Initializer = nullptr;
+  if (auto *Init = D->getInitializer()) {
+    if (!Priv || !Orig) {
+      Priv = &C.Idents.get("omp_priv");
+      Orig = &C.Idents.get("omp_orig");
+    }
+    Initializer = emitCombinerOrInitializer(
+        CGM, D->getType(), Init, cast<VarDecl>(D->lookup(Orig).front()),
+        cast<VarDecl>(D->lookup(Priv).front()),
+        /*IsCombiner=*/false);
+  }
+  UDRMap.insert(std::make_pair(D, std::make_pair(Combiner, Initializer)));
+  if (CGF) {
+    auto &Decls = FunctionUDRMap.FindAndConstruct(CGF->CurFn);
+    Decls.second.push_back(D);
+  }
+}
+
+std::pair<llvm::Function *, llvm::Function *>
+CGOpenMPRuntime::getUserDefinedReduction(const OMPDeclareReductionDecl *D) {
+  auto I = UDRMap.find(D);
+  if (I != UDRMap.end())
+    return I->second;
+  emitUserDefinedReduction(/*CGF=*/nullptr, D);
+  return UDRMap.lookup(D);
+}
 
 // Layout information for ident_t.
 static CharUnits getIdentAlign(CodeGenModule &CGM) {
@@ -338,18 +827,18 @@
   assert((4 * CGM.getPointerSize()).isMultipleOf(CGM.getPointerAlign()));
   return CharUnits::fromQuantity(16) + CGM.getPointerSize();
 }
-static CharUnits getOffsetOfIdentField(CGOpenMPRuntime::IdentFieldIndex Field) {
+static CharUnits getOffsetOfIdentField(IdentFieldIndex Field) {
   // All the fields except the last are i32, so this works beautifully.
   return unsigned(Field) * CharUnits::fromQuantity(4);
 }
 static Address createIdentFieldGEP(CodeGenFunction &CGF, Address Addr,
-                                   CGOpenMPRuntime::IdentFieldIndex Field,
+                                   IdentFieldIndex Field,
                                    const llvm::Twine &Name = "") {
   auto Offset = getOffsetOfIdentField(Field);
   return CGF.Builder.CreateStructGEP(Addr, Field, Offset, Name);
 }
 
-llvm::Value *CGOpenMPRuntime::emitParallelOutlinedFunction(
+llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   assert(ThreadIDVar->getType()->isPointerType() &&
@@ -371,19 +860,39 @@
 
 llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
-    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+    const VarDecl *PartIDVar, const VarDecl *TaskTVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen,
+    bool Tied, unsigned &NumberOfParts) {
+  auto &&UntiedCodeGen = [this, &D, TaskTVar](CodeGenFunction &CGF,
+                                              PrePostActionTy &) {
+    auto *ThreadID = getThreadID(CGF, D.getLocStart());
+    auto *UpLoc = emitUpdateLocation(CGF, D.getLocStart());
+    llvm::Value *TaskArgs[] = {
+        UpLoc, ThreadID,
+        CGF.EmitLoadOfPointerLValue(CGF.GetAddrOfLocalVar(TaskTVar),
+                                    TaskTVar->getType()->castAs<PointerType>())
+            .getPointer()};
+    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task), TaskArgs);
+  };
+  CGOpenMPTaskOutlinedRegionInfo::UntiedTaskActionTy Action(Tied, PartIDVar,
+                                                            UntiedCodeGen);
+  CodeGen.setAction(Action);
   assert(!ThreadIDVar->getType()->isPointerType() &&
          "thread id variable must be of type kmp_int32 for tasks");
   auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+  auto *TD = dyn_cast<OMPTaskDirective>(&D);
   CodeGenFunction CGF(CGM, true);
   CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
                                         InnermostKind,
-                                        cast<OMPTaskDirective>(D).hasCancel());
+                                        TD ? TD->hasCancel() : false, Action);
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
-  return CGF.GenerateCapturedStmtFunction(*CS);
+  auto *Res = CGF.GenerateCapturedStmtFunction(*CS);
+  if (!Tied)
+    NumberOfParts = Action.getNumberOfParts();
+  return Res;
 }
 
-Address CGOpenMPRuntime::getOrCreateDefaultLocation(OpenMPLocationFlags Flags) {
+Address CGOpenMPRuntime::getOrCreateDefaultLocation(unsigned Flags) {
   CharUnits Align = getIdentAlign(CGM);
   llvm::Value *Entry = OpenMPDefaultLocMap.lookup(Flags);
   if (!Entry) {
@@ -400,7 +909,7 @@
     auto DefaultOpenMPLocation = new llvm::GlobalVariable(
         CGM.getModule(), IdentTy, /*isConstant*/ true,
         llvm::GlobalValue::PrivateLinkage, /*Initializer*/ nullptr);
-    DefaultOpenMPLocation->setUnnamedAddr(true);
+    DefaultOpenMPLocation->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
     DefaultOpenMPLocation->setAlignment(Align.getQuantity());
 
     llvm::Constant *Zero = llvm::ConstantInt::get(CGM.Int32Ty, 0, true);
@@ -416,7 +925,8 @@
 
 llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF,
                                                  SourceLocation Loc,
-                                                 OpenMPLocationFlags Flags) {
+                                                 unsigned Flags) {
+  Flags |= OMP_IDENT_KMPC;
   // If no debug info is generated - return global default location.
   if (CGM.getCodeGenOpts().getDebugInfo() == codegenoptions::NoDebugInfo ||
       Loc.isInvalid())
@@ -484,7 +994,7 @@
     if (ThreadID != nullptr)
       return ThreadID;
   }
-  if (auto OMPRegionInfo =
+  if (auto *OMPRegionInfo =
           dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
     if (OMPRegionInfo->getThreadIDVariable()) {
       // Check if this an outlined function with thread id passed as argument.
@@ -518,20 +1028,34 @@
   assert(CGF.CurFn && "No function in current CodeGenFunction.");
   if (OpenMPLocThreadIDMap.count(CGF.CurFn))
     OpenMPLocThreadIDMap.erase(CGF.CurFn);
+  if (FunctionUDRMap.count(CGF.CurFn) > 0) {
+    for(auto *D : FunctionUDRMap[CGF.CurFn]) {
+      UDRMap.erase(D);
+    }
+    FunctionUDRMap.erase(CGF.CurFn);
+  }
 }
 
 llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() {
+  if (!IdentTy) {
+  }
   return llvm::PointerType::getUnqual(IdentTy);
 }
 
 llvm::Type *CGOpenMPRuntime::getKmpc_MicroPointerTy() {
+  if (!Kmpc_MicroTy) {
+    // Build void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid,...)
+    llvm::Type *MicroParams[] = {llvm::PointerType::getUnqual(CGM.Int32Ty),
+                                 llvm::PointerType::getUnqual(CGM.Int32Ty)};
+    Kmpc_MicroTy = llvm::FunctionType::get(CGM.VoidTy, MicroParams, true);
+  }
   return llvm::PointerType::getUnqual(Kmpc_MicroTy);
 }
 
 llvm::Constant *
-CGOpenMPRuntime::createRuntimeFunction(OpenMPRTLFunction Function) {
+CGOpenMPRuntime::createRuntimeFunction(unsigned Function) {
   llvm::Constant *RTLFn = nullptr;
-  switch (Function) {
+  switch (static_cast<OpenMPRTLFunction>(Function)) {
   case OMPRTL__kmpc_fork_call: {
     // Build void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro
     // microtask, ...);
@@ -928,6 +1452,86 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel");
     break;
   }
+  case OMPRTL__kmpc_push_num_teams: {
+    // Build void kmpc_push_num_teams (ident_t loc, kmp_int32 global_tid,
+    // kmp_int32 num_teams, kmp_int32 num_threads)
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
+        CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_teams");
+    break;
+  }
+  case OMPRTL__kmpc_fork_teams: {
+    // Build void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
+    // microtask, ...);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
+                                getKmpc_MicroPointerTy()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_teams");
+    break;
+  }
+  case OMPRTL__kmpc_taskloop: {
+    // Build void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
+    // if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
+    // sched, kmp_uint64 grainsize, void *task_dup);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
+                                CGM.IntTy,
+                                CGM.VoidPtrTy,
+                                CGM.IntTy,
+                                CGM.Int64Ty->getPointerTo(),
+                                CGM.Int64Ty->getPointerTo(),
+                                CGM.Int64Ty,
+                                CGM.IntTy,
+                                CGM.IntTy,
+                                CGM.Int64Ty,
+                                CGM.VoidPtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_taskloop");
+    break;
+  }
+  case OMPRTL__kmpc_doacross_init: {
+    // Build void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
+    // num_dims, struct kmp_dim *dims);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
+                                CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_init");
+    break;
+  }
+  case OMPRTL__kmpc_doacross_fini: {
+    // Build void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_fini");
+    break;
+  }
+  case OMPRTL__kmpc_doacross_post: {
+    // Build void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
+    // *vec);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
+                                CGM.Int64Ty->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_post");
+    break;
+  }
+  case OMPRTL__kmpc_doacross_wait: {
+    // Build void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
+    // *vec);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
+                                CGM.Int64Ty->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_wait");
+    break;
+  }
   case OMPRTL__tgt_target: {
     // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
     // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
@@ -944,6 +1548,24 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target");
     break;
   }
+  case OMPRTL__tgt_target_teams: {
+    // Build int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
+    // int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
+    // int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.VoidPtrTy,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrPtrTy,
+                                CGM.VoidPtrPtrTy,
+                                CGM.SizeTy->getPointerTo(),
+                                CGM.Int32Ty->getPointerTo(),
+                                CGM.Int32Ty,
+                                CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_teams");
+    break;
+  }
   case OMPRTL__tgt_register_lib: {
     // Build void __tgt_register_lib(__tgt_bin_desc *desc);
     QualType ParamTy =
@@ -964,30 +1586,53 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib");
     break;
   }
+  case OMPRTL__tgt_target_data_begin: {
+    // Build void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
+    // void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrPtrTy,
+                                CGM.VoidPtrPtrTy,
+                                CGM.SizeTy->getPointerTo(),
+                                CGM.Int32Ty->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_begin");
+    break;
   }
+  case OMPRTL__tgt_target_data_end: {
+    // Build void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
+    // void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrPtrTy,
+                                CGM.VoidPtrPtrTy,
+                                CGM.SizeTy->getPointerTo(),
+                                CGM.Int32Ty->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_end");
+    break;
+  }
+  case OMPRTL__tgt_target_data_update: {
+    // Build void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
+    // void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrPtrTy,
+                                CGM.VoidPtrPtrTy,
+                                CGM.SizeTy->getPointerTo(),
+                                CGM.Int32Ty->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_update");
+    break;
+  }
+  }
+  assert(RTLFn && "Unable to find OpenMP runtime function");
   return RTLFn;
 }
 
-static llvm::Value *getTypeSize(CodeGenFunction &CGF, QualType Ty) {
-  auto &C = CGF.getContext();
-  llvm::Value *Size = nullptr;
-  auto SizeInChars = C.getTypeSizeInChars(Ty);
-  if (SizeInChars.isZero()) {
-    // getTypeSizeInChars() returns 0 for a VLA.
-    while (auto *VAT = C.getAsVariableArrayType(Ty)) {
-      llvm::Value *ArraySize;
-      std::tie(ArraySize, Ty) = CGF.getVLASize(VAT);
-      Size = Size ? CGF.Builder.CreateNUWMul(Size, ArraySize) : ArraySize;
-    }
-    SizeInChars = C.getTypeSizeInChars(Ty);
-    assert(!SizeInChars.isZero());
-    Size = CGF.Builder.CreateNUWMul(
-        Size, llvm::ConstantInt::get(CGF.SizeTy, SizeInChars.getQuantity()));
-  } else
-    Size = llvm::ConstantInt::get(CGF.SizeTy, SizeInChars.getQuantity());
-  return Size;
-}
-
 llvm::Constant *CGOpenMPRuntime::createForStaticInitFunction(unsigned IVSize,
                                                              bool IVSigned) {
   assert((IVSize == 32 || IVSize == 64) &&
@@ -1253,12 +1898,10 @@
   // the condition and the dead arm of the if/else.
   bool CondConstant;
   if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    if (CondConstant) {
+    if (CondConstant)
       ThenGen(CGF);
-    } else {
+    else
       ElseGen(CGF);
-    }
     return;
   }
 
@@ -1271,26 +1914,16 @@
 
   // Emit the 'then' code.
   CGF.EmitBlock(ThenBlock);
-  {
-    CodeGenFunction::RunCleanupsScope ThenScope(CGF);
-    ThenGen(CGF);
-  }
+  ThenGen(CGF);
   CGF.EmitBranch(ContBlock);
   // Emit the 'else' code if present.
-  {
-    // There is no need to emit line number for unconditional branch.
-    auto NL = ApplyDebugLocation::CreateEmpty(CGF);
-    CGF.EmitBlock(ElseBlock);
-  }
-  {
-    CodeGenFunction::RunCleanupsScope ThenScope(CGF);
-    ElseGen(CGF);
-  }
-  {
-    // There is no need to emit line number for unconditional branch.
-    auto NL = ApplyDebugLocation::CreateEmpty(CGF);
-    CGF.EmitBranch(ContBlock);
-  }
+  // There is no need to emit line number for unconditional branch.
+  (void)ApplyDebugLocation::CreateEmpty(CGF);
+  CGF.EmitBlock(ElseBlock);
+  ElseGen(CGF);
+  // There is no need to emit line number for unconditional branch.
+  (void)ApplyDebugLocation::CreateEmpty(CGF);
+  CGF.EmitBranch(ContBlock);
   // Emit the continuation block for code after the if.
   CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
 }
@@ -1302,34 +1935,36 @@
   if (!CGF.HaveInsertPoint())
     return;
   auto *RTLoc = emitUpdateLocation(CGF, Loc);
-  auto &&ThenGen = [this, OutlinedFn, CapturedVars,
-                    RTLoc](CodeGenFunction &CGF) {
+  auto &&ThenGen = [OutlinedFn, CapturedVars, RTLoc](CodeGenFunction &CGF,
+                                                     PrePostActionTy &) {
     // Build call __kmpc_fork_call(loc, n, microtask, var1, .., varn);
+    auto &RT = CGF.CGM.getOpenMPRuntime();
     llvm::Value *Args[] = {
         RTLoc,
         CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
-        CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy())};
+        CGF.Builder.CreateBitCast(OutlinedFn, RT.getKmpc_MicroPointerTy())};
     llvm::SmallVector<llvm::Value *, 16> RealArgs;
     RealArgs.append(std::begin(Args), std::end(Args));
     RealArgs.append(CapturedVars.begin(), CapturedVars.end());
 
-    auto RTLFn = createRuntimeFunction(OMPRTL__kmpc_fork_call);
+    auto RTLFn = RT.createRuntimeFunction(OMPRTL__kmpc_fork_call);
     CGF.EmitRuntimeCall(RTLFn, RealArgs);
   };
-  auto &&ElseGen = [this, OutlinedFn, CapturedVars, RTLoc,
-                    Loc](CodeGenFunction &CGF) {
-    auto ThreadID = getThreadID(CGF, Loc);
+  auto &&ElseGen = [OutlinedFn, CapturedVars, RTLoc, Loc](CodeGenFunction &CGF,
+                                                          PrePostActionTy &) {
+    auto &RT = CGF.CGM.getOpenMPRuntime();
+    auto ThreadID = RT.getThreadID(CGF, Loc);
     // Build calls:
     // __kmpc_serialized_parallel(&Loc, GTid);
     llvm::Value *Args[] = {RTLoc, ThreadID};
-    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_serialized_parallel),
-                        Args);
+    CGF.EmitRuntimeCall(
+        RT.createRuntimeFunction(OMPRTL__kmpc_serialized_parallel), Args);
 
     // OutlinedFn(&GTid, &zero, CapturedStruct);
-    auto ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+    auto ThreadIDAddr = RT.emitThreadIDAddress(CGF, Loc);
     Address ZeroAddr =
-      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
-                           /*Name*/ ".zero.addr");
+        CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
+                             /*Name*/ ".zero.addr");
     CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
     llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
     OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
@@ -1338,15 +1973,16 @@
     CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
 
     // __kmpc_end_serialized_parallel(&Loc, GTid);
-    llvm::Value *EndArgs[] = {emitUpdateLocation(CGF, Loc), ThreadID};
+    llvm::Value *EndArgs[] = {RT.emitUpdateLocation(CGF, Loc), ThreadID};
     CGF.EmitRuntimeCall(
-        createRuntimeFunction(OMPRTL__kmpc_end_serialized_parallel), EndArgs);
+        RT.createRuntimeFunction(OMPRTL__kmpc_end_serialized_parallel),
+        EndArgs);
   };
-  if (IfCond) {
+  if (IfCond)
     emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
-  } else {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    ThenGen(CGF);
+  else {
+    RegionCodeGenTy ThenRCG(ThenGen);
+    ThenRCG(CGF);
   }
 }
 
@@ -1358,7 +1994,7 @@
 // return the address of that temp.
 Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF,
                                              SourceLocation Loc) {
-  if (auto OMPRegionInfo =
+  if (auto *OMPRegionInfo =
           dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
     if (OMPRegionInfo->getThreadIDVariable())
       return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress();
@@ -1399,20 +2035,39 @@
 }
 
 namespace {
-template <size_t N> class CallEndCleanup final : public EHScopeStack::Cleanup {
-  llvm::Value *Callee;
-  llvm::Value *Args[N];
+/// Common pre(post)-action for different OpenMP constructs.
+class CommonActionTy final : public PrePostActionTy {
+  llvm::Value *EnterCallee;
+  ArrayRef<llvm::Value *> EnterArgs;
+  llvm::Value *ExitCallee;
+  ArrayRef<llvm::Value *> ExitArgs;
+  bool Conditional;
+  llvm::BasicBlock *ContBlock = nullptr;
 
 public:
-  CallEndCleanup(llvm::Value *Callee, ArrayRef<llvm::Value *> CleanupArgs)
-      : Callee(Callee) {
-    assert(CleanupArgs.size() == N);
-    std::copy(CleanupArgs.begin(), CleanupArgs.end(), std::begin(Args));
+  CommonActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
+                 llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
+                 bool Conditional = false)
+      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
+        ExitArgs(ExitArgs), Conditional(Conditional) {}
+  void Enter(CodeGenFunction &CGF) override {
+    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
+    if (Conditional) {
+      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
+      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
+      ContBlock = CGF.createBasicBlock("omp_if.end");
+      // Generate the branch (If-stmt)
+      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
+      CGF.EmitBlock(ThenBlock);
+    }
   }
-  void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
-    if (!CGF.HaveInsertPoint())
-      return;
-    CGF.EmitRuntimeCall(Callee, Args);
+  void Done(CodeGenFunction &CGF) {
+    // Emit the rest of blocks/branches
+    CGF.EmitBranch(ContBlock);
+    CGF.EmitBlock(ContBlock, true);
+  }
+  void Exit(CodeGenFunction &CGF) override {
+    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
   }
 };
 } // anonymous namespace
@@ -1427,45 +2082,22 @@
   // Prepare arguments and build a call to __kmpc_critical
   if (!CGF.HaveInsertPoint())
     return;
-  CodeGenFunction::RunCleanupsScope Scope(CGF);
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
                          getCriticalRegionLock(CriticalName)};
+  llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
+                                                std::end(Args));
   if (Hint) {
-    llvm::SmallVector<llvm::Value *, 8> ArgsWithHint(std::begin(Args),
-                                                     std::end(Args));
-    auto *HintVal = CGF.EmitScalarExpr(Hint);
-    ArgsWithHint.push_back(
-        CGF.Builder.CreateIntCast(HintVal, CGM.IntPtrTy, /*isSigned=*/false));
-    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_critical_with_hint),
-                        ArgsWithHint);
-  } else
-    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_critical), Args);
-  // Build a call to __kmpc_end_critical
-  CGF.EHStack.pushCleanup<CallEndCleanup<std::extent<decltype(Args)>::value>>(
-      NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_critical),
-      llvm::makeArrayRef(Args));
+    EnterArgs.push_back(CGF.Builder.CreateIntCast(
+        CGF.EmitScalarExpr(Hint), CGM.IntPtrTy, /*isSigned=*/false));
+  }
+  CommonActionTy Action(
+      createRuntimeFunction(Hint ? OMPRTL__kmpc_critical_with_hint
+                                 : OMPRTL__kmpc_critical),
+      EnterArgs, createRuntimeFunction(OMPRTL__kmpc_end_critical), Args);
+  CriticalOpGen.setAction(Action);
   emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
 }
 
-static void emitIfStmt(CodeGenFunction &CGF, llvm::Value *IfCond,
-                       OpenMPDirectiveKind Kind, SourceLocation Loc,
-                       const RegionCodeGenTy &BodyOpGen) {
-  llvm::Value *CallBool = CGF.EmitScalarConversion(
-      IfCond,
-      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true),
-      CGF.getContext().BoolTy, Loc);
-
-  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
-  auto *ContBlock = CGF.createBasicBlock("omp_if.end");
-  // Generate the branch (If-stmt)
-  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
-  CGF.EmitBlock(ThenBlock);
-  CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, Kind, BodyOpGen);
-  // Emit the rest of bblocks/branches
-  CGF.EmitBranch(ContBlock);
-  CGF.EmitBlock(ContBlock, true);
-}
-
 void CGOpenMPRuntime::emitMasterRegion(CodeGenFunction &CGF,
                                        const RegionCodeGenTy &MasterOpGen,
                                        SourceLocation Loc) {
@@ -1477,18 +2109,12 @@
   // }
   // Prepare arguments and build a call to __kmpc_master
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
-  auto *IsMaster =
-      CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_master), Args);
-  typedef CallEndCleanup<std::extent<decltype(Args)>::value>
-      MasterCallEndCleanup;
-  emitIfStmt(
-      CGF, IsMaster, OMPD_master, Loc, [&](CodeGenFunction &CGF) -> void {
-        CodeGenFunction::RunCleanupsScope Scope(CGF);
-        CGF.EHStack.pushCleanup<MasterCallEndCleanup>(
-            NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_master),
-            llvm::makeArrayRef(Args));
-        MasterOpGen(CGF);
-      });
+  CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_master), Args,
+                        createRuntimeFunction(OMPRTL__kmpc_end_master), Args,
+                        /*Conditional=*/true);
+  MasterOpGen.setAction(Action);
+  emitInlinedDirective(CGF, OMPD_master, MasterOpGen);
+  Action.Done(CGF);
 }
 
 void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
@@ -1500,6 +2126,8 @@
       emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
       llvm::ConstantInt::get(CGM.IntTy, /*V=*/0, /*isSigned=*/true)};
   CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskyield), Args);
+  if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
+    Region->emitUntiedSwitch(CGF);
 }
 
 void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF,
@@ -1511,16 +2139,12 @@
   // TaskgroupOpGen();
   // __kmpc_end_taskgroup(ident_t *, gtid);
   // Prepare arguments and build a call to __kmpc_taskgroup
-  {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
-    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskgroup), Args);
-    // Build a call to __kmpc_end_taskgroup
-    CGF.EHStack.pushCleanup<CallEndCleanup<std::extent<decltype(Args)>::value>>(
-        NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_taskgroup),
-        llvm::makeArrayRef(Args));
-    emitInlinedDirective(CGF, OMPD_taskgroup, TaskgroupOpGen);
-  }
+  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
+  CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_taskgroup), Args,
+                        createRuntimeFunction(OMPRTL__kmpc_end_taskgroup),
+                        Args);
+  TaskgroupOpGen.setAction(Action);
+  emitInlinedDirective(CGF, OMPD_taskgroup, TaskgroupOpGen);
 }
 
 /// Given an array of pointers to variables, project the address of a
@@ -1616,22 +2240,16 @@
   }
   // Prepare arguments and build a call to __kmpc_single
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
-  auto *IsSingle =
-      CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_single), Args);
-  typedef CallEndCleanup<std::extent<decltype(Args)>::value>
-      SingleCallEndCleanup;
-  emitIfStmt(
-      CGF, IsSingle, OMPD_single, Loc, [&](CodeGenFunction &CGF) -> void {
-        CodeGenFunction::RunCleanupsScope Scope(CGF);
-        CGF.EHStack.pushCleanup<SingleCallEndCleanup>(
-            NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_single),
-            llvm::makeArrayRef(Args));
-        SingleOpGen(CGF);
-        if (DidIt.isValid()) {
-          // did_it = 1;
-          CGF.Builder.CreateStore(CGF.Builder.getInt32(1), DidIt);
-        }
-      });
+  CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_single), Args,
+                        createRuntimeFunction(OMPRTL__kmpc_end_single), Args,
+                        /*Conditional=*/true);
+  SingleOpGen.setAction(Action);
+  emitInlinedDirective(CGF, OMPD_single, SingleOpGen);
+  if (DidIt.isValid()) {
+    // did_it = 1;
+    CGF.Builder.CreateStore(CGF.Builder.getInt32(1), DidIt);
+  }
+  Action.Done(CGF);
   // call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
   // <copy_func>, did_it);
   if (DidIt.isValid()) {
@@ -1655,7 +2273,7 @@
     auto *CpyFn = emitCopyprivateCopyFunction(
         CGM, CGF.ConvertTypeForMem(CopyprivateArrayTy)->getPointerTo(),
         CopyprivateVars, SrcExprs, DstExprs, AssignmentOps);
-    auto *BufSize = getTypeSize(CGF, CopyprivateArrayTy);
+    auto *BufSize = CGF.getTypeSize(CopyprivateArrayTy);
     Address CL =
       CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(CopyprivateList,
                                                       CGF.VoidPtrTy);
@@ -1681,14 +2299,14 @@
   // OrderedOpGen();
   // __kmpc_end_ordered(ident_t *, gtid);
   // Prepare arguments and build a call to __kmpc_ordered
-  CodeGenFunction::RunCleanupsScope Scope(CGF);
   if (IsThreads) {
     llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
-    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_ordered), Args);
-    // Build a call to __kmpc_end_ordered
-    CGF.EHStack.pushCleanup<CallEndCleanup<std::extent<decltype(Args)>::value>>(
-        NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_ordered),
-        llvm::makeArrayRef(Args));
+    CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_ordered), Args,
+                          createRuntimeFunction(OMPRTL__kmpc_end_ordered),
+                          Args);
+    OrderedOpGen.setAction(Action);
+    emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
+    return;
   }
   emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
 }
@@ -1700,32 +2318,23 @@
     return;
   // Build call __kmpc_cancel_barrier(loc, thread_id);
   // Build call __kmpc_barrier(loc, thread_id);
-  OpenMPLocationFlags Flags = OMP_IDENT_KMPC;
-  if (Kind == OMPD_for) {
-    Flags =
-        static_cast<OpenMPLocationFlags>(Flags | OMP_IDENT_BARRIER_IMPL_FOR);
-  } else if (Kind == OMPD_sections) {
-    Flags = static_cast<OpenMPLocationFlags>(Flags |
-                                             OMP_IDENT_BARRIER_IMPL_SECTIONS);
-  } else if (Kind == OMPD_single) {
-    Flags =
-        static_cast<OpenMPLocationFlags>(Flags | OMP_IDENT_BARRIER_IMPL_SINGLE);
-  } else if (Kind == OMPD_barrier) {
-    Flags = static_cast<OpenMPLocationFlags>(Flags | OMP_IDENT_BARRIER_EXPL);
-  } else {
-    Flags = static_cast<OpenMPLocationFlags>(Flags | OMP_IDENT_BARRIER_IMPL);
-  }
+  unsigned Flags;
+  if (Kind == OMPD_for)
+    Flags = OMP_IDENT_BARRIER_IMPL_FOR;
+  else if (Kind == OMPD_sections)
+    Flags = OMP_IDENT_BARRIER_IMPL_SECTIONS;
+  else if (Kind == OMPD_single)
+    Flags = OMP_IDENT_BARRIER_IMPL_SINGLE;
+  else if (Kind == OMPD_barrier)
+    Flags = OMP_IDENT_BARRIER_EXPL;
+  else
+    Flags = OMP_IDENT_BARRIER_IMPL;
   // Build call __kmpc_cancel_barrier(loc, thread_id) or __kmpc_barrier(loc,
   // thread_id);
-  auto *OMPRegionInfo =
-      dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo);
-  // Do not emit barrier call in the single directive emitted in some rare cases
-  // for sections directives.
-  if (OMPRegionInfo && OMPRegionInfo->getDirectiveKind() == OMPD_single)
-    return;
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                          getThreadID(CGF, Loc)};
-  if (OMPRegionInfo) {
+  if (auto *OMPRegionInfo =
+          dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
     if (!ForceSimpleCall && OMPRegionInfo->hasCancel()) {
       auto *Result = CGF.EmitRuntimeCall(
           createRuntimeFunction(OMPRTL__kmpc_cancel_barrier), Args);
@@ -1750,28 +2359,6 @@
   CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_barrier), Args);
 }
 
-/// \brief Schedule types for 'omp for' loops (these enumerators are taken from
-/// the enum sched_type in kmp.h).
-enum OpenMPSchedType {
-  /// \brief Lower bound for default (unordered) versions.
-  OMP_sch_lower = 32,
-  OMP_sch_static_chunked = 33,
-  OMP_sch_static = 34,
-  OMP_sch_dynamic_chunked = 35,
-  OMP_sch_guided_chunked = 36,
-  OMP_sch_runtime = 37,
-  OMP_sch_auto = 38,
-  /// \brief Lower bound for 'ordered' versions.
-  OMP_ord_lower = 64,
-  OMP_ord_static_chunked = 65,
-  OMP_ord_static = 66,
-  OMP_ord_dynamic_chunked = 67,
-  OMP_ord_guided_chunked = 68,
-  OMP_ord_runtime = 69,
-  OMP_ord_auto = 70,
-  OMP_sch_default = OMP_sch_static,
-};
-
 /// \brief Map the OpenMP loop schedule to the runtime enumeration.
 static OpenMPSchedType getRuntimeSchedule(OpenMPScheduleClauseKind ScheduleKind,
                                           bool Chunked, bool Ordered) {
@@ -1794,12 +2381,26 @@
   llvm_unreachable("Unexpected runtime schedule");
 }
 
+/// \brief Map the OpenMP distribute schedule to the runtime enumeration.
+static OpenMPSchedType
+getRuntimeSchedule(OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) {
+  // Only static is allowed for dist_schedule, so ScheduleKind is not examined.
+  return Chunked ? OMP_dist_sch_static_chunked : OMP_dist_sch_static;
+}
+
 bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
                                          bool Chunked) const {
   auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked, /*Ordered=*/false);
   return Schedule == OMP_sch_static;
 }
 
+bool CGOpenMPRuntime::isStaticNonchunked(
+    OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
+  auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked);
+  return Schedule == OMP_dist_sch_static; // The value chosen for !Chunked.
+}
+
+
 bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
   auto Schedule =
       getRuntimeSchedule(ScheduleKind, /*Chunked=*/false, /*Ordered=*/false);
@@ -1807,19 +2408,57 @@
   return Schedule != OMP_sch_static;
 }
 
+/// Combine the runtime schedule type with the schedule clause modifiers.
+///
+/// \p M1 and \p M2 are examined in that order:
+///  - 'monotonic' / 'nonmonotonic' select the modifier value that is OR-ed
+///    into the result;
+///  - 'simd' replaces OMP_sch_static_chunked with
+///    OMP_sch_static_balanced_chunked.
+/// \returns \p Schedule (possibly adjusted) OR-ed with the selected modifier.
+static int addMonoNonMonoModifier(OpenMPSchedType Schedule,
+                                  OpenMPScheduleClauseModifier M1,
+                                  OpenMPScheduleClauseModifier M2) {
+  int Modifier = 0;
+  // Both modifiers get the same treatment; M2 is handled last, so if both set
+  // Modifier the value chosen for M2 is the one that is returned.
+  const OpenMPScheduleClauseModifier Modifiers[] = {M1, M2};
+  for (OpenMPScheduleClauseModifier M : Modifiers) {
+    switch (M) {
+    case OMPC_SCHEDULE_MODIFIER_monotonic:
+      Modifier = OMP_sch_modifier_monotonic;
+      break;
+    case OMPC_SCHEDULE_MODIFIER_nonmonotonic:
+      Modifier = OMP_sch_modifier_nonmonotonic;
+      break;
+    case OMPC_SCHEDULE_MODIFIER_simd:
+      // Only the static chunked schedule is switched to its balanced variant.
+      if (Schedule == OMP_sch_static_chunked)
+        Schedule = OMP_sch_static_balanced_chunked;
+      break;
+    case OMPC_SCHEDULE_MODIFIER_last:
+    case OMPC_SCHEDULE_MODIFIER_unknown:
+      // Nothing to apply.
+      break;
+    }
+  }
+  return Schedule | Modifier;
+}
+
 void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF,
                                           SourceLocation Loc,
-                                          OpenMPScheduleClauseKind ScheduleKind,
+                                          const OpenMPScheduleTy &ScheduleKind,
                                           unsigned IVSize, bool IVSigned,
                                           bool Ordered, llvm::Value *UB,
                                           llvm::Value *Chunk) {
   if (!CGF.HaveInsertPoint())
     return;
   OpenMPSchedType Schedule =
-      getRuntimeSchedule(ScheduleKind, Chunk != nullptr, Ordered);
+      getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
   assert(Ordered ||
          (Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked &&
-          Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked));
+          Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked &&
+          Schedule != OMP_sch_static_balanced_chunked));
   // Call __kmpc_dispatch_init(
   //          ident_t *loc, kmp_int32 tid, kmp_int32 schedule,
   //          kmp_int[32|64] lower, kmp_int[32|64] upper,
@@ -1829,59 +2468,94 @@
   if (Chunk == nullptr)
     Chunk = CGF.Builder.getIntN(IVSize, 1);
   llvm::Value *Args[] = {
-    emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
-    getThreadID(CGF, Loc),
-    CGF.Builder.getInt32(Schedule), // Schedule type
-    CGF.Builder.getIntN(IVSize, 0), // Lower
-    UB,                             // Upper
-    CGF.Builder.getIntN(IVSize, 1), // Stride
-    Chunk                           // Chunk
+      emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
+      CGF.Builder.getInt32(addMonoNonMonoModifier(
+          Schedule, ScheduleKind.M1, ScheduleKind.M2)), // Schedule type
+      CGF.Builder.getIntN(IVSize, 0),                   // Lower
+      UB,                                               // Upper
+      CGF.Builder.getIntN(IVSize, 1),                   // Stride
+      Chunk                                             // Chunk
   };
   CGF.EmitRuntimeCall(createDispatchInitFunction(IVSize, IVSigned), Args);
 }
 
+/// Emit the static-init runtime call shared by 'for' and 'distribute' codegen.
+static void emitForStaticInitCall(
+    CodeGenFunction &CGF, llvm::Value *UpdateLocation, llvm::Value *ThreadId,
+    llvm::Constant *ForStaticInitFunction, OpenMPSchedType Schedule,
+    OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
+    unsigned IVSize, bool Ordered, Address IL, Address LB, Address UB,
+    Address ST, llvm::Value *Chunk) {
+  if (!CGF.HaveInsertPoint())
+    return;
+  assert(!Ordered);
+  assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked ||
+         Schedule == OMP_sch_static_balanced_chunked ||
+         Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked ||
+         Schedule == OMP_dist_sch_static ||
+         Schedule == OMP_dist_sch_static_chunked);
+
+  // Call __kmpc_for_static_init(
+  //          ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
+  //          kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
+  //          kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
+  //          kmp_int[32|64] incr, kmp_int[32|64] chunk);
+  if (Chunk == nullptr) {
+    assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static ||
+            Schedule == OMP_dist_sch_static) &&
+           "expected static non-chunked schedule");
+    // If the Chunk was not specified in the clause - use default value 1.
+    Chunk = CGF.Builder.getIntN(IVSize, 1);
+  } else {
+    assert((Schedule == OMP_sch_static_chunked ||
+            Schedule == OMP_sch_static_balanced_chunked ||
+            Schedule == OMP_ord_static_chunked ||
+            Schedule == OMP_dist_sch_static_chunked) &&
+           "expected static chunked schedule");
+  }
+  llvm::Value *Args[] = {
+      UpdateLocation, ThreadId, CGF.Builder.getInt32(addMonoNonMonoModifier(
+                                    Schedule, M1, M2)), // Schedule type
+      IL.getPointer(),                                  // &isLastIter
+      LB.getPointer(),                                  // &LB
+      UB.getPointer(),                                  // &UB
+      ST.getPointer(),                                  // &Stride
+      CGF.Builder.getIntN(IVSize, 1),                   // Incr
+      Chunk                                             // Chunk
+  };
+  CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
+}
+
 void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
                                         SourceLocation Loc,
-                                        OpenMPScheduleClauseKind ScheduleKind,
+                                        const OpenMPScheduleTy &ScheduleKind,
                                         unsigned IVSize, bool IVSigned,
                                         bool Ordered, Address IL, Address LB,
                                         Address UB, Address ST,
                                         llvm::Value *Chunk) {
-  if (!CGF.HaveInsertPoint())
-    return;
-  OpenMPSchedType Schedule =
-    getRuntimeSchedule(ScheduleKind, Chunk != nullptr, Ordered);
-  assert(!Ordered);
-  assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked ||
-         Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked);
+  OpenMPSchedType ScheduleNum =
+      getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
+  auto *UpdatedLocation = emitUpdateLocation(CGF, Loc);
+  auto *ThreadId = getThreadID(CGF, Loc);
+  auto *StaticInitFunction = createForStaticInitFunction(IVSize, IVSigned);
+  emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
+                        ScheduleNum, ScheduleKind.M1, ScheduleKind.M2, IVSize,
+                        Ordered, IL, LB, UB, ST, Chunk);
+}
 
-  // Call __kmpc_for_static_init(
-  //          ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
-  //          kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
-  //          kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
-  //          kmp_int[32|64] incr, kmp_int[32|64] chunk);
-  if (Chunk == nullptr) {
-    assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static) &&
-           "expected static non-chunked schedule");
-    // If the Chunk was not specified in the clause - use default value 1.
-      Chunk = CGF.Builder.getIntN(IVSize, 1);
-  } else {
-    assert((Schedule == OMP_sch_static_chunked ||
-            Schedule == OMP_ord_static_chunked) &&
-           "expected static chunked schedule");
-  }
-  llvm::Value *Args[] = {
-    emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
-    getThreadID(CGF, Loc),
-    CGF.Builder.getInt32(Schedule), // Schedule type
-    IL.getPointer(),                // &isLastIter
-    LB.getPointer(),                // &LB
-    UB.getPointer(),                // &UB
-    ST.getPointer(),                // &Stride
-    CGF.Builder.getIntN(IVSize, 1), // Incr
-    Chunk                           // Chunk
-  };
-  CGF.EmitRuntimeCall(createForStaticInitFunction(IVSize, IVSigned), Args);
+void CGOpenMPRuntime::emitDistributeStaticInit(
+    CodeGenFunction &CGF, SourceLocation Loc,
+    OpenMPDistScheduleClauseKind SchedKind, unsigned IVSize, bool IVSigned,
+    bool Ordered, Address IL, Address LB, Address UB, Address ST,
+    llvm::Value *Chunk) {
+  OpenMPSchedType ScheduleNum = getRuntimeSchedule(SchedKind, Chunk != nullptr);
+  auto *UpdatedLocation = emitUpdateLocation(CGF, Loc);
+  auto *ThreadId = getThreadID(CGF, Loc);
+  auto *StaticInitFunction = createForStaticInitFunction(IVSize, IVSigned);
+  emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
+                        ScheduleNum, OMPC_SCHEDULE_MODIFIER_unknown,
+                        OMPC_SCHEDULE_MODIFIER_unknown, IVSize, Ordered, IL, LB,
+                        UB, ST, Chunk);
 }
 
 void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
@@ -1889,8 +2563,7 @@
   if (!CGF.HaveInsertPoint())
     return;
   // Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid);
-  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
-                         getThreadID(CGF, Loc)};
+  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
   CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_for_static_fini),
                       Args);
 }
@@ -1902,8 +2575,7 @@
   if (!CGF.HaveInsertPoint())
     return;
   // Call __kmpc_for_dynamic_fini_(4|8)[u](ident_t *loc, kmp_int32 tid);
-  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
-                         getThreadID(CGF, Loc)};
+  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
   CGF.EmitRuntimeCall(createDispatchFiniFunction(IVSize, IVSigned), Args);
 }
 
@@ -1917,7 +2589,8 @@
   //          kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper,
   //          kmp_int[32|64] *p_stride);
   llvm::Value *Args[] = {
-      emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC), getThreadID(CGF, Loc),
+      emitUpdateLocation(CGF, Loc),
+      getThreadID(CGF, Loc),
       IL.getPointer(), // &isLastIter
       LB.getPointer(), // &Lower
       UB.getPointer(), // &Upper
@@ -1996,8 +2669,18 @@
   KmpTaskTRoutine,
   /// \brief Partition id for the untied tasks.
   KmpTaskTPartId,
-  /// \brief Function with call of destructors for private variables.
-  KmpTaskTDestructors,
+  /// Function with call of destructors for private variables.
+  Data1,
+  /// Task priority.
+  Data2,
+  /// (Taskloops only) Lower bound.
+  KmpTaskTLowerBound,
+  /// (Taskloops only) Upper bound.
+  KmpTaskTUpperBound,
+  /// (Taskloops only) Stride.
+  KmpTaskTStride,
+  /// (Taskloops only) Is last iteration flag.
+  KmpTaskTLastIter,
 };
 } // anonymous namespace
 
@@ -2010,11 +2693,11 @@
 void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
     initializeTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
                                     StringRef ParentName, unsigned LineNum,
-                                    unsigned ColNum, unsigned Order) {
+                                    unsigned Order) {
   assert(CGM.getLangOpts().OpenMPIsDevice && "Initialization of entries is "
                                              "only required for the device "
                                              "code generation.");
-  OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum][ColNum] =
+  OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] =
       OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr);
   ++OffloadingEntriesNum;
 }
@@ -2022,30 +2705,27 @@
 void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
     registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
                                   StringRef ParentName, unsigned LineNum,
-                                  unsigned ColNum, llvm::Constant *Addr,
-                                  llvm::Constant *ID) {
+                                  llvm::Constant *Addr, llvm::Constant *ID) {
   // If we are emitting code for a target, the entry is already initialized,
   // only has to be registered.
   if (CGM.getLangOpts().OpenMPIsDevice) {
-    assert(hasTargetRegionEntryInfo(DeviceID, FileID, ParentName, LineNum,
-                                    ColNum) &&
+    assert(hasTargetRegionEntryInfo(DeviceID, FileID, ParentName, LineNum) &&
            "Entry must exist.");
-    auto &Entry = OffloadEntriesTargetRegion[DeviceID][FileID][ParentName]
-                                            [LineNum][ColNum];
+    auto &Entry =
+        OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum];
     assert(Entry.isValid() && "Entry not initialized!");
     Entry.setAddress(Addr);
     Entry.setID(ID);
     return;
   } else {
     OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID);
-    OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum][ColNum] =
-        Entry;
+    OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = Entry;
   }
 }
 
 bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::hasTargetRegionEntryInfo(
-    unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum,
-    unsigned ColNum) const {
+    unsigned DeviceID, unsigned FileID, StringRef ParentName,
+    unsigned LineNum) const {
   auto PerDevice = OffloadEntriesTargetRegion.find(DeviceID);
   if (PerDevice == OffloadEntriesTargetRegion.end())
     return false;
@@ -2058,11 +2738,8 @@
   auto PerLine = PerParentName->second.find(LineNum);
   if (PerLine == PerParentName->second.end())
     return false;
-  auto PerColumn = PerLine->second.find(ColNum);
-  if (PerColumn == PerLine->second.end())
-    return false;
   // Fail if this entry is already registered.
-  if (PerColumn->second.getAddress() || PerColumn->second.getID())
+  if (PerLine->second.getAddress() || PerLine->second.getID())
     return false;
   return true;
 }
@@ -2074,8 +2751,7 @@
     for (auto &F : D.second)
       for (auto &P : F.second)
         for (auto &L : P.second)
-          for (auto &C : L.second)
-            Action(D.first, F.first, P.first(), L.first, C.first, C.second);
+          Action(D.first, F.first, P.first(), L.first, L.second);
 }
 
 /// \brief Create a Ctor/Dtor-like function whose body is emitted through
@@ -2126,11 +2802,11 @@
       CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy());
   llvm::GlobalVariable *HostEntriesBegin = new llvm::GlobalVariable(
       M, OffloadEntryTy, /*isConstant=*/true,
-      llvm::GlobalValue::ExternalLinkage, /*Initializer=*/0,
+      llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
       ".omp_offloading.entries_begin");
   llvm::GlobalVariable *HostEntriesEnd = new llvm::GlobalVariable(
       M, OffloadEntryTy, /*isConstant=*/true,
-      llvm::GlobalValue::ExternalLinkage, /*Initializer=*/0,
+      llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
       ".omp_offloading.entries_end");
 
   // Create all device images
@@ -2142,10 +2818,11 @@
     StringRef T = Devices[i].getTriple();
     auto *ImgBegin = new llvm::GlobalVariable(
         M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
-        /*Initializer=*/0, Twine(".omp_offloading.img_start.") + Twine(T));
+        /*Initializer=*/nullptr,
+        Twine(".omp_offloading.img_start.") + Twine(T));
     auto *ImgEnd = new llvm::GlobalVariable(
         M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
-        /*Initializer=*/0, Twine(".omp_offloading.img_end.") + Twine(T));
+        /*Initializer=*/nullptr, Twine(".omp_offloading.img_end.") + Twine(T));
 
     llvm::Constant *Dev =
         llvm::ConstantStruct::get(DeviceImageTy, ImgBegin, ImgEnd,
@@ -2163,7 +2840,7 @@
       M, DeviceImagesInitTy, /*isConstant=*/true,
       llvm::GlobalValue::InternalLinkage, DeviceImagesInit,
       ".omp_offloading.device_images");
-  DeviceImages->setUnnamedAddr(true);
+  DeviceImages->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
 
   // This is a Zero array to be used in the creation of the constant expressions
   llvm::Constant *Index[] = {llvm::Constant::getNullValue(CGM.Int32Ty),
@@ -2193,12 +2870,14 @@
                                 IdentInfo, C.CharTy);
 
   auto *UnRegFn = createOffloadingBinaryDescriptorFunction(
-      CGM, ".omp_offloading.descriptor_unreg", [&](CodeGenFunction &CGF) {
+      CGM, ".omp_offloading.descriptor_unreg",
+      [&](CodeGenFunction &CGF, PrePostActionTy &) {
         CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_unregister_lib),
                              Desc);
       });
   auto *RegFn = createOffloadingBinaryDescriptorFunction(
-      CGM, ".omp_offloading.descriptor_reg", [&](CodeGenFunction &CGF) {
+      CGM, ".omp_offloading.descriptor_reg",
+      [&](CodeGenFunction &CGF, PrePostActionTy &) {
         CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_register_lib),
                              Desc);
         CGM.getCXXABI().registerGlobalDtor(CGF, RegUnregVar, UnRegFn, Desc);
@@ -2206,15 +2885,16 @@
   return RegFn;
 }
 
-void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *Addr, StringRef Name,
-                                         uint64_t Size) {
+void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID,
+                                         llvm::Constant *Addr, uint64_t Size) {
+  StringRef Name = Addr->getName();
   auto *TgtOffloadEntryType = cast<llvm::StructType>(
       CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy()));
   llvm::LLVMContext &C = CGM.getModule().getContext();
   llvm::Module &M = CGM.getModule();
 
   // Make sure the address has the right type.
-  llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(Addr, CGM.VoidPtrTy);
+  llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(ID, CGM.VoidPtrTy);
 
   // Create constant string with the name.
   llvm::Constant *StrPtrInit = llvm::ConstantDataArray::getString(C, Name);
@@ -2223,7 +2903,7 @@
       new llvm::GlobalVariable(M, StrPtrInit->getType(), /*isConstant=*/true,
                                llvm::GlobalValue::InternalLinkage, StrPtrInit,
                                ".omp_offloading.entry_name");
-  Str->setUnnamedAddr(true);
+  Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   llvm::Constant *StrPtr = llvm::ConstantExpr::getBitCast(Str, CGM.Int8PtrTy);
 
   // Create the entry struct.
@@ -2239,7 +2919,6 @@
   // We can't have any padding between symbols, so we need to have 1-byte
   // alignment.
   Entry->setAlignment(1);
-  return;
 }
 
 void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() {
@@ -2275,7 +2954,6 @@
   // Create function that emits metadata for each target region entry;
   auto &&TargetRegionMetadataEmitter = [&](
       unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned Line,
-      unsigned Column,
       OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion &E) {
     llvm::SmallVector<llvm::Metadata *, 32> Ops;
     // Generate metadata for target regions. Each entry of this metadata
@@ -2285,15 +2963,13 @@
     // - Entry 2 -> File ID of the file where the entry was identified.
     // - Entry 3 -> Mangled name of the function where the entry was identified.
     // - Entry 4 -> Line in the file where the entry was identified.
-    // - Entry 5 -> Column in the file where the entry was identified.
-    // - Entry 6 -> Order the entry was created.
+    // - Entry 5 -> Order the entry was created.
     // The first element of the metadata node is the kind.
     Ops.push_back(getMDInt(E.getKind()));
     Ops.push_back(getMDInt(DeviceID));
     Ops.push_back(getMDInt(FileID));
     Ops.push_back(getMDString(ParentName));
     Ops.push_back(getMDInt(Line));
-    Ops.push_back(getMDInt(Column));
     Ops.push_back(getMDInt(E.getOrder()));
 
     // Save this entry in the right position of the ordered entries array.
@@ -2313,7 +2989,7 @@
                 E)) {
       assert(CE->getID() && CE->getAddress() &&
              "Entry ID and Addr are invalid!");
-      createOffloadEntry(CE->getID(), CE->getAddress()->getName(), /*Size=*/0);
+      createOffloadEntry(CE->getID(), CE->getAddress(), /*Size=*/0);
     } else
       llvm_unreachable("Unsupported entry kind.");
   }
@@ -2368,7 +3044,7 @@
       OffloadEntriesInfoManager.initializeTargetRegionEntryInfo(
           /*DeviceID=*/getMDInt(1), /*FileID=*/getMDInt(2),
           /*ParentName=*/getMDString(3), /*Line=*/getMDInt(4),
-          /*Column=*/getMDInt(5), /*Order=*/getMDInt(6));
+          /*Order=*/getMDInt(5));
       break;
     }
   }
@@ -2512,21 +3188,45 @@
 }
 
 static RecordDecl *
-createKmpTaskTRecordDecl(CodeGenModule &CGM, QualType KmpInt32Ty,
+createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
+                         QualType KmpInt32Ty,
                          QualType KmpRoutineEntryPointerQTy) {
   auto &C = CGM.getContext();
   // Build struct kmp_task_t {
   //         void *              shareds;
   //         kmp_routine_entry_t routine;
   //         kmp_int32           part_id;
-  //         kmp_routine_entry_t destructors;
+  //         kmp_cmplrdata_t     data1;
+  //         kmp_cmplrdata_t     data2;
+  // Additional fields for taskloops only:
+  //         kmp_uint64          lb;
+  //         kmp_uint64          ub;
+  //         kmp_int64           st;
+  //         kmp_int32           liter;
   //       };
+  auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
+  UD->startDefinition();
+  addFieldToRecordDecl(C, UD, KmpInt32Ty);
+  addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
+  UD->completeDefinition();
+  QualType KmpCmplrdataTy = C.getRecordType(UD);
   auto *RD = C.buildImplicitRecord("kmp_task_t");
   RD->startDefinition();
   addFieldToRecordDecl(C, RD, C.VoidPtrTy);
   addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
   addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
+  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  if (isOpenMPTaskLoopDirective(Kind)) {
+    QualType KmpUInt64Ty =
+        CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
+    QualType KmpInt64Ty =
+        CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
+    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    addFieldToRecordDecl(C, RD, KmpInt64Ty);
+    addFieldToRecordDecl(C, RD, KmpInt32Ty);
+  }
   RD->completeDefinition();
   return RD;
 }
@@ -2553,14 +3253,17 @@
 /// argument.
 /// \code
 /// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) {
-///   TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map,
+///   TaskFunction(gtid, &tt->part_id, &tt->privates, task_privates_map, tt,
+///   // For taskloops:
+///   tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
 ///   tt->shareds);
 ///   return 0;
 /// }
 /// \endcode
 static llvm::Value *
 emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
-                      QualType KmpInt32Ty, QualType KmpTaskTWithPrivatesPtrQTy,
+                      OpenMPDirectiveKind Kind, QualType KmpInt32Ty,
+                      QualType KmpTaskTWithPrivatesPtrQTy,
                       QualType KmpTaskTWithPrivatesQTy, QualType KmpTaskTQTy,
                       QualType SharedsPtrTy, llvm::Value *TaskFunction,
                       llvm::Value *TaskPrivatesMap) {
@@ -2584,11 +3287,15 @@
   CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args);
 
   // TaskFunction(gtid, tt->task_data.part_id, &tt->privates, task_privates_map,
+  // tt,
+  // For taskloops:
+  // tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
   // tt->task_data.shareds);
   auto *GtidParam = CGF.EmitLoadOfScalar(
       CGF.GetAddrOfLocalVar(&GtidArg), /*Volatile=*/false, KmpInt32Ty, Loc);
-  LValue TDBase = emitLoadOfPointerLValue(
-      CGF, CGF.GetAddrOfLocalVar(&TaskTypeArg), KmpTaskTWithPrivatesPtrQTy);
+  LValue TDBase = CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(&TaskTypeArg),
+      KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
   auto *KmpTaskTWithPrivatesQTyRD =
       cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
   LValue Base =
@@ -2596,7 +3303,7 @@
   auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
   auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
   auto PartIdLVal = CGF.EmitLValueForField(Base, *PartIdFI);
-  auto *PartidParam = CGF.EmitLoadOfLValue(PartIdLVal, Loc).getScalarVal();
+  auto *PartidParam = PartIdLVal.getPointer();
 
   auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds);
   auto SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI);
@@ -2610,12 +3317,37 @@
     auto PrivatesLVal = CGF.EmitLValueForField(TDBase, *PrivatesFI);
     PrivatesParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
         PrivatesLVal.getPointer(), CGF.VoidPtrTy);
-  } else {
+  } else
     PrivatesParam = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
-  }
 
-  llvm::Value *CallArgs[] = {GtidParam, PartidParam, PrivatesParam,
-                             TaskPrivatesMap, SharedsParam};
+  llvm::Value *CommonArgs[] = {GtidParam, PartidParam, PrivatesParam,
+                               TaskPrivatesMap,
+                               CGF.Builder
+                                   .CreatePointerBitCastOrAddrSpaceCast(
+                                       TDBase.getAddress(), CGF.VoidPtrTy)
+                                   .getPointer()};
+  SmallVector<llvm::Value *, 16> CallArgs(std::begin(CommonArgs),
+                                          std::end(CommonArgs));
+  if (isOpenMPTaskLoopDirective(Kind)) {
+    auto LBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound);
+    auto LBLVal = CGF.EmitLValueForField(Base, *LBFI);
+    auto *LBParam = CGF.EmitLoadOfLValue(LBLVal, Loc).getScalarVal();
+    auto UBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound);
+    auto UBLVal = CGF.EmitLValueForField(Base, *UBFI);
+    auto *UBParam = CGF.EmitLoadOfLValue(UBLVal, Loc).getScalarVal();
+    auto StFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTStride);
+    auto StLVal = CGF.EmitLValueForField(Base, *StFI);
+    auto *StParam = CGF.EmitLoadOfLValue(StLVal, Loc).getScalarVal();
+    auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
+    auto LILVal = CGF.EmitLValueForField(Base, *LIFI);
+    auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal();
+    CallArgs.push_back(LBParam);
+    CallArgs.push_back(UBParam);
+    CallArgs.push_back(StParam);
+    CallArgs.push_back(LIParam);
+  }
+  CallArgs.push_back(SharedsParam);
+
   CGF.EmitCallOrInvoke(TaskFunction, CallArgs);
   CGF.EmitStoreThroughLValue(
       RValue::get(CGF.Builder.getInt32(/*C=*/0)),
@@ -2651,8 +3383,9 @@
   CGF.StartFunction(GlobalDecl(), KmpInt32Ty, DestructorFn, DestructorFnInfo,
                     Args);
 
-  LValue Base = emitLoadOfPointerLValue(
-      CGF, CGF.GetAddrOfLocalVar(&TaskTypeArg), KmpTaskTWithPrivatesPtrQTy);
+  LValue Base = CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(&TaskTypeArg),
+      KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
   auto *KmpTaskTWithPrivatesQTyRD =
       cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
   auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
@@ -2682,6 +3415,7 @@
 emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
                                ArrayRef<const Expr *> PrivateVars,
                                ArrayRef<const Expr *> FirstprivateVars,
+                               ArrayRef<const Expr *> LastprivateVars,
                                QualType PrivatesQTy,
                                ArrayRef<PrivateDataTy> Privates) {
   auto &C = CGM.getContext();
@@ -2712,6 +3446,16 @@
     PrivateVarsPos[VD] = Counter;
     ++Counter;
   }
+  for (auto *E: LastprivateVars) {
+    Args.push_back(ImplicitParamDecl::Create(
+        C, /*DC=*/nullptr, Loc,
+        /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
+                            .withConst()
+                            .withRestrict()));
+    auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
+    PrivateVarsPos[VD] = Counter;
+    ++Counter;
+  }
   auto &TaskPrivatesMapFnInfo =
       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
   auto *TaskPrivatesMapTy =
@@ -2728,16 +3472,17 @@
                     TaskPrivatesMapFnInfo, Args);
 
   // *privi = &.privates.privi;
-  LValue Base = emitLoadOfPointerLValue(
-      CGF, CGF.GetAddrOfLocalVar(&TaskPrivatesArg), TaskPrivatesArg.getType());
+  LValue Base = CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(&TaskPrivatesArg),
+      TaskPrivatesArg.getType()->castAs<PointerType>());
   auto *PrivatesQTyRD = cast<RecordDecl>(PrivatesQTy->getAsTagDecl());
   Counter = 0;
   for (auto *Field : PrivatesQTyRD->fields()) {
     auto FieldLVal = CGF.EmitLValueForField(Base, Field);
     auto *VD = Args[PrivateVarsPos[Privates[Counter].second.Original]];
     auto RefLVal = CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
-    auto RefLoadLVal =
-        emitLoadOfPointerLValue(CGF, RefLVal.getAddress(), RefLVal.getType());
+    auto RefLoadLVal = CGF.EmitLoadOfPointerLValue(
+        RefLVal.getAddress(), RefLVal.getType()->castAs<PointerType>());
     CGF.EmitStoreOfScalar(FieldLVal.getPointer(), RefLoadLVal);
     ++Counter;
   }
@@ -2750,23 +3495,199 @@
   return P1->first < P2->first ? 1 : (P2->first < P1->first ? -1 : 0);
 }
 
-void CGOpenMPRuntime::emitTaskCall(
-    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
-    bool Tied, llvm::PointerIntPair<llvm::Value *, 1, bool> Final,
-    llvm::Value *TaskFunction, QualType SharedsTy, Address Shareds,
-    const Expr *IfCond, ArrayRef<const Expr *> PrivateVars,
-    ArrayRef<const Expr *> PrivateCopies,
-    ArrayRef<const Expr *> FirstprivateVars,
-    ArrayRef<const Expr *> FirstprivateCopies,
-    ArrayRef<const Expr *> FirstprivateInits,
-    ArrayRef<std::pair<OpenMPDependClauseKind, const Expr *>> Dependences) {
-  if (!CGF.HaveInsertPoint())
-    return;
+/// Emit initialization for private variables in task-based directives.
+static void emitPrivatesInit(CodeGenFunction &CGF,
+                             const OMPExecutableDirective &D,
+                             Address KmpTaskSharedsPtr, LValue TDBase,
+                             const RecordDecl *KmpTaskTWithPrivatesQTyRD,
+                             QualType SharedsTy, QualType SharedsPtrTy,
+                             const OMPTaskDataTy &Data,
+                             ArrayRef<PrivateDataTy> Privates, bool ForDup) {
+  auto &C = CGF.getContext();
+  auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
+  LValue PrivatesBase = CGF.EmitLValueForField(TDBase, *FI);
+  LValue SrcBase;
+  if (!Data.FirstprivateVars.empty()) {
+    SrcBase = CGF.MakeAddrLValue(
+        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)),
+        SharedsTy);
+  }
+  CodeGenFunction::CGCapturedStmtInfo CapturesInfo(
+      cast<CapturedStmt>(*D.getAssociatedStmt()));
+  FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
+  for (auto &&Pair : Privates) {
+    auto *VD = Pair.second.PrivateCopy;
+    auto *Init = VD->getAnyInitializer();
+    if (Init && (!ForDup || (isa<CXXConstructExpr>(Init) &&
+                             !CGF.isTrivialInitializer(Init)))) {
+      LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI);
+      if (auto *Elem = Pair.second.PrivateElemInit) {
+        auto *OriginalVD = Pair.second.Original;
+        auto *SharedField = CapturesInfo.lookup(OriginalVD);
+        auto SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField);
+        SharedRefLValue = CGF.MakeAddrLValue(
+            Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)),
+            SharedRefLValue.getType(), AlignmentSource::Decl);
+        QualType Type = OriginalVD->getType();
+        if (Type->isArrayType()) {
+          // Initialize firstprivate array.
+          if (!isa<CXXConstructExpr>(Init) || CGF.isTrivialInitializer(Init)) {
+            // Perform simple memcpy.
+            CGF.EmitAggregateAssign(PrivateLValue.getAddress(),
+                                    SharedRefLValue.getAddress(), Type);
+          } else {
+            // Initialize firstprivate array using element-by-element
+            // initialization.
+            CGF.EmitOMPAggregateAssign(
+                PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type,
+                [&CGF, Elem, Init, &CapturesInfo](Address DestElement,
+                                                  Address SrcElement) {
+                  // Clean up any temporaries needed by the initialization.
+                  CodeGenFunction::OMPPrivateScope InitScope(CGF);
+                  InitScope.addPrivate(
+                      Elem, [SrcElement]() -> Address { return SrcElement; });
+                  (void)InitScope.Privatize();
+                  // Emit initialization for single element.
+                  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(
+                      CGF, &CapturesInfo);
+                  CGF.EmitAnyExprToMem(Init, DestElement,
+                                       Init->getType().getQualifiers(),
+                                       /*IsInitializer=*/false);
+                });
+          }
+        } else {
+          CodeGenFunction::OMPPrivateScope InitScope(CGF);
+          InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address {
+            return SharedRefLValue.getAddress();
+          });
+          (void)InitScope.Privatize();
+          CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo);
+          CGF.EmitExprAsInit(Init, VD, PrivateLValue,
+                             /*capturedByInit=*/false);
+        }
+      } else
+        CGF.EmitExprAsInit(Init, VD, PrivateLValue, /*capturedByInit=*/false);
+    }
+    ++FI;
+  }
+}
+
+/// Check if duplication function is required for taskloops.
+static bool checkInitIsRequired(CodeGenFunction &CGF,
+                                ArrayRef<PrivateDataTy> Privates) {
+  bool InitRequired = false;
+  for (auto &&Pair : Privates) {
+    auto *VD = Pair.second.PrivateCopy;
+    auto *Init = VD->getAnyInitializer();
+    InitRequired = InitRequired || (Init && isa<CXXConstructExpr>(Init) &&
+                                    !CGF.isTrivialInitializer(Init));
+  }
+  return InitRequired;
+}
+
+
+/// Emit task_dup function (for initialization of
+/// private/firstprivate/lastprivate vars and last_iter flag)
+/// \code
+/// void __task_dup_entry(kmp_task_t *task_dst, const kmp_task_t *task_src, int
+/// lastpriv) {
+/// // setup lastprivate flag
+///    task_dst->liter = lastpriv;
+/// // could be constructor calls here...
+/// }
+/// \endcode
+static llvm::Value *
+emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc,
+                    const OMPExecutableDirective &D,
+                    QualType KmpTaskTWithPrivatesPtrQTy,
+                    const RecordDecl *KmpTaskTWithPrivatesQTyRD,
+                    const RecordDecl *KmpTaskTQTyRD, QualType SharedsTy,
+                    QualType SharedsPtrTy, const OMPTaskDataTy &Data,
+                    ArrayRef<PrivateDataTy> Privates, bool WithLastIter) {
   auto &C = CGM.getContext();
-  llvm::SmallVector<PrivateDataTy, 8> Privates;
+  FunctionArgList Args;
+  ImplicitParamDecl DstArg(C, /*DC=*/nullptr, Loc,
+                           /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy);
+  ImplicitParamDecl SrcArg(C, /*DC=*/nullptr, Loc,
+                           /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy);
+  ImplicitParamDecl LastprivArg(C, /*DC=*/nullptr, Loc,
+                                /*Id=*/nullptr, C.IntTy);
+  Args.push_back(&DstArg);
+  Args.push_back(&SrcArg);
+  Args.push_back(&LastprivArg);
+  auto &TaskDupFnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *TaskDupTy = CGM.getTypes().GetFunctionType(TaskDupFnInfo);
+  auto *TaskDup =
+      llvm::Function::Create(TaskDupTy, llvm::GlobalValue::InternalLinkage,
+                             ".omp_task_dup.", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskDup, TaskDupFnInfo);
+  CodeGenFunction CGF(CGM);
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskDup, TaskDupFnInfo, Args);
+
+  LValue TDBase = CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(&DstArg),
+      KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
+  // task_dst->liter = lastpriv;
+  if (WithLastIter) {
+    auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
+    LValue Base = CGF.EmitLValueForField(
+        TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
+    LValue LILVal = CGF.EmitLValueForField(Base, *LIFI);
+    llvm::Value *Lastpriv = CGF.EmitLoadOfScalar(
+        CGF.GetAddrOfLocalVar(&LastprivArg), /*Volatile=*/false, C.IntTy, Loc);
+    CGF.EmitStoreOfScalar(Lastpriv, LILVal);
+  }
+
+  // Emit initial values for private copies (if any).
+  assert(!Privates.empty());
+  Address KmpTaskSharedsPtr = Address::invalid();
+  if (!Data.FirstprivateVars.empty()) {
+    LValue TDBase = CGF.EmitLoadOfPointerLValue(
+        CGF.GetAddrOfLocalVar(&SrcArg),
+        KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
+    LValue Base = CGF.EmitLValueForField(
+        TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
+    KmpTaskSharedsPtr = Address(
+        CGF.EmitLoadOfScalar(CGF.EmitLValueForField(
+                                 Base, *std::next(KmpTaskTQTyRD->field_begin(),
+                                                  KmpTaskTShareds)),
+                             Loc),
+        CGF.getNaturalTypeAlignment(SharedsTy));
+  }
+  emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, TDBase, KmpTaskTWithPrivatesQTyRD,
+                   SharedsTy, SharedsPtrTy, Data, Privates, /*ForDup=*/true);
+  CGF.FinishFunction();
+  return TaskDup;
+}
+
+/// Checks if destructor function is required to be generated.
+/// \return true if cleanups are required, false otherwise.
+static bool
+checkDestructorsRequired(const RecordDecl *KmpTaskTWithPrivatesQTyRD) {
+  bool NeedsCleanup = false;
+  auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
+  auto *PrivateRD = cast<RecordDecl>(FI->getType()->getAsTagDecl());
+  for (auto *FD : PrivateRD->fields()) {
+    NeedsCleanup = NeedsCleanup || FD->getType().isDestructedType();
+    if (NeedsCleanup)
+      break;
+  }
+  return NeedsCleanup;
+}
+
+CGOpenMPRuntime::TaskResultTy
+CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
+                              const OMPExecutableDirective &D,
+                              llvm::Value *TaskFunction, QualType SharedsTy,
+                              Address Shareds, const OMPTaskDataTy &Data) {
+  auto &C = CGM.getContext();
+  llvm::SmallVector<PrivateDataTy, 4> Privates;
   // Aggregate privates and sort them by the alignment.
-  auto I = PrivateCopies.begin();
-  for (auto *E : PrivateVars) {
+  auto I = Data.PrivateCopies.begin();
+  for (auto *E : Data.PrivateVars) {
     auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
     Privates.push_back(std::make_pair(
         C.getDeclAlign(VD),
@@ -2774,16 +3695,26 @@
                          /*PrivateElemInit=*/nullptr)));
     ++I;
   }
-  I = FirstprivateCopies.begin();
-  auto IElemInitRef = FirstprivateInits.begin();
-  for (auto *E : FirstprivateVars) {
+  I = Data.FirstprivateCopies.begin();
+  auto IElemInitRef = Data.FirstprivateInits.begin();
+  for (auto *E : Data.FirstprivateVars) {
     auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
     Privates.push_back(std::make_pair(
         C.getDeclAlign(VD),
         PrivateHelpersTy(
             VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
             cast<VarDecl>(cast<DeclRefExpr>(*IElemInitRef)->getDecl()))));
-    ++I, ++IElemInitRef;
+    ++I;
+    ++IElemInitRef;
+  }
+  I = Data.LastprivateCopies.begin();
+  for (auto *E : Data.LastprivateVars) {
+    auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
+    Privates.push_back(std::make_pair(
+        C.getDeclAlign(VD),
+        PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
+                         /*PrivateElemInit=*/nullptr)));
+    ++I;
   }
   llvm::array_pod_sort(Privates.begin(), Privates.end(),
                        array_pod_sort_comparator);
@@ -2792,8 +3723,8 @@
   emitKmpRoutineEntryT(KmpInt32Ty);
   // Build type kmp_task_t (if not built yet).
   if (KmpTaskTQTy.isNull()) {
-    KmpTaskTQTy = C.getRecordType(
-        createKmpTaskTRecordDecl(CGM, KmpInt32Ty, KmpRoutineEntryPtrQTy));
+    KmpTaskTQTy = C.getRecordType(createKmpTaskTRecordDecl(
+        CGM, D.getDirectiveKind(), KmpInt32Ty, KmpRoutineEntryPtrQTy));
   }
   auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
   // Build particular struct kmp_task_t for the given task.
@@ -2804,7 +3735,7 @@
       C.getPointerType(KmpTaskTWithPrivatesQTy);
   auto *KmpTaskTWithPrivatesTy = CGF.ConvertType(KmpTaskTWithPrivatesQTy);
   auto *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo();
-  auto *KmpTaskTWithPrivatesTySize = getTypeSize(CGF, KmpTaskTWithPrivatesQTy);
+  auto *KmpTaskTWithPrivatesTySize = CGF.getTypeSize(KmpTaskTWithPrivatesQTy);
   QualType SharedsPtrTy = C.getPointerType(SharedsTy);
 
   // Emit initial values for private copies (if any).
@@ -2816,7 +3747,8 @@
   if (!Privates.empty()) {
     auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
     TaskPrivatesMap = emitTaskPrivateMappingFunction(
-        CGM, Loc, PrivateVars, FirstprivateVars, FI->getType(), Privates);
+        CGM, Loc, Data.PrivateVars, Data.FirstprivateVars, Data.LastprivateVars,
+        FI->getType(), Privates);
     TaskPrivatesMap = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
         TaskPrivatesMap, TaskPrivatesMapTy);
   } else {
@@ -2826,8 +3758,9 @@
   // Build a proxy function kmp_int32 .omp_task_entry.(kmp_int32 gtid,
   // kmp_task_t *tt);
   auto *TaskEntry = emitProxyTaskFunction(
-      CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTy,
-      KmpTaskTQTy, SharedsPtrTy, TaskFunction, TaskPrivatesMap);
+      CGM, Loc, D.getDirectiveKind(), KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
+      KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
+      TaskPrivatesMap);
 
   // Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
   // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
@@ -2835,15 +3768,27 @@
   // Task flags. Format is taken from
   // http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h,
   // description of kmp_tasking_flags struct.
-  const unsigned TiedFlag = 0x1;
-  const unsigned FinalFlag = 0x2;
-  unsigned Flags = Tied ? TiedFlag : 0;
+  enum {
+    TiedFlag = 0x1,
+    FinalFlag = 0x2,
+    DestructorsFlag = 0x8,
+    PriorityFlag = 0x20
+  };
+  unsigned Flags = Data.Tied ? TiedFlag : 0;
+  bool NeedsCleanup = false;
+  if (!Privates.empty()) {
+    NeedsCleanup = checkDestructorsRequired(KmpTaskTWithPrivatesQTyRD);
+    if (NeedsCleanup)
+      Flags = Flags | DestructorsFlag;
+  }
+  if (Data.Priority.getInt())
+    Flags = Flags | PriorityFlag;
   auto *TaskFlags =
-      Final.getPointer()
-          ? CGF.Builder.CreateSelect(Final.getPointer(),
+      Data.Final.getPointer()
+          ? CGF.Builder.CreateSelect(Data.Final.getPointer(),
                                      CGF.Builder.getInt32(FinalFlag),
                                      CGF.Builder.getInt32(/*C=*/0))
-          : CGF.Builder.getInt32(Final.getInt() ? FinalFlag : 0);
+          : CGF.Builder.getInt32(Data.Final.getInt() ? FinalFlag : 0);
   TaskFlags = CGF.Builder.CreateOr(TaskFlags, CGF.Builder.getInt32(Flags));
   auto *SharedsSize = CGM.getSize(C.getTypeSizeInChars(SharedsTy));
   llvm::Value *AllocArgs[] = {emitUpdateLocation(CGF, Loc),
@@ -2873,96 +3818,71 @@
     CGF.EmitAggregateCopy(KmpTaskSharedsPtr, Shareds, SharedsTy);
   }
   // Emit initial values for private copies (if any).
-  bool NeedsCleanup = false;
+  TaskResultTy Result;
   if (!Privates.empty()) {
-    auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
-    auto PrivatesBase = CGF.EmitLValueForField(Base, *FI);
-    FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
-    LValue SharedsBase;
-    if (!FirstprivateVars.empty()) {
-      SharedsBase = CGF.MakeAddrLValue(
-          CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-              KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)),
-          SharedsTy);
-    }
-    CodeGenFunction::CGCapturedStmtInfo CapturesInfo(
-        cast<CapturedStmt>(*D.getAssociatedStmt()));
-    for (auto &&Pair : Privates) {
-      auto *VD = Pair.second.PrivateCopy;
-      auto *Init = VD->getAnyInitializer();
-      LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI);
-      if (Init) {
-        if (auto *Elem = Pair.second.PrivateElemInit) {
-          auto *OriginalVD = Pair.second.Original;
-          auto *SharedField = CapturesInfo.lookup(OriginalVD);
-          auto SharedRefLValue =
-              CGF.EmitLValueForField(SharedsBase, SharedField);
-          SharedRefLValue = CGF.MakeAddrLValue(
-              Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)),
-              SharedRefLValue.getType(), AlignmentSource::Decl);
-          QualType Type = OriginalVD->getType();
-          if (Type->isArrayType()) {
-            // Initialize firstprivate array.
-            if (!isa<CXXConstructExpr>(Init) ||
-                CGF.isTrivialInitializer(Init)) {
-              // Perform simple memcpy.
-              CGF.EmitAggregateAssign(PrivateLValue.getAddress(),
-                                      SharedRefLValue.getAddress(), Type);
-            } else {
-              // Initialize firstprivate array using element-by-element
-              // intialization.
-              CGF.EmitOMPAggregateAssign(
-                  PrivateLValue.getAddress(), SharedRefLValue.getAddress(),
-                  Type, [&CGF, Elem, Init, &CapturesInfo](
-                            Address DestElement, Address SrcElement) {
-                    // Clean up any temporaries needed by the initialization.
-                    CodeGenFunction::OMPPrivateScope InitScope(CGF);
-                    InitScope.addPrivate(Elem, [SrcElement]() -> Address {
-                      return SrcElement;
-                    });
-                    (void)InitScope.Privatize();
-                    // Emit initialization for single element.
-                    CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(
-                        CGF, &CapturesInfo);
-                    CGF.EmitAnyExprToMem(Init, DestElement,
-                                         Init->getType().getQualifiers(),
-                                         /*IsInitializer=*/false);
-                  });
-            }
-          } else {
-            CodeGenFunction::OMPPrivateScope InitScope(CGF);
-            InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address {
-              return SharedRefLValue.getAddress();
-            });
-            (void)InitScope.Privatize();
-            CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo);
-            CGF.EmitExprAsInit(Init, VD, PrivateLValue,
-                               /*capturedByInit=*/false);
-          }
-        } else {
-          CGF.EmitExprAsInit(Init, VD, PrivateLValue, /*capturedByInit=*/false);
-        }
-      }
-      NeedsCleanup = NeedsCleanup || FI->getType().isDestructedType();
-      ++FI;
+    emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, Base, KmpTaskTWithPrivatesQTyRD,
+                     SharedsTy, SharedsPtrTy, Data, Privates,
+                     /*ForDup=*/false);
+    if (isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
+        (!Data.LastprivateVars.empty() || checkInitIsRequired(CGF, Privates))) {
+      Result.TaskDupFn = emitTaskDupFunction(
+          CGM, Loc, D, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTyRD,
+          KmpTaskTQTyRD, SharedsTy, SharedsPtrTy, Data, Privates,
+          /*WithLastIter=*/!Data.LastprivateVars.empty());
     }
   }
+  // Fields of union "kmp_cmplrdata_t" for destructors and priority.
+  enum { Priority = 0, Destructors = 1 };
   // Provide pointer to function with destructors for privates.
-  llvm::Value *DestructorFn =
-      NeedsCleanup ? emitDestructorsFunction(CGM, Loc, KmpInt32Ty,
-                                             KmpTaskTWithPrivatesPtrQTy,
-                                             KmpTaskTWithPrivatesQTy)
-                   : llvm::ConstantPointerNull::get(
-                         cast<llvm::PointerType>(KmpRoutineEntryPtrTy));
-  LValue Destructor = CGF.EmitLValueForField(
-      TDBase, *std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTDestructors));
-  CGF.EmitStoreOfScalar(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-                            DestructorFn, KmpRoutineEntryPtrTy),
-                        Destructor);
+  auto FI = std::next(KmpTaskTQTyRD->field_begin(), Data1);
+  auto *KmpCmplrdataUD = (*FI)->getType()->getAsUnionType()->getDecl();
+  if (NeedsCleanup) {
+    llvm::Value *DestructorFn = emitDestructorsFunction(
+        CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
+        KmpTaskTWithPrivatesQTy);
+    LValue Data1LV = CGF.EmitLValueForField(TDBase, *FI);
+    LValue DestructorsLV = CGF.EmitLValueForField(
+        Data1LV, *std::next(KmpCmplrdataUD->field_begin(), Destructors));
+    CGF.EmitStoreOfScalar(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                              DestructorFn, KmpRoutineEntryPtrTy),
+                          DestructorsLV);
+  }
+  // Set priority.
+  if (Data.Priority.getInt()) {
+    LValue Data2LV = CGF.EmitLValueForField(
+        TDBase, *std::next(KmpTaskTQTyRD->field_begin(), Data2));
+    LValue PriorityLV = CGF.EmitLValueForField(
+        Data2LV, *std::next(KmpCmplrdataUD->field_begin(), Priority));
+    CGF.EmitStoreOfScalar(Data.Priority.getPointer(), PriorityLV);
+  }
+  Result.NewTask = NewTask;
+  Result.TaskEntry = TaskEntry;
+  Result.NewTaskNewTaskTTy = NewTaskNewTaskTTy;
+  Result.TDBase = TDBase;
+  Result.KmpTaskTQTyRD = KmpTaskTQTyRD;
+  return Result;
+}
 
+void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                   const OMPExecutableDirective &D,
+                                   llvm::Value *TaskFunction,
+                                   QualType SharedsTy, Address Shareds,
+                                   const Expr *IfCond,
+                                   const OMPTaskDataTy &Data) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  TaskResultTy Result =
+      emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
+  llvm::Value *NewTask = Result.NewTask;
+  llvm::Value *TaskEntry = Result.TaskEntry;
+  llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
+  LValue TDBase = Result.TDBase;
+  RecordDecl *KmpTaskTQTyRD = Result.KmpTaskTQTyRD;
+  auto &C = CGM.getContext();
   // Process list of dependences.
   Address DependenciesArray = Address::invalid();
-  unsigned NumDependencies = Dependences.size();
+  unsigned NumDependencies = Data.Dependences.size();
   if (NumDependencies) {
     // Dependence kind for RTL.
     enum RTLDependenceKindTy { DepIn = 0x01, DepInOut = 0x3 };
@@ -2979,18 +3899,18 @@
       addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
       KmpDependInfoRD->completeDefinition();
       KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
-    } else {
+    } else
       KmpDependInfoRD = cast<RecordDecl>(KmpDependInfoTy->getAsTagDecl());
-    }
     CharUnits DependencySize = C.getTypeSizeInChars(KmpDependInfoTy);
     // Define type kmp_depend_info[<Dependences.size()>];
     QualType KmpDependInfoArrayTy = C.getConstantArrayType(
         KmpDependInfoTy, llvm::APInt(/*numBits=*/64, NumDependencies),
         ArrayType::Normal, /*IndexTypeQuals=*/0);
     // kmp_depend_info[<Dependences.size()>] deps;
-    DependenciesArray = CGF.CreateMemTemp(KmpDependInfoArrayTy);
+    DependenciesArray =
+        CGF.CreateMemTemp(KmpDependInfoArrayTy, ".dep.arr.addr");
     for (unsigned i = 0; i < NumDependencies; ++i) {
-      const Expr *E = Dependences[i].second;
+      const Expr *E = Data.Dependences[i].second;
       auto Addr = CGF.EmitLValue(E);
       llvm::Value *Size;
       QualType Ty = E->getType();
@@ -3004,7 +3924,7 @@
         llvm::Value *UpIntPtr = CGF.Builder.CreatePtrToInt(UpAddr, CGM.SizeTy);
         Size = CGF.Builder.CreateNUWSub(UpIntPtr, LowIntPtr);
       } else
-        Size = getTypeSize(CGF, Ty);
+        Size = CGF.getTypeSize(Ty);
       auto Base = CGF.MakeAddrLValue(
           CGF.Builder.CreateConstArrayGEP(DependenciesArray, i, DependencySize),
           KmpDependInfoTy);
@@ -3020,7 +3940,7 @@
       CGF.EmitStoreOfScalar(Size, LenLVal);
       // deps[i].flags = <Dependences[i].first>;
       RTLDependenceKindTy DepKind;
-      switch (Dependences[i].first) {
+      switch (Data.Dependences[i].first) {
       case OMPC_DEPEND_in:
         DepKind = DepIn;
         break;
@@ -3046,8 +3966,6 @@
 
   // NOTE: routine and part_id fields are intialized by __kmpc_omp_task_alloc()
   // libcall.
-  // Build kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
-  // *new_task);
   // Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
   // kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
   // kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence
@@ -3065,19 +3983,26 @@
     DepTaskArgs[5] = CGF.Builder.getInt32(0);
     DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
   }
-  auto &&ThenCodeGen = [this, NumDependencies,
-                        &TaskArgs, &DepTaskArgs](CodeGenFunction &CGF) {
-    // TODO: add check for untied tasks.    
+  auto &&ThenCodeGen = [this, Loc, &Data, TDBase, KmpTaskTQTyRD,
+                        NumDependencies, &TaskArgs,
+                        &DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) {
+    if (!Data.Tied) {
+      auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
+      auto PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
+      CGF.EmitStoreOfScalar(CGF.Builder.getInt32(0), PartIdLVal);
+    }
     if (NumDependencies) {
-      CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task_with_deps),
-                          DepTaskArgs);
+      CGF.EmitRuntimeCall(
+          createRuntimeFunction(OMPRTL__kmpc_omp_task_with_deps), DepTaskArgs);
     } else {
       CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task),
                           TaskArgs);
     }
+    // Check if parent region is untied and build return for untied task.
+    if (auto *Region =
+            dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
+      Region->emitUntiedSwitch(CGF);
   };
-  typedef CallEndCleanup<std::extent<decltype(TaskArgs)>::value>
-      IfCallEndCleanup;
 
   llvm::Value *DepWaitTaskArgs[6];
   if (NumDependencies) {
@@ -3088,40 +4013,111 @@
     DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
     DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
   }
-  auto &&ElseCodeGen = [this, &TaskArgs, ThreadID, NewTaskNewTaskTTy, TaskEntry,
-                        NumDependencies, &DepWaitTaskArgs](CodeGenFunction &CGF) {
+  auto &&ElseCodeGen = [&TaskArgs, ThreadID, NewTaskNewTaskTTy, TaskEntry,
+                        NumDependencies, &DepWaitTaskArgs](CodeGenFunction &CGF,
+                                                           PrePostActionTy &) {
+    auto &RT = CGF.CGM.getOpenMPRuntime();
     CodeGenFunction::RunCleanupsScope LocalScope(CGF);
     // Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
     // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
     // ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info
     // is specified.
     if (NumDependencies)
-      CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_wait_deps),
+      CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__kmpc_omp_wait_deps),
                           DepWaitTaskArgs);
+    // Call proxy_task_entry(gtid, new_task);
+    auto &&CodeGen = [TaskEntry, ThreadID, NewTaskNewTaskTTy](
+        CodeGenFunction &CGF, PrePostActionTy &Action) {
+      Action.Enter(CGF);
+      llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
+      CGF.EmitCallOrInvoke(TaskEntry, OutlinedFnArgs);
+    };
+
     // Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
     // kmp_task_t *new_task);
-    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task_begin_if0),
-                        TaskArgs);
     // Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
     // kmp_task_t *new_task);
-    CGF.EHStack.pushCleanup<IfCallEndCleanup>(
-        NormalAndEHCleanup,
-        createRuntimeFunction(OMPRTL__kmpc_omp_task_complete_if0),
-        llvm::makeArrayRef(TaskArgs));
-
-    // Call proxy_task_entry(gtid, new_task);
-    llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
-    CGF.EmitCallOrInvoke(TaskEntry, OutlinedFnArgs);
+    RegionCodeGenTy RCG(CodeGen);
+    CommonActionTy Action(
+        RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_begin_if0), TaskArgs,
+        RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_complete_if0), TaskArgs);
+    RCG.setAction(Action);
+    RCG(CGF);
   };
 
-  if (IfCond) {
+  if (IfCond)
     emitOMPIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen);
-  } else {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    ThenCodeGen(CGF);
+  else {
+    RegionCodeGenTy ThenRCG(ThenCodeGen);
+    ThenRCG(CGF);
   }
 }
 
+void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                       const OMPLoopDirective &D,
+                                       llvm::Value *TaskFunction,
+                                       QualType SharedsTy, Address Shareds,
+                                       const Expr *IfCond,
+                                       const OMPTaskDataTy &Data) {
+  if (!CGF.HaveInsertPoint())
+    return;
+  TaskResultTy Result =
+      emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
+  // NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
+  // libcall.
+  // Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
+  // if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
+  // sched, kmp_uint64 grainsize, void *task_dup);
+  llvm::Value *ThreadID = getThreadID(CGF, Loc);
+  llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
+  llvm::Value *IfVal;
+  if (IfCond) {
+    IfVal = CGF.Builder.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.IntTy,
+                                      /*isSigned=*/true);
+  } else
+    IfVal = llvm::ConstantInt::getSigned(CGF.IntTy, /*V=*/1);
+
+  LValue LBLVal = CGF.EmitLValueForField(
+      Result.TDBase,
+      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
+  auto *LBVar =
+      cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
+  CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
+                       /*IsInitializer=*/true);
+  LValue UBLVal = CGF.EmitLValueForField(
+      Result.TDBase,
+      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
+  auto *UBVar =
+      cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
+  CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
+                       /*IsInitializer=*/true);
+  LValue StLVal = CGF.EmitLValueForField(
+      Result.TDBase,
+      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
+  auto *StVar =
+      cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
+  CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
+                       /*IsInitializer=*/true);
+  enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
+  llvm::Value *TaskArgs[] = {
+      UpLoc, ThreadID, Result.NewTask, IfVal, LBLVal.getPointer(),
+      UBLVal.getPointer(), CGF.EmitLoadOfScalar(StLVal, SourceLocation()),
+      llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0),
+      llvm::ConstantInt::getSigned(
+          CGF.IntTy, Data.Schedule.getPointer()
+                         ? Data.Schedule.getInt() ? NumTasks : Grainsize
+                         : NoSchedule),
+      Data.Schedule.getPointer()
+          ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
+                                      /*isSigned=*/false)
+          : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0),
+      Result.TaskDupFn
+          ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Result.TaskDupFn,
+                                                            CGF.VoidPtrTy)
+          : llvm::ConstantPointerNull::get(CGF.VoidPtrTy)};
+  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskloop), TaskArgs);
+}
+
 /// \brief Emit reduction operation for each element of array (required for
 /// array sections) LHS op = RHS.
 /// \param Type Type of array.
@@ -3202,6 +4198,26 @@
   CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
+/// Emit reduction combiner. If the combiner is a simple expression emit it as
+/// is, otherwise consider it as combiner of UDR decl and emit it as a call of
+/// UDR combiner function.
+static void emitReductionCombiner(CodeGenFunction &CGF,
+                                  const Expr *ReductionOp) {
+  if (auto *CE = dyn_cast<CallExpr>(ReductionOp))
+    if (auto *OVE = dyn_cast<OpaqueValueExpr>(CE->getCallee()))
+      if (auto *DRE =
+              dyn_cast<DeclRefExpr>(OVE->getSourceExpr()->IgnoreImpCasts()))
+        if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(DRE->getDecl())) {
+          std::pair<llvm::Function *, llvm::Function *> Reduction =
+              CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD);
+          RValue Func = RValue::get(Reduction.first);
+          CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func);
+          CGF.EmitIgnoredExpr(ReductionOp);
+          return;
+        }
+  CGF.EmitIgnoredExpr(ReductionOp);
+}
+
 static llvm::Value *emitReductionFunction(CodeGenModule &CGM,
                                           llvm::Type *ArgsType,
                                           ArrayRef<const Expr *> Privates,
@@ -3251,17 +4267,16 @@
       return emitAddrOfVarFromArray(CGF, LHS, Idx, LHSVar);
     });
     QualType PrivTy = (*IPriv)->getType();
-    if (PrivTy->isArrayType()) {
+    if (PrivTy->isVariablyModifiedType()) {
       // Get array size and emit VLA type.
       ++Idx;
       Address Elem =
           CGF.Builder.CreateConstArrayGEP(LHS, Idx, CGF.getPointerSize());
       llvm::Value *Ptr = CGF.Builder.CreateLoad(Elem);
+      auto *VLA = CGF.getContext().getAsVariableArrayType(PrivTy);
+      auto *OVE = cast<OpaqueValueExpr>(VLA->getSizeExpr());
       CodeGenFunction::OpaqueValueMapping OpaqueMap(
-          CGF,
-          cast<OpaqueValueExpr>(
-              CGF.getContext().getAsVariableArrayType(PrivTy)->getSizeExpr()),
-          RValue::get(CGF.Builder.CreatePtrToInt(Ptr, CGF.SizeTy)));
+          CGF, OVE, RValue::get(CGF.Builder.CreatePtrToInt(Ptr, CGF.SizeTy)));
       CGF.EmitVariablyModifiedType(PrivTy);
     }
   }
@@ -3274,20 +4289,42 @@
       // Emit reduction for array section.
       auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
       auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-      EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar,
-                                [=](CodeGenFunction &CGF, const Expr *,
-                                    const Expr *,
-                                    const Expr *) { CGF.EmitIgnoredExpr(E); });
+      EmitOMPAggregateReduction(
+          CGF, (*IPriv)->getType(), LHSVar, RHSVar,
+          [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
+            emitReductionCombiner(CGF, E);
+          });
     } else
       // Emit reduction for array subscript or single variable.
-      CGF.EmitIgnoredExpr(E);
-    ++IPriv, ++ILHS, ++IRHS;
+      emitReductionCombiner(CGF, E);
+    ++IPriv;
+    ++ILHS;
+    ++IRHS;
   }
   Scope.ForceCleanup();
   CGF.FinishFunction();
   return Fn;
 }
 
+static void emitSingleReductionCombiner(CodeGenFunction &CGF,
+                                        const Expr *ReductionOp,
+                                        const Expr *PrivateRef,
+                                        const DeclRefExpr *LHS,
+                                        const DeclRefExpr *RHS) {
+  if (PrivateRef->getType()->isArrayType()) {
+    // Emit reduction for array section.
+    auto *LHSVar = cast<VarDecl>(LHS->getDecl());
+    auto *RHSVar = cast<VarDecl>(RHS->getDecl());
+    EmitOMPAggregateReduction(
+        CGF, PrivateRef->getType(), LHSVar, RHSVar,
+        [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
+          emitReductionCombiner(CGF, ReductionOp);
+        });
+  } else
+    // Emit reduction for array subscript or single variable.
+    emitReductionCombiner(CGF, ReductionOp);
+}
+
 void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
                                     ArrayRef<const Expr *> Privates,
                                     ArrayRef<const Expr *> LHSExprs,
@@ -3339,16 +4376,11 @@
     auto ILHS = LHSExprs.begin();
     auto IRHS = RHSExprs.begin();
     for (auto *E : ReductionOps) {
-      if ((*IPriv)->getType()->isArrayType()) {
-        auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
-        auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-        EmitOMPAggregateReduction(
-            CGF, (*IPriv)->getType(), LHSVar, RHSVar,
-            [=](CodeGenFunction &CGF, const Expr *, const Expr *,
-                const Expr *) { CGF.EmitIgnoredExpr(E); });
-      } else
-        CGF.EmitIgnoredExpr(E);
-      ++IPriv, ++ILHS, ++IRHS;
+      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
+                                  cast<DeclRefExpr>(*IRHS));
+      ++IPriv;
+      ++ILHS;
+      ++IRHS;
     }
     return;
   }
@@ -3357,7 +4389,7 @@
   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
   auto Size = RHSExprs.size();
   for (auto *E : Privates) {
-    if (E->getType()->isArrayType())
+    if (E->getType()->isVariablyModifiedType())
       // Reserve place for array size.
       ++Size;
   }
@@ -3376,20 +4408,18 @@
         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
             CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
         Elem);
-    if ((*IPriv)->getType()->isArrayType()) {
+    if ((*IPriv)->getType()->isVariablyModifiedType()) {
       // Store array size.
       ++Idx;
       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                              CGF.getPointerSize());
-      CGF.Builder.CreateStore(
-          CGF.Builder.CreateIntToPtr(
-              CGF.Builder.CreateIntCast(
-                  CGF.getVLASize(CGF.getContext().getAsVariableArrayType(
-                                     (*IPriv)->getType()))
-                      .first,
-                  CGF.SizeTy, /*isSigned=*/false),
-              CGF.VoidPtrTy),
-          Elem);
+      llvm::Value *Size = CGF.Builder.CreateIntCast(
+          CGF.getVLASize(
+                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
+              .first,
+          CGF.SizeTy, /*isSigned=*/false);
+      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
+                              Elem);
     }
   }
 
@@ -3403,11 +4433,9 @@
 
   // 4. Build res = __kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
   // RedList, reduce_func, &<lock>);
-  auto *IdentTLoc = emitUpdateLocation(
-      CGF, Loc,
-      static_cast<OpenMPLocationFlags>(OMP_IDENT_KMPC | OMP_ATOMIC_REDUCE));
+  auto *IdentTLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
   auto *ThreadId = getThreadID(CGF, Loc);
-  auto *ReductionArrayTySize = getTypeSize(CGF, ReductionArrayTy);
+  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
   auto *RL =
     CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList.getPointer(),
                                                     CGF.VoidPtrTy);
@@ -3439,38 +4467,33 @@
   SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
   CGF.EmitBlock(Case1BB);
 
-  {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    // Add emission of __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
-    llvm::Value *EndArgs[] = {
-        IdentTLoc, // ident_t *<loc>
-        ThreadId,  // i32 <gtid>
-        Lock       // kmp_critical_name *&<lock>
-    };
-    CGF.EHStack
-        .pushCleanup<CallEndCleanup<std::extent<decltype(EndArgs)>::value>>(
-            NormalAndEHCleanup,
-            createRuntimeFunction(WithNowait ? OMPRTL__kmpc_end_reduce_nowait
-                                             : OMPRTL__kmpc_end_reduce),
-            llvm::makeArrayRef(EndArgs));
+  // Add emission of __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
+  llvm::Value *EndArgs[] = {
+      IdentTLoc, // ident_t *<loc>
+      ThreadId,  // i32 <gtid>
+      Lock       // kmp_critical_name *&<lock>
+  };
+  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps](
+      CodeGenFunction &CGF, PrePostActionTy &Action) {
     auto IPriv = Privates.begin();
     auto ILHS = LHSExprs.begin();
     auto IRHS = RHSExprs.begin();
     for (auto *E : ReductionOps) {
-      if ((*IPriv)->getType()->isArrayType()) {
-        // Emit reduction for array section.
-        auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
-        auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-        EmitOMPAggregateReduction(
-            CGF, (*IPriv)->getType(), LHSVar, RHSVar,
-            [=](CodeGenFunction &CGF, const Expr *, const Expr *,
-                const Expr *) { CGF.EmitIgnoredExpr(E); });
-      } else
-        // Emit reduction for array subscript or single variable.
-        CGF.EmitIgnoredExpr(E);
-      ++IPriv, ++ILHS, ++IRHS;
+      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
+                                  cast<DeclRefExpr>(*IRHS));
+      ++IPriv;
+      ++ILHS;
+      ++IRHS;
     }
-  }
+  };
+  RegionCodeGenTy RCG(CodeGen);
+  CommonActionTy Action(
+      nullptr, llvm::None,
+      createRuntimeFunction(WithNowait ? OMPRTL__kmpc_end_reduce_nowait
+                                       : OMPRTL__kmpc_end_reduce),
+      EndArgs);
+  RCG.setAction(Action);
+  RCG(CGF);
 
   CGF.EmitBranch(DefaultBB);
 
@@ -3483,101 +4506,113 @@
   SwInst->addCase(CGF.Builder.getInt32(2), Case2BB);
   CGF.EmitBlock(Case2BB);
 
-  {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    if (!WithNowait) {
-      // Add emission of __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
-      llvm::Value *EndArgs[] = {
-          IdentTLoc, // ident_t *<loc>
-          ThreadId,  // i32 <gtid>
-          Lock       // kmp_critical_name *&<lock>
-      };
-      CGF.EHStack
-          .pushCleanup<CallEndCleanup<std::extent<decltype(EndArgs)>::value>>(
-              NormalAndEHCleanup,
-              createRuntimeFunction(OMPRTL__kmpc_end_reduce),
-              llvm::makeArrayRef(EndArgs));
-    }
+  auto &&AtomicCodeGen = [Loc, &Privates, &LHSExprs, &RHSExprs, &ReductionOps](
+      CodeGenFunction &CGF, PrePostActionTy &Action) {
     auto ILHS = LHSExprs.begin();
     auto IRHS = RHSExprs.begin();
     auto IPriv = Privates.begin();
     for (auto *E : ReductionOps) {
-        const Expr *XExpr = nullptr;
-        const Expr *EExpr = nullptr;
-        const Expr *UpExpr = nullptr;
-        BinaryOperatorKind BO = BO_Comma;
-        if (auto *BO = dyn_cast<BinaryOperator>(E)) {
-          if (BO->getOpcode() == BO_Assign) {
-            XExpr = BO->getLHS();
-            UpExpr = BO->getRHS();
-          }
+      const Expr *XExpr = nullptr;
+      const Expr *EExpr = nullptr;
+      const Expr *UpExpr = nullptr;
+      BinaryOperatorKind BO = BO_Comma;
+      if (auto *BO = dyn_cast<BinaryOperator>(E)) {
+        if (BO->getOpcode() == BO_Assign) {
+          XExpr = BO->getLHS();
+          UpExpr = BO->getRHS();
         }
-        // Try to emit update expression as a simple atomic.
-        auto *RHSExpr = UpExpr;
-        if (RHSExpr) {
-          // Analyze RHS part of the whole expression.
-          if (auto *ACO = dyn_cast<AbstractConditionalOperator>(
-                  RHSExpr->IgnoreParenImpCasts())) {
-            // If this is a conditional operator, analyze its condition for
-            // min/max reduction operator.
-            RHSExpr = ACO->getCond();
-          }
-          if (auto *BORHS =
-                  dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
-            EExpr = BORHS->getRHS();
-            BO = BORHS->getOpcode();
-          }
+      }
+      // Try to emit update expression as a simple atomic.
+      auto *RHSExpr = UpExpr;
+      if (RHSExpr) {
+        // Analyze RHS part of the whole expression.
+        if (auto *ACO = dyn_cast<AbstractConditionalOperator>(
+                RHSExpr->IgnoreParenImpCasts())) {
+          // If this is a conditional operator, analyze its condition for
+          // min/max reduction operator.
+          RHSExpr = ACO->getCond();
         }
-        if (XExpr) {
-          auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
-          auto &&AtomicRedGen = [this, BO, VD, IPriv,
-                                 Loc](CodeGenFunction &CGF, const Expr *XExpr,
-                                      const Expr *EExpr, const Expr *UpExpr) {
-            LValue X = CGF.EmitLValue(XExpr);
-            RValue E;
-            if (EExpr)
-              E = CGF.EmitAnyExpr(EExpr);
-            CGF.EmitOMPAtomicSimpleUpdateExpr(
-                X, E, BO, /*IsXLHSInRHSPart=*/true, llvm::Monotonic, Loc,
-                [&CGF, UpExpr, VD, IPriv](RValue XRValue) {
-                  CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
-                  PrivateScope.addPrivate(VD, [&CGF, VD, XRValue]() -> Address {
-                    Address LHSTemp = CGF.CreateMemTemp(VD->getType());
-                    CGF.EmitStoreThroughLValue(
-                        XRValue, CGF.MakeAddrLValue(LHSTemp, VD->getType()));
-                    return LHSTemp;
-                  });
-                  (void)PrivateScope.Privatize();
-                  return CGF.EmitAnyExpr(UpExpr);
-                });
-          };
-          if ((*IPriv)->getType()->isArrayType()) {
-            // Emit atomic reduction for array section.
-            auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-            EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), VD, RHSVar,
-                                      AtomicRedGen, XExpr, EExpr, UpExpr);
-          } else
-            // Emit atomic reduction for array subscript or single variable.
-            AtomicRedGen(CGF, XExpr, EExpr, UpExpr);
-        } else {
-          // Emit as a critical region.
-          auto &&CritRedGen = [this, E, Loc](CodeGenFunction &CGF, const Expr *,
-                                             const Expr *, const Expr *) {
-            emitCriticalRegion(
-                CGF, ".atomic_reduction",
-                [E](CodeGenFunction &CGF) { CGF.EmitIgnoredExpr(E); }, Loc);
-          };
-          if ((*IPriv)->getType()->isArrayType()) {
-            auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
-            auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-            EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar,
-                                      CritRedGen);
-          } else
-            CritRedGen(CGF, nullptr, nullptr, nullptr);
+        if (auto *BORHS =
+                dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+          EExpr = BORHS->getRHS();
+          BO = BORHS->getOpcode();
         }
-      ++ILHS, ++IRHS, ++IPriv;
+      }
+      if (XExpr) {
+        auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
+        auto &&AtomicRedGen = [BO, VD, IPriv,
+                               Loc](CodeGenFunction &CGF, const Expr *XExpr,
+                                    const Expr *EExpr, const Expr *UpExpr) {
+          LValue X = CGF.EmitLValue(XExpr);
+          RValue E;
+          if (EExpr)
+            E = CGF.EmitAnyExpr(EExpr);
+          CGF.EmitOMPAtomicSimpleUpdateExpr(
+              X, E, BO, /*IsXLHSInRHSPart=*/true,
+              llvm::AtomicOrdering::Monotonic, Loc,
+              [&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) {
+                CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+                PrivateScope.addPrivate(
+                    VD, [&CGF, VD, XRValue, Loc]() -> Address {
+                      Address LHSTemp = CGF.CreateMemTemp(VD->getType());
+                      CGF.emitOMPSimpleStore(
+                          CGF.MakeAddrLValue(LHSTemp, VD->getType()), XRValue,
+                          VD->getType().getNonReferenceType(), Loc);
+                      return LHSTemp;
+                    });
+                (void)PrivateScope.Privatize();
+                return CGF.EmitAnyExpr(UpExpr);
+              });
+        };
+        if ((*IPriv)->getType()->isArrayType()) {
+          // Emit atomic reduction for array section.
+          auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
+          EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), VD, RHSVar,
+                                    AtomicRedGen, XExpr, EExpr, UpExpr);
+        } else
+          // Emit atomic reduction for array subscript or single variable.
+          AtomicRedGen(CGF, XExpr, EExpr, UpExpr);
+      } else {
+        // Emit as a critical region.
+        auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
+                                     const Expr *, const Expr *) {
+          auto &RT = CGF.CGM.getOpenMPRuntime();
+          RT.emitCriticalRegion(
+              CGF, ".atomic_reduction",
+              [=](CodeGenFunction &CGF, PrePostActionTy &Action) {
+                Action.Enter(CGF);
+                emitReductionCombiner(CGF, E);
+              },
+              Loc);
+        };
+        if ((*IPriv)->getType()->isArrayType()) {
+          auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
+          auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
+          EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar,
+                                    CritRedGen);
+        } else
+          CritRedGen(CGF, nullptr, nullptr, nullptr);
+      }
+      ++ILHS;
+      ++IRHS;
+      ++IPriv;
     }
-  }
+  };
+  RegionCodeGenTy AtomicRCG(AtomicCodeGen);
+  if (!WithNowait) {
+    // Add emission of __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+    llvm::Value *EndArgs[] = {
+        IdentTLoc, // ident_t *<loc>
+        ThreadId,  // i32 <gtid>
+        Lock       // kmp_critical_name *&<lock>
+    };
+    CommonActionTy Action(nullptr, llvm::None,
+                          createRuntimeFunction(OMPRTL__kmpc_end_reduce),
+                          EndArgs);
+    AtomicRCG.setAction(Action);
+    AtomicRCG(CGF);
+  } else
+    AtomicRCG(CGF);
 
   CGF.EmitBranch(DefaultBB);
   CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
@@ -3592,6 +4627,8 @@
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
   // Ignore return result until untied tasks are supported.
   CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskwait), Args);
+  if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
+    Region->emitUntiedSwitch(CGF);
 }
 
 void CGOpenMPRuntime::emitInlinedDirective(CodeGenFunction &CGF,
@@ -3612,7 +4649,7 @@
   CancelSections = 3,
   CancelTaskgroup = 4
 };
-}
+} // anonymous namespace
 
 static RTCancelKind getCancellationKind(OpenMPDirectiveKind CancelRegion) {
   RTCancelKind CancelKind = CancelNoreq;
@@ -3638,8 +4675,6 @@
   // global_tid, kmp_int32 cncl_kind);
   if (auto *OMPRegionInfo =
           dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
-    if (OMPRegionInfo->getDirectiveKind() == OMPD_single)
-      return;
     if (OMPRegionInfo->hasCancel()) {
       llvm::Value *Args[] = {
           emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
@@ -3676,16 +4711,15 @@
   // kmp_int32 cncl_kind);
   if (auto *OMPRegionInfo =
           dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
-    if (OMPRegionInfo->getDirectiveKind() == OMPD_single)
-      return;
-    auto &&ThenGen = [this, Loc, CancelRegion,
-                      OMPRegionInfo](CodeGenFunction &CGF) {
+    auto &&ThenGen = [Loc, CancelRegion, OMPRegionInfo](CodeGenFunction &CGF,
+                                                        PrePostActionTy &) {
+      auto &RT = CGF.CGM.getOpenMPRuntime();
       llvm::Value *Args[] = {
-          emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
+          RT.emitUpdateLocation(CGF, Loc), RT.getThreadID(CGF, Loc),
           CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
       // Ignore return result until untied tasks are supported.
-      auto *Result =
-          CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_cancel), Args);
+      auto *Result = CGF.EmitRuntimeCall(
+          RT.createRuntimeFunction(OMPRTL__kmpc_cancel), Args);
       // if (__kmpc_cancel()) {
       //  __kmpc_cancel_barrier();
       //   exit from construct;
@@ -3696,7 +4730,7 @@
       CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
       CGF.EmitBlock(ExitBB);
       // __kmpc_cancel_barrier();
-      emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false);
+      RT.emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false);
       // exit from construct;
       auto CancelDest =
           CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
@@ -3704,18 +4738,21 @@
       CGF.EmitBlock(ContBB, /*IsFinished=*/true);
     };
     if (IfCond)
-      emitOMPIfClause(CGF, IfCond, ThenGen, [](CodeGenFunction &) {});
-    else
-      ThenGen(CGF);
+      emitOMPIfClause(CGF, IfCond, ThenGen,
+                      [](CodeGenFunction &, PrePostActionTy &) {});
+    else {
+      RegionCodeGenTy ThenRCG(ThenGen);
+      ThenRCG(CGF);
+    }
   }
 }
 
 /// \brief Obtain information that uniquely identifies a target entry. This
-/// consists of the file and device IDs as well as line and column numbers
-/// associated with the relevant entry source location.
+/// consists of the file and device IDs as well as line number associated with
+/// the relevant entry source location.
 static void getTargetEntryUniqueInfo(ASTContext &C, SourceLocation Loc,
                                      unsigned &DeviceID, unsigned &FileID,
-                                     unsigned &LineNum, unsigned &ColumnNum) {
+                                     unsigned &LineNum) {
 
   auto &SM = C.getSourceManager();
 
@@ -3735,49 +4772,45 @@
   DeviceID = ID.getDevice();
   FileID = ID.getFile();
   LineNum = PLoc.getLine();
-  ColumnNum = PLoc.getColumn();
-  return;
 }
 
 void CGOpenMPRuntime::emitTargetOutlinedFunction(
     const OMPExecutableDirective &D, StringRef ParentName,
     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
-    bool IsOffloadEntry) {
-
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
-  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   IsOffloadEntry, CodeGen);
+}
 
-  // Emit target region as a standalone region.
-  auto &&CodeGen = [&CS](CodeGenFunction &CGF) {
-    CGF.EmitStmt(CS.getCapturedStmt());
-  };
-
-  // Create a unique name for the proxy/entry function that using the source
-  // location information of the current target region. The name will be
-  // something like:
+void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  // Create a unique name for the entry function using the source location
+  // information of the current target region. The name will be something like:
   //
-  // .omp_offloading.DD_FFFF.PP.lBB.cCC
+  // __omp_offloading_DD_FFFF_PP_lBB
   //
   // where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
-  // mangled name of the function that encloses the target region, BB is the
-  // line number of the target region, and CC is the column number of the target
-  // region.
+  // mangled name of the function that encloses the target region and BB is the
+  // line number of the target region.
 
   unsigned DeviceID;
   unsigned FileID;
   unsigned Line;
-  unsigned Column;
   getTargetEntryUniqueInfo(CGM.getContext(), D.getLocStart(), DeviceID, FileID,
-                           Line, Column);
+                           Line);
   SmallString<64> EntryFnName;
   {
     llvm::raw_svector_ostream OS(EntryFnName);
-    OS << ".omp_offloading" << llvm::format(".%x", DeviceID)
-       << llvm::format(".%x.", FileID) << ParentName << ".l" << Line << ".c"
-       << Column;
+    OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
+       << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
   }
 
+  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+
   CodeGenFunction CGF(CGM, true);
   CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName);
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
@@ -3811,8 +4844,1038 @@
 
   // Register the information for the entry associated with this target region.
   OffloadEntriesInfoManager.registerTargetRegionEntryInfo(
-      DeviceID, FileID, ParentName, Line, Column, OutlinedFn, OutlinedFnID);
-  return;
+      DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID);
+}
+
+/// Skip through nested CompoundStmts by following each one's body_front().
+static const Stmt *ignoreCompoundStmts(const Stmt *Body) {
+  while (auto *CS = dyn_cast_or_null<CompoundStmt>(Body))
+    Body = CS->body_front();
+
+  return Body;
+}
+
+/// \brief Emit the num_teams clause of an enclosed teams directive at the
+/// target region scope, converted to a 32-bit integer. Return nullptr if no
+/// teams directive is enclosed in the target directive, and the constant 0 if
+/// the enclosed teams directive has no num_teams clause.
+static llvm::Value *
+emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
+                                     CodeGenFunction &CGF,
+                                     const OMPExecutableDirective &D) {
+
+  assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
+                                              "teams directive expected to be "
+                                              "emitted only for the host!");
+
+  // FIXME: For the moment we do not support combined directives with target and
+  // teams, so we do not expect to get any num_teams clause in the provided
+  // directive. Once we support that, this assertion can be replaced by the
+  // actual emission of the clause expression.
+  assert(D.getSingleClause<OMPNumTeamsClause>() == nullptr &&
+         "Not expecting clause in directive.");
+
+  // If the current target region has a teams region enclosed, we need to get
+  // the number of teams to pass to the runtime function call. This is done
+  // by generating the expression in an inlined region. This is required because
+  // the expression is captured in the enclosing target environment when the
+  // teams directive is not combined with target.
+
+  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+
+  // FIXME: Accommodate other combined directives with teams when they become
+  // available.
+  if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
+          ignoreCompoundStmts(CS.getCapturedStmt()))) {
+    if (auto *NTE = TeamsDir->getSingleClause<OMPNumTeamsClause>()) {
+      CGOpenMPInnerExprInfo CGInfo(CGF, CS);
+      CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
+      llvm::Value *NumTeams = CGF.EmitScalarExpr(NTE->getNumTeams());
+      return CGF.Builder.CreateIntCast(NumTeams, CGF.Int32Ty,
+                                       /*IsSigned=*/true);
+    }
+
+    // If we have an enclosed teams directive but no num_teams clause we use
+    // the default value 0.
+    return CGF.Builder.getInt32(0);
+  }
+
+  // No teams associated with the directive.
+  return nullptr;
+}
+
+/// \brief Emit the thread_limit clause of an enclosed teams directive at the
+/// target region scope, converted to a 32-bit integer. Return nullptr if no
+/// teams directive is enclosed in the target directive, and the constant 0 if
+/// the enclosed teams directive has no thread_limit clause.
+static llvm::Value *
+emitThreadLimitClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
+                                        CodeGenFunction &CGF,
+                                        const OMPExecutableDirective &D) {
+
+  assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
+                                              "teams directive expected to be "
+                                              "emitted only for the host!");
+
+  // FIXME: For the moment we do not support combined directives with target and
+  // teams, so we do not expect to get any thread_limit clause in the provided
+  // directive. Once we support that, this assertion can be replaced by the
+  // actual emission of the clause expression.
+  assert(D.getSingleClause<OMPThreadLimitClause>() == nullptr &&
+         "Not expecting clause in directive.");
+
+  // If the current target region has a teams region enclosed, we need to get
+  // the thread limit to pass to the runtime function call. This is done
+  // by generating the expression in an inlined region. This is required because
+  // the expression is captured in the enclosing target environment when the
+  // teams directive is not combined with target.
+
+  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+
+  // FIXME: Accommodate other combined directives with teams when they become
+  // available.
+  if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
+          ignoreCompoundStmts(CS.getCapturedStmt()))) {
+    if (auto *TLE = TeamsDir->getSingleClause<OMPThreadLimitClause>()) {
+      CGOpenMPInnerExprInfo CGInfo(CGF, CS);
+      CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
+      llvm::Value *ThreadLimit = CGF.EmitScalarExpr(TLE->getThreadLimit());
+      return CGF.Builder.CreateIntCast(ThreadLimit, CGF.Int32Ty,
+                                       /*IsSigned=*/true);
+    }
+
+    // If we have an enclosed teams directive but no thread_limit clause we use
+    // the default value 0.
+    return CGF.Builder.getInt32(0);
+  }
+
+  // No teams associated with the directive.
+  return nullptr;
+}
+
+namespace {
+/// \brief Utility to handle information from clauses associated with a given
+/// construct that use mappable expressions (e.g. 'map' clause, 'to' clause).
+/// It provides a convenient interface to obtain the information and generate
+/// code for that information.
+class MappableExprsHandler {
+public:
+  /// \brief Values for bit flags used to specify the mapping type for
+  /// offloading.
+  enum OpenMPOffloadMappingFlags {
+    /// \brief Allocate memory on the device and move data from host to device.
+    OMP_MAP_TO = 0x01,
+    /// \brief Allocate memory on the device and move data from device to host.
+    OMP_MAP_FROM = 0x02,
+    /// \brief Always perform the requested mapping action on the element, even
+    /// if it was already mapped before.
+    OMP_MAP_ALWAYS = 0x04,
+    /// \brief Delete the element from the device environment, ignoring the
+    /// current reference count associated with the element.
+    OMP_MAP_DELETE = 0x08,
+    /// \brief The element being mapped is a pointer, therefore the pointee
+    /// should be mapped as well.
+    OMP_MAP_IS_PTR = 0x10,
+    /// \brief This flag signals that an argument is the first one relating to
+    /// a map/private clause expression. For some cases a single
+    /// map/privatization results in multiple arguments passed to the runtime
+    /// library.
+    OMP_MAP_FIRST_REF = 0x20,
+    /// \brief Signal that the runtime library has to return the device pointer
+    /// in the current position for the data being mapped.
+    OMP_MAP_RETURN_PTR = 0x40,
+    /// \brief This flag signals that the reference being passed is a pointer to
+    /// private data.
+    OMP_MAP_PRIVATE_PTR = 0x80,
+    /// \brief Pass the element to the device by value.
+    OMP_MAP_PRIVATE_VAL = 0x100,
+  };
+
+  /// Pairs a base pointer to be passed to the runtime library with the
+  /// declaration, if any, that refers to it as a device pointer.
+  class BasePointerInfo {
+    /// The base pointer.
+    llvm::Value *Ptr = nullptr;
+    /// The base declaration that refers to this device pointer, or null if
+    /// there is none.
+    const ValueDecl *DevPtrDecl = nullptr;
+
+  public:
+    BasePointerInfo(llvm::Value *Ptr, const ValueDecl *DevPtrDecl = nullptr)
+        : Ptr(Ptr), DevPtrDecl(DevPtrDecl) {}
+    llvm::Value *operator*() const { return Ptr; }
+    const ValueDecl *getDevicePtrDecl() const { return DevPtrDecl; }
+    void setDevicePtrDecl(const ValueDecl *D) { DevPtrDecl = D; }
+  };
+
+  typedef SmallVector<BasePointerInfo, 16> MapBaseValuesArrayTy;
+  typedef SmallVector<llvm::Value *, 16> MapValuesArrayTy;
+  typedef SmallVector<unsigned, 16> MapFlagsArrayTy;
+
+private:
+  /// \brief Directive from where the map clauses were extracted.
+  const OMPExecutableDirective &CurDir;
+
+  /// \brief Function the directive is being generated for.
+  CodeGenFunction &CGF;
+
+  /// \brief Set of all first private variables in the current directive.
+  llvm::SmallPtrSet<const VarDecl *, 8> FirstPrivateDecls;
+
+  /// Map between device pointer declarations and their expression components.
+  /// The key value for declarations in 'this' is null.
+  llvm::DenseMap<
+      const ValueDecl *,
+      SmallVector<OMPClauseMappableExprCommon::MappableExprComponentListRef, 4>>
+      DevPointersMap;
+  /// \brief Compute the size to map for \a E; array sections use their length.
+  llvm::Value *getExprTypeSize(const Expr *E) const {
+    auto ExprTy = E->getType().getCanonicalType();
+
+    // Reference types are ignored for mapping purposes.
+    if (auto *RefTy = ExprTy->getAs<ReferenceType>())
+      ExprTy = RefTy->getPointeeType().getCanonicalType();
+
+    // Given that an array section is considered a built-in type, we need to
+    // do the calculation based on the length of the section instead of relying
+    // on CGF.getTypeSize(E->getType()).
+    if (const auto *OAE = dyn_cast<OMPArraySectionExpr>(E)) {
+      QualType BaseTy = OMPArraySectionExpr::getBaseOriginalType(
+                            OAE->getBase()->IgnoreParenImpCasts())
+                            .getCanonicalType();
+
+      // If there is no length associated with the expression, that means we
+      // are using the whole length of the base.
+      if (!OAE->getLength() && OAE->getColonLoc().isValid())
+        return CGF.getTypeSize(BaseTy);
+
+      llvm::Value *ElemSize;
+      if (auto *PTy = BaseTy->getAs<PointerType>())
+        ElemSize = CGF.getTypeSize(PTy->getPointeeType().getCanonicalType());
+      else {
+        auto *ATy = cast<ArrayType>(BaseTy.getTypePtr());
+        assert(ATy && "Expecting array type if not a pointer type.");
+        ElemSize = CGF.getTypeSize(ATy->getElementType().getCanonicalType());
+      }
+
+      // If we don't have a length at this point, that is because we have an
+      // array section with a single element.
+      if (!OAE->getLength())
+        return ElemSize;
+
+      auto *LengthVal = CGF.EmitScalarExpr(OAE->getLength());
+      LengthVal =
+          CGF.Builder.CreateIntCast(LengthVal, CGF.SizeTy, /*isSigned=*/false);
+      return CGF.Builder.CreateNUWMul(LengthVal, ElemSize);
+    }
+    return CGF.getTypeSize(ExprTy);
+  }
+
+  /// \brief Return the flag bits for the map type \a MapType. Add the 'always'
+  /// bit if \a MapTypeModifier requests it, the pointer bit if \a AddPtrFlag
+  /// is set, and the first-reference bit (first of a series of maps relating
+  /// to the same map expression) if \a AddIsFirstFlag is set.
+  unsigned getMapTypeBits(OpenMPMapClauseKind MapType,
+                          OpenMPMapClauseKind MapTypeModifier, bool AddPtrFlag,
+                          bool AddIsFirstFlag) const {
+    unsigned Bits = 0u;
+    switch (MapType) {
+    case OMPC_MAP_alloc:
+    case OMPC_MAP_release:
+      // alloc and release are the default behavior in the runtime library, i.e.
+      // if we don't pass any bits, alloc/release is what the runtime is
+      // going to do. Therefore, we don't need to signal anything for these two
+      // map types.
+      break;
+    case OMPC_MAP_to:
+      Bits = OMP_MAP_TO;
+      break;
+    case OMPC_MAP_from:
+      Bits = OMP_MAP_FROM;
+      break;
+    case OMPC_MAP_tofrom:
+      Bits = OMP_MAP_TO | OMP_MAP_FROM;
+      break;
+    case OMPC_MAP_delete:
+      Bits = OMP_MAP_DELETE;
+      break;
+    default:
+      llvm_unreachable("Unexpected map type!");
+      break;
+    }
+    if (AddPtrFlag)
+      Bits |= OMP_MAP_IS_PTR;
+    if (AddIsFirstFlag)
+      Bits |= OMP_MAP_FIRST_REF;
+    if (MapTypeModifier == OMPC_MAP_always)
+      Bits |= OMP_MAP_ALWAYS;
+    return Bits;
+  }
+
+  /// \brief Return true if the provided expression is a final array section. A
+  /// final array section is one whose length can't be proved to be one.
+  bool isFinalArraySectionExpression(const Expr *E) const {
+    auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
+
+    // It is not an array section and therefore not a unity-size one.
+    if (!OASE)
+      return false;
+
+    // An array section with no colon always refers to a single element.
+    if (OASE->getColonLoc().isInvalid())
+      return false;
+
+    auto *Length = OASE->getLength();
+
+    // If we don't have a length we have to check if the array has size 1
+    // for this dimension. Also, we should always expect a length if the
+    // base type is pointer.
+    if (!Length) {
+      auto BaseQTy = OMPArraySectionExpr::getBaseOriginalType(
+                         OASE->getBase()->IgnoreParenImpCasts())
+                         .getCanonicalType();
+      if (auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
+        return ATy->getSize().getSExtValue() != 1;
+      // If we don't have a constant dimension length, we have to consider
+      // the current section as having any size, so it is not necessarily
+      // unitary. If it happens to be unity size, that's the user's fault.
+      return true;
+    }
+
+    // Check if the length evaluates to 1.
+    llvm::APSInt ConstLength;
+    if (!Length->EvaluateAsInt(ConstLength, CGF.getContext()))
+      return true; // Can have more than size 1.
+
+    return ConstLength.getSExtValue() != 1;
+  }
+
+  /// \brief Generate the base pointers, section pointers, sizes and map type
+  /// bits for the provided map type, map modifier, and expression components.
+  /// \a IsFirstComponentList should be set to true if the provided set of
+  /// components is the first associated with a capture.
+  void generateInfoForComponentList(
+      OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
+      OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
+      MapBaseValuesArrayTy &BasePointers, MapValuesArrayTy &Pointers,
+      MapValuesArrayTy &Sizes, MapFlagsArrayTy &Types,
+      bool IsFirstComponentList) const {
+
+    // The following summarizes what has to be generated for each map and the
+    // types below. The generated information is expressed in this order:
+    // base pointer, section pointer, size, flags
+    // (to add to the ones that come from the map type and modifier).
+    //
+    // double d;
+    // int i[100];
+    // float *p;
+    //
+    // struct S1 {
+    //   int i;
+    //   float f[50];
+    // }
+    // struct S2 {
+    //   int i;
+    //   float f[50];
+    //   S1 s;
+    //   double *p;
+    //   struct S2 *ps;
+    // }
+    // S2 s;
+    // S2 *ps;
+    //
+    // map(d)
+    // &d, &d, sizeof(double), noflags
+    //
+    // map(i)
+    // &i, &i, 100*sizeof(int), noflags
+    //
+    // map(i[1:23])
+    // &i(=&i[0]), &i[1], 23*sizeof(int), noflags
+    //
+    // map(p)
+    // &p, &p, sizeof(float*), noflags
+    //
+    // map(p[1:24])
+    // p, &p[1], 24*sizeof(float), noflags
+    //
+    // map(s)
+    // &s, &s, sizeof(S2), noflags
+    //
+    // map(s.i)
+    // &s, &(s.i), sizeof(int), noflags
+    //
+    // map(s.s.f)
+    // &s, &(s.s.f), 50*sizeof(float), noflags
+    //
+    // map(s.p)
+    // &s, &(s.p), sizeof(double*), noflags
+    //
+    // map(s.p[:22])
+    // &s, &(s.p), sizeof(double*), noflags
+    // &(s.p), &(s.p[0]), 22*sizeof(double), ptr_flag + extra_flag
+    //
+    // map(s.ps)
+    // &s, &(s.ps), sizeof(S2*), noflags
+    //
+    // map(s.ps->s.i)
+    // &s, &(s.ps), sizeof(S2*), noflags
+    // &(s.ps), &(s.ps->s.i), sizeof(int), ptr_flag + extra_flag
+    //
+    // map(s.ps->ps)
+    // &s, &(s.ps), sizeof(S2*), noflags
+    // &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    //
+    // map(s.ps->ps->ps)
+    // &s, &(s.ps), sizeof(S2*), noflags
+    // &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    // &(s.ps->ps), &(s.ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    //
+    // map(s.ps->ps->s.f[:22])
+    // &s, &(s.ps), sizeof(S2*), noflags
+    // &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    // &(s.ps->ps), &(s.ps->ps->s.f[0]), 22*sizeof(float), ptr_flag + extra_flag
+    //
+    // map(ps)
+    // &ps, &ps, sizeof(S2*), noflags
+    //
+    // map(ps->i)
+    // ps, &(ps->i), sizeof(int), noflags
+    //
+    // map(ps->s.f)
+    // ps, &(ps->s.f[0]), 50*sizeof(float), noflags
+    //
+    // map(ps->p)
+    // ps, &(ps->p), sizeof(double*), noflags
+    //
+    // map(ps->p[:22])
+    // ps, &(ps->p), sizeof(double*), noflags
+    // &(ps->p), &(ps->p[0]), 22*sizeof(double), ptr_flag + extra_flag
+    //
+    // map(ps->ps)
+    // ps, &(ps->ps), sizeof(S2*), noflags
+    //
+    // map(ps->ps->s.i)
+    // ps, &(ps->ps), sizeof(S2*), noflags
+    // &(ps->ps), &(ps->ps->s.i), sizeof(int), ptr_flag + extra_flag
+    //
+    // map(ps->ps->ps)
+    // ps, &(ps->ps), sizeof(S2*), noflags
+    // &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    //
+    // map(ps->ps->ps->ps)
+    // ps, &(ps->ps), sizeof(S2*), noflags
+    // &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    // &(ps->ps->ps), &(ps->ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    //
+    // map(ps->ps->ps->s.f[:22])
+    // ps, &(ps->ps), sizeof(S2*), noflags
+    // &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
+    // &(ps->ps->ps), &(ps->ps->ps->s.f[0]), 22*sizeof(float), ptr_flag +
+    // extra_flag
+
+    // Track if the map information being generated is the first for a capture.
+    bool IsCaptureFirstInfo = IsFirstComponentList;
+
+    // Scan the components from the base to the complete expression.
+    auto CI = Components.rbegin();
+    auto CE = Components.rend();
+    auto I = CI;
+
+    // Track if the map information being generated is the first for a list of
+    // components.
+    bool IsExpressionFirstInfo = true;
+    llvm::Value *BP = nullptr;
+
+    if (auto *ME = dyn_cast<MemberExpr>(I->getAssociatedExpression())) {
+      // The base is the 'this' pointer. The content of the pointer is going
+      // to be the base of the field being mapped.
+      BP = CGF.EmitScalarExpr(ME->getBase());
+    } else {
+      // The base is the reference to the variable.
+      // BP = &Var.
+      BP = CGF.EmitLValue(cast<DeclRefExpr>(I->getAssociatedExpression()))
+               .getPointer();
+
+      // If the variable is a pointer and is being dereferenced (i.e. is not
+      // the last component), the base has to be the pointer itself, not its
+      // reference. References are ignored for mapping purposes.
+      QualType Ty =
+          I->getAssociatedDeclaration()->getType().getNonReferenceType();
+      if (Ty->isAnyPointerType() && std::next(I) != CE) {
+        auto PtrAddr = CGF.MakeNaturalAlignAddrLValue(BP, Ty);
+        BP = CGF.EmitLoadOfPointerLValue(PtrAddr.getAddress(),
+                                         Ty->castAs<PointerType>())
+                 .getPointer();
+
+        // We do not need to generate individual map information for the
+        // pointer, it can be associated with the combined storage.
+        ++I;
+      }
+    }
+
+    for (; I != CE; ++I) {
+      auto Next = std::next(I);
+
+      // We need to generate the addresses and sizes if this is the last
+      // component, if the component is a pointer or if it is an array section
+      // whose length can't be proved to be one. If this is a pointer, it
+      // becomes the base address for the following components.
+
+      // A final array section is one whose length can't be proved to be one.
+      bool IsFinalArraySection =
+          isFinalArraySectionExpression(I->getAssociatedExpression());
+
+      // Get information on whether the element is a pointer. Have to do a
+      // special treatment for array sections given that they are built-in
+      // types.
+      const auto *OASE =
+          dyn_cast<OMPArraySectionExpr>(I->getAssociatedExpression());
+      bool IsPointer =
+          (OASE &&
+           OMPArraySectionExpr::getBaseOriginalType(OASE)
+               .getCanonicalType()
+               ->isAnyPointerType()) ||
+          I->getAssociatedExpression()->getType()->isAnyPointerType();
+
+      if (Next == CE || IsPointer || IsFinalArraySection) {
+
+        // If this is not the last component, we expect the pointer to be
+        // associated with an array expression or member expression.
+        assert((Next == CE ||
+                isa<MemberExpr>(Next->getAssociatedExpression()) ||
+                isa<ArraySubscriptExpr>(Next->getAssociatedExpression()) ||
+                isa<OMPArraySectionExpr>(Next->getAssociatedExpression())) &&
+               "Unexpected expression");
+
+        auto *LB = CGF.EmitLValue(I->getAssociatedExpression()).getPointer();
+        auto *Size = getExprTypeSize(I->getAssociatedExpression());
+
+        // If we have a member expression and the current component is a
+        // reference, we have to map the reference too. Whenever we have a
+        // reference, the section that reference refers to is going to be a
+        // load instruction from the storage assigned to the reference.
+        if (isa<MemberExpr>(I->getAssociatedExpression()) &&
+            I->getAssociatedDeclaration()->getType()->isReferenceType()) {
+          auto *LI = cast<llvm::LoadInst>(LB);
+          auto *RefAddr = LI->getPointerOperand();
+
+          BasePointers.push_back(BP);
+          Pointers.push_back(RefAddr);
+          Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
+          Types.push_back(getMapTypeBits(
+              /*MapType*/ OMPC_MAP_alloc, /*MapTypeModifier=*/OMPC_MAP_unknown,
+              !IsExpressionFirstInfo, IsCaptureFirstInfo));
+          IsExpressionFirstInfo = false;
+          IsCaptureFirstInfo = false;
+          // The reference will be the next base address.
+          BP = RefAddr;
+        }
+
+        BasePointers.push_back(BP);
+        Pointers.push_back(LB);
+        Sizes.push_back(Size);
+
+        // We need to add a pointer flag for each map that comes from the
+        // same expression except for the first one. We also need to signal
+        // this map is the first one that relates with the current capture
+        // (there is a set of entries for each capture).
+        Types.push_back(getMapTypeBits(MapType, MapTypeModifier,
+                                       !IsExpressionFirstInfo,
+                                       IsCaptureFirstInfo));
+
+        // If we have a final array section, we are done with this expression.
+        if (IsFinalArraySection)
+          break;
+
+        // The pointer becomes the base for the next element.
+        if (Next != CE)
+          BP = LB;
+
+        IsExpressionFirstInfo = false;
+        IsCaptureFirstInfo = false;
+        continue;
+      }
+    }
+  }
+
+  /// \brief Return the adjusted map modifiers if the declaration a capture
+  /// refers to appears in a first-private clause, or \a CurrentModifiers
+  /// otherwise. Expected only for directives that start with 'target'.
+  unsigned adjustMapModifiersForPrivateClauses(const CapturedStmt::Capture &Cap,
+                                               unsigned CurrentModifiers) {
+    assert(Cap.capturesVariable() && "Expected capture by reference only!");
+
+    // A first private variable captured by reference will use only the
+    // 'private ptr' and 'map to' flags. Return the right flags if the captured
+    // declaration is known as first-private in this handler.
+    if (FirstPrivateDecls.count(Cap.getCapturedVar()))
+      return MappableExprsHandler::OMP_MAP_PRIVATE_PTR |
+             MappableExprsHandler::OMP_MAP_TO;
+
+    // We didn't modify anything.
+    return CurrentModifiers;
+  }
+
+public:
+  MappableExprsHandler(const OMPExecutableDirective &Dir, CodeGenFunction &CGF)
+      : CurDir(Dir), CGF(CGF) {
+    // Record the canonical declaration of every firstprivate variable.
+    for (const auto *C : Dir.getClausesOfKind<OMPFirstprivateClause>())
+      for (const auto *D : C->varlists())
+        FirstPrivateDecls.insert(
+            cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl());
+    // Group the is_device_ptr component lists by their associated declaration.
+    for (const auto *C : Dir.getClausesOfKind<OMPIsDevicePtrClause>())
+      for (auto L : C->component_lists())
+        DevPointersMap[L.first].push_back(L.second);
+  }
+
+  /// \brief Generate all the base pointers, section pointers, sizes and map
+  /// types for the extracted mappable expressions. Also, for each item that
+  /// relates with a device pointer, the relevant declaration is recorded in
+  /// the corresponding base pointer entry and the 'return pointer' flag is set.
+  void generateAllInfo(MapBaseValuesArrayTy &BasePointers,
+                       MapValuesArrayTy &Pointers, MapValuesArrayTy &Sizes,
+                       MapFlagsArrayTy &Types) const {
+    BasePointers.clear();
+    Pointers.clear();
+    Sizes.clear();
+    Types.clear();
+
+    struct MapInfo {
+      /// Kind that defines how a device pointer has to be returned.
+      enum ReturnPointerKind {
+        // Don't have to return any pointer.
+        RPK_None,
+        // Pointer is the base of the declaration.
+        RPK_Base,
+        // Pointer is a member of the base declaration - 'this'
+        RPK_Member,
+        // Pointer is a reference and a member of the base declaration - 'this'
+        RPK_MemberReference,
+      };
+      OMPClauseMappableExprCommon::MappableExprComponentListRef Components;
+      OpenMPMapClauseKind MapType;
+      OpenMPMapClauseKind MapTypeModifier;
+      ReturnPointerKind ReturnDevicePointer;
+
+      MapInfo()
+          : MapType(OMPC_MAP_unknown), MapTypeModifier(OMPC_MAP_unknown),
+            ReturnDevicePointer(RPK_None) {}
+      MapInfo(
+          OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
+          OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
+          ReturnPointerKind ReturnDevicePointer)
+          : Components(Components), MapType(MapType),
+            MapTypeModifier(MapTypeModifier),
+            ReturnDevicePointer(ReturnDevicePointer) {}
+    };
+
+    // Component lists relating to the same declaration must be processed in a
+    // single chunk to generate correct map flags, so group them in a map.
+    // NOTE(review): DenseMap iteration order is unspecified; confirm it is OK.
+    llvm::DenseMap<const ValueDecl *, SmallVector<MapInfo, 8>> Info;
+
+    // Helper function to fill the information map for the different supported
+    // clauses.
+    auto &&InfoGen = [&Info](
+        const ValueDecl *D,
+        OMPClauseMappableExprCommon::MappableExprComponentListRef L,
+        OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapModifier,
+        MapInfo::ReturnPointerKind ReturnDevicePointer) {
+      const ValueDecl *VD =
+          D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
+      Info[VD].push_back({L, MapType, MapModifier, ReturnDevicePointer});
+    };
+
+    // FIXME: MSVC 2013 seems to require this-> to find member CurDir.
+    for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
+      for (auto L : C->component_lists())
+        InfoGen(L.first, L.second, C->getMapType(), C->getMapTypeModifier(),
+                MapInfo::RPK_None);
+    for (auto *C : this->CurDir.getClausesOfKind<OMPToClause>())
+      for (auto L : C->component_lists())
+        InfoGen(L.first, L.second, OMPC_MAP_to, OMPC_MAP_unknown,
+                MapInfo::RPK_None);
+    for (auto *C : this->CurDir.getClausesOfKind<OMPFromClause>())
+      for (auto L : C->component_lists())
+        InfoGen(L.first, L.second, OMPC_MAP_from, OMPC_MAP_unknown,
+                MapInfo::RPK_None);
+
+    // Look at the use_device_ptr clause information and mark the existing map
+    // entries as such. If there is no map information for an entry in the
+    // use_device_ptr list, we create one with map type 'alloc' and zero size
+    // section. It is the user's fault if that was not mapped before.
+    // FIXME: MSVC 2013 seems to require this-> to find member CurDir.
+    for (auto *C : this->CurDir.getClausesOfKind<OMPUseDevicePtrClause>())
+      for (auto L : C->component_lists()) {
+        assert(!L.second.empty() && "Not expecting empty list of components!");
+        const ValueDecl *VD = L.second.back().getAssociatedDeclaration();
+        VD = cast<ValueDecl>(VD->getCanonicalDecl());
+        auto *IE = L.second.back().getAssociatedExpression();
+        // If the first component is a member expression, we have to look into
+        // 'this', which maps to null in the map of map information. Otherwise
+        // look directly for the information.
+        auto It = Info.find(isa<MemberExpr>(IE) ? nullptr : VD);
+
+        // We potentially have map information for this declaration already.
+        // Look for the first set of components that refer to it.
+        if (It != Info.end()) {
+          auto CI = std::find_if(
+              It->second.begin(), It->second.end(), [VD](const MapInfo &MI) {
+                return MI.Components.back().getAssociatedDeclaration() == VD;
+              });
+          // If we found a map entry, signal that the pointer has to be returned
+          // and move on to the next declaration.
+          if (CI != It->second.end()) {
+            CI->ReturnDevicePointer = isa<MemberExpr>(IE)
+                                          ? (VD->getType()->isReferenceType()
+                                                 ? MapInfo::RPK_MemberReference
+                                                 : MapInfo::RPK_Member)
+                                          : MapInfo::RPK_Base;
+            continue;
+          }
+        }
+
+        // We didn't find any match in our map information - generate a zero
+        // size array section.
+        // FIXME: MSVC 2013 seems to require this-> to find member CGF.
+        llvm::Value *Ptr =
+            this->CGF
+                .EmitLoadOfLValue(this->CGF.EmitLValue(IE), SourceLocation())
+                .getScalarVal();
+        BasePointers.push_back({Ptr, VD});
+        Pointers.push_back(Ptr);
+        Sizes.push_back(llvm::Constant::getNullValue(this->CGF.SizeTy));
+        Types.push_back(OMP_MAP_RETURN_PTR | OMP_MAP_FIRST_REF);
+      }
+
+    for (auto &M : Info) {
+      // We need to know when we generate information for the first component
+      // associated with a capture, because the mapping flags depend on it.
+      bool IsFirstComponentList = true;
+      for (MapInfo &L : M.second) {
+        assert(!L.Components.empty() &&
+               "Not expecting declaration with no component lists.");
+
+        // Remember the current base pointer index.
+        unsigned CurrentBasePointersIdx = BasePointers.size();
+        // FIXME: MSVC 2013 seems to require this-> to find the member method.
+        this->generateInfoForComponentList(L.MapType, L.MapTypeModifier,
+                                           L.Components, BasePointers, Pointers,
+                                           Sizes, Types, IsFirstComponentList);
+
+        // If this entry relates with a device pointer, set the relevant
+        // declaration and add the 'return pointer' flag.
+        if (IsFirstComponentList &&
+            L.ReturnDevicePointer != MapInfo::RPK_None) {
+          // If the pointer is not the base of the map, we need to skip the
+          // base. If it is a reference in a member field, we also need to skip
+          // the map of the reference.
+          if (L.ReturnDevicePointer != MapInfo::RPK_Base) {
+            ++CurrentBasePointersIdx;
+            if (L.ReturnDevicePointer == MapInfo::RPK_MemberReference)
+              ++CurrentBasePointersIdx;
+          }
+          assert(BasePointers.size() > CurrentBasePointersIdx &&
+                 "Unexpected number of mapped base pointers.");
+
+          auto *RelevantVD = L.Components.back().getAssociatedDeclaration();
+          assert(RelevantVD &&
+                 "No relevant declaration related with device pointer??");
+
+          BasePointers[CurrentBasePointersIdx].setDevicePtrDecl(RelevantVD);
+          Types[CurrentBasePointersIdx] |= OMP_MAP_RETURN_PTR;
+        }
+        IsFirstComponentList = false;
+      }
+    }
+  }
+
+  /// \brief Generate the base pointers, section pointers, sizes and map types
+  /// associated with capture \a Cap; the four output arrays are cleared first.
+  void generateInfoForCapture(const CapturedStmt::Capture *Cap,
+                              llvm::Value *Arg,
+                              MapBaseValuesArrayTy &BasePointers,
+                              MapValuesArrayTy &Pointers,
+                              MapValuesArrayTy &Sizes,
+                              MapFlagsArrayTy &Types) const {
+    assert(!Cap->capturesVariableArrayType() &&
+           "Not expecting to generate map info for a variable array type!");
+
+    BasePointers.clear();
+    Pointers.clear();
+    Sizes.clear();
+    Types.clear();
+
+    // We need to know when we generate information for the first component
+    // associated with a capture, because the mapping flags depend on it.
+    bool IsFirstComponentList = true;
+
+    const ValueDecl *VD =
+        Cap->capturesThis()
+            ? nullptr // 'this' has no declaration; looked up as null below.
+            : cast<ValueDecl>(Cap->getCapturedVar()->getCanonicalDecl());
+
+    // If this declaration appears in an is_device_ptr clause we just have to
+    // pass the pointer by value. If it is a reference to a declaration, we just
+    // pass its value, otherwise, if it is a member expression, we need to map
+    // 'to' the field.
+    if (!VD) { // Capture of 'this'.
+      auto It = DevPointersMap.find(VD);
+      if (It != DevPointersMap.end()) {
+        for (auto L : It->second) {
+          generateInfoForComponentList(
+              /*MapType=*/OMPC_MAP_to, /*MapTypeModifier=*/OMPC_MAP_unknown, L,
+              BasePointers, Pointers, Sizes, Types, IsFirstComponentList);
+          IsFirstComponentList = false;
+        }
+        return;
+      }
+    } else if (DevPointersMap.count(VD)) {
+      BasePointers.push_back({Arg, VD});
+      Pointers.push_back(Arg);
+      Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
+      Types.push_back(OMP_MAP_PRIVATE_VAL | OMP_MAP_FIRST_REF);
+      return;
+    }
+
+    // FIXME: MSVC 2013 seems to require this-> to find member CurDir.
+    for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
+      for (auto L : C->decl_component_lists(VD)) {
+        assert(L.first == VD &&
+               "We got information for the wrong declaration??");
+        assert(!L.second.empty() &&
+               "Not expecting declaration with no component lists.");
+        generateInfoForComponentList(C->getMapType(), C->getMapTypeModifier(),
+                                     L.second, BasePointers, Pointers, Sizes,
+                                     Types, IsFirstComponentList);
+        IsFirstComponentList = false;
+      }
+
+    return; // NOTE(review): redundant; last statement of a void function.
+  }
+
+  /// \brief Append the default map entry (one element per array) for capture
+  /// \a CI, its record field declaration \a RI and its captured value \a CV.
+  void generateDefaultMapInfo(const CapturedStmt::Capture &CI,
+                              const FieldDecl &RI, llvm::Value *CV,
+                              MapBaseValuesArrayTy &CurBasePointers,
+                              MapValuesArrayTy &CurPointers,
+                              MapValuesArrayTy &CurSizes,
+                              MapFlagsArrayTy &CurMapTypes) {
+
+    // Do the default mapping.
+    if (CI.capturesThis()) {
+      CurBasePointers.push_back(CV);
+      CurPointers.push_back(CV);
+      const PointerType *PtrTy = cast<PointerType>(RI.getType().getTypePtr());
+      CurSizes.push_back(CGF.getTypeSize(PtrTy->getPointeeType()));
+      // Default map type.
+      CurMapTypes.push_back(OMP_MAP_TO | OMP_MAP_FROM);
+    } else if (CI.capturesVariableByCopy()) {
+      CurBasePointers.push_back(CV);
+      CurPointers.push_back(CV);
+      if (!RI.getType()->isAnyPointerType()) {
+        // We have to signal to the runtime captures passed by value that are
+        // not pointers.
+        CurMapTypes.push_back(OMP_MAP_PRIVATE_VAL);
+        CurSizes.push_back(CGF.getTypeSize(RI.getType()));
+      } else {
+        // Pointers are implicitly mapped with a zero size and no flags
+        // (other than the first-reference flag added below to every entry).
+        CurMapTypes.push_back(0u);
+        CurSizes.push_back(llvm::Constant::getNullValue(CGF.SizeTy));
+      }
+    } else {
+      assert(CI.capturesVariable() && "Expected captured reference.");
+      CurBasePointers.push_back(CV);
+      CurPointers.push_back(CV);
+
+      const ReferenceType *PtrTy =
+          cast<ReferenceType>(RI.getType().getTypePtr());
+      QualType ElementType = PtrTy->getPointeeType();
+      CurSizes.push_back(CGF.getTypeSize(ElementType));
+      // The default map type for a scalar/complex type is 'to' because by
+      // default the value doesn't have to be retrieved. For an aggregate
+      // type, the default is 'tofrom'.
+      CurMapTypes.push_back(ElementType->isAggregateType()
+                                ? (OMP_MAP_TO | OMP_MAP_FROM)
+                                : OMP_MAP_TO);
+
+      // If we have a capture by reference we may need to add the private
+      // pointer flag if the base declaration shows in some first-private
+      // clause.
+      CurMapTypes.back() =
+          adjustMapModifiersForPrivateClauses(CI, CurMapTypes.back());
+    }
+    // Every default map produces a single argument, so it is always the
+    // first one.
+    CurMapTypes.back() |= OMP_MAP_FIRST_REF;
+  }
+};
+
+enum OpenMPOffloadingReservedDeviceIDs {
+  /// \brief Device ID used when no device was specified; the runtime should
+  /// get it from the environment variables defined in the spec.
+  OMP_DEVICEID_UNDEF = -1,
+};
+} // anonymous namespace
+
+/// \brief Emit the arrays used to pass the captures and map information to the
+/// offloading runtime library. If there is no map or capture information, the
+/// arrays in \a Info are left as reset by clearArrayInfo().
+static void
+emitOffloadingArrays(CodeGenFunction &CGF,
+                     MappableExprsHandler::MapBaseValuesArrayTy &BasePointers,
+                     MappableExprsHandler::MapValuesArrayTy &Pointers,
+                     MappableExprsHandler::MapValuesArrayTy &Sizes,
+                     MappableExprsHandler::MapFlagsArrayTy &MapTypes,
+                     CGOpenMPRuntime::TargetDataInfo &Info) {
+  auto &CGM = CGF.CGM;
+  auto &Ctx = CGF.getContext();
+
+  // Reset the array information.
+  Info.clearArrayInfo();
+  Info.NumberOfPtrs = BasePointers.size();
+
+  if (Info.NumberOfPtrs) {
+    // Detect whether any size needs to be evaluated at runtime; if none does, a
+    // constant array can be used for the sizes.
+    bool hasRuntimeEvaluationCaptureSize = false;
+    for (auto *S : Sizes)
+      if (!isa<llvm::Constant>(S)) {
+        hasRuntimeEvaluationCaptureSize = true;
+        break;
+      }
+
+    llvm::APInt PointerNumAP(32, Info.NumberOfPtrs, /*isSigned=*/true);
+    QualType PointerArrayType =
+        Ctx.getConstantArrayType(Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal,
+                                 /*IndexTypeQuals=*/0);
+
+    Info.BasePointersArray =
+        CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer();
+    Info.PointersArray =
+        CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer();
+
+    // If we don't have any VLA types or other types that require runtime
+    // evaluation, we can use a constant array for the map sizes, otherwise we
+    // need to fill up the arrays as we do for the pointers.
+    if (hasRuntimeEvaluationCaptureSize) {
+      QualType SizeArrayType = Ctx.getConstantArrayType(
+          Ctx.getSizeType(), PointerNumAP, ArrayType::Normal,
+          /*IndexTypeQuals=*/0);
+      Info.SizesArray =
+          CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer();
+    } else {
+      // We expect all the sizes to be constant, so we collect them to create
+      // a constant array.
+      SmallVector<llvm::Constant *, 16> ConstSizes;
+      for (auto S : Sizes)
+        ConstSizes.push_back(cast<llvm::Constant>(S));
+
+      auto *SizesArrayInit = llvm::ConstantArray::get(
+          llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes);
+      auto *SizesArrayGbl = new llvm::GlobalVariable(
+          CGM.getModule(), SizesArrayInit->getType(),
+          /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
+          SizesArrayInit, ".offload_sizes");
+      SizesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+      Info.SizesArray = SizesArrayGbl;
+    }
+
+    // The map types are always constant so we don't need to generate code to
+    // fill arrays. Instead, we create an array constant.
+    llvm::Constant *MapTypesArrayInit =
+        llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes);
+    auto *MapTypesArrayGbl = new llvm::GlobalVariable(
+        CGM.getModule(), MapTypesArrayInit->getType(),
+        /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
+        MapTypesArrayInit, ".offload_maptypes");
+    MapTypesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+    Info.MapTypesArray = MapTypesArrayGbl;
+
+    for (unsigned i = 0; i < Info.NumberOfPtrs; ++i) { // Fill the arrays.
+      llvm::Value *BPVal = *BasePointers[i];
+      if (BPVal->getType()->isPointerTy())
+        BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
+      else {
+        assert(BPVal->getType()->isIntegerTy() &&
+               "If not a pointer, the value type must be an integer.");
+        BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
+      }
+      llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
+          llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
+          Info.BasePointersArray, 0, i);
+      Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
+      CGF.Builder.CreateStore(BPVal, BPAddr);
+
+      if (Info.requiresDevicePointerInfo())
+        if (auto *DevVD = BasePointers[i].getDevicePtrDecl())
+          Info.CaptureDeviceAddrMap.insert(std::make_pair(DevVD, BPAddr));
+
+      llvm::Value *PVal = Pointers[i];
+      if (PVal->getType()->isPointerTy())
+        PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
+      else {
+        assert(PVal->getType()->isIntegerTy() &&
+               "If not a pointer, the value type must be an integer.");
+        PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
+      }
+      llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
+          llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
+          Info.PointersArray, 0, i);
+      Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
+      CGF.Builder.CreateStore(PVal, PAddr);
+
+      if (hasRuntimeEvaluationCaptureSize) { // Sizes array is not a constant.
+        llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32(
+            llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs),
+            Info.SizesArray,
+            /*Idx0=*/0,
+            /*Idx1=*/i);
+        Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType()));
+        CGF.Builder.CreateStore(
+            CGF.Builder.CreateIntCast(Sizes[i], CGM.SizeTy, /*isSigned=*/true),
+            SAddr);
+      }
+    }
+  }
+}
+/// \brief Emit the array arguments for the runtime library call: pointers to
+/// the first elements of the arrays in \a Info, or null when it has no entries.
+static void emitOffloadingArraysArgument(
+    CodeGenFunction &CGF, llvm::Value *&BasePointersArrayArg,
+    llvm::Value *&PointersArrayArg, llvm::Value *&SizesArrayArg,
+    llvm::Value *&MapTypesArrayArg, CGOpenMPRuntime::TargetDataInfo &Info) {
+  auto &CGM = CGF.CGM;
+  if (Info.NumberOfPtrs) {
+    BasePointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
+        llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
+        Info.BasePointersArray,
+        /*Idx0=*/0, /*Idx1=*/0);
+    PointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
+        llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
+        Info.PointersArray,
+        /*Idx0=*/0,
+        /*Idx1=*/0);
+    SizesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
+        llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs), Info.SizesArray,
+        /*Idx0=*/0, /*Idx1=*/0);
+    MapTypesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
+        llvm::ArrayType::get(CGM.Int32Ty, Info.NumberOfPtrs),
+        Info.MapTypesArray,
+        /*Idx0=*/0,
+        /*Idx1=*/0);
+  } else { // No entries: pass typed null pointers for all four arrays.
+    BasePointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
+    PointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
+    SizesArrayArg = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo());
+    MapTypesArrayArg =
+        llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());
+  }
 }
 
 void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
@@ -3823,112 +5886,73 @@
                                      ArrayRef<llvm::Value *> CapturedVars) {
   if (!CGF.HaveInsertPoint())
     return;
-  /// \brief Values for bit flags used to specify the mapping type for
-  /// offloading.
-  enum OpenMPOffloadMappingFlags {
-    /// \brief Allocate memory on the device and move data from host to device.
-    OMP_MAP_TO = 0x01,
-    /// \brief Allocate memory on the device and move data from device to host.
-    OMP_MAP_FROM = 0x02,
-    /// \brief The element passed to the device is a pointer.
-    OMP_MAP_PTR = 0x20,
-    /// \brief Pass the element to the device by value.
-    OMP_MAP_BYCOPY = 0x80,
-  };
-
-  enum OpenMPOffloadingReservedDeviceIDs {
-    /// \brief Device ID if the device was not defined, runtime should get it
-    /// from environment variables in the spec.
-    OMP_DEVICEID_UNDEF = -1,
-  };
 
   assert(OutlinedFn && "Invalid outlined function!");
 
   auto &Ctx = CGF.getContext();
 
-  // Fill up the arrays with the all the captured variables.
-  SmallVector<llvm::Value *, 16> BasePointers;
-  SmallVector<llvm::Value *, 16> Pointers;
-  SmallVector<llvm::Value *, 16> Sizes;
-  SmallVector<unsigned, 16> MapTypes;
+  // Fill up the arrays with all the captured variables.
+  MappableExprsHandler::MapValuesArrayTy KernelArgs;
+  MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
+  MappableExprsHandler::MapValuesArrayTy Pointers;
+  MappableExprsHandler::MapValuesArrayTy Sizes;
+  MappableExprsHandler::MapFlagsArrayTy MapTypes;
 
-  bool hasVLACaptures = false;
+  MappableExprsHandler::MapBaseValuesArrayTy CurBasePointers;
+  MappableExprsHandler::MapValuesArrayTy CurPointers;
+  MappableExprsHandler::MapValuesArrayTy CurSizes;
+  MappableExprsHandler::MapFlagsArrayTy CurMapTypes;
+
+  // Get mappable expression information.
+  MappableExprsHandler MEHandler(D, CGF);
 
   const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
   auto RI = CS.getCapturedRecordDecl()->field_begin();
-  // auto II = CS.capture_init_begin();
   auto CV = CapturedVars.begin();
   for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
                                             CE = CS.capture_end();
        CI != CE; ++CI, ++RI, ++CV) {
     StringRef Name;
     QualType Ty;
-    llvm::Value *BasePointer;
-    llvm::Value *Pointer;
-    llvm::Value *Size;
-    unsigned MapType;
 
-    // VLA sizes are passed to the outlined region by copy.
+    CurBasePointers.clear();
+    CurPointers.clear();
+    CurSizes.clear();
+    CurMapTypes.clear();
+
+    // VLA sizes are passed to the outlined region by copy and do not have map
+    // information associated.
     if (CI->capturesVariableArrayType()) {
-      BasePointer = Pointer = *CV;
-      Size = getTypeSize(CGF, RI->getType());
+      CurBasePointers.push_back(*CV);
+      CurPointers.push_back(*CV);
+      CurSizes.push_back(CGF.getTypeSize(RI->getType()));
       // Copy to the device as an argument. No need to retrieve it.
-      MapType = OMP_MAP_BYCOPY;
-      hasVLACaptures = true;
-    } else if (CI->capturesThis()) {
-      BasePointer = Pointer = *CV;
-      const PointerType *PtrTy = cast<PointerType>(RI->getType().getTypePtr());
-      Size = getTypeSize(CGF, PtrTy->getPointeeType());
-      // Default map type.
-      MapType = OMP_MAP_TO | OMP_MAP_FROM;
-    } else if (CI->capturesVariableByCopy()) {
-      MapType = OMP_MAP_BYCOPY;
-      if (!RI->getType()->isAnyPointerType()) {
-        // If the field is not a pointer, we need to save the actual value and
-        // load it as a void pointer.
-        auto DstAddr = CGF.CreateMemTemp(
-            Ctx.getUIntPtrType(),
-            Twine(CI->getCapturedVar()->getName()) + ".casted");
-        LValue DstLV = CGF.MakeAddrLValue(DstAddr, Ctx.getUIntPtrType());
-
-        auto *SrcAddrVal = CGF.EmitScalarConversion(
-            DstAddr.getPointer(), Ctx.getPointerType(Ctx.getUIntPtrType()),
-            Ctx.getPointerType(RI->getType()), SourceLocation());
-        LValue SrcLV =
-            CGF.MakeNaturalAlignAddrLValue(SrcAddrVal, RI->getType());
-
-        // Store the value using the source type pointer.
-        CGF.EmitStoreThroughLValue(RValue::get(*CV), SrcLV);
-
-        // Load the value using the destination type pointer.
-        BasePointer = Pointer =
-            CGF.EmitLoadOfLValue(DstLV, SourceLocation()).getScalarVal();
-      } else {
-        MapType |= OMP_MAP_PTR;
-        BasePointer = Pointer = *CV;
-      }
-      Size = getTypeSize(CGF, RI->getType());
+      CurMapTypes.push_back(MappableExprsHandler::OMP_MAP_PRIVATE_VAL |
+                            MappableExprsHandler::OMP_MAP_FIRST_REF);
     } else {
-      assert(CI->capturesVariable() && "Expected captured reference.");
-      BasePointer = Pointer = *CV;
-
-      const ReferenceType *PtrTy =
-          cast<ReferenceType>(RI->getType().getTypePtr());
-      QualType ElementType = PtrTy->getPointeeType();
-      Size = getTypeSize(CGF, ElementType);
-      // The default map type for a scalar/complex type is 'to' because by
-      // default the value doesn't have to be retrieved. For an aggregate type,
-      // the default is 'tofrom'.
-      MapType = ElementType->isAggregateType() ? (OMP_MAP_TO | OMP_MAP_FROM)
-                                               : OMP_MAP_TO;
-      if (ElementType->isAnyPointerType())
-        MapType |= OMP_MAP_PTR;
+      // If we have any information in the map clause, we use it, otherwise we
+      // just do a default mapping.
+      MEHandler.generateInfoForCapture(CI, *CV, CurBasePointers, CurPointers,
+                                       CurSizes, CurMapTypes);
+      if (CurBasePointers.empty())
+        MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurBasePointers,
+                                         CurPointers, CurSizes, CurMapTypes);
     }
+    // We expect to have at least an element of information for this capture.
+    assert(!CurBasePointers.empty() && "Non-existing map pointer for capture!");
+    assert(CurBasePointers.size() == CurPointers.size() &&
+           CurBasePointers.size() == CurSizes.size() &&
+           CurBasePointers.size() == CurMapTypes.size() &&
+           "Inconsistent map information sizes!");
 
-    BasePointers.push_back(BasePointer);
-    Pointers.push_back(Pointer);
-    Sizes.push_back(Size);
-    MapTypes.push_back(MapType);
+    // The kernel args are always the first elements of the base pointers
+    // associated with a capture.
+    KernelArgs.push_back(*CurBasePointers.front());
+    // We need to append the results of this capture to what we already have.
+    BasePointers.append(CurBasePointers.begin(), CurBasePointers.end());
+    Pointers.append(CurPointers.begin(), CurPointers.end());
+    Sizes.append(CurSizes.begin(), CurSizes.end());
+    MapTypes.append(CurMapTypes.begin(), CurMapTypes.end());
   }
 
   // Keep track on whether the host function has to be executed.
@@ -3941,128 +5965,16 @@
                         OffloadError);
 
   // Fill up the pointer arrays and transfer execution to the device.
-  auto &&ThenGen = [this, &Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes,
-                    hasVLACaptures, Device, OutlinedFnID, OffloadError,
-                    OffloadErrorQType](CodeGenFunction &CGF) {
-    unsigned PointerNumVal = BasePointers.size();
-    llvm::Value *PointerNum = CGF.Builder.getInt32(PointerNumVal);
-    llvm::Value *BasePointersArray;
-    llvm::Value *PointersArray;
-    llvm::Value *SizesArray;
-    llvm::Value *MapTypesArray;
-
-    if (PointerNumVal) {
-      llvm::APInt PointerNumAP(32, PointerNumVal, /*isSigned=*/true);
-      QualType PointerArrayType = Ctx.getConstantArrayType(
-          Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal,
-          /*IndexTypeQuals=*/0);
-
-      BasePointersArray =
-          CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer();
-      PointersArray =
-          CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer();
-
-      // If we don't have any VLA types, we can use a constant array for the map
-      // sizes, otherwise we need to fill up the arrays as we do for the
-      // pointers.
-      if (hasVLACaptures) {
-        QualType SizeArrayType = Ctx.getConstantArrayType(
-            Ctx.getSizeType(), PointerNumAP, ArrayType::Normal,
-            /*IndexTypeQuals=*/0);
-        SizesArray =
-            CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer();
-      } else {
-        // We expect all the sizes to be constant, so we collect them to create
-        // a constant array.
-        SmallVector<llvm::Constant *, 16> ConstSizes;
-        for (auto S : Sizes)
-          ConstSizes.push_back(cast<llvm::Constant>(S));
-
-        auto *SizesArrayInit = llvm::ConstantArray::get(
-            llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes);
-        auto *SizesArrayGbl = new llvm::GlobalVariable(
-            CGM.getModule(), SizesArrayInit->getType(),
-            /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
-            SizesArrayInit, ".offload_sizes");
-        SizesArrayGbl->setUnnamedAddr(true);
-        SizesArray = SizesArrayGbl;
-      }
-
-      // The map types are always constant so we don't need to generate code to
-      // fill arrays. Instead, we create an array constant.
-      llvm::Constant *MapTypesArrayInit =
-          llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes);
-      auto *MapTypesArrayGbl = new llvm::GlobalVariable(
-          CGM.getModule(), MapTypesArrayInit->getType(),
-          /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
-          MapTypesArrayInit, ".offload_maptypes");
-      MapTypesArrayGbl->setUnnamedAddr(true);
-      MapTypesArray = MapTypesArrayGbl;
-
-      for (unsigned i = 0; i < PointerNumVal; ++i) {
-
-        llvm::Value *BPVal = BasePointers[i];
-        if (BPVal->getType()->isPointerTy())
-          BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
-        else {
-          assert(BPVal->getType()->isIntegerTy() &&
-                 "If not a pointer, the value type must be an integer.");
-          BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
-        }
-        llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
-            llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal),
-            BasePointersArray, 0, i);
-        Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
-        CGF.Builder.CreateStore(BPVal, BPAddr);
-
-        llvm::Value *PVal = Pointers[i];
-        if (PVal->getType()->isPointerTy())
-          PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
-        else {
-          assert(PVal->getType()->isIntegerTy() &&
-                 "If not a pointer, the value type must be an integer.");
-          PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
-        }
-        llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
-            llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray,
-            0, i);
-        Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
-        CGF.Builder.CreateStore(PVal, PAddr);
-
-        if (hasVLACaptures) {
-          llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32(
-              llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray,
-              /*Idx0=*/0,
-              /*Idx1=*/i);
-          Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType()));
-          CGF.Builder.CreateStore(CGF.Builder.CreateIntCast(
-                                      Sizes[i], CGM.SizeTy, /*isSigned=*/true),
-                                  SAddr);
-        }
-      }
-
-      BasePointersArray = CGF.Builder.CreateConstInBoundsGEP2_32(
-          llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), BasePointersArray,
-          /*Idx0=*/0, /*Idx1=*/0);
-      PointersArray = CGF.Builder.CreateConstInBoundsGEP2_32(
-          llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray,
-          /*Idx0=*/0,
-          /*Idx1=*/0);
-      SizesArray = CGF.Builder.CreateConstInBoundsGEP2_32(
-          llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray,
-          /*Idx0=*/0, /*Idx1=*/0);
-      MapTypesArray = CGF.Builder.CreateConstInBoundsGEP2_32(
-          llvm::ArrayType::get(CGM.Int32Ty, PointerNumVal), MapTypesArray,
-          /*Idx0=*/0,
-          /*Idx1=*/0);
-
-    } else {
-      BasePointersArray = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
-      PointersArray = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
-      SizesArray = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo());
-      MapTypesArray =
-          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());
-    }
+  auto &&ThenGen = [&Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes, Device,
+                    OutlinedFnID, OffloadError, OffloadErrorQType,
+                    &D](CodeGenFunction &CGF, PrePostActionTy &) {
+    auto &RT = CGF.CGM.getOpenMPRuntime();
+    // Emit the offloading arrays.
+    TargetDataInfo Info;
+    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
+    emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
+                                 Info.PointersArray, Info.SizesArray,
+                                 Info.MapTypesArray, Info);
 
     // On top of the arrays that were filled up, the target offloading call
     // takes as arguments the device id as well as the host pointer. The host
@@ -4080,23 +5992,52 @@
     llvm::Value *DeviceID;
     if (Device)
       DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
-                                           CGM.Int32Ty, /*isSigned=*/true);
+                                           CGF.Int32Ty, /*isSigned=*/true);
     else
       DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
 
-    llvm::Value *OffloadingArgs[] = {
-        DeviceID,      OutlinedFnID, PointerNum,   BasePointersArray,
-        PointersArray, SizesArray,   MapTypesArray};
-    auto Return = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__tgt_target),
-                                      OffloadingArgs);
+    // Emit the number of elements in the offloading arrays.
+    llvm::Value *PointerNum = CGF.Builder.getInt32(BasePointers.size());
+
+    // Return value of the runtime offloading call.
+    llvm::Value *Return;
+
+    auto *NumTeams = emitNumTeamsClauseForTargetDirective(RT, CGF, D);
+    auto *ThreadLimit = emitThreadLimitClauseForTargetDirective(RT, CGF, D);
+
+    // If we have NumTeams defined this means that we have an enclosed teams
+    // region. Therefore we also expect to have ThreadLimit defined. These two
+    // values should be defined in the presence of a teams directive, regardless
+    // of having any clauses associated. If the user is using teams but no
+    // clauses, these two values will be the default that should be passed to
+    // the runtime library - a 32-bit integer with the value zero.
+    if (NumTeams) {
+      assert(ThreadLimit && "Thread limit expression should be available along "
+                            "with number of teams.");
+      llvm::Value *OffloadingArgs[] = {
+          DeviceID,           OutlinedFnID,
+          PointerNum,         Info.BasePointersArray,
+          Info.PointersArray, Info.SizesArray,
+          Info.MapTypesArray, NumTeams,
+          ThreadLimit};
+      Return = CGF.EmitRuntimeCall(
+          RT.createRuntimeFunction(OMPRTL__tgt_target_teams), OffloadingArgs);
+    } else {
+      llvm::Value *OffloadingArgs[] = {
+          DeviceID,           OutlinedFnID,
+          PointerNum,         Info.BasePointersArray,
+          Info.PointersArray, Info.SizesArray,
+          Info.MapTypesArray};
+      Return = CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target),
+                                   OffloadingArgs);
+    }
 
     CGF.EmitStoreOfScalar(Return, OffloadError);
   };
 
   // Notify that the host version must be executed.
-  auto &&ElseGen = [this, OffloadError,
-                    OffloadErrorQType](CodeGenFunction &CGF) {
-    CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/-1u),
+  auto &&ElseGen = [OffloadError](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/-1u),
                           OffloadError);
   };
 
@@ -4105,15 +6046,15 @@
   // regardless of the conditional in the if clause if, e.g., the user do not
   // specify target triples.
   if (OutlinedFnID) {
-    if (IfCond) {
+    if (IfCond)
       emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
-    } else {
-      CodeGenFunction::RunCleanupsScope Scope(CGF);
-      ThenGen(CGF);
+    else {
+      RegionCodeGenTy ThenRCG(ThenGen);
+      ThenRCG(CGF);
     }
   } else {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    ElseGen(CGF);
+    RegionCodeGenTy ElseRCG(ElseGen);
+    ElseRCG(CGF);
   }
 
   // Check the error code and execute the host version if required.
@@ -4124,11 +6065,10 @@
   CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
 
   CGF.EmitBlock(OffloadFailedBlock);
-  CGF.Builder.CreateCall(OutlinedFn, BasePointers);
+  CGF.Builder.CreateCall(OutlinedFn, KernelArgs);
   CGF.EmitBranch(OffloadContBlock);
 
   CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
-  return;
 }
 
 void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
@@ -4146,26 +6086,27 @@
     unsigned DeviceID;
     unsigned FileID;
     unsigned Line;
-    unsigned Column;
     getTargetEntryUniqueInfo(CGM.getContext(), E->getLocStart(), DeviceID,
-                             FileID, Line, Column);
+                             FileID, Line);
 
     // Is this a target region that should not be emitted as an entry point? If
     // so just signal we are done with this target region.
-    if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo(
-            DeviceID, FileID, ParentName, Line, Column))
+    if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo(DeviceID, FileID,
+                                                            ParentName, Line))
       return;
 
     llvm::Function *Fn;
     llvm::Constant *Addr;
-    emitTargetOutlinedFunction(*E, ParentName, Fn, Addr,
-                               /*isOffloadEntry=*/true);
+    std::tie(Fn, Addr) =
+        CodeGenFunction::EmitOMPTargetDirectiveOutlinedFunction(
+            CGM, cast<OMPTargetDirective>(*E), ParentName,
+            /*isOffloadEntry=*/true);
     assert(Fn && Addr && "Target region emission failed.");
     return;
   }
 
   if (const OMPExecutableDirective *E = dyn_cast<OMPExecutableDirective>(S)) {
-    if (!E->getAssociatedStmt())
+    if (!E->hasAssociatedStmt())
       return;
 
     scanForTargetRegionsFunctions(
@@ -4181,8 +6122,6 @@
   // Keep looking for target regions recursively.
   for (auto *II : S->children())
     scanForTargetRegionsFunctions(II, ParentName);
-
-  return;
 }
 
 bool CGOpenMPRuntime::emitTargetFunctions(GlobalDecl GD) {
@@ -4247,3 +6186,593 @@
   // compilation unit.
   return createOffloadingBinaryDescriptorRegistration();
 }
+
+void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
+                                    const OMPExecutableDirective &D,
+                                    SourceLocation Loc,
+                                    llvm::Value *OutlinedFn,
+                                    ArrayRef<llvm::Value *> CapturedVars) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  auto *RTLoc = emitUpdateLocation(CGF, Loc);
+  CodeGenFunction::RunCleanupsScope Scope(CGF);
+
+  // Build call __kmpc_fork_teams(loc, n, microtask, var1, .., varn);
+  llvm::Value *Args[] = {
+      RTLoc,
+      CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
+      CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy())};
+  llvm::SmallVector<llvm::Value *, 16> RealArgs;
+  RealArgs.append(std::begin(Args), std::end(Args));
+  RealArgs.append(CapturedVars.begin(), CapturedVars.end());
+
+  auto RTLFn = createRuntimeFunction(OMPRTL__kmpc_fork_teams);
+  CGF.EmitRuntimeCall(RTLFn, RealArgs);
+}
+
+void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
+                                         const Expr *NumTeams,
+                                         const Expr *ThreadLimit,
+                                         SourceLocation Loc) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  auto *RTLoc = emitUpdateLocation(CGF, Loc);
+
+  llvm::Value *NumTeamsVal =
+      (NumTeams)
+          ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams),
+                                      CGF.CGM.Int32Ty, /* isSigned = */ true)
+          : CGF.Builder.getInt32(0);
+
+  llvm::Value *ThreadLimitVal =
+      (ThreadLimit)
+          ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
+                                      CGF.CGM.Int32Ty, /* isSigned = */ true)
+          : CGF.Builder.getInt32(0);
+
+  // Build call __kmpc_push_num_teams(&loc, global_tid, num_teams, thread_limit)
+  llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
+                                     ThreadLimitVal};
+  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
+                      PushNumTeamsArgs);
+}
+
+void CGOpenMPRuntime::emitTargetDataCalls(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
+    const Expr *Device, const RegionCodeGenTy &CodeGen, TargetDataInfo &Info) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  // Action used to replace the default codegen action and turn privatization
+  // off.
+  PrePostActionTy NoPrivAction;
+
+  // Generate the code for the opening of the data environment. Info is
+  // captured by reference because the arrays it records are reused by the
+  // closing of the region. CGF is taken as the lambda parameter, not captured.
+  auto &&BeginThenGen = [&D, Device, &Info, &CodeGen](CodeGenFunction &CGF,
+                                                      PrePostActionTy &) {
+    // Fill up the arrays with all the mapped variables.
+    MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
+    MappableExprsHandler::MapValuesArrayTy Pointers;
+    MappableExprsHandler::MapValuesArrayTy Sizes;
+    MappableExprsHandler::MapFlagsArrayTy MapTypes;
+
+    // Get map clause information.
+    MappableExprsHandler MCHandler(D, CGF);
+    MCHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);
+
+    // Fill up the arrays and create the arguments.
+    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
+
+    llvm::Value *BasePointersArrayArg = nullptr;
+    llvm::Value *PointersArrayArg = nullptr;
+    llvm::Value *SizesArrayArg = nullptr;
+    llvm::Value *MapTypesArrayArg = nullptr;
+    emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
+                                 SizesArrayArg, MapTypesArrayArg, Info);
+
+    // Emit device ID if any.
+    llvm::Value *DeviceID = nullptr;
+    if (Device)
+      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
+                                           CGF.Int32Ty, /*isSigned=*/true);
+    else
+      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
+
+    // Emit the number of elements in the offloading arrays.
+    auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);
+
+    llvm::Value *OffloadingArgs[] = {
+        DeviceID,         PointerNum,    BasePointersArrayArg,
+        PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
+    auto &RT = CGF.CGM.getOpenMPRuntime();
+    CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_begin),
+                        OffloadingArgs);
+
+    // If device pointer privatization is required, emit the body of the region
+    // here. It will have to be duplicated: with and without privatization.
+    if (!Info.CaptureDeviceAddrMap.empty())
+      CodeGen(CGF);
+  };
+
+  // Generate code for the closing of the data region.
+  auto &&EndThenGen = [Device, &Info](CodeGenFunction &CGF,
+                                      PrePostActionTy &) {
+    assert(Info.isValid() && "Invalid data environment closing arguments.");
+
+    llvm::Value *BasePointersArrayArg = nullptr;
+    llvm::Value *PointersArrayArg = nullptr;
+    llvm::Value *SizesArrayArg = nullptr;
+    llvm::Value *MapTypesArrayArg = nullptr;
+    emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
+                                 SizesArrayArg, MapTypesArrayArg, Info);
+
+    // Emit device ID if any.
+    llvm::Value *DeviceID = nullptr;
+    if (Device)
+      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
+                                           CGF.Int32Ty, /*isSigned=*/true);
+    else
+      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
+
+    // Emit the number of elements in the offloading arrays.
+    auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);
+
+    llvm::Value *OffloadingArgs[] = {
+        DeviceID,         PointerNum,    BasePointersArrayArg,
+        PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
+    auto &RT = CGF.CGM.getOpenMPRuntime();
+    CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_end),
+                        OffloadingArgs);
+  };
+
+  // If we need device pointer privatization, we need to emit the body of the
+  // region with no privatization in the 'else' branch of the conditional.
+  // Otherwise, we don't have to do anything.
+  auto &&BeginElseGen = [&Info, &CodeGen, &NoPrivAction](CodeGenFunction &CGF,
+                                                         PrePostActionTy &) {
+    if (!Info.CaptureDeviceAddrMap.empty()) {
+      CodeGen.setAction(NoPrivAction);
+      CodeGen(CGF);
+    }
+  };
+
+  // We don't have to do anything to close the region if the if clause evaluates
+  // to false.
+  auto &&EndElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, BeginThenGen, BeginElseGen);
+  } else {
+    RegionCodeGenTy RCG(BeginThenGen);
+    RCG(CGF);
+  }
+
+  // If we don't require privatization of device pointers, we emit the body in
+  // between the runtime calls. This avoids duplicating the body code.
+  if (Info.CaptureDeviceAddrMap.empty()) {
+    CodeGen.setAction(NoPrivAction);
+    CodeGen(CGF);
+  }
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, EndThenGen, EndElseGen);
+  } else {
+    RegionCodeGenTy RCG(EndThenGen);
+    RCG(CGF);
+  }
+}
+
+void CGOpenMPRuntime::emitTargetDataStandAloneCall(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
+    const Expr *Device) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  assert((isa<OMPTargetEnterDataDirective>(D) ||
+          isa<OMPTargetExitDataDirective>(D) ||
+          isa<OMPTargetUpdateDirective>(D)) &&
+         "Expecting either target enter, exit data, or update directives.");
+
+  // Generate the runtime call that implements the standalone directive.
+  auto &&ThenGen = [&D, Device](CodeGenFunction &CGF, PrePostActionTy &) {
+    // Fill up the arrays with all the mapped variables.
+    MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
+    MappableExprsHandler::MapValuesArrayTy Pointers;
+    MappableExprsHandler::MapValuesArrayTy Sizes;
+    MappableExprsHandler::MapFlagsArrayTy MapTypes;
+
+    // Get map clause information.
+    MappableExprsHandler MEHandler(D, CGF);
+    MEHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);
+
+    // Fill up the arrays and create the arguments.
+    TargetDataInfo Info;
+    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
+    emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
+                                 Info.PointersArray, Info.SizesArray,
+                                 Info.MapTypesArray, Info);
+
+    // Emit device ID if any.
+    llvm::Value *DeviceID = nullptr;
+    if (Device)
+      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
+                                           CGF.Int32Ty, /*isSigned=*/true);
+    else
+      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
+
+    // Emit the number of elements in the offloading arrays.
+    auto *PointerNum = CGF.Builder.getInt32(BasePointers.size());
+
+    llvm::Value *OffloadingArgs[] = {
+        DeviceID,           PointerNum,      Info.BasePointersArray,
+        Info.PointersArray, Info.SizesArray, Info.MapTypesArray};
+
+    auto &RT = CGF.CGM.getOpenMPRuntime();
+    // Select the right runtime function call for each expected standalone
+    // directive.
+    OpenMPRTLFunction RTLFn;
+    switch (D.getDirectiveKind()) {
+    default:
+      llvm_unreachable("Unexpected standalone target data directive.");
+      break;
+    case OMPD_target_enter_data:
+      RTLFn = OMPRTL__tgt_target_data_begin;
+      break;
+    case OMPD_target_exit_data:
+      RTLFn = OMPRTL__tgt_target_data_end;
+      break;
+    case OMPD_target_update:
+      RTLFn = OMPRTL__tgt_target_data_update;
+      break;
+    }
+    CGF.EmitRuntimeCall(RT.createRuntimeFunction(RTLFn), OffloadingArgs);
+  };
+
+  // In the event we get an if clause, we don't have to take any action on the
+  // else side.
+  auto &&ElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
+  } else {
+    RegionCodeGenTy ThenGenRCG(ThenGen);
+    ThenGenRCG(CGF);
+  }
+}
+
+namespace {
+  /// Kind of parameter in a function with 'declare simd' directive.
+  enum ParamKindTy { LinearWithVarStride, Linear, Uniform, Vector };
+  /// Attribute set of the parameter.
+  struct ParamAttrTy {
+    ParamKindTy Kind = Vector;
+    llvm::APSInt StrideOrArg;
+    llvm::APSInt Alignment;
+  };
+} // namespace
+
+static unsigned evaluateCDTSize(const FunctionDecl *FD,
+                                ArrayRef<ParamAttrTy> ParamAttrs) {
+  // Every vector variant of a SIMD-enabled function has a vector length (VLEN).
+  // If OpenMP clause "simdlen" is used, the VLEN is the value of the argument
+  // of that clause. The VLEN value must be a power of 2.
+  // Otherwise the notion of the function's "characteristic data type" (CDT)
+  // is used to compute the vector length.
+  // CDT is defined in the following order:
+  //   a) For non-void function, the CDT is the return type.
+  //   b) If the function has any non-uniform, non-linear parameters, then the
+  //   CDT is the type of the first such parameter.
+  //   c) If the CDT determined by a) or b) above is struct, union, or class
+  //   type which is pass-by-value (except for the type that maps to the
+  //   built-in complex data type), the characteristic data type is int.
+  //   d) If none of the above three cases is applicable, the CDT is int.
+  // The VLEN is then determined based on the CDT and the size of vector
+  // register of that ISA for which current vector version is generated. The
+  // VLEN is computed using the formula below:
+  //   VLEN  = sizeof(vector_register) / sizeof(CDT),
+  // where the vector register size is specified in section 3.2.1 "Registers
+  // and the Stack Frame" of the original AMD64 ABI document.
+  QualType RetType = FD->getReturnType();
+  if (RetType.isNull())
+    return 0;
+  ASTContext &C = FD->getASTContext();
+  QualType CDT;
+  if (!RetType->isVoidType())
+    CDT = RetType;
+  else {
+    unsigned Offset = 0;
+    if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
+      if (ParamAttrs[Offset].Kind == Vector)
+        CDT = C.getPointerType(C.getRecordType(MD->getParent()));
+      ++Offset;
+    }
+    if (CDT.isNull()) {
+      for (unsigned I = 0, E = FD->getNumParams(); I < E; ++I) {
+        if (ParamAttrs[I + Offset].Kind == Vector) {
+          CDT = FD->getParamDecl(I)->getType();
+          break;
+        }
+      }
+    }
+  }
+  if (CDT.isNull())
+    CDT = C.IntTy;
+  CDT = CDT->getCanonicalTypeUnqualified();
+  if (CDT->isRecordType())
+    CDT = C.IntTy;
+  return C.getTypeSize(CDT);
+}
+
+static void
+emitX86DeclareSimdFunction(const FunctionDecl *FD, llvm::Function *Fn,
+                           llvm::APSInt VLENVal,
+                           ArrayRef<ParamAttrTy> ParamAttrs,
+                           OMPDeclareSimdDeclAttr::BranchStateTy State) {
+  struct ISADataTy {
+    char ISA;
+    unsigned VecRegSize;
+  };
+  ISADataTy ISAData[] = {
+      {
+          'b', 128
+      }, // SSE
+      {
+          'c', 256
+      }, // AVX
+      {
+          'd', 256
+      }, // AVX2
+      {
+          'e', 512
+      }, // AVX512
+  };
+  llvm::SmallVector<char, 2> Masked;
+  switch (State) {
+  case OMPDeclareSimdDeclAttr::BS_Undefined:
+    Masked.push_back('N');
+    Masked.push_back('M');
+    break;
+  case OMPDeclareSimdDeclAttr::BS_Notinbranch:
+    Masked.push_back('N');
+    break;
+  case OMPDeclareSimdDeclAttr::BS_Inbranch:
+    Masked.push_back('M');
+    break;
+  }
+  for (auto Mask : Masked) {
+    for (auto &Data : ISAData) {
+      SmallString<256> Buffer;
+      llvm::raw_svector_ostream Out(Buffer);
+      Out << "_ZGV" << Data.ISA << Mask;
+      if (!VLENVal) {
+        Out << llvm::APSInt::getUnsigned(Data.VecRegSize /
+                                         evaluateCDTSize(FD, ParamAttrs));
+      } else
+        Out << VLENVal;
+      for (auto &ParamAttr : ParamAttrs) {
+        switch (ParamAttr.Kind){
+        case LinearWithVarStride:
+          Out << 's' << ParamAttr.StrideOrArg;
+          break;
+        case Linear:
+          Out << 'l';
+          if (!!ParamAttr.StrideOrArg)
+            Out << ParamAttr.StrideOrArg;
+          break;
+        case Uniform:
+          Out << 'u';
+          break;
+        case Vector:
+          Out << 'v';
+          break;
+        }
+        if (!!ParamAttr.Alignment)
+          Out << 'a' << ParamAttr.Alignment;
+      }
+      Out << '_' << Fn->getName();
+      Fn->addFnAttr(Out.str());
+    }
+  }
+}
+
+void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
+                                              llvm::Function *Fn) {
+  ASTContext &C = CGM.getContext();
+  FD = FD->getCanonicalDecl();
+  // Map params to their positions in function decl.
+  llvm::DenseMap<const Decl *, unsigned> ParamPositions;
+  if (isa<CXXMethodDecl>(FD))
+    ParamPositions.insert({FD, 0});
+  unsigned ParamPos = ParamPositions.size();
+  for (auto *P : FD->parameters()) {
+    ParamPositions.insert({P->getCanonicalDecl(), ParamPos});
+    ++ParamPos;
+  }
+  for (auto *Attr : FD->specific_attrs<OMPDeclareSimdDeclAttr>()) {
+    llvm::SmallVector<ParamAttrTy, 8> ParamAttrs(ParamPositions.size());
+    // Mark uniform parameters.
+    for (auto *E : Attr->uniforms()) {
+      E = E->IgnoreParenImpCasts();
+      unsigned Pos;
+      if (isa<CXXThisExpr>(E))
+        Pos = ParamPositions[FD];
+      else {
+        auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
+                        ->getCanonicalDecl();
+        Pos = ParamPositions[PVD];
+      }
+      ParamAttrs[Pos].Kind = Uniform;
+    }
+    // Get alignment info.
+    auto NI = Attr->alignments_begin();
+    for (auto *E : Attr->aligneds()) {
+      E = E->IgnoreParenImpCasts();
+      unsigned Pos;
+      QualType ParmTy;
+      if (isa<CXXThisExpr>(E)) {
+        Pos = ParamPositions[FD];
+        ParmTy = E->getType();
+      } else {
+        auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
+                        ->getCanonicalDecl();
+        Pos = ParamPositions[PVD];
+        ParmTy = PVD->getType();
+      }
+      ParamAttrs[Pos].Alignment =
+          (*NI) ? (*NI)->EvaluateKnownConstInt(C)
+                : llvm::APSInt::getUnsigned(
+                      C.toCharUnitsFromBits(C.getOpenMPDefaultSimdAlign(ParmTy))
+                          .getQuantity());
+      ++NI;
+    }
+    // Mark linear parameters.
+    auto SI = Attr->steps_begin();
+    auto MI = Attr->modifiers_begin();
+    for (auto *E : Attr->linears()) {
+      E = E->IgnoreParenImpCasts();
+      unsigned Pos;
+      if (isa<CXXThisExpr>(E))
+        Pos = ParamPositions[FD];
+      else {
+        auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
+                        ->getCanonicalDecl();
+        Pos = ParamPositions[PVD];
+      }
+      auto &ParamAttr = ParamAttrs[Pos];
+      ParamAttr.Kind = Linear;
+      if (*SI) {
+        if (!(*SI)->EvaluateAsInt(ParamAttr.StrideOrArg, C,
+                                  Expr::SE_AllowSideEffects)) {
+          if (auto *DRE = cast<DeclRefExpr>((*SI)->IgnoreParenImpCasts())) {
+            if (auto *StridePVD = cast<ParmVarDecl>(DRE->getDecl())) {
+              ParamAttr.Kind = LinearWithVarStride;
+              ParamAttr.StrideOrArg = llvm::APSInt::getUnsigned(
+                  ParamPositions[StridePVD->getCanonicalDecl()]);
+            }
+          }
+        }
+      }
+      ++SI;
+      ++MI;
+    }
+    llvm::APSInt VLENVal;
+    if (const Expr *VLEN = Attr->getSimdlen())
+      VLENVal = VLEN->EvaluateKnownConstInt(C);
+    OMPDeclareSimdDeclAttr::BranchStateTy State = Attr->getBranchState();
+    if (CGM.getTriple().getArch() == llvm::Triple::x86 ||
+        CGM.getTriple().getArch() == llvm::Triple::x86_64)
+      emitX86DeclareSimdFunction(FD, Fn, VLENVal, ParamAttrs, State);
+  }
+}
+
+namespace {
+/// Cleanup action for doacross support.
+class DoacrossCleanupTy final : public EHScopeStack::Cleanup {
+public:
+  static const int DoacrossFinArgs = 2;
+
+private:
+  llvm::Value *RTLFn;
+  llvm::Value *Args[DoacrossFinArgs];
+
+public:
+  DoacrossCleanupTy(llvm::Value *RTLFn, ArrayRef<llvm::Value *> CallArgs)
+      : RTLFn(RTLFn) {
+    assert(CallArgs.size() == DoacrossFinArgs);
+    std::copy(CallArgs.begin(), CallArgs.end(), std::begin(Args));
+  }
+  void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
+    if (!CGF.HaveInsertPoint())
+      return;
+    CGF.EmitRuntimeCall(RTLFn, Args);
+  }
+};
+} // namespace
+
+void CGOpenMPRuntime::emitDoacrossInit(CodeGenFunction &CGF,
+                                       const OMPLoopDirective &D) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  ASTContext &C = CGM.getContext();
+  QualType Int64Ty = C.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/true);
+  RecordDecl *RD;
+  if (KmpDimTy.isNull()) {
+    // Build struct kmp_dim {  // loop bounds info casted to kmp_int64
+    //  kmp_int64 lo; // lower
+    //  kmp_int64 up; // upper
+    //  kmp_int64 st; // stride
+    // };
+    RD = C.buildImplicitRecord("kmp_dim");
+    RD->startDefinition();
+    addFieldToRecordDecl(C, RD, Int64Ty);
+    addFieldToRecordDecl(C, RD, Int64Ty);
+    addFieldToRecordDecl(C, RD, Int64Ty);
+    RD->completeDefinition();
+    KmpDimTy = C.getRecordType(RD);
+  } else
+    RD = cast<RecordDecl>(KmpDimTy->getAsTagDecl());
+
+  Address DimsAddr = CGF.CreateMemTemp(KmpDimTy, "dims");
+  CGF.EmitNullInitialization(DimsAddr, KmpDimTy);
+  enum { LowerFD = 0, UpperFD, StrideFD };
+  // Fill dims with data.
+  LValue DimsLVal = CGF.MakeAddrLValue(DimsAddr, KmpDimTy);
+  // dims.upper = num_iterations;
+  LValue UpperLVal =
+      CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), UpperFD));
+  llvm::Value *NumIterVal = CGF.EmitScalarConversion(
+      CGF.EmitScalarExpr(D.getNumIterations()), D.getNumIterations()->getType(),
+      Int64Ty, D.getNumIterations()->getExprLoc());
+  CGF.EmitStoreOfScalar(NumIterVal, UpperLVal);
+  // dims.stride = 1;
+  LValue StrideLVal =
+      CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), StrideFD));
+  CGF.EmitStoreOfScalar(llvm::ConstantInt::getSigned(CGM.Int64Ty, /*V=*/1),
+                        StrideLVal);
+
+  // Build call void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
+  // kmp_int32 num_dims, struct kmp_dim * dims);
+  llvm::Value *Args[] = {emitUpdateLocation(CGF, D.getLocStart()),
+                         getThreadID(CGF, D.getLocStart()),
+                         llvm::ConstantInt::getSigned(CGM.Int32Ty, 1),
+                         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                             DimsAddr.getPointer(), CGM.VoidPtrTy)};
+
+  llvm::Value *RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_init);
+  CGF.EmitRuntimeCall(RTLFn, Args);
+  llvm::Value *FiniArgs[DoacrossCleanupTy::DoacrossFinArgs] = {
+      emitUpdateLocation(CGF, D.getLocEnd()), getThreadID(CGF, D.getLocEnd())};
+  llvm::Value *FiniRTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_fini);
+  CGF.EHStack.pushCleanup<DoacrossCleanupTy>(NormalAndEHCleanup, FiniRTLFn,
+                                             llvm::makeArrayRef(FiniArgs));
+}
+
+void CGOpenMPRuntime::emitDoacrossOrdered(CodeGenFunction &CGF,
+                                          const OMPDependClause *C) {
+  QualType Int64Ty =
+      CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
+  const Expr *CounterVal = C->getCounterValue();
+  assert(CounterVal);
+  llvm::Value *CntVal = CGF.EmitScalarConversion(CGF.EmitScalarExpr(CounterVal),
+                                                 CounterVal->getType(), Int64Ty,
+                                                 CounterVal->getExprLoc());
+  Address CntAddr = CGF.CreateMemTemp(Int64Ty, ".cnt.addr");
+  CGF.EmitStoreOfScalar(CntVal, CntAddr, /*Volatile=*/false, Int64Ty);
+  llvm::Value *Args[] = {emitUpdateLocation(CGF, C->getLocStart()),
+                         getThreadID(CGF, C->getLocStart()),
+                         CntAddr.getPointer()};
+  llvm::Value *RTLFn;
+  if (C->getDependencyKind() == OMPC_DEPEND_source)
+    RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_post);
+  else {
+    assert(C->getDependencyKind() == OMPC_DEPEND_sink);
+    RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_wait);
+  }
+  CGF.EmitRuntimeCall(RTLFn, Args);
+}
+
diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h
index b325637..9057e5e 100644
--- a/lib/CodeGen/CGOpenMPRuntime.h
+++ b/lib/CodeGen/CGOpenMPRuntime.h
@@ -14,18 +14,19 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIME_H
 
+#include "CGValue.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/OpenMPKinds.h"
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/ValueHandle.h"
 
 namespace llvm {
 class ArrayType;
 class Constant;
-class Function;
 class FunctionType;
 class GlobalVariable;
 class StructType;
@@ -36,222 +37,116 @@
 namespace clang {
 class Expr;
 class GlobalDecl;
+class OMPDependClause;
 class OMPExecutableDirective;
+class OMPLoopDirective;
 class VarDecl;
+class OMPDeclareReductionDecl;
+class IdentifierInfo;
 
 namespace CodeGen {
 class Address;
 class CodeGenFunction;
 class CodeGenModule;
 
-typedef llvm::function_ref<void(CodeGenFunction &)> RegionCodeGenTy;
+/// A basic class for pre|post-action for advanced codegen sequence for OpenMP
+/// region.
+class PrePostActionTy {
+public:
+  explicit PrePostActionTy() {}
+  virtual void Enter(CodeGenFunction &CGF) {}
+  virtual void Exit(CodeGenFunction &CGF) {}
+  virtual ~PrePostActionTy() {}
+};
+
+/// Class provides a way to call simple version of codegen for OpenMP region, or
+/// an advanced with possible pre|post-actions in codegen.
+class RegionCodeGenTy final {
+  intptr_t CodeGen;
+  typedef void (*CodeGenTy)(intptr_t, CodeGenFunction &, PrePostActionTy &);
+  CodeGenTy Callback;
+  mutable PrePostActionTy *PrePostAction;
+  RegionCodeGenTy() = delete;
+  RegionCodeGenTy &operator=(const RegionCodeGenTy &) = delete;
+  template <typename Callable>
+  static void CallbackFn(intptr_t CodeGen, CodeGenFunction &CGF,
+                         PrePostActionTy &Action) {
+    return (*reinterpret_cast<Callable *>(CodeGen))(CGF, Action);
+  }
+
+public:
+  template <typename Callable>
+  RegionCodeGenTy(
+      Callable &&CodeGen,
+      typename std::enable_if<
+          !std::is_same<typename std::remove_reference<Callable>::type,
+                        RegionCodeGenTy>::value>::type * = nullptr)
+      : CodeGen(reinterpret_cast<intptr_t>(&CodeGen)),
+        Callback(CallbackFn<typename std::remove_reference<Callable>::type>),
+        PrePostAction(nullptr) {}
+  void setAction(PrePostActionTy &Action) const { PrePostAction = &Action; }
+  void operator()(CodeGenFunction &CGF) const;
+};
+
+struct OMPTaskDataTy final {
+  SmallVector<const Expr *, 4> PrivateVars;
+  SmallVector<const Expr *, 4> PrivateCopies;
+  SmallVector<const Expr *, 4> FirstprivateVars;
+  SmallVector<const Expr *, 4> FirstprivateCopies;
+  SmallVector<const Expr *, 4> FirstprivateInits;
+  SmallVector<const Expr *, 4> LastprivateVars;
+  SmallVector<const Expr *, 4> LastprivateCopies;
+  SmallVector<std::pair<OpenMPDependClauseKind, const Expr *>, 4> Dependences;
+  llvm::PointerIntPair<llvm::Value *, 1, bool> Final;
+  llvm::PointerIntPair<llvm::Value *, 1, bool> Schedule;
+  llvm::PointerIntPair<llvm::Value *, 1, bool> Priority;
+  unsigned NumberOfParts = 0;
+  bool Tied = true;
+  bool Nogroup = false;
+};
 
 class CGOpenMPRuntime {
-private:
-  enum OpenMPRTLFunction {
-    /// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
-    /// kmpc_micro microtask, ...);
-    OMPRTL__kmpc_fork_call,
-    /// \brief Call to void *__kmpc_threadprivate_cached(ident_t *loc,
-    /// kmp_int32 global_tid, void *data, size_t size, void ***cache);
-    OMPRTL__kmpc_threadprivate_cached,
-    /// \brief Call to void __kmpc_threadprivate_register( ident_t *,
-    /// void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
-    OMPRTL__kmpc_threadprivate_register,
-    // Call to __kmpc_int32 kmpc_global_thread_num(ident_t *loc);
-    OMPRTL__kmpc_global_thread_num,
-    // Call to void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
-    // kmp_critical_name *crit);
-    OMPRTL__kmpc_critical,
-    // Call to void __kmpc_critical_with_hint(ident_t *loc, kmp_int32
-    // global_tid, kmp_critical_name *crit, uintptr_t hint);
-    OMPRTL__kmpc_critical_with_hint,
-    // Call to void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
-    // kmp_critical_name *crit);
-    OMPRTL__kmpc_end_critical,
-    // Call to kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
-    // global_tid);
-    OMPRTL__kmpc_cancel_barrier,
-    // Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
-    OMPRTL__kmpc_barrier,
-    // Call to void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
-    OMPRTL__kmpc_for_static_fini,
-    // Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
-    // global_tid);
-    OMPRTL__kmpc_serialized_parallel,
-    // Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
-    // global_tid);
-    OMPRTL__kmpc_end_serialized_parallel,
-    // Call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
-    // kmp_int32 num_threads);
-    OMPRTL__kmpc_push_num_threads,
-    // Call to void __kmpc_flush(ident_t *loc);
-    OMPRTL__kmpc_flush,
-    // Call to kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
-    OMPRTL__kmpc_master,
-    // Call to void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
-    OMPRTL__kmpc_end_master,
-    // Call to kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
-    // int end_part);
-    OMPRTL__kmpc_omp_taskyield,
-    // Call to kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
-    OMPRTL__kmpc_single,
-    // Call to void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
-    OMPRTL__kmpc_end_single,
-    // Call to kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
-    // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
-    // kmp_routine_entry_t *task_entry);
-    OMPRTL__kmpc_omp_task_alloc,
-    // Call to kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t *
-    // new_task);
-    OMPRTL__kmpc_omp_task,
-    // Call to void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
-    // size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
-    // kmp_int32 didit);
-    OMPRTL__kmpc_copyprivate,
-    // Call to kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
-    // kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
-    // (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
-    OMPRTL__kmpc_reduce,
-    // Call to kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
-    // global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
-    // void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
-    // *lck);
-    OMPRTL__kmpc_reduce_nowait,
-    // Call to void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
-    // kmp_critical_name *lck);
-    OMPRTL__kmpc_end_reduce,
-    // Call to void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
-    // kmp_critical_name *lck);
-    OMPRTL__kmpc_end_reduce_nowait,
-    // Call to void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
-    // kmp_task_t * new_task);
-    OMPRTL__kmpc_omp_task_begin_if0,
-    // Call to void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
-    // kmp_task_t * new_task);
-    OMPRTL__kmpc_omp_task_complete_if0,
-    // Call to void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
-    OMPRTL__kmpc_ordered,
-    // Call to void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
-    OMPRTL__kmpc_end_ordered,
-    // Call to kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
-    // global_tid);
-    OMPRTL__kmpc_omp_taskwait,
-    // Call to void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
-    OMPRTL__kmpc_taskgroup,
-    // Call to void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
-    OMPRTL__kmpc_end_taskgroup,
-    // Call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
-    // int proc_bind);
-    OMPRTL__kmpc_push_proc_bind,
-    // Call to kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32
-    // gtid, kmp_task_t * new_task, kmp_int32 ndeps, kmp_depend_info_t
-    // *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
-    OMPRTL__kmpc_omp_task_with_deps,
-    // Call to void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32
-    // gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
-    // ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
-    OMPRTL__kmpc_omp_wait_deps,
-    // Call to kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
-    // global_tid, kmp_int32 cncl_kind);
-    OMPRTL__kmpc_cancellationpoint,
-    // Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
-    // kmp_int32 cncl_kind);
-    OMPRTL__kmpc_cancel,
-
-    //
-    // Offloading related calls
-    //
-    // Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
-    // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
-    // *arg_types);
-    OMPRTL__tgt_target,
-    // Call to void __tgt_register_lib(__tgt_bin_desc *desc);
-    OMPRTL__tgt_register_lib,
-    // Call to void __tgt_unregister_lib(__tgt_bin_desc *desc);
-    OMPRTL__tgt_unregister_lib,
-  };
-
-  /// \brief Values for bit flags used in the ident_t to describe the fields.
-  /// All enumeric elements are named and described in accordance with the code
-  /// from http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
-  enum OpenMPLocationFlags {
-    /// \brief Use trampoline for internal microtask.
-    OMP_IDENT_IMD = 0x01,
-    /// \brief Use c-style ident structure.
-    OMP_IDENT_KMPC = 0x02,
-    /// \brief Atomic reduction option for kmpc_reduce.
-    OMP_ATOMIC_REDUCE = 0x10,
-    /// \brief Explicit 'barrier' directive.
-    OMP_IDENT_BARRIER_EXPL = 0x20,
-    /// \brief Implicit barrier in code.
-    OMP_IDENT_BARRIER_IMPL = 0x40,
-    /// \brief Implicit barrier in 'for' directive.
-    OMP_IDENT_BARRIER_IMPL_FOR = 0x40,
-    /// \brief Implicit barrier in 'sections' directive.
-    OMP_IDENT_BARRIER_IMPL_SECTIONS = 0xC0,
-    /// \brief Implicit barrier in 'single' directive.
-    OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
-  };
+protected:
   CodeGenModule &CGM;
+
+  /// \brief Creates offloading entry for the provided entry ID \a ID,
+  /// address \a Addr and size \a Size.
+  virtual void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                                  uint64_t Size);
+
+  /// \brief Helper to emit outlined function for 'target' directive.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Lambda codegen specific to an accelerator device.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D,
+                                                StringRef ParentName,
+                                                llvm::Function *&OutlinedFn,
+                                                llvm::Constant *&OutlinedFnID,
+                                                bool IsOffloadEntry,
+                                                const RegionCodeGenTy &CodeGen);
+
+private:
   /// \brief Default const ident_t object used for initialization of all other
   /// ident_t objects.
-  llvm::Constant *DefaultOpenMPPSource;
+  llvm::Constant *DefaultOpenMPPSource = nullptr;
   /// \brief Map of flags and corresponding default locations.
   typedef llvm::DenseMap<unsigned, llvm::Value *> OpenMPDefaultLocMapTy;
   OpenMPDefaultLocMapTy OpenMPDefaultLocMap;
-  Address getOrCreateDefaultLocation(OpenMPLocationFlags Flags);
+  Address getOrCreateDefaultLocation(unsigned Flags);
 
-public:
-  /// \brief Describes ident structure that describes a source location.
-  /// All descriptions are taken from
-  /// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
-  /// Original structure:
-  /// typedef struct ident {
-  ///    kmp_int32 reserved_1;   /**<  might be used in Fortran;
-  ///                                  see above  */
-  ///    kmp_int32 flags;        /**<  also f.flags; KMP_IDENT_xxx flags;
-  ///                                  KMP_IDENT_KMPC identifies this union
-  ///                                  member  */
-  ///    kmp_int32 reserved_2;   /**<  not really used in Fortran any more;
-  ///                                  see above */
-  ///#if USE_ITT_BUILD
-  ///                            /*  but currently used for storing
-  ///                                region-specific ITT */
-  ///                            /*  contextual information. */
-  ///#endif /* USE_ITT_BUILD */
-  ///    kmp_int32 reserved_3;   /**< source[4] in Fortran, do not use for
-  ///                                 C++  */
-  ///    char const *psource;    /**< String describing the source location.
-  ///                            The string is composed of semi-colon separated
-  //                             fields which describe the source file,
-  ///                            the function and a pair of line numbers that
-  ///                            delimit the construct.
-  ///                             */
-  /// } ident_t;
-  enum IdentFieldIndex {
-    /// \brief might be used in Fortran
-    IdentField_Reserved_1,
-    /// \brief OMP_IDENT_xxx flags; OMP_IDENT_KMPC identifies this union member.
-    IdentField_Flags,
-    /// \brief Not really used in Fortran any more
-    IdentField_Reserved_2,
-    /// \brief Source[4] in Fortran, do not use for C++
-    IdentField_Reserved_3,
-    /// \brief String describing the source location. The string is composed of
-    /// semi-colon separated fields which describe the source file, the function
-    /// and a pair of line numbers that delimit the construct.
-    IdentField_PSource
-  };
-private:
-  llvm::StructType *IdentTy;
+  llvm::StructType *IdentTy = nullptr;
   /// \brief Map for SourceLocation and OpenMP runtime library debug locations.
   typedef llvm::DenseMap<unsigned, llvm::Value *> OpenMPDebugLocMapTy;
   OpenMPDebugLocMapTy OpenMPDebugLocMap;
   /// \brief The type for a microtask which gets passed to __kmpc_fork_call().
   /// Original representation is:
   /// typedef void (kmpc_micro)(kmp_int32 global_tid, kmp_int32 bound_tid,...);
-  llvm::FunctionType *Kmpc_MicroTy;
+  llvm::FunctionType *Kmpc_MicroTy = nullptr;
   /// \brief Stores debug location and ThreadID for the function.
   struct DebugLocThreadIdTy {
     llvm::Value *DebugLoc;
@@ -261,6 +156,20 @@
   typedef llvm::DenseMap<llvm::Function *, DebugLocThreadIdTy>
       OpenMPLocThreadIDMapTy;
   OpenMPLocThreadIDMapTy OpenMPLocThreadIDMap;
+  /// Map of UDRs and corresponding combiner/initializer.
+  typedef llvm::DenseMap<const OMPDeclareReductionDecl *,
+                         std::pair<llvm::Function *, llvm::Function *>>
+      UDRMapTy;
+  UDRMapTy UDRMap;
+  /// Map of functions and locally defined UDRs.
+  typedef llvm::DenseMap<llvm::Function *,
+                         SmallVector<const OMPDeclareReductionDecl *, 4>>
+      FunctionUDRMapTy;
+  FunctionUDRMapTy FunctionUDRMap;
+  IdentifierInfo *In = nullptr;
+  IdentifierInfo *Out = nullptr;
+  IdentifierInfo *Priv = nullptr;
+  IdentifierInfo *Orig = nullptr;
   /// \brief Type kmp_critical_name, originally defined as typedef kmp_int32
   /// kmp_critical_name[8];
   llvm::ArrayType *KmpCriticalNameTy;
@@ -272,7 +181,7 @@
   llvm::StringMap<llvm::AssertingVH<llvm::Constant>, llvm::BumpPtrAllocator>
       InternalVars;
   /// \brief Type typedef kmp_int32 (* kmp_routine_entry_t)(kmp_int32, void *);
-  llvm::Type *KmpRoutineEntryPtrTy;
+  llvm::Type *KmpRoutineEntryPtrTy = nullptr;
   QualType KmpRoutineEntryPtrQTy;
   /// \brief Type typedef struct kmp_task {
   ///    void *              shareds; /**< pointer to block of pointers to
@@ -293,6 +202,12 @@
   ///    } flags;
   /// } kmp_depend_info_t;
   QualType KmpDependInfoTy;
+  /// struct kmp_dim {  // loop bounds info casted to kmp_int64
+  ///  kmp_int64 lo; // lower
+  ///  kmp_int64 up; // upper
+  ///  kmp_int64 st; // stride
+  /// };
+  QualType KmpDimTy;
   /// \brief Type struct __tgt_offload_entry{
   ///   void      *addr;       // Pointer to the offload entry info.
   ///                          // (function or global)
@@ -402,30 +317,27 @@
     /// \brief Initialize target region entry.
     void initializeTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
                                          StringRef ParentName, unsigned LineNum,
-                                         unsigned ColNum, unsigned Order);
+                                         unsigned Order);
     /// \brief Register target region entry.
     void registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
                                        StringRef ParentName, unsigned LineNum,
-                                       unsigned ColNum, llvm::Constant *Addr,
+                                       llvm::Constant *Addr,
                                        llvm::Constant *ID);
     /// \brief Return true if a target region entry with the provided
     /// information exists.
     bool hasTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
-                                  StringRef ParentName, unsigned LineNum,
-                                  unsigned ColNum) const;
+                                  StringRef ParentName, unsigned LineNum) const;
     /// brief Applies action \a Action on all registered entries.
     typedef llvm::function_ref<void(unsigned, unsigned, StringRef, unsigned,
-                                    unsigned, OffloadEntryInfoTargetRegion &)>
+                                    OffloadEntryInfoTargetRegion &)>
         OffloadTargetRegionEntryInfoActTy;
     void actOnTargetRegionEntriesInfo(
         const OffloadTargetRegionEntryInfoActTy &Action);
 
   private:
     // Storage for target region entries kind. The storage is to be indexed by
-    // file ID, device ID, parent function name, lane number, and column number.
+    // file ID, device ID, parent function name and line number.
     typedef llvm::DenseMap<unsigned, OffloadEntryInfoTargetRegion>
-        OffloadEntriesTargetRegionPerColumn;
-    typedef llvm::DenseMap<unsigned, OffloadEntriesTargetRegionPerColumn>
         OffloadEntriesTargetRegionPerLine;
     typedef llvm::StringMap<OffloadEntriesTargetRegionPerLine>
         OffloadEntriesTargetRegionPerParentName;
@@ -442,10 +354,6 @@
   /// compilation unit. The function that does the registration is returned.
   llvm::Function *createOffloadingBinaryDescriptorRegistration();
 
-  /// \brief Creates offloading entry for the provided address \a Addr,
-  /// name \a Name and size \a Size.
-  void createOffloadEntry(llvm::Constant *Addr, StringRef Name, uint64_t Size);
-
   /// \brief Creates all the offload entries in the current compilation unit
   /// along with the associated metadata.
   void createOffloadEntriesAndInfoMetadata();
@@ -476,7 +384,7 @@
   /// \param Flags Flags for OpenMP location.
   ///
   llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
-                                  OpenMPLocationFlags Flags = OMP_IDENT_KMPC);
+                                  unsigned Flags = 0);
 
   /// \brief Returns pointer to ident_t type.
   llvm::Type *getIdentTyPointerTy();
@@ -487,7 +395,7 @@
   /// \brief Returns specified OpenMP runtime function.
   /// \param Function OpenMP runtime function.
   /// \return Specified function.
-  llvm::Constant *createRuntimeFunction(OpenMPRTLFunction Function);
+  llvm::Constant *createRuntimeFunction(unsigned Function);
 
   /// \brief Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned.
@@ -530,7 +438,7 @@
                                               const llvm::Twine &Name);
 
   /// \brief Set of threadprivate variables with the generated initializer.
-  llvm::DenseSet<const VarDecl *> ThreadPrivateWithDefinition;
+  llvm::SmallPtrSet<const VarDecl *, 4> ThreadPrivateWithDefinition;
 
   /// \brief Emits initialization code for the threadprivate variables.
   /// \param VDAddr Address of the global variable \a VD.
@@ -549,11 +457,52 @@
   ///
   llvm::Value *getCriticalRegionLock(StringRef CriticalName);
 
+  struct TaskResultTy {
+    llvm::Value *NewTask = nullptr;
+    llvm::Value *TaskEntry = nullptr;
+    llvm::Value *NewTaskNewTaskTTy = nullptr;
+    LValue TDBase;
+    RecordDecl *KmpTaskTQTyRD = nullptr;
+    llvm::Value *TaskDupFn = nullptr;
+  };
+  /// Emit task region for the task directive. The task region is emitted in
+  /// several steps:
+  /// 1. Emit a call to kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32
+  /// gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+  /// kmp_routine_entry_t *task_entry). Here task_entry is a pointer to the
+  /// function:
+  /// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) {
+  ///   TaskFunction(gtid, tt->part_id, tt->shareds);
+  ///   return 0;
+  /// }
+  /// 2. Copy a list of shared variables to field shareds of the resulting
+  /// structure kmp_task_t returned by the previous call (if any).
+  /// 3. Copy a pointer to destructions function to field destructions of the
+  /// resulting structure kmp_task_t.
+  /// \param D Current task directive.
+  /// \param TaskFunction An LLVM function with type void (*)(i32 /*gtid*/, i32
+  /// /*part_id*/, captured_struct */*__context*/);
+  /// \param SharedsTy A type which contains references to the shared variables.
+  /// \param Shareds Context with the list of shared variables from the \p
+  /// TaskFunction.
+  /// \param Data Additional data for task generation like tiedness, final
+  /// state, list of privates etc.
+  TaskResultTy emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
+                            const OMPExecutableDirective &D,
+                            llvm::Value *TaskFunction, QualType SharedsTy,
+                            Address Shareds, const OMPTaskDataTy &Data);
+
 public:
   explicit CGOpenMPRuntime(CodeGenModule &CGM);
   virtual ~CGOpenMPRuntime() {}
   virtual void clear();
 
+  /// Emit code for the specified user defined reduction construct.
+  virtual void emitUserDefinedReduction(CodeGenFunction *CGF,
+                                        const OMPDeclareReductionDecl *D);
+  /// Get combiner/initializer for the specified user-defined reduction, if any.
+  virtual std::pair<llvm::Function *, llvm::Function *>
+  getUserDefinedReduction(const OMPDeclareReductionDecl *D);
   /// \brief Emits outlined function for the specified OpenMP parallel directive
   /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
   /// kmp_int32 BoundID, struct context_vars*).
@@ -562,22 +511,30 @@
   /// \param InnermostKind Kind of innermost directive (for simple directives it
   /// is a directive itself, for combined - its innermost directive).
   /// \param CodeGen Code generation sequence for the \a D directive.
-  virtual llvm::Value *emitParallelOutlinedFunction(
+  virtual llvm::Value *emitParallelOrTeamsOutlinedFunction(
       const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
       OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen);
 
   /// \brief Emits outlined function for the OpenMP task directive \a D. This
-  /// outlined function has type void(*)(kmp_int32 ThreadID, kmp_int32
-  /// PartID, struct context_vars*).
+  /// outlined function has type void(*)(kmp_int32 ThreadID, struct task_t*
+  /// TaskT).
   /// \param D OpenMP directive.
   /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param PartIDVar Variable for partition id in the current OpenMP untied
+  /// task region.
+  /// \param TaskTVar Variable for task_t argument.
   /// \param InnermostKind Kind of innermost directive (for simple directives it
   /// is a directive itself, for combined - its innermost directive).
   /// \param CodeGen Code generation sequence for the \a D directive.
+  /// \param Tied true if task is generated for tied task, false otherwise.
+  /// \param NumberOfParts Number of parts in untied task. Ignored for tied
+  /// tasks.
   ///
   virtual llvm::Value *emitTaskOutlinedFunction(
       const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
-      OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen);
+      const VarDecl *PartIDVar, const VarDecl *TaskTVar,
+      OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen,
+      bool Tied, unsigned &NumberOfParts);
 
   /// \brief Cleans up references to the objects in finished function.
   ///
@@ -664,6 +621,14 @@
   virtual bool isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
                                   bool Chunked) const;
 
+  /// \brief Check if the specified \a ScheduleKind is static non-chunked.
+  /// This kind of distribute directive is emitted without outer loop.
+  /// \param ScheduleKind Schedule kind specified in the 'dist_schedule' clause.
+  /// \param Chunked True if chunk is specified in the clause.
+  ///
+  virtual bool isStaticNonchunked(OpenMPDistScheduleClauseKind ScheduleKind,
+                                  bool Chunked) const;
+
   /// \brief Check if the specified \a ScheduleKind is dynamic.
   /// This kind of worksharing directive is emitted without outer loop.
   /// \param ScheduleKind Schedule Kind specified in the 'schedule' clause.
@@ -671,9 +636,9 @@
   virtual bool isDynamic(OpenMPScheduleClauseKind ScheduleKind) const;
 
   virtual void emitForDispatchInit(CodeGenFunction &CGF, SourceLocation Loc,
-                                   OpenMPScheduleClauseKind SchedKind,
-                                   unsigned IVSize, bool IVSigned,
-                                   bool Ordered, llvm::Value *UB,
+                                   const OpenMPScheduleTy &ScheduleKind,
+                                   unsigned IVSize, bool IVSigned, bool Ordered,
+                                   llvm::Value *UB,
                                    llvm::Value *Chunk = nullptr);
 
   /// \brief Call the appropriate runtime routine to initialize it before start
@@ -685,7 +650,7 @@
   ///
   /// \param CGF Reference to current CodeGenFunction.
   /// \param Loc Clang source location.
-  /// \param SchedKind Schedule kind, specified by the 'schedule' clause.
+  /// \param ScheduleKind Schedule kind, specified by the 'schedule' clause.
   /// \param IVSize Size of the iteration variable in bits.
   /// \param IVSigned Sign of the interation variable.
   /// \param Ordered true if loop is ordered, false otherwise.
@@ -701,12 +666,36 @@
   /// For the default (nullptr) value, the chunk 1 will be used.
   ///
   virtual void emitForStaticInit(CodeGenFunction &CGF, SourceLocation Loc,
-                                 OpenMPScheduleClauseKind SchedKind,
+                                 const OpenMPScheduleTy &ScheduleKind,
                                  unsigned IVSize, bool IVSigned, bool Ordered,
-                                 Address IL, Address LB,
-                                 Address UB, Address ST,
+                                 Address IL, Address LB, Address UB, Address ST,
                                  llvm::Value *Chunk = nullptr);
 
+  ///
+  /// \param CGF Reference to current CodeGenFunction.
+  /// \param Loc Clang source location.
+  /// \param SchedKind Schedule kind, specified by the 'dist_schedule' clause.
+  /// \param IVSize Size of the iteration variable in bits.
+  /// \param IVSigned Sign of the iteration variable.
+  /// \param Ordered true if loop is ordered, false otherwise.
+  /// \param IL Address of the output variable in which the flag of the
+  /// last iteration is returned.
+  /// \param LB Address of the output variable in which the lower iteration
+  /// number is returned.
+  /// \param UB Address of the output variable in which the upper iteration
+  /// number is returned.
+  /// \param ST Address of the output variable in which the stride value is
+  /// returned, necessary to generate the static_chunked scheduled loop.
+  /// \param Chunk Value of the chunk for the static_chunked scheduled loop.
+  /// For the default (nullptr) value, the chunk 1 will be used.
+  ///
+  virtual void emitDistributeStaticInit(CodeGenFunction &CGF, SourceLocation Loc,
+                                        OpenMPDistScheduleClauseKind SchedKind,
+                                        unsigned IVSize, bool IVSigned,
+                                        bool Ordered, Address IL, Address LB,
+                                        Address UB, Address ST,
+                                        llvm::Value *Chunk = nullptr);
+
   /// \brief Call the appropriate runtime routine to notify that we finished
   /// iteration of the ordered loop with the dynamic scheduling.
   ///
@@ -807,12 +796,6 @@
   /// kmp_task_t *new_task), where new_task is a resulting structure from
   /// previous items.
   /// \param D Current task directive.
-  /// \param Tied true if the task is tied (the task is tied to the thread that
-  /// can suspend its task region), false - untied (the task is not tied to any
-  /// thread).
-  /// \param Final Contains either constant bool value, or llvm::Value * of i1
-  /// type for final clause. If the value is true, the task forces all of its
-  /// child tasks to become final and included tasks.
   /// \param TaskFunction An LLVM function with type void (*)(i32 /*gtid*/, i32
   /// /*part_id*/, captured_struct */*__context*/);
   /// \param SharedsTy A type which contains references the shared variables.
@@ -820,29 +803,47 @@
   /// TaskFunction.
   /// \param IfCond Not a nullptr if 'if' clause was specified, nullptr
   /// otherwise.
-  /// \param PrivateVars List of references to private variables for the task
-  /// directive.
-  /// \param PrivateCopies List of private copies for each private variable in
-  /// \p PrivateVars.
-  /// \param FirstprivateVars List of references to private variables for the
-  /// task directive.
-  /// \param FirstprivateCopies List of private copies for each private variable
-  /// in \p FirstprivateVars.
-  /// \param FirstprivateInits List of references to auto generated variables
-  /// used for initialization of a single array element. Used if firstprivate
-  /// variable is of array type.
-  /// \param Dependences List of dependences for the 'task' construct, including
-  /// original expression and dependency type.
-  virtual void emitTaskCall(
-      CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
-      bool Tied, llvm::PointerIntPair<llvm::Value *, 1, bool> Final,
+  /// \param Data Additional data for task generation like tiedness, final
+  /// state, list of privates etc.
+  virtual void emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
+                            const OMPExecutableDirective &D,
+                            llvm::Value *TaskFunction, QualType SharedsTy,
+                            Address Shareds, const Expr *IfCond,
+                            const OMPTaskDataTy &Data);
+
+  /// Emit task region for the taskloop directive. The taskloop region is
+  /// emitted in several steps:
+  /// 1. Emit a call to kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32
+  /// gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+  /// kmp_routine_entry_t *task_entry). Here task_entry is a pointer to the
+  /// function:
+  /// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) {
+  ///   TaskFunction(gtid, tt->part_id, tt->shareds);
+  ///   return 0;
+  /// }
+  /// 2. Copy a list of shared variables to field shareds of the resulting
+  /// structure kmp_task_t returned by the previous call (if any).
+  /// 3. Copy a pointer to destructions function to field destructions of the
+  /// resulting structure kmp_task_t.
+  /// 4. Emit a call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t
+  /// *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int
+  /// nogroup, int sched, kmp_uint64 grainsize, void *task_dup ), where task
+  /// is a resulting structure from
+  /// previous items.
+  /// \param D Current task directive.
+  /// \param TaskFunction An LLVM function with type void (*)(i32 /*gtid*/, i32
+  /// /*part_id*/, captured_struct */*__context*/);
+  /// \param SharedsTy A type which contains references to the shared variables.
+  /// \param Shareds Context with the list of shared variables from the \p
+  /// TaskFunction.
+  /// \param IfCond Not a nullptr if 'if' clause was specified, nullptr
+  /// otherwise.
+  /// \param Data Additional data for task generation like tiedness, final
+  /// state, list of privates etc.
+  virtual void emitTaskLoopCall(
+      CodeGenFunction &CGF, SourceLocation Loc, const OMPLoopDirective &D,
       llvm::Value *TaskFunction, QualType SharedsTy, Address Shareds,
-      const Expr *IfCond, ArrayRef<const Expr *> PrivateVars,
-      ArrayRef<const Expr *> PrivateCopies,
-      ArrayRef<const Expr *> FirstprivateVars,
-      ArrayRef<const Expr *> FirstprivateCopies,
-      ArrayRef<const Expr *> FirstprivateInits,
-      ArrayRef<std::pair<OpenMPDependClauseKind, const Expr *>> Dependences);
+      const Expr *IfCond, const OMPTaskDataTy &Data);
 
   /// \brief Emit code for the directive that does not require outlining.
   ///
@@ -926,13 +927,15 @@
   /// \param OutlinedFn Outlined function value to be defined by this call.
   /// \param OutlinedFnID Outlined function ID value to be defined by this call.
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Code generation sequence for the \a D directive.
   /// An oulined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
   virtual void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
-                                          bool IsOffloadEntry);
+                                          bool IsOffloadEntry,
+                                          const RegionCodeGenTy &CodeGen);
 
   /// \brief Emit the target offloading code associated with \a D. The emitted
   /// code attempts offloading the execution to the device, an the event of
@@ -972,6 +975,110 @@
   /// was emitted in the current module and return the function that registers
   /// it.
   virtual llvm::Function *emitRegistrationFunction();
+
+  /// \brief Emits code for teams call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedVars.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  virtual void emitTeamsCall(CodeGenFunction &CGF,
+                             const OMPExecutableDirective &D,
+                             SourceLocation Loc, llvm::Value *OutlinedFn,
+                             ArrayRef<llvm::Value *> CapturedVars);
+
+  /// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32
+  /// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code
+  /// for num_teams clause.
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                                  const Expr *ThreadLimit, SourceLocation Loc);
+
+  /// Struct that keeps all the relevant information that should be kept
+  /// throughout a 'target data' region.
+  class TargetDataInfo {
+    /// Set to true if device pointer information has to be obtained.
+    bool RequiresDevicePointerInfo = false;
+
+  public:
+    /// The array of base pointer passed to the runtime library.
+    llvm::Value *BasePointersArray = nullptr;
+    /// The array of section pointers passed to the runtime library.
+    llvm::Value *PointersArray = nullptr;
+    /// The array of sizes passed to the runtime library.
+    llvm::Value *SizesArray = nullptr;
+    /// The array of map types passed to the runtime library.
+    llvm::Value *MapTypesArray = nullptr;
+    /// The total number of pointers passed to the runtime library.
+    unsigned NumberOfPtrs = 0u;
+    /// Map between the declaration of a capture and the corresponding base
+    /// pointer address where the runtime returns the device pointers.
+    llvm::DenseMap<const ValueDecl *, Address> CaptureDeviceAddrMap;
+
+    explicit TargetDataInfo() {}
+    explicit TargetDataInfo(bool RequiresDevicePointerInfo)
+        : RequiresDevicePointerInfo(RequiresDevicePointerInfo) {}
+    /// Clear information about the data arrays.
+    void clearArrayInfo() {
+      BasePointersArray = nullptr;
+      PointersArray = nullptr;
+      SizesArray = nullptr;
+      MapTypesArray = nullptr;
+      NumberOfPtrs = 0u;
+    }
+    /// Return true if the current target data information has valid arrays.
+    bool isValid() {
+      return BasePointersArray && PointersArray && SizesArray &&
+             MapTypesArray && NumberOfPtrs;
+    }
+    bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
+  };
+
+  /// \brief Emit the target data mapping code associated with \a D.
+  /// \param D Directive to emit.
+  /// \param IfCond Expression evaluated in if clause associated with the
+  /// target directive, or null if no if clause is used.
+  /// \param Device Expression evaluated in device clause associated with the
+  /// target directive, or null if no device clause is used.
+  /// \param Info A record used to store information that needs to be preserved
+  /// until the region is closed.
+  virtual void emitTargetDataCalls(CodeGenFunction &CGF,
+                                   const OMPExecutableDirective &D,
+                                   const Expr *IfCond, const Expr *Device,
+                                   const RegionCodeGenTy &CodeGen,
+                                   TargetDataInfo &Info);
+
+  /// \brief Emit the data mapping/movement code associated with the directive
+  /// \a D that should be of the form 'target [{enter|exit} data | update]'.
+  /// \param D Directive to emit.
+  /// \param IfCond Expression evaluated in if clause associated with the target
+  /// directive, or null if no if clause is used.
+  /// \param Device Expression evaluated in device clause associated with the
+  /// target directive, or null if no device clause is used.
+  virtual void emitTargetDataStandAloneCall(CodeGenFunction &CGF,
+                                            const OMPExecutableDirective &D,
+                                            const Expr *IfCond,
+                                            const Expr *Device);
+
+  /// Marks function \a Fn with properly mangled versions of vector functions.
+  /// \param FD Function marked as 'declare simd'.
+  /// \param Fn LLVM function that must be marked with 'declare simd'
+  /// attributes.
+  virtual void emitDeclareSimdFunction(const FunctionDecl *FD,
+                                       llvm::Function *Fn);
+
+  /// Emit initialization for doacross loop nesting support.
+  /// \param D Loop-based construct used in doacross nesting construct.
+  virtual void emitDoacrossInit(CodeGenFunction &CGF,
+                                const OMPLoopDirective &D);
+
+  /// Emit code for doacross ordered directive with 'depend' clause.
+  /// \param C 'depend' clause with 'sink|source' dependency kind.
+  virtual void emitDoacrossOrdered(CodeGenFunction &CGF,
+                                   const OMPDependClause *C);
 };
 
 } // namespace CodeGen
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
new file mode 100644
index 0000000..d64f6df
--- /dev/null
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -0,0 +1,396 @@
+//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to NVPTX
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeNVPTX.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+/// \brief Get the GPU warp size.
+llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateCall(
+      llvm::Intrinsic::getDeclaration(
+          &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
+      llvm::None, "nvptx_warp_size");
+}
+
+/// \brief Get the id of the current thread on the GPU.
+llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateCall(
+      llvm::Intrinsic::getDeclaration(
+          &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
+      llvm::None, "nvptx_tid");
+}
+
+/// \brief Get the maximum number of threads in a block of the GPU.
+llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateCall(
+      llvm::Intrinsic::getDeclaration(
+          &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
+      llvm::None, "nvptx_num_threads");
+}
+
+/// \brief Get barrier to synchronize all threads in a block.
+void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
+      &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
+}
+
+/// \brief Synchronize all GPU threads in a block.
+void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
+  getNVPTXCTABarrier(CGF);
+}
+
+/// \brief Get the thread id of the OMP master thread.
+/// The master thread id is the first thread (lane) of the last warp in the
+/// GPU block.  Warp size is assumed to be some power of 2.
+/// Thread id is 0 indexed.
+/// E.g: If NumThreads is 33, master id is 32.
+///      If NumThreads is 64, master id is 32.
+///      If NumThreads is 1024, master id is 992.
+llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
+
+  // We assume that the warp size is a power of 2.
+  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
+
+  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
+                       Bld.CreateNot(Mask), "master_tid");
+}
+
+namespace {
+enum OpenMPRTLFunctionNVPTX {
+  /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle,
+  /// kmp_int32 thread_limit);
+  OMPRTL_NVPTX__kmpc_kernel_init,
+};
+
+// NVPTX Address space
+enum ADDRESS_SPACE {
+  ADDRESS_SPACE_SHARED = 3,
+};
+} // namespace
+
+CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
+    CodeGenModule &CGM)
+    : WorkerFn(nullptr), CGFI(nullptr) {
+  createWorkerFunction(CGM);
+}
+
+void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
+    CodeGenModule &CGM) {
+  // Create a worker function with no arguments.
+  CGFI = &CGM.getTypes().arrangeNullaryFunction();
+
+  WorkerFn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
+      /* placeholder */ "_worker", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
+  WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  WorkerFn->addFnAttr(llvm::Attribute::NoInline);
+}
+
+void CGOpenMPRuntimeNVPTX::initializeEnvironment() {
+  //
+  // Initialize master-worker control state in shared memory.
+  //
+
+  auto DL = CGM.getDataLayout();
+  ActiveWorkers = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false,
+      llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0,
+      llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
+  ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty));
+
+  WorkID = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false,
+      llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0,
+      llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
+  WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty));
+}
+
+void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
+  auto &Ctx = CGM.getContext();
+
+  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
+  emitWorkerLoop(CGF, WST);
+  CGF.FinishFunction();
+}
+
+void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
+                                          WorkerFunctionState &WST) {
+  //
+  // The workers enter this loop and wait for parallel work from the master.
+  // When the master encounters a parallel region it sets up the work + variable
+  // arguments, and wakes up the workers.  The workers first check to see if
+  // they are required for the parallel region, i.e., within the # of requested
+  // parallel threads.  The activated workers load the variable arguments and
+  // execute the parallel work.
+  //
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
+  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
+  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
+  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
+  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+
+  CGF.EmitBranch(AwaitBB);
+
+  // Workers wait for work from master.
+  CGF.EmitBlock(AwaitBB);
+  // Wait for parallel work
+  syncCTAThreads(CGF);
+  // On termination condition (workid == 0), exit loop.
+  llvm::Value *ShouldTerminate = Bld.CreateICmpEQ(
+      Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()),
+      llvm::Constant::getNullValue(WorkID->getType()->getElementType()),
+      "should_terminate");
+  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
+
+  // Activate requested workers.
+  CGF.EmitBlock(SelectWorkersBB);
+  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ActiveThread = Bld.CreateICmpSLT(
+      ThreadID,
+      Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()),
+      "active_thread");
+  Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB);
+
+  // Signal start of parallel region.
+  CGF.EmitBlock(ExecuteBB);
+  // TODO: Add parallel work.
+
+  // Signal end of parallel region.
+  CGF.EmitBlock(TerminateBB);
+  CGF.EmitBranch(BarrierBB);
+
+  // All active and inactive workers wait at a barrier after parallel region.
+  CGF.EmitBlock(BarrierBB);
+  // Barrier after parallel region.
+  syncCTAThreads(CGF);
+  CGF.EmitBranch(AwaitBB);
+
+  // Exit target region.
+  CGF.EmitBlock(ExitBB);
+}
+
+// Setup NVPTX threads for master-worker OpenMP scheme.
+void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF,
+                                           EntryFunctionState &EST,
+                                           WorkerFunctionState &WST) {
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Get the master thread id.
+  llvm::Value *MasterID = getMasterThreadID(CGF);
+  // Current thread's identifier.
+  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+
+  // Setup BBs in entry function.
+  llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker");
+  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
+  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
+  EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  // The head (master thread) marches on while its body of companion threads in
+  // the warp go to sleep.
+  llvm::Value *ShouldDie =
+      Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp");
+  Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB);
+
+  // Select worker threads...
+  CGF.EmitBlock(WorkerCheckBB);
+  llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker");
+  Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB);
+
+  // ... and send to worker loop, awaiting parallel invocation.
+  CGF.EmitBlock(WorkerBB);
+  CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
+  CGF.EmitBranch(EST.ExitBB);
+
+  // Only master thread executes subsequent serial code.
+  CGF.EmitBlock(MasterBB);
+
+  // First action in sequential region:
+  // Initialize the state of the OpenMP runtime library on the GPU.
+  llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)};
+  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init),
+                      Args);
+}
+
+void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF,
+                                           EntryFunctionState &EST) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
+  CGF.EmitBranch(TerminateBB);
+
+  CGF.EmitBlock(TerminateBB);
+  // Signal termination condition.
+  Bld.CreateAlignedStore(
+      llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID,
+      WorkID->getAlignment());
+  // Barrier to terminate worker threads.
+  syncCTAThreads(CGF);
+  // Master thread jumps to exit point.
+  CGF.EmitBranch(EST.ExitBB);
+
+  CGF.EmitBlock(EST.ExitBB);
+}
+
+/// \brief Returns specified OpenMP runtime function for the current OpenMP
+/// implementation.  Specialized for the NVPTX device.
+/// \param Function OpenMP runtime function.
+/// \return Specified function.
+llvm::Constant *
+CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
+  llvm::Constant *RTLFn = nullptr;
+  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
+  case OMPRTL_NVPTX__kmpc_kernel_init: {
+    // Build void __kmpc_kernel_init(kmp_int32 omp_handle,
+    // kmp_int32 thread_limit);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
+    break;
+  }
+  }
+  return RTLFn;
+}
+
+void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
+                                              llvm::Constant *Addr,
+                                              uint64_t Size) {
+  auto *F = dyn_cast<llvm::Function>(Addr);
+  // TODO: Add support for global variables on the device after declare target
+  // support.
+  if (!F)
+    return;
+  llvm::Module *M = F->getParent();
+  llvm::LLVMContext &Ctx = M->getContext();
+
+  // Get "nvvm.annotations" metadata node
+  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+  llvm::Metadata *MDVals[] = {
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
+      llvm::ConstantAsMetadata::get(
+          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+  // Append metadata to nvvm.annotations
+  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  if (!IsOffloadEntry) // Nothing to do.
+    return;
+
+  assert(!ParentName.empty() && "Invalid target region parent name!");
+
+  EntryFunctionState EST;
+  WorkerFunctionState WST(CGM);
+
+  // Emit target region as a standalone region.
+  class NVPTXPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeNVPTX &RT;
+    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
+
+  public:
+    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
+                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
+        : RT(RT), EST(EST), WST(WST) {}
+    void Enter(CodeGenFunction &CGF) override {
+      RT.emitEntryHeader(CGF, EST, WST);
+    }
+    void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); }
+  } Action(*this, EST, WST);
+  CodeGen.setAction(Action);
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   IsOffloadEntry, CodeGen);
+
+  // Create the worker function
+  emitWorkerFunction(WST);
+
+  // Now change the name of the worker function to correspond to this target
+  // region's entry function.
+  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
+}
+
+CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
+    : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // Called once per module during initialization.
+  initializeEnvironment();
+}
+
+void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
+                                              const Expr *NumTeams,
+                                              const Expr *ThreadLimit,
+                                              SourceLocation Loc) {}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  llvm::Function *OutlinedFun = nullptr;
+  if (isa<OMPTeamsDirective>(D)) {
+    llvm::Value *OutlinedFunVal =
+        CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
+            D, ThreadIDVar, InnermostKind, CodeGen);
+    OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
+    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
+  } else
+    llvm_unreachable("parallel directive is not yet supported for nvptx "
+                     "backend.");
+
+  return OutlinedFun;
+}
+
+void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
+                                         const OMPExecutableDirective &D,
+                                         SourceLocation Loc,
+                                         llvm::Value *OutlinedFn,
+                                         ArrayRef<llvm::Value *> CapturedVars) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  Address ZeroAddr =
+      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
+                           /*Name*/ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
new file mode 100644
index 0000000..a6c64b2
--- /dev/null
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -0,0 +1,179 @@
+//===----- CGOpenMPRuntimeNVPTX.h - Interface to OpenMP NVPTX Runtimes ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to NVPTX
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+
+#include "CGOpenMPRuntime.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "llvm/IR/CallSite.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
+public:
+  class EntryFunctionState {
+  public:
+    /// Exit block of the target entry function; created by emitEntryHeader.
+    llvm::BasicBlock *ExitBB;
+    EntryFunctionState() : ExitBB(nullptr) {}
+  };
+
+  class WorkerFunctionState {
+  public:
+    llvm::Function *WorkerFn;
+    const CGFunctionInfo *CGFI;
+
+    WorkerFunctionState(CodeGenModule &CGM);
+
+  private:
+    void createWorkerFunction(CodeGenModule &CGM);
+  };
+
+  /// \brief Helper for target entry function. Guide the master and worker
+  /// threads to their respective locations.
+  void emitEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+                       WorkerFunctionState &WST);
+
+  /// \brief Signal termination of OMP execution.
+  void emitEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+
+private:
+  //
+  // NVPTX calls.
+  //
+
+  /// \brief Get the GPU warp size.
+  llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF);
+
+  /// \brief Get the id of the current thread on the GPU.
+  llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF);
+
+  /// \brief Get the maximum number of threads in a block of the GPU.
+  llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF);
+
+  /// \brief Get barrier to synchronize all threads in a block.
+  void getNVPTXCTABarrier(CodeGenFunction &CGF);
+
+  /// \brief Synchronize all GPU threads in a block.
+  void syncCTAThreads(CodeGenFunction &CGF);
+
+  //
+  // OMP calls.
+  //
+
+  /// \brief Get the thread id of the OMP master thread.
+  /// The master thread id is the first thread (lane) of the last warp in the
+  /// GPU block.  Warp size is assumed to be some power of 2.
+  /// Thread id is 0 indexed.
+  /// E.g: If NumThreads is 33, master id is 32.
+  ///      If NumThreads is 64, master id is 32.
+  ///      If NumThreads is 1024, master id is 992.
+  llvm::Value *getMasterThreadID(CodeGenFunction &CGF);
+
+  //
+  // Private state and methods.
+  //
+
+  // Master-worker control state.
+  // Number of requested OMP threads in parallel region.
+  llvm::GlobalVariable *ActiveWorkers;
+  // Outlined function for the workers to execute.
+  llvm::GlobalVariable *WorkID;
+
+  /// \brief Initialize master-worker control state.
+  void initializeEnvironment();
+
+  /// \brief Emit the worker function for the current target region.
+  void emitWorkerFunction(WorkerFunctionState &WST);
+
+  /// \brief Helper for worker function. Emit body of worker loop.
+  void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
+
+  /// \brief Returns specified OpenMP runtime function for the current OpenMP
+  /// implementation.  Specialized for the NVPTX device.
+  /// \param Function OpenMP runtime function.
+  /// \return Specified function.
+  llvm::Constant *createNVPTXRuntimeFunction(unsigned Function);
+
+  //
+  // Base class overrides.
+  //
+
+  /// \brief Creates offloading entry for the provided entry ID \a ID,
+  /// address \a Addr and size \a Size.
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size) override;
+
+  /// \brief Emit outlined function for 'target' directive on the NVPTX
+  /// device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
+                                  StringRef ParentName,
+                                  llvm::Function *&OutlinedFn,
+                                  llvm::Constant *&OutlinedFnID,
+                                  bool IsOffloadEntry,
+                                  const RegionCodeGenTy &CodeGen) override;
+
+public:
+  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+
+  /// \brief This function ought to emit, in the general case, a call to the
+  /// OpenMP runtime kmpc_push_num_teams. In the NVPTX backend it is not needed
+  /// as these numbers are obtained from the PTX grid and block configuration.
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                          const Expr *ThreadLimit, SourceLocation Loc) override;
+
+  /// \brief Emits outlined function for the specified OpenMP parallel
+  /// directive \a D, but an inlined function for teams.
+  /// This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Value *
+  emitParallelOrTeamsOutlinedFunction(const OMPExecutableDirective &D,
+                                      const VarDecl *ThreadIDVar,
+                                      OpenMPDirectiveKind InnermostKind,
+                                      const RegionCodeGenTy &CodeGen) override;
+
+  /// \brief Emits code for teams call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedVars.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
+                     SourceLocation Loc, llvm::Value *OutlinedFn,
+                     ArrayRef<llvm::Value *> CapturedVars) override;
+};
+
+} // CodeGen namespace.
+} // clang namespace.
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
diff --git a/lib/CodeGen/CGRecordLayout.h b/lib/CodeGen/CGRecordLayout.h
index d4ad33e..7b9c27d 100644
--- a/lib/CodeGen/CGRecordLayout.h
+++ b/lib/CodeGen/CGRecordLayout.h
@@ -11,7 +11,7 @@
 #define LLVM_CLANG_LIB_CODEGEN_CGRECORDLAYOUT_H
 
 #include "clang/AST/CharUnits.h"
-#include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/DerivedTypes.h"
diff --git a/lib/CodeGen/CGStmt.cpp b/lib/CodeGen/CGStmt.cpp
index a1e44c8..aa457ad 100644
--- a/lib/CodeGen/CGStmt.cpp
+++ b/lib/CodeGen/CGStmt.cpp
@@ -256,15 +256,51 @@
   case Stmt::OMPTargetDataDirectiveClass:
     EmitOMPTargetDataDirective(cast<OMPTargetDataDirective>(*S));
     break;
+  case Stmt::OMPTargetEnterDataDirectiveClass:
+    EmitOMPTargetEnterDataDirective(cast<OMPTargetEnterDataDirective>(*S));
+    break;
+  case Stmt::OMPTargetExitDataDirectiveClass:
+    EmitOMPTargetExitDataDirective(cast<OMPTargetExitDataDirective>(*S));
+    break;
+  case Stmt::OMPTargetParallelDirectiveClass:
+    EmitOMPTargetParallelDirective(cast<OMPTargetParallelDirective>(*S));
+    break;
+  case Stmt::OMPTargetParallelForDirectiveClass:
+    EmitOMPTargetParallelForDirective(cast<OMPTargetParallelForDirective>(*S));
+    break;
   case Stmt::OMPTaskLoopDirectiveClass:
     EmitOMPTaskLoopDirective(cast<OMPTaskLoopDirective>(*S));
     break;
   case Stmt::OMPTaskLoopSimdDirectiveClass:
     EmitOMPTaskLoopSimdDirective(cast<OMPTaskLoopSimdDirective>(*S));
     break;
-case Stmt::OMPDistributeDirectiveClass:
+  case Stmt::OMPDistributeDirectiveClass:
     EmitOMPDistributeDirective(cast<OMPDistributeDirective>(*S));
-	break;
+    break;
+  case Stmt::OMPTargetUpdateDirectiveClass:
+    EmitOMPTargetUpdateDirective(cast<OMPTargetUpdateDirective>(*S));
+    break;
+  case Stmt::OMPDistributeParallelForDirectiveClass:
+    EmitOMPDistributeParallelForDirective(
+        cast<OMPDistributeParallelForDirective>(*S));
+    break;
+  case Stmt::OMPDistributeParallelForSimdDirectiveClass:
+    EmitOMPDistributeParallelForSimdDirective(
+        cast<OMPDistributeParallelForSimdDirective>(*S));
+    break;
+  case Stmt::OMPDistributeSimdDirectiveClass:
+    EmitOMPDistributeSimdDirective(cast<OMPDistributeSimdDirective>(*S));
+    break;
+  case Stmt::OMPTargetParallelForSimdDirectiveClass:
+    EmitOMPTargetParallelForSimdDirective(
+        cast<OMPTargetParallelForSimdDirective>(*S));
+    break;
+  case Stmt::OMPTargetSimdDirectiveClass:
+    EmitOMPTargetSimdDirective(cast<OMPTargetSimdDirective>(*S));
+    break;
+  case Stmt::OMPTeamsDistributeDirectiveClass:
+    EmitOMPTeamsDistributeDirective(cast<OMPTeamsDistributeDirective>(*S));
+    break;
   }
 }
 
@@ -542,13 +578,17 @@
   // unequal to 0.  The condition must be a scalar type.
   LexicalScope ConditionScope(*this, S.getCond()->getSourceRange());
 
+  if (S.getInit())
+    EmitStmt(S.getInit());
+
   if (S.getConditionVariable())
     EmitAutoVarDecl(*S.getConditionVariable());
 
   // If the condition constant folds and can be elided, try to avoid emitting
   // the condition and the dead arm of the if/else.
   bool CondConstant;
-  if (ConstantFoldsToSimpleInteger(S.getCond(), CondConstant)) {
+  if (ConstantFoldsToSimpleInteger(S.getCond(), CondConstant,
+                                   S.isConstexpr())) {
     // Figure out which block (then or else) is executed.
     const Stmt *Executed = S.getThen();
     const Stmt *Skipped  = S.getElse();
@@ -557,7 +597,7 @@
 
     // If the skipped block has no labels in it, just emit the executed block.
     // This avoids emitting dead code and simplifies the CFG substantially.
-    if (!ContainsLabel(Skipped)) {
+    if (S.isConstexpr() || !ContainsLabel(Skipped)) {
       if (CondConstant)
         incrementProfileCounter(&S);
       if (Executed) {
@@ -617,7 +657,8 @@
   JumpDest LoopHeader = getJumpDestInCurrentScope("while.cond");
   EmitBlock(LoopHeader.getBlock());
 
-  LoopStack.push(LoopHeader.getBlock(), CGM.getContext(), WhileAttrs);
+  LoopStack.push(LoopHeader.getBlock(), CGM.getContext(), WhileAttrs,
+                 Builder.getCurrentDebugLocation());
 
   // Create an exit block for when the condition fails, which will
   // also become the break target.
@@ -708,7 +749,8 @@
   // Emit the body of the loop.
   llvm::BasicBlock *LoopBody = createBasicBlock("do.body");
 
-  LoopStack.push(LoopBody, CGM.getContext(), DoAttrs);
+  LoopStack.push(LoopBody, CGM.getContext(), DoAttrs,
+                 Builder.getCurrentDebugLocation());
 
   EmitBlockWithFallThrough(LoopBody, &S);
   {
@@ -760,6 +802,8 @@
 
   LexicalScope ForScope(*this, S.getSourceRange());
 
+  llvm::DebugLoc DL = Builder.getCurrentDebugLocation();
+
   // Evaluate the first part before the loop.
   if (S.getInit())
     EmitStmt(S.getInit());
@@ -771,7 +815,7 @@
   llvm::BasicBlock *CondBlock = Continue.getBlock();
   EmitBlock(CondBlock);
 
-  LoopStack.push(CondBlock, CGM.getContext(), ForAttrs);
+  LoopStack.push(CondBlock, CGM.getContext(), ForAttrs, DL);
 
   // If the for loop doesn't have an increment we can just use the
   // condition as the continue block.  Otherwise we'll need to create
@@ -856,9 +900,12 @@
 
   LexicalScope ForScope(*this, S.getSourceRange());
 
+  llvm::DebugLoc DL = Builder.getCurrentDebugLocation();
+
   // Evaluate the first pieces before the loop.
   EmitStmt(S.getRangeStmt());
-  EmitStmt(S.getBeginEndStmt());
+  EmitStmt(S.getBeginStmt());
+  EmitStmt(S.getEndStmt());
 
   // Start the loop with a block that tests the condition.
   // If there's an increment, the continue scope will be overwritten
@@ -866,7 +913,7 @@
   llvm::BasicBlock *CondBlock = createBasicBlock("for.cond");
   EmitBlock(CondBlock);
 
-  LoopStack.push(CondBlock, CGM.getContext(), ForAttrs);
+  LoopStack.push(CondBlock, CGM.getContext(), ForAttrs, DL);
 
   // If there are any cleanups between here and the loop-exit scope,
   // create a block to stage a loop exit along.
@@ -1147,7 +1194,7 @@
   // If the body of the case is just a 'break', try to not emit an empty block.
   // If we're profiling or we're not optimizing, leave the block in for better
   // debug and coverage analysis.
-  if (!CGM.getCodeGenOpts().ProfileInstrGenerate &&
+  if (!CGM.getCodeGenOpts().hasProfileClangInstr() &&
       CGM.getCodeGenOpts().OptimizationLevel > 0 &&
       isa<BreakStmt>(S.getSubStmt())) {
     JumpDest Block = BreakContinueStack.back().BreakBlock;
@@ -1194,7 +1241,7 @@
 
     if (SwitchWeights)
       SwitchWeights->push_back(getProfileCount(NextCase));
-    if (CGM.getCodeGenOpts().ProfileInstrGenerate) {
+    if (CGM.getCodeGenOpts().hasProfileClangInstr()) {
       CaseDest = createBasicBlock("sw.bb");
       EmitBlockWithFallThrough(CaseDest, &S);
     }
@@ -1208,6 +1255,14 @@
 }
 
 void CodeGenFunction::EmitDefaultStmt(const DefaultStmt &S) {
+  // If there is no enclosing switch instance that we're aware of, then this
+  // default statement can be elided. This situation only happens when we've
+  // constant-folded the switch.
+  if (!SwitchInsn) {
+    EmitStmt(S.getSubStmt());
+    return;
+  }
+
   llvm::BasicBlock *DefaultBlock = SwitchInsn->getDefaultDest();
   assert(DefaultBlock->empty() &&
          "EmitDefaultStmt: Default block already defined?");
@@ -1438,6 +1493,9 @@
         incrementProfileCounter(Case);
       RunCleanupsScope ExecutedScope(*this);
 
+      if (S.getInit())
+        EmitStmt(S.getInit());
+
       // Emit the condition variable if needed inside the entire cleanup scope
       // used by this special case for constant folded switches.
       if (S.getConditionVariable())
@@ -1465,6 +1523,10 @@
   JumpDest SwitchExit = getJumpDestInCurrentScope("sw.epilog");
 
   RunCleanupsScope ConditionScope(*this);
+
+  if (S.getInit())
+    EmitStmt(S.getInit());
+
   if (S.getConditionVariable())
     EmitAutoVarDecl(*S.getConditionVariable());
   llvm::Value *CondV = EmitScalarExpr(S.getCond());
@@ -1537,16 +1599,13 @@
   // If the switch has a condition wrapped by __builtin_unpredictable,
   // create metadata that specifies that the switch is unpredictable.
   // Don't bother if not optimizing because that metadata would not be used.
-  if (CGM.getCodeGenOpts().OptimizationLevel != 0) {
-    if (const CallExpr *Call = dyn_cast<CallExpr>(S.getCond())) {
-      const Decl *TargetDecl = Call->getCalleeDecl();
-      if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
-        if (FD->getBuiltinID() == Builtin::BI__builtin_unpredictable) {
-          llvm::MDBuilder MDHelper(getLLVMContext());
-          SwitchInsn->setMetadata(llvm::LLVMContext::MD_unpredictable,
-                                  MDHelper.createUnpredictable());
-        }
-      }
+  auto *Call = dyn_cast<CallExpr>(S.getCond());
+  if (Call && CGM.getCodeGenOpts().OptimizationLevel != 0) {
+    auto *FD = dyn_cast_or_null<FunctionDecl>(Call->getCalleeDecl());
+    if (FD && FD->getBuiltinID() == Builtin::BI__builtin_unpredictable) {
+      llvm::MDBuilder MDHelper(getLLVMContext());
+      SwitchInsn->setMetadata(llvm::LLVMContext::MD_unpredictable,
+                              MDHelper.createUnpredictable());
     }
   }
 
@@ -2035,6 +2094,14 @@
                                           llvm::ConstantAsMetadata::get(Loc)));
   }
 
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
+    // Conservatively, mark all inline asm blocks in CUDA as convergent
+    // (meaning, they may call an intrinsically convergent op, such as bar.sync,
+    // and so can't have certain optimizations applied around them).
+    Result->addAttribute(llvm::AttributeSet::FunctionIndex,
+                         llvm::Attribute::Convergent);
+  }
+
   // Extract all of the register value results from the asm.
   std::vector<llvm::Value*> RegResults;
   if (ResultRegTypes.size() == 1) {
diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp
index 68bd68b..f2b9942 100644
--- a/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/lib/CodeGen/CGStmtOpenMP.cpp
@@ -11,15 +11,117 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CGCleanup.h"
 #include "CGOpenMPRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "llvm/IR/CallSite.h"
 using namespace clang;
 using namespace CodeGen;
 
+namespace {
+/// Lexical scope for OpenMP executable constructs that handles correct codegen
+/// for captured expressions.
+class OMPLexicalScope final : public CodeGenFunction::LexicalScope {
+  void emitPreInitStmt(CodeGenFunction &CGF, const OMPExecutableDirective &S) {
+    for (const auto *C : S.clauses()) {
+      if (auto *CPI = OMPClauseWithPreInit::get(C)) {
+        if (auto *PreInit = cast_or_null<DeclStmt>(CPI->getPreInitStmt())) {
+          for (const auto *I : PreInit->decls()) {
+            if (!I->hasAttr<OMPCaptureNoInitAttr>())
+              CGF.EmitVarDecl(cast<VarDecl>(*I));
+            else {
+              CodeGenFunction::AutoVarEmission Emission =
+                  CGF.EmitAutoVarAlloca(cast<VarDecl>(*I));
+              CGF.EmitAutoVarCleanups(Emission);
+            }
+          }
+        }
+      }
+    }
+  }
+  CodeGenFunction::OMPPrivateScope InlinedShareds;
+
+  static bool isCapturedVar(CodeGenFunction &CGF, const VarDecl *VD) {
+    return CGF.LambdaCaptureFields.lookup(VD) ||
+           (CGF.CapturedStmtInfo && CGF.CapturedStmtInfo->lookup(VD)) ||
+           (CGF.CurCodeDecl && isa<BlockDecl>(CGF.CurCodeDecl));
+  }
+
+public:
+  OMPLexicalScope(CodeGenFunction &CGF, const OMPExecutableDirective &S,
+                  bool AsInlined = false)
+      : CodeGenFunction::LexicalScope(CGF, S.getSourceRange()),
+        InlinedShareds(CGF) {
+    emitPreInitStmt(CGF, S);
+    if (AsInlined) {
+      if (S.hasAssociatedStmt()) {
+        auto *CS = cast<CapturedStmt>(S.getAssociatedStmt());
+        for (auto &C : CS->captures()) {
+          if (C.capturesVariable() || C.capturesVariableByCopy()) {
+            auto *VD = C.getCapturedVar();
+            DeclRefExpr DRE(const_cast<VarDecl *>(VD),
+                            isCapturedVar(CGF, VD) ||
+                                (CGF.CapturedStmtInfo &&
+                                 InlinedShareds.isGlobalVarCaptured(VD)),
+                            VD->getType().getNonReferenceType(), VK_LValue,
+                            SourceLocation());
+            InlinedShareds.addPrivate(VD, [&CGF, &DRE]() -> Address {
+              return CGF.EmitLValue(&DRE).getAddress();
+            });
+          }
+        }
+        (void)InlinedShareds.Privatize();
+      }
+    }
+  }
+};
+
+/// Private scope for OpenMP loop-based directives that supports capturing
+/// of expressions used in the loop statement.
+class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
+  void emitPreInitStmt(CodeGenFunction &CGF, const OMPLoopDirective &S) {
+    if (auto *LD = dyn_cast<OMPLoopDirective>(&S)) {
+      if (auto *PreInits = cast_or_null<DeclStmt>(LD->getPreInits())) {
+        for (const auto *I : PreInits->decls())
+          CGF.EmitVarDecl(cast<VarDecl>(*I));
+      }
+    }
+  }
+
+public:
+  OMPLoopScope(CodeGenFunction &CGF, const OMPLoopDirective &S)
+      : CodeGenFunction::RunCleanupsScope(CGF) {
+    emitPreInitStmt(CGF, S);
+  }
+};
+
+} // namespace
+
+llvm::Value *CodeGenFunction::getTypeSize(QualType Ty) {
+  auto &C = getContext();
+  llvm::Value *Size = nullptr;
+  auto SizeInChars = C.getTypeSizeInChars(Ty);
+  if (SizeInChars.isZero()) {
+    // getTypeSizeInChars() returns 0 for a VLA.
+    while (auto *VAT = C.getAsVariableArrayType(Ty)) {
+      llvm::Value *ArraySize;
+      std::tie(ArraySize, Ty) = getVLASize(VAT);
+      Size = Size ? Builder.CreateNUWMul(Size, ArraySize) : ArraySize;
+    }
+    SizeInChars = C.getTypeSizeInChars(Ty);
+    if (SizeInChars.isZero())
+      return llvm::ConstantInt::get(SizeTy, /*V=*/0);
+    Size = Builder.CreateNUWMul(Size, CGM.getSize(SizeInChars));
+  } else
+    Size = CGM.getSize(SizeInChars);
+  return Size;
+}
+
 void CodeGenFunction::GenerateOpenMPCapturedVars(
     const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars) {
   const RecordDecl *RD = S.getCapturedRecordDecl();
@@ -34,10 +136,33 @@
       CapturedVars.push_back(Val);
     } else if (CurCap->capturesThis())
       CapturedVars.push_back(CXXThisValue);
-    else if (CurCap->capturesVariableByCopy())
-      CapturedVars.push_back(
-          EmitLoadOfLValue(EmitLValue(*I), SourceLocation()).getScalarVal());
-    else {
+    else if (CurCap->capturesVariableByCopy()) {
+      llvm::Value *CV =
+          EmitLoadOfLValue(EmitLValue(*I), SourceLocation()).getScalarVal();
+
+      // If the field is not a pointer, store the value into a uintptr-sized
+      // temporary through a field-typed pointer and reload it as uintptr.
+      if (!CurField->getType()->isAnyPointerType()) {
+        auto &Ctx = getContext();
+        auto DstAddr = CreateMemTemp(
+            Ctx.getUIntPtrType(),
+            Twine(CurCap->getCapturedVar()->getName()) + ".casted");
+        LValue DstLV = MakeAddrLValue(DstAddr, Ctx.getUIntPtrType());
+
+        auto *SrcAddrVal = EmitScalarConversion(
+            DstAddr.getPointer(), Ctx.getPointerType(Ctx.getUIntPtrType()),
+            Ctx.getPointerType(CurField->getType()), SourceLocation());
+        LValue SrcLV =
+            MakeNaturalAlignAddrLValue(SrcAddrVal, CurField->getType());
+
+        // Store the value using the source type pointer.
+        EmitStoreThroughLValue(RValue::get(CV), SrcLV);
+
+        // Load the value using the destination type pointer.
+        CV = EmitLoadOfLValue(DstLV, SourceLocation()).getScalarVal();
+      }
+      CapturedVars.push_back(CV);
+    } else {
       assert(CurCap->capturesVariable() && "Expected capture by reference.");
       CapturedVars.push_back(EmitLValue(*I).getAddress().getPointer());
     }
@@ -139,8 +264,19 @@
     // If we are capturing a pointer by copy we don't need to do anything, just
     // use the value that we get from the arguments.
     if (I->capturesVariableByCopy() && FD->getType()->isAnyPointerType()) {
-      setAddrOfLocalVar(I->getCapturedVar(), GetAddrOfLocalVar(Args[Cnt]));
-      ++Cnt, ++I;
+      const VarDecl *CurVD = I->getCapturedVar();
+      Address LocalAddr = GetAddrOfLocalVar(Args[Cnt]);
+      // If the variable is a reference we need to materialize it here.
+      if (CurVD->getType()->isReferenceType()) {
+        Address RefAddr = CreateMemTemp(CurVD->getType(), getPointerAlign(),
+                                        ".materialized_ref");
+        EmitStoreOfScalar(LocalAddr.getPointer(), RefAddr, /*Volatile=*/false,
+                          CurVD->getType());
+        LocalAddr = RefAddr;
+      }
+      setAddrOfLocalVar(CurVD, LocalAddr);
+      ++Cnt;
+      ++I;
       continue;
     }
 
@@ -171,17 +307,17 @@
              "Not expecting a captured pointer.");
       auto *Var = I->getCapturedVar();
       QualType VarTy = Var->getType();
-      setAddrOfLocalVar(I->getCapturedVar(),
-                        castValueFromUintptr(*this, FD->getType(),
-                                             Args[Cnt]->getName(), ArgLVal,
-                                             VarTy->isReferenceType()));
+      setAddrOfLocalVar(Var, castValueFromUintptr(*this, FD->getType(),
+                                                  Args[Cnt]->getName(), ArgLVal,
+                                                  VarTy->isReferenceType()));
     } else {
       // If 'this' is captured, load it into CXXThisValue.
       assert(I->capturesThis());
       CXXThisValue =
           EmitLoadOfLValue(ArgLVal, Args[Cnt]->getLocation()).getScalarVal();
     }
-    ++Cnt, ++I;
+    ++Cnt;
+    ++I;
   }
 
   PGO.assignRegionCounters(GlobalDecl(CD), F);
@@ -255,12 +391,77 @@
   EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
+/// Check whether the combiner is a call to a UDR combiner and, if so, return
+/// the UDR decl used for the reduction.
+static const OMPDeclareReductionDecl *
+getReductionInit(const Expr *ReductionOp) {
+  if (auto *CE = dyn_cast<CallExpr>(ReductionOp))
+    if (auto *OVE = dyn_cast<OpaqueValueExpr>(CE->getCallee()))
+      if (auto *DRE =
+              dyn_cast<DeclRefExpr>(OVE->getSourceExpr()->IgnoreImpCasts()))
+        if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(DRE->getDecl()))
+          return DRD;
+  return nullptr;
+}
+
+static void emitInitWithReductionInitializer(CodeGenFunction &CGF,
+                                             const OMPDeclareReductionDecl *DRD,
+                                             const Expr *InitOp,
+                                             Address Private, Address Original,
+                                             QualType Ty) {
+  if (DRD->getInitializer()) {
+    std::pair<llvm::Function *, llvm::Function *> Reduction =
+        CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD);
+    auto *CE = cast<CallExpr>(InitOp);
+    auto *OVE = cast<OpaqueValueExpr>(CE->getCallee());
+    const Expr *LHS = CE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
+    const Expr *RHS = CE->getArg(/*Arg=*/1)->IgnoreParenImpCasts();
+    auto *LHSDRE = cast<DeclRefExpr>(cast<UnaryOperator>(LHS)->getSubExpr());
+    auto *RHSDRE = cast<DeclRefExpr>(cast<UnaryOperator>(RHS)->getSubExpr());
+    CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+    PrivateScope.addPrivate(cast<VarDecl>(LHSDRE->getDecl()),
+                            [=]() -> Address { return Private; });
+    PrivateScope.addPrivate(cast<VarDecl>(RHSDRE->getDecl()),
+                            [=]() -> Address { return Original; });
+    (void)PrivateScope.Privatize();
+    RValue Func = RValue::get(Reduction.second);
+    CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func);
+    CGF.EmitIgnoredExpr(InitOp);
+  } else {
+    llvm::Constant *Init = CGF.CGM.EmitNullConstant(Ty);
+    auto *GV = new llvm::GlobalVariable(
+        CGF.CGM.getModule(), Init->getType(), /*isConstant=*/true,
+        llvm::GlobalValue::PrivateLinkage, Init, ".init");
+    LValue LV = CGF.MakeNaturalAlignAddrLValue(GV, Ty);
+    RValue InitRVal;
+    switch (CGF.getEvaluationKind(Ty)) {
+    case TEK_Scalar:
+      InitRVal = CGF.EmitLoadOfLValue(LV, SourceLocation());
+      break;
+    case TEK_Complex:
+      InitRVal =
+          RValue::getComplex(CGF.EmitLoadOfComplex(LV, SourceLocation()));
+      break;
+    case TEK_Aggregate:
+      InitRVal = RValue::getAggregate(LV.getAddress());
+      break;
+    }
+    OpaqueValueExpr OVE(SourceLocation(), Ty, VK_RValue);
+    CodeGenFunction::OpaqueValueMapping OpaqueMap(CGF, &OVE, InitRVal);
+    CGF.EmitAnyExprToMem(&OVE, Private, Ty.getQualifiers(),
+                         /*IsInitializer=*/false);
+  }
+}
+
 /// \brief Emit initialization of arrays of complex types.
 /// \param DestAddr Address of the array.
 /// \param Type Type of array.
 /// \param Init Initial expression of array.
+/// \param SrcAddr Address of the original array.
 static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr,
-                                 QualType Type, const Expr *Init) {
+                                 QualType Type, const Expr *Init,
+                                 Address SrcAddr = Address::invalid()) {
+  auto *DRD = getReductionInit(Init);
   // Perform element-by-element initialization.
   QualType ElementTy;
 
@@ -269,7 +470,13 @@
   auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, DestAddr);
   DestAddr =
       CGF.Builder.CreateElementBitCast(DestAddr, DestAddr.getElementType());
+  if (DRD)
+    SrcAddr =
+        CGF.Builder.CreateElementBitCast(SrcAddr, DestAddr.getElementType());
 
+  llvm::Value *SrcBegin = nullptr;
+  if (DRD)
+    SrcBegin = SrcAddr.getPointer();
   auto DestBegin = DestAddr.getPointer();
   // Cast from pointer to array type to pointer to single element.
   auto DestEnd = CGF.Builder.CreateGEP(DestBegin, NumElements);
@@ -286,6 +493,16 @@
 
   CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy);
 
+  llvm::PHINode *SrcElementPHI = nullptr;
+  Address SrcElementCurrent = Address::invalid();
+  if (DRD) {
+    SrcElementPHI = CGF.Builder.CreatePHI(SrcBegin->getType(), 2,
+                                          "omp.arraycpy.srcElementPast");
+    SrcElementPHI->addIncoming(SrcBegin, EntryBB);
+    SrcElementCurrent =
+        Address(SrcElementPHI,
+                SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize));
+  }
   llvm::PHINode *DestElementPHI = CGF.Builder.CreatePHI(
       DestBegin->getType(), 2, "omp.arraycpy.destElementPast");
   DestElementPHI->addIncoming(DestBegin, EntryBB);
@@ -296,8 +513,19 @@
   // Emit copy.
   {
     CodeGenFunction::RunCleanupsScope InitScope(CGF);
-    CGF.EmitAnyExprToMem(Init, DestElementCurrent, ElementTy.getQualifiers(),
-                         /*IsInitializer=*/false);
+    if (DRD && (DRD->getInitializer() || !Init)) {
+      emitInitWithReductionInitializer(CGF, DRD, Init, DestElementCurrent,
+                                       SrcElementCurrent, ElementTy);
+    } else
+      CGF.EmitAnyExprToMem(Init, DestElementCurrent, ElementTy.getQualifiers(),
+                           /*IsInitializer=*/false);
+  }
+
+  if (DRD) {
+    // Shift the address forward by one element.
+    auto SrcElementNext = CGF.Builder.CreateConstGEP1_32(
+        SrcElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element");
+    SrcElementPHI->addIncoming(SrcElementNext, CGF.Builder.GetInsertBlock());
   }
 
   // Shift the address forward by one element.
@@ -355,24 +583,42 @@
                                                 OMPPrivateScope &PrivateScope) {
   if (!HaveInsertPoint())
     return false;
+  bool FirstprivateIsLastprivate = false;
+  llvm::DenseSet<const VarDecl *> Lastprivates;
+  for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
+    for (const auto *D : C->varlists())
+      Lastprivates.insert(
+          cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl());
+  }
   llvm::DenseSet<const VarDecl *> EmittedAsFirstprivate;
+  CGCapturedStmtInfo CapturesInfo(cast<CapturedStmt>(*D.getAssociatedStmt()));
   for (const auto *C : D.getClausesOfKind<OMPFirstprivateClause>()) {
     auto IRef = C->varlist_begin();
     auto InitsRef = C->inits().begin();
     for (auto IInit : C->private_copies()) {
       auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
-      if (EmittedAsFirstprivate.count(OrigVD) == 0) {
-        EmittedAsFirstprivate.insert(OrigVD);
+      bool ThisFirstprivateIsLastprivate =
+          Lastprivates.count(OrigVD->getCanonicalDecl()) > 0;
+      auto *CapFD = CapturesInfo.lookup(OrigVD);
+      auto *FD = CapturedStmtInfo->lookup(OrigVD);
+      if (!ThisFirstprivateIsLastprivate && FD && (FD == CapFD) &&
+          !FD->getType()->isReferenceType()) {
+        EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl());
+        ++IRef;
+        ++InitsRef;
+        continue;
+      }
+      FirstprivateIsLastprivate =
+          FirstprivateIsLastprivate || ThisFirstprivateIsLastprivate;
+      if (EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl()).second) {
         auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IInit)->getDecl());
         auto *VDInit = cast<VarDecl>(cast<DeclRefExpr>(*InitsRef)->getDecl());
         bool IsRegistered;
-        DeclRefExpr DRE(
-            const_cast<VarDecl *>(OrigVD),
-            /*RefersToEnclosingVariableOrCapture=*/CapturedStmtInfo->lookup(
-                OrigVD) != nullptr,
-            (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc());
+        DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
+                        /*RefersToEnclosingVariableOrCapture=*/FD != nullptr,
+                        (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc());
         Address OriginalAddr = EmitLValue(&DRE).getAddress();
-        QualType Type = OrigVD->getType();
+        QualType Type = VD->getType();
         if (Type->isArrayType()) {
           // Emit VarDecl with copy init for arrays.
           // Get the address of the original variable captured in current
@@ -419,10 +665,11 @@
         // Silence the warning about unused variable.
         (void)IsRegistered;
       }
-      ++IRef, ++InitsRef;
+      ++IRef;
+      ++InitsRef;
     }
   }
-  return !EmittedAsFirstprivate.empty();
+  return FirstprivateIsLastprivate && !EmittedAsFirstprivate.empty();
 }
 
 void CodeGenFunction::EmitOMPPrivateClause(
@@ -469,7 +716,6 @@
       auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
       QualType Type = VD->getType();
       if (CopiedVars.insert(VD->getCanonicalDecl()).second) {
-
         // Get the address of the master variable. If we are emitting code with
         // TLS support, the address is passed from the master as field in the
         // captured declaration.
@@ -524,15 +770,27 @@
   if (!HaveInsertPoint())
     return false;
   bool HasAtLeastOneLastprivate = false;
+  llvm::DenseSet<const VarDecl *> SIMDLCVs;
+  if (isOpenMPSimdDirective(D.getDirectiveKind())) {
+    auto *LoopDirective = cast<OMPLoopDirective>(&D);
+    for (auto *C : LoopDirective->counters()) {
+      SIMDLCVs.insert(
+          cast<VarDecl>(cast<DeclRefExpr>(C)->getDecl())->getCanonicalDecl());
+    }
+  }
   llvm::DenseSet<const VarDecl *> AlreadyEmittedVars;
   for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
     HasAtLeastOneLastprivate = true;
+    if (isOpenMPTaskLoopDirective(D.getDirectiveKind()))
+      break;
     auto IRef = C->varlist_begin();
     auto IDestRef = C->destination_exprs().begin();
     for (auto *IInit : C->private_copies()) {
       // Keep the address of the original variable for future update at the end
       // of the loop.
       auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
+      // Taskloops do not require additional initialization; it is done in
+      // the runtime support library.
       if (AlreadyEmittedVars.insert(OrigVD->getCanonicalDecl()).second) {
         auto *DestVD = cast<VarDecl>(cast<DeclRefExpr>(*IDestRef)->getDecl());
         PrivateScope.addPrivate(DestVD, [this, OrigVD, IRef]() -> Address {
@@ -546,27 +804,28 @@
         // Check if the variable is also a firstprivate: in this case IInit is
         // not generated. Initialization of this variable will happen in codegen
         // for 'firstprivate' clause.
-        if (IInit) {
+        if (IInit && !SIMDLCVs.count(OrigVD->getCanonicalDecl())) {
           auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IInit)->getDecl());
-          bool IsRegistered =
-              PrivateScope.addPrivate(OrigVD, [&]() -> Address {
-                // Emit private VarDecl with copy init.
-                EmitDecl(*VD);
-                return GetAddrOfLocalVar(VD);
-              });
+          bool IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address {
+            // Emit private VarDecl with copy init.
+            EmitDecl(*VD);
+            return GetAddrOfLocalVar(VD);
+          });
           assert(IsRegistered &&
                  "lastprivate var already registered as private");
           (void)IsRegistered;
         }
       }
-      ++IRef, ++IDestRef;
+      ++IRef;
+      ++IDestRef;
     }
   }
   return HasAtLeastOneLastprivate;
 }
 
 void CodeGenFunction::EmitOMPLastprivateClauseFinal(
-    const OMPExecutableDirective &D, llvm::Value *IsLastIterCond) {
+    const OMPExecutableDirective &D, bool NoFinals,
+    llvm::Value *IsLastIterCond) {
   if (!HaveInsertPoint())
     return;
   // Emit following code:
@@ -583,72 +842,103 @@
     Builder.CreateCondBr(IsLastIterCond, ThenBB, DoneBB);
     EmitBlock(ThenBB);
   }
-  llvm::DenseMap<const Decl *, const Expr *> LoopCountersAndUpdates;
-  const Expr *LastIterVal = nullptr;
-  const Expr *IVExpr = nullptr;
-  const Expr *IncExpr = nullptr;
+  llvm::DenseSet<const VarDecl *> AlreadyEmittedVars;
+  llvm::DenseMap<const VarDecl *, const Expr *> LoopCountersAndUpdates;
   if (auto *LoopDirective = dyn_cast<OMPLoopDirective>(&D)) {
-    if (isOpenMPWorksharingDirective(D.getDirectiveKind())) {
-      LastIterVal = cast<VarDecl>(cast<DeclRefExpr>(
-                                      LoopDirective->getUpperBoundVariable())
-                                      ->getDecl())
-                        ->getAnyInitializer();
-      IVExpr = LoopDirective->getIterationVariable();
-      IncExpr = LoopDirective->getInc();
-      auto IUpdate = LoopDirective->updates().begin();
-      for (auto *E : LoopDirective->counters()) {
-        auto *D = cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
-        LoopCountersAndUpdates[D] = *IUpdate;
-        ++IUpdate;
-      }
+    auto IC = LoopDirective->counters().begin();
+    for (auto F : LoopDirective->finals()) {
+      auto *D =
+          cast<VarDecl>(cast<DeclRefExpr>(*IC)->getDecl())->getCanonicalDecl();
+      if (NoFinals)
+        AlreadyEmittedVars.insert(D);
+      else
+        LoopCountersAndUpdates[D] = F;
+      ++IC;
     }
   }
-  {
-    llvm::DenseSet<const VarDecl *> AlreadyEmittedVars;
-    bool FirstLCV = true;
-    for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
-      auto IRef = C->varlist_begin();
-      auto ISrcRef = C->source_exprs().begin();
-      auto IDestRef = C->destination_exprs().begin();
-      for (auto *AssignOp : C->assignment_ops()) {
-        auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
-        QualType Type = PrivateVD->getType();
-        auto *CanonicalVD = PrivateVD->getCanonicalDecl();
-        if (AlreadyEmittedVars.insert(CanonicalVD).second) {
-          // If lastprivate variable is a loop control variable for loop-based
-          // directive, update its value before copyin back to original
-          // variable.
-          if (auto *UpExpr = LoopCountersAndUpdates.lookup(CanonicalVD)) {
-            if (FirstLCV && LastIterVal) {
-              EmitAnyExprToMem(LastIterVal, EmitLValue(IVExpr).getAddress(),
-                               IVExpr->getType().getQualifiers(),
-                               /*IsInitializer=*/false);
-              EmitIgnoredExpr(IncExpr);
-              FirstLCV = false;
-            }
-            EmitIgnoredExpr(UpExpr);
-          }
-          auto *SrcVD = cast<VarDecl>(cast<DeclRefExpr>(*ISrcRef)->getDecl());
-          auto *DestVD = cast<VarDecl>(cast<DeclRefExpr>(*IDestRef)->getDecl());
-          // Get the address of the original variable.
-          Address OriginalAddr = GetAddrOfLocalVar(DestVD);
-          // Get the address of the private variable.
-          Address PrivateAddr = GetAddrOfLocalVar(PrivateVD);
-          if (auto RefTy = PrivateVD->getType()->getAs<ReferenceType>())
-            PrivateAddr =
+  for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
+    auto IRef = C->varlist_begin();
+    auto ISrcRef = C->source_exprs().begin();
+    auto IDestRef = C->destination_exprs().begin();
+    for (auto *AssignOp : C->assignment_ops()) {
+      auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
+      QualType Type = PrivateVD->getType();
+      auto *CanonicalVD = PrivateVD->getCanonicalDecl();
+      if (AlreadyEmittedVars.insert(CanonicalVD).second) {
+        // If the lastprivate variable is a loop control variable for a
+        // loop-based directive, update its value before copying it back to the
+        // original variable.
+        if (auto *FinalExpr = LoopCountersAndUpdates.lookup(CanonicalVD))
+          EmitIgnoredExpr(FinalExpr);
+        auto *SrcVD = cast<VarDecl>(cast<DeclRefExpr>(*ISrcRef)->getDecl());
+        auto *DestVD = cast<VarDecl>(cast<DeclRefExpr>(*IDestRef)->getDecl());
+        // Get the address of the original variable.
+        Address OriginalAddr = GetAddrOfLocalVar(DestVD);
+        // Get the address of the private variable.
+        Address PrivateAddr = GetAddrOfLocalVar(PrivateVD);
+        if (auto RefTy = PrivateVD->getType()->getAs<ReferenceType>())
+          PrivateAddr =
               Address(Builder.CreateLoad(PrivateAddr),
                       getNaturalTypeAlignment(RefTy->getPointeeType()));
-          EmitOMPCopy(Type, OriginalAddr, PrivateAddr, DestVD, SrcVD, AssignOp);
-        }
-        ++IRef;
-        ++ISrcRef;
-        ++IDestRef;
+        EmitOMPCopy(Type, OriginalAddr, PrivateAddr, DestVD, SrcVD, AssignOp);
       }
+      ++IRef;
+      ++ISrcRef;
+      ++IDestRef;
     }
+    if (auto *PostUpdate = C->getPostUpdateExpr())
+      EmitIgnoredExpr(PostUpdate);
   }
-  if (IsLastIterCond) {
+  if (IsLastIterCond)
     EmitBlock(DoneBB, /*IsFinished=*/true);
+}
+
+static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy,
+                          LValue BaseLV, llvm::Value *Addr) {
+  Address Tmp = Address::invalid();
+  Address TopTmp = Address::invalid();
+  Address MostTopTmp = Address::invalid();
+  BaseTy = BaseTy.getNonReferenceType();
+  while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) &&
+         !CGF.getContext().hasSameType(BaseTy, ElTy)) {
+    Tmp = CGF.CreateMemTemp(BaseTy);
+    if (TopTmp.isValid())
+      CGF.Builder.CreateStore(Tmp.getPointer(), TopTmp);
+    else
+      MostTopTmp = Tmp;
+    TopTmp = Tmp;
+    BaseTy = BaseTy->getPointeeType();
   }
+  llvm::Type *Ty = BaseLV.getPointer()->getType();
+  if (Tmp.isValid())
+    Ty = Tmp.getElementType();
+  Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Ty);
+  if (Tmp.isValid()) {
+    CGF.Builder.CreateStore(Addr, Tmp);
+    return MostTopTmp;
+  }
+  return Address(Addr, BaseLV.getAlignment());
+}
+
+static LValue loadToBegin(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy,
+                          LValue BaseLV) {
+  BaseTy = BaseTy.getNonReferenceType();
+  while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) &&
+         !CGF.getContext().hasSameType(BaseTy, ElTy)) {
+    if (auto *PtrTy = BaseTy->getAs<PointerType>())
+      BaseLV = CGF.EmitLoadOfPointerLValue(BaseLV.getAddress(), PtrTy);
+    else {
+      BaseLV = CGF.EmitLoadOfReferenceLValue(BaseLV.getAddress(),
+                                             BaseTy->castAs<ReferenceType>());
+    }
+    BaseTy = BaseTy->getPointeeType();
+  }
+  return CGF.MakeAddrLValue(
+      Address(
+          CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+              BaseLV.getPointer(), CGF.ConvertTypeForMem(ElTy)->getPointerTo()),
+          BaseLV.getAlignment()),
+      BaseLV.getType(), BaseLV.getAlignmentSource());
 }
 
 void CodeGenFunction::EmitOMPReductionClauseInit(
@@ -660,10 +950,12 @@
     auto ILHS = C->lhs_exprs().begin();
     auto IRHS = C->rhs_exprs().begin();
     auto IPriv = C->privates().begin();
+    auto IRed = C->reduction_ops().begin();
     for (auto IRef : C->varlists()) {
       auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
       auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
       auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>(*IPriv)->getDecl());
+      auto *DRD = getReductionInit(*IRed);
       if (auto *OASE = dyn_cast<OMPArraySectionExpr>(IRef)) {
         auto *Base = OASE->getBase()->IgnoreParenImpCasts();
         while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
@@ -676,21 +968,9 @@
         auto OASELValueUB =
             EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false);
         auto OriginalBaseLValue = EmitLValue(DE);
-        auto BaseLValue = OriginalBaseLValue;
-        auto *Zero = Builder.getInt64(/*C=*/0);
-        llvm::SmallVector<llvm::Value *, 4> Indexes;
-        Indexes.push_back(Zero);
-        auto *ItemTy =
-            OASELValueLB.getPointer()->getType()->getPointerElementType();
-        auto *Ty = BaseLValue.getPointer()->getType()->getPointerElementType();
-        while (Ty != ItemTy) {
-          Indexes.push_back(Zero);
-          Ty = Ty->getPointerElementType();
-        }
-        BaseLValue = MakeAddrLValue(
-            Address(Builder.CreateInBoundsGEP(BaseLValue.getPointer(), Indexes),
-                    OASELValueLB.getAlignment()),
-            OASELValueLB.getType(), OASELValueLB.getAlignmentSource());
+        LValue BaseLValue =
+            loadToBegin(*this, OrigVD->getType(), OASELValueLB.getType(),
+                        OriginalBaseLValue);
         // Store the address of the original variable associated with the LHS
         // implicit variable.
         PrivateScope.addPrivate(LHSVD, [this, OASELValueLB]() -> Address {
@@ -698,8 +978,8 @@
         });
         // Emit reduction copy.
         bool IsRegistered = PrivateScope.addPrivate(
-            OrigVD, [this, PrivateVD, BaseLValue, OASELValueLB, OASELValueUB,
-                     OriginalBaseLValue]() -> Address {
+            OrigVD, [this, OrigVD, PrivateVD, BaseLValue, OASELValueLB,
+                     OASELValueUB, OriginalBaseLValue, DRD, IRed]() -> Address {
               // Emit VarDecl with copy init for arrays.
               // Get the address of the original variable captured in current
               // captured region.
@@ -717,15 +997,17 @@
               auto Emission = EmitAutoVarAlloca(*PrivateVD);
               auto Addr = Emission.getAllocatedAddress();
               auto *Init = PrivateVD->getInit();
-              EmitOMPAggregateInit(*this, Addr, PrivateVD->getType(), Init);
+              EmitOMPAggregateInit(*this, Addr, PrivateVD->getType(),
+                                   DRD ? *IRed : Init,
+                                   OASELValueLB.getAddress());
               EmitAutoVarCleanups(Emission);
               // Emit private VarDecl with reduction init.
               auto *Offset = Builder.CreatePtrDiff(BaseLValue.getPointer(),
                                                    OASELValueLB.getPointer());
               auto *Ptr = Builder.CreateGEP(Addr.getPointer(), Offset);
-              Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-                  Ptr, OriginalBaseLValue.getPointer()->getType());
-              return Address(Ptr, OriginalBaseLValue.getAlignment());
+              return castToBase(*this, OrigVD->getType(),
+                                OASELValueLB.getType(), OriginalBaseLValue,
+                                Ptr);
             });
         assert(IsRegistered && "private var already registered as private");
         // Silence the warning about unused variable.
@@ -741,21 +1023,8 @@
         auto *OrigVD = cast<VarDecl>(DE->getDecl());
         auto ASELValue = EmitLValue(ASE);
         auto OriginalBaseLValue = EmitLValue(DE);
-        auto BaseLValue = OriginalBaseLValue;
-        auto *Zero = Builder.getInt64(/*C=*/0);
-        llvm::SmallVector<llvm::Value *, 4> Indexes;
-        Indexes.push_back(Zero);
-        auto *ItemTy =
-            ASELValue.getPointer()->getType()->getPointerElementType();
-        auto *Ty = BaseLValue.getPointer()->getType()->getPointerElementType();
-        while (Ty != ItemTy) {
-          Indexes.push_back(Zero);
-          Ty = Ty->getPointerElementType();
-        }
-        BaseLValue = MakeAddrLValue(
-            Address(Builder.CreateInBoundsGEP(BaseLValue.getPointer(), Indexes),
-                    ASELValue.getAlignment()),
-            ASELValue.getType(), ASELValue.getAlignmentSource());
+        LValue BaseLValue = loadToBegin(
+            *this, OrigVD->getType(), ASELValue.getType(), OriginalBaseLValue);
         // Store the address of the original variable associated with the LHS
         // implicit variable.
         PrivateScope.addPrivate(LHSVD, [this, ASELValue]() -> Address {
@@ -763,49 +1032,114 @@
         });
         // Emit reduction copy.
         bool IsRegistered = PrivateScope.addPrivate(
-            OrigVD, [this, PrivateVD, BaseLValue, ASELValue,
-                     OriginalBaseLValue]() -> Address {
+            OrigVD, [this, OrigVD, PrivateVD, BaseLValue, ASELValue,
+                     OriginalBaseLValue, DRD, IRed]() -> Address {
               // Emit private VarDecl with reduction init.
-              EmitDecl(*PrivateVD);
-              auto Addr = GetAddrOfLocalVar(PrivateVD);
+              AutoVarEmission Emission = EmitAutoVarAlloca(*PrivateVD);
+              auto Addr = Emission.getAllocatedAddress();
+              if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) {
+                emitInitWithReductionInitializer(*this, DRD, *IRed, Addr,
+                                                 ASELValue.getAddress(),
+                                                 ASELValue.getType());
+              } else
+                EmitAutoVarInit(Emission);
+              EmitAutoVarCleanups(Emission);
               auto *Offset = Builder.CreatePtrDiff(BaseLValue.getPointer(),
                                                    ASELValue.getPointer());
               auto *Ptr = Builder.CreateGEP(Addr.getPointer(), Offset);
-              Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-                  Ptr, OriginalBaseLValue.getPointer()->getType());
-              return Address(Ptr, OriginalBaseLValue.getAlignment());
+              return castToBase(*this, OrigVD->getType(), ASELValue.getType(),
+                                OriginalBaseLValue, Ptr);
             });
         assert(IsRegistered && "private var already registered as private");
         // Silence the warning about unused variable.
         (void)IsRegistered;
-        PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address {
-          return GetAddrOfLocalVar(PrivateVD);
+        PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD]() -> Address {
+          return Builder.CreateElementBitCast(
+              GetAddrOfLocalVar(PrivateVD), ConvertTypeForMem(RHSVD->getType()),
+              "rhs.begin");
         });
       } else {
         auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(IRef)->getDecl());
-        // Store the address of the original variable associated with the LHS
-        // implicit variable.
-        PrivateScope.addPrivate(LHSVD, [this, OrigVD, IRef]() -> Address {
+        QualType Type = PrivateVD->getType();
+        if (getContext().getAsArrayType(Type)) {
+          // Store the address of the original variable associated with the LHS
+          // implicit variable.
           DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
                           CapturedStmtInfo->lookup(OrigVD) != nullptr,
                           IRef->getType(), VK_LValue, IRef->getExprLoc());
-          return EmitLValue(&DRE).getAddress();
-        });
-        // Emit reduction copy.
-        bool IsRegistered =
-            PrivateScope.addPrivate(OrigVD, [this, PrivateVD]() -> Address {
-              // Emit private VarDecl with reduction init.
-              EmitDecl(*PrivateVD);
-              return GetAddrOfLocalVar(PrivateVD);
-            });
-        assert(IsRegistered && "private var already registered as private");
-        // Silence the warning about unused variable.
-        (void)IsRegistered;
-        PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address {
-          return GetAddrOfLocalVar(PrivateVD);
-        });
+          Address OriginalAddr = EmitLValue(&DRE).getAddress();
+          PrivateScope.addPrivate(LHSVD, [this, &OriginalAddr,
+                                          LHSVD]() -> Address {
+            OriginalAddr = Builder.CreateElementBitCast(
+                OriginalAddr, ConvertTypeForMem(LHSVD->getType()), "lhs.begin");
+            return OriginalAddr;
+          });
+          bool IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address {
+            if (Type->isVariablyModifiedType()) {
+              CodeGenFunction::OpaqueValueMapping OpaqueMap(
+                  *this, cast<OpaqueValueExpr>(
+                             getContext()
+                                 .getAsVariableArrayType(PrivateVD->getType())
+                                 ->getSizeExpr()),
+                  RValue::get(
+                      getTypeSize(OrigVD->getType().getNonReferenceType())));
+              EmitVariablyModifiedType(Type);
+            }
+            auto Emission = EmitAutoVarAlloca(*PrivateVD);
+            auto Addr = Emission.getAllocatedAddress();
+            auto *Init = PrivateVD->getInit();
+            EmitOMPAggregateInit(*this, Addr, PrivateVD->getType(),
+                                 DRD ? *IRed : Init, OriginalAddr);
+            EmitAutoVarCleanups(Emission);
+            return Emission.getAllocatedAddress();
+          });
+          assert(IsRegistered && "private var already registered as private");
+          // Silence the warning about unused variable.
+          (void)IsRegistered;
+          PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD]() -> Address {
+            return Builder.CreateElementBitCast(
+                GetAddrOfLocalVar(PrivateVD),
+                ConvertTypeForMem(RHSVD->getType()), "rhs.begin");
+          });
+        } else {
+          // Store the address of the original variable associated with the LHS
+          // implicit variable.
+          Address OriginalAddr = Address::invalid();
+          PrivateScope.addPrivate(LHSVD, [this, OrigVD, IRef,
+                                          &OriginalAddr]() -> Address {
+            DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
+                            CapturedStmtInfo->lookup(OrigVD) != nullptr,
+                            IRef->getType(), VK_LValue, IRef->getExprLoc());
+            OriginalAddr = EmitLValue(&DRE).getAddress();
+            return OriginalAddr;
+          });
+          // Emit reduction copy.
+          bool IsRegistered = PrivateScope.addPrivate(
+              OrigVD, [this, PrivateVD, OriginalAddr, DRD, IRed]() -> Address {
+                // Emit private VarDecl with reduction init.
+                AutoVarEmission Emission = EmitAutoVarAlloca(*PrivateVD);
+                auto Addr = Emission.getAllocatedAddress();
+                if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) {
+                  emitInitWithReductionInitializer(*this, DRD, *IRed, Addr,
+                                                   OriginalAddr,
+                                                   PrivateVD->getType());
+                } else
+                  EmitAutoVarInit(Emission);
+                EmitAutoVarCleanups(Emission);
+                return Addr;
+              });
+          assert(IsRegistered && "private var already registered as private");
+          // Silence the warning about unused variable.
+          (void)IsRegistered;
+          PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address {
+            return GetAddrOfLocalVar(PrivateVD);
+          });
+        }
       }
-      ++ILHS, ++IRHS, ++IPriv;
+      ++ILHS;
+      ++IRHS;
+      ++IPriv;
+      ++IRed;
     }
   }
 }
@@ -838,15 +1172,39 @@
   }
 }
 
+static void emitPostUpdateForReductionClause(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D,
+    const llvm::function_ref<llvm::Value *(CodeGenFunction &)> &CondGen) {
+  if (!CGF.HaveInsertPoint())
+    return;
+  llvm::BasicBlock *DoneBB = nullptr;
+  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
+    if (auto *PostUpdate = C->getPostUpdateExpr()) {
+      if (!DoneBB) {
+        if (auto *Cond = CondGen(CGF)) {
+          // On the first post-update expression found, emit the conditional
+          // block if a condition was requested.
+          auto *ThenBB = CGF.createBasicBlock(".omp.reduction.pu");
+          DoneBB = CGF.createBasicBlock(".omp.reduction.pu.done");
+          CGF.Builder.CreateCondBr(Cond, ThenBB, DoneBB);
+          CGF.EmitBlock(ThenBB);
+        }
+      }
+      CGF.EmitIgnoredExpr(PostUpdate);
+    }
+  }
+  if (DoneBB)
+    CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
+}
+
 static void emitCommonOMPParallelDirective(CodeGenFunction &CGF,
                                            const OMPExecutableDirective &S,
                                            OpenMPDirectiveKind InnermostKind,
                                            const RegionCodeGenTy &CodeGen) {
   auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
-  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
-  CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
-  auto OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction(
-      S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen);
+  auto OutlinedFn = CGF.CGM.getOpenMPRuntime().
+      emitParallelOrTeamsOutlinedFunction(S,
+          *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen);
   if (const auto *NumThreadsClause = S.getSingleClause<OMPNumThreadsClause>()) {
     CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF);
     auto NumThreads = CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(),
@@ -855,7 +1213,7 @@
         CGF, NumThreads, NumThreadsClause->getLocStart());
   }
   if (const auto *ProcBindClause = S.getSingleClause<OMPProcBindClause>()) {
-    CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF);
+    CodeGenFunction::RunCleanupsScope ProcBindScope(CGF);
     CGF.CGM.getOpenMPRuntime().emitProcBindClause(
         CGF, ProcBindClause->getProcBindKind(), ProcBindClause->getLocStart());
   }
@@ -867,22 +1225,24 @@
       break;
     }
   }
+
+  OMPLexicalScope Scope(CGF, S);
+  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
+  CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
   CGF.CGM.getOpenMPRuntime().emitParallelCall(CGF, S.getLocStart(), OutlinedFn,
                                               CapturedVars, IfCond);
 }
 
 void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
   // Emit parallel region as a standalone region.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
     OMPPrivateScope PrivateScope(CGF);
     bool Copyins = CGF.EmitOMPCopyinClause(S);
-    bool Firstprivates = CGF.EmitOMPFirstprivateClause(S, PrivateScope);
-    if (Copyins || Firstprivates) {
+    (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope);
+    if (Copyins) {
       // Emit implicit barrier to synchronize threads and avoid data races on
-      // initialization of firstprivate variables or propagation master's thread
-      // values of threadprivate variables to local instances of that variables
-      // of all other implicit threads.
+      // propagation of the master thread's values of threadprivate variables to
+      // the local instances of those variables in all other implicit threads.
       CGF.CGM.getOpenMPRuntime().emitBarrierCall(
           CGF, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false,
           /*ForceSimpleCall=*/true);
@@ -894,6 +1254,8 @@
     CGF.EmitOMPReductionClauseFinal(S);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen);
+  emitPostUpdateForReductionClause(
+      *this, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; });
 }
 
 void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D,
@@ -905,9 +1267,8 @@
   }
   // Update the linear variables.
   for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
-    for (auto U : C->updates()) {
+    for (auto *U : C->updates())
       EmitIgnoredExpr(U);
-    }
   }
 
   // On a continue in the body, jump to the end.
@@ -918,10 +1279,6 @@
   // The end (updates/cleanups).
   EmitBlock(Continue.getBlock());
   BreakContinueStack.pop_back();
-    // TODO: Update lastprivates if the SeparateIter flag is true.
-    // This will be implemented in a follow-up OMPLastprivateClause patch, but
-    // result should be still correct without it, as we do not make these
-    // variables private yet.
 }
 
 void CodeGenFunction::EmitOMPInnerLoop(
@@ -934,7 +1291,7 @@
   // Start the loop with a block that tests the condition.
   auto CondBlock = createBasicBlock("omp.inner.for.cond");
   EmitBlock(CondBlock);
-  LoopStack.push(CondBlock);
+  LoopStack.push(CondBlock, Builder.getCurrentDebugLocation());
 
   // If there are any cleanups between here and the loop-exit scope,
   // create a block to stage a loop exit along.
@@ -976,19 +1333,21 @@
     return;
   // Emit inits for the linear variables.
   for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
-    for (auto Init : C->inits()) {
+    for (auto *Init : C->inits()) {
       auto *VD = cast<VarDecl>(cast<DeclRefExpr>(Init)->getDecl());
-      auto *OrigVD = cast<VarDecl>(
-          cast<DeclRefExpr>(VD->getInit()->IgnoreImpCasts())->getDecl());
-      DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
-                      CapturedStmtInfo->lookup(OrigVD) != nullptr,
-                      VD->getInit()->getType(), VK_LValue,
-                      VD->getInit()->getExprLoc());
-      AutoVarEmission Emission = EmitAutoVarAlloca(*VD);
-      EmitExprAsInit(&DRE, VD,
-               MakeAddrLValue(Emission.getAllocatedAddress(), VD->getType()),
-                     /*capturedByInit=*/false);
-      EmitAutoVarCleanups(Emission);
+      if (auto *Ref = dyn_cast<DeclRefExpr>(VD->getInit()->IgnoreImpCasts())) {
+        AutoVarEmission Emission = EmitAutoVarAlloca(*VD);
+        auto *OrigVD = cast<VarDecl>(Ref->getDecl());
+        DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
+                        CapturedStmtInfo->lookup(OrigVD) != nullptr,
+                        VD->getInit()->getType(), VK_LValue,
+                        VD->getInit()->getExprLoc());
+        EmitExprAsInit(&DRE, VD, MakeAddrLValue(Emission.getAllocatedAddress(),
+                                                VD->getType()),
+                       /*capturedByInit=*/false);
+        EmitAutoVarCleanups(Emission);
+      } else
+        EmitVarDecl(*VD);
     }
     // Emit the linear steps for the linear clauses.
     // If a step is not constant, it is pre-calculated before the loop.
@@ -1001,27 +1360,42 @@
   }
 }
 
-static void emitLinearClauseFinal(CodeGenFunction &CGF,
-                                  const OMPLoopDirective &D) {
-  if (!CGF.HaveInsertPoint())
+void CodeGenFunction::EmitOMPLinearClauseFinal(
+    const OMPLoopDirective &D,
+    const llvm::function_ref<llvm::Value *(CodeGenFunction &)> &CondGen) {
+  if (!HaveInsertPoint())
     return;
+  llvm::BasicBlock *DoneBB = nullptr;
   // Emit the final values of the linear variables.
   for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
     auto IC = C->varlist_begin();
-    for (auto F : C->finals()) {
+    for (auto *F : C->finals()) {
+      if (!DoneBB) {
+        if (auto *Cond = CondGen(*this)) {
+          // On the first final expression found, emit the conditional
+          // block if a condition was requested.
+          auto *ThenBB = createBasicBlock(".omp.linear.pu");
+          DoneBB = createBasicBlock(".omp.linear.pu.done");
+          Builder.CreateCondBr(Cond, ThenBB, DoneBB);
+          EmitBlock(ThenBB);
+        }
+      }
       auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IC)->getDecl());
       DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
-                      CGF.CapturedStmtInfo->lookup(OrigVD) != nullptr,
+                      CapturedStmtInfo->lookup(OrigVD) != nullptr,
                       (*IC)->getType(), VK_LValue, (*IC)->getExprLoc());
-      Address OrigAddr = CGF.EmitLValue(&DRE).getAddress();
-      CodeGenFunction::OMPPrivateScope VarScope(CGF);
-      VarScope.addPrivate(OrigVD,
-                          [OrigAddr]() -> Address { return OrigAddr; });
+      Address OrigAddr = EmitLValue(&DRE).getAddress();
+      CodeGenFunction::OMPPrivateScope VarScope(*this);
+      VarScope.addPrivate(OrigVD, [OrigAddr]() -> Address { return OrigAddr; });
       (void)VarScope.Privatize();
-      CGF.EmitIgnoredExpr(F);
+      EmitIgnoredExpr(F);
       ++IC;
     }
+    if (auto *PostUpdate = C->getPostUpdateExpr())
+      EmitIgnoredExpr(PostUpdate);
   }
+  if (DoneBB)
+    EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
 static void emitAlignedClause(CodeGenFunction &CGF,
@@ -1057,25 +1431,34 @@
   }
 }
 
-static void emitPrivateLoopCounters(CodeGenFunction &CGF,
-                                    CodeGenFunction::OMPPrivateScope &LoopScope,
-                                    ArrayRef<Expr *> Counters,
-                                    ArrayRef<Expr *> PrivateCounters) {
-  if (!CGF.HaveInsertPoint())
+void CodeGenFunction::EmitOMPPrivateLoopCounters(
+    const OMPLoopDirective &S, CodeGenFunction::OMPPrivateScope &LoopScope) {
+  if (!HaveInsertPoint())
     return;
-  auto I = PrivateCounters.begin();
-  for (auto *E : Counters) {
+  auto I = S.private_counters().begin();
+  for (auto *E : S.counters()) {
     auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
     auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl());
-    Address Addr = Address::invalid();
-    (void)LoopScope.addPrivate(PrivateVD, [&]() -> Address {
+    (void)LoopScope.addPrivate(VD, [&]() -> Address {
       // Emit var without initialization.
-      auto VarEmission = CGF.EmitAutoVarAlloca(*PrivateVD);
-      CGF.EmitAutoVarCleanups(VarEmission);
-      Addr = VarEmission.getAllocatedAddress();
-      return Addr;
+      if (!LocalDeclMap.count(PrivateVD)) {
+        auto VarEmission = EmitAutoVarAlloca(*PrivateVD);
+        EmitAutoVarCleanups(VarEmission);
+      }
+      DeclRefExpr DRE(const_cast<VarDecl *>(PrivateVD),
+                      /*RefersToEnclosingVariableOrCapture=*/false,
+                      (*I)->getType(), VK_LValue, (*I)->getExprLoc());
+      return EmitLValue(&DRE).getAddress();
     });
-    (void)LoopScope.addPrivate(VD, [&]() -> Address { return Addr; });
+    if (LocalDeclMap.count(VD) || CapturedStmtInfo->lookup(VD) ||
+        VD->hasGlobalStorage()) {
+      (void)LoopScope.addPrivate(PrivateVD, [&]() -> Address {
+        DeclRefExpr DRE(const_cast<VarDecl *>(VD),
+                        LocalDeclMap.count(VD) || CapturedStmtInfo->lookup(VD),
+                        E->getType(), VK_LValue, E->getExprLoc());
+        return EmitLValue(&DRE).getAddress();
+      });
+    }
     ++I;
   }
 }
@@ -1087,8 +1470,7 @@
     return;
   {
     CodeGenFunction::OMPPrivateScope PreCondScope(CGF);
-    emitPrivateLoopCounters(CGF, PreCondScope, S.counters(),
-                            S.private_counters());
+    CGF.EmitOMPPrivateLoopCounters(S, PreCondScope);
     (void)PreCondScope.Privatize();
     // Get initial values of real counters.
     for (auto I : S.inits()) {
@@ -1099,25 +1481,35 @@
   CGF.EmitBranchOnBoolExpr(Cond, TrueBlock, FalseBlock, TrueCount);
 }
 
-static void
-emitPrivateLinearVars(CodeGenFunction &CGF, const OMPExecutableDirective &D,
-                      CodeGenFunction::OMPPrivateScope &PrivateScope) {
-  if (!CGF.HaveInsertPoint())
+void CodeGenFunction::EmitOMPLinearClause(
+    const OMPLoopDirective &D, CodeGenFunction::OMPPrivateScope &PrivateScope) {
+  if (!HaveInsertPoint())
     return;
+  llvm::DenseSet<const VarDecl *> SIMDLCVs;
+  if (isOpenMPSimdDirective(D.getDirectiveKind())) {
+    auto *LoopDirective = cast<OMPLoopDirective>(&D);
+    for (auto *C : LoopDirective->counters()) {
+      SIMDLCVs.insert(
+          cast<VarDecl>(cast<DeclRefExpr>(C)->getDecl())->getCanonicalDecl());
+    }
+  }
   for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
     auto CurPrivate = C->privates().begin();
     for (auto *E : C->varlists()) {
       auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
       auto *PrivateVD =
           cast<VarDecl>(cast<DeclRefExpr>(*CurPrivate)->getDecl());
-      bool IsRegistered = PrivateScope.addPrivate(VD, [&]() -> Address {
-        // Emit private VarDecl with copy init.
-        CGF.EmitVarDecl(*PrivateVD);
-        return CGF.GetAddrOfLocalVar(PrivateVD);
-      });
-      assert(IsRegistered && "linear var already registered as private");
-      // Silence the warning about unused variable.
-      (void)IsRegistered;
+      if (!SIMDLCVs.count(VD->getCanonicalDecl())) {
+        bool IsRegistered = PrivateScope.addPrivate(VD, [&]() -> Address {
+          // Emit private VarDecl with copy init.
+          EmitVarDecl(*PrivateVD);
+          return GetAddrOfLocalVar(PrivateVD);
+        });
+        assert(IsRegistered && "linear var already registered as private");
+        // Silence the warning about unused variable.
+        (void)IsRegistered;
+      } else
+        EmitVarDecl(*PrivateVD);
       ++CurPrivate;
     }
   }
@@ -1158,17 +1550,39 @@
   emitSimdlenSafelenClause(*this, D, IsMonotonic);
 }
 
-void CodeGenFunction::EmitOMPSimdFinal(const OMPLoopDirective &D) {
+void CodeGenFunction::EmitOMPSimdFinal(
+    const OMPLoopDirective &D,
+    const llvm::function_ref<llvm::Value *(CodeGenFunction &)> &CondGen) {
   if (!HaveInsertPoint())
     return;
+  llvm::BasicBlock *DoneBB = nullptr;
   auto IC = D.counters().begin();
+  auto IPC = D.private_counters().begin();
   for (auto F : D.finals()) {
     auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>((*IC))->getDecl());
-    if (LocalDeclMap.count(OrigVD) || CapturedStmtInfo->lookup(OrigVD)) {
-      DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
-                      CapturedStmtInfo->lookup(OrigVD) != nullptr,
-                      (*IC)->getType(), VK_LValue, (*IC)->getExprLoc());
-      Address OrigAddr = EmitLValue(&DRE).getAddress();
+    auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>((*IPC))->getDecl());
+    auto *CED = dyn_cast<OMPCapturedExprDecl>(OrigVD);
+    if (LocalDeclMap.count(OrigVD) || CapturedStmtInfo->lookup(OrigVD) ||
+        OrigVD->hasGlobalStorage() || CED) {
+      if (!DoneBB) {
+        if (auto *Cond = CondGen(*this)) {
+          // On the first final update that is emitted, emit the conditional
+          // block if a condition was requested.
+          auto *ThenBB = createBasicBlock(".omp.final.then");
+          DoneBB = createBasicBlock(".omp.final.done");
+          Builder.CreateCondBr(Cond, ThenBB, DoneBB);
+          EmitBlock(ThenBB);
+        }
+      }
+      Address OrigAddr = Address::invalid();
+      if (CED)
+        OrigAddr = EmitLValue(CED->getInit()->IgnoreImpCasts()).getAddress();
+      else {
+        DeclRefExpr DRE(const_cast<VarDecl *>(PrivateVD),
+                        /*RefersToEnclosingVariableOrCapture=*/false,
+                        (*IPC)->getType(), VK_LValue, (*IPC)->getExprLoc());
+        OrigAddr = EmitLValue(&DRE).getAddress();
+      }
       OMPPrivateScope VarScope(*this);
       VarScope.addPrivate(OrigVD,
                           [OrigAddr]() -> Address { return OrigAddr; });
@@ -1176,12 +1590,15 @@
       EmitIgnoredExpr(F);
     }
     ++IC;
+    ++IPC;
   }
-  emitLinearClauseFinal(*this, D);
+  if (DoneBB)
+    EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
 void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    OMPLoopScope PreInitScope(CGF, S);
     // if (PreCond) {
     //   for (IV in 0..LastIteration) BODY;
     //   <Final counter/linear vars updates>;
@@ -1224,15 +1641,14 @@
 
     emitAlignedClause(CGF, S);
     CGF.EmitOMPLinearClauseInit(S);
-    bool HasLastprivateClause;
     {
       OMPPrivateScope LoopScope(CGF);
-      emitPrivateLoopCounters(CGF, LoopScope, S.counters(),
-                              S.private_counters());
-      emitPrivateLinearVars(CGF, S, LoopScope);
+      CGF.EmitOMPPrivateLoopCounters(S, LoopScope);
+      CGF.EmitOMPLinearClause(S, LoopScope);
       CGF.EmitOMPPrivateClause(S, LoopScope);
       CGF.EmitOMPReductionClauseInit(S, LoopScope);
-      HasLastprivateClause = CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
+      bool HasLastprivateClause =
+          CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
       (void)LoopScope.Privatize();
       CGF.EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(),
                            S.getInc(),
@@ -1241,104 +1657,42 @@
                              CGF.EmitStopPoint(&S);
                            },
                            [](CodeGenFunction &) {});
+      CGF.EmitOMPSimdFinal(
+          S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; });
       // Emit final copy of the lastprivate variables at the end of loops.
-      if (HasLastprivateClause) {
-        CGF.EmitOMPLastprivateClauseFinal(S);
-      }
+      if (HasLastprivateClause)
+        CGF.EmitOMPLastprivateClauseFinal(S, /*NoFinals=*/true);
       CGF.EmitOMPReductionClauseFinal(S);
+      emitPostUpdateForReductionClause(
+          CGF, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; });
     }
-    CGF.EmitOMPSimdFinal(S);
+    CGF.EmitOMPLinearClauseFinal(
+        S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; });
     // Emit: if (PreCond) - end.
     if (ContBlock) {
       CGF.EmitBranch(ContBlock);
       CGF.EmitBlock(ContBlock, true);
     }
   };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
 }
 
-void CodeGenFunction::EmitOMPForOuterLoop(
-    OpenMPScheduleClauseKind ScheduleKind, bool IsMonotonic,
+void CodeGenFunction::EmitOMPOuterLoop(bool DynamicOrOrdered, bool IsMonotonic,
     const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered,
     Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) {
   auto &RT = CGM.getOpenMPRuntime();
 
-  // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime).
-  const bool DynamicOrOrdered = Ordered || RT.isDynamic(ScheduleKind);
-
-  assert((Ordered ||
-          !RT.isStaticNonchunked(ScheduleKind, /*Chunked=*/Chunk != nullptr)) &&
-         "static non-chunked schedule does not need outer loop");
-
-  // Emit outer loop.
-  //
-  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
-  // When schedule(dynamic,chunk_size) is specified, the iterations are
-  // distributed to threads in the team in chunks as the threads request them.
-  // Each thread executes a chunk of iterations, then requests another chunk,
-  // until no chunks remain to be distributed. Each chunk contains chunk_size
-  // iterations, except for the last chunk to be distributed, which may have
-  // fewer iterations. When no chunk_size is specified, it defaults to 1.
-  //
-  // When schedule(guided,chunk_size) is specified, the iterations are assigned
-  // to threads in the team in chunks as the executing threads request them.
-  // Each thread executes a chunk of iterations, then requests another chunk,
-  // until no chunks remain to be assigned. For a chunk_size of 1, the size of
-  // each chunk is proportional to the number of unassigned iterations divided
-  // by the number of threads in the team, decreasing to 1. For a chunk_size
-  // with value k (greater than 1), the size of each chunk is determined in the
-  // same way, with the restriction that the chunks do not contain fewer than k
-  // iterations (except for the last chunk to be assigned, which may have fewer
-  // than k iterations).
-  //
-  // When schedule(auto) is specified, the decision regarding scheduling is
-  // delegated to the compiler and/or runtime system. The programmer gives the
-  // implementation the freedom to choose any possible mapping of iterations to
-  // threads in the team.
-  //
-  // When schedule(runtime) is specified, the decision regarding scheduling is
-  // deferred until run time, and the schedule and chunk size are taken from the
-  // run-sched-var ICV. If the ICV is set to auto, the schedule is
-  // implementation defined
-  //
-  // while(__kmpc_dispatch_next(&LB, &UB)) {
-  //   idx = LB;
-  //   while (idx <= UB) { BODY; ++idx;
-  //   __kmpc_dispatch_fini_(4|8)[u](); // For ordered loops only.
-  //   } // inner loop
-  // }
-  //
-  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
-  // When schedule(static, chunk_size) is specified, iterations are divided into
-  // chunks of size chunk_size, and the chunks are assigned to the threads in
-  // the team in a round-robin fashion in the order of the thread number.
-  //
-  // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
-  //   while (idx <= UB) { BODY; ++idx; } // inner loop
-  //   LB = LB + ST;
-  //   UB = UB + ST;
-  // }
-  //
-
   const Expr *IVExpr = S.getIterationVariable();
   const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
   const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
 
-  if (DynamicOrOrdered) {
-    llvm::Value *UBVal = EmitScalarExpr(S.getLastIteration());
-    RT.emitForDispatchInit(*this, S.getLocStart(), ScheduleKind,
-                           IVSize, IVSigned, Ordered, UBVal, Chunk);
-  } else {
-    RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind,
-                         IVSize, IVSigned, Ordered, IL, LB, UB, ST, Chunk);
-  }
-
   auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end");
 
   // Start the loop with a block that tests the condition.
   auto CondBlock = createBasicBlock("omp.dispatch.cond");
   EmitBlock(CondBlock);
-  LoopStack.push(CondBlock);
+  LoopStack.push(CondBlock, Builder.getCurrentDebugLocation());
 
   llvm::Value *BoolCondVal = nullptr;
   if (!DynamicOrOrdered) {
@@ -1349,8 +1703,8 @@
     // IV < UB
     BoolCondVal = EvaluateExprAsBool(S.getCond());
   } else {
-    BoolCondVal = RT.emitForNext(*this, S.getLocStart(), IVSize, IVSigned,
-                                    IL, LB, UB, ST);
+    BoolCondVal = RT.emitForNext(*this, S.getLocStart(), IVSize, IVSigned, IL,
+                                 LB, UB, ST);
   }
 
   // If there are any cleanups between here and the loop-exit scope,
@@ -1412,6 +1766,184 @@
   // Tell the runtime we are done.
   if (!DynamicOrOrdered)
     RT.emitForStaticFinish(*this, S.getLocEnd());
+
+}
+
+void CodeGenFunction::EmitOMPForOuterLoop(
+    const OpenMPScheduleTy &ScheduleKind, bool IsMonotonic,
+    const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered,
+    Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) {
+  auto &RT = CGM.getOpenMPRuntime();
+
+  // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime).
+  const bool DynamicOrOrdered =
+      Ordered || RT.isDynamic(ScheduleKind.Schedule);
+
+  assert((Ordered ||
+          !RT.isStaticNonchunked(ScheduleKind.Schedule,
+                                 /*Chunked=*/Chunk != nullptr)) &&
+         "static non-chunked schedule does not need outer loop");
+
+  // Emit outer loop.
+  //
+  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
+  // When schedule(dynamic,chunk_size) is specified, the iterations are
+  // distributed to threads in the team in chunks as the threads request them.
+  // Each thread executes a chunk of iterations, then requests another chunk,
+  // until no chunks remain to be distributed. Each chunk contains chunk_size
+  // iterations, except for the last chunk to be distributed, which may have
+  // fewer iterations. When no chunk_size is specified, it defaults to 1.
+  //
+  // When schedule(guided,chunk_size) is specified, the iterations are assigned
+  // to threads in the team in chunks as the executing threads request them.
+  // Each thread executes a chunk of iterations, then requests another chunk,
+  // until no chunks remain to be assigned. For a chunk_size of 1, the size of
+  // each chunk is proportional to the number of unassigned iterations divided
+  // by the number of threads in the team, decreasing to 1. For a chunk_size
+  // with value k (greater than 1), the size of each chunk is determined in the
+  // same way, with the restriction that the chunks do not contain fewer than k
+  // iterations (except for the last chunk to be assigned, which may have fewer
+  // than k iterations).
+  //
+  // When schedule(auto) is specified, the decision regarding scheduling is
+  // delegated to the compiler and/or runtime system. The programmer gives the
+  // implementation the freedom to choose any possible mapping of iterations to
+  // threads in the team.
+  //
+  // When schedule(runtime) is specified, the decision regarding scheduling is
+  // deferred until run time, and the schedule and chunk size are taken from the
+  // run-sched-var ICV. If the ICV is set to auto, the schedule is
+  // implementation defined.
+  //
+  // while(__kmpc_dispatch_next(&LB, &UB)) {
+  //   idx = LB;
+  //   while (idx <= UB) { BODY; ++idx;
+  //   __kmpc_dispatch_fini_(4|8)[u](); // For ordered loops only.
+  //   } // inner loop
+  // }
+  //
+  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
+  // When schedule(static, chunk_size) is specified, iterations are divided into
+  // chunks of size chunk_size, and the chunks are assigned to the threads in
+  // the team in a round-robin fashion in the order of the thread number.
+  //
+  // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
+  //   while (idx <= UB) { BODY; ++idx; } // inner loop
+  //   LB = LB + ST;
+  //   UB = UB + ST;
+  // }
+  //
+
+  const Expr *IVExpr = S.getIterationVariable();
+  const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
+  const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+
+  if (DynamicOrOrdered) {
+    llvm::Value *UBVal = EmitScalarExpr(S.getLastIteration());
+    RT.emitForDispatchInit(*this, S.getLocStart(), ScheduleKind, IVSize,
+                           IVSigned, Ordered, UBVal, Chunk);
+  } else {
+    RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned,
+                         Ordered, IL, LB, UB, ST, Chunk);
+  }
+
+  EmitOMPOuterLoop(DynamicOrOrdered, IsMonotonic, S, LoopScope, Ordered, LB, UB,
+                   ST, IL, Chunk);
+}
+
+void CodeGenFunction::EmitOMPDistributeOuterLoop(
+    OpenMPDistScheduleClauseKind ScheduleKind,
+    const OMPDistributeDirective &S, OMPPrivateScope &LoopScope,
+    Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) {
+
+  auto &RT = CGM.getOpenMPRuntime();
+
+  // Emit outer loop.
+  // Same behavior as EmitOMPForOuterLoop, except that the schedule cannot be
+  // dynamic.
+  //
+
+  const Expr *IVExpr = S.getIterationVariable();
+  const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
+  const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+
+  RT.emitDistributeStaticInit(*this, S.getLocStart(), ScheduleKind,
+                              IVSize, IVSigned, /* Ordered = */ false,
+                              IL, LB, UB, ST, Chunk);
+
+  EmitOMPOuterLoop(/* DynamicOrOrdered = */ false, /* IsMonotonic = */ false,
+                   S, LoopScope, /* Ordered = */ false, LB, UB, ST, IL, Chunk);
+}
+
+void CodeGenFunction::EmitOMPDistributeParallelForDirective(
+    const OMPDistributeParallelForDirective &S) {
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(
+      *this, OMPD_distribute_parallel_for,
+      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+        OMPLoopScope PreInitScope(CGF, S);
+        CGF.EmitStmt(
+            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      });
+}
+
+void CodeGenFunction::EmitOMPDistributeParallelForSimdDirective(
+    const OMPDistributeParallelForSimdDirective &S) {
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(
+      *this, OMPD_distribute_parallel_for_simd,
+      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+        OMPLoopScope PreInitScope(CGF, S);
+        CGF.EmitStmt(
+            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      });
+}
+
+void CodeGenFunction::EmitOMPDistributeSimdDirective(
+    const OMPDistributeSimdDirective &S) {
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(
+      *this, OMPD_distribute_simd,
+      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+        OMPLoopScope PreInitScope(CGF, S);
+        CGF.EmitStmt(
+            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      });
+}
+
+void CodeGenFunction::EmitOMPTargetParallelForSimdDirective(
+    const OMPTargetParallelForSimdDirective &S) {
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(
+      *this, OMPD_target_parallel_for_simd,
+      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+        OMPLoopScope PreInitScope(CGF, S);
+        CGF.EmitStmt(
+            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      });
+}
+
+void CodeGenFunction::EmitOMPTargetSimdDirective(
+    const OMPTargetSimdDirective &S) {
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(
+      *this, OMPD_target_simd, [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+        OMPLoopScope PreInitScope(CGF, S);
+        CGF.EmitStmt(
+            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      });
+}
+
+void CodeGenFunction::EmitOMPTeamsDistributeDirective(
+    const OMPTeamsDistributeDirective &S) {
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(
+      *this, OMPD_teams_distribute,
+      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+        OMPLoopScope PreInitScope(CGF, S);
+        CGF.EmitStmt(
+            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      });
 }
 
 /// \brief Emit a helper variable and return corresponding lvalue.
@@ -1434,42 +1966,6 @@
   };
 } // namespace
 
-static std::pair<llvm::Value * /*Chunk*/, ScheduleKindModifiersTy>
-emitScheduleClause(CodeGenFunction &CGF, const OMPLoopDirective &S,
-                   bool OuterRegion) {
-  // Detect the loop schedule kind and chunk.
-  auto ScheduleKind = OMPC_SCHEDULE_unknown;
-  OpenMPScheduleClauseModifier M1 = OMPC_SCHEDULE_MODIFIER_unknown;
-  OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown;
-  llvm::Value *Chunk = nullptr;
-  if (const auto *C = S.getSingleClause<OMPScheduleClause>()) {
-    ScheduleKind = C->getScheduleKind();
-    M1 = C->getFirstScheduleModifier();
-    M2 = C->getSecondScheduleModifier();
-    if (const auto *Ch = C->getChunkSize()) {
-      if (auto *ImpRef = cast_or_null<DeclRefExpr>(C->getHelperChunkSize())) {
-        if (OuterRegion) {
-          const VarDecl *ImpVar = cast<VarDecl>(ImpRef->getDecl());
-          CGF.EmitVarDecl(*ImpVar);
-          CGF.EmitStoreThroughLValue(
-              CGF.EmitAnyExpr(Ch),
-              CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(ImpVar),
-                                 ImpVar->getType()));
-        } else {
-          Ch = ImpRef;
-        }
-      }
-      if (!C->getHelperChunkSize() || !OuterRegion) {
-        Chunk = CGF.EmitScalarExpr(Ch);
-        Chunk = CGF.EmitScalarConversion(Chunk, Ch->getType(),
-                                         S.getIterationVariable()->getType(),
-                                         S.getLocStart());
-      }
-    }
-  }
-  return std::make_pair(Chunk, ScheduleKindModifiersTy(ScheduleKind, M1, M2));
-}
-
 bool CodeGenFunction::EmitOMPWorksharingLoop(const OMPLoopDirective &S) {
   // Emit the loop iteration variable.
   auto IVExpr = cast<DeclRefExpr>(S.getIterationVariable());
@@ -1490,6 +1986,7 @@
   bool HasLastprivateClause;
   // Check pre-condition.
   {
+    OMPLoopScope PreInitScope(*this, S);
     // Skip the entire loop if we don't meet the precondition.
     // If the condition constant folds and can be elided, avoid emitting the
     // whole loop.
@@ -1507,24 +2004,34 @@
       incrementProfileCounter(&S);
     }
 
+    bool Ordered = false;
+    if (auto *OrderedClause = S.getSingleClause<OMPOrderedClause>()) {
+      if (OrderedClause->getNumForLoops())
+        RT.emitDoacrossInit(*this, S);
+      else
+        Ordered = true;
+    }
+
+    llvm::DenseSet<const Expr *> EmittedFinals;
     emitAlignedClause(*this, S);
     EmitOMPLinearClauseInit(S);
+    // Emit helper vars inits.
+    LValue LB =
+        EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getLowerBoundVariable()));
+    LValue UB =
+        EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getUpperBoundVariable()));
+    LValue ST =
+        EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getStrideVariable()));
+    LValue IL =
+        EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getIsLastIterVariable()));
+
     // Emit 'then' code.
     {
-      // Emit helper vars inits.
-      LValue LB =
-          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getLowerBoundVariable()));
-      LValue UB =
-          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getUpperBoundVariable()));
-      LValue ST =
-          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getStrideVariable()));
-      LValue IL =
-          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getIsLastIterVariable()));
-
       OMPPrivateScope LoopScope(*this);
       if (EmitOMPFirstprivateClause(S, LoopScope)) {
         // Emit implicit barrier to synchronize threads and avoid data races on
-        // initialization of firstprivate variables.
+        // initialization of firstprivate variables and post-update of
+        // lastprivate variables.
         CGM.getOpenMPRuntime().emitBarrierCall(
             *this, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false,
             /*ForceSimpleCall=*/true);
@@ -1532,28 +2039,31 @@
       EmitOMPPrivateClause(S, LoopScope);
       HasLastprivateClause = EmitOMPLastprivateClauseInit(S, LoopScope);
       EmitOMPReductionClauseInit(S, LoopScope);
-      emitPrivateLoopCounters(*this, LoopScope, S.counters(),
-                              S.private_counters());
-      emitPrivateLinearVars(*this, S, LoopScope);
+      EmitOMPPrivateLoopCounters(S, LoopScope);
+      EmitOMPLinearClause(S, LoopScope);
       (void)LoopScope.Privatize();
 
       // Detect the loop schedule kind and chunk.
-      llvm::Value *Chunk;
-      OpenMPScheduleClauseKind ScheduleKind;
-      auto ScheduleInfo =
-          emitScheduleClause(*this, S, /*OuterRegion=*/false);
-      Chunk = ScheduleInfo.first;
-      ScheduleKind = ScheduleInfo.second.Kind;
-      const OpenMPScheduleClauseModifier M1 = ScheduleInfo.second.M1;
-      const OpenMPScheduleClauseModifier M2 = ScheduleInfo.second.M2;
+      llvm::Value *Chunk = nullptr;
+      OpenMPScheduleTy ScheduleKind;
+      if (auto *C = S.getSingleClause<OMPScheduleClause>()) {
+        ScheduleKind.Schedule = C->getScheduleKind();
+        ScheduleKind.M1 = C->getFirstScheduleModifier();
+        ScheduleKind.M2 = C->getSecondScheduleModifier();
+        if (const auto *Ch = C->getChunkSize()) {
+          Chunk = EmitScalarExpr(Ch);
+          Chunk = EmitScalarConversion(Chunk, Ch->getType(),
+                                       S.getIterationVariable()->getType(),
+                                       S.getLocStart());
+        }
+      }
       const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
       const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
-      const bool Ordered = S.getSingleClause<OMPOrderedClause>() != nullptr;
       // OpenMP 4.5, 2.7.1 Loop Construct, Description.
       // If the static schedule kind is specified or if the ordered clause is
       // specified, and if no monotonic modifier is specified, the effect will
       // be as if the monotonic modifier was specified.
-      if (RT.isStaticNonchunked(ScheduleKind,
+      if (RT.isStaticNonchunked(ScheduleKind.Schedule,
                                 /* Chunked */ Chunk != nullptr) &&
           !Ordered) {
         if (isOpenMPSimdDirective(S.getDirectiveKind()))
@@ -1585,26 +2095,41 @@
         // Tell the runtime we are done.
         RT.emitForStaticFinish(*this, S.getLocStart());
       } else {
-        const bool IsMonotonic = Ordered ||
-                                 ScheduleKind == OMPC_SCHEDULE_static ||
-                                 ScheduleKind == OMPC_SCHEDULE_unknown ||
-                                 M1 == OMPC_SCHEDULE_MODIFIER_monotonic ||
-                                 M2 == OMPC_SCHEDULE_MODIFIER_monotonic;
+        const bool IsMonotonic =
+            Ordered || ScheduleKind.Schedule == OMPC_SCHEDULE_static ||
+            ScheduleKind.Schedule == OMPC_SCHEDULE_unknown ||
+            ScheduleKind.M1 == OMPC_SCHEDULE_MODIFIER_monotonic ||
+            ScheduleKind.M2 == OMPC_SCHEDULE_MODIFIER_monotonic;
         // Emit the outer loop, which requests its work chunk [LB..UB] from
         // runtime and runs the inner loop to process it.
         EmitOMPForOuterLoop(ScheduleKind, IsMonotonic, S, LoopScope, Ordered,
                             LB.getAddress(), UB.getAddress(), ST.getAddress(),
                             IL.getAddress(), Chunk);
       }
+      if (isOpenMPSimdDirective(S.getDirectiveKind())) {
+        EmitOMPSimdFinal(S,
+                         [&](CodeGenFunction &CGF) -> llvm::Value * {
+                           return CGF.Builder.CreateIsNotNull(
+                               CGF.EmitLoadOfScalar(IL, S.getLocStart()));
+                         });
+      }
       EmitOMPReductionClauseFinal(S);
+      // Emit post-update of the reduction variables if IsLastIter != 0.
+      emitPostUpdateForReductionClause(
+          *this, S, [&](CodeGenFunction &CGF) -> llvm::Value * {
+            return CGF.Builder.CreateIsNotNull(
+                CGF.EmitLoadOfScalar(IL, S.getLocStart()));
+          });
       // Emit final copy of the lastprivate variables if IsLastIter != 0.
       if (HasLastprivateClause)
         EmitOMPLastprivateClauseFinal(
-            S, Builder.CreateIsNotNull(EmitLoadOfScalar(IL, S.getLocStart())));
+            S, isOpenMPSimdDirective(S.getDirectiveKind()),
+            Builder.CreateIsNotNull(EmitLoadOfScalar(IL, S.getLocStart())));
     }
-    if (isOpenMPSimdDirective(S.getDirectiveKind())) {
-      EmitOMPSimdFinal(S);
-    }
+    EmitOMPLinearClauseFinal(S, [&](CodeGenFunction &CGF) -> llvm::Value * {
+      return CGF.Builder.CreateIsNotNull(
+          CGF.EmitLoadOfScalar(IL, S.getLocStart()));
+    });
     // We're now done with the loop, so jump to the continuation block.
     if (ContBlock) {
       EmitBranch(ContBlock);
@@ -1615,13 +2140,16 @@
 }
 
 void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
   bool HasLastprivates = false;
-  auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
+                                          PrePostActionTy &) {
     HasLastprivates = CGF.EmitOMPWorksharingLoop(S);
   };
-  CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_for, CodeGen,
-                                              S.hasCancel());
+  {
+    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_for, CodeGen,
+                                                S.hasCancel());
+  }
 
   // Emit an implicit barrier at the end.
   if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates) {
@@ -1630,12 +2158,15 @@
 }
 
 void CodeGenFunction::EmitOMPForSimdDirective(const OMPForSimdDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
   bool HasLastprivates = false;
-  auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
+                                          PrePostActionTy &) {
     HasLastprivates = CGF.EmitOMPWorksharingLoop(S);
   };
-  CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
+  {
+    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
+  }
 
   // Emit an implicit barrier at the end.
   if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates) {
@@ -1652,54 +2183,55 @@
   return LVal;
 }
 
-OpenMPDirectiveKind
-CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
+void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
   auto *Stmt = cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt();
   auto *CS = dyn_cast<CompoundStmt>(Stmt);
-  if (CS && CS->size() > 1) {
-    bool HasLastprivates = false;
-    auto &&CodeGen = [&S, CS, &HasLastprivates](CodeGenFunction &CGF) {
-      auto &C = CGF.CGM.getContext();
-      auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
-      // Emit helper vars inits.
-      LValue LB = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.lb.",
-                                    CGF.Builder.getInt32(0));
-      auto *GlobalUBVal = CGF.Builder.getInt32(CS->size() - 1);
-      LValue UB =
-          createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.ub.", GlobalUBVal);
-      LValue ST = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.st.",
-                                    CGF.Builder.getInt32(1));
-      LValue IL = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.il.",
-                                    CGF.Builder.getInt32(0));
-      // Loop counter.
-      LValue IV = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.iv.");
-      OpaqueValueExpr IVRefExpr(S.getLocStart(), KmpInt32Ty, VK_LValue);
-      CodeGenFunction::OpaqueValueMapping OpaqueIV(CGF, &IVRefExpr, IV);
-      OpaqueValueExpr UBRefExpr(S.getLocStart(), KmpInt32Ty, VK_LValue);
-      CodeGenFunction::OpaqueValueMapping OpaqueUB(CGF, &UBRefExpr, UB);
-      // Generate condition for loop.
-      BinaryOperator Cond(&IVRefExpr, &UBRefExpr, BO_LE, C.BoolTy, VK_RValue,
-                          OK_Ordinary, S.getLocStart(),
-                          /*fpContractable=*/false);
-      // Increment for loop counter.
-      UnaryOperator Inc(&IVRefExpr, UO_PreInc, KmpInt32Ty, VK_RValue,
-                        OK_Ordinary, S.getLocStart());
-      auto BodyGen = [CS, &S, &IV](CodeGenFunction &CGF) {
-        // Iterate through all sections and emit a switch construct:
-        // switch (IV) {
-        //   case 0:
-        //     <SectionStmt[0]>;
-        //     break;
-        // ...
-        //   case <NumSection> - 1:
-        //     <SectionStmt[<NumSection> - 1]>;
-        //     break;
-        // }
-        // .omp.sections.exit:
-        auto *ExitBB = CGF.createBasicBlock(".omp.sections.exit");
-        auto *SwitchStmt = CGF.Builder.CreateSwitch(
-            CGF.EmitLoadOfLValue(IV, S.getLocStart()).getScalarVal(), ExitBB,
-            CS->size());
+  bool HasLastprivates = false;
+  auto &&CodeGen = [&S, Stmt, CS, &HasLastprivates](CodeGenFunction &CGF,
+                                                    PrePostActionTy &) {
+    auto &C = CGF.CGM.getContext();
+    auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
+    // Emit helper vars inits.
+    LValue LB = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.lb.",
+                                  CGF.Builder.getInt32(0));
+    auto *GlobalUBVal = CS != nullptr ? CGF.Builder.getInt32(CS->size() - 1)
+                                      : CGF.Builder.getInt32(0);
+    LValue UB =
+        createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.ub.", GlobalUBVal);
+    LValue ST = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.st.",
+                                  CGF.Builder.getInt32(1));
+    LValue IL = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.il.",
+                                  CGF.Builder.getInt32(0));
+    // Loop counter.
+    LValue IV = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.iv.");
+    OpaqueValueExpr IVRefExpr(S.getLocStart(), KmpInt32Ty, VK_LValue);
+    CodeGenFunction::OpaqueValueMapping OpaqueIV(CGF, &IVRefExpr, IV);
+    OpaqueValueExpr UBRefExpr(S.getLocStart(), KmpInt32Ty, VK_LValue);
+    CodeGenFunction::OpaqueValueMapping OpaqueUB(CGF, &UBRefExpr, UB);
+    // Generate condition for loop.
+    BinaryOperator Cond(&IVRefExpr, &UBRefExpr, BO_LE, C.BoolTy, VK_RValue,
+                        OK_Ordinary, S.getLocStart(),
+                        /*fpContractable=*/false);
+    // Increment for loop counter.
+    UnaryOperator Inc(&IVRefExpr, UO_PreInc, KmpInt32Ty, VK_RValue, OK_Ordinary,
+                      S.getLocStart());
+    auto BodyGen = [Stmt, CS, &S, &IV](CodeGenFunction &CGF) {
+      // Iterate through all sections and emit a switch construct:
+      // switch (IV) {
+      //   case 0:
+      //     <SectionStmt[0]>;
+      //     break;
+      // ...
+      //   case <NumSection> - 1:
+      //     <SectionStmt[<NumSection> - 1]>;
+      //     break;
+      // }
+      // .omp.sections.exit:
+      auto *ExitBB = CGF.createBasicBlock(".omp.sections.exit");
+      auto *SwitchStmt = CGF.Builder.CreateSwitch(
+          CGF.EmitLoadOfLValue(IV, S.getLocStart()).getScalarVal(), ExitBB,
+          CS == nullptr ? 1 : CS->size());
+      if (CS) {
         unsigned CaseNumber = 0;
         for (auto *SubStmt : CS->children()) {
           auto CaseBB = CGF.createBasicBlock(".omp.sections.case");
@@ -1709,115 +2241,100 @@
           CGF.EmitBranch(ExitBB);
           ++CaseNumber;
         }
-        CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
-      };
-
-      CodeGenFunction::OMPPrivateScope LoopScope(CGF);
-      if (CGF.EmitOMPFirstprivateClause(S, LoopScope)) {
-        // Emit implicit barrier to synchronize threads and avoid data races on
-        // initialization of firstprivate variables.
-        CGF.CGM.getOpenMPRuntime().emitBarrierCall(
-            CGF, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false,
-            /*ForceSimpleCall=*/true);
+      } else {
+        auto CaseBB = CGF.createBasicBlock(".omp.sections.case");
+        CGF.EmitBlock(CaseBB);
+        SwitchStmt->addCase(CGF.Builder.getInt32(0), CaseBB);
+        CGF.EmitStmt(Stmt);
+        CGF.EmitBranch(ExitBB);
       }
-      CGF.EmitOMPPrivateClause(S, LoopScope);
-      HasLastprivates = CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
-      CGF.EmitOMPReductionClauseInit(S, LoopScope);
-      (void)LoopScope.Privatize();
-
-      // Emit static non-chunked loop.
-      CGF.CGM.getOpenMPRuntime().emitForStaticInit(
-          CGF, S.getLocStart(), OMPC_SCHEDULE_static, /*IVSize=*/32,
-          /*IVSigned=*/true, /*Ordered=*/false, IL.getAddress(),
-          LB.getAddress(), UB.getAddress(), ST.getAddress());
-      // UB = min(UB, GlobalUB);
-      auto *UBVal = CGF.EmitLoadOfScalar(UB, S.getLocStart());
-      auto *MinUBGlobalUB = CGF.Builder.CreateSelect(
-          CGF.Builder.CreateICmpSLT(UBVal, GlobalUBVal), UBVal, GlobalUBVal);
-      CGF.EmitStoreOfScalar(MinUBGlobalUB, UB);
-      // IV = LB;
-      CGF.EmitStoreOfScalar(CGF.EmitLoadOfScalar(LB, S.getLocStart()), IV);
-      // while (idx <= UB) { BODY; ++idx; }
-      CGF.EmitOMPInnerLoop(S, /*RequiresCleanup=*/false, &Cond, &Inc, BodyGen,
-                           [](CodeGenFunction &) {});
-      // Tell the runtime we are done.
-      CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocStart());
-      CGF.EmitOMPReductionClauseFinal(S);
-
-      // Emit final copy of the lastprivate variables if IsLastIter != 0.
-      if (HasLastprivates)
-        CGF.EmitOMPLastprivateClauseFinal(
-            S, CGF.Builder.CreateIsNotNull(
-                   CGF.EmitLoadOfScalar(IL, S.getLocStart())));
+      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
     };
 
-    bool HasCancel = false;
-    if (auto *OSD = dyn_cast<OMPSectionsDirective>(&S))
-      HasCancel = OSD->hasCancel();
-    else if (auto *OPSD = dyn_cast<OMPParallelSectionsDirective>(&S))
-      HasCancel = OPSD->hasCancel();
-    CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_sections, CodeGen,
-                                                HasCancel);
-    // Emit barrier for lastprivates only if 'sections' directive has 'nowait'
-    // clause. Otherwise the barrier will be generated by the codegen for the
-    // directive.
-    if (HasLastprivates && S.getSingleClause<OMPNowaitClause>()) {
+    CodeGenFunction::OMPPrivateScope LoopScope(CGF);
+    if (CGF.EmitOMPFirstprivateClause(S, LoopScope)) {
       // Emit implicit barrier to synchronize threads and avoid data races on
-      // initialization of firstprivate variables.
-      CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(),
-                                             OMPD_unknown);
+      // initialization of firstprivate variables and post-update of lastprivate
+      // variables.
+      CGF.CGM.getOpenMPRuntime().emitBarrierCall(
+          CGF, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false,
+          /*ForceSimpleCall=*/true);
     }
-    return OMPD_sections;
-  }
-  // If only one section is found - no need to generate loop, emit as a single
-  // region.
-  bool HasFirstprivates;
-  // No need to generate reductions for sections with single section region, we
-  // can use original shared variables for all operations.
-  bool HasReductions = S.hasClausesOfKind<OMPReductionClause>();
-  // No need to generate lastprivates for sections with single section region,
-  // we can use original shared variable for all calculations with barrier at
-  // the end of the sections.
-  bool HasLastprivates = S.hasClausesOfKind<OMPLastprivateClause>();
-  auto &&CodeGen = [Stmt, &S, &HasFirstprivates](CodeGenFunction &CGF) {
-    CodeGenFunction::OMPPrivateScope SingleScope(CGF);
-    HasFirstprivates = CGF.EmitOMPFirstprivateClause(S, SingleScope);
-    CGF.EmitOMPPrivateClause(S, SingleScope);
-    (void)SingleScope.Privatize();
+    CGF.EmitOMPPrivateClause(S, LoopScope);
+    HasLastprivates = CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
+    CGF.EmitOMPReductionClauseInit(S, LoopScope);
+    (void)LoopScope.Privatize();
 
-    CGF.EmitStmt(Stmt);
+    // Emit static non-chunked loop.
+    OpenMPScheduleTy ScheduleKind;
+    ScheduleKind.Schedule = OMPC_SCHEDULE_static;
+    CGF.CGM.getOpenMPRuntime().emitForStaticInit(
+        CGF, S.getLocStart(), ScheduleKind, /*IVSize=*/32,
+        /*IVSigned=*/true, /*Ordered=*/false, IL.getAddress(), LB.getAddress(),
+        UB.getAddress(), ST.getAddress());
+    // UB = min(UB, GlobalUB);
+    auto *UBVal = CGF.EmitLoadOfScalar(UB, S.getLocStart());
+    auto *MinUBGlobalUB = CGF.Builder.CreateSelect(
+        CGF.Builder.CreateICmpSLT(UBVal, GlobalUBVal), UBVal, GlobalUBVal);
+    CGF.EmitStoreOfScalar(MinUBGlobalUB, UB);
+    // IV = LB;
+    CGF.EmitStoreOfScalar(CGF.EmitLoadOfScalar(LB, S.getLocStart()), IV);
+    // while (idx <= UB) { BODY; ++idx; }
+    CGF.EmitOMPInnerLoop(S, /*RequiresCleanup=*/false, &Cond, &Inc, BodyGen,
+                         [](CodeGenFunction &) {});
+    // Tell the runtime we are done.
+    CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocStart());
+    CGF.EmitOMPReductionClauseFinal(S);
+    // Emit post-update of the reduction variables if IsLastIter != 0.
+    emitPostUpdateForReductionClause(
+        CGF, S, [&](CodeGenFunction &CGF) -> llvm::Value * {
+          return CGF.Builder.CreateIsNotNull(
+              CGF.EmitLoadOfScalar(IL, S.getLocStart()));
+        });
+
+    // Emit final copy of the lastprivate variables if IsLastIter != 0.
+    if (HasLastprivates)
+      CGF.EmitOMPLastprivateClauseFinal(
+          S, /*NoFinals=*/false,
+          CGF.Builder.CreateIsNotNull(
+              CGF.EmitLoadOfScalar(IL, S.getLocStart())));
   };
-  CGM.getOpenMPRuntime().emitSingleRegion(*this, CodeGen, S.getLocStart(),
-                                          llvm::None, llvm::None, llvm::None,
-                                          llvm::None);
-  // Emit barrier for firstprivates, lastprivates or reductions only if
-  // 'sections' directive has 'nowait' clause. Otherwise the barrier will be
-  // generated by the codegen for the directive.
-  if ((HasFirstprivates || HasLastprivates || HasReductions) &&
-      S.getSingleClause<OMPNowaitClause>()) {
+
+  bool HasCancel = false;
+  if (auto *OSD = dyn_cast<OMPSectionsDirective>(&S))
+    HasCancel = OSD->hasCancel();
+  else if (auto *OPSD = dyn_cast<OMPParallelSectionsDirective>(&S))
+    HasCancel = OPSD->hasCancel();
+  CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_sections, CodeGen,
+                                              HasCancel);
+  // Emit barrier for lastprivates only if 'sections' directive has 'nowait'
+  // clause. Otherwise the barrier will be generated by the codegen for the
+  // directive.
+  if (HasLastprivates && S.getSingleClause<OMPNowaitClause>()) {
     // Emit implicit barrier to synchronize threads and avoid data races on
     // initialization of firstprivate variables.
-    CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_unknown,
-                                           /*EmitChecks=*/false,
-                                           /*ForceSimpleCall=*/true);
+    CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(),
+                                           OMPD_unknown);
   }
-  return OMPD_single;
 }
 
 void CodeGenFunction::EmitOMPSectionsDirective(const OMPSectionsDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
-  OpenMPDirectiveKind EmittedAs = EmitSections(S);
+  {
+    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    EmitSections(S);
+  }
   // Emit an implicit barrier at the end.
   if (!S.getSingleClause<OMPNowaitClause>()) {
-    CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), EmittedAs);
+    CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(),
+                                           OMPD_sections);
   }
 }
 
 void CodeGenFunction::EmitOMPSectionDirective(const OMPSectionDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
   };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_section, CodeGen,
                                               S.hasCancel());
 }
@@ -1828,8 +2345,7 @@
   llvm::SmallVector<const Expr *, 8> SrcExprs;
   llvm::SmallVector<const Expr *, 8> AssignmentOps;
   // Check if there are any 'copyprivate' clauses associated with this
-  // 'single'
-  // construct.
+  // 'single' construct.
   // Build a list of copyprivate variables along with helper expressions
   // (<source>, <destination>, <destination>=<source> expressions)
   for (const auto *C : S.getClausesOfKind<OMPCopyprivateClause>()) {
@@ -1840,24 +2356,24 @@
     AssignmentOps.append(C->assignment_ops().begin(),
                          C->assignment_ops().end());
   }
-  LexicalScope Scope(*this, S.getSourceRange());
   // Emit code for 'single' region along with 'copyprivate' clauses
-  bool HasFirstprivates;
-  auto &&CodeGen = [&S, &HasFirstprivates](CodeGenFunction &CGF) {
-    CodeGenFunction::OMPPrivateScope SingleScope(CGF);
-    HasFirstprivates = CGF.EmitOMPFirstprivateClause(S, SingleScope);
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    OMPPrivateScope SingleScope(CGF);
+    (void)CGF.EmitOMPFirstprivateClause(S, SingleScope);
     CGF.EmitOMPPrivateClause(S, SingleScope);
     (void)SingleScope.Privatize();
-
     CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
   };
-  CGM.getOpenMPRuntime().emitSingleRegion(*this, CodeGen, S.getLocStart(),
-                                          CopyprivateVars, DestExprs, SrcExprs,
-                                          AssignmentOps);
+  {
+    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    CGM.getOpenMPRuntime().emitSingleRegion(*this, CodeGen, S.getLocStart(),
+                                            CopyprivateVars, DestExprs,
+                                            SrcExprs, AssignmentOps);
+  }
   // Emit an implicit barrier at the end (to avoid data race on firstprivate
   // init or if no 'nowait' clause was specified and no 'copyprivate' clause).
-  if ((!S.getSingleClause<OMPNowaitClause>() || HasFirstprivates) &&
-      CopyprivateVars.empty()) {
+  if (!S.getSingleClause<OMPNowaitClause>() && CopyprivateVars.empty()) {
     CGM.getOpenMPRuntime().emitBarrierCall(
         *this, S.getLocStart(),
         S.getSingleClause<OMPNowaitClause>() ? OMPD_unknown : OMPD_single);
@@ -1865,21 +2381,23 @@
 }
 
 void CodeGenFunction::EmitOMPMasterDirective(const OMPMasterDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
   };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getLocStart());
 }
 
 void CodeGenFunction::EmitOMPCriticalDirective(const OMPCriticalDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
   };
   Expr *Hint = nullptr;
   if (auto *HintClause = S.getSingleClause<OMPHintClause>())
     Hint = HintClause->getHint();
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitCriticalRegion(*this,
                                             S.getDirectiveName().getAsString(),
                                             CodeGen, S.getLocStart(), Hint);
@@ -1889,9 +2407,7 @@
     const OMPParallelForDirective &S) {
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'for' directive.
-  LexicalScope Scope(*this, S.getSourceRange());
-  (void)emitScheduleClause(*this, S, /*OuterRegion=*/true);
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitOMPWorksharingLoop(S);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_for, CodeGen);
@@ -1901,9 +2417,7 @@
     const OMPParallelForSimdDirective &S) {
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'for' directive.
-  LexicalScope Scope(*this, S.getSourceRange());
-  (void)emitScheduleClause(*this, S, /*OuterRegion=*/true);
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitOMPWorksharingLoop(S);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_simd, CodeGen);
@@ -1913,93 +2427,146 @@
     const OMPParallelSectionsDirective &S) {
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'sections' directive.
-  LexicalScope Scope(*this, S.getSourceRange());
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
-    (void)CGF.EmitSections(S);
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitSections(S);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_sections, CodeGen);
 }
 
-void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) {
+void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
+                                                const RegionCodeGenTy &BodyGen,
+                                                const TaskGenTy &TaskGen,
+                                                OMPTaskDataTy &Data) {
   // Emit outlined function for task construct.
-  LexicalScope Scope(*this, S.getSourceRange());
   auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
-  auto CapturedStruct = GenerateCapturedStmtArgument(*CS);
   auto *I = CS->getCapturedDecl()->param_begin();
   auto *PartId = std::next(I);
+  auto *TaskT = std::next(I, 4);
+  // Check if the task is final
+  if (const auto *Clause = S.getSingleClause<OMPFinalClause>()) {
+    // If the condition constant folds and can be elided, try to avoid emitting
+    // the condition and the dead arm of the if/else.
+    auto *Cond = Clause->getCondition();
+    bool CondConstant;
+    if (ConstantFoldsToSimpleInteger(Cond, CondConstant))
+      Data.Final.setInt(CondConstant);
+    else
+      Data.Final.setPointer(EvaluateExprAsBool(Cond));
+  } else {
+    // By default the task is not final.
+    Data.Final.setInt(/*IntVal=*/false);
+  }
+  // Check if the task has 'priority' clause.
+  if (const auto *Clause = S.getSingleClause<OMPPriorityClause>()) {
+    auto *Prio = Clause->getPriority();
+    Data.Priority.setInt(/*IntVal=*/true);
+    Data.Priority.setPointer(EmitScalarConversion(
+        EmitScalarExpr(Prio), Prio->getType(),
+        getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
+        Prio->getExprLoc()));
+  }
   // The first function argument for tasks is a thread id, the second one is a
   // part id (0 for tied tasks, >=0 for untied task).
   llvm::DenseSet<const VarDecl *> EmittedAsPrivate;
   // Get list of private variables.
-  llvm::SmallVector<const Expr *, 8> PrivateVars;
-  llvm::SmallVector<const Expr *, 8> PrivateCopies;
   for (const auto *C : S.getClausesOfKind<OMPPrivateClause>()) {
     auto IRef = C->varlist_begin();
     for (auto *IInit : C->private_copies()) {
       auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
       if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) {
-        PrivateVars.push_back(*IRef);
-        PrivateCopies.push_back(IInit);
+        Data.PrivateVars.push_back(*IRef);
+        Data.PrivateCopies.push_back(IInit);
       }
       ++IRef;
     }
   }
   EmittedAsPrivate.clear();
   // Get list of firstprivate variables.
-  llvm::SmallVector<const Expr *, 8> FirstprivateVars;
-  llvm::SmallVector<const Expr *, 8> FirstprivateCopies;
-  llvm::SmallVector<const Expr *, 8> FirstprivateInits;
   for (const auto *C : S.getClausesOfKind<OMPFirstprivateClause>()) {
     auto IRef = C->varlist_begin();
     auto IElemInitRef = C->inits().begin();
     for (auto *IInit : C->private_copies()) {
       auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
       if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) {
-        FirstprivateVars.push_back(*IRef);
-        FirstprivateCopies.push_back(IInit);
-        FirstprivateInits.push_back(*IElemInitRef);
+        Data.FirstprivateVars.push_back(*IRef);
+        Data.FirstprivateCopies.push_back(IInit);
+        Data.FirstprivateInits.push_back(*IElemInitRef);
       }
-      ++IRef, ++IElemInitRef;
+      ++IRef;
+      ++IElemInitRef;
+    }
+  }
+  // Get list of lastprivate variables (for taskloops).
+  llvm::DenseMap<const VarDecl *, const DeclRefExpr *> LastprivateDstsOrigs;
+  for (const auto *C : S.getClausesOfKind<OMPLastprivateClause>()) {
+    auto IRef = C->varlist_begin();
+    auto ID = C->destination_exprs().begin();
+    for (auto *IInit : C->private_copies()) {
+      auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
+      if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) {
+        Data.LastprivateVars.push_back(*IRef);
+        Data.LastprivateCopies.push_back(IInit);
+      }
+      LastprivateDstsOrigs.insert(
+          {cast<VarDecl>(cast<DeclRefExpr>(*ID)->getDecl()),
+           cast<DeclRefExpr>(*IRef)});
+      ++IRef;
+      ++ID;
     }
   }
   // Build list of dependences.
-  llvm::SmallVector<std::pair<OpenMPDependClauseKind, const Expr *>, 8>
-      Dependences;
-  for (const auto *C : S.getClausesOfKind<OMPDependClause>()) {
-    for (auto *IRef : C->varlists()) {
-      Dependences.push_back(std::make_pair(C->getDependencyKind(), IRef));
-    }
-  }
-  auto &&CodeGen = [PartId, &S, &PrivateVars, &FirstprivateVars](
-      CodeGenFunction &CGF) {
+  for (const auto *C : S.getClausesOfKind<OMPDependClause>())
+    for (auto *IRef : C->varlists())
+      Data.Dependences.push_back(std::make_pair(C->getDependencyKind(), IRef));
+  auto &&CodeGen = [PartId, &S, &Data, CS, &BodyGen, &LastprivateDstsOrigs](
+      CodeGenFunction &CGF, PrePostActionTy &Action) {
     // Set proper addresses for generated private copies.
-    auto *CS = cast<CapturedStmt>(S.getAssociatedStmt());
     OMPPrivateScope Scope(CGF);
-    if (!PrivateVars.empty() || !FirstprivateVars.empty()) {
+    if (!Data.PrivateVars.empty() || !Data.FirstprivateVars.empty() ||
+        !Data.LastprivateVars.empty()) {
       auto *CopyFn = CGF.Builder.CreateLoad(
           CGF.GetAddrOfLocalVar(CS->getCapturedDecl()->getParam(3)));
       auto *PrivatesPtr = CGF.Builder.CreateLoad(
           CGF.GetAddrOfLocalVar(CS->getCapturedDecl()->getParam(2)));
       // Map privates.
-      llvm::SmallVector<std::pair<const VarDecl *, Address>, 16>
-          PrivatePtrs;
+      llvm::SmallVector<std::pair<const VarDecl *, Address>, 16> PrivatePtrs;
       llvm::SmallVector<llvm::Value *, 16> CallArgs;
       CallArgs.push_back(PrivatesPtr);
-      for (auto *E : PrivateVars) {
+      for (auto *E : Data.PrivateVars) {
         auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
-        Address PrivatePtr =
-            CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()));
+        Address PrivatePtr = CGF.CreateMemTemp(
+            CGF.getContext().getPointerType(E->getType()), ".priv.ptr.addr");
         PrivatePtrs.push_back(std::make_pair(VD, PrivatePtr));
         CallArgs.push_back(PrivatePtr.getPointer());
       }
-      for (auto *E : FirstprivateVars) {
+      for (auto *E : Data.FirstprivateVars) {
         auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
         Address PrivatePtr =
-            CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()));
+            CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()),
+                              ".firstpriv.ptr.addr");
+        PrivatePtrs.push_back(std::make_pair(VD, PrivatePtr));
+        CallArgs.push_back(PrivatePtr.getPointer());
+      }
+      for (auto *E : Data.LastprivateVars) {
+        auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
+        Address PrivatePtr =
+            CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()),
+                              ".lastpriv.ptr.addr");
         PrivatePtrs.push_back(std::make_pair(VD, PrivatePtr));
         CallArgs.push_back(PrivatePtr.getPointer());
       }
       CGF.EmitRuntimeCall(CopyFn, CallArgs);
+      for (auto &&Pair : LastprivateDstsOrigs) {
+        auto *OrigVD = cast<VarDecl>(Pair.second->getDecl());
+        DeclRefExpr DRE(
+            const_cast<VarDecl *>(OrigVD),
+            /*RefersToEnclosingVariableOrCapture=*/CGF.CapturedStmtInfo->lookup(
+                OrigVD) != nullptr,
+            Pair.second->getType(), VK_LValue, Pair.second->getExprLoc());
+        Scope.addPrivate(Pair.first, [&CGF, &DRE]() {
+          return CGF.EmitLValue(&DRE).getAddress();
+        });
+      }
       for (auto &&Pair : PrivatePtrs) {
         Address Replacement(CGF.Builder.CreateLoad(Pair.second),
                             CGF.getContext().getDeclAlign(Pair.first));
@@ -2007,30 +2574,21 @@
       }
     }
     (void)Scope.Privatize();
-    if (*PartId) {
-      // TODO: emit code for untied tasks.
-    }
-    CGF.EmitStmt(CS->getCapturedStmt());
+
+    Action.Enter(CGF);
+    BodyGen(CGF);
   };
-  auto OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
-      S, *I, OMPD_task, CodeGen);
-  // Check if we should emit tied or untied task.
-  bool Tied = !S.getSingleClause<OMPUntiedClause>();
-  // Check if the task is final
-  llvm::PointerIntPair<llvm::Value *, 1, bool> Final;
-  if (const auto *Clause = S.getSingleClause<OMPFinalClause>()) {
-    // If the condition constant folds and can be elided, try to avoid emitting
-    // the condition and the dead arm of the if/else.
-    auto *Cond = Clause->getCondition();
-    bool CondConstant;
-    if (ConstantFoldsToSimpleInteger(Cond, CondConstant))
-      Final.setInt(CondConstant);
-    else
-      Final.setPointer(EvaluateExprAsBool(Cond));
-  } else {
-    // By default the task is not final.
-    Final.setInt(/*IntVal=*/false);
-  }
+  auto *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
+      S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, Data.Tied,
+      Data.NumberOfParts);
+  OMPLexicalScope Scope(*this, S);
+  TaskGen(*this, OutlinedFn, Data);
+}
+
+void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) {
+  // Emit outlined function for task construct.
+  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
+  auto CapturedStruct = GenerateCapturedStmtArgument(*CS);
   auto SharedsTy = getContext().getRecordType(CS->getCapturedRecordDecl());
   const Expr *IfCond = nullptr;
   for (const auto *C : S.getClausesOfKind<OMPIfClause>()) {
@@ -2040,10 +2598,21 @@
       break;
     }
   }
-  CGM.getOpenMPRuntime().emitTaskCall(
-      *this, S.getLocStart(), S, Tied, Final, OutlinedFn, SharedsTy,
-      CapturedStruct, IfCond, PrivateVars, PrivateCopies, FirstprivateVars,
-      FirstprivateCopies, FirstprivateInits, Dependences);
+
+  OMPTaskDataTy Data;
+  // Check if we should emit tied or untied task.
+  Data.Tied = !S.getSingleClause<OMPUntiedClause>();
+  auto &&BodyGen = [CS](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitStmt(CS->getCapturedStmt());
+  };
+  auto &&TaskGen = [&S, SharedsTy, CapturedStruct,
+                    IfCond](CodeGenFunction &CGF, llvm::Value *OutlinedFn,
+                            const OMPTaskDataTy &Data) {
+    CGF.CGM.getOpenMPRuntime().emitTaskCall(CGF, S.getLocStart(), S, OutlinedFn,
+                                            SharedsTy, CapturedStruct, IfCond,
+                                            Data);
+  };
+  EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data);
 }
 
 void CodeGenFunction::EmitOMPTaskyieldDirective(
@@ -2061,10 +2630,11 @@
 
 void CodeGenFunction::EmitOMPTaskgroupDirective(
     const OMPTaskgroupDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
-  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
   };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitTaskgroupRegion(*this, CodeGen, S.getLocStart());
 }
 
@@ -2078,9 +2648,130 @@
   }(), S.getLocStart());
 }
 
+void CodeGenFunction::EmitOMPDistributeLoop(const OMPDistributeDirective &S) {
+  // Emit the loop iteration variable.
+  auto IVExpr = cast<DeclRefExpr>(S.getIterationVariable());
+  auto IVDecl = cast<VarDecl>(IVExpr->getDecl());
+  EmitVarDecl(*IVDecl);
+
+  // Emit the iterations count variable.
+  // If it is not a variable, Sema decided to calculate iterations count on each
+  // iteration (e.g., it is foldable into a constant).
+  if (auto LIExpr = dyn_cast<DeclRefExpr>(S.getLastIteration())) {
+    EmitVarDecl(*cast<VarDecl>(LIExpr->getDecl()));
+    // Emit calculation of the iterations count.
+    EmitIgnoredExpr(S.getCalcLastIteration());
+  }
+
+  auto &RT = CGM.getOpenMPRuntime();
+
+  // Check pre-condition.
+  {
+    OMPLoopScope PreInitScope(*this, S);
+    // Skip the entire loop if we don't meet the precondition.
+    // If the condition constant folds and can be elided, avoid emitting the
+    // whole loop.
+    bool CondConstant;
+    llvm::BasicBlock *ContBlock = nullptr;
+    if (ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) {
+      if (!CondConstant)
+        return;
+    } else {
+      auto *ThenBlock = createBasicBlock("omp.precond.then");
+      ContBlock = createBasicBlock("omp.precond.end");
+      emitPreCond(*this, S, S.getPreCond(), ThenBlock, ContBlock,
+                  getProfileCount(&S));
+      EmitBlock(ThenBlock);
+      incrementProfileCounter(&S);
+    }
+
+    // Emit 'then' code.
+    {
+      // Emit helper vars inits.
+      LValue LB =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getLowerBoundVariable()));
+      LValue UB =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getUpperBoundVariable()));
+      LValue ST =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getStrideVariable()));
+      LValue IL =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getIsLastIterVariable()));
+
+      OMPPrivateScope LoopScope(*this);
+      EmitOMPPrivateLoopCounters(S, LoopScope);
+      (void)LoopScope.Privatize();
+
+      // Detect the distribute schedule kind and chunk.
+      llvm::Value *Chunk = nullptr;
+      OpenMPDistScheduleClauseKind ScheduleKind = OMPC_DIST_SCHEDULE_unknown;
+      if (auto *C = S.getSingleClause<OMPDistScheduleClause>()) {
+        ScheduleKind = C->getDistScheduleKind();
+        if (const auto *Ch = C->getChunkSize()) {
+          Chunk = EmitScalarExpr(Ch);
+          Chunk = EmitScalarConversion(Chunk, Ch->getType(),
+          S.getIterationVariable()->getType(),
+          S.getLocStart());
+        }
+      }
+      const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
+      const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+
+      // OpenMP [2.10.8, distribute Construct, Description]
+      // If dist_schedule is specified, kind must be static. If specified,
+      // iterations are divided into chunks of size chunk_size, chunks are
+      // assigned to the teams of the league in a round-robin fashion in the
+      // order of the team number. When no chunk_size is specified, the
+      // iteration space is divided into chunks that are approximately equal
+      // in size, and at most one chunk is distributed to each team of the
+      // league. The size of the chunks is unspecified in this case.
+      if (RT.isStaticNonchunked(ScheduleKind,
+                                /* Chunked */ Chunk != nullptr)) {
+        RT.emitDistributeStaticInit(*this, S.getLocStart(), ScheduleKind,
+                             IVSize, IVSigned, /* Ordered = */ false,
+                             IL.getAddress(), LB.getAddress(),
+                             UB.getAddress(), ST.getAddress());
+        auto LoopExit =
+            getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit"));
+        // UB = min(UB, GlobalUB);
+        EmitIgnoredExpr(S.getEnsureUpperBound());
+        // IV = LB;
+        EmitIgnoredExpr(S.getInit());
+        // while (idx <= UB) { BODY; ++idx; }
+        EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(),
+                         S.getInc(),
+                         [&S, LoopExit](CodeGenFunction &CGF) {
+                           CGF.EmitOMPLoopBody(S, LoopExit);
+                           CGF.EmitStopPoint(&S);
+                         },
+                         [](CodeGenFunction &) {});
+        EmitBlock(LoopExit.getBlock());
+        // Tell the runtime we are done.
+        RT.emitForStaticFinish(*this, S.getLocStart());
+      } else {
+        // Emit the outer loop, which requests its work chunk [LB..UB] from
+        // runtime and runs the inner loop to process it.
+        EmitOMPDistributeOuterLoop(ScheduleKind, S, LoopScope,
+                            LB.getAddress(), UB.getAddress(), ST.getAddress(),
+                            IL.getAddress(), Chunk);
+      }
+    }
+
+    // We're now done with the loop, so jump to the continuation block.
+    if (ContBlock) {
+      EmitBranch(ContBlock);
+      EmitBlock(ContBlock, true);
+    }
+  }
+}
+
 void CodeGenFunction::EmitOMPDistributeDirective(
     const OMPDistributeDirective &S) {
-  llvm_unreachable("CodeGen for 'omp distribute' is not supported yet.");
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitOMPDistributeLoop(S);
+  };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_distribute, CodeGen,
+                                              /*HasCancel=*/false);
 }
 
 static llvm::Function *emitOutlinedOrderedFunction(CodeGenModule &CGM,
@@ -2094,11 +2785,14 @@
 }
 
 void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) {
-  if (!S.getAssociatedStmt())
+  if (!S.getAssociatedStmt()) {
+    for (const auto *DC : S.getClausesOfKind<OMPDependClause>())
+      CGM.getOpenMPRuntime().emitDoacrossOrdered(*this, DC);
     return;
-  LexicalScope Scope(*this, S.getSourceRange());
+  }
   auto *C = S.getSingleClause<OMPSIMDClause>();
-  auto &&CodeGen = [&S, C, this](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S, C, this](CodeGenFunction &CGF,
+                                 PrePostActionTy &Action) {
     if (C) {
       auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
       llvm::SmallVector<llvm::Value *, 16> CapturedVars;
@@ -2106,10 +2800,12 @@
       auto *OutlinedFn = emitOutlinedOrderedFunction(CGM, CS);
       CGF.EmitNounwindRuntimeCall(OutlinedFn, CapturedVars);
     } else {
+      Action.Enter(CGF);
       CGF.EmitStmt(
           cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
     }
   };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitOrderedRegion(*this, CodeGen, S.getLocStart(), !C);
 }
 
@@ -2156,23 +2852,24 @@
   if (LVal.isGlobalReg()) {
     CGF.EmitStoreThroughGlobalRegLValue(RVal, LVal);
   } else {
-    CGF.EmitAtomicStore(RVal, LVal, IsSeqCst ? llvm::SequentiallyConsistent
-                                             : llvm::Monotonic,
+    CGF.EmitAtomicStore(RVal, LVal,
+                        IsSeqCst ? llvm::AtomicOrdering::SequentiallyConsistent
+                                 : llvm::AtomicOrdering::Monotonic,
                         LVal.isVolatile(), /*IsInit=*/false);
   }
 }
 
-static void emitSimpleStore(CodeGenFunction &CGF, LValue LVal, RValue RVal,
-                            QualType RValTy, SourceLocation Loc) {
-  switch (CGF.getEvaluationKind(LVal.getType())) {
+void CodeGenFunction::emitOMPSimpleStore(LValue LVal, RValue RVal,
+                                         QualType RValTy, SourceLocation Loc) {
+  switch (getEvaluationKind(LVal.getType())) {
   case TEK_Scalar:
-    CGF.EmitStoreThroughLValue(RValue::get(convertToScalarValue(
-                                   CGF, RVal, RValTy, LVal.getType(), Loc)),
-                               LVal);
+    EmitStoreThroughLValue(RValue::get(convertToScalarValue(
+                               *this, RVal, RValTy, LVal.getType(), Loc)),
+                           LVal);
     break;
   case TEK_Complex:
-    CGF.EmitStoreOfComplex(
-        convertToComplexValue(CGF, RVal, RValTy, LVal.getType(), Loc), LVal,
+    EmitStoreOfComplex(
+        convertToComplexValue(*this, RVal, RValTy, LVal.getType(), Loc), LVal,
         /*isInit=*/false);
     break;
   case TEK_Aggregate:
@@ -2190,17 +2887,18 @@
   LValue VLValue = CGF.EmitLValue(V);
   RValue Res = XLValue.isGlobalReg()
                    ? CGF.EmitLoadOfLValue(XLValue, Loc)
-                   : CGF.EmitAtomicLoad(XLValue, Loc,
-                                        IsSeqCst ? llvm::SequentiallyConsistent
-                                                 : llvm::Monotonic,
-                                        XLValue.isVolatile());
+                   : CGF.EmitAtomicLoad(
+                         XLValue, Loc,
+                         IsSeqCst ? llvm::AtomicOrdering::SequentiallyConsistent
+                                  : llvm::AtomicOrdering::Monotonic,
+                         XLValue.isVolatile());
   // OpenMP, 2.12.6, atomic Construct
   // Any atomic construct with a seq_cst clause forces the atomically
   // performed operation to include an implicit flush operation without a
   // list.
   if (IsSeqCst)
     CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc);
-  emitSimpleStore(CGF, VLValue, Res, X->getType().getNonReferenceType(), Loc);
+  CGF.emitOMPSimpleStore(VLValue, Res, X->getType().getNonReferenceType(), Loc);
 }
 
 static void EmitOMPAtomicWriteExpr(CodeGenFunction &CGF, bool IsSeqCst,
@@ -2349,7 +3047,8 @@
   assert(X->isLValue() && "X of 'omp atomic update' is not lvalue");
   LValue XLValue = CGF.EmitLValue(X);
   RValue ExprRValue = CGF.EmitAnyExpr(E);
-  auto AO = IsSeqCst ? llvm::SequentiallyConsistent : llvm::Monotonic;
+  auto AO = IsSeqCst ? llvm::AtomicOrdering::SequentiallyConsistent
+                     : llvm::AtomicOrdering::Monotonic;
   auto *LHS = cast<OpaqueValueExpr>(BOUE->getLHS()->IgnoreImpCasts());
   auto *RHS = cast<OpaqueValueExpr>(BOUE->getRHS()->IgnoreImpCasts());
   auto *XRValExpr = IsXLHSInRHSPart ? LHS : RHS;
@@ -2398,7 +3097,8 @@
   LValue VLValue = CGF.EmitLValue(V);
   LValue XLValue = CGF.EmitLValue(X);
   RValue ExprRValue = CGF.EmitAnyExpr(E);
-  auto AO = IsSeqCst ? llvm::SequentiallyConsistent : llvm::Monotonic;
+  auto AO = IsSeqCst ? llvm::AtomicOrdering::SequentiallyConsistent
+                     : llvm::AtomicOrdering::Monotonic;
   QualType NewVValType;
   if (UE) {
     // 'x' is updated with some additional value.
@@ -2458,7 +3158,7 @@
     }
   }
   // Emit post-update store to 'v' of old/new 'x' value.
-  emitSimpleStore(CGF, VLValue, NewVVal, NewVValType, Loc);
+  CGF.emitOMPSimpleStore(VLValue, NewVVal, NewVValType, Loc);
   // OpenMP, 2.12.6, atomic Construct
   // Any atomic construct with a seq_cst clause forces the atomically
   // performed operation to include an implicit flush operation without a
@@ -2525,6 +3225,12 @@
   case OMPC_num_tasks:
   case OMPC_hint:
   case OMPC_dist_schedule:
+  case OMPC_defaultmap:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     llvm_unreachable("Clause is not allowed in 'omp atomic'.");
   }
 }
@@ -2554,18 +3260,39 @@
     }
   }
 
-  LexicalScope Scope(*this, S.getSourceRange());
-  auto &&CodeGen = [&S, Kind, IsSeqCst, CS](CodeGenFunction &CGF) {
+  auto &&CodeGen = [&S, Kind, IsSeqCst, CS](CodeGenFunction &CGF,
+                                            PrePostActionTy &) {
     CGF.EmitStopPoint(CS);
     EmitOMPAtomicExpr(CGF, Kind, IsSeqCst, S.isPostfixUpdate(), S.getX(),
                       S.getV(), S.getExpr(), S.getUpdateExpr(),
                       S.isXLHSInRHSPart(), S.getLocStart());
   };
+  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_atomic, CodeGen);
 }
 
+std::pair<llvm::Function * /*OutlinedFn*/, llvm::Constant * /*OutlinedFnID*/>
+CodeGenFunction::EmitOMPTargetDirectiveOutlinedFunction(
+    CodeGenModule &CGM, const OMPTargetDirective &S, StringRef ParentName,
+    bool IsOffloadEntry) {
+  llvm::Function *OutlinedFn = nullptr;
+  llvm::Constant *OutlinedFnID = nullptr;
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    OMPPrivateScope PrivateScope(CGF);
+    (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope);
+    CGF.EmitOMPPrivateClause(S, PrivateScope);
+    (void)PrivateScope.Privatize();
+
+    Action.Enter(CGF);
+    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+  };
+  // Emit target region as a standalone region.
+  CGM.getOpenMPRuntime().emitTargetOutlinedFunction(
+      S, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen);
+  return std::make_pair(OutlinedFn, OutlinedFnID);
+}
+
 void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &S) {
-  LexicalScope Scope(*this, S.getSourceRange());
   const CapturedStmt &CS = *cast<CapturedStmt>(S.getAssociatedStmt());
 
   llvm::SmallVector<llvm::Value *, 16> CapturedVars;
@@ -2611,15 +3338,50 @@
     ParentName =
         CGM.getMangledName(GlobalDecl(cast<FunctionDecl>(CurFuncDecl)));
 
-  CGM.getOpenMPRuntime().emitTargetOutlinedFunction(S, ParentName, Fn, FnID,
-                                                    IsOffloadEntry);
-
+  std::tie(Fn, FnID) = EmitOMPTargetDirectiveOutlinedFunction(
+      CGM, S, ParentName, IsOffloadEntry);
+  OMPLexicalScope Scope(*this, S);
   CGM.getOpenMPRuntime().emitTargetCall(*this, S, Fn, FnID, IfCond, Device,
                                         CapturedVars);
 }
 
-void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &) {
-  llvm_unreachable("CodeGen for 'omp teams' is not supported yet.");
+// Common codegen for 'teams'-based directives: outline the region, forward
+// num_teams/thread_limit to the runtime if present, then emit the teams call.
+static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF,
+                                        const OMPExecutableDirective &S,
+                                        OpenMPDirectiveKind InnermostKind,
+                                        const RegionCodeGenTy &CodeGen) {
+  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
+  auto OutlinedFn =
+      CGF.CGM.getOpenMPRuntime().emitParallelOrTeamsOutlinedFunction(
+          S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen);
+  // Query the clauses on S directly; no downcast to a concrete directive.
+  const OMPNumTeamsClause *NT = S.getSingleClause<OMPNumTeamsClause>();
+  const OMPThreadLimitClause *TL = S.getSingleClause<OMPThreadLimitClause>();
+  if (NT || TL) {
+    Expr *NumTeams = NT ? NT->getNumTeams() : nullptr;
+    Expr *ThreadLimit = TL ? TL->getThreadLimit() : nullptr;
+    CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeams, ThreadLimit,
+                                                  S.getLocStart());
+  }
+
+  OMPLexicalScope Scope(CGF, S);
+  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
+  CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
+  CGF.CGM.getOpenMPRuntime().emitTeamsCall(CGF, S, S.getLocStart(), OutlinedFn,
+                                           CapturedVars);
+}
+
+void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &S) {
+  // Emit teams region as a standalone region.
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    OMPPrivateScope PrivateScope(CGF);
+    (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope);
+    CGF.EmitOMPPrivateClause(S, PrivateScope);
+    (void)PrivateScope.Privatize();
+    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+  };
+  emitCommonOMPTeamsDirective(*this, S, OMPD_teams, CodeGen);
 }
 
 void CodeGenFunction::EmitOMPCancellationPointDirective(
@@ -2650,30 +3412,371 @@
   return BreakContinueStack.back().BreakBlock;
 }
 
+void CodeGenFunction::EmitOMPUseDevicePtrClause(
+    const OMPClause &NC, OMPPrivateScope &PrivateScope,
+    const llvm::DenseMap<const ValueDecl *, Address> &CaptureDeviceAddrMap) {
+  const auto &C = cast<OMPUseDevicePtrClause>(NC);
+  auto OrigVarIt = C.varlist_begin();
+  auto InitIt = C.inits().begin();
+  for (auto PvtVarIt : C.private_copies()) {
+    auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*OrigVarIt)->getDecl());
+    auto *InitVD = cast<VarDecl>(cast<DeclRefExpr>(*InitIt)->getDecl());
+    auto *PvtVD = cast<VarDecl>(cast<DeclRefExpr>(PvtVarIt)->getDecl());
+    // Step the paired iterators here so `continue` below keeps them in sync.
+    ++OrigVarIt;
+    ++InitIt;
+
+    // In order to identify the right initializer we need to match the
+    // declaration used by the mapping logic. In some cases we may get
+    // OMPCapturedExprDecl that refers to the original declaration.
+    const ValueDecl *MatchingVD = OrigVD;
+    if (auto *OED = dyn_cast<OMPCapturedExprDecl>(MatchingVD)) {
+      // OMPCapturedExprDecl are used to privatize fields of the current
+      // structure.
+      auto *ME = cast<MemberExpr>(OED->getInit());
+      assert(isa<CXXThisExpr>(ME->getBase()) &&
+             "Base should be the current struct!");
+      MatchingVD = ME->getMemberDecl();
+    }
+
+    // If we don't have information about the current list item, move on to
+    // the next one.
+    auto InitAddrIt = CaptureDeviceAddrMap.find(MatchingVD);
+    if (InitAddrIt == CaptureDeviceAddrMap.end())
+      continue;
+
+    bool IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address {
+      // Initialize the temporary initialization variable with the address we
+      // get from the runtime library. We have to cast the source address
+      // because it is always a void *. References are materialized in the
+      // privatization scope, so the initialization here disregards the fact
+      // the original variable is a reference.
+      QualType AddrQTy =
+          getContext().getPointerType(OrigVD->getType().getNonReferenceType());
+      llvm::Type *AddrTy = ConvertTypeForMem(AddrQTy);
+      Address InitAddr = Builder.CreateBitCast(InitAddrIt->second, AddrTy);
+      setAddrOfLocalVar(InitVD, InitAddr);
+
+      // Emit private declaration; it will be initialized by the value of the
+      // declaration we just added to the local declarations map.
+      EmitDecl(*PvtVD);
+
+      // The initialization variable served its purpose in the emission
+      // of the previous declaration, so we don't need it anymore.
+      LocalDeclMap.erase(InitVD);
+
+      // Return the address of the private variable.
+      return GetAddrOfLocalVar(PvtVD);
+    });
+    assert(IsRegistered && "firstprivate var already registered as private");
+    // Silence the warning about unused variable.
+    (void)IsRegistered;
+  }
+}
+
 // Generate the instructions for '#pragma omp target data' directive.
 void CodeGenFunction::EmitOMPTargetDataDirective(
     const OMPTargetDataDirective &S) {
-  // emit the code inside the construct for now
+  CGOpenMPRuntime::TargetDataInfo Info(/*RequiresDevicePointerInfo=*/true);
+
+  // Create a pre/post action to signal the privatization of the device pointer.
+  // This action can be replaced by the OpenMP runtime code generation to
+  // deactivate privatization.
+  bool PrivatizeDevicePointers = false;
+  class DevicePointerPrivActionTy : public PrePostActionTy {
+    bool &PrivatizeDevicePointers;
+
+  public:
+    explicit DevicePointerPrivActionTy(bool &PrivatizeDevicePointers)
+        : PrePostActionTy(), PrivatizeDevicePointers(PrivatizeDevicePointers) {}
+    void Enter(CodeGenFunction &CGF) override {
+      PrivatizeDevicePointers = true;
+    }
+  };
+  DevicePointerPrivActionTy PrivAction(PrivatizeDevicePointers);
+
+  auto &&CodeGen = [&S, &Info, &PrivatizeDevicePointers](
+      CodeGenFunction &CGF, PrePostActionTy &Action) {
+    auto &&InnermostCodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+      CGF.EmitStmt(
+          cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    };
+
+    // Codegen that selects whether to generate the privatization code or not.
+    auto &&PrivCodeGen = [&S, &Info, &PrivatizeDevicePointers,
+                          &InnermostCodeGen](CodeGenFunction &CGF,
+                                             PrePostActionTy &Action) {
+      RegionCodeGenTy RCG(InnermostCodeGen);
+      PrivatizeDevicePointers = false;
+
+      // Call the pre-action to change the status of PrivatizeDevicePointers if
+      // needed.
+      Action.Enter(CGF);
+
+      if (PrivatizeDevicePointers) {
+        OMPPrivateScope PrivateScope(CGF);
+        // Emit all instances of the use_device_ptr clause.
+        for (const auto *C : S.getClausesOfKind<OMPUseDevicePtrClause>())
+          CGF.EmitOMPUseDevicePtrClause(*C, PrivateScope,
+                                        Info.CaptureDeviceAddrMap);
+        (void)PrivateScope.Privatize();
+        RCG(CGF);
+      } else
+        RCG(CGF);
+    };
+
+    // Forward the provided action to the privatization codegen.
+    RegionCodeGenTy PrivRCG(PrivCodeGen);
+    PrivRCG.setAction(Action);
+
+    // Notwithstanding the body of the region is emitted as inlined directive,
+    // we don't use an inline scope as changes in the references inside the
+    // region are expected to be visible outside, so we do not privatize them.
+    OMPLexicalScope Scope(CGF, S);
+    CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_target_data,
+                                                    PrivRCG);
+  };
+
+  RegionCodeGenTy RCG(CodeGen);
+
+  // If we don't have target devices, don't bother emitting the data mapping
+  // code.
+  if (CGM.getLangOpts().OMPTargetTriples.empty()) {
+    RCG(*this);
+    return;
+  }
+
+  // Check if we have any if clause associated with the directive.
+  const Expr *IfCond = nullptr;
+  if (auto *C = S.getSingleClause<OMPIfClause>())
+    IfCond = C->getCondition();
+
+  // Check if we have any device clause associated with the directive.
+  const Expr *Device = nullptr;
+  if (auto *C = S.getSingleClause<OMPDeviceClause>())
+    Device = C->getDevice();
+
+  // Set the action to signal privatization of device pointers.
+  RCG.setAction(PrivAction);
+
+  // Emit region code.
+  CGM.getOpenMPRuntime().emitTargetDataCalls(*this, S, IfCond, Device, RCG,
+                                             Info);
+}
+
+void CodeGenFunction::EmitOMPTargetEnterDataDirective(
+    const OMPTargetEnterDataDirective &S) {
+  // With no entries in OMPTargetTriples there are no target devices, so emit
+  // nothing for this directive.
+  if (CGM.getLangOpts().OMPTargetTriples.empty())
+    return;
+
+  // Pick up the condition of the 'if' clause, if the directive has one.
+  const Expr *IfCond = nullptr;
+  if (auto *C = S.getSingleClause<OMPIfClause>())
+    IfCond = C->getCondition();
+
+  // Pick up the expression of the 'device' clause, if the directive has one.
+  const Expr *Device = nullptr;
+  if (auto *C = S.getSingleClause<OMPDeviceClause>())
+    Device = C->getDevice();
+
+  CGM.getOpenMPRuntime().emitTargetDataStandAloneCall(*this, S, IfCond, Device);
+}
+
+void CodeGenFunction::EmitOMPTargetExitDataDirective(
+    const OMPTargetExitDataDirective &S) {
+  // With no entries in OMPTargetTriples there are no target devices, so emit
+  // nothing for this directive.
+  if (CGM.getLangOpts().OMPTargetTriples.empty())
+    return;
+
+  // Pick up the condition of the 'if' clause, if the directive has one.
+  const Expr *IfCond = nullptr;
+  if (auto *C = S.getSingleClause<OMPIfClause>())
+    IfCond = C->getCondition();
+
+  // Pick up the expression of the 'device' clause, if the directive has one.
+  const Expr *Device = nullptr;
+  if (auto *C = S.getSingleClause<OMPDeviceClause>())
+    Device = C->getDevice();
+
+  CGM.getOpenMPRuntime().emitTargetDataStandAloneCall(*this, S, IfCond, Device);
+}
+
+void CodeGenFunction::EmitOMPTargetParallelDirective(
+    const OMPTargetParallelDirective &S) {
+  // TODO: codegen for target parallel; currently nothing is emitted.
+}
+
+void CodeGenFunction::EmitOMPTargetParallelForDirective(
+    const OMPTargetParallelForDirective &S) {
+  // TODO: codegen for target parallel for; currently nothing is emitted.
+}
+
+/// Register \p Helper's variable as private, backed by the address of \p PVD.
+static void mapParam(CodeGenFunction &CGF, const DeclRefExpr *Helper,
+                     const ImplicitParamDecl *PVD,
+                     CodeGenFunction::OMPPrivateScope &Privates) {
+  auto *VDecl = cast<VarDecl>(Helper->getDecl());
+  Privates.addPrivate(
+      VDecl, [&CGF, PVD]() -> Address { return CGF.GetAddrOfLocalVar(PVD); });
+}
+
+void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
+  assert(isOpenMPTaskLoopDirective(S.getDirectiveKind()));
+  // Emit outlined function for task construct.
   auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
-  CGM.getOpenMPRuntime().emitInlinedDirective(
-      *this, OMPD_target_data,
-      [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS->getCapturedStmt()); });
+  auto CapturedStruct = GenerateCapturedStmtArgument(*CS);
+  auto SharedsTy = getContext().getRecordType(CS->getCapturedRecordDecl());
+  const Expr *IfCond = nullptr;
+  for (const auto *C : S.getClausesOfKind<OMPIfClause>()) {
+    if (C->getNameModifier() == OMPD_unknown ||
+        C->getNameModifier() == OMPD_taskloop) {
+      IfCond = C->getCondition();
+      break;
+    }
+  }
+
+  OMPTaskDataTy Data;
+  // Check if taskloop must be emitted without taskgroup.
+  Data.Nogroup = S.getSingleClause<OMPNogroupClause>();
+  // TODO: Check if we should emit tied or untied task.
+  Data.Tied = true;
+  // Set scheduling for taskloop; the int flag records which clause was used.
+  if (const auto* Clause = S.getSingleClause<OMPGrainsizeClause>()) {
+    // grainsize clause
+    Data.Schedule.setInt(/*IntVal=*/false);
+    Data.Schedule.setPointer(EmitScalarExpr(Clause->getGrainsize()));
+  } else if (const auto* Clause = S.getSingleClause<OMPNumTasksClause>()) {
+    // num_tasks clause
+    Data.Schedule.setInt(/*IntVal=*/true);
+    Data.Schedule.setPointer(EmitScalarExpr(Clause->getNumTasks()));
+  }
+
+  auto &&BodyGen = [CS, &S](CodeGenFunction &CGF, PrePostActionTy &) {
+    // if (PreCond) {
+    //   for (IV in 0..LastIteration) BODY;
+    //   <Final counter/linear vars updates>;
+    // }
+    //
+
+    // Emit: if (PreCond) - begin.
+    // If the condition constant folds and can be elided, avoid emitting the
+    // whole loop.
+    bool CondConstant;
+    llvm::BasicBlock *ContBlock = nullptr;
+    OMPLoopScope PreInitScope(CGF, S);
+    if (CGF.ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) {
+      if (!CondConstant)
+        return;
+    } else {
+      auto *ThenBlock = CGF.createBasicBlock("taskloop.if.then");
+      ContBlock = CGF.createBasicBlock("taskloop.if.end");
+      emitPreCond(CGF, S, S.getPreCond(), ThenBlock, ContBlock,
+                  CGF.getProfileCount(&S));
+      CGF.EmitBlock(ThenBlock);
+      CGF.incrementProfileCounter(&S);
+    }
+
+    if (isOpenMPSimdDirective(S.getDirectiveKind()))
+      CGF.EmitOMPSimdInit(S);
+
+    OMPPrivateScope LoopScope(CGF);
+    // Emit helper vars inits (enum values index the captured decl params).
+    enum { LowerBound = 5, UpperBound, Stride, LastIter };
+    auto *I = CS->getCapturedDecl()->param_begin();
+    auto *LBP = std::next(I, LowerBound);
+    auto *UBP = std::next(I, UpperBound);
+    auto *STP = std::next(I, Stride);
+    auto *LIP = std::next(I, LastIter);
+    mapParam(CGF, cast<DeclRefExpr>(S.getLowerBoundVariable()), *LBP,
+             LoopScope);
+    mapParam(CGF, cast<DeclRefExpr>(S.getUpperBoundVariable()), *UBP,
+             LoopScope);
+    mapParam(CGF, cast<DeclRefExpr>(S.getStrideVariable()), *STP, LoopScope);
+    mapParam(CGF, cast<DeclRefExpr>(S.getIsLastIterVariable()), *LIP,
+             LoopScope);
+    CGF.EmitOMPPrivateLoopCounters(S, LoopScope);
+    bool HasLastprivateClause = CGF.EmitOMPLastprivateClauseInit(S, LoopScope);
+    (void)LoopScope.Privatize();
+    // Emit the loop iteration variable.
+    const Expr *IVExpr = S.getIterationVariable();
+    const VarDecl *IVDecl = cast<VarDecl>(cast<DeclRefExpr>(IVExpr)->getDecl());
+    CGF.EmitVarDecl(*IVDecl);
+    CGF.EmitIgnoredExpr(S.getInit());
+
+    // Emit the iterations count variable.
+    // If it is not a variable, Sema decided to calculate iterations count on
+    // each iteration (e.g., it is foldable into a constant).
+    if (auto LIExpr = dyn_cast<DeclRefExpr>(S.getLastIteration())) {
+      CGF.EmitVarDecl(*cast<VarDecl>(LIExpr->getDecl()));
+      // Emit calculation of the iterations count.
+      CGF.EmitIgnoredExpr(S.getCalcLastIteration());
+    }
+
+    CGF.EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(),
+                         S.getInc(),
+                         [&S](CodeGenFunction &CGF) {
+                           CGF.EmitOMPLoopBody(S, JumpDest());
+                           CGF.EmitStopPoint(&S);
+                         },
+                         [](CodeGenFunction &) {});
+    // Emit: if (PreCond) - end.
+    if (ContBlock) {
+      CGF.EmitBranch(ContBlock);
+      CGF.EmitBlock(ContBlock, true);
+    }
+    // Emit final copy of the lastprivate variables if IsLastIter != 0.
+    if (HasLastprivateClause) {
+      CGF.EmitOMPLastprivateClauseFinal(
+          S, isOpenMPSimdDirective(S.getDirectiveKind()),
+          CGF.Builder.CreateIsNotNull(CGF.EmitLoadOfScalar(
+              CGF.GetAddrOfLocalVar(*LIP), /*Volatile=*/false,
+              (*LIP)->getType(), S.getLocStart())));
+    }
+  };
+  auto &&TaskGen = [&S, SharedsTy, CapturedStruct,
+                    IfCond](CodeGenFunction &CGF, llvm::Value *OutlinedFn,
+                            const OMPTaskDataTy &Data) {
+    auto &&CodeGen = [&](CodeGenFunction &CGF, PrePostActionTy &) {
+      OMPLoopScope PreInitScope(CGF, S);
+      CGF.CGM.getOpenMPRuntime().emitTaskLoopCall(CGF, S.getLocStart(), S,
+                                                  OutlinedFn, SharedsTy,
+                                                  CapturedStruct, IfCond, Data);
+    };
+    CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_taskloop,
+                                                    CodeGen);
+  };
+  EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data);
 }
 
 void CodeGenFunction::EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S) {
-  // emit the code inside the construct for now
-  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
-  CGM.getOpenMPRuntime().emitInlinedDirective(
-      *this, OMPD_taskloop,
-      [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS->getCapturedStmt()); });
+  EmitOMPTaskLoopBasedDirective(S); // Common taskloop codegen path.
 }
 
 void CodeGenFunction::EmitOMPTaskLoopSimdDirective(
     const OMPTaskLoopSimdDirective &S) {
-  // emit the code inside the construct for now
-  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
-  CGM.getOpenMPRuntime().emitInlinedDirective(
-      *this, OMPD_taskloop_simd,
-      [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS->getCapturedStmt()); });
+  EmitOMPTaskLoopBasedDirective(S); // Common taskloop codegen path.
 }
 
+// Generate the instructions for '#pragma omp target update' directive.
+void CodeGenFunction::EmitOMPTargetUpdateDirective(
+    const OMPTargetUpdateDirective &S) {
+  // With no entries in OMPTargetTriples there are no target devices, so emit
+  // nothing for this directive.
+  if (CGM.getLangOpts().OMPTargetTriples.empty())
+    return;
+
+  // Pick up the condition of the 'if' clause, if the directive has one.
+  const Expr *IfCond = nullptr;
+  if (auto *C = S.getSingleClause<OMPIfClause>())
+    IfCond = C->getCondition();
+
+  // Pick up the expression of the 'device' clause, if the directive has one.
+  const Expr *Device = nullptr;
+  if (auto *C = S.getSingleClause<OMPDeviceClause>())
+    Device = C->getDevice();
+
+  CGM.getOpenMPRuntime().emitTargetDataStandAloneCall(*this, S, IfCond, Device);
+}
diff --git a/lib/CodeGen/CGVTT.cpp b/lib/CodeGen/CGVTT.cpp
index 4fb7671..5b90ee6 100644
--- a/lib/CodeGen/CGVTT.cpp
+++ b/lib/CodeGen/CGVTT.cpp
@@ -44,7 +44,7 @@
                                   const CXXRecordDecl *RD) {
   VTTBuilder Builder(CGM.getContext(), RD, /*GenerateDefinition=*/true);
 
-  llvm::Type *Int8PtrTy = CGM.Int8PtrTy, *Int64Ty = CGM.Int64Ty;
+  llvm::Type *Int8PtrTy = CGM.Int8PtrTy, *Int32Ty = CGM.Int32Ty;
   llvm::ArrayType *ArrayType = 
     llvm::ArrayType::get(Int8PtrTy, Builder.getVTTComponents().size());
 
@@ -75,8 +75,8 @@
     }
 
      llvm::Value *Idxs[] = {
-       llvm::ConstantInt::get(Int64Ty, 0),
-       llvm::ConstantInt::get(Int64Ty, AddressPoint)
+       llvm::ConstantInt::get(Int32Ty, 0),
+       llvm::ConstantInt::get(Int32Ty, AddressPoint)
      };
 
      llvm::Constant *Init = llvm::ConstantExpr::getInBoundsGetElementPtr(
@@ -121,7 +121,7 @@
   llvm::GlobalVariable *GV =
     CGM.CreateOrReplaceCXXRuntimeVariable(Name, ArrayType, 
                                           llvm::GlobalValue::ExternalLinkage);
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   return GV;
 }
 
diff --git a/lib/CodeGen/CGVTables.cpp b/lib/CodeGen/CGVTables.cpp
index 31bf0dd..32ebc08 100644
--- a/lib/CodeGen/CGVTables.cpp
+++ b/lib/CodeGen/CGVTables.cpp
@@ -11,16 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CodeGenFunction.h"
 #include "CGCXXABI.h"
+#include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/CXXInheritance.h"
 #include "clang/AST/RecordLayout.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/CodeGenOptions.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
@@ -156,9 +153,7 @@
 
   // Clone to thunk.
   llvm::ValueToValueMapTy VMap;
-  llvm::Function *NewFn = llvm::CloneFunction(BaseFn, VMap,
-                                              /*ModuleLevelChanges=*/false);
-  CGM.getModule().getFunctionList().push_back(NewFn);
+  llvm::Function *NewFn = llvm::CloneFunction(BaseFn, VMap);
   Fn->replaceAllUsesWith(NewFn);
   NewFn->takeName(Fn);
   Fn->eraseFromParent();
@@ -286,15 +281,14 @@
     CGM.getCXXABI().adjustCallArgsForDestructorThunk(*this, CurGD, CallArgs);
 
   // Add the rest of the arguments.
-  for (const ParmVarDecl *PD : MD->params())
+  for (const ParmVarDecl *PD : MD->parameters())
     EmitDelegateCallArg(CallArgs, PD, PD->getLocStart());
 
   const FunctionProtoType *FPT = MD->getType()->getAs<FunctionProtoType>();
 
 #ifndef NDEBUG
-  const CGFunctionInfo &CallFnInfo =
-    CGM.getTypes().arrangeCXXMethodCall(CallArgs, FPT,
-                                       RequiredArgs::forPrototypePlus(FPT, 1));
+  const CGFunctionInfo &CallFnInfo = CGM.getTypes().arrangeCXXMethodCall(
+      CallArgs, FPT, RequiredArgs::forPrototypePlus(FPT, 1, MD));
   assert(CallFnInfo.getRegParm() == CurFnInfo->getRegParm() &&
          CallFnInfo.isNoReturn() == CurFnInfo->isNoReturn() &&
          CallFnInfo.getCallingConvention() == CurFnInfo->getCallingConvention());
@@ -328,6 +322,8 @@
   // Consider return adjustment if we have ThunkInfo.
   if (Thunk && !Thunk->Return.isEmpty())
     RV = PerformReturnAdjustment(*this, ResultType, RV, *Thunk);
+  else if (llvm::CallInst* Call = dyn_cast<llvm::CallInst>(CallOrInvoke))
+    Call->setTailCallKind(llvm::CallInst::TCK_Tail);
 
   // Emit return.
   if (!ResultType->isVoidType() && Slot.isNull())
@@ -605,6 +601,8 @@
             llvm::FunctionType::get(CGM.VoidTy, /*isVarArg=*/false);
           StringRef PureCallName = CGM.getCXXABI().GetPureVirtualCallName();
           PureVirtualFn = CGM.CreateRuntimeFunction(Ty, PureCallName);
+          if (auto *F = dyn_cast<llvm::Function>(PureVirtualFn))
+            F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
           PureVirtualFn = llvm::ConstantExpr::getBitCast(PureVirtualFn,
                                                          CGM.Int8PtrTy);
         }
@@ -616,6 +614,8 @@
           StringRef DeletedCallName =
             CGM.getCXXABI().GetDeletedVirtualCallName();
           DeletedVirtualFn = CGM.CreateRuntimeFunction(Ty, DeletedCallName);
+          if (auto *F = dyn_cast<llvm::Function>(DeletedVirtualFn))
+            F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
           DeletedVirtualFn = llvm::ConstantExpr::getBitCast(DeletedVirtualFn,
                                                          CGM.Int8PtrTy);
         }
@@ -694,7 +694,7 @@
   CGM.setGlobalVisibility(VTable, RD);
 
   // V-tables are always unnamed_addr.
-  VTable->setUnnamedAddr(true);
+  VTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
 
   llvm::Constant *RTTI = CGM.GetAddrOfRTTIDescriptor(
       CGM.getContext().getTagDeclType(Base.getBase()));
@@ -706,7 +706,7 @@
       VTLayout->getNumVTableThunks(), RTTI);
   VTable->setInitializer(Init);
   
-  CGM.EmitVTableBitSetEntries(VTable, *VTLayout.get());
+  CGM.EmitVTableTypeMetadata(VTable, *VTLayout.get());
 
   return VTable;
 }
@@ -791,6 +791,10 @@
       return DiscardableODRLinkage;
 
     case TSK_ExplicitInstantiationDeclaration:
+      // Explicit instantiations in MSVC do not provide vtables, so we must emit
+      // our own.
+      if (getTarget().getCXXABI().isMicrosoft())
+        return DiscardableODRLinkage;
       return shouldEmitAvailableExternallyVTable(*this, RD)
                  ? llvm::GlobalVariable::AvailableExternallyLinkage
                  : llvm::GlobalVariable::ExternalLinkage;
@@ -836,6 +840,11 @@
 bool CodeGenVTables::isVTableExternal(const CXXRecordDecl *RD) {
   assert(RD->isDynamicClass() && "Non-dynamic classes have no VTable.");
 
+  // We always synthesize vtables if they are needed in the MS ABI. MSVC doesn't
+  // emit them even if there is an explicit template instantiation.
+  if (CGM.getTarget().getCXXABI().isMicrosoft())
+    return false;
+
   // If we have an explicit instantiation declaration (and not a
   // definition), the vtable is defined elsewhere.
   TemplateSpecializationKind TSK = RD->getTemplateSpecializationKind();
@@ -891,21 +900,43 @@
   DeferredVTables.clear();
 }
 
-bool CodeGenModule::IsCFIBlacklistedRecord(const CXXRecordDecl *RD) {
-  if (RD->hasAttr<UuidAttr>() &&
-      getContext().getSanitizerBlacklist().isBlacklistedType("attr:uuid"))
+bool CodeGenModule::HasHiddenLTOVisibility(const CXXRecordDecl *RD) {
+  LinkageInfo LV = RD->getLinkageAndVisibility();
+  if (!isExternallyVisible(LV.getLinkage())) // Not externally visible.
     return true;
 
-  return getContext().getSanitizerBlacklist().isBlacklistedType(
-      RD->getQualifiedNameAsString());
+  if (RD->hasAttr<LTOVisibilityPublicAttr>() || RD->hasAttr<UuidAttr>())
+    return false;
+
+  if (getTriple().isOSBinFormatCOFF()) {
+    if (RD->hasAttr<DLLExportAttr>() || RD->hasAttr<DLLImportAttr>())
+      return false;
+  } else {
+    if (LV.getVisibility() != HiddenVisibility)
+      return false;
+  }
+
+  if (getCodeGenOpts().LTOVisibilityPublicStd) { // std/stdext are not hidden.
+    const DeclContext *DC = RD;
+    while (1) { // Walk up to the declaration directly inside the TU.
+      auto *D = cast<Decl>(DC);
+      DC = DC->getParent();
+      if (isa<TranslationUnitDecl>(DC->getRedeclContext())) {
+        if (auto *ND = dyn_cast<NamespaceDecl>(D))
+          if (const IdentifierInfo *II = ND->getIdentifier())
+            if (II->isStr("std") || II->isStr("stdext"))
+              return false;
+        break;
+      }
+    }
+  }
+
+  return true;
 }
 
-void CodeGenModule::EmitVTableBitSetEntries(llvm::GlobalVariable *VTable,
-                                            const VTableLayout &VTLayout) {
-  if (!LangOpts.Sanitize.has(SanitizerKind::CFIVCall) &&
-      !LangOpts.Sanitize.has(SanitizerKind::CFINVCall) &&
-      !LangOpts.Sanitize.has(SanitizerKind::CFIDerivedCast) &&
-      !LangOpts.Sanitize.has(SanitizerKind::CFIUnrelatedCast))
+void CodeGenModule::EmitVTableTypeMetadata(llvm::GlobalVariable *VTable,
+                                           const VTableLayout &VTLayout) {
+  if (!getCodeGenOpts().PrepareForLTO)
     return;
 
   CharUnits PointerWidth =
@@ -914,12 +945,8 @@
   typedef std::pair<const CXXRecordDecl *, unsigned> BSEntry;
   std::vector<BSEntry> BitsetEntries;
   // Create a bit set entry for each address point.
-  for (auto &&AP : VTLayout.getAddressPoints()) {
-    if (IsCFIBlacklistedRecord(AP.first.getBase()))
-      continue;
-
+  for (auto &&AP : VTLayout.getAddressPoints())
     BitsetEntries.push_back(std::make_pair(AP.first.getBase(), AP.second));
-  }
 
   // Sort the bit set entries for determinism.
   std::sort(BitsetEntries.begin(), BitsetEntries.end(),
@@ -947,10 +974,7 @@
     return E1.second < E2.second;
   });
 
-  llvm::NamedMDNode *BitsetsMD =
-      getModule().getOrInsertNamedMetadata("llvm.bitsets");
   for (auto BitsetEntry : BitsetEntries)
-    CreateVTableBitSetEntry(BitsetsMD, VTable,
-                            PointerWidth * BitsetEntry.second,
-                            BitsetEntry.first);
+    AddVTableTypeMetadata(VTable, PointerWidth * BitsetEntry.second,
+                          BitsetEntry.first);
 }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 6ec1ebb..175d9ae 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -3,10 +3,12 @@
   BitReader
   BitWriter
   Core
+  Coverage
   IPO
   IRReader
   InstCombine
   Instrumentation
+  LTO
   Linker
   MC
   ObjCARCOpts
@@ -32,6 +34,7 @@
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
+  CGCUDABuiltin.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
@@ -56,6 +59,7 @@
   CGObjCRuntime.cpp
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
+  CGOpenMPRuntimeNVPTX.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
diff --git a/lib/CodeGen/CodeGenAction.cpp b/lib/CodeGen/CodeGenAction.cpp
index bcf671b..dd80390 100644
--- a/lib/CodeGen/CodeGenAction.cpp
+++ b/lib/CodeGen/CodeGenAction.cpp
@@ -21,7 +21,6 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Lex/Preprocessor.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -46,14 +45,14 @@
     const CodeGenOptions &CodeGenOpts;
     const TargetOptions &TargetOpts;
     const LangOptions &LangOpts;
-    raw_pwrite_stream *AsmOutStream;
+    std::unique_ptr<raw_pwrite_stream> AsmOutStream;
     ASTContext *Context;
 
     Timer LLVMIRGeneration;
+    unsigned LLVMIRGenerationRefCount;
 
     std::unique_ptr<CodeGenerator> Gen;
 
-    std::unique_ptr<llvm::Module> TheModule;
     SmallVector<std::pair<unsigned, std::unique_ptr<llvm::Module>>, 4>
         LinkModules;
 
@@ -69,11 +68,13 @@
         const TargetOptions &TargetOpts, const LangOptions &LangOpts,
         bool TimePasses, const std::string &InFile,
         const SmallVectorImpl<std::pair<unsigned, llvm::Module *>> &LinkModules,
-        raw_pwrite_stream *OS, LLVMContext &C,
+        std::unique_ptr<raw_pwrite_stream> OS, LLVMContext &C,
         CoverageSourceInfo *CoverageInfo = nullptr)
         : Diags(Diags), Action(Action), CodeGenOpts(CodeGenOpts),
-          TargetOpts(TargetOpts), LangOpts(LangOpts), AsmOutStream(OS),
-          Context(nullptr), LLVMIRGeneration("LLVM IR Generation Time"),
+          TargetOpts(TargetOpts), LangOpts(LangOpts),
+          AsmOutStream(std::move(OS)), Context(nullptr),
+          LLVMIRGeneration("LLVM IR Generation Time"),
+          LLVMIRGenerationRefCount(0),
           Gen(CreateLLVMCodeGen(Diags, InFile, HeaderSearchOpts, PPOpts,
                                 CodeGenOpts, C, CoverageInfo)) {
       llvm::TimePassesIsEnabled = TimePasses;
@@ -81,7 +82,10 @@
         this->LinkModules.push_back(
             std::make_pair(I.first, std::unique_ptr<llvm::Module>(I.second)));
     }
-    std::unique_ptr<llvm::Module> takeModule() { return std::move(TheModule); }
+    llvm::Module *getModule() const { return Gen->GetModule(); }
+    std::unique_ptr<llvm::Module> takeModule() {
+      return std::unique_ptr<llvm::Module>(Gen->ReleaseModule());
+    }
     void releaseLinkModules() {
       for (auto &I : LinkModules)
         I.second.release();
@@ -101,8 +105,6 @@
 
       Gen->Initialize(Ctx);
 
-      TheModule.reset(Gen->GetModule());
-
       if (llvm::TimePassesIsEnabled)
         LLVMIRGeneration.stopTimer();
     }
@@ -112,25 +114,32 @@
                                      Context->getSourceManager(),
                                      "LLVM IR generation of declaration");
 
-      if (llvm::TimePassesIsEnabled)
-        LLVMIRGeneration.startTimer();
+      // Recurse.
+      if (llvm::TimePassesIsEnabled) {
+        LLVMIRGenerationRefCount += 1;
+        if (LLVMIRGenerationRefCount == 1)
+          LLVMIRGeneration.startTimer();
+      }
 
       Gen->HandleTopLevelDecl(D);
 
-      if (llvm::TimePassesIsEnabled)
-        LLVMIRGeneration.stopTimer();
+      if (llvm::TimePassesIsEnabled) {
+        LLVMIRGenerationRefCount -= 1;
+        if (LLVMIRGenerationRefCount == 0)
+          LLVMIRGeneration.stopTimer();
+      }
 
       return true;
     }
 
-    void HandleInlineMethodDefinition(CXXMethodDecl *D) override {
+    void HandleInlineFunctionDefinition(FunctionDecl *D) override {
       PrettyStackTraceDecl CrashInfo(D, SourceLocation(),
                                      Context->getSourceManager(),
-                                     "LLVM IR generation of inline method");
+                                     "LLVM IR generation of inline function");
       if (llvm::TimePassesIsEnabled)
         LLVMIRGeneration.startTimer();
 
-      Gen->HandleInlineMethodDefinition(D);
+      Gen->HandleInlineFunctionDefinition(D);
 
       if (llvm::TimePassesIsEnabled)
         LLVMIRGeneration.stopTimer();
@@ -139,35 +148,28 @@
     void HandleTranslationUnit(ASTContext &C) override {
       {
         PrettyStackTraceString CrashInfo("Per-file LLVM IR generation");
-        if (llvm::TimePassesIsEnabled)
-          LLVMIRGeneration.startTimer();
+        if (llvm::TimePassesIsEnabled) {
+          LLVMIRGenerationRefCount += 1;
+          if (LLVMIRGenerationRefCount == 1)
+            LLVMIRGeneration.startTimer();
+        }
 
         Gen->HandleTranslationUnit(C);
 
-        if (llvm::TimePassesIsEnabled)
-          LLVMIRGeneration.stopTimer();
+        if (llvm::TimePassesIsEnabled) {
+          LLVMIRGenerationRefCount -= 1;
+          if (LLVMIRGenerationRefCount == 0)
+            LLVMIRGeneration.stopTimer();
+        }
       }
 
       // Silently ignore if we weren't initialized for some reason.
-      if (!TheModule)
+      if (!getModule())
         return;
 
-      // Make sure IR generation is happy with the module. This is released by
-      // the module provider.
-      llvm::Module *M = Gen->ReleaseModule();
-      if (!M) {
-        // The module has been released by IR gen on failures, do not double
-        // free.
-        TheModule.release();
-        return;
-      }
-
-      assert(TheModule.get() == M &&
-             "Unexpected module change during IR generation");
-
       // Install an inline asm handler so that diagnostics get printed through
       // our diagnostics hooks.
-      LLVMContext &Ctx = TheModule->getContext();
+      LLVMContext &Ctx = getModule()->getContext();
       LLVMContext::InlineAsmDiagHandlerTy OldHandler =
         Ctx.getInlineAsmDiagnosticHandler();
       void *OldContext = Ctx.getInlineAsmDiagnosticContext();
@@ -182,13 +184,15 @@
       for (auto &I : LinkModules) {
         unsigned LinkFlags = I.first;
         CurLinkModule = I.second.get();
-        if (Linker::linkModules(*M, std::move(I.second), LinkFlags))
+        if (Linker::linkModules(*getModule(), std::move(I.second), LinkFlags))
           return;
       }
 
+      EmbedBitcode(getModule(), CodeGenOpts, llvm::MemoryBufferRef());
+
       EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts,
-                        C.getTargetInfo().getDataLayoutString(),
-                        TheModule.get(), Action, AsmOutStream);
+                        C.getTargetInfo().getDataLayout(),
+                        getModule(), Action, std::move(AsmOutStream));
 
       Ctx.setInlineAsmDiagnosticHandler(OldHandler, OldContext);
 
@@ -210,23 +214,14 @@
       Gen->CompleteTentativeDefinition(D);
     }
 
+    void AssignInheritanceModel(CXXRecordDecl *RD) override {
+      Gen->AssignInheritanceModel(RD);
+    }
+
     void HandleVTable(CXXRecordDecl *RD) override {
       Gen->HandleVTable(RD);
     }
 
-    void HandleLinkerOption(llvm::StringRef Opts) override {
-      Gen->HandleLinkerOption(Opts);
-    }
-
-    void HandleDetectMismatch(llvm::StringRef Name,
-                                      llvm::StringRef Value) override {
-      Gen->HandleDetectMismatch(Name, Value);
-    }
-
-    void HandleDependentLibrary(llvm::StringRef Opts) override {
-      Gen->HandleDependentLibrary(Opts);
-    }
-
     static void InlineAsmDiagHandler(const llvm::SMDiagnostic &SM,void *Context,
                                      unsigned LocCookie) {
       SourceLocation Loc = SourceLocation::getFromRawEncoding(LocCookie);
@@ -238,6 +233,13 @@
       ((BackendConsumer *)Context)->DiagnosticHandlerImpl(DI);
     }
 
+    /// Get the best possible source location to represent a diagnostic that
+    /// may have associated debug info.
+    const FullSourceLoc
+    getBestLocationFromDebugLoc(const llvm::DiagnosticInfoWithDebugLocBase &D,
+                                bool &BadDebugInfo, StringRef &Filename,
+                                unsigned &Line, unsigned &Column) const;
+
     void InlineAsmDiagHandler2(const llvm::SMDiagnostic &,
                                SourceLocation LocCookie);
 
@@ -250,6 +252,8 @@
     /// \return True if the diagnostic has been successfully reported, false
     /// otherwise.
     bool StackSizeDiagHandler(const llvm::DiagnosticInfoStackSize &D);
+    /// \brief Specialized handler for unsupported backend feature diagnostic.
+    void UnsupportedDiagHandler(const llvm::DiagnosticInfoUnsupported &D);
     /// \brief Specialized handlers for optimization remarks.
     /// Note that these handlers only accept remarks and they always handle
     /// them.
@@ -426,25 +430,21 @@
     return false;
 
   if (const Decl *ND = Gen->GetDeclForMangledName(D.getFunction().getName())) {
+    // FIXME: Shouldn't need to truncate to uint32_t
     Diags.Report(ND->getASTContext().getFullLoc(ND->getLocation()),
                  diag::warn_fe_frame_larger_than)
-        << D.getStackSize() << Decl::castToDeclContext(ND);
+      << static_cast<uint32_t>(D.getStackSize()) << Decl::castToDeclContext(ND);
     return true;
   }
 
   return false;
 }
 
-void BackendConsumer::EmitOptimizationMessage(
-    const llvm::DiagnosticInfoOptimizationBase &D, unsigned DiagID) {
-  // We only support warnings and remarks.
-  assert(D.getSeverity() == llvm::DS_Remark ||
-         D.getSeverity() == llvm::DS_Warning);
-
+const FullSourceLoc BackendConsumer::getBestLocationFromDebugLoc(
+    const llvm::DiagnosticInfoWithDebugLocBase &D, bool &BadDebugInfo, StringRef &Filename,
+                                unsigned &Line, unsigned &Column) const {
   SourceManager &SourceMgr = Context->getSourceManager();
   FileManager &FileMgr = SourceMgr.getFileManager();
-  StringRef Filename;
-  unsigned Line, Column;
   SourceLocation DILoc;
 
   if (D.isLocationAvailable()) {
@@ -455,6 +455,7 @@
       // source manager, so pass 1 if Column is not set.
       DILoc = SourceMgr.translateFileLineCol(FE, Line, Column ? Column : 1);
     }
+    BadDebugInfo = DILoc.isInvalid();
   }
 
   // If a location isn't available, try to approximate it using the associated
@@ -463,18 +464,63 @@
   FullSourceLoc Loc(DILoc, SourceMgr);
   if (Loc.isInvalid())
     if (const Decl *FD = Gen->GetDeclForMangledName(D.getFunction().getName()))
-      Loc = FD->getASTContext().getFullLoc(FD->getBodyRBrace());
-
-  Diags.Report(Loc, DiagID)
-      << AddFlagValue(D.getPassName() ? D.getPassName() : "")
-      << D.getMsg().str();
+      Loc = FD->getASTContext().getFullLoc(FD->getLocation());
 
   if (DILoc.isInvalid() && D.isLocationAvailable())
     // If we were not able to translate the file:line:col information
     // back to a SourceLocation, at least emit a note stating that
     // we could not translate this location. This can happen in the
     // case of #line directives.
-    Diags.Report(Loc, diag::note_fe_backend_optimization_remark_invalid_loc)
+    Diags.Report(Loc, diag::note_fe_backend_invalid_loc)
+        << Filename << Line << Column;
+
+  return Loc;
+}
+
+void BackendConsumer::UnsupportedDiagHandler(
+    const llvm::DiagnosticInfoUnsupported &D) {
+  // We only support errors.
+  assert(D.getSeverity() == llvm::DS_Error);
+
+  StringRef Filename;
+  unsigned Line, Column;
+  // getBestLocationFromDebugLoc only assigns BadDebugInfo when D has a
+  // location available, so it must start out false.
+  bool BadDebugInfo = false;
+  FullSourceLoc Loc = getBestLocationFromDebugLoc(D, BadDebugInfo, Filename,
+      Line, Column);
+
+  Diags.Report(Loc, diag::err_fe_backend_unsupported) << D.getMessage().str();
+
+  if (BadDebugInfo)
+    // The file:line:col information could not be translated back to a
+    // SourceLocation (e.g. because of #line directives); say so in a note.
+    Diags.Report(Loc, diag::note_fe_backend_invalid_loc)
+        << Filename << Line << Column;
+}
+
+void BackendConsumer::EmitOptimizationMessage(
+    const llvm::DiagnosticInfoOptimizationBase &D, unsigned DiagID) {
+  // We only support warnings and remarks.
+  assert(D.getSeverity() == llvm::DS_Remark ||
+         D.getSeverity() == llvm::DS_Warning);
+
+  StringRef Filename;
+  unsigned Line, Column;
+  bool BadDebugInfo = false;
+  FullSourceLoc Loc = getBestLocationFromDebugLoc(D, BadDebugInfo, Filename,
+      Line, Column);
+
+  Diags.Report(Loc, DiagID)
+      << AddFlagValue(D.getPassName() ? D.getPassName() : "")
+      << D.getMsg().str();
+
+  if (BadDebugInfo)
+    // If we were not able to translate the file:line:col information
+    // back to a SourceLocation, at least emit a note stating that
+    // we could not translate this location. This can happen in the
+    // case of #line directives.
+    Diags.Report(Loc, diag::note_fe_backend_invalid_loc)
         << Filename << Line << Column;
 }
 
@@ -504,7 +550,7 @@
   // llvm::DiagnosticInfo::AlwasyPrint or if the -Rpass-analysis flag has a
   // regular expression that matches the name of the pass name in \p D.
 
-  if (D.getPassName() == llvm::DiagnosticInfo::AlwaysPrint ||
+  if (D.shouldAlwaysPrint() ||
       (CodeGenOpts.OptimizationRemarkAnalysisPattern &&
        CodeGenOpts.OptimizationRemarkAnalysisPattern->match(D.getPassName())))
     EmitOptimizationMessage(
@@ -517,7 +563,7 @@
   // llvm::DiagnosticInfo::AlwasyPrint or if the -Rpass-analysis flag has a
   // regular expression that matches the name of the pass name in \p D.
 
-  if (D.getPassName() == llvm::DiagnosticInfo::AlwaysPrint ||
+  if (D.shouldAlwaysPrint() ||
       (CodeGenOpts.OptimizationRemarkAnalysisPattern &&
        CodeGenOpts.OptimizationRemarkAnalysisPattern->match(D.getPassName())))
     EmitOptimizationMessage(
@@ -530,7 +576,7 @@
   // llvm::DiagnosticInfo::AlwasyPrint or if the -Rpass-analysis flag has a
   // regular expression that matches the name of the pass name in \p D.
 
-  if (D.getPassName() == llvm::DiagnosticInfo::AlwaysPrint ||
+  if (D.shouldAlwaysPrint() ||
       (CodeGenOpts.OptimizationRemarkAnalysisPattern &&
        CodeGenOpts.OptimizationRemarkAnalysisPattern->match(D.getPassName())))
     EmitOptimizationMessage(
@@ -599,6 +645,9 @@
     // handler.
     OptimizationFailureHandler(cast<DiagnosticInfoOptimizationFailure>(DI));
     return;
+  case llvm::DK_Unsupported:
+    UnsupportedDiagHandler(cast<DiagnosticInfoUnsupported>(DI));
+    return;
   default:
     // Plugin IDs are not bound to any value as they are set dynamically.
     ComputeDiagRemarkID(Severity, backend_plugin, DiagID);
@@ -657,7 +706,7 @@
   return VMContext;
 }
 
-static raw_pwrite_stream *
+static std::unique_ptr<raw_pwrite_stream>
 GetOutputStream(CompilerInstance &CI, StringRef InFile, BackendAction Action) {
   switch (Action) {
   case Backend_EmitAssembly:
@@ -680,7 +729,7 @@
 std::unique_ptr<ASTConsumer>
 CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
   BackendAction BA = static_cast<BackendAction>(Act);
-  raw_pwrite_stream *OS = GetOutputStream(CI, InFile, BA);
+  std::unique_ptr<raw_pwrite_stream> OS = GetOutputStream(CI, InFile, BA);
   if (BA != Backend_EmitNothing && !OS)
     return nullptr;
 
@@ -720,7 +769,7 @@
       BA, CI.getDiagnostics(), CI.getHeaderSearchOpts(),
       CI.getPreprocessorOpts(), CI.getCodeGenOpts(), CI.getTargetOpts(),
       CI.getLangOpts(), CI.getFrontendOpts().ShowTimers, InFile, LinkModules,
-      OS, *VMContext, CoverageInfo));
+      std::move(OS), *VMContext, CoverageInfo));
   BEConsumer = Result.get();
   return std::move(Result);
 }
@@ -752,7 +801,8 @@
   if (getCurrentFileKind() == IK_LLVM_IR) {
     BackendAction BA = static_cast<BackendAction>(Act);
     CompilerInstance &CI = getCompilerInstance();
-    raw_pwrite_stream *OS = GetOutputStream(CI, getCurrentFile(), BA);
+    std::unique_ptr<raw_pwrite_stream> OS =
+        GetOutputStream(CI, getCurrentFile(), BA);
     if (BA != Backend_EmitNothing && !OS)
       return;
 
@@ -763,6 +813,11 @@
     if (Invalid)
       return;
 
+    // For ThinLTO backend invocations, ensure that the context
+    // merges types based on ODR identifiers.
+    if (!CI.getCodeGenOpts().ThinLTOIndexFile.empty())
+      VMContext->enableDebugTypeODRUniquing();
+
     llvm::SMDiagnostic Err;
     TheModule = parseIR(MainFile->getMemBufferRef(), Err, *VMContext);
     if (!TheModule) {
@@ -795,12 +850,16 @@
       TheModule->setTargetTriple(TargetOpts.Triple);
     }
 
+    EmbedBitcode(TheModule.get(), CI.getCodeGenOpts(),
+                 MainFile->getMemBufferRef());
+
     LLVMContext &Ctx = TheModule->getContext();
     Ctx.setInlineAsmDiagnosticHandler(BitcodeInlineAsmDiagHandler,
                                       &CI.getDiagnostics());
+
     EmitBackendOutput(CI.getDiagnostics(), CI.getCodeGenOpts(), TargetOpts,
-                      CI.getLangOpts(), CI.getTarget().getDataLayoutString(),
-                      TheModule.get(), BA, OS);
+                      CI.getLangOpts(), CI.getTarget().getDataLayout(),
+                      TheModule.get(), BA, std::move(OS));
     return;
   }
 
diff --git a/lib/CodeGen/CodeGenFunction.cpp b/lib/CodeGen/CodeGenFunction.cpp
index ccc1808..183ee12 100644
--- a/lib/CodeGen/CodeGenFunction.cpp
+++ b/lib/CodeGen/CodeGenFunction.cpp
@@ -397,6 +397,12 @@
   return true;
 }
 
+/// ShouldXRayInstrumentFunction - Return true if the current function should
+/// be instrumented with XRay nop sleds.
+bool CodeGenFunction::ShouldXRayInstrumentFunction() const {
+  return CGM.getCodeGenOpts().XRayInstrumentFunctions;
+}
+
 /// EmitFunctionInstrumentation - Emit LLVM code to call the specified
 /// instrumentation function with the current function and the call site, if
 /// function instrumentation is enabled.
@@ -435,7 +441,6 @@
 // includes the argument name, its type, the address and access qualifiers used.
 static void GenOpenCLArgMetadata(const FunctionDecl *FD, llvm::Function *Fn,
                                  CodeGenModule &CGM, llvm::LLVMContext &Context,
-                                 SmallVector<llvm::Metadata *, 5> &kernelMDArgs,
                                  CGBuilderTy &Builder, ASTContext &ASTCtx) {
   // Create MDNodes that represent the kernel arg metadata.
   // Each MDNode is a list in the form of "key", N number of values which is
@@ -445,28 +450,21 @@
 
   // MDNode for the kernel argument address space qualifiers.
   SmallVector<llvm::Metadata *, 8> addressQuals;
-  addressQuals.push_back(llvm::MDString::get(Context, "kernel_arg_addr_space"));
 
   // MDNode for the kernel argument access qualifiers (images only).
   SmallVector<llvm::Metadata *, 8> accessQuals;
-  accessQuals.push_back(llvm::MDString::get(Context, "kernel_arg_access_qual"));
 
   // MDNode for the kernel argument type names.
   SmallVector<llvm::Metadata *, 8> argTypeNames;
-  argTypeNames.push_back(llvm::MDString::get(Context, "kernel_arg_type"));
 
   // MDNode for the kernel argument base type names.
   SmallVector<llvm::Metadata *, 8> argBaseTypeNames;
-  argBaseTypeNames.push_back(
-      llvm::MDString::get(Context, "kernel_arg_base_type"));
 
   // MDNode for the kernel argument type qualifiers.
   SmallVector<llvm::Metadata *, 8> argTypeQuals;
-  argTypeQuals.push_back(llvm::MDString::get(Context, "kernel_arg_type_qual"));
 
   // MDNode for the kernel argument names.
   SmallVector<llvm::Metadata *, 8> argNames;
-  argNames.push_back(llvm::MDString::get(Context, "kernel_arg_name"));
 
   for (unsigned i = 0, e = FD->getNumParams(); i != e; ++i) {
     const ParmVarDecl *parm = FD->getParamDecl(i);
@@ -524,7 +522,8 @@
       // Get argument type name.
       std::string typeName;
       if (isPipe)
-        typeName = cast<PipeType>(ty)->getElementType().getAsString(Policy);
+        typeName = ty.getCanonicalType()->getAs<PipeType>()->getElementType()
+                     .getAsString(Policy);
       else
         typeName = ty.getUnqualifiedType().getAsString(Policy);
 
@@ -537,8 +536,9 @@
 
       std::string baseTypeName;
       if (isPipe)
-        baseTypeName =
-          cast<PipeType>(ty)->getElementType().getCanonicalType().getAsString(Policy);
+        baseTypeName = ty.getCanonicalType()->getAs<PipeType>()
+                          ->getElementType().getCanonicalType()
+                          .getAsString(Policy);
       else
         baseTypeName =
           ty.getUnqualifiedType().getCanonicalType().getAsString(Policy);
@@ -562,15 +562,14 @@
     argTypeQuals.push_back(llvm::MDString::get(Context, typeQuals));
 
     // Get image and pipe access qualifier:
-    // FIXME: now image and pipe share the same access qualifier maybe we can
-    // refine it to OpenCL access qualifier and also handle write_read
     if (ty->isImageType()|| ty->isPipeType()) {
-      const OpenCLImageAccessAttr *A = parm->getAttr<OpenCLImageAccessAttr>();
+      const OpenCLAccessAttr *A = parm->getAttr<OpenCLAccessAttr>();
       if (A && A->isWriteOnly())
         accessQuals.push_back(llvm::MDString::get(Context, "write_only"));
+      else if (A && A->isReadWrite())
+        accessQuals.push_back(llvm::MDString::get(Context, "read_write"));
       else
         accessQuals.push_back(llvm::MDString::get(Context, "read_only"));
-      // FIXME: what about read_write?
     } else
       accessQuals.push_back(llvm::MDString::get(Context, "none"));
 
@@ -578,13 +577,19 @@
     argNames.push_back(llvm::MDString::get(Context, parm->getName()));
   }
 
-  kernelMDArgs.push_back(llvm::MDNode::get(Context, addressQuals));
-  kernelMDArgs.push_back(llvm::MDNode::get(Context, accessQuals));
-  kernelMDArgs.push_back(llvm::MDNode::get(Context, argTypeNames));
-  kernelMDArgs.push_back(llvm::MDNode::get(Context, argBaseTypeNames));
-  kernelMDArgs.push_back(llvm::MDNode::get(Context, argTypeQuals));
+  Fn->setMetadata("kernel_arg_addr_space",
+                  llvm::MDNode::get(Context, addressQuals));
+  Fn->setMetadata("kernel_arg_access_qual",
+                  llvm::MDNode::get(Context, accessQuals));
+  Fn->setMetadata("kernel_arg_type",
+                  llvm::MDNode::get(Context, argTypeNames));
+  Fn->setMetadata("kernel_arg_base_type",
+                  llvm::MDNode::get(Context, argBaseTypeNames));
+  Fn->setMetadata("kernel_arg_type_qual",
+                  llvm::MDNode::get(Context, argTypeQuals));
   if (CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
-    kernelMDArgs.push_back(llvm::MDNode::get(Context, argNames));
+    Fn->setMetadata("kernel_arg_name",
+                    llvm::MDNode::get(Context, argNames));
 }
 
 void CodeGenFunction::EmitOpenCLKernelMetadata(const FunctionDecl *FD,
@@ -595,11 +600,7 @@
 
   llvm::LLVMContext &Context = getLLVMContext();
 
-  SmallVector<llvm::Metadata *, 5> kernelMDArgs;
-  kernelMDArgs.push_back(llvm::ConstantAsMetadata::get(Fn));
-
-  GenOpenCLArgMetadata(FD, Fn, CGM, Context, kernelMDArgs, Builder,
-                       getContext());
+  GenOpenCLArgMetadata(FD, Fn, CGM, Context, Builder, getContext());
 
   if (const VecTypeHintAttr *A = FD->getAttr<VecTypeHintAttr>()) {
     QualType hintQTy = A->getTypeHint();
@@ -608,37 +609,29 @@
         hintQTy->isSignedIntegerType() ||
         (hintEltQTy && hintEltQTy->getElementType()->isSignedIntegerType());
     llvm::Metadata *attrMDArgs[] = {
-        llvm::MDString::get(Context, "vec_type_hint"),
         llvm::ConstantAsMetadata::get(llvm::UndefValue::get(
             CGM.getTypes().ConvertType(A->getTypeHint()))),
         llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
             llvm::IntegerType::get(Context, 32),
             llvm::APInt(32, (uint64_t)(isSignedInteger ? 1 : 0))))};
-    kernelMDArgs.push_back(llvm::MDNode::get(Context, attrMDArgs));
+    Fn->setMetadata("vec_type_hint", llvm::MDNode::get(Context, attrMDArgs));
   }
 
   if (const WorkGroupSizeHintAttr *A = FD->getAttr<WorkGroupSizeHintAttr>()) {
     llvm::Metadata *attrMDArgs[] = {
-        llvm::MDString::get(Context, "work_group_size_hint"),
         llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDim())),
         llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDim())),
         llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDim()))};
-    kernelMDArgs.push_back(llvm::MDNode::get(Context, attrMDArgs));
+    Fn->setMetadata("work_group_size_hint", llvm::MDNode::get(Context, attrMDArgs));
   }
 
   if (const ReqdWorkGroupSizeAttr *A = FD->getAttr<ReqdWorkGroupSizeAttr>()) {
     llvm::Metadata *attrMDArgs[] = {
-        llvm::MDString::get(Context, "reqd_work_group_size"),
         llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDim())),
         llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDim())),
         llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDim()))};
-    kernelMDArgs.push_back(llvm::MDNode::get(Context, attrMDArgs));
+    Fn->setMetadata("reqd_work_group_size", llvm::MDNode::get(Context, attrMDArgs));
   }
-
-  llvm::MDNode *kernelMDNode = llvm::MDNode::get(Context, kernelMDArgs);
-  llvm::NamedMDNode *OpenCLKernelMetadata =
-    CGM.getModule().getOrInsertNamedMetadata("opencl.kernels");
-  OpenCLKernelMetadata->addOperand(kernelMDNode);
 }
 
 /// Determine whether the function F ends with a return stmt.
@@ -671,6 +664,9 @@
 
   DidCallStackSave = false;
   CurCodeDecl = D;
+  if (const auto *FD = dyn_cast_or_null<FunctionDecl>(D))
+    if (FD->usesSEHTry())
+      CurSEHParent = FD;
   CurFuncDecl = (D ? D->getNonClosureContext() : nullptr);
   FnRetTy = RetTy;
   CurFn = Fn;
@@ -696,20 +692,46 @@
   if (SanOpts.has(SanitizerKind::SafeStack))
     Fn->addFnAttr(llvm::Attribute::SafeStack);
 
+  // Apply xray attributes to the function (as a string, for now)
+  if (D && ShouldXRayInstrumentFunction()) {
+    if (const auto *XRayAttr = D->getAttr<XRayInstrumentAttr>()) {
+      if (XRayAttr->alwaysXRayInstrument())
+        Fn->addFnAttr("function-instrument", "xray-always");
+      if (XRayAttr->neverXRayInstrument())
+        Fn->addFnAttr("function-instrument", "xray-never");
+    } else {
+      Fn->addFnAttr(
+          "xray-instruction-threshold",
+          llvm::itostr(CGM.getCodeGenOpts().XRayInstructionThreshold));
+    }
+  }
+
   // Pass inline keyword to optimizer if it appears explicitly on any
   // declaration. Also, in the case of -fno-inline attach NoInline
-  // attribute to all function that are not marked AlwaysInline.
+  // attribute to all functions that are not marked AlwaysInline, or
+  // to all functions that are not marked inline or implicitly inline
+  // in the case of -finline-hint-functions.
   if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) {
-    if (!CGM.getCodeGenOpts().NoInline) {
+    const CodeGenOptions& CodeGenOpts = CGM.getCodeGenOpts();
+    if (!CodeGenOpts.NoInline) {
       for (auto RI : FD->redecls())
         if (RI->isInlineSpecified()) {
           Fn->addFnAttr(llvm::Attribute::InlineHint);
           break;
         }
+      if (CodeGenOpts.getInlining() == CodeGenOptions::OnlyHintInlining &&
+          !FD->isInlined() && !Fn->hasFnAttribute(llvm::Attribute::InlineHint))
+        Fn->addFnAttr(llvm::Attribute::NoInline);
     } else if (!FD->hasAttr<AlwaysInlineAttr>())
       Fn->addFnAttr(llvm::Attribute::NoInline);
+    if (CGM.getLangOpts().OpenMP && FD->hasAttr<OMPDeclareSimdDeclAttr>())
+      CGM.getOpenMPRuntime().emitDeclareSimdFunction(FD, Fn);
   }
 
+  // Add no-jump-tables value.
+  Fn->addFnAttr("no-jump-tables",
+                llvm::toStringRef(CGM.getCodeGenOpts().NoUseJumpTables));
+
   if (getLangOpts().OpenCL) {
     // Add metadata for a kernel function.
     if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D))
@@ -754,15 +776,18 @@
 
   // Emit subprogram debug descriptor.
   if (CGDebugInfo *DI = getDebugInfo()) {
+    // Reconstruct the type from the argument list so that implicit parameters,
+    // such as 'this' and 'vtt', show up in the debug info. Preserve the calling
+    // convention.
+    CallingConv CC = CallingConv::CC_C;
+    if (auto *FD = dyn_cast_or_null<FunctionDecl>(D))
+      if (const auto *SrcFnTy = FD->getType()->getAs<FunctionType>())
+        CC = SrcFnTy->getCallConv();
     SmallVector<QualType, 16> ArgTypes;
-    for (FunctionArgList::const_iterator i = Args.begin(), e = Args.end();
-	 i != e; ++i) {
-      ArgTypes.push_back((*i)->getType());
-    }
-
-    QualType FnType =
-      getContext().getFunctionType(RetTy, ArgTypes,
-                                   FunctionProtoType::ExtProtoInfo());
+    for (const VarDecl *VD : Args)
+      ArgTypes.push_back(VD->getType());
+    QualType FnType = getContext().getFunctionType(
+        RetTy, ArgTypes, FunctionProtoType::ExtProtoInfo(CC));
     DI->EmitFunctionStart(GD, Loc, StartLoc, FnType, CurFn, Builder);
   }
 
@@ -822,10 +847,22 @@
       MD->getParent()->getCaptureFields(LambdaCaptureFields,
                                         LambdaThisCaptureField);
       if (LambdaThisCaptureField) {
-        // If this lambda captures this, load it.
-        LValue ThisLValue = EmitLValueForLambdaField(LambdaThisCaptureField);
-        CXXThisValue = EmitLoadOfLValue(ThisLValue,
-                                        SourceLocation()).getScalarVal();
+        // If the lambda captures the object referred to by '*this' - either by
+        // value or by reference, make sure CXXThisValue points to the correct
+        // object.
+
+        // Get the lvalue for the field (which is a copy of the enclosing object
+        // or contains the address of the enclosing object).
+        LValue ThisFieldLValue = EmitLValueForLambdaField(LambdaThisCaptureField);
+        if (!LambdaThisCaptureField->getType()->isPointerType()) {
+          // If the enclosing object was captured by value, just use its address.
+          CXXThisValue = ThisFieldLValue.getAddress().getPointer();
+        } else {
+          // Load the lvalue pointed to by the field, since '*this' was captured
+          // by reference.
+          CXXThisValue =
+              EmitLoadOfLValue(ThisFieldLValue, SourceLocation()).getScalarVal();
+        }
       }
       for (auto *FD : MD->getParent()->fields()) {
         if (FD->hasCapturedVLAType()) {
@@ -882,7 +919,7 @@
 void CodeGenFunction::EmitBlockWithFallThrough(llvm::BasicBlock *BB,
                                                const Stmt *S) {
   llvm::BasicBlock *SkipCountBB = nullptr;
-  if (HaveInsertPoint() && CGM.getCodeGenOpts().ProfileInstrGenerate) {
+  if (HaveInsertPoint() && CGM.getCodeGenOpts().hasProfileClangInstr()) {
     // When instrumenting for profiling, the fallthrough to certain
     // statements needs to skip over the instrumentation code so that we
     // get an accurate count.
@@ -903,7 +940,7 @@
 static void TryMarkNoThrow(llvm::Function *F) {
   // LLVM treats 'nounwind' on a function as part of the type, so we
   // can't do this on functions that can be overwritten.
-  if (F->mayBeOverridden()) return;
+  if (F->isInterposable()) return;
 
   for (llvm::BasicBlock &BB : *F)
     for (llvm::Instruction &I : BB)
@@ -913,18 +950,11 @@
   F->setDoesNotThrow();
 }
 
-void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
-                                   const CGFunctionInfo &FnInfo) {
+QualType CodeGenFunction::BuildFunctionArgList(GlobalDecl GD,
+                                               FunctionArgList &Args) {
   const FunctionDecl *FD = cast<FunctionDecl>(GD.getDecl());
-
-  // Check if we should generate debug info for this function.
-  if (FD->hasAttr<NoDebugAttr>())
-    DebugInfo = nullptr; // disable debug info indefinitely for this function
-
-  FunctionArgList Args;
   QualType ResTy = FD->getReturnType();
 
-  CurGD = GD;
   const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD);
   if (MD && MD->isInstance()) {
     if (CGM.getCXXABI().HasThisReturn(GD))
@@ -934,22 +964,48 @@
     CGM.getCXXABI().buildThisParam(*this, Args);
   }
 
-  for (auto *Param : FD->params()) {
-    Args.push_back(Param);
-    if (!Param->hasAttr<PassObjectSizeAttr>())
-      continue;
+  // The base version of an inheriting constructor whose constructed base is a
+  // virtual base is not passed any arguments (because it doesn't actually call
+  // the inherited constructor).
+  bool PassedParams = true;
+  if (const CXXConstructorDecl *CD = dyn_cast<CXXConstructorDecl>(FD))
+    if (auto Inherited = CD->getInheritedConstructor())
+      PassedParams =
+          getTypes().inheritingCtorHasParams(Inherited, GD.getCtorType());
 
-    IdentifierInfo *NoID = nullptr;
-    auto *Implicit = ImplicitParamDecl::Create(
-        getContext(), Param->getDeclContext(), Param->getLocation(), NoID,
-        getContext().getSizeType());
-    SizeArguments[Param] = Implicit;
-    Args.push_back(Implicit);
+  if (PassedParams) {
+    for (auto *Param : FD->parameters()) {
+      Args.push_back(Param);
+      if (!Param->hasAttr<PassObjectSizeAttr>())
+        continue;
+
+      IdentifierInfo *NoID = nullptr;
+      auto *Implicit = ImplicitParamDecl::Create(
+          getContext(), Param->getDeclContext(), Param->getLocation(), NoID,
+          getContext().getSizeType());
+      SizeArguments[Param] = Implicit;
+      Args.push_back(Implicit);
+    }
   }
 
   if (MD && (isa<CXXConstructorDecl>(MD) || isa<CXXDestructorDecl>(MD)))
     CGM.getCXXABI().addImplicitStructorParams(*this, ResTy, Args);
 
+  return ResTy;
+}
+
+void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
+                                   const CGFunctionInfo &FnInfo) {
+  const FunctionDecl *FD = cast<FunctionDecl>(GD.getDecl());
+  CurGD = GD;
+
+  FunctionArgList Args;
+  QualType ResTy = BuildFunctionArgList(GD, Args);
+
+  // Check if we should generate debug info for this function.
+  if (FD->hasAttr<NoDebugAttr>())
+    DebugInfo = nullptr; // disable debug info indefinitely for this function
+
   SourceRange BodyRange;
   if (Stmt *Body = FD->getBody()) BodyRange = Body->getSourceRange();
   CurEHLocation = BodyRange.getEnd();
@@ -1092,9 +1148,10 @@
 /// to a constant, or if it does but contains a label, return false.  If it
 /// constant folds return true and set the boolean result in Result.
 bool CodeGenFunction::ConstantFoldsToSimpleInteger(const Expr *Cond,
-                                                   bool &ResultBool) {
+                                                   bool &ResultBool,
+                                                   bool AllowLabels) {
   llvm::APSInt ResultInt;
-  if (!ConstantFoldsToSimpleInteger(Cond, ResultInt))
+  if (!ConstantFoldsToSimpleInteger(Cond, ResultInt, AllowLabels))
     return false;
 
   ResultBool = ResultInt.getBoolValue();
@@ -1104,15 +1161,16 @@
 /// ConstantFoldsToSimpleInteger - If the specified expression does not fold
 /// to a constant, or if it does but contains a label, return false.  If it
 /// constant folds return true and set the folded value.
-bool CodeGenFunction::
-ConstantFoldsToSimpleInteger(const Expr *Cond, llvm::APSInt &ResultInt) {
+bool CodeGenFunction::ConstantFoldsToSimpleInteger(const Expr *Cond,
+                                                   llvm::APSInt &ResultInt,
+                                                   bool AllowLabels) {
   // FIXME: Rename and handle conversion of other evaluatable things
   // to bool.
   llvm::APSInt Int;
   if (!Cond->EvaluateAsInt(Int, getContext()))
     return false;  // Not foldable, not integer or not fully evaluatable.
 
-  if (CodeGenFunction::ContainsLabel(Cond))
+  if (!AllowLabels && CodeGenFunction::ContainsLabel(Cond))
     return false;  // Contains a label.
 
   ResultInt = Int;
@@ -1296,15 +1354,12 @@
   // create metadata that specifies that the branch is unpredictable.
   // Don't bother if not optimizing because that metadata would not be used.
   llvm::MDNode *Unpredictable = nullptr;
-  if (CGM.getCodeGenOpts().OptimizationLevel != 0) {
-    if (const CallExpr *Call = dyn_cast<CallExpr>(Cond)) {
-      const Decl *TargetDecl = Call->getCalleeDecl();
-      if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
-        if (FD->getBuiltinID() == Builtin::BI__builtin_unpredictable) {
-          llvm::MDBuilder MDHelper(getLLVMContext());
-          Unpredictable = MDHelper.createUnpredictable();
-        }
-      }
+  auto *Call = dyn_cast<CallExpr>(Cond);
+  if (Call && CGM.getCodeGenOpts().OptimizationLevel != 0) {
+    auto *FD = dyn_cast_or_null<FunctionDecl>(Call->getCalleeDecl());
+    if (FD && FD->getBuiltinID() == Builtin::BI__builtin_unpredictable) {
+      llvm::MDBuilder MDHelper(getLLVMContext());
+      Unpredictable = MDHelper.createUnpredictable();
     }
   }
 
diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h
index ed2718e..bb04371 100644
--- a/lib/CodeGen/CodeGenFunction.h
+++ b/lib/CodeGen/CodeGenFunction.h
@@ -85,7 +85,9 @@
 class BlockByrefInfo;
 class BlockFlags;
 class BlockFieldFlags;
+class RegionCodeGenTy;
 class TargetCodeGenInfo;
+struct OMPTaskDataTy;
 
 /// The kind of evaluation to perform on values of a particular
 /// type.  Basically, is the code in CGExprScalar, CGExprComplex, or
@@ -189,6 +191,8 @@
           CXXThisFieldDecl = *Field;
         else if (I->capturesVariable())
           CaptureFields[I->getCapturedVar()] = *Field;
+        else if (I->capturesVariableByCopy())
+          CaptureFields[I->getCapturedVar()] = *Field;
       }
     }
 
@@ -276,6 +280,8 @@
   /// potentially set the return value.
   bool SawAsmBlock;
 
+  const FunctionDecl *CurSEHParent = nullptr;
+
   /// True if the current function is an outlined SEH helper. This can be a
   /// finally block or filter expression.
   bool IsOutlinedSEHHelper;
@@ -296,6 +302,19 @@
 
   llvm::Instruction *CurrentFuncletPad = nullptr;
 
+  class CallLifetimeEnd final : public EHScopeStack::Cleanup {
+    llvm::Value *Addr;
+    llvm::Value *Size;
+
+  public:
+    CallLifetimeEnd(Address addr, llvm::Value *size)
+        : Addr(addr.getPointer()), Size(size) {}
+
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
+      CGF.EmitLifetimeEnd(Size, Addr);
+    }
+  };
+
   /// Header for data within LifetimeExtendedCleanupStack.
   struct LifetimeExtendedCleanupHeader {
     /// The size of the following cleanup object.
@@ -638,6 +657,11 @@
         ForceCleanup();
     }
 
+    /// Checks if the global variable is captured in the current function.
+    bool isGlobalVarCaptured(const VarDecl *VD) const {
+      return !VD->isLocalVarDeclOrParm() && CGF.LocalDeclMap.count(VD) > 0;
+    }
+
   private:
     /// Copy all the entries in the source map over the corresponding
     /// entries in the destination, which must exist.
@@ -952,7 +976,7 @@
 public:
   /// Increment the profiler's counter for the given statement.
   void incrementProfileCounter(const Stmt *S) {
-    if (CGM.getCodeGenOpts().ProfileInstrGenerate)
+    if (CGM.getCodeGenOpts().hasProfileClangInstr())
       PGO.emitCounterIncrement(Builder, S);
     PGO.setCurrentStmt(S);
   }
@@ -1054,6 +1078,61 @@
     CharUnits OldCXXThisAlignment;
   };
 
+  class InlinedInheritingConstructorScope {
+  public:
+    InlinedInheritingConstructorScope(CodeGenFunction &CGF, GlobalDecl GD)
+        : CGF(CGF), OldCurGD(CGF.CurGD), OldCurFuncDecl(CGF.CurFuncDecl),
+          OldCurCodeDecl(CGF.CurCodeDecl),
+          OldCXXABIThisDecl(CGF.CXXABIThisDecl),
+          OldCXXABIThisValue(CGF.CXXABIThisValue),
+          OldCXXThisValue(CGF.CXXThisValue),
+          OldCXXABIThisAlignment(CGF.CXXABIThisAlignment),
+          OldCXXThisAlignment(CGF.CXXThisAlignment),
+          OldReturnValue(CGF.ReturnValue), OldFnRetTy(CGF.FnRetTy),
+          OldCXXInheritedCtorInitExprArgs(
+              std::move(CGF.CXXInheritedCtorInitExprArgs)) {
+      CGF.CurGD = GD;
+      CGF.CurFuncDecl = CGF.CurCodeDecl =
+          cast<CXXConstructorDecl>(GD.getDecl());
+      CGF.CXXABIThisDecl = nullptr;
+      CGF.CXXABIThisValue = nullptr;
+      CGF.CXXThisValue = nullptr;
+      CGF.CXXABIThisAlignment = CharUnits();
+      CGF.CXXThisAlignment = CharUnits();
+      CGF.ReturnValue = Address::invalid();
+      CGF.FnRetTy = QualType();
+      CGF.CXXInheritedCtorInitExprArgs.clear();
+    }
+    ~InlinedInheritingConstructorScope() {
+      CGF.CurGD = OldCurGD;
+      CGF.CurFuncDecl = OldCurFuncDecl;
+      CGF.CurCodeDecl = OldCurCodeDecl;
+      CGF.CXXABIThisDecl = OldCXXABIThisDecl;
+      CGF.CXXABIThisValue = OldCXXABIThisValue;
+      CGF.CXXThisValue = OldCXXThisValue;
+      CGF.CXXABIThisAlignment = OldCXXABIThisAlignment;
+      CGF.CXXThisAlignment = OldCXXThisAlignment;
+      CGF.ReturnValue = OldReturnValue;
+      CGF.FnRetTy = OldFnRetTy;
+      CGF.CXXInheritedCtorInitExprArgs =
+          std::move(OldCXXInheritedCtorInitExprArgs);
+    }
+
+  private:
+    CodeGenFunction &CGF;
+    GlobalDecl OldCurGD;
+    const Decl *OldCurFuncDecl;
+    const Decl *OldCurCodeDecl;
+    ImplicitParamDecl *OldCXXABIThisDecl;
+    llvm::Value *OldCXXABIThisValue;
+    llvm::Value *OldCXXThisValue;
+    CharUnits OldCXXABIThisAlignment;
+    CharUnits OldCXXThisAlignment;
+    Address OldReturnValue;
+    QualType OldFnRetTy;
+    CallArgList OldCXXInheritedCtorInitExprArgs;
+  };
+
 private:
   /// CXXThisDecl - When generating code for a C++ member function,
   /// this will hold the implicit 'this' declaration.
@@ -1067,6 +1146,10 @@
   /// this expression.
   Address CXXDefaultInitExprThis = Address::invalid();
 
+  /// The values of function arguments to use when evaluating
+  /// CXXInheritedCtorInitExprs within this context.
+  CallArgList CXXInheritedCtorInitExprArgs;
+
   /// CXXStructorImplicitParamDecl - When generating code for a constructor or
   /// destructor, this will hold the implicit argument (e.g. VTT).
   ImplicitParamDecl *CXXStructorImplicitParamDecl;
@@ -1150,10 +1233,7 @@
     return getInvokeDestImpl();
   }
 
-  bool currentFunctionUsesSEHTry() const {
-    const auto *FD = dyn_cast_or_null<FunctionDecl>(CurCodeDecl);
-    return FD && FD->usesSEHTry();
-  }
+  bool currentFunctionUsesSEHTry() const { return CurSEHParent != nullptr; }
 
   const TargetInfo &getTarget() const { return Target; }
   llvm::LLVMContext &getLLVMContext() { return CGM.getLLVMContext(); }
@@ -1293,6 +1373,8 @@
 
   const BlockByrefInfo &getBlockByrefInfo(const VarDecl *var);
 
+  QualType BuildFunctionArgList(GlobalDecl GD, FunctionArgList &Args);
+
   void GenerateCode(GlobalDecl GD, llvm::Function *Fn,
                     const CGFunctionInfo &FnInfo);
   /// \brief Emit code for the start of a function.
@@ -1389,6 +1471,7 @@
     CFITCK_NVCall,
     CFITCK_DerivedCast,
     CFITCK_UnrelatedCast,
+    CFITCK_ICall,
   };
 
   /// \brief Derived is the presumed address of an object of type T after a
@@ -1400,14 +1483,29 @@
 
   /// EmitVTablePtrCheckForCall - Virtual method MD is being called via VTable.
   /// If vptr CFI is enabled, emit a check that VTable is valid.
-  void EmitVTablePtrCheckForCall(const CXXMethodDecl *MD, llvm::Value *VTable,
+  void EmitVTablePtrCheckForCall(const CXXRecordDecl *RD, llvm::Value *VTable,
                                  CFITypeCheckKind TCK, SourceLocation Loc);
 
   /// EmitVTablePtrCheck - Emit a check that VTable is a valid virtual table for
-  /// RD using llvm.bitset.test.
+  /// RD using llvm.type.test.
   void EmitVTablePtrCheck(const CXXRecordDecl *RD, llvm::Value *VTable,
                           CFITypeCheckKind TCK, SourceLocation Loc);
 
+  /// If whole-program virtual table optimization is enabled, emit an assumption
+  /// that VTable is a member of RD's type identifier. Or, if vptr CFI is
+  /// enabled, emit a check that VTable is a member of RD's type identifier.
+  void EmitTypeMetadataCodeForVCall(const CXXRecordDecl *RD,
+                                    llvm::Value *VTable, SourceLocation Loc);
+
+  /// Returns whether we should perform a type checked load when loading a
+  /// virtual function for virtual calls to members of RD. This is generally
+  /// true when both vcall CFI and whole-program-vtables are enabled.
+  bool ShouldEmitVTableTypeCheckedLoad(const CXXRecordDecl *RD);
+
+  /// Emit a type checked load from the given vtable.
+  llvm::Value *EmitVTableTypeCheckedLoad(const CXXRecordDecl *RD, llvm::Value *VTable,
+                                         uint64_t VTableByteOffset);
+
   /// CanDevirtualizeMemberFunctionCalls - Checks whether virtual calls on given
   /// expr can be devirtualized.
   bool CanDevirtualizeMemberFunctionCall(const Expr *Base,
@@ -1423,6 +1521,10 @@
   /// instrumented with __cyg_profile_func_* calls
   bool ShouldInstrumentFunction();
 
+  /// ShouldXRayInstrumentFunction - Return true if the current function should
+  /// be instrumented with XRay nop sleds.
+  bool ShouldXRayInstrumentFunction() const;
+
   /// EmitFunctionInstrumentation - Emit LLVM code to call the specified
   /// instrumentation function with the current function and the call site, if
   /// function instrumentation is enabled.
@@ -1573,6 +1675,10 @@
                               AlignmentSource *Source = nullptr);
   LValue EmitLoadOfReferenceLValue(Address Ref, const ReferenceType *RefTy);
 
+  Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy,
+                            AlignmentSource *Source = nullptr);
+  LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy);
+
   /// CreateTempAlloca - This creates a alloca and inserts it into the entry
   /// block. The caller is responsible for setting an appropriate alignment on
   /// the alloca.
@@ -1846,10 +1952,32 @@
   void EmitDelegatingCXXConstructorCall(const CXXConstructorDecl *Ctor,
                                         const FunctionArgList &Args);
 
+  /// Emit a call to an inheriting constructor (that is, one that invokes a
+  /// constructor inherited from a base class) by inlining its definition. This
+  /// is necessary if the ABI does not support forwarding the arguments to the
+  /// base class constructor (because they're variadic or similar).
+  void EmitInlinedInheritingCXXConstructorCall(const CXXConstructorDecl *Ctor,
+                                               CXXCtorType CtorType,
+                                               bool ForVirtualBase,
+                                               bool Delegating,
+                                               CallArgList &Args);
+
+  /// Emit a call to a constructor inherited from a base class, passing the
+  /// current constructor's arguments along unmodified (without even making
+  /// a copy).
+  void EmitInheritedCXXConstructorCall(const CXXConstructorDecl *D,
+                                       bool ForVirtualBase, Address This,
+                                       bool InheritedFromVBase,
+                                       const CXXInheritedCtorInitExpr *E);
+
   void EmitCXXConstructorCall(const CXXConstructorDecl *D, CXXCtorType Type,
                               bool ForVirtualBase, bool Delegating,
                               Address This, const CXXConstructExpr *E);
 
+  void EmitCXXConstructorCall(const CXXConstructorDecl *D, CXXCtorType Type,
+                              bool ForVirtualBase, bool Delegating,
+                              Address This, CallArgList &Args);
+
   /// Emit assumption load for all bases. Requires to be be called only on
   /// most-derived class and not under construction of the object.
   void EmitVTableAssumptionLoads(const CXXRecordDecl *ClassDecl, Address This);
@@ -1862,7 +1990,7 @@
                                       const CXXConstructExpr *E);
 
   void EmitCXXAggrConstructorCall(const CXXConstructorDecl *D,
-                                  const ConstantArrayType *ArrayTy,
+                                  const ArrayType *ArrayTy,
                                   Address ArrayPtr,
                                   const CXXConstructExpr *E,
                                   bool ZeroInitialization = false);
@@ -2205,6 +2333,8 @@
   void EmitCXXForRangeStmt(const CXXForRangeStmt &S,
                            ArrayRef<const Attr *> Attrs = None);
 
+  /// Returns calculated size of the specified type.
+  llvm::Value *getTypeSize(QualType Ty);
   LValue InitCapturedStruct(const CapturedStmt &S);
   llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);
   llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);
@@ -2212,6 +2342,8 @@
   llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S);
   void GenerateOpenMPCapturedVars(const CapturedStmt &S,
                                   SmallVectorImpl<llvm::Value *> &CapturedVars);
+  void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy,
+                          SourceLocation Loc);
   /// \brief Perform element by element copying of arrays with type \a
   /// OriginalType from \a SrcAddr to \a DestAddr using copying procedure
   /// generated by \a CopyGen.
@@ -2260,6 +2392,9 @@
                                  OMPPrivateScope &PrivateScope);
   void EmitOMPPrivateClause(const OMPExecutableDirective &D,
                             OMPPrivateScope &PrivateScope);
+  void EmitOMPUseDevicePtrClause(
+      const OMPClause &C, OMPPrivateScope &PrivateScope,
+      const llvm::DenseMap<const ValueDecl *, Address> &CaptureDeviceAddrMap);
   /// \brief Emit code for copyin clause in \a D directive. The next code is
   /// generated at the start of outlined functions for directives:
   /// \code
@@ -2293,7 +2428,17 @@
   /// it is the last iteration of the loop code in associated directive, or to
   /// 'i1 false' otherwise. If this item is nullptr, no final check is required.
   void EmitOMPLastprivateClauseFinal(const OMPExecutableDirective &D,
+                                     bool NoFinals,
                                      llvm::Value *IsLastIterCond = nullptr);
+  /// Emit initial code for linear clauses.
+  void EmitOMPLinearClause(const OMPLoopDirective &D,
+                           CodeGenFunction::OMPPrivateScope &PrivateScope);
+  /// Emit final code for linear clauses.
+  /// \param CondGen Optional conditional code for final part of codegen for
+  /// linear clause.
+  void EmitOMPLinearClauseFinal(
+      const OMPLoopDirective &D,
+      const llvm::function_ref<llvm::Value *(CodeGenFunction &)> &CondGen);
   /// \brief Emit initial code for reduction variables. Creates reduction copies
   /// and initializes them with the values according to OpenMP standard.
   ///
@@ -2314,6 +2459,14 @@
   /// \param D Directive (possibly) with the 'linear' clause.
   void EmitOMPLinearClauseInit(const OMPLoopDirective &D);
 
+  typedef const llvm::function_ref<void(CodeGenFunction & /*CGF*/,
+                                        llvm::Value * /*OutlinedFn*/,
+                                        const OMPTaskDataTy & /*Data*/)>
+      TaskGenTy;
+  void EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
+                                 const RegionCodeGenTy &BodyGen,
+                                 const TaskGenTy &TaskGen, OMPTaskDataTy &Data);
+
   void EmitOMPParallelDirective(const OMPParallelDirective &S);
   void EmitOMPSimdDirective(const OMPSimdDirective &S);
   void EmitOMPForDirective(const OMPForDirective &S);
@@ -2336,14 +2489,38 @@
   void EmitOMPAtomicDirective(const OMPAtomicDirective &S);
   void EmitOMPTargetDirective(const OMPTargetDirective &S);
   void EmitOMPTargetDataDirective(const OMPTargetDataDirective &S);
+  void EmitOMPTargetEnterDataDirective(const OMPTargetEnterDataDirective &S);
+  void EmitOMPTargetExitDataDirective(const OMPTargetExitDataDirective &S);
+  void EmitOMPTargetUpdateDirective(const OMPTargetUpdateDirective &S);
+  void EmitOMPTargetParallelDirective(const OMPTargetParallelDirective &S);
+  void
+  EmitOMPTargetParallelForDirective(const OMPTargetParallelForDirective &S);
   void EmitOMPTeamsDirective(const OMPTeamsDirective &S);
   void
   EmitOMPCancellationPointDirective(const OMPCancellationPointDirective &S);
   void EmitOMPCancelDirective(const OMPCancelDirective &S);
+  void EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S);
   void EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S);
   void EmitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective &S);
   void EmitOMPDistributeDirective(const OMPDistributeDirective &S);
+  void EmitOMPDistributeLoop(const OMPDistributeDirective &S);
+  void EmitOMPDistributeParallelForDirective(
+      const OMPDistributeParallelForDirective &S);
+  void EmitOMPDistributeParallelForSimdDirective(
+      const OMPDistributeParallelForSimdDirective &S);
+  void EmitOMPDistributeSimdDirective(const OMPDistributeSimdDirective &S);
+  void EmitOMPTargetParallelForSimdDirective(
+      const OMPTargetParallelForSimdDirective &S);
+  void EmitOMPTargetSimdDirective(const OMPTargetSimdDirective &S);
+  void EmitOMPTeamsDistributeDirective(const OMPTeamsDistributeDirective &S);
 
+  /// Emit outlined function for the target directive.
+  static std::pair<llvm::Function * /*OutlinedFn*/,
+                   llvm::Constant * /*OutlinedFnID*/>
+  EmitOMPTargetDirectiveOutlinedFunction(CodeGenModule &CGM,
+                                         const OMPTargetDirective &S,
+                                         StringRef ParentName,
+                                         bool IsOffloadEntry);
   /// \brief Emit inner loop of the worksharing/simd construct.
   ///
   /// \param S Directive, for which the inner loop must be emitted.
@@ -2361,24 +2538,35 @@
       const llvm::function_ref<void(CodeGenFunction &)> &PostIncGen);
 
   JumpDest getOMPCancelDestination(OpenMPDirectiveKind Kind);
+  /// Emit initial code for loop counters of loop-based directives.
+  void EmitOMPPrivateLoopCounters(const OMPLoopDirective &S,
+                                  OMPPrivateScope &LoopScope);
 
 private:
-
   /// Helpers for the OpenMP loop directives.
   void EmitOMPLoopBody(const OMPLoopDirective &D, JumpDest LoopExit);
   void EmitOMPSimdInit(const OMPLoopDirective &D, bool IsMonotonic = false);
-  void EmitOMPSimdFinal(const OMPLoopDirective &D);
+  void EmitOMPSimdFinal(
+      const OMPLoopDirective &D,
+      const llvm::function_ref<llvm::Value *(CodeGenFunction &)> &CondGen);
   /// \brief Emit code for the worksharing loop-based directive.
   /// \return true, if this construct has any lastprivate clause, false -
   /// otherwise.
   bool EmitOMPWorksharingLoop(const OMPLoopDirective &S);
-  void EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind,
+  void EmitOMPOuterLoop(bool IsMonotonic, bool DynamicOrOrdered,
+      const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered,
+      Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk);
+  void EmitOMPForOuterLoop(const OpenMPScheduleTy &ScheduleKind,
                            bool IsMonotonic, const OMPLoopDirective &S,
                            OMPPrivateScope &LoopScope, bool Ordered, Address LB,
                            Address UB, Address ST, Address IL,
                            llvm::Value *Chunk);
+  void EmitOMPDistributeOuterLoop(
+      OpenMPDistScheduleClauseKind ScheduleKind,
+      const OMPDistributeDirective &S, OMPPrivateScope &LoopScope,
+      Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk);
   /// \brief Emit code for sections directive.
-  OpenMPDirectiveKind EmitSections(const OMPExecutableDirective &S);
+  void EmitSections(const OMPExecutableDirective &S);
 
 public:
 
@@ -2429,7 +2617,6 @@
   void EmitAtomicInit(Expr *E, LValue lvalue);
 
   bool LValueIsSuitableForInlineAtomic(LValue Src);
-  bool typeIsSuitableForInlineAtomic(QualType Ty, bool IsVolatile) const;
 
   RValue EmitAtomicLoad(LValue LV, SourceLocation SL,
                         AggValueSlot Slot = AggValueSlot::ignored());
@@ -2445,8 +2632,10 @@
 
   std::pair<RValue, llvm::Value *> EmitAtomicCompareExchange(
       LValue Obj, RValue Expected, RValue Desired, SourceLocation Loc,
-      llvm::AtomicOrdering Success = llvm::SequentiallyConsistent,
-      llvm::AtomicOrdering Failure = llvm::SequentiallyConsistent,
+      llvm::AtomicOrdering Success =
+          llvm::AtomicOrdering::SequentiallyConsistent,
+      llvm::AtomicOrdering Failure =
+          llvm::AtomicOrdering::SequentiallyConsistent,
       bool IsWeak = false, AggValueSlot Slot = AggValueSlot::ignored());
 
   void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO,
@@ -2679,11 +2868,10 @@
                               ReturnValueSlot ReturnValue, llvm::Value *This,
                               llvm::Value *ImplicitParam,
                               QualType ImplicitParamTy, const CallExpr *E);
-  RValue EmitCXXStructorCall(const CXXMethodDecl *MD, llvm::Value *Callee,
-                             ReturnValueSlot ReturnValue, llvm::Value *This,
-                             llvm::Value *ImplicitParam,
-                             QualType ImplicitParamTy, const CallExpr *E,
-                             StructorType Type);
+  RValue EmitCXXDestructorCall(const CXXDestructorDecl *DD, llvm::Value *Callee,
+                               llvm::Value *This, llvm::Value *ImplicitParam,
+                               QualType ImplicitParamTy, const CallExpr *E,
+                               StructorType Type);
   RValue EmitCXXMemberCallExpr(const CXXMemberCallExpr *E,
                                ReturnValueSlot ReturnValue);
   RValue EmitCXXMemberOrOperatorMemberCallExpr(const CallExpr *CE,
@@ -2707,6 +2895,8 @@
   RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
                                 ReturnValueSlot ReturnValue);
 
+  RValue EmitCUDADevicePrintfCallExpr(const CallExpr *E,
+                                      ReturnValueSlot ReturnValue);
 
   RValue EmitBuiltinExpr(const FunctionDecl *FD,
                          unsigned BuiltinID, const CallExpr *E,
@@ -2982,13 +3172,15 @@
   /// ConstantFoldsToSimpleInteger - If the specified expression does not fold
   /// to a constant, or if it does but contains a label, return false.  If it
   /// constant folds return true and set the boolean result in Result.
-  bool ConstantFoldsToSimpleInteger(const Expr *Cond, bool &Result);
+  bool ConstantFoldsToSimpleInteger(const Expr *Cond, bool &Result,
+                                    bool AllowLabels = false);
 
   /// ConstantFoldsToSimpleInteger - If the specified expression does not fold
   /// to a constant, or if it does but contains a label, return false.  If it
   /// constant folds return true and set the folded value.
-  bool ConstantFoldsToSimpleInteger(const Expr *Cond, llvm::APSInt &Result);
-  
+  bool ConstantFoldsToSimpleInteger(const Expr *Cond, llvm::APSInt &Result,
+                                    bool AllowLabels = false);
+
   /// EmitBranchOnBoolExpr - Emit a branch on a boolean condition (e.g. for an
   /// if statement) to the specified blocks.  Based on the condition, this might
   /// try to simplify the codegen of the conditional based on the branch.
@@ -3018,8 +3210,9 @@
 
   /// \brief Emit a slow path cross-DSO CFI check which calls __cfi_slowpath
   /// if Cond if false.
-  void EmitCfiSlowPathCheck(llvm::Value *Cond, llvm::ConstantInt *TypeId,
-                            llvm::Value *Ptr);
+  void EmitCfiSlowPathCheck(SanitizerMask Kind, llvm::Value *Cond,
+                            llvm::ConstantInt *TypeId, llvm::Value *Ptr,
+                            ArrayRef<llvm::Constant *> StaticArgs);
 
   /// \brief Create a basic block that will call the trap intrinsic, and emit a
   /// conditional branch to it, for the -ftrapv checks.
@@ -3029,6 +3222,9 @@
   /// "trap-func-name" if specified.
   llvm::CallInst *EmitTrapCall(llvm::Intrinsic::ID IntrID);
 
+  /// \brief Emit a cross-DSO CFI failure handling function.
+  void EmitCfiCheckFail();
+
   /// \brief Create a check for a function parameter that may potentially be
   /// declared as non-null.
   void EmitNonNullArgCheck(RValue RV, QualType ArgType, SourceLocation ArgLoc,
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 32aa194..7383424 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -20,6 +20,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "CodeGenTBAA.h"
@@ -42,7 +43,6 @@
 #include "clang/Basic/Version.h"
 #include "clang/Frontend/CodeGenOptions.h"
 #include "clang/Sema/SemaDiagnostic.h"
-#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
@@ -86,17 +86,8 @@
     : Context(C), LangOpts(C.getLangOpts()), HeaderSearchOpts(HSO),
       PreprocessorOpts(PPO), CodeGenOpts(CGO), TheModule(M), Diags(diags),
       Target(C.getTargetInfo()), ABI(createCXXABI(*this)),
-      VMContext(M.getContext()), TBAA(nullptr), TheTargetCodeGenInfo(nullptr),
-      Types(*this), VTables(*this), ObjCRuntime(nullptr),
-      OpenCLRuntime(nullptr), OpenMPRuntime(nullptr), CUDARuntime(nullptr),
-      DebugInfo(nullptr), ObjCData(nullptr),
-      NoObjCARCExceptionsMetadata(nullptr), PGOReader(nullptr),
-      CFConstantStringClassRef(nullptr), ConstantStringClassRef(nullptr),
-      NSConstantStringType(nullptr), NSConcreteGlobalBlock(nullptr),
-      NSConcreteStackBlock(nullptr), BlockObjectAssign(nullptr),
-      BlockObjectDispose(nullptr), BlockDescriptorType(nullptr),
-      GenericBlockLiteralType(nullptr), LifetimeStartFn(nullptr),
-      LifetimeEndFn(nullptr), SanitizerMD(new SanitizerMetadata(*this)) {
+      VMContext(M.getContext()), Types(*this), VTables(*this),
+      SanitizerMD(new SanitizerMetadata(*this)) {
 
   // Initialize the type cache.
   llvm::LLVMContext &LLVMContext = M.getContext();
@@ -132,28 +123,30 @@
   // Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0.
   if (LangOpts.Sanitize.has(SanitizerKind::Thread) ||
       (!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0))
-    TBAA = new CodeGenTBAA(Context, VMContext, CodeGenOpts, getLangOpts(),
-                           getCXXABI().getMangleContext());
+    TBAA.reset(new CodeGenTBAA(Context, VMContext, CodeGenOpts, getLangOpts(),
+                               getCXXABI().getMangleContext()));
 
   // If debug info or coverage generation is enabled, create the CGDebugInfo
   // object.
   if (CodeGenOpts.getDebugInfo() != codegenoptions::NoDebugInfo ||
       CodeGenOpts.EmitGcovArcs || CodeGenOpts.EmitGcovNotes)
-    DebugInfo = new CGDebugInfo(*this);
+    DebugInfo.reset(new CGDebugInfo(*this));
 
   Block.GlobalUniqueCount = 0;
 
   if (C.getLangOpts().ObjC1)
-    ObjCData = new ObjCEntrypoints();
+    ObjCData.reset(new ObjCEntrypoints());
 
-  if (!CodeGenOpts.InstrProfileInput.empty()) {
-    auto ReaderOrErr =
-        llvm::IndexedInstrProfReader::create(CodeGenOpts.InstrProfileInput);
-    if (std::error_code EC = ReaderOrErr.getError()) {
+  if (CodeGenOpts.hasProfileClangUse()) {
+    auto ReaderOrErr = llvm::IndexedInstrProfReader::create(
+        CodeGenOpts.ProfileInstrumentUsePath);
+    if (auto E = ReaderOrErr.takeError()) {
       unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
                                               "Could not read profile %0: %1");
-      getDiags().Report(DiagID) << CodeGenOpts.InstrProfileInput
-                                << EC.message();
+      llvm::handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EI) {
+        getDiags().Report(DiagID) << CodeGenOpts.ProfileInstrumentUsePath
+                                  << EI.message();
+      });
     } else
       PGOReader = std::move(ReaderOrErr.get());
   }
@@ -164,16 +157,7 @@
     CoverageMapping.reset(new CoverageMappingModuleGen(*this, *CoverageInfo));
 }
 
-CodeGenModule::~CodeGenModule() {
-  delete ObjCRuntime;
-  delete OpenCLRuntime;
-  delete OpenMPRuntime;
-  delete CUDARuntime;
-  delete TheTargetCodeGenInfo;
-  delete TBAA;
-  delete DebugInfo;
-  delete ObjCData;
-}
+CodeGenModule::~CodeGenModule() {}
 
 void CodeGenModule::createObjCRuntime() {
   // This is just isGNUFamily(), but we want to force implementors of
@@ -182,29 +166,42 @@
   case ObjCRuntime::GNUstep:
   case ObjCRuntime::GCC:
   case ObjCRuntime::ObjFW:
-    ObjCRuntime = CreateGNUObjCRuntime(*this);
+    ObjCRuntime.reset(CreateGNUObjCRuntime(*this));
     return;
 
   case ObjCRuntime::FragileMacOSX:
   case ObjCRuntime::MacOSX:
   case ObjCRuntime::iOS:
   case ObjCRuntime::WatchOS:
-    ObjCRuntime = CreateMacObjCRuntime(*this);
+    ObjCRuntime.reset(CreateMacObjCRuntime(*this));
     return;
   }
   llvm_unreachable("bad runtime kind");
 }
 
 void CodeGenModule::createOpenCLRuntime() {
-  OpenCLRuntime = new CGOpenCLRuntime(*this);
+  OpenCLRuntime.reset(new CGOpenCLRuntime(*this));
 }
 
 void CodeGenModule::createOpenMPRuntime() {
-  OpenMPRuntime = new CGOpenMPRuntime(*this);
+  // Select a specialized code generation class based on the target, if any.
+  // If it does not exist use the default implementation.
+  switch (getTarget().getTriple().getArch()) {
+
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    assert(getLangOpts().OpenMPIsDevice &&
+           "OpenMP NVPTX is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
+    break;
+  default:
+    OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
+    break;
+  }
 }
 
 void CodeGenModule::createCUDARuntime() {
-  CUDARuntime = CreateNVCUDARuntime(*this);
+  CUDARuntime.reset(CreateNVCUDARuntime(*this));
 }
 
 void CodeGenModule::addReplacement(StringRef Name, llvm::Constant *C) {
@@ -258,20 +255,21 @@
 
 // This is only used in aliases that we created and we know they have a
 // linear structure.
-static const llvm::GlobalObject *getAliasedGlobal(const llvm::GlobalAlias &GA) {
-  llvm::SmallPtrSet<const llvm::GlobalAlias*, 4> Visited;
-  const llvm::Constant *C = &GA;
+static const llvm::GlobalObject *getAliasedGlobal(
+    const llvm::GlobalIndirectSymbol &GIS) {
+  llvm::SmallPtrSet<const llvm::GlobalIndirectSymbol*, 4> Visited;
+  const llvm::Constant *C = &GIS;
   for (;;) {
     C = C->stripPointerCasts();
     if (auto *GO = dyn_cast<llvm::GlobalObject>(C))
       return GO;
     // stripPointerCasts will not walk over weak aliases.
-    auto *GA2 = dyn_cast<llvm::GlobalAlias>(C);
-    if (!GA2)
+    auto *GIS2 = dyn_cast<llvm::GlobalIndirectSymbol>(C);
+    if (!GIS2)
       return nullptr;
-    if (!Visited.insert(GA2).second)
+    if (!Visited.insert(GIS2).second)
       return nullptr;
-    C = GA2->getAliasee();
+    C = GIS2->getIndirectSymbol();
   }
 }
 
@@ -283,20 +281,35 @@
   DiagnosticsEngine &Diags = getDiags();
   for (const GlobalDecl &GD : Aliases) {
     const auto *D = cast<ValueDecl>(GD.getDecl());
-    const AliasAttr *AA = D->getAttr<AliasAttr>();
+    SourceLocation Location;
+    bool IsIFunc = D->hasAttr<IFuncAttr>();
+    if (const Attr *A = D->getDefiningAttr())
+      Location = A->getLocation();
+    else
+      llvm_unreachable("Not an alias or ifunc?");
     StringRef MangledName = getMangledName(GD);
     llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
-    auto *Alias = cast<llvm::GlobalAlias>(Entry);
+    auto *Alias  = cast<llvm::GlobalIndirectSymbol>(Entry);
     const llvm::GlobalValue *GV = getAliasedGlobal(*Alias);
     if (!GV) {
       Error = true;
-      Diags.Report(AA->getLocation(), diag::err_cyclic_alias);
+      Diags.Report(Location, diag::err_cyclic_alias) << IsIFunc;
     } else if (GV->isDeclaration()) {
       Error = true;
-      Diags.Report(AA->getLocation(), diag::err_alias_to_undefined);
+      Diags.Report(Location, diag::err_alias_to_undefined)
+          << IsIFunc << IsIFunc;
+    } else if (IsIFunc) {
+      // Check resolver function type.
+      llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(
+          GV->getType()->getPointerElementType());
+      assert(FTy);
+      if (!FTy->getReturnType()->isPointerTy())
+        Diags.Report(Location, diag::err_ifunc_resolver_return);
+      if (FTy->getNumParams())
+        Diags.Report(Location, diag::err_ifunc_resolver_params);
     }
 
-    llvm::Constant *Aliasee = Alias->getAliasee();
+    llvm::Constant *Aliasee = Alias->getIndirectSymbol();
     llvm::GlobalValue *AliaseeGV;
     if (auto CE = dyn_cast<llvm::ConstantExpr>(Aliasee))
       AliaseeGV = cast<llvm::GlobalValue>(CE->getOperand(0));
@@ -307,7 +320,7 @@
       StringRef AliasSection = SA->getName();
       if (AliasSection != AliaseeGV->getSection())
         Diags.Report(SA->getLocation(), diag::warn_alias_with_section)
-            << AliasSection;
+            << AliasSection << IsIFunc << IsIFunc;
     }
 
     // We have to handle alias to weak aliases in here. LLVM itself disallows
@@ -315,13 +328,13 @@
     // compatibility with gcc we implement it by just pointing the alias
     // to its aliasee's aliasee. We also warn, since the user is probably
     // expecting the link to be weak.
-    if (auto GA = dyn_cast<llvm::GlobalAlias>(AliaseeGV)) {
-      if (GA->mayBeOverridden()) {
-        Diags.Report(AA->getLocation(), diag::warn_alias_to_weak_alias)
-            << GV->getName() << GA->getName();
+    if (auto GA = dyn_cast<llvm::GlobalIndirectSymbol>(AliaseeGV)) {
+      if (GA->isInterposable()) {
+        Diags.Report(Location, diag::warn_alias_to_weak_alias)
+            << GV->getName() << GA->getName() << IsIFunc;
         Aliasee = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-            GA->getAliasee(), Alias->getType());
-        Alias->setAliasee(Aliasee);
+            GA->getIndirectSymbol(), Alias->getType());
+        Alias->setIndirectSymbol(Aliasee);
       }
     }
   }
@@ -331,7 +344,7 @@
   for (const GlobalDecl &GD : Aliases) {
     StringRef MangledName = getMangledName(GD);
     llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
-    auto *Alias = cast<llvm::GlobalAlias>(Entry);
+    auto *Alias = cast<llvm::GlobalIndirectSymbol>(Entry);
     Alias->replaceAllUsesWith(llvm::UndefValue::get(Alias->getType()));
     Alias->eraseFromParent();
   }
@@ -379,7 +392,7 @@
             OpenMPRuntime->emitRegistrationFunction())
       AddGlobalCtor(OpenMPRegistrationFunction, 0);
   if (PGOReader) {
-    getModule().setMaximumFunctionCount(PGOReader->getMaximumFunctionCount());
+    getModule().setProfileSummary(PGOReader->getSummary().getMD(VMContext));
     if (PGOStats.hasDiagnostics())
       PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
   }
@@ -390,6 +403,8 @@
   EmitDeferredUnusedCoverageMappings();
   if (CoverageMapping)
     CoverageMapping->emit();
+  if (CodeGenOpts.SanitizeCfiCrossDso)
+    CodeGenFunction(*this).EmitCfiCheckFail();
   emitLLVMUsed();
   if (SanStats)
     SanStats->finish();
@@ -453,16 +468,19 @@
     getModule().addModuleFlag(llvm::Module::Override, "Cross-DSO CFI", 1);
   }
 
-  if (uint32_t PLevel = Context.getLangOpts().PICLevel) {
-    llvm::PICLevel::Level PL = llvm::PICLevel::Default;
-    switch (PLevel) {
-    case 0: break;
-    case 1: PL = llvm::PICLevel::Small; break;
-    case 2: PL = llvm::PICLevel::Large; break;
-    default: llvm_unreachable("Invalid PIC Level");
-    }
+  if (LangOpts.CUDAIsDevice && getTarget().getTriple().isNVPTX()) {
+    // Indicate whether __nvvm_reflect should be configured to flush denormal
+    // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
+    // property.)
+    getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
+                              LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0);
+  }
 
-    getModule().setPICLevel(PL);
+  if (uint32_t PLevel = Context.getLangOpts().PICLevel) {
+    assert(PLevel < 3 && "Invalid PIC Level");
+    getModule().setPICLevel(static_cast<llvm::PICLevel::Level>(PLevel));
+    if (Context.getLangOpts().PIE)
+      getModule().setPIELevel(static_cast<llvm::PIELevel::Level>(PLevel));
   }
 
   SimplifyPersonality();
@@ -479,6 +497,16 @@
   EmitVersionIdentMetadata();
 
   EmitTargetMetadata();
+
+  // Emit any deferred diagnostics gathered during codegen.  We didn't emit them
+  // when we first discovered them because that would have halted codegen,
+  // preventing us from gathering other deferred diags.
+  for (const PartialDiagnosticAt &DiagAt : DeferredDiags) {
+    SourceLocation Loc = DiagAt.first;
+    const PartialDiagnostic &PD = DiagAt.second;
+    DiagnosticBuilder Builder(getDiags().Report(Loc, PD.getDiagID()));
+    PD.Emit(Builder);
+  }
 }
 
 void CodeGenModule::UpdateCompletedType(const TagDecl *TD) {
@@ -486,6 +514,11 @@
   Types.UpdateCompletedType(TD);
 }
 
+void CodeGenModule::RefreshTypeCacheForClass(const CXXRecordDecl *RD) {
+  // Forward to Types; presumably this refreshes its cached conversion of RD.
+  Types.RefreshTypeCacheForClass(RD);
+}
+
 llvm::MDNode *CodeGenModule::getTBAAInfo(QualType QTy) {
   if (!TBAA)
     return nullptr;
@@ -741,6 +774,15 @@
                                    : llvm::GlobalValue::LinkOnceODRLinkage;
   }
 
+  if (isa<CXXConstructorDecl>(D) &&
+      cast<CXXConstructorDecl>(D)->isInheritingConstructor() &&
+      Context.getTargetInfo().getCXXABI().isMicrosoft()) {
+    // Our approach to inheriting constructors is fundamentally different from
+    // that used by the MS ABI, so keep our inheriting constructor thunks
+    // internal rather than trying to pick an unambiguous mangling for them.
+    return llvm::GlobalValue::InternalLinkage;
+  }
+
   return getLLVMLinkageForDeclarator(D, Linkage, /*isConstantVariable=*/false);
 }
 
@@ -763,8 +805,7 @@
     F->setDLLStorageClass(llvm::GlobalVariable::DefaultStorageClass);
 }
 
-llvm::ConstantInt *
-CodeGenModule::CreateCfiIdForTypeMetadata(llvm::Metadata *MD) {
+llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) {
   llvm::MDString *MDS = dyn_cast<llvm::MDString>(MD);
   if (!MDS) return nullptr;
 
@@ -883,12 +924,6 @@
     F->removeFnAttr(llvm::Attribute::InlineHint);
   }
 
-  if (isa<CXXConstructorDecl>(D) || isa<CXXDestructorDecl>(D))
-    F->setUnnamedAddr(true);
-  else if (const auto *MD = dyn_cast<CXXMethodDecl>(D))
-    if (MD->isVirtual())
-      F->setUnnamedAddr(true);
-
   unsigned alignment = D->getMaxAlignment() / Context.getCharWidth();
   if (alignment)
     F->setAlignment(alignment);
@@ -971,8 +1006,8 @@
   }
 }
 
-void CodeGenModule::CreateFunctionBitSetEntry(const FunctionDecl *FD,
-                                              llvm::Function *F) {
+void CodeGenModule::CreateFunctionTypeMetadata(const FunctionDecl *FD,
+                                               llvm::Function *F) {
   // Only if we are checking indirect calls.
   if (!LangOpts.Sanitize.has(SanitizerKind::CFIICall))
     return;
@@ -993,25 +1028,13 @@
       return;
   }
 
-  llvm::NamedMDNode *BitsetsMD =
-      getModule().getOrInsertNamedMetadata("llvm.bitsets");
-
   llvm::Metadata *MD = CreateMetadataIdentifierForType(FD->getType());
-  llvm::Metadata *BitsetOps[] = {
-      MD, llvm::ConstantAsMetadata::get(F),
-      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int64Ty, 0))};
-  BitsetsMD->addOperand(llvm::MDTuple::get(getLLVMContext(), BitsetOps));
+  F->addTypeMetadata(0, MD);
 
   // Emit a hash-based bit set entry for cross-DSO calls.
-  if (CodeGenOpts.SanitizeCfiCrossDso) {
-    if (auto TypeId = CreateCfiIdForTypeMetadata(MD)) {
-      llvm::Metadata *BitsetOps2[] = {
-          llvm::ConstantAsMetadata::get(TypeId),
-          llvm::ConstantAsMetadata::get(F),
-          llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int64Ty, 0))};
-      BitsetsMD->addOperand(llvm::MDTuple::get(getLLVMContext(), BitsetOps2));
-    }
-  }
+  if (CodeGenOpts.SanitizeCfiCrossDso)
+    if (auto CrossDsoTypeId = CreateCrossDsoCfiTypeId(MD))
+      F->addTypeMetadata(0, llvm::ConstantAsMetadata::get(CrossDsoTypeId));
 }
 
 void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F,
@@ -1050,13 +1073,29 @@
   if (const SectionAttr *SA = FD->getAttr<SectionAttr>())
     F->setSection(SA->getName());
 
-  // A replaceable global allocation function does not act like a builtin by
-  // default, only if it is invoked by a new-expression or delete-expression.
-  if (FD->isReplaceableGlobalAllocationFunction())
+  if (FD->isReplaceableGlobalAllocationFunction()) {
+    // A replaceable global allocation function does not act like a builtin by
+    // default, only if it is invoked by a new-expression or delete-expression.
     F->addAttribute(llvm::AttributeSet::FunctionIndex,
                     llvm::Attribute::NoBuiltin);
 
-  CreateFunctionBitSetEntry(FD, F);
+    // A sane operator new returns a non-aliasing pointer.
+    // FIXME: Also add NonNull attribute to the return value
+    // for the non-nothrow forms?
+    auto Kind = FD->getDeclName().getCXXOverloadedOperator();
+    if (getCodeGenOpts().AssumeSaneOperatorNew &&
+        (Kind == OO_New || Kind == OO_Array_New))
+      F->addAttribute(llvm::AttributeSet::ReturnIndex,
+                      llvm::Attribute::NoAlias);
+  }
+
+  if (isa<CXXConstructorDecl>(FD) || isa<CXXDestructorDecl>(FD))
+    F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+  else if (const auto *MD = dyn_cast<CXXMethodDecl>(FD))
+    if (MD->isVirtual())
+      F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+
+  CreateFunctionTypeMetadata(FD, F);
 }
 
 void CodeGenModule::addUsedGlobal(llvm::GlobalValue *GV) {
@@ -1309,7 +1348,7 @@
       new llvm::GlobalVariable(getModule(), s->getType(), true,
                                llvm::GlobalValue::PrivateLinkage, s, ".str");
   gv->setSection(AnnotationSection);
-  gv->setUnnamedAddr(true);
+  gv->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   AStr = gv;
   return gv;
 }
@@ -1417,6 +1456,12 @@
       // Implicit template instantiations may change linkage if they are later
       // explicitly instantiated, so they should not be emitted eagerly.
       return false;
+  if (const auto *VD = dyn_cast<VarDecl>(Global))
+    if (Context.getInlineVariableDefinitionKind(VD) ==
+        ASTContext::InlineVariableDefinitionKind::WeakUnknown)
+      // A definition of an inline constexpr static data member may change
+      // linkage later if it's redeclared outside the class.
+      return false;
   // If OpenMP is enabled and threadprivates must be generated like TLS, delay
   // codegen for global variables, because they may be marked as threadprivate.
   if (LangOpts.OpenMP && LangOpts.OpenMPUseTLS &&
@@ -1430,12 +1475,12 @@
     const CXXUuidofExpr* E) {
   // Sema has verified that IIDSource has a __declspec(uuid()), and that its
   // well-formed.
-  StringRef Uuid = E->getUuidAsStringRef(Context);
+  StringRef Uuid = E->getUuidStr();
   std::string Name = "_GUID_" + Uuid.lower();
   std::replace(Name.begin(), Name.end(), '-', '_');
 
-  // Contains a 32-bit field.
-  CharUnits Alignment = CharUnits::fromQuantity(4);
+  // The UUID descriptor should be pointer aligned.
+  CharUnits Alignment = CharUnits::fromQuantity(PointerAlignInBytes);
 
   // Look for an existing global.
   if (llvm::GlobalVariable *GV = getModule().getNamedGlobal(Name))
@@ -1496,6 +1541,10 @@
   if (Global->hasAttr<AliasAttr>())
     return EmitAliasDefinition(GD);
 
+  // An IFunc is like an alias whose value is resolved at runtime by a resolver.
+  if (Global->hasAttr<IFuncAttr>())
+    return emitIFuncDefinition(GD);
+
   // If this is CUDA, be selective about which declarations we emit.
   if (LangOpts.CUDA) {
     if (LangOpts.CUDAIsDevice) {
@@ -1505,18 +1554,32 @@
           !Global->hasAttr<CUDASharedAttr>())
         return;
     } else {
-      if (!Global->hasAttr<CUDAHostAttr>() && (
-            Global->hasAttr<CUDADeviceAttr>() ||
-            Global->hasAttr<CUDAConstantAttr>() ||
-            Global->hasAttr<CUDASharedAttr>()))
+      // We need to emit host-side 'shadows' for all global
+      // device-side variables because the CUDA runtime needs their
+      // size and host-side address in order to provide access to
+      // their device-side incarnations.
+
+      // So device-only functions are the only things we skip.
+      if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
+          Global->hasAttr<CUDADeviceAttr>())
         return;
+
+      assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
+             "Expected Variable or Function");
     }
   }
 
-  // If this is OpenMP device, check if it is legal to emit this global
-  // normally.
-  if (OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(GD))
-    return;
+  if (LangOpts.OpenMP) {
+    // If this is OpenMP device, check if it is legal to emit this global
+    // normally.
+    if (OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(GD))
+      return;
+    if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(Global)) {
+      if (MustBeEmitted(Global))
+        EmitOMPDeclareReduction(DRD);
+      return;
+    }
+  }
 
   // Ignore declarations, they will be emitted on their first use.
   if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
@@ -1538,10 +1601,23 @@
   } else {
     const auto *VD = cast<VarDecl>(Global);
     assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
-
-    if (VD->isThisDeclarationADefinition() != VarDecl::Definition &&
-        !Context.isMSStaticDataMemberInlineDefinition(VD))
+    // We need to emit device-side global CUDA variables even if a
+    // variable does not have a definition -- we still need to define
+    // host-side shadow for it.
+    bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
+                           !VD->hasDefinition() &&
+                           (VD->hasAttr<CUDAConstantAttr>() ||
+                            VD->hasAttr<CUDADeviceAttr>());
+    if (!MustEmitForCuda &&
+        VD->isThisDeclarationADefinition() != VarDecl::Definition &&
+        !Context.isMSStaticDataMemberInlineDefinition(VD)) {
+      // If this declaration may have caused an inline variable definition to
+      // change linkage, make sure that it's emitted.
+      if (Context.getInlineVariableDefinitionKind(VD) ==
+          ASTContext::InlineVariableDefinitionKind::Strong)
+        GetAddrOfGlobalVar(VD);
       return;
+    }
   }
 
   // Defer code generation to first use when possible, e.g. if this is an inline
@@ -2019,7 +2095,7 @@
 
       // Check that D is not yet in DiagnosedConflictingDefinitions is required
       // to make sure that we issue an error only once.
-      if (lookupRepresentativeDecl(MangledName, OtherGD) &&
+      if (D && lookupRepresentativeDecl(MangledName, OtherGD) &&
           (D->getCanonicalDecl() != OtherGD.getCanonicalDecl().getDecl()) &&
           (OtherD = dyn_cast<VarDecl>(OtherGD.getDecl())) &&
           OtherD->hasInit() &&
@@ -2238,7 +2314,7 @@
 
 unsigned CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D,
                                                  unsigned AddrSpace) {
-  if (LangOpts.CUDA && LangOpts.CUDAIsDevice) {
+  if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice) {
     if (D->hasAttr<CUDAConstantAttr>())
       AddrSpace = getContext().getTargetAddressSpace(LangAS::cuda_constant);
     else if (D->hasAttr<CUDASharedAttr>())
@@ -2318,8 +2394,13 @@
 /// Pass IsTentative as true if you want to create a tentative definition.
 void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
                                             bool IsTentative) {
-  llvm::Constant *Init = nullptr;
+  // OpenCL global variables of sampler type are translated to function calls,
+  // so there is no need to emit a global variable definition for them.
   QualType ASTTy = D->getType();
+  if (getLangOpts().OpenCL && ASTTy->isSamplerT())
+    return;
+
+  llvm::Constant *Init = nullptr;
   CXXRecordDecl *RD = ASTTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
   bool NeedsGlobalCtor = false;
   bool NeedsGlobalDtor = RD && !RD->hasTrivialDestructor();
@@ -2327,18 +2408,13 @@
   const VarDecl *InitDecl;
   const Expr *InitExpr = D->getAnyInitializer(InitDecl);
 
-  // CUDA E.2.4.1 "__shared__ variables cannot have an initialization as part
-  // of their declaration."
-  if (getLangOpts().CPlusPlus && getLangOpts().CUDAIsDevice
-      && D->hasAttr<CUDASharedAttr>()) {
-    if (InitExpr) {
-      const auto *C = dyn_cast<CXXConstructExpr>(InitExpr);
-      if (C == nullptr || !C->getConstructor()->hasTrivialBody())
-        Error(D->getLocation(),
-              "__shared__ variable cannot have an initialization.");
-    }
+  // CUDA E.2.4.1 "__shared__ variables cannot have an initialization
+  // as part of their declaration."  Sema has already checked for
+  // error cases, so we just need to set Init to UndefValue.
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+      D->hasAttr<CUDASharedAttr>())
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
-  } else if (!InitExpr) {
+  else if (!InitExpr) {
     // This is a tentative definition; tentative definitions are
     // implicitly initialized with { 0 }.
     //
@@ -2426,6 +2502,10 @@
   if (D->hasAttr<AnnotateAttr>())
     AddGlobalAnnotations(D, GV);
 
+  // Set the llvm linkage type as appropriate.
+  llvm::GlobalValue::LinkageTypes Linkage =
+      getLLVMLinkageVarDefinition(D, GV->isConstant());
+
   // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
   // the device. [...]"
   // CUDA B.2.2 "The __constant__ qualifier, optionally used together with
@@ -2433,9 +2513,34 @@
   // Is accessible from all the threads within the grid and from the host
   // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
   // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
-  if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
-      (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>())) {
-    GV->setExternallyInitialized(true);
+  if (GV && LangOpts.CUDA) {
+    if (LangOpts.CUDAIsDevice) {
+      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>())
+        GV->setExternallyInitialized(true);
+    } else {
+      // Host-side shadows of external declarations of device-side
+      // global variables become internal definitions. These have to
+      // be internal in order to prevent name conflicts with global
+      // host variables with the same name in different TUs.
+      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
+        Linkage = llvm::GlobalValue::InternalLinkage;
+
+        // Shadow variables and their properties must be registered
+        // with CUDA runtime.
+        unsigned Flags = 0;
+        if (!D->hasDefinition())
+          Flags |= CGCUDARuntime::ExternDeviceVar;
+        if (D->hasAttr<CUDAConstantAttr>())
+          Flags |= CGCUDARuntime::ConstantDeviceVar;
+        getCUDARuntime().registerDeviceVar(*GV, Flags);
+      } else if (D->hasAttr<CUDASharedAttr>())
+        // __shared__ variables are odd. Shadows do get created, but
+        // they are not registered with the CUDA runtime, so they
+        // can't really be used to access their device-side
+        // counterparts. It's not clear yet whether it's nvcc's bug or
+        // a feature, but we've got to do the same for compatibility.
+        Linkage = llvm::GlobalValue::InternalLinkage;
+    }
   }
   GV->setInitializer(Init);
 
@@ -2452,9 +2557,6 @@
 
   GV->setAlignment(getContext().getDeclAlign(D).getQuantity());
 
-  // Set the llvm linkage type as appropriate.
-  llvm::GlobalValue::LinkageTypes Linkage =
-      getLLVMLinkageVarDefinition(D, GV->isConstant());
 
   // On Darwin, if the normal linkage of a C++ thread_local variable is
   // LinkOnce or Weak, we keep the normal linkage to prevent multiple
@@ -2534,7 +2636,7 @@
   if (shouldBeInCOMDAT(CGM, *D))
     return true;
 
-  // Declarations with a required alignment do not have common linakge in MSVC
+  // Declarations with a required alignment do not have common linkage in MSVC
   // mode.
   if (Context.getTargetInfo().getCXXABI().isMicrosoft()) {
     if (D->hasAttr<AlignedAttr>())
@@ -2595,9 +2697,18 @@
   // explicit instantiations can occur in multiple translation units
   // and must all be equivalent. However, we are not allowed to
   // throw away these explicit instantiations.
-  if (Linkage == GVA_StrongODR)
-    return !Context.getLangOpts().AppleKext ? llvm::Function::WeakODRLinkage
-                                            : llvm::Function::ExternalLinkage;
+  //
+  // We don't currently support CUDA device code spread out across multiple TUs,
+  // so say that CUDA templates are either external (for kernels) or internal.
+  // This lets llvm perform aggressive inter-procedural optimizations.
+  if (Linkage == GVA_StrongODR) {
+    if (Context.getLangOpts().AppleKext)
+      return llvm::Function::ExternalLinkage;
+    if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice)
+      return D->hasAttr<CUDAGlobalAttr>() ? llvm::Function::ExternalLinkage
+                                          : llvm::Function::InternalLinkage;
+    return llvm::Function::WeakODRLinkage;
+  }
 
   // C++ doesn't have tentative definitions and thus cannot have common
   // linkage.
@@ -2754,6 +2865,10 @@
 }
 
 void CodeGenModule::HandleCXXStaticMemberVarInstantiation(VarDecl *VD) {
+  auto DK = VD->isThisDeclarationADefinition();
+  if (DK == VarDecl::Definition && VD->hasAttr<DLLImportAttr>())
+    return;
+
   TemplateSpecializationKind TSK = VD->getTemplateSpecializationKind();
   // If we have a definition, this might be a deferred decl. If the
   // instantiation is explicit, make sure we emit it at the end.
@@ -2767,6 +2882,33 @@
                                                  llvm::GlobalValue *GV) {
   const auto *D = cast<FunctionDecl>(GD.getDecl());
 
+  // Emit this function's deferred diagnostics, if none of them are errors.  If
+  // any of them are errors, don't codegen the function, but also don't emit any
+  // of the diagnostics just yet.  Emitting an error during codegen stops
+  // further codegen, and we want to display as many deferred diags as possible.
+  // We'll emit the now twice-deferred diags at the very end of codegen.
+  //
+  // (If a function has both error and non-error diags, we don't emit the
+  // non-error diags here, because order can be significant, e.g. with notes
+  // that follow errors.)
+  auto Diags = D->takeDeferredDiags();
+  bool HasError = llvm::any_of(Diags, [this](const PartialDiagnosticAt &PDAt) {
+    return getDiags().getDiagnosticLevel(PDAt.second.getDiagID(), PDAt.first) >=
+           DiagnosticsEngine::Error;
+  });
+  if (HasError) {
+    DeferredDiags.insert(DeferredDiags.end(),
+                         std::make_move_iterator(Diags.begin()),
+                         std::make_move_iterator(Diags.end()));
+    return;
+  }
+  for (PartialDiagnosticAt &PDAt : Diags) {
+    const SourceLocation &Loc = PDAt.first;
+    const PartialDiagnostic &PD = PDAt.second;
+    DiagnosticBuilder Builder(getDiags().Report(Loc, PD.getDiagID()));
+    PD.Emit(Builder);
+  }
+
   // Compute the function info and LLVM type.
   const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
   llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
@@ -2817,7 +2959,7 @@
   StringRef MangledName = getMangledName(GD);
 
   if (AA->getAliasee() == MangledName) {
-    Diags.Report(AA->getLocation(), diag::err_cyclic_alias);
+    Diags.Report(AA->getLocation(), diag::err_cyclic_alias) << 0;
     return;
   }
 
@@ -2848,7 +2990,7 @@
 
   if (Entry) {
     if (GA->getAliasee() == Entry) {
-      Diags.Report(AA->getLocation(), diag::err_cyclic_alias);
+      Diags.Report(AA->getLocation(), diag::err_cyclic_alias) << 0;
       return;
     }
 
@@ -2885,6 +3027,65 @@
   setAliasAttributes(D, GA);
 }
 
+void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) {
+  const auto *D = cast<ValueDecl>(GD.getDecl());
+  const IFuncAttr *IFA = D->getAttr<IFuncAttr>();
+  assert(IFA && "Not an ifunc?");
+
+  StringRef MangledName = getMangledName(GD);
+
+  if (IFA->getResolver() == MangledName) {
+    Diags.Report(IFA->getLocation(), diag::err_cyclic_alias) << 1;
+    return;
+  }
+
+  // Report an error if a definition with this mangled name already exists.
+  llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
+  if (Entry && !Entry->isDeclaration()) {
+    GlobalDecl OtherGD;
+    if (lookupRepresentativeDecl(MangledName, OtherGD) &&
+        DiagnosedConflictingDefinitions.insert(GD).second) {
+      Diags.Report(D->getLocation(), diag::err_duplicate_mangled_name);
+      Diags.Report(OtherGD.getDecl()->getLocation(),
+                   diag::note_previous_definition);
+    }
+    return;
+  }
+
+  Aliases.push_back(GD);
+
+  llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType());
+  llvm::Constant *Resolver =
+      GetOrCreateLLVMFunction(IFA->getResolver(), DeclTy, GD,
+                              /*ForVTable=*/false);
+  llvm::GlobalIFunc *GIF =
+      llvm::GlobalIFunc::create(DeclTy, 0, llvm::Function::ExternalLinkage,
+                                "", Resolver, &getModule());
+  if (Entry) {
+    if (GIF->getResolver() == Entry) {
+      Diags.Report(IFA->getLocation(), diag::err_cyclic_alias) << 1;
+      return; // NOTE(review): GIF is not erased on this path; confirm.
+    }
+    assert(Entry->isDeclaration());
+
+    // If there is a declaration in the module, then we had an extern followed
+    // by the ifunc, as in:
+    //   extern int test();
+    //   ...
+    //   int test() __attribute__((ifunc("resolver")));
+    //
+    // Remove it and replace uses of it with the ifunc.
+    GIF->takeName(Entry);
+
+    Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GIF,
+                                                          Entry->getType()));
+    Entry->eraseFromParent();
+  } else
+    GIF->setName(MangledName);
+
+  SetCommonAttributes(D, GIF);
+}
+
 llvm::Function *CodeGenModule::getIntrinsic(unsigned IID,
                                             ArrayRef<llvm::Type*> Tys) {
   return llvm::Intrinsic::getDeclaration(&getModule(), (llvm::Intrinsic::ID)IID,
@@ -2949,19 +3150,40 @@
   llvm::Constant *Zero = llvm::Constant::getNullValue(Int32Ty);
   llvm::Constant *Zeros[] = { Zero, Zero };
   llvm::Value *V;
-  
+
   // If we don't already have it, get __CFConstantStringClassReference.
   if (!CFConstantStringClassRef) {
     llvm::Type *Ty = getTypes().ConvertType(getContext().IntTy);
     Ty = llvm::ArrayType::get(Ty, 0);
-    llvm::Constant *GV = CreateRuntimeVariable(Ty,
-                                           "__CFConstantStringClassReference");
+    llvm::Constant *GV =
+        CreateRuntimeVariable(Ty, "__CFConstantStringClassReference");
+
+    if (getTarget().getTriple().isOSBinFormatCOFF()) {
+      IdentifierInfo &II = getContext().Idents.get(GV->getName());
+      TranslationUnitDecl *TUDecl = getContext().getTranslationUnitDecl();
+      DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+      llvm::GlobalValue *CGV = cast<llvm::GlobalValue>(GV);
+
+      const VarDecl *VD = nullptr;
+      for (const auto &Result : DC->lookup(&II))
+        if ((VD = dyn_cast<VarDecl>(Result)))
+          break;
+
+      if (!VD || !VD->hasAttr<DLLExportAttr>()) {
+        CGV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+        CGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
+      } else {
+        CGV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+        CGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
+      }
+    }
+
     // Decay array -> ptr
     V = llvm::ConstantExpr::getGetElementPtr(Ty, GV, Zeros);
     CFConstantStringClassRef = V;
-  }
-  else
+  } else {
     V = CFConstantStringClassRef;
+  }
 
   QualType CFTy = getContext().getCFConstantStringType();
 
@@ -2974,8 +3196,8 @@
 
   // Flags.
   llvm::Type *Ty = getTypes().ConvertType(getContext().UnsignedIntTy);
-  Fields[1] = isUTF16 ? llvm::ConstantInt::get(Ty, 0x07d0) :
-    llvm::ConstantInt::get(Ty, 0x07C8);
+  Fields[1] = isUTF16 ? llvm::ConstantInt::get(Ty, 0x07d0)
+                      : llvm::ConstantInt::get(Ty, 0x07C8);
 
   // String pointer.
   llvm::Constant *C = nullptr;
@@ -2993,21 +3215,20 @@
   auto *GV =
       new llvm::GlobalVariable(getModule(), C->getType(), /*isConstant=*/true,
                                llvm::GlobalValue::PrivateLinkage, C, ".str");
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   // Don't enforce the target's minimum global alignment, since the only use
   // of the string is via this class initializer.
-  // FIXME: We set the section explicitly to avoid a bug in ld64 224.1. Without
-  // it LLVM can merge the string with a non unnamed_addr one during LTO. Doing
-  // that changes the section it ends in, which surprises ld64.
-  if (isUTF16) {
-    CharUnits Align = getContext().getTypeAlignInChars(getContext().ShortTy);
-    GV->setAlignment(Align.getQuantity());
-    GV->setSection("__TEXT,__ustring");
-  } else {
-    CharUnits Align = getContext().getTypeAlignInChars(getContext().CharTy);
-    GV->setAlignment(Align.getQuantity());
-    GV->setSection("__TEXT,__cstring,cstring_literals");
-  }
+  CharUnits Align = isUTF16
+                        ? getContext().getTypeAlignInChars(getContext().ShortTy)
+                        : getContext().getTypeAlignInChars(getContext().CharTy);
+  GV->setAlignment(Align.getQuantity());
+
+  // FIXME: We set the section explicitly to avoid a bug in ld64 224.1.
+  // Without it LLVM can merge the string with a non unnamed_addr one during
+  // LTO.  Doing that changes the section it ends in, which surprises ld64.
+  if (getTarget().getTriple().isOSBinFormatMachO())
+    GV->setSection(isUTF16 ? "__TEXT,__ustring"
+                           : "__TEXT,__cstring,cstring_literals");
 
   // String.
   Fields[2] =
@@ -3025,11 +3246,21 @@
 
   // The struct.
   C = llvm::ConstantStruct::get(STy, Fields);
-  GV = new llvm::GlobalVariable(getModule(), C->getType(), true,
+  GV = new llvm::GlobalVariable(getModule(), C->getType(), /*isConstant=*/false,
                                 llvm::GlobalVariable::PrivateLinkage, C,
                                 "_unnamed_cfstring_");
-  GV->setSection("__DATA,__cfstring");
   GV->setAlignment(Alignment.getQuantity());
+  switch (getTarget().getTriple().getObjectFormat()) {
+  case llvm::Triple::UnknownObjectFormat:
+    llvm_unreachable("unknown file format");
+  case llvm::Triple::COFF:
+  case llvm::Triple::ELF:
+    GV->setSection("cfstring");
+    break;
+  case llvm::Triple::MachO:
+    GV->setSection("__DATA,__cfstring");
+    break;
+  }
   Entry.second = GV;
 
   return ConstantAddress(GV, Alignment);
@@ -3124,7 +3355,7 @@
 
   auto *GV = new llvm::GlobalVariable(getModule(), C->getType(), isConstant,
                                       Linkage, C, ".str");
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   // Don't enforce the target's minimum global alignment, since the only use
   // of the string is via this class initializer.
   CharUnits Align = getContext().getTypeAlignInChars(getContext().CharTy);
@@ -3243,7 +3474,7 @@
       M, C->getType(), !CGM.getLangOpts().WritableStrings, LT, C, GlobalName,
       nullptr, llvm::GlobalVariable::NotThreadLocal, AddrSpace);
   GV->setAlignment(Alignment.getQuantity());
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   if (GV->isWeakForLinker()) {
     assert(CGM.supportsCOMDAT() && "Only COFF uses weak string literals");
     GV->setComdat(M.getOrInsertComdat(GV->getName()));
@@ -3574,11 +3805,16 @@
     break;
 
   case Decl::Var:
+  case Decl::Decomposition:
     // Skip variable templates
     if (cast<VarDecl>(D)->getDescribedVarTemplate())
       return;
   case Decl::VarTemplateSpecialization:
     EmitGlobal(cast<VarDecl>(D));
+    if (auto *DD = dyn_cast<DecompositionDecl>(D))
+      for (auto *B : DD->bindings())
+        if (auto *HD = B->getHoldingVar())
+          EmitGlobal(HD);
     break;
 
   // Indirect fields from global anonymous structs and unions can be
@@ -3590,6 +3826,12 @@
   case Decl::Namespace:
     EmitNamespace(cast<NamespaceDecl>(D));
     break;
+  case Decl::CXXRecord:
+    // Emit any static data members; they may be definitions.
+    for (auto *I : cast<CXXRecordDecl>(D)->decls())
+      if (isa<VarDecl>(I) || isa<CXXRecordDecl>(I))
+        EmitTopLevelDecl(I);
+    break;
     // No code generation needed.
   case Decl::UsingShadow:
   case Decl::ClassTemplate:
@@ -3673,6 +3915,31 @@
     ObjCRuntime->RegisterAlias(cast<ObjCCompatibleAliasDecl>(D));
     break;
 
+  case Decl::PragmaComment: {
+    const auto *PCD = cast<PragmaCommentDecl>(D);
+    switch (PCD->getCommentKind()) {
+    case PCK_Unknown:
+      llvm_unreachable("unexpected pragma comment kind");
+    case PCK_Linker:
+      AppendLinkerOptions(PCD->getArg());
+      break;
+    case PCK_Lib:
+      AddDependentLib(PCD->getArg());
+      break;
+    case PCK_Compiler:
+    case PCK_ExeStr:
+    case PCK_User:
+      break; // We ignore all of these.
+    }
+    break;
+  }
+
+  case Decl::PragmaDetectMismatch: {
+    const auto *PDMD = cast<PragmaDetectMismatchDecl>(D);
+    AddDetectMismatch(PDMD->getName(), PDMD->getValue());
+    break;
+  }
+
   case Decl::LinkageSpec:
     EmitLinkageSpec(cast<LinkageSpecDecl>(D));
     break;
@@ -3692,13 +3959,19 @@
   case Decl::Import: {
     auto *Import = cast<ImportDecl>(D);
 
-    // Ignore import declarations that come from imported modules.
-    if (Import->getImportedOwningModule())
+    // If we've already imported this module, we're done.
+    if (!ImportedModules.insert(Import->getImportedModule()))
       break;
-    if (CGDebugInfo *DI = getModuleDebugInfo())
-      DI->EmitImportDecl(*Import);
 
-    ImportedModules.insert(Import->getImportedModule());
+    // Emit debug information for direct imports.
+    if (!Import->getImportedOwningModule()) {
+      if (CGDebugInfo *DI = getModuleDebugInfo())
+        DI->EmitImportDecl(*Import);
+    }
+
+    // Emit the module initializers.
+    for (auto *D : Context.getModuleInitializers(Import->getImportedModule()))
+      EmitTopLevelDecl(D);
     break;
   }
 
@@ -3715,6 +3988,10 @@
     break;
   }
 
+  case Decl::OMPDeclareReduction:
+    EmitOMPDeclareReduction(cast<OMPDeclareReductionDecl>(D));
+    break;
+
   default:
     // Make sure we handled everything we should, every other kind is a
     // non-top-level decl.  FIXME: Would be nice to have an isTopLevelDeclKind
@@ -3837,6 +4114,10 @@
 /// to such functions with an unmangled name from inline assembly within the
 /// same translation unit.
 void CodeGenModule::EmitStaticExternCAliases() {
+  // Don't do anything if we're generating CUDA device code -- the NVPTX
+  // assembly target doesn't support aliases.
+  if (Context.getTargetInfo().getTriple().isNVPTX())
+    return;
   for (auto &I : StaticExternCValues) {
     IdentifierInfo *Name = I.first;
     llvm::GlobalValue *Val = I.second;
@@ -4017,27 +4298,35 @@
   return InternalId;
 }
 
-void CodeGenModule::CreateVTableBitSetEntry(llvm::NamedMDNode *BitsetsMD,
-                                            llvm::GlobalVariable *VTable,
-                                            CharUnits Offset,
-                                            const CXXRecordDecl *RD) {
+/// Returns whether this module needs the "all-vtables" type identifier.
+bool CodeGenModule::NeedAllVtablesTypeId() const {
+  // A checker counts when it is enabled in LangOpts.Sanitize and is not
+  // listed in CodeGenOpts.SanitizeTrap. One such vtable-based CFI checker
+  // is enough.
+  auto EnabledWithoutTrap = [this](SanitizerMask Kind) {
+    return LangOpts.Sanitize.has(Kind) && !CodeGenOpts.SanitizeTrap.has(Kind);
+  };
+  return EnabledWithoutTrap(SanitizerKind::CFIVCall) ||
+         EnabledWithoutTrap(SanitizerKind::CFINVCall) ||
+         EnabledWithoutTrap(SanitizerKind::CFIDerivedCast) ||
+         EnabledWithoutTrap(SanitizerKind::CFIUnrelatedCast);
+}
+
+void CodeGenModule::AddVTableTypeMetadata(llvm::GlobalVariable *VTable,
+                                          CharUnits Offset,
+                                          const CXXRecordDecl *RD) {
   llvm::Metadata *MD =
       CreateMetadataIdentifierForType(QualType(RD->getTypeForDecl(), 0));
-  llvm::Metadata *BitsetOps[] = {
-      MD, llvm::ConstantAsMetadata::get(VTable),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(Int64Ty, Offset.getQuantity()))};
-  BitsetsMD->addOperand(llvm::MDTuple::get(getLLVMContext(), BitsetOps));
+  VTable->addTypeMetadata(Offset.getQuantity(), MD);
 
-  if (CodeGenOpts.SanitizeCfiCrossDso) {
-    if (auto TypeId = CreateCfiIdForTypeMetadata(MD)) {
-      llvm::Metadata *BitsetOps2[] = {
-          llvm::ConstantAsMetadata::get(TypeId),
-          llvm::ConstantAsMetadata::get(VTable),
-          llvm::ConstantAsMetadata::get(
-              llvm::ConstantInt::get(Int64Ty, Offset.getQuantity()))};
-      BitsetsMD->addOperand(llvm::MDTuple::get(getLLVMContext(), BitsetOps2));
-    }
+  if (CodeGenOpts.SanitizeCfiCrossDso)
+    if (auto CrossDsoTypeId = CreateCrossDsoCfiTypeId(MD))
+      VTable->addTypeMetadata(Offset.getQuantity(),
+                              llvm::ConstantAsMetadata::get(CrossDsoTypeId));
+
+  if (NeedAllVtablesTypeId()) {
+    llvm::Metadata *MD = llvm::MDString::get(getLLVMContext(), "all-vtables");
+    VTable->addTypeMetadata(Offset.getQuantity(), MD);
   }
 }
 
@@ -4076,3 +4365,13 @@
 
   return *SanStats;
 }
+llvm::Value *
+CodeGenModule::createOpenCLIntToSamplerConversion(const Expr *E,
+                                                  CodeGenFunction &CGF) {
+  // Call __translate_sampler_initializer on the constant emitted for E.
+  llvm::Constant *Init = EmitConstantExpr(E, E->getType(), &CGF);
+  auto *SamplerTy = getOpenCLRuntime().getSamplerType();
+  auto *FnTy = llvm::FunctionType::get(SamplerTy, {Init->getType()}, false);
+  auto *Fn = CreateRuntimeFunction(FnTy, "__translate_sampler_initializer");
+  return CGF.Builder.CreateCall(Fn, {Init});
+}
diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h
index b012127..ed18156 100644
--- a/lib/CodeGen/CodeGenModule.h
+++ b/lib/CodeGen/CodeGenModule.h
@@ -21,6 +21,7 @@
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/GlobalDecl.h"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/ABI.h"
@@ -272,9 +273,9 @@
   std::unique_ptr<CGCXXABI> ABI;
   llvm::LLVMContext &VMContext;
 
-  CodeGenTBAA *TBAA;
+  std::unique_ptr<CodeGenTBAA> TBAA;
   
-  mutable const TargetCodeGenInfo *TheTargetCodeGenInfo;
+  mutable std::unique_ptr<TargetCodeGenInfo> TheTargetCodeGenInfo;
   
   // This should not be moved earlier, since its initialization depends on some
   // of the previous reference members being already initialized and also checks
@@ -284,13 +285,13 @@
   /// Holds information about C++ vtables.
   CodeGenVTables VTables;
 
-  CGObjCRuntime* ObjCRuntime;
-  CGOpenCLRuntime* OpenCLRuntime;
-  CGOpenMPRuntime* OpenMPRuntime;
-  CGCUDARuntime* CUDARuntime;
-  CGDebugInfo* DebugInfo;
-  ObjCEntrypoints *ObjCData;
-  llvm::MDNode *NoObjCARCExceptionsMetadata;
+  std::unique_ptr<CGObjCRuntime> ObjCRuntime;
+  std::unique_ptr<CGOpenCLRuntime> OpenCLRuntime;
+  std::unique_ptr<CGOpenMPRuntime> OpenMPRuntime;
+  std::unique_ptr<CGCUDARuntime> CUDARuntime;
+  std::unique_ptr<CGDebugInfo> DebugInfo;
+  std::unique_ptr<ObjCEntrypoints> ObjCData;
+  llvm::MDNode *NoObjCARCExceptionsMetadata = nullptr;
   std::unique_ptr<llvm::IndexedInstrProfReader> PGOReader;
   InstrProfStats PGOStats;
   std::unique_ptr<llvm::SanitizerStatReport> SanStats;
@@ -434,8 +435,8 @@
   llvm::WeakVH ConstantStringClassRef;
 
   /// \brief The LLVM type corresponding to NSConstantString.
-  llvm::StructType *NSConstantStringType;
-  
+  llvm::StructType *NSConstantStringType = nullptr;
+
   /// \brief The type used to describe the state of a fast enumeration in
   /// Objective-C's for..in loop.
   QualType ObjCFastEnumerationStateType;
@@ -455,24 +456,24 @@
   /// @name Cache for Blocks Runtime Globals
   /// @{
 
-  llvm::Constant *NSConcreteGlobalBlock;
-  llvm::Constant *NSConcreteStackBlock;
+  llvm::Constant *NSConcreteGlobalBlock = nullptr;
+  llvm::Constant *NSConcreteStackBlock = nullptr;
 
-  llvm::Constant *BlockObjectAssign;
-  llvm::Constant *BlockObjectDispose;
+  llvm::Constant *BlockObjectAssign = nullptr;
+  llvm::Constant *BlockObjectDispose = nullptr;
 
-  llvm::Type *BlockDescriptorType;
-  llvm::Type *GenericBlockLiteralType;
+  llvm::Type *BlockDescriptorType = nullptr;
+  llvm::Type *GenericBlockLiteralType = nullptr;
 
   struct {
     int GlobalUniqueCount;
   } Block;
 
   /// void @llvm.lifetime.start(i64 %size, i8* nocapture <ptr>)
-  llvm::Constant *LifetimeStartFn;
+  llvm::Constant *LifetimeStartFn = nullptr;
 
   /// void @llvm.lifetime.end(i64 %size, i8* nocapture <ptr>)
-  llvm::Constant *LifetimeEndFn;
+  llvm::Constant *LifetimeEndFn = nullptr;
 
   GlobalDecl initializedGlobalDecl;
 
@@ -489,6 +490,10 @@
   /// MDNodes.
   llvm::DenseMap<QualType, llvm::Metadata *> MetadataIdMap;
 
+  /// Diags gathered from FunctionDecl::takeDeferredDiags().  Emitted at the
+  /// very end of codegen.
+  std::vector<std::pair<SourceLocation, PartialDiagnostic>> DeferredDiags;
+
 public:
   CodeGenModule(ASTContext &C, const HeaderSearchOptions &headersearchopts,
                 const PreprocessorOptions &ppopts,
@@ -588,7 +593,7 @@
     TypeDescriptorMap[Ty] = C;
   }
 
-  CGDebugInfo *getModuleDebugInfo() { return DebugInfo; }
+  CGDebugInfo *getModuleDebugInfo() { return DebugInfo.get(); }
 
   llvm::MDNode *getNoObjCARCExceptionsMetadata() {
     if (!NoObjCARCExceptionsMetadata)
@@ -997,6 +1002,8 @@
 
   void EmitVTable(CXXRecordDecl *Class);
 
+  void RefreshTypeCacheForClass(const CXXRecordDecl *Class);
+
   /// \brief Appends Opts to the "Linker Options" metadata value.
   void AppendLinkerOptions(StringRef Opts);
 
@@ -1106,36 +1113,45 @@
   /// \param D Threadprivate declaration.
   void EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D);
 
-  /// Returns whether the given record is blacklisted from control flow
-  /// integrity checks.
-  bool IsCFIBlacklistedRecord(const CXXRecordDecl *RD);
+  /// \brief Emit code for a declare reduction construct.
+  void EmitOMPDeclareReduction(const OMPDeclareReductionDecl *D,
+                               CodeGenFunction *CGF = nullptr);
 
-  /// Emit bit set entries for the given vtable using the given layout if
-  /// vptr CFI is enabled.
-  void EmitVTableBitSetEntries(llvm::GlobalVariable *VTable,
-                               const VTableLayout &VTLayout);
+  /// Returns whether the given record has hidden LTO visibility and therefore
+  /// may participate in (single-module) CFI and whole-program vtable
+  /// optimization.
+  bool HasHiddenLTOVisibility(const CXXRecordDecl *RD);
 
-  /// Generate a cross-DSO type identifier for type.
-  llvm::ConstantInt *CreateCfiIdForTypeMetadata(llvm::Metadata *MD);
+  /// Emit type metadata for the given vtable using the given layout.
+  void EmitVTableTypeMetadata(llvm::GlobalVariable *VTable,
+                              const VTableLayout &VTLayout);
+
+  /// Generate a cross-DSO type identifier for MD.
+  llvm::ConstantInt *CreateCrossDsoCfiTypeId(llvm::Metadata *MD);
 
   /// Create a metadata identifier for the given type. This may either be an
   /// MDString (for external identifiers) or a distinct unnamed MDNode (for
   /// internal identifiers).
   llvm::Metadata *CreateMetadataIdentifierForType(QualType T);
 
-  /// Create a bitset entry for the given function and add it to BitsetsMD.
-  void CreateFunctionBitSetEntry(const FunctionDecl *FD, llvm::Function *F);
+  /// Create and attach type metadata to the given function.
+  void CreateFunctionTypeMetadata(const FunctionDecl *FD, llvm::Function *F);
 
-  /// Create a bitset entry for the given vtable and add it to BitsetsMD.
-  void CreateVTableBitSetEntry(llvm::NamedMDNode *BitsetsMD,
-                               llvm::GlobalVariable *VTable, CharUnits Offset,
-                               const CXXRecordDecl *RD);
+  /// Returns whether this module needs the "all-vtables" type identifier.
+  bool NeedAllVtablesTypeId() const;
+
+  /// Create and attach type metadata for the given vtable.
+  void AddVTableTypeMetadata(llvm::GlobalVariable *VTable, CharUnits Offset,
+                             const CXXRecordDecl *RD);
 
   /// \breif Get the declaration of std::terminate for the platform.
   llvm::Constant *getTerminateFn();
 
   llvm::SanitizerStatReport &getSanStats();
 
+  llvm::Value *
+  createOpenCLIntToSamplerConversion(const Expr *E, CodeGenFunction &CGF);
+
 private:
   llvm::Constant *
   GetOrCreateLLVMFunction(StringRef MangledName, llvm::Type *Ty, GlobalDecl D,
@@ -1160,6 +1176,7 @@
   void EmitGlobalFunctionDefinition(GlobalDecl GD, llvm::GlobalValue *GV);
   void EmitGlobalVarDefinition(const VarDecl *D, bool IsTentative = false);
   void EmitAliasDefinition(GlobalDecl GD);
+  void emitIFuncDefinition(GlobalDecl GD);
   void EmitObjCPropertyImplementations(const ObjCImplementationDecl *D);
   void EmitObjCIvarInitializations(ObjCImplementationDecl *D);
   
diff --git a/lib/CodeGen/CodeGenPGO.cpp b/lib/CodeGen/CodeGenPGO.cpp
index e77ddd1..4eefdd7 100644
--- a/lib/CodeGen/CodeGenPGO.cpp
+++ b/lib/CodeGen/CodeGenPGO.cpp
@@ -37,7 +37,7 @@
       PGOReader ? PGOReader->getVersion() : llvm::IndexedInstrProf::Version);
 
   // If we're generating a profile, create a variable for the name.
-  if (CGM.getCodeGenOpts().ProfileInstrGenerate)
+  if (CGM.getCodeGenOpts().hasProfileClangInstr())
     FuncNameVar = llvm::createPGOFuncNameVar(CGM.getModule(), Linkage, FuncName);
 }
 
@@ -411,7 +411,8 @@
     RecordStmtCount(S);
     Visit(S->getLoopVarStmt());
     Visit(S->getRangeStmt());
-    Visit(S->getBeginEndStmt());
+    Visit(S->getBeginStmt());
+    Visit(S->getEndStmt());
 
     uint64_t ParentCount = CurrentCount;
     BreakContinueStack.push_back(BreakContinue());
@@ -612,7 +613,7 @@
 
 void CodeGenPGO::assignRegionCounters(GlobalDecl GD, llvm::Function *Fn) {
   const Decl *D = GD.getDecl();
-  bool InstrumentRegions = CGM.getCodeGenOpts().ProfileInstrGenerate;
+  bool InstrumentRegions = CGM.getCodeGenOpts().hasProfileClangInstr();
   llvm::IndexedInstrProfReader *PGOReader = CGM.getPGOReader();
   if (!InstrumentRegions && !PGOReader)
     return;
@@ -658,12 +659,18 @@
   FunctionHash = Walker.Hash.finalize();
 }
 
-void CodeGenPGO::emitCounterRegionMapping(const Decl *D) {
+bool CodeGenPGO::skipRegionMappingForDecl(const Decl *D) {
   if (SkipCoverageMapping)
-    return;
-  // Don't map the functions inside the system headers
+    return true;
+
+  // Don't map the functions in system headers.
+  const auto &SM = CGM.getContext().getSourceManager();
   auto Loc = D->getBody()->getLocStart();
-  if (CGM.getContext().getSourceManager().isInSystemHeader(Loc))
+  return SM.isInSystemHeader(Loc);
+}
+
+void CodeGenPGO::emitCounterRegionMapping(const Decl *D) {
+  if (skipRegionMappingForDecl(D))
     return;
 
   std::string CoverageMapping;
@@ -684,11 +691,7 @@
 void
 CodeGenPGO::emitEmptyCounterMapping(const Decl *D, StringRef Name,
                                     llvm::GlobalValue::LinkageTypes Linkage) {
-  if (SkipCoverageMapping)
-    return;
-  // Don't map the functions inside the system headers
-  auto Loc = D->getBody()->getLocStart();
-  if (CGM.getContext().getSourceManager().isInSystemHeader(Loc))
+  if (skipRegionMappingForDecl(D))
     return;
 
   std::string CoverageMapping;
@@ -731,7 +734,7 @@
 }
 
 void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S) {
-  if (!CGM.getCodeGenOpts().ProfileInstrGenerate || !RegionCounterMap)
+  if (!CGM.getCodeGenOpts().hasProfileClangInstr() || !RegionCounterMap)
     return;
   if (!Builder.GetInsertBlock())
     return;
@@ -759,12 +762,12 @@
   if (isa<llvm::Constant>(ValuePtr))
     return;
 
-  bool InstrumentValueSites = CGM.getCodeGenOpts().ProfileInstrGenerate;
+  bool InstrumentValueSites = CGM.getCodeGenOpts().hasProfileClangInstr();
   if (InstrumentValueSites && RegionCounterMap) {
-    llvm::LLVMContext &Ctx = CGM.getLLVMContext();
-    auto *I8PtrTy = llvm::Type::getInt8PtrTy(Ctx);
+    auto BuilderInsertPoint = Builder.saveIP();
+    Builder.SetInsertPoint(ValueSite);
     llvm::Value *Args[5] = {
-        llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+        llvm::ConstantExpr::getBitCast(FuncNameVar, Builder.getInt8PtrTy()),
         Builder.getInt64(FunctionHash),
         Builder.CreatePtrToInt(ValuePtr, Builder.getInt64Ty()),
         Builder.getInt32(ValueKind),
@@ -772,6 +775,7 @@
     };
     Builder.CreateCall(
         CGM.getIntrinsic(llvm::Intrinsic::instrprof_value_profile), Args);
+    Builder.restoreIP(BuilderInsertPoint);
     return;
   }
 
@@ -785,35 +789,11 @@
     // pairs for each function.
     if (NumValueSites[ValueKind] >= ProfRecord->getNumValueSites(ValueKind))
       return;
-    uint32_t NV = ProfRecord->getNumValueDataForSite(ValueKind,
-                                                     NumValueSites[ValueKind]);
-    std::unique_ptr<InstrProfValueData[]> VD =
-        ProfRecord->getValueForSite(ValueKind, NumValueSites[ValueKind]);
 
-    uint64_t Sum = 0;
-    for (uint32_t I = 0; I < NV; ++I)
-      Sum += VD[I].Count;
+    llvm::annotateValueSite(CGM.getModule(), *ValueSite, *ProfRecord,
+                            (llvm::InstrProfValueKind)ValueKind,
+                            NumValueSites[ValueKind]);
 
-    llvm::LLVMContext &Ctx = CGM.getLLVMContext();
-    llvm::MDBuilder MDHelper(Ctx);
-    SmallVector<llvm::Metadata*, 3> Vals;
-    Vals.push_back(MDHelper.createString("VP"));
-    Vals.push_back(MDHelper.createConstant(
-        llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), ValueKind)));
-    Vals.push_back(MDHelper.createConstant(
-        llvm::ConstantInt::get(llvm::Type::getInt64Ty(Ctx), Sum)));
-
-    uint32_t MDCount = 3;
-    for (uint32_t I = 0; I < NV; ++I) {
-      Vals.push_back(MDHelper.createConstant(
-          llvm::ConstantInt::get(llvm::Type::getInt64Ty(Ctx), VD[I].Value)));
-      Vals.push_back(MDHelper.createConstant(
-          llvm::ConstantInt::get(llvm::Type::getInt64Ty(Ctx), VD[I].Count)));
-      if (--MDCount == 0)
-        break;
-    }
-    ValueSite->setMetadata(
-        llvm::LLVMContext::MD_prof, llvm::MDNode::get(Ctx, Vals));
     NumValueSites[ValueKind]++;
   }
 }
@@ -822,20 +802,21 @@
                                   bool IsInMainFile) {
   CGM.getPGOStats().addVisited(IsInMainFile);
   RegionCounts.clear();
-  llvm::ErrorOr<llvm::InstrProfRecord> RecordErrorOr =
+  llvm::Expected<llvm::InstrProfRecord> RecordExpected =
       PGOReader->getInstrProfRecord(FuncName, FunctionHash);
-  if (std::error_code EC = RecordErrorOr.getError()) {
-    if (EC == llvm::instrprof_error::unknown_function)
+  if (auto E = RecordExpected.takeError()) {
+    auto IPE = llvm::InstrProfError::take(std::move(E));
+    if (IPE == llvm::instrprof_error::unknown_function)
       CGM.getPGOStats().addMissing(IsInMainFile);
-    else if (EC == llvm::instrprof_error::hash_mismatch)
+    else if (IPE == llvm::instrprof_error::hash_mismatch)
       CGM.getPGOStats().addMismatched(IsInMainFile);
-    else if (EC == llvm::instrprof_error::malformed)
+    else if (IPE == llvm::instrprof_error::malformed)
       // TODO: Consider a more specific warning for this case.
       CGM.getPGOStats().addMismatched(IsInMainFile);
     return;
   }
   ProfRecord =
-      llvm::make_unique<llvm::InstrProfRecord>(std::move(RecordErrorOr.get()));
+      llvm::make_unique<llvm::InstrProfRecord>(std::move(RecordExpected.get()));
   RegionCounts = ProfRecord->Counts;
 }
 
diff --git a/lib/CodeGen/CodeGenPGO.h b/lib/CodeGen/CodeGenPGO.h
index a181cb9..4f229cd 100644
--- a/lib/CodeGen/CodeGenPGO.h
+++ b/lib/CodeGen/CodeGenPGO.h
@@ -18,9 +18,8 @@
 #include "CodeGenModule.h"
 #include "CodeGenTypes.h"
 #include "clang/Frontend/CodeGenOptions.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ProfileData/InstrProfReader.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include <array>
 #include <memory>
 
 namespace clang {
@@ -33,7 +32,7 @@
   std::string FuncName;
   llvm::GlobalVariable *FuncNameVar;
 
-  unsigned NumValueSites[llvm::IPVK_Last + 1];
+  std::array <unsigned, llvm::IPVK_Last + 1> NumValueSites;
   unsigned NumRegionCounters;
   uint64_t FunctionHash;
   std::unique_ptr<llvm::DenseMap<const Stmt *, unsigned>> RegionCounterMap;
@@ -47,7 +46,7 @@
 
 public:
   CodeGenPGO(CodeGenModule &CGM)
-      : CGM(CGM), NumValueSites{0}, NumRegionCounters(0),
+      : CGM(CGM), NumValueSites({{0}}), NumRegionCounters(0),
         FunctionHash(0), CurrentRegionCount(0), SkipCoverageMapping(false) {}
 
   /// Whether or not we have PGO region data for the current function. This is
@@ -102,6 +101,7 @@
                                llvm::Function *Fn);
   void loadRegionCounts(llvm::IndexedInstrProfReader *PGOReader,
                         bool IsInMainFile);
+  bool skipRegionMappingForDecl(const Decl *D);
   void emitCounterRegionMapping(const Decl *D);
 
 public:
diff --git a/lib/CodeGen/CodeGenTBAA.h b/lib/CodeGen/CodeGenTBAA.h
index 632cadd..ddb063d 100644
--- a/lib/CodeGen/CodeGenTBAA.h
+++ b/lib/CodeGen/CodeGenTBAA.h
@@ -15,14 +15,11 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CODEGENTBAA_H
 #define LLVM_CLANG_LIB_CODEGEN_CODEGENTBAA_H
 
+#include "clang/AST/Type.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/MDBuilder.h"
-
-namespace llvm {
-  class LLVMContext;
-  class MDNode;
-}
+#include "llvm/IR/Metadata.h"
 
 namespace clang {
   class ASTContext;
diff --git a/lib/CodeGen/CodeGenTypes.cpp b/lib/CodeGen/CodeGenTypes.cpp
index 09d9bf1..ebe55c7 100644
--- a/lib/CodeGen/CodeGenTypes.cpp
+++ b/lib/CodeGen/CodeGenTypes.cpp
@@ -272,6 +272,17 @@
     DI->completeType(RD);
 }
 
+void CodeGenTypes::RefreshTypeCacheForClass(const CXXRecordDecl *RD) {
+  // Only act when RD's canonical type was recorded as having an opaque
+  // member pointer; in that case flush the type cache and the record set.
+  QualType CanonTy = Context.getCanonicalType(Context.getRecordType(RD));
+  if (!RecordsWithOpaqueMemberPointers.count(CanonTy.getTypePtr()))
+    return;
+
+  TypeCache.clear();
+  RecordsWithOpaqueMemberPointers.clear();
+}
+
 static llvm::Type *getTypeForFormat(llvm::LLVMContext &VMContext,
                                     const llvm::fltSemantics &format,
                                     bool UseNativeHalf = false) {
@@ -438,6 +449,7 @@
     case BuiltinType::Float:
     case BuiltinType::Double:
     case BuiltinType::LongDouble:
+    case BuiltinType::Float128:
       ResultType = getTypeForFormat(getLLVMContext(),
                                     Context.getFloatTypeSemantics(T),
                                     /* UseNativeHalf = */ false);
@@ -453,18 +465,9 @@
       ResultType = llvm::IntegerType::get(getLLVMContext(), 128);
       break;
 
-    case BuiltinType::OCLImage1d:
-    case BuiltinType::OCLImage1dArray:
-    case BuiltinType::OCLImage1dBuffer:
-    case BuiltinType::OCLImage2d:
-    case BuiltinType::OCLImage2dArray:
-    case BuiltinType::OCLImage2dDepth:
-    case BuiltinType::OCLImage2dArrayDepth:
-    case BuiltinType::OCLImage2dMSAA:
-    case BuiltinType::OCLImage2dArrayMSAA:
-    case BuiltinType::OCLImage2dMSAADepth:
-    case BuiltinType::OCLImage2dArrayMSAADepth:
-    case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
     case BuiltinType::OCLSampler:
     case BuiltinType::OCLEvent:
     case BuiltinType::OCLClkEvent:
@@ -603,10 +606,13 @@
   }
 
   case Type::MemberPointer: {
-    if (!getCXXABI().isMemberPointerConvertible(cast<MemberPointerType>(Ty)))
-      return llvm::StructType::create(getLLVMContext());
-    ResultType = 
-      getCXXABI().ConvertMemberPointerType(cast<MemberPointerType>(Ty));
+    auto *MPTy = cast<MemberPointerType>(Ty);
+    if (!getCXXABI().isMemberPointerConvertible(MPTy)) {
+      RecordsWithOpaqueMemberPointers.insert(MPTy->getClass());
+      ResultType = llvm::StructType::create(getLLVMContext());
+    } else {
+      ResultType = getCXXABI().ConvertMemberPointerType(MPTy);
+    }
     break;
   }
 
diff --git a/lib/CodeGen/CodeGenTypes.h b/lib/CodeGen/CodeGenTypes.h
index 5a2f5c3..00df10d 100644
--- a/lib/CodeGen/CodeGenTypes.h
+++ b/lib/CodeGen/CodeGenTypes.h
@@ -15,15 +15,14 @@
 #define LLVM_CLANG_LIB_CODEGEN_CODEGENTYPES_H
 
 #include "CGCall.h"
-#include "clang/AST/GlobalDecl.h"
+#include "clang/Basic/ABI.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
+#include "clang/Sema/Sema.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/Module.h"
-#include <vector>
 
 namespace llvm {
 class FunctionType;
-class Module;
 class DataLayout;
 class Type;
 class LLVMContext;
@@ -48,6 +47,7 @@
 class TargetInfo;
 class Type;
 typedef CanQual<Type> CanQualType;
+class GlobalDecl;
 
 namespace CodeGen {
 class ABIInfo;
@@ -162,6 +162,10 @@
   /// corresponding llvm::Type.
   llvm::DenseMap<const Type *, llvm::Type *> TypeCache;
 
+  llvm::SmallSet<const Type *, 8> RecordsWithOpaqueMemberPointers;
+
+  unsigned ClangCallConvToLLVMCallConv(CallingConv CC);
+
 public:
   CodeGenTypes(CodeGenModule &cgm);
   ~CodeGenTypes();
@@ -203,6 +207,11 @@
   bool isFuncTypeConvertible(const FunctionType *FT);
   bool isFuncParamTypeConvertible(QualType Ty);
 
+  /// Determine if a C++ inheriting constructor should have parameters matching
+  /// those of its inherited constructor.
+  bool inheritingCtorHasParams(const InheritedConstructor &Inherited,
+                               CXXCtorType Type);
+
   /// GetFunctionTypeForVTable - Get the LLVM function type for use in a vtable,
   /// given a CXXMethodDecl. If the method to has an incomplete return type,
   /// and/or incomplete argument types, this will return the opaque type.
diff --git a/lib/CodeGen/CoverageMappingGen.cpp b/lib/CodeGen/CoverageMappingGen.cpp
index 5603ea7..da6fa2a 100644
--- a/lib/CodeGen/CoverageMappingGen.cpp
+++ b/lib/CodeGen/CoverageMappingGen.cpp
@@ -18,11 +18,12 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ProfileData/CoverageMapping.h"
-#include "llvm/ProfileData/CoverageMappingReader.h"
-#include "llvm/ProfileData/CoverageMappingWriter.h"
+#include "llvm/ProfileData/Coverage/CoverageMapping.h"
+#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
+#include "llvm/ProfileData/Coverage/CoverageMappingWriter.h"
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -130,6 +131,16 @@
     return strcmp(SM.getBufferName(SM.getSpellingLoc(Loc)), "<built-in>") == 0;
   }
 
+  /// \brief Check whether \c Loc is included or expanded from \c Parent.
+  bool isNestedIn(SourceLocation Loc, FileID Parent) {
+    // Climb the include/expansion chain until it reaches Parent or runs out.
+    for (Loc = getIncludeOrExpansionLoc(Loc); Loc.isValid();
+         Loc = getIncludeOrExpansionLoc(Loc))
+      if (SM.isInFileID(Loc, Parent))
+        return true;
+    return false;
+  }
+
   /// \brief Get the start of \c S ignoring macro arguments and builtin macros.
   SourceLocation getStart(const Stmt *S) {
     SourceLocation Loc = S->getLocStart();
@@ -162,6 +173,10 @@
       if (!Visited.insert(File).second)
         continue;
 
+      // Do not map FileID's associated with system headers.
+      if (SM.isInSystemHeader(SM.getSpellingLoc(Loc)))
+        continue;
+
       unsigned Depth = 0;
       for (SourceLocation Parent = getIncludeOrExpansionLoc(Loc);
            Parent.isValid(); Parent = getIncludeOrExpansionLoc(Parent))
@@ -192,12 +207,6 @@
     return None;
   }
 
-  /// \brief Return true if the given clang's file id has a corresponding
-  /// coverage file id.
-  bool hasExistingCoverageFileID(FileID File) const {
-    return FileIDMapping.count(File);
-  }
-
   /// \brief Gather all the regions that were skipped by the preprocessor
   /// using the constructs like #if.
   void gatherSkippedRegions() {
@@ -247,6 +256,10 @@
       SourceLocation LocStart = Region.getStartLoc();
       assert(SM.getFileID(LocStart).isValid() && "region in invalid file");
 
+      // Ignore regions from system headers.
+      if (SM.isInSystemHeader(SM.getSpellingLoc(LocStart)))
+        continue;
+
       auto CovFileID = getCoverageFileID(LocStart);
       // Ignore regions that don't have a file, such as builtin macros.
       if (!CovFileID)
@@ -310,7 +323,27 @@
     if (!D->hasBody())
       return;
     auto Body = D->getBody();
-    SourceRegions.emplace_back(Counter(), getStart(Body), getEnd(Body));
+    SourceLocation Start = getStart(Body);
+    SourceLocation End = getEnd(Body);
+    if (!SM.isWrittenInSameFile(Start, End)) {
+      // Walk up to find the common ancestor.
+      // Correct the locations accordingly.
+      FileID StartFileID = SM.getFileID(Start);
+      FileID EndFileID = SM.getFileID(End);
+      while (StartFileID != EndFileID && !isNestedIn(End, StartFileID)) {
+        Start = getIncludeOrExpansionLoc(Start);
+        assert(Start.isValid() &&
+               "Declaration start location not nested within a known region");
+        StartFileID = SM.getFileID(Start);
+      }
+      while (StartFileID != EndFileID) {
+        End = getPreciseTokenLocEnd(getIncludeOrExpansionLoc(End));
+        assert(End.isValid() &&
+               "Declaration end location not nested within a known region");
+        EndFileID = SM.getFileID(End);
+      }
+    }
+    SourceRegions.emplace_back(Counter(), Start, End);
   }
 
   /// \brief Write the mapping data to the output stream
@@ -319,6 +352,9 @@
     gatherFileIDs(FileIDMapping);
     emitSourceRegions();
 
+    if (MappingRegions.empty())
+      return;
+
     CoverageMappingWriter Writer(FileIDMapping, None, MappingRegions);
     Writer.write(OS);
   }
@@ -357,10 +393,6 @@
     return addCounters(addCounters(C1, C2), C3);
   }
 
-  Counter addCounters(Counter C1, Counter C2, Counter C3, Counter C4) {
-    return addCounters(addCounters(C1, C2, C3), C4);
-  }
-
   /// \brief Return the region counter for the given statement.
   ///
   /// This should only be called on statements that have a dedicated counter.
@@ -471,16 +503,6 @@
       MostRecentLocation = getIncludeOrExpansionLoc(MostRecentLocation);
   }
 
-  /// \brief Check whether \c Loc is included or expanded from \c Parent.
-  bool isNestedIn(SourceLocation Loc, FileID Parent) {
-    do {
-      Loc = getIncludeOrExpansionLoc(Loc);
-      if (Loc.isInvalid())
-        return false;
-    } while (!SM.isInFileID(Loc, Parent));
-    return true;
-  }
-
   /// \brief Adjust regions and state when \c NewLoc exits a file.
   ///
   /// If moving from our most recently tracked location to \c NewLoc exits any
@@ -586,6 +608,9 @@
     emitExpansionRegions();
     gatherSkippedRegions();
 
+    if (MappingRegions.empty())
+      return;
+
     CoverageMappingWriter Writer(VirtualFileMapping, Builder.getExpressions(),
                                  MappingRegions);
     Writer.write(OS);
@@ -602,6 +627,11 @@
 
   void VisitDecl(const Decl *D) {
     Stmt *Body = D->getBody();
+
+    // Do not propagate region counts into system headers.
+    if (Body && SM.isInSystemHeader(SM.getSpellingLoc(getStart(Body))))
+      return;
+
     propagateCounts(getRegionCounter(Body), Body);
   }
 
@@ -847,7 +877,12 @@
 
   void VisitCXXTryStmt(const CXXTryStmt *S) {
     extendRegion(S);
-    Visit(S->getTryBlock());
+    // Handle macros that generate the "try" but not the rest.
+    extendRegion(S->getTryBlock());
+
+    Counter ParentCount = getRegion().getCounter();
+    propagateCounts(ParentCount, S->getTryBlock());
+
     for (unsigned I = 0, E = S->getNumHandlers(); I < E; ++I)
       Visit(S->getHandler(I));
 
@@ -897,16 +932,24 @@
     // propagate counts into them.
   }
 };
-}
 
-static bool isMachO(const CodeGenModule &CGM) {
+bool isMachO(const CodeGenModule &CGM) {
   return CGM.getTarget().getTriple().isOSBinFormatMachO();
 }
 
-static StringRef getCoverageSection(const CodeGenModule &CGM) {
+StringRef getCoverageSection(const CodeGenModule &CGM) {
   return llvm::getInstrProfCoverageSectionName(isMachO(CGM));
 }
 
+std::string normalizeFilename(StringRef Filename) {
+  llvm::SmallString<256> AbsPath(Filename); // Scratch buffer, edited in place.
+  llvm::sys::fs::make_absolute(AbsPath);
+  llvm::sys::path::remove_dots(AbsPath, /*remove_dot_dots=*/true);
+  return std::string(AbsPath.begin(), AbsPath.end());
+}
+
+} // end anonymous namespace
+
 static void dump(llvm::raw_ostream &OS, StringRef FunctionName,
                  ArrayRef<CounterExpression> Expressions,
                  ArrayRef<CounterMappingRegion> Regions) {
@@ -971,7 +1014,14 @@
     llvm::SmallVector<StringRef, 16> FilenameRefs;
+    // normalizeFilename() returns a temporary std::string; keep the strings
+    // alive here so the StringRefs handed to the reader do not dangle.
+    llvm::SmallVector<std::string, 16> FilenameStrs;
+    FilenameStrs.resize(FileEntries.size());
     FilenameRefs.resize(FileEntries.size());
-    for (const auto &Entry : FileEntries)
-      FilenameRefs[Entry.second] = Entry.first->getName();
+    for (const auto &Entry : FileEntries) {
+      auto I = Entry.second;
+      FilenameStrs[I] = normalizeFilename(Entry.first->getName());
+      FilenameRefs[I] = FilenameStrs[I];
+    }
     RawCoverageMappingReader Reader(CoverageMapping, FilenameRefs, Filenames,
                                     Expressions, Regions);
     if (Reader.read())
@@ -992,11 +1035,8 @@
   FilenameStrs.resize(FileEntries.size());
   FilenameRefs.resize(FileEntries.size());
   for (const auto &Entry : FileEntries) {
-    llvm::SmallString<256> Path(Entry.first->getName());
-    llvm::sys::fs::make_absolute(Path);
-
     auto I = Entry.second;
-    FilenameStrs[I] = std::string(Path.begin(), Path.end());
+    FilenameStrs[I] = normalizeFilename(Entry.first->getName());
     FilenameRefs[I] = FilenameStrs[I];
   }
 
diff --git a/lib/CodeGen/CoverageMappingGen.h b/lib/CodeGen/CoverageMappingGen.h
index c202fe8..b6789c2 100644
--- a/lib/CodeGen/CoverageMappingGen.h
+++ b/lib/CodeGen/CoverageMappingGen.h
@@ -19,7 +19,6 @@
 #include "clang/Frontend/CodeGenOptions.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/raw_ostream.h"
 
diff --git a/lib/CodeGen/EHScopeStack.h b/lib/CodeGen/EHScopeStack.h
index 8352c75..4717a66 100644
--- a/lib/CodeGen/EHScopeStack.h
+++ b/lib/CodeGen/EHScopeStack.h
@@ -89,7 +89,10 @@
   InactiveCleanup = 0x4,
   InactiveEHCleanup = EHCleanup | InactiveCleanup,
   InactiveNormalCleanup = NormalCleanup | InactiveCleanup,
-  InactiveNormalAndEHCleanup = NormalAndEHCleanup | InactiveCleanup
+  InactiveNormalAndEHCleanup = NormalAndEHCleanup | InactiveCleanup,
+
+  LifetimeMarker = 0x8,
+  NormalEHLifetimeMarker = LifetimeMarker | NormalAndEHCleanup,
 };
 
 /// A stack of scopes which respond to exceptions, including cleanups
diff --git a/lib/CodeGen/ItaniumCXXABI.cpp b/lib/CodeGen/ItaniumCXXABI.cpp
index e016b99..2492c82 100644
--- a/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/lib/CodeGen/ItaniumCXXABI.cpp
@@ -443,6 +443,7 @@
            (isa<CXXDestructorDecl>(GD.getDecl()) &&
             GD.getDtorType() != Dtor_Deleting);
   }
+  bool canCallMismatchedFunctionType() const override { return false; }
 };
 }
 
@@ -1389,6 +1390,10 @@
 }
 
 void ItaniumCXXABI::EmitInstanceFunctionProlog(CodeGenFunction &CGF) {
+  // Naked functions have no prolog.
+  if (CGF.CurFuncDecl && CGF.CurFuncDecl->hasAttr<NakedAttr>())
+    return;
+
   /// Initialize the 'this' slot.
   EmitThisParam(CGF);
 
@@ -1489,7 +1494,7 @@
     EmitFundamentalRTTIDescriptors();
 
   if (!VTable->isDeclarationForLinker())
-    CGM.EmitVTableBitSetEntries(VTable, VTLayout);
+    CGM.EmitVTableTypeMetadata(VTable, VTLayout);
 }
 
 bool ItaniumCXXABI::isVirtualOffsetNeededForVTableField(
@@ -1521,8 +1526,8 @@
                               .getVTableLayout(VTableClass)
                               .getAddressPoint(Base);
   llvm::Value *Indices[] = {
-    llvm::ConstantInt::get(CGM.Int64Ty, 0),
-    llvm::ConstantInt::get(CGM.Int64Ty, AddressPoint)
+    llvm::ConstantInt::get(CGM.Int32Ty, 0),
+    llvm::ConstantInt::get(CGM.Int32Ty, AddressPoint)
   };
 
   return llvm::ConstantExpr::getInBoundsGetElementPtr(VTable->getValueType(),
@@ -1574,7 +1579,7 @@
 
   VTable = CGM.CreateOrReplaceCXXRuntimeVariable(
       Name, ArrayType, llvm::GlobalValue::ExternalLinkage);
-  VTable->setUnnamedAddr(true);
+  VTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
 
   if (RD->hasAttr<DLLImportAttr>())
     VTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
@@ -1594,14 +1599,18 @@
   auto *MethodDecl = cast<CXXMethodDecl>(GD.getDecl());
   llvm::Value *VTable = CGF.GetVTablePtr(This, Ty, MethodDecl->getParent());
 
-  if (CGF.SanOpts.has(SanitizerKind::CFIVCall))
-    CGF.EmitVTablePtrCheckForCall(MethodDecl, VTable,
-                                  CodeGenFunction::CFITCK_VCall, Loc);
-
   uint64_t VTableIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(GD);
-  llvm::Value *VFuncPtr =
-      CGF.Builder.CreateConstInBoundsGEP1_64(VTable, VTableIndex, "vfn");
-  return CGF.Builder.CreateAlignedLoad(VFuncPtr, CGF.getPointerAlign());
+  if (CGF.ShouldEmitVTableTypeCheckedLoad(MethodDecl->getParent())) {
+    return CGF.EmitVTableTypeCheckedLoad(
+        MethodDecl->getParent(), VTable,
+        VTableIndex * CGM.getContext().getTargetInfo().getPointerWidth(0) / 8);
+  } else {
+    CGF.EmitTypeMetadataCodeForVCall(MethodDecl->getParent(), VTable, Loc);
+
+    llvm::Value *VFuncPtr =
+        CGF.Builder.CreateConstInBoundsGEP1_64(VTable, VTableIndex, "vfn");
+    return CGF.Builder.CreateAlignedLoad(VFuncPtr, CGF.getPointerAlign());
+  }
 }
 
 llvm::Value *ItaniumCXXABI::EmitVirtualDestructorCall(
@@ -1906,10 +1915,18 @@
                                     bool shouldPerformInit) {
   CGBuilderTy &Builder = CGF.Builder;
 
-  // We only need to use thread-safe statics for local non-TLS variables;
-  // global initialization is always single-threaded.
+  // Inline variables that weren't instantiated from variable templates have
+  // partially-ordered initialization within their translation unit.
+  bool NonTemplateInline =
+      D.isInline() &&
+      !isTemplateInstantiation(D.getTemplateSpecializationKind());
+
+  // We only need to use thread-safe statics for local non-TLS variables and
+  // inline variables; other global initialization is always single-threaded
+  // or (through lazy dynamic loading in multiple threads) unsequenced.
   bool threadsafe = getContext().getLangOpts().ThreadsafeStatics &&
-                    D.isLocalVarDecl() && !D.getTLSKind();
+                    (D.isLocalVarDecl() || NonTemplateInline) &&
+                    !D.getTLSKind();
 
   // If we have a global variable with internal linkage and thread-safe statics
   // are disabled, we can just let the guard variable be of type i8.
@@ -1963,7 +1980,11 @@
     if (!D.isLocalVarDecl() && C &&
         CGM.getTarget().getTriple().isOSBinFormatELF()) {
       guard->setComdat(C);
-      CGF.CurFn->setComdat(C);
+      // An inline variable's guard function is run from the per-TU
+      // initialization function, not via a dedicated global ctor function, so
+      // we can't put it in a comdat.
+      if (!NonTemplateInline)
+        CGF.CurFn->setComdat(C);
     } else if (CGM.supportsCOMDAT() && guard->isWeakForLinker()) {
       guard->setComdat(CGM.getModule().getOrInsertComdat(guard->getName()));
     }
@@ -2001,7 +2022,7 @@
   //
   // In LLVM, we do this by marking the load Acquire.
   if (threadsafe)
-    LI->setAtomic(llvm::Acquire);
+    LI->setAtomic(llvm::AtomicOrdering::Acquire);
 
   // For ARM, we should only check the first bit, rather than the entire byte:
   //
@@ -2325,8 +2346,7 @@
   llvm::Function *Wrapper = getOrCreateThreadLocalWrapper(VD, Val);
 
   llvm::CallInst *CallVal = CGF.Builder.CreateCall(Wrapper);
-  if (isThreadWrapperReplaceable(VD, CGF.CGM))
-    CallVal->setCallingConv(llvm::CallingConv::CXX_FAST_TLS);
+  CallVal->setCallingConv(Wrapper->getCallingConv());
 
   LValue LV;
   if (VD->getType()->isReferenceType())
@@ -2505,6 +2525,11 @@
   //   long, unsigned long, long long, unsigned long long, float, double,
   //   long double, char16_t, char32_t, and the IEEE 754r decimal and
   //   half-precision floating point types.
+  //
+  // GCC also emits RTTI for __int128.
+  // FIXME: We do not emit RTTI information for decimal types here.
+
+  // Types added here must also be added to EmitFundamentalRTTIDescriptors.
   switch (Ty->getKind()) {
     case BuiltinType::Void:
     case BuiltinType::NullPtr:
@@ -2527,29 +2552,23 @@
     case BuiltinType::Float:
     case BuiltinType::Double:
     case BuiltinType::LongDouble:
+    case BuiltinType::Float128:
     case BuiltinType::Char16:
     case BuiltinType::Char32:
     case BuiltinType::Int128:
     case BuiltinType::UInt128:
-    case BuiltinType::OCLImage1d:
-    case BuiltinType::OCLImage1dArray:
-    case BuiltinType::OCLImage1dBuffer:
-    case BuiltinType::OCLImage2d:
-    case BuiltinType::OCLImage2dArray:
-    case BuiltinType::OCLImage2dDepth:
-    case BuiltinType::OCLImage2dArrayDepth:
-    case BuiltinType::OCLImage2dMSAA:
-    case BuiltinType::OCLImage2dArrayMSAA:
-    case BuiltinType::OCLImage2dMSAADepth:
-    case BuiltinType::OCLImage2dArrayMSAADepth:
-    case BuiltinType::OCLImage3d:
+      return true;
+
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
     case BuiltinType::OCLSampler:
     case BuiltinType::OCLEvent:
     case BuiltinType::OCLClkEvent:
     case BuiltinType::OCLQueue:
     case BuiltinType::OCLNDRange:
     case BuiltinType::OCLReserveID:
-      return true;
+      return false;
 
     case BuiltinType::Dependent:
 #define BUILTIN_TYPE(Id, SingletonId)
@@ -2878,7 +2897,7 @@
 
 llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(QualType Ty, bool Force) {
   // We want to operate on the canonical type.
-  Ty = CGM.getContext().getCanonicalType(Ty);
+  Ty = Ty.getCanonicalType();
 
   // Check if we've already emitted an RTTI descriptor for this type.
   SmallString<256> Name;
@@ -3341,6 +3360,7 @@
 }
 
 void ItaniumCXXABI::EmitFundamentalRTTIDescriptors() {
+  // Types added here must also be added to TypeInfoIsInStandardLibrary.
   QualType FundamentalTypes[] = {
       getContext().VoidTy,             getContext().NullPtrTy,
       getContext().BoolTy,             getContext().WCharTy,
@@ -3349,10 +3369,11 @@
       getContext().UnsignedShortTy,    getContext().IntTy,
       getContext().UnsignedIntTy,      getContext().LongTy,
       getContext().UnsignedLongTy,     getContext().LongLongTy,
-      getContext().UnsignedLongLongTy, getContext().HalfTy,
+      getContext().UnsignedLongLongTy, getContext().Int128Ty,
+      getContext().UnsignedInt128Ty,   getContext().HalfTy,
       getContext().FloatTy,            getContext().DoubleTy,
-      getContext().LongDoubleTy,       getContext().Char16Ty,
-      getContext().Char32Ty,
+      getContext().LongDoubleTy,       getContext().Float128Ty,
+      getContext().Char16Ty,           getContext().Char32Ty
   };
   for (const QualType &FundamentalType : FundamentalTypes)
     EmitFundamentalRTTIDescriptor(FundamentalType);
diff --git a/lib/CodeGen/Makefile b/lib/CodeGen/Makefile
deleted file mode 100644
index 6032dff..0000000
--- a/lib/CodeGen/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-##===- clang/lib/CodeGen/Makefile --------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-# This implements the AST -> LLVM code generation library for the 
-# C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangCodeGen
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/CodeGen/MicrosoftCXXABI.cpp b/lib/CodeGen/MicrosoftCXXABI.cpp
index 71fc207..28312fc 100644
--- a/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -98,7 +98,7 @@
     const VBTableGlobals &VBGlobals = enumerateVBTables(RD);
     for (const VPtrInfo *VBT : *VBGlobals.VBTables) {
       const ASTRecordLayout &SubobjectLayout =
-          Context.getASTRecordLayout(VBT->BaseWithVPtr);
+          Context.getASTRecordLayout(VBT->IntroducingObject);
       CharUnits Offs = VBT->NonVirtualOffset;
       Offs += SubobjectLayout.getVBPtrOffset();
       if (VBT->getVBaseWithVPtr())
@@ -254,8 +254,8 @@
                           CXXDtorType Type, bool ForVirtualBase,
                           bool Delegating, Address This) override;
 
-  void emitVTableBitSetEntries(VPtrInfo *Info, const CXXRecordDecl *RD,
-                               llvm::GlobalVariable *VTable);
+  void emitVTableTypeMetadata(VPtrInfo *Info, const CXXRecordDecl *RD,
+                              llvm::GlobalVariable *VTable);
 
   void emitVTableDefinitions(CodeGenVTables &CGVT,
                              const CXXRecordDecl *RD) override;
@@ -551,7 +551,7 @@
     return  llvm::Constant::getAllOnesValue(CGM.IntTy);
   }
 
-  CharUnits getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD);
+  CharUnits getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD) override;
 
   void
   GetNullMemberPointerFields(const MemberPointerType *MPT,
@@ -942,7 +942,6 @@
   llvm::Value *Offset =
     GetVirtualBaseClassOffset(CGF, Value, SrcDecl, PolymorphicBase);
   llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(Value.getPointer(), Offset);
-  Offset = CGF.Builder.CreateTrunc(Offset, CGF.Int32Ty);
   CharUnits VBaseAlign =
     CGF.CGM.getVBaseAlignment(Value.getAlignment(), SrcDecl, PolymorphicBase);
   return std::make_pair(Address(Ptr, VBaseAlign), Offset);
@@ -976,8 +975,8 @@
                                          QualType SrcRecordTy,
                                          Address ThisPtr,
                                          llvm::Type *StdTypeInfoPtrTy) {
-  llvm::Value *Offset;
-  std::tie(ThisPtr, Offset) = performBaseAdjustment(CGF, ThisPtr, SrcRecordTy);
+  std::tie(ThisPtr, std::ignore) =
+      performBaseAdjustment(CGF, ThisPtr, SrcRecordTy);
   auto Typeid = emitRTtypeidCall(CGF, ThisPtr.getPointer()).getInstruction();
   return CGF.Builder.CreateBitCast(Typeid, StdTypeInfoPtrTy);
 }
@@ -1002,6 +1001,7 @@
   llvm::Value *Offset;
   std::tie(This, Offset) = performBaseAdjustment(CGF, This, SrcRecordTy);
   llvm::Value *ThisPtr = This.getPointer();
+  Offset = CGF.Builder.CreateTrunc(Offset, CGF.Int32Ty);
 
   // PVOID __RTDynamicCast(
   //   PVOID inptr,
@@ -1025,8 +1025,7 @@
 MicrosoftCXXABI::EmitDynamicCastToVoid(CodeGenFunction &CGF, Address Value,
                                        QualType SrcRecordTy,
                                        QualType DestTy) {
-  llvm::Value *Offset;
-  std::tie(Value, Offset) = performBaseAdjustment(CGF, Value, SrcRecordTy);
+  std::tie(Value, std::ignore) = performBaseAdjustment(CGF, Value, SrcRecordTy);
 
   // PVOID __RTCastToVoid(
   //   PVOID inptr)
@@ -1152,16 +1151,14 @@
 
     llvm::Value *VBaseOffset =
         GetVirtualBaseClassOffset(CGF, getThisAddress(CGF), RD, I->first);
-    // FIXME: it doesn't look right that we SExt in GetVirtualBaseClassOffset()
-    // just to Trunc back immediately.
-    VBaseOffset = Builder.CreateTruncOrBitCast(VBaseOffset, CGF.Int32Ty);
     uint64_t ConstantVBaseOffset =
         Layout.getVBaseClassOffset(I->first).getQuantity();
 
     // vtorDisp_for_vbase = vbptr[vbase_idx] - offsetof(RD, vbase).
     llvm::Value *VtorDispValue = Builder.CreateSub(
-        VBaseOffset, llvm::ConstantInt::get(CGM.Int32Ty, ConstantVBaseOffset),
+        VBaseOffset, llvm::ConstantInt::get(CGM.PtrDiffTy, ConstantVBaseOffset),
         "vtordisp.value");
+    VtorDispValue = Builder.CreateTruncOrBitCast(VtorDispValue, CGF.Int32Ty);
 
     if (!Int8This)
       Int8This = Builder.CreateBitCast(getThisValue(CGF),
@@ -1214,7 +1211,7 @@
     const VPtrInfo *VBT = (*VBGlobals.VBTables)[I];
     llvm::GlobalVariable *GV = VBGlobals.Globals[I];
     const ASTRecordLayout &SubobjectLayout =
-        Context.getASTRecordLayout(VBT->BaseWithVPtr);
+        Context.getASTRecordLayout(VBT->IntroducingObject);
     CharUnits Offs = VBT->NonVirtualOffset;
     Offs += SubobjectLayout.getVBPtrOffset();
     if (VBT->getVBaseWithVPtr())
@@ -1223,7 +1220,7 @@
     llvm::Value *GVPtr =
         CGF.Builder.CreateConstInBoundsGEP2_32(GV->getValueType(), GV, 0, 0);
     VBPtr = CGF.Builder.CreateElementBitCast(VBPtr, GVPtr->getType(),
-                                      "vbptr." + VBT->ReusingBase->getName());
+                                      "vbptr." + VBT->ObjectWithVPtr->getName());
     CGF.Builder.CreateStore(GVPtr, VBPtr);
   }
 }
@@ -1420,6 +1417,10 @@
 }
 
 void MicrosoftCXXABI::EmitInstanceFunctionProlog(CodeGenFunction &CGF) {
+  // Naked functions have no prolog.
+  if (CGF.CurFuncDecl && CGF.CurFuncDecl->hasAttr<NakedAttr>())
+    return;
+
   EmitThisParam(CGF);
 
   /// If this is a function that the ABI specifies returns 'this', initialize
@@ -1467,16 +1468,18 @@
 
   // Add the 'most_derived' argument second if we are variadic or last if not.
   const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
-  llvm::Value *MostDerivedArg =
-      llvm::ConstantInt::get(CGM.Int32Ty, Type == Ctor_Complete);
-  RValue RV = RValue::get(MostDerivedArg);
-  if (MostDerivedArg) {
-    if (FPT->isVariadic())
-      Args.insert(Args.begin() + 1,
-                  CallArg(RV, getContext().IntTy, /*needscopy=*/false));
-    else
-      Args.add(RV, getContext().IntTy);
+  llvm::Value *MostDerivedArg;
+  if (Delegating) {
+    MostDerivedArg = getStructorImplicitParamValue(CGF);
+  } else {
+    MostDerivedArg = llvm::ConstantInt::get(CGM.Int32Ty, Type == Ctor_Complete);
   }
+  RValue RV = RValue::get(MostDerivedArg);
+  if (FPT->isVariadic())
+    Args.insert(Args.begin() + 1,
+                CallArg(RV, getContext().IntTy, /*needscopy=*/false));
+  else
+    Args.add(RV, getContext().IntTy);
 
   return 1;  // Added one arg.
 }
@@ -1494,24 +1497,18 @@
                                                     This, false);
   }
 
-  CGF.EmitCXXStructorCall(DD, Callee, ReturnValueSlot(), This.getPointer(),
-                          /*ImplicitParam=*/nullptr,
-                          /*ImplicitParamTy=*/QualType(), nullptr,
-                          getFromDtorType(Type));
+  CGF.EmitCXXDestructorCall(DD, Callee, This.getPointer(),
+                            /*ImplicitParam=*/nullptr,
+                            /*ImplicitParamTy=*/QualType(), nullptr,
+                            getFromDtorType(Type));
 }
 
-void MicrosoftCXXABI::emitVTableBitSetEntries(VPtrInfo *Info,
-                                              const CXXRecordDecl *RD,
-                                              llvm::GlobalVariable *VTable) {
-  if (!getContext().getLangOpts().Sanitize.has(SanitizerKind::CFIVCall) &&
-      !getContext().getLangOpts().Sanitize.has(SanitizerKind::CFINVCall) &&
-      !getContext().getLangOpts().Sanitize.has(SanitizerKind::CFIDerivedCast) &&
-      !getContext().getLangOpts().Sanitize.has(SanitizerKind::CFIUnrelatedCast))
+void MicrosoftCXXABI::emitVTableTypeMetadata(VPtrInfo *Info,
+                                             const CXXRecordDecl *RD,
+                                             llvm::GlobalVariable *VTable) {
+  if (!CGM.getCodeGenOpts().PrepareForLTO)
     return;
 
-  llvm::NamedMDNode *BitsetsMD =
-      CGM.getModule().getOrInsertNamedMetadata("llvm.bitsets");
-
   // The location of the first virtual function pointer in the virtual table,
   // aka the "address point" on Itanium. This is at offset 0 if RTTI is
   // disabled, or sizeof(void*) if RTTI is enabled.
@@ -1521,22 +1518,20 @@
                 getContext().getTargetInfo().getPointerWidth(0))
           : CharUnits::Zero();
 
-  if (Info->PathToBaseWithVPtr.empty()) {
-    if (!CGM.IsCFIBlacklistedRecord(RD))
-      CGM.CreateVTableBitSetEntry(BitsetsMD, VTable, AddressPoint, RD);
+  if (Info->PathToIntroducingObject.empty()) {
+    CGM.AddVTableTypeMetadata(VTable, AddressPoint, RD);
     return;
   }
 
   // Add a bitset entry for the least derived base belonging to this vftable.
-  if (!CGM.IsCFIBlacklistedRecord(Info->PathToBaseWithVPtr.back()))
-    CGM.CreateVTableBitSetEntry(BitsetsMD, VTable, AddressPoint,
-                                Info->PathToBaseWithVPtr.back());
+  CGM.AddVTableTypeMetadata(VTable, AddressPoint,
+                            Info->PathToIntroducingObject.back());
 
   // Add a bitset entry for each derived class that is laid out at the same
   // offset as the least derived base.
-  for (unsigned I = Info->PathToBaseWithVPtr.size() - 1; I != 0; --I) {
-    const CXXRecordDecl *DerivedRD = Info->PathToBaseWithVPtr[I - 1];
-    const CXXRecordDecl *BaseRD = Info->PathToBaseWithVPtr[I];
+  for (unsigned I = Info->PathToIntroducingObject.size() - 1; I != 0; --I) {
+    const CXXRecordDecl *DerivedRD = Info->PathToIntroducingObject[I - 1];
+    const CXXRecordDecl *BaseRD = Info->PathToIntroducingObject[I];
 
     const ASTRecordLayout &Layout =
         getContext().getASTRecordLayout(DerivedRD);
@@ -1548,13 +1543,12 @@
       Offset = VBI->second.VBaseOffset;
     if (!Offset.isZero())
       return;
-    if (!CGM.IsCFIBlacklistedRecord(DerivedRD))
-      CGM.CreateVTableBitSetEntry(BitsetsMD, VTable, AddressPoint, DerivedRD);
+    CGM.AddVTableTypeMetadata(VTable, AddressPoint, DerivedRD);
   }
 
   // Finally do the same for the most derived class.
-  if (Info->FullOffsetInMDC.isZero() && !CGM.IsCFIBlacklistedRecord(RD))
-    CGM.CreateVTableBitSetEntry(BitsetsMD, VTable, AddressPoint, RD);
+  if (Info->FullOffsetInMDC.isZero())
+    CGM.AddVTableTypeMetadata(VTable, AddressPoint, RD);
 }
 
 void MicrosoftCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
@@ -1567,12 +1561,14 @@
     if (VTable->hasInitializer())
       continue;
 
-    llvm::Constant *RTTI = getContext().getLangOpts().RTTIData
-                               ? getMSCompleteObjectLocator(RD, Info)
-                               : nullptr;
-
     const VTableLayout &VTLayout =
       VFTContext.getVFTableLayout(RD, Info->FullOffsetInMDC);
+
+    llvm::Constant *RTTI = nullptr;
+    if (any_of(VTLayout.vtable_components(),
+               [](const VTableComponent &VTC) { return VTC.isRTTIKind(); }))
+      RTTI = getMSCompleteObjectLocator(RD, Info);
+
     llvm::Constant *Init = CGVT.CreateVTableInitializer(
         RD, VTLayout.vtable_component_begin(),
         VTLayout.getNumVTableComponents(), VTLayout.vtable_thunk_begin(),
@@ -1580,7 +1576,7 @@
 
     VTable->setInitializer(Init);
 
-    emitVTableBitSetEntries(Info, RD, VTable);
+    emitVTableTypeMetadata(Info, RD, VTable);
   }
 }
 
@@ -1671,7 +1667,16 @@
   SmallString<256> VFTableName;
   mangleVFTableName(getMangleContext(), RD, VFPtr, VFTableName);
 
-  llvm::GlobalValue::LinkageTypes VFTableLinkage = CGM.getVTableLinkage(RD);
+  // Classes marked __declspec(dllimport) need vftables generated on the
+  // import-side in order to support features like constexpr.  No other
+  // translation unit relies on the emission of the local vftable, translation
+  // units are expected to generate them as needed.
+  //
+  // Because of this unique behavior, we maintain this logic here instead of
+  // getVTableLinkage.
+  llvm::GlobalValue::LinkageTypes VFTableLinkage =
+      RD->hasAttr<DLLImportAttr>() ? llvm::GlobalValue::LinkOnceODRLinkage
+                                   : CGM.getVTableLinkage(RD);
   bool VFTableComesFromAnotherTU =
       llvm::GlobalValue::isAvailableExternallyLinkage(VFTableLinkage) ||
       llvm::GlobalValue::isExternalLinkage(VFTableLinkage);
@@ -1705,7 +1710,7 @@
   VTable = new llvm::GlobalVariable(CGM.getModule(), VTableType,
                                     /*isConstant=*/true, VTableLinkage,
                                     /*Initializer=*/nullptr, VTableName);
-  VTable->setUnnamedAddr(true);
+  VTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
 
   llvm::Comdat *C = nullptr;
   if (!VFTableComesFromAnotherTU &&
@@ -1733,7 +1738,7 @@
                                         /*AddressSpace=*/0, VFTableLinkage,
                                         VFTableName.str(), VTableGEP,
                                         &CGM.getModule());
-    VFTable->setUnnamedAddr(true);
+    VFTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   } else {
     // We don't need a GlobalAlias to be a symbol for the VTable if we won't
     // be referencing any RTTI data.
@@ -1744,9 +1749,7 @@
   if (C)
     VTable->setComdat(C);
 
-  if (RD->hasAttr<DLLImportAttr>())
-    VFTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
-  else if (RD->hasAttr<DLLExportAttr>())
+  if (RD->hasAttr<DLLExportAttr>())
     VFTable->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
 
   VFTablesMap[ID] = VFTable;
@@ -1813,13 +1816,20 @@
 
   MicrosoftVTableContext::MethodVFTableLocation ML =
       CGM.getMicrosoftVTableContext().getMethodVFTableLocation(GD);
-  if (CGF.SanOpts.has(SanitizerKind::CFIVCall))
-    CGF.EmitVTablePtrCheck(getClassAtVTableLocation(getContext(), GD, ML),
-                           VTable, CodeGenFunction::CFITCK_VCall, Loc);
 
-  llvm::Value *VFuncPtr =
-      Builder.CreateConstInBoundsGEP1_64(VTable, ML.Index, "vfn");
-  return Builder.CreateAlignedLoad(VFuncPtr, CGF.getPointerAlign());
+  if (CGF.ShouldEmitVTableTypeCheckedLoad(MethodDecl->getParent())) {
+    return CGF.EmitVTableTypeCheckedLoad(
+        getClassAtVTableLocation(getContext(), GD, ML), VTable,
+        ML.Index * CGM.getContext().getTargetInfo().getPointerWidth(0) / 8);
+  } else {
+    if (CGM.getCodeGenOpts().PrepareForLTO)
+      CGF.EmitTypeMetadataCodeForVCall(
+          getClassAtVTableLocation(getContext(), GD, ML), VTable, Loc);
+
+    llvm::Value *VFuncPtr =
+        Builder.CreateConstInBoundsGEP1_64(VTable, ML.Index, "vfn");
+    return Builder.CreateAlignedLoad(VFuncPtr, CGF.getPointerAlign());
+  }
 }
 
 llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall(
@@ -1843,10 +1853,9 @@
       DtorType == Dtor_Deleting);
 
   This = adjustThisArgumentForVirtualFunctionCall(CGF, GD, This, true);
-  RValue RV = CGF.EmitCXXStructorCall(Dtor, Callee, ReturnValueSlot(),
-                                      This.getPointer(),
-                                      ImplicitParam, Context.IntTy, CE,
-                                      StructorType::Deleting);
+  RValue RV =
+      CGF.EmitCXXDestructorCall(Dtor, Callee, This.getPointer(), ImplicitParam,
+                                Context.IntTy, CE, StructorType::Deleting);
   return RV.getScalarVal();
 }
 
@@ -1916,7 +1925,7 @@
   ThunkFn->addFnAttr("thunk");
 
   // These thunks can be compared, so they are not unnamed.
-  ThunkFn->setUnnamedAddr(false);
+  ThunkFn->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
 
   // Start codegen.
   CodeGenFunction CGF(CGM);
@@ -1967,13 +1976,13 @@
   StringRef Name = OutName.str();
 
   llvm::ArrayType *VBTableType =
-      llvm::ArrayType::get(CGM.IntTy, 1 + VBT.ReusingBase->getNumVBases());
+      llvm::ArrayType::get(CGM.IntTy, 1 + VBT.ObjectWithVPtr->getNumVBases());
 
   assert(!CGM.getModule().getNamedGlobal(Name) &&
          "vbtable with this name already exists: mangling bug?");
   llvm::GlobalVariable *GV =
       CGM.CreateOrReplaceCXXRuntimeVariable(Name, VBTableType, Linkage);
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
 
   if (RD->hasAttr<DLLImportAttr>())
     GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
@@ -1989,24 +1998,24 @@
 void MicrosoftCXXABI::emitVBTableDefinition(const VPtrInfo &VBT,
                                             const CXXRecordDecl *RD,
                                             llvm::GlobalVariable *GV) const {
-  const CXXRecordDecl *ReusingBase = VBT.ReusingBase;
+  const CXXRecordDecl *ObjectWithVPtr = VBT.ObjectWithVPtr;
 
-  assert(RD->getNumVBases() && ReusingBase->getNumVBases() &&
+  assert(RD->getNumVBases() && ObjectWithVPtr->getNumVBases() &&
          "should only emit vbtables for classes with vbtables");
 
   const ASTRecordLayout &BaseLayout =
-      getContext().getASTRecordLayout(VBT.BaseWithVPtr);
+      getContext().getASTRecordLayout(VBT.IntroducingObject);
   const ASTRecordLayout &DerivedLayout = getContext().getASTRecordLayout(RD);
 
-  SmallVector<llvm::Constant *, 4> Offsets(1 + ReusingBase->getNumVBases(),
+  SmallVector<llvm::Constant *, 4> Offsets(1 + ObjectWithVPtr->getNumVBases(),
                                            nullptr);
 
-  // The offset from ReusingBase's vbptr to itself always leads.
+  // The offset from ObjectWithVPtr's vbptr to itself always leads.
   CharUnits VBPtrOffset = BaseLayout.getVBPtrOffset();
   Offsets[0] = llvm::ConstantInt::get(CGM.IntTy, -VBPtrOffset.getQuantity());
 
   MicrosoftVTableContext &Context = CGM.getMicrosoftVTableContext();
-  for (const auto &I : ReusingBase->vbases()) {
+  for (const auto &I : ObjectWithVPtr->vbases()) {
     const CXXRecordDecl *VBase = I.getType()->getAsCXXRecordDecl();
     CharUnits Offset = DerivedLayout.getVBaseClassOffset(VBase);
     assert(!Offset.isNegative());
@@ -2018,7 +2027,7 @@
           DerivedLayout.getVBaseClassOffset(VBT.getVBaseWithVPtr());
     Offset -= CompleteVBPtrOffset;
 
-    unsigned VBIndex = Context.getVBTableIndex(ReusingBase, VBase);
+    unsigned VBIndex = Context.getVBTableIndex(ObjectWithVPtr, VBase);
     assert(Offsets[VBIndex] == nullptr && "The same vbindex seen twice?");
     Offsets[VBIndex] = llvm::ConstantInt::get(CGM.IntTy, Offset.getQuantity());
   }
@@ -2030,6 +2039,9 @@
     llvm::ArrayType::get(CGM.IntTy, Offsets.size());
   llvm::Constant *Init = llvm::ConstantArray::get(VBTableType, Offsets);
   GV->setInitializer(Init);
+
+  if (RD->hasAttr<DLLImportAttr>())
+    GV->setLinkage(llvm::GlobalVariable::AvailableExternallyLinkage);
 }
 
 llvm::Value *MicrosoftCXXABI::performThisAdjustment(CodeGenFunction &CGF,
@@ -2302,7 +2314,7 @@
     CGBuilderTy &Builder = CGF.Builder;
     llvm::LoadInst *LI = Builder.CreateLoad(Guard);
     llvm::ConstantInt *Mask =
-        llvm::ConstantInt::get(CGF.IntTy, ~(1U << GuardNum));
+        llvm::ConstantInt::get(CGF.IntTy, ~(1ULL << GuardNum));
     Builder.CreateStore(Builder.CreateAnd(LI, Mask), Guard);
   }
 };
@@ -2415,7 +2427,7 @@
     // }
 
     // Test our bit from the guard variable.
-    llvm::ConstantInt *Bit = llvm::ConstantInt::get(GuardTy, 1U << GuardNum);
+    llvm::ConstantInt *Bit = llvm::ConstantInt::get(GuardTy, 1ULL << GuardNum);
     llvm::LoadInst *LI = Builder.CreateLoad(GuardAddr);
     llvm::Value *IsInitialized =
         Builder.CreateICmpNE(Builder.CreateAnd(LI, Bit), Zero);
@@ -3631,7 +3643,8 @@
 }
 
 static QualType decomposeTypeForEH(ASTContext &Context, QualType T,
-                                   bool &IsConst, bool &IsVolatile) {
+                                   bool &IsConst, bool &IsVolatile,
+                                   bool &IsUnaligned) {
   T = Context.getExceptionObjectType(T);
 
   // C++14 [except.handle]p3:
@@ -3641,10 +3654,12 @@
   //         - a qualification conversion
   IsConst = false;
   IsVolatile = false;
+  IsUnaligned = false;
   QualType PointeeType = T->getPointeeType();
   if (!PointeeType.isNull()) {
     IsConst = PointeeType.isConstQualified();
     IsVolatile = PointeeType.isVolatileQualified();
+    IsUnaligned = PointeeType.getQualifiers().hasUnaligned();
   }
 
   // Member pointer types like "const int A::*" are represented by having RTTI
@@ -3667,8 +3682,9 @@
   // TypeDescriptors for exceptions never have qualified pointer types,
   // qualifiers are stored seperately in order to support qualification
   // conversions.
-  bool IsConst, IsVolatile;
-  Type = decomposeTypeForEH(getContext(), Type, IsConst, IsVolatile);
+  bool IsConst, IsVolatile, IsUnaligned;
+  Type =
+      decomposeTypeForEH(getContext(), Type, IsConst, IsVolatile, IsUnaligned);
 
   bool IsReference = CatchHandlerType->isReferenceType();
 
@@ -3677,6 +3693,8 @@
     Flags |= 1;
   if (IsVolatile)
     Flags |= 2;
+  if (IsUnaligned)
+    Flags |= 4;
   if (IsReference)
     Flags |= 8;
 
@@ -3961,7 +3979,7 @@
   auto *GV = new llvm::GlobalVariable(
       CGM.getModule(), CTType, /*Constant=*/true, getLinkageForRTTI(T),
       llvm::ConstantStruct::get(CTType, Fields), MangledName);
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   GV->setSection(".xdata");
   if (GV->isWeakForLinker())
     GV->setComdat(CGM.getModule().getOrInsertComdat(GV->getName()));
@@ -4079,7 +4097,7 @@
   CTA = new llvm::GlobalVariable(
       CGM.getModule(), CTAType, /*Constant=*/true, getLinkageForRTTI(T),
       llvm::ConstantStruct::get(CTAType, Fields), MangledName);
-  CTA->setUnnamedAddr(true);
+  CTA->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   CTA->setSection(".xdata");
   if (CTA->isWeakForLinker())
     CTA->setComdat(CGM.getModule().getOrInsertComdat(CTA->getName()));
@@ -4087,8 +4105,8 @@
 }
 
 llvm::GlobalVariable *MicrosoftCXXABI::getThrowInfo(QualType T) {
-  bool IsConst, IsVolatile;
-  T = decomposeTypeForEH(getContext(), T, IsConst, IsVolatile);
+  bool IsConst, IsVolatile, IsUnaligned;
+  T = decomposeTypeForEH(getContext(), T, IsConst, IsVolatile, IsUnaligned);
 
   // The CatchableTypeArray enumerates the various (CV-unqualified) types that
   // the exception object may be caught as.
@@ -4104,8 +4122,8 @@
   SmallString<256> MangledName;
   {
     llvm::raw_svector_ostream Out(MangledName);
-    getMangleContext().mangleCXXThrowInfo(T, IsConst, IsVolatile, NumEntries,
-                                          Out);
+    getMangleContext().mangleCXXThrowInfo(T, IsConst, IsVolatile, IsUnaligned,
+                                          NumEntries, Out);
   }
 
   // Reuse a previously generated ThrowInfo if we have generated an appropriate
@@ -4121,6 +4139,8 @@
     Flags |= 1;
   if (IsVolatile)
     Flags |= 2;
+  if (IsUnaligned)
+    Flags |= 4;
 
   // The cleanup-function (a destructor) must be called when the exception
   // object's lifetime ends.
@@ -4146,7 +4166,7 @@
   auto *GV = new llvm::GlobalVariable(
       CGM.getModule(), TIType, /*Constant=*/true, getLinkageForRTTI(T),
       llvm::ConstantStruct::get(TIType, Fields), StringRef(MangledName));
-  GV->setUnnamedAddr(true);
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
   GV->setSection(".xdata");
   if (GV->isWeakForLinker())
     GV->setComdat(CGM.getModule().getOrInsertComdat(GV->getName()));
diff --git a/lib/CodeGen/ModuleBuilder.cpp b/lib/CodeGen/ModuleBuilder.cpp
index 8c43925..952d162 100644
--- a/lib/CodeGen/ModuleBuilder.cpp
+++ b/lib/CodeGen/ModuleBuilder.cpp
@@ -38,13 +38,21 @@
     const CodeGenOptions CodeGenOpts;  // Intentionally copied in.
 
     unsigned HandlingTopLevelDecls;
+
+    /// Use this when emitting decls to block re-entrant decl emission. It will
+    /// emit all deferred decls on scope exit. Set EmitDeferred to false if decl
+    /// emission must be deferred longer, like at the end of a tag definition.
     struct HandlingTopLevelDeclRAII {
       CodeGeneratorImpl &Self;
-      HandlingTopLevelDeclRAII(CodeGeneratorImpl &Self) : Self(Self) {
+      bool EmitDeferred;
+      HandlingTopLevelDeclRAII(CodeGeneratorImpl &Self,
+                               bool EmitDeferred = true)
+          : Self(Self), EmitDeferred(EmitDeferred) {
         ++Self.HandlingTopLevelDecls;
       }
       ~HandlingTopLevelDeclRAII() {
-        if (--Self.HandlingTopLevelDecls == 0)
+        unsigned Level = --Self.HandlingTopLevelDecls;
+        if (Level == 0 && EmitDeferred)
           Self.EmitDeferredDecls();
       }
     };
@@ -111,15 +119,15 @@
       Ctx = &Context;
 
       M->setTargetTriple(Ctx->getTargetInfo().getTriple().getTriple());
-      M->setDataLayout(Ctx->getTargetInfo().getDataLayoutString());
+      M->setDataLayout(Ctx->getTargetInfo().getDataLayout());
       Builder.reset(new CodeGen::CodeGenModule(Context, HeaderSearchOpts,
                                                PreprocessorOpts, CodeGenOpts,
                                                *M, Diags, CoverageInfo));
 
       for (auto &&Lib : CodeGenOpts.DependentLibraries)
-        HandleDependentLibrary(Lib);
+        Builder->AddDependentLib(Lib);
       for (auto &&Opt : CodeGenOpts.LinkerOptions)
-        HandleLinkerOption(Opt);
+        Builder->AppendLinkerOptions(Opt);
     }
 
     void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) override {
@@ -155,12 +163,23 @@
       DeferredInlineMethodDefinitions.clear();
     }
 
-    void HandleInlineMethodDefinition(CXXMethodDecl *D) override {
+    void HandleInlineFunctionDefinition(FunctionDecl *D) override {
       if (Diags.hasErrorOccurred())
         return;
 
       assert(D->doesThisDeclarationHaveABody());
 
+      // Handle friend functions.
+      if (D->isInIdentifierNamespace(Decl::IDNS_OrdinaryFriend)) {
+        if (Ctx->getTargetInfo().getCXXABI().isMicrosoft()
+            && !D->getLexicalDeclContext()->isDependentContext())
+          Builder->EmitTopLevelDecl(D);
+        return;
+      }
+
+      // Otherwise, must be a method.
+      auto MD = cast<CXXMethodDecl>(D);
+
       // We may want to emit this definition. However, that decision might be
       // based on computing the linkage, and we have to defer that in case we
       // are inside of something that will change the method's final linkage,
@@ -169,13 +188,13 @@
       //     void bar();
       //     void foo() { bar(); }
       //   } A;
-      DeferredInlineMethodDefinitions.push_back(D);
+      DeferredInlineMethodDefinitions.push_back(MD);
 
       // Provide some coverage mapping even for methods that aren't emitted.
       // Don't do this for templated classes though, as they may not be
       // instantiable.
-      if (!D->getParent()->getDescribedClassTemplate())
-        Builder->AddDeferredUnusedCoverageMapping(D);
+      if (!MD->getParent()->getDescribedClassTemplate())
+        Builder->AddDeferredUnusedCoverageMapping(MD);
     }
 
     /// HandleTagDeclDefinition - This callback is invoked each time a TagDecl
@@ -186,6 +205,10 @@
       if (Diags.hasErrorOccurred())
         return;
 
+      // Don't allow re-entrant calls to CodeGen triggered by PCH
+      // deserialization to emit deferred decls.
+      HandlingTopLevelDeclRAII HandlingDecl(*this, /*EmitDeferred=*/false);
+
       Builder->UpdateCompletedType(D);
 
       // For MSVC compatibility, treat declarations of static data members with
@@ -200,12 +223,25 @@
           }
         }
       }
+      // For OpenMP emit declare reduction functions, if required.
+      if (Ctx->getLangOpts().OpenMP) {
+        for (Decl *Member : D->decls()) {
+          if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(Member)) {
+            if (Ctx->DeclMustBeEmitted(DRD))
+              Builder->EmitGlobal(DRD);
+          }
+        }
+      }
     }
 
     void HandleTagDeclRequiredDefinition(const TagDecl *D) override {
       if (Diags.hasErrorOccurred())
         return;
 
+      // Don't allow re-entrant calls to CodeGen triggered by PCH
+      // deserialization to emit deferred decls.
+      HandlingTopLevelDeclRAII HandlingDecl(*this, /*EmitDeferred=*/false);
+
       if (CodeGen::CGDebugInfo *DI = Builder->getModuleDebugInfo())
         if (const RecordDecl *RD = dyn_cast<RecordDecl>(D))
           DI->completeRequiredType(RD);
@@ -226,6 +262,13 @@
       }
     }
 
+    /// Ask the builder to refresh its type cache for \p RD, unless an error
+    /// has already been diagnosed.
+    void AssignInheritanceModel(CXXRecordDecl *RD) override {
+      if (!Diags.hasErrorOccurred())
+        Builder->RefreshTypeCacheForClass(RD);
+    }
+
     void CompleteTentativeDefinition(VarDecl *D) override {
       if (Diags.hasErrorOccurred())
         return;
@@ -239,19 +282,6 @@
 
       Builder->EmitVTable(RD);
     }
-
-    void HandleLinkerOption(llvm::StringRef Opts) override {
-      Builder->AppendLinkerOptions(Opts);
-    }
-
-    void HandleDetectMismatch(llvm::StringRef Name,
-                              llvm::StringRef Value) override {
-      Builder->AddDetectMismatch(Name, Value);
-    }
-
-    void HandleDependentLibrary(llvm::StringRef Lib) override {
-      Builder->AddDependentLib(Lib);
-    }
   };
 }
 
diff --git a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
index d560cb1..f2090f9 100644
--- a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
+++ b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetRegistry.h"
 #include <memory>
+#include <utility>
 
 using namespace clang;
 
@@ -54,7 +55,7 @@
   std::unique_ptr<llvm::LLVMContext> VMContext;
   std::unique_ptr<llvm::Module> M;
   std::unique_ptr<CodeGen::CodeGenModule> Builder;
-  raw_pwrite_stream *OS;
+  std::unique_ptr<raw_pwrite_stream> OS;
   std::shared_ptr<PCHBuffer> Buffer;
 
   /// Visit every type and emit debug info for it.
@@ -104,7 +105,7 @@
         return true;
 
       SmallVector<QualType, 16> ArgTypes;
-      for (auto i : D->params())
+      for (auto i : D->parameters())
         ArgTypes.push_back(i->getType());
       QualType RetTy = D->getReturnType();
       QualType FnTy = Ctx.getFunctionType(RetTy, ArgTypes,
@@ -123,7 +124,7 @@
       ArgTypes.push_back(D->getSelfType(Ctx, D->getClassInterface(),
                                         selfIsPseudoStrong, selfIsConsumed));
       ArgTypes.push_back(Ctx.getObjCSelType());
-      for (auto i : D->params())
+      for (auto i : D->parameters())
         ArgTypes.push_back(i->getType());
       QualType RetTy = D->getReturnType();
       QualType FnTy = Ctx.getFunctionType(RetTy, ArgTypes,
@@ -137,21 +138,22 @@
 public:
   PCHContainerGenerator(CompilerInstance &CI, const std::string &MainFileName,
                         const std::string &OutputFileName,
-                        raw_pwrite_stream *OS,
+                        std::unique_ptr<raw_pwrite_stream> OS,
                         std::shared_ptr<PCHBuffer> Buffer)
       : Diags(CI.getDiagnostics()), MainFileName(MainFileName),
         OutputFileName(OutputFileName), Ctx(nullptr),
         MMap(CI.getPreprocessor().getHeaderSearchInfo().getModuleMap()),
         HeaderSearchOpts(CI.getHeaderSearchOpts()),
         PreprocessorOpts(CI.getPreprocessorOpts()),
-        TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()), OS(OS),
-        Buffer(Buffer) {
+        TargetOpts(CI.getTargetOpts()), LangOpts(CI.getLangOpts()),
+        OS(std::move(OS)), Buffer(std::move(Buffer)) {
     // The debug info output isn't affected by CodeModel and
     // ThreadModel, but the backend expects them to be nonempty.
     CodeGenOpts.CodeModel = "default";
     CodeGenOpts.ThreadModel = "single";
     CodeGenOpts.DebugTypeExtRefs = true;
     CodeGenOpts.setDebugInfo(codegenoptions::FullDebugInfo);
+    CodeGenOpts.setDebuggerTuning(CI.getCodeGenOpts().getDebuggerTuning());
   }
 
   ~PCHContainerGenerator() override = default;
@@ -162,7 +164,7 @@
     Ctx = &Context;
     VMContext.reset(new llvm::LLVMContext());
     M.reset(new llvm::Module(MainFileName, *VMContext));
-    M->setDataLayout(Ctx->getTargetInfo().getDataLayoutString());
+    M->setDataLayout(Ctx->getTargetInfo().getDataLayout());
     Builder.reset(new CodeGen::CodeGenModule(
         *Ctx, HeaderSearchOpts, PreprocessorOpts, CodeGenOpts, *M, Diags));
 
@@ -235,7 +237,7 @@
       return;
 
     M->setTargetTriple(Ctx.getTargetInfo().getTriple().getTriple());
-    M->setDataLayout(Ctx.getTargetInfo().getDataLayoutString());
+    M->setDataLayout(Ctx.getTargetInfo().getDataLayout());
 
     // PCH files don't have a signature field in the control block,
     // but LLVM detects DWO CUs by looking for a non-zero DWO id.
@@ -279,20 +281,18 @@
     DEBUG({
       // Print the IR for the PCH container to the debug output.
       llvm::SmallString<0> Buffer;
-      llvm::raw_svector_ostream OS(Buffer);
-      clang::EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts,
-                               Ctx.getTargetInfo().getDataLayoutString(),
-                               M.get(), BackendAction::Backend_EmitLL, &OS);
+      clang::EmitBackendOutput(
+          Diags, CodeGenOpts, TargetOpts, LangOpts,
+          Ctx.getTargetInfo().getDataLayout(), M.get(),
+          BackendAction::Backend_EmitLL,
+          llvm::make_unique<llvm::raw_svector_ostream>(Buffer));
       llvm::dbgs() << Buffer;
     });
 
     // Use the LLVM backend to emit the pch container.
     clang::EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts,
-                             Ctx.getTargetInfo().getDataLayoutString(),
-                             M.get(), BackendAction::Backend_EmitObj, OS);
-
-    // Make sure the pch container hits disk.
-    OS->flush();
+                             Ctx.getTargetInfo().getDataLayout(), M.get(),
+                             BackendAction::Backend_EmitObj, std::move(OS));
 
     // Free the memory for the temporary buffer.
     llvm::SmallVector<char, 0> Empty;
@@ -305,33 +305,38 @@
 std::unique_ptr<ASTConsumer>
 ObjectFilePCHContainerWriter::CreatePCHContainerGenerator(
     CompilerInstance &CI, const std::string &MainFileName,
-    const std::string &OutputFileName, llvm::raw_pwrite_stream *OS,
+    const std::string &OutputFileName,
+    std::unique_ptr<llvm::raw_pwrite_stream> OS,
     std::shared_ptr<PCHBuffer> Buffer) const {
-  return llvm::make_unique<PCHContainerGenerator>(CI, MainFileName,
-                                                  OutputFileName, OS, Buffer);
+  return llvm::make_unique<PCHContainerGenerator>(
+      CI, MainFileName, OutputFileName, std::move(OS), Buffer);
 }
 
 void ObjectFilePCHContainerReader::ExtractPCH(
     llvm::MemoryBufferRef Buffer, llvm::BitstreamReader &StreamFile) const {
-  if (auto OF = llvm::object::ObjectFile::createObjectFile(Buffer)) {
-    auto *Obj = OF.get().get();
-    bool IsCOFF = isa<llvm::object::COFFObjectFile>(Obj);
+  auto OFOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
+  if (OFOrErr) {
+    auto &OF = OFOrErr.get();
+    bool IsCOFF = isa<llvm::object::COFFObjectFile>(*OF);
     // Find the clang AST section in the container.
-    for (auto &Section : OF->get()->sections()) {
+    for (auto &Section : OF->sections()) {
       StringRef Name;
       Section.getName(Name);
-      if ((!IsCOFF && Name == "__clangast") ||
-          ( IsCOFF && Name ==   "clangast")) {
+      if ((!IsCOFF && Name == "__clangast") || (IsCOFF && Name == "clangast")) {
         StringRef Buf;
         Section.getContents(Buf);
-        StreamFile.init((const unsigned char *)Buf.begin(),
-                        (const unsigned char *)Buf.end());
-        return;
+        return StreamFile.init((const unsigned char *)Buf.begin(),
+                               (const unsigned char *)Buf.end());
       }
     }
   }
-
-  // As a fallback, treat the buffer as a raw AST.
-  StreamFile.init((const unsigned char *)Buffer.getBufferStart(),
-                  (const unsigned char *)Buffer.getBufferEnd());
+  handleAllErrors(OFOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) {
+    if (EIB.convertToErrorCode() ==
+        llvm::object::object_error::invalid_file_type)
+      // As a fallback, treat the buffer as a raw AST.
+      StreamFile.init((const unsigned char *)Buffer.getBufferStart(),
+                      (const unsigned char *)Buffer.getBufferEnd());
+    else
+      EIB.log(llvm::errs());
+  });
 }
diff --git a/lib/CodeGen/SwiftCallingConv.cpp b/lib/CodeGen/SwiftCallingConv.cpp
index 44b46f6..6c20f8c 100644
--- a/lib/CodeGen/SwiftCallingConv.cpp
+++ b/lib/CodeGen/SwiftCallingConv.cpp
@@ -254,8 +254,8 @@
 
 void SwiftAggLowering::addEntry(llvm::Type *type,
                                 CharUnits begin, CharUnits end) {
-  assert(!type ||
-         (!isa<llvm::StructType>(type) && !isa<llvm::ArrayType>(type)) &&
+  assert((!type ||
+          (!isa<llvm::StructType>(type) && !isa<llvm::ArrayType>(type))) &&
          "cannot add aggregate-typed data");
   assert(!type || begin.isMultipleOf(getNaturalAlignment(CGM, type)));
 
@@ -603,7 +603,7 @@
   // rounded up to a power of 2.
   auto size = (unsigned long long) getTypeStoreSize(CGM, type).getQuantity();
   if (!isPowerOf2(size)) {
-    size = 1U << (llvm::findLastSet(size, llvm::ZB_Undefined) + 1);
+    size = 1ULL << (llvm::findLastSet(size, llvm::ZB_Undefined) + 1);
   }
   assert(size >= CGM.getDataLayout().getABITypeAlignment(type));
   return CharUnits::fromQuantity(size);
@@ -827,4 +827,4 @@
     auto &argInfo = FI.arg_begin()[i];
     argInfo.info = classifyArgumentType(CGM, argInfo.type);
   }
-}
\ No newline at end of file
+}
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 2ac7a20..bdf3e4b 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -31,6 +31,31 @@
 using namespace clang;
 using namespace CodeGen;
 
+// Coerce an aggregate argument or return value to an array of integers that
+// has the same size (padding included) and alignment as the aggregate.  Only
+// the RenderScript ABI uses this alternate coercion; it can be deleted once
+// the runtimes depending on it are no longer supported.
+//
+// RenderScript expects the IR size of an argument / return value to equal the
+// size of its qualified type.  Each array element is an integer as wide as
+// the type's alignment, and the element count is the type's size divided by
+// that alignment, rounded up.  The result replaces struct-member expansion
+// and other canonical coercions that would yield a larger coerced type.
+//
+// Ty          - The argument / return value type
+// Context     - The associated ASTContext
+// LLVMContext - The associated LLVMContext
+static ABIArgInfo coerceToIntArray(QualType Ty,
+                                   ASTContext &Context,
+                                   llvm::LLVMContext &LLVMContext) {
+  // Both quantities below are expressed in bits.
+  const uint64_t AlignInBits = Context.getTypeAlign(Ty);
+  const uint64_t SizeInBits = Context.getTypeSize(Ty);
+  const uint64_t ElementCount = (SizeInBits + AlignInBits - 1) / AlignInBits;
+  llvm::Type *ElementTy = llvm::Type::getIntNTy(LLVMContext, AlignInBits);
+  return ABIArgInfo::getDirect(llvm::ArrayType::get(ElementTy, ElementCount));
+}
+
 static void AssignToArrayRange(CodeGen::CGBuilderTy &Builder,
                                llvm::Value *Array,
                                llvm::Value *Value,
@@ -158,6 +183,8 @@
   return CGT.getTarget();
 }
 
+bool ABIInfo::isAndroid() const { return getTarget().getTriple().isAndroid(); }
+
 bool ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
   return false;
 }
@@ -270,7 +297,8 @@
 
   // If the argument is smaller than a slot, and this is a big-endian
   // target, the argument will be right-adjusted in its slot.
-  if (DirectSize < SlotSize && CGF.CGM.getDataLayout().isBigEndian()) {
+  if (DirectSize < SlotSize && CGF.CGM.getDataLayout().isBigEndian() &&
+      !DirectTy->isStructTy()) {
     Addr = CGF.Builder.CreateConstInBoundsByteGEP(Addr, SlotSize - DirectSize);
   }
 
@@ -369,6 +397,10 @@
   Opt += Lib;
 }
 
+unsigned TargetCodeGenInfo::getOpenCLKernelCallingConv() const {
+  return llvm::CallingConv::C; // i.e. the ordinary C calling convention
+}
+
 static bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays);
 
 /// isEmptyField - Return true iff a the field is "empty", that is it
@@ -409,7 +441,7 @@
 static bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays) {
   const RecordType *RT = T->getAs<RecordType>();
   if (!RT)
-    return 0;
+    return false;
   const RecordDecl *RD = RT->getDecl();
   if (RD->hasFlexibleArrayMember())
     return false;
@@ -501,73 +533,55 @@
   return Found;
 }
 
-static bool is32Or64BitBasicType(QualType Ty, ASTContext &Context) {
-  // Treat complex types as the element type.
-  if (const ComplexType *CTy = Ty->getAs<ComplexType>())
-    Ty = CTy->getElementType();
-
-  // Check for a type which we know has a simple scalar argument-passing
-  // convention without any padding.  (We're specifically looking for 32
-  // and 64-bit integer and integer-equivalents, float, and double.)
-  if (!Ty->getAs<BuiltinType>() && !Ty->hasPointerRepresentation() &&
-      !Ty->isEnumeralType() && !Ty->isBlockPointerType())
-    return false;
-
-  uint64_t Size = Context.getTypeSize(Ty);
-  return Size == 32 || Size == 64;
-}
-
-/// canExpandIndirectArgument - Test whether an argument type which is to be
-/// passed indirectly (on the stack) would have the equivalent layout if it was
-/// expanded into separate arguments. If so, we prefer to do the latter to avoid
-/// inhibiting optimizations.
-///
-// FIXME: This predicate is missing many cases, currently it just follows
-// llvm-gcc (checks that all fields are 32-bit or 64-bit primitive types). We
-// should probably make this smarter, or better yet make the LLVM backend
-// capable of handling it.
-static bool canExpandIndirectArgument(QualType Ty, ASTContext &Context) {
-  // We can only expand structure types.
-  const RecordType *RT = Ty->getAs<RecordType>();
-  if (!RT)
-    return false;
-
-  // We can only expand (C) structures.
-  //
-  // FIXME: This needs to be generalized to handle classes as well.
-  const RecordDecl *RD = RT->getDecl();
-  if (!RD->isStruct())
-    return false;
-
-  // We try to expand CLike CXXRecordDecl.
-  if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
-    if (!CXXRD->isCLike())
-      return false;
-  }
-
-  uint64_t Size = 0;
-
-  for (const auto *FD : RD->fields()) {
-    if (!is32Or64BitBasicType(FD->getType(), Context))
-      return false;
-
-    // FIXME: Reject bit-fields wholesale; there are two problems, we don't know
-    // how to expand them yet, and the predicate for telling if a bitfield still
-    // counts as "basic" is more complicated than what we were doing previously.
-    if (FD->isBitField())
-      return false;
-
-    Size += Context.getTypeSize(FD->getType());
-  }
-
-  // Make sure there are not any holes in the struct.
-  if (Size != Context.getTypeSize(Ty))
-    return false;
-
-  return true;
-}
-
 namespace {
+Address EmitVAArgInstr(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
+                       const ABIArgInfo &AI) {
+  // Generic va_arg lowering that defers to the LLVM backend's va_arg
+  // instruction.  It handles only arguments passed directly (which backends
+  // typically support for primitive types only) and aggregates passed
+  // indirectly by pointer.  (NOTE: if the "byval" flag has ABI impact in the
+  // callee, this implementation cannot work.)
+  //
+  // Only the few cases needed by the default ABI are covered for now.
+
+  if (AI.isIndirect()) {
+    assert(!AI.getPaddingType() &&
+           "Unexpected PaddingType seen in arginfo in generic VAArg emitter!");
+    assert(
+        !AI.getIndirectRealign() &&
+        "Unexpected IndirectRealign seen in arginfo in generic VAArg emitter!");
+
+    // va_arg yields a pointer to the argument's in-memory type; pair it
+    // with the alignment reported for Ty.
+    CharUnits TyAlignForABI = CGF.getContext().getTypeInfoInChars(Ty).second;
+    llvm::Type *PtrTy =
+        llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty));
+    return Address(CGF.Builder.CreateVAArg(VAListAddr.getPointer(), PtrTy),
+                   TyAlignForABI);
+  }
+
+  // Every remaining argument must be a plain direct or extended value.
+  assert((AI.isDirect() || AI.isExtend()) &&
+         "Unexpected ArgInfo Kind in generic VAArg emitter!");
+
+  assert(!AI.getInReg() &&
+         "Unexpected InReg seen in arginfo in generic VAArg emitter!");
+  assert(!AI.getPaddingType() &&
+         "Unexpected PaddingType seen in arginfo in generic VAArg emitter!");
+  assert(!AI.getDirectOffset() &&
+         "Unexpected DirectOffset seen in arginfo in generic VAArg emitter!");
+  assert(!AI.getCoerceToType() &&
+         "Unexpected CoerceToType seen in arginfo in generic VAArg emitter!");
+
+  // Fetch the value with va_arg and store it to a temporary so an address
+  // can be returned.
+  Address Temp = CGF.CreateMemTemp(Ty, "varet");
+  llvm::Value *Val =
+      CGF.Builder.CreateVAArg(VAListAddr.getPointer(), CGF.ConvertType(Ty));
+  CGF.Builder.CreateStore(Val, Temp);
+  return Temp;
+}
+
 /// DefaultABIInfo - The default implementation for ABI specific
 /// details. This implementation provides information which results in
 /// self-consistent and sensible LLVM IR generation, but does not
@@ -587,7 +601,9 @@
   }
 
   Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
-                    QualType Ty) const override;
+                    QualType Ty) const override {
+    return EmitVAArgInstr(CGF, VAListAddr, Ty, classifyArgumentType(Ty));
+  }
 };
 
 class DefaultTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -596,11 +612,6 @@
     : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {}
 };
 
-Address DefaultABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
-                                  QualType Ty) const {
-  return Address::invalid();
-}
-
 ABIArgInfo DefaultABIInfo::classifyArgumentType(QualType Ty) const {
   Ty = useFirstFieldIfTransparentUnion(Ty);
 
@@ -652,13 +663,17 @@
   ABIArgInfo classifyArgumentType(QualType Ty) const;
 
   // DefaultABIInfo's classifyReturnType and classifyArgumentType are
-  // non-virtual, but computeInfo is virtual, so we overload that.
+  // non-virtual, but computeInfo and EmitVAArg are virtual, so we
+  // overload them.
   void computeInfo(CGFunctionInfo &FI) const override {
     if (!getCXXABI().classifyReturnType(FI))
       FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
     for (auto &Arg : FI.arguments())
       Arg.info = classifyArgumentType(Arg.type);
   }
+
+  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                    QualType Ty) const override;
 };
 
 class WebAssemblyTargetCodeGenInfo final : public TargetCodeGenInfo {
@@ -710,6 +725,14 @@
   return DefaultABIInfo::classifyReturnType(RetTy);
 }
 
+Address WebAssemblyABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                                      QualType Ty) const {
+  const CharUnits SlotSize = CharUnits::fromQuantity(4);
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*Indirect=*/false,
+                          getContext().getTypeInfoInChars(Ty), SlotSize,
+                          /*AllowHigherAlign=*/true);
+}
+
 //===----------------------------------------------------------------------===//
 // le32/PNaCl bitcode ABI Implementation
 //
@@ -745,7 +768,13 @@
 
 Address PNaClABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
-  return Address::invalid();
+  // The PNaCl ABI is a bit odd, in that varargs don't use normal
+  // function classification. Structs get passed directly for varargs
+  // functions, through a rewriting transform in
+  // pnacl-llvm/lib/Transforms/NaCl/ExpandVarArgs.cpp, which allows
+  // this target to actually support va_arg instructions with an
+  // aggregate type, unlike other targets.
+  return EmitVAArgInstr(CGF, VAListAddr, Ty, ABIArgInfo::getDirect());
 }
 
 /// \brief Classify argument of given type \p Ty.
@@ -894,6 +923,8 @@
                                 bool &NeedsPadding) const;
   bool shouldPrimitiveUseInReg(QualType Ty, CCState &State) const;
 
+  bool canExpandIndirectArgument(QualType Ty) const;
+
   /// \brief Rewrite the function info so that all memory arguments use
   /// inalloca.
   void rewriteWithInAlloca(CGFunctionInfo &FI) const;
@@ -1114,6 +1145,72 @@
   return true;
 }
 
+static bool is32Or64BitBasicType(QualType Ty, ASTContext &Context) {
+  // A complex type is judged by its element type.
+  if (const auto *Complex = Ty->getAs<ComplexType>())
+    Ty = Complex->getElementType();
+
+  // Reject anything that is not a builtin, pointer-representable, enumeral
+  // or block-pointer type: only those are known to be passed as simple
+  // scalars without any padding.
+  if (!(Ty->getAs<BuiltinType>() || Ty->hasPointerRepresentation() ||
+        Ty->isEnumeralType() || Ty->isBlockPointerType()))
+    return false;
+
+  const uint64_t Bits = Context.getTypeSize(Ty);
+  return Bits == 32 || Bits == 64;
+}
+
+/// Decide whether an argument that would otherwise be passed indirectly (on
+/// the stack) has the same layout when it is expanded into one argument per
+/// field.  Expansion is preferred when possible because it does not inhibit
+/// optimizations.
+bool X86_32ABIInfo::canExpandIndirectArgument(QualType Ty) const {
+  // Only record (structure) types are candidates.
+  const auto *Record = Ty->getAs<RecordType>();
+  if (!Record)
+    return false;
+  const RecordDecl *RD = Record->getDecl();
+  if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    if (IsWin32StructABI) {
+      // Dynamic classes are never expanded.
+      if (CXXRD->isDynamicClass())
+        return false;
+      // Neither are classes with a non-empty base.
+      for (const CXXBaseSpecifier &Base : CXXRD->bases()) {
+        if (!isEmptyRecord(getContext(), Base.getType(), /*AllowArrays=*/true))
+          return false;
+      }
+    } else if (!CXXRD->isCLike()) {
+      // On non-Windows, we have to conservatively match our old bitcode
+      // prototypes in order to be ABI-compatible at the bitcode level.
+      return false;
+    }
+  }
+
+  uint64_t FieldBits = 0;
+
+  for (const auto *FD : RD->fields()) {
+    // Scalar arguments on the stack get 4 byte alignment on x86, so a field
+    // narrower than 32 bits would pick up alignment padding once the struct
+    // is expanded.
+    if (!is32Or64BitBasicType(FD->getType(), getContext()))
+      return false;
+
+    // FIXME: Reject bit-fields wholesale; there are two problems, we don't know
+    // how to expand them yet, and the predicate for telling if a bitfield still
+    // counts as "basic" is more complicated than what we were doing previously.
+    if (FD->isBitField())
+      return false;
+
+    FieldBits += getContext().getTypeSize(FD->getType());
+  }
+
+  // Expansion is layout-preserving only if the fields account for every bit
+  // of the type, i.e. there was no alignment padding.
+  return FieldBits == getContext().getTypeSize(Ty);
+}
+
 ABIArgInfo X86_32ABIInfo::getIndirectReturnResult(QualType RetTy, CCState &State) const {
   // If the return value is indirect, then the hidden argument is consuming one
   // integer register.
@@ -1174,6 +1271,10 @@
     if (!IsRetSmallStructInRegABI && !RetTy->isAnyComplexType())
       return getIndirectReturnResult(RetTy, State);
 
+    // Ignore empty structs/unions.
+    if (isEmptyRecord(getContext(), RetTy, true))
+      return ABIArgInfo::getIgnore();
+
     // Small structures which are register sized are generally returned
     // in a register.
     if (shouldReturnTypeInRegister(RetTy, getContext())) {
@@ -1326,6 +1427,12 @@
 bool X86_32ABIInfo::shouldAggregateUseDirect(QualType Ty, CCState &State, 
                                              bool &InReg,
                                              bool &NeedsPadding) const {
+  // On Windows, aggregates other than HFAs are never passed in registers, and
+  // they do not consume register slots. Homogenous floating-point aggregates
+  // (HFAs) have already been dealt with at this point.
+  if (IsWin32StructABI && isAggregateTypeForABI(Ty))
+    return false;
+
   NeedsPadding = false;
   InReg = !IsMCUABI;
 
@@ -1399,23 +1506,19 @@
   }
 
   if (isAggregateTypeForABI(Ty)) {
-    if (RT) {
-      // Structs are always byval on win32, regardless of what they contain.
-      if (IsWin32StructABI)
-        return getIndirectResult(Ty, true, State);
+    // Structures with flexible arrays are always indirect.
+    // FIXME: This should not be byval!
+    if (RT && RT->getDecl()->hasFlexibleArrayMember())
+      return getIndirectResult(Ty, true, State);
 
-      // Structures with flexible arrays are always indirect.
-      if (RT->getDecl()->hasFlexibleArrayMember())
-        return getIndirectResult(Ty, true, State);
-    }
-
-    // Ignore empty structs/unions.
-    if (isEmptyRecord(getContext(), Ty, true))
+    // Ignore empty structs/unions on non-Windows.
+    if (!IsWin32StructABI && isEmptyRecord(getContext(), Ty, true))
       return ABIArgInfo::getIgnore();
 
     llvm::LLVMContext &LLVMContext = getVMContext();
     llvm::IntegerType *Int32 = llvm::Type::getInt32Ty(LLVMContext);
-    bool NeedsPadding, InReg;
+    bool NeedsPadding = false;
+    bool InReg;
     if (shouldAggregateUseDirect(Ty, State, InReg, NeedsPadding)) {
       unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
       SmallVector<llvm::Type*, 3> Elements(SizeInRegs, Int32);
@@ -1433,9 +1536,8 @@
     // optimizations.
     // Don't do this for the MCU if there are still free integer registers
     // (see X86_64 ABI for full explanation).
-    if (getContext().getTypeSize(Ty) <= 4*32 &&
-        canExpandIndirectArgument(Ty, getContext()) &&
-        (!IsMCUABI || State.FreeRegs == 0))
+    if (getContext().getTypeSize(Ty) <= 4 * 32 &&
+        (!IsMCUABI || State.FreeRegs == 0) && canExpandIndirectArgument(Ty))
       return ABIArgInfo::getExpandWithPadding(
           State.CC == llvm::CallingConv::X86_FastCall ||
               State.CC == llvm::CallingConv::X86_VectorCall,
@@ -1555,11 +1657,14 @@
     return false;
   case ABIArgInfo::Direct:
   case ABIArgInfo::Extend:
-  case ABIArgInfo::Expand:
-  case ABIArgInfo::CoerceAndExpand:
     if (Info.getInReg())
       return false;
     return true;
+  case ABIArgInfo::Expand:
+  case ABIArgInfo::CoerceAndExpand:
+    // These are aggregate types which are never passed in registers when
+    // inalloca is involved.
+    return true;
   }
   llvm_unreachable("invalid enum");
 }
@@ -1844,6 +1949,17 @@
     return !getTarget().getTriple().isOSDarwin();
   }
 
+  /// GCC classifies <1 x long long> as SSE but compatibility with older clang
+  /// compilers requires us to classify it as INTEGER.
+  bool classifyIntegerMMXAsSSE() const {
+    const llvm::Triple &Triple = getTarget().getTriple();
+    if (Triple.isOSDarwin() || Triple.getOS() == llvm::Triple::PS4)
+      return false;
+    if (Triple.isOSFreeBSD() && Triple.getOSMajorVersion() >= 10)
+      return false;
+    return true;
+  }
+
   X86AVXABILevel AVXLevel;
   // Some ABIs (e.g. X32 ABI and Native Client OS) use 32 bit pointers on
   // 64-bit hardware.
@@ -2291,15 +2407,20 @@
       if (EB_Lo != EB_Hi)
         Hi = Lo;
     } else if (Size == 64) {
+      QualType ElementType = VT->getElementType();
+
       // gcc passes <1 x double> in memory. :(
-      if (VT->getElementType()->isSpecificBuiltinType(BuiltinType::Double))
+      if (ElementType->isSpecificBuiltinType(BuiltinType::Double))
         return;
 
-      // gcc passes <1 x long long> as INTEGER.
-      if (VT->getElementType()->isSpecificBuiltinType(BuiltinType::LongLong) ||
-          VT->getElementType()->isSpecificBuiltinType(BuiltinType::ULongLong) ||
-          VT->getElementType()->isSpecificBuiltinType(BuiltinType::Long) ||
-          VT->getElementType()->isSpecificBuiltinType(BuiltinType::ULong))
+      // gcc passes <1 x long long> as SSE but clang used to unconditionally
+      // pass them as integer.  For platforms where clang is the de facto
+      // platform compiler, we must continue to use integer.
+      if (!classifyIntegerMMXAsSSE() &&
+          (ElementType->isSpecificBuiltinType(BuiltinType::LongLong) ||
+           ElementType->isSpecificBuiltinType(BuiltinType::ULongLong) ||
+           ElementType->isSpecificBuiltinType(BuiltinType::Long) ||
+           ElementType->isSpecificBuiltinType(BuiltinType::ULong)))
         Current = Integer;
       else
         Current = SSE;
@@ -2371,8 +2492,8 @@
     uint64_t Size = getContext().getTypeSize(Ty);
 
     // AMD64-ABI 3.2.3p2: Rule 1. If the size of an object is larger
-    // than four eightbytes, ..., it has class MEMORY.
-    if (Size > 256)
+    // than eight eightbytes, ..., it has class MEMORY.
+    if (Size > 512)
       return;
 
     // AMD64-ABI 3.2.3p2: Rule 1. If ..., or it contains unaligned
@@ -2391,7 +2512,9 @@
     // The only case a 256-bit wide vector could be used is when the array
     // contains a single 256-bit element. Since Lo and Hi logic isn't extended
     // to work for sizes wider than 128, early check and fallback to memory.
-    if (Size > 128 && EltSize != 256)
+    //
+    if (Size > 128 &&
+        (Size != EltSize || Size > getNativeVectorSizeForAVXABI(AVXLevel)))
       return;
 
     for (uint64_t i=0, Offset=OffsetBase; i<ArraySize; ++i, Offset += EltSize) {
@@ -2412,8 +2535,8 @@
     uint64_t Size = getContext().getTypeSize(Ty);
 
     // AMD64-ABI 3.2.3p2: Rule 1. If the size of an object is larger
-    // than four eightbytes, ..., it has class MEMORY.
-    if (Size > 256)
+    // than eight eightbytes, ..., it has class MEMORY.
+    if (Size > 512)
       return;
 
     // AMD64-ABI 3.2.3p2: Rule 2. If a C++ object has either a non-trivial
@@ -2466,6 +2589,10 @@
       uint64_t Offset = OffsetBase + Layout.getFieldOffset(idx);
       bool BitField = i->isBitField();
 
+      // Ignore padding bit-fields.
+      if (BitField && i->isUnnamedBitfield())
+        continue;
+
       // AMD64-ABI 3.2.3p2: Rule 1. If the size of an object is larger than
       // four eightbytes, or it contains unaligned fields, it has class MEMORY.
       //
@@ -2473,7 +2600,8 @@
       // contains a single 256-bit element. Since Lo and Hi logic isn't extended
       // to work for sizes wider than 128, early check and fallback to memory.
       //
-      if (Size > 128 && getContext().getTypeSize(i->getType()) != 256) {
+      if (Size > 128 && (Size != getContext().getTypeSize(i->getType()) ||
+                         Size > getNativeVectorSizeForAVXABI(AVXLevel))) {
         Lo = Memory;
         postMerge(Size, Lo, Hi);
         return;
@@ -2497,10 +2625,7 @@
       // structure to be passed in memory even if unaligned, and
       // therefore they can straddle an eightbyte.
       if (BitField) {
-        // Ignore padding bit-fields.
-        if (i->isUnnamedBitfield())
-          continue;
-
+        assert(!i->isUnnamedBitfield());
         uint64_t Offset = OffsetBase + Layout.getFieldOffset(idx);
         uint64_t Size = i->getBitWidthValue(getContext());
 
@@ -2628,7 +2753,7 @@
 
   // We couldn't find the preferred IR vector type for 'Ty'.
   uint64_t Size = getContext().getTypeSize(Ty);
-  assert((Size == 128 || Size == 256) && "Invalid type found!");
+  assert((Size == 128 || Size == 256 || Size == 512) && "Invalid type found!");
 
   // Return a LLVM IR vector type based on the size of 'Ty'.
   return llvm::VectorType::get(llvm::Type::getDoubleTy(getVMContext()),
@@ -3561,12 +3686,15 @@
 
 }
 
+// TODO: this implementation is now likely redundant with
+// DefaultABIInfo::EmitVAArg.
 Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList,
                                       QualType Ty) const {
+  const unsigned OverflowLimit = 8;
   if (const ComplexType *CTy = Ty->getAs<ComplexType>()) {
     // TODO: Implement this. For now ignore.
     (void)CTy;
-    return Address::invalid();
+    return Address::invalid(); // FIXME?
   }
 
   // struct __va_list_tag {
@@ -3605,7 +3733,7 @@
   }
 
   llvm::Value *CC =
-      Builder.CreateICmpULT(NumRegs, Builder.getInt8(8), "cond");
+      Builder.CreateICmpULT(NumRegs, Builder.getInt8(OverflowLimit), "cond");
 
   llvm::BasicBlock *UsingRegs = CGF.createBasicBlock("using_regs");
   llvm::BasicBlock *UsingOverflow = CGF.createBasicBlock("using_overflow");
@@ -3657,6 +3785,8 @@
   {
     CGF.EmitBlock(UsingOverflow);
 
+    Builder.CreateStore(Builder.getInt8(OverflowLimit), NumRegsAddr);
+
     // Everything in the overflow area is rounded up to a size of at least 4.
     CharUnits OverflowAreaAlign = CharUnits::fromQuantity(4);
 
@@ -3748,7 +3878,7 @@
 
 namespace {
 /// PPC64_SVR4_ABIInfo - The 64-bit PowerPC ELF (SVR4) ABI information.
-class PPC64_SVR4_ABIInfo : public DefaultABIInfo {
+class PPC64_SVR4_ABIInfo : public ABIInfo {
 public:
   enum ABIKind {
     ELFv1 = 0,
@@ -3790,7 +3920,7 @@
 
 public:
   PPC64_SVR4_ABIInfo(CodeGen::CodeGenTypes &CGT, ABIKind Kind, bool HasQPX)
-    : DefaultABIInfo(CGT), Kind(Kind), HasQPX(HasQPX) {}
+      : ABIInfo(CGT), Kind(Kind), HasQPX(HasQPX) {}
 
   bool isPromotableTypeForABI(QualType Ty) const;
   CharUnits getParamTypeAlignment(QualType Ty) const;
@@ -4392,6 +4522,11 @@
   // Handle illegal vector types here.
   if (isIllegalVectorType(Ty)) {
     uint64_t Size = getContext().getTypeSize(Ty);
+    // Android promotes <2 x i8> to i16, not i32
+    if (isAndroid() && (Size <= 16)) {
+      llvm::Type *ResType = llvm::Type::getInt16Ty(getVMContext());
+      return ABIArgInfo::getDirect(ResType);
+    }
     if (Size <= 32) {
       llvm::Type *ResType = llvm::Type::getInt32Ty(getVMContext());
       return ABIArgInfo::getDirect(ResType);
@@ -4446,6 +4581,11 @@
   // Aggregates <= 16 bytes are passed directly in registers or on the stack.
   uint64_t Size = getContext().getTypeSize(Ty);
   if (Size <= 128) {
+    // On RenderScript, coerce Aggregates <= 16 bytes to an integer array of
+    // same size and alignment.
+    if (getTarget().isRenderScriptTarget()) {
+      return coerceToIntArray(Ty, getContext(), getVMContext());
+    }
     unsigned Alignment = getContext().getTypeAlign(Ty);
     Size = 64 * ((Size + 63) / 64); // round up to multiple of 8 bytes
 
@@ -4491,6 +4631,11 @@
   // Aggregates <= 16 bytes are returned directly in registers or on the stack.
   uint64_t Size = getContext().getTypeSize(RetTy);
   if (Size <= 128) {
+    // On RenderScript, coerce Aggregates <= 16 bytes to an integer array of
+    // same size and alignment.
+    if (getTarget().isRenderScriptTarget()) {
+      return coerceToIntArray(RetTy, getContext(), getVMContext());
+    }
     unsigned Alignment = getContext().getTypeAlign(RetTy);
     Size = 64 * ((Size + 63) / 64); // round up to multiple of 8 bytes
 
@@ -4512,8 +4657,8 @@
     // Check whether VT is legal.
     unsigned NumElements = VT->getNumElements();
     uint64_t Size = getContext().getTypeSize(VT);
-    // NumElements should be power of 2 between 1 and 16.
-    if ((NumElements & (NumElements - 1)) != 0 || NumElements > 16)
+    // NumElements should be power of 2.
+    if (!llvm::isPowerOf2_32(NumElements))
       return true;
     return Size != 64 && (Size != 128 || NumElements == 1);
   }
@@ -4802,7 +4947,7 @@
   // illegal vector types.  Lower VAArg here for these cases and use
   // the LLVM va_arg instruction for everything else.
   if (!isAggregateTypeForABI(Ty) && !isIllegalVectorType(Ty))
-    return Address::invalid();
+    return EmitVAArgInstr(CGF, VAListAddr, Ty, ABIArgInfo::getDirect());
 
   CharUnits SlotSize = CharUnits::fromQuantity(8);
 
@@ -4861,6 +5006,8 @@
     case llvm::Triple::EABIHF:
     case llvm::Triple::GNUEABI:
     case llvm::Triple::GNUEABIHF:
+    case llvm::Triple::MuslEABI:
+    case llvm::Triple::MuslEABIHF:
       return true;
     default:
       return false;
@@ -4871,17 +5018,13 @@
     switch (getTarget().getTriple().getEnvironment()) {
     case llvm::Triple::EABIHF:
     case llvm::Triple::GNUEABIHF:
+    case llvm::Triple::MuslEABIHF:
       return true;
     default:
       return false;
     }
   }
 
-  bool isAndroid() const {
-    return (getTarget().getTriple().getEnvironment() ==
-            llvm::Triple::Android);
-  }
-
   ABIKind getABIKind() const { return Kind; }
 
 private:
@@ -4981,29 +5124,24 @@
 };
 
 class WindowsARMTargetCodeGenInfo : public ARMTargetCodeGenInfo {
-  void addStackProbeSizeTargetAttribute(const Decl *D, llvm::GlobalValue *GV,
-                                        CodeGen::CodeGenModule &CGM) const;
-
 public:
   WindowsARMTargetCodeGenInfo(CodeGenTypes &CGT, ARMABIInfo::ABIKind K)
       : ARMTargetCodeGenInfo(CGT, K) {}
 
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &CGM) const override;
+
+  void getDependentLibraryOption(llvm::StringRef Lib,
+                                 llvm::SmallString<24> &Opt) const override {
+    Opt = "/DEFAULTLIB:" + qualifyWindowsLibrary(Lib);
+  }
+
+  void getDetectMismatchOption(llvm::StringRef Name, llvm::StringRef Value,
+                               llvm::SmallString<32> &Opt) const override {
+    Opt = "/FAILIFMISMATCH:\"" + Name.str() + "=" + Value.str() + "\"";
+  }
 };
 
-void WindowsARMTargetCodeGenInfo::addStackProbeSizeTargetAttribute(
-    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const {
-  if (!isa<FunctionDecl>(D))
-    return;
-  if (CGM.getCodeGenOpts().StackProbeSize == 4096)
-    return;
-
-  llvm::Function *F = cast<llvm::Function>(GV);
-  F->addFnAttr("stack-probe-size",
-               llvm::utostr(CGM.getCodeGenOpts().StackProbeSize));
-}
-
 void WindowsARMTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const {
   ARMTargetCodeGenInfo::setTargetAttributes(D, GV, CGM);
@@ -5113,7 +5251,7 @@
   // __fp16 gets passed as if it were an int or float, but with the top 16 bits
   // unspecified. This is not done for OpenCL as it handles the half type
   // natively, and does not need to interwork with AAPCS code.
-  if (Ty->isHalfType() && !getContext().getLangOpts().OpenCL) {
+  if (Ty->isHalfType() && !getContext().getLangOpts().NativeHalfArgsAndReturns) {
     llvm::Type *ResType = IsEffectivelyAAPCS_VFP ?
       llvm::Type::getFloatTy(getVMContext()) :
       llvm::Type::getInt32Ty(getVMContext());
@@ -5188,6 +5326,12 @@
                                    /*Realign=*/TyAlign > ABIAlign);
   }
 
+  // On RenderScript, coerce Aggregates <= 64 bytes to an integer array of
+  // same size and alignment.
+  if (getTarget().isRenderScriptTarget()) {
+    return coerceToIntArray(Ty, getContext(), getVMContext());
+  }
+
   // Otherwise, pass by coercing to a structure of the appropriate size.
   llvm::Type* ElemTy;
   unsigned SizeRegs;
@@ -5305,7 +5449,7 @@
   // __fp16 gets returned as if it were an int or float, but with the top 16
   // bits unspecified. This is not done for OpenCL as it handles the half type
   // natively, and does not need to interwork with AAPCS code.
-  if (RetTy->isHalfType() && !getContext().getLangOpts().OpenCL) {
+  if (RetTy->isHalfType() && !getContext().getLangOpts().NativeHalfArgsAndReturns) {
     llvm::Type *ResType = IsEffectivelyAAPCS_VFP ?
       llvm::Type::getFloatTy(getVMContext()) :
       llvm::Type::getInt32Ty(getVMContext());
@@ -5369,6 +5513,11 @@
   // are returned indirectly.
   uint64_t Size = getContext().getTypeSize(RetTy);
   if (Size <= 32) {
+    // On RenderScript, coerce Aggregates <= 4 bytes to an integer array of
+    // same size and alignment.
+    if (getTarget().isRenderScriptTarget()) {
+      return coerceToIntArray(RetTy, getContext(), getVMContext());
+    }
     if (getDataLayout().isBigEndian())
       // Return in 32 bit integer integer type (as if loaded by LDR, AAPCS 5.4)
       return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
@@ -6596,6 +6745,132 @@
 }
 
 //===----------------------------------------------------------------------===//
+// Lanai ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+class LanaiABIInfo : public DefaultABIInfo {
+public:
+  LanaiABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}
+
+  bool shouldUseInReg(QualType Ty, CCState &State) const;
+
+  void computeInfo(CGFunctionInfo &FI) const override {
+    CCState State(FI.getCallingConvention());
+    // Lanai uses 4 registers to pass arguments unless the function has the
+    // regparm attribute set.
+    if (FI.getHasRegParm()) {
+      State.FreeRegs = FI.getRegParm();
+    } else {
+      State.FreeRegs = 4;
+    }
+
+    if (!getCXXABI().classifyReturnType(FI))
+      FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
+    for (auto &I : FI.arguments())
+      I.info = classifyArgumentType(I.type, State);
+  }
+
+  ABIArgInfo getIndirectResult(QualType Ty, bool ByVal, CCState &State) const;
+  ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const;
+};
+} // end anonymous namespace
+
+bool LanaiABIInfo::shouldUseInReg(QualType Ty, CCState &State) const {
+  unsigned Size = getContext().getTypeSize(Ty);
+  unsigned SizeInRegs = llvm::alignTo(Size, 32U) / 32U;
+
+  if (SizeInRegs == 0)
+    return false;
+
+  if (SizeInRegs > State.FreeRegs) {
+    State.FreeRegs = 0;
+    return false;
+  }
+
+  State.FreeRegs -= SizeInRegs;
+
+  return true;
+}
+
+ABIArgInfo LanaiABIInfo::getIndirectResult(QualType Ty, bool ByVal,
+                                           CCState &State) const {
+  if (!ByVal) {
+    if (State.FreeRegs) {
+      --State.FreeRegs; // Non-byval indirects just use one pointer.
+      return getNaturalAlignIndirectInReg(Ty);
+    }
+    return getNaturalAlignIndirect(Ty, false);
+  }
+
+  // Compute the byval alignment.
+  const unsigned MinABIStackAlignInBytes = 4;
+  unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
+  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true,
+                                 /*Realign=*/TypeAlign >
+                                     MinABIStackAlignInBytes);
+}
+
+ABIArgInfo LanaiABIInfo::classifyArgumentType(QualType Ty,
+                                              CCState &State) const {
+  // Check with the C++ ABI first.
+  const RecordType *RT = Ty->getAs<RecordType>();
+  if (RT) {
+    CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
+    if (RAA == CGCXXABI::RAA_Indirect) {
+      return getIndirectResult(Ty, /*ByVal=*/false, State);
+    } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+      return getNaturalAlignIndirect(Ty, /*ByRef=*/true);
+    }
+  }
+
+  if (isAggregateTypeForABI(Ty)) {
+    // Structures with flexible arrays are always indirect.
+    if (RT && RT->getDecl()->hasFlexibleArrayMember())
+      return getIndirectResult(Ty, /*ByVal=*/true, State);
+
+    // Ignore empty structs/unions.
+    if (isEmptyRecord(getContext(), Ty, true))
+      return ABIArgInfo::getIgnore();
+
+    llvm::LLVMContext &LLVMContext = getVMContext();
+    unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
+    if (SizeInRegs <= State.FreeRegs) {
+      llvm::IntegerType *Int32 = llvm::Type::getInt32Ty(LLVMContext);
+      SmallVector<llvm::Type *, 3> Elements(SizeInRegs, Int32);
+      llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements);
+      State.FreeRegs -= SizeInRegs;
+      return ABIArgInfo::getDirectInReg(Result);
+    } else {
+      State.FreeRegs = 0;
+    }
+    return getIndirectResult(Ty, true, State);
+  }
+
+  // Treat an enum type as its underlying type.
+  if (const auto *EnumTy = Ty->getAs<EnumType>())
+    Ty = EnumTy->getDecl()->getIntegerType();
+
+  bool InReg = shouldUseInReg(Ty, State);
+  if (Ty->isPromotableIntegerType()) {
+    if (InReg)
+      return ABIArgInfo::getDirectInReg();
+    return ABIArgInfo::getExtend();
+  }
+  if (InReg)
+    return ABIArgInfo::getDirectInReg();
+  return ABIArgInfo::getDirect();
+}
+
+namespace {
+class LanaiTargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  LanaiTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
+      : TargetCodeGenInfo(new LanaiABIInfo(CGT)) {}
+};
+}
+
+//===----------------------------------------------------------------------===//
 // AMDGPU ABI Implementation
 //===----------------------------------------------------------------------===//
 
@@ -6607,10 +6882,13 @@
     : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {}
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
+  unsigned getOpenCLKernelCallingConv() const override;
 };
 
 }
 
+static void appendOpenCLVersionMD (CodeGen::CodeGenModule &CGM);
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
   const Decl *D,
   llvm::GlobalValue *GV,
@@ -6632,8 +6910,57 @@
     if (NumSGPR != 0)
       F->addFnAttr("amdgpu_num_sgpr", llvm::utostr(NumSGPR));
   }
+
+  appendOpenCLVersionMD(M);
+}
+
+
+unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
+  return llvm::CallingConv::AMDGPU_KERNEL;
 }
 
+//===----------------------------------------------------------------------===//
+// SPARC v8 ABI Implementation.
+// Based on the SPARC Compliance Definition version 2.4.1.
+//
+// Ensures that complex values are returned in registers.
+//
+namespace {
+class SparcV8ABIInfo : public DefaultABIInfo {
+public:
+  SparcV8ABIInfo(CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}
+
+private:
+  ABIArgInfo classifyReturnType(QualType RetTy) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+};
+} // end anonymous namespace
+
+
+ABIArgInfo
+SparcV8ABIInfo::classifyReturnType(QualType Ty) const {
+  if (Ty->isAnyComplexType()) {
+    return ABIArgInfo::getDirect();
+  }
+  else {
+    return DefaultABIInfo::classifyReturnType(Ty);
+  }
+}
+
+void SparcV8ABIInfo::computeInfo(CGFunctionInfo &FI) const {
+
+  FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
+  for (auto &Arg : FI.arguments())
+    Arg.info = classifyArgumentType(Arg.type);
+}
+
+namespace {
+class SparcV8TargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  SparcV8TargetCodeGenInfo(CodeGenTypes &CGT)
+    : TargetCodeGenInfo(new SparcV8ABIInfo(CGT)) {}
+};
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // SPARC v9 ABI Implementation.
@@ -7056,6 +7383,8 @@
 
 } // End anonymous namespace.
 
+// TODO: this implementation is likely now redundant with the default
+// EmitVAArg.
 Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
   CGBuilderTy &Builder = CGF.Builder;
@@ -7219,15 +7548,66 @@
   SmallStringEnc Enc;
   if (getTypeString(Enc, D, CGM, TSC)) {
     llvm::LLVMContext &Ctx = CGM.getModule().getContext();
-    llvm::SmallVector<llvm::Metadata *, 2> MDVals;
-    MDVals.push_back(llvm::ConstantAsMetadata::get(GV));
-    MDVals.push_back(llvm::MDString::get(Ctx, Enc.str()));
+    llvm::Metadata *MDVals[] = {llvm::ConstantAsMetadata::get(GV),
+                                llvm::MDString::get(Ctx, Enc.str())};
     llvm::NamedMDNode *MD =
       CGM.getModule().getOrInsertNamedMetadata("xcore.typestrings");
     MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
   }
 }
 
+//===----------------------------------------------------------------------===//
+// SPIR ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+class SPIRTargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  SPIRTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
+    : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {}
+  void emitTargetMD(const Decl *D, llvm::GlobalValue *GV,
+                    CodeGen::CodeGenModule &M) const override;
+  unsigned getOpenCLKernelCallingConv() const override;
+};
+} // End anonymous namespace.
+
+/// Emit SPIR specific metadata: OpenCL and SPIR version.
+void SPIRTargetCodeGenInfo::emitTargetMD(const Decl *D, llvm::GlobalValue *GV,
+                                         CodeGen::CodeGenModule &CGM) const {
+  llvm::LLVMContext &Ctx = CGM.getModule().getContext();
+  llvm::Type *Int32Ty = llvm::Type::getInt32Ty(Ctx);
+  llvm::Module &M = CGM.getModule();
+  // SPIR v2.0 s2.12 - The SPIR version used by the module is stored in the
+  // opencl.spir.version named metadata.
+  llvm::Metadata *SPIRVerElts[] = {
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 2)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 0))};
+  llvm::NamedMDNode *SPIRVerMD =
+      M.getOrInsertNamedMetadata("opencl.spir.version");
+  SPIRVerMD->addOperand(llvm::MDNode::get(Ctx, SPIRVerElts));
+  appendOpenCLVersionMD(CGM);
+}
+
+static void appendOpenCLVersionMD (CodeGen::CodeGenModule &CGM) {
+  llvm::LLVMContext &Ctx = CGM.getModule().getContext();
+  llvm::Type *Int32Ty = llvm::Type::getInt32Ty(Ctx);
+  llvm::Module &M = CGM.getModule();
+  // SPIR v2.0 s2.13 - The OpenCL version used by the module is stored in the
+  // opencl.ocl.version named metadata node.
+  llvm::Metadata *OCLVerElts[] = {
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, CGM.getLangOpts().OpenCLVersion / 100)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, (CGM.getLangOpts().OpenCLVersion % 100) / 10))};
+  llvm::NamedMDNode *OCLVerMD =
+      M.getOrInsertNamedMetadata("opencl.ocl.version");
+  OCLVerMD->addOperand(llvm::MDNode::get(Ctx, OCLVerElts));
+}
+
+unsigned SPIRTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
+  return llvm::CallingConv::SPIR_KERNEL;
+}
+
 static bool appendType(SmallStringEnc &Enc, QualType QType,
                        const CodeGen::CodeGenModule &CGM,
                        TypeStringCache &TSC);
@@ -7569,29 +7949,35 @@
 }
 
 bool CodeGenModule::supportsCOMDAT() const {
-  return !getTriple().isOSBinFormatMachO();
+  return getTriple().supportsCOMDAT();
 }
 
 const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() {
   if (TheTargetCodeGenInfo)
     return *TheTargetCodeGenInfo;
 
+  // Helper to set the unique_ptr while still keeping the return value.
+  auto SetCGInfo = [&](TargetCodeGenInfo *P) -> const TargetCodeGenInfo & {
+    this->TheTargetCodeGenInfo.reset(P);
+    return *P;
+  };
+
   const llvm::Triple &Triple = getTarget().getTriple();
   switch (Triple.getArch()) {
   default:
-    return *(TheTargetCodeGenInfo = new DefaultTargetCodeGenInfo(Types));
+    return SetCGInfo(new DefaultTargetCodeGenInfo(Types));
 
   case llvm::Triple::le32:
-    return *(TheTargetCodeGenInfo = new PNaClTargetCodeGenInfo(Types));
+    return SetCGInfo(new PNaClTargetCodeGenInfo(Types));
   case llvm::Triple::mips:
   case llvm::Triple::mipsel:
     if (Triple.getOS() == llvm::Triple::NaCl)
-      return *(TheTargetCodeGenInfo = new PNaClTargetCodeGenInfo(Types));
-    return *(TheTargetCodeGenInfo = new MIPSTargetCodeGenInfo(Types, true));
+      return SetCGInfo(new PNaClTargetCodeGenInfo(Types));
+    return SetCGInfo(new MIPSTargetCodeGenInfo(Types, true));
 
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el:
-    return *(TheTargetCodeGenInfo = new MIPSTargetCodeGenInfo(Types, false));
+    return SetCGInfo(new MIPSTargetCodeGenInfo(Types, false));
 
   case llvm::Triple::aarch64:
   case llvm::Triple::aarch64_be: {
@@ -7599,41 +7985,41 @@
     if (getTarget().getABI() == "darwinpcs")
       Kind = AArch64ABIInfo::DarwinPCS;
 
-    return *(TheTargetCodeGenInfo = new AArch64TargetCodeGenInfo(Types, Kind));
+    return SetCGInfo(new AArch64TargetCodeGenInfo(Types, Kind));
   }
 
   case llvm::Triple::wasm32:
   case llvm::Triple::wasm64:
-    return *(TheTargetCodeGenInfo = new WebAssemblyTargetCodeGenInfo(Types));
+    return SetCGInfo(new WebAssemblyTargetCodeGenInfo(Types));
 
   case llvm::Triple::arm:
   case llvm::Triple::armeb:
   case llvm::Triple::thumb:
-  case llvm::Triple::thumbeb:
-    {
-      if (Triple.getOS() == llvm::Triple::Win32) {
-        TheTargetCodeGenInfo =
-            new WindowsARMTargetCodeGenInfo(Types, ARMABIInfo::AAPCS_VFP);
-        return *TheTargetCodeGenInfo;
-      }
-
-      ARMABIInfo::ABIKind Kind = ARMABIInfo::AAPCS;
-      StringRef ABIStr = getTarget().getABI();
-      if (ABIStr == "apcs-gnu")
-        Kind = ARMABIInfo::APCS;
-      else if (ABIStr == "aapcs16")
-        Kind = ARMABIInfo::AAPCS16_VFP;
-      else if (CodeGenOpts.FloatABI == "hard" ||
-               (CodeGenOpts.FloatABI != "soft" &&
-                Triple.getEnvironment() == llvm::Triple::GNUEABIHF))
-        Kind = ARMABIInfo::AAPCS_VFP;
-
-      return *(TheTargetCodeGenInfo = new ARMTargetCodeGenInfo(Types, Kind));
+  case llvm::Triple::thumbeb: {
+    if (Triple.getOS() == llvm::Triple::Win32) {
+      return SetCGInfo(
+          new WindowsARMTargetCodeGenInfo(Types, ARMABIInfo::AAPCS_VFP));
     }
 
+    ARMABIInfo::ABIKind Kind = ARMABIInfo::AAPCS;
+    StringRef ABIStr = getTarget().getABI();
+    if (ABIStr == "apcs-gnu")
+      Kind = ARMABIInfo::APCS;
+    else if (ABIStr == "aapcs16")
+      Kind = ARMABIInfo::AAPCS16_VFP;
+    else if (CodeGenOpts.FloatABI == "hard" ||
+             (CodeGenOpts.FloatABI != "soft" &&
+              (Triple.getEnvironment() == llvm::Triple::GNUEABIHF ||
+               Triple.getEnvironment() == llvm::Triple::MuslEABIHF ||
+               Triple.getEnvironment() == llvm::Triple::EABIHF)))
+      Kind = ARMABIInfo::AAPCS_VFP;
+
+    return SetCGInfo(new ARMTargetCodeGenInfo(Types, Kind));
+  }
+
   case llvm::Triple::ppc:
-    return *(TheTargetCodeGenInfo = 
-             new PPC32TargetCodeGenInfo(Types, CodeGenOpts.FloatABI == "soft"));
+    return SetCGInfo(
+        new PPC32TargetCodeGenInfo(Types, CodeGenOpts.FloatABI == "soft"));
   case llvm::Triple::ppc64:
     if (Triple.isOSBinFormatELF()) {
       PPC64_SVR4_ABIInfo::ABIKind Kind = PPC64_SVR4_ABIInfo::ELFv1;
@@ -7641,10 +8027,9 @@
         Kind = PPC64_SVR4_ABIInfo::ELFv2;
       bool HasQPX = getTarget().getABI() == "elfv1-qpx";
 
-      return *(TheTargetCodeGenInfo =
-               new PPC64_SVR4_TargetCodeGenInfo(Types, Kind, HasQPX));
+      return SetCGInfo(new PPC64_SVR4_TargetCodeGenInfo(Types, Kind, HasQPX));
     } else
-      return *(TheTargetCodeGenInfo = new PPC64TargetCodeGenInfo(Types));
+      return SetCGInfo(new PPC64TargetCodeGenInfo(Types));
   case llvm::Triple::ppc64le: {
     assert(Triple.isOSBinFormatELF() && "PPC64 LE non-ELF not supported!");
     PPC64_SVR4_ABIInfo::ABIKind Kind = PPC64_SVR4_ABIInfo::ELFv2;
@@ -7652,25 +8037,23 @@
       Kind = PPC64_SVR4_ABIInfo::ELFv1;
     bool HasQPX = getTarget().getABI() == "elfv1-qpx";
 
-    return *(TheTargetCodeGenInfo =
-             new PPC64_SVR4_TargetCodeGenInfo(Types, Kind, HasQPX));
+    return SetCGInfo(new PPC64_SVR4_TargetCodeGenInfo(Types, Kind, HasQPX));
   }
 
   case llvm::Triple::nvptx:
   case llvm::Triple::nvptx64:
-    return *(TheTargetCodeGenInfo = new NVPTXTargetCodeGenInfo(Types));
+    return SetCGInfo(new NVPTXTargetCodeGenInfo(Types));
 
   case llvm::Triple::msp430:
-    return *(TheTargetCodeGenInfo = new MSP430TargetCodeGenInfo(Types));
+    return SetCGInfo(new MSP430TargetCodeGenInfo(Types));
 
   case llvm::Triple::systemz: {
     bool HasVector = getTarget().getABI() == "vector";
-    return *(TheTargetCodeGenInfo = new SystemZTargetCodeGenInfo(Types,
-                                                                 HasVector));
+    return SetCGInfo(new SystemZTargetCodeGenInfo(Types, HasVector));
   }
 
   case llvm::Triple::tce:
-    return *(TheTargetCodeGenInfo = new TCETargetCodeGenInfo(Types));
+    return SetCGInfo(new TCETargetCodeGenInfo(Types));
 
   case llvm::Triple::x86: {
     bool IsDarwinVectorABI = Triple.isOSDarwin();
@@ -7679,44 +8062,49 @@
     bool IsWin32FloatStructABI = Triple.isOSWindows() && !Triple.isOSCygMing();
 
     if (Triple.getOS() == llvm::Triple::Win32) {
-      return *(TheTargetCodeGenInfo = new WinX86_32TargetCodeGenInfo(
-                   Types, IsDarwinVectorABI, RetSmallStructInRegABI,
-                   IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters));
+      return SetCGInfo(new WinX86_32TargetCodeGenInfo(
+          Types, IsDarwinVectorABI, RetSmallStructInRegABI,
+          IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters));
     } else {
-      return *(TheTargetCodeGenInfo = new X86_32TargetCodeGenInfo(
-                   Types, IsDarwinVectorABI, RetSmallStructInRegABI,
-                   IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters,
-                   CodeGenOpts.FloatABI == "soft"));
+      return SetCGInfo(new X86_32TargetCodeGenInfo(
+          Types, IsDarwinVectorABI, RetSmallStructInRegABI,
+          IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters,
+          CodeGenOpts.FloatABI == "soft"));
     }
   }
 
   case llvm::Triple::x86_64: {
     StringRef ABI = getTarget().getABI();
-    X86AVXABILevel AVXLevel = (ABI == "avx512" ? X86AVXABILevel::AVX512 :
-                               ABI == "avx" ? X86AVXABILevel::AVX :
-                               X86AVXABILevel::None);
+    X86AVXABILevel AVXLevel =
+        (ABI == "avx512"
+             ? X86AVXABILevel::AVX512
+             : ABI == "avx" ? X86AVXABILevel::AVX : X86AVXABILevel::None);
 
     switch (Triple.getOS()) {
     case llvm::Triple::Win32:
-      return *(TheTargetCodeGenInfo =
-                   new WinX86_64TargetCodeGenInfo(Types, AVXLevel));
+      return SetCGInfo(new WinX86_64TargetCodeGenInfo(Types, AVXLevel));
     case llvm::Triple::PS4:
-      return *(TheTargetCodeGenInfo =
-                   new PS4TargetCodeGenInfo(Types, AVXLevel));
+      return SetCGInfo(new PS4TargetCodeGenInfo(Types, AVXLevel));
     default:
-      return *(TheTargetCodeGenInfo =
-                   new X86_64TargetCodeGenInfo(Types, AVXLevel));
+      return SetCGInfo(new X86_64TargetCodeGenInfo(Types, AVXLevel));
     }
   }
   case llvm::Triple::hexagon:
-    return *(TheTargetCodeGenInfo = new HexagonTargetCodeGenInfo(Types));
+    return SetCGInfo(new HexagonTargetCodeGenInfo(Types));
+  case llvm::Triple::lanai:
+    return SetCGInfo(new LanaiTargetCodeGenInfo(Types));
   case llvm::Triple::r600:
-    return *(TheTargetCodeGenInfo = new AMDGPUTargetCodeGenInfo(Types));
+    return SetCGInfo(new AMDGPUTargetCodeGenInfo(Types));
   case llvm::Triple::amdgcn:
-    return *(TheTargetCodeGenInfo = new AMDGPUTargetCodeGenInfo(Types));
+    return SetCGInfo(new AMDGPUTargetCodeGenInfo(Types));
+  case llvm::Triple::sparc:
+    return SetCGInfo(new SparcV8TargetCodeGenInfo(Types));
   case llvm::Triple::sparcv9:
-    return *(TheTargetCodeGenInfo = new SparcV9TargetCodeGenInfo(Types));
+    return SetCGInfo(new SparcV9TargetCodeGenInfo(Types));
   case llvm::Triple::xcore:
-    return *(TheTargetCodeGenInfo = new XCoreTargetCodeGenInfo(Types));
+    return SetCGInfo(new XCoreTargetCodeGenInfo(Types));
+  case llvm::Triple::spir:
+  case llvm::Triple::spir64:
+    return SetCGInfo(new SPIRTargetCodeGenInfo(Types));
   }
 }
diff --git a/lib/CodeGen/TargetInfo.h b/lib/CodeGen/TargetInfo.h
index 71f6b0a..e463825 100644
--- a/lib/CodeGen/TargetInfo.h
+++ b/lib/CodeGen/TargetInfo.h
@@ -217,6 +217,9 @@
   virtual void getDetectMismatchOption(llvm::StringRef Name,
                                        llvm::StringRef Value,
                                        llvm::SmallString<32> &Opt) const {}
+
+  /// Get LLVM calling convention for OpenCL kernel.
+  virtual unsigned getOpenCLKernelCallingConv() const;
 };
 
 } // namespace CodeGen
diff --git a/lib/Driver/Action.cpp b/lib/Driver/Action.cpp
index b45f290..29a4679 100644
--- a/lib/Driver/Action.cpp
+++ b/lib/Driver/Action.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Driver/Action.h"
+#include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
@@ -21,8 +22,8 @@
   switch (AC) {
   case InputClass: return "input";
   case BindArchClass: return "bind-arch";
-  case CudaDeviceClass: return "cuda-device";
-  case CudaHostClass: return "cuda-host";
+  case OffloadClass:
+    return "offload";
   case PreprocessJobClass: return "preprocessor";
   case PrecompileJobClass: return "precompiler";
   case AnalyzeJobClass: return "analyzer";
@@ -40,6 +41,82 @@
   llvm_unreachable("invalid class");
 }
 
+void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) {
+  // Offload actions set the kinds of their dependences themselves.
+  if (Kind == OffloadClass)
+    return;
+
+  assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) &&
+         "Setting device kind to a different device??");
+  assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??");
+  OffloadingDeviceKind = OKind;
+  OffloadingArch = OArch;
+
+  for (auto *A : Inputs)
+    A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch);
+}
+
+void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) {
+  // Offload actions set the kinds of their dependences themselves.
+  if (Kind == OffloadClass)
+    return;
+
+  assert(OffloadingDeviceKind == OFK_None &&
+         "Setting a host kind in a device action.");
+  ActiveOffloadKindMask |= OKinds;
+  OffloadingArch = OArch;
+
+  for (auto *A : Inputs)
+    A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch);
+}
+
+void Action::propagateOffloadInfo(const Action *A) {
+  if (unsigned HK = A->getOffloadingHostActiveKinds())
+    propagateHostOffloadInfo(HK, A->getOffloadingArch());
+  else
+    propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(),
+                               A->getOffloadingArch());
+}
+
+std::string Action::getOffloadingKindPrefix() const {
+  switch (OffloadingDeviceKind) {
+  case OFK_None:
+    break;
+  case OFK_Host:
+    llvm_unreachable("Host kind is not an offloading device kind.");
+    break;
+  case OFK_Cuda:
+    return "device-cuda";
+
+    // TODO: Add other programming models here.
+  }
+
+  if (!ActiveOffloadKindMask)
+    return "";
+
+  std::string Res("host");
+  if (ActiveOffloadKindMask & OFK_Cuda)
+    Res += "-cuda";
+
+  // TODO: Add other programming models here.
+
+  return Res;
+}
+
+std::string
+Action::getOffloadingFileNamePrefix(llvm::StringRef NormalizedTriple) const {
+  // Actions without an offloading device kind get no file name prefix.
+  if (!OffloadingDeviceKind)
+    return "";
+
+  // Device actions get "-<offload kind prefix>-<normalized triple>".
+  std::string Prefix = "-" + getOffloadingKindPrefix();
+  Prefix += "-";
+  Prefix += NormalizedTriple;
+
+  return Prefix;
+}
+
 void InputAction::anchor() {}
 
 InputAction::InputAction(const Arg &_Input, types::ID _Type)
@@ -51,45 +128,138 @@
 BindArchAction::BindArchAction(Action *Input, const char *_ArchName)
     : Action(BindArchClass, Input), ArchName(_ArchName) {}
 
-// Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual
-// compute arch, e.g. "compute_20".  Returns null if the input arch is null or
-// doesn't match an existing arch.
-static const char* GpuArchToComputeName(const char *ArchName) {
-  if (!ArchName)
-    return nullptr;
-  return llvm::StringSwitch<const char *>(ArchName)
-      .Cases("sm_20", "sm_21", "compute_20")
-      .Case("sm_30", "compute_30")
-      .Case("sm_32", "compute_32")
-      .Case("sm_35", "compute_35")
-      .Case("sm_37", "compute_37")
-      .Case("sm_50", "compute_50")
-      .Case("sm_52", "compute_52")
-      .Case("sm_53", "compute_53")
-      .Default(nullptr);
+void OffloadAction::anchor() {}
+
+OffloadAction::OffloadAction(const HostDependence &HDep)
+    : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) {
+  OffloadingArch = HDep.getBoundArch();
+  ActiveOffloadKindMask = HDep.getOffloadKinds();
+  HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+                                             HDep.getBoundArch());
 }
 
-void CudaDeviceAction::anchor() {}
+OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty)
+    : Action(OffloadClass, DDeps.getActions(), Ty),
+      DevToolChains(DDeps.getToolChains()) {
+  auto &OKinds = DDeps.getOffloadKinds();
+  auto &BArchs = DDeps.getBoundArchs();
 
-CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName,
-                                   bool AtTopLevel)
-    : Action(CudaDeviceClass, Input), GpuArchName(ArchName),
-      AtTopLevel(AtTopLevel) {
-  assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
+  // If there are inputs and they all agree on one kind, use it for this action.
+  if (!OKinds.empty() &&
+      llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); }))
+    OffloadingDeviceKind = OKinds.front();
+
+  // If we have a single dependency, inherit the architecture from it.
+  if (OKinds.size() == 1)
+    OffloadingArch = BArchs.front();
+  // Propagate info to the dependencies.
+  for (unsigned i = 0, e = getInputs().size(); i != e; ++i)
+    getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]);
 }
 
-const char *CudaDeviceAction::getComputeArchName() const {
-  return GpuArchToComputeName(GpuArchName);
+OffloadAction::OffloadAction(const HostDependence &HDep,
+                             const DeviceDependences &DDeps)
+    : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()),
+      DevToolChains(DDeps.getToolChains()) {
+  // We use the kinds of the host dependence for this action.
+  OffloadingArch = HDep.getBoundArch();
+  ActiveOffloadKindMask = HDep.getOffloadKinds();
+  HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+                                             HDep.getBoundArch());
+
+  // Add device inputs and propagate info to the device actions. Do work only if
+  // we have dependencies.
+  for (unsigned i = 0, e = DDeps.getActions().size(); i != e; ++i)
+    if (auto *A = DDeps.getActions()[i]) {
+      getInputs().push_back(A);
+      A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i],
+                                    DDeps.getBoundArchs()[i]);
+    }
 }
 
-bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) {
-  return GpuArchToComputeName(ArchName.data()) != nullptr;
+void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const {
+  if (!HostTC)
+    return;
+  assert(!getInputs().empty() && "No dependencies for offload action??");
+  auto *A = getInputs().front();
+  Work(A, HostTC, A->getOffloadingArch());
 }
 
-void CudaHostAction::anchor() {}
+void OffloadAction::doOnEachDeviceDependence(
+    const OffloadActionWorkTy &Work) const {
+  auto I = getInputs().begin();
+  auto E = getInputs().end();
+  if (I == E)
+    return;
 
-CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions)
-    : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {}
+  // We expect to have the same number of input dependences and device tool
+  // chains, except if we also have a host dependence. In that case we have one
+  // more dependence than we have device tool chains.
+  assert(getInputs().size() == DevToolChains.size() + (HostTC ? 1 : 0) &&
+         "Sizes of action dependences and toolchains are not consistent!");
+
+  // Skip host action
+  if (HostTC)
+    ++I;
+
+  auto TI = DevToolChains.begin();
+  for (; I != E; ++I, ++TI)
+    Work(*I, *TI, (*I)->getOffloadingArch());
+}
+
+void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const {
+  doOnHostDependence(Work);
+  doOnEachDeviceDependence(Work);
+}
+
+void OffloadAction::doOnEachDependence(bool IsHostDependence,
+                                       const OffloadActionWorkTy &Work) const {
+  if (IsHostDependence)
+    doOnHostDependence(Work);
+  else
+    doOnEachDeviceDependence(Work);
+}
+
+bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; }
+
+Action *OffloadAction::getHostDependence() const {
+  assert(hasHostDependence() && "Host dependence does not exist!");
+  assert(!getInputs().empty() && "No dependencies for offload action??");
+  return HostTC ? getInputs().front() : nullptr;
+}
+
+bool OffloadAction::hasSingleDeviceDependence(
+    bool DoNotConsiderHostActions) const {
+  if (DoNotConsiderHostActions)
+    return getInputs().size() == (HostTC ? 2 : 1);
+  return !HostTC && getInputs().size() == 1;
+}
+
+Action *
+OffloadAction::getSingleDeviceDependence(bool DoNotConsiderHostActions) const {
+  assert(hasSingleDeviceDependence(DoNotConsiderHostActions) &&
+         "Single device dependence does not exist!");
+  // The previous assert ensures the number of entries in getInputs() is
+  // consistent with what we are doing here.
+  return HostTC ? getInputs()[1] : getInputs().front();
+}
+
+void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC,
+                                           const char *BoundArch,
+                                           OffloadKind OKind) {
+  DeviceActions.push_back(&A);
+  DeviceToolChains.push_back(&TC);
+  DeviceBoundArchs.push_back(BoundArch);
+  DeviceOffloadKinds.push_back(OKind);
+}
+
+OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC,
+                                              const char *BoundArch,
+                                              const DeviceDependences &DDeps)
+    : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch) {
+  for (auto K : DDeps.getOffloadKinds())
+    HostOffloadKinds |= K;
+}
 
 void JobAction::anchor() {}
 
diff --git a/lib/Driver/CMakeLists.txt b/lib/Driver/CMakeLists.txt
index fa0430e..5b8422e 100644
--- a/lib/Driver/CMakeLists.txt
+++ b/lib/Driver/CMakeLists.txt
@@ -3,6 +3,11 @@
   Support
   )
 
+if(WIN32)
+  # MSVCToolChain.cpp uses version.dll.
+  set(system_libs version)
+endif()
+
 add_clang_library(clangDriver
   Action.cpp
   Compilation.cpp
@@ -26,4 +31,5 @@
 
   LINK_LIBS
   clangBasic
+  ${system_libs}
   )
diff --git a/lib/Driver/Compilation.cpp b/lib/Driver/Compilation.cpp
index 1c2eecd..6a2616f 100644
--- a/lib/Driver/Compilation.cpp
+++ b/lib/Driver/Compilation.cpp
@@ -24,10 +24,13 @@
 
 Compilation::Compilation(const Driver &D, const ToolChain &_DefaultToolChain,
                          InputArgList *_Args, DerivedArgList *_TranslatedArgs)
-    : TheDriver(D), DefaultToolChain(_DefaultToolChain),
-      CudaHostToolChain(&DefaultToolChain), CudaDeviceToolChain(nullptr),
+    : TheDriver(D), DefaultToolChain(_DefaultToolChain), ActiveOffloadMask(0u),
       Args(_Args), TranslatedArgs(_TranslatedArgs), Redirects(nullptr),
-      ForDiagnostics(false) {}
+      ForDiagnostics(false) {
+  // The offloading host toolchain is the default tool chain.
+  OrderedOffloadingToolchains.insert(
+      std::make_pair(Action::OFK_Host, &DefaultToolChain));
+}
 
 Compilation::~Compilation() {
   delete TranslatedArgs;
@@ -42,6 +45,7 @@
 
   // Free redirections of stdout/stderr.
   if (Redirects) {
+    delete Redirects[0];
     delete Redirects[1];
     delete Redirects[2];
     delete [] Redirects;
@@ -163,39 +167,17 @@
   return ExecutionFailed ? 1 : Res;
 }
 
-typedef SmallVectorImpl< std::pair<int, const Command *> > FailingCommandList;
-
-static bool ActionFailed(const Action *A,
-                         const FailingCommandList &FailingCommands) {
-
-  if (FailingCommands.empty())
-    return false;
-
-  for (FailingCommandList::const_iterator CI = FailingCommands.begin(),
-         CE = FailingCommands.end(); CI != CE; ++CI)
-    if (A == &(CI->second->getSource()))
-      return true;
-
-  for (Action::const_iterator AI = A->begin(), AE = A->end(); AI != AE; ++AI)
-    if (ActionFailed(*AI, FailingCommands))
-      return true;
-
-  return false;
-}
-
-static bool InputsOk(const Command &C,
-                     const FailingCommandList &FailingCommands) {
-  return !ActionFailed(&C.getSource(), FailingCommands);
-}
-
-void Compilation::ExecuteJobs(const JobList &Jobs,
-                              FailingCommandList &FailingCommands) const {
+void Compilation::ExecuteJobs(
+    const JobList &Jobs,
+    SmallVectorImpl<std::pair<int, const Command *>> &FailingCommands) const {
   for (const auto &Job : Jobs) {
-    if (!InputsOk(Job, FailingCommands))
-      continue;
     const Command *FailingCommand = nullptr;
-    if (int Res = ExecuteCommand(Job, FailingCommand))
+    if (int Res = ExecuteCommand(Job, FailingCommand)) {
       FailingCommands.push_back(std::make_pair(Res, FailingCommand));
+      // Bail as soon as one command fails, so we don't output duplicate error
+      // messages if we die on e.g. the same file.
+      return;
+    }
   }
 }
 
@@ -232,3 +214,7 @@
 StringRef Compilation::getSysRoot() const {
   return getDriver().SysRoot;
 }
+
+void Compilation::Redirect(const StringRef** Redirects) {
+  this->Redirects = Redirects;
+}
diff --git a/lib/Driver/CrossWindowsToolChain.cpp b/lib/Driver/CrossWindowsToolChain.cpp
index 57bf896..28036ea 100644
--- a/lib/Driver/CrossWindowsToolChain.cpp
+++ b/lib/Driver/CrossWindowsToolChain.cpp
@@ -11,6 +11,7 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/Path.h"
 
 using namespace clang::driver;
 using namespace clang::driver::toolchains;
@@ -62,6 +63,8 @@
     llvm::sys::path::append(ResourceDir, "include");
     addSystemInclude(DriverArgs, CC1Args, ResourceDir);
   }
+  for (const auto &P : DriverArgs.getAllArgValues(options::OPT_isystem_after))
+    addSystemInclude(DriverArgs, CC1Args, P);
   addExternCSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/include");
 }
 
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
index eafc0af..5bbc157 100644
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -23,6 +23,7 @@
 #include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -31,7 +32,6 @@
 #include "llvm/Option/OptSpecifier.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -41,6 +41,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <map>
 #include <memory>
+#include <utility>
 
 using namespace clang::driver;
 using namespace clang;
@@ -49,16 +50,16 @@
 Driver::Driver(StringRef ClangExecutable, StringRef DefaultTargetTriple,
                DiagnosticsEngine &Diags,
                IntrusiveRefCntPtr<vfs::FileSystem> VFS)
-    : Opts(createDriverOptTable()), Diags(Diags), VFS(VFS), Mode(GCCMode),
-      SaveTemps(SaveTempsNone), LTOMode(LTOK_None),
-      ClangExecutable(ClangExecutable),
+    : Opts(createDriverOptTable()), Diags(Diags), VFS(std::move(VFS)),
+      Mode(GCCMode), SaveTemps(SaveTempsNone), BitcodeEmbed(EmbedNone),
+      LTOMode(LTOK_None), ClangExecutable(ClangExecutable),
       SysRoot(DEFAULT_SYSROOT), UseStdLib(true),
-      DefaultTargetTriple(DefaultTargetTriple),
       DriverTitle("clang LLVM compiler"), CCPrintOptionsFilename(nullptr),
       CCPrintHeadersFilename(nullptr), CCLogDiagnosticsFilename(nullptr),
       CCCPrintBindings(false), CCPrintHeaders(false), CCLogDiagnostics(false),
-      CCGenDiagnostics(false), CCCGenericGCCName(""), CheckInputsExist(true),
-      CCCUsePCH(true), SuppressMissingInputWarning(false) {
+      CCGenDiagnostics(false), DefaultTargetTriple(DefaultTargetTriple),
+      CCCGenericGCCName(""), CheckInputsExist(true), CCCUsePCH(true),
+      SuppressMissingInputWarning(false) {
 
   // Provide a sane fallback if no VFS is specified.
   if (!this->VFS)
@@ -87,33 +88,41 @@
   llvm::DeleteContainerSeconds(ToolChains);
 }
 
-void Driver::ParseDriverMode(ArrayRef<const char *> Args) {
-  const std::string OptName =
-      getOpts().getOption(options::OPT_driver_mode).getPrefixedName();
+void Driver::ParseDriverMode(StringRef ProgramName,
+                             ArrayRef<const char *> Args) {
+  auto Default = ToolChain::getTargetAndModeFromProgramName(ProgramName);
+  StringRef DefaultMode(Default.second);
+  setDriverModeFromOption(DefaultMode);
 
   for (const char *ArgPtr : Args) {
     // Ingore nullptrs, they are response file's EOL markers
     if (ArgPtr == nullptr)
       continue;
     const StringRef Arg = ArgPtr;
-    if (!Arg.startswith(OptName))
-      continue;
-
-    const StringRef Value = Arg.drop_front(OptName.size());
-    const unsigned M = llvm::StringSwitch<unsigned>(Value)
-                           .Case("gcc", GCCMode)
-                           .Case("g++", GXXMode)
-                           .Case("cpp", CPPMode)
-                           .Case("cl", CLMode)
-                           .Default(~0U);
-
-    if (M != ~0U)
-      Mode = static_cast<DriverMode>(M);
-    else
-      Diag(diag::err_drv_unsupported_option_argument) << OptName << Value;
+    setDriverModeFromOption(Arg);
   }
 }
 
+void Driver::setDriverModeFromOption(StringRef Opt) {
+  const std::string Prefix =
+      getOpts().getOption(options::OPT_driver_mode).getPrefixedName();
+  if (!Opt.startswith(Prefix))
+    return;
+
+  // Translate the text following the prefix; ~0U stands for "unrecognized".
+  StringRef ModeName = Opt.drop_front(Prefix.size());
+  const unsigned Parsed = llvm::StringSwitch<unsigned>(ModeName)
+                              .Case("gcc", GCCMode)
+                              .Case("g++", GXXMode)
+                              .Case("cpp", CPPMode)
+                              .Case("cl", CLMode)
+                              .Default(~0U);
+  if (Parsed == ~0U)
+    Diag(diag::err_drv_unsupported_option_argument) << Prefix << ModeName;
+  else
+    Mode = static_cast<DriverMode>(Parsed);
+}
+
 InputArgList Driver::ParseArgStrings(ArrayRef<const char *> ArgStrings) {
   llvm::PrettyStackTraceString CrashInfo("Command line argument parsing");
 
@@ -146,7 +155,9 @@
   }
 
   for (const Arg *A : Args.filtered(options::OPT_UNKNOWN))
-    Diags.Report(diag::err_drv_unknown_argument) << A->getAsString(Args);
+    Diags.Report(IsCLMode() ? diag::warn_drv_unknown_argument_clang_cl :
+                              diag::err_drv_unknown_argument)
+      << A->getAsString(Args);
 
   return Args;
 }
@@ -276,6 +287,10 @@
     DAL->append(A);
   }
 
+  // Enforce -static if -miamcu is present.
+  if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false))
+    DAL->AddFlagArg(0, Opts->getOption(options::OPT_static));
+
 // Add a default value of -mlinker-version=, if one was given and the user
 // didn't specify one.
 #if defined(HOST_LINK_VERSION)
@@ -294,7 +309,8 @@
 ///
 /// This routine provides the logic to compute a target triple from various
 /// args passed to the driver and the default triple string.
-static llvm::Triple computeTargetTriple(StringRef DefaultTargetTriple,
+static llvm::Triple computeTargetTriple(const Driver &D,
+                                        StringRef DefaultTargetTriple,
                                         const ArgList &Args,
                                         StringRef DarwinArchName = "") {
   // FIXME: Already done in Compilation *Driver::BuildCompilation
@@ -339,8 +355,9 @@
     return Target;
 
   // Handle pseudo-target flags '-m64', '-mx32', '-m32' and '-m16'.
-  if (Arg *A = Args.getLastArg(options::OPT_m64, options::OPT_mx32,
-                               options::OPT_m32, options::OPT_m16)) {
+  Arg *A = Args.getLastArg(options::OPT_m64, options::OPT_mx32,
+                           options::OPT_m32, options::OPT_m16);
+  if (A) {
     llvm::Triple::ArchType AT = llvm::Triple::UnknownArch;
 
     if (A->getOption().matches(options::OPT_m64)) {
@@ -365,6 +382,25 @@
       Target.setArch(AT);
   }
 
+  // Handle -miamcu flag.
+  if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false)) {
+    if (Target.get32BitArchVariant().getArch() != llvm::Triple::x86)
+      D.Diag(diag::err_drv_unsupported_opt_for_target) << "-miamcu"
+                                                       << Target.str();
+
+    if (A && !A->getOption().matches(options::OPT_m32))
+      D.Diag(diag::err_drv_argument_not_allowed_with)
+          << "-miamcu" << A->getBaseArg().getAsString(Args);
+
+    Target.setArch(llvm::Triple::x86);
+    Target.setArchName("i586");
+    Target.setEnvironment(llvm::Triple::UnknownEnvironment);
+    Target.setEnvironmentName("");
+    Target.setOS(llvm::Triple::ELFIAMCU);
+    Target.setVendor(llvm::Triple::UnknownVendor);
+    Target.setVendorName("intel");
+  }
+
   return Target;
 }
 
@@ -394,14 +430,42 @@
   }
 }
 
+void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
+                                              InputList &Inputs) {
+  //
+  // CUDA
+  //
+  // A CUDA device toolchain is only created when at least one input has a CUDA
+  // type; its triple follows the bitness of the offloading host toolchain.
+  bool HasCudaInput = llvm::any_of(
+      Inputs, [](std::pair<types::ID, const llvm::opt::Arg *> &Input) {
+        return types::isCuda(Input.first);
+      });
+
+  if (HasCudaInput) {
+    const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+    const char *DeviceTriple = HostTC->getTriple().isArch64Bit()
+                                   ? "nvptx64-nvidia-cuda"
+                                   : "nvptx-nvidia-cuda";
+    const ToolChain &DeviceTC =
+        getToolChain(C.getInputArgs(), llvm::Triple(DeviceTriple));
+    C.addOffloadDeviceToolChain(&DeviceTC, Action::OFK_Cuda);
+  }
+
+  //
+  // TODO: Add support for other offloading programming models here.
+  //
+}
+
 Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
   llvm::PrettyStackTraceString CrashInfo("Compilation construction");
 
   // FIXME: Handle environment options which affect driver behavior, somewhere
   // (client?). GCC_EXEC_PREFIX, LPATH, CC_PRINT_OPTIONS.
 
-  if (char *env = ::getenv("COMPILER_PATH")) {
-    StringRef CompilerPath = env;
+  if (Optional<std::string> CompilerPathValue =
+          llvm::sys::Process::GetEnv("COMPILER_PATH")) {
+    StringRef CompilerPath = *CompilerPathValue;
     while (!CompilerPath.empty()) {
       std::pair<StringRef, StringRef> Split =
           CompilerPath.split(llvm::sys::EnvPathSeparator);
@@ -412,7 +476,7 @@
 
   // We look for the driver mode option early, because the mode can affect
   // how other options are parsed.
-  ParseDriverMode(ArgList.slice(1));
+  ParseDriverMode(ClangExecutable, ArgList.slice(1));
 
   // FIXME: What are we going to do with -V and -b?
 
@@ -479,6 +543,28 @@
 
   setLTOMode(Args);
 
+  // Ignore -fembed-bitcode options with LTO
+  // since the output will be bitcode anyway.
+  if (getLTOMode() == LTOK_None) {
+    if (Arg *A = Args.getLastArg(options::OPT_fembed_bitcode_EQ)) {
+      StringRef Name = A->getValue();
+      unsigned Model = llvm::StringSwitch<unsigned>(Name)
+          .Case("off", EmbedNone)
+          .Case("all", EmbedBitcode)
+          .Case("bitcode", EmbedBitcode)
+          .Case("marker", EmbedMarker)
+          .Default(~0U);
+      if (Model == ~0U) {
+        Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args)
+                                                  << Name;
+      } else
+        BitcodeEmbed = static_cast<BitcodeEmbedMode>(Model);
+    }
+  } else {
+    // claim the bitcode option under LTO so no warning is issued.
+    Args.ClaimAllArgs(options::OPT_fembed_bitcode_EQ);
+  }
+
   std::unique_ptr<llvm::opt::InputArgList> UArgs =
       llvm::make_unique<InputArgList>(std::move(Args));
 
@@ -486,16 +572,12 @@
   DerivedArgList *TranslatedArgs = TranslateInputArgs(*UArgs);
 
   // Owned by the host.
-  const ToolChain &TC =
-      getToolChain(*UArgs, computeTargetTriple(DefaultTargetTriple, *UArgs));
+  const ToolChain &TC = getToolChain(
+      *UArgs, computeTargetTriple(*this, DefaultTargetTriple, *UArgs));
 
   // The compilation takes ownership of Args.
   Compilation *C = new Compilation(*this, TC, UArgs.release(), TranslatedArgs);
 
-  C->setCudaDeviceToolChain(
-      &getToolChain(C->getArgs(), llvm::Triple(TC.getTriple().isArch64Bit()
-                                                   ? "nvptx64-nvidia-cuda"
-                                                   : "nvptx-nvidia-cuda")));
   if (!HandleImmediateArgs(*C))
     return C;
 
@@ -503,13 +585,15 @@
   InputList Inputs;
   BuildInputs(C->getDefaultToolChain(), *TranslatedArgs, Inputs);
 
+  // Populate the tool chains for the offloading devices, if any.
+  CreateOffloadingDeviceToolChains(*C, Inputs);
+
   // Construct the list of abstract actions to perform for this compilation. On
   // MachO targets this uses the driver-driver and universal actions.
   if (TC.getTriple().isOSBinFormatMachO())
     BuildUniversalActions(*C, C->getDefaultToolChain(), Inputs);
   else
-    BuildActions(*C, C->getDefaultToolChain(), C->getArgs(), Inputs,
-                 C->getActions());
+    BuildActions(*C, C->getArgs(), Inputs, C->getActions());
 
   if (CCCPrintPhases) {
     PrintActions(*C);
@@ -623,7 +707,7 @@
   if (TC.getTriple().isOSBinFormatMachO())
     BuildUniversalActions(C, TC, Inputs);
   else
-    BuildActions(C, TC, C.getArgs(), Inputs, C.getActions());
+    BuildActions(C, C.getArgs(), Inputs, C.getActions());
 
   BuildJobs(C);
 
@@ -947,19 +1031,34 @@
     os << "\"" << IA->getInputArg().getValue() << "\"";
   } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
     os << '"' << BIA->getArchName() << '"' << ", {"
-       << PrintActions1(C, *BIA->begin(), Ids) << "}";
-  } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    os << '"'
-       << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
-       << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
+       << PrintActions1(C, *BIA->input_begin(), Ids) << "}";
+  } else if (OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+    bool IsFirst = true;
+    OA->doOnEachDependence(
+        [&](Action *A, const ToolChain *TC, const char *BoundArch) {
+          // E.g. for two CUDA device dependences whose bound arch is sm_20 and
+          // sm_35 this will generate:
+          // "device-cuda (nvptx64-nvidia-cuda:sm_20)" {#ID},
+          // "device-cuda (nvptx64-nvidia-cuda:sm_35)" {#ID}
+          if (!IsFirst)
+            os << ", ";
+          os << '"';
+          // Never dereference a null TC; it is labelled "host" instead.
+          if (TC)
+            os << A->getOffloadingKindPrefix() << " ("
+               << TC->getTriple().normalize();
+          else
+            os << "host (";
+
+          if (BoundArch)
+            os << ":" << BoundArch;
+          os << ")";
+          os << '"';
+          os << " {" << PrintActions1(C, A, Ids) << "}";
+          IsFirst = false;
+        });
   } else {
-    const ActionList *AL;
-    if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
-      os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
-         << ", gpu binaries ";
-      AL = &CHA->getDeviceActions();
-    } else
-      AL = &A->getInputs();
+    const ActionList *AL = &A->getInputs();
 
     if (AL->size()) {
       const char *Prefix = "{";
@@ -972,10 +1071,24 @@
       os << "{}";
   }
 
+  // Append offload info for all actions other than the offloading action
+  // itself (e.g. (device-cuda, sm_20) or (host-cuda)).
+  std::string offload_str;
+  llvm::raw_string_ostream offload_os(offload_str);
+  if (!isa<OffloadAction>(A)) {
+    auto S = A->getOffloadingKindPrefix();
+    if (!S.empty()) {
+      offload_os << ", (" << S;
+      if (A->getOffloadingArch())
+        offload_os << ", " << A->getOffloadingArch();
+      offload_os << ")";
+    }
+  }
+
   unsigned Id = Ids.size();
   Ids[A] = Id;
   llvm::errs() << Id << ": " << os.str() << ", "
-               << types::getTypeName(A->getType()) << "\n";
+               << types::getTypeName(A->getType()) << offload_os.str() << "\n";
 
   return Id;
 }
@@ -995,7 +1108,7 @@
       isa<AssembleJobAction>(A))
     return true;
 
-  for (const Action *Input : *A)
+  for (const Action *Input : A->inputs())
     if (ContainsCompileOrAssembleAction(Input))
       return true;
 
@@ -1034,7 +1147,7 @@
     Archs.push_back(Args.MakeArgString(TC.getDefaultUniversalArchName()));
 
   ActionList SingleActions;
-  BuildActions(C, TC, Args, BAInputs, SingleActions);
+  BuildActions(C, Args, BAInputs, SingleActions);
 
   // Add in arch bindings for every top level action, as well as lipo and
   // dsymutil steps if needed.
@@ -1092,7 +1205,7 @@
 /// \brief Check that the file referenced by Value exists. If it doesn't,
 /// issue a diagnostic and return false.
 static bool DiagnoseInputExistence(const Driver &D, const DerivedArgList &Args,
-                                   StringRef Value) {
+                                   StringRef Value, types::ID Ty) {
   if (!D.getCheckInputsExist())
     return true;
 
@@ -1112,9 +1225,18 @@
   if (llvm::sys::fs::exists(Twine(Path)))
     return true;
 
-  if (D.IsCLMode() && !llvm::sys::path::is_absolute(Twine(Path)) &&
-      llvm::sys::Process::FindInEnvPath("LIB", Value))
-    return true;
+  if (D.IsCLMode()) {
+    if (!llvm::sys::path::is_absolute(Twine(Path)) &&
+        llvm::sys::Process::FindInEnvPath("LIB", Value))
+      return true;
+
+    if (Args.hasArg(options::OPT__SLASH_link) && Ty == types::TY_Object) {
+      // Arguments to the /link flag might cause the linker to search for object
+      // and library files in paths we don't know about. Don't error in such
+      // cases.
+      return true;
+    }
+  }
 
   D.Diag(clang::diag::err_drv_no_such_file) << Path;
   return false;
@@ -1230,19 +1352,19 @@
         }
       }
 
-      if (DiagnoseInputExistence(*this, Args, Value))
+      if (DiagnoseInputExistence(*this, Args, Value, Ty))
         Inputs.push_back(std::make_pair(Ty, A));
 
     } else if (A->getOption().matches(options::OPT__SLASH_Tc)) {
       StringRef Value = A->getValue();
-      if (DiagnoseInputExistence(*this, Args, Value)) {
+      if (DiagnoseInputExistence(*this, Args, Value, types::TY_C)) {
         Arg *InputArg = MakeInputArg(Args, Opts, A->getValue());
         Inputs.push_back(std::make_pair(types::TY_C, InputArg));
       }
       A->claim();
     } else if (A->getOption().matches(options::OPT__SLASH_Tp)) {
       StringRef Value = A->getValue();
-      if (DiagnoseInputExistence(*this, Args, Value)) {
+      if (DiagnoseInputExistence(*this, Args, Value, types::TY_CXX)) {
         Arg *InputArg = MakeInputArg(Args, Opts, A->getValue());
         Inputs.push_back(std::make_pair(types::TY_CXX, InputArg));
       }
@@ -1284,32 +1406,51 @@
 static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
                                 const Arg *InputArg, Action *HostAction,
                                 ActionList &Actions) {
-  Arg *PartialCompilationArg = Args.getLastArg(options::OPT_cuda_host_only,
-                                               options::OPT_cuda_device_only);
-  // Host-only compilation case.
-  if (PartialCompilationArg &&
-      PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only))
-    return C.MakeAction<CudaHostAction>(HostAction, ActionList());
+  Arg *PartialCompilationArg = Args.getLastArg(
+      options::OPT_cuda_host_only, options::OPT_cuda_device_only,
+      options::OPT_cuda_compile_host_device);
+  bool CompileHostOnly =
+      PartialCompilationArg &&
+      PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only);
+  bool CompileDeviceOnly =
+      PartialCompilationArg &&
+      PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
+  const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+  assert(HostTC && "No toolchain for host compilation.");
+  if (HostTC->getTriple().isNVPTX()) {
+    // We do not support targeting NVPTX for host compilation. Throw
+    // an error and abort pipeline construction early so we don't trip
+    // asserts that assume device-side compilation.
+    C.getDriver().Diag(diag::err_drv_cuda_nvptx_host);
+    return nullptr;
+  }
+
+  if (CompileHostOnly) {
+    OffloadAction::HostDependence HDep(*HostAction, *HostTC,
+                                       /*BoundArch=*/nullptr, Action::OFK_Cuda);
+    return C.MakeAction<OffloadAction>(HDep);
+  }
 
   // Collect all cuda_gpu_arch parameters, removing duplicates.
-  SmallVector<const char *, 4> GpuArchList;
-  llvm::StringSet<> GpuArchNames;
+  SmallVector<CudaArch, 4> GpuArchList;
+  llvm::SmallSet<CudaArch, 4> GpuArchs;
   for (Arg *A : Args) {
     if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
       continue;
     A->claim();
 
-    const auto& Arch = A->getValue();
-    if (!CudaDeviceAction::IsValidGpuArchName(Arch))
-      C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch;
-    else if (GpuArchNames.insert(Arch).second)
+    const auto &ArchStr = A->getValue();
+    CudaArch Arch = StringToCudaArch(ArchStr);
+    if (Arch == CudaArch::UNKNOWN)
+      C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+    else if (GpuArchs.insert(Arch).second)
       GpuArchList.push_back(Arch);
   }
 
   // Default to sm_20 which is the lowest common denominator for supported GPUs.
   // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
   if (GpuArchList.empty())
-    GpuArchList.push_back("sm_20");
+    GpuArchList.push_back(CudaArch::SM_20);
 
   // Replicate inputs for each GPU architecture.
   Driver::InputList CudaDeviceInputs;
@@ -1317,11 +1458,8 @@
     CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
 
   // Build actions for all device inputs.
-  assert(C.getCudaDeviceToolChain() &&
-         "Missing toolchain for device-side compilation.");
   ActionList CudaDeviceActions;
-  C.getDriver().BuildActions(C, *C.getCudaDeviceToolChain(), Args,
-                             CudaDeviceInputs, CudaDeviceActions);
+  C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
   assert(GpuArchList.size() == CudaDeviceActions.size() &&
          "Failed to create actions for all devices");
 
@@ -1331,28 +1469,32 @@
         return a->getKind() != Action::AssembleJobClass;
       });
 
+  const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+
   // Figure out what to do with device actions -- pass them as inputs to the
   // host action or run each of them independently.
-  bool DeviceOnlyCompilation = PartialCompilationArg != nullptr;
-  if (PartialCompilation || DeviceOnlyCompilation) {
+  if (PartialCompilation || CompileDeviceOnly) {
     // In case of partial or device-only compilation results of device actions
     // are not consumed by the host action device actions have to be added to
     // top-level actions list with AtTopLevel=true and run independently.
 
     // -o is ambiguous if we have more than one top-level action.
     if (Args.hasArg(options::OPT_o) &&
-        (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
+        (!CompileDeviceOnly || GpuArchList.size() > 1)) {
       C.getDriver().Diag(
           clang::diag::err_drv_output_argument_with_multiple_files);
       return nullptr;
     }
 
-    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-      Actions.push_back(C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I],
-                                                       GpuArchList[I],
-                                                       /* AtTopLevel */ true));
+    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+      OffloadAction::DeviceDependences DDep;
+      DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]),
+               Action::OFK_Cuda);
+      Actions.push_back(
+          C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+    }
     // Kill host action in case of device-only compilation.
-    if (DeviceOnlyCompilation)
+    if (CompileDeviceOnly)
       return nullptr;
     return HostAction;
   }
@@ -1370,24 +1512,26 @@
     Action* BackendAction = AssembleAction->getInputs()[0];
     assert(BackendAction->getType() == types::TY_PP_Asm);
 
-    for (const auto& A : {AssembleAction, BackendAction}) {
-      DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
-          A, GpuArchList[I], /* AtTopLevel */ false));
+    for (auto &A : {AssembleAction, BackendAction}) {
+      OffloadAction::DeviceDependences DDep;
+      DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda);
+      DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
     }
   }
-  auto FatbinAction = C.MakeAction<CudaDeviceAction>(
-      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
-      /* GpuArchName = */ nullptr,
-      /* AtTopLevel = */ false);
+  auto FatbinAction =
+      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
   // Return a new host action that incorporates original host action and all
   // device actions.
-  return C.MakeAction<CudaHostAction>(std::move(HostAction),
-                                      ActionList({FatbinAction}));
+  OffloadAction::HostDependence HDep(*HostAction, *HostTC,
+                                     /*BoundArch=*/nullptr, Action::OFK_Cuda);
+  OffloadAction::DeviceDependences DDep;
+  DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
+  return C.MakeAction<OffloadAction>(HDep, DDep);
 }
 
-void Driver::BuildActions(Compilation &C, const ToolChain &TC,
-                          DerivedArgList &Args, const InputList &Inputs,
-                          ActionList &Actions) const {
+void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
+                          const InputList &Inputs, ActionList &Actions) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
 
   if (!SuppressMissingInputWarning && Inputs.empty()) {
@@ -1440,6 +1584,61 @@
     }
   }
 
+  // Diagnose unsupported forms of /Yc /Yu. Ignore /Yc/Yu for now if:
+  // * no filename after it
+  // * both /Yc and /Yu passed but with different filenames
+  // * corresponding file not also passed as /FI
+  Arg *YcArg = Args.getLastArg(options::OPT__SLASH_Yc);
+  Arg *YuArg = Args.getLastArg(options::OPT__SLASH_Yu);
+  if (YcArg && YcArg->getValue()[0] == '\0') {
+    Diag(clang::diag::warn_drv_ycyu_no_arg_clang_cl) << YcArg->getSpelling();
+    Args.eraseArg(options::OPT__SLASH_Yc);
+    YcArg = nullptr;
+  }
+  if (YuArg && YuArg->getValue()[0] == '\0') {
+    Diag(clang::diag::warn_drv_ycyu_no_arg_clang_cl) << YuArg->getSpelling();
+    Args.eraseArg(options::OPT__SLASH_Yu);
+    YuArg = nullptr;
+  }
+  if (YcArg && YuArg && strcmp(YcArg->getValue(), YuArg->getValue()) != 0) {
+    Diag(clang::diag::warn_drv_ycyu_different_arg_clang_cl);
+    Args.eraseArg(options::OPT__SLASH_Yc);
+    Args.eraseArg(options::OPT__SLASH_Yu);
+    YcArg = YuArg = nullptr;
+  }
+  if (YcArg || YuArg) {
+    StringRef Val = YcArg ? YcArg->getValue() : YuArg->getValue();
+    bool FoundMatchingInclude = false;
+    for (const Arg *Inc : Args.filtered(options::OPT_include)) {
+      // FIXME: Do case-insensitive matching and consider / and \ as equal.
+      if (Inc->getValue() == Val)
+        FoundMatchingInclude = true;
+    }
+    if (!FoundMatchingInclude) {
+      Diag(clang::diag::warn_drv_ycyu_no_fi_arg_clang_cl)
+          << (YcArg ? YcArg : YuArg)->getSpelling();
+      Args.eraseArg(options::OPT__SLASH_Yc);
+      Args.eraseArg(options::OPT__SLASH_Yu);
+      YcArg = YuArg = nullptr;
+    }
+  }
+  if (YcArg && Inputs.size() > 1) {
+    Diag(clang::diag::warn_drv_yc_multiple_inputs_clang_cl);
+    Args.eraseArg(options::OPT__SLASH_Yc);
+    YcArg = nullptr;
+  }
+  if (Args.hasArg(options::OPT__SLASH_Y_)) {
+    // /Y- disables all pch handling.  Rather than check for it everywhere,
+    // just remove clang-cl pch-related flags here.
+    Args.eraseArg(options::OPT__SLASH_Fp);
+    Args.eraseArg(options::OPT__SLASH_Yc);
+    Args.eraseArg(options::OPT__SLASH_Yu);
+    YcArg = YuArg = nullptr;
+  }
+
+  // Track the host offload kinds used on this compilation.
+  unsigned CompilationActiveOffloadHostKinds = 0u;
+
   // Construct the actions to perform.
   ActionList LinkerInputs;
 
@@ -1483,12 +1682,34 @@
       continue;
     }
 
+    if (YcArg) {
+      // Add a separate precompile phase for the compile phase.
+      if (FinalPhase >= phases::Compile) {
+        llvm::SmallVector<phases::ID, phases::MaxNumberOfPhases> PCHPL;
+        types::getCompilationPhases(types::TY_CXXHeader, PCHPL);
+        Arg *PchInputArg = MakeInputArg(Args, Opts, YcArg->getValue());
+
+        // Build the pipeline for the pch file.
+        Action *ClangClPch = C.MakeAction<InputAction>(*PchInputArg, InputType);
+        for (phases::ID Phase : PCHPL)
+          ClangClPch = ConstructPhaseAction(C, Args, Phase, ClangClPch);
+        assert(ClangClPch);
+        Actions.push_back(ClangClPch);
+        // The driver currently exits after the first failed command.  This
+        // relies on that behavior, to make sure if the pch generation fails,
+        // the main compilation won't run.
+      }
+    }
+
     phases::ID CudaInjectionPhase =
         (phases::Compile < FinalPhase &&
          llvm::find(PL, phases::Compile) != PL.end())
             ? phases::Compile
             : FinalPhase;
 
+    // Track the host offload kinds used on this input.
+    unsigned InputActiveOffloadHostKinds = 0u;
+
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -1514,27 +1735,42 @@
         continue;
 
       // Otherwise construct the appropriate action.
-      Current = ConstructPhaseAction(C, TC, Args, Phase, Current);
+      Current = ConstructPhaseAction(C, Args, Phase, Current);
 
       if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
         Current = buildCudaActions(C, Args, InputArg, Current, Actions);
         if (!Current)
           break;
+
+        // We produced a CUDA action for this input, so the host has to support
+        // CUDA.
+        InputActiveOffloadHostKinds |= Action::OFK_Cuda;
+        CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
       }
 
       if (Current->getType() == types::TY_Nothing)
         break;
     }
 
-    // If we ended with something, add to the output list.
-    if (Current)
+    // If we ended with something, add to the output list. Also, propagate the
+    // offload information to the top-level host action related to the current
+    // input.
+    if (Current) {
+      if (InputActiveOffloadHostKinds)
+        Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
+                                          /*BoundArch=*/nullptr);
       Actions.push_back(Current);
+    }
   }
 
-  // Add a link action if necessary.
-  if (!LinkerInputs.empty())
+  // Add a link action if necessary and propagate the offload information for
+  // the current compilation.
+  if (!LinkerInputs.empty()) {
     Actions.push_back(
         C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
+    Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
+                                             /*BoundArch=*/nullptr);
+  }
 
   // If we are linking, claim any options which are obviously only used for
   // compilation.
@@ -1546,14 +1782,14 @@
   // Claim ignored clang-cl options.
   Args.ClaimAllArgs(options::OPT_cl_ignored_Group);
 
-  // Claim --cuda-host-only arg which may be passed to non-CUDA
-  // compilations and should not trigger warnings there.
+  // Claim --cuda-host-only and --cuda-compile-host-device, which may be passed
+  // to non-CUDA compilations and should not trigger warnings there.
   Args.ClaimAllArgs(options::OPT_cuda_host_only);
+  Args.ClaimAllArgs(options::OPT_cuda_compile_host_device);
 }
 
-Action *Driver::ConstructPhaseAction(Compilation &C, const ToolChain &TC,
-                                     const ArgList &Args, phases::ID Phase,
-                                     Action *Input) const {
+Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args,
+                                     phases::ID Phase, Action *Input) const {
   llvm::PrettyStackTraceString CrashInfo("Constructing phase actions");
   // Build the appropriate action.
   switch (Phase) {
@@ -1670,7 +1906,8 @@
                        /*BoundArch*/ nullptr,
                        /*AtTopLevel*/ true,
                        /*MultipleArchs*/ ArchNames.size() > 1,
-                       /*LinkingOutput*/ LinkingOutput, CachedResults);
+                       /*LinkingOutput*/ LinkingOutput, CachedResults,
+                       /*BuildForOffloadDevice*/ false);
   }
 
   // If the user passed -Qunused-arguments or there were errors, don't warn
@@ -1682,8 +1919,9 @@
   // Claim -### here.
   (void)C.getArgs().hasArg(options::OPT__HASH_HASH_HASH);
 
-  // Claim --driver-mode, it was handled earlier.
+  // Claim --driver-mode and --rsp-quoting; they were handled earlier.
   (void)C.getArgs().hasArg(options::OPT_driver_mode);
+  (void)C.getArgs().hasArg(options::OPT_rsp_quoting);
 
   for (Arg *A : C.getArgs()) {
     // FIXME: It would be nice to be able to send the argument to the
@@ -1710,74 +1948,123 @@
           continue;
       }
 
-      Diag(clang::diag::warn_drv_unused_argument)
-          << A->getAsString(C.getArgs());
+      // In clang-cl, don't mention unknown arguments here since they have
+      // already been warned about.
+      if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN))
+        Diag(clang::diag::warn_drv_unused_argument)
+            << A->getAsString(C.getArgs());
     }
   }
 }
-
+/// If \p CurAction is an OffloadAction whose host dependence or, failing that,
+/// single device dependence is an action of type \p T, replace \p CurAction
+/// with that dependence and return the OffloadAction that was looked through.
+/// Otherwise leave \p CurAction untouched and return null.
+template <typename T>
+static OffloadAction *collapseOffloadingAction(Action *&CurAction) {
+  if (!CurAction)
+    return nullptr;
+  if (auto *OA = dyn_cast<OffloadAction>(CurAction)) {
+    if (OA->hasHostDependence())
+      if (auto *HDep = dyn_cast<T>(OA->getHostDependence())) {
+        CurAction = HDep;
+        return OA;
+      }
+    if (OA->hasSingleDeviceDependence())
+      if (auto *DDep = dyn_cast<T>(OA->getSingleDeviceDependence())) {
+        CurAction = DDep;
+        return OA;
+      }
+  }
+  return nullptr;
+}
 // Returns a Tool for a given JobAction.  In case the action and its
 // predecessors can be combined, updates Inputs with the inputs of the
 // first combined action. If one of the collapsed actions is a
 // CudaHostAction, updates CollapsedCHA with the pointer to it so the
 // caller can deal with extra handling such action requires.
 static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
-                                    const ToolChain *TC, const JobAction *JA,
+                                    bool EmbedBitcode, const ToolChain *TC,
+                                    const JobAction *JA,
                                     const ActionList *&Inputs,
-                                    const CudaHostAction *&CollapsedCHA) {
+                                    ActionList &CollapsedOffloadAction) {
   const Tool *ToolForJob = nullptr;
-  CollapsedCHA = nullptr;
+  CollapsedOffloadAction.clear();
 
   // See if we should look for a compiler with an integrated assembler. We match
   // bottom up, so what we are actually looking for is an assembler job with a
   // compiler input.
 
+  // Look through offload actions between assembler and backend actions.
+  Action *BackendJA = (isa<AssembleJobAction>(JA) && Inputs->size() == 1)
+                          ? *Inputs->begin()
+                          : nullptr;
+  auto *BackendOA = collapseOffloadingAction<BackendJobAction>(BackendJA);
+
   if (TC->useIntegratedAs() && !SaveTemps &&
       !C.getArgs().hasArg(options::OPT_via_file_asm) &&
       !C.getArgs().hasArg(options::OPT__SLASH_FA) &&
-      !C.getArgs().hasArg(options::OPT__SLASH_Fa) &&
-      isa<AssembleJobAction>(JA) && Inputs->size() == 1 &&
-      isa<BackendJobAction>(*Inputs->begin())) {
-    // A BackendJob is always preceded by a CompileJob, and without
-    // -save-temps they will always get combined together, so instead of
-    // checking the backend tool, check if the tool for the CompileJob
-    // has an integrated assembler.
-    const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
-    // Compile job may be wrapped in CudaHostAction, extract it if
-    // that's the case and update CollapsedCHA if we combine phases.
-    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
-    JobAction *CompileJA =
-        cast<CompileJobAction>(CHA ? *CHA->begin() : *BackendInputs->begin());
-    assert(CompileJA && "Backend job is not preceeded by compile job.");
-    const Tool *Compiler = TC->SelectTool(*CompileJA);
+      !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA &&
+      isa<BackendJobAction>(BackendJA)) {
+    // A BackendJob is always preceded by a CompileJob, and without -save-temps
+    // or -fembed-bitcode, they will always get combined together, so instead of
+    // checking the backend tool, check if the tool for the CompileJob has an
+    // integrated assembler. For -fembed-bitcode, CompileJob is still used to
+    // look up tools for BackendJob, but they need to match before we can split
+    // them.
+
+    // Look through offload actions between backend and compile actions.
+    Action *CompileJA = *BackendJA->getInputs().begin();
+    auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+    assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+           "Backend job is not preceeded by compile job.");
+    const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
     if (!Compiler)
       return nullptr;
+    // When using -fembed-bitcode, it is required to have the same tool (clang)
+    // for both CompileJA and BackendJA. Otherwise, combine two stages.
+    if (EmbedBitcode) {
+      JobAction *InputJA = cast<JobAction>(*Inputs->begin());
+      const Tool *BackendTool = TC->SelectTool(*InputJA);
+      if (BackendTool == Compiler)
+        CompileJA = InputJA;
+    }
     if (Compiler->hasIntegratedAssembler()) {
       Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
-      CollapsedCHA = CHA;
+      // Save the collapsed offload actions because they may still contain
+      // device actions.
+      if (CompileOA)
+        CollapsedOffloadAction.push_back(CompileOA);
+      if (BackendOA)
+        CollapsedOffloadAction.push_back(BackendOA);
     }
   }
 
   // A backend job should always be combined with the preceding compile job
-  // unless OPT_save_temps is enabled and the compiler is capable of emitting
-  // LLVM IR as an intermediate output.
+  // unless OPT_save_temps or OPT_fembed_bitcode is enabled and the compiler is
+  // capable of emitting LLVM IR as an intermediate output.
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    // Compile job may be wrapped in CudaHostAction, extract it if
-    // that's the case and update CollapsedCHA if we combine phases.
-    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
-    JobAction *CompileJA =
-        cast<CompileJobAction>(CHA ? *CHA->begin() : *Inputs->begin());
-    assert(CompileJA && "Backend job is not preceeded by compile job.");
-    const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+    // Look through offload actions between backend and compile actions.
+    Action *CompileJA = *JA->getInputs().begin();
+    auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+    assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+           "Backend job is not preceeded by compile job.");
+    const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
     if (!Compiler)
       return nullptr;
-    if (!Compiler->canEmitIR() || !SaveTemps) {
+    if (!Compiler->canEmitIR() ||
+        (!SaveTemps && !EmbedBitcode)) {
       Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
-      CollapsedCHA = CHA;
+
+      if (CompileOA)
+        CollapsedOffloadAction.push_back(CompileOA);
     }
   }
 
@@ -1788,12 +2075,21 @@
   // See if we should use an integrated preprocessor. We do so when we have
   // exactly one input, since this is the only use case we care about
   // (irrelevant since we don't support combine yet).
-  if (Inputs->size() == 1 && isa<PreprocessJobAction>(*Inputs->begin()) &&
+
+  // Look through offload actions after preprocessing.
+  Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr;
+  auto *PreprocessOA =
+      collapseOffloadingAction<PreprocessJobAction>(PreprocessJA);
+
+  if (PreprocessJA && isa<PreprocessJobAction>(PreprocessJA) &&
       !C.getArgs().hasArg(options::OPT_no_integrated_cpp) &&
       !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps &&
       !C.getArgs().hasArg(options::OPT_rewrite_objc) &&
-      ToolForJob->hasIntegratedCPP())
-    Inputs = &(*Inputs)[0]->getInputs();
+      ToolForJob->hasIntegratedCPP()) {
+    Inputs = &PreprocessJA->getInputs();
+    if (PreprocessOA)
+      CollapsedOffloadAction.push_back(PreprocessOA);
+  }
 
   return ToolForJob;
 }
@@ -1801,8 +2097,8 @@
 InputInfo Driver::BuildJobsForAction(
     Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
-    const {
+    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    bool BuildForOffloadDevice) const {
   // The bound arch is not necessarily represented in the toolchain's triple --
   // for example, armv7 and armv7s both map to the same triple -- so we need
   // both in our map.
@@ -1816,9 +2112,9 @@
   if (CachedResult != CachedResults.end()) {
     return CachedResult->second;
   }
-  InputInfo Result =
-      BuildJobsForActionNoCache(C, A, TC, BoundArch, AtTopLevel, MultipleArchs,
-                                LinkingOutput, CachedResults);
+  InputInfo Result = BuildJobsForActionNoCache(
+      C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput,
+      CachedResults, BuildForOffloadDevice);
   CachedResults[ActionTC] = Result;
   return Result;
 }
@@ -1826,21 +2122,65 @@
 InputInfo Driver::BuildJobsForActionNoCache(
     Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
-    const {
+    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    bool BuildForOffloadDevice) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
-  InputInfoList CudaDeviceInputInfos;
-  if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
-    // Append outputs of device jobs to the input list.
-    for (const Action *DA : CHA->getDeviceActions()) {
-      CudaDeviceInputInfos.push_back(BuildJobsForAction(
-          C, DA, TC, nullptr, AtTopLevel,
-          /*MultipleArchs*/ false, LinkingOutput, CachedResults));
+  InputInfoList OffloadDependencesInputInfo;
+  if (const OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+    // The offload action is expected to be used in four different situations.
+    //
+    // a) Set a toolchain/architecture/kind for a host action:
+    //    Host Action 1 -> OffloadAction -> Host Action 2
+    //
+    // b) Set a toolchain/architecture/kind for a device action;
+    //    Device Action 1 -> OffloadAction -> Device Action 2
+    //
+    // c) Specify device dependences for a host action;
+    //    Device Action 1  _
+    //                      \
+    //      Host Action 1  ---> OffloadAction -> Host Action 2
+    //
+    // d) Specify a host dependence to a device action.
+    //      Host Action 1  _
+    //                      \
+    //    Device Action 1  ---> OffloadAction -> Device Action 2
+    //
+    // For a) and b), we just return the job generated for the dependence. For
+    // c) and d) we override the current action with the host/device dependence
+    // if the current toolchain is host/device and set the offload dependences
+    // info with the jobs obtained from the device/host dependence(s).
+
+    // If there is a single device dependence, just generate the job for it.
+    if (OA->hasSingleDeviceDependence()) {
+      InputInfo DevA;
+      OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC,
+                                       const char *DepBoundArch) {
+        DevA =
+            BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel,
+                               /*MultipleArchs*/ !!DepBoundArch, LinkingOutput,
+                               CachedResults, /*BuildForOffloadDevice=*/true);
+      });
+      return DevA;
     }
-    // Override current action with a real host compile action and continue
-    // processing it.
-    A = *CHA->begin();
+
+    // If 'Action 2' is host, we generate jobs for the device dependences and
+    // override the current action with the host dependence. Otherwise, we
+    // generate the host dependences and override the action with the device
+    // dependence. The dependences can't therefore be a top-level action.
+    OA->doOnEachDependence(
+        /*IsHostDependence=*/BuildForOffloadDevice,
+        [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+              C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
+              /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
+              /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+                  Action::OFK_None));
+        });
+
+    A = BuildForOffloadDevice
+            ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true)
+            : OA->getHostDependence();
   }
 
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
@@ -1860,47 +2200,41 @@
     const char *ArchName = BAA->getArchName();
 
     if (ArchName)
-      TC = &getToolChain(
-          C.getArgs(),
-          computeTargetTriple(DefaultTargetTriple, C.getArgs(), ArchName));
+      TC = &getToolChain(C.getArgs(),
+                         computeTargetTriple(*this, DefaultTargetTriple,
+                                             C.getArgs(), ArchName));
     else
       TC = &C.getDefaultToolChain();
 
-    return BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
-                              MultipleArchs, LinkingOutput, CachedResults);
+    return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel,
+                              MultipleArchs, LinkingOutput, CachedResults,
+                              BuildForOffloadDevice);
   }
 
-  if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    // Initial processing of CudaDeviceAction carries host params.
-    // Call BuildJobsForAction() again, now with correct device parameters.
-    InputInfo II = BuildJobsForAction(
-        C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
-        CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
-        CachedResults);
-    // Currently II's Action is *CDA->begin().  Set it to CDA instead, so that
-    // one can retrieve II's GPU arch.
-    II.setAction(A);
-    return II;
-  }
 
   const ActionList *Inputs = &A->getInputs();
 
   const JobAction *JA = cast<JobAction>(A);
-  const CudaHostAction *CollapsedCHA = nullptr;
+  ActionList CollapsedOffloadActions;
+
   const Tool *T =
-      selectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs, CollapsedCHA);
+      selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA,
+                       Inputs, CollapsedOffloadActions);
   if (!T)
     return InputInfo();
 
-  // If we've collapsed action list that contained CudaHostAction we
-  // need to build jobs for device-side inputs it may have held.
-  if (CollapsedCHA) {
-    for (const Action *DA : CollapsedCHA->getDeviceActions()) {
-      CudaDeviceInputInfos.push_back(BuildJobsForAction(
-          C, DA, TC, "", AtTopLevel,
-          /*MultipleArchs*/ false, LinkingOutput, CachedResults));
-    }
-  }
+  // If we collapsed an action list that contained OffloadActions, we need to
+  // build jobs for the host/device-side inputs they may have held.
+  for (const auto *OA : CollapsedOffloadActions)
+    cast<OffloadAction>(OA)->doOnEachDependence(
+        /*IsHostDependence=*/BuildForOffloadDevice,
+        [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+              C, DepA, DepTC, DepBoundArch, AtTopLevel,
+              /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults,
+              /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+                  Action::OFK_None));
+        });
 
   // Only use pipes when there is exactly one input.
   InputInfoList InputInfos;
@@ -1910,9 +2244,9 @@
     // FIXME: Clean this up.
     bool SubJobAtTopLevel =
         AtTopLevel && (isa<DsymutilJobAction>(A) || isa<VerifyJobAction>(A));
-    InputInfos.push_back(BuildJobsForAction(C, Input, TC, BoundArch,
-                                            SubJobAtTopLevel, MultipleArchs,
-                                            LinkingOutput, CachedResults));
+    InputInfos.push_back(BuildJobsForAction(
+        C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput,
+        CachedResults, BuildForOffloadDevice));
   }
 
   // Always use the first input as the base input.
@@ -1923,9 +2257,23 @@
   if (JA->getType() == types::TY_dSYM)
     BaseInput = InputInfos[0].getFilename();
 
-  // Append outputs of cuda device jobs to the input list
-  if (CudaDeviceInputInfos.size())
-    InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+  // Append outputs of offload device jobs to the input list
+  if (!OffloadDependencesInputInfo.empty())
+    InputInfos.append(OffloadDependencesInputInfo.begin(),
+                      OffloadDependencesInputInfo.end());
+
+  // Set the effective triple of the toolchain for the duration of this job.
+  llvm::Triple EffectiveTriple;
+  const ToolChain &ToolTC = T->getToolChain();
+  const ArgList &Args = C.getArgsForToolChain(TC, BoundArch);
+  if (InputInfos.size() != 1) {
+    EffectiveTriple = llvm::Triple(ToolTC.ComputeEffectiveClangTriple(Args));
+  } else {
+    // Pass along the input type if it can be unambiguously determined.
+    EffectiveTriple = llvm::Triple(
+        ToolTC.ComputeEffectiveClangTriple(Args, InputInfos[0].getType()));
+  }
+  RegisterEffectiveTriple TripleRAII(ToolTC, EffectiveTriple);
 
   // Determine the place to write output to, if any.
   InputInfo Result;
@@ -1933,7 +2281,8 @@
     Result = InputInfo(A, BaseInput);
   else
     Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch,
-                                             AtTopLevel, MultipleArchs),
+                                             AtTopLevel, MultipleArchs,
+                                             TC->getTriple().normalize()),
                        BaseInput);
 
   if (CCCPrintBindings && !CCGenDiagnostics) {
@@ -1993,7 +2342,8 @@
 const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
                                        const char *BaseInput,
                                        const char *BoundArch, bool AtTopLevel,
-                                       bool MultipleArchs) const {
+                                       bool MultipleArchs,
+                                       StringRef NormalizedTriple) const {
   llvm::PrettyStackTraceString CrashInfo("Computing output path");
   // Output to a user requested destination?
   if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) {
@@ -2079,11 +2429,15 @@
           MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image);
     } else if (MultipleArchs && BoundArch) {
       SmallString<128> Output(getDefaultImageName());
+      Output += JA.getOffloadingFileNamePrefix(NormalizedTriple);
       Output += "-";
       Output.append(BoundArch);
       NamedOutput = C.getArgs().MakeArgString(Output.c_str());
-    } else
+    } else {
       NamedOutput = getDefaultImageName();
+    }
+  } else if (JA.getType() == types::TY_PCH && IsCLMode()) {
+    NamedOutput = C.getArgs().MakeArgString(GetClPchPath(C, BaseName).c_str());
   } else {
     const char *Suffix = types::getTypeTempSuffix(JA.getType(), IsCLMode());
     assert(Suffix && "All types used for output should have a suffix.");
@@ -2092,6 +2446,7 @@
     if (!types::appendSuffixForType(JA.getType()))
       End = BaseName.rfind('.');
     SmallString<128> Suffixed(BaseName.substr(0, End));
+    Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple);
     if (MultipleArchs && BoundArch) {
       Suffixed += "-";
       Suffixed.append(BoundArch);
@@ -2137,7 +2492,7 @@
   }
 
   // As an annoying special case, PCH generation doesn't strip the pathname.
-  if (JA.getType() == types::TY_PCH) {
+  if (JA.getType() == types::TY_PCH && !IsCLMode()) {
     llvm::sys::path::remove_filename(BasePath);
     if (BasePath.empty())
       BasePath = NamedOutput;
@@ -2249,12 +2604,34 @@
   return Path.str();
 }
 
+std::string Driver::GetClPchPath(Compilation &C, StringRef BaseName) const {
+  SmallString<128> Output;
+  if (Arg *FpArg = C.getArgs().getLastArg(options::OPT__SLASH_Fp)) {
+    // FIXME: If anybody needs it, implement this obscure rule:
+    // "If you specify a directory without a file name, the default file name
+    // is VCx0.pch., where x is the major version of Visual C++ in use."
+    Output = FpArg->getValue(); // The last /Fp argument wins.
+
+    // "If you do not specify an extension as part of the path name, an
+    // extension of .pch is assumed. "
+    if (!llvm::sys::path::has_extension(Output))
+      Output += ".pch";
+  } else {
+    Output = BaseName; // No /Fp given: derive the name from BaseName.
+    llvm::sys::path::replace_extension(Output, ".pch");
+  }
+  return Output.str();
+}
+
 const ToolChain &Driver::getToolChain(const ArgList &Args,
                                       const llvm::Triple &Target) const {
 
   ToolChain *&TC = ToolChains[Target.str()];
   if (!TC) {
     switch (Target.getOS()) {
+    case llvm::Triple::Haiku:
+      TC = new toolchains::Haiku(*this, Target, Args);
+      break;
     case llvm::Triple::CloudABI:
       TC = new toolchains::CloudABI(*this, Target, Args);
       break;
@@ -2284,6 +2661,7 @@
       TC = new toolchains::Minix(*this, Target, Args);
       break;
     case llvm::Triple::Linux:
+    case llvm::Triple::ELFIAMCU:
       if (Target.getArch() == llvm::Triple::hexagon)
         TC = new toolchains::HexagonToolChain(*this, Target, Args);
       else if ((Target.getVendor() == llvm::Triple::MipsTechnologies) &&
@@ -2339,6 +2717,9 @@
       case llvm::Triple::hexagon:
         TC = new toolchains::HexagonToolChain(*this, Target, Args);
         break;
+      case llvm::Triple::lanai:
+        TC = new toolchains::LanaiToolChain(*this, Target, Args);
+        break;
       case llvm::Triple::xcore:
         TC = new toolchains::XCoreToolChain(*this, Target, Args);
         break;
@@ -2363,7 +2744,8 @@
 
 bool Driver::ShouldUseClangCompiler(const JobAction &JA) const {
   // Say "no" if there is not exactly one input of a type clang understands.
-  if (JA.size() != 1 || !types::isAcceptedByClang((*JA.begin())->getType()))
+  if (JA.size() != 1 ||
+      !types::isAcceptedByClang((*JA.input_begin())->getType()))
     return false;
 
   // And say "no" if this is not a kind of action clang understands.
diff --git a/lib/Driver/Job.cpp b/lib/Driver/Job.cpp
index a16bc42..2826cc4 100644
--- a/lib/Driver/Job.cpp
+++ b/lib/Driver/Job.cpp
@@ -7,18 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Driver/Job.h"
 #include "InputInfo.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
-#include "clang/Driver/Job.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
@@ -311,6 +312,29 @@
   return SecondaryStatus;
 }
 
+ForceSuccessCommand::ForceSuccessCommand(const Action &Source_,
+                                         const Tool &Creator_,
+                                         const char *Executable_,
+                                         const ArgStringList &Arguments_,
+                                         ArrayRef<InputInfo> Inputs)
+    : Command(Source_, Creator_, Executable_, Arguments_, Inputs) {}
+
+void ForceSuccessCommand::Print(raw_ostream &OS, const char *Terminator,
+                            bool Quote, CrashReportInfo *CrashInfo) const {
+  Command::Print(OS, "", Quote, CrashInfo); // Terminator is emitted below.
+  OS << " || (exit 0)" << Terminator; // Suffix comes before the terminator.
+}
+
+int ForceSuccessCommand::Execute(const StringRef **Redirects,
+                                 std::string *ErrMsg,
+                                 bool *ExecutionFailed) const {
+  int Status = Command::Execute(Redirects, ErrMsg, ExecutionFailed);
+  (void)Status; // The real exit status is deliberately ignored.
+  if (ExecutionFailed)
+    *ExecutionFailed = false; // Overrides whatever Command::Execute set.
+  return 0; // Always report success to the caller.
+}
+
 void JobList::Print(raw_ostream &OS, const char *Terminator, bool Quote,
                     CrashReportInfo *CrashInfo) const {
   for (const auto &Job : *this)
diff --git a/lib/Driver/MSVCToolChain.cpp b/lib/Driver/MSVCToolChain.cpp
index 461dc29..f6d6544 100644
--- a/lib/Driver/MSVCToolChain.cpp
+++ b/lib/Driver/MSVCToolChain.cpp
@@ -19,8 +19,10 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
 #include <cstdio>
 
@@ -94,23 +96,34 @@
 #ifdef USE_WIN32
 static bool readFullStringValue(HKEY hkey, const char *valueName,
                                 std::string &value) {
-  // FIXME: We should be using the W versions of the registry functions, but
-  // doing so requires UTF8 / UTF16 conversions similar to how we handle command
-  // line arguments.  The UTF8 conversion functions are not exposed publicly
-  // from LLVM though, so in order to do this we will probably need to create
-  // a registry abstraction in LLVMSupport that is Windows only.
+  std::wstring WideValueName;
+  if (!llvm::ConvertUTF8toWide(valueName, WideValueName))
+    return false;
+
   DWORD result = 0;
   DWORD valueSize = 0;
   DWORD type = 0;
   // First just query for the required size.
-  result = RegQueryValueEx(hkey, valueName, NULL, &type, NULL, &valueSize);
-  if (result != ERROR_SUCCESS || type != REG_SZ)
+  result = RegQueryValueExW(hkey, WideValueName.c_str(), NULL, &type, NULL,
+                            &valueSize);
+  if (result != ERROR_SUCCESS || type != REG_SZ || !valueSize)
     return false;
   std::vector<BYTE> buffer(valueSize);
-  result = RegQueryValueEx(hkey, valueName, NULL, NULL, &buffer[0], &valueSize);
-  if (result == ERROR_SUCCESS)
-    value.assign(reinterpret_cast<const char *>(buffer.data()));
-  return result;
+  result = RegQueryValueExW(hkey, WideValueName.c_str(), NULL, NULL, &buffer[0],
+                            &valueSize);
+  if (result == ERROR_SUCCESS) {
+    std::wstring WideValue(reinterpret_cast<const wchar_t *>(buffer.data()),
+                           valueSize / sizeof(wchar_t));
+    if (valueSize && WideValue.back() == L'\0') {
+        WideValue.pop_back();
+    }
+    // The destination buffer must be empty as an invariant of the conversion
+    // function; but this function is sometimes called in a loop that passes in
+    // the same buffer, however. Simply clear it out so we can overwrite it.
+    value.clear();
+    return llvm::convertWideToUTF8(WideValue, value);
+  }
+  return false;
 }
 #endif
 
@@ -146,19 +159,20 @@
       nextKey++;
     size_t partialKeyLength = keyEnd - keyPath;
     char partialKey[256];
-    if (partialKeyLength > sizeof(partialKey))
-      partialKeyLength = sizeof(partialKey);
+    if (partialKeyLength >= sizeof(partialKey))
+      partialKeyLength = sizeof(partialKey) - 1;
     strncpy(partialKey, keyPath, partialKeyLength);
     partialKey[partialKeyLength] = '\0';
     HKEY hTopKey = NULL;
-    lResult = RegOpenKeyEx(hRootKey, partialKey, 0, KEY_READ | KEY_WOW64_32KEY,
-                           &hTopKey);
+    lResult = RegOpenKeyExA(hRootKey, partialKey, 0, KEY_READ | KEY_WOW64_32KEY,
+                            &hTopKey);
     if (lResult == ERROR_SUCCESS) {
       char keyName[256];
       double bestValue = 0.0;
       DWORD index, size = sizeof(keyName) - 1;
-      for (index = 0; RegEnumKeyEx(hTopKey, index, keyName, &size, NULL,
-          NULL, NULL, NULL) == ERROR_SUCCESS; index++) {
+      for (index = 0; RegEnumKeyExA(hTopKey, index, keyName, &size, NULL, NULL,
+                                    NULL, NULL) == ERROR_SUCCESS;
+           index++) {
         const char *sp = keyName;
         while (*sp && !isDigit(*sp))
           sp++;
@@ -177,11 +191,10 @@
           bestName = keyName;
           // Append rest of key.
           bestName.append(nextKey);
-          lResult = RegOpenKeyEx(hTopKey, bestName.c_str(), 0,
-                                 KEY_READ | KEY_WOW64_32KEY, &hKey);
+          lResult = RegOpenKeyExA(hTopKey, bestName.c_str(), 0,
+                                  KEY_READ | KEY_WOW64_32KEY, &hKey);
           if (lResult == ERROR_SUCCESS) {
-            lResult = readFullStringValue(hKey, valueName, value);
-            if (lResult == ERROR_SUCCESS) {
+            if (readFullStringValue(hKey, valueName, value)) {
               bestValue = dvalue;
               if (phValue)
                 *phValue = bestName;
@@ -196,10 +209,9 @@
     }
   } else {
     lResult =
-        RegOpenKeyEx(hRootKey, keyPath, 0, KEY_READ | KEY_WOW64_32KEY, &hKey);
+        RegOpenKeyExA(hRootKey, keyPath, 0, KEY_READ | KEY_WOW64_32KEY, &hKey);
     if (lResult == ERROR_SUCCESS) {
-      lResult = readFullStringValue(hKey, valueName, value);
-      if (lResult == ERROR_SUCCESS)
+      if (readFullStringValue(hKey, valueName, value))
         returnValue = true;
       if (phValue)
         phValue->clear();
@@ -407,7 +419,10 @@
 
         SmallString<128> FilePath(PathSegment);
         llvm::sys::path::append(FilePath, "cl.exe");
-        if (llvm::sys::fs::can_execute(FilePath.c_str()) &&
+        // Checking if cl.exe exists is a small optimization over calling
+        // can_execute, which really only checks for existence but will also do
+        // extra checks for cl.exe.exe.  These add up when walking a long path.
+        if (llvm::sys::fs::exists(FilePath.c_str()) &&
             !llvm::sys::fs::equivalent(FilePath.c_str(), clangProgramPath)) {
           // If we found it on the PATH, use it exactly as is with no
           // modifications.
@@ -457,12 +472,51 @@
   return true;
 }
 
+VersionTuple MSVCToolChain::getMSVCVersionFromExe() const {
+  VersionTuple Version; // Returned default-constructed if any step fails.
+#ifdef USE_WIN32
+  std::string BinPath;
+  if (!getVisualStudioBinariesFolder("", BinPath))
+    return Version;
+  SmallString<128> ClExe(BinPath);
+  llvm::sys::path::append(ClExe, "cl.exe");
+
+  std::wstring ClExeWide; // The W-suffixed version APIs take wide strings.
+  if (!llvm::ConvertUTF8toWide(ClExe.c_str(), ClExeWide))
+    return Version;
+
+  const DWORD VersionSize = ::GetFileVersionInfoSizeW(ClExeWide.c_str(),
+                                                      nullptr);
+  if (VersionSize == 0) // Size query failed.
+    return Version;
+
+  SmallVector<uint8_t, 4 * 1024> VersionBlock(VersionSize);
+  if (!::GetFileVersionInfoW(ClExeWide.c_str(), 0, VersionSize,
+                             VersionBlock.data()))
+    return Version;
+
+  VS_FIXEDFILEINFO *FileInfo = nullptr;
+  UINT FileInfoSize = 0; // Filled in by VerQueryValueW.
+  if (!::VerQueryValueW(VersionBlock.data(), L"\\",
+                        reinterpret_cast<LPVOID *>(&FileInfo), &FileInfoSize) ||
+      FileInfoSize < sizeof(*FileInfo))
+    return Version;
+
+  const unsigned Major = (FileInfo->dwFileVersionMS >> 16) & 0xFFFF;
+  const unsigned Minor = (FileInfo->dwFileVersionMS      ) & 0xFFFF;
+  const unsigned Micro = (FileInfo->dwFileVersionLS >> 16) & 0xFFFF;
+
+  Version = VersionTuple(Major, Minor, Micro); // Low word of LS is unused.
+#endif
+  return Version;
+}
+
 // Get Visual Studio installation directory.
 bool MSVCToolChain::getVisualStudioInstallDir(std::string &path) const {
   // First check the environment variables that vsvars32.bat sets.
-  const char *vcinstalldir = getenv("VCINSTALLDIR");
-  if (vcinstalldir) {
-    path = vcinstalldir;
+  if (llvm::Optional<std::string> VcInstallDir =
+          llvm::sys::Process::GetEnv("VCINSTALLDIR")) {
+    path = std::move(*VcInstallDir);
     path = path.substr(0, path.find("\\VC"));
     return true;
   }
@@ -488,26 +542,26 @@
   }
 
   // Try the environment.
-  const char *vs120comntools = getenv("VS120COMNTOOLS");
-  const char *vs100comntools = getenv("VS100COMNTOOLS");
-  const char *vs90comntools = getenv("VS90COMNTOOLS");
-  const char *vs80comntools = getenv("VS80COMNTOOLS");
+  std::string vcomntools;
+  if (llvm::Optional<std::string> vs120comntools =
+          llvm::sys::Process::GetEnv("VS120COMNTOOLS"))
+    vcomntools = std::move(*vs120comntools);
+  else if (llvm::Optional<std::string> vs100comntools =
+               llvm::sys::Process::GetEnv("VS100COMNTOOLS"))
+    vcomntools = std::move(*vs100comntools);
+  else if (llvm::Optional<std::string> vs90comntools =
+               llvm::sys::Process::GetEnv("VS90COMNTOOLS"))
+    vcomntools = std::move(*vs90comntools);
+  else if (llvm::Optional<std::string> vs80comntools =
+               llvm::sys::Process::GetEnv("VS80COMNTOOLS"))
+    vcomntools = std::move(*vs80comntools);
 
-  const char *vscomntools = nullptr;
-
-  // Find any version we can
-  if (vs120comntools)
-    vscomntools = vs120comntools;
-  else if (vs100comntools)
-    vscomntools = vs100comntools;
-  else if (vs90comntools)
-    vscomntools = vs90comntools;
-  else if (vs80comntools)
-    vscomntools = vs80comntools;
-
-  if (vscomntools && *vscomntools) {
-    const char *p = strstr(vscomntools, "\\Common7\\Tools");
-    path = p ? std::string(vscomntools, p) : vscomntools;
+  // Find any version we can.
+  if (!vcomntools.empty()) {
+    size_t p = vcomntools.find("\\Common7\\Tools");
+    if (p != std::string::npos)
+      vcomntools.resize(p);
+    path = std::move(vcomntools);
     return true;
   }
   return false;
@@ -532,13 +586,18 @@
                                   "include");
   }
 
+  // Add %INCLUDE%-like directories from the -imsvc flag.
+  for (const auto &Path : DriverArgs.getAllArgValues(options::OPT__SLASH_imsvc))
+    addSystemInclude(DriverArgs, CC1Args, Path);
+
   if (DriverArgs.hasArg(options::OPT_nostdlibinc))
     return;
 
   // Honor %INCLUDE%. It should know essential search paths with vcvarsall.bat.
-  if (const char *cl_include_dir = getenv("INCLUDE")) {
+  if (llvm::Optional<std::string> cl_include_dir =
+          llvm::sys::Process::GetEnv("INCLUDE")) {
     SmallVector<StringRef, 8> Dirs;
-    StringRef(cl_include_dir)
+    StringRef(*cl_include_dir)
         .split(Dirs, ";", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
     for (StringRef Dir : Dirs)
       addSystemInclude(DriverArgs, CC1Args, Dir);
@@ -590,6 +649,7 @@
     return;
   }
 
+#if defined(LLVM_ON_WIN32)
   // As a fallback, select default install paths.
   // FIXME: Don't guess drives and paths like this on Windows.
   const StringRef Paths[] = {
@@ -600,6 +660,7 @@
     "C:/Program Files/Microsoft Visual Studio 8/VC/PlatformSDK/Include"
   };
   addSystemIncludes(DriverArgs, CC1Args, Paths);
+#endif
 }
 
 void MSVCToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
@@ -614,7 +675,7 @@
       ToolChain::ComputeEffectiveClangTriple(Args, InputType);
   llvm::Triple Triple(TripleStr);
   VersionTuple MSVT =
-      tools::visualstudio::getMSVCVersion(/*D=*/nullptr, Triple, Args,
+      tools::visualstudio::getMSVCVersion(/*D=*/nullptr, *this, Triple, Args,
                                           /*IsWindowsMSVC=*/true);
   if (MSVT.empty())
     return TripleStr;
@@ -664,7 +725,8 @@
             DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin));
             DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "2");
           }
-          if (SupportsForcingFramePointer)
+          if (SupportsForcingFramePointer &&
+              !DAL.hasArgNoClaim(options::OPT_fno_omit_frame_pointer))
             DAL.AddFlagArg(A,
                            Opts.getOption(options::OPT_fomit_frame_pointer));
           if (OptChar == '1' || OptChar == '2')
@@ -674,8 +736,20 @@
       }
       break;
     case 'b':
-      if (I + 1 != E && isdigit(OptStr[I + 1]))
+      if (I + 1 != E && isdigit(OptStr[I + 1])) {
+        switch (OptStr[I + 1]) {
+        case '0':
+          DAL.AddFlagArg(A, Opts.getOption(options::OPT_fno_inline));
+          break;
+        case '1':
+          DAL.AddFlagArg(A, Opts.getOption(options::OPT_finline_hint_functions));
+          break;
+        case '2':
+          DAL.AddFlagArg(A, Opts.getOption(options::OPT_finline_functions));
+          break;
+        }
         ++I;
+      }
       break;
     case 'g':
       break;
@@ -706,6 +780,12 @@
         else
           DAL.AddFlagArg(
               A, Opts.getOption(options::OPT_fno_omit_frame_pointer));
+      } else {
+        // Don't warn about /Oy- in 64-bit builds (where
+        // SupportsForcingFramePointer is false).  The flag having no effect
+        // there is a compiler-internal optimization, and people shouldn't have
+        // to special-case their build files for 64-bit clang-cl.
+        A->claim();
       }
       break;
     }
@@ -753,7 +833,12 @@
       continue;
     StringRef OptStr = A->getValue();
     for (size_t I = 0, E = OptStr.size(); I != E; ++I) {
-      const char &OptChar = *(OptStr.data() + I);
+      char OptChar = OptStr[I];
+      char PrevChar = I > 0 ? OptStr[I - 1] : '0';
+      if (PrevChar == 'b') {
+        // OptChar does not expand; it's an argument to the previous char.
+        continue;
+      }
       if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || OptChar == 'd')
         ExpandChar = OptStr.data() + I;
     }
diff --git a/lib/Driver/Makefile b/lib/Driver/Makefile
deleted file mode 100644
index 454ab86..0000000
--- a/lib/Driver/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- clang/lib/Driver/Makefile ---------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangDriver
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Driver/Multilib.cpp b/lib/Driver/Multilib.cpp
index 34ad6a7..a88edf7 100644
--- a/lib/Driver/Multilib.cpp
+++ b/lib/Driver/Multilib.cpp
@@ -13,12 +13,10 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/YAMLParser.h"
diff --git a/lib/Driver/SanitizerArgs.cpp b/lib/Driver/SanitizerArgs.cpp
index cf8d39d..30cc3f4 100644
--- a/lib/Driver/SanitizerArgs.cpp
+++ b/lib/Driver/SanitizerArgs.cpp
@@ -39,6 +39,7 @@
   TrappingSupported =
       (Undefined & ~Vptr) | UnsignedIntegerOverflow | LocalBounds | CFI,
   TrappingDefault = CFI,
+  CFIClasses = CFIVCall | CFINVCall | CFIDerivedCast | CFIUnrelatedCast,
 };
 
 enum CoverageFeature {
@@ -49,6 +50,7 @@
   CoverageTraceBB = 1 << 4,
   CoverageTraceCmp = 1 << 5,
   Coverage8bitCounters = 1 << 6,
+  CoverageTracePC = 1 << 7,
 };
 
 /// Parse a -fsanitize= or -fno-sanitize= argument's values, diagnosing any
@@ -157,11 +159,10 @@
 }
 
 bool SanitizerArgs::needsUbsanRt() const {
-  return (Sanitizers.Mask & NeedsUbsanRt & ~TrapSanitizers.Mask) &&
-         !Sanitizers.has(Address) &&
-         !Sanitizers.has(Memory) &&
-         !Sanitizers.has(Thread) &&
-         !CfiCrossDso;
+  return ((Sanitizers.Mask & NeedsUbsanRt & ~TrapSanitizers.Mask) ||
+          CoverageFeatures) &&
+         !Sanitizers.has(Address) && !Sanitizers.has(Memory) &&
+         !Sanitizers.has(Thread) && !Sanitizers.has(DataFlow) && !CfiCrossDso;
 }
 
 bool SanitizerArgs::needsCfiRt() const {
@@ -267,6 +268,9 @@
     }
   }
 
+  // Enable toolchain specific default sanitizers if not explicitly disabled.
+  Kinds |= TC.getDefaultSanitizers() & ~AllRemove;
+
   // We disable the vptr sanitizer if it was enabled by group expansion but RTTI
   // is disabled.
   if ((Kinds & Vptr) &&
@@ -307,7 +311,12 @@
       std::make_pair(Leak, Memory), std::make_pair(KernelAddress, Address),
       std::make_pair(KernelAddress, Leak),
       std::make_pair(KernelAddress, Thread),
-      std::make_pair(KernelAddress, Memory)};
+      std::make_pair(KernelAddress, Memory),
+      std::make_pair(Efficiency, Address),
+      std::make_pair(Efficiency, Leak),
+      std::make_pair(Efficiency, Thread),
+      std::make_pair(Efficiency, Memory),
+      std::make_pair(Efficiency, KernelAddress)};
   for (auto G : IncompatibleGroups) {
     SanitizerMask Group = G.first;
     if (Kinds & Group) {
@@ -330,11 +339,13 @@
   for (const auto *Arg : Args) {
     const char *DeprecatedReplacement = nullptr;
     if (Arg->getOption().matches(options::OPT_fsanitize_recover)) {
-      DeprecatedReplacement = "-fsanitize-recover=undefined,integer";
+      DeprecatedReplacement =
+          "-fsanitize-recover=undefined,integer' or '-fsanitize-recover=all";
       RecoverableKinds |= expandSanitizerGroups(LegacyFsanitizeRecoverMask);
       Arg->claim();
     } else if (Arg->getOption().matches(options::OPT_fno_sanitize_recover)) {
-      DeprecatedReplacement = "-fno-sanitize-recover=undefined,integer";
+      DeprecatedReplacement = "-fno-sanitize-recover=undefined,integer' or "
+                              "'-fno-sanitize-recover=all";
       RecoverableKinds &= ~expandSanitizerGroups(LegacyFsanitizeRecoverMask);
       Arg->claim();
     } else if (Arg->getOption().matches(options::OPT_fsanitize_recover_EQ)) {
@@ -436,40 +447,54 @@
 
   // Parse -f(no-)?sanitize-coverage flags if coverage is supported by the
   // enabled sanitizers.
-  if (AllAddedKinds & SupportsCoverage) {
-    for (const auto *Arg : Args) {
-      if (Arg->getOption().matches(options::OPT_fsanitize_coverage)) {
-        Arg->claim();
-        int LegacySanitizeCoverage;
-        if (Arg->getNumValues() == 1 &&
-            !StringRef(Arg->getValue(0))
-                 .getAsInteger(0, LegacySanitizeCoverage) &&
-            LegacySanitizeCoverage >= 0 && LegacySanitizeCoverage <= 4) {
-          // TODO: Add deprecation notice for this form.
-          switch (LegacySanitizeCoverage) {
-          case 0:
-            CoverageFeatures = 0;
-            break;
-          case 1:
-            CoverageFeatures = CoverageFunc;
-            break;
-          case 2:
-            CoverageFeatures = CoverageBB;
-            break;
-          case 3:
-            CoverageFeatures = CoverageEdge;
-            break;
-          case 4:
-            CoverageFeatures = CoverageEdge | CoverageIndirCall;
-            break;
-          }
-          continue;
+  for (const auto *Arg : Args) {
+    if (Arg->getOption().matches(options::OPT_fsanitize_coverage)) {
+      int LegacySanitizeCoverage;
+      if (Arg->getNumValues() == 1 &&
+          !StringRef(Arg->getValue(0))
+               .getAsInteger(0, LegacySanitizeCoverage) &&
+          LegacySanitizeCoverage >= 0 && LegacySanitizeCoverage <= 4) {
+        switch (LegacySanitizeCoverage) {
+        case 0:
+          CoverageFeatures = 0;
+          Arg->claim();
+          break;
+        case 1:
+          D.Diag(diag::warn_drv_deprecated_arg) << Arg->getAsString(Args)
+                                                << "-fsanitize-coverage=func";
+          CoverageFeatures = CoverageFunc;
+          break;
+        case 2:
+          D.Diag(diag::warn_drv_deprecated_arg) << Arg->getAsString(Args)
+                                                << "-fsanitize-coverage=bb";
+          CoverageFeatures = CoverageBB;
+          break;
+        case 3:
+          D.Diag(diag::warn_drv_deprecated_arg) << Arg->getAsString(Args)
+                                                << "-fsanitize-coverage=edge";
+          CoverageFeatures = CoverageEdge;
+          break;
+        case 4:
+          D.Diag(diag::warn_drv_deprecated_arg)
+              << Arg->getAsString(Args)
+              << "-fsanitize-coverage=edge,indirect-calls";
+          CoverageFeatures = CoverageEdge | CoverageIndirCall;
+          break;
         }
-        CoverageFeatures |= parseCoverageFeatures(D, Arg);
-      } else if (Arg->getOption().matches(options::OPT_fno_sanitize_coverage)) {
-        Arg->claim();
-        CoverageFeatures &= ~parseCoverageFeatures(D, Arg);
+        continue;
       }
+      CoverageFeatures |= parseCoverageFeatures(D, Arg);
+
+      // Disable coverage and do not claim the flags if there is at least one
+      // non-supporting sanitizer.
+      if (!(AllAddedKinds & ~setGroupBits(SupportsCoverage))) {
+        Arg->claim();
+      } else {
+        CoverageFeatures = 0;
+      }
+    } else if (Arg->getOption().matches(options::OPT_fno_sanitize_coverage)) {
+      Arg->claim();
+      CoverageFeatures &= ~parseCoverageFeatures(D, Arg);
     }
   }
   // Choose at most one coverage type: function, bb, or edge.
@@ -498,6 +523,10 @@
     D.Diag(clang::diag::err_drv_argument_only_allowed_with)
         << "-fsanitize-coverage=8bit-counters"
         << "-fsanitize-coverage=(func|bb|edge)";
+  // trace-pc w/o func/bb/edge implies edge.
+  if ((CoverageFeatures & CoverageTracePC) &&
+      !(CoverageFeatures & CoverageTypes))
+    CoverageFeatures |= CoverageEdge;
 
   if (AllAddedKinds & Address) {
     AsanSharedRuntime =
@@ -529,6 +558,14 @@
     }
   }
 
+  AsanUseAfterScope =
+      Args.hasArg(options::OPT_fsanitize_address_use_after_scope);
+  if (AsanUseAfterScope && !(AllAddedKinds & Address)) {
+    D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+        << "-fsanitize-address-use-after-scope"
+        << "-fsanitize=address";
+  }
+
   // Parse -link-cxx-sanitizer flag.
   LinkCXXRuntimes =
       Args.hasArg(options::OPT_fsanitize_link_cxx_runtime) || D.CCCIsCXX();
@@ -568,6 +605,45 @@
 void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs,
                             types::ID InputType) const {
+  // Translate available CoverageFeatures to corresponding clang-cc1 flags.
+  // Do it even if Sanitizers.empty() since some forms of coverage don't require
+  // sanitizers.
+  std::pair<int, const char *> CoverageFlags[] = {
+    std::make_pair(CoverageFunc, "-fsanitize-coverage-type=1"),
+    std::make_pair(CoverageBB, "-fsanitize-coverage-type=2"),
+    std::make_pair(CoverageEdge, "-fsanitize-coverage-type=3"),
+    std::make_pair(CoverageIndirCall, "-fsanitize-coverage-indirect-calls"),
+    std::make_pair(CoverageTraceBB, "-fsanitize-coverage-trace-bb"),
+    std::make_pair(CoverageTraceCmp, "-fsanitize-coverage-trace-cmp"),
+    std::make_pair(Coverage8bitCounters, "-fsanitize-coverage-8bit-counters"),
+    std::make_pair(CoverageTracePC, "-fsanitize-coverage-trace-pc")};
+  for (auto F : CoverageFlags) {
+    if (CoverageFeatures & F.first)
+      CmdArgs.push_back(Args.MakeArgString(F.second));
+  }
+
+  if (TC.getTriple().isOSWindows() && needsUbsanRt()) {
+    // Instruct the code generator to embed linker directives in the object file
+    // that cause the required runtime libraries to be linked.
+    CmdArgs.push_back(Args.MakeArgString(
+        "--dependent-lib=" + TC.getCompilerRT(Args, "ubsan_standalone")));
+    if (types::isCXX(InputType))
+      CmdArgs.push_back(Args.MakeArgString(
+          "--dependent-lib=" + TC.getCompilerRT(Args, "ubsan_standalone_cxx")));
+  }
+  if (TC.getTriple().isOSWindows() && needsStatsRt()) {
+    CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
+                                         TC.getCompilerRT(Args, "stats_client")));
+
+    // The main executable must export the stats runtime.
+    // FIXME: Only exporting from the main executable (e.g. based on whether the
+    // translation unit defines main()) would save a little space, but having
+    // multiple copies of the runtime shouldn't hurt.
+    CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
+                                         TC.getCompilerRT(Args, "stats")));
+    addIncludeLinkerOption(TC, Args, CmdArgs, "__sanitizer_stats_register");
+  }
+
   if (Sanitizers.empty())
     return;
   CmdArgs.push_back(Args.MakeArgString("-fsanitize=" + toString(Sanitizers)));
@@ -607,20 +683,9 @@
   if (AsanFieldPadding)
     CmdArgs.push_back(Args.MakeArgString("-fsanitize-address-field-padding=" +
                                          llvm::utostr(AsanFieldPadding)));
-  // Translate available CoverageFeatures to corresponding clang-cc1 flags.
-  std::pair<int, const char *> CoverageFlags[] = {
-    std::make_pair(CoverageFunc, "-fsanitize-coverage-type=1"),
-    std::make_pair(CoverageBB, "-fsanitize-coverage-type=2"),
-    std::make_pair(CoverageEdge, "-fsanitize-coverage-type=3"),
-    std::make_pair(CoverageIndirCall, "-fsanitize-coverage-indirect-calls"),
-    std::make_pair(CoverageTraceBB, "-fsanitize-coverage-trace-bb"),
-    std::make_pair(CoverageTraceCmp, "-fsanitize-coverage-trace-cmp"),
-    std::make_pair(Coverage8bitCounters, "-fsanitize-coverage-8bit-counters")};
-  for (auto F : CoverageFlags) {
-    if (CoverageFeatures & F.first)
-      CmdArgs.push_back(Args.MakeArgString(F.second));
-  }
 
+  if (AsanUseAfterScope)
+    CmdArgs.push_back(Args.MakeArgString("-fsanitize-address-use-after-scope"));
 
   // MSan: Workaround for PR16386.
   // ASan: This is mainly to help LSan with cases such as
@@ -630,26 +695,14 @@
   if (Sanitizers.has(Memory) || Sanitizers.has(Address))
     CmdArgs.push_back(Args.MakeArgString("-fno-assume-sane-operator-new"));
 
-  if (TC.getTriple().isOSWindows() && needsUbsanRt()) {
-    // Instruct the code generator to embed linker directives in the object file
-    // that cause the required runtime libraries to be linked.
-    CmdArgs.push_back(Args.MakeArgString(
-        "--dependent-lib=" + TC.getCompilerRT(Args, "ubsan_standalone")));
-    if (types::isCXX(InputType))
-      CmdArgs.push_back(Args.MakeArgString(
-          "--dependent-lib=" + TC.getCompilerRT(Args, "ubsan_standalone_cxx")));
-  }
-  if (TC.getTriple().isOSWindows() && needsStatsRt()) {
-    CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
-                                         TC.getCompilerRT(Args, "stats_client")));
-
-    // The main executable must export the stats runtime.
-    // FIXME: Only exporting from the main executable (e.g. based on whether the
-    // translation unit defines main()) would save a little space, but having
-    // multiple copies of the runtime shouldn't hurt.
-    CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
-                                         TC.getCompilerRT(Args, "stats")));
-    addIncludeLinkerOption(TC, Args, CmdArgs, "__sanitizer_stats_register");
+  // Require -fvisibility= flag on non-Windows when compiling if vptr CFI is
+  // enabled.
+  if (Sanitizers.hasOneOf(CFIClasses) && !TC.getTriple().isOSWindows() &&
+      !Args.hasArg(options::OPT_fvisibility_EQ)) {
+    TC.getDriver().Diag(clang::diag::err_drv_argument_only_allowed_with)
+        << lastArgumentForMask(TC.getDriver(), Args,
+                               Sanitizers.Mask & CFIClasses)
+        << "-fvisibility=";
   }
 }
 
@@ -670,6 +723,10 @@
     if (A->getOption().matches(options::OPT_fsanitize_EQ) &&
         0 == strcmp("all", Value))
       Kind = 0;
+    // Similarly, don't accept -fsanitize=efficiency-all.
+    else if (A->getOption().matches(options::OPT_fsanitize_EQ) &&
+        0 == strcmp("efficiency-all", Value))
+      Kind = 0;
     else
       Kind = parseSanitizerValue(Value, /*AllowGroups=*/true);
 
@@ -696,6 +753,7 @@
         .Case("trace-bb", CoverageTraceBB)
         .Case("trace-cmp", CoverageTraceCmp)
         .Case("8bit-counters", Coverage8bitCounters)
+        .Case("trace-pc", CoverageTracePC)
         .Default(0);
     if (F == 0)
       D.Diag(clang::diag::err_drv_unsupported_option_argument)
diff --git a/lib/Driver/ToolChain.cpp b/lib/Driver/ToolChain.cpp
index 53bf63e..863bd0a 100644
--- a/lib/Driver/ToolChain.cpp
+++ b/lib/Driver/ToolChain.cpp
@@ -7,23 +7,24 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Driver/ToolChain.h"
 #include "Tools.h"
 #include "clang/Basic/ObjCRuntime.h"
+#include "clang/Config/config.h"
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/Option.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace clang::driver;
 using namespace clang::driver::tools;
@@ -67,7 +68,8 @@
 ToolChain::ToolChain(const Driver &D, const llvm::Triple &T,
                      const ArgList &Args)
     : D(D), Triple(T), Args(Args), CachedRTTIArg(GetRTTIArgument(Args)),
-      CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)) {
+      CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)),
+      EffectiveTriple() {
   if (Arg *A = Args.getLastArg(options::OPT_mthread_model))
     if (!isThreadModelSupported(A->getValue()))
       D.Diag(diag::err_drv_invalid_thread_model_for_target)
@@ -247,8 +249,7 @@
 
   case Action::InputClass:
   case Action::BindArchClass:
-  case Action::CudaDeviceClass:
-  case Action::CudaHostClass:
+  case Action::OffloadClass:
   case Action::LipoJobClass:
   case Action::DsymutilJobClass:
   case Action::VerifyDebugInfoJobClass:
@@ -487,8 +488,10 @@
       ArchName = "arm";
 
     // Assembly files should start in ARM mode, unless arch is M-profile.
+    // Windows is always thumb.
     if ((InputType != types::TY_PP_Asm && Args.hasFlag(options::OPT_mthumb,
-         options::OPT_mno_thumb, ThumbDefault)) || IsMProfile) {
+         options::OPT_mno_thumb, ThumbDefault)) || IsMProfile ||
+         getTriple().isOSWindows()) {
       if (IsBigEndian)
         ArchName = "thumbeb";
       else
@@ -522,36 +525,64 @@
   if (!needsProfileRT(Args)) return;
 
   CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
-  return;
 }
 
 ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
     const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
-    StringRef Value = A->getValue();
-    if (Value == "compiler-rt")
-      return ToolChain::RLT_CompilerRT;
-    if (Value == "libgcc")
-      return ToolChain::RLT_Libgcc;
-    getDriver().Diag(diag::err_drv_invalid_rtlib_name)
-      << A->getAsString(Args);
-  }
+  const Arg* A = Args.getLastArg(options::OPT_rtlib_EQ);
+  StringRef LibName = A ? A->getValue() : CLANG_DEFAULT_RTLIB;
+
+  // "platform" is only used in tests to override CLANG_DEFAULT_RTLIB
+  if (LibName == "compiler-rt")
+    return ToolChain::RLT_CompilerRT;
+  else if (LibName == "libgcc")
+    return ToolChain::RLT_Libgcc;
+  else if (LibName == "platform")
+    return GetDefaultRuntimeLibType();
+
+  if (A)
+    getDriver().Diag(diag::err_drv_invalid_rtlib_name) << A->getAsString(Args);
 
   return GetDefaultRuntimeLibType();
 }
 
+static bool ParseCXXStdlibType(const StringRef& Name,
+                               ToolChain::CXXStdlibType& Type) {
+  if (Name == "libc++")
+    Type = ToolChain::CST_Libcxx;
+  else if (Name == "libstdc++")
+    Type = ToolChain::CST_Libstdcxx;
+  else
+    return false;
+
+  return true;
+}
+
 ToolChain::CXXStdlibType ToolChain::GetCXXStdlibType(const ArgList &Args) const{
-  if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) {
+  ToolChain::CXXStdlibType Type;
+  bool HasValidType = false;
+  bool ForcePlatformDefault = false;
+
+  const Arg *A = Args.getLastArg(options::OPT_stdlib_EQ);
+  if (A) {
     StringRef Value = A->getValue();
-    if (Value == "libc++")
-      return ToolChain::CST_Libcxx;
-    if (Value == "libstdc++")
-      return ToolChain::CST_Libstdcxx;
-    getDriver().Diag(diag::err_drv_invalid_stdlib_name)
-      << A->getAsString(Args);
+    HasValidType = ParseCXXStdlibType(Value, Type);
+
+    // Only used in tests to override CLANG_DEFAULT_CXX_STDLIB!
+    if (Value == "platform")
+      ForcePlatformDefault = true;
+    else if (!HasValidType)
+      getDriver().Diag(diag::err_drv_invalid_stdlib_name)
+        << A->getAsString(Args);
   }
 
-  return ToolChain::CST_Libstdcxx;
+  // If no argument was provided or its value was invalid, look for the
+  // default unless forced or configured to take the platform default.
+  if (!HasValidType && (ForcePlatformDefault ||
+      !ParseCXXStdlibType(CLANG_DEFAULT_CXX_STDLIB, Type)))
+    Type = GetDefaultCXXStdlibType();
+
+  return Type;
 }
 
 /// \brief Utility function to add a system include directory to CC1 arguments.
@@ -666,10 +697,15 @@
   SanitizerMask Res = (Undefined & ~Vptr & ~Function) | (CFI & ~CFIICall) |
                       CFICastStrict | UnsignedIntegerOverflow | LocalBounds;
   if (getTriple().getArch() == llvm::Triple::x86 ||
-      getTriple().getArch() == llvm::Triple::x86_64)
+      getTriple().getArch() == llvm::Triple::x86_64 ||
+      getTriple().getArch() == llvm::Triple::wasm32 ||
+      getTriple().getArch() == llvm::Triple::wasm64)
     Res |= CFIICall;
   return Res;
 }
 
 void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                    ArgStringList &CC1Args) const {}
+
+void ToolChain::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
+                                    ArgStringList &CC1Args) const {}
diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp
index 3ae825f..4d438c2 100644
--- a/lib/Driver/ToolChains.cpp
+++ b/lib/Driver/ToolChains.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ToolChains.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/ObjCRuntime.h"
 #include "clang/Basic/Version.h"
 #include "clang/Basic/VirtualFileSystem.h"
@@ -65,6 +66,16 @@
 
 bool MachO::HasNativeLLVMSupport() const { return true; }
 
+ToolChain::CXXStdlibType Darwin::GetDefaultCXXStdlibType() const {
+  // Default to use libc++ on OS X 10.9+ and iOS 7+.
+  if ((isTargetMacOS() && !isMacosxVersionLT(10, 9)) ||
+       (isTargetIOSBased() && !isIPhoneOSVersionLT(7, 0)) ||
+       isTargetWatchOSBased())
+    return ToolChain::CST_Libcxx;
+
+  return ToolChain::CST_Libstdcxx;
+}
+
 /// Darwin provides an ARC runtime starting in MacOS X 10.7 and iOS 5.0.
 ObjCRuntime Darwin::getDefaultObjCRuntime(bool isNonFragile) const {
   if (isTargetWatchOSBased())
@@ -165,13 +176,6 @@
 
 MachO::~MachO() {}
 
-std::string MachO::ComputeEffectiveClangTriple(const ArgList &Args,
-                                               types::ID InputType) const {
-  llvm::Triple Triple(ComputeLLVMTriple(Args, InputType));
-
-  return Triple.getTriple();
-}
-
 std::string Darwin::ComputeEffectiveClangTriple(const ArgList &Args,
                                                 types::ID InputType) const {
   llvm::Triple Triple(ComputeLLVMTriple(Args, InputType));
@@ -349,77 +353,62 @@
   return "";
 }
 
+StringRef Darwin::getOSLibraryNameSuffix() const {
+  switch(TargetPlatform) {
+  case DarwinPlatformKind::MacOS:
+    return "osx";
+  case DarwinPlatformKind::IPhoneOS:
+    return "ios";
+  case DarwinPlatformKind::IPhoneOSSimulator:
+    return "iossim";
+  case DarwinPlatformKind::TvOS:
+    return "tvos";
+  case DarwinPlatformKind::TvOSSimulator:
+    return "tvossim";
+  case DarwinPlatformKind::WatchOS:
+    return "watchos";
+  case DarwinPlatformKind::WatchOSSimulator:
+    return "watchossim";
+  }
+  llvm_unreachable("Unsupported platform");
+}
+
 void Darwin::addProfileRTLibs(const ArgList &Args,
                               ArgStringList &CmdArgs) const {
   if (!needsProfileRT(Args)) return;
 
-  // TODO: Clean this up once autoconf is gone
-  SmallString<128> P(getDriver().ResourceDir);
-  llvm::sys::path::append(P, "lib", "darwin");
-  const char *Library = "libclang_rt.profile_osx.a";
-
-  // Select the appropriate runtime library for the target.
-  if (isTargetWatchOS()) {
-    Library = "libclang_rt.profile_watchos.a";
-  } else if (isTargetWatchOSSimulator()) {
-    llvm::sys::path::append(P, "libclang_rt.profile_watchossim.a");
-    Library = getVFS().exists(P) ? "libclang_rt.profile_watchossim.a"
-                                 : "libclang_rt.profile_watchos.a";
-  } else if (isTargetTvOS()) {
-    Library = "libclang_rt.profile_tvos.a";
-  } else if (isTargetTvOSSimulator()) {
-    llvm::sys::path::append(P, "libclang_rt.profile_tvossim.a");
-    Library = getVFS().exists(P) ? "libclang_rt.profile_tvossim.a"
-                                 : "libclang_rt.profile_tvos.a";
-  } else if (isTargetIPhoneOS()) {
-    Library = "libclang_rt.profile_ios.a";
-  } else if (isTargetIOSSimulator()) {
-    llvm::sys::path::append(P, "libclang_rt.profile_iossim.a");
-    Library = getVFS().exists(P) ? "libclang_rt.profile_iossim.a"
-                                 : "libclang_rt.profile_ios.a";
-  } else {
-    assert(isTargetMacOS() && "unexpected non MacOS platform");
-  }
-  AddLinkRuntimeLib(Args, CmdArgs, Library,
+  AddLinkRuntimeLib(Args, CmdArgs, (Twine("libclang_rt.profile_") +
+       getOSLibraryNameSuffix() + ".a").str(),
                     /*AlwaysLink*/ true);
-  return;
 }
 
 void DarwinClang::AddLinkSanitizerLibArgs(const ArgList &Args,
                                           ArgStringList &CmdArgs,
                                           StringRef Sanitizer) const {
-  if (!Args.hasArg(options::OPT_dynamiclib) &&
-      !Args.hasArg(options::OPT_bundle)) {
-    // Sanitizer runtime libraries requires C++.
-    AddCXXStdlibLibArgs(Args, CmdArgs);
-  }
-  // ASan is not supported on watchOS.
-  assert(isTargetMacOS() || isTargetIOSSimulator());
-  StringRef OS = isTargetMacOS() ? "osx" : "iossim";
   AddLinkRuntimeLib(
       Args, CmdArgs,
-      (Twine("libclang_rt.") + Sanitizer + "_" + OS + "_dynamic.dylib").str(),
+      (Twine("libclang_rt.") + Sanitizer + "_" +
+       getOSLibraryNameSuffix() + "_dynamic.dylib").str(),
       /*AlwaysLink*/ true, /*IsEmbedded*/ false,
       /*AddRPath*/ true);
+}
 
-  if (GetCXXStdlibType(Args) == ToolChain::CST_Libcxx) {
-    // Add explicit dependcy on -lc++abi, as -lc++ doesn't re-export
-    // all RTTI-related symbols that UBSan uses.
-    CmdArgs.push_back("-lc++abi");
+ToolChain::RuntimeLibType DarwinClang::GetRuntimeLibType(
+    const ArgList &Args) const {
+  if (Arg* A = Args.getLastArg(options::OPT_rtlib_EQ)) {
+    StringRef Value = A->getValue();
+    if (Value != "compiler-rt")
+      getDriver().Diag(diag::err_drv_unsupported_rtlib_for_platform)
+          << Value << "darwin";
   }
+
+  return ToolChain::RLT_CompilerRT;
 }
 
 void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args,
                                         ArgStringList &CmdArgs) const {
-  // Darwin only supports the compiler-rt based runtime libraries.
-  switch (GetRuntimeLibType(Args)) {
-  case ToolChain::RLT_CompilerRT:
-    break;
-  default:
-    getDriver().Diag(diag::err_drv_unsupported_rtlib_for_platform)
-        << Args.getLastArg(options::OPT_rtlib_EQ)->getValue() << "darwin";
-    return;
-  }
+  // Call once to ensure diagnostic is printed if wrong value was specified
+  GetRuntimeLibType(Args);
 
   // Darwin doesn't support real static executables, don't link any runtime
   // libraries with -static.
@@ -450,6 +439,8 @@
                       /*AlwaysLink=*/true);
     AddLinkSanitizerLibArgs(Args, CmdArgs, "stats");
   }
+  if (Sanitize.needsEsanRt())
+    AddLinkSanitizerLibArgs(Args, CmdArgs, "esan");
 
   // Otherwise link libSystem, then the dynamic runtime library, and finally any
   // target specific static runtime library.
@@ -780,7 +771,6 @@
 
 void DarwinClang::AddCCKextLibArgs(const ArgList &Args,
                                    ArgStringList &CmdArgs) const {
-
   // For Darwin platforms, use the compiler-rt-based support library
   // instead of the gcc-provided one (which is also incidentally
   // only present in the gcc lib dir, which makes it hard to find).
@@ -1070,11 +1060,8 @@
     }
   }
 
-  // Default to use libc++ on OS X 10.9+ and iOS 7+.
-  if (((isTargetMacOS() && !isMacosxVersionLT(10, 9)) ||
-       (isTargetIOSBased() && !isIPhoneOSVersionLT(7, 0)) ||
-       isTargetWatchOSBased()) &&
-      !Args.getLastArg(options::OPT_stdlib_EQ))
+  if (!Args.getLastArg(options::OPT_stdlib_EQ) &&
+      GetCXXStdlibType(Args) == ToolChain::CST_Libcxx)
     DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_stdlib_EQ),
                       "libc++");
 
@@ -1117,6 +1104,13 @@
   return !Triple.isWatchABI();
 }
 
+bool Darwin::SupportsEmbeddedBitcode() const {
+  assert(TargetInitialized && "Target not initialized!");
+  if (isTargetIPhoneOS() && isIPhoneOSVersionLT(6, 0))
+    return false;
+  return true;
+}
+
 bool MachO::isPICDefault() const { return true; }
 
 bool MachO::isPIEDefault() const { return false; }
@@ -1260,8 +1254,7 @@
 SanitizerMask Darwin::getSupportedSanitizers() const {
   const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
   SanitizerMask Res = ToolChain::getSupportedSanitizers();
-  if (isTargetMacOS() || isTargetIOSSimulator())
-    Res |= SanitizerKind::Address;
+  Res |= SanitizerKind::Address;
   if (isTargetMacOS()) {
     if (!isMacosxVersionLT(10, 9))
       Res |= SanitizerKind::Vptr;
@@ -1292,6 +1285,8 @@
   if (First.first.getAsInteger(10, GoodVersion.Major) || GoodVersion.Major < 0)
     return BadVersion;
   GoodVersion.MajorStr = First.first.str();
+  if (First.second.empty())
+    return GoodVersion;
   if (Second.first.getAsInteger(10, GoodVersion.Minor) || GoodVersion.Minor < 0)
     return BadVersion;
   GoodVersion.MinorStr = Second.first.str();
@@ -1299,6 +1294,7 @@
   // First look for a number prefix and parse that if present. Otherwise just
   // stash the entire patch string in the suffix, and leave the number
   // unspecified. This covers versions strings such as:
+  //   5        (handled above)
   //   4.4
   //   4.4.0
   //   4.4.x
@@ -1404,9 +1400,17 @@
     // Then look for gcc installed alongside clang.
     Prefixes.push_back(D.InstalledDir + "/..");
 
-    // And finally in /usr.
-    if (D.SysRoot.empty())
+    // Then look for distribution supplied gcc installations.
+    if (D.SysRoot.empty()) {
+      // Look for RHEL devtoolsets.
+      Prefixes.push_back("/opt/rh/devtoolset-4/root/usr");
+      Prefixes.push_back("/opt/rh/devtoolset-3/root/usr");
+      Prefixes.push_back("/opt/rh/devtoolset-2/root/usr");
+      Prefixes.push_back("/opt/rh/devtoolset-1.1/root/usr");
+      Prefixes.push_back("/opt/rh/devtoolset-1.0/root/usr");
+      // And finally in /usr.
       Prefixes.push_back("/usr");
+    }
   }
 
   // Loop over the various components which exist and select the best GCC
@@ -1507,8 +1511,8 @@
                                             "mips-mti-linux-gnu",
                                             "mips-img-linux-gnu"};
   static const char *const MIPSELLibDirs[] = {"/lib"};
-  static const char *const MIPSELTriples[] = {
-      "mipsel-linux-gnu", "mipsel-linux-android", "mips-img-linux-gnu"};
+  static const char *const MIPSELTriples[] = {"mipsel-linux-gnu",
+                                              "mips-img-linux-gnu"};
 
   static const char *const MIPS64LibDirs[] = {"/lib64", "/lib"};
   static const char *const MIPS64Triples[] = {
@@ -1517,7 +1521,15 @@
   static const char *const MIPS64ELLibDirs[] = {"/lib64", "/lib"};
   static const char *const MIPS64ELTriples[] = {
       "mips64el-linux-gnu", "mips-mti-linux-gnu", "mips-img-linux-gnu",
-      "mips64el-linux-android", "mips64el-linux-gnuabi64"};
+      "mips64el-linux-gnuabi64"};
+
+  static const char *const MIPSELAndroidLibDirs[] = {"/lib", "/libr2",
+                                                     "/libr6"};
+  static const char *const MIPSELAndroidTriples[] = {"mipsel-linux-android"};
+  static const char *const MIPS64ELAndroidLibDirs[] = {"/lib64", "/lib",
+                                                       "/libr2", "/libr6"};
+  static const char *const MIPS64ELAndroidTriples[] = {
+      "mips64el-linux-android"};
 
   static const char *const PPCLibDirs[] = {"/lib32", "/lib"};
   static const char *const PPCTriples[] = {
@@ -1604,9 +1616,13 @@
     break;
   case llvm::Triple::x86:
     LibDirs.append(begin(X86LibDirs), end(X86LibDirs));
-    TripleAliases.append(begin(X86Triples), end(X86Triples));
-    BiarchLibDirs.append(begin(X86_64LibDirs), end(X86_64LibDirs));
-    BiarchTripleAliases.append(begin(X86_64Triples), end(X86_64Triples));
+    // MCU toolchain is 32 bit only and its triple alias is TargetTriple
+    // itself, which will be appended below.
+    if (!TargetTriple.isOSIAMCU()) {
+      TripleAliases.append(begin(X86Triples), end(X86Triples));
+      BiarchLibDirs.append(begin(X86_64LibDirs), end(X86_64LibDirs));
+      BiarchTripleAliases.append(begin(X86_64Triples), end(X86_64Triples));
+    }
     break;
   case llvm::Triple::mips:
     LibDirs.append(begin(MIPSLibDirs), end(MIPSLibDirs));
@@ -1615,11 +1631,22 @@
     BiarchTripleAliases.append(begin(MIPS64Triples), end(MIPS64Triples));
     break;
   case llvm::Triple::mipsel:
-    LibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs));
-    TripleAliases.append(begin(MIPSELTriples), end(MIPSELTriples));
-    TripleAliases.append(begin(MIPSTriples), end(MIPSTriples));
-    BiarchLibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs));
-    BiarchTripleAliases.append(begin(MIPS64ELTriples), end(MIPS64ELTriples));
+    if (TargetTriple.isAndroid()) {
+      LibDirs.append(begin(MIPSELAndroidLibDirs), end(MIPSELAndroidLibDirs));
+      TripleAliases.append(begin(MIPSELAndroidTriples),
+                           end(MIPSELAndroidTriples));
+      BiarchLibDirs.append(begin(MIPS64ELAndroidLibDirs),
+                           end(MIPS64ELAndroidLibDirs));
+      BiarchTripleAliases.append(begin(MIPS64ELAndroidTriples),
+                                 end(MIPS64ELAndroidTriples));
+
+    } else {
+      LibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs));
+      TripleAliases.append(begin(MIPSELTriples), end(MIPSELTriples));
+      TripleAliases.append(begin(MIPSTriples), end(MIPSTriples));
+      BiarchLibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs));
+      BiarchTripleAliases.append(begin(MIPS64ELTriples), end(MIPS64ELTriples));
+    }
     break;
   case llvm::Triple::mips64:
     LibDirs.append(begin(MIPS64LibDirs), end(MIPS64LibDirs));
@@ -1628,11 +1655,23 @@
     BiarchTripleAliases.append(begin(MIPSTriples), end(MIPSTriples));
     break;
   case llvm::Triple::mips64el:
-    LibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs));
-    TripleAliases.append(begin(MIPS64ELTriples), end(MIPS64ELTriples));
-    BiarchLibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs));
-    BiarchTripleAliases.append(begin(MIPSELTriples), end(MIPSELTriples));
-    BiarchTripleAliases.append(begin(MIPSTriples), end(MIPSTriples));
+    if (TargetTriple.isAndroid()) {
+      LibDirs.append(begin(MIPS64ELAndroidLibDirs),
+                     end(MIPS64ELAndroidLibDirs));
+      TripleAliases.append(begin(MIPS64ELAndroidTriples),
+                           end(MIPS64ELAndroidTriples));
+      BiarchLibDirs.append(begin(MIPSELAndroidLibDirs),
+                           end(MIPSELAndroidLibDirs));
+      BiarchTripleAliases.append(begin(MIPSELAndroidTriples),
+                                 end(MIPSELAndroidTriples));
+
+    } else {
+      LibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs));
+      TripleAliases.append(begin(MIPS64ELTriples), end(MIPS64ELTriples));
+      BiarchLibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs));
+      BiarchTripleAliases.append(begin(MIPSELTriples), end(MIPSELTriples));
+      BiarchTripleAliases.append(begin(MIPSTriples), end(MIPSTriples));
+    }
     break;
   case llvm::Triple::ppc:
     LibDirs.append(begin(PPCLibDirs), end(PPCLibDirs));
@@ -1682,9 +1721,33 @@
     BiarchTripleAliases.push_back(BiarchTriple.str());
 }
 
+// Parses the contents of version.txt in a CUDA installation.  It should
+// contain one line of the form e.g. "CUDA Version 7.5.2".
+static CudaVersion ParseCudaVersionFile(llvm::StringRef V) {
+  if (!V.startswith("CUDA Version "))
+    return CudaVersion::UNKNOWN;
+  V = V.substr(strlen("CUDA Version "));
+  int Major = -1, Minor = -1;
+  auto First = V.split('.');
+  auto Second = First.second.split('.');
+  if (First.first.getAsInteger(10, Major) ||
+      Second.first.getAsInteger(10, Minor))
+    return CudaVersion::UNKNOWN;
+
+  if (Major == 7 && Minor == 0) {
+    // This doesn't appear to ever happen -- version.txt doesn't exist in the
+    // CUDA 7 installs I've seen.  But no harm in checking.
+    return CudaVersion::CUDA_70;
+  }
+  if (Major == 7 && Minor == 5)
+    return CudaVersion::CUDA_75;
+  if (Major == 8 && Minor == 0)
+    return CudaVersion::CUDA_80;
+  return CudaVersion::UNKNOWN;
+}
+
 // \brief -- try common CUDA installation paths looking for files we need for
 // CUDA compilation.
-
 void Generic_GCC::CudaInstallationDetector::init(
     const llvm::Triple &TargetTriple, const llvm::opt::ArgList &Args) {
   SmallVector<std::string, 4> CudaPathCandidates;
@@ -1694,6 +1757,8 @@
         Args.getLastArgValue(options::OPT_cuda_path_EQ));
   else {
     CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda");
+    // FIXME: Uncomment this once we can compile the cuda 8 headers.
+    // CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0");
     CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.5");
     CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.0");
   }
@@ -1702,20 +1767,19 @@
     if (CudaPath.empty() || !D.getVFS().exists(CudaPath))
       continue;
 
-    CudaInstallPath = CudaPath;
-    CudaBinPath = CudaPath + "/bin";
-    CudaIncludePath = CudaInstallPath + "/include";
-    CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
-    CudaLibPath =
-        CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
+    InstallPath = CudaPath;
+    BinPath = CudaPath + "/bin";
+    IncludePath = InstallPath + "/include";
+    LibDevicePath = InstallPath + "/nvvm/libdevice";
+    LibPath = InstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
 
-    if (!(D.getVFS().exists(CudaIncludePath) &&
-          D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
-          D.getVFS().exists(CudaLibDevicePath)))
+    auto &FS = D.getVFS();
+    if (!(FS.exists(IncludePath) && FS.exists(BinPath) && FS.exists(LibPath) &&
+          FS.exists(LibDevicePath)))
       continue;
 
     std::error_code EC;
-    for (llvm::sys::fs::directory_iterator LI(CudaLibDevicePath, EC), LE;
+    for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE;
          !EC && LI != LE; LI = LI.increment(EC)) {
       StringRef FilePath = LI->path();
       StringRef FileName = llvm::sys::path::filename(FilePath);
@@ -1725,41 +1789,84 @@
         continue;
       StringRef GpuArch = FileName.slice(
           LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
-      CudaLibDeviceMap[GpuArch] = FilePath.str();
+      LibDeviceMap[GpuArch] = FilePath.str();
       // Insert map entries for specifc devices with this compute capability.
+      // NVCC's choice of libdevice library version is rather peculiar:
+      // http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection
+      // TODO: this will need to be updated once CUDA-8 is released.
       if (GpuArch == "compute_20") {
-        CudaLibDeviceMap["sm_20"] = FilePath;
-        CudaLibDeviceMap["sm_21"] = FilePath;
+        LibDeviceMap["sm_20"] = FilePath;
+        LibDeviceMap["sm_21"] = FilePath;
+        LibDeviceMap["sm_32"] = FilePath;
       } else if (GpuArch == "compute_30") {
-        CudaLibDeviceMap["sm_30"] = FilePath;
-        CudaLibDeviceMap["sm_32"] = FilePath;
+        LibDeviceMap["sm_30"] = FilePath;
+        // compute_30 is the fallback libdevice variant for sm_30+,
+        // unless CUDA specifies a different version for a specific GPU
+        // arch.
+        LibDeviceMap["sm_50"] = FilePath;
+        LibDeviceMap["sm_52"] = FilePath;
+        LibDeviceMap["sm_53"] = FilePath;
+        // sm_6? are currently all aliases for sm_53 in LLVM and
+        // should use compute_30.
+        LibDeviceMap["sm_60"] = FilePath;
+        LibDeviceMap["sm_61"] = FilePath;
+        LibDeviceMap["sm_62"] = FilePath;
       } else if (GpuArch == "compute_35") {
-        CudaLibDeviceMap["sm_35"] = FilePath;
-        CudaLibDeviceMap["sm_37"] = FilePath;
+        LibDeviceMap["sm_35"] = FilePath;
+        LibDeviceMap["sm_37"] = FilePath;
+      } else if (GpuArch == "compute_50") {
+        // NVCC does not use compute_50 libdevice at all at the moment.
+        // The version that's shipped with CUDA-7.5 is a copy of compute_30.
       }
     }
 
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile =
+        FS.getBufferForFile(InstallPath + "/version.txt");
+    if (!VersionFile) {
+      // CUDA 7.0 doesn't have a version.txt, so guess that's our version if
+      // version.txt isn't present.
+      Version = CudaVersion::CUDA_70;
+    } else {
+      Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
+    }
+
     IsValid = true;
     break;
   }
 }
 
+void Generic_GCC::CudaInstallationDetector::CheckCudaVersionSupportsArch(
+    CudaArch Arch) const {
+  if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
+      ArchsWithVersionTooLowErrors.count(Arch) > 0)
+    return;
+
+  auto RequiredVersion = MinVersionForCudaArch(Arch);
+  if (Version < RequiredVersion) {
+    ArchsWithVersionTooLowErrors.insert(Arch);
+    D.Diag(diag::err_drv_cuda_version_too_low)
+        << InstallPath << CudaArchToString(Arch) << CudaVersionToString(Version)
+        << CudaVersionToString(RequiredVersion);
+  }
+}
+
 void Generic_GCC::CudaInstallationDetector::print(raw_ostream &OS) const {
   if (isValid())
-    OS << "Found CUDA installation: " << CudaInstallPath << "\n";
+    OS << "Found CUDA installation: " << InstallPath << ", version "
+       << CudaVersionToString(Version) << "\n";
 }
 
 namespace {
 // Filter to remove Multilibs that don't exist as a suffix to Path
 class FilterNonExistent {
-  StringRef Base;
+  StringRef Base, File;
   vfs::FileSystem &VFS;
 
 public:
-  FilterNonExistent(StringRef Base, vfs::FileSystem &VFS)
-      : Base(Base), VFS(VFS) {}
+  FilterNonExistent(StringRef Base, StringRef File, vfs::FileSystem &VFS)
+      : Base(Base), File(File), VFS(VFS) {}
   bool operator()(const Multilib &M) {
-    return !VFS.exists(Base + M.gccSuffix() + "/crtbegin.o");
+    return !VFS.exists(Base + M.gccSuffix() + File);
   }
 };
 } // end anonymous namespace
@@ -1772,6 +1879,10 @@
     Flags.push_back(std::string("-") + Flag);
 }
 
+static bool isArmOrThumbArch(llvm::Triple::ArchType Arch) {
+  return Arch == llvm::Triple::arm || Arch == llvm::Triple::thumb;
+}
+
 static bool isMipsArch(llvm::Triple::ArchType Arch) {
   return Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel ||
          Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el;
@@ -1817,138 +1928,9 @@
   return Multilib(commonSuffix, commonSuffix, commonSuffix);
 }
 
-static bool findMIPSMultilibs(const Driver &D, const llvm::Triple &TargetTriple,
-                              StringRef Path, const ArgList &Args,
-                              DetectedMultilibs &Result) {
-  // Some MIPS toolchains put libraries and object files compiled
-  // using different options in to the sub-directoris which names
-  // reflects the flags used for compilation. For example sysroot
-  // directory might looks like the following examples:
-  //
-  // /usr
-  //   /lib      <= crt*.o files compiled with '-mips32'
-  // /mips16
-  //   /usr
-  //     /lib    <= crt*.o files compiled with '-mips16'
-  //   /el
-  //     /usr
-  //       /lib  <= crt*.o files compiled with '-mips16 -EL'
-  //
-  // or
-  //
-  // /usr
-  //   /lib      <= crt*.o files compiled with '-mips32r2'
-  // /mips16
-  //   /usr
-  //     /lib    <= crt*.o files compiled with '-mips32r2 -mips16'
-  // /mips32
-  //     /usr
-  //       /lib  <= crt*.o files compiled with '-mips32'
-
-  FilterNonExistent NonExistent(Path, D.getVFS());
-
-  // Check for FSF toolchain multilibs
-  MultilibSet FSFMipsMultilibs;
-  {
-    auto MArchMips32 = makeMultilib("/mips32")
-                           .flag("+m32")
-                           .flag("-m64")
-                           .flag("-mmicromips")
-                           .flag("+march=mips32");
-
-    auto MArchMicroMips = makeMultilib("/micromips")
-                              .flag("+m32")
-                              .flag("-m64")
-                              .flag("+mmicromips");
-
-    auto MArchMips64r2 = makeMultilib("/mips64r2")
-                             .flag("-m32")
-                             .flag("+m64")
-                             .flag("+march=mips64r2");
-
-    auto MArchMips64 = makeMultilib("/mips64").flag("-m32").flag("+m64").flag(
-        "-march=mips64r2");
-
-    auto MArchDefault = makeMultilib("")
-                            .flag("+m32")
-                            .flag("-m64")
-                            .flag("-mmicromips")
-                            .flag("+march=mips32r2");
-
-    auto Mips16 = makeMultilib("/mips16").flag("+mips16");
-
-    auto UCLibc = makeMultilib("/uclibc").flag("+muclibc");
-
-    auto MAbi64 =
-        makeMultilib("/64").flag("+mabi=n64").flag("-mabi=n32").flag("-m32");
-
-    auto BigEndian = makeMultilib("").flag("+EB").flag("-EL");
-
-    auto LittleEndian = makeMultilib("/el").flag("+EL").flag("-EB");
-
-    auto SoftFloat = makeMultilib("/sof").flag("+msoft-float");
-
-    auto Nan2008 = makeMultilib("/nan2008").flag("+mnan=2008");
-
-    FSFMipsMultilibs =
-        MultilibSet()
-            .Either(MArchMips32, MArchMicroMips, MArchMips64r2, MArchMips64,
-                    MArchDefault)
-            .Maybe(UCLibc)
-            .Maybe(Mips16)
-            .FilterOut("/mips64/mips16")
-            .FilterOut("/mips64r2/mips16")
-            .FilterOut("/micromips/mips16")
-            .Maybe(MAbi64)
-            .FilterOut("/micromips/64")
-            .FilterOut("/mips32/64")
-            .FilterOut("^/64")
-            .FilterOut("/mips16/64")
-            .Either(BigEndian, LittleEndian)
-            .Maybe(SoftFloat)
-            .Maybe(Nan2008)
-            .FilterOut(".*sof/nan2008")
-            .FilterOut(NonExistent)
-            .setIncludeDirsCallback([](StringRef InstallDir,
-                                       StringRef TripleStr, const Multilib &M) {
-              std::vector<std::string> Dirs;
-              Dirs.push_back((InstallDir + "/include").str());
-              std::string SysRootInc =
-                  InstallDir.str() + "/../../../../sysroot";
-              if (StringRef(M.includeSuffix()).startswith("/uclibc"))
-                Dirs.push_back(SysRootInc + "/uclibc/usr/include");
-              else
-                Dirs.push_back(SysRootInc + "/usr/include");
-              return Dirs;
-            });
-  }
-
-  // Check for Musl toolchain multilibs
-  MultilibSet MuslMipsMultilibs;
-  {
-    auto MArchMipsR2 = makeMultilib("")
-                           .osSuffix("/mips-r2-hard-musl")
-                           .flag("+EB")
-                           .flag("-EL")
-                           .flag("+march=mips32r2");
-
-    auto MArchMipselR2 = makeMultilib("/mipsel-r2-hard-musl")
-                             .flag("-EB")
-                             .flag("+EL")
-                             .flag("+march=mips32r2");
-
-    MuslMipsMultilibs = MultilibSet().Either(MArchMipsR2, MArchMipselR2);
-
-    // Specify the callback that computes the include directories.
-    MuslMipsMultilibs.setIncludeDirsCallback([](
-        StringRef InstallDir, StringRef TripleStr, const Multilib &M) {
-      std::vector<std::string> Dirs;
-      Dirs.push_back(
-          (InstallDir + "/../sysroot" + M.osSuffix() + "/usr/include").str());
-      return Dirs;
-    });
-  }
-
+static bool findMipsCsMultilibs(const Multilib::flags_list &Flags,
+                                FilterNonExistent &NonExistent,
+                                DetectedMultilibs &Result) {
   // Check for Code Sourcery toolchain multilibs
   MultilibSet CSMipsMultilibs;
   {
@@ -1992,26 +1974,17 @@
             .FilterOut("/mips16.*/64")
             .FilterOut("/micromips.*/64")
             .FilterOut(NonExistent)
-            .setIncludeDirsCallback([](StringRef InstallDir,
-                                       StringRef TripleStr, const Multilib &M) {
-              std::vector<std::string> Dirs;
-              Dirs.push_back((InstallDir + "/include").str());
-              std::string SysRootInc =
-                  InstallDir.str() + "/../../../../" + TripleStr.str();
+            .setIncludeDirsCallback([](const Multilib &M) {
+              std::vector<std::string> Dirs({"/include"});
               if (StringRef(M.includeSuffix()).startswith("/uclibc"))
-                Dirs.push_back(SysRootInc + "/libc/uclibc/usr/include");
+                Dirs.push_back(
+                    "/../../../../mips-linux-gnu/libc/uclibc/usr/include");
               else
-                Dirs.push_back(SysRootInc + "/libc/usr/include");
+                Dirs.push_back("/../../../../mips-linux-gnu/libc/usr/include");
               return Dirs;
             });
   }
 
-  MultilibSet AndroidMipsMultilibs =
-      MultilibSet()
-          .Maybe(Multilib("/mips-r2").flag("+march=mips32r2"))
-          .Maybe(Multilib("/mips-r6").flag("+march=mips32r6"))
-          .FilterOut(NonExistent);
-
   MultilibSet DebianMipsMultilibs;
   {
     Multilib MAbiN32 =
@@ -2030,7 +2003,270 @@
         MultilibSet().Either(M32, M64, MAbiN32).FilterOut(NonExistent);
   }
 
-  MultilibSet ImgMultilibs;
+  // Sort candidates. Toolchain that best matches the directory tree goes first.
+  // Then select the first toolchain that matches the command line flags.
+  MultilibSet *Candidates[] = {&CSMipsMultilibs, &DebianMipsMultilibs};
+  if (CSMipsMultilibs.size() < DebianMipsMultilibs.size())
+    std::iter_swap(Candidates, Candidates + 1);
+  for (const MultilibSet *Candidate : Candidates) {
+    if (Candidate->select(Flags, Result.SelectedMultilib)) {
+      if (Candidate == &DebianMipsMultilibs)
+        Result.BiarchSibling = Multilib();
+      Result.Multilibs = *Candidate;
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool findMipsAndroidMultilibs(vfs::FileSystem &VFS, StringRef Path,
+                                     const Multilib::flags_list &Flags,
+                                     FilterNonExistent &NonExistent,
+                                     DetectedMultilibs &Result) {
+
+  MultilibSet AndroidMipsMultilibs =
+      MultilibSet()
+          .Maybe(Multilib("/mips-r2").flag("+march=mips32r2"))
+          .Maybe(Multilib("/mips-r6").flag("+march=mips32r6"))
+          .FilterOut(NonExistent);
+
+  MultilibSet AndroidMipselMultilibs =
+      MultilibSet()
+          .Either(Multilib().flag("+march=mips32"),
+                  Multilib("/mips-r2", "", "/mips-r2").flag("+march=mips32r2"),
+                  Multilib("/mips-r6", "", "/mips-r6").flag("+march=mips32r6"))
+          .FilterOut(NonExistent);
+
+  MultilibSet AndroidMips64elMultilibs =
+      MultilibSet()
+          .Either(
+              Multilib().flag("+march=mips64r6"),
+              Multilib("/32/mips-r1", "", "/mips-r1").flag("+march=mips32"),
+              Multilib("/32/mips-r2", "", "/mips-r2").flag("+march=mips32r2"),
+              Multilib("/32/mips-r6", "", "/mips-r6").flag("+march=mips32r6"))
+          .FilterOut(NonExistent);
+
+  MultilibSet *MS = &AndroidMipsMultilibs;
+  if (VFS.exists(Path + "/mips-r6"))
+    MS = &AndroidMipselMultilibs;
+  else if (VFS.exists(Path + "/32"))
+    MS = &AndroidMips64elMultilibs;
+  if (MS->select(Flags, Result.SelectedMultilib)) {
+    Result.Multilibs = *MS;
+    return true;
+  }
+  return false;
+}
+
+static bool findMipsMuslMultilibs(const Multilib::flags_list &Flags,
+                                  FilterNonExistent &NonExistent,
+                                  DetectedMultilibs &Result) {
+  // Musl toolchain multilibs
+  MultilibSet MuslMipsMultilibs;
+  {
+    auto MArchMipsR2 = makeMultilib("")
+                           .osSuffix("/mips-r2-hard-musl")
+                           .flag("+EB")
+                           .flag("-EL")
+                           .flag("+march=mips32r2");
+
+    auto MArchMipselR2 = makeMultilib("/mipsel-r2-hard-musl")
+                             .flag("-EB")
+                             .flag("+EL")
+                             .flag("+march=mips32r2");
+
+    MuslMipsMultilibs = MultilibSet().Either(MArchMipsR2, MArchMipselR2);
+
+    // Specify the callback that computes the include directories.
+    MuslMipsMultilibs.setIncludeDirsCallback([](const Multilib &M) {
+      return std::vector<std::string>(
+          {"/../sysroot" + M.osSuffix() + "/usr/include"});
+    });
+  }
+  if (MuslMipsMultilibs.select(Flags, Result.SelectedMultilib)) {
+    Result.Multilibs = MuslMipsMultilibs;
+    return true;
+  }
+  return false;
+}
+
+static bool findMipsMtiMultilibs(const Multilib::flags_list &Flags,
+                                 FilterNonExistent &NonExistent,
+                                 DetectedMultilibs &Result) {
+  // CodeScape MTI toolchain v1.2 and earlier.
+  MultilibSet MtiMipsMultilibsV1;
+  {
+    auto MArchMips32 = makeMultilib("/mips32")
+                           .flag("+m32")
+                           .flag("-m64")
+                           .flag("-mmicromips")
+                           .flag("+march=mips32");
+
+    auto MArchMicroMips = makeMultilib("/micromips")
+                              .flag("+m32")
+                              .flag("-m64")
+                              .flag("+mmicromips");
+
+    auto MArchMips64r2 = makeMultilib("/mips64r2")
+                             .flag("-m32")
+                             .flag("+m64")
+                             .flag("+march=mips64r2");
+
+    auto MArchMips64 = makeMultilib("/mips64").flag("-m32").flag("+m64").flag(
+        "-march=mips64r2");
+
+    auto MArchDefault = makeMultilib("")
+                            .flag("+m32")
+                            .flag("-m64")
+                            .flag("-mmicromips")
+                            .flag("+march=mips32r2");
+
+    auto Mips16 = makeMultilib("/mips16").flag("+mips16");
+
+    auto UCLibc = makeMultilib("/uclibc").flag("+muclibc");
+
+    auto MAbi64 =
+        makeMultilib("/64").flag("+mabi=n64").flag("-mabi=n32").flag("-m32");
+
+    auto BigEndian = makeMultilib("").flag("+EB").flag("-EL");
+
+    auto LittleEndian = makeMultilib("/el").flag("+EL").flag("-EB");
+
+    auto SoftFloat = makeMultilib("/sof").flag("+msoft-float");
+
+    auto Nan2008 = makeMultilib("/nan2008").flag("+mnan=2008");
+
+    MtiMipsMultilibsV1 =
+        MultilibSet()
+            .Either(MArchMips32, MArchMicroMips, MArchMips64r2, MArchMips64,
+                    MArchDefault)
+            .Maybe(UCLibc)
+            .Maybe(Mips16)
+            .FilterOut("/mips64/mips16")
+            .FilterOut("/mips64r2/mips16")
+            .FilterOut("/micromips/mips16")
+            .Maybe(MAbi64)
+            .FilterOut("/micromips/64")
+            .FilterOut("/mips32/64")
+            .FilterOut("^/64")
+            .FilterOut("/mips16/64")
+            .Either(BigEndian, LittleEndian)
+            .Maybe(SoftFloat)
+            .Maybe(Nan2008)
+            .FilterOut(".*sof/nan2008")
+            .FilterOut(NonExistent)
+            .setIncludeDirsCallback([](const Multilib &M) {
+              std::vector<std::string> Dirs({"/include"});
+              if (StringRef(M.includeSuffix()).startswith("/uclibc"))
+                Dirs.push_back("/../../../../sysroot/uclibc/usr/include");
+              else
+                Dirs.push_back("/../../../../sysroot/usr/include");
+              return Dirs;
+            });
+  }
+
+  // CodeScape MTI toolchain starting from v1.3.
+  MultilibSet MtiMipsMultilibsV2;
+  {
+    auto BeHard = makeMultilib("/mips-r2-hard")
+                      .flag("+EB")
+                      .flag("-msoft-float")
+                      .flag("-mnan=2008")
+                      .flag("-muclibc");
+    auto BeSoft = makeMultilib("/mips-r2-soft")
+                      .flag("+EB")
+                      .flag("+msoft-float")
+                      .flag("-mnan=2008");
+    auto ElHard = makeMultilib("/mipsel-r2-hard")
+                      .flag("+EL")
+                      .flag("-msoft-float")
+                      .flag("-mnan=2008")
+                      .flag("-muclibc");
+    auto ElSoft = makeMultilib("/mipsel-r2-soft")
+                      .flag("+EL")
+                      .flag("+msoft-float")
+                      .flag("-mnan=2008")
+                      .flag("-mmicromips");
+    auto BeHardNan = makeMultilib("/mips-r2-hard-nan2008")
+                         .flag("+EB")
+                         .flag("-msoft-float")
+                         .flag("+mnan=2008")
+                         .flag("-muclibc");
+    auto ElHardNan = makeMultilib("/mipsel-r2-hard-nan2008")
+                         .flag("+EL")
+                         .flag("-msoft-float")
+                         .flag("+mnan=2008")
+                         .flag("-muclibc")
+                         .flag("-mmicromips");
+    auto BeHardNanUclibc = makeMultilib("/mips-r2-hard-nan2008-uclibc")
+                               .flag("+EB")
+                               .flag("-msoft-float")
+                               .flag("+mnan=2008")
+                               .flag("+muclibc");
+    auto ElHardNanUclibc = makeMultilib("/mipsel-r2-hard-nan2008-uclibc")
+                               .flag("+EL")
+                               .flag("-msoft-float")
+                               .flag("+mnan=2008")
+                               .flag("+muclibc");
+    auto BeHardUclibc = makeMultilib("/mips-r2-hard-uclibc")
+                            .flag("+EB")
+                            .flag("-msoft-float")
+                            .flag("-mnan=2008")
+                            .flag("+muclibc");
+    auto ElHardUclibc = makeMultilib("/mipsel-r2-hard-uclibc")
+                            .flag("+EL")
+                            .flag("-msoft-float")
+                            .flag("-mnan=2008")
+                            .flag("+muclibc");
+    auto ElMicroHardNan = makeMultilib("/micromipsel-r2-hard-nan2008")
+                              .flag("+EL")
+                              .flag("-msoft-float")
+                              .flag("+mnan=2008")
+                              .flag("+mmicromips");
+    auto ElMicroSoft = makeMultilib("/micromipsel-r2-soft")
+                           .flag("+EL")
+                           .flag("+msoft-float")
+                           .flag("-mnan=2008")
+                           .flag("+mmicromips");
+
+    auto O32 =
+        makeMultilib("/lib").osSuffix("").flag("-mabi=n32").flag("-mabi=n64");
+    auto N32 =
+        makeMultilib("/lib32").osSuffix("").flag("+mabi=n32").flag("-mabi=n64");
+    auto N64 =
+        makeMultilib("/lib64").osSuffix("").flag("-mabi=n32").flag("+mabi=n64");
+
+    MtiMipsMultilibsV2 =
+        MultilibSet()
+            .Either({BeHard, BeSoft, ElHard, ElSoft, BeHardNan, ElHardNan,
+                     BeHardNanUclibc, ElHardNanUclibc, BeHardUclibc,
+                     ElHardUclibc, ElMicroHardNan, ElMicroSoft})
+            .Either(O32, N32, N64)
+            .FilterOut(NonExistent)
+            .setIncludeDirsCallback([](const Multilib &M) {
+              return std::vector<std::string>({"/../../../../sysroot" +
+                                               M.includeSuffix() +
+                                               "/../usr/include"});
+            })
+            .setFilePathsCallback([](const Multilib &M) {
+              return std::vector<std::string>(
+                  {"/../../../../mips-mti-linux-gnu/lib" + M.gccSuffix()});
+            });
+  }
+  for (auto Candidate : {&MtiMipsMultilibsV1, &MtiMipsMultilibsV2}) {
+    if (Candidate->select(Flags, Result.SelectedMultilib)) {
+      Result.Multilibs = *Candidate;
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool findMipsImgMultilibs(const Multilib::flags_list &Flags,
+                                 FilterNonExistent &NonExistent,
+                                 DetectedMultilibs &Result) {
+  // CodeScape IMG toolchain v1.2 and earlier.
+  MultilibSet ImgMultilibsV1;
   {
     auto Mips64r6 = makeMultilib("/mips64r6").flag("+m64").flag("-m32");
 
@@ -2039,22 +2275,91 @@
     auto MAbi64 =
         makeMultilib("/64").flag("+mabi=n64").flag("-mabi=n32").flag("-m32");
 
-    ImgMultilibs =
+    ImgMultilibsV1 =
         MultilibSet()
             .Maybe(Mips64r6)
             .Maybe(MAbi64)
             .Maybe(LittleEndian)
             .FilterOut(NonExistent)
-            .setIncludeDirsCallback([](StringRef InstallDir,
-                                       StringRef TripleStr, const Multilib &M) {
-              std::vector<std::string> Dirs;
-              Dirs.push_back((InstallDir + "/include").str());
-              Dirs.push_back(
-                  (InstallDir + "/../../../../sysroot/usr/include").str());
-              return Dirs;
+            .setIncludeDirsCallback([](const Multilib &M) {
+              return std::vector<std::string>(
+                  {"/include", "/../../../../sysroot/usr/include"});
             });
   }
 
+  // CodeScape IMG toolchain starting from v1.3.
+  MultilibSet ImgMultilibsV2;
+  {
+    auto BeHard = makeMultilib("/mips-r6-hard")
+                      .flag("+EB")
+                      .flag("-msoft-float")
+                      .flag("-mmicromips");
+    auto BeSoft = makeMultilib("/mips-r6-soft")
+                      .flag("+EB")
+                      .flag("+msoft-float")
+                      .flag("-mmicromips");
+    auto ElHard = makeMultilib("/mipsel-r6-hard")
+                      .flag("+EL")
+                      .flag("-msoft-float")
+                      .flag("-mmicromips");
+    auto ElSoft = makeMultilib("/mipsel-r6-soft")
+                      .flag("+EL")
+                      .flag("+msoft-float")
+                      .flag("-mmicromips");
+    auto BeMicroHard = makeMultilib("/micromips-r6-hard")
+                           .flag("+EB")
+                           .flag("-msoft-float")
+                           .flag("+mmicromips");
+    auto BeMicroSoft = makeMultilib("/micromips-r6-soft")
+                           .flag("+EB")
+                           .flag("+msoft-float")
+                           .flag("+mmicromips");
+    auto ElMicroHard = makeMultilib("/micromipsel-r6-hard")
+                           .flag("+EL")
+                           .flag("-msoft-float")
+                           .flag("+mmicromips");
+    auto ElMicroSoft = makeMultilib("/micromipsel-r6-soft")
+                           .flag("+EL")
+                           .flag("+msoft-float")
+                           .flag("+mmicromips");
+
+    auto O32 =
+        makeMultilib("/lib").osSuffix("").flag("-mabi=n32").flag("-mabi=n64");
+    auto N32 =
+        makeMultilib("/lib32").osSuffix("").flag("+mabi=n32").flag("-mabi=n64");
+    auto N64 =
+        makeMultilib("/lib64").osSuffix("").flag("-mabi=n32").flag("+mabi=n64");
+
+    ImgMultilibsV2 =
+        MultilibSet()
+            .Either({BeHard, BeSoft, ElHard, ElSoft, BeMicroHard, BeMicroSoft,
+                     ElMicroHard, ElMicroSoft})
+            .Either(O32, N32, N64)
+            .FilterOut(NonExistent)
+            .setIncludeDirsCallback([](const Multilib &M) {
+              return std::vector<std::string>({"/../../../../sysroot" +
+                                               M.includeSuffix() +
+                                               "/../usr/include"});
+            })
+            .setFilePathsCallback([](const Multilib &M) {
+              return std::vector<std::string>(
+                  {"/../../../../mips-img-linux-gnu/lib" + M.gccSuffix()});
+            });
+  }
+  for (auto Candidate : {&ImgMultilibsV1, &ImgMultilibsV2}) {
+    if (Candidate->select(Flags, Result.SelectedMultilib)) {
+      Result.Multilibs = *Candidate;
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool findMIPSMultilibs(const Driver &D, const llvm::Triple &TargetTriple,
+                              StringRef Path, const ArgList &Args,
+                              DetectedMultilibs &Result) {
+  FilterNonExistent NonExistent(Path, "/crtbegin.o", D.getVFS());
+
   StringRef CPUName;
   StringRef ABIName;
   tools::mips::getMipsCPUAndABI(Args, TargetTriple, CPUName, ABIName);
@@ -2074,6 +2379,7 @@
   addMultilibFlag(CPUName == "mips64r2" || CPUName == "mips64r3" ||
                       CPUName == "mips64r5" || CPUName == "octeon",
                   "march=mips64r2", Flags);
+  addMultilibFlag(CPUName == "mips64r6", "march=mips64r6", Flags);
   addMultilibFlag(isMicroMips(Args), "mmicromips", Flags);
   addMultilibFlag(tools::mips::isUCLibc(Args), "muclibc", Flags);
   addMultilibFlag(tools::mips::isNaN2008(Args, TargetTriple), "mnan=2008",
@@ -2085,67 +2391,83 @@
   addMultilibFlag(isMipsEL(TargetArch), "EL", Flags);
   addMultilibFlag(!isMipsEL(TargetArch), "EB", Flags);
 
-  if (TargetTriple.isAndroid()) {
-    // Select Android toolchain. It's the only choice in that case.
-    if (AndroidMipsMultilibs.select(Flags, Result.SelectedMultilib)) {
-      Result.Multilibs = AndroidMipsMultilibs;
-      return true;
-    }
-    return false;
-  }
+  if (TargetTriple.isAndroid())
+    return findMipsAndroidMultilibs(D.getVFS(), Path, Flags, NonExistent,
+                                    Result);
 
   if (TargetTriple.getVendor() == llvm::Triple::MipsTechnologies &&
       TargetTriple.getOS() == llvm::Triple::Linux &&
-      TargetTriple.getEnvironment() == llvm::Triple::UnknownEnvironment) {
-    if (MuslMipsMultilibs.select(Flags, Result.SelectedMultilib)) {
-      Result.Multilibs = MuslMipsMultilibs;
-      return true;
-    }
-    return false;
-  }
+      TargetTriple.getEnvironment() == llvm::Triple::UnknownEnvironment)
+    return findMipsMuslMultilibs(Flags, NonExistent, Result);
+
+  if (TargetTriple.getVendor() == llvm::Triple::MipsTechnologies &&
+      TargetTriple.getOS() == llvm::Triple::Linux &&
+      TargetTriple.getEnvironment() == llvm::Triple::GNU)
+    return findMipsMtiMultilibs(Flags, NonExistent, Result);
 
   if (TargetTriple.getVendor() == llvm::Triple::ImaginationTechnologies &&
       TargetTriple.getOS() == llvm::Triple::Linux &&
-      TargetTriple.getEnvironment() == llvm::Triple::GNU) {
-    // Select mips-img-linux-gnu toolchain.
-    if (ImgMultilibs.select(Flags, Result.SelectedMultilib)) {
-      Result.Multilibs = ImgMultilibs;
-      return true;
-    }
-    return false;
-  }
+      TargetTriple.getEnvironment() == llvm::Triple::GNU)
+    return findMipsImgMultilibs(Flags, NonExistent, Result);
 
-  // Sort candidates. Toolchain that best meets the directories goes first.
-  // Then select the first toolchains matches command line flags.
-  MultilibSet *candidates[] = {&DebianMipsMultilibs, &FSFMipsMultilibs,
-                               &CSMipsMultilibs};
-  std::sort(
-      std::begin(candidates), std::end(candidates),
-      [](MultilibSet *a, MultilibSet *b) { return a->size() > b->size(); });
-  for (const auto &candidate : candidates) {
-    if (candidate->select(Flags, Result.SelectedMultilib)) {
-      if (candidate == &DebianMipsMultilibs)
-        Result.BiarchSibling = Multilib();
-      Result.Multilibs = *candidate;
-      return true;
-    }
-  }
+  if (findMipsCsMultilibs(Flags, NonExistent, Result))
+    return true;
 
-  {
-    // Fallback to the regular toolchain-tree structure.
-    Multilib Default;
-    Result.Multilibs.push_back(Default);
-    Result.Multilibs.FilterOut(NonExistent);
+  // Fallback to the regular toolchain-tree structure.
+  Multilib Default;
+  Result.Multilibs.push_back(Default);
+  Result.Multilibs.FilterOut(NonExistent);
 
-    if (Result.Multilibs.select(Flags, Result.SelectedMultilib)) {
-      Result.BiarchSibling = Multilib();
-      return true;
-    }
+  if (Result.Multilibs.select(Flags, Result.SelectedMultilib)) {
+    Result.BiarchSibling = Multilib();
+    return true;
   }
 
   return false;
 }
 
+static void findAndroidArmMultilibs(const Driver &D,
+                                    const llvm::Triple &TargetTriple,
+                                    StringRef Path, const ArgList &Args,
+                                    DetectedMultilibs &Result) {
+  // Find multilibs with subdirectories like armv7-a, thumb, armv7-a/thumb.
+  FilterNonExistent NonExistent(Path, "/crtbegin.o", D.getVFS());
+  Multilib ArmV7Multilib = makeMultilib("/armv7-a")
+                               .flag("+armv7")
+                               .flag("-thumb");
+  Multilib ThumbMultilib = makeMultilib("/thumb")
+                               .flag("-armv7")
+                               .flag("+thumb");
+  Multilib ArmV7ThumbMultilib = makeMultilib("/armv7-a/thumb")
+                               .flag("+armv7")
+                               .flag("+thumb");
+  Multilib DefaultMultilib = makeMultilib("")
+                               .flag("-armv7")
+                               .flag("-thumb");
+  MultilibSet AndroidArmMultilibs =
+      MultilibSet()
+          .Either(ThumbMultilib, ArmV7Multilib,
+                  ArmV7ThumbMultilib, DefaultMultilib)
+          .FilterOut(NonExistent);
+
+  Multilib::flags_list Flags;
+  llvm::StringRef Arch = Args.getLastArgValue(options::OPT_march_EQ);
+  bool IsArmArch = TargetTriple.getArch() == llvm::Triple::arm;
+  bool IsThumbArch = TargetTriple.getArch() == llvm::Triple::thumb;
+  bool IsV7SubArch = TargetTriple.getSubArch() == llvm::Triple::ARMSubArch_v7;
+  bool IsThumbMode = IsThumbArch ||
+      Args.hasFlag(options::OPT_mthumb, options::OPT_mno_thumb, false) ||
+      (IsArmArch && llvm::ARM::parseArchISA(Arch) == llvm::ARM::IK_THUMB);
+  bool IsArmV7Mode = (IsArmArch || IsThumbArch) &&
+      (llvm::ARM::parseArchVersion(Arch) == 7 ||
+       (IsArmArch && Arch == "" && IsV7SubArch));
+  addMultilibFlag(IsArmV7Mode, "armv7", Flags);
+  addMultilibFlag(IsThumbMode, "thumb", Flags);
+
+  if (AndroidArmMultilibs.select(Flags, Result.SelectedMultilib))
+    Result.Multilibs = AndroidArmMultilibs;
+}
+
 static bool findBiarchMultilibs(const Driver &D,
                                 const llvm::Triple &TargetTriple,
                                 StringRef Path, const ArgList &Args,
@@ -2178,7 +2500,9 @@
                         .flag("-m64")
                         .flag("+mx32");
 
-  FilterNonExistent NonExistent(Path, D.getVFS());
+  // GCC toolchain for IAMCU doesn't have crtbegin.o, so look for libgcc.a.
+  FilterNonExistent NonExistent(
+      Path, TargetTriple.isOSIAMCU() ? "/libgcc.a" : "/crtbegin.o", D.getVFS());
 
   // Determine default multilib from: 32, 64, x32
   // Also handle cases such as 64 on 32, 32 on 64, etc.
@@ -2337,9 +2661,13 @@
 
       DetectedMultilibs Detected;
 
+      // Android standalone toolchain could have multilibs for ARM and Thumb.
       // Debian mips multilibs behave more like the rest of the biarch ones,
       // so handle them there
-      if (isMipsArch(TargetArch)) {
+      if (isArmOrThumbArch(TargetArch) && TargetTriple.isAndroid()) {
+        // It should also work without multilibs in a simplified toolchain.
+        findAndroidArmMultilibs(D, TargetTriple, LI->getName(), Args, Detected);
+      } else if (isMipsArch(TargetArch)) {
         if (!findMIPSMultilibs(D, TargetTriple, LI->getName(), Args, Detected))
           continue;
       } else if (!findBiarchMultilibs(D, TargetTriple, LI->getName(), Args,
@@ -2430,6 +2758,8 @@
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
   case llvm::Triple::systemz:
+  case llvm::Triple::mips:
+  case llvm::Triple::mipsel:
     return true;
   default:
     return false;
@@ -2469,7 +2799,6 @@
   return true;
 }
 
-
 void Generic_ELF::addClangTargetOptions(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
   const Generic_GCC::GCCVersion &V = GCCInstallation.getVersion();
@@ -2525,10 +2854,9 @@
 
   const auto &Callback = Multilibs.includeDirsCallback();
   if (Callback) {
-    const auto IncludePaths =
-        Callback(D.getInstalledDir(), getTripleString(), SelectedMultilib);
-    for (const auto &Path : IncludePaths)
-      addExternCSystemIncludeIfExists(DriverArgs, CC1Args, Path);
+    for (const auto &Path : Callback(SelectedMultilib))
+      addExternCSystemIncludeIfExists(DriverArgs, CC1Args,
+                                      D.getInstalledDir() + Path);
   }
 }
 
@@ -2573,11 +2901,10 @@
 
   const auto &Callback = Multilibs.includeDirsCallback();
   if (Callback) {
-    const auto IncludePaths = Callback(getDriver().getInstalledDir(),
-                                       getTripleString(), SelectedMultilib);
-    for (const auto &Path : IncludePaths) {
-      if (llvm::sys::fs::exists(Path + "/c++/v1")) {
-        addSystemInclude(DriverArgs, CC1Args, Path + "/c++/v1");
+    for (std::string Path : Callback(SelectedMultilib)) {
+      Path = getDriver().getInstalledDir() + Path + "/c++/v1";
+      if (llvm::sys::fs::exists(Path)) {
+        addSystemInclude(DriverArgs, CC1Args, Path);
         break;
       }
     }
@@ -2621,14 +2948,9 @@
   if (getVFS().exists(InstallRelDir = InstalledDir + "/../target"))
     return InstallRelDir;
 
-  std::string PrefixRelDir = std::string(LLVM_PREFIX) + "/target";
-  if (getVFS().exists(PrefixRelDir))
-    return PrefixRelDir;
-
   return InstallRelDir;
 }
 
-
 Optional<unsigned> HexagonToolChain::getSmallDataThreshold(
       const ArgList &Args) {
   StringRef Gn = "";
@@ -2647,7 +2969,6 @@
   return None;
 }
 
-
 void HexagonToolChain::getHexagonLibraryPaths(const ArgList &Args,
       ToolChain::path_list &LibPaths) const {
   const Driver &D = getDriver();
@@ -3023,6 +3344,61 @@
   return new tools::cloudabi::Linker(*this);
 }
 
+bool CloudABI::isPIEDefault() const {
+  // Only enable PIE on architectures that support PC-relative
+  // addressing. PC-relative addressing is required, as the process
+  // startup code must be able to relocate itself.
+  switch (getTriple().getArch()) {
+  case llvm::Triple::aarch64:
+  case llvm::Triple::x86_64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+SanitizerMask CloudABI::getSupportedSanitizers() const {
+  SanitizerMask Res = ToolChain::getSupportedSanitizers();
+  Res |= SanitizerKind::SafeStack;
+  return Res;
+}
+
+SanitizerMask CloudABI::getDefaultSanitizers() const {
+  return SanitizerKind::SafeStack;
+}
+
+/// Haiku - Haiku tool chain which can call as(1) and ld(1) directly.
+
+Haiku::Haiku(const Driver &D, const llvm::Triple& Triple, const ArgList &Args)
+  : Generic_ELF(D, Triple, Args) {
+
+}
+
+void Haiku::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
+                                          ArgStringList &CC1Args) const {
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc) ||
+      DriverArgs.hasArg(options::OPT_nostdincxx))
+    return;
+
+  switch (GetCXXStdlibType(DriverArgs)) {
+  case ToolChain::CST_Libcxx:
+    addSystemInclude(DriverArgs, CC1Args,
+                     getDriver().SysRoot + "/system/develop/headers/c++/v1");
+    break;
+  case ToolChain::CST_Libstdcxx:
+    addSystemInclude(DriverArgs, CC1Args,
+                     getDriver().SysRoot + "/system/develop/headers/c++");
+    addSystemInclude(DriverArgs, CC1Args,
+                     getDriver().SysRoot + "/system/develop/headers/c++/backward");
+
+    StringRef Triple = getTriple().str();
+    addSystemInclude(DriverArgs, CC1Args,
+                     getDriver().SysRoot + "/system/develop/headers/c++/" +
+                     Triple);
+    break;
+  }
+}
+
 /// OpenBSD - OpenBSD tool chain which can call as(1) and ld(1) directly.
 
 OpenBSD::OpenBSD(const Driver &D, const llvm::Triple &Triple,
@@ -3052,16 +3428,7 @@
 
 Tool *Bitrig::buildLinker() const { return new tools::bitrig::Linker(*this); }
 
-ToolChain::CXXStdlibType Bitrig::GetCXXStdlibType(const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) {
-    StringRef Value = A->getValue();
-    if (Value == "libstdc++")
-      return ToolChain::CST_Libstdcxx;
-    if (Value == "libc++")
-      return ToolChain::CST_Libcxx;
-
-    getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args);
-  }
+ToolChain::CXXStdlibType Bitrig::GetDefaultCXXStdlibType() const {
   return ToolChain::CST_Libcxx;
 }
 
@@ -3125,16 +3492,7 @@
     getFilePaths().push_back(getDriver().SysRoot + "/usr/lib");
 }
 
-ToolChain::CXXStdlibType FreeBSD::GetCXXStdlibType(const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) {
-    StringRef Value = A->getValue();
-    if (Value == "libstdc++")
-      return ToolChain::CST_Libstdcxx;
-    if (Value == "libc++")
-      return ToolChain::CST_Libcxx;
-
-    getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args);
-  }
+ToolChain::CXXStdlibType FreeBSD::GetDefaultCXXStdlibType() const {
   if (getTriple().getOSMajorVersion() >= 10)
     return ToolChain::CST_Libcxx;
   return ToolChain::CST_Libstdcxx;
@@ -3160,6 +3518,22 @@
   }
 }
 
+void FreeBSD::AddCXXStdlibLibArgs(const ArgList &Args,
+                                  ArgStringList &CmdArgs) const {
+  CXXStdlibType Type = GetCXXStdlibType(Args);
+  bool Profiling = Args.hasArg(options::OPT_pg);
+
+  switch (Type) {
+  case ToolChain::CST_Libcxx:
+    CmdArgs.push_back(Profiling ? "-lc++_p" : "-lc++");
+    break;
+
+  case ToolChain::CST_Libstdcxx:
+    CmdArgs.push_back(Profiling ? "-lstdc++_p" : "-lstdc++");
+    break;
+  }
+}
+
 Tool *FreeBSD::buildAssembler() const {
   return new tools::freebsd::Assembler(*this);
 }
@@ -3206,7 +3580,6 @@
 
 NetBSD::NetBSD(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     : Generic_ELF(D, Triple, Args) {
-
   if (getDriver().UseStdLib) {
     // When targeting a 32-bit platform, try the special directory used on
     // 64-bit hosts, and only fall back to the main library directory if that
@@ -3262,20 +3635,10 @@
 
 Tool *NetBSD::buildLinker() const { return new tools::netbsd::Linker(*this); }
 
-ToolChain::CXXStdlibType NetBSD::GetCXXStdlibType(const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) {
-    StringRef Value = A->getValue();
-    if (Value == "libstdc++")
-      return ToolChain::CST_Libstdcxx;
-    if (Value == "libc++")
-      return ToolChain::CST_Libcxx;
-
-    getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args);
-  }
-
+ToolChain::CXXStdlibType NetBSD::GetDefaultCXXStdlibType() const {
   unsigned Major, Minor, Micro;
   getTriple().getOSVersion(Major, Minor, Micro);
-  if (Major >= 7 || (Major == 6 && Minor == 99 && Micro >= 49) || Major == 0) {
+  if (Major >= 7 || Major == 0) {
     switch (getArch()) {
     case llvm::Triple::aarch64:
     case llvm::Triple::arm:
@@ -3285,6 +3648,8 @@
     case llvm::Triple::ppc:
     case llvm::Triple::ppc64:
     case llvm::Triple::ppc64le:
+    case llvm::Triple::sparc:
+    case llvm::Triple::sparcv9:
     case llvm::Triple::x86:
     case llvm::Triple::x86_64:
       return ToolChain::CST_Libcxx;
@@ -3415,7 +3780,6 @@
   DebianJessie,
   DebianStretch,
   Exherbo,
-  RHEL4,
   RHEL5,
   RHEL6,
   RHEL7,
@@ -3438,11 +3802,12 @@
   UbuntuVivid,
   UbuntuWily,
   UbuntuXenial,
+  UbuntuYakkety,
   UnknownDistro
 };
 
 static bool IsRedhat(enum Distro Distro) {
-  return Distro == Fedora || (Distro >= RHEL4 && Distro <= RHEL7);
+  return Distro == Fedora || (Distro >= RHEL5 && Distro <= RHEL7);
 }
 
 static bool IsOpenSUSE(enum Distro Distro) { return Distro == OpenSUSE; }
@@ -3452,7 +3817,7 @@
 }
 
 static bool IsUbuntu(enum Distro Distro) {
-  return Distro >= UbuntuHardy && Distro <= UbuntuXenial;
+  return Distro >= UbuntuHardy && Distro <= UbuntuYakkety;
 }
 
 static Distro DetectDistro(const Driver &D, llvm::Triple::ArchType Arch) {
@@ -3483,8 +3848,10 @@
                       .Case("vivid", UbuntuVivid)
                       .Case("wily", UbuntuWily)
                       .Case("xenial", UbuntuXenial)
+                      .Case("yakkety", UbuntuYakkety)
                       .Default(UnknownDistro);
-    return Version;
+    if (Version != UnknownDistro)
+      return Version;
   }
 
   File = llvm::MemoryBuffer::getFile("/etc/redhat-release");
@@ -3493,15 +3860,14 @@
     if (Data.startswith("Fedora release"))
       return Fedora;
     if (Data.startswith("Red Hat Enterprise Linux") ||
-        Data.startswith("CentOS")) {
+        Data.startswith("CentOS") ||
+        Data.startswith("Scientific Linux")) {
       if (Data.find("release 7") != StringRef::npos)
         return RHEL7;
       else if (Data.find("release 6") != StringRef::npos)
         return RHEL6;
       else if (Data.find("release 5") != StringRef::npos)
         return RHEL5;
-      else if (Data.find("release 4") != StringRef::npos)
-        return RHEL4;
     }
     return UnknownDistro;
   }
@@ -3646,6 +4012,15 @@
 
 static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
   if (isMipsArch(Triple.getArch())) {
+    if (Triple.isAndroid()) {
+      StringRef CPUName;
+      StringRef ABIName;
+      tools::mips::getMipsCPUAndABI(Args, Triple, CPUName, ABIName);
+      if (CPUName == "mips32r6")
+        return "libr6";
+      if (CPUName == "mips32r2")
+        return "libr2";
+    }
     // lib32 directory has a special meaning on MIPS targets.
     // It contains N32 ABI binaries. Use this folder if produce
     // code for N32 ABI only.
@@ -3674,6 +4049,15 @@
   return Triple.isArch32Bit() ? "lib" : "lib64";
 }
 
+static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
+                                  const Multilib &Multilib,
+                                  StringRef InstallPath,
+                                  ToolChain::path_list &Paths) {
+  if (const auto &PathsCallback = Multilibs.filePathsCallback())
+    for (const auto &Path : PathsCallback(Multilib))
+      addPathIfExists(D, InstallPath + Path, Paths);
+}
+
 Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     : Generic_ELF(D, Triple, Args) {
   GCCInstallation.init(Triple, Args);
@@ -3726,13 +4110,12 @@
       ExtraOpts.push_back("--hash-style=both");
   }
 
-  if (IsRedhat(Distro))
+  if (IsRedhat(Distro) && Distro != RHEL5 && Distro != RHEL6)
     ExtraOpts.push_back("--no-add-needed");
 
-  if ((IsDebian(Distro) && Distro >= DebianSqueeze) || IsOpenSUSE(Distro) ||
-      (IsRedhat(Distro) && Distro != RHEL4 && Distro != RHEL5) ||
-      (IsUbuntu(Distro) && Distro >= UbuntuKarmic))
-    ExtraOpts.push_back("--build-id");
+#ifdef ENABLE_LINKER_BUILD_ID
+  ExtraOpts.push_back("--build-id");
+#endif
 
   if (IsOpenSUSE(Distro))
     ExtraOpts.push_back("--enable-new-dtags");
@@ -3752,6 +4135,11 @@
     const llvm::Triple &GCCTriple = GCCInstallation.getTriple();
     const std::string &LibPath = GCCInstallation.getParentLibPath();
     const Multilib &Multilib = GCCInstallation.getMultilib();
+    const MultilibSet &Multilibs = GCCInstallation.getMultilibs();
+
+    // Add toolchain / multilib specific file paths.
+    addMultilibsFilePaths(D, Multilibs, Multilib,
+                          GCCInstallation.getInstallPath(), Paths);
 
     // Sourcery CodeBench MIPS toolchain holds some libraries under
     // a biarch-like suffix of the GCC installation.
@@ -3891,6 +4279,134 @@
   return std::string();
 }
 
+std::string Linux::getDynamicLinker(const ArgList &Args) const {
+  const llvm::Triple::ArchType Arch = getArch();
+  const llvm::Triple &Triple = getTriple();
+
+  const enum Distro Distro = DetectDistro(getDriver(), Arch);
+
+  if (Triple.isAndroid())
+    return Triple.isArch64Bit() ? "/system/bin/linker64" : "/system/bin/linker";
+
+  if (Triple.isMusl()) {
+    std::string ArchName;
+    bool IsArm = false;
+
+    switch (Arch) {
+    case llvm::Triple::arm:
+    case llvm::Triple::thumb:
+      ArchName = "arm";
+      IsArm = true;
+      break;
+    case llvm::Triple::armeb:
+    case llvm::Triple::thumbeb:
+      ArchName = "armeb";
+      IsArm = true;
+      break;
+    default:
+      ArchName = Triple.getArchName().str();
+    }
+    if (IsArm &&
+        (Triple.getEnvironment() == llvm::Triple::MuslEABIHF ||
+         tools::arm::getARMFloatABI(*this, Args) == tools::arm::FloatABI::Hard))
+      ArchName += "hf";
+
+    return "/lib/ld-musl-" + ArchName + ".so.1";
+  }
+
+  std::string LibDir;
+  std::string Loader;
+
+  switch (Arch) {
+  default:
+    llvm_unreachable("unsupported architecture");
+
+  case llvm::Triple::aarch64:
+    LibDir = "lib";
+    Loader = "ld-linux-aarch64.so.1";
+    break;
+  case llvm::Triple::aarch64_be:
+    LibDir = "lib";
+    Loader = "ld-linux-aarch64_be.so.1";
+    break;
+  case llvm::Triple::arm:
+  case llvm::Triple::thumb:
+  case llvm::Triple::armeb:
+  case llvm::Triple::thumbeb: {
+    const bool HF =
+        Triple.getEnvironment() == llvm::Triple::GNUEABIHF ||
+        tools::arm::getARMFloatABI(*this, Args) == tools::arm::FloatABI::Hard;
+
+    LibDir = "lib";
+    Loader = HF ? "ld-linux-armhf.so.3" : "ld-linux.so.3";
+    break;
+  }
+  case llvm::Triple::mips:
+  case llvm::Triple::mipsel:
+  case llvm::Triple::mips64:
+  case llvm::Triple::mips64el: {
+    bool LE = (Triple.getArch() == llvm::Triple::mipsel) ||
+              (Triple.getArch() == llvm::Triple::mips64el);
+    bool IsNaN2008 = tools::mips::isNaN2008(Args, Triple);
+
+    LibDir = "lib" + tools::mips::getMipsABILibSuffix(Args, Triple);
+
+    if (tools::mips::isUCLibc(Args))
+      Loader = IsNaN2008 ? "ld-uClibc-mipsn8.so.0" : "ld-uClibc.so.0";
+    else if (!Triple.hasEnvironment() &&
+             Triple.getVendor() == llvm::Triple::VendorType::MipsTechnologies)
+      Loader = LE ? "ld-musl-mipsel.so.1" : "ld-musl-mips.so.1";
+    else
+      Loader = IsNaN2008 ? "ld-linux-mipsn8.so.1" : "ld.so.1";
+
+    break;
+  }
+  case llvm::Triple::ppc:
+    LibDir = "lib";
+    Loader = "ld.so.1";
+    break;
+  case llvm::Triple::ppc64:
+    LibDir = "lib64";
+    Loader =
+        (tools::ppc::hasPPCAbiArg(Args, "elfv2")) ? "ld64.so.2" : "ld64.so.1";
+    break;
+  case llvm::Triple::ppc64le:
+    LibDir = "lib64";
+    Loader =
+        (tools::ppc::hasPPCAbiArg(Args, "elfv1")) ? "ld64.so.1" : "ld64.so.2";
+    break;
+  case llvm::Triple::sparc:
+  case llvm::Triple::sparcel:
+    LibDir = "lib";
+    Loader = "ld-linux.so.2";
+    break;
+  case llvm::Triple::sparcv9:
+    LibDir = "lib64";
+    Loader = "ld-linux.so.2";
+    break;
+  case llvm::Triple::systemz:
+    LibDir = "lib";
+    Loader = "ld64.so.1";
+    break;
+  case llvm::Triple::x86:
+    LibDir = "lib";
+    Loader = "ld-linux.so.2";
+    break;
+  case llvm::Triple::x86_64: {
+    bool X32 = Triple.getEnvironment() == llvm::Triple::GNUX32;
+
+    LibDir = X32 ? "libx32" : "lib64";
+    Loader = X32 ? "ld-linux-x32.so.2" : "ld-linux-x86-64.so.2";
+    break;
+  }
+  }
+
+  if (Distro == Exherbo && (Triple.getVendor() == llvm::Triple::UnknownVendor ||
+                            Triple.getVendor() == llvm::Triple::PC))
+    return "/usr/" + Triple.str() + "/lib/" + Loader;
+  return "/" + LibDir + "/" + Loader;
+}
+
 void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                       ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
@@ -3931,11 +4447,9 @@
   if (GCCInstallation.isValid()) {
     const auto &Callback = Multilibs.includeDirsCallback();
     if (Callback) {
-      const auto IncludePaths = Callback(GCCInstallation.getInstallPath(),
-                                         GCCInstallation.getTriple().str(),
-                                         GCCInstallation.getMultilib());
-      for (const auto &Path : IncludePaths)
-        addExternCSystemIncludeIfExists(DriverArgs, CC1Args, Path);
+      for (const auto &Path : Callback(GCCInstallation.getMultilib()))
+        addExternCSystemIncludeIfExists(
+            DriverArgs, CC1Args, GCCInstallation.getInstallPath() + Path);
     }
   }
 
@@ -4062,7 +4576,6 @@
   addExternCSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/include");
 }
 
-
 static std::string DetectLibcxxIncludePath(StringRef base) {
   std::error_code EC;
   int MaxVersion = 0;
@@ -4092,11 +4605,11 @@
   if (GetCXXStdlibType(DriverArgs) == ToolChain::CST_Libcxx) {
     const std::string LibCXXIncludePathCandidates[] = {
         DetectLibcxxIncludePath(getDriver().Dir + "/../include/c++"),
-
-        // We also check the system as for a long time this is the only place
-        // Clang looked.
-        // FIXME: We should really remove this. It doesn't make any sense.
-        DetectLibcxxIncludePath(getDriver().SysRoot + "/usr/include/c++")};
+        // If this is a development, non-installed, clang, libcxx will
+        // not be found at ../include/c++ but it is likely to be found at
+        // one of the following two locations:
+        DetectLibcxxIncludePath(getDriver().SysRoot + "/usr/local/include/c++"),
+        DetectLibcxxIncludePath(getDriver().SysRoot + "/usr/include/c++") };
     for (const auto &IncludePath : LibCXXIncludePathCandidates) {
       if (IncludePath.empty() || !getVFS().exists(IncludePath))
         continue;
@@ -4137,6 +4650,7 @@
   const std::string LibStdCXXIncludePathCandidates[] = {
       // Gentoo is weird and places its headers inside the GCC install,
       // so if the first attempt to find the headers fails, try these patterns.
+      InstallDir.str() + "/include/g++-v" + Version.Text,
       InstallDir.str() + "/include/g++-v" + Version.MajorStr + "." +
           Version.MinorStr,
       InstallDir.str() + "/include/g++-v" + Version.MajorStr,
@@ -4161,10 +4675,23 @@
   if (DriverArgs.hasArg(options::OPT_nocudainc))
     return;
 
-  if (CudaInstallation.isValid()) {
-    addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath());
-    CC1Args.push_back("-include");
-    CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
+  if (!CudaInstallation.isValid()) {
+    getDriver().Diag(diag::err_drv_no_cuda_installation);
+    return;
+  }
+
+  addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath());
+  CC1Args.push_back("-include");
+  CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
+}
+
+void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
+                                ArgStringList &CC1Args) const {
+  if (GCCInstallation.isValid()) {
+    CC1Args.push_back("-isystem");
+    CC1Args.push_back(DriverArgs.MakeArgString(
+        GCCInstallation.getParentLibPath() + "/../" +
+        GCCInstallation.getTriple().str() + "/include"));
   }
 }
 
@@ -4192,6 +4719,8 @@
     Res |= SanitizerKind::Thread;
   if (IsX86_64 || IsMIPS64 || IsPowerPC64 || IsAArch64)
     Res |= SanitizerKind::Memory;
+  if (IsX86_64)
+    Res |= SanitizerKind::Efficiency;
   if (IsX86 || IsX86_64) {
     Res |= SanitizerKind::Function;
   }
@@ -4251,21 +4780,46 @@
   Linux::addClangTargetOptions(DriverArgs, CC1Args);
   CC1Args.push_back("-fcuda-is-device");
 
+  if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
+                         options::OPT_fno_cuda_flush_denormals_to_zero, false))
+    CC1Args.push_back("-fcuda-flush-denormals-to-zero");
+
+  if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
+                         options::OPT_fno_cuda_approx_transcendentals, false))
+    CC1Args.push_back("-fcuda-approx-transcendentals");
+
   if (DriverArgs.hasArg(options::OPT_nocudalib))
     return;
 
-  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(
-      DriverArgs.getLastArgValue(options::OPT_march_EQ));
-  if (!LibDeviceFile.empty()) {
-    CC1Args.push_back("-mlink-cuda-bitcode");
-    CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+  StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
+  assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
+  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
 
-    // Libdevice in CUDA-7.0 requires PTX version that's more recent
-    // than LLVM defaults to. Use PTX4.2 which is the PTX version that
-    // came with CUDA-7.0.
-    CC1Args.push_back("-target-feature");
-    CC1Args.push_back("+ptx42");
+  if (LibDeviceFile.empty()) {
+    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
+    return;
   }
+
+  CC1Args.push_back("-mlink-cuda-bitcode");
+  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+
+  // Libdevice in CUDA-7.0 requires PTX version that's more recent
+  // than LLVM defaults to. Use PTX4.2 which is the PTX version that
+  // came with CUDA-7.0.
+  CC1Args.push_back("-target-feature");
+  CC1Args.push_back("+ptx42");
+}
+
+void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
+                                       ArgStringList &CC1Args) const {
+  // Check our CUDA version if we're going to include the CUDA headers.
+  if (!DriverArgs.hasArg(options::OPT_nocudainc) &&
+      !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
+    StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
+    assert(!Arch.empty() && "Must have an explicit GPU arch.");
+    CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
+  }
+  Linux::AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
 llvm::opt::DerivedArgList *
@@ -4308,8 +4862,10 @@
     DAL->append(A);
   }
 
-  if (BoundArch)
+  if (BoundArch) {
+    DAL->eraseArg(options::OPT_march_EQ);
     DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  }
   return DAL;
 }
 
@@ -4387,7 +4943,7 @@
 
 MyriadToolChain::MyriadToolChain(const Driver &D, const llvm::Triple &Triple,
                                  const ArgList &Args)
-    : Generic_GCC(D, Triple, Args) {
+    : Generic_ELF(D, Triple, Args) {
   // If a target of 'sparc-myriad-elf' is specified to clang, it wants to use
   // 'sparc-myriad--elf' (note the unknown OS) as the canonical triple.
   // This won't work to find gcc. Instead we give the installation detector an
@@ -4406,21 +4962,15 @@
 
   if (GCCInstallation.isValid()) {
     // The contents of LibDir are independent of the version of gcc.
-    // This contains libc, libg (a superset of libc), libm, libstdc++, libssp.
+    // This contains libc, libg, libm, libstdc++, libssp.
+    // The 'ma1x00' and 'nofpu' variants are irrelevant.
     SmallString<128> LibDir(GCCInstallation.getParentLibPath());
-    if (Triple.getArch() == llvm::Triple::sparcel)
-      llvm::sys::path::append(LibDir, "../sparc-myriad-elf/lib/le");
-    else
-      llvm::sys::path::append(LibDir, "../sparc-myriad-elf/lib");
+    llvm::sys::path::append(LibDir, "../sparc-myriad-elf/lib");
     addPathIfExists(D, LibDir, getFilePaths());
 
     // This directory contains crt{i,n,begin,end}.o as well as libgcc.
     // These files are tied to a particular version of gcc.
     SmallString<128> CompilerSupportDir(GCCInstallation.getInstallPath());
-    // There are actually 4 choices: {le,be} x {fpu,nofpu}
-    // but as this toolchain is for LEON sparc, it can assume FPU.
-    if (Triple.getArch() == llvm::Triple::sparcel)
-      llvm::sys::path::append(CompilerSupportDir, "le");
     addPathIfExists(D, CompilerSupportDir, getFilePaths());
   }
 }
@@ -4478,6 +5028,11 @@
 WebAssembly::WebAssembly(const Driver &D, const llvm::Triple &Triple,
                          const llvm::opt::ArgList &Args)
   : ToolChain(D, Triple, Args) {
+
+  assert(Triple.isArch32Bit() != Triple.isArch64Bit());
+  getFilePaths().push_back(
+      getDriver().SysRoot + "/lib" + (Triple.isArch32Bit() ? "32" : "64"));
+
   // Use LLD by default.
   DefaultLinker = "lld";
 }
@@ -4545,12 +5100,12 @@
   if (Args.hasArg(options::OPT_static))
     D.Diag(diag::err_drv_unsupported_opt_for_target) << "-static" << "PS4";
 
-  // Determine where to find the PS4 libraries. We use SCE_PS4_SDK_DIR
+  // Determine where to find the PS4 libraries. We use SCE_ORBIS_SDK_DIR
   // if it exists; otherwise use the driver's installation path, which
   // should be <SDK_DIR>/host_tools/bin.
 
   SmallString<512> PS4SDKDir;
-  if (const char *EnvValue = getenv("SCE_PS4_SDK_DIR")) {
+  if (const char *EnvValue = getenv("SCE_ORBIS_SDK_DIR")) {
     if (!llvm::sys::fs::exists(EnvValue))
       getDriver().Diag(clang::diag::warn_drv_ps4_sdk_dir) << EnvValue;
     PS4SDKDir = EnvValue;
diff --git a/lib/Driver/ToolChains.h b/lib/Driver/ToolChains.h
index b92f01e..61c559c 100644
--- a/lib/Driver/ToolChains.h
+++ b/lib/Driver/ToolChains.h
@@ -11,12 +11,13 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_H
 
 #include "Tools.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/VersionTuple.h"
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Multilib.h"
 #include "clang/Driver/ToolChain.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/Compiler.h"
 #include <set>
 #include <vector>
@@ -158,39 +159,52 @@
   GCCInstallationDetector GCCInstallation;
 
   // \brief A class to find a viable CUDA installation
-
   class CudaInstallationDetector {
-    bool IsValid;
+  private:
     const Driver &D;
-    std::string CudaInstallPath;
-    std::string CudaBinPath;
-    std::string CudaLibPath;
-    std::string CudaLibDevicePath;
-    std::string CudaIncludePath;
-    llvm::StringMap<std::string> CudaLibDeviceMap;
+    bool IsValid = false;
+    CudaVersion Version = CudaVersion::UNKNOWN;
+    std::string InstallPath;
+    std::string BinPath;
+    std::string LibPath;
+    std::string LibDevicePath;
+    std::string IncludePath;
+    llvm::StringMap<std::string> LibDeviceMap;
+
+    // CUDA architectures for which we have raised an error in
+    // CheckCudaVersionSupportsArch.
+    mutable llvm::SmallSet<CudaArch, 4> ArchsWithVersionTooLowErrors;
 
   public:
-    CudaInstallationDetector(const Driver &D) : IsValid(false), D(D) {}
+    CudaInstallationDetector(const Driver &D) : D(D) {}
     void init(const llvm::Triple &TargetTriple, const llvm::opt::ArgList &Args);
 
+    /// \brief Emit an error if Version does not support the given Arch.
+    ///
+    /// If either Version or Arch is unknown, does not emit an error.  Emits at
+    /// most one error per Arch.
+    void CheckCudaVersionSupportsArch(CudaArch Arch) const;
+
     /// \brief Check whether we detected a valid Cuda install.
     bool isValid() const { return IsValid; }
     /// \brief Print information about the detected CUDA installation.
     void print(raw_ostream &OS) const;
 
+    /// \brief Get the detected Cuda install's version.
+    CudaVersion version() const { return Version; }
     /// \brief Get the detected Cuda installation path.
-    StringRef getInstallPath() const { return CudaInstallPath; }
+    StringRef getInstallPath() const { return InstallPath; }
     /// \brief Get the detected path to Cuda's bin directory.
-    StringRef getBinPath() const { return CudaBinPath; }
+    StringRef getBinPath() const { return BinPath; }
     /// \brief Get the detected Cuda Include path.
-    StringRef getIncludePath() const { return CudaIncludePath; }
+    StringRef getIncludePath() const { return IncludePath; }
     /// \brief Get the detected Cuda library path.
-    StringRef getLibPath() const { return CudaLibPath; }
+    StringRef getLibPath() const { return LibPath; }
     /// \brief Get the detected Cuda device library path.
-    StringRef getLibDevicePath() const { return CudaLibDevicePath; }
+    StringRef getLibDevicePath() const { return LibDevicePath; }
     /// \brief Get libdevice file for given architecture
     std::string getLibDeviceFile(StringRef Gpu) const {
-      return CudaLibDeviceMap.lookup(Gpu);
+      return LibDeviceMap.lookup(Gpu);
     }
   };
 
@@ -298,9 +312,6 @@
   /// @name ToolChain Implementation
   /// {
 
-  std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args,
-                                          types::ID InputType) const override;
-
   types::ID LookupTypeForExtension(const char *Ext) const override;
 
   bool HasNativeLLVMSupport() const override;
@@ -498,7 +509,8 @@
 
   StringRef getPlatformFamily() const;
   static StringRef getSDKName(StringRef isysroot);
-  
+  StringRef getOSLibraryNameSuffix() const;
+
 public:
   /// }
   /// @name ToolChain Implementation
@@ -513,6 +525,7 @@
   TranslateArgs(const llvm::opt::DerivedArgList &Args,
                 const char *BoundArch) const override;
 
+  CXXStdlibType GetDefaultCXXStdlibType() const override;
   ObjCRuntime getDefaultObjCRuntime(bool isNonFragile) const override;
   bool hasBlocksRuntime() const override;
 
@@ -542,6 +555,8 @@
 
   bool UseSjLjExceptions(const llvm::opt::ArgList &Args) const override;
 
+  bool SupportsEmbeddedBitcode() const override;
+
   SanitizerMask getSupportedSanitizers() const override;
 };
 
@@ -554,6 +569,8 @@
   /// @name Apple ToolChain Implementation
   /// {
 
+  RuntimeLibType GetRuntimeLibType(const llvm::opt::ArgList &Args) const override;
+
   void AddLinkRuntimeLibArgs(const llvm::opt::ArgList &Args,
                              llvm::opt::ArgStringList &CmdArgs) const override;
 
@@ -615,7 +632,9 @@
   void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
                            llvm::opt::ArgStringList &CmdArgs) const override;
 
-  bool isPIEDefault() const override { return false; }
+  bool isPIEDefault() const override;
+  SanitizerMask getSupportedSanitizers() const override;
+  SanitizerMask getDefaultSanitizers() const override;
 
 protected:
   Tool *buildLinker() const override;
@@ -673,6 +692,18 @@
   void findGccLibDir();
 };
 
+class LLVM_LIBRARY_VISIBILITY Haiku : public Generic_ELF {
+public:
+  Haiku(const Driver &D, const llvm::Triple &Triple,
+          const llvm::opt::ArgList &Args);
+
+  bool isPIEDefault() const override { return getTriple().getArch() == llvm::Triple::x86_64; }
+
+  void
+  AddClangCXXStdlibIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                              llvm::opt::ArgStringList &CC1Args) const override;
+};
+
 class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF {
 public:
   OpenBSD(const Driver &D, const llvm::Triple &Triple,
@@ -700,7 +731,7 @@
   bool IsMathErrnoDefault() const override { return false; }
   bool IsObjCNonFragileABIDefault() const override { return true; }
 
-  CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override;
+  CXXStdlibType GetDefaultCXXStdlibType() const override;
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
@@ -724,10 +755,12 @@
   bool IsMathErrnoDefault() const override { return false; }
   bool IsObjCNonFragileABIDefault() const override { return true; }
 
-  CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override;
+  CXXStdlibType GetDefaultCXXStdlibType() const override;
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
+  void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
+                           llvm::opt::ArgStringList &CmdArgs) const override;
 
   bool UseSjLjExceptions(const llvm::opt::ArgList &Args) const override;
   bool isPIEDefault() const override;
@@ -750,7 +783,7 @@
   bool IsMathErrnoDefault() const override { return false; }
   bool IsObjCNonFragileABIDefault() const override { return true; }
 
-  CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override;
+  CXXStdlibType GetDefaultCXXStdlibType() const override;
 
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
@@ -799,12 +832,16 @@
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
+  void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
   bool isPIEDefault() const override;
   SanitizerMask getSupportedSanitizers() const override;
   void addProfileRTLibs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const override;
   virtual std::string computeSysRoot() const;
 
+  virtual std::string getDynamicLinker(const llvm::opt::ArgList &Args) const;
+
   std::vector<std::string> ExtraOpts;
 
 protected:
@@ -827,6 +864,16 @@
   // ptxas.
   bool useIntegratedAs() const override { return false; }
 
+  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+
+  const Generic_GCC::CudaInstallationDetector &cudaInstallation() const {
+    return CudaInstallation;
+  }
+  Generic_GCC::CudaInstallationDetector &cudaInstallation() {
+    return CudaInstallation;
+  }
+
 protected:
   Tool *buildAssembler() const override;  // ptxas
   Tool *buildLinker() const override;     // fatbinary (ok, not really a linker)
@@ -868,6 +915,14 @@
   std::string LibSuffix;
 };
 
+class LLVM_LIBRARY_VISIBILITY LanaiToolChain : public Generic_ELF {
+public:
+  LanaiToolChain(const Driver &D, const llvm::Triple &Triple,
+                 const llvm::opt::ArgList &Args)
+      : Generic_ELF(D, Triple, Args) {}
+  bool IsIntegratedAssemblerDefault() const override { return true; }
+};
+
 class LLVM_LIBRARY_VISIBILITY HexagonToolChain : public Linux {
 protected:
   GCCVersion GCCLibAndIncVersion;
@@ -912,6 +967,7 @@
 public:
   AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
+  unsigned GetDefaultDwarfVersion() const override { return 2; }
   bool IsIntegratedAssemblerDefault() const override { return true; }
 };
 
@@ -999,6 +1055,7 @@
   bool getVisualStudioInstallDir(std::string &path) const;
   bool getVisualStudioBinariesFolder(const char *clangProgramPath,
                                      std::string &path) const;
+  VersionTuple getMSVCVersionFromExe() const override;
 
   std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args,
                                           types::ID InputType) const override;
@@ -1076,7 +1133,7 @@
 
 /// MyriadToolChain - A tool chain using either clang or the external compiler
 /// installed by the Movidius SDK to perform all subcommands.
-class LLVM_LIBRARY_VISIBILITY MyriadToolChain : public Generic_GCC {
+class LLVM_LIBRARY_VISIBILITY MyriadToolChain : public Generic_ELF {
 public:
   MyriadToolChain(const Driver &D, const llvm::Triple &Triple,
                   const llvm::opt::ArgList &Args);
diff --git a/lib/Driver/Tools.cpp b/lib/Driver/Tools.cpp
index bbe6437..e59119a 100644
--- a/lib/Driver/Tools.cpp
+++ b/lib/Driver/Tools.cpp
@@ -96,6 +96,14 @@
           .Case("niagara2", "-Av8plusb")
           .Case("niagara3", "-Av8plusd")
           .Case("niagara4", "-Av8plusd")
+          .Case("leon2", "-Av8")
+          .Case("at697e", "-Av8")
+          .Case("at697f", "-Av8")
+          .Case("leon3", "-Av8")
+          .Case("ut699", "-Av8")
+          .Case("gr712rc", "-Av8")
+          .Case("leon4", "-Av8")
+          .Case("gr740", "-Av8")
           .Default("-Av8");
   }
 }
@@ -288,13 +296,34 @@
          !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput);
 }
 
+/// Run \a Work on \a RegularToolChain first, then on the offloading tool chain
+/// associated with the action \a JA, if there is one.
+static void
+forAllAssociatedToolChains(Compilation &C, const JobAction &JA,
+                           const ToolChain &RegularToolChain,
+                           llvm::function_ref<void(const ToolChain &)> Work) {
+  Work(RegularToolChain);
+
+  // CUDA: a host-offloading action also visits the OFK_Cuda tool chain, and a
+  // device-offloading action also visits the OFK_Host tool chain.
+  const bool IsCudaHost = JA.isHostOffloading(Action::OFK_Cuda);
+  const bool IsCudaDevice =
+      !IsCudaHost && JA.isDeviceOffloading(Action::OFK_Cuda);
+  if (IsCudaHost)
+    Work(*C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+  if (IsCudaDevice)
+    Work(*C.getSingleOffloadToolChain<Action::OFK_Host>());
+
+  // TODO: Add support for other offloading programming models here.
+}
+
 void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
                                     const Driver &D, const ArgList &Args,
                                     ArgStringList &CmdArgs,
                                     const InputInfo &Output,
-                                    const InputInfoList &Inputs,
-                                    const ToolChain *AuxToolChain) const {
+                                    const InputInfoList &Inputs) const {
   Arg *A;
+  const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();
 
   CheckPreprocessingOptions(D, Args);
 
@@ -381,14 +410,86 @@
     }
   }
 
+  // Add offload include arguments specific for CUDA.  This must happen before
+  // we -I or -include anything else, because we must pick up the CUDA headers
+  // from the particular CUDA installation, rather than from e.g.
+  // /usr/local/include.
+  if (JA.isOffloading(Action::OFK_Cuda))
+    getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+
   // Add -i* options, and automatically translate to
   // -include-pch/-include-pth for transparent PCH support. It's
   // wonky, but we include looking for .gch so we can support seamless
   // replacement into a build system already set up to be generating
   // .gch files.
+  int YcIndex = -1, YuIndex = -1;
+  {
+    int AI = -1;
+    const Arg *YcArg = Args.getLastArg(options::OPT__SLASH_Yc);
+    const Arg *YuArg = Args.getLastArg(options::OPT__SLASH_Yu);
+    for (const Arg *A : Args.filtered(options::OPT_clang_i_Group)) {
+      // Walk the whole i_Group and skip non "-include" flags so that the index
+      // here matches the index in the next loop below.
+      ++AI;
+      if (!A->getOption().matches(options::OPT_include))
+        continue;
+      if (YcArg && strcmp(A->getValue(), YcArg->getValue()) == 0)
+        YcIndex = AI;
+      if (YuArg && strcmp(A->getValue(), YuArg->getValue()) == 0)
+        YuIndex = AI;
+    }
+  }
+  if (isa<PrecompileJobAction>(JA) && YcIndex != -1) {
+    Driver::InputList Inputs;
+    D.BuildInputs(getToolChain(), C.getArgs(), Inputs);
+    assert(Inputs.size() == 1 && "Need one input when building pch");
+    CmdArgs.push_back(Args.MakeArgString(Twine("-find-pch-source=") +
+                                         Inputs[0].second->getValue()));
+  }
+
   bool RenderedImplicitInclude = false;
+  int AI = -1;
   for (const Arg *A : Args.filtered(options::OPT_clang_i_Group)) {
-    if (A->getOption().matches(options::OPT_include)) {
+    ++AI;
+
+    if (getToolChain().getDriver().IsCLMode() &&
+        A->getOption().matches(options::OPT_include)) {
+      // In clang-cl mode, /Ycfoo.h means that all code up to a foo.h
+      // include is compiled into foo.h, and everything after goes into
+      // the .obj file. /Yufoo.h means that all includes prior to and including
+      // foo.h are completely skipped and replaced with a use of the pch file
+      // for foo.h.  (Each flag can have at most one value, multiple /Yc flags
+      // just mean that the last one wins.)  If /Yc and /Yu are both present
+      // and refer to the same file, /Yc wins.
+      // Note that OPT__SLASH_FI gets mapped to OPT_include.
+      // FIXME: The code here assumes that /Yc and /Yu refer to the same file.
+      // cl.exe seems to support both flags with different values, but that
+      // seems strange (which flag does /Fp now refer to?), so don't implement
+      // that until someone needs it.
+      int PchIndex = YcIndex != -1 ? YcIndex : YuIndex;
+      if (PchIndex != -1) {
+        if (isa<PrecompileJobAction>(JA)) {
+          // When building the pch, skip all includes after the pch.
+          assert(YcIndex != -1 && PchIndex == YcIndex);
+          if (AI >= YcIndex)
+            continue;
+        } else {
+          // When using the pch, skip all includes prior to the pch.
+          if (AI < PchIndex) {
+            A->claim();
+            continue;
+          }
+          if (AI == PchIndex) {
+            A->claim();
+            CmdArgs.push_back("-include-pch");
+            CmdArgs.push_back(
+                Args.MakeArgString(D.GetClPchPath(C, A->getValue())));
+            continue;
+          }
+        }
+      }
+    } else if (A->getOption().matches(options::OPT_include)) {
+      // Handling of gcc-style gch precompiled headers.
       bool IsFirstImplicitInclude = !RenderedImplicitInclude;
       RenderedImplicitInclude = true;
 
@@ -436,6 +537,13 @@
                                                        << A->getAsString(Args);
         }
       }
+    } else if (A->getOption().matches(options::OPT_isystem_after)) {
+      // Handling of paths which must come late.  These entries are handled by
+      // the toolchain itself after the resource dir is inserted in the right
+      // search order.
+      // Do not claim the argument so that the use of the argument does not
+      // silently go unnoticed on toolchains which do not honour the option.
+      continue;
     }
 
     // Not translated, render as usual.
@@ -485,26 +593,27 @@
   // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
   addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
 
-  // Optional AuxToolChain indicates that we need to include headers
-  // for more than one target. If that's the case, add include paths
-  // from AuxToolChain right after include paths of the same kind for
-  // the current target.
+  // While adding the include arguments, we also attempt to retrieve the
+  // arguments of related offloading toolchains or arguments that are specific
+  // to an offloading programming model.
 
   // Add C++ include arguments, if needed.
-  if (types::isCXX(Inputs[0].getType())) {
-    getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+  if (types::isCXX(Inputs[0].getType()))
+    forAllAssociatedToolChains(C, JA, getToolChain(),
+                               [&Args, &CmdArgs](const ToolChain &TC) {
+                                 TC.AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+                               });
+
+  // Add system include arguments for all targets but IAMCU.
+  if (!IsIAMCU)
+    forAllAssociatedToolChains(C, JA, getToolChain(),
+                               [&Args, &CmdArgs](const ToolChain &TC) {
+                                 TC.AddClangSystemIncludeArgs(Args, CmdArgs);
+                               });
+  else {
+    // For IAMCU add special include arguments.
+    getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs);
   }
-
-  // Add system include arguments.
-  getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
-  if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
-
-  // Add CUDA include arguments, if needed.
-  if (types::isCuda(Inputs[0].getType()))
-    getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
 }
 
 // FIXME: Move to target hook.
@@ -658,7 +767,7 @@
 // -mfloat-abi=.
 arm::FloatABI arm::getARMFloatABI(const ToolChain &TC, const ArgList &Args) {
   const Driver &D = TC.getDriver();
-  const llvm::Triple Triple(TC.ComputeEffectiveClangTriple(Args));
+  const llvm::Triple &Triple = TC.getEffectiveTriple();
   auto SubArch = getARMSubArchVersionNumber(Triple);
   arm::FloatABI ABI = FloatABI::Invalid;
   if (Arg *A =
@@ -725,10 +834,12 @@
     default:
       switch (Triple.getEnvironment()) {
       case llvm::Triple::GNUEABIHF:
+      case llvm::Triple::MuslEABIHF:
       case llvm::Triple::EABIHF:
         ABI = FloatABI::Hard;
         break;
       case llvm::Triple::GNUEABI:
+      case llvm::Triple::MuslEABI:
       case llvm::Triple::EABI:
         // EABI is always AAPCS, and if it was not marked 'hard', it's softfp
         ABI = FloatABI::SoftFP;
@@ -885,10 +996,6 @@
       Features.push_back("-crc");
   }
 
-  if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v8_1a) {
-    Features.insert(Features.begin(), "+v8.1a");
-  }
-
   // Look for the last occurrence of -mlong-calls or -mno-long-calls. If
   // neither options are specified, see if we are compiling for kernel/kext and
   // decide whether to pass "+long-calls" based on the OS and its version.
@@ -910,6 +1017,10 @@
       // No v6M core supports unaligned memory access (v6M ARM ARM A3.2).
       if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m)
         D.Diag(diag::err_target_unsupported_unaligned) << "v6m";
+      // v8M Baseline follows on from v6M, so doesn't support unaligned memory
+      // access either.
+      else if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v8m_baseline)
+        D.Diag(diag::err_target_unsupported_unaligned) << "v8m.base";
     } else
       Features.push_back("+strict-align");
   } else {
@@ -974,6 +1085,8 @@
     case llvm::Triple::Android:
     case llvm::Triple::GNUEABI:
     case llvm::Triple::GNUEABIHF:
+    case llvm::Triple::MuslEABI:
+    case llvm::Triple::MuslEABIHF:
       ABIName = "aapcs-linux";
       break;
     case llvm::Triple::EABIHF:
@@ -1055,8 +1168,7 @@
 
 void Clang::AddAArch64TargetArgs(const ArgList &Args,
                                  ArgStringList &CmdArgs) const {
-  std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args);
-  llvm::Triple Triple(TripleStr);
+  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
   if (!Args.hasFlag(options::OPT_mred_zone, options::OPT_mno_red_zone, true) ||
       Args.hasArg(options::OPT_mkernel) ||
@@ -1118,8 +1230,10 @@
   }
 
   // MIPS64r6 is the default for Android MIPS64 (mips64el-linux-android).
-  if (Triple.isAndroid())
+  if (Triple.isAndroid()) {
+    DefMips32CPU = "mips32";
     DefMips64CPU = "mips64r6";
+  }
 
   // MIPS3 is the default for mips64*-unknown-openbsd.
   if (Triple.getOS() == llvm::Triple::OpenBSD)
@@ -1154,6 +1268,30 @@
     }
   }
 
+  if (ABIName.empty() &&
+      (Triple.getVendor() == llvm::Triple::MipsTechnologies ||
+       Triple.getVendor() == llvm::Triple::ImaginationTechnologies)) {
+    ABIName = llvm::StringSwitch<const char *>(CPUName)
+                  .Case("mips1", "o32")
+                  .Case("mips2", "o32")
+                  .Case("mips3", "n64")
+                  .Case("mips4", "n64")
+                  .Case("mips5", "n64")
+                  .Case("mips32", "o32")
+                  .Case("mips32r2", "o32")
+                  .Case("mips32r3", "o32")
+                  .Case("mips32r5", "o32")
+                  .Case("mips32r6", "o32")
+                  .Case("mips64", "n64")
+                  .Case("mips64r2", "n64")
+                  .Case("mips64r3", "n64")
+                  .Case("mips64r5", "n64")
+                  .Case("mips64r6", "n64")
+                  .Case("octeon", "n64")
+                  .Case("p5600", "o32")
+                  .Default("");
+  }
+
   if (ABIName.empty()) {
     // Deduce ABI name from the target triple.
     if (Triple.getArch() == llvm::Triple::mips ||
@@ -1166,7 +1304,7 @@
   if (CPUName.empty()) {
     // Deduce CPU name from ABI name.
     CPUName = llvm::StringSwitch<const char *>(ABIName)
-                  .Cases("o32", "eabi", DefMips32CPU)
+                  .Case("o32", DefMips32CPU)
                   .Cases("n32", "n64", DefMips64CPU)
                   .Default("");
   }
@@ -1292,8 +1430,9 @@
   AddTargetFeature(Args, Features, options::OPT_mmsa, options::OPT_mno_msa,
                    "msa");
 
-  // Add the last -mfp32/-mfpxx/-mfp64 or if none are given and the ABI is O32
-  // pass -mfpxx
+  // Add the last -mfp32/-mfpxx/-mfp64, if none are given and the ABI is O32
+  // pass -mfpxx, or if none are given and fp64a is default, pass fp64 and
+  // nooddspreg.
   if (Arg *A = Args.getLastArg(options::OPT_mfp32, options::OPT_mfpxx,
                                options::OPT_mfp64)) {
     if (A->getOption().matches(options::OPT_mfp32))
@@ -1306,6 +1445,9 @@
   } else if (mips::shouldUseFPXX(Args, Triple, CPUName, ABIName, FloatABI)) {
     Features.push_back(Args.MakeArgString("+fpxx"));
     Features.push_back(Args.MakeArgString("+nooddspreg"));
+  } else if (mips::isFP64ADefault(Triple, CPUName)) {
+    Features.push_back(Args.MakeArgString("+fp64"));
+    Features.push_back(Args.MakeArgString("+nooddspreg"));
   }
 
   AddTargetFeature(Args, Features, options::OPT_mno_odd_spreg,
@@ -1365,6 +1507,19 @@
     CmdArgs.push_back(Args.MakeArgString("-mips-ssection-threshold=" + v));
     A->claim();
   }
+
+  if (Arg *A = Args.getLastArg(options::OPT_mcompact_branches_EQ)) {
+    StringRef Val = StringRef(A->getValue());
+    if (mips::hasCompactBranches(CPUName)) {
+      if (Val == "never" || Val == "always" || Val == "optimal") {
+        CmdArgs.push_back("-mllvm");
+        CmdArgs.push_back(Args.MakeArgString("-mips-compact-branches=" + Val));
+      } else
+        D.Diag(diag::err_drv_unsupported_option_argument)
+            << A->getOption().getName() << Val;
+    } else
+      D.Diag(diag::warn_target_unsupported_compact_branches) << CPUName;
+  }
 }
 
 /// getPPCTargetCPU - Get the (LLVM) name of the PowerPC cpu we are targeting.
@@ -1414,6 +1569,7 @@
         .Case("power6x", "pwr6x")
         .Case("power7", "pwr7")
         .Case("power8", "pwr8")
+        .Case("power9", "pwr9")
         .Case("pwr3", "pwr3")
         .Case("pwr4", "pwr4")
         .Case("pwr5", "pwr5")
@@ -1422,6 +1578,7 @@
         .Case("pwr6x", "pwr6x")
         .Case("pwr7", "pwr7")
         .Case("pwr8", "pwr8")
+        .Case("pwr9", "pwr9")
         .Case("powerpc", "ppc")
         .Case("powerpc64", "ppc64")
         .Case("powerpc64le", "ppc64le")
@@ -1560,27 +1717,77 @@
   return "";
 }
 
+/// Return the value of the last -mcpu= argument, or "" if there is none.
+static std::string getLanaiTargetCPU(const ArgList &Args) {
+  const Arg *CPUArg = Args.getLastArg(options::OPT_mcpu_EQ);
+  // No -mcpu= given: fall back to the empty string.
+  return CPUArg ? CPUArg->getValue() : "";
+}
+
+/// Determine the SPARC float ABI from the last of -msoft-float, -mhard-float
+/// and -mfloat-abi=.  Only the hard-float ABI on Sparc is standardized, and
+/// it is the default; the soft-float mode supported by GCC and implemented
+/// in LLVM is nonstandard, so it is used only on explicit request.
+sparc::FloatABI sparc::getSparcFloatABI(const Driver &D,
+                                        const ArgList &Args) {
+  Arg *FloatArg =
+      Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float,
+                      options::OPT_mfloat_abi_EQ);
+
+  // Nothing specified: fall back to the default.
+  if (!FloatArg)
+    return sparc::FloatABI::Hard;
+
+  if (FloatArg->getOption().matches(options::OPT_msoft_float))
+    return sparc::FloatABI::Soft;
+  if (FloatArg->getOption().matches(options::OPT_mhard_float))
+    return sparc::FloatABI::Hard;
+
+  // What remains is -mfloat-abi=<value>; only the values "soft" and "hard"
+  // are recognized.
+  StringRef Requested = FloatArg->getValue();
+  if (Requested == "soft")
+    return sparc::FloatABI::Soft;
+  if (Requested == "hard")
+    return sparc::FloatABI::Hard;
+
+  // An unknown non-empty value is diagnosed, an empty one is accepted
+  // silently; either way the result is the hard-float default.
+  if (!Requested.empty())
+    D.Diag(diag::err_drv_invalid_mfloat_abi) << FloatArg->getAsString(Args);
+
+  return sparc::FloatABI::Hard;
+}
+
+/// Append "+soft-float" to \p Features when the soft-float ABI is selected.
+static void getSparcTargetFeatures(const Driver &D, const ArgList &Args,
+                                   std::vector<const char *> &Features) {
+  if (sparc::getSparcFloatABI(D, Args) == sparc::FloatABI::Soft)
+    Features.push_back("+soft-float");
+}
+
 void Clang::AddSparcTargetArgs(const ArgList &Args,
                                ArgStringList &CmdArgs) const {
-  const Driver &D = getToolChain().getDriver();
-  std::string Triple = getToolChain().ComputeEffectiveClangTriple(Args);
+  sparc::FloatABI FloatABI =
+      sparc::getSparcFloatABI(getToolChain().getDriver(), Args);
 
-  bool SoftFloatABI = false;
-  if (Arg *A =
-          Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float)) {
-    if (A->getOption().matches(options::OPT_msoft_float))
-      SoftFloatABI = true;
+  if (FloatABI == sparc::FloatABI::Soft) {
+    // Floating point operations and argument passing are soft.
+    CmdArgs.push_back("-msoft-float");
+    CmdArgs.push_back("-mfloat-abi");
+    CmdArgs.push_back("soft");
+  } else {
+    // Floating point operations and argument passing are hard.
+    assert(FloatABI == sparc::FloatABI::Hard && "Invalid float abi!");
+    CmdArgs.push_back("-mfloat-abi");
+    CmdArgs.push_back("hard");
   }
+}
 
-  // Only the hard-float ABI on Sparc is standardized, and it is the
-  // default. GCC also supports a nonstandard soft-float ABI mode, and
-  // perhaps LLVM should implement that, too. However, since llvm
-  // currently does not support Sparc soft-float, at all, display an
-  // error if it's requested.
-  if (SoftFloatABI) {
-    D.Diag(diag::err_drv_unsupported_opt_for_target) << "-msoft-float"
-                                                     << Triple;
-  }
+void Clang::AddSystemZTargetArgs(const ArgList &Args,
+                                 ArgStringList &CmdArgs) const {
+  if (Args.hasFlag(options::OPT_mbackchain, options::OPT_mno_backchain, false))
+    CmdArgs.push_back("-mbackchain");
 }
 
 static const char *getSystemZTargetCPU(const ArgList &Args) {
@@ -1777,6 +1984,9 @@
     return "hexagon" +
            toolchains::HexagonToolChain::GetTargetCPUVersion(Args).str();
 
+  case llvm::Triple::lanai:
+    return getLanaiTargetCPU(Args);
+
   case llvm::Triple::systemz:
     return getSystemZTargetCPU(Args);
 
@@ -1823,6 +2033,17 @@
 
   if (IsThinLTO)
     CmdArgs.push_back("-plugin-opt=thinlto");
+
+  // If an explicit debugger tuning argument appeared, pass it along.
+  if (Arg *A = Args.getLastArg(options::OPT_gTune_Group,
+                               options::OPT_ggdbN_Group)) {
+    if (A->getOption().matches(options::OPT_glldb))
+      CmdArgs.push_back("-plugin-opt=-debugger-tune=lldb");
+    else if (A->getOption().matches(options::OPT_gsce))
+      CmdArgs.push_back("-plugin-opt=-debugger-tune=sce");
+    else
+      CmdArgs.push_back("-plugin-opt=-debugger-tune=gdb");
+  }
 }
 
 /// This is a helper function for validating the optional refinement step
@@ -2051,6 +2272,13 @@
           << A->getOption().getName() << Value;
     }
   }
+
+  // Set flags to support MCU ABI.
+  if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false)) {
+    CmdArgs.push_back("-mfloat-abi");
+    CmdArgs.push_back("soft");
+    CmdArgs.push_back("-mstack-alignment=4");
+  }
 }
 
 void Clang::AddHexagonTargetArgs(const ArgList &Args,
@@ -2075,6 +2303,29 @@
   CmdArgs.push_back("-machine-sink-split=0");
 }
 
+/// Add Lanai-specific cc1 arguments: forward -mcpu= as -target-cpu and
+/// validate -mregparm=, for which only the value 4 is accepted.
+void Clang::AddLanaiTargetArgs(const ArgList &Args,
+                               ArgStringList &CmdArgs) const {
+  if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
+    CmdArgs.push_back("-target-cpu");
+    CmdArgs.push_back(Args.MakeArgString(A->getValue()));
+  }
+  if (Arg *A = Args.getLastArg(options::OPT_mregparm_EQ)) {
+    StringRef Value = A->getValue();
+    // Only support mregparm=4 to support old usage. Report error for all other
+    // cases.
+    // StringRef::getAsInteger returns true on *failure*, so a value that does
+    // not parse must be diagnosed as well and Mregparm must not be read then.
+    int Mregparm = 0;
+    if (Value.getAsInteger(10, Mregparm) || Mregparm != 4) {
+      getToolChain().getDriver().Diag(
+          diag::err_drv_unsupported_option_argument)
+          << A->getOption().getName() << Value;
+    }
+  }
+}
+
 void Clang::AddWebAssemblyTargetArgs(const ArgList &Args,
                                      ArgStringList &CmdArgs) const {
   // Default to "hidden" visibility.
@@ -2092,22 +2343,8 @@
   text.split(Split, StringRef("+"), -1, false);
 
   for (StringRef Feature : Split) {
-    const char *result = llvm::StringSwitch<const char *>(Feature)
-                             .Case("fp", "+fp-armv8")
-                             .Case("simd", "+neon")
-                             .Case("crc", "+crc")
-                             .Case("crypto", "+crypto")
-                             .Case("fp16", "+fullfp16")
-                             .Case("profile", "+spe")
-                             .Case("nofp", "-fp-armv8")
-                             .Case("nosimd", "-neon")
-                             .Case("nocrc", "-crc")
-                             .Case("nocrypto", "-crypto")
-                             .Case("nofp16", "-fullfp16")
-                             .Case("noprofile", "-spe")
-                             .Default(nullptr);
-    if (result)
-      Features.push_back(result);
+    if (const char *FeatureName = llvm::AArch64::getArchExtFeature(Feature))
+      Features.push_back(FeatureName);
     else if (Feature == "neon" || Feature == "noneon")
       D.Diag(diag::err_drv_no_neon_modifier);
     else
@@ -2122,19 +2359,16 @@
                               std::vector<const char *> &Features) {
   std::pair<StringRef, StringRef> Split = Mcpu.split("+");
   CPU = Split.first;
-  if (CPU == "cortex-a53" || CPU == "cortex-a57" || CPU == "cortex-a72" ||
-      CPU == "cortex-a35" || CPU == "exynos-m1") {
-    Features.push_back("+neon");
-    Features.push_back("+crc");
-    Features.push_back("+crypto");
-  } else if (CPU == "cyclone") {
-    Features.push_back("+neon");
-    Features.push_back("+crypto");
-  } else if (CPU == "generic") {
+
+  if (CPU == "generic") {
     Features.push_back("+neon");
   } else {
-    return false;
-  }
+    unsigned ArchKind = llvm::AArch64::parseCPUArch(CPU);
+    unsigned Extersion = llvm::AArch64::getDefaultExtensions(CPU, ArchKind);
+
+    if (!llvm::AArch64::getExtensionFeatures(Extersion, Features))
+      return false;
+   }
 
   if (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features))
     return false;
@@ -2149,17 +2383,10 @@
   std::string MarchLowerCase = March.lower();
   std::pair<StringRef, StringRef> Split = StringRef(MarchLowerCase).split("+");
 
-  if (Split.first == "armv8-a" || Split.first == "armv8a") {
-    // ok, no additional features.
-  } else if (Split.first == "armv8.1-a" || Split.first == "armv8.1a") {
-    Features.push_back("+v8.1a");
-  } else if (Split.first == "armv8.2-a" || Split.first == "armv8.2a" ) {
-    Features.push_back("+v8.2a");
-  } else {
-    return false;
-  }
-
-  if (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features))
+  unsigned ArchKind = llvm::AArch64::parseArch(Split.first);
+  if (ArchKind == static_cast<unsigned>(llvm::AArch64::ArchKind::AK_INVALID) ||
+      !llvm::AArch64::getArchFeatures(ArchKind, Features) ||
+      (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features)))
     return false;
 
   return true;
@@ -2286,6 +2513,23 @@
   handleTargetFeaturesGroup(Args, Features, options::OPT_m_wasm_Features_Group);
 }
 
+/// Collect AMDGPU target features: those implied by -mamdgpu-debugger-abi=
+/// (only the value "1.0" is accepted) followed by the AMDGPU feature group.
+static void getAMDGPUTargetFeatures(const Driver &D, const ArgList &Args,
+                                    std::vector<const char *> &Features) {
+  if (const Arg *ABIArg = Args.getLastArg(options::OPT_mamdgpu_debugger_abi)) {
+    if (StringRef(ABIArg->getValue()) != "1.0") {
+      D.Diag(diag::err_drv_clang_unsupported) << ABIArg->getAsString(Args);
+    } else {
+      Features.push_back("+amdgpu-debugger-insert-nops");
+      Features.push_back("+amdgpu-debugger-reserve-regs");
+      Features.push_back("+amdgpu-debugger-emit-prologue");
+    }
+  }
+  handleTargetFeaturesGroup(Args, Features,
+                            options::OPT_m_amdgpu_Features_Group);
+}
+
 static void getTargetFeatures(const ToolChain &TC, const llvm::Triple &Triple,
                               const ArgList &Args, ArgStringList &CmdArgs,
                               bool ForAS) {
@@ -2330,6 +2574,15 @@
   case llvm::Triple::wasm32:
   case llvm::Triple::wasm64:
     getWebAssemblyTargetFeatures(Args, Features);
+    break; 
+  case llvm::Triple::sparc:
+  case llvm::Triple::sparcel:
+  case llvm::Triple::sparcv9:
+    getSparcTargetFeatures(D, Args, Features);
+    break;
+  case llvm::Triple::r600:
+  case llvm::Triple::amdgcn:
+    getAMDGPUTargetFeatures(D, Args, Features);
     break;
   }
 
@@ -2410,11 +2663,9 @@
   }
 
   if (types::isCXX(InputType)) {
-    // Disable C++ EH by default on XCore, PS4, and MSVC.
-    // FIXME: Remove MSVC from this list once things work.
-    bool CXXExceptionsEnabled = Triple.getArch() != llvm::Triple::xcore &&
-                                !Triple.isPS4CPU() &&
-                                !Triple.isWindowsMSVCEnvironment();
+    // Disable C++ EH by default on XCore and PS4.
+    bool CXXExceptionsEnabled =
+        Triple.getArch() != llvm::Triple::xcore && !Triple.isPS4CPU();
     Arg *ExceptionArg = Args.getLastArg(
         options::OPT_fcxx_exceptions, options::OPT_fno_cxx_exceptions,
         options::OPT_fexceptions, options::OPT_fno_exceptions);
@@ -2473,8 +2724,8 @@
   if (isa<CompileJobAction>(A) || isa<BackendJobAction>(A))
     return true;
 
-  for (const auto &Act : *A)
-    if (ContainsCompileAction(Act))
+  for (const auto &AI : A->inputs())
+    if (ContainsCompileAction(AI))
       return true;
 
   return false;
@@ -2579,6 +2830,27 @@
                    DefaultIncrementalLinkerCompatible))
     CmdArgs.push_back("-mincremental-linker-compatible");
 
+  switch (C.getDefaultToolChain().getArch()) {
+  case llvm::Triple::arm:
+  case llvm::Triple::armeb:
+  case llvm::Triple::thumb:
+  case llvm::Triple::thumbeb:
+    if (Arg *A = Args.getLastArg(options::OPT_mimplicit_it_EQ)) {
+      StringRef Value = A->getValue();
+      if (Value == "always" || Value == "never" || Value == "arm" ||
+          Value == "thumb") {
+        CmdArgs.push_back("-mllvm");
+        CmdArgs.push_back(Args.MakeArgString("-arm-implicit-it=" + Value));
+      } else {
+        D.Diag(diag::err_drv_unsupported_option_argument)
+            << A->getOption().getName() << Value;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
   // When passing -I arguments to the assembler we sometimes need to
   // unconditionally take the next argument.  For example, when parsing
   // '-Wa,-I -Wa,foo' we need to accept the -Wa,foo arg after seeing the
@@ -2589,6 +2861,8 @@
   // When using an integrated assembler, translate -Wa, and -Xassembler
   // options.
   bool CompressDebugSections = false;
+
+  bool UseRelaxRelocations = ENABLE_X86_RELAX_RELOCATIONS;
   const char *MipsTargetFeature = nullptr;
   for (const Arg *A :
        Args.filtered(options::OPT_Wa_COMMA, options::OPT_Xassembler)) {
@@ -2664,6 +2938,12 @@
       } else if (Value == "-nocompress-debug-sections" ||
                  Value == "--nocompress-debug-sections") {
         CompressDebugSections = false;
+      } else if (Value == "-mrelax-relocations=yes" ||
+                 Value == "--mrelax-relocations=yes") {
+        UseRelaxRelocations = true;
+      } else if (Value == "-mrelax-relocations=no" ||
+                 Value == "--mrelax-relocations=no") {
+        UseRelaxRelocations = false;
       } else if (Value.startswith("-I")) {
         CmdArgs.push_back(Value.data());
         // We need to consume the next argument if the current arg is a plain
@@ -2695,6 +2975,8 @@
     else
       D.Diag(diag::warn_debug_compression_unavailable);
   }
+  if (UseRelaxRelocations)
+    CmdArgs.push_back("--mrelax-relocations");
   if (MipsTargetFeature != nullptr) {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back(MipsTargetFeature);
@@ -2869,12 +3151,17 @@
     StaticRuntimes.push_back("safestack");
   if (SanArgs.needsCfiRt())
     StaticRuntimes.push_back("cfi");
-  if (SanArgs.needsCfiDiagRt())
+  if (SanArgs.needsCfiDiagRt()) {
     StaticRuntimes.push_back("cfi_diag");
+    if (SanArgs.linkCXXRuntimes())
+      StaticRuntimes.push_back("ubsan_standalone_cxx");
+  }
   if (SanArgs.needsStatsRt()) {
     NonWholeStaticRuntimes.push_back("stats");
     RequiredSymbols.push_back("__sanitizer_stats_register");
   }
+  if (SanArgs.needsEsanRt())
+    StaticRuntimes.push_back("esan");
 }
 
 // Should be called before we add system libraries (C++ ABI, libstdc++/libc++,
@@ -2910,6 +3197,33 @@
   return !StaticRuntimes.empty();
 }
 
+/// Add the XRay runtime if -fxray-instrument is on; returns true if added.
+static bool addXRayRuntime(const ToolChain &TC, const ArgList &Args,
+                           ArgStringList &CmdArgs) {
+  if (!Args.hasFlag(options::OPT_fxray_instrument,
+                    options::OPT_fnoxray_instrument, false))
+    return false;
+  CmdArgs.push_back("-whole-archive");
+  CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray", false));
+  CmdArgs.push_back("-no-whole-archive");
+  return true;
+}
+
+/// Append the libraries the XRay runtime depends on to \p CmdArgs.
+static void linkXRayRuntimeDeps(const ToolChain &TC, const ArgList &Args,
+                                ArgStringList &CmdArgs) {
+  CmdArgs.push_back("--no-as-needed");
+  CmdArgs.push_back("-lpthread");
+  CmdArgs.push_back("-lrt");
+  CmdArgs.push_back("-lm");
+  CmdArgs.push_back("-latomic");
+  const bool UsesLibcxx = TC.GetCXXStdlibType(Args) == ToolChain::CST_Libcxx;
+  CmdArgs.push_back(UsesLibcxx ? "-lc++" : "-lstdc++");
+  // FreeBSD is the one OS for which -ldl is not added.
+  if (TC.getTriple().getOS() != llvm::Triple::FreeBSD)
+    CmdArgs.push_back("-ldl");
+}
+
 static bool areOptimizationsEnabled(const ArgList &Args) {
   // Find the last -O arg and see if it is non-zero.
   if (Arg *A = Args.getLastArg(options::OPT_O_Group))
@@ -2931,7 +3245,7 @@
     break;
   }
 
-  if (Triple.isOSLinux()) {
+  if (Triple.isOSLinux() || Triple.getOS() == llvm::Triple::CloudABI) {
     switch (Triple.getArch()) {
     // Don't use a frame pointer on linux if optimizing for certain targets.
     case llvm::Triple::mips64:
@@ -3145,7 +3459,7 @@
   Result.append(UID.begin(), UID.end());
 }
 
-VersionTuple visualstudio::getMSVCVersion(const Driver *D,
+VersionTuple visualstudio::getMSVCVersion(const Driver *D, const ToolChain &TC,
                                           const llvm::Triple &Triple,
                                           const llvm::opt::ArgList &Args,
                                           bool IsWindowsMSVC) {
@@ -3187,7 +3501,14 @@
     if (Major || Minor || Micro)
       return VersionTuple(Major, Minor, Micro);
 
-    return VersionTuple(18);
+    if (IsWindowsMSVC) {
+      VersionTuple MSVT = TC.getMSVCVersionFromExe();
+      if (!MSVT.empty())
+        return MSVT;
+
+      // FIXME: Consider bumping this to 19 (MSVC2015) soon.
+      return VersionTuple(18);
+    }
   }
   return VersionTuple();
 }
@@ -3195,16 +3516,27 @@
 static void addPGOAndCoverageFlags(Compilation &C, const Driver &D,
                                    const InputInfo &Output, const ArgList &Args,
                                    ArgStringList &CmdArgs) {
+
+  auto *PGOGenerateArg = Args.getLastArg(options::OPT_fprofile_generate,
+                                         options::OPT_fprofile_generate_EQ,
+                                         options::OPT_fno_profile_generate);
+  if (PGOGenerateArg &&
+      PGOGenerateArg->getOption().matches(options::OPT_fno_profile_generate))
+    PGOGenerateArg = nullptr;
+
   auto *ProfileGenerateArg = Args.getLastArg(
       options::OPT_fprofile_instr_generate,
-      options::OPT_fprofile_instr_generate_EQ, options::OPT_fprofile_generate,
-      options::OPT_fprofile_generate_EQ,
+      options::OPT_fprofile_instr_generate_EQ,
       options::OPT_fno_profile_instr_generate);
   if (ProfileGenerateArg &&
       ProfileGenerateArg->getOption().matches(
           options::OPT_fno_profile_instr_generate))
     ProfileGenerateArg = nullptr;
 
+  if (PGOGenerateArg && ProfileGenerateArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << PGOGenerateArg->getSpelling() << ProfileGenerateArg->getSpelling();
+
   auto *ProfileUseArg = Args.getLastArg(
       options::OPT_fprofile_instr_use, options::OPT_fprofile_instr_use_EQ,
       options::OPT_fprofile_use, options::OPT_fprofile_use_EQ,
@@ -3213,6 +3545,10 @@
       ProfileUseArg->getOption().matches(options::OPT_fno_profile_instr_use))
     ProfileUseArg = nullptr;
 
+  if (PGOGenerateArg && ProfileUseArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << ProfileUseArg->getSpelling() << PGOGenerateArg->getSpelling();
+
   if (ProfileGenerateArg && ProfileUseArg)
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << ProfileGenerateArg->getSpelling() << ProfileUseArg->getSpelling();
@@ -3220,20 +3556,27 @@
   if (ProfileGenerateArg) {
     if (ProfileGenerateArg->getOption().matches(
             options::OPT_fprofile_instr_generate_EQ))
-      ProfileGenerateArg->render(Args, CmdArgs);
-    else if (ProfileGenerateArg->getOption().matches(
-                 options::OPT_fprofile_generate_EQ)) {
-      SmallString<128> Path(ProfileGenerateArg->getValue());
-      llvm::sys::path::append(Path, "default.profraw");
+      CmdArgs.push_back(Args.MakeArgString(Twine("-fprofile-instrument-path=") +
+                                           ProfileGenerateArg->getValue()));
+    // The default is to use Clang Instrumentation.
+    CmdArgs.push_back("-fprofile-instrument=clang");
+  }
+
+  if (PGOGenerateArg) {
+    CmdArgs.push_back("-fprofile-instrument=llvm");
+    if (PGOGenerateArg->getOption().matches(
+            options::OPT_fprofile_generate_EQ)) {
+      SmallString<128> Path(PGOGenerateArg->getValue());
+      llvm::sys::path::append(Path, "default_%m.profraw");
       CmdArgs.push_back(
-          Args.MakeArgString(Twine("-fprofile-instr-generate=") + Path));
-    } else
-      Args.AddAllArgs(CmdArgs, options::OPT_fprofile_instr_generate);
+          Args.MakeArgString(Twine("-fprofile-instrument-path=") + Path));
+    }
   }
 
   if (ProfileUseArg) {
     if (ProfileUseArg->getOption().matches(options::OPT_fprofile_instr_use_EQ))
-      ProfileUseArg->render(Args, CmdArgs);
+      CmdArgs.push_back(Args.MakeArgString(
+          Twine("-fprofile-instrument-use-path=") + ProfileUseArg->getValue()));
     else if ((ProfileUseArg->getOption().matches(
                   options::OPT_fprofile_use_EQ) ||
               ProfileUseArg->getOption().matches(
@@ -3243,7 +3586,7 @@
       if (Path.empty() || llvm::sys::fs::is_directory(Path))
         llvm::sys::path::append(Path, "default.profdata");
       CmdArgs.push_back(
-          Args.MakeArgString(Twine("-fprofile-instr-use=") + Path));
+          Args.MakeArgString(Twine("-fprofile-instrument-use-path=") + Path));
     }
   }
 
@@ -3434,25 +3777,71 @@
     // match that of llvm-gcc and Apple GCC before that.
     PIC = ToolChain.isPICDefault() && ToolChain.isPICDefaultForced();
 
-    return std::make_tuple(llvm::Reloc::DynamicNoPIC, PIC ? 2 : 0, false);
+    return std::make_tuple(llvm::Reloc::DynamicNoPIC, PIC ? 2U : 0U, false);
+  }
+
+  bool EmbeddedPISupported;
+  switch (ToolChain.getArch()) {
+    case llvm::Triple::arm:
+    case llvm::Triple::armeb:
+    case llvm::Triple::thumb:
+    case llvm::Triple::thumbeb:
+      EmbeddedPISupported = true;
+      break;
+    default:
+      EmbeddedPISupported = false;
+      break;
+  }
+
+  bool ROPI = false, RWPI = false;
+  Arg* LastROPIArg = Args.getLastArg(options::OPT_fropi, options::OPT_fno_ropi);
+  if (LastROPIArg && LastROPIArg->getOption().matches(options::OPT_fropi)) {
+    if (!EmbeddedPISupported)
+      ToolChain.getDriver().Diag(diag::err_drv_unsupported_opt_for_target)
+          << LastROPIArg->getSpelling() << ToolChain.getTriple().str();
+    ROPI = true;
+  }
+  Arg *LastRWPIArg = Args.getLastArg(options::OPT_frwpi, options::OPT_fno_rwpi);
+  if (LastRWPIArg && LastRWPIArg->getOption().matches(options::OPT_frwpi)) {
+    if (!EmbeddedPISupported)
+      ToolChain.getDriver().Diag(diag::err_drv_unsupported_opt_for_target)
+          << LastRWPIArg->getSpelling() << ToolChain.getTriple().str();
+    RWPI = true;
+  }
+
+  // ROPI and RWPI are not compatible with PIC or PIE.
+  if ((ROPI || RWPI) && (PIC || PIE)) {
+    ToolChain.getDriver().Diag(diag::err_drv_ropi_rwpi_incompatible_with_pic);
   }
 
   if (PIC)
-    return std::make_tuple(llvm::Reloc::PIC_, IsPICLevelTwo ? 2 : 1, PIE);
+    return std::make_tuple(llvm::Reloc::PIC_, IsPICLevelTwo ? 2U : 1U, PIE);
 
-  return std::make_tuple(llvm::Reloc::Static, 0, false);
+  llvm::Reloc::Model RelocM = llvm::Reloc::Static;
+  if (ROPI && RWPI)
+    RelocM = llvm::Reloc::ROPI_RWPI;
+  else if (ROPI)
+    RelocM = llvm::Reloc::ROPI;
+  else if (RWPI)
+    RelocM = llvm::Reloc::RWPI;
+
+  return std::make_tuple(RelocM, 0U, false);
 }
 
 static const char *RelocationModelName(llvm::Reloc::Model Model) {
   switch (Model) {
-  case llvm::Reloc::Default:
-    return nullptr;
   case llvm::Reloc::Static:
     return "static";
   case llvm::Reloc::PIC_:
     return "pic";
   case llvm::Reloc::DynamicNoPIC:
     return "dynamic-no-pic";
+  case llvm::Reloc::ROPI:
+    return "ropi";
+  case llvm::Reloc::RWPI:
+    return "rwpi";
+  case llvm::Reloc::ROPI_RWPI:
+    return "ropi-rwpi";
   }
   llvm_unreachable("Unknown Reloc::Model kind");
 }
@@ -3472,8 +3861,8 @@
 void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                          const InputInfo &Output, const InputInfoList &Inputs,
                          const ArgList &Args, const char *LinkingOutput) const {
-  std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args);
-  const llvm::Triple Triple(TripleStr);
+  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
+  const std::string &TripleStr = Triple.getTriple();
 
   bool KernelOrKext =
       Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext);
@@ -3485,6 +3874,7 @@
       getToolChain().getTriple().isWindowsCygwinEnvironment();
   bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment();
   bool IsPS4CPU = getToolChain().getTriple().isPS4CPU();
+  bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();
 
   // Check number of inputs for sanity. We need at least one input.
   assert(Inputs.size() >= 1 && "Must have at least one input.");
@@ -3492,9 +3882,13 @@
   // CUDA compilation may have multiple inputs (source file + results of
   // device-side compilations). All other jobs are expected to have exactly one
   // input.
-  bool IsCuda = types::isCuda(Input.getType());
+  bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
   assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
 
+  // C++ is not supported for IAMCU.
+  if (IsIAMCU && types::isCXX(Input.getType()))
+    D.Diag(diag::err_drv_clang_unsupported) << "C++ for IAMCU";
+
   // Invoke ourselves in -cc1 mode.
   //
   // FIXME: Implement custom jobs for internal actions.
@@ -3504,23 +3898,21 @@
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
-  const ToolChain *AuxToolChain = nullptr;
   if (IsCuda) {
-    // FIXME: We need a (better) way to pass information about
-    // particular compilation pass we're constructing here. For now we
-    // can check which toolchain we're using and pick the other one to
-    // extract the triple.
-    if (&getToolChain() == C.getCudaDeviceToolChain())
-      AuxToolChain = C.getCudaHostToolChain();
-    else if (&getToolChain() == C.getCudaHostToolChain())
-      AuxToolChain = C.getCudaDeviceToolChain();
+    // We have to pass the triple of the host if compiling for a CUDA device and
+    // vice-versa.
+    std::string NormalizedTriple;
+    if (JA.isDeviceOffloading(Action::OFK_Cuda))
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
+                             ->getTriple()
+                             .normalize();
     else
-      llvm_unreachable("Can't figure out CUDA compilation mode.");
-    assert(AuxToolChain != nullptr && "No aux toolchain.");
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+                             ->getTriple()
+                             .normalize();
+
     CmdArgs.push_back("-aux-triple");
-    CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
-    CmdArgs.push_back("-fcuda-target-overloads");
-    CmdArgs.push_back("-fcuda-disable-target-call-checks");
+    CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
   }
 
   if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
@@ -3619,6 +4011,17 @@
     Args.AddLastArg(CmdArgs, options::OPT_fthinlto_index_EQ);
   }
 
+  // Embed-bitcode option.
+  if (C.getDriver().embedBitcodeEnabled() &&
+      (isa<BackendJobAction>(JA) || isa<AssembleJobAction>(JA))) {
+    // Add flags implied by -fembed-bitcode.
+    Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ);
+    // Disable all llvm IR level optimizations.
+    CmdArgs.push_back("-disable-llvm-optzns");
+  }
+  if (C.getDriver().embedBitcodeMarkerOnly())
+    CmdArgs.push_back("-fembed-bitcode=marker");
+
   // We normally speed up the clang process a bit by skipping destructors at
   // exit, but when we're generating diagnostics we can rely on some of the
   // cleanup.
@@ -3723,6 +4126,13 @@
       ParsePICArgs(getToolChain(), Triple, Args);
 
   const char *RMName = RelocationModelName(RelocationModel);
+
+  if ((RelocationModel == llvm::Reloc::ROPI ||
+       RelocationModel == llvm::Reloc::ROPI_RWPI) &&
+      types::isCXX(Input.getType()) &&
+      !Args.hasArg(options::OPT_fallow_unsupported))
+    D.Diag(diag::err_drv_ropi_incompatible_with_cxx);
+
   if (RMName) {
     CmdArgs.push_back("-mrelocation-model");
     CmdArgs.push_back(RMName);
@@ -3730,10 +4140,8 @@
   if (PICLevel > 0) {
     CmdArgs.push_back("-pic-level");
     CmdArgs.push_back(PICLevel == 1 ? "1" : "2");
-    if (IsPIE) {
-      CmdArgs.push_back("-pie-level");
-      CmdArgs.push_back(PICLevel == 1 ? "1" : "2");
-    }
+    if (IsPIE)
+      CmdArgs.push_back("-pic-is-pie");
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_meabi)) {
@@ -3772,6 +4180,14 @@
     A->claim();
   }
 
+  if (!Args.hasFlag(options::OPT_fjump_tables, options::OPT_fno_jump_tables,
+                    true))
+    CmdArgs.push_back("-fno-jump-tables");
+
+  if (!Args.hasFlag(options::OPT_fpreserve_as_comments,
+                    options::OPT_fno_preserve_as_comments, true))
+    CmdArgs.push_back("-fno-preserve-as-comments");
+
   if (Arg *A = Args.getLastArg(options::OPT_mregparm_EQ)) {
     CmdArgs.push_back("-mregparm");
     CmdArgs.push_back(A->getValue());
@@ -3791,7 +4207,7 @@
   }
 
   if (Args.hasFlag(options::OPT_mrtd, options::OPT_mno_rtd, false))
-    CmdArgs.push_back("-mrtd");
+    CmdArgs.push_back("-fdefault-calling-conv=stdcall");
 
   if (shouldUseFramePointer(Args, getToolChain().getTriple()))
     CmdArgs.push_back("-mdisable-fp-elim");
@@ -3987,9 +4403,11 @@
     CmdArgs.push_back("Arguments");
   }
 
-  // Enable -mconstructor-aliases except on darwin, where we have to
-  // work around a linker bug;  see <rdar://problem/7651567>.
-  if (!getToolChain().getTriple().isOSDarwin())
+  // Enable -mconstructor-aliases except on darwin, where we have to work around
+  // a linker bug (see <rdar://problem/7651567>), and CUDA device code, where
+  // aliases aren't supported.
+  if (!getToolChain().getTriple().isOSDarwin() &&
+      !getToolChain().getTriple().isNVPTX())
     CmdArgs.push_back("-mconstructor-aliases");
 
   // Darwin's kernel doesn't support guard variables; just die if we
@@ -4082,11 +4500,19 @@
     AddSparcTargetArgs(Args, CmdArgs);
     break;
 
+  case llvm::Triple::systemz:
+    AddSystemZTargetArgs(Args, CmdArgs);
+    break;
+
   case llvm::Triple::x86:
   case llvm::Triple::x86_64:
     AddX86TargetArgs(Args, CmdArgs);
     break;
 
+  case llvm::Triple::lanai:
+    AddLanaiTargetArgs(Args, CmdArgs);
+    break;
+
   case llvm::Triple::hexagon:
     AddHexagonTargetArgs(Args, CmdArgs);
     break;
@@ -4113,8 +4539,9 @@
   bool EmitCodeView = false;
 
   // Add clang-cl arguments.
+  types::ID InputType = Input.getType();
   if (getToolChain().getDriver().IsCLMode())
-    AddClangCLArgs(Args, CmdArgs, &DebugInfoKind, &EmitCodeView);
+    AddClangCLArgs(Args, InputType, CmdArgs, &DebugInfoKind, &EmitCodeView);
 
   // Pass the linker version in use.
   if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) {
@@ -4127,7 +4554,6 @@
 
   // Explicitly error on some things we know we don't support and can't just
   // ignore.
-  types::ID InputType = Input.getType();
   if (!Args.hasArg(options::OPT_fallow_unsupported)) {
     Arg *Unsupported;
     if (types::isCXX(InputType) && getToolChain().getTriple().isOSDarwin() &&
@@ -4188,8 +4614,8 @@
                                options::OPT_gdwarf_4, options::OPT_gdwarf_5))
     DwarfVersion = DwarfVersionNum(A->getSpelling());
 
-  // Forward -gcodeview.
-  // 'EmitCodeView might have been set by CL-compatibility argument parsing.
+  // Forward -gcodeview. EmitCodeView might have been set by CL-compatibility
+  // argument parsing.
   if (Args.hasArg(options::OPT_gcodeview) || EmitCodeView) {
     // DwarfVersion remains at 0 if no explicit choice was made.
     CmdArgs.push_back("-gcodeview");
@@ -4277,6 +4703,17 @@
 
   Args.AddAllArgs(CmdArgs, options::OPT_finstrument_functions);
 
+  if (Args.hasFlag(options::OPT_fxray_instrument,
+                   options::OPT_fnoxray_instrument, false)) {
+    CmdArgs.push_back("-fxray-instrument");
+    if (const Arg *A =
+            Args.getLastArg(options::OPT_fxray_instruction_threshold_,
+                            options::OPT_fxray_instruction_threshold_EQ)) {
+      CmdArgs.push_back("-fxray-instruction-threshold");
+      CmdArgs.push_back(A->getValue());
+    }
+  }
+
   addPGOAndCoverageFlags(C, D, Output, Args, CmdArgs);
 
   // Add runtime flag for PS4 when PGO or Coverage are enabled.
@@ -4375,8 +4812,7 @@
   //
   // FIXME: Support -fpreprocessed
   if (types::getPreprocessedType(InputType) != types::TY_INVALID)
-    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs,
-                            AuxToolChain);
+    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);
 
   // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
   // that "The compiler can only warn and ignore the option if not recognized".
@@ -4630,9 +5066,13 @@
   Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_show_template_tree);
   Args.AddLastArg(CmdArgs, options::OPT_fno_elide_type);
 
-  // Forward flags for OpenMP
+  // Forward flags for OpenMP. We don't do this if the current action is a
+  // device offloading action.
+  //
+  // TODO: Allow OpenMP offload actions when they become available.
   if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
-                   options::OPT_fno_openmp, false))
+                   options::OPT_fno_openmp, false) &&
+      JA.isDeviceOffloading(Action::OFK_None)) {
     switch (getOpenMPRuntime(getToolChain(), Args)) {
     case OMPRT_OMP:
     case OMPRT_IOMP5:
@@ -4645,6 +5085,7 @@
       if (!Args.hasFlag(options::OPT_fopenmp_use_tls,
                         options::OPT_fnoopenmp_use_tls, /*Default=*/true))
         CmdArgs.push_back("-fnoopenmp-use-tls");
+      Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ);
       break;
     default:
       // By default, if Clang doesn't know how to generate useful OpenMP code
@@ -4655,6 +5096,7 @@
       // semantic analysis, etc.
       break;
     }
+  }
 
   const SanitizerArgs &Sanitize = getToolChain().getSanitizerArgs();
   Sanitize.addArgs(getToolChain(), Args, CmdArgs, InputType);
@@ -4723,15 +5165,10 @@
 
   // -stack-protector=0 is default.
   unsigned StackProtectorLevel = 0;
-  if (getToolChain().getSanitizerArgs().needsSafeStackRt()) {
-    Args.ClaimAllArgs(options::OPT_fno_stack_protector);
-    Args.ClaimAllArgs(options::OPT_fstack_protector_all);
-    Args.ClaimAllArgs(options::OPT_fstack_protector_strong);
-    Args.ClaimAllArgs(options::OPT_fstack_protector);
-  } else if (Arg *A = Args.getLastArg(options::OPT_fno_stack_protector,
-                                      options::OPT_fstack_protector_all,
-                                      options::OPT_fstack_protector_strong,
-                                      options::OPT_fstack_protector)) {
+  if (Arg *A = Args.getLastArg(options::OPT_fno_stack_protector,
+                               options::OPT_fstack_protector_all,
+                               options::OPT_fstack_protector_strong,
+                               options::OPT_fstack_protector)) {
     if (A->getOption().matches(options::OPT_fstack_protector)) {
       StackProtectorLevel = std::max<unsigned>(
           LangOptions::SSPOn,
@@ -4812,6 +5249,46 @@
     CmdArgs.push_back("-arm-restrict-it");
   }
 
+  // Forward -cl options to -cc1
+  if (Args.getLastArg(options::OPT_cl_opt_disable)) {
+    CmdArgs.push_back("-cl-opt-disable");
+  }
+  if (Args.getLastArg(options::OPT_cl_strict_aliasing)) {
+    CmdArgs.push_back("-cl-strict-aliasing");
+  }
+  if (Args.getLastArg(options::OPT_cl_single_precision_constant)) {
+    CmdArgs.push_back("-cl-single-precision-constant");
+  }
+  if (Args.getLastArg(options::OPT_cl_finite_math_only)) {
+    CmdArgs.push_back("-cl-finite-math-only");
+  }
+  if (Args.getLastArg(options::OPT_cl_kernel_arg_info)) {
+    CmdArgs.push_back("-cl-kernel-arg-info");
+  }
+  if (Args.getLastArg(options::OPT_cl_unsafe_math_optimizations)) {
+    CmdArgs.push_back("-cl-unsafe-math-optimizations");
+  }
+  if (Args.getLastArg(options::OPT_cl_fast_relaxed_math)) {
+    CmdArgs.push_back("-cl-fast-relaxed-math");
+  }
+  if (Args.getLastArg(options::OPT_cl_mad_enable)) {
+    CmdArgs.push_back("-cl-mad-enable");
+  }
+  if (Args.getLastArg(options::OPT_cl_no_signed_zeros)) {
+    CmdArgs.push_back("-cl-no-signed-zeros");
+  }
+  if (Arg *A = Args.getLastArg(options::OPT_cl_std_EQ)) {
+    std::string CLStdStr = "-cl-std=";
+    CLStdStr += A->getValue();
+    CmdArgs.push_back(Args.MakeArgString(CLStdStr));
+  }
+  if (Args.getLastArg(options::OPT_cl_denorms_are_zero)) {
+    CmdArgs.push_back("-cl-denorms-are-zero");
+  }
+  if (Args.getLastArg(options::OPT_cl_fp32_correctly_rounded_divide_sqrt)) {
+    CmdArgs.push_back("-cl-fp32-correctly-rounded-divide-sqrt");
+  }
+
   // Forward -f options with positive and negative forms; we translate
   // these by hand.
   if (Arg *A = Args.getLastArg(options::OPT_fprofile_sample_use_EQ)) {
@@ -4937,7 +5414,7 @@
 
   // -fmodule-name specifies the module that is currently being built (or
   // used for header checking by -fmodule-maps).
-  Args.AddLastArg(CmdArgs, options::OPT_fmodule_name);
+  Args.AddLastArg(CmdArgs, options::OPT_fmodule_name_EQ);
 
   // -fmodule-map-file can be used to specify files containing module
   // definitions.
@@ -4973,13 +5450,6 @@
     CmdArgs.push_back(Args.MakeArgString(Path));
   }
 
-  if (HaveModules) {
-    // -fprebuilt-module-path specifies where to load the prebuilt module files.
-    for (const Arg *A : Args.filtered(options::OPT_fprebuilt_module_path))
-      CmdArgs.push_back(Args.MakeArgString(
-          std::string("-fprebuilt-module-path=") + A->getValue()));
-  }
-
   // When building modules and generating crashdumps, we need to dump a module
   // dependency VFS alongside the output.
   if (HaveModules && C.isForDiagnostics()) {
@@ -5095,17 +5565,32 @@
 
   // -fms-compatibility-version=18.00 is default.
   VersionTuple MSVT = visualstudio::getMSVCVersion(
-      &D, getToolChain().getTriple(), Args, IsWindowsMSVC);
+      &D, getToolChain(), getToolChain().getTriple(), Args, IsWindowsMSVC);
   if (!MSVT.empty())
     CmdArgs.push_back(
         Args.MakeArgString("-fms-compatibility-version=" + MSVT.getAsString()));
 
   bool IsMSVC2015Compatible = MSVT.getMajor() >= 19;
   if (ImplyVCPPCXXVer) {
-    if (IsMSVC2015Compatible)
-      CmdArgs.push_back("-std=c++14");
-    else
-      CmdArgs.push_back("-std=c++11");
+    StringRef LanguageStandard;
+    if (const Arg *StdArg = Args.getLastArg(options::OPT__SLASH_std)) {
+      LanguageStandard = llvm::StringSwitch<StringRef>(StdArg->getValue())
+                             .Case("c++14", "-std=c++14")
+                             .Case("c++latest", "-std=c++1z")
+                             .Default("");
+      if (LanguageStandard.empty())
+        D.Diag(clang::diag::warn_drv_unused_argument)
+            << StdArg->getAsString(Args);
+    }
+
+    if (LanguageStandard.empty()) {
+      if (IsMSVC2015Compatible)
+        LanguageStandard = "-std=c++14";
+      else
+        LanguageStandard = "-std=c++11";
+    }
+
+    CmdArgs.push_back(LanguageStandard.data());
   }
 
   // -fno-borland-extensions is default.
@@ -5146,8 +5631,10 @@
   if (Args.hasArg(options::OPT_fno_inline))
     CmdArgs.push_back("-fno-inline");
 
-  if (Args.hasArg(options::OPT_fno_inline_functions))
-    CmdArgs.push_back("-fno-inline-functions");
+  if (Arg* InlineArg = Args.getLastArg(options::OPT_finline_functions,
+                                       options::OPT_finline_hint_functions,
+                                       options::OPT_fno_inline_functions))
+    InlineArg->render(Args, CmdArgs);
 
   ObjCRuntime objcRuntime = AddObjCRuntimeArgs(Args, CmdArgs, rewriteKind);
 
@@ -5256,7 +5743,8 @@
     addExceptionArgs(Args, InputType, getToolChain(), KernelOrKext, objcRuntime,
                      CmdArgs);
 
-  if (getToolChain().UseSjLjExceptions(Args))
+  if (Args.hasArg(options::OPT_fsjlj_exceptions) ||
+      getToolChain().UseSjLjExceptions(Args))
     CmdArgs.push_back("-fsjlj-exceptions");
 
   // C++ "sane" operator new.
@@ -5460,6 +5948,10 @@
   if (Arg *A = Args.getLastArg(options::OPT_fshow_overloads_EQ))
     A->render(Args, CmdArgs);
 
+  if (Arg *A = Args.getLastArg(
+          options::OPT_fsanitize_undefined_strip_path_components_EQ))
+    A->render(Args, CmdArgs);
+
   // -fdollars-in-identifiers default varies depending on platform and
   // language; only pass if specified.
   if (Arg *A = Args.getLastArg(options::OPT_fdollars_in_identifiers,
@@ -5491,7 +5983,7 @@
 
 // Default to -fno-builtin-str{cat,cpy} on Darwin for ARM.
 //
-// FIXME: This is disabled until clang -cc1 supports -fno-builtin-foo. PR4941.
+// FIXME: Now that PR4941 has been fixed this can be enabled.
 #if 0
   if (getToolChain().getTriple().isOSDarwin() &&
       (getToolChain().getArch() == llvm::Triple::arm ||
@@ -5562,7 +6054,13 @@
   // With -save-temps, we want to save the unoptimized bitcode output from the
   // CompileJobAction, use -disable-llvm-passes to get pristine IR generated
   // by the frontend.
-  if (C.getDriver().isSaveTempsEnabled() && isa<CompileJobAction>(JA))
+  // When -fembed-bitcode is enabled, optimized bitcode is emitted because it
+  // has a slightly different breakdown between stages.
+  // FIXME: -fembed-bitcode -save-temps will save optimized bitcode instead of
+  // pristine IR generated by the frontend. Ideally, a new compile action should
+  // be added so that both forms of IR can be captured.
+  if (C.getDriver().isSaveTempsEnabled() &&
+      !C.getDriver().embedBitcodeEnabled() && isa<CompileJobAction>(JA))
     CmdArgs.push_back("-disable-llvm-passes");
 
   if (Output.getType() == types::TY_Dependencies) {
@@ -5624,6 +6122,17 @@
       CmdArgs.push_back(I->getFilename());
     }
 
+  bool WholeProgramVTables =
+      Args.hasFlag(options::OPT_fwhole_program_vtables,
+                   options::OPT_fno_whole_program_vtables, false);
+  if (WholeProgramVTables) {
+    if (!D.isUsingLTO())
+      D.Diag(diag::err_drv_argument_only_allowed_with)
+          << "-fwhole-program-vtables"
+          << "-flto";
+    CmdArgs.push_back("-fwhole-program-vtables");
+  }
+
   // Finally add the compile command to the compilation.
   if (Args.hasArg(options::OPT__SLASH_fallback) &&
       Output.getType() == types::TY_Object &&
@@ -5632,6 +6141,12 @@
         getCLFallback()->GetCommand(C, JA, Output, Inputs, Args, LinkingOutput);
     C.addCommand(llvm::make_unique<FallbackCommand>(
         JA, *this, Exec, CmdArgs, Inputs, std::move(CLCommand)));
+  } else if (Args.hasArg(options::OPT__SLASH_fallback) &&
+             isa<PrecompileJobAction>(JA)) {
+    // In /fallback builds, run the main compilation even if the pch generation
+    // fails, so that the main compilation's fallback to cl.exe runs.
+    C.addCommand(llvm::make_unique<ForceSuccessCommand>(JA, *this, Exec,
+                                                        CmdArgs, Inputs));
   } else {
     C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
   }
@@ -5639,7 +6154,7 @@
   // Handle the debug info splitting at object creation time if we're
   // creating an object.
   // TODO: Currently only works on linux with newer objcopy.
-  if (SplitDwarf && !isa<CompileJobAction>(JA) && !isa<BackendJobAction>(JA))
+  if (SplitDwarf && Output.getType() == types::TY_Object)
     SplitDebugInfo(getToolChain(), C, *this, JA, Args, Output, SplitDwarfOut);
 
   if (Arg *A = Args.getLastArg(options::OPT_pg))
@@ -5795,10 +6310,9 @@
 
 namespace {
 struct EHFlags {
-  EHFlags() : Synch(false), Asynch(false), NoExceptC(false) {}
-  bool Synch;
-  bool Asynch;
-  bool NoExceptC;
+  bool Synch = false;
+  bool Asynch = false;
+  bool NoUnwindC = false;
 };
 } // end anonymous namespace
 
@@ -5807,8 +6321,7 @@
 /// - s: Cleanup after "synchronous" exceptions, aka C++ exceptions.
 /// - a: Cleanup after "asynchronous" exceptions, aka structured exceptions.
 ///      The 'a' modifier is unimplemented and fundamentally hard in LLVM IR.
-/// - c: Assume that extern "C" functions are implicitly noexcept.  This
-///      modifier is an optimization, so we ignore it for now.
+/// - c: Assume that extern "C" functions are implicitly nounwind.
 /// The default is /EHs-c-, meaning cleanups are disabled.
 static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) {
   EHFlags EH;
@@ -5820,12 +6333,16 @@
       switch (EHVal[I]) {
       case 'a':
         EH.Asynch = maybeConsumeDash(EHVal, I);
+        if (EH.Asynch)
+          EH.Synch = false;
         continue;
       case 'c':
-        EH.NoExceptC = maybeConsumeDash(EHVal, I);
+        EH.NoUnwindC = maybeConsumeDash(EHVal, I);
         continue;
       case 's':
         EH.Synch = maybeConsumeDash(EHVal, I);
+        if (EH.Synch)
+          EH.Asynch = false;
         continue;
       default:
         break;
@@ -5834,11 +6351,20 @@
       break;
     }
   }
+  // The /GX, /GX- flags are only processed if there are no /EH flags.
+  // The default is that /GX is not specified.
+  if (EHArgs.empty() &&
+      Args.hasFlag(options::OPT__SLASH_GX, options::OPT__SLASH_GX_,
+                   /*default=*/false)) {
+    EH.Synch = true;
+    EH.NoUnwindC = true;
+  }
 
   return EH;
 }
 
-void Clang::AddClangCLArgs(const ArgList &Args, ArgStringList &CmdArgs,
+void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
+                           ArgStringList &CmdArgs,
                            codegenoptions::DebugInfoKind *DebugInfoKind,
                            bool *EmitCodeView) const {
   unsigned RTOptionID = options::OPT__SLASH_MT;
@@ -5870,11 +6396,13 @@
     if (Args.hasArg(options::OPT__SLASH_LDd))
       CmdArgs.push_back("-D_DEBUG");
     CmdArgs.push_back("-D_MT");
+    CmdArgs.push_back("-flto-visibility-public-std");
     FlagForCRT = "--dependent-lib=libcmt";
     break;
   case options::OPT__SLASH_MTd:
     CmdArgs.push_back("-D_DEBUG");
     CmdArgs.push_back("-D_MT");
+    CmdArgs.push_back("-flto-visibility-public-std");
     FlagForCRT = "--dependent-lib=libcmtd";
     break;
   default:
@@ -5903,23 +6431,37 @@
                    /*default=*/false))
     CmdArgs.push_back("-fno-rtti-data");
 
-  // Emit CodeView if -Z7 is present.
-  *EmitCodeView = Args.hasArg(options::OPT__SLASH_Z7);
-  bool EmitDwarf = Args.hasArg(options::OPT_gdwarf);
-  // If we are emitting CV but not DWARF, don't build information that LLVM
-  // can't yet process.
-  if (*EmitCodeView && !EmitDwarf)
-    *DebugInfoKind = codegenoptions::DebugLineTablesOnly;
-  if (*EmitCodeView)
+  // This controls whether or not we emit stack-protector instrumentation.
+  // In MSVC, Buffer Security Check (/GS) is on by default.
+  if (Args.hasFlag(options::OPT__SLASH_GS, options::OPT__SLASH_GS_,
+                   /*default=*/true)) {
+    CmdArgs.push_back("-stack-protector");
+    CmdArgs.push_back(Args.MakeArgString(Twine(LangOptions::SSPStrong)));
+  }
+
+  // Emit CodeView if -Z7, -Zd, or -gline-tables-only are present.
+  if (Arg *DebugInfoArg =
+          Args.getLastArg(options::OPT__SLASH_Z7, options::OPT__SLASH_Zd,
+                          options::OPT_gline_tables_only)) {
+    *EmitCodeView = true;
+    if (DebugInfoArg->getOption().matches(options::OPT__SLASH_Z7))
+      *DebugInfoKind = codegenoptions::LimitedDebugInfo;
+    else
+      *DebugInfoKind = codegenoptions::DebugLineTablesOnly;
     CmdArgs.push_back("-gcodeview");
+  } else {
+    *EmitCodeView = false;
+  }
 
   const Driver &D = getToolChain().getDriver();
   EHFlags EH = parseClangCLEHFlags(D, Args);
-  // FIXME: Do something with NoExceptC.
   if (EH.Synch || EH.Asynch) {
-    CmdArgs.push_back("-fcxx-exceptions");
+    if (types::isCXX(InputType))
+      CmdArgs.push_back("-fcxx-exceptions");
     CmdArgs.push_back("-fexceptions");
   }
+  if (types::isCXX(InputType) && EH.Synch && EH.NoUnwindC)
+    CmdArgs.push_back("-fexternc-nounwind");
 
   // /EP should expand to -E -P.
   if (Args.hasArg(options::OPT__SLASH_EP)) {
@@ -5966,6 +6508,19 @@
       CmdArgs.push_back("-fms-memptr-rep=virtual");
   }
 
+  // /Gd, /Gr, /Gz and /Gv override one another, so only the last one given on
+  // the command line selects the default calling convention.
+  if (Arg *CCArg =
+          Args.getLastArg(options::OPT__SLASH_Gd, options::OPT__SLASH_Gr,
+                          options::OPT__SLASH_Gz, options::OPT__SLASH_Gv)) {
+    const auto &CCOpt = CCArg->getOption();
+    StringRef CC = CCOpt.matches(options::OPT__SLASH_Gd)   ? "cdecl"
+                   : CCOpt.matches(options::OPT__SLASH_Gr) ? "fastcall"
+                   : CCOpt.matches(options::OPT__SLASH_Gz) ? "stdcall"
+                                                           : "vectorcall";
+    CmdArgs.push_back(Args.MakeArgString("-fdefault-calling-conv=" + CC));
+  }
+
   if (Arg *A = Args.getLastArg(options::OPT_vtordisp_mode_EQ))
     A->render(Args, CmdArgs);
 
@@ -5995,6 +6546,20 @@
   CmdArgs.push_back(ABIName.data());
 }
 
+void ClangAs::AddX86TargetArgs(const ArgList &Args,
+                               ArgStringList &CmdArgs) const {
+  // Emit -mllvm -x86-asm-syntax=<v> for an "intel"/"att" value; else diagnose.
+  Arg *MasmArg = Args.getLastArg(options::OPT_masm_EQ);
+  StringRef Syntax = MasmArg ? MasmArg->getValue() : "";
+  if (Syntax == "intel" || Syntax == "att") {
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(Args.MakeArgString("-x86-asm-syntax=" + Syntax));
+  } else if (MasmArg) {
+    getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument)
+        << MasmArg->getOption().getName() << Syntax;
+  }
+}
+
 void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
                            const InputInfo &Output, const InputInfoList &Inputs,
                            const ArgList &Args,
@@ -6004,9 +6569,8 @@
   assert(Inputs.size() == 1 && "Unexpected number of inputs.");
   const InputInfo &Input = Inputs[0];
 
-  std::string TripleStr =
-      getToolChain().ComputeEffectiveClangTriple(Args, Input.getType());
-  const llvm::Triple Triple(TripleStr);
+  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
+  const std::string &TripleStr = Triple.getTriple();
 
   // Don't warn about "clang -w -c foo.s"
   Args.ClaimAllArgs(options::OPT_w);
@@ -6142,6 +6706,11 @@
   case llvm::Triple::mips64el:
     AddMIPSTargetArgs(Args, CmdArgs);
     break;
+    
+  case llvm::Triple::x86:
+  case llvm::Triple::x86_64:
+    AddX86TargetArgs(Args, CmdArgs);
+    break;
   }
 
   // Consume all the warning flags. Usually this would be handled more
@@ -6287,7 +6856,7 @@
     }
   }
 
-  const std::string customGCCName = D.getCCCGenericGCCName();
+  const std::string &customGCCName = D.getCCCGenericGCCName();
   const char *GCCName;
   if (!customGCCName.empty())
     GCCName = customGCCName.c_str();
@@ -6612,6 +7181,7 @@
   std::string Linker = getToolChain().GetProgramPath(getShortName());
   ArgStringList CmdArgs;
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs);
+  CmdArgs.push_back("-shared");
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   C.addCommand(llvm::make_unique<Command>(JA, *this, Args.MakeArgString(Linker),
@@ -6635,7 +7205,10 @@
                                 const InputInfoList &Inputs,
                                 const ArgList &Args,
                                 const char *LinkingOutput) const {
-  const char *Linker = Args.MakeArgString(getToolChain().GetLinkerPath());
+
+  const ToolChain &ToolChain = getToolChain();
+  const Driver &D = ToolChain.getDriver();
+  const char *Linker = Args.MakeArgString(ToolChain.GetLinkerPath());
   ArgStringList CmdArgs;
   CmdArgs.push_back("-flavor");
   CmdArgs.push_back("ld");
@@ -6647,9 +7220,48 @@
   if (areOptimizationsEnabled(Args))
     CmdArgs.push_back("--gc-sections");
 
-  AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs);
+  if (Args.hasArg(options::OPT_rdynamic))
+    CmdArgs.push_back("-export-dynamic");
+  if (Args.hasArg(options::OPT_s))
+    CmdArgs.push_back("--strip-all");
+  if (Args.hasArg(options::OPT_shared))
+    CmdArgs.push_back("-shared");
+  if (Args.hasArg(options::OPT_static))
+    CmdArgs.push_back("-Bstatic");
+
+  Args.AddAllArgs(CmdArgs, options::OPT_L);
+  ToolChain.AddFilePathLibArgs(Args, CmdArgs);
+
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
+    if (Args.hasArg(options::OPT_shared))
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("rcrt1.o")));
+    else if (Args.hasArg(options::OPT_pie))
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("Scrt1.o")));
+    else
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt1.o")));
+
+    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o")));
+  }
+
+  AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs);
+
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
+    if (D.CCCIsCXX())
+      ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs);
+
+    if (Args.hasArg(options::OPT_pthread))
+      CmdArgs.push_back("-lpthread");
+
+    CmdArgs.push_back("-lc");
+    CmdArgs.push_back("-lcompiler_rt");
+  }
+
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles))
+    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o")));
+
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
+
   C.addCommand(llvm::make_unique<Command>(JA, *this, Linker, CmdArgs, Inputs));
 }
 
@@ -6768,6 +7380,14 @@
       .Default(NanLegacy);
 }
 
+bool mips::hasCompactBranches(StringRef &CPU) {
+  // Compact branches are reported only for the CPU names "mips32r6" and
+  // "mips64r6"; every other string, including an empty one, yields false.
+  const bool IsMips32R6 = CPU == "mips32r6";
+  const bool IsMips64R6 = CPU == "mips64r6";
+  return IsMips32R6 || IsMips64R6;
+}
+
 bool mips::hasMipsAbiArg(const ArgList &Args, const char *Value) {
   Arg *A = Args.getLastArg(options::OPT_mabi_EQ);
   return A && (A->getValue() == StringRef(Value));
@@ -6793,10 +7413,21 @@
   return false;
 }
 
+bool mips::isFP64ADefault(const llvm::Triple &Triple, StringRef CPUName) {
+  // FP64A is the default only when the triple is an Android one and the CPU
+  // name is exactly "mips32r6"; every other triple/CPU combination reports
+  // false.
+  if (Triple.isAndroid())
+    return CPUName == "mips32r6";
+
+  return false;
+}
+
 bool mips::isFPXXDefault(const llvm::Triple &Triple, StringRef CPUName,
                          StringRef ABIName, mips::FloatABI FloatABI) {
   if (Triple.getVendor() != llvm::Triple::ImaginationTechnologies &&
-      Triple.getVendor() != llvm::Triple::MipsTechnologies)
+      Triple.getVendor() != llvm::Triple::MipsTechnologies &&
+      !Triple.isAndroid())
     return false;
 
   if (ABIName != "32")
@@ -6926,6 +7557,14 @@
 
   // CloudABI only supports static linkage.
   CmdArgs.push_back("-Bstatic");
+  CmdArgs.push_back("--no-dynamic-linker");
+
+  // Provide PIE linker flags in case PIE is default for the architecture.
+  if (ToolChain.isPIEDefault()) {
+    CmdArgs.push_back("-pie");
+    CmdArgs.push_back("-zrelro");
+  }
+
   CmdArgs.push_back("--eh-frame-hdr");
   CmdArgs.push_back("--gc-sections");
 
@@ -7202,6 +7841,15 @@
     else
       CmdArgs.push_back("-no_pie");
   }
+  // for embed-bitcode, use -bitcode_bundle in linker command
+  if (C.getDriver().embedBitcodeEnabled() ||
+      C.getDriver().embedBitcodeMarkerOnly()) {
+    // Check if the toolchain supports bitcode build flow.
+    if (MachOTC.SupportsEmbeddedBitcode())
+      CmdArgs.push_back("-bitcode_bundle");
+    else
+      D.Diag(diag::err_drv_bitcode_unsupported_on_toolchain);
+  }
 
   Args.AddLastArg(CmdArgs, options::OPT_prebind);
   Args.AddLastArg(CmdArgs, options::OPT_noprebind);
@@ -8003,12 +8651,12 @@
   if (IsPIE)
     CmdArgs.push_back("-pie");
 
+  CmdArgs.push_back("--eh-frame-hdr");
   if (Args.hasArg(options::OPT_static)) {
     CmdArgs.push_back("-Bstatic");
   } else {
     if (Args.hasArg(options::OPT_rdynamic))
       CmdArgs.push_back("-export-dynamic");
-    CmdArgs.push_back("--eh-frame-hdr");
     if (Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back("-Bshareable");
     } else {
@@ -8268,6 +8916,7 @@
     if (Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back("-Bshareable");
     } else {
+      Args.AddAllArgs(CmdArgs, options::OPT_pie);
       CmdArgs.push_back("-dynamic-linker");
       CmdArgs.push_back("/libexec/ld.elf_so");
     }
@@ -8299,9 +8948,7 @@
     break;
   case llvm::Triple::armeb:
   case llvm::Triple::thumbeb:
-    arm::appendEBLinkFlags(
-        Args, CmdArgs,
-        llvm::Triple(getToolChain().ComputeEffectiveClangTriple(Args)));
+    arm::appendEBLinkFlags(Args, CmdArgs, getToolChain().getEffectiveTriple());
     CmdArgs.push_back("-m");
     switch (getToolChain().getTriple().getEnvironment()) {
     case llvm::Triple::EABI:
@@ -8369,15 +9016,15 @@
     if (!Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back(
           Args.MakeArgString(getToolChain().GetFilePath("crt0.o")));
-      CmdArgs.push_back(
-          Args.MakeArgString(getToolChain().GetFilePath("crti.o")));
-      CmdArgs.push_back(
-          Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o")));
-    } else {
-      CmdArgs.push_back(
-          Args.MakeArgString(getToolChain().GetFilePath("crti.o")));
+    }
+    CmdArgs.push_back(
+        Args.MakeArgString(getToolChain().GetFilePath("crti.o")));
+    if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie)) {
       CmdArgs.push_back(
           Args.MakeArgString(getToolChain().GetFilePath("crtbeginS.o")));
+    } else {
+      CmdArgs.push_back(
+          Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o")));
     }
   }
 
@@ -8394,7 +9041,7 @@
   unsigned Major, Minor, Micro;
   getToolChain().getTriple().getOSVersion(Major, Minor, Micro);
   bool useLibgcc = true;
-  if (Major >= 7 || (Major == 6 && Minor == 99 && Micro >= 49) || Major == 0) {
+  if (Major >= 7 || Major == 0) {
     switch (getToolChain().getArch()) {
     case llvm::Triple::aarch64:
     case llvm::Triple::arm:
@@ -8404,6 +9051,8 @@
     case llvm::Triple::ppc:
     case llvm::Triple::ppc64:
     case llvm::Triple::ppc64le:
+    case llvm::Triple::sparc:
+    case llvm::Triple::sparcv9:
     case llvm::Triple::x86:
     case llvm::Triple::x86_64:
       useLibgcc = false;
@@ -8441,12 +9090,12 @@
   }
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
-    if (!Args.hasArg(options::OPT_shared))
-      CmdArgs.push_back(
-          Args.MakeArgString(getToolChain().GetFilePath("crtend.o")));
-    else
+    if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie))
       CmdArgs.push_back(
           Args.MakeArgString(getToolChain().GetFilePath("crtendS.o")));
+    else
+      CmdArgs.push_back(
+          Args.MakeArgString(getToolChain().GetFilePath("crtend.o")));
     CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crtn.o")));
   }
 
@@ -8463,8 +9112,7 @@
                                        const char *LinkingOutput) const {
   claimNoWarnArgs(Args);
 
-  std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args);
-  llvm::Triple Triple = llvm::Triple(TripleStr);
+  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
   ArgStringList CmdArgs;
 
@@ -8551,12 +9199,12 @@
     Args.AddLastArg(CmdArgs, options::OPT_march_EQ);
 
     // FIXME: remove krait check when GNU tools support krait cpu
-    // for now replace it with -march=armv7-a  to avoid a lower
+    // for now replace it with -mcpu=cortex-a15 to avoid a lower
     // march from being picked in the absence of a cpu flag.
     Arg *A;
     if ((A = Args.getLastArg(options::OPT_mcpu_EQ)) &&
         StringRef(A->getValue()).lower() == "krait")
-      CmdArgs.push_back("-march=armv7-a");
+      CmdArgs.push_back("-mcpu=cortex-a15");
     else
       Args.AddLastArg(CmdArgs, options::OPT_mcpu_EQ);
     Args.AddLastArg(CmdArgs, options::OPT_mfpu_EQ);
@@ -8678,6 +9326,7 @@
                       ArgStringList &CmdArgs, const ArgList &Args) {
   bool isAndroid = Triple.isAndroid();
   bool isCygMing = Triple.isOSCygMing();
+  bool IsIAMCU = Triple.isOSIAMCU();
   bool StaticLibgcc = Args.hasArg(options::OPT_static_libgcc) ||
                       Args.hasArg(options::OPT_static);
   if (!D.CCCIsCXX())
@@ -8694,7 +9343,7 @@
       CmdArgs.push_back("--no-as-needed");
   }
 
-  if (StaticLibgcc && !isAndroid)
+  if (StaticLibgcc && !isAndroid && !IsIAMCU)
     CmdArgs.push_back("-lgcc_eh");
   else if (!Args.hasArg(options::OPT_shared) && D.CCCIsCXX())
     CmdArgs.push_back("-lgcc");
@@ -8708,72 +9357,6 @@
     CmdArgs.push_back("-ldl");
 }
 
-static std::string getLinuxDynamicLinker(const ArgList &Args,
-                                         const toolchains::Linux &ToolChain) {
-  const llvm::Triple::ArchType Arch = ToolChain.getArch();
-
-  if (ToolChain.getTriple().isAndroid()) {
-    if (ToolChain.getTriple().isArch64Bit())
-      return "/system/bin/linker64";
-    else
-      return "/system/bin/linker";
-  } else if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::sparc ||
-             Arch == llvm::Triple::sparcel)
-    return "/lib/ld-linux.so.2";
-  else if (Arch == llvm::Triple::aarch64)
-    return "/lib/ld-linux-aarch64.so.1";
-  else if (Arch == llvm::Triple::aarch64_be)
-    return "/lib/ld-linux-aarch64_be.so.1";
-  else if (Arch == llvm::Triple::arm || Arch == llvm::Triple::thumb) {
-    if (ToolChain.getTriple().getEnvironment() == llvm::Triple::GNUEABIHF ||
-        arm::getARMFloatABI(ToolChain, Args) == arm::FloatABI::Hard)
-      return "/lib/ld-linux-armhf.so.3";
-    else
-      return "/lib/ld-linux.so.3";
-  } else if (Arch == llvm::Triple::armeb || Arch == llvm::Triple::thumbeb) {
-    // TODO: check which dynamic linker name.
-    if (ToolChain.getTriple().getEnvironment() == llvm::Triple::GNUEABIHF ||
-        arm::getARMFloatABI(ToolChain, Args) == arm::FloatABI::Hard)
-      return "/lib/ld-linux-armhf.so.3";
-    else
-      return "/lib/ld-linux.so.3";
-  } else if (Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel ||
-             Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el) {
-    std::string LibDir =
-        "/lib" + mips::getMipsABILibSuffix(Args, ToolChain.getTriple());
-    StringRef LibName;
-    bool IsNaN2008 = mips::isNaN2008(Args, ToolChain.getTriple());
-    if (mips::isUCLibc(Args))
-      LibName = IsNaN2008 ? "ld-uClibc-mipsn8.so.0" : "ld-uClibc.so.0";
-    else if (!ToolChain.getTriple().hasEnvironment()) {
-      bool LE = (ToolChain.getTriple().getArch() == llvm::Triple::mipsel) ||
-                (ToolChain.getTriple().getArch() == llvm::Triple::mips64el);
-      LibName = LE ? "ld-musl-mipsel.so.1" : "ld-musl-mips.so.1";
-    } else
-      LibName = IsNaN2008 ? "ld-linux-mipsn8.so.1" : "ld.so.1";
-
-    return (LibDir + "/" + LibName).str();
-  } else if (Arch == llvm::Triple::ppc)
-    return "/lib/ld.so.1";
-  else if (Arch == llvm::Triple::ppc64) {
-    if (ppc::hasPPCAbiArg(Args, "elfv2"))
-      return "/lib64/ld64.so.2";
-    return "/lib64/ld64.so.1";
-  } else if (Arch == llvm::Triple::ppc64le) {
-    if (ppc::hasPPCAbiArg(Args, "elfv1"))
-      return "/lib64/ld64.so.1";
-    return "/lib64/ld64.so.2";
-  } else if (Arch == llvm::Triple::systemz)
-    return "/lib/ld64.so.1";
-  else if (Arch == llvm::Triple::sparcv9)
-    return "/lib64/ld-linux.so.2";
-  else if (Arch == llvm::Triple::x86_64 &&
-           ToolChain.getTriple().getEnvironment() == llvm::Triple::GNUX32)
-    return "/libx32/ld-linux-x32.so.2";
-  else
-    return "/lib64/ld-linux-x86-64.so.2";
-}
-
 static void AddRunTimeLibs(const ToolChain &TC, const Driver &D,
                            ArgStringList &CmdArgs, const ArgList &Args) {
   // Make use of compiler-rt if --rtlib option is used
@@ -8791,7 +9374,16 @@
     }
     break;
   case ToolChain::RLT_Libgcc:
-    AddLibgcc(TC.getTriple(), D, CmdArgs, Args);
+    // Make sure libgcc is not used under MSVC environment by default
+    if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
+      // Issue an error diagnostic if libgcc is explicitly requested
+      // on the command line via the --rtlib option.
+      if (Args.hasArg(options::OPT_rtlib_EQ)) {
+        TC.getDriver().Diag(diag::err_drv_unsupported_rtlib_for_platform)
+            << Args.getLastArg(options::OPT_rtlib_EQ)->getValue() << "MSVC";
+      }
+    } else
+      AddLibgcc(TC.getTriple(), D, CmdArgs, Args);
     break;
   }
 }
@@ -8799,6 +9391,8 @@
 static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
   switch (T.getArch()) {
   case llvm::Triple::x86:
+    if (T.isOSIAMCU())
+      return "elf_iamcu";
     return "elf_i386";
   case llvm::Triple::aarch64:
     return "aarch64linux";
@@ -8809,7 +9403,7 @@
     return "armelf_linux_eabi";
   case llvm::Triple::armeb:
   case llvm::Triple::thumbeb:
-    return "armebelf_linux_eabi"; /* TODO: check which NAME.  */
+    return "armelfb_linux_eabi";
   case llvm::Triple::ppc:
     return "elf32ppclinux";
   case llvm::Triple::ppc64:
@@ -8853,11 +9447,11 @@
       static_cast<const toolchains::Linux &>(getToolChain());
   const Driver &D = ToolChain.getDriver();
 
-  std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args);
-  llvm::Triple Triple = llvm::Triple(TripleStr);
+  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
   const llvm::Triple::ArchType Arch = ToolChain.getArch();
   const bool isAndroid = ToolChain.getTriple().isAndroid();
+  const bool IsIAMCU = ToolChain.getTriple().isOSIAMCU();
   const bool IsPIE =
       !Args.hasArg(options::OPT_shared) && !Args.hasArg(options::OPT_static) &&
       (Args.hasArg(options::OPT_pie) || ToolChain.isPIEDefault());
@@ -8918,20 +9512,23 @@
     CmdArgs.push_back("-shared");
   }
 
-  if (Arch == llvm::Triple::arm || Arch == llvm::Triple::armeb ||
-      Arch == llvm::Triple::thumb || Arch == llvm::Triple::thumbeb ||
-      (!Args.hasArg(options::OPT_static) &&
-       !Args.hasArg(options::OPT_shared))) {
-    CmdArgs.push_back("-dynamic-linker");
-    CmdArgs.push_back(Args.MakeArgString(
-        D.DyldPrefix + getLinuxDynamicLinker(Args, ToolChain)));
+  if (!Args.hasArg(options::OPT_static)) {
+    if (Args.hasArg(options::OPT_rdynamic))
+      CmdArgs.push_back("-export-dynamic");
+
+    if (!Args.hasArg(options::OPT_shared)) {
+      const std::string Loader =
+          D.DyldPrefix + ToolChain.getDynamicLinker(Args);
+      CmdArgs.push_back("-dynamic-linker");
+      CmdArgs.push_back(Args.MakeArgString(Loader));
+    }
   }
 
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
-    if (!isAndroid) {
+    if (!isAndroid && !IsIAMCU) {
       const char *crt1 = nullptr;
       if (!Args.hasArg(options::OPT_shared)) {
         if (Args.hasArg(options::OPT_pg))
@@ -8947,18 +9544,22 @@
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o")));
     }
 
-    const char *crtbegin;
-    if (Args.hasArg(options::OPT_static))
-      crtbegin = isAndroid ? "crtbegin_static.o" : "crtbeginT.o";
-    else if (Args.hasArg(options::OPT_shared))
-      crtbegin = isAndroid ? "crtbegin_so.o" : "crtbeginS.o";
-    else if (IsPIE)
-      crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbeginS.o";
-    else
-      crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbegin.o";
+    if (IsIAMCU)
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt0.o")));
+    else {
+      const char *crtbegin;
+      if (Args.hasArg(options::OPT_static))
+        crtbegin = isAndroid ? "crtbegin_static.o" : "crtbeginT.o";
+      else if (Args.hasArg(options::OPT_shared))
+        crtbegin = isAndroid ? "crtbegin_so.o" : "crtbeginS.o";
+      else if (IsPIE)
+        crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbeginS.o";
+      else
+        crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbegin.o";
 
-    if (HasCRTBeginEndFiles)
-      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin)));
+      if (HasCRTBeginEndFiles)
+        CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin)));
+    }
 
     // Add crtfastmath.o if available and fast math is enabled.
     ToolChain.AddFastMathRuntimeIfAvailable(Args, CmdArgs);
@@ -8976,6 +9577,7 @@
     CmdArgs.push_back("--no-demangle");
 
   bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs);
+  bool NeedsXRayDeps = addXRayRuntime(ToolChain, Args, CmdArgs);
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs);
   // The profile runtime also needs access to system libraries.
   getToolChain().addProfileRTLibs(Args, CmdArgs);
@@ -9002,6 +9604,9 @@
       if (NeedsSanitizerDeps)
         linkSanitizerRuntimeDeps(ToolChain, CmdArgs);
 
+      if (NeedsXRayDeps)
+        linkXRayRuntimeDeps(ToolChain, Args, CmdArgs);
+
       bool WantPthread = Args.hasArg(options::OPT_pthread) ||
                          Args.hasArg(options::OPT_pthreads);
 
@@ -9037,15 +9642,29 @@
       if (WantPthread && !isAndroid)
         CmdArgs.push_back("-lpthread");
 
+      if (Args.hasArg(options::OPT_fsplit_stack))
+        CmdArgs.push_back("--wrap=pthread_create");
+
       CmdArgs.push_back("-lc");
 
+      // Add IAMCU specific libs, if needed.
+      if (IsIAMCU)
+        CmdArgs.push_back("-lgloss");
+
       if (Args.hasArg(options::OPT_static))
         CmdArgs.push_back("--end-group");
       else
         AddRunTimeLibs(ToolChain, D, CmdArgs, Args);
+
+      // Add IAMCU specific libs (outside the group), if needed.
+      if (IsIAMCU) {
+        CmdArgs.push_back("--as-needed");
+        CmdArgs.push_back("-lsoftfp");
+        CmdArgs.push_back("--no-as-needed");
+      }
     }
 
-    if (!Args.hasArg(options::OPT_nostartfiles)) {
+    if (!Args.hasArg(options::OPT_nostartfiles) && !IsIAMCU) {
       const char *crtend;
       if (Args.hasArg(options::OPT_shared))
         crtend = isAndroid ? "crtend_so.o" : "crtendS.o";
@@ -9536,9 +10155,14 @@
                                            WindowsSdkLibPath.c_str()));
   }
 
+  if (!C.getDriver().IsCLMode() && Args.hasArg(options::OPT_L))
+    for (const auto &LibPath : Args.getAllArgValues(options::OPT_L))
+      CmdArgs.push_back(Args.MakeArgString("-libpath:" + LibPath));
+
   CmdArgs.push_back("-nologo");
 
-  if (Args.hasArg(options::OPT_g_Group, options::OPT__SLASH_Z7))
+  if (Args.hasArg(options::OPT_g_Group, options::OPT__SLASH_Z7,
+                  options::OPT__SLASH_Zd))
     CmdArgs.push_back("-debug");
 
   bool DLL = Args.hasArg(options::OPT__SLASH_LD, options::OPT__SLASH_LDd,
@@ -9591,6 +10215,12 @@
     }
   }
 
+  // Add the compiler-rt library if it was explicitly
+  // specified as the argument of the --rtlib option.
+  if (!Args.hasArg(options::OPT_nostdlib)) {
+    AddRunTimeLibs(TC, TC.getDriver(), CmdArgs, Args);
+  }
+
   // Add filenames, libraries, and other linker inputs.
   for (const auto &Input : Inputs) {
     if (Input.isFilename()) {
@@ -9699,6 +10329,11 @@
   if (Args.hasFlag(options::OPT__SLASH_GR_, options::OPT__SLASH_GR,
                    /*default=*/false))
     CmdArgs.push_back("/GR-");
+
+  if (Args.hasFlag(options::OPT__SLASH_GS_, options::OPT__SLASH_GS,
+                   /*default=*/false))
+    CmdArgs.push_back("/GS-");
+
   if (Arg *A = Args.getLastArg(options::OPT_ffunction_sections,
                                options::OPT_fno_function_sections))
     CmdArgs.push_back(A->getOption().getID() == options::OPT_ffunction_sections
@@ -9722,6 +10357,8 @@
   // Flags that can simply be passed through.
   Args.AddAllArgs(CmdArgs, options::OPT__SLASH_LD);
   Args.AddAllArgs(CmdArgs, options::OPT__SLASH_LDd);
+  Args.AddAllArgs(CmdArgs, options::OPT__SLASH_GX);
+  Args.AddAllArgs(CmdArgs, options::OPT__SLASH_GX_);
   Args.AddAllArgs(CmdArgs, options::OPT__SLASH_EH);
   Args.AddAllArgs(CmdArgs, options::OPT__SLASH_Zl);
 
@@ -9730,6 +10367,10 @@
                                options::OPT__SLASH_MT, options::OPT__SLASH_MTd))
     A->render(Args, CmdArgs);
 
+  // Pass through all unknown arguments so that the fallback command can see
+  // them too.
+  Args.AddAllArgs(CmdArgs, options::OPT_UNKNOWN);
+
   // Input filename.
   assert(Inputs.size() == 1);
   const InputInfo &II = Inputs[0];
@@ -10214,12 +10855,12 @@
     } else {
       for (const auto &Lib : {"asan_dynamic", "asan_dynamic_runtime_thunk"})
         CmdArgs.push_back(TC.getCompilerRTArgString(Args, Lib));
-        // Make sure the dynamic runtime thunk is not optimized out at link time
-        // to ensure proper SEH handling.
-        CmdArgs.push_back(Args.MakeArgString("--undefined"));
-        CmdArgs.push_back(Args.MakeArgString(TC.getArch() == llvm::Triple::x86
-                                                 ? "___asan_seh_interceptor"
-                                                 : "__asan_seh_interceptor"));
+      // Make sure the dynamic runtime thunk is not optimized out at link time
+      // to ensure proper SEH handling.
+      CmdArgs.push_back(Args.MakeArgString("--undefined"));
+      CmdArgs.push_back(Args.MakeArgString(TC.getArch() == llvm::Triple::x86
+                                               ? "___asan_seh_interceptor"
+                                               : "__asan_seh_interceptor"));
     }
   }
 
@@ -10247,7 +10888,6 @@
     CmdArgs.push_back("-S");
     CmdArgs.push_back("-fno-exceptions"); // Always do this even if unspecified.
   }
-  CmdArgs.push_back("-mcpu=myriad2");
   CmdArgs.push_back("-DMYRIAD2");
 
   // Append all -I, -iquote, -isystem paths, defines/undefines,
@@ -10257,7 +10897,8 @@
                             options::OPT_std_EQ, options::OPT_D, options::OPT_U,
                             options::OPT_f_Group, options::OPT_f_clang_Group,
                             options::OPT_g_Group, options::OPT_M_Group,
-                            options::OPT_O_Group, options::OPT_W_Group});
+                            options::OPT_O_Group, options::OPT_W_Group,
+                            options::OPT_mcpu_EQ});
 
   // If we're producing a dependency file, and assembly is the final action,
   // then the name of the target in the dependency file should be the '.o'
@@ -10297,7 +10938,10 @@
   assert(Output.getType() == types::TY_Object);
 
   CmdArgs.push_back("-no6thSlotCompression");
-  CmdArgs.push_back("-cv:myriad2"); // Chip Version
+  const Arg *CPUArg = Args.getLastArg(options::OPT_mcpu_EQ);
+  if (CPUArg)
+    CmdArgs.push_back(
+        Args.MakeArgString("-cv:" + StringRef(CPUArg->getValue())));
   CmdArgs.push_back("-noSPrefixing");
   CmdArgs.push_back("-a"); // Mystery option.
   Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler);
@@ -10411,7 +11055,7 @@
   CmdArgs.push_back(Input.getFilename());
 
   const char *Exec =
-      Args.MakeArgString(getToolChain().GetProgramPath("ps4-as"));
+      Args.MakeArgString(getToolChain().GetProgramPath("orbis-as"));
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
@@ -10479,7 +11123,7 @@
     CmdArgs.push_back("-lpthread");
   }
 
-  const char *Exec = Args.MakeArgString(ToolChain.GetProgramPath("ps4-ld"));
+  const char *Exec = Args.MakeArgString(ToolChain.GetProgramPath("orbis-ld"));
 
   C.addCommand(llvm::make_unique<Command>(JA, T, Exec, CmdArgs, Inputs));
 }
@@ -10652,9 +11296,9 @@
 
   const char *Exec =
 #ifdef LLVM_ON_WIN32
-      Args.MakeArgString(ToolChain.GetProgramPath("ps4-ld.gold"));
+      Args.MakeArgString(ToolChain.GetProgramPath("orbis-ld.gold"));
 #else
-      Args.MakeArgString(ToolChain.GetProgramPath("ps4-ld"));
+      Args.MakeArgString(ToolChain.GetProgramPath("orbis-ld"));
 #endif
 
   C.addCommand(llvm::make_unique<Command>(JA, T, Exec, CmdArgs, Inputs));
@@ -10696,28 +11340,60 @@
                                     const char *LinkingOutput) const {
   const auto &TC =
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
-  assert(TC.getArch() == llvm::Triple::nvptx ||
-         TC.getArch() == llvm::Triple::nvptx64);
+  assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
-  std::vector<std::string> gpu_archs =
-      Args.getAllArgValues(options::OPT_march_EQ);
-  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
-  const std::string& gpu_arch = gpu_archs[0];
+  // Obtain architecture from the action.
+  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  assert(gpu_arch != CudaArch::UNKNOWN &&
+         "Device action expected to have an architecture.");
 
+  // Check that our installation's ptxas supports gpu_arch.
+  if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
+    TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch);
+  }
 
   ArgStringList CmdArgs;
   CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+  if (Args.hasFlag(options::OPT_cuda_noopt_device_debug,
+                   options::OPT_no_cuda_noopt_device_debug, false)) {
+    // ptxas does not accept -g option if optimization is enabled, so
+    // we ignore the compiler's -O* options if we want debug info.
+    CmdArgs.push_back("-g");
+    CmdArgs.push_back("--dont-merge-basicblocks");
+    CmdArgs.push_back("--return-at-end");
+  } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
+    // Map the -O we received to -O{0,1,2,3}.
+    //
+    // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
+    // default, so it may correspond more closely to the spirit of clang -O2.
 
-  // Clang's default optimization level is -O0, but ptxas's default is -O3.
-  CmdArgs.push_back(Args.MakeArgString(
-      llvm::Twine("-O") +
-      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
-
-  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
-  // not supported at other optimization levels.
+    // -O3 seems like the least-bad option when -Osomething is specified to
+    // clang but it isn't handled below.
+    StringRef OOpt = "3";
+    if (A->getOption().matches(options::OPT_O4) ||
+        A->getOption().matches(options::OPT_Ofast))
+      OOpt = "3";
+    else if (A->getOption().matches(options::OPT_O0))
+      OOpt = "0";
+    else if (A->getOption().matches(options::OPT_O)) {
+      // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options.
+      OOpt = llvm::StringSwitch<const char *>(A->getValue())
+                 .Case("1", "1")
+                 .Case("2", "2")
+                 .Case("3", "3")
+                 .Case("s", "2")
+                 .Case("z", "2")
+                 .Default("2");
+    }
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
+  } else {
+    // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
+    // to no optimizations, but ptxas's default is -O3.
+    CmdArgs.push_back("-O0");
+  }
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
   for (const auto& II : Inputs)
@@ -10740,8 +11416,7 @@
                                  const char *LinkingOutput) const {
   const auto &TC =
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
-  assert(TC.getArch() == llvm::Triple::nvptx ||
-         TC.getArch() == llvm::Triple::nvptx64);
+  assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
   ArgStringList CmdArgs;
   CmdArgs.push_back("--cuda");
@@ -10750,12 +11425,20 @@
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
 
   for (const auto& II : Inputs) {
-    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    auto *A = II.getAction();
+    assert(A->getInputs().size() == 1 &&
+           "Device offload action is expected to have a single input");
+    const char *gpu_arch_str = A->getOffloadingArch();
+    assert(gpu_arch_str &&
+           "Device action expected to have associated a GPU architecture!");
+    CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
+
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
-    const char *Arch = (II.getType() == types::TY_PP_Asm)
-                           ? A->getComputeArchName()
-                           : A->getGpuArchName();
+    const char *Arch =
+        (II.getType() == types::TY_PP_Asm)
+            ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+            : gpu_arch_str;
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
                                          Arch + ",file=" + II.getFilename()));
   }
diff --git a/lib/Driver/Tools.h b/lib/Driver/Tools.h
index 1d348bb..6577ce9 100644
--- a/lib/Driver/Tools.h
+++ b/lib/Driver/Tools.h
@@ -57,8 +57,7 @@
                                const Driver &D, const llvm::opt::ArgList &Args,
                                llvm::opt::ArgStringList &CmdArgs,
                                const InputInfo &Output,
-                               const InputInfoList &Inputs,
-                               const ToolChain *AuxToolChain) const;
+                               const InputInfoList &Inputs) const;
 
   void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const;
@@ -82,6 +81,8 @@
                         llvm::opt::ArgStringList &CmdArgs) const;
   void AddHexagonTargetArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const;
+  void AddLanaiTargetArgs(const llvm::opt::ArgList &Args,
+                          llvm::opt::ArgStringList &CmdArgs) const;
   void AddWebAssemblyTargetArgs(const llvm::opt::ArgList &Args,
                                 llvm::opt::ArgStringList &CmdArgs) const;
 
@@ -91,7 +92,7 @@
                                  llvm::opt::ArgStringList &cmdArgs,
                                  RewriteKind rewrite) const;
 
-  void AddClangCLArgs(const llvm::opt::ArgList &Args,
+  void AddClangCLArgs(const llvm::opt::ArgList &Args, types::ID InputType,
                       llvm::opt::ArgStringList &CmdArgs,
                       codegenoptions::DebugInfoKind *DebugInfoKind,
                       bool *EmitCodeView) const;
@@ -124,6 +125,8 @@
       : Tool("clang::as", "clang integrated assembler", TC, RF_Full) {}
   void AddMIPSTargetArgs(const llvm::opt::ArgList &Args,
                          llvm::opt::ArgStringList &CmdArgs) const;
+  void AddX86TargetArgs(const llvm::opt::ArgList &Args,
+                        llvm::opt::ArgStringList &CmdArgs) const;
   bool hasGoodDiagnostics() const override { return true; }
   bool hasIntegratedAssembler() const override { return false; }
   bool hasIntegratedCPP() const override { return false; }
@@ -289,6 +292,7 @@
 };
 
 NanEncoding getSupportedNanEncoding(StringRef &CPU);
+bool hasCompactBranches(StringRef &CPU);
 void getMipsCPUAndABI(const llvm::opt::ArgList &Args,
                       const llvm::Triple &Triple, StringRef &CPUName,
                       StringRef &ABIName);
@@ -297,6 +301,7 @@
 bool hasMipsAbiArg(const llvm::opt::ArgList &Args, const char *Value);
 bool isUCLibc(const llvm::opt::ArgList &Args);
 bool isNaN2008(const llvm::opt::ArgList &Args, const llvm::Triple &Triple);
+bool isFP64ADefault(const llvm::Triple &Triple, StringRef CPUName);
 bool isFPXXDefault(const llvm::Triple &Triple, StringRef CPUName,
                    StringRef ABIName, mips::FloatABI FloatABI);
 bool shouldUseFPXX(const llvm::opt::ArgList &Args, const llvm::Triple &Triple,
@@ -680,7 +685,8 @@
 
 /// Visual studio tools.
 namespace visualstudio {
-VersionTuple getMSVCVersion(const Driver *D, const llvm::Triple &Triple,
+VersionTuple getMSVCVersion(const Driver *D, const ToolChain &TC,
+                            const llvm::Triple &Triple,
                             const llvm::opt::ArgList &Args, bool IsWindowsMSVC);
 
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
@@ -773,6 +779,16 @@
 FloatABI getPPCFloatABI(const Driver &D, const llvm::opt::ArgList &Args);
 } // end namespace ppc
 
+namespace sparc {
+enum class FloatABI {
+  Invalid,
+  Soft,
+  Hard,
+};
+
+FloatABI getSparcFloatABI(const Driver &D, const llvm::opt::ArgList &Args);
+} // end namespace sparc
+
 namespace XCore {
 // For XCore, we do not need to instantiate tools for PreProcess, PreCompile and
 // Compile.
diff --git a/lib/Driver/Types.cpp b/lib/Driver/Types.cpp
index 3b3b67f..f8e1e40 100644
--- a/lib/Driver/Types.cpp
+++ b/lib/Driver/Types.cpp
@@ -204,6 +204,7 @@
            .Case("pcm", TY_ModuleFile)
            .Case("pch", TY_PCH)
            .Case("gch", TY_PCH)
+           .Case("rs", TY_RenderScript)
            .Default(TY_INVALID);
 }
 
@@ -241,7 +242,6 @@
   }
   assert(0 < P.size() && "Not enough phases in list");
   assert(P.size() <= phases::MaxNumberOfPhases && "Too many phases in list");
-  return;
 }
 
 ID types::lookupCXXTypeForCType(ID Id) {
diff --git a/lib/Edit/Makefile b/lib/Edit/Makefile
deleted file mode 100644
index 92a67eb..0000000
--- a/lib/Edit/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- clang/lib/Edit/Makefile -----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangEdit
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Edit/RewriteObjCFoundationAPI.cpp b/lib/Edit/RewriteObjCFoundationAPI.cpp
index 482c0f6..0ae1ec7 100644
--- a/lib/Edit/RewriteObjCFoundationAPI.cpp
+++ b/lib/Edit/RewriteObjCFoundationAPI.cpp
@@ -1076,6 +1076,7 @@
     case CK_CopyAndAutoreleaseBlockObject:
     case CK_BuiltinFnToFnPtr:
     case CK_ZeroToOCLEvent:
+    case CK_IntToOCLSampler:
       return false;
 
     case CK_BooleanToSignedIntegral:
diff --git a/lib/Format/AffectedRangeManager.cpp b/lib/Format/AffectedRangeManager.cpp
new file mode 100644
index 0000000..5d4df19
--- /dev/null
+++ b/lib/Format/AffectedRangeManager.cpp
@@ -0,0 +1,150 @@
+//===--- AffectedRangeManager.cpp - Format C++ code -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the AffectedRangeManager class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AffectedRangeManager.h"
+
+#include "FormatToken.h"
+#include "TokenAnnotator.h"
+
+namespace clang {
+namespace format {
+
+bool AffectedRangeManager::computeAffectedLines(
+    SmallVectorImpl<AnnotatedLine *>::iterator I,
+    SmallVectorImpl<AnnotatedLine *>::iterator E) {
+  bool SomeLineAffected = false;
+  const AnnotatedLine *PreviousLine = nullptr;
+  while (I != E) {
+    AnnotatedLine *Line = *I;
+    Line->LeadingEmptyLinesAffected = affectsLeadingEmptyLines(*Line->First);
+
+    // If a line is part of a preprocessor directive, it needs to be formatted
+    // if any token within the directive is affected.
+    if (Line->InPPDirective) {
+      FormatToken *Last = Line->Last;
+      SmallVectorImpl<AnnotatedLine *>::iterator PPEnd = I + 1;
+      while (PPEnd != E && !(*PPEnd)->First->HasUnescapedNewline) {
+        Last = (*PPEnd)->Last;
+        ++PPEnd;
+      }
+
+      if (affectsTokenRange(*Line->First, *Last,
+                            /*IncludeLeadingNewlines=*/false)) {
+        SomeLineAffected = true;
+        markAllAsAffected(I, PPEnd);
+      }
+      I = PPEnd;
+      continue;
+    }
+
+    if (nonPPLineAffected(Line, PreviousLine))
+      SomeLineAffected = true;
+
+    PreviousLine = Line;
+    ++I;
+  }
+  return SomeLineAffected;
+}
+
+bool AffectedRangeManager::affectsCharSourceRange(
+    const CharSourceRange &Range) {
+  for (SmallVectorImpl<CharSourceRange>::const_iterator I = Ranges.begin(),
+                                                        E = Ranges.end();
+       I != E; ++I) {
+    if (!SourceMgr.isBeforeInTranslationUnit(Range.getEnd(), I->getBegin()) &&
+        !SourceMgr.isBeforeInTranslationUnit(I->getEnd(), Range.getBegin()))
+      return true;
+  }
+  return false;
+}
+
+bool AffectedRangeManager::affectsTokenRange(const FormatToken &First,
+                                             const FormatToken &Last,
+                                             bool IncludeLeadingNewlines) {
+  SourceLocation Start = First.WhitespaceRange.getBegin();
+  if (!IncludeLeadingNewlines)
+    Start = Start.getLocWithOffset(First.LastNewlineOffset);
+  SourceLocation End = Last.getStartOfNonWhitespace();
+  End = End.getLocWithOffset(Last.TokenText.size());
+  CharSourceRange Range = CharSourceRange::getCharRange(Start, End);
+  return affectsCharSourceRange(Range);
+}
+
+bool AffectedRangeManager::affectsLeadingEmptyLines(const FormatToken &Tok) {
+  CharSourceRange EmptyLineRange = CharSourceRange::getCharRange(
+      Tok.WhitespaceRange.getBegin(),
+      Tok.WhitespaceRange.getBegin().getLocWithOffset(Tok.LastNewlineOffset));
+  return affectsCharSourceRange(EmptyLineRange);
+}
+
+void AffectedRangeManager::markAllAsAffected(
+    SmallVectorImpl<AnnotatedLine *>::iterator I,
+    SmallVectorImpl<AnnotatedLine *>::iterator E) {
+  while (I != E) {
+    (*I)->Affected = true;
+    markAllAsAffected((*I)->Children.begin(), (*I)->Children.end());
+    ++I;
+  }
+}
+
+bool AffectedRangeManager::nonPPLineAffected(
+    AnnotatedLine *Line, const AnnotatedLine *PreviousLine) {
+  bool SomeLineAffected = false;
+  Line->ChildrenAffected =
+      computeAffectedLines(Line->Children.begin(), Line->Children.end());
+  if (Line->ChildrenAffected)
+    SomeLineAffected = true;
+
+  // Stores whether one of the line's tokens is directly affected.
+  bool SomeTokenAffected = false;
+  // Stores whether we need to look at the leading newlines of the next token
+  // in order to determine whether it was affected.
+  bool IncludeLeadingNewlines = false;
+
+  // Stores whether the first child line of any of this line's tokens is
+  // affected.
+  bool SomeFirstChildAffected = false;
+
+  for (FormatToken *Tok = Line->First; Tok; Tok = Tok->Next) {
+    // Determine whether 'Tok' was affected.
+    if (affectsTokenRange(*Tok, *Tok, IncludeLeadingNewlines))
+      SomeTokenAffected = true;
+
+    // Determine whether the first child of 'Tok' was affected.
+    if (!Tok->Children.empty() && Tok->Children.front()->Affected)
+      SomeFirstChildAffected = true;
+
+    IncludeLeadingNewlines = Tok->Children.empty();
+  }
+
+  // Was this line moved, i.e. has it previously been on the same line as an
+  // affected line?
+  bool LineMoved = PreviousLine && PreviousLine->Affected &&
+                   Line->First->NewlinesBefore == 0;
+
+  bool IsContinuedComment =
+      Line->First->is(tok::comment) && Line->First->Next == nullptr &&
+      Line->First->NewlinesBefore < 2 && PreviousLine &&
+      PreviousLine->Affected && PreviousLine->Last->is(tok::comment);
+
+  if (SomeTokenAffected || SomeFirstChildAffected || LineMoved ||
+      IsContinuedComment) {
+    Line->Affected = true;
+    SomeLineAffected = true;
+  }
+  return SomeLineAffected;
+}
+
+} // namespace format
+} // namespace clang
diff --git a/lib/Format/AffectedRangeManager.h b/lib/Format/AffectedRangeManager.h
new file mode 100644
index 0000000..d8d5ee5
--- /dev/null
+++ b/lib/Format/AffectedRangeManager.h
@@ -0,0 +1,67 @@
+//===--- AffectedRangeManager.h - Format C++ code ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief AffectedRangeManager class manages affected ranges in the code.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_AFFECTEDRANGEMANAGER_H
+#define LLVM_CLANG_LIB_FORMAT_AFFECTEDRANGEMANAGER_H
+
+#include "clang/Basic/SourceManager.h"
+
+namespace clang {
+namespace format {
+
+struct FormatToken;
+class AnnotatedLine;
+
+class AffectedRangeManager {
+public:
+  AffectedRangeManager(const SourceManager &SourceMgr,
+                       const ArrayRef<CharSourceRange> Ranges)
+      : SourceMgr(SourceMgr), Ranges(Ranges.begin(), Ranges.end()) {}
+
+  // Determines which lines are affected by the SourceRanges given as input.
+  // Returns \c true if at least one line between I and E or one of their
+  // children is affected.
+  bool computeAffectedLines(SmallVectorImpl<AnnotatedLine *>::iterator I,
+                            SmallVectorImpl<AnnotatedLine *>::iterator E);
+
+  // Returns true if 'Range' intersects with one of the input ranges.
+  bool affectsCharSourceRange(const CharSourceRange &Range);
+
+private:
+  // Returns true if the range from 'First' to 'Last' intersects with one of the
+  // input ranges.
+  bool affectsTokenRange(const FormatToken &First, const FormatToken &Last,
+                         bool IncludeLeadingNewlines);
+
+  // Returns true if one of the input ranges intersects the leading empty lines
+  // before 'Tok'.
+  bool affectsLeadingEmptyLines(const FormatToken &Tok);
+
+  // Marks all lines between I and E as well as all their children as affected.
+  void markAllAsAffected(SmallVectorImpl<AnnotatedLine *>::iterator I,
+                         SmallVectorImpl<AnnotatedLine *>::iterator E);
+
+  // Determines whether 'Line' is affected by the SourceRanges given as input.
+  // Returns \c true if the line or one of its children is affected.
+  bool nonPPLineAffected(AnnotatedLine *Line,
+                         const AnnotatedLine *PreviousLine);
+
+  const SourceManager &SourceMgr;
+  const SmallVector<CharSourceRange, 8> Ranges;
+};
+
+} // namespace format
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_FORMAT_AFFECTEDRANGEMANAGER_H
diff --git a/lib/Format/CMakeLists.txt b/lib/Format/CMakeLists.txt
index 2ce3834..cb46b9f 100644
--- a/lib/Format/CMakeLists.txt
+++ b/lib/Format/CMakeLists.txt
@@ -1,10 +1,14 @@
 set(LLVM_LINK_COMPONENTS support)
 
 add_clang_library(clangFormat
+  AffectedRangeManager.cpp
   BreakableToken.cpp
   ContinuationIndenter.cpp
   Format.cpp
   FormatToken.cpp
+  FormatTokenLexer.cpp
+  SortJavaScriptImports.cpp
+  TokenAnalyzer.cpp
   TokenAnnotator.cpp
   UnwrappedLineFormatter.cpp
   UnwrappedLineParser.cpp
diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp
index 52c8062..f7f0fd5 100644
--- a/lib/Format/ContinuationIndenter.cpp
+++ b/lib/Format/ContinuationIndenter.cpp
@@ -19,7 +19,6 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
 #include "llvm/Support/Debug.h"
-#include <string>
 
 #define DEBUG_TYPE "format-formatter"
 
@@ -64,7 +63,7 @@
 
 ContinuationIndenter::ContinuationIndenter(const FormatStyle &Style,
                                            const AdditionalKeywords &Keywords,
-                                           SourceManager &SourceMgr,
+                                           const SourceManager &SourceMgr,
                                            WhitespaceManager &Whitespaces,
                                            encoding::Encoding Encoding,
                                            bool BinPackInconclusiveFunctions)
@@ -151,6 +150,7 @@
     return true;
   if ((startsNextParameter(Current, Style) || Previous.is(tok::semi) ||
        (Previous.is(TT_TemplateCloser) && Current.is(TT_StartOfName) &&
+        Style.Language == FormatStyle::LK_Cpp &&
         // FIXME: This is a temporary workaround for the case where clang-format
         // sets BreakBeforeParameter to avoid bin packing and this creates a
         // completely unnecessary line break after a template type that isn't
@@ -355,7 +355,17 @@
       Previous.isOneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) &&
       State.Column > getNewLineColumn(State) &&
       (!Previous.Previous ||
-       !Previous.Previous->isOneOf(tok::kw_for, tok::kw_while, tok::kw_switch)))
+       !Previous.Previous->isOneOf(tok::kw_for, tok::kw_while,
+                                   tok::kw_switch)) &&
+      // Don't do this for simple (no expressions) one-argument function calls
+      // as that feels like needlessly wasting whitespace, e.g.:
+      //
+      //   caaaaaaaaaaaall(
+      //       caaaaaaaaaaaall(
+      //           caaaaaaaaaaaall(
+      //               caaaaaaaaaaaaaaaaaaaaaaall(aaaaaaaaaaaaaa, aaaaaaaaa))));
+      Current.FakeLParens.size() > 0 &&
+      Current.FakeLParens.back() > prec::Unknown)
     State.Stack.back().NoLineBreak = true;
 
   if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign &&
@@ -465,10 +475,13 @@
   //     // code
   //   }
   //
-  // is common and should be formatted like a free-standing function.
-  if (Style.Language != FormatStyle::LK_JavaScript ||
-      Current.NestingLevel != 0 || !PreviousNonComment->is(tok::equal) ||
-      !Current.is(Keywords.kw_function))
+  // is common and should be formatted like a free-standing function. The same
+  // goes for wrapping before the lambda return type arrow.
+  if (!Current.is(TT_LambdaArrow) &&
+      (Style.Language != FormatStyle::LK_JavaScript ||
+       Current.NestingLevel != 0 || !PreviousNonComment ||
+       !PreviousNonComment->is(tok::equal) ||
+       !Current.isOneOf(Keywords.kw_async, Keywords.kw_function)))
     State.Stack.back().NestedBlockIndent = State.Column;
 
   if (NextNonComment->isMemberAccess()) {
@@ -846,7 +859,7 @@
     // there is a line-break right after the operator.
     // Exclude relational operators, as there, it is always more desirable to
     // have the LHS 'left' of the RHS.
-    if (Previous && Previous->getPrecedence() > prec::Assignment &&
+    if (Previous && Previous->getPrecedence() != prec::Assignment &&
         Previous->isOneOf(TT_BinaryOperator, TT_ConditionalExpr) &&
         Previous->getPrecedence() != prec::Relational) {
       bool BreakBeforeOperator =
@@ -1034,6 +1047,9 @@
 
 unsigned ContinuationIndenter::addMultilineToken(const FormatToken &Current,
                                                  LineState &State) {
+  if (!Current.IsMultiline)
+    return 0;
+
   // Break before further function parameters on all levels.
   for (unsigned i = 0, e = State.Stack.size(); i != e; ++i)
     State.Stack[i].BreakBeforeParameter = true;
@@ -1113,10 +1129,10 @@
     } else {
       return 0;
     }
-  } else if (Current.is(TT_BlockComment) && Current.isTrailingComment()) {
-    if (!Style.ReflowComments ||
+  } else if (Current.is(TT_BlockComment)) {
+    if (!Current.isTrailingComment() || !Style.ReflowComments ||
         CommentPragmasRegex.match(Current.TokenText.substr(2)))
-      return 0;
+      return addMultilineToken(Current, State);
     Token.reset(new BreakableBlockComment(
         Current, State.Line->Level, StartColumn, Current.OriginalColumn,
         !Current.Previous, State.Line->InPPDirective, Encoding, Style));
diff --git a/lib/Format/ContinuationIndenter.h b/lib/Format/ContinuationIndenter.h
index 9b9154e..21ad653 100644
--- a/lib/Format/ContinuationIndenter.h
+++ b/lib/Format/ContinuationIndenter.h
@@ -38,7 +38,8 @@
   /// column \p FirstIndent.
   ContinuationIndenter(const FormatStyle &Style,
                        const AdditionalKeywords &Keywords,
-                       SourceManager &SourceMgr, WhitespaceManager &Whitespaces,
+                       const SourceManager &SourceMgr,
+                       WhitespaceManager &Whitespaces,
                        encoding::Encoding Encoding,
                        bool BinPackInconclusiveFunctions);
 
@@ -137,7 +138,7 @@
 
   FormatStyle Style;
   const AdditionalKeywords &Keywords;
-  SourceManager &SourceMgr;
+  const SourceManager &SourceMgr;
   WhitespaceManager &Whitespaces;
   encoding::Encoding Encoding;
   bool BinPackInconclusiveFunctions;
diff --git a/lib/Format/Encoding.h b/lib/Format/Encoding.h
index 592d720..148f7fd 100644
--- a/lib/Format/Encoding.h
+++ b/lib/Format/Encoding.h
@@ -17,6 +17,7 @@
 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
 
 #include "clang/Basic/LLVM.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Unicode.h"
 
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index acd520e..b7d3c55 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -14,7 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Format/Format.h"
+#include "AffectedRangeManager.h"
 #include "ContinuationIndenter.h"
+#include "FormatTokenLexer.h"
+#include "SortJavaScriptImports.h"
+#include "TokenAnalyzer.h"
 #include "TokenAnnotator.h"
 #include "UnwrappedLineFormatter.h"
 #include "UnwrappedLineParser.h"
@@ -22,6 +26,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/VirtualFileSystem.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Allocator.h"
@@ -29,7 +34,8 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/YAMLTraits.h"
-#include <queue>
+#include <algorithm>
+#include <memory>
 #include <string>
 
 #define DEBUG_TYPE "format-formatter"
@@ -68,6 +74,16 @@
     IO.enumCase(Value, "Always", FormatStyle::UT_Always);
     IO.enumCase(Value, "true", FormatStyle::UT_Always);
     IO.enumCase(Value, "ForIndentation", FormatStyle::UT_ForIndentation);
+    IO.enumCase(Value, "ForContinuationAndIndentation",
+                FormatStyle::UT_ForContinuationAndIndentation);
+  }
+};
+
+template <> struct ScalarEnumerationTraits<FormatStyle::JavaScriptQuoteStyle> {
+  static void enumeration(IO &IO, FormatStyle::JavaScriptQuoteStyle &Value) {
+    IO.enumCase(Value, "Leave", FormatStyle::JSQS_Leave);
+    IO.enumCase(Value, "Single", FormatStyle::JSQS_Single);
+    IO.enumCase(Value, "Double", FormatStyle::JSQS_Double);
   }
 };
 
@@ -292,10 +308,13 @@
                    Style.ExperimentalAutoDetectBinPacking);
     IO.mapOptional("ForEachMacros", Style.ForEachMacros);
     IO.mapOptional("IncludeCategories", Style.IncludeCategories);
+    IO.mapOptional("IncludeIsMainRegex", Style.IncludeIsMainRegex);
     IO.mapOptional("IndentCaseLabels", Style.IndentCaseLabels);
     IO.mapOptional("IndentWidth", Style.IndentWidth);
     IO.mapOptional("IndentWrappedFunctionNames",
                    Style.IndentWrappedFunctionNames);
+    IO.mapOptional("JavaScriptQuotes", Style.JavaScriptQuotes);
+    IO.mapOptional("JavaScriptWrapImports", Style.JavaScriptWrapImports);
     IO.mapOptional("KeepEmptyLinesAtTheStartOfBlocks",
                    Style.KeepEmptyLinesAtTheStartOfBlocks);
     IO.mapOptional("MacroBlockBegin", Style.MacroBlockBegin);
@@ -319,6 +338,7 @@
     IO.mapOptional("ReflowComments", Style.ReflowComments);
     IO.mapOptional("SortIncludes", Style.SortIncludes);
     IO.mapOptional("SpaceAfterCStyleCast", Style.SpaceAfterCStyleCast);
+    IO.mapOptional("SpaceAfterTemplateKeyword", Style.SpaceAfterTemplateKeyword);
     IO.mapOptional("SpaceBeforeAssignmentOperators",
                    Style.SpaceBeforeAssignmentOperators);
     IO.mapOptional("SpaceBeforeParens", Style.SpaceBeforeParens);
@@ -508,9 +528,12 @@
   LLVMStyle.IncludeCategories = {{"^\"(llvm|llvm-c|clang|clang-c)/", 2},
                                  {"^(<|\"(gtest|isl|json)/)", 3},
                                  {".*", 1}};
+  LLVMStyle.IncludeIsMainRegex = "$";
   LLVMStyle.IndentCaseLabels = false;
   LLVMStyle.IndentWrappedFunctionNames = false;
   LLVMStyle.IndentWidth = 2;
+  LLVMStyle.JavaScriptQuotes = FormatStyle::JSQS_Leave;
+  LLVMStyle.JavaScriptWrapImports = true;
   LLVMStyle.TabWidth = 8;
   LLVMStyle.MaxEmptyLinesToKeep = 1;
   LLVMStyle.KeepEmptyLinesAtTheStartOfBlocks = true;
@@ -522,6 +545,7 @@
   LLVMStyle.SpacesBeforeTrailingComments = 1;
   LLVMStyle.Standard = FormatStyle::LS_Cpp11;
   LLVMStyle.UseTab = FormatStyle::UT_Never;
+  LLVMStyle.JavaScriptQuotes = FormatStyle::JSQS_Leave;
   LLVMStyle.ReflowComments = true;
   LLVMStyle.SpacesInParentheses = false;
   LLVMStyle.SpacesInSquareBrackets = false;
@@ -529,6 +553,7 @@
   LLVMStyle.SpacesInContainerLiterals = true;
   LLVMStyle.SpacesInCStyleCastParentheses = false;
   LLVMStyle.SpaceAfterCStyleCast = false;
+  LLVMStyle.SpaceAfterTemplateKeyword = true;
   LLVMStyle.SpaceBeforeParens = FormatStyle::SBPO_ControlStatements;
   LLVMStyle.SpaceBeforeAssignmentOperators = true;
   LLVMStyle.SpacesInAngles = false;
@@ -559,6 +584,7 @@
   GoogleStyle.ConstructorInitializerAllOnOneLineOrOnePerLine = true;
   GoogleStyle.DerivePointerAlignment = true;
   GoogleStyle.IncludeCategories = {{"^<.*\\.h>", 1}, {"^<.*", 2}, {".*", 3}};
+  GoogleStyle.IncludeIsMainRegex = "([-_](test|unittest))?$";
   GoogleStyle.IndentCaseLabels = true;
   GoogleStyle.KeepEmptyLinesAtTheStartOfBlocks = false;
   GoogleStyle.ObjCSpaceAfterProperty = false;
@@ -587,9 +613,12 @@
     GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
     GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
     GoogleStyle.BreakBeforeTernaryOperators = false;
-    GoogleStyle.CommentPragmas = "@(export|visibility) {";
+    GoogleStyle.CommentPragmas = "@(export|requirecss|return|see|visibility) ";
     GoogleStyle.MaxEmptyLinesToKeep = 3;
+    GoogleStyle.NamespaceIndentation = FormatStyle::NI_All;
     GoogleStyle.SpacesInContainerLiterals = false;
+    GoogleStyle.JavaScriptQuotes = FormatStyle::JSQS_Single;
+    GoogleStyle.JavaScriptWrapImports = false;
   } else if (Language == FormatStyle::LK_Proto) {
     GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
     GoogleStyle.SpacesInContainerLiterals = false;
@@ -636,6 +665,7 @@
   MozillaStyle.ObjCSpaceBeforeProtocolList = false;
   MozillaStyle.PenaltyReturnTypeOnItsOwnLine = 200;
   MozillaStyle.PointerAlignment = FormatStyle::PAS_Left;
+  MozillaStyle.SpaceAfterTemplateKeyword = false;
   return MozillaStyle;
 }
 
@@ -763,734 +793,35 @@
 
 namespace {
 
-class FormatTokenLexer {
+class Formatter : public TokenAnalyzer {
 public:
-  FormatTokenLexer(SourceManager &SourceMgr, FileID ID, FormatStyle &Style,
-                   encoding::Encoding Encoding)
-      : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false),
-        LessStashed(false), Column(0), TrailingWhitespace(0),
-        SourceMgr(SourceMgr), ID(ID), Style(Style),
-        IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable),
-        Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false),
-        MacroBlockBeginRegex(Style.MacroBlockBegin),
-        MacroBlockEndRegex(Style.MacroBlockEnd) {
-    Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
-                        getFormattingLangOpts(Style)));
-    Lex->SetKeepWhitespaceMode(true);
+  Formatter(const Environment &Env, const FormatStyle &Style,
+            bool *IncompleteFormat)
+      : TokenAnalyzer(Env, Style), IncompleteFormat(IncompleteFormat) {}
 
-    for (const std::string &ForEachMacro : Style.ForEachMacros)
-      ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
-    std::sort(ForEachMacros.begin(), ForEachMacros.end());
-  }
-
-  ArrayRef<FormatToken *> lex() {
-    assert(Tokens.empty());
-    assert(FirstInLineIndex == 0);
-    do {
-      Tokens.push_back(getNextToken());
-      if (Style.Language == FormatStyle::LK_JavaScript)
-        tryParseJSRegexLiteral();
-      tryMergePreviousTokens();
-      if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
-        FirstInLineIndex = Tokens.size() - 1;
-    } while (Tokens.back()->Tok.isNot(tok::eof));
-    return Tokens;
-  }
-
-  const AdditionalKeywords &getKeywords() { return Keywords; }
-
-private:
-  void tryMergePreviousTokens() {
-    if (tryMerge_TMacro())
-      return;
-    if (tryMergeConflictMarkers())
-      return;
-    if (tryMergeLessLess())
-      return;
-
-    if (Style.Language == FormatStyle::LK_JavaScript) {
-      if (tryMergeTemplateString())
-        return;
-
-      static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
-      static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
-                                                     tok::equal};
-      static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
-                                                    tok::greaterequal};
-      static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
-      // FIXME: Investigate what token type gives the correct operator priority.
-      if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
-        return;
-      if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
-        return;
-      if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
-        return;
-      if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
-        return;
-    }
-  }
-
-  bool tryMergeLessLess() {
-    // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
-    if (Tokens.size() < 3)
-      return false;
-
-    bool FourthTokenIsLess = false;
-    if (Tokens.size() > 3)
-      FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
-
-    auto First = Tokens.end() - 3;
-    if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
-        First[0]->isNot(tok::less) || FourthTokenIsLess)
-      return false;
-
-    // Only merge if there currently is no whitespace between the two "<".
-    if (First[1]->WhitespaceRange.getBegin() !=
-        First[1]->WhitespaceRange.getEnd())
-      return false;
-
-    First[0]->Tok.setKind(tok::lessless);
-    First[0]->TokenText = "<<";
-    First[0]->ColumnWidth += 1;
-    Tokens.erase(Tokens.end() - 2);
-    return true;
-  }
-
-  bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType) {
-    if (Tokens.size() < Kinds.size())
-      return false;
-
-    SmallVectorImpl<FormatToken *>::const_iterator First =
-        Tokens.end() - Kinds.size();
-    if (!First[0]->is(Kinds[0]))
-      return false;
-    unsigned AddLength = 0;
-    for (unsigned i = 1; i < Kinds.size(); ++i) {
-      if (!First[i]->is(Kinds[i]) ||
-          First[i]->WhitespaceRange.getBegin() !=
-              First[i]->WhitespaceRange.getEnd())
-        return false;
-      AddLength += First[i]->TokenText.size();
-    }
-    Tokens.resize(Tokens.size() - Kinds.size() + 1);
-    First[0]->TokenText = StringRef(First[0]->TokenText.data(),
-                                    First[0]->TokenText.size() + AddLength);
-    First[0]->ColumnWidth += AddLength;
-    First[0]->Type = NewType;
-    return true;
-  }
-
-  // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
-  bool precedesOperand(FormatToken *Tok) {
-    // NB: This is not entirely correct, as an r_paren can introduce an operand
-    // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
-    // corner case to not matter in practice, though.
-    return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
-                        tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
-                        tok::colon, tok::question, tok::tilde) ||
-           Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
-                        tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
-                        tok::kw_typeof, Keywords.kw_instanceof,
-                        Keywords.kw_in) ||
-           Tok->isBinaryOperator();
-  }
-
-  bool canPrecedeRegexLiteral(FormatToken *Prev) {
-    if (!Prev)
-      return true;
-
-    // Regex literals can only follow after prefix unary operators, not after
-    // postfix unary operators. If the '++' is followed by a non-operand
-    // introducing token, the slash here is the operand and not the start of a
-    // regex.
-    if (Prev->isOneOf(tok::plusplus, tok::minusminus))
-      return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
-
-    // The previous token must introduce an operand location where regex
-    // literals can occur.
-    if (!precedesOperand(Prev))
-      return false;
-
-    return true;
-  }
-
-  // Tries to parse a JavaScript Regex literal starting at the current token,
-  // if that begins with a slash and is in a location where JavaScript allows
-  // regex literals. Changes the current token to a regex literal and updates
-  // its text if successful.
-  void tryParseJSRegexLiteral() {
-    FormatToken *RegexToken = Tokens.back();
-    if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
-      return;
-
-    FormatToken *Prev = nullptr;
-    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
-      // NB: Because previous pointers are not initialized yet, this cannot use
-      // Token.getPreviousNonComment.
-      if ((*I)->isNot(tok::comment)) {
-        Prev = *I;
-        break;
-      }
-    }
-
-    if (!canPrecedeRegexLiteral(Prev))
-      return;
-
-    // 'Manually' lex ahead in the current file buffer.
-    const char *Offset = Lex->getBufferLocation();
-    const char *RegexBegin = Offset - RegexToken->TokenText.size();
-    StringRef Buffer = Lex->getBuffer();
-    bool InCharacterClass = false;
-    bool HaveClosingSlash = false;
-    for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
-      // Regular expressions are terminated with a '/', which can only be
-      // escaped using '\' or a character class between '[' and ']'.
-      // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
-      switch (*Offset) {
-      case '\\':
-        // Skip the escaped character.
-        ++Offset;
-        break;
-      case '[':
-        InCharacterClass = true;
-        break;
-      case ']':
-        InCharacterClass = false;
-        break;
-      case '/':
-        if (!InCharacterClass)
-          HaveClosingSlash = true;
-        break;
-      }
-    }
-
-    RegexToken->Type = TT_RegexLiteral;
-    // Treat regex literals like other string_literals.
-    RegexToken->Tok.setKind(tok::string_literal);
-    RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
-    RegexToken->ColumnWidth = RegexToken->TokenText.size();
-
-    resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
-  }
-
-  bool tryMergeTemplateString() {
-    if (Tokens.size() < 2)
-      return false;
-
-    FormatToken *EndBacktick = Tokens.back();
-    // Backticks get lexed as tok::unknown tokens. If a template string contains
-    // a comment start, it gets lexed as a tok::comment, or tok::unknown if
-    // unterminated.
-    if (!EndBacktick->isOneOf(tok::comment, tok::string_literal,
-                              tok::char_constant, tok::unknown))
-      return false;
-    size_t CommentBacktickPos = EndBacktick->TokenText.find('`');
-    // Unknown token that's not actually a backtick, or a comment that doesn't
-    // contain a backtick.
-    if (CommentBacktickPos == StringRef::npos)
-      return false;
-
-    unsigned TokenCount = 0;
-    bool IsMultiline = false;
-    unsigned EndColumnInFirstLine =
-        EndBacktick->OriginalColumn + EndBacktick->ColumnWidth;
-    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; I++) {
-      ++TokenCount;
-      if (I[0]->IsMultiline)
-        IsMultiline = true;
-
-      // If there was a preceding template string, this must be the start of a
-      // template string, not the end.
-      if (I[0]->is(TT_TemplateString))
-        return false;
-
-      if (I[0]->isNot(tok::unknown) || I[0]->TokenText != "`") {
-        // Keep track of the rhs offset of the last token to wrap across lines -
-        // its the rhs offset of the first line of the template string, used to
-        // determine its width.
-        if (I[0]->IsMultiline)
-          EndColumnInFirstLine = I[0]->OriginalColumn + I[0]->ColumnWidth;
-        // If the token has newlines, the token before it (if it exists) is the
-        // rhs end of the previous line.
-        if (I[0]->NewlinesBefore > 0 && (I + 1 != E)) {
-          EndColumnInFirstLine = I[1]->OriginalColumn + I[1]->ColumnWidth;
-          IsMultiline = true;
-        }
-        continue;
-      }
-
-      Tokens.resize(Tokens.size() - TokenCount);
-      Tokens.back()->Type = TT_TemplateString;
-      const char *EndOffset =
-          EndBacktick->TokenText.data() + 1 + CommentBacktickPos;
-      if (CommentBacktickPos != 0) {
-        // If the backtick was not the first character (e.g. in a comment),
-        // re-lex after the backtick position.
-        SourceLocation Loc = EndBacktick->Tok.getLocation();
-        resetLexer(SourceMgr.getFileOffset(Loc) + CommentBacktickPos + 1);
-      }
-      Tokens.back()->TokenText =
-          StringRef(Tokens.back()->TokenText.data(),
-                    EndOffset - Tokens.back()->TokenText.data());
-
-      unsigned EndOriginalColumn = EndBacktick->OriginalColumn;
-      if (EndOriginalColumn == 0) {
-        SourceLocation Loc = EndBacktick->Tok.getLocation();
-        EndOriginalColumn = SourceMgr.getSpellingColumnNumber(Loc);
-      }
-      // If the ` is further down within the token (e.g. in a comment).
-      EndOriginalColumn += CommentBacktickPos;
-
-      if (IsMultiline) {
-        // ColumnWidth is from backtick to last token in line.
-        // LastLineColumnWidth is 0 to backtick.
-        // x = `some content
-        //     until here`;
-        Tokens.back()->ColumnWidth =
-            EndColumnInFirstLine - Tokens.back()->OriginalColumn;
-        // +1 for the ` itself.
-        Tokens.back()->LastLineColumnWidth = EndOriginalColumn + 1;
-        Tokens.back()->IsMultiline = true;
-      } else {
-        // Token simply spans from start to end, +1 for the ` itself.
-        Tokens.back()->ColumnWidth =
-            EndOriginalColumn - Tokens.back()->OriginalColumn + 1;
-      }
-      return true;
-    }
-    return false;
-  }
-
-  bool tryMerge_TMacro() {
-    if (Tokens.size() < 4)
-      return false;
-    FormatToken *Last = Tokens.back();
-    if (!Last->is(tok::r_paren))
-      return false;
-
-    FormatToken *String = Tokens[Tokens.size() - 2];
-    if (!String->is(tok::string_literal) || String->IsMultiline)
-      return false;
-
-    if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
-      return false;
-
-    FormatToken *Macro = Tokens[Tokens.size() - 4];
-    if (Macro->TokenText != "_T")
-      return false;
-
-    const char *Start = Macro->TokenText.data();
-    const char *End = Last->TokenText.data() + Last->TokenText.size();
-    String->TokenText = StringRef(Start, End - Start);
-    String->IsFirst = Macro->IsFirst;
-    String->LastNewlineOffset = Macro->LastNewlineOffset;
-    String->WhitespaceRange = Macro->WhitespaceRange;
-    String->OriginalColumn = Macro->OriginalColumn;
-    String->ColumnWidth = encoding::columnWidthWithTabs(
-        String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
-    String->NewlinesBefore = Macro->NewlinesBefore;
-    String->HasUnescapedNewline = Macro->HasUnescapedNewline;
-
-    Tokens.pop_back();
-    Tokens.pop_back();
-    Tokens.pop_back();
-    Tokens.back() = String;
-    return true;
-  }
-
-  bool tryMergeConflictMarkers() {
-    if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
-      return false;
-
-    // Conflict lines look like:
-    // <marker> <text from the vcs>
-    // For example:
-    // >>>>>>> /file/in/file/system at revision 1234
-    //
-    // We merge all tokens in a line that starts with a conflict marker
-    // into a single token with a special token type that the unwrapped line
-    // parser will use to correctly rebuild the underlying code.
-
-    FileID ID;
-    // Get the position of the first token in the line.
-    unsigned FirstInLineOffset;
-    std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
-        Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
-    StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
-    // Calculate the offset of the start of the current line.
-    auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
-    if (LineOffset == StringRef::npos) {
-      LineOffset = 0;
-    } else {
-      ++LineOffset;
-    }
-
-    auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
-    StringRef LineStart;
-    if (FirstSpace == StringRef::npos) {
-      LineStart = Buffer.substr(LineOffset);
-    } else {
-      LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
-    }
-
-    TokenType Type = TT_Unknown;
-    if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
-      Type = TT_ConflictStart;
-    } else if (LineStart == "|||||||" || LineStart == "=======" ||
-               LineStart == "====") {
-      Type = TT_ConflictAlternative;
-    } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
-      Type = TT_ConflictEnd;
-    }
-
-    if (Type != TT_Unknown) {
-      FormatToken *Next = Tokens.back();
-
-      Tokens.resize(FirstInLineIndex + 1);
-      // We do not need to build a complete token here, as we will skip it
-      // during parsing anyway (as we must not touch whitespace around conflict
-      // markers).
-      Tokens.back()->Type = Type;
-      Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
-
-      Tokens.push_back(Next);
-      return true;
-    }
-
-    return false;
-  }
-
-  FormatToken *getStashedToken() {
-    // Create a synthesized second '>' or '<' token.
-    Token Tok = FormatTok->Tok;
-    StringRef TokenText = FormatTok->TokenText;
-
-    unsigned OriginalColumn = FormatTok->OriginalColumn;
-    FormatTok = new (Allocator.Allocate()) FormatToken;
-    FormatTok->Tok = Tok;
-    SourceLocation TokLocation =
-        FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
-    FormatTok->Tok.setLocation(TokLocation);
-    FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
-    FormatTok->TokenText = TokenText;
-    FormatTok->ColumnWidth = 1;
-    FormatTok->OriginalColumn = OriginalColumn + 1;
-
-    return FormatTok;
-  }
-
-  FormatToken *getNextToken() {
-    if (GreaterStashed) {
-      GreaterStashed = false;
-      return getStashedToken();
-    }
-    if (LessStashed) {
-      LessStashed = false;
-      return getStashedToken();
-    }
-
-    FormatTok = new (Allocator.Allocate()) FormatToken;
-    readRawToken(*FormatTok);
-    SourceLocation WhitespaceStart =
-        FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
-    FormatTok->IsFirst = IsFirstToken;
-    IsFirstToken = false;
-
-    // Consume and record whitespace until we find a significant token.
-    unsigned WhitespaceLength = TrailingWhitespace;
-    while (FormatTok->Tok.is(tok::unknown)) {
-      StringRef Text = FormatTok->TokenText;
-      auto EscapesNewline = [&](int pos) {
-        // A '\r' here is just part of '\r\n'. Skip it.
-        if (pos >= 0 && Text[pos] == '\r')
-          --pos;
-        // See whether there is an odd number of '\' before this.
-        unsigned count = 0;
-        for (; pos >= 0; --pos, ++count)
-          if (Text[pos] != '\\')
-            break;
-        return count & 1;
-      };
-      // FIXME: This miscounts tok:unknown tokens that are not just
-      // whitespace, e.g. a '`' character.
-      for (int i = 0, e = Text.size(); i != e; ++i) {
-        switch (Text[i]) {
-        case '\n':
-          ++FormatTok->NewlinesBefore;
-          FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
-          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
-          Column = 0;
-          break;
-        case '\r':
-          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
-          Column = 0;
-          break;
-        case '\f':
-        case '\v':
-          Column = 0;
-          break;
-        case ' ':
-          ++Column;
-          break;
-        case '\t':
-          Column += Style.TabWidth - Column % Style.TabWidth;
-          break;
-        case '\\':
-          if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
-            FormatTok->Type = TT_ImplicitStringLiteral;
-          break;
-        default:
-          FormatTok->Type = TT_ImplicitStringLiteral;
-          break;
-        }
-        if (FormatTok->Type == TT_ImplicitStringLiteral)
-          break;
-      }
-
-      if (FormatTok->is(TT_ImplicitStringLiteral))
-        break;
-      WhitespaceLength += FormatTok->Tok.getLength();
-
-      readRawToken(*FormatTok);
-    }
-
-    // In case the token starts with escaped newlines, we want to
-    // take them into account as whitespace - this pattern is quite frequent
-    // in macro definitions.
-    // FIXME: Add a more explicit test.
-    while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
-           FormatTok->TokenText[1] == '\n') {
-      ++FormatTok->NewlinesBefore;
-      WhitespaceLength += 2;
-      FormatTok->LastNewlineOffset = 2;
-      Column = 0;
-      FormatTok->TokenText = FormatTok->TokenText.substr(2);
-    }
-
-    FormatTok->WhitespaceRange = SourceRange(
-        WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
-
-    FormatTok->OriginalColumn = Column;
-
-    TrailingWhitespace = 0;
-    if (FormatTok->Tok.is(tok::comment)) {
-      // FIXME: Add the trimmed whitespace to Column.
-      StringRef UntrimmedText = FormatTok->TokenText;
-      FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
-      TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
-    } else if (FormatTok->Tok.is(tok::raw_identifier)) {
-      IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
-      FormatTok->Tok.setIdentifierInfo(&Info);
-      FormatTok->Tok.setKind(Info.getTokenID());
-      if (Style.Language == FormatStyle::LK_Java &&
-          FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
-                             tok::kw_operator)) {
-        FormatTok->Tok.setKind(tok::identifier);
-        FormatTok->Tok.setIdentifierInfo(nullptr);
-      } else if (Style.Language == FormatStyle::LK_JavaScript &&
-                 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
-                                    tok::kw_operator)) {
-        FormatTok->Tok.setKind(tok::identifier);
-        FormatTok->Tok.setIdentifierInfo(nullptr);
-      }
-    } else if (FormatTok->Tok.is(tok::greatergreater)) {
-      FormatTok->Tok.setKind(tok::greater);
-      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
-      GreaterStashed = true;
-    } else if (FormatTok->Tok.is(tok::lessless)) {
-      FormatTok->Tok.setKind(tok::less);
-      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
-      LessStashed = true;
-    }
-
-    // Now FormatTok is the next non-whitespace token.
-
-    StringRef Text = FormatTok->TokenText;
-    size_t FirstNewlinePos = Text.find('\n');
-    if (FirstNewlinePos == StringRef::npos) {
-      // FIXME: ColumnWidth actually depends on the start column, we need to
-      // take this into account when the token is moved.
-      FormatTok->ColumnWidth =
-          encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
-      Column += FormatTok->ColumnWidth;
-    } else {
-      FormatTok->IsMultiline = true;
-      // FIXME: ColumnWidth actually depends on the start column, we need to
-      // take this into account when the token is moved.
-      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
-          Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
-
-      // The last line of the token always starts in column 0.
-      // Thus, the length can be precomputed even in the presence of tabs.
-      FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
-          Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
-          Encoding);
-      Column = FormatTok->LastLineColumnWidth;
-    }
-
-    if (Style.Language == FormatStyle::LK_Cpp) {
-      if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
-            Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
-                tok::pp_define) &&
-          std::find(ForEachMacros.begin(), ForEachMacros.end(),
-                    FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
-        FormatTok->Type = TT_ForEachMacro;
-      } else if (FormatTok->is(tok::identifier)) {
-        if (MacroBlockBeginRegex.match(Text)) {
-          FormatTok->Type = TT_MacroBlockBegin;
-        } else if (MacroBlockEndRegex.match(Text)) {
-          FormatTok->Type = TT_MacroBlockEnd;
-        }
-      }
-    }
-
-    return FormatTok;
-  }
-
-  FormatToken *FormatTok;
-  bool IsFirstToken;
-  bool GreaterStashed, LessStashed;
-  unsigned Column;
-  unsigned TrailingWhitespace;
-  std::unique_ptr<Lexer> Lex;
-  SourceManager &SourceMgr;
-  FileID ID;
-  FormatStyle &Style;
-  IdentifierTable IdentTable;
-  AdditionalKeywords Keywords;
-  encoding::Encoding Encoding;
-  llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
-  // Index (in 'Tokens') of the last token that starts a new line.
-  unsigned FirstInLineIndex;
-  SmallVector<FormatToken *, 16> Tokens;
-  SmallVector<IdentifierInfo *, 8> ForEachMacros;
-
-  bool FormattingDisabled;
-
-  llvm::Regex MacroBlockBeginRegex;
-  llvm::Regex MacroBlockEndRegex;
-
-  void readRawToken(FormatToken &Tok) {
-    Lex->LexFromRawLexer(Tok.Tok);
-    Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
-                              Tok.Tok.getLength());
-    // For formatting, treat unterminated string literals like normal string
-    // literals.
-    if (Tok.is(tok::unknown)) {
-      if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
-        Tok.Tok.setKind(tok::string_literal);
-        Tok.IsUnterminatedLiteral = true;
-      } else if (Style.Language == FormatStyle::LK_JavaScript &&
-                 Tok.TokenText == "''") {
-        Tok.Tok.setKind(tok::char_constant);
-      }
-    }
-
-    if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
-                                 Tok.TokenText == "/* clang-format on */")) {
-      FormattingDisabled = false;
-    }
-
-    Tok.Finalized = FormattingDisabled;
-
-    if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
-                                 Tok.TokenText == "/* clang-format off */")) {
-      FormattingDisabled = true;
-    }
-  }
-
-  void resetLexer(unsigned Offset) {
-    StringRef Buffer = SourceMgr.getBufferData(ID);
-    Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
-                        getFormattingLangOpts(Style), Buffer.begin(),
-                        Buffer.begin() + Offset, Buffer.end()));
-    Lex->SetKeepWhitespaceMode(true);
-    TrailingWhitespace = 0;
-  }
-};
-
-static StringRef getLanguageName(FormatStyle::LanguageKind Language) {
-  switch (Language) {
-  case FormatStyle::LK_Cpp:
-    return "C++";
-  case FormatStyle::LK_Java:
-    return "Java";
-  case FormatStyle::LK_JavaScript:
-    return "JavaScript";
-  case FormatStyle::LK_Proto:
-    return "Proto";
-  default:
-    return "Unknown";
-  }
-}
-
-class Formatter : public UnwrappedLineConsumer {
-public:
-  Formatter(const FormatStyle &Style, SourceManager &SourceMgr, FileID ID,
-            ArrayRef<CharSourceRange> Ranges)
-      : Style(Style), ID(ID), SourceMgr(SourceMgr),
-        Whitespaces(SourceMgr, Style,
-                    inputUsesCRLF(SourceMgr.getBufferData(ID))),
-        Ranges(Ranges.begin(), Ranges.end()), UnwrappedLines(1),
-        Encoding(encoding::detectEncoding(SourceMgr.getBufferData(ID))) {
-    DEBUG(llvm::dbgs() << "File encoding: "
-                       << (Encoding == encoding::Encoding_UTF8 ? "UTF8"
-                                                               : "unknown")
-                       << "\n");
-    DEBUG(llvm::dbgs() << "Language: " << getLanguageName(Style.Language)
-                       << "\n");
-  }
-
-  tooling::Replacements format(bool *IncompleteFormat) {
-    tooling::Replacements Result;
-    FormatTokenLexer Tokens(SourceMgr, ID, Style, Encoding);
-
-    UnwrappedLineParser Parser(Style, Tokens.getKeywords(), Tokens.lex(),
-                               *this);
-    Parser.parse();
-    assert(UnwrappedLines.rbegin()->empty());
-    for (unsigned Run = 0, RunE = UnwrappedLines.size(); Run + 1 != RunE;
-         ++Run) {
-      DEBUG(llvm::dbgs() << "Run " << Run << "...\n");
-      SmallVector<AnnotatedLine *, 16> AnnotatedLines;
-      for (unsigned i = 0, e = UnwrappedLines[Run].size(); i != e; ++i) {
-        AnnotatedLines.push_back(new AnnotatedLine(UnwrappedLines[Run][i]));
-      }
-      tooling::Replacements RunResult =
-          format(AnnotatedLines, Tokens, IncompleteFormat);
-      DEBUG({
-        llvm::dbgs() << "Replacements for run " << Run << ":\n";
-        for (tooling::Replacements::iterator I = RunResult.begin(),
-                                             E = RunResult.end();
-             I != E; ++I) {
-          llvm::dbgs() << I->toString() << "\n";
-        }
-      });
-      for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
-        delete AnnotatedLines[i];
-      }
-      Result.insert(RunResult.begin(), RunResult.end());
-      Whitespaces.reset();
-    }
-    return Result;
-  }
-
-  tooling::Replacements format(SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
-                               FormatTokenLexer &Tokens,
-                               bool *IncompleteFormat) {
-    TokenAnnotator Annotator(Style, Tokens.getKeywords());
-    for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
-      Annotator.annotate(*AnnotatedLines[i]);
-    }
+  tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &Result) override {
     deriveLocalStyle(AnnotatedLines);
+    AffectedRangeMgr.computeAffectedLines(AnnotatedLines.begin(),
+                                          AnnotatedLines.end());
+
+    if (Style.Language == FormatStyle::LK_JavaScript &&
+        Style.JavaScriptQuotes != FormatStyle::JSQS_Leave)
+      requoteJSStringLiteral(AnnotatedLines, Result);
+
     for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
       Annotator.calculateFormattingInformation(*AnnotatedLines[i]);
     }
-    computeAffectedLines(AnnotatedLines.begin(), AnnotatedLines.end());
 
     Annotator.setCommentLineLevels(AnnotatedLines);
-    ContinuationIndenter Indenter(Style, Tokens.getKeywords(), SourceMgr,
-                                  Whitespaces, Encoding,
+
+    WhitespaceManager Whitespaces(
+        Env.getSourceManager(), Style,
+        inputUsesCRLF(Env.getSourceManager().getBufferData(Env.getFileID())));
+    ContinuationIndenter Indenter(Style, Tokens.getKeywords(),
+                                  Env.getSourceManager(), Whitespaces, Encoding,
                                   BinPackInconclusiveFunctions);
     UnwrappedLineFormatter(&Indenter, &Whitespaces, Style, Tokens.getKeywords(),
                            IncompleteFormat)
@@ -1499,137 +830,85 @@
   }
 
 private:
-  // Determines which lines are affected by the SourceRanges given as input.
-  // Returns \c true if at least one line between I and E or one of their
-  // children is affected.
-  bool computeAffectedLines(SmallVectorImpl<AnnotatedLine *>::iterator I,
-                            SmallVectorImpl<AnnotatedLine *>::iterator E) {
-    bool SomeLineAffected = false;
-    const AnnotatedLine *PreviousLine = nullptr;
-    while (I != E) {
-      AnnotatedLine *Line = *I;
-      Line->LeadingEmptyLinesAffected = affectsLeadingEmptyLines(*Line->First);
-
-      // If a line is part of a preprocessor directive, it needs to be formatted
-      // if any token within the directive is affected.
-      if (Line->InPPDirective) {
-        FormatToken *Last = Line->Last;
-        SmallVectorImpl<AnnotatedLine *>::iterator PPEnd = I + 1;
-        while (PPEnd != E && !(*PPEnd)->First->HasUnescapedNewline) {
-          Last = (*PPEnd)->Last;
-          ++PPEnd;
-        }
-
-        if (affectsTokenRange(*Line->First, *Last,
-                              /*IncludeLeadingNewlines=*/false)) {
-          SomeLineAffected = true;
-          markAllAsAffected(I, PPEnd);
-        }
-        I = PPEnd;
+  // For every string literal on an affected line that uses the non-preferred
+  // quote character, generates replacements that switch it to the preferred
+  // one, re-escaping the contents in the process.
+  void requoteJSStringLiteral(SmallVectorImpl<AnnotatedLine *> &Lines,
+                              tooling::Replacements &Result) {
+    for (AnnotatedLine *Line : Lines) {
+      requoteJSStringLiteral(Line->Children, Result);
+      if (!Line->Affected)
         continue;
+      for (FormatToken *FormatTok = Line->First; FormatTok;
+           FormatTok = FormatTok->Next) {
+        StringRef Input = FormatTok->TokenText;
+        if (FormatTok->Finalized || !FormatTok->isStringLiteral() ||
+            // NB: only literals that start with the quote character being
+            // replaced are requoted; this avoids breaking
+            // `template strings`.
+            (Style.JavaScriptQuotes == FormatStyle::JSQS_Single &&
+             !Input.startswith("\"")) ||
+            (Style.JavaScriptQuotes == FormatStyle::JSQS_Double &&
+             !Input.startswith("\'")))
+          continue;
+
+        // Change start and end quote.
+        bool IsSingle = Style.JavaScriptQuotes == FormatStyle::JSQS_Single;
+        SourceLocation Start = FormatTok->Tok.getLocation();
+        auto Replace = [&](SourceLocation Start, unsigned Length,
+                           StringRef ReplacementText) {
+          auto Err = Result.add(tooling::Replacement(
+              Env.getSourceManager(), Start, Length, ReplacementText));
+          // FIXME: handle error. For now, print error message and skip the
+          // replacement for release version.
+          if (Err)
+            llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+          assert(!Err);
+        };
+        Replace(Start, 1, IsSingle ? "'" : "\"");
+        Replace(FormatTok->Tok.getEndLoc().getLocWithOffset(-1), 1,
+                IsSingle ? "'" : "\"");
+
+        // Escape internal quotes.
+        size_t ColumnWidth = FormatTok->TokenText.size();
+        bool Escaped = false;
+        for (size_t i = 1; i < Input.size() - 1; i++) {
+          switch (Input[i]) {
+          case '\\':
+            if (!Escaped && i + 1 < Input.size() &&
+                ((IsSingle && Input[i + 1] == '"') ||
+                 (!IsSingle && Input[i + 1] == '\''))) {
+              // Remove this \, it's escaping a " or ' that no longer needs
+              // escaping
+              ColumnWidth--;
+              Replace(Start.getLocWithOffset(i), 1, "");
+              continue;
+            }
+            Escaped = !Escaped;
+            break;
+          case '\"':
+          case '\'':
+            if (!Escaped && IsSingle == (Input[i] == '\'')) {
+              // Escape the quote.
+              Replace(Start.getLocWithOffset(i), 0, "\\");
+              ColumnWidth++;
+            }
+            Escaped = false;
+            break;
+          default:
+            Escaped = false;
+            break;
+          }
+        }
+
+        // For formatting, count the number of non-escaped single quotes in them
+        // and adjust ColumnWidth to take the added escapes into account.
+        // FIXME(martinprobst): this might conflict with code breaking a long
+        // string literal (which clang-format doesn't do, yet). For that to
+        // work, this code would have to modify TokenText directly.
+        FormatTok->ColumnWidth = ColumnWidth;
       }
-
-      if (nonPPLineAffected(Line, PreviousLine))
-        SomeLineAffected = true;
-
-      PreviousLine = Line;
-      ++I;
     }
-    return SomeLineAffected;
-  }
-
-  // Determines whether 'Line' is affected by the SourceRanges given as input.
-  // Returns \c true if line or one if its children is affected.
-  bool nonPPLineAffected(AnnotatedLine *Line,
-                         const AnnotatedLine *PreviousLine) {
-    bool SomeLineAffected = false;
-    Line->ChildrenAffected =
-        computeAffectedLines(Line->Children.begin(), Line->Children.end());
-    if (Line->ChildrenAffected)
-      SomeLineAffected = true;
-
-    // Stores whether one of the line's tokens is directly affected.
-    bool SomeTokenAffected = false;
-    // Stores whether we need to look at the leading newlines of the next token
-    // in order to determine whether it was affected.
-    bool IncludeLeadingNewlines = false;
-
-    // Stores whether the first child line of any of this line's tokens is
-    // affected.
-    bool SomeFirstChildAffected = false;
-
-    for (FormatToken *Tok = Line->First; Tok; Tok = Tok->Next) {
-      // Determine whether 'Tok' was affected.
-      if (affectsTokenRange(*Tok, *Tok, IncludeLeadingNewlines))
-        SomeTokenAffected = true;
-
-      // Determine whether the first child of 'Tok' was affected.
-      if (!Tok->Children.empty() && Tok->Children.front()->Affected)
-        SomeFirstChildAffected = true;
-
-      IncludeLeadingNewlines = Tok->Children.empty();
-    }
-
-    // Was this line moved, i.e. has it previously been on the same line as an
-    // affected line?
-    bool LineMoved = PreviousLine && PreviousLine->Affected &&
-                     Line->First->NewlinesBefore == 0;
-
-    bool IsContinuedComment =
-        Line->First->is(tok::comment) && Line->First->Next == nullptr &&
-        Line->First->NewlinesBefore < 2 && PreviousLine &&
-        PreviousLine->Affected && PreviousLine->Last->is(tok::comment);
-
-    if (SomeTokenAffected || SomeFirstChildAffected || LineMoved ||
-        IsContinuedComment) {
-      Line->Affected = true;
-      SomeLineAffected = true;
-    }
-    return SomeLineAffected;
-  }
-
-  // Marks all lines between I and E as well as all their children as affected.
-  void markAllAsAffected(SmallVectorImpl<AnnotatedLine *>::iterator I,
-                         SmallVectorImpl<AnnotatedLine *>::iterator E) {
-    while (I != E) {
-      (*I)->Affected = true;
-      markAllAsAffected((*I)->Children.begin(), (*I)->Children.end());
-      ++I;
-    }
-  }
-
-  // Returns true if the range from 'First' to 'Last' intersects with one of the
-  // input ranges.
-  bool affectsTokenRange(const FormatToken &First, const FormatToken &Last,
-                         bool IncludeLeadingNewlines) {
-    SourceLocation Start = First.WhitespaceRange.getBegin();
-    if (!IncludeLeadingNewlines)
-      Start = Start.getLocWithOffset(First.LastNewlineOffset);
-    SourceLocation End = Last.getStartOfNonWhitespace();
-    End = End.getLocWithOffset(Last.TokenText.size());
-    CharSourceRange Range = CharSourceRange::getCharRange(Start, End);
-    return affectsCharSourceRange(Range);
-  }
-
-  // Returns true if one of the input ranges intersect the leading empty lines
-  // before 'Tok'.
-  bool affectsLeadingEmptyLines(const FormatToken &Tok) {
-    CharSourceRange EmptyLineRange = CharSourceRange::getCharRange(
-        Tok.WhitespaceRange.getBegin(),
-        Tok.WhitespaceRange.getBegin().getLocWithOffset(Tok.LastNewlineOffset));
-    return affectsCharSourceRange(EmptyLineRange);
-  }
-
-  // Returns true if 'Range' intersects with one of the input ranges.
-  bool affectsCharSourceRange(const CharSourceRange &Range) {
-    for (SmallVectorImpl<CharSourceRange>::const_iterator I = Ranges.begin(),
-                                                          E = Ranges.end();
-         I != E; ++I) {
-      if (!SourceMgr.isBeforeInTranslationUnit(Range.getEnd(), I->getBegin()) &&
-          !SourceMgr.isBeforeInTranslationUnit(I->getEnd(), Range.getBegin()))
-        return true;
-    }
-    return false;
   }
 
   static bool inputUsesCRLF(StringRef Text) {
@@ -1638,7 +917,7 @@
 
   bool
   hasCpp03IncompatibleFormat(const SmallVectorImpl<AnnotatedLine *> &Lines) {
-    for (const AnnotatedLine* Line : Lines) {
+    for (const AnnotatedLine *Line : Lines) {
       if (hasCpp03IncompatibleFormat(Line->Children))
         return true;
       for (FormatToken *Tok = Line->First->Next; Tok; Tok = Tok->Next) {
@@ -1656,7 +935,7 @@
 
   int countVariableAlignments(const SmallVectorImpl<AnnotatedLine *> &Lines) {
     int AlignmentDiff = 0;
-    for (const AnnotatedLine* Line : Lines) {
+    for (const AnnotatedLine *Line : Lines) {
       AlignmentDiff += countVariableAlignments(Line->Children);
       for (FormatToken *Tok = Line->First; Tok && Tok->Next; Tok = Tok->Next) {
         if (!Tok->is(TT_PointerOrReference))
@@ -1703,24 +982,225 @@
         HasBinPackedFunction || !HasOnePerLineFunction;
   }
 
-  void consumeUnwrappedLine(const UnwrappedLine &TheLine) override {
-    assert(!UnwrappedLines.empty());
-    UnwrappedLines.back().push_back(TheLine);
-  }
-
-  void finishRun() override {
-    UnwrappedLines.push_back(SmallVector<UnwrappedLine, 16>());
-  }
-
-  FormatStyle Style;
-  FileID ID;
-  SourceManager &SourceMgr;
-  WhitespaceManager Whitespaces;
-  SmallVector<CharSourceRange, 8> Ranges;
-  SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
-
-  encoding::Encoding Encoding;
   bool BinPackInconclusiveFunctions;
+  bool *IncompleteFormat;
+};
+
+// This class cleans up erroneous/redundant code around the given ranges in
+// the file.
+class Cleaner : public TokenAnalyzer {
+public:
+  Cleaner(const Environment &Env, const FormatStyle &Style)
+      : TokenAnalyzer(Env, Style),
+        DeletedTokens(FormatTokenLess(Env.getSourceManager())) {}
+
+  // FIXME: eliminate unused parameters.
+  tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &Result) override {
+    // FIXME: in the current implementation the granularity of affected range
+    // is an annotated line. However, this is not sufficient. Furthermore,
+    // redundant code introduced by replacements does not necessarily
+    // intersect with ranges of replacements that result in the redundancy.
+    // To determine if some redundant code is actually introduced by
+    // replacements (e.g. deletions), we need to come up with a more
+    // sophisticated way of computing affected ranges.
+    AffectedRangeMgr.computeAffectedLines(AnnotatedLines.begin(),
+                                          AnnotatedLines.end());
+
+    checkEmptyNamespace(AnnotatedLines);
+
+    for (auto &Line : AnnotatedLines) {
+      if (Line->Affected) {
+        cleanupRight(Line->First, tok::comma, tok::comma);
+        cleanupRight(Line->First, TT_CtorInitializerColon, tok::comma);
+        cleanupLeft(Line->First, TT_CtorInitializerComma, tok::l_brace);
+        cleanupLeft(Line->First, TT_CtorInitializerColon, tok::l_brace);
+      }
+    }
+
+    return generateFixes();
+  }
+
+private:
+  bool containsOnlyComments(const AnnotatedLine &Line) {
+    for (FormatToken *Tok = Line.First; Tok != nullptr; Tok = Tok->Next) {
+      if (Tok->isNot(tok::comment))
+        return false;
+    }
+    return true;
+  }
+
+  // Iterate through all lines and remove any empty (nested) namespaces.
+  void checkEmptyNamespace(SmallVectorImpl<AnnotatedLine *> &AnnotatedLines) {
+    for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
+      auto &Line = *AnnotatedLines[i];
+      if (Line.startsWith(tok::kw_namespace) ||
+          Line.startsWith(tok::kw_inline, tok::kw_namespace)) {
+        checkEmptyNamespace(AnnotatedLines, i, i);
+      }
+    }
+
+    for (auto Line : DeletedLines) {
+      FormatToken *Tok = AnnotatedLines[Line]->First;
+      while (Tok) {
+        deleteToken(Tok);
+        Tok = Tok->Next;
+      }
+    }
+  }
+
+  // The function checks if the namespace, which starts from \p CurrentLine, and
+  // its nested namespaces are empty and deletes them if they are empty. It also
+  // sets \p NewLine to the last line checked.
+  // Returns true if the current namespace is empty.
+  bool checkEmptyNamespace(SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+                           unsigned CurrentLine, unsigned &NewLine) {
+    unsigned InitLine = CurrentLine, End = AnnotatedLines.size();
+    if (Style.BraceWrapping.AfterNamespace) {
+      // If the left brace is in a new line, we should consume it first so that
+      // it does not make the namespace non-empty.
+      // FIXME: error handling if there is no left brace.
+      if (!AnnotatedLines[++CurrentLine]->startsWith(tok::l_brace)) {
+        NewLine = CurrentLine;
+        return false;
+      }
+    } else if (!AnnotatedLines[CurrentLine]->endsWith(tok::l_brace)) {
+      return false;
+    }
+    while (++CurrentLine < End) {
+      if (AnnotatedLines[CurrentLine]->startsWith(tok::r_brace))
+        break;
+
+      if (AnnotatedLines[CurrentLine]->startsWith(tok::kw_namespace) ||
+          AnnotatedLines[CurrentLine]->startsWith(tok::kw_inline,
+                                                  tok::kw_namespace)) {
+        if (!checkEmptyNamespace(AnnotatedLines, CurrentLine, NewLine))
+          return false;
+        CurrentLine = NewLine;
+        continue;
+      }
+
+      if (containsOnlyComments(*AnnotatedLines[CurrentLine]))
+        continue;
+
+      // If there is anything other than comments or nested namespaces in the
+      // current namespace, the namespace cannot be empty.
+      NewLine = CurrentLine;
+      return false;
+    }
+
+    NewLine = CurrentLine;
+    if (CurrentLine >= End)
+      return false;
+
+    // Check if the empty namespace is actually affected by changed ranges.
+    if (!AffectedRangeMgr.affectsCharSourceRange(CharSourceRange::getCharRange(
+            AnnotatedLines[InitLine]->First->Tok.getLocation(),
+            AnnotatedLines[CurrentLine]->Last->Tok.getEndLoc())))
+      return false;
+
+    for (unsigned i = InitLine; i <= CurrentLine; ++i) {
+      DeletedLines.insert(i);
+    }
+
+    return true;
+  }
+
+  // Checks pairs {start, start->next},..., {end->previous, end} and deletes one
+  // of the tokens in the pair if the left token has \p LK token kind and the
+  // right token has \p RK token kind. If \p DeleteLeft is true, the left token
+  // is deleted on match; otherwise, the right token is deleted.
+  template <typename LeftKind, typename RightKind>
+  void cleanupPair(FormatToken *Start, LeftKind LK, RightKind RK,
+                   bool DeleteLeft) {
+    auto NextNotDeleted = [this](const FormatToken &Tok) -> FormatToken * {
+      for (auto *Res = Tok.Next; Res; Res = Res->Next)
+        if (!Res->is(tok::comment) &&
+            DeletedTokens.find(Res) == DeletedTokens.end())
+          return Res;
+      return nullptr;
+    };
+    for (auto *Left = Start; Left;) {
+      auto *Right = NextNotDeleted(*Left);
+      if (!Right)
+        break;
+      if (Left->is(LK) && Right->is(RK)) {
+        deleteToken(DeleteLeft ? Left : Right);
+        // If the right token is deleted, we should keep the left token
+        // unchanged and pair it with the new right token.
+        if (!DeleteLeft)
+          continue;
+      }
+      Left = Right;
+    }
+  }
+
+  template <typename LeftKind, typename RightKind>
+  void cleanupLeft(FormatToken *Start, LeftKind LK, RightKind RK) {
+    cleanupPair(Start, LK, RK, /*DeleteLeft=*/true);
+  }
+
+  template <typename LeftKind, typename RightKind>
+  void cleanupRight(FormatToken *Start, LeftKind LK, RightKind RK) {
+    cleanupPair(Start, LK, RK, /*DeleteLeft=*/false);
+  }
+
+  // Delete the given token.
+  inline void deleteToken(FormatToken *Tok) {
+    if (Tok)
+      DeletedTokens.insert(Tok);
+  }
+
+  tooling::Replacements generateFixes() {
+    tooling::Replacements Fixes;
+    std::vector<FormatToken *> Tokens;
+    std::copy(DeletedTokens.begin(), DeletedTokens.end(),
+              std::back_inserter(Tokens));
+
+    // Merge multiple contiguous token deletions into one big deletion so that
+    // the number of replacements can be reduced. This makes computing affected
+    // ranges more efficient when we run reformat on the changed code.
+    unsigned Idx = 0;
+    while (Idx < Tokens.size()) {
+      unsigned St = Idx, End = Idx;
+      while ((End + 1) < Tokens.size() &&
+             Tokens[End]->Next == Tokens[End + 1]) {
+        End++;
+      }
+      auto SR = CharSourceRange::getCharRange(Tokens[St]->Tok.getLocation(),
+                                              Tokens[End]->Tok.getEndLoc());
+      auto Err =
+          Fixes.add(tooling::Replacement(Env.getSourceManager(), SR, ""));
+      // FIXME: better error handling. For now just print error message and skip
+      // for the release version.
+      if (Err)
+        llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+      assert(!Err && "Fixes must not conflict!");
+      Idx = End + 1;
+    }
+
+    return Fixes;
+  }
+
+  // Class for less-than inequality comparison for the set `DeletedTokens`.
+  // We store tokens in the order they appear in the translation unit so that
+  // we do not need to sort them in `generateFixes()`.
+  struct FormatTokenLess {
+    FormatTokenLess(const SourceManager &SM) : SM(SM) {}
+
+    bool operator()(const FormatToken *LHS, const FormatToken *RHS) const {
+      return SM.isBeforeInTranslationUnit(LHS->Tok.getLocation(),
+                                          RHS->Tok.getLocation());
+    }
+    const SourceManager &SM;
+  };
+
+  // Tokens to be deleted.
+  std::set<FormatToken *, FormatTokenLess> DeletedTokens;
+  // The line numbers of lines to be deleted.
+  std::set<unsigned> DeletedLines;
 };
 
 struct IncludeDirective {
@@ -1743,74 +1223,166 @@
   return false;
 }
 
-// Sorts a block of includes given by 'Includes' alphabetically adding the
-// necessary replacement to 'Replaces'. 'Includes' must be in strict source
-// order.
-static void sortIncludes(const FormatStyle &Style,
-                         const SmallVectorImpl<IncludeDirective> &Includes,
-                         ArrayRef<tooling::Range> Ranges, StringRef FileName,
-                         tooling::Replacements &Replaces, unsigned *Cursor) {
-  if (!affectsRange(Ranges, Includes.front().Offset,
-                    Includes.back().Offset + Includes.back().Text.size()))
+// Returns a pair (Index, OffsetToEOL) describing the position of the cursor
+// before sorting/deduplicating. Index is the index of the include under the
+// cursor in the original set of includes. If this include has duplicates, it is
+// the index of the first of the duplicates as the others are going to be
+// removed. OffsetToEOL describes the cursor's position relative to the end of
+// its current line.
+// If `Cursor` is not on any #include, `Index` will be UINT_MAX.
+static std::pair<unsigned, unsigned>
+FindCursorIndex(const SmallVectorImpl<IncludeDirective> &Includes,
+                const SmallVectorImpl<unsigned> &Indices, unsigned Cursor) {
+  unsigned CursorIndex = UINT_MAX;
+  unsigned OffsetToEOL = 0;
+  for (int i = 0, e = Includes.size(); i != e; ++i) {
+    unsigned Start = Includes[Indices[i]].Offset;
+    unsigned End = Start + Includes[Indices[i]].Text.size();
+    if (!(Cursor >= Start && Cursor < End))
+      continue;
+    CursorIndex = Indices[i];
+    OffsetToEOL = End - Cursor;
+    // Put the cursor on the only remaining #include among the duplicate
+    // #includes.
+    while (--i >= 0 && Includes[CursorIndex].Text == Includes[Indices[i]].Text)
+      CursorIndex = i;
+    break;
+  }
+  return std::make_pair(CursorIndex, OffsetToEOL);
+}
+
+// Sorts and deduplicates a block of includes given by 'Includes' alphabetically
+// adding the necessary replacement to 'Replaces'. 'Includes' must be in strict
+// source order.
+// #include directives with the same text will be deduplicated, and only the
+// first #include in the duplicate #includes remains. If the `Cursor` is
+// provided and put on a deleted #include, it will be moved to the remaining
+// #include in the duplicate #includes.
+static void sortCppIncludes(const FormatStyle &Style,
+                            const SmallVectorImpl<IncludeDirective> &Includes,
+                            ArrayRef<tooling::Range> Ranges, StringRef FileName,
+                            tooling::Replacements &Replaces, unsigned *Cursor) {
+  unsigned IncludesBeginOffset = Includes.front().Offset;
+  unsigned IncludesBlockSize = Includes.back().Offset +
+                               Includes.back().Text.size() -
+                               IncludesBeginOffset;
+  if (!affectsRange(Ranges, IncludesBeginOffset, IncludesBlockSize))
     return;
   SmallVector<unsigned, 16> Indices;
   for (unsigned i = 0, e = Includes.size(); i != e; ++i)
     Indices.push_back(i);
-  std::sort(Indices.begin(), Indices.end(), [&](unsigned LHSI, unsigned RHSI) {
-    return std::tie(Includes[LHSI].Category, Includes[LHSI].Filename) <
-           std::tie(Includes[RHSI].Category, Includes[RHSI].Filename);
-  });
+  std::stable_sort(
+      Indices.begin(), Indices.end(), [&](unsigned LHSI, unsigned RHSI) {
+        return std::tie(Includes[LHSI].Category, Includes[LHSI].Filename) <
+               std::tie(Includes[RHSI].Category, Includes[RHSI].Filename);
+      });
+  // The index of the include on which the cursor will be put after
+  // sorting/deduplicating.
+  unsigned CursorIndex;
+  // The offset from cursor to the end of line.
+  unsigned CursorToEOLOffset;
+  if (Cursor)
+    std::tie(CursorIndex, CursorToEOLOffset) =
+        FindCursorIndex(Includes, Indices, *Cursor);
+
+  // Deduplicate #includes.
+  Indices.erase(std::unique(Indices.begin(), Indices.end(),
+                            [&](unsigned LHSI, unsigned RHSI) {
+                              return Includes[LHSI].Text == Includes[RHSI].Text;
+                            }),
+                Indices.end());
 
   // If the #includes are out of order, we generate a single replacement fixing
   // the entire block. Otherwise, no replacement is generated.
-  bool OutOfOrder = false;
-  for (unsigned i = 1, e = Indices.size(); i != e; ++i) {
-    if (Indices[i] != i) {
-      OutOfOrder = true;
-      break;
-    }
-  }
-  if (!OutOfOrder)
+  if (Indices.size() == Includes.size() &&
+      std::is_sorted(Indices.begin(), Indices.end()))
     return;
 
   std::string result;
-  bool CursorMoved = false;
   for (unsigned Index : Indices) {
     if (!result.empty())
       result += "\n";
     result += Includes[Index].Text;
-
-    if (Cursor && !CursorMoved) {
-      unsigned Start = Includes[Index].Offset;
-      unsigned End = Start + Includes[Index].Text.size();
-      if (*Cursor >= Start && *Cursor < End) {
-        *Cursor = Includes.front().Offset + result.size() + *Cursor - End;
-        CursorMoved = true;
-      }
-    }
+    if (Cursor && CursorIndex == Index)
+      *Cursor = IncludesBeginOffset + result.size() - CursorToEOLOffset;
   }
 
-  // Sorting #includes shouldn't change their total number of characters.
-  // This would otherwise mess up 'Ranges'.
-  assert(result.size() ==
-         Includes.back().Offset + Includes.back().Text.size() -
-             Includes.front().Offset);
-
-  Replaces.insert(tooling::Replacement(FileName, Includes.front().Offset,
-                                       result.size(), result));
+  auto Err = Replaces.add(tooling::Replacement(
+      FileName, Includes.front().Offset, IncludesBlockSize, result));
+  // FIXME: better error handling. For now, just skip the replacement for the
+  // release version.
+  if (Err)
+    llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+  assert(!Err);
 }
 
-tooling::Replacements sortIncludes(const FormatStyle &Style, StringRef Code,
-                                   ArrayRef<tooling::Range> Ranges,
-                                   StringRef FileName, unsigned *Cursor) {
-  tooling::Replacements Replaces;
-  if (!Style.SortIncludes)
-    return Replaces;
+namespace {
 
+// This class manages priorities of #include categories and calculates
+// priorities for headers.
+class IncludeCategoryManager {
+public:
+  IncludeCategoryManager(const FormatStyle &Style, StringRef FileName)
+      : Style(Style), FileName(FileName) {
+    FileStem = llvm::sys::path::stem(FileName);
+    for (const auto &Category : Style.IncludeCategories)
+      CategoryRegexs.emplace_back(Category.Regex);
+    IsMainFile = FileName.endswith(".c") || FileName.endswith(".cc") ||
+                 FileName.endswith(".cpp") || FileName.endswith(".c++") ||
+                 FileName.endswith(".cxx") || FileName.endswith(".m") ||
+                 FileName.endswith(".mm");
+  }
+
+  // Returns the priority of the category which \p IncludeName belongs to.
+  // If \p CheckMainHeader is true and \p IncludeName is a main header, returns
+  // 0. Otherwise, returns the priority of the matching category or INT_MAX.
+  int getIncludePriority(StringRef IncludeName, bool CheckMainHeader) {
+    int Ret = INT_MAX;
+    for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)
+      if (CategoryRegexs[i].match(IncludeName)) {
+        Ret = Style.IncludeCategories[i].Priority;
+        break;
+      }
+    if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))
+      Ret = 0;
+    return Ret;
+  }
+
+private:
+  bool isMainHeader(StringRef IncludeName) const {
+    if (!IncludeName.startswith("\""))
+      return false;
+    StringRef HeaderStem =
+        llvm::sys::path::stem(IncludeName.drop_front(1).drop_back(1));
+    if (FileStem.startswith(HeaderStem)) {
+      llvm::Regex MainIncludeRegex(
+          (HeaderStem + Style.IncludeIsMainRegex).str());
+      if (MainIncludeRegex.match(FileStem))
+        return true;
+    }
+    return false;
+  }
+
+  const FormatStyle &Style;
+  bool IsMainFile;
+  StringRef FileName;
+  StringRef FileStem;
+  SmallVector<llvm::Regex, 4> CategoryRegexs;
+};
+
+const char IncludeRegexPattern[] =
+    R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))";
+
+} // anonymous namespace
+
+tooling::Replacements sortCppIncludes(const FormatStyle &Style, StringRef Code,
+                                      ArrayRef<tooling::Range> Ranges,
+                                      StringRef FileName,
+                                      tooling::Replacements &Replaces,
+                                      unsigned *Cursor) {
   unsigned Prev = 0;
   unsigned SearchFrom = 0;
-  llvm::Regex IncludeRegex(
-      R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))");
+  llvm::Regex IncludeRegex(IncludeRegexPattern);
   SmallVector<StringRef, 4> Matches;
   SmallVector<IncludeDirective, 16> IncludesInBlock;
 
@@ -1821,19 +1393,9 @@
   //
   // FIXME: Do some sanity checking, e.g. edit distance of the base name, to fix
   // cases where the first #include is unlikely to be the main header.
-  bool IsSource = FileName.endswith(".c") || FileName.endswith(".cc") ||
-                  FileName.endswith(".cpp") || FileName.endswith(".c++") ||
-                  FileName.endswith(".cxx") || FileName.endswith(".m") ||
-                  FileName.endswith(".mm");
-  StringRef FileStem = llvm::sys::path::stem(FileName);
+  IncludeCategoryManager Categories(Style, FileName);
   bool FirstIncludeBlock = true;
   bool MainIncludeFound = false;
-
-  // Create pre-compiled regular expressions for the #include categories.
-  SmallVector<llvm::Regex, 4> CategoryRegexs;
-  for (const auto &Category : Style.IncludeCategories)
-    CategoryRegexs.emplace_back(Category.Regex);
-
   bool FormattingOff = false;
 
   for (;;) {
@@ -1850,26 +1412,15 @@
     if (!FormattingOff && !Line.endswith("\\")) {
       if (IncludeRegex.match(Line, &Matches)) {
         StringRef IncludeName = Matches[2];
-        int Category = INT_MAX;
-        for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i) {
-          if (CategoryRegexs[i].match(IncludeName)) {
-            Category = Style.IncludeCategories[i].Priority;
-            break;
-          }
-        }
-        if (IsSource && !MainIncludeFound && Category > 0 &&
-            FirstIncludeBlock && IncludeName.startswith("\"")) {
-          StringRef HeaderStem =
-              llvm::sys::path::stem(IncludeName.drop_front(1).drop_back(1));
-          if (FileStem.startswith(HeaderStem)) {
-            Category = 0;
-            MainIncludeFound = true;
-          }
-        }
+        int Category = Categories.getIncludePriority(
+            IncludeName,
+            /*CheckMainHeader=*/!MainIncludeFound && FirstIncludeBlock);
+        if (Category == 0)
+          MainIncludeFound = true;
         IncludesInBlock.push_back({IncludeName, Line, Prev, Category});
       } else if (!IncludesInBlock.empty()) {
-        sortIncludes(Style, IncludesInBlock, Ranges, FileName, Replaces,
-                     Cursor);
+        sortCppIncludes(Style, IncludesInBlock, Ranges, FileName, Replaces,
+                        Cursor);
         IncludesInBlock.clear();
         FirstIncludeBlock = false;
       }
@@ -1880,47 +1431,286 @@
     SearchFrom = Pos + 1;
   }
   if (!IncludesInBlock.empty())
-    sortIncludes(Style, IncludesInBlock, Ranges, FileName, Replaces, Cursor);
+    sortCppIncludes(Style, IncludesInBlock, Ranges, FileName, Replaces, Cursor);
   return Replaces;
 }
 
-tooling::Replacements reformat(const FormatStyle &Style,
-                               SourceManager &SourceMgr, FileID ID,
-                               ArrayRef<CharSourceRange> Ranges,
+tooling::Replacements sortIncludes(const FormatStyle &Style, StringRef Code,
+                                   ArrayRef<tooling::Range> Ranges,
+                                   StringRef FileName, unsigned *Cursor) {
+  tooling::Replacements Replaces;
+  if (!Style.SortIncludes)
+    return Replaces;
+  if (Style.Language == FormatStyle::LanguageKind::LK_JavaScript)
+    return sortJavaScriptImports(Style, Code, Ranges, FileName);
+  sortCppIncludes(Style, Code, Ranges, FileName, Replaces, Cursor);
+  return Replaces;
+}
+
+template <typename T>
+static llvm::Expected<tooling::Replacements>
+processReplacements(T ProcessFunc, StringRef Code,
+                    const tooling::Replacements &Replaces,
+                    const FormatStyle &Style) {
+  if (Replaces.empty())
+    return tooling::Replacements();
+
+  auto NewCode = applyAllReplacements(Code, Replaces);
+  if (!NewCode)
+    return NewCode.takeError();
+  std::vector<tooling::Range> ChangedRanges = Replaces.getAffectedRanges();
+  StringRef FileName = Replaces.begin()->getFilePath();
+
+  tooling::Replacements FormatReplaces =
+      ProcessFunc(Style, *NewCode, ChangedRanges, FileName);
+
+  return Replaces.merge(FormatReplaces);
+}
+
+llvm::Expected<tooling::Replacements>
+formatReplacements(StringRef Code, const tooling::Replacements &Replaces,
+                   const FormatStyle &Style) {
+  // We need to use lambda function here since there are two versions of
+  // `sortIncludes`.
+  auto SortIncludes = [](const FormatStyle &Style, StringRef Code,
+                         std::vector<tooling::Range> Ranges,
+                         StringRef FileName) -> tooling::Replacements {
+    return sortIncludes(Style, Code, Ranges, FileName);
+  };
+  auto SortedReplaces =
+      processReplacements(SortIncludes, Code, Replaces, Style);
+  if (!SortedReplaces)
+    return SortedReplaces.takeError();
+
+  // We need to use lambda function here since there are two versions of
+  // `reformat`.
+  auto Reformat = [](const FormatStyle &Style, StringRef Code,
+                     std::vector<tooling::Range> Ranges,
+                     StringRef FileName) -> tooling::Replacements {
+    return reformat(Style, Code, Ranges, FileName);
+  };
+  return processReplacements(Reformat, Code, *SortedReplaces, Style);
+}
+
+namespace {
+
+inline bool isHeaderInsertion(const tooling::Replacement &Replace) {
+  return Replace.getOffset() == UINT_MAX &&
+         llvm::Regex(IncludeRegexPattern).match(Replace.getReplacementText());
+}
+
+void skipComments(Lexer &Lex, Token &Tok) {
+  while (Tok.is(tok::comment))
+    if (Lex.LexFromRawLexer(Tok))
+      return;
+}
+
+// Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is,
+// \p Tok will be the token after this directive; otherwise, it can be any token
+// after the given \p Tok (including \p Tok).
+bool checkAndConsumeDirectiveWithName(Lexer &Lex, StringRef Name, Token &Tok) {
+  bool Matched = Tok.is(tok::hash) && !Lex.LexFromRawLexer(Tok) &&
+                 Tok.is(tok::raw_identifier) &&
+                 Tok.getRawIdentifier() == Name && !Lex.LexFromRawLexer(Tok) &&
+                 Tok.is(tok::raw_identifier);
+  if (Matched)
+    Lex.LexFromRawLexer(Tok);
+  return Matched;
+}
+
+unsigned getOffsetAfterHeaderGuardsAndComments(StringRef FileName,
+                                               StringRef Code,
+                                               const FormatStyle &Style) {
+  std::unique_ptr<Environment> Env =
+      Environment::CreateVirtualEnvironment(Code, FileName, /*Ranges=*/{});
+  const SourceManager &SourceMgr = Env->getSourceManager();
+  Lexer Lex(Env->getFileID(), SourceMgr.getBuffer(Env->getFileID()), SourceMgr,
+            getFormattingLangOpts(Style));
+  Token Tok;
+  // Get the first token.
+  Lex.LexFromRawLexer(Tok);
+  skipComments(Lex, Tok);
+  unsigned AfterComments = SourceMgr.getFileOffset(Tok.getLocation());
+  if (checkAndConsumeDirectiveWithName(Lex, "ifndef", Tok)) {
+    skipComments(Lex, Tok);
+    if (checkAndConsumeDirectiveWithName(Lex, "define", Tok))
+      return SourceMgr.getFileOffset(Tok.getLocation());
+  }
+  return AfterComments;
+}
+
+// FIXME: we also need to insert a '\n' at the end of the code if we have an
+// insertion with offset Code.size(), and there is no '\n' at the end of the
+// code.
+// FIXME: do not insert headers into conditional #include blocks, e.g. #includes
+// surrounded by compile condition "#if...".
+// FIXME: insert empty lines between newly created blocks.
+tooling::Replacements
+fixCppIncludeInsertions(StringRef Code, const tooling::Replacements &Replaces,
+                        const FormatStyle &Style) {
+  if (Style.Language != FormatStyle::LanguageKind::LK_Cpp)
+    return Replaces;
+
+  tooling::Replacements HeaderInsertions;
+  tooling::Replacements Result;
+  for (const auto &R : Replaces) {
+    if (isHeaderInsertion(R)) {
+      // Replacements from \p Replaces must be conflict-free already, so we can
+      // simply consume the error.
+      llvm::consumeError(HeaderInsertions.add(R));
+    } else if (R.getOffset() == UINT_MAX) {
+      llvm::errs() << "Insertions other than header #include insertion are "
+                      "not supported! "
+                   << R.getReplacementText() << "\n";
+    } else {
+      llvm::consumeError(Result.add(R));
+    }
+  }
+  if (HeaderInsertions.empty())
+    return Replaces;
+
+  llvm::Regex IncludeRegex(IncludeRegexPattern);
+  llvm::Regex DefineRegex(R"(^[\t\ ]*#[\t\ ]*define[\t\ ]*[^\\]*$)");
+  SmallVector<StringRef, 4> Matches;
+
+  StringRef FileName = Replaces.begin()->getFilePath();
+  IncludeCategoryManager Categories(Style, FileName);
+
+  // Record the offset of the end of the last include in each category.
+  std::map<int, int> CategoryEndOffsets;
+  // All possible priorities.
+  // Add 0 for main header and INT_MAX for headers that are not in any category.
+  std::set<int> Priorities = {0, INT_MAX};
+  for (const auto &Category : Style.IncludeCategories)
+    Priorities.insert(Category.Priority);
+  int FirstIncludeOffset = -1;
+  // All new headers should be inserted after this offset.
+  unsigned MinInsertOffset =
+      getOffsetAfterHeaderGuardsAndComments(FileName, Code, Style);
+  StringRef TrimmedCode = Code.drop_front(MinInsertOffset);
+  SmallVector<StringRef, 32> Lines;
+  TrimmedCode.split(Lines, '\n');
+  unsigned Offset = MinInsertOffset;
+  unsigned NextLineOffset;
+  std::set<StringRef> ExistingIncludes;
+  for (auto Line : Lines) {
+    NextLineOffset = std::min(Code.size(), Offset + Line.size() + 1);
+    if (IncludeRegex.match(Line, &Matches)) {
+      StringRef IncludeName = Matches[2];
+      ExistingIncludes.insert(IncludeName);
+      int Category = Categories.getIncludePriority(
+          IncludeName, /*CheckMainHeader=*/FirstIncludeOffset < 0);
+      CategoryEndOffsets[Category] = NextLineOffset;
+      if (FirstIncludeOffset < 0)
+        FirstIncludeOffset = Offset;
+    }
+    Offset = NextLineOffset;
+  }
+
+  // Populate CategoryEndOffsets:
+  // - Ensure that CategoryEndOffsets[Highest] is always populated.
+  // - If CategoryEndOffsets[Priority] isn't set, use the next higher value that
+  //   is set, up to CategoryEndOffsets[Highest].
+  auto Highest = Priorities.begin();
+  if (CategoryEndOffsets.find(*Highest) == CategoryEndOffsets.end()) {
+    if (FirstIncludeOffset >= 0)
+      CategoryEndOffsets[*Highest] = FirstIncludeOffset;
+    else
+      CategoryEndOffsets[*Highest] = MinInsertOffset;
+  }
+  // By this point, CategoryEndOffsets[Highest] is always set appropriately:
+  //  - to an appropriate location before/after existing #includes, or
+  //  - to right after the header guard, or
+  //  - to the beginning of the file.
+  for (auto I = ++Priorities.begin(), E = Priorities.end(); I != E; ++I)
+    if (CategoryEndOffsets.find(*I) == CategoryEndOffsets.end())
+      CategoryEndOffsets[*I] = CategoryEndOffsets[*std::prev(I)];
+
+  for (const auto &R : HeaderInsertions) {
+    auto IncludeDirective = R.getReplacementText();
+    bool Matched = IncludeRegex.match(IncludeDirective, &Matches);
+    assert(Matched && "Header insertion replacement must have replacement text "
+                      "'#include ...'");
+    (void)Matched;
+    auto IncludeName = Matches[2];
+    if (ExistingIncludes.find(IncludeName) != ExistingIncludes.end()) {
+      DEBUG(llvm::dbgs() << "Skip adding existing include : " << IncludeName
+                         << "\n");
+      continue;
+    }
+    int Category =
+        Categories.getIncludePriority(IncludeName, /*CheckMainHeader=*/true);
+    Offset = CategoryEndOffsets[Category];
+    std::string NewInclude = !IncludeDirective.endswith("\n")
+                                 ? (IncludeDirective + "\n").str()
+                                 : IncludeDirective.str();
+    auto NewReplace = tooling::Replacement(FileName, Offset, 0, NewInclude);
+    auto Err = Result.add(NewReplace);
+    if (Err) {
+      llvm::consumeError(std::move(Err));
+      Result = Result.merge(tooling::Replacements(NewReplace));
+    }
+  }
+  return Result;
+}
+
+} // anonymous namespace
+
+llvm::Expected<tooling::Replacements>
+cleanupAroundReplacements(StringRef Code, const tooling::Replacements &Replaces,
+                          const FormatStyle &Style) {
+  // We need to use lambda function here since there are two versions of
+  // `cleanup`.
+  auto Cleanup = [](const FormatStyle &Style, StringRef Code,
+                    std::vector<tooling::Range> Ranges,
+                    StringRef FileName) -> tooling::Replacements {
+    return cleanup(Style, Code, Ranges, FileName);
+  };
+  // Make header insertion replacements insert new headers into correct blocks.
+  tooling::Replacements NewReplaces =
+      fixCppIncludeInsertions(Code, Replaces, Style);
+  return processReplacements(Cleanup, Code, NewReplaces, Style);
+}
+
+tooling::Replacements reformat(const FormatStyle &Style, SourceManager &SM,
+                               FileID ID, ArrayRef<CharSourceRange> Ranges,
                                bool *IncompleteFormat) {
   FormatStyle Expanded = expandPresets(Style);
   if (Expanded.DisableFormat)
     return tooling::Replacements();
-  Formatter formatter(Expanded, SourceMgr, ID, Ranges);
-  return formatter.format(IncompleteFormat);
+
+  Environment Env(SM, ID, Ranges);
+  Formatter Format(Env, Expanded, IncompleteFormat);
+  return Format.process();
 }
 
 tooling::Replacements reformat(const FormatStyle &Style, StringRef Code,
                                ArrayRef<tooling::Range> Ranges,
                                StringRef FileName, bool *IncompleteFormat) {
-  if (Style.DisableFormat)
+  FormatStyle Expanded = expandPresets(Style);
+  if (Expanded.DisableFormat)
     return tooling::Replacements();
 
-  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
-      new vfs::InMemoryFileSystem);
-  FileManager Files(FileSystemOptions(), InMemoryFileSystem);
-  DiagnosticsEngine Diagnostics(
-      IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
-      new DiagnosticOptions);
-  SourceManager SourceMgr(Diagnostics, Files);
-  InMemoryFileSystem->addFile(
-      FileName, 0, llvm::MemoryBuffer::getMemBuffer(
-                       Code, FileName, /*RequiresNullTerminator=*/false));
-  FileID ID = SourceMgr.createFileID(Files.getFile(FileName), SourceLocation(),
-                                     clang::SrcMgr::C_User);
-  SourceLocation StartOfFile = SourceMgr.getLocForStartOfFile(ID);
-  std::vector<CharSourceRange> CharRanges;
-  for (const tooling::Range &Range : Ranges) {
-    SourceLocation Start = StartOfFile.getLocWithOffset(Range.getOffset());
-    SourceLocation End = Start.getLocWithOffset(Range.getLength());
-    CharRanges.push_back(CharSourceRange::getCharRange(Start, End));
-  }
-  return reformat(Style, SourceMgr, ID, CharRanges, IncompleteFormat);
+  std::unique_ptr<Environment> Env =
+      Environment::CreateVirtualEnvironment(Code, FileName, Ranges);
+  Formatter Format(*Env, Expanded, IncompleteFormat);
+  return Format.process();
+}
+
+tooling::Replacements cleanup(const FormatStyle &Style, SourceManager &SM,
+                              FileID ID, ArrayRef<CharSourceRange> Ranges) {
+  Environment Env(SM, ID, Ranges);
+  Cleaner Clean(Env, Style);
+  return Clean.process();
+}
+
+tooling::Replacements cleanup(const FormatStyle &Style, StringRef Code,
+                              ArrayRef<tooling::Range> Ranges,
+                              StringRef FileName) {
+  std::unique_ptr<Environment> Env =
+      Environment::CreateVirtualEnvironment(Code, FileName, Ranges);
+  Cleaner Clean(*Env, Style);
+  return Clean.process();
 }
 
 LangOptions getFormattingLangOpts(const FormatStyle &Style) {
@@ -1934,7 +1724,7 @@
   LangOpts.Bool = 1;
   LangOpts.ObjC1 = 1;
   LangOpts.ObjC2 = 1;
-  LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally.
+  LangOpts.MicrosoftExt = 1;    // To get kw___try, kw___finally.
   LangOpts.DeclSpecKeyword = 1; // To get __declspec.
   return LangOpts;
 }
@@ -1964,7 +1754,10 @@
 }
 
 FormatStyle getStyle(StringRef StyleName, StringRef FileName,
-                     StringRef FallbackStyle) {
+                     StringRef FallbackStyle, vfs::FileSystem *FS) {
+  if (!FS) {
+    FS = vfs::getRealFileSystem().get();
+  }
   FormatStyle Style = getLLVMStyle();
   Style.Language = getLanguageByFileName(FileName);
   if (!getPredefinedStyle(FallbackStyle, Style.Language, &Style)) {
@@ -1995,28 +1788,34 @@
   llvm::sys::fs::make_absolute(Path);
   for (StringRef Directory = Path; !Directory.empty();
        Directory = llvm::sys::path::parent_path(Directory)) {
-    if (!llvm::sys::fs::is_directory(Directory))
+
+    auto Status = FS->status(Directory);
+    if (!Status ||
+        Status->getType() != llvm::sys::fs::file_type::directory_file) {
       continue;
+    }
+
     SmallString<128> ConfigFile(Directory);
 
     llvm::sys::path::append(ConfigFile, ".clang-format");
     DEBUG(llvm::dbgs() << "Trying " << ConfigFile << "...\n");
-    bool IsFile = false;
-    // Ignore errors from is_regular_file: we only need to know if we can read
-    // the file or not.
-    llvm::sys::fs::is_regular_file(Twine(ConfigFile), IsFile);
 
+    Status = FS->status(ConfigFile.str());
+    bool IsFile =
+        Status && (Status->getType() == llvm::sys::fs::file_type::regular_file);
     if (!IsFile) {
       // Try _clang-format too, since dotfiles are not commonly used on Windows.
       ConfigFile = Directory;
       llvm::sys::path::append(ConfigFile, "_clang-format");
       DEBUG(llvm::dbgs() << "Trying " << ConfigFile << "...\n");
-      llvm::sys::fs::is_regular_file(Twine(ConfigFile), IsFile);
+      Status = FS->status(ConfigFile.str());
+      IsFile = Status &&
+               (Status->getType() == llvm::sys::fs::file_type::regular_file);
     }
 
     if (IsFile) {
       llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
-          llvm::MemoryBuffer::getFile(ConfigFile.c_str());
+          FS->getBufferForFile(ConfigFile.str());
       if (std::error_code EC = Text.getError()) {
         llvm::errs() << EC.message() << "\n";
         break;
diff --git a/lib/Format/FormatToken.cpp b/lib/Format/FormatToken.cpp
index d6cd450..180e537 100644
--- a/lib/Format/FormatToken.cpp
+++ b/lib/Format/FormatToken.cpp
@@ -13,9 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "ContinuationIndenter.h"
 #include "FormatToken.h"
-#include "clang/Format/Format.h"
+#include "ContinuationIndenter.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 #include <climits>
@@ -53,6 +52,7 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_wchar_t:
   case tok::kw_bool:
   case tok::kw___underlying_type:
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h
index b683660..43b1625 100644
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -54,6 +54,7 @@
   TYPE(JsComputedPropertyName) \
   TYPE(JsFatArrow) \
   TYPE(JsTypeColon) \
+  TYPE(JsTypeOperator) \
   TYPE(JsTypeOptionalQuestion) \
   TYPE(LambdaArrow) \
   TYPE(LambdaLSquare) \
@@ -144,7 +145,7 @@
   /// \brief Whether the token text contains newlines (escaped or not).
   bool IsMultiline = false;
 
-  /// \brief Indicates that this is the first token.
+  /// \brief Indicates that this is the first token of the file.
   bool IsFirst = false;
 
   /// \brief Whether there must be a line break before this token.
@@ -296,6 +297,20 @@
   }
   template <typename T> bool isNot(T Kind) const { return !is(Kind); }
 
+  /// \c true if this token starts a sequence with the given tokens in order,
+  /// following the ``Next`` pointers, ignoring comments.
+  template <typename A, typename... Ts>
+  bool startsSequence(A K1, Ts... Tokens) const {
+    return startsSequenceInternal(K1, Tokens...);
+  }
+
+  /// \c true if this token ends a sequence with the given tokens in order,
+  /// following the ``Previous`` pointers, ignoring comments.
+  template <typename A, typename... Ts>
+  bool endsSequence(A K1, Ts... Tokens) const {
+    return endsSequenceInternal(K1, Tokens...);
+  }
+
   bool isStringLiteral() const { return tok::isStringLiteral(Tok.getKind()); }
 
   bool isObjCAtKeyword(tok::ObjCKeywordKind Kind) const {
@@ -428,6 +443,34 @@
   // Disallow copying.
   FormatToken(const FormatToken &) = delete;
   void operator=(const FormatToken &) = delete;
+
+  template <typename A, typename... Ts>
+  bool startsSequenceInternal(A K1, Ts... Tokens) const {
+    if (is(tok::comment) && Next)
+      return Next->startsSequenceInternal(K1, Tokens...);
+    return is(K1) && Next && Next->startsSequenceInternal(Tokens...);
+  }
+
+  template <typename A>
+  bool startsSequenceInternal(A K1) const {
+    if (is(tok::comment) && Next)
+      return Next->startsSequenceInternal(K1);
+    return is(K1);
+  }
+
+  template <typename A> // Base case: exactly one kind is left to match.
+  bool endsSequenceInternal(A K1) const {
+    if (is(tok::comment) && Previous) // Comments are skipped, not matched.
+      return Previous->endsSequenceInternal(K1);
+    return is(K1);
+  }
+
+  template <typename A, typename... Ts>
+  bool endsSequenceInternal(A K1, Ts... Tokens) const {
+    if (is(tok::comment) && Previous)
+      return Previous->endsSequenceInternal(K1, Tokens...);
+    return is(K1) && Previous && Previous->endsSequenceInternal(Tokens...);
+  }
 };
 
 class ContinuationIndenter;
@@ -528,17 +571,24 @@
     kw_final = &IdentTable.get("final");
     kw_override = &IdentTable.get("override");
     kw_in = &IdentTable.get("in");
+    kw_of = &IdentTable.get("of");
     kw_CF_ENUM = &IdentTable.get("CF_ENUM");
     kw_CF_OPTIONS = &IdentTable.get("CF_OPTIONS");
     kw_NS_ENUM = &IdentTable.get("NS_ENUM");
     kw_NS_OPTIONS = &IdentTable.get("NS_OPTIONS");
 
+    kw_as = &IdentTable.get("as");
+    kw_async = &IdentTable.get("async");
+    kw_await = &IdentTable.get("await");
     kw_finally = &IdentTable.get("finally");
+    kw_from = &IdentTable.get("from");
     kw_function = &IdentTable.get("function");
     kw_import = &IdentTable.get("import");
     kw_is = &IdentTable.get("is");
     kw_let = &IdentTable.get("let");
+    kw_type = &IdentTable.get("type");
     kw_var = &IdentTable.get("var");
+    kw_yield = &IdentTable.get("yield");
 
     kw_abstract = &IdentTable.get("abstract");
     kw_assert = &IdentTable.get("assert");
@@ -571,6 +621,7 @@
   IdentifierInfo *kw_final;
   IdentifierInfo *kw_override;
   IdentifierInfo *kw_in;
+  IdentifierInfo *kw_of;
   IdentifierInfo *kw_CF_ENUM;
   IdentifierInfo *kw_CF_OPTIONS;
   IdentifierInfo *kw_NS_ENUM;
@@ -578,12 +629,18 @@
   IdentifierInfo *kw___except;
 
   // JavaScript keywords.
+  IdentifierInfo *kw_as;
+  IdentifierInfo *kw_async;
+  IdentifierInfo *kw_await;
   IdentifierInfo *kw_finally;
+  IdentifierInfo *kw_from;
   IdentifierInfo *kw_function;
   IdentifierInfo *kw_import;
   IdentifierInfo *kw_is;
   IdentifierInfo *kw_let;
+  IdentifierInfo *kw_type;
   IdentifierInfo *kw_var;
+  IdentifierInfo *kw_yield;
 
   // Java keywords.
   IdentifierInfo *kw_abstract;
diff --git a/lib/Format/FormatTokenLexer.cpp b/lib/Format/FormatTokenLexer.cpp
new file mode 100644
index 0000000..9778f84
--- /dev/null
+++ b/lib/Format/FormatTokenLexer.cpp
@@ -0,0 +1,597 @@
+//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements FormatTokenLexer, which tokenizes a source file
+/// into a FormatToken stream suitable for ClangFormat.
+///
+//===----------------------------------------------------------------------===//
+
+#include "FormatTokenLexer.h"
+#include "FormatToken.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Regex.h"
+
+namespace clang {
+namespace format {
+
+FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
+                                   const FormatStyle &Style,
+                                   encoding::Encoding Encoding)
+    : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false),
+      LessStashed(false), Column(0), TrailingWhitespace(0),
+      SourceMgr(SourceMgr), ID(ID), Style(Style),
+      IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable),
+      Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false),
+      MacroBlockBeginRegex(Style.MacroBlockBegin),
+      MacroBlockEndRegex(Style.MacroBlockEnd) {
+  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
+                      getFormattingLangOpts(Style)));
+  Lex->SetKeepWhitespaceMode(true);
+
+  for (const std::string &ForEachMacro : Style.ForEachMacros)
+    ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
+  std::sort(ForEachMacros.begin(), ForEachMacros.end());
+}
+
+ArrayRef<FormatToken *> FormatTokenLexer::lex() {
+  assert(Tokens.empty());
+  assert(FirstInLineIndex == 0);
+  do {
+    Tokens.push_back(getNextToken());
+    if (Style.Language == FormatStyle::LK_JavaScript) {
+      tryParseJSRegexLiteral();
+      tryParseTemplateString();
+    }
+    tryMergePreviousTokens();
+    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
+      FirstInLineIndex = Tokens.size() - 1;
+  } while (Tokens.back()->Tok.isNot(tok::eof));
+  return Tokens;
+}
+
+void FormatTokenLexer::tryMergePreviousTokens() {
+  if (tryMerge_TMacro())
+    return;
+  if (tryMergeConflictMarkers())
+    return;
+  if (tryMergeLessLess())
+    return;
+
+  if (Style.Language == FormatStyle::LK_JavaScript) {
+    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
+    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
+                                                   tok::equal};
+    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
+                                                  tok::greaterequal};
+    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
+    // FIXME: Investigate what token type gives the correct operator priority.
+    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
+      return;
+    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
+      return;
+    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
+      return;
+    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
+      return;
+  }
+}
+
+bool FormatTokenLexer::tryMergeLessLess() {
+  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
+  if (Tokens.size() < 3)
+    return false;
+
+  bool FourthTokenIsLess = false;
+  if (Tokens.size() > 3)
+    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
+
+  auto First = Tokens.end() - 3;
+  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
+      First[0]->isNot(tok::less) || FourthTokenIsLess)
+    return false;
+
+  // Only merge if there currently is no whitespace between the two "<".
+  if (First[1]->WhitespaceRange.getBegin() !=
+      First[1]->WhitespaceRange.getEnd())
+    return false;
+
+  First[0]->Tok.setKind(tok::lessless);
+  First[0]->TokenText = "<<";
+  First[0]->ColumnWidth += 1;
+  Tokens.erase(Tokens.end() - 2);
+  return true;
+}
+
+bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
+                                      TokenType NewType) {
+  if (Tokens.size() < Kinds.size())
+    return false;
+
+  SmallVectorImpl<FormatToken *>::const_iterator First =
+      Tokens.end() - Kinds.size();
+  if (!First[0]->is(Kinds[0]))
+    return false;
+  unsigned AddLength = 0;
+  for (unsigned i = 1; i < Kinds.size(); ++i) {
+    if (!First[i]->is(Kinds[i]) ||
+        First[i]->WhitespaceRange.getBegin() !=
+            First[i]->WhitespaceRange.getEnd())
+      return false;
+    AddLength += First[i]->TokenText.size();
+  }
+  Tokens.resize(Tokens.size() - Kinds.size() + 1);
+  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
+                                  First[0]->TokenText.size() + AddLength);
+  First[0]->ColumnWidth += AddLength;
+  First[0]->Type = NewType;
+  return true;
+}
+
+// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
+bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
+  // NB: This is not entirely correct, as an r_paren can introduce an operand
+  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
+  // corner case to not matter in practice, though.
+  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
+                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
+                      tok::colon, tok::question, tok::tilde) ||
+         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
+                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
+                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
+         Tok->isBinaryOperator();
+}
+
+bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
+  if (!Prev)
+    return true;
+
+  // Regex literals can only follow after prefix unary operators, not after
+  // postfix unary operators. If the '++' is followed by a non-operand
+  // introducing token, the slash here is the operand and not the start of a
+  // regex.
+  if (Prev->isOneOf(tok::plusplus, tok::minusminus))
+    return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
+
+  // The previous token must introduce an operand location where regex
+  // literals can occur.
+  if (!precedesOperand(Prev))
+    return false;
+
+  return true;
+}
+
+// Tries to parse a JavaScript Regex literal starting at the current token,
+// if that begins with a slash and is in a location where JavaScript allows
+// regex literals. Changes the current token to a regex literal and updates
+// its text; without a closing '/', the literal runs to the end of the buffer.
+void FormatTokenLexer::tryParseJSRegexLiteral() {
+  FormatToken *RegexToken = Tokens.back();
+  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
+    return;
+
+  FormatToken *Prev = nullptr;
+  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
+    // NB: Because previous pointers are not initialized yet, this cannot use
+    // Token.getPreviousNonComment.
+    if ((*I)->isNot(tok::comment)) {
+      Prev = *I;
+      break;
+    }
+  }
+
+  if (!canPrecedeRegexLiteral(Prev))
+    return;
+
+  // 'Manually' lex ahead in the current file buffer.
+  const char *Offset = Lex->getBufferLocation();
+  const char *RegexBegin = Offset - RegexToken->TokenText.size();
+  StringRef Buffer = Lex->getBuffer();
+  bool InCharacterClass = false;
+  bool HaveClosingSlash = false;
+  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
+    // Regular expressions are terminated with a '/', which can only be
+    // escaped using '\' or a character class between '[' and ']'.
+    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
+    switch (*Offset) {
+    case '\\':
+      if (Offset + 1 != Buffer.end())
+        ++Offset; // Skip the escaped character; never step past the end.
+      break;
+    case '[':
+      InCharacterClass = true;
+      break;
+    case ']':
+      InCharacterClass = false;
+      break;
+    case '/':
+      if (!InCharacterClass)
+        HaveClosingSlash = true;
+      break;
+    }
+  }
+
+  RegexToken->Type = TT_RegexLiteral;
+  // Treat regex literals like other string_literals.
+  RegexToken->Tok.setKind(tok::string_literal);
+  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
+  RegexToken->ColumnWidth = RegexToken->TokenText.size();
+
+  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
+}
+
+// If the last token is a lone '`', turns it into a template string literal.
+void FormatTokenLexer::tryParseTemplateString() {
+  FormatToken *BacktickToken = Tokens.back();
+  if (!BacktickToken->is(tok::unknown) || BacktickToken->TokenText != "`")
+    return;
+
+  // 'Manually' lex ahead in the current file buffer.
+  const char *Offset = Lex->getBufferLocation();
+  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
+  const char *BufferEnd = Lex->getBuffer().end();
+  for (; Offset != BufferEnd && *Offset != '`'; ++Offset) {
+    if (*Offset == '\\' && Offset + 1 != BufferEnd)
+      ++Offset; // Skip the escaped character.
+  }
+  if (Offset != BufferEnd)
+    ++Offset; // Step over the closing backtick; absent when unterminated.
+  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
+  BacktickToken->Type = TT_TemplateString;
+  BacktickToken->Tok.setKind(tok::string_literal);
+  BacktickToken->TokenText = LiteralText;
+
+  // Adjust width for multiline literals; find() == npos keeps the whole text.
+  StringRef FirstLineText = LiteralText.substr(0, LiteralText.find('\n'));
+  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
+      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
+  size_t LastBreak = LiteralText.rfind('\n');
+  if (LastBreak != StringRef::npos) {
+    BacktickToken->IsMultiline = true;
+    unsigned StartColumn = 0; // The template tail spans the entire line.
+    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
+        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
+        Style.TabWidth, Encoding);
+  }
+
+  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
+}
+
+bool FormatTokenLexer::tryMerge_TMacro() {
+  if (Tokens.size() < 4)
+    return false;
+  FormatToken *Last = Tokens.back();
+  if (!Last->is(tok::r_paren))
+    return false;
+
+  FormatToken *String = Tokens[Tokens.size() - 2];
+  if (!String->is(tok::string_literal) || String->IsMultiline)
+    return false;
+
+  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
+    return false;
+
+  FormatToken *Macro = Tokens[Tokens.size() - 4];
+  if (Macro->TokenText != "_T")
+    return false;
+
+  const char *Start = Macro->TokenText.data();
+  const char *End = Last->TokenText.data() + Last->TokenText.size();
+  String->TokenText = StringRef(Start, End - Start);
+  String->IsFirst = Macro->IsFirst;
+  String->LastNewlineOffset = Macro->LastNewlineOffset;
+  String->WhitespaceRange = Macro->WhitespaceRange;
+  String->OriginalColumn = Macro->OriginalColumn;
+  String->ColumnWidth = encoding::columnWidthWithTabs(
+      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
+  String->NewlinesBefore = Macro->NewlinesBefore;
+  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
+
+  Tokens.pop_back();
+  Tokens.pop_back();
+  Tokens.pop_back();
+  Tokens.back() = String;
+  return true;
+}
+
+bool FormatTokenLexer::tryMergeConflictMarkers() {
+  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
+    return false;
+
+  // Conflict lines look like:
+  // <marker> <text from the vcs>
+  // For example:
+  // >>>>>>> /file/in/file/system at revision 1234
+  //
+  // We merge all tokens in a line that starts with a conflict marker
+  // into a single token with a special token type that the unwrapped line
+  // parser will use to correctly rebuild the underlying code.
+
+  FileID ID;
+  // Get the position of the first token in the line.
+  unsigned FirstInLineOffset;
+  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
+      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
+  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
+  // Calculate the offset of the start of the current line.
+  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
+  if (LineOffset == StringRef::npos) {
+    LineOffset = 0;
+  } else {
+    ++LineOffset;
+  }
+
+  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
+  StringRef LineStart;
+  if (FirstSpace == StringRef::npos) {
+    LineStart = Buffer.substr(LineOffset);
+  } else {
+    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
+  }
+
+  TokenType Type = TT_Unknown;
+  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
+    Type = TT_ConflictStart;
+  } else if (LineStart == "|||||||" || LineStart == "=======" ||
+             LineStart == "====") {
+    Type = TT_ConflictAlternative;
+  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
+    Type = TT_ConflictEnd;
+  }
+
+  if (Type != TT_Unknown) {
+    FormatToken *Next = Tokens.back();
+
+    Tokens.resize(FirstInLineIndex + 1);
+    // We do not need to build a complete token here, as we will skip it
+    // during parsing anyway (as we must not touch whitespace around conflict
+    // markers).
+    Tokens.back()->Type = Type;
+    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
+
+    Tokens.push_back(Next);
+    return true;
+  }
+
+  return false;
+}
+
+FormatToken *FormatTokenLexer::getStashedToken() {
+  // Create a synthesized second '>' or '<' token.
+  Token Tok = FormatTok->Tok;
+  StringRef TokenText = FormatTok->TokenText;
+
+  unsigned OriginalColumn = FormatTok->OriginalColumn;
+  FormatTok = new (Allocator.Allocate()) FormatToken;
+  FormatTok->Tok = Tok;
+  SourceLocation TokLocation =
+      FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
+  FormatTok->Tok.setLocation(TokLocation);
+  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
+  FormatTok->TokenText = TokenText;
+  FormatTok->ColumnWidth = 1;
+  FormatTok->OriginalColumn = OriginalColumn + 1;
+
+  return FormatTok;
+}
+
+FormatToken *FormatTokenLexer::getNextToken() {
+  if (GreaterStashed) {
+    GreaterStashed = false;
+    return getStashedToken();
+  }
+  if (LessStashed) {
+    LessStashed = false;
+    return getStashedToken();
+  }
+
+  FormatTok = new (Allocator.Allocate()) FormatToken;
+  readRawToken(*FormatTok);
+  SourceLocation WhitespaceStart =
+      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
+  FormatTok->IsFirst = IsFirstToken;
+  IsFirstToken = false;
+
+  // Consume and record whitespace until we find a significant token.
+  unsigned WhitespaceLength = TrailingWhitespace;
+  while (FormatTok->Tok.is(tok::unknown)) {
+    StringRef Text = FormatTok->TokenText;
+    auto EscapesNewline = [&](int pos) {
+      // A '\r' here is just part of '\r\n'. Skip it.
+      if (pos >= 0 && Text[pos] == '\r')
+        --pos;
+      // See whether there is an odd number of '\' before this.
+      unsigned count = 0;
+      for (; pos >= 0; --pos, ++count)
+        if (Text[pos] != '\\')
+          break;
+      return count & 1;
+    };
+    // FIXME: This miscounts tok:unknown tokens that are not just
+    // whitespace, e.g. a '`' character.
+    for (int i = 0, e = Text.size(); i != e; ++i) {
+      switch (Text[i]) {
+      case '\n':
+        ++FormatTok->NewlinesBefore;
+        FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
+        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
+        Column = 0;
+        break;
+      case '\r':
+        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
+        Column = 0;
+        break;
+      case '\f':
+      case '\v':
+        Column = 0;
+        break;
+      case ' ':
+        ++Column;
+        break;
+      case '\t':
+        Column += Style.TabWidth - Column % Style.TabWidth;
+        break;
+      case '\\':
+        if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
+          FormatTok->Type = TT_ImplicitStringLiteral;
+        break;
+      default:
+        FormatTok->Type = TT_ImplicitStringLiteral;
+        break;
+      }
+      if (FormatTok->Type == TT_ImplicitStringLiteral)
+        break;
+    }
+
+    if (FormatTok->is(TT_ImplicitStringLiteral))
+      break;
+    WhitespaceLength += FormatTok->Tok.getLength();
+
+    readRawToken(*FormatTok);
+  }
+
+  // In case the token starts with escaped newlines, we want to
+  // take them into account as whitespace - this pattern is quite frequent
+  // in macro definitions.
+  // FIXME: Add a more explicit test.
+  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
+         FormatTok->TokenText[1] == '\n') {
+    ++FormatTok->NewlinesBefore;
+    WhitespaceLength += 2;
+    FormatTok->LastNewlineOffset = 2;
+    Column = 0;
+    FormatTok->TokenText = FormatTok->TokenText.substr(2);
+  }
+
+  FormatTok->WhitespaceRange = SourceRange(
+      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
+
+  FormatTok->OriginalColumn = Column;
+
+  TrailingWhitespace = 0;
+  if (FormatTok->Tok.is(tok::comment)) {
+    // FIXME: Add the trimmed whitespace to Column.
+    StringRef UntrimmedText = FormatTok->TokenText;
+    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
+    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
+  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
+    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
+    FormatTok->Tok.setIdentifierInfo(&Info);
+    FormatTok->Tok.setKind(Info.getTokenID());
+    if (Style.Language == FormatStyle::LK_Java &&
+        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
+                           tok::kw_operator)) {
+      FormatTok->Tok.setKind(tok::identifier);
+      FormatTok->Tok.setIdentifierInfo(nullptr);
+    } else if (Style.Language == FormatStyle::LK_JavaScript &&
+               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
+                                  tok::kw_operator)) {
+      FormatTok->Tok.setKind(tok::identifier);
+      FormatTok->Tok.setIdentifierInfo(nullptr);
+    }
+  } else if (FormatTok->Tok.is(tok::greatergreater)) {
+    FormatTok->Tok.setKind(tok::greater);
+    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
+    GreaterStashed = true;
+  } else if (FormatTok->Tok.is(tok::lessless)) {
+    FormatTok->Tok.setKind(tok::less);
+    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
+    LessStashed = true;
+  }
+
+  // Now FormatTok is the next non-whitespace token.
+
+  StringRef Text = FormatTok->TokenText;
+  size_t FirstNewlinePos = Text.find('\n');
+  if (FirstNewlinePos == StringRef::npos) {
+    // FIXME: ColumnWidth actually depends on the start column, we need to
+    // take this into account when the token is moved.
+    FormatTok->ColumnWidth =
+        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
+    Column += FormatTok->ColumnWidth;
+  } else {
+    FormatTok->IsMultiline = true;
+    // FIXME: ColumnWidth actually depends on the start column, we need to
+    // take this into account when the token is moved.
+    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
+        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
+
+    // The last line of the token always starts in column 0.
+    // Thus, the length can be precomputed even in the presence of tabs.
+    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
+        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
+    Column = FormatTok->LastLineColumnWidth;
+  }
+
+  if (Style.Language == FormatStyle::LK_Cpp) {
+    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
+          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
+              tok::pp_define) &&
+        std::find(ForEachMacros.begin(), ForEachMacros.end(),
+                  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
+      FormatTok->Type = TT_ForEachMacro;
+    } else if (FormatTok->is(tok::identifier)) {
+      if (MacroBlockBeginRegex.match(Text)) {
+        FormatTok->Type = TT_MacroBlockBegin;
+      } else if (MacroBlockEndRegex.match(Text)) {
+        FormatTok->Type = TT_MacroBlockEnd;
+      }
+    }
+  }
+
+  return FormatTok;
+}
+
+void FormatTokenLexer::readRawToken(FormatToken &Tok) {
+  Lex->LexFromRawLexer(Tok.Tok);
+  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
+                            Tok.Tok.getLength());
+  // For formatting, treat unterminated string literals like normal string
+  // literals.
+  if (Tok.is(tok::unknown)) {
+    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
+      Tok.Tok.setKind(tok::string_literal);
+      Tok.IsUnterminatedLiteral = true;
+    } else if (Style.Language == FormatStyle::LK_JavaScript &&
+               Tok.TokenText == "''") {
+      Tok.Tok.setKind(tok::string_literal);
+    }
+  }
+
+  if (Style.Language == FormatStyle::LK_JavaScript &&
+      Tok.is(tok::char_constant)) {
+    Tok.Tok.setKind(tok::string_literal);
+  }
+
+  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
+                               Tok.TokenText == "/* clang-format on */")) {
+    FormattingDisabled = false;
+  }
+
+  Tok.Finalized = FormattingDisabled;
+
+  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
+                               Tok.TokenText == "/* clang-format off */")) {
+    FormattingDisabled = true;
+  }
+}
+
+void FormatTokenLexer::resetLexer(unsigned Offset) {
+  StringRef Buffer = SourceMgr.getBufferData(ID);
+  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
+                      getFormattingLangOpts(Style), Buffer.begin(),
+                      Buffer.begin() + Offset, Buffer.end()));
+  Lex->SetKeepWhitespaceMode(true);
+  TrailingWhitespace = 0;
+}
+
+} // namespace format
+} // namespace clang
diff --git a/lib/Format/FormatTokenLexer.h b/lib/Format/FormatTokenLexer.h
new file mode 100644
index 0000000..fa8c888
--- /dev/null
+++ b/lib/Format/FormatTokenLexer.h
@@ -0,0 +1,97 @@
+//===--- FormatTokenLexer.h - Lex FormatTokens ---------------*- C++ ----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains FormatTokenLexer, which tokenizes a source file
+/// into a token stream suitable for ClangFormat.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
+#define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
+
+#include "Encoding.h"
+#include "FormatToken.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Regex.h"
+
+namespace clang {
+namespace format {
+
+class FormatTokenLexer {
+public:
+  FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
+                   const FormatStyle &Style, encoding::Encoding Encoding);
+
+  ArrayRef<FormatToken *> lex();
+
+  const AdditionalKeywords &getKeywords() { return Keywords; }
+
+private:
+  void tryMergePreviousTokens();
+
+  bool tryMergeLessLess();
+
+  bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
+
+  // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
+  bool precedesOperand(FormatToken *Tok);
+
+  bool canPrecedeRegexLiteral(FormatToken *Prev);
+
+  // Tries to parse a JavaScript Regex literal starting at the current token,
+  // if that begins with a slash and is in a location where JavaScript allows
+  // regex literals. Changes the current token to a regex literal and updates
+  // its text if successful.
+  void tryParseJSRegexLiteral();
+
+  void tryParseTemplateString();
+
+  bool tryMerge_TMacro();
+
+  bool tryMergeConflictMarkers();
+
+  FormatToken *getStashedToken();
+
+  FormatToken *getNextToken();
+
+  FormatToken *FormatTok;
+  bool IsFirstToken;
+  bool GreaterStashed, LessStashed;
+  unsigned Column;
+  unsigned TrailingWhitespace;
+  std::unique_ptr<Lexer> Lex;
+  const SourceManager &SourceMgr;
+  FileID ID;
+  const FormatStyle &Style;
+  IdentifierTable IdentTable;
+  AdditionalKeywords Keywords;
+  encoding::Encoding Encoding;
+  llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
+  // Index (in 'Tokens') of the last token that starts a new line.
+  unsigned FirstInLineIndex;
+  SmallVector<FormatToken *, 16> Tokens;
+  SmallVector<IdentifierInfo *, 8> ForEachMacros;
+
+  bool FormattingDisabled;
+
+  llvm::Regex MacroBlockBeginRegex;
+  llvm::Regex MacroBlockEndRegex;
+
+  void readRawToken(FormatToken &Tok);
+
+  void resetLexer(unsigned Offset);
+};
+
+} // namespace format
+} // namespace clang
+
+#endif
diff --git a/lib/Format/Makefile b/lib/Format/Makefile
deleted file mode 100644
index f4d2b98..0000000
--- a/lib/Format/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- clang/lib/Format/Makefile ---------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangFormat
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Format/SortJavaScriptImports.cpp b/lib/Format/SortJavaScriptImports.cpp
new file mode 100644
index 0000000..e800007
--- /dev/null
+++ b/lib/Format/SortJavaScriptImports.cpp
@@ -0,0 +1,448 @@
+//===--- SortJavaScriptImports.cpp - Sort ES6 Imports -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a sort operation for JavaScript ES6 imports.
+///
+//===----------------------------------------------------------------------===//
+
+#include "SortJavaScriptImports.h"
+#include "SortJavaScriptImports.h"
+#include "TokenAnalyzer.h"
+#include "TokenAnnotator.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <string>
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+class FormatTokenLexer;
+
+using clang::format::FormatStyle;
+
+// An imported symbol in a JavaScript ES6 import/export, possibly aliased.
+struct JsImportedSymbol {
+  StringRef Symbol; // Identifier as written before any `as`.
+  StringRef Alias; // Identifier following `as`; empty if not aliased.
+  SourceRange Range; // Source span of the symbol, used to re-emit its text.
+
+  bool operator==(const JsImportedSymbol &RHS) const {
+    // Ignore Range for comparison, it is only used to stitch code together,
+    // but imports at different code locations are still conceptually the same.
+    return Symbol == RHS.Symbol && Alias == RHS.Alias;
+  }
+};
+
+// An ES6 module reference.
+//
+// ES6 implements a module system, where individual modules (~= source files)
+// can reference other modules, either importing symbols from them, or exporting
+// symbols from them:
+//   import {foo} from 'foo';
+//   export {foo};
+//   export {bar} from 'bar';
+//
+// `export`s with URLs are syntactic sugar for an import of the symbol from the
+// URL, followed by an export of the symbol, allowing this code to treat both
+// statements more or less identically, with the exception being that `export`s
+// are sorted last.
+//
+// Imports and exports support individual symbols, but also a wildcard syntax:
+//   import * as prefix from 'foo';
+//   export * from 'bar';
+//
+// This struct represents both exports and imports to build up the information
+// required for sorting module references.
+struct JsModuleReference {
+  bool IsExport = false; // True for `export ...`, false for `import ...`.
+  // Module references are sorted into these categories, in order.
+  enum ReferenceCategory {
+    SIDE_EFFECT,     // "import 'something';"
+    ABSOLUTE,        // from 'something'
+    RELATIVE_PARENT, // from '../*'
+    RELATIVE,        // from './*'; also references without a 'from' clause
+  };
+  ReferenceCategory Category = ReferenceCategory::SIDE_EFFECT;
+  // The URL imported, e.g. `import .. from 'url';`. Empty for `export {a, b};`.
+  StringRef URL;
+  // Prefix from "import * as prefix". Empty for symbol imports and `export *`.
+  // Implies an empty `Symbols` list.
+  StringRef Prefix;
+  // Symbols from `import {SymbolA, SymbolB, ...} from ...;`.
+  SmallVector<JsImportedSymbol, 1> Symbols;
+  // Textual position of the import/export, including preceding and trailing
+  // comments.
+  SourceRange Range;
+};
+
+bool operator<(const JsModuleReference &LHS, const JsModuleReference &RHS) {
+  if (LHS.IsExport != RHS.IsExport)
+    return LHS.IsExport < RHS.IsExport; // Imports sort before exports.
+  if (LHS.Category != RHS.Category)
+    return LHS.Category < RHS.Category; // ReferenceCategory declaration order.
+  if (LHS.Category == JsModuleReference::ReferenceCategory::SIDE_EFFECT)
+    // Side effect imports might be ordering sensitive. Consider them equal so
+    // that a stable sort keeps them in their original relative order.
+    // This retains transitivity because LHS.Category == RHS.Category here.
+    return false;
+  // Empty URLs sort *last* (for export {...};).
+  if (LHS.URL.empty() != RHS.URL.empty())
+    return LHS.URL.empty() < RHS.URL.empty();
+  if (int Res = LHS.URL.compare_lower(RHS.URL))
+    return Res < 0; // Case-insensitive ordering by URL.
+  // '*' imports (with prefix) sort before {a, b, ...} imports.
+  if (LHS.Prefix.empty() != RHS.Prefix.empty())
+    return LHS.Prefix.empty() < RHS.Prefix.empty();
+  if (LHS.Prefix != RHS.Prefix)
+    return LHS.Prefix > RHS.Prefix; // NOTE(review): descending -- intended?
+  return false;
+}
+
+// JavaScriptImportSorter sorts JavaScript ES6 imports and exports. It is
+// implemented as a TokenAnalyzer because ES6 imports have substantial syntactic
+// structure, making it messy to sort them using regular expressions.
+class JavaScriptImportSorter : public TokenAnalyzer {
+public:
+  JavaScriptImportSorter(const Environment &Env, const FormatStyle &Style)
+      : TokenAnalyzer(Env, Style),
+        FileContents(Env.getSourceManager().getBufferData(Env.getFileID())) {}
+
+  tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &) override {
+    tooling::Replacements Result; // Stays empty if nothing has to change.
+    AffectedRangeMgr.computeAffectedLines(AnnotatedLines.begin(),
+                                          AnnotatedLines.end());
+
+    const AdditionalKeywords &Keywords = Tokens.getKeywords();
+    SmallVector<JsModuleReference, 16> References;
+    AnnotatedLine *FirstNonImportLine;
+    std::tie(References, FirstNonImportLine) =
+        parseModuleReferences(Keywords, AnnotatedLines);
+
+    if (References.empty())
+      return Result;
+
+    SmallVector<unsigned, 16> Indices;
+    for (unsigned i = 0, e = References.size(); i != e; ++i)
+      Indices.push_back(i);
+    std::stable_sort(Indices.begin(), Indices.end(),
+                     [&](unsigned LHSI, unsigned RHSI) {
+                       return References[LHSI] < References[RHSI];
+                     });
+    bool ReferencesInOrder = std::is_sorted(Indices.begin(), Indices.end());
+
+    std::string ReferencesText;
+    bool SymbolsInOrder = true;
+    for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
+      JsModuleReference &Reference = References[Indices[i]]; // No deep copy.
+      if (appendReference(ReferencesText, Reference))
+        SymbolsInOrder = false;
+      if (i + 1 < e) {
+        // Insert breaks between imports and exports.
+        ReferencesText += "\n";
+        // Separate imports groups with two line breaks, but keep all exports
+        // in a single group.
+        if (!Reference.IsExport &&
+            (Reference.IsExport != References[Indices[i + 1]].IsExport ||
+             Reference.Category != References[Indices[i + 1]].Category))
+          ReferencesText += "\n";
+      }
+    }
+
+    if (ReferencesInOrder && SymbolsInOrder)
+      return Result;
+
+    SourceRange InsertionPoint = References[0].Range;
+    InsertionPoint.setEnd(References[References.size() - 1].Range.getEnd());
+
+    // The loop above might collapse previously existing line breaks between
+    // import blocks, and thus shrink the file. SortIncludes must not shrink
+    // overall source length as there is currently no re-calculation of ranges
+    // after applying source sorting.
+    // This loop just backfills trailing spaces after the imports, which are
+    // harmless and will be stripped by the subsequent formatting pass.
+    // FIXME: A better long term fix is to re-calculate Ranges after sorting.
+    unsigned PreviousSize = getSourceText(InsertionPoint).size();
+    while (ReferencesText.size() < PreviousSize) {
+      ReferencesText += " ";
+    }
+
+    // Separate references from the main code body of the file.
+    if (FirstNonImportLine && FirstNonImportLine->First->NewlinesBefore < 2)
+      ReferencesText += "\n";
+
+    DEBUG(llvm::dbgs() << "Replacing imports:\n"
+                       << getSourceText(InsertionPoint) << "\nwith:\n"
+                       << ReferencesText << "\n");
+    auto Err = Result.add(tooling::Replacement(
+        Env.getSourceManager(), CharSourceRange::getCharRange(InsertionPoint),
+        ReferencesText));
+    // FIXME: better error handling. For now, just print error message and skip
+    // the replacement for the release version.
+    if (Err)
+      llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+    assert(!Err);
+
+    return Result;
+  }
+
+private:
+  FormatToken *Current;
+  FormatToken *LineEnd;
+
+  FormatToken invalidToken;
+
+  StringRef FileContents;
+
+  void skipComments() { Current = skipComments(Current); }
+
+  FormatToken *skipComments(FormatToken *Tok) {
+    while (Tok && Tok->is(tok::comment))
+      Tok = Tok->Next;
+    return Tok;
+  }
+
+  void nextToken() {
+    Current = Current->Next;
+    skipComments();
+    if (!Current || Current == LineEnd->Next) {
+      // Ran past the end of the line: park Current on a tok::unknown sentinel
+      // so that further parsing on this line fails instead of reading null.
+      invalidToken.Tok.setKind(tok::unknown);
+      Current = &invalidToken;
+    }
+  }
+
+  StringRef getSourceText(SourceRange Range) {
+    return getSourceText(Range.getBegin(), Range.getEnd());
+  }
+
+  StringRef getSourceText(SourceLocation Begin, SourceLocation End) {
+    const SourceManager &SM = Env.getSourceManager();
+    return FileContents.substr(SM.getFileOffset(Begin),
+                               SM.getFileOffset(End) - SM.getFileOffset(Begin));
+  }
+
+  // Appends the text of ``Reference`` to ``Buffer``, returning true if text
+  // within the ``Reference`` changed (i.e. its symbols were re-ordered).
+  bool appendReference(std::string &Buffer, JsModuleReference &Reference) {
+    // Sort the individual symbols within the import (case-insensitively).
+    // E.g. `import {b, a} from 'x';` -> `import {a, b} from 'x';`
+    SmallVector<JsImportedSymbol, 1> Symbols = Reference.Symbols;
+    std::stable_sort(
+        Symbols.begin(), Symbols.end(),
+        [&](const JsImportedSymbol &LHS, const JsImportedSymbol &RHS) {
+          return LHS.Symbol.compare_lower(RHS.Symbol) < 0;
+        });
+    if (Symbols == Reference.Symbols) {
+      // No change in symbol order: emit the original statement text verbatim.
+      StringRef ReferenceStmt = getSourceText(Reference.Range);
+      Buffer += ReferenceStmt;
+      return false;
+    }
+    // Stitch together the module reference start...
+    SourceLocation SymbolsStart = Reference.Symbols.front().Range.getBegin();
+    SourceLocation SymbolsEnd = Reference.Symbols.back().Range.getEnd();
+    Buffer += getSourceText(Reference.Range.getBegin(), SymbolsStart);
+    // ... then the symbols in sorted order, comma separated ...
+    for (auto I = Symbols.begin(), E = Symbols.end(); I != E; ++I) {
+      if (I != Symbols.begin())
+        Buffer += ",";
+      Buffer += getSourceText(I->Range);
+    }
+    // ... followed by the module reference end.
+    Buffer += getSourceText(SymbolsEnd, Reference.Range.getEnd());
+    return true;
+  }
+
+  // Parses module references in the given lines. Returns the module references,
+  // and a pointer to the first "main code" line if that is adjacent to the
+  // affected lines of module references, nullptr otherwise.
+  std::pair<SmallVector<JsModuleReference, 16>, AnnotatedLine *>
+  parseModuleReferences(const AdditionalKeywords &Keywords,
+                        SmallVectorImpl<AnnotatedLine *> &AnnotatedLines) {
+    SmallVector<JsModuleReference, 16> References;
+    SourceLocation Start;
+    bool FoundLines = false;
+    AnnotatedLine *FirstNonImportLine = nullptr;
+    for (auto *Line : AnnotatedLines) {
+      if (!Line->Affected) {
+        // Only sort the first contiguous block of affected lines.
+        if (FoundLines)
+          break;
+        else
+          continue;
+      }
+      Current = Line->First;
+      LineEnd = Line->Last;
+      skipComments();
+      if (Start.isInvalid() || References.empty())
+        // After the first file level comment, consider line comments to be part
+        // of the import that immediately follows them by using the previously
+        // set Start.
+        Start = Line->First->Tok.getLocation();
+      if (!Current)
+        continue; // Only comments on this line.
+      FoundLines = true;
+      JsModuleReference Reference;
+      Reference.Range.setBegin(Start);
+      if (!parseModuleReference(Keywords, Reference)) {
+        FirstNonImportLine = Line; // First line that is not an import/export.
+        break;
+      }
+      Reference.Range.setEnd(LineEnd->Tok.getEndLoc());
+      DEBUG({
+        llvm::dbgs() << "JsModuleReference: {"
+                     << "is_export: " << Reference.IsExport
+                     << ", cat: " << Reference.Category
+                     << ", url: " << Reference.URL
+                     << ", prefix: " << Reference.Prefix;
+        for (size_t i = 0; i < Reference.Symbols.size(); ++i)
+          llvm::dbgs() << ", " << Reference.Symbols[i].Symbol << " as "
+                       << Reference.Symbols[i].Alias;
+        llvm::dbgs() << ", text: " << getSourceText(Reference.Range);
+        llvm::dbgs() << "}\n";
+      });
+      References.push_back(Reference);
+      Start = SourceLocation(); // Invalidate so the next line sets it afresh.
+    }
+    return std::make_pair(std::move(References), FirstNonImportLine);
+  }
+
+  // Parses a JavaScript/ECMAScript 6 module reference.
+  // See http://www.ecma-international.org/ecma-262/6.0/#sec-scripts-and-modules
+  // for grammar EBNF (production ModuleItem).
+  bool parseModuleReference(const AdditionalKeywords &Keywords,
+                            JsModuleReference &Reference) {
+    if (!Current || !Current->isOneOf(Keywords.kw_import, tok::kw_export))
+      return false;
+    Reference.IsExport = Current->is(tok::kw_export);
+
+    nextToken();
+    if (Current->isStringLiteral() && !Reference.IsExport) {
+      // "import 'side-effect';" -- URL is the literal without its quotes.
+      Reference.Category = JsModuleReference::ReferenceCategory::SIDE_EFFECT;
+      Reference.URL =
+          Current->TokenText.substr(1, Current->TokenText.size() - 2);
+      return true;
+    }
+
+    if (!parseModuleBindings(Keywords, Reference))
+      return false;
+    nextToken();
+
+    if (Current->is(Keywords.kw_from)) {
+      // imports have a 'from' clause, exports might not.
+      nextToken();
+      if (!Current->isStringLiteral())
+        return false;
+      // URL = TokenText without the quotes.
+      Reference.URL =
+          Current->TokenText.substr(1, Current->TokenText.size() - 2);
+      if (Reference.URL.startswith(".."))
+        Reference.Category =
+            JsModuleReference::ReferenceCategory::RELATIVE_PARENT;
+      else if (Reference.URL.startswith("."))
+        Reference.Category = JsModuleReference::ReferenceCategory::RELATIVE;
+      else
+        Reference.Category = JsModuleReference::ReferenceCategory::ABSOLUTE;
+    } else {
+      // No 'from' clause (e.g. `export {a};`): grouped with RELATIVE.
+      Reference.Category = JsModuleReference::ReferenceCategory::RELATIVE;
+    }
+    return true;
+  }
+
+  bool parseModuleBindings(const AdditionalKeywords &Keywords,
+                           JsModuleReference &Reference) {
+    if (parseStarBinding(Keywords, Reference))
+      return true;
+    return parseNamedBindings(Keywords, Reference);
+  }
+
+  bool parseStarBinding(const AdditionalKeywords &Keywords,
+                        JsModuleReference &Reference) {
+    // * as prefix from '...';
+    if (Current->isNot(tok::star))
+      return false;
+    nextToken();
+    if (Current->isNot(Keywords.kw_as))
+      return false;
+    nextToken();
+    if (Current->isNot(tok::identifier))
+      return false;
+    Reference.Prefix = Current->TokenText;
+    return true;
+  }
+
+  bool parseNamedBindings(const AdditionalKeywords &Keywords,
+                          JsModuleReference &Reference) {
+    if (Current->isNot(tok::l_brace))
+      return false;
+
+    // {sym as alias, sym2 as ...} from '...';
+    nextToken();
+    while (true) {
+      if (Current->is(tok::r_brace))
+        return true;
+      if (Current->isNot(tok::identifier))
+        return false;
+
+      JsImportedSymbol Symbol;
+      Symbol.Symbol = Current->TokenText;
+      // Make sure to include any preceding comments.
+      Symbol.Range.setBegin(
+          Current->getPreviousNonComment()->Next->WhitespaceRange.getBegin());
+      nextToken();
+
+      if (Current->is(Keywords.kw_as)) {
+        nextToken();
+        if (Current->isNot(tok::identifier))
+          return false;
+        Symbol.Alias = Current->TokenText;
+        nextToken();
+      }
+      Symbol.Range.setEnd(Current->Tok.getLocation());
+      Reference.Symbols.push_back(Symbol);
+
+      if (Current->is(tok::r_brace))
+        return true;
+      if (Current->isNot(tok::comma))
+        return false;
+      nextToken();
+    }
+  }
+};
+
+tooling::Replacements sortJavaScriptImports(const FormatStyle &Style,
+                                            StringRef Code,
+                                            ArrayRef<tooling::Range> Ranges,
+                                            StringRef FileName) {
+  // FIXME: Cursor support.
+  std::unique_ptr<Environment> Env =
+      Environment::CreateVirtualEnvironment(Code, FileName, Ranges);
+  JavaScriptImportSorter Sorter(*Env, Style);
+  return Sorter.process();
+}
+
+} // end namespace format
+} // end namespace clang
diff --git a/lib/Format/SortJavaScriptImports.h b/lib/Format/SortJavaScriptImports.h
new file mode 100644
index 0000000..f22a051
--- /dev/null
+++ b/lib/Format/SortJavaScriptImports.h
@@ -0,0 +1,36 @@
+//===--- SortJavaScriptImports.h - Sort ES6 Imports -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares a sorter for JavaScript ES6 imports.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
+#define LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace clang {
+namespace format {
+
+// Sort JavaScript ES6 imports/exports in ``Code``. The generated replacements
+// only monotonically increase the length of the given code.
+tooling::Replacements sortJavaScriptImports(const FormatStyle &Style,
+                                            StringRef Code,
+                                            ArrayRef<tooling::Range> Ranges,
+                                            StringRef FileName);
+
+} // end namespace format
+} // end namespace clang
+
+#endif
diff --git a/lib/Format/TokenAnalyzer.cpp b/lib/Format/TokenAnalyzer.cpp
new file mode 100644
index 0000000..7baba62
--- /dev/null
+++ b/lib/Format/TokenAnalyzer.cpp
@@ -0,0 +1,146 @@
+//===--- TokenAnalyzer.cpp - Analyze Token Streams --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements an abstract TokenAnalyzer and associated helper
+/// classes. TokenAnalyzer can be extended to generate replacements based on
+/// an annotated and pre-processed token stream.
+///
+//===----------------------------------------------------------------------===//
+
+#include "TokenAnalyzer.h"
+#include "AffectedRangeManager.h"
+#include "Encoding.h"
+#include "FormatToken.h"
+#include "FormatTokenLexer.h"
+#include "TokenAnnotator.h"
+#include "UnwrappedLineParser.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+// This sets up a virtual file system with file \p FileName containing \p
+// Code.
+std::unique_ptr<Environment>
+Environment::CreateVirtualEnvironment(StringRef Code, StringRef FileName,
+                                      ArrayRef<tooling::Range> Ranges) {
+  // This is referenced by `FileMgr` and will be released by `FileMgr` when it
+  // is deleted.
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
+      new vfs::InMemoryFileSystem);
+  // This is passed to `SM` as reference, so the pointer has to be referenced
+  // in `Environment` so that `FileMgr` can out-live this function scope.
+  std::unique_ptr<FileManager> FileMgr(
+      new FileManager(FileSystemOptions(), InMemoryFileSystem));
+  // This is passed to `SM` as reference, so the pointer has to be referenced
+  // by `Environment` due to the same reason above.
+  std::unique_ptr<DiagnosticsEngine> Diagnostics(new DiagnosticsEngine(
+      IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
+      new DiagnosticOptions));
+  // This will be stored as reference, so the pointer has to be stored in
+  // `Environment` due to the same reason above.
+  std::unique_ptr<SourceManager> VirtualSM(
+      new SourceManager(*Diagnostics, *FileMgr));
+  InMemoryFileSystem->addFile(
+      FileName, 0, llvm::MemoryBuffer::getMemBuffer(
+                       Code, FileName, /*RequiresNullTerminator=*/false));
+  FileID ID = VirtualSM->createFileID(FileMgr->getFile(FileName),
+                                      SourceLocation(), clang::SrcMgr::C_User);
+  assert(ID.isValid());
+  SourceLocation StartOfFile = VirtualSM->getLocForStartOfFile(ID);
+  std::vector<CharSourceRange> CharRanges;
+  for (const tooling::Range &Range : Ranges) {
+    SourceLocation Start = StartOfFile.getLocWithOffset(Range.getOffset());
+    SourceLocation End = Start.getLocWithOffset(Range.getLength());
+    CharRanges.push_back(CharSourceRange::getCharRange(Start, End));
+  }
+  return llvm::make_unique<Environment>(ID, std::move(FileMgr),
+                                        std::move(VirtualSM),
+                                        std::move(Diagnostics), CharRanges);
+}
+
+TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
+    : Style(Style), Env(Env),
+      AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
+      UnwrappedLines(1),
+      Encoding(encoding::detectEncoding(
+          Env.getSourceManager().getBufferData(Env.getFileID()))) {
+  DEBUG(
+      llvm::dbgs() << "File encoding: "
+                   << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
+                   << "\n");
+  DEBUG(llvm::dbgs() << "Language: " << getLanguageName(Style.Language)
+                     << "\n");
+}
+
+tooling::Replacements TokenAnalyzer::process() {
+  tooling::Replacements Result; // Accumulates the replacements of all runs.
+  FormatTokenLexer Tokens(Env.getSourceManager(), Env.getFileID(), Style,
+                          Encoding);
+
+  UnwrappedLineParser Parser(Style, Tokens.getKeywords(), Tokens.lex(), *this);
+  Parser.parse();
+  assert(UnwrappedLines.rbegin()->empty()); // Trailing run is skipped below.
+  for (unsigned Run = 0, RunE = UnwrappedLines.size(); Run + 1 != RunE; ++Run) {
+    DEBUG(llvm::dbgs() << "Run " << Run << "...\n");
+    SmallVector<AnnotatedLine *, 16> AnnotatedLines;
+
+    TokenAnnotator Annotator(Style, Tokens.getKeywords());
+    for (unsigned i = 0, e = UnwrappedLines[Run].size(); i != e; ++i) {
+      AnnotatedLines.push_back(new AnnotatedLine(UnwrappedLines[Run][i]));
+      Annotator.annotate(*AnnotatedLines.back());
+    }
+
+    tooling::Replacements RunResult =
+        analyze(Annotator, AnnotatedLines, Tokens, Result);
+
+    DEBUG({
+      llvm::dbgs() << "Replacements for run " << Run << ":\n";
+      for (tooling::Replacements::const_iterator I = RunResult.begin(),
+                                                 E = RunResult.end();
+           I != E; ++I) {
+        llvm::dbgs() << I->toString() << "\n";
+      }
+    });
+    for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
+      delete AnnotatedLines[i]; // Allocated with `new` in the loop above.
+    }
+    for (const auto &R : RunResult) {
+      auto Err = Result.add(R);
+      // FIXME: better error handling here. For now, simply return an empty
+      // Replacements to indicate failure.
+      if (Err) {
+        llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+        return tooling::Replacements();
+      }
+    }
+  }
+  return Result;
+}
+
+void TokenAnalyzer::consumeUnwrappedLine(const UnwrappedLine &TheLine) {
+  assert(!UnwrappedLines.empty());
+  UnwrappedLines.back().push_back(TheLine);
+}
+
+void TokenAnalyzer::finishRun() {
+  UnwrappedLines.push_back(SmallVector<UnwrappedLine, 16>());
+}
+
+} // end namespace format
+} // end namespace clang
diff --git a/lib/Format/TokenAnalyzer.h b/lib/Format/TokenAnalyzer.h
new file mode 100644
index 0000000..c1aa9c5
--- /dev/null
+++ b/lib/Format/TokenAnalyzer.h
@@ -0,0 +1,108 @@
+//===--- TokenAnalyzer.h - Analyze Token Streams ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares an abstract TokenAnalyzer, and associated helper
+/// classes. TokenAnalyzer can be extended to generate replacements based on
+/// an annotated and pre-processed token stream.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
+#define LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
+
+#include "AffectedRangeManager.h"
+#include "Encoding.h"
+#include "FormatToken.h"
+#include "FormatTokenLexer.h"
+#include "TokenAnnotator.h"
+#include "UnwrappedLineParser.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+class Environment {
+public:
+  Environment(SourceManager &SM, FileID ID, ArrayRef<CharSourceRange> Ranges)
+      : ID(ID), CharRanges(Ranges.begin(), Ranges.end()), SM(SM) {}
+
+  Environment(FileID ID, std::unique_ptr<FileManager> FileMgr,
+              std::unique_ptr<SourceManager> VirtualSM,
+              std::unique_ptr<DiagnosticsEngine> Diagnostics,
+              const std::vector<CharSourceRange> &CharRanges)
+      : ID(ID), CharRanges(CharRanges.begin(), CharRanges.end()),
+        SM(*VirtualSM), FileMgr(std::move(FileMgr)),
+        VirtualSM(std::move(VirtualSM)), Diagnostics(std::move(Diagnostics)) {}
+
+  // This sets up a virtual file system with file \p FileName containing \p
+  // Code.
+  static std::unique_ptr<Environment>
+  CreateVirtualEnvironment(StringRef Code, StringRef FileName,
+                           ArrayRef<tooling::Range> Ranges);
+
+  FileID getFileID() const { return ID; }
+
+  StringRef getFileName() const { return FileName; }
+
+  ArrayRef<CharSourceRange> getCharRanges() const { return CharRanges; }
+
+  const SourceManager &getSourceManager() const { return SM; }
+
+private:
+  FileID ID;
+  StringRef FileName; // NOTE(review): no constructor here sets this; confirm.
+  SmallVector<CharSourceRange, 8> CharRanges;
+  SourceManager &SM;
+
+  // The order of these fields is important - they should be in the same order
+  // as they are created in `CreateVirtualEnvironment` so that they can be
+  // deleted in the reverse order as they are created.
+  std::unique_ptr<FileManager> FileMgr;
+  std::unique_ptr<SourceManager> VirtualSM;
+  std::unique_ptr<DiagnosticsEngine> Diagnostics;
+};
+
+class TokenAnalyzer : public UnwrappedLineConsumer {
+public:
+  TokenAnalyzer(const Environment &Env, const FormatStyle &Style);
+
+  tooling::Replacements process();
+
+protected:
+  virtual tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &Result) = 0;
+
+  void consumeUnwrappedLine(const UnwrappedLine &TheLine) override;
+
+  void finishRun() override;
+
+  FormatStyle Style;
+  // Provides the FileID, the SourceManager and the character ranges.
+  const Environment &Env;
+  // AffectedRangeMgr stores ranges to be fixed.
+  AffectedRangeManager AffectedRangeMgr;
+  SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
+  encoding::Encoding Encoding;
+};
+
+} // end namespace format
+} // end namespace clang
+
+#endif
diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp
index e3528b8..0599611 100644
--- a/lib/Format/TokenAnnotator.cpp
+++ b/lib/Format/TokenAnnotator.cpp
@@ -134,6 +134,10 @@
 
     if (Left->is(TT_OverloadedOperatorLParen)) {
       Contexts.back().IsExpression = false;
+    } else if (Style.Language == FormatStyle::LK_JavaScript &&
+               Line.startsWith(Keywords.kw_type, tok::identifier)) {
+      // type X = (...);
+      Contexts.back().IsExpression = false;
     } else if (Left->Previous &&
         (Left->Previous->isOneOf(tok::kw_static_assert, tok::kw_decltype,
                                  tok::kw_if, tok::kw_while, tok::l_paren,
@@ -141,6 +145,16 @@
          Left->Previous->is(TT_BinaryOperator))) {
       // static_assert, if and while usually contain expressions.
       Contexts.back().IsExpression = true;
+    } else if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous &&
+               (Left->Previous->is(Keywords.kw_function) ||
+                (Left->Previous->endsSequence(tok::identifier,
+                                              Keywords.kw_function)))) {
+      // function(...) or function f(...)
+      Contexts.back().IsExpression = false;
+    } else if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous &&
+               Left->Previous->is(TT_JsTypeColon)) {
+      // let x: (SomeType);
+      Contexts.back().IsExpression = false;
     } else if (Left->Previous && Left->Previous->is(tok::r_square) &&
                Left->Previous->MatchingParen &&
                Left->Previous->MatchingParen->is(TT_LambdaLSquare)) {
@@ -172,8 +186,8 @@
       Left->Type = TT_ObjCMethodExpr;
     }
 
-    bool MightBeFunctionType = CurrentToken->isOneOf(tok::star, tok::amp) &&
-                               !Contexts[Contexts.size() - 2].IsExpression;
+    bool MightBeFunctionType = !Contexts[Contexts.size() - 2].IsExpression;
+    bool ProbablyFunctionType = CurrentToken->isOneOf(tok::star, tok::amp);
     bool HasMultipleLines = false;
     bool HasMultipleParametersOnALine = false;
     bool MightBeObjCForRangeLoop =
@@ -200,14 +214,15 @@
       if (CurrentToken->Previous->is(TT_PointerOrReference) &&
           CurrentToken->Previous->Previous->isOneOf(tok::l_paren,
                                                     tok::coloncolon))
-        MightBeFunctionType = true;
+        ProbablyFunctionType = true;
+      if (CurrentToken->is(tok::comma))
+        MightBeFunctionType = false;
       if (CurrentToken->Previous->is(TT_BinaryOperator))
         Contexts.back().IsExpression = true;
       if (CurrentToken->is(tok::r_paren)) {
-        if (MightBeFunctionType && CurrentToken->Next &&
+        if (MightBeFunctionType && ProbablyFunctionType && CurrentToken->Next &&
             (CurrentToken->Next->is(tok::l_paren) ||
-             (CurrentToken->Next->is(tok::l_square) &&
-              Line.MustBeDeclaration)))
+             (CurrentToken->Next->is(tok::l_square) && Line.MustBeDeclaration)))
           Left->Type = TT_FunctionTypeLParen;
         Left->MatchingParen = CurrentToken;
         CurrentToken->MatchingParen = Left;
@@ -312,9 +327,9 @@
         Left->Type = TT_JsComputedPropertyName;
       } else if (Style.Language == FormatStyle::LK_Proto ||
                  (Parent &&
-                  Parent->isOneOf(TT_BinaryOperator, tok::at, tok::comma,
-                                  tok::l_paren, tok::l_square, tok::question,
-                                  tok::colon, tok::kw_return,
+                  Parent->isOneOf(TT_BinaryOperator, TT_TemplateCloser, tok::at,
+                                  tok::comma, tok::l_paren, tok::l_square,
+                                  tok::question, tok::colon, tok::kw_return,
                                   // Should only be relevant to JavaScript:
                                   tok::kw_default))) {
         Left->Type = TT_ArrayInitializerLSquare;
@@ -409,7 +424,8 @@
                 (!Contexts.back().ColonIsDictLiteral ||
                  Style.Language != FormatStyle::LK_Cpp)) ||
                Style.Language == FormatStyle::LK_Proto) &&
-              Previous->Tok.getIdentifierInfo())
+              (Previous->Tok.getIdentifierInfo() ||
+               Previous->is(tok::string_literal)))
             Previous->Type = TT_SelectorName;
           if (CurrentToken->is(tok::colon) ||
               Style.Language == FormatStyle::LK_JavaScript)
@@ -423,7 +439,7 @@
   }
 
   void updateParameterCount(FormatToken *Left, FormatToken *Current) {
-    if (Current->is(tok::l_brace) && !Current->is(TT_DictLiteral))
+    if (Current->is(tok::l_brace) && Current->BlockKind == BK_Block)
       ++Left->BlockParameterCount;
     if (Current->is(tok::comma)) {
       ++Left->ParameterCount;
@@ -504,7 +520,7 @@
         Tok->Type = TT_BitFieldColon;
       } else if (Contexts.size() == 1 &&
                  !Line.First->isOneOf(tok::kw_enum, tok::kw_case)) {
-        if (Tok->Previous->is(tok::r_paren))
+        if (Tok->Previous->isOneOf(tok::r_paren, tok::kw_noexcept))
           Tok->Type = TT_CtorInitializerColon;
         else
           Tok->Type = TT_InheritanceColon;
@@ -517,6 +533,14 @@
         Tok->Type = TT_InlineASMColon;
       }
       break;
+    case tok::pipe:
+    case tok::amp:
+      // | and & in declarations/type expressions represent union and
+      // intersection types, respectively.
+      if (Style.Language == FormatStyle::LK_JavaScript &&
+          !Contexts.back().IsExpression)
+        Tok->Type = TT_JsTypeOperator;
+      break;
     case tok::kw_if:
     case tok::kw_while:
       if (CurrentToken && CurrentToken->is(tok::l_paren)) {
@@ -526,6 +550,9 @@
       }
       break;
     case tok::kw_for:
+      if (Style.Language == FormatStyle::LK_JavaScript && Tok->Previous &&
+          Tok->Previous->is(tok::period))
+        break;
       Contexts.back().ColonIsForRangeExpr = true;
       next();
       if (!parseParens())
@@ -612,7 +639,7 @@
       }
       // Declarations cannot be conditional expressions, this can only be part
       // of a type declaration.
-      if (Line.MustBeDeclaration &&
+      if (Line.MustBeDeclaration && !Contexts.back().IsExpression &&
           Style.Language == FormatStyle::LK_JavaScript)
         break;
       parseConditional();
@@ -675,10 +702,24 @@
   }
 
   LineType parsePreprocessorDirective() {
+    bool IsFirstToken = CurrentToken->IsFirst;
     LineType Type = LT_PreprocessorDirective;
     next();
     if (!CurrentToken)
       return Type;
+
+    if (Style.Language == FormatStyle::LK_JavaScript && IsFirstToken) {
+      // JavaScript files can contain shebang lines of the form:
+      // #!/usr/bin/env node
+      // Treat these like C++ #include directives.
+      while (CurrentToken) {
+        // Tokens cannot be comments here.
+        CurrentToken->Type = TT_ImplicitStringLiteral;
+        next();
+      }
+      return LT_ImportStatement;
+    }
+
     if (CurrentToken->Tok.is(tok::numeric_constant)) {
       CurrentToken->SpacesRequiredBefore = 1;
       return Type;
@@ -754,11 +795,29 @@
 
     bool KeywordVirtualFound = false;
     bool ImportStatement = false;
+
+    // import {...} from '...';
+    if (Style.Language == FormatStyle::LK_JavaScript &&
+        CurrentToken->is(Keywords.kw_import))
+      ImportStatement = true;
+
     while (CurrentToken) {
       if (CurrentToken->is(tok::kw_virtual))
         KeywordVirtualFound = true;
-      if (isImportStatement(*CurrentToken))
-        ImportStatement = true;
+      if (Style.Language == FormatStyle::LK_JavaScript) {
+        // export {...} from '...';
+        // An export followed by "from 'some string';" is a re-export from
+        // another module identified by a URI and is treated as a
+        // LT_ImportStatement (i.e. prevent wraps on it for long URIs).
+        // Just "export {...};" or "export class ..." should not be treated as
+        // an import in this sense.
+        if (Line.First->is(tok::kw_export) &&
+            CurrentToken->is(Keywords.kw_from) && CurrentToken->Next &&
+            CurrentToken->Next->isStringLiteral())
+          ImportStatement = true;
+        if (isClosureImportStatement(*CurrentToken))
+          ImportStatement = true;
+      }
       if (!consumeToken())
         return LT_Invalid;
     }
@@ -778,15 +837,15 @@
   }
 
 private:
-  bool isImportStatement(const FormatToken &Tok) {
+  bool isClosureImportStatement(const FormatToken &Tok) {
     // FIXME: Closure-library specific stuff should not be hard-coded but be
     // configurable.
-    return Style.Language == FormatStyle::LK_JavaScript &&
-           Tok.TokenText == "goog" && Tok.Next && Tok.Next->is(tok::period) &&
+    return Tok.TokenText == "goog" && Tok.Next && Tok.Next->is(tok::period) &&
            Tok.Next->Next && (Tok.Next->Next->TokenText == "module" ||
                               Tok.Next->Next->TokenText == "provide" ||
                               Tok.Next->Next->TokenText == "require" ||
-                              Tok.Next->Next->TokenText == "setTestOnly") &&
+                              Tok.Next->Next->TokenText == "setTestOnly" ||
+                              Tok.Next->Next->TokenText == "forwardDeclare") &&
            Tok.Next->Next->Next && Tok.Next->Next->Next->is(tok::l_paren);
   }
 
@@ -862,6 +921,9 @@
   void modifyContext(const FormatToken &Current) {
     if (Current.getPrecedence() == prec::Assignment &&
         !Line.First->isOneOf(tok::kw_template, tok::kw_using, tok::kw_return) &&
+        // Type aliases use `type X = ...;` in TypeScript.
+        !(Style.Language == FormatStyle::LK_JavaScript &&
+          Line.startsWith(Keywords.kw_type, tok::identifier)) &&
         (!Current.Previous || Current.Previous->isNot(tok::kw_operator))) {
       Contexts.back().IsExpression = true;
       if (!Line.startsWith(TT_UnaryOperator)) {
@@ -891,6 +953,10 @@
       Contexts.back().IsExpression = false;
     } else if (Current.is(TT_LambdaArrow) || Current.is(Keywords.kw_assert)) {
       Contexts.back().IsExpression = Style.Language == FormatStyle::LK_Java;
+    } else if (Current.Previous &&
+               Current.Previous->is(TT_CtorInitializerColon)) {
+      Contexts.back().IsExpression = true;
+      Contexts.back().InCtorInitializer = true;
     } else if (Current.isOneOf(tok::r_paren, tok::greater, tok::comma)) {
       for (FormatToken *Previous = Current.Previous;
            Previous && Previous->isOneOf(tok::star, tok::amp);
@@ -898,10 +964,6 @@
         Previous->Type = TT_PointerOrReference;
       if (Line.MustBeDeclaration && !Contexts.front().InCtorInitializer)
         Contexts.back().IsExpression = false;
-    } else if (Current.Previous &&
-               Current.Previous->is(TT_CtorInitializerColon)) {
-      Contexts.back().IsExpression = true;
-      Contexts.back().InCtorInitializer = true;
     } else if (Current.is(tok::kw_new)) {
       Contexts.back().CanBeExpression = false;
     } else if (Current.isOneOf(tok::semi, tok::exclaim)) {
@@ -947,7 +1009,7 @@
       Current.Type = TT_UnaryOperator;
     } else if (Current.is(tok::question)) {
       if (Style.Language == FormatStyle::LK_JavaScript &&
-          Line.MustBeDeclaration) {
+          Line.MustBeDeclaration && !Contexts.back().IsExpression) {
         // In JavaScript, `interface X { foo?(): bar; }` is an optional method
         // on the interface, not a ternary expression.
         Current.Type = TT_JsTypeOptionalQuestion;
@@ -973,7 +1035,8 @@
         Current.Type = TT_CastRParen;
       if (Current.MatchingParen && Current.Next &&
           !Current.Next->isBinaryOperator() &&
-          !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace))
+          !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace,
+                                 tok::period, tok::arrow, tok::coloncolon))
         if (FormatToken *BeforeParen = Current.MatchingParen->Previous)
           if (BeforeParen->is(tok::identifier) &&
               BeforeParen->TokenText == BeforeParen->TokenText.upper() &&
@@ -1044,6 +1107,9 @@
 
     if (Tok.Previous->isOneOf(TT_LeadingJavaAnnotation, Keywords.kw_instanceof))
       return false;
+    if (Style.Language == FormatStyle::LK_JavaScript &&
+        Tok.Previous->is(Keywords.kw_in))
+      return false;
 
     // Skip "const" as it does not have an influence on whether this is a name.
     FormatToken *PreviousNotConst = Tok.Previous;
@@ -1087,7 +1153,7 @@
 
     FormatToken *LeftOfParens = Tok.MatchingParen->getPreviousNonComment();
     if (LeftOfParens) {
-      // If there is an opening parenthesis left of the current parentheses,
+      // If there is a closing parenthesis left of the current parentheses,
       // look past it as these might be chained casts.
       if (LeftOfParens->is(tok::r_paren)) {
         if (!LeftOfParens->MatchingParen ||
@@ -1106,7 +1172,7 @@
       // Certain other tokens right before the parentheses are also signals that
       // this cannot be a cast.
       if (LeftOfParens->isOneOf(tok::at, tok::r_square, TT_OverloadedOperator,
-                                TT_TemplateCloser))
+                                TT_TemplateCloser, tok::ellipsis))
         return false;
     }
 
@@ -1140,9 +1206,9 @@
     if (!LeftOfParens)
       return false;
 
-    // If the following token is an identifier, this is a cast. All cases where
-    // this can be something else are handled above.
-    if (Tok.Next->is(tok::identifier))
+    // If the following token is an identifier or 'this', this is a cast. All
+    // cases where this can be something else are handled above.
+    if (Tok.Next->isOneOf(tok::identifier, tok::kw_this))
       return true;
 
     if (!Tok.Next->Next)
@@ -1399,11 +1465,15 @@
            Style.Language == FormatStyle::LK_JavaScript) &&
           Current->is(Keywords.kw_instanceof))
         return prec::Relational;
+      if (Style.Language == FormatStyle::LK_JavaScript &&
+          Current->is(Keywords.kw_in))
+        return prec::Relational;
       if (Current->is(TT_BinaryOperator) || Current->is(tok::comma))
         return Current->getPrecedence();
       if (Current->isOneOf(tok::period, tok::arrow))
         return PrecedenceArrowAndPeriod;
-      if (Style.Language == FormatStyle::LK_Java &&
+      if ((Style.Language == FormatStyle::LK_Java ||
+           Style.Language == FormatStyle::LK_JavaScript) &&
           Current->isOneOf(Keywords.kw_extends, Keywords.kw_implements,
                            Keywords.kw_throws))
         return 0;
@@ -1517,7 +1587,8 @@
 
 // This function heuristically determines whether 'Current' starts the name of a
 // function declaration.
-static bool isFunctionDeclarationName(const FormatToken &Current) {
+static bool isFunctionDeclarationName(const FormatToken &Current,
+                                      const AnnotatedLine &Line) {
   auto skipOperatorName = [](const FormatToken* Next) -> const FormatToken* {
     for (; Next; Next = Next->Next) {
       if (Next->is(TT_OverloadedOperatorLParen))
@@ -1537,6 +1608,7 @@
     return nullptr;
   };
 
+  // Find parentheses of parameter list.
   const FormatToken *Next = Current.Next;
   if (Current.is(tok::kw_operator)) {
     if (Current.Previous && Current.Previous->is(tok::coloncolon))
@@ -1566,14 +1638,22 @@
     }
   }
 
-  if (!Next || !Next->is(tok::l_paren))
+  // Check whether parameter list can belong to a function declaration.
+  if (!Next || !Next->is(tok::l_paren) || !Next->MatchingParen)
     return false;
+  // If the line ends with "{", this is likely a function definition.
+  if (Line.Last->is(tok::l_brace))
+    return true;
   if (Next->Next == Next->MatchingParen)
+    return true; // Empty parentheses.
+  // If there is an &/&& after the r_paren, this is likely a function.
+  if (Next->MatchingParen->Next &&
+      Next->MatchingParen->Next->is(TT_PointerOrReference))
     return true;
   for (const FormatToken *Tok = Next->Next; Tok && Tok != Next->MatchingParen;
        Tok = Tok->Next) {
     if (Tok->is(tok::kw_const) || Tok->isSimpleTypeSpecifier() ||
-        Tok->isOneOf(TT_PointerOrReference, TT_StartOfName))
+        Tok->isOneOf(TT_PointerOrReference, TT_StartOfName, tok::ellipsis))
       return true;
     if (Tok->isOneOf(tok::l_brace, tok::string_literal, TT_ObjCMethodExpr) ||
         Tok->Tok.isLiteral())
@@ -1619,7 +1699,7 @@
   FormatToken *Current = Line.First->Next;
   bool InFunctionDecl = Line.MightBeFunctionDecl;
   while (Current) {
-    if (isFunctionDeclarationName(*Current))
+    if (isFunctionDeclarationName(*Current, Line))
       Current->Type = TT_FunctionDeclarationName;
     if (Current->is(TT_LineComment)) {
       if (Current->Previous->BlockKind == BK_BracedInit &&
@@ -1745,7 +1825,7 @@
     if (Style.Language == FormatStyle::LK_Proto)
       return 1;
     if (Left.is(tok::r_square))
-      return 25;
+      return 200;
     // Slightly prefer formatting local lambda definitions like functions.
     if (Right.is(TT_LambdaLSquare) && Left.is(tok::equal))
       return 35;
@@ -1777,6 +1857,8 @@
     return 500;
   if (Left.isOneOf(tok::kw_class, tok::kw_struct))
     return 5000;
+  if (Left.is(tok::comment))
+    return 1000;
 
   if (Left.isOneOf(TT_RangeBasedForLoopColon, TT_InheritanceColon))
     return 2;
@@ -1902,9 +1984,10 @@
   if (Right.isOneOf(tok::semi, tok::comma))
     return false;
   if (Right.is(tok::less) &&
-      (Left.is(tok::kw_template) ||
-       (Line.Type == LT_ObjCDecl && Style.ObjCSpaceBeforeProtocolList)))
+      Line.Type == LT_ObjCDecl && Style.ObjCSpaceBeforeProtocolList)
     return true;
+  if (Right.is(tok::less) && Left.is(tok::kw_template))
+    return Style.SpaceAfterTemplateKeyword;
   if (Left.isOneOf(tok::exclaim, tok::tilde))
     return false;
   if (Left.is(tok::at) &&
@@ -1919,15 +2002,14 @@
   if (Left.is(tok::less) || Right.isOneOf(tok::greater, tok::less))
     return false;
   if (Right.is(tok::ellipsis))
-    return Left.Tok.isLiteral();
+    return Left.Tok.isLiteral() || (Left.is(tok::identifier) && Left.Previous &&
+                                    Left.Previous->is(tok::kw_case));
   if (Left.is(tok::l_square) && Right.is(tok::amp))
     return false;
   if (Right.is(TT_PointerOrReference))
-    return (Left.is(tok::r_paren) && Left.MatchingParen &&
-            (Left.MatchingParen->is(TT_OverloadedOperatorLParen) ||
-             (Left.MatchingParen->Previous &&
-              Left.MatchingParen->Previous->is(TT_FunctionDeclarationName)))) ||
-           (Left.Tok.isLiteral() ||
+    return (Left.is(tok::r_paren) && Line.MightBeFunctionDecl) ||
+           (Left.Tok.isLiteral() || (Left.is(tok::kw_const) && Left.Previous &&
+                                     Left.Previous->is(tok::r_paren)) ||
             (!Left.isOneOf(TT_PointerOrReference, tok::l_paren) &&
              (Style.PointerAlignment != FormatStyle::PAS_Left ||
               Line.IsMultiVariableDeclStmt)));
@@ -2030,8 +2112,14 @@
         Left.isOneOf(Keywords.kw_returns, Keywords.kw_option))
       return true;
   } else if (Style.Language == FormatStyle::LK_JavaScript) {
-    if (Left.isOneOf(Keywords.kw_let, Keywords.kw_var, TT_JsFatArrow,
-                     Keywords.kw_in))
+    if (Left.is(TT_JsFatArrow))
+      return true;
+    if (Right.is(tok::star) &&
+        Left.isOneOf(Keywords.kw_function, Keywords.kw_yield))
+      return false;
+    if (Left.isOneOf(Keywords.kw_let, Keywords.kw_var, Keywords.kw_in,
+                     Keywords.kw_of, tok::kw_const) &&
+        (!Left.Previous || !Left.Previous->is(tok::period)))
       return true;
     if (Left.is(tok::kw_default) && Left.Previous &&
         Left.Previous->is(tok::kw_export))
@@ -2040,6 +2128,8 @@
       return true;
     if (Right.isOneOf(TT_JsTypeColon, TT_JsTypeOptionalQuestion))
       return false;
+    if (Left.is(TT_JsTypeOperator) || Right.is(TT_JsTypeOperator))
+      return false;
     if ((Left.is(tok::l_brace) || Right.is(tok::r_brace)) &&
         Line.First->isOneOf(Keywords.kw_import, tok::kw_export))
       return false;
@@ -2052,6 +2142,11 @@
       // locations that should have whitespace following are identified by the
       // above set of follower tokens.
       return false;
+    // Postfix non-null assertion operator, as in `foo!.bar()`.
+    if (Right.is(tok::exclaim) && (Left.isOneOf(tok::identifier, tok::r_paren,
+                                                tok::r_square, tok::r_brace) ||
+                                   Left.Tok.isLiteral()))
+      return false;
   } else if (Style.Language == FormatStyle::LK_Java) {
     if (Left.is(tok::r_square) && Right.is(tok::l_brace))
       return true;
@@ -2120,7 +2215,7 @@
   if (!Style.SpaceBeforeAssignmentOperators &&
       Right.getPrecedence() == prec::Assignment)
     return false;
-  if (Right.is(tok::coloncolon) && Left.isNot(tok::l_brace))
+  if (Right.is(tok::coloncolon) && !Left.isOneOf(tok::l_brace, tok::comment))
     return (Left.is(TT_TemplateOpener) &&
             Style.Standard == FormatStyle::LS_Cpp03) ||
            !(Left.isOneOf(tok::identifier, tok::l_paren, tok::r_paren,
@@ -2162,8 +2257,8 @@
 
   if (Style.Language == FormatStyle::LK_JavaScript) {
     // FIXME: This might apply to other languages and token kinds.
-    if (Right.is(tok::char_constant) && Left.is(tok::plus) && Left.Previous &&
-        Left.Previous->is(tok::char_constant))
+    if (Right.is(tok::string_literal) && Left.is(tok::plus) && Left.Previous &&
+        Left.Previous->is(tok::string_literal))
       return true;
     if (Left.is(TT_DictLiteral) && Left.is(tok::l_brace) && Line.Level == 0 &&
         Left.Previous && Left.Previous->is(tok::equal) &&
@@ -2249,9 +2344,6 @@
     return (Line.startsWith(tok::kw_enum) && Style.BraceWrapping.AfterEnum) ||
            (Line.startsWith(tok::kw_class) && Style.BraceWrapping.AfterClass) ||
            (Line.startsWith(tok::kw_struct) && Style.BraceWrapping.AfterStruct);
-  if (Style.Language == FormatStyle::LK_Proto && Left.isNot(tok::l_brace) &&
-      Right.is(TT_SelectorName))
-    return true;
   if (Left.is(TT_ObjCBlockLBrace) && !Style.AllowShortBlocksOnASingleLine)
     return true;
 
@@ -2286,6 +2378,12 @@
       return true;
     if (Right.NestingLevel == 0 && Right.is(Keywords.kw_is))
       return false;
+    if (Left.is(Keywords.kw_in))
+      return Style.BreakBeforeBinaryOperators == FormatStyle::BOS_None;
+    if (Right.is(Keywords.kw_in))
+      return Style.BreakBeforeBinaryOperators != FormatStyle::BOS_None;
+    if (Right.is(Keywords.kw_as))
+      return false; // must not break before as in 'x as type' casts
   }
 
   if (Left.is(tok::at))
@@ -2402,7 +2500,7 @@
        Left.getPrecedence() == prec::Assignment))
     return true;
   return Left.isOneOf(tok::comma, tok::coloncolon, tok::semi, tok::l_brace,
-                      tok::kw_class, tok::kw_struct) ||
+                      tok::kw_class, tok::kw_struct, tok::comment) ||
          Right.isMemberAccess() ||
          Right.isOneOf(TT_TrailingReturnArrow, TT_LambdaArrow, tok::lessless,
                        tok::colon, tok::l_square, tok::at) ||
diff --git a/lib/Format/TokenAnnotator.h b/lib/Format/TokenAnnotator.h
index 5329f1f..97daaf4 100644
--- a/lib/Format/TokenAnnotator.h
+++ b/lib/Format/TokenAnnotator.h
@@ -18,7 +18,6 @@
 
 #include "UnwrappedLineParser.h"
 #include "clang/Format/Format.h"
-#include <string>
 
 namespace clang {
 class SourceManager;
@@ -83,7 +82,15 @@
   /// \c true if this line starts with the given tokens in order, ignoring
   /// comments.
   template <typename... Ts> bool startsWith(Ts... Tokens) const {
-    return startsWith(First, Tokens...);
+    return First && First->startsSequence(Tokens...);
+  }
+
+  /// \c true if this line ends with the given tokens in reversed order,
+  /// ignoring comments.
+  /// For example, given tokens [T1, T2, T3, ...], the function returns true if
+  /// this line is like "... T3 T2 T1".
+  template <typename... Ts> bool endsWith(Ts... Tokens) const {
+    return Last && Last->endsSequence(Tokens...);
   }
 
   /// \c true if this line looks like a function definition instead of a
@@ -122,18 +129,6 @@
   // Disallow copying.
   AnnotatedLine(const AnnotatedLine &) = delete;
   void operator=(const AnnotatedLine &) = delete;
-
-  template <typename A, typename... Ts>
-  bool startsWith(FormatToken *Tok, A K1) const {
-    while (Tok && Tok->is(tok::comment))
-      Tok = Tok->Next;
-    return Tok && Tok->is(K1);
-  }
-
-  template <typename A, typename... Ts>
-  bool startsWith(FormatToken *Tok, A K1, Ts... Tokens) const {
-    return startsWith(Tok, K1) && startsWith(Tok->Next, Tokens...);
-  }
 };
 
 /// \brief Determines extra information about the tokens comprising an
diff --git a/lib/Format/UnwrappedLineFormatter.cpp b/lib/Format/UnwrappedLineFormatter.cpp
index f650569..07bfe3e 100644
--- a/lib/Format/UnwrappedLineFormatter.cpp
+++ b/lib/Format/UnwrappedLineFormatter.cpp
@@ -10,6 +10,7 @@
 #include "UnwrappedLineFormatter.h"
 #include "WhitespaceManager.h"
 #include "llvm/Support/Debug.h"
+#include <queue>
 
 #define DEBUG_TYPE "format-formatter"
 
@@ -847,7 +848,9 @@
       unsigned ColumnLimit = getColumnLimit(TheLine.InPPDirective, NextLine);
       bool FitsIntoOneLine =
           TheLine.Last->TotalLength + Indent <= ColumnLimit ||
-          TheLine.Type == LT_ImportStatement;
+          (TheLine.Type == LT_ImportStatement &&
+           (Style.Language != FormatStyle::LK_JavaScript ||
+            !Style.JavaScriptWrapImports));
 
       if (Style.ColumnLimit == 0)
         NoColumnLimitLineFormatter(Indenter, Whitespaces, Style, this)
@@ -863,7 +866,9 @@
       // If no token in the current line is affected, we still need to format
       // affected children.
       if (TheLine.ChildrenAffected)
-        format(TheLine.Children, DryRun);
+        for (const FormatToken *Tok = TheLine.First; Tok; Tok = Tok->Next)
+          if (!Tok->Children.empty())
+            format(Tok->Children, DryRun);
 
       // Adapt following lines on the current indent level to the same level
       // unless the current \c AnnotatedLine is not at the beginning of a line.
diff --git a/lib/Format/UnwrappedLineFormatter.h b/lib/Format/UnwrappedLineFormatter.h
index 478617d..7bcead9 100644
--- a/lib/Format/UnwrappedLineFormatter.h
+++ b/lib/Format/UnwrappedLineFormatter.h
@@ -19,8 +19,6 @@
 #include "ContinuationIndenter.h"
 #include "clang/Format/Format.h"
 #include <map>
-#include <queue>
-#include <string>
 
 namespace clang {
 namespace format {
diff --git a/lib/Format/UnwrappedLineParser.cpp b/lib/Format/UnwrappedLineParser.cpp
index 7b8f6e6..9f79ba6 100644
--- a/lib/Format/UnwrappedLineParser.cpp
+++ b/lib/Format/UnwrappedLineParser.cpp
@@ -363,6 +363,8 @@
           //
           // We exclude + and - as they can be ObjC visibility modifiers.
           ProbablyBracedList =
+              (Style.Language == FormatStyle::LK_JavaScript &&
+               NextTok->isOneOf(Keywords.kw_of, Keywords.kw_in)) ||
               NextTok->isOneOf(tok::comma, tok::period, tok::colon,
                                tok::r_paren, tok::r_square, tok::l_brace,
                                tok::l_square, tok::l_paren, tok::ellipsis) ||
@@ -428,6 +430,9 @@
     ++Line->Level;
   parseLevel(/*HasOpeningBrace=*/true);
 
+  if (eof())
+    return;
+
   if (MacroBlock ? !FormatTok->is(TT_MacroBlockEnd)
                  : !FormatTok->is(tok::r_brace)) {
     Line->Level = InitialLevel;
@@ -658,6 +663,85 @@
          Tok.isNot(tok::kw_noexcept);
 }
 
+static bool mustBeJSIdent(const AdditionalKeywords &Keywords,
+                          const FormatToken *FormatTok) {
+  // FIXME: This returns true for C/C++ keywords like 'struct'.
+  return FormatTok->is(tok::identifier) &&
+         (FormatTok->Tok.getIdentifierInfo() == nullptr ||
+          !FormatTok->isOneOf(Keywords.kw_in, Keywords.kw_of, Keywords.kw_as,
+                              Keywords.kw_async, Keywords.kw_await,
+                              Keywords.kw_yield, Keywords.kw_finally,
+                              Keywords.kw_function, Keywords.kw_import,
+                              Keywords.kw_is, Keywords.kw_let, Keywords.kw_var,
+                              Keywords.kw_abstract, Keywords.kw_extends,
+                              Keywords.kw_implements, Keywords.kw_instanceof,
+                              Keywords.kw_interface, Keywords.kw_throws));
+}
+
+static bool mustBeJSIdentOrValue(const AdditionalKeywords &Keywords,
+                                 const FormatToken *FormatTok) {
+  return FormatTok->Tok.isLiteral() || mustBeJSIdent(Keywords, FormatTok);
+}
+
+// isJSDeclOrStmt returns true if |FormatTok| starts a declaration or statement
+// when encountered after a value (see mustBeJSIdentOrValue).
+static bool isJSDeclOrStmt(const AdditionalKeywords &Keywords,
+                           const FormatToken *FormatTok) {
+  return FormatTok->isOneOf(
+      tok::kw_return, Keywords.kw_yield,
+      // conditionals
+      tok::kw_if, tok::kw_else,
+      // loops
+      tok::kw_for, tok::kw_while, tok::kw_do, tok::kw_continue, tok::kw_break,
+      // switch/case
+      tok::kw_switch, tok::kw_case,
+      // exceptions
+      tok::kw_throw, tok::kw_try, tok::kw_catch, Keywords.kw_finally,
+      // declaration
+      tok::kw_const, tok::kw_class, Keywords.kw_var, Keywords.kw_let,
+      Keywords.kw_async, Keywords.kw_function,
+      // import/export
+      Keywords.kw_import, tok::kw_export);
+}
+
+// readTokenWithJavaScriptASI reads the next token and terminates the current
+// line if JavaScript Automatic Semicolon Insertion must
+// happen between the current token and the next token.
+//
+// This method is conservative - it cannot cover all edge cases of JavaScript,
+// but only aims to correctly handle certain well known cases. It *must not*
+// terminate the current line in speculative cases.
+void UnwrappedLineParser::readTokenWithJavaScriptASI() {
+  FormatToken *Previous = FormatTok;
+  readToken();
+  FormatToken *Next = FormatTok;
+
+  bool IsOnSameLine =
+      CommentsBeforeNextToken.empty()
+          ? Next->NewlinesBefore == 0
+          : CommentsBeforeNextToken.front()->NewlinesBefore == 0;
+  if (IsOnSameLine)
+    return;
+
+  bool PreviousMustBeValue = mustBeJSIdentOrValue(Keywords, Previous);
+  if (PreviousMustBeValue && Line && Line->Tokens.size() > 1) {
+    // If the token before the previous one is an '@', the previous token is an
+    // annotation and can precede another identifier/value.
+    const FormatToken *PrePrevious = std::prev(Line->Tokens.end(), 2)->Tok;
+    if (PrePrevious->is(tok::at))
+      return;
+  }
+  if (Next->is(tok::exclaim) && PreviousMustBeValue)
+    addUnwrappedLine();
+  bool NextMustBeValue = mustBeJSIdentOrValue(Keywords, Next);
+  if (NextMustBeValue && (PreviousMustBeValue ||
+                          Previous->isOneOf(tok::r_square, tok::r_paren,
+                                            tok::plusplus, tok::minusminus)))
+    addUnwrappedLine();
+  if (PreviousMustBeValue && isJSDeclOrStmt(Keywords, Next))
+    addUnwrappedLine();
+}
+
 void UnwrappedLineParser::parseStructuralElement() {
   assert(!FormatTok->is(tok::l_brace));
   if (Style.Language == FormatStyle::LK_TableGen &&
@@ -798,10 +882,23 @@
                  /*MunchSemi=*/false);
       return;
     }
-    if (Style.Language == FormatStyle::LK_JavaScript &&
-        FormatTok->is(Keywords.kw_import)) {
-      parseJavaScriptEs6ImportExport();
-      return;
+    if (FormatTok->is(Keywords.kw_import)) {
+      if (Style.Language == FormatStyle::LK_JavaScript) {
+        parseJavaScriptEs6ImportExport();
+        return;
+      }
+      if (Style.Language == FormatStyle::LK_Proto) {
+        nextToken();
+        if (FormatTok->is(tok::kw_public))
+          nextToken();
+        if (!FormatTok->is(tok::string_literal))
+          return;
+        nextToken();
+        if (FormatTok->is(tok::semi))
+          nextToken();
+        addUnwrappedLine();
+        return;
+      }
     }
     if (FormatTok->isOneOf(Keywords.kw_signals, Keywords.kw_qsignals,
                            Keywords.kw_slots, Keywords.kw_qslots)) {
@@ -809,8 +906,8 @@
       if (FormatTok->is(tok::colon)) {
         nextToken();
         addUnwrappedLine();
+        return;
       }
-      return;
     }
     // In all other cases, parse the declaration.
     break;
@@ -818,6 +915,7 @@
     break;
   }
   do {
+    const FormatToken *Previous = getPreviousToken();
     switch (FormatTok->Tok.getKind()) {
     case tok::at:
       nextToken();
@@ -825,6 +923,12 @@
         parseBracedList();
       break;
     case tok::kw_enum:
+      // Ignore if this is part of "template <enum ...".
+      if (Previous && Previous->is(tok::less)) {
+        nextToken();
+        break;
+      }
+
       // parseEnum falls through and does not yet add an unwrapped line as an
       // enum definition can start a structural element.
       if (!parseEnum())
@@ -922,18 +1026,35 @@
       // Parse function literal unless 'function' is the first token in a line
       // in which case this should be treated as a free-standing function.
       if (Style.Language == FormatStyle::LK_JavaScript &&
-          FormatTok->is(Keywords.kw_function) && Line->Tokens.size() > 0) {
+          (FormatTok->is(Keywords.kw_function) ||
+           FormatTok->startsSequence(Keywords.kw_async,
+                                     Keywords.kw_function)) &&
+          Line->Tokens.size() > 0) {
         tryToParseJSFunction();
         break;
       }
       if ((Style.Language == FormatStyle::LK_JavaScript ||
            Style.Language == FormatStyle::LK_Java) &&
           FormatTok->is(Keywords.kw_interface)) {
+        if (Style.Language == FormatStyle::LK_JavaScript) {
+          // In JavaScript/TypeScript, "interface" can be used as a standalone
+          // identifier, e.g. in `var interface = 1;`. If "interface" is
+          // followed by another identifier, it is very likely to be an actual
+          // interface declaration.
+          unsigned StoredPosition = Tokens->getPosition();
+          FormatToken *Next = Tokens->getNextToken();
+          FormatTok = Tokens->setPosition(StoredPosition);
+          if (Next && !mustBeJSIdent(Keywords, Next)) {
+            nextToken();
+            break;
+          }
+        }
         parseRecord();
         addUnwrappedLine();
         return;
       }
 
+      // See if the following token should start a new unwrapped line.
       StringRef Text = FormatTok->TokenText;
       nextToken();
       if (Line->Tokens.size() == 1 &&
@@ -941,6 +1062,7 @@
           // not labels.
           Style.Language != FormatStyle::LK_JavaScript) {
         if (FormatTok->Tok.is(tok::colon) && !Line->MustBeDeclaration) {
+          Line->Tokens.begin()->Tok->MustBreakBefore = true;
           parseLabel();
           return;
         }
@@ -1093,8 +1215,17 @@
 }
 
 void UnwrappedLineParser::tryToParseJSFunction() {
+  assert(FormatTok->is(Keywords.kw_function) ||
+         FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function));
+  if (FormatTok->is(Keywords.kw_async))
+    nextToken();
+  // Consume "function".
   nextToken();
 
+  // Consume * (generator function).
+  if (FormatTok->is(tok::star))
+    nextToken();
+
   // Consume function name.
   if (FormatTok->is(tok::identifier))
     nextToken();
@@ -1139,7 +1270,8 @@
   // replace this by using parseAssigmentExpression() inside.
   do {
     if (Style.Language == FormatStyle::LK_JavaScript) {
-      if (FormatTok->is(Keywords.kw_function)) {
+      if (FormatTok->is(Keywords.kw_function) ||
+          FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function)) {
         tryToParseJSFunction();
         continue;
       }
@@ -1237,7 +1369,8 @@
       break;
     case tok::identifier:
       if (Style.Language == FormatStyle::LK_JavaScript &&
-          FormatTok->is(Keywords.kw_function))
+          (FormatTok->is(Keywords.kw_function) ||
+           FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function)))
         tryToParseJSFunction();
       else
         nextToken();
@@ -1315,6 +1448,8 @@
       addUnwrappedLine();
       ++Line->Level;
       parseStructuralElement();
+      if (FormatTok->is(tok::eof))
+        addUnwrappedLine();
       --Line->Level;
     }
   } else if (NeedsUnwrappedLine) {
@@ -1503,6 +1638,10 @@
     addUnwrappedLine();
   }
   Line->Level = OldLineLevel;
+  if (FormatTok->isNot(tok::l_brace)) {
+    parseStructuralElement();
+    addUnwrappedLine();
+  }
 }
 
 void UnwrappedLineParser::parseCaseLabel() {
@@ -1550,7 +1689,8 @@
   // In TypeScript, "enum" can also be used as property name, e.g. in interface
   // declarations. An "enum" keyword followed by a colon would be a syntax
   // error and thus assume it is just an identifier.
-  if (Style.Language == FormatStyle::LK_JavaScript && FormatTok->is(tok::colon))
+  if (Style.Language == FormatStyle::LK_JavaScript &&
+      FormatTok->isOneOf(tok::colon, tok::question))
     return false;
 
   // Eat up enum class ...
@@ -1795,28 +1935,31 @@
 }
 
 void UnwrappedLineParser::parseJavaScriptEs6ImportExport() {
-  assert(FormatTok->isOneOf(Keywords.kw_import, tok::kw_export));
+  bool IsImport = FormatTok->is(Keywords.kw_import);
+  assert(IsImport || FormatTok->is(tok::kw_export));
   nextToken();
 
   // Consume the "default" in "export default class/function".
   if (FormatTok->is(tok::kw_default))
     nextToken();
 
-  // Consume "function" and "default function", so that these get parsed as
-  // free-standing JS functions, i.e. do not require a trailing semicolon.
+  // Consume "async function", "function" and "default function", so that these
+  // get parsed as free-standing JS functions, i.e. do not require a trailing
+  // semicolon.
+  if (FormatTok->is(Keywords.kw_async))
+    nextToken();
   if (FormatTok->is(Keywords.kw_function)) {
     nextToken();
     return;
   }
 
-  // Consume the "abstract" in "export abstract class".
-  if (FormatTok->is(Keywords.kw_abstract))
-    nextToken();
-
-  if (FormatTok->isOneOf(tok::kw_const, tok::kw_class, tok::kw_enum,
-                         Keywords.kw_interface, Keywords.kw_let,
-                         Keywords.kw_var))
-    return; // Fall through to parsing the corresponding structure.
+  // For imports, `export *`, `export {...}`, consume the rest of the line up
+  // to the terminating `;`. For everything else, just return and continue
+  // parsing the structural element, i.e. the declaration or expression for
+  // `export default`.
+  if (!IsImport && !FormatTok->isOneOf(tok::l_brace, tok::star) &&
+      !FormatTok->isStringLiteral())
+    return;
 
   while (!eof() && FormatTok->isNot(tok::semi)) {
     if (FormatTok->is(tok::l_brace)) {
@@ -1895,7 +2038,10 @@
     return;
   flushComments(isOnNewLine(*FormatTok));
   pushToken(FormatTok);
-  readToken();
+  if (Style.Language != FormatStyle::LK_JavaScript)
+    readToken();
+  else
+    readTokenWithJavaScriptASI();
 }
 
 const FormatToken *UnwrappedLineParser::getPreviousToken() {
diff --git a/lib/Format/UnwrappedLineParser.h b/lib/Format/UnwrappedLineParser.h
index 6d40ab4..9c78d33 100644
--- a/lib/Format/UnwrappedLineParser.h
+++ b/lib/Format/UnwrappedLineParser.h
@@ -81,6 +81,7 @@
   void parsePPElse();
   void parsePPEndIf();
   void parsePPUnknown();
+  void readTokenWithJavaScriptASI();
   void parseStructuralElement();
   bool tryToParseBracedList();
   bool parseBracedList(bool ContinueOnSemicolons = false);
diff --git a/lib/Format/WhitespaceManager.cpp b/lib/Format/WhitespaceManager.cpp
index 0673dfb..8ca307b 100644
--- a/lib/Format/WhitespaceManager.cpp
+++ b/lib/Format/WhitespaceManager.cpp
@@ -502,8 +502,13 @@
   if (StringRef(SourceMgr.getCharacterData(Range.getBegin()),
                 WhitespaceLength) == Text)
     return;
-  Replaces.insert(tooling::Replacement(
+  auto Err = Replaces.add(tooling::Replacement(
       SourceMgr, CharSourceRange::getCharRange(Range), Text));
+  // FIXME: better error handling. For now, just print an error message in the
+  // release version.
+  if (Err)
+    llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+  assert(!Err);
 }
 
 void WhitespaceManager::appendNewlineText(std::string &Text,
@@ -558,6 +563,14 @@
     }
     Text.append(Spaces, ' ');
     break;
+  case FormatStyle::UT_ForContinuationAndIndentation:
+    if (WhitespaceStartColumn == 0) {
+      unsigned Tabs = Spaces / Style.TabWidth;
+      Text.append(Tabs, '\t');
+      Spaces -= Tabs * Style.TabWidth;
+    }
+    Text.append(Spaces, ' ');
+    break;
   }
 }
 
diff --git a/lib/Format/WhitespaceManager.h b/lib/Format/WhitespaceManager.h
index 9ca9db6..3562347 100644
--- a/lib/Format/WhitespaceManager.h
+++ b/lib/Format/WhitespaceManager.h
@@ -37,7 +37,7 @@
 /// There may be multiple calls to \c breakToken for a given token.
 class WhitespaceManager {
 public:
-  WhitespaceManager(SourceManager &SourceMgr, const FormatStyle &Style,
+  WhitespaceManager(const SourceManager &SourceMgr, const FormatStyle &Style,
                     bool UseCRLF)
       : SourceMgr(SourceMgr), Style(Style), UseCRLF(UseCRLF) {}
 
@@ -203,7 +203,7 @@
                         unsigned Spaces, unsigned WhitespaceStartColumn);
 
   SmallVector<Change, 16> Changes;
-  SourceManager &SourceMgr;
+  const SourceManager &SourceMgr;
   tooling::Replacements Replaces;
   const FormatStyle &Style;
   bool UseCRLF;
diff --git a/lib/Frontend/ASTConsumers.cpp b/lib/Frontend/ASTConsumers.cpp
index 52776b6..bd2ee06 100644
--- a/lib/Frontend/ASTConsumers.cpp
+++ b/lib/Frontend/ASTConsumers.cpp
@@ -19,7 +19,6 @@
 #include "clang/AST/RecordLayout.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Timer.h"
@@ -35,9 +34,9 @@
     typedef RecursiveASTVisitor<ASTPrinter> base;
 
   public:
-    ASTPrinter(raw_ostream *Out = nullptr, bool Dump = false,
+    ASTPrinter(std::unique_ptr<raw_ostream> Out = nullptr, bool Dump = false,
                StringRef FilterString = "", bool DumpLookups = false)
-        : Out(Out ? *Out : llvm::outs()), Dump(Dump),
+        : Out(Out ? *Out : llvm::outs()), OwnedOut(std::move(Out)), Dump(Dump),
           FilterString(FilterString), DumpLookups(DumpLookups) {}
 
     void HandleTranslationUnit(ASTContext &Context) override {
@@ -94,6 +93,7 @@
     }
 
     raw_ostream &Out;
+    std::unique_ptr<raw_ostream> OwnedOut;
     bool Dump;
     std::string FilterString;
     bool DumpLookups;
@@ -122,9 +122,11 @@
   };
 } // end anonymous namespace
 
-std::unique_ptr<ASTConsumer> clang::CreateASTPrinter(raw_ostream *Out,
-                                                     StringRef FilterString) {
-  return llvm::make_unique<ASTPrinter>(Out, /*Dump=*/false, FilterString);
+std::unique_ptr<ASTConsumer>
+clang::CreateASTPrinter(std::unique_ptr<raw_ostream> Out,
+                        StringRef FilterString) {
+  return llvm::make_unique<ASTPrinter>(std::move(Out), /*Dump=*/false,
+                                       FilterString);
 }
 
 std::unique_ptr<ASTConsumer> clang::CreateASTDumper(StringRef FilterString,
@@ -268,7 +270,7 @@
     // Print the parameters.
     Out << "(";
     bool PrintComma = false;
-    for (auto I : FD->params()) {
+    for (auto I : FD->parameters()) {
       if (PrintComma)
         Out << ", ";
       else
@@ -290,13 +292,12 @@
     // Print the parameters.
     Out << "(";
     bool PrintComma = false;
-    for (FunctionDecl::param_const_iterator I = D->param_begin(),
-           E = D->param_end(); I != E; ++I) {
+    for (ParmVarDecl *Parameter : D->parameters()) {
       if (PrintComma)
         Out << ", ";
       else
         PrintComma = true;
-      Out << **I;
+      Out << *Parameter;
     }
     Out << ")";
 
@@ -320,13 +321,12 @@
     // Print the parameters.
     Out << "(";
     bool PrintComma = false;
-    for (FunctionDecl::param_const_iterator I = D->param_begin(),
-           E = D->param_end(); I != E; ++I) {
+    for (ParmVarDecl *Parameter : D->parameters()) {
       if (PrintComma)
         Out << ", ";
       else
         PrintComma = true;
-      Out << **I;
+      Out << *Parameter;
     }
     Out << ")";
 
diff --git a/lib/Frontend/ASTUnit.cpp b/lib/Frontend/ASTUnit.cpp
index 9551e4c..aafbb48 100644
--- a/lib/Frontend/ASTUnit.cpp
+++ b/lib/Frontend/ASTUnit.cpp
@@ -41,7 +41,6 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <atomic>
@@ -920,17 +919,17 @@
   unsigned &Hash;
   std::vector<Decl *> TopLevelDecls;
   PrecompilePreambleAction *Action;
-  raw_ostream *Out;
+  std::unique_ptr<raw_ostream> Out;
 
 public:
   PrecompilePreambleConsumer(ASTUnit &Unit, PrecompilePreambleAction *Action,
                              const Preprocessor &PP, StringRef isysroot,
-                             raw_ostream *Out)
+                             std::unique_ptr<raw_ostream> Out)
       : PCHGenerator(PP, "", nullptr, isysroot, std::make_shared<PCHBuffer>(),
                      ArrayRef<llvm::IntrusiveRefCntPtr<ModuleFileExtension>>(),
                      /*AllowASTWithErrors=*/true),
         Unit(Unit), Hash(Unit.getCurrentTopLevelHashValue()), Action(Action),
-        Out(Out) {
+        Out(std::move(Out)) {
     Hash = 0;
   }
 
@@ -982,8 +981,9 @@
                                             StringRef InFile) {
   std::string Sysroot;
   std::string OutputFile;
-  raw_ostream *OS = GeneratePCHAction::ComputeASTConsumerArguments(
-      CI, InFile, Sysroot, OutputFile);
+  std::unique_ptr<raw_ostream> OS =
+      GeneratePCHAction::ComputeASTConsumerArguments(CI, InFile, Sysroot,
+                                                     OutputFile);
   if (!OS)
     return nullptr;
 
@@ -994,7 +994,7 @@
       llvm::make_unique<MacroDefinitionTrackerPPCallbacks>(
                                            Unit.getCurrentTopLevelHashValue()));
   return llvm::make_unique<PrecompilePreambleConsumer>(
-      Unit, this, CI.getPreprocessor(), Sysroot, OS);
+      Unit, this, CI.getPreprocessor(), Sysroot, std::move(OS));
 }
 
 static bool isNonDriverDiag(const StoredDiagnostic &StoredDiag) {
@@ -1040,7 +1040,7 @@
 
   // Create the compiler instance to use for building the AST.
   std::unique_ptr<CompilerInstance> Clang(
-      new CompilerInstance(PCHContainerOps));
+      new CompilerInstance(std::move(PCHContainerOps)));
 
   // Recover resources if we crash before exiting this method.
   llvm::CrashRecoveryContextCleanupRegistrar<CompilerInstance>
@@ -1138,11 +1138,9 @@
   if (!Act->BeginSourceFile(*Clang.get(), Clang->getFrontendOpts().Inputs[0]))
     goto error;
 
-  if (SavedMainFileBuffer) {
-    std::string ModName = getPreambleFile(this);
+  if (SavedMainFileBuffer)
     TranslateStoredDiagnostics(getFileManager(), getSourceManager(),
                                PreambleDiagnostics, StoredDiagnostics);
-  }
 
   if (!Act->Execute())
     goto error;
@@ -1380,7 +1378,7 @@
           
       // First, make a record of those files that have been overridden via
       // remapping or unsaved_files.
-      llvm::StringMap<PreambleFileHash> OverriddenFiles;
+      std::map<llvm::sys::fs::UniqueID, PreambleFileHash> OverriddenFiles;
       for (const auto &R : PreprocessorOpts.RemappedFiles) {
         if (AnyFileChanged)
           break;
@@ -1393,24 +1391,38 @@
           break;
         }
 
-        OverriddenFiles[R.first] = PreambleFileHash::createForFile(
+        OverriddenFiles[Status.getUniqueID()] = PreambleFileHash::createForFile(
             Status.getSize(), Status.getLastModificationTime().toEpochTime());
       }
 
       for (const auto &RB : PreprocessorOpts.RemappedFileBuffers) {
         if (AnyFileChanged)
           break;
-        OverriddenFiles[RB.first] =
+
+        vfs::Status Status;
+        if (FileMgr->getNoncachedStatValue(RB.first, Status)) {
+          AnyFileChanged = true;
+          break;
+        }
+
+        OverriddenFiles[Status.getUniqueID()] =
             PreambleFileHash::createForMemoryBuffer(RB.second);
       }
        
       // Check whether anything has changed.
-      for (llvm::StringMap<PreambleFileHash>::iterator 
+      for (llvm::StringMap<PreambleFileHash>::iterator
              F = FilesInPreamble.begin(), FEnd = FilesInPreamble.end();
            !AnyFileChanged && F != FEnd; 
            ++F) {
-        llvm::StringMap<PreambleFileHash>::iterator Overridden
-          = OverriddenFiles.find(F->first());
+        vfs::Status Status;
+        if (FileMgr->getNoncachedStatValue(F->first(), Status)) {
+          // If we can't stat the file, assume that something horrible happened.
+          AnyFileChanged = true;
+          break;
+        }
+
+        std::map<llvm::sys::fs::UniqueID, PreambleFileHash>::iterator Overridden
+          = OverriddenFiles.find(Status.getUniqueID());
         if (Overridden != OverriddenFiles.end()) {
           // This file was remapped; check whether the newly-mapped file 
           // matches up with the previous mapping.
@@ -1420,13 +1432,9 @@
         }
         
         // The file was not remapped; check whether it has changed on disk.
-        vfs::Status Status;
-        if (FileMgr->getNoncachedStatValue(F->first(), Status)) {
-          // If we can't stat the file, assume that something horrible happened.
-          AnyFileChanged = true;
-        } else if (Status.getSize() != uint64_t(F->second.Size) ||
-                   Status.getLastModificationTime().toEpochTime() !=
-                       uint64_t(F->second.ModTime))
+        if (Status.getSize() != uint64_t(F->second.Size) ||
+            Status.getLastModificationTime().toEpochTime() !=
+                uint64_t(F->second.ModTime))
           AnyFileChanged = true;
       }
           
@@ -1506,7 +1514,7 @@
   
   // Create the compiler instance to use for building the precompiled preamble.
   std::unique_ptr<CompilerInstance> Clang(
-      new CompilerInstance(PCHContainerOps));
+      new CompilerInstance(std::move(PCHContainerOps)));
 
   // Recover resources if we crash before exiting this method.
   llvm::CrashRecoveryContextCleanupRegistrar<CompilerInstance>
@@ -1768,7 +1776,7 @@
 
   // Create the compiler instance to use for building the AST.
   std::unique_ptr<CompilerInstance> Clang(
-      new CompilerInstance(PCHContainerOps));
+      new CompilerInstance(std::move(PCHContainerOps)));
 
   // Recover resources if we crash before exiting this method.
   llvm::CrashRecoveryContextCleanupRegistrar<CompilerInstance>
@@ -1888,7 +1896,7 @@
   llvm::CrashRecoveryContextCleanupRegistrar<llvm::MemoryBuffer>
     MemBufferCleanup(OverrideMainBuffer.get());
 
-  return Parse(PCHContainerOps, std::move(OverrideMainBuffer));
+  return Parse(std::move(PCHContainerOps), std::move(OverrideMainBuffer));
 }
 
 std::unique_ptr<ASTUnit> ASTUnit::LoadFromCompilerInvocation(
@@ -1921,7 +1929,7 @@
     llvm::CrashRecoveryContextReleaseRefCleanup<DiagnosticsEngine> >
     DiagCleanup(Diags.get());
 
-  if (AST->LoadFromCompilerInvocation(PCHContainerOps,
+  if (AST->LoadFromCompilerInvocation(std::move(PCHContainerOps),
                                       PrecompilePreambleAfterNParses))
     return nullptr;
   return AST;
@@ -2004,7 +2012,7 @@
   llvm::CrashRecoveryContextCleanupRegistrar<ASTUnit>
     ASTUnitCleanup(AST.get());
 
-  if (AST->LoadFromCompilerInvocation(PCHContainerOps,
+  if (AST->LoadFromCompilerInvocation(std::move(PCHContainerOps),
                                       PrecompilePreambleAfterNParses)) {
     // Some error occurred, if caller wants to examine diagnostics, pass it the
     // ASTUnit.
@@ -2054,7 +2062,8 @@
     getDiagnostics().setNumWarnings(NumWarningsInPreamble);
 
   // Parse the sources
-  bool Result = Parse(PCHContainerOps, std::move(OverrideMainBuffer));
+  bool Result =
+      Parse(std::move(PCHContainerOps), std::move(OverrideMainBuffer));
 
   // If we're caching global code-completion results, and the top-level 
   // declarations have changed, clear out the code-completion cache.
@@ -2796,7 +2805,6 @@
     switch (M.Kind) {
     case serialization::MK_ImplicitModule:
     case serialization::MK_ExplicitModule:
-    case serialization::MK_PrebuiltModule:
       return true; // skip dependencies.
     case serialization::MK_PCH:
       Mod = &M;
@@ -2816,7 +2824,7 @@
 }
 
 bool ASTUnit::isModuleFile() {
-  return isMainFileAST() && !ASTFileLangOpts.CurrentModule.empty();
+  return isMainFileAST() && ASTFileLangOpts.CompilingModule;
 }
 
 void ASTUnit::PreambleData::countLines() const {
diff --git a/lib/Frontend/CMakeLists.txt b/lib/Frontend/CMakeLists.txt
index 4768120..18abecd 100644
--- a/lib/Frontend/CMakeLists.txt
+++ b/lib/Frontend/CMakeLists.txt
@@ -3,9 +3,15 @@
 set(LLVM_LINK_COMPONENTS
   BitReader
   Option
+  ProfileData
   Support
   )
 
+set(optional_deps intrinsics_gen)
+if (CLANG_BUILT_STANDALONE)
+  set(optional_deps)
+endif()
+
 add_clang_library(clangFrontend
   ASTConsumers.cpp
   ASTMerge.cpp
@@ -43,7 +49,7 @@
 
   DEPENDS
   ClangDriverOptions
-  intrinsics_gen
+  ${optional_deps}
 
   LINK_LIBS
   clangAST
diff --git a/lib/Frontend/CacheTokens.cpp b/lib/Frontend/CacheTokens.cpp
index 87f3d17..1d24f12 100644
--- a/lib/Frontend/CacheTokens.cpp
+++ b/lib/Frontend/CacheTokens.cpp
@@ -12,12 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/Utils.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/FileSystemStatCache.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Frontend/Utils.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/PTHManager.h"
 #include "clang/Lex/Preprocessor.h"
@@ -28,7 +28,6 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/OnDiskHashTable.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
 
 // FIXME: put this somewhere else?
 #ifndef S_ISDIR
@@ -241,7 +240,7 @@
       : Out(out), PP(pp), idcount(0), CurStrOffset(0) {}
 
   PTHMap &getPM() { return PM; }
-  void GeneratePTH(const std::string &MainFile);
+  void GeneratePTH(StringRef MainFile);
 };
 } // end anonymous namespace
 
@@ -479,7 +478,7 @@
   Off += 4;
 }
 
-void PTHWriter::GeneratePTH(const std::string &MainFile) {
+void PTHWriter::GeneratePTH(StringRef MainFile) {
   // Generate the prologue.
   Out << "cfe-pth" << '\0';
   Emit32(PTHManager::Version);
diff --git a/lib/Frontend/ChainedIncludesSource.cpp b/lib/Frontend/ChainedIncludesSource.cpp
index 51771bf..7687b24 100644
--- a/lib/Frontend/ChainedIncludesSource.cpp
+++ b/lib/Frontend/ChainedIncludesSource.cpp
@@ -17,7 +17,9 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Parse/ParseAST.h"
+#include "clang/Sema/MultiplexExternalSemaSource.h"
 #include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/ASTWriter.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -25,51 +27,52 @@
 using namespace clang;
 
 namespace {
-class ChainedIncludesSource : public ExternalSemaSource {
+class ChainedIncludesSourceImpl : public ExternalSemaSource {
 public:
-  ~ChainedIncludesSource() override;
-
-  ExternalSemaSource &getFinalReader() const { return *FinalReader; }
-
-  std::vector<CompilerInstance *> CIs;
-  IntrusiveRefCntPtr<ExternalSemaSource> FinalReader;
+  ChainedIncludesSourceImpl(std::vector<std::unique_ptr<CompilerInstance>> CIs)
+      : CIs(std::move(CIs)) {}
 
 protected:
   //===----------------------------------------------------------------------===//
   // ExternalASTSource interface.
   //===----------------------------------------------------------------------===//
 
-  Decl *GetExternalDecl(uint32_t ID) override;
-  Selector GetExternalSelector(uint32_t ID) override;
-  uint32_t GetNumExternalSelectors() override;
-  Stmt *GetExternalDeclStmt(uint64_t Offset) override;
-  CXXCtorInitializer **GetExternalCXXCtorInitializers(uint64_t Offset) override;
-  CXXBaseSpecifier *GetExternalCXXBaseSpecifiers(uint64_t Offset) override;
-  bool FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                      DeclarationName Name) override;
-  void
-  FindExternalLexicalDecls(const DeclContext *DC,
-                           llvm::function_ref<bool(Decl::Kind)> IsKindWeWant,
-                           SmallVectorImpl<Decl *> &Result) override;
-  void CompleteType(TagDecl *Tag) override;
-  void CompleteType(ObjCInterfaceDecl *Class) override;
-  void StartedDeserializing() override;
-  void FinishedDeserializing() override;
-  void StartTranslationUnit(ASTConsumer *Consumer) override;
-  void PrintStats() override;
-
   /// Return the amount of memory used by memory buffers, breaking down
   /// by heap-backed versus mmap'ed memory.
-  void getMemoryBufferSizes(MemoryBufferSizes &sizes) const override;
+  void getMemoryBufferSizes(MemoryBufferSizes &sizes) const override {
+    for (unsigned i = 0, e = CIs.size(); i != e; ++i) {
+      if (const ExternalASTSource *eSrc =
+          CIs[i]->getASTContext().getExternalSource()) {
+        eSrc->getMemoryBufferSizes(sizes);
+      }
+    }
+  }
 
-  //===----------------------------------------------------------------------===//
-  // ExternalSemaSource interface.
-  //===----------------------------------------------------------------------===//
+private:
+  std::vector<std::unique_ptr<CompilerInstance>> CIs;
+};
 
-  void InitializeSema(Sema &S) override;
-  void ForgetSema() override;
-  void ReadMethodPool(Selector Sel) override;
-  bool LookupUnqualified(LookupResult &R, Scope *S) override;
+/// Members of ChainedIncludesSource, factored out so we can initialize
+/// them before we initialize the ExternalSemaSource base class.
+struct ChainedIncludesSourceMembers {
+  ChainedIncludesSourceMembers(
+      std::vector<std::unique_ptr<CompilerInstance>> CIs,
+      IntrusiveRefCntPtr<ExternalSemaSource> FinalReader)
+      : Impl(std::move(CIs)), FinalReader(std::move(FinalReader)) {}
+  ChainedIncludesSourceImpl Impl;
+  IntrusiveRefCntPtr<ExternalSemaSource> FinalReader;
+};
+
+/// Use MultiplexExternalSemaSource to dispatch all ExternalSemaSource
+/// calls to the final reader.
+class ChainedIncludesSource
+    : private ChainedIncludesSourceMembers,
+      public MultiplexExternalSemaSource {
+public:
+  ChainedIncludesSource(std::vector<std::unique_ptr<CompilerInstance>> CIs,
+                        IntrusiveRefCntPtr<ExternalSemaSource> FinalReader)
+      : ChainedIncludesSourceMembers(std::move(CIs), std::move(FinalReader)),
+        MultiplexExternalSemaSource(Impl, *this->FinalReader) {}
 };
 }
 
@@ -107,18 +110,13 @@
   return nullptr;
 }
 
-ChainedIncludesSource::~ChainedIncludesSource() {
-  for (unsigned i = 0, e = CIs.size(); i != e; ++i)
-    delete CIs[i];
-}
-
 IntrusiveRefCntPtr<ExternalSemaSource> clang::createChainedIncludesSource(
     CompilerInstance &CI, IntrusiveRefCntPtr<ExternalSemaSource> &Reader) {
 
   std::vector<std::string> &includes = CI.getPreprocessorOpts().ChainedIncludes;
   assert(!includes.empty() && "No '-chain-include' in options!");
 
-  IntrusiveRefCntPtr<ChainedIncludesSource> source(new ChainedIncludesSource());
+  std::vector<std::unique_ptr<CompilerInstance>> CIs;
   InputKind IK = CI.getFrontendOpts().Inputs[0].getKind();
 
   SmallVector<std::unique_ptr<llvm::MemoryBuffer>, 4> SerialBufs;
@@ -206,7 +204,7 @@
     SerialBufs.push_back(llvm::MemoryBuffer::getMemBufferCopy(
         StringRef(serialAST.data(), serialAST.size())));
     serialAST.clear();
-    source->CIs.push_back(Clang.release());
+    CIs.push_back(std::move(Clang));
   }
 
   assert(!SerialBufs.empty());
@@ -216,83 +214,6 @@
   if (!Reader)
     return nullptr;
 
-  source->FinalReader = Reader;
-  return source;
+  return IntrusiveRefCntPtr<ChainedIncludesSource>(
+      new ChainedIncludesSource(std::move(CIs), Reader));
 }
-
-//===----------------------------------------------------------------------===//
-// ExternalASTSource interface.
-//===----------------------------------------------------------------------===//
-
-Decl *ChainedIncludesSource::GetExternalDecl(uint32_t ID) {
-  return getFinalReader().GetExternalDecl(ID);
-}
-Selector ChainedIncludesSource::GetExternalSelector(uint32_t ID) {
-  return getFinalReader().GetExternalSelector(ID);
-}
-uint32_t ChainedIncludesSource::GetNumExternalSelectors() {
-  return getFinalReader().GetNumExternalSelectors();
-}
-Stmt *ChainedIncludesSource::GetExternalDeclStmt(uint64_t Offset) {
-  return getFinalReader().GetExternalDeclStmt(Offset);
-}
-CXXBaseSpecifier *
-ChainedIncludesSource::GetExternalCXXBaseSpecifiers(uint64_t Offset) {
-  return getFinalReader().GetExternalCXXBaseSpecifiers(Offset);
-}
-CXXCtorInitializer **
-ChainedIncludesSource::GetExternalCXXCtorInitializers(uint64_t Offset) {
-  return getFinalReader().GetExternalCXXCtorInitializers(Offset);
-}
-bool
-ChainedIncludesSource::FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                                      DeclarationName Name) {
-  return getFinalReader().FindExternalVisibleDeclsByName(DC, Name);
-}
-void ChainedIncludesSource::FindExternalLexicalDecls(
-    const DeclContext *DC, llvm::function_ref<bool(Decl::Kind)> IsKindWeWant,
-    SmallVectorImpl<Decl *> &Result) {
-  return getFinalReader().FindExternalLexicalDecls(DC, IsKindWeWant, Result);
-}
-void ChainedIncludesSource::CompleteType(TagDecl *Tag) {
-  return getFinalReader().CompleteType(Tag);
-}
-void ChainedIncludesSource::CompleteType(ObjCInterfaceDecl *Class) {
-  return getFinalReader().CompleteType(Class);
-}
-void ChainedIncludesSource::StartedDeserializing() {
-  return getFinalReader().StartedDeserializing();
-}
-void ChainedIncludesSource::FinishedDeserializing() {
-  return getFinalReader().FinishedDeserializing();
-}
-void ChainedIncludesSource::StartTranslationUnit(ASTConsumer *Consumer) {
-  return getFinalReader().StartTranslationUnit(Consumer);
-}
-void ChainedIncludesSource::PrintStats() {
-  return getFinalReader().PrintStats();
-}
-void ChainedIncludesSource::getMemoryBufferSizes(MemoryBufferSizes &sizes)const{
-  for (unsigned i = 0, e = CIs.size(); i != e; ++i) {
-    if (const ExternalASTSource *eSrc =
-        CIs[i]->getASTContext().getExternalSource()) {
-      eSrc->getMemoryBufferSizes(sizes);
-    }
-  }
-
-  getFinalReader().getMemoryBufferSizes(sizes);
-}
-
-void ChainedIncludesSource::InitializeSema(Sema &S) {
-  return getFinalReader().InitializeSema(S);
-}
-void ChainedIncludesSource::ForgetSema() {
-  return getFinalReader().ForgetSema();
-}
-void ChainedIncludesSource::ReadMethodPool(Selector Sel) {
-  getFinalReader().ReadMethodPool(Sel);
-}
-bool ChainedIncludesSource::LookupUnqualified(LookupResult &R, Scope *S) {
-  return getFinalReader().LookupUnqualified(R, S);
-}
-
diff --git a/lib/Frontend/CompilerInstance.cpp b/lib/Frontend/CompilerInstance.cpp
index 5df9cad..84cc43e 100644
--- a/lib/Frontend/CompilerInstance.cpp
+++ b/lib/Frontend/CompilerInstance.cpp
@@ -30,6 +30,7 @@
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/PTHManager.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Serialization/ASTReader.h"
@@ -49,6 +50,7 @@
 #include <sys/stat.h>
 #include <system_error>
 #include <time.h>
+#include <utility>
 
 using namespace clang;
 
@@ -56,7 +58,8 @@
     std::shared_ptr<PCHContainerOperations> PCHContainerOps,
     bool BuildingModule)
     : ModuleLoader(BuildingModule), Invocation(new CompilerInvocation()),
-      ModuleManager(nullptr), ThePCHContainerOperations(PCHContainerOps),
+      ModuleManager(nullptr),
+      ThePCHContainerOperations(std::move(PCHContainerOps)),
       BuildGlobalModuleIndex(false), HaveFullGlobalModuleIndex(false),
       ModuleBuildFailed(false) {}
 
@@ -126,7 +129,7 @@
   return ModuleManager;
 }
 void CompilerInstance::setModuleManager(IntrusiveRefCntPtr<ASTReader> Reader) {
-  ModuleManager = Reader;
+  ModuleManager = std::move(Reader);
 }
 
 std::shared_ptr<ModuleDependencyCollector>
@@ -136,7 +139,7 @@
 
 void CompilerInstance::setModuleDepCollector(
     std::shared_ptr<ModuleDependencyCollector> Collector) {
-  ModuleDepCollector = Collector;
+  ModuleDepCollector = std::move(Collector);
 }
 
 // Diagnostics
@@ -365,19 +368,19 @@
 
   // Handle generating header include information, if requested.
   if (DepOpts.ShowHeaderIncludes)
-    AttachHeaderIncludeGen(*PP, DepOpts.ExtraDeps);
+    AttachHeaderIncludeGen(*PP, DepOpts);
   if (!DepOpts.HeaderIncludeOutputFile.empty()) {
     StringRef OutputPath = DepOpts.HeaderIncludeOutputFile;
     if (OutputPath == "-")
       OutputPath = "";
-    AttachHeaderIncludeGen(*PP, DepOpts.ExtraDeps,
+    AttachHeaderIncludeGen(*PP, DepOpts,
                            /*ShowAllHeaders=*/true, OutputPath,
                            /*ShowDepth=*/false);
   }
 
   if (DepOpts.PrintShowIncludes) {
-    AttachHeaderIncludeGen(*PP, DepOpts.ExtraDeps,
-                           /*ShowAllHeaders=*/false, /*OutputPath=*/"",
+    AttachHeaderIncludeGen(*PP, DepOpts,
+                           /*ShowAllHeaders=*/true, /*OutputPath=*/"",
                            /*ShowDepth=*/true, /*MSStyle=*/true);
   }
 }
@@ -472,7 +475,7 @@
 // Code Completion
 
 static bool EnableCodeCompletion(Preprocessor &PP,
-                                 const std::string &Filename,
+                                 StringRef Filename,
                                  unsigned Line,
                                  unsigned Column) {
   // Tell the source manager to chop off the given file at a specific
@@ -559,15 +562,11 @@
 // Output Files
 
 void CompilerInstance::addOutputFile(OutputFile &&OutFile) {
-  assert(OutFile.OS && "Attempt to add empty stream to output list!");
   OutputFiles.push_back(std::move(OutFile));
 }
 
 void CompilerInstance::clearOutputFiles(bool EraseFiles) {
   for (OutputFile &OF : OutputFiles) {
-    // Manually close the stream before we rename it.
-    OF.OS.reset();
-
     if (!OF.TempFilename.empty()) {
       if (EraseFiles) {
         llvm::sys::fs::remove(OF.TempFilename);
@@ -587,13 +586,12 @@
       }
     } else if (!OF.Filename.empty() && EraseFiles)
       llvm::sys::fs::remove(OF.Filename);
-
   }
   OutputFiles.clear();
   NonSeekStream.reset();
 }
 
-raw_pwrite_stream *
+std::unique_ptr<raw_pwrite_stream>
 CompilerInstance::createDefaultOutputFile(bool Binary, StringRef InFile,
                                           StringRef Extension) {
   return createOutputFile(getFrontendOpts().OutputFile, Binary,
@@ -601,14 +599,11 @@
                           /*UseTemporary=*/true);
 }
 
-llvm::raw_null_ostream *CompilerInstance::createNullOutputFile() {
-  auto OS = llvm::make_unique<llvm::raw_null_ostream>();
-  llvm::raw_null_ostream *Ret = OS.get();
-  addOutputFile(OutputFile("", "", std::move(OS)));
-  return Ret;
+std::unique_ptr<raw_pwrite_stream> CompilerInstance::createNullOutputFile() {
+  return llvm::make_unique<llvm::raw_null_ostream>();
 }
 
-raw_pwrite_stream *
+std::unique_ptr<raw_pwrite_stream>
 CompilerInstance::createOutputFile(StringRef OutputPath, bool Binary,
                                    bool RemoveFileOnSignal, StringRef InFile,
                                    StringRef Extension, bool UseTemporary,
@@ -624,13 +619,12 @@
     return nullptr;
   }
 
-  raw_pwrite_stream *Ret = OS.get();
   // Add the output file -- but don't try to remove "-", since this means we are
   // using stdin.
-  addOutputFile(OutputFile((OutputPathName != "-") ? OutputPathName : "",
-                           TempPathName, std::move(OS)));
+  addOutputFile(
+      OutputFile((OutputPathName != "-") ? OutputPathName : "", TempPathName));
 
-  return Ret;
+  return OS;
 }
 
 std::unique_ptr<llvm::raw_pwrite_stream> CompilerInstance::createOutputFile(
@@ -735,16 +729,17 @@
 // Initialization Utilities
 
 bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){
-  return InitializeSourceManager(Input, getDiagnostics(),
-                                 getFileManager(), getSourceManager(), 
-                                 getFrontendOpts());
+  return InitializeSourceManager(
+      Input, getDiagnostics(), getFileManager(), getSourceManager(),
+      hasPreprocessor() ? &getPreprocessor().getHeaderSearchInfo() : nullptr,
+      getDependencyOutputOpts(), getFrontendOpts());
 }
 
-bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input,
-                                               DiagnosticsEngine &Diags,
-                                               FileManager &FileMgr,
-                                               SourceManager &SourceMgr,
-                                               const FrontendOptions &Opts) {
+// static
+bool CompilerInstance::InitializeSourceManager(
+    const FrontendInputFile &Input, DiagnosticsEngine &Diags,
+    FileManager &FileMgr, SourceManager &SourceMgr, HeaderSearch *HS,
+    DependencyOutputOptions &DepOpts, const FrontendOptions &Opts) {
   SrcMgr::CharacteristicKind
     Kind = Input.isSystem() ? SrcMgr::C_System : SrcMgr::C_User;
 
@@ -760,7 +755,35 @@
 
   // Figure out where to get and map in the main file.
   if (InputFile != "-") {
-    const FileEntry *File = FileMgr.getFile(InputFile, /*OpenFile=*/true);
+    const FileEntry *File;
+    if (Opts.FindPchSource.empty()) {
+      File = FileMgr.getFile(InputFile, /*OpenFile=*/true);
+    } else {
+      // When building a pch file in clang-cl mode, the .h file is built as if
+      // it was included by a cc file.  Since the driver doesn't know about
+      // all include search directories, the frontend must search the input
+      // file through HeaderSearch here, as if it had been included by the
+      // cc file at Opts.FindPchSource.
+      const FileEntry *FindFile = FileMgr.getFile(Opts.FindPchSource);
+      if (!FindFile) {
+        Diags.Report(diag::err_fe_error_reading) << Opts.FindPchSource;
+        return false;
+      }
+      const DirectoryLookup *UnusedCurDir;
+      SmallVector<std::pair<const FileEntry *, const DirectoryEntry *>, 16>
+          Includers;
+      Includers.push_back(std::make_pair(FindFile, FindFile->getDir()));
+      File = HS->LookupFile(InputFile, SourceLocation(), /*isAngled=*/false,
+                            /*FromDir=*/nullptr,
+                            /*CurDir=*/UnusedCurDir, Includers,
+                            /*SearchPath=*/nullptr,
+                            /*RelativePath=*/nullptr,
+                            /*RequestingModule=*/nullptr,
+                            /*SuggestedModule=*/nullptr, /*SkipCache=*/true);
+      // Also add the header to /showIncludes output.
+      if (File)
+        DepOpts.ShowIncludesPretendHeader = File->getName();
+    }
     if (!File) {
       Diags.Report(diag::err_fe_error_reading) << InputFile;
       return false;
@@ -826,8 +849,9 @@
 
   // Create TargetInfo for the other side of CUDA compilation.
   if (getLangOpts().CUDA && !getFrontendOpts().AuxTriple.empty()) {
-    std::shared_ptr<TargetOptions> TO(new TargetOptions);
+    auto TO = std::make_shared<TargetOptions>();
     TO->Triple = getFrontendOpts().AuxTriple;
+    TO->HostTriple = getTarget().getTriple().str();
     setAuxTarget(TargetInfo::CreateTargetInfo(getDiagnostics(), TO));
   }
 
@@ -837,6 +861,9 @@
   // created. This complexity should be lifted elsewhere.
   getTarget().adjust(getLangOpts());
 
+  // Adjust target options based on codegen options.
+  getTarget().adjustTargetOptions(getCodeGenOpts(), getTargetOpts());
+
   // rewriter project will change target built-in bool type from its default. 
   if (getFrontendOpts().ProgramAction == frontend::RewriteObjC)
     getTarget().noSignedCharForObjCBool();
@@ -1407,8 +1434,7 @@
   // when both the preprocessor and parser see the same import declaration.
   if (ImportLoc.isValid() && LastModuleImportLoc == ImportLoc) {
     // Make the named module visible.
-    if (LastModuleImportResult && ModuleName != getLangOpts().CurrentModule &&
-        ModuleName != getLangOpts().ImplementationOfModule)
+    if (LastModuleImportResult && ModuleName != getLangOpts().CurrentModule)
       ModuleManager->makeModuleVisible(LastModuleImportResult, Visibility,
                                        ImportLoc);
     return LastModuleImportResult;
@@ -1422,33 +1448,14 @@
   if (Known != KnownModules.end()) {
     // Retrieve the cached top-level module.
     Module = Known->second;    
-  } else if (ModuleName == getLangOpts().CurrentModule ||
-             ModuleName == getLangOpts().ImplementationOfModule) {
+  } else if (ModuleName == getLangOpts().CurrentModule) {
     // This is the module we're building. 
     Module = PP->getHeaderSearchInfo().lookupModule(ModuleName);
     Known = KnownModules.insert(std::make_pair(Path[0].first, Module)).first;
   } else {
     // Search for a module with the given name.
     Module = PP->getHeaderSearchInfo().lookupModule(ModuleName);
-    HeaderSearchOptions &HSOpts =
-        PP->getHeaderSearchInfo().getHeaderSearchOpts();
-
-    std::string ModuleFileName;
-    bool LoadFromPrebuiltModulePath = false;
-    // We try to load the module from the prebuilt module paths. If not
-    // successful, we then try to find it in the module cache.
-    if (!HSOpts.PrebuiltModulePaths.empty()) {
-      // Load the module from the prebuilt module path.
-      ModuleFileName = PP->getHeaderSearchInfo().getModuleFileName(
-          ModuleName, "", /*UsePrebuiltPath*/ true);
-      if (!ModuleFileName.empty())
-        LoadFromPrebuiltModulePath = true;
-    }
-    if (!LoadFromPrebuiltModulePath && Module) {
-      // Load the module from the module cache.
-      ModuleFileName = PP->getHeaderSearchInfo().getModuleFileName(Module);
-    } else if (!LoadFromPrebuiltModulePath) {
-      // We can't find a module, error out here.
+    if (!Module) {
       getDiagnostics().Report(ModuleNameLoc, diag::err_module_not_found)
       << ModuleName
       << SourceRange(ImportLoc, ModuleNameLoc);
@@ -1456,8 +1463,10 @@
       return ModuleLoadResult();
     }
 
+    std::string ModuleFileName =
+        PP->getHeaderSearchInfo().getModuleFileName(Module);
     if (ModuleFileName.empty()) {
-      if (Module && Module->HasIncompatibleModuleFile) {
+      if (Module->HasIncompatibleModuleFile) {
         // We tried and failed to load a module file for this module. Fall
         // back to textual inclusion for its headers.
         return ModuleLoadResult(nullptr, /*missingExpected*/true);
@@ -1478,46 +1487,16 @@
       Timer.init("Loading " + ModuleFileName, *FrontendTimerGroup);
     llvm::TimeRegion TimeLoading(FrontendTimerGroup ? &Timer : nullptr);
 
-    // Try to load the module file. If we are trying to load from the prebuilt
-    // module path, we don't have the module map files and don't know how to
-    // rebuild modules.
-    unsigned ARRFlags = LoadFromPrebuiltModulePath ?
-                        ASTReader::ARR_ConfigurationMismatch :
-                        ASTReader::ARR_OutOfDate | ASTReader::ARR_Missing;
+    // Try to load the module file.
+    unsigned ARRFlags = ASTReader::ARR_OutOfDate | ASTReader::ARR_Missing;
     switch (ModuleManager->ReadAST(ModuleFileName,
-                                   LoadFromPrebuiltModulePath ?
-                                   serialization::MK_PrebuiltModule :
                                    serialization::MK_ImplicitModule,
-                                   ImportLoc,
-                                   ARRFlags)) {
-    case ASTReader::Success: {
-      if (LoadFromPrebuiltModulePath && !Module) {
-        Module = PP->getHeaderSearchInfo().lookupModule(ModuleName);
-        if (!Module || !Module->getASTFile() ||
-            FileMgr->getFile(ModuleFileName) != Module->getASTFile()) {
-          // Error out if Module does not refer to the file in the prebuilt
-          // module path.
-          getDiagnostics().Report(ModuleNameLoc, diag::err_module_prebuilt)
-              << ModuleName;
-          ModuleBuildFailed = true;
-          KnownModules[Path[0].first] = nullptr;
-          return ModuleLoadResult();
-        }
-      }
+                                   ImportLoc, ARRFlags)) {
+    case ASTReader::Success:
       break;
-    }
 
     case ASTReader::OutOfDate:
     case ASTReader::Missing: {
-      if (LoadFromPrebuiltModulePath) {
-        // We can't rebuild the module without a module map. Since ReadAST
-        // already produces diagnostics for these two cases, we simply
-        // error out here.
-        ModuleBuildFailed = true;
-        KnownModules[Path[0].first] = nullptr;
-        return ModuleLoadResult();
-      }
-
       // The module file is missing or out-of-date. Build it.
       assert(Module && "missing module file");
       // Check whether there is a cycle in the module graph.
@@ -1568,13 +1547,8 @@
       break;
     }
 
-    case ASTReader::ConfigurationMismatch:
-      if (LoadFromPrebuiltModulePath)
-        getDiagnostics().Report(SourceLocation(),
-                                diag::warn_module_config_mismatch)
-            << ModuleFileName;
-      // Fall through to error out.
     case ASTReader::VersionMismatch:
+    case ASTReader::ConfigurationMismatch:
     case ASTReader::HadErrors:
       ModuleLoader::HadFatalFailure = true;
       // FIXME: The ASTReader will already have complained, but can we shoehorn
@@ -1652,10 +1626,6 @@
     }
   }
 
-  // Don't make the module visible if we are in the implementation.
-  if (ModuleName == getLangOpts().ImplementationOfModule)
-    return ModuleLoadResult(Module, false);
-  
   // Make the named module visible, if it's not already part of the module
   // we are parsing.
   if (ModuleName != getLangOpts().CurrentModule) {
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index 8b0bb9c..974302f 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Frontend/CompilerInvocation.h"
 #include "TestModuleFileExtension.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/FileManager.h"
@@ -15,11 +16,11 @@
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/Util.h"
-#include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/LangStandard.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearchOptions.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/ModuleFileExtension.h"
 #include "llvm/ADT/Hashing.h"
@@ -33,6 +34,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
+#include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
@@ -40,6 +42,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/ScopedPrinter.h"
 #include <atomic>
 #include <memory>
 #include <sys/stat.h>
@@ -235,6 +238,7 @@
   }
 
   Opts.ShowCheckerHelp = Args.hasArg(OPT_analyzer_checker_help);
+  Opts.ShowEnabledCheckerList = Args.hasArg(OPT_analyzer_list_enabled_checkers);
   Opts.DisableAllChecks = Args.hasArg(OPT_analyzer_disable_all_checks);
 
   Opts.visualizeExplodedGraphWithGraphViz =
@@ -375,6 +379,46 @@
   }
 }
 
+// Set the profile kind for fprofile-instrument.
+static void setPGOInstrumentor(CodeGenOptions &Opts, ArgList &Args,
+                               DiagnosticsEngine &Diags) {
+  Arg *A = Args.getLastArg(OPT_fprofile_instrument_EQ);
+  if (A == nullptr)
+    return;
+  StringRef S = A->getValue();
+  unsigned I = llvm::StringSwitch<unsigned>(S)
+                   .Case("none", CodeGenOptions::ProfileNone)
+                   .Case("clang", CodeGenOptions::ProfileClangInstr)
+                   .Case("llvm", CodeGenOptions::ProfileIRInstr)
+                   .Default(~0U);
+  if (I == ~0U) {
+    Diags.Report(diag::err_drv_invalid_pgo_instrumentor) << A->getAsString(Args)
+                                                         << S;
+    return;
+  }
+  CodeGenOptions::ProfileInstrKind Instrumentor =
+      static_cast<CodeGenOptions::ProfileInstrKind>(I);
+  Opts.setProfileInstr(Instrumentor);
+}
+
+// Set the profile kind using fprofile-instrument-use-path.
+static void setPGOUseInstrumentor(CodeGenOptions &Opts,
+                                  const Twine &ProfileName) {
+  auto ReaderOrErr = llvm::IndexedInstrProfReader::create(ProfileName);
+  // On error, default to Clang instrumentation; PGOUse reports the error later.
+  if (auto E = ReaderOrErr.takeError()) {
+    llvm::consumeError(std::move(E));
+    Opts.setProfileUse(CodeGenOptions::ProfileClangInstr);
+    return;
+  }
+  std::unique_ptr<llvm::IndexedInstrProfReader> PGOReader =
+    std::move(ReaderOrErr.get());
+  if (PGOReader->isIRLevelProfile())
+    Opts.setProfileUse(CodeGenOptions::ProfileIRInstr);
+  else
+    Opts.setProfileUse(CodeGenOptions::ProfileClangInstr);
+}
+
 static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
                              DiagnosticsEngine &Diags,
                              const TargetOptions &TargetOpts) {
@@ -400,13 +444,24 @@
                                  : CodeGenOptions::OnlyAlwaysInlining);
   // -fno-inline-functions overrides OptimizationLevel > 1.
   Opts.NoInline = Args.hasArg(OPT_fno_inline);
-  Opts.setInlining(Args.hasArg(OPT_fno_inline_functions) ?
-                     CodeGenOptions::OnlyAlwaysInlining : Opts.getInlining());
+  if (Arg* InlineArg = Args.getLastArg(options::OPT_finline_functions,
+                                       options::OPT_finline_hint_functions,
+                                       options::OPT_fno_inline_functions)) {
+    const Option& InlineOpt = InlineArg->getOption();
+    if (InlineOpt.matches(options::OPT_finline_functions))
+      Opts.setInlining(CodeGenOptions::NormalInlining);
+    else if (InlineOpt.matches(options::OPT_finline_hint_functions))
+      Opts.setInlining(CodeGenOptions::OnlyHintInlining);
+    else
+      Opts.setInlining(CodeGenOptions::OnlyAlwaysInlining);
+  }
 
   if (Arg *A = Args.getLastArg(OPT_fveclib)) {
     StringRef Name = A->getValue();
     if (Name == "Accelerate")
       Opts.setVecLib(CodeGenOptions::Accelerate);
+    else if (Name == "SVML")
+      Opts.setVecLib(CodeGenOptions::SVML);
     else if (Name == "none")
       Opts.setVecLib(CodeGenOptions::NoLibrary);
     else
@@ -428,22 +483,24 @@
   }
   if (Arg *A = Args.getLastArg(OPT_debugger_tuning_EQ)) {
     unsigned Val = llvm::StringSwitch<unsigned>(A->getValue())
-                       .Case("gdb", CodeGenOptions::DebuggerKindGDB)
-                       .Case("lldb", CodeGenOptions::DebuggerKindLLDB)
-                       .Case("sce", CodeGenOptions::DebuggerKindSCE)
+                       .Case("gdb", unsigned(llvm::DebuggerKind::GDB))
+                       .Case("lldb", unsigned(llvm::DebuggerKind::LLDB))
+                       .Case("sce", unsigned(llvm::DebuggerKind::SCE))
                        .Default(~0U);
     if (Val == ~0U)
       Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args)
                                                 << A->getValue();
     else
-      Opts.setDebuggerTuning(static_cast<CodeGenOptions::DebuggerKind>(Val));
+      Opts.setDebuggerTuning(static_cast<llvm::DebuggerKind>(Val));
   }
   Opts.DwarfVersion = getLastArgIntValue(Args, OPT_dwarf_version_EQ, 0, Diags);
   Opts.DebugColumnInfo = Args.hasArg(OPT_dwarf_column_info);
   Opts.EmitCodeView = Args.hasArg(OPT_gcodeview);
+  Opts.WholeProgramVTables = Args.hasArg(OPT_fwhole_program_vtables);
+  Opts.LTOVisibilityPublicStd = Args.hasArg(OPT_flto_visibility_public_std);
   Opts.SplitDwarfFile = Args.getLastArgValue(OPT_split_dwarf_file);
   Opts.DebugTypeExtRefs = Args.hasArg(OPT_dwarf_ext_refs);
-  Opts.DebugExplicitImport = Triple.isPS4CPU(); 
+  Opts.DebugExplicitImport = Triple.isPS4CPU();
 
   for (const auto &Arg : Args.getAllArgValues(OPT_fdebug_prefix_map_EQ))
     Opts.DebugPrefixMap.insert(StringRef(Arg).split('='));
@@ -471,20 +528,27 @@
     getAllNoBuiltinFuncValues(Args, Opts.NoBuiltinFuncs);
   Opts.UnrollLoops =
       Args.hasFlag(OPT_funroll_loops, OPT_fno_unroll_loops,
-                   (Opts.OptimizationLevel > 1 && !Opts.OptimizeSize));
+                   (Opts.OptimizationLevel > 1));
   Opts.RerollLoops = Args.hasArg(OPT_freroll_loops);
 
   Opts.DisableIntegratedAS = Args.hasArg(OPT_fno_integrated_as);
   Opts.Autolink = !Args.hasArg(OPT_fno_autolink);
   Opts.SampleProfileFile = Args.getLastArgValue(OPT_fprofile_sample_use_EQ);
-  Opts.ProfileInstrGenerate = Args.hasArg(OPT_fprofile_instr_generate) ||
-      Args.hasArg(OPT_fprofile_instr_generate_EQ);
-  Opts.InstrProfileOutput = Args.getLastArgValue(OPT_fprofile_instr_generate_EQ);
-  Opts.InstrProfileInput = Args.getLastArgValue(OPT_fprofile_instr_use_EQ);
+
+  setPGOInstrumentor(Opts, Args, Diags);
+  Opts.InstrProfileOutput =
+      Args.getLastArgValue(OPT_fprofile_instrument_path_EQ);
+  Opts.ProfileInstrumentUsePath =
+      Args.getLastArgValue(OPT_fprofile_instrument_use_path_EQ);
+  if (!Opts.ProfileInstrumentUsePath.empty())
+    setPGOUseInstrumentor(Opts, Opts.ProfileInstrumentUsePath);
+
   Opts.CoverageMapping =
       Args.hasFlag(OPT_fcoverage_mapping, OPT_fno_coverage_mapping, false);
   Opts.DumpCoverageMapping = Args.hasArg(OPT_dump_coverage_mapping);
   Opts.AsmVerbose = Args.hasArg(OPT_masm_verbose);
+  Opts.PreserveAsmComments = !Args.hasArg(OPT_fno_preserve_as_comments);
+  Opts.AssumeSaneOperatorNew = !Args.hasArg(OPT_fno_assume_sane_operator_new);
   Opts.ObjCAutoRefCountExceptions = Args.hasArg(OPT_fobjc_arc_exceptions);
   Opts.CXAAtExit = !Args.hasArg(OPT_fno_use_cxa_atexit);
   Opts.CXXCtorDtorAliases = Args.hasArg(OPT_mconstructor_aliases);
@@ -496,20 +560,6 @@
   Opts.DiscardValueNames = Args.hasArg(OPT_discard_value_names);
   Opts.DisableTailCalls = Args.hasArg(OPT_mdisable_tail_calls);
   Opts.FloatABI = Args.getLastArgValue(OPT_mfloat_abi);
-  if (Arg *A = Args.getLastArg(OPT_meabi)) {
-    StringRef Value = A->getValue();
-    llvm::EABI EABIVersion = llvm::StringSwitch<llvm::EABI>(Value)
-                                 .Case("default", llvm::EABI::Default)
-                                 .Case("4", llvm::EABI::EABI4)
-                                 .Case("5", llvm::EABI::EABI5)
-                                 .Case("gnu", llvm::EABI::GNU)
-                                 .Default(llvm::EABI::Unknown);
-    if (EABIVersion == llvm::EABI::Unknown)
-      Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args)
-                                                << Value;
-    else
-      Opts.EABIVersion = Value;
-  }
   Opts.LessPreciseFPMAD = Args.hasArg(OPT_cl_mad_enable);
   Opts.LimitFloatPrecision = Args.getLastArgValue(OPT_mlimit_float_precision);
   Opts.NoInfsFPMath = (Args.hasArg(OPT_menable_no_infinities) ||
@@ -519,7 +569,11 @@
                        Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
                        Args.hasArg(OPT_cl_finite_math_only) ||
                        Args.hasArg(OPT_cl_fast_relaxed_math));
-  Opts.NoSignedZeros = Args.hasArg(OPT_fno_signed_zeros);
+  Opts.NoSignedZeros = (Args.hasArg(OPT_fno_signed_zeros) ||
+                        Args.hasArg(OPT_cl_no_signed_zeros));
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.CorrectlyRoundedDivSqrt =
+      Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.ReciprocalMath = Args.hasArg(OPT_freciprocal_math);
   Opts.NoZeroInitializedInBSS = Args.hasArg(OPT_mno_zero_initialized_in_bss);
   Opts.BackendOptions = Args.getAllArgValues(OPT_backend_option);
@@ -558,6 +612,8 @@
 
   Opts.MergeFunctions = Args.hasArg(OPT_fmerge_functions);
 
+  Opts.NoUseJumpTables = Args.hasArg(OPT_fno_jump_tables);
+
   Opts.PrepareForLTO = Args.hasArg(OPT_flto, OPT_flto_EQ);
   const Arg *A = Args.getLastArg(OPT_flto, OPT_flto_EQ);
   Opts.EmitSummaryIndex = A && A->containsValue("thin");
@@ -598,11 +654,54 @@
       }
     }
   }
+  // Handle -fembed-bitcode option.
+  if (Arg *A = Args.getLastArg(OPT_fembed_bitcode_EQ)) {
+    StringRef Name = A->getValue();
+    unsigned Model = llvm::StringSwitch<unsigned>(Name)
+        .Case("off", CodeGenOptions::Embed_Off)
+        .Case("all", CodeGenOptions::Embed_All)
+        .Case("bitcode", CodeGenOptions::Embed_Bitcode)
+        .Case("marker", CodeGenOptions::Embed_Marker)
+        .Default(~0U);
+    if (Model == ~0U) {
+      Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Name;
+      Success = false;
+    } else
+      Opts.setEmbedBitcode(
+          static_cast<CodeGenOptions::EmbedBitcodeKind>(Model));
+  }
+  // FIXME: For backend options that are not yet recorded as function
+  // attributes in the IR, keep track of them so we can embed them in a
+  // separate data section and use them when building the bitcode.
+  if (Opts.getEmbedBitcode() == CodeGenOptions::Embed_All) {
+    for (const auto &A : Args) {
+      // Do not encode output and input.
+      if (A->getOption().getID() == options::OPT_o ||
+          A->getOption().getID() == options::OPT_INPUT ||
+          A->getOption().getID() == options::OPT_x ||
+          A->getOption().getID() == options::OPT_fembed_bitcode ||
+          (A->getOption().getGroup().isValid() &&
+           A->getOption().getGroup().getID() == options::OPT_W_Group))
+        continue;
+      ArgStringList ASL;
+      A->render(Args, ASL);
+      for (const auto &arg : ASL) {
+        StringRef ArgStr(arg);
+        Opts.CmdArgs.insert(Opts.CmdArgs.end(), ArgStr.begin(), ArgStr.end());
+        // Use '\0' to separate each command-line option.
+        Opts.CmdArgs.push_back('\0');
+      }
+    }
+  }
 
   Opts.InstrumentFunctions = Args.hasArg(OPT_finstrument_functions);
+  Opts.XRayInstrumentFunctions = Args.hasArg(OPT_fxray_instrument);
+  Opts.XRayInstructionThreshold =
+      getLastArgIntValue(Args, OPT_fxray_instruction_threshold_, 200, Diags);
   Opts.InstrumentForProfiling = Args.hasArg(OPT_pg);
   Opts.EmitOpenCLArgMetadata = Args.hasArg(OPT_cl_kernel_arg_info);
   Opts.CompressDebugSections = Args.hasArg(OPT_compress_debug_sections);
+  Opts.RelaxELFRelocations = Args.hasArg(OPT_mrelax_relocations);
   Opts.DebugCompilationDir = Args.getLastArgValue(OPT_fdebug_compilation_dir);
   for (auto A : Args.filtered(OPT_mlink_bitcode_file, OPT_mlink_cuda_bitcode)) {
     unsigned LinkFlags = llvm::Linker::Flags::None;
@@ -619,12 +718,15 @@
   Opts.SanitizeCoverageTraceCmp = Args.hasArg(OPT_fsanitize_coverage_trace_cmp);
   Opts.SanitizeCoverage8bitCounters =
       Args.hasArg(OPT_fsanitize_coverage_8bit_counters);
+  Opts.SanitizeCoverageTracePC = Args.hasArg(OPT_fsanitize_coverage_trace_pc);
   Opts.SanitizeMemoryTrackOrigins =
       getLastArgIntValue(Args, OPT_fsanitize_memory_track_origins_EQ, 0, Diags);
   Opts.SanitizeMemoryUseAfterDtor =
       Args.hasArg(OPT_fsanitize_memory_use_after_dtor);
   Opts.SanitizeCfiCrossDso = Args.hasArg(OPT_fsanitize_cfi_cross_dso);
   Opts.SanitizeStats = Args.hasArg(OPT_fsanitize_stats);
+  Opts.SanitizeAddressUseAfterScope =
+      Args.hasArg(OPT_fsanitize_address_use_after_scope);
   Opts.SSPBufferSize =
       getLastArgIntValue(Args, OPT_stack_protector_buffer_size, 8, Diags);
   Opts.StackRealignment = Args.hasArg(OPT_mstackrealign);
@@ -745,6 +847,11 @@
   Opts.CudaGpuBinaryFileNames =
       Args.getAllArgValues(OPT_fcuda_include_gpubinary);
 
+  Opts.Backchain = Args.hasArg(OPT_mbackchain);
+
+  Opts.EmitCheckPathComponentsToStrip = getLastArgIntValue(
+      Args, OPT_fsanitize_undefined_strip_path_components_EQ, 0, Diags);
+
   return Success;
 }
 
@@ -1046,18 +1153,10 @@
     Opts.Plugins.emplace_back(A->getValue(0));
     Opts.ProgramAction = frontend::PluginAction;
     Opts.ActionName = A->getValue();
-
-    for (const Arg *AA : Args.filtered(OPT_plugin_arg))
-      if (AA->getValue(0) == Opts.ActionName)
-        Opts.PluginArgs.emplace_back(AA->getValue(1));
   }
-
   Opts.AddPluginActions = Args.getAllArgValues(OPT_add_plugin);
-  Opts.AddPluginArgs.resize(Opts.AddPluginActions.size());
-  for (int i = 0, e = Opts.AddPluginActions.size(); i != e; ++i)
-    for (const Arg *A : Args.filtered(OPT_plugin_arg))
-      if (A->getValue(0) == Opts.AddPluginActions[i])
-        Opts.AddPluginArgs[i].emplace_back(A->getValue(1));
+  for (const Arg *AA : Args.filtered(OPT_plugin_arg))
+    Opts.PluginArgs[AA->getValue(0)].emplace_back(AA->getValue(1));
 
   for (const std::string &Arg :
          Args.getAllArgValues(OPT_ftest_module_file_extension_EQ)) {
@@ -1110,6 +1209,7 @@
   Opts.ModuleFiles = Args.getAllArgValues(OPT_fmodule_file);
   Opts.ModulesEmbedFiles = Args.getAllArgValues(OPT_fmodules_embed_file_EQ);
   Opts.ModulesEmbedAllFiles = Args.hasArg(OPT_fmodules_embed_all_files);
+  Opts.IncludeTimestamps = !Args.hasArg(OPT_fno_pch_timestamp);
 
   Opts.CodeCompleteOpts.IncludeMacros
     = Args.hasArg(OPT_code_completion_macros);
@@ -1124,6 +1224,7 @@
     = Args.getLastArgValue(OPT_foverride_record_layout_EQ);
   Opts.AuxTriple =
       llvm::Triple::normalize(Args.getLastArgValue(OPT_aux_triple));
+  Opts.FindPchSource = Args.getLastArgValue(OPT_find_pch_source_EQ);
 
   if (const Arg *A = Args.getLastArg(OPT_arcmt_check,
                                      OPT_arcmt_modify,
@@ -1211,6 +1312,7 @@
       .Case("objective-c++-header", IK_ObjCXX)
       .Cases("ast", "pcm", IK_AST)
       .Case("ir", IK_LLVM_IR)
+      .Case("renderscript", IK_RenderScript)
       .Default(IK_None);
     if (DashX == IK_None)
       Diags.Report(diag::err_drv_invalid_value)
@@ -1267,8 +1369,6 @@
   Opts.ResourceDir = Args.getLastArgValue(OPT_resource_dir);
   Opts.ModuleCachePath = Args.getLastArgValue(OPT_fmodules_cache_path);
   Opts.ModuleUserBuildPath = Args.getLastArgValue(OPT_fmodules_user_build_path);
-  for (const Arg *A : Args.filtered(OPT_fprebuilt_module_path))
-    Opts.AddPrebuiltModulePath(A->getValue());
   Opts.DisableModuleHash = Args.hasArg(OPT_fdisable_module_hash);
   Opts.ModulesValidateDiagnosticOptions =
       !Args.hasArg(OPT_fmodules_disable_diagnostic_validation);
@@ -1294,6 +1394,8 @@
 
   // Add -I..., -F..., and -index-header-map options in order.
   bool IsIndexHeaderMap = false;
+  bool IsSysrootSpecified =
+      Args.hasArg(OPT__sysroot_EQ) || Args.hasArg(OPT_isysroot);
   for (const Arg *A : Args.filtered(OPT_I, OPT_F, OPT_index_header_map)) {
     if (A->getOption().matches(OPT_index_header_map)) {
       // -index-header-map applies to the next -I or -F.
@@ -1304,8 +1406,18 @@
     frontend::IncludeDirGroup Group =
         IsIndexHeaderMap ? frontend::IndexHeaderMap : frontend::Angled;
 
-    Opts.AddPath(A->getValue(), Group,
-                 /*IsFramework=*/A->getOption().matches(OPT_F), true);
+    bool IsFramework = A->getOption().matches(OPT_F);
+    std::string Path = A->getValue();
+
+    if (IsSysrootSpecified && !IsFramework && A->getValue()[0] == '=') {
+      SmallString<32> Buffer;
+      llvm::sys::path::append(Buffer, Opts.Sysroot,
+                              llvm::StringRef(A->getValue()).substr(1));
+      Path = Buffer.str();
+    }
+
+    Opts.AddPath(Path.c_str(), Group, IsFramework,
+                 /*IgnoreSysroot*/ true);
     IsIndexHeaderMap = false;
   }
 
@@ -1366,7 +1478,16 @@
     Opts.ModuleSearchPaths.push_back(A->getValue());
 }
 
+static bool isOpenCL(LangStandard::Kind LangStd) {
+  return LangStd == LangStandard::lang_opencl ||
+         LangStd == LangStandard::lang_opencl11 ||
+         LangStd == LangStandard::lang_opencl12 ||
+         LangStd == LangStandard::lang_opencl20;
+}
+
 void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
+                                         const llvm::Triple &T,
+                                         PreprocessorOptions &PPOpts,
                                          LangStandard::Kind LangStd) {
   // Set some properties which depend solely on the input kind; it would be nice
   // to move these to the language standard, and have the driver resolve the
@@ -1399,7 +1520,11 @@
     case IK_PreprocessedC:
     case IK_ObjC:
     case IK_PreprocessedObjC:
-      LangStd = LangStandard::lang_gnu11;
+      // The PS4 uses C99 as the default C standard.
+      if (T.isPS4())
+        LangStd = LangStandard::lang_gnu99;
+      else
+        LangStd = LangStandard::lang_gnu11;
       break;
     case IK_CXX:
     case IK_PreprocessedCXX:
@@ -1407,6 +1532,9 @@
     case IK_PreprocessedObjCXX:
       LangStd = LangStandard::lang_gnucxx98;
       break;
+    case IK_RenderScript:
+      LangStd = LangStandard::lang_c99;
+      break;
     }
   }
 
@@ -1425,7 +1553,7 @@
   Opts.ImplicitInt = Std.hasImplicitInt();
 
   // Set OpenCL Version.
-  Opts.OpenCL = LangStd == LangStandard::lang_opencl || IK == IK_OpenCL;
+  Opts.OpenCL = isOpenCL(LangStd) || IK == IK_OpenCL;
   if (LangStd == LangStandard::lang_opencl)
     Opts.OpenCLVersion = 100;
   else if (LangStd == LangStandard::lang_opencl11)
@@ -1443,11 +1571,22 @@
     Opts.LaxVectorConversions = 0;
     Opts.DefaultFPContract = 1;
     Opts.NativeHalfType = 1;
+    Opts.NativeHalfArgsAndReturns = 1;
+    // Include default header file for OpenCL.
+    if (Opts.IncludeDefaultHeader) {
+      PPOpts.Includes.push_back("opencl-c.h");
+    }
   }
 
   Opts.CUDA = IK == IK_CUDA || IK == IK_PreprocessedCuda ||
               LangStd == LangStandard::lang_cuda;
 
+  Opts.RenderScript = IK == IK_RenderScript;
+  if (Opts.RenderScript) {
+    Opts.NativeHalfType = 1;
+    Opts.NativeHalfArgsAndReturns = 1;
+  }
+
   // OpenCL and C++ both have bool, true, false keywords.
   Opts.Bool = Opts.OpenCL || Opts.CPlusPlus;
 
@@ -1482,6 +1621,8 @@
 }
 
 static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
+                          const TargetOptions &TargetOpts,
+                          PreprocessorOptions &PPOpts,
                           DiagnosticsEngine &Diags) {
   // FIXME: Cleanup per-file based stuff.
   LangStandard::Kind LangStd = LangStandard::lang_unspecified;
@@ -1489,6 +1630,8 @@
     LangStd = llvm::StringSwitch<LangStandard::Kind>(A->getValue())
 #define LANGSTANDARD(id, name, desc, features) \
       .Case(name, LangStandard::lang_##id)
+#define LANGSTANDARD_ALIAS(id, alias) \
+      .Case(alias, LangStandard::lang_##id)
 #include "clang/Frontend/LangStandards.def"
       .Default(LangStandard::lang_unspecified);
     if (LangStd == LangStandard::lang_unspecified)
@@ -1516,7 +1659,7 @@
             << A->getAsString(Args) << "C++/ObjC++";
         break;
       case IK_OpenCL:
-        if (!Std.isC99())
+        if (!isOpenCL(LangStd))
           Diags.Report(diag::err_drv_argument_not_allowed_with)
             << A->getAsString(Args) << "OpenCL";
         break;
@@ -1537,10 +1680,10 @@
   if (const Arg *A = Args.getLastArg(OPT_cl_std_EQ)) {
     LangStandard::Kind OpenCLLangStd
     = llvm::StringSwitch<LangStandard::Kind>(A->getValue())
-    .Case("CL", LangStandard::lang_opencl)
-    .Case("CL1.1", LangStandard::lang_opencl11)
-    .Case("CL1.2", LangStandard::lang_opencl12)
-    .Case("CL2.0", LangStandard::lang_opencl20)
+    .Cases("cl", "CL", LangStandard::lang_opencl)
+    .Cases("cl1.1", "CL1.1", LangStandard::lang_opencl11)
+    .Cases("cl1.2", "CL1.2", LangStandard::lang_opencl12)
+    .Cases("cl2.0", "CL2.0", LangStandard::lang_opencl20)
     .Default(LangStandard::lang_unspecified);
 
     if (OpenCLLangStd == LangStandard::lang_unspecified) {
@@ -1551,7 +1694,22 @@
       LangStd = OpenCLLangStd;
   }
 
-  CompilerInvocation::setLangDefaults(Opts, IK, LangStd);
+  Opts.IncludeDefaultHeader = Args.hasArg(OPT_finclude_default_header);
+
+  llvm::Triple T(TargetOpts.Triple);
+  CompilerInvocation::setLangDefaults(Opts, IK, T, PPOpts, LangStd);
+
+  // Diagnose -cl-strict-aliasing when the OpenCL version is greater than 1.0:
+  // the option was added only for compatibility with OpenCL 1.0 and should be
+  // considered deprecated for later versions.
+  if (Args.getLastArg(OPT_cl_strict_aliasing)
+       && Opts.OpenCLVersion > 100) {
+    std::string VerSpec = llvm::to_string(Opts.OpenCLVersion / 100) +
+                          std::string(".") +
+                          llvm::to_string((Opts.OpenCLVersion % 100) / 10);
+    Diags.Report(diag::warn_option_invalid_ocl_version)
+      << VerSpec << Args.getLastArg(OPT_cl_strict_aliasing)->getAsString(Args);
+  }
 
   // We abuse '-f[no-]gnu-keywords' to force overriding all GNU-extension
   // keywords. This behavior is provided by GCC's poorly named '-fasm' flag,
@@ -1567,14 +1725,17 @@
   if (Args.hasArg(OPT_fcuda_is_device))
     Opts.CUDAIsDevice = 1;
 
-  if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device))
-    Opts.CUDAAllowHostCallsFromHostDevice = 1;
+  if (Args.hasArg(OPT_fcuda_allow_variadic_functions))
+    Opts.CUDAAllowVariadicFunctions = 1;
 
-  if (Args.hasArg(OPT_fcuda_disable_target_call_checks))
-    Opts.CUDADisableTargetCallChecks = 1;
+  if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
+    Opts.CUDAHostDeviceConstexpr = 0;
 
-  if (Args.hasArg(OPT_fcuda_target_overloads))
-    Opts.CUDATargetOverloads = 1;
+  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
+    Opts.CUDADeviceFlushDenormalsToZero = 1;
+
+  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
+    Opts.CUDADeviceApproxTranscendentals = 1;
 
   if (Opts.ObjC1) {
     if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
@@ -1719,19 +1880,22 @@
   Opts.ObjCExceptions = Args.hasArg(OPT_fobjc_exceptions);
   Opts.CXXExceptions = Args.hasArg(OPT_fcxx_exceptions);
   Opts.SjLjExceptions = Args.hasArg(OPT_fsjlj_exceptions);
+  Opts.ExternCNoUnwind = Args.hasArg(OPT_fexternc_nounwind);
   Opts.TraditionalCPP = Args.hasArg(OPT_traditional_cpp);
 
-  Opts.RTTI = !Args.hasArg(OPT_fno_rtti);
+  Opts.RTTI = Opts.CPlusPlus && !Args.hasArg(OPT_fno_rtti);
   Opts.RTTIData = Opts.RTTI && !Args.hasArg(OPT_fno_rtti_data);
-  Opts.Blocks = Args.hasArg(OPT_fblocks);
+  Opts.Blocks = Args.hasArg(OPT_fblocks) || (Opts.OpenCL
+    && Opts.OpenCLVersion >= 200);
   Opts.BlocksRuntimeOptional = Args.hasArg(OPT_fblocks_runtime_optional);
   Opts.Coroutines = Args.hasArg(OPT_fcoroutines);
-  Opts.Modules = Args.hasArg(OPT_fmodules);
+  Opts.ModulesTS = Args.hasArg(OPT_fmodules_ts);
+  Opts.Modules = Args.hasArg(OPT_fmodules) || Opts.ModulesTS;
   Opts.ModulesStrictDeclUse = Args.hasArg(OPT_fmodules_strict_decluse);
   Opts.ModulesDeclUse =
       Args.hasArg(OPT_fmodules_decluse) || Opts.ModulesStrictDeclUse;
   Opts.ModulesLocalVisibility =
-      Args.hasArg(OPT_fmodules_local_submodule_visibility);
+      Args.hasArg(OPT_fmodules_local_submodule_visibility) || Opts.ModulesTS;
   Opts.ModulesSearchAll = Opts.Modules &&
     !Args.hasArg(OPT_fno_modules_search_all) &&
     Args.hasArg(OPT_fmodules_search_all);
@@ -1746,7 +1910,6 @@
   if (!Opts.NoBuiltin)
     getAllNoBuiltinFuncValues(Args, Opts.NoBuiltinFuncs);
   Opts.NoMathBuiltin = Args.hasArg(OPT_fno_math_builtin);
-  Opts.AssumeSaneOperatorNew = !Args.hasArg(OPT_fno_assume_sane_operator_new);
   Opts.SizedDeallocation = Args.hasArg(OPT_fsized_deallocation);
   Opts.ConceptsTS = Args.hasArg(OPT_fconcepts_ts);
   Opts.HeinousExtensions = Args.hasArg(OPT_fheinous_gnu_extensions);
@@ -1775,8 +1938,9 @@
   Opts.EmitAllDecls = Args.hasArg(OPT_femit_all_decls);
   Opts.PackStruct = getLastArgIntValue(Args, OPT_fpack_struct_EQ, 0, Diags);
   Opts.MaxTypeAlign = getLastArgIntValue(Args, OPT_fmax_type_align_EQ, 0, Diags);
+  Opts.AlignDouble = Args.hasArg(OPT_malign_double);
   Opts.PICLevel = getLastArgIntValue(Args, OPT_pic_level, 0, Diags);
-  Opts.PIELevel = getLastArgIntValue(Args, OPT_pie_level, 0, Diags);
+  Opts.PIE = Args.hasArg(OPT_pic_is_pie);
   Opts.Static = Args.hasArg(OPT_static_define);
   Opts.DumpRecordLayoutsSimple = Args.hasArg(OPT_fdump_record_layouts_simple);
   Opts.DumpRecordLayouts = Opts.DumpRecordLayoutsSimple
@@ -1786,7 +1950,6 @@
   Opts.NoBitFieldTypeAlign = Args.hasArg(OPT_fno_bitfield_type_align);
   Opts.SinglePrecisionConstants = Args.hasArg(OPT_cl_single_precision_constant);
   Opts.FastRelaxedMath = Args.hasArg(OPT_cl_fast_relaxed_math);
-  Opts.MRTD = Args.hasArg(OPT_mrtd);
   Opts.HexagonQdsp6Compat = Args.hasArg(OPT_mqdsp6_compat);
   Opts.FakeAddressSpaceMap = Args.hasArg(OPT_ffake_address_space_map);
   Opts.ParseUnknownAnytype = Args.hasArg(OPT_funknown_anytype);
@@ -1794,14 +1957,16 @@
   Opts.DebuggerCastResultToId = Args.hasArg(OPT_fdebugger_cast_result_to_id);
   Opts.DebuggerObjCLiteral = Args.hasArg(OPT_fdebugger_objc_literal);
   Opts.ApplePragmaPack = Args.hasArg(OPT_fapple_pragma_pack);
-  Opts.CurrentModule = Args.getLastArgValue(OPT_fmodule_name);
+  Opts.CurrentModule = Args.getLastArgValue(OPT_fmodule_name_EQ);
   Opts.AppExt = Args.hasArg(OPT_fapplication_extension);
-  Opts.ImplementationOfModule =
-      Args.getLastArgValue(OPT_fmodule_implementation_of);
   Opts.ModuleFeatures = Args.getAllArgValues(OPT_fmodule_feature);
   std::sort(Opts.ModuleFeatures.begin(), Opts.ModuleFeatures.end());
   Opts.NativeHalfType |= Args.hasArg(OPT_fnative_half_type);
-  Opts.HalfArgsAndReturns = Args.hasArg(OPT_fallow_half_arguments_and_returns);
+  Opts.NativeHalfArgsAndReturns |= Args.hasArg(OPT_fnative_half_arguments_and_returns);
+  // Enable HalfArgsAndReturns if present in Args or if NativeHalfArgsAndReturns
+  // is enabled.
+  Opts.HalfArgsAndReturns = Args.hasArg(OPT_fallow_half_arguments_and_returns)
+                            | Opts.NativeHalfArgsAndReturns;
   Opts.APINotes = Args.hasArg(OPT_fapinotes);
   Opts.GNUAsm = !Args.hasArg(OPT_fno_gnu_inline_asm);
 
@@ -1816,12 +1981,6 @@
       Args.hasFlag(OPT_fdeclspec, OPT_fno_declspec,
                    (Opts.MicrosoftExt || Opts.Borland || Opts.CUDA));
 
-  if (!Opts.CurrentModule.empty() && !Opts.ImplementationOfModule.empty() &&
-      Opts.CurrentModule != Opts.ImplementationOfModule) {
-    Diags.Report(diag::err_conflicting_module_names)
-        << Opts.CurrentModule << Opts.ImplementationOfModule;
-  }
-
   // For now, we only support local submodule visibility in C++ (because we
   // heavily depend on the ODR for merging redefinitions).
   if (Opts.ModulesLocalVisibility && !Opts.CPlusPlus)
@@ -1868,15 +2027,79 @@
     Opts.setMSPointerToMemberRepresentationMethod(InheritanceModel);
   }
 
+  // Check for MS default calling conventions being specified.
+  if (Arg *A = Args.getLastArg(OPT_fdefault_calling_conv_EQ)) {
+    LangOptions::DefaultCallingConvention DefaultCC =
+        llvm::StringSwitch<LangOptions::DefaultCallingConvention>(
+            A->getValue())
+            .Case("cdecl", LangOptions::DCC_CDecl)
+            .Case("fastcall", LangOptions::DCC_FastCall)
+            .Case("stdcall", LangOptions::DCC_StdCall)
+            .Case("vectorcall", LangOptions::DCC_VectorCall)
+            .Default(LangOptions::DCC_None);
+    if (DefaultCC == LangOptions::DCC_None)
+      Diags.Report(diag::err_drv_invalid_value)
+          << "-fdefault-calling-conv=" << A->getValue();
+
+    llvm::Triple T(TargetOpts.Triple);
+    llvm::Triple::ArchType Arch = T.getArch();
+    bool emitError = (DefaultCC == LangOptions::DCC_FastCall ||
+                  DefaultCC == LangOptions::DCC_StdCall) &&
+                 Arch != llvm::Triple::x86;
+    emitError |= DefaultCC == LangOptions::DCC_VectorCall &&
+                 !(Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64);
+    if (emitError)
+      Diags.Report(diag::err_drv_argument_not_allowed_with)
+          << A->getSpelling() << T.getTriple();
+    else
+      Opts.setDefaultCallingConv(DefaultCC);
+  }
+
+  // -mrtd option
+  if (Arg *A = Args.getLastArg(OPT_mrtd)) {
+    if (Opts.getDefaultCallingConv() != LangOptions::DCC_None)
+      Diags.Report(diag::err_drv_argument_not_allowed_with)
+          << A->getSpelling() << "-fdefault-calling-conv";
+    else {
+      llvm::Triple T(TargetOpts.Triple);
+      if (T.getArch() != llvm::Triple::x86)
+        Diags.Report(diag::err_drv_argument_not_allowed_with)
+            << A->getSpelling() << T.getTriple();
+      else
+        Opts.setDefaultCallingConv(LangOptions::DCC_StdCall);
+    }
+  }
+
   // Check if -fopenmp is specified.
-  Opts.OpenMP = Args.hasArg(options::OPT_fopenmp);
+  Opts.OpenMP = Args.hasArg(options::OPT_fopenmp) ? 1 : 0;
   Opts.OpenMPUseTLS =
       Opts.OpenMP && !Args.hasArg(options::OPT_fnoopenmp_use_tls);
   Opts.OpenMPIsDevice =
       Opts.OpenMP && Args.hasArg(options::OPT_fopenmp_is_device);
 
+  if (Opts.OpenMP) {
+    int Version =
+        getLastArgIntValue(Args, OPT_fopenmp_version_EQ, Opts.OpenMP, Diags);
+    if (Version != 0)
+      Opts.OpenMP = Version;
+    // Provide diagnostic when a given target is not expected to be an OpenMP
+    // device or host.
+    if (!Opts.OpenMPIsDevice) {
+      switch (T.getArch()) {
+      default:
+        break;
+      // Add unsupported host targets here:
+      case llvm::Triple::nvptx:
+      case llvm::Triple::nvptx64:
+        Diags.Report(clang::diag::err_drv_omp_host_target_not_supported)
+            << TargetOpts.Triple;
+        break;
+      }
+    }
+  }
+
   // Get the OpenMP target triples if any.
-  if (Arg *A = Args.getLastArg(options::OPT_omptargets_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) {
 
     for (unsigned i = 0; i < A->getNumValues(); ++i) {
       llvm::Triple TT(A->getValue(i));
@@ -1890,7 +2113,7 @@
 
   // Get OpenMP host file path if any and report if a non existent file is
   // found
-  if (Arg *A = Args.getLastArg(options::OPT_omp_host_ir_file_path)) {
+  if (Arg *A = Args.getLastArg(options::OPT_fopenmp_host_ir_file_path)) {
     Opts.OMPHostIRFile = A->getValue();
     if (!llvm::sys::fs::exists(Opts.OMPHostIRFile))
       Diags.Report(clang::diag::err_drv_omp_host_ir_file_not_found)
@@ -1997,10 +2220,6 @@
   for (const Arg *A : Args.filtered(OPT_chain_include))
     Opts.ChainedIncludes.emplace_back(A->getValue());
 
-  // Include 'altivec.h' if -faltivec option present
-  if (Args.hasArg(OPT_faltivec))
-    Opts.Includes.emplace_back("altivec.h");
-
   for (const Arg *A : Args.filtered(OPT_remap_file)) {
     std::pair<StringRef, StringRef> Split = StringRef(A->getValue()).split(';');
 
@@ -2078,9 +2297,24 @@
   Opts.UseLineDirectives = Args.hasArg(OPT_fuse_line_directives);
 }
 
-static void ParseTargetArgs(TargetOptions &Opts, ArgList &Args) {
+static void ParseTargetArgs(TargetOptions &Opts, ArgList &Args,
+                            DiagnosticsEngine &Diags) {
   using namespace options;
   Opts.ABI = Args.getLastArgValue(OPT_target_abi);
+  if (Arg *A = Args.getLastArg(OPT_meabi)) {
+    StringRef Value = A->getValue();
+    llvm::EABI EABIVersion = llvm::StringSwitch<llvm::EABI>(Value)
+                                 .Case("default", llvm::EABI::Default)
+                                 .Case("4", llvm::EABI::EABI4)
+                                 .Case("5", llvm::EABI::EABI5)
+                                 .Case("gnu", llvm::EABI::GNU)
+                                 .Default(llvm::EABI::Unknown);
+    if (EABIVersion == llvm::EABI::Unknown)
+      Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args)
+                                                << Value;
+    else
+      Opts.EABIVersion = Value;
+  }
   Opts.CPU = Args.getLastArgValue(OPT_target_cpu);
   Opts.FPMath = Args.getLastArgValue(OPT_mfpmath);
   Opts.FeaturesAsWritten = Args.getAllArgValues(OPT_target_feature);
@@ -2105,6 +2339,7 @@
   InputArgList Args =
       Opts->ParseArgs(llvm::makeArrayRef(ArgBegin, ArgEnd), MissingArgIndex,
                       MissingArgCount, IncludedFlagsBitmask);
+  LangOptions &LangOpts = *Res.getLangOpts();
 
   // Check for missing argument error.
   if (MissingArgCount) {
@@ -2124,11 +2359,11 @@
   ParseDependencyOutputArgs(Res.getDependencyOutputOpts(), Args);
   Success &= ParseDiagnosticArgs(Res.getDiagnosticOpts(), Args, &Diags,
                                  false /*DefaultDiagColor*/);
-  ParseCommentArgs(Res.getLangOpts()->CommentOpts, Args);
+  ParseCommentArgs(LangOpts.CommentOpts, Args);
   ParseFileSystemArgs(Res.getFileSystemOpts(), Args);
   // FIXME: We shouldn't have to pass the DashX option around here
   InputKind DashX = ParseFrontendArgs(Res.getFrontendOpts(), Args, Diags);
-  ParseTargetArgs(Res.getTargetOpts(), Args);
+  ParseTargetArgs(Res.getTargetOpts(), Args, Diags);
   Success &= ParseCodeGenArgs(Res.getCodeGenOpts(), Args, DashX, Diags,
                               Res.getTargetOpts());
   ParseHeaderSearchArgs(Res.getHeaderSearchOpts(), Args);
@@ -2139,29 +2374,45 @@
     // PassManager in BackendUtil.cpp. They need to be initializd no matter
     // what the input type is.
     if (Args.hasArg(OPT_fobjc_arc))
-      Res.getLangOpts()->ObjCAutoRefCount = 1;
+      LangOpts.ObjCAutoRefCount = 1;
+    // PICLevel and PIE are needed during code generation, so they are set
+    // regardless of the input type.
+    LangOpts.PICLevel = getLastArgIntValue(Args, OPT_pic_level, 0, Diags);
+    LangOpts.PIE = Args.hasArg(OPT_pic_is_pie);
     parseSanitizerKinds("-fsanitize=", Args.getAllArgValues(OPT_fsanitize_EQ),
-                        Diags, Res.getLangOpts()->Sanitize);
+                        Diags, LangOpts.Sanitize);
   } else {
     // Other LangOpts are only initialzed when the input is not AST or LLVM IR.
-    ParseLangArgs(*Res.getLangOpts(), Args, DashX, Diags);
+    ParseLangArgs(LangOpts, Args, DashX, Res.getTargetOpts(),
+      Res.getPreprocessorOpts(), Diags);
     if (Res.getFrontendOpts().ProgramAction == frontend::RewriteObjC)
-      Res.getLangOpts()->ObjCExceptions = 1;
+      LangOpts.ObjCExceptions = 1;
 
     // -fapinotes requires -fapinotes-cache-path=<directory>.
-    if (Res.getLangOpts()->APINotes &&
+    if (LangOpts.APINotes &&
         Res.getFileSystemOpts().APINotesCachePath.empty()) {
       Diags.Report(diag::err_no_apinotes_cache_path);
       Success = false;
     }
   }
 
+  if (LangOpts.CUDA) {
+    // During CUDA device-side compilation, the aux triple is the
+    // triple used for host compilation.
+    if (LangOpts.CUDAIsDevice)
+      Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple;
+
+    // Set default FP_CONTRACT to FAST.
+    if (!Args.hasArg(OPT_ffp_contract))
+      Res.getCodeGenOpts().setFPContractMode(CodeGenOptions::FPC_Fast);
+  }
+
   // FIXME: Override value name discarding when asan or msan is used because the
   // backend passes depend on the name of the alloca in order to print out
   // names.
   Res.getCodeGenOpts().DiscardValueNames &=
-      !Res.getLangOpts()->Sanitize.has(SanitizerKind::Address) &&
-      !Res.getLangOpts()->Sanitize.has(SanitizerKind::Memory);
+      !LangOpts.Sanitize.has(SanitizerKind::Address) &&
+      !LangOpts.Sanitize.has(SanitizerKind::Memory);
 
   // FIXME: ParsePreprocessorArgs uses the FileManager to read the contents of
   // PCH file and find the original header name. Remove the need to do that in
@@ -2171,60 +2422,14 @@
   ParsePreprocessorArgs(Res.getPreprocessorOpts(), Args, FileMgr, Diags);
   ParsePreprocessorOutputArgs(Res.getPreprocessorOutputOpts(), Args,
                               Res.getFrontendOpts().ProgramAction);
-  return Success;
-}
 
-namespace {
-
-  class ModuleSignature {
-    SmallVector<uint64_t, 16> Data;
-    unsigned CurBit;
-    uint64_t CurValue;
-
-  public:
-    ModuleSignature() : CurBit(0), CurValue(0) { }
-
-    void add(uint64_t Value, unsigned Bits);
-    void add(StringRef Value);
-    void flush();
-
-    llvm::APInt getAsInteger() const;
-  };
-}
-
-void ModuleSignature::add(uint64_t Value, unsigned int NumBits) {
-  CurValue |= Value << CurBit;
-  if (CurBit + NumBits < 64) {
-    CurBit += NumBits;
-    return;
+  // Turn on -Wspir-compat for SPIR target.
+  llvm::Triple T(Res.getTargetOpts().Triple);
+  auto Arch = T.getArch();
+  if (Arch == llvm::Triple::spir || Arch == llvm::Triple::spir64) {
+    Res.getDiagnosticOpts().Warnings.push_back("spir-compat");
   }
-
-  // Add the current word.
-  Data.push_back(CurValue);
-
-  if (CurBit)
-    CurValue = Value >> (64-CurBit);
-  else
-    CurValue = 0;
-  CurBit = (CurBit+NumBits) & 63;
-}
-
-void ModuleSignature::flush() {
-  if (CurBit == 0)
-    return;
-
-  Data.push_back(CurValue);
-  CurBit = 0;
-  CurValue = 0;
-}
-
-void ModuleSignature::add(StringRef Value) {
-  for (auto &c : Value)
-    add(c, 8);
-}
-
-llvm::APInt ModuleSignature::getAsInteger() const {
-  return llvm::APInt(Data.size() * 64, Data);
+  return Success;
 }
 
 std::string CompilerInvocation::getModuleHash() const {
@@ -2294,7 +2499,7 @@
 
   // Extend the signature with the module file extensions.
   const FrontendOptions &frontendOpts = getFrontendOpts();
-  for (auto ext : frontendOpts.ModuleFileExtensions) {
+  for (const auto &ext : frontendOpts.ModuleFileExtensions) {
     code = ext->hashExtension(code);
   }
 
diff --git a/lib/Frontend/CreateInvocationFromCommandLine.cpp b/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 3019164..1e9e57a 100644
--- a/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -60,25 +60,25 @@
   }
 
   // We expect to get back exactly one command job, if we didn't something
-  // failed. CUDA compilation is an exception as it creates multiple jobs. If
-  // that's the case, we proceed with the first job. If caller needs particular
-  // CUDA job, it should be controlled via --cuda-{host|device}-only option
-  // passed to the driver.
+  // failed. Offload compilation is an exception as it creates multiple jobs. If
+  // that's the case, we proceed with the first job. If caller needs a
+  // particular job, it should be controlled via options (e.g.
+  // --cuda-{host|device}-only for CUDA) passed to the driver.
   const driver::JobList &Jobs = C->getJobs();
-  bool CudaCompilation = false;
+  bool OffloadCompilation = false;
   if (Jobs.size() > 1) {
     for (auto &A : C->getActions()){
       // On MacOSX real actions may end up being wrapped in BindArchAction
       if (isa<driver::BindArchAction>(A))
-        A = *A->begin();
-      if (isa<driver::CudaDeviceAction>(A)) {
-        CudaCompilation = true;
+        A = *A->input_begin();
+      if (isa<driver::OffloadAction>(A)) {
+        OffloadCompilation = true;
         break;
       }
     }
   }
   if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) ||
-      (Jobs.size() > 1 && !CudaCompilation)) {
+      (Jobs.size() > 1 && !OffloadCompilation)) {
     SmallString<256> Msg;
     llvm::raw_svector_ostream OS(Msg);
     Jobs.Print(OS, "; ", true);
diff --git a/lib/Frontend/DependencyFile.cpp b/lib/Frontend/DependencyFile.cpp
index 93d4a80..a9b6128 100644
--- a/lib/Frontend/DependencyFile.cpp
+++ b/lib/Frontend/DependencyFile.cpp
@@ -177,7 +177,7 @@
       SeenMissingHeader(false),
       IncludeModuleFiles(Opts.IncludeModuleFiles),
       OutputFormat(Opts.OutputFormat) {
-    for (auto ExtraDep : Opts.ExtraDeps) {
+    for (const auto &ExtraDep : Opts.ExtraDeps) {
       AddFilename(ExtraDep);
     }
   }
diff --git a/lib/Frontend/DiagnosticRenderer.cpp b/lib/Frontend/DiagnosticRenderer.cpp
index 302067a..177feac 100644
--- a/lib/Frontend/DiagnosticRenderer.cpp
+++ b/lib/Frontend/DiagnosticRenderer.cpp
@@ -9,7 +9,6 @@
 
 #include "clang/Frontend/DiagnosticRenderer.h"
 #include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Edit/Commit.h"
 #include "clang/Edit/EditedSource.h"
@@ -18,7 +17,6 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 using namespace clang;
@@ -167,7 +165,8 @@
                                           PresumedLoc PLoc,
                                           DiagnosticsEngine::Level Level,
                                           const SourceManager &SM) {
-  SourceLocation IncludeLoc = PLoc.getIncludeLoc();
+  SourceLocation IncludeLoc =
+      PLoc.isInvalid() ? SourceLocation() : PLoc.getIncludeLoc();
 
   // Skip redundant include stacks altogether.
   if (LastIncludeLoc == IncludeLoc)
@@ -617,7 +616,7 @@
   // Generate a note indicating the include location.
   SmallString<200> MessageStorage;
   llvm::raw_svector_ostream Message(MessageStorage);
-  if (PLoc.getFilename())
+  if (PLoc.isValid())
     Message << "while building module '" << ModuleName << "' imported from "
             << PLoc.getFilename() << ':' << PLoc.getLine() << ":";
   else
diff --git a/lib/Frontend/FrontendAction.cpp b/lib/Frontend/FrontendAction.cpp
index 82373e3..1bf1fe6 100644
--- a/lib/Frontend/FrontendAction.cpp
+++ b/lib/Frontend/FrontendAction.cpp
@@ -20,19 +20,20 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Parse/ParseAST.h"
 #include "clang/Serialization/ASTDeserializationListener.h"
 #include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/GlobalModuleIndex.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <system_error>
 using namespace clang;
 
-template class llvm::Registry<clang::PluginASTAction>;
+LLVM_INSTANTIATE_REGISTRY(FrontendPluginRegistry)
 
 namespace {
 
@@ -141,28 +142,46 @@
   if (!Consumer)
     return nullptr;
 
-  if (CI.getFrontendOpts().AddPluginActions.size() == 0)
+  // If there are no registered plugins we don't need to wrap the consumer
+  if (FrontendPluginRegistry::begin() == FrontendPluginRegistry::end())
     return Consumer;
 
-  // Make sure the non-plugin consumer is first, so that plugins can't
-  // modifiy the AST.
+  // Collect the list of plugins that go before the main action (in Consumers)
+  // or after it (in AfterConsumers)
   std::vector<std::unique_ptr<ASTConsumer>> Consumers;
-  Consumers.push_back(std::move(Consumer));
-
-  for (size_t i = 0, e = CI.getFrontendOpts().AddPluginActions.size();
-       i != e; ++i) { 
-    // This is O(|plugins| * |add_plugins|), but since both numbers are
-    // way below 50 in practice, that's ok.
-    for (FrontendPluginRegistry::iterator
-        it = FrontendPluginRegistry::begin(),
-        ie = FrontendPluginRegistry::end();
-        it != ie; ++it) {
-      if (it->getName() != CI.getFrontendOpts().AddPluginActions[i])
-        continue;
-      std::unique_ptr<PluginASTAction> P = it->instantiate();
-      if (P->ParseArgs(CI, CI.getFrontendOpts().AddPluginArgs[i]))
-        Consumers.push_back(P->CreateASTConsumer(CI, InFile));
+  std::vector<std::unique_ptr<ASTConsumer>> AfterConsumers;
+  for (FrontendPluginRegistry::iterator it = FrontendPluginRegistry::begin(),
+                                        ie = FrontendPluginRegistry::end();
+       it != ie; ++it) {
+    std::unique_ptr<PluginASTAction> P = it->instantiate();
+    PluginASTAction::ActionType ActionType = P->getActionType();
+    if (ActionType == PluginASTAction::Cmdline) {
+      // This is O(|plugins| * |add_plugins|), but since both numbers are
+      // way below 50 in practice, that's ok.
+      for (size_t i = 0, e = CI.getFrontendOpts().AddPluginActions.size();
+           i != e; ++i) {
+        if (it->getName() == CI.getFrontendOpts().AddPluginActions[i]) {
+          ActionType = PluginASTAction::AddAfterMainAction;
+          break;
+        }
+      }
     }
+    if ((ActionType == PluginASTAction::AddBeforeMainAction ||
+         ActionType == PluginASTAction::AddAfterMainAction) &&
+        P->ParseArgs(CI, CI.getFrontendOpts().PluginArgs[it->getName()])) {
+      std::unique_ptr<ASTConsumer> PluginConsumer = P->CreateASTConsumer(CI, InFile);
+      if (ActionType == PluginASTAction::AddBeforeMainAction) {
+        Consumers.push_back(std::move(PluginConsumer));
+      } else {
+        AfterConsumers.push_back(std::move(PluginConsumer));
+      }
+    }
+  }
+
+  // Add to Consumers the main consumer, then all the plugins that go after it
+  Consumers.push_back(std::move(Consumer));
+  for (auto &C : AfterConsumers) {
+    Consumers.push_back(std::move(C));
   }
 
   return llvm::make_unique<MultiplexConsumer>(std::move(Consumers));
diff --git a/lib/Frontend/FrontendActions.cpp b/lib/Frontend/FrontendActions.cpp
index eb91a59..9e15e3f 100644
--- a/lib/Frontend/FrontendActions.cpp
+++ b/lib/Frontend/FrontendActions.cpp
@@ -11,19 +11,18 @@
 #include "clang/AST/ASTConsumer.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Frontend/ASTConsumers.h"
-#include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/MultiplexConsumer.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearch.h"
-#include "clang/Lex/Pragma.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Parse/Parser.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/ASTWriter.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 #include <system_error>
@@ -48,8 +47,9 @@
 
 std::unique_ptr<ASTConsumer>
 ASTPrintAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
-  if (raw_ostream *OS = CI.createDefaultOutputFile(false, InFile))
-    return CreateASTPrinter(OS, CI.getFrontendOpts().ASTDumpFilter);
+  if (std::unique_ptr<raw_ostream> OS =
+          CI.createDefaultOutputFile(false, InFile))
+    return CreateASTPrinter(std::move(OS), CI.getFrontendOpts().ASTDumpFilter);
   return nullptr;
 }
 
@@ -80,7 +80,7 @@
 GeneratePCHAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
   std::string Sysroot;
   std::string OutputFile;
-  raw_pwrite_stream *OS =
+  std::unique_ptr<raw_pwrite_stream> OS =
       ComputeASTConsumerArguments(CI, InFile, Sysroot, OutputFile);
   if (!OS)
     return nullptr;
@@ -92,16 +92,21 @@
   std::vector<std::unique_ptr<ASTConsumer>> Consumers;
   Consumers.push_back(llvm::make_unique<PCHGenerator>(
                         CI.getPreprocessor(), OutputFile, nullptr, Sysroot,
-                        Buffer, CI.getFrontendOpts().ModuleFileExtensions));
+                        Buffer, CI.getFrontendOpts().ModuleFileExtensions,
+                        /*AllowASTWithErrors*/false,
+                        /*IncludeTimestamps*/
+                          +CI.getFrontendOpts().IncludeTimestamps));
   Consumers.push_back(CI.getPCHContainerWriter().CreatePCHContainerGenerator(
-      CI, InFile, OutputFile, OS, Buffer));
+      CI, InFile, OutputFile, std::move(OS), Buffer));
 
   return llvm::make_unique<MultiplexConsumer>(std::move(Consumers));
 }
 
-raw_pwrite_stream *GeneratePCHAction::ComputeASTConsumerArguments(
-    CompilerInstance &CI, StringRef InFile, std::string &Sysroot,
-    std::string &OutputFile) {
+std::unique_ptr<raw_pwrite_stream>
+GeneratePCHAction::ComputeASTConsumerArguments(CompilerInstance &CI,
+                                               StringRef InFile,
+                                               std::string &Sysroot,
+                                               std::string &OutputFile) {
   Sysroot = CI.getHeaderSearchOpts().Sysroot;
   if (CI.getFrontendOpts().RelocatablePCH && Sysroot.empty()) {
     CI.getDiagnostics().Report(diag::err_relocatable_without_isysroot);
@@ -111,7 +116,7 @@
   // We use createOutputFile here because this is exposed via libclang, and we
   // must disable the RemoveFileOnSignal behavior.
   // We use a temporary to avoid race conditions.
-  raw_pwrite_stream *OS =
+  std::unique_ptr<raw_pwrite_stream> OS =
       CI.createOutputFile(CI.getFrontendOpts().OutputFile, /*Binary=*/true,
                           /*RemoveFileOnSignal=*/false, InFile,
                           /*Extension=*/"", /*useTemporary=*/true);
@@ -127,7 +132,7 @@
                                         StringRef InFile) {
   std::string Sysroot;
   std::string OutputFile;
-  raw_pwrite_stream *OS =
+  std::unique_ptr<raw_pwrite_stream> OS =
       ComputeASTConsumerArguments(CI, InFile, Sysroot, OutputFile);
   if (!OS)
     return nullptr;
@@ -142,7 +147,7 @@
                         /*IncludeTimestamps=*/
                           +CI.getFrontendOpts().BuildingImplicitModule));
   Consumers.push_back(CI.getPCHContainerWriter().CreatePCHContainerGenerator(
-      CI, InFile, OutputFile, OS, Buffer));
+      CI, InFile, OutputFile, std::move(OS), Buffer));
   return llvm::make_unique<MultiplexConsumer>(std::move(Consumers));
 }
 
@@ -152,10 +157,10 @@
   return Includes;
 }
 
-static std::error_code addHeaderInclude(StringRef HeaderName,
-                                        SmallVectorImpl<char> &Includes,
-                                        const LangOptions &LangOpts,
-                                        bool IsExternC) {
+static void addHeaderInclude(StringRef HeaderName,
+                             SmallVectorImpl<char> &Includes,
+                             const LangOptions &LangOpts,
+                             bool IsExternC) {
   if (IsExternC && LangOpts.CPlusPlus)
     Includes += "extern \"C\" {\n";
   if (LangOpts.ObjC1)
@@ -168,7 +173,6 @@
   Includes += "\"\n";
   if (IsExternC && LangOpts.CPlusPlus)
     Includes += "}\n";
-  return std::error_code();
 }
 
 /// \brief Collect the set of header includes needed to construct the given 
@@ -194,22 +198,17 @@
       // file relative to the module build directory (the directory containing
       // the module map file) so this will find the same file that we found
       // while parsing the module map.
-      if (std::error_code Err = addHeaderInclude(H.NameAsWritten, Includes,
-                                                 LangOpts, Module->IsExternC))
-        return Err;
+      addHeaderInclude(H.NameAsWritten, Includes, LangOpts, Module->IsExternC);
     }
   }
   // Note that Module->PrivateHeaders will not be a TopHeader.
 
   if (Module::Header UmbrellaHeader = Module->getUmbrellaHeader()) {
     Module->addTopHeader(UmbrellaHeader.Entry);
-    if (Module->Parent) {
+    if (Module->Parent)
       // Include the umbrella header for submodules.
-      if (std::error_code Err = addHeaderInclude(UmbrellaHeader.NameAsWritten,
-                                                 Includes, LangOpts,
-                                                 Module->IsExternC))
-        return Err;
-    }
+      addHeaderInclude(UmbrellaHeader.NameAsWritten, Includes, LangOpts,
+                       Module->IsExternC);
   } else if (Module::DirectoryName UmbrellaDir = Module->getUmbrellaDir()) {
     // Add all of the headers we find in this subdirectory.
     std::error_code EC;
@@ -249,9 +248,7 @@
 
       // Include this header as part of the umbrella directory.
       Module->addTopHeader(Header);
-      if (std::error_code Err = addHeaderInclude(RelativeHeader, Includes,
-                                                 LangOpts, Module->IsExternC))
-        return Err;
+      addHeaderInclude(RelativeHeader, Includes, LangOpts, Module->IsExternC);
     }
 
     if (EC)
@@ -271,6 +268,8 @@
 
 bool GenerateModuleAction::BeginSourceFileAction(CompilerInstance &CI, 
                                                  StringRef Filename) {
+  CI.getLangOpts().CompilingModule = true;
+
   // Find the module map file.
   const FileEntry *ModuleMap =
       CI.getFileManager().getFile(Filename, /*openFile*/true);
@@ -360,10 +359,9 @@
   SmallString<256> HeaderContents;
   std::error_code Err = std::error_code();
   if (Module::Header UmbrellaHeader = Module->getUmbrellaHeader())
-    Err = addHeaderInclude(UmbrellaHeader.NameAsWritten, HeaderContents,
-                           CI.getLangOpts(), Module->IsExternC);
-  if (!Err)
-    Err = collectModuleHeaderIncludes(
+    addHeaderInclude(UmbrellaHeader.NameAsWritten, HeaderContents,
+                     CI.getLangOpts(), Module->IsExternC);
+  Err = collectModuleHeaderIncludes(
         CI.getLangOpts(), FileMgr,
         CI.getPreprocessor().getHeaderSearchInfo().getModuleMap(), Module,
         HeaderContents);
@@ -387,23 +385,24 @@
   return true;
 }
 
-raw_pwrite_stream *GenerateModuleAction::ComputeASTConsumerArguments(
-    CompilerInstance &CI, StringRef InFile, std::string &Sysroot,
-    std::string &OutputFile) {
+std::unique_ptr<raw_pwrite_stream>
+GenerateModuleAction::ComputeASTConsumerArguments(CompilerInstance &CI,
+                                                  StringRef InFile,
+                                                  std::string &Sysroot,
+                                                  std::string &OutputFile) {
   // If no output file was provided, figure out where this module would go
   // in the module cache.
   if (CI.getFrontendOpts().OutputFile.empty()) {
     HeaderSearch &HS = CI.getPreprocessor().getHeaderSearchInfo();
     CI.getFrontendOpts().OutputFile =
         HS.getModuleFileName(CI.getLangOpts().CurrentModule,
-                             ModuleMapForUniquing->getName(),
-                             /*UsePrebuiltPath=*/false);
+                             ModuleMapForUniquing->getName());
   }
 
   // We use createOutputFile here because this is exposed via libclang, and we
   // must disable the RemoveFileOnSignal behavior.
   // We use a temporary to avoid race conditions.
-  raw_pwrite_stream *OS =
+  std::unique_ptr<raw_pwrite_stream> OS =
       CI.createOutputFile(CI.getFrontendOpts().OutputFile, /*Binary=*/true,
                           /*RemoveFileOnSignal=*/false, InFile,
                           /*Extension=*/"", /*useTemporary=*/true,
@@ -415,6 +414,9 @@
   return OS;
 }
 
+SyntaxOnlyAction::~SyntaxOnlyAction() {
+}
+
 std::unique_ptr<ASTConsumer>
 SyntaxOnlyAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
   return llvm::make_unique<ASTConsumer>();
@@ -599,6 +601,13 @@
   };
 }
 
+bool DumpModuleInfoAction::BeginInvocation(CompilerInstance &CI) {
+  // The Object file reader also supports raw ast files and there is no point in
+  // being strict about the module file format in -module-file-info mode.
+  CI.getHeaderSearchOpts().ModuleFormat = "obj";
+  return true;
+}
+
 void DumpModuleInfoAction::ExecuteAction() {
   // Set up the output file.
   std::unique_ptr<llvm::raw_fd_ostream> OutFile;
@@ -611,13 +620,19 @@
   llvm::raw_ostream &Out = OutFile.get()? *OutFile.get() : llvm::outs();
 
   Out << "Information for module file '" << getCurrentFile() << "':\n";
+  auto &FileMgr = getCompilerInstance().getFileManager();
+  auto Buffer = FileMgr.getBufferForFile(getCurrentFile());
+  StringRef Magic = (*Buffer)->getMemBufferRef().getBuffer();
+  bool IsRaw = (Magic.size() >= 4 && Magic[0] == 'C' && Magic[1] == 'P' &&
+                Magic[2] == 'C' && Magic[3] == 'H');
+  Out << "  Module format: " << (IsRaw ? "raw" : "obj") << "\n";
+
   Preprocessor &PP = getCompilerInstance().getPreprocessor();
   DumpModuleInfoListener Listener(Out);
   HeaderSearchOptions &HSOpts =
       PP.getHeaderSearchInfo().getHeaderSearchOpts();
   ASTReader::readASTFileControlBlock(
-      getCurrentFile(), getCompilerInstance().getFileManager(),
-      getCompilerInstance().getPCHContainerReader(),
+      getCurrentFile(), FileMgr, getCompilerInstance().getPCHContainerReader(),
       /*FindModuleFileExtensions=*/true, Listener,
       HSOpts.ModulesValidateDiagnosticOptions);
 }
@@ -658,11 +673,12 @@
 
 void GeneratePTHAction::ExecuteAction() {
   CompilerInstance &CI = getCompilerInstance();
-  raw_pwrite_stream *OS = CI.createDefaultOutputFile(true, getCurrentFile());
+  std::unique_ptr<raw_pwrite_stream> OS =
+      CI.createDefaultOutputFile(true, getCurrentFile());
   if (!OS)
     return;
 
-  CacheTokens(CI.getPreprocessor(), OS);
+  CacheTokens(CI.getPreprocessor(), OS.get());
 }
 
 void PreprocessOnlyAction::ExecuteAction() {
@@ -718,14 +734,16 @@
       } else if (*cur == 0x0A)  // LF
         break;
 
-      ++cur, ++next;
+      ++cur;
+      ++next;
     }
   }
 
-  raw_ostream *OS = CI.createDefaultOutputFile(BinaryMode, getCurrentFile());
+  std::unique_ptr<raw_ostream> OS =
+      CI.createDefaultOutputFile(BinaryMode, getCurrentFile());
   if (!OS) return;
 
-  DoPrintPreprocessedInput(CI.getPreprocessor(), OS,
+  DoPrintPreprocessedInput(CI.getPreprocessor(), OS.get(),
                            CI.getPreprocessorOutputOpts());
 }
 
@@ -748,6 +766,7 @@
   case IK_PreprocessedObjCXX:
   case IK_AST:
   case IK_LLVM_IR:
+  case IK_RenderScript:
     // We can't do anything with these.
     return;
   }
diff --git a/lib/Frontend/HeaderIncludeGen.cpp b/lib/Frontend/HeaderIncludeGen.cpp
index 0bc1169..5bff4ec 100644
--- a/lib/Frontend/HeaderIncludeGen.cpp
+++ b/lib/Frontend/HeaderIncludeGen.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -19,6 +20,7 @@
 class HeaderIncludesCallback : public PPCallbacks {
   SourceManager &SM;
   raw_ostream *OutputFile;
+  const DependencyOutputOptions &DepOpts;
   unsigned CurrentIncludeDepth;
   bool HasProcessedPredefines;
   bool OwnsOutputFile;
@@ -28,12 +30,13 @@
 
 public:
   HeaderIncludesCallback(const Preprocessor *PP, bool ShowAllHeaders_,
-                         raw_ostream *OutputFile_, bool OwnsOutputFile_,
-                         bool ShowDepth_, bool MSStyle_)
-    : SM(PP->getSourceManager()), OutputFile(OutputFile_),
-      CurrentIncludeDepth(0), HasProcessedPredefines(false),
-      OwnsOutputFile(OwnsOutputFile_), ShowAllHeaders(ShowAllHeaders_),
-      ShowDepth(ShowDepth_), MSStyle(MSStyle_) {}
+                         raw_ostream *OutputFile_,
+                         const DependencyOutputOptions &DepOpts,
+                         bool OwnsOutputFile_, bool ShowDepth_, bool MSStyle_)
+      : SM(PP->getSourceManager()), OutputFile(OutputFile_), DepOpts(DepOpts),
+        CurrentIncludeDepth(0), HasProcessedPredefines(false),
+        OwnsOutputFile(OwnsOutputFile_), ShowAllHeaders(ShowAllHeaders_),
+        ShowDepth(ShowDepth_), MSStyle(MSStyle_) {}
 
   ~HeaderIncludesCallback() override {
     if (OwnsOutputFile)
@@ -46,38 +49,37 @@
 };
 }
 
-static void PrintHeaderInfo(raw_ostream *OutputFile, const char* Filename,
+static void PrintHeaderInfo(raw_ostream *OutputFile, StringRef Filename,
                             bool ShowDepth, unsigned CurrentIncludeDepth,
                             bool MSStyle) {
-    // Write to a temporary string to avoid unnecessary flushing on errs().
-    SmallString<512> Pathname(Filename);
+  // Write to a temporary string to avoid unnecessary flushing on errs().
+  SmallString<512> Pathname(Filename);
+  if (!MSStyle)
+    Lexer::Stringify(Pathname);
+
+  SmallString<256> Msg;
+  if (MSStyle)
+    Msg += "Note: including file:";
+
+  if (ShowDepth) {
+    // The main source file is at depth 1, so skip one dot.
+    for (unsigned i = 1; i != CurrentIncludeDepth; ++i)
+      Msg += MSStyle ? ' ' : '.';
+
     if (!MSStyle)
-      Lexer::Stringify(Pathname);
+      Msg += ' ';
+  }
+  Msg += Pathname;
+  Msg += '\n';
 
-    SmallString<256> Msg;
-    if (MSStyle)
-      Msg += "Note: including file:";
-
-    if (ShowDepth) {
-      // The main source file is at depth 1, so skip one dot.
-      for (unsigned i = 1; i != CurrentIncludeDepth; ++i)
-        Msg += MSStyle ? ' ' : '.';
-
-      if (!MSStyle)
-        Msg += ' ';
-    }
-    Msg += Pathname;
-    Msg += '\n';
-
-    OutputFile->write(Msg.data(), Msg.size());
-    OutputFile->flush();
+  *OutputFile << Msg;
+  OutputFile->flush();
 }
 
 void clang::AttachHeaderIncludeGen(Preprocessor &PP,
-                                   const std::vector<std::string> &ExtraHeaders,
-                                   bool ShowAllHeaders,
-                                   StringRef OutputPath, bool ShowDepth,
-                                   bool MSStyle) {
+                                   const DependencyOutputOptions &DepOpts,
+                                   bool ShowAllHeaders, StringRef OutputPath,
+                                   bool ShowDepth, bool MSStyle) {
   raw_ostream *OutputFile = MSStyle ? &llvm::outs() : &llvm::errs();
   bool OwnsOutputFile = false;
 
@@ -97,20 +99,16 @@
     }
   }
 
-  // Print header info for extra headers, pretending they were discovered
-  // by the regular preprocessor. The primary use case is to support
-  // proper generation of Make / Ninja file dependencies for implicit includes,
-  // such as sanitizer blacklists. It's only important for cl.exe
-  // compatibility, the GNU way to generate rules is -M / -MM / -MD / -MMD.
-  for (auto Header : ExtraHeaders) {
-    PrintHeaderInfo(OutputFile, Header.c_str(), ShowDepth, 2, MSStyle);
-  }
-  PP.addPPCallbacks(llvm::make_unique<HeaderIncludesCallback>(&PP,
-                                                              ShowAllHeaders,
-                                                              OutputFile,
-                                                              OwnsOutputFile,
-                                                              ShowDepth,
-                                                              MSStyle));
+  // Print header info for extra headers, pretending they were discovered by
+  // the regular preprocessor. The primary use case is to support proper
+  // generation of Make / Ninja file dependencies for implicit includes, such
+  // as sanitizer blacklists. It's only important for cl.exe compatibility,
+  // the GNU way to generate rules is -M / -MM / -MD / -MMD.
+  for (const auto &Header : DepOpts.ExtraDeps)
+    PrintHeaderInfo(OutputFile, Header, ShowDepth, 2, MSStyle);
+  PP.addPPCallbacks(llvm::make_unique<HeaderIncludesCallback>(
+      &PP, ShowAllHeaders, OutputFile, DepOpts, OwnsOutputFile, ShowDepth,
+      MSStyle));
 }
 
 void HeaderIncludesCallback::FileChanged(SourceLocation Loc,
@@ -132,8 +130,13 @@
 
     // We track when we are done with the predefines by watching for the first
     // place where we drop back to a nesting depth of 1.
-    if (CurrentIncludeDepth == 1 && !HasProcessedPredefines)
+    if (CurrentIncludeDepth == 1 && !HasProcessedPredefines) {
+      if (!DepOpts.ShowIncludesPretendHeader.empty()) {
+        PrintHeaderInfo(OutputFile, DepOpts.ShowIncludesPretendHeader,
+                        ShowDepth, 2, MSStyle);
+      }
       HasProcessedPredefines = true;
+    }
 
     return;
   } else
@@ -144,11 +147,20 @@
   // line buffers.
   bool ShowHeader = (HasProcessedPredefines ||
                      (ShowAllHeaders && CurrentIncludeDepth > 2));
+  unsigned IncludeDepth = CurrentIncludeDepth;
+  if (!HasProcessedPredefines)
+    --IncludeDepth; // Ignore indent from <built-in>.
+  else if (!DepOpts.ShowIncludesPretendHeader.empty())
+    ++IncludeDepth; // Pretend inclusion by ShowIncludesPretendHeader.
 
   // Dump the header include information we are past the predefines buffer or
-  // are showing all headers.
-  if (ShowHeader && Reason == PPCallbacks::EnterFile) {
-    PrintHeaderInfo(OutputFile, UserLoc.getFilename(),
-                    ShowDepth, CurrentIncludeDepth, MSStyle);
+  // are showing all headers and this isn't the magic implicit <command line>
+  // header.
+  // FIXME: Identify headers in a more robust way than comparing their name to
+  // "<command line>" and "<built-in>" in a bunch of places.
+  if (ShowHeader && Reason == PPCallbacks::EnterFile &&
+      UserLoc.getFilename() != StringRef("<command line>")) {
+    PrintHeaderInfo(OutputFile, UserLoc.getFilename(), ShowDepth, IncludeDepth,
+                    MSStyle);
   }
 }
diff --git a/lib/Frontend/InitHeaderSearch.cpp b/lib/Frontend/InitHeaderSearch.cpp
index 26bab0d..24edb96 100644
--- a/lib/Frontend/InitHeaderSearch.cpp
+++ b/lib/Frontend/InitHeaderSearch.cpp
@@ -11,10 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/Utils.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Config/config.h" // C_INCLUDE_DIRS
+#include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderMap.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/HeaderSearchOptions.h"
@@ -25,7 +25,6 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -267,38 +266,39 @@
   }
 
   case llvm::Triple::Haiku:
-    AddPath("/boot/common/include", System, false);
-    AddPath("/boot/develop/headers/os", System, false);
-    AddPath("/boot/develop/headers/os/app", System, false);
-    AddPath("/boot/develop/headers/os/arch", System, false);
-    AddPath("/boot/develop/headers/os/device", System, false);
-    AddPath("/boot/develop/headers/os/drivers", System, false);
-    AddPath("/boot/develop/headers/os/game", System, false);
-    AddPath("/boot/develop/headers/os/interface", System, false);
-    AddPath("/boot/develop/headers/os/kernel", System, false);
-    AddPath("/boot/develop/headers/os/locale", System, false);
-    AddPath("/boot/develop/headers/os/mail", System, false);
-    AddPath("/boot/develop/headers/os/media", System, false);
-    AddPath("/boot/develop/headers/os/midi", System, false);
-    AddPath("/boot/develop/headers/os/midi2", System, false);
-    AddPath("/boot/develop/headers/os/net", System, false);
-    AddPath("/boot/develop/headers/os/storage", System, false);
-    AddPath("/boot/develop/headers/os/support", System, false);
-    AddPath("/boot/develop/headers/os/translation", System, false);
-    AddPath("/boot/develop/headers/os/add-ons/graphics", System, false);
-    AddPath("/boot/develop/headers/os/add-ons/input_server", System, false);
-    AddPath("/boot/develop/headers/os/add-ons/screen_saver", System, false);
-    AddPath("/boot/develop/headers/os/add-ons/tracker", System, false);
-    AddPath("/boot/develop/headers/os/be_apps/Deskbar", System, false);
-    AddPath("/boot/develop/headers/os/be_apps/NetPositive", System, false);
-    AddPath("/boot/develop/headers/os/be_apps/Tracker", System, false);
-    AddPath("/boot/develop/headers/cpp", System, false);
-    AddPath("/boot/develop/headers/cpp/i586-pc-haiku", System, false);
-    AddPath("/boot/develop/headers/3rdparty", System, false);
-    AddPath("/boot/develop/headers/bsd", System, false);
-    AddPath("/boot/develop/headers/glibc", System, false);
-    AddPath("/boot/develop/headers/posix", System, false);
-    AddPath("/boot/develop/headers",  System, false);
+    AddPath("/boot/system/non-packaged/develop/headers", System, false);
+    AddPath("/boot/system/develop/headers/os", System, false);
+    AddPath("/boot/system/develop/headers/os/app", System, false);
+    AddPath("/boot/system/develop/headers/os/arch", System, false);
+    AddPath("/boot/system/develop/headers/os/device", System, false);
+    AddPath("/boot/system/develop/headers/os/drivers", System, false);
+    AddPath("/boot/system/develop/headers/os/game", System, false);
+    AddPath("/boot/system/develop/headers/os/interface", System, false);
+    AddPath("/boot/system/develop/headers/os/kernel", System, false);
+    AddPath("/boot/system/develop/headers/os/locale", System, false);
+    AddPath("/boot/system/develop/headers/os/mail", System, false);
+    AddPath("/boot/system/develop/headers/os/media", System, false);
+    AddPath("/boot/system/develop/headers/os/midi", System, false);
+    AddPath("/boot/system/develop/headers/os/midi2", System, false);
+    AddPath("/boot/system/develop/headers/os/net", System, false);
+    AddPath("/boot/system/develop/headers/os/opengl", System, false);
+    AddPath("/boot/system/develop/headers/os/storage", System, false);
+    AddPath("/boot/system/develop/headers/os/support", System, false);
+    AddPath("/boot/system/develop/headers/os/translation", System, false);
+    AddPath("/boot/system/develop/headers/os/add-ons/graphics", System, false);
+    AddPath("/boot/system/develop/headers/os/add-ons/input_server", System, false);
+    AddPath("/boot/system/develop/headers/os/add-ons/mail_daemon", System, false);
+    AddPath("/boot/system/develop/headers/os/add-ons/registrar", System, false);
+    AddPath("/boot/system/develop/headers/os/add-ons/screen_saver", System, false);
+    AddPath("/boot/system/develop/headers/os/add-ons/tracker", System, false);
+    AddPath("/boot/system/develop/headers/os/be_apps/Deskbar", System, false);
+    AddPath("/boot/system/develop/headers/os/be_apps/NetPositive", System, false);
+    AddPath("/boot/system/develop/headers/os/be_apps/Tracker", System, false);
+    AddPath("/boot/system/develop/headers/3rdparty", System, false);
+    AddPath("/boot/system/develop/headers/bsd", System, false);
+    AddPath("/boot/system/develop/headers/glibc", System, false);
+    AddPath("/boot/system/develop/headers/posix", System, false);
+    AddPath("/boot/system/develop/headers",  System, false);
     break;
   case llvm::Triple::RTEMS:
     break;
@@ -326,7 +326,7 @@
     // <isysroot> gets prepended later in AddPath().
     std::string BaseSDKPath = "";
     if (!HasSysroot) {
-      const char *envValue = getenv("SCE_PS4_SDK_DIR");
+      const char *envValue = getenv("SCE_ORBIS_SDK_DIR");
       if (envValue)
         BaseSDKPath = envValue;
       else {
diff --git a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp
index 15aa546..f8b912e 100644
--- a/lib/Frontend/InitPreprocessor.cpp
+++ b/lib/Frontend/InitPreprocessor.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/Utils.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/SourceManager.h"
@@ -19,15 +18,13 @@
 #include "clang/Basic/Version.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/FrontendOptions.h"
+#include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/PTHManager.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Serialization/ASTReader.h"
 #include "llvm/ADT/APFloat.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
 using namespace clang;
 
 static bool MacroBodyEndsInBackslash(StringRef MacroBody) {
@@ -408,6 +405,39 @@
   if (LangOpts.ObjC1)
     Builder.defineMacro("__OBJC__");
 
+  // OpenCL v1.0/1.1 s6.9, v1.2/2.0 s6.10: Preprocessor Directives and Macros.
+  if (LangOpts.OpenCL) {
+    // OpenCL v1.0 and v1.1 do not have a predefined macro to indicate the
+    // language standard with which the program is compiled. __OPENCL_VERSION__
+    // is for the OpenCL version supported by the OpenCL device, which is not
+    // necessarily the language standard with which the program is compiled.
+    // A shared OpenCL header file requires a macro to indicate the language
+    // standard. As a workaround, __OPENCL_C_VERSION__ is defined for
+    // OpenCL v1.0 and v1.1.
+    switch (LangOpts.OpenCLVersion) {
+    case 100:
+      Builder.defineMacro("__OPENCL_C_VERSION__", "100");
+      break;
+    case 110:
+      Builder.defineMacro("__OPENCL_C_VERSION__", "110");
+      break;
+    case 120:
+      Builder.defineMacro("__OPENCL_C_VERSION__", "120");
+      break;
+    case 200:
+      Builder.defineMacro("__OPENCL_C_VERSION__", "200");
+      break;
+    default:
+      llvm_unreachable("Unsupported OpenCL version");
+    }
+    Builder.defineMacro("CL_VERSION_1_0", "100");
+    Builder.defineMacro("CL_VERSION_1_1", "110");
+    Builder.defineMacro("CL_VERSION_1_2", "120");
+    Builder.defineMacro("CL_VERSION_2_0", "200");
+
+    if (LangOpts.FastRelaxedMath)
+      Builder.defineMacro("__FAST_RELAXED_MATH__");
+  }
   // Not "standard" per se, but available even with the -undef flag.
   if (LangOpts.AsmPreprocessor)
     Builder.defineMacro("__ASSEMBLER__");
@@ -793,8 +823,8 @@
   DefineFastIntType(64, true, TI, Builder);
   DefineFastIntType(64, false, TI, Builder);
 
-  if (const char *Prefix = TI.getUserLabelPrefix())
-    Builder.defineMacro("__USER_LABEL_PREFIX__", Prefix);
+  char UserLabelPrefix[2] = {TI.getDataLayout().getGlobalPrefix(), 0};
+  Builder.defineMacro("__USER_LABEL_PREFIX__", UserLabelPrefix);
 
   if (LangOpts.FastMath || LangOpts.FiniteMathOnly)
     Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
@@ -811,7 +841,7 @@
     // FIXME: This is target-dependent.
     Builder.defineMacro("__GCC_ATOMIC_TEST_AND_SET_TRUEVAL", "1");
 
-    // Used by libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
+    // Used by libc++ and libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
     unsigned InlineWidthBits = TI.getMaxAtomicInlineWidth();
 #define DEFINE_LOCK_FREE_MACRO(TYPE, Type) \
     Builder.defineMacro("__GCC_ATOMIC_" #TYPE "_LOCK_FREE", \
@@ -840,10 +870,10 @@
   if (unsigned PICLevel = LangOpts.PICLevel) {
     Builder.defineMacro("__PIC__", Twine(PICLevel));
     Builder.defineMacro("__pic__", Twine(PICLevel));
-  }
-  if (unsigned PIELevel = LangOpts.PIELevel) {
-    Builder.defineMacro("__PIE__", Twine(PIELevel));
-    Builder.defineMacro("__pie__", Twine(PIELevel));
+    if (LangOpts.PIE) {
+      Builder.defineMacro("__PIE__", Twine(PICLevel));
+      Builder.defineMacro("__pie__", Twine(PICLevel));
+    }
   }
 
   // Macros to control C99 numerics and <float.h>
@@ -889,13 +919,24 @@
   }
 
   // OpenMP definition
-  if (LangOpts.OpenMP) {
-    // OpenMP 2.2:
-    //   In implementations that support a preprocessor, the _OPENMP
-    //   macro name is defined to have the decimal value yyyymm where
-    //   yyyy and mm are the year and the month designations of the
-    //   version of the OpenMP API that the implementation support.
+  // OpenMP 2.2:
+  //   In implementations that support a preprocessor, the _OPENMP
+  //   macro name is defined to have the decimal value yyyymm where
+  //   yyyy and mm are the year and the month designations of the
+  //   version of the OpenMP API that the implementation support.
+  switch (LangOpts.OpenMP) {
+  case 0:
+    break;
+  case 40:
     Builder.defineMacro("_OPENMP", "201307");
+    break;
+  case 45:
+    Builder.defineMacro("_OPENMP", "201511");
+    break;
+  default:
+    // Default version is OpenMP 3.1
+    Builder.defineMacro("_OPENMP", "201107");
+    break;
   }
 
   // CUDA device path compilaton
@@ -905,6 +946,26 @@
     Builder.defineMacro("__CUDA_ARCH__");
   }
 
+  // We need to communicate this to our CUDA header wrapper, which in turn
+  // informs the proper CUDA headers of this choice.
+  if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
+    Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
+  }
+
+  // OpenCL definitions.
+  if (LangOpts.OpenCL) {
+#define OPENCLEXT(Ext) \
+    if (TI.getSupportedOpenCLOpts().is_##Ext##_supported( \
+        LangOpts.OpenCLVersion)) \
+      Builder.defineMacro(#Ext);
+#include "clang/Basic/OpenCLExtensions.def"
+  }
+
+  if (TI.hasInt128Type() && LangOpts.CPlusPlus && LangOpts.GNUMode) {
+    Builder.defineMacro("__GLIBCXX_TYPE_INT_N_0", "__int128");
+    Builder.defineMacro("__GLIBCXX_BITSIZE_INT_N_0", "128");
+  }
+
   // Get other target #defines.
   TI.getTargetDefines(LangOpts, Builder);
 }
@@ -972,6 +1033,10 @@
                          PP.getDiagnostics());
   }
 
+  // Exit the command line and go back to <built-in> (2 is LC_LEAVE).
+  if (!PP.getLangOpts().AsmPreprocessor)
+    Builder.append("# 1 \"<built-in>\" 2");
+
   // If -imacros are specified, include them now.  These are processed before
   // any -include directives.
   for (unsigned i = 0, e = InitOpts.MacroIncludes.size(); i != e; ++i)
@@ -990,10 +1055,6 @@
     AddImplicitInclude(Builder, Path);
   }
 
-  // Exit the command line and go back to <built-in> (2 is LC_LEAVE).
-  if (!PP.getLangOpts().AsmPreprocessor)
-    Builder.append("# 1 \"<built-in>\" 2");
-
   // Instruct the preprocessor to skip the preamble.
   PP.setSkipMainFilePreamble(InitOpts.PrecompiledPreambleBytes.first,
                              InitOpts.PrecompiledPreambleBytes.second);
diff --git a/lib/Frontend/Makefile b/lib/Frontend/Makefile
deleted file mode 100644
index 8554b76..0000000
--- a/lib/Frontend/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- clang/lib/Frontend/Makefile -------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-DIRS := Rewrite
-LIBRARYNAME := clangFrontend
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Frontend/ModuleDependencyCollector.cpp b/lib/Frontend/ModuleDependencyCollector.cpp
index ca11f9b..cc655f6 100644
--- a/lib/Frontend/ModuleDependencyCollector.cpp
+++ b/lib/Frontend/ModuleDependencyCollector.cpp
@@ -15,7 +15,6 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Serialization/ASTReader.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -135,6 +134,10 @@
   // allows crash reproducer scripts to work across machines.
   VFSWriter.setOverlayDir(VFSDir);
 
+  // Do not ignore non existent contents otherwise we might skip something
+  // that should have been collected here.
+  VFSWriter.setIgnoreNonExistentContents(false);
+
   // Explicitly set case sensitivity for the YAML writer. For that, find out
   // the sensitivity at the path where the headers all collected to.
   VFSWriter.setCaseSensitivity(isCaseSensitivePath(VFSDir));
diff --git a/lib/Frontend/MultiplexConsumer.cpp b/lib/Frontend/MultiplexConsumer.cpp
index d1931fe..8ef6df5 100644
--- a/lib/Frontend/MultiplexConsumer.cpp
+++ b/lib/Frontend/MultiplexConsumer.cpp
@@ -120,11 +120,14 @@
   void CompletedImplicitDefinition(const FunctionDecl *D) override;
   void StaticDataMemberInstantiated(const VarDecl *D) override;
   void DefaultArgumentInstantiated(const ParmVarDecl *D) override;
+  void DefaultMemberInitializerInstantiated(const FieldDecl *D) override;
   void AddedObjCCategoryToInterface(const ObjCCategoryDecl *CatD,
                                     const ObjCInterfaceDecl *IFD) override;
   void FunctionDefinitionInstantiated(const FunctionDecl *D) override;
   void DeclarationMarkedUsed(const Decl *D) override;
   void DeclarationMarkedOpenMPThreadPrivate(const Decl *D) override;
+  void DeclarationMarkedOpenMPDeclareTarget(const Decl *D,
+                                            const Attr *Attr) override;
   void RedefinedHiddenDefinition(const NamedDecl *D, Module *M) override;
   void AddedAttributeToRecord(const Attr *Attr, 
                               const RecordDecl *Record) override;
@@ -199,6 +202,11 @@
   for (size_t i = 0, e = Listeners.size(); i != e; ++i)
     Listeners[i]->DefaultArgumentInstantiated(D);
 }
+void MultiplexASTMutationListener::DefaultMemberInitializerInstantiated(
+                                                           const FieldDecl *D) {
+  for (size_t i = 0, e = Listeners.size(); i != e; ++i)
+    Listeners[i]->DefaultMemberInitializerInstantiated(D);
+}
 void MultiplexASTMutationListener::AddedObjCCategoryToInterface(
                                                  const ObjCCategoryDecl *CatD,
                                                  const ObjCInterfaceDecl *IFD) {
@@ -219,6 +227,11 @@
   for (size_t i = 0, e = Listeners.size(); i != e; ++i)
     Listeners[i]->DeclarationMarkedOpenMPThreadPrivate(D);
 }
+void MultiplexASTMutationListener::DeclarationMarkedOpenMPDeclareTarget(
+    const Decl *D, const Attr *Attr) {
+  for (auto *L : Listeners)
+    L->DeclarationMarkedOpenMPDeclareTarget(D, Attr);
+}
 void MultiplexASTMutationListener::RedefinedHiddenDefinition(const NamedDecl *D,
                                                              Module *M) {
   for (auto *L : Listeners)
@@ -272,9 +285,9 @@
   return Continue;
 }
 
-void MultiplexConsumer::HandleInlineMethodDefinition(CXXMethodDecl *D) {
+void MultiplexConsumer::HandleInlineFunctionDefinition(FunctionDecl *D) {
   for (auto &Consumer : Consumers)
-    Consumer->HandleInlineMethodDefinition(D);
+    Consumer->HandleInlineFunctionDefinition(D);
 }
 
 void MultiplexConsumer::HandleCXXStaticMemberVarInstantiation(VarDecl *VD) {
@@ -317,26 +330,16 @@
     Consumer->HandleImplicitImportDecl(D);
 }
 
-void MultiplexConsumer::HandleLinkerOption(llvm::StringRef Opts) {
-  for (auto &Consumer : Consumers)
-    Consumer->HandleLinkerOption(Opts);
-}
-
-void MultiplexConsumer::HandleDetectMismatch(llvm::StringRef Name, llvm::StringRef Value) {
-  for (auto &Consumer : Consumers)
-    Consumer->HandleDetectMismatch(Name, Value);
-}
-
-void MultiplexConsumer::HandleDependentLibrary(llvm::StringRef Lib) {
-  for (auto &Consumer : Consumers)
-    Consumer->HandleDependentLibrary(Lib);
-}
-
 void MultiplexConsumer::CompleteTentativeDefinition(VarDecl *D) {
   for (auto &Consumer : Consumers)
     Consumer->CompleteTentativeDefinition(D);
 }
 
+void MultiplexConsumer::AssignInheritanceModel(CXXRecordDecl *RD) {
+  for (auto &Consumer : Consumers)
+    Consumer->AssignInheritanceModel(RD);
+}
+
 void MultiplexConsumer::HandleVTable(CXXRecordDecl *RD) {
   for (auto &Consumer : Consumers)
     Consumer->HandleVTable(RD);
diff --git a/lib/Frontend/PCHContainerOperations.cpp b/lib/Frontend/PCHContainerOperations.cpp
index fd84678..2d4edde 100644
--- a/lib/Frontend/PCHContainerOperations.cpp
+++ b/lib/Frontend/PCHContainerOperations.cpp
@@ -13,9 +13,10 @@
 
 #include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/AST/ASTConsumer.h"
+#include "clang/Lex/ModuleLoader.h"
 #include "llvm/Bitcode/BitstreamReader.h"
 #include "llvm/Support/raw_ostream.h"
-#include "clang/Lex/ModuleLoader.h"
+#include <utility>
 
 using namespace clang;
 
@@ -27,12 +28,12 @@
 /// \brief A PCHContainerGenerator that writes out the PCH to a flat file.
 class RawPCHContainerGenerator : public ASTConsumer {
   std::shared_ptr<PCHBuffer> Buffer;
-  raw_pwrite_stream *OS;
+  std::unique_ptr<raw_pwrite_stream> OS;
 
 public:
-  RawPCHContainerGenerator(llvm::raw_pwrite_stream *OS,
+  RawPCHContainerGenerator(std::unique_ptr<llvm::raw_pwrite_stream> OS,
                            std::shared_ptr<PCHBuffer> Buffer)
-      : Buffer(Buffer), OS(OS) {}
+      : Buffer(std::move(Buffer)), OS(std::move(OS)) {}
 
   ~RawPCHContainerGenerator() override = default;
 
@@ -52,9 +53,9 @@
 
 std::unique_ptr<ASTConsumer> RawPCHContainerWriter::CreatePCHContainerGenerator(
     CompilerInstance &CI, const std::string &MainFileName,
-    const std::string &OutputFileName, llvm::raw_pwrite_stream *OS,
+    const std::string &OutputFileName, std::unique_ptr<llvm::raw_pwrite_stream> OS,
     std::shared_ptr<PCHBuffer> Buffer) const {
-  return llvm::make_unique<RawPCHContainerGenerator>(OS, Buffer);
+  return llvm::make_unique<RawPCHContainerGenerator>(std::move(OS), Buffer);
 }
 
 void RawPCHContainerReader::ExtractPCH(
diff --git a/lib/Frontend/PrintPreprocessedOutput.cpp b/lib/Frontend/PrintPreprocessedOutput.cpp
index a58c935..77b80e6 100644
--- a/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -326,8 +326,20 @@
   if (Imported) {
     startNewLineIfNeeded();
     MoveToLine(HashLoc);
-    OS << "@import " << Imported->getFullModuleName() << ";"
-       << " /* clang -E: implicit import for \"" << File->getName() << "\" */";
+    if (PP.getLangOpts().ObjC2) {
+      OS << "@import " << Imported->getFullModuleName() << ";"
+         << " /* clang -E: implicit import for \"" << File->getName()
+         << "\" */";
+    } else {
+      // FIXME: Preserve whether this was a
+      // #include/#include_next/#__include_macros/#import.
+      OS << "#include "
+         << (IsAngled ? '<' : '"')
+         << FileName
+         << (IsAngled ? '>' : '"')
+         << " /* clang -E: implicit import for module "
+         << Imported->getFullModuleName() << " */";
+    }
     // Since we want a newline after the @import, but not a #<line>, start a new
     // line immediately.
     EmittedTokensOnThisLine = true;
@@ -369,18 +381,16 @@
   setEmittedDirectiveOnThisLine();
 }
 
-static void outputPrintable(llvm::raw_ostream& OS,
-                                             const std::string &Str) {
-    for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-      unsigned char Char = Str[i];
-      if (isPrintable(Char) && Char != '\\' && Char != '"')
-        OS << (char)Char;
-      else  // Output anything hard as an octal escape.
-        OS << '\\'
-           << (char)('0'+ ((Char >> 6) & 7))
-           << (char)('0'+ ((Char >> 3) & 7))
-           << (char)('0'+ ((Char >> 0) & 7));
-    }
+static void outputPrintable(raw_ostream &OS, StringRef Str) {
+  for (unsigned char Char : Str) {
+    if (isPrintable(Char) && Char != '\\' && Char != '"')
+      OS << (char)Char;
+    else // Output anything hard as an octal escape.
+      OS << '\\'
+         << (char)('0' + ((Char >> 6) & 7))
+         << (char)('0' + ((Char >> 3) & 7))
+         << (char)('0' + ((Char >> 0) & 7));
+  }
 }
 
 void PrintPPOutputPPCallbacks::PragmaMessage(SourceLocation Loc,
@@ -547,8 +557,10 @@
     // If we have \n\r or \r\n, skip both and count as one line.
     if (Len != 1 &&
         (TokStr[1] == '\n' || TokStr[1] == '\r') &&
-        TokStr[0] != TokStr[1])
-      ++TokStr, --Len;
+        TokStr[0] != TokStr[1]) {
+      ++TokStr;
+      --Len;
+    }
   }
 
   if (NumNewlines == 0) return;
@@ -577,6 +589,15 @@
     Callbacks->MoveToLine(PragmaTok.getLocation());
     Callbacks->OS.write(Prefix, strlen(Prefix));
 
+    if (ShouldExpandTokens) {
+      // The first token does not have expanded macros. Expand them, if
+      // required.
+      auto Toks = llvm::make_unique<Token[]>(1);
+      Toks[0] = PragmaTok;
+      PP.EnterTokenStream(std::move(Toks), /*NumToks=*/1,
+                          /*DisableMacroExpansion=*/false);
+      PP.Lex(PragmaTok);
+    }
     Token PrevToken;
     Token PrevPrevToken;
     PrevToken.startToken();
diff --git a/lib/Frontend/Rewrite/FrontendActions.cpp b/lib/Frontend/Rewrite/FrontendActions.cpp
index d6e1568..2e76e2e 100644
--- a/lib/Frontend/Rewrite/FrontendActions.cpp
+++ b/lib/Frontend/Rewrite/FrontendActions.cpp
@@ -9,13 +9,12 @@
 
 #include "clang/Rewrite/Frontend/FrontendActions.h"
 #include "clang/AST/ASTConsumer.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Parse/Parser.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Rewrite/Frontend/ASTConsumers.h"
 #include "clang/Rewrite/Frontend/FixItRewriter.h"
 #include "clang/Rewrite/Frontend/Rewriters.h"
@@ -23,6 +22,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
+#include <utility>
 
 using namespace clang;
 
@@ -32,8 +32,9 @@
 
 std::unique_ptr<ASTConsumer>
 HTMLPrintAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
-  if (raw_ostream *OS = CI.createDefaultOutputFile(false, InFile))
-    return CreateHTMLPrinter(OS, CI.getPreprocessor());
+  if (std::unique_ptr<raw_ostream> OS =
+          CI.createDefaultOutputFile(false, InFile))
+    return CreateHTMLPrinter(std::move(OS), CI.getPreprocessor());
   return nullptr;
 }
 
@@ -60,8 +61,8 @@
 
 public:
   FixItActionSuffixInserter(std::string NewSuffix, bool FixWhatYouCan)
-    : NewSuffix(NewSuffix) {
-      this->FixWhatYouCan = FixWhatYouCan;
+      : NewSuffix(std::move(NewSuffix)) {
+    this->FixWhatYouCan = FixWhatYouCan;
   }
 
   std::string RewriteFilename(const std::string &Filename, int &fd) override {
@@ -151,14 +152,15 @@
 
 std::unique_ptr<ASTConsumer>
 RewriteObjCAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
-  if (raw_ostream *OS = CI.createDefaultOutputFile(false, InFile, "cpp")) {
+  if (std::unique_ptr<raw_ostream> OS =
+          CI.createDefaultOutputFile(false, InFile, "cpp")) {
     if (CI.getLangOpts().ObjCRuntime.isNonFragile())
       return CreateModernObjCRewriter(
-          InFile, OS, CI.getDiagnostics(), CI.getLangOpts(),
+          InFile, std::move(OS), CI.getDiagnostics(), CI.getLangOpts(),
           CI.getDiagnosticOpts().NoRewriteMacros,
           (CI.getCodeGenOpts().getDebugInfo() != codegenoptions::NoDebugInfo));
-    return CreateObjCRewriter(InFile, OS,
-                              CI.getDiagnostics(), CI.getLangOpts(),
+    return CreateObjCRewriter(InFile, std::move(OS), CI.getDiagnostics(),
+                              CI.getLangOpts(),
                               CI.getDiagnosticOpts().NoRewriteMacros);
   }
   return nullptr;
@@ -172,25 +174,28 @@
 
 void RewriteMacrosAction::ExecuteAction() {
   CompilerInstance &CI = getCompilerInstance();
-  raw_ostream *OS = CI.createDefaultOutputFile(true, getCurrentFile());
+  std::unique_ptr<raw_ostream> OS =
+      CI.createDefaultOutputFile(true, getCurrentFile());
   if (!OS) return;
 
-  RewriteMacrosInInput(CI.getPreprocessor(), OS);
+  RewriteMacrosInInput(CI.getPreprocessor(), OS.get());
 }
 
 void RewriteTestAction::ExecuteAction() {
   CompilerInstance &CI = getCompilerInstance();
-  raw_ostream *OS = CI.createDefaultOutputFile(false, getCurrentFile());
+  std::unique_ptr<raw_ostream> OS =
+      CI.createDefaultOutputFile(false, getCurrentFile());
   if (!OS) return;
 
-  DoRewriteTest(CI.getPreprocessor(), OS);
+  DoRewriteTest(CI.getPreprocessor(), OS.get());
 }
 
 void RewriteIncludesAction::ExecuteAction() {
   CompilerInstance &CI = getCompilerInstance();
-  raw_ostream *OS = CI.createDefaultOutputFile(true, getCurrentFile());
+  std::unique_ptr<raw_ostream> OS =
+      CI.createDefaultOutputFile(true, getCurrentFile());
   if (!OS) return;
 
-  RewriteIncludesInInput(CI.getPreprocessor(), OS,
+  RewriteIncludesInInput(CI.getPreprocessor(), OS.get(),
                          CI.getPreprocessorOutputOpts());
 }
diff --git a/lib/Frontend/Rewrite/HTMLPrint.cpp b/lib/Frontend/Rewrite/HTMLPrint.cpp
index 22ccfe6..15af644 100644
--- a/lib/Frontend/Rewrite/HTMLPrint.cpp
+++ b/lib/Frontend/Rewrite/HTMLPrint.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Rewrite/Frontend/ASTConsumers.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
@@ -21,7 +20,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Rewrite/Core/HTMLRewrite.h"
 #include "clang/Rewrite/Core/Rewriter.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "clang/Rewrite/Frontend/ASTConsumers.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
@@ -32,14 +31,14 @@
 namespace {
   class HTMLPrinter : public ASTConsumer {
     Rewriter R;
-    raw_ostream *Out;
+    std::unique_ptr<raw_ostream> Out;
     Preprocessor &PP;
     bool SyntaxHighlight, HighlightMacros;
 
   public:
-    HTMLPrinter(raw_ostream *OS, Preprocessor &pp,
+    HTMLPrinter(std::unique_ptr<raw_ostream> OS, Preprocessor &pp,
                 bool _SyntaxHighlight, bool _HighlightMacros)
-      : Out(OS), PP(pp), SyntaxHighlight(_SyntaxHighlight),
+      : Out(std::move(OS)), PP(pp), SyntaxHighlight(_SyntaxHighlight),
         HighlightMacros(_HighlightMacros) {}
 
     void Initialize(ASTContext &context) override;
@@ -47,11 +46,10 @@
   };
 }
 
-std::unique_ptr<ASTConsumer> clang::CreateHTMLPrinter(raw_ostream *OS,
-                                                      Preprocessor &PP,
-                                                      bool SyntaxHighlight,
-                                                      bool HighlightMacros) {
-  return llvm::make_unique<HTMLPrinter>(OS, PP, SyntaxHighlight,
+std::unique_ptr<ASTConsumer>
+clang::CreateHTMLPrinter(std::unique_ptr<raw_ostream> OS, Preprocessor &PP,
+                         bool SyntaxHighlight, bool HighlightMacros) {
+  return llvm::make_unique<HTMLPrinter>(std::move(OS), PP, SyntaxHighlight,
                                         HighlightMacros);
 }
 
diff --git a/lib/Frontend/Rewrite/InclusionRewriter.cpp b/lib/Frontend/Rewrite/InclusionRewriter.cpp
index ca82262..b761c34 100644
--- a/lib/Frontend/Rewrite/InclusionRewriter.cpp
+++ b/lib/Frontend/Rewrite/InclusionRewriter.cpp
@@ -450,7 +450,9 @@
               WriteLineInfo(FileName, Line - 1, FileType, "");
             StringRef LineInfoExtra;
             SourceLocation Loc = HashToken.getLocation();
-            if (const Module *Mod = FindModuleAtLocation(Loc))
+            if (const Module *Mod = PP.getLangOpts().ObjC2
+                                        ? FindModuleAtLocation(Loc)
+                                        : nullptr)
               WriteImplicitModuleImport(Mod);
             else if (const IncludedFile *Inc = FindIncludeAtLocation(Loc)) {
               // include and recursively process the file
diff --git a/lib/Frontend/Rewrite/Makefile b/lib/Frontend/Rewrite/Makefile
deleted file mode 100644
index 1d56547..0000000
--- a/lib/Frontend/Rewrite/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-##===- clang/lib/Rewrite/Makefile --------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-# This implements code transformation / rewriting facilities.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../../..
-LIBRARYNAME := clangRewriteFrontend
-
-include $(CLANG_LEVEL)/Makefile
-
-ifeq ($(ENABLE_CLANG_ARCMT),1)
-  CXX.Flags += -DCLANG_ENABLE_OBJC_REWRITER
-endif
-
diff --git a/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/lib/Frontend/Rewrite/RewriteModernObjC.cpp
index 4d57330..ad21751 100644
--- a/lib/Frontend/Rewrite/RewriteModernObjC.cpp
+++ b/lib/Frontend/Rewrite/RewriteModernObjC.cpp
@@ -72,7 +72,7 @@
     Stmt *CurrentBody;
     ParentMap *PropParentMap; // created lazily.
     std::string InFileName;
-    raw_ostream* OutFile;
+    std::unique_ptr<raw_ostream> OutFile;
     std::string Preamble;
     
     TypeDecl *ProtocolTypeDecl;
@@ -135,7 +135,6 @@
     
     SmallVector<DeclRefExpr *, 32> BlockDeclRefs;
 
-    
     // Block related declarations.
     SmallVector<ValueDecl *, 8> BlockByCopyDecls;
     llvm::SmallPtrSet<ValueDecl *, 8> BlockByCopyDeclsPtrSet;
@@ -186,6 +185,7 @@
 
   public:
     llvm::DenseMap<ObjCMethodDecl*, std::string> MethodInternalNames;
+
     // Top Level Driver code.
     bool HandleTopLevelDecl(DeclGroupRef D) override {
       for (DeclGroupRef::iterator I = D.begin(), E = D.end(); I != E; ++I) {
@@ -235,14 +235,13 @@
             RewriteObjCQualifiedInterfaceTypes(TD);
         }
       }
-      return;
     }
     
     void HandleTopLevelSingleDecl(Decl *D);
     void HandleDeclInMainFile(Decl *D);
-    RewriteModernObjC(std::string inFile, raw_ostream *OS,
-                DiagnosticsEngine &D, const LangOptions &LOpts,
-                bool silenceMacroWarn, bool LineInfo);
+    RewriteModernObjC(std::string inFile, std::unique_ptr<raw_ostream> OS,
+                      DiagnosticsEngine &D, const LangOptions &LOpts,
+                      bool silenceMacroWarn, bool LineInfo);
 
     ~RewriteModernObjC() override {}
 
@@ -367,7 +366,6 @@
     Stmt *RewriteContinueStmt(ContinueStmt *S);
     void RewriteCastExpr(CStyleCastExpr *CE);
     void RewriteImplicitCastObjCExpr(CastExpr *IE);
-    void RewriteLinkageSpec(LinkageSpecDecl *LSD);
     
     // Computes ivar bitfield group no.
     unsigned ObjCIvarBitfieldGroupNo(ObjCIvarDecl *IV);
@@ -448,9 +446,6 @@
                                     std::string &Result);
     void RewriteObjCProtocolMetaData(ObjCProtocolDecl *Protocol,
                                      std::string &Result);
-    void RewriteObjCProtocolListMetaData(
-                   const ObjCList<ObjCProtocolDecl> &Prots,
-                   StringRef prefix, StringRef ClassName, std::string &Result);
     void RewriteObjCClassMetaData(ObjCImplementationDecl *IDecl,
                                           std::string &Result);
     void RewriteClassSetupInitHook(std::string &Result);
@@ -523,7 +518,6 @@
     QualType getSuperStructType();
     QualType getConstantStringStructType();
     QualType convertFunctionTypeOfBlocks(const FunctionType *FT);
-    bool BufferContainsPPDirectives(const char *startBuf, const char *endBuf);
     
     void convertToUnqualifiedObjCType(QualType &T) {
       if (T->isObjCQualifiedIdType()) {
@@ -562,6 +556,7 @@
       }
       return false;
     }
+
     bool PointerTypeTakesAnyBlockArguments(QualType QT);
     bool PointerTypeTakesAnyObjCQualifiedType(QualType QT);
     void GetExtentOfArgList(const char *Name, const char *&LParen,
@@ -608,8 +603,7 @@
                                    /*Pascal=*/false, StrType, SourceLocation());
     }
   };
-  
-}
+} // end anonymous namespace
 
 void RewriteModernObjC::RewriteBlocksInFunctionProtoType(QualType funcType,
                                                    NamedDecl *D) {
@@ -644,12 +638,13 @@
   return Ext == "h" || Ext == "hh" || Ext == "H";
 }
 
-RewriteModernObjC::RewriteModernObjC(std::string inFile, raw_ostream* OS,
-                         DiagnosticsEngine &D, const LangOptions &LOpts,
-                         bool silenceMacroWarn,
-                         bool LineInfo)
-      : Diags(D), LangOpts(LOpts), InFileName(inFile), OutFile(OS),
-        SilenceRewriteMacroWarning(silenceMacroWarn), GenerateLineInfo(LineInfo) {
+RewriteModernObjC::RewriteModernObjC(std::string inFile,
+                                     std::unique_ptr<raw_ostream> OS,
+                                     DiagnosticsEngine &D,
+                                     const LangOptions &LOpts,
+                                     bool silenceMacroWarn, bool LineInfo)
+    : Diags(D), LangOpts(LOpts), InFileName(inFile), OutFile(std::move(OS)),
+      SilenceRewriteMacroWarning(silenceMacroWarn), GenerateLineInfo(LineInfo) {
   IsHeader = IsHeaderFile(inFile);
   RewriteFailedDiag = Diags.getCustomDiagID(DiagnosticsEngine::Warning,
                "rewriting sub-expression within a macro (may not be correct)");
@@ -665,10 +660,12 @@
 }
 
 std::unique_ptr<ASTConsumer> clang::CreateModernObjCRewriter(
-    const std::string &InFile, raw_ostream *OS, DiagnosticsEngine &Diags,
-    const LangOptions &LOpts, bool SilenceRewriteMacroWarning, bool LineInfo) {
-  return llvm::make_unique<RewriteModernObjC>(
-      InFile, OS, Diags, LOpts, SilenceRewriteMacroWarning, LineInfo);
+    const std::string &InFile, std::unique_ptr<raw_ostream> OS,
+    DiagnosticsEngine &Diags, const LangOptions &LOpts,
+    bool SilenceRewriteMacroWarning, bool LineInfo) {
+  return llvm::make_unique<RewriteModernObjC>(InFile, std::move(OS), Diags,
+                                              LOpts, SilenceRewriteMacroWarning,
+                                              LineInfo);
 }
 
 void RewriteModernObjC::InitializeCommon(ASTContext &context) {
@@ -743,10 +740,6 @@
     if (PD->isThisDeclarationADefinition())
       RewriteProtocolDecl(PD);
   } else if (LinkageSpecDecl *LSD = dyn_cast<LinkageSpecDecl>(D)) {
-    // FIXME. This will not work in all situations and leaving it out
-    // is harmless.
-    // RewriteLinkageSpec(LSD);
-    
     // Recurse into linkage specifications
     for (DeclContext::decl_iterator DI = LSD->decls_begin(),
                                  DIEnd = LSD->decls_end();
@@ -853,7 +846,6 @@
   else
     WriteInternalIvarName(ClassDecl, D, IvarOffsetName);
   
-  
   std::string S = "(*(";
   QualType IvarT = D->getType();
   if (D->isBitField())
@@ -1068,11 +1060,11 @@
 
 void RewriteModernObjC::RewriteForwardClassEpilogue(ObjCInterfaceDecl *ClassDecl,
                                               const std::string &typedefString) {
-    SourceLocation startLoc = ClassDecl->getLocStart();
-    const char *startBuf = SM->getCharacterData(startLoc);
-    const char *semiPtr = strchr(startBuf, ';'); 
-    // Replace the @class with typedefs corresponding to the classes.
-    ReplaceText(startLoc, semiPtr-startBuf+1, typedefString);  
+  SourceLocation startLoc = ClassDecl->getLocStart();
+  const char *startBuf = SM->getCharacterData(startLoc);
+  const char *semiPtr = strchr(startBuf, ';'); 
+  // Replace the @class with typedefs corresponding to the classes.
+  ReplaceText(startLoc, semiPtr-startBuf+1, typedefString);  
 }
 
 void RewriteModernObjC::RewriteForwardClassDecl(DeclGroupRef D) {
@@ -1212,22 +1204,6 @@
   ReplaceText(LocStart, 0, "// ");
 }
 
-void 
-RewriteModernObjC::RewriteLinkageSpec(LinkageSpecDecl *LSD) {
-  SourceLocation LocStart = LSD->getExternLoc();
-  if (LocStart.isInvalid())
-    llvm_unreachable("Invalid extern SourceLocation");
-  
-  ReplaceText(LocStart, 0, "// ");
-  if (!LSD->hasBraces())
-    return;
-  // FIXME. We don't rewrite well if '{' is not on same line as 'extern'.
-  SourceLocation LocRBrace = LSD->getRBraceLoc();
-  if (LocRBrace.isInvalid())
-    llvm_unreachable("Invalid rbrace SourceLocation");
-  ReplaceText(LocRBrace, 0, "// ");
-}
-
 void RewriteModernObjC::RewriteTypeIntoString(QualType T, std::string &ResultStr,
                                         const FunctionType *&FPRetType) {
   if (T->isObjCQualifiedIdType())
@@ -1313,7 +1289,7 @@
   ResultStr += " _cmd";
 
   // Method arguments.
-  for (const auto *PDecl : OMD->params()) {
+  for (const auto *PDecl : OMD->parameters()) {
     ResultStr += ", ";
     if (PDecl->getType()->isObjCQualifiedIdType()) {
       ResultStr += "id ";
@@ -1354,6 +1330,7 @@
     }
   }
 }
+
 void RewriteModernObjC::RewriteImplementationDecl(Decl *OID) {
   ObjCImplementationDecl *IMD = dyn_cast<ObjCImplementationDecl>(OID);
   ObjCCategoryImplDecl *CID = dyn_cast<ObjCCategoryImplDecl>(OID);
@@ -1940,7 +1917,6 @@
     Diags.Report(Context->getFullLoc(S->getLocStart()),
                  TryFinallyContainsReturnDiag);
   }
-  return;
 }
 
 Stmt *RewriteModernObjC::RewriteObjCAutoreleasePoolStmt(ObjCAutoreleasePoolStmt  *S) {
@@ -2809,11 +2785,10 @@
                                      Context->UnsignedIntTy, SourceLocation());
   MsgExprs.push_back(cnt);
   
-  
   SmallVector<QualType, 4> ArgTypes;
   ArgTypes.push_back(Context->getObjCClassType());
   ArgTypes.push_back(Context->getObjCSelType());
-  for (const auto *PI : ArrayMethod->params())
+  for (const auto *PI : ArrayMethod->parameters())
     ArgTypes.push_back(PI->getType());
   
   QualType returnType = Exp->getType();
@@ -2921,8 +2896,6 @@
                              CK_BitCast,
                              DictLiteralKeyME);
   
-  
-  
   // Synthesize a call to objc_msgSend().
   SmallVector<Expr*, 32> MsgExprs;
   SmallVector<Expr*, 4> ClsExprs;
@@ -2959,11 +2932,10 @@
                                      Context->UnsignedIntTy, SourceLocation());
   MsgExprs.push_back(cnt);
   
-  
   SmallVector<QualType, 8> ArgTypes;
   ArgTypes.push_back(Context->getObjCClassType());
   ArgTypes.push_back(Context->getObjCSelType());
-  for (const auto *PI : DictMethod->params()) {
+  for (const auto *PI : DictMethod->parameters()) {
     QualType T = PI->getType();
     if (const PointerType* PT = T->getAs<PointerType>()) {
       QualType PointeeTy = PT->getPointeeType();
@@ -3176,7 +3148,6 @@
   str += "\t    memset((void*)&s, 0, sizeof(s));\n";
   str += "\t  else\n";
   
-  
   str += "\t    s = (("; str += castType.getAsString(Context->getPrintingPolicy());
   str += ")(void *)objc_msgSend_stret)(receiver, sel";
   for (unsigned i = 2; i < ArgTypes.size(); i++) {
@@ -3188,7 +3159,6 @@
   }
   str += ");\n";
   
-  
   str += "\t}\n";
   str += "\t"; str += returnType.getAsString(Context->getPrintingPolicy());
   str += " s;\n";
@@ -3530,7 +3500,7 @@
   ArgTypes.push_back(Context->getObjCSelType());
   if (ObjCMethodDecl *OMD = Exp->getMethodDecl()) {
     // Push any user argument types.
-    for (const auto *PI : OMD->params()) {
+    for (const auto *PI : OMD->parameters()) {
       QualType t = PI->getType()->isObjCQualifiedIdType()
                      ? Context->getObjCIdType()
                      : PI->getType();
@@ -3635,33 +3605,6 @@
   ProtocolExprDecls.insert(Exp->getProtocol()->getCanonicalDecl());
   // delete Exp; leak for now, see RewritePropertyOrImplicitSetter() usage for more info.
   return castExpr;
-
-}
-
-bool RewriteModernObjC::BufferContainsPPDirectives(const char *startBuf,
-                                             const char *endBuf) {
-  while (startBuf < endBuf) {
-    if (*startBuf == '#') {
-      // Skip whitespace.
-      for (++startBuf; startBuf[0] == ' ' || startBuf[0] == '\t'; ++startBuf)
-        ;
-      if (!strncmp(startBuf, "if", strlen("if")) ||
-          !strncmp(startBuf, "ifdef", strlen("ifdef")) ||
-          !strncmp(startBuf, "ifndef", strlen("ifndef")) ||
-          !strncmp(startBuf, "define", strlen("define")) ||
-          !strncmp(startBuf, "undef", strlen("undef")) ||
-          !strncmp(startBuf, "else", strlen("else")) ||
-          !strncmp(startBuf, "elif", strlen("elif")) ||
-          !strncmp(startBuf, "endif", strlen("endif")) ||
-          !strncmp(startBuf, "pragma", strlen("pragma")) ||
-          !strncmp(startBuf, "include", strlen("include")) ||
-          !strncmp(startBuf, "import", strlen("import")) ||
-          !strncmp(startBuf, "include_next", strlen("include_next")))
-        return true;
-    }
-    startBuf++;
-  }
-  return false;
 }
 
 /// IsTagDefinedInsideClass - This routine checks that a named tagged type 
@@ -3688,7 +3631,6 @@
     TagLocation = ED->getLocation();
     return Context->getSourceManager().isBeforeInTranslationUnit(
                                           IDecl->getLocation(), TagLocation);
-
   }
   return false;
 }
@@ -3820,7 +3762,6 @@
     if (IsNamedDefinition)
       GlobalDefinedTags.insert(TD);
   }
-    
 }
 
 unsigned RewriteModernObjC::ObjCIvarBitfieldGroupNo(ObjCIvarDecl *IV) {
@@ -3911,7 +3852,6 @@
   Result += "__GRBF_";
   unsigned GroupNo = ObjCIvarBitfieldGroupNo(IV);
   Result += utostr(GroupNo);
-  return;
 }
 
 /// ObjCIvarBitfieldGroupType - Names struct type for ivar bitfield group.
@@ -3924,7 +3864,6 @@
   Result += "__T_";
   unsigned GroupNo = ObjCIvarBitfieldGroupNo(IV);
   Result += utostr(GroupNo);
-  return;
 }
 
 /// ObjCIvarBitfieldGroupOffset - Names symbol for ivar bitfield group field offset.
@@ -4063,7 +4002,6 @@
 // Meta Data Emission
 //===----------------------------------------------------------------------===//
 
-
 /// RewriteImplementations - This routine rewrites all method implementations
 /// and emits meta-data.
 
@@ -4543,8 +4481,6 @@
         HasLocalVariableExternalStorage(DRE->getDecl()))
       // FIXME: Handle enums.
       BlockDeclRefs.push_back(DRE);
-
-  return;
 }
 
 void RewriteModernObjC::GetInnerBlockDeclRefExprs(Stmt *S,
@@ -4572,8 +4508,6 @@
           ImportedLocalExternalDecls.insert(Var);
     }
   }
-  
-  return;
 }
 
 /// convertObjCTypeToCStyleType - This routine converts such objc types
@@ -4658,7 +4592,7 @@
                = dyn_cast<PseudoObjectExpr>(BlockExp)) {
     CPT = POE->getType()->castAs<BlockPointerType>();
   } else {
-    assert(1 && "RewriteBlockClass: Bad type");
+    assert(false && "RewriteBlockClass: Bad type");
   }
   assert(CPT && "RewriteBlockClass: Bad type");
   const FunctionType *FT = CPT->getPointeeType()->getAs<FunctionType>();
@@ -4828,7 +4762,6 @@
       break;
     }
   }
-  return;
 }
 
 void RewriteModernObjC::RewriteImplicitCastObjCExpr(CastExpr *IC) {
@@ -4844,8 +4777,6 @@
   Str += TypeString;
   Str += ")";
   InsertText(IC->getSubExpr()->getLocStart(), Str);
-
-  return;
 }
 
 void RewriteModernObjC::RewriteBlockPointerFunctionArgs(FunctionDecl *FD) {
@@ -4880,7 +4811,6 @@
       break;
     }
   }
-  return;
 }
 
 bool RewriteModernObjC::PointerTypeTakesAnyBlockArguments(QualType QT) {
@@ -5017,11 +4947,8 @@
     OrigLength++;
   }
   ReplaceText(Start, OrigLength, buf);
-  
-  return;
 }
 
-
 /// SynthesizeByrefCopyDestroyHelper - This routine synthesizes:
 /// void __Block_byref_id_object_copy(struct Block_byref_id_object *dst,
 ///                    struct Block_byref_id_object *src) {
@@ -5242,7 +5169,6 @@
     
     InsertText(separatorLoc, lastDecl ? "}" : "};\n");
   }
-  return;
 }
 
 void RewriteModernObjC::CollectBlockDeclRefInfo(BlockExpr *Exp) {
@@ -5284,7 +5210,6 @@
 
 Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
                      const SmallVectorImpl<DeclRefExpr *> &InnerBlockDeclRefs) {
-  
   const BlockDecl *block = Exp->getBlockDecl();
   
   Blocks.push_back(Exp);
@@ -5292,7 +5217,7 @@
   CollectBlockDeclRefInfo(Exp);
   
   // Add inner imported variables now used in current block.
- int countOfInnerDecls = 0;
+  int countOfInnerDecls = 0;
   if (!InnerBlockDeclRefs.empty()) {
     for (unsigned i = 0; i < InnerBlockDeclRefs.size(); i++) {
       DeclRefExpr *Exp = InnerBlockDeclRefs[i];
@@ -7073,52 +6998,6 @@
   // Mark this protocol as having been generated.
   if (!ObjCSynthesizedProtocols.insert(PDecl->getCanonicalDecl()).second)
     llvm_unreachable("protocol already synthesized");
-  
-}
-
-void RewriteModernObjC::RewriteObjCProtocolListMetaData(
-                                const ObjCList<ObjCProtocolDecl> &Protocols,
-                                StringRef prefix, StringRef ClassName,
-                                std::string &Result) {
-  if (Protocols.empty()) return;
-  
-  for (unsigned i = 0; i != Protocols.size(); i++)
-    RewriteObjCProtocolMetaData(Protocols[i], Result);
-  
-  // Output the top lovel protocol meta-data for the class.
-  /* struct _objc_protocol_list {
-   struct _objc_protocol_list *next;
-   int    protocol_count;
-   struct _objc_protocol *class_protocols[];
-   }
-   */
-  Result += "\n";
-  if (LangOpts.MicrosoftExt)
-    Result += "__declspec(allocate(\".cat_cls_meth$B\")) ";
-  Result += "static struct {\n";
-  Result += "\tstruct _objc_protocol_list *next;\n";
-  Result += "\tint    protocol_count;\n";
-  Result += "\tstruct _objc_protocol *class_protocols[";
-  Result += utostr(Protocols.size());
-  Result += "];\n} _OBJC_";
-  Result += prefix;
-  Result += "_PROTOCOLS_";
-  Result += ClassName;
-  Result += " __attribute__ ((used, section (\"__OBJC, __cat_cls_meth\")))= "
-  "{\n\t0, ";
-  Result += utostr(Protocols.size());
-  Result += "\n";
-  
-  Result += "\t,{&_OBJC_PROTOCOL_";
-  Result += Protocols[0]->getNameAsString();
-  Result += " \n";
-  
-  for (unsigned i = 1; i != Protocols.size(); i++) {
-    Result += "\t ,&_OBJC_PROTOCOL_";
-    Result += Protocols[i]->getNameAsString();
-    Result += "\n";
-  }
-  Result += "\t }\n};\n";
 }
 
 /// hasObjCExceptionAttribute - Return true if this class or any super
@@ -7215,14 +7094,12 @@
                                  /* Container */IDecl,
                                  "_OBJC_$_PROP_LIST_",
                                  CDecl->getNameAsString());
-
   
   // Data for initializing _class_ro_t  metaclass meta-data
   uint32_t flags = CLS_META;
   std::string InstanceSize;
   std::string InstanceStart;
   
-  
   bool classIsHidden = CDecl->getVisibility() == HiddenVisibility;
   if (classIsHidden)
     flags |= OBJC2_CLS_HIDDEN;
@@ -7290,7 +7167,6 @@
   
   if (ImplementationIsNonLazy(IDecl))
     DefinedNonLazyClasses.push_back(CDecl);
-                
 }
 
 void RewriteModernObjC::RewriteClassSetupInitHook(std::string &Result) {
@@ -7473,7 +7349,6 @@
   // Determine if this category is also "non-lazy".
   if (ImplementationIsNonLazy(IDecl))
     DefinedNonLazyCategories.push_back(CDecl);
-    
 }
 
 void RewriteModernObjC::RewriteCategorySetupInitHook(std::string &Result) {
@@ -7708,4 +7583,4 @@
     return Replacement;  
 }
 
-#endif
+#endif // CLANG_ENABLE_OBJC_REWRITER
diff --git a/lib/Frontend/Rewrite/RewriteObjC.cpp b/lib/Frontend/Rewrite/RewriteObjC.cpp
index 67b2bde..5967e40 100644
--- a/lib/Frontend/Rewrite/RewriteObjC.cpp
+++ b/lib/Frontend/Rewrite/RewriteObjC.cpp
@@ -37,7 +37,6 @@
 namespace {
   class RewriteObjC : public ASTConsumer {
   protected:
-    
     enum {
       BLOCK_FIELD_IS_OBJECT   =  3,  /* id, NSObject, __attribute__((NSObject)),
                                         block, ... */
@@ -72,7 +71,7 @@
     Stmt *CurrentBody;
     ParentMap *PropParentMap; // created lazily.
     std::string InFileName;
-    raw_ostream* OutFile;
+    std::unique_ptr<raw_ostream> OutFile;
     std::string Preamble;
     
     TypeDecl *ProtocolTypeDecl;
@@ -158,14 +157,15 @@
         : R(R), SavedValue(R.DisableReplaceStmt) {
         R.DisableReplaceStmt = true;
       }
+
       ~DisableReplaceStmtScope() {
         R.DisableReplaceStmt = SavedValue;
       }
     };
+
     void InitializeCommon(ASTContext &context);
 
   public:
-
     // Top Level Driver code.
     bool HandleTopLevelDecl(DeclGroupRef D) override {
       for (DeclGroupRef::iterator I = D.begin(), E = D.end(); I != E; ++I) {
@@ -187,9 +187,10 @@
       }
       return true;
     }
+
     void HandleTopLevelSingleDecl(Decl *D);
     void HandleDeclInMainFile(Decl *D);
-    RewriteObjC(std::string inFile, raw_ostream *OS,
+    RewriteObjC(std::string inFile, std::unique_ptr<raw_ostream> OS,
                 DiagnosticsEngine &D, const LangOptions &LOpts,
                 bool silenceMacroWarn);
 
@@ -505,12 +506,10 @@
   
   class RewriteObjCFragileABI : public RewriteObjC {
   public:
-    
-    RewriteObjCFragileABI(std::string inFile, raw_ostream *OS,
-                DiagnosticsEngine &D, const LangOptions &LOpts,
-                bool silenceMacroWarn) : RewriteObjC(inFile, OS,
-                                                     D, LOpts,
-                                                     silenceMacroWarn) {}
+    RewriteObjCFragileABI(std::string inFile, std::unique_ptr<raw_ostream> OS,
+                          DiagnosticsEngine &D, const LangOptions &LOpts,
+                          bool silenceMacroWarn)
+        : RewriteObjC(inFile, std::move(OS), D, LOpts, silenceMacroWarn) {}
 
     ~RewriteObjCFragileABI() override {}
     void Initialize(ASTContext &context) override;
@@ -540,7 +539,7 @@
                                       std::string &Result) override;
     Stmt *RewriteObjCIvarRefExpr(ObjCIvarRefExpr *IV) override;
   };
-}
+} // end anonymous namespace
 
 void RewriteObjC::RewriteBlocksInFunctionProtoType(QualType funcType,
                                                    NamedDecl *D) {
@@ -575,11 +574,11 @@
   return Ext == "h" || Ext == "hh" || Ext == "H";
 }
 
-RewriteObjC::RewriteObjC(std::string inFile, raw_ostream* OS,
+RewriteObjC::RewriteObjC(std::string inFile, std::unique_ptr<raw_ostream> OS,
                          DiagnosticsEngine &D, const LangOptions &LOpts,
                          bool silenceMacroWarn)
-      : Diags(D), LangOpts(LOpts), InFileName(inFile), OutFile(OS),
-        SilenceRewriteMacroWarning(silenceMacroWarn) {
+    : Diags(D), LangOpts(LOpts), InFileName(inFile), OutFile(std::move(OS)),
+      SilenceRewriteMacroWarning(silenceMacroWarn) {
   IsHeader = IsHeaderFile(inFile);
   RewriteFailedDiag = Diags.getCustomDiagID(DiagnosticsEngine::Warning,
                "rewriting sub-expression within a macro (may not be correct)");
@@ -590,11 +589,12 @@
 }
 
 std::unique_ptr<ASTConsumer>
-clang::CreateObjCRewriter(const std::string &InFile, raw_ostream *OS,
+clang::CreateObjCRewriter(const std::string &InFile,
+                          std::unique_ptr<raw_ostream> OS,
                           DiagnosticsEngine &Diags, const LangOptions &LOpts,
                           bool SilenceRewriteMacroWarning) {
-  return llvm::make_unique<RewriteObjCFragileABI>(InFile, OS, Diags, LOpts,
-                                                  SilenceRewriteMacroWarning);
+  return llvm::make_unique<RewriteObjCFragileABI>(
+      InFile, std::move(OS), Diags, LOpts, SilenceRewriteMacroWarning);
 }
 
 void RewriteObjC::InitializeCommon(ASTContext &context) {
@@ -1118,7 +1118,7 @@
   ResultStr += " _cmd";
 
   // Method arguments.
-  for (const auto *PDecl : OMD->params()) {
+  for (const auto *PDecl : OMD->parameters()) {
     ResultStr += ", ";
     if (PDecl->getType()->isObjCQualifiedIdType()) {
       ResultStr += "id ";
@@ -1159,6 +1159,7 @@
     }
   }
 }
+
 void RewriteObjC::RewriteImplementationDecl(Decl *OID) {
   ObjCImplementationDecl *IMD = dyn_cast<ObjCImplementationDecl>(OID);
   ObjCCategoryImplDecl *CID = dyn_cast<ObjCCategoryImplDecl>(OID);
@@ -1720,7 +1721,6 @@
     Diags.Report(Context->getFullLoc(S->getLocStart()),
                  TryFinallyContainsReturnDiag);
   }
-  return;
 }
 
 void RewriteObjC::HasReturnStmts(Stmt *S, bool &hasReturns) 
@@ -1730,32 +1730,29 @@
     if (SubStmt)
       HasReturnStmts(SubStmt, hasReturns);
 
- if (isa<ReturnStmt>(S))
-   hasReturns = true;
- return;
+  if (isa<ReturnStmt>(S))
+    hasReturns = true;
 }
 
 void RewriteObjC::RewriteTryReturnStmts(Stmt *S) {
- // Perform a bottom up traversal of all children.
- for (Stmt *SubStmt : S->children())
-   if (SubStmt) {
-     RewriteTryReturnStmts(SubStmt);
-   }
- if (isa<ReturnStmt>(S)) {
-   SourceLocation startLoc = S->getLocStart();
-   const char *startBuf = SM->getCharacterData(startLoc);
+  // Perform a bottom up traversal of all children.
+  for (Stmt *SubStmt : S->children())
+    if (SubStmt) {
+      RewriteTryReturnStmts(SubStmt);
+    }
+  if (isa<ReturnStmt>(S)) {
+    SourceLocation startLoc = S->getLocStart();
+    const char *startBuf = SM->getCharacterData(startLoc);
+    const char *semiBuf = strchr(startBuf, ';');
+    assert(semiBuf && "RewriteTryReturnStmts: can't find ';'");
+    SourceLocation onePastSemiLoc = startLoc.getLocWithOffset(semiBuf-startBuf+1);
 
-   const char *semiBuf = strchr(startBuf, ';');
-   assert((*semiBuf == ';') && "RewriteTryReturnStmts: can't find ';'");
-   SourceLocation onePastSemiLoc = startLoc.getLocWithOffset(semiBuf-startBuf+1);
-
-   std::string buf;
-   buf = "{ objc_exception_try_exit(&_stack); return";
+    std::string buf;
+    buf = "{ objc_exception_try_exit(&_stack); return";
    
-   ReplaceText(startLoc, 6, buf);
-   InsertText(onePastSemiLoc, "}");
- }
- return;
+    ReplaceText(startLoc, 6, buf);
+    InsertText(onePastSemiLoc, "}");
+  }
 }
 
 void RewriteObjC::RewriteSyncReturnStmts(Stmt *S, std::string syncExitBuf) {
@@ -1780,7 +1777,6 @@
     ReplaceText(startLoc, 6, buf);
     InsertText(onePastSemiLoc, "}");
   }
-  return;
 }
 
 Stmt *RewriteObjC::RewriteObjCTryStmt(ObjCAtTryStmt *S) {
@@ -2287,7 +2283,6 @@
   }
 }
 
-
 void RewriteObjC::RewriteBlockLiteralFunctionDecl(FunctionDecl *FD) {
   SourceLocation FunLocStart = FD->getTypeSpecStartLoc();
   const FunctionType *funcType = FD->getType()->getAs<FunctionType>();
@@ -2615,10 +2610,8 @@
   CallExpr *STCE = new (Context) CallExpr(
       *Context, PE, MsgExprs, FT->getReturnType(), VK_RValue, SourceLocation());
   return STCE;
-  
 }
 
-
 Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
                                     SourceLocation StartLoc,
                                     SourceLocation EndLoc) {
@@ -2924,7 +2917,7 @@
   ArgTypes.push_back(Context->getObjCSelType());
   if (ObjCMethodDecl *OMD = Exp->getMethodDecl()) {
     // Push any user argument types.
-    for (const auto *PI : OMD->params()) {
+    for (const auto *PI : OMD->parameters()) {
       QualType t = PI->getType()->isObjCQualifiedIdType()
                      ? Context->getObjCIdType()
                      : PI->getType();
@@ -3059,7 +3052,6 @@
   ProtocolExprDecls.insert(Exp->getProtocol()->getCanonicalDecl());
   // delete Exp; leak for now, see RewritePropertyOrImplicitSetter() usage for more info.
   return castExpr;
-
 }
 
 bool RewriteObjC::BufferContainsPPDirectives(const char *startBuf,
@@ -3224,7 +3216,6 @@
 // Meta Data Emission
 //===----------------------------------------------------------------------===//
 
-
 /// RewriteImplementations - This routine rewrites all method implementations
 /// and emits meta-data.
 
@@ -3665,8 +3656,6 @@
         HasLocalVariableExternalStorage(DRE->getDecl()))
       // FIXME: Handle enums.
       BlockDeclRefs.push_back(DRE);
-
-  return;
 }
 
 void RewriteObjC::GetInnerBlockDeclRefExprs(Stmt *S,
@@ -3694,8 +3683,6 @@
           ImportedLocalExternalDecls.insert(Var);
     }
   }
-  
-  return;
 }
 
 /// convertFunctionTypeOfBlocks - This routine converts a function type
@@ -3761,7 +3748,7 @@
                = dyn_cast<PseudoObjectExpr>(BlockExp)) {
     CPT = POE->getType()->castAs<BlockPointerType>();
   } else {
-    assert(1 && "RewriteBlockClass: Bad type");
+    assert(false && "RewriteBlockClass: Bad type");
   }
   assert(CPT && "RewriteBlockClass: Bad type");
   const FunctionType *FT = CPT->getPointeeType()->getAs<FunctionType>();
@@ -3931,7 +3918,6 @@
       break;
     }
   }
-  return;
 }
 
 void RewriteObjC::RewriteBlockPointerFunctionArgs(FunctionDecl *FD) {
@@ -3966,7 +3952,6 @@
       break;
     }
   }
-  return;
 }
 
 bool RewriteObjC::PointerTypeTakesAnyBlockArguments(QualType QT) {
@@ -4103,11 +4088,8 @@
     OrigLength++;
   }
   ReplaceText(Start, OrigLength, buf);
-  
-  return;
 }
 
-
 /// SynthesizeByrefCopyDestroyHelper - This routine synthesizes:
 /// void __Block_byref_id_object_copy(struct Block_byref_id_object *dst,
 ///                    struct Block_byref_id_object *src) {
@@ -4328,7 +4310,6 @@
 
     InsertText(semiLoc, "}");
   }
-  return;
 }
 
 void RewriteObjC::CollectBlockDeclRefInfo(BlockExpr *Exp) {
@@ -4494,7 +4475,6 @@
           Exp = new (Context) UnaryOperator(Exp, UO_AddrOf, QT, VK_RValue,
                                             OK_Ordinary, SourceLocation());
         }
-        
       }
       InitExprs.push_back(Exp);
     }
@@ -5241,7 +5221,6 @@
   // Mark this protocol as having been generated.
   if (!ObjCSynthesizedProtocols.insert(PDecl->getCanonicalDecl()).second)
     llvm_unreachable("protocol already synthesized");
-  
 }
 
 void RewriteObjCFragileABI::RewriteObjCProtocolListMetaData(
@@ -5910,4 +5889,4 @@
   return Replacement;  
 }
 
-#endif
+#endif // CLANG_ENABLE_OBJC_REWRITER
diff --git a/lib/Frontend/Rewrite/RewriteTest.cpp b/lib/Frontend/Rewrite/RewriteTest.cpp
index 722c5e8..b0791f4 100644
--- a/lib/Frontend/Rewrite/RewriteTest.cpp
+++ b/lib/Frontend/Rewrite/RewriteTest.cpp
@@ -11,12 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Rewrite/Frontend/Rewriters.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Rewrite/Core/TokenRewriter.h"
+#include "clang/Rewrite/Frontend/Rewriters.h"
 #include "llvm/Support/raw_ostream.h"
 
-void clang::DoRewriteTest(Preprocessor &PP, raw_ostream* OS) {
+void clang::DoRewriteTest(Preprocessor &PP, raw_ostream *OS) {
   SourceManager &SM = PP.getSourceManager();
   const LangOptions &LangOpts = PP.getLangOpts();
 
diff --git a/lib/Frontend/SerializedDiagnosticPrinter.cpp b/lib/Frontend/SerializedDiagnosticPrinter.cpp
index 1bf10d2..6c8d618 100644
--- a/lib/Frontend/SerializedDiagnosticPrinter.cpp
+++ b/lib/Frontend/SerializedDiagnosticPrinter.cpp
@@ -10,9 +10,7 @@
 #include "clang/Frontend/SerializedDiagnosticPrinter.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Basic/Version.h"
 #include "clang/Frontend/DiagnosticRenderer.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/SerializedDiagnosticReader.h"
@@ -24,7 +22,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/raw_ostream.h"
-#include <vector>
+#include <utility>
 
 using namespace clang;
 using namespace clang::serialized_diags;
@@ -147,7 +145,7 @@
 
   explicit SDiagsWriter(IntrusiveRefCntPtr<SharedState> State)
       : LangOpts(nullptr), OriginalInstance(false), MergeChildRecords(false),
-        State(State) {}
+        State(std::move(State)) {}
 
 public:
   SDiagsWriter(StringRef File, DiagnosticOptions *Diags, bool MergeChildRecords)
diff --git a/lib/Frontend/SerializedDiagnosticReader.cpp b/lib/Frontend/SerializedDiagnosticReader.cpp
index 0ebbd22..89d2cf6 100644
--- a/lib/Frontend/SerializedDiagnosticReader.cpp
+++ b/lib/Frontend/SerializedDiagnosticReader.cpp
@@ -11,7 +11,6 @@
 #include "clang/Basic/FileManager.h"
 #include "clang/Frontend/SerializedDiagnostics.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryBuffer.h"
 
 using namespace clang;
 using namespace clang::serialized_diags;
diff --git a/lib/Frontend/TestModuleFileExtension.cpp b/lib/Frontend/TestModuleFileExtension.cpp
index d1b20c4..b43d45f 100644
--- a/lib/Frontend/TestModuleFileExtension.cpp
+++ b/lib/Frontend/TestModuleFileExtension.cpp
@@ -38,9 +38,7 @@
     OS << "Hello from " << Ext->BlockName << " v" << Ext->MajorVersion << "."
        << Ext->MinorVersion;
   }
-  SmallVector<uint64_t, 4> Record;
-  Record.push_back(FIRST_EXTENSION_RECORD_ID);
-  Record.push_back(Message.size());
+  uint64_t Record[] = {FIRST_EXTENSION_RECORD_ID, Message.size()};
   Stream.EmitRecordWithBlob(Abbrev, Record, Message);
 }
 
diff --git a/lib/Frontend/TextDiagnostic.cpp b/lib/Frontend/TextDiagnostic.cpp
index d4e156d..4c39c09 100644
--- a/lib/Frontend/TextDiagnostic.cpp
+++ b/lib/Frontend/TextDiagnostic.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Locale.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -819,7 +818,15 @@
   switch (DiagOpts->getFormat()) {
   case DiagnosticOptions::Clang:
   case DiagnosticOptions::Vi:    OS << ':';    break;
-  case DiagnosticOptions::MSVC:  OS << ") : "; break;
+  case DiagnosticOptions::MSVC:
+    // MSVC2013 and before print 'file(4) : error'. MSVC2015 gets rid of the
+    // space and prints 'file(4): error'.
+    OS << ')';
+    if (LangOpts.MSCompatibilityVersion &&
+        !LangOpts.isCompatibleWithMSVC(LangOptions::MSVC2015))
+      OS << ' ';
+    OS << ": ";
+    break;
   }
 
   if (DiagOpts->ShowSourceRanges && !Ranges.empty()) {
@@ -875,7 +882,7 @@
 void TextDiagnostic::emitIncludeLocation(SourceLocation Loc,
                                          PresumedLoc PLoc,
                                          const SourceManager &SM) {
-  if (DiagOpts->ShowLocation && PLoc.getFilename())
+  if (DiagOpts->ShowLocation && PLoc.isValid())
     OS << "In file included from " << PLoc.getFilename() << ':'
        << PLoc.getLine() << ":\n";
   else
@@ -885,7 +892,7 @@
 void TextDiagnostic::emitImportLocation(SourceLocation Loc, PresumedLoc PLoc,
                                         StringRef ModuleName,
                                         const SourceManager &SM) {
-  if (DiagOpts->ShowLocation && PLoc.getFilename())
+  if (DiagOpts->ShowLocation && PLoc.isValid())
     OS << "In module '" << ModuleName << "' imported from "
        << PLoc.getFilename() << ':' << PLoc.getLine() << ":\n";
   else
@@ -896,7 +903,7 @@
                                                 PresumedLoc PLoc,
                                                 StringRef ModuleName,
                                                 const SourceManager &SM) {
-  if (DiagOpts->ShowLocation && PLoc.getFilename())
+  if (DiagOpts->ShowLocation && PLoc.isValid())
     OS << "While building module '" << ModuleName << "' imported from "
       << PLoc.getFilename() << ':' << PLoc.getLine() << ":\n";
   else
@@ -1082,10 +1089,13 @@
 
   // Get information about the buffer it points into.
   bool Invalid = false;
-  const char *BufStart = SM.getBufferData(FID, &Invalid).data();
+  StringRef BufData = SM.getBufferData(FID, &Invalid);
   if (Invalid)
     return;
 
+  const char *BufStart = BufData.data();
+  const char *BufEnd = BufStart + BufData.size();
+
   unsigned LineNo = SM.getLineNumber(FID, FileOffset);
   unsigned ColNo = SM.getColumnNumber(FID, FileOffset);
   
@@ -1101,15 +1111,20 @@
   // Compute the line end.  Scan forward from the error position to the end of
   // the line.
   const char *LineEnd = TokPtr;
-  while (*LineEnd != '\n' && *LineEnd != '\r' && *LineEnd != '\0')
+  while (LineEnd != BufEnd && *LineEnd != '\n' && *LineEnd != '\r')
     ++LineEnd;
 
   // Arbitrarily stop showing snippets when the line is too long.
   if (size_t(LineEnd - LineStart) > MaxLineLengthToPrint)
     return;
 
+  // Trim trailing null-bytes.
+  StringRef Line(LineStart, LineEnd - LineStart);
+  while (Line.size() > ColNo && Line.back() == '\0')
+    Line = Line.drop_back();
+
   // Copy the line of code into an std::string for ease of manipulation.
-  std::string SourceLine(LineStart, LineEnd);
+  std::string SourceLine(Line.begin(), Line.end());
 
   // Build the byte to column map.
   const SourceColumnMap sourceColMap(SourceLine, DiagOpts->TabStop);
diff --git a/lib/Frontend/TextDiagnosticPrinter.cpp b/lib/Frontend/TextDiagnosticPrinter.cpp
index 66b46b7..17646b4 100644
--- a/lib/Frontend/TextDiagnosticPrinter.cpp
+++ b/lib/Frontend/TextDiagnosticPrinter.cpp
@@ -13,13 +13,11 @@
 
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/TextDiagnostic.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 using namespace clang;
diff --git a/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index 116590e..13cb52a 100644
--- a/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -66,7 +66,9 @@
          it != ie; ++it) {
       if (it->getName() == CI.getFrontendOpts().ActionName) {
         std::unique_ptr<PluginASTAction> P(it->instantiate());
-        if (!P->ParseArgs(CI, CI.getFrontendOpts().PluginArgs))
+        if ((P->getActionType() != PluginASTAction::ReplaceAction &&
+             P->getActionType() != PluginASTAction::Cmdline) ||
+            !P->ParseArgs(CI, CI.getFrontendOpts().PluginArgs[it->getName()]))
           return nullptr;
         return std::move(P);
       }
@@ -194,6 +196,18 @@
         << Path << Error;
   }
 
+  // Check if any of the loaded plugins replaces the main AST action
+  for (FrontendPluginRegistry::iterator it = FrontendPluginRegistry::begin(),
+                                        ie = FrontendPluginRegistry::end();
+       it != ie; ++it) {
+    std::unique_ptr<PluginASTAction> P(it->instantiate());
+    if (P->getActionType() == PluginASTAction::ReplaceAction) {
+      Clang->getFrontendOpts().ProgramAction = clang::frontend::PluginAction;
+      Clang->getFrontendOpts().ActionName = it->getName();
+      break;
+    }
+  }
+
   // Honor -mllvm.
   //
   // FIXME: Remove this, one day.
@@ -215,6 +229,11 @@
     ento::printCheckerHelp(llvm::outs(), Clang->getFrontendOpts().Plugins);
     return true;
   }
+  if (Clang->getAnalyzerOpts()->ShowEnabledCheckerList) {
+    ento::printEnabledCheckerList(llvm::outs(),
+                                  Clang->getFrontendOpts().Plugins,
+                                  *Clang->getAnalyzerOpts());
+  }
 #endif
 
   // If there were errors in processing arguments, don't do anything else.
diff --git a/lib/FrontendTool/Makefile b/lib/FrontendTool/Makefile
deleted file mode 100644
index dfd2820..0000000
--- a/lib/FrontendTool/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- clang/lib/FrontendTool/Makefile ---------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangFrontendTool
-
-include $(CLANG_LEVEL)/Makefile
-include $(CLANG_LEVEL)/../../Makefile.config
-
-ifeq ($(ENABLE_CLANG_ARCMT),1)
-  CXX.Flags += -DCLANG_ENABLE_ARCMT
-  CXX.Flags += -DCLANG_ENABLE_OBJC_REWRITER
-endif
-
-ifeq ($(ENABLE_CLANG_STATIC_ANALYZER),1)
-  CXX.Flags += -DCLANG_ENABLE_STATIC_ANALYZER
-endif
-
-
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
index 813b727..600fece 100644
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -3,22 +3,32 @@
   altivec.h
   ammintrin.h
   arm_acle.h
+  armintr.h
   avx2intrin.h
   avx512bwintrin.h
   avx512cdintrin.h
+  avx512dqintrin.h
   avx512erintrin.h
   avx512fintrin.h
+  avx512ifmaintrin.h
+  avx512ifmavlintrin.h
+  avx512pfintrin.h
+  avx512vbmiintrin.h
+  avx512vbmivlintrin.h
   avx512vlbwintrin.h
-  avx512vlintrin.h
-  avx512dqintrin.h
+  avx512vlcdintrin.h
   avx512vldqintrin.h
-  pkuintrin.h
+  avx512vlintrin.h
   avxintrin.h
   bmi2intrin.h
   bmiintrin.h
+  __clang_cuda_cmath.h
+  __clang_cuda_intrinsics.h
+  __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
   cpuid.h
   cuda_builtin_vars.h
+  clflushoptintrin.h
   emmintrin.h
   f16cintrin.h
   float.h
@@ -29,7 +39,7 @@
   htmxlintrin.h
   ia32intrin.h
   immintrin.h
-  Intrin.h
+  intrin.h
   inttypes.h
   iso646.h
   limits.h
@@ -38,7 +48,10 @@
   mmintrin.h
   mm_malloc.h
   module.modulemap
+  mwaitxintrin.h
   nmmintrin.h
+  opencl-c.h
+  pkuintrin.h
   pmmintrin.h
   popcntintrin.h
   prfchwintrin.h
@@ -62,15 +75,15 @@
   vadefs.h
   varargs.h
   vecintrin.h
-  __wmmintrin_aes.h
   wmmintrin.h
+  __wmmintrin_aes.h
   __wmmintrin_pclmul.h
   x86intrin.h
   xmmintrin.h
   xopintrin.h
+  xsavecintrin.h
   xsaveintrin.h
   xsaveoptintrin.h
-  xsavecintrin.h
   xsavesintrin.h
   xtestintrin.h
   )
diff --git a/lib/Headers/Intrin.h b/lib/Headers/Intrin.h
deleted file mode 100644
index 6c1d0d1..0000000
--- a/lib/Headers/Intrin.h
+++ /dev/null
@@ -1,958 +0,0 @@
-/* ===-------- Intrin.h ---------------------------------------------------===
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-
-/* Only include this if we're compiling for the windows platform. */
-#ifndef _MSC_VER
-#include_next <Intrin.h>
-#else
-
-#ifndef __INTRIN_H
-#define __INTRIN_H
-
-/* First include the standard intrinsics. */
-#if defined(__i386__) || defined(__x86_64__)
-#include <x86intrin.h>
-#endif
-
-/* For the definition of jmp_buf. */
-#if __STDC_HOSTED__
-#include <setjmp.h>
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if defined(__MMX__)
-/* And the random ones that aren't in those files. */
-__m64 _m_from_float(float);
-float _m_to_float(__m64);
-#endif
-
-/* Other assorted instruction intrinsics. */
-void __addfsbyte(unsigned long, unsigned char);
-void __addfsdword(unsigned long, unsigned long);
-void __addfsword(unsigned long, unsigned short);
-void __code_seg(const char *);
-static __inline__
-void __cpuid(int[4], int);
-static __inline__
-void __cpuidex(int[4], int, int);
-void __debugbreak(void);
-__int64 __emul(int, int);
-unsigned __int64 __emulu(unsigned int, unsigned int);
-void __cdecl __fastfail(unsigned int);
-unsigned int __getcallerseflags(void);
-static __inline__
-void __halt(void);
-unsigned char __inbyte(unsigned short);
-void __inbytestring(unsigned short, unsigned char *, unsigned long);
-void __incfsbyte(unsigned long);
-void __incfsdword(unsigned long);
-void __incfsword(unsigned long);
-unsigned long __indword(unsigned short);
-void __indwordstring(unsigned short, unsigned long *, unsigned long);
-void __int2c(void);
-void __invlpg(void *);
-unsigned short __inword(unsigned short);
-void __inwordstring(unsigned short, unsigned short *, unsigned long);
-void __lidt(void *);
-unsigned __int64 __ll_lshift(unsigned __int64, int);
-__int64 __ll_rshift(__int64, int);
-void __llwpcb(void *);
-unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
-void __lwpval32(unsigned int, unsigned int, unsigned int);
-unsigned int __lzcnt(unsigned int);
-unsigned short __lzcnt16(unsigned short);
-static __inline__
-void __movsb(unsigned char *, unsigned char const *, size_t);
-static __inline__
-void __movsd(unsigned long *, unsigned long const *, size_t);
-static __inline__
-void __movsw(unsigned short *, unsigned short const *, size_t);
-void __nop(void);
-void __nvreg_restore_fence(void);
-void __nvreg_save_fence(void);
-void __outbyte(unsigned short, unsigned char);
-void __outbytestring(unsigned short, unsigned char *, unsigned long);
-void __outdword(unsigned short, unsigned long);
-void __outdwordstring(unsigned short, unsigned long *, unsigned long);
-void __outword(unsigned short, unsigned short);
-void __outwordstring(unsigned short, unsigned short *, unsigned long);
-static __inline__
-unsigned int __popcnt(unsigned int);
-static __inline__
-unsigned short __popcnt16(unsigned short);
-unsigned long __readcr0(void);
-unsigned long __readcr2(void);
-static __inline__
-unsigned long __readcr3(void);
-unsigned long __readcr4(void);
-unsigned long __readcr8(void);
-unsigned int __readdr(unsigned int);
-#ifdef __i386__
-static __inline__
-unsigned char __readfsbyte(unsigned long);
-static __inline__
-unsigned long __readfsdword(unsigned long);
-static __inline__
-unsigned __int64 __readfsqword(unsigned long);
-static __inline__
-unsigned short __readfsword(unsigned long);
-#endif
-static __inline__
-unsigned __int64 __readmsr(unsigned long);
-unsigned __int64 __readpmc(unsigned long);
-unsigned long __segmentlimit(unsigned long);
-void __sidt(void *);
-void *__slwpcb(void);
-static __inline__
-void __stosb(unsigned char *, unsigned char, size_t);
-static __inline__
-void __stosd(unsigned long *, unsigned long, size_t);
-static __inline__
-void __stosw(unsigned short *, unsigned short, size_t);
-void __svm_clgi(void);
-void __svm_invlpga(void *, int);
-void __svm_skinit(int);
-void __svm_stgi(void);
-void __svm_vmload(size_t);
-void __svm_vmrun(size_t);
-void __svm_vmsave(size_t);
-void __ud2(void);
-unsigned __int64 __ull_rshift(unsigned __int64, int);
-void __vmx_off(void);
-void __vmx_vmptrst(unsigned __int64 *);
-void __wbinvd(void);
-void __writecr0(unsigned int);
-static __inline__
-void __writecr3(unsigned int);
-void __writecr4(unsigned int);
-void __writecr8(unsigned int);
-void __writedr(unsigned int, unsigned int);
-void __writefsbyte(unsigned long, unsigned char);
-void __writefsdword(unsigned long, unsigned long);
-void __writefsqword(unsigned long, unsigned __int64);
-void __writefsword(unsigned long, unsigned short);
-void __writemsr(unsigned long, unsigned __int64);
-static __inline__
-void *_AddressOfReturnAddress(void);
-static __inline__
-unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-static __inline__
-unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
-static __inline__
-unsigned char _bittest(long const *, long);
-static __inline__
-unsigned char _bittestandcomplement(long *, long);
-static __inline__
-unsigned char _bittestandreset(long *, long);
-static __inline__
-unsigned char _bittestandset(long *, long);
-unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
-unsigned long __cdecl _byteswap_ulong(unsigned long);
-unsigned short __cdecl _byteswap_ushort(unsigned short);
-void __cdecl _disable(void);
-void __cdecl _enable(void);
-long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
-static __inline__
-long _InterlockedAnd(long volatile *_Value, long _Mask);
-static __inline__
-short _InterlockedAnd16(short volatile *_Value, short _Mask);
-static __inline__
-char _InterlockedAnd8(char volatile *_Value, char _Mask);
-unsigned char _interlockedbittestandreset(long volatile *, long);
-static __inline__
-unsigned char _interlockedbittestandset(long volatile *, long);
-static __inline__
-long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
-                                         long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
-long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
-static __inline__
-short _InterlockedCompareExchange16(short volatile *_Destination,
-                                    short _Exchange, short _Comparand);
-static __inline__
-__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
-                                      __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
-                                                 __int64);
-static __inline__
-char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
-                                  char _Comparand);
-void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
-                                                    void *);
-void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
-                                                    void *);
-static __inline__
-long __cdecl _InterlockedDecrement(long volatile *_Addend);
-static __inline__
-short _InterlockedDecrement16(short volatile *_Addend);
-long _InterlockedExchange(long volatile *_Target, long _Value);
-static __inline__
-short _InterlockedExchange16(short volatile *_Target, short _Value);
-static __inline__
-char _InterlockedExchange8(char volatile *_Target, char _Value);
-static __inline__
-long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
-long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
-static __inline__
-short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
-__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
-__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
-static __inline__
-char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
-static __inline__
-long __cdecl _InterlockedIncrement(long volatile *_Addend);
-static __inline__
-short _InterlockedIncrement16(short volatile *_Addend);
-static __inline__
-long _InterlockedOr(long volatile *_Value, long _Mask);
-static __inline__
-short _InterlockedOr16(short volatile *_Value, short _Mask);
-static __inline__
-char _InterlockedOr8(char volatile *_Value, char _Mask);
-static __inline__
-long _InterlockedXor(long volatile *_Value, long _Mask);
-static __inline__
-short _InterlockedXor16(short volatile *_Value, short _Mask);
-static __inline__
-char _InterlockedXor8(char volatile *_Value, char _Mask);
-void __cdecl _invpcid(unsigned int, void *);
-static __inline__
-unsigned long __cdecl _lrotl(unsigned long, int);
-static __inline__
-unsigned long __cdecl _lrotr(unsigned long, int);
-static __inline__
-static __inline__
-void _ReadBarrier(void);
-static __inline__
-void _ReadWriteBarrier(void);
-static __inline__
-void *_ReturnAddress(void);
-unsigned int _rorx_u32(unsigned int, const unsigned int);
-static __inline__
-unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
-static __inline__
-unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
-static __inline__
-unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
-static __inline__
-unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
-static __inline__
-unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
-static __inline__
-unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
-static __inline__
-unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
-static __inline__
-unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
-int _sarx_i32(int, unsigned int);
-#if __STDC_HOSTED__
-int __cdecl _setjmp(jmp_buf);
-#endif
-unsigned int _shlx_u32(unsigned int, unsigned int);
-unsigned int _shrx_u32(unsigned int, unsigned int);
-void _Store_HLERelease(long volatile *, long);
-void _Store64_HLERelease(__int64 volatile *, __int64);
-void _StorePointer_HLERelease(void *volatile *, void *);
-static __inline__
-void _WriteBarrier(void);
-unsigned __int32 xbegin(void);
-void _xend(void);
-static __inline__
-#define _XCR_XFEATURE_ENABLED_MASK 0
-unsigned __int64 __cdecl _xgetbv(unsigned int);
-void __cdecl _xsetbv(unsigned int, unsigned __int64);
-
-/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
-#ifdef __x86_64__
-void __addgsbyte(unsigned long, unsigned char);
-void __addgsdword(unsigned long, unsigned long);
-void __addgsqword(unsigned long, unsigned __int64);
-void __addgsword(unsigned long, unsigned short);
-static __inline__
-void __faststorefence(void);
-void __incgsbyte(unsigned long);
-void __incgsdword(unsigned long);
-void __incgsqword(unsigned long);
-void __incgsword(unsigned long);
-unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
-void __lwpval64(unsigned __int64, unsigned int, unsigned int);
-unsigned __int64 __lzcnt64(unsigned __int64);
-static __inline__
-void __movsq(unsigned long long *, unsigned long long const *, size_t);
-__int64 __mulh(__int64, __int64);
-static __inline__
-unsigned __int64 __popcnt64(unsigned __int64);
-static __inline__
-unsigned char __readgsbyte(unsigned long);
-static __inline__
-unsigned long __readgsdword(unsigned long);
-static __inline__
-unsigned __int64 __readgsqword(unsigned long);
-unsigned short __readgsword(unsigned long);
-unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
-                                unsigned __int64 _HighPart,
-                                unsigned char _Shift);
-unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
-                                 unsigned __int64 _HighPart,
-                                 unsigned char _Shift);
-static __inline__
-void __stosq(unsigned __int64 *, unsigned __int64, size_t);
-unsigned char __vmx_on(unsigned __int64 *);
-unsigned char __vmx_vmclear(unsigned __int64 *);
-unsigned char __vmx_vmlaunch(void);
-unsigned char __vmx_vmptrld(unsigned __int64 *);
-unsigned char __vmx_vmread(size_t, size_t *);
-unsigned char __vmx_vmresume(void);
-unsigned char __vmx_vmwrite(size_t, size_t);
-void __writegsbyte(unsigned long, unsigned char);
-void __writegsdword(unsigned long, unsigned long);
-void __writegsqword(unsigned long, unsigned __int64);
-void __writegsword(unsigned long, unsigned short);
-static __inline__
-unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
-unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
-unsigned char _bittest64(__int64 const *, __int64);
-static __inline__
-unsigned char _bittestandcomplement64(__int64 *, __int64);
-static __inline__
-unsigned char _bittestandreset64(__int64 *, __int64);
-static __inline__
-unsigned char _bittestandset64(__int64 *, __int64);
-unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
-long _InterlockedAnd_np(long volatile *_Value, long _Mask);
-short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
-__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
-char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
-unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
-static __inline__
-unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
-long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
-                                    long _Comparand);
-unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
-                                             __int64 _ExchangeHigh,
-                                             __int64 _ExchangeLow,
-                                             __int64 *_CompareandResult);
-unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
-                                                __int64 _ExchangeHigh,
-                                                __int64 _ExchangeLow,
-                                                __int64 *_ComparandResult);
-short _InterlockedCompareExchange16_np(short volatile *_Destination,
-                                       short _Exchange, short _Comparand);
-__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
-                                         __int64 _Exchange, __int64 _Comparand);
-void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
-                                         void *_Exchange, void *_Comparand);
-void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
-                                            void *_Exchange, void *_Comparand);
-static __inline__
-__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-static __inline__
-__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-static __inline__
-__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
-static __inline__
-__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-long _InterlockedOr_np(long volatile *_Value, long _Mask);
-short _InterlockedOr16_np(short volatile *_Value, short _Mask);
-static __inline__
-__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
-char _InterlockedOr8_np(char volatile *_Value, char _Mask);
-long _InterlockedXor_np(long volatile *_Value, long _Mask);
-short _InterlockedXor16_np(short volatile *_Value, short _Mask);
-static __inline__
-__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
-char _InterlockedXor8_np(char volatile *_Value, char _Mask);
-static __inline__
-__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
-                __int64 *_HighProduct);
-unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
-__int64 _sarx_i64(__int64, unsigned int);
-#if __STDC_HOSTED__
-int __cdecl _setjmpex(jmp_buf);
-#endif
-unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
-unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
-/*
- * Multiply two 64-bit integers and obtain a 64-bit result.
- * The low-half is returned directly and the high half is in an out parameter.
- */
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
-         unsigned __int64 *_HighProduct) {
-  unsigned __int128 _FullProduct =
-      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
-  *_HighProduct = _FullProduct >> 64;
-  return _FullProduct;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
-  unsigned __int128 _FullProduct =
-      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
-  return _FullProduct >> 64;
-}
-
-#endif /* __x86_64__ */
-
-/*----------------------------------------------------------------------------*\
-|* Multiplication
-\*----------------------------------------------------------------------------*/
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-__emul(int __in1, int __in2) {
-  return (__int64)__in1 * (__int64)__in2;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__emulu(unsigned int __in1, unsigned int __in2) {
-  return (unsigned __int64)__in1 * (unsigned __int64)__in2;
-}
-/*----------------------------------------------------------------------------*\
-|* Bit Twiddling
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotl8(unsigned char _Value, unsigned char _Shift) {
-  _Shift &= 0x7;
-  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotr8(unsigned char _Value, unsigned char _Shift) {
-  _Shift &= 0x7;
-  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotl16(unsigned short _Value, unsigned char _Shift) {
-  _Shift &= 0xf;
-  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotr16(unsigned short _Value, unsigned char _Shift) {
-  _Shift &= 0xf;
-  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotl(unsigned int _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotr(unsigned int _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotl(unsigned long _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotr(unsigned long _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotl64(unsigned __int64 _Value, int _Shift) {
-  _Shift &= 0x3f;
-  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotr64(unsigned __int64 _Value, int _Shift) {
-  _Shift &= 0x3f;
-  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
-}
-/*----------------------------------------------------------------------------*\
-|* Bit Counting and Testing
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = __builtin_ctzl(_Mask);
-  return 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = 31 - __builtin_clzl(_Mask);
-  return 1;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__popcnt16(unsigned short _Value) {
-  return __builtin_popcount((int)_Value);
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__popcnt(unsigned int _Value) {
-  return __builtin_popcount(_Value);
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittest(long const *_BitBase, long _BitPos) {
-  return (*_BitBase >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandcomplement(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase ^ (1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandreset(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase & ~(1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandset(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase | (1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
-  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
-  return (_PrevVal >> _BitPos) & 1;
-}
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = __builtin_ctzll(_Mask);
-  return 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = 63 - __builtin_clzll(_Mask);
-  return 1;
-}
-static __inline__
-unsigned __int64 __DEFAULT_FN_ATTRS
-__popcnt64(unsigned __int64 _Value) {
-  return __builtin_popcountll(_Value);
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
-  return (*_BitBase >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase ^ (1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase & ~(1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase | (1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
-  long long _PrevVal =
-      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
-  return (_PrevVal >> _BitPos) & 1;
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Add
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
-  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
-  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
-  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Sub
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Increment
-\*----------------------------------------------------------------------------*/
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedIncrement16(short volatile *_Value) {
-  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedIncrement64(__int64 volatile *_Value) {
-  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Decrement
-\*----------------------------------------------------------------------------*/
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedDecrement16(short volatile *_Value) {
-  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedDecrement64(__int64 volatile *_Value) {
-  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked And
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedAnd8(char volatile *_Value, char _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedAnd16(short volatile *_Value, short _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedAnd(long volatile *_Value, long _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Or
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedOr8(char volatile *_Value, char _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedOr16(short volatile *_Value, short _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedOr(long volatile *_Value, long _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Xor
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedXor8(char volatile *_Value, char _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedXor16(short volatile *_Value, short _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedXor(long volatile *_Value, long _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchange8(char volatile *_Target, char _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
-  return _Value;
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchange16(short volatile *_Target, short _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
-  return _Value;
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
-  return _Value;
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Compare Exchange
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange8(char volatile *_Destination,
-                             char _Exchange, char _Comparand) {
-  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
-                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-  return _Comparand;
-}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange16(short volatile *_Destination,
-                              short _Exchange, short _Comparand) {
-  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
-                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-  return _Comparand;
-}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand) {
-  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
-                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-  return _Comparand;
-}
-/*----------------------------------------------------------------------------*\
-|* Barriers
-\*----------------------------------------------------------------------------*/
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadWriteBarrier(void) {
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadBarrier(void) {
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_WriteBarrier(void) {
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__faststorefence(void) {
-  __atomic_thread_fence(__ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* readfs, readgs
-|* (Pointers in address space #256 and #257 are relative to the GS and FS
-|* segment registers, respectively.)
-\*----------------------------------------------------------------------------*/
-#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
-    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
-    (__offset))
-
-#ifdef __i386__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readfsbyte(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned char, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readfsqword(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readfsword(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned short, __offset);
-}
-#endif
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readgsbyte(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned char, __offset);
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-__readgsdword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned long, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readgsqword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readgsword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned short, __offset);
-}
-#endif
-#undef __ptr_to_addr_space
-/*----------------------------------------------------------------------------*\
-|* movs, stos
-\*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
-  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
-  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
-  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
-  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
-  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
-  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
-}
-#endif
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
-  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
-  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
-}
-#endif
-
-/*----------------------------------------------------------------------------*\
-|* Misc
-\*----------------------------------------------------------------------------*/
-static __inline__ void * __DEFAULT_FN_ATTRS
-_AddressOfReturnAddress(void) {
-  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
-}
-static __inline__ void * __DEFAULT_FN_ATTRS
-_ReturnAddress(void) {
-  return __builtin_return_address(0);
-}
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__cpuid(int __info[4], int __level) {
-  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
-                   : "a"(__level));
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__cpuidex(int __info[4], int __level, int __ecx) {
-  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
-                   : "a"(__level), "c"(__ecx));
-}
-static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
-_xgetbv(unsigned int __xcr_no) {
-  unsigned int __eax, __edx;
-  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
-  return ((unsigned __int64)__edx << 32) | __eax;
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__halt(void) {
-  __asm__ volatile ("hlt");
-}
-#endif
-
-/*----------------------------------------------------------------------------*\
-|* Privileged intrinsics
-\*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readmsr(unsigned long __register) {
-  // Loads the contents of a 64-bit model specific register (MSR) specified in
-  // the ECX register into registers EDX:EAX. The EDX register is loaded with
-  // the high-order 32 bits of the MSR and the EAX register is loaded with the
-  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
-  // read, the values returned to EDX:EAX in unimplemented bit locations are
-  // undefined.
-  unsigned long __edx;
-  unsigned long __eax;
-  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
-  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
-}
-
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-__readcr3(void) {
-  unsigned long __cr3_val;
-  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
-  return __cr3_val;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-__writecr3(unsigned int __cr3_val) {
-  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
-}
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __INTRIN_H */
-#endif /* _MSC_VER */
diff --git a/lib/Headers/Makefile b/lib/Headers/Makefile
deleted file mode 100644
index 903acac..0000000
--- a/lib/Headers/Makefile
+++ /dev/null
@@ -1,64 +0,0 @@
-##===- clang/lib/Headers/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-
-BUILT_SOURCES = arm_neon.h.inc
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-CLANG_VERSION := $(word 3,$(shell grep "CLANG_VERSION " \
-	$(PROJ_OBJ_DIR)/$(CLANG_LEVEL)/include/clang/Basic/Version.inc))
-
-HeaderDir := $(PROJ_OBJ_ROOT)/$(BuildMode)/lib/clang/$(CLANG_VERSION)/include
-
-HEADERS := $(notdir $(wildcard $(PROJ_SRC_DIR)/*.h))
-
-OBJHEADERS := $(addprefix $(HeaderDir)/, $(HEADERS))
-
-
-$(OBJHEADERS): $(HeaderDir)/%.h: $(PROJ_SRC_DIR)/%.h $(HeaderDir)/.dir $(HeaderDir)/arm_neon.h
-	$(Verb) cp $< $@
-	$(Echo) Copying $(notdir $<) to build dir
-
-$(HeaderDir)/arm_neon.h: $(BUILT_SOURCES) $(HeaderDir)/.dir
-	$(Verb) cp $< $@
-	$(Echo) Copying $(notdir $<) to build dir
-
-$(HeaderDir)/module.modulemap: $(PROJ_SRC_DIR)/module.modulemap $(HeaderDir)/.dir
-	$(Verb) cp $< $@
-	$(Echo) Copying $(notdir $<) to build dir
-
-
-# Hook into the standard Makefile rules.
-all-local:: $(OBJHEADERS) $(HeaderDir)/module.modulemap
-
-PROJ_headers := $(DESTDIR)$(PROJ_prefix)/lib/clang/$(CLANG_VERSION)/include
-
-INSTHEADERS := $(addprefix $(PROJ_headers)/, $(HEADERS))
-INSTHEADERS += $(PROJ_headers)/arm_neon.h
-
-$(PROJ_headers):
-	$(Verb) $(MKDIR) $@
-
-$(INSTHEADERS): $(PROJ_headers)/%.h: $(HeaderDir)/%.h | $(PROJ_headers)
-	$(Verb) $(DataInstall) $< $(PROJ_headers)
-	$(Echo) Installing compiler include file: $(notdir $<)
-
-$(PROJ_headers)/module.modulemap: $(HeaderDir)/module.modulemap | $(PROJ_headers)
-	$(Verb) $(DataInstall) $< $(PROJ_headers)
-	$(Echo) Installing compiler module map file: $(notdir $<)
-
-
-install-local:: $(INSTHEADERS) $(PROJ_headers)/module.modulemap
-
-$(ObjDir)/arm_neon.h.inc.tmp : $(CLANG_LEVEL)/include/clang/Basic/arm_neon.td $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang arm_neon.h.inc with tblgen"
-	$(Verb) $(ClangTableGen) -gen-arm-neon -o $(call SYSPATH, $@) $<
diff --git a/lib/Headers/__clang_cuda_cmath.h b/lib/Headers/__clang_cuda_cmath.h
new file mode 100644
index 0000000..ae7ff2f
--- /dev/null
+++ b/lib/Headers/__clang_cuda_cmath.h
@@ -0,0 +1,148 @@
+/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_CMATH_H__
+#define __CLANG_CUDA_CMATH_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// CUDA lets us use various std math functions on the device side.  This file
+// works in concert with __clang_cuda_math_forward_declares.h to make this work.
+//
+// Specifically, the forward-declares header declares __device__ overloads for
+// these functions in the global namespace, then pulls them into namespace std
+// with 'using' statements.  Then this file implements those functions, after
+// the implementations have been pulled in.
+//
+// It's important that we declare the functions in the global namespace and pull
+// them into namespace std with using statements, as opposed to simply declaring
+// these functions in namespace std, because our device functions need to
+// overload the standard library functions, which may be declared in the global
+// namespace or in std, depending on the degree of conformance of the stdlib
+// implementation.  Declaring in the global namespace and pulling into namespace
+// std covers all of the known knowns.
+
+#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
+
+__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
+__DEVICE__ float cos(float __x) { return ::cosf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float exp(float __x) { return ::expf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ float floor(float __x) { return ::floorf(__x); }
+__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
+__DEVICE__ int fpclassify(float __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ int fpclassify(double __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ float frexp(float __arg, int *__exp) {
+  return ::frexpf(__arg, __exp);
+}
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
+__DEVICE__ bool isgreater(float __x, float __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreater(double __x, double __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(float __x, float __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(double __x, double __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isless(float __x, float __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool isless(double __x, double __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool islessequal(float __x, float __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessequal(double __x, double __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessgreater(float __x, float __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool islessgreater(double __x, double __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isunordered(float __x, float __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ bool isunordered(double __x, double __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float log(float __x) { return ::logf(__x); }
+__DEVICE__ float log10(float __x) { return ::log10f(__x); }
+__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
+__DEVICE__ float nexttoward(float __from, float __to) {
+  return __builtin_nexttowardf(__from, __to);
+}
+__DEVICE__ double nexttoward(double __from, double __to) {
+  return __builtin_nexttoward(__from, __to);
+}
+__DEVICE__ float pow(float __base, float __exp) {
+  return ::powf(__base, __exp);
+}
+__DEVICE__ float pow(float __base, int __iexp) {
+  return ::powif(__base, __iexp);
+}
+__DEVICE__ double pow(double __base, int __iexp) {
+  return ::powi(__base, __iexp);
+}
+__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+__DEVICE__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+
+#undef __DEVICE__
+
+#endif
diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h
new file mode 100644
index 0000000..3df41fa
--- /dev/null
+++ b/lib/Headers/__clang_cuda_intrinsics.h
@@ -0,0 +1,322 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
+  inline __device__ int __FnName(int __in, int __offset,                       \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__in, __offset,                                      \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  } /* int: maps directly onto the integer intrinsic */                        \
+  inline __device__ float __FnName(float __in, int __offset,                   \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__in, __offset,                                    \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  } /* float: maps directly onto the float intrinsic */                        \
+  inline __device__ unsigned int __FnName(unsigned int __in, int __offset,     \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__in), __offset, __width));                \
+  } /* unsigned int: reuses the int overload via casts */                      \
+  inline __device__ long long __FnName(long long __in, int __offset,           \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__in) == sizeof(__Bits));                            \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__in, sizeof(__in)); /* copy __in into __tmp (dst, src) */ \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __out;                                                           \
+    memcpy(&__out, &__tmp, sizeof(__tmp));                                     \
+    return __out;                                                              \
+  } /* long long: shuffled as two independent 32-bit halves */                 \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __in, int __offset, int __width = warpSize) {         \
+    return static_cast<unsigned long long>(                                    \
+        ::__FnName(static_cast<long long>(__in), __offset, __width));          \
+  } /* unsigned long long: reuses the long long overload */                    \
+  inline __device__ double __FnName(double __in, int __offset,                 \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__in));                             \
+    memcpy(&__tmp, &__in, sizeof(__in));                                       \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __out;                                                              \
+    memcpy(&__out, &__tmp, sizeof(__out));                                     \
+    return __out;                                                              \
+  } /* double: shuffled through its long long bit pattern */
+
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to char2's alignment.  This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these.  Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // !defined(__CLANG_CUDA_INTRINSICS_H__)
diff --git a/lib/Headers/__clang_cuda_math_forward_declares.h b/lib/Headers/__clang_cuda_math_forward_declares.h
new file mode 100644
index 0000000..3f2834d
--- /dev/null
+++ b/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -0,0 +1,263 @@
+/*===- __clang_cuda_math_forward_declares.h - __device__ math fn prototypes -===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// This file forward-declares some math functions we (or the CUDA headers)
+// will define later.  We need to do this, and do it before cmath is included,
+// because the standard library may have constexpr math functions.  In the
+// absence of a prior __device__ decl, those constexpr functions may become
+// implicitly host+device.  host+device functions can't be overloaded, so that
+// would preclude the use of our own __device__ overloads for these functions.
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__                                                             \
+  static __inline__ __attribute__((always_inline)) __attribute__((device))
+
+__DEVICE__ double abs(double);
+__DEVICE__ float abs(float);
+__DEVICE__ int abs(int);
+__DEVICE__ long abs(long);
+__DEVICE__ long long abs(long long);
+__DEVICE__ double acos(double);
+__DEVICE__ float acos(float);
+__DEVICE__ double acosh(double);
+__DEVICE__ float acosh(float);
+__DEVICE__ double asin(double);
+__DEVICE__ float asin(float);
+__DEVICE__ double asinh(double);
+__DEVICE__ float asinh(float);
+__DEVICE__ double atan2(double, double);
+__DEVICE__ float atan2(float, float);
+__DEVICE__ double atan(double);
+__DEVICE__ float atan(float);
+__DEVICE__ double atanh(double);
+__DEVICE__ float atanh(float);
+__DEVICE__ double cbrt(double);
+__DEVICE__ float cbrt(float);
+__DEVICE__ double ceil(double);
+__DEVICE__ float ceil(float);
+__DEVICE__ double copysign(double, double);
+__DEVICE__ float copysign(float, float);
+__DEVICE__ double cos(double);
+__DEVICE__ float cos(float);
+__DEVICE__ double cosh(double);
+__DEVICE__ float cosh(float);
+__DEVICE__ double erfc(double);
+__DEVICE__ float erfc(float);
+__DEVICE__ double erf(double);
+__DEVICE__ float erf(float);
+__DEVICE__ double exp2(double);
+__DEVICE__ float exp2(float);
+__DEVICE__ double exp(double);
+__DEVICE__ float exp(float);
+__DEVICE__ double expm1(double);
+__DEVICE__ float expm1(float);
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
+__DEVICE__ double fdim(double, double);
+__DEVICE__ float fdim(float, float);
+__DEVICE__ double floor(double);
+__DEVICE__ float floor(float);
+__DEVICE__ double fma(double, double, double);
+__DEVICE__ float fma(float, float, float);
+__DEVICE__ double fmax(double, double);
+__DEVICE__ float fmax(float, float);
+__DEVICE__ double fmin(double, double);
+__DEVICE__ float fmin(float, float);
+__DEVICE__ double fmod(double, double);
+__DEVICE__ float fmod(float, float);
+__DEVICE__ int fpclassify(double);
+__DEVICE__ int fpclassify(float);
+__DEVICE__ double frexp(double, int *);
+__DEVICE__ float frexp(float, int *);
+__DEVICE__ double hypot(double, double);
+__DEVICE__ float hypot(float, float);
+__DEVICE__ int ilogb(double);
+__DEVICE__ int ilogb(float);
+__DEVICE__ bool isfinite(double);
+__DEVICE__ bool isfinite(float);
+__DEVICE__ bool isgreater(double, double);
+__DEVICE__ bool isgreaterequal(double, double);
+__DEVICE__ bool isgreaterequal(float, float);
+__DEVICE__ bool isgreater(float, float);
+__DEVICE__ bool isinf(double);
+__DEVICE__ bool isinf(float);
+__DEVICE__ bool isless(double, double);
+__DEVICE__ bool islessequal(double, double);
+__DEVICE__ bool islessequal(float, float);
+__DEVICE__ bool isless(float, float);
+__DEVICE__ bool islessgreater(double, double);
+__DEVICE__ bool islessgreater(float, float);
+__DEVICE__ bool isnan(double);
+__DEVICE__ bool isnan(float);
+__DEVICE__ bool isnormal(double);
+__DEVICE__ bool isnormal(float);
+__DEVICE__ bool isunordered(double, double);
+__DEVICE__ bool isunordered(float, float);
+__DEVICE__ long labs(long);
+__DEVICE__ double ldexp(double, int);
+__DEVICE__ float ldexp(float, int);
+__DEVICE__ double lgamma(double);
+__DEVICE__ float lgamma(float);
+__DEVICE__ long long llabs(long long);
+__DEVICE__ long long llrint(double);
+__DEVICE__ long long llrint(float);
+__DEVICE__ double log10(double);
+__DEVICE__ float log10(float);
+__DEVICE__ double log1p(double);
+__DEVICE__ float log1p(float);
+__DEVICE__ double log2(double);
+__DEVICE__ float log2(float);
+__DEVICE__ double logb(double);
+__DEVICE__ float logb(float);
+__DEVICE__ double log(double);
+__DEVICE__ float log(float);
+__DEVICE__ long lrint(double);
+__DEVICE__ long lrint(float);
+__DEVICE__ long lround(double);
+__DEVICE__ long lround(float);
+__DEVICE__ double modf(double, double *);
+__DEVICE__ float modf(float, float *);
+__DEVICE__ double nan(const char *);
+__DEVICE__ float nanf(const char *);
+__DEVICE__ double nearbyint(double);
+__DEVICE__ float nearbyint(float);
+__DEVICE__ double nextafter(double, double);
+__DEVICE__ float nextafter(float, float);
+__DEVICE__ double nexttoward(double, double);
+__DEVICE__ float nexttoward(float, float);
+__DEVICE__ double pow(double, double);
+__DEVICE__ double pow(double, int);
+__DEVICE__ float pow(float, float);
+__DEVICE__ float pow(float, int);
+__DEVICE__ double remainder(double, double);
+__DEVICE__ float remainder(float, float);
+__DEVICE__ double remquo(double, double, int *);
+__DEVICE__ float remquo(float, float, int *);
+__DEVICE__ double rint(double);
+__DEVICE__ float rint(float);
+__DEVICE__ double round(double);
+__DEVICE__ float round(float);
+__DEVICE__ double scalbln(double, long);
+__DEVICE__ float scalbln(float, long);
+__DEVICE__ double scalbn(double, int);
+__DEVICE__ float scalbn(float, int);
+__DEVICE__ bool signbit(double);
+__DEVICE__ bool signbit(float);
+__DEVICE__ double sin(double);
+__DEVICE__ float sin(float);
+__DEVICE__ double sinh(double);
+__DEVICE__ float sinh(float);
+__DEVICE__ double sqrt(double);
+__DEVICE__ float sqrt(float);
+__DEVICE__ double tan(double);
+__DEVICE__ float tan(float);
+__DEVICE__ double tanh(double);
+__DEVICE__ float tanh(float);
+__DEVICE__ double tgamma(double);
+__DEVICE__ float tgamma(float);
+__DEVICE__ double trunc(double);
+__DEVICE__ float trunc(float);
+
+namespace std {
+using ::abs;
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::labs;
+using ::ldexp;
+using ::lgamma;
+using ::llabs;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+} // namespace std
+
+#pragma pop_macro("__DEVICE__")
+
+#endif
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h
index 8e5f033..05a85fa 100644
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -42,10 +42,14 @@
 
 #if defined(__CUDA__) && defined(__clang__)
 
+// Include some forward declares that must come before cmath.
+#include <__clang_cuda_math_forward_declares.h>
+
 // Include some standard headers to avoid CUDA headers including them
 // while some required macros (like __THROW) are in a weird state.
-#include <stdlib.h>
 #include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
 
 // Preserve common macros that will be changed below by us or by CUDA
 // headers.
@@ -79,17 +83,15 @@
 // definitions from .hpp files.
 #define __DEVICE_FUNCTIONS_H__
 #define __MATH_FUNCTIONS_H__
+#define __COMMON_FUNCTIONS_H__
 
 #undef __CUDACC__
 #define __CUDABE__
 // Disables definitions of device-side runtime support stubs in
 // cuda_device_runtime_api.h
-#define __CUDADEVRT_INTERNAL__
+#include "driver_types.h"
 #include "host_config.h"
 #include "host_defines.h"
-#include "driver_types.h"
-#include "common_functions.h"
-#undef __CUDADEVRT_INTERNAL__
 
 #undef __CUDABE__
 #define __CUDACC__
@@ -100,11 +102,11 @@
 
 // CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
 // not have at the moment. Emulate them with a builtin memcpy/memset.
-#define __nvvm_memcpy(s,d,n,a) __builtin_memcpy(s,d,n)
-#define __nvvm_memset(d,c,n,a) __builtin_memset(d,c,n)
+#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
+#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
 
-#include "crt/host_runtime.h"
 #include "crt/device_runtime.h"
+#include "crt/host_runtime.h"
 // device_runtime.h defines __cxa_* macros that will conflict with
 // cxxabi.h.
 // FIXME: redefine these as __device__ functions.
@@ -140,7 +142,20 @@
 #pragma push_macro("__forceinline__")
 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
 #include "device_functions.hpp"
+
+// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__
+#endif
 #include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
 #include "math_functions_dbl_ptx3.hpp"
 #pragma pop_macro("__forceinline__")
 
@@ -152,21 +167,21 @@
 // Alas, additional overloads for these functions are hard to get to.
 // Considering that we only need these overloads for a few functions,
 // we can provide them here.
-static inline float rsqrt(float a) { return rsqrtf(a); }
-static inline float rcbrt(float a) { return rcbrtf(a); }
-static inline float sinpi(float a) { return sinpif(a); }
-static inline float cospi(float a) { return cospif(a); }
-static inline void sincospi(float a, float *b, float *c) {
-  return sincospi(a, b, c);
+static inline float rsqrt(float __a) { return rsqrtf(__a); }
+static inline float rcbrt(float __a) { return rcbrtf(__a); }
+static inline float sinpi(float __a) { return sinpif(__a); }
+static inline float cospi(float __a) { return cospif(__a); }
+static inline void sincospi(float __a, float *__b, float *__c) {
+  return sincospif(__a, __b, __c);
 }
-static inline float erfcinv(float a) { return erfcinvf(a); }
-static inline float normcdfinv(float a) { return normcdfinvf(a); }
-static inline float normcdf(float a) { return normcdff(a); }
-static inline float erfcx(float a) { return erfcxf(a); }
+static inline float erfcinv(float __a) { return erfcinvf(__a); }
+static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
+static inline float normcdf(float __a) { return normcdff(__a); }
+static inline float erfcx(float __a) { return erfcxf(__a); }
 
 // For some reason single-argument variant is not always declared by
 // CUDA headers. Alas, device_functions.hpp included below needs it.
-static inline __device__ void __brkpt(int c) { __brkpt(); }
+static inline __device__ void __brkpt(int __c) { __brkpt(); }
 
 // Now include *.hpp with definitions of various GPU functions.  Alas,
 // a lot of thins get declared/defined with __host__ attribute which
@@ -178,17 +193,34 @@
 #undef __CUDABE__
 #define __CUDACC__
 #undef __DEVICE_FUNCTIONS_HPP__
-#include "device_functions.hpp"
 #include "device_atomic_functions.hpp"
+#include "device_functions.hpp"
 #include "sm_20_atomic_functions.hpp"
-#include "sm_32_atomic_functions.hpp"
 #include "sm_20_intrinsics.hpp"
-// sm_30_intrinsics.h has declarations that use default argument, so
-// we have to include it and it will in turn include .hpp
-#include "sm_30_intrinsics.h"
-#include "sm_32_intrinsics.hpp"
+#include "sm_32_atomic_functions.hpp"
+
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions.  In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
+
 #undef __MATH_FUNCTIONS_HPP__
+
+// math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
+// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
+// math_functions.hpp's ::signbit.  It's guarded by #undef signbit, but that's
+// conditional on __GNUC__.  :)
+#pragma push_macro("signbit")
+#pragma push_macro("__GNUC__")
+#undef __GNUC__
+#define signbit __ignored_cuda_signbit
 #include "math_functions.hpp"
+#pragma pop_macro("__GNUC__")
+#pragma pop_macro("signbit")
+
 #pragma pop_macro("__host__")
 
 #include "texture_indirect_functions.h"
@@ -200,17 +232,103 @@
 // Set up compiler macros expected to be seen during compilation.
 #undef __CUDABE__
 #define __CUDACC__
-#define __NVCC__
 
-#if defined(__CUDA_ARCH__)
-// We need to emit IR declaration for non-existing __nvvm_reflect() to
-// let backend know that it should be treated as const nothrow
-// function which is what NVVMReflect pass expects to see.
-extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
-static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
-  return __nvvm_reflect("NONE");
+extern "C" {
+// Device-side CUDA system calls.
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
+// We need these declarations and wrappers for device-side
+// malloc/free/printf calls to work without relying on
+// -fcuda-disable-target-call-checks option.
+__device__ int vprintf(const char *, const char *);
+__device__ void free(void *) __attribute((nothrow));
+__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
+__device__ void __assertfail(const char *__message, const char *__file,
+                             unsigned __line, const char *__function,
+                             size_t __charSize) __attribute__((noreturn));
+
+// In order for standard assert() macro on linux to work we need to
+// provide device-side __assert_fail()
+__device__ static inline void __assert_fail(const char *__message,
+                                            const char *__file, unsigned __line,
+                                            const char *__function) {
+  __assertfail(__message, __file, __line, __function, sizeof(char));
 }
+
+// Clang will convert printf into vprintf, but we still need
+// device-side declaration for it.
+__device__ int printf(const char *, ...);
+} // extern "C"
+
+// We also need device-side std::malloc and std::free.
+namespace std {
+__device__ static inline void free(void *__ptr) { ::free(__ptr); }
+__device__ static inline void *malloc(size_t __size) {
+  return ::malloc(__size);
+}
+} // namespace std
+
+// Out-of-line implementations from cuda_builtin_vars.h.  These need to come
+// after we've pulled in the definition of uint3 and dim3.
+
+__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
+  uint3 __ret; // copy of x/y/z; reserved name avoids user-macro clashes
+  __ret.x = x;
+  __ret.y = y;
+  __ret.z = z;
+  return __ret;
+}
+
+__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
+  uint3 __ret; // copy of x/y/z; reserved name avoids user-macro clashes
+  __ret.x = x;
+  __ret.y = y;
+  __ret.z = z;
+  return __ret;
+}
+
+__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+#include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>
+
+// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
+// mode, giving them their "proper" types of dim3 and uint3.  This is
+// incompatible with the types we give in cuda_builtin_vars.h.  As a hack,
+// force-include the header (nvcc doesn't include it by default) but redefine
+// dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are only
+// used here for the redeclarations of blockDim and threadIdx.)
+#pragma push_macro("dim3")
+#pragma push_macro("uint3")
+#define dim3 __cuda_builtin_blockDim_t
+#define uint3 __cuda_builtin_threadIdx_t
+#include "curand_mtgp32_kernel.h"
+#pragma pop_macro("dim3")
+#pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
+
+// Device overrides for placement new and delete.
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
 #endif
 
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+#pragma pop_macro("CUDA_NOEXCEPT")
+
 #endif // __CUDA__
 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/lib/Headers/__wmmintrin_aes.h b/lib/Headers/__wmmintrin_aes.h
index 100799e..211518e 100644
--- a/lib/Headers/__wmmintrin_aes.h
+++ b/lib/Headers/__wmmintrin_aes.h
@@ -28,36 +28,121 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
 
+/// \brief Performs a single round of AES encryption, transforming the state
+///    value from the first source operand using a 128-bit round key value
+///    contained in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VAESENC instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesenc_si128(__m128i __V, __m128i __R)
 {
-  return (__m128i)__builtin_ia32_aesenc128(__V, __R);
+  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
 }
 
+/// \brief Performs the final round of AES encryption, transforming the state
+///    value from the first source operand using a 128-bit round key value
+///    contained in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VAESENCLAST instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesenclast_si128(__m128i __V, __m128i __R)
 {
-  return (__m128i)__builtin_ia32_aesenclast128(__V, __R);
+  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
 }
 
+/// \brief Performs a single round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VAESDEC instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesdec_si128(__m128i __V, __m128i __R)
 {
-  return (__m128i)__builtin_ia32_aesdec128(__V, __R);
+  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
 }
 
+/// \brief Performs the final round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VAESDECLAST instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesdeclast_si128(__m128i __V, __m128i __R)
 {
-  return (__m128i)__builtin_ia32_aesdeclast128(__V, __R);
+  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
 }
 
+/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+///    contained in the source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VAESIMC instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesimc_si128(__m128i __V)
 {
-  return (__m128i)__builtin_ia32_aesimc128(__V);
+  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
 }
 
+/// \brief Generates a round key for AES encryption, operating on 128-bit data
+///    specified in the first source operand and using an 8-bit round constant
+///    specified by the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c AESKEYGENASSIST instruction.
+///
+/// \param C
+///    A 128-bit integer vector that is used to generate the AES encryption key.
+/// \param R
+///    An 8-bit round constant used to generate the AES encryption key.
+/// \returns A 128-bit round key for AES encryption.
 #define _mm_aeskeygenassist_si128(C, R) \
   (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
 
diff --git a/lib/Headers/__wmmintrin_pclmul.h b/lib/Headers/__wmmintrin_pclmul.h
index 68e944e..d4e073f 100644
--- a/lib/Headers/__wmmintrin_pclmul.h
+++ b/lib/Headers/__wmmintrin_pclmul.h
@@ -23,6 +23,34 @@
 #ifndef _WMMINTRIN_PCLMUL_H
 #define _WMMINTRIN_PCLMUL_H
 
+/// \brief Multiplies two 64-bit integer values, which are selected from source
+///    operands using the immediate-value operand. The multiplication is a
+///    carry-less multiplication, and the 128-bit integer product is stored in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPCLMULQDQ instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __Y
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __I
+///    An immediate value specifying which 64-bit values to select from the
+///    operands.
+///    Bit 0 is used to select a value from operand __X,
+///    and bit 4 is used to select a value from operand __Y:
+///    Bit[0]=0 indicates that bits[63:0] of operand __X are used.
+///    Bit[0]=1 indicates that bits[127:64] of operand __X are used.
+///    Bit[4]=0 indicates that bits[63:0] of operand __Y are used.
+///    Bit[4]=1 indicates that bits[127:64] of operand __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+///    multiplication of the selected 64-bit values.
 #define _mm_clmulepi64_si128(__X, __Y, __I) \
   ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
                                         (__v2di)(__m128i)(__Y), (char)(__I)))
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index a5b4f74..74a1914 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -36,67 +36,65 @@
 
 #define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
 
-static vector signed char __ATTRS_o_ai vec_perm(vector signed char __a,
-                                                vector signed char __b,
-                                                vector unsigned char __c);
+static __inline__ vector signed char __ATTRS_o_ai vec_perm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c);
 
-static vector unsigned char __ATTRS_o_ai vec_perm(vector unsigned char __a,
-                                                  vector unsigned char __b,
-                                                  vector unsigned char __c);
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c);
 
-static vector bool char __ATTRS_o_ai vec_perm(vector bool char __a,
-                                              vector bool char __b,
-                                              vector unsigned char __c);
+static __inline__ vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c);
 
-static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
-                                          vector signed short __b,
-                                          vector unsigned char __c);
+static __inline__ vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                                     vector signed short __b,
+                                                     vector unsigned char __c);
 
-static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
-                                                   vector unsigned short __b,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool short __ATTRS_o_ai vec_perm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c);
+
+static __inline__ vector pixel __ATTRS_o_ai vec_perm(vector pixel __a,
+                                                     vector pixel __b,
+                                                     vector unsigned char __c);
+
+static __inline__ vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                                   vector signed int __b,
                                                    vector unsigned char __c);
 
-static vector bool short __ATTRS_o_ai vec_perm(vector bool short __a,
-                                               vector bool short __b,
-                                               vector unsigned char __c);
+static __inline__ vector unsigned int __ATTRS_o_ai vec_perm(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned char __c);
 
-static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
-                                          vector unsigned char __c);
+static __inline__ vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
 
-static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
-                                        vector signed int __b,
-                                        vector unsigned char __c);
-
-static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
-                                                 vector unsigned int __b,
-                                                 vector unsigned char __c);
-
-static vector bool int __ATTRS_o_ai vec_perm(vector bool int __a,
-                                             vector bool int __b,
-                                             vector unsigned char __c);
-
-static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b,
-                                          vector unsigned char __c);
+static __inline__ vector float __ATTRS_o_ai vec_perm(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned char __c);
 
 #ifdef __VSX__
-static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
-                                              vector signed long long __b,
-                                              vector unsigned char __c);
+static __inline__ vector long long __ATTRS_o_ai
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c);
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_perm(vector unsigned long long __a, vector unsigned long long __b,
          vector unsigned char __c);
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_perm(vector bool long long __a, vector bool long long __b,
          vector unsigned char __c);
 
-static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
-                                           vector unsigned char __c);
+static __inline__ vector double __ATTRS_o_ai vec_perm(vector double __a,
+                                                      vector double __b,
+                                                      vector unsigned char __c);
 #endif
 
-static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
-                                                 vector unsigned char __b);
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b);
 
 /* vec_abs */
 
@@ -104,36 +102,41 @@
 #define __builtin_altivec_abs_v8hi vec_abs
 #define __builtin_altivec_abs_v4si vec_abs
 
-static vector signed char __ATTRS_o_ai vec_abs(vector signed char __a) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_abs(vector signed char __a) {
   return __builtin_altivec_vmaxsb(__a, -__a);
 }
 
-static vector signed short __ATTRS_o_ai vec_abs(vector signed short __a) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_abs(vector signed short __a) {
   return __builtin_altivec_vmaxsh(__a, -__a);
 }
 
-static vector signed int __ATTRS_o_ai vec_abs(vector signed int __a) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_abs(vector signed int __a) {
   return __builtin_altivec_vmaxsw(__a, -__a);
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_abs(vector signed long long __a) {
   return __builtin_altivec_vmaxsd(__a, -__a);
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_abs(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_abs(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvabssp(__a);
+#else
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)(0x7FFFFFFF);
   return (vector float)__res;
+#endif
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector double __ATTRS_o_ai vec_abs(vector double __a) {
-  vector unsigned long long __res = { 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF };
-  __res &= (vector unsigned int)__a;
-  return (vector double)__res;
+static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) {
+  return __builtin_vsx_xvabsdp(__a);
 }
 #endif
 
@@ -142,138 +145,146 @@
 #define __builtin_altivec_abss_v8hi vec_abss
 #define __builtin_altivec_abss_v4si vec_abss
 
-static vector signed char __ATTRS_o_ai vec_abss(vector signed char __a) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_abss(vector signed char __a) {
   return __builtin_altivec_vmaxsb(
       __a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
 }
 
-static vector signed short __ATTRS_o_ai vec_abss(vector signed short __a) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_abss(vector signed short __a) {
   return __builtin_altivec_vmaxsh(
       __a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
 }
 
-static vector signed int __ATTRS_o_ai vec_abss(vector signed int __a) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_abss(vector signed int __a) {
   return __builtin_altivec_vmaxsw(
       __a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
 }
 
 /* vec_add */
 
-static vector signed char __ATTRS_o_ai vec_add(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector signed char __b) {
   return __a + __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_add(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a + __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_add(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector bool char __b) {
   return __a + (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_add(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector unsigned char __b) {
   return __a + __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_add(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a + __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_add(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector bool char __b) {
   return __a + (vector unsigned char)__b;
 }
 
-static vector short __ATTRS_o_ai vec_add(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_add(vector short __a,
+                                                    vector short __b) {
   return __a + __b;
 }
 
-static vector short __ATTRS_o_ai vec_add(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_add(vector bool short __a,
+                                                    vector short __b) {
   return (vector short)__a + __b;
 }
 
-static vector short __ATTRS_o_ai vec_add(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_add(vector short __a,
+                                                    vector bool short __b) {
   return __a + (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_add(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector unsigned short __b) {
   return __a + __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_add(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a + __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_add(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector bool short __b) {
   return __a + (vector unsigned short)__b;
 }
 
-static vector int __ATTRS_o_ai vec_add(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_add(vector int __a,
+                                                  vector int __b) {
   return __a + __b;
 }
 
-static vector int __ATTRS_o_ai vec_add(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_add(vector bool int __a,
+                                                  vector int __b) {
   return (vector int)__a + __b;
 }
 
-static vector int __ATTRS_o_ai vec_add(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_add(vector int __a,
+                                                  vector bool int __b) {
   return __a + (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_add(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector unsigned int __b) {
   return __a + __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_add(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a + __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_add(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector bool int __b) {
   return __a + (vector unsigned int)__b;
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_add(vector signed long long __a, vector signed long long __b) {
   return __a + __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_add(vector unsigned long long __a, vector unsigned long long __b) {
   return __a + __b;
 }
 
-static vector signed __int128 __ATTRS_o_ai vec_add(vector signed __int128 __a,
-                                                   vector signed __int128 __b) {
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_add(vector signed __int128 __a, vector signed __int128 __b) {
   return __a + __b;
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __a + __b;
 }
 #endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 
-static vector float __ATTRS_o_ai vec_add(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_add(vector float __a,
+                                                    vector float __b) {
   return __a + __b;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai
-vec_add(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_add(vector double __a,
+                                                     vector double __b) {
   return __a + __b;
 }
 #endif // __VSX__
@@ -281,13 +292,13 @@
 /* vec_adde */
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_adde(vector signed __int128 __a, vector signed __int128 __b,
          vector signed __int128 __c) {
   return __builtin_altivec_vaddeuqm(__a, __b, __c);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
          vector unsigned __int128 __c) {
   return __builtin_altivec_vaddeuqm(__a, __b, __c);
@@ -297,13 +308,13 @@
 /* vec_addec */
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_addec(vector signed __int128 __a, vector signed __int128 __b,
           vector signed __int128 __c) {
   return __builtin_altivec_vaddecuq(__a, __b, __c);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
           vector unsigned __int128 __c) {
   return __builtin_altivec_vaddecuq(__a, __b, __c);
@@ -314,33 +325,33 @@
 
 #define __builtin_altivec_vaddubm vec_vaddubm
 
-static vector signed char __ATTRS_o_ai vec_vaddubm(vector signed char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector signed char __b) {
   return __a + __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vaddubm(vector bool char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a + __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vaddubm(vector signed char __a,
-                                                   vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector bool char __b) {
   return __a + (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector unsigned char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector unsigned char __b) {
   return __a + __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector bool char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a + __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector unsigned char __a,
-                                                     vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector bool char __b) {
   return __a + (vector unsigned char)__b;
 }
 
@@ -348,33 +359,33 @@
 
 #define __builtin_altivec_vadduhm vec_vadduhm
 
-static vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                                        vector short __b) {
   return __a + __b;
 }
 
-static vector short __ATTRS_o_ai vec_vadduhm(vector bool short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector bool short __a,
+                                                        vector short __b) {
   return (vector short)__a + __b;
 }
 
-static vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
-                                             vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                                        vector bool short __b) {
   return __a + (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vadduhm(vector unsigned short __a, vector unsigned short __b) {
   return __a + __b;
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vadduhm(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a + __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vadduhm(vector unsigned short __a,
-                                                      vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector bool short __b) {
   return __a + (vector unsigned short)__b;
 }
 
@@ -382,32 +393,33 @@
 
 #define __builtin_altivec_vadduwm vec_vadduwm
 
-static vector int __ATTRS_o_ai vec_vadduwm(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                                      vector int __b) {
   return __a + __b;
 }
 
-static vector int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
-                                           vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                                      vector int __b) {
   return (vector int)__a + __b;
 }
 
-static vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
-                                           vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                                      vector bool int __b) {
   return __a + (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector unsigned int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector unsigned int __b) {
   return __a + __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a + __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector unsigned int __a,
-                                                    vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector bool int __b) {
   return __a + (vector unsigned int)__b;
 }
 
@@ -415,33 +427,32 @@
 
 #define __builtin_altivec_vaddfp vec_vaddfp
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vaddfp(vector float __a, vector float __b) {
   return __a + __b;
 }
 
 /* vec_addc */
 
-static vector signed int __ATTRS_o_ai vec_addc(vector signed int __a,
-                                               vector signed int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addc(vector signed int __a, vector signed int __b) {
   return (vector signed int)__builtin_altivec_vaddcuw((vector unsigned int)__a,
                                                       (vector unsigned int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_addc(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vaddcuw(__a, __b);
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
   return (vector signed __int128)__builtin_altivec_vaddcuq(
-    (vector unsigned __int128)__a,
-    (vector unsigned __int128)__b);
+      (vector unsigned __int128)__a, (vector unsigned __int128)__b);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __builtin_altivec_vaddcuq(__a, __b);
 }
@@ -449,222 +460,227 @@
 
 /* vec_vaddcuw */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vaddcuw(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vaddcuw(__a, __b);
 }
 
 /* vec_adds */
 
-static vector signed char __ATTRS_o_ai vec_adds(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vaddsbs(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_adds(vector bool char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_adds(vector signed char __a,
-                                                vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_adds(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vaddubs(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_adds(vector bool char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_adds(vector unsigned char __a,
-                                                  vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
 }
 
-static vector short __ATTRS_o_ai vec_adds(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                                     vector short __b) {
   return __builtin_altivec_vaddshs(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_adds(vector bool short __a,
-                                          vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                                     vector short __b) {
   return __builtin_altivec_vaddshs((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_adds(vector short __a,
-                                          vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                                     vector bool short __b) {
   return __builtin_altivec_vaddshs(__a, (vector short)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_adds(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vadduhs(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_adds(vector bool short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_adds(vector unsigned short __a,
-                                                   vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
 }
 
-static vector int __ATTRS_o_ai vec_adds(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector int __a,
+                                                   vector int __b) {
   return __builtin_altivec_vaddsws(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_adds(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector bool int __a,
+                                                   vector int __b) {
   return __builtin_altivec_vaddsws((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_adds(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector int __a,
+                                                   vector bool int __b) {
   return __builtin_altivec_vaddsws(__a, (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_adds(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vadduws(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_adds(vector bool int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_adds(vector unsigned int __a,
-                                                 vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
 }
 
 /* vec_vaddsbs */
 
-static vector signed char __ATTRS_o_ai vec_vaddsbs(vector signed char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vaddsbs(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vaddsbs(vector bool char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vaddsbs(vector signed char __a,
-                                                   vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
 }
 
 /* vec_vaddubs */
 
-static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector unsigned char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vaddubs(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector bool char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector unsigned char __a,
-                                                     vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
 }
 
 /* vec_vaddshs */
 
-static vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                                        vector short __b) {
   return __builtin_altivec_vaddshs(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vaddshs(vector bool short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector bool short __a,
+                                                        vector short __b) {
   return __builtin_altivec_vaddshs((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
-                                             vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                                        vector bool short __b) {
   return __builtin_altivec_vaddshs(__a, (vector short)__b);
 }
 
 /* vec_vadduhs */
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vadduhs(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vadduhs(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vadduhs(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vadduhs(vector unsigned short __a,
-                                                      vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
 }
 
 /* vec_vaddsws */
 
-static vector int __ATTRS_o_ai vec_vaddsws(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                                      vector int __b) {
   return __builtin_altivec_vaddsws(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vaddsws(vector bool int __a,
-                                           vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector bool int __a,
+                                                      vector int __b) {
   return __builtin_altivec_vaddsws((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
-                                           vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                                      vector bool int __b) {
   return __builtin_altivec_vaddsws(__a, (vector int)__b);
 }
 
 /* vec_vadduws */
 
-static vector unsigned int __ATTRS_o_ai vec_vadduws(vector unsigned int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vadduws(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vadduws(vector bool int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vadduws(vector unsigned int __a,
-                                                    vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 /* vec_vadduqm */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b) {
   return __a + __b;
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __a + __b;
 }
 
 /* vec_vaddeuqm */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b,
              vector signed __int128 __c) {
   return __builtin_altivec_vaddeuqm(__a, __b, __c);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
              vector unsigned __int128 __c) {
   return __builtin_altivec_vaddeuqm(__a, __b, __c);
@@ -672,25 +688,25 @@
 
 /* vec_vaddcuq */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b) {
   return __builtin_altivec_vaddcuq(__a, __b);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __builtin_altivec_vaddcuq(__a, __b);
 }
 
 /* vec_vaddecuq */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b,
              vector signed __int128 __c) {
   return __builtin_altivec_vaddecuq(__a, __b, __c);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
              vector unsigned __int128 __c) {
   return __builtin_altivec_vaddecuq(__a, __b, __c);
@@ -701,338 +717,351 @@
 
 #define __builtin_altivec_vand vec_and
 
-static vector signed char __ATTRS_o_ai vec_and(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector signed char __b) {
   return __a & __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_and(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a & __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_and(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector bool char __b) {
   return __a & (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_and(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector unsigned char __b) {
   return __a & __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_and(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a & __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_and(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector bool char __b) {
   return __a & (vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_and(vector bool char __a,
-                                             vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_and(vector bool char __a,
+                                                        vector bool char __b) {
   return __a & __b;
 }
 
-static vector short __ATTRS_o_ai vec_and(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_and(vector short __a,
+                                                    vector short __b) {
   return __a & __b;
 }
 
-static vector short __ATTRS_o_ai vec_and(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_and(vector bool short __a,
+                                                    vector short __b) {
   return (vector short)__a & __b;
 }
 
-static vector short __ATTRS_o_ai vec_and(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_and(vector short __a,
+                                                    vector bool short __b) {
   return __a & (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_and(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector unsigned short __b) {
   return __a & __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_and(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a & __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_and(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector bool short __b) {
   return __a & (vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_and(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_and(vector bool short __a, vector bool short __b) {
   return __a & __b;
 }
 
-static vector int __ATTRS_o_ai vec_and(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_and(vector int __a,
+                                                  vector int __b) {
   return __a & __b;
 }
 
-static vector int __ATTRS_o_ai vec_and(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                  vector int __b) {
   return (vector int)__a & __b;
 }
 
-static vector int __ATTRS_o_ai vec_and(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_and(vector int __a,
+                                                  vector bool int __b) {
   return __a & (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_and(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector unsigned int __b) {
   return __a & __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_and(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a & __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_and(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector bool int __b) {
   return __a & (vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_and(vector bool int __a,
-                                            vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                       vector bool int __b) {
   return __a & __b;
 }
 
-static vector float __ATTRS_o_ai vec_and(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_and(vector float __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_and(vector bool int __a,
-                                         vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_and(vector bool int __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_and(vector float __a,
-                                         vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_and(vector float __a,
+                                                    vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_and(vector bool long long __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_and(vector bool long long __a,
+                                                     vector double __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a & (vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector double __ATTRS_o_ai vec_and(vector double __a, vector bool long long __b) {
+static __inline__ vector double __ATTRS_o_ai
+vec_and(vector double __a, vector bool long long __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a & (vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector double __ATTRS_o_ai vec_and(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_and(vector double __a,
+                                                     vector double __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a & (vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_and(vector signed long long __a, vector signed long long __b) {
   return __a & __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_and(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a & __b;
 }
 
-static vector signed long long __ATTRS_o_ai vec_and(vector signed long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector bool long long __b) {
   return __a & (vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_and(vector unsigned long long __a, vector unsigned long long __b) {
   return __a & __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_and(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a & __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_and(vector unsigned long long __a, vector bool long long __b) {
   return __a & (vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_and(vector bool long long __a,
-                                                  vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector bool long long __b) {
   return __a & __b;
 }
 #endif
 
 /* vec_vand */
 
-static vector signed char __ATTRS_o_ai vec_vand(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector signed char __b) {
   return __a & __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vand(vector bool char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a & __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vand(vector signed char __a,
-                                                vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector bool char __b) {
   return __a & (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vand(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector unsigned char __b) {
   return __a & __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vand(vector bool char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a & __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vand(vector unsigned char __a,
-                                                  vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector bool char __b) {
   return __a & (vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_vand(vector bool char __a,
-                                              vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                         vector bool char __b) {
   return __a & __b;
 }
 
-static vector short __ATTRS_o_ai vec_vand(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                                     vector short __b) {
   return __a & __b;
 }
 
-static vector short __ATTRS_o_ai vec_vand(vector bool short __a,
-                                          vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                                     vector short __b) {
   return (vector short)__a & __b;
 }
 
-static vector short __ATTRS_o_ai vec_vand(vector short __a,
-                                          vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                                     vector bool short __b) {
   return __a & (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vand(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector unsigned short __b) {
   return __a & __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vand(vector bool short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a & __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vand(vector unsigned short __a,
-                                                   vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector bool short __b) {
   return __a & (vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_vand(vector bool short __a,
-                                               vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector bool short __b) {
   return __a & __b;
 }
 
-static vector int __ATTRS_o_ai vec_vand(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector int __a,
+                                                   vector int __b) {
   return __a & __b;
 }
 
-static vector int __ATTRS_o_ai vec_vand(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                   vector int __b) {
   return (vector int)__a & __b;
 }
 
-static vector int __ATTRS_o_ai vec_vand(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector int __a,
+                                                   vector bool int __b) {
   return __a & (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vand(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector unsigned int __b) {
   return __a & __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vand(vector bool int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a & __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vand(vector unsigned int __a,
-                                                 vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector bool int __b) {
   return __a & (vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_vand(vector bool int __a,
-                                             vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                        vector bool int __b) {
   return __a & __b;
 }
 
-static vector float __ATTRS_o_ai vec_vand(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vand(vector bool int __a,
-                                          vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vand(vector float __a,
-                                          vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                                     vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vand(vector signed long long __a, vector signed long long __b) {
   return __a & __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vand(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a & __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vand(vector signed long long __a, vector bool long long __b) {
   return __a & (vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vand(vector unsigned long long __a, vector unsigned long long __b) {
   return __a & __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vand(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a & __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vand(vector unsigned long long __a, vector bool long long __b) {
   return __a & (vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_vand(vector bool long long __a,
-                                                   vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector bool long long __b) {
   return __a & __b;
 }
 #endif
@@ -1041,419 +1070,432 @@
 
 #define __builtin_altivec_vandc vec_andc
 
-static vector signed char __ATTRS_o_ai vec_andc(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector signed char __b) {
   return __a & ~__b;
 }
 
-static vector signed char __ATTRS_o_ai vec_andc(vector bool char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a & ~__b;
 }
 
-static vector signed char __ATTRS_o_ai vec_andc(vector signed char __a,
-                                                vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector bool char __b) {
   return __a & ~(vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_andc(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
   return __a & ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_andc(vector bool char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a & ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_andc(vector unsigned char __a,
-                                                  vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector bool char __b) {
   return __a & ~(vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_andc(vector bool char __a,
-                                              vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                         vector bool char __b) {
   return __a & ~__b;
 }
 
-static vector short __ATTRS_o_ai vec_andc(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                                     vector short __b) {
   return __a & ~__b;
 }
 
-static vector short __ATTRS_o_ai vec_andc(vector bool short __a,
-                                          vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                                     vector short __b) {
   return (vector short)__a & ~__b;
 }
 
-static vector short __ATTRS_o_ai vec_andc(vector short __a,
-                                          vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                                     vector bool short __b) {
   return __a & ~(vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_andc(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
   return __a & ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_andc(vector bool short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a & ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_andc(vector unsigned short __a,
-                                                   vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector bool short __b) {
   return __a & ~(vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_andc(vector bool short __a,
-                                               vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector bool short __b) {
   return __a & ~__b;
 }
 
-static vector int __ATTRS_o_ai vec_andc(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector int __a,
+                                                   vector int __b) {
   return __a & ~__b;
 }
 
-static vector int __ATTRS_o_ai vec_andc(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                   vector int __b) {
   return (vector int)__a & ~__b;
 }
 
-static vector int __ATTRS_o_ai vec_andc(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector int __a,
+                                                   vector bool int __b) {
   return __a & ~(vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_andc(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
   return __a & ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_andc(vector bool int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a & ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_andc(vector unsigned int __a,
-                                                 vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector bool int __b) {
   return __a & ~(vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_andc(vector bool int __a,
-                                             vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                        vector bool int __b) {
   return __a & ~__b;
 }
 
-static vector float __ATTRS_o_ai vec_andc(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & ~(vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_andc(vector bool int __a,
-                                          vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & ~(vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_andc(vector float __a,
-                                          vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                                     vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & ~(vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai
-vec_andc(vector bool long long __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_andc(vector bool long long __a,
+                                                      vector double __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a & ~(vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector double __ATTRS_o_ai
+static __inline__ vector double __ATTRS_o_ai
 vec_andc(vector double __a, vector bool long long __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a & ~(vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector double __ATTRS_o_ai vec_andc(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_andc(vector double __a,
+                                                      vector double __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a & ~(vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_andc(vector signed long long __a, vector signed long long __b) {
   return __a & ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_andc(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a & ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_andc(vector signed long long __a, vector bool long long __b) {
   return __a & ~(vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
   return __a & ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_andc(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a & ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_andc(vector unsigned long long __a, vector bool long long __b) {
   return __a & ~(vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_andc(vector bool long long __a,
-                                                   vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector bool long long __b) {
   return __a & ~__b;
 }
 #endif
 
 /* vec_vandc */
 
-static vector signed char __ATTRS_o_ai vec_vandc(vector signed char __a,
-                                                 vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector signed char __b) {
   return __a & ~__b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vandc(vector bool char __a,
-                                                 vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a & ~__b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vandc(vector signed char __a,
-                                                 vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector bool char __b) {
   return __a & ~(vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vandc(vector unsigned char __a,
-                                                   vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector unsigned char __b) {
   return __a & ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vandc(vector bool char __a,
-                                                   vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a & ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vandc(vector unsigned char __a,
-                                                   vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector bool char __b) {
   return __a & ~(vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_vandc(vector bool char __a,
-                                               vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector bool char __b) {
   return __a & ~__b;
 }
 
-static vector short __ATTRS_o_ai vec_vandc(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                                      vector short __b) {
   return __a & ~__b;
 }
 
-static vector short __ATTRS_o_ai vec_vandc(vector bool short __a,
-                                           vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                      vector short __b) {
   return (vector short)__a & ~__b;
 }
 
-static vector short __ATTRS_o_ai vec_vandc(vector short __a,
-                                           vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                                      vector bool short __b) {
   return __a & ~(vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vandc(vector unsigned short __a,
-                                                    vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector unsigned short __b) {
   return __a & ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vandc(vector bool short __a,
-                                                    vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a & ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vandc(vector unsigned short __a,
-                                                    vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector bool short __b) {
   return __a & ~(vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_vandc(vector bool short __a,
-                                                vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector bool short __b) {
   return __a & ~__b;
 }
 
-static vector int __ATTRS_o_ai vec_vandc(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector int __a,
+                                                    vector int __b) {
   return __a & ~__b;
 }
 
-static vector int __ATTRS_o_ai vec_vandc(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                    vector int __b) {
   return (vector int)__a & ~__b;
 }
 
-static vector int __ATTRS_o_ai vec_vandc(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector int __a,
+                                                    vector bool int __b) {
   return __a & ~(vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vandc(vector unsigned int __a,
-                                                  vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector unsigned int __b) {
   return __a & ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vandc(vector bool int __a,
-                                                  vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a & ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vandc(vector unsigned int __a,
-                                                  vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector bool int __b) {
   return __a & ~(vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_vandc(vector bool int __a,
-                                              vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                         vector bool int __b) {
   return __a & ~__b;
 }
 
-static vector float __ATTRS_o_ai vec_vandc(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                                      vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & ~(vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vandc(vector bool int __a,
-                                           vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                      vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & ~(vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vandc(vector float __a,
-                                           vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                                      vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a & ~(vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vandc(vector signed long long __a, vector signed long long __b) {
   return __a & ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vandc(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a & ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vandc(vector signed long long __a, vector bool long long __b) {
   return __a & ~(vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vandc(vector unsigned long long __a, vector unsigned long long __b) {
   return __a & ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vandc(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a & ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vandc(vector unsigned long long __a, vector bool long long __b) {
   return __a & ~(vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_vandc(vector bool long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector bool long long __b) {
   return __a & ~__b;
 }
 #endif
 
 /* vec_avg */
 
-static vector signed char __ATTRS_o_ai vec_avg(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_avg(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vavgsb(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_avg(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vavgub(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_avg(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_avg(vector short __a,
+                                                    vector short __b) {
   return __builtin_altivec_vavgsh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_avg(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vavguh(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_avg(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_avg(vector int __a,
+                                                  vector int __b) {
   return __builtin_altivec_vavgsw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_avg(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vavguw(__a, __b);
 }
 
 /* vec_vavgsb */
 
-static vector signed char __attribute__((__always_inline__))
+static __inline__ vector signed char __attribute__((__always_inline__))
 vec_vavgsb(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vavgsb(__a, __b);
 }
 
 /* vec_vavgub */
 
-static vector unsigned char __attribute__((__always_inline__))
+static __inline__ vector unsigned char __attribute__((__always_inline__))
 vec_vavgub(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vavgub(__a, __b);
 }
 
 /* vec_vavgsh */
 
-static vector short __attribute__((__always_inline__))
+static __inline__ vector short __attribute__((__always_inline__))
 vec_vavgsh(vector short __a, vector short __b) {
   return __builtin_altivec_vavgsh(__a, __b);
 }
 
 /* vec_vavguh */
 
-static vector unsigned short __attribute__((__always_inline__))
+static __inline__ vector unsigned short __attribute__((__always_inline__))
 vec_vavguh(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vavguh(__a, __b);
 }
 
 /* vec_vavgsw */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vavgsw(vector int __a, vector int __b) {
   return __builtin_altivec_vavgsw(__a, __b);
 }
 
 /* vec_vavguw */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vavguw(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vavguw(__a, __b);
 }
 
 /* vec_ceil */
 
-static vector float __ATTRS_o_ai vec_ceil(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_ceil(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvrspip(__a);
 #else
@@ -1462,82 +1504,83 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_ceil(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_ceil(vector double __a) {
   return __builtin_vsx_xvrdpip(__a);
 }
 #endif
 
 /* vec_vrfip */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vrfip(vector float __a) {
   return __builtin_altivec_vrfip(__a);
 }
 
 /* vec_cmpb */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_cmpb(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpbfp(__a, __b);
 }
 
 /* vec_vcmpbfp */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vcmpbfp(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpbfp(__a, __b);
 }
 
 /* vec_cmpeq */
 
-static vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector signed char __a, vector signed char __b) {
   return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
                                                       (vector char)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_cmpeq(vector unsigned char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
                                                       (vector char)__b);
 }
 
-static vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
-                                                vector short __b) {
+static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
+                                                           vector short __b) {
   return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_cmpeq(vector unsigned short __a,
-                                                vector unsigned short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
                                                        (vector short)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a, vector int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a,
+                                                         vector int __b) {
   return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmpeq(vector unsigned int __a,
-                                              vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
                                                      (vector int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpeq(vector signed long long __a, vector signed long long __b) {
   return (vector bool long long)__builtin_altivec_vcmpequd(__a, __b);
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)__builtin_altivec_vcmpequd(
       (vector long long)__a, (vector long long)__b);
 }
 #endif
 
-static vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
-                                              vector float __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
+                                                         vector float __b) {
 #ifdef __VSX__
   return (vector bool int)__builtin_vsx_xvcmpeqsp(__a, __b);
 #else
@@ -1546,58 +1589,58 @@
 }
 
 #ifdef __VSX__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpeq(vector double __a, vector double __b) {
   return (vector bool long long)__builtin_vsx_xvcmpeqdp(__a, __b);
 }
 #endif
 
-
 /* vec_cmpgt */
 
-static vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpgt(vector signed char __a, vector signed char __b) {
   return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_cmpgt(vector unsigned char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_cmpgt(vector short __a,
-                                                vector short __b) {
+static __inline__ vector bool short __ATTRS_o_ai vec_cmpgt(vector short __a,
+                                                           vector short __b) {
   return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_cmpgt(vector unsigned short __a,
-                                                vector unsigned short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmpgt(vector int __a, vector int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpgt(vector int __a,
+                                                         vector int __b) {
   return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmpgt(vector unsigned int __a,
-                                              vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
 }
 
 #ifdef __POWER8_VECTOR__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpgt(vector signed long long __a, vector signed long long __b) {
   return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
 }
 #endif
 
-static vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
-                                              vector float __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
+                                                         vector float __b) {
 #ifdef __VSX__
   return (vector bool int)__builtin_vsx_xvcmpgtsp(__a, __b);
 #else
@@ -1606,7 +1649,7 @@
 }
 
 #ifdef __VSX__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpgt(vector double __a, vector double __b) {
   return (vector bool long long)__builtin_vsx_xvcmpgtdp(__a, __b);
 }
@@ -1614,38 +1657,38 @@
 
 /* vec_cmpge */
 
-static vector bool char __ATTRS_o_ai
-vec_cmpge (vector signed char __a, vector signed char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpge(vector signed char __a, vector signed char __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool char __ATTRS_o_ai
-vec_cmpge (vector unsigned char __a, vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool short __ATTRS_o_ai
-vec_cmpge (vector signed short __a, vector signed short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpge(vector signed short __a, vector signed short __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool short __ATTRS_o_ai
-vec_cmpge (vector unsigned short __a, vector unsigned short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool int __ATTRS_o_ai
-vec_cmpge (vector signed int __a, vector signed int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpge(vector signed int __a, vector signed int __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool int __ATTRS_o_ai
-vec_cmpge (vector unsigned int __a, vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool int __ATTRS_o_ai
-vec_cmpge(vector float __a, vector float __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpge(vector float __a,
+                                                         vector float __b) {
 #ifdef __VSX__
   return (vector bool int)__builtin_vsx_xvcmpgesp(__a, __b);
 #else
@@ -1654,19 +1697,19 @@
 }
 
 #ifdef __VSX__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpge(vector double __a, vector double __b) {
   return (vector bool long long)__builtin_vsx_xvcmpgedp(__a, __b);
 }
 #endif
 
 #ifdef __POWER8_VECTOR__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpge(vector signed long long __a, vector signed long long __b) {
   return ~(vec_cmpgt(__b, __a));
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
   return ~(vec_cmpgt(__b, __a));
 }
@@ -1674,111 +1717,111 @@
 
 /* vec_vcmpgefp */
 
-static vector bool int __attribute__((__always_inline__))
+static __inline__ vector bool int __attribute__((__always_inline__))
 vec_vcmpgefp(vector float __a, vector float __b) {
   return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
 }
 
 /* vec_vcmpgtsb */
 
-static vector bool char __attribute__((__always_inline__))
+static __inline__ vector bool char __attribute__((__always_inline__))
 vec_vcmpgtsb(vector signed char __a, vector signed char __b) {
   return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
 }
 
 /* vec_vcmpgtub */
 
-static vector bool char __attribute__((__always_inline__))
+static __inline__ vector bool char __attribute__((__always_inline__))
 vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
 }
 
 /* vec_vcmpgtsh */
 
-static vector bool short __attribute__((__always_inline__))
+static __inline__ vector bool short __attribute__((__always_inline__))
 vec_vcmpgtsh(vector short __a, vector short __b) {
   return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
 }
 
 /* vec_vcmpgtuh */
 
-static vector bool short __attribute__((__always_inline__))
+static __inline__ vector bool short __attribute__((__always_inline__))
 vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
 }
 
 /* vec_vcmpgtsw */
 
-static vector bool int __attribute__((__always_inline__))
+static __inline__ vector bool int __attribute__((__always_inline__))
 vec_vcmpgtsw(vector int __a, vector int __b) {
   return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
 }
 
 /* vec_vcmpgtuw */
 
-static vector bool int __attribute__((__always_inline__))
+static __inline__ vector bool int __attribute__((__always_inline__))
 vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
 }
 
 /* vec_vcmpgtfp */
 
-static vector bool int __attribute__((__always_inline__))
+static __inline__ vector bool int __attribute__((__always_inline__))
 vec_vcmpgtfp(vector float __a, vector float __b) {
   return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
 }
 
 /* vec_cmple */
 
-static vector bool char __ATTRS_o_ai
-vec_cmple (vector signed char __a, vector signed char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmple(vector signed char __a, vector signed char __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool char __ATTRS_o_ai
-vec_cmple (vector unsigned char __a, vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool short __ATTRS_o_ai
-vec_cmple (vector signed short __a, vector signed short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmple(vector signed short __a, vector signed short __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool short __ATTRS_o_ai
-vec_cmple (vector unsigned short __a, vector unsigned short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool int __ATTRS_o_ai
-vec_cmple (vector signed int __a, vector signed int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmple(vector signed int __a, vector signed int __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool int __ATTRS_o_ai
-vec_cmple (vector unsigned int __a, vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool int __ATTRS_o_ai
-vec_cmple(vector float __a, vector float __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmple(vector float __a,
+                                                         vector float __b) {
   return vec_cmpge(__b, __a);
 }
 
 #ifdef __VSX__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmple(vector double __a, vector double __b) {
   return vec_cmpge(__b, __a);
 }
 #endif
 
 #ifdef __POWER8_VECTOR__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmple(vector signed long long __a, vector signed long long __b) {
   return vec_cmpge(__b, __a);
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
   return vec_cmpge(__b, __a);
 }
@@ -1786,83 +1829,90 @@
 
 /* vec_cmplt */
 
-static vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmplt(vector signed char __a, vector signed char __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool char __ATTRS_o_ai vec_cmplt(vector unsigned char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
-                                                vector short __b) {
+static __inline__ vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
+                                                           vector short __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool short __ATTRS_o_ai vec_cmplt(vector unsigned short __a,
-                                                vector unsigned short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmplt(vector int __a, vector int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmplt(vector int __a,
+                                                         vector int __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmplt(vector unsigned int __a,
-                                              vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
-                                              vector float __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
+                                                         vector float __b) {
   return vec_cmpgt(__b, __a);
 }
 
 #ifdef __VSX__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmplt(vector double __a, vector double __b) {
   return vec_cmpgt(__b, __a);
 }
 #endif
 
 #ifdef __POWER8_VECTOR__
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmplt(vector signed long long __a, vector signed long long __b) {
   return vec_cmpgt(__b, __a);
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
   return vec_cmpgt(__b, __a);
 }
 
 /* vec_cntlz */
 
-static vector signed char __ATTRS_o_ai vec_cntlz(vector signed char __a) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cntlz(vector signed char __a) {
   return __builtin_altivec_vclzb(__a);
 }
-static vector unsigned char __ATTRS_o_ai vec_cntlz(vector unsigned char __a) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cntlz(vector unsigned char __a) {
   return __builtin_altivec_vclzb(__a);
 }
-static vector signed short __ATTRS_o_ai vec_cntlz(vector signed short __a) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cntlz(vector signed short __a) {
   return __builtin_altivec_vclzh(__a);
 }
-static vector unsigned short __ATTRS_o_ai vec_cntlz(vector unsigned short __a) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cntlz(vector unsigned short __a) {
   return __builtin_altivec_vclzh(__a);
 }
-static vector signed int __ATTRS_o_ai vec_cntlz(vector signed int __a) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cntlz(vector signed int __a) {
   return __builtin_altivec_vclzw(__a);
 }
-static vector unsigned int __ATTRS_o_ai vec_cntlz(vector unsigned int __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cntlz(vector unsigned int __a) {
   return __builtin_altivec_vclzw(__a);
 }
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_cntlz(vector signed long long __a) {
   return __builtin_altivec_vclzd(__a);
 }
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_cntlz(vector unsigned long long __a) {
   return __builtin_altivec_vclzd(__a);
 }
@@ -1871,36 +1921,38 @@
 /* vec_cpsgn */
 
 #ifdef __VSX__
-static vector float __ATTRS_o_ai vec_cpsgn(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_cpsgn(vector float __a,
+                                                      vector float __b) {
   return __builtin_vsx_xvcpsgnsp(__a, __b);
 }
 
-static vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
-                                            vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
+                                                       vector double __b) {
   return __builtin_vsx_xvcpsgndp(__a, __b);
 }
 #endif
 
 /* vec_ctf */
 
-static vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
   return __builtin_altivec_vcfsx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a, int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
+                                                    int __b) {
   return __builtin_altivec_vcfux((vector int)__a, __b);
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_ctf(vector unsigned long long __a,
-                                          int __b) {
+static __inline__ vector double __ATTRS_o_ai
+vec_ctf(vector unsigned long long __a, int __b) {
   vector double __ret = __builtin_convertvector(__a, vector double);
   __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
   return __ret;
 }
 
-static vector double __ATTRS_o_ai vec_ctf(vector signed long long __a,
-                                          int __b) {
+static __inline__ vector double __ATTRS_o_ai
+vec_ctf(vector signed long long __a, int __b) {
   vector double __ret = __builtin_convertvector(__a, vector double);
   __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
   return __ret;
@@ -1909,27 +1961,27 @@
 
 /* vec_vcfsx */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vcfsx(vector int __a, int __b) {
   return __builtin_altivec_vcfsx(__a, __b);
 }
 
 /* vec_vcfux */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vcfux(vector unsigned int __a, int __b) {
   return __builtin_altivec_vcfux((vector int)__a, __b);
 }
 
 /* vec_cts */
 
-static vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
   return __builtin_altivec_vctsxs(__a, __b);
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai vec_cts(vector double __a,
-                                                    int __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cts(vector double __a, int __b) {
   __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
   return __builtin_convertvector(__a, vector signed long long);
 }
@@ -1937,20 +1989,21 @@
 
 /* vec_vctsxs */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vctsxs(vector float __a, int __b) {
   return __builtin_altivec_vctsxs(__a, __b);
 }
 
 /* vec_ctu */
 
-static vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a, int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
+                                                           int __b) {
   return __builtin_altivec_vctuxs(__a, __b);
 }
 
 #ifdef __VSX__
-static vector unsigned long long __ATTRS_o_ai vec_ctu(vector double __a,
-                                                      int __b) {
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_ctu(vector double __a, int __b) {
   __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
   return __builtin_convertvector(__a, vector unsigned long long);
 }
@@ -1958,7 +2011,7 @@
 
 /* vec_vctuxs */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vctuxs(vector float __a, int __b) {
   return __builtin_altivec_vctuxs(__a, __b);
 }
@@ -1966,13 +2019,15 @@
 /* vec_double */
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_double (vector signed long long __a) {
-  vector double __ret = { __a[0], __a[1] };
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector signed long long __a) {
+  vector double __ret = {__a[0], __a[1]};
   return __ret;
 }
 
-static vector double __ATTRS_o_ai vec_double (vector unsigned long long __a) {
-  vector double __ret = { __a[0], __a[1] };
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector unsigned long long __a) {
+  vector double __ret = {__a[0], __a[1]};
   return __ret;
 }
 #endif
@@ -1982,178 +2037,172 @@
 /* Integer vector divides (vectors are scalarized, elements divided
    and the vectors reassembled).
 */
-static vector signed char __ATTRS_o_ai vec_div(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_div(vector signed char __a, vector signed char __b) {
   return __a / __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_div(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_div(vector unsigned char __a, vector unsigned char __b) {
   return __a / __b;
 }
 
-static vector signed short __ATTRS_o_ai vec_div(vector signed short __a,
-                                                vector signed short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_div(vector signed short __a, vector signed short __b) {
   return __a / __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_div(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_div(vector unsigned short __a, vector unsigned short __b) {
   return __a / __b;
 }
 
-static vector signed int __ATTRS_o_ai vec_div(vector signed int __a,
-                                              vector signed int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_div(vector signed int __a, vector signed int __b) {
   return __a / __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_div(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_div(vector unsigned int __a, vector unsigned int __b) {
   return __a / __b;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_div(vector signed long long __a, vector signed long long __b) {
   return __a / __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_div(vector unsigned long long __a, vector unsigned long long __b) {
   return __a / __b;
 }
 
-static vector float __ATTRS_o_ai vec_div(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_div(vector float __a,
+                                                    vector float __b) {
   return __a / __b;
 }
 
-static vector double __ATTRS_o_ai vec_div(vector double __a,
-                                          vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a,
+                                                     vector double __b) {
   return __a / __b;
 }
 #endif
 
 /* vec_dss */
 
-static void __attribute__((__always_inline__)) vec_dss(int __a) {
+static __inline__ void __attribute__((__always_inline__)) vec_dss(int __a) {
   __builtin_altivec_dss(__a);
 }
 
 /* vec_dssall */
 
-static void __attribute__((__always_inline__)) vec_dssall(void) {
+static __inline__ void __attribute__((__always_inline__)) vec_dssall(void) {
   __builtin_altivec_dssall();
 }
 
 /* vec_dst */
-
-static void __attribute__((__always_inline__))
-vec_dst(const void *__a, int __b, int __c) {
-  __builtin_altivec_dst(__a, __b, __c);
-}
+#define vec_dst(__PTR, __CW, __STR) \
+  __extension__(                    \
+      { __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)); })
 
 /* vec_dstst */
-
-static void __attribute__((__always_inline__))
-vec_dstst(const void *__a, int __b, int __c) {
-  __builtin_altivec_dstst(__a, __b, __c);
-}
+#define vec_dstst(__PTR, __CW, __STR) \
+  __extension__(                      \
+      { __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)); })
 
 /* vec_dststt */
-
-static void __attribute__((__always_inline__))
-vec_dststt(const void *__a, int __b, int __c) {
-  __builtin_altivec_dststt(__a, __b, __c);
-}
+#define vec_dststt(__PTR, __CW, __STR) \
+  __extension__(                       \
+      { __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)); })
 
 /* vec_dstt */
-
-static void __attribute__((__always_inline__))
-vec_dstt(const void *__a, int __b, int __c) {
-  __builtin_altivec_dstt(__a, __b, __c);
-}
+#define vec_dstt(__PTR, __CW, __STR) \
+  __extension__(                     \
+      { __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)); })
 
 /* vec_eqv */
 
 #ifdef __POWER8_VECTOR__
-static vector signed char __ATTRS_o_ai vec_eqv(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_eqv(vector signed char __a, vector signed char __b) {
   return (vector signed char)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                   (vector unsigned int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_eqv(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_eqv(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                     (vector unsigned int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
-                                             vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
+                                                        vector bool char __b) {
   return (vector bool char)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                 (vector unsigned int)__b);
 }
 
-static vector signed short __ATTRS_o_ai vec_eqv(vector signed short __a,
-                                                vector signed short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_eqv(vector signed short __a, vector signed short __b) {
   return (vector signed short)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                    (vector unsigned int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_eqv(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_eqv(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                      (vector unsigned int)__b);
 }
 
-static vector bool short __ATTRS_o_ai vec_eqv(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_eqv(vector bool short __a, vector bool short __b) {
   return (vector bool short)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                  (vector unsigned int)__b);
 }
 
-static vector signed int __ATTRS_o_ai vec_eqv(vector signed int __a,
-                                              vector signed int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_eqv(vector signed int __a, vector signed int __b) {
   return (vector signed int)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                                  (vector unsigned int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_eqv(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_eqv(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_vsx_xxleqv(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
-                                            vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
+                                                       vector bool int __b) {
   return (vector bool int)__builtin_vsx_xxleqv((vector unsigned int)__a,
-                                                 (vector unsigned int)__b);
+                                               (vector unsigned int)__b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_eqv(vector signed long long __a, vector signed long long __b) {
-  return (vector signed long long)
-    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+  return (vector signed long long)__builtin_vsx_xxleqv(
+      (vector unsigned int)__a, (vector unsigned int)__b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
-  return (vector unsigned long long)
-    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+  return (vector unsigned long long)__builtin_vsx_xxleqv(
+      (vector unsigned int)__a, (vector unsigned int)__b);
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_eqv(vector bool long long __a, vector bool long long __b) {
-  return (vector bool long long)
-    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+  return (vector bool long long)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_eqv(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_eqv(vector float __a,
+                                                    vector float __b) {
   return (vector float)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                             (vector unsigned int)__b);
 }
 
-static vector double __ATTRS_o_ai vec_eqv(vector double __a,
-                                          vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_eqv(vector double __a,
+                                                     vector double __b) {
   return (vector double)__builtin_vsx_xxleqv((vector unsigned int)__a,
                                              (vector unsigned int)__b);
 }
@@ -2161,21 +2210,21 @@
 
 /* vec_expte */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_expte(vector float __a) {
   return __builtin_altivec_vexptefp(__a);
 }
 
 /* vec_vexptefp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vexptefp(vector float __a) {
   return __builtin_altivec_vexptefp(__a);
 }
 
 /* vec_floor */
 
-static vector float __ATTRS_o_ai vec_floor(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_floor(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvrspim(__a);
 #else
@@ -2184,439 +2233,460 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_floor(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_floor(vector double __a) {
   return __builtin_vsx_xvrdpim(__a);
 }
 #endif
 
 /* vec_vrfim */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vrfim(vector float __a) {
   return __builtin_altivec_vrfim(__a);
 }
 
 /* vec_ld */
 
-static vector signed char __ATTRS_o_ai vec_ld(int __a,
-                                              const vector signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ld(int __a, const vector signed char *__b) {
   return (vector signed char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_ld(int __a, const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ld(int __a, const signed char *__b) {
   return (vector signed char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_ld(int __a, const vector unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_ld(int __a,
-                                                const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_ld(int __a,
-                                            const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_ld(int __a, const vector bool char *__b) {
   return (vector bool char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_ld(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector short *__b) {
   return (vector short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_ld(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_ld(int __a, const short *__b) {
   return (vector short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_ld(int __a, const vector unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_ld(int __a,
-                                                 const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_ld(int __a,
-                                             const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_ld(int __a, const vector bool short *__b) {
   return (vector bool short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector pixel __ATTRS_o_ai vec_ld(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector pixel *__b) {
   return (vector pixel)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_ld(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_ld(int __a,
+                                                 const vector int *__b) {
   return (vector int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_ld(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_ld(int __a, const int *__b) {
   return (vector int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_ld(int __a,
-                                               const vector unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_ld(int __a,
-                                               const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_ld(int __a,
-                                           const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_ld(int __a, const vector bool int *__b) {
   return (vector bool int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_ld(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector float *__b) {
   return (vector float)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_ld(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_ld(int __a, const float *__b) {
   return (vector float)__builtin_altivec_lvx(__a, __b);
 }
 
 /* vec_lvx */
 
-static vector signed char __ATTRS_o_ai vec_lvx(int __a,
-                                               const vector signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const vector signed char *__b) {
   return (vector signed char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_lvx(int __a,
-                                               const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const signed char *__b) {
   return (vector signed char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_lvx(int __a, const vector unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvx(int __a,
-                                                 const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_lvx(int __a,
-                                             const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvx(int __a, const vector bool char *__b) {
   return (vector bool char)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_lvx(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector short *__b) {
   return (vector short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_lvx(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvx(int __a, const short *__b) {
   return (vector short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_lvx(int __a, const vector unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvx(int __a,
-                                                  const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_lvx(int __a,
-                                              const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvx(int __a, const vector bool short *__b) {
   return (vector bool short)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector pixel __ATTRS_o_ai vec_lvx(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector pixel *__b) {
   return (vector pixel)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_lvx(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvx(int __a,
+                                                  const vector int *__b) {
   return (vector int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_lvx(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvx(int __a, const int *__b) {
   return (vector int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_lvx(int __a, const vector unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvx(int __a,
-                                                const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_lvx(int __a,
-                                            const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvx(int __a, const vector bool int *__b) {
   return (vector bool int)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_lvx(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector float *__b) {
   return (vector float)__builtin_altivec_lvx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_lvx(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvx(int __a, const float *__b) {
   return (vector float)__builtin_altivec_lvx(__a, __b);
 }
 
 /* vec_lde */
 
-static vector signed char __ATTRS_o_ai vec_lde(int __a,
-                                               const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lde(int __a, const signed char *__b) {
   return (vector signed char)__builtin_altivec_lvebx(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lde(int __a,
-                                                 const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lde(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_lde(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lde(int __a, const short *__b) {
   return (vector short)__builtin_altivec_lvehx(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lde(int __a,
-                                                  const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lde(int __a, const unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_lde(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lde(int __a, const int *__b) {
   return (vector int)__builtin_altivec_lvewx(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lde(int __a,
-                                                const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lde(int __a, const unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_lde(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lde(int __a, const float *__b) {
   return (vector float)__builtin_altivec_lvewx(__a, __b);
 }
 
 /* vec_lvebx */
 
-static vector signed char __ATTRS_o_ai vec_lvebx(int __a,
-                                                 const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvebx(int __a, const signed char *__b) {
   return (vector signed char)__builtin_altivec_lvebx(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvebx(int __a,
-                                                   const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvebx(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
 }
 
 /* vec_lvehx */
 
-static vector short __ATTRS_o_ai vec_lvehx(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvehx(int __a,
+                                                      const short *__b) {
   return (vector short)__builtin_altivec_lvehx(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvehx(int __a,
-                                                    const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvehx(int __a, const unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
 }
 
 /* vec_lvewx */
 
-static vector int __ATTRS_o_ai vec_lvewx(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvewx(int __a, const int *__b) {
   return (vector int)__builtin_altivec_lvewx(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvewx(int __a,
-                                                  const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvewx(int __a, const unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_lvewx(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvewx(int __a,
+                                                      const float *__b) {
   return (vector float)__builtin_altivec_lvewx(__a, __b);
 }
 
 /* vec_ldl */
 
-static vector signed char __ATTRS_o_ai vec_ldl(int __a,
-                                               const vector signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const vector signed char *__b) {
   return (vector signed char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_ldl(int __a,
-                                               const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const signed char *__b) {
   return (vector signed char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_ldl(int __a, const vector unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_ldl(int __a,
-                                                 const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_ldl(int __a,
-                                             const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_ldl(int __a, const vector bool char *__b) {
   return (vector bool char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_ldl(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector short *__b) {
   return (vector short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_ldl(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_ldl(int __a, const short *__b) {
   return (vector short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_ldl(int __a, const vector unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_ldl(int __a,
-                                                  const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_ldl(int __a,
-                                              const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_ldl(int __a, const vector bool short *__b) {
   return (vector bool short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector pixel __ATTRS_o_ai vec_ldl(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector pixel *__b) {
-  return (vector pixel short)__builtin_altivec_lvxl(__a, __b);
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b); /* match return type */
 }
 
-static vector int __ATTRS_o_ai vec_ldl(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_ldl(int __a,
+                                                  const vector int *__b) {
   return (vector int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_ldl(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_ldl(int __a, const int *__b) {
   return (vector int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_ldl(int __a, const vector unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_ldl(int __a,
-                                                const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_ldl(int __a,
-                                            const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_ldl(int __a, const vector bool int *__b) {
   return (vector bool int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_ldl(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector float *__b) {
   return (vector float)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_ldl(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_ldl(int __a, const float *__b) {
   return (vector float)__builtin_altivec_lvxl(__a, __b);
 }
 
 /* vec_lvxl */
 
-static vector signed char __ATTRS_o_ai vec_lvxl(int __a,
-                                                const vector signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const vector signed char *__b) {
   return (vector signed char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_lvxl(int __a,
-                                                const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const signed char *__b) {
   return (vector signed char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_lvxl(int __a, const vector unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvxl(int __a,
-                                                  const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_lvxl(int __a,
-                                              const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool char *__b) {
   return (vector bool char)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_lvxl(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector short *__b) {
   return (vector short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_lvxl(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const short *__b) {
   return (vector short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_lvxl(int __a, const vector unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvxl(int __a,
-                                                   const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned short *__b) {
   return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_lvxl(int __a,
-                                               const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool short *__b) {
   return (vector bool short)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector pixel __ATTRS_o_ai vec_lvxl(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector pixel *__b) {
   return (vector pixel)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_lvxl(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvxl(int __a,
+                                                   const vector int *__b) {
   return (vector int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_lvxl(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvxl(int __a, const int *__b) {
   return (vector int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_lvxl(int __a, const vector unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvxl(int __a,
-                                                 const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned int *__b) {
   return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_lvxl(int __a,
-                                             const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool int *__b) {
   return (vector bool int)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_lvxl(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector float *__b) {
   return (vector float)__builtin_altivec_lvxl(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_lvxl(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const float *__b) {
   return (vector float)__builtin_altivec_lvxl(__a, __b);
 }
 
 /* vec_loge */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_loge(vector float __a) {
   return __builtin_altivec_vlogefp(__a);
 }
 
 /* vec_vlogefp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vlogefp(vector float __a) {
   return __builtin_altivec_vlogefp(__a);
 }
@@ -2624,7 +2694,7 @@
 /* vec_lvsl */
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const signed char *__b) {
   vector unsigned char mask =
@@ -2634,14 +2704,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
-                                                  const signed char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const signed char *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const unsigned char *__b) {
   vector unsigned char mask =
@@ -2651,14 +2721,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
-                                                  const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const short *__b) {
   vector unsigned char mask =
@@ -2668,13 +2738,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const short *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const short *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const unsigned short *__b) {
   vector unsigned char mask =
@@ -2684,14 +2755,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
-                                                  const unsigned short *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned short *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const int *__b) {
   vector unsigned char mask =
@@ -2701,13 +2772,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const int *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const int *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const unsigned int *__b) {
   vector unsigned char mask =
@@ -2717,14 +2789,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
-                                                  const unsigned int *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned int *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsl(int __a, const float *__b) {
   vector unsigned char mask =
@@ -2734,7 +2806,8 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const float *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const float *__b) {
   return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
 }
 #endif
@@ -2742,7 +2815,7 @@
 /* vec_lvsr */
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const signed char *__b) {
   vector unsigned char mask =
@@ -2752,14 +2825,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
-                                                  const signed char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const signed char *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const unsigned char *__b) {
   vector unsigned char mask =
@@ -2769,14 +2842,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
-                                                  const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned char *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const short *__b) {
   vector unsigned char mask =
@@ -2786,13 +2859,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const short *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const short *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const unsigned short *__b) {
   vector unsigned char mask =
@@ -2802,14 +2876,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
-                                                  const unsigned short *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned short *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const int *__b) {
   vector unsigned char mask =
@@ -2819,13 +2893,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const int *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const int *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const unsigned int *__b) {
   vector unsigned char mask =
@@ -2835,14 +2910,14 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
-                                                  const unsigned int *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned int *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
     __attribute__((__deprecated__("use assignment for unaligned little endian \
 loads/stores"))) vec_lvsr(int __a, const float *__b) {
   vector unsigned char mask =
@@ -2852,47 +2927,48 @@
   return vec_perm(mask, mask, reverse);
 }
 #else
-static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const float *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const float *__b) {
   return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
 }
 #endif
 
 /* vec_madd */
-static vector signed short __ATTRS_o_ai
+static __inline__ vector signed short __ATTRS_o_ai
 vec_mladd(vector signed short, vector signed short, vector signed short);
-static vector signed short __ATTRS_o_ai
+static __inline__ vector signed short __ATTRS_o_ai
 vec_mladd(vector signed short, vector unsigned short, vector unsigned short);
-static vector signed short __ATTRS_o_ai
+static __inline__ vector signed short __ATTRS_o_ai
 vec_mladd(vector unsigned short, vector signed short, vector signed short);
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_mladd(vector unsigned short, vector unsigned short, vector unsigned short);
 
-static vector signed short __ATTRS_o_ai
-vec_madd(vector signed short __a, vector signed short __b,
-         vector signed short __c) {
-  return  vec_mladd(__a, __b, __c);
+static __inline__ vector signed short __ATTRS_o_ai vec_madd(
+    vector signed short __a, vector signed short __b, vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
 }
 
-static vector signed short __ATTRS_o_ai
+static __inline__ vector signed short __ATTRS_o_ai
 vec_madd(vector signed short __a, vector unsigned short __b,
          vector unsigned short __c) {
   return vec_mladd(__a, __b, __c);
 }
 
-static vector signed short __ATTRS_o_ai
+static __inline__ vector signed short __ATTRS_o_ai
 vec_madd(vector unsigned short __a, vector signed short __b,
          vector signed short __c) {
   return vec_mladd(__a, __b, __c);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_madd(vector unsigned short __a, vector unsigned short __b,
          vector unsigned short __c) {
   return vec_mladd(__a, __b, __c);
 }
 
-static vector float __ATTRS_o_ai
-vec_madd(vector float __a, vector float __b, vector float __c) {
+static __inline__ vector float __ATTRS_o_ai vec_madd(vector float __a,
+                                                     vector float __b,
+                                                     vector float __c) {
 #ifdef __VSX__
   return __builtin_vsx_xvmaddasp(__a, __b, __c);
 #else
@@ -2901,29 +2977,30 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai
-vec_madd(vector double __a, vector double __b, vector double __c) {
+static __inline__ vector double __ATTRS_o_ai vec_madd(vector double __a,
+                                                      vector double __b,
+                                                      vector double __c) {
   return __builtin_vsx_xvmaddadp(__a, __b, __c);
 }
 #endif
 
 /* vec_vmaddfp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vmaddfp(vector float __a, vector float __b, vector float __c) {
   return __builtin_altivec_vmaddfp(__a, __b, __c);
 }
 
 /* vec_madds */
 
-static vector signed short __attribute__((__always_inline__))
+static __inline__ vector signed short __attribute__((__always_inline__))
 vec_madds(vector signed short __a, vector signed short __b,
           vector signed short __c) {
   return __builtin_altivec_vmhaddshs(__a, __b, __c);
 }
 
 /* vec_vmhaddshs */
-static vector signed short __attribute__((__always_inline__))
+static __inline__ vector signed short __attribute__((__always_inline__))
 vec_vmhaddshs(vector signed short __a, vector signed short __b,
               vector signed short __c) {
   return __builtin_altivec_vmhaddshs(__a, __b, __c);
@@ -2932,138 +3009,145 @@
 /* vec_msub */
 
 #ifdef __VSX__
-static vector float __ATTRS_o_ai
-vec_msub(vector float __a, vector float __b, vector float __c) {
+static __inline__ vector float __ATTRS_o_ai vec_msub(vector float __a,
+                                                     vector float __b,
+                                                     vector float __c) {
   return __builtin_vsx_xvmsubasp(__a, __b, __c);
 }
 
-static vector double __ATTRS_o_ai
-vec_msub(vector double __a, vector double __b, vector double __c) {
+static __inline__ vector double __ATTRS_o_ai vec_msub(vector double __a,
+                                                      vector double __b,
+                                                      vector double __c) {
   return __builtin_vsx_xvmsubadp(__a, __b, __c);
 }
 #endif
 
 /* vec_max */
 
-static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vmaxsb(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_max(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_max(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vmaxub(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_max(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_max(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
 }
 
-static vector short __ATTRS_o_ai vec_max(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_max(vector short __a,
+                                                    vector short __b) {
   return __builtin_altivec_vmaxsh(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_max(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_max(vector bool short __a,
+                                                    vector short __b) {
   return __builtin_altivec_vmaxsh((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_max(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_max(vector short __a,
+                                                    vector bool short __b) {
   return __builtin_altivec_vmaxsh(__a, (vector short)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_max(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vmaxuh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_max(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_max(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
 }
 
-static vector int __ATTRS_o_ai vec_max(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_max(vector int __a,
+                                                  vector int __b) {
   return __builtin_altivec_vmaxsw(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_max(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_max(vector bool int __a,
+                                                  vector int __b) {
   return __builtin_altivec_vmaxsw((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_max(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_max(vector int __a,
+                                                  vector bool int __b) {
   return __builtin_altivec_vmaxsw(__a, (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_max(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vmaxuw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_max(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_max(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_max(vector signed long long __a, vector signed long long __b) {
   return __builtin_altivec_vmaxsd(__a, __b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_max(vector bool long long __a, vector signed long long __b) {
   return __builtin_altivec_vmaxsd((vector signed long long)__a, __b);
 }
 
-static vector signed long long __ATTRS_o_ai vec_max(vector signed long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector bool long long __b) {
   return __builtin_altivec_vmaxsd(__a, (vector signed long long)__b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_max(vector unsigned long long __a, vector unsigned long long __b) {
   return __builtin_altivec_vmaxud(__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_max(vector bool long long __a, vector unsigned long long __b) {
   return __builtin_altivec_vmaxud((vector unsigned long long)__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_max(vector unsigned long long __a, vector bool long long __b) {
   return __builtin_altivec_vmaxud(__a, (vector unsigned long long)__b);
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_max(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_max(vector float __a,
+                                                    vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvmaxsp(__a, __b);
 #else
@@ -3072,114 +3156,117 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_max(vector double __a,
-                                          vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_max(vector double __a,
+                                                     vector double __b) {
   return __builtin_vsx_xvmaxdp(__a, __b);
 }
 #endif
 
 /* vec_vmaxsb */
 
-static vector signed char __ATTRS_o_ai vec_vmaxsb(vector signed char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vmaxsb(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vmaxsb(vector bool char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vmaxsb(vector signed char __a,
-                                                  vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
 }
 
 /* vec_vmaxub */
 
-static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector unsigned char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vmaxub(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector bool char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector unsigned char __a,
-                                                    vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
 }
 
 /* vec_vmaxsh */
 
-static vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                                       vector short __b) {
   return __builtin_altivec_vmaxsh(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vmaxsh(vector bool short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector bool short __a,
+                                                       vector short __b) {
   return __builtin_altivec_vmaxsh((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
-                                            vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                                       vector bool short __b) {
   return __builtin_altivec_vmaxsh(__a, (vector short)__b);
 }
 
 /* vec_vmaxuh */
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vmaxuh(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vmaxuh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vmaxuh(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vmaxuh(vector unsigned short __a,
-                                                     vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
 }
 
 /* vec_vmaxsw */
 
-static vector int __ATTRS_o_ai vec_vmaxsw(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector int __a,
+                                                     vector int __b) {
   return __builtin_altivec_vmaxsw(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vmaxsw(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector bool int __a,
+                                                     vector int __b) {
   return __builtin_altivec_vmaxsw((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vmaxsw(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector int __a,
+                                                     vector bool int __b) {
   return __builtin_altivec_vmaxsw(__a, (vector int)__b);
 }
 
 /* vec_vmaxuw */
 
-static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vmaxuw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector bool int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector unsigned int __a,
-                                                   vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
 }
 
 /* vec_vmaxfp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vmaxfp(vector float __a, vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvmaxsp(__a, __b);
@@ -3190,39 +3277,39 @@
 
 /* vec_mergeh */
 
-static vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mergeh(vector signed char __a, vector signed char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
                                          0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
                                          0x06, 0x16, 0x07, 0x17));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_mergeh(vector unsigned char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
                                          0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
                                          0x06, 0x16, 0x07, 0x17));
 }
 
-static vector bool char __ATTRS_o_ai vec_mergeh(vector bool char __a,
-                                                vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_mergeh(vector bool char __a, vector bool char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
                                          0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
                                          0x06, 0x16, 0x07, 0x17));
 }
 
-static vector short __ATTRS_o_ai vec_mergeh(vector short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_mergeh(vector short __a,
+                                                       vector short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
                                          0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
@@ -3230,47 +3317,48 @@
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector bool short __ATTRS_o_ai vec_mergeh(vector bool short __a,
-                                                 vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_mergeh(vector bool short __a, vector bool short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
                                          0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector pixel __ATTRS_o_ai vec_mergeh(vector pixel __a,
-                                            vector pixel __b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_mergeh(vector pixel __a,
+                                                       vector pixel __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
                                          0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector int __ATTRS_o_ai vec_mergeh(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_mergeh(vector int __a,
+                                                     vector int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_mergeh(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector bool int __ATTRS_o_ai vec_mergeh(vector bool int __a,
-                                               vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_mergeh(vector bool int __a,
+                                                          vector bool int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector float __ATTRS_o_ai vec_mergeh(vector float __a,
-                                            vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_mergeh(vector float __a,
+                                                       vector float __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
@@ -3278,91 +3366,81 @@
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mergeh(vector signed long long __a, vector signed long long __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mergeh(vector signed long long __a, vector bool long long __b) {
   return vec_perm(__a, (vector signed long long)__b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mergeh(vector bool long long __a, vector signed long long __b) {
   return vec_perm((vector signed long long)__a, __b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mergeh(vector unsigned long long __a, vector bool long long __b) {
   return vec_perm(__a, (vector unsigned long long)__b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mergeh(vector bool long long __a, vector unsigned long long __b) {
   return vec_perm((vector unsigned long long)__a, __b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_mergeh(vector bool long long __a, vector bool long long __b) {
   return vec_perm(__a, __b,
-                 (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                        0x04, 0x05, 0x06, 0x07,
-                                        0x10, 0x11, 0x12, 0x13,
-                                        0x14, 0x15, 0x16, 0x17));
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
 }
 
-static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
-                                             vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                                        vector double __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
-static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
-                                             vector bool long long __b) {
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeh(vector double __a, vector bool long long __b) {
   return vec_perm(__a, (vector double)__b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
-static vector double __ATTRS_o_ai vec_mergeh(vector bool long long __a,
-                                             vector double __b) {
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector double __b) {
   return vec_perm((vector double)__a, __b,
-                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
-                                         0x04, 0x05, 0x06, 0x07,
-                                         0x10, 0x11, 0x12, 0x13,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17));
 }
 #endif
@@ -3371,24 +3449,24 @@
 
 #define __builtin_altivec_vmrghb vec_vmrghb
 
-static vector signed char __ATTRS_o_ai vec_vmrghb(vector signed char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmrghb(vector signed char __a, vector signed char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
                                          0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
                                          0x06, 0x16, 0x07, 0x17));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vmrghb(vector unsigned char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmrghb(vector unsigned char __a, vector unsigned char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
                                          0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
                                          0x06, 0x16, 0x07, 0x17));
 }
 
-static vector bool char __ATTRS_o_ai vec_vmrghb(vector bool char __a,
-                                                vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vmrghb(vector bool char __a, vector bool char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
                                          0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
@@ -3399,15 +3477,15 @@
 
 #define __builtin_altivec_vmrghh vec_vmrghh
 
-static vector short __ATTRS_o_ai vec_vmrghh(vector short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vmrghh(vector short __a,
+                                                       vector short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
                                          0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vmrghh(vector unsigned short __a, vector unsigned short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
@@ -3415,16 +3493,16 @@
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector bool short __ATTRS_o_ai vec_vmrghh(vector bool short __a,
-                                                 vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vmrghh(vector bool short __a, vector bool short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
                                          0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
                                          0x06, 0x07, 0x16, 0x17));
 }
 
-static vector pixel __ATTRS_o_ai vec_vmrghh(vector pixel __a,
-                                            vector pixel __b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_vmrghh(vector pixel __a,
+                                                       vector pixel __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
                                          0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
@@ -3435,31 +3513,32 @@
 
 #define __builtin_altivec_vmrghw vec_vmrghw
 
-static vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a,
+                                                     vector int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vmrghw(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmrghw(vector unsigned int __a, vector unsigned int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector bool int __ATTRS_o_ai vec_vmrghw(vector bool int __a,
-                                               vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vmrghw(vector bool int __a,
+                                                          vector bool int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
                                          0x14, 0x15, 0x16, 0x17));
 }
 
-static vector float __ATTRS_o_ai vec_vmrghw(vector float __a,
-                                            vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vmrghw(vector float __a,
+                                                       vector float __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
                                          0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
@@ -3468,39 +3547,39 @@
 
 /* vec_mergel */
 
-static vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mergel(vector signed char __a, vector signed char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
                                          0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
                                          0x0E, 0x1E, 0x0F, 0x1F));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_mergel(vector unsigned char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
                                          0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
                                          0x0E, 0x1E, 0x0F, 0x1F));
 }
 
-static vector bool char __ATTRS_o_ai vec_mergel(vector bool char __a,
-                                                vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_mergel(vector bool char __a, vector bool char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
                                          0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
                                          0x0E, 0x1E, 0x0F, 0x1F));
 }
 
-static vector short __ATTRS_o_ai vec_mergel(vector short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_mergel(vector short __a,
+                                                       vector short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_mergel(vector unsigned short __a, vector unsigned short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
@@ -3508,47 +3587,48 @@
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector bool short __ATTRS_o_ai vec_mergel(vector bool short __a,
-                                                 vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_mergel(vector bool short __a, vector bool short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector pixel __ATTRS_o_ai vec_mergel(vector pixel __a,
-                                            vector pixel __b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_mergel(vector pixel __a,
+                                                       vector pixel __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector int __ATTRS_o_ai vec_mergel(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_mergel(vector int __a,
+                                                     vector int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_mergel(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector bool int __ATTRS_o_ai vec_mergel(vector bool int __a,
-                                               vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_mergel(vector bool int __a,
+                                                          vector bool int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector float __ATTRS_o_ai vec_mergel(vector float __a,
-                                            vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_mergel(vector float __a,
+                                                       vector float __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
@@ -3556,84 +3636,74 @@
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mergel(vector signed long long __a, vector signed long long __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mergel(vector signed long long __a, vector bool long long __b) {
   return vec_perm(__a, (vector signed long long)__b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mergel(vector bool long long __a, vector signed long long __b) {
   return vec_perm((vector signed long long)__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mergel(vector unsigned long long __a, vector bool long long __b) {
   return vec_perm(__a, (vector unsigned long long)__b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mergel(vector bool long long __a, vector unsigned long long __b) {
   return vec_perm((vector unsigned long long)__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_mergel(vector bool long long __a, vector bool long long __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector double __ATTRS_o_ai
-vec_mergel(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_mergel(vector double __a,
+                                                        vector double __b) {
   return vec_perm(__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector double __ATTRS_o_ai
+static __inline__ vector double __ATTRS_o_ai
 vec_mergel(vector double __a, vector bool long long __b) {
   return vec_perm(__a, (vector double)__b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
-static vector double __ATTRS_o_ai
+static __inline__ vector double __ATTRS_o_ai
 vec_mergel(vector bool long long __a, vector double __b) {
   return vec_perm((vector double)__a, __b,
-                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
-                                         0x0C, 0x0D, 0x0E, 0x0F,
-                                         0x18, 0X19, 0x1A, 0x1B,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 #endif
@@ -3642,24 +3712,24 @@
 
 #define __builtin_altivec_vmrglb vec_vmrglb
 
-static vector signed char __ATTRS_o_ai vec_vmrglb(vector signed char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmrglb(vector signed char __a, vector signed char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
                                          0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
                                          0x0E, 0x1E, 0x0F, 0x1F));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vmrglb(vector unsigned char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmrglb(vector unsigned char __a, vector unsigned char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
                                          0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
                                          0x0E, 0x1E, 0x0F, 0x1F));
 }
 
-static vector bool char __ATTRS_o_ai vec_vmrglb(vector bool char __a,
-                                                vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vmrglb(vector bool char __a, vector bool char __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
                                          0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
@@ -3670,15 +3740,15 @@
 
 #define __builtin_altivec_vmrglh vec_vmrglh
 
-static vector short __ATTRS_o_ai vec_vmrglh(vector short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vmrglh(vector short __a,
+                                                       vector short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vmrglh(vector unsigned short __a, vector unsigned short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
@@ -3686,16 +3756,16 @@
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector bool short __ATTRS_o_ai vec_vmrglh(vector bool short __a,
-                                                 vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vmrglh(vector bool short __a, vector bool short __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
                                          0x0E, 0x0F, 0x1E, 0x1F));
 }
 
-static vector pixel __ATTRS_o_ai vec_vmrglh(vector pixel __a,
-                                            vector pixel __b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_vmrglh(vector pixel __a,
+                                                       vector pixel __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
@@ -3706,215 +3776,226 @@
 
 #define __builtin_altivec_vmrglw vec_vmrglw
 
-static vector int __ATTRS_o_ai vec_vmrglw(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vmrglw(vector int __a,
+                                                     vector int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vmrglw(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmrglw(vector unsigned int __a, vector unsigned int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector bool int __ATTRS_o_ai vec_vmrglw(vector bool int __a,
-                                               vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vmrglw(vector bool int __a,
+                                                          vector bool int __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector float __ATTRS_o_ai vec_vmrglw(vector float __a,
-                                            vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vmrglw(vector float __a,
+                                                       vector float __b) {
   return vec_perm(__a, __b,
                   (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
                                          0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-
 #ifdef __POWER8_VECTOR__
 /* vec_mergee */
 
-static vector bool int __ATTRS_o_ai
-vec_mergee(vector bool int __a, vector bool int __b) {
-  return vec_perm(__a, __b, (vector unsigned char)
-                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+static __inline__ vector bool int __ATTRS_o_ai vec_mergee(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
 }
 
-static vector signed int __ATTRS_o_ai
+static __inline__ vector signed int __ATTRS_o_ai
 vec_mergee(vector signed int __a, vector signed int __b) {
-  return vec_perm(__a, __b, (vector unsigned char)
-                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_mergee(vector unsigned int __a, vector unsigned int __b) {
-  return vec_perm(__a, __b, (vector unsigned char)
-                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
 }
 
 /* vec_mergeo */
 
-static vector bool int  __ATTRS_o_ai
-vec_mergeo(vector bool int __a, vector bool int __b) {
-  return vec_perm(__a, __b, (vector unsigned char)
-                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector signed int  __ATTRS_o_ai
+static __inline__ vector signed int __ATTRS_o_ai
 vec_mergeo(vector signed int __a, vector signed int __b) {
-  return vec_perm(__a, __b, (vector unsigned char)
-                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
 }
 
-static vector unsigned int  __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
-  return vec_perm(__a, __b, (vector unsigned char)
-                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
 }
 
 #endif
 
 /* vec_mfvscr */
 
-static vector unsigned short __attribute__((__always_inline__))
+static __inline__ vector unsigned short __attribute__((__always_inline__))
 vec_mfvscr(void) {
   return __builtin_altivec_mfvscr();
 }
 
 /* vec_min */
 
-static vector signed char __ATTRS_o_ai vec_min(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vminsb(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_min(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vminsb((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_min(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vminsb(__a, (vector signed char)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_min(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vminub(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_min(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vminub((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_min(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
 }
 
-static vector short __ATTRS_o_ai vec_min(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_min(vector short __a,
+                                                    vector short __b) {
   return __builtin_altivec_vminsh(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_min(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_min(vector bool short __a,
+                                                    vector short __b) {
   return __builtin_altivec_vminsh((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_min(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_min(vector short __a,
+                                                    vector bool short __b) {
   return __builtin_altivec_vminsh(__a, (vector short)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_min(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vminuh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_min(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_min(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
 }
 
-static vector int __ATTRS_o_ai vec_min(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_min(vector int __a,
+                                                  vector int __b) {
   return __builtin_altivec_vminsw(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_min(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_min(vector bool int __a,
+                                                  vector int __b) {
   return __builtin_altivec_vminsw((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_min(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_min(vector int __a,
+                                                  vector bool int __b) {
   return __builtin_altivec_vminsw(__a, (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_min(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vminuw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_min(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_min(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_min(vector signed long long __a, vector signed long long __b) {
   return __builtin_altivec_vminsd(__a, __b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_min(vector bool long long __a, vector signed long long __b) {
   return __builtin_altivec_vminsd((vector signed long long)__a, __b);
 }
 
-static vector signed long long __ATTRS_o_ai vec_min(vector signed long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector bool long long __b) {
   return __builtin_altivec_vminsd(__a, (vector signed long long)__b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_min(vector unsigned long long __a, vector unsigned long long __b) {
   return __builtin_altivec_vminud(__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_min(vector bool long long __a, vector unsigned long long __b) {
   return __builtin_altivec_vminud((vector unsigned long long)__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_min(vector unsigned long long __a, vector bool long long __b) {
   return __builtin_altivec_vminud(__a, (vector unsigned long long)__b);
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_min(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_min(vector float __a,
+                                                    vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvminsp(__a, __b);
 #else
@@ -3923,114 +4004,117 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_min(vector double __a,
-                                          vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_min(vector double __a,
+                                                     vector double __b) {
   return __builtin_vsx_xvmindp(__a, __b);
 }
 #endif
 
 /* vec_vminsb */
 
-static vector signed char __ATTRS_o_ai vec_vminsb(vector signed char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vminsb(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vminsb(vector bool char __a,
-                                                  vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vminsb((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vminsb(vector signed char __a,
-                                                  vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vminsb(__a, (vector signed char)__b);
 }
 
 /* vec_vminub */
 
-static vector unsigned char __ATTRS_o_ai vec_vminub(vector unsigned char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vminub(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vminub(vector bool char __a,
-                                                    vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vminub((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vminub(vector unsigned char __a,
-                                                    vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
 }
 
 /* vec_vminsh */
 
-static vector short __ATTRS_o_ai vec_vminsh(vector short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                                       vector short __b) {
   return __builtin_altivec_vminsh(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vminsh(vector bool short __a,
-                                            vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector bool short __a,
+                                                       vector short __b) {
   return __builtin_altivec_vminsh((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vminsh(vector short __a,
-                                            vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                                       vector bool short __b) {
   return __builtin_altivec_vminsh(__a, (vector short)__b);
 }
 
 /* vec_vminuh */
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vminuh(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vminuh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vminuh(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vminuh(vector unsigned short __a,
-                                                     vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
 }
 
 /* vec_vminsw */
 
-static vector int __ATTRS_o_ai vec_vminsw(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector int __a,
+                                                     vector int __b) {
   return __builtin_altivec_vminsw(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vminsw(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector bool int __a,
+                                                     vector int __b) {
   return __builtin_altivec_vminsw((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vminsw(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector int __a,
+                                                     vector bool int __b) {
   return __builtin_altivec_vminsw(__a, (vector int)__b);
 }
 
 /* vec_vminuw */
 
-static vector unsigned int __ATTRS_o_ai vec_vminuw(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vminuw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vminuw(vector bool int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vminuw(vector unsigned int __a,
-                                                   vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
 }
 
 /* vec_vminfp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vminfp(vector float __a, vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvminsp(__a, __b);
@@ -4043,49 +4127,48 @@
 
 #define __builtin_altivec_vmladduhm vec_mladd
 
-static vector short __ATTRS_o_ai vec_mladd(vector short __a, vector short __b,
-                                           vector short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_mladd(vector short __a,
+                                                      vector short __b,
+                                                      vector short __c) {
   return __a * __b + __c;
 }
 
-static vector short __ATTRS_o_ai vec_mladd(vector short __a,
-                                           vector unsigned short __b,
-                                           vector unsigned short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_mladd(
+    vector short __a, vector unsigned short __b, vector unsigned short __c) {
   return __a * (vector short)__b + (vector short)__c;
 }
 
-static vector short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
-                                           vector short __b, vector short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                                      vector short __b,
+                                                      vector short __c) {
   return (vector short)__a * __b + __c;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
-                                                    vector unsigned short __b,
-                                                    vector unsigned short __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
   return __a * __b + __c;
 }
 
 /* vec_vmladduhm */
 
-static vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
-                                               vector short __b,
-                                               vector short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                                          vector short __b,
+                                                          vector short __c) {
   return __a * __b + __c;
 }
 
-static vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
-                                               vector unsigned short __b,
-                                               vector unsigned short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_vmladduhm(
+    vector short __a, vector unsigned short __b, vector unsigned short __c) {
   return __a * (vector short)__b + (vector short)__c;
 }
 
-static vector short __ATTRS_o_ai vec_vmladduhm(vector unsigned short __a,
-                                               vector short __b,
-                                               vector short __c) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector short __b, vector short __c) {
   return (vector short)__a * __b + __c;
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vmladduhm(vector unsigned short __a, vector unsigned short __b,
               vector unsigned short __c) {
   return __a * __b + __c;
@@ -4093,53 +4176,54 @@
 
 /* vec_mradds */
 
-static vector short __attribute__((__always_inline__))
+static __inline__ vector short __attribute__((__always_inline__))
 vec_mradds(vector short __a, vector short __b, vector short __c) {
   return __builtin_altivec_vmhraddshs(__a, __b, __c);
 }
 
 /* vec_vmhraddshs */
 
-static vector short __attribute__((__always_inline__))
+static __inline__ vector short __attribute__((__always_inline__))
 vec_vmhraddshs(vector short __a, vector short __b, vector short __c) {
   return __builtin_altivec_vmhraddshs(__a, __b, __c);
 }
 
 /* vec_msum */
 
-static vector int __ATTRS_o_ai vec_msum(vector signed char __a,
-                                        vector unsigned char __b,
-                                        vector int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_msum(vector signed char __a,
+                                                   vector unsigned char __b,
+                                                   vector int __c) {
   return __builtin_altivec_vmsummbm(__a, __b, __c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_msum(vector unsigned char __a,
-                                                 vector unsigned char __b,
-                                                 vector unsigned int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned int __c) {
   return __builtin_altivec_vmsumubm(__a, __b, __c);
 }
 
-static vector int __ATTRS_o_ai vec_msum(vector short __a, vector short __b,
-                                        vector int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_msum(vector short __a,
+                                                   vector short __b,
+                                                   vector int __c) {
   return __builtin_altivec_vmsumshm(__a, __b, __c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_msum(vector unsigned short __a,
-                                                 vector unsigned short __b,
-                                                 vector unsigned int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned int __c) {
   return __builtin_altivec_vmsumuhm(__a, __b, __c);
 }
 
 /* vec_vmsummbm */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c) {
   return __builtin_altivec_vmsummbm(__a, __b, __c);
 }
 
 /* vec_vmsumubm */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vmsumubm(vector unsigned char __a, vector unsigned char __b,
              vector unsigned int __c) {
   return __builtin_altivec_vmsumubm(__a, __b, __c);
@@ -4147,14 +4231,14 @@
 
 /* vec_vmsumshm */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vmsumshm(vector short __a, vector short __b, vector int __c) {
   return __builtin_altivec_vmsumshm(__a, __b, __c);
 }
 
 /* vec_vmsumuhm */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vmsumuhm(vector unsigned short __a, vector unsigned short __b,
              vector unsigned int __c) {
   return __builtin_altivec_vmsumuhm(__a, __b, __c);
@@ -4162,27 +4246,28 @@
 
 /* vec_msums */
 
-static vector int __ATTRS_o_ai vec_msums(vector short __a, vector short __b,
-                                         vector int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_msums(vector short __a,
+                                                    vector short __b,
+                                                    vector int __c) {
   return __builtin_altivec_vmsumshs(__a, __b, __c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_msums(vector unsigned short __a,
-                                                  vector unsigned short __b,
-                                                  vector unsigned int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msums(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
   return __builtin_altivec_vmsumuhs(__a, __b, __c);
 }
 
 /* vec_vmsumshs */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vmsumshs(vector short __a, vector short __b, vector int __c) {
   return __builtin_altivec_vmsumshs(__a, __b, __c);
 }
 
 /* vec_vmsumuhs */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vmsumuhs(vector unsigned short __a, vector unsigned short __b,
              vector unsigned int __c) {
   return __builtin_altivec_vmsumuhs(__a, __b, __c);
@@ -4190,47 +4275,47 @@
 
 /* vec_mtvscr */
 
-static void __ATTRS_o_ai vec_mtvscr(vector signed char __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector signed char __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector unsigned char __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned char __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector bool char __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool char __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector short __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector short __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector unsigned short __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned short __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector bool short __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool short __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector pixel __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector pixel __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector int __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector int __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector unsigned int __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned int __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector bool int __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool int __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
-static void __ATTRS_o_ai vec_mtvscr(vector float __a) {
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector float __a) {
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
@@ -4240,55 +4325,56 @@
    elements separately, then truncating the results and moving to the
    result vector.
 */
-static vector signed char __ATTRS_o_ai vec_mul(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mul(vector signed char __a, vector signed char __b) {
   return __a * __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_mul(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mul(vector unsigned char __a, vector unsigned char __b) {
   return __a * __b;
 }
 
-static vector signed short __ATTRS_o_ai vec_mul(vector signed short __a,
-                                                vector signed short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mul(vector signed short __a, vector signed short __b) {
   return __a * __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_mul(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mul(vector unsigned short __a, vector unsigned short __b) {
   return __a * __b;
 }
 
-static vector signed int __ATTRS_o_ai vec_mul(vector signed int __a,
-                                              vector signed int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mul(vector signed int __a, vector signed int __b) {
   return __a * __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_mul(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mul(vector unsigned int __a, vector unsigned int __b) {
   return __a * __b;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_mul(vector signed long long __a, vector signed long long __b) {
   return __a * __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mul(vector unsigned long long __a, vector unsigned long long __b) {
   return __a * __b;
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_mul(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_mul(vector float __a,
+                                                    vector float __b) {
   return __a * __b;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai
-vec_mul(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_mul(vector double __a,
+                                                     vector double __b) {
   return __a * __b;
 }
 #endif
@@ -4298,8 +4384,8 @@
 
 /* vec_mule */
 
-static vector short __ATTRS_o_ai vec_mule(vector signed char __a,
-                                          vector signed char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a,
+                                                     vector signed char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulosb(__a, __b);
 #else
@@ -4307,8 +4393,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_mule(vector unsigned char __a,
-                                                   vector unsigned char __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuloub(__a, __b);
 #else
@@ -4316,7 +4402,8 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_mule(vector short __a, vector short __b) {
+static __inline__ vector int __ATTRS_o_ai vec_mule(vector short __a,
+                                                   vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulosh(__a, __b);
 #else
@@ -4324,8 +4411,8 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_mule(vector unsigned short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulouh(__a, __b);
 #else
@@ -4334,8 +4421,8 @@
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai vec_mule(vector signed int __a,
-                                                     vector signed int __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mule(vector signed int __a, vector signed int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulosw(__a, __b);
 #else
@@ -4343,7 +4430,7 @@
 #endif
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mule(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulouw(__a, __b);
@@ -4355,7 +4442,7 @@
 
 /* vec_vmulesb */
 
-static vector short __attribute__((__always_inline__))
+static __inline__ vector short __attribute__((__always_inline__))
 vec_vmulesb(vector signed char __a, vector signed char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulosb(__a, __b);
@@ -4366,7 +4453,7 @@
 
 /* vec_vmuleub */
 
-static vector unsigned short __attribute__((__always_inline__))
+static __inline__ vector unsigned short __attribute__((__always_inline__))
 vec_vmuleub(vector unsigned char __a, vector unsigned char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuloub(__a, __b);
@@ -4377,7 +4464,7 @@
 
 /* vec_vmulesh */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vmulesh(vector short __a, vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulosh(__a, __b);
@@ -4388,7 +4475,7 @@
 
 /* vec_vmuleuh */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulouh(__a, __b);
@@ -4399,8 +4486,8 @@
 
 /* vec_mulo */
 
-static vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
-                                          vector signed char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
+                                                     vector signed char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulesb(__a, __b);
 #else
@@ -4408,8 +4495,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_mulo(vector unsigned char __a,
-                                                   vector unsigned char __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuleub(__a, __b);
 #else
@@ -4417,7 +4504,8 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_mulo(vector short __a, vector short __b) {
+static __inline__ vector int __ATTRS_o_ai vec_mulo(vector short __a,
+                                                   vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulesh(__a, __b);
 #else
@@ -4425,8 +4513,8 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_mulo(vector unsigned short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuleuh(__a, __b);
 #else
@@ -4435,8 +4523,8 @@
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai vec_mulo(vector signed int __a,
-                                                     vector signed int __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mulo(vector signed int __a, vector signed int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulesw(__a, __b);
 #else
@@ -4444,7 +4532,7 @@
 #endif
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_mulo(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuleuw(__a, __b);
@@ -4456,7 +4544,7 @@
 
 /* vec_vmulosb */
 
-static vector short __attribute__((__always_inline__))
+static __inline__ vector short __attribute__((__always_inline__))
 vec_vmulosb(vector signed char __a, vector signed char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulesb(__a, __b);
@@ -4467,7 +4555,7 @@
 
 /* vec_vmuloub */
 
-static vector unsigned short __attribute__((__always_inline__))
+static __inline__ vector unsigned short __attribute__((__always_inline__))
 vec_vmuloub(vector unsigned char __a, vector unsigned char __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuleub(__a, __b);
@@ -4478,7 +4566,7 @@
 
 /* vec_vmulosh */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vmulosh(vector short __a, vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmulesh(__a, __b);
@@ -4489,7 +4577,7 @@
 
 /* vec_vmulouh */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vmulouh(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vmuleuh(__a, __b);
@@ -4501,140 +4589,137 @@
 /*  vec_nand */
 
 #ifdef __POWER8_VECTOR__
-static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector signed char __a, vector signed char __b) {
   return ~(__a & __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
-                                                vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector signed char __a, vector bool char __b) {
   return ~(__a & __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_nand(vector bool char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector bool char __a, vector signed char __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector unsigned char __a, vector unsigned char __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
-                                                  vector bool char __b) {
-  return ~(__a & __b);
-
-}
-
-static vector unsigned char __ATTRS_o_ai vec_nand(vector bool char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector unsigned char __a, vector bool char __b) {
   return ~(__a & __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
-                                              vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector bool char __a, vector unsigned char __b) {
   return ~(__a & __b);
 }
 
-static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
-                                                 vector signed short __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                         vector bool char __b) {
   return ~(__a & __b);
 }
 
-static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
-                                                 vector bool short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector signed short __a, vector signed short __b) {
   return ~(__a & __b);
 }
 
-static vector signed short __ATTRS_o_ai vec_nand(vector bool short __a,
-                                                 vector signed short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector signed short __a, vector bool short __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector bool short __a, vector signed short __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
-                                                   vector bool short __b) {
-  return ~(__a & __b);
-
-}
-
-static vector bool short __ATTRS_o_ai vec_nand(vector bool short __a,
-                                               vector bool short __b) {
-  return ~(__a & __b);
-
-}
-
-static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
-                                               vector signed int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nand(vector unsigned short __a, vector unsigned short __b) {
   return ~(__a & __b);
 }
 
-static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
-                                               vector bool int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nand(vector unsigned short __a, vector bool short __b) {
   return ~(__a & __b);
 }
 
-static vector signed int __ATTRS_o_ai vec_nand(vector bool int __a,
-                                               vector signed int __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_nand(vector bool short __a, vector bool short __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nand(vector signed int __a, vector signed int __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
-                                                 vector bool int __b) {
+static __inline__ vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                                          vector bool int __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_nand(vector bool int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nand(vector bool int __a, vector signed int __b) {
   return ~(__a & __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
-                                             vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector unsigned int __a, vector unsigned int __b) {
   return ~(__a & __b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector unsigned int __a, vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector bool int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                                        vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_nand(vector signed long long __a, vector signed long long __b) {
   return ~(__a & __b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_nand(vector signed long long __a, vector bool long long __b) {
   return ~(__a & __b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_nand(vector bool long long __a, vector signed long long __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_nand(vector unsigned long long __a, vector bool long long __b) {
   return ~(__a & __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_nand(vector bool long long __a, vector unsigned long long __b) {
   return ~(__a & __b);
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_nand(vector bool long long __a, vector bool long long __b) {
   return ~(__a & __b);
 }
@@ -4644,21 +4729,24 @@
 /* vec_nmadd */
 
 #ifdef __VSX__
-static vector float __ATTRS_o_ai
-vec_nmadd(vector float __a, vector float __b, vector float __c) {
+static __inline__ vector float __ATTRS_o_ai vec_nmadd(vector float __a,
+                                                      vector float __b,
+                                                      vector float __c) {
   return __builtin_vsx_xvnmaddasp(__a, __b, __c);
 }
 
-static vector double __ATTRS_o_ai
-vec_nmadd(vector double __a, vector double __b, vector double __c) {
+static __inline__ vector double __ATTRS_o_ai vec_nmadd(vector double __a,
+                                                       vector double __b,
+                                                       vector double __c) {
   return __builtin_vsx_xvnmaddadp(__a, __b, __c);
 }
 #endif
 
 /* vec_nmsub */
 
-static vector float __ATTRS_o_ai
-vec_nmsub(vector float __a, vector float __b, vector float __c) {
+static __inline__ vector float __ATTRS_o_ai vec_nmsub(vector float __a,
+                                                      vector float __b,
+                                                      vector float __c) {
 #ifdef __VSX__
   return __builtin_vsx_xvnmsubasp(__a, __b, __c);
 #else
@@ -4667,15 +4755,16 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai
-vec_nmsub(vector double __a, vector double __b, vector double __c) {
+static __inline__ vector double __ATTRS_o_ai vec_nmsub(vector double __a,
+                                                       vector double __b,
+                                                       vector double __c) {
   return __builtin_vsx_xvnmsubadp(__a, __b, __c);
 }
 #endif
 
 /* vec_vnmsubfp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vnmsubfp(vector float __a, vector float __b, vector float __c) {
   return __builtin_altivec_vnmsubfp(__a, __b, __c);
 }
@@ -4684,58 +4773,61 @@
 
 #define __builtin_altivec_vnor vec_nor
 
-static vector signed char __ATTRS_o_ai vec_nor(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nor(vector signed char __a, vector signed char __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_nor(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
   return ~(__a | __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_nor(vector bool char __a,
-                                             vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_nor(vector bool char __a,
+                                                        vector bool char __b) {
   return ~(__a | __b);
 }
 
-static vector short __ATTRS_o_ai vec_nor(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_nor(vector short __a,
+                                                    vector short __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_nor(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
   return ~(__a | __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_nor(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_nor(vector bool short __a, vector bool short __b) {
   return ~(__a | __b);
 }
 
-static vector int __ATTRS_o_ai vec_nor(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_nor(vector int __a,
+                                                  vector int __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_nor(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
   return ~(__a | __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_nor(vector bool int __a,
-                                            vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_nor(vector bool int __a,
+                                                       vector bool int __b) {
   return ~(__a | __b);
 }
 
-static vector float __ATTRS_o_ai vec_nor(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_nor(vector float __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       ~((vector unsigned int)__a | (vector unsigned int)__b);
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai
-vec_nor(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_nor(vector double __a,
+                                                     vector double __b) {
   vector unsigned long long __res =
       ~((vector unsigned long long)__a | (vector unsigned long long)__b);
   return (vector double)__res;
@@ -4744,68 +4836,71 @@
 
 /* vec_vnor */
 
-static vector signed char __ATTRS_o_ai vec_vnor(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vnor(vector signed char __a, vector signed char __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vnor(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vnor(vector unsigned char __a, vector unsigned char __b) {
   return ~(__a | __b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vnor(vector bool char __a,
-                                              vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vnor(vector bool char __a,
+                                                         vector bool char __b) {
   return ~(__a | __b);
 }
 
-static vector short __ATTRS_o_ai vec_vnor(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vnor(vector short __a,
+                                                     vector short __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vnor(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vnor(vector unsigned short __a, vector unsigned short __b) {
   return ~(__a | __b);
 }
 
-static vector bool short __ATTRS_o_ai vec_vnor(vector bool short __a,
-                                               vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vnor(vector bool short __a, vector bool short __b) {
   return ~(__a | __b);
 }
 
-static vector int __ATTRS_o_ai vec_vnor(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vnor(vector int __a,
+                                                   vector int __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vnor(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vnor(vector unsigned int __a, vector unsigned int __b) {
   return ~(__a | __b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vnor(vector bool int __a,
-                                             vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vnor(vector bool int __a,
+                                                        vector bool int __b) {
   return ~(__a | __b);
 }
 
-static vector float __ATTRS_o_ai vec_vnor(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vnor(vector float __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       ~((vector unsigned int)__a | (vector unsigned int)__b);
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_nor(vector signed long long __a, vector signed long long __b) {
   return ~(__a | __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
   return ~(__a | __b);
 }
 
-static vector bool long long __ATTRS_o_ai vec_nor(vector bool long long __a,
-                                                  vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_nor(vector bool long long __a, vector bool long long __b) {
   return ~(__a | __b);
 }
 #endif
@@ -4814,315 +4909,323 @@
 
 #define __builtin_altivec_vor vec_or
 
-static vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
-                                              vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector signed char __b) {
   return __a | __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_or(vector bool char __a,
-                                              vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_or(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a | __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
-                                              vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                                         vector bool char __b) {
   return __a | (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_or(vector unsigned char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector unsigned char __b) {
   return __a | __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_or(vector bool char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a | __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_or(vector unsigned char __a,
-                                                vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector bool char __b) {
   return __a | (vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_or(vector bool char __a,
-                                            vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_or(vector bool char __a,
+                                                       vector bool char __b) {
   return __a | __b;
 }
 
-static vector short __ATTRS_o_ai vec_or(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_or(vector short __a,
+                                                   vector short __b) {
   return __a | __b;
 }
 
-static vector short __ATTRS_o_ai vec_or(vector bool short __a,
-                                        vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                   vector short __b) {
   return (vector short)__a | __b;
 }
 
-static vector short __ATTRS_o_ai vec_or(vector short __a,
-                                        vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_or(vector short __a,
+                                                   vector bool short __b) {
   return __a | (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_or(vector unsigned short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector unsigned short __b) {
   return __a | __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_or(vector bool short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a | __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_or(vector unsigned short __a,
-                                                 vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector bool short __b) {
   return __a | (vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_or(vector bool short __a,
-                                             vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                        vector bool short __b) {
   return __a | __b;
 }
 
-static vector int __ATTRS_o_ai vec_or(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_or(vector int __a,
+                                                 vector int __b) {
   return __a | __b;
 }
 
-static vector int __ATTRS_o_ai vec_or(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_or(vector bool int __a,
+                                                 vector int __b) {
   return (vector int)__a | __b;
 }
 
-static vector int __ATTRS_o_ai vec_or(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_or(vector int __a,
+                                                 vector bool int __b) {
   return __a | (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_or(vector unsigned int __a,
-                                               vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector unsigned int __b) {
   return __a | __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_or(vector bool int __a,
-                                               vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a | __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_or(vector unsigned int __a,
-                                               vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector bool int __b) {
   return __a | (vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_or(vector bool int __a,
-                                           vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_or(vector bool int __a,
+                                                      vector bool int __b) {
   return __a | __b;
 }
 
-static vector float __ATTRS_o_ai vec_or(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
+                                                   vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a | (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_or(vector bool int __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_or(vector bool int __a,
+                                                   vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a | (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_or(vector float __a, vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
+                                                   vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a | (vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_or(vector bool long long __a,
-                                         vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_or(vector bool long long __a,
+                                                    vector double __b) {
   return (vector unsigned long long)__a | (vector unsigned long long)__b;
 }
 
-static vector double __ATTRS_o_ai vec_or(vector double __a,
-                                         vector bool long long __b) {
+static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
+                                                    vector bool long long __b) {
   return (vector unsigned long long)__a | (vector unsigned long long)__b;
 }
 
-static vector double __ATTRS_o_ai vec_or(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
+                                                    vector double __b) {
   vector unsigned long long __res =
       (vector unsigned long long)__a | (vector unsigned long long)__b;
   return (vector double)__res;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_or(vector signed long long __a, vector signed long long __b) {
   return __a | __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_or(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a | __b;
 }
 
-static vector signed long long __ATTRS_o_ai vec_or(vector signed long long __a,
-                                                   vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector bool long long __b) {
   return __a | (vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_or(vector unsigned long long __a, vector unsigned long long __b) {
   return __a | __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_or(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a | __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_or(vector unsigned long long __a, vector bool long long __b) {
   return __a | (vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_or(vector bool long long __a,
-                                                 vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector bool long long __b) {
   return __a | __b;
 }
 #endif
 
 #ifdef __POWER8_VECTOR__
-static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector signed char __a, vector signed char __b) {
   return __a | ~__b;
 }
 
-static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector signed char __a, vector bool char __b) {
   return __a | ~__b;
 }
 
-static vector signed char __ATTRS_o_ai vec_orc(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector bool char __a, vector signed char __b) {
   return __a | ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector unsigned char __a, vector unsigned char __b) {
   return __a | ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector unsigned char __a, vector bool char __b) {
   return __a | ~__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_orc(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector bool char __a, vector unsigned char __b) {
   return __a | ~__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
-                                             vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                                        vector bool char __b) {
   return __a | ~__b;
 }
 
-static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
-                                                vector signed short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector signed short __a, vector signed short __b) {
   return __a | ~__b;
 }
 
-static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
-                                                vector bool short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector signed short __a, vector bool short __b) {
   return __a | ~__b;
 }
 
-static vector signed short __ATTRS_o_ai vec_orc(vector bool short __a,
-                                                vector signed short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector signed short __b) {
   return __a | ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector unsigned short __a, vector unsigned short __b) {
   return __a | ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector unsigned short __a, vector bool short __b) {
   return __a | ~__b;
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_orc(vector bool short __a, vector unsigned short __b) {
   return __a | ~__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_orc(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector bool short __b) {
   return __a | ~__b;
 }
 
-static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
-                                              vector signed int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_orc(vector signed int __a, vector signed int __b) {
   return __a | ~__b;
 }
 
-static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
-                                              vector bool int __b) {
+static __inline__ vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                                         vector bool int __b) {
   return __a | ~__b;
 }
 
-static vector signed int __ATTRS_o_ai vec_orc(vector bool int __a,
-                                              vector signed int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_orc(vector bool int __a, vector signed int __b) {
   return __a | ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector unsigned int __a, vector unsigned int __b) {
   return __a | ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector unsigned int __a, vector bool int __b) {
   return __a | ~__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_orc(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector bool int __a, vector unsigned int __b) {
   return __a | ~__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
-                                            vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                                       vector bool int __b) {
   return __a | ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_orc(vector signed long long __a, vector signed long long __b) {
   return __a | ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai vec_orc(vector signed long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector bool long long __b) {
   return __a | ~__b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_orc(vector bool long long __a, vector signed long long __b) {
   return __a | ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
   return __a | ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_orc(vector unsigned long long __a, vector bool long long __b) {
   return __a | ~__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_orc(vector bool long long __a, vector unsigned long long __b) {
   return __a | ~__b;
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_orc(vector bool long long __a, vector bool long long __b) {
   return __a | ~__b;
 }
@@ -5130,160 +5233,165 @@
 
 /* vec_vor */
 
-static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector signed char __b) {
   return __a | __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vor(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a | __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector bool char __b) {
   return __a | (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vor(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector unsigned char __b) {
   return __a | __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vor(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a | __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vor(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector bool char __b) {
   return __a | (vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_vor(vector bool char __a,
-                                             vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                                        vector bool char __b) {
   return __a | __b;
 }
 
-static vector short __ATTRS_o_ai vec_vor(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                                    vector short __b) {
   return __a | __b;
 }
 
-static vector short __ATTRS_o_ai vec_vor(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                                    vector short __b) {
   return (vector short)__a | __b;
 }
 
-static vector short __ATTRS_o_ai vec_vor(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                                    vector bool short __b) {
   return __a | (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vor(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector unsigned short __b) {
   return __a | __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vor(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a | __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vor(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector bool short __b) {
   return __a | (vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_vor(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector bool short __b) {
   return __a | __b;
 }
 
-static vector int __ATTRS_o_ai vec_vor(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector int __a,
+                                                  vector int __b) {
   return __a | __b;
 }
 
-static vector int __ATTRS_o_ai vec_vor(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                  vector int __b) {
   return (vector int)__a | __b;
 }
 
-static vector int __ATTRS_o_ai vec_vor(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector int __a,
+                                                  vector bool int __b) {
   return __a | (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vor(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector unsigned int __b) {
   return __a | __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vor(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a | __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vor(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector bool int __b) {
   return __a | (vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_vor(vector bool int __a,
-                                            vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                       vector bool int __b) {
   return __a | __b;
 }
 
-static vector float __ATTRS_o_ai vec_vor(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a | (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vor(vector bool int __a,
-                                         vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a | (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vor(vector float __a,
-                                         vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                                    vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a | (vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vor(vector signed long long __a, vector signed long long __b) {
   return __a | __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vor(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a | __b;
 }
 
-static vector signed long long __ATTRS_o_ai vec_vor(vector signed long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector bool long long __b) {
   return __a | (vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vor(vector unsigned long long __a, vector unsigned long long __b) {
   return __a | __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vor(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a | __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vor(vector unsigned long long __a, vector bool long long __b) {
   return __a | (vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_vor(vector bool long long __a,
-                                                  vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector bool long long __b) {
   return __a | __b;
 }
 #endif
@@ -5293,8 +5401,8 @@
 /* The various vector pack instructions have a big-endian bias, so for
    little endian we must handle reversed element numbering.  */
 
-static vector signed char __ATTRS_o_ai vec_pack(vector signed short __a,
-                                                vector signed short __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_pack(vector signed short __a, vector signed short __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector signed char)vec_perm(
       __a, __b,
@@ -5308,8 +5416,8 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai vec_pack(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned char)vec_perm(
       __a, __b,
@@ -5323,8 +5431,8 @@
 #endif
 }
 
-static vector bool char __ATTRS_o_ai vec_pack(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_pack(vector bool short __a, vector bool short __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool char)vec_perm(
       __a, __b,
@@ -5338,7 +5446,8 @@
 #endif
 }
 
-static vector short __ATTRS_o_ai vec_pack(vector int __a, vector int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_pack(vector int __a,
+                                                     vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector short)vec_perm(
       __a, __b,
@@ -5352,8 +5461,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_pack(vector unsigned int __a,
-                                                   vector unsigned int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned short)vec_perm(
       __a, __b,
@@ -5367,8 +5476,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_pack(vector bool int __a,
-                                               vector bool int __b) {
+static __inline__ vector bool short __ATTRS_o_ai vec_pack(vector bool int __a,
+                                                          vector bool int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool short)vec_perm(
       __a, __b,
@@ -5383,8 +5492,8 @@
 }
 
 #ifdef __VSX__
-static vector signed int __ATTRS_o_ai vec_pack(vector signed long long __a,
-                                               vector signed long long __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_pack(vector signed long long __a, vector signed long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector signed int)vec_perm(
       __a, __b,
@@ -5397,7 +5506,7 @@
                              0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
 #endif
 }
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned int)vec_perm(
@@ -5412,8 +5521,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_pack(vector bool long long __a,
-                                             vector bool long long __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_pack(vector bool long long __a, vector bool long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool int)vec_perm(
       __a, __b,
@@ -5433,8 +5542,8 @@
 
 #define __builtin_altivec_vpkuhum vec_vpkuhum
 
-static vector signed char __ATTRS_o_ai vec_vpkuhum(vector signed short __a,
-                                                   vector signed short __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vpkuhum(vector signed short __a, vector signed short __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector signed char)vec_perm(
       __a, __b,
@@ -5448,7 +5557,7 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_vpkuhum(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned char)vec_perm(
@@ -5463,8 +5572,8 @@
 #endif
 }
 
-static vector bool char __ATTRS_o_ai vec_vpkuhum(vector bool short __a,
-                                                 vector bool short __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vpkuhum(vector bool short __a, vector bool short __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool char)vec_perm(
       __a, __b,
@@ -5482,7 +5591,8 @@
 
 #define __builtin_altivec_vpkuwum vec_vpkuwum
 
-static vector short __ATTRS_o_ai vec_vpkuwum(vector int __a, vector int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vpkuwum(vector int __a,
+                                                        vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector short)vec_perm(
       __a, __b,
@@ -5496,8 +5606,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vpkuwum(vector unsigned int __a,
-                                                      vector unsigned int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkuwum(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned short)vec_perm(
       __a, __b,
@@ -5511,8 +5621,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_vpkuwum(vector bool int __a,
-                                                  vector bool int __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vpkuwum(vector bool int __a, vector bool int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool short)vec_perm(
       __a, __b,
@@ -5531,8 +5641,8 @@
 #ifdef __POWER8_VECTOR__
 #define __builtin_altivec_vpkudum vec_vpkudum
 
-static vector int __ATTRS_o_ai vec_vpkudum(vector long long __a,
-                                           vector long long __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vpkudum(vector long long __a,
+                                                      vector long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector int)vec_perm(
       __a, __b,
@@ -5546,7 +5656,7 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned int)vec_perm(
@@ -5561,8 +5671,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_vpkudum(vector bool long long __a,
-                                                vector bool long long __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vpkudum(vector bool long long __a, vector bool long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool int)vec_perm(
       (vector long long)__a, (vector long long)__b,
@@ -5579,7 +5689,7 @@
 
 /* vec_packpx */
 
-static vector pixel __attribute__((__always_inline__))
+static __inline__ vector pixel __attribute__((__always_inline__))
 vec_packpx(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
@@ -5590,7 +5700,7 @@
 
 /* vec_vpkpx */
 
-static vector pixel __attribute__((__always_inline__))
+static __inline__ vector pixel __attribute__((__always_inline__))
 vec_vpkpx(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
@@ -5601,8 +5711,8 @@
 
 /* vec_packs */
 
-static vector signed char __ATTRS_o_ai vec_packs(vector short __a,
-                                                 vector short __b) {
+static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a,
+                                                            vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkshss(__b, __a);
 #else
@@ -5610,8 +5720,8 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai vec_packs(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuhus(__b, __a);
 #else
@@ -5619,8 +5729,8 @@
 #endif
 }
 
-static vector signed short __ATTRS_o_ai vec_packs(vector int __a,
-                                                  vector int __b) {
+static __inline__ vector signed short __ATTRS_o_ai vec_packs(vector int __a,
+                                                             vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkswss(__b, __a);
 #else
@@ -5628,8 +5738,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_packs(vector unsigned int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuwus(__b, __a);
 #else
@@ -5638,8 +5748,8 @@
 }
 
 #ifdef __POWER8_VECTOR__
-static vector int __ATTRS_o_ai vec_packs(vector long long __a,
-                                         vector long long __b) {
+static __inline__ vector int __ATTRS_o_ai vec_packs(vector long long __a,
+                                                    vector long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpksdss(__b, __a);
 #else
@@ -5647,7 +5757,7 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkudus(__b, __a);
@@ -5659,7 +5769,7 @@
 
 /* vec_vpkshss */
 
-static vector signed char __attribute__((__always_inline__))
+static __inline__ vector signed char __attribute__((__always_inline__))
 vec_vpkshss(vector short __a, vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkshss(__b, __a);
@@ -5671,8 +5781,8 @@
 /* vec_vpksdss */
 
 #ifdef __POWER8_VECTOR__
-static vector int __ATTRS_o_ai vec_vpksdss(vector long long __a,
-                                           vector long long __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vpksdss(vector long long __a,
+                                                      vector long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpksdss(__b, __a);
 #else
@@ -5683,7 +5793,7 @@
 
 /* vec_vpkuhus */
 
-static vector unsigned char __attribute__((__always_inline__))
+static __inline__ vector unsigned char __attribute__((__always_inline__))
 vec_vpkuhus(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuhus(__b, __a);
@@ -5695,7 +5805,7 @@
 /* vec_vpkudus */
 
 #ifdef __POWER8_VECTOR__
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkudus(__b, __a);
@@ -5707,7 +5817,7 @@
 
 /* vec_vpkswss */
 
-static vector signed short __attribute__((__always_inline__))
+static __inline__ vector signed short __attribute__((__always_inline__))
 vec_vpkswss(vector int __a, vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkswss(__b, __a);
@@ -5718,7 +5828,7 @@
 
 /* vec_vpkuwus */
 
-static vector unsigned short __attribute__((__always_inline__))
+static __inline__ vector unsigned short __attribute__((__always_inline__))
 vec_vpkuwus(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuwus(__b, __a);
@@ -5729,8 +5839,8 @@
 
 /* vec_packsu */
 
-static vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a,
-                                                    vector short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packsu(vector short __a, vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkshus(__b, __a);
 #else
@@ -5738,8 +5848,8 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai vec_packsu(vector unsigned short __a,
-                                                    vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuhus(__b, __a);
 #else
@@ -5747,8 +5857,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_packsu(vector int __a,
-                                                     vector int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packsu(vector int __a, vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkswus(__b, __a);
 #else
@@ -5756,8 +5866,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_packsu(vector unsigned int __a,
-                                                     vector unsigned int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuwus(__b, __a);
 #else
@@ -5766,8 +5876,8 @@
 }
 
 #ifdef __POWER8_VECTOR__
-static vector unsigned int __ATTRS_o_ai vec_packsu(vector long long __a,
-                                                   vector long long __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packsu(vector long long __a, vector long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpksdus(__b, __a);
 #else
@@ -5775,7 +5885,7 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkudus(__b, __a);
@@ -5787,8 +5897,8 @@
 
 /* vec_vpkshus */
 
-static vector unsigned char __ATTRS_o_ai vec_vpkshus(vector short __a,
-                                                     vector short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector short __a, vector short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkshus(__b, __a);
 #else
@@ -5796,7 +5906,7 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_vpkshus(vector unsigned short __a, vector unsigned short __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuhus(__b, __a);
@@ -5807,8 +5917,8 @@
 
 /* vec_vpkswus */
 
-static vector unsigned short __ATTRS_o_ai vec_vpkswus(vector int __a,
-                                                      vector int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector int __a, vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkswus(__b, __a);
 #else
@@ -5816,8 +5926,8 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vpkswus(vector unsigned int __a,
-                                                      vector unsigned int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector unsigned int __a, vector unsigned int __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpkuwus(__b, __a);
 #else
@@ -5828,8 +5938,8 @@
 /* vec_vpksdus */
 
 #ifdef __POWER8_VECTOR__
-static vector unsigned int __ATTRS_o_ai vec_vpksdus(vector long long __a,
-                                                    vector long long __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vpksdus(vector long long __a, vector long long __b) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vpksdus(__b, __a);
 #else
@@ -5848,9 +5958,8 @@
 // in that the vec_xor can be recognized as a vec_nor (and for P8 and
 // later, possibly a vec_nand).
 
-static vector signed char __ATTRS_o_ai vec_perm(vector signed char __a,
-                                                vector signed char __b,
-                                                vector unsigned char __c) {
+static __inline__ vector signed char __ATTRS_o_ai vec_perm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5863,9 +5972,9 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai vec_perm(vector unsigned char __a,
-                                                  vector unsigned char __b,
-                                                  vector unsigned char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5878,9 +5987,8 @@
 #endif
 }
 
-static vector bool char __ATTRS_o_ai vec_perm(vector bool char __a,
-                                              vector bool char __b,
-                                              vector unsigned char __c) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5893,9 +6001,9 @@
 #endif
 }
 
-static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
-                                          vector signed short __b,
-                                          vector unsigned char __c) {
+static __inline__ vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                                     vector signed short __b,
+                                                     vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5908,9 +6016,9 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
-                                                   vector unsigned short __b,
-                                                   vector unsigned char __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5923,9 +6031,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_perm(vector bool short __a,
-                                               vector bool short __b,
-                                               vector unsigned char __c) {
+static __inline__ vector bool short __ATTRS_o_ai vec_perm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5938,8 +6045,9 @@
 #endif
 }
 
-static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
-                                          vector unsigned char __c) {
+static __inline__ vector pixel __ATTRS_o_ai vec_perm(vector pixel __a,
+                                                     vector pixel __b,
+                                                     vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5952,9 +6060,9 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
-                                        vector signed int __b,
-                                        vector unsigned char __c) {
+static __inline__ vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                                   vector signed int __b,
+                                                   vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5965,9 +6073,9 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
-                                                 vector unsigned int __b,
-                                                 vector unsigned char __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5980,9 +6088,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_perm(vector bool int __a,
-                                             vector bool int __b,
-                                             vector unsigned char __c) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -5995,8 +6102,9 @@
 #endif
 }
 
-static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b,
-                                          vector unsigned char __c) {
+static __inline__ vector float __ATTRS_o_ai vec_perm(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -6010,9 +6118,9 @@
 }
 
 #ifdef __VSX__
-static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
-                                              vector signed long long __b,
-                                              vector unsigned char __c) {
+static __inline__ vector long long __ATTRS_o_ai
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -6025,7 +6133,7 @@
 #endif
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_perm(vector unsigned long long __a, vector unsigned long long __b,
          vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
@@ -6040,7 +6148,7 @@
 #endif
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_perm(vector bool long long __a, vector bool long long __b,
          vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
@@ -6055,8 +6163,8 @@
 #endif
 }
 
-static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
-                                           vector unsigned char __c) {
+static __inline__ vector double __ATTRS_o_ai
+vec_perm(vector double __a, vector double __b, vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
@@ -6072,92 +6180,86 @@
 
 /* vec_vperm */
 
-static vector signed char __ATTRS_o_ai vec_vperm(vector signed char __a,
-                                                 vector signed char __b,
-                                                 vector unsigned char __c) {
+static __inline__ vector signed char __ATTRS_o_ai vec_vperm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vperm(vector unsigned char __a,
-                                                   vector unsigned char __b,
-                                                   vector unsigned char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vperm(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector bool char __ATTRS_o_ai vec_vperm(vector bool char __a,
-                                               vector bool char __b,
-                                               vector unsigned char __c) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vperm(
+    vector bool char __a, vector bool char __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector short __ATTRS_o_ai vec_vperm(vector short __a, vector short __b,
-                                           vector unsigned char __c) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vperm(vector short __a, vector short __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vperm(vector unsigned short __a,
-                                                    vector unsigned short __b,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vperm(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_vperm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai
+vec_vperm(vector pixel __a, vector pixel __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vperm(vector int __a,
+                                                    vector int __b,
                                                     vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector bool short __ATTRS_o_ai vec_vperm(vector bool short __a,
-                                                vector bool short __b,
-                                                vector unsigned char __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vperm(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector pixel __ATTRS_o_ai vec_vperm(vector pixel __a, vector pixel __b,
-                                           vector unsigned char __c) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vperm(vector bool int __a, vector bool int __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector int __ATTRS_o_ai vec_vperm(vector int __a, vector int __b,
-                                         vector unsigned char __c) {
-  return vec_perm(__a, __b, __c);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_vperm(vector unsigned int __a,
-                                                  vector unsigned int __b,
-                                                  vector unsigned char __c) {
-  return vec_perm(__a, __b, __c);
-}
-
-static vector bool int __ATTRS_o_ai vec_vperm(vector bool int __a,
-                                              vector bool int __b,
-                                              vector unsigned char __c) {
-  return vec_perm(__a, __b, __c);
-}
-
-static vector float __ATTRS_o_ai vec_vperm(vector float __a, vector float __b,
-                                           vector unsigned char __c) {
+static __inline__ vector float __ATTRS_o_ai
+vec_vperm(vector float __a, vector float __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
 #ifdef __VSX__
-static vector long long __ATTRS_o_ai vec_vperm(vector long long __a,
-                                               vector long long __b,
-                                               vector unsigned char __c) {
+static __inline__ vector long long __ATTRS_o_ai vec_vperm(
+    vector long long __a, vector long long __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vperm(vector unsigned long long __a, vector unsigned long long __b,
           vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 
-static vector double __ATTRS_o_ai vec_vperm(vector double __a,
-                                            vector double __b,
-                                            vector unsigned char __c) {
+static __inline__ vector double __ATTRS_o_ai
+vec_vperm(vector double __a, vector double __b, vector unsigned char __c) {
   return vec_perm(__a, __b, __c);
 }
 #endif
 
 /* vec_re */
 
-static vector float __ATTRS_o_ai
-vec_re(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_re(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvresp(__a);
 #else
@@ -6166,56 +6268,57 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_re(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_re(vector double __a) {
   return __builtin_vsx_xvredp(__a);
 }
 #endif
 
 /* vec_vrefp */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vrefp(vector float __a) {
   return __builtin_altivec_vrefp(__a);
 }
 
 /* vec_rl */
 
-static vector signed char __ATTRS_o_ai vec_rl(vector signed char __a,
-                                              vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_rl(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_rl(vector unsigned char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_rl(vector short __a,
-                                        vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_rl(vector short __a,
+                                                   vector unsigned short __b) {
   return __builtin_altivec_vrlh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_rl(vector unsigned short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_rl(vector int __a, vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_rl(vector int __a,
+                                                 vector unsigned int __b) {
   return __builtin_altivec_vrlw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_rl(vector unsigned int __a,
-                                               vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_rl(vector signed long long __a, vector unsigned long long __b) {
   return __builtin_altivec_vrld(__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
   return __builtin_altivec_vrld(__a, __b);
 }
@@ -6223,43 +6326,43 @@
 
 /* vec_vrlb */
 
-static vector signed char __ATTRS_o_ai vec_vrlb(vector signed char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vrlb(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vrlb(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vrlb(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
 }
 
 /* vec_vrlh */
 
-static vector short __ATTRS_o_ai vec_vrlh(vector short __a,
-                                          vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vrlh(vector short __a, vector unsigned short __b) {
   return __builtin_altivec_vrlh(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vrlh(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vrlh(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
 }
 
 /* vec_vrlw */
 
-static vector int __ATTRS_o_ai vec_vrlw(vector int __a,
-                                        vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vrlw(vector int __a,
+                                                   vector unsigned int __b) {
   return __builtin_altivec_vrlw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vrlw(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vrlw(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
 }
 
 /* vec_round */
 
-static vector float __ATTRS_o_ai vec_round(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_round(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvrspi(__a);
 #else
@@ -6268,36 +6371,34 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_round(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
   return __builtin_vsx_xvrdpi(__a);
 }
 
 /* vec_rint */
 
-static vector float __ATTRS_o_ai
-vec_rint(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_rint(vector float __a) {
   return __builtin_vsx_xvrspic(__a);
 }
 
-static vector double __ATTRS_o_ai
-vec_rint(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_rint(vector double __a) {
   return __builtin_vsx_xvrdpic(__a);
 }
 
 /* vec_nearbyint */
 
-static vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
   return __builtin_vsx_xvrspi(__a);
 }
 
-static vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
   return __builtin_vsx_xvrdpi(__a);
 }
 #endif
 
 /* vec_vrfin */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vrfin(vector float __a) {
   return __builtin_altivec_vrfin(__a);
 }
@@ -6305,19 +6406,18 @@
 /* vec_sqrt */
 
 #ifdef __VSX__
-static vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
   return __builtin_vsx_xvsqrtsp(__a);
 }
 
-static vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
   return __builtin_vsx_xvsqrtdp(__a);
 }
 #endif
 
 /* vec_rsqrte */
 
-static vector float __ATTRS_o_ai
-vec_rsqrte(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_rsqrte(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvrsqrtesp(__a);
 #else
@@ -6326,14 +6426,14 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
   return __builtin_vsx_xvrsqrtedp(__a);
 }
 #endif
 
 /* vec_vrsqrtefp */
 
-static __vector float __attribute__((__always_inline__))
+static __inline__ __vector float __attribute__((__always_inline__))
 vec_vrsqrtefp(vector float __a) {
   return __builtin_altivec_vrsqrtefp(__a);
 }
@@ -6342,257 +6442,250 @@
 
 #define __builtin_altivec_vsel_4si vec_sel
 
-static vector signed char __ATTRS_o_ai vec_sel(vector signed char __a,
-                                               vector signed char __b,
-                                               vector unsigned char __c) {
+static __inline__ vector signed char __ATTRS_o_ai vec_sel(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
   return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
 }
 
-static vector signed char __ATTRS_o_ai vec_sel(vector signed char __a,
-                                               vector signed char __b,
-                                               vector bool char __c) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
   return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sel(vector unsigned char __a,
-                                                 vector unsigned char __b,
-                                                 vector unsigned char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sel(vector unsigned char __a,
-                                                 vector unsigned char __b,
-                                                 vector bool char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_sel(
+    vector unsigned char __a, vector unsigned char __b, vector bool char __c) {
   return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
 }
 
-static vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
-                                             vector bool char __b,
-                                             vector unsigned char __c) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
   return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
 }
 
-static vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
-                                             vector bool char __b,
-                                             vector bool char __c) {
+static __inline__ vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                                        vector bool char __b,
+                                                        vector bool char __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector short __ATTRS_o_ai vec_sel(vector short __a, vector short __b,
-                                         vector unsigned short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_sel(vector short __a,
+                                                    vector short __b,
+                                                    vector unsigned short __c) {
   return (__a & ~(vector short)__c) | (__b & (vector short)__c);
 }
 
-static vector short __ATTRS_o_ai vec_sel(vector short __a, vector short __b,
-                                         vector bool short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_sel(vector short __a,
+                                                    vector short __b,
+                                                    vector bool short __c) {
   return (__a & ~(vector short)__c) | (__b & (vector short)__c);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sel(vector unsigned short __a,
-                                                  vector unsigned short __b,
-                                                  vector unsigned short __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sel(vector unsigned short __a,
-                                                  vector unsigned short __b,
-                                                  vector bool short __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
   return (__a & ~(vector unsigned short)__c) |
          (__b & (vector unsigned short)__c);
 }
 
-static vector bool short __ATTRS_o_ai vec_sel(vector bool short __a,
-                                              vector bool short __b,
-                                              vector unsigned short __c) {
+static __inline__ vector bool short __ATTRS_o_ai vec_sel(
+    vector bool short __a, vector bool short __b, vector unsigned short __c) {
   return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
 }
 
-static vector bool short __ATTRS_o_ai vec_sel(vector bool short __a,
-                                              vector bool short __b,
-                                              vector bool short __c) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector int __ATTRS_o_ai vec_sel(vector int __a, vector int __b,
-                                       vector unsigned int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_sel(vector int __a,
+                                                  vector int __b,
+                                                  vector unsigned int __c) {
   return (__a & ~(vector int)__c) | (__b & (vector int)__c);
 }
 
-static vector int __ATTRS_o_ai vec_sel(vector int __a, vector int __b,
-                                       vector bool int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_sel(vector int __a,
+                                                  vector int __b,
+                                                  vector bool int __c) {
   return (__a & ~(vector int)__c) | (__b & (vector int)__c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sel(vector unsigned int __a,
-                                                vector unsigned int __b,
-                                                vector unsigned int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sel(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sel(vector unsigned int __a,
-                                                vector unsigned int __b,
-                                                vector bool int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
   return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
 }
 
-static vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
-                                            vector bool int __b,
-                                            vector unsigned int __c) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
   return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
 }
 
-static vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
-                                            vector bool int __b,
-                                            vector bool int __c) {
+static __inline__ vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                                       vector bool int __b,
+                                                       vector bool int __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector float __ATTRS_o_ai vec_sel(vector float __a, vector float __b,
-                                         vector unsigned int __c) {
+static __inline__ vector float __ATTRS_o_ai vec_sel(vector float __a,
+                                                    vector float __b,
+                                                    vector unsigned int __c) {
   vector int __res = ((vector int)__a & ~(vector int)__c) |
                      ((vector int)__b & (vector int)__c);
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_sel(vector float __a, vector float __b,
-                                         vector bool int __c) {
+static __inline__ vector float __ATTRS_o_ai vec_sel(vector float __a,
+                                                    vector float __b,
+                                                    vector bool int __c) {
   vector int __res = ((vector int)__a & ~(vector int)__c) |
                      ((vector int)__b & (vector int)__c);
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
-                                          vector bool long long __c) {
+static __inline__ vector double __ATTRS_o_ai
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
   vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
-                     ((vector long long)__b & (vector long long)__c);
+                           ((vector long long)__b & (vector long long)__c);
   return (vector double)__res;
 }
 
-static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
-                                          vector unsigned long long __c) {
+static __inline__ vector double __ATTRS_o_ai
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
   vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
-                     ((vector long long)__b & (vector long long)__c);
+                           ((vector long long)__b & (vector long long)__c);
   return (vector double)__res;
 }
 #endif
 
 /* vec_vsel */
 
-static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
-                                                vector signed char __b,
-                                                vector unsigned char __c) {
+static __inline__ vector signed char __ATTRS_o_ai vec_vsel(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
   return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
-                                                vector signed char __b,
-                                                vector bool char __c) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector bool char __c) {
   return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsel(vector unsigned char __a,
-                                                  vector unsigned char __b,
-                                                  vector unsigned char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsel(vector unsigned char __a,
-                                                  vector unsigned char __b,
-                                                  vector bool char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_vsel(
+    vector unsigned char __a, vector unsigned char __b, vector bool char __c) {
   return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
-                                              vector bool char __b,
-                                              vector unsigned char __c) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
   return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
-                                              vector bool char __b,
-                                              vector bool char __c) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                                         vector bool char __b,
+                                                         vector bool char __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector short __ATTRS_o_ai vec_vsel(vector short __a, vector short __b,
-                                          vector unsigned short __c) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector unsigned short __c) {
   return (__a & ~(vector short)__c) | (__b & (vector short)__c);
 }
 
-static vector short __ATTRS_o_ai vec_vsel(vector short __a, vector short __b,
-                                          vector bool short __c) {
+static __inline__ vector short __ATTRS_o_ai vec_vsel(vector short __a,
+                                                     vector short __b,
+                                                     vector bool short __c) {
   return (__a & ~(vector short)__c) | (__b & (vector short)__c);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsel(vector unsigned short __a,
-                                                   vector unsigned short __b,
-                                                   vector unsigned short __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsel(vector unsigned short __a,
-                                                   vector unsigned short __b,
-                                                   vector bool short __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b,
+         vector bool short __c) {
   return (__a & ~(vector unsigned short)__c) |
          (__b & (vector unsigned short)__c);
 }
 
-static vector bool short __ATTRS_o_ai vec_vsel(vector bool short __a,
-                                               vector bool short __b,
-                                               vector unsigned short __c) {
+static __inline__ vector bool short __ATTRS_o_ai vec_vsel(
+    vector bool short __a, vector bool short __b, vector unsigned short __c) {
   return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
 }
 
-static vector bool short __ATTRS_o_ai vec_vsel(vector bool short __a,
-                                               vector bool short __b,
-                                               vector bool short __c) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector bool short __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector int __ATTRS_o_ai vec_vsel(vector int __a, vector int __b,
-                                        vector unsigned int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_vsel(vector int __a,
+                                                   vector int __b,
+                                                   vector unsigned int __c) {
   return (__a & ~(vector int)__c) | (__b & (vector int)__c);
 }
 
-static vector int __ATTRS_o_ai vec_vsel(vector int __a, vector int __b,
-                                        vector bool int __c) {
+static __inline__ vector int __ATTRS_o_ai vec_vsel(vector int __a,
+                                                   vector int __b,
+                                                   vector bool int __c) {
   return (__a & ~(vector int)__c) | (__b & (vector int)__c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsel(vector unsigned int __a,
-                                                 vector unsigned int __b,
-                                                 vector unsigned int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsel(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsel(vector unsigned int __a,
-                                                 vector unsigned int __b,
-                                                 vector bool int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsel(
+    vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
   return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
-                                             vector bool int __b,
-                                             vector unsigned int __c) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
   return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
-                                             vector bool int __b,
-                                             vector bool int __c) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                                        vector bool int __b,
+                                                        vector bool int __c) {
   return (__a & ~__c) | (__b & __c);
 }
 
-static vector float __ATTRS_o_ai vec_vsel(vector float __a, vector float __b,
-                                          vector unsigned int __c) {
+static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned int __c) {
   vector int __res = ((vector int)__a & ~(vector int)__c) |
                      ((vector int)__b & (vector int)__c);
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vsel(vector float __a, vector float __b,
-                                          vector bool int __c) {
+static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
+                                                     vector float __b,
+                                                     vector bool int __c) {
   vector int __res = ((vector int)__a & ~(vector int)__c) |
                      ((vector int)__b & (vector int)__c);
   return (vector float)__res;
@@ -6600,42 +6693,43 @@
 
 /* vec_sl */
 
-static vector signed char __ATTRS_o_ai vec_sl(vector signed char __a,
-                                              vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b) {
   return __a << (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sl(vector unsigned char __a, vector unsigned char __b) {
   return __a << __b;
 }
 
-static vector short __ATTRS_o_ai vec_sl(vector short __a,
-                                        vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                                   vector unsigned short __b) {
   return __a << (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sl(vector unsigned short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sl(vector unsigned short __a, vector unsigned short __b) {
   return __a << __b;
 }
 
-static vector int __ATTRS_o_ai vec_sl(vector int __a, vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
+                                                 vector unsigned int __b) {
   return __a << (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sl(vector unsigned int __a,
-                                               vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sl(vector unsigned int __a, vector unsigned int __b) {
   return __a << __b;
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_sl(vector signed long long __a, vector unsigned long long __b) {
   return __a << (vector long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
   return __a << __b;
 }
@@ -6645,13 +6739,13 @@
 
 #define __builtin_altivec_vslb vec_vslb
 
-static vector signed char __ATTRS_o_ai vec_vslb(vector signed char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslb(vector signed char __a, vector unsigned char __b) {
   return vec_sl(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vslb(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslb(vector unsigned char __a, vector unsigned char __b) {
   return vec_sl(__a, __b);
 }
 
@@ -6659,13 +6753,13 @@
 
 #define __builtin_altivec_vslh vec_vslh
 
-static vector short __ATTRS_o_ai vec_vslh(vector short __a,
-                                          vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vslh(vector short __a, vector unsigned short __b) {
   return vec_sl(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vslh(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslh(vector unsigned short __a, vector unsigned short __b) {
   return vec_sl(__a, __b);
 }
 
@@ -6673,13 +6767,13 @@
 
 #define __builtin_altivec_vslw vec_vslw
 
-static vector int __ATTRS_o_ai vec_vslw(vector int __a,
-                                        vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vslw(vector int __a,
+                                                   vector unsigned int __b) {
   return vec_sl(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vslw(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslw(vector unsigned int __a, vector unsigned int __b) {
   return vec_sl(__a, __b);
 }
 
@@ -6687,17 +6781,15 @@
 
 #define __builtin_altivec_vsldoi_4si vec_sld
 
-static vector signed char __ATTRS_o_ai vec_sld(vector signed char __a,
-                                               vector signed char __b,
-                                               unsigned const int __c) {
+static __inline__ vector signed char __ATTRS_o_ai vec_sld(
+    vector signed char __a, vector signed char __b, unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6707,17 +6799,16 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sld(vector unsigned char __a,
-                                                 vector unsigned char __b,
-                                                 unsigned const int __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sld(vector unsigned char __a, vector unsigned char __b,
+        unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6727,17 +6818,15 @@
 #endif
 }
 
-static vector bool char __ATTRS_o_ai vec_sld(vector bool char __a,
-                                             vector bool char __b,
-                                             unsigned const int __c) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sld(vector bool char __a, vector bool char __b, unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6747,17 +6836,15 @@
 #endif
 }
 
-static vector signed short __ATTRS_o_ai vec_sld(vector signed short __a,
-                                                vector signed short __b,
-                                                unsigned const int __c) {
+static __inline__ vector signed short __ATTRS_o_ai vec_sld(
+    vector signed short __a, vector signed short __b, unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6767,17 +6854,16 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sld(vector unsigned short __a,
-                                                  vector unsigned short __b,
-                                                  unsigned const int __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sld(vector unsigned short __a, vector unsigned short __b,
+        unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6787,17 +6873,15 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_sld(vector bool short __a,
-                                              vector bool short __b,
-                                              unsigned const int __c) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sld(vector bool short __a, vector bool short __b, unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6807,16 +6891,16 @@
 #endif
 }
 
-static vector pixel __ATTRS_o_ai vec_sld(vector pixel __a, vector pixel __b,
-                                         unsigned const int __c) {
+static __inline__ vector pixel __ATTRS_o_ai vec_sld(vector pixel __a,
+                                                    vector pixel __b,
+                                                    unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6826,17 +6910,15 @@
 #endif
 }
 
-static vector signed int __ATTRS_o_ai vec_sld(vector signed int __a,
-                                              vector signed int __b,
-                                              unsigned const int __c) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int __a, vector signed int __b, unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6846,17 +6928,15 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sld(vector unsigned int __a,
-                                                vector unsigned int __b,
-                                                unsigned const int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sld(
+    vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6866,17 +6946,16 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
-                                            vector bool int __b,
-                                            unsigned const int __c) {
+static __inline__ vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
+                                                       vector bool int __b,
+                                                       unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6886,16 +6965,16 @@
 #endif
 }
 
-static vector float __ATTRS_o_ai vec_sld(vector float __a, vector float __b,
-                                         unsigned const int __c) {
+static __inline__ vector float __ATTRS_o_ai vec_sld(vector float __a,
+                                                    vector float __b,
+                                                    unsigned const int __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6907,17 +6986,15 @@
 
 /* vec_vsldoi */
 
-static vector signed char __ATTRS_o_ai vec_vsldoi(vector signed char __a,
-                                                  vector signed char __b,
-                                                  unsigned char __c) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsldoi(vector signed char __a, vector signed char __b, unsigned char __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6927,17 +7004,15 @@
 #endif
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsldoi(vector unsigned char __a,
-                                                    vector unsigned char __b,
-                                                    unsigned char __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai vec_vsldoi(
+    vector unsigned char __a, vector unsigned char __b, unsigned char __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6947,16 +7022,16 @@
 #endif
 }
 
-static vector short __ATTRS_o_ai vec_vsldoi(vector short __a, vector short __b,
-                                            unsigned char __c) {
+static __inline__ vector short __ATTRS_o_ai vec_vsldoi(vector short __a,
+                                                       vector short __b,
+                                                       unsigned char __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6966,17 +7041,53 @@
 #endif
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsldoi(vector unsigned short __a,
-                                                     vector unsigned short __b,
+static __inline__ vector unsigned short __ATTRS_o_ai vec_vsldoi(
+    vector unsigned short __a, vector unsigned short __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a,
+                                                       vector pixel __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsldoi(vector int __a,
+                                                     vector int __b,
                                                      unsigned char __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -6986,16 +7097,15 @@
 #endif
 }
 
-static vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a, vector pixel __b,
-                                            unsigned char __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsldoi(
+    vector unsigned int __a, vector unsigned int __b, unsigned char __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -7005,55 +7115,16 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_vsldoi(vector int __a, vector int __b,
-                                          unsigned char __c) {
+static __inline__ vector float __ATTRS_o_ai vec_vsldoi(vector float __a,
+                                                       vector float __b,
+                                                       unsigned char __c) {
   unsigned char __d = __c & 0x0F;
 #ifdef __LITTLE_ENDIAN__
   return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
-#else
-  return vec_perm(
-      __a, __b,
-      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
-                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
-                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
-#endif
-}
-
-static vector unsigned int __ATTRS_o_ai vec_vsldoi(vector unsigned int __a,
-                                                   vector unsigned int __b,
-                                                   unsigned char __c) {
-  unsigned char __d = __c & 0x0F;
-#ifdef __LITTLE_ENDIAN__
-  return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
-#else
-  return vec_perm(
-      __a, __b,
-      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
-                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
-                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
-#endif
-}
-
-static vector float __ATTRS_o_ai vec_vsldoi(vector float __a, vector float __b,
-                                            unsigned char __c) {
-  unsigned char __d = __c & 0x0F;
-#ifdef __LITTLE_ENDIAN__
-  return vec_perm(
-      __b, __a,
-      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
-                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
-                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
-                             31 - __d));
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
 #else
   return vec_perm(
       __a, __b,
@@ -7065,654 +7136,655 @@
 
 /* vec_sll */
 
-static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsl((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
-                                               vector unsigned short __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_altivec_vsl((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
-                                               vector unsigned int __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_altivec_vsl((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
   return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
   return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
-                                             vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vsl((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
-                                             vector unsigned short __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_altivec_vsl((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
-                                             vector unsigned int __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_altivec_vsl((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_sll(vector short __a,
-                                         vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned char __b) {
   return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_sll(vector short __a,
-                                         vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned short __b) {
   return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_sll(vector short __a,
-                                         vector unsigned int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned int __b) {
   return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
                                                   vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
                                                   vector unsigned short __b) {
-  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
                                                   vector unsigned int __b) {
-  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
-                                                      (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
-                                              vector unsigned char __b) {
-  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
-                                              vector unsigned short __b) {
-  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
-                                              vector unsigned int __b) {
-  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
-                                         vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
-                                         vector unsigned short __b) {
-  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
-                                         vector unsigned int __b) {
-  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_sll(vector int __a,
-                                       vector unsigned char __b) {
   return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_sll(vector int __a,
-                                       vector unsigned short __b) {
-  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_sll(vector int __a,
-                                       vector unsigned int __b) {
-  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
-                                                vector unsigned short __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
-                                            vector unsigned char __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_altivec_vsl((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
-                                            vector unsigned short __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_altivec_vsl((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
-                                            vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vsl((vector int)__a,
                                                 (vector int)__b);
 }
 
 /* vec_vsl */
 
-static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsl((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
-                                               vector unsigned short __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_altivec_vsl((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
-                                               vector unsigned int __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_altivec_vsl((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned short __b) {
   return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned int __b) {
   return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
-                                             vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vsl((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
-                                             vector unsigned short __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_altivec_vsl((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
-                                             vector unsigned int __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_altivec_vsl((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsl(vector short __a,
-                                         vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned char __b) {
   return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsl(vector short __a,
-                                         vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned short __b) {
   return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsl(vector short __a,
-                                         vector unsigned int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned int __b) {
   return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
                                                   vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
                                                   vector unsigned short __b) {
-  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
                                                   vector unsigned int __b) {
-  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
-                                                      (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
-                                              vector unsigned char __b) {
-  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
-                                              vector unsigned short __b) {
-  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
-                                              vector unsigned int __b) {
-  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
-                                         vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
-                                         vector unsigned short __b) {
-  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
-                                         vector unsigned int __b) {
-  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_vsl(vector int __a,
-                                       vector unsigned char __b) {
   return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_vsl(vector int __a,
-                                       vector unsigned short __b) {
-  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_vsl(vector int __a,
-                                       vector unsigned int __b) {
-  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
-                                                vector unsigned short __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
-                                            vector unsigned char __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_altivec_vsl((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
-                                            vector unsigned short __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_altivec_vsl((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
-                                            vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vsl((vector int)__a,
                                                 (vector int)__b);
 }
 
 /* vec_slo */
 
-static vector signed char __ATTRS_o_ai vec_slo(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector signed char __b) {
   return (vector signed char)__builtin_altivec_vslo((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_slo(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vslo((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_slo(vector unsigned char __a,
-                                                 vector signed char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector signed char __b) {
   return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_slo(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_slo(vector short __a,
-                                         vector signed char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                                    vector signed char __b) {
   return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_slo(vector short __a,
-                                         vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                                    vector unsigned char __b) {
   return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_slo(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                                    vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_slo(vector int __a,
                                                   vector signed char __b) {
-  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
-                                                       (vector int)__b);
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_slo(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_slo(vector int __a,
                                                   vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
-                                                       (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
-                                         vector signed char __b) {
-  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
-                                         vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_slo(vector int __a, vector signed char __b) {
   return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_slo(vector int __a,
-                                       vector unsigned char __b) {
-  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_slo(vector unsigned int __a,
-                                                vector signed char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector signed char __b) {
   return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_slo(vector unsigned int __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_slo(vector float __a,
-                                         vector signed char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                                    vector signed char __b) {
   return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_slo(vector float __a,
-                                         vector unsigned char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                                    vector unsigned char __b) {
   return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
 /* vec_vslo */
 
-static vector signed char __ATTRS_o_ai vec_vslo(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector signed char __b) {
   return (vector signed char)__builtin_altivec_vslo((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vslo(vector signed char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vslo((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vslo(vector unsigned char __a,
-                                                  vector signed char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector signed char __b) {
   return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vslo(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vslo(vector short __a,
-                                          vector signed char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                                     vector signed char __b) {
   return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vslo(vector short __a,
-                                          vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                                     vector unsigned char __b) {
   return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vslo(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                                     vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                                     vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vslo(vector int __a,
                                                    vector signed char __b) {
-  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
-                                                       (vector int)__b);
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vslo(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_vslo(vector int __a,
                                                    vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
-                                                       (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
-                                          vector signed char __b) {
-  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
-                                          vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_vslo(vector int __a,
-                                        vector signed char __b) {
   return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_vslo(vector int __a,
-                                        vector unsigned char __b) {
-  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_vslo(vector unsigned int __a,
-                                                 vector signed char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector signed char __b) {
   return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vslo(vector unsigned int __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_vslo(vector float __a,
-                                          vector signed char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                                     vector signed char __b) {
   return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_vslo(vector float __a,
-                                          vector unsigned char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                                     vector unsigned char __b) {
   return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
 /* vec_splat */
 
-static vector signed char __ATTRS_o_ai vec_splat(vector signed char __a,
-                                                 unsigned const int __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_splat(vector signed char __a, unsigned const int __b) {
   return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_splat(vector unsigned char __a,
-                                                   unsigned const int __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splat(vector unsigned char __a, unsigned const int __b) {
   return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
 }
 
-static vector bool char __ATTRS_o_ai vec_splat(vector bool char __a,
-                                               unsigned const int __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_splat(vector bool char __a, unsigned const int __b) {
   return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
 }
 
-static vector signed short __ATTRS_o_ai vec_splat(vector signed short __a,
-                                                  unsigned const int __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_splat(vector signed short __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x07) * 2;
   unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
-                                         b0, b1, b0, b1, b0, b1, b0, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
 }
 
-static vector unsigned short __ATTRS_o_ai vec_splat(vector unsigned short __a,
-                                                    unsigned const int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splat(vector unsigned short __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x07) * 2;
   unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
-                                         b0, b1, b0, b1, b0, b1, b0, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
 }
 
-static vector bool short __ATTRS_o_ai vec_splat(vector bool short __a,
-                                                unsigned const int __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_splat(vector bool short __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x07) * 2;
   unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
-                                         b0, b1, b0, b1, b0, b1, b0, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
 }
 
-static vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
-                                           unsigned const int __b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
+                                                      unsigned const int __b) {
   unsigned char b0 = (__b & 0x07) * 2;
   unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
-                                         b0, b1, b0, b1, b0, b1, b0, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
 }
 
-static vector signed int __ATTRS_o_ai vec_splat(vector signed int __a,
-                                                unsigned const int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_splat(vector signed int __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x03) * 4;
   unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
-                                         b1, b2, b3, b0, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_splat(vector unsigned int __a,
-                                                  unsigned const int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splat(vector unsigned int __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x03) * 4;
   unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
-                                         b1, b2, b3, b0, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
 }
 
-static vector bool int __ATTRS_o_ai vec_splat(vector bool int __a,
-                                              unsigned const int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_splat(vector bool int __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x03) * 4;
   unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
-                                         b1, b2, b3, b0, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
 }
 
-static vector float __ATTRS_o_ai vec_splat(vector float __a,
-                                           unsigned const int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_splat(vector float __a,
+                                                      unsigned const int __b) {
   unsigned char b0 = (__b & 0x03) * 4;
   unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
-                                         b1, b2, b3, b0, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_splat(vector double __a,
-                                            unsigned const int __b) {
+static __inline__ vector double __ATTRS_o_ai vec_splat(vector double __a,
+                                                       unsigned const int __b) {
   unsigned char b0 = (__b & 0x01) * 8;
-  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
-                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
-                                         b0, b1, b2, b3, b4, b5, b6, b7));
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
 }
-static vector bool long long __ATTRS_o_ai vec_splat(vector bool long long __a,
-                                                    unsigned const int __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_splat(vector bool long long __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x01) * 8;
-  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
-                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
-                                         b0, b1, b2, b3, b4, b5, b6, b7));
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
 }
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_splat(vector signed long long __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x01) * 8;
-  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
-                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
-                                         b0, b1, b2, b3, b4, b5, b6, b7));
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
 }
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_splat(vector unsigned long long __a, unsigned const int __b) {
   unsigned char b0 = (__b & 0x01) * 8;
-  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
-                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
-                                         b0, b1, b2, b3, b4, b5, b6, b7));
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
 }
 #endif
 
@@ -7720,18 +7792,18 @@
 
 #define __builtin_altivec_vspltb vec_vspltb
 
-static vector signed char __ATTRS_o_ai vec_vspltb(vector signed char __a,
-                                                  unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vspltb(vector signed char __a, unsigned char __b) {
   return vec_perm(__a, __a, (vector unsigned char)(__b));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vspltb(vector unsigned char __a,
-                                                    unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vspltb(vector unsigned char __a, unsigned char __b) {
   return vec_perm(__a, __a, (vector unsigned char)(__b));
 }
 
-static vector bool char __ATTRS_o_ai vec_vspltb(vector bool char __a,
-                                                unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vspltb(vector bool char __a,
+                                                           unsigned char __b) {
   return vec_perm(__a, __a, (vector unsigned char)(__b));
 }
 
@@ -7739,8 +7811,8 @@
 
 #define __builtin_altivec_vsplth vec_vsplth
 
-static vector short __ATTRS_o_ai vec_vsplth(vector short __a,
-                                            unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsplth(vector short __a,
+                                                       unsigned char __b) {
   __b *= 2;
   unsigned char b1 = __b + 1;
   return vec_perm(__a, __a,
@@ -7748,8 +7820,8 @@
                                          __b, b1, __b, b1, __b, b1, __b, b1));
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsplth(vector unsigned short __a,
-                                                     unsigned char __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsplth(vector unsigned short __a, unsigned char __b) {
   __b *= 2;
   unsigned char b1 = __b + 1;
   return vec_perm(__a, __a,
@@ -7757,8 +7829,8 @@
                                          __b, b1, __b, b1, __b, b1, __b, b1));
 }
 
-static vector bool short __ATTRS_o_ai vec_vsplth(vector bool short __a,
-                                                 unsigned char __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsplth(vector bool short __a, unsigned char __b) {
   __b *= 2;
   unsigned char b1 = __b + 1;
   return vec_perm(__a, __a,
@@ -7766,8 +7838,8 @@
                                          __b, b1, __b, b1, __b, b1, __b, b1));
 }
 
-static vector pixel __ATTRS_o_ai vec_vsplth(vector pixel __a,
-                                            unsigned char __b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_vsplth(vector pixel __a,
+                                                       unsigned char __b) {
   __b *= 2;
   unsigned char b1 = __b + 1;
   return vec_perm(__a, __a,
@@ -7779,7 +7851,8 @@
 
 #define __builtin_altivec_vspltw vec_vspltw
 
-static vector int __ATTRS_o_ai vec_vspltw(vector int __a, unsigned char __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vspltw(vector int __a,
+                                                     unsigned char __b) {
   __b *= 4;
   unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
   return vec_perm(__a, __a,
@@ -7787,8 +7860,8 @@
                                          b1, b2, b3, __b, b1, b2, b3));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vspltw(vector unsigned int __a,
-                                                   unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vspltw(vector unsigned int __a, unsigned char __b) {
   __b *= 4;
   unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
   return vec_perm(__a, __a,
@@ -7796,8 +7869,8 @@
                                          b1, b2, b3, __b, b1, b2, b3));
 }
 
-static vector bool int __ATTRS_o_ai vec_vspltw(vector bool int __a,
-                                               unsigned char __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vspltw(vector bool int __a,
+                                                          unsigned char __b) {
   __b *= 4;
   unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
   return vec_perm(__a, __a,
@@ -7805,8 +7878,8 @@
                                          b1, b2, b3, __b, b1, b2, b3));
 }
 
-static vector float __ATTRS_o_ai vec_vspltw(vector float __a,
-                                            unsigned char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vspltw(vector float __a,
+                                                       unsigned char __b) {
   __b *= 4;
   unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
   return vec_perm(__a, __a,
@@ -7819,14 +7892,16 @@
 #define __builtin_altivec_vspltisb vec_splat_s8
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector signed char __ATTRS_o_ai vec_splat_s8(signed char __a) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_splat_s8(signed char __a) {
   return (vector signed char)(__a);
 }
 
 /* vec_vspltisb */
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector signed char __ATTRS_o_ai vec_vspltisb(signed char __a) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vspltisb(signed char __a) {
   return (vector signed char)(__a);
 }
 
@@ -7835,14 +7910,14 @@
 #define __builtin_altivec_vspltish vec_splat_s16
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector short __ATTRS_o_ai vec_splat_s16(signed char __a) {
+static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a) {
   return (vector short)(__a);
 }
 
 /* vec_vspltish */
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector short __ATTRS_o_ai vec_vspltish(signed char __a) {
+static __inline__ vector short __ATTRS_o_ai vec_vspltish(signed char __a) {
   return (vector short)(__a);
 }
 
@@ -7851,81 +7926,84 @@
 #define __builtin_altivec_vspltisw vec_splat_s32
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector int __ATTRS_o_ai vec_splat_s32(signed char __a) {
+static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a) {
   return (vector int)(__a);
 }
 
 /* vec_vspltisw */
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector int __ATTRS_o_ai vec_vspltisw(signed char __a) {
+static __inline__ vector int __ATTRS_o_ai vec_vspltisw(signed char __a) {
   return (vector int)(__a);
 }
 
 /* vec_splat_u8 */
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector unsigned char __ATTRS_o_ai vec_splat_u8(unsigned char __a) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splat_u8(unsigned char __a) {
   return (vector unsigned char)(__a);
 }
 
 /* vec_splat_u16 */
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector unsigned short __ATTRS_o_ai vec_splat_u16(signed char __a) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splat_u16(signed char __a) {
   return (vector unsigned short)(__a);
 }
 
 /* vec_splat_u32 */
 
 // FIXME: parameter should be treated as 5-bit signed literal
-static vector unsigned int __ATTRS_o_ai vec_splat_u32(signed char __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splat_u32(signed char __a) {
   return (vector unsigned int)(__a);
 }
 
 /* vec_sr */
 
-static vector signed char __ATTRS_o_ai vec_sr(vector signed char __a,
-                                              vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sr(vector signed char __a, vector unsigned char __b) {
   vector unsigned char __res = (vector unsigned char)__a >> __b;
   return (vector signed char)__res;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sr(vector unsigned char __a, vector unsigned char __b) {
   return __a >> __b;
 }
 
-static vector signed short __ATTRS_o_ai vec_sr(vector signed short __a,
-                                        vector unsigned short __b) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_sr(vector signed short __a, vector unsigned short __b) {
   vector unsigned short __res = (vector unsigned short)__a >> __b;
   return (vector signed short)__res;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sr(vector unsigned short __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sr(vector unsigned short __a, vector unsigned short __b) {
   return __a >> __b;
 }
 
-static vector signed int __ATTRS_o_ai vec_sr(vector signed int __a,
-                                             vector unsigned int __b) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sr(vector signed int __a, vector unsigned int __b) {
   vector unsigned int __res = (vector unsigned int)__a >> __b;
   return (vector signed int)__res;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sr(vector unsigned int __a,
-                                               vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sr(vector unsigned int __a, vector unsigned int __b) {
   return __a >> __b;
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_sr(vector signed long long __a, vector unsigned long long __b) {
   vector unsigned long long __res = (vector unsigned long long)__a >> __b;
   return (vector signed long long)__res;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
   return __a >> __b;
 }
@@ -7935,13 +8013,13 @@
 
 #define __builtin_altivec_vsrb vec_vsrb
 
-static vector signed char __ATTRS_o_ai vec_vsrb(vector signed char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsrb(vector signed char __a, vector unsigned char __b) {
   return __a >> (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsrb(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsrb(vector unsigned char __a, vector unsigned char __b) {
   return __a >> __b;
 }
 
@@ -7949,13 +8027,13 @@
 
 #define __builtin_altivec_vsrh vec_vsrh
 
-static vector short __ATTRS_o_ai vec_vsrh(vector short __a,
-                                          vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vsrh(vector short __a, vector unsigned short __b) {
   return __a >> (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsrh(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsrh(vector unsigned short __a, vector unsigned short __b) {
   return __a >> __b;
 }
 
@@ -7963,55 +8041,55 @@
 
 #define __builtin_altivec_vsrw vec_vsrw
 
-static vector int __ATTRS_o_ai vec_vsrw(vector int __a,
-                                        vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsrw(vector int __a,
+                                                   vector unsigned int __b) {
   return __a >> (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsrw(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsrw(vector unsigned int __a, vector unsigned int __b) {
   return __a >> __b;
 }
 
 /* vec_sra */
 
-static vector signed char __ATTRS_o_ai vec_sra(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sra(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sra(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sra(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_sra(vector short __a,
-                                         vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sra(vector short __a,
+                                                    vector unsigned short __b) {
   return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sra(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sra(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_sra(vector int __a,
-                                       vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sra(vector int __a,
+                                                  vector unsigned int __b) {
   return __builtin_altivec_vsraw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sra(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sra(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
 }
 
 #ifdef __POWER8_VECTOR__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_sra(vector signed long long __a, vector unsigned long long __b) {
   return __a >> __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)((vector signed long long)__a >> __b);
 }
@@ -8019,1324 +8097,1373 @@
 
 /* vec_vsrab */
 
-static vector signed char __ATTRS_o_ai vec_vsrab(vector signed char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsrab(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsrab(vector unsigned char __a,
-                                                   vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsrab(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
 }
 
 /* vec_vsrah */
 
-static vector short __ATTRS_o_ai vec_vsrah(vector short __a,
-                                           vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vsrah(vector short __a, vector unsigned short __b) {
   return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsrah(vector unsigned short __a,
-                                                    vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsrah(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
 }
 
 /* vec_vsraw */
 
-static vector int __ATTRS_o_ai vec_vsraw(vector int __a,
-                                         vector unsigned int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsraw(vector int __a,
+                                                    vector unsigned int __b) {
   return __builtin_altivec_vsraw(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsraw(vector unsigned int __a,
-                                                  vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsraw(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
 }
 
 /* vec_srl */
 
-static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsr((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
-                                               vector unsigned short __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_altivec_vsr((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
-                                               vector unsigned int __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_altivec_vsr((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
   return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
   return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
-                                             vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vsr((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
-                                             vector unsigned short __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_altivec_vsr((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
-                                             vector unsigned int __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_altivec_vsr((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_srl(vector short __a,
-                                         vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned char __b) {
   return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_srl(vector short __a,
-                                         vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned short __b) {
   return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_srl(vector short __a,
-                                         vector unsigned int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned int __b) {
   return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
                                                   vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
                                                   vector unsigned short __b) {
-  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
                                                   vector unsigned int __b) {
-  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
-                                                      (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
-                                              vector unsigned char __b) {
-  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
-                                              vector unsigned short __b) {
-  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
-                                              vector unsigned int __b) {
-  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
-                                         vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
-                                         vector unsigned short __b) {
-  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
-                                         vector unsigned int __b) {
-  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_srl(vector int __a,
-                                       vector unsigned char __b) {
   return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_srl(vector int __a,
-                                       vector unsigned short __b) {
-  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_srl(vector int __a,
-                                       vector unsigned int __b) {
-  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
-                                                vector unsigned short __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
-                                            vector unsigned char __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_altivec_vsr((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
-                                            vector unsigned short __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_altivec_vsr((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
-                                            vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vsr((vector int)__a,
                                                 (vector int)__b);
 }
 
 /* vec_vsr */
 
-static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsr((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
-                                               vector unsigned short __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_altivec_vsr((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
-                                               vector unsigned int __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_altivec_vsr((vector int)__a,
                                                    (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
-                                                 vector unsigned short __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned short __b) {
   return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned int __b) {
   return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
-                                             vector unsigned char __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_altivec_vsr((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
-                                             vector unsigned short __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_altivec_vsr((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
-                                             vector unsigned int __b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_altivec_vsr((vector int)__a,
                                                  (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsr(vector short __a,
-                                         vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned char __b) {
   return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsr(vector short __a,
-                                         vector unsigned short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned short __b) {
   return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsr(vector short __a,
-                                         vector unsigned int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned int __b) {
   return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
                                                   vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
                                                   vector unsigned short __b) {
-  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
-                                                      (vector int)__b);
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
                                                   vector unsigned int __b) {
-  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
-                                                      (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
-                                              vector unsigned char __b) {
-  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
-                                              vector unsigned short __b) {
-  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
-                                              vector unsigned int __b) {
-  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
-                                                  (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
-                                         vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
-                                         vector unsigned short __b) {
-  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
-                                         vector unsigned int __b) {
-  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_vsr(vector int __a,
-                                       vector unsigned char __b) {
   return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_vsr(vector int __a,
-                                       vector unsigned short __b) {
-  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_vsr(vector int __a,
-                                       vector unsigned int __b) {
-  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
-                                                vector unsigned short __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
-                                            vector unsigned char __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_altivec_vsr((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
-                                            vector unsigned short __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_altivec_vsr((vector int)__a,
                                                 (vector int)__b);
 }
 
-static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
-                                            vector unsigned int __b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_altivec_vsr((vector int)__a,
                                                 (vector int)__b);
 }
 
 /* vec_sro */
 
-static vector signed char __ATTRS_o_ai vec_sro(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector signed char __b) {
   return (vector signed char)__builtin_altivec_vsro((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_sro(vector signed char __a,
-                                               vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsro((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sro(vector unsigned char __a,
-                                                 vector signed char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector signed char __b) {
   return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sro(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_sro(vector short __a,
-                                         vector signed char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                                    vector signed char __b) {
   return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_sro(vector short __a,
-                                         vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                                    vector unsigned char __b) {
   return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sro(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                                    vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sro(vector int __a,
                                                   vector signed char __b) {
-  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
-                                                       (vector int)__b);
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sro(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_sro(vector int __a,
                                                   vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
-                                                       (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
-                                         vector signed char __b) {
-  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
-                                         vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_sro(vector int __a, vector signed char __b) {
   return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_sro(vector int __a,
-                                       vector unsigned char __b) {
-  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_sro(vector unsigned int __a,
-                                                vector signed char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector signed char __b) {
   return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sro(vector unsigned int __a,
-                                                vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_sro(vector float __a,
-                                         vector signed char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                                    vector signed char __b) {
   return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_sro(vector float __a,
-                                         vector unsigned char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                                    vector unsigned char __b) {
   return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
 /* vec_vsro */
 
-static vector signed char __ATTRS_o_ai vec_vsro(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector signed char __b) {
   return (vector signed char)__builtin_altivec_vsro((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsro(vector signed char __a,
-                                                vector unsigned char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector unsigned char __b) {
   return (vector signed char)__builtin_altivec_vsro((vector int)__a,
                                                     (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsro(vector unsigned char __a,
-                                                  vector signed char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector signed char __b) {
   return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsro(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector unsigned char __b) {
   return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
                                                       (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsro(vector short __a,
-                                          vector signed char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                                     vector signed char __b) {
   return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
-static vector short __ATTRS_o_ai vec_vsro(vector short __a,
-                                          vector unsigned char __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                                     vector unsigned char __b) {
   return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsro(vector unsigned short __a,
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                                     vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                                     vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsro(vector int __a,
                                                    vector signed char __b) {
-  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
-                                                       (vector int)__b);
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsro(vector unsigned short __a,
+static __inline__ vector int __ATTRS_o_ai vec_vsro(vector int __a,
                                                    vector unsigned char __b) {
-  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
-                                                       (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
-                                          vector signed char __b) {
-  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
-}
-
-static vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
-                                          vector unsigned char __b) {
-  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
-}
-
-static vector int __ATTRS_o_ai vec_vsro(vector int __a,
-                                        vector signed char __b) {
   return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
 }
 
-static vector int __ATTRS_o_ai vec_vsro(vector int __a,
-                                        vector unsigned char __b) {
-  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
-}
-
-static vector unsigned int __ATTRS_o_ai vec_vsro(vector unsigned int __a,
-                                                 vector signed char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector signed char __b) {
   return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsro(vector unsigned int __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector unsigned char __b) {
   return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
                                                      (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_vsro(vector float __a,
-                                          vector signed char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                                     vector signed char __b) {
   return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
-static vector float __ATTRS_o_ai vec_vsro(vector float __a,
-                                          vector unsigned char __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                                     vector unsigned char __b) {
   return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
 /* vec_st */
 
-static void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
-                                vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                           vector signed char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
-                                signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                           signed char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
-                                vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                           vector unsigned char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
-                                unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                           unsigned char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
-                                signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           signed char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
-                                unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           unsigned char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
-                                vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           vector bool char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector short __a, int __b, vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector short __a, int __b,
+                                           vector short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector short __a, int __b,
+                                           short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
-                                vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                           vector unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
-                                unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                           unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
-                                unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
-                                vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           vector bool short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector pixel __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
-                                unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector pixel __a, int __b, vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           vector pixel *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector int __a, int __b,
+                                           vector int *__c) {
   __builtin_altivec_stvx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector int __a, int __b, int *__c) {
   __builtin_altivec_stvx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
-                                vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                           vector unsigned int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
-                                unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                           unsigned int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
-                                unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           unsigned int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
-                                vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           vector bool int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector float __a, int __b, vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector float __a, int __b,
+                                           vector float *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_st(vector float __a, int __b, float *__c) {
+static __inline__ void __ATTRS_o_ai vec_st(vector float __a, int __b,
+                                           float *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
 /* vec_stvx */
 
-static void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
-                                  vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                             vector signed char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
-                                  signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                             signed char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
-                                  vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                             vector unsigned char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
-                                  unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                             unsigned char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
-                                  signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             signed char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
-                                  unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             unsigned char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
-                                  vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             vector bool char *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
-                                  vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                             vector short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                             short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
-                                  vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                             vector unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
-                                  unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                             unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
-                                  unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
-                                  vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             vector bool short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
-                                  unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             unsigned short *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
-                                  vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             vector pixel *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector int __a, int __b,
+                                             vector int *__c) {
   __builtin_altivec_stvx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector int __a, int __b,
+                                             int *__c) {
   __builtin_altivec_stvx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
-                                  vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                             vector unsigned int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
-                                  unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                             unsigned int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
-                                  unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             unsigned int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
-                                  vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             vector bool int *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
-                                  vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                             vector float *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvx(vector float __a, int __b, float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                             float *__c) {
   __builtin_altivec_stvx((vector int)__a, __b, __c);
 }
 
 /* vec_ste */
 
-static void __ATTRS_o_ai vec_ste(vector signed char __a, int __b,
-                                 signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector signed char __a, int __b,
+                                            signed char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector unsigned char __a, int __b,
-                                 unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned char __a, int __b,
+                                            unsigned char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
-                                 signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                            signed char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
-                                 unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                            unsigned char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector short __a, int __b,
+                                            short *__c) {
   __builtin_altivec_stvehx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector unsigned short __a, int __b,
-                                 unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned short __a, int __b,
+                                            unsigned short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector bool short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                            short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
-                                 unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                            unsigned short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector pixel __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                            short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
-                                 unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                            unsigned short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector int __a, int __b, int *__c) {
   __builtin_altivec_stvewx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector unsigned int __a, int __b,
-                                 unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned int __a, int __b,
+                                            unsigned int *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector bool int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                            int *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
-                                 unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                            unsigned int *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_ste(vector float __a, int __b, float *__c) {
+static __inline__ void __ATTRS_o_ai vec_ste(vector float __a, int __b,
+                                            float *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
 /* vec_stvebx */
 
-static void __ATTRS_o_ai vec_stvebx(vector signed char __a, int __b,
-                                    signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector signed char __a, int __b,
+                                               signed char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvebx(vector unsigned char __a, int __b,
-                                    unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
-                                    signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                               signed char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
-                                    unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                               unsigned char *__c) {
   __builtin_altivec_stvebx((vector char)__a, __b, __c);
 }
 
 /* vec_stvehx */
 
-static void __ATTRS_o_ai vec_stvehx(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector short __a, int __b,
+                                               short *__c) {
   __builtin_altivec_stvehx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvehx(vector unsigned short __a, int __b,
-                                    unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
-                                    short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                               short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
-                                    unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                               unsigned short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                               short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
-                                    unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                               unsigned short *__c) {
   __builtin_altivec_stvehx((vector short)__a, __b, __c);
 }
 
 /* vec_stvewx */
 
-static void __ATTRS_o_ai vec_stvewx(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector int __a, int __b,
+                                               int *__c) {
   __builtin_altivec_stvewx(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvewx(vector unsigned int __a, int __b,
-                                    unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                               int *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
-                                    unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                               unsigned int *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvewx(vector float __a, int __b, float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector float __a, int __b,
+                                               float *__c) {
   __builtin_altivec_stvewx((vector int)__a, __b, __c);
 }
 
 /* vec_stl */
 
-static void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
-                                 vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                            vector signed char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
-                                 signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                            signed char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
-                                 vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                            vector unsigned char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
-                                 unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                            unsigned char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
-                                 signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            signed char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
-                                 unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            unsigned char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
-                                 vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            vector bool char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector short __a, int __b, vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector short __a, int __b,
+                                            vector short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector short __a, int __b,
+                                            short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
-                                 vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                            vector unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
-                                 unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                            unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
-                                 unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
-                                 vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            vector bool short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
-                                 unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b, vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            vector pixel *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector int __a, int __b,
+                                            vector int *__c) {
   __builtin_altivec_stvxl(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector int __a, int __b, int *__c) {
   __builtin_altivec_stvxl(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
-                                 vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                            vector unsigned int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
-                                 unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                            unsigned int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
-                                 unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            unsigned int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
-                                 vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            vector bool int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector float __a, int __b, vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector float __a, int __b,
+                                            vector float *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stl(vector float __a, int __b, float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stl(vector float __a, int __b,
+                                            float *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
 /* vec_stvxl */
 
-static void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
-                                   vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                              vector signed char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
-                                   signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                              signed char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
-                                   vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
-                                   unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
-                                   signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              signed char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
-                                   unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              unsigned char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
-                                   vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              vector bool char *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
-                                   vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                              vector short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                              short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector unsigned short __a, int __b,
-                                   vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector unsigned short __a, int __b,
-                                   unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
-                                   unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
-                                   vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              vector bool short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
-                                   unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              unsigned short *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
-                                   vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              vector pixel *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector int __a, int __b,
+                                              vector int *__c) {
   __builtin_altivec_stvxl(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector int __a, int __b,
+                                              int *__c) {
   __builtin_altivec_stvxl(__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
-                                   vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
-                                   unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
-                                   unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              unsigned int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
-                                   vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              vector bool int *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
-                                   vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                              vector float *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvxl(vector float __a, int __b, float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                              float *__c) {
   __builtin_altivec_stvxl((vector int)__a, __b, __c);
 }
 
 /* vec_sub */
 
-static vector signed char __ATTRS_o_ai vec_sub(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector signed char __b) {
   return __a - __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_sub(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a - __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_sub(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector bool char __b) {
   return __a - (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sub(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector unsigned char __b) {
   return __a - __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sub(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a - __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_sub(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector bool char __b) {
   return __a - (vector unsigned char)__b;
 }
 
-static vector short __ATTRS_o_ai vec_sub(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                                    vector short __b) {
   return __a - __b;
 }
 
-static vector short __ATTRS_o_ai vec_sub(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                                    vector short __b) {
   return (vector short)__a - __b;
 }
 
-static vector short __ATTRS_o_ai vec_sub(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                                    vector bool short __b) {
   return __a - (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sub(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector unsigned short __b) {
   return __a - __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sub(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a - __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_sub(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector bool short __b) {
   return __a - (vector unsigned short)__b;
 }
 
-static vector int __ATTRS_o_ai vec_sub(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector int __a,
+                                                  vector int __b) {
   return __a - __b;
 }
 
-static vector int __ATTRS_o_ai vec_sub(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector bool int __a,
+                                                  vector int __b) {
   return (vector int)__a - __b;
 }
 
-static vector int __ATTRS_o_ai vec_sub(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector int __a,
+                                                  vector bool int __b) {
   return __a - (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sub(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector unsigned int __b) {
   return __a - __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sub(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a - __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sub(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector bool int __b) {
   return __a - (vector unsigned int)__b;
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed __int128 __ATTRS_o_ai vec_sub(vector signed __int128 __a,
-                                                   vector signed __int128 __b) {
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sub(vector signed __int128 __a, vector signed __int128 __b) {
   return __a - __b;
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __a - __b;
 }
 #endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_sub(vector signed long long __a, vector signed long long __b) {
   return __a - __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sub(vector unsigned long long __a, vector unsigned long long __b) {
   return __a - __b;
 }
 
-static vector double __ATTRS_o_ai
-vec_sub(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_sub(vector double __a,
+                                                     vector double __b) {
   return __a - __b;
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_sub(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_sub(vector float __a,
+                                                    vector float __b) {
   return __a - __b;
 }
 
@@ -9344,33 +9471,33 @@
 
 #define __builtin_altivec_vsububm vec_vsububm
 
-static vector signed char __ATTRS_o_ai vec_vsububm(vector signed char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector signed char __b) {
   return __a - __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vsububm(vector bool char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a - __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vsububm(vector signed char __a,
-                                                   vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector bool char __b) {
   return __a - (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsububm(vector unsigned char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector unsigned char __b) {
   return __a - __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsububm(vector bool char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a - __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsububm(vector unsigned char __a,
-                                                     vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector bool char __b) {
   return __a - (vector unsigned char)__b;
 }
 
@@ -9378,33 +9505,33 @@
 
 #define __builtin_altivec_vsubuhm vec_vsubuhm
 
-static vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                                        vector short __b) {
   return __a - __b;
 }
 
-static vector short __ATTRS_o_ai vec_vsubuhm(vector bool short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector bool short __a,
+                                                        vector short __b) {
   return (vector short)__a - __b;
 }
 
-static vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
-                                             vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                                        vector bool short __b) {
   return __a - (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vsubuhm(vector unsigned short __a, vector unsigned short __b) {
   return __a - __b;
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vsubuhm(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a - __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsubuhm(vector unsigned short __a,
-                                                      vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector bool short __b) {
   return __a - (vector unsigned short)__b;
 }
 
@@ -9412,32 +9539,33 @@
 
 #define __builtin_altivec_vsubuwm vec_vsubuwm
 
-static vector int __ATTRS_o_ai vec_vsubuwm(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                                      vector int __b) {
   return __a - __b;
 }
 
-static vector int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
-                                           vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                                      vector int __b) {
   return (vector int)__a - __b;
 }
 
-static vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
-                                           vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                                      vector bool int __b) {
   return __a - (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector unsigned int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector unsigned int __b) {
   return __a - __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a - __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector unsigned int __a,
-                                                    vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector bool int __b) {
   return __a - (vector unsigned int)__b;
 }
 
@@ -9445,25 +9573,25 @@
 
 #define __builtin_altivec_vsubfp vec_vsubfp
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vsubfp(vector float __a, vector float __b) {
   return __a - __b;
 }
 
 /* vec_subc */
 
-static vector unsigned int __ATTRS_o_ai vec_subc(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubcuw(__a, __b);
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __builtin_altivec_vsubcuq(__a, __b);
 }
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_subc(vector signed __int128 __a, vector signed __int128 __b) {
   return __builtin_altivec_vsubcuq(__a, __b);
 }
@@ -9471,222 +9599,227 @@
 
 /* vec_vsubcuw */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vsubcuw(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubcuw(__a, __b);
 }
 
 /* vec_subs */
 
-static vector signed char __ATTRS_o_ai vec_subs(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vsubsbs(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_subs(vector bool char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_subs(vector signed char __a,
-                                                vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_subs(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vsububs(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_subs(vector bool char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_subs(vector unsigned char __a,
-                                                  vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
 }
 
-static vector short __ATTRS_o_ai vec_subs(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                                     vector short __b) {
   return __builtin_altivec_vsubshs(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_subs(vector bool short __a,
-                                          vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                                     vector short __b) {
   return __builtin_altivec_vsubshs((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_subs(vector short __a,
-                                          vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                                     vector bool short __b) {
   return __builtin_altivec_vsubshs(__a, (vector short)__b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_subs(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vsubuhs(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_subs(vector bool short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_subs(vector unsigned short __a,
-                                                   vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
 }
 
-static vector int __ATTRS_o_ai vec_subs(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector int __a,
+                                                   vector int __b) {
   return __builtin_altivec_vsubsws(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_subs(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector bool int __a,
+                                                   vector int __b) {
   return __builtin_altivec_vsubsws((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_subs(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector int __a,
+                                                   vector bool int __b) {
   return __builtin_altivec_vsubsws(__a, (vector int)__b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_subs(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubuws(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_subs(vector bool int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_subs(vector unsigned int __a,
-                                                 vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
 }
 
 /* vec_vsubsbs */
 
-static vector signed char __ATTRS_o_ai vec_vsubsbs(vector signed char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector signed char __b) {
   return __builtin_altivec_vsubsbs(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsubsbs(vector bool char __a,
-                                                   vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector bool char __a, vector signed char __b) {
   return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vsubsbs(vector signed char __a,
-                                                   vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector bool char __b) {
   return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
 }
 
 /* vec_vsububs */
 
-static vector unsigned char __ATTRS_o_ai vec_vsububs(vector unsigned char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vsububs(__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsububs(vector bool char __a,
-                                                     vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector bool char __a, vector unsigned char __b) {
   return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vsububs(vector unsigned char __a,
-                                                     vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector bool char __b) {
   return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
 }
 
 /* vec_vsubshs */
 
-static vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                                        vector short __b) {
   return __builtin_altivec_vsubshs(__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vsubshs(vector bool short __a,
-                                             vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector bool short __a,
+                                                        vector short __b) {
   return __builtin_altivec_vsubshs((vector short)__a, __b);
 }
 
-static vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
-                                             vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                                        vector bool short __b) {
   return __builtin_altivec_vsubshs(__a, (vector short)__b);
 }
 
 /* vec_vsubuhs */
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vsubuhs(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_vsubuhs(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_vsubuhs(vector bool short __a, vector unsigned short __b) {
   return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vsubuhs(vector unsigned short __a,
-                                                      vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector bool short __b) {
   return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
 }
 
 /* vec_vsubsws */
 
-static vector int __ATTRS_o_ai vec_vsubsws(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                                      vector int __b) {
   return __builtin_altivec_vsubsws(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vsubsws(vector bool int __a,
-                                           vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector bool int __a,
+                                                      vector int __b) {
   return __builtin_altivec_vsubsws((vector int)__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
-                                           vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                                      vector bool int __b) {
   return __builtin_altivec_vsubsws(__a, (vector int)__b);
 }
 
 /* vec_vsubuws */
 
-static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector unsigned int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubuws(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector bool int __a,
-                                                    vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector bool int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector unsigned int __a,
-                                                    vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector bool int __b) {
   return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 /* vec_vsubuqm */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b) {
   return __a - __b;
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __a - __b;
 }
 
 /* vec_vsubeuqm */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
              vector signed __int128 __c) {
   return __builtin_altivec_vsubeuqm(__a, __b, __c);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
              vector unsigned __int128 __c) {
   return __builtin_altivec_vsubeuqm(__a, __b, __c);
@@ -9694,25 +9827,25 @@
 
 /* vec_vsubcuq */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b) {
   return __builtin_altivec_vsubcuq(__a, __b);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
   return __builtin_altivec_vsubcuq(__a, __b);
 }
 
 /* vec_vsubecuq */
 
-static vector signed __int128 __ATTRS_o_ai
+static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b,
              vector signed __int128 __c) {
   return __builtin_altivec_vsubecuq(__a, __b, __c);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
              vector unsigned __int128 __c) {
   return __builtin_altivec_vsubecuq(__a, __b, __c);
@@ -9721,38 +9854,38 @@
 
 /* vec_sum4s */
 
-static vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
-                                         vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
+                                                    vector int __b) {
   return __builtin_altivec_vsum4sbs(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai vec_sum4s(vector unsigned char __a,
-                                                  vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sum4s(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_altivec_vsum4ubs(__a, __b);
 }
 
-static vector int __ATTRS_o_ai vec_sum4s(vector signed short __a,
-                                         vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed short __a,
+                                                    vector int __b) {
   return __builtin_altivec_vsum4shs(__a, __b);
 }
 
 /* vec_vsum4sbs */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vsum4sbs(vector signed char __a, vector int __b) {
   return __builtin_altivec_vsum4sbs(__a, __b);
 }
 
 /* vec_vsum4ubs */
 
-static vector unsigned int __attribute__((__always_inline__))
+static __inline__ vector unsigned int __attribute__((__always_inline__))
 vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_altivec_vsum4ubs(__a, __b);
 }
 
 /* vec_vsum4shs */
 
-static vector int __attribute__((__always_inline__))
+static __inline__ vector int __attribute__((__always_inline__))
 vec_vsum4shs(vector signed short __a, vector int __b) {
   return __builtin_altivec_vsum4shs(__a, __b);
 }
@@ -9765,7 +9898,7 @@
    programmer wants elements 1 and 3 in both cases, so for little
    endian we must perform some permutes.  */
 
-static vector signed int __attribute__((__always_inline__))
+static __inline__ vector signed int __attribute__((__always_inline__))
 vec_sum2s(vector int __a, vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   vector int __c = (vector signed int)vec_perm(
@@ -9782,7 +9915,7 @@
 
 /* vec_vsum2sws */
 
-static vector signed int __attribute__((__always_inline__))
+static __inline__ vector signed int __attribute__((__always_inline__))
 vec_vsum2sws(vector int __a, vector int __b) {
 #ifdef __LITTLE_ENDIAN__
   vector int __c = (vector signed int)vec_perm(
@@ -9805,7 +9938,7 @@
    wants element 3 in both cases, so for little endian we must perform
    some permutes.  */
 
-static vector signed int __attribute__((__always_inline__))
+static __inline__ vector signed int __attribute__((__always_inline__))
 vec_sums(vector signed int __a, vector signed int __b) {
 #ifdef __LITTLE_ENDIAN__
   __b = (vector signed int)vec_splat(__b, 3);
@@ -9818,7 +9951,7 @@
 
 /* vec_vsumsws */
 
-static vector signed int __attribute__((__always_inline__))
+static __inline__ vector signed int __attribute__((__always_inline__))
 vec_vsumsws(vector signed int __a, vector signed int __b) {
 #ifdef __LITTLE_ENDIAN__
   __b = (vector signed int)vec_splat(__b, 3);
@@ -9831,8 +9964,7 @@
 
 /* vec_trunc */
 
-static vector float __ATTRS_o_ai
-vec_trunc(vector float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_trunc(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvrspiz(__a);
 #else
@@ -9841,14 +9973,14 @@
 }
 
 #ifdef __VSX__
-static vector double __ATTRS_o_ai vec_trunc(vector double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_trunc(vector double __a) {
   return __builtin_vsx_xvrdpiz(__a);
 }
 #endif
 
 /* vec_vrfiz */
 
-static vector float __attribute__((__always_inline__))
+static __inline__ vector float __attribute__((__always_inline__))
 vec_vrfiz(vector float __a) {
   return __builtin_altivec_vrfiz(__a);
 }
@@ -9858,7 +9990,8 @@
 /* The vector unpack instructions all have a big-endian bias, so for
    little endian we must reverse the meanings of "high" and "low."  */
 
-static vector short __ATTRS_o_ai vec_unpackh(vector signed char __a) {
+static __inline__ vector short __ATTRS_o_ai
+vec_unpackh(vector signed char __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupklsb((vector char)__a);
 #else
@@ -9866,7 +9999,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_unpackh(vector bool char __a) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_unpackh(vector bool char __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
 #else
@@ -9874,7 +10008,7 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_unpackh(vector short __a) {
+static __inline__ vector int __ATTRS_o_ai vec_unpackh(vector short __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupklsh(__a);
 #else
@@ -9882,7 +10016,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_unpackh(vector bool short __a) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_unpackh(vector bool short __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
 #else
@@ -9890,7 +10025,8 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_unpackh(vector pixel __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unpackh(vector pixel __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
 #else
@@ -9899,7 +10035,7 @@
 }
 
 #ifdef __POWER8_VECTOR__
-static vector long long __ATTRS_o_ai vec_unpackh(vector int __a) {
+static __inline__ vector long long __ATTRS_o_ai vec_unpackh(vector int __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupklsw(__a);
 #else
@@ -9907,7 +10043,8 @@
 #endif
 }
 
-static vector bool long long __ATTRS_o_ai vec_unpackh(vector bool int __a) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_unpackh(vector bool int __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
 #else
@@ -9918,7 +10055,8 @@
 
 /* vec_vupkhsb */
 
-static vector short __ATTRS_o_ai vec_vupkhsb(vector signed char __a) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vupkhsb(vector signed char __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupklsb((vector char)__a);
 #else
@@ -9926,7 +10064,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_vupkhsb(vector bool char __a) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vupkhsb(vector bool char __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
 #else
@@ -9936,7 +10075,7 @@
 
 /* vec_vupkhsh */
 
-static vector int __ATTRS_o_ai vec_vupkhsh(vector short __a) {
+static __inline__ vector int __ATTRS_o_ai vec_vupkhsh(vector short __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupklsh(__a);
 #else
@@ -9944,7 +10083,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_vupkhsh(vector bool short __a) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vupkhsh(vector bool short __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
 #else
@@ -9952,7 +10092,8 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vupkhsh(vector pixel __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vupkhsh(vector pixel __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
 #else
@@ -9963,7 +10104,7 @@
 /* vec_vupkhsw */
 
 #ifdef __POWER8_VECTOR__
-static vector long long __ATTRS_o_ai vec_vupkhsw(vector int __a) {
+static __inline__ vector long long __ATTRS_o_ai vec_vupkhsw(vector int __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupklsw(__a);
 #else
@@ -9971,7 +10112,8 @@
 #endif
 }
 
-static vector bool long long __ATTRS_o_ai vec_vupkhsw(vector bool int __a) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vupkhsw(vector bool int __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
 #else
@@ -9982,7 +10124,8 @@
 
 /* vec_unpackl */
 
-static vector short __ATTRS_o_ai vec_unpackl(vector signed char __a) {
+static __inline__ vector short __ATTRS_o_ai
+vec_unpackl(vector signed char __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupkhsb((vector char)__a);
 #else
@@ -9990,7 +10133,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_unpackl(vector bool char __a) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_unpackl(vector bool char __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
 #else
@@ -9998,7 +10142,7 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_unpackl(vector short __a) {
+static __inline__ vector int __ATTRS_o_ai vec_unpackl(vector short __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupkhsh(__a);
 #else
@@ -10006,7 +10150,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_unpackl(vector bool short __a) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_unpackl(vector bool short __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
 #else
@@ -10014,7 +10159,8 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_unpackl(vector pixel __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unpackl(vector pixel __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
 #else
@@ -10023,7 +10169,7 @@
 }
 
 #ifdef __POWER8_VECTOR__
-static vector long long __ATTRS_o_ai vec_unpackl(vector int __a) {
+static __inline__ vector long long __ATTRS_o_ai vec_unpackl(vector int __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupkhsw(__a);
 #else
@@ -10031,7 +10177,8 @@
 #endif
 }
 
-static vector bool long long __ATTRS_o_ai vec_unpackl(vector bool int __a) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_unpackl(vector bool int __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
 #else
@@ -10042,7 +10189,8 @@
 
 /* vec_vupklsb */
 
-static vector short __ATTRS_o_ai vec_vupklsb(vector signed char __a) {
+static __inline__ vector short __ATTRS_o_ai
+vec_vupklsb(vector signed char __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupkhsb((vector char)__a);
 #else
@@ -10050,7 +10198,8 @@
 #endif
 }
 
-static vector bool short __ATTRS_o_ai vec_vupklsb(vector bool char __a) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vupklsb(vector bool char __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
 #else
@@ -10060,7 +10209,7 @@
 
 /* vec_vupklsh */
 
-static vector int __ATTRS_o_ai vec_vupklsh(vector short __a) {
+static __inline__ vector int __ATTRS_o_ai vec_vupklsh(vector short __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupkhsh(__a);
 #else
@@ -10068,7 +10217,8 @@
 #endif
 }
 
-static vector bool int __ATTRS_o_ai vec_vupklsh(vector bool short __a) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vupklsh(vector bool short __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
 #else
@@ -10076,7 +10226,8 @@
 #endif
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vupklsh(vector pixel __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vupklsh(vector pixel __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
 #else
@@ -10087,7 +10238,7 @@
 /* vec_vupklsw */
 
 #ifdef __POWER8_VECTOR__
-static vector long long __ATTRS_o_ai vec_vupklsw(vector int __a) {
+static __inline__ vector long long __ATTRS_o_ai vec_vupklsw(vector int __a) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vupkhsw(__a);
 #else
@@ -10095,7 +10246,8 @@
 #endif
 }
 
-static vector bool long long __ATTRS_o_ai vec_vupklsw(vector bool int __a) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vupklsw(vector bool int __a) {
 #ifdef __LITTLE_ENDIAN__
   return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
 #else
@@ -10108,248 +10260,437 @@
 
 #ifdef __VSX__
 
-static vector signed int __ATTRS_o_ai vec_vsx_ld(int __a,
-                                                 const vector signed int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed int *__b) {
   return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector unsigned int *__b) {
   return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
 }
 
-static vector float __ATTRS_o_ai vec_vsx_ld(int __a, const vector float *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector float *__b) {
   return (vector float)__builtin_vsx_lxvw4x(__a, __b);
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector float __ATTRS_o_ai vec_vsx_ld(int __a,
+                                                       const float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector signed long long *__b) {
   return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector unsigned long long *__b) {
   return (vector unsigned long long)__builtin_vsx_lxvd2x(__a, __b);
 }
 
-static vector double __ATTRS_o_ai vec_vsx_ld(int __a,
-                                             const vector double *__b) {
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector double *__b) {
   return (vector double)__builtin_vsx_lxvd2x(__a, __b);
 }
 
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed short *__b) {
+  return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed short *__b) {
+  return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
 #endif
 
 /* vec_vsx_st */
 
 #ifdef __VSX__
 
-static void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
-                                    vector signed int *__c) {
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               vector bool int *__c) {
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
-                                    vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               signed int *__c) {
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
-                                    vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               unsigned int *__c) {
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_vsx_st(vector signed long long __a, int __b,
-                                    vector signed long long *__c) {
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                               vector signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                               signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               vector float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed long long __a,
+                                               int __b,
+                                               vector signed long long *__c) {
   __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_vsx_st(vector unsigned long long __a, int __b,
-                                    vector unsigned long long *__c) {
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned long long __a,
+                                               int __b,
+                                               vector unsigned long long *__c) {
   __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
 }
 
-static void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
-                                    vector double *__c) {
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               vector double *__c) {
   __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
 }
 
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
+                                               vector signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
 #endif
 
 /* vec_xor */
 
 #define __builtin_altivec_vxor vec_xor
 
-static vector signed char __ATTRS_o_ai vec_xor(vector signed char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector signed char __b) {
   return __a ^ __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_xor(vector bool char __a,
-                                               vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a ^ __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_xor(vector signed char __a,
-                                               vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector bool char __b) {
   return __a ^ (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b) {
   return __a ^ __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_xor(vector bool char __a,
-                                                 vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a ^ __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
-                                                 vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector bool char __b) {
   return __a ^ (vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_xor(vector bool char __a,
-                                             vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                                        vector bool char __b) {
   return __a ^ __b;
 }
 
-static vector short __ATTRS_o_ai vec_xor(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                                    vector short __b) {
   return __a ^ __b;
 }
 
-static vector short __ATTRS_o_ai vec_xor(vector bool short __a,
-                                         vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                                    vector short __b) {
   return (vector short)__a ^ __b;
 }
 
-static vector short __ATTRS_o_ai vec_xor(vector short __a,
-                                         vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                                    vector bool short __b) {
   return __a ^ (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_xor(vector unsigned short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector unsigned short __b) {
   return __a ^ __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_xor(vector bool short __a,
-                                                  vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a ^ __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_xor(vector unsigned short __a,
-                                                  vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector bool short __b) {
   return __a ^ (vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_xor(vector bool short __a,
-                                              vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector bool short __b) {
   return __a ^ __b;
 }
 
-static vector int __ATTRS_o_ai vec_xor(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector int __a,
+                                                  vector int __b) {
   return __a ^ __b;
 }
 
-static vector int __ATTRS_o_ai vec_xor(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                  vector int __b) {
   return (vector int)__a ^ __b;
 }
 
-static vector int __ATTRS_o_ai vec_xor(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector int __a,
+                                                  vector bool int __b) {
   return __a ^ (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_xor(vector unsigned int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector unsigned int __b) {
   return __a ^ __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_xor(vector bool int __a,
-                                                vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a ^ __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_xor(vector unsigned int __a,
-                                                vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector bool int __b) {
   return __a ^ (vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_xor(vector bool int __a,
-                                            vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                       vector bool int __b) {
   return __a ^ __b;
 }
 
-static vector float __ATTRS_o_ai vec_xor(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a ^ (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_xor(vector bool int __a,
-                                         vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                    vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a ^ (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_xor(vector float __a,
-                                         vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                                    vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a ^ (vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_xor(vector signed long long __a, vector signed long long __b) {
   return __a ^ __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_xor(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a ^ __b;
 }
 
-static vector signed long long __ATTRS_o_ai vec_xor(vector signed long long __a,
-                                                    vector bool long long __b) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector bool long long __b) {
   return __a ^ (vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_xor(vector unsigned long long __a, vector unsigned long long __b) {
   return __a ^ __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_xor(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a ^ __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_xor(vector unsigned long long __a, vector bool long long __b) {
   return __a ^ (vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_xor(vector bool long long __a,
-                                                  vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector bool long long __b) {
   return __a ^ __b;
 }
 
-static vector double __ATTRS_o_ai
-vec_xor(vector double __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_xor(vector double __a,
+                                                     vector double __b) {
   return (vector double)((vector unsigned long long)__a ^
-                          (vector unsigned long long)__b);
+                         (vector unsigned long long)__b);
 }
 
-static vector double __ATTRS_o_ai
+static __inline__ vector double __ATTRS_o_ai
 vec_xor(vector double __a, vector bool long long __b) {
   return (vector double)((vector unsigned long long)__a ^
-                         (vector unsigned long long) __b);
+                         (vector unsigned long long)__b);
 }
 
-static vector double __ATTRS_o_ai
-vec_xor(vector bool long long __a, vector double __b) {
+static __inline__ vector double __ATTRS_o_ai vec_xor(vector bool long long __a,
+                                                     vector double __b) {
   return (vector double)((vector unsigned long long)__a ^
                          (vector unsigned long long)__b);
 }
@@ -10357,160 +10698,165 @@
 
 /* vec_vxor */
 
-static vector signed char __ATTRS_o_ai vec_vxor(vector signed char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector signed char __b) {
   return __a ^ __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vxor(vector bool char __a,
-                                                vector signed char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector signed char __b) {
   return (vector signed char)__a ^ __b;
 }
 
-static vector signed char __ATTRS_o_ai vec_vxor(vector signed char __a,
-                                                vector bool char __b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector bool char __b) {
   return __a ^ (vector signed char)__b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vxor(vector unsigned char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector unsigned char __b) {
   return __a ^ __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vxor(vector bool char __a,
-                                                  vector unsigned char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector unsigned char __b) {
   return (vector unsigned char)__a ^ __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_vxor(vector unsigned char __a,
-                                                  vector bool char __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector bool char __b) {
   return __a ^ (vector unsigned char)__b;
 }
 
-static vector bool char __ATTRS_o_ai vec_vxor(vector bool char __a,
-                                              vector bool char __b) {
+static __inline__ vector bool char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                         vector bool char __b) {
   return __a ^ __b;
 }
 
-static vector short __ATTRS_o_ai vec_vxor(vector short __a, vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                                     vector short __b) {
   return __a ^ __b;
 }
 
-static vector short __ATTRS_o_ai vec_vxor(vector bool short __a,
-                                          vector short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                                     vector short __b) {
   return (vector short)__a ^ __b;
 }
 
-static vector short __ATTRS_o_ai vec_vxor(vector short __a,
-                                          vector bool short __b) {
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                                     vector bool short __b) {
   return __a ^ (vector short)__b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vxor(vector unsigned short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector unsigned short __b) {
   return __a ^ __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vxor(vector bool short __a,
-                                                   vector unsigned short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector unsigned short __b) {
   return (vector unsigned short)__a ^ __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_vxor(vector unsigned short __a,
-                                                   vector bool short __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector bool short __b) {
   return __a ^ (vector unsigned short)__b;
 }
 
-static vector bool short __ATTRS_o_ai vec_vxor(vector bool short __a,
-                                               vector bool short __b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector bool short __b) {
   return __a ^ __b;
 }
 
-static vector int __ATTRS_o_ai vec_vxor(vector int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector int __a,
+                                                   vector int __b) {
   return __a ^ __b;
 }
 
-static vector int __ATTRS_o_ai vec_vxor(vector bool int __a, vector int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                   vector int __b) {
   return (vector int)__a ^ __b;
 }
 
-static vector int __ATTRS_o_ai vec_vxor(vector int __a, vector bool int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector int __a,
+                                                   vector bool int __b) {
   return __a ^ (vector int)__b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vxor(vector unsigned int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector unsigned int __b) {
   return __a ^ __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vxor(vector bool int __a,
-                                                 vector unsigned int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector unsigned int __b) {
   return (vector unsigned int)__a ^ __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_vxor(vector unsigned int __a,
-                                                 vector bool int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector bool int __b) {
   return __a ^ (vector unsigned int)__b;
 }
 
-static vector bool int __ATTRS_o_ai vec_vxor(vector bool int __a,
-                                             vector bool int __b) {
+static __inline__ vector bool int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                        vector bool int __b) {
   return __a ^ __b;
 }
 
-static vector float __ATTRS_o_ai vec_vxor(vector float __a, vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a ^ (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vxor(vector bool int __a,
-                                          vector float __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                     vector float __b) {
   vector unsigned int __res =
       (vector unsigned int)__a ^ (vector unsigned int)__b;
   return (vector float)__res;
 }
 
-static vector float __ATTRS_o_ai vec_vxor(vector float __a,
-                                          vector bool int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                                     vector bool int __b) {
   vector unsigned int __res =
       (vector unsigned int)__a ^ (vector unsigned int)__b;
   return (vector float)__res;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vxor(vector signed long long __a, vector signed long long __b) {
   return __a ^ __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vxor(vector bool long long __a, vector signed long long __b) {
   return (vector signed long long)__a ^ __b;
 }
 
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_vxor(vector signed long long __a, vector bool long long __b) {
   return __a ^ (vector signed long long)__b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vxor(vector unsigned long long __a, vector unsigned long long __b) {
   return __a ^ __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vxor(vector bool long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)__a ^ __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_vxor(vector unsigned long long __a, vector bool long long __b) {
   return __a ^ (vector unsigned long long)__b;
 }
 
-static vector bool long long __ATTRS_o_ai vec_vxor(vector bool long long __a,
-                                                   vector bool long long __b) {
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector bool long long __b) {
   return __a ^ __b;
 }
 #endif
@@ -10519,674 +10865,702 @@
 
 /* vec_extract */
 
-static signed char __ATTRS_o_ai vec_extract(vector signed char __a, int __b) {
+static __inline__ signed char __ATTRS_o_ai vec_extract(vector signed char __a,
+                                                       int __b) {
   return __a[__b];
 }
 
-static unsigned char __ATTRS_o_ai vec_extract(vector unsigned char __a,
-                                              int __b) {
+static __inline__ unsigned char __ATTRS_o_ai
+vec_extract(vector unsigned char __a, int __b) {
   return __a[__b];
 }
 
-static unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
-                                              int __b) {
+static __inline__ unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
+                                                         int __b) {
   return __a[__b];
 }
 
-static signed short __ATTRS_o_ai vec_extract(vector signed short __a, int __b) {
+static __inline__ signed short __ATTRS_o_ai vec_extract(vector signed short __a,
+                                                        int __b) {
   return __a[__b];
 }
 
-static unsigned short __ATTRS_o_ai vec_extract(vector unsigned short __a,
-                                               int __b) {
+static __inline__ unsigned short __ATTRS_o_ai
+vec_extract(vector unsigned short __a, int __b) {
   return __a[__b];
 }
 
-static unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
-                                               int __b) {
+static __inline__ unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
+                                                          int __b) {
   return __a[__b];
 }
 
-static signed int __ATTRS_o_ai vec_extract(vector signed int __a, int __b) {
+static __inline__ signed int __ATTRS_o_ai vec_extract(vector signed int __a,
+                                                      int __b) {
   return __a[__b];
 }
 
-static unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a, int __b) {
+static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a,
+                                                        int __b) {
   return __a[__b];
 }
 
-static unsigned int __ATTRS_o_ai vec_extract(vector bool int __a, int __b) {
+static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector bool int __a,
+                                                        int __b) {
   return __a[__b];
 }
 
 #ifdef __VSX__
-static signed long long __ATTRS_o_ai vec_extract(vector signed long long __a,
-                                                 int __b) {
+static __inline__ signed long long __ATTRS_o_ai
+vec_extract(vector signed long long __a, int __b) {
   return __a[__b];
 }
 
-static unsigned long long __ATTRS_o_ai
+static __inline__ unsigned long long __ATTRS_o_ai
 vec_extract(vector unsigned long long __a, int __b) {
   return __a[__b];
 }
 
-static unsigned long long __ATTRS_o_ai vec_extract(vector bool long long __a,
-                                                   int __b) {
+static __inline__ unsigned long long __ATTRS_o_ai
+vec_extract(vector bool long long __a, int __b) {
   return __a[__b];
 }
 
-static double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
+static __inline__ double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
   return __a[__b];
 }
 #endif
 
-static float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
+static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
   return __a[__b];
 }
 
 /* vec_insert */
 
-static vector signed char __ATTRS_o_ai vec_insert(signed char __a,
-                                                  vector signed char __b,
-                                                  int __c) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_insert(signed char __a, vector signed char __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_insert(unsigned char __a,
-                                                    vector unsigned char __b,
-                                                    int __c) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_insert(unsigned char __a, vector unsigned char __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
-                                                vector bool char __b,
-                                                int __c) {
+static __inline__ vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                           vector bool char __b,
+                                                           int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector signed short __ATTRS_o_ai vec_insert(signed short __a,
-                                                   vector signed short __b,
-                                                   int __c) {
+static __inline__ vector signed short __ATTRS_o_ai
+vec_insert(signed short __a, vector signed short __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_insert(unsigned short __a,
-                                                     vector unsigned short __b,
-                                                     int __c) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector unsigned short __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector bool short __ATTRS_o_ai vec_insert(unsigned short __a,
-                                                 vector bool short __b,
-                                                 int __c) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector bool short __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector signed int __ATTRS_o_ai vec_insert(signed int __a,
-                                                 vector signed int __b,
-                                                 int __c) {
+static __inline__ vector signed int __ATTRS_o_ai
+vec_insert(signed int __a, vector signed int __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_insert(unsigned int __a,
-                                                   vector unsigned int __b,
-                                                   int __c) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_insert(unsigned int __a, vector unsigned int __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
-                                               vector bool int __b,
-                                               int __c) {
+static __inline__ vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                                          vector bool int __b,
+                                                          int __c) {
   __b[__c] = __a;
   return __b;
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai
+static __inline__ vector signed long long __ATTRS_o_ai
 vec_insert(signed long long __a, vector signed long long __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
 
-static vector bool long long __ATTRS_o_ai
+static __inline__ vector bool long long __ATTRS_o_ai
 vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
   __b[__c] = __a;
   return __b;
 }
-static vector double __ATTRS_o_ai vec_insert(double __a, vector double __b,
-                                             int __c) {
+static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
+                                                        vector double __b,
+                                                        int __c) {
   __b[__c] = __a;
   return __b;
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_insert(float __a, vector float __b,
-                                            int __c) {
+static __inline__ vector float __ATTRS_o_ai vec_insert(float __a,
+                                                       vector float __b,
+                                                       int __c) {
   __b[__c] = __a;
   return __b;
 }
 
 /* vec_lvlx */
 
-static vector signed char __ATTRS_o_ai vec_lvlx(int __a,
-                                                const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const signed char *__b) {
   return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector signed char __ATTRS_o_ai vec_lvlx(int __a,
-                                                const vector signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const vector signed char *__b) {
   return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvlx(int __a,
-                                                  const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned char *__b) {
   return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_lvlx(int __a, const vector unsigned char *__b) {
   return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool char __ATTRS_o_ai vec_lvlx(int __a,
-                                              const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool char *__b) {
   return vec_perm(vec_ld(__a, __b), (vector bool char)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector short __ATTRS_o_ai vec_lvlx(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const short *__b) {
   return vec_perm(vec_ld(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
 }
 
-static vector short __ATTRS_o_ai vec_lvlx(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector short *__b) {
   return vec_perm(vec_ld(__a, __b), (vector short)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvlx(int __a,
-                                                   const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned short *__b) {
   return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_lvlx(int __a, const vector unsigned short *__b) {
   return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool short __ATTRS_o_ai vec_lvlx(int __a,
-                                               const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool short *__b) {
   return vec_perm(vec_ld(__a, __b), (vector bool short)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector pixel __ATTRS_o_ai vec_lvlx(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector pixel *__b) {
   return vec_perm(vec_ld(__a, __b), (vector pixel)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector int __ATTRS_o_ai vec_lvlx(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvlx(int __a, const int *__b) {
   return vec_perm(vec_ld(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
 }
 
-static vector int __ATTRS_o_ai vec_lvlx(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvlx(int __a,
+                                                   const vector int *__b) {
   return vec_perm(vec_ld(__a, __b), (vector int)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvlx(int __a,
-                                                 const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned int *__b) {
   return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_lvlx(int __a, const vector unsigned int *__b) {
   return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool int __ATTRS_o_ai vec_lvlx(int __a,
-                                             const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool int *__b) {
   return vec_perm(vec_ld(__a, __b), (vector bool int)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector float __ATTRS_o_ai vec_lvlx(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const float *__b) {
   return vec_perm(vec_ld(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
 }
 
-static vector float __ATTRS_o_ai vec_lvlx(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector float *__b) {
   return vec_perm(vec_ld(__a, __b), (vector float)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
 /* vec_lvlxl */
 
-static vector signed char __ATTRS_o_ai vec_lvlxl(int __a,
-                                                 const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const signed char *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector signed char __ATTRS_o_ai
+static __inline__ vector signed char __ATTRS_o_ai
 vec_lvlxl(int __a, const vector signed char *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvlxl(int __a,
-                                                   const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned char *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_lvlxl(int __a, const vector unsigned char *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool char __ATTRS_o_ai vec_lvlxl(int __a,
-                                               const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool char *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector bool char)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector short __ATTRS_o_ai vec_lvlxl(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const short *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
 }
 
-static vector short __ATTRS_o_ai vec_lvlxl(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const vector short *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector short)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvlxl(int __a,
-                                                    const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned short *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_lvlxl(int __a, const vector unsigned short *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool short __ATTRS_o_ai vec_lvlxl(int __a,
-                                                const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool short *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector bool short)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector pixel __ATTRS_o_ai vec_lvlxl(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const vector pixel *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector pixel)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector int __ATTRS_o_ai vec_lvlxl(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvlxl(int __a, const int *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
 }
 
-static vector int __ATTRS_o_ai vec_lvlxl(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvlxl(int __a,
+                                                    const vector int *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector int)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvlxl(int __a,
-                                                  const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned int *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_lvlxl(int __a, const vector unsigned int *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool int __ATTRS_o_ai vec_lvlxl(int __a,
-                                              const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool int *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector bool int)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector float __ATTRS_o_ai vec_lvlxl(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const float *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
 }
 
-static vector float __ATTRS_o_ai vec_lvlxl(int __a, vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      vector float *__b) {
   return vec_perm(vec_ldl(__a, __b), (vector float)(0),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
 /* vec_lvrx */
 
-static vector signed char __ATTRS_o_ai vec_lvrx(int __a,
-                                                const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const signed char *__b) {
   return vec_perm((vector signed char)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector signed char __ATTRS_o_ai vec_lvrx(int __a,
-                                                const vector signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const vector signed char *__b) {
   return vec_perm((vector signed char)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvrx(int __a,
-                                                  const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned char *__b) {
   return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_lvrx(int __a, const vector unsigned char *__b) {
   return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool char __ATTRS_o_ai vec_lvrx(int __a,
-                                              const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool char *__b) {
   return vec_perm((vector bool char)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector short __ATTRS_o_ai vec_lvrx(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const short *__b) {
   return vec_perm((vector short)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
 }
 
-static vector short __ATTRS_o_ai vec_lvrx(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector short *__b) {
   return vec_perm((vector short)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvrx(int __a,
-                                                   const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned short *__b) {
   return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_lvrx(int __a, const vector unsigned short *__b) {
   return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool short __ATTRS_o_ai vec_lvrx(int __a,
-                                               const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool short *__b) {
   return vec_perm((vector bool short)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector pixel __ATTRS_o_ai vec_lvrx(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector pixel *__b) {
   return vec_perm((vector pixel)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector int __ATTRS_o_ai vec_lvrx(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvrx(int __a, const int *__b) {
   return vec_perm((vector int)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
 }
 
-static vector int __ATTRS_o_ai vec_lvrx(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvrx(int __a,
+                                                   const vector int *__b) {
   return vec_perm((vector int)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvrx(int __a,
-                                                 const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned int *__b) {
   return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_lvrx(int __a, const vector unsigned int *__b) {
   return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool int __ATTRS_o_ai vec_lvrx(int __a,
-                                             const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool int *__b) {
   return vec_perm((vector bool int)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector float __ATTRS_o_ai vec_lvrx(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const float *__b) {
   return vec_perm((vector float)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
 }
 
-static vector float __ATTRS_o_ai vec_lvrx(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector float *__b) {
   return vec_perm((vector float)(0), vec_ld(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
 /* vec_lvrxl */
 
-static vector signed char __ATTRS_o_ai vec_lvrxl(int __a,
-                                                 const signed char *__b) {
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const signed char *__b) {
   return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector signed char __ATTRS_o_ai
+static __inline__ vector signed char __ATTRS_o_ai
 vec_lvrxl(int __a, const vector signed char *__b) {
   return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned char __ATTRS_o_ai vec_lvrxl(int __a,
-                                                   const unsigned char *__b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned char *__b) {
   return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 vec_lvrxl(int __a, const vector unsigned char *__b) {
   return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool char __ATTRS_o_ai vec_lvrxl(int __a,
-                                               const vector bool char *__b) {
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool char *__b) {
   return vec_perm((vector bool char)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector short __ATTRS_o_ai vec_lvrxl(int __a, const short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const short *__b) {
   return vec_perm((vector short)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
 }
 
-static vector short __ATTRS_o_ai vec_lvrxl(int __a, const vector short *__b) {
+static __inline__ vector short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector short *__b) {
   return vec_perm((vector short)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned short __ATTRS_o_ai vec_lvrxl(int __a,
-                                                    const unsigned short *__b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned short *__b) {
   return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 vec_lvrxl(int __a, const vector unsigned short *__b) {
   return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool short __ATTRS_o_ai vec_lvrxl(int __a,
-                                                const vector bool short *__b) {
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool short *__b) {
   return vec_perm((vector bool short)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector pixel __ATTRS_o_ai vec_lvrxl(int __a, const vector pixel *__b) {
+static __inline__ vector pixel __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector pixel *__b) {
   return vec_perm((vector pixel)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector int __ATTRS_o_ai vec_lvrxl(int __a, const int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvrxl(int __a, const int *__b) {
   return vec_perm((vector int)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
 }
 
-static vector int __ATTRS_o_ai vec_lvrxl(int __a, const vector int *__b) {
+static __inline__ vector int __ATTRS_o_ai vec_lvrxl(int __a,
+                                                    const vector int *__b) {
   return vec_perm((vector int)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector unsigned int __ATTRS_o_ai vec_lvrxl(int __a,
-                                                  const unsigned int *__b) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned int *__b) {
   return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, __b));
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 vec_lvrxl(int __a, const vector unsigned int *__b) {
   return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector bool int __ATTRS_o_ai vec_lvrxl(int __a,
-                                              const vector bool int *__b) {
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool int *__b) {
   return vec_perm((vector bool int)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
-static vector float __ATTRS_o_ai vec_lvrxl(int __a, const float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const float *__b) {
   return vec_perm((vector float)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
 }
 
-static vector float __ATTRS_o_ai vec_lvrxl(int __a, const vector float *__b) {
+static __inline__ vector float __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector float *__b) {
   return vec_perm((vector float)(0), vec_ldl(__a, __b),
                   vec_lvsl(__a, (unsigned char *)__b));
 }
 
 /* vec_stvlx */
 
-static void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
-                                   signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                              signed char *__c) {
   return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
-                                   vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                              vector signed char *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
-                                   unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
   return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
-                                   vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector bool char __a, int __b,
-                                   vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool char __a, int __b,
+                                              vector bool char *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                              short *__c) {
   return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
-                                   vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                              vector short *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector unsigned short __a, int __b,
-                                   unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
   return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector unsigned short __a, int __b,
-                                   vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector bool short __a, int __b,
-                                   vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool short __a, int __b,
+                                              vector bool short *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector pixel __a, int __b,
-                                   vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector pixel __a, int __b,
+                                              vector pixel *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector int __a, int __b,
+                                              int *__c) {
   return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector int __a, int __b,
+                                              vector int *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
-                                   unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
   return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
-                                   vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector bool int __a, int __b,
-                                   vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool int __a, int __b,
+                                              vector bool int *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlx(vector float __a, int __b,
-                                   vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector float __a, int __b,
+                                              vector float *__c) {
   return vec_st(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
@@ -11194,111 +11568,116 @@
 
 /* vec_stvlxl */
 
-static void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
-                                    signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                               signed char *__c) {
   return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
-                                    vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                               vector signed char *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a, int __b,
-                                    unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
   return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a, int __b,
-                                    vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector bool char __a, int __b,
-                                    vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool char __a, int __b,
+                                               vector bool char *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                               short *__c) {
   return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
-                                    vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                               vector short *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a, int __b,
-                                    unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
   return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a, int __b,
-                                    vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector bool short __a, int __b,
-                                    vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool short __a, int __b,
+                                               vector bool short *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector pixel __a, int __b,
-                                    vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector pixel __a, int __b,
+                                               vector pixel *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b,
+                                               int *__c) {
   return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b,
+                                               vector int *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
-                                    unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
   return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
-                                    vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector bool int __a, int __b,
-                                    vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool int __a, int __b,
+                                               vector bool int *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvlxl(vector float __a, int __b,
-                                    vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector float __a, int __b,
+                                               vector float *__c) {
   return vec_stl(
       vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
@@ -11306,111 +11685,115 @@
 
 /* vec_stvrx */
 
-static void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
-                                   signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                              signed char *__c) {
   return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
-                                   vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                              vector signed char *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
-                                   unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
   return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
-                                   vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector bool char __a, int __b,
-                                   vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool char __a, int __b,
+                                              vector bool char *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                              short *__c) {
   return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
-                                   vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                              vector short *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector unsigned short __a, int __b,
-                                   unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
   return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector unsigned short __a, int __b,
-                                   vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector bool short __a, int __b,
-                                   vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool short __a, int __b,
+                                              vector bool short *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector pixel __a, int __b,
-                                   vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector pixel __a, int __b,
+                                              vector pixel *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector int __a, int __b,
+                                              int *__c) {
   return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector int __a, int __b,
+                                              vector int *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
-                                   unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
   return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                 __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
-                                   vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector bool int __a, int __b,
-                                   vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool int __a, int __b,
+                                              vector bool int *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrx(vector float __a, int __b,
-                                   vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector float __a, int __b,
+                                              vector float *__c) {
   return vec_st(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
@@ -11418,111 +11801,116 @@
 
 /* vec_stvrxl */
 
-static void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
-                                    signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                               signed char *__c) {
   return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
-                                    vector signed char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                               vector signed char *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a, int __b,
-                                    unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
   return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a, int __b,
-                                    vector unsigned char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector bool char __a, int __b,
-                                    vector bool char *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool char __a, int __b,
+                                               vector bool char *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b, short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                               short *__c) {
   return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
-                                    vector short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                               vector short *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a, int __b,
-                                    unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
   return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a, int __b,
-                                    vector unsigned short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector bool short __a, int __b,
-                                    vector bool short *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool short __a, int __b,
+                                               vector bool short *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector pixel __a, int __b,
-                                    vector pixel *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector pixel __a, int __b,
+                                               vector pixel *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b, int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b,
+                                               int *__c) {
   return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b, vector int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b,
+                                               vector int *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
-                                    unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
   return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
                  __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
-                                    vector unsigned int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector bool int __a, int __b,
-                                    vector bool int *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool int __a, int __b,
+                                               vector bool int *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
 }
 
-static void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,
-                                    vector float *__c) {
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,
+                                               vector float *__c) {
   return vec_stl(
       vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
       __b, __c);
@@ -11530,45 +11918,47 @@
 
 /* vec_promote */
 
-static vector signed char __ATTRS_o_ai vec_promote(signed char __a, int __b) {
+static __inline__ vector signed char __ATTRS_o_ai vec_promote(signed char __a,
+                                                              int __b) {
   vector signed char __res = (vector signed char)(0);
   __res[__b] = __a;
   return __res;
 }
 
-static vector unsigned char __ATTRS_o_ai vec_promote(unsigned char __a,
-                                                     int __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_promote(unsigned char __a, int __b) {
   vector unsigned char __res = (vector unsigned char)(0);
   __res[__b] = __a;
   return __res;
 }
 
-static vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
+static __inline__ vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
   vector short __res = (vector short)(0);
   __res[__b] = __a;
   return __res;
 }
 
-static vector unsigned short __ATTRS_o_ai vec_promote(unsigned short __a,
-                                                      int __b) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_promote(unsigned short __a, int __b) {
   vector unsigned short __res = (vector unsigned short)(0);
   __res[__b] = __a;
   return __res;
 }
 
-static vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
+static __inline__ vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
   vector int __res = (vector int)(0);
   __res[__b] = __a;
   return __res;
 }
 
-static vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a, int __b) {
+static __inline__ vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a,
+                                                               int __b) {
   vector unsigned int __res = (vector unsigned int)(0);
   __res[__b] = __a;
   return __res;
 }
 
-static vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
+static __inline__ vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
   vector float __res = (vector float)(0);
   __res[__b] = __a;
   return __res;
@@ -11576,56 +11966,63 @@
 
 /* vec_splats */
 
-static vector signed char __ATTRS_o_ai vec_splats(signed char __a) {
+static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a) {
   return (vector signed char)(__a);
 }
 
-static vector unsigned char __ATTRS_o_ai vec_splats(unsigned char __a) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splats(unsigned char __a) {
   return (vector unsigned char)(__a);
 }
 
-static vector short __ATTRS_o_ai vec_splats(short __a) {
+static __inline__ vector short __ATTRS_o_ai vec_splats(short __a) {
   return (vector short)(__a);
 }
 
-static vector unsigned short __ATTRS_o_ai vec_splats(unsigned short __a) {
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splats(unsigned short __a) {
   return (vector unsigned short)(__a);
 }
 
-static vector int __ATTRS_o_ai vec_splats(int __a) { return (vector int)(__a); }
+static __inline__ vector int __ATTRS_o_ai vec_splats(int __a) {
+  return (vector int)(__a);
+}
 
-static vector unsigned int __ATTRS_o_ai vec_splats(unsigned int __a) {
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splats(unsigned int __a) {
   return (vector unsigned int)(__a);
 }
 
 #ifdef __VSX__
-static vector signed long long __ATTRS_o_ai vec_splats(signed long long __a) {
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_splats(signed long long __a) {
   return (vector signed long long)(__a);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_splats(unsigned long long __a) {
   return (vector unsigned long long)(__a);
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
-static vector signed __int128 __ATTRS_o_ai vec_splats(signed __int128 __a) {
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_splats(signed __int128 __a) {
   return (vector signed __int128)(__a);
 }
 
-static vector unsigned __int128 __ATTRS_o_ai
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
 vec_splats(unsigned __int128 __a) {
   return (vector unsigned __int128)(__a);
 }
 
 #endif
 
-static vector double __ATTRS_o_ai vec_splats(double __a) {
+static __inline__ vector double __ATTRS_o_ai vec_splats(double __a) {
   return (vector double)(__a);
 }
 #endif
 
-static vector float __ATTRS_o_ai vec_splats(float __a) {
+static __inline__ vector float __ATTRS_o_ai vec_splats(float __a) {
   return (vector float)(__a);
 }
 
@@ -11633,168 +12030,177 @@
 
 /* vec_all_eq */
 
-static int __ATTRS_o_ai vec_all_eq(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector pixel __a, vector pixel __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector pixel __a,
+                                              vector pixel __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
                                       (vector int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
-                                   vector long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
                                       (vector long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_all_eq(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpeqsp_p(__CR6_LT, __a, __b);
 #else
@@ -11803,160 +12209,169 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_eq(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpeqdp_p(__CR6_LT, __a, __b);
 }
 #endif
 
 /* vec_all_ge */
 
-static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
 }
-static int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 #endif
 
-static int __ATTRS_o_ai vec_all_ge(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __a, __b);
 #else
@@ -11965,160 +12380,169 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_ge(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __a, __b);
 }
 #endif
 
 /* vec_all_gt */
 
-static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
 }
-static int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_all_gt(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __a, __b);
 #else
@@ -12127,168 +12551,177 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_gt(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __a, __b);
 }
 #endif
 
 /* vec_all_in */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_all_in(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
 }
 
 /* vec_all_le */
 
-static int __ATTRS_o_ai vec_all_le(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_all_le(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_all_le(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __b, __a);
 #else
@@ -12297,161 +12730,170 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_le(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_le(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __b, __a);
 }
 #endif
 
 /* vec_all_lt */
 
-static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 #endif
 
-static int __ATTRS_o_ai vec_all_lt(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __b, __a);
 #else
@@ -12460,14 +12902,15 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_lt(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __b, __a);
 }
 #endif
 
 /* vec_all_nan */
 
-static int __ATTRS_o_ai vec_all_nan(vector float __a) {
+static __inline__ int __ATTRS_o_ai vec_all_nan(vector float __a) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __a);
 #else
@@ -12476,176 +12919,185 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_nan(vector double __a) {
+static __inline__ int __ATTRS_o_ai vec_all_nan(vector double __a) {
   return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __a);
 }
 #endif
 
 /* vec_all_ne */
 
-static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector pixel __a, vector pixel __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector pixel __a,
+                                              vector pixel __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
                                       (vector int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
                                       (vector signed long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_all_ne(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
-  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __b); /* float: sp compare */
 #else
@@ -12654,15 +13106,16 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_all_ne(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
 }
 #endif
 
 /* vec_all_nge */
 
-static int __ATTRS_o_ai
-vec_all_nge(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_nge(vector float __a,
+                                               vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgesp_p(__CR6_EQ, __a, __b);
 #else
@@ -12671,16 +13124,16 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai
-vec_all_nge(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_nge(vector double __a,
+                                               vector double __b) {
   return __builtin_vsx_xvcmpgedp_p(__CR6_EQ, __a, __b);
 }
 #endif
 
 /* vec_all_ngt */
 
-static int __ATTRS_o_ai
-vec_all_ngt(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ngt(vector float __a,
+                                               vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ, __a, __b);
 #else
@@ -12689,198 +13142,207 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai
-vec_all_ngt(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_all_ngt(vector double __a,
+                                               vector double __b) {
   return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ, __a, __b);
 }
 #endif
 
 /* vec_all_nle */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_all_nle(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
 }
 
 /* vec_all_nlt */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_all_nlt(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
 }
 
 /* vec_all_numeric */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_all_numeric(vector float __a) {
   return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
 }
 
 /* vec_any_eq */
 
-static int __ATTRS_o_ai vec_any_eq(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector pixel __a, vector pixel __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector pixel __a,
+                                              vector pixel __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_any_eq(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ_REV, __a, __b);
 #else
@@ -12889,168 +13351,177 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_any_eq(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ_REV, __a, __b);
 }
 #endif
 
 /* vec_any_ge */
 
-static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b,
                                       (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
                                       (vector signed long long)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
                                       (vector unsigned long long)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
                                       (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
                                       (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 #endif
 
-static int __ATTRS_o_ai vec_any_ge(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __a, __b);
 #else
@@ -13059,168 +13530,177 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_any_ge(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __a, __b);
 }
 #endif
 
 /* vec_any_gt */
 
-static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a,
                                       (vector signed char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a,
                                       (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
                                       (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
                                       (vector unsigned long long)__a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
                                       (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_any_gt(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __a, __b);
 #else
@@ -13229,168 +13709,177 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_any_gt(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __a, __b);
 }
 #endif
 
 /* vec_any_le */
 
-static int __ATTRS_o_ai vec_any_le(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a,
                                       (vector signed char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
                                       (vector unsigned char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
                                       (vector unsigned short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a,
                                       (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
                                       __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
                                       (vector unsigned int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_any_le(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
                                       (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
                                       (vector unsigned long long)__a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
                                       (vector unsigned long long)__a,
                                       (vector unsigned long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_any_le(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __b, __a);
 #else
@@ -13399,168 +13888,177 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_any_le(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_any_le(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __b, __a);
 }
 #endif
 
 /* vec_any_lt */
 
-static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
                                       (vector unsigned char)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
                                       (vector unsigned short)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
                                       __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b,
                                       (vector unsigned int)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
                                       (vector unsigned int)__a);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
                                       (vector signed long long)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
                                       (vector unsigned long long)__b, __a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
                                       (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b,
                                       (vector unsigned long long)__a);
 }
 
-static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
                                       (vector unsigned long long)__b,
                                       (vector unsigned long long)__a);
 }
 #endif
 
-static int __ATTRS_o_ai vec_any_lt(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __b, __a);
 #else
@@ -13569,182 +14067,193 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_any_lt(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __b, __a);
 }
 #endif
 
 /* vec_any_nan */
 
-static int __attribute__((__always_inline__)) vec_any_nan(vector float __a) {
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nan(vector float __a) {
   return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
 }
 
 /* vec_any_ne */
 
-static int __ATTRS_o_ai vec_any_ne(vector signed char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector signed char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
-                                   vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool char __a,
-                                   vector signed char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector signed char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool char __a,
-                                   vector unsigned char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector unsigned char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool char __a, vector bool char __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector bool char __b) {
   return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
                                       (vector char)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector short __a, vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool short __a, vector short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool short __a,
-                                   vector unsigned short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector unsigned short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool short __a,
-                                   vector bool short __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector bool short __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector pixel __a, vector pixel __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector pixel __a,
+                                              vector pixel __b) {
   return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
                                       (vector short)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector int __a, vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
-                                   vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool int __a, vector int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool int __a,
-                                   vector unsigned int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector unsigned int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool int __a, vector bool int __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector bool int __b) {
   return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
                                       (vector int)__b);
 }
 
 #ifdef __POWER8_VECTOR__
-static int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
                                       (vector long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
                                       (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
-                                   vector signed long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector signed long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
-                                   vector unsigned long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector unsigned long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 
-static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
-                                   vector bool long long __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector bool long long __b) {
   return __builtin_altivec_vcmpequd_p(
       __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
 }
 #endif
 
-static int __ATTRS_o_ai vec_any_ne(vector float __a, vector float __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector float __a,
+                                              vector float __b) {
 #ifdef __VSX__
   return __builtin_vsx_xvcmpeqsp_p(__CR6_LT_REV, __a, __b);
 #else
@@ -13753,49 +14262,50 @@
 }
 
 #ifdef __VSX__
-static int __ATTRS_o_ai vec_any_ne(vector double __a, vector double __b) {
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector double __a,
+                                              vector double __b) {
   return __builtin_vsx_xvcmpeqdp_p(__CR6_LT_REV, __a, __b);
 }
 #endif
 
 /* vec_any_nge */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_any_nge(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
 }
 
 /* vec_any_ngt */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_any_ngt(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
 }
 
 /* vec_any_nle */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_any_nle(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
 }
 
 /* vec_any_nlt */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_any_nlt(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
 }
 
 /* vec_any_numeric */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_any_numeric(vector float __a) {
   return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
 }
 
 /* vec_any_out */
 
-static int __attribute__((__always_inline__))
+static __inline__ int __attribute__((__always_inline__))
 vec_any_out(vector float __a, vector float __b) {
   return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
 }
@@ -13820,30 +14330,30 @@
 #define vec_ncipher_be __builtin_altivec_crypto_vncipher
 #define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
 
-static vector unsigned long long __attribute__((__always_inline__))
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vsbox(vector unsigned long long __a) {
   return __builtin_altivec_crypto_vsbox(__a);
 }
 
-static vector unsigned long long __attribute__((__always_inline__))
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vcipher(vector unsigned long long __a,
                          vector unsigned long long __b) {
   return __builtin_altivec_crypto_vcipher(__a, __b);
 }
 
-static vector unsigned long long __attribute__((__always_inline__))
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vcipherlast(vector unsigned long long __a,
                              vector unsigned long long __b) {
   return __builtin_altivec_crypto_vcipherlast(__a, __b);
 }
 
-static vector unsigned long long __attribute__((__always_inline__))
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vncipher(vector unsigned long long __a,
                           vector unsigned long long __b) {
   return __builtin_altivec_crypto_vncipher(__a, __b);
 }
 
-static vector unsigned long long __attribute__((__always_inline__))
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vncipherlast(vector unsigned long long __a,
                               vector unsigned long long __b) {
   return __builtin_altivec_crypto_vncipherlast(__a, __b);
@@ -13852,20 +14362,20 @@
 #define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
 #define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
 
-#define vec_shasigma_be(X, Y, Z) \
-  _Generic((X), vector unsigned int: __builtin_crypto_vshasigmaw, \
-                vector unsigned long long: __builtin_crypto_vshasigmad) \
-((X), (Y), (Z))
+#define vec_shasigma_be(X, Y, Z)                                               \
+  _Generic((X), vector unsigned int                                            \
+           : __builtin_crypto_vshasigmaw, vector unsigned long long            \
+           : __builtin_crypto_vshasigmad)((X), (Y), (Z))
 #endif
 
 #ifdef __POWER8_VECTOR__
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 __builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
                           vector unsigned char __c) {
   return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 __builtin_crypto_vpermxor(vector unsigned short __a, vector unsigned short __b,
                           vector unsigned short __c) {
   return (vector unsigned short)__builtin_altivec_crypto_vpermxor(
@@ -13873,73 +14383,72 @@
       (vector unsigned char)__c);
 }
 
-static vector unsigned int __ATTRS_o_ai __builtin_crypto_vpermxor(
+static __inline__ vector unsigned int __ATTRS_o_ai __builtin_crypto_vpermxor(
     vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
   return (vector unsigned int)__builtin_altivec_crypto_vpermxor(
       (vector unsigned char)__a, (vector unsigned char)__b,
       (vector unsigned char)__c);
 }
 
-static vector unsigned long long __ATTRS_o_ai __builtin_crypto_vpermxor(
-    vector unsigned long long __a, vector unsigned long long __b,
-    vector unsigned long long __c) {
+static __inline__ vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned long long __a,
+                          vector unsigned long long __b,
+                          vector unsigned long long __c) {
   return (vector unsigned long long)__builtin_altivec_crypto_vpermxor(
       (vector unsigned char)__a, (vector unsigned char)__b,
       (vector unsigned char)__c);
 }
 
-static vector unsigned char __ATTRS_o_ai
+static __inline__ vector unsigned char __ATTRS_o_ai
 __builtin_crypto_vpmsumb(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_crypto_vpmsumb(__a, __b);
 }
 
-static vector unsigned short __ATTRS_o_ai
+static __inline__ vector unsigned short __ATTRS_o_ai
 __builtin_crypto_vpmsumb(vector unsigned short __a, vector unsigned short __b) {
   return __builtin_altivec_crypto_vpmsumh(__a, __b);
 }
 
-static vector unsigned int __ATTRS_o_ai
+static __inline__ vector unsigned int __ATTRS_o_ai
 __builtin_crypto_vpmsumb(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_crypto_vpmsumw(__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai
+static __inline__ vector unsigned long long __ATTRS_o_ai
 __builtin_crypto_vpmsumb(vector unsigned long long __a,
                          vector unsigned long long __b) {
   return __builtin_altivec_crypto_vpmsumd(__a, __b);
 }
 
-static vector signed char __ATTRS_o_ai vec_vgbbd (vector signed char __a)
-{
-  return __builtin_altivec_vgbbd((vector unsigned char) __a);
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vgbbd(vector signed char __a) {
+  return __builtin_altivec_vgbbd((vector unsigned char)__a);
 }
 
 #define vec_pmsum_be __builtin_crypto_vpmsumb
 #define vec_gb __builtin_altivec_vgbbd
 
-static vector unsigned char __ATTRS_o_ai vec_vgbbd (vector unsigned char __a)
-{
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vgbbd(vector unsigned char __a) {
   return __builtin_altivec_vgbbd(__a);
 }
 
-static vector long long __ATTRS_o_ai
-vec_vbpermq (vector signed char __a, vector signed char __b)
-{
-  return __builtin_altivec_vbpermq((vector unsigned char) __a,
-                                   (vector unsigned char) __b);
+static __inline__ vector long long __ATTRS_o_ai
+vec_vbpermq(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char)__a,
+                                   (vector unsigned char)__b);
 }
 
-static vector long long __ATTRS_o_ai
-vec_vbpermq (vector unsigned char __a, vector unsigned char __b)
-{
+static __inline__ vector long long __ATTRS_o_ai
+vec_vbpermq(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_altivec_vbpermq(__a, __b);
 }
 
 #ifdef __powerpc64__
-static vector unsigned long long __attribute__((__always_inline__))
-vec_bperm (vector unsigned __int128 __a, vector unsigned char __b) {
-  return __builtin_altivec_vbpermq((vector unsigned char) __a,
-                                   (vector unsigned char) __b);
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char)__a,
+                                   (vector unsigned char)__b);
 }
 #endif
 #endif
diff --git a/lib/Headers/ammintrin.h b/lib/Headers/ammintrin.h
index 4880fd7..8985bb4 100644
--- a/lib/Headers/ammintrin.h
+++ b/lib/Headers/ammintrin.h
@@ -38,9 +38,7 @@
 /// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
 /// \endcode
 ///
-/// \code
 /// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode
 ///
 /// \param x
 ///    The value from which bits are extracted.
@@ -49,10 +47,10 @@
 ///    are zero, the length is interpreted as 64.
 /// \param idx
 ///    Bits [5:0] specify the index of the least significant bit; the other
-///    bits are ignored. If the sum of the index and length is greater than
-///    64, the result is undefined. If the length and index are both zero,
-///    bits [63:0] of parameter x are extracted. If the length is zero
-///    but the index is non-zero, the result is undefined.
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter x are extracted. If the length is zero but the index
+///    is non-zero, the result is undefined.
 /// \returns A 128-bit integer vector whose lower 64 bits contain the bits
 ///    extracted from the source operand.
 #define _mm_extracti_si64(x, len, idx) \
@@ -64,20 +62,17 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode
 ///
 /// \param __x
 ///    The value from which bits are extracted.
 /// \param __y
-///    Specifies the index of the least significant bit at [13:8]
-///    and the length at [5:0]; all other bits are ignored.
-///    If bits [5:0] are zero, the length is interpreted as 64.
-///    If the sum of the index and length is greater than 64, the result is
-///    undefined. If the length and index are both zero, bits [63:0] of
-///    parameter __x are extracted. If the length is zero but the index is
-///    non-zero, the result is undefined.
+///    Specifies the index of the least significant bit at [13:8] and the
+///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+///    length is interpreted as 64. If the sum of the index and length is
+///    greater than 64, the result is undefined. If the length and index are
+///    both zero, bits [63:0] of parameter __x are extracted. If the length is
+///    zero but the index is non-zero, the result is undefined.
 /// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
 ///    from the source operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -86,9 +81,9 @@
   return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
 }
 
-/// \brief Inserts bits of a specified length from the source integer vector
-///    y into the lower 64 bits of the destination integer vector x at the
-///    index idx and of the length len.
+/// \brief Inserts bits of a specified length from the source integer vector y
+///    into the lower 64 bits of the destination integer vector x at the index
+///    idx and of the length len.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -97,9 +92,7 @@
 /// const int idx);
 /// \endcode
 ///
-/// \code
 /// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode
 ///
 /// \param x
 ///    The destination operand where bits will be inserted. The inserted bits
@@ -113,14 +106,14 @@
 ///    are zero, the length is interpreted as 64.
 /// \param idx
 ///    Bits [5:0] specify the index of the least significant bit; the other
-///    bits are ignored. If the sum of the index and length is greater than
-///    64, the result is undefined. If the length and index are both zero,
-///    bits [63:0] of parameter y are inserted into parameter x. If the
-///    length is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits
-///    of destination operand x with the specified bitfields replaced by the
-///    lower bits of source operand y. The upper 64 bits of the return value
-///    are undefined.
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter y are inserted into parameter x. If the length is
+///    zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64 bits of
+///    destination operand x with the specified bitfields replaced by the lower
+///    bits of source operand y. The upper 64 bits of the return value are
+///    undefined.
 
 #define _mm_inserti_si64(x, y, len, idx) \
   ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
@@ -128,14 +121,12 @@
                                     (char)(len), (char)(idx)))
 
 /// \brief Inserts bits of a specified length from the source integer vector
-///    __y into the lower 64 bits of the destination integer vector __x at
-///    the index and of the length specified by __y.
+///    __y into the lower 64 bits of the destination integer vector __x at the
+///    index and of the length specified by __y.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode
 ///
 /// \param __x
 ///    The destination operand where bits will be inserted. The inserted bits
@@ -145,14 +136,14 @@
 ///    The source operand containing the bits to be extracted. The extracted
 ///    bits are the least significant bits of operand __y with length specified
 ///    by bits [69:64]. These are inserted into the destination at the index
-///    specified by bits [77:72]; all other bits are ignored.
-///    If bits [69:64] are zero, the length is interpreted as 64.
-///    If the sum of the index and length is greater than 64, the result is
-///    undefined. If the length and index are both zero, bits [63:0] of
-///    parameter __y are inserted into parameter __x. If the length
-///    is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits
-///    of destination operand __x with the specified bitfields replaced by the
+///    specified by bits [77:72]; all other bits are ignored. If bits [69:64]
+///    are zero, the length is interpreted as 64. If the sum of the index and
+///    length is greater than 64, the result is undefined. If the length and
+///    index are both zero, bits [63:0] of parameter __y are inserted into
+///    parameter __x. If the length is zero but the index is non-zero, the
+///    result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64 bits of
+///    destination operand __x with the specified bitfields replaced by the
 ///    lower bits of source operand __y. The upper 64 bits of the return value
 ///    are undefined.
 
@@ -168,15 +159,12 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c MOVNTSD instruction.
-/// \endcode
 ///
 /// \param __p
 ///    The 64-bit memory location used to store the register value.
 /// \param __a
-///    The 64-bit double-precision floating-point register value to
-///    be stored.
+///    The 64-bit double-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_sd(double *__p, __m128d __a)
 {
@@ -189,15 +177,12 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c MOVNTSS instruction.
-/// \endcode
 ///
 /// \param __p
 ///    The 32-bit memory location used to store the register value.
 /// \param __a
-///    The 32-bit single-precision floating-point register value to
-///    be stored.
+///    The 32-bit single-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ss(float *__p, __m128 __a)
 {
diff --git a/lib/Headers/arm_acle.h b/lib/Headers/arm_acle.h
index 4be1d09..8423e62 100644
--- a/lib/Headers/arm_acle.h
+++ b/lib/Headers/arm_acle.h
@@ -72,9 +72,11 @@
 
 /* 8.5 Swap */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __swp(uint32_t x, volatile uint32_t *p) {
+__swp(uint32_t __x, volatile uint32_t *__p) {
   uint32_t v;
-  do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
+  do
+    v = __builtin_arm_ldrex(__p);
+  while (__builtin_arm_strex(__x, __p));
   return v;
 }
 
@@ -110,113 +112,115 @@
 /* 9.2 Miscellaneous data-processing intrinsics */
 /* ROR */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __ror(uint32_t x, uint32_t y) {
-  y %= 32;
-  if (y == 0)  return x;
-  return (x >> y) | (x << (32 - y));
+__ror(uint32_t __x, uint32_t __y) {
+  __y %= 32;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (32 - __y));
 }
 
 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-  __rorll(uint64_t x, uint32_t y) {
-  y %= 64;
-  if (y == 0)  return x;
-  return (x >> y) | (x << (64 - y));
+__rorll(uint64_t __x, uint32_t __y) {
+  __y %= 64;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (64 - __y));
 }
 
 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-  __rorl(unsigned long x, uint32_t y) {
+__rorl(unsigned long __x, uint32_t __y) {
 #if __SIZEOF_LONG__ == 4
-  return __ror(x, y);
+  return __ror(__x, __y);
 #else
-  return __rorll(x, y);
+  return __rorll(__x, __y);
 #endif
 }
 
 
 /* CLZ */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __clz(uint32_t t) {
-  return __builtin_clz(t);
+__clz(uint32_t __t) {
+  return __builtin_clz(__t);
 }
 
 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-  __clzl(unsigned long t) {
-  return __builtin_clzl(t);
+__clzl(unsigned long __t) {
+  return __builtin_clzl(__t);
 }
 
 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-  __clzll(uint64_t t) {
-  return __builtin_clzll(t);
+__clzll(uint64_t __t) {
+  return __builtin_clzll(__t);
 }
 
 /* REV */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __rev(uint32_t t) {
-  return __builtin_bswap32(t);
+__rev(uint32_t __t) {
+  return __builtin_bswap32(__t);
 }
 
 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-  __revl(unsigned long t) {
+__revl(unsigned long __t) {
 #if __SIZEOF_LONG__ == 4
-  return __builtin_bswap32(t);
+  return __builtin_bswap32(__t);
 #else
-  return __builtin_bswap64(t);
+  return __builtin_bswap64(__t);
 #endif
 }
 
 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-  __revll(uint64_t t) {
-  return __builtin_bswap64(t);
+__revll(uint64_t __t) {
+  return __builtin_bswap64(__t);
 }
 
 /* REV16 */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __rev16(uint32_t t) {
-  return __ror(__rev(t), 16);
+__rev16(uint32_t __t) {
+  return __ror(__rev(__t), 16);
 }
 
 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-  __rev16ll(uint64_t t) {
-  return (((uint64_t)__rev16(t >> 32)) << 32) | __rev16(t);
+__rev16ll(uint64_t __t) {
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
 }
 
 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-  __rev16l(unsigned long t) {
+__rev16l(unsigned long __t) {
 #if __SIZEOF_LONG__ == 4
-    return __rev16(t);
+    return __rev16(__t);
 #else
-    return __rev16ll(t);
+    return __rev16ll(__t);
 #endif
 }
 
 /* REVSH */
 static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
-  __revsh(int16_t t) {
-  return __builtin_bswap16(t);
+__revsh(int16_t __t) {
+  return __builtin_bswap16(__t);
 }
 
 /* RBIT */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __rbit(uint32_t t) {
-  return __builtin_arm_rbit(t);
+__rbit(uint32_t __t) {
+  return __builtin_arm_rbit(__t);
 }
 
 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-  __rbitll(uint64_t t) {
+__rbitll(uint64_t __t) {
 #if __ARM_32BIT_STATE
-  return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
-    __builtin_arm_rbit(t >> 32);
+  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
+         __builtin_arm_rbit(__t >> 32);
 #else
-  return __builtin_arm_rbit64(t);
+  return __builtin_arm_rbit64(__t);
 #endif
 }
 
 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-  __rbitl(unsigned long t) {
+__rbitl(unsigned long __t) {
 #if __SIZEOF_LONG__ == 4
-  return __rbit(t);
+  return __rbit(__t);
 #else
-  return __rbitll(t);
+  return __rbitll(__t);
 #endif
 }
 
@@ -235,61 +239,61 @@
 /* 9.4.2 Saturating addition and subtraction intrinsics */
 #if __ARM_32BIT_STATE
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-  __qadd(int32_t t, int32_t v) {
-  return __builtin_arm_qadd(t, v);
+__qadd(int32_t __t, int32_t __v) {
+  return __builtin_arm_qadd(__t, __v);
 }
 
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-  __qsub(int32_t t, int32_t v) {
-  return __builtin_arm_qsub(t, v);
+__qsub(int32_t __t, int32_t __v) {
+  return __builtin_arm_qsub(__t, __v);
 }
 
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__qdbl(int32_t t) {
-  return __builtin_arm_qadd(t, t);
+__qdbl(int32_t __t) {
+  return __builtin_arm_qadd(__t, __t);
 }
 #endif
 
 /* 9.7 CRC32 intrinsics */
 #if __ARM_FEATURE_CRC32
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32b(uint32_t a, uint8_t b) {
-  return __builtin_arm_crc32b(a, b);
+__crc32b(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32b(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32h(uint32_t a, uint16_t b) {
-  return __builtin_arm_crc32h(a, b);
+__crc32h(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32h(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32w(uint32_t a, uint32_t b) {
-  return __builtin_arm_crc32w(a, b);
+__crc32w(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32w(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32d(uint32_t a, uint64_t b) {
-  return __builtin_arm_crc32d(a, b);
+__crc32d(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32d(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32cb(uint32_t a, uint8_t b) {
-  return __builtin_arm_crc32cb(a, b);
+__crc32cb(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32cb(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32ch(uint32_t a, uint16_t b) {
-  return __builtin_arm_crc32ch(a, b);
+__crc32ch(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32ch(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32cw(uint32_t a, uint32_t b) {
-  return __builtin_arm_crc32cw(a, b);
+__crc32cw(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32cw(__a, __b);
 }
 
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-  __crc32cd(uint32_t a, uint64_t b) {
-  return __builtin_arm_crc32cd(a, b);
+__crc32cd(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32cd(__a, __b);
 }
 #endif
 
diff --git a/lib/Headers/armintr.h b/lib/Headers/armintr.h
new file mode 100644
index 0000000..933afcb
--- /dev/null
+++ b/lib/Headers/armintr.h
@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum /* Barrier option selectors; meanings assumed from names - verify. */
+{
+  _ARM_BARRIER_SY    = 0xF, /* presumably full system */
+  _ARM_BARRIER_ST    = 0xE, /* presumably full system, stores only */
+  _ARM_BARRIER_ISH   = 0xB, /* presumably inner shareable */
+  _ARM_BARRIER_ISHST = 0xA, /* presumably inner shareable, stores only */
+  _ARM_BARRIER_NSH   = 0x7, /* presumably non-shareable */
+  _ARM_BARRIER_NSHST = 0x6, /* presumably non-shareable, stores only */
+  _ARM_BARRIER_OSH   = 0x3, /* presumably outer shareable */
+  _ARM_BARRIER_OSHST = 0x2  /* presumably outer shareable, stores only */
+} _ARMINTR_BARRIER_TYPE;
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */
diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h
index f786572..13bcbef 100644
--- a/lib/Headers/avx2intrin.h
+++ b/lib/Headers/avx2intrin.h
@@ -32,7 +32,9 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
-#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
+#define _mm256_mpsadbw_epu8(X, Y, M) /* parenthesized: safe in any expr */ \
+  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                      (__v32qi)(__m256i)(Y), (int)(M)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_abs_epi8(__m256i __a)
@@ -79,25 +81,25 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi8(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v32qi)__a + (__v32qi)__b);
+  return (__m256i)((__v32qu)__a + (__v32qu)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi16(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v16hi)__a + (__v16hi)__b);
+  return (__m256i)((__v16hu)__a + (__v16hu)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi32(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v8si)__a + (__v8si)__b);
+  return (__m256i)((__v8su)__a + (__v8su)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi64(__m256i __a, __m256i __b)
 {
-  return __a + __b;
+  return (__m256i)((__v4du)__a + (__v4du)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -131,13 +133,13 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_and_si256(__m256i __a, __m256i __b)
 {
-  return __a & __b;
+  return (__m256i)((__v4du)__a & (__v4du)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_andnot_si256(__m256i __a, __m256i __b)
 {
-  return ~__a & __b;
+  return (__m256i)(~(__v4du)__a & (__v4du)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -200,7 +202,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 {
-  return (__m256i)(__a == __b);
+  return (__m256i)((__v4di)__a == (__v4di)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -226,7 +228,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 {
-  return (__m256i)(__a > __b);
+  return (__m256i)((__v4di)__a > (__v4di)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -358,73 +360,79 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi8_epi16(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi8_epi32(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi8_epi64(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi16_epi32(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
+  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi16_epi64(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_epi64(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
+  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu8_epi16(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
+  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu8_epi32(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu8_epi64(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu16_epi32(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
+  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu16_epi64(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu32_epi64(__m128i __V)
 {
-  return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
+  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
 }
 
 static __inline__  __m256i __DEFAULT_FN_ATTRS
@@ -454,13 +462,13 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mullo_epi16(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v16hi)__a * (__v16hi)__b);
+  return (__m256i)((__v16hu)__a * (__v16hu)__b);
 }
 
 static __inline__  __m256i __DEFAULT_FN_ATTRS
 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v8si)__a * (__v8si)__b);
+  return (__m256i)((__v8su)__a * (__v8su)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -472,7 +480,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_or_si256(__m256i __a, __m256i __b)
 {
-  return __a | __b;
+  return (__m256i)((__v4du)__a | (__v4du)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -489,38 +497,42 @@
 
 #define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
   (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
-                                   (__v8si)_mm256_setzero_si256(), \
-                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6)); })
+                                   (__v8si)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })
 
 #define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
   (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
-                                   (__v16hi)_mm256_setzero_si256(), \
+                                   (__v16hi)_mm256_undefined_si256(), \
                                    0, 1, 2, 3, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6), \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
                                    8, 9, 10, 11, \
-                                   12 + (((imm) & 0x03) >> 0), \
-                                   12 + (((imm) & 0x0c) >> 2), \
-                                   12 + (((imm) & 0x30) >> 4), \
-                                   12 + (((imm) & 0xc0) >> 6)); })
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3)); })
 
 #define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
   (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
-                                   (__v16hi)_mm256_setzero_si256(), \
-                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   (__v16hi)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
                                    4, 5, 6, 7, \
-                                   8 + (((imm) & 0x03) >> 0), \
-                                   8 + (((imm) & 0x0c) >> 2), \
-                                   8 + (((imm) & 0x30) >> 4), \
-                                   8 + (((imm) & 0xc0) >> 6), \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
                                    12, 13, 14, 15); })
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -541,8 +553,42 @@
     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
 }
 
-#define _mm256_slli_si256(a, count) __extension__ ({ \
-  (__m256i)__builtin_ia32_pslldqi256((__m256i)(a), (count)*8); })
+#define _mm256_slli_si256(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector(                                          \
+        (__v32qi)_mm256_setzero_si256(),                                     \
+        (__v32qi)(__m256i)(a),                                               \
+        ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); })
 
 #define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
 
@@ -573,13 +619,13 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_slli_epi64(__m256i __a, int __count)
 {
-  return __builtin_ia32_psllqi256(__a, __count);
+  return __builtin_ia32_psllqi256((__v4di)__a, __count);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sll_epi64(__m256i __a, __m128i __count)
 {
-  return __builtin_ia32_psllq256(__a, __count);
+  return __builtin_ia32_psllq256((__v4di)__a, __count);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -606,8 +652,42 @@
   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
 }
 
-#define _mm256_srli_si256(a, count) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrldqi256((__m256i)(a), (count)*8); })
+#define _mm256_srli_si256(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector(                                           \
+        (__v32qi)(__m256i)(a),                                               \
+        (__v32qi)_mm256_setzero_si256(),                                     \
+        ((char)(imm)&0xF0) ? 32 : (char)(imm) + ((char)(imm)>0xF ? 16 : 0),  \
+        ((char)(imm)&0xF0) ? 33 : (char)(imm) + ((char)(imm)>0xE ? 17 : 1),  \
+        ((char)(imm)&0xF0) ? 34 : (char)(imm) + ((char)(imm)>0xD ? 18 : 2),  \
+        ((char)(imm)&0xF0) ? 35 : (char)(imm) + ((char)(imm)>0xC ? 19 : 3),  \
+        ((char)(imm)&0xF0) ? 36 : (char)(imm) + ((char)(imm)>0xB ? 20 : 4),  \
+        ((char)(imm)&0xF0) ? 37 : (char)(imm) + ((char)(imm)>0xA ? 21 : 5),  \
+        ((char)(imm)&0xF0) ? 38 : (char)(imm) + ((char)(imm)>0x9 ? 22 : 6),  \
+        ((char)(imm)&0xF0) ? 39 : (char)(imm) + ((char)(imm)>0x8 ? 23 : 7),  \
+        ((char)(imm)&0xF0) ? 40 : (char)(imm) + ((char)(imm)>0x7 ? 24 : 8),  \
+        ((char)(imm)&0xF0) ? 41 : (char)(imm) + ((char)(imm)>0x6 ? 25 : 9),  \
+        ((char)(imm)&0xF0) ? 42 : (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \
+        ((char)(imm)&0xF0) ? 43 : (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \
+        ((char)(imm)&0xF0) ? 44 : (char)(imm) + ((char)(imm)>0x3 ? 28 : 12), \
+        ((char)(imm)&0xF0) ? 45 : (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \
+        ((char)(imm)&0xF0) ? 46 : (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \
+        ((char)(imm)&0xF0) ? 47 : (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \
+        ((char)(imm)&0xF0) ? 48 : (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \
+        ((char)(imm)&0xF0) ? 49 : (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \
+        ((char)(imm)&0xF0) ? 50 : (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \
+        ((char)(imm)&0xF0) ? 51 : (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \
+        ((char)(imm)&0xF0) ? 52 : (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \
+        ((char)(imm)&0xF0) ? 53 : (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \
+        ((char)(imm)&0xF0) ? 54 : (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \
+        ((char)(imm)&0xF0) ? 55 : (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \
+        ((char)(imm)&0xF0) ? 56 : (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \
+        ((char)(imm)&0xF0) ? 57 : (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \
+        ((char)(imm)&0xF0) ? 58 : (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \
+        ((char)(imm)&0xF0) ? 59 : (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \
+        ((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \
+        ((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \
+        ((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \
+        ((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); })
 
 #define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
 
@@ -638,37 +718,37 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srli_epi64(__m256i __a, int __count)
 {
-  return __builtin_ia32_psrlqi256(__a, __count);
+  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srl_epi64(__m256i __a, __m128i __count)
 {
-  return __builtin_ia32_psrlq256(__a, __count);
+  return __builtin_ia32_psrlq256((__v4di)__a, __count);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi8(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v32qi)__a - (__v32qi)__b);
+  return (__m256i)((__v32qu)__a - (__v32qu)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi16(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v16hi)__a - (__v16hi)__b);
+  return (__m256i)((__v16hu)__a - (__v16hu)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi32(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v8si)__a - (__v8si)__b);
+  return (__m256i)((__v8su)__a - (__v8su)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi64(__m256i __a, __m256i __b)
 {
-  return __a - __b;
+  return (__m256i)((__v4du)__a - (__v4du)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -716,7 +796,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -740,13 +820,13 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_xor_si256(__m256i __a, __m256i __b)
 {
-  return __a ^ __b;
+  return (__m256i)((__v4du)__a ^ (__v4du)__b);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -764,7 +844,7 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_broadcastsd_pd(__m128d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 0);
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -782,7 +862,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastsi128_si256(__m128i __X)
 {
-  return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1);
+  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
 }
 
 #define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
@@ -826,7 +906,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastq_epi64(__m128i __X)
 {
-  return (__m256i)__builtin_shufflevector(__X, __X, 0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -851,7 +931,7 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_broadcastq_epi64(__m128i __X)
 {
-  return (__m128i)__builtin_shufflevector(__X, __X, 0, 0);
+  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -862,9 +942,11 @@
 
 #define _mm256_permute4x64_pd(V, M) __extension__ ({ \
   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
-                                   (__v4df)_mm256_setzero_pd(), \
-                                   (M) & 0x3, ((M) & 0xc) >> 2, \
-                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@@ -874,16 +956,18 @@
 
 #define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
   (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_setzero_si256(), \
-                                   (M) & 0x3, ((M) & 0xc) >> 2, \
-                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })
 
 #define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
   (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
 
 #define _mm256_extracti128_si256(V, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_setzero_si256(), \
+                                   (__v4di)_mm256_undefined_si256(), \
                                    (((M) & 1) ? 2 : 0), \
                                    (((M) & 1) ? 3 : 1) ); })
 
@@ -904,7 +988,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskload_epi64(long long const *__X, __m256i __M)
 {
-  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -928,7 +1012,7 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
 {
-  __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
+  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
@@ -940,7 +1024,7 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
 {
-  __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y);
+  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -958,13 +1042,13 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
+  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sllv_epi64(__m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
+  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -994,13 +1078,13 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
+  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srlv_epi64(__m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
+  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
 
 #define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h
index f289ed7..d3c5a6c 100644
--- a/lib/Headers/avx512bwintrin.h
+++ b/lib/Headers/avx512bwintrin.h
@@ -30,30 +30,28 @@
 
 typedef unsigned int __mmask32;
 typedef unsigned long long __mmask64;
-typedef char __v64qi __attribute__ ((__vector_size__ (64)));
-typedef short __v32hi __attribute__ ((__vector_size__ (64)));
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
 
-static  __inline __v64qi __DEFAULT_FN_ATTRS
+static  __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_setzero_qi(void) {
-  return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0 };
+  return (__m512i)(__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
-static  __inline __v32hi __DEFAULT_FN_ATTRS
+static  __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_setzero_hi(void) {
-  return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       0, 0, 0, 0, 0, 0, 0, 0 };
+  return (__m512i)(__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
 /* Integer compare */
@@ -348,7 +346,7 @@
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi8 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v64qi) __A + (__v64qi) __B);
+  return (__m512i) ((__v64qu) __A + (__v64qu) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -369,7 +367,7 @@
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi8 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v64qi) __A - (__v64qi) __B);
+  return (__m512i) ((__v64qu) __A - (__v64qu) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -390,7 +388,7 @@
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v32hi) __A + (__v32hi) __B);
+  return (__m512i) ((__v32hu) __A + (__v32hu) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -411,7 +409,7 @@
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v32hi) __A - (__v32hi) __B);
+  return (__m512i) ((__v32hu) __A - (__v32hu) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -432,7 +430,7 @@
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v32hi) __A * (__v32hi) __B);
+  return (__m512i) ((__v32hu) __A * (__v32hu) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -454,17 +452,17 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
 {
-  return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
               (__v64qi) __W,
-              (__mmask64) __U);
+              (__v64qi) __A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
 {
-  return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
               (__v32hi) __W,
-              (__mmask32) __U);
+              (__v32hi) __A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1396,145 +1394,1015 @@
               __M);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
-                 (__v64qi) __B,
-                 (__v64qi) _mm512_setzero_qi(),
-                 (__mmask64) -1);
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
-         __m512i __B) {
-  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
-                 (__v64qi) __B,
-                 (__v64qi) __W,
-                 (__mmask64) __U);
+_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+                                          8,  64+8,   9, 64+9,
+                                          10, 64+10, 11, 64+11,
+                                          12, 64+12, 13, 64+13,
+                                          14, 64+14, 15, 64+15,
+                                          24, 64+24, 25, 64+25,
+                                          26, 64+26, 27, 64+27,
+                                          28, 64+28, 29, 64+29,
+                                          30, 64+30, 31, 64+31,
+                                          40, 64+40, 41, 64+41,
+                                          42, 64+42, 43, 64+43,
+                                          44, 64+44, 45, 64+45,
+                                          46, 64+46, 47, 64+47,
+                                          56, 64+56, 57, 64+57,
+                                          58, 64+58, 59, 64+59,
+                                          60, 64+60, 61, 64+61,
+                                          62, 64+62, 63, 64+63);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
-                 (__v64qi) __B,
-                 (__v64qi) _mm512_setzero_qi(),
-                 (__mmask64) __U);
+_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+                                        (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
-                 (__v32hi) __B,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) -1);
+_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-          __m512i __B) {
-  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
-                 (__v32hi) __B,
-                 (__v32hi) __W,
-                 (__mmask32) __U);
+_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+                                          4,  32+4,   5, 32+5,
+                                          6,  32+6,   7, 32+7,
+                                          12, 32+12, 13, 32+13,
+                                          14, 32+14, 15, 32+15,
+                                          20, 32+20, 21, 32+21,
+                                          22, 32+22, 23, 32+23,
+                                          28, 32+28, 29, 32+29,
+                                          30, 32+30, 31, 32+31);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
-                 (__v32hi) __B,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) __U);
+_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+                                       (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
-                 (__v64qi) __B,
-                 (__v64qi) _mm512_setzero_qi(),
-                 (__mmask64) -1);
+_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
-         __m512i __B) {
-  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
-                 (__v64qi) __B,
-                 (__v64qi) __W,
-                 (__mmask64) __U);
+_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+                                          0,  64+0,   1, 64+1,
+                                          2,  64+2,   3, 64+3,
+                                          4,  64+4,   5, 64+5,
+                                          6,  64+6,   7, 64+7,
+                                          16, 64+16, 17, 64+17,
+                                          18, 64+18, 19, 64+19,
+                                          20, 64+20, 21, 64+21,
+                                          22, 64+22, 23, 64+23,
+                                          32, 64+32, 33, 64+33,
+                                          34, 64+34, 35, 64+35,
+                                          36, 64+36, 37, 64+37,
+                                          38, 64+38, 39, 64+39,
+                                          48, 64+48, 49, 64+49,
+                                          50, 64+50, 51, 64+51,
+                                          52, 64+52, 53, 64+53,
+                                          54, 64+54, 55, 64+55);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
-                 (__v64qi) __B,
-                 (__v64qi) _mm512_setzero_qi(),
-                 (__mmask64) __U);
+_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+                                        (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
-                 (__v32hi) __B,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) -1);
+_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-          __m512i __B) {
-  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
-                 (__v32hi) __B,
-                 (__v32hi) __W,
-                 (__mmask32) __U);
+_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+                                          0,  32+0,   1, 32+1,
+                                          2,  32+2,   3, 32+3,
+                                          8,  32+8,   9, 32+9,
+                                          10, 32+10, 11, 32+11,
+                                          16, 32+16, 17, 32+17,
+                                          18, 32+18, 19, 32+19,
+                                          24, 32+24, 25, 32+25,
+                                          26, 32+26, 27, 32+27);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
-                 (__v32hi) __B,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) __U);
+_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+                                       (__v32hi)__W);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi16 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+                (__v32hi)
+                _mm512_setzero_hi (),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+                (__v32hi)
+                _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi16 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+                (__v32hi)
+                _mm512_setzero_hi (),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+                (__v32hi)
+                _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
+
+
 #define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
-                                         (__v64qi)(__m512i)(b), \
-                                         (p), (__mmask64)-1); })
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)-1); })
 
 #define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
-                                         (__v64qi)(__m512i)(b), \
-                                         (p), (__mmask64)(m)); })
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)(m)); })
 
 #define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
-                                          (__v64qi)(__m512i)(b), \
-                                          (p), (__mmask64)-1); })
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)-1); })
 
 #define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
-                                          (__v64qi)(__m512i)(b), \
-                                          (p), (__mmask64)(m)); })
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)(m)); })
 
 #define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
-                                         (__v32hi)(__m512i)(b), \
-                                         (p), (__mmask32)-1); })
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)-1); })
 
 #define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
-                                         (__v32hi)(__m512i)(b), \
-                                         (p), (__mmask32)(m)); })
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)(m)); })
 
 #define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
-                                          (__v32hi)(__m512i)(b), \
-                                          (p), (__mmask32)-1); })
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)-1); })
 
 #define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
-                                          (__v32hi)(__m512i)(b), \
-                                          (p), (__mmask32)(m)); })
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)(m)); })
+
+#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0, 1, 2, 3, \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3), \
+                                   16, 17, 18, 19, \
+                                   20 + (((imm) >> 0) & 0x3), \
+                                   20 + (((imm) >> 2) & 0x3), \
+                                   20 + (((imm) >> 4) & 0x3), \
+                                   20 + (((imm) >> 6) & 0x3), \
+                                   24, 25, 26, 27, \
+                                   28 + (((imm) >> 0) & 0x3), \
+                                   28 + (((imm) >> 2) & 0x3), \
+                                   28 + (((imm) >> 4) & 0x3), \
+                                   28 + (((imm) >> 6) & 0x3)); })
+
+#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflehi_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)(__m512i)(W)); })
+
+#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflehi_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)_mm512_setzero_hi()); })
+
+#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
+                                   12, 13, 14, 15, \
+                                   16 + (((imm) >> 0) & 0x3), \
+                                   16 + (((imm) >> 2) & 0x3), \
+                                   16 + (((imm) >> 4) & 0x3), \
+                                   16 + (((imm) >> 6) & 0x3), \
+                                   20, 21, 22, 23, \
+                                   24 + (((imm) >> 0) & 0x3), \
+                                   24 + (((imm) >> 2) & 0x3), \
+                                   24 + (((imm) >> 4) & 0x3), \
+                                   24 + (((imm) >> 6) & 0x3), \
+                                   28, 29, 30, 31); })
+
+
+#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)(__m512i)(W)); })
+
+
+#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)_mm512_setzero_hi()); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi)
+              _mm512_setzero_hi (),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi)
+              _mm512_setzero_hi (),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi16 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi)
+             _mm512_setzero_hi (),
+             (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi)
+             _mm512_setzero_hi (),
+             (__mmask32) __U);
+}
+
+#define _mm512_slli_epi16(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
+                                         (__v32hi)_mm512_setzero_hi(), \
+                                         (__mmask32)-1); })
+
+#define _mm512_mask_slli_epi16(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
+                                         (__v32hi)(__m512i)(W), \
+                                         (__mmask32)(U)); })
+
+#define _mm512_maskz_slli_epi16(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
+                                         (__v32hi)_mm512_setzero_hi(), \
+                                         (__mmask32)(U)); })
+
+#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector(                                          \
+       (__v64qi)_mm512_setzero_si512(),                                      \
+       (__v64qi)(__m512i)(a),                                                \
+       ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 :  64) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 :  65) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 :  66) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 :  67) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 :  68) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 :  69) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 :  70) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 :  71) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 :  72) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 :  73) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 :  74) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 :  75) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 :  76) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 :  77) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 :  78) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 :  79) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 :  80) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 :  81) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 :  82) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 :  83) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 :  84) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 :  85) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 :  86) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 :  87) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 :  88) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 :  89) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 :  90) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 :  91) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 :  92) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 :  93) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 :  94) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 :  95) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 32 : ((char)(imm)>0x0 ? 48 :  96) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 33 : ((char)(imm)>0x1 ? 49 :  97) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 34 : ((char)(imm)>0x2 ? 50 :  98) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 35 : ((char)(imm)>0x3 ? 51 :  99) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 36 : ((char)(imm)>0x4 ? 52 : 100) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 37 : ((char)(imm)>0x5 ? 53 : 101) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 38 : ((char)(imm)>0x6 ? 54 : 102) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 39 : ((char)(imm)>0x7 ? 55 : 103) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 40 : ((char)(imm)>0x8 ? 56 : 104) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 41 : ((char)(imm)>0x9 ? 57 : 105) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 42 : ((char)(imm)>0xA ? 58 : 106) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 43 : ((char)(imm)>0xB ? 59 : 107) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 44 : ((char)(imm)>0xC ? 60 : 108) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 45 : ((char)(imm)>0xD ? 61 : 109) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 46 : ((char)(imm)>0xE ? 62 : 110) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 47 : ((char)(imm)>0xF ? 63 : 111) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 48 : ((char)(imm)>0x0 ? 64 : 112) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 49 : ((char)(imm)>0x1 ? 65 : 113) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 50 : ((char)(imm)>0x2 ? 66 : 114) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 51 : ((char)(imm)>0x3 ? 67 : 115) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 52 : ((char)(imm)>0x4 ? 68 : 116) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 53 : ((char)(imm)>0x5 ? 69 : 117) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 54 : ((char)(imm)>0x6 ? 70 : 118) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 55 : ((char)(imm)>0x7 ? 71 : 119) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 56 : ((char)(imm)>0x8 ? 72 : 120) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 57 : ((char)(imm)>0x9 ? 73 : 121) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 58 : ((char)(imm)>0xA ? 74 : 122) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 59 : ((char)(imm)>0xB ? 75 : 123) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi)
+              _mm512_setzero_hi (),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi)
+              _mm512_setzero_hi (),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi)
+              _mm512_setzero_hi (),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi)
+              _mm512_setzero_hi (),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi16 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi)
+             _mm512_setzero_hi (),
+             (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi) __W,
+            (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi)
+             _mm512_setzero_hi (),
+            (__mmask32) __U);
+}
+
+#define _mm512_srai_epi16(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
+                                         (__v32hi)_mm512_setzero_hi(), \
+                                         (__mmask32)-1); })
+
+#define _mm512_mask_srai_epi16(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
+                                         (__v32hi)(__m512i)(W), \
+                                         (__mmask32)(U)); })
+
+#define _mm512_maskz_srai_epi16(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
+                                         (__v32hi)_mm512_setzero_hi(), \
+                                         (__mmask32)(U)); })
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi16 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi)
+             _mm512_setzero_hi (),
+             (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+             (__v8hi) __B,
+             (__v32hi)
+             _mm512_setzero_hi (),
+             (__mmask32) __U);
+}
+
+#define _mm512_srli_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
+                                         (__v32hi)_mm512_setzero_hi(), \
+                                         (__mmask32)-1); })
+
+#define _mm512_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
+                                         (__v32hi)(__m512i)(W), \
+                                         (__mmask32)(U)); })
+
+#define _mm512_maskz_srli_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
+                                         (__v32hi)_mm512_setzero_hi(), \
+                                         (__mmask32)(U)); })
+
+#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector(                     \
+      (__v64qi)(__m512i)(a),                      \
+      (__v64qi)_mm512_setzero_si512(),            \
+      ((char)(imm)&0xF0) ?  64 : (char)(imm) + ((char)(imm)>0xF ?  48 : 0),  \
+      ((char)(imm)&0xF0) ?  65 : (char)(imm) + ((char)(imm)>0xE ?  49 : 1),  \
+      ((char)(imm)&0xF0) ?  66 : (char)(imm) + ((char)(imm)>0xD ?  50 : 2),  \
+      ((char)(imm)&0xF0) ?  67 : (char)(imm) + ((char)(imm)>0xC ?  51 : 3),  \
+      ((char)(imm)&0xF0) ?  68 : (char)(imm) + ((char)(imm)>0xB ?  52 : 4),  \
+      ((char)(imm)&0xF0) ?  69 : (char)(imm) + ((char)(imm)>0xA ?  53 : 5),  \
+      ((char)(imm)&0xF0) ?  70 : (char)(imm) + ((char)(imm)>0x9 ?  54 : 6),  \
+      ((char)(imm)&0xF0) ?  71 : (char)(imm) + ((char)(imm)>0x8 ?  55 : 7),  \
+      ((char)(imm)&0xF0) ?  72 : (char)(imm) + ((char)(imm)>0x7 ?  56 : 8),  \
+      ((char)(imm)&0xF0) ?  73 : (char)(imm) + ((char)(imm)>0x6 ?  57 : 9),  \
+      ((char)(imm)&0xF0) ?  74 : (char)(imm) + ((char)(imm)>0x5 ?  58 : 10), \
+      ((char)(imm)&0xF0) ?  75 : (char)(imm) + ((char)(imm)>0x4 ?  59 : 11), \
+      ((char)(imm)&0xF0) ?  76 : (char)(imm) + ((char)(imm)>0x3 ?  60 : 12), \
+      ((char)(imm)&0xF0) ?  77 : (char)(imm) + ((char)(imm)>0x2 ?  61 : 13), \
+      ((char)(imm)&0xF0) ?  78 : (char)(imm) + ((char)(imm)>0x1 ?  62 : 14), \
+      ((char)(imm)&0xF0) ?  79 : (char)(imm) + ((char)(imm)>0x0 ?  63 : 15), \
+      ((char)(imm)&0xF0) ?  80 : (char)(imm) + ((char)(imm)>0xF ?  64 : 16), \
+      ((char)(imm)&0xF0) ?  81 : (char)(imm) + ((char)(imm)>0xE ?  65 : 17), \
+      ((char)(imm)&0xF0) ?  82 : (char)(imm) + ((char)(imm)>0xD ?  66 : 18), \
+      ((char)(imm)&0xF0) ?  83 : (char)(imm) + ((char)(imm)>0xC ?  67 : 19), \
+      ((char)(imm)&0xF0) ?  84 : (char)(imm) + ((char)(imm)>0xB ?  68 : 20), \
+      ((char)(imm)&0xF0) ?  85 : (char)(imm) + ((char)(imm)>0xA ?  69 : 21), \
+      ((char)(imm)&0xF0) ?  86 : (char)(imm) + ((char)(imm)>0x9 ?  70 : 22), \
+      ((char)(imm)&0xF0) ?  87 : (char)(imm) + ((char)(imm)>0x8 ?  71 : 23), \
+      ((char)(imm)&0xF0) ?  88 : (char)(imm) + ((char)(imm)>0x7 ?  72 : 24), \
+      ((char)(imm)&0xF0) ?  89 : (char)(imm) + ((char)(imm)>0x6 ?  73 : 25), \
+      ((char)(imm)&0xF0) ?  90 : (char)(imm) + ((char)(imm)>0x5 ?  74 : 26), \
+      ((char)(imm)&0xF0) ?  91 : (char)(imm) + ((char)(imm)>0x4 ?  75 : 27), \
+      ((char)(imm)&0xF0) ?  92 : (char)(imm) + ((char)(imm)>0x3 ?  76 : 28), \
+      ((char)(imm)&0xF0) ?  93 : (char)(imm) + ((char)(imm)>0x2 ?  77 : 29), \
+      ((char)(imm)&0xF0) ?  94 : (char)(imm) + ((char)(imm)>0x1 ?  78 : 30), \
+      ((char)(imm)&0xF0) ?  95 : (char)(imm) + ((char)(imm)>0x0 ?  79 : 31), \
+      ((char)(imm)&0xF0) ?  96 : (char)(imm) + ((char)(imm)>0xF ?  80 : 32), \
+      ((char)(imm)&0xF0) ?  97 : (char)(imm) + ((char)(imm)>0xE ?  81 : 33), \
+      ((char)(imm)&0xF0) ?  98 : (char)(imm) + ((char)(imm)>0xD ?  82 : 34), \
+      ((char)(imm)&0xF0) ?  99 : (char)(imm) + ((char)(imm)>0xC ?  83 : 35), \
+      ((char)(imm)&0xF0) ? 100 : (char)(imm) + ((char)(imm)>0xB ?  84 : 36), \
+      ((char)(imm)&0xF0) ? 101 : (char)(imm) + ((char)(imm)>0xA ?  85 : 37), \
+      ((char)(imm)&0xF0) ? 102 : (char)(imm) + ((char)(imm)>0x9 ?  86 : 38), \
+      ((char)(imm)&0xF0) ? 103 : (char)(imm) + ((char)(imm)>0x8 ?  87 : 39), \
+      ((char)(imm)&0xF0) ? 104 : (char)(imm) + ((char)(imm)>0x7 ?  88 : 40), \
+      ((char)(imm)&0xF0) ? 105 : (char)(imm) + ((char)(imm)>0x6 ?  89 : 41), \
+      ((char)(imm)&0xF0) ? 106 : (char)(imm) + ((char)(imm)>0x5 ?  90 : 42), \
+      ((char)(imm)&0xF0) ? 107 : (char)(imm) + ((char)(imm)>0x4 ?  91 : 43), \
+      ((char)(imm)&0xF0) ? 108 : (char)(imm) + ((char)(imm)>0x3 ?  92 : 44), \
+      ((char)(imm)&0xF0) ? 109 : (char)(imm) + ((char)(imm)>0x2 ?  93 : 45), \
+      ((char)(imm)&0xF0) ? 110 : (char)(imm) + ((char)(imm)>0x1 ?  94 : 46), \
+      ((char)(imm)&0xF0) ? 111 : (char)(imm) + ((char)(imm)>0x0 ?  95 : 47), \
+      ((char)(imm)&0xF0) ? 112 : (char)(imm) + ((char)(imm)>0xF ?  96 : 48), \
+      ((char)(imm)&0xF0) ? 113 : (char)(imm) + ((char)(imm)>0xE ?  97 : 49), \
+      ((char)(imm)&0xF0) ? 114 : (char)(imm) + ((char)(imm)>0xD ?  98 : 50), \
+      ((char)(imm)&0xF0) ? 115 : (char)(imm) + ((char)(imm)>0xC ?  99 : 51), \
+      ((char)(imm)&0xF0) ? 116 : (char)(imm) + ((char)(imm)>0xB ? 100 : 52), \
+      ((char)(imm)&0xF0) ? 117 : (char)(imm) + ((char)(imm)>0xA ? 101 : 53), \
+      ((char)(imm)&0xF0) ? 118 : (char)(imm) + ((char)(imm)>0x9 ? 102 : 54), \
+      ((char)(imm)&0xF0) ? 119 : (char)(imm) + ((char)(imm)>0x8 ? 103 : 55), \
+      ((char)(imm)&0xF0) ? 120 : (char)(imm) + ((char)(imm)>0x7 ? 104 : 56), \
+      ((char)(imm)&0xF0) ? 121 : (char)(imm) + ((char)(imm)>0x6 ? 105 : 57), \
+      ((char)(imm)&0xF0) ? 122 : (char)(imm) + ((char)(imm)>0x5 ? 106 : 58), \
+      ((char)(imm)&0xF0) ? 123 : (char)(imm) + ((char)(imm)>0x4 ? 107 : 59), \
+      ((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \
+      ((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \
+      ((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \
+      ((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+                (__v32hi) __A,
+                (__v32hi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+                (__v32hi) __A,
+                (__v32hi) _mm512_setzero_hi ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+                (__v64qi) __A,
+                (__v64qi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+                (__v64qi) __A,
+                (__v64qi) _mm512_setzero_qi ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                 (__v64qi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                 (__v64qi)
+                 _mm512_setzero_qi(),
+                 __M);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+                (__mmask64) __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+                (__mmask32) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+                 (__v32hi)
+                 _mm512_setzero_hi (),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+                 (__v64qi)
+                 _mm512_setzero_qi (),
+                 (__mmask64) __U);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+  __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
+             (__v32hi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+  __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
+             (__v64qi) __A,
+             (__mmask64) __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B,
+            (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B,
+             (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_movepi8_mask (__m512i __A)
+{
+  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_movepi16_mask (__m512i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi8 (__mmask64 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi16 (__mmask32 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v16qi) __A,
+                                          (__v16qi)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectb_512(__M,
+                                             (__v64qi) _mm512_broadcastb_epi8(__A),
+                                             (__v64qi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectb_512(__M,
+                                             (__v64qi) _mm512_broadcastb_epi8(__A),
+                                             (__v64qi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v8hi) __A,
+                                          (__v8hi)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512(__M,
+                                             (__v32hi) _mm512_broadcastw_epi16(__A),
+                                             (__v32hi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512(__M,
+                                             (__v32hi) _mm512_broadcastw_epi16(__A),
+                                             (__v32hi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_undefined_epi32 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) __W,
+                 (__mmask32) __M);
+}
+
+#define _mm512_alignr_epi8(A, B, N) __extension__ ({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)_mm512_undefined_epi32(), \
+                                          (__mmask64)-1); })
+
+#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)(__m512i)(W), \
+                                          (__mmask64)(U)); })
+
+#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)_mm512_setzero_si512(), \
+                                          (__mmask64)(U)); })
+
+#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)_mm512_undefined_epi32(), \
+                                           (__mmask32)-1); })
+
+#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)(__m512i)(W), \
+                                           (__mmask32)(U)); })
+
+#define _mm512_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)_mm512_setzero_hi(), \
+                                           (__mmask32)(U)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+               (__v64qi) __B);
+}
+
 
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/avx512cdintrin.h b/lib/Headers/avx512cdintrin.h
index 3894b29..23c4235 100644
--- a/lib/Headers/avx512cdintrin.h
+++ b/lib/Headers/avx512cdintrin.h
@@ -126,6 +126,19 @@
              (__v8di) _mm512_setzero_si512 (),
              (__mmask8) __U);
 }
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif
diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h
index afee490..b60bec1 100644
--- a/lib/Headers/avx512dqintrin.h
+++ b/lib/Headers/avx512dqintrin.h
@@ -33,7 +33,7 @@
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi64 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v8di) __A * (__v8di) __B);
+  return (__m512i) ((__v8du) __A * (__v8du) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -55,7 +55,7 @@
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_xor_pd (__m512d __A, __m512d __B) {
-  return (__m512d) ((__v8di) __A ^ (__v8di) __B);
+  return (__m512d) ((__v8du) __A ^ (__v8du) __B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -77,7 +77,7 @@
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_xor_ps (__m512 __A, __m512 __B) {
-  return (__m512) ((__v16si) __A ^ (__v16si) __B);
+  return (__m512) ((__v16su) __A ^ (__v16su) __B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -99,7 +99,7 @@
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_or_pd (__m512d __A, __m512d __B) {
-  return (__m512d) ((__v8di) __A | (__v8di) __B);
+  return (__m512d) ((__v8du) __A | (__v8du) __B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -121,7 +121,7 @@
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_or_ps (__m512 __A, __m512 __B) {
-  return (__m512) ((__v16si) __A | (__v16si) __B);
+  return (__m512) ((__v16su) __A | (__v16su) __B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -143,7 +143,7 @@
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_and_pd (__m512d __A, __m512d __B) {
-  return (__m512d) ((__v8di) __A & (__v8di) __B);
+  return (__m512d) ((__v8du) __A & (__v8du) __B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -165,7 +165,7 @@
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_and_ps (__m512 __A, __m512 __B) {
-  return (__m512) ((__v16si) __A & (__v16si) __B);
+  return (__m512) ((__v16su) __A & (__v16su) __B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -187,11 +187,7 @@
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_andnot_pd (__m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
-              (__v8df) __B,
-              (__v8df)
-              _mm512_setzero_pd (),
-              (__mmask8) -1);
+  return (__m512d)(~(__v8du)__A & (__v8du)__B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -213,11 +209,7 @@
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_andnot_ps (__m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
-             (__v16sf) __B,
-             (__v16sf)
-             _mm512_setzero_ps (),
-             (__mmask16) -1);
+  return (__m512)(~(__v16su)__A & (__v16su)__B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -261,17 +253,20 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundpd_epi64(__A, __R) __extension__ ({              \
-  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,               \
-                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundpd_epi64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,                 \
-                (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvt_roundpd_epi64(__U, __A, __R) __extension__ ({   \
-  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,        \
-                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R); })
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtpd_epu64 (__m512d __A) {
@@ -297,17 +292,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundpd_epu64(__A, __R) __extension__ ({               \
-  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,               \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundpd_epu64(A, R) __extension__ ({               \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
-                 (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvt_roundpd_epu64(__U, __A, __R) __extension__ ({     \
-  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) __extension__ ({     \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtps_epi64 (__m256 __A) {
@@ -333,17 +331,20 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_epi64(__A, __R) __extension__ ({             \
-  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,              \
-                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundps_epi64(A, R) __extension__ ({             \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,                 \
-                (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvt_roundps_epi64(__U, __A, __R) __extension__ ({   \
-  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,               \
-                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundps_epi64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtps_epu64 (__m256 __A) {
@@ -369,17 +370,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_epu64(__A, __R) __extension__ ({              \
-  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundps_epu64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,                \
-                 (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvt_roundps_epu64(__U, __A, __R) __extension__ ({   \
-  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundps_epu64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -406,17 +410,20 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepi64_pd(__A, __R) __extension__ ({          \
-  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,           \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundepi64_pd(A, R) __extension__ ({          \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,                 \
-                (__v8df) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvt_roundepi64_pd(__U, __A, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,             \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm512_cvtepi64_ps (__m512i __A) {
@@ -442,17 +449,20 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepi64_ps(__A, __R) __extension__ ({        \
-  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,          \
-               (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundepi64_ps(A, R) __extension__ ({        \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, __R) __extension__ ({ \
-  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,                  \
-               (__v8sf) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm512_maskz_cvt_roundepi64_ps(__U, __A, __R) __extension__ ({ \
-  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,              \
-               (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -479,17 +489,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundpd_epi64(__A, __R) __extension__ ({             \
-  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,              \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvtt_roundpd_epi64(A, R) __extension__ ({             \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,                 \
-                 (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvtt_roundpd_epi64(__U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,             \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttpd_epu64 (__m512d __A) {
@@ -515,17 +528,20 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundpd_epu64(__A, __R) __extension__ ({              \
-  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
-                  (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvtt_roundpd_epu64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,                \
-                  (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)(__m512i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvtt_roundpd_epu64(__U, __A, __R) __extension__ ({   \
-  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
-                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epi64 (__m256 __A) {
@@ -551,17 +567,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundps_epi64(__A, __R) __extension__ ({            \
-  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,             \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+#define _mm512_cvtt_roundps_epi64(A, R) __extension__ ({            \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,                 \
-                 (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvtt_roundps_epi64(__U, __A, __R) __extension__ ({  \
-  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,              \
-                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) __extension__ ({  \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epu64 (__m256 __A) {
@@ -587,17 +606,20 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundps_epu64(__A, __R) __extension__ ({            \
-  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,            \
-                  (__v8di) _mm512_setzero_si512(),(__mmask8) -1, __R);})
+#define _mm512_cvtt_roundps_epu64(A, R) __extension__ ({            \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
-  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,                \
-                  (__v8di) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)(__m512i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
 
-#define _mm512_maskz_cvtt_roundps_epu64(__U, __A, __R) __extension__ ({  \
-  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,             \
-                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) __extension__ ({  \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepu64_pd (__m512i __A) {
@@ -623,18 +645,21 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepu64_pd(__A, __R) __extension__ ({          \
-  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,          \
-                 (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundepu64_pd(A, R) __extension__ ({          \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,                \
-                 (__v8df) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
-#define _mm512_maskz_cvt_roundepu64_pd(__U, __A, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,            \
-                 (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -661,117 +686,637 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepu64_ps(__A, __R) __extension__ ({         \
-  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,          \
-                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+#define _mm512_cvt_roundepu64_ps(A, R) __extension__ ({         \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, __R) __extension__ ({ \
-  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,                 \
-                (__v8sf) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                           (int)(R)); })
 
-#define _mm512_maskz_cvt_roundepu64_ps(__U, __A, __R) __extension__ ({ \
-  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,             \
-                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm512_range_pd(__A, __B, __C) __extension__ ({                     \
-  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
-               (__v8df) _mm512_setzero_pd(), (__mmask8) -1,                 \
-               _MM_FROUND_CUR_DIRECTION);})
+#define _mm512_range_pd(A, B, C) __extension__ ({                     \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
 
-#define _mm512_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({      \
-  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
-               (__v8df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+#define _mm512_mask_range_pd(W, U, A, B, C) __extension__ ({      \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
 
-#define _mm512_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
-  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
-               (__v8df) _mm512_setzero_pd(), (__mmask8) __U,                 \
-               _MM_FROUND_CUR_DIRECTION);})
+#define _mm512_maskz_range_pd(U, A, B, C) __extension__ ({           \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
 
-#define _mm512_range_round_pd(__A, __B, __C, __R) __extension__ ({           \
-  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
-               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+#define _mm512_range_round_pd(A, B, C, R) __extension__ ({           \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_range_round_pd(__W, __U, __A, __B, __C, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,      \
-               (__v8df) __W, (__mmask8) __U, __R);})
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm512_maskz_range_round_pd(__U, __A, __B, __C, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,   \
-               (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U), (int)(R)); })
 
-#define _mm512_range_ps(__A, __B, __C) __extension__ ({                       \
-  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B, __C, \
-               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1,                 \
-               _MM_FROUND_CUR_DIRECTION);})
+#define _mm512_range_ps(A, B, C) __extension__ ({                       \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
 
-#define _mm512_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({         \
-  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,       \
-               __C, (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+#define _mm512_mask_range_ps(W, U, A, B, C) __extension__ ({         \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
 
-#define _mm512_maskz_range_ps(__U, __A, __B, __C) __extension__ ({      \
-  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,(__v16sf) __B, \
-              __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U,      \
-              _MM_FROUND_CUR_DIRECTION);})
+#define _mm512_maskz_range_ps(U, A, B, C) __extension__ ({      \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
 
-#define _mm512_range_round_ps(__A, __B, __C, __R) __extension__ ({         \
-  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,   \
-                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+#define _mm512_range_round_ps(A, B, C, R) __extension__ ({         \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)-1, (int)(R)); })
 
-#define _mm512_mask_range_round_ps(__W, __U, __A, __B, __C, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,          \
-                __C, (__v16sf) __W, (__mmask16) __U, __R);})
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                         (int)(R)); })
 
-#define _mm512_maskz_range_round_ps(__U, __A, __B, __C, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,      \
-                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U), (int)(R)); })
 
-#define _mm512_reduce_pd(__A, __B) __extension__ ({             \
-  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);})
+#define _mm_range_round_ss(A, B, C, R) __extension__ ({           \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8) -1, (int)(C),\
+                                               (int)(R)); })
 
-#define _mm512_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
-  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
-                (__v8df) __W,(__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_reduce_pd(__U, __A, __B) __extension__ ({  \
-  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W),\
+                                               (__mmask8)(U), (int)(C),\
+                                               (int)(R)); })
 
-#define _mm512_reduce_ps(__A, __B) __extension__ ({              \
-  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
-               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);})
+#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({   \
-  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
-               (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+#define _mm_maskz_range_round_ss(U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(C),\
+                                               (int)(R)); })
 
-#define _mm512_maskz_reduce_ps(__U, __A, __B) __extension__ ({       \
-  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
-               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_reduce_round_pd(__A, __B, __R) __extension__ ({\
-  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+#define _mm_range_round_sd(A, B, C, R) __extension__ ({           \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8) -1, (int)(C),\
+                                                (int)(R)); })
 
-#define _mm512_mask_reduce_round_pd(__W, __U, __A, __B, __R) __extension__ ({\
-  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
-                (__v8df) __W,(__mmask8) __U, __R);})
+#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_reduce_round_pd(__U, __A, __B, __R) __extension__ ({\
-  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W),\
+                                                (__mmask8)(U), (int)(C),\
+                                                (int)(R)); })
 
-#define _mm512_reduce_round_ps(__A, __B, __R) __extension__ ({\
-  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
-               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_reduce_round_ps(__W, __U, __A, __B, __R) __extension__ ({\
-  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
-               (__v16sf) __W, (__mmask16) __U, __R);})
+#define _mm_maskz_range_round_sd(U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(C),\
+                                                (int)(R)); })
 
-#define _mm512_maskz_reduce_round_ps(__U, __A, __B, __R) __extension__ ({\
-  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
-               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_reduce_pd(A, B) __extension__ ({             \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_reduce_pd(U, A, B) __extension__ ({  \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_reduce_ps(A, B) __extension__ ({              \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_reduce_ps(W, U, A, B) __extension__ ({   \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_reduce_ps(U, A, B) __extension__ ({       \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_reduce_round_pd(A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_reduce_round_pd(U, A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_reduce_round_ps(A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_reduce_round_ps(U, A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm_reduce_ss(A, B, C) __extension__ ({              \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({   \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({       \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), (int)(C), \
+                                       _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({              \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) __extension__ ({   \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({       \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), (int)(C), (int)(R)); })
+
+#define _mm_reduce_sd(A, B, C) __extension__ ({              \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, (int)(C), \
+                                        _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({   \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({       \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), (int)(C), \
+                                        _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({              \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({   \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                        (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({       \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), (int)(C), (int)(R)); })
+                     
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_movepi32_mask (__m512i __A)
+{ /* Hands __A, cast to __v16si, to the cvtd2mask512 builtin and returns its result as a __mmask16. */
+  return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi32 (__mmask16 __A)
+{ /* Hands the 16-bit mask __A to the cvtmask2d512 builtin and returns its result as a __m512i. */
+  return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi64 (__mmask8 __A)
+{ /* Hands the 8-bit mask __A to the cvtmask2q512 builtin and returns its result as a __m512i. */
+  return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_movepi64_mask (__m512i __A)
+{ /* Hands __A, cast to __v8di, to the cvtq2mask512 builtin and returns its result as a __mmask8. */
+  return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)
+                __O, __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x8 (__m256 __A)
+{ /* Unmasked form: calls the broadcastf32x8_512_mask builtin with every mask bit set ((__mmask16) -1). */
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)_mm512_undefined_ps(), /* explicit cast, same as the other broadcast wrappers */
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)__O,
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)
+                 __O, __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{ /* maskz form: the builtin's second operand is an all-zero vector, selected by mask __M. */
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_setzero_pd (), /* double-typed zero to match the __v8df operand (was _ps) */
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)__O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512 (),
+                 __M);
+}
+
+#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)-1); })
+
+#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+                                           (__v8sf)(__m256)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)(U)); })
+
+#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+                                                (int)(imm), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1); })
+
+#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+                                                (int)(imm), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U)); })
+
+#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+                                                (int)(imm), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U)); })
+
+#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1); })
+
+#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+                                            (__v8si)(__m256i)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)(U)); })
+
+#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+                                                (int)(imm), \
+                                                (__v2di)_mm_setzero_di(), \
+                                                (__mmask8)-1); })
+
+#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+                                                (int)(imm), \
+                                                (__v2di)(__m128i)(W), \
+                                                (__mmask8)(U)); })
+
+#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+                                                (int)(imm), \
+                                                (__v2di)_mm_setzero_di(), \
+                                                (__mmask8)(U)); })
+
+#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
+                                          (__v8sf)(__m256)(B), (int)(imm), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1); })
+
+#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
+                                          (__v8sf)(__m256)(B), (int)(imm), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
+                                          (__v8sf)(__m256)(B), (int)(imm), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(imm), \
+                                               (__v8df)_mm512_setzero_pd(), \
+                                               (__mmask8)-1); })
+
+#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(imm), \
+                                               (__v8df)(__m512d)(W), \
+                                               (__mmask8)(U)); })
+
+#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(imm), \
+                                               (__v8df)_mm512_setzero_pd(), \
+                                               (__mmask8)(U)); })
+
+#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
+                                           (__v8si)(__m256i)(B), (int)(imm), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)-1); })
+
+#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
+                                           (__v8si)(__m256i)(B), (int)(imm), \
+                                           (__v16si)(__m512i)(W), \
+                                           (__mmask16)(U)); })
+
+#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
+                                           (__v8si)(__m256i)(B), (int)(imm), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)(U)); })
+
+#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
+                                               (__v2di)(__m128i)(B), \
+                                               (int)(imm), \
+                                               (__v8di)_mm512_setzero_si512(), \
+                                               (__mmask8)-1); })
+
+#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
+                                               (__v2di)(__m128i)(B), \
+                                               (int)(imm), \
+                                               (__v8di)(__m512i)(W), \
+                                               (__mmask8)(U)); })
+
+#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
+                                               (__v2di)(__m128i)(B), \
+                                               (int)(imm), \
+                                               (__v8di)_mm512_setzero_si512(), \
+                                               (__mmask8)(U)); })
+
+#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+                                              (int)(imm), (__mmask16)(U)); })
+
+#define _mm512_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+                                              (int)(imm), (__mmask16)-1); })
+
+#define _mm512_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm512_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_fpclass_sd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_fpclass_sd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                          (__mmask8)(U)); })
+
+#define _mm_fpclass_ss_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_fpclass_ss_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                          (__mmask8)(U)); })
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/lib/Headers/avx512erintrin.h b/lib/Headers/avx512erintrin.h
index 40a9121..8ff212c 100644
--- a/lib/Headers/avx512erintrin.h
+++ b/lib/Headers/avx512erintrin.h
@@ -31,66 +31,66 @@
 #define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
-                                      (__mmask8)-1, (R)); })
+                                      (__mmask8)-1, (int)(R)); })
 
 #define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                      (__v8df)(__m512d)(S), \
-                                      (__mmask8)(M), (R)); })
+                                      (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                      (int)(R)); })
 
 #define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
-                                      (__mmask8)(M), (R)); })
+                                      (__mmask8)(M), (int)(R)); })
 
 #define _mm512_exp2a23_pd(A) \
-   _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm512_mask_exp2a23_pd(S, M, A) \
-   _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm512_maskz_exp2a23_pd(M, A) \
-   _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
   (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
-                                     (__mmask8)-1, (R)); })
+                                     (__mmask16)-1, (int)(R)); })
 
 #define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
   (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                     (__v16sf)(__m512)(S), \
-                                     (__mmask8)(M), (R)); })
+                                     (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                     (int)(R)); })
 
 #define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
   (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
-                                     (__mmask8)(M), (R)); })
+                                     (__mmask16)(M), (int)(R)); })
 
 #define _mm512_exp2a23_ps(A) \
-   _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm512_mask_exp2a23_ps(S, M, A) \
-   _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm512_maskz_exp2a23_ps(M, A) \
-   _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
 // rsqrt28
 #define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)-1, (R)); })
+                                         (__mmask8)-1, (int)(R)); })
 
 #define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(S), \
-                                         (__mmask8)(M), (R)); })
+                                         (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                         (int)(R)); })
 
 #define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)(M), (R)); })
+                                         (__mmask8)(M), (int)(R)); })
 
 #define _mm512_rsqrt28_pd(A) \
   _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -104,17 +104,17 @@
 #define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
   (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)-1, (R)); })
+                                        (__mmask16)-1, (int)(R)); })
 
 #define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
   (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                        (__v16sf)(__m512)(S), \
-                                        (__mmask16)(M), (R)); })
+                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                        (int)(R)); })
 
 #define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
   (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)(M), (R)); })
+                                        (__mmask16)(M), (int)(R)); })
 
 #define _mm512_rsqrt28_ps(A) \
   _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -126,22 +126,22 @@
   _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)_mm_setzero_ps(), \
-                                        (__mmask8)-1, (R)); })
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
 
 #define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(S), \
-                                        (__mmask8)(M), (R)); })
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)(__m128)(S), \
+                                              (__mmask8)(M), (int)(R)); })
 
 #define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)_mm_setzero_ps(), \
-                                        (__mmask8)(M), (R)); })
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(M), (int)(R)); })
 
 #define _mm_rsqrt28_ss(A, B) \
   _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -153,22 +153,22 @@
   _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)_mm_setzero_pd(), \
-                                         (__mmask8)-1, (R)); })
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
 
 #define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(S), \
-                                         (__mmask8)(M), (R)); })
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)(__m128d)(S), \
+                                               (__mmask8)(M), (int)(R)); })
 
 #define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)_mm_setzero_pd(), \
-                                         (__mmask8)(M), (R)); })
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(M), (int)(R)); })
 
 #define _mm_rsqrt28_sd(A, B) \
   _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -177,23 +177,23 @@
   _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_maskz_rsqrt28_sd(M, A, B) \
-  _mm_mask_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
 // rcp28
 #define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)-1, (R)); })
+                                       (__mmask8)-1, (int)(R)); })
 
 #define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(S), \
-                                       (__mmask8)(M), (R)); })
+                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                       (int)(R)); })
 
 #define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)(M), (R)); })
+                                       (__mmask8)(M), (int)(R)); })
 
 #define _mm512_rcp28_pd(A) \
   _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -207,17 +207,17 @@
 #define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
   (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)-1, (R)); })
+                                      (__mmask16)-1, (int)(R)); })
 
 #define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
   (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(S), \
-                                      (__mmask16)(M), (R)); })
+                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                      (int)(R)); })
 
 #define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
   (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)(M), (R)); })
+                                      (__mmask16)(M), (int)(R)); })
 
 #define _mm512_rcp28_ps(A) \
   _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -229,22 +229,22 @@
   _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
-                                      (__v4sf)(__m128)(B), \
-                                      (__v4sf)_mm_setzero_ps(), \
-                                      (__mmask8)-1, (R)); })
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)-1, (int)(R)); })
 
 #define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
-                                      (__v4sf)(__m128)(B), \
-                                      (__v4sf)(__m128)(S), \
-                                      (__mmask8)(M), (R)); })
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)(__m128)(S), \
+                                            (__mmask8)(M), (int)(R)); })
 
 #define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
-                                      (__v4sf)(__m128)(B), \
-                                      (__v4sf)_mm_setzero_ps(), \
-                                      (__mmask8)(M), (R)); })
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)(M), (int)(R)); })
 
 #define _mm_rcp28_ss(A, B) \
   _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -256,22 +256,22 @@
   _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
-                                       (__v2df)(__m128d)(B), \
-                                       (__v2df)_mm_setzero_pd(), \
-                                       (__mmask8)-1, (R)); })
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)-1, (int)(R)); })
 
 #define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
-                                       (__v2df)(__m128d)(B), \
-                                       (__v2df)(__m128d)(S), \
-                                       (__mmask8)(M), (R)); })
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)(__m128d)(S), \
+                                             (__mmask8)(M), (int)(R)); })
 
 #define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
-                                       (__v2df)(__m128d)(B), \
-                                       (__v2df)_mm_setzero_pd(), \
-                                       (__mmask8)(M), (R)); })
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)(M), (int)(R)); })
 
 #define _mm_rcp28_sd(A, B) \
   _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index 5a976cc..a159d42 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -27,11 +27,19 @@
 #ifndef __AVX512FINTRIN_H
 #define __AVX512FINTRIN_H
 
+typedef char __v64qi __attribute__((__vector_size__(64)));
+typedef short __v32hi __attribute__((__vector_size__(64)));
 typedef double __v8df __attribute__((__vector_size__(64)));
 typedef float __v16sf __attribute__((__vector_size__(64)));
 typedef long long __v8di __attribute__((__vector_size__(64)));
 typedef int __v16si __attribute__((__vector_size__(64)));
 
+/* Unsigned types */
+typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
+typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
+typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
+typedef unsigned int __v16su __attribute__((__vector_size__(64)));
+
 typedef float __m512 __attribute__((__vector_size__(64)));
 typedef double __m512d __attribute__((__vector_size__(64)));
 typedef long long __m512i __attribute__((__vector_size__(64)));
@@ -46,6 +54,124 @@
 #define _MM_FROUND_TO_ZERO          0x03
 #define _MM_FROUND_CUR_DIRECTION    0x04
 
+/* Constants for integer comparison predicates */
+typedef enum {
+    _MM_CMPINT_EQ,      /* Equal */
+    _MM_CMPINT_LT,      /* Less than */
+    _MM_CMPINT_LE,      /* Less than or Equal */
+    _MM_CMPINT_UNUSED,
+    _MM_CMPINT_NE,      /* Not Equal */
+    _MM_CMPINT_NLT,     /* Not Less than */
+#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
+    _MM_CMPINT_NLE      /* Not Less than or Equal */
+#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
+} _MM_CMPINT_ENUM;
+
+typedef enum
+{
+  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+  _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
+  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
+  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
+  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
+  _MM_MANT_SIGN_zero,   /* sign = 0             */
+  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
 
@@ -57,30 +183,81 @@
   return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
+#define _mm512_setzero_epi32 _mm512_setzero_si512
+
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_undefined_pd()
+_mm512_undefined_pd(void)
 {
   return (__m512d)__builtin_ia32_undef512();
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_undefined()
+_mm512_undefined(void)
 {
   return (__m512)__builtin_ia32_undef512();
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_undefined_ps()
+_mm512_undefined_ps(void)
 {
   return (__m512)__builtin_ia32_undef512();
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_undefined_epi32()
+_mm512_undefined_epi32(void)
 {
   return (__m512i)__builtin_ia32_undef512();
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v4si) __A,
+                                          (__v4si)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si) _mm512_broadcastd_epi32(__A),
+                                             (__v16si) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si) _mm512_broadcastd_epi32(__A),
+                                             (__v16si) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v2di) __A,
+                                          (__v2di) _mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di) _mm512_broadcastq_epi64(__A),
+                                             (__v8di) __O);
+
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di) _mm512_broadcastq_epi64(__A),
+                                             (__v8di) _mm512_setzero_si512());
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
 {
@@ -112,6 +289,9 @@
   return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
 }
+
+#define _mm512_setzero _mm512_setzero_ps
+
 static  __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_setzero_pd(void)
 {
@@ -132,6 +312,28 @@
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi8(char __w)
+{
+  return (__m512i)(__v64qi){ __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi16(short __w)
+{
+  return (__m512i)(__v32hi){ __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_set1_epi32(int __s)
 {
   return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
@@ -145,21 +347,62 @@
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcastss_ps(__m128 __X)
+_mm512_broadcastss_ps(__m128 __A)
 {
-  float __f = __X[0];
-  return (__v16sf){ __f, __f, __f, __f,
-                    __f, __f, __f, __f,
-                    __f, __f, __f, __f,
-                    __f, __f, __f, __f };
+  return (__m512)__builtin_shufflevector((__v4sf) __A,
+                                         (__v4sf)_mm_undefined_ps(),
+                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcastsd_pd(__m128d __X)
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
 {
-  double __d = __X[0];
-  return (__v8df){ __d, __d, __d, __d,
-                   __d, __d, __d, __d };
+  return  (__m512i)(__v16si)
+   { __D, __C, __B, __A, __D, __C, __B, __A,
+     __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set4_epi64 (long long __A, long long __B, long long __C,
+       long long __D)
+{
+  return  (__m512i) (__v8di)
+   { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+  return  (__m512d)
+   { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+  return  (__m512)
+   { __D, __C, __B, __A, __D, __C, __B, __A,
+     __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
+  _mm512_set4_epi32((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
+  _mm512_set4_epi64((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_pd(e0,e1,e2,e3)                \
+  _mm512_set4_pd((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_ps(e0,e1,e2,e3)                \
+  _mm512_set4_ps((e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcastsd_pd(__m128d __A)
+{
+  return (__m512d)__builtin_shufflevector((__v2df) __A,
+                                          (__v2df) _mm_undefined_pd(),
+                                          0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /* Cast between vector types */
@@ -183,272 +426,327 @@
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd256 (__m512d __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
+}
+
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm512_castps512_ps128(__m512 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
 }
 
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps256 (__m512 __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castpd_ps (__m512d __A)
+{
+  return (__m512) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_castpd_si512 (__m512d __A)
+{
+  return (__m512i) (__A);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd128_pd512 (__m128d __A)
+{
+  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castps_pd (__m512 __A)
+{
+  return (__m512d) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_castps_si512 (__m512 __A)
+{
+  return (__m512i) (__A);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_castps128_ps512 (__m128 __A)
+{
+    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi128_si512 (__m128i __A)
+{
+   return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi256_si512 (__m256i __A)
+{
+   return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castsi512_ps (__m512i __A)
+{
+  return (__m512) (__A);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castsi512_pd (__m512i __A)
+{
+  return (__m512d) (__A);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm512_castsi512_si128 (__m512i __A)
+{
+  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_castsi512_si256 (__m512i __A)
+{
+  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
+}
+
 /* Bitwise operators */
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi32(__m512i __a, __m512i __b)
 {
-  return __a & __b;
+  return (__m512i)((__v16su)__a & (__v16su)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
-              (__v16si) __b,
-              (__v16si) __src,
-              (__mmask16) __k);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                (__v16si) _mm512_and_epi32(__a, __b),
+                (__v16si) __src);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
-              (__v16si) __b,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __k);
+  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
+                                         __k, __a, __b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi64(__m512i __a, __m512i __b)
 {
-  return __a & __b;
+  return (__m512i)((__v8du)__a & (__v8du)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
-              (__v8di) __b,
-              (__v8di) __src,
-              (__mmask8) __k);
+    return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
+                (__v8di) _mm512_and_epi64(__a, __b),
+                (__v8di) __src);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
-              (__v8di) __b,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __k);
+  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
+                                         __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)(~(__v16su)(__A) & (__v16su)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si) __W,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_andnot_epi32(__A, __B),
+                                         (__v16si)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
+  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
+                                           __U, __A, __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+_mm512_andnot_epi64(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di) __W, __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_andnot_epi64(__A, __B),
+                                          (__v8di)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_pd (),
-              __U);
+  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
+                                           __U, __A, __B);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_or_epi32(__m512i __a, __m512i __b)
 {
-  return __a | __b;
+  return (__m512i)((__v16su)__a | (__v16su)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
-              (__v16si) __b,
-              (__v16si) __src,
-              (__mmask16) __k);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                                             (__v16si)_mm512_or_epi32(__a, __b),
+                                             (__v16si)__src);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
-              (__v16si) __b,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __k);
+  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_or_epi64(__m512i __a, __m512i __b)
 {
-  return __a | __b;
+  return (__m512i)((__v8du)__a | (__v8du)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
-              (__v8di) __b,
-              (__v8di) __src,
-              (__mmask8) __k);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+                                             (__v8di)_mm512_or_epi64(__a, __b),
+                                             (__v8di)__src);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
-              (__v8di) __b,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __k);
+  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_xor_epi32(__m512i __a, __m512i __b)
 {
-  return __a ^ __b;
+  return (__m512i)((__v16su)__a ^ (__v16su)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
-              (__v16si) __b,
-              (__v16si) __src,
-              (__mmask16) __k);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                                            (__v16si)_mm512_xor_epi32(__a, __b),
+                                            (__v16si)__src);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
-              (__v16si) __b,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __k);
+  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_xor_epi64(__m512i __a, __m512i __b)
 {
-  return __a ^ __b;
+  return (__m512i)((__v8du)__a ^ (__v8du)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
-              (__v8di) __b,
-              (__v8di) __src,
-              (__mmask8) __k);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+                                             (__v8di)_mm512_xor_epi64(__a, __b),
+                                             (__v8di)__src);
 }
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
-  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
-              (__v8di) __b,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __k);
+  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_si512(__m512i __a, __m512i __b)
 {
-  return __a & __b;
+  return (__m512i)((__v8du)__a & (__v8du)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_or_si512(__m512i __a, __m512i __b)
 {
-  return __a | __b;
+  return (__m512i)((__v8du)__a | (__v8du)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_xor_si512(__m512i __a, __m512i __b)
 {
-  return __a ^ __b;
+  return (__m512i)((__v8du)__a ^ (__v8du)__b);
 }
+
 /* Arithmetic */
 
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_add_pd(__m512d __a, __m512d __b)
 {
-  return __a + __b;
+  return (__m512d)((__v8df)__a + (__v8df)__b);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_add_ps(__m512 __a, __m512 __b)
 {
-  return __a + __b;
+  return (__m512)((__v16sf)__a + (__v16sf)__b);
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_mul_pd(__m512d __a, __m512d __b)
 {
-  return __a * __b;
+  return (__m512d)((__v8df)__a * (__v8df)__b);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_mul_ps(__m512 __a, __m512 __b)
 {
-  return __a * __b;
+  return (__m512)((__v16sf)__a * (__v16sf)__b);
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_sub_pd(__m512d __a, __m512d __b)
 {
-  return __a - __b;
+  return (__m512d)((__v8df)__a - (__v8df)__b);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_sub_ps(__m512 __a, __m512 __b)
 {
-  return __a - __b;
+  return (__m512)((__v16sf)__a - (__v16sf)__b);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi64 (__m512i __A, __m512i __B)
 {
-  return (__m512i) ((__v8di) __A + (__v8di) __B);
+  return (__m512i) ((__v8du) __A + (__v8du) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -473,7 +771,7 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi64 (__m512i __A, __m512i __B)
 {
-  return (__m512i) ((__v8di) __A - (__v8di) __B);
+  return (__m512i) ((__v8du) __A - (__v8du) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -498,7 +796,7 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i) ((__v16si) __A + (__v16si) __B);
+  return (__m512i) ((__v16su) __A + (__v16su) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -523,7 +821,7 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i) ((__v16si) __A - (__v16si) __B);
+  return (__m512i) ((__v16su) __A - (__v16su) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -545,6 +843,24 @@
              (__mmask16) __U);
 }
 
+#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_max_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_undefined_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
 static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_max_pd(__m512d __A, __m512d __B)
 {
@@ -556,6 +872,45 @@
              _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+#define _mm512_max_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_undefined_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
 static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_max_ps(__m512 __A, __m512 __B)
 {
@@ -567,9 +922,30 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf) __W,
                 (__mmask8) __U,
@@ -578,28 +954,34 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)  _mm_setzero_ps (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_max_round_ss(__A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+#define _mm_max_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_max_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm_maskz_max_round_ss(__U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
@@ -608,24 +990,30 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df)  _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_max_round_sd(__A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm_max_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_max_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm_maskz_max_round_sd(__U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline __m512i
 __DEFAULT_FN_ATTRS
@@ -638,6 +1026,24 @@
               (__mmask16) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epu32(__m512i __A, __m512i __B)
 {
@@ -648,6 +1054,24 @@
               (__mmask16) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epi64(__m512i __A, __m512i __B)
 {
@@ -658,6 +1082,24 @@
               (__mmask8) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epu64(__m512i __A, __m512i __B)
 {
@@ -668,6 +1110,42 @@
               (__mmask8) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_min_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_undefined_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
 static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_min_pd(__m512d __A, __m512d __B)
 {
@@ -679,6 +1157,45 @@
              _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+#define _mm512_min_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_undefined_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
 static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_min_ps(__m512 __A, __m512 __B)
 {
@@ -690,9 +1207,30 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf) __W,
                 (__mmask8) __U,
@@ -701,28 +1239,34 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)  _mm_setzero_ps (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_min_round_ss(__A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+#define _mm_min_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_min_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm_maskz_min_round_ss(__U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
@@ -731,24 +1275,30 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df)  _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_min_round_sd(__A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm_min_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_min_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm_maskz_min_round_sd(__U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline __m512i
 __DEFAULT_FN_ATTRS
@@ -761,6 +1311,24 @@
               (__mmask16) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epu32(__m512i __A, __m512i __B)
 {
@@ -771,6 +1339,24 @@
               (__mmask16) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epi64(__m512i __A, __m512i __B)
 {
@@ -781,6 +1367,24 @@
               (__mmask8) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epu64(__m512i __A, __m512i __B)
 {
@@ -791,6 +1395,24 @@
               (__mmask8) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mul_epi32(__m512i __X, __m512i __Y)
 {
@@ -850,7 +1472,7 @@
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i) ((__v16si) __A * (__v16si) __B);
+  return (__m512i) ((__v16su) __A * (__v16su) __B);
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS
@@ -871,6 +1493,21 @@
               (__v16si) __W, __M);
 }
 
+#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_undefined_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
 static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_sqrt_pd(__m512d __a)
 {
@@ -880,6 +1517,40 @@
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                   (__v8df) __W,
+                   (__mmask8) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                   (__v8df)
+                   _mm512_setzero_pd (),
+                   (__mmask8) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(U), (int)(R)); })
+
+#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_undefined_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
 static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_sqrt_ps(__m512 __a)
 {
@@ -889,6 +1560,24 @@
                                                _MM_FROUND_CUR_DIRECTION);
 }
 
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
 static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_rsqrt14_pd(__m512d __A)
 {
@@ -897,6 +1586,23 @@
                  _mm512_setzero_pd (),
                  (__mmask8) -1);}
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
 static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_rsqrt14_ps(__m512 __A)
 {
@@ -906,26 +1612,79 @@
                 (__mmask16) -1);
 }
 
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
 static  __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt14_ss(__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __A,
+  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
              (__v4sf) __B,
              (__v4sf)
              _mm_setzero_ps (),
              (__mmask8) -1);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
 static  __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __A,
+  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
               (__v2df) __B,
               (__v2df)
               _mm_setzero_pd (),
               (__mmask8) -1);
 }
 
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
 static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_rcp14_pd(__m512d __A)
 {
@@ -935,6 +1694,23 @@
                (__mmask8) -1);
 }
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
 static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_rcp14_ps(__m512 __A)
 {
@@ -943,26 +1719,80 @@
               _mm512_setzero_ps (),
               (__mmask16) -1);
 }
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                   (__v16sf)
+                   _mm512_setzero_ps (),
+                   (__mmask16) __U);
+}
+
 static  __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp14_ss(__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __A,
+  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
                  (__v4sf) __B,
                  (__v4sf)
                  _mm_setzero_ps (),
                  (__mmask8) -1);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
 static  __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_rcp14_sd(__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __A,
+  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
             (__v2df) __B,
             (__v2df)
             _mm_setzero_pd (),
             (__mmask8) -1);
 }
 
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_floor_ps(__m512 __A)
 {
@@ -972,6 +1802,15 @@
                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                   _MM_FROUND_FLOOR,
+                   (__v16sf) __W, __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_floor_pd(__m512d __A)
 {
@@ -981,6 +1820,24 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                _MM_FROUND_FLOOR,
+                (__v8df) __W, __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                   _MM_FROUND_CEIL,
+                   (__v16sf) __W, __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_ceil_ps(__m512 __A)
 {
@@ -999,6 +1856,15 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                _MM_FROUND_CEIL,
+                (__v8df) __W, __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_abs_epi64(__m512i __A)
 {
@@ -1008,6 +1874,23 @@
              (__mmask8) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_abs_epi32(__m512i __A)
 {
@@ -1017,9 +1900,26 @@
              (__mmask16) -1);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf) __W,
                 (__mmask8) __U,
@@ -1028,28 +1928,34 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)  _mm_setzero_ps (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_add_round_ss(__A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+#define _mm_add_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_add_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm_maskz_add_round_ss(__U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
@@ -1058,23 +1964,29 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df)  _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm_add_round_sd(__A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm_add_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_add_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm_maskz_add_round_sd(__U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -1112,33 +2024,45 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_add_round_pd(__A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
-               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_add_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_addpd512_mask((__v8df) __A, (__v8df) __B, \
-                (__v8df) __W, (__mmask8) __U, __R); })
+#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
 
-#define _mm512_maskz_add_round_pd(__U, __A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
-                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R); })
+#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
 
-#define _mm512_add_round_ps(__A, __B, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
-                (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R); })
+#define _mm512_add_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
 
-#define _mm512_mask_add_round_ps(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
-                (__v16sf) __W, (__mmask16)__U, __R); })
+#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
 
-#define _mm512_maskz_add_round_ps(__U, __A, __B, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
-                (__v16sf) _mm512_setzero_ps(), (__mmask16)__U, __R); })
+#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf) __W,
                 (__mmask8) __U,
@@ -1147,27 +2071,33 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)  _mm_setzero_ps (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm_sub_round_ss(__A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+#define _mm_sub_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_sub_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm_maskz_sub_round_ss(__U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
@@ -1176,24 +2106,30 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df)  _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_sub_round_sd(__A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm_sub_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_sub_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm_maskz_sub_round_sd(__U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -1233,33 +2169,45 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_sub_round_pd(__A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B,\
-             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_sub_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
-             (__v8df) __W, (__mmask8) __U, __R); })
+#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
 
-#define _mm512_maskz_sub_round_pd(__U, __A, __B, __R) __extension__ ({ \
-   (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
-             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
 
-#define _mm512_sub_round_ps(__A, __B, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
 
-#define _mm512_mask_sub_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
-  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) __W, (__mmask16) __U, __R); });
+#define _mm512_mask_sub_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
 
-#define _mm512_maskz_sub_round_ps(__U, __A, __B, __R)  __extension__ ({ \
-  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);});
+#define _mm512_maskz_sub_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf) __W,
                 (__mmask8) __U,
@@ -1268,27 +2216,33 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)  _mm_setzero_ps (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm_mul_round_ss(__A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+#define _mm_mul_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_mul_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm_maskz_mul_round_ss(__U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
@@ -1297,24 +2251,30 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df)  _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mul_round_sd(__A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm_mul_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_mul_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm_maskz_mul_round_sd(__U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -1354,33 +2314,45 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_mul_round_pd(__A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B,\
-             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_mul_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
-             (__v8df) __W, (__mmask8) __U, __R); })
+#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
 
-#define _mm512_maskz_mul_round_pd(__U, __A, __B, __R) __extension__ ({ \
-   (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
-             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
 
-#define _mm512_mul_round_ps(__A, __B, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
 
-#define _mm512_mask_mul_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
-  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) __W, (__mmask16) __U, __R); });
+#define _mm512_mask_mul_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
 
-#define _mm512_maskz_mul_round_ps(__U, __A, __B, __R)  __extension__ ({ \
-  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);});
+#define _mm512_maskz_mul_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf) __W,
                 (__mmask8) __U,
@@ -1389,28 +2361,34 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)  _mm_setzero_ps (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_div_round_ss(__A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+#define _mm_div_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_div_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
 
-#define _mm_maskz_div_round_ss(__U, __A, __B, __R) __extension__ ({ \
-  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
-                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
@@ -1419,24 +2397,36 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
                 (__v2df)  _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_div_round_sd(__A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm_div_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
 
-#define _mm_mask_div_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  __W, (__mmask8) __U,__R); })
+#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
 
-#define _mm_maskz_div_round_sd(__U, __A, __B, __R) __extension__ ({ \
-  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
-                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_div_pd(__m512d __a, __m512d __b)
+{ /* Unmasked divide: returns __a / __b via the vector division operator. */
+  return (__m512d)((__v8df)__a/(__v8df)__b);
+}
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -1457,6 +2447,12 @@
              _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_div_ps(__m512 __a, __m512 __b)
+{ /* Unmasked divide: returns __a / __b via the vector division operator. */
+  return (__m512)((__v16sf)__a/(__v16sf)__b);
+}
+
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
@@ -1476,108 +2472,186 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_div_round_pd(__A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B,\
-             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
 
-#define _mm512_mask_div_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
-  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
-             (__v8df) __W, (__mmask8) __U, __R); })
+#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
 
-#define _mm512_maskz_div_round_pd(__U, __A, __B, __R) __extension__ ({ \
-   (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
-             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
 
-#define _mm512_div_round_ps(__A, __B, __R) __extension__ ({ \
-  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+#define _mm512_div_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
 
-#define _mm512_mask_div_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
-  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) __W, (__mmask16) __U, __R); });
+#define _mm512_mask_div_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); }) /* expression: no ';' */
 
-#define _mm512_maskz_div_round_ps(__U, __A, __B, __R)  __extension__ ({ \
-  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
-            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);});
+#define _mm512_maskz_div_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); }) /* no ';' */
 
 #define _mm512_roundscale_ps(A, B) __extension__ ({ \
-  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \
-                                         -1, _MM_FROUND_CUR_DIRECTION); })
+  (__m512)__builtin_ia32_rndscaleps_mask( \
+      (__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), \
+      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); }) /* A is expanded once */
+
+#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({\
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({\
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(A), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(A), (int)(R)); })
+
+#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1, (int)(R)); })
 
 #define _mm512_roundscale_pd(A, B) __extension__ ({ \
-  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \
-                                          -1, _MM_FROUND_CUR_DIRECTION); })
+  (__m512d)__builtin_ia32_rndscalepd_mask( \
+      (__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), \
+      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); }) /* A is expanded once */
+
+#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({\
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({\
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(A), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(A), (int)(R)); })
+
+#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1, (int)(R)); })
 
 #define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
-                                             (__v8df) (B), (__v8df) (C), \
-                                             (__mmask8) -1, (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
+                                           (int)(R)); })
 
 
 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
-                                             (__v8df) (B), (__v8df) (C), \
-                                             (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
-                                             (__v8df) (B), -(__v8df) (C), \
-                                             (__mmask8) -1, (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R)); })
 
 
 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
-                                             (__v8df) (B), -(__v8df) (C), \
-                                             (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
-                                              (__v8df) (B), -(__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
-                                             (__v8df) (B), (__v8df) (C), \
-                                             (__mmask8) -1, (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
+                                           (int)(R)); })
 
 
 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
-                                             (__v8df) (B), -(__v8df) (C), \
-                                             (__mmask8) -1, (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R)); })
 
 
 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
-                                              (__v8df) (B), -(__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -1701,75 +2775,87 @@
 }
 
 #define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
-                                            (__v16sf) (B), (__v16sf) (C), \
-                                            (__mmask16) -1, (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
+                                          (int)(R)); })
 
 
 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
-                                            (__v16sf) (B), (__v16sf) (C), \
-                                            (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
-                                            (__v16sf) (B), -(__v16sf) (C), \
-                                            (__mmask16) -1, (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R)); })
 
 
 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
-                                            (__v16sf) (B), -(__v16sf) (C), \
-                                            (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
-                                             (__v16sf) (B), -(__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
-                                            (__v16sf) (B), (__v16sf) (C), \
-                                            (__mmask16) -1, (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
+                                          (int)(R)); })
 
 
 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
-                                            (__v16sf) (B), -(__v16sf) (C), \
-                                            (__mmask16) -1, (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R)); })
 
 
 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
-                                             (__v16sf) (B), -(__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -1893,45 +2979,52 @@
 }
 
 #define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
-                                                (__v8df) (B), (__v8df) (C), \
-                                                (__mmask8) -1, (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R)); })
 
 
 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
-                                                (__v8df) (B), (__v8df) (C), \
-                                                (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) (A), \
-                                                 (__v8df) (B), (__v8df) (C), \
-                                                 (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
-                                                 (__v8df) (B), (__v8df) (C), \
-                                                 (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
-                                                (__v8df) (B), -(__v8df) (C), \
-                                                (__mmask8) -1, (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R)); })
 
 
 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
-                                                (__v8df) (B), -(__v8df) (C), \
-                                                (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
-                                                 (__v8df) (B), -(__v8df) (C), \
-                                                 (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               -(__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -2005,45 +3098,52 @@
 }
 
 #define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
-                                               (__v16sf) (B), (__v16sf) (C), \
-                                               (__mmask16) -1, (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R)); })
 
 
 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
-                                               (__v16sf) (B), (__v16sf) (C), \
-                                               (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) (A), \
-                                                (__v16sf) (B), (__v16sf) (C), \
-                                                (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
-                                                (__v16sf) (B), (__v16sf) (C), \
-                                                (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
-                                               (__v16sf) (B), -(__v16sf) (C), \
-                                               (__mmask16) -1, (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R)); })
 
 
 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
-                                               (__v16sf) (B), -(__v16sf) (C), \
-                                               (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
-                                                (__v16sf) (B), -(__v16sf) (C), \
-                                                (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              -(__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -2117,9 +3217,10 @@
 }
 
 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -2133,9 +3234,10 @@
 }
 
 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -2149,9 +3251,10 @@
 }
 
 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) (A), \
-                                                 (__v8df) (B), (__v8df) (C), \
-                                                 (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -2165,9 +3268,10 @@
 }
 
 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) (A), \
-                                                (__v16sf) (B), (__v16sf) (C), \
-                                                (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -2181,9 +3285,10 @@
 }
 
 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -2197,9 +3302,10 @@
 }
 
 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -2213,15 +3319,17 @@
 }
 
 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) (A), \
-                                              (__v8df) (B), (__v8df) (C), \
-                                              (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
 
 
 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) (A), \
-                                               (__v8df) (B), (__v8df) (C), \
-                                               (__mmask8) (U), (R)); })
+  (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8df)(__m512d)(C), \
+                                             (__mmask8)(U), (int)(R)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -2245,15 +3353,17 @@
 }
 
 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) (A), \
-                                             (__v16sf) (B), (__v16sf) (C), \
-                                             (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
 
 
 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) (A), \
-                                              (__v16sf) (B), (__v16sf) (C), \
-                                              (__mmask16) (U), (R)); })
+  (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16sf)(__m512)(C), \
+                                            (__mmask16)(U), (int)(R)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -2289,6 +3399,29 @@
                                                        (__v16si) __B,
                                                        (__mmask16) -1);
 }
+
+/* Passes __I (index operand), __A, __B and the mask __U to
+   __builtin_ia32_vpermt2vard512_mask, viewing the vectors as __v16si. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
+                               __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpermt2vard512_mask(
+      (__v16si)__I /* idx */, (__v16si)__A, (__v16si)__B,
+      (__mmask16)__U);
+}
+
+/* Passes __I (index operand), __A, __B and the mask __U to
+   __builtin_ia32_vpermt2vard512_maskz, viewing the vectors as __v16si. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
+                                __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpermt2vard512_maskz(
+      (__v16si)__I /* idx */, (__v16si)__A, (__v16si)__B,
+      (__mmask16)__U);
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
 {
@@ -2299,98 +3432,140 @@
                                                        (__mmask8) -1);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+                                __m512i __B)
 {
-  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
-                                                        /* idx */ ,
-                                                        (__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__mmask8) -1);
-}
-static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
-{
-  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
                                                        /* idx */ ,
-                                                       (__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__mmask16) -1);
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) __U);
+}
+
+
+/* Passes __I (index operand), __A, __B and the mask __U to
+   __builtin_ia32_vpermt2varq512_maskz, viewing the vectors as __v8di. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
+                                __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpermt2varq512_maskz(
+      (__v8di)__I /* idx */, (__v8di)__A, (__v8di)__B,
+      (__mmask8)__U);
 }
 
 #define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
   (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), \
-                                         (I), (__v8di)_mm512_setzero_si512(), \
+                                         (__v8di)(__m512i)(B), (int)(I), \
+                                         (__v8di)_mm512_setzero_si512(), \
                                          (__mmask8)-1); })
 
+#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignq512_mask( \
+      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(imm), \
+      /* W and U are the builtin's last two operands */ \
+      (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignq512_mask( \
+      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(imm), \
+      /* _mm512_setzero_si512() stands where the mask form passes W */ \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)); })
+
 #define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
   (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), \
-                                         (I), (__v16si)_mm512_setzero_si512(), \
+                                         (__v16si)(__m512i)(B), (int)(I), \
+                                         (__v16si)_mm512_setzero_si512(), \
                                          (__mmask16)-1); })
 
+#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignd512_mask( \
+      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (int)(imm), \
+      /* W and U are the builtin's last two operands */ \
+      (__v16si)(__m512i)(W), (__mmask16)(U)); })
+
+#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignd512_mask( \
+      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (int)(imm), \
+      /* _mm512_setzero_si512() stands where the mask form passes W */ \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)); })
 /* Vector Extract */
 
 #define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
-      (__m256d)                                                          \
-        __builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A),           \
-                                         (I),                            \
-                                         (__v4df)_mm256_setzero_si256(), \
-                                         (__mmask8) -1); })
+  (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
+      /* __m256d zero, matching the maskz form (was an integer zero) */ \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)-1); })
+
+/* imm, W and U map to the builtin's 2nd, 3rd and 4th operands. */
+#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_extractf64x4_mask( \
+      (__v8df)(__m512d)(A), (int)(imm), (__v4df)(__m256d)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_extractf64x4_mask( \
+      (__v8df)(__m512d)(A), (int)(imm), \
+      (__v4df)_mm256_setzero_pd() /* third operand */, (__mmask8)(U)); })
 
 #define _mm512_extractf32x4_ps(A, I) __extension__ ({                    \
-      (__m128)                                                           \
-        __builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A),           \
-                                         (I),                            \
-                                         (__v4sf)_mm_setzero_ps(),       \
-                                         (__mmask8) -1); })
+  (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)-1); })
 
+/* imm, W and U map to the builtin's 2nd, 3rd and 4th operands. */
+#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_extractf32x4_mask( \
+      (__v16sf)(__m512)(A), (int)(imm), (__v4sf)(__m128)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_extractf32x4_mask( \
+      (__v16sf)(__m512)(A), (int)(imm), \
+      (__v4sf)_mm_setzero_ps() /* third operand */, (__mmask8)(U)); })
 /* Vector Blend */
 
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
 {
-  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                  (__v8df) __W,
-                 (__mmask8) __U);
+                 (__v8df) __A);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
 {
-  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                 (__v16sf) __W,
-                (__mmask16) __U);
+                (__v16sf) __A);
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
 {
-  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                 (__v8di) __W,
-                (__mmask8) __U);
+                (__v8di) __A);
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
 {
-  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                 (__v16si) __W,
-                (__mmask16) __U);
+                (__v16si) __A);
 }
 
 /* Compare */
 
 #define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (P), (__mmask16)-1, (R)); })
+                                          (__v16sf)(__m512)(B), (int)(P), \
+                                          (__mmask16)-1, (int)(R)); })
 
 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (P), (__mmask16)(U), (R)); })
+                                          (__v16sf)(__m512)(B), (int)(P), \
+                                          (__mmask16)(U), (int)(R)); })
 
 #define _mm512_cmp_ps_mask(A, B, P) \
   _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
@@ -2400,13 +3575,13 @@
 
 #define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(B), \
-                                         (P), (__mmask8)-1, (R)); })
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)-1, (int)(R)); })
 
 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(B), \
-                                         (P), (__mmask8)(U), (R)); })
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)(U), (int)(R)); })
 
 #define _mm512_cmp_pd_mask(A, B, P) \
   _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
@@ -2416,6 +3591,22 @@
 
 /* Conversion */
 
+#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask( \
+      (__v16sf)(__m512)(A), (__v16si)_mm512_undefined_epi32(), \
+      (__mmask16)-1, (int)(R)); })
+
+/* W and U are passed through as the builtin's 2nd and 3rd operands. */
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask( \
+      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask( \
+      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
+      (__mmask16)(U), (int)(R)); })
+
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epu32(__m512 __A)
 {
@@ -2426,15 +3617,80 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
+/* Calls __builtin_ia32_cvttps2udq512_mask on __A with __W and __U as the
+   second and third operands, using _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu32(__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i)__builtin_ia32_cvttps2udq512_mask(
+      (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* __A, _mm512_setzero_si512() and __U go to the cvttps2udq512 builtin. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu32(__mmask16 __U, __m512 __A)
+{
+  return (__m512i)__builtin_ia32_cvttps2udq512_mask(
+      (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
 #define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
-  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)-1, (R)); })
+                                          (__mmask16)-1, (int)(R)); })
+
+/* W and U are passed through as the builtin's 2nd and 3rd operands. */
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask( \
+      (__v16si)(__m512i)(A), (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask( \
+      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), \
+      (__mmask16)(U), (int)(R)); })
 
 #define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
-  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)-1, (R)); })
+                                           (__mmask16)-1, (int)(R)); })
+
+/* W and U are passed through as the builtin's 2nd and 3rd operands. */
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask( \
+      (__v16si)(__m512i)(A), (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask( \
+      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), \
+      (__mmask16)(U), (int)(R)); })
+
+/* __A, _mm512_undefined_ps() and a -1 mask go to the cvtudq2ps512 builtin. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_ps(__m512i __A)
+{
+  return (__m512)__builtin_ia32_cvtudq2ps512_mask(
+      (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Calls __builtin_ia32_cvtudq2ps512_mask on __A with __W and __U as the
+   second and third operands, using _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512)__builtin_ia32_cvtudq2ps512_mask(
+      (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* __A, _mm512_setzero_ps() and __U go to __builtin_ia32_cvtudq2ps512_mask. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A)
+{
+  return (__m512)__builtin_ia32_cvtudq2ps512_mask(
+      (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
+      _MM_FROUND_CUR_DIRECTION);
+}
 
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepi32_pd(__m256i __A)
@@ -2445,6 +3701,49 @@
                 (__mmask8) -1);
 }
 
+/* Forwards __A, __W and the mask __U to __builtin_ia32_cvtdq2pd512_mask. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_cvtdq2pd512_mask((__v8si)__A, (__v8df)__W,
+                                                  (__mmask8)__U);
+}
+
+/* __A, _mm512_setzero_pd() and __U go to __builtin_ia32_cvtdq2pd512_mask. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_cvtdq2pd512_mask(
+      (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+/* __A, _mm512_undefined_ps() and a -1 mask go to the cvtdq2ps512 builtin. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_ps(__m512i __A)
+{
+  return (__m512)__builtin_ia32_cvtdq2ps512_mask(
+      (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Calls __builtin_ia32_cvtdq2ps512_mask on __A with __W and __U as the
+   second and third operands, using _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512)__builtin_ia32_cvtdq2ps512_mask(
+      (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* __A, _mm512_setzero_ps() and __U go to __builtin_ia32_cvtdq2ps512_mask. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A)
+{
+  return (__m512)__builtin_ia32_cvtdq2ps512_mask(
+      (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepu32_pd(__m256i __A)
 {
@@ -2454,15 +3753,109 @@
                 (__mmask8) -1);
 }
 
+/* Forwards __A, __W and the mask __U to __builtin_ia32_cvtudq2pd512_mask. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_cvtudq2pd512_mask((__v8si)__A, (__v8df)__W,
+                                                   (__mmask8)__U);
+}
+
+/* __A, _mm512_setzero_pd() and __U go to __builtin_ia32_cvtudq2pd512_mask. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_cvtudq2pd512_mask(
+      (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
 #define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
-  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(A), \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)-1, (R)); })
+                                          (__mmask8)-1, (int)(R)); })
+
+/* W and U are passed through as the builtin's 2nd and 3rd operands. */
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask( \
+      (__v8df)(__m512d)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask( \
+      (__v8df)(__m512d)(A), (__v8sf)_mm256_setzero_ps(), \
+      (__mmask8)(U), (int)(R)); })
+
+/* __A, _mm256_undefined_ps() and a -1 mask go to the cvtpd2ps512 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_ps(__m512d __A)
+{
+  return (__m256)__builtin_ia32_cvtpd2ps512_mask(
+      (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Calls __builtin_ia32_cvtpd2ps512_mask on __A with __W and __U as the
+   second and third operands, using _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256)__builtin_ia32_cvtpd2ps512_mask(
+      (__v8df)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* __A, _mm256_setzero_ps() and __U go to __builtin_ia32_cvtpd2ps512_mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A)
+{
+  return (__m256)__builtin_ia32_cvtpd2ps512_mask(
+      (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)(A), (int)(I), (__v16hi)_mm256_undefined_si256(), \
+      (__mmask16)-1); })
+
+/* W is the builtin's third (vector) operand and U its mask operand. */
+#define _mm512_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)(A), (int)(I), (__v16hi)(__m256i)(W), (__mmask16)(U)); })
+
+/* U is the mask operand; the third operand is _mm256_setzero_si256(). */
+#define _mm512_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+      (__v16hi)_mm256_setzero_si256(), (__mmask16)(U)); })
 
 #define _mm512_cvtps_ph(A, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(A), (I), \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
-                                            -1); })
+                                            (__mmask16)-1); })
+
+/* W is the builtin's third (vector) operand and U its mask operand. */
+#define _mm512_mask_cvtps_ph(W, U, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)(A), (int)(I), (__v16hi)(__m256i)(W), (__mmask16)(U)); })
+
+/* U is the mask operand; the third operand is _mm256_setzero_si256(). */
+#define _mm512_maskz_cvtps_ph(U, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+      (__v16hi)_mm256_setzero_si256(), (__mmask16)(U)); })
+
+#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask( \
+      (__v16hi)(__m256i)(A), (__v16sf)_mm512_undefined_ps(), \
+      (__mmask16)-1, (int)(R)); })
+
+/* W and U are passed through as the builtin's 2nd and 3rd operands. */
+#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask( \
+      (__v16hi)(__m256i)(A), (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask( \
+      (__v16hi)(__m256i)(A), (__v16sf)_mm512_setzero_ps(), \
+      (__mmask16)(U), (int)(R)); })
+
 
 static  __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_cvtph_ps(__m256i __A)
@@ -2474,15 +3867,39 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_cvttps_epi32(__m512 __a)
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
 {
-  return (__m512i)
-    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
-                                     (__v16si) _mm512_setzero_si512 (),
-                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
 }
 
+/* __A, _mm512_setzero_ps() and __U go to __builtin_ia32_vcvtph2ps512_mask. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtph_ps(__mmask16 __U, __m256i __A)
+{
+  return (__m512)__builtin_ia32_vcvtph2ps512_mask(
+      (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask( \
+      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
+      (__mmask8)-1, (int)(R)); })
+
+/* W and U are passed through as the builtin's 2nd and 3rd operands. */
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask( \
+      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask( \
+      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
+      (__mmask8)(U), (int)(R)); })
+
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm512_cvttpd_epi32(__m512d __a)
 {
@@ -2492,67 +3909,437 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
-  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(A), \
-                                            (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)-1, (R)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                  (__v8si) _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
 
 #define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
-  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(A), \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)-1, (R)); })
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)(__m512i)(W), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)(U), (int)(R)); })
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi32(__m512 __a)
+{
+  return (__m512i)
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
+                                     (__v16si) _mm512_setzero_si512 (),
+                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                  (__v16si) _mm512_setzero_si512 (),
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
 
 #define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
-  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(A), \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)-1, (R)); })
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)(__m512i)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
 
 #define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
-  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(A), \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
-                                           (__mmask8)-1, (R)); })
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)(__m256i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si)
+                 _mm256_undefined_si256 (),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
 
 #define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
-  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(A), \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)-1, (R)); })
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)(__m512i)(W), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)(U), (int)(R)); })
+
+/* Unmasked form: all-ones mask, _MM_FROUND_CUR_DIRECTION rounding. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) _mm512_undefined_epi32 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Zero-masking form: the third-from-last operand is _mm512_setzero_si512 (). */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) _mm512_setzero_si512 (),
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
 
 #define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
-  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(A), \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8) -1, (R)); })
+                                            (__mmask8)-1, (int)(R)); })
+
+/* W is cast via __m256i before __v8si, as in _mm512_mask_cvt_roundpd_epi32. */
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+      (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
 
 /* Unpack and Interleave */
+
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_unpackhi_pd(__m512d __a, __m512d __b)
 {
-  return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
+                                           (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
+                                           (__v8df)_mm512_setzero_pd());
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_unpacklo_pd(__m512d __a, __m512d __b)
 {
-  return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
+                                           (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
+                                           (__v8df)_mm512_setzero_pd());
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_unpackhi_ps(__m512 __a, __m512 __b)
 {
-  return __builtin_shufflevector(__a, __b,
-                                 2,    18,    3,    19,
-                                 2+4,  18+4,  3+4,  19+4,
-                                 2+8,  18+8,  3+8,  19+8,
-                                 2+12, 18+12, 3+12, 19+12);
+  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+                                         2,    18,    3,    19,
+                                         2+4,  18+4,  3+4,  19+4,
+                                         2+8,  18+8,  3+8,  19+8,
+                                         2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                          (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                          (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_unpacklo_ps(__m512 __a, __m512 __b)
 {
-  return __builtin_shufflevector(__a, __b,
-                                 0,    16,    1,    17,
-                                 0+4,  16+4,  1+4,  17+4,
-                                 0+8,  16+8,  1+8,  17+8,
-                                 0+12, 16+12, 1+12, 17+12);
+  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+                                         0,    16,    1,    17,
+                                         0+4,  16+4,  1+4,  17+4,
+                                         0+8,  16+8,  1+8,  17+8,
+                                         0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                          (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                          (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+                                          2,    18,    3,    19,
+                                          2+4,  18+4,  3+4,  19+4,
+                                          2+8,  18+8,  3+8,  19+8,
+                                          2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
+                                       (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
+                                       (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+                                          0,    16,    1,    17,
+                                          0+4,  16+4,  1+4,  17+4,
+                                          0+8,  16+8,  1+8,  17+8,
+                                          0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
+                                       (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
+                                       (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
+                                        (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
+                                        (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
+                                        (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
+                                        (__v8di)_mm512_setzero_si512());
 }
 
 /* Bit Test */
@@ -2565,6 +4352,13 @@
             (__mmask16) -1);
 }
 
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                 (__v16si) __B, __U);
+}
+
 static __inline __mmask8 __DEFAULT_FN_ATTRS
 _mm512_test_epi64_mask(__m512i __A, __m512i __B)
 {
@@ -2573,57 +4367,88 @@
                  (__mmask8) -1);
 }
 
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
+}
+
+
 /* SIMD load ops */
 
 static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_loadu_si512 (void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
                                                      (__v16si)
                                                      _mm512_setzero_si512 (),
                                                      (__mmask16) __U);
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
                                                      (__v8di)
                                                      _mm512_setzero_si512 (),
                                                      (__mmask8) __U);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
 {
-  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
                                                   (__v16sf)
                                                   _mm512_setzero_ps (),
                                                   (__mmask16) __U);
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
 {
-  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
-                                                   (__v8df)
-                                                   _mm512_setzero_pd (),
-                                                   (__mmask8) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
-                                                  (__v16sf)
-                                                  _mm512_setzero_ps (),
-                                                  (__mmask16) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
                                                    (__v8df)
                                                    _mm512_setzero_pd (),
                                                    (__mmask8) __U);
@@ -2656,6 +4481,23 @@
                                                   (__mmask16) -1);
 }
 
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
 static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_load_pd(double const *__p)
 {
@@ -2665,45 +4507,87 @@
                                                    (__mmask8) -1);
 }
 
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+                          (__v8df) __W,
+                          (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_si512 (void const *__P)
+{ /* Plain dereference of __P; the cast keeps its const qualifier. */
+  return *(const __m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_epi32 (void const *__P)
+{ /* Plain dereference of __P; the cast keeps its const qualifier. */
+  return *(const __m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_epi64 (void const *__P)
+{ /* Plain dereference of __P; the cast keeps its const qualifier. */
+  return *(const __m512i *) __P;
+}
+
 /* SIMD store ops */
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
 {
-  __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
+  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
                                      (__mmask8) __U);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
+            (__mmask16) -1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
 {
-  __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
+  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
                                      (__mmask16) __U);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
 {
-  __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm512_storeu_pd(void *__P, __m512d __A)
 {
-  __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
+  __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)-1);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
 {
-  __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
+  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
                                    (__mmask16) __U);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm512_storeu_ps(void *__P, __m512 __A)
 {
-  __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
+  __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)-1);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
@@ -2731,6 +4615,24 @@
   *(__m512*)__P = __A;
 }
 
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
 /* Mask ops */
 
 static __inline __mmask16 __DEFAULT_FN_ATTRS
@@ -3029,46 +4931,4655 @@
                                                 __u);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi32(__m128i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably sign-extends i8 -> i32 (pmovsxbd). */
+  return (__m512i)__builtin_ia32_pmovsxbd512_mask(
+      (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovsxbd512_mask(
+      (__v16qi)__A, (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovsxbd512_mask(
+      (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi64(__m128i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably sign-extends i8 -> i64 (pmovsxbq). */
+  return (__m512i)__builtin_ia32_pmovsxbq512_mask(
+      (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovsxbq512_mask(
+      (__v16qi)__A, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovsxbq512_mask(
+      (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi64(__m256i __X)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably sign-extends i32 -> i64 (pmovsxdq). */
+  return (__m512i)__builtin_ia32_pmovsxdq512_mask(
+      (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovsxdq512_mask(
+      (__v8si)__X, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovsxdq512_mask(
+      (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi32(__m256i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably sign-extends i16 -> i32 (pmovsxwd). */
+  return (__m512i)__builtin_ia32_pmovsxwd512_mask(
+      (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovsxwd512_mask(
+      (__v16hi)__A, (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovsxwd512_mask(
+      (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi64(__m128i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably sign-extends i16 -> i64 (pmovsxwq). */
+  return (__m512i)__builtin_ia32_pmovsxwq512_mask(
+      (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovsxwq512_mask(
+      (__v8hi)__A, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovsxwq512_mask(
+      (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi32(__m128i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably zero-extends u8 -> i32 (pmovzxbd). */
+  return (__m512i)__builtin_ia32_pmovzxbd512_mask(
+      (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovzxbd512_mask(
+      (__v16qi)__A, (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovzxbd512_mask(
+      (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi64(__m128i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably zero-extends u8 -> i64 (pmovzxbq). */
+  return (__m512i)__builtin_ia32_pmovzxbq512_mask(
+      (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovzxbq512_mask(
+      (__v16qi)__A, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovzxbq512_mask(
+      (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_epi64(__m256i __X)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably zero-extends u32 -> i64 (pmovzxdq). */
+  return (__m512i)__builtin_ia32_pmovzxdq512_mask(
+      (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovzxdq512_mask(
+      (__v8si)__X, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovzxdq512_mask(
+      (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu16_epi32(__m256i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably zero-extends u16 -> i32 (pmovzxwd). */
+  return (__m512i)__builtin_ia32_pmovzxwd512_mask(
+      (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovzxwd512_mask(
+      (__v16hi)__A, (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovzxwd512_mask(
+      (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu16_epi64(__m128i __A)
+{
+  /* Unmasked form: zero vector as the builtin's second operand, all-ones
+     mask.  Presumably zero-extends u16 -> i64 (pmovzxwq). */
+  return (__m512i)__builtin_ia32_pmovzxwq512_mask(
+      (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  /* Masked form: __W is the builtin's second operand, __U its mask. */
+  return (__m512i)__builtin_ia32_pmovzxwq512_mask(
+      (__v8hi)__A, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  /* maskz form: a zero vector takes the place of a caller-supplied
+     second operand; __U is forwarded as the mask. */
+  return (__m512i)__builtin_ia32_pmovzxwq512_mask(
+      (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rorv_epi32(__m512i __A, __m512i __B)
+{
+  /* Unmasked form: zero vector as third operand, all-ones mask.
+     Presumably a per-element rotate right of __A by __B (prorvd). */
+  return (__m512i)__builtin_ia32_prorvd512_mask(
+      (__v16si)__A, (__v16si)__B,
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  /* Masked form: __W is the builtin's third operand, __U its mask. */
+  return (__m512i)__builtin_ia32_prorvd512_mask(
+      (__v16si)__A, (__v16si)__B,
+      (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  /* maskz form: zero vector as third operand; __U is forwarded as the
+     mask. */
+  return (__m512i)__builtin_ia32_prorvd512_mask(
+      (__v16si)__A, (__v16si)__B,
+      (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rorv_epi64(__m512i __A, __m512i __B)
+{
+  /* Unmasked form: zero vector as third operand, all-ones mask.
+     Presumably a per-element rotate right of __A by __B (prorvq). */
+  return (__m512i)__builtin_ia32_prorvq512_mask(
+      (__v8di)__A, (__v8di)__B,
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  /* Masked form: __W is the builtin's third operand, __U its mask. */
+  return (__m512i)__builtin_ia32_prorvq512_mask(
+      (__v8di)__A, (__v8di)__B,
+      (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  /* maskz form: zero vector as third operand; __U is forwarded as the
+     mask. */
+  return (__m512i)__builtin_ia32_prorvq512_mask(
+      (__v8di)__A, (__v8di)__B,
+      (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+
+
 #define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
-                                         (__v16si)(__m512i)(b), (p), \
+                                         (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)-1); })
 
 #define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
-                                          (__v16si)(__m512i)(b), (p), \
+                                          (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)-1); })
 
 #define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
-                                        (__v8di)(__m512i)(b), (p), \
+                                        (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)-1); })
 
 #define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
-                                         (__v8di)(__m512i)(b), (p), \
+                                         (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)-1); })
 
 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
-                                         (__v16si)(__m512i)(b), (p), \
+                                         (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)(m)); })
 
 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
-                                          (__v16si)(__m512i)(b), (p), \
+                                          (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)(m)); })
 
 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
-                                        (__v8di)(__m512i)(b), (p), \
+                                        (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)(m)); })
 
 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
-                                         (__v8di)(__m512i)(b), (p), \
+                                         (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)(m)); })
 
+/* Unmasked: zero vector as third operand, all-ones mask; B is cast to int. */
+#define _mm512_rol_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1); })
+
+/* Masked: W is the builtin's third operand, U its mask. */
+#define _mm512_mask_rol_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)(__m512i)(W), (__mmask16)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_rol_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)); })
+
+/* __v8di counterparts below go through __builtin_ia32_prolq512_mask. */
+#define _mm512_rol_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1); })
+
+#define _mm512_mask_rol_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_rol_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rolv_epi32(__m512i __A, __m512i __B)
+{
+  /* Unmasked form: zero vector as third operand, all-ones mask.
+     Presumably a per-element rotate left of __A by __B (prolvd). */
+  return (__m512i)__builtin_ia32_prolvd512_mask(
+      (__v16si)__A, (__v16si)__B,
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  /* Masked form: __W is the builtin's third operand, __U its mask. */
+  return (__m512i)__builtin_ia32_prolvd512_mask(
+      (__v16si)__A, (__v16si)__B,
+      (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  /* maskz form: zero vector as third operand; __U is forwarded as the
+     mask. */
+  return (__m512i)__builtin_ia32_prolvd512_mask(
+      (__v16si)__A, (__v16si)__B,
+      (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rolv_epi64(__m512i __A, __m512i __B)
+{
+  /* Unmasked form: zero vector as third operand, all-ones mask.
+     Presumably a per-element rotate left of __A by __B (prolvq). */
+  return (__m512i)__builtin_ia32_prolvq512_mask(
+      (__v8di)__A, (__v8di)__B,
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  /* Masked form: __W is the builtin's third operand, __U its mask. */
+  return (__m512i)__builtin_ia32_prolvq512_mask(
+      (__v8di)__A, (__v8di)__B,
+      (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  /* maskz form: zero vector as third operand; __U is forwarded as the
+     mask. */
+  return (__m512i)__builtin_ia32_prolvq512_mask(
+      (__v8di)__A, (__v8di)__B,
+      (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+/* Unmasked: zero vector as third operand, all-ones mask; B is cast to int. */
+#define _mm512_ror_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1); })
+
+/* Masked: W is the builtin's third operand, U its mask. */
+#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)(__m512i)(W), (__mmask16)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)); })
+
+/* __v8di counterparts below go through __builtin_ia32_prorq512_mask. */
+#define _mm512_ror_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1); })
+
+#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)); })
+
+/* Unmasked: zero vector as third operand, all-ones mask; B is cast to int. */
+#define _mm512_slli_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1); })
+
+/* Masked: W is the builtin's third operand, U its mask. */
+#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)(__m512i)(W), (__mmask16)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)); })
+
+/* __v8di counterparts below go through __builtin_ia32_psllqi512_mask. */
+#define _mm512_slli_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1); })
+
+/* Masked: W is the builtin's third operand, U its mask. */
+#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)); })
+
+
+
+/* Unmasked: zero vector as third operand, all-ones mask; B is cast to int. */
+#define _mm512_srli_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1); })
+
+/* Masked: W is the builtin's third operand, U its mask. */
+#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)(__m512i)(W), (__mmask16)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)); })
+
+/* __v8di counterparts below go through __builtin_ia32_psrlqi512_mask. */
+#define _mm512_srli_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1); })
+
+/* Masked: W is the builtin's third operand, U its mask. */
+#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+/* maskz: zero vector as third operand, U forwarded as the mask. */
+#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P)
+{
+  /* __P is viewed as const __v16si *; __W and __U are forwarded as-is. */
+  return (__m512i)__builtin_ia32_movdqa32load512_mask(
+      (const __v16si *)__P, (__v16si)__W, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_load_epi32(__mmask16 __U, void const *__P)
+{
+  /* maskz form: zero vector as the builtin's second operand; __P is
+     viewed as const __v16si * and __U is the mask. */
+  return (__m512i)__builtin_ia32_movdqa32load512_mask(
+      (const __v16si *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa32store512_mask( /* destination, data, mask */
+      (__v16si *)__P, (__v16si)__A, (__mmask16)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A)
+{
+  /* Select builtin operands, in order: mask __U, then __A, then __W. */
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)__A, (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A)
+{
+  /* Select builtin operands, in order: mask __U, __A, a zero vector. */
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)__A, (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A)
+{
+  /* Select builtin operands, in order: mask __U, then __A, then __W. */
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)__A, (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A)
+{
+  /* Select builtin operands, in order: mask __U, __A, a zero vector. */
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)__A, (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P)
+{
+  /* __P is viewed as const __v8di *; __W and __U are forwarded as-is. */
+  return (__m512i)__builtin_ia32_movdqa64load512_mask(
+      (const __v8di *)__P, (__v8di)__W, (__mmask8)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_load_epi64(__mmask8 __U, void const *__P)
+{
+  /* maskz form: zero vector as the builtin's second operand; __P is
+     viewed as const __v8di * and __U is the mask. */
+  return (__m512i)__builtin_ia32_movdqa64load512_mask(
+      (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa64store512_mask( /* destination, data, mask */
+      (__v8di *)__P, (__v8di)__A, (__mmask8)__U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_movedup_pd(__m512d __A)
+{
+  return (__m512d)__builtin_shufflevector( /* each even index used twice */
+      (__v8df)__A, (__v8df)__A, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A)
+{
+  /* Select between _mm512_movedup_pd(__A) and __W under mask __U. */
+  return (__m512d)__builtin_ia32_selectpd_512(
+      (__mmask8)__U, (__v8df)_mm512_movedup_pd(__A), (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512( /* mask, movedup, zero */
+      (__mmask8)__U, (__v8df)_mm512_movedup_pd(__A),
+      (__v8df)_mm512_setzero_pd());
+}
+
+/* Unmasked with explicit rounding: all-ones mask, R forwarded as int. */
+#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)-1, (int)(R)); })
+
+/* Masked with explicit rounding: U is the mask, R forwarded as int. */
+#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)-1, \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* Masked; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)(U), \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* maskz with explicit rounding: uses the separate ..._maskz builtin. */
+#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_maskz( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)(U), \
+      (int)(R)); })
+
+/* maskz; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_maskz( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)(U), \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* Unmasked with explicit rounding: all-ones mask, R forwarded as int. */
+#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)-1, (int)(R)); })
+
+/* Masked with explicit rounding: U is the mask, R forwarded as int. */
+#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)(U), (int)(R)); })
+
+/* Unmasked; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)-1, \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* Masked; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)(U), \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* maskz with explicit rounding: uses the separate ..._maskz builtin. */
+#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_maskz( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)(U), \
+      (int)(R)); })
+
+/* maskz; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_maskz( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)(U), \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* Unmasked with explicit rounding: all-ones mask, R forwarded as int. */
+#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)-1, (int)(R)); })
+
+/* Masked with explicit rounding: U is the mask, R forwarded as int. */
+#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)-1, \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* Masked; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), \
+      _MM_FROUND_CUR_DIRECTION); })
+
+/* maskz with explicit rounding: uses the separate ..._maskz builtin. */
+#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_maskz( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), (int)(R)); })
+
+/* maskz; the rounding operand is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_maskz( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), \
+      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)-1, (int)(R)); })
+
+/* Calls the fixupimmss mask builtin with mask U and rounding argument R. */
+#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked fixupimm_ss: all-ones mask, with the rounding argument fixed to
+   _MM_FROUND_CUR_DIRECTION. */
+#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); })
+
+/* Mask form of fixupimm_ss: U is the builtin's mask and the rounding
+   argument is fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* Calls the maskz fixupimmss builtin with mask U and rounding argument R. */
+#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_maskz( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), (int)(R)); })
+
+/* Calls the maskz fixupimmss builtin with mask U; the rounding argument is
+   fixed to _MM_FROUND_CUR_DIRECTION. */
+#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_maskz( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* Unmasked getexp_sd with rounding R: zero third operand, all-ones mask. */
+#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getexpsd128_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })
+
+
+/* Unmasked getexp_sd: zero third operand, all-ones mask, current rounding. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_getexp_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A, (__v2df) __B,
+      (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Mask form of getexp_sd: forwards __W and __U as the builtin's third and
+   mask operands; the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round_mask (
+      (__v2df) __A, (__v2df) __B, (__v2df) __W,
+      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Mask form of getexp_sd with rounding R: W and U are forwarded as given. */
+#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getexpsd128_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })
+
+/* Maskz form of getexp_sd: a zero vector is the third operand and __U the
+   mask; the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round_mask (
+      (__v2df) __A, (__v2df) __B, (__v2df) _mm_setzero_pd (),
+      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz form of getexp_sd with rounding R: zero third operand, mask U. */
+#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getexpsd128_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked getexp_ss with rounding R: zero third operand, all-ones mask. */
+#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getexpss128_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })
+
+/* Unmasked getexp_ss: zero third operand, all-ones mask, current rounding. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_getexp_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, (__v4sf) __B,
+      (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Mask form of getexp_ss: forwards __W and __U as the builtin's third and
+   mask operands; the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round_mask (
+      (__v4sf) __A, (__v4sf) __B, (__v4sf) __W,
+      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Mask form of getexp_ss with rounding R: W and U are forwarded as given. */
+#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getexpss128_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })
+
+/* Maskz form of getexp_ss: zero third operand, __U as the mask, rounding
+   _MM_FROUND_CUR_DIRECTION. The zero comes from _mm_setzero_ps () so the
+   helper's float type matches the __v4sf cast (it was _mm_setzero_pd ()). */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, (__v4sf) __B,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz form of getexp_ss with rounding R: zero third operand, mask U. */
+#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getexpss128_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked getmant_sd with rounding R; the immediate is ((D) << 2) | (C). */
+#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (int)(((D)<<2) | (C)), (__v2df)_mm_setzero_pd(), \
+      (__mmask8)-1, (int)(R)); })
+
+/* Unmasked getmant_sd: immediate is ((D) << 2) | (C), the mask is all ones
+   and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+#define _mm_getmant_sd(A, B, C, D) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (int)(((D)<<2) | (C)), (__v2df)_mm_setzero_pd(), \
+      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); })
+
+/* Mask form of getmant_sd: immediate is ((D) << 2) | (C), W and U are
+   forwarded, and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (int)(((D)<<2) | (C)), (__v2df)(__m128d)(W), \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* Mask form of getmant_sd with rounding R; immediate is ((D) << 2) | (C).
+   Carries __extension__ like every other statement-expression macro here. */
+#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
+      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })
+
+/* Maskz form of getmant_sd: immediate is ((D) << 2) | (C), zero third
+   vector operand, mask U, rounding argument _MM_FROUND_CUR_DIRECTION. */
+#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (int)(((D)<<2) | (C)), (__v2df)_mm_setzero_pd(), \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* Maskz getmant_sd with rounding R; the immediate is ((D) << 2) | (C). */
+#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (int)(((D)<<2) | (C)), (__v2df)_mm_setzero_pd(), \
+      (__mmask8)(U), (int)(R)); })
+
+/* Unmasked getmant_ss with rounding R; the immediate is ((D) << 2) | (C). */
+#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (int)(((D)<<2) | (C)), (__v4sf)_mm_setzero_ps(), \
+      (__mmask8)-1, (int)(R)); })
+
+/* Unmasked getmant_ss: immediate is ((D) << 2) | (C), the mask is all ones
+   and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (int)(((D)<<2) | (C)), (__v4sf)_mm_setzero_ps(), \
+      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); })
+
+/* Mask form of getmant_ss: immediate is ((D) << 2) | (C), W and U are
+   forwarded, and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
+#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (int)(((D)<<2) | (C)), (__v4sf)(__m128)(W), \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* Mask form of getmant_ss with rounding R; immediate is ((D) << 2) | (C).
+   Carries __extension__ like every other statement-expression macro here. */
+#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
+      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })
+
+/* Maskz form of getmant_ss: immediate is ((D) << 2) | (C), mask U, rounding
+   _MM_FROUND_CUR_DIRECTION. The zero operand uses _mm_setzero_ps() so the
+   helper's float type matches the __v4sf cast (it was _mm_setzero_pd()). */
+#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* Maskz getmant_ss with rounding R; the immediate is ((D) << 2) | (C). */
+#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (int)(((D)<<2) | (C)), (__v4sf)_mm_setzero_ps(), \
+      (__mmask8)(U), (int)(R)); })
+
+/* Returns the 16-bit mask __A unchanged. */
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kmov (__mmask16 __A)
+{
+  return __A;
+}
+
+#define _mm_comi_round_sd(A, B, P, R) __extension__ ({ /* yields an int */ \
+  (int)__builtin_ia32_vcomisd( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(P), (int)(R)); })
+
+#define _mm_comi_round_ss(A, B, P, R) __extension__ ({ /* yields an int */ \
+  (int)__builtin_ia32_vcomiss( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(P), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
+
+/* Forwards to __builtin_ia32_vpermi2vard512_mask; __I (the index vector) is
+   the builtin's second operand and __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+                                 __mmask16 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2vard512_mask (
+      (__v16si) __A, (__v16si) __I /* idx */,
+      (__v16si) __B, (__mmask16) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) __W, (__mmask16) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) __W, (__mmask8) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) __W, (__mmask16) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+
+/* Unmasked form: all-ones mask. The third operand is a zero vector from
+   _mm512_setzero_si512 (), the integer helper the sibling shifts use. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) __W, (__mmask8) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) __W, (__mmask16) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) __W, (__mmask8) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) __W, (__mmask16) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) __W, (__mmask8) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) __W, (__mmask16) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask (
+      (__v16si) __A, (__v4si) __B,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) __W, (__mmask8) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask (
+      (__v8di) __A, (__v2di) __B,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) __W, (__mmask16) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask (
+      (__v16si) __X, (__v16si) __Y,
+      (__v16si) _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) -1);
+}
+
+/* Mask form: __W is the builtin's third operand, __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) __W, (__mmask8) __U);
+}
+
+/* Maskz form: zero vector as third operand, __U as the mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask (
+      (__v8di) __X, (__v8di) __Y,
+      (__v8di) _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+
+/* Unmasked form: calls the pternlogd512 mask builtin with an all-ones mask. */
+#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_mask( \
+      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)-1); })
+
+/* Mask form: calls the pternlogd512 mask builtin with U as the mask. */
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_mask( \
+      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)(U)); })
+
+/* Maskz form: calls the pternlogd512 maskz builtin with U as the mask. */
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_maskz( \
+      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+      (int)(imm), (__mmask16)(U)); })
+
+/* Unmasked form: calls the pternlogq512 mask builtin with an all-ones mask. */
+#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_mask( \
+      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)-1); })
+
+/* Mask form: calls the pternlogq512 mask builtin with U as the mask. */
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_mask( \
+      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Maskz form: calls the pternlogq512 maskz builtin with U as the mask. */
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_maskz( \
+      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
+
+#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+
+/* Calls the vcvtsd2usi32 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned __DEFAULT_FN_ATTRS _mm_cvtsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 (
+      (__v2df) __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned long long)__builtin_ia32_vcvtsd2usi64( \
+      (__v2df)(__m128d)(A), (int)(R)); })
+
+/* Calls the vcvtsd2usi64 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 (
+      (__v2df) __A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundss_si32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvt_roundss_i32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_si64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvt_roundss_i64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#endif
+
+#define _mm_cvt_roundss_u32(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+
+/* Calls the vcvtss2usi32 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned __DEFAULT_FN_ATTRS _mm_cvtss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 (
+      (__v4sf) __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_u64(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned long long)__builtin_ia32_vcvtss2usi64( \
+      (__v4sf)(__m128)(A), (int)(R)); })
+
+/* Calls the vcvtss2usi64 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 (
+      (__v4sf) __A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+/* Calls the vcvttsd2si32 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 (
+      (__v2df) __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+
+/* Calls the vcvttsd2si64 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 (
+      (__v2df) __A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+
+/* Calls the vcvttsd2usi32 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned __DEFAULT_FN_ATTRS _mm_cvttsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 (
+      (__v2df) __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned long long)__builtin_ia32_vcvttsd2usi64( \
+      (__v2df)(__m128d)(A), (int)(R)); })
+
+/* Calls the vcvttsd2usi64 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 (
+      (__v2df) __A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ /* R: rounding control */ \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+/* Calls the vcvttss2si32 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_vcvttss2si32 (
+      (__v4sf) __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvtt_roundss_si64(A, R) __extension__ ({ /* R: rounding control */ \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+/* Calls the vcvttss2si64 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 (
+      (__v4sf) __A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+
+/* Calls the vcvttss2usi32 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned __DEFAULT_FN_ATTRS _mm_cvttss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 (
+      (__v4sf) __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ /* R: rounding control */ \
+  (unsigned long long)__builtin_ia32_vcvttss2usi64( \
+      (__v4sf)(__m128)(A), (int)(R)); })
+
+/* Calls the vcvttss2usi64 builtin on __A with _MM_FROUND_CUR_DIRECTION. */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 (
+      (__v4sf) __A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+/* Forwards to __builtin_ia32_vpermi2varpd512_mask; __I (the index vector) is
+   the builtin's second operand and __U its mask. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+                              __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermi2varpd512_mask (
+      (__v8df) __A, (__v8di) __I /* idx */,
+      (__v8df) __B, (__mmask8) __U);
+}
+
+/* Forwards to __builtin_ia32_vpermi2varps512_mask; __I (the index vector) is
+   the builtin's second operand and __U its mask. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+                              __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermi2varps512_mask (
+      (__v16sf) __A, (__v16si) __I /* idx */,
+      (__v16sf) __B, (__mmask16) __U);
+}
+
+/* Forwards to __builtin_ia32_vpermi2varq512_mask; __I (the index vector) is
+   the builtin's second operand and __U its mask. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+                                 __mmask8 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varq512_mask (
+      (__v8di) __A, (__v8di) __I /* idx */,
+      (__v8di) __B, (__mmask8) __U);
+}
+
+/* Permutes the doubles of X via __builtin_shufflevector. Result element i
+   is taken from index (i & ~1) + ((C >> i) & 1), so bit i of C chooses
+   between the two elements of pair i/2. The second shuffle operand is an
+   undefined vector; every index is below 8, so it is never selected. */
+#define _mm512_permute_pd(X, C) __extension__ ({ \
+  (__m512d)__builtin_shufflevector( \
+      (__v8df)(__m512d)(X), (__v8df)_mm512_undefined_pd(), \
+      0 + (((C) >> 0) & 0x1), 0 + (((C) >> 1) & 0x1), \
+      2 + (((C) >> 2) & 0x1), 2 + (((C) >> 3) & 0x1), \
+      4 + (((C) >> 4) & 0x1), 4 + (((C) >> 5) & 0x1), \
+      6 + (((C) >> 6) & 0x1), 6 + (((C) >> 7) & 0x1)); })
+
+#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permute_pd((X), (C)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permute_pd((X), (C)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_permute_ps(X, C) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                   0  + (((C) >> 0) & 0x3), \
+                                   0  + (((C) >> 2) & 0x3), \
+                                   0  + (((C) >> 4) & 0x3), \
+                                   0  + (((C) >> 6) & 0x3), \
+                                   4  + (((C) >> 0) & 0x3), \
+                                   4  + (((C) >> 2) & 0x3), \
+                                   4  + (((C) >> 4) & 0x3), \
+                                   4  + (((C) >> 6) & 0x3), \
+                                   8  + (((C) >> 0) & 0x3), \
+                                   8  + (((C) >> 2) & 0x3), \
+                                   8  + (((C) >> 4) & 0x3), \
+                                   8  + (((C) >> 6) & 0x3), \
+                                   12 + (((C) >> 0) & 0x3), \
+                                   12 + (((C) >> 2) & 0x3), \
+                                   12 + (((C) >> 4) & 0x3), \
+                                   12 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_permute_ps((X), (C)), \
+                                      (__v16sf)(__m512)(W)); })
+
+#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_permute_ps((X), (C)), \
+                                      (__v16sf)_mm512_setzero_ps()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutevar_pd (__m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+              (__v8di) __C,
+              (__v8df)
+              _mm512_undefined_pd (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+              (__v8di) __C,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+              (__v8di) __C,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutevar_ps (__m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                   (__v16si) __C,
+                   (__v16sf)
+                   _mm512_undefined_ps (),
+                   (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                   (__v16si) __C,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                   (__v16si) __C,
+                   (__v16sf)
+                   _mm512_setzero_ps (),
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{ /* Unmasked form: the index vector goes first and the mask is all-ones. */
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                    /* idx */ ,
+                    (__v8df) __A,
+                    (__v8df) __B,
+                    (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                    /* idx */ ,
+                    (__v8df) __A,
+                    (__v8df) __B,
+                    (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+                                                         /* idx */ ,
+                                                         (__v8df) __A,
+                                                         (__v8df) __B,
+                                                         (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{ /* Unmasked form: the index vector goes first and the mask is all-ones. */
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                         /* idx */ ,
+                                                         (__v16sf) __A,
+                                                         (__v16sf) __B,
+                                                         (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                         /* idx */ ,
+                                                         (__v16sf) __A,
+                                                         (__v16sf) __B,
+                                                         (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16sf) __A,
+                                                        (__v16sf) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B, __U);
+}
+
+#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)_mm256_undefined_si256(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)(__m256i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)_mm256_setzero_si256(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1, (int)(imm), \
+                                                (int)(R)); })
+
+#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1, (int)(imm), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U), (int)(imm), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U), (int)(I), \
+                                                (int)(R)); })
+
+#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(I), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(I), \
+                                                (int)(R)); })
+
+#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)-1, (int)(imm), \
+                                               (int)(R)); })
+
+#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)-1, (int)(imm), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W), \
+                                               (__mmask8)(U), (int)(I), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W), \
+                                               (__mmask8)(U), (int)(I), \
+                                               (int)(R)); })
+
+#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(I), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(I), \
+                                               (int)(R)); })
+
+#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
+              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
+              (__mmask8) -1,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
+             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U), \
+                                             /* caller's rounding */ (int)(R)); })
+
+#define _mm512_srai_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                         (__v16si)_mm512_setzero_si512(), \
+                                         (__mmask16)-1); })
+
+#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                         (__v16si)(__m512i)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                         (__v16si)_mm512_setzero_si512(), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_srai_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                         (__v8di)_mm512_setzero_si512(), \
+                                         (__mmask8)-1); })
+
+#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                         (__v8di)(__m512i)(W), \
+                                         (__mmask8)(U)); })
+
+#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                         (__v8di)_mm512_setzero_si512(), \
+                                         (__mmask8)(U)); })
+
+#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1); })
+
+#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)(__m512)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)(__m512d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)-1); })
+
+#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)(__m512i)(W), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)-1); })
+
+#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)(__m512i)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), \
+                                   0  + (((M) >> 0) & 0x1), \
+                                   8  + (((M) >> 1) & 0x1), \
+                                   2  + (((M) >> 2) & 0x1), \
+                                   10 + (((M) >> 3) & 0x1), \
+                                   4  + (((M) >> 4) & 0x1), \
+                                   12 + (((M) >> 5) & 0x1), \
+                                   6  + (((M) >> 6) & 0x1), \
+                                   14 + (((M) >> 7) & 0x1)); })
+
+#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), \
+                                  0  + (((M) >> 0) & 0x3), /* 0-15: A */ \
+                                  0  + (((M) >> 2) & 0x3), \
+                                  16 + (((M) >> 4) & 0x3), /* 16-31: B */ \
+                                  16 + (((M) >> 6) & 0x3), \
+                                  4  + (((M) >> 0) & 0x3), \
+                                  4  + (((M) >> 2) & 0x3), \
+                                  20 + (((M) >> 4) & 0x3), \
+                                  20 + (((M) >> 6) & 0x3), \
+                                  8  + (((M) >> 0) & 0x3), \
+                                  8  + (((M) >> 2) & 0x3), \
+                                  24 + (((M) >> 4) & 0x3), \
+                                  24 + (((M) >> 6) & 0x3), \
+                                  12 + (((M) >> 0) & 0x3), \
+                                  12 + (((M) >> 2) & 0x3), \
+                                  28 + (((M) >> 4) & 0x3), \
+                                  28 + (((M) >> 6) & 0x3)); })
+
+/* Hands _mm512_shuffle_ps(A, B, M) and W to selectps_512 under mask U. */
+#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), (__v16sf)(__m512)(W)); })
+
+/* Hands _mm512_shuffle_ps(A, B, M) and a zero vector to selectps_512 (mask U). */
+#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), (__v16sf)_mm512_setzero_ps()); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm_setzero_pd(); R is the rounding operand. */
+#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  /* Rounding operand is fixed to _MM_FROUND_CUR_DIRECTION; __W and __U are
+   * forwarded (after casts) as the vector and mask operands. */
+  return (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)__A, (__v2df)__B,
+                                                   (__v2df)__W, (__mmask8)__U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: W is the vector operand, U the mask, R the rounding operand. */
+#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+  /* Rounding operand is fixed to _MM_FROUND_CUR_DIRECTION; the vector operand
+   * is _mm_setzero_pd() and the mask operand is __U. */
+  return (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)__A, (__v2df)__B,
+                                                   (__v2df)_mm_setzero_pd(),
+                                                   (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz: vector operand is _mm_setzero_pd(), mask is U, rounding operand is R. */
+#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm_setzero_ps(); R is the rounding operand. */
+#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  /* Rounding operand is fixed to _MM_FROUND_CUR_DIRECTION; __W and __U are
+   * forwarded (after casts) as the vector and mask operands. */
+  return (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)__A, (__v4sf)__B,
+                                                  (__v4sf)__W, (__mmask8)__U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: W is the vector operand, U the mask, R the rounding operand. */
+#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B)
+{
+  /* Rounding operand is fixed to _MM_FROUND_CUR_DIRECTION; the vector operand
+   * is _mm_setzero_ps() and the mask operand is __U. */
+  return (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)__A, (__v4sf)__B,
+                                                  (__v4sf)_mm_setzero_ps(),
+                                                  (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz: vector operand is _mm_setzero_ps(), mask is U, rounding operand is R. */
+#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask( \
+      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x4(__m128 __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm512_undefined_ps(). */
+  return (__m512)__builtin_ia32_broadcastf32x4_512((__v4sf)__A,
+                                                   (__v16sf)_mm512_undefined_ps(),
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m512)__builtin_ia32_broadcastf32x4_512((__v4sf)__A,
+                                                   (__v16sf)__O, __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
+{
+  /* maskz: vector operand is _mm512_setzero_ps(), mask operand is __M. */
+  return (__m512)__builtin_ia32_broadcastf32x4_512((__v4sf)__A,
+                                                   (__v16sf)_mm512_setzero_ps(),
+                                                   __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x4(__m256d __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm512_undefined_pd(). */
+  return (__m512d)__builtin_ia32_broadcastf64x4_512((__v4df)__A,
+                                                    (__v8df)_mm512_undefined_pd(),
+                                                    (__mmask8)-1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m512d)__builtin_ia32_broadcastf64x4_512((__v4df)__A,
+                                                    (__v8df)__O, __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
+{
+  /* maskz: vector operand is _mm512_setzero_pd(), mask operand is __M. */
+  return (__m512d)__builtin_ia32_broadcastf64x4_512((__v4df)__A,
+                                                    (__v8df)_mm512_setzero_pd(),
+                                                    __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x4(__m128i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm512_undefined_epi32(). */
+  return (__m512i)__builtin_ia32_broadcasti32x4_512((__v4si)__A,
+                                                    (__v16si)_mm512_undefined_epi32(),
+                                                    (__mmask16)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m512i)__builtin_ia32_broadcasti32x4_512((__v4si)__A,
+                                                    (__v16si)__O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
+{
+  /* maskz: vector operand is _mm512_setzero_si512(), mask operand is __M. */
+  return (__m512i)__builtin_ia32_broadcasti32x4_512((__v4si)__A,
+                                                    (__v16si)_mm512_setzero_si512(),
+                                                    __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x4(__m256i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm512_undefined_epi32(). */
+  return (__m512i)__builtin_ia32_broadcasti64x4_512((__v4di)__A,
+                                                    (__v8di)_mm512_undefined_epi32(),
+                                                    (__mmask8)-1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m512i)__builtin_ia32_broadcasti64x4_512((__v4di)__A,
+                                                    (__v8di)__O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
+{
+  /* maskz: vector operand is _mm512_setzero_si512(), mask operand is __M. */
+  return (__m512i)__builtin_ia32_broadcasti64x4_512((__v4di)__A,
+                                                    (__v8di)_mm512_setzero_si512(),
+                                                    __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A)
+{
+  /* selectpd_512 receives _mm512_broadcastsd_pd(__A) and __O, under mask __M. */
+  return (__m512d)__builtin_ia32_selectpd_512(
+      __M, (__v8df)_mm512_broadcastsd_pd(__A), (__v8df)__O);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A)
+{
+  /* selectpd_512 receives _mm512_broadcastsd_pd(__A) and a zero vector. */
+  return (__m512d)__builtin_ia32_selectpd_512(
+      __M, (__v8df)_mm512_broadcastsd_pd(__A), (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A)
+{
+  /* selectps_512 receives _mm512_broadcastss_ps(__A) and __O, under mask __M. */
+  return (__m512)__builtin_ia32_selectps_512(
+      __M, (__v16sf)_mm512_broadcastss_ps(__A), (__v16sf)__O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A)
+{
+  /* selectps_512 receives _mm512_broadcastss_ps(__A) and a zero vector. */
+  return (__m512)__builtin_ia32_selectps_512(
+      __M, (__v16sf)_mm512_broadcastss_ps(__A), (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi8(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovsdb512_mask(
+      (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovsdb512_mask((__v16si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovsdb512_mask(
+      (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A)
+{ /* __P is cast to __v16qi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovsdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi16(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+  return (__m256i)__builtin_ia32_pmovsdw512_mask(
+      (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m256i)__builtin_ia32_pmovsdw512_mask((__v16si)__A, (__v16hi)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm256_setzero_si256(), mask operand is __M. */
+  return (__m256i)__builtin_ia32_pmovsdw512_mask(
+      (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A)
+{ /* __P is cast to __v16hi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovsdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi8(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovsqb512_mask(
+      (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovsqb512_mask((__v8di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovsqb512_mask(
+      (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v16qi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovsqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi32(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+  return (__m256i)__builtin_ia32_pmovsqd512_mask(
+      (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m256i)__builtin_ia32_pmovsqd512_mask((__v8di)__A, (__v8si)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm256_setzero_si256(), mask operand is __M. */
+  return (__m256i)__builtin_ia32_pmovsqd512_mask(
+      (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v8si * before being handed to the mem builtin. */
+  __builtin_ia32_pmovsqd512mem_mask((__v8si *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi16(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovsqw512_mask(
+      (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovsqw512_mask((__v8di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovsqw512_mask(
+      (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v8hi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovsqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi8(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovusdb512_mask(
+      (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovusdb512_mask(
+      (__v16si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovusdb512_mask(
+      (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A)
+{ /* __P is cast to __v16qi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovusdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi16(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+  return (__m256i)__builtin_ia32_pmovusdw512_mask(
+      (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m256i)__builtin_ia32_pmovusdw512_mask(
+      (__v16si)__A, (__v16hi)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm256_setzero_si256(), mask operand is __M. */
+  return (__m256i)__builtin_ia32_pmovusdw512_mask(
+      (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A)
+{ /* __P is cast to __v16hi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovusdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi8(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovusqb512_mask(
+      (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovusqb512_mask(
+      (__v8di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovusqb512_mask(
+      (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v16qi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovusqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi32(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+  return (__m256i)__builtin_ia32_pmovusqd512_mask(
+      (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m256i)__builtin_ia32_pmovusqd512_mask((__v8di)__A, (__v8si)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm256_setzero_si256(), mask operand is __M. */
+  return (__m256i)__builtin_ia32_pmovusqd512_mask(
+      (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v8si * before being handed to the mem builtin. */
+  __builtin_ia32_pmovusqd512mem_mask((__v8si *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi16(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovusqw512_mask(
+      (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovusqw512_mask((__v8di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovusqw512_mask(
+      (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v8hi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovusqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi8(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovdb512_mask(
+      (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovdb512_mask((__v16si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovdb512_mask(
+      (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A)
+{ /* __P is cast to __v16qi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi16(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+  return (__m256i)__builtin_ia32_pmovdw512_mask(
+      (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m256i)__builtin_ia32_pmovdw512_mask((__v16si)__A, (__v16hi)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm256_setzero_si256(), mask operand is __M. */
+  return (__m256i)__builtin_ia32_pmovdw512_mask(
+      (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A)
+{ /* __P is cast to __v16hi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi8(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovqb512_mask(
+      (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovqb512_mask((__v8di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovqb512_mask(
+      (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v16qi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi32(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+  return (__m256i)__builtin_ia32_pmovqd512_mask(
+      (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m256i)__builtin_ia32_pmovqd512_mask((__v8di)__A, (__v8si)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm256_setzero_si256(), mask operand is __M. */
+  return (__m256i)__builtin_ia32_pmovqd512_mask(
+      (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v8si * before being handed to the mem builtin. */
+  __builtin_ia32_pmovqd512mem_mask((__v8si *)__P, (__v8di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi16(__m512i __A)
+{
+  /* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+  return (__m128i)__builtin_ia32_pmovqw512_mask(
+      (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A)
+{
+  /* Masked: __O is cast and passed as the vector operand, __M as the mask. */
+  return (__m128i)__builtin_ia32_pmovqw512_mask((__v8di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A)
+{
+  /* maskz: vector operand is _mm_setzero_si128(), mask operand is __M. */
+  return (__m128i)__builtin_ia32_pmovqw512_mask(
+      (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A)
+{ /* __P is cast to __v8hi * before being handed to the mem builtin. */
+  __builtin_ia32_pmovqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M);
+}
+
+/* Unmasked: mask operand is -1, vector operand is _mm_undefined_si128(). */
+#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+      (__v4si)_mm_undefined_si128(), (__mmask8)-1); })
+
+/* Masked: W is cast and passed as the vector operand, U as the mask. */
+#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+      (__v4si)(__m128i)(W), (__mmask8)(U)); })
+
+/* maskz: vector operand is _mm_setzero_si128(), mask operand is U. */
+#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+      (__v4si)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm256_undefined_si256(). */
+#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+      (__v4di)_mm256_undefined_si256(), (__mmask8)-1); })
+
+/* Masked: W is cast and passed as the vector operand, U as the mask. */
+#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+      (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+/* maskz: vector operand is _mm256_setzero_si256(), mask operand is U. */
+#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm512_undefined_pd(). */
+#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_insertf64x4_mask( \
+      (__v8df)(__m512d)(A), (__v4df)(__m256d)(B), (int)(imm), \
+      (__v8df)_mm512_undefined_pd(), (__mmask8)-1); })
+
+/* Masked: W is cast and passed as the vector operand, U as the mask. */
+#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_insertf64x4_mask( \
+      (__v8df)(__m512d)(A), (__v4df)(__m256d)(B), (int)(imm), \
+      (__v8df)(__m512d)(W), (__mmask8)(U)); })
+
+/* maskz: vector operand is _mm512_setzero_pd(), mask operand is U. */
+#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_insertf64x4_mask( \
+      (__v8df)(__m512d)(A), (__v4df)(__m256d)(B), (int)(imm), \
+      (__v8df)_mm512_setzero_pd(), (__mmask8)(U)); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm512_setzero_si512(). */
+#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti64x4_mask( \
+      (__v8di)(__m512i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)-1); })
+
+/* Masked: W is cast and passed as the vector operand, U as the mask. */
+#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti64x4_mask( \
+      (__v8di)(__m512i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+/* maskz: vector operand is _mm512_setzero_si512(), mask operand is U. */
+#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti64x4_mask( \
+      (__v8di)(__m512i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm512_undefined_ps(). */
+#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_insertf32x4_mask( \
+      (__v16sf)(__m512)(A), (__v4sf)(__m128)(B), (int)(imm), \
+      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); })
+
+/* Masked: W is cast and passed as the vector operand, U as the mask. */
+#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_insertf32x4_mask( \
+      (__v16sf)(__m512)(A), (__v4sf)(__m128)(B), (int)(imm), \
+      (__v16sf)(__m512)(W), (__mmask16)(U)); })
+
+/* maskz: vector operand is _mm512_setzero_ps(), mask operand is U. */
+#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_insertf32x4_mask( \
+      (__v16sf)(__m512)(A), (__v4sf)(__m128)(B), (int)(imm), \
+      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U)); })
+
+/* Unmasked: mask operand is -1, vector operand is _mm512_setzero_si512(). */
+#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti32x4_mask( \
+      (__v16si)(__m512i)(A), (__v4si)(__m128i)(B), (int)(imm), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)-1); })
+
+/* Masked: W is cast and passed as the vector operand, U as the mask. */
+#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti32x4_mask( \
+      (__v16si)(__m512i)(A), (__v4si)(__m128i)(B), (int)(imm), \
+      (__v16si)(__m512i)(W), (__mmask16)(U)); })
+
+/* maskz: vector operand is _mm512_setzero_si512(), mask operand is U. */
+#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_inserti32x4_mask( \
+      (__v16si)(__m512i)(A), (__v4si)(__m128i)(B), (int)(imm), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)); })
+
+/* B and C are packed into one immediate as ((C)<<2) | (B); mask operand is -1. */
+#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
+      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)); })
+
+/* B and C are packed as ((C)<<2) | (B); W is the vector operand, U the mask. */
+#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
+      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })
+
+/* B and C are packed as ((C)<<2) | (B); zero vector operand, mask U. */
+#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
+      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })
+
+/* Unmasked form with the rounding operand fixed to _MM_FROUND_CUR_DIRECTION.
+ * B and C are packed as ((C)<<2) | (B); vector operand is _mm512_setzero_pd(). */
+#define _mm512_getmant_pd(A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), (__v8df)_mm512_setzero_pd(), \
+      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); })
+
+/* Masked form with the rounding operand fixed to _MM_FROUND_CUR_DIRECTION.
+ * B and C are packed as ((C)<<2) | (B); W is the vector operand, U the mask. */
+#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), (__v8df)(__m512d)(W), \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* maskz form with the rounding operand fixed to _MM_FROUND_CUR_DIRECTION.
+ * B and C are packed as ((C)<<2) | (B); zero vector operand, mask U. */
+#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), (__v8df)_mm512_setzero_pd(), \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION); })
+
+/* B and C are packed into one immediate as ((C)<<2) | (B); mask operand is -1. */
+#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(A), (int)(((C)<<2) | (B)), \
+      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)); })
+
+/* B and C are packed as ((C)<<2) | (B); W is the vector operand, U the mask. */
+#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(A), (int)(((C)<<2) | (B)), \
+      (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })
+
+/* B and C are packed as ((C)<<2) | (B); zero vector operand, mask U. */
+#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(A), (int)(((C)<<2) | (B)), \
+      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)); })
+
+/* Unmasked form with the rounding operand fixed to _MM_FROUND_CUR_DIRECTION.
+ * B and C are packed as ((C)<<2)|(B); vector operand is _mm512_undefined_ps(). */
+#define _mm512_getmant_ps(A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(A), (int)(((C)<<2)|(B)), (__v16sf)_mm512_undefined_ps(), \
+      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); })
+
+/* Masked form with the rounding operand fixed to _MM_FROUND_CUR_DIRECTION.
+ * B and C are packed as ((C)<<2)|(B); W is the vector operand, U the mask. */
+#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(A), (int)(((C)<<2)|(B)), (__v16sf)(__m512)(W), \
+      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_getexp_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_getexp_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_getexp_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\
+  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
+                                        (int const *)(addr), \
+                                        (__v8di)(__m512i)(index), \
+                                        (__mmask8)-1, (int)(scale)); })
+
+#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v8di)(__m512i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
+                                       (double const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
+                                       (double const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
+                                       (long long const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
+                                       (long long const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({\
+  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
+                                       (float const *)(addr), \
+                                       (__v16sf)(__m512)(index), \
+                                       (__mmask16)-1, (int)(scale)); })
+
+#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v16sf)(__m512)(index), \
+                                       (__mmask16)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
+                                        (int const *)(addr), \
+                                        (__v16si)(__m512i)(index), \
+                                        (__mmask16)-1, (int)(scale)); })
+
+#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v16si)(__m512i)(index), \
+                                        (__mmask16)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
+                                       (double const *)(addr), \
+                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
+                                       (double const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
+                                       (long long const *)(addr), \
+                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
+                                       (long long const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \
+                                (__v8di)(__m512i)(index), \
+                                (__v8sf)(__m256)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \
+                                (__v8di)(__m512i)(index), \
+                                (__v8sf)(__m256)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \
+                                (__v8di)(__m512i)(index), \
+                                (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \
+                                (__v8di)(__m512i)(index), \
+                                (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \
+                               (__v8di)(__m512i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \
+                               (__v8di)(__m512i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \
+                               (__v8di)(__m512i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \
+                               (__v8di)(__m512i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \
+                                (__v16si)(__m512i)(index), \
+                                (__v16sf)(__m512)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \
+                                (__v16si)(__m512i)(index), \
+                                (__v16sf)(__m512)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \
+                                (__v16si)(__m512i)(index), \
+                                (__v16si)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \
+                                (__v16si)(__m512i)(index), \
+                                (__v16si)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+          -(__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          -(__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          -(__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+          -(__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
+          (__v4sf) __B,
+          -(__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+          (__v4sf) __X,
+          -(__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+          -(__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          -(__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          -(__v2df)(__m128d)(C), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          -(__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          -(__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+          -(__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
+          (__v2df) __B,
+          -(__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          -(__v2df)(__m128d)(C), \
+                                          (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W),
+          (__v2df) __X,
+          -(__v2df) (__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          -(__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm512_permutex_pd(X, C) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permutex_pd((X), (C)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permutex_pd((X), (C)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_permutex_epi64(X, C) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
+                                      (__v8di)(__m512i)(W)); })
+
+#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
+                                      (__v8di)_mm512_setzero_si512()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_undefined_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_undefined_epi32 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) __W,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_undefined_ps (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_setzero_ps (),
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) __W,
+                 __M);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
+}
+
+/* Loads a 512-bit integer vector from __P via the movntdqa512 builtin; __P is only read. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_stream_load_si512 (void const *__P)
+{
+  return (__m512i) __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+  /* Same builtin as _mm512_mask_compress_pd, with _mm512_setzero_pd() in place of __W. */
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df) _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+  /* Same builtin as _mm512_mask_compress_epi64, with _mm512_setzero_si512() in place of __W. */
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di) _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+  /* Same builtin as _mm512_mask_compress_ps, with _mm512_setzero_ps() in place of __W. */
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf) _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+  /* Same builtin as _mm512_mask_compress_epi32, with _mm512_setzero_si512() in place of __W. */
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si) _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)-1, \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)(M), \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)-1, \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)(M), \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_movehdup_ps(__A),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_movehdup_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_moveldup_ps(__A),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_moveldup_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   0  + (((I) >> 0) & 0x3), \
+                                   0  + (((I) >> 2) & 0x3), \
+                                   0  + (((I) >> 4) & 0x3), \
+                                   0  + (((I) >> 6) & 0x3), \
+                                   4  + (((I) >> 0) & 0x3), \
+                                   4  + (((I) >> 2) & 0x3), \
+                                   4  + (((I) >> 4) & 0x3), \
+                                   4  + (((I) >> 6) & 0x3), \
+                                   8  + (((I) >> 0) & 0x3), \
+                                   8  + (((I) >> 2) & 0x3), \
+                                   8  + (((I) >> 4) & 0x3), \
+                                   8  + (((I) >> 6) & 0x3), \
+                                   12 + (((I) >> 0) & 0x3), \
+                                   12 + (((I) >> 2) & 0x3), \
+                                   12 + (((I) >> 4) & 0x3), \
+                                   12 + (((I) >> 6) & 0x3)); })
+
+#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)(__m512i)(W)); })
+
+#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)_mm512_setzero_si512()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
+{
+  /* Integer intrinsic: take the zero operand from _mm512_setzero_si512(), not the pd helper. */
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) _mm512_setzero_si512 (), (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) _mm512_setzero_pd(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
+{
+  /* Integer intrinsic: take the zero operand from _mm512_setzero_si512(), not the pd helper. */
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) _mm512_setzero_si512(), (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) _mm512_setzero_ps(),
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
+{
+  /* Integer intrinsic: take the zero operand from _mm512_setzero_si512(), not the ps helper. */
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) _mm512_setzero_si512(), (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+  /* Integer intrinsic: take the zero operand from _mm512_setzero_si512(), not the ps helper. */
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) _mm512_setzero_si512 (), (__mmask16) __U);
+}
+
+#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtps_pd (__m256 __A)
+{
+  /* Unmasked form: mask is (__mmask8) -1 and the second operand is _mm512_undefined_pd(). */
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df) _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+  /* Same builtin as _mm512_mask_cvtps_pd, with _mm512_setzero_pd() in place of __W. */
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+              (__v8df) __A,
+              (__v8df) __W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+              (__v8df) __A,
+              (__v8df) _mm512_setzero_pd ());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+             (__v16sf) __A,
+             (__v16sf) __W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+             (__v16sf) __A,
+             (__v16sf) _mm512_setzero_ps ());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+            (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+            (__mmask16) __U);
+}
+
+#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)_mm_undefined_ps(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
+{
+  /* Same builtin as _mm_mask_cvt_roundsd_ss, passing _MM_FROUND_CUR_DIRECTION as R. */
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf) __A, (__v2df) __B,
+                                             (__v4sf) __W, (__mmask8) __U,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
+{
+  /* Same builtin as _mm_maskz_cvt_roundsd_ss, passing _MM_FROUND_CUR_DIRECTION as R. */
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf) __A, (__v2df) __B,
+                                             (__v4sf) _mm_setzero_ps(),
+                                             (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtss_i32 _mm_cvtss_si32
+#define _mm_cvtsd_i32 _mm_cvtsd_si32
+#define _mm_cvti32_sd _mm_cvtsi32_sd
+#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
+#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+
+#ifdef __x86_64__
+#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+                                     (int)(R)); })
+
+#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+                                     (int)(R)); })
+#endif
+
+#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+
+#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+                                    (int)(R)); })
+
+#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+                                    (int)(R)); })
+#endif
+
+#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)_mm_undefined_pd(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
+{
+  /* Same builtin as _mm_mask_cvt_roundss_sd, passing _MM_FROUND_CUR_DIRECTION as R. */
+  return __builtin_ia32_cvtss2sd_round_mask ((__v2df) __A, (__v4sf) __B,
+                                             (__v2df) __W, (__mmask8) __U,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
+{
+  /* Same builtin as _mm_maskz_cvt_roundss_sd, passing _MM_FROUND_CUR_DIRECTION as R. */
+  return __builtin_ia32_cvtss2sd_round_mask ((__v2df) __A, (__v4sf) __B,
+                                             (__v2df) _mm_setzero_pd(),
+                                             (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
+                                      (unsigned long long)(B), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
+                                     (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
+                                     (unsigned long long)(B), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
+                 __M);
+}
+
+#ifdef __x86_64__
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
+                 __M);
+}
+#endif
+
+/* Builds a vector from 16 ints; the initializer lists them in reverse, so
+   __P becomes element 0 and __A becomes element 15. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi32 (int __A, int __B, int __C, int __D, int __E, int __F,
+     int __G, int __H, int __I, int __J, int __K, int __L,
+     int __M, int __N, int __O, int __P)
+{
+  return __extension__ (__m512i)(__v16si)
+  { __P, __O, __N, __M, __L, __K, __J, __I, __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,           \
+       e8,e9,e10,e11,e12,e13,e14,e15)          \
+  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
+                   (e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+     long long __D, long long __E, long long __F,
+     long long __G, long long __H)
+{
+  return __extension__ (__m512i) (__v8di)
+  { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
+  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+        double __E, double __F, double __G, double __H)
+{
+  return __extension__ (__m512d)
+  { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
+  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+        float __E, float __F, float __G, float __H,
+        float __I, float __J, float __K, float __L,
+        float __M, float __N, float __O, float __P)
+{
+  return __extension__ (__m512)
+  { __P, __O, __N, __M, __L, __K, __J, __I,
+    __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
+                (e4),(e3),(e2),(e1),(e0))
+
+/* ANDs every 32-bit lane of __A with 0x7FFFFFFF; the parameter uses a reserved (__) name. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_abs_ps(__m512 __A)
+{
+  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A);
+}
+
+/* Masked form of _mm512_abs_ps: forwards __W and __K to _mm512_mask_and_epi32. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
+{
+  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A);
+}
+
+/* ANDs every 64-bit lane of __A with 0x7FFFFFFFFFFFFFFF; the parameter uses a reserved (__) name. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_abs_pd(__m512d __A)
+{
+  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__m512i)__A);
+}
+
+/* Masked form of _mm512_abs_pd: forwards __W and __K to _mm512_mask_and_epi64. */
+static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+  return (__m512d)_mm512_mask_and_epi64((__m512i)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__m512i)__A);
+}
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif // __AVX512FINTRIN_H
diff --git a/lib/Headers/avx512ifmaintrin.h b/lib/Headers/avx512ifmaintrin.h
new file mode 100644
index 0000000..5defbae
--- /dev/null
+++ b/lib/Headers/avx512ifmaintrin.h
@@ -0,0 +1,92 @@
+/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAINTRIN_H
+#define __IFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/lib/Headers/avx512ifmavlintrin.h b/lib/Headers/avx512ifmavlintrin.h
new file mode 100644
index 0000000..131ee5c
--- /dev/null
+++ b/lib/Headers/avx512ifmavlintrin.h
@@ -0,0 +1,149 @@
+/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAVLINTRIN_H
+#define __IFMAVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/lib/Headers/avx512pfintrin.h b/lib/Headers/avx512pfintrin.h
new file mode 100644
index 0000000..c7fa3cf
--- /dev/null
+++ b/lib/Headers/avx512pfintrin.h
@@ -0,0 +1,111 @@
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+/* Expands to __builtin_ia32_gatherpfdps; __extension__ marks the GNU statement expression, as the _pd macros here do. */
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdps((__mmask16)(mask), (__v16si)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+/* Unmasked form: passes an all-ones __mmask16 to __builtin_ia32_gatherpfdps; __extension__ marks the statement expression. */
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdps((__mmask16) -1, (__v16si)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+/* Expands to __builtin_ia32_gatherpfqps; __extension__ marks the GNU statement expression, as the _pd macros here do. */
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), (int const *)(addr), (int)(scale), (int)(hint)); })
+
+/* Unmasked form: passes an all-ones __mmask8 to __builtin_ia32_gatherpfqps; __extension__ marks the statement expression. */
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
+                              (__v16si)(__m512i)(index), (int *)(addr), \
+                              (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/lib/Headers/avx512vbmiintrin.h b/lib/Headers/avx512vbmiintrin.h
new file mode 100644
index 0000000..837238e
--- /dev/null
+++ b/lib/Headers/avx512vbmiintrin.h
@@ -0,0 +1,137 @@
+/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIINTRIN_H
+#define __VBMIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+         __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+              (__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+               /* idx */ ,
+               (__v64qi) __A,
+               (__v64qi) __B,
+               (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_undefined_epi32 (),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_setzero_si512(),
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) __W,
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) __W,
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_setzero_si512 (),
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_undefined_epi32 (),
+                (__mmask64) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/lib/Headers/avx512vbmivlintrin.h b/lib/Headers/avx512vbmivlintrin.h
new file mode 100644
index 0000000..105c6d1
--- /dev/null
+++ b/lib/Headers/avx512vbmivlintrin.h
@@ -0,0 +1,247 @@
+/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIVLINTRIN_H
+#define __VBMIVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+              (__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+         __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+              (__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+/* Unmasked form: calls __builtin_ia32_vpermt2varqi128_mask with an all-ones
+ * __mmask16; the index vector __I is passed as the builtin's first operand. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I /* idx */,
+                                                        (__v16qi) __A,
+                                                        (__v16qi) __B,
+                                                        (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+               /* idx */ ,
+               (__v16qi) __A,
+               (__v16qi) __B,
+               (__mmask16)
+               __U);
+}
+
+/* Unmasked form: calls __builtin_ia32_vpermt2varqi256_mask with an all-ones
+ * __mmask32; the index vector __I is passed as the builtin's first operand. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I /* idx */,
+                                                        (__v32qi) __A,
+                                                        (__v32qi) __B,
+                                                        (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+               /* idx */ ,
+               (__v32qi) __A,
+               (__v32qi) __B,
+               (__mmask32)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_undefined_si128 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_setzero_si128 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) __W,
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_undefined_si256 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_setzero_si256 (),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) __W,
+                 (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi) __W,
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_setzero_si128 (),
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi) __W,
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_setzero_si256 (),
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_undefined_si256 (),
+                (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h
index b4542d6..990e992 100644
--- a/lib/Headers/avx512vlbwintrin.h
+++ b/lib/Headers/avx512vlbwintrin.h
@@ -31,6 +31,11 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
 
+/* Returns a __v8hi vector of eight zeros, reinterpreted as __m128i. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_hi(void) {
+  return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
 /* Integer compare */
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
@@ -781,33 +786,33 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
 {
-  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+              (__v16qi) __W,
+              (__v16qi) __A);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
 {
-  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
                (__v32qi) __W,
-               (__mmask32) __U);
+               (__v32qi) __A);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
 {
-  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
                (__v8hi) __W,
-               (__mmask8) __U);
+               (__v8hi) __A);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
 {
-  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
                (__v16hi) __W,
-               (__mmask16) __U);
+               (__v16hi) __A);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -1994,6 +1999,25 @@
                __M);
 }
 
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm256_cvtepi16_epi8 (__m256i __A) {
   return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
@@ -2015,6 +2039,23 @@
                __M);
 }
 
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+/* Mask is __mmask16 (one bit per __v16hi lane), matching the cvtepi16/cvtsepi16 store variants above. */
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) {
+  __builtin_ia32_pmovuswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
@@ -2116,220 +2157,1249 @@
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
-      __m128i __B) {
-  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
+                                           (__v16qi)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128(),
-               (__mmask16) __U);
+_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
+                                           (__v16qi)_mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
-         __m256i __B) {
-  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+                                        (__v32qi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256(),
-               (__mmask32) __U);
+_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+                                        (__v32qi)_mm256_setzero_si256());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-       __m128i __B) {
-  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
+                                           (__v8hi)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128(),
-               (__mmask8) __U);
+_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
+                                           (__v8hi) _mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-          __m256i __B) {
-  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+                                       (__v16hi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256(),
-               (__mmask16) __U);
+_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+                                       (__v16hi)_mm256_setzero_si256());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
-      __m128i __B) {
-  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
+                                           (__v16qi)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128(),
-               (__mmask16) __U);
+_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
+                                           (__v16qi)_mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
-         __m256i __B) {
-  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+                                        (__v32qi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256(),
-               (__mmask32) __U);
+_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+                                        (__v32qi)_mm256_setzero_si256());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-       __m128i __B) {
-  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
+                                           (__v8hi)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128(),
-               (__mmask8) __U);
+_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
+                                           (__v8hi) _mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-          __m256i __B) {
-  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+                                       (__v16hi)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256(),
-               (__mmask16) __U);
+_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+                                       (__v16hi)_mm256_setzero_si256());
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{ /* Result is 8 x i16 (__v8hi), so the mask is __mmask8, matching the maskz form. */
+  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
+                (__v8hi) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
+                (__v8hi)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{ /* Result is 16 x i16 (__v16hi), so the mask is __mmask16, matching the maskz form. */
+  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
+                (__v16hi) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
+                (__v16hi)
+                _mm256_setzero_si256 (),
+                (__mmask16) __U);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{ /* Result is 8 x i16 (__v8hi), so the mask is __mmask8, matching the maskz form. */
+  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
+                (__v8hi) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
+                (__v8hi)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{ /* Result is 16 x i16 (__v16hi), so the mask is __mmask16, matching the maskz form. */
+  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
+                (__v16hi) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
+                (__v16hi)
+                _mm256_setzero_si256 (),
+                (__mmask16) __U);
+}
+
+
 #define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
-                                         (__v16qi)(__m128i)(b), \
-                                         (p), (__mmask16)-1); })
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)-1); })
 
 #define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
-                                         (__v16qi)(__m128i)(b), \
-                                         (p), (__mmask16)(m)); })
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
 
 #define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
-                                          (__v16qi)(__m128i)(b), \
-                                          (p), (__mmask16)-1); })
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)-1); })
 
 #define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
-                                          (__v16qi)(__m128i)(b), \
-                                          (p), (__mmask16)(m)); })
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
 
 #define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
   (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
-                                         (__v32qi)(__m256i)(b), \
-                                         (p), (__mmask32)-1); })
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)-1); })
 
 #define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
   (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
-                                         (__v32qi)(__m256i)(b), \
-                                         (p), (__mmask32)(m)); })
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)(m)); })
 
 #define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
   (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
-                                          (__v32qi)(__m256i)(b), \
-                                          (p), (__mmask32)-1); })
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)-1); })
 
 #define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
   (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
-                                          (__v32qi)(__m256i)(b), \
-                                          (p), (__mmask32)(m)); })
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)(m)); })
 
 #define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
-                                        (__v8hi)(__m128i)(b), \
-                                        (p), (__mmask8)-1); })
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
 
 #define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
-                                        (__v8hi)(__m128i)(b), \
-                                        (p), (__mmask8)(m)); })
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
 
 #define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
-                                         (__v8hi)(__m128i)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
-                                         (__v8hi)(__m128i)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 #define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
-                                         (__v16hi)(__m256i)(b), \
-                                         (p), (__mmask16)-1); })
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)-1); })
 
 #define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
-                                         (__v16hi)(__m256i)(b), \
-                                         (p), (__mmask16)(m)); })
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
 
 #define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
-                                          (__v16hi)(__m256i)(b), \
-                                          (p), (__mmask16)-1); })
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)-1); })
 
 #define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
-                                          (__v16hi)(__m256i)(b), \
-                                          (p), (__mmask16)(m)); })
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+                                      (__v8hi)(__m128i)(W)); })
+
+#define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+                                      (__v8hi)_mm_setzero_hi()); })
+
+#define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+                                      (__v16hi)(__m256i)(W)); })
+
+#define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+                                      (__v16hi)_mm256_setzero_si256()); })
+
+#define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+                                      (__v8hi)(__m128i)(W)); })
+
+#define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+                                      (__v8hi)_mm_setzero_hi()); })
+
+#define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v16hi)(__m256i)(W)); })
+
+#define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v16hi)_mm256_setzero_si256()); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_hi (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+           __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
+             (__v8hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
+             (__v8hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+
+#define _mm_mask_slli_epi16(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
+                                         (__v8hi)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+
+#define _mm_maskz_slli_epi16(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
+                                         (__v8hi)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+
+#define _mm256_mask_slli_epi16(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
+                                         (__v16hi)(__m256i)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm256_maskz_slli_epi16(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
+                                         (__v16hi)_mm256_setzero_si256(), \
+                                         (__mmask16)(U)); })
+
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_hi (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_hi (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+           __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
+             (__v8hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
+             (__v8hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+
+#define _mm_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
+                                         (__v8hi)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+
+#define _mm_maskz_srai_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
+                                         (__v8hi)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+
+#define _mm256_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
+                                         (__v16hi)(__m256i)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm256_maskz_srai_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
+                                         (__v16hi)_mm256_setzero_si256(), \
+                                         (__mmask16)(U)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+           __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
+             (__v8hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
+             (__v8hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+
+#define _mm_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
+                                         (__v8hi)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+
+#define _mm_maskz_srli_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
+                                         (__v8hi)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+
+#define _mm256_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
+                                         (__v16hi)(__m256i)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm256_maskz_srli_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
+                                         (__v16hi)_mm256_setzero_si256(), \
+                                         (__mmask16)(U)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+                (__v8hi) __A,
+                (__v8hi) __W);
+}
+
+/* Zero-masked word move: lane i is __A[i] if bit i of __U is set, else 0. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, (__v8hi) __A,
+                (__v8hi) _mm_setzero_si128 ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+                (__v16hi) __A,
+                (__v16hi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+                (__v16hi) __A,
+                (__v16hi) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+                (__v16qi) __A,
+                (__v16qi) __W);
+}
+
+/* Zero-masked byte move: lane i is __A[i] if bit i of __U is set, else 0. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, (__v16qi) __A,
+                (__v16qi) _mm_setzero_si128 ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+                (__v32qi) __A,
+                (__v32qi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+                (__v32qi) __A,
+                (__v32qi) _mm256_setzero_si256 ());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                 (__v16qi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                 (__v16qi)
+                 _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                 (__v32qi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                 (__v32qi)
+                 _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+                 (__v8hi) __W,
+                 (__mmask8) __U);
+}
+
+/* Zero-masked unaligned load of 16-bit elements from __P: lanes whose bit in
+   __U is clear are zeroed (the zero vector is passed as the merge source). */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+                 (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+                 (__v16hi) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+                 (__v16hi)
+                 _mm256_setzero_si256 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
+                 (__v16qi) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
+                 (__v16qi)
+                 _mm_setzero_si128 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
+                 (__v32qi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
+                 (__v32qi)
+                 _mm256_setzero_si256 (),
+                 (__mmask32) __U);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
+             (__v8hi) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+  __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
+             (__v16hi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+  __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
+             (__v16qi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+  __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
+             (__v32qi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_movepi8_mask (__m128i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_movepi8_mask (__m256i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi16_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_movepi16_mask (__m256i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi8 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi8 (__mmask32 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi16 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi16 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128(__M,
+                                             (__v16qi) _mm_broadcastb_epi8(__A),
+                                             (__v16qi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128(__M,
+                                             (__v16qi) _mm_broadcastb_epi8(__A),
+                                             (__v16qi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256(__M,
+                                             (__v32qi) _mm256_broadcastb_epi8(__A),
+                                             (__v32qi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256(__M,
+                                             (__v32qi) _mm256_broadcastb_epi8(__A),
+                                             (__v32qi) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128(__M,
+                                             (__v8hi) _mm_broadcastw_epi16(__A),
+                                             (__v8hi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128(__M,
+                                             (__v8hi) _mm_broadcastw_epi16(__A),
+                                             (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__M,
+                                             (__v16hi) _mm256_broadcastw_epi16(__A),
+                                             (__v16hi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__M,
+                                             (__v16hi) _mm256_broadcastw_epi16(__A),
+                                             (__v16hi) _mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_undefined_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_undefined_si256 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) __W,
+                 (__mmask16) __M);
+}
+
+#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
+                                          (__v16qi)(__m128i)(B), (int)(N), \
+                                          (__v16qi)(__m128i)(W), \
+                                          (__mmask16)(U)); })
+
+#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
+                                          (__v16qi)(__m128i)(B), (int)(N), \
+                                          (__v16qi)_mm_setzero_si128(), \
+                                          (__mmask16)(U)); })
+
+#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
+  (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
+                                          (__v32qi)(__m256i)(B), (int)(N), \
+                                          (__v32qi)(__m256i)(W), \
+                                          (__mmask32)(U)); })
+
+#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
+  (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
+                                          (__v32qi)(__m256i)(B), (int)(N), \
+                                          (__v32qi)_mm256_setzero_si256(), \
+                                          (__mmask32)(U)); })
+
+/* Unmasked double-block packed SAD on unsigned bytes; the mask is all ones. */
+#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask( \
+      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(imm), \
+      (__v8hi)_mm_setzero_si128(), (__mmask8)-1); })
+
+#define _mm_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)(__m128i)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)_mm_setzero_si128(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_dbsad_epu8(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)_mm256_setzero_si256(), \
+                                           (__mmask16)-1); })
+
+#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)(__m256i)(W), \
+                                           (__mmask16)(U)); })
+
+#define _mm256_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)_mm256_setzero_si256(), \
+                                           (__mmask16)(U)); })
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/lib/Headers/avx512vlcdintrin.h b/lib/Headers/avx512vlcdintrin.h
new file mode 100644
index 0000000..7b02e2e
--- /dev/null
+++ b/lib/Headers/avx512vlcdintrin.h
@@ -0,0 +1,263 @@
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) __W,
+               (__mmask8) __U);
+}
+
+/* Zero-masked conflict detection on the 64-bit elements of __A; lanes whose
+   bit in __U is clear are zeroed. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di)  _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+/* Counts the leading zero bits of each 64-bit element of __A. Unmasked form:
+   the mask operand is all ones ((__mmask8) -1). */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) _mm_setzero_si128 (), (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+/* Zero-masked leading-zero count of each 64-bit element of __A; lanes whose
+   bit in __U is clear are zeroed. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLCDINTRIN_H */
diff --git a/lib/Headers/avx512vldqintrin.h b/lib/Headers/avx512vldqintrin.h
index dfd858e..8187bcd 100644
--- a/lib/Headers/avx512vldqintrin.h
+++ b/lib/Headers/avx512vldqintrin.h
@@ -33,7 +33,7 @@
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
-  return (__m256i) ((__v4di) __A * (__v4di) __B);
+  return (__m256i) ((__v4du) __A * (__v4du) __B);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -55,7 +55,7 @@
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi64 (__m128i __A, __m128i __B) {
-  return (__m128i) ((__v2di) __A * (__v2di) __B);
+  return (__m128i) ((__v2du) __A * (__v2du) __B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -852,101 +852,413 @@
                 (__mmask8) __U);
 }
 
-#define _mm_range_pd(__A, __B, __C) __extension__ ({                         \
-  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+/* Unmasked form: zero vector as fourth operand, all-ones mask. */
+#define _mm_range_pd(A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+      (__v2df)(__m128d)(B), (int)(C), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)-1); })
 
-#define _mm_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({          \
-  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
-                (__v2df) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the fourth operand and U as the mask. */
+#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+      (__v2df)(__m128d)(B), (int)(C), \
+      (__v2df)(__m128d)(W), (__mmask8)(U)); })
 
-#define _mm_maskz_range_pd(__U, __A, __B, __C) __extension__ ({              \
-  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as fourth operand, U as the mask. */
+#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+      (__v2df)(__m128d)(B), (int)(C), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U)); })
 
-#define _mm256_range_pd(__A, __B, __C) __extension__ ({                      \
-  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
-                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+/* Unmasked form: zero vector as fourth operand, all-ones mask. */
+#define _mm256_range_pd(A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+      (__v4df)(__m256d)(B), (int)(C), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)-1); })
 
-#define _mm256_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({       \
-  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
-                (__v4df) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the fourth operand and U as the mask. */
+#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+      (__v4df)(__m256d)(B), (int)(C), \
+      (__v4df)(__m256d)(W), (__mmask8)(U)); })
 
-#define _mm256_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
-  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
-                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as fourth operand, U as the mask. */
+#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+      (__v4df)(__m256d)(B), (int)(C), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)); })
 
-#define _mm_range_ps(__A, __B, __C) __extension__ ({                         \
-  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+/* Unmasked form: zero vector as fourth operand, all-ones mask. */
+#define _mm_range_ps(A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+      (__v4sf)(__m128)(B), (int)(C), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)-1); })
 
-#define _mm_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({          \
-  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
-                (__v4sf) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the fourth operand and U as the mask. */
+#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+      (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)); })
 
-#define _mm_maskz_range_ps(__U, __A, __B, __C) __extension__ ({              \
-  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as fourth operand, U as the mask. */
+#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+      (__v4sf)(__m128)(B), (int)(C), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)); })
 
-#define _mm256_range_ps(__A, __B, __C) __extension__ ({                      \
-  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
-                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+/* Unmasked form: zero vector as fourth operand, all-ones mask. */
+#define _mm256_range_ps(A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+      (__v8sf)(__m256)(B), (int)(C), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); })
 
-#define _mm256_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({       \
-  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
-                (__v8sf) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the fourth operand and U as the mask. */
+#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+      (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U)); })
 
-#define _mm256_maskz_range_ps(__U, __A, __B, __C) __extension__ ({           \
-  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
-                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as fourth operand, U as the mask. */
+#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+      (__v8sf)(__m256)(B), (int)(C), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)); })
 
-#define _mm_reduce_pd(__A, __B) __extension__ ({                \
-  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+#define _mm_reduce_pd(A, B) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)-1); })
 
-#define _mm_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
-  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
-                (__v2df) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the third operand and U as the mask. */
+#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+      (__v2df)(__m128d)(W), (__mmask8)(U)); })
 
-#define _mm_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
-  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
-                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as third operand, U as the mask. */
+#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U)); })
 
-#define _mm256_reduce_pd(__A, __B) __extension__ ({                \
-  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
-                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+#define _mm256_reduce_pd(A, B) __extension__ ({ \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)-1); })
 
-#define _mm256_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
-  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
-                (__v4df) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the third operand and U as the mask. */
+#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+      (__v4df)(__m256d)(W), (__mmask8)(U)); })
 
-#define _mm256_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
-  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
-                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as third operand, U as the mask. */
+#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({ \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)); })
 
-#define _mm_reduce_ps(__A, __B) __extension__ ({                   \
-  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+#define _mm_reduce_ps(A, B) __extension__ ({ \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)-1); })
 
-#define _mm_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({    \
-  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
-                (__v4sf) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the third operand and U as the mask. */
+#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({ \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+      (__v4sf)(__m128)(W), (__mmask8)(U)); })
 
-#define _mm_maskz_reduce_ps(__U, __A, __B) __extension__ ({        \
-  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
-                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as third operand, U as the mask. */
+#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({ \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)); })
 
-#define _mm256_reduce_ps(__A, __B) __extension__ ({                \
-  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
-                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+#define _mm256_reduce_ps(A, B) __extension__ ({ \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); })
 
-#define _mm256_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({ \
-  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
-                (__v8sf) __W, (__mmask8) __U); })
+/* Masked form: W is passed as the third operand and U as the mask. */
+#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+      (__v8sf)(__m256)(W), (__mmask8)(U)); })
 
-#define _mm256_maskz_reduce_ps(__U, __A, __B) __extension__ ({     \
-  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
-                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+/* Zero-masked form: zero vector as third operand, U as the mask. */
+#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({ \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)); })
+
+/* Returns the __mmask8 that __builtin_ia32_cvtd2mask128 yields for __A. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi32_mask(__m128i __A) {
+  return (__mmask8)__builtin_ia32_cvtd2mask128((__v4si)__A);
+}
+
+/* Returns the __mmask8 that __builtin_ia32_cvtd2mask256 yields for __A. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi32_mask(__m256i __A) {
+  return (__mmask8)__builtin_ia32_cvtd2mask256((__v8si)__A);
+}
+
+/* Returns the __m128i that __builtin_ia32_cvtmask2d128 yields for mask __A. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi32(__mmask8 __A) {
+  return (__m128i)__builtin_ia32_cvtmask2d128(__A);
+}
+
+/* Returns the __m256i that __builtin_ia32_cvtmask2d256 yields for mask __A. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi32(__mmask8 __A) {
+  return (__m256i)__builtin_ia32_cvtmask2d256(__A);
+}
+
+/* Returns the __m128i that __builtin_ia32_cvtmask2q128 yields for mask __A. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi64(__mmask8 __A) {
+  return (__m128i)__builtin_ia32_cvtmask2q128(__A);
+}
+
+/* Returns the __m256i that __builtin_ia32_cvtmask2q256 yields for mask __A. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi64(__mmask8 __A) {
+  return (__m256i)__builtin_ia32_cvtmask2q256(__A);
+}
+
+/* Returns the __mmask8 that __builtin_ia32_cvtq2mask128 yields for __A. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi64_mask(__m128i __A) {
+  return (__mmask8)__builtin_ia32_cvtq2mask128((__v2di)__A);
+}
+
+/* Returns the __mmask8 that __builtin_ia32_cvtq2mask256 yields for __A. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi64_mask(__m256i __A) {
+  return (__mmask8)__builtin_ia32_cvtq2mask256((__v4di)__A);
+}
+
+/* Unmasked form: calls __builtin_ia32_broadcastf32x2_256_mask on __A with
+ * _mm256_undefined_ps() as second operand and an all-ones mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x2(__m128 __A) {
+  return (__m256)__builtin_ia32_broadcastf32x2_256_mask(
+      (__v4sf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
+}
+
+/* Masked form: calls __builtin_ia32_broadcastf32x2_256_mask on __A with __O
+ * as second operand and __M as the mask operand. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
+  return (__m256)__builtin_ia32_broadcastf32x2_256_mask(
+      (__v4sf)__A, (__v8sf)__O, __M);
+}
+
+/* Zero-masked form: calls __builtin_ia32_broadcastf32x2_256_mask on __A with
+ * a zero vector as second operand and __M as the mask operand. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
+  return (__m256)__builtin_ia32_broadcastf32x2_256_mask(
+      (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M);
+}
+
+/* Unmasked form: calls __builtin_ia32_broadcastf64x2_256_mask on __A with
+ * _mm256_undefined_pd() as second operand and an all-ones mask. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_f64x2(__m128d __A) {
+  return (__m256d)__builtin_ia32_broadcastf64x2_256_mask(
+      (__v2df)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
+}
+
+/* Masked form: calls __builtin_ia32_broadcastf64x2_256_mask on __A with __O
+ * as second operand and __M as the mask operand. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) {
+  return (__m256d)__builtin_ia32_broadcastf64x2_256_mask(
+      (__v2df)__A, (__v4df)__O, __M);
+}
+
+/* Zero-masked form: calls __builtin_ia32_broadcastf64x2_256_mask on __A with
+ * a zero __m256d (_mm256_setzero_pd, matching the __v4df cast) and mask __M. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
+  return (__m256d)__builtin_ia32_broadcastf64x2_256_mask(
+      (__v2df)__A, (__v4df)_mm256_setzero_pd(), __M);
+}
+
+/* Unmasked form: calls __builtin_ia32_broadcasti32x2_128_mask on __A with
+ * _mm_undefined_si128() as second operand and an all-ones mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcast_i32x2(__m128i __A) {
+  return (__m128i)__builtin_ia32_broadcasti32x2_128_mask(
+      (__v4si)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+/* Masked form: calls __builtin_ia32_broadcasti32x2_128_mask on __A with __O
+ * as second operand and __M as the mask operand. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i)__builtin_ia32_broadcasti32x2_128_mask(
+      (__v4si)__A, (__v4si)__O, __M);
+}
+
+/* Zero-masked form: calls __builtin_ia32_broadcasti32x2_128_mask on __A with
+ * a zero vector as second operand and __M as the mask operand. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
+  return (__m128i)__builtin_ia32_broadcasti32x2_128_mask(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+/* Unmasked form: calls __builtin_ia32_broadcasti32x2_256_mask on __A with
+ * _mm256_undefined_si256() as second operand and an all-ones mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x2(__m128i __A) {
+  return (__m256i)__builtin_ia32_broadcasti32x2_256_mask(
+      (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+/* Masked form: calls __builtin_ia32_broadcasti32x2_256_mask on __A with __O
+ * as second operand and __M as the mask operand. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) {
+  return (__m256i)__builtin_ia32_broadcasti32x2_256_mask(
+      (__v4si)__A, (__v8si)__O, __M);
+}
+
+/* Zero-masked form: calls __builtin_ia32_broadcasti32x2_256_mask on __A with
+ * a zero vector as second operand and __M as the mask operand. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
+  return (__m256i)__builtin_ia32_broadcasti32x2_256_mask(
+      (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+/* Unmasked form: calls __builtin_ia32_broadcasti64x2_256_mask on __A with
+ * _mm256_undefined_si256() as second operand and an all-ones mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i64x2(__m128i __A) {
+  return (__m256i)__builtin_ia32_broadcasti64x2_256_mask(
+      (__v2di)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+/* Masked form: calls __builtin_ia32_broadcasti64x2_256_mask on __A with __O
+ * as second operand and __M as the mask operand. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) {
+  return (__m256i)__builtin_ia32_broadcasti64x2_256_mask(
+      (__v2di)__A, (__v4di)__O, __M);
+}
+
+/* Zero-masked form: calls __builtin_ia32_broadcasti64x2_256_mask on __A with
+ * a zero vector as second operand and __M as the mask operand. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
+  return (__m256i)__builtin_ia32_broadcasti64x2_256_mask(
+      (__v2di)__A, (__v4di)_mm256_setzero_si256(), __M);
+}
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+      (int)(imm), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)-1); })
+
+/* Masked form: W is passed as the third operand and U as the mask. */
+#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+      (int)(imm), \
+      (__v2df)(__m128d)(W), (__mmask8)(U)); })
+
+/* Zero-masked form: zero vector as third operand, U as the mask. */
+#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+      (int)(imm), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U)); })
+
+/* Unmasked form: zero vector as third operand, all-ones mask. */
+#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+      (int)(imm), \
+      (__v2di)_mm_setzero_di(), (__mmask8)-1); })
+
+/* Masked form: W is passed as the third operand and U as the mask. */
+#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+      (int)(imm), \
+      (__v2di)(__m128i)(W), (__mmask8)(U)); })
+
+/* Zero-masked form: zero vector as third operand, U as the mask. */
+#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+      (int)(imm), \
+      (__v2di)_mm_setzero_di(), (__mmask8)(U)); })
+
+/* Unmasked form: forwards A, B and imm to the builtin with a zero vector as
+ * fourth operand and an all-ones mask. */
+#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
+      (__v2df)(__m128d)(B), (int)(imm), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)-1); })
+
+/* Masked form: forwards A, B and imm to the builtin with W as fourth operand
+ * and U as the mask. */
+#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
+      (__v2df)(__m128d)(B), (int)(imm), \
+      (__v4df)(__m256d)(W), (__mmask8)(U)); })
+
+/* Zero-masked form: forwards A, B and imm to the builtin with a zero vector
+ * as fourth operand and U as the mask. */
+#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
+      (__v2df)(__m128d)(B), (int)(imm), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)); })
+
+/* Unmasked form: forwards A, B and imm to the builtin with a zero vector as
+ * fourth operand and an all-ones mask. */
+#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
+      (__v2di)(__m128i)(B), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)-1); })
+
+/* Masked form: forwards A, B and imm to the builtin with W as fourth operand
+ * and U as the mask. */
+#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
+      (__v2di)(__m128i)(B), (int)(imm), \
+      (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+/* Zero-masked form: forwards A, B and imm to the builtin with a zero vector
+ * as fourth operand and U as the mask. */
+#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
+      (__v2di)(__m128i)(B), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd128_mask( \
+      (__v2df)(__m128d)(A), (int)(imm), (__mmask8)(U)); })
+
+#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd128_mask( \
+      (__v2df)(__m128d)(A), (int)(imm), (__mmask8)-1); })
+
+#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd256_mask( \
+      (__v4df)(__m256d)(A), (int)(imm), (__mmask8)(U)); })
+
+#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd256_mask( \
+      (__v4df)(__m256d)(A), (int)(imm), (__mmask8)-1); })
+
+#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps128_mask( \
+      (__v4sf)(__m128)(A), (int)(imm), (__mmask8)(U)); })
+
+#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps128_mask( \
+      (__v4sf)(__m128)(A), (int)(imm), (__mmask8)-1); })
+
+#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps256_mask( \
+      (__v8sf)(__m256)(A), (int)(imm), (__mmask8)(U)); })
+
+#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps256_mask( \
+      (__v8sf)(__m256)(A), (int)(imm), (__mmask8)-1); })
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h
index 8f13536..3c90f64 100644
--- a/lib/Headers/avx512vlintrin.h
+++ b/lib/Headers/avx512vlintrin.h
@@ -29,17 +29,22 @@
 #define __AVX512VLINTRIN_H
 
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
-#define __DEFAULT_FN_ATTRS_BOTH __attribute__((__always_inline__, __nodebug__, __target__("avx512vl, avx512bw")))
+
+/* All-zero __m128i; needs only avx512f so avx512dqintrin.h can use it too. */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+_mm_setzero_di(void) {
+  return (__m128i)(__v2di){ 0LL, 0LL };
+}
 
 /* Integer compare */
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
                                                   __u);
@@ -57,13 +62,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
                                                   __u);
@@ -81,13 +86,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
                                                   __u);
@@ -105,13 +110,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
                                                   __u);
@@ -226,13 +231,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
                                                   __u);
@@ -250,13 +255,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
                                                   __u);
@@ -274,13 +279,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
                                                   __u);
@@ -298,13 +303,13 @@
                                                 __u);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
                                                   __u);
@@ -885,437 +890,352 @@
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_and_si256(__A, __B),
+                                             (__v8si)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_and_si128(__A, __B),
+                                             (__v4si)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-        __m256i __B)
+_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                          (__v8si)_mm256_andnot_si256(__A, __B),
+                                          (__v8si)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+  return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
+                                           __U, __A, __B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-           __m128i __B)
+_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_andnot_si128(__A, __B),
+                                             (__v4si)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              (__mmask8) __U);
+  return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-          __m256i __B)
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
-            (__v8si) __B,
-            (__v8si) __W,
-            (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_or_si256(__A, __B),
+                                             (__v8si)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
-            (__v8si) __B,
-            (__v8si)
-            _mm256_setzero_si256 (),
-            (__mmask8) __U);
+  return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
-            (__v4si) __B,
-            (__v4si) __W,
-            (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_or_si128(__A, __B),
+                                             (__v4si)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
-            (__v4si) __B,
-            (__v4si)
-            _mm_setzero_si128 (),
-            (__mmask8) __U);
+  return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_xor_si256(__A, __B),
+                                             (__v8si)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_xor_si128(__A, __B),
+                                             (__v4si)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di) __W, __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_and_si256(__A, __B),
+                                             (__v4di)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di)
-             _mm256_setzero_pd (),
-             __U);
+  return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_and_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                          (__v4di)_mm256_andnot_si256(__A, __B),
+                                          (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_andnot_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_or_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_or_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_xor_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W, __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_xor_si128(__A, __B),
+                                             (__v2di)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_pd (),
-             __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-        __m256i __B)
-{
-  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W, __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_pd (),
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-           __m128i __B)
-{
-  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W, __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_pd (),
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-          __m256i __B)
-{
-  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
-            (__v4di) __B,
-            (__v4di) __W,
-            (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
-            (__v4di) __B,
-            (__v4di)
-            _mm256_setzero_si256 (),
-            (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
-            (__v2di) __B,
-            (__v2di) __W,
-            (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
-            (__v2di) __B,
-            (__v2di)
-            _mm_setzero_si128 (),
-            (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
-{
-  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di) __W,
-             (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
-{
-  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W,
-             (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
 }
 
 #define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
-                                        (__v4si)(__m128i)(b), \
-                                        (p), (__mmask8)-1); })
+                                        (__v4si)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
 
 #define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
-                                        (__v4si)(__m128i)(b), \
-                                        (p), (__mmask8)(m)); })
+                                        (__v4si)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
 
 #define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
-                                         (__v4si)(__m128i)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v4si)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
-                                         (__v4si)(__m128i)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v4si)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 #define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
-                                        (__v8si)(__m256i)(b), \
-                                        (p), (__mmask8)-1); })
+                                        (__v8si)(__m256i)(b), (int)(p), \
+                                        (__mmask8)-1); })
 
 #define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
-                                        (__v8si)(__m256i)(b), \
-                                        (p), (__mmask8)(m)); })
+                                        (__v8si)(__m256i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
 
 #define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
-                                         (__v8si)(__m256i)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v8si)(__m256i)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
-                                         (__v8si)(__m256i)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v8si)(__m256i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 #define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
-                                        (__v2di)(__m128i)(b), \
-                                        (p), (__mmask8)-1); })
+                                        (__v2di)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
 
 #define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
-                                        (__v2di)(__m128i)(b), \
-                                        (p), (__mmask8)(m)); })
+                                        (__v2di)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
 
 #define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
-                                         (__v2di)(__m128i)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v2di)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
-                                         (__v2di)(__m128i)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v2di)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 #define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
-                                        (__v4di)(__m256i)(b), \
-                                        (p), (__mmask8)-1); })
+                                        (__v4di)(__m256i)(b), (int)(p), \
+                                        (__mmask8)-1); })
 
 #define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
-                                        (__v4di)(__m256i)(b), \
-                                        (p), (__mmask8)(m)); })
+                                        (__v4di)(__m256i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
 
 #define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
-                                         (__v4di)(__m256i)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v4di)(__m256i)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
   (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
-                                         (__v4di)(__m256i)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v4di)(__m256i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 #define _mm256_cmp_ps_mask(a, b, p)  __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
-                                         (__v8sf)(__m256)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v8sf)(__m256)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm256_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
-                                         (__v8sf)(__m256)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v8sf)(__m256)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 #define _mm256_cmp_pd_mask(a, b, p)  __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256)(a), \
-                                         (__v4df)(__m256)(b), \
-                                         (p), (__mmask8)-1); })
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
 #define _mm256_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256)(a), \
-                                         (__v4df)(__m256)(b), \
-                                         (p), (__mmask8)(m)); })
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
-#define _mm128_cmp_ps_mask(a, b, p)  __extension__ ({ \
+#define _mm_cmp_ps_mask(a, b, p)  __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
-                                         (__v4sf)(__m128)(b), \
-                                         (p), (__mmask8)-1); })
+                                         (__v4sf)(__m128)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
-#define _mm128_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+#define _mm_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
   (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
-                                         (__v4sf)(__m128)(b), \
-                                         (p), (__mmask8)(m)); })
+                                         (__v4sf)(__m128)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
-#define _mm128_cmp_pd_mask(a, b, p)  __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128)(a), \
-                                         (__v2df)(__m128)(b), \
-                                         (p), (__mmask8)-1); })
+#define _mm_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), (int)(p), \
+                                         (__mmask8)-1); })
 
-#define _mm128_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128)(a), \
-                                         (__v2df)(__m128)(b), \
-                                         (p), (__mmask8)(m)); })
+#define _mm_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), (int)(p), \
+                                         (__mmask8)(m)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
@@ -2044,58 +1964,58 @@
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
-  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
                 (__v4si) __W,
-                (__mmask8) __U);
+                (__v4si) __A);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
-  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
                 (__v8si) __W,
-                (__mmask8) __U);
+                (__v8si) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
-  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
                  (__v2df) __W,
-                 (__mmask8) __U);
+                 (__v2df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
-  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
                  (__v4df) __W,
-                 (__mmask8) __U);
+                 (__v4df) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
-  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
                 (__v4sf) __W,
-                (__mmask8) __U);
+                (__v4sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
-  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
                 (__v8sf) __W,
-                (__mmask8) __U);
+                (__v8sf) __A);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
-  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
                 (__v2di) __W,
-                (__mmask8) __U);
+                (__v2di) __A);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
-  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
                 (__v4di) __W,
-                (__mmask8) __U);
+                (__v4di) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -3833,61 +3753,79 @@
               __M);
 }
 
-#define _mm_roundscale_pd(__A, __imm) __extension__ ({ \
-  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, \
-                   __imm, (__v2df) _mm_setzero_pd (), (__mmask8) -1); })
+#define _mm_roundscale_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)-1); })
 
 
-#define _mm_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
-  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
-                   (__v2df) __W, (__mmask8) __U); })
+#define _mm_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U)); })
 
 
-#define _mm_maskz_roundscale_pd(__U, __A, __imm) __extension__ ({ \
-  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
-                   (__v2df) _mm_setzero_pd (), (__mmask8) __U); })
+#define _mm_maskz_roundscale_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U)); })
 
 
-#define _mm256_roundscale_pd(__A, __imm) __extension__ ({ \
-  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
-                   (__v4df) _mm256_setzero_pd (), (__mmask8) -1); })
+#define _mm256_roundscale_pd(A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)-1); })
 
 
-#define _mm256_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
-  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
-                   (__v4df) __W, (__mmask8) __U); })
+#define _mm256_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)(__m256d)(W), \
+                                              (__mmask8)(U)); })
 
 
-#define _mm256_maskz_roundscale_pd(__U, __A, __imm)  __extension__ ({ \
-  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
-                   (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+#define _mm256_maskz_roundscale_pd(U, A, imm)  __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)(U)); })
 
-#define _mm_roundscale_ps(__A, __imm)  __extension__ ({ \
-  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
-                  (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+#define _mm_roundscale_ps(A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)-1); })
 
 
-#define _mm_mask_roundscale_ps(__W, __U, __A, __imm)  __extension__ ({ \
-  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
-                  (__v4sf) __W, (__mmask8) __U); })
+#define _mm_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U)); })
 
 
-#define _mm_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
-  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
-                  (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+#define _mm_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U)); })
 
-#define _mm256_roundscale_ps(__A, __imm)  __extension__ ({ \
-  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,__imm, \
-                  (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+#define _mm256_roundscale_ps(A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)-1); })
 
-#define _mm256_mask_roundscale_ps(__W, __U, __A,__imm)  __extension__ ({ \
-  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
-                  (__v8sf) __W, (__mmask8) __U); })
+#define _mm256_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)(__m256)(W), \
+                                             (__mmask8)(U)); })
 
 
-#define _mm256_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
-  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
-                  (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+#define _mm256_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)(U)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_scalef_pd (__m128d __A, __m128d __B) {
@@ -3996,153 +3934,165 @@
                (__mmask8) __U);
 }
 
-#define _mm_i64scatter_pd(__addr,__index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv2df(__addr, (__mmask8) 0xFF, (__v2di) __index, \
-                              (__v2df) __v1, __scale); })
+#define _mm_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
 
-#define _mm_mask_i64scatter_pd(__addr, __mask, __index, __v1, \
-                               __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, \
-                               (__v2df) __v1, __scale); })
+#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
 
+#define _mm_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, \
-        (__v2di) __index, (__v2di) __v1, __scale); })
+#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
-                                  __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,\
-        (__v2di) __v1, __scale); })
+#define _mm256_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
 
-#define _mm256_i64scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,\
-        (__v4di) __index, (__v4df) __v1, __scale); })
+#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
 
-#define _mm256_mask_i64scatter_pd(__addr, __mask, __index, __v1,\
-                                   __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,\
-        (__v4df) __v1, __scale); })
+#define _mm256_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
 
-#define _mm256_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, (__v4di) __index,\
-                               (__v4di) __v1, __scale); })
+#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
 
-#define _mm256_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
-                                      __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,\
-        (__v4di) __v1, __scale); })
+#define _mm_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
 
-#define _mm_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,\
-        (__v2di) __index, (__v4sf) __v1, __scale); })
+#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
 
-#define _mm_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
-                                __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,\
-        (__v4sf) __v1, __scale); })
+#define _mm_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_i64scatter_epi32(__addr, __index, __v1, \
-                              __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,\
-        (__v2di) __index, (__v4si) __v1, __scale); })
+#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_mask_i64scatter_epi32(__addr, __mask, __index, __v1,\
-         __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,\
-        (__v4si) __v1, __scale); })
+#define _mm256_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
 
-#define _mm256_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, (__v4di) __index, \
-                              (__v4sf) __v1, __scale); })
+#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
 
-#define _mm256_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
-                                   __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, \
-        (__v4sf) __v1, __scale); })
+#define _mm256_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
 
-#define _mm256_i64scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, \
-        (__v4di) __index, (__v4si) __v1, __scale); })
+#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({  \
+  __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
 
-#define _mm256_mask_i64scatter_epi32(__addr, __mask, __index, __v1, \
-                                      __scale) __extension__ ({  \
-  __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di) __index, \
-        (__v4si) __v1, __scale); })
+#define _mm_i32scatter_pd(addr, index, v1, scale) __extension__ ({      \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
 
-#define _mm_i32scatter_pd(__addr, __index, __v1,         \
-                          __scale) __extension__ ({      \
-  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, \
-        (__v4si) __index, (__v2df) __v1, __scale); })
+#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({        \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
 
-#define _mm_mask_i32scatter_pd(__addr, __mask, __index, __v1,    \
-                                __scale) __extension__ ({        \
-  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,\
-         (__v2df) __v1, __scale); })
+#define _mm_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_i32scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,                       \
-        (__v4si) __index, (__v2di) __v1, __scale); })
+#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
-         __scale) __extension__ ({                                \
-  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, \
-        (__v2di) __v1, __scale); })
+#define _mm256_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
 
-#define _mm256_i32scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,                      \
-        (__v4si) __index, (__v4df) __v1, __scale); })
+#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
 
-#define _mm256_mask_i32scatter_pd(__addr, __mask, __index, __v1, \
-         __scale) __extension__ ({                                \
-  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, \
-        (__v4df) __v1, __scale); })
+#define _mm256_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
 
-#define _mm256_i32scatter_epi64(__addr, __index, __v1,    \
-                                __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,  \
-        (__v4si) __index, (__v4di) __v1, __scale); })
+#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
 
-#define _mm256_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
-            __scale) __extension__ ({                               \
-  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,   \
-        (__v4di) __v1, __scale); })
+#define _mm_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
 
-#define _mm_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,                   \
-        (__v4si) __index, (__v4sf) __v1, __scale); })
+#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
 
-#define _mm_mask_i32scatter_ps(__addr, __mask, __index, __v1,     \
-                               __scale) __extension__ ({          \
-  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, \
-        (__v4sf) __v1, __scale); })
+#define _mm_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,                       \
-        (__v4si) __index, (__v4si) __v1, __scale); })
+#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
 
-#define _mm_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
-                                  __scale) __extension__ ({      \
-  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,\
-        (__v4si) __v1, __scale); })
+#define _mm256_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale)); })
 
-#define _mm256_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,                      \
-        (__v8si) __index, (__v8sf) __v1, __scale); })
+#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale)); })
 
-#define _mm256_mask_i32scatter_ps(__addr, __mask, __index, __v1, \
-                                   __scale) __extension__ ({     \
-  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,\
-        (__v8sf) __v1, __scale); })
+#define _mm256_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale)); })
 
-#define _mm256_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
-  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,                         \
-        (__v8si) __index, (__v8si) __v1, __scale); })
-
-#define _mm256_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
-            __scale) __extension__ ({                                \
-  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,    \
-        (__v8si) __v1, __scale); })
+#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
@@ -4600,7 +4550,4623 @@
               __U);
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
+                (__v2di)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
+                (__v4di)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
+                (__v2di)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
+                (__v4di)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
+                (__v2di)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
+                (__v4di)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
+                (__v2di)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
+                (__v4di)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
+                (__v2di)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
+                (__v4di)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+/* cvtepu16_epi32, mask/maskz: forward __A to pmovzxwd*_mask (presumably zero-extend). */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+/* cvtepu16_epi64, mask/maskz: forward __A to pmovzxwq*_mask (presumably zero-extend). */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
+                (__v2di)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
+                (__v4di)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+/* rol_epi32 (presumably rotate-left): prold128_mask on (a), (int)(b), zeros, mask -1. */
+#define _mm_rol_epi32(a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)-1); })
+/* Mask form: (w) and (u) are forwarded instead of the zero vector and -1. */
+#define _mm_mask_rol_epi32(w, u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)(__m128i)(w), (__mmask8)(u)); })
+/* Maskz form: zero vector kept, (u) forwarded as the mask. */
+#define _mm_maskz_rol_epi32(u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)(u)); })
+/* 256-bit plain form: prold256_mask on (a), (int)(b), zeros, mask -1. */
+#define _mm256_rol_epi32(a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+/* 256-bit mask form: (w) and (u) are forwarded instead of the zero vector and -1. */
+#define _mm256_mask_rol_epi32(w, u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)(__m256i)(w), (__mmask8)(u)); })
+/* 256-bit maskz form: zero vector kept, (u) forwarded as the mask. */
+#define _mm256_maskz_rol_epi32(u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)(u)); })
+/* rol_epi64 (presumably rotate-left): prolq128_mask on (a), (int)(b), zeros, mask -1. */
+#define _mm_rol_epi64(a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)-1); })
+/* Mask form: (w) and (u) are forwarded instead of the zero vector and -1. */
+#define _mm_mask_rol_epi64(w, u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)(__m128i)(w), (__mmask8)(u)); })
+/* Maskz form: zero vector kept, (u) forwarded as the mask. */
+#define _mm_maskz_rol_epi64(u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)(u)); })
+/* 256-bit plain form: prolq256_mask on (a), (int)(b), zeros, mask -1. */
+#define _mm256_rol_epi64(a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+/* 256-bit mask form: (w) and (u) are forwarded instead of the zero vector and -1. */
+#define _mm256_mask_rol_epi64(w, u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)(__m256i)(w), (__mmask8)(u)); })
+/* 256-bit maskz form: zero vector kept, (u) forwarded as the mask. */
+#define _mm256_maskz_rol_epi64(u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)(u)); })
+/* rolv_epi32 (presumably per-lane rotate-left by __B): wrappers over prolvd*_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rolv_epi32 (__m128i __A, __m128i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rolv_epi32 (__m256i __A, __m256i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+/* rolv_epi64 (presumably per-lane rotate-left by __B): wrappers over prolvq*_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rolv_epi64 (__m128i __A, __m128i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rolv_epi64 (__m256i __A, __m256i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+/* ror_epi32 (presumably rotate-right): prord128_mask on (A), (int)(B), zeros, mask -1. */
+#define _mm_ror_epi32(A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)-1); })
+/* Mask form: (W) and (U) are forwarded instead of the zero vector and -1. */
+#define _mm_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)(__m128i)(W), (__mmask8)(U)); })
+/* Maskz form: zero vector kept, (U) forwarded as the mask. */
+#define _mm_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)(U)); })
+/* 256-bit plain form: prord256_mask on (A), (int)(B), zeros, mask -1. */
+#define _mm256_ror_epi32(A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+/* 256-bit mask form: (W) and (U) are forwarded instead of the zero vector and -1. */
+#define _mm256_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)(__m256i)(W), (__mmask8)(U)); })
+/* 256-bit maskz form: zero vector kept, (U) forwarded as the mask. */
+#define _mm256_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)(U)); })
+/* ror_epi64 (presumably rotate-right): prorq128_mask on (A), (int)(B), zeros, mask -1. */
+#define _mm_ror_epi64(A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)-1); })
+/* Mask form: (W) and (U) are forwarded instead of the zero vector and -1. */
+#define _mm_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)(__m128i)(W), (__mmask8)(U)); })
+/* Maskz form: zero vector kept, (U) forwarded as the mask. */
+#define _mm_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)(U)); })
+/* 256-bit plain form: prorq256_mask on (A), (int)(B), zeros, mask -1. */
+#define _mm256_ror_epi64(A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+/* 256-bit mask form: (W) and (U) are forwarded instead of the zero vector and -1. */
+#define _mm256_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)(__m256i)(W), (__mmask8)(U)); })
+/* 256-bit maskz form: zero vector kept, (U) forwarded as the mask. */
+#define _mm256_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)(U)); })
+/* sll_epi32, mask/maskz: wrappers over pslld*_mask; __B is __m128i at both widths. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+             (__v4si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+             (__v4si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* slli_epi32 mask form: pslldi128_mask on (A), (int)(B), (W), (U). */
+#define _mm_mask_slli_epi32(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                         (__v4si)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+/* Maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm_maskz_slli_epi32(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                         (__v4si)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+/* 256-bit mask form: pslldi256_mask on (A), (int)(B), (W), (U). */
+#define _mm256_mask_slli_epi32(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                         (__v8si)(__m256i)(W), \
+                                         (__mmask8)(U)); })
+/* 256-bit maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm256_maskz_slli_epi32(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                         (__v8si)_mm256_setzero_si256(), \
+                                         (__mmask8)(U)); })
+/* sll_epi64, mask/maskz: wrappers over psllq*_mask; __B is __m128i at both widths. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_di (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* slli_epi64 mask form: psllqi128_mask on (A), (int)(B), (W), (U). */
+#define _mm_mask_slli_epi64(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                         (__v2di)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+/* Maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm_maskz_slli_epi64(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                         (__v2di)_mm_setzero_di(), \
+                                         (__mmask8)(U)); })
+/* 256-bit mask form: psllqi256_mask on (A), (int)(B), (W), (U). */
+#define _mm256_mask_slli_epi64(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                         (__v4di)(__m256i)(W), \
+                                         (__mmask8)(U)); })
+/* 256-bit maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm256_maskz_slli_epi64(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                         (__v4di)_mm256_setzero_si256(), \
+                                         (__mmask8)(U)); })
+
+/* rorv_epi32 (presumably per-lane rotate-right by __B): wrappers over prorvd*_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rorv_epi32 (__m128i __A, __m128i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rorv_epi32 (__m256i __A, __m256i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+/* rorv_epi64 (presumably per-lane rotate-right by __B): wrappers over prorvq*_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rorv_epi64 (__m128i __A, __m128i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rorv_epi64 (__m256i __A, __m256i __B)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+/* sllv_epi64, mask/maskz: wrappers over psllv2di_mask and psllv4di_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+         __m128i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
+             (__v2di) __Y,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
+             (__v2di) __Y,
+             (__v2di)
+             _mm_setzero_di (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+      __m256i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
+             (__v4di) __Y,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
+             (__v4di) __Y,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* sllv_epi32, mask/maskz: wrappers over psllv4si_mask and psllv8si_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+         __m128i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
+             (__v4si) __Y,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
+             (__v4si) __Y,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+      __m256i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
+             (__v8si) __Y,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
+             (__v8si) __Y,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+
+/* srlv_epi64, mask/maskz: wrappers over psrlv2di_mask and psrlv4di_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+         __m128i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
+             (__v2di) __Y,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
+             (__v2di) __Y,
+             (__v2di)
+             _mm_setzero_di (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+      __m256i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
+             (__v4di) __Y,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
+             (__v4di) __Y,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* srlv_epi32, mask/maskz: wrappers over psrlv4si_mask and psrlv8si_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+         __m128i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
+             (__v4si) __Y,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
+             (__v4si) __Y,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+      __m256i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
+             (__v8si) __Y,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
+             (__v8si) __Y,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+
+/* srl_epi32, mask/maskz: wrappers over psrld*_mask; __B is __m128i at both widths. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
+             (__v4si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
+             (__v4si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* srli_epi32 mask form: psrldi128_mask on (A), (int)(imm), (W), (U). */
+#define _mm_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
+                                         (__v4si)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+/* Maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm_maskz_srli_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
+                                         (__v4si)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+/* 256-bit mask form: psrldi256_mask on (A), (int)(imm), (W), (U). */
+#define _mm256_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
+                                         (__v8si)(__m256i)(W), \
+                                         (__mmask8)(U)); })
+/* 256-bit maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm256_maskz_srli_epi32(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
+                                         (__v8si)_mm256_setzero_si256(), \
+                                         (__mmask8)(U)); })
+/* srl_epi64, mask/maskz: wrappers over psrlq*_mask; __B is __m128i at both widths. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_di (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m128i __B)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* srli_epi64 mask form: psrlqi128_mask on (A), (int)(imm), (W), (U). */
+#define _mm_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
+                                         (__v2di)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+/* Maskz form: zeros (via _mm_setzero_si128, unlike _mm_maskz_slli_epi64) replace (W). */
+#define _mm_maskz_srli_epi64(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
+                                         (__v2di)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+/* 256-bit mask form: psrlqi256_mask on (A), (int)(imm), (W), (U). */
+#define _mm256_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
+                                         (__v4di)(__m256i)(W), \
+                                         (__mmask8)(U)); })
+/* 256-bit maskz form: an all-zero vector is forwarded in place of (W). */
+#define _mm256_maskz_srli_epi64(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
+                                         (__v4di)_mm256_setzero_si256(), \
+                                         (__mmask8)(U)); })
+/* srav_epi32, mask/maskz: wrappers over psrav4si_mask and psrav8si_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+         __m128i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
+             (__v4si) __Y,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
+             (__v4si) __Y,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+      __m256i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
+             (__v8si) __Y,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
+             (__v8si) __Y,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+/* srav_epi64, plain/mask/maskz: wrappers over psravq128_mask and psravq256_mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi64 (__m128i __X, __m128i __Y)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+         __m128i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi64 (__m256i __X, __m256i __Y)
+{ /* plain form: all-zero vector and (__mmask8) -1, no __W/__U */
+  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+      __m256i __Y)
+{ /* mask form: __W and __U are forwarded to the builtin */
+  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{ /* maskz form: an all-zero vector takes the place of __W */
+  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+/* mov_epi32, mask/maskz: call selectd_128/selectd_256 with __U, __A, then __W or zeros. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{ /* mask form: __W is the last select operand */
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                 (__v4si) __A,
+                 (__v4si) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
+{ /* maskz form: an all-zero vector is the last select operand */
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                 (__v4si) __A,
+                 (__v4si) _mm_setzero_si128 ());
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{ /* mask form: __W is the last select operand */
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                 (__v8si) __A,
+                 (__v8si) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
+{ /* maskz form: an all-zero vector is the last select operand */
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                 (__v8si) __A,
+                 (__v8si) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via movdqa32load128_mask; __W is the second operand, __U the mask. */
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, /* cast drops const */
+              (__v4si) __W,
+              (__mmask8)
+              __U); /* NOTE(review): "movdqa" suggests __P must be aligned - confirm */
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with a zero vector in place of __W. */
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: movdqa32load256_mask with __v8si operands (__P, __W, __U). */
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{ /* 256-bit form with a zero vector in place of __W. */
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{ /* Hands (__P, __A, __U) to movdqa32store128_mask; nothing is returned. */
+  __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
+          (__v4si) __A,
+          (__mmask8) __U); /* NOTE(review): "movdqa" suggests __P must be aligned - confirm */
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{ /* 256-bit form: movdqa32store256_mask with __v8si operands. */
+  __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
+          (__v8si) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{ /* Calls selectq_128 with (__U, __A, __W); presumably yields __A where a __U bit is set, else __W (confirm). */
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                 (__v2di) __A,
+                 (__v2di) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
+{ /* Same select as the mask form, with the _mm_setzero_di() vector in place of __W. */
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                 (__v2di) __A,
+                 (__v2di) _mm_setzero_di ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{ /* 256-bit form: selectq_256 over __v4di operands, arguments (__U, __A, __W). */
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                 (__v4di) __A,
+                 (__v4di) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
+{ /* 256-bit select with a zero vector in place of __W. */
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                 (__v4di) __A,
+                 (__v4di) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via movdqa64load128_mask; __W is the second operand, __U the mask. */
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, /* cast drops const */
+              (__v2di) __W,
+              (__mmask8)
+              __U); /* NOTE(review): "movdqa" suggests __P must be aligned - confirm */
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with the _mm_setzero_di() vector in place of __W. */
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: movdqa64load256_mask with __v4di operands (__P, __W, __U). */
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{ /* 256-bit form with a zero vector in place of __W. */
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{ /* Hands (__P, __A, __U) to movdqa64store128_mask; nothing is returned. */
+  __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
+          (__v2di) __A,
+          (__mmask8) __U); /* NOTE(review): "movdqa" suggests __P must be aligned - confirm */
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{ /* 256-bit form: movdqa64store256_mask with __v4di operands. */
+  __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
+          (__v4di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{ /* Evaluates _mm_movedup_pd(__A) unconditionally, then selectpd_128(__U, that result, __W). */
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_movedup_pd(__A),
+                                              (__v2df)__W); /* presumably used where a __U bit is clear - confirm */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
+{ /* Same select as the mask form, with _mm_setzero_pd() in place of __W. */
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_movedup_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{ /* 256-bit form: selectpd_256(__U, _mm256_movedup_pd(__A), __W). */
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_movedup_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
+{ /* 256-bit form with _mm256_setzero_pd() in place of __W. */
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_movedup_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+
+#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \
+  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
+                                                  (__v4si)(__m128i)(O), \
+                                                  (__mmask8)(M)); }) /* statement-expression macro: A is cast to int, O to __v4si, M to __mmask8 */
+
+#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \
+  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
+                                                  (__v4si)_mm_setzero_si128(), \
+                                                  (__mmask8)(M)); }) /* same builtin; a zero vector replaces O */
+
+#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \
+  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
+                                                  (__v8si)(__m256i)(O), \
+                                                  (__mmask8)(M)); }) /* 256-bit form: O is cast to __v8si */
+
+#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \
+  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
+                                                  (__v8si)_mm256_setzero_si256(), \
+                                                  (__mmask8)(M)); }) /* 256-bit form; a zero vector replaces O */
+
+#ifdef __x86_64__ /* the four set1_epi64 wrappers below are only declared when __x86_64__ is defined */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
+{ /* Passes the scalar __A, __O (as __v2di) and __M to pbroadcastq128_gpr_mask. */
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{ /* Same builtin as the mask form, with a zero vector in place of __O. */
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
+{ /* 256-bit form: pbroadcastq256_gpr_mask with (__A, __O as __v4di, __M). */
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{ /* 256-bit form with a zero vector in place of __O. */
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 __M);
+}
+#endif /* __x86_64__ */
+
+#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)-1); }) /* unmasked form: the _mask builtin with an all-ones mask */
+
+#define _mm_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); }) /* note the macro takes U second, but the builtin takes it last */
+
+#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2di)(__m128i)(C), \
+                                              (int)(imm), (__mmask8)(U)); }) /* uses the separate _maskz builtin; U comes first in the macro */
+
+#define _mm256_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+                                             (__v4df)(__m256d)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)-1); }) /* 256-bit unmasked form: all-ones mask */
+
+#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+                                             (__v4df)(__m256d)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); }) /* 256-bit mask form: U is the second macro argument */
+
+#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (__v4di)(__m256i)(C), \
+                                              (int)(imm), (__mmask8)(U)); }) /* 256-bit form of the _maskz builtin */
+
+#define _mm_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); }) /* unmasked form: the _mask builtin with an all-ones mask */
+
+#define _mm_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); }) /* note the macro takes U second, but the builtin takes it last */
+
+#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4si)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); }) /* uses the separate _maskz builtin; U comes first in the macro */
+
+#define _mm256_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+                                            (__v8sf)(__m256)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); }) /* 256-bit unmasked form: all-ones mask */
+
+#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+                                            (__v8sf)(__m256)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); }) /* 256-bit mask form: U is the second macro argument */
+
+#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), \
+                                             (__v8si)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); }) /* 256-bit form of the _maskz builtin */
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via loadapd128_mask; __W is the second operand, __U the mask. */
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, /* cast drops const */
+               (__v2df) __W,
+               (__mmask8) __U); /* NOTE(review): "loada" suggests __P must be aligned - confirm */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_pd (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with _mm_setzero_pd() in place of __W. */
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: loadapd256_mask with __v4df operands (__P, __W, __U). */
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+               (__v4df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
+{ /* 256-bit form with _mm256_setzero_pd() in place of __W. */
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via loadaps128_mask; __W is the second operand, __U the mask. */
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, /* cast drops const */
+              (__v4sf) __W,
+              (__mmask8) __U); /* NOTE(review): "loada" suggests __P must be aligned - confirm */
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ps (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with _mm_setzero_ps() in place of __W. */
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: loadaps256_mask with __v8sf operands (__P, __W, __U). */
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+              (__v8sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
+{ /* 256-bit form with _mm256_setzero_ps() in place of __W. */
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via loaddqudi128_mask; __W is the second operand, __U the mask. */
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P, /* cast drops const */
+                 (__v2di) __W,
+                 (__mmask8) __U); /* NOTE(review): "loadu" suggests no alignment requirement - confirm */
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with a zero vector in place of __W. */
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: loaddqudi256_mask with __v4di operands (__P, __W, __U). */
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{ /* 256-bit form with a zero vector in place of __W. */
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via loaddqusi128_mask; __W is the second operand, __U the mask. */
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P, /* cast drops const */
+                 (__v4si) __W,
+                 (__mmask8) __U); /* NOTE(review): "loadu" suggests no alignment requirement - confirm */
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with a zero vector in place of __W. */
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: loaddqusi256_mask with __v8si operands (__P, __W, __U). */
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{ /* 256-bit form with a zero vector in place of __W. */
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via loadupd128_mask; __W is the second operand, __U the mask. */
+  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P, /* cast drops const */
+               (__v2df) __W,
+               (__mmask8) __U); /* NOTE(review): "loadu" suggests no alignment requirement - confirm */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with _mm_setzero_pd() in place of __W. */
+  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: loadupd256_mask with __v4df operands (__P, __W, __U). */
+  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+               (__v4df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{ /* 256-bit form with _mm256_setzero_pd() in place of __W. */
+  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{ /* Masked load through __P via loadups128_mask; __W is the second operand, __U the mask. */
+  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P, /* cast drops const */
+              (__v4sf) __W,
+              (__mmask8) __U); /* NOTE(review): "loadu" suggests no alignment requirement - confirm */
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{ /* Same builtin as the mask form, with _mm_setzero_ps() in place of __W. */
+  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{ /* 256-bit form: loadups256_mask with __v8sf operands (__P, __W, __U). */
+  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+              (__v8sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{ /* 256-bit form with _mm256_setzero_ps() in place of __W. */
+  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{ /* Hands (__P, __A, __U) to storeapd128_mask; nothing is returned. */
+  __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U); /* NOTE(review): "storea" suggests __P must be aligned - confirm */
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{ /* 256-bit form: storeapd256_mask with __v4df operands. */
+  __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{ /* Single-precision form: storeaps128_mask with __v4sf operands. */
+  __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{ /* 256-bit single-precision form: storeaps256_mask with __v8sf operands. */
+  __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{ /* Hands (__P, __A, __U) to storedqudi128_mask; nothing is returned. */
+  __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
+             (__v2di) __A,
+             (__mmask8) __U); /* NOTE(review): "dqu" suggests no alignment requirement - confirm */
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{ /* 256-bit form: storedqudi256_mask with __v4di operands. */
+  __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
+             (__v4di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{ /* 32-bit-element form: storedqusi128_mask with __v4si operands. */
+  __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
+             (__v4si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{ /* 256-bit 32-bit-element form: storedqusi256_mask with __v8si operands. */
+  __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
+             (__v8si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{ /* Hands (__P, __A, __U) to storeupd128_mask; nothing is returned. */
+  __builtin_ia32_storeupd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U); /* NOTE(review): "storeu" suggests no alignment requirement - confirm */
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{ /* 256-bit form: storeupd256_mask with __v4df operands. */
+  __builtin_ia32_storeupd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{ /* Single-precision form: storeups128_mask with __v4sf operands. */
+  __builtin_ia32_storeups128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{ /* 256-bit single-precision form: storeups256_mask with __v8sf operands. */
+  __builtin_ia32_storeups256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{ /* Evaluates _mm_unpackhi_pd(__A, __B) unconditionally, then selectpd_128(__U, that result, __W). */
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpackhi_pd(__A, __B),
+                                              (__v2df)__W); /* presumably used where a __U bit is clear - confirm */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{ /* Same select as the mask form, with _mm_setzero_pd() in place of __W. */
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpackhi_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{ /* 256-bit form: selectpd_256(__U, _mm256_unpackhi_pd(__A, __B), __W). */
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
+                                           (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{ /* 256-bit form with _mm256_setzero_pd() in place of __W. */
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
+                                           (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{ /* Evaluates _mm_unpackhi_ps(__A, __B) unconditionally, then selectps_128(__U, that result, __W). */
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
+                                             (__v4sf)__W); /* presumably used where a __U bit is clear - confirm */
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{ /* Same select as the mask form, with _mm_setzero_ps() in place of __W. */
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{ /* 256-bit form: selectps_256(__U, _mm256_unpackhi_ps(__A, __B), __W). */
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                           (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{ /* 256-bit form with _mm256_setzero_ps() in place of __W. */
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                           (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{ /* Evaluates _mm_unpacklo_pd(__A, __B) unconditionally, then selectpd_128(__U, that result, __W). */
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpacklo_pd(__A, __B),
+                                              (__v2df)__W); /* presumably used where a __U bit is clear - confirm */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{ /* Same select as the mask form, with _mm_setzero_pd() in place of __W. */
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpacklo_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{ /* 256-bit form: selectpd_256(__U, _mm256_unpacklo_pd(__A, __B), __W). */
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
+                                           (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{ /* 256-bit form with _mm256_setzero_pd() in place of __W. */
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
+                                           (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{ /* Evaluates _mm_unpacklo_ps(__A, __B) unconditionally, then selectps_128(__U, that result, __W). */
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
+                                             (__v4sf)__W); /* presumably used where a __U bit is clear - confirm */
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{ /* Same select as the mask form, with _mm_setzero_ps() in place of __W. */
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{ /* 256-bit form: selectps_256(__U, _mm256_unpacklo_ps(__A, __B), __W). */
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                           (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{ /* 256-bit form with _mm256_setzero_ps() in place of __W. */
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                           (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_pd (__m128d __A)
+{ /* Unmasked form: rcp14pd128_mask on __A with a zero second operand and every mask bit set. */
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1); /* -1 converts to an all-ones mask */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{ /* Passes __W as the second operand and __U as the mask. */
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df) __W, /* presumably supplies lanes whose __U bit is clear - confirm */
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{ /* Same call as the mask form, with _mm_setzero_pd() in place of __W. */
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rcp14_pd (__m256d __A)
+{ /* 256-bit unmasked form: rcp14pd256_mask with a zero second operand and an all-ones mask. */
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{ /* 256-bit mask form: operands (__A, __W, __U). */
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{ /* 256-bit form with _mm256_setzero_pd() in place of __W. */
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ps (__m128 __A)
+{ /* Unmasked form: rcp14ps128_mask on __A with a zero second operand and every mask bit set. */
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1); /* -1 converts to an all-ones mask */
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{ /* Passes __W as the second operand and __U as the mask. */
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf) __W, /* presumably supplies lanes whose __U bit is clear - confirm */
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{ /* Same call as the mask form, with _mm_setzero_ps() in place of __W. */
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp14_ps (__m256 __A)
+{ /* 256-bit unmasked form: rcp14ps256_mask with a zero second operand and an all-ones mask. */
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{ /* 256-bit mask form: operands (__A, __W, __U). */
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{ /* 256-bit form with _mm256_setzero_ps() in place of __W. */
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_permute_pd((X), (C)), \
+                                       (__v2df)(__m128d)(W)); }) /* statement-expression macro: selectpd_128(U, _mm_permute_pd(X, C), W) */
+
+#define _mm_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_permute_pd((X), (C)), \
+                                       (__v2df)_mm_setzero_pd()); }) /* same select; a zero vector replaces W */
+
+#define _mm256_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permute_pd((X), (C)), \
+                                       (__v4df)(__m256d)(W)); }) /* 256-bit form over _mm256_permute_pd(X, C) */
+
+#define _mm256_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permute_pd((X), (C)), \
+                                       (__v4df)_mm256_setzero_pd()); }) /* 256-bit form; a zero vector replaces W */
+
+#define _mm_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_permute_ps((X), (C)), \
+                                      (__v4sf)(__m128)(W)); }) /* statement-expression macro: selectps_128(U, _mm_permute_ps(X, C), W) */
+
+#define _mm_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_permute_ps((X), (C)), \
+                                      (__v4sf)_mm_setzero_ps()); }) /* same select; a zero vector replaces W */
+
+#define _mm256_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_permute_ps((X), (C)), \
+                                      (__v8sf)(__m256)(W)); }) /* 256-bit form over _mm256_permute_ps(X, C) */
+
+#define _mm256_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_permute_ps((X), (C)), \
+                                      (__v8sf)_mm256_setzero_ps()); }) /* 256-bit form; a zero vector replaces W */
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
+      __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+                 (__v2di) __C,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+                 (__v2di) __C,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+/* Calls __builtin_ia32_vpermilvarpd256_mask on __A and __C, passing __W as
+   the third operand and __U as the mask. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A,
+                          __m256i __C)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256_mask((__v4df)__A, (__v4di)__C,
+                                                      (__v4df)__W,
+                                                      (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_vpermilvarpd256_mask on __A and __C, passing
+   _mm256_setzero_pd() as the third operand and __U as the mask. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256_mask((__v4df)__A,
+                                                      (__v4di)__C,
+                                                      (__v4df)_mm256_setzero_pd(),
+                                                      (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_vpermilvarps_mask on __A and __C, passing __W as the
+   third operand and __U as the mask. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128)__builtin_ia32_vpermilvarps_mask((__v4sf)__A, (__v4si)__C,
+                                                  (__v4sf)__W,
+                                                  (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_vpermilvarps_mask on __A and __C, passing
+   _mm_setzero_ps() as the third operand and __U as the mask. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128)__builtin_ia32_vpermilvarps_mask((__v4sf)__A, (__v4si)__C,
+                                                  (__v4sf)_mm_setzero_ps(),
+                                                  (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_vpermilvarps256_mask on __A and __C, passing __W as
+   the third operand and __U as the mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256_mask((__v8sf)__A, (__v8si)__C,
+                                                     (__v8sf)__W,
+                                                     (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_vpermilvarps256_mask on __A and __C, passing
+   _mm256_setzero_ps() as the third operand and __U as the mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256_mask((__v8sf)__A, (__v8si)__C,
+                                                     (__v8sf)_mm256_setzero_ps(),
+                                                     (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_ptestmd128 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi32_mask(__m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B,
+                                             (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestmd128 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestmd256 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi32_mask(__m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B,
+                                             (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestmd256 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestmq128 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi64_mask(__m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B,
+                                             (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestmq128 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestmq256 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi64_mask(__m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B,
+                                             (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestmq256 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestnmd128 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi32_mask(__m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B,
+                                              (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestnmd128 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestnmd256 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi32_mask(__m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B,
+                                              (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestnmd256 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestnmq128 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi64_mask(__m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B,
+                                              (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestnmq128 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B, __U);
+}
+
+/* Calls __builtin_ia32_ptestnmq256 on __A and __B with (__mmask8)-1 as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi64_mask(__m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B,
+                                              (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_ptestnmq256 on __A and __B with __U as the mask. */
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B, __U);
+}
+
+
+
+/* Passes _mm_unpackhi_epi32(__A, __B) and __W to selectd_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+      (__v4si)_mm_unpackhi_epi32(__A, __B), (__v4si)__W);
+}
+
+/* Passes _mm_unpackhi_epi32(__A, __B) and _mm_setzero_si128() to selectd_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+      (__v4si)_mm_unpackhi_epi32(__A, __B), (__v4si)_mm_setzero_si128());
+}
+
+/* Passes _mm256_unpackhi_epi32(__A, __B) and __W to selectd_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+      (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)__W);
+}
+
+/* Passes _mm256_unpackhi_epi32(__A, __B) and _mm256_setzero_si256() to selectd_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+      (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)_mm256_setzero_si256());
+}
+
+/* Passes _mm_unpackhi_epi64(__A, __B) and __W to selectq_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+      (__v2di)_mm_unpackhi_epi64(__A, __B), (__v2di)__W);
+}
+
+/* Passes _mm_unpackhi_epi64(__A, __B) and _mm_setzero_di() to selectq_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+      (__v2di)_mm_unpackhi_epi64(__A, __B), (__v2di)_mm_setzero_di());
+}
+
+/* Passes _mm256_unpackhi_epi64(__A, __B) and __W to selectq_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+      (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)__W);
+}
+
+/* Passes _mm256_unpackhi_epi64(__A, __B) and _mm256_setzero_si256() to selectq_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+      (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)_mm256_setzero_si256());
+}
+
+/* Passes _mm_unpacklo_epi32(__A, __B) and __W to selectd_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+      (__v4si)_mm_unpacklo_epi32(__A, __B), (__v4si)__W);
+}
+
+/* Passes _mm_unpacklo_epi32(__A, __B) and _mm_setzero_si128() to selectd_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+      (__v4si)_mm_unpacklo_epi32(__A, __B), (__v4si)_mm_setzero_si128());
+}
+
+/* Passes _mm256_unpacklo_epi32(__A, __B) and __W to selectd_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+      (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)__W);
+}
+
+/* Passes _mm256_unpacklo_epi32(__A, __B) and _mm256_setzero_si256() to selectd_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+      (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)_mm256_setzero_si256());
+}
+
+/* Passes _mm_unpacklo_epi64(__A, __B) and __W to selectq_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+      (__v2di)_mm_unpacklo_epi64(__A, __B), (__v2di)__W);
+}
+
+/* Passes _mm_unpacklo_epi64(__A, __B) and _mm_setzero_di() to selectq_128 under mask __U. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+      (__v2di)_mm_unpacklo_epi64(__A, __B), (__v2di)_mm_setzero_di());
+}
+
+/* Passes _mm256_unpacklo_epi64(__A, __B) and __W to selectq_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+      (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)__W);
+}
+
+/* Passes _mm256_unpacklo_epi64(__A, __B) and _mm256_setzero_si256() to selectq_256 under mask __U. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+      (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)_mm256_setzero_si256());
+}
+
+/* Calls __builtin_ia32_psrad128_mask on __A and __B, passing __W as the
+   third operand and __U as the mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrad128_mask((__v4si)__A, (__v4si)__B,
+                                               (__v4si)__W,
+                                               (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_psrad128_mask on __A and __B, passing
+   _mm_setzero_si128() as the third operand and __U as the mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrad128_mask((__v4si)__A, (__v4si)__B,
+                                               (__v4si)_mm_setzero_si128(),
+                                               (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_psrad256_mask on __A and the 128-bit __B, passing __W
+   as the third operand and __U as the mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrad256_mask((__v8si)__A, (__v4si)__B,
+                                               (__v8si)__W,
+                                               (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_psrad256_mask on __A and the 128-bit __B, passing
+   _mm256_setzero_si256() as the third operand and __U as the mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrad256_mask((__v8si)__A, (__v4si)__B,
+                                               (__v8si)_mm256_setzero_si256(),
+                                               (__mmask8)__U);
+}
+
+/* Calls psradi128_mask on A with count imm, passing W as third operand and U as mask. */
+#define _mm_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
+      (__v4si)(__m128i)(W), (__mmask8)(U)); })
+
+/* Calls psradi128_mask on A with count imm, passing _mm_setzero_si128() and mask U. */
+#define _mm_maskz_srai_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
+      (__v4si)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Calls psradi256_mask on A with count imm, passing W as third operand and U as mask. */
+#define _mm256_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
+      (__v8si)(__m256i)(W), (__mmask8)(U)); })
+
+/* Calls psradi256_mask on A with count imm, passing _mm256_setzero_si256() and mask U. */
+#define _mm256_maskz_srai_epi32(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
+      (__v8si)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_psraq128_mask on __A and __B, passing
+   _mm_setzero_di() as the third operand and (__mmask8)-1 as the mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraq128_mask((__v2di)__A, (__v2di)__B,
+                                               (__v2di)_mm_setzero_di(),
+                                               (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_psraq128_mask on __A and __B, passing __W as the
+   third operand and __U as the mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraq128_mask((__v2di)__A, (__v2di)__B,
+                                               (__v2di)__W,
+                                               (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_psraq128_mask on __A and __B, passing
+   _mm_setzero_di() as the third operand and __U as the mask. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraq128_mask((__v2di)__A, (__v2di)__B,
+                                               (__v2di)_mm_setzero_di(),
+                                               (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_psraq256_mask on __A and the 128-bit __B, passing
+   _mm256_setzero_si256() as the third operand and (__mmask8)-1 as the mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi64(__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraq256_mask((__v4di)__A, (__v2di)__B,
+                                               (__v4di)_mm256_setzero_si256(),
+                                               (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_psraq256_mask on __A and the 128-bit __B, passing __W
+   as the third operand and __U as the mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraq256_mask((__v4di)__A, (__v2di)__B,
+                                               (__v4di)__W,
+                                               (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_psraq256_mask on __A and the 128-bit __B, passing
+   _mm256_setzero_si256() as the third operand and __U as the mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraq256_mask((__v4di)__A, (__v2di)__B,
+                                               (__v4di)_mm256_setzero_si256(),
+                                               (__mmask8)__U);
+}
+
+/* Calls psraqi128_mask on A with count imm, passing _mm_setzero_di() and mask (__mmask8)-1. */
+#define _mm_srai_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
+      (__v2di)_mm_setzero_di(), (__mmask8)-1); })
+
+/* Calls psraqi128_mask on A with count imm, passing W as third operand and U as mask. */
+#define _mm_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
+      (__v2di)(__m128i)(W), (__mmask8)(U)); })
+
+/* Calls psraqi128_mask on A with count imm, passing _mm_setzero_si128() and mask U. */
+#define _mm_maskz_srai_epi64(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
+      (__v2di)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Calls psraqi256_mask on A with count imm, passing _mm256_setzero_si256() and mask (__mmask8)-1. */
+#define _mm256_srai_epi64(A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)-1); })
+
+/* Calls psraqi256_mask on A with count imm, passing W as third operand and U as mask. */
+#define _mm256_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
+      (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+/* Calls psraqi256_mask on A with count imm, passing _mm256_setzero_si256() and mask U. */
+#define _mm256_maskz_srai_epi64(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Calls pternlogd128_mask on A, B, C with immediate imm and mask (__mmask8)-1. */
+#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_mask( \
+      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)-1); })
+
+/* Calls pternlogd128_mask on A, B, C with immediate imm and mask U. */
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_mask( \
+      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogd128_maskz on A, B, C with immediate imm and mask U. */
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_maskz( \
+      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogd256_mask on A, B, C with immediate imm and mask (__mmask8)-1. */
+#define _mm256_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
+      (int)(imm), (__mmask8)-1); })
+
+/* Calls pternlogd256_mask on A, B, C with immediate imm and mask U. */
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogd256_maskz on A, B, C with immediate imm and mask U. */
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_maskz( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogq128_mask on A, B, C with immediate imm and mask (__mmask8)-1. */
+#define _mm_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_mask( \
+      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)-1); })
+
+/* Calls pternlogq128_mask on A, B, C with immediate imm and mask U. */
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_mask( \
+      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogq128_maskz on A, B, C with immediate imm and mask U. */
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_maskz( \
+      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogq256_mask on A, B, C with immediate imm and mask (__mmask8)-1. */
+#define _mm256_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
+      (int)(imm), (__mmask8)-1); })
+
+/* Calls pternlogq256_mask on A, B, C with immediate imm and mask U. */
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+/* Calls pternlogq256_maskz on A, B, C with immediate imm and mask U. */
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_maskz( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
+      (int)(imm), (__mmask8)(U)); })
+
+
+
+/* Calls shuf_f32x4_256_mask on A, B with imm, passing _mm256_setzero_ps() and mask (__mmask8)-1. */
+#define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask( \
+      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(imm), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); })
+
+/* Calls shuf_f32x4_256_mask on A, B with imm, passing W as fourth operand and U as mask. */
+#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask( \
+      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(imm), \
+      (__v8sf)(__m256)(W), (__mmask8)(U)); })
+
+/* Calls shuf_f32x4_256_mask on A, B with imm, passing _mm256_setzero_ps() and mask U. */
+#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask( \
+      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(imm), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_shuf_f64x2_256_mask on A and B with immediate imm,
+   passing _mm256_setzero_pd() as fourth operand and (__mmask8)-1 as mask. */
+#define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask( \
+      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(imm), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)-1); })
+
+/* Calls __builtin_ia32_shuf_f64x2_256_mask on A and B with immediate imm,
+   passing W as fourth operand and U as mask. */
+#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask( \
+      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(imm), \
+      (__v4df)(__m256d)(W), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_shuf_f64x2_256_mask on A and B with immediate imm,
+   passing _mm256_setzero_pd() as fourth operand and U as mask. */
+#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask( \
+      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(imm), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_shuf_i32x4_256_mask on A and B with immediate imm,
+   passing _mm256_setzero_si256() as fourth operand and (__mmask8)-1 as mask. */
+#define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (int)(imm), \
+      (__v8si)_mm256_setzero_si256(), (__mmask8)-1); })
+
+/* Calls __builtin_ia32_shuf_i32x4_256_mask on A and B with immediate imm,
+   passing W as fourth operand and U as mask. */
+#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (int)(imm), \
+      (__v8si)(__m256i)(W), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_shuf_i32x4_256_mask on A and B with immediate imm,
+   passing _mm256_setzero_si256() as fourth operand and U as mask. */
+#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (int)(imm), \
+      (__v8si)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_shuf_i64x2_256_mask on A and B with immediate imm,
+   passing _mm256_setzero_si256() as fourth operand and (__mmask8)-1 as mask. */
+#define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)-1); })
+
+/* Calls __builtin_ia32_shuf_i64x2_256_mask on A and B with immediate imm,
+   passing W as fourth operand and U as mask. */
+#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+/* Calls __builtin_ia32_shuf_i64x2_256_mask on A and B with immediate imm,
+   passing _mm256_setzero_si256() as fourth operand and U as mask. */
+#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Passes _mm_shuffle_pd(A, B, M) and W to selectpd_128 under mask U. */
+#define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+      (__v2df)_mm_shuffle_pd((A), (B), (M)), (__v2df)(__m128d)(W)); })
+
+/* Passes _mm_shuffle_pd(A, B, M) and _mm_setzero_pd() to selectpd_128 under mask U. */
+#define _mm_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+      (__v2df)_mm_shuffle_pd((A), (B), (M)), (__v2df)_mm_setzero_pd()); })
+
+/* Passes _mm256_shuffle_pd(A, B, M) and W to selectpd_256 under mask U. */
+#define _mm256_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+      (__v4df)_mm256_shuffle_pd((A), (B), (M)), (__v4df)(__m256d)(W)); })
+
+/* Passes _mm256_shuffle_pd(A, B, M) and _mm256_setzero_pd() to selectpd_256 under mask U. */
+#define _mm256_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+      (__v4df)_mm256_shuffle_pd((A), (B), (M)), (__v4df)_mm256_setzero_pd()); })
+
+/* Passes _mm_shuffle_ps(A, B, M) and W to selectps_128 under mask U. */
+#define _mm_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+      (__v4sf)_mm_shuffle_ps((A), (B), (M)), (__v4sf)(__m128)(W)); })
+
+/* Passes _mm_shuffle_ps(A, B, M) and _mm_setzero_ps() to selectps_128 under mask U. */
+#define _mm_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+      (__v4sf)_mm_shuffle_ps((A), (B), (M)), (__v4sf)_mm_setzero_ps()); })
+
+/* Passes _mm256_shuffle_ps(A, B, M) and W to selectps_256 under mask U. */
+#define _mm256_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), (__v8sf)(__m256)(W)); })
+
+/* Passes _mm256_shuffle_ps(A, B, M) and _mm256_setzero_ps() to selectps_256 under mask U. */
+#define _mm256_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), (__v8sf)_mm256_setzero_ps()); })
+
+/* Calls __builtin_ia32_rsqrt14pd128_mask on __A, passing _mm_setzero_pd()
+   as the second operand and (__mmask8)-1 as the mask. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_rsqrt14pd128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_rsqrt14pd128_mask on __A with __W as second operand and __U as mask. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d)__builtin_ia32_rsqrt14pd128_mask((__v2df)__A, (__v2df)__W,
+                                                   (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14pd128_mask on __A, passing _mm_setzero_pd()
+   as the second operand and __U as the mask. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A)
+{
+  return (__m128d)__builtin_ia32_rsqrt14pd128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14pd256_mask on __A, passing _mm256_setzero_pd()
+   as the second operand and (__mmask8)-1 as the mask. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_rsqrt14pd256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_rsqrt14pd256_mask on __A with __W as second operand and __U as mask. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d)__builtin_ia32_rsqrt14pd256_mask((__v4df)__A, (__v4df)__W,
+                                                   (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14pd256_mask on __A, passing _mm256_setzero_pd()
+   as the second operand and __U as the mask. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A)
+{
+  return (__m256d)__builtin_ia32_rsqrt14pd256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14ps128_mask on __A, passing _mm_setzero_ps()
+   as the second operand and (__mmask8)-1 as the mask. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_rsqrt14ps128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_rsqrt14ps128_mask on __A with __W as second operand and __U as mask. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_rsqrt14ps128_mask((__v4sf)__A, (__v4sf)__W,
+                                                  (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14ps128_mask on __A, passing _mm_setzero_ps()
+   as the second operand and __U as the mask. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_rsqrt14ps128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14ps256_mask on __A, passing _mm256_setzero_ps()
+   as the second operand and (__mmask8)-1 as the mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_rsqrt14ps256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_rsqrt14ps256_mask on __A with __W as second operand and __U as mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_rsqrt14ps256_mask((__v8sf)__A, (__v8sf)__W,
+                                                  (__mmask8)__U);
+}
+
+/* Calls __builtin_ia32_rsqrt14ps256_mask on __A, passing _mm256_setzero_ps()
+   as the second operand and __U as the mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_rsqrt14ps256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+/* Calls broadcastf32x4_256_mask on __A with mask (__mmask8)-1; second operand is the float-typed _mm256_undefined_ps() (was _pd). */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x4(__m128 __A)
+{
+  return (__m256)__builtin_ia32_broadcastf32x4_256_mask(
+      (__v4sf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_broadcastf32x4_256_mask on __A with __O as second operand and __M as mask. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256)__builtin_ia32_broadcastf32x4_256_mask((__v4sf)__A,
+                                                        (__v8sf)__O, __M);
+}
+
+/* Calls __builtin_ia32_broadcastf32x4_256_mask on __A with _mm256_setzero_ps() and mask __M. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A)
+{
+  return (__m256)__builtin_ia32_broadcastf32x4_256_mask(
+      (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M);
+}
+
+/* Calls __builtin_ia32_broadcasti32x4_256_mask on __A with _mm256_undefined_si256() and mask (__mmask8)-1. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x4(__m128i __A)
+{
+  return (__m256i)__builtin_ia32_broadcasti32x4_256_mask(
+      (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+/* Calls __builtin_ia32_broadcasti32x4_256_mask on __A with __O as second operand and __M as mask. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_broadcasti32x4_256_mask((__v4si)__A,
+                                                         (__v8si)__O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+                 __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastsd_pd(__A), then __O. */
+  return (__m256d)__builtin_ia32_selectpd_256(
+      __M, (__v4df)_mm256_broadcastsd_pd(__A), (__v4df)__O);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastsd_pd(__A), then zeros. */
+  return (__m256d)__builtin_ia32_selectpd_256(
+      __M, (__v4df)_mm256_broadcastsd_pd(__A), (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A)
+{
+  /* Select builtin operands: __M, _mm_broadcastss_ps(__A), then __O. */
+  return (__m128)__builtin_ia32_selectps_128(
+      __M, (__v4sf)_mm_broadcastss_ps(__A), (__v4sf)__O);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A)
+{
+  /* Select builtin operands: __M, _mm_broadcastss_ps(__A), then zeros. */
+  return (__m128)__builtin_ia32_selectps_128(
+      __M, (__v4sf)_mm_broadcastss_ps(__A), (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastss_ps(__A), then __O. */
+  return (__m256)__builtin_ia32_selectps_256(
+      __M, (__v8sf)_mm256_broadcastss_ps(__A), (__v8sf)__O);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastss_ps(__A), then zeros. */
+  return (__m256)__builtin_ia32_selectps_256(
+      __M, (__v8sf)_mm256_broadcastss_ps(__A), (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm_broadcastd_epi32(__A), then __O. */
+  return (__m128i)__builtin_ia32_selectd_128(
+      __M, (__v4si)_mm_broadcastd_epi32(__A), (__v4si)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm_broadcastd_epi32(__A), then zeros. */
+  return (__m128i)__builtin_ia32_selectd_128(
+      __M, (__v4si)_mm_broadcastd_epi32(__A), (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastd_epi32(__A), then __O. */
+  return (__m256i)__builtin_ia32_selectd_256(
+      __M, (__v8si)_mm256_broadcastd_epi32(__A), (__v8si)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastd_epi32(__A), zeros. */
+  return (__m256i)__builtin_ia32_selectd_256(
+      __M, (__v8si)_mm256_broadcastd_epi32(__A), (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm_broadcastq_epi64(__A), then __O. */
+  return (__m128i)__builtin_ia32_selectq_128(
+      __M, (__v2di)_mm_broadcastq_epi64(__A), (__v2di)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm_broadcastq_epi64(__A), then zeros. */
+  return (__m128i)__builtin_ia32_selectq_128(
+      __M, (__v2di)_mm_broadcastq_epi64(__A), (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastq_epi64(__A), then __O. */
+  return (__m256i)__builtin_ia32_selectq_256(
+      __M, (__v4di)_mm256_broadcastq_epi64(__A), (__v4di)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A)
+{
+  /* Select builtin operands: __M, _mm256_broadcastq_epi64(__A), zeros. */
+  return (__m256i)__builtin_ia32_selectq_256(
+      __M, (__v4di)_mm256_broadcastq_epi64(__A), (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi8(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsdb128_mask(
+      (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsdb128_mask(
+      (__v4si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsdb128_mask(
+      (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi8(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsdb256_mask(
+      (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsdb256_mask(
+      (__v8si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsdb256_mask(
+      (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi16(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, like the other cvtsepi forms. */
+  return (__m128i)__builtin_ia32_pmovsdw128_mask(
+      (__v4si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovsdw128_mask(
+      (__v4si)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsdw128_mask(
+      (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi16(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsdw256_mask(
+      (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsdw256_mask(
+      (__v8si)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsdw256_mask(
+      (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi8(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsqb128_mask(
+      (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsqb128_mask(
+      (__v2di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsqb128_mask(
+      (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi8(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsqb256_mask(
+      (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsqb256_mask(
+      (__v4di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsqb256_mask(
+      (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi32(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsqd128_mask(
+      (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsqd128_mask(
+      (__v2di)__A, (__v4si)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsqd128_mask(
+      (__v2di)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqd128mem_mask((__v4si *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi32(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsqd256_mask(
+      (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovsqd256_mask(
+      (__v4di)__A, (__v4si)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsqd256_mask(
+      (__v4di)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqd256mem_mask((__v4si *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi16(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsqw128_mask(
+      (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsqw128_mask(
+      (__v2di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsqw128_mask(
+      (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi16(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovsqw256_mask(
+      (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovsqw256_mask(
+      (__v4di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovsqw256_mask(
+      (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi8(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusdb128_mask(
+      (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovusdb128_mask(
+      (__v4si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusdb128_mask(
+      (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi8(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusdb256_mask(
+      (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovusdb256_mask(
+      (__v8si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusdb256_mask(
+      (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi16(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusdw128_mask(
+      (__v4si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusdw128_mask(
+      (__v4si)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusdw128_mask(
+      (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi16(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusdw256_mask(
+      (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusdw256_mask(
+      (__v8si)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusdw256_mask(
+      (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi8(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusqb128_mask(
+      (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovusqb128_mask(
+      (__v2di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusqb128_mask(
+      (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi8(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusqb256_mask(
+      (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovusqb256_mask(
+      (__v4di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusqb256_mask(
+      (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi32(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusqd128_mask(
+      (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusqd128_mask(
+      (__v2di)__A, (__v4si)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusqd128_mask(
+      (__v2di)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqd128mem_mask((__v4si *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi32(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusqd256_mask(
+      (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusqd256_mask(
+      (__v4di)__A, (__v4si)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusqd256_mask(
+      (__v4di)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqd256mem_mask((__v4si *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi16(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusqw128_mask(
+      (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusqw128_mask(
+      (__v2di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovusqw128_mask(
+      (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi16(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovusqw256_mask(
+      (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusqw256_mask(
+      (__v4di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovusqw256_mask(
+      (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A)
+{
+  /* Plain call: this function is void, so no value is returned. */
+  __builtin_ia32_pmovusqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi8(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovdb128_mask(
+      (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovdb128_mask(
+      (__v4si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovdb128_mask(
+      (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi8(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovdb256_mask(
+      (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovdb256_mask(
+      (__v8si)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovdb256_mask(
+      (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi16(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, like the other cvtepi forms. */
+  return (__m128i)__builtin_ia32_pmovdw128_mask(
+      (__v4si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovdw128_mask(
+      (__v4si)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi16(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovdw128_mask(
+      (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi16(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, like the other cvtepi forms. */
+  return (__m128i)__builtin_ia32_pmovdw256_mask(
+      (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovdw256_mask(
+      (__v8si)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovdw256_mask(
+      (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi8(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovqb128_mask(
+      (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovqb128_mask(
+      (__v2di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovqb128_mask(
+      (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi8(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovqb256_mask(
+      (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovqb256_mask(
+      (__v4di)__A, (__v16qi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovqb256_mask(
+      (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi32(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovqd128_mask(
+      (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_pmovqd128_mask(
+      (__v2di)__A, (__v4si)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovqd128_mask(
+      (__v2di)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqd128mem_mask((__v4si *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi32(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovqd256_mask(
+      (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovqd256_mask(
+      (__v4di)__A, (__v4si)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovqd256_mask(
+      (__v4di)__A, (__v4si)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqd256mem_mask((__v4si *)__P, (__v4di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi16(__m128i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovqw128_mask(
+      (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A)
+{
+  /* Masked form: __O is the second operand, __M the mask. */
+  return (__m128i)__builtin_ia32_pmovqw128_mask(
+      (__v2di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovqw128_mask(
+      (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi16(__m256i __A)
+{
+  /* Unmasked form: undefined second operand, all-ones mask. */
+  return (__m128i)__builtin_ia32_pmovqw256_mask(
+      (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i)__builtin_ia32_pmovqw256_mask(
+      (__v4di)__A, (__v8hi)__O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A)
+{
+  /* Zero-masked form: zero vector as second operand, __M as mask. */
+  return (__m128i)__builtin_ia32_pmovqw256_mask(
+      (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M);
+}
+
+/* Unmasked form: extractf32x4_256 builtin with a zeroed __m128 operand and mask -1. */
+#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_extractf32x4_256_mask( \
+      (__v8sf)(__m256)(A), (int)(imm), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)-1); })
+
+/* Mask form: extractf32x4_256 builtin with W as the __m128 operand ahead of the mask U. */
+#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_extractf32x4_256_mask( \
+      (__v8sf)(__m256)(A), (int)(imm), \
+      (__v4sf)(__m128)(W), (__mmask8)(U)); })
+
+/* Maskz form: extractf32x4_256 builtin with a zeroed __m128 operand ahead of the mask U. */
+#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_extractf32x4_256_mask( \
+      (__v8sf)(__m256)(A), (int)(imm), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)); })
+
+/* Unmasked form: extracti32x4_256 builtin with a zeroed __m128i operand and mask -1. */
+#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti32x4_256_mask( \
+      (__v8si)(__m256i)(A), (int)(imm), \
+      (__v4si)_mm_setzero_si128(), (__mmask8)-1); })
+
+/* Mask form: extracti32x4_256 builtin with W as the __m128i operand ahead of the mask U. */
+#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti32x4_256_mask( \
+      (__v8si)(__m256i)(A), (int)(imm), \
+      (__v4si)(__m128i)(W), (__mmask8)(U)); })
+
+/* Maskz form: extracti32x4_256 builtin with a zeroed __m128i operand ahead of the mask U. */
+#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_extracti32x4_256_mask( \
+      (__v8si)(__m256i)(A), (int)(imm), \
+      (__v4si)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Unmasked form: insertf32x4_256 builtin on (A, B, imm) with a zeroed __m256 and mask -1. */
+#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_insertf32x4_256_mask( \
+      (__v8sf)(__m256)(A), (__v4sf)(__m128)(B), (int)(imm), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); })
+
+/* Mask form: insertf32x4_256 builtin on (A, B, imm) with W ahead of the mask U. */
+#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_insertf32x4_256_mask( \
+      (__v8sf)(__m256)(A), (__v4sf)(__m128)(B), (int)(imm), \
+      (__v8sf)(__m256)(W), (__mmask8)(U)); })
+
+/* Maskz form: insertf32x4_256 builtin on (A, B, imm) with a zeroed __m256 ahead of the mask U. */
+#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_insertf32x4_256_mask( \
+      (__v8sf)(__m256)(A), (__v4sf)(__m128)(B), (int)(imm), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)); })
+
+/* Unmasked form: inserti32x4_256 builtin on (A, B, imm) with a zeroed __m256i and mask -1. */
+#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_inserti32x4_256_mask( \
+      (__v8si)(__m256i)(A), (__v4si)(__m128i)(B), \
+      (int)(imm), \
+      (__v8si)_mm256_setzero_si256(), (__mmask8)-1); })
+
+/* Mask form: inserti32x4_256 builtin on (A, B, imm) with W ahead of the mask U. */
+#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_inserti32x4_256_mask( \
+      (__v8si)(__m256i)(A), (__v4si)(__m128i)(B), \
+      (int)(imm), \
+      (__v8si)(__m256i)(W), (__mmask8)(U)); })
+
+/* Maskz form: inserti32x4_256 builtin on (A, B, imm) with a zeroed __m256i ahead of the mask U. */
+#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_inserti32x4_256_mask( \
+      (__v8si)(__m256i)(A), (__v4si)(__m128i)(B), \
+      (int)(imm), \
+      (__v8si)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Unmasked form: immediate is ((C)<<2) | (B); zeroed __m128d operand, mask -1. */
+#define _mm_getmant_pd(A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantpd128_mask( \
+      (__v2df)(__m128d)(A), (int)(((C)<<2) | (B)), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)-1); })
+
+/* Mask form: immediate is ((C)<<2) | (B); W is the operand ahead of the mask U. */
+#define _mm_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantpd128_mask( \
+      (__v2df)(__m128d)(A), (int)(((C)<<2) | (B)), \
+      (__v2df)(__m128d)(W), (__mmask8)(U)); })
+
+/* Maskz form: immediate is ((C)<<2) | (B); a zeroed __m128d precedes the mask U. */
+#define _mm_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantpd128_mask( \
+      (__v2df)(__m128d)(A), (int)(((C)<<2) | (B)), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U)); })
+
+/* Unmasked form: immediate is ((C)<<2) | (B); zeroed __m256d operand, mask -1. */
+#define _mm256_getmant_pd(A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask( \
+      (__v4df)(__m256d)(A), (int)(((C)<<2) | (B)), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)-1); })
+
+/* Mask form: immediate is ((C)<<2) | (B); W is the operand ahead of the mask U. */
+#define _mm256_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask( \
+      (__v4df)(__m256d)(A), (int)(((C)<<2) | (B)), \
+      (__v4df)(__m256d)(W), (__mmask8)(U)); })
+
+/* Maskz form: immediate is ((C)<<2) | (B); a zeroed __m256d precedes the mask U. */
+#define _mm256_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask( \
+      (__v4df)(__m256d)(A), (int)(((C)<<2) | (B)), \
+      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)); })
+
+/* Unmasked form: immediate is ((C)<<2) | (B); zeroed __m128 operand, mask -1. */
+#define _mm_getmant_ps(A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask( \
+      (__v4sf)(__m128)(A), (int)(((C)<<2) | (B)), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)-1); })
+
+/* Mask form: immediate is ((C)<<2) | (B); W is the operand ahead of the mask U. */
+#define _mm_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask( \
+      (__v4sf)(__m128)(A), (int)(((C)<<2) | (B)), \
+      (__v4sf)(__m128)(W), (__mmask8)(U)); })
+
+/* Maskz form: immediate is ((C)<<2) | (B); a zeroed __m128 precedes the mask U. */
+#define _mm_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask( \
+      (__v4sf)(__m128)(A), (int)(((C)<<2) | (B)), \
+      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)); })
+
+/* Unmasked form: immediate is ((C)<<2) | (B); zeroed __m256 operand, mask -1. */
+#define _mm256_getmant_ps(A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask( \
+      (__v8sf)(__m256)(A), (int)(((C)<<2) | (B)), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); })
+
+/* Mask form: immediate is ((C)<<2) | (B); W is the operand ahead of the mask U. */
+#define _mm256_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask( \
+      (__v8sf)(__m256)(A), (int)(((C)<<2) | (B)), \
+      (__v8sf)(__m256)(W), (__mmask8)(U)); })
+
+/* Maskz form: immediate is ((C)<<2) | (B); a zeroed __m256 precedes the mask U. */
+#define _mm256_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask( \
+      (__v8sf)(__m256)(A), (int)(((C)<<2) | (B)), \
+      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)); })
+
+/* The gather3div2df builtin takes (v1_old, addr, index, mask, scale); index is cast to __v2di. */
+#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128d)__builtin_ia32_gather3div2df( \
+      (__v2df)(__m128d)(v1_old), (double const *)(addr), \
+      (__v2di)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3div2di builtin takes (v1_old, addr, index, mask, scale); index is cast to __v2di. */
+#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128i)__builtin_ia32_gather3div2di( \
+      (__v2di)(__m128i)(v1_old), (long long const *)(addr), \
+      (__v2di)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3div4df builtin takes (v1_old, addr, index, mask, scale); index is cast to __v4di. */
+#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m256d)__builtin_ia32_gather3div4df( \
+      (__v4df)(__m256d)(v1_old), (double const *)(addr), \
+      (__v4di)(__m256i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3div4di builtin takes (v1_old, addr, index, mask, scale); index is cast to __v4di. */
+#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m256i)__builtin_ia32_gather3div4di( \
+      (__v4di)(__m256i)(v1_old), (long long const *)(addr), \
+      (__v4di)(__m256i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3div4sf builtin takes (v1_old, addr, index, mask, scale); index is cast to __v2di. */
+#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128)__builtin_ia32_gather3div4sf( \
+      (__v4sf)(__m128)(v1_old), (float const *)(addr), \
+      (__v2di)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3div4si builtin takes (v1_old, addr, index, mask, scale); index is cast to __v2di. */
+#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128i)__builtin_ia32_gather3div4si( \
+      (__v4si)(__m128i)(v1_old), (int const *)(addr), \
+      (__v2di)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* gather3div8sf: result and v1_old are 128-bit (__m128) while index is a 256-bit __v4di. */
+#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128)__builtin_ia32_gather3div8sf( \
+      (__v4sf)(__m128)(v1_old), (float const *)(addr), \
+      (__v4di)(__m256i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* gather3div8si: result and v1_old are 128-bit (__m128i) while index is a 256-bit __v4di. */
+#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128i)__builtin_ia32_gather3div8si( \
+      (__v4si)(__m128i)(v1_old), (int const *)(addr), \
+      (__v4di)(__m256i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3siv2df builtin takes (v1_old, addr, index, mask, scale); index is cast to __v4si. */
+#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128d)__builtin_ia32_gather3siv2df( \
+      (__v2df)(__m128d)(v1_old), (double const *)(addr), \
+      (__v4si)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3siv2di builtin takes (v1_old, addr, index, mask, scale); index is cast to __v4si. */
+#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128i)__builtin_ia32_gather3siv2di( \
+      (__v2di)(__m128i)(v1_old), (long long const *)(addr), \
+      (__v4si)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* gather3siv4df: result and v1_old are 256-bit (__m256d) while index is a 128-bit __v4si. */
+#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m256d)__builtin_ia32_gather3siv4df( \
+      (__v4df)(__m256d)(v1_old), (double const *)(addr), \
+      (__v4si)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* gather3siv4di: result and v1_old are 256-bit (__m256i) while index is a 128-bit __v4si. */
+#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m256i)__builtin_ia32_gather3siv4di( \
+      (__v4di)(__m256i)(v1_old), (long long const *)(addr), \
+      (__v4si)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3siv4sf builtin takes (v1_old, addr, index, mask, scale); index is cast to __v4si. */
+#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128)__builtin_ia32_gather3siv4sf( \
+      (__v4sf)(__m128)(v1_old), (float const *)(addr), \
+      (__v4si)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3siv4si builtin takes (v1_old, addr, index, mask, scale); index is cast to __v4si. */
+#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m128i)__builtin_ia32_gather3siv4si( \
+      (__v4si)(__m128i)(v1_old), (int const *)(addr), \
+      (__v4si)(__m128i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3siv8sf builtin takes (v1_old, addr, index, mask, scale); index is cast to __v8si. */
+#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m256)__builtin_ia32_gather3siv8sf( \
+      (__v8sf)(__m256)(v1_old), (float const *)(addr), \
+      (__v8si)(__m256i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* The gather3siv8si builtin takes (v1_old, addr, index, mask, scale); index is cast to __v8si. */
+#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({ \
+  (__m256i)__builtin_ia32_gather3siv8si( \
+      (__v8si)(__m256i)(v1_old), (int const *)(addr), \
+      (__v8si)(__m256i)(index), (__mmask8)(mask), (int)(scale)); })
+
+/* Shuffle indices are the four 2-bit fields of C: bits 1:0, 3:2, 5:4 and 7:6. */
+#define _mm256_permutex_pd(X, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector( \
+      (__v4df)(__m256d)(X), (__v4df)_mm256_undefined_pd(), \
+      ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+/* Passes U, _mm256_permutex_pd(X, C) and W, in that order, to the selectpd_256 builtin. */
+#define _mm256_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+      (__v4df)_mm256_permutex_pd((X), (C)), (__v4df)(__m256d)(W)); })
+
+/* Passes U, _mm256_permutex_pd(X, C) and a zeroed vector to the selectpd_256 builtin. */
+#define _mm256_maskz_permutex_pd(U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+      (__v4df)_mm256_permutex_pd((X), (C)), (__v4df)_mm256_setzero_pd()); })
+
+/* Shuffle indices are the four 2-bit fields of C: bits 1:0, 3:2, 5:4 and 7:6. */
+#define _mm256_permutex_epi64(X, C) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+      (__v4di)(__m256i)(X), (__v4di)_mm256_undefined_si256(), \
+      ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+/* Passes U, _mm256_permutex_epi64(X, C) and W, in that order, to the selectq_256 builtin. */
+#define _mm256_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+      (__v4di)_mm256_permutex_epi64((X), (C)), (__v4di)(__m256i)(W)); })
+
+/* Passes U, _mm256_permutex_epi64(X, C) and a zeroed vector to the selectq_256 builtin. */
+#define _mm256_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+      (__v4di)_mm256_permutex_epi64((X), (C)), (__v4di)_mm256_setzero_si256()); })
+
+/* Unmasked form (mask -1). The third operand is an undefined __m256d, matching its (__v4df) cast. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutexvar_pd(__m256i __X, __m256d __Y) {
+  return (__m256d)__builtin_ia32_permvardf256_mask((__v4df)__Y,
+                                                   (__v4di)__X,
+                                                   (__v4df)_mm256_undefined_pd(),
+                                                   (__mmask8)-1);
+}
+
+/* Mask form: the builtin receives __Y first, then the index vector __X, then __W and __U. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X,
+                           __m256d __Y) {
+  return (__m256d)__builtin_ia32_permvardf256_mask((__v4df)__Y,
+                                                   (__v4di)__X,
+                                                   (__v4df)__W,
+                                                   (__mmask8)__U);
+}
+
+/* Maskz form: the builtin receives __Y, then __X, then a zeroed vector and the mask __U. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) {
+  return (__m256d)__builtin_ia32_permvardf256_mask((__v4df)__Y,
+                                                   (__v4di)__X,
+                                                   (__v4df)_mm256_setzero_pd(),
+                                                   (__mmask8)__U);
+}
+
+/* Maskz form: the builtin receives __Y, then __X, then a zeroed vector and the mask __M. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvardi256_mask((__v4di)__Y,
+                                                   (__v4di)__X,
+                                                   (__v4di)_mm256_setzero_si256(),
+                                                   (__mmask8)__M);
+}
+
+/* Unmasked form: the builtin receives __Y, then __X, then an undefined vector and mask -1. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi64(__m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvardi256_mask((__v4di)__Y,
+                                                   (__v4di)__X,
+                                                   (__v4di)_mm256_undefined_si256(),
+                                                   (__mmask8)-1);
+}
+
+/* Mask form: the builtin receives __Y first, then the index vector __X, then __W and __M. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X,
+                              __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvardi256_mask((__v4di)__Y,
+                                                   (__v4di)__X,
+                                                   (__v4di)__W,
+                                                   __M);
+}
+
+/* Mask form: the builtin receives __Y first, then the index vector __X, then __W and __U. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X,
+                           __m256 __Y) {
+  return (__m256)__builtin_ia32_permvarsf256_mask((__v8sf)__Y,
+                                                  (__v8si)__X,
+                                                  (__v8sf)__W,
+                                                  (__mmask8)__U);
+}
+
+/* Maskz form: the builtin receives __Y, then __X, then a zeroed vector and the mask __U. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) {
+  return (__m256)__builtin_ia32_permvarsf256_mask((__v8sf)__Y,
+                                                  (__v8si)__X,
+                                                  (__v8sf)_mm256_setzero_ps(),
+                                                  (__mmask8)__U);
+}
+
+/* Unmasked form (mask -1). The third operand is an undefined __m256, matching its (__v8sf) cast. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutexvar_ps(__m256i __X, __m256 __Y) {
+  return (__m256)__builtin_ia32_permvarsf256_mask((__v8sf)__Y,
+                                                  (__v8si)__X,
+                                                  (__v8sf)_mm256_undefined_ps(),
+                                                  (__mmask8)-1);
+}
+
+/* Maskz form: the builtin receives __Y, then __X, then a zeroed vector and the mask __M. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvarsi256_mask((__v8si)__Y,
+                                                   (__v8si)__X,
+                                                   (__v8si)_mm256_setzero_si256(),
+                                                   __M);
+}
+
+/* Mask form: the builtin receives __Y first, then the index vector __X, then __W and __M. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
+                              __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvarsi256_mask((__v8si)__Y,
+                                                   (__v8si)__X,
+                                                   (__v8si)__W,
+                                                   (__mmask8)__M);
+}
+
+/* Unmasked form: the builtin receives __Y, then __X, then an undefined vector and mask -1. */
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi32(__m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvarsi256_mask((__v8si)__Y,
+                                                   (__v8si)__X,
+                                                   (__v8si)_mm256_undefined_si256(),
+                                                   (__mmask8)-1);
+}
+
+/* Unmasked form: alignd128 builtin on (A, B, imm) with an undefined vector and mask -1. */
+#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_alignd128_mask( \
+      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (int)(imm), \
+      (__v4si)_mm_undefined_si128(), (__mmask8)-1); })
+
+/* Mask form: alignd128 builtin on (A, B, imm) with W ahead of the mask U. */
+#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_alignd128_mask( \
+      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (int)(imm), \
+      (__v4si)(__m128i)(W), (__mmask8)(U)); })
+
+/* Maskz form: alignd128 builtin on (A, B, imm) with a zeroed vector ahead of the mask U. */
+#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_alignd128_mask( \
+      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (int)(imm), \
+      (__v4si)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Unmasked form: alignd256 builtin on (A, B, imm) with an undefined vector and mask -1. */
+#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_alignd256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (int)(imm), \
+      (__v8si)_mm256_undefined_si256(), (__mmask8)-1); })
+
+/* Mask form: alignd256 builtin on (A, B, imm) with W ahead of the mask U. */
+#define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_alignd256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (int)(imm), \
+      (__v8si)(__m256i)(W), (__mmask8)(U)); })
+
+/* Maskz form: alignd256 builtin on (A, B, imm) with a zeroed vector ahead of the mask U. */
+#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_alignd256_mask( \
+      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (int)(imm), \
+      (__v8si)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Unmasked form: alignq128 builtin with an undefined vector and mask -1, as _mm_alignr_epi32 does. */
+#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_alignq128_mask( \
+      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (int)(imm), \
+      (__v2di)_mm_undefined_si128(), (__mmask8)-1); })
+
+/* Mask form: alignq128 builtin on (A, B, imm) with W ahead of the mask U. */
+#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_alignq128_mask( \
+      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (int)(imm), \
+      (__v2di)(__m128i)(W), (__mmask8)(U)); })
+
+/* Maskz form: alignq128 builtin with a zeroed __m128i (from _mm_setzero_si128) ahead of the mask U. */
+#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_alignq128_mask( \
+      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (int)(imm), \
+      (__v2di)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Unmasked form (mask -1). The undefined operand is an integer __m256i, matching its (__v4di) cast. */
+#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_alignq256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v4di)_mm256_undefined_si256(), (__mmask8)-1); })
+
+/* Mask form: alignq256 builtin on (A, B, imm) with W ahead of the mask U. */
+#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_alignq256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+/* Maskz form: alignq256 builtin on (A, B, imm) with a zeroed vector ahead of the mask U. */
+#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_alignq256_mask( \
+      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(imm), \
+      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)); })
+
+/* Passes __U, _mm_movehdup_ps(__A) and __W, in that order, to the selectps_128 builtin. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U, (__v4sf)_mm_movehdup_ps(__A),
+      (__v4sf)__W);
+}
+
+/* Passes __U, _mm_movehdup_ps(__A) and a zeroed vector to the selectps_128 builtin. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U, (__v4sf)_mm_movehdup_ps(__A),
+      (__v4sf)_mm_setzero_ps());
+}
+
+/* Passes __U, _mm256_movehdup_ps(__A) and __W, in that order, to the selectps_256 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U, (__v8sf)_mm256_movehdup_ps(__A),
+      (__v8sf)__W);
+}
+
+/* Passes __U, _mm256_movehdup_ps(__A) and a zeroed vector to the selectps_256 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U, (__v8sf)_mm256_movehdup_ps(__A),
+      (__v8sf)_mm256_setzero_ps());
+}
+
+/* Passes __U, _mm_moveldup_ps(__A) and __W, in that order, to the selectps_128 builtin. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U, (__v4sf)_mm_moveldup_ps(__A),
+      (__v4sf)__W);
+}
+
+/* Passes __U, _mm_moveldup_ps(__A) and a zeroed vector to the selectps_128 builtin. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U, (__v4sf)_mm_moveldup_ps(__A),
+      (__v4sf)_mm_setzero_ps());
+}
+
+/* Passes __U, _mm256_moveldup_ps(__A) and __W, in that order, to the selectps_256 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U, (__v8sf)_mm256_moveldup_ps(__A),
+      (__v8sf)__W);
+}
+
+/* Passes __U, _mm256_moveldup_ps(__A) and a zeroed vector to the selectps_256 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U, (__v8sf)_mm256_moveldup_ps(__A),
+      (__v8sf)_mm256_setzero_ps());
+}
+
+/* Passes U, _mm256_shuffle_epi32(A, I) and W, in that order, to the selectd_256 builtin. */
+#define _mm256_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+      (__v8si)_mm256_shuffle_epi32((A), (I)), (__v8si)(__m256i)(W)); })
+
+/* Passes U, _mm256_shuffle_epi32(A, I) and a zeroed vector to the selectd_256 builtin. */
+#define _mm256_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+      (__v8si)_mm256_shuffle_epi32((A), (I)), (__v8si)_mm256_setzero_si256()); })
+
+/* Passes U, _mm_shuffle_epi32(A, I) and W, in that order, to the selectd_128 builtin. */
+#define _mm_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+      (__v4si)_mm_shuffle_epi32((A), (I)), (__v4si)(__m128i)(W)); })
+
+/* Passes U, _mm_shuffle_epi32(A, I) and a zeroed vector to the selectd_128 builtin. */
+#define _mm_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+      (__v4si)_mm_shuffle_epi32((A), (I)), (__v4si)_mm_setzero_si128()); })
+
+/* Passes __U, __A and __W, in that order, to the selectpd_128 builtin. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)__A,
+                                              (__v2df)__W);
+}
+
+/* Passes __U, __A and a zeroed vector, in that order, to the selectpd_128 builtin. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mov_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)__A,
+                                              (__v2df)_mm_setzero_pd());
+}
+
+/* Passes __U, __A and __W, in that order, to the selectpd_256 builtin. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)__A,
+                                              (__v4df)__W);
+}
+
+/* Passes __U, __A and a zeroed vector, in that order, to the selectpd_256 builtin. */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)__A,
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+/* Passes __U, __A and __W, in that order, to the selectps_128 builtin. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)__A,
+                                             (__v4sf)__W);
+}
+
+/* Passes __U, __A and a zeroed vector, in that order, to the selectps_128 builtin. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mov_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)__A,
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+/* Passes __U, __A and __W, in that order, to the selectps_256 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)__A,
+                                             (__v8sf)__W);
+}
+
+/* Passes __U, __A and a zeroed vector, in that order, to the selectps_256 builtin. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)__A,
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+/* Mask form: vcvtph2ps builtin on __A (as __v8hi) with __W ahead of the mask __U. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128)__builtin_ia32_vcvtph2ps_mask((__v8hi)__A,
+                                               (__v4sf)__W,
+                                               (__mmask8)__U);
+}
+
+/* Maskz form: vcvtph2ps builtin on __A (as __v8hi) with a zeroed __m128
+   ahead of the mask __U. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  return (__m128)__builtin_ia32_vcvtph2ps_mask((__v8hi)__A,
+                                               (__v4sf)_mm_setzero_ps(),
+                                               (__mmask8)__U);
+}
+
+/* Mask form: vcvtph2ps256 builtin on the 128-bit __A (as __v8hi) with __W ahead of the mask __U. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) {
+  return (__m256)__builtin_ia32_vcvtph2ps256_mask((__v8hi)__A,
+                                                  (__v8sf)__W,
+                                                  (__mmask8)__U);
+}
+
+/* Maskz form: vcvtph2ps256 builtin on the 128-bit __A (as __v8hi) with a
+   zeroed __m256 ahead of the mask __U. */
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  return (__m256)__builtin_ia32_vcvtph2ps256_mask((__v8hi)__A,
+                                                  (__v8sf)_mm256_setzero_ps(),
+                                                  (__mmask8)__U);
+}
+
+/* Mask form: vcvtps2ph builtin with _MM_FROUND_CUR_DIRECTION as the immediate; __W precedes __U. */
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_vcvtps2ph_mask(
+      (__v4sf)__A, _MM_FROUND_CUR_DIRECTION,
+      (__v8hi)__W, (__mmask8)__U);
+}
+
+/* Maskz form: vcvtps2ph builtin with _MM_FROUND_CUR_DIRECTION; a zeroed vector precedes __U. */
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_vcvtps2ph_mask(
+      (__v4sf)__A, _MM_FROUND_CUR_DIRECTION,
+      (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+/* Mask form: vcvtps2ph builtin with the caller-supplied immediate I; W precedes the mask U. */
+#define _mm_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+      (__v8hi)(__m128i)(W), (__mmask8)(U)); })
+
+/* Maskz form: vcvtps2ph builtin with the caller-supplied immediate I; a zeroed vector precedes U. */
+#define _mm_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+      (__v8hi)_mm_setzero_si128(), (__mmask8)(U)); })
+
+/* Mask form: vcvtps2ph256 builtin with _MM_FROUND_CUR_DIRECTION as the immediate; __W precedes __U. */
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A) {
+  return (__m128i)__builtin_ia32_vcvtps2ph256_mask(
+      (__v8sf)__A, _MM_FROUND_CUR_DIRECTION,
+      (__v8hi)__W, (__mmask8)__U);
+}
+
+/* Maskz form: vcvtps2ph256 builtin with _MM_FROUND_CUR_DIRECTION; a zeroed vector precedes __U. */
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A) {
+  return (__m128i)__builtin_ia32_vcvtps2ph256_mask(
+      (__v8sf)__A, _MM_FROUND_CUR_DIRECTION,
+      (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+/* Mask form: vcvtps2ph256 builtin with the caller-supplied immediate I; W precedes the mask U. */
+#define _mm256_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+      (__v8hi)(__m128i)(W), (__mmask8)(U)); })
+
+/* Maskz form: vcvtps2ph256 builtin with the caller-supplied immediate I; a zeroed vector precedes U. */
+#define _mm256_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+      (__v8hi)_mm_setzero_si128(), (__mmask8)(U)); })
+
+
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_BOTH
 
 #endif /* __AVX512VLINTRIN_H */
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index 6d1ca54..32e8546 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -35,6 +35,12 @@
 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
 
+/* Unsigned types */
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
+
 /* We need an explicitly signed variant for char. Note that this shouldn't
  * appear in the interface though. */
 typedef signed char __v32qs __attribute__((__vector_size__(32)));
@@ -47,193 +53,703 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
 
 /* Arithmetic */
+/// \brief Adds two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the sums of both
+///    operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_add_pd(__m256d __a, __m256d __b)
 {
-  return __a+__b;
+  return (__m256d)((__v4df)__a+(__v4df)__b);
 }
 
+/// \brief Adds two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the sums of both
+///    operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_add_ps(__m256 __a, __m256 __b)
 {
-  return __a+__b;
+  return (__m256)((__v8sf)__a+(__v8sf)__b);
 }
 
+/// \brief Subtracts two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the differences between
+///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_sub_pd(__m256d __a, __m256d __b)
 {
-  return __a-__b;
+  return (__m256d)((__v4df)__a-(__v4df)__b);
 }
 
+/// \brief Subtracts two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the differences between
+///    both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_sub_ps(__m256 __a, __m256 __b)
 {
-  return __a-__b;
+  return (__m256)((__v8sf)__a-(__v8sf)__b);
 }
 
+/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
+///    two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source operand.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the alternating sums
+///    and differences between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_addsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
+///    two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
+///    differences between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_addsub_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Divides two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the divisor.
+/// \returns A 256-bit vector of [4 x double] containing the quotients of both
+///    operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_div_pd(__m256d __a, __m256d __b)
 {
-  return __a / __b;
+  return (__m256d)((__v4df)__a/(__v4df)__b);
 }
 
+/// \brief Divides two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the divisor.
+/// \returns A 256-bit vector of [8 x float] containing the quotients of both
+///    operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_div_ps(__m256 __a, __m256 __b)
 {
-  return __a / __b;
+  return (__m256)((__v8sf)__a/(__v8sf)__b);
 }
 
+/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the maximum values
+///    between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_max_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the maximum values
+///    between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_max_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the minimum values
+///    between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_min_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the minimum values
+///    between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_min_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Multiplies two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the products of both
+///    operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_mul_pd(__m256d __a, __m256d __b)
 {
-  return __a * __b;
+  return (__m256d)((__v4df)__a * (__v4df)__b);
 }
 
+/// \brief Multiplies two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the products of both
+///    operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_mul_ps(__m256 __a, __m256 __b)
 {
-  return __a * __b;
+  return (__m256)((__v8sf)__a * (__v8sf)__b);
 }
 
+/// \brief Calculates the square roots of the values in a 256-bit vector of
+///    [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the square roots of the
+///    values in the operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_sqrt_pd(__m256d __a)
 {
   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
 }
 
+/// \brief Calculates the square roots of the values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the square roots of the
+///    values in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_sqrt_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
 }
 
+/// \brief Calculates the reciprocal square roots of the values in a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
+///    roots of the values in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_rsqrt_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
 }
 
+/// \brief Calculates the reciprocals of the values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
+///    values in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_rcp_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
 }
 
+/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
+///    by the byte operand. The source values are rounded to integer values and
+///    returned as 64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_round_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used.
+///    1: The PE field is not updated.
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M.
+///    1: Use the current MXCSR setting.
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest.
+///    01: Downward (toward negative infinity).
+///    10: Upward (toward positive infinity).
+///    11: Truncated.
+/// \returns A 256-bit vector of [4 x double] containing the rounded values.
 #define _mm256_round_pd(V, M) __extension__ ({ \
     (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
 
+/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
+///    specified by the byte operand. The source values are rounded to integer
+///    values and returned as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_round_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used.
+///    1: The PE field is not updated.
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M.
+///    1: Use the current MXCSR setting.
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest.
+///    01: Downward (toward negative infinity).
+///    10: Upward (toward positive infinity).
+///    11: Truncated.
+/// \returns A 256-bit vector of [8 x float] containing the rounded values.
 #define _mm256_round_ps(V, M) __extension__ ({ \
   (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
 
+/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
+///    source values are rounded up to integer values and returned as 64-bit
+///    double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_ceil_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+
+/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
+///    The source values are rounded down to integer values and returned as
+///    64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_floor_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded down
+///    values.
 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+
+/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
+///    source values are rounded up to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_ceil_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+
+/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
+///    source values are rounded down to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_floor_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
 
 /* Logical */
+/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_and_pd(__m256d __a, __m256d __b)
 {
-  return (__m256d)((__v4di)__a & (__v4di)__b);
+  return (__m256d)((__v4du)__a & (__v4du)__b);
 }
 
+/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_and_ps(__m256 __a, __m256 __b)
 {
-  return (__m256)((__v8si)__a & (__v8si)__b);
+  return (__m256)((__v8su)__a & (__v8su)__b);
 }
 
+/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values of the second operand and the one's complement of the first
+///    operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_andnot_pd(__m256d __a, __m256d __b)
 {
-  return (__m256d)(~(__v4di)__a & (__v4di)__b);
+  return (__m256d)(~(__v4du)__a & (__v4du)__b);
 }
 
+/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values of the second operand and the one's complement of the first
+///    operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_andnot_ps(__m256 __a, __m256 __b)
 {
-  return (__m256)(~(__v8si)__a & (__v8si)__b);
+  return (__m256)(~(__v8su)__a & (__v8su)__b);
 }
 
+/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
+///    values between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_or_pd(__m256d __a, __m256d __b)
 {
-  return (__m256d)((__v4di)__a | (__v4di)__b);
+  return (__m256d)((__v4du)__a | (__v4du)__b);
 }
 
+/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
+///    values between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_or_ps(__m256 __a, __m256 __b)
 {
-  return (__m256)((__v8si)__a | (__v8si)__b);
+  return (__m256)((__v8su)__a | (__v8su)__b);
 }
 
+/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
+///    values between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_xor_pd(__m256d __a, __m256d __b)
 {
-  return (__m256d)((__v4di)__a ^ (__v4di)__b);
+  return (__m256d)((__v4du)__a ^ (__v4du)__b);
 }
 
+/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
+///    values between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_xor_ps(__m256 __a, __m256 __b)
 {
-  return (__m256)((__v8si)__a ^ (__v8si)__b);
+  return (__m256)((__v8su)__a ^ (__v8su)__b);
 }
 
 /* Horizontal arithmetic */
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal sums of the values are returned in the even-indexed
+///    elements of a vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal sums of the values are returned in the odd-indexed
+///    elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
+///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_hadd_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal sums of the values are returned in the elements with
+///    index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal sums of the values are returned in the elements with
+///    index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
+///    both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_hadd_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    even-indexed elements of a vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    odd-indexed elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal
+///    differences of both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_hsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal
+///    differences of both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_hsub_ps(__m256 __a, __m256 __b)
 {
@@ -241,71 +757,600 @@
 }
 
 /* Vector permutations */
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+///    by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPERMILPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values are to be
+///    copied.
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    Bit [65]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [127:64] of the
+///    returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 static __inline __m128d __DEFAULT_FN_ATTRS
 _mm_permutevar_pd(__m128d __a, __m128i __c)
 {
   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
 }
 
+/// \brief Copies the values in a 256-bit vector of [4 x double] as
+///    specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPERMILPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values are to be
+///    copied.
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    Bit [65]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [127:64] of the
+///    returned vector.
+///    Bit [129]:
+///    0: Bits [191:128] of the source are copied to bits [191:128] of the
+///    returned vector.
+///    1: Bits [255:192] of the source are copied to bits [191:128] of the
+///    returned vector.
+///    Bit [193]:
+///    0: Bits [191:128] of the source are copied to bits [255:192] of the
+///    returned vector.
+///    1: Bits [255:192] of the source are copied to bits [255:192] of the
+///    returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_permutevar_pd(__m256d __a, __m256i __c)
 {
   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
 }
 
+/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
+///    specified by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPERMILPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values are to be
+///    copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    Bits [33:32]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    Bits [65:64]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    Bits [97:96]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [127:96] of the
+///    returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_permutevar_ps(__m128 __a, __m128i __c)
 {
   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
 }
 
+/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
+///    specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPERMILPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values are to be
+///    copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    Bits [33:32]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    Bits [65:64]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    Bits [97:96]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    Bits [129:128]:
+///    00: Bits [159:128] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    Bits [161:160]:
+///    00: Bits [159:128] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    Bits [193:192]:
+///    00: Bits [159:128] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    Bits [225:224]:
+///    00: Bits [159:128] of the source are copied to bits [255:224] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [255:224] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [255:224] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [255:224] of the
+///    returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar_ps(__m256 __a, __m256i __c)
 {
   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
 }
 
+/// \brief Copies the values in a 128-bit vector of [2 x double] as
+///    specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_permute_pd(__m128d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERMILPD instruction.
+///
+/// \param A
+///    A 128-bit vector of [2 x double].
+/// \param C
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bit [0]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [127:64] of the
+///    returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_permute_pd(A, C) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
-                                   (__v2df)_mm_setzero_pd(), \
-                                   (C) & 0x1, ((C) & 0x2) >> 1); })
+                                   (__v2df)_mm_undefined_pd(), \
+                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
 
+/// \brief Copies the values in a 256-bit vector of [4 x double] as
+///    specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute_pd(__m256d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERMILPD instruction.
+///
+/// \param A
+///    A 256-bit vector of [4 x double].
+/// \param C
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bit [0]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [63:0] of the
+///    returned vector.
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of the
+///    returned vector.
+///    1: Bits [127:64] of the source are copied to bits [127:64] of the
+///    returned vector.
+///    Bit [2]:
+///    0: Bits [191:128] of the source are copied to bits [191:128] of the
+///    returned vector.
+///    1: Bits [255:192] of the source are copied to bits [191:128] of the
+///    returned vector.
+///    Bit [3]:
+///    0: Bits [191:128] of the source are copied to bits [255:192] of the
+///    returned vector.
+///    1: Bits [255:192] of the source are copied to bits [255:192] of the
+///    returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute_pd(A, C) __extension__ ({ \
   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
-                                   (__v4df)_mm256_setzero_pd(), \
-                                   (C) & 0x1, ((C) & 0x2) >> 1, \
-                                   2 + (((C) & 0x4) >> 2), \
-                                   2 + (((C) & 0x8) >> 3)); })
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1)); })
 
+/// \brief Copies the values in a 128-bit vector of [4 x float] as
+///    specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_permute_ps(__m128 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERMILPS instruction.
+///
+/// \param A
+///    A 128-bit vector of [4 x float].
+/// \param C
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    Bits [3:2]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    Bits [5:4]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    Bits [7:6]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [127:96] of the
+///    returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_permute_ps(A, C) __extension__ ({ \
   (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
-                                  (__v4sf)_mm_setzero_ps(), \
-                                   (C) & 0x3, ((C) & 0xc) >> 2, \
-                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+                                  (__v4sf)_mm_undefined_ps(), \
+                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
 
+/// \brief Copies the values in a 256-bit vector of [8 x float] as
+///    specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute_ps(__m256 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERMILPS instruction.
+///
+/// \param A
+///    A 256-bit vector of [8 x float].
+/// \param C
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [31:0] of the
+///    returned vector.
+///    Bits [3:2]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [63:32] of the
+///    returned vector.
+///    Bits [5:4]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [95:64] of the
+///    returned vector.
+///    Bits [7:6]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    01: Bits [63:32] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    10: Bits [95:64] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    11: Bits [127:96] of the source are copied to bits [127:96] of the
+///    returned vector.
+///    Bits [1:0] (applied again to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [159:128] of the
+///    returned vector.
+///    Bits [3:2] (applied again to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [191:160] of the
+///    returned vector.
+///    Bits [5:4] (applied again to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [223:192] of the
+///    returned vector.
+///    Bits [7:6] (applied again to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits [255:224] of the
+///    returned vector.
+///    01: Bits [191:160] of the source are copied to bits [255:224] of the
+///    returned vector.
+///    10: Bits [223:192] of the source are copied to bits [255:224] of the
+///    returned vector.
+///    11: Bits [255:224] of the source are copied to bits [255:224] of the
+///    returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute_ps(A, C) __extension__ ({ \
   (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
-                                  (__v8sf)_mm256_setzero_ps(), \
-                                  (C) & 0x3, ((C) & 0xc) >> 2, \
-                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
-                                  4 + (((C) & 0x03) >> 0), \
-                                  4 + (((C) & 0x0c) >> 2), \
-                                  4 + (((C) & 0x30) >> 4), \
-                                  4 + (((C) & 0xc0) >> 6)); })
+                                  (__v8sf)_mm256_undefined_ps(), \
+                                  0 + (((C) >> 0) & 0x3), \
+                                  0 + (((C) >> 2) & 0x3), \
+                                  0 + (((C) >> 4) & 0x3), \
+                                  0 + (((C) >> 6) & 0x3), \
+                                  4 + (((C) >> 0) & 0x3), \
+                                  4 + (((C) >> 2) & 0x3), \
+                                  4 + (((C) >> 4) & 0x3), \
+                                  4 + (((C) >> 6) & 0x3)); })
 
+/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+///    [4 x double], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERM2F128 instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double].
+/// \param V2
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer operand specifying how the values are to be
+///    permuted.
+///    Bits [1:0]:
+///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
+///    destination.
+///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
+///    destination.
+///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
+///    destination.
+///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
+///    destination.
+///    Bits [5:4]:
+///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
+///    destination.
+///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
+///    destination.
+///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
+///    destination.
+///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (M)); })
 
+/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+///    [8 x float], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERM2F128 instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float].
+/// \param V2
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer operand specifying how the values are to be
+///    permuted.
+///    Bits [1:0]:
+///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
+///    destination.
+///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
+///    destination.
+///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
+///    destination.
+///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
+///    destination.
+///    Bits [5:4]:
+///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
+///    destination.
+///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
+///    destination.
+///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
+///    destination.
+///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (M)); })
 
+/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
+///    as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPERM2F128 instruction.
+///
+/// \param V1
+///    A 256-bit integer vector.
+/// \param V2
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bits [1:0]:
+///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
+///    destination.
+///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
+///    destination.
+///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
+///    destination.
+///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
+///    destination.
+///    Bits [5:4]:
+///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
+///    destination.
+///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
+///    destination.
+///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
+///    destination.
+///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit integer vector containing the copied values.
 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
   (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (M)); })
 
 /* Vector Blend */
+/// \brief Merges 64-bit double-precision data values stored in either of the
+///    two 256-bit vectors of [4 x double], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double].
+/// \param V2
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer operand, with mask bits [3:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand V1 is copied to the same position in the destination.
+///    When a mask bit is 1, the corresponding 64-bit element in operand V2 is
+///    copied to the same position in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                    (__v4df)(__m256d)(V2), \
@@ -314,6 +1359,30 @@
                                    (((M) & 0x04) ? 6 : 2), \
                                    (((M) & 0x08) ? 7 : 3)); })
 
+/// \brief Merges 32-bit single-precision data values stored in either of the
+///    two 256-bit vectors of [8 x float], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VBLENDPS / BLENDPS instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float].
+/// \param V2
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer operand, with mask bits [7:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
+///    element in operand V1 is copied to the same position in the destination.
+///    When a mask bit is 1, the corresponding 32-bit element in operand V2 is
+///    copied to the same position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                   (__v8sf)(__m256)(V2), \
@@ -326,6 +1395,27 @@
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })
 
+/// \brief Merges 64-bit double-precision data values stored in either of the
+///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 64-bit element in operand __a is copied to the same
+///    position in the destination. When a mask bit is 1, the corresponding
+///    64-bit element in operand __b is copied to the same position in the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
 {
@@ -333,6 +1423,27 @@
     (__v4df)__a, (__v4df)__b, (__v4df)__c);
 }
 
+/// \brief Merges 32-bit single-precision data values stored in either of the
+///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
+///    and 31 specifying how the values are to be copied. The position of the
+///    mask bit corresponds to the most significant bit of a copied value. When
+///    a mask bit is 0, the corresponding 32-bit element in operand __a is
+///    copied to the same position in the destination. When a mask bit is 1, the
+///    corresponding 32-bit element in operand __b is copied to the same
+///    position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 {
@@ -341,30 +1452,154 @@
 }
 
 /* Vector Dot Product */
+/// \brief Computes two dot products in parallel, using the lower and upper
+///    halves of two [8 x float] vectors as input to the two computations, and
+///    returning the two dot products in the lower and upper halves of the
+///    [8 x float] result. The immediate integer operand controls which
+///    input elements will contribute to the dot product, and where the final
+///    results are returned. In general, for each dot product, the four
+///    corresponding elements of the input vectors are multiplied; the first
+///    two and second two products are summed, then the two sums are added to
+///    form the final result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
+///
+/// \param V1
+///    A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param V2
+///    A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param M
+///    An immediate integer argument. Bits [7:4] determine which elements of
+///    the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [7] corresponding to the highest element of each [4 x
+///    float] subvector. If a bit is set, the corresponding elements from the
+///    two input vectors are used as an input for dot product; otherwise that
+///    input is treated as zero. Bits [3:0] determine which elements of the
+///    result will receive a copy of the final dot product, with bit [0]
+///    corresponding to the lowest element and bit [3] corresponding to the
+///    highest element of each [4 x float] subvector. If a bit is set, the dot
+///    product is returned in the corresponding element; otherwise that element
+///    is set to zero. The bitmask is applied in the same way to each of the
+///    two parallel dot product computations.
+/// \returns A 256-bit vector of [8 x float] containing the two dot products.
 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)); })
 
 /* Vector shuffle */
+/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
+///    specified by the immediate value operand. The four selected elements in
+///    each operand are copied to the destination according to the bits
+///    specified in the immediate operand. The selected elements from the first
+///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
+///    destination, and the selected elements from the second 256-bit operand
+///    are copied to bits [127:64] and bits [255:192] of the destination. For
+///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
+///    the 256-bit destination vector would contain the following values: b[7],
+///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float]. The four selected elements in this
+///    operand are copied to bits [63:0] and bits [191:128] in the destination,
+///    according to the bits specified in the immediate operand.
+/// \param b
+///    A 256-bit vector of [8 x float]. The four selected elements in this
+///    operand are copied to bits [127:64] and bits [255:192] in the
+///    destination, according to the bits specified in the immediate operand.
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a and b. Bits [3:0] specify the values copied from operand a.
+///    Bits [7:4] specify the values copied from operand b.
+///    The destinations within the 256-bit destination are assigned values as
+///    follows, according to the bit value assignments described below:
+///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
+///    destination.
+///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
+///    destination.
+///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
+///    destination.
+///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
+///    the destination.
+///    Bit value assignments:
+///    00: Bits [31:0] and [159:128] are copied from the selected operand.
+///    01: Bits [63:32] and [191:160] are copied from the selected operand.
+///    10: Bits [95:64] and [223:192] are copied from the selected operand.
+///    11: Bits [127:96] and [255:224] are copied from the selected operand.
+/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
-        (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
-                                        (__v8sf)(__m256)(b), \
-                                        (mask) & 0x3, \
-                                        ((mask) & 0xc) >> 2, \
-                                        (((mask) & 0x30) >> 4) + 8, \
-                                        (((mask) & 0xc0) >> 6) + 8, \
-                                        ((mask) & 0x3) + 4, \
-                                        (((mask) & 0xc) >> 2) + 4, \
-                                        (((mask) & 0x30) >> 4) + 12, \
-                                        (((mask) & 0xc0) >> 6) + 12); })
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), \
+                                  0  + (((mask) >> 0) & 0x3), \
+                                  0  + (((mask) >> 2) & 0x3), \
+                                  8  + (((mask) >> 4) & 0x3), \
+                                  8  + (((mask) >> 6) & 0x3), \
+                                  4  + (((mask) >> 0) & 0x3), \
+                                  4  + (((mask) >> 2) & 0x3), \
+                                  12 + (((mask) >> 4) & 0x3), \
+                                  12 + (((mask) >> 6) & 0x3)); })
 
+/// \brief Selects four double-precision values from the 256-bit operands of
+///    [4 x double], as specified by the immediate value operand. The selected
+///    elements from the first 256-bit operand are copied to bits [63:0] and
+///    bits [191:128] in the destination, and the selected elements from the
+///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
+///    the destination. For example, if bits [3:0] of the immediate operand
+///    contain a value of 0xF, the 256-bit destination vector would contain the
+///    following values: b[3], a[3], b[1], a[1].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements
+///    to copy from a and b:
+///    Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
+///    destination.
+///    Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
+///    destination.
+///    Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
+///    destination.
+///    Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
+///    destination.
+///    Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
+///    destination.
+///    Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
+///    destination.
+///    Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
+///    destination.
+///    Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
-        (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
-                                         (__v4df)(__m256d)(b), \
-                                         (mask) & 0x1, \
-                                         (((mask) & 0x2) >> 1) + 4, \
-                                         (((mask) & 0x4) >> 2) + 2, \
-                                         (((mask) & 0x8) >> 3) + 6); })
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), \
+                                   0 + (((mask) >> 0) & 0x1), \
+                                   4 + (((mask) >> 1) & 0x1), \
+                                   2 + (((mask) >> 2) & 0x1), \
+                                   6 + (((mask) >> 3) & 0x1)); })
 
 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@@ -400,30 +1635,235 @@
 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
 
+/// \brief Compares each of the corresponding double-precision values of two
+///    128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand. Returns a [2 x double] vector consisting of
+///    two doubles corresponding to the two comparison results: zero if the
+///    comparison is false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Encodings sharing a description differ only in ordered/unordered and
+///    signaling/non-signaling behavior; see the _CMP_* macros.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)); })
 
+/// \brief Compares each of the corresponding values of two 128-bit vectors of
+///    [4 x float], using the operation specified by the immediate integer
+///    operand. Returns a [4 x float] vector consisting of four floats
+///    corresponding to the four comparison results: zero if the comparison is
+///    false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Encodings sharing a description differ only in ordered/unordered and
+///    signaling/non-signaling behavior; see the _CMP_* macros.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)); })
 
+/// \brief Compares each of the corresponding double-precision values of two
+///    256-bit vectors of [4 x double], using the operation specified by the
+///    immediate integer operand. Returns a [4 x double] vector consisting of
+///    four doubles corresponding to the four comparison results: zero if the
+///    comparison is false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Encodings sharing a description differ only in ordered/unordered and
+///    signaling/non-signaling behavior; see the _CMP_* macros.
+/// \returns A 256-bit vector of [4 x double] containing the comparison results.
 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)); })
 
+/// \brief Compares each of the corresponding values of two 256-bit vectors of
+///    [8 x float], using the operation specified by the immediate integer
+///    operand. Returns a [8 x float] vector consisting of eight floats
+///    corresponding to the eight comparison results: zero if the comparison is
+///    false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float].
+/// \param b
+///    A 256-bit vector of [8 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Encodings sharing a description differ only in ordered/unordered and
+///    signaling/non-signaling behavior; see the _CMP_* macros.
+/// \returns A 256-bit vector of [8 x float] containing the comparison results.
 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)); })
 
+/// \brief Compares the scalar double-precision values in the low-order 64 bits
+///    of two 128-bit vectors of [2 x double], using the operation specified by
+///    the immediate integer operand. If the result is true, the low-order 64
+///    bits of the destination vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Encodings sharing a description differ only in ordered/unordered and
+///    signaling/non-signaling behavior; see the _CMP_* macros.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)); })
 
+/// \brief Compares the scalar float values in the low-order 32 bits of two
+///    128-bit vectors of [4 x float], using the operation specified by the
+///    immediate integer operand. If the result is true, the low-order 32 bits
+///    of the destination vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Encodings sharing a description differ only in ordered/unordered and
+///    signaling/non-signaling behavior; see the _CMP_* macros.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)); })
 
+/// \brief Takes a [8 x i32] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
+///    EXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x i32].
+/// \param __imm
+///    An immediate integer operand with bits [2:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of extended
+///    packed data.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi32(__m256i __a, const int __imm)
 {
@@ -431,21 +1871,66 @@
   return __b[__imm & 7];
 }
 
+/// \brief Takes a [16 x i16] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
+///    EXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [16 x i16].
+/// \param __imm
+///    An immediate integer operand with bits [3:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 16 bits of zero-extended
+///    packed data.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi16(__m256i __a, const int __imm)
 {
   __v16hi __b = (__v16hi)__a;
-  return __b[__imm & 15];
+  return (unsigned short)__b[__imm & 15];
 }
 
+/// \brief Takes a [32 x i8] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
+///    EXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [32 x i8].
+/// \param __imm
+///    An immediate integer operand with bits [4:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 8 bits of zero-extended
+///    packed data.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi8(__m256i __a, const int __imm)
 {
   __v32qi __b = (__v32qi)__a;
-  return __b[__imm & 31];
+  return (unsigned char)__b[__imm & 31];
 }
 
 #ifdef __x86_64__
+/// \brief Takes a [4 x i64] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
+///    EXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [4 x i64].
+/// \param __imm
+///    An immediate integer operand with bits [1:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 64-bit integer containing the extracted 64 bits of extended
+///    packed data.
 static __inline long long  __DEFAULT_FN_ATTRS
 _mm256_extract_epi64(__m256i __a, const int __imm)
 {
@@ -454,6 +1939,24 @@
 }
 #endif
 
+/// \brief Takes a [8 x i32] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
+///    INSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A vector of [8 x i32] to be used by the insert operation.
+/// \param __b
+///    An integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector __a, after replacing its element indexed by __imm
+///     with __b.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
 {
@@ -462,6 +1965,25 @@
   return (__m256i)__c;
 }
 
+
+/// \brief Takes a [16 x i16] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
+///    INSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A vector of [16 x i16] to be used by the insert operation.
+/// \param __b
+///    An i16 integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector __a, after replacing its element indexed by __imm
+///     with __b.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
 {
@@ -470,6 +1992,24 @@
   return (__m256i)__c;
 }
 
+/// \brief Takes a [32 x i8] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
+///    INSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A vector of [32 x i8] to be used by the insert operation.
+/// \param __b
+///    An i8 integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector __a, after replacing its element indexed by __imm
+///    with __b.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
 {
@@ -479,6 +2019,24 @@
 }
 
 #ifdef __x86_64__
+/// \brief Takes a [4 x i64] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
+///    INSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A vector of [4 x i64] to be used by the insert operation.
+/// \param __b
+///    A 64-bit integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector __a, after replacing its element indexed by __imm
+///     with __b.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
 {
@@ -489,24 +2047,61 @@
 #endif
 
 /* Conversion */
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_pd(__m128i __a)
 {
-  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
+  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
 }
 
+/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit vector of [8 x float] containing the converted values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_ps(__m256i __a)
 {
   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
 }
 
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_cvtpd_ps(__m256d __a)
 {
   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
 }
 
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtps_epi32(__m256 __a)
 {
@@ -516,7 +2111,7 @@
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtps_pd(__m128 __a)
 {
-  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
+  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
 }
 
 static __inline __m128i __DEFAULT_FN_ATTRS
@@ -537,48 +2132,67 @@
   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
 }
 
+static __inline double __DEFAULT_FN_ATTRS
+_mm256_cvtsd_f64(__m256d __a)
+{
+ return __a[0];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_cvtsi256_si32(__m256i __a)
+{
+ __v8si __b = (__v8si)__a;
+ return __b[0];
+}
+
+static __inline float __DEFAULT_FN_ATTRS
+_mm256_cvtss_f32(__m256 __a)
+{
+ return __a[0];
+}
+
 /* Vector replicate */
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_movehdup_ps(__m256 __a)
 {
-  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_moveldup_ps(__m256 __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
 }
 
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_movedup_pd(__m256d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
 }
 
 /* Unpack and Interleave */
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
 {
-  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
 }
 
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
 {
-  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
 {
-  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
 {
-  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
 }
 
 /* Bit Test */
@@ -723,13 +2337,13 @@
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_pd(__m128d const *__a)
 {
-  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ps(__m128 const *__a)
 {
-  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
 }
 
 /* SIMD load ops */
@@ -800,13 +2414,19 @@
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_pd(double *__p, __m256d __a)
 {
-  __builtin_ia32_storeupd256(__p, (__v4df)__a);
+  struct __storeu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__p)->__v = __a;
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_ps(float *__p, __m256 __a)
 {
-  __builtin_ia32_storeups256(__p, (__v8sf)__a);
+  struct __storeu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__p)->__v = __a;
 }
 
 static __inline void __DEFAULT_FN_ATTRS
@@ -818,7 +2438,10 @@
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_si256(__m256i *__p, __m256i __a)
 {
-  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
+  struct __storeu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si256*)__p)->__v = __a;
 }
 
 /* Conditional load ops */
@@ -876,36 +2499,36 @@
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
-  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
-  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
-  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
 }
 
 /* Create vectors */
 static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_undefined_pd()
+_mm256_undefined_pd(void)
 {
   return (__m256d)__builtin_ia32_undef256();
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_undefined_ps()
+_mm256_undefined_ps(void)
 {
   return (__m256)__builtin_ia32_undef256();
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_undefined_si256()
+_mm256_undefined_si256(void)
 {
   return (__m256i)__builtin_ia32_undef256();
 }
@@ -1117,37 +2740,37 @@
 static __inline __m128d __DEFAULT_FN_ATTRS
 _mm256_castpd256_pd128(__m256d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1);
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
 }
 
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_castps256_ps128(__m256 __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
 }
 
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_castsi256_si128(__m256i __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1);
+  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
 }
 
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castpd128_pd256(__m128d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castps128_ps256(__m128 __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castsi128_si256(__m128i __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
 }
 
 /*
@@ -1194,7 +2817,7 @@
 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
   (__m128)__builtin_shufflevector( \
     (__v8sf)(__m256)(V), \
-    (__v8sf)(_mm256_setzero_ps()), \
+    (__v8sf)(_mm256_undefined_ps()), \
     (((M) & 1) ? 4 : 0), \
     (((M) & 1) ? 5 : 1), \
     (((M) & 1) ? 6 : 2), \
@@ -1203,14 +2826,14 @@
 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
   (__m128d)__builtin_shufflevector( \
     (__v4df)(__m256d)(V), \
-    (__v4df)(_mm256_setzero_pd()), \
+    (__v4df)(_mm256_undefined_pd()), \
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
 
 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector( \
     (__v4di)(__m256i)(V), \
-    (__v4di)(_mm256_setzero_si256()), \
+    (__v4di)(_mm256_undefined_si256()), \
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
 
@@ -1218,35 +2841,22 @@
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
 {
-  struct __loadu_ps {
-    __m128 __v;
-  } __attribute__((__packed__, __may_alias__));
-
-  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
-  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
+  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
+  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
 }
 
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
 {
-  struct __loadu_pd {
-    __m128d __v;
-  } __attribute__((__packed__, __may_alias__));
-
-  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
-  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
+  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
+  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
 }
 
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
 {
-  struct __loadu_si128 {
-    __m128i __v;
-  } __attribute__((__packed__, __may_alias__));
-  __m256i __v256 = _mm256_castsi128_si256(
-    ((struct __loadu_si128*)__addr_lo)->__v);
-  return _mm256_insertf128_si256(__v256,
-                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
+  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
+  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
 }
 
 /* SIMD store ops (unaligned) */
@@ -1256,9 +2866,9 @@
   __m128 __v128;
 
   __v128 = _mm256_castps256_ps128(__a);
-  __builtin_ia32_storeups(__addr_lo, __v128);
+  _mm_storeu_ps(__addr_lo, __v128);
   __v128 = _mm256_extractf128_ps(__a, 1);
-  __builtin_ia32_storeups(__addr_hi, __v128);
+  _mm_storeu_ps(__addr_hi, __v128);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
@@ -1267,9 +2877,9 @@
   __m128d __v128;
 
   __v128 = _mm256_castpd256_pd128(__a);
-  __builtin_ia32_storeupd(__addr_lo, __v128);
+  _mm_storeu_pd(__addr_lo, __v128);
   __v128 = _mm256_extractf128_pd(__a, 1);
-  __builtin_ia32_storeupd(__addr_hi, __v128);
+  _mm_storeu_pd(__addr_hi, __v128);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
@@ -1278,14 +2888,14 @@
   __m128i __v128;
 
   __v128 = _mm256_castsi256_si128(__a);
-  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
+  _mm_storeu_si128(__addr_lo, __v128);
   __v128 = _mm256_extractf128_si256(__a, 1);
-  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
+  _mm_storeu_si128(__addr_hi, __v128);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set_m128 (__m128 __hi, __m128 __lo) {
-  return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
+  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m256d __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h
index da98792..30acfae 100644
--- a/lib/Headers/bmiintrin.h
+++ b/lib/Headers/bmiintrin.h
@@ -28,12 +28,107 @@
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _tzcnt_u16(unsigned short a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param a
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
 #define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _andn_u32(unsigned int a, unsigned int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param a
+///    An unsigned integer containing one of the operands.
+/// \param b
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
 #define _andn_u32(a, b)   (__andn_u32((a), (b)))
+
 /* _bextr_u32 != __bextr_u32 */
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsi_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param a
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
 #define _blsi_u32(a)      (__blsi_u32((a)))
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsmsk_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param a
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
 #define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsr_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param a
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
 #define _blsr_u32(a)      (__blsr_u32((a)))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _tzcnt_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param a
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
 #define _tzcnt_u32(a)     (__tzcnt_u32((a)))
 
 /* Define the default attributes for the functions in this file. */
@@ -44,12 +139,35 @@
    to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
 static __inline__ unsigned short __RELAXED_FN_ATTRS
 __tzcnt_u16(unsigned short __X)
 {
   return __X ? __builtin_ctzs(__X) : 16;
 }
 
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __andn_u32(unsigned int __X, unsigned int __Y)
 {
@@ -57,6 +175,21 @@
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
+///    specify the index of the least significant bit. Bits [15:8] specify the
+///    number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
@@ -64,45 +197,214 @@
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsi_u32(unsigned int __X)
 {
   return __X & -__X;
 }
 
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsmsk_u32(unsigned int __X)
 {
   return __X ^ (__X - 1);
 }
 
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsr_u32(unsigned int __X)
 {
   return __X & (__X - 1);
 }
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
 static __inline__ unsigned int __RELAXED_FN_ATTRS
 __tzcnt_u32(unsigned int __X)
 {
   return __X ? __builtin_ctz(__X) : 32;
 }
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns A 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
 #ifdef __x86_64__
 
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param b
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
 #define _andn_u64(a, b)   (__andn_u64((a), (b)))
+
 /* _bextr_u64 != __bextr_u64 */
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsi_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
 #define _blsi_u64(a)      (__blsi_u64((a)))
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsmsk_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
 #define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsr_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
 #define _blsr_u64(a)      (__blsr_u64((a)))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _tzcnt_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
 #define _tzcnt_u64(a)     (__tzcnt_u64((a)))
 
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
@@ -110,6 +412,21 @@
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
+///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
+///    the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
@@ -117,36 +434,112 @@
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsi_u64(unsigned long long __X)
 {
   return __X & -__X;
 }
 
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsmsk_u64(unsigned long long __X)
 {
   return __X ^ (__X - 1);
 }
 
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsr_u64(unsigned long long __X)
 {
   return __X & (__X - 1);
 }
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
 static __inline__ unsigned long long __RELAXED_FN_ATTRS
 __tzcnt_u64(unsigned long long __X)
 {
   return __X ? __builtin_ctzll(__X) : 64;
 }
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns A 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
 #endif /* __x86_64__ */
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/clflushoptintrin.h b/lib/Headers/clflushoptintrin.h
new file mode 100644
index 0000000..60e0ead
--- /dev/null
+++ b/lib/Headers/clflushoptintrin.h
@@ -0,0 +1,41 @@
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLFLUSHOPTINTRIN_H
+#define __CLFLUSHOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflushopt(char * __m) {
+  __builtin_ia32_clflushopt(__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/lib/Headers/cpuid.h b/lib/Headers/cpuid.h
index 5da02e0..400dcfa 100644
--- a/lib/Headers/cpuid.h
+++ b/lib/Headers/cpuid.h
@@ -82,6 +82,7 @@
 /* Features in %ecx for level 1 */
 #define bit_SSE3        0x00000001
 #define bit_PCLMULQDQ   0x00000002
+#define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
 #define bit_DTES64      0x00000004
 #define bit_MONITOR     0x00000008
 #define bit_DSCPL       0x00000010
@@ -98,15 +99,19 @@
 #define bit_PCID        0x00020000
 #define bit_DCA         0x00040000
 #define bit_SSE41       0x00080000
+#define bit_SSE4_1      bit_SSE41       /* for gcc compat */
 #define bit_SSE42       0x00100000
+#define bit_SSE4_2      bit_SSE42       /* for gcc compat */
 #define bit_x2APIC      0x00200000
 #define bit_MOVBE       0x00400000
 #define bit_POPCNT      0x00800000
 #define bit_TSCDeadline 0x01000000
 #define bit_AESNI       0x02000000
+#define bit_AES         bit_AESNI       /* for gcc compat */
 #define bit_XSAVE       0x04000000
 #define bit_OSXSAVE     0x08000000
 #define bit_AVX         0x10000000
+#define bit_F16C        0x20000000
 #define bit_RDRND       0x40000000
 
 /* Features in %edx for level 1 */
@@ -119,6 +124,7 @@
 #define bit_PAE         0x00000040
 #define bit_MCE         0x00000080
 #define bit_CX8         0x00000100
+#define bit_CMPXCHG8B   bit_CX8         /* for gcc compat */
 #define bit_APIC        0x00000200
 #define bit_SEP         0x00000800
 #define bit_MTRR        0x00001000
@@ -133,7 +139,7 @@
 #define bit_ACPI        0x00400000
 #define bit_MMX         0x00800000
 #define bit_FXSR        0x01000000
-#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_FXSAVE      bit_FXSR        /* for gcc compat */
 #define bit_SSE         0x02000000
 #define bit_SSE2        0x04000000
 #define bit_SS          0x08000000
diff --git a/lib/Headers/cuda_builtin_vars.h b/lib/Headers/cuda_builtin_vars.h
index 901356b..6f5eb9c 100644
--- a/lib/Headers/cuda_builtin_vars.h
+++ b/lib/Headers/cuda_builtin_vars.h
@@ -24,16 +24,20 @@
 #ifndef __CUDA_BUILTIN_VARS_H
 #define __CUDA_BUILTIN_VARS_H
 
+// Forward declares from vector_types.h.
+struct uint3;
+struct dim3;
+
 // The file implements built-in CUDA variables using __declspec(property).
 // https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
 // All read accesses of built-in variable fields get converted into calls to a
-// getter function which in turn would call appropriate builtin to fetch the
+// getter function which in turn calls the appropriate builtin to fetch the
 // value.
 //
 // Example:
 //    int x = threadIdx.x;
 // IR output:
-//  %0 = call i32 @llvm.ptx.read.tid.x() #3
+//  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
 // PTX output:
 //  mov.u32     %r2, %tid.x;
 
@@ -60,33 +64,45 @@
   __attribute__((device)) TypeName *operator&() const __DELETE
 
 struct __cuda_builtin_threadIdx_t {
-  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x());
-  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y());
-  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z());
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
+  // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
 private:
   __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
 };
 
 struct __cuda_builtin_blockIdx_t {
-  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x());
-  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y());
-  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z());
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
+  // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
 private:
   __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
 };
 
 struct __cuda_builtin_blockDim_t {
-  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x());
-  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y());
-  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z());
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
+  // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
 private:
   __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
 };
 
 struct __cuda_builtin_gridDim_t {
-  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x());
-  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y());
-  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z());
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
+  // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
 private:
   __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
 };
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index cfc2c71..1590138 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -35,6 +35,11 @@
 typedef short __v8hi __attribute__((__vector_size__(16)));
 typedef char __v16qi __attribute__((__vector_size__(16)));
 
+/* Unsigned types */
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
+typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
+
 /* We need an explicitly signed variant for char. Note that this shouldn't
  * appear in the interface though. */
 typedef signed char __v16qs __attribute__((__vector_size__(16)));
@@ -54,7 +59,7 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_pd(__m128d __a, __m128d __b)
 {
-  return __a + __b;
+  return (__m128d)((__v2df)__a + (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -67,7 +72,7 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_pd(__m128d __a, __m128d __b)
 {
-  return __a - __b;
+  return (__m128d)((__v2df)__a - (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -80,7 +85,7 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_pd(__m128d __a, __m128d __b)
 {
-  return __a * __b;
+  return (__m128d)((__v2df)__a * (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -93,325 +98,326 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_pd(__m128d __a, __m128d __b)
 {
-  return __a / __b;
+  return (__m128d)((__v2df)__a / (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_sd(__m128d __a, __m128d __b)
 {
-  __m128d __c = __builtin_ia32_sqrtsd(__b);
+  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
   return (__m128d) { __c[0], __a[1] };
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_pd(__m128d __a)
 {
-  return __builtin_ia32_sqrtpd(__a);
+  return __builtin_ia32_sqrtpd((__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_minsd(__a, __b);
+  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_minpd(__a, __b);
+  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_maxsd(__a, __b);
+  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_maxpd(__a, __b);
+  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_and_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)((__v4si)__a & (__v4si)__b);
+  return (__m128d)((__v4su)__a & (__v4su)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_andnot_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)(~(__v4si)__a & (__v4si)__b);
+  return (__m128d)(~(__v4su)__a & (__v4su)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_or_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)((__v4si)__a | (__v4si)__b);
+  return (__m128d)((__v4su)__a | (__v4su)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_xor_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)((__v4si)__a ^ (__v4si)__b);
+  return (__m128d)((__v4su)__a ^ (__v4su)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
+  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
+  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
+  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
+  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
+  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
+  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_sd(__m128d __a, __m128d __b)
 {
-  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
+  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
   return (__m128d) { __c[0], __a[1] };
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_sd(__m128d __a, __m128d __b)
 {
-  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
+  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
   return (__m128d) { __c[0], __a[1] };
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
+  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_sd(__m128d __a, __m128d __b)
 {
-  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
+  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
   return (__m128d) { __c[0], __a[1] };
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_sd(__m128d __a, __m128d __b)
 {
-  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
+  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
   return (__m128d) { __c[0], __a[1] };
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_comisdeq(__a, __b);
+  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_comisdlt(__a, __b);
+  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_comisdle(__a, __b);
+  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_comisdgt(__a, __b);
+  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_comisdge(__a, __b);
+  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_comisdneq(__a, __b);
+  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_ucomisdeq(__a, __b);
+  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_ucomisdlt(__a, __b);
+  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_ucomisdle(__a, __b);
+  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_ucomisdgt(__a, __b);
+  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_ucomisdge(__a, __b);
+  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_sd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_ucomisdneq(__a, __b);
+  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpd_ps(__m128d __a)
 {
-  return __builtin_ia32_cvtpd2ps(__a);
+  return __builtin_ia32_cvtpd2ps((__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtps_pd(__m128 __a)
 {
-  return __builtin_ia32_cvtps2pd(__a);
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtepi32_pd(__m128i __a)
 {
-  return __builtin_ia32_cvtdq2pd((__v4si)__a);
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtpd_epi32(__m128d __a)
 {
-  return __builtin_ia32_cvtpd2dq(__a);
+  return __builtin_ia32_cvtpd2dq((__v2df)__a);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsd_si32(__m128d __a)
 {
-  return __builtin_ia32_cvtsd2si(__a);
+  return __builtin_ia32_cvtsd2si((__v2df)__a);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsd_ss(__m128 __a, __m128d __b)
 {
-  __a[0] = __b[0];
-  return __a;
+  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -431,25 +437,25 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttpd_epi32(__m128d __a)
 {
-  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
+  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
 }
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttsd_si32(__m128d __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttsd2si((__v2df)__a);
 }
 
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtpd_pi32(__m128d __a)
 {
-  return (__m64)__builtin_ia32_cvtpd2pi(__a);
+  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
 }
 
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttpd_pi32(__m128d __a)
 {
-  return (__m64)__builtin_ia32_cvttpd2pi(__a);
+  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -486,7 +492,7 @@
 _mm_loadr_pd(double const *__dp)
 {
   __m128d __u = *(__m128d*)__dp;
-  return __builtin_shufflevector(__u, __u, 1, 0);
+  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -498,6 +504,16 @@
   return ((struct __loadu_pd*)__dp)->__v;
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si64(void const *__a)
+{
+  struct __loadu_si64 {
+    long long __v;
+  } __attribute__((__packed__, __may_alias__));
+  long long __u = ((struct __loadu_si64*)__a)->__v;
+  return (__m128i){__u, 0L};
+}
+
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_sd(double const *__dp)
 {
@@ -529,7 +545,7 @@
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_undefined_pd()
+_mm_undefined_pd(void)
 {
   return (__m128d)__builtin_ia32_undef128();
 }
@@ -580,31 +596,37 @@
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store1_pd(double *__dp, __m128d __a)
+_mm_store_pd(double *__dp, __m128d __a)
 {
-  struct __mm_store1_pd_struct {
-    double __u[2];
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
-  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
+  *(__m128d*)__dp = __a;
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_pd(double *__dp, __m128d __a)
+_mm_store1_pd(double *__dp, __m128d __a)
 {
-  *(__m128d *)__dp = __a;
+  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+  _mm_store_pd(__dp, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd1(double *__dp, __m128d __a)
+{
+  _mm_store1_pd(__dp, __a); /* alias of _mm_store1_pd; ISO C forbids 'return expr;' in a void function */
+}
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_pd(double *__dp, __m128d __a)
 {
-  __builtin_ia32_storeupd(__dp, __a);
+  struct __storeu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__dp)->__v = __a;
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_pd(double *__dp, __m128d __a)
 {
-  __a = __builtin_shufflevector(__a, __a, 1, 0);
+  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
   *(__m128d *)__dp = __a;
 }
 
@@ -629,31 +651,31 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi8(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v16qi)__a + (__v16qi)__b);
+  return (__m128i)((__v16qu)__a + (__v16qu)__b);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi16(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v8hi)__a + (__v8hi)__b);
+  return (__m128i)((__v8hu)__a + (__v8hu)__b);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi32(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v4si)__a + (__v4si)__b);
+  return (__m128i)((__v4su)__a + (__v4su)__b);
 }
 
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_si64(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_paddq(__a, __b);
+  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi64(__m128i __a, __m128i __b)
 {
-  return __a + __b;
+  return (__m128i)((__v2du)__a + (__v2du)__b);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -734,268 +756,792 @@
   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
+///    returns a vector containing the low-order 16 bits of each 32-bit product
+///    in the corresponding element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi16(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v8hi)__a * (__v8hi)__b);
+  return (__m128i)((__v8hu)__a * (__v8hu)__b);
 }
 
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
+///    of the two 64-bit integer vectors and returns the 64-bit unsigned
+///    product.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULUDQ instruction.
+///
+/// \param __a
+///    A 64-bit integer containing one of the source operands.
+/// \param __b
+///    A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer vector containing the product of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mul_su32(__m64 __a, __m64 __b)
 {
   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower
+///    bits of the corresponding elements of two [2 x i64] vectors, and returns
+///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
+///
+/// \param __a
+///    A [2 x i64] vector containing one of the source operands.
+/// \param __b
+///    A [2 x i64] vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the product of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epu32(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Computes the absolute differences of corresponding 8-bit integer
+///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
+///    separately sums the second 8 absolute differences. Packs these two
+///    unsigned 16-bit integer sums into the upper and lower elements of a
+///    [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the sums of the sets of absolute
+///    differences between both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sad_epu8(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts the corresponding 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi8(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v16qi)__a - (__v16qi)__b);
+  return (__m128i)((__v16qu)__a - (__v16qu)__b);
 }
 
+/// \brief Subtracts the corresponding 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi16(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v8hi)__a - (__v8hi)__b);
+  return (__m128i)((__v8hu)__a - (__v8hu)__b);
 }
 
+/// \brief Subtracts the corresponding 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi32(__m128i __a, __m128i __b)
 {
-  return (__m128i)((__v4si)__a - (__v4si)__b);
+  return (__m128i)((__v4su)__a - (__v4su)__b);
 }
 
+/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
+///    difference to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBQ instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the minuend.
+/// \param __b
+///    A 64-bit integer vector containing the subtrahend.
+/// \returns A 64-bit integer vector containing the difference of the values in
+///    the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_si64(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psubq(__a, __b);
+  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
 }
 
+/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi64(__m128i __a, __m128i __b)
 {
-  return __a - __b;
+  return (__m128i)((__v2du)__a - (__v2du)__b);
 }
 
+/// \brief Subtracts corresponding 8-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7Fh are saturated to 7Fh, and differences less
+///    than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts corresponding 16-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7FFFh are saturated to 7FFFh, and differences
+///    less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 00h are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 0000h are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Performs a bitwise AND of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPAND / PAND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise AND of the values
+///    in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_and_si128(__m128i __a, __m128i __b)
 {
-  return __a & __b;
+  return (__m128i)((__v2du)__a & (__v2du)__b);
 }
 
+/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
+///    one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
+///
+/// \param __a
+///    A 128-bit vector containing the left source operand. The one's complement
+///    of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the one's
+///    complement of the first operand and the values in the second operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_andnot_si128(__m128i __a, __m128i __b)
 {
-  return ~__a & __b;
+  return (__m128i)(~(__v2du)__a & (__v2du)__b);
 }
-
+/// \brief Performs a bitwise OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPOR / POR instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise OR of the values
+///    in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_or_si128(__m128i __a, __m128i __b)
 {
-  return __a | __b;
+  return (__m128i)((__v2du)__a | (__v2du)__b);
 }
 
+/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
+///    values in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_xor_si128(__m128i __a, __m128i __b)
 {
-  return __a ^ __b;
+  return (__m128i)((__v2du)__a ^ (__v2du)__b);
 }
 
-#define _mm_slli_si128(a, imm) __extension__ ({                         \
-  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
-                                   (__v16qi)(__m128i)(a),               \
-                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
-                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
+/// \brief Left-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_slli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to left-shift
+///    operand a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
+#define _mm_slli_si128(a, imm) __extension__ ({                              \
+  (__m128i)__builtin_shufflevector(                                          \
+                                 (__v16qi)_mm_setzero_si128(),               \
+                                 (__v16qi)(__m128i)(a),                      \
+                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
 
 #define _mm_bslli_si128(a, imm) \
   _mm_slli_si128((a), (imm))
 
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
 }
 
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi64(__m128i __a, int __count)
 {
-  return __builtin_ia32_psllqi128(__a, __count);
+  return __builtin_ia32_psllqi128((__v2di)__a, __count);
 }
 
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi64(__m128i __a, __m128i __count)
 {
-  return __builtin_ia32_psllq128(__a, __count);
+  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
 }
 
-#define _mm_srli_si128(a, imm) __extension__ ({                          \
-  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
-                                   (__v16qi)_mm_setzero_si128(),         \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9,  \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
-                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
+/// \brief Right-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_srli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to right-shift operand
+///    a. If any of bits [7:4] of imm is set, the result is all zeros.
+/// \returns A 128-bit integer vector containing the right-shifted value.
+#define _mm_srli_si128(a, imm) __extension__ ({                              \
+  (__m128i)__builtin_shufflevector(                                          \
+                                 (__v16qi)(__m128i)(a),                      \
+                                 (__v16qi)_mm_setzero_si128(),               \
+                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
+                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
+                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
+                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
+                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
+                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
+                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
+                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
+                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
+                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
+                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
+                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
+                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
+                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
+                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
+                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
 
 #define _mm_bsrli_si128(a, imm) \
   _mm_srli_si128((a), (imm))
 
+/// \brief Right-shifts each of the 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
 }
 
+/// \brief Right-shifts each of the 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Right-shifts each of the 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
 }
 
+/// \brief Right-shifts each of the 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Right-shifts each of the 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi64(__m128i __a, int __count)
 {
-  return __builtin_ia32_psrlqi128(__a, __count);
+  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
 }
 
+/// \brief Right-shifts each of the 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi64(__m128i __a, __m128i __count)
 {
-  return __builtin_ia32_psrlq128(__a, __count);
+  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
 }
 
+/// \brief Compares each of the corresponding 8-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
 
+/// \brief Compares each of the corresponding 16-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false,
+///    FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
 
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are
+///    greater than those in the second operand. Each comparison yields 0h for
+///    false, FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
 {
@@ -1004,30 +1550,100 @@
   return (__m128i)((__v16qs)__a > (__v16qs)__b);
 }
 
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
 
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are less
+///    than those in the second operand. Each comparison yields 0h for false,
+///    FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi8(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi8(__b, __a);
 }
 
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi16(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi16(__b, __a);
 }
 
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi32(__m128i __a, __m128i __b)
 {
@@ -1035,6 +1651,23 @@
 }
 
 #ifdef __x86_64__
+/// \brief Converts a 64-bit signed integer value from the second operand into a
+///    double-precision value and returns it in the lower element of a [2 x
+///    double] vector; the upper element of the returned vector is copied from
+///    the upper element of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
+///    copied to the upper 64 bits of the destination.
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi64_sd(__m128d __a, long long __b)
 {
@@ -1042,37 +1675,98 @@
   return __a;
 }
 
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, according to the current rounding mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsd_si64(__m128d __a)
 {
-  return __builtin_ia32_cvtsd2si64(__a);
+  return __builtin_ia32_cvtsd2si64((__v2df)__a);
 }
 
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttsd_si64(__m128d __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttsd2si64((__v2df)__a);
 }
 #endif
 
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtepi32_ps(__m128i __a)
 {
   return __builtin_ia32_cvtdq2ps((__v4si)__a);
 }
 
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit integer vector of [4 x i32] containing the converted
+///    values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtps_epi32(__m128 __a)
 {
-  return (__m128i)__builtin_ia32_cvtps2dq(__a);
+  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
 }
 
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
+///    truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttps_epi32(__m128 __a)
 {
-  return (__m128i)__builtin_ia32_cvttps2dq(__a);
+  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
 }
 
+/// \brief Returns a vector of [4 x i32] where the lowest element is the input
+///    operand and the remaining elements are zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+///
+/// \param __a
+///    A 32-bit signed integer operand.
+/// \returns A 128-bit vector of [4 x i32].
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si128(int __a)
 {
@@ -1080,6 +1774,16 @@
 }
 
 #ifdef __x86_64__
+/// \brief Returns a vector of [2 x i64] where the lower element is the input
+///    operand and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __a
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x i64] containing the converted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si128(long long __a)
 {
@@ -1087,6 +1791,17 @@
 }
 #endif
 
+/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
+///    32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+///
+/// \param __a
+///    A vector of [4 x i32]. The least significant 32 bits are moved to the
+///    destination.
+/// \returns A 32-bit signed integer containing the moved value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si32(__m128i __a)
 {
@@ -1095,6 +1810,17 @@
 }
 
 #ifdef __x86_64__
+/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
+///    64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __a
+///    A vector of [2 x i64]. The least significant 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit signed integer containing the moved value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si64(__m128i __a)
 {
@@ -1102,12 +1828,32 @@
 }
 #endif
 
+/// \brief Moves packed integer values from an aligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
+///
+/// \param __p
+///    An aligned pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_load_si128(__m128i const *__p)
 {
   return *__p;
 }
 
+/// \brief Moves packed integer values from an unaligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadu_si128(__m128i const *__p)
 {
@@ -1117,6 +1863,18 @@
   return ((struct __loadu_si128*)__p)->__v;
 }
 
+/// \brief Returns a vector of [2 x i64] where the lower element is taken from
+///    the lower element of the operand, and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit vector of [2 x i64]. Bits [63:0] are written to
+///    bits [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
+///    moved value. The higher order bits are cleared.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadl_epi64(__m128i const *__p)
 {
@@ -1126,120 +1884,486 @@
   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
 }
 
+/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
+///    This could be used as an argument to another intrinsic function where the
+///    argument is required but the value is not actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x i32] with unspecified content.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_undefined_si128()
+_mm_undefined_si128(void)
 {
   return (__m128i)__builtin_ia32_undef128();
 }
 
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64x(long long __q1, long long __q0)
 {
   return (__m128i){ __q0, __q1 };
 }
 
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64(__m64 __q1, __m64 __q0)
 {
   return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
+/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of the
+///    destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of the destination
+///    vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of the destination
+///    vector.
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
 {
   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
+/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w7
+///    A 16-bit integer value used to initialize bits [127:112] of the
+///    destination vector.
+/// \param __w6
+///    A 16-bit integer value used to initialize bits [111:96] of the
+///    destination vector.
+/// \param __w5
+///    A 16-bit integer value used to initialize bits [95:80] of the destination
+///    vector.
+/// \param __w4
+///    A 16-bit integer value used to initialize bits [79:64] of the destination
+///    vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the destination
+///    vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the destination
+///    vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the destination
+///    vector.
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
 {
   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
+/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b9
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b8
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
 {
   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
+/// \brief Initializes both values in a 128-bit integer vector with the
+///    specified 64-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    Integer value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit integer vector of [2 x i64] with both
+///    elements containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64x(long long __q)
 {
   return (__m128i){ __q, __q };
 }
 
+/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
+///    specified 64-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    A 64-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [2 x i64] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64(__m64 __q)
 {
   return (__m128i){ (long long)__q, (long long)__q };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
+///    specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i
+///    A 32-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi32(int __i)
 {
   return (__m128i)(__v4si){ __i, __i, __i, __i };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
+///    specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w
+///    A 16-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi16(short __w)
 {
   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
+///    specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b
+///    An 8-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [16 x i8] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi8(char __b)
 {
   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction.
+///
+/// \param __q0
+///    A 64-bit integral value used to initialize the lower 64 bits of the
+///    result.
+/// \param __q1
+///    A 64-bit integral value used to initialize the upper 64 bits of the
+///    result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi64(__m64 __q0, __m64 __q1)
 {
   return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
 {
   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
 {
   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
 {
   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+///    zero.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setzero_si128(void)
 {
   return (__m128i){ 0LL, 0LL };
 }
 
+/// \brief Stores a 128-bit integer vector to a memory location aligned on a
+///    128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that will receive the integer
+///    values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_si128(__m128i *__p, __m128i __b)
 {
   *__p = __b;
 }
 
+/// \brief Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_si128(__m128i *__p, __m128i __b)
 {
-  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
+  struct __storeu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si128*)__p)->__v = __b;
 }
 
+/// \brief Moves bytes selected by the mask from the first operand to the
+///    specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written, otherwise it is not written. To minimize
+///    caching, the data is flagged as non-temporal (unlikely to be used again
+///    soon). Exception and trap behavior for elements not selected for storage
+///    to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMASKMOVDQU / MASKMOVDQU instruction.
+///
+/// \param __d
+///    A 128-bit integer vector containing the values to be moved.
+/// \param __n
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each byte represents the mask bits.
+/// \param __p
+///    A pointer to an unaligned 128-bit memory location where the specified
+///    values are moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
 {
   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
 }
 
+/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+///    a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the lower 64 bits
+///    of the integer vector parameter.
+/// \param __a
+///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+///    value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_epi64(__m128i *__p, __m128i __a)
 {
@@ -1249,18 +2373,54 @@
   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
 }
 
+/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A vector of [2 x double] containing the 64-bit values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
-  __builtin_ia32_movntpd(__p, __a);
+  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
 }
 
+/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A 128-bit integer vector containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
-  __builtin_ia32_movntdq(__p, __a);
+  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
 }
 
+/// \brief Stores a 32-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVNTI instruction.
+///
+/// \param __p
+///    A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+///    A 32-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si32(int *__p, int __a)
 {
@@ -1268,6 +2428,18 @@
 }
 
 #ifdef __x86_64__
+/// \brief Stores a 64-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVNTIQ instruction.
+///
+/// \param __p
+///    A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si64(long long *__p, long long __a)
 {
@@ -1275,42 +2447,158 @@
 }
 #endif
 
+/// \brief The cache line containing __p is flushed and invalidated from all
+///    caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CLFLUSH instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    flushed.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_clflush(void const *__p)
 {
   __builtin_ia32_clflush(__p);
 }
 
+/// \brief Forces strong memory ordering (serialization) between load
+///    instructions preceding this instruction and load instructions following
+///    this instruction, ensuring the system completes all previous loads before
+///    executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_lfence(void)
 {
   __builtin_ia32_lfence();
 }
 
+/// \brief Forces strong memory ordering (serialization) between load and store
+///    instructions preceding this instruction and load and store instructions
+///    following this instruction, ensuring that the system completes all
+///    previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mfence(void)
 {
   __builtin_ia32_mfence();
 }
 
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPACKSSWB / PACKSSWB instruction.
+///
+/// \param __a
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to an 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the lower 64 bits of the result.
+/// \param __b
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to an 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPACKSSDW / PACKSSDW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPACKUSWB / PACKUSWB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+///    the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [2:0] select values from __a to be assigned to
+///    bits [15:0] of the result.
+///    000: assign values from bits [15:0] of __a.
+///    001: assign values from bits [31:16] of __a.
+///    010: assign values from bits [47:32] of __a.
+///    011: assign values from bits [63:48] of __a.
+///    100: assign values from bits [79:64] of __a.
+///    101: assign values from bits [95:80] of __a.
+///    110: assign values from bits [111:96] of __a.
+///    111: assign values from bits [127:112] of __a.
+/// \returns An integer, whose lower 16 bits are selected from the 128-bit
+///    integer vector parameter and the remaining bits are assigned zeros.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_extract_epi16(__m128i __a, int __imm)
 {
@@ -1318,6 +2606,26 @@
   return (unsigned short)__b[__imm & 7];
 }
 
+/// \brief Constructs a 128-bit integer vector by first making a copy of the
+///    128-bit integer vector parameter, and then inserting the lower 16 bits
+///    of an integer parameter into an offset specified by the immediate-value
+///    parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
+///    result and then one of the eight elements in the result is replaced by
+///    the lower 16 bits of __b.
+/// \param __b
+///    An integer. The lower 16 bits of this parameter are written to the
+///    result beginning at an offset specified by __imm.
+/// \param __imm
+///    An immediate value specifying the bit offset in the result at which the
+///    lower 16 bits of __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_insert_epi16(__m128i __a, int __b, int __imm)
 {
@@ -1326,158 +2634,585 @@
   return (__m128i)__c;
 }
 
+/// \brief Copies the values of the most significant bits from each 8-bit
+///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+///    value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMOVMSKB / PMOVMSKB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in __a, written
+///    to bits [15:0]. The other bits are assigned zeros. 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_epi8(__m128i __a)
 {
   return __builtin_ia32_pmovmskb128((__v16qi)__a);
 }
 
+/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
+///    elements of a 128-bit integer vector parameter, using the immediate-value
+///    parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a. The destinations within the 128-bit destination are assigned
+///    values as follows:
+///    Bits [1:0] are used to assign values to bits [31:0] of the result.
+///    Bits [3:2] are used to assign values to bits [63:32] of the result.
+///    Bits [5:4] are used to assign values to bits [95:64] of the result.
+///    Bits [7:6] are used to assign values to bits [127:96] of the result.
+///    Bit value assignments:
+///    00: assign values from bits [31:0] of a.
+///    01: assign values from bits [63:32] of a.
+///    10: assign values from bits [95:64] of a.
+///    11: assign values from bits [127:96] of a.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
-                                   (__v4si)_mm_setzero_si128(), \
-                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
+                                   (__v4si)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
 
+/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+///    [127:64] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from a.
+///    Bits[1:0] are used to assign values to bits [15:0] of the result.
+///    Bits[3:2] are used to assign values to bits [31:16] of the result.
+///    Bits[5:4] are used to assign values to bits [47:32] of the result.
+///    Bits[7:6] are used to assign values to bits [63:48] of the result.
+///    Bit value assignments:
+///    00: assign values from bits [15:0] of a.
+///    01: assign values from bits [31:16] of a.
+///    10: assign values from bits [47:32] of a.
+///    11: assign values from bits [63:48] of a.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_setzero_si128(), \
-                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
-                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   (__v8hi)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                    4, 5, 6, 7); })
 
+/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSHUFHW / PSHUFHW instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+///    [63:0] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from a.
+///    Bits[1:0] are used to assign values to bits [79:64] of the result.
+///    Bits[3:2] are used to assign values to bits [95:80] of the result.
+///    Bits[5:4] are used to assign values to bits [111:96] of the result.
+///    Bits[7:6] are used to assign values to bits [127:112] of the result.
+///    Bit value assignments:
+///    00: assign values from bits [79:64] of a.
+///    01: assign values from bits [95:80] of a.
+///    10: assign values from bits [111:96] of a.
+///    11: assign values from bits [127:112] of a.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_setzero_si128(), \
+                                   (__v8hi)_mm_undefined_si128(), \
                                    0, 1, 2, 3, \
-                                   4 + (((imm) & 0x03) >> 0), \
-                                   4 + (((imm) & 0x0c) >> 2), \
-                                   4 + (((imm) & 0x30) >> 4), \
-                                   4 + (((imm) & 0xc0) >> 6)); })
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })
 
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
+///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHBW / PUNPCKHBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [7:0] of the result.
+///    Bits [79:72] are written to bits [23:16] of the result.
+///    Bits [87:80] are written to bits [39:32] of the result.
+///    Bits [95:88] are written to bits [55:48] of the result.
+///    Bits [103:96] are written to bits [71:64] of the result.
+///    Bits [111:104] are written to bits [87:80] of the result.
+///    Bits [119:112] are written to bits [103:96] of the result.
+///    Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [15:8] of the result.
+///    Bits [79:72] are written to bits [31:24] of the result.
+///    Bits [87:80] are written to bits [47:40] of the result.
+///    Bits [95:88] are written to bits [63:56] of the result.
+///    Bits [103:96] are written to bits [79:72] of the result.
+///    Bits [111:104] are written to bits [95:88] of the result.
+///    Bits [119:112] are written to bits [111:104] of the result.
+///    Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHWD / PUNPCKHWD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [15:0] of the result.
+///    Bits [95:80] are written to bits [47:32] of the result.
+///    Bits [111:96] are written to bits [79:64] of the result.
+///    Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [31:16] of the result.
+///    Bits [95:80] are written to bits [63:48] of the result.
+///    Bits [111:96] are written to bits [95:80] of the result.
+///    Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
 }
 
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHDQ / PUNPCKHDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+///    Bits [95:64] are written to bits [31:0] of the destination.
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+///    Bits [95:64] are written to bits [63:32] of the destination.
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHQDQ / PUNPCKHQDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
 {
-  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
+  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
 }
 
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLBW / PUNPCKLBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [7:0] of the result.
+///    Bits [15:8] are written to bits [23:16] of the result.
+///    Bits [23:16] are written to bits [39:32] of the result.
+///    Bits [31:24] are written to bits [55:48] of the result.
+///    Bits [39:32] are written to bits [71:64] of the result.
+///    Bits [47:40] are written to bits [87:80] of the result.
+///    Bits [55:48] are written to bits [103:96] of the result.
+///    Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result.
+///    Bits [15:8] are written to bits [31:24] of the result.
+///    Bits [23:16] are written to bits [47:40] of the result.
+///    Bits [31:24] are written to bits [63:56] of the result.
+///    Bits [39:32] are written to bits [79:72] of the result.
+///    Bits [47:40] are written to bits [95:88] of the result.
+///    Bits [55:48] are written to bits [111:104] of the result.
+///    Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
 }
 
+/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
+///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
+///    [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLWD / PUNPCKLWD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result.
+///    Bits [31:16] are written to bits [47:32] of the result.
+///    Bits [47:32] are written to bits [79:64] of the result.
+///    Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result.
+///    Bits [31:16] are written to bits [63:48] of the result.
+///    Bits [47:32] are written to bits [95:80] of the result.
+///    Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
 }
 
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLDQ / PUNPCKLDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+///    Bits [31:0] are written to bits [63:32] of the destination.
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
 }
 
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
 {
-  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
+  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
 }
 
+/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+///    integer. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_movepi64_pi64(__m128i __a)
 {
   return (__m64)__a[0];
 }
 
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+///    upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ / MOVD instruction.
+///
+/// \param __a
+///    A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_movpi64_epi64(__m64 __a)
 {
   return (__m128i){ (long long)__a, 0 };
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+///    integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_move_epi64(__m128i __a)
 {
-  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
+  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double].
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpackhi_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_shufflevector(__a, __b, 1, 2+1);
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
 }
 
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpacklo_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_shufflevector(__a, __b, 0, 2+0);
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
 }
 
+/// \brief Extracts the sign bits of the double-precision values in the 128-bit
+///    vector of [2 x double], zero-extends the value, and writes it to the
+///    low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVMSKPD / MOVMSKPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values with sign bits to
+///    be extracted.
+/// \returns The sign bits from each of the double-precision elements in __a,
+///    written to bits [1:0]. The remaining bits are assigned values of zero.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pd(__m128d __a)
 {
-  return __builtin_ia32_movmskpd(__a);
+  return __builtin_ia32_movmskpd((__v2df)__a);
 }
 
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
+///    128-bit vector parameters of [2 x double], using the immediate-value
+///     parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param i
+///    An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from a and b:
+///    Bit[0] = 0: lower element of a copied to lower element of result. 
+///    Bit[0] = 1: upper element of a copied to lower element of result.
+///    Bit[1] = 0: lower element of b copied to upper element of result.
+///    Bit[1] = 1: upper element of b copied to upper element of result.
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
-                                   (i) & 1, (((i) & 2) >> 1) + 2); })
+                                   0 + (((i) >> 0) & 0x1), \
+                                   2 + (((i) >> 1) & 0x1)); })
 
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castpd_ps(__m128d __a)
 {
   return (__m128)__a;
 }
 
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castpd_si128(__m128d __a)
 {
   return (__m128i)__a;
 }
 
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castps_pd(__m128 __a)
 {
   return (__m128d)__a;
 }
 
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castps_si128(__m128 __a)
 {
   return (__m128i)__a;
 }
 
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castsi128_ps(__m128i __a)
 {
   return (__m128)__a;
 }
 
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castsi128_pd(__m128i __a)
 {
   return (__m128d)__a;
 }
 
+/// \brief Indicates that a spin loop is being executed for the purposes of
+///    optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PAUSE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_pause(void)
 {
diff --git a/lib/Headers/f16cintrin.h b/lib/Headers/f16cintrin.h
index c655d98..415bf73 100644
--- a/lib/Headers/f16cintrin.h
+++ b/lib/Headers/f16cintrin.h
@@ -29,11 +29,90 @@
 #define __F16CINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
 
-#define _mm_cvtps_ph(a, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
+/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+///
+/// \param __a
+///    A 16-bit half-precision float value.
+/// \returns The converted 32-bit float value.
+static __inline float __DEFAULT_FN_ATTRS
+_cvtsh_ss(unsigned short __a)
+{
+  __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
+  __v4sf r = __builtin_ia32_vcvtph2ps(v);
+  return r[0];
+}
 
+/// \brief Converts a 32-bit single-precision float value to a 16-bit
+///    half-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _cvtss_sh(float a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+///
+/// \param a
+///    A 32-bit single-precision float value to be converted to a 16-bit
+///    half-precision float value.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]:
+///    000: Nearest
+///    001: Down
+///    010: Up
+///    011: Truncate
+///    1XX: Use MXCSR.RC for rounding
+/// \returns The converted 16-bit half-precision float value.
+#define _cvtss_sh(a, imm)  \
+  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                      (imm)))[0]))
+
+/// \brief Converts a 128-bit vector containing 32-bit float values into a
+///    128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+///
+/// \param a
+///    A 128-bit vector containing 32-bit float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]:
+///    000: Nearest
+///    001: Down
+///    010: Up
+///    011: Truncate
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+///    values. The lower 64 bits are used to store the converted 16-bit
+///    half-precision floating-point values.
+#define _mm_cvtps_ph(a, imm) \
+  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values. The lower
+///    64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_cvtph_ps(__m128i __a)
 {
diff --git a/lib/Headers/float.h b/lib/Headers/float.h
index 238cf76..a28269e 100644
--- a/lib/Headers/float.h
+++ b/lib/Headers/float.h
@@ -39,7 +39,9 @@
 #  undef FLT_MANT_DIG
 #  undef DBL_MANT_DIG
 #  undef LDBL_MANT_DIG
-#  undef DECIMAL_DIG
+#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#    undef DECIMAL_DIG
+#  endif
 #  undef FLT_DIG
 #  undef DBL_DIG
 #  undef LDBL_DIG
@@ -68,6 +70,9 @@
 #    undef FLT_TRUE_MIN
 #    undef DBL_TRUE_MIN
 #    undef LDBL_TRUE_MIN
+#    undef FLT_DECIMAL_DIG
+#    undef DBL_DECIMAL_DIG
+#    undef LDBL_DECIMAL_DIG
 #  endif
 #endif
 
@@ -81,7 +86,9 @@
 #define DBL_MANT_DIG __DBL_MANT_DIG__
 #define LDBL_MANT_DIG __LDBL_MANT_DIG__
 
-#define DECIMAL_DIG __DECIMAL_DIG__
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#  define DECIMAL_DIG __DECIMAL_DIG__
+#endif
 
 #define FLT_DIG __FLT_DIG__
 #define DBL_DIG __DBL_DIG__
@@ -119,6 +126,9 @@
 #  define FLT_TRUE_MIN __FLT_DENORM_MIN__
 #  define DBL_TRUE_MIN __DBL_DENORM_MIN__
 #  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#  define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
+#  define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
+#  define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
 #endif
 
 #endif /* __FLOAT_H */
diff --git a/lib/Headers/fma4intrin.h b/lib/Headers/fma4intrin.h
index f117887..11aa8ce 100644
--- a/lib/Headers/fma4intrin.h
+++ b/lib/Headers/fma4intrin.h
@@ -36,193 +36,193 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/fmaintrin.h b/lib/Headers/fmaintrin.h
index 114a143..0e2ef0b 100644
--- a/lib/Headers/fmaintrin.h
+++ b/lib/Headers/fmaintrin.h
@@ -34,193 +34,193 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/fxsrintrin.h b/lib/Headers/fxsrintrin.h
index ac6026a..f77aa48 100644
--- a/lib/Headers/fxsrintrin.h
+++ b/lib/Headers/fxsrintrin.h
@@ -36,19 +36,21 @@
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave64(void *__p) {
-  return __builtin_ia32_fxsave64(__p);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
 _fxrstor(void *__p) {
   return __builtin_ia32_fxrstor(__p);
 }
 
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p) {
+  return __builtin_ia32_fxsave64(__p);
+}
+
 static __inline__ void __DEFAULT_FN_ATTRS
 _fxrstor64(void *__p) {
   return __builtin_ia32_fxrstor64(__p);
 }
+#endif
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/lib/Headers/htmintrin.h b/lib/Headers/htmintrin.h
index 0088c7c..69c8d7b 100644
--- a/lib/Headers/htmintrin.h
+++ b/lib/Headers/htmintrin.h
@@ -164,24 +164,24 @@
 /* Helper intrinsics to retry tbegin in case of transient failure.  */
 
 static __inline int __attribute__((__always_inline__, __nodebug__))
-__builtin_tbegin_retry_null (int retry)
+__builtin_tbegin_retry_null (int __retry)
 {
   int cc, i = 0;
 
   while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
-         && i++ < retry)
+         && i++ < __retry)
     __builtin_tx_assist(i);
 
   return cc;
 }
 
 static __inline int __attribute__((__always_inline__, __nodebug__))
-__builtin_tbegin_retry_tdb (void *tdb, int retry)
+__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
 {
   int cc, i = 0;
 
-  while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
-         && i++ < retry)
+  while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
     __builtin_tx_assist(i);
 
   return cc;
@@ -193,24 +193,24 @@
    __builtin_tbegin_retry_tdb(tdb, retry))
 
 static __inline int __attribute__((__always_inline__, __nodebug__))
-__builtin_tbegin_retry_nofloat_null (int retry)
+__builtin_tbegin_retry_nofloat_null (int __retry)
 {
   int cc, i = 0;
 
   while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
-         && i++ < retry)
+         && i++ < __retry)
     __builtin_tx_assist(i);
 
   return cc;
 }
 
 static __inline int __attribute__((__always_inline__, __nodebug__))
-__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
+__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
 {
   int cc, i = 0;
 
-  while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
-         && i++ < retry)
+  while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
     __builtin_tx_assist(i);
 
   return cc;
diff --git a/lib/Headers/htmxlintrin.h b/lib/Headers/htmxlintrin.h
index c7571ec..16dc705 100644
--- a/lib/Headers/htmxlintrin.h
+++ b/lib/Headers/htmxlintrin.h
@@ -62,18 +62,18 @@
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_begin (void* const TM_buff)
+__TM_begin (void* const __TM_buff)
 {
-  *_TEXASRL_PTR (TM_buff) = 0;
+  *_TEXASRL_PTR (__TM_buff) = 0;
   if (__builtin_expect (__builtin_tbegin (0), 1))
     return _HTM_TBEGIN_STARTED;
 #ifdef __powerpc64__
-  *_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
+  *_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
 #else
-  *_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
-  *_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
+  *_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
 #endif
-  *_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
+  *_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
   return 0;
 }
 
@@ -95,9 +95,9 @@
 
 extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_named_abort (unsigned char const code)
+__TM_named_abort (unsigned char const __code)
 {
-  __builtin_tabort (code);
+  __builtin_tabort (__code);
 }
 
 extern __inline void
@@ -116,47 +116,47 @@
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_user_abort (void* const TM_buff)
+__TM_is_user_abort (void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   return _TEXASRU_ABORT (texasru);
 }
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
+__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
 
-  *code = _TEXASRU_FAILURE_CODE (texasru);
+  *__code = _TEXASRU_FAILURE_CODE (texasru);
   return _TEXASRU_ABORT (texasru);
 }
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_illegal (void* const TM_buff)
+__TM_is_illegal (void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   return _TEXASRU_DISALLOWED (texasru);
 }
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_footprint_exceeded (void* const TM_buff)
+__TM_is_footprint_exceeded (void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
 }
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_nesting_depth (void* const TM_buff)
+__TM_nesting_depth (void* const __TM_buff)
 {
   texasrl_t texasrl;
 
   if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
     {
-      texasrl = *_TEXASRL_PTR (TM_buff);
+      texasrl = *_TEXASRL_PTR (__TM_buff);
       if (!_TEXASR_FAILURE_SUMMARY (texasrl))
         texasrl = 0;
     }
@@ -168,15 +168,15 @@
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_nested_too_deep(void* const TM_buff)
+__TM_is_nested_too_deep(void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   return _TEXASRU_NESTING_OVERFLOW (texasru);
 }
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_conflict(void* const TM_buff)
+__TM_is_conflict(void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   /* Return TEXASR bits 11 (Self-Induced Conflict) through
@@ -186,24 +186,24 @@
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_is_failure_persistent(void* const TM_buff)
+__TM_is_failure_persistent(void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   return _TEXASRU_FAILURE_PERSISTENT (texasru);
 }
 
 extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_failure_address(void* const TM_buff)
+__TM_failure_address(void* const __TM_buff)
 {
-  return *_TFIAR_PTR (TM_buff);
+  return *_TFIAR_PTR (__TM_buff);
 }
 
 extern __inline long long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-__TM_failure_code(void* const TM_buff)
+__TM_failure_code(void* const __TM_buff)
 {
-  return *_TEXASR_PTR (TM_buff);
+  return *_TEXASR_PTR (__TM_buff);
 }
 
 #ifdef __cplusplus
@@ -227,9 +227,9 @@
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_begin (void* const tdb)
+__TM_begin (void* const __tdb)
 {
-  return __builtin_tbegin_nofloat (tdb);
+  return __builtin_tbegin_nofloat (__tdb);
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
@@ -245,22 +245,22 @@
 }
 
 static __inline void __attribute__((__always_inline__, __nodebug__))
-__TM_named_abort (unsigned char const code)
+__TM_named_abort (unsigned char const __code)
 {
-  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
 }
 
 static __inline void __attribute__((__always_inline__, __nodebug__))
-__TM_non_transactional_store (void* const addr, long long const value)
+__TM_non_transactional_store (void* const __addr, long long const __value)
 {
-  __builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
+  __builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_nesting_depth (void* const tdb_ptr)
+__TM_nesting_depth (void* const __tdb_ptr)
 {
   int depth = __builtin_tx_nesting_depth ();
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   if (depth != 0)
     return depth;
@@ -273,9 +273,9 @@
 /* Transaction failure diagnostics */
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_user_abort (void* const tdb_ptr)
+__TM_is_user_abort (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   if (tdb->format != 1)
     return 0;
@@ -284,25 +284,25 @@
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
+__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   if (tdb->format != 1)
     return 0;
 
   if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
     {
-      *code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      *__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
       return 1;
     }
   return 0;
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_illegal (void* const tdb_ptr)
+__TM_is_illegal (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   return (tdb->format == 1
 	  && (tdb->abort_code == 4 /* unfiltered program interruption */
@@ -310,9 +310,9 @@
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_footprint_exceeded (void* const tdb_ptr)
+__TM_is_footprint_exceeded (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   return (tdb->format == 1
 	  && (tdb->abort_code == 7 /* fetch overflow */
@@ -320,17 +320,17 @@
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_nested_too_deep (void* const tdb_ptr)
+__TM_is_nested_too_deep (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_conflict (void* const tdb_ptr)
+__TM_is_conflict (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   return (tdb->format == 1
 	  && (tdb->abort_code == 9 /* fetch conflict */
@@ -338,22 +338,22 @@
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_is_failure_persistent (long const result)
+__TM_is_failure_persistent (long const __result)
 {
-  return result == _HTM_TBEGIN_PERSISTENT;
+  return __result == _HTM_TBEGIN_PERSISTENT;
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_failure_address (void* const tdb_ptr)
+__TM_failure_address (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
   return tdb->atia;
 }
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
-__TM_failure_code (void* const tdb_ptr)
+__TM_failure_code (void* const __tdb_ptr)
 {
-  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
 
   return tdb->abort_code;
 }
diff --git a/lib/Headers/ia32intrin.h b/lib/Headers/ia32intrin.h
index b2f82bb..397f3fd 100644
--- a/lib/Headers/ia32intrin.h
+++ b/lib/Headers/ia32intrin.h
@@ -74,4 +74,6 @@
 
 #define _rdtsc() __rdtsc()
 
+#define _rdpmc(A) __rdpmc(A)
+
 #endif /* __IA32INTRIN_H */
diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h
index 6376461..4b27523 100644
--- a/lib/Headers/immintrin.h
+++ b/lib/Headers/immintrin.h
@@ -24,22 +24,45 @@
 #ifndef __IMMINTRIN_H
 #define __IMMINTRIN_H
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
 #include <mmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
 #include <xmmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
 #include <emmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
 #include <pmmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
 #include <tmmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__SSE4_2__) || defined(__SSE4_1__))
 #include <smmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AES__) || defined(__PCLMUL__))
 #include <wmmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
+#include <clflushoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
 #include <avxintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
 
 /* The 256-bit versions of functions in f16cintrin.h.
@@ -54,33 +77,90 @@
 {
   return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
 }
+#endif /* __AVX2__ */
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
 #include <bmiintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
 #include <lzcntintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
 #include <avx512fintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
 #include <avx512vlintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
 #include <avx512bwintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
 #include <avx512cdintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512CD__))
+#include <avx512vlcdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512DQ__))
 #include <avx512vldqintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
 #include <avx512erintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
+#include <avx512ifmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
+#include <avx512ifmavlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
+#include <avx512vbmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
+#include <avx512vbmivlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
+#include <avx512pfintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
 #include <pkuintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
 {
@@ -93,6 +173,18 @@
   return __builtin_ia32_rdrand32_step(__p);
 }
 
+/* _bit_scan_forward: trailing-zero count of __A, computed by __builtin_ctz */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_forward(int __A) {
+  return __builtin_ctz(__A);
+}
+
+/* _bit_scan_reverse: 31 minus the leading-zero count from __builtin_clz */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_reverse(int __A) {
+  return 31 - __builtin_clz(__A);
+}
+
 #ifdef __x86_64__
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand64_step(unsigned long long *__p)
@@ -100,7 +192,9 @@
   return __builtin_ia32_rdrand64_step(__p);
 }
 #endif
+#endif /* __RDRND__ */
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u32(void)
@@ -149,23 +243,38 @@
 {
   return __builtin_ia32_wrgsbase64(__V);
 }
+
+#endif
+#endif /* __FSGSBASE__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
+#include <rtmintrin.h>
+#include <xtestintrin.h>
 #endif
 
-#include <rtmintrin.h>
-
-#include <xtestintrin.h>
-
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
 #include <shaintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
 #include <fxsrintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
 #include <xsaveintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
 #include <xsaveoptintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
 #include <xsavecintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
 #include <xsavesintrin.h>
+#endif
 
 /* Some intrinsics inside adxintrin.h are available only on processors with ADX,
  * whereas others are also available at all times. */
diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h
new file mode 100644
index 0000000..268ac4a
--- /dev/null
+++ b/lib/Headers/intrin.h
@@ -0,0 +1,961 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+void __debugbreak(void);
+static __inline__ __int64 __emul(int, int); /* linkage matches the inline body */
+static __inline__ unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+static __inline__
+unsigned int __popcnt(unsigned int);
+static __inline__
+unsigned short __popcnt16(unsigned short);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned long __readfsdword(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned long __cdecl _byteswap_ulong(unsigned long);
+unsigned short __cdecl _byteswap_ushort(unsigned short);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+static __inline__
+long _InterlockedAnd(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedAnd16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedAnd8(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+static __inline__
+long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
+                                         long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+static __inline__
+short _InterlockedCompareExchange16(short volatile *_Destination,
+                                    short _Exchange, short _Comparand);
+static __inline__
+__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
+                                      __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+static __inline__
+char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
+                                  char _Comparand);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+static __inline__
+long __cdecl _InterlockedDecrement(long volatile *_Addend);
+static __inline__
+short _InterlockedDecrement16(short volatile *_Addend);
+long _InterlockedExchange(long volatile *_Target, long _Value);
+static __inline__
+short _InterlockedExchange16(short volatile *_Target, short _Value);
+static __inline__
+char _InterlockedExchange8(char volatile *_Target, char _Value);
+static __inline__
+long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+static __inline__
+short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+static __inline__
+char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
+static __inline__
+long __cdecl _InterlockedIncrement(long volatile *_Addend);
+static __inline__
+short _InterlockedIncrement16(short volatile *_Addend);
+static __inline__
+long _InterlockedOr(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedOr16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedOr8(char volatile *_Value, char _Mask);
+static __inline__
+long _InterlockedXor(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedXor16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedXor8(char volatile *_Value, char _Mask);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__
+unsigned long __cdecl _lrotl(unsigned long, int);
+static __inline__
+unsigned long __cdecl _lrotr(unsigned long, int);
+static __inline__
+void _ReadBarrier(void);
+static __inline__
+void _ReadWriteBarrier(void);
+static __inline__
+void *_ReturnAddress(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+static __inline__
+unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
+static __inline__
+unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__
+void _WriteBarrier(void);
+unsigned __int32 _xbegin(void); /* TSX begin; counterpart of _xend */
+void _xend(void);
+#define _XCR_XFEATURE_ENABLED_MASK 0
+static __inline__
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __popcnt64(unsigned __int64);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_CompareandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
+                                         void *_Exchange, void *_Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+static __inline__
+__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
+                __int64 *_HighProduct);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmpex(jmp_buf);
+#endif
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+/*
+ * Multiply two 64-bit integers and obtain the full 128-bit product.
+ * The low half is returned directly and the high half is in an out parameter.
+ */
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
+         unsigned __int64 *_HighProduct) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  *_HighProduct = _FullProduct >> 64;
+  return _FullProduct;
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
+  /* Upper 64 bits of the full 128-bit unsigned product. */
+  const unsigned __int128 _Wide = (unsigned __int128)_Multiplier;
+  return (unsigned __int64)((_Wide * _Multiplicand) >> 64);
+}
+
+#endif /* __x86_64__ */
+
+/*----------------------------------------------------------------------------*\
+|* Multiplication
+\*----------------------------------------------------------------------------*/
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+__emul(int __in1, int __in2) {
+  return (__int64)__in1 * __in2; /* __in2 is widened by the usual conversions */
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__emulu(unsigned int __in1, unsigned int __in2) {
+  return (unsigned __int64)__in1 * __in2; /* full 64-bit unsigned product */
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Twiddling
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_rotl8(unsigned char _Value, unsigned char _Shift) {
+  const int _Count = _Shift & 0x7; /* rotation distance modulo 8 */
+  return _Count ? (_Value << _Count) | (_Value >> (8 - _Count)) : _Value;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_rotr8(unsigned char _Value, unsigned char _Shift) {
+  const int _Count = _Shift & 0x7; /* rotation distance modulo 8 */
+  return _Count ? (_Value >> _Count) | (_Value << (8 - _Count)) : _Value;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+_rotl16(unsigned short _Value, unsigned char _Shift) {
+  const int _Count = _Shift & 0xf; /* rotation distance modulo 16 */
+  return _Count ? (_Value << _Count) | (_Value >> (16 - _Count)) : _Value;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+_rotr16(unsigned short _Value, unsigned char _Shift) {
+  const int _Count = _Shift & 0xf; /* rotation distance modulo 16 */
+  return _Count ? (_Value >> _Count) | (_Value << (16 - _Count)) : _Value;
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rotl(unsigned int _Value, int _Shift) {
+  const int _Count = _Shift & 0x1f; /* rotation distance modulo 32 */
+  return _Count ? (_Value << _Count) | (_Value >> (32 - _Count)) : _Value;
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rotr(unsigned int _Value, int _Shift) {
+  const int _Count = _Shift & 0x1f; /* rotation distance modulo 32 */
+  return _Count ? (_Value >> _Count) | (_Value << (32 - _Count)) : _Value;
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS /* rotates the full long */
+_lrotl(unsigned long _Value, int _Shift) {
+  const int _Bits = (int)(sizeof(_Value) * 8), _Count = _Shift & (_Bits - 1);
+  return _Count ? (_Value << _Count) | (_Value >> (_Bits - _Count)) : _Value;
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS /* rotates the full long */
+_lrotr(unsigned long _Value, int _Shift) {
+  const int _Bits = (int)(sizeof(_Value) * 8), _Count = _Shift & (_Bits - 1);
+  return _Count ? (_Value >> _Count) | (_Value << (_Bits - _Count)) : _Value;
+}
+/* Rotate a 64-bit value left; only the low six bits of _Shift are used. */
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_rotl64(unsigned __int64 _Value, int _Shift) {
+  const int _Count = _Shift & 0x3f;
+  return _Count ? (_Value << _Count) | (_Value >> (64 - _Count)) : _Value;
+}
+/* Rotate a 64-bit value right; only the low six bits of _Shift are used. */
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_rotr64(unsigned __int64 _Value, int _Shift) {
+  const int _Count = _Shift & 0x3f;
+  return _Count ? (_Value >> _Count) | (_Value << (64 - _Count)) : _Value;
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
+  /* Report the lowest set bit; *_Index is untouched for an empty mask. */
+  if (_Mask)
+    *_Index = __builtin_ctzl(_Mask);
+  return _Mask != 0;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS /* index of highest set bit */
+_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0; /* no bit set: *_Index is left untouched */
+  *_Index = (sizeof(_Mask) * 8 - 1) - __builtin_clzl(_Mask); /* width-aware */
+  return 1;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__popcnt16(unsigned short _Value) {
+  return __builtin_popcount(_Value); /* promoted value; upper bits are zero */
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__popcnt(unsigned int _Value) {
+  return (unsigned int)__builtin_popcount(_Value); /* number of set bits */
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (unsigned char)((*_BitBase >> _BitPos) & 1); /* selected bit: 0 or 1 */
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1; /* bit value before the flip */
+  *_BitBase = *_BitBase ^ (long)(1ul << _BitPos); /* mask as wide as long */
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1; /* bit value before clearing */
+  *_BitBase = *_BitBase & ~(long)(1ul << _BitPos); /* mask as wide as long */
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1; /* bit value before setting */
+  *_BitBase = *_BitBase | (long)(1ul << _BitPos); /* mask as wide as long */
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
+  const long _Bit = 1l << _BitPos; /* mask selecting the requested bit */
+  return (__atomic_fetch_or(_BitBase, _Bit, __ATOMIC_SEQ_CST) & _Bit) != 0;
+}
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
+  /* Report the lowest set bit; *_Index is untouched for an empty mask. */
+  if (_Mask)
+    *_Index = __builtin_ctzll(_Mask);
+  return _Mask != 0;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
+  /* Report the highest set bit; *_Index is untouched for an empty mask. */
+  if (_Mask)
+    *_Index = 63 - __builtin_clzll(_Mask);
+  return _Mask != 0;
+}
+/* Number of set bits in a 64-bit value. */
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__popcnt64(unsigned __int64 _Value) {
+  return __builtin_popcountll(_Value);
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (unsigned char)((*_BitBase >> _BitPos) & 1); /* selected bit: 0 or 1 */
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  const __int64 _Old = *_BitBase; /* snapshot taken before the bit is flipped */
+  *_BitBase = _Old ^ (1ll << _BitPos);
+  return (_Old >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  const __int64 _Old = *_BitBase; /* snapshot taken before the bit is cleared */
+  *_BitBase = _Old & ~(1ll << _BitPos);
+  return (_Old >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  const __int64 _Old = *_BitBase; /* snapshot taken before the bit is set */
+  *_BitBase = _Old | (1ll << _BitPos);
+  return (_Old >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  /* Atomically set the bit and report whether it was already set. */
+  const long long _Bit = 1ll << _BitPos;
+  return (__atomic_fetch_or(_BitBase, _Bit, __ATOMIC_SEQ_CST) & _Bit) != 0;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add: seq_cst fetch-add, result is the prior value
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Sub: seq_cst fetch-sub, result is the prior value
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment: seq_cst add of 1, result is the updated value
+\*----------------------------------------------------------------------------*/
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement: seq_cst subtract of 1, result is the updated value
+\*----------------------------------------------------------------------------*/
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And: seq_cst fetch-and, result is the prior value
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or: seq_cst fetch-or, result is the prior value
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor: seq_cst fetch-xor, result is the prior value
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value; /* &_Value was also the result slot: now the old *_Target */
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value; /* &_Value was also the result slot: now the old *_Target */
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value; /* &_Value was also the result slot: now the old *_Target */
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand; /* on mismatch overwritten with *_Destination: always the initial value */
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand; /* on mismatch overwritten with *_Destination: always the initial value */
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand; /* on mismatch overwritten with *_Destination: always the initial value */
+}
+/*----------------------------------------------------------------------------*\
+|* Barriers
+\*----------------------------------------------------------------------------*/
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST); /* signal (same-thread) fence, not a thread fence */
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST); /* same fence as _ReadWriteBarrier */
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST); /* same fence as _ReadWriteBarrier */
+}
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__faststorefence(void) {
+  __atomic_thread_fence(__ATOMIC_SEQ_CST); /* sequentially consistent inter-thread fence */
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
+#ifdef __i386__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset); /* 257: FS-relative */
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset); /* 257: FS-relative */
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset); /* 257: FS-relative */
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset); /* 256: GS-relative */
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset); /* 256: GS-relative */
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset); /* 256: GS-relative */
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset); /* 256: GS-relative */
+}
+#endif
+#undef __ptr_to_addr_space /* helper macro is private to this section */
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  /* Copies __n bytes; rep movs updates rDI/rSI/rCX, hence "+" operands. */
+  __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  /* Copies __n 32-bit units; rep movs updates rDI/rSI/rCX, hence "+" operands. */
+  __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  /* Copies __n 16-bit units; rep movs updates rDI/rSI/rCX, hence "+" operands. */
+  __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
+  /* Stores __x into __n bytes; rep stos updates rDI/rCX, hence "+" operands. */
+  __asm__ __volatile__("rep stosb" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  /* Stores __x into __n 32-bit units; rep stos updates rDI/rCX ("+" operands). */
+  __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  /* Stores __x into __n 16-bit units; rep stos updates rDI/rCX ("+" operands). */
+  __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  /* Copies __n 64-bit units; rep movs updates RDI/RSI/RCX, hence "+" operands. */
+  __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  /* Stores __x into __n 64-bit units; rep stos updates RDI/RCX ("+" operands). */
+  __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+static __inline__ void * __DEFAULT_FN_ATTRS
+_AddressOfReturnAddress(void) { /* NOTE(review): assumes the return slot is one pointer above frame address 0 - confirm */
+  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
+}
+static __inline__ void * __DEFAULT_FN_ATTRS
+_ReturnAddress(void) {
+  return __builtin_return_address(0); /* level 0: return address of the current function */
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(0)); /* ECX=0: sub-leaf must not be left indeterminate */
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx)); /* __info = {EAX, EBX, ECX, EDX} */
+}
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax; /* combine EDX:EAX into one 64-bit value */
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt"); /* volatile: must not be dropped although it has no outputs */
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register)); // NOTE(review): not volatile, repeated reads may be merged - confirm
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val; /* NOTE(review): 32 bits wide where long is 32-bit (LLP64); CR3 is 64-bit on x86_64 - confirm */
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) { /* NOTE(review): 32-bit operand; x86_64 CR3 writes take a 64-bit register - confirm */
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/lib/Headers/inttypes.h b/lib/Headers/inttypes.h
index 3d59d14..1d8eaba 100644
--- a/lib/Headers/inttypes.h
+++ b/lib/Headers/inttypes.h
@@ -23,6 +23,10 @@
 #ifndef __CLANG_INTTYPES_H
 #define __CLANG_INTTYPES_H
 
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#error MSVC does not have inttypes.h prior to Visual Studio 2013
+#endif
+
 #include_next <inttypes.h>
 
 #if defined(_MSC_VER) && _MSC_VER < 1900
diff --git a/lib/Headers/mm3dnow.h b/lib/Headers/mm3dnow.h
index cb93faf..294866c 100644
--- a/lib/Headers/mm3dnow.h
+++ b/lib/Headers/mm3dnow.h
@@ -33,7 +33,7 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
 
 static __inline__ void __DEFAULT_FN_ATTRS
-_m_femms() {
+_m_femms(void) {
   __builtin_ia32_femms();
 }
 
diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h
index 162cb1a..cefd605 100644
--- a/lib/Headers/mmintrin.h
+++ b/lib/Headers/mmintrin.h
@@ -26,6 +26,7 @@
 
 typedef long long __m64 __attribute__((__vector_size__(8)));
 
+typedef long long __v1di __attribute__((__vector_size__(8)));
 typedef int __v2si __attribute__((__vector_size__(8)));
 typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
@@ -33,366 +34,1314 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
 
+/// \brief Clears the MMX state by setting the state of the x87 stack registers
+///    to empty.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c EMMS instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_empty(void)
 {
     __builtin_ia32_emms();
 }
 
+/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
+///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVD instruction.
+///
+/// \param __i
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
+///    parameter. The upper 32 bits are set to 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si64(int __i)
 {
     return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
 }
 
+/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
+///    signed integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector.
+/// \returns A 32-bit signed integer value containing the lower 32 bits of the
+///    parameter.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si32(__m64 __m)
 {
     return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
 }
 
+/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVQ instruction.
+///
+/// \param __i
+///    A 64-bit signed integer.
+/// \returns A 64-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
 }
 
+/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector.
+/// \returns A 64-bit signed integer containing the same bitwise pattern as the
+///    parameter.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
 }
 
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
+///    a 64-bit integer vector of [8 x i8] as the result. Positive values
+///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
+///    are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PACKSSWB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit signed integer with
+///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80. The converted
+///    [4 x i8] values are written to the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit signed integer with
+///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80. The converted
+///    [4 x i8] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Converts 32-bit signed integers from both 64-bit integer vector
+///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
+///    a 64-bit integer vector of [4 x i16] as the result. Positive values
+///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
+///    0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PACKSSDW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+///    32-bit signed integer and is converted to a 16-bit signed integer with
+///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000. The converted
+///    [2 x i16] values are written to the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+///    32-bit signed integer and is converted to a 16-bit signed integer with
+///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000. The converted
+///    [2 x i16] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    parameters of [4 x i16] into 8-bit unsigned integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
+///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
+///    to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PACKUSWB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0 are saturated to 0. The converted [4 x i8] values are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0 are saturated to 0. The converted [4 x i8] values are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
+///    and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PUNPCKHBW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [39:32] are written to bits [7:0] of the result.
+///    Bits [47:40] are written to bits [23:16] of the result.
+///    Bits [55:48] are written to bits [39:32] of the result.
+///    Bits [63:56] are written to bits [55:48] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [39:32] are written to bits [15:8] of the result.
+///    Bits [47:40] are written to bits [31:24] of the result.
+///    Bits [55:48] are written to bits [47:40] of the result.
+///    Bits [63:56] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PUNPCKHWD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [47:32] are written to bits [15:0] of the result.
+///    Bits [63:48] are written to bits [47:32] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [47:32] are written to bits [31:16] of the result.
+///    Bits [63:48] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PUNPCKHDQ instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
+///    and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PUNPCKLBW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [7:0] are written to bits [7:0] of the result.
+///    Bits [15:8] are written to bits [23:16] of the result.
+///    Bits [23:16] are written to bits [39:32] of the result.
+///    Bits [31:24] are written to bits [55:48] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result.
+///    Bits [15:8] are written to bits [31:24] of the result.
+///    Bits [23:16] are written to bits [47:40] of the result.
+///    Bits [31:24] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PUNPCKLWD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result.
+///    Bits [31:16] are written to bits [47:32] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result.
+///    Bits [31:16] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PUNPCKLDQ instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Adds each 8-bit integer element of the first 64-bit integer vector
+///    of [8 x i8] to the corresponding 8-bit integer element of the second
+///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
+///    packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
+///    parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Adds each 16-bit integer element of the first 64-bit integer vector
+///    of [4 x i16] to the corresponding 16-bit integer element of the second
+///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
+///    packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
+///    parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Adds each 32-bit integer element of the first 64-bit integer vector
+///    of [2 x i32] to the corresponding 32-bit integer element of the second
+///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
+///    packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
+///    parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Adds each 8-bit signed integer element of the first 64-bit integer
+///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
+///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
+///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
+///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
+///    of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Adds each 16-bit signed integer element of the first 64-bit integer
+///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
+///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
+///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
+///    saturated to 0x8000. The results are packed into a 64-bit integer vector
+///    of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
+///    of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
+///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
+///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
+///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
+///    [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDUSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    unsigned sums of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pu8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
+///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
+///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
+///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PADDUSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    unsigned sums of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Subtracts each 8-bit integer element of the second 64-bit integer
+///    vector of [8 x i8] from the corresponding 8-bit integer element of the
+///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
+///    are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
+///    both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Subtracts each 16-bit integer element of the second 64-bit integer
+///    vector of [4 x i16] from the corresponding 16-bit integer element of the
+///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
+///    results are packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
+///    both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Subtracts each 32-bit integer element of the second 64-bit integer
+///    vector of [2 x i32] from the corresponding 32-bit integer element of the
+///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
+///    results are packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
+/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
+///    both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Subtracts each 8-bit signed integer element of the second 64-bit
+///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
+///    element of the first 64-bit integer vector of [8 x i8]. Positive results
+///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
+///    are saturated to 0x80. The results are packed into a 64-bit integer
+///    vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    differences of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Subtracts each 16-bit signed integer element of the second 64-bit
+///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
+///    element of the first 64-bit integer vector of [4 x i16]. Positive results
+///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
+///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    differences of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
+///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
+///    element of the first 64-bit integer vector of [8 x i8]. If an element of
+///    the first vector is less than the corresponding element of the second
+///    vector, the result is saturated to 0. The results are packed into a
+///    64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBUSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    differences of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pu8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
+///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
+///    integer element of the first 64-bit integer vector of [4 x i16]. If an
+///    element of the first vector is less than the corresponding element of the
+///    second vector, the result is saturated to 0. The results are packed into
+///    a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBUSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    differences of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16] to get four
+///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
+///    The lower 32 bits of these two sums are packed into a 64-bit integer
+///    vector of [2 x i32]. For example, bits [15:0] of both parameters are
+///    multiplied, bits [31:16] of both parameters are multiplied, and the sum
+///    of both results is written to bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMADDWD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
+///    products of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_madd_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
+///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULHW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
+///    of the products of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
+///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULLW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
+///    of the products of both parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Left-shifts each 16-bit signed integer element of the first
+///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
+///    of bits specified by the second parameter, which is a 64-bit integer. The
+///    lower 16 bits of the results are packed into a 64-bit integer vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSLLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+///    values. If __count is greater than or equal to 16, the result is all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
 }
 
+/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
+///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    The lower 16 bits of the results are packed into a 64-bit integer vector
+///    of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSLLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+///    values. If __count is greater than or equal to 16, the result is all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
 }
 
+/// \brief Left-shifts each 32-bit signed integer element of the first
+///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
+///    of bits specified by the second parameter, which is a 64-bit integer. The
+///    lower 32 bits of the results are packed into a 64-bit integer vector of
+///    [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSLLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+///    values. If __count is greater than or equal to 32, the result is all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
 }
 
+/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
+///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    The lower 32 bits of the results are packed into a 64-bit integer vector
+///    of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSLLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+///    values. If __count is greater than or equal to 32, the result is all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
 }
 
+/// \brief Left-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter. The lower 64 bits of
+///    the result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSLLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+///     __count is greater than or equal to 64, the result is set to 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllq(__m, __count);
+    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
 }
 
+/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
+///    number of bits specified by the second parameter, which is a 32-bit
+///    integer. The lower 64 bits of the result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSLLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+///     __count is greater than or equal to 64, the result is set to 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllqi(__m, __count);
+    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [4 x i16], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are filled with the sign bit of the initial value of each 16-bit
+///    element. The 16-bit results are packed into a 64-bit integer vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRAW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    High-order bits are filled with the sign bit of the initial value of each
+///    16-bit element. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRAW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srai_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [2 x i32], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are filled with the sign bit of the initial value of each 32-bit
+///    element. The 32-bit results are packed into a 64-bit integer vector of
+///    [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRAD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    High-order bits are filled with the sign bit of the initial value of each
+///    32-bit element. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRAD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srai_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [4 x i16], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are cleared. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [2 x i32], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are cleared. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
+///    integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter. High-order bits are
+///    cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlq(__m, __count);
+    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
 }
 
+/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
+///    number of bits specified by the second parameter, which is a 32-bit
+///    integer. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSRLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlqi(__m, __count);
+    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
 }
 
+/// \brief Performs a bitwise AND of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PAND instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of both
+///    parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pand(__m1, __m2);
+    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
 }
 
+/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
+///    performs a bitwise AND of the intermediate result and the second 64-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PANDN instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector. The one's complement of this parameter is used
+///    in the bitwise AND.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of the second
+///    parameter and the one's complement of the first parameter.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pandn(__m1, __m2);
+    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
 }
 
+/// \brief Performs a bitwise OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c POR instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise OR of both
+///    parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_por(__m1, __m2);
+    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
 }
 
+/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PXOR instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
+///    parameters.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pxor(__m1, __m2);
+    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
 }
 
+/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+///    [8 x i8] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PCMPEQB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+///    results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+///    [4 x i16] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PCMPEQW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+///    results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+///    [2 x i32] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PCMPEQD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+///    results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+///    [8 x i8] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PCMPGTB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+///    results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+///    [4 x i16] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PCMPGTW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+///    results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+///    [2 x i32] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PCMPGTD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+///    results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Constructs a 64-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+///
+/// \returns An initialized 64-bit integer vector with all elements set to zero.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setzero_si64(void)
 {
     return (__m64){ 0LL };
 }
 
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i1
+///    A 32-bit integer value used to initialize the upper 32 bits of the
+///    result.
+/// \param __i0
+///    A 32-bit integer value used to initialize the lower 32 bits of the
+///    result.
+/// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi32(int __i1, int __i0)
 {
     return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
 }
 
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __s3
+///    A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \param __s2
+///    A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __s1
+///    A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __s0
+///    A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
     return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
 }
 
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b7
+///    An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \param __b6
+///    An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b5
+///    An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b4
+///    An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b3
+///    An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b2
+///    An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b1
+///    An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b0
+///    An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
@@ -401,36 +1350,129 @@
                                                __b4, __b5, __b6, __b7);
 }
 
+/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
+///    32-bit integer vector elements set to the specified 32-bit integer
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and maps to no specific instruction.
+///
+/// \param __i
+///    A 32-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [2 x i32].
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi32(int __i)
 {
     return _mm_set_pi32(__i, __i);
 }
 
+/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
+///    16-bit integer vector elements set to the specified 16-bit integer
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and maps to no specific instruction.
+///
+/// \param __w
+///    A 16-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [4 x i16].
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi16(short __w)
 {
     return _mm_set_pi16(__w, __w, __w, __w);
 }
 
+/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
+///    8-bit integer vector elements set to the specified 8-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b
+///    An 8-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [8 x i8].
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi8(char __b)
 {
     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
 
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize the lower 32 bits of the
+///    result.
+/// \param __i1
+///    A 32-bit integer value used to initialize the upper 32 bits of the
+///    result.
+/// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi32(int __i0, int __i1)
 {
     return _mm_set_pi32(__i1, __i0);
 }
 
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
     return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
 
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap
index 5acabc9..4654b3d 100644
--- a/lib/Headers/module.modulemap
+++ b/lib/Headers/module.modulemap
@@ -60,6 +60,7 @@
     textual header "lzcntintrin.h"
     textual header "xopintrin.h"
     textual header "fma4intrin.h"
+    textual header "mwaitxintrin.h"
 
     explicit module mm_malloc {
       header "mm_malloc.h"
@@ -67,6 +68,7 @@
     }
 
     explicit module cpuid {
+      requires gnuinlineasm
       header "cpuid.h"
     }
 
@@ -75,6 +77,7 @@
     }
 
     explicit module sse {
+      export mm_malloc
       export mmx
       export sse2 // note: for hackish <emmintrin.h> dependency
       header "xmmintrin.h"
@@ -155,3 +158,8 @@
 module _Builtin_stddef_max_align_t [system] [extern_c] {
   header "__stddef_max_align_t.h"
 }
+
+module opencl_c {
+  requires opencl
+  header "opencl-c.h"
+}
diff --git a/lib/Headers/mwaitxintrin.h b/lib/Headers/mwaitxintrin.h
new file mode 100644
index 0000000..635f2ac
--- /dev/null
+++ b/lib/Headers/mwaitxintrin.h
@@ -0,0 +1,47 @@
+/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MWAITXINTRIN_H
+#define _MWAITXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitorx((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
+{
+  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MWAITXINTRIN_H */
diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h
new file mode 100644
index 0000000..ddd2cde
--- /dev/null
+++ b/lib/Headers/opencl-c.h
@@ -0,0 +1,17041 @@
+//===--- opencl-c.h - OpenCL C language builtin function header -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OPENCL_H_
+#define _OPENCL_H_
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#ifndef cl_khr_depth_images
+#define cl_khr_depth_images
+#endif //cl_khr_depth_images
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#define __ovld __attribute__((overloadable))
+
+// Optimizations
+#define __purefn __attribute__((pure))
+#define __cnfn __attribute__((const))
+
+// built-in scalar data types:
+
+/**
+ * An unsigned 8-bit integer.
+ */
+typedef unsigned char uchar;
+
+/**
+ * An unsigned 16-bit integer.
+ */
+typedef unsigned short ushort;
+
+/**
+ * An unsigned 32-bit integer.
+ */
+typedef unsigned int uint;
+
+/**
+ * An unsigned 64-bit integer.
+ */
+typedef unsigned long ulong;
+
+/**
+ * The unsigned integer type of the result of the sizeof operator. This
+ * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS (defined in
+ * table 4.3 of the OpenCL specification) is 32 bits, and a 64-bit
+ * unsigned integer if CL_DEVICE_ADDRESS_BITS is 64 bits.
+ */
+typedef __SIZE_TYPE__ size_t;
+
+/**
+ * A signed integer type that is the result of subtracting two pointers.
+ * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS (defined in
+ * table 4.3 of the OpenCL specification) is 32 bits, and a 64-bit signed
+ * integer if CL_DEVICE_ADDRESS_BITS is 64 bits.
+ */
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+/**
+ * A signed integer type with the property that any valid pointer to
+ * void can be converted to this type, then converted back to pointer
+ * to void, and the result will compare equal to the original pointer.
+ */
+typedef __INTPTR_TYPE__ intptr_t;
+
+/**
+ * An unsigned integer type with the property that any valid pointer to
+ * void can be converted to this type, then converted back to pointer
+ * to void, and the result will compare equal to the original pointer.
+ */
+typedef __UINTPTR_TYPE__ uintptr_t;
+
+// built-in vector data types:
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef char char8 __attribute__((ext_vector_type(8)));
+typedef char char16 __attribute__((ext_vector_type(16)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar8 __attribute__((ext_vector_type(8)));
+typedef uchar uchar16 __attribute__((ext_vector_type(16)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef short short8 __attribute__((ext_vector_type(8)));
+typedef short short16 __attribute__((ext_vector_type(16)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort8 __attribute__((ext_vector_type(8)));
+typedef ushort ushort16 __attribute__((ext_vector_type(16)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef uint uint8 __attribute__((ext_vector_type(8)));
+typedef uint uint16 __attribute__((ext_vector_type(16)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+typedef long long8 __attribute__((ext_vector_type(8)));
+typedef long long16 __attribute__((ext_vector_type(16)));
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
+typedef ulong ulong3 __attribute__((ext_vector_type(3)));
+typedef ulong ulong4 __attribute__((ext_vector_type(4)));
+typedef ulong ulong8 __attribute__((ext_vector_type(8)));
+typedef ulong ulong16 __attribute__((ext_vector_type(16)));
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float8 __attribute__((ext_vector_type(8)));
+typedef float float16 __attribute__((ext_vector_type(16)));
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+typedef half half2 __attribute__((ext_vector_type(2)));
+typedef half half3 __attribute__((ext_vector_type(3)));
+typedef half half4 __attribute__((ext_vector_type(4)));
+typedef half half8 __attribute__((ext_vector_type(8)));
+typedef half half16 __attribute__((ext_vector_type(16)));
+#endif
+#ifdef cl_khr_fp64
+#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+typedef double double2 __attribute__((ext_vector_type(2)));
+typedef double double3 __attribute__((ext_vector_type(3)));
+typedef double double4 __attribute__((ext_vector_type(4)));
+typedef double double8 __attribute__((ext_vector_type(8)));
+typedef double double16 __attribute__((ext_vector_type(16)));
+#endif
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define NULL ((void*)0)
+#endif
+
+/**
+ * Value of maximum non-infinite single-precision floating-point
+ * number.
+ */
+#define MAXFLOAT 0x1.fffffep127f
+
+/**
+ * A positive float constant expression. HUGE_VALF evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VALF (__builtin_huge_valf())
+
+/**
+ * A positive double constant expression. HUGE_VAL evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VAL (__builtin_huge_val())
+
+/**
+ * A constant expression of type float representing positive or
+ * unsigned infinity.
+ */
+#define INFINITY (__builtin_inff())
+
+/**
+ * A constant expression of type float representing a quiet NaN.
+ */
+#define NAN as_float(INT_MAX)
+
+#define FP_ILOGB0    INT_MIN
+#define FP_ILOGBNAN    INT_MAX
+
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define M_E_F         2.71828182845904523536028747135266250f
+#define M_LOG2E_F     1.44269504088896340735992468100189214f
+#define M_LOG10E_F    0.434294481903251827651128918916605082f
+#define M_LN2_F       0.693147180559945309417232121458176568f
+#define M_LN10_F      2.30258509299404568401799145468436421f
+#define M_PI_F        3.14159265358979323846264338327950288f
+#define M_PI_2_F      1.57079632679489661923132169163975144f
+#define M_PI_4_F      0.785398163397448309615660845819875721f
+#define M_1_PI_F      0.318309886183790671537767526745028724f
+#define M_2_PI_F      0.636619772367581343075535053490057448f
+#define M_2_SQRTPI_F  1.12837916709551257389615890312154517f
+#define M_SQRT2_F     1.41421356237309504880168872420969808f
+#define M_SQRT1_2_F   0.707106781186547524400844362104849039f
+
+#define DBL_DIG 15
+#define DBL_MANT_DIG 53
+#define DBL_MAX_10_EXP +308
+#define DBL_MAX_EXP +1024
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP -1021
+#define DBL_RADIX 2
+#define DBL_MAX 0x1.fffffffffffffp1023
+#define DBL_MIN 0x1.0p-1022
+#define DBL_EPSILON 0x1.0p-52
+
+#define M_E           0x1.5bf0a8b145769p+1
+#define M_LOG2E       0x1.71547652b82fep+0
+#define M_LOG10E      0x1.bcb7b1526e50ep-2
+#define M_LN2         0x1.62e42fefa39efp-1
+#define M_LN10        0x1.26bb1bbb55516p+1
+#define M_PI          0x1.921fb54442d18p+1
+#define M_PI_2        0x1.921fb54442d18p+0
+#define M_PI_4        0x1.921fb54442d18p-1
+#define M_1_PI        0x1.45f306dc9c883p-2
+#define M_2_PI        0x1.45f306dc9c883p-1
+#define M_2_SQRTPI    0x1.20dd750429b6dp+0
+#define M_SQRT2       0x1.6a09e667f3bcdp+0
+#define M_SQRT1_2     0x1.6a09e667f3bcdp-1
+
+#ifdef cl_khr_fp16
+
+#define HALF_DIG 3
+#define HALF_MANT_DIG 11
+#define HALF_MAX_10_EXP +4
+#define HALF_MAX_EXP +16
+#define HALF_MIN_10_EXP -4
+#define HALF_MIN_EXP -13
+#define HALF_RADIX 2
+#define HALF_MAX ((0x1.ffcp15h))
+#define HALF_MIN ((0x1.0p-14h))
+#define HALF_EPSILON ((0x1.0p-10h))
+
+#define M_E_H         2.71828182845904523536028747135266250h
+#define M_LOG2E_H     1.44269504088896340735992468100189214h
+#define M_LOG10E_H    0.434294481903251827651128918916605082h
+#define M_LN2_H       0.693147180559945309417232121458176568h
+#define M_LN10_H      2.30258509299404568401799145468436421h
+#define M_PI_H        3.14159265358979323846264338327950288h
+#define M_PI_2_H      1.57079632679489661923132169163975144h
+#define M_PI_4_H      0.785398163397448309615660845819875721h
+#define M_1_PI_H      0.318309886183790671537767526745028724h
+#define M_2_PI_H      0.636619772367581343075535053490057448h
+#define M_2_SQRTPI_H  1.12837916709551257389615890312154517h
+#define M_SQRT2_H     1.41421356237309504880168872420969808h
+#define M_SQRT1_2_H   0.707106781186547524400844362104849039h
+
+#endif //cl_khr_fp16
+
+#define CHAR_BIT    8
+#define SCHAR_MAX  127
+#define SCHAR_MIN  (-128)
+#define UCHAR_MAX  255
+#define CHAR_MAX  SCHAR_MAX
+#define CHAR_MIN  SCHAR_MIN
+#define USHRT_MAX  65535
+#define SHRT_MAX  32767
+#define SHRT_MIN  (-32768)
+#define UINT_MAX  0xffffffff
+#define INT_MAX    2147483647
+#define INT_MIN    (-2147483647-1)
+#define ULONG_MAX  0xffffffffffffffffUL
+#define LONG_MAX  0x7fffffffffffffffL
+#define LONG_MIN  (-0x7fffffffffffffffL-1)
+
+// OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions
+
+char __ovld __cnfn convert_char_rte(char);
+char __ovld __cnfn convert_char_sat_rte(char);
+char __ovld __cnfn convert_char_rtz(char);
+char __ovld __cnfn convert_char_sat_rtz(char);
+char __ovld __cnfn convert_char_rtp(char);
+char __ovld __cnfn convert_char_sat_rtp(char);
+char __ovld __cnfn convert_char_rtn(char);
+char __ovld __cnfn convert_char_sat_rtn(char);
+char __ovld __cnfn convert_char(char);
+char __ovld __cnfn convert_char_sat(char);
+char __ovld __cnfn convert_char_rte(uchar);
+char __ovld __cnfn convert_char_sat_rte(uchar);
+char __ovld __cnfn convert_char_rtz(uchar);
+char __ovld __cnfn convert_char_sat_rtz(uchar);
+char __ovld __cnfn convert_char_rtp(uchar);
+char __ovld __cnfn convert_char_sat_rtp(uchar);
+char __ovld __cnfn convert_char_rtn(uchar);
+char __ovld __cnfn convert_char_sat_rtn(uchar);
+char __ovld __cnfn convert_char(uchar);
+char __ovld __cnfn convert_char_sat(uchar);
+char __ovld __cnfn convert_char_rte(short);
+char __ovld __cnfn convert_char_sat_rte(short);
+char __ovld __cnfn convert_char_rtz(short);
+char __ovld __cnfn convert_char_sat_rtz(short);
+char __ovld __cnfn convert_char_rtp(short);
+char __ovld __cnfn convert_char_sat_rtp(short);
+char __ovld __cnfn convert_char_rtn(short);
+char __ovld __cnfn convert_char_sat_rtn(short);
+char __ovld __cnfn convert_char(short);
+char __ovld __cnfn convert_char_sat(short);
+char __ovld __cnfn convert_char_rte(ushort);
+char __ovld __cnfn convert_char_sat_rte(ushort);
+char __ovld __cnfn convert_char_rtz(ushort);
+char __ovld __cnfn convert_char_sat_rtz(ushort);
+char __ovld __cnfn convert_char_rtp(ushort);
+char __ovld __cnfn convert_char_sat_rtp(ushort);
+char __ovld __cnfn convert_char_rtn(ushort);
+char __ovld __cnfn convert_char_sat_rtn(ushort);
+char __ovld __cnfn convert_char(ushort);
+char __ovld __cnfn convert_char_sat(ushort);
+char __ovld __cnfn convert_char_rte(int);
+char __ovld __cnfn convert_char_sat_rte(int);
+char __ovld __cnfn convert_char_rtz(int);
+char __ovld __cnfn convert_char_sat_rtz(int);
+char __ovld __cnfn convert_char_rtp(int);
+char __ovld __cnfn convert_char_sat_rtp(int);
+char __ovld __cnfn convert_char_rtn(int);
+char __ovld __cnfn convert_char_sat_rtn(int);
+char __ovld __cnfn convert_char(int);
+char __ovld __cnfn convert_char_sat(int);
+char __ovld __cnfn convert_char_rte(uint);
+char __ovld __cnfn convert_char_sat_rte(uint);
+char __ovld __cnfn convert_char_rtz(uint);
+char __ovld __cnfn convert_char_sat_rtz(uint);
+char __ovld __cnfn convert_char_rtp(uint);
+char __ovld __cnfn convert_char_sat_rtp(uint);
+char __ovld __cnfn convert_char_rtn(uint);
+char __ovld __cnfn convert_char_sat_rtn(uint);
+char __ovld __cnfn convert_char(uint);
+char __ovld __cnfn convert_char_sat(uint);
+char __ovld __cnfn convert_char_rte(long);
+char __ovld __cnfn convert_char_sat_rte(long);
+char __ovld __cnfn convert_char_rtz(long);
+char __ovld __cnfn convert_char_sat_rtz(long);
+char __ovld __cnfn convert_char_rtp(long);
+char __ovld __cnfn convert_char_sat_rtp(long);
+char __ovld __cnfn convert_char_rtn(long);
+char __ovld __cnfn convert_char_sat_rtn(long);
+char __ovld __cnfn convert_char(long);
+char __ovld __cnfn convert_char_sat(long);
+char __ovld __cnfn convert_char_rte(ulong);
+char __ovld __cnfn convert_char_sat_rte(ulong);
+char __ovld __cnfn convert_char_rtz(ulong);
+char __ovld __cnfn convert_char_sat_rtz(ulong);
+char __ovld __cnfn convert_char_rtp(ulong);
+char __ovld __cnfn convert_char_sat_rtp(ulong);
+char __ovld __cnfn convert_char_rtn(ulong);
+char __ovld __cnfn convert_char_sat_rtn(ulong);
+char __ovld __cnfn convert_char(ulong);
+char __ovld __cnfn convert_char_sat(ulong);
+char __ovld __cnfn convert_char_rte(float);
+char __ovld __cnfn convert_char_sat_rte(float);
+char __ovld __cnfn convert_char_rtz(float);
+char __ovld __cnfn convert_char_sat_rtz(float);
+char __ovld __cnfn convert_char_rtp(float);
+char __ovld __cnfn convert_char_sat_rtp(float);
+char __ovld __cnfn convert_char_rtn(float);
+char __ovld __cnfn convert_char_sat_rtn(float);
+char __ovld __cnfn convert_char(float);
+char __ovld __cnfn convert_char_sat(float);
+uchar __ovld __cnfn convert_uchar_rte(char);
+uchar __ovld __cnfn convert_uchar_sat_rte(char);
+uchar __ovld __cnfn convert_uchar_rtz(char);
+uchar __ovld __cnfn convert_uchar_sat_rtz(char);
+uchar __ovld __cnfn convert_uchar_rtp(char);
+uchar __ovld __cnfn convert_uchar_sat_rtp(char);
+uchar __ovld __cnfn convert_uchar_rtn(char);
+uchar __ovld __cnfn convert_uchar_sat_rtn(char);
+uchar __ovld __cnfn convert_uchar(char);
+uchar __ovld __cnfn convert_uchar_sat(char);
+uchar __ovld __cnfn convert_uchar_rte(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rte(uchar);
+uchar __ovld __cnfn convert_uchar_rtz(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtz(uchar);
+uchar __ovld __cnfn convert_uchar_rtp(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtp(uchar);
+uchar __ovld __cnfn convert_uchar_rtn(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtn(uchar);
+uchar __ovld __cnfn convert_uchar(uchar);
+uchar __ovld __cnfn convert_uchar_sat(uchar);
+uchar __ovld __cnfn convert_uchar_rte(short);
+uchar __ovld __cnfn convert_uchar_sat_rte(short);
+uchar __ovld __cnfn convert_uchar_rtz(short);
+uchar __ovld __cnfn convert_uchar_sat_rtz(short);
+uchar __ovld __cnfn convert_uchar_rtp(short);
+uchar __ovld __cnfn convert_uchar_sat_rtp(short);
+uchar __ovld __cnfn convert_uchar_rtn(short);
+uchar __ovld __cnfn convert_uchar_sat_rtn(short);
+uchar __ovld __cnfn convert_uchar(short);
+uchar __ovld __cnfn convert_uchar_sat(short);
+uchar __ovld __cnfn convert_uchar_rte(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rte(ushort);
+uchar __ovld __cnfn convert_uchar_rtz(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtz(ushort);
+uchar __ovld __cnfn convert_uchar_rtp(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtp(ushort);
+uchar __ovld __cnfn convert_uchar_rtn(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtn(ushort);
+uchar __ovld __cnfn convert_uchar(ushort);
+uchar __ovld __cnfn convert_uchar_sat(ushort);
+uchar __ovld __cnfn convert_uchar_rte(int);
+uchar __ovld __cnfn convert_uchar_sat_rte(int);
+uchar __ovld __cnfn convert_uchar_rtz(int);
+uchar __ovld __cnfn convert_uchar_sat_rtz(int);
+uchar __ovld __cnfn convert_uchar_rtp(int);
+uchar __ovld __cnfn convert_uchar_sat_rtp(int);
+uchar __ovld __cnfn convert_uchar_rtn(int);
+uchar __ovld __cnfn convert_uchar_sat_rtn(int);
+uchar __ovld __cnfn convert_uchar(int);
+uchar __ovld __cnfn convert_uchar_sat(int);
+uchar __ovld __cnfn convert_uchar_rte(uint);
+uchar __ovld __cnfn convert_uchar_sat_rte(uint);
+uchar __ovld __cnfn convert_uchar_rtz(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtz(uint);
+uchar __ovld __cnfn convert_uchar_rtp(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtp(uint);
+uchar __ovld __cnfn convert_uchar_rtn(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtn(uint);
+uchar __ovld __cnfn convert_uchar(uint);
+uchar __ovld __cnfn convert_uchar_sat(uint);
+uchar __ovld __cnfn convert_uchar_rte(long);
+uchar __ovld __cnfn convert_uchar_sat_rte(long);
+uchar __ovld __cnfn convert_uchar_rtz(long);
+uchar __ovld __cnfn convert_uchar_sat_rtz(long);
+uchar __ovld __cnfn convert_uchar_rtp(long);
+uchar __ovld __cnfn convert_uchar_sat_rtp(long);
+uchar __ovld __cnfn convert_uchar_rtn(long);
+uchar __ovld __cnfn convert_uchar_sat_rtn(long);
+uchar __ovld __cnfn convert_uchar(long);
+uchar __ovld __cnfn convert_uchar_sat(long);
+uchar __ovld __cnfn convert_uchar_rte(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rte(ulong);
+uchar __ovld __cnfn convert_uchar_rtz(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtz(ulong);
+uchar __ovld __cnfn convert_uchar_rtp(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtp(ulong);
+uchar __ovld __cnfn convert_uchar_rtn(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtn(ulong);
+uchar __ovld __cnfn convert_uchar(ulong);
+uchar __ovld __cnfn convert_uchar_sat(ulong);
+uchar __ovld __cnfn convert_uchar_rte(float);
+uchar __ovld __cnfn convert_uchar_sat_rte(float);
+uchar __ovld __cnfn convert_uchar_rtz(float);
+uchar __ovld __cnfn convert_uchar_sat_rtz(float);
+uchar __ovld __cnfn convert_uchar_rtp(float);
+uchar __ovld __cnfn convert_uchar_sat_rtp(float);
+uchar __ovld __cnfn convert_uchar_rtn(float);
+uchar __ovld __cnfn convert_uchar_sat_rtn(float);
+uchar __ovld __cnfn convert_uchar(float);
+uchar __ovld __cnfn convert_uchar_sat(float);
+
+short __ovld __cnfn convert_short_rte(char);
+short __ovld __cnfn convert_short_sat_rte(char);
+short __ovld __cnfn convert_short_rtz(char);
+short __ovld __cnfn convert_short_sat_rtz(char);
+short __ovld __cnfn convert_short_rtp(char);
+short __ovld __cnfn convert_short_sat_rtp(char);
+short __ovld __cnfn convert_short_rtn(char);
+short __ovld __cnfn convert_short_sat_rtn(char);
+short __ovld __cnfn convert_short(char);
+short __ovld __cnfn convert_short_sat(char);
+short __ovld __cnfn convert_short_rte(uchar);
+short __ovld __cnfn convert_short_sat_rte(uchar);
+short __ovld __cnfn convert_short_rtz(uchar);
+short __ovld __cnfn convert_short_sat_rtz(uchar);
+short __ovld __cnfn convert_short_rtp(uchar);
+short __ovld __cnfn convert_short_sat_rtp(uchar);
+short __ovld __cnfn convert_short_rtn(uchar);
+short __ovld __cnfn convert_short_sat_rtn(uchar);
+short __ovld __cnfn convert_short(uchar);
+short __ovld __cnfn convert_short_sat(uchar);
+short __ovld __cnfn convert_short_rte(short);
+short __ovld __cnfn convert_short_sat_rte(short);
+short __ovld __cnfn convert_short_rtz(short);
+short __ovld __cnfn convert_short_sat_rtz(short);
+short __ovld __cnfn convert_short_rtp(short);
+short __ovld __cnfn convert_short_sat_rtp(short);
+short __ovld __cnfn convert_short_rtn(short);
+short __ovld __cnfn convert_short_sat_rtn(short);
+short __ovld __cnfn convert_short(short);
+short __ovld __cnfn convert_short_sat(short);
+short __ovld __cnfn convert_short_rte(ushort);
+short __ovld __cnfn convert_short_sat_rte(ushort);
+short __ovld __cnfn convert_short_rtz(ushort);
+short __ovld __cnfn convert_short_sat_rtz(ushort);
+short __ovld __cnfn convert_short_rtp(ushort);
+short __ovld __cnfn convert_short_sat_rtp(ushort);
+short __ovld __cnfn convert_short_rtn(ushort);
+short __ovld __cnfn convert_short_sat_rtn(ushort);
+short __ovld __cnfn convert_short(ushort);
+short __ovld __cnfn convert_short_sat(ushort);
+short __ovld __cnfn convert_short_rte(int);
+short __ovld __cnfn convert_short_sat_rte(int);
+short __ovld __cnfn convert_short_rtz(int);
+short __ovld __cnfn convert_short_sat_rtz(int);
+short __ovld __cnfn convert_short_rtp(int);
+short __ovld __cnfn convert_short_sat_rtp(int);
+short __ovld __cnfn convert_short_rtn(int);
+short __ovld __cnfn convert_short_sat_rtn(int);
+short __ovld __cnfn convert_short(int);
+short __ovld __cnfn convert_short_sat(int);
+short __ovld __cnfn convert_short_rte(uint);
+short __ovld __cnfn convert_short_sat_rte(uint);
+short __ovld __cnfn convert_short_rtz(uint);
+short __ovld __cnfn convert_short_sat_rtz(uint);
+short __ovld __cnfn convert_short_rtp(uint);
+short __ovld __cnfn convert_short_sat_rtp(uint);
+short __ovld __cnfn convert_short_rtn(uint);
+short __ovld __cnfn convert_short_sat_rtn(uint);
+short __ovld __cnfn convert_short(uint);
+short __ovld __cnfn convert_short_sat(uint);
+short __ovld __cnfn convert_short_rte(long);
+short __ovld __cnfn convert_short_sat_rte(long);
+short __ovld __cnfn convert_short_rtz(long);
+short __ovld __cnfn convert_short_sat_rtz(long);
+short __ovld __cnfn convert_short_rtp(long);
+short __ovld __cnfn convert_short_sat_rtp(long);
+short __ovld __cnfn convert_short_rtn(long);
+short __ovld __cnfn convert_short_sat_rtn(long);
+short __ovld __cnfn convert_short(long);
+short __ovld __cnfn convert_short_sat(long);
+short __ovld __cnfn convert_short_rte(ulong);
+short __ovld __cnfn convert_short_sat_rte(ulong);
+short __ovld __cnfn convert_short_rtz(ulong);
+short __ovld __cnfn convert_short_sat_rtz(ulong);
+short __ovld __cnfn convert_short_rtp(ulong);
+short __ovld __cnfn convert_short_sat_rtp(ulong);
+short __ovld __cnfn convert_short_rtn(ulong);
+short __ovld __cnfn convert_short_sat_rtn(ulong);
+short __ovld __cnfn convert_short(ulong);
+short __ovld __cnfn convert_short_sat(ulong);
+short __ovld __cnfn convert_short_rte(float);
+short __ovld __cnfn convert_short_sat_rte(float);
+short __ovld __cnfn convert_short_rtz(float);
+short __ovld __cnfn convert_short_sat_rtz(float);
+short __ovld __cnfn convert_short_rtp(float);
+short __ovld __cnfn convert_short_sat_rtp(float);
+short __ovld __cnfn convert_short_rtn(float);
+short __ovld __cnfn convert_short_sat_rtn(float);
+short __ovld __cnfn convert_short(float);
+short __ovld __cnfn convert_short_sat(float);
+ushort __ovld __cnfn convert_ushort_rte(char);
+ushort __ovld __cnfn convert_ushort_sat_rte(char);
+ushort __ovld __cnfn convert_ushort_rtz(char);
+ushort __ovld __cnfn convert_ushort_sat_rtz(char);
+ushort __ovld __cnfn convert_ushort_rtp(char);
+ushort __ovld __cnfn convert_ushort_sat_rtp(char);
+ushort __ovld __cnfn convert_ushort_rtn(char);
+ushort __ovld __cnfn convert_ushort_sat_rtn(char);
+ushort __ovld __cnfn convert_ushort(char);
+ushort __ovld __cnfn convert_ushort_sat(char);
+ushort __ovld __cnfn convert_ushort_rte(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rte(uchar);
+ushort __ovld __cnfn convert_ushort_rtz(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtz(uchar);
+ushort __ovld __cnfn convert_ushort_rtp(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtp(uchar);
+ushort __ovld __cnfn convert_ushort_rtn(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtn(uchar);
+ushort __ovld __cnfn convert_ushort(uchar);
+ushort __ovld __cnfn convert_ushort_sat(uchar);
+ushort __ovld __cnfn convert_ushort_rte(short);
+ushort __ovld __cnfn convert_ushort_sat_rte(short);
+ushort __ovld __cnfn convert_ushort_rtz(short);
+ushort __ovld __cnfn convert_ushort_sat_rtz(short);
+ushort __ovld __cnfn convert_ushort_rtp(short);
+ushort __ovld __cnfn convert_ushort_sat_rtp(short);
+ushort __ovld __cnfn convert_ushort_rtn(short);
+ushort __ovld __cnfn convert_ushort_sat_rtn(short);
+ushort __ovld __cnfn convert_ushort(short);
+ushort __ovld __cnfn convert_ushort_sat(short);
+ushort __ovld __cnfn convert_ushort_rte(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rte(ushort);
+ushort __ovld __cnfn convert_ushort_rtz(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtz(ushort);
+ushort __ovld __cnfn convert_ushort_rtp(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtp(ushort);
+ushort __ovld __cnfn convert_ushort_rtn(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtn(ushort);
+ushort __ovld __cnfn convert_ushort(ushort);
+ushort __ovld __cnfn convert_ushort_sat(ushort);
+ushort __ovld __cnfn convert_ushort_rte(int);
+ushort __ovld __cnfn convert_ushort_sat_rte(int);
+ushort __ovld __cnfn convert_ushort_rtz(int);
+ushort __ovld __cnfn convert_ushort_sat_rtz(int);
+ushort __ovld __cnfn convert_ushort_rtp(int);
+ushort __ovld __cnfn convert_ushort_sat_rtp(int);
+ushort __ovld __cnfn convert_ushort_rtn(int);
+ushort __ovld __cnfn convert_ushort_sat_rtn(int);
+ushort __ovld __cnfn convert_ushort(int);
+ushort __ovld __cnfn convert_ushort_sat(int);
+ushort __ovld __cnfn convert_ushort_rte(uint);
+ushort __ovld __cnfn convert_ushort_sat_rte(uint);
+ushort __ovld __cnfn convert_ushort_rtz(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtz(uint);
+ushort __ovld __cnfn convert_ushort_rtp(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtp(uint);
+ushort __ovld __cnfn convert_ushort_rtn(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtn(uint);
+ushort __ovld __cnfn convert_ushort(uint);
+ushort __ovld __cnfn convert_ushort_sat(uint);
+ushort __ovld __cnfn convert_ushort_rte(long);
+ushort __ovld __cnfn convert_ushort_sat_rte(long);
+ushort __ovld __cnfn convert_ushort_rtz(long);
+ushort __ovld __cnfn convert_ushort_sat_rtz(long);
+ushort __ovld __cnfn convert_ushort_rtp(long);
+ushort __ovld __cnfn convert_ushort_sat_rtp(long);
+ushort __ovld __cnfn convert_ushort_rtn(long);
+ushort __ovld __cnfn convert_ushort_sat_rtn(long);
+ushort __ovld __cnfn convert_ushort(long);
+ushort __ovld __cnfn convert_ushort_sat(long);
+ushort __ovld __cnfn convert_ushort_rte(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rte(ulong);
+ushort __ovld __cnfn convert_ushort_rtz(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtz(ulong);
+ushort __ovld __cnfn convert_ushort_rtp(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtp(ulong);
+ushort __ovld __cnfn convert_ushort_rtn(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtn(ulong);
+ushort __ovld __cnfn convert_ushort(ulong);
+ushort __ovld __cnfn convert_ushort_sat(ulong);
+ushort __ovld __cnfn convert_ushort_rte(float);
+ushort __ovld __cnfn convert_ushort_sat_rte(float);
+ushort __ovld __cnfn convert_ushort_rtz(float);
+ushort __ovld __cnfn convert_ushort_sat_rtz(float);
+ushort __ovld __cnfn convert_ushort_rtp(float);
+ushort __ovld __cnfn convert_ushort_sat_rtp(float);
+ushort __ovld __cnfn convert_ushort_rtn(float);
+ushort __ovld __cnfn convert_ushort_sat_rtn(float);
+ushort __ovld __cnfn convert_ushort(float);
+ushort __ovld __cnfn convert_ushort_sat(float);
+int __ovld __cnfn convert_int_rte(char);
+int __ovld __cnfn convert_int_sat_rte(char);
+int __ovld __cnfn convert_int_rtz(char);
+int __ovld __cnfn convert_int_sat_rtz(char);
+int __ovld __cnfn convert_int_rtp(char);
+int __ovld __cnfn convert_int_sat_rtp(char);
+int __ovld __cnfn convert_int_rtn(char);
+int __ovld __cnfn convert_int_sat_rtn(char);
+int __ovld __cnfn convert_int(char);
+int __ovld __cnfn convert_int_sat(char);
+int __ovld __cnfn convert_int_rte(uchar);
+int __ovld __cnfn convert_int_sat_rte(uchar);
+int __ovld __cnfn convert_int_rtz(uchar);
+int __ovld __cnfn convert_int_sat_rtz(uchar);
+int __ovld __cnfn convert_int_rtp(uchar);
+int __ovld __cnfn convert_int_sat_rtp(uchar);
+int __ovld __cnfn convert_int_rtn(uchar);
+int __ovld __cnfn convert_int_sat_rtn(uchar);
+int __ovld __cnfn convert_int(uchar);
+int __ovld __cnfn convert_int_sat(uchar);
+int __ovld __cnfn convert_int_rte(short);
+int __ovld __cnfn convert_int_sat_rte(short);
+int __ovld __cnfn convert_int_rtz(short);
+int __ovld __cnfn convert_int_sat_rtz(short);
+int __ovld __cnfn convert_int_rtp(short);
+int __ovld __cnfn convert_int_sat_rtp(short);
+int __ovld __cnfn convert_int_rtn(short);
+int __ovld __cnfn convert_int_sat_rtn(short);
+int __ovld __cnfn convert_int(short);
+int __ovld __cnfn convert_int_sat(short);
+int __ovld __cnfn convert_int_rte(ushort);
+int __ovld __cnfn convert_int_sat_rte(ushort);
+int __ovld __cnfn convert_int_rtz(ushort);
+int __ovld __cnfn convert_int_sat_rtz(ushort);
+int __ovld __cnfn convert_int_rtp(ushort);
+int __ovld __cnfn convert_int_sat_rtp(ushort);
+int __ovld __cnfn convert_int_rtn(ushort);
+int __ovld __cnfn convert_int_sat_rtn(ushort);
+int __ovld __cnfn convert_int(ushort);
+int __ovld __cnfn convert_int_sat(ushort);
+int __ovld __cnfn convert_int_rte(int);
+int __ovld __cnfn convert_int_sat_rte(int);
+int __ovld __cnfn convert_int_rtz(int);
+int __ovld __cnfn convert_int_sat_rtz(int);
+int __ovld __cnfn convert_int_rtp(int);
+int __ovld __cnfn convert_int_sat_rtp(int);
+int __ovld __cnfn convert_int_rtn(int);
+int __ovld __cnfn convert_int_sat_rtn(int);
+int __ovld __cnfn convert_int(int);
+int __ovld __cnfn convert_int_sat(int);
+int __ovld __cnfn convert_int_rte(uint);
+int __ovld __cnfn convert_int_sat_rte(uint);
+int __ovld __cnfn convert_int_rtz(uint);
+int __ovld __cnfn convert_int_sat_rtz(uint);
+int __ovld __cnfn convert_int_rtp(uint);
+int __ovld __cnfn convert_int_sat_rtp(uint);
+int __ovld __cnfn convert_int_rtn(uint);
+int __ovld __cnfn convert_int_sat_rtn(uint);
+int __ovld __cnfn convert_int(uint);
+int __ovld __cnfn convert_int_sat(uint);
+int __ovld __cnfn convert_int_rte(long);
+int __ovld __cnfn convert_int_sat_rte(long);
+int __ovld __cnfn convert_int_rtz(long);
+int __ovld __cnfn convert_int_sat_rtz(long);
+int __ovld __cnfn convert_int_rtp(long);
+int __ovld __cnfn convert_int_sat_rtp(long);
+int __ovld __cnfn convert_int_rtn(long);
+int __ovld __cnfn convert_int_sat_rtn(long);
+int __ovld __cnfn convert_int(long);
+int __ovld __cnfn convert_int_sat(long);
+int __ovld __cnfn convert_int_rte(ulong);
+int __ovld __cnfn convert_int_sat_rte(ulong);
+int __ovld __cnfn convert_int_rtz(ulong);
+int __ovld __cnfn convert_int_sat_rtz(ulong);
+int __ovld __cnfn convert_int_rtp(ulong);
+int __ovld __cnfn convert_int_sat_rtp(ulong);
+int __ovld __cnfn convert_int_rtn(ulong);
+int __ovld __cnfn convert_int_sat_rtn(ulong);
+int __ovld __cnfn convert_int(ulong);
+int __ovld __cnfn convert_int_sat(ulong);
+int __ovld __cnfn convert_int_rte(float);
+int __ovld __cnfn convert_int_sat_rte(float);
+int __ovld __cnfn convert_int_rtz(float);
+int __ovld __cnfn convert_int_sat_rtz(float);
+int __ovld __cnfn convert_int_rtp(float);
+int __ovld __cnfn convert_int_sat_rtp(float);
+int __ovld __cnfn convert_int_rtn(float);
+int __ovld __cnfn convert_int_sat_rtn(float);
+int __ovld __cnfn convert_int(float);
+int __ovld __cnfn convert_int_sat(float);
+uint __ovld __cnfn convert_uint_rte(char);
+uint __ovld __cnfn convert_uint_sat_rte(char);
+uint __ovld __cnfn convert_uint_rtz(char);
+uint __ovld __cnfn convert_uint_sat_rtz(char);
+uint __ovld __cnfn convert_uint_rtp(char);
+uint __ovld __cnfn convert_uint_sat_rtp(char);
+uint __ovld __cnfn convert_uint_rtn(char);
+uint __ovld __cnfn convert_uint_sat_rtn(char);
+uint __ovld __cnfn convert_uint(char);
+uint __ovld __cnfn convert_uint_sat(char);
+uint __ovld __cnfn convert_uint_rte(uchar);
+uint __ovld __cnfn convert_uint_sat_rte(uchar);
+uint __ovld __cnfn convert_uint_rtz(uchar);
+uint __ovld __cnfn convert_uint_sat_rtz(uchar);
+uint __ovld __cnfn convert_uint_rtp(uchar);
+uint __ovld __cnfn convert_uint_sat_rtp(uchar);
+uint __ovld __cnfn convert_uint_rtn(uchar);
+uint __ovld __cnfn convert_uint_sat_rtn(uchar);
+uint __ovld __cnfn convert_uint(uchar);
+uint __ovld __cnfn convert_uint_sat(uchar);
+uint __ovld __cnfn convert_uint_rte(short);
+uint __ovld __cnfn convert_uint_sat_rte(short);
+uint __ovld __cnfn convert_uint_rtz(short);
+uint __ovld __cnfn convert_uint_sat_rtz(short);
+uint __ovld __cnfn convert_uint_rtp(short);
+uint __ovld __cnfn convert_uint_sat_rtp(short);
+uint __ovld __cnfn convert_uint_rtn(short);
+uint __ovld __cnfn convert_uint_sat_rtn(short);
+uint __ovld __cnfn convert_uint(short);
+uint __ovld __cnfn convert_uint_sat(short);
+uint __ovld __cnfn convert_uint_rte(ushort);
+uint __ovld __cnfn convert_uint_sat_rte(ushort);
+uint __ovld __cnfn convert_uint_rtz(ushort);
+uint __ovld __cnfn convert_uint_sat_rtz(ushort);
+uint __ovld __cnfn convert_uint_rtp(ushort);
+uint __ovld __cnfn convert_uint_sat_rtp(ushort);
+uint __ovld __cnfn convert_uint_rtn(ushort);
+uint __ovld __cnfn convert_uint_sat_rtn(ushort);
+uint __ovld __cnfn convert_uint(ushort);
+uint __ovld __cnfn convert_uint_sat(ushort);
+uint __ovld __cnfn convert_uint_rte(int);
+uint __ovld __cnfn convert_uint_sat_rte(int);
+uint __ovld __cnfn convert_uint_rtz(int);
+uint __ovld __cnfn convert_uint_sat_rtz(int);
+uint __ovld __cnfn convert_uint_rtp(int);
+uint __ovld __cnfn convert_uint_sat_rtp(int);
+uint __ovld __cnfn convert_uint_rtn(int);
+uint __ovld __cnfn convert_uint_sat_rtn(int);
+uint __ovld __cnfn convert_uint(int);
+uint __ovld __cnfn convert_uint_sat(int);
+uint __ovld __cnfn convert_uint_rte(uint);
+uint __ovld __cnfn convert_uint_sat_rte(uint);
+uint __ovld __cnfn convert_uint_rtz(uint);
+uint __ovld __cnfn convert_uint_sat_rtz(uint);
+uint __ovld __cnfn convert_uint_rtp(uint);
+uint __ovld __cnfn convert_uint_sat_rtp(uint);
+uint __ovld __cnfn convert_uint_rtn(uint);
+uint __ovld __cnfn convert_uint_sat_rtn(uint);
+uint __ovld __cnfn convert_uint(uint);
+uint __ovld __cnfn convert_uint_sat(uint);
+uint __ovld __cnfn convert_uint_rte(long);
+uint __ovld __cnfn convert_uint_sat_rte(long);
+uint __ovld __cnfn convert_uint_rtz(long);
+uint __ovld __cnfn convert_uint_sat_rtz(long);
+uint __ovld __cnfn convert_uint_rtp(long);
+uint __ovld __cnfn convert_uint_sat_rtp(long);
+uint __ovld __cnfn convert_uint_rtn(long);
+uint __ovld __cnfn convert_uint_sat_rtn(long);
+uint __ovld __cnfn convert_uint(long);
+uint __ovld __cnfn convert_uint_sat(long);
+uint __ovld __cnfn convert_uint_rte(ulong);
+uint __ovld __cnfn convert_uint_sat_rte(ulong);
+uint __ovld __cnfn convert_uint_rtz(ulong);
+uint __ovld __cnfn convert_uint_sat_rtz(ulong);
+uint __ovld __cnfn convert_uint_rtp(ulong);
+uint __ovld __cnfn convert_uint_sat_rtp(ulong);
+uint __ovld __cnfn convert_uint_rtn(ulong);
+uint __ovld __cnfn convert_uint_sat_rtn(ulong);
+uint __ovld __cnfn convert_uint(ulong);
+uint __ovld __cnfn convert_uint_sat(ulong);
+uint __ovld __cnfn convert_uint_rte(float);
+uint __ovld __cnfn convert_uint_sat_rte(float);
+uint __ovld __cnfn convert_uint_rtz(float);
+uint __ovld __cnfn convert_uint_sat_rtz(float);
+uint __ovld __cnfn convert_uint_rtp(float);
+uint __ovld __cnfn convert_uint_sat_rtp(float);
+uint __ovld __cnfn convert_uint_rtn(float);
+uint __ovld __cnfn convert_uint_sat_rtn(float);
+uint __ovld __cnfn convert_uint(float);
+uint __ovld __cnfn convert_uint_sat(float);
+long __ovld __cnfn convert_long_rte(char);
+long __ovld __cnfn convert_long_sat_rte(char);
+long __ovld __cnfn convert_long_rtz(char);
+long __ovld __cnfn convert_long_sat_rtz(char);
+long __ovld __cnfn convert_long_rtp(char);
+long __ovld __cnfn convert_long_sat_rtp(char);
+long __ovld __cnfn convert_long_rtn(char);
+long __ovld __cnfn convert_long_sat_rtn(char);
+long __ovld __cnfn convert_long(char);
+long __ovld __cnfn convert_long_sat(char);
+long __ovld __cnfn convert_long_rte(uchar);
+long __ovld __cnfn convert_long_sat_rte(uchar);
+long __ovld __cnfn convert_long_rtz(uchar);
+long __ovld __cnfn convert_long_sat_rtz(uchar);
+long __ovld __cnfn convert_long_rtp(uchar);
+long __ovld __cnfn convert_long_sat_rtp(uchar);
+long __ovld __cnfn convert_long_rtn(uchar);
+long __ovld __cnfn convert_long_sat_rtn(uchar);
+long __ovld __cnfn convert_long(uchar);
+long __ovld __cnfn convert_long_sat(uchar);
+long __ovld __cnfn convert_long_rte(short);
+long __ovld __cnfn convert_long_sat_rte(short);
+long __ovld __cnfn convert_long_rtz(short);
+long __ovld __cnfn convert_long_sat_rtz(short);
+long __ovld __cnfn convert_long_rtp(short);
+long __ovld __cnfn convert_long_sat_rtp(short);
+long __ovld __cnfn convert_long_rtn(short);
+long __ovld __cnfn convert_long_sat_rtn(short);
+long __ovld __cnfn convert_long(short);
+long __ovld __cnfn convert_long_sat(short);
+long __ovld __cnfn convert_long_rte(ushort);
+long __ovld __cnfn convert_long_sat_rte(ushort);
+long __ovld __cnfn convert_long_rtz(ushort);
+long __ovld __cnfn convert_long_sat_rtz(ushort);
+long __ovld __cnfn convert_long_rtp(ushort);
+long __ovld __cnfn convert_long_sat_rtp(ushort);
+long __ovld __cnfn convert_long_rtn(ushort);
+long __ovld __cnfn convert_long_sat_rtn(ushort);
+long __ovld __cnfn convert_long(ushort);
+long __ovld __cnfn convert_long_sat(ushort);
+long __ovld __cnfn convert_long_rte(int);
+long __ovld __cnfn convert_long_sat_rte(int);
+long __ovld __cnfn convert_long_rtz(int);
+long __ovld __cnfn convert_long_sat_rtz(int);
+long __ovld __cnfn convert_long_rtp(int);
+long __ovld __cnfn convert_long_sat_rtp(int);
+long __ovld __cnfn convert_long_rtn(int);
+long __ovld __cnfn convert_long_sat_rtn(int);
+long __ovld __cnfn convert_long(int);
+long __ovld __cnfn convert_long_sat(int);
+long __ovld __cnfn convert_long_rte(uint);
+long __ovld __cnfn convert_long_sat_rte(uint);
+long __ovld __cnfn convert_long_rtz(uint);
+long __ovld __cnfn convert_long_sat_rtz(uint);
+long __ovld __cnfn convert_long_rtp(uint);
+long __ovld __cnfn convert_long_sat_rtp(uint);
+long __ovld __cnfn convert_long_rtn(uint);
+long __ovld __cnfn convert_long_sat_rtn(uint);
+long __ovld __cnfn convert_long(uint);
+long __ovld __cnfn convert_long_sat(uint);
+long __ovld __cnfn convert_long_rte(long);
+long __ovld __cnfn convert_long_sat_rte(long);
+long __ovld __cnfn convert_long_rtz(long);
+long __ovld __cnfn convert_long_sat_rtz(long);
+long __ovld __cnfn convert_long_rtp(long);
+long __ovld __cnfn convert_long_sat_rtp(long);
+long __ovld __cnfn convert_long_rtn(long);
+long __ovld __cnfn convert_long_sat_rtn(long);
+long __ovld __cnfn convert_long(long);
+long __ovld __cnfn convert_long_sat(long);
+long __ovld __cnfn convert_long_rte(ulong);
+long __ovld __cnfn convert_long_sat_rte(ulong);
+long __ovld __cnfn convert_long_rtz(ulong);
+long __ovld __cnfn convert_long_sat_rtz(ulong);
+long __ovld __cnfn convert_long_rtp(ulong);
+long __ovld __cnfn convert_long_sat_rtp(ulong);
+long __ovld __cnfn convert_long_rtn(ulong);
+long __ovld __cnfn convert_long_sat_rtn(ulong);
+long __ovld __cnfn convert_long(ulong);
+long __ovld __cnfn convert_long_sat(ulong);
+long __ovld __cnfn convert_long_rte(float);
+long __ovld __cnfn convert_long_sat_rte(float);
+long __ovld __cnfn convert_long_rtz(float);
+long __ovld __cnfn convert_long_sat_rtz(float);
+long __ovld __cnfn convert_long_rtp(float);
+long __ovld __cnfn convert_long_sat_rtp(float);
+long __ovld __cnfn convert_long_rtn(float);
+long __ovld __cnfn convert_long_sat_rtn(float);
+long __ovld __cnfn convert_long(float);
+long __ovld __cnfn convert_long_sat(float);
+ulong __ovld __cnfn convert_ulong_rte(char);
+ulong __ovld __cnfn convert_ulong_sat_rte(char);
+ulong __ovld __cnfn convert_ulong_rtz(char);
+ulong __ovld __cnfn convert_ulong_sat_rtz(char);
+ulong __ovld __cnfn convert_ulong_rtp(char);
+ulong __ovld __cnfn convert_ulong_sat_rtp(char);
+ulong __ovld __cnfn convert_ulong_rtn(char);
+ulong __ovld __cnfn convert_ulong_sat_rtn(char);
+ulong __ovld __cnfn convert_ulong(char);
+ulong __ovld __cnfn convert_ulong_sat(char);
+ulong __ovld __cnfn convert_ulong_rte(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rte(uchar);
+ulong __ovld __cnfn convert_ulong_rtz(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtz(uchar);
+ulong __ovld __cnfn convert_ulong_rtp(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtp(uchar);
+ulong __ovld __cnfn convert_ulong_rtn(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtn(uchar);
+ulong __ovld __cnfn convert_ulong(uchar);
+ulong __ovld __cnfn convert_ulong_sat(uchar);
+ulong __ovld __cnfn convert_ulong_rte(short);
+ulong __ovld __cnfn convert_ulong_sat_rte(short);
+ulong __ovld __cnfn convert_ulong_rtz(short);
+ulong __ovld __cnfn convert_ulong_sat_rtz(short);
+ulong __ovld __cnfn convert_ulong_rtp(short);
+ulong __ovld __cnfn convert_ulong_sat_rtp(short);
+ulong __ovld __cnfn convert_ulong_rtn(short);
+ulong __ovld __cnfn convert_ulong_sat_rtn(short);
+ulong __ovld __cnfn convert_ulong(short);
+ulong __ovld __cnfn convert_ulong_sat(short);
+ulong __ovld __cnfn convert_ulong_rte(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rte(ushort);
+ulong __ovld __cnfn convert_ulong_rtz(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtz(ushort);
+ulong __ovld __cnfn convert_ulong_rtp(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtp(ushort);
+ulong __ovld __cnfn convert_ulong_rtn(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtn(ushort);
+ulong __ovld __cnfn convert_ulong(ushort);
+ulong __ovld __cnfn convert_ulong_sat(ushort);
+ulong __ovld __cnfn convert_ulong_rte(int);
+ulong __ovld __cnfn convert_ulong_sat_rte(int);
+ulong __ovld __cnfn convert_ulong_rtz(int);
+ulong __ovld __cnfn convert_ulong_sat_rtz(int);
+ulong __ovld __cnfn convert_ulong_rtp(int);
+ulong __ovld __cnfn convert_ulong_sat_rtp(int);
+ulong __ovld __cnfn convert_ulong_rtn(int);
+ulong __ovld __cnfn convert_ulong_sat_rtn(int);
+ulong __ovld __cnfn convert_ulong(int);
+ulong __ovld __cnfn convert_ulong_sat(int);
+ulong __ovld __cnfn convert_ulong_rte(uint);
+ulong __ovld __cnfn convert_ulong_sat_rte(uint);
+ulong __ovld __cnfn convert_ulong_rtz(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtz(uint);
+ulong __ovld __cnfn convert_ulong_rtp(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtp(uint);
+ulong __ovld __cnfn convert_ulong_rtn(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtn(uint);
+ulong __ovld __cnfn convert_ulong(uint);
+ulong __ovld __cnfn convert_ulong_sat(uint);
+ulong __ovld __cnfn convert_ulong_rte(long);
+ulong __ovld __cnfn convert_ulong_sat_rte(long);
+ulong __ovld __cnfn convert_ulong_rtz(long);
+ulong __ovld __cnfn convert_ulong_sat_rtz(long);
+ulong __ovld __cnfn convert_ulong_rtp(long);
+ulong __ovld __cnfn convert_ulong_sat_rtp(long);
+ulong __ovld __cnfn convert_ulong_rtn(long);
+ulong __ovld __cnfn convert_ulong_sat_rtn(long);
+ulong __ovld __cnfn convert_ulong(long);
+ulong __ovld __cnfn convert_ulong_sat(long);
+ulong __ovld __cnfn convert_ulong_rte(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rte(ulong);
+ulong __ovld __cnfn convert_ulong_rtz(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtz(ulong);
+ulong __ovld __cnfn convert_ulong_rtp(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtp(ulong);
+ulong __ovld __cnfn convert_ulong_rtn(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtn(ulong);
+ulong __ovld __cnfn convert_ulong(ulong);
+ulong __ovld __cnfn convert_ulong_sat(ulong);
+ulong __ovld __cnfn convert_ulong_rte(float);
+ulong __ovld __cnfn convert_ulong_sat_rte(float);
+ulong __ovld __cnfn convert_ulong_rtz(float);
+ulong __ovld __cnfn convert_ulong_sat_rtz(float);
+ulong __ovld __cnfn convert_ulong_rtp(float);
+ulong __ovld __cnfn convert_ulong_sat_rtp(float);
+ulong __ovld __cnfn convert_ulong_rtn(float);
+ulong __ovld __cnfn convert_ulong_sat_rtn(float);
+ulong __ovld __cnfn convert_ulong(float);
+ulong __ovld __cnfn convert_ulong_sat(float);
+float __ovld __cnfn convert_float_rte(char);
+float __ovld __cnfn convert_float_rtz(char);
+float __ovld __cnfn convert_float_rtp(char);
+float __ovld __cnfn convert_float_rtn(char);
+float __ovld __cnfn convert_float(char);
+float __ovld __cnfn convert_float_rte(uchar);
+float __ovld __cnfn convert_float_rtz(uchar);
+float __ovld __cnfn convert_float_rtp(uchar);
+float __ovld __cnfn convert_float_rtn(uchar);
+float __ovld __cnfn convert_float(uchar);
+float __ovld __cnfn convert_float_rte(short);
+float __ovld __cnfn convert_float_rtz(short);
+float __ovld __cnfn convert_float_rtp(short);
+float __ovld __cnfn convert_float_rtn(short);
+float __ovld __cnfn convert_float(short);
+float __ovld __cnfn convert_float_rte(ushort);
+float __ovld __cnfn convert_float_rtz(ushort);
+float __ovld __cnfn convert_float_rtp(ushort);
+float __ovld __cnfn convert_float_rtn(ushort);
+float __ovld __cnfn convert_float(ushort);
+float __ovld __cnfn convert_float_rte(int);
+float __ovld __cnfn convert_float_rtz(int);
+float __ovld __cnfn convert_float_rtp(int);
+float __ovld __cnfn convert_float_rtn(int);
+float __ovld __cnfn convert_float(int);
+float __ovld __cnfn convert_float_rte(uint);
+float __ovld __cnfn convert_float_rtz(uint);
+float __ovld __cnfn convert_float_rtp(uint);
+float __ovld __cnfn convert_float_rtn(uint);
+float __ovld __cnfn convert_float(uint);
+float __ovld __cnfn convert_float_rte(long);
+float __ovld __cnfn convert_float_rtz(long);
+float __ovld __cnfn convert_float_rtp(long);
+float __ovld __cnfn convert_float_rtn(long);
+float __ovld __cnfn convert_float(long);
+float __ovld __cnfn convert_float_rte(ulong);
+float __ovld __cnfn convert_float_rtz(ulong);
+float __ovld __cnfn convert_float_rtp(ulong);
+float __ovld __cnfn convert_float_rtn(ulong);
+float __ovld __cnfn convert_float(ulong);
+float __ovld __cnfn convert_float_rte(float);
+float __ovld __cnfn convert_float_rtz(float);
+float __ovld __cnfn convert_float_rtp(float);
+float __ovld __cnfn convert_float_rtn(float);
+float __ovld __cnfn convert_float(float);
+char2 __ovld __cnfn convert_char2_rte(char2);
+char2 __ovld __cnfn convert_char2_sat_rte(char2);
+char2 __ovld __cnfn convert_char2_rtz(char2);
+char2 __ovld __cnfn convert_char2_sat_rtz(char2);
+char2 __ovld __cnfn convert_char2_rtp(char2);
+char2 __ovld __cnfn convert_char2_sat_rtp(char2);
+char2 __ovld __cnfn convert_char2_rtn(char2);
+char2 __ovld __cnfn convert_char2_sat_rtn(char2);
+char2 __ovld __cnfn convert_char2(char2);
+char2 __ovld __cnfn convert_char2_sat(char2);
+char2 __ovld __cnfn convert_char2_rte(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rte(uchar2);
+char2 __ovld __cnfn convert_char2_rtz(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtz(uchar2);
+char2 __ovld __cnfn convert_char2_rtp(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtp(uchar2);
+char2 __ovld __cnfn convert_char2_rtn(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtn(uchar2);
+char2 __ovld __cnfn convert_char2(uchar2);
+char2 __ovld __cnfn convert_char2_sat(uchar2);
+char2 __ovld __cnfn convert_char2_rte(short2);
+char2 __ovld __cnfn convert_char2_sat_rte(short2);
+char2 __ovld __cnfn convert_char2_rtz(short2);
+char2 __ovld __cnfn convert_char2_sat_rtz(short2);
+char2 __ovld __cnfn convert_char2_rtp(short2);
+char2 __ovld __cnfn convert_char2_sat_rtp(short2);
+char2 __ovld __cnfn convert_char2_rtn(short2);
+char2 __ovld __cnfn convert_char2_sat_rtn(short2);
+char2 __ovld __cnfn convert_char2(short2);
+char2 __ovld __cnfn convert_char2_sat(short2);
+char2 __ovld __cnfn convert_char2_rte(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rte(ushort2);
+char2 __ovld __cnfn convert_char2_rtz(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtz(ushort2);
+char2 __ovld __cnfn convert_char2_rtp(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtp(ushort2);
+char2 __ovld __cnfn convert_char2_rtn(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtn(ushort2);
+char2 __ovld __cnfn convert_char2(ushort2);
+char2 __ovld __cnfn convert_char2_sat(ushort2);
+char2 __ovld __cnfn convert_char2_rte(int2);
+char2 __ovld __cnfn convert_char2_sat_rte(int2);
+char2 __ovld __cnfn convert_char2_rtz(int2);
+char2 __ovld __cnfn convert_char2_sat_rtz(int2);
+char2 __ovld __cnfn convert_char2_rtp(int2);
+char2 __ovld __cnfn convert_char2_sat_rtp(int2);
+char2 __ovld __cnfn convert_char2_rtn(int2);
+char2 __ovld __cnfn convert_char2_sat_rtn(int2);
+char2 __ovld __cnfn convert_char2(int2);
+char2 __ovld __cnfn convert_char2_sat(int2);
+char2 __ovld __cnfn convert_char2_rte(uint2);
+char2 __ovld __cnfn convert_char2_sat_rte(uint2);
+char2 __ovld __cnfn convert_char2_rtz(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtz(uint2);
+char2 __ovld __cnfn convert_char2_rtp(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtp(uint2);
+char2 __ovld __cnfn convert_char2_rtn(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtn(uint2);
+char2 __ovld __cnfn convert_char2(uint2);
+char2 __ovld __cnfn convert_char2_sat(uint2);
+char2 __ovld __cnfn convert_char2_rte(long2);
+char2 __ovld __cnfn convert_char2_sat_rte(long2);
+char2 __ovld __cnfn convert_char2_rtz(long2);
+char2 __ovld __cnfn convert_char2_sat_rtz(long2);
+char2 __ovld __cnfn convert_char2_rtp(long2);
+char2 __ovld __cnfn convert_char2_sat_rtp(long2);
+char2 __ovld __cnfn convert_char2_rtn(long2);
+char2 __ovld __cnfn convert_char2_sat_rtn(long2);
+char2 __ovld __cnfn convert_char2(long2);
+char2 __ovld __cnfn convert_char2_sat(long2);
+char2 __ovld __cnfn convert_char2_rte(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rte(ulong2);
+char2 __ovld __cnfn convert_char2_rtz(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtz(ulong2);
+char2 __ovld __cnfn convert_char2_rtp(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtp(ulong2);
+char2 __ovld __cnfn convert_char2_rtn(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtn(ulong2);
+char2 __ovld __cnfn convert_char2(ulong2);
+char2 __ovld __cnfn convert_char2_sat(ulong2);
+char2 __ovld __cnfn convert_char2_rte(float2);
+char2 __ovld __cnfn convert_char2_sat_rte(float2);
+char2 __ovld __cnfn convert_char2_rtz(float2);
+char2 __ovld __cnfn convert_char2_sat_rtz(float2);
+char2 __ovld __cnfn convert_char2_rtp(float2);
+char2 __ovld __cnfn convert_char2_sat_rtp(float2);
+char2 __ovld __cnfn convert_char2_rtn(float2);
+char2 __ovld __cnfn convert_char2_sat_rtn(float2);
+char2 __ovld __cnfn convert_char2(float2);
+char2 __ovld __cnfn convert_char2_sat(float2);
+uchar2 __ovld __cnfn convert_uchar2_rte(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(char2);
+uchar2 __ovld __cnfn convert_uchar2(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat(char2);
+uchar2 __ovld __cnfn convert_uchar2_rte(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uchar2);
+uchar2 __ovld __cnfn convert_uchar2(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rte(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(short2);
+uchar2 __ovld __cnfn convert_uchar2(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat(short2);
+uchar2 __ovld __cnfn convert_uchar2_rte(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ushort2);
+uchar2 __ovld __cnfn convert_uchar2(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rte(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(int2);
+uchar2 __ovld __cnfn convert_uchar2(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat(int2);
+uchar2 __ovld __cnfn convert_uchar2_rte(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uint2);
+uchar2 __ovld __cnfn convert_uchar2(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rte(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(long2);
+uchar2 __ovld __cnfn convert_uchar2(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat(long2);
+uchar2 __ovld __cnfn convert_uchar2_rte(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ulong2);
+uchar2 __ovld __cnfn convert_uchar2(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rte(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(float2);
+uchar2 __ovld __cnfn convert_uchar2(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat(float2);
+short2 __ovld __cnfn convert_short2_rte(char2);
+short2 __ovld __cnfn convert_short2_sat_rte(char2);
+short2 __ovld __cnfn convert_short2_rtz(char2);
+short2 __ovld __cnfn convert_short2_sat_rtz(char2);
+short2 __ovld __cnfn convert_short2_rtp(char2);
+short2 __ovld __cnfn convert_short2_sat_rtp(char2);
+short2 __ovld __cnfn convert_short2_rtn(char2);
+short2 __ovld __cnfn convert_short2_sat_rtn(char2);
+short2 __ovld __cnfn convert_short2(char2);
+short2 __ovld __cnfn convert_short2_sat(char2);
+short2 __ovld __cnfn convert_short2_rte(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rte(uchar2);
+short2 __ovld __cnfn convert_short2_rtz(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtz(uchar2);
+short2 __ovld __cnfn convert_short2_rtp(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtp(uchar2);
+short2 __ovld __cnfn convert_short2_rtn(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtn(uchar2);
+short2 __ovld __cnfn convert_short2(uchar2);
+short2 __ovld __cnfn convert_short2_sat(uchar2);
+short2 __ovld __cnfn convert_short2_rte(short2);
+short2 __ovld __cnfn convert_short2_sat_rte(short2);
+short2 __ovld __cnfn convert_short2_rtz(short2);
+short2 __ovld __cnfn convert_short2_sat_rtz(short2);
+short2 __ovld __cnfn convert_short2_rtp(short2);
+short2 __ovld __cnfn convert_short2_sat_rtp(short2);
+short2 __ovld __cnfn convert_short2_rtn(short2);
+short2 __ovld __cnfn convert_short2_sat_rtn(short2);
+short2 __ovld __cnfn convert_short2(short2);
+short2 __ovld __cnfn convert_short2_sat(short2);
+short2 __ovld __cnfn convert_short2_rte(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rte(ushort2);
+short2 __ovld __cnfn convert_short2_rtz(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtz(ushort2);
+short2 __ovld __cnfn convert_short2_rtp(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtp(ushort2);
+short2 __ovld __cnfn convert_short2_rtn(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtn(ushort2);
+short2 __ovld __cnfn convert_short2(ushort2);
+short2 __ovld __cnfn convert_short2_sat(ushort2);
+short2 __ovld __cnfn convert_short2_rte(int2);
+short2 __ovld __cnfn convert_short2_sat_rte(int2);
+short2 __ovld __cnfn convert_short2_rtz(int2);
+short2 __ovld __cnfn convert_short2_sat_rtz(int2);
+short2 __ovld __cnfn convert_short2_rtp(int2);
+short2 __ovld __cnfn convert_short2_sat_rtp(int2);
+short2 __ovld __cnfn convert_short2_rtn(int2);
+short2 __ovld __cnfn convert_short2_sat_rtn(int2);
+short2 __ovld __cnfn convert_short2(int2);
+short2 __ovld __cnfn convert_short2_sat(int2);
+short2 __ovld __cnfn convert_short2_rte(uint2);
+short2 __ovld __cnfn convert_short2_sat_rte(uint2);
+short2 __ovld __cnfn convert_short2_rtz(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtz(uint2);
+short2 __ovld __cnfn convert_short2_rtp(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtp(uint2);
+short2 __ovld __cnfn convert_short2_rtn(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtn(uint2);
+short2 __ovld __cnfn convert_short2(uint2);
+short2 __ovld __cnfn convert_short2_sat(uint2);
+short2 __ovld __cnfn convert_short2_rte(long2);
+short2 __ovld __cnfn convert_short2_sat_rte(long2);
+short2 __ovld __cnfn convert_short2_rtz(long2);
+short2 __ovld __cnfn convert_short2_sat_rtz(long2);
+short2 __ovld __cnfn convert_short2_rtp(long2);
+short2 __ovld __cnfn convert_short2_sat_rtp(long2);
+short2 __ovld __cnfn convert_short2_rtn(long2);
+short2 __ovld __cnfn convert_short2_sat_rtn(long2);
+short2 __ovld __cnfn convert_short2(long2);
+short2 __ovld __cnfn convert_short2_sat(long2);
+short2 __ovld __cnfn convert_short2_rte(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rte(ulong2);
+short2 __ovld __cnfn convert_short2_rtz(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtz(ulong2);
+short2 __ovld __cnfn convert_short2_rtp(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtp(ulong2);
+short2 __ovld __cnfn convert_short2_rtn(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtn(ulong2);
+short2 __ovld __cnfn convert_short2(ulong2);
+short2 __ovld __cnfn convert_short2_sat(ulong2);
+short2 __ovld __cnfn convert_short2_rte(float2);
+short2 __ovld __cnfn convert_short2_sat_rte(float2);
+short2 __ovld __cnfn convert_short2_rtz(float2);
+short2 __ovld __cnfn convert_short2_sat_rtz(float2);
+short2 __ovld __cnfn convert_short2_rtp(float2);
+short2 __ovld __cnfn convert_short2_sat_rtp(float2);
+short2 __ovld __cnfn convert_short2_rtn(float2);
+short2 __ovld __cnfn convert_short2_sat_rtn(float2);
+short2 __ovld __cnfn convert_short2(float2);
+short2 __ovld __cnfn convert_short2_sat(float2);
+ushort2 __ovld __cnfn convert_ushort2_rte(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(char2);
+ushort2 __ovld __cnfn convert_ushort2(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat(char2);
+ushort2 __ovld __cnfn convert_ushort2_rte(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uchar2);
+ushort2 __ovld __cnfn convert_ushort2(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rte(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(short2);
+ushort2 __ovld __cnfn convert_ushort2(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat(short2);
+ushort2 __ovld __cnfn convert_ushort2_rte(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ushort2);
+ushort2 __ovld __cnfn convert_ushort2(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rte(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(int2);
+ushort2 __ovld __cnfn convert_ushort2(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat(int2);
+ushort2 __ovld __cnfn convert_ushort2_rte(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uint2);
+ushort2 __ovld __cnfn convert_ushort2(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rte(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(long2);
+ushort2 __ovld __cnfn convert_ushort2(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat(long2);
+ushort2 __ovld __cnfn convert_ushort2_rte(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ulong2);
+ushort2 __ovld __cnfn convert_ushort2(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rte(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(float2);
+ushort2 __ovld __cnfn convert_ushort2(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat(float2);
+int2 __ovld __cnfn convert_int2_rte(char2);
+int2 __ovld __cnfn convert_int2_sat_rte(char2);
+int2 __ovld __cnfn convert_int2_rtz(char2);
+int2 __ovld __cnfn convert_int2_sat_rtz(char2);
+int2 __ovld __cnfn convert_int2_rtp(char2);
+int2 __ovld __cnfn convert_int2_sat_rtp(char2);
+int2 __ovld __cnfn convert_int2_rtn(char2);
+int2 __ovld __cnfn convert_int2_sat_rtn(char2);
+int2 __ovld __cnfn convert_int2(char2);
+int2 __ovld __cnfn convert_int2_sat(char2);
+int2 __ovld __cnfn convert_int2_rte(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rte(uchar2);
+int2 __ovld __cnfn convert_int2_rtz(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtz(uchar2);
+int2 __ovld __cnfn convert_int2_rtp(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtp(uchar2);
+int2 __ovld __cnfn convert_int2_rtn(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtn(uchar2);
+int2 __ovld __cnfn convert_int2(uchar2);
+int2 __ovld __cnfn convert_int2_sat(uchar2);
+int2 __ovld __cnfn convert_int2_rte(short2);
+int2 __ovld __cnfn convert_int2_sat_rte(short2);
+int2 __ovld __cnfn convert_int2_rtz(short2);
+int2 __ovld __cnfn convert_int2_sat_rtz(short2);
+int2 __ovld __cnfn convert_int2_rtp(short2);
+int2 __ovld __cnfn convert_int2_sat_rtp(short2);
+int2 __ovld __cnfn convert_int2_rtn(short2);
+int2 __ovld __cnfn convert_int2_sat_rtn(short2);
+int2 __ovld __cnfn convert_int2(short2);
+int2 __ovld __cnfn convert_int2_sat(short2);
+int2 __ovld __cnfn convert_int2_rte(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rte(ushort2);
+int2 __ovld __cnfn convert_int2_rtz(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtz(ushort2);
+int2 __ovld __cnfn convert_int2_rtp(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtp(ushort2);
+int2 __ovld __cnfn convert_int2_rtn(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtn(ushort2);
+int2 __ovld __cnfn convert_int2(ushort2);
+int2 __ovld __cnfn convert_int2_sat(ushort2);
+int2 __ovld __cnfn convert_int2_rte(int2);
+int2 __ovld __cnfn convert_int2_sat_rte(int2);
+int2 __ovld __cnfn convert_int2_rtz(int2);
+int2 __ovld __cnfn convert_int2_sat_rtz(int2);
+int2 __ovld __cnfn convert_int2_rtp(int2);
+int2 __ovld __cnfn convert_int2_sat_rtp(int2);
+int2 __ovld __cnfn convert_int2_rtn(int2);
+int2 __ovld __cnfn convert_int2_sat_rtn(int2);
+int2 __ovld __cnfn convert_int2(int2);
+int2 __ovld __cnfn convert_int2_sat(int2);
+int2 __ovld __cnfn convert_int2_rte(uint2);
+int2 __ovld __cnfn convert_int2_sat_rte(uint2);
+int2 __ovld __cnfn convert_int2_rtz(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtz(uint2);
+int2 __ovld __cnfn convert_int2_rtp(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtp(uint2);
+int2 __ovld __cnfn convert_int2_rtn(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtn(uint2);
+int2 __ovld __cnfn convert_int2(uint2);
+int2 __ovld __cnfn convert_int2_sat(uint2);
+int2 __ovld __cnfn convert_int2_rte(long2);
+int2 __ovld __cnfn convert_int2_sat_rte(long2);
+int2 __ovld __cnfn convert_int2_rtz(long2);
+int2 __ovld __cnfn convert_int2_sat_rtz(long2);
+int2 __ovld __cnfn convert_int2_rtp(long2);
+int2 __ovld __cnfn convert_int2_sat_rtp(long2);
+int2 __ovld __cnfn convert_int2_rtn(long2);
+int2 __ovld __cnfn convert_int2_sat_rtn(long2);
+int2 __ovld __cnfn convert_int2(long2);
+int2 __ovld __cnfn convert_int2_sat(long2);
+int2 __ovld __cnfn convert_int2_rte(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rte(ulong2);
+int2 __ovld __cnfn convert_int2_rtz(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtz(ulong2);
+int2 __ovld __cnfn convert_int2_rtp(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtp(ulong2);
+int2 __ovld __cnfn convert_int2_rtn(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtn(ulong2);
+int2 __ovld __cnfn convert_int2(ulong2);
+int2 __ovld __cnfn convert_int2_sat(ulong2);
+int2 __ovld __cnfn convert_int2_rte(float2);
+int2 __ovld __cnfn convert_int2_sat_rte(float2);
+int2 __ovld __cnfn convert_int2_rtz(float2);
+int2 __ovld __cnfn convert_int2_sat_rtz(float2);
+int2 __ovld __cnfn convert_int2_rtp(float2);
+int2 __ovld __cnfn convert_int2_sat_rtp(float2);
+int2 __ovld __cnfn convert_int2_rtn(float2);
+int2 __ovld __cnfn convert_int2_sat_rtn(float2);
+int2 __ovld __cnfn convert_int2(float2);
+int2 __ovld __cnfn convert_int2_sat(float2);
+uint2 __ovld __cnfn convert_uint2_rte(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(char2);
+uint2 __ovld __cnfn convert_uint2_rtz(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(char2);
+uint2 __ovld __cnfn convert_uint2_rtp(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(char2);
+uint2 __ovld __cnfn convert_uint2_rtn(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(char2);
+uint2 __ovld __cnfn convert_uint2(char2);
+uint2 __ovld __cnfn convert_uint2_sat(char2);
+uint2 __ovld __cnfn convert_uint2_rte(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtz(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtp(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtn(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(uchar2);
+uint2 __ovld __cnfn convert_uint2(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat(uchar2);
+uint2 __ovld __cnfn convert_uint2_rte(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(short2);
+uint2 __ovld __cnfn convert_uint2_rtz(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(short2);
+uint2 __ovld __cnfn convert_uint2_rtp(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(short2);
+uint2 __ovld __cnfn convert_uint2_rtn(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(short2);
+uint2 __ovld __cnfn convert_uint2(short2);
+uint2 __ovld __cnfn convert_uint2_sat(short2);
+uint2 __ovld __cnfn convert_uint2_rte(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtz(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtp(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtn(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(ushort2);
+uint2 __ovld __cnfn convert_uint2(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat(ushort2);
+uint2 __ovld __cnfn convert_uint2_rte(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(int2);
+uint2 __ovld __cnfn convert_uint2_rtz(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(int2);
+uint2 __ovld __cnfn convert_uint2_rtp(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(int2);
+uint2 __ovld __cnfn convert_uint2_rtn(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(int2);
+uint2 __ovld __cnfn convert_uint2(int2);
+uint2 __ovld __cnfn convert_uint2_sat(int2);
+uint2 __ovld __cnfn convert_uint2_rte(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(uint2);
+uint2 __ovld __cnfn convert_uint2_rtz(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(uint2);
+uint2 __ovld __cnfn convert_uint2_rtp(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(uint2);
+uint2 __ovld __cnfn convert_uint2_rtn(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(uint2);
+uint2 __ovld __cnfn convert_uint2(uint2);
+uint2 __ovld __cnfn convert_uint2_sat(uint2);
+uint2 __ovld __cnfn convert_uint2_rte(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(long2);
+uint2 __ovld __cnfn convert_uint2_rtz(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(long2);
+uint2 __ovld __cnfn convert_uint2_rtp(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(long2);
+uint2 __ovld __cnfn convert_uint2_rtn(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(long2);
+uint2 __ovld __cnfn convert_uint2(long2);
+uint2 __ovld __cnfn convert_uint2_sat(long2);
+uint2 __ovld __cnfn convert_uint2_rte(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtz(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtp(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtn(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(ulong2);
+uint2 __ovld __cnfn convert_uint2(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat(ulong2);
+uint2 __ovld __cnfn convert_uint2_rte(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(float2);
+uint2 __ovld __cnfn convert_uint2_rtz(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(float2);
+uint2 __ovld __cnfn convert_uint2_rtp(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(float2);
+uint2 __ovld __cnfn convert_uint2_rtn(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(float2);
+uint2 __ovld __cnfn convert_uint2(float2);
+uint2 __ovld __cnfn convert_uint2_sat(float2);
+long2 __ovld __cnfn convert_long2_rte(char2);
+long2 __ovld __cnfn convert_long2_sat_rte(char2);
+long2 __ovld __cnfn convert_long2_rtz(char2);
+long2 __ovld __cnfn convert_long2_sat_rtz(char2);
+long2 __ovld __cnfn convert_long2_rtp(char2);
+long2 __ovld __cnfn convert_long2_sat_rtp(char2);
+long2 __ovld __cnfn convert_long2_rtn(char2);
+long2 __ovld __cnfn convert_long2_sat_rtn(char2);
+long2 __ovld __cnfn convert_long2(char2);
+long2 __ovld __cnfn convert_long2_sat(char2);
+long2 __ovld __cnfn convert_long2_rte(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rte(uchar2);
+long2 __ovld __cnfn convert_long2_rtz(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtz(uchar2);
+long2 __ovld __cnfn convert_long2_rtp(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtp(uchar2);
+long2 __ovld __cnfn convert_long2_rtn(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtn(uchar2);
+long2 __ovld __cnfn convert_long2(uchar2);
+long2 __ovld __cnfn convert_long2_sat(uchar2);
+long2 __ovld __cnfn convert_long2_rte(short2);
+long2 __ovld __cnfn convert_long2_sat_rte(short2);
+long2 __ovld __cnfn convert_long2_rtz(short2);
+long2 __ovld __cnfn convert_long2_sat_rtz(short2);
+long2 __ovld __cnfn convert_long2_rtp(short2);
+long2 __ovld __cnfn convert_long2_sat_rtp(short2);
+long2 __ovld __cnfn convert_long2_rtn(short2);
+long2 __ovld __cnfn convert_long2_sat_rtn(short2);
+long2 __ovld __cnfn convert_long2(short2);
+long2 __ovld __cnfn convert_long2_sat(short2);
+long2 __ovld __cnfn convert_long2_rte(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rte(ushort2);
+long2 __ovld __cnfn convert_long2_rtz(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtz(ushort2);
+long2 __ovld __cnfn convert_long2_rtp(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtp(ushort2);
+long2 __ovld __cnfn convert_long2_rtn(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtn(ushort2);
+long2 __ovld __cnfn convert_long2(ushort2);
+long2 __ovld __cnfn convert_long2_sat(ushort2);
+long2 __ovld __cnfn convert_long2_rte(int2);
+long2 __ovld __cnfn convert_long2_sat_rte(int2);
+long2 __ovld __cnfn convert_long2_rtz(int2);
+long2 __ovld __cnfn convert_long2_sat_rtz(int2);
+long2 __ovld __cnfn convert_long2_rtp(int2);
+long2 __ovld __cnfn convert_long2_sat_rtp(int2);
+long2 __ovld __cnfn convert_long2_rtn(int2);
+long2 __ovld __cnfn convert_long2_sat_rtn(int2);
+long2 __ovld __cnfn convert_long2(int2);
+long2 __ovld __cnfn convert_long2_sat(int2);
+long2 __ovld __cnfn convert_long2_rte(uint2);
+long2 __ovld __cnfn convert_long2_sat_rte(uint2);
+long2 __ovld __cnfn convert_long2_rtz(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtz(uint2);
+long2 __ovld __cnfn convert_long2_rtp(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtp(uint2);
+long2 __ovld __cnfn convert_long2_rtn(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtn(uint2);
+long2 __ovld __cnfn convert_long2(uint2);
+long2 __ovld __cnfn convert_long2_sat(uint2);
+long2 __ovld __cnfn convert_long2_rte(long2);
+long2 __ovld __cnfn convert_long2_sat_rte(long2);
+long2 __ovld __cnfn convert_long2_rtz(long2);
+long2 __ovld __cnfn convert_long2_sat_rtz(long2);
+long2 __ovld __cnfn convert_long2_rtp(long2);
+long2 __ovld __cnfn convert_long2_sat_rtp(long2);
+long2 __ovld __cnfn convert_long2_rtn(long2);
+long2 __ovld __cnfn convert_long2_sat_rtn(long2);
+long2 __ovld __cnfn convert_long2(long2);
+long2 __ovld __cnfn convert_long2_sat(long2);
+long2 __ovld __cnfn convert_long2_rte(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rte(ulong2);
+long2 __ovld __cnfn convert_long2_rtz(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtz(ulong2);
+long2 __ovld __cnfn convert_long2_rtp(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtp(ulong2);
+long2 __ovld __cnfn convert_long2_rtn(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtn(ulong2);
+long2 __ovld __cnfn convert_long2(ulong2);
+long2 __ovld __cnfn convert_long2_sat(ulong2);
+long2 __ovld __cnfn convert_long2_rte(float2);
+long2 __ovld __cnfn convert_long2_sat_rte(float2);
+long2 __ovld __cnfn convert_long2_rtz(float2);
+long2 __ovld __cnfn convert_long2_sat_rtz(float2);
+long2 __ovld __cnfn convert_long2_rtp(float2);
+long2 __ovld __cnfn convert_long2_sat_rtp(float2);
+long2 __ovld __cnfn convert_long2_rtn(float2);
+long2 __ovld __cnfn convert_long2_sat_rtn(float2);
+long2 __ovld __cnfn convert_long2(float2);
+long2 __ovld __cnfn convert_long2_sat(float2);
+ulong2 __ovld __cnfn convert_ulong2_rte(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(char2);
+ulong2 __ovld __cnfn convert_ulong2(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat(char2);
+ulong2 __ovld __cnfn convert_ulong2_rte(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uchar2);
+ulong2 __ovld __cnfn convert_ulong2(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rte(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(short2);
+ulong2 __ovld __cnfn convert_ulong2(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat(short2);
+ulong2 __ovld __cnfn convert_ulong2_rte(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ushort2);
+ulong2 __ovld __cnfn convert_ulong2(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rte(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(int2);
+ulong2 __ovld __cnfn convert_ulong2(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat(int2);
+ulong2 __ovld __cnfn convert_ulong2_rte(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uint2);
+ulong2 __ovld __cnfn convert_ulong2(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rte(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(long2);
+ulong2 __ovld __cnfn convert_ulong2(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat(long2);
+ulong2 __ovld __cnfn convert_ulong2_rte(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ulong2);
+ulong2 __ovld __cnfn convert_ulong2(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rte(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(float2);
+ulong2 __ovld __cnfn convert_ulong2(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat(float2);
+float2 __ovld __cnfn convert_float2_rte(char2);
+float2 __ovld __cnfn convert_float2_rtz(char2);
+float2 __ovld __cnfn convert_float2_rtp(char2);
+float2 __ovld __cnfn convert_float2_rtn(char2);
+float2 __ovld __cnfn convert_float2(char2);
+float2 __ovld __cnfn convert_float2_rte(uchar2);
+float2 __ovld __cnfn convert_float2_rtz(uchar2);
+float2 __ovld __cnfn convert_float2_rtp(uchar2);
+float2 __ovld __cnfn convert_float2_rtn(uchar2);
+float2 __ovld __cnfn convert_float2(uchar2);
+float2 __ovld __cnfn convert_float2_rte(short2);
+float2 __ovld __cnfn convert_float2_rtz(short2);
+float2 __ovld __cnfn convert_float2_rtp(short2);
+float2 __ovld __cnfn convert_float2_rtn(short2);
+float2 __ovld __cnfn convert_float2(short2);
+float2 __ovld __cnfn convert_float2_rte(ushort2);
+float2 __ovld __cnfn convert_float2_rtz(ushort2);
+float2 __ovld __cnfn convert_float2_rtp(ushort2);
+float2 __ovld __cnfn convert_float2_rtn(ushort2);
+float2 __ovld __cnfn convert_float2(ushort2);
+float2 __ovld __cnfn convert_float2_rte(int2);
+float2 __ovld __cnfn convert_float2_rtz(int2);
+float2 __ovld __cnfn convert_float2_rtp(int2);
+float2 __ovld __cnfn convert_float2_rtn(int2);
+float2 __ovld __cnfn convert_float2(int2);
+float2 __ovld __cnfn convert_float2_rte(uint2);
+float2 __ovld __cnfn convert_float2_rtz(uint2);
+float2 __ovld __cnfn convert_float2_rtp(uint2);
+float2 __ovld __cnfn convert_float2_rtn(uint2);
+float2 __ovld __cnfn convert_float2(uint2);
+float2 __ovld __cnfn convert_float2_rte(long2);
+float2 __ovld __cnfn convert_float2_rtz(long2);
+float2 __ovld __cnfn convert_float2_rtp(long2);
+float2 __ovld __cnfn convert_float2_rtn(long2);
+float2 __ovld __cnfn convert_float2(long2);
+float2 __ovld __cnfn convert_float2_rte(ulong2);
+float2 __ovld __cnfn convert_float2_rtz(ulong2);
+float2 __ovld __cnfn convert_float2_rtp(ulong2);
+float2 __ovld __cnfn convert_float2_rtn(ulong2);
+float2 __ovld __cnfn convert_float2(ulong2);
+float2 __ovld __cnfn convert_float2_rte(float2);
+float2 __ovld __cnfn convert_float2_rtz(float2);
+float2 __ovld __cnfn convert_float2_rtp(float2);
+float2 __ovld __cnfn convert_float2_rtn(float2);
+float2 __ovld __cnfn convert_float2(float2);
+char3 __ovld __cnfn convert_char3_rte(char3);
+char3 __ovld __cnfn convert_char3_sat_rte(char3);
+char3 __ovld __cnfn convert_char3_rtz(char3);
+char3 __ovld __cnfn convert_char3_sat_rtz(char3);
+char3 __ovld __cnfn convert_char3_rtp(char3);
+char3 __ovld __cnfn convert_char3_sat_rtp(char3);
+char3 __ovld __cnfn convert_char3_rtn(char3);
+char3 __ovld __cnfn convert_char3_sat_rtn(char3);
+char3 __ovld __cnfn convert_char3(char3);
+char3 __ovld __cnfn convert_char3_sat(char3);
+char3 __ovld __cnfn convert_char3_rte(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rte(uchar3);
+char3 __ovld __cnfn convert_char3_rtz(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtz(uchar3);
+char3 __ovld __cnfn convert_char3_rtp(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtp(uchar3);
+char3 __ovld __cnfn convert_char3_rtn(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtn(uchar3);
+char3 __ovld __cnfn convert_char3(uchar3);
+char3 __ovld __cnfn convert_char3_sat(uchar3);
+char3 __ovld __cnfn convert_char3_rte(short3);
+char3 __ovld __cnfn convert_char3_sat_rte(short3);
+char3 __ovld __cnfn convert_char3_rtz(short3);
+char3 __ovld __cnfn convert_char3_sat_rtz(short3);
+char3 __ovld __cnfn convert_char3_rtp(short3);
+char3 __ovld __cnfn convert_char3_sat_rtp(short3);
+char3 __ovld __cnfn convert_char3_rtn(short3);
+char3 __ovld __cnfn convert_char3_sat_rtn(short3);
+char3 __ovld __cnfn convert_char3(short3);
+char3 __ovld __cnfn convert_char3_sat(short3);
+char3 __ovld __cnfn convert_char3_rte(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rte(ushort3);
+char3 __ovld __cnfn convert_char3_rtz(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtz(ushort3);
+char3 __ovld __cnfn convert_char3_rtp(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtp(ushort3);
+char3 __ovld __cnfn convert_char3_rtn(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtn(ushort3);
+char3 __ovld __cnfn convert_char3(ushort3);
+char3 __ovld __cnfn convert_char3_sat(ushort3);
+char3 __ovld __cnfn convert_char3_rte(int3);
+char3 __ovld __cnfn convert_char3_sat_rte(int3);
+char3 __ovld __cnfn convert_char3_rtz(int3);
+char3 __ovld __cnfn convert_char3_sat_rtz(int3);
+char3 __ovld __cnfn convert_char3_rtp(int3);
+char3 __ovld __cnfn convert_char3_sat_rtp(int3);
+char3 __ovld __cnfn convert_char3_rtn(int3);
+char3 __ovld __cnfn convert_char3_sat_rtn(int3);
+char3 __ovld __cnfn convert_char3(int3);
+char3 __ovld __cnfn convert_char3_sat(int3);
+char3 __ovld __cnfn convert_char3_rte(uint3);
+char3 __ovld __cnfn convert_char3_sat_rte(uint3);
+char3 __ovld __cnfn convert_char3_rtz(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtz(uint3);
+char3 __ovld __cnfn convert_char3_rtp(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtp(uint3);
+char3 __ovld __cnfn convert_char3_rtn(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtn(uint3);
+char3 __ovld __cnfn convert_char3(uint3);
+char3 __ovld __cnfn convert_char3_sat(uint3);
+char3 __ovld __cnfn convert_char3_rte(long3);
+char3 __ovld __cnfn convert_char3_sat_rte(long3);
+char3 __ovld __cnfn convert_char3_rtz(long3);
+char3 __ovld __cnfn convert_char3_sat_rtz(long3);
+char3 __ovld __cnfn convert_char3_rtp(long3);
+char3 __ovld __cnfn convert_char3_sat_rtp(long3);
+char3 __ovld __cnfn convert_char3_rtn(long3);
+char3 __ovld __cnfn convert_char3_sat_rtn(long3);
+char3 __ovld __cnfn convert_char3(long3);
+char3 __ovld __cnfn convert_char3_sat(long3);
+char3 __ovld __cnfn convert_char3_rte(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rte(ulong3);
+char3 __ovld __cnfn convert_char3_rtz(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtz(ulong3);
+char3 __ovld __cnfn convert_char3_rtp(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtp(ulong3);
+char3 __ovld __cnfn convert_char3_rtn(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtn(ulong3);
+char3 __ovld __cnfn convert_char3(ulong3);
+char3 __ovld __cnfn convert_char3_sat(ulong3);
+char3 __ovld __cnfn convert_char3_rte(float3);
+char3 __ovld __cnfn convert_char3_sat_rte(float3);
+char3 __ovld __cnfn convert_char3_rtz(float3);
+char3 __ovld __cnfn convert_char3_sat_rtz(float3);
+char3 __ovld __cnfn convert_char3_rtp(float3);
+char3 __ovld __cnfn convert_char3_sat_rtp(float3);
+char3 __ovld __cnfn convert_char3_rtn(float3);
+char3 __ovld __cnfn convert_char3_sat_rtn(float3);
+char3 __ovld __cnfn convert_char3(float3);
+char3 __ovld __cnfn convert_char3_sat(float3);
+uchar3 __ovld __cnfn convert_uchar3_rte(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(char3);
+uchar3 __ovld __cnfn convert_uchar3(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat(char3);
+uchar3 __ovld __cnfn convert_uchar3_rte(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uchar3);
+uchar3 __ovld __cnfn convert_uchar3(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rte(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(short3);
+uchar3 __ovld __cnfn convert_uchar3(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat(short3);
+uchar3 __ovld __cnfn convert_uchar3_rte(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ushort3);
+uchar3 __ovld __cnfn convert_uchar3(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rte(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(int3);
+uchar3 __ovld __cnfn convert_uchar3(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat(int3);
+uchar3 __ovld __cnfn convert_uchar3_rte(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uint3);
+uchar3 __ovld __cnfn convert_uchar3(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rte(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(long3);
+uchar3 __ovld __cnfn convert_uchar3(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat(long3);
+uchar3 __ovld __cnfn convert_uchar3_rte(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ulong3);
+uchar3 __ovld __cnfn convert_uchar3(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rte(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(float3);
+uchar3 __ovld __cnfn convert_uchar3(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat(float3);
+short3 __ovld __cnfn convert_short3_rte(char3);
+short3 __ovld __cnfn convert_short3_sat_rte(char3);
+short3 __ovld __cnfn convert_short3_rtz(char3);
+short3 __ovld __cnfn convert_short3_sat_rtz(char3);
+short3 __ovld __cnfn convert_short3_rtp(char3);
+short3 __ovld __cnfn convert_short3_sat_rtp(char3);
+short3 __ovld __cnfn convert_short3_rtn(char3);
+short3 __ovld __cnfn convert_short3_sat_rtn(char3);
+short3 __ovld __cnfn convert_short3(char3);
+short3 __ovld __cnfn convert_short3_sat(char3);
+short3 __ovld __cnfn convert_short3_rte(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rte(uchar3);
+short3 __ovld __cnfn convert_short3_rtz(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtz(uchar3);
+short3 __ovld __cnfn convert_short3_rtp(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtp(uchar3);
+short3 __ovld __cnfn convert_short3_rtn(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtn(uchar3);
+short3 __ovld __cnfn convert_short3(uchar3);
+short3 __ovld __cnfn convert_short3_sat(uchar3);
+short3 __ovld __cnfn convert_short3_rte(short3);
+short3 __ovld __cnfn convert_short3_sat_rte(short3);
+short3 __ovld __cnfn convert_short3_rtz(short3);
+short3 __ovld __cnfn convert_short3_sat_rtz(short3);
+short3 __ovld __cnfn convert_short3_rtp(short3);
+short3 __ovld __cnfn convert_short3_sat_rtp(short3);
+short3 __ovld __cnfn convert_short3_rtn(short3);
+short3 __ovld __cnfn convert_short3_sat_rtn(short3);
+short3 __ovld __cnfn convert_short3(short3);
+short3 __ovld __cnfn convert_short3_sat(short3);
+short3 __ovld __cnfn convert_short3_rte(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rte(ushort3);
+short3 __ovld __cnfn convert_short3_rtz(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtz(ushort3);
+short3 __ovld __cnfn convert_short3_rtp(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtp(ushort3);
+short3 __ovld __cnfn convert_short3_rtn(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtn(ushort3);
+short3 __ovld __cnfn convert_short3(ushort3);
+short3 __ovld __cnfn convert_short3_sat(ushort3);
+short3 __ovld __cnfn convert_short3_rte(int3);
+short3 __ovld __cnfn convert_short3_sat_rte(int3);
+short3 __ovld __cnfn convert_short3_rtz(int3);
+short3 __ovld __cnfn convert_short3_sat_rtz(int3);
+short3 __ovld __cnfn convert_short3_rtp(int3);
+short3 __ovld __cnfn convert_short3_sat_rtp(int3);
+short3 __ovld __cnfn convert_short3_rtn(int3);
+short3 __ovld __cnfn convert_short3_sat_rtn(int3);
+short3 __ovld __cnfn convert_short3(int3);
+short3 __ovld __cnfn convert_short3_sat(int3);
+short3 __ovld __cnfn convert_short3_rte(uint3);
+short3 __ovld __cnfn convert_short3_sat_rte(uint3);
+short3 __ovld __cnfn convert_short3_rtz(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtz(uint3);
+short3 __ovld __cnfn convert_short3_rtp(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtp(uint3);
+short3 __ovld __cnfn convert_short3_rtn(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtn(uint3);
+short3 __ovld __cnfn convert_short3(uint3);
+short3 __ovld __cnfn convert_short3_sat(uint3);
+short3 __ovld __cnfn convert_short3_rte(long3);
+short3 __ovld __cnfn convert_short3_sat_rte(long3);
+short3 __ovld __cnfn convert_short3_rtz(long3);
+short3 __ovld __cnfn convert_short3_sat_rtz(long3);
+short3 __ovld __cnfn convert_short3_rtp(long3);
+short3 __ovld __cnfn convert_short3_sat_rtp(long3);
+short3 __ovld __cnfn convert_short3_rtn(long3);
+short3 __ovld __cnfn convert_short3_sat_rtn(long3);
+short3 __ovld __cnfn convert_short3(long3);
+short3 __ovld __cnfn convert_short3_sat(long3);
+short3 __ovld __cnfn convert_short3_rte(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rte(ulong3);
+short3 __ovld __cnfn convert_short3_rtz(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtz(ulong3);
+short3 __ovld __cnfn convert_short3_rtp(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtp(ulong3);
+short3 __ovld __cnfn convert_short3_rtn(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtn(ulong3);
+short3 __ovld __cnfn convert_short3(ulong3);
+short3 __ovld __cnfn convert_short3_sat(ulong3);
+short3 __ovld __cnfn convert_short3_rte(float3);
+short3 __ovld __cnfn convert_short3_sat_rte(float3);
+short3 __ovld __cnfn convert_short3_rtz(float3);
+short3 __ovld __cnfn convert_short3_sat_rtz(float3);
+short3 __ovld __cnfn convert_short3_rtp(float3);
+short3 __ovld __cnfn convert_short3_sat_rtp(float3);
+short3 __ovld __cnfn convert_short3_rtn(float3);
+short3 __ovld __cnfn convert_short3_sat_rtn(float3);
+short3 __ovld __cnfn convert_short3(float3);
+short3 __ovld __cnfn convert_short3_sat(float3);
+ushort3 __ovld __cnfn convert_ushort3_rte(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(char3);
+ushort3 __ovld __cnfn convert_ushort3(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat(char3);
+ushort3 __ovld __cnfn convert_ushort3_rte(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uchar3);
+ushort3 __ovld __cnfn convert_ushort3(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rte(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(short3);
+ushort3 __ovld __cnfn convert_ushort3(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat(short3);
+ushort3 __ovld __cnfn convert_ushort3_rte(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ushort3);
+ushort3 __ovld __cnfn convert_ushort3(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rte(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(int3);
+ushort3 __ovld __cnfn convert_ushort3(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat(int3);
+ushort3 __ovld __cnfn convert_ushort3_rte(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uint3);
+ushort3 __ovld __cnfn convert_ushort3(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rte(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(long3);
+ushort3 __ovld __cnfn convert_ushort3(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat(long3);
+ushort3 __ovld __cnfn convert_ushort3_rte(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ulong3);
+ushort3 __ovld __cnfn convert_ushort3(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rte(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(float3);
+ushort3 __ovld __cnfn convert_ushort3(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat(float3);
+int3 __ovld __cnfn convert_int3_rte(char3);
+int3 __ovld __cnfn convert_int3_sat_rte(char3);
+int3 __ovld __cnfn convert_int3_rtz(char3);
+int3 __ovld __cnfn convert_int3_sat_rtz(char3);
+int3 __ovld __cnfn convert_int3_rtp(char3);
+int3 __ovld __cnfn convert_int3_sat_rtp(char3);
+int3 __ovld __cnfn convert_int3_rtn(char3);
+int3 __ovld __cnfn convert_int3_sat_rtn(char3);
+int3 __ovld __cnfn convert_int3(char3);
+int3 __ovld __cnfn convert_int3_sat(char3);
+int3 __ovld __cnfn convert_int3_rte(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rte(uchar3);
+int3 __ovld __cnfn convert_int3_rtz(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtz(uchar3);
+int3 __ovld __cnfn convert_int3_rtp(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtp(uchar3);
+int3 __ovld __cnfn convert_int3_rtn(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtn(uchar3);
+int3 __ovld __cnfn convert_int3(uchar3);
+int3 __ovld __cnfn convert_int3_sat(uchar3);
+int3 __ovld __cnfn convert_int3_rte(short3);
+int3 __ovld __cnfn convert_int3_sat_rte(short3);
+int3 __ovld __cnfn convert_int3_rtz(short3);
+int3 __ovld __cnfn convert_int3_sat_rtz(short3);
+int3 __ovld __cnfn convert_int3_rtp(short3);
+int3 __ovld __cnfn convert_int3_sat_rtp(short3);
+int3 __ovld __cnfn convert_int3_rtn(short3);
+int3 __ovld __cnfn convert_int3_sat_rtn(short3);
+int3 __ovld __cnfn convert_int3(short3);
+int3 __ovld __cnfn convert_int3_sat(short3);
+int3 __ovld __cnfn convert_int3_rte(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rte(ushort3);
+int3 __ovld __cnfn convert_int3_rtz(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtz(ushort3);
+int3 __ovld __cnfn convert_int3_rtp(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtp(ushort3);
+int3 __ovld __cnfn convert_int3_rtn(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtn(ushort3);
+int3 __ovld __cnfn convert_int3(ushort3);
+int3 __ovld __cnfn convert_int3_sat(ushort3);
+int3 __ovld __cnfn convert_int3_rte(int3);
+int3 __ovld __cnfn convert_int3_sat_rte(int3);
+int3 __ovld __cnfn convert_int3_rtz(int3);
+int3 __ovld __cnfn convert_int3_sat_rtz(int3);
+int3 __ovld __cnfn convert_int3_rtp(int3);
+int3 __ovld __cnfn convert_int3_sat_rtp(int3);
+int3 __ovld __cnfn convert_int3_rtn(int3);
+int3 __ovld __cnfn convert_int3_sat_rtn(int3);
+int3 __ovld __cnfn convert_int3(int3);
+int3 __ovld __cnfn convert_int3_sat(int3);
+int3 __ovld __cnfn convert_int3_rte(uint3);
+int3 __ovld __cnfn convert_int3_sat_rte(uint3);
+int3 __ovld __cnfn convert_int3_rtz(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtz(uint3);
+int3 __ovld __cnfn convert_int3_rtp(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtp(uint3);
+int3 __ovld __cnfn convert_int3_rtn(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtn(uint3);
+int3 __ovld __cnfn convert_int3(uint3);
+int3 __ovld __cnfn convert_int3_sat(uint3);
+int3 __ovld __cnfn convert_int3_rte(long3);
+int3 __ovld __cnfn convert_int3_sat_rte(long3);
+int3 __ovld __cnfn convert_int3_rtz(long3);
+int3 __ovld __cnfn convert_int3_sat_rtz(long3);
+int3 __ovld __cnfn convert_int3_rtp(long3);
+int3 __ovld __cnfn convert_int3_sat_rtp(long3);
+int3 __ovld __cnfn convert_int3_rtn(long3);
+int3 __ovld __cnfn convert_int3_sat_rtn(long3);
+int3 __ovld __cnfn convert_int3(long3);
+int3 __ovld __cnfn convert_int3_sat(long3);
+int3 __ovld __cnfn convert_int3_rte(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rte(ulong3);
+int3 __ovld __cnfn convert_int3_rtz(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtz(ulong3);
+int3 __ovld __cnfn convert_int3_rtp(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtp(ulong3);
+int3 __ovld __cnfn convert_int3_rtn(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtn(ulong3);
+int3 __ovld __cnfn convert_int3(ulong3);
+int3 __ovld __cnfn convert_int3_sat(ulong3);
+int3 __ovld __cnfn convert_int3_rte(float3);
+int3 __ovld __cnfn convert_int3_sat_rte(float3);
+int3 __ovld __cnfn convert_int3_rtz(float3);
+int3 __ovld __cnfn convert_int3_sat_rtz(float3);
+int3 __ovld __cnfn convert_int3_rtp(float3);
+int3 __ovld __cnfn convert_int3_sat_rtp(float3);
+int3 __ovld __cnfn convert_int3_rtn(float3);
+int3 __ovld __cnfn convert_int3_sat_rtn(float3);
+int3 __ovld __cnfn convert_int3(float3);
+int3 __ovld __cnfn convert_int3_sat(float3);
+uint3 __ovld __cnfn convert_uint3_rte(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(char3);
+uint3 __ovld __cnfn convert_uint3_rtz(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(char3);
+uint3 __ovld __cnfn convert_uint3_rtp(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(char3);
+uint3 __ovld __cnfn convert_uint3_rtn(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(char3);
+uint3 __ovld __cnfn convert_uint3(char3);
+uint3 __ovld __cnfn convert_uint3_sat(char3);
+uint3 __ovld __cnfn convert_uint3_rte(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtz(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtp(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtn(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(uchar3);
+uint3 __ovld __cnfn convert_uint3(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat(uchar3);
+uint3 __ovld __cnfn convert_uint3_rte(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(short3);
+uint3 __ovld __cnfn convert_uint3_rtz(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(short3);
+uint3 __ovld __cnfn convert_uint3_rtp(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(short3);
+uint3 __ovld __cnfn convert_uint3_rtn(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(short3);
+uint3 __ovld __cnfn convert_uint3(short3);
+uint3 __ovld __cnfn convert_uint3_sat(short3);
+uint3 __ovld __cnfn convert_uint3_rte(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtz(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtp(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtn(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(ushort3);
+uint3 __ovld __cnfn convert_uint3(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat(ushort3);
+uint3 __ovld __cnfn convert_uint3_rte(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(int3);
+uint3 __ovld __cnfn convert_uint3_rtz(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(int3);
+uint3 __ovld __cnfn convert_uint3_rtp(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(int3);
+uint3 __ovld __cnfn convert_uint3_rtn(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(int3);
+uint3 __ovld __cnfn convert_uint3(int3);
+uint3 __ovld __cnfn convert_uint3_sat(int3);
+uint3 __ovld __cnfn convert_uint3_rte(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(uint3);
+uint3 __ovld __cnfn convert_uint3_rtz(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(uint3);
+uint3 __ovld __cnfn convert_uint3_rtp(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(uint3);
+uint3 __ovld __cnfn convert_uint3_rtn(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(uint3);
+uint3 __ovld __cnfn convert_uint3(uint3);
+uint3 __ovld __cnfn convert_uint3_sat(uint3);
+uint3 __ovld __cnfn convert_uint3_rte(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(long3);
+uint3 __ovld __cnfn convert_uint3_rtz(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(long3);
+uint3 __ovld __cnfn convert_uint3_rtp(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(long3);
+uint3 __ovld __cnfn convert_uint3_rtn(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(long3);
+uint3 __ovld __cnfn convert_uint3(long3);
+uint3 __ovld __cnfn convert_uint3_sat(long3);
+uint3 __ovld __cnfn convert_uint3_rte(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtz(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtp(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtn(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(ulong3);
+uint3 __ovld __cnfn convert_uint3(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat(ulong3);
+uint3 __ovld __cnfn convert_uint3_rte(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(float3);
+uint3 __ovld __cnfn convert_uint3_rtz(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(float3);
+uint3 __ovld __cnfn convert_uint3_rtp(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(float3);
+uint3 __ovld __cnfn convert_uint3_rtn(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(float3);
+uint3 __ovld __cnfn convert_uint3(float3);
+uint3 __ovld __cnfn convert_uint3_sat(float3);
+long3 __ovld __cnfn convert_long3_rte(char3);
+long3 __ovld __cnfn convert_long3_sat_rte(char3);
+long3 __ovld __cnfn convert_long3_rtz(char3);
+long3 __ovld __cnfn convert_long3_sat_rtz(char3);
+long3 __ovld __cnfn convert_long3_rtp(char3);
+long3 __ovld __cnfn convert_long3_sat_rtp(char3);
+long3 __ovld __cnfn convert_long3_rtn(char3);
+long3 __ovld __cnfn convert_long3_sat_rtn(char3);
+long3 __ovld __cnfn convert_long3(char3);
+long3 __ovld __cnfn convert_long3_sat(char3);
+long3 __ovld __cnfn convert_long3_rte(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rte(uchar3);
+long3 __ovld __cnfn convert_long3_rtz(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtz(uchar3);
+long3 __ovld __cnfn convert_long3_rtp(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtp(uchar3);
+long3 __ovld __cnfn convert_long3_rtn(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtn(uchar3);
+long3 __ovld __cnfn convert_long3(uchar3);
+long3 __ovld __cnfn convert_long3_sat(uchar3);
+long3 __ovld __cnfn convert_long3_rte(short3);
+long3 __ovld __cnfn convert_long3_sat_rte(short3);
+long3 __ovld __cnfn convert_long3_rtz(short3);
+long3 __ovld __cnfn convert_long3_sat_rtz(short3);
+long3 __ovld __cnfn convert_long3_rtp(short3);
+long3 __ovld __cnfn convert_long3_sat_rtp(short3);
+long3 __ovld __cnfn convert_long3_rtn(short3);
+long3 __ovld __cnfn convert_long3_sat_rtn(short3);
+long3 __ovld __cnfn convert_long3(short3);
+long3 __ovld __cnfn convert_long3_sat(short3);
+long3 __ovld __cnfn convert_long3_rte(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rte(ushort3);
+long3 __ovld __cnfn convert_long3_rtz(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtz(ushort3);
+long3 __ovld __cnfn convert_long3_rtp(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtp(ushort3);
+long3 __ovld __cnfn convert_long3_rtn(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtn(ushort3);
+long3 __ovld __cnfn convert_long3(ushort3);
+long3 __ovld __cnfn convert_long3_sat(ushort3);
+long3 __ovld __cnfn convert_long3_rte(int3);
+long3 __ovld __cnfn convert_long3_sat_rte(int3);
+long3 __ovld __cnfn convert_long3_rtz(int3);
+long3 __ovld __cnfn convert_long3_sat_rtz(int3);
+long3 __ovld __cnfn convert_long3_rtp(int3);
+long3 __ovld __cnfn convert_long3_sat_rtp(int3);
+long3 __ovld __cnfn convert_long3_rtn(int3);
+long3 __ovld __cnfn convert_long3_sat_rtn(int3);
+long3 __ovld __cnfn convert_long3(int3);
+long3 __ovld __cnfn convert_long3_sat(int3);
+long3 __ovld __cnfn convert_long3_rte(uint3);
+long3 __ovld __cnfn convert_long3_sat_rte(uint3);
+long3 __ovld __cnfn convert_long3_rtz(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtz(uint3);
+long3 __ovld __cnfn convert_long3_rtp(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtp(uint3);
+long3 __ovld __cnfn convert_long3_rtn(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtn(uint3);
+long3 __ovld __cnfn convert_long3(uint3);
+long3 __ovld __cnfn convert_long3_sat(uint3);
+long3 __ovld __cnfn convert_long3_rte(long3);
+long3 __ovld __cnfn convert_long3_sat_rte(long3);
+long3 __ovld __cnfn convert_long3_rtz(long3);
+long3 __ovld __cnfn convert_long3_sat_rtz(long3);
+long3 __ovld __cnfn convert_long3_rtp(long3);
+long3 __ovld __cnfn convert_long3_sat_rtp(long3);
+long3 __ovld __cnfn convert_long3_rtn(long3);
+long3 __ovld __cnfn convert_long3_sat_rtn(long3);
+long3 __ovld __cnfn convert_long3(long3);
+long3 __ovld __cnfn convert_long3_sat(long3);
+long3 __ovld __cnfn convert_long3_rte(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rte(ulong3);
+long3 __ovld __cnfn convert_long3_rtz(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtz(ulong3);
+long3 __ovld __cnfn convert_long3_rtp(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtp(ulong3);
+long3 __ovld __cnfn convert_long3_rtn(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtn(ulong3);
+long3 __ovld __cnfn convert_long3(ulong3);
+long3 __ovld __cnfn convert_long3_sat(ulong3);
+long3 __ovld __cnfn convert_long3_rte(float3);
+long3 __ovld __cnfn convert_long3_sat_rte(float3);
+long3 __ovld __cnfn convert_long3_rtz(float3);
+long3 __ovld __cnfn convert_long3_sat_rtz(float3);
+long3 __ovld __cnfn convert_long3_rtp(float3);
+long3 __ovld __cnfn convert_long3_sat_rtp(float3);
+long3 __ovld __cnfn convert_long3_rtn(float3);
+long3 __ovld __cnfn convert_long3_sat_rtn(float3);
+long3 __ovld __cnfn convert_long3(float3);
+long3 __ovld __cnfn convert_long3_sat(float3);
+ulong3 __ovld __cnfn convert_ulong3_rte(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(char3);
+ulong3 __ovld __cnfn convert_ulong3(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat(char3);
+ulong3 __ovld __cnfn convert_ulong3_rte(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uchar3);
+ulong3 __ovld __cnfn convert_ulong3(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rte(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(short3);
+ulong3 __ovld __cnfn convert_ulong3(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat(short3);
+ulong3 __ovld __cnfn convert_ulong3_rte(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ushort3);
+ulong3 __ovld __cnfn convert_ulong3(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rte(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(int3);
+ulong3 __ovld __cnfn convert_ulong3(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat(int3);
+ulong3 __ovld __cnfn convert_ulong3_rte(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uint3);
+ulong3 __ovld __cnfn convert_ulong3(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rte(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(long3);
+ulong3 __ovld __cnfn convert_ulong3(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat(long3);
+ulong3 __ovld __cnfn convert_ulong3_rte(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ulong3);
+ulong3 __ovld __cnfn convert_ulong3(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rte(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(float3);
+ulong3 __ovld __cnfn convert_ulong3(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat(float3);
+float3 __ovld __cnfn convert_float3_rte(char3); // convert_float3[_rte|_rtz|_rtp|_rtn] overloads returning float3: 5 variants per source type (char3, uchar3, short3, ushort3, int3, uint3, long3, ulong3, float3); no _sat variants are declared for this destination. NOTE(review): rounding-suffix semantics presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+float3 __ovld __cnfn convert_float3_rtz(char3);
+float3 __ovld __cnfn convert_float3_rtp(char3);
+float3 __ovld __cnfn convert_float3_rtn(char3);
+float3 __ovld __cnfn convert_float3(char3);
+float3 __ovld __cnfn convert_float3_rte(uchar3);
+float3 __ovld __cnfn convert_float3_rtz(uchar3);
+float3 __ovld __cnfn convert_float3_rtp(uchar3);
+float3 __ovld __cnfn convert_float3_rtn(uchar3);
+float3 __ovld __cnfn convert_float3(uchar3);
+float3 __ovld __cnfn convert_float3_rte(short3);
+float3 __ovld __cnfn convert_float3_rtz(short3);
+float3 __ovld __cnfn convert_float3_rtp(short3);
+float3 __ovld __cnfn convert_float3_rtn(short3);
+float3 __ovld __cnfn convert_float3(short3);
+float3 __ovld __cnfn convert_float3_rte(ushort3);
+float3 __ovld __cnfn convert_float3_rtz(ushort3);
+float3 __ovld __cnfn convert_float3_rtp(ushort3);
+float3 __ovld __cnfn convert_float3_rtn(ushort3);
+float3 __ovld __cnfn convert_float3(ushort3);
+float3 __ovld __cnfn convert_float3_rte(int3);
+float3 __ovld __cnfn convert_float3_rtz(int3);
+float3 __ovld __cnfn convert_float3_rtp(int3);
+float3 __ovld __cnfn convert_float3_rtn(int3);
+float3 __ovld __cnfn convert_float3(int3);
+float3 __ovld __cnfn convert_float3_rte(uint3);
+float3 __ovld __cnfn convert_float3_rtz(uint3);
+float3 __ovld __cnfn convert_float3_rtp(uint3);
+float3 __ovld __cnfn convert_float3_rtn(uint3);
+float3 __ovld __cnfn convert_float3(uint3);
+float3 __ovld __cnfn convert_float3_rte(long3);
+float3 __ovld __cnfn convert_float3_rtz(long3);
+float3 __ovld __cnfn convert_float3_rtp(long3);
+float3 __ovld __cnfn convert_float3_rtn(long3);
+float3 __ovld __cnfn convert_float3(long3);
+float3 __ovld __cnfn convert_float3_rte(ulong3);
+float3 __ovld __cnfn convert_float3_rtz(ulong3);
+float3 __ovld __cnfn convert_float3_rtp(ulong3);
+float3 __ovld __cnfn convert_float3_rtn(ulong3);
+float3 __ovld __cnfn convert_float3(ulong3);
+float3 __ovld __cnfn convert_float3_rte(float3);
+float3 __ovld __cnfn convert_float3_rtz(float3);
+float3 __ovld __cnfn convert_float3_rtp(float3);
+float3 __ovld __cnfn convert_float3_rtn(float3);
+float3 __ovld __cnfn convert_float3(float3);
+char4 __ovld __cnfn convert_char4_rte(char4); // convert_char4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning char4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+char4 __ovld __cnfn convert_char4_sat_rte(char4);
+char4 __ovld __cnfn convert_char4_rtz(char4);
+char4 __ovld __cnfn convert_char4_sat_rtz(char4);
+char4 __ovld __cnfn convert_char4_rtp(char4);
+char4 __ovld __cnfn convert_char4_sat_rtp(char4);
+char4 __ovld __cnfn convert_char4_rtn(char4);
+char4 __ovld __cnfn convert_char4_sat_rtn(char4);
+char4 __ovld __cnfn convert_char4(char4);
+char4 __ovld __cnfn convert_char4_sat(char4);
+char4 __ovld __cnfn convert_char4_rte(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rte(uchar4);
+char4 __ovld __cnfn convert_char4_rtz(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtz(uchar4);
+char4 __ovld __cnfn convert_char4_rtp(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtp(uchar4);
+char4 __ovld __cnfn convert_char4_rtn(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtn(uchar4);
+char4 __ovld __cnfn convert_char4(uchar4);
+char4 __ovld __cnfn convert_char4_sat(uchar4);
+char4 __ovld __cnfn convert_char4_rte(short4);
+char4 __ovld __cnfn convert_char4_sat_rte(short4);
+char4 __ovld __cnfn convert_char4_rtz(short4);
+char4 __ovld __cnfn convert_char4_sat_rtz(short4);
+char4 __ovld __cnfn convert_char4_rtp(short4);
+char4 __ovld __cnfn convert_char4_sat_rtp(short4);
+char4 __ovld __cnfn convert_char4_rtn(short4);
+char4 __ovld __cnfn convert_char4_sat_rtn(short4);
+char4 __ovld __cnfn convert_char4(short4);
+char4 __ovld __cnfn convert_char4_sat(short4);
+char4 __ovld __cnfn convert_char4_rte(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rte(ushort4);
+char4 __ovld __cnfn convert_char4_rtz(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtz(ushort4);
+char4 __ovld __cnfn convert_char4_rtp(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtp(ushort4);
+char4 __ovld __cnfn convert_char4_rtn(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtn(ushort4);
+char4 __ovld __cnfn convert_char4(ushort4);
+char4 __ovld __cnfn convert_char4_sat(ushort4);
+char4 __ovld __cnfn convert_char4_rte(int4);
+char4 __ovld __cnfn convert_char4_sat_rte(int4);
+char4 __ovld __cnfn convert_char4_rtz(int4);
+char4 __ovld __cnfn convert_char4_sat_rtz(int4);
+char4 __ovld __cnfn convert_char4_rtp(int4);
+char4 __ovld __cnfn convert_char4_sat_rtp(int4);
+char4 __ovld __cnfn convert_char4_rtn(int4);
+char4 __ovld __cnfn convert_char4_sat_rtn(int4);
+char4 __ovld __cnfn convert_char4(int4);
+char4 __ovld __cnfn convert_char4_sat(int4);
+char4 __ovld __cnfn convert_char4_rte(uint4);
+char4 __ovld __cnfn convert_char4_sat_rte(uint4);
+char4 __ovld __cnfn convert_char4_rtz(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtz(uint4);
+char4 __ovld __cnfn convert_char4_rtp(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtp(uint4);
+char4 __ovld __cnfn convert_char4_rtn(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtn(uint4);
+char4 __ovld __cnfn convert_char4(uint4);
+char4 __ovld __cnfn convert_char4_sat(uint4);
+char4 __ovld __cnfn convert_char4_rte(long4);
+char4 __ovld __cnfn convert_char4_sat_rte(long4);
+char4 __ovld __cnfn convert_char4_rtz(long4);
+char4 __ovld __cnfn convert_char4_sat_rtz(long4);
+char4 __ovld __cnfn convert_char4_rtp(long4);
+char4 __ovld __cnfn convert_char4_sat_rtp(long4);
+char4 __ovld __cnfn convert_char4_rtn(long4);
+char4 __ovld __cnfn convert_char4_sat_rtn(long4);
+char4 __ovld __cnfn convert_char4(long4);
+char4 __ovld __cnfn convert_char4_sat(long4);
+char4 __ovld __cnfn convert_char4_rte(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rte(ulong4);
+char4 __ovld __cnfn convert_char4_rtz(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtz(ulong4);
+char4 __ovld __cnfn convert_char4_rtp(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtp(ulong4);
+char4 __ovld __cnfn convert_char4_rtn(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtn(ulong4);
+char4 __ovld __cnfn convert_char4(ulong4);
+char4 __ovld __cnfn convert_char4_sat(ulong4);
+char4 __ovld __cnfn convert_char4_rte(float4);
+char4 __ovld __cnfn convert_char4_sat_rte(float4);
+char4 __ovld __cnfn convert_char4_rtz(float4);
+char4 __ovld __cnfn convert_char4_sat_rtz(float4);
+char4 __ovld __cnfn convert_char4_rtp(float4);
+char4 __ovld __cnfn convert_char4_sat_rtp(float4);
+char4 __ovld __cnfn convert_char4_rtn(float4);
+char4 __ovld __cnfn convert_char4_sat_rtn(float4);
+char4 __ovld __cnfn convert_char4(float4);
+char4 __ovld __cnfn convert_char4_sat(float4);
+uchar4 __ovld __cnfn convert_uchar4_rte(char4); // convert_uchar4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning uchar4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(char4);
+uchar4 __ovld __cnfn convert_uchar4(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat(char4);
+uchar4 __ovld __cnfn convert_uchar4_rte(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uchar4);
+uchar4 __ovld __cnfn convert_uchar4(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rte(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(short4);
+uchar4 __ovld __cnfn convert_uchar4(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat(short4);
+uchar4 __ovld __cnfn convert_uchar4_rte(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ushort4);
+uchar4 __ovld __cnfn convert_uchar4(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rte(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(int4);
+uchar4 __ovld __cnfn convert_uchar4(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat(int4);
+uchar4 __ovld __cnfn convert_uchar4_rte(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uint4);
+uchar4 __ovld __cnfn convert_uchar4(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rte(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(long4);
+uchar4 __ovld __cnfn convert_uchar4(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat(long4);
+uchar4 __ovld __cnfn convert_uchar4_rte(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ulong4);
+uchar4 __ovld __cnfn convert_uchar4(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rte(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(float4);
+uchar4 __ovld __cnfn convert_uchar4(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat(float4);
+short4 __ovld __cnfn convert_short4_rte(char4); // convert_short4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning short4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+short4 __ovld __cnfn convert_short4_sat_rte(char4);
+short4 __ovld __cnfn convert_short4_rtz(char4);
+short4 __ovld __cnfn convert_short4_sat_rtz(char4);
+short4 __ovld __cnfn convert_short4_rtp(char4);
+short4 __ovld __cnfn convert_short4_sat_rtp(char4);
+short4 __ovld __cnfn convert_short4_rtn(char4);
+short4 __ovld __cnfn convert_short4_sat_rtn(char4);
+short4 __ovld __cnfn convert_short4(char4);
+short4 __ovld __cnfn convert_short4_sat(char4);
+short4 __ovld __cnfn convert_short4_rte(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rte(uchar4);
+short4 __ovld __cnfn convert_short4_rtz(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtz(uchar4);
+short4 __ovld __cnfn convert_short4_rtp(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtp(uchar4);
+short4 __ovld __cnfn convert_short4_rtn(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtn(uchar4);
+short4 __ovld __cnfn convert_short4(uchar4);
+short4 __ovld __cnfn convert_short4_sat(uchar4);
+short4 __ovld __cnfn convert_short4_rte(short4);
+short4 __ovld __cnfn convert_short4_sat_rte(short4);
+short4 __ovld __cnfn convert_short4_rtz(short4);
+short4 __ovld __cnfn convert_short4_sat_rtz(short4);
+short4 __ovld __cnfn convert_short4_rtp(short4);
+short4 __ovld __cnfn convert_short4_sat_rtp(short4);
+short4 __ovld __cnfn convert_short4_rtn(short4);
+short4 __ovld __cnfn convert_short4_sat_rtn(short4);
+short4 __ovld __cnfn convert_short4(short4);
+short4 __ovld __cnfn convert_short4_sat(short4);
+short4 __ovld __cnfn convert_short4_rte(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rte(ushort4);
+short4 __ovld __cnfn convert_short4_rtz(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtz(ushort4);
+short4 __ovld __cnfn convert_short4_rtp(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtp(ushort4);
+short4 __ovld __cnfn convert_short4_rtn(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtn(ushort4);
+short4 __ovld __cnfn convert_short4(ushort4);
+short4 __ovld __cnfn convert_short4_sat(ushort4);
+short4 __ovld __cnfn convert_short4_rte(int4);
+short4 __ovld __cnfn convert_short4_sat_rte(int4);
+short4 __ovld __cnfn convert_short4_rtz(int4);
+short4 __ovld __cnfn convert_short4_sat_rtz(int4);
+short4 __ovld __cnfn convert_short4_rtp(int4);
+short4 __ovld __cnfn convert_short4_sat_rtp(int4);
+short4 __ovld __cnfn convert_short4_rtn(int4);
+short4 __ovld __cnfn convert_short4_sat_rtn(int4);
+short4 __ovld __cnfn convert_short4(int4);
+short4 __ovld __cnfn convert_short4_sat(int4);
+short4 __ovld __cnfn convert_short4_rte(uint4);
+short4 __ovld __cnfn convert_short4_sat_rte(uint4);
+short4 __ovld __cnfn convert_short4_rtz(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtz(uint4);
+short4 __ovld __cnfn convert_short4_rtp(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtp(uint4);
+short4 __ovld __cnfn convert_short4_rtn(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtn(uint4);
+short4 __ovld __cnfn convert_short4(uint4);
+short4 __ovld __cnfn convert_short4_sat(uint4);
+short4 __ovld __cnfn convert_short4_rte(long4);
+short4 __ovld __cnfn convert_short4_sat_rte(long4);
+short4 __ovld __cnfn convert_short4_rtz(long4);
+short4 __ovld __cnfn convert_short4_sat_rtz(long4);
+short4 __ovld __cnfn convert_short4_rtp(long4);
+short4 __ovld __cnfn convert_short4_sat_rtp(long4);
+short4 __ovld __cnfn convert_short4_rtn(long4);
+short4 __ovld __cnfn convert_short4_sat_rtn(long4);
+short4 __ovld __cnfn convert_short4(long4);
+short4 __ovld __cnfn convert_short4_sat(long4);
+short4 __ovld __cnfn convert_short4_rte(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rte(ulong4);
+short4 __ovld __cnfn convert_short4_rtz(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtz(ulong4);
+short4 __ovld __cnfn convert_short4_rtp(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtp(ulong4);
+short4 __ovld __cnfn convert_short4_rtn(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtn(ulong4);
+short4 __ovld __cnfn convert_short4(ulong4);
+short4 __ovld __cnfn convert_short4_sat(ulong4);
+short4 __ovld __cnfn convert_short4_rte(float4);
+short4 __ovld __cnfn convert_short4_sat_rte(float4);
+short4 __ovld __cnfn convert_short4_rtz(float4);
+short4 __ovld __cnfn convert_short4_sat_rtz(float4);
+short4 __ovld __cnfn convert_short4_rtp(float4);
+short4 __ovld __cnfn convert_short4_sat_rtp(float4);
+short4 __ovld __cnfn convert_short4_rtn(float4);
+short4 __ovld __cnfn convert_short4_sat_rtn(float4);
+short4 __ovld __cnfn convert_short4(float4);
+short4 __ovld __cnfn convert_short4_sat(float4);
+ushort4 __ovld __cnfn convert_ushort4_rte(char4); // convert_ushort4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning ushort4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(char4);
+ushort4 __ovld __cnfn convert_ushort4(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat(char4);
+ushort4 __ovld __cnfn convert_ushort4_rte(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uchar4);
+ushort4 __ovld __cnfn convert_ushort4(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rte(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(short4);
+ushort4 __ovld __cnfn convert_ushort4(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat(short4);
+ushort4 __ovld __cnfn convert_ushort4_rte(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ushort4);
+ushort4 __ovld __cnfn convert_ushort4(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rte(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(int4);
+ushort4 __ovld __cnfn convert_ushort4(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat(int4);
+ushort4 __ovld __cnfn convert_ushort4_rte(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uint4);
+ushort4 __ovld __cnfn convert_ushort4(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rte(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(long4);
+ushort4 __ovld __cnfn convert_ushort4(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat(long4);
+ushort4 __ovld __cnfn convert_ushort4_rte(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ulong4);
+ushort4 __ovld __cnfn convert_ushort4(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rte(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(float4);
+ushort4 __ovld __cnfn convert_ushort4(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat(float4);
+int4 __ovld __cnfn convert_int4_rte(char4); // convert_int4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning int4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+int4 __ovld __cnfn convert_int4_sat_rte(char4);
+int4 __ovld __cnfn convert_int4_rtz(char4);
+int4 __ovld __cnfn convert_int4_sat_rtz(char4);
+int4 __ovld __cnfn convert_int4_rtp(char4);
+int4 __ovld __cnfn convert_int4_sat_rtp(char4);
+int4 __ovld __cnfn convert_int4_rtn(char4);
+int4 __ovld __cnfn convert_int4_sat_rtn(char4);
+int4 __ovld __cnfn convert_int4(char4);
+int4 __ovld __cnfn convert_int4_sat(char4);
+int4 __ovld __cnfn convert_int4_rte(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rte(uchar4);
+int4 __ovld __cnfn convert_int4_rtz(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtz(uchar4);
+int4 __ovld __cnfn convert_int4_rtp(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtp(uchar4);
+int4 __ovld __cnfn convert_int4_rtn(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtn(uchar4);
+int4 __ovld __cnfn convert_int4(uchar4);
+int4 __ovld __cnfn convert_int4_sat(uchar4);
+int4 __ovld __cnfn convert_int4_rte(short4);
+int4 __ovld __cnfn convert_int4_sat_rte(short4);
+int4 __ovld __cnfn convert_int4_rtz(short4);
+int4 __ovld __cnfn convert_int4_sat_rtz(short4);
+int4 __ovld __cnfn convert_int4_rtp(short4);
+int4 __ovld __cnfn convert_int4_sat_rtp(short4);
+int4 __ovld __cnfn convert_int4_rtn(short4);
+int4 __ovld __cnfn convert_int4_sat_rtn(short4);
+int4 __ovld __cnfn convert_int4(short4);
+int4 __ovld __cnfn convert_int4_sat(short4);
+int4 __ovld __cnfn convert_int4_rte(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rte(ushort4);
+int4 __ovld __cnfn convert_int4_rtz(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtz(ushort4);
+int4 __ovld __cnfn convert_int4_rtp(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtp(ushort4);
+int4 __ovld __cnfn convert_int4_rtn(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtn(ushort4);
+int4 __ovld __cnfn convert_int4(ushort4);
+int4 __ovld __cnfn convert_int4_sat(ushort4);
+int4 __ovld __cnfn convert_int4_rte(int4);
+int4 __ovld __cnfn convert_int4_sat_rte(int4);
+int4 __ovld __cnfn convert_int4_rtz(int4);
+int4 __ovld __cnfn convert_int4_sat_rtz(int4);
+int4 __ovld __cnfn convert_int4_rtp(int4);
+int4 __ovld __cnfn convert_int4_sat_rtp(int4);
+int4 __ovld __cnfn convert_int4_rtn(int4);
+int4 __ovld __cnfn convert_int4_sat_rtn(int4);
+int4 __ovld __cnfn convert_int4(int4);
+int4 __ovld __cnfn convert_int4_sat(int4);
+int4 __ovld __cnfn convert_int4_rte(uint4);
+int4 __ovld __cnfn convert_int4_sat_rte(uint4);
+int4 __ovld __cnfn convert_int4_rtz(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtz(uint4);
+int4 __ovld __cnfn convert_int4_rtp(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtp(uint4);
+int4 __ovld __cnfn convert_int4_rtn(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtn(uint4);
+int4 __ovld __cnfn convert_int4(uint4);
+int4 __ovld __cnfn convert_int4_sat(uint4);
+int4 __ovld __cnfn convert_int4_rte(long4);
+int4 __ovld __cnfn convert_int4_sat_rte(long4);
+int4 __ovld __cnfn convert_int4_rtz(long4);
+int4 __ovld __cnfn convert_int4_sat_rtz(long4);
+int4 __ovld __cnfn convert_int4_rtp(long4);
+int4 __ovld __cnfn convert_int4_sat_rtp(long4);
+int4 __ovld __cnfn convert_int4_rtn(long4);
+int4 __ovld __cnfn convert_int4_sat_rtn(long4);
+int4 __ovld __cnfn convert_int4(long4);
+int4 __ovld __cnfn convert_int4_sat(long4);
+int4 __ovld __cnfn convert_int4_rte(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rte(ulong4);
+int4 __ovld __cnfn convert_int4_rtz(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtz(ulong4);
+int4 __ovld __cnfn convert_int4_rtp(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtp(ulong4);
+int4 __ovld __cnfn convert_int4_rtn(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtn(ulong4);
+int4 __ovld __cnfn convert_int4(ulong4);
+int4 __ovld __cnfn convert_int4_sat(ulong4);
+int4 __ovld __cnfn convert_int4_rte(float4);
+int4 __ovld __cnfn convert_int4_sat_rte(float4);
+int4 __ovld __cnfn convert_int4_rtz(float4);
+int4 __ovld __cnfn convert_int4_sat_rtz(float4);
+int4 __ovld __cnfn convert_int4_rtp(float4);
+int4 __ovld __cnfn convert_int4_sat_rtp(float4);
+int4 __ovld __cnfn convert_int4_rtn(float4);
+int4 __ovld __cnfn convert_int4_sat_rtn(float4);
+int4 __ovld __cnfn convert_int4(float4);
+int4 __ovld __cnfn convert_int4_sat(float4);
+uint4 __ovld __cnfn convert_uint4_rte(char4); // convert_uint4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning uint4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+uint4 __ovld __cnfn convert_uint4_sat_rte(char4);
+uint4 __ovld __cnfn convert_uint4_rtz(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(char4);
+uint4 __ovld __cnfn convert_uint4_rtp(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(char4);
+uint4 __ovld __cnfn convert_uint4_rtn(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(char4);
+uint4 __ovld __cnfn convert_uint4(char4);
+uint4 __ovld __cnfn convert_uint4_sat(char4);
+uint4 __ovld __cnfn convert_uint4_rte(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtz(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtp(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtn(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(uchar4);
+uint4 __ovld __cnfn convert_uint4(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat(uchar4);
+uint4 __ovld __cnfn convert_uint4_rte(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(short4);
+uint4 __ovld __cnfn convert_uint4_rtz(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(short4);
+uint4 __ovld __cnfn convert_uint4_rtp(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(short4);
+uint4 __ovld __cnfn convert_uint4_rtn(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(short4);
+uint4 __ovld __cnfn convert_uint4(short4);
+uint4 __ovld __cnfn convert_uint4_sat(short4);
+uint4 __ovld __cnfn convert_uint4_rte(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtz(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtp(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtn(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(ushort4);
+uint4 __ovld __cnfn convert_uint4(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat(ushort4);
+uint4 __ovld __cnfn convert_uint4_rte(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(int4);
+uint4 __ovld __cnfn convert_uint4_rtz(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(int4);
+uint4 __ovld __cnfn convert_uint4_rtp(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(int4);
+uint4 __ovld __cnfn convert_uint4_rtn(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(int4);
+uint4 __ovld __cnfn convert_uint4(int4);
+uint4 __ovld __cnfn convert_uint4_sat(int4);
+uint4 __ovld __cnfn convert_uint4_rte(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(uint4);
+uint4 __ovld __cnfn convert_uint4_rtz(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(uint4);
+uint4 __ovld __cnfn convert_uint4_rtp(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(uint4);
+uint4 __ovld __cnfn convert_uint4_rtn(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(uint4);
+uint4 __ovld __cnfn convert_uint4(uint4);
+uint4 __ovld __cnfn convert_uint4_sat(uint4);
+uint4 __ovld __cnfn convert_uint4_rte(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(long4);
+uint4 __ovld __cnfn convert_uint4_rtz(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(long4);
+uint4 __ovld __cnfn convert_uint4_rtp(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(long4);
+uint4 __ovld __cnfn convert_uint4_rtn(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(long4);
+uint4 __ovld __cnfn convert_uint4(long4);
+uint4 __ovld __cnfn convert_uint4_sat(long4);
+uint4 __ovld __cnfn convert_uint4_rte(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtz(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtp(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtn(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(ulong4);
+uint4 __ovld __cnfn convert_uint4(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat(ulong4);
+uint4 __ovld __cnfn convert_uint4_rte(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(float4);
+uint4 __ovld __cnfn convert_uint4_rtz(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(float4);
+uint4 __ovld __cnfn convert_uint4_rtp(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(float4);
+uint4 __ovld __cnfn convert_uint4_rtn(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(float4);
+uint4 __ovld __cnfn convert_uint4(float4);
+uint4 __ovld __cnfn convert_uint4_sat(float4);
+long4 __ovld __cnfn convert_long4_rte(char4); // convert_long4[_sat][_rte|_rtz|_rtp|_rtn] overloads returning long4: 10 variants per source type (char4, uchar4, short4, ushort4, int4, uint4, long4, ulong4, float4). NOTE(review): suffix semantics (saturation / rounding mode) presumably follow the OpenCL C explicit-conversion rules -- confirm against the spec.
+long4 __ovld __cnfn convert_long4_sat_rte(char4);
+long4 __ovld __cnfn convert_long4_rtz(char4);
+long4 __ovld __cnfn convert_long4_sat_rtz(char4);
+long4 __ovld __cnfn convert_long4_rtp(char4);
+long4 __ovld __cnfn convert_long4_sat_rtp(char4);
+long4 __ovld __cnfn convert_long4_rtn(char4);
+long4 __ovld __cnfn convert_long4_sat_rtn(char4);
+long4 __ovld __cnfn convert_long4(char4);
+long4 __ovld __cnfn convert_long4_sat(char4);
+long4 __ovld __cnfn convert_long4_rte(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rte(uchar4);
+long4 __ovld __cnfn convert_long4_rtz(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtz(uchar4);
+long4 __ovld __cnfn convert_long4_rtp(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtp(uchar4);
+long4 __ovld __cnfn convert_long4_rtn(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtn(uchar4);
+long4 __ovld __cnfn convert_long4(uchar4);
+long4 __ovld __cnfn convert_long4_sat(uchar4);
+long4 __ovld __cnfn convert_long4_rte(short4);
+long4 __ovld __cnfn convert_long4_sat_rte(short4);
+long4 __ovld __cnfn convert_long4_rtz(short4);
+long4 __ovld __cnfn convert_long4_sat_rtz(short4);
+long4 __ovld __cnfn convert_long4_rtp(short4);
+long4 __ovld __cnfn convert_long4_sat_rtp(short4);
+long4 __ovld __cnfn convert_long4_rtn(short4);
+long4 __ovld __cnfn convert_long4_sat_rtn(short4);
+long4 __ovld __cnfn convert_long4(short4);
+long4 __ovld __cnfn convert_long4_sat(short4);
+long4 __ovld __cnfn convert_long4_rte(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rte(ushort4);
+long4 __ovld __cnfn convert_long4_rtz(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtz(ushort4);
+long4 __ovld __cnfn convert_long4_rtp(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtp(ushort4);
+long4 __ovld __cnfn convert_long4_rtn(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtn(ushort4);
+long4 __ovld __cnfn convert_long4(ushort4);
+long4 __ovld __cnfn convert_long4_sat(ushort4);
+long4 __ovld __cnfn convert_long4_rte(int4);
+long4 __ovld __cnfn convert_long4_sat_rte(int4);
+long4 __ovld __cnfn convert_long4_rtz(int4);
+long4 __ovld __cnfn convert_long4_sat_rtz(int4);
+long4 __ovld __cnfn convert_long4_rtp(int4);
+long4 __ovld __cnfn convert_long4_sat_rtp(int4);
+long4 __ovld __cnfn convert_long4_rtn(int4);
+long4 __ovld __cnfn convert_long4_sat_rtn(int4);
+long4 __ovld __cnfn convert_long4(int4);
+long4 __ovld __cnfn convert_long4_sat(int4);
+long4 __ovld __cnfn convert_long4_rte(uint4);
+long4 __ovld __cnfn convert_long4_sat_rte(uint4);
+long4 __ovld __cnfn convert_long4_rtz(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtz(uint4);
+long4 __ovld __cnfn convert_long4_rtp(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtp(uint4);
+long4 __ovld __cnfn convert_long4_rtn(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtn(uint4);
+long4 __ovld __cnfn convert_long4(uint4);
+long4 __ovld __cnfn convert_long4_sat(uint4);
+long4 __ovld __cnfn convert_long4_rte(long4);
+long4 __ovld __cnfn convert_long4_sat_rte(long4);
+long4 __ovld __cnfn convert_long4_rtz(long4);
+long4 __ovld __cnfn convert_long4_sat_rtz(long4);
+long4 __ovld __cnfn convert_long4_rtp(long4);
+long4 __ovld __cnfn convert_long4_sat_rtp(long4);
+long4 __ovld __cnfn convert_long4_rtn(long4);
+long4 __ovld __cnfn convert_long4_sat_rtn(long4);
+long4 __ovld __cnfn convert_long4(long4);
+long4 __ovld __cnfn convert_long4_sat(long4);
+long4 __ovld __cnfn convert_long4_rte(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rte(ulong4);
+long4 __ovld __cnfn convert_long4_rtz(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtz(ulong4);
+long4 __ovld __cnfn convert_long4_rtp(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtp(ulong4);
+long4 __ovld __cnfn convert_long4_rtn(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtn(ulong4);
+long4 __ovld __cnfn convert_long4(ulong4);
+long4 __ovld __cnfn convert_long4_sat(ulong4);
+long4 __ovld __cnfn convert_long4_rte(float4);
+long4 __ovld __cnfn convert_long4_sat_rte(float4);
+long4 __ovld __cnfn convert_long4_rtz(float4);
+long4 __ovld __cnfn convert_long4_sat_rtz(float4);
+long4 __ovld __cnfn convert_long4_rtp(float4);
+long4 __ovld __cnfn convert_long4_sat_rtp(float4);
+long4 __ovld __cnfn convert_long4_rtn(float4);
+long4 __ovld __cnfn convert_long4_sat_rtn(float4);
+long4 __ovld __cnfn convert_long4(float4);
+long4 __ovld __cnfn convert_long4_sat(float4);
+ulong4 __ovld __cnfn convert_ulong4_rte(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(char4);
+ulong4 __ovld __cnfn convert_ulong4(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat(char4);
+ulong4 __ovld __cnfn convert_ulong4_rte(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uchar4);
+ulong4 __ovld __cnfn convert_ulong4(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rte(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(short4);
+ulong4 __ovld __cnfn convert_ulong4(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat(short4);
+ulong4 __ovld __cnfn convert_ulong4_rte(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ushort4);
+ulong4 __ovld __cnfn convert_ulong4(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rte(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(int4);
+ulong4 __ovld __cnfn convert_ulong4(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat(int4);
+ulong4 __ovld __cnfn convert_ulong4_rte(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uint4);
+ulong4 __ovld __cnfn convert_ulong4(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rte(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(long4);
+ulong4 __ovld __cnfn convert_ulong4(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat(long4);
+ulong4 __ovld __cnfn convert_ulong4_rte(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ulong4);
+ulong4 __ovld __cnfn convert_ulong4(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rte(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(float4);
+ulong4 __ovld __cnfn convert_ulong4(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat(float4);
+float4 __ovld __cnfn convert_float4_rte(char4);
+float4 __ovld __cnfn convert_float4_rtz(char4);
+float4 __ovld __cnfn convert_float4_rtp(char4);
+float4 __ovld __cnfn convert_float4_rtn(char4);
+float4 __ovld __cnfn convert_float4(char4);
+float4 __ovld __cnfn convert_float4_rte(uchar4);
+float4 __ovld __cnfn convert_float4_rtz(uchar4);
+float4 __ovld __cnfn convert_float4_rtp(uchar4);
+float4 __ovld __cnfn convert_float4_rtn(uchar4);
+float4 __ovld __cnfn convert_float4(uchar4);
+float4 __ovld __cnfn convert_float4_rte(short4);
+float4 __ovld __cnfn convert_float4_rtz(short4);
+float4 __ovld __cnfn convert_float4_rtp(short4);
+float4 __ovld __cnfn convert_float4_rtn(short4);
+float4 __ovld __cnfn convert_float4(short4);
+float4 __ovld __cnfn convert_float4_rte(ushort4);
+float4 __ovld __cnfn convert_float4_rtz(ushort4);
+float4 __ovld __cnfn convert_float4_rtp(ushort4);
+float4 __ovld __cnfn convert_float4_rtn(ushort4);
+float4 __ovld __cnfn convert_float4(ushort4);
+float4 __ovld __cnfn convert_float4_rte(int4);
+float4 __ovld __cnfn convert_float4_rtz(int4);
+float4 __ovld __cnfn convert_float4_rtp(int4);
+float4 __ovld __cnfn convert_float4_rtn(int4);
+float4 __ovld __cnfn convert_float4(int4);
+float4 __ovld __cnfn convert_float4_rte(uint4);
+float4 __ovld __cnfn convert_float4_rtz(uint4);
+float4 __ovld __cnfn convert_float4_rtp(uint4);
+float4 __ovld __cnfn convert_float4_rtn(uint4);
+float4 __ovld __cnfn convert_float4(uint4);
+float4 __ovld __cnfn convert_float4_rte(long4);
+float4 __ovld __cnfn convert_float4_rtz(long4);
+float4 __ovld __cnfn convert_float4_rtp(long4);
+float4 __ovld __cnfn convert_float4_rtn(long4);
+float4 __ovld __cnfn convert_float4(long4);
+float4 __ovld __cnfn convert_float4_rte(ulong4);
+float4 __ovld __cnfn convert_float4_rtz(ulong4);
+float4 __ovld __cnfn convert_float4_rtp(ulong4);
+float4 __ovld __cnfn convert_float4_rtn(ulong4);
+float4 __ovld __cnfn convert_float4(ulong4);
+float4 __ovld __cnfn convert_float4_rte(float4);
+float4 __ovld __cnfn convert_float4_rtz(float4);
+float4 __ovld __cnfn convert_float4_rtp(float4);
+float4 __ovld __cnfn convert_float4_rtn(float4);
+float4 __ovld __cnfn convert_float4(float4);
+char8 __ovld __cnfn convert_char8_rte(char8);
+char8 __ovld __cnfn convert_char8_sat_rte(char8);
+char8 __ovld __cnfn convert_char8_rtz(char8);
+char8 __ovld __cnfn convert_char8_sat_rtz(char8);
+char8 __ovld __cnfn convert_char8_rtp(char8);
+char8 __ovld __cnfn convert_char8_sat_rtp(char8);
+char8 __ovld __cnfn convert_char8_rtn(char8);
+char8 __ovld __cnfn convert_char8_sat_rtn(char8);
+char8 __ovld __cnfn convert_char8(char8);
+char8 __ovld __cnfn convert_char8_sat(char8);
+char8 __ovld __cnfn convert_char8_rte(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rte(uchar8);
+char8 __ovld __cnfn convert_char8_rtz(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtz(uchar8);
+char8 __ovld __cnfn convert_char8_rtp(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtp(uchar8);
+char8 __ovld __cnfn convert_char8_rtn(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtn(uchar8);
+char8 __ovld __cnfn convert_char8(uchar8);
+char8 __ovld __cnfn convert_char8_sat(uchar8);
+char8 __ovld __cnfn convert_char8_rte(short8);
+char8 __ovld __cnfn convert_char8_sat_rte(short8);
+char8 __ovld __cnfn convert_char8_rtz(short8);
+char8 __ovld __cnfn convert_char8_sat_rtz(short8);
+char8 __ovld __cnfn convert_char8_rtp(short8);
+char8 __ovld __cnfn convert_char8_sat_rtp(short8);
+char8 __ovld __cnfn convert_char8_rtn(short8);
+char8 __ovld __cnfn convert_char8_sat_rtn(short8);
+char8 __ovld __cnfn convert_char8(short8);
+char8 __ovld __cnfn convert_char8_sat(short8);
+char8 __ovld __cnfn convert_char8_rte(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rte(ushort8);
+char8 __ovld __cnfn convert_char8_rtz(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtz(ushort8);
+char8 __ovld __cnfn convert_char8_rtp(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtp(ushort8);
+char8 __ovld __cnfn convert_char8_rtn(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtn(ushort8);
+char8 __ovld __cnfn convert_char8(ushort8);
+char8 __ovld __cnfn convert_char8_sat(ushort8);
+char8 __ovld __cnfn convert_char8_rte(int8);
+char8 __ovld __cnfn convert_char8_sat_rte(int8);
+char8 __ovld __cnfn convert_char8_rtz(int8);
+char8 __ovld __cnfn convert_char8_sat_rtz(int8);
+char8 __ovld __cnfn convert_char8_rtp(int8);
+char8 __ovld __cnfn convert_char8_sat_rtp(int8);
+char8 __ovld __cnfn convert_char8_rtn(int8);
+char8 __ovld __cnfn convert_char8_sat_rtn(int8);
+char8 __ovld __cnfn convert_char8(int8);
+char8 __ovld __cnfn convert_char8_sat(int8);
+char8 __ovld __cnfn convert_char8_rte(uint8);
+char8 __ovld __cnfn convert_char8_sat_rte(uint8);
+char8 __ovld __cnfn convert_char8_rtz(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtz(uint8);
+char8 __ovld __cnfn convert_char8_rtp(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtp(uint8);
+char8 __ovld __cnfn convert_char8_rtn(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtn(uint8);
+char8 __ovld __cnfn convert_char8(uint8);
+char8 __ovld __cnfn convert_char8_sat(uint8);
+char8 __ovld __cnfn convert_char8_rte(long8);
+char8 __ovld __cnfn convert_char8_sat_rte(long8);
+char8 __ovld __cnfn convert_char8_rtz(long8);
+char8 __ovld __cnfn convert_char8_sat_rtz(long8);
+char8 __ovld __cnfn convert_char8_rtp(long8);
+char8 __ovld __cnfn convert_char8_sat_rtp(long8);
+char8 __ovld __cnfn convert_char8_rtn(long8);
+char8 __ovld __cnfn convert_char8_sat_rtn(long8);
+char8 __ovld __cnfn convert_char8(long8);
+char8 __ovld __cnfn convert_char8_sat(long8);
+char8 __ovld __cnfn convert_char8_rte(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rte(ulong8);
+char8 __ovld __cnfn convert_char8_rtz(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtz(ulong8);
+char8 __ovld __cnfn convert_char8_rtp(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtp(ulong8);
+char8 __ovld __cnfn convert_char8_rtn(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtn(ulong8);
+char8 __ovld __cnfn convert_char8(ulong8);
+char8 __ovld __cnfn convert_char8_sat(ulong8);
+char8 __ovld __cnfn convert_char8_rte(float8);
+char8 __ovld __cnfn convert_char8_sat_rte(float8);
+char8 __ovld __cnfn convert_char8_rtz(float8);
+char8 __ovld __cnfn convert_char8_sat_rtz(float8);
+char8 __ovld __cnfn convert_char8_rtp(float8);
+char8 __ovld __cnfn convert_char8_sat_rtp(float8);
+char8 __ovld __cnfn convert_char8_rtn(float8);
+char8 __ovld __cnfn convert_char8_sat_rtn(float8);
+char8 __ovld __cnfn convert_char8(float8);
+char8 __ovld __cnfn convert_char8_sat(float8);
+uchar8 __ovld __cnfn convert_uchar8_rte(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(char8);
+uchar8 __ovld __cnfn convert_uchar8(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat(char8);
+uchar8 __ovld __cnfn convert_uchar8_rte(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uchar8);
+uchar8 __ovld __cnfn convert_uchar8(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rte(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(short8);
+uchar8 __ovld __cnfn convert_uchar8(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat(short8);
+uchar8 __ovld __cnfn convert_uchar8_rte(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ushort8);
+uchar8 __ovld __cnfn convert_uchar8(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rte(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(int8);
+uchar8 __ovld __cnfn convert_uchar8(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat(int8);
+uchar8 __ovld __cnfn convert_uchar8_rte(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uint8);
+uchar8 __ovld __cnfn convert_uchar8(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rte(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(long8);
+uchar8 __ovld __cnfn convert_uchar8(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat(long8);
+uchar8 __ovld __cnfn convert_uchar8_rte(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ulong8);
+uchar8 __ovld __cnfn convert_uchar8(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rte(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(float8);
+uchar8 __ovld __cnfn convert_uchar8(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat(float8);
+short8 __ovld __cnfn convert_short8_rte(char8);
+short8 __ovld __cnfn convert_short8_sat_rte(char8);
+short8 __ovld __cnfn convert_short8_rtz(char8);
+short8 __ovld __cnfn convert_short8_sat_rtz(char8);
+short8 __ovld __cnfn convert_short8_rtp(char8);
+short8 __ovld __cnfn convert_short8_sat_rtp(char8);
+short8 __ovld __cnfn convert_short8_rtn(char8);
+short8 __ovld __cnfn convert_short8_sat_rtn(char8);
+short8 __ovld __cnfn convert_short8(char8);
+short8 __ovld __cnfn convert_short8_sat(char8);
+short8 __ovld __cnfn convert_short8_rte(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rte(uchar8);
+short8 __ovld __cnfn convert_short8_rtz(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtz(uchar8);
+short8 __ovld __cnfn convert_short8_rtp(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtp(uchar8);
+short8 __ovld __cnfn convert_short8_rtn(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtn(uchar8);
+short8 __ovld __cnfn convert_short8(uchar8);
+short8 __ovld __cnfn convert_short8_sat(uchar8);
+short8 __ovld __cnfn convert_short8_rte(short8);
+short8 __ovld __cnfn convert_short8_sat_rte(short8);
+short8 __ovld __cnfn convert_short8_rtz(short8);
+short8 __ovld __cnfn convert_short8_sat_rtz(short8);
+short8 __ovld __cnfn convert_short8_rtp(short8);
+short8 __ovld __cnfn convert_short8_sat_rtp(short8);
+short8 __ovld __cnfn convert_short8_rtn(short8);
+short8 __ovld __cnfn convert_short8_sat_rtn(short8);
+short8 __ovld __cnfn convert_short8(short8);
+short8 __ovld __cnfn convert_short8_sat(short8);
+short8 __ovld __cnfn convert_short8_rte(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rte(ushort8);
+short8 __ovld __cnfn convert_short8_rtz(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtz(ushort8);
+short8 __ovld __cnfn convert_short8_rtp(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtp(ushort8);
+short8 __ovld __cnfn convert_short8_rtn(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtn(ushort8);
+short8 __ovld __cnfn convert_short8(ushort8);
+short8 __ovld __cnfn convert_short8_sat(ushort8);
+short8 __ovld __cnfn convert_short8_rte(int8);
+short8 __ovld __cnfn convert_short8_sat_rte(int8);
+short8 __ovld __cnfn convert_short8_rtz(int8);
+short8 __ovld __cnfn convert_short8_sat_rtz(int8);
+short8 __ovld __cnfn convert_short8_rtp(int8);
+short8 __ovld __cnfn convert_short8_sat_rtp(int8);
+short8 __ovld __cnfn convert_short8_rtn(int8);
+short8 __ovld __cnfn convert_short8_sat_rtn(int8);
+short8 __ovld __cnfn convert_short8(int8);
+short8 __ovld __cnfn convert_short8_sat(int8);
+short8 __ovld __cnfn convert_short8_rte(uint8);
+short8 __ovld __cnfn convert_short8_sat_rte(uint8);
+short8 __ovld __cnfn convert_short8_rtz(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtz(uint8);
+short8 __ovld __cnfn convert_short8_rtp(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtp(uint8);
+short8 __ovld __cnfn convert_short8_rtn(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtn(uint8);
+short8 __ovld __cnfn convert_short8(uint8);
+short8 __ovld __cnfn convert_short8_sat(uint8);
+short8 __ovld __cnfn convert_short8_rte(long8);
+short8 __ovld __cnfn convert_short8_sat_rte(long8);
+short8 __ovld __cnfn convert_short8_rtz(long8);
+short8 __ovld __cnfn convert_short8_sat_rtz(long8);
+short8 __ovld __cnfn convert_short8_rtp(long8);
+short8 __ovld __cnfn convert_short8_sat_rtp(long8);
+short8 __ovld __cnfn convert_short8_rtn(long8);
+short8 __ovld __cnfn convert_short8_sat_rtn(long8);
+short8 __ovld __cnfn convert_short8(long8);
+short8 __ovld __cnfn convert_short8_sat(long8);
+short8 __ovld __cnfn convert_short8_rte(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rte(ulong8);
+short8 __ovld __cnfn convert_short8_rtz(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtz(ulong8);
+short8 __ovld __cnfn convert_short8_rtp(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtp(ulong8);
+short8 __ovld __cnfn convert_short8_rtn(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtn(ulong8);
+short8 __ovld __cnfn convert_short8(ulong8);
+short8 __ovld __cnfn convert_short8_sat(ulong8);
+short8 __ovld __cnfn convert_short8_rte(float8);
+short8 __ovld __cnfn convert_short8_sat_rte(float8);
+short8 __ovld __cnfn convert_short8_rtz(float8);
+short8 __ovld __cnfn convert_short8_sat_rtz(float8);
+short8 __ovld __cnfn convert_short8_rtp(float8);
+short8 __ovld __cnfn convert_short8_sat_rtp(float8);
+short8 __ovld __cnfn convert_short8_rtn(float8);
+short8 __ovld __cnfn convert_short8_sat_rtn(float8);
+short8 __ovld __cnfn convert_short8(float8);
+short8 __ovld __cnfn convert_short8_sat(float8);
+ushort8 __ovld __cnfn convert_ushort8_rte(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(char8);
+ushort8 __ovld __cnfn convert_ushort8(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat(char8);
+ushort8 __ovld __cnfn convert_ushort8_rte(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uchar8);
+ushort8 __ovld __cnfn convert_ushort8(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rte(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(short8);
+ushort8 __ovld __cnfn convert_ushort8(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat(short8);
+ushort8 __ovld __cnfn convert_ushort8_rte(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ushort8);
+ushort8 __ovld __cnfn convert_ushort8(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rte(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(int8);
+ushort8 __ovld __cnfn convert_ushort8(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat(int8);
+ushort8 __ovld __cnfn convert_ushort8_rte(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uint8);
+ushort8 __ovld __cnfn convert_ushort8(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rte(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(long8);
+ushort8 __ovld __cnfn convert_ushort8(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat(long8);
+ushort8 __ovld __cnfn convert_ushort8_rte(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ulong8);
+ushort8 __ovld __cnfn convert_ushort8(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rte(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(float8);
+ushort8 __ovld __cnfn convert_ushort8(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat(float8);
+int8 __ovld __cnfn convert_int8_rte(char8);
+int8 __ovld __cnfn convert_int8_sat_rte(char8);
+int8 __ovld __cnfn convert_int8_rtz(char8);
+int8 __ovld __cnfn convert_int8_sat_rtz(char8);
+int8 __ovld __cnfn convert_int8_rtp(char8);
+int8 __ovld __cnfn convert_int8_sat_rtp(char8);
+int8 __ovld __cnfn convert_int8_rtn(char8);
+int8 __ovld __cnfn convert_int8_sat_rtn(char8);
+int8 __ovld __cnfn convert_int8(char8);
+int8 __ovld __cnfn convert_int8_sat(char8);
+int8 __ovld __cnfn convert_int8_rte(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rte(uchar8);
+int8 __ovld __cnfn convert_int8_rtz(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtz(uchar8);
+int8 __ovld __cnfn convert_int8_rtp(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtp(uchar8);
+int8 __ovld __cnfn convert_int8_rtn(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtn(uchar8);
+int8 __ovld __cnfn convert_int8(uchar8);
+int8 __ovld __cnfn convert_int8_sat(uchar8);
+int8 __ovld __cnfn convert_int8_rte(short8);
+int8 __ovld __cnfn convert_int8_sat_rte(short8);
+int8 __ovld __cnfn convert_int8_rtz(short8);
+int8 __ovld __cnfn convert_int8_sat_rtz(short8);
+int8 __ovld __cnfn convert_int8_rtp(short8);
+int8 __ovld __cnfn convert_int8_sat_rtp(short8);
+int8 __ovld __cnfn convert_int8_rtn(short8);
+int8 __ovld __cnfn convert_int8_sat_rtn(short8);
+int8 __ovld __cnfn convert_int8(short8);
+int8 __ovld __cnfn convert_int8_sat(short8);
+int8 __ovld __cnfn convert_int8_rte(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rte(ushort8);
+int8 __ovld __cnfn convert_int8_rtz(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtz(ushort8);
+int8 __ovld __cnfn convert_int8_rtp(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtp(ushort8);
+int8 __ovld __cnfn convert_int8_rtn(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtn(ushort8);
+int8 __ovld __cnfn convert_int8(ushort8);
+int8 __ovld __cnfn convert_int8_sat(ushort8);
+int8 __ovld __cnfn convert_int8_rte(int8);
+int8 __ovld __cnfn convert_int8_sat_rte(int8);
+int8 __ovld __cnfn convert_int8_rtz(int8);
+int8 __ovld __cnfn convert_int8_sat_rtz(int8);
+int8 __ovld __cnfn convert_int8_rtp(int8);
+int8 __ovld __cnfn convert_int8_sat_rtp(int8);
+int8 __ovld __cnfn convert_int8_rtn(int8);
+int8 __ovld __cnfn convert_int8_sat_rtn(int8);
+int8 __ovld __cnfn convert_int8(int8);
+int8 __ovld __cnfn convert_int8_sat(int8);
+int8 __ovld __cnfn convert_int8_rte(uint8);
+int8 __ovld __cnfn convert_int8_sat_rte(uint8);
+int8 __ovld __cnfn convert_int8_rtz(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtz(uint8);
+int8 __ovld __cnfn convert_int8_rtp(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtp(uint8);
+int8 __ovld __cnfn convert_int8_rtn(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtn(uint8);
+int8 __ovld __cnfn convert_int8(uint8);
+int8 __ovld __cnfn convert_int8_sat(uint8);
+int8 __ovld __cnfn convert_int8_rte(long8);
+int8 __ovld __cnfn convert_int8_sat_rte(long8);
+int8 __ovld __cnfn convert_int8_rtz(long8);
+int8 __ovld __cnfn convert_int8_sat_rtz(long8);
+int8 __ovld __cnfn convert_int8_rtp(long8);
+int8 __ovld __cnfn convert_int8_sat_rtp(long8);
+int8 __ovld __cnfn convert_int8_rtn(long8);
+int8 __ovld __cnfn convert_int8_sat_rtn(long8);
+int8 __ovld __cnfn convert_int8(long8);
+int8 __ovld __cnfn convert_int8_sat(long8);
+int8 __ovld __cnfn convert_int8_rte(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rte(ulong8);
+int8 __ovld __cnfn convert_int8_rtz(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtz(ulong8);
+int8 __ovld __cnfn convert_int8_rtp(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtp(ulong8);
+int8 __ovld __cnfn convert_int8_rtn(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtn(ulong8);
+int8 __ovld __cnfn convert_int8(ulong8);
+int8 __ovld __cnfn convert_int8_sat(ulong8);
+int8 __ovld __cnfn convert_int8_rte(float8);
+int8 __ovld __cnfn convert_int8_sat_rte(float8);
+int8 __ovld __cnfn convert_int8_rtz(float8);
+int8 __ovld __cnfn convert_int8_sat_rtz(float8);
+int8 __ovld __cnfn convert_int8_rtp(float8);
+int8 __ovld __cnfn convert_int8_sat_rtp(float8);
+int8 __ovld __cnfn convert_int8_rtn(float8);
+int8 __ovld __cnfn convert_int8_sat_rtn(float8);
+int8 __ovld __cnfn convert_int8(float8);
+int8 __ovld __cnfn convert_int8_sat(float8);
+uint8 __ovld __cnfn convert_uint8_rte(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(char8);
+uint8 __ovld __cnfn convert_uint8_rtz(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(char8);
+uint8 __ovld __cnfn convert_uint8_rtp(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(char8);
+uint8 __ovld __cnfn convert_uint8_rtn(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(char8);
+uint8 __ovld __cnfn convert_uint8(char8);
+uint8 __ovld __cnfn convert_uint8_sat(char8);
+uint8 __ovld __cnfn convert_uint8_rte(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtz(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtp(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtn(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(uchar8);
+uint8 __ovld __cnfn convert_uint8(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat(uchar8);
+uint8 __ovld __cnfn convert_uint8_rte(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(short8);
+uint8 __ovld __cnfn convert_uint8_rtz(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(short8);
+uint8 __ovld __cnfn convert_uint8_rtp(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(short8);
+uint8 __ovld __cnfn convert_uint8_rtn(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(short8);
+uint8 __ovld __cnfn convert_uint8(short8);
+uint8 __ovld __cnfn convert_uint8_sat(short8);
+uint8 __ovld __cnfn convert_uint8_rte(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtz(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtp(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtn(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(ushort8);
+uint8 __ovld __cnfn convert_uint8(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat(ushort8);
+uint8 __ovld __cnfn convert_uint8_rte(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(int8);
+uint8 __ovld __cnfn convert_uint8_rtz(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(int8);
+uint8 __ovld __cnfn convert_uint8_rtp(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(int8);
+uint8 __ovld __cnfn convert_uint8_rtn(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(int8);
+uint8 __ovld __cnfn convert_uint8(int8);
+uint8 __ovld __cnfn convert_uint8_sat(int8);
+uint8 __ovld __cnfn convert_uint8_rte(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(uint8);
+uint8 __ovld __cnfn convert_uint8_rtz(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(uint8);
+uint8 __ovld __cnfn convert_uint8_rtp(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(uint8);
+uint8 __ovld __cnfn convert_uint8_rtn(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(uint8);
+uint8 __ovld __cnfn convert_uint8(uint8);
+uint8 __ovld __cnfn convert_uint8_sat(uint8);
+uint8 __ovld __cnfn convert_uint8_rte(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(long8);
+uint8 __ovld __cnfn convert_uint8_rtz(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(long8);
+uint8 __ovld __cnfn convert_uint8_rtp(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(long8);
+uint8 __ovld __cnfn convert_uint8_rtn(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(long8);
+uint8 __ovld __cnfn convert_uint8(long8);
+uint8 __ovld __cnfn convert_uint8_sat(long8);
+uint8 __ovld __cnfn convert_uint8_rte(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtz(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtp(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtn(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(ulong8);
+uint8 __ovld __cnfn convert_uint8(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat(ulong8);
+uint8 __ovld __cnfn convert_uint8_rte(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(float8);
+uint8 __ovld __cnfn convert_uint8_rtz(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(float8);
+uint8 __ovld __cnfn convert_uint8_rtp(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(float8);
+uint8 __ovld __cnfn convert_uint8_rtn(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(float8);
+uint8 __ovld __cnfn convert_uint8(float8);
+uint8 __ovld __cnfn convert_uint8_sat(float8);
+long8 __ovld __cnfn convert_long8_rte(char8);
+long8 __ovld __cnfn convert_long8_sat_rte(char8);
+long8 __ovld __cnfn convert_long8_rtz(char8);
+long8 __ovld __cnfn convert_long8_sat_rtz(char8);
+long8 __ovld __cnfn convert_long8_rtp(char8);
+long8 __ovld __cnfn convert_long8_sat_rtp(char8);
+long8 __ovld __cnfn convert_long8_rtn(char8);
+long8 __ovld __cnfn convert_long8_sat_rtn(char8);
+long8 __ovld __cnfn convert_long8(char8);
+long8 __ovld __cnfn convert_long8_sat(char8);
+long8 __ovld __cnfn convert_long8_rte(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rte(uchar8);
+long8 __ovld __cnfn convert_long8_rtz(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtz(uchar8);
+long8 __ovld __cnfn convert_long8_rtp(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtp(uchar8);
+long8 __ovld __cnfn convert_long8_rtn(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtn(uchar8);
+long8 __ovld __cnfn convert_long8(uchar8);
+long8 __ovld __cnfn convert_long8_sat(uchar8);
+long8 __ovld __cnfn convert_long8_rte(short8);
+long8 __ovld __cnfn convert_long8_sat_rte(short8);
+long8 __ovld __cnfn convert_long8_rtz(short8);
+long8 __ovld __cnfn convert_long8_sat_rtz(short8);
+long8 __ovld __cnfn convert_long8_rtp(short8);
+long8 __ovld __cnfn convert_long8_sat_rtp(short8);
+long8 __ovld __cnfn convert_long8_rtn(short8);
+long8 __ovld __cnfn convert_long8_sat_rtn(short8);
+long8 __ovld __cnfn convert_long8(short8);
+long8 __ovld __cnfn convert_long8_sat(short8);
+long8 __ovld __cnfn convert_long8_rte(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rte(ushort8);
+long8 __ovld __cnfn convert_long8_rtz(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtz(ushort8);
+long8 __ovld __cnfn convert_long8_rtp(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtp(ushort8);
+long8 __ovld __cnfn convert_long8_rtn(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtn(ushort8);
+long8 __ovld __cnfn convert_long8(ushort8);
+long8 __ovld __cnfn convert_long8_sat(ushort8);
+long8 __ovld __cnfn convert_long8_rte(int8);
+long8 __ovld __cnfn convert_long8_sat_rte(int8);
+long8 __ovld __cnfn convert_long8_rtz(int8);
+long8 __ovld __cnfn convert_long8_sat_rtz(int8);
+long8 __ovld __cnfn convert_long8_rtp(int8);
+long8 __ovld __cnfn convert_long8_sat_rtp(int8);
+long8 __ovld __cnfn convert_long8_rtn(int8);
+long8 __ovld __cnfn convert_long8_sat_rtn(int8);
+long8 __ovld __cnfn convert_long8(int8);
+long8 __ovld __cnfn convert_long8_sat(int8);
+long8 __ovld __cnfn convert_long8_rte(uint8);
+long8 __ovld __cnfn convert_long8_sat_rte(uint8);
+long8 __ovld __cnfn convert_long8_rtz(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtz(uint8);
+long8 __ovld __cnfn convert_long8_rtp(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtp(uint8);
+long8 __ovld __cnfn convert_long8_rtn(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtn(uint8);
+long8 __ovld __cnfn convert_long8(uint8);
+long8 __ovld __cnfn convert_long8_sat(uint8);
+long8 __ovld __cnfn convert_long8_rte(long8);
+long8 __ovld __cnfn convert_long8_sat_rte(long8);
+long8 __ovld __cnfn convert_long8_rtz(long8);
+long8 __ovld __cnfn convert_long8_sat_rtz(long8);
+long8 __ovld __cnfn convert_long8_rtp(long8);
+long8 __ovld __cnfn convert_long8_sat_rtp(long8);
+long8 __ovld __cnfn convert_long8_rtn(long8);
+long8 __ovld __cnfn convert_long8_sat_rtn(long8);
+long8 __ovld __cnfn convert_long8(long8);
+long8 __ovld __cnfn convert_long8_sat(long8);
+long8 __ovld __cnfn convert_long8_rte(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rte(ulong8);
+long8 __ovld __cnfn convert_long8_rtz(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtz(ulong8);
+long8 __ovld __cnfn convert_long8_rtp(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtp(ulong8);
+long8 __ovld __cnfn convert_long8_rtn(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtn(ulong8);
+long8 __ovld __cnfn convert_long8(ulong8);
+long8 __ovld __cnfn convert_long8_sat(ulong8);
+long8 __ovld __cnfn convert_long8_rte(float8);
+long8 __ovld __cnfn convert_long8_sat_rte(float8);
+long8 __ovld __cnfn convert_long8_rtz(float8);
+long8 __ovld __cnfn convert_long8_sat_rtz(float8);
+long8 __ovld __cnfn convert_long8_rtp(float8);
+long8 __ovld __cnfn convert_long8_sat_rtp(float8);
+long8 __ovld __cnfn convert_long8_rtn(float8);
+long8 __ovld __cnfn convert_long8_sat_rtn(float8);
+long8 __ovld __cnfn convert_long8(float8);
+long8 __ovld __cnfn convert_long8_sat(float8);
+ulong8 __ovld __cnfn convert_ulong8_rte(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(char8);
+ulong8 __ovld __cnfn convert_ulong8(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat(char8);
+ulong8 __ovld __cnfn convert_ulong8_rte(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uchar8);
+ulong8 __ovld __cnfn convert_ulong8(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rte(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(short8);
+ulong8 __ovld __cnfn convert_ulong8(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat(short8);
+ulong8 __ovld __cnfn convert_ulong8_rte(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ushort8);
+ulong8 __ovld __cnfn convert_ulong8(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rte(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(int8);
+ulong8 __ovld __cnfn convert_ulong8(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat(int8);
+ulong8 __ovld __cnfn convert_ulong8_rte(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uint8);
+ulong8 __ovld __cnfn convert_ulong8(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rte(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(long8);
+ulong8 __ovld __cnfn convert_ulong8(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat(long8);
+ulong8 __ovld __cnfn convert_ulong8_rte(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ulong8);
+ulong8 __ovld __cnfn convert_ulong8(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rte(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(float8);
+ulong8 __ovld __cnfn convert_ulong8(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat(float8);
+float8 __ovld __cnfn convert_float8_rte(char8);
+float8 __ovld __cnfn convert_float8_rtz(char8);
+float8 __ovld __cnfn convert_float8_rtp(char8);
+float8 __ovld __cnfn convert_float8_rtn(char8);
+float8 __ovld __cnfn convert_float8(char8);
+float8 __ovld __cnfn convert_float8_rte(uchar8);
+float8 __ovld __cnfn convert_float8_rtz(uchar8);
+float8 __ovld __cnfn convert_float8_rtp(uchar8);
+float8 __ovld __cnfn convert_float8_rtn(uchar8);
+float8 __ovld __cnfn convert_float8(uchar8);
+float8 __ovld __cnfn convert_float8_rte(short8);
+float8 __ovld __cnfn convert_float8_rtz(short8);
+float8 __ovld __cnfn convert_float8_rtp(short8);
+float8 __ovld __cnfn convert_float8_rtn(short8);
+float8 __ovld __cnfn convert_float8(short8);
+float8 __ovld __cnfn convert_float8_rte(ushort8);
+float8 __ovld __cnfn convert_float8_rtz(ushort8);
+float8 __ovld __cnfn convert_float8_rtp(ushort8);
+float8 __ovld __cnfn convert_float8_rtn(ushort8);
+float8 __ovld __cnfn convert_float8(ushort8);
+float8 __ovld __cnfn convert_float8_rte(int8);
+float8 __ovld __cnfn convert_float8_rtz(int8);
+float8 __ovld __cnfn convert_float8_rtp(int8);
+float8 __ovld __cnfn convert_float8_rtn(int8);
+float8 __ovld __cnfn convert_float8(int8);
+float8 __ovld __cnfn convert_float8_rte(uint8);
+float8 __ovld __cnfn convert_float8_rtz(uint8);
+float8 __ovld __cnfn convert_float8_rtp(uint8);
+float8 __ovld __cnfn convert_float8_rtn(uint8);
+float8 __ovld __cnfn convert_float8(uint8);
+float8 __ovld __cnfn convert_float8_rte(long8);
+float8 __ovld __cnfn convert_float8_rtz(long8);
+float8 __ovld __cnfn convert_float8_rtp(long8);
+float8 __ovld __cnfn convert_float8_rtn(long8);
+float8 __ovld __cnfn convert_float8(long8);
+float8 __ovld __cnfn convert_float8_rte(ulong8);
+float8 __ovld __cnfn convert_float8_rtz(ulong8);
+float8 __ovld __cnfn convert_float8_rtp(ulong8);
+float8 __ovld __cnfn convert_float8_rtn(ulong8);
+float8 __ovld __cnfn convert_float8(ulong8);
+float8 __ovld __cnfn convert_float8_rte(float8);
+float8 __ovld __cnfn convert_float8_rtz(float8);
+float8 __ovld __cnfn convert_float8_rtp(float8);
+float8 __ovld __cnfn convert_float8_rtn(float8);
+float8 __ovld __cnfn convert_float8(float8);
+char16 __ovld __cnfn convert_char16_rte(char16);
+char16 __ovld __cnfn convert_char16_sat_rte(char16);
+char16 __ovld __cnfn convert_char16_rtz(char16);
+char16 __ovld __cnfn convert_char16_sat_rtz(char16);
+char16 __ovld __cnfn convert_char16_rtp(char16);
+char16 __ovld __cnfn convert_char16_sat_rtp(char16);
+char16 __ovld __cnfn convert_char16_rtn(char16);
+char16 __ovld __cnfn convert_char16_sat_rtn(char16);
+char16 __ovld __cnfn convert_char16(char16);
+char16 __ovld __cnfn convert_char16_sat(char16);
+char16 __ovld __cnfn convert_char16_rte(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rte(uchar16);
+char16 __ovld __cnfn convert_char16_rtz(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtz(uchar16);
+char16 __ovld __cnfn convert_char16_rtp(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtp(uchar16);
+char16 __ovld __cnfn convert_char16_rtn(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtn(uchar16);
+char16 __ovld __cnfn convert_char16(uchar16);
+char16 __ovld __cnfn convert_char16_sat(uchar16);
+char16 __ovld __cnfn convert_char16_rte(short16);
+char16 __ovld __cnfn convert_char16_sat_rte(short16);
+char16 __ovld __cnfn convert_char16_rtz(short16);
+char16 __ovld __cnfn convert_char16_sat_rtz(short16);
+char16 __ovld __cnfn convert_char16_rtp(short16);
+char16 __ovld __cnfn convert_char16_sat_rtp(short16);
+char16 __ovld __cnfn convert_char16_rtn(short16);
+char16 __ovld __cnfn convert_char16_sat_rtn(short16);
+char16 __ovld __cnfn convert_char16(short16);
+char16 __ovld __cnfn convert_char16_sat(short16);
+char16 __ovld __cnfn convert_char16_rte(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rte(ushort16);
+char16 __ovld __cnfn convert_char16_rtz(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtz(ushort16);
+char16 __ovld __cnfn convert_char16_rtp(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtp(ushort16);
+char16 __ovld __cnfn convert_char16_rtn(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtn(ushort16);
+char16 __ovld __cnfn convert_char16(ushort16);
+char16 __ovld __cnfn convert_char16_sat(ushort16);
+char16 __ovld __cnfn convert_char16_rte(int16);
+char16 __ovld __cnfn convert_char16_sat_rte(int16);
+char16 __ovld __cnfn convert_char16_rtz(int16);
+char16 __ovld __cnfn convert_char16_sat_rtz(int16);
+char16 __ovld __cnfn convert_char16_rtp(int16);
+char16 __ovld __cnfn convert_char16_sat_rtp(int16);
+char16 __ovld __cnfn convert_char16_rtn(int16);
+char16 __ovld __cnfn convert_char16_sat_rtn(int16);
+char16 __ovld __cnfn convert_char16(int16);
+char16 __ovld __cnfn convert_char16_sat(int16);
+char16 __ovld __cnfn convert_char16_rte(uint16);
+char16 __ovld __cnfn convert_char16_sat_rte(uint16);
+char16 __ovld __cnfn convert_char16_rtz(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtz(uint16);
+char16 __ovld __cnfn convert_char16_rtp(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtp(uint16);
+char16 __ovld __cnfn convert_char16_rtn(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtn(uint16);
+char16 __ovld __cnfn convert_char16(uint16);
+char16 __ovld __cnfn convert_char16_sat(uint16);
+char16 __ovld __cnfn convert_char16_rte(long16);
+char16 __ovld __cnfn convert_char16_sat_rte(long16);
+char16 __ovld __cnfn convert_char16_rtz(long16);
+char16 __ovld __cnfn convert_char16_sat_rtz(long16);
+char16 __ovld __cnfn convert_char16_rtp(long16);
+char16 __ovld __cnfn convert_char16_sat_rtp(long16);
+char16 __ovld __cnfn convert_char16_rtn(long16);
+char16 __ovld __cnfn convert_char16_sat_rtn(long16);
+char16 __ovld __cnfn convert_char16(long16);
+char16 __ovld __cnfn convert_char16_sat(long16);
+char16 __ovld __cnfn convert_char16_rte(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rte(ulong16);
+char16 __ovld __cnfn convert_char16_rtz(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtz(ulong16);
+char16 __ovld __cnfn convert_char16_rtp(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtp(ulong16);
+char16 __ovld __cnfn convert_char16_rtn(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtn(ulong16);
+char16 __ovld __cnfn convert_char16(ulong16);
+char16 __ovld __cnfn convert_char16_sat(ulong16);
+char16 __ovld __cnfn convert_char16_rte(float16);
+char16 __ovld __cnfn convert_char16_sat_rte(float16);
+char16 __ovld __cnfn convert_char16_rtz(float16);
+char16 __ovld __cnfn convert_char16_sat_rtz(float16);
+char16 __ovld __cnfn convert_char16_rtp(float16);
+char16 __ovld __cnfn convert_char16_sat_rtp(float16);
+char16 __ovld __cnfn convert_char16_rtn(float16);
+char16 __ovld __cnfn convert_char16_sat_rtn(float16);
+char16 __ovld __cnfn convert_char16(float16);
+char16 __ovld __cnfn convert_char16_sat(float16);
+uchar16 __ovld __cnfn convert_uchar16_rte(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(char16);
+uchar16 __ovld __cnfn convert_uchar16(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat(char16);
+uchar16 __ovld __cnfn convert_uchar16_rte(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uchar16);
+uchar16 __ovld __cnfn convert_uchar16(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rte(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(short16);
+uchar16 __ovld __cnfn convert_uchar16(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat(short16);
+uchar16 __ovld __cnfn convert_uchar16_rte(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ushort16);
+uchar16 __ovld __cnfn convert_uchar16(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rte(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(int16);
+uchar16 __ovld __cnfn convert_uchar16(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat(int16);
+uchar16 __ovld __cnfn convert_uchar16_rte(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uint16);
+uchar16 __ovld __cnfn convert_uchar16(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rte(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(long16);
+uchar16 __ovld __cnfn convert_uchar16(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat(long16);
+uchar16 __ovld __cnfn convert_uchar16_rte(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ulong16);
+uchar16 __ovld __cnfn convert_uchar16(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rte(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(float16);
+uchar16 __ovld __cnfn convert_uchar16(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat(float16);
+short16 __ovld __cnfn convert_short16_rte(char16);
+short16 __ovld __cnfn convert_short16_sat_rte(char16);
+short16 __ovld __cnfn convert_short16_rtz(char16);
+short16 __ovld __cnfn convert_short16_sat_rtz(char16);
+short16 __ovld __cnfn convert_short16_rtp(char16);
+short16 __ovld __cnfn convert_short16_sat_rtp(char16);
+short16 __ovld __cnfn convert_short16_rtn(char16);
+short16 __ovld __cnfn convert_short16_sat_rtn(char16);
+short16 __ovld __cnfn convert_short16(char16);
+short16 __ovld __cnfn convert_short16_sat(char16);
+short16 __ovld __cnfn convert_short16_rte(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rte(uchar16);
+short16 __ovld __cnfn convert_short16_rtz(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtz(uchar16);
+short16 __ovld __cnfn convert_short16_rtp(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtp(uchar16);
+short16 __ovld __cnfn convert_short16_rtn(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtn(uchar16);
+short16 __ovld __cnfn convert_short16(uchar16);
+short16 __ovld __cnfn convert_short16_sat(uchar16);
+short16 __ovld __cnfn convert_short16_rte(short16);
+short16 __ovld __cnfn convert_short16_sat_rte(short16);
+short16 __ovld __cnfn convert_short16_rtz(short16);
+short16 __ovld __cnfn convert_short16_sat_rtz(short16);
+short16 __ovld __cnfn convert_short16_rtp(short16);
+short16 __ovld __cnfn convert_short16_sat_rtp(short16);
+short16 __ovld __cnfn convert_short16_rtn(short16);
+short16 __ovld __cnfn convert_short16_sat_rtn(short16);
+short16 __ovld __cnfn convert_short16(short16);
+short16 __ovld __cnfn convert_short16_sat(short16);
+short16 __ovld __cnfn convert_short16_rte(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rte(ushort16);
+short16 __ovld __cnfn convert_short16_rtz(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtz(ushort16);
+short16 __ovld __cnfn convert_short16_rtp(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtp(ushort16);
+short16 __ovld __cnfn convert_short16_rtn(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtn(ushort16);
+short16 __ovld __cnfn convert_short16(ushort16);
+short16 __ovld __cnfn convert_short16_sat(ushort16);
+short16 __ovld __cnfn convert_short16_rte(int16);
+short16 __ovld __cnfn convert_short16_sat_rte(int16);
+short16 __ovld __cnfn convert_short16_rtz(int16);
+short16 __ovld __cnfn convert_short16_sat_rtz(int16);
+short16 __ovld __cnfn convert_short16_rtp(int16);
+short16 __ovld __cnfn convert_short16_sat_rtp(int16);
+short16 __ovld __cnfn convert_short16_rtn(int16);
+short16 __ovld __cnfn convert_short16_sat_rtn(int16);
+short16 __ovld __cnfn convert_short16(int16);
+short16 __ovld __cnfn convert_short16_sat(int16);
+short16 __ovld __cnfn convert_short16_rte(uint16);
+short16 __ovld __cnfn convert_short16_sat_rte(uint16);
+short16 __ovld __cnfn convert_short16_rtz(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtz(uint16);
+short16 __ovld __cnfn convert_short16_rtp(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtp(uint16);
+short16 __ovld __cnfn convert_short16_rtn(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtn(uint16);
+short16 __ovld __cnfn convert_short16(uint16);
+short16 __ovld __cnfn convert_short16_sat(uint16);
+short16 __ovld __cnfn convert_short16_rte(long16);
+short16 __ovld __cnfn convert_short16_sat_rte(long16);
+short16 __ovld __cnfn convert_short16_rtz(long16);
+short16 __ovld __cnfn convert_short16_sat_rtz(long16);
+short16 __ovld __cnfn convert_short16_rtp(long16);
+short16 __ovld __cnfn convert_short16_sat_rtp(long16);
+short16 __ovld __cnfn convert_short16_rtn(long16);
+short16 __ovld __cnfn convert_short16_sat_rtn(long16);
+short16 __ovld __cnfn convert_short16(long16);
+short16 __ovld __cnfn convert_short16_sat(long16);
+short16 __ovld __cnfn convert_short16_rte(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rte(ulong16);
+short16 __ovld __cnfn convert_short16_rtz(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtz(ulong16);
+short16 __ovld __cnfn convert_short16_rtp(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtp(ulong16);
+short16 __ovld __cnfn convert_short16_rtn(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtn(ulong16);
+short16 __ovld __cnfn convert_short16(ulong16);
+short16 __ovld __cnfn convert_short16_sat(ulong16);
+short16 __ovld __cnfn convert_short16_rte(float16);
+short16 __ovld __cnfn convert_short16_sat_rte(float16);
+short16 __ovld __cnfn convert_short16_rtz(float16);
+short16 __ovld __cnfn convert_short16_sat_rtz(float16);
+short16 __ovld __cnfn convert_short16_rtp(float16);
+short16 __ovld __cnfn convert_short16_sat_rtp(float16);
+short16 __ovld __cnfn convert_short16_rtn(float16);
+short16 __ovld __cnfn convert_short16_sat_rtn(float16);
+short16 __ovld __cnfn convert_short16(float16);
+short16 __ovld __cnfn convert_short16_sat(float16);
+ushort16 __ovld __cnfn convert_ushort16_rte(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(char16);
+ushort16 __ovld __cnfn convert_ushort16(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat(char16);
+ushort16 __ovld __cnfn convert_ushort16_rte(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uchar16);
+ushort16 __ovld __cnfn convert_ushort16(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rte(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(short16);
+ushort16 __ovld __cnfn convert_ushort16(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat(short16);
+ushort16 __ovld __cnfn convert_ushort16_rte(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ushort16);
+ushort16 __ovld __cnfn convert_ushort16(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rte(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(int16);
+ushort16 __ovld __cnfn convert_ushort16(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat(int16);
+ushort16 __ovld __cnfn convert_ushort16_rte(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uint16);
+ushort16 __ovld __cnfn convert_ushort16(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rte(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(long16);
+ushort16 __ovld __cnfn convert_ushort16(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat(long16);
+ushort16 __ovld __cnfn convert_ushort16_rte(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ulong16);
+ushort16 __ovld __cnfn convert_ushort16(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rte(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(float16);
+ushort16 __ovld __cnfn convert_ushort16(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat(float16);
+int16 __ovld __cnfn convert_int16_rte(char16);
+int16 __ovld __cnfn convert_int16_sat_rte(char16);
+int16 __ovld __cnfn convert_int16_rtz(char16);
+int16 __ovld __cnfn convert_int16_sat_rtz(char16);
+int16 __ovld __cnfn convert_int16_rtp(char16);
+int16 __ovld __cnfn convert_int16_sat_rtp(char16);
+int16 __ovld __cnfn convert_int16_rtn(char16);
+int16 __ovld __cnfn convert_int16_sat_rtn(char16);
+int16 __ovld __cnfn convert_int16(char16);
+int16 __ovld __cnfn convert_int16_sat(char16);
+int16 __ovld __cnfn convert_int16_rte(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rte(uchar16);
+int16 __ovld __cnfn convert_int16_rtz(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtz(uchar16);
+int16 __ovld __cnfn convert_int16_rtp(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtp(uchar16);
+int16 __ovld __cnfn convert_int16_rtn(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtn(uchar16);
+int16 __ovld __cnfn convert_int16(uchar16);
+int16 __ovld __cnfn convert_int16_sat(uchar16);
+int16 __ovld __cnfn convert_int16_rte(short16);
+int16 __ovld __cnfn convert_int16_sat_rte(short16);
+int16 __ovld __cnfn convert_int16_rtz(short16);
+int16 __ovld __cnfn convert_int16_sat_rtz(short16);
+int16 __ovld __cnfn convert_int16_rtp(short16);
+int16 __ovld __cnfn convert_int16_sat_rtp(short16);
+int16 __ovld __cnfn convert_int16_rtn(short16);
+int16 __ovld __cnfn convert_int16_sat_rtn(short16);
+int16 __ovld __cnfn convert_int16(short16);
+int16 __ovld __cnfn convert_int16_sat(short16);
+int16 __ovld __cnfn convert_int16_rte(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rte(ushort16);
+int16 __ovld __cnfn convert_int16_rtz(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtz(ushort16);
+int16 __ovld __cnfn convert_int16_rtp(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtp(ushort16);
+int16 __ovld __cnfn convert_int16_rtn(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtn(ushort16);
+int16 __ovld __cnfn convert_int16(ushort16);
+int16 __ovld __cnfn convert_int16_sat(ushort16);
+int16 __ovld __cnfn convert_int16_rte(int16);
+int16 __ovld __cnfn convert_int16_sat_rte(int16);
+int16 __ovld __cnfn convert_int16_rtz(int16);
+int16 __ovld __cnfn convert_int16_sat_rtz(int16);
+int16 __ovld __cnfn convert_int16_rtp(int16);
+int16 __ovld __cnfn convert_int16_sat_rtp(int16);
+int16 __ovld __cnfn convert_int16_rtn(int16);
+int16 __ovld __cnfn convert_int16_sat_rtn(int16);
+int16 __ovld __cnfn convert_int16(int16);
+int16 __ovld __cnfn convert_int16_sat(int16);
+int16 __ovld __cnfn convert_int16_rte(uint16);
+int16 __ovld __cnfn convert_int16_sat_rte(uint16);
+int16 __ovld __cnfn convert_int16_rtz(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtz(uint16);
+int16 __ovld __cnfn convert_int16_rtp(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtp(uint16);
+int16 __ovld __cnfn convert_int16_rtn(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtn(uint16);
+int16 __ovld __cnfn convert_int16(uint16);
+int16 __ovld __cnfn convert_int16_sat(uint16);
+int16 __ovld __cnfn convert_int16_rte(long16);
+int16 __ovld __cnfn convert_int16_sat_rte(long16);
+int16 __ovld __cnfn convert_int16_rtz(long16);
+int16 __ovld __cnfn convert_int16_sat_rtz(long16);
+int16 __ovld __cnfn convert_int16_rtp(long16);
+int16 __ovld __cnfn convert_int16_sat_rtp(long16);
+int16 __ovld __cnfn convert_int16_rtn(long16);
+int16 __ovld __cnfn convert_int16_sat_rtn(long16);
+int16 __ovld __cnfn convert_int16(long16);
+int16 __ovld __cnfn convert_int16_sat(long16);
+int16 __ovld __cnfn convert_int16_rte(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rte(ulong16);
+int16 __ovld __cnfn convert_int16_rtz(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtz(ulong16);
+int16 __ovld __cnfn convert_int16_rtp(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtp(ulong16);
+int16 __ovld __cnfn convert_int16_rtn(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtn(ulong16);
+int16 __ovld __cnfn convert_int16(ulong16);
+int16 __ovld __cnfn convert_int16_sat(ulong16);
+int16 __ovld __cnfn convert_int16_rte(float16);
+int16 __ovld __cnfn convert_int16_sat_rte(float16);
+int16 __ovld __cnfn convert_int16_rtz(float16);
+int16 __ovld __cnfn convert_int16_sat_rtz(float16);
+int16 __ovld __cnfn convert_int16_rtp(float16);
+int16 __ovld __cnfn convert_int16_sat_rtp(float16);
+int16 __ovld __cnfn convert_int16_rtn(float16);
+int16 __ovld __cnfn convert_int16_sat_rtn(float16);
+int16 __ovld __cnfn convert_int16(float16);
+int16 __ovld __cnfn convert_int16_sat(float16);
+uint16 __ovld __cnfn convert_uint16_rte(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(char16);
+uint16 __ovld __cnfn convert_uint16_rtz(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(char16);
+uint16 __ovld __cnfn convert_uint16_rtp(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(char16);
+uint16 __ovld __cnfn convert_uint16_rtn(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(char16);
+uint16 __ovld __cnfn convert_uint16(char16);
+uint16 __ovld __cnfn convert_uint16_sat(char16);
+uint16 __ovld __cnfn convert_uint16_rte(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtz(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtp(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtn(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(uchar16);
+uint16 __ovld __cnfn convert_uint16(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat(uchar16);
+uint16 __ovld __cnfn convert_uint16_rte(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(short16);
+uint16 __ovld __cnfn convert_uint16_rtz(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(short16);
+uint16 __ovld __cnfn convert_uint16_rtp(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(short16);
+uint16 __ovld __cnfn convert_uint16_rtn(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(short16);
+uint16 __ovld __cnfn convert_uint16(short16);
+uint16 __ovld __cnfn convert_uint16_sat(short16);
+uint16 __ovld __cnfn convert_uint16_rte(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtz(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtp(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtn(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(ushort16);
+uint16 __ovld __cnfn convert_uint16(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat(ushort16);
+uint16 __ovld __cnfn convert_uint16_rte(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(int16);
+uint16 __ovld __cnfn convert_uint16_rtz(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(int16);
+uint16 __ovld __cnfn convert_uint16_rtp(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(int16);
+uint16 __ovld __cnfn convert_uint16_rtn(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(int16);
+uint16 __ovld __cnfn convert_uint16(int16);
+uint16 __ovld __cnfn convert_uint16_sat(int16);
+uint16 __ovld __cnfn convert_uint16_rte(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(uint16);
+uint16 __ovld __cnfn convert_uint16_rtz(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(uint16);
+uint16 __ovld __cnfn convert_uint16_rtp(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(uint16);
+uint16 __ovld __cnfn convert_uint16_rtn(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(uint16);
+uint16 __ovld __cnfn convert_uint16(uint16);
+uint16 __ovld __cnfn convert_uint16_sat(uint16);
+uint16 __ovld __cnfn convert_uint16_rte(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(long16);
+uint16 __ovld __cnfn convert_uint16_rtz(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(long16);
+uint16 __ovld __cnfn convert_uint16_rtp(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(long16);
+uint16 __ovld __cnfn convert_uint16_rtn(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(long16);
+uint16 __ovld __cnfn convert_uint16(long16);
+uint16 __ovld __cnfn convert_uint16_sat(long16);
+uint16 __ovld __cnfn convert_uint16_rte(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtz(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtp(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtn(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(ulong16);
+uint16 __ovld __cnfn convert_uint16(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat(ulong16);
+uint16 __ovld __cnfn convert_uint16_rte(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(float16);
+uint16 __ovld __cnfn convert_uint16_rtz(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(float16);
+uint16 __ovld __cnfn convert_uint16_rtp(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(float16);
+uint16 __ovld __cnfn convert_uint16_rtn(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(float16);
+uint16 __ovld __cnfn convert_uint16(float16);
+uint16 __ovld __cnfn convert_uint16_sat(float16);
+long16 __ovld __cnfn convert_long16_rte(char16);
+long16 __ovld __cnfn convert_long16_sat_rte(char16);
+long16 __ovld __cnfn convert_long16_rtz(char16);
+long16 __ovld __cnfn convert_long16_sat_rtz(char16);
+long16 __ovld __cnfn convert_long16_rtp(char16);
+long16 __ovld __cnfn convert_long16_sat_rtp(char16);
+long16 __ovld __cnfn convert_long16_rtn(char16);
+long16 __ovld __cnfn convert_long16_sat_rtn(char16);
+long16 __ovld __cnfn convert_long16(char16);
+long16 __ovld __cnfn convert_long16_sat(char16);
+long16 __ovld __cnfn convert_long16_rte(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rte(uchar16);
+long16 __ovld __cnfn convert_long16_rtz(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtz(uchar16);
+long16 __ovld __cnfn convert_long16_rtp(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtp(uchar16);
+long16 __ovld __cnfn convert_long16_rtn(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtn(uchar16);
+long16 __ovld __cnfn convert_long16(uchar16);
+long16 __ovld __cnfn convert_long16_sat(uchar16);
+long16 __ovld __cnfn convert_long16_rte(short16);
+long16 __ovld __cnfn convert_long16_sat_rte(short16);
+long16 __ovld __cnfn convert_long16_rtz(short16);
+long16 __ovld __cnfn convert_long16_sat_rtz(short16);
+long16 __ovld __cnfn convert_long16_rtp(short16);
+long16 __ovld __cnfn convert_long16_sat_rtp(short16);
+long16 __ovld __cnfn convert_long16_rtn(short16);
+long16 __ovld __cnfn convert_long16_sat_rtn(short16);
+long16 __ovld __cnfn convert_long16(short16);
+long16 __ovld __cnfn convert_long16_sat(short16);
+long16 __ovld __cnfn convert_long16_rte(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rte(ushort16);
+long16 __ovld __cnfn convert_long16_rtz(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtz(ushort16);
+long16 __ovld __cnfn convert_long16_rtp(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtp(ushort16);
+long16 __ovld __cnfn convert_long16_rtn(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtn(ushort16);
+long16 __ovld __cnfn convert_long16(ushort16);
+long16 __ovld __cnfn convert_long16_sat(ushort16);
+long16 __ovld __cnfn convert_long16_rte(int16);
+long16 __ovld __cnfn convert_long16_sat_rte(int16);
+long16 __ovld __cnfn convert_long16_rtz(int16);
+long16 __ovld __cnfn convert_long16_sat_rtz(int16);
+long16 __ovld __cnfn convert_long16_rtp(int16);
+long16 __ovld __cnfn convert_long16_sat_rtp(int16);
+long16 __ovld __cnfn convert_long16_rtn(int16);
+long16 __ovld __cnfn convert_long16_sat_rtn(int16);
+long16 __ovld __cnfn convert_long16(int16);
+long16 __ovld __cnfn convert_long16_sat(int16);
+long16 __ovld __cnfn convert_long16_rte(uint16);
+long16 __ovld __cnfn convert_long16_sat_rte(uint16);
+long16 __ovld __cnfn convert_long16_rtz(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtz(uint16);
+long16 __ovld __cnfn convert_long16_rtp(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtp(uint16);
+long16 __ovld __cnfn convert_long16_rtn(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtn(uint16);
+long16 __ovld __cnfn convert_long16(uint16);
+long16 __ovld __cnfn convert_long16_sat(uint16);
+long16 __ovld __cnfn convert_long16_rte(long16);
+long16 __ovld __cnfn convert_long16_sat_rte(long16);
+long16 __ovld __cnfn convert_long16_rtz(long16);
+long16 __ovld __cnfn convert_long16_sat_rtz(long16);
+long16 __ovld __cnfn convert_long16_rtp(long16);
+long16 __ovld __cnfn convert_long16_sat_rtp(long16);
+long16 __ovld __cnfn convert_long16_rtn(long16);
+long16 __ovld __cnfn convert_long16_sat_rtn(long16);
+long16 __ovld __cnfn convert_long16(long16);
+long16 __ovld __cnfn convert_long16_sat(long16);
+long16 __ovld __cnfn convert_long16_rte(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rte(ulong16);
+long16 __ovld __cnfn convert_long16_rtz(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtz(ulong16);
+long16 __ovld __cnfn convert_long16_rtp(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtp(ulong16);
+long16 __ovld __cnfn convert_long16_rtn(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtn(ulong16);
+long16 __ovld __cnfn convert_long16(ulong16);
+long16 __ovld __cnfn convert_long16_sat(ulong16);
+long16 __ovld __cnfn convert_long16_rte(float16);
+long16 __ovld __cnfn convert_long16_sat_rte(float16);
+long16 __ovld __cnfn convert_long16_rtz(float16);
+long16 __ovld __cnfn convert_long16_sat_rtz(float16);
+long16 __ovld __cnfn convert_long16_rtp(float16);
+long16 __ovld __cnfn convert_long16_sat_rtp(float16);
+long16 __ovld __cnfn convert_long16_rtn(float16);
+long16 __ovld __cnfn convert_long16_sat_rtn(float16);
+long16 __ovld __cnfn convert_long16(float16);
+long16 __ovld __cnfn convert_long16_sat(float16);
+ulong16 __ovld __cnfn convert_ulong16_rte(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(char16);
+ulong16 __ovld __cnfn convert_ulong16(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat(char16);
+ulong16 __ovld __cnfn convert_ulong16_rte(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uchar16);
+ulong16 __ovld __cnfn convert_ulong16(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rte(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(short16);
+ulong16 __ovld __cnfn convert_ulong16(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat(short16);
+ulong16 __ovld __cnfn convert_ulong16_rte(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ushort16);
+ulong16 __ovld __cnfn convert_ulong16(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rte(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(int16);
+ulong16 __ovld __cnfn convert_ulong16(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat(int16);
+ulong16 __ovld __cnfn convert_ulong16_rte(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uint16);
+ulong16 __ovld __cnfn convert_ulong16(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rte(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(long16);
+ulong16 __ovld __cnfn convert_ulong16(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat(long16);
+ulong16 __ovld __cnfn convert_ulong16_rte(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ulong16);
+ulong16 __ovld __cnfn convert_ulong16(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rte(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(float16);
+ulong16 __ovld __cnfn convert_ulong16(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat(float16);
+float16 __ovld __cnfn convert_float16_rte(char16);
+float16 __ovld __cnfn convert_float16_rtz(char16);
+float16 __ovld __cnfn convert_float16_rtp(char16);
+float16 __ovld __cnfn convert_float16_rtn(char16);
+float16 __ovld __cnfn convert_float16(char16);
+float16 __ovld __cnfn convert_float16_rte(uchar16);
+float16 __ovld __cnfn convert_float16_rtz(uchar16);
+float16 __ovld __cnfn convert_float16_rtp(uchar16);
+float16 __ovld __cnfn convert_float16_rtn(uchar16);
+float16 __ovld __cnfn convert_float16(uchar16);
+float16 __ovld __cnfn convert_float16_rte(short16);
+float16 __ovld __cnfn convert_float16_rtz(short16);
+float16 __ovld __cnfn convert_float16_rtp(short16);
+float16 __ovld __cnfn convert_float16_rtn(short16);
+float16 __ovld __cnfn convert_float16(short16);
+float16 __ovld __cnfn convert_float16_rte(ushort16);
+float16 __ovld __cnfn convert_float16_rtz(ushort16);
+float16 __ovld __cnfn convert_float16_rtp(ushort16);
+float16 __ovld __cnfn convert_float16_rtn(ushort16);
+float16 __ovld __cnfn convert_float16(ushort16);
+float16 __ovld __cnfn convert_float16_rte(int16);
+float16 __ovld __cnfn convert_float16_rtz(int16);
+float16 __ovld __cnfn convert_float16_rtp(int16);
+float16 __ovld __cnfn convert_float16_rtn(int16);
+float16 __ovld __cnfn convert_float16(int16);
+float16 __ovld __cnfn convert_float16_rte(uint16);
+float16 __ovld __cnfn convert_float16_rtz(uint16);
+float16 __ovld __cnfn convert_float16_rtp(uint16);
+float16 __ovld __cnfn convert_float16_rtn(uint16);
+float16 __ovld __cnfn convert_float16(uint16);
+float16 __ovld __cnfn convert_float16_rte(long16);
+float16 __ovld __cnfn convert_float16_rtz(long16);
+float16 __ovld __cnfn convert_float16_rtp(long16);
+float16 __ovld __cnfn convert_float16_rtn(long16);
+float16 __ovld __cnfn convert_float16(long16);
+float16 __ovld __cnfn convert_float16_rte(ulong16);
+float16 __ovld __cnfn convert_float16_rtz(ulong16);
+float16 __ovld __cnfn convert_float16_rtp(ulong16);
+float16 __ovld __cnfn convert_float16_rtn(ulong16);
+float16 __ovld __cnfn convert_float16(ulong16);
+float16 __ovld __cnfn convert_float16_rte(float16);
+float16 __ovld __cnfn convert_float16_rtz(float16);
+float16 __ovld __cnfn convert_float16_rtp(float16);
+float16 __ovld __cnfn convert_float16_rtn(float16);
+float16 __ovld __cnfn convert_float16(float16);
+
+// Conversions that take or return a double (scalar or vector) value.
+
+#ifdef cl_khr_fp64
+char __ovld __cnfn convert_char(double);
+char __ovld __cnfn convert_char_rte(double);
+char __ovld __cnfn convert_char_rtn(double);
+char __ovld __cnfn convert_char_rtp(double);
+char __ovld __cnfn convert_char_rtz(double);
+char __ovld __cnfn convert_char_sat(double);
+char __ovld __cnfn convert_char_sat_rte(double);
+char __ovld __cnfn convert_char_sat_rtn(double);
+char __ovld __cnfn convert_char_sat_rtp(double);
+char __ovld __cnfn convert_char_sat_rtz(double);
+char2 __ovld __cnfn convert_char2(double2);
+char2 __ovld __cnfn convert_char2_rte(double2);
+char2 __ovld __cnfn convert_char2_rtn(double2);
+char2 __ovld __cnfn convert_char2_rtp(double2);
+char2 __ovld __cnfn convert_char2_rtz(double2);
+char2 __ovld __cnfn convert_char2_sat(double2);
+char2 __ovld __cnfn convert_char2_sat_rte(double2);
+char2 __ovld __cnfn convert_char2_sat_rtn(double2);
+char2 __ovld __cnfn convert_char2_sat_rtp(double2);
+char2 __ovld __cnfn convert_char2_sat_rtz(double2);
+char3 __ovld __cnfn convert_char3(double3);
+char3 __ovld __cnfn convert_char3_rte(double3);
+char3 __ovld __cnfn convert_char3_rtn(double3);
+char3 __ovld __cnfn convert_char3_rtp(double3);
+char3 __ovld __cnfn convert_char3_rtz(double3);
+char3 __ovld __cnfn convert_char3_sat(double3);
+char3 __ovld __cnfn convert_char3_sat_rte(double3);
+char3 __ovld __cnfn convert_char3_sat_rtn(double3);
+char3 __ovld __cnfn convert_char3_sat_rtp(double3);
+char3 __ovld __cnfn convert_char3_sat_rtz(double3);
+char4 __ovld __cnfn convert_char4(double4);
+char4 __ovld __cnfn convert_char4_rte(double4);
+char4 __ovld __cnfn convert_char4_rtn(double4);
+char4 __ovld __cnfn convert_char4_rtp(double4);
+char4 __ovld __cnfn convert_char4_rtz(double4);
+char4 __ovld __cnfn convert_char4_sat(double4);
+char4 __ovld __cnfn convert_char4_sat_rte(double4);
+char4 __ovld __cnfn convert_char4_sat_rtn(double4);
+char4 __ovld __cnfn convert_char4_sat_rtp(double4);
+char4 __ovld __cnfn convert_char4_sat_rtz(double4);
+char8 __ovld __cnfn convert_char8(double8);
+char8 __ovld __cnfn convert_char8_rte(double8);
+char8 __ovld __cnfn convert_char8_rtn(double8);
+char8 __ovld __cnfn convert_char8_rtp(double8);
+char8 __ovld __cnfn convert_char8_rtz(double8);
+char8 __ovld __cnfn convert_char8_sat(double8);
+char8 __ovld __cnfn convert_char8_sat_rte(double8);
+char8 __ovld __cnfn convert_char8_sat_rtn(double8);
+char8 __ovld __cnfn convert_char8_sat_rtp(double8);
+char8 __ovld __cnfn convert_char8_sat_rtz(double8);
+char16 __ovld __cnfn convert_char16(double16);
+char16 __ovld __cnfn convert_char16_rte(double16);
+char16 __ovld __cnfn convert_char16_rtn(double16);
+char16 __ovld __cnfn convert_char16_rtp(double16);
+char16 __ovld __cnfn convert_char16_rtz(double16);
+char16 __ovld __cnfn convert_char16_sat(double16);
+char16 __ovld __cnfn convert_char16_sat_rte(double16);
+char16 __ovld __cnfn convert_char16_sat_rtn(double16);
+char16 __ovld __cnfn convert_char16_sat_rtp(double16);
+char16 __ovld __cnfn convert_char16_sat_rtz(double16);
+
+uchar __ovld __cnfn convert_uchar(double);
+uchar __ovld __cnfn convert_uchar_rte(double);
+uchar __ovld __cnfn convert_uchar_rtn(double);
+uchar __ovld __cnfn convert_uchar_rtp(double);
+uchar __ovld __cnfn convert_uchar_rtz(double);
+uchar __ovld __cnfn convert_uchar_sat(double);
+uchar __ovld __cnfn convert_uchar_sat_rte(double);
+uchar __ovld __cnfn convert_uchar_sat_rtn(double);
+uchar __ovld __cnfn convert_uchar_sat_rtp(double);
+uchar __ovld __cnfn convert_uchar_sat_rtz(double);
+uchar2 __ovld __cnfn convert_uchar2(double2);
+uchar2 __ovld __cnfn convert_uchar2_rte(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(double2);
+uchar3 __ovld __cnfn convert_uchar3(double3);
+uchar3 __ovld __cnfn convert_uchar3_rte(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(double3);
+uchar4 __ovld __cnfn convert_uchar4(double4);
+uchar4 __ovld __cnfn convert_uchar4_rte(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(double4);
+uchar8 __ovld __cnfn convert_uchar8(double8);
+uchar8 __ovld __cnfn convert_uchar8_rte(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(double8);
+uchar16 __ovld __cnfn convert_uchar16(double16);
+uchar16 __ovld __cnfn convert_uchar16_rte(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(double16);
+
+short __ovld __cnfn convert_short(double);
+short __ovld __cnfn convert_short_rte(double);
+short __ovld __cnfn convert_short_rtn(double);
+short __ovld __cnfn convert_short_rtp(double);
+short __ovld __cnfn convert_short_rtz(double);
+short __ovld __cnfn convert_short_sat(double);
+short __ovld __cnfn convert_short_sat_rte(double);
+short __ovld __cnfn convert_short_sat_rtn(double);
+short __ovld __cnfn convert_short_sat_rtp(double);
+short __ovld __cnfn convert_short_sat_rtz(double);
+short2 __ovld __cnfn convert_short2(double2);
+short2 __ovld __cnfn convert_short2_rte(double2);
+short2 __ovld __cnfn convert_short2_rtn(double2);
+short2 __ovld __cnfn convert_short2_rtp(double2);
+short2 __ovld __cnfn convert_short2_rtz(double2);
+short2 __ovld __cnfn convert_short2_sat(double2);
+short2 __ovld __cnfn convert_short2_sat_rte(double2);
+short2 __ovld __cnfn convert_short2_sat_rtn(double2);
+short2 __ovld __cnfn convert_short2_sat_rtp(double2);
+short2 __ovld __cnfn convert_short2_sat_rtz(double2);
+short3 __ovld __cnfn convert_short3(double3);
+short3 __ovld __cnfn convert_short3_rte(double3);
+short3 __ovld __cnfn convert_short3_rtn(double3);
+short3 __ovld __cnfn convert_short3_rtp(double3);
+short3 __ovld __cnfn convert_short3_rtz(double3);
+short3 __ovld __cnfn convert_short3_sat(double3);
+short3 __ovld __cnfn convert_short3_sat_rte(double3);
+short3 __ovld __cnfn convert_short3_sat_rtn(double3);
+short3 __ovld __cnfn convert_short3_sat_rtp(double3);
+short3 __ovld __cnfn convert_short3_sat_rtz(double3);
+short4 __ovld __cnfn convert_short4(double4);
+short4 __ovld __cnfn convert_short4_rte(double4);
+short4 __ovld __cnfn convert_short4_rtn(double4);
+short4 __ovld __cnfn convert_short4_rtp(double4);
+short4 __ovld __cnfn convert_short4_rtz(double4);
+short4 __ovld __cnfn convert_short4_sat(double4);
+short4 __ovld __cnfn convert_short4_sat_rte(double4);
+short4 __ovld __cnfn convert_short4_sat_rtn(double4);
+short4 __ovld __cnfn convert_short4_sat_rtp(double4);
+short4 __ovld __cnfn convert_short4_sat_rtz(double4);
+short8 __ovld __cnfn convert_short8(double8);
+short8 __ovld __cnfn convert_short8_rte(double8);
+short8 __ovld __cnfn convert_short8_rtn(double8);
+short8 __ovld __cnfn convert_short8_rtp(double8);
+short8 __ovld __cnfn convert_short8_rtz(double8);
+short8 __ovld __cnfn convert_short8_sat(double8);
+short8 __ovld __cnfn convert_short8_sat_rte(double8);
+short8 __ovld __cnfn convert_short8_sat_rtn(double8);
+short8 __ovld __cnfn convert_short8_sat_rtp(double8);
+short8 __ovld __cnfn convert_short8_sat_rtz(double8);
+short16 __ovld __cnfn convert_short16(double16);
+short16 __ovld __cnfn convert_short16_rte(double16);
+short16 __ovld __cnfn convert_short16_rtn(double16);
+short16 __ovld __cnfn convert_short16_rtp(double16);
+short16 __ovld __cnfn convert_short16_rtz(double16);
+short16 __ovld __cnfn convert_short16_sat(double16);
+short16 __ovld __cnfn convert_short16_sat_rte(double16);
+short16 __ovld __cnfn convert_short16_sat_rtn(double16);
+short16 __ovld __cnfn convert_short16_sat_rtp(double16);
+short16 __ovld __cnfn convert_short16_sat_rtz(double16);
+// double -> ushort: convert_ushort{,2,3,4,8,16} in plain, _rt{e,n,p,z}, _sat and _sat_rt{e,n,p,z} forms.
+ushort __ovld __cnfn convert_ushort(double);
+ushort __ovld __cnfn convert_ushort_rte(double);
+ushort __ovld __cnfn convert_ushort_rtn(double);
+ushort __ovld __cnfn convert_ushort_rtp(double);
+ushort __ovld __cnfn convert_ushort_rtz(double);
+ushort __ovld __cnfn convert_ushort_sat(double);
+ushort __ovld __cnfn convert_ushort_sat_rte(double);
+ushort __ovld __cnfn convert_ushort_sat_rtn(double);
+ushort __ovld __cnfn convert_ushort_sat_rtp(double);
+ushort __ovld __cnfn convert_ushort_sat_rtz(double);
+ushort2 __ovld __cnfn convert_ushort2(double2);
+ushort2 __ovld __cnfn convert_ushort2_rte(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(double2);
+ushort3 __ovld __cnfn convert_ushort3(double3);
+ushort3 __ovld __cnfn convert_ushort3_rte(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(double3);
+ushort4 __ovld __cnfn convert_ushort4(double4);
+ushort4 __ovld __cnfn convert_ushort4_rte(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(double4);
+ushort8 __ovld __cnfn convert_ushort8(double8);
+ushort8 __ovld __cnfn convert_ushort8_rte(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(double8);
+ushort16 __ovld __cnfn convert_ushort16(double16);
+ushort16 __ovld __cnfn convert_ushort16_rte(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(double16);
+// double -> int: convert_int{,2,3,4,8,16} in plain, _rt{e,n,p,z}, _sat and _sat_rt{e,n,p,z} forms.
+int __ovld __cnfn convert_int(double);
+int __ovld __cnfn convert_int_rte(double);
+int __ovld __cnfn convert_int_rtn(double);
+int __ovld __cnfn convert_int_rtp(double);
+int __ovld __cnfn convert_int_rtz(double);
+int __ovld __cnfn convert_int_sat(double);
+int __ovld __cnfn convert_int_sat_rte(double);
+int __ovld __cnfn convert_int_sat_rtn(double);
+int __ovld __cnfn convert_int_sat_rtp(double);
+int __ovld __cnfn convert_int_sat_rtz(double);
+int2 __ovld __cnfn convert_int2(double2);
+int2 __ovld __cnfn convert_int2_rte(double2);
+int2 __ovld __cnfn convert_int2_rtn(double2);
+int2 __ovld __cnfn convert_int2_rtp(double2);
+int2 __ovld __cnfn convert_int2_rtz(double2);
+int2 __ovld __cnfn convert_int2_sat(double2);
+int2 __ovld __cnfn convert_int2_sat_rte(double2);
+int2 __ovld __cnfn convert_int2_sat_rtn(double2);
+int2 __ovld __cnfn convert_int2_sat_rtp(double2);
+int2 __ovld __cnfn convert_int2_sat_rtz(double2);
+int3 __ovld __cnfn convert_int3(double3);
+int3 __ovld __cnfn convert_int3_rte(double3);
+int3 __ovld __cnfn convert_int3_rtn(double3);
+int3 __ovld __cnfn convert_int3_rtp(double3);
+int3 __ovld __cnfn convert_int3_rtz(double3);
+int3 __ovld __cnfn convert_int3_sat(double3);
+int3 __ovld __cnfn convert_int3_sat_rte(double3);
+int3 __ovld __cnfn convert_int3_sat_rtn(double3);
+int3 __ovld __cnfn convert_int3_sat_rtp(double3);
+int3 __ovld __cnfn convert_int3_sat_rtz(double3);
+int4 __ovld __cnfn convert_int4(double4);
+int4 __ovld __cnfn convert_int4_rte(double4);
+int4 __ovld __cnfn convert_int4_rtn(double4);
+int4 __ovld __cnfn convert_int4_rtp(double4);
+int4 __ovld __cnfn convert_int4_rtz(double4);
+int4 __ovld __cnfn convert_int4_sat(double4);
+int4 __ovld __cnfn convert_int4_sat_rte(double4);
+int4 __ovld __cnfn convert_int4_sat_rtn(double4);
+int4 __ovld __cnfn convert_int4_sat_rtp(double4);
+int4 __ovld __cnfn convert_int4_sat_rtz(double4);
+int8 __ovld __cnfn convert_int8(double8);
+int8 __ovld __cnfn convert_int8_rte(double8);
+int8 __ovld __cnfn convert_int8_rtn(double8);
+int8 __ovld __cnfn convert_int8_rtp(double8);
+int8 __ovld __cnfn convert_int8_rtz(double8);
+int8 __ovld __cnfn convert_int8_sat(double8);
+int8 __ovld __cnfn convert_int8_sat_rte(double8);
+int8 __ovld __cnfn convert_int8_sat_rtn(double8);
+int8 __ovld __cnfn convert_int8_sat_rtp(double8);
+int8 __ovld __cnfn convert_int8_sat_rtz(double8);
+int16 __ovld __cnfn convert_int16(double16);
+int16 __ovld __cnfn convert_int16_rte(double16);
+int16 __ovld __cnfn convert_int16_rtn(double16);
+int16 __ovld __cnfn convert_int16_rtp(double16);
+int16 __ovld __cnfn convert_int16_rtz(double16);
+int16 __ovld __cnfn convert_int16_sat(double16);
+int16 __ovld __cnfn convert_int16_sat_rte(double16);
+int16 __ovld __cnfn convert_int16_sat_rtn(double16);
+int16 __ovld __cnfn convert_int16_sat_rtp(double16);
+int16 __ovld __cnfn convert_int16_sat_rtz(double16);
+// double -> uint: convert_uint{,2,3,4,8,16} in plain, _rt{e,n,p,z}, _sat and _sat_rt{e,n,p,z} forms.
+uint __ovld __cnfn convert_uint(double);
+uint __ovld __cnfn convert_uint_rte(double);
+uint __ovld __cnfn convert_uint_rtn(double);
+uint __ovld __cnfn convert_uint_rtp(double);
+uint __ovld __cnfn convert_uint_rtz(double);
+uint __ovld __cnfn convert_uint_sat(double);
+uint __ovld __cnfn convert_uint_sat_rte(double);
+uint __ovld __cnfn convert_uint_sat_rtn(double);
+uint __ovld __cnfn convert_uint_sat_rtp(double);
+uint __ovld __cnfn convert_uint_sat_rtz(double);
+uint2 __ovld __cnfn convert_uint2(double2);
+uint2 __ovld __cnfn convert_uint2_rte(double2);
+uint2 __ovld __cnfn convert_uint2_rtn(double2);
+uint2 __ovld __cnfn convert_uint2_rtp(double2);
+uint2 __ovld __cnfn convert_uint2_rtz(double2);
+uint2 __ovld __cnfn convert_uint2_sat(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(double2);
+uint3 __ovld __cnfn convert_uint3(double3);
+uint3 __ovld __cnfn convert_uint3_rte(double3);
+uint3 __ovld __cnfn convert_uint3_rtn(double3);
+uint3 __ovld __cnfn convert_uint3_rtp(double3);
+uint3 __ovld __cnfn convert_uint3_rtz(double3);
+uint3 __ovld __cnfn convert_uint3_sat(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(double3);
+uint4 __ovld __cnfn convert_uint4(double4);
+uint4 __ovld __cnfn convert_uint4_rte(double4);
+uint4 __ovld __cnfn convert_uint4_rtn(double4);
+uint4 __ovld __cnfn convert_uint4_rtp(double4);
+uint4 __ovld __cnfn convert_uint4_rtz(double4);
+uint4 __ovld __cnfn convert_uint4_sat(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(double4);
+uint8 __ovld __cnfn convert_uint8(double8);
+uint8 __ovld __cnfn convert_uint8_rte(double8);
+uint8 __ovld __cnfn convert_uint8_rtn(double8);
+uint8 __ovld __cnfn convert_uint8_rtp(double8);
+uint8 __ovld __cnfn convert_uint8_rtz(double8);
+uint8 __ovld __cnfn convert_uint8_sat(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(double8);
+uint16 __ovld __cnfn convert_uint16(double16);
+uint16 __ovld __cnfn convert_uint16_rte(double16);
+uint16 __ovld __cnfn convert_uint16_rtn(double16);
+uint16 __ovld __cnfn convert_uint16_rtp(double16);
+uint16 __ovld __cnfn convert_uint16_rtz(double16);
+uint16 __ovld __cnfn convert_uint16_sat(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(double16);
+// double -> long: convert_long{,2,3,4,8,16} in plain, _rt{e,n,p,z}, _sat and _sat_rt{e,n,p,z} forms.
+long __ovld __cnfn convert_long(double);
+long __ovld __cnfn convert_long_rte(double);
+long __ovld __cnfn convert_long_rtn(double);
+long __ovld __cnfn convert_long_rtp(double);
+long __ovld __cnfn convert_long_rtz(double);
+long __ovld __cnfn convert_long_sat(double);
+long __ovld __cnfn convert_long_sat_rte(double);
+long __ovld __cnfn convert_long_sat_rtn(double);
+long __ovld __cnfn convert_long_sat_rtp(double);
+long __ovld __cnfn convert_long_sat_rtz(double);
+long2 __ovld __cnfn convert_long2(double2);
+long2 __ovld __cnfn convert_long2_rte(double2);
+long2 __ovld __cnfn convert_long2_rtn(double2);
+long2 __ovld __cnfn convert_long2_rtp(double2);
+long2 __ovld __cnfn convert_long2_rtz(double2);
+long2 __ovld __cnfn convert_long2_sat(double2);
+long2 __ovld __cnfn convert_long2_sat_rte(double2);
+long2 __ovld __cnfn convert_long2_sat_rtn(double2);
+long2 __ovld __cnfn convert_long2_sat_rtp(double2);
+long2 __ovld __cnfn convert_long2_sat_rtz(double2);
+long3 __ovld __cnfn convert_long3(double3);
+long3 __ovld __cnfn convert_long3_rte(double3);
+long3 __ovld __cnfn convert_long3_rtn(double3);
+long3 __ovld __cnfn convert_long3_rtp(double3);
+long3 __ovld __cnfn convert_long3_rtz(double3);
+long3 __ovld __cnfn convert_long3_sat(double3);
+long3 __ovld __cnfn convert_long3_sat_rte(double3);
+long3 __ovld __cnfn convert_long3_sat_rtn(double3);
+long3 __ovld __cnfn convert_long3_sat_rtp(double3);
+long3 __ovld __cnfn convert_long3_sat_rtz(double3);
+long4 __ovld __cnfn convert_long4(double4);
+long4 __ovld __cnfn convert_long4_rte(double4);
+long4 __ovld __cnfn convert_long4_rtn(double4);
+long4 __ovld __cnfn convert_long4_rtp(double4);
+long4 __ovld __cnfn convert_long4_rtz(double4);
+long4 __ovld __cnfn convert_long4_sat(double4);
+long4 __ovld __cnfn convert_long4_sat_rte(double4);
+long4 __ovld __cnfn convert_long4_sat_rtn(double4);
+long4 __ovld __cnfn convert_long4_sat_rtp(double4);
+long4 __ovld __cnfn convert_long4_sat_rtz(double4);
+long8 __ovld __cnfn convert_long8(double8);
+long8 __ovld __cnfn convert_long8_rte(double8);
+long8 __ovld __cnfn convert_long8_rtn(double8);
+long8 __ovld __cnfn convert_long8_rtp(double8);
+long8 __ovld __cnfn convert_long8_rtz(double8);
+long8 __ovld __cnfn convert_long8_sat(double8);
+long8 __ovld __cnfn convert_long8_sat_rte(double8);
+long8 __ovld __cnfn convert_long8_sat_rtn(double8);
+long8 __ovld __cnfn convert_long8_sat_rtp(double8);
+long8 __ovld __cnfn convert_long8_sat_rtz(double8);
+long16 __ovld __cnfn convert_long16(double16);
+long16 __ovld __cnfn convert_long16_rte(double16);
+long16 __ovld __cnfn convert_long16_rtn(double16);
+long16 __ovld __cnfn convert_long16_rtp(double16);
+long16 __ovld __cnfn convert_long16_rtz(double16);
+long16 __ovld __cnfn convert_long16_sat(double16);
+long16 __ovld __cnfn convert_long16_sat_rte(double16);
+long16 __ovld __cnfn convert_long16_sat_rtn(double16);
+long16 __ovld __cnfn convert_long16_sat_rtp(double16);
+long16 __ovld __cnfn convert_long16_sat_rtz(double16);
+// double -> ulong: convert_ulong{,2,3,4,8,16} in plain, _rt{e,n,p,z}, _sat and _sat_rt{e,n,p,z} forms.
+ulong __ovld __cnfn convert_ulong(double);
+ulong __ovld __cnfn convert_ulong_rte(double);
+ulong __ovld __cnfn convert_ulong_rtn(double);
+ulong __ovld __cnfn convert_ulong_rtp(double);
+ulong __ovld __cnfn convert_ulong_rtz(double);
+ulong __ovld __cnfn convert_ulong_sat(double);
+ulong __ovld __cnfn convert_ulong_sat_rte(double);
+ulong __ovld __cnfn convert_ulong_sat_rtn(double);
+ulong __ovld __cnfn convert_ulong_sat_rtp(double);
+ulong __ovld __cnfn convert_ulong_sat_rtz(double);
+ulong2 __ovld __cnfn convert_ulong2(double2);
+ulong2 __ovld __cnfn convert_ulong2_rte(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(double2);
+ulong3 __ovld __cnfn convert_ulong3(double3);
+ulong3 __ovld __cnfn convert_ulong3_rte(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(double3);
+ulong4 __ovld __cnfn convert_ulong4(double4);
+ulong4 __ovld __cnfn convert_ulong4_rte(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(double4);
+ulong8 __ovld __cnfn convert_ulong8(double8);
+ulong8 __ovld __cnfn convert_ulong8_rte(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(double8);
+ulong16 __ovld __cnfn convert_ulong16(double16);
+ulong16 __ovld __cnfn convert_ulong16_rte(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(double16);
+// double -> float: convert_float{,2,3,4,8,16} in plain and _rt{e,n,p,z} forms (no _sat forms are declared here).
+float __ovld __cnfn convert_float(double);
+float __ovld __cnfn convert_float_rte(double);
+float __ovld __cnfn convert_float_rtn(double);
+float __ovld __cnfn convert_float_rtp(double);
+float __ovld __cnfn convert_float_rtz(double);
+float2 __ovld __cnfn convert_float2(double2);
+float2 __ovld __cnfn convert_float2_rte(double2);
+float2 __ovld __cnfn convert_float2_rtn(double2);
+float2 __ovld __cnfn convert_float2_rtp(double2);
+float2 __ovld __cnfn convert_float2_rtz(double2);
+float3 __ovld __cnfn convert_float3(double3);
+float3 __ovld __cnfn convert_float3_rte(double3);
+float3 __ovld __cnfn convert_float3_rtn(double3);
+float3 __ovld __cnfn convert_float3_rtp(double3);
+float3 __ovld __cnfn convert_float3_rtz(double3);
+float4 __ovld __cnfn convert_float4(double4);
+float4 __ovld __cnfn convert_float4_rte(double4);
+float4 __ovld __cnfn convert_float4_rtn(double4);
+float4 __ovld __cnfn convert_float4_rtp(double4);
+float4 __ovld __cnfn convert_float4_rtz(double4);
+float8 __ovld __cnfn convert_float8(double8);
+float8 __ovld __cnfn convert_float8_rte(double8);
+float8 __ovld __cnfn convert_float8_rtn(double8);
+float8 __ovld __cnfn convert_float8_rtp(double8);
+float8 __ovld __cnfn convert_float8_rtz(double8);
+float16 __ovld __cnfn convert_float16(double16);
+float16 __ovld __cnfn convert_float16_rte(double16);
+float16 __ovld __cnfn convert_float16_rtn(double16);
+float16 __ovld __cnfn convert_float16_rtp(double16);
+float16 __ovld __cnfn convert_float16_rtz(double16);
+// To double: convert_double{,2,3,4,8,16}, plain and _rt{e,n,p,z}, overloaded on ten same-width operand types.
+double __ovld __cnfn convert_double(char);
+double __ovld __cnfn convert_double(double);
+double __ovld __cnfn convert_double(float);
+double __ovld __cnfn convert_double(int);
+double __ovld __cnfn convert_double(long);
+double __ovld __cnfn convert_double(short);
+double __ovld __cnfn convert_double(uchar);
+double __ovld __cnfn convert_double(uint);
+double __ovld __cnfn convert_double(ulong);
+double __ovld __cnfn convert_double(ushort);
+double __ovld __cnfn convert_double_rte(char);
+double __ovld __cnfn convert_double_rte(double);
+double __ovld __cnfn convert_double_rte(float);
+double __ovld __cnfn convert_double_rte(int);
+double __ovld __cnfn convert_double_rte(long);
+double __ovld __cnfn convert_double_rte(short);
+double __ovld __cnfn convert_double_rte(uchar);
+double __ovld __cnfn convert_double_rte(uint);
+double __ovld __cnfn convert_double_rte(ulong);
+double __ovld __cnfn convert_double_rte(ushort);
+double __ovld __cnfn convert_double_rtn(char);
+double __ovld __cnfn convert_double_rtn(double);
+double __ovld __cnfn convert_double_rtn(float);
+double __ovld __cnfn convert_double_rtn(int);
+double __ovld __cnfn convert_double_rtn(long);
+double __ovld __cnfn convert_double_rtn(short);
+double __ovld __cnfn convert_double_rtn(uchar);
+double __ovld __cnfn convert_double_rtn(uint);
+double __ovld __cnfn convert_double_rtn(ulong);
+double __ovld __cnfn convert_double_rtn(ushort);
+double __ovld __cnfn convert_double_rtp(char);
+double __ovld __cnfn convert_double_rtp(double);
+double __ovld __cnfn convert_double_rtp(float);
+double __ovld __cnfn convert_double_rtp(int);
+double __ovld __cnfn convert_double_rtp(long);
+double __ovld __cnfn convert_double_rtp(short);
+double __ovld __cnfn convert_double_rtp(uchar);
+double __ovld __cnfn convert_double_rtp(uint);
+double __ovld __cnfn convert_double_rtp(ulong);
+double __ovld __cnfn convert_double_rtp(ushort);
+double __ovld __cnfn convert_double_rtz(char);
+double __ovld __cnfn convert_double_rtz(double);
+double __ovld __cnfn convert_double_rtz(float);
+double __ovld __cnfn convert_double_rtz(int);
+double __ovld __cnfn convert_double_rtz(long);
+double __ovld __cnfn convert_double_rtz(short);
+double __ovld __cnfn convert_double_rtz(uchar);
+double __ovld __cnfn convert_double_rtz(uint);
+double __ovld __cnfn convert_double_rtz(ulong);
+double __ovld __cnfn convert_double_rtz(ushort);
+double2 __ovld __cnfn convert_double2(char2);
+double2 __ovld __cnfn convert_double2(double2);
+double2 __ovld __cnfn convert_double2(float2);
+double2 __ovld __cnfn convert_double2(int2);
+double2 __ovld __cnfn convert_double2(long2);
+double2 __ovld __cnfn convert_double2(short2);
+double2 __ovld __cnfn convert_double2(uchar2);
+double2 __ovld __cnfn convert_double2(uint2);
+double2 __ovld __cnfn convert_double2(ulong2);
+double2 __ovld __cnfn convert_double2(ushort2);
+double2 __ovld __cnfn convert_double2_rte(char2);
+double2 __ovld __cnfn convert_double2_rte(double2);
+double2 __ovld __cnfn convert_double2_rte(float2);
+double2 __ovld __cnfn convert_double2_rte(int2);
+double2 __ovld __cnfn convert_double2_rte(long2);
+double2 __ovld __cnfn convert_double2_rte(short2);
+double2 __ovld __cnfn convert_double2_rte(uchar2);
+double2 __ovld __cnfn convert_double2_rte(uint2);
+double2 __ovld __cnfn convert_double2_rte(ulong2);
+double2 __ovld __cnfn convert_double2_rte(ushort2);
+double2 __ovld __cnfn convert_double2_rtn(char2);
+double2 __ovld __cnfn convert_double2_rtn(double2);
+double2 __ovld __cnfn convert_double2_rtn(float2);
+double2 __ovld __cnfn convert_double2_rtn(int2);
+double2 __ovld __cnfn convert_double2_rtn(long2);
+double2 __ovld __cnfn convert_double2_rtn(short2);
+double2 __ovld __cnfn convert_double2_rtn(uchar2);
+double2 __ovld __cnfn convert_double2_rtn(uint2);
+double2 __ovld __cnfn convert_double2_rtn(ulong2);
+double2 __ovld __cnfn convert_double2_rtn(ushort2);
+double2 __ovld __cnfn convert_double2_rtp(char2);
+double2 __ovld __cnfn convert_double2_rtp(double2);
+double2 __ovld __cnfn convert_double2_rtp(float2);
+double2 __ovld __cnfn convert_double2_rtp(int2);
+double2 __ovld __cnfn convert_double2_rtp(long2);
+double2 __ovld __cnfn convert_double2_rtp(short2);
+double2 __ovld __cnfn convert_double2_rtp(uchar2);
+double2 __ovld __cnfn convert_double2_rtp(uint2);
+double2 __ovld __cnfn convert_double2_rtp(ulong2);
+double2 __ovld __cnfn convert_double2_rtp(ushort2);
+double2 __ovld __cnfn convert_double2_rtz(char2);
+double2 __ovld __cnfn convert_double2_rtz(double2);
+double2 __ovld __cnfn convert_double2_rtz(float2);
+double2 __ovld __cnfn convert_double2_rtz(int2);
+double2 __ovld __cnfn convert_double2_rtz(long2);
+double2 __ovld __cnfn convert_double2_rtz(short2);
+double2 __ovld __cnfn convert_double2_rtz(uchar2);
+double2 __ovld __cnfn convert_double2_rtz(uint2);
+double2 __ovld __cnfn convert_double2_rtz(ulong2);
+double2 __ovld __cnfn convert_double2_rtz(ushort2);
+double3 __ovld __cnfn convert_double3(char3);
+double3 __ovld __cnfn convert_double3(double3);
+double3 __ovld __cnfn convert_double3(float3);
+double3 __ovld __cnfn convert_double3(int3);
+double3 __ovld __cnfn convert_double3(long3);
+double3 __ovld __cnfn convert_double3(short3);
+double3 __ovld __cnfn convert_double3(uchar3);
+double3 __ovld __cnfn convert_double3(uint3);
+double3 __ovld __cnfn convert_double3(ulong3);
+double3 __ovld __cnfn convert_double3(ushort3);
+double3 __ovld __cnfn convert_double3_rte(char3);
+double3 __ovld __cnfn convert_double3_rte(double3);
+double3 __ovld __cnfn convert_double3_rte(float3);
+double3 __ovld __cnfn convert_double3_rte(int3);
+double3 __ovld __cnfn convert_double3_rte(long3);
+double3 __ovld __cnfn convert_double3_rte(short3);
+double3 __ovld __cnfn convert_double3_rte(uchar3);
+double3 __ovld __cnfn convert_double3_rte(uint3);
+double3 __ovld __cnfn convert_double3_rte(ulong3);
+double3 __ovld __cnfn convert_double3_rte(ushort3);
+double3 __ovld __cnfn convert_double3_rtn(char3);
+double3 __ovld __cnfn convert_double3_rtn(double3);
+double3 __ovld __cnfn convert_double3_rtn(float3);
+double3 __ovld __cnfn convert_double3_rtn(int3);
+double3 __ovld __cnfn convert_double3_rtn(long3);
+double3 __ovld __cnfn convert_double3_rtn(short3);
+double3 __ovld __cnfn convert_double3_rtn(uchar3);
+double3 __ovld __cnfn convert_double3_rtn(uint3);
+double3 __ovld __cnfn convert_double3_rtn(ulong3);
+double3 __ovld __cnfn convert_double3_rtn(ushort3);
+double3 __ovld __cnfn convert_double3_rtp(char3);
+double3 __ovld __cnfn convert_double3_rtp(double3);
+double3 __ovld __cnfn convert_double3_rtp(float3);
+double3 __ovld __cnfn convert_double3_rtp(int3);
+double3 __ovld __cnfn convert_double3_rtp(long3);
+double3 __ovld __cnfn convert_double3_rtp(short3);
+double3 __ovld __cnfn convert_double3_rtp(uchar3);
+double3 __ovld __cnfn convert_double3_rtp(uint3);
+double3 __ovld __cnfn convert_double3_rtp(ulong3);
+double3 __ovld __cnfn convert_double3_rtp(ushort3);
+double3 __ovld __cnfn convert_double3_rtz(char3);
+double3 __ovld __cnfn convert_double3_rtz(double3);
+double3 __ovld __cnfn convert_double3_rtz(float3);
+double3 __ovld __cnfn convert_double3_rtz(int3);
+double3 __ovld __cnfn convert_double3_rtz(long3);
+double3 __ovld __cnfn convert_double3_rtz(short3);
+double3 __ovld __cnfn convert_double3_rtz(uchar3);
+double3 __ovld __cnfn convert_double3_rtz(uint3);
+double3 __ovld __cnfn convert_double3_rtz(ulong3);
+double3 __ovld __cnfn convert_double3_rtz(ushort3);
+double4 __ovld __cnfn convert_double4(char4);
+double4 __ovld __cnfn convert_double4(double4);
+double4 __ovld __cnfn convert_double4(float4);
+double4 __ovld __cnfn convert_double4(int4);
+double4 __ovld __cnfn convert_double4(long4);
+double4 __ovld __cnfn convert_double4(short4);
+double4 __ovld __cnfn convert_double4(uchar4);
+double4 __ovld __cnfn convert_double4(uint4);
+double4 __ovld __cnfn convert_double4(ulong4);
+double4 __ovld __cnfn convert_double4(ushort4);
+double4 __ovld __cnfn convert_double4_rte(char4);
+double4 __ovld __cnfn convert_double4_rte(double4);
+double4 __ovld __cnfn convert_double4_rte(float4);
+double4 __ovld __cnfn convert_double4_rte(int4);
+double4 __ovld __cnfn convert_double4_rte(long4);
+double4 __ovld __cnfn convert_double4_rte(short4);
+double4 __ovld __cnfn convert_double4_rte(uchar4);
+double4 __ovld __cnfn convert_double4_rte(uint4);
+double4 __ovld __cnfn convert_double4_rte(ulong4);
+double4 __ovld __cnfn convert_double4_rte(ushort4);
+double4 __ovld __cnfn convert_double4_rtn(char4);
+double4 __ovld __cnfn convert_double4_rtn(double4);
+double4 __ovld __cnfn convert_double4_rtn(float4);
+double4 __ovld __cnfn convert_double4_rtn(int4);
+double4 __ovld __cnfn convert_double4_rtn(long4);
+double4 __ovld __cnfn convert_double4_rtn(short4);
+double4 __ovld __cnfn convert_double4_rtn(uchar4);
+double4 __ovld __cnfn convert_double4_rtn(uint4);
+double4 __ovld __cnfn convert_double4_rtn(ulong4);
+double4 __ovld __cnfn convert_double4_rtn(ushort4);
+double4 __ovld __cnfn convert_double4_rtp(char4);
+double4 __ovld __cnfn convert_double4_rtp(double4);
+double4 __ovld __cnfn convert_double4_rtp(float4);
+double4 __ovld __cnfn convert_double4_rtp(int4);
+double4 __ovld __cnfn convert_double4_rtp(long4);
+double4 __ovld __cnfn convert_double4_rtp(short4);
+double4 __ovld __cnfn convert_double4_rtp(uchar4);
+double4 __ovld __cnfn convert_double4_rtp(uint4);
+double4 __ovld __cnfn convert_double4_rtp(ulong4);
+double4 __ovld __cnfn convert_double4_rtp(ushort4);
+double4 __ovld __cnfn convert_double4_rtz(char4);
+double4 __ovld __cnfn convert_double4_rtz(double4);
+double4 __ovld __cnfn convert_double4_rtz(float4);
+double4 __ovld __cnfn convert_double4_rtz(int4);
+double4 __ovld __cnfn convert_double4_rtz(long4);
+double4 __ovld __cnfn convert_double4_rtz(short4);
+double4 __ovld __cnfn convert_double4_rtz(uchar4);
+double4 __ovld __cnfn convert_double4_rtz(uint4);
+double4 __ovld __cnfn convert_double4_rtz(ulong4);
+double4 __ovld __cnfn convert_double4_rtz(ushort4);
+double8 __ovld __cnfn convert_double8(char8);
+double8 __ovld __cnfn convert_double8(double8);
+double8 __ovld __cnfn convert_double8(float8);
+double8 __ovld __cnfn convert_double8(int8);
+double8 __ovld __cnfn convert_double8(long8);
+double8 __ovld __cnfn convert_double8(short8);
+double8 __ovld __cnfn convert_double8(uchar8);
+double8 __ovld __cnfn convert_double8(uint8);
+double8 __ovld __cnfn convert_double8(ulong8);
+double8 __ovld __cnfn convert_double8(ushort8);
+double8 __ovld __cnfn convert_double8_rte(char8);
+double8 __ovld __cnfn convert_double8_rte(double8);
+double8 __ovld __cnfn convert_double8_rte(float8);
+double8 __ovld __cnfn convert_double8_rte(int8);
+double8 __ovld __cnfn convert_double8_rte(long8);
+double8 __ovld __cnfn convert_double8_rte(short8);
+double8 __ovld __cnfn convert_double8_rte(uchar8);
+double8 __ovld __cnfn convert_double8_rte(uint8);
+double8 __ovld __cnfn convert_double8_rte(ulong8);
+double8 __ovld __cnfn convert_double8_rte(ushort8);
+double8 __ovld __cnfn convert_double8_rtn(char8);
+double8 __ovld __cnfn convert_double8_rtn(double8);
+double8 __ovld __cnfn convert_double8_rtn(float8);
+double8 __ovld __cnfn convert_double8_rtn(int8);
+double8 __ovld __cnfn convert_double8_rtn(long8);
+double8 __ovld __cnfn convert_double8_rtn(short8);
+double8 __ovld __cnfn convert_double8_rtn(uchar8);
+double8 __ovld __cnfn convert_double8_rtn(uint8);
+double8 __ovld __cnfn convert_double8_rtn(ulong8);
+double8 __ovld __cnfn convert_double8_rtn(ushort8);
+double8 __ovld __cnfn convert_double8_rtp(char8);
+double8 __ovld __cnfn convert_double8_rtp(double8);
+double8 __ovld __cnfn convert_double8_rtp(float8);
+double8 __ovld __cnfn convert_double8_rtp(int8);
+double8 __ovld __cnfn convert_double8_rtp(long8);
+double8 __ovld __cnfn convert_double8_rtp(short8);
+double8 __ovld __cnfn convert_double8_rtp(uchar8);
+double8 __ovld __cnfn convert_double8_rtp(uint8);
+double8 __ovld __cnfn convert_double8_rtp(ulong8);
+double8 __ovld __cnfn convert_double8_rtp(ushort8);
+double8 __ovld __cnfn convert_double8_rtz(char8);
+double8 __ovld __cnfn convert_double8_rtz(double8);
+double8 __ovld __cnfn convert_double8_rtz(float8);
+double8 __ovld __cnfn convert_double8_rtz(int8);
+double8 __ovld __cnfn convert_double8_rtz(long8);
+double8 __ovld __cnfn convert_double8_rtz(short8);
+double8 __ovld __cnfn convert_double8_rtz(uchar8);
+double8 __ovld __cnfn convert_double8_rtz(uint8);
+double8 __ovld __cnfn convert_double8_rtz(ulong8);
+double8 __ovld __cnfn convert_double8_rtz(ushort8);
+double16 __ovld __cnfn convert_double16(char16);
+double16 __ovld __cnfn convert_double16(double16);
+double16 __ovld __cnfn convert_double16(float16);
+double16 __ovld __cnfn convert_double16(int16);
+double16 __ovld __cnfn convert_double16(long16);
+double16 __ovld __cnfn convert_double16(short16);
+double16 __ovld __cnfn convert_double16(uchar16);
+double16 __ovld __cnfn convert_double16(uint16);
+double16 __ovld __cnfn convert_double16(ulong16);
+double16 __ovld __cnfn convert_double16(ushort16);
+double16 __ovld __cnfn convert_double16_rte(char16);
+double16 __ovld __cnfn convert_double16_rte(double16);
+double16 __ovld __cnfn convert_double16_rte(float16);
+double16 __ovld __cnfn convert_double16_rte(int16);
+double16 __ovld __cnfn convert_double16_rte(long16);
+double16 __ovld __cnfn convert_double16_rte(short16);
+double16 __ovld __cnfn convert_double16_rte(uchar16);
+double16 __ovld __cnfn convert_double16_rte(uint16);
+double16 __ovld __cnfn convert_double16_rte(ulong16);
+double16 __ovld __cnfn convert_double16_rte(ushort16);
+double16 __ovld __cnfn convert_double16_rtn(char16);
+double16 __ovld __cnfn convert_double16_rtn(double16);
+double16 __ovld __cnfn convert_double16_rtn(float16);
+double16 __ovld __cnfn convert_double16_rtn(int16);
+double16 __ovld __cnfn convert_double16_rtn(long16);
+double16 __ovld __cnfn convert_double16_rtn(short16);
+double16 __ovld __cnfn convert_double16_rtn(uchar16);
+double16 __ovld __cnfn convert_double16_rtn(uint16);
+double16 __ovld __cnfn convert_double16_rtn(ulong16);
+double16 __ovld __cnfn convert_double16_rtn(ushort16);
+double16 __ovld __cnfn convert_double16_rtp(char16);
+double16 __ovld __cnfn convert_double16_rtp(double16);
+double16 __ovld __cnfn convert_double16_rtp(float16);
+double16 __ovld __cnfn convert_double16_rtp(int16);
+double16 __ovld __cnfn convert_double16_rtp(long16);
+double16 __ovld __cnfn convert_double16_rtp(short16);
+double16 __ovld __cnfn convert_double16_rtp(uchar16);
+double16 __ovld __cnfn convert_double16_rtp(uint16);
+double16 __ovld __cnfn convert_double16_rtp(ulong16);
+double16 __ovld __cnfn convert_double16_rtp(ushort16);
+double16 __ovld __cnfn convert_double16_rtz(char16);
+double16 __ovld __cnfn convert_double16_rtz(double16);
+double16 __ovld __cnfn convert_double16_rtz(float16);
+double16 __ovld __cnfn convert_double16_rtz(int16);
+double16 __ovld __cnfn convert_double16_rtz(long16);
+double16 __ovld __cnfn convert_double16_rtz(short16);
+double16 __ovld __cnfn convert_double16_rtz(uchar16);
+double16 __ovld __cnfn convert_double16_rtz(uint16);
+double16 __ovld __cnfn convert_double16_rtz(ulong16);
+double16 __ovld __cnfn convert_double16_rtz(ushort16);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+// Convert half types to non-double types.
+uchar __ovld __cnfn convert_uchar(half);
+uchar __ovld __cnfn convert_uchar_rte(half);
+uchar __ovld __cnfn convert_uchar_rtp(half);
+uchar __ovld __cnfn convert_uchar_rtn(half);
+uchar __ovld __cnfn convert_uchar_rtz(half);
+uchar __ovld __cnfn convert_uchar_sat(half);
+uchar __ovld __cnfn convert_uchar_sat_rte(half);
+uchar __ovld __cnfn convert_uchar_sat_rtp(half);
+uchar __ovld __cnfn convert_uchar_sat_rtn(half);
+uchar __ovld __cnfn convert_uchar_sat_rtz(half);
+uchar2 __ovld __cnfn convert_uchar2(half2);
+uchar2 __ovld __cnfn convert_uchar2_rte(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(half2);
+uchar3 __ovld __cnfn convert_uchar3(half3);
+uchar3 __ovld __cnfn convert_uchar3_rte(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(half3);
+uchar4 __ovld __cnfn convert_uchar4(half4);
+uchar4 __ovld __cnfn convert_uchar4_rte(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(half4);
+uchar8 __ovld __cnfn convert_uchar8(half8);
+uchar8 __ovld __cnfn convert_uchar8_rte(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(half8);
+uchar16 __ovld __cnfn convert_uchar16(half16);
+uchar16 __ovld __cnfn convert_uchar16_rte(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(half16);
+ushort __ovld __cnfn convert_ushort(half);
+ushort __ovld __cnfn convert_ushort_rte(half);
+ushort __ovld __cnfn convert_ushort_rtp(half);
+ushort __ovld __cnfn convert_ushort_rtn(half);
+ushort __ovld __cnfn convert_ushort_rtz(half);
+ushort __ovld __cnfn convert_ushort_sat(half);
+ushort __ovld __cnfn convert_ushort_sat_rte(half);
+ushort __ovld __cnfn convert_ushort_sat_rtp(half);
+ushort __ovld __cnfn convert_ushort_sat_rtn(half);
+ushort __ovld __cnfn convert_ushort_sat_rtz(half);
+ushort2 __ovld __cnfn convert_ushort2(half2);
+ushort2 __ovld __cnfn convert_ushort2_rte(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(half2);
+ushort3 __ovld __cnfn convert_ushort3(half3);
+ushort3 __ovld __cnfn convert_ushort3_rte(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(half3);
+ushort4 __ovld __cnfn convert_ushort4(half4);
+ushort4 __ovld __cnfn convert_ushort4_rte(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(half4);
+ushort8 __ovld __cnfn convert_ushort8(half8);
+ushort8 __ovld __cnfn convert_ushort8_rte(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(half8);
+ushort16 __ovld __cnfn convert_ushort16(half16);
+ushort16 __ovld __cnfn convert_ushort16_rte(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(half16);
+uint __ovld __cnfn convert_uint(half);
+uint __ovld __cnfn convert_uint_rte(half);
+uint __ovld __cnfn convert_uint_rtp(half);
+uint __ovld __cnfn convert_uint_rtn(half);
+uint __ovld __cnfn convert_uint_rtz(half);
+uint __ovld __cnfn convert_uint_sat(half);
+uint __ovld __cnfn convert_uint_sat_rte(half);
+uint __ovld __cnfn convert_uint_sat_rtp(half);
+uint __ovld __cnfn convert_uint_sat_rtn(half);
+uint __ovld __cnfn convert_uint_sat_rtz(half);
+uint2 __ovld __cnfn convert_uint2(half2);
+uint2 __ovld __cnfn convert_uint2_rte(half2);
+uint2 __ovld __cnfn convert_uint2_rtp(half2);
+uint2 __ovld __cnfn convert_uint2_rtn(half2);
+uint2 __ovld __cnfn convert_uint2_rtz(half2);
+uint2 __ovld __cnfn convert_uint2_sat(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(half2);
+uint3 __ovld __cnfn convert_uint3(half3);
+uint3 __ovld __cnfn convert_uint3_rte(half3);
+uint3 __ovld __cnfn convert_uint3_rtp(half3);
+uint3 __ovld __cnfn convert_uint3_rtn(half3);
+uint3 __ovld __cnfn convert_uint3_rtz(half3);
+uint3 __ovld __cnfn convert_uint3_sat(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(half3);
+uint4 __ovld __cnfn convert_uint4(half4);
+uint4 __ovld __cnfn convert_uint4_rte(half4);
+uint4 __ovld __cnfn convert_uint4_rtp(half4);
+uint4 __ovld __cnfn convert_uint4_rtn(half4);
+uint4 __ovld __cnfn convert_uint4_rtz(half4);
+uint4 __ovld __cnfn convert_uint4_sat(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(half4);
+uint8 __ovld __cnfn convert_uint8(half8);
+uint8 __ovld __cnfn convert_uint8_rte(half8);
+uint8 __ovld __cnfn convert_uint8_rtp(half8);
+uint8 __ovld __cnfn convert_uint8_rtn(half8);
+uint8 __ovld __cnfn convert_uint8_rtz(half8);
+uint8 __ovld __cnfn convert_uint8_sat(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(half8);
+uint16 __ovld __cnfn convert_uint16(half16);
+uint16 __ovld __cnfn convert_uint16_rte(half16);
+uint16 __ovld __cnfn convert_uint16_rtp(half16);
+uint16 __ovld __cnfn convert_uint16_rtn(half16);
+uint16 __ovld __cnfn convert_uint16_rtz(half16);
+uint16 __ovld __cnfn convert_uint16_sat(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(half16);
+ulong __ovld __cnfn convert_ulong(half);
+ulong __ovld __cnfn convert_ulong_rte(half);
+ulong __ovld __cnfn convert_ulong_rtp(half);
+ulong __ovld __cnfn convert_ulong_rtn(half);
+ulong __ovld __cnfn convert_ulong_rtz(half);
+ulong __ovld __cnfn convert_ulong_sat(half);
+ulong __ovld __cnfn convert_ulong_sat_rte(half);
+ulong __ovld __cnfn convert_ulong_sat_rtp(half);
+ulong __ovld __cnfn convert_ulong_sat_rtn(half);
+ulong __ovld __cnfn convert_ulong_sat_rtz(half);
+ulong2 __ovld __cnfn convert_ulong2(half2);
+ulong2 __ovld __cnfn convert_ulong2_rte(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(half2);
+ulong3 __ovld __cnfn convert_ulong3(half3);
+ulong3 __ovld __cnfn convert_ulong3_rte(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(half3);
+ulong4 __ovld __cnfn convert_ulong4(half4);
+ulong4 __ovld __cnfn convert_ulong4_rte(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(half4);
+ulong8 __ovld __cnfn convert_ulong8(half8);
+ulong8 __ovld __cnfn convert_ulong8_rte(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(half8);
+ulong16 __ovld __cnfn convert_ulong16(half16);
+ulong16 __ovld __cnfn convert_ulong16_rte(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(half16);
+char __ovld __cnfn convert_char(half);
+char __ovld __cnfn convert_char_rte(half);
+char __ovld __cnfn convert_char_rtp(half);
+char __ovld __cnfn convert_char_rtn(half);
+char __ovld __cnfn convert_char_rtz(half);
+char __ovld __cnfn convert_char_sat(half);
+char __ovld __cnfn convert_char_sat_rte(half);
+char __ovld __cnfn convert_char_sat_rtp(half);
+char __ovld __cnfn convert_char_sat_rtn(half);
+char __ovld __cnfn convert_char_sat_rtz(half);
+char2 __ovld __cnfn convert_char2(half2);
+char2 __ovld __cnfn convert_char2_rte(half2);
+char2 __ovld __cnfn convert_char2_rtp(half2);
+char2 __ovld __cnfn convert_char2_rtn(half2);
+char2 __ovld __cnfn convert_char2_rtz(half2);
+char2 __ovld __cnfn convert_char2_sat(half2);
+char2 __ovld __cnfn convert_char2_sat_rte(half2);
+char2 __ovld __cnfn convert_char2_sat_rtp(half2);
+char2 __ovld __cnfn convert_char2_sat_rtn(half2);
+char2 __ovld __cnfn convert_char2_sat_rtz(half2);
+char3 __ovld __cnfn convert_char3(half3);
+char3 __ovld __cnfn convert_char3_rte(half3);
+char3 __ovld __cnfn convert_char3_rtp(half3);
+char3 __ovld __cnfn convert_char3_rtn(half3);
+char3 __ovld __cnfn convert_char3_rtz(half3);
+char3 __ovld __cnfn convert_char3_sat(half3);
+char3 __ovld __cnfn convert_char3_sat_rte(half3);
+char3 __ovld __cnfn convert_char3_sat_rtp(half3);
+char3 __ovld __cnfn convert_char3_sat_rtn(half3);
+char3 __ovld __cnfn convert_char3_sat_rtz(half3);
+char4 __ovld __cnfn convert_char4(half4);
+char4 __ovld __cnfn convert_char4_rte(half4);
+char4 __ovld __cnfn convert_char4_rtp(half4);
+char4 __ovld __cnfn convert_char4_rtn(half4);
+char4 __ovld __cnfn convert_char4_rtz(half4);
+char4 __ovld __cnfn convert_char4_sat(half4);
+char4 __ovld __cnfn convert_char4_sat_rte(half4);
+char4 __ovld __cnfn convert_char4_sat_rtp(half4);
+char4 __ovld __cnfn convert_char4_sat_rtn(half4);
+char4 __ovld __cnfn convert_char4_sat_rtz(half4);
+char8 __ovld __cnfn convert_char8(half8);
+char8 __ovld __cnfn convert_char8_rte(half8);
+char8 __ovld __cnfn convert_char8_rtp(half8);
+char8 __ovld __cnfn convert_char8_rtn(half8);
+char8 __ovld __cnfn convert_char8_rtz(half8);
+char8 __ovld __cnfn convert_char8_sat(half8);
+char8 __ovld __cnfn convert_char8_sat_rte(half8);
+char8 __ovld __cnfn convert_char8_sat_rtp(half8);
+char8 __ovld __cnfn convert_char8_sat_rtn(half8);
+char8 __ovld __cnfn convert_char8_sat_rtz(half8);
+char16 __ovld __cnfn convert_char16(half16);
+char16 __ovld __cnfn convert_char16_rte(half16);
+char16 __ovld __cnfn convert_char16_rtp(half16);
+char16 __ovld __cnfn convert_char16_rtn(half16);
+char16 __ovld __cnfn convert_char16_rtz(half16);
+char16 __ovld __cnfn convert_char16_sat(half16);
+char16 __ovld __cnfn convert_char16_sat_rte(half16);
+char16 __ovld __cnfn convert_char16_sat_rtp(half16);
+char16 __ovld __cnfn convert_char16_sat_rtn(half16);
+char16 __ovld __cnfn convert_char16_sat_rtz(half16);
+short __ovld __cnfn convert_short(half);
+short __ovld __cnfn convert_short_rte(half);
+short __ovld __cnfn convert_short_rtp(half);
+short __ovld __cnfn convert_short_rtn(half);
+short __ovld __cnfn convert_short_rtz(half);
+short __ovld __cnfn convert_short_sat(half);
+short __ovld __cnfn convert_short_sat_rte(half);
+short __ovld __cnfn convert_short_sat_rtp(half);
+short __ovld __cnfn convert_short_sat_rtn(half);
+short __ovld __cnfn convert_short_sat_rtz(half);
+short2 __ovld __cnfn convert_short2(half2);
+short2 __ovld __cnfn convert_short2_rte(half2);
+short2 __ovld __cnfn convert_short2_rtp(half2);
+short2 __ovld __cnfn convert_short2_rtn(half2);
+short2 __ovld __cnfn convert_short2_rtz(half2);
+short2 __ovld __cnfn convert_short2_sat(half2);
+short2 __ovld __cnfn convert_short2_sat_rte(half2);
+short2 __ovld __cnfn convert_short2_sat_rtp(half2);
+short2 __ovld __cnfn convert_short2_sat_rtn(half2);
+short2 __ovld __cnfn convert_short2_sat_rtz(half2);
+short3 __ovld __cnfn convert_short3(half3);
+short3 __ovld __cnfn convert_short3_rte(half3);
+short3 __ovld __cnfn convert_short3_rtp(half3);
+short3 __ovld __cnfn convert_short3_rtn(half3);
+short3 __ovld __cnfn convert_short3_rtz(half3);
+short3 __ovld __cnfn convert_short3_sat(half3);
+short3 __ovld __cnfn convert_short3_sat_rte(half3);
+short3 __ovld __cnfn convert_short3_sat_rtp(half3);
+short3 __ovld __cnfn convert_short3_sat_rtn(half3);
+short3 __ovld __cnfn convert_short3_sat_rtz(half3);
+short4 __ovld __cnfn convert_short4(half4);
+short4 __ovld __cnfn convert_short4_rte(half4);
+short4 __ovld __cnfn convert_short4_rtp(half4);
+short4 __ovld __cnfn convert_short4_rtn(half4);
+short4 __ovld __cnfn convert_short4_rtz(half4);
+short4 __ovld __cnfn convert_short4_sat(half4);
+short4 __ovld __cnfn convert_short4_sat_rte(half4);
+short4 __ovld __cnfn convert_short4_sat_rtp(half4);
+short4 __ovld __cnfn convert_short4_sat_rtn(half4);
+short4 __ovld __cnfn convert_short4_sat_rtz(half4);
+short8 __ovld __cnfn convert_short8(half8);
+short8 __ovld __cnfn convert_short8_rte(half8);
+short8 __ovld __cnfn convert_short8_rtp(half8);
+short8 __ovld __cnfn convert_short8_rtn(half8);
+short8 __ovld __cnfn convert_short8_rtz(half8);
+short8 __ovld __cnfn convert_short8_sat(half8);
+short8 __ovld __cnfn convert_short8_sat_rte(half8);
+short8 __ovld __cnfn convert_short8_sat_rtp(half8);
+short8 __ovld __cnfn convert_short8_sat_rtn(half8);
+short8 __ovld __cnfn convert_short8_sat_rtz(half8);
+short16 __ovld __cnfn convert_short16(half16);
+short16 __ovld __cnfn convert_short16_rte(half16);
+short16 __ovld __cnfn convert_short16_rtp(half16);
+short16 __ovld __cnfn convert_short16_rtn(half16);
+short16 __ovld __cnfn convert_short16_rtz(half16);
+short16 __ovld __cnfn convert_short16_sat(half16);
+short16 __ovld __cnfn convert_short16_sat_rte(half16);
+short16 __ovld __cnfn convert_short16_sat_rtp(half16);
+short16 __ovld __cnfn convert_short16_sat_rtn(half16);
+short16 __ovld __cnfn convert_short16_sat_rtz(half16);
+int __ovld __cnfn convert_int(half);
+int __ovld __cnfn convert_int_rte(half);
+int __ovld __cnfn convert_int_rtp(half);
+int __ovld __cnfn convert_int_rtn(half);
+int __ovld __cnfn convert_int_rtz(half);
+int __ovld __cnfn convert_int_sat(half);
+int __ovld __cnfn convert_int_sat_rte(half);
+int __ovld __cnfn convert_int_sat_rtp(half);
+int __ovld __cnfn convert_int_sat_rtn(half);
+int __ovld __cnfn convert_int_sat_rtz(half);
+int2 __ovld __cnfn convert_int2(half2);
+int2 __ovld __cnfn convert_int2_rte(half2);
+int2 __ovld __cnfn convert_int2_rtp(half2);
+int2 __ovld __cnfn convert_int2_rtn(half2);
+int2 __ovld __cnfn convert_int2_rtz(half2);
+int2 __ovld __cnfn convert_int2_sat(half2);
+int2 __ovld __cnfn convert_int2_sat_rte(half2);
+int2 __ovld __cnfn convert_int2_sat_rtp(half2);
+int2 __ovld __cnfn convert_int2_sat_rtn(half2);
+int2 __ovld __cnfn convert_int2_sat_rtz(half2);
+int3 __ovld __cnfn convert_int3(half3);
+int3 __ovld __cnfn convert_int3_rte(half3);
+int3 __ovld __cnfn convert_int3_rtp(half3);
+int3 __ovld __cnfn convert_int3_rtn(half3);
+int3 __ovld __cnfn convert_int3_rtz(half3);
+int3 __ovld __cnfn convert_int3_sat(half3);
+int3 __ovld __cnfn convert_int3_sat_rte(half3);
+int3 __ovld __cnfn convert_int3_sat_rtp(half3);
+int3 __ovld __cnfn convert_int3_sat_rtn(half3);
+int3 __ovld __cnfn convert_int3_sat_rtz(half3);
+int4 __ovld __cnfn convert_int4(half4);
+int4 __ovld __cnfn convert_int4_rte(half4);
+int4 __ovld __cnfn convert_int4_rtp(half4);
+int4 __ovld __cnfn convert_int4_rtn(half4);
+int4 __ovld __cnfn convert_int4_rtz(half4);
+int4 __ovld __cnfn convert_int4_sat(half4);
+int4 __ovld __cnfn convert_int4_sat_rte(half4);
+int4 __ovld __cnfn convert_int4_sat_rtp(half4);
+int4 __ovld __cnfn convert_int4_sat_rtn(half4);
+int4 __ovld __cnfn convert_int4_sat_rtz(half4);
+int8 __ovld __cnfn convert_int8(half8);
+int8 __ovld __cnfn convert_int8_rte(half8);
+int8 __ovld __cnfn convert_int8_rtp(half8);
+int8 __ovld __cnfn convert_int8_rtn(half8);
+int8 __ovld __cnfn convert_int8_rtz(half8);
+int8 __ovld __cnfn convert_int8_sat(half8);
+int8 __ovld __cnfn convert_int8_sat_rte(half8);
+int8 __ovld __cnfn convert_int8_sat_rtp(half8);
+int8 __ovld __cnfn convert_int8_sat_rtn(half8);
+int8 __ovld __cnfn convert_int8_sat_rtz(half8);
+int16 __ovld __cnfn convert_int16(half16);
+int16 __ovld __cnfn convert_int16_rte(half16);
+int16 __ovld __cnfn convert_int16_rtp(half16);
+int16 __ovld __cnfn convert_int16_rtn(half16);
+int16 __ovld __cnfn convert_int16_rtz(half16);
+int16 __ovld __cnfn convert_int16_sat(half16);
+int16 __ovld __cnfn convert_int16_sat_rte(half16);
+int16 __ovld __cnfn convert_int16_sat_rtp(half16);
+int16 __ovld __cnfn convert_int16_sat_rtn(half16);
+int16 __ovld __cnfn convert_int16_sat_rtz(half16);
+long __ovld __cnfn convert_long(half);
+long __ovld __cnfn convert_long_rte(half);
+long __ovld __cnfn convert_long_rtp(half);
+long __ovld __cnfn convert_long_rtn(half);
+long __ovld __cnfn convert_long_rtz(half);
+long __ovld __cnfn convert_long_sat(half);
+long __ovld __cnfn convert_long_sat_rte(half);
+long __ovld __cnfn convert_long_sat_rtp(half);
+long __ovld __cnfn convert_long_sat_rtn(half);
+long __ovld __cnfn convert_long_sat_rtz(half);
+long2 __ovld __cnfn convert_long2(half2);
+long2 __ovld __cnfn convert_long2_rte(half2);
+long2 __ovld __cnfn convert_long2_rtp(half2);
+long2 __ovld __cnfn convert_long2_rtn(half2);
+long2 __ovld __cnfn convert_long2_rtz(half2);
+long2 __ovld __cnfn convert_long2_sat(half2);
+long2 __ovld __cnfn convert_long2_sat_rte(half2);
+long2 __ovld __cnfn convert_long2_sat_rtp(half2);
+long2 __ovld __cnfn convert_long2_sat_rtn(half2);
+long2 __ovld __cnfn convert_long2_sat_rtz(half2);
+long3 __ovld __cnfn convert_long3(half3);
+long3 __ovld __cnfn convert_long3_rte(half3);
+long3 __ovld __cnfn convert_long3_rtp(half3);
+long3 __ovld __cnfn convert_long3_rtn(half3);
+long3 __ovld __cnfn convert_long3_rtz(half3);
+long3 __ovld __cnfn convert_long3_sat(half3);
+long3 __ovld __cnfn convert_long3_sat_rte(half3);
+long3 __ovld __cnfn convert_long3_sat_rtp(half3);
+long3 __ovld __cnfn convert_long3_sat_rtn(half3);
+long3 __ovld __cnfn convert_long3_sat_rtz(half3);
+long4 __ovld __cnfn convert_long4(half4);
+long4 __ovld __cnfn convert_long4_rte(half4);
+long4 __ovld __cnfn convert_long4_rtp(half4);
+long4 __ovld __cnfn convert_long4_rtn(half4);
+long4 __ovld __cnfn convert_long4_rtz(half4);
+long4 __ovld __cnfn convert_long4_sat(half4);
+long4 __ovld __cnfn convert_long4_sat_rte(half4);
+long4 __ovld __cnfn convert_long4_sat_rtp(half4);
+long4 __ovld __cnfn convert_long4_sat_rtn(half4);
+long4 __ovld __cnfn convert_long4_sat_rtz(half4);
+long8 __ovld __cnfn convert_long8(half8);
+long8 __ovld __cnfn convert_long8_rte(half8);
+long8 __ovld __cnfn convert_long8_rtp(half8);
+long8 __ovld __cnfn convert_long8_rtn(half8);
+long8 __ovld __cnfn convert_long8_rtz(half8);
+long8 __ovld __cnfn convert_long8_sat(half8);
+long8 __ovld __cnfn convert_long8_sat_rte(half8);
+long8 __ovld __cnfn convert_long8_sat_rtp(half8);
+long8 __ovld __cnfn convert_long8_sat_rtn(half8);
+long8 __ovld __cnfn convert_long8_sat_rtz(half8);
+long16 __ovld __cnfn convert_long16(half16);
+long16 __ovld __cnfn convert_long16_rte(half16);
+long16 __ovld __cnfn convert_long16_rtp(half16);
+long16 __ovld __cnfn convert_long16_rtn(half16);
+long16 __ovld __cnfn convert_long16_rtz(half16);
+long16 __ovld __cnfn convert_long16_sat(half16);
+long16 __ovld __cnfn convert_long16_sat_rte(half16);
+long16 __ovld __cnfn convert_long16_sat_rtp(half16);
+long16 __ovld __cnfn convert_long16_sat_rtn(half16);
+long16 __ovld __cnfn convert_long16_sat_rtz(half16);
+float __ovld __cnfn convert_float(half);
+float __ovld __cnfn convert_float_rte(half);
+float __ovld __cnfn convert_float_rtp(half);
+float __ovld __cnfn convert_float_rtn(half);
+float __ovld __cnfn convert_float_rtz(half);
+float2 __ovld __cnfn convert_float2(half2);
+float2 __ovld __cnfn convert_float2_rte(half2);
+float2 __ovld __cnfn convert_float2_rtp(half2);
+float2 __ovld __cnfn convert_float2_rtn(half2);
+float2 __ovld __cnfn convert_float2_rtz(half2);
+float3 __ovld __cnfn convert_float3(half3);
+float3 __ovld __cnfn convert_float3_rte(half3);
+float3 __ovld __cnfn convert_float3_rtp(half3);
+float3 __ovld __cnfn convert_float3_rtn(half3);
+float3 __ovld __cnfn convert_float3_rtz(half3);
+float4 __ovld __cnfn convert_float4(half4);
+float4 __ovld __cnfn convert_float4_rte(half4);
+float4 __ovld __cnfn convert_float4_rtp(half4);
+float4 __ovld __cnfn convert_float4_rtn(half4);
+float4 __ovld __cnfn convert_float4_rtz(half4);
+float8 __ovld __cnfn convert_float8(half8);
+float8 __ovld __cnfn convert_float8_rte(half8);
+float8 __ovld __cnfn convert_float8_rtp(half8);
+float8 __ovld __cnfn convert_float8_rtn(half8);
+float8 __ovld __cnfn convert_float8_rtz(half8);
+float16 __ovld __cnfn convert_float16(half16);
+float16 __ovld __cnfn convert_float16_rte(half16);
+float16 __ovld __cnfn convert_float16_rtp(half16);
+float16 __ovld __cnfn convert_float16_rtn(half16);
+float16 __ovld __cnfn convert_float16_rtz(half16);
+
+// Convert non-double types (integer, float, and half) to half types.
+half __ovld __cnfn convert_half(uchar);
+half __ovld __cnfn convert_half(ushort);
+half __ovld __cnfn convert_half(uint);
+half __ovld __cnfn convert_half(ulong);
+half __ovld __cnfn convert_half(char);
+half __ovld __cnfn convert_half(short);
+half __ovld __cnfn convert_half(int);
+half __ovld __cnfn convert_half(long);
+half __ovld __cnfn convert_half(float);
+half __ovld __cnfn convert_half(half);
+half __ovld __cnfn convert_half_rte(uchar);
+half __ovld __cnfn convert_half_rte(ushort);
+half __ovld __cnfn convert_half_rte(uint);
+half __ovld __cnfn convert_half_rte(ulong);
+half __ovld __cnfn convert_half_rte(char);
+half __ovld __cnfn convert_half_rte(short);
+half __ovld __cnfn convert_half_rte(int);
+half __ovld __cnfn convert_half_rte(long);
+half __ovld __cnfn convert_half_rte(float);
+half __ovld __cnfn convert_half_rte(half);
+half __ovld __cnfn convert_half_rtp(uchar);
+half __ovld __cnfn convert_half_rtp(ushort);
+half __ovld __cnfn convert_half_rtp(uint);
+half __ovld __cnfn convert_half_rtp(ulong);
+half __ovld __cnfn convert_half_rtp(char);
+half __ovld __cnfn convert_half_rtp(short);
+half __ovld __cnfn convert_half_rtp(int);
+half __ovld __cnfn convert_half_rtp(long);
+half __ovld __cnfn convert_half_rtp(float);
+half __ovld __cnfn convert_half_rtp(half);
+half __ovld __cnfn convert_half_rtn(uchar);
+half __ovld __cnfn convert_half_rtn(ushort);
+half __ovld __cnfn convert_half_rtn(uint);
+half __ovld __cnfn convert_half_rtn(ulong);
+half __ovld __cnfn convert_half_rtn(char);
+half __ovld __cnfn convert_half_rtn(short);
+half __ovld __cnfn convert_half_rtn(int);
+half __ovld __cnfn convert_half_rtn(long);
+half __ovld __cnfn convert_half_rtn(float);
+half __ovld __cnfn convert_half_rtn(half);
+half __ovld __cnfn convert_half_rtz(uchar);
+half __ovld __cnfn convert_half_rtz(ushort);
+half __ovld __cnfn convert_half_rtz(uint);
+half __ovld __cnfn convert_half_rtz(ulong);
+half __ovld __cnfn convert_half_rtz(char);
+half __ovld __cnfn convert_half_rtz(short);
+half __ovld __cnfn convert_half_rtz(int);
+half __ovld __cnfn convert_half_rtz(long);
+half __ovld __cnfn convert_half_rtz(float);
+half __ovld __cnfn convert_half_rtz(half);
+half2 __ovld __cnfn convert_half2(char2);
+half2 __ovld __cnfn convert_half2(uchar2);
+half2 __ovld __cnfn convert_half2(short2);
+half2 __ovld __cnfn convert_half2(ushort2);
+half2 __ovld __cnfn convert_half2(int2);
+half2 __ovld __cnfn convert_half2(uint2);
+half2 __ovld __cnfn convert_half2(long2);
+half2 __ovld __cnfn convert_half2(ulong2);
+half2 __ovld __cnfn convert_half2(float2);
+half2 __ovld __cnfn convert_half2(half2);
+half2 __ovld __cnfn convert_half2_rte(char2);
+half2 __ovld __cnfn convert_half2_rte(uchar2);
+half2 __ovld __cnfn convert_half2_rte(short2);
+half2 __ovld __cnfn convert_half2_rte(ushort2);
+half2 __ovld __cnfn convert_half2_rte(int2);
+half2 __ovld __cnfn convert_half2_rte(uint2);
+half2 __ovld __cnfn convert_half2_rte(long2);
+half2 __ovld __cnfn convert_half2_rte(ulong2);
+half2 __ovld __cnfn convert_half2_rte(float2);
+half2 __ovld __cnfn convert_half2_rte(half2);
+half2 __ovld __cnfn convert_half2_rtp(char2);
+half2 __ovld __cnfn convert_half2_rtp(uchar2);
+half2 __ovld __cnfn convert_half2_rtp(short2);
+half2 __ovld __cnfn convert_half2_rtp(ushort2);
+half2 __ovld __cnfn convert_half2_rtp(int2);
+half2 __ovld __cnfn convert_half2_rtp(uint2);
+half2 __ovld __cnfn convert_half2_rtp(long2);
+half2 __ovld __cnfn convert_half2_rtp(ulong2);
+half2 __ovld __cnfn convert_half2_rtp(float2);
+half2 __ovld __cnfn convert_half2_rtp(half2);
+half2 __ovld __cnfn convert_half2_rtn(char2);
+half2 __ovld __cnfn convert_half2_rtn(uchar2);
+half2 __ovld __cnfn convert_half2_rtn(short2);
+half2 __ovld __cnfn convert_half2_rtn(ushort2);
+half2 __ovld __cnfn convert_half2_rtn(int2);
+half2 __ovld __cnfn convert_half2_rtn(uint2);
+half2 __ovld __cnfn convert_half2_rtn(long2);
+half2 __ovld __cnfn convert_half2_rtn(ulong2);
+half2 __ovld __cnfn convert_half2_rtn(float2);
+half2 __ovld __cnfn convert_half2_rtn(half2);
+half2 __ovld __cnfn convert_half2_rtz(char2);
+half2 __ovld __cnfn convert_half2_rtz(uchar2);
+half2 __ovld __cnfn convert_half2_rtz(short2);
+half2 __ovld __cnfn convert_half2_rtz(ushort2);
+half2 __ovld __cnfn convert_half2_rtz(int2);
+half2 __ovld __cnfn convert_half2_rtz(uint2);
+half2 __ovld __cnfn convert_half2_rtz(long2);
+half2 __ovld __cnfn convert_half2_rtz(ulong2);
+half2 __ovld __cnfn convert_half2_rtz(float2);
+half2 __ovld __cnfn convert_half2_rtz(half2);
+half3 __ovld __cnfn convert_half3(char3);
+half3 __ovld __cnfn convert_half3(uchar3);
+half3 __ovld __cnfn convert_half3(short3);
+half3 __ovld __cnfn convert_half3(ushort3);
+half3 __ovld __cnfn convert_half3(int3);
+half3 __ovld __cnfn convert_half3(uint3);
+half3 __ovld __cnfn convert_half3(long3);
+half3 __ovld __cnfn convert_half3(ulong3);
+half3 __ovld __cnfn convert_half3(float3);
+half3 __ovld __cnfn convert_half3(half3);
+half3 __ovld __cnfn convert_half3_rte(char3);
+half3 __ovld __cnfn convert_half3_rte(uchar3);
+half3 __ovld __cnfn convert_half3_rte(short3);
+half3 __ovld __cnfn convert_half3_rte(ushort3);
+half3 __ovld __cnfn convert_half3_rte(int3);
+half3 __ovld __cnfn convert_half3_rte(uint3);
+half3 __ovld __cnfn convert_half3_rte(long3);
+half3 __ovld __cnfn convert_half3_rte(ulong3);
+half3 __ovld __cnfn convert_half3_rte(float3);
+half3 __ovld __cnfn convert_half3_rte(half3);
+half3 __ovld __cnfn convert_half3_rtp(char3);
+half3 __ovld __cnfn convert_half3_rtp(uchar3);
+half3 __ovld __cnfn convert_half3_rtp(short3);
+half3 __ovld __cnfn convert_half3_rtp(ushort3);
+half3 __ovld __cnfn convert_half3_rtp(int3);
+half3 __ovld __cnfn convert_half3_rtp(uint3);
+half3 __ovld __cnfn convert_half3_rtp(long3);
+half3 __ovld __cnfn convert_half3_rtp(ulong3);
+half3 __ovld __cnfn convert_half3_rtp(float3);
+half3 __ovld __cnfn convert_half3_rtp(half3);
+half3 __ovld __cnfn convert_half3_rtn(char3);
+half3 __ovld __cnfn convert_half3_rtn(uchar3);
+half3 __ovld __cnfn convert_half3_rtn(short3);
+half3 __ovld __cnfn convert_half3_rtn(ushort3);
+half3 __ovld __cnfn convert_half3_rtn(int3);
+half3 __ovld __cnfn convert_half3_rtn(uint3);
+half3 __ovld __cnfn convert_half3_rtn(long3);
+half3 __ovld __cnfn convert_half3_rtn(ulong3);
+half3 __ovld __cnfn convert_half3_rtn(float3);
+half3 __ovld __cnfn convert_half3_rtn(half3);
+half3 __ovld __cnfn convert_half3_rtz(char3);
+half3 __ovld __cnfn convert_half3_rtz(uchar3);
+half3 __ovld __cnfn convert_half3_rtz(short3);
+half3 __ovld __cnfn convert_half3_rtz(ushort3);
+half3 __ovld __cnfn convert_half3_rtz(int3);
+half3 __ovld __cnfn convert_half3_rtz(uint3);
+half3 __ovld __cnfn convert_half3_rtz(long3);
+half3 __ovld __cnfn convert_half3_rtz(ulong3);
+half3 __ovld __cnfn convert_half3_rtz(float3);
+half3 __ovld __cnfn convert_half3_rtz(half3);
+half4 __ovld __cnfn convert_half4(char4);
+half4 __ovld __cnfn convert_half4(uchar4);
+half4 __ovld __cnfn convert_half4(short4);
+half4 __ovld __cnfn convert_half4(ushort4);
+half4 __ovld __cnfn convert_half4(int4);
+half4 __ovld __cnfn convert_half4(uint4);
+half4 __ovld __cnfn convert_half4(long4);
+half4 __ovld __cnfn convert_half4(ulong4);
+half4 __ovld __cnfn convert_half4(float4);
+half4 __ovld __cnfn convert_half4(half4);
+half4 __ovld __cnfn convert_half4_rte(char4);
+half4 __ovld __cnfn convert_half4_rte(uchar4);
+half4 __ovld __cnfn convert_half4_rte(short4);
+half4 __ovld __cnfn convert_half4_rte(ushort4);
+half4 __ovld __cnfn convert_half4_rte(int4);
+half4 __ovld __cnfn convert_half4_rte(uint4);
+half4 __ovld __cnfn convert_half4_rte(long4);
+half4 __ovld __cnfn convert_half4_rte(ulong4);
+half4 __ovld __cnfn convert_half4_rte(float4);
+half4 __ovld __cnfn convert_half4_rte(half4);
+half4 __ovld __cnfn convert_half4_rtp(char4);
+half4 __ovld __cnfn convert_half4_rtp(uchar4);
+half4 __ovld __cnfn convert_half4_rtp(short4);
+half4 __ovld __cnfn convert_half4_rtp(ushort4);
+half4 __ovld __cnfn convert_half4_rtp(int4);
+half4 __ovld __cnfn convert_half4_rtp(uint4);
+half4 __ovld __cnfn convert_half4_rtp(long4);
+half4 __ovld __cnfn convert_half4_rtp(ulong4);
+half4 __ovld __cnfn convert_half4_rtp(float4);
+half4 __ovld __cnfn convert_half4_rtp(half4);
+half4 __ovld __cnfn convert_half4_rtn(char4);
+half4 __ovld __cnfn convert_half4_rtn(uchar4);
+half4 __ovld __cnfn convert_half4_rtn(short4);
+half4 __ovld __cnfn convert_half4_rtn(ushort4);
+half4 __ovld __cnfn convert_half4_rtn(int4);
+half4 __ovld __cnfn convert_half4_rtn(uint4);
+half4 __ovld __cnfn convert_half4_rtn(long4);
+half4 __ovld __cnfn convert_half4_rtn(ulong4);
+half4 __ovld __cnfn convert_half4_rtn(float4);
+half4 __ovld __cnfn convert_half4_rtn(half4);
+half4 __ovld __cnfn convert_half4_rtz(char4);
+half4 __ovld __cnfn convert_half4_rtz(uchar4);
+half4 __ovld __cnfn convert_half4_rtz(short4);
+half4 __ovld __cnfn convert_half4_rtz(ushort4);
+half4 __ovld __cnfn convert_half4_rtz(int4);
+half4 __ovld __cnfn convert_half4_rtz(uint4);
+half4 __ovld __cnfn convert_half4_rtz(long4);
+half4 __ovld __cnfn convert_half4_rtz(ulong4);
+half4 __ovld __cnfn convert_half4_rtz(float4);
+half4 __ovld __cnfn convert_half4_rtz(half4);
+half8 __ovld __cnfn convert_half8(char8);
+half8 __ovld __cnfn convert_half8(uchar8);
+half8 __ovld __cnfn convert_half8(short8);
+half8 __ovld __cnfn convert_half8(ushort8);
+half8 __ovld __cnfn convert_half8(int8);
+half8 __ovld __cnfn convert_half8(uint8);
+half8 __ovld __cnfn convert_half8(long8);
+half8 __ovld __cnfn convert_half8(ulong8);
+half8 __ovld __cnfn convert_half8(float8);
+half8 __ovld __cnfn convert_half8(half8);
+half8 __ovld __cnfn convert_half8_rte(char8);
+half8 __ovld __cnfn convert_half8_rte(uchar8);
+half8 __ovld __cnfn convert_half8_rte(short8);
+half8 __ovld __cnfn convert_half8_rte(ushort8);
+half8 __ovld __cnfn convert_half8_rte(int8);
+half8 __ovld __cnfn convert_half8_rte(uint8);
+half8 __ovld __cnfn convert_half8_rte(long8);
+half8 __ovld __cnfn convert_half8_rte(ulong8);
+half8 __ovld __cnfn convert_half8_rte(float8);
+half8 __ovld __cnfn convert_half8_rte(half8);
+half8 __ovld __cnfn convert_half8_rtp(char8);
+half8 __ovld __cnfn convert_half8_rtp(uchar8);
+half8 __ovld __cnfn convert_half8_rtp(short8);
+half8 __ovld __cnfn convert_half8_rtp(ushort8);
+half8 __ovld __cnfn convert_half8_rtp(int8);
+half8 __ovld __cnfn convert_half8_rtp(uint8);
+half8 __ovld __cnfn convert_half8_rtp(long8);
+half8 __ovld __cnfn convert_half8_rtp(ulong8);
+half8 __ovld __cnfn convert_half8_rtp(float8);
+half8 __ovld __cnfn convert_half8_rtp(half8);
+half8 __ovld __cnfn convert_half8_rtn(char8);
+half8 __ovld __cnfn convert_half8_rtn(uchar8);
+half8 __ovld __cnfn convert_half8_rtn(short8);
+half8 __ovld __cnfn convert_half8_rtn(ushort8);
+half8 __ovld __cnfn convert_half8_rtn(int8);
+half8 __ovld __cnfn convert_half8_rtn(uint8);
+half8 __ovld __cnfn convert_half8_rtn(long8);
+half8 __ovld __cnfn convert_half8_rtn(ulong8);
+half8 __ovld __cnfn convert_half8_rtn(float8);
+half8 __ovld __cnfn convert_half8_rtn(half8);
+half8 __ovld __cnfn convert_half8_rtz(char8);
+half8 __ovld __cnfn convert_half8_rtz(uchar8);
+half8 __ovld __cnfn convert_half8_rtz(short8);
+half8 __ovld __cnfn convert_half8_rtz(ushort8);
+half8 __ovld __cnfn convert_half8_rtz(int8);
+half8 __ovld __cnfn convert_half8_rtz(uint8);
+half8 __ovld __cnfn convert_half8_rtz(long8);
+half8 __ovld __cnfn convert_half8_rtz(ulong8);
+half8 __ovld __cnfn convert_half8_rtz(float8);
+half8 __ovld __cnfn convert_half8_rtz(half8);
+half16 __ovld __cnfn convert_half16(char16);
+half16 __ovld __cnfn convert_half16(uchar16);
+half16 __ovld __cnfn convert_half16(short16);
+half16 __ovld __cnfn convert_half16(ushort16);
+half16 __ovld __cnfn convert_half16(int16);
+half16 __ovld __cnfn convert_half16(uint16);
+half16 __ovld __cnfn convert_half16(long16);
+half16 __ovld __cnfn convert_half16(ulong16);
+half16 __ovld __cnfn convert_half16(float16);
+half16 __ovld __cnfn convert_half16(half16);
+half16 __ovld __cnfn convert_half16_rte(char16);
+half16 __ovld __cnfn convert_half16_rte(uchar16);
+half16 __ovld __cnfn convert_half16_rte(short16);
+half16 __ovld __cnfn convert_half16_rte(ushort16);
+half16 __ovld __cnfn convert_half16_rte(int16);
+half16 __ovld __cnfn convert_half16_rte(uint16);
+half16 __ovld __cnfn convert_half16_rte(long16);
+half16 __ovld __cnfn convert_half16_rte(ulong16);
+half16 __ovld __cnfn convert_half16_rte(float16);
+half16 __ovld __cnfn convert_half16_rte(half16);
+half16 __ovld __cnfn convert_half16_rtp(char16);
+half16 __ovld __cnfn convert_half16_rtp(uchar16);
+half16 __ovld __cnfn convert_half16_rtp(short16);
+half16 __ovld __cnfn convert_half16_rtp(ushort16);
+half16 __ovld __cnfn convert_half16_rtp(int16);
+half16 __ovld __cnfn convert_half16_rtp(uint16);
+half16 __ovld __cnfn convert_half16_rtp(long16);
+half16 __ovld __cnfn convert_half16_rtp(ulong16);
+half16 __ovld __cnfn convert_half16_rtp(float16);
+half16 __ovld __cnfn convert_half16_rtp(half16);
+half16 __ovld __cnfn convert_half16_rtn(char16);
+half16 __ovld __cnfn convert_half16_rtn(uchar16);
+half16 __ovld __cnfn convert_half16_rtn(short16);
+half16 __ovld __cnfn convert_half16_rtn(ushort16);
+half16 __ovld __cnfn convert_half16_rtn(int16);
+half16 __ovld __cnfn convert_half16_rtn(uint16);
+half16 __ovld __cnfn convert_half16_rtn(long16);
+half16 __ovld __cnfn convert_half16_rtn(ulong16);
+half16 __ovld __cnfn convert_half16_rtn(float16);
+half16 __ovld __cnfn convert_half16_rtn(half16);
+half16 __ovld __cnfn convert_half16_rtz(char16);
+half16 __ovld __cnfn convert_half16_rtz(uchar16);
+half16 __ovld __cnfn convert_half16_rtz(short16);
+half16 __ovld __cnfn convert_half16_rtz(ushort16);
+half16 __ovld __cnfn convert_half16_rtz(int16);
+half16 __ovld __cnfn convert_half16_rtz(uint16);
+half16 __ovld __cnfn convert_half16_rtz(long16);
+half16 __ovld __cnfn convert_half16_rtz(ulong16);
+half16 __ovld __cnfn convert_half16_rtz(float16);
+half16 __ovld __cnfn convert_half16_rtz(half16);
+
+// Convert half types to double types (one overload per vector width, with and without the _rte/_rtp/_rtn/_rtz suffixes).
+#ifdef cl_khr_fp64
+double __ovld __cnfn convert_double(half);
+double __ovld __cnfn convert_double_rte(half);
+double __ovld __cnfn convert_double_rtp(half);
+double __ovld __cnfn convert_double_rtn(half);
+double __ovld __cnfn convert_double_rtz(half);
+double2 __ovld __cnfn convert_double2(half2);
+double2 __ovld __cnfn convert_double2_rte(half2);
+double2 __ovld __cnfn convert_double2_rtp(half2);
+double2 __ovld __cnfn convert_double2_rtn(half2);
+double2 __ovld __cnfn convert_double2_rtz(half2);
+double3 __ovld __cnfn convert_double3(half3);
+double3 __ovld __cnfn convert_double3_rte(half3);
+double3 __ovld __cnfn convert_double3_rtp(half3);
+double3 __ovld __cnfn convert_double3_rtn(half3);
+double3 __ovld __cnfn convert_double3_rtz(half3);
+double4 __ovld __cnfn convert_double4(half4);
+double4 __ovld __cnfn convert_double4_rte(half4);
+double4 __ovld __cnfn convert_double4_rtp(half4);
+double4 __ovld __cnfn convert_double4_rtn(half4);
+double4 __ovld __cnfn convert_double4_rtz(half4);
+double8 __ovld __cnfn convert_double8(half8);
+double8 __ovld __cnfn convert_double8_rte(half8);
+double8 __ovld __cnfn convert_double8_rtp(half8);
+double8 __ovld __cnfn convert_double8_rtn(half8);
+double8 __ovld __cnfn convert_double8_rtz(half8);
+double16 __ovld __cnfn convert_double16(half16);
+double16 __ovld __cnfn convert_double16_rte(half16);
+double16 __ovld __cnfn convert_double16_rtp(half16);
+double16 __ovld __cnfn convert_double16_rtn(half16);
+double16 __ovld __cnfn convert_double16_rtz(half16);
+
+// Convert double types to half types (same vector widths and suffix variants as the half-to-double set above).
+half __ovld __cnfn convert_half(double);
+half __ovld __cnfn convert_half_rte(double);
+half __ovld __cnfn convert_half_rtp(double);
+half __ovld __cnfn convert_half_rtn(double);
+half __ovld __cnfn convert_half_rtz(double);
+half2 __ovld __cnfn convert_half2(double2);
+half2 __ovld __cnfn convert_half2_rte(double2);
+half2 __ovld __cnfn convert_half2_rtp(double2);
+half2 __ovld __cnfn convert_half2_rtn(double2);
+half2 __ovld __cnfn convert_half2_rtz(double2);
+half3 __ovld __cnfn convert_half3(double3);
+half3 __ovld __cnfn convert_half3_rte(double3);
+half3 __ovld __cnfn convert_half3_rtp(double3);
+half3 __ovld __cnfn convert_half3_rtn(double3);
+half3 __ovld __cnfn convert_half3_rtz(double3);
+half4 __ovld __cnfn convert_half4(double4);
+half4 __ovld __cnfn convert_half4_rte(double4);
+half4 __ovld __cnfn convert_half4_rtp(double4);
+half4 __ovld __cnfn convert_half4_rtn(double4);
+half4 __ovld __cnfn convert_half4_rtz(double4);
+half8 __ovld __cnfn convert_half8(double8);
+half8 __ovld __cnfn convert_half8_rte(double8);
+half8 __ovld __cnfn convert_half8_rtp(double8);
+half8 __ovld __cnfn convert_half8_rtn(double8);
+half8 __ovld __cnfn convert_half8_rtz(double8);
+half16 __ovld __cnfn convert_half16(double16);
+half16 __ovld __cnfn convert_half16_rte(double16);
+half16 __ovld __cnfn convert_half16_rtp(double16);
+half16 __ovld __cnfn convert_half16_rtn(double16);
+half16 __ovld __cnfn convert_half16_rtz(double16);
+#endif // cl_khr_fp64
+
+#endif // cl_khr_fp16
+
+/**
+ * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators: reinterpret a value as
+ * another data type of the same size. One overload per accepted source type;
+ * a 3-component vector is accepted wherever its 4-component sibling is. */
+char __ovld __cnfn as_char(char);
+char __ovld __cnfn as_char(uchar);
+
+char2 __ovld __cnfn as_char2(char2);
+char2 __ovld __cnfn as_char2(uchar2);
+char2 __ovld __cnfn as_char2(short);
+char2 __ovld __cnfn as_char2(ushort);
+
+char3 __ovld __cnfn as_char3(char3);
+char3 __ovld __cnfn as_char3(char4);
+char3 __ovld __cnfn as_char3(uchar3);
+char3 __ovld __cnfn as_char3(uchar4);
+char3 __ovld __cnfn as_char3(short2);
+char3 __ovld __cnfn as_char3(ushort2);
+char3 __ovld __cnfn as_char3(int);
+char3 __ovld __cnfn as_char3(uint);
+char3 __ovld __cnfn as_char3(float);
+
+char4 __ovld __cnfn as_char4(char3);
+char4 __ovld __cnfn as_char4(char4);
+char4 __ovld __cnfn as_char4(uchar3);
+char4 __ovld __cnfn as_char4(uchar4);
+char4 __ovld __cnfn as_char4(short2);
+char4 __ovld __cnfn as_char4(ushort2);
+char4 __ovld __cnfn as_char4(int);
+char4 __ovld __cnfn as_char4(uint);
+char4 __ovld __cnfn as_char4(float);
+
+char8 __ovld __cnfn as_char8(char8);
+char8 __ovld __cnfn as_char8(uchar8);
+char8 __ovld __cnfn as_char8(short3);
+char8 __ovld __cnfn as_char8(short4);
+char8 __ovld __cnfn as_char8(ushort3);
+char8 __ovld __cnfn as_char8(ushort4);
+char8 __ovld __cnfn as_char8(int2);
+char8 __ovld __cnfn as_char8(uint2);
+char8 __ovld __cnfn as_char8(long);
+char8 __ovld __cnfn as_char8(ulong);
+char8 __ovld __cnfn as_char8(float2);
+
+char16 __ovld __cnfn as_char16(char16);
+char16 __ovld __cnfn as_char16(uchar16);
+char16 __ovld __cnfn as_char16(short8);
+char16 __ovld __cnfn as_char16(ushort8);
+char16 __ovld __cnfn as_char16(int3);
+char16 __ovld __cnfn as_char16(int4);
+char16 __ovld __cnfn as_char16(uint3);
+char16 __ovld __cnfn as_char16(uint4);
+char16 __ovld __cnfn as_char16(long2);
+char16 __ovld __cnfn as_char16(ulong2);
+char16 __ovld __cnfn as_char16(float3);
+char16 __ovld __cnfn as_char16(float4);
+// as_uchar, as_ucharN: reinterpret an equally sized value as uchar / a uchar vector.
+uchar __ovld __cnfn as_uchar(char);
+uchar __ovld __cnfn as_uchar(uchar);
+
+uchar2 __ovld __cnfn as_uchar2(char2);
+uchar2 __ovld __cnfn as_uchar2(uchar2);
+uchar2 __ovld __cnfn as_uchar2(short);
+uchar2 __ovld __cnfn as_uchar2(ushort);
+
+uchar3 __ovld __cnfn as_uchar3(char3);
+uchar3 __ovld __cnfn as_uchar3(char4);
+uchar3 __ovld __cnfn as_uchar3(uchar3);
+uchar3 __ovld __cnfn as_uchar3(uchar4);
+uchar3 __ovld __cnfn as_uchar3(short2);
+uchar3 __ovld __cnfn as_uchar3(ushort2);
+uchar3 __ovld __cnfn as_uchar3(int);
+uchar3 __ovld __cnfn as_uchar3(uint);
+uchar3 __ovld __cnfn as_uchar3(float);
+
+uchar4 __ovld __cnfn as_uchar4(char3);
+uchar4 __ovld __cnfn as_uchar4(char4);
+uchar4 __ovld __cnfn as_uchar4(uchar3);
+uchar4 __ovld __cnfn as_uchar4(uchar4);
+uchar4 __ovld __cnfn as_uchar4(short2);
+uchar4 __ovld __cnfn as_uchar4(ushort2);
+uchar4 __ovld __cnfn as_uchar4(int);
+uchar4 __ovld __cnfn as_uchar4(uint);
+uchar4 __ovld __cnfn as_uchar4(float);
+
+uchar8 __ovld __cnfn as_uchar8(char8);
+uchar8 __ovld __cnfn as_uchar8(uchar8);
+uchar8 __ovld __cnfn as_uchar8(short3);
+uchar8 __ovld __cnfn as_uchar8(short4);
+uchar8 __ovld __cnfn as_uchar8(ushort3);
+uchar8 __ovld __cnfn as_uchar8(ushort4);
+uchar8 __ovld __cnfn as_uchar8(int2);
+uchar8 __ovld __cnfn as_uchar8(uint2);
+uchar8 __ovld __cnfn as_uchar8(long);
+uchar8 __ovld __cnfn as_uchar8(ulong);
+uchar8 __ovld __cnfn as_uchar8(float2);
+
+uchar16 __ovld __cnfn as_uchar16(char16);
+uchar16 __ovld __cnfn as_uchar16(uchar16);
+uchar16 __ovld __cnfn as_uchar16(short8);
+uchar16 __ovld __cnfn as_uchar16(ushort8);
+uchar16 __ovld __cnfn as_uchar16(int3);
+uchar16 __ovld __cnfn as_uchar16(int4);
+uchar16 __ovld __cnfn as_uchar16(uint3);
+uchar16 __ovld __cnfn as_uchar16(uint4);
+uchar16 __ovld __cnfn as_uchar16(long2);
+uchar16 __ovld __cnfn as_uchar16(ulong2);
+uchar16 __ovld __cnfn as_uchar16(float3);
+uchar16 __ovld __cnfn as_uchar16(float4);
+// as_short, as_shortN: reinterpret an equally sized value as short / a short vector.
+short __ovld __cnfn as_short(char2);
+short __ovld __cnfn as_short(uchar2);
+short __ovld __cnfn as_short(short);
+short __ovld __cnfn as_short(ushort);
+
+short2 __ovld __cnfn as_short2(char3);
+short2 __ovld __cnfn as_short2(char4);
+short2 __ovld __cnfn as_short2(uchar3);
+short2 __ovld __cnfn as_short2(uchar4);
+short2 __ovld __cnfn as_short2(short2);
+short2 __ovld __cnfn as_short2(ushort2);
+short2 __ovld __cnfn as_short2(int);
+short2 __ovld __cnfn as_short2(uint);
+short2 __ovld __cnfn as_short2(float);
+
+short3 __ovld __cnfn as_short3(char8);
+short3 __ovld __cnfn as_short3(uchar8);
+short3 __ovld __cnfn as_short3(short3);
+short3 __ovld __cnfn as_short3(short4);
+short3 __ovld __cnfn as_short3(ushort3);
+short3 __ovld __cnfn as_short3(ushort4);
+short3 __ovld __cnfn as_short3(int2);
+short3 __ovld __cnfn as_short3(uint2);
+short3 __ovld __cnfn as_short3(long);
+short3 __ovld __cnfn as_short3(ulong);
+short3 __ovld __cnfn as_short3(float2);
+
+short4 __ovld __cnfn as_short4(char8);
+short4 __ovld __cnfn as_short4(uchar8);
+short4 __ovld __cnfn as_short4(short3);
+short4 __ovld __cnfn as_short4(short4);
+short4 __ovld __cnfn as_short4(ushort3);
+short4 __ovld __cnfn as_short4(ushort4);
+short4 __ovld __cnfn as_short4(int2);
+short4 __ovld __cnfn as_short4(uint2);
+short4 __ovld __cnfn as_short4(long);
+short4 __ovld __cnfn as_short4(ulong);
+short4 __ovld __cnfn as_short4(float2);
+
+short8 __ovld __cnfn as_short8(char16);
+short8 __ovld __cnfn as_short8(uchar16);
+short8 __ovld __cnfn as_short8(short8);
+short8 __ovld __cnfn as_short8(ushort8);
+short8 __ovld __cnfn as_short8(int3);
+short8 __ovld __cnfn as_short8(int4);
+short8 __ovld __cnfn as_short8(uint3);
+short8 __ovld __cnfn as_short8(uint4);
+short8 __ovld __cnfn as_short8(long2);
+short8 __ovld __cnfn as_short8(ulong2);
+short8 __ovld __cnfn as_short8(float3);
+short8 __ovld __cnfn as_short8(float4);
+
+short16 __ovld __cnfn as_short16(short16);
+short16 __ovld __cnfn as_short16(ushort16);
+short16 __ovld __cnfn as_short16(int8);
+short16 __ovld __cnfn as_short16(uint8);
+short16 __ovld __cnfn as_short16(long3);
+short16 __ovld __cnfn as_short16(long4);
+short16 __ovld __cnfn as_short16(ulong3);
+short16 __ovld __cnfn as_short16(ulong4);
+short16 __ovld __cnfn as_short16(float8);
+// as_ushort, as_ushortN: reinterpret an equally sized value as ushort / a ushort vector.
+ushort __ovld __cnfn as_ushort(char2);
+ushort __ovld __cnfn as_ushort(uchar2);
+ushort __ovld __cnfn as_ushort(short);
+ushort __ovld __cnfn as_ushort(ushort);
+
+ushort2 __ovld __cnfn as_ushort2(char3);
+ushort2 __ovld __cnfn as_ushort2(char4);
+ushort2 __ovld __cnfn as_ushort2(uchar3);
+ushort2 __ovld __cnfn as_ushort2(uchar4);
+ushort2 __ovld __cnfn as_ushort2(short2);
+ushort2 __ovld __cnfn as_ushort2(ushort2);
+ushort2 __ovld __cnfn as_ushort2(int);
+ushort2 __ovld __cnfn as_ushort2(uint);
+ushort2 __ovld __cnfn as_ushort2(float);
+
+ushort3 __ovld __cnfn as_ushort3(char8);
+ushort3 __ovld __cnfn as_ushort3(uchar8);
+ushort3 __ovld __cnfn as_ushort3(short3);
+ushort3 __ovld __cnfn as_ushort3(short4);
+ushort3 __ovld __cnfn as_ushort3(ushort3);
+ushort3 __ovld __cnfn as_ushort3(ushort4);
+ushort3 __ovld __cnfn as_ushort3(int2);
+ushort3 __ovld __cnfn as_ushort3(uint2);
+ushort3 __ovld __cnfn as_ushort3(long);
+ushort3 __ovld __cnfn as_ushort3(ulong);
+ushort3 __ovld __cnfn as_ushort3(float2);
+
+ushort4 __ovld __cnfn as_ushort4(char8);
+ushort4 __ovld __cnfn as_ushort4(uchar8);
+ushort4 __ovld __cnfn as_ushort4(short3);
+ushort4 __ovld __cnfn as_ushort4(short4);
+ushort4 __ovld __cnfn as_ushort4(ushort3);
+ushort4 __ovld __cnfn as_ushort4(ushort4);
+ushort4 __ovld __cnfn as_ushort4(int2);
+ushort4 __ovld __cnfn as_ushort4(uint2);
+ushort4 __ovld __cnfn as_ushort4(long);
+ushort4 __ovld __cnfn as_ushort4(ulong);
+ushort4 __ovld __cnfn as_ushort4(float2);
+
+ushort8 __ovld __cnfn as_ushort8(char16);
+ushort8 __ovld __cnfn as_ushort8(uchar16);
+ushort8 __ovld __cnfn as_ushort8(short8);
+ushort8 __ovld __cnfn as_ushort8(ushort8);
+ushort8 __ovld __cnfn as_ushort8(int3);
+ushort8 __ovld __cnfn as_ushort8(int4);
+ushort8 __ovld __cnfn as_ushort8(uint3);
+ushort8 __ovld __cnfn as_ushort8(uint4);
+ushort8 __ovld __cnfn as_ushort8(long2);
+ushort8 __ovld __cnfn as_ushort8(ulong2);
+ushort8 __ovld __cnfn as_ushort8(float3);
+ushort8 __ovld __cnfn as_ushort8(float4);
+
+ushort16 __ovld __cnfn as_ushort16(short16);
+ushort16 __ovld __cnfn as_ushort16(ushort16);
+ushort16 __ovld __cnfn as_ushort16(int8);
+ushort16 __ovld __cnfn as_ushort16(uint8);
+ushort16 __ovld __cnfn as_ushort16(long3);
+ushort16 __ovld __cnfn as_ushort16(long4);
+ushort16 __ovld __cnfn as_ushort16(ulong3);
+ushort16 __ovld __cnfn as_ushort16(ulong4);
+ushort16 __ovld __cnfn as_ushort16(float8);
+// as_int, as_intN: reinterpret an equally sized value as int / an int vector.
+int __ovld __cnfn as_int(char3);
+int __ovld __cnfn as_int(char4);
+int __ovld __cnfn as_int(uchar3);
+int __ovld __cnfn as_int(uchar4);
+int __ovld __cnfn as_int(short2);
+int __ovld __cnfn as_int(ushort2);
+int __ovld __cnfn as_int(int);
+int __ovld __cnfn as_int(uint);
+int __ovld __cnfn as_int(float);
+
+int2 __ovld __cnfn as_int2(char8);
+int2 __ovld __cnfn as_int2(uchar8);
+int2 __ovld __cnfn as_int2(short3);
+int2 __ovld __cnfn as_int2(short4);
+int2 __ovld __cnfn as_int2(ushort3);
+int2 __ovld __cnfn as_int2(ushort4);
+int2 __ovld __cnfn as_int2(int2);
+int2 __ovld __cnfn as_int2(uint2);
+int2 __ovld __cnfn as_int2(long);
+int2 __ovld __cnfn as_int2(ulong);
+int2 __ovld __cnfn as_int2(float2);
+
+int3 __ovld __cnfn as_int3(char16);
+int3 __ovld __cnfn as_int3(uchar16);
+int3 __ovld __cnfn as_int3(short8);
+int3 __ovld __cnfn as_int3(ushort8);
+int3 __ovld __cnfn as_int3(int3);
+int3 __ovld __cnfn as_int3(int4);
+int3 __ovld __cnfn as_int3(uint3);
+int3 __ovld __cnfn as_int3(uint4);
+int3 __ovld __cnfn as_int3(long2);
+int3 __ovld __cnfn as_int3(ulong2);
+int3 __ovld __cnfn as_int3(float3);
+int3 __ovld __cnfn as_int3(float4);
+
+int4 __ovld __cnfn as_int4(char16);
+int4 __ovld __cnfn as_int4(uchar16);
+int4 __ovld __cnfn as_int4(short8);
+int4 __ovld __cnfn as_int4(ushort8);
+int4 __ovld __cnfn as_int4(int3);
+int4 __ovld __cnfn as_int4(int4);
+int4 __ovld __cnfn as_int4(uint3);
+int4 __ovld __cnfn as_int4(uint4);
+int4 __ovld __cnfn as_int4(long2);
+int4 __ovld __cnfn as_int4(ulong2);
+int4 __ovld __cnfn as_int4(float3);
+int4 __ovld __cnfn as_int4(float4);
+
+int8 __ovld __cnfn as_int8(short16);
+int8 __ovld __cnfn as_int8(ushort16);
+int8 __ovld __cnfn as_int8(int8);
+int8 __ovld __cnfn as_int8(uint8);
+int8 __ovld __cnfn as_int8(long3);
+int8 __ovld __cnfn as_int8(long4);
+int8 __ovld __cnfn as_int8(ulong3);
+int8 __ovld __cnfn as_int8(ulong4);
+int8 __ovld __cnfn as_int8(float8);
+
+int16 __ovld __cnfn as_int16(int16);
+int16 __ovld __cnfn as_int16(uint16);
+int16 __ovld __cnfn as_int16(long8);
+int16 __ovld __cnfn as_int16(ulong8);
+int16 __ovld __cnfn as_int16(float16);
+// as_uint, as_uintN: reinterpret an equally sized value as uint / a uint vector.
+uint __ovld __cnfn as_uint(char3);
+uint __ovld __cnfn as_uint(char4);
+uint __ovld __cnfn as_uint(uchar3);
+uint __ovld __cnfn as_uint(uchar4);
+uint __ovld __cnfn as_uint(short2);
+uint __ovld __cnfn as_uint(ushort2);
+uint __ovld __cnfn as_uint(int);
+uint __ovld __cnfn as_uint(uint);
+uint __ovld __cnfn as_uint(float);
+
+uint2 __ovld __cnfn as_uint2(char8);
+uint2 __ovld __cnfn as_uint2(uchar8);
+uint2 __ovld __cnfn as_uint2(short3);
+uint2 __ovld __cnfn as_uint2(short4);
+uint2 __ovld __cnfn as_uint2(ushort3);
+uint2 __ovld __cnfn as_uint2(ushort4);
+uint2 __ovld __cnfn as_uint2(int2);
+uint2 __ovld __cnfn as_uint2(uint2);
+uint2 __ovld __cnfn as_uint2(long);
+uint2 __ovld __cnfn as_uint2(ulong);
+uint2 __ovld __cnfn as_uint2(float2);
+
+uint3 __ovld __cnfn as_uint3(char16);
+uint3 __ovld __cnfn as_uint3(uchar16);
+uint3 __ovld __cnfn as_uint3(short8);
+uint3 __ovld __cnfn as_uint3(ushort8);
+uint3 __ovld __cnfn as_uint3(int3);
+uint3 __ovld __cnfn as_uint3(int4);
+uint3 __ovld __cnfn as_uint3(uint3);
+uint3 __ovld __cnfn as_uint3(uint4);
+uint3 __ovld __cnfn as_uint3(long2);
+uint3 __ovld __cnfn as_uint3(ulong2);
+uint3 __ovld __cnfn as_uint3(float3);
+uint3 __ovld __cnfn as_uint3(float4);
+
+uint4 __ovld __cnfn as_uint4(char16);
+uint4 __ovld __cnfn as_uint4(uchar16);
+uint4 __ovld __cnfn as_uint4(short8);
+uint4 __ovld __cnfn as_uint4(ushort8);
+uint4 __ovld __cnfn as_uint4(int3);
+uint4 __ovld __cnfn as_uint4(int4);
+uint4 __ovld __cnfn as_uint4(uint3);
+uint4 __ovld __cnfn as_uint4(uint4);
+uint4 __ovld __cnfn as_uint4(long2);
+uint4 __ovld __cnfn as_uint4(ulong2);
+uint4 __ovld __cnfn as_uint4(float3);
+uint4 __ovld __cnfn as_uint4(float4);
+
+uint8 __ovld __cnfn as_uint8(short16);
+uint8 __ovld __cnfn as_uint8(ushort16);
+uint8 __ovld __cnfn as_uint8(int8);
+uint8 __ovld __cnfn as_uint8(uint8);
+uint8 __ovld __cnfn as_uint8(long3);
+uint8 __ovld __cnfn as_uint8(long4);
+uint8 __ovld __cnfn as_uint8(ulong3);
+uint8 __ovld __cnfn as_uint8(ulong4);
+uint8 __ovld __cnfn as_uint8(float8);
+
+uint16 __ovld __cnfn as_uint16(int16);
+uint16 __ovld __cnfn as_uint16(uint16);
+uint16 __ovld __cnfn as_uint16(long8);
+uint16 __ovld __cnfn as_uint16(ulong8);
+uint16 __ovld __cnfn as_uint16(float16);
+// as_long, as_longN: reinterpret an equally sized value as long / a long vector.
+long __ovld __cnfn as_long(char8);
+long __ovld __cnfn as_long(uchar8);
+long __ovld __cnfn as_long(short3);
+long __ovld __cnfn as_long(short4);
+long __ovld __cnfn as_long(ushort3);
+long __ovld __cnfn as_long(ushort4);
+long __ovld __cnfn as_long(int2);
+long __ovld __cnfn as_long(uint2);
+long __ovld __cnfn as_long(long);
+long __ovld __cnfn as_long(ulong);
+long __ovld __cnfn as_long(float2);
+
+long2 __ovld __cnfn as_long2(char16);
+long2 __ovld __cnfn as_long2(uchar16);
+long2 __ovld __cnfn as_long2(short8);
+long2 __ovld __cnfn as_long2(ushort8);
+long2 __ovld __cnfn as_long2(int3);
+long2 __ovld __cnfn as_long2(int4);
+long2 __ovld __cnfn as_long2(uint3);
+long2 __ovld __cnfn as_long2(uint4);
+long2 __ovld __cnfn as_long2(long2);
+long2 __ovld __cnfn as_long2(ulong2);
+long2 __ovld __cnfn as_long2(float3);
+long2 __ovld __cnfn as_long2(float4);
+
+long3 __ovld __cnfn as_long3(short16);
+long3 __ovld __cnfn as_long3(ushort16);
+long3 __ovld __cnfn as_long3(int8);
+long3 __ovld __cnfn as_long3(uint8);
+long3 __ovld __cnfn as_long3(long3);
+long3 __ovld __cnfn as_long3(long4);
+long3 __ovld __cnfn as_long3(ulong3);
+long3 __ovld __cnfn as_long3(ulong4);
+long3 __ovld __cnfn as_long3(float8);
+
+long4 __ovld __cnfn as_long4(short16);
+long4 __ovld __cnfn as_long4(ushort16);
+long4 __ovld __cnfn as_long4(int8);
+long4 __ovld __cnfn as_long4(uint8);
+long4 __ovld __cnfn as_long4(long3);
+long4 __ovld __cnfn as_long4(long4);
+long4 __ovld __cnfn as_long4(ulong3);
+long4 __ovld __cnfn as_long4(ulong4);
+long4 __ovld __cnfn as_long4(float8);
+
+long8 __ovld __cnfn as_long8(int16);
+long8 __ovld __cnfn as_long8(uint16);
+long8 __ovld __cnfn as_long8(long8);
+long8 __ovld __cnfn as_long8(ulong8);
+long8 __ovld __cnfn as_long8(float16);
+
+long16 __ovld __cnfn as_long16(long16);
+long16 __ovld __cnfn as_long16(ulong16);
+// as_ulong, as_ulongN: reinterpret an equally sized value as ulong / a ulong vector.
+ulong __ovld __cnfn as_ulong(char8);
+ulong __ovld __cnfn as_ulong(uchar8);
+ulong __ovld __cnfn as_ulong(short3);
+ulong __ovld __cnfn as_ulong(short4);
+ulong __ovld __cnfn as_ulong(ushort3);
+ulong __ovld __cnfn as_ulong(ushort4);
+ulong __ovld __cnfn as_ulong(int2);
+ulong __ovld __cnfn as_ulong(uint2);
+ulong __ovld __cnfn as_ulong(long);
+ulong __ovld __cnfn as_ulong(ulong);
+ulong __ovld __cnfn as_ulong(float2);
+
+ulong2 __ovld __cnfn as_ulong2(char16);
+ulong2 __ovld __cnfn as_ulong2(uchar16);
+ulong2 __ovld __cnfn as_ulong2(short8);
+ulong2 __ovld __cnfn as_ulong2(ushort8);
+ulong2 __ovld __cnfn as_ulong2(int3);
+ulong2 __ovld __cnfn as_ulong2(int4);
+ulong2 __ovld __cnfn as_ulong2(uint3);
+ulong2 __ovld __cnfn as_ulong2(uint4);
+ulong2 __ovld __cnfn as_ulong2(long2);
+ulong2 __ovld __cnfn as_ulong2(ulong2);
+ulong2 __ovld __cnfn as_ulong2(float3);
+ulong2 __ovld __cnfn as_ulong2(float4);
+
+ulong3 __ovld __cnfn as_ulong3(short16);
+ulong3 __ovld __cnfn as_ulong3(ushort16);
+ulong3 __ovld __cnfn as_ulong3(int8);
+ulong3 __ovld __cnfn as_ulong3(uint8);
+ulong3 __ovld __cnfn as_ulong3(long3);
+ulong3 __ovld __cnfn as_ulong3(long4);
+ulong3 __ovld __cnfn as_ulong3(ulong3);
+ulong3 __ovld __cnfn as_ulong3(ulong4);
+ulong3 __ovld __cnfn as_ulong3(float8);
+
+ulong4 __ovld __cnfn as_ulong4(short16);
+ulong4 __ovld __cnfn as_ulong4(ushort16);
+ulong4 __ovld __cnfn as_ulong4(int8);
+ulong4 __ovld __cnfn as_ulong4(uint8);
+ulong4 __ovld __cnfn as_ulong4(long3);
+ulong4 __ovld __cnfn as_ulong4(long4);
+ulong4 __ovld __cnfn as_ulong4(ulong3);
+ulong4 __ovld __cnfn as_ulong4(ulong4);
+ulong4 __ovld __cnfn as_ulong4(float8);
+
+ulong8 __ovld __cnfn as_ulong8(int16);
+ulong8 __ovld __cnfn as_ulong8(uint16);
+ulong8 __ovld __cnfn as_ulong8(long8);
+ulong8 __ovld __cnfn as_ulong8(ulong8);
+ulong8 __ovld __cnfn as_ulong8(float16);
+
+ulong16 __ovld __cnfn as_ulong16(long16);
+ulong16 __ovld __cnfn as_ulong16(ulong16);
+// as_float, as_floatN: reinterpret an equally sized value as float / a float vector.
+float __ovld __cnfn as_float(char3);
+float __ovld __cnfn as_float(char4);
+float __ovld __cnfn as_float(uchar3);
+float __ovld __cnfn as_float(uchar4);
+float __ovld __cnfn as_float(short2);
+float __ovld __cnfn as_float(ushort2);
+float __ovld __cnfn as_float(int);
+float __ovld __cnfn as_float(uint);
+float __ovld __cnfn as_float(float);
+
+float2 __ovld __cnfn as_float2(char8);
+float2 __ovld __cnfn as_float2(uchar8);
+float2 __ovld __cnfn as_float2(short3);
+float2 __ovld __cnfn as_float2(short4);
+float2 __ovld __cnfn as_float2(ushort3);
+float2 __ovld __cnfn as_float2(ushort4);
+float2 __ovld __cnfn as_float2(int2);
+float2 __ovld __cnfn as_float2(uint2);
+float2 __ovld __cnfn as_float2(long);
+float2 __ovld __cnfn as_float2(ulong);
+float2 __ovld __cnfn as_float2(float2);
+
+float3 __ovld __cnfn as_float3(char16);
+float3 __ovld __cnfn as_float3(uchar16);
+float3 __ovld __cnfn as_float3(short8);
+float3 __ovld __cnfn as_float3(ushort8);
+float3 __ovld __cnfn as_float3(int3);
+float3 __ovld __cnfn as_float3(int4);
+float3 __ovld __cnfn as_float3(uint3);
+float3 __ovld __cnfn as_float3(uint4);
+float3 __ovld __cnfn as_float3(long2);
+float3 __ovld __cnfn as_float3(ulong2);
+float3 __ovld __cnfn as_float3(float3);
+float3 __ovld __cnfn as_float3(float4);
+
+float4 __ovld __cnfn as_float4(char16);
+float4 __ovld __cnfn as_float4(uchar16);
+float4 __ovld __cnfn as_float4(short8);
+float4 __ovld __cnfn as_float4(ushort8);
+float4 __ovld __cnfn as_float4(int3);
+float4 __ovld __cnfn as_float4(int4);
+float4 __ovld __cnfn as_float4(uint3);
+float4 __ovld __cnfn as_float4(uint4);
+float4 __ovld __cnfn as_float4(long2);
+float4 __ovld __cnfn as_float4(ulong2);
+float4 __ovld __cnfn as_float4(float3);
+float4 __ovld __cnfn as_float4(float4);
+
+float8 __ovld __cnfn as_float8(short16);
+float8 __ovld __cnfn as_float8(ushort16);
+float8 __ovld __cnfn as_float8(int8);
+float8 __ovld __cnfn as_float8(uint8);
+float8 __ovld __cnfn as_float8(long3);
+float8 __ovld __cnfn as_float8(long4);
+float8 __ovld __cnfn as_float8(ulong3);
+float8 __ovld __cnfn as_float8(ulong4);
+float8 __ovld __cnfn as_float8(float8);
+
+float16 __ovld __cnfn as_float16(int16);
+float16 __ovld __cnfn as_float16(uint16);
+float16 __ovld __cnfn as_float16(long8);
+float16 __ovld __cnfn as_float16(ulong8);
+float16 __ovld __cnfn as_float16(float16);
+// as_type overloads that take or return double types; declared only when cl_khr_fp64 is defined.
+#ifdef cl_khr_fp64
+char8 __ovld __cnfn as_char8(double);
+char16 __ovld __cnfn as_char16(double2);
+uchar8 __ovld __cnfn as_uchar8(double);
+uchar16 __ovld __cnfn as_uchar16(double2);
+short3 __ovld __cnfn as_short3(double);
+short4 __ovld __cnfn as_short4(double);
+short8 __ovld __cnfn as_short8(double2);
+short16 __ovld __cnfn as_short16(double3);
+short16 __ovld __cnfn as_short16(double4);
+ushort3 __ovld __cnfn as_ushort3(double);
+ushort4 __ovld __cnfn as_ushort4(double);
+ushort8 __ovld __cnfn as_ushort8(double2);
+ushort16 __ovld __cnfn as_ushort16(double3);
+ushort16 __ovld __cnfn as_ushort16(double4);
+int2 __ovld __cnfn as_int2(double);
+int3 __ovld __cnfn as_int3(double2);
+int4 __ovld __cnfn as_int4(double2);
+int8 __ovld __cnfn as_int8(double3);
+int8 __ovld __cnfn as_int8(double4);
+int16 __ovld __cnfn as_int16(double8);
+uint2 __ovld __cnfn as_uint2(double);
+uint3 __ovld __cnfn as_uint3(double2);
+uint4 __ovld __cnfn as_uint4(double2);
+uint8 __ovld __cnfn as_uint8(double3);
+uint8 __ovld __cnfn as_uint8(double4);
+uint16 __ovld __cnfn as_uint16(double8);
+long __ovld __cnfn as_long(double);
+long2 __ovld __cnfn as_long2(double2);
+long3 __ovld __cnfn as_long3(double3);
+long3 __ovld __cnfn as_long3(double4);
+long4 __ovld __cnfn as_long4(double3);
+long4 __ovld __cnfn as_long4(double4);
+long8 __ovld __cnfn as_long8(double8);
+long16 __ovld __cnfn as_long16(double16);
+ulong __ovld __cnfn as_ulong(double);
+ulong2 __ovld __cnfn as_ulong2(double2);
+ulong3 __ovld __cnfn as_ulong3(double3);
+ulong3 __ovld __cnfn as_ulong3(double4);
+ulong4 __ovld __cnfn as_ulong4(double3);
+ulong4 __ovld __cnfn as_ulong4(double4);
+ulong8 __ovld __cnfn as_ulong8(double8);
+ulong16 __ovld __cnfn as_ulong16(double16);
+float2 __ovld __cnfn as_float2(double);
+float3 __ovld __cnfn as_float3(double2);
+float4 __ovld __cnfn as_float4(double2);
+float8 __ovld __cnfn as_float8(double3);
+float8 __ovld __cnfn as_float8(double4);
+float16 __ovld __cnfn as_float16(double8);
+double __ovld __cnfn as_double(char8);
+double __ovld __cnfn as_double(uchar8);
+double __ovld __cnfn as_double(short3);
+double __ovld __cnfn as_double(short4);
+double __ovld __cnfn as_double(ushort3);
+double __ovld __cnfn as_double(ushort4);
+double __ovld __cnfn as_double(int2);
+double __ovld __cnfn as_double(uint2);
+double __ovld __cnfn as_double(long);
+double __ovld __cnfn as_double(ulong);
+double __ovld __cnfn as_double(float2);
+double __ovld __cnfn as_double(double);
+double2 __ovld __cnfn as_double2(char16);
+double2 __ovld __cnfn as_double2(uchar16);
+double2 __ovld __cnfn as_double2(short8);
+double2 __ovld __cnfn as_double2(ushort8);
+double2 __ovld __cnfn as_double2(int3);
+double2 __ovld __cnfn as_double2(int4);
+double2 __ovld __cnfn as_double2(uint3);
+double2 __ovld __cnfn as_double2(uint4);
+double2 __ovld __cnfn as_double2(long2);
+double2 __ovld __cnfn as_double2(ulong2);
+double2 __ovld __cnfn as_double2(float3);
+double2 __ovld __cnfn as_double2(float4);
+double2 __ovld __cnfn as_double2(double2);
+double3 __ovld __cnfn as_double3(short16);
+double3 __ovld __cnfn as_double3(ushort16);
+double3 __ovld __cnfn as_double3(int8);
+double3 __ovld __cnfn as_double3(uint8);
+double3 __ovld __cnfn as_double3(long3);
+double3 __ovld __cnfn as_double3(long4);
+double3 __ovld __cnfn as_double3(ulong3);
+double3 __ovld __cnfn as_double3(ulong4);
+double3 __ovld __cnfn as_double3(float8);
+double3 __ovld __cnfn as_double3(double3);
+double3 __ovld __cnfn as_double3(double4);
+double4 __ovld __cnfn as_double4(short16);
+double4 __ovld __cnfn as_double4(ushort16);
+double4 __ovld __cnfn as_double4(int8);
+double4 __ovld __cnfn as_double4(uint8);
+double4 __ovld __cnfn as_double4(long3);
+double4 __ovld __cnfn as_double4(long4);
+double4 __ovld __cnfn as_double4(ulong3);
+double4 __ovld __cnfn as_double4(ulong4);
+double4 __ovld __cnfn as_double4(float8);
+double4 __ovld __cnfn as_double4(double3);
+double4 __ovld __cnfn as_double4(double4);
+double8 __ovld __cnfn as_double8(int16);
+double8 __ovld __cnfn as_double8(uint16);
+double8 __ovld __cnfn as_double8(long8);
+double8 __ovld __cnfn as_double8(ulong8);
+double8 __ovld __cnfn as_double8(float16);
+double8 __ovld __cnfn as_double8(double8);
+double16 __ovld __cnfn as_double16(long16);
+double16 __ovld __cnfn as_double16(ulong16);
+double16 __ovld __cnfn as_double16(double16);
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+char2 __ovld __cnfn as_char2(half);
+char3 __ovld __cnfn as_char3(half2);
+char4 __ovld __cnfn as_char4(half2);
+char8 __ovld __cnfn as_char8(half3);
+char8 __ovld __cnfn as_char8(half4);
+char16 __ovld __cnfn as_char16(half8);
+uchar2 __ovld __cnfn as_uchar2(half);
+uchar3 __ovld __cnfn as_uchar3(half2);
+uchar4 __ovld __cnfn as_uchar4(half2);
+uchar8 __ovld __cnfn as_uchar8(half3);
+uchar8 __ovld __cnfn as_uchar8(half4);
+uchar16 __ovld __cnfn as_uchar16(half8);
+short __ovld __cnfn as_short(half);
+short2 __ovld __cnfn as_short2(half2);
+short3 __ovld __cnfn as_short3(half3);
+short3 __ovld __cnfn as_short3(half4);
+short4 __ovld __cnfn as_short4(half3);
+short4 __ovld __cnfn as_short4(half4);
+short8 __ovld __cnfn as_short8(half8);
+short16 __ovld __cnfn as_short16(half16);
+ushort __ovld __cnfn as_ushort(half);
+ushort2 __ovld __cnfn as_ushort2(half2);
+ushort3 __ovld __cnfn as_ushort3(half3);
+ushort3 __ovld __cnfn as_ushort3(half4);
+ushort4 __ovld __cnfn as_ushort4(half3);
+ushort4 __ovld __cnfn as_ushort4(half4);
+ushort8 __ovld __cnfn as_ushort8(half8);
+ushort16 __ovld __cnfn as_ushort16(half16);
+int __ovld __cnfn as_int(half2);
+int2 __ovld __cnfn as_int2(half3);
+int2 __ovld __cnfn as_int2(half4);
+int3 __ovld __cnfn as_int3(half8);
+int4 __ovld __cnfn as_int4(half8);
+int8 __ovld __cnfn as_int8(half16);
+uint __ovld __cnfn as_uint(half2);
+uint2 __ovld __cnfn as_uint2(half3);
+uint2 __ovld __cnfn as_uint2(half4);
+uint3 __ovld __cnfn as_uint3(half8);
+uint4 __ovld __cnfn as_uint4(half8);
+uint8 __ovld __cnfn as_uint8(half16);
+long __ovld __cnfn as_long(half3);
+long __ovld __cnfn as_long(half4);
+long2 __ovld __cnfn as_long2(half8);
+long3 __ovld __cnfn as_long3(half16);
+long4 __ovld __cnfn as_long4(half16);
+ulong __ovld __cnfn as_ulong(half3);
+ulong __ovld __cnfn as_ulong(half4);
+ulong2 __ovld __cnfn as_ulong2(half8);
+ulong3 __ovld __cnfn as_ulong3(half16);
+ulong4 __ovld __cnfn as_ulong4(half16);
+half __ovld __cnfn as_half(char2);
+half __ovld __cnfn as_half(uchar2);
+half __ovld __cnfn as_half(short);
+half __ovld __cnfn as_half(ushort);
+half __ovld __cnfn as_half(half);
+half2 __ovld __cnfn as_half2(char3);
+half2 __ovld __cnfn as_half2(char4);
+half2 __ovld __cnfn as_half2(uchar3);
+half2 __ovld __cnfn as_half2(uchar4);
+half2 __ovld __cnfn as_half2(short2);
+half2 __ovld __cnfn as_half2(ushort2);
+half2 __ovld __cnfn as_half2(int);
+half2 __ovld __cnfn as_half2(uint);
+half2 __ovld __cnfn as_half2(half2);
+half2 __ovld __cnfn as_half2(float);
+half3 __ovld __cnfn as_half3(char8);
+half3 __ovld __cnfn as_half3(uchar8);
+half3 __ovld __cnfn as_half3(short3);
+half3 __ovld __cnfn as_half3(short4);
+half3 __ovld __cnfn as_half3(ushort3);
+half3 __ovld __cnfn as_half3(ushort4);
+half3 __ovld __cnfn as_half3(int2);
+half3 __ovld __cnfn as_half3(uint2);
+half3 __ovld __cnfn as_half3(long);
+half3 __ovld __cnfn as_half3(ulong);
+half3 __ovld __cnfn as_half3(half3);
+half3 __ovld __cnfn as_half3(half4);
+half3 __ovld __cnfn as_half3(float2);
+half4 __ovld __cnfn as_half4(char8);
+half4 __ovld __cnfn as_half4(uchar8);
+half4 __ovld __cnfn as_half4(short3);
+half4 __ovld __cnfn as_half4(short4);
+half4 __ovld __cnfn as_half4(ushort3);
+half4 __ovld __cnfn as_half4(ushort4);
+half4 __ovld __cnfn as_half4(int2);
+half4 __ovld __cnfn as_half4(uint2);
+half4 __ovld __cnfn as_half4(long);
+half4 __ovld __cnfn as_half4(ulong);
+half4 __ovld __cnfn as_half4(half3);
+half4 __ovld __cnfn as_half4(half4);
+half4 __ovld __cnfn as_half4(float2);
+half8 __ovld __cnfn as_half8(char16);
+half8 __ovld __cnfn as_half8(uchar16);
+half8 __ovld __cnfn as_half8(short8);
+half8 __ovld __cnfn as_half8(ushort8);
+half8 __ovld __cnfn as_half8(int3);
+half8 __ovld __cnfn as_half8(int4);
+half8 __ovld __cnfn as_half8(uint3);
+half8 __ovld __cnfn as_half8(uint4);
+half8 __ovld __cnfn as_half8(long2);
+half8 __ovld __cnfn as_half8(ulong2);
+half8 __ovld __cnfn as_half8(half8);
+half8 __ovld __cnfn as_half8(float3);
+half8 __ovld __cnfn as_half8(float4);
+half16 __ovld __cnfn as_half16(short16);
+half16 __ovld __cnfn as_half16(ushort16);
+half16 __ovld __cnfn as_half16(int8);
+half16 __ovld __cnfn as_half16(uint8);
+half16 __ovld __cnfn as_half16(long3);
+half16 __ovld __cnfn as_half16(long4);
+half16 __ovld __cnfn as_half16(ulong3);
+half16 __ovld __cnfn as_half16(ulong4);
+half16 __ovld __cnfn as_half16(half16);
+half16 __ovld __cnfn as_half16(float8);
+float __ovld __cnfn as_float(half2);
+float2 __ovld __cnfn as_float2(half3);
+float2 __ovld __cnfn as_float2(half4);
+float3 __ovld __cnfn as_float3(half8);
+float4 __ovld __cnfn as_float4(half8);
+float8 __ovld __cnfn as_float8(half16);
+
+#ifdef cl_khr_fp64
+half3 __ovld __cnfn as_half3(double);
+half4 __ovld __cnfn as_half4(double);
+half8 __ovld __cnfn as_half8(double2);
+half16 __ovld __cnfn as_half16(double3);
+half16 __ovld __cnfn as_half16(double4);
+double __ovld __cnfn as_double(half3);
+double __ovld __cnfn as_double(half4);
+double2 __ovld __cnfn as_double2(half8);
+double3 __ovld __cnfn as_double3(half16);
+double4 __ovld __cnfn as_double4(half16);
+#endif //cl_khr_fp64
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
+
+#define __kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+#define kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+// OpenCL v1.1 s6.11.1, v1.2 s6.12.1, v2.0 s6.13.1 - Work-item Functions
+
+/**
+ * Returns the number of dimensions in use. This is the
+ * value given to the work_dim argument specified in
+ * clEnqueueNDRangeKernel.
+ * For clEnqueueTask, this returns 1.
+ */
+uint __ovld __cnfn get_work_dim(void);
+
+/**
+ * Returns the number of global work-items specified for
+ * dimension identified by dimindx. This value is given by
+ * the global_work_size argument to
+ * clEnqueueNDRangeKernel. Valid values of dimindx
+ * are 0 to get_work_dim() - 1. For other values of
+ * dimindx, get_global_size() returns 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_global_size(uint dimindx);
+
+/**
+ * Returns the unique global work-item ID value for
+ * dimension identified by dimindx. The global work-item
+ * ID specifies the work-item ID based on the number of
+ * global work-items specified to execute the kernel. Valid
+ * values of dimindx are 0 to get_work_dim() - 1. For
+ * other values of dimindx, get_global_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_global_id(uint dimindx);
+
+/**
+ * Returns the number of local work-items specified in
+ * dimension identified by dimindx. This value is given by
+ * the local_work_size argument to
+ * clEnqueueNDRangeKernel if local_work_size is not
+ * NULL; otherwise the OpenCL implementation chooses
+ * an appropriate local_work_size value which is returned
+ * by this function. Valid values of dimindx are 0 to
+ * get_work_dim() - 1. For other values of dimindx,
+ * get_local_size() returns 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_local_size(uint dimindx);
+
+/**
+ * Returns the unique local work-item ID i.e. a work-item
+ * within a specific work-group for dimension identified by
+ * dimindx. Valid values of dimindx are 0 to
+ * get_work_dim() - 1. For other values of dimindx,
+ * get_local_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_local_id(uint dimindx);
+
+/**
+ * Returns the number of work-groups that will execute a
+ * kernel for dimension identified by dimindx.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values of dimindx, get_num_groups() returns
+ * 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_num_groups(uint dimindx);
+
+/**
+ * get_group_id returns the work-group ID which is a
+ * number from 0 .. get_num_groups(dimindx) - 1.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values, get_group_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_group_id(uint dimindx);
+
+/**
+ * get_global_offset returns the offset values specified in
+ * global_work_offset argument to
+ * clEnqueueNDRangeKernel.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values, get_global_offset() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_global_offset(uint dimindx);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+size_t __ovld get_enqueued_local_size(uint dimindx);
+size_t __ovld get_global_linear_id(void);
+size_t __ovld get_local_linear_id(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.2, v1.2 s6.12.2, v2.0 s6.13.2 - Math functions
+
+/**
+ * Arc cosine function.
+ */
+float __ovld __cnfn acos(float);
+float2 __ovld __cnfn acos(float2);
+float3 __ovld __cnfn acos(float3);
+float4 __ovld __cnfn acos(float4);
+float8 __ovld __cnfn acos(float8);
+float16 __ovld __cnfn acos(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acos(double);
+double2 __ovld __cnfn acos(double2);
+double3 __ovld __cnfn acos(double3);
+double4 __ovld __cnfn acos(double4);
+double8 __ovld __cnfn acos(double8);
+double16 __ovld __cnfn acos(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acos(half);
+half2 __ovld __cnfn acos(half2);
+half3 __ovld __cnfn acos(half3);
+half4 __ovld __cnfn acos(half4);
+half8 __ovld __cnfn acos(half8);
+half16 __ovld __cnfn acos(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic cosine.
+ */
+float __ovld __cnfn acosh(float);
+float2 __ovld __cnfn acosh(float2);
+float3 __ovld __cnfn acosh(float3);
+float4 __ovld __cnfn acosh(float4);
+float8 __ovld __cnfn acosh(float8);
+float16 __ovld __cnfn acosh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acosh(double);
+double2 __ovld __cnfn acosh(double2);
+double3 __ovld __cnfn acosh(double3);
+double4 __ovld __cnfn acosh(double4);
+double8 __ovld __cnfn acosh(double8);
+double16 __ovld __cnfn acosh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acosh(half);
+half2 __ovld __cnfn acosh(half2);
+half3 __ovld __cnfn acosh(half3);
+half4 __ovld __cnfn acosh(half4);
+half8 __ovld __cnfn acosh(half8);
+half16 __ovld __cnfn acosh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute acos (x) / PI.
+ */
+float __ovld __cnfn acospi(float x);
+float2 __ovld __cnfn acospi(float2 x);
+float3 __ovld __cnfn acospi(float3 x);
+float4 __ovld __cnfn acospi(float4 x);
+float8 __ovld __cnfn acospi(float8 x);
+float16 __ovld __cnfn acospi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acospi(double x);
+double2 __ovld __cnfn acospi(double2 x);
+double3 __ovld __cnfn acospi(double3 x);
+double4 __ovld __cnfn acospi(double4 x);
+double8 __ovld __cnfn acospi(double8 x);
+double16 __ovld __cnfn acospi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acospi(half x);
+half2 __ovld __cnfn acospi(half2 x);
+half3 __ovld __cnfn acospi(half3 x);
+half4 __ovld __cnfn acospi(half4 x);
+half8 __ovld __cnfn acospi(half8 x);
+half16 __ovld __cnfn acospi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc sine function.
+ */
+float __ovld __cnfn asin(float);
+float2 __ovld __cnfn asin(float2);
+float3 __ovld __cnfn asin(float3);
+float4 __ovld __cnfn asin(float4);
+float8 __ovld __cnfn asin(float8);
+float16 __ovld __cnfn asin(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asin(double);
+double2 __ovld __cnfn asin(double2);
+double3 __ovld __cnfn asin(double3);
+double4 __ovld __cnfn asin(double4);
+double8 __ovld __cnfn asin(double8);
+double16 __ovld __cnfn asin(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asin(half);
+half2 __ovld __cnfn asin(half2);
+half3 __ovld __cnfn asin(half3);
+half4 __ovld __cnfn asin(half4);
+half8 __ovld __cnfn asin(half8);
+half16 __ovld __cnfn asin(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic sine.
+ */
+float __ovld __cnfn asinh(float);
+float2 __ovld __cnfn asinh(float2);
+float3 __ovld __cnfn asinh(float3);
+float4 __ovld __cnfn asinh(float4);
+float8 __ovld __cnfn asinh(float8);
+float16 __ovld __cnfn asinh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asinh(double);
+double2 __ovld __cnfn asinh(double2);
+double3 __ovld __cnfn asinh(double3);
+double4 __ovld __cnfn asinh(double4);
+double8 __ovld __cnfn asinh(double8);
+double16 __ovld __cnfn asinh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asinh(half);
+half2 __ovld __cnfn asinh(half2);
+half3 __ovld __cnfn asinh(half3);
+half4 __ovld __cnfn asinh(half4);
+half8 __ovld __cnfn asinh(half8);
+half16 __ovld __cnfn asinh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute asin (x) / PI.
+ */
+float __ovld __cnfn asinpi(float x);
+float2 __ovld __cnfn asinpi(float2 x);
+float3 __ovld __cnfn asinpi(float3 x);
+float4 __ovld __cnfn asinpi(float4 x);
+float8 __ovld __cnfn asinpi(float8 x);
+float16 __ovld __cnfn asinpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asinpi(double x);
+double2 __ovld __cnfn asinpi(double2 x);
+double3 __ovld __cnfn asinpi(double3 x);
+double4 __ovld __cnfn asinpi(double4 x);
+double8 __ovld __cnfn asinpi(double8 x);
+double16 __ovld __cnfn asinpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asinpi(half x);
+half2 __ovld __cnfn asinpi(half2 x);
+half3 __ovld __cnfn asinpi(half3 x);
+half4 __ovld __cnfn asinpi(half4 x);
+half8 __ovld __cnfn asinpi(half8 x);
+half16 __ovld __cnfn asinpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc tangent function.
+ */
+float __ovld __cnfn atan(float y_over_x);
+float2 __ovld __cnfn atan(float2 y_over_x);
+float3 __ovld __cnfn atan(float3 y_over_x);
+float4 __ovld __cnfn atan(float4 y_over_x);
+float8 __ovld __cnfn atan(float8 y_over_x);
+float16 __ovld __cnfn atan(float16 y_over_x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan(double y_over_x);
+double2 __ovld __cnfn atan(double2 y_over_x);
+double3 __ovld __cnfn atan(double3 y_over_x);
+double4 __ovld __cnfn atan(double4 y_over_x);
+double8 __ovld __cnfn atan(double8 y_over_x);
+double16 __ovld __cnfn atan(double16 y_over_x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan(half y_over_x);
+half2 __ovld __cnfn atan(half2 y_over_x);
+half3 __ovld __cnfn atan(half3 y_over_x);
+half4 __ovld __cnfn atan(half4 y_over_x);
+half8 __ovld __cnfn atan(half8 y_over_x);
+half16 __ovld __cnfn atan(half16 y_over_x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc tangent of y / x.
+ */
+float __ovld __cnfn atan2(float y, float x);
+float2 __ovld __cnfn atan2(float2 y, float2 x);
+float3 __ovld __cnfn atan2(float3 y, float3 x);
+float4 __ovld __cnfn atan2(float4 y, float4 x);
+float8 __ovld __cnfn atan2(float8 y, float8 x);
+float16 __ovld __cnfn atan2(float16 y, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan2(double y, double x);
+double2 __ovld __cnfn atan2(double2 y, double2 x);
+double3 __ovld __cnfn atan2(double3 y, double3 x);
+double4 __ovld __cnfn atan2(double4 y, double4 x);
+double8 __ovld __cnfn atan2(double8 y, double8 x);
+double16 __ovld __cnfn atan2(double16 y, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan2(half y, half x);
+half2 __ovld __cnfn atan2(half2 y, half2 x);
+half3 __ovld __cnfn atan2(half3 y, half3 x);
+half4 __ovld __cnfn atan2(half4 y, half4 x);
+half8 __ovld __cnfn atan2(half8 y, half8 x);
+half16 __ovld __cnfn atan2(half16 y, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic tangent.
+ */
+float __ovld __cnfn atanh(float);
+float2 __ovld __cnfn atanh(float2);
+float3 __ovld __cnfn atanh(float3);
+float4 __ovld __cnfn atanh(float4);
+float8 __ovld __cnfn atanh(float8);
+float16 __ovld __cnfn atanh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atanh(double);
+double2 __ovld __cnfn atanh(double2);
+double3 __ovld __cnfn atanh(double3);
+double4 __ovld __cnfn atanh(double4);
+double8 __ovld __cnfn atanh(double8);
+double16 __ovld __cnfn atanh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atanh(half);
+half2 __ovld __cnfn atanh(half2);
+half3 __ovld __cnfn atanh(half3);
+half4 __ovld __cnfn atanh(half4);
+half8 __ovld __cnfn atanh(half8);
+half16 __ovld __cnfn atanh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute atan (x) / PI.
+ */
+float __ovld __cnfn atanpi(float x);
+float2 __ovld __cnfn atanpi(float2 x);
+float3 __ovld __cnfn atanpi(float3 x);
+float4 __ovld __cnfn atanpi(float4 x);
+float8 __ovld __cnfn atanpi(float8 x);
+float16 __ovld __cnfn atanpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atanpi(double x);
+double2 __ovld __cnfn atanpi(double2 x);
+double3 __ovld __cnfn atanpi(double3 x);
+double4 __ovld __cnfn atanpi(double4 x);
+double8 __ovld __cnfn atanpi(double8 x);
+double16 __ovld __cnfn atanpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atanpi(half x);
+half2 __ovld __cnfn atanpi(half2 x);
+half3 __ovld __cnfn atanpi(half3 x);
+half4 __ovld __cnfn atanpi(half4 x);
+half8 __ovld __cnfn atanpi(half8 x);
+half16 __ovld __cnfn atanpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute atan2 (y, x) / PI.
+ */
+float __ovld __cnfn atan2pi(float y, float x);
+float2 __ovld __cnfn atan2pi(float2 y, float2 x);
+float3 __ovld __cnfn atan2pi(float3 y, float3 x);
+float4 __ovld __cnfn atan2pi(float4 y, float4 x);
+float8 __ovld __cnfn atan2pi(float8 y, float8 x);
+float16 __ovld __cnfn atan2pi(float16 y, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan2pi(double y, double x);
+double2 __ovld __cnfn atan2pi(double2 y, double2 x);
+double3 __ovld __cnfn atan2pi(double3 y, double3 x);
+double4 __ovld __cnfn atan2pi(double4 y, double4 x);
+double8 __ovld __cnfn atan2pi(double8 y, double8 x);
+double16 __ovld __cnfn atan2pi(double16 y, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan2pi(half y, half x);
+half2 __ovld __cnfn atan2pi(half2 y, half2 x);
+half3 __ovld __cnfn atan2pi(half3 y, half3 x);
+half4 __ovld __cnfn atan2pi(half4 y, half4 x);
+half8 __ovld __cnfn atan2pi(half8 y, half8 x);
+half16 __ovld __cnfn atan2pi(half16 y, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cube-root.
+ */
+float __ovld __cnfn cbrt(float);
+float2 __ovld __cnfn cbrt(float2);
+float3 __ovld __cnfn cbrt(float3);
+float4 __ovld __cnfn cbrt(float4);
+float8 __ovld __cnfn cbrt(float8);
+float16 __ovld __cnfn cbrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cbrt(double);
+double2 __ovld __cnfn cbrt(double2);
+double3 __ovld __cnfn cbrt(double3);
+double4 __ovld __cnfn cbrt(double4);
+double8 __ovld __cnfn cbrt(double8);
+double16 __ovld __cnfn cbrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cbrt(half);
+half2 __ovld __cnfn cbrt(half2);
+half3 __ovld __cnfn cbrt(half3);
+half4 __ovld __cnfn cbrt(half4);
+half8 __ovld __cnfn cbrt(half8);
+half16 __ovld __cnfn cbrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to positive
+ * infinity rounding mode.
+ */
+float __ovld __cnfn ceil(float);
+float2 __ovld __cnfn ceil(float2);
+float3 __ovld __cnfn ceil(float3);
+float4 __ovld __cnfn ceil(float4);
+float8 __ovld __cnfn ceil(float8);
+float16 __ovld __cnfn ceil(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn ceil(double);
+double2 __ovld __cnfn ceil(double2);
+double3 __ovld __cnfn ceil(double3);
+double4 __ovld __cnfn ceil(double4);
+double8 __ovld __cnfn ceil(double8);
+double16 __ovld __cnfn ceil(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn ceil(half);
+half2 __ovld __cnfn ceil(half2);
+half3 __ovld __cnfn ceil(half3);
+half4 __ovld __cnfn ceil(half4);
+half8 __ovld __cnfn ceil(half8);
+half16 __ovld __cnfn ceil(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x with its sign changed to match the sign of y.
+ */
+float __ovld __cnfn copysign(float x, float y);
+float2 __ovld __cnfn copysign(float2 x, float2 y);
+float3 __ovld __cnfn copysign(float3 x, float3 y);
+float4 __ovld __cnfn copysign(float4 x, float4 y);
+float8 __ovld __cnfn copysign(float8 x, float8 y);
+float16 __ovld __cnfn copysign(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn copysign(double x, double y);
+double2 __ovld __cnfn copysign(double2 x, double2 y);
+double3 __ovld __cnfn copysign(double3 x, double3 y);
+double4 __ovld __cnfn copysign(double4 x, double4 y);
+double8 __ovld __cnfn copysign(double8 x, double8 y);
+double16 __ovld __cnfn copysign(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn copysign(half x, half y);
+half2 __ovld __cnfn copysign(half2 x, half2 y);
+half3 __ovld __cnfn copysign(half3 x, half3 y);
+half4 __ovld __cnfn copysign(half4 x, half4 y);
+half8 __ovld __cnfn copysign(half8 x, half8 y);
+half16 __ovld __cnfn copysign(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cosine.
+ */
+float __ovld __cnfn cos(float);
+float2 __ovld __cnfn cos(float2);
+float3 __ovld __cnfn cos(float3);
+float4 __ovld __cnfn cos(float4);
+float8 __ovld __cnfn cos(float8);
+float16 __ovld __cnfn cos(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cos(double);
+double2 __ovld __cnfn cos(double2);
+double3 __ovld __cnfn cos(double3);
+double4 __ovld __cnfn cos(double4);
+double8 __ovld __cnfn cos(double8);
+double16 __ovld __cnfn cos(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cos(half);
+half2 __ovld __cnfn cos(half2);
+half3 __ovld __cnfn cos(half3);
+half4 __ovld __cnfn cos(half4);
+half8 __ovld __cnfn cos(half8);
+half16 __ovld __cnfn cos(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute hyperbolic cosine.
+ */
+float __ovld __cnfn cosh(float);
+float2 __ovld __cnfn cosh(float2);
+float3 __ovld __cnfn cosh(float3);
+float4 __ovld __cnfn cosh(float4);
+float8 __ovld __cnfn cosh(float8);
+float16 __ovld __cnfn cosh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cosh(double);
+double2 __ovld __cnfn cosh(double2);
+double3 __ovld __cnfn cosh(double3);
+double4 __ovld __cnfn cosh(double4);
+double8 __ovld __cnfn cosh(double8);
+double16 __ovld __cnfn cosh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cosh(half);
+half2 __ovld __cnfn cosh(half2);
+half3 __ovld __cnfn cosh(half3);
+half4 __ovld __cnfn cosh(half4);
+half8 __ovld __cnfn cosh(half8);
+half16 __ovld __cnfn cosh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cos (PI * x).
+ */
+float __ovld __cnfn cospi(float x);
+float2 __ovld __cnfn cospi(float2 x);
+float3 __ovld __cnfn cospi(float3 x);
+float4 __ovld __cnfn cospi(float4 x);
+float8 __ovld __cnfn cospi(float8 x);
+float16 __ovld __cnfn cospi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cospi(double x);
+double2 __ovld __cnfn cospi(double2 x);
+double3 __ovld __cnfn cospi(double3 x);
+double4 __ovld __cnfn cospi(double4 x);
+double8 __ovld __cnfn cospi(double8 x);
+double16 __ovld __cnfn cospi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cospi(half x);
+half2 __ovld __cnfn cospi(half2 x);
+half3 __ovld __cnfn cospi(half3 x);
+half4 __ovld __cnfn cospi(half4 x);
+half8 __ovld __cnfn cospi(half8 x);
+half16 __ovld __cnfn cospi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Complementary error function.
+ */
+float __ovld __cnfn erfc(float);
+float2 __ovld __cnfn erfc(float2);
+float3 __ovld __cnfn erfc(float3);
+float4 __ovld __cnfn erfc(float4);
+float8 __ovld __cnfn erfc(float8);
+float16 __ovld __cnfn erfc(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn erfc(double);
+double2 __ovld __cnfn erfc(double2);
+double3 __ovld __cnfn erfc(double3);
+double4 __ovld __cnfn erfc(double4);
+double8 __ovld __cnfn erfc(double8);
+double16 __ovld __cnfn erfc(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn erfc(half);
+half2 __ovld __cnfn erfc(half2);
+half3 __ovld __cnfn erfc(half3);
+half4 __ovld __cnfn erfc(half4);
+half8 __ovld __cnfn erfc(half8);
+half16 __ovld __cnfn erfc(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Error function encountered in integrating the
+ * normal distribution.
+ */
+float __ovld __cnfn erf(float);
+float2 __ovld __cnfn erf(float2);
+float3 __ovld __cnfn erf(float3);
+float4 __ovld __cnfn erf(float4);
+float8 __ovld __cnfn erf(float8);
+float16 __ovld __cnfn erf(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn erf(double);
+double2 __ovld __cnfn erf(double2);
+double3 __ovld __cnfn erf(double3);
+double4 __ovld __cnfn erf(double4);
+double8 __ovld __cnfn erf(double8);
+double16 __ovld __cnfn erf(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn erf(half);
+half2 __ovld __cnfn erf(half2);
+half3 __ovld __cnfn erf(half3);
+half4 __ovld __cnfn erf(half4);
+half8 __ovld __cnfn erf(half8);
+half16 __ovld __cnfn erf(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the base e exponential function of x.
+ */
+float __ovld __cnfn exp(float x);
+float2 __ovld __cnfn exp(float2 x);
+float3 __ovld __cnfn exp(float3 x);
+float4 __ovld __cnfn exp(float4 x);
+float8 __ovld __cnfn exp(float8 x);
+float16 __ovld __cnfn exp(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp(double x);
+double2 __ovld __cnfn exp(double2 x);
+double3 __ovld __cnfn exp(double3 x);
+double4 __ovld __cnfn exp(double4 x);
+double8 __ovld __cnfn exp(double8 x);
+double16 __ovld __cnfn exp(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp(half x);
+half2 __ovld __cnfn exp(half2 x);
+half3 __ovld __cnfn exp(half3 x);
+half4 __ovld __cnfn exp(half4 x);
+half8 __ovld __cnfn exp(half8 x);
+half16 __ovld __cnfn exp(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Exponential base 2 function.
+ */
+float __ovld __cnfn exp2(float);
+float2 __ovld __cnfn exp2(float2);
+float3 __ovld __cnfn exp2(float3);
+float4 __ovld __cnfn exp2(float4);
+float8 __ovld __cnfn exp2(float8);
+float16 __ovld __cnfn exp2(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp2(double);
+double2 __ovld __cnfn exp2(double2);
+double3 __ovld __cnfn exp2(double3);
+double4 __ovld __cnfn exp2(double4);
+double8 __ovld __cnfn exp2(double8);
+double16 __ovld __cnfn exp2(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp2(half);
+half2 __ovld __cnfn exp2(half2);
+half3 __ovld __cnfn exp2(half3);
+half4 __ovld __cnfn exp2(half4);
+half8 __ovld __cnfn exp2(half8);
+half16 __ovld __cnfn exp2(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Exponential base 10 function.
+ */
+float __ovld __cnfn exp10(float);
+float2 __ovld __cnfn exp10(float2);
+float3 __ovld __cnfn exp10(float3);
+float4 __ovld __cnfn exp10(float4);
+float8 __ovld __cnfn exp10(float8);
+float16 __ovld __cnfn exp10(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp10(double);
+double2 __ovld __cnfn exp10(double2);
+double3 __ovld __cnfn exp10(double3);
+double4 __ovld __cnfn exp10(double4);
+double8 __ovld __cnfn exp10(double8);
+double16 __ovld __cnfn exp10(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp10(half);
+half2 __ovld __cnfn exp10(half2);
+half3 __ovld __cnfn exp10(half3);
+half4 __ovld __cnfn exp10(half4);
+half8 __ovld __cnfn exp10(half8);
+half16 __ovld __cnfn exp10(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute e^x - 1.0.
+ */
+float __ovld __cnfn expm1(float x);
+float2 __ovld __cnfn expm1(float2 x);
+float3 __ovld __cnfn expm1(float3 x);
+float4 __ovld __cnfn expm1(float4 x);
+float8 __ovld __cnfn expm1(float8 x);
+float16 __ovld __cnfn expm1(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn expm1(double x);
+double2 __ovld __cnfn expm1(double2 x);
+double3 __ovld __cnfn expm1(double3 x);
+double4 __ovld __cnfn expm1(double4 x);
+double8 __ovld __cnfn expm1(double8 x);
+double16 __ovld __cnfn expm1(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn expm1(half x);
+half2 __ovld __cnfn expm1(half2 x);
+half3 __ovld __cnfn expm1(half3 x);
+half4 __ovld __cnfn expm1(half4 x);
+half8 __ovld __cnfn expm1(half8 x);
+half16 __ovld __cnfn expm1(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute absolute value of a floating-point number.
+ */
+float __ovld __cnfn fabs(float);
+float2 __ovld __cnfn fabs(float2);
+float3 __ovld __cnfn fabs(float3);
+float4 __ovld __cnfn fabs(float4);
+float8 __ovld __cnfn fabs(float8);
+float16 __ovld __cnfn fabs(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fabs(double);
+double2 __ovld __cnfn fabs(double2);
+double3 __ovld __cnfn fabs(double3);
+double4 __ovld __cnfn fabs(double4);
+double8 __ovld __cnfn fabs(double8);
+double16 __ovld __cnfn fabs(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fabs(half);
+half2 __ovld __cnfn fabs(half2);
+half3 __ovld __cnfn fabs(half3);
+half4 __ovld __cnfn fabs(half4);
+half8 __ovld __cnfn fabs(half8);
+half16 __ovld __cnfn fabs(half16);
+#endif //cl_khr_fp16
+
+/**
+ * x - y if x > y, +0 if x is less than or equal to y.
+ */
+float __ovld __cnfn fdim(float x, float y);
+float2 __ovld __cnfn fdim(float2 x, float2 y);
+float3 __ovld __cnfn fdim(float3 x, float3 y);
+float4 __ovld __cnfn fdim(float4 x, float4 y);
+float8 __ovld __cnfn fdim(float8 x, float8 y);
+float16 __ovld __cnfn fdim(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fdim(double x, double y);
+double2 __ovld __cnfn fdim(double2 x, double2 y);
+double3 __ovld __cnfn fdim(double3 x, double3 y);
+double4 __ovld __cnfn fdim(double4 x, double4 y);
+double8 __ovld __cnfn fdim(double8 x, double8 y);
+double16 __ovld __cnfn fdim(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fdim(half x, half y);
+half2 __ovld __cnfn fdim(half2 x, half2 y);
+half3 __ovld __cnfn fdim(half3 x, half3 y);
+half4 __ovld __cnfn fdim(half4 x, half4 y);
+half8 __ovld __cnfn fdim(half8 x, half8 y);
+half16 __ovld __cnfn fdim(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to negative
+ * infinity rounding mode.
+ */
+float __ovld __cnfn floor(float);
+float2 __ovld __cnfn floor(float2);
+float3 __ovld __cnfn floor(float3);
+float4 __ovld __cnfn floor(float4);
+float8 __ovld __cnfn floor(float8);
+float16 __ovld __cnfn floor(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn floor(double);
+double2 __ovld __cnfn floor(double2);
+double3 __ovld __cnfn floor(double3);
+double4 __ovld __cnfn floor(double4);
+double8 __ovld __cnfn floor(double8);
+double16 __ovld __cnfn floor(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn floor(half);
+half2 __ovld __cnfn floor(half2);
+half3 __ovld __cnfn floor(half3);
+half4 __ovld __cnfn floor(half4);
+half8 __ovld __cnfn floor(half8);
+half16 __ovld __cnfn floor(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the correctly rounded floating-point
+ * representation of the sum of c with the infinitely
+ * precise product of a and b. Rounding of
+ * intermediate products shall not occur. Edge case
+ * behavior is per the IEEE 754-2008 standard.
+ */
+float __ovld __cnfn fma(float a, float b, float c);
+float2 __ovld __cnfn fma(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn fma(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn fma(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn fma(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn fma(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fma(double a, double b, double c);
+double2 __ovld __cnfn fma(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn fma(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn fma(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn fma(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn fma(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fma(half a, half b, half c);
+half2 __ovld __cnfn fma(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn fma(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn fma(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn fma(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn fma(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if x < y, otherwise it returns x. If one
+ * argument is a NaN, fmax() returns the other
+ * argument. If both arguments are NaNs, fmax()
+ * returns a NaN.
+ */
+float __ovld __cnfn fmax(float x, float y);
+float2 __ovld __cnfn fmax(float2 x, float2 y);
+float3 __ovld __cnfn fmax(float3 x, float3 y);
+float4 __ovld __cnfn fmax(float4 x, float4 y);
+float8 __ovld __cnfn fmax(float8 x, float8 y);
+float16 __ovld __cnfn fmax(float16 x, float16 y);
+float2 __ovld __cnfn fmax(float2 x, float y);
+float3 __ovld __cnfn fmax(float3 x, float y);
+float4 __ovld __cnfn fmax(float4 x, float y);
+float8 __ovld __cnfn fmax(float8 x, float y);
+float16 __ovld __cnfn fmax(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmax(double x, double y);
+double2 __ovld __cnfn fmax(double2 x, double2 y);
+double3 __ovld __cnfn fmax(double3 x, double3 y);
+double4 __ovld __cnfn fmax(double4 x, double4 y);
+double8 __ovld __cnfn fmax(double8 x, double8 y);
+double16 __ovld __cnfn fmax(double16 x, double16 y);
+double2 __ovld __cnfn fmax(double2 x, double y);
+double3 __ovld __cnfn fmax(double3 x, double y);
+double4 __ovld __cnfn fmax(double4 x, double y);
+double8 __ovld __cnfn fmax(double8 x, double y);
+double16 __ovld __cnfn fmax(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmax(half x, half y);
+half2 __ovld __cnfn fmax(half2 x, half2 y);
+half3 __ovld __cnfn fmax(half3 x, half3 y);
+half4 __ovld __cnfn fmax(half4 x, half4 y);
+half8 __ovld __cnfn fmax(half8 x, half8 y);
+half16 __ovld __cnfn fmax(half16 x, half16 y);
+half2 __ovld __cnfn fmax(half2 x, half y);
+half3 __ovld __cnfn fmax(half3 x, half y);
+half4 __ovld __cnfn fmax(half4 x, half y);
+half8 __ovld __cnfn fmax(half8 x, half y);
+half16 __ovld __cnfn fmax(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if y < x, otherwise it returns x. If one
+ * argument is a NaN, fmin() returns the other
+ * argument. If both arguments are NaNs, fmin()
+ * returns a NaN.
+ */
+float __ovld __cnfn fmin(float x, float y);
+float2 __ovld __cnfn fmin(float2 x, float2 y);
+float3 __ovld __cnfn fmin(float3 x, float3 y);
+float4 __ovld __cnfn fmin(float4 x, float4 y);
+float8 __ovld __cnfn fmin(float8 x, float8 y);
+float16 __ovld __cnfn fmin(float16 x, float16 y);
+float2 __ovld __cnfn fmin(float2 x, float y);
+float3 __ovld __cnfn fmin(float3 x, float y);
+float4 __ovld __cnfn fmin(float4 x, float y);
+float8 __ovld __cnfn fmin(float8 x, float y);
+float16 __ovld __cnfn fmin(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmin(double x, double y);
+double2 __ovld __cnfn fmin(double2 x, double2 y);
+double3 __ovld __cnfn fmin(double3 x, double3 y);
+double4 __ovld __cnfn fmin(double4 x, double4 y);
+double8 __ovld __cnfn fmin(double8 x, double8 y);
+double16 __ovld __cnfn fmin(double16 x, double16 y);
+double2 __ovld __cnfn fmin(double2 x, double y);
+double3 __ovld __cnfn fmin(double3 x, double y);
+double4 __ovld __cnfn fmin(double4 x, double y);
+double8 __ovld __cnfn fmin(double8 x, double y);
+double16 __ovld __cnfn fmin(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmin(half x, half y);
+half2 __ovld __cnfn fmin(half2 x, half2 y);
+half3 __ovld __cnfn fmin(half3 x, half3 y);
+half4 __ovld __cnfn fmin(half4 x, half4 y);
+half8 __ovld __cnfn fmin(half8 x, half8 y);
+half16 __ovld __cnfn fmin(half16 x, half16 y);
+half2 __ovld __cnfn fmin(half2 x, half y);
+half3 __ovld __cnfn fmin(half3 x, half y);
+half4 __ovld __cnfn fmin(half4 x, half y);
+half8 __ovld __cnfn fmin(half8 x, half y);
+half16 __ovld __cnfn fmin(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Modulus. Returns x - y * trunc (x/y).
+ */
+float __ovld __cnfn fmod(float x, float y);
+float2 __ovld __cnfn fmod(float2 x, float2 y);
+float3 __ovld __cnfn fmod(float3 x, float3 y);
+float4 __ovld __cnfn fmod(float4 x, float4 y);
+float8 __ovld __cnfn fmod(float8 x, float8 y);
+float16 __ovld __cnfn fmod(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmod(double x, double y);
+double2 __ovld __cnfn fmod(double2 x, double2 y);
+double3 __ovld __cnfn fmod(double3 x, double3 y);
+double4 __ovld __cnfn fmod(double4 x, double4 y);
+double8 __ovld __cnfn fmod(double8 x, double8 y);
+double16 __ovld __cnfn fmod(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmod(half x, half y);
+half2 __ovld __cnfn fmod(half2 x, half2 y);
+half3 __ovld __cnfn fmod(half3 x, half3 y);
+half4 __ovld __cnfn fmod(half4 x, half4 y);
+half8 __ovld __cnfn fmod(half8 x, half8 y);
+half16 __ovld __cnfn fmod(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns fmin(x - floor(x), 0x1.fffffep-1f).
+ * floor(x) is stored in the object pointed to by iptr.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld fract(float x, float *iptr);
+float2 __ovld fract(float2 x, float2 *iptr);
+float3 __ovld fract(float3 x, float3 *iptr);
+float4 __ovld fract(float4 x, float4 *iptr);
+float8 __ovld fract(float8 x, float8 *iptr);
+float16 __ovld fract(float16 x, float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld fract(double x, double *iptr);
+double2 __ovld fract(double2 x, double2 *iptr);
+double3 __ovld fract(double3 x, double3 *iptr);
+double4 __ovld fract(double4 x, double4 *iptr);
+double8 __ovld fract(double8 x, double8 *iptr);
+double16 __ovld fract(double16 x, double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld fract(half x, half *iptr);
+half2 __ovld fract(half2 x, half2 *iptr);
+half3 __ovld fract(half3 x, half3 *iptr);
+half4 __ovld fract(half4 x, half4 *iptr);
+half8 __ovld fract(half8 x, half8 *iptr);
+half16 __ovld fract(half16 x, half16 *iptr);
+#endif //cl_khr_fp16
+#else
+float __ovld fract(float x, __global float *iptr);
+float2 __ovld fract(float2 x, __global float2 *iptr);
+float3 __ovld fract(float3 x, __global float3 *iptr);
+float4 __ovld fract(float4 x, __global float4 *iptr);
+float8 __ovld fract(float8 x, __global float8 *iptr);
+float16 __ovld fract(float16 x, __global float16 *iptr);
+float __ovld fract(float x, __local float *iptr);
+float2 __ovld fract(float2 x, __local float2 *iptr);
+float3 __ovld fract(float3 x, __local float3 *iptr);
+float4 __ovld fract(float4 x, __local float4 *iptr);
+float8 __ovld fract(float8 x, __local float8 *iptr);
+float16 __ovld fract(float16 x, __local float16 *iptr);
+float __ovld fract(float x, __private float *iptr);
+float2 __ovld fract(float2 x, __private float2 *iptr);
+float3 __ovld fract(float3 x, __private float3 *iptr);
+float4 __ovld fract(float4 x, __private float4 *iptr);
+float8 __ovld fract(float8 x, __private float8 *iptr);
+float16 __ovld fract(float16 x, __private float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld fract(double x, __global double *iptr);
+double2 __ovld fract(double2 x, __global double2 *iptr);
+double3 __ovld fract(double3 x, __global double3 *iptr);
+double4 __ovld fract(double4 x, __global double4 *iptr);
+double8 __ovld fract(double8 x, __global double8 *iptr);
+double16 __ovld fract(double16 x, __global double16 *iptr);
+double __ovld fract(double x, __local double *iptr);
+double2 __ovld fract(double2 x, __local double2 *iptr);
+double3 __ovld fract(double3 x, __local double3 *iptr);
+double4 __ovld fract(double4 x, __local double4 *iptr);
+double8 __ovld fract(double8 x, __local double8 *iptr);
+double16 __ovld fract(double16 x, __local double16 *iptr);
+double __ovld fract(double x, __private double *iptr);
+double2 __ovld fract(double2 x, __private double2 *iptr);
+double3 __ovld fract(double3 x, __private double3 *iptr);
+double4 __ovld fract(double4 x, __private double4 *iptr);
+double8 __ovld fract(double8 x, __private double8 *iptr);
+double16 __ovld fract(double16 x, __private double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld fract(half x, __global half *iptr);
+half2 __ovld fract(half2 x, __global half2 *iptr);
+half3 __ovld fract(half3 x, __global half3 *iptr);
+half4 __ovld fract(half4 x, __global half4 *iptr);
+half8 __ovld fract(half8 x, __global half8 *iptr);
+half16 __ovld fract(half16 x, __global half16 *iptr);
+half __ovld fract(half x, __local half *iptr);
+half2 __ovld fract(half2 x, __local half2 *iptr);
+half3 __ovld fract(half3 x, __local half3 *iptr);
+half4 __ovld fract(half4 x, __local half4 *iptr);
+half8 __ovld fract(half8 x, __local half8 *iptr);
+half16 __ovld fract(half16 x, __local half16 *iptr);
+half __ovld fract(half x, __private half *iptr);
+half2 __ovld fract(half2 x, __private half2 *iptr);
+half3 __ovld fract(half3 x, __private half3 *iptr);
+half4 __ovld fract(half4 x, __private half4 *iptr);
+half8 __ovld fract(half8 x, __private half8 *iptr);
+half16 __ovld fract(half16 x, __private half16 *iptr);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Extract mantissa and exponent from x. For each
+ * component the mantissa returned has the type of x and
+ * a magnitude in the interval [1/2, 1) or 0. Each
+ * component of x equals the returned mantissa * 2^exp.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld frexp(float x, int *exp);
+float2 __ovld frexp(float2 x, int2 *exp);
+float3 __ovld frexp(float3 x, int3 *exp);
+float4 __ovld frexp(float4 x, int4 *exp);
+float8 __ovld frexp(float8 x, int8 *exp);
+float16 __ovld frexp(float16 x, int16 *exp);
+#ifdef cl_khr_fp64
+double __ovld frexp(double x, int *exp);
+double2 __ovld frexp(double2 x, int2 *exp);
+double3 __ovld frexp(double3 x, int3 *exp);
+double4 __ovld frexp(double4 x, int4 *exp);
+double8 __ovld frexp(double8 x, int8 *exp);
+double16 __ovld frexp(double16 x, int16 *exp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld frexp(half x, int *exp);
+half2 __ovld frexp(half2 x, int2 *exp);
+half3 __ovld frexp(half3 x, int3 *exp);
+half4 __ovld frexp(half4 x, int4 *exp);
+half8 __ovld frexp(half8 x, int8 *exp);
+half16 __ovld frexp(half16 x, int16 *exp);
+#endif //cl_khr_fp16
+#else
+float __ovld frexp(float x, __global int *exp);
+float2 __ovld frexp(float2 x, __global int2 *exp);
+float3 __ovld frexp(float3 x, __global int3 *exp);
+float4 __ovld frexp(float4 x, __global int4 *exp);
+float8 __ovld frexp(float8 x, __global int8 *exp);
+float16 __ovld frexp(float16 x, __global int16 *exp);
+float __ovld frexp(float x, __local int *exp);
+float2 __ovld frexp(float2 x, __local int2 *exp);
+float3 __ovld frexp(float3 x, __local int3 *exp);
+float4 __ovld frexp(float4 x, __local int4 *exp);
+float8 __ovld frexp(float8 x, __local int8 *exp);
+float16 __ovld frexp(float16 x, __local int16 *exp);
+float __ovld frexp(float x, __private int *exp);
+float2 __ovld frexp(float2 x, __private int2 *exp);
+float3 __ovld frexp(float3 x, __private int3 *exp);
+float4 __ovld frexp(float4 x, __private int4 *exp);
+float8 __ovld frexp(float8 x, __private int8 *exp);
+float16 __ovld frexp(float16 x, __private int16 *exp);
+#ifdef cl_khr_fp64
+double __ovld frexp(double x, __global int *exp);
+double2 __ovld frexp(double2 x, __global int2 *exp);
+double3 __ovld frexp(double3 x, __global int3 *exp);
+double4 __ovld frexp(double4 x, __global int4 *exp);
+double8 __ovld frexp(double8 x, __global int8 *exp);
+double16 __ovld frexp(double16 x, __global int16 *exp);
+double __ovld frexp(double x, __local int *exp);
+double2 __ovld frexp(double2 x, __local int2 *exp);
+double3 __ovld frexp(double3 x, __local int3 *exp);
+double4 __ovld frexp(double4 x, __local int4 *exp);
+double8 __ovld frexp(double8 x, __local int8 *exp);
+double16 __ovld frexp(double16 x, __local int16 *exp);
+double __ovld frexp(double x, __private int *exp);
+double2 __ovld frexp(double2 x, __private int2 *exp);
+double3 __ovld frexp(double3 x, __private int3 *exp);
+double4 __ovld frexp(double4 x, __private int4 *exp);
+double8 __ovld frexp(double8 x, __private int8 *exp);
+double16 __ovld frexp(double16 x, __private int16 *exp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld frexp(half x, __global int *exp);
+half2 __ovld frexp(half2 x, __global int2 *exp);
+half3 __ovld frexp(half3 x, __global int3 *exp);
+half4 __ovld frexp(half4 x, __global int4 *exp);
+half8 __ovld frexp(half8 x, __global int8 *exp);
+half16 __ovld frexp(half16 x, __global int16 *exp);
+half __ovld frexp(half x, __local int *exp);
+half2 __ovld frexp(half2 x, __local int2 *exp);
+half3 __ovld frexp(half3 x, __local int3 *exp);
+half4 __ovld frexp(half4 x, __local int4 *exp);
+half8 __ovld frexp(half8 x, __local int8 *exp);
+half16 __ovld frexp(half16 x, __local int16 *exp);
+half __ovld frexp(half x, __private int *exp);
+half2 __ovld frexp(half2 x, __private int2 *exp);
+half3 __ovld frexp(half3 x, __private int3 *exp);
+half4 __ovld frexp(half4 x, __private int4 *exp);
+half8 __ovld frexp(half8 x, __private int8 *exp);
+half16 __ovld frexp(half16 x, __private int16 *exp);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute the value of the square root of x^2 + y^2
+ * without undue overflow or underflow.
+ */
+float __ovld __cnfn hypot(float x, float y);
+float2 __ovld __cnfn hypot(float2 x, float2 y);
+float3 __ovld __cnfn hypot(float3 x, float3 y);
+float4 __ovld __cnfn hypot(float4 x, float4 y);
+float8 __ovld __cnfn hypot(float8 x, float8 y);
+float16 __ovld __cnfn hypot(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn hypot(double x, double y);
+double2 __ovld __cnfn hypot(double2 x, double2 y);
+double3 __ovld __cnfn hypot(double3 x, double3 y);
+double4 __ovld __cnfn hypot(double4 x, double4 y);
+double8 __ovld __cnfn hypot(double8 x, double8 y);
+double16 __ovld __cnfn hypot(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn hypot(half x, half y);
+half2 __ovld __cnfn hypot(half2 x, half2 y);
+half3 __ovld __cnfn hypot(half3 x, half3 y);
+half4 __ovld __cnfn hypot(half4 x, half4 y);
+half8 __ovld __cnfn hypot(half8 x, half8 y);
+half16 __ovld __cnfn hypot(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Return the exponent as an integer value.
+ */
+int __ovld __cnfn ilogb(float x);
+int2 __ovld __cnfn ilogb(float2 x);
+int3 __ovld __cnfn ilogb(float3 x);
+int4 __ovld __cnfn ilogb(float4 x);
+int8 __ovld __cnfn ilogb(float8 x);
+int16 __ovld __cnfn ilogb(float16 x);
+#ifdef cl_khr_fp64
+int __ovld __cnfn ilogb(double x);
+int2 __ovld __cnfn ilogb(double2 x);
+int3 __ovld __cnfn ilogb(double3 x);
+int4 __ovld __cnfn ilogb(double4 x);
+int8 __ovld __cnfn ilogb(double8 x);
+int16 __ovld __cnfn ilogb(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn ilogb(half x);
+int2 __ovld __cnfn ilogb(half2 x);
+int3 __ovld __cnfn ilogb(half3 x);
+int4 __ovld __cnfn ilogb(half4 x);
+int8 __ovld __cnfn ilogb(half8 x);
+int16 __ovld __cnfn ilogb(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Multiply x by 2 to the power n.
+ */
+float __ovld __cnfn ldexp(float x, int n);
+float2 __ovld __cnfn ldexp(float2 x, int2 n);
+float3 __ovld __cnfn ldexp(float3 x, int3 n);
+float4 __ovld __cnfn ldexp(float4 x, int4 n);
+float8 __ovld __cnfn ldexp(float8 x, int8 n);
+float16 __ovld __cnfn ldexp(float16 x, int16 n);
+float2 __ovld __cnfn ldexp(float2 x, int n);
+float3 __ovld __cnfn ldexp(float3 x, int n);
+float4 __ovld __cnfn ldexp(float4 x, int n);
+float8 __ovld __cnfn ldexp(float8 x, int n);
+float16 __ovld __cnfn ldexp(float16 x, int n);
+#ifdef cl_khr_fp64
+double __ovld __cnfn ldexp(double x, int n);
+double2 __ovld __cnfn ldexp(double2 x, int2 n);
+double3 __ovld __cnfn ldexp(double3 x, int3 n);
+double4 __ovld __cnfn ldexp(double4 x, int4 n);
+double8 __ovld __cnfn ldexp(double8 x, int8 n);
+double16 __ovld __cnfn ldexp(double16 x, int16 n);
+double2 __ovld __cnfn ldexp(double2 x, int n);
+double3 __ovld __cnfn ldexp(double3 x, int n);
+double4 __ovld __cnfn ldexp(double4 x, int n);
+double8 __ovld __cnfn ldexp(double8 x, int n);
+double16 __ovld __cnfn ldexp(double16 x, int n);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn ldexp(half x, int n);
+half2 __ovld __cnfn ldexp(half2 x, int2 n);
+half3 __ovld __cnfn ldexp(half3 x, int3 n);
+half4 __ovld __cnfn ldexp(half4 x, int4 n);
+half8 __ovld __cnfn ldexp(half8 x, int8 n);
+half16 __ovld __cnfn ldexp(half16 x, int16 n);
+half2 __ovld __cnfn ldexp(half2 x, int n);
+half3 __ovld __cnfn ldexp(half3 x, int n);
+half4 __ovld __cnfn ldexp(half4 x, int n);
+half8 __ovld __cnfn ldexp(half8 x, int n);
+half16 __ovld __cnfn ldexp(half16 x, int n);
+#endif //cl_khr_fp16
+
+/**
+ * Log gamma function. Returns the natural
+ * logarithm of the absolute value of the gamma
+ * function. The sign of the gamma function is
+ * returned in the signp argument of lgamma_r.
+ */
+float __ovld __cnfn lgamma(float x);
+float2 __ovld __cnfn lgamma(float2 x);
+float3 __ovld __cnfn lgamma(float3 x);
+float4 __ovld __cnfn lgamma(float4 x);
+float8 __ovld __cnfn lgamma(float8 x);
+float16 __ovld __cnfn lgamma(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn lgamma(double x);
+double2 __ovld __cnfn lgamma(double2 x);
+double3 __ovld __cnfn lgamma(double3 x);
+double4 __ovld __cnfn lgamma(double4 x);
+double8 __ovld __cnfn lgamma(double8 x);
+double16 __ovld __cnfn lgamma(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn lgamma(half x);
+half2 __ovld __cnfn lgamma(half2 x);
+half3 __ovld __cnfn lgamma(half3 x);
+half4 __ovld __cnfn lgamma(half4 x);
+half8 __ovld __cnfn lgamma(half8 x);
+half16 __ovld __cnfn lgamma(half16 x);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld lgamma_r(float x, int *signp);
+float2 __ovld lgamma_r(float2 x, int2 *signp);
+float3 __ovld lgamma_r(float3 x, int3 *signp);
+float4 __ovld lgamma_r(float4 x, int4 *signp);
+float8 __ovld lgamma_r(float8 x, int8 *signp);
+float16 __ovld lgamma_r(float16 x, int16 *signp);
+#ifdef cl_khr_fp64
+double __ovld lgamma_r(double x, int *signp);
+double2 __ovld lgamma_r(double2 x, int2 *signp);
+double3 __ovld lgamma_r(double3 x, int3 *signp);
+double4 __ovld lgamma_r(double4 x, int4 *signp);
+double8 __ovld lgamma_r(double8 x, int8 *signp);
+double16 __ovld lgamma_r(double16 x, int16 *signp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld lgamma_r(half x, int *signp);
+half2 __ovld lgamma_r(half2 x, int2 *signp);
+half3 __ovld lgamma_r(half3 x, int3 *signp);
+half4 __ovld lgamma_r(half4 x, int4 *signp);
+half8 __ovld lgamma_r(half8 x, int8 *signp);
+half16 __ovld lgamma_r(half16 x, int16 *signp);
+#endif //cl_khr_fp16
+#else
+float __ovld lgamma_r(float x, __global int *signp);
+float2 __ovld lgamma_r(float2 x, __global int2 *signp);
+float3 __ovld lgamma_r(float3 x, __global int3 *signp);
+float4 __ovld lgamma_r(float4 x, __global int4 *signp);
+float8 __ovld lgamma_r(float8 x, __global int8 *signp);
+float16 __ovld lgamma_r(float16 x, __global int16 *signp);
+float __ovld lgamma_r(float x, __local int *signp);
+float2 __ovld lgamma_r(float2 x, __local int2 *signp);
+float3 __ovld lgamma_r(float3 x, __local int3 *signp);
+float4 __ovld lgamma_r(float4 x, __local int4 *signp);
+float8 __ovld lgamma_r(float8 x, __local int8 *signp);
+float16 __ovld lgamma_r(float16 x, __local int16 *signp);
+float __ovld lgamma_r(float x, __private int *signp);
+float2 __ovld lgamma_r(float2 x, __private int2 *signp);
+float3 __ovld lgamma_r(float3 x, __private int3 *signp);
+float4 __ovld lgamma_r(float4 x, __private int4 *signp);
+float8 __ovld lgamma_r(float8 x, __private int8 *signp);
+float16 __ovld lgamma_r(float16 x, __private int16 *signp);
+#ifdef cl_khr_fp64
+double __ovld lgamma_r(double x, __global int *signp);
+double2 __ovld lgamma_r(double2 x, __global int2 *signp);
+double3 __ovld lgamma_r(double3 x, __global int3 *signp);
+double4 __ovld lgamma_r(double4 x, __global int4 *signp);
+double8 __ovld lgamma_r(double8 x, __global int8 *signp);
+double16 __ovld lgamma_r(double16 x, __global int16 *signp);
+double __ovld lgamma_r(double x, __local int *signp);
+double2 __ovld lgamma_r(double2 x, __local int2 *signp);
+double3 __ovld lgamma_r(double3 x, __local int3 *signp);
+double4 __ovld lgamma_r(double4 x, __local int4 *signp);
+double8 __ovld lgamma_r(double8 x, __local int8 *signp);
+double16 __ovld lgamma_r(double16 x, __local int16 *signp);
+double __ovld lgamma_r(double x, __private int *signp);
+double2 __ovld lgamma_r(double2 x, __private int2 *signp);
+double3 __ovld lgamma_r(double3 x, __private int3 *signp);
+double4 __ovld lgamma_r(double4 x, __private int4 *signp);
+double8 __ovld lgamma_r(double8 x, __private int8 *signp);
+double16 __ovld lgamma_r(double16 x, __private int16 *signp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld lgamma_r(half x, __global int *signp);
+half2 __ovld lgamma_r(half2 x, __global int2 *signp);
+half3 __ovld lgamma_r(half3 x, __global int3 *signp);
+half4 __ovld lgamma_r(half4 x, __global int4 *signp);
+half8 __ovld lgamma_r(half8 x, __global int8 *signp);
+half16 __ovld lgamma_r(half16 x, __global int16 *signp);
+half __ovld lgamma_r(half x, __local int *signp);
+half2 __ovld lgamma_r(half2 x, __local int2 *signp);
+half3 __ovld lgamma_r(half3 x, __local int3 *signp);
+half4 __ovld lgamma_r(half4 x, __local int4 *signp);
+half8 __ovld lgamma_r(half8 x, __local int8 *signp);
+half16 __ovld lgamma_r(half16 x, __local int16 *signp);
+half __ovld lgamma_r(half x, __private int *signp);
+half2 __ovld lgamma_r(half2 x, __private int2 *signp);
+half3 __ovld lgamma_r(half3 x, __private int3 *signp);
+half4 __ovld lgamma_r(half4 x, __private int4 *signp);
+half8 __ovld lgamma_r(half8 x, __private int8 *signp);
+half16 __ovld lgamma_r(half16 x, __private int16 *signp);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute natural logarithm.
+ */
+float __ovld __cnfn log(float);
+float2 __ovld __cnfn log(float2);
+float3 __ovld __cnfn log(float3);
+float4 __ovld __cnfn log(float4);
+float8 __ovld __cnfn log(float8);
+float16 __ovld __cnfn log(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log(double);
+double2 __ovld __cnfn log(double2);
+double3 __ovld __cnfn log(double3);
+double4 __ovld __cnfn log(double4);
+double8 __ovld __cnfn log(double8);
+double16 __ovld __cnfn log(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log(half);
+half2 __ovld __cnfn log(half2);
+half3 __ovld __cnfn log(half3);
+half4 __ovld __cnfn log(half4);
+half8 __ovld __cnfn log(half8);
+half16 __ovld __cnfn log(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base 2 logarithm.
+ */
+float __ovld __cnfn log2(float);
+float2 __ovld __cnfn log2(float2);
+float3 __ovld __cnfn log2(float3);
+float4 __ovld __cnfn log2(float4);
+float8 __ovld __cnfn log2(float8);
+float16 __ovld __cnfn log2(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log2(double);
+double2 __ovld __cnfn log2(double2);
+double3 __ovld __cnfn log2(double3);
+double4 __ovld __cnfn log2(double4);
+double8 __ovld __cnfn log2(double8);
+double16 __ovld __cnfn log2(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log2(half);
+half2 __ovld __cnfn log2(half2);
+half3 __ovld __cnfn log2(half3);
+half4 __ovld __cnfn log2(half4);
+half8 __ovld __cnfn log2(half8);
+half16 __ovld __cnfn log2(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base 10 logarithm.
+ */
+float __ovld __cnfn log10(float);
+float2 __ovld __cnfn log10(float2);
+float3 __ovld __cnfn log10(float3);
+float4 __ovld __cnfn log10(float4);
+float8 __ovld __cnfn log10(float8);
+float16 __ovld __cnfn log10(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log10(double);
+double2 __ovld __cnfn log10(double2);
+double3 __ovld __cnfn log10(double3);
+double4 __ovld __cnfn log10(double4);
+double8 __ovld __cnfn log10(double8);
+double16 __ovld __cnfn log10(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log10(half);
+half2 __ovld __cnfn log10(half2);
+half3 __ovld __cnfn log10(half3);
+half4 __ovld __cnfn log10(half4);
+half8 __ovld __cnfn log10(half8);
+half16 __ovld __cnfn log10(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base e logarithm of (1.0 + x).
+ */
+float __ovld __cnfn log1p(float x);
+float2 __ovld __cnfn log1p(float2 x);
+float3 __ovld __cnfn log1p(float3 x);
+float4 __ovld __cnfn log1p(float4 x);
+float8 __ovld __cnfn log1p(float8 x);
+float16 __ovld __cnfn log1p(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log1p(double x);
+double2 __ovld __cnfn log1p(double2 x);
+double3 __ovld __cnfn log1p(double3 x);
+double4 __ovld __cnfn log1p(double4 x);
+double8 __ovld __cnfn log1p(double8 x);
+double16 __ovld __cnfn log1p(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log1p(half x);
+half2 __ovld __cnfn log1p(half2 x);
+half3 __ovld __cnfn log1p(half3 x);
+half4 __ovld __cnfn log1p(half4 x);
+half8 __ovld __cnfn log1p(half8 x);
+half16 __ovld __cnfn log1p(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the exponent of x, which is the integral
+ * part of log_r |x|, where r is the floating-point radix.
+ */
+float __ovld __cnfn logb(float x);
+float2 __ovld __cnfn logb(float2 x);
+float3 __ovld __cnfn logb(float3 x);
+float4 __ovld __cnfn logb(float4 x);
+float8 __ovld __cnfn logb(float8 x);
+float16 __ovld __cnfn logb(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn logb(double x);
+double2 __ovld __cnfn logb(double2 x);
+double3 __ovld __cnfn logb(double3 x);
+double4 __ovld __cnfn logb(double4 x);
+double8 __ovld __cnfn logb(double8 x);
+double16 __ovld __cnfn logb(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn logb(half x);
+half2 __ovld __cnfn logb(half2 x);
+half3 __ovld __cnfn logb(half3 x);
+half4 __ovld __cnfn logb(half4 x);
+half8 __ovld __cnfn logb(half8 x);
+half16 __ovld __cnfn logb(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * mad approximates a * b + c. Whether or how the
+ * product of a * b is rounded and how supernormal or
+ * subnormal intermediate products are handled is not
+ * defined. mad is intended to be used where speed is
+ * preferred over accuracy.
+ */
+float __ovld __cnfn mad(float a, float b, float c);
+float2 __ovld __cnfn mad(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn mad(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn mad(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn mad(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn mad(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn mad(double a, double b, double c);
+double2 __ovld __cnfn mad(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn mad(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn mad(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn mad(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn mad(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn mad(half a, half b, half c);
+half2 __ovld __cnfn mad(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn mad(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn mad(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn mad(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn mad(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x if | x | > | y |, y if | y | > | x |, otherwise
+ * fmax(x, y).
+ */
+float __ovld __cnfn maxmag(float x, float y);
+float2 __ovld __cnfn maxmag(float2 x, float2 y);
+float3 __ovld __cnfn maxmag(float3 x, float3 y);
+float4 __ovld __cnfn maxmag(float4 x, float4 y);
+float8 __ovld __cnfn maxmag(float8 x, float8 y);
+float16 __ovld __cnfn maxmag(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn maxmag(double x, double y);
+double2 __ovld __cnfn maxmag(double2 x, double2 y);
+double3 __ovld __cnfn maxmag(double3 x, double3 y);
+double4 __ovld __cnfn maxmag(double4 x, double4 y);
+double8 __ovld __cnfn maxmag(double8 x, double8 y);
+double16 __ovld __cnfn maxmag(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn maxmag(half x, half y);
+half2 __ovld __cnfn maxmag(half2 x, half2 y);
+half3 __ovld __cnfn maxmag(half3 x, half3 y);
+half4 __ovld __cnfn maxmag(half4 x, half4 y);
+half8 __ovld __cnfn maxmag(half8 x, half8 y);
+half16 __ovld __cnfn maxmag(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x if | x | < | y |, y if | y | < | x |, otherwise
+ * fmin(x, y).
+ */
+float __ovld __cnfn minmag(float x, float y);
+float2 __ovld __cnfn minmag(float2 x, float2 y);
+float3 __ovld __cnfn minmag(float3 x, float3 y);
+float4 __ovld __cnfn minmag(float4 x, float4 y);
+float8 __ovld __cnfn minmag(float8 x, float8 y);
+float16 __ovld __cnfn minmag(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn minmag(double x, double y);
+double2 __ovld __cnfn minmag(double2 x, double2 y);
+double3 __ovld __cnfn minmag(double3 x, double3 y);
+double4 __ovld __cnfn minmag(double4 x, double4 y);
+double8 __ovld __cnfn minmag(double8 x, double8 y);
+double16 __ovld __cnfn minmag(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn minmag(half x, half y);
+half2 __ovld __cnfn minmag(half2 x, half2 y);
+half3 __ovld __cnfn minmag(half3 x, half3 y);
+half4 __ovld __cnfn minmag(half4 x, half4 y);
+half8 __ovld __cnfn minmag(half8 x, half8 y);
+half16 __ovld __cnfn minmag(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Decompose a floating-point number. The modf
+ * function breaks the argument x into integral and
+ * fractional parts, each of which has the same sign as
+ * the argument. It stores the integral part in the object
+ * pointed to by iptr.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld modf(float x, float *iptr);
+float2 __ovld modf(float2 x, float2 *iptr);
+float3 __ovld modf(float3 x, float3 *iptr);
+float4 __ovld modf(float4 x, float4 *iptr);
+float8 __ovld modf(float8 x, float8 *iptr);
+float16 __ovld modf(float16 x, float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld modf(double x, double *iptr);
+double2 __ovld modf(double2 x, double2 *iptr);
+double3 __ovld modf(double3 x, double3 *iptr);
+double4 __ovld modf(double4 x, double4 *iptr);
+double8 __ovld modf(double8 x, double8 *iptr);
+double16 __ovld modf(double16 x, double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld modf(half x, half *iptr);
+half2 __ovld modf(half2 x, half2 *iptr);
+half3 __ovld modf(half3 x, half3 *iptr);
+half4 __ovld modf(half4 x, half4 *iptr);
+half8 __ovld modf(half8 x, half8 *iptr);
+half16 __ovld modf(half16 x, half16 *iptr);
+#endif //cl_khr_fp16
+#else
+float __ovld modf(float x, __global float *iptr);
+float2 __ovld modf(float2 x, __global float2 *iptr);
+float3 __ovld modf(float3 x, __global float3 *iptr);
+float4 __ovld modf(float4 x, __global float4 *iptr);
+float8 __ovld modf(float8 x, __global float8 *iptr);
+float16 __ovld modf(float16 x, __global float16 *iptr);
+float __ovld modf(float x, __local float *iptr);
+float2 __ovld modf(float2 x, __local float2 *iptr);
+float3 __ovld modf(float3 x, __local float3 *iptr);
+float4 __ovld modf(float4 x, __local float4 *iptr);
+float8 __ovld modf(float8 x, __local float8 *iptr);
+float16 __ovld modf(float16 x, __local float16 *iptr);
+float __ovld modf(float x, __private float *iptr);
+float2 __ovld modf(float2 x, __private float2 *iptr);
+float3 __ovld modf(float3 x, __private float3 *iptr);
+float4 __ovld modf(float4 x, __private float4 *iptr);
+float8 __ovld modf(float8 x, __private float8 *iptr);
+float16 __ovld modf(float16 x, __private float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld modf(double x, __global double *iptr);
+double2 __ovld modf(double2 x, __global double2 *iptr);
+double3 __ovld modf(double3 x, __global double3 *iptr);
+double4 __ovld modf(double4 x, __global double4 *iptr);
+double8 __ovld modf(double8 x, __global double8 *iptr);
+double16 __ovld modf(double16 x, __global double16 *iptr);
+double __ovld modf(double x, __local double *iptr);
+double2 __ovld modf(double2 x, __local double2 *iptr);
+double3 __ovld modf(double3 x, __local double3 *iptr);
+double4 __ovld modf(double4 x, __local double4 *iptr);
+double8 __ovld modf(double8 x, __local double8 *iptr);
+double16 __ovld modf(double16 x, __local double16 *iptr);
+double __ovld modf(double x, __private double *iptr);
+double2 __ovld modf(double2 x, __private double2 *iptr);
+double3 __ovld modf(double3 x, __private double3 *iptr);
+double4 __ovld modf(double4 x, __private double4 *iptr);
+double8 __ovld modf(double8 x, __private double8 *iptr);
+double16 __ovld modf(double16 x, __private double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld modf(half x, __global half *iptr);
+half2 __ovld modf(half2 x, __global half2 *iptr);
+half3 __ovld modf(half3 x, __global half3 *iptr);
+half4 __ovld modf(half4 x, __global half4 *iptr);
+half8 __ovld modf(half8 x, __global half8 *iptr);
+half16 __ovld modf(half16 x, __global half16 *iptr);
+half __ovld modf(half x, __local half *iptr);
+half2 __ovld modf(half2 x, __local half2 *iptr);
+half3 __ovld modf(half3 x, __local half3 *iptr);
+half4 __ovld modf(half4 x, __local half4 *iptr);
+half8 __ovld modf(half8 x, __local half8 *iptr);
+half16 __ovld modf(half16 x, __local half16 *iptr);
+half __ovld modf(half x, __private half *iptr);
+half2 __ovld modf(half2 x, __private half2 *iptr);
+half3 __ovld modf(half3 x, __private half3 *iptr);
+half4 __ovld modf(half4 x, __private half4 *iptr);
+half8 __ovld modf(half8 x, __private half8 *iptr);
+half16 __ovld modf(half16 x, __private half16 *iptr);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Returns a quiet NaN. The nancode may be placed
+ * in the significand of the resulting NaN.
+ */
+float __ovld __cnfn nan(uint nancode);
+float2 __ovld __cnfn nan(uint2 nancode);
+float3 __ovld __cnfn nan(uint3 nancode);
+float4 __ovld __cnfn nan(uint4 nancode);
+float8 __ovld __cnfn nan(uint8 nancode);
+float16 __ovld __cnfn nan(uint16 nancode);
+#ifdef cl_khr_fp64
+double __ovld __cnfn nan(ulong nancode);
+double2 __ovld __cnfn nan(ulong2 nancode);
+double3 __ovld __cnfn nan(ulong3 nancode);
+double4 __ovld __cnfn nan(ulong4 nancode);
+double8 __ovld __cnfn nan(ulong8 nancode);
+double16 __ovld __cnfn nan(ulong16 nancode);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn nan(ushort nancode);
+half2 __ovld __cnfn nan(ushort2 nancode);
+half3 __ovld __cnfn nan(ushort3 nancode);
+half4 __ovld __cnfn nan(ushort4 nancode);
+half8 __ovld __cnfn nan(ushort8 nancode);
+half16 __ovld __cnfn nan(ushort16 nancode);
+#endif //cl_khr_fp16
+
+/**
+ * Computes the next representable floating-point value
+ * (in the precision of x) following x in the direction of
+ * y. Thus, if y is less than x, nextafter() returns the
+ * largest representable floating-point number less
+ * than x.
+ */
+float __ovld __cnfn nextafter(float x, float y);
+float2 __ovld __cnfn nextafter(float2 x, float2 y);
+float3 __ovld __cnfn nextafter(float3 x, float3 y);
+float4 __ovld __cnfn nextafter(float4 x, float4 y);
+float8 __ovld __cnfn nextafter(float8 x, float8 y);
+float16 __ovld __cnfn nextafter(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn nextafter(double x, double y);
+double2 __ovld __cnfn nextafter(double2 x, double2 y);
+double3 __ovld __cnfn nextafter(double3 x, double3 y);
+double4 __ovld __cnfn nextafter(double4 x, double4 y);
+double8 __ovld __cnfn nextafter(double8 x, double8 y);
+double16 __ovld __cnfn nextafter(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn nextafter(half x, half y);
+half2 __ovld __cnfn nextafter(half2 x, half2 y);
+half3 __ovld __cnfn nextafter(half3 x, half3 y);
+half4 __ovld __cnfn nextafter(half4 x, half4 y);
+half8 __ovld __cnfn nextafter(half8 x, half8 y);
+half16 __ovld __cnfn nextafter(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y.
+ */
+float __ovld __cnfn pow(float x, float y);
+float2 __ovld __cnfn pow(float2 x, float2 y);
+float3 __ovld __cnfn pow(float3 x, float3 y);
+float4 __ovld __cnfn pow(float4 x, float4 y);
+float8 __ovld __cnfn pow(float8 x, float8 y);
+float16 __ovld __cnfn pow(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn pow(double x, double y);
+double2 __ovld __cnfn pow(double2 x, double2 y);
+double3 __ovld __cnfn pow(double3 x, double3 y);
+double4 __ovld __cnfn pow(double4 x, double4 y);
+double8 __ovld __cnfn pow(double8 x, double8 y);
+double16 __ovld __cnfn pow(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn pow(half x, half y);
+half2 __ovld __cnfn pow(half2 x, half2 y);
+half3 __ovld __cnfn pow(half3 x, half3 y);
+half4 __ovld __cnfn pow(half4 x, half4 y);
+half8 __ovld __cnfn pow(half8 x, half8 y);
+half16 __ovld __cnfn pow(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y, where y is an integer.
+ */
+float __ovld __cnfn pown(float x, int y);
+float2 __ovld __cnfn pown(float2 x, int2 y);
+float3 __ovld __cnfn pown(float3 x, int3 y);
+float4 __ovld __cnfn pown(float4 x, int4 y);
+float8 __ovld __cnfn pown(float8 x, int8 y);
+float16 __ovld __cnfn pown(float16 x, int16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn pown(double x, int y);
+double2 __ovld __cnfn pown(double2 x, int2 y);
+double3 __ovld __cnfn pown(double3 x, int3 y);
+double4 __ovld __cnfn pown(double4 x, int4 y);
+double8 __ovld __cnfn pown(double8 x, int8 y);
+double16 __ovld __cnfn pown(double16 x, int16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn pown(half x, int y);
+half2 __ovld __cnfn pown(half2 x, int2 y);
+half3 __ovld __cnfn pown(half3 x, int3 y);
+half4 __ovld __cnfn pown(half4 x, int4 y);
+half8 __ovld __cnfn pown(half8 x, int8 y);
+half16 __ovld __cnfn pown(half16 x, int16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y, where x is >= 0.
+ */
+float __ovld __cnfn powr(float x, float y);
+float2 __ovld __cnfn powr(float2 x, float2 y);
+float3 __ovld __cnfn powr(float3 x, float3 y);
+float4 __ovld __cnfn powr(float4 x, float4 y);
+float8 __ovld __cnfn powr(float8 x, float8 y);
+float16 __ovld __cnfn powr(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn powr(double x, double y);
+double2 __ovld __cnfn powr(double2 x, double2 y);
+double3 __ovld __cnfn powr(double3 x, double3 y);
+double4 __ovld __cnfn powr(double4 x, double4 y);
+double8 __ovld __cnfn powr(double8 x, double8 y);
+double16 __ovld __cnfn powr(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn powr(half x, half y);
+half2 __ovld __cnfn powr(half2 x, half2 y);
+half3 __ovld __cnfn powr(half3 x, half3 y);
+half4 __ovld __cnfn powr(half4 x, half4 y);
+half8 __ovld __cnfn powr(half8 x, half8 y);
+half16 __ovld __cnfn powr(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the value r such that r = x - n*y, where n
+ * is the integer nearest the exact value of x/y. If there
+ * are two integers closest to x/y, n shall be the even
+ * one. If r is zero, it is given the same sign as x.
+ */
+float __ovld __cnfn remainder(float x, float y);
+float2 __ovld __cnfn remainder(float2 x, float2 y);
+float3 __ovld __cnfn remainder(float3 x, float3 y);
+float4 __ovld __cnfn remainder(float4 x, float4 y);
+float8 __ovld __cnfn remainder(float8 x, float8 y);
+float16 __ovld __cnfn remainder(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn remainder(double x, double y);
+double2 __ovld __cnfn remainder(double2 x, double2 y);
+double3 __ovld __cnfn remainder(double3 x, double3 y);
+double4 __ovld __cnfn remainder(double4 x, double4 y);
+double8 __ovld __cnfn remainder(double8 x, double8 y);
+double16 __ovld __cnfn remainder(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn remainder(half x, half y);
+half2 __ovld __cnfn remainder(half2 x, half2 y);
+half3 __ovld __cnfn remainder(half3 x, half3 y);
+half4 __ovld __cnfn remainder(half4 x, half4 y);
+half8 __ovld __cnfn remainder(half8 x, half8 y);
+half16 __ovld __cnfn remainder(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * The remquo function computes the value r such
+ * that r = x - n*y, where n is the integer nearest the
+ * exact value of x/y. If there are two integers closest
+ * to x/y, n shall be the even one. If r is zero, it is
+ * given the same sign as x. This is the same value
+ * that is returned by the remainder function.
+ * remquo also calculates the lower seven bits of the
+ * integral quotient x/y, and gives that value the same
+ * sign as x/y. It stores this signed value in the object
+ * pointed to by quo.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld remquo(float x, float y, int *quo);
+float2 __ovld remquo(float2 x, float2 y, int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, int16 *quo);
+#ifdef cl_khr_fp64
+double __ovld remquo(double x, double y, int *quo);
+double2 __ovld remquo(double2 x, double2 y, int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, int16 *quo);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld remquo(half x, half y, int *quo);
+half2 __ovld remquo(half2 x, half2 y, int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, int16 *quo);
+
+#endif //cl_khr_fp16
+#else
+float __ovld remquo(float x, float y, __global int *quo);
+float2 __ovld remquo(float2 x, float2 y, __global int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __global int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __global int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __global int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __global int16 *quo);
+float __ovld remquo(float x, float y, __local int *quo);
+float2 __ovld remquo(float2 x, float2 y, __local int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __local int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __local int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __local int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __local int16 *quo);
+float __ovld remquo(float x, float y, __private int *quo);
+float2 __ovld remquo(float2 x, float2 y, __private int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __private int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __private int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __private int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __private int16 *quo);
+#ifdef cl_khr_fp64
+double __ovld remquo(double x, double y, __global int *quo);
+double2 __ovld remquo(double2 x, double2 y, __global int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __global int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __global int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __global int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __global int16 *quo);
+double __ovld remquo(double x, double y, __local int *quo);
+double2 __ovld remquo(double2 x, double2 y, __local int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __local int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __local int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __local int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __local int16 *quo);
+double __ovld remquo(double x, double y, __private int *quo);
+double2 __ovld remquo(double2 x, double2 y, __private int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __private int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __private int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __private int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __private int16 *quo);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld remquo(half x, half y, __global int *quo);
+half2 __ovld remquo(half2 x, half2 y, __global int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __global int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __global int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __global int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __global int16 *quo);
+half __ovld remquo(half x, half y, __local int *quo);
+half2 __ovld remquo(half2 x, half2 y, __local int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __local int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __local int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __local int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __local int16 *quo);
+half __ovld remquo(half x, half y, __private int *quo);
+half2 __ovld remquo(half2 x, half2 y, __private int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __private int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __private int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __private int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __private int16 *quo);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+/**
+ * Round to integral value (using round to nearest
+ * even rounding mode) in floating-point format.
+ * Refer to section 7.1 for description of rounding
+ * modes.
+ */
+float __ovld __cnfn rint(float);
+float2 __ovld __cnfn rint(float2);
+float3 __ovld __cnfn rint(float3);
+float4 __ovld __cnfn rint(float4);
+float8 __ovld __cnfn rint(float8);
+float16 __ovld __cnfn rint(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rint(double);
+double2 __ovld __cnfn rint(double2);
+double3 __ovld __cnfn rint(double3);
+double4 __ovld __cnfn rint(double4);
+double8 __ovld __cnfn rint(double8);
+double16 __ovld __cnfn rint(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rint(half);
+half2 __ovld __cnfn rint(half2);
+half3 __ovld __cnfn rint(half3);
+half4 __ovld __cnfn rint(half4);
+half8 __ovld __cnfn rint(half8);
+half16 __ovld __cnfn rint(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power 1/y.
+ */
+float __ovld __cnfn rootn(float x, int y);
+float2 __ovld __cnfn rootn(float2 x, int2 y);
+float3 __ovld __cnfn rootn(float3 x, int3 y);
+float4 __ovld __cnfn rootn(float4 x, int4 y);
+float8 __ovld __cnfn rootn(float8 x, int8 y);
+float16 __ovld __cnfn rootn(float16 x, int16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rootn(double x, int y);
+double2 __ovld __cnfn rootn(double2 x, int2 y);
+double3 __ovld __cnfn rootn(double3 x, int3 y);
+double4 __ovld __cnfn rootn(double4 x, int4 y);
+double8 __ovld __cnfn rootn(double8 x, int8 y);
+double16 __ovld __cnfn rootn(double16 x, int16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rootn(half x, int y);
+half2 __ovld __cnfn rootn(half2 x, int2 y);
+half3 __ovld __cnfn rootn(half3 x, int3 y);
+half4 __ovld __cnfn rootn(half4 x, int4 y);
+half8 __ovld __cnfn rootn(half8 x, int8 y);
+half16 __ovld __cnfn rootn(half16 x, int16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Return the integral value nearest to x rounding
+ * halfway cases away from zero, regardless of the
+ * current rounding direction.
+ */
+float __ovld __cnfn round(float x);
+float2 __ovld __cnfn round(float2 x);
+float3 __ovld __cnfn round(float3 x);
+float4 __ovld __cnfn round(float4 x);
+float8 __ovld __cnfn round(float8 x);
+float16 __ovld __cnfn round(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn round(double x);
+double2 __ovld __cnfn round(double2 x);
+double3 __ovld __cnfn round(double3 x);
+double4 __ovld __cnfn round(double4 x);
+double8 __ovld __cnfn round(double8 x);
+double16 __ovld __cnfn round(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn round(half x);
+half2 __ovld __cnfn round(half2 x);
+half3 __ovld __cnfn round(half3 x);
+half4 __ovld __cnfn round(half4 x);
+half8 __ovld __cnfn round(half8 x);
+half16 __ovld __cnfn round(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute inverse square root.
+ */
+float __ovld __cnfn rsqrt(float);
+float2 __ovld __cnfn rsqrt(float2);
+float3 __ovld __cnfn rsqrt(float3);
+float4 __ovld __cnfn rsqrt(float4);
+float8 __ovld __cnfn rsqrt(float8);
+float16 __ovld __cnfn rsqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rsqrt(double);
+double2 __ovld __cnfn rsqrt(double2);
+double3 __ovld __cnfn rsqrt(double3);
+double4 __ovld __cnfn rsqrt(double4);
+double8 __ovld __cnfn rsqrt(double8);
+double16 __ovld __cnfn rsqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rsqrt(half);
+half2 __ovld __cnfn rsqrt(half2);
+half3 __ovld __cnfn rsqrt(half3);
+half4 __ovld __cnfn rsqrt(half4);
+half8 __ovld __cnfn rsqrt(half8);
+half16 __ovld __cnfn rsqrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sine.
+ */
+float __ovld __cnfn sin(float);
+float2 __ovld __cnfn sin(float2);
+float3 __ovld __cnfn sin(float3);
+float4 __ovld __cnfn sin(float4);
+float8 __ovld __cnfn sin(float8);
+float16 __ovld __cnfn sin(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sin(double);
+double2 __ovld __cnfn sin(double2);
+double3 __ovld __cnfn sin(double3);
+double4 __ovld __cnfn sin(double4);
+double8 __ovld __cnfn sin(double8);
+double16 __ovld __cnfn sin(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sin(half);
+half2 __ovld __cnfn sin(half2);
+half3 __ovld __cnfn sin(half3);
+half4 __ovld __cnfn sin(half4);
+half8 __ovld __cnfn sin(half8);
+half16 __ovld __cnfn sin(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sine and cosine of x. The computed sine
+ * is the return value and computed cosine is returned
+ * in cosval.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld sincos(float x, float *cosval);
+float2 __ovld sincos(float2 x, float2 *cosval);
+float3 __ovld sincos(float3 x, float3 *cosval);
+float4 __ovld sincos(float4 x, float4 *cosval);
+float8 __ovld sincos(float8 x, float8 *cosval);
+float16 __ovld sincos(float16 x, float16 *cosval);
+#ifdef cl_khr_fp64
+double __ovld sincos(double x, double *cosval);
+double2 __ovld sincos(double2 x, double2 *cosval);
+double3 __ovld sincos(double3 x, double3 *cosval);
+double4 __ovld sincos(double4 x, double4 *cosval);
+double8 __ovld sincos(double8 x, double8 *cosval);
+double16 __ovld sincos(double16 x, double16 *cosval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld sincos(half x, half *cosval);
+half2 __ovld sincos(half2 x, half2 *cosval);
+half3 __ovld sincos(half3 x, half3 *cosval);
+half4 __ovld sincos(half4 x, half4 *cosval);
+half8 __ovld sincos(half8 x, half8 *cosval);
+half16 __ovld sincos(half16 x, half16 *cosval);
+#endif //cl_khr_fp16
+#else
+float __ovld sincos(float x, __global float *cosval);
+float2 __ovld sincos(float2 x, __global float2 *cosval);
+float3 __ovld sincos(float3 x, __global float3 *cosval);
+float4 __ovld sincos(float4 x, __global float4 *cosval);
+float8 __ovld sincos(float8 x, __global float8 *cosval);
+float16 __ovld sincos(float16 x, __global float16 *cosval);
+float __ovld sincos(float x, __local float *cosval);
+float2 __ovld sincos(float2 x, __local float2 *cosval);
+float3 __ovld sincos(float3 x, __local float3 *cosval);
+float4 __ovld sincos(float4 x, __local float4 *cosval);
+float8 __ovld sincos(float8 x, __local float8 *cosval);
+float16 __ovld sincos(float16 x, __local float16 *cosval);
+float __ovld sincos(float x, __private float *cosval);
+float2 __ovld sincos(float2 x, __private float2 *cosval);
+float3 __ovld sincos(float3 x, __private float3 *cosval);
+float4 __ovld sincos(float4 x, __private float4 *cosval);
+float8 __ovld sincos(float8 x, __private float8 *cosval);
+float16 __ovld sincos(float16 x, __private float16 *cosval);
+#ifdef cl_khr_fp64
+double __ovld sincos(double x, __global double *cosval);
+double2 __ovld sincos(double2 x, __global double2 *cosval);
+double3 __ovld sincos(double3 x, __global double3 *cosval);
+double4 __ovld sincos(double4 x, __global double4 *cosval);
+double8 __ovld sincos(double8 x, __global double8 *cosval);
+double16 __ovld sincos(double16 x, __global double16 *cosval);
+double __ovld sincos(double x, __local double *cosval);
+double2 __ovld sincos(double2 x, __local double2 *cosval);
+double3 __ovld sincos(double3 x, __local double3 *cosval);
+double4 __ovld sincos(double4 x, __local double4 *cosval);
+double8 __ovld sincos(double8 x, __local double8 *cosval);
+double16 __ovld sincos(double16 x, __local double16 *cosval);
+double __ovld sincos(double x, __private double *cosval);
+double2 __ovld sincos(double2 x, __private double2 *cosval);
+double3 __ovld sincos(double3 x, __private double3 *cosval);
+double4 __ovld sincos(double4 x, __private double4 *cosval);
+double8 __ovld sincos(double8 x, __private double8 *cosval);
+double16 __ovld sincos(double16 x, __private double16 *cosval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld sincos(half x, __global half *cosval);
+half2 __ovld sincos(half2 x, __global half2 *cosval);
+half3 __ovld sincos(half3 x, __global half3 *cosval);
+half4 __ovld sincos(half4 x, __global half4 *cosval);
+half8 __ovld sincos(half8 x, __global half8 *cosval);
+half16 __ovld sincos(half16 x, __global half16 *cosval);
+half __ovld sincos(half x, __local half *cosval);
+half2 __ovld sincos(half2 x, __local half2 *cosval);
+half3 __ovld sincos(half3 x, __local half3 *cosval);
+half4 __ovld sincos(half4 x, __local half4 *cosval);
+half8 __ovld sincos(half8 x, __local half8 *cosval);
+half16 __ovld sincos(half16 x, __local half16 *cosval);
+half __ovld sincos(half x, __private half *cosval);
+half2 __ovld sincos(half2 x, __private half2 *cosval);
+half3 __ovld sincos(half3 x, __private half3 *cosval);
+half4 __ovld sincos(half4 x, __private half4 *cosval);
+half8 __ovld sincos(half8 x, __private half8 *cosval);
+half16 __ovld sincos(half16 x, __private half16 *cosval);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute hyperbolic sine.
+ */
+float __ovld __cnfn sinh(float);
+float2 __ovld __cnfn sinh(float2);
+float3 __ovld __cnfn sinh(float3);
+float4 __ovld __cnfn sinh(float4);
+float8 __ovld __cnfn sinh(float8);
+float16 __ovld __cnfn sinh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sinh(double);
+double2 __ovld __cnfn sinh(double2);
+double3 __ovld __cnfn sinh(double3);
+double4 __ovld __cnfn sinh(double4);
+double8 __ovld __cnfn sinh(double8);
+double16 __ovld __cnfn sinh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sinh(half);
+half2 __ovld __cnfn sinh(half2);
+half3 __ovld __cnfn sinh(half3);
+half4 __ovld __cnfn sinh(half4);
+half8 __ovld __cnfn sinh(half8);
+half16 __ovld __cnfn sinh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sin (PI * x).
+ */
+float __ovld __cnfn sinpi(float x);
+float2 __ovld __cnfn sinpi(float2 x);
+float3 __ovld __cnfn sinpi(float3 x);
+float4 __ovld __cnfn sinpi(float4 x);
+float8 __ovld __cnfn sinpi(float8 x);
+float16 __ovld __cnfn sinpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sinpi(double x);
+double2 __ovld __cnfn sinpi(double2 x);
+double3 __ovld __cnfn sinpi(double3 x);
+double4 __ovld __cnfn sinpi(double4 x);
+double8 __ovld __cnfn sinpi(double8 x);
+double16 __ovld __cnfn sinpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sinpi(half x);
+half2 __ovld __cnfn sinpi(half2 x);
+half3 __ovld __cnfn sinpi(half3 x);
+half4 __ovld __cnfn sinpi(half4 x);
+half8 __ovld __cnfn sinpi(half8 x);
+half16 __ovld __cnfn sinpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn sqrt(float);
+float2 __ovld __cnfn sqrt(float2);
+float3 __ovld __cnfn sqrt(float3);
+float4 __ovld __cnfn sqrt(float4);
+float8 __ovld __cnfn sqrt(float8);
+float16 __ovld __cnfn sqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sqrt(double);
+double2 __ovld __cnfn sqrt(double2);
+double3 __ovld __cnfn sqrt(double3);
+double4 __ovld __cnfn sqrt(double4);
+double8 __ovld __cnfn sqrt(double8);
+double16 __ovld __cnfn sqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sqrt(half);
+half2 __ovld __cnfn sqrt(half2);
+half3 __ovld __cnfn sqrt(half3);
+half4 __ovld __cnfn sqrt(half4);
+half8 __ovld __cnfn sqrt(half8);
+half16 __ovld __cnfn sqrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute tangent.
+ */
+float __ovld __cnfn tan(float);
+float2 __ovld __cnfn tan(float2);
+float3 __ovld __cnfn tan(float3);
+float4 __ovld __cnfn tan(float4);
+float8 __ovld __cnfn tan(float8);
+float16 __ovld __cnfn tan(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tan(double);
+double2 __ovld __cnfn tan(double2);
+double3 __ovld __cnfn tan(double3);
+double4 __ovld __cnfn tan(double4);
+double8 __ovld __cnfn tan(double8);
+double16 __ovld __cnfn tan(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tan(half);
+half2 __ovld __cnfn tan(half2);
+half3 __ovld __cnfn tan(half3);
+half4 __ovld __cnfn tan(half4);
+half8 __ovld __cnfn tan(half8);
+half16 __ovld __cnfn tan(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute hyperbolic tangent.
+ */
+float __ovld __cnfn tanh(float);
+float2 __ovld __cnfn tanh(float2);
+float3 __ovld __cnfn tanh(float3);
+float4 __ovld __cnfn tanh(float4);
+float8 __ovld __cnfn tanh(float8);
+float16 __ovld __cnfn tanh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tanh(double);
+double2 __ovld __cnfn tanh(double2);
+double3 __ovld __cnfn tanh(double3);
+double4 __ovld __cnfn tanh(double4);
+double8 __ovld __cnfn tanh(double8);
+double16 __ovld __cnfn tanh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tanh(half);
+half2 __ovld __cnfn tanh(half2);
+half3 __ovld __cnfn tanh(half3);
+half4 __ovld __cnfn tanh(half4);
+half8 __ovld __cnfn tanh(half8);
+half16 __ovld __cnfn tanh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute tan (PI * x).
+ */
+float __ovld __cnfn tanpi(float x);
+float2 __ovld __cnfn tanpi(float2 x);
+float3 __ovld __cnfn tanpi(float3 x);
+float4 __ovld __cnfn tanpi(float4 x);
+float8 __ovld __cnfn tanpi(float8 x);
+float16 __ovld __cnfn tanpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tanpi(double x);
+double2 __ovld __cnfn tanpi(double2 x);
+double3 __ovld __cnfn tanpi(double3 x);
+double4 __ovld __cnfn tanpi(double4 x);
+double8 __ovld __cnfn tanpi(double8 x);
+double16 __ovld __cnfn tanpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tanpi(half x);
+half2 __ovld __cnfn tanpi(half2 x);
+half3 __ovld __cnfn tanpi(half3 x);
+half4 __ovld __cnfn tanpi(half4 x);
+half8 __ovld __cnfn tanpi(half8 x);
+half16 __ovld __cnfn tanpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the gamma function.
+ */
+float __ovld __cnfn tgamma(float);
+float2 __ovld __cnfn tgamma(float2);
+float3 __ovld __cnfn tgamma(float3);
+float4 __ovld __cnfn tgamma(float4);
+float8 __ovld __cnfn tgamma(float8);
+float16 __ovld __cnfn tgamma(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tgamma(double);
+double2 __ovld __cnfn tgamma(double2);
+double3 __ovld __cnfn tgamma(double3);
+double4 __ovld __cnfn tgamma(double4);
+double8 __ovld __cnfn tgamma(double8);
+double16 __ovld __cnfn tgamma(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tgamma(half);
+half2 __ovld __cnfn tgamma(half2);
+half3 __ovld __cnfn tgamma(half3);
+half4 __ovld __cnfn tgamma(half4);
+half8 __ovld __cnfn tgamma(half8);
+half16 __ovld __cnfn tgamma(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to zero
+ * rounding mode.
+ */
+float __ovld __cnfn trunc(float);
+float2 __ovld __cnfn trunc(float2);
+float3 __ovld __cnfn trunc(float3);
+float4 __ovld __cnfn trunc(float4);
+float8 __ovld __cnfn trunc(float8);
+float16 __ovld __cnfn trunc(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn trunc(double);
+double2 __ovld __cnfn trunc(double2);
+double3 __ovld __cnfn trunc(double3);
+double4 __ovld __cnfn trunc(double4);
+double8 __ovld __cnfn trunc(double8);
+double16 __ovld __cnfn trunc(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn trunc(half);
+half2 __ovld __cnfn trunc(half2);
+half3 __ovld __cnfn trunc(half3);
+half4 __ovld __cnfn trunc(half4);
+half8 __ovld __cnfn trunc(half8);
+half16 __ovld __cnfn trunc(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cosine. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_cos(float x);
+float2 __ovld __cnfn half_cos(float2 x);
+float3 __ovld __cnfn half_cos(float3 x);
+float4 __ovld __cnfn half_cos(float4 x);
+float8 __ovld __cnfn half_cos(float8 x);
+float16 __ovld __cnfn half_cos(float16 x);
+
+/**
+ * Compute x / y.
+ */
+float __ovld __cnfn half_divide(float x, float y);
+float2 __ovld __cnfn half_divide(float2 x, float2 y);
+float3 __ovld __cnfn half_divide(float3 x, float3 y);
+float4 __ovld __cnfn half_divide(float4 x, float4 y);
+float8 __ovld __cnfn half_divide(float8 x, float8 y);
+float16 __ovld __cnfn half_divide(float16 x, float16 y);
+
+/**
+ * Compute the base-e exponential of x.
+ */
+float __ovld __cnfn half_exp(float x);
+float2 __ovld __cnfn half_exp(float2 x);
+float3 __ovld __cnfn half_exp(float3 x);
+float4 __ovld __cnfn half_exp(float4 x);
+float8 __ovld __cnfn half_exp(float8 x);
+float16 __ovld __cnfn half_exp(float16 x);
+
+/**
+ * Compute the base-2 exponential of x.
+ */
+float __ovld __cnfn half_exp2(float x);
+float2 __ovld __cnfn half_exp2(float2 x);
+float3 __ovld __cnfn half_exp2(float3 x);
+float4 __ovld __cnfn half_exp2(float4 x);
+float8 __ovld __cnfn half_exp2(float8 x);
+float16 __ovld __cnfn half_exp2(float16 x);
+
+/**
+ * Compute the base-10 exponential of x.
+ */
+float __ovld __cnfn half_exp10(float x);
+float2 __ovld __cnfn half_exp10(float2 x);
+float3 __ovld __cnfn half_exp10(float3 x);
+float4 __ovld __cnfn half_exp10(float4 x);
+float8 __ovld __cnfn half_exp10(float8 x);
+float16 __ovld __cnfn half_exp10(float16 x);
+
+/**
+ * Compute natural logarithm.
+ */
+float __ovld __cnfn half_log(float x);
+float2 __ovld __cnfn half_log(float2 x);
+float3 __ovld __cnfn half_log(float3 x);
+float4 __ovld __cnfn half_log(float4 x);
+float8 __ovld __cnfn half_log(float8 x);
+float16 __ovld __cnfn half_log(float16 x);
+
+/**
+ * Compute a base 2 logarithm.
+ */
+float __ovld __cnfn half_log2(float x);
+float2 __ovld __cnfn half_log2(float2 x);
+float3 __ovld __cnfn half_log2(float3 x);
+float4 __ovld __cnfn half_log2(float4 x);
+float8 __ovld __cnfn half_log2(float8 x);
+float16 __ovld __cnfn half_log2(float16 x);
+
+/**
+ * Compute a base 10 logarithm.
+ */
+float __ovld __cnfn half_log10(float x);
+float2 __ovld __cnfn half_log10(float2 x);
+float3 __ovld __cnfn half_log10(float3 x);
+float4 __ovld __cnfn half_log10(float4 x);
+float8 __ovld __cnfn half_log10(float8 x);
+float16 __ovld __cnfn half_log10(float16 x);
+
+/**
+ * Compute x to the power y, where x is >= 0.
+ */
+float __ovld __cnfn half_powr(float x, float y);
+float2 __ovld __cnfn half_powr(float2 x, float2 y);
+float3 __ovld __cnfn half_powr(float3 x, float3 y);
+float4 __ovld __cnfn half_powr(float4 x, float4 y);
+float8 __ovld __cnfn half_powr(float8 x, float8 y);
+float16 __ovld __cnfn half_powr(float16 x, float16 y);
+
+/**
+ * Compute reciprocal.
+ */
+float __ovld __cnfn half_recip(float x);
+float2 __ovld __cnfn half_recip(float2 x);
+float3 __ovld __cnfn half_recip(float3 x);
+float4 __ovld __cnfn half_recip(float4 x);
+float8 __ovld __cnfn half_recip(float8 x);
+float16 __ovld __cnfn half_recip(float16 x);
+
+/**
+ * Compute inverse square root.
+ */
+float __ovld __cnfn half_rsqrt(float x);
+float2 __ovld __cnfn half_rsqrt(float2 x);
+float3 __ovld __cnfn half_rsqrt(float3 x);
+float4 __ovld __cnfn half_rsqrt(float4 x);
+float8 __ovld __cnfn half_rsqrt(float8 x);
+float16 __ovld __cnfn half_rsqrt(float16 x);
+
+/**
+ * Compute sine. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_sin(float x);
+float2 __ovld __cnfn half_sin(float2 x);
+float3 __ovld __cnfn half_sin(float3 x);
+float4 __ovld __cnfn half_sin(float4 x);
+float8 __ovld __cnfn half_sin(float8 x);
+float16 __ovld __cnfn half_sin(float16 x);
+
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn half_sqrt(float x);
+float2 __ovld __cnfn half_sqrt(float2 x);
+float3 __ovld __cnfn half_sqrt(float3 x);
+float4 __ovld __cnfn half_sqrt(float4 x);
+float8 __ovld __cnfn half_sqrt(float8 x);
+float16 __ovld __cnfn half_sqrt(float16 x);
+
+/**
+ * Compute tangent. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_tan(float x);
+float2 __ovld __cnfn half_tan(float2 x);
+float3 __ovld __cnfn half_tan(float3 x);
+float4 __ovld __cnfn half_tan(float4 x);
+float8 __ovld __cnfn half_tan(float8 x);
+float16 __ovld __cnfn half_tan(float16 x);
+
+/**
+ * Compute cosine over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_cos(float x);
+float2 __ovld __cnfn native_cos(float2 x);
+float3 __ovld __cnfn native_cos(float3 x);
+float4 __ovld __cnfn native_cos(float4 x);
+float8 __ovld __cnfn native_cos(float8 x);
+float16 __ovld __cnfn native_cos(float16 x);
+
+/**
+ * Compute x / y over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_divide(float x, float y);
+float2 __ovld __cnfn native_divide(float2 x, float2 y);
+float3 __ovld __cnfn native_divide(float3 x, float3 y);
+float4 __ovld __cnfn native_divide(float4 x, float4 y);
+float8 __ovld __cnfn native_divide(float8 x, float8 y);
+float16 __ovld __cnfn native_divide(float16 x, float16 y);
+
+/**
+ * Compute the base-e exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp(float x);
+float2 __ovld __cnfn native_exp(float2 x);
+float3 __ovld __cnfn native_exp(float3 x);
+float4 __ovld __cnfn native_exp(float4 x);
+float8 __ovld __cnfn native_exp(float8 x);
+float16 __ovld __cnfn native_exp(float16 x);
+
+/**
+ * Compute the base-2 exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp2(float x);
+float2 __ovld __cnfn native_exp2(float2 x);
+float3 __ovld __cnfn native_exp2(float3 x);
+float4 __ovld __cnfn native_exp2(float4 x);
+float8 __ovld __cnfn native_exp2(float8 x);
+float16 __ovld __cnfn native_exp2(float16 x);
+
+/**
+ * Compute the base-10 exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp10(float x);
+float2 __ovld __cnfn native_exp10(float2 x);
+float3 __ovld __cnfn native_exp10(float3 x);
+float4 __ovld __cnfn native_exp10(float4 x);
+float8 __ovld __cnfn native_exp10(float8 x);
+float16 __ovld __cnfn native_exp10(float16 x);
+
+/**
+ * Compute natural logarithm over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_log(float x);
+float2 __ovld __cnfn native_log(float2 x);
+float3 __ovld __cnfn native_log(float3 x);
+float4 __ovld __cnfn native_log(float4 x);
+float8 __ovld __cnfn native_log(float8 x);
+float16 __ovld __cnfn native_log(float16 x);
+
+/**
+ * Compute a base 2 logarithm over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_log2(float x);
+float2 __ovld __cnfn native_log2(float2 x);
+float3 __ovld __cnfn native_log2(float3 x);
+float4 __ovld __cnfn native_log2(float4 x);
+float8 __ovld __cnfn native_log2(float8 x);
+float16 __ovld __cnfn native_log2(float16 x);
+
+/**
+ * Compute a base 10 logarithm over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_log10(float x);
+float2 __ovld __cnfn native_log10(float2 x);
+float3 __ovld __cnfn native_log10(float3 x);
+float4 __ovld __cnfn native_log10(float4 x);
+float8 __ovld __cnfn native_log10(float8 x);
+float16 __ovld __cnfn native_log10(float16 x);
+
+/**
+ * Compute x to the power y, where x is >= 0. The ranges of
+ * x and y are implementation-defined. The maximum error
+ * is implementation-defined.
+ */
+float __ovld __cnfn native_powr(float x, float y);
+float2 __ovld __cnfn native_powr(float2 x, float2 y);
+float3 __ovld __cnfn native_powr(float3 x, float3 y);
+float4 __ovld __cnfn native_powr(float4 x, float4 y);
+float8 __ovld __cnfn native_powr(float8 x, float8 y);
+float16 __ovld __cnfn native_powr(float16 x, float16 y);
+
+/**
+ * Compute reciprocal over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_recip(float x);
+float2 __ovld __cnfn native_recip(float2 x);
+float3 __ovld __cnfn native_recip(float3 x);
+float4 __ovld __cnfn native_recip(float4 x);
+float8 __ovld __cnfn native_recip(float8 x);
+float16 __ovld __cnfn native_recip(float16 x);
+
+/**
+ * Compute inverse square root over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_rsqrt(float x);
+float2 __ovld __cnfn native_rsqrt(float2 x);
+float3 __ovld __cnfn native_rsqrt(float3 x);
+float4 __ovld __cnfn native_rsqrt(float4 x);
+float8 __ovld __cnfn native_rsqrt(float8 x);
+float16 __ovld __cnfn native_rsqrt(float16 x);
+
+/**
+ * Compute sine over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_sin(float x);
+float2 __ovld __cnfn native_sin(float2 x);
+float3 __ovld __cnfn native_sin(float3 x);
+float4 __ovld __cnfn native_sin(float4 x);
+float8 __ovld __cnfn native_sin(float8 x);
+float16 __ovld __cnfn native_sin(float16 x);
+
+/**
+ * Compute square root over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_sqrt(float x);
+float2 __ovld __cnfn native_sqrt(float2 x);
+float3 __ovld __cnfn native_sqrt(float3 x);
+float4 __ovld __cnfn native_sqrt(float4 x);
+float8 __ovld __cnfn native_sqrt(float8 x);
+float16 __ovld __cnfn native_sqrt(float16 x);
+
+/**
+ * Compute tangent over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_tan(float x);
+float2 __ovld __cnfn native_tan(float2 x);
+float3 __ovld __cnfn native_tan(float3 x);
+float4 __ovld __cnfn native_tan(float4 x);
+float8 __ovld __cnfn native_tan(float8 x);
+float16 __ovld __cnfn native_tan(float16 x);
+
+// OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
+
+/**
+ * Returns |x|.
+ */
+uchar __ovld __cnfn abs(char x);
+uchar __ovld __cnfn abs(uchar x);
+uchar2 __ovld __cnfn abs(char2 x);
+uchar2 __ovld __cnfn abs(uchar2 x);
+uchar3 __ovld __cnfn abs(char3 x);
+uchar3 __ovld __cnfn abs(uchar3 x);
+uchar4 __ovld __cnfn abs(char4 x);
+uchar4 __ovld __cnfn abs(uchar4 x);
+uchar8 __ovld __cnfn abs(char8 x);
+uchar8 __ovld __cnfn abs(uchar8 x);
+uchar16 __ovld __cnfn abs(char16 x);
+uchar16 __ovld __cnfn abs(uchar16 x);
+ushort __ovld __cnfn abs(short x);
+ushort __ovld __cnfn abs(ushort x);
+ushort2 __ovld __cnfn abs(short2 x);
+ushort2 __ovld __cnfn abs(ushort2 x);
+ushort3 __ovld __cnfn abs(short3 x);
+ushort3 __ovld __cnfn abs(ushort3 x);
+ushort4 __ovld __cnfn abs(short4 x);
+ushort4 __ovld __cnfn abs(ushort4 x);
+ushort8 __ovld __cnfn abs(short8 x);
+ushort8 __ovld __cnfn abs(ushort8 x);
+ushort16 __ovld __cnfn abs(short16 x);
+ushort16 __ovld __cnfn abs(ushort16 x);
+uint __ovld __cnfn abs(int x);
+uint __ovld __cnfn abs(uint x);
+uint2 __ovld __cnfn abs(int2 x);
+uint2 __ovld __cnfn abs(uint2 x);
+uint3 __ovld __cnfn abs(int3 x);
+uint3 __ovld __cnfn abs(uint3 x);
+uint4 __ovld __cnfn abs(int4 x);
+uint4 __ovld __cnfn abs(uint4 x);
+uint8 __ovld __cnfn abs(int8 x);
+uint8 __ovld __cnfn abs(uint8 x);
+uint16 __ovld __cnfn abs(int16 x);
+uint16 __ovld __cnfn abs(uint16 x);
+ulong __ovld __cnfn abs(long x);
+ulong __ovld __cnfn abs(ulong x);
+ulong2 __ovld __cnfn abs(long2 x);
+ulong2 __ovld __cnfn abs(ulong2 x);
+ulong3 __ovld __cnfn abs(long3 x);
+ulong3 __ovld __cnfn abs(ulong3 x);
+ulong4 __ovld __cnfn abs(long4 x);
+ulong4 __ovld __cnfn abs(ulong4 x);
+ulong8 __ovld __cnfn abs(long8 x);
+ulong8 __ovld __cnfn abs(ulong8 x);
+ulong16 __ovld __cnfn abs(long16 x);
+ulong16 __ovld __cnfn abs(ulong16 x);
+
+/**
+ * Returns |x - y| without modulo overflow.
+ */
+uchar __ovld __cnfn abs_diff(char x, char y);
+uchar __ovld __cnfn abs_diff(uchar x, uchar y);
+uchar2 __ovld __cnfn abs_diff(char2 x, char2 y);
+uchar2 __ovld __cnfn abs_diff(uchar2 x, uchar2 y);
+uchar3 __ovld __cnfn abs_diff(char3 x, char3 y);
+uchar3 __ovld __cnfn abs_diff(uchar3 x, uchar3 y);
+uchar4 __ovld __cnfn abs_diff(char4 x, char4 y);
+uchar4 __ovld __cnfn abs_diff(uchar4 x, uchar4 y);
+uchar8 __ovld __cnfn abs_diff(char8 x, char8 y);
+uchar8 __ovld __cnfn abs_diff(uchar8 x, uchar8 y);
+uchar16 __ovld __cnfn abs_diff(char16 x, char16 y);
+uchar16 __ovld __cnfn abs_diff(uchar16 x, uchar16 y);
+ushort __ovld __cnfn abs_diff(short x, short y);
+ushort __ovld __cnfn abs_diff(ushort x, ushort y);
+ushort2 __ovld __cnfn abs_diff(short2 x, short2 y);
+ushort2 __ovld __cnfn abs_diff(ushort2 x, ushort2 y);
+ushort3 __ovld __cnfn abs_diff(short3 x, short3 y);
+ushort3 __ovld __cnfn abs_diff(ushort3 x, ushort3 y);
+ushort4 __ovld __cnfn abs_diff(short4 x, short4 y);
+ushort4 __ovld __cnfn abs_diff(ushort4 x, ushort4 y);
+ushort8 __ovld __cnfn abs_diff(short8 x, short8 y);
+ushort8 __ovld __cnfn abs_diff(ushort8 x, ushort8 y);
+ushort16 __ovld __cnfn abs_diff(short16 x, short16 y);
+ushort16 __ovld __cnfn abs_diff(ushort16 x, ushort16 y);
+uint __ovld __cnfn abs_diff(int x, int y);
+uint __ovld __cnfn abs_diff(uint x, uint y);
+uint2 __ovld __cnfn abs_diff(int2 x, int2 y);
+uint2 __ovld __cnfn abs_diff(uint2 x, uint2 y);
+uint3 __ovld __cnfn abs_diff(int3 x, int3 y);
+uint3 __ovld __cnfn abs_diff(uint3 x, uint3 y);
+uint4 __ovld __cnfn abs_diff(int4 x, int4 y);
+uint4 __ovld __cnfn abs_diff(uint4 x, uint4 y);
+uint8 __ovld __cnfn abs_diff(int8 x, int8 y);
+uint8 __ovld __cnfn abs_diff(uint8 x, uint8 y);
+uint16 __ovld __cnfn abs_diff(int16 x, int16 y);
+uint16 __ovld __cnfn abs_diff(uint16 x, uint16 y);
+ulong __ovld __cnfn abs_diff(long x, long y);
+ulong __ovld __cnfn abs_diff(ulong x, ulong y);
+ulong2 __ovld __cnfn abs_diff(long2 x, long2 y);
+ulong2 __ovld __cnfn abs_diff(ulong2 x, ulong2 y);
+ulong3 __ovld __cnfn abs_diff(long3 x, long3 y);
+ulong3 __ovld __cnfn abs_diff(ulong3 x, ulong3 y);
+ulong4 __ovld __cnfn abs_diff(long4 x, long4 y);
+ulong4 __ovld __cnfn abs_diff(ulong4 x, ulong4 y);
+ulong8 __ovld __cnfn abs_diff(long8 x, long8 y);
+ulong8 __ovld __cnfn abs_diff(ulong8 x, ulong8 y);
+ulong16 __ovld __cnfn abs_diff(long16 x, long16 y);
+ulong16 __ovld __cnfn abs_diff(ulong16 x, ulong16 y);
+
+/**
+ * Returns x + y and saturates the result.
+ */
+char __ovld __cnfn add_sat(char x, char y);
+uchar __ovld __cnfn add_sat(uchar x, uchar y);
+char2 __ovld __cnfn add_sat(char2 x, char2 y);
+uchar2 __ovld __cnfn add_sat(uchar2 x, uchar2 y);
+char3 __ovld __cnfn add_sat(char3 x, char3 y);
+uchar3 __ovld __cnfn add_sat(uchar3 x, uchar3 y);
+char4 __ovld __cnfn add_sat(char4 x, char4 y);
+uchar4 __ovld __cnfn add_sat(uchar4 x, uchar4 y);
+char8 __ovld __cnfn add_sat(char8 x, char8 y);
+uchar8 __ovld __cnfn add_sat(uchar8 x, uchar8 y);
+char16 __ovld __cnfn add_sat(char16 x, char16 y);
+uchar16 __ovld __cnfn add_sat(uchar16 x, uchar16 y);
+short __ovld __cnfn add_sat(short x, short y);
+ushort __ovld __cnfn add_sat(ushort x, ushort y);
+short2 __ovld __cnfn add_sat(short2 x, short2 y);
+ushort2 __ovld __cnfn add_sat(ushort2 x, ushort2 y);
+short3 __ovld __cnfn add_sat(short3 x, short3 y);
+ushort3 __ovld __cnfn add_sat(ushort3 x, ushort3 y);
+short4 __ovld __cnfn add_sat(short4 x, short4 y);
+ushort4 __ovld __cnfn add_sat(ushort4 x, ushort4 y);
+short8 __ovld __cnfn add_sat(short8 x, short8 y);
+ushort8 __ovld __cnfn add_sat(ushort8 x, ushort8 y);
+short16 __ovld __cnfn add_sat(short16 x, short16 y);
+ushort16 __ovld __cnfn add_sat(ushort16 x, ushort16 y);
+int __ovld __cnfn add_sat(int x, int y);
+uint __ovld __cnfn add_sat(uint x, uint y);
+int2 __ovld __cnfn add_sat(int2 x, int2 y);
+uint2 __ovld __cnfn add_sat(uint2 x, uint2 y);
+int3 __ovld __cnfn add_sat(int3 x, int3 y);
+uint3 __ovld __cnfn add_sat(uint3 x, uint3 y);
+int4 __ovld __cnfn add_sat(int4 x, int4 y);
+uint4 __ovld __cnfn add_sat(uint4 x, uint4 y);
+int8 __ovld __cnfn add_sat(int8 x, int8 y);
+uint8 __ovld __cnfn add_sat(uint8 x, uint8 y);
+int16 __ovld __cnfn add_sat(int16 x, int16 y);
+uint16 __ovld __cnfn add_sat(uint16 x, uint16 y);
+long __ovld __cnfn add_sat(long x, long y);
+ulong __ovld __cnfn add_sat(ulong x, ulong y);
+long2 __ovld __cnfn add_sat(long2 x, long2 y);
+ulong2 __ovld __cnfn add_sat(ulong2 x, ulong2 y);
+long3 __ovld __cnfn add_sat(long3 x, long3 y);
+ulong3 __ovld __cnfn add_sat(ulong3 x, ulong3 y);
+long4 __ovld __cnfn add_sat(long4 x, long4 y);
+ulong4 __ovld __cnfn add_sat(ulong4 x, ulong4 y);
+long8 __ovld __cnfn add_sat(long8 x, long8 y);
+ulong8 __ovld __cnfn add_sat(ulong8 x, ulong8 y);
+long16 __ovld __cnfn add_sat(long16 x, long16 y);
+ulong16 __ovld __cnfn add_sat(ulong16 x, ulong16 y);
+
+/**
+ * Returns (x + y) >> 1. The intermediate sum does
+ * not modulo overflow.
+ */
+char __ovld __cnfn hadd(char x, char y);
+uchar __ovld __cnfn hadd(uchar x, uchar y);
+char2 __ovld __cnfn hadd(char2 x, char2 y);
+uchar2 __ovld __cnfn hadd(uchar2 x, uchar2 y);
+char3 __ovld __cnfn hadd(char3 x, char3 y);
+uchar3 __ovld __cnfn hadd(uchar3 x, uchar3 y);
+char4 __ovld __cnfn hadd(char4 x, char4 y);
+uchar4 __ovld __cnfn hadd(uchar4 x, uchar4 y);
+char8 __ovld __cnfn hadd(char8 x, char8 y);
+uchar8 __ovld __cnfn hadd(uchar8 x, uchar8 y);
+char16 __ovld __cnfn hadd(char16 x, char16 y);
+uchar16 __ovld __cnfn hadd(uchar16 x, uchar16 y);
+short __ovld __cnfn hadd(short x, short y);
+ushort __ovld __cnfn hadd(ushort x, ushort y);
+short2 __ovld __cnfn hadd(short2 x, short2 y);
+ushort2 __ovld __cnfn hadd(ushort2 x, ushort2 y);
+short3 __ovld __cnfn hadd(short3 x, short3 y);
+ushort3 __ovld __cnfn hadd(ushort3 x, ushort3 y);
+short4 __ovld __cnfn hadd(short4 x, short4 y);
+ushort4 __ovld __cnfn hadd(ushort4 x, ushort4 y);
+short8 __ovld __cnfn hadd(short8 x, short8 y);
+ushort8 __ovld __cnfn hadd(ushort8 x, ushort8 y);
+short16 __ovld __cnfn hadd(short16 x, short16 y);
+ushort16 __ovld __cnfn hadd(ushort16 x, ushort16 y);
+int __ovld __cnfn hadd(int x, int y);
+uint __ovld __cnfn hadd(uint x, uint y);
+int2 __ovld __cnfn hadd(int2 x, int2 y);
+uint2 __ovld __cnfn hadd(uint2 x, uint2 y);
+int3 __ovld __cnfn hadd(int3 x, int3 y);
+uint3 __ovld __cnfn hadd(uint3 x, uint3 y);
+int4 __ovld __cnfn hadd(int4 x, int4 y);
+uint4 __ovld __cnfn hadd(uint4 x, uint4 y);
+int8 __ovld __cnfn hadd(int8 x, int8 y);
+uint8 __ovld __cnfn hadd(uint8 x, uint8 y);
+int16 __ovld __cnfn hadd(int16 x, int16 y);
+uint16 __ovld __cnfn hadd(uint16 x, uint16 y);
+long __ovld __cnfn hadd(long x, long y);
+ulong __ovld __cnfn hadd(ulong x, ulong y);
+long2 __ovld __cnfn hadd(long2 x, long2 y);
+ulong2 __ovld __cnfn hadd(ulong2 x, ulong2 y);
+long3 __ovld __cnfn hadd(long3 x, long3 y);
+ulong3 __ovld __cnfn hadd(ulong3 x, ulong3 y);
+long4 __ovld __cnfn hadd(long4 x, long4 y);
+ulong4 __ovld __cnfn hadd(ulong4 x, ulong4 y);
+long8 __ovld __cnfn hadd(long8 x, long8 y);
+ulong8 __ovld __cnfn hadd(ulong8 x, ulong8 y);
+long16 __ovld __cnfn hadd(long16 x, long16 y);
+ulong16 __ovld __cnfn hadd(ulong16 x, ulong16 y);
+
+/**
+ * Returns (x + y + 1) >> 1. The intermediate sum
+ * does not modulo overflow.
+ */
+char __ovld __cnfn rhadd(char x, char y);
+uchar __ovld __cnfn rhadd(uchar x, uchar y);
+char2 __ovld __cnfn rhadd(char2 x, char2 y);
+uchar2 __ovld __cnfn rhadd(uchar2 x, uchar2 y);
+char3 __ovld __cnfn rhadd(char3 x, char3 y);
+uchar3 __ovld __cnfn rhadd(uchar3 x, uchar3 y);
+char4 __ovld __cnfn rhadd(char4 x, char4 y);
+uchar4 __ovld __cnfn rhadd(uchar4 x, uchar4 y);
+char8 __ovld __cnfn rhadd(char8 x, char8 y);
+uchar8 __ovld __cnfn rhadd(uchar8 x, uchar8 y);
+char16 __ovld __cnfn rhadd(char16 x, char16 y);
+uchar16 __ovld __cnfn rhadd(uchar16 x, uchar16 y);
+short __ovld __cnfn rhadd(short x, short y);
+ushort __ovld __cnfn rhadd(ushort x, ushort y);
+short2 __ovld __cnfn rhadd(short2 x, short2 y);
+ushort2 __ovld __cnfn rhadd(ushort2 x, ushort2 y);
+short3 __ovld __cnfn rhadd(short3 x, short3 y);
+ushort3 __ovld __cnfn rhadd(ushort3 x, ushort3 y);
+short4 __ovld __cnfn rhadd(short4 x, short4 y);
+ushort4 __ovld __cnfn rhadd(ushort4 x, ushort4 y);
+short8 __ovld __cnfn rhadd(short8 x, short8 y);
+ushort8 __ovld __cnfn rhadd(ushort8 x, ushort8 y);
+short16 __ovld __cnfn rhadd(short16 x, short16 y);
+ushort16 __ovld __cnfn rhadd(ushort16 x, ushort16 y);
+int __ovld __cnfn rhadd(int x, int y);
+uint __ovld __cnfn rhadd(uint x, uint y);
+int2 __ovld __cnfn rhadd(int2 x, int2 y);
+uint2 __ovld __cnfn rhadd(uint2 x, uint2 y);
+int3 __ovld __cnfn rhadd(int3 x, int3 y);
+uint3 __ovld __cnfn rhadd(uint3 x, uint3 y);
+int4 __ovld __cnfn rhadd(int4 x, int4 y);
+uint4 __ovld __cnfn rhadd(uint4 x, uint4 y);
+int8 __ovld __cnfn rhadd(int8 x, int8 y);
+uint8 __ovld __cnfn rhadd(uint8 x, uint8 y);
+int16 __ovld __cnfn rhadd(int16 x, int16 y);
+uint16 __ovld __cnfn rhadd(uint16 x, uint16 y);
+long __ovld __cnfn rhadd(long x, long y);
+ulong __ovld __cnfn rhadd(ulong x, ulong y);
+long2 __ovld __cnfn rhadd(long2 x, long2 y);
+ulong2 __ovld __cnfn rhadd(ulong2 x, ulong2 y);
+long3 __ovld __cnfn rhadd(long3 x, long3 y);
+ulong3 __ovld __cnfn rhadd(ulong3 x, ulong3 y);
+long4 __ovld __cnfn rhadd(long4 x, long4 y);
+ulong4 __ovld __cnfn rhadd(ulong4 x, ulong4 y);
+long8 __ovld __cnfn rhadd(long8 x, long8 y);
+ulong8 __ovld __cnfn rhadd(ulong8 x, ulong8 y);
+long16 __ovld __cnfn rhadd(long16 x, long16 y);
+ulong16 __ovld __cnfn rhadd(ulong16 x, ulong16 y);
+
+/**
+ * Returns min(max(x, minval), maxval).
+ * Results are undefined if minval > maxval.
+ */
+char __ovld __cnfn clamp(char x, char minval, char maxval);
+uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
+char2 __ovld __cnfn clamp(char2 x, char2 minval, char2 maxval);
+uchar2 __ovld __cnfn clamp(uchar2 x, uchar2 minval, uchar2 maxval);
+char3 __ovld __cnfn clamp(char3 x, char3 minval, char3 maxval);
+uchar3 __ovld __cnfn clamp(uchar3 x, uchar3 minval, uchar3 maxval);
+char4 __ovld __cnfn clamp(char4 x, char4 minval, char4 maxval);
+uchar4 __ovld __cnfn clamp(uchar4 x, uchar4 minval, uchar4 maxval);
+char8 __ovld __cnfn clamp(char8 x, char8 minval, char8 maxval);
+uchar8 __ovld __cnfn clamp(uchar8 x, uchar8 minval, uchar8 maxval);
+char16 __ovld __cnfn clamp(char16 x, char16 minval, char16 maxval);
+uchar16 __ovld __cnfn clamp(uchar16 x, uchar16 minval, uchar16 maxval);
+short __ovld __cnfn clamp(short x, short minval, short maxval);
+ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
+short2 __ovld __cnfn clamp(short2 x, short2 minval, short2 maxval);
+ushort2 __ovld __cnfn clamp(ushort2 x, ushort2 minval, ushort2 maxval);
+short3 __ovld __cnfn clamp(short3 x, short3 minval, short3 maxval);
+ushort3 __ovld __cnfn clamp(ushort3 x, ushort3 minval, ushort3 maxval);
+short4 __ovld __cnfn clamp(short4 x, short4 minval, short4 maxval);
+ushort4 __ovld __cnfn clamp(ushort4 x, ushort4 minval, ushort4 maxval);
+short8 __ovld __cnfn clamp(short8 x, short8 minval, short8 maxval);
+ushort8 __ovld __cnfn clamp(ushort8 x, ushort8 minval, ushort8 maxval);
+short16 __ovld __cnfn clamp(short16 x, short16 minval, short16 maxval);
+ushort16 __ovld __cnfn clamp(ushort16 x, ushort16 minval, ushort16 maxval);
+int __ovld __cnfn clamp(int x, int minval, int maxval);
+uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
+int2 __ovld __cnfn clamp(int2 x, int2 minval, int2 maxval);
+uint2 __ovld __cnfn clamp(uint2 x, uint2 minval, uint2 maxval);
+int3 __ovld __cnfn clamp(int3 x, int3 minval, int3 maxval);
+uint3 __ovld __cnfn clamp(uint3 x, uint3 minval, uint3 maxval);
+int4 __ovld __cnfn clamp(int4 x, int4 minval, int4 maxval);
+uint4 __ovld __cnfn clamp(uint4 x, uint4 minval, uint4 maxval);
+int8 __ovld __cnfn clamp(int8 x, int8 minval, int8 maxval);
+uint8 __ovld __cnfn clamp(uint8 x, uint8 minval, uint8 maxval);
+int16 __ovld __cnfn clamp(int16 x, int16 minval, int16 maxval);
+uint16 __ovld __cnfn clamp(uint16 x, uint16 minval, uint16 maxval);
+long __ovld __cnfn clamp(long x, long minval, long maxval);
+ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
+long2 __ovld __cnfn clamp(long2 x, long2 minval, long2 maxval);
+ulong2 __ovld __cnfn clamp(ulong2 x, ulong2 minval, ulong2 maxval);
+long3 __ovld __cnfn clamp(long3 x, long3 minval, long3 maxval);
+ulong3 __ovld __cnfn clamp(ulong3 x, ulong3 minval, ulong3 maxval);
+long4 __ovld __cnfn clamp(long4 x, long4 minval, long4 maxval);
+ulong4 __ovld __cnfn clamp(ulong4 x, ulong4 minval, ulong4 maxval);
+long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval);
+ulong8 __ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval);
+long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval);
+ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval);
+
+/**
+ * Overloads of clamp for a vector x with scalar minval and
+ * maxval: returns min(max(x, minval), maxval).
+ * Results are undefined if minval > maxval.
+ * The all-scalar forms are already declared above and are
+ * not repeated here.
+ */
+char2 __ovld __cnfn clamp(char2 x, char minval, char maxval);
+uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval);
+char3 __ovld __cnfn clamp(char3 x, char minval, char maxval);
+uchar3 __ovld __cnfn clamp(uchar3 x, uchar minval, uchar maxval);
+char4 __ovld __cnfn clamp(char4 x, char minval, char maxval);
+uchar4 __ovld __cnfn clamp(uchar4 x, uchar minval, uchar maxval);
+char8 __ovld __cnfn clamp(char8 x, char minval, char maxval);
+uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval);
+char16 __ovld __cnfn clamp(char16 x, char minval, char maxval);
+uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval);
+short2 __ovld __cnfn clamp(short2 x, short minval, short maxval);
+ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval);
+short3 __ovld __cnfn clamp(short3 x, short minval, short maxval);
+ushort3 __ovld __cnfn clamp(ushort3 x, ushort minval, ushort maxval);
+short4 __ovld __cnfn clamp(short4 x, short minval, short maxval);
+ushort4 __ovld __cnfn clamp(ushort4 x, ushort minval, ushort maxval);
+short8 __ovld __cnfn clamp(short8 x, short minval, short maxval);
+ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval);
+short16 __ovld __cnfn clamp(short16 x, short minval, short maxval);
+ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval);
+int2 __ovld __cnfn clamp(int2 x, int minval, int maxval);
+uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval);
+int3 __ovld __cnfn clamp(int3 x, int minval, int maxval);
+uint3 __ovld __cnfn clamp(uint3 x, uint minval, uint maxval);
+int4 __ovld __cnfn clamp(int4 x, int minval, int maxval);
+uint4 __ovld __cnfn clamp(uint4 x, uint minval, uint maxval);
+int8 __ovld __cnfn clamp(int8 x, int minval, int maxval);
+uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval);
+int16 __ovld __cnfn clamp(int16 x, int minval, int maxval);
+uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval);
+long2 __ovld __cnfn clamp(long2 x, long minval, long maxval);
+ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval);
+long3 __ovld __cnfn clamp(long3 x, long minval, long maxval);
+ulong3 __ovld __cnfn clamp(ulong3 x, ulong minval, ulong maxval);
+long4 __ovld __cnfn clamp(long4 x, long minval, long maxval);
+ulong4 __ovld __cnfn clamp(ulong4 x, ulong minval, ulong maxval);
+long8 __ovld __cnfn clamp(long8 x, long minval, long maxval);
+ulong8 __ovld __cnfn clamp(ulong8 x, ulong minval, ulong maxval);
+long16 __ovld __cnfn clamp(long16 x, long minval, long maxval);
+ulong16 __ovld __cnfn clamp(ulong16 x, ulong minval, ulong maxval);
+
+/**
+ * Returns the number of leading 0-bits in x, starting
+ * at the most significant bit position.
+ */
+char __ovld __cnfn clz(char x);
+uchar __ovld __cnfn clz(uchar x);
+char2 __ovld __cnfn clz(char2 x);
+uchar2 __ovld __cnfn clz(uchar2 x);
+char3 __ovld __cnfn clz(char3 x);
+uchar3 __ovld __cnfn clz(uchar3 x);
+char4 __ovld __cnfn clz(char4 x);
+uchar4 __ovld __cnfn clz(uchar4 x);
+char8 __ovld __cnfn clz(char8 x);
+uchar8 __ovld __cnfn clz(uchar8 x);
+char16 __ovld __cnfn clz(char16 x);
+uchar16 __ovld __cnfn clz(uchar16 x);
+short __ovld __cnfn clz(short x);
+ushort __ovld __cnfn clz(ushort x);
+short2 __ovld __cnfn clz(short2 x);
+ushort2 __ovld __cnfn clz(ushort2 x);
+short3 __ovld __cnfn clz(short3 x);
+ushort3 __ovld __cnfn clz(ushort3 x);
+short4 __ovld __cnfn clz(short4 x);
+ushort4 __ovld __cnfn clz(ushort4 x);
+short8 __ovld __cnfn clz(short8 x);
+ushort8 __ovld __cnfn clz(ushort8 x);
+short16 __ovld __cnfn clz(short16 x);
+ushort16 __ovld __cnfn clz(ushort16 x);
+int __ovld __cnfn clz(int x);
+uint __ovld __cnfn clz(uint x);
+int2 __ovld __cnfn clz(int2 x);
+uint2 __ovld __cnfn clz(uint2 x);
+int3 __ovld __cnfn clz(int3 x);
+uint3 __ovld __cnfn clz(uint3 x);
+int4 __ovld __cnfn clz(int4 x);
+uint4 __ovld __cnfn clz(uint4 x);
+int8 __ovld __cnfn clz(int8 x);
+uint8 __ovld __cnfn clz(uint8 x);
+int16 __ovld __cnfn clz(int16 x);
+uint16 __ovld __cnfn clz(uint16 x);
+long __ovld __cnfn clz(long x);
+ulong __ovld __cnfn clz(ulong x);
+long2 __ovld __cnfn clz(long2 x);
+ulong2 __ovld __cnfn clz(ulong2 x);
+long3 __ovld __cnfn clz(long3 x);
+ulong3 __ovld __cnfn clz(ulong3 x);
+long4 __ovld __cnfn clz(long4 x);
+ulong4 __ovld __cnfn clz(ulong4 x);
+long8 __ovld __cnfn clz(long8 x);
+ulong8 __ovld __cnfn clz(ulong8 x);
+long16 __ovld __cnfn clz(long16 x);
+ulong16 __ovld __cnfn clz(ulong16 x);
+
+/**
+ * Returns the count of trailing 0-bits in x. If x is 0, returns
+ * the size in bits of the type of x (or of its component type
+ * if x is a vector). Declared only for OpenCL C 2.0 and later.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+char __ovld __cnfn ctz(char x);
+uchar __ovld __cnfn ctz(uchar x);
+char2 __ovld __cnfn ctz(char2 x);
+uchar2 __ovld __cnfn ctz(uchar2 x);
+char3 __ovld __cnfn ctz(char3 x);
+uchar3 __ovld __cnfn ctz(uchar3 x);
+char4 __ovld __cnfn ctz(char4 x);
+uchar4 __ovld __cnfn ctz(uchar4 x);
+char8 __ovld __cnfn ctz(char8 x);
+uchar8 __ovld __cnfn ctz(uchar8 x);
+char16 __ovld __cnfn ctz(char16 x);
+uchar16 __ovld __cnfn ctz(uchar16 x);
+short __ovld __cnfn ctz(short x);
+ushort __ovld __cnfn ctz(ushort x);
+short2 __ovld __cnfn ctz(short2 x);
+ushort2 __ovld __cnfn ctz(ushort2 x);
+short3 __ovld __cnfn ctz(short3 x);
+ushort3 __ovld __cnfn ctz(ushort3 x);
+short4 __ovld __cnfn ctz(short4 x);
+ushort4 __ovld __cnfn ctz(ushort4 x);
+short8 __ovld __cnfn ctz(short8 x);
+ushort8 __ovld __cnfn ctz(ushort8 x);
+short16 __ovld __cnfn ctz(short16 x);
+ushort16 __ovld __cnfn ctz(ushort16 x);
+int __ovld __cnfn ctz(int x);
+uint __ovld __cnfn ctz(uint x);
+int2 __ovld __cnfn ctz(int2 x);
+uint2 __ovld __cnfn ctz(uint2 x);
+int3 __ovld __cnfn ctz(int3 x);
+uint3 __ovld __cnfn ctz(uint3 x);
+int4 __ovld __cnfn ctz(int4 x);
+uint4 __ovld __cnfn ctz(uint4 x);
+int8 __ovld __cnfn ctz(int8 x);
+uint8 __ovld __cnfn ctz(uint8 x);
+int16 __ovld __cnfn ctz(int16 x);
+uint16 __ovld __cnfn ctz(uint16 x);
+long __ovld __cnfn ctz(long x);
+ulong __ovld __cnfn ctz(ulong x);
+long2 __ovld __cnfn ctz(long2 x);
+ulong2 __ovld __cnfn ctz(ulong2 x);
+long3 __ovld __cnfn ctz(long3 x);
+ulong3 __ovld __cnfn ctz(ulong3 x);
+long4 __ovld __cnfn ctz(long4 x);
+ulong4 __ovld __cnfn ctz(ulong4 x);
+long8 __ovld __cnfn ctz(long8 x);
+ulong8 __ovld __cnfn ctz(ulong8 x);
+long16 __ovld __cnfn ctz(long16 x);
+ulong16 __ovld __cnfn ctz(ulong16 x);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Returns mul_hi(a, b) + c.
+ */
+char __ovld __cnfn mad_hi(char a, char b, char c);
+uchar __ovld __cnfn mad_hi(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn mad_hi(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn mad_hi(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn mad_hi(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn mad_hi(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn mad_hi(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn mad_hi(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn mad_hi(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn mad_hi(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn mad_hi(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn mad_hi(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn mad_hi(short a, short b, short c);
+ushort __ovld __cnfn mad_hi(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn mad_hi(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn mad_hi(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn mad_hi(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn mad_hi(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn mad_hi(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn mad_hi(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn mad_hi(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn mad_hi(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn mad_hi(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn mad_hi(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn mad_hi(int a, int b, int c);
+uint __ovld __cnfn mad_hi(uint a, uint b, uint c);
+int2 __ovld __cnfn mad_hi(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn mad_hi(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn mad_hi(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn mad_hi(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn mad_hi(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn mad_hi(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn mad_hi(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn mad_hi(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn mad_hi(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn mad_hi(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn mad_hi(long a, long b, long c);
+ulong __ovld __cnfn mad_hi(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn mad_hi(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn mad_hi(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn mad_hi(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn mad_hi(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn mad_hi(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn mad_hi(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn mad_hi(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn mad_hi(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn mad_hi(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn mad_hi(ulong16 a, ulong16 b, ulong16 c);
+
+/**
+ * Returns a * b + c, saturating the result to the range of the return type.
+ */
+char __ovld __cnfn mad_sat(char a, char b, char c);
+uchar __ovld __cnfn mad_sat(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn mad_sat(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn mad_sat(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn mad_sat(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn mad_sat(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn mad_sat(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn mad_sat(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn mad_sat(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn mad_sat(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn mad_sat(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn mad_sat(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn mad_sat(short a, short b, short c);
+ushort __ovld __cnfn mad_sat(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn mad_sat(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn mad_sat(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn mad_sat(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn mad_sat(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn mad_sat(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn mad_sat(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn mad_sat(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn mad_sat(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn mad_sat(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn mad_sat(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn mad_sat(int a, int b, int c);
+uint __ovld __cnfn mad_sat(uint a, uint b, uint c);
+int2 __ovld __cnfn mad_sat(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn mad_sat(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn mad_sat(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn mad_sat(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn mad_sat(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn mad_sat(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn mad_sat(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn mad_sat(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn mad_sat(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn mad_sat(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn mad_sat(long a, long b, long c);
+ulong __ovld __cnfn mad_sat(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn mad_sat(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn mad_sat(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn mad_sat(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn mad_sat(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn mad_sat(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn mad_sat(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn mad_sat(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn mad_sat(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn mad_sat(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn mad_sat(ulong16 a, ulong16 b, ulong16 c);
+
+/**
+ * Returns y if x < y, otherwise x. y has the type of x or its element type.
+ */
+char __ovld __cnfn max(char x, char y);
+uchar __ovld __cnfn max(uchar x, uchar y);
+char2 __ovld __cnfn max(char2 x, char2 y);
+uchar2 __ovld __cnfn max(uchar2 x, uchar2 y);
+char3 __ovld __cnfn max(char3 x, char3 y);
+uchar3 __ovld __cnfn max(uchar3 x, uchar3 y);
+char4 __ovld __cnfn max(char4 x, char4 y);
+uchar4 __ovld __cnfn max(uchar4 x, uchar4 y);
+char8 __ovld __cnfn max(char8 x, char8 y);
+uchar8 __ovld __cnfn max(uchar8 x, uchar8 y);
+char16 __ovld __cnfn max(char16 x, char16 y);
+uchar16 __ovld __cnfn max(uchar16 x, uchar16 y);
+short __ovld __cnfn max(short x, short y);
+ushort __ovld __cnfn max(ushort x, ushort y);
+short2 __ovld __cnfn max(short2 x, short2 y);
+ushort2 __ovld __cnfn max(ushort2 x, ushort2 y);
+short3 __ovld __cnfn max(short3 x, short3 y);
+ushort3 __ovld __cnfn max(ushort3 x, ushort3 y);
+short4 __ovld __cnfn max(short4 x, short4 y);
+ushort4 __ovld __cnfn max(ushort4 x, ushort4 y);
+short8 __ovld __cnfn max(short8 x, short8 y);
+ushort8 __ovld __cnfn max(ushort8 x, ushort8 y);
+short16 __ovld __cnfn max(short16 x, short16 y);
+ushort16 __ovld __cnfn max(ushort16 x, ushort16 y);
+int __ovld __cnfn max(int x, int y);
+uint __ovld __cnfn max(uint x, uint y);
+int2 __ovld __cnfn max(int2 x, int2 y);
+uint2 __ovld __cnfn max(uint2 x, uint2 y);
+int3 __ovld __cnfn max(int3 x, int3 y);
+uint3 __ovld __cnfn max(uint3 x, uint3 y);
+int4 __ovld __cnfn max(int4 x, int4 y);
+uint4 __ovld __cnfn max(uint4 x, uint4 y);
+int8 __ovld __cnfn max(int8 x, int8 y);
+uint8 __ovld __cnfn max(uint8 x, uint8 y);
+int16 __ovld __cnfn max(int16 x, int16 y);
+uint16 __ovld __cnfn max(uint16 x, uint16 y);
+long __ovld __cnfn max(long x, long y);
+ulong __ovld __cnfn max(ulong x, ulong y);
+long2 __ovld __cnfn max(long2 x, long2 y);
+ulong2 __ovld __cnfn max(ulong2 x, ulong2 y);
+long3 __ovld __cnfn max(long3 x, long3 y);
+ulong3 __ovld __cnfn max(ulong3 x, ulong3 y);
+long4 __ovld __cnfn max(long4 x, long4 y);
+ulong4 __ovld __cnfn max(ulong4 x, ulong4 y);
+long8 __ovld __cnfn max(long8 x, long8 y);
+ulong8 __ovld __cnfn max(ulong8 x, ulong8 y);
+long16 __ovld __cnfn max(long16 x, long16 y);
+ulong16 __ovld __cnfn max(ulong16 x, ulong16 y);
+char __ovld __cnfn max(char x, char y);
+uchar __ovld __cnfn max(uchar x, uchar y);
+char2 __ovld __cnfn max(char2 x, char y);
+uchar2 __ovld __cnfn max(uchar2 x, uchar y);
+char3 __ovld __cnfn max(char3 x, char y);
+uchar3 __ovld __cnfn max(uchar3 x, uchar y);
+char4 __ovld __cnfn max(char4 x, char y);
+uchar4 __ovld __cnfn max(uchar4 x, uchar y);
+char8 __ovld __cnfn max(char8 x, char y);
+uchar8 __ovld __cnfn max(uchar8 x, uchar y);
+char16 __ovld __cnfn max(char16 x, char y);
+uchar16 __ovld __cnfn max(uchar16 x, uchar y);
+short __ovld __cnfn max(short x, short y);
+ushort __ovld __cnfn max(ushort x, ushort y);
+short2 __ovld __cnfn max(short2 x, short y);
+ushort2 __ovld __cnfn max(ushort2 x, ushort y);
+short3 __ovld __cnfn max(short3 x, short y);
+ushort3 __ovld __cnfn max(ushort3 x, ushort y);
+short4 __ovld __cnfn max(short4 x, short y);
+ushort4 __ovld __cnfn max(ushort4 x, ushort y);
+short8 __ovld __cnfn max(short8 x, short y);
+ushort8 __ovld __cnfn max(ushort8 x, ushort y);
+short16 __ovld __cnfn max(short16 x, short y);
+ushort16 __ovld __cnfn max(ushort16 x, ushort y);
+int __ovld __cnfn max(int x, int y);
+uint __ovld __cnfn max(uint x, uint y);
+int2 __ovld __cnfn max(int2 x, int y);
+uint2 __ovld __cnfn max(uint2 x, uint y);
+int3 __ovld __cnfn max(int3 x, int y);
+uint3 __ovld __cnfn max(uint3 x, uint y);
+int4 __ovld __cnfn max(int4 x, int y);
+uint4 __ovld __cnfn max(uint4 x, uint y);
+int8 __ovld __cnfn max(int8 x, int y);
+uint8 __ovld __cnfn max(uint8 x, uint y);
+int16 __ovld __cnfn max(int16 x, int y);
+uint16 __ovld __cnfn max(uint16 x, uint y);
+long __ovld __cnfn max(long x, long y);
+ulong __ovld __cnfn max(ulong x, ulong y);
+long2 __ovld __cnfn max(long2 x, long y);
+ulong2 __ovld __cnfn max(ulong2 x, ulong y);
+long3 __ovld __cnfn max(long3 x, long y);
+ulong3 __ovld __cnfn max(ulong3 x, ulong y);
+long4 __ovld __cnfn max(long4 x, long y);
+ulong4 __ovld __cnfn max(ulong4 x, ulong y);
+long8 __ovld __cnfn max(long8 x, long y);
+ulong8 __ovld __cnfn max(ulong8 x, ulong y);
+long16 __ovld __cnfn max(long16 x, long y);
+ulong16 __ovld __cnfn max(ulong16 x, ulong y);
+
+/**
+ * Returns y if y < x, otherwise x. y has the type of x or its element type.
+ */
+char __ovld __cnfn min(char x, char y);
+uchar __ovld __cnfn min(uchar x, uchar y);
+char2 __ovld __cnfn min(char2 x, char2 y);
+uchar2 __ovld __cnfn min(uchar2 x, uchar2 y);
+char3 __ovld __cnfn min(char3 x, char3 y);
+uchar3 __ovld __cnfn min(uchar3 x, uchar3 y);
+char4 __ovld __cnfn min(char4 x, char4 y);
+uchar4 __ovld __cnfn min(uchar4 x, uchar4 y);
+char8 __ovld __cnfn min(char8 x, char8 y);
+uchar8 __ovld __cnfn min(uchar8 x, uchar8 y);
+char16 __ovld __cnfn min(char16 x, char16 y);
+uchar16 __ovld __cnfn min(uchar16 x, uchar16 y);
+short __ovld __cnfn min(short x, short y);
+ushort __ovld __cnfn min(ushort x, ushort y);
+short2 __ovld __cnfn min(short2 x, short2 y);
+ushort2 __ovld __cnfn min(ushort2 x, ushort2 y);
+short3 __ovld __cnfn min(short3 x, short3 y);
+ushort3 __ovld __cnfn min(ushort3 x, ushort3 y);
+short4 __ovld __cnfn min(short4 x, short4 y);
+ushort4 __ovld __cnfn min(ushort4 x, ushort4 y);
+short8 __ovld __cnfn min(short8 x, short8 y);
+ushort8 __ovld __cnfn min(ushort8 x, ushort8 y);
+short16 __ovld __cnfn min(short16 x, short16 y);
+ushort16 __ovld __cnfn min(ushort16 x, ushort16 y);
+int __ovld __cnfn min(int x, int y);
+uint __ovld __cnfn min(uint x, uint y);
+int2 __ovld __cnfn min(int2 x, int2 y);
+uint2 __ovld __cnfn min(uint2 x, uint2 y);
+int3 __ovld __cnfn min(int3 x, int3 y);
+uint3 __ovld __cnfn min(uint3 x, uint3 y);
+int4 __ovld __cnfn min(int4 x, int4 y);
+uint4 __ovld __cnfn min(uint4 x, uint4 y);
+int8 __ovld __cnfn min(int8 x, int8 y);
+uint8 __ovld __cnfn min(uint8 x, uint8 y);
+int16 __ovld __cnfn min(int16 x, int16 y);
+uint16 __ovld __cnfn min(uint16 x, uint16 y);
+long __ovld __cnfn min(long x, long y);
+ulong __ovld __cnfn min(ulong x, ulong y);
+long2 __ovld __cnfn min(long2 x, long2 y);
+ulong2 __ovld __cnfn min(ulong2 x, ulong2 y);
+long3 __ovld __cnfn min(long3 x, long3 y);
+ulong3 __ovld __cnfn min(ulong3 x, ulong3 y);
+long4 __ovld __cnfn min(long4 x, long4 y);
+ulong4 __ovld __cnfn min(ulong4 x, ulong4 y);
+long8 __ovld __cnfn min(long8 x, long8 y);
+ulong8 __ovld __cnfn min(ulong8 x, ulong8 y);
+long16 __ovld __cnfn min(long16 x, long16 y);
+ulong16 __ovld __cnfn min(ulong16 x, ulong16 y);
+char __ovld __cnfn min(char x, char y);
+uchar __ovld __cnfn min(uchar x, uchar y);
+char2 __ovld __cnfn min(char2 x, char y);
+uchar2 __ovld __cnfn min(uchar2 x, uchar y);
+char3 __ovld __cnfn min(char3 x, char y);
+uchar3 __ovld __cnfn min(uchar3 x, uchar y);
+char4 __ovld __cnfn min(char4 x, char y);
+uchar4 __ovld __cnfn min(uchar4 x, uchar y);
+char8 __ovld __cnfn min(char8 x, char y);
+uchar8 __ovld __cnfn min(uchar8 x, uchar y);
+char16 __ovld __cnfn min(char16 x, char y);
+uchar16 __ovld __cnfn min(uchar16 x, uchar y);
+short __ovld __cnfn min(short x, short y);
+ushort __ovld __cnfn min(ushort x, ushort y);
+short2 __ovld __cnfn min(short2 x, short y);
+ushort2 __ovld __cnfn min(ushort2 x, ushort y);
+short3 __ovld __cnfn min(short3 x, short y);
+ushort3 __ovld __cnfn min(ushort3 x, ushort y);
+short4 __ovld __cnfn min(short4 x, short y);
+ushort4 __ovld __cnfn min(ushort4 x, ushort y);
+short8 __ovld __cnfn min(short8 x, short y);
+ushort8 __ovld __cnfn min(ushort8 x, ushort y);
+short16 __ovld __cnfn min(short16 x, short y);
+ushort16 __ovld __cnfn min(ushort16 x, ushort y);
+int __ovld __cnfn min(int x, int y);
+uint __ovld __cnfn min(uint x, uint y);
+int2 __ovld __cnfn min(int2 x, int y);
+uint2 __ovld __cnfn min(uint2 x, uint y);
+int3 __ovld __cnfn min(int3 x, int y);
+uint3 __ovld __cnfn min(uint3 x, uint y);
+int4 __ovld __cnfn min(int4 x, int y);
+uint4 __ovld __cnfn min(uint4 x, uint y);
+int8 __ovld __cnfn min(int8 x, int y);
+uint8 __ovld __cnfn min(uint8 x, uint y);
+int16 __ovld __cnfn min(int16 x, int y);
+uint16 __ovld __cnfn min(uint16 x, uint y);
+long __ovld __cnfn min(long x, long y);
+ulong __ovld __cnfn min(ulong x, ulong y);
+long2 __ovld __cnfn min(long2 x, long y);
+ulong2 __ovld __cnfn min(ulong2 x, ulong y);
+long3 __ovld __cnfn min(long3 x, long y);
+ulong3 __ovld __cnfn min(ulong3 x, ulong y);
+long4 __ovld __cnfn min(long4 x, long y);
+ulong4 __ovld __cnfn min(ulong4 x, ulong y);
+long8 __ovld __cnfn min(long8 x, long y);
+ulong8 __ovld __cnfn min(ulong8 x, ulong y);
+long16 __ovld __cnfn min(long16 x, long y);
+ulong16 __ovld __cnfn min(ulong16 x, ulong y);
+
+/**
+ * Computes the full-width product x * y and returns its high half,
+ * i.e. the upper bits that do not fit in the type of x and y.
+ */
+char __ovld __cnfn mul_hi(char x, char y);
+uchar __ovld __cnfn mul_hi(uchar x, uchar y);
+char2 __ovld __cnfn mul_hi(char2 x, char2 y);
+uchar2 __ovld __cnfn mul_hi(uchar2 x, uchar2 y);
+char3 __ovld __cnfn mul_hi(char3 x, char3 y);
+uchar3 __ovld __cnfn mul_hi(uchar3 x, uchar3 y);
+char4 __ovld __cnfn mul_hi(char4 x, char4 y);
+uchar4 __ovld __cnfn mul_hi(uchar4 x, uchar4 y);
+char8 __ovld __cnfn mul_hi(char8 x, char8 y);
+uchar8 __ovld __cnfn mul_hi(uchar8 x, uchar8 y);
+char16 __ovld __cnfn mul_hi(char16 x, char16 y);
+uchar16 __ovld __cnfn mul_hi(uchar16 x, uchar16 y);
+short __ovld __cnfn mul_hi(short x, short y);
+ushort __ovld __cnfn mul_hi(ushort x, ushort y);
+short2 __ovld __cnfn mul_hi(short2 x, short2 y);
+ushort2 __ovld __cnfn mul_hi(ushort2 x, ushort2 y);
+short3 __ovld __cnfn mul_hi(short3 x, short3 y);
+ushort3 __ovld __cnfn mul_hi(ushort3 x, ushort3 y);
+short4 __ovld __cnfn mul_hi(short4 x, short4 y);
+ushort4 __ovld __cnfn mul_hi(ushort4 x, ushort4 y);
+short8 __ovld __cnfn mul_hi(short8 x, short8 y);
+ushort8 __ovld __cnfn mul_hi(ushort8 x, ushort8 y);
+short16 __ovld __cnfn mul_hi(short16 x, short16 y);
+ushort16 __ovld __cnfn mul_hi(ushort16 x, ushort16 y);
+int __ovld __cnfn mul_hi(int x, int y);
+uint __ovld __cnfn mul_hi(uint x, uint y);
+int2 __ovld __cnfn mul_hi(int2 x, int2 y);
+uint2 __ovld __cnfn mul_hi(uint2 x, uint2 y);
+int3 __ovld __cnfn mul_hi(int3 x, int3 y);
+uint3 __ovld __cnfn mul_hi(uint3 x, uint3 y);
+int4 __ovld __cnfn mul_hi(int4 x, int4 y);
+uint4 __ovld __cnfn mul_hi(uint4 x, uint4 y);
+int8 __ovld __cnfn mul_hi(int8 x, int8 y);
+uint8 __ovld __cnfn mul_hi(uint8 x, uint8 y);
+int16 __ovld __cnfn mul_hi(int16 x, int16 y);
+uint16 __ovld __cnfn mul_hi(uint16 x, uint16 y);
+long __ovld __cnfn mul_hi(long x, long y);
+ulong __ovld __cnfn mul_hi(ulong x, ulong y);
+long2 __ovld __cnfn mul_hi(long2 x, long2 y);
+ulong2 __ovld __cnfn mul_hi(ulong2 x, ulong2 y);
+long3 __ovld __cnfn mul_hi(long3 x, long3 y);
+ulong3 __ovld __cnfn mul_hi(ulong3 x, ulong3 y);
+long4 __ovld __cnfn mul_hi(long4 x, long4 y);
+ulong4 __ovld __cnfn mul_hi(ulong4 x, ulong4 y);
+long8 __ovld __cnfn mul_hi(long8 x, long8 y);
+ulong8 __ovld __cnfn mul_hi(ulong8 x, ulong8 y);
+long16 __ovld __cnfn mul_hi(long16 x, long16 y);
+ulong16 __ovld __cnfn mul_hi(ulong16 x, ulong16 y);
+
+/**
+ * Left-rotates each element of v. For each element in v, the bits
+ * are shifted left by the number of bits given by the
+ * corresponding element in i (subject to the usual shift modulo
+ * rules described in section 6.3 of the OpenCL C specification).
+ * Bits shifted off the left side of the element are shifted back in
+ * from the right.
+ */
+char __ovld __cnfn rotate(char v, char i);
+uchar __ovld __cnfn rotate(uchar v, uchar i);
+char2 __ovld __cnfn rotate(char2 v, char2 i);
+uchar2 __ovld __cnfn rotate(uchar2 v, uchar2 i);
+char3 __ovld __cnfn rotate(char3 v, char3 i);
+uchar3 __ovld __cnfn rotate(uchar3 v, uchar3 i);
+char4 __ovld __cnfn rotate(char4 v, char4 i);
+uchar4 __ovld __cnfn rotate(uchar4 v, uchar4 i);
+char8 __ovld __cnfn rotate(char8 v, char8 i);
+uchar8 __ovld __cnfn rotate(uchar8 v, uchar8 i);
+char16 __ovld __cnfn rotate(char16 v, char16 i);
+uchar16 __ovld __cnfn rotate(uchar16 v, uchar16 i);
+short __ovld __cnfn rotate(short v, short i);
+ushort __ovld __cnfn rotate(ushort v, ushort i);
+short2 __ovld __cnfn rotate(short2 v, short2 i);
+ushort2 __ovld __cnfn rotate(ushort2 v, ushort2 i);
+short3 __ovld __cnfn rotate(short3 v, short3 i);
+ushort3 __ovld __cnfn rotate(ushort3 v, ushort3 i);
+short4 __ovld __cnfn rotate(short4 v, short4 i);
+ushort4 __ovld __cnfn rotate(ushort4 v, ushort4 i);
+short8 __ovld __cnfn rotate(short8 v, short8 i);
+ushort8 __ovld __cnfn rotate(ushort8 v, ushort8 i);
+short16 __ovld __cnfn rotate(short16 v, short16 i);
+ushort16 __ovld __cnfn rotate(ushort16 v, ushort16 i);
+int __ovld __cnfn rotate(int v, int i);
+uint __ovld __cnfn rotate(uint v, uint i);
+int2 __ovld __cnfn rotate(int2 v, int2 i);
+uint2 __ovld __cnfn rotate(uint2 v, uint2 i);
+int3 __ovld __cnfn rotate(int3 v, int3 i);
+uint3 __ovld __cnfn rotate(uint3 v, uint3 i);
+int4 __ovld __cnfn rotate(int4 v, int4 i);
+uint4 __ovld __cnfn rotate(uint4 v, uint4 i);
+int8 __ovld __cnfn rotate(int8 v, int8 i);
+uint8 __ovld __cnfn rotate(uint8 v, uint8 i);
+int16 __ovld __cnfn rotate(int16 v, int16 i);
+uint16 __ovld __cnfn rotate(uint16 v, uint16 i);
+long __ovld __cnfn rotate(long v, long i);
+ulong __ovld __cnfn rotate(ulong v, ulong i);
+long2 __ovld __cnfn rotate(long2 v, long2 i);
+ulong2 __ovld __cnfn rotate(ulong2 v, ulong2 i);
+long3 __ovld __cnfn rotate(long3 v, long3 i);
+ulong3 __ovld __cnfn rotate(ulong3 v, ulong3 i);
+long4 __ovld __cnfn rotate(long4 v, long4 i);
+ulong4 __ovld __cnfn rotate(ulong4 v, ulong4 i);
+long8 __ovld __cnfn rotate(long8 v, long8 i);
+ulong8 __ovld __cnfn rotate(ulong8 v, ulong8 i);
+long16 __ovld __cnfn rotate(long16 v, long16 i);
+ulong16 __ovld __cnfn rotate(ulong16 v, ulong16 i);
+
+/**
+ * Returns x - y, saturating the result to the range of the return type.
+ */
+char __ovld __cnfn sub_sat(char x, char y);
+uchar __ovld __cnfn sub_sat(uchar x, uchar y);
+char2 __ovld __cnfn sub_sat(char2 x, char2 y);
+uchar2 __ovld __cnfn sub_sat(uchar2 x, uchar2 y);
+char3 __ovld __cnfn sub_sat(char3 x, char3 y);
+uchar3 __ovld __cnfn sub_sat(uchar3 x, uchar3 y);
+char4 __ovld __cnfn sub_sat(char4 x, char4 y);
+uchar4 __ovld __cnfn sub_sat(uchar4 x, uchar4 y);
+char8 __ovld __cnfn sub_sat(char8 x, char8 y);
+uchar8 __ovld __cnfn sub_sat(uchar8 x, uchar8 y);
+char16 __ovld __cnfn sub_sat(char16 x, char16 y);
+uchar16 __ovld __cnfn sub_sat(uchar16 x, uchar16 y);
+short __ovld __cnfn sub_sat(short x, short y);
+ushort __ovld __cnfn sub_sat(ushort x, ushort y);
+short2 __ovld __cnfn sub_sat(short2 x, short2 y);
+ushort2 __ovld __cnfn sub_sat(ushort2 x, ushort2 y);
+short3 __ovld __cnfn sub_sat(short3 x, short3 y);
+ushort3 __ovld __cnfn sub_sat(ushort3 x, ushort3 y);
+short4 __ovld __cnfn sub_sat(short4 x, short4 y);
+ushort4 __ovld __cnfn sub_sat(ushort4 x, ushort4 y);
+short8 __ovld __cnfn sub_sat(short8 x, short8 y);
+ushort8 __ovld __cnfn sub_sat(ushort8 x, ushort8 y);
+short16 __ovld __cnfn sub_sat(short16 x, short16 y);
+ushort16 __ovld __cnfn sub_sat(ushort16 x, ushort16 y);
+int __ovld __cnfn sub_sat(int x, int y);
+uint __ovld __cnfn sub_sat(uint x, uint y);
+int2 __ovld __cnfn sub_sat(int2 x, int2 y);
+uint2 __ovld __cnfn sub_sat(uint2 x, uint2 y);
+int3 __ovld __cnfn sub_sat(int3 x, int3 y);
+uint3 __ovld __cnfn sub_sat(uint3 x, uint3 y);
+int4 __ovld __cnfn sub_sat(int4 x, int4 y);
+uint4 __ovld __cnfn sub_sat(uint4 x, uint4 y);
+int8 __ovld __cnfn sub_sat(int8 x, int8 y);
+uint8 __ovld __cnfn sub_sat(uint8 x, uint8 y);
+int16 __ovld __cnfn sub_sat(int16 x, int16 y);
+uint16 __ovld __cnfn sub_sat(uint16 x, uint16 y);
+long __ovld __cnfn sub_sat(long x, long y);
+ulong __ovld __cnfn sub_sat(ulong x, ulong y);
+long2 __ovld __cnfn sub_sat(long2 x, long2 y);
+ulong2 __ovld __cnfn sub_sat(ulong2 x, ulong2 y);
+long3 __ovld __cnfn sub_sat(long3 x, long3 y);
+ulong3 __ovld __cnfn sub_sat(ulong3 x, ulong3 y);
+long4 __ovld __cnfn sub_sat(long4 x, long4 y);
+ulong4 __ovld __cnfn sub_sat(ulong4 x, ulong4 y);
+long8 __ovld __cnfn sub_sat(long8 x, long8 y);
+ulong8 __ovld __cnfn sub_sat(ulong8 x, ulong8 y);
+long16 __ovld __cnfn sub_sat(long16 x, long16 y);
+ulong16 __ovld __cnfn sub_sat(ulong16 x, ulong16 y);
+
+/**
+ * char hi:  result[i] = ((short)hi[i] << 8) | lo[i]
+ * uchar hi: result[i] = ((ushort)hi[i] << 8) | lo[i]
+ */
+short __ovld __cnfn upsample(char hi, uchar lo);
+ushort __ovld __cnfn upsample(uchar hi, uchar lo);
+short2 __ovld __cnfn upsample(char2 hi, uchar2 lo);
+short3 __ovld __cnfn upsample(char3 hi, uchar3 lo);
+short4 __ovld __cnfn upsample(char4 hi, uchar4 lo);
+short8 __ovld __cnfn upsample(char8 hi, uchar8 lo);
+short16 __ovld __cnfn upsample(char16 hi, uchar16 lo);
+ushort2 __ovld __cnfn upsample(uchar2 hi, uchar2 lo);
+ushort3 __ovld __cnfn upsample(uchar3 hi, uchar3 lo);
+ushort4 __ovld __cnfn upsample(uchar4 hi, uchar4 lo);
+ushort8 __ovld __cnfn upsample(uchar8 hi, uchar8 lo);
+ushort16 __ovld __cnfn upsample(uchar16 hi, uchar16 lo);
+
+/**
+ * short hi:  result[i] = ((int)hi[i] << 16) | lo[i]
+ * ushort hi: result[i] = ((uint)hi[i] << 16) | lo[i]
+ */
+int __ovld __cnfn upsample(short hi, ushort lo);
+uint __ovld __cnfn upsample(ushort hi, ushort lo);
+int2 __ovld __cnfn upsample(short2 hi, ushort2 lo);
+int3 __ovld __cnfn upsample(short3 hi, ushort3 lo);
+int4 __ovld __cnfn upsample(short4 hi, ushort4 lo);
+int8 __ovld __cnfn upsample(short8 hi, ushort8 lo);
+int16 __ovld __cnfn upsample(short16 hi, ushort16 lo);
+uint2 __ovld __cnfn upsample(ushort2 hi, ushort2 lo);
+uint3 __ovld __cnfn upsample(ushort3 hi, ushort3 lo);
+uint4 __ovld __cnfn upsample(ushort4 hi, ushort4 lo);
+uint8 __ovld __cnfn upsample(ushort8 hi, ushort8 lo);
+uint16 __ovld __cnfn upsample(ushort16 hi, ushort16 lo);
+/**
+ * int hi:  result[i] = ((long)hi[i] << 32) | lo[i]
+ * uint hi: result[i] = ((ulong)hi[i] << 32) | lo[i]
+ */
+long __ovld __cnfn upsample(int hi, uint lo);
+ulong __ovld __cnfn upsample(uint hi, uint lo);
+long2 __ovld __cnfn upsample(int2 hi, uint2 lo);
+long3 __ovld __cnfn upsample(int3 hi, uint3 lo);
+long4 __ovld __cnfn upsample(int4 hi, uint4 lo);
+long8 __ovld __cnfn upsample(int8 hi, uint8 lo);
+long16 __ovld __cnfn upsample(int16 hi, uint16 lo);
+ulong2 __ovld __cnfn upsample(uint2 hi, uint2 lo);
+ulong3 __ovld __cnfn upsample(uint3 hi, uint3 lo);
+ulong4 __ovld __cnfn upsample(uint4 hi, uint4 lo);
+ulong8 __ovld __cnfn upsample(uint8 hi, uint8 lo);
+ulong16 __ovld __cnfn upsample(uint16 hi, uint16 lo);
+
+/**
+ * popcount(x): returns the number of set bits in x
+ */
+char __ovld __cnfn popcount(char x);
+uchar __ovld __cnfn popcount(uchar x);
+char2 __ovld __cnfn popcount(char2 x);
+uchar2 __ovld __cnfn popcount(uchar2 x);
+char3 __ovld __cnfn popcount(char3 x);
+uchar3 __ovld __cnfn popcount(uchar3 x);
+char4 __ovld __cnfn popcount(char4 x);
+uchar4 __ovld __cnfn popcount(uchar4 x);
+char8 __ovld __cnfn popcount(char8 x);
+uchar8 __ovld __cnfn popcount(uchar8 x);
+char16 __ovld __cnfn popcount(char16 x);
+uchar16 __ovld __cnfn popcount(uchar16 x);
+short __ovld __cnfn popcount(short x);
+ushort __ovld __cnfn popcount(ushort x);
+short2 __ovld __cnfn popcount(short2 x);
+ushort2 __ovld __cnfn popcount(ushort2 x);
+short3 __ovld __cnfn popcount(short3 x);
+ushort3 __ovld __cnfn popcount(ushort3 x);
+short4 __ovld __cnfn popcount(short4 x);
+ushort4 __ovld __cnfn popcount(ushort4 x);
+short8 __ovld __cnfn popcount(short8 x);
+ushort8 __ovld __cnfn popcount(ushort8 x);
+short16 __ovld __cnfn popcount(short16 x);
+ushort16 __ovld __cnfn popcount(ushort16 x);
+int __ovld __cnfn popcount(int x);
+uint __ovld __cnfn popcount(uint x);
+int2 __ovld __cnfn popcount(int2 x);
+uint2 __ovld __cnfn popcount(uint2 x);
+int3 __ovld __cnfn popcount(int3 x);
+uint3 __ovld __cnfn popcount(uint3 x);
+int4 __ovld __cnfn popcount(int4 x);
+uint4 __ovld __cnfn popcount(uint4 x);
+int8 __ovld __cnfn popcount(int8 x);
+uint8 __ovld __cnfn popcount(uint8 x);
+int16 __ovld __cnfn popcount(int16 x);
+uint16 __ovld __cnfn popcount(uint16 x);
+long __ovld __cnfn popcount(long x);
+ulong __ovld __cnfn popcount(ulong x);
+long2 __ovld __cnfn popcount(long2 x);
+ulong2 __ovld __cnfn popcount(ulong2 x);
+long3 __ovld __cnfn popcount(long3 x);
+ulong3 __ovld __cnfn popcount(ulong3 x);
+long4 __ovld __cnfn popcount(long4 x);
+ulong4 __ovld __cnfn popcount(ulong4 x);
+long8 __ovld __cnfn popcount(long8 x);
+ulong8 __ovld __cnfn popcount(ulong8 x);
+long16 __ovld __cnfn popcount(long16 x);
+ulong16 __ovld __cnfn popcount(ulong16 x);
+
+/**
+ * Multiply two 24-bit integer values x and y and add
+ * the 32-bit integer result to the 32-bit integer z,
+ * i.e. compute mul24(x, y) + z. Refer to the definition
+ * of mul24 to see how the multiplication is performed.
+ */
+int __ovld __cnfn mad24(int x, int y, int z);
+uint __ovld __cnfn mad24(uint x, uint y, uint z);
+int2 __ovld __cnfn mad24(int2 x, int2 y, int2 z);
+uint2 __ovld __cnfn mad24(uint2 x, uint2 y, uint2 z);
+int3 __ovld __cnfn mad24(int3 x, int3 y, int3 z);
+uint3 __ovld __cnfn mad24(uint3 x, uint3 y, uint3 z);
+int4 __ovld __cnfn mad24(int4 x, int4 y, int4 z);
+uint4 __ovld __cnfn mad24(uint4 x, uint4 y, uint4 z);
+int8 __ovld __cnfn mad24(int8 x, int8 y, int8 z);
+uint8 __ovld __cnfn mad24(uint8 x, uint8 y, uint8 z);
+int16 __ovld __cnfn mad24(int16 x, int16 y, int16 z);
+uint16 __ovld __cnfn mad24(uint16 x, uint16 y, uint16 z);
+
+/**
+ * Multiply two 24-bit integer values x and y. x and y
+ * are 32-bit integers but only the low 24 bits are used
+ * to perform the multiplication. mul24 should only be
+ * used when values in x and y are in the range
+ * [-2^23, 2^23-1] if x and y are signed integers and in
+ * the range [0, 2^24-1] if x and y are unsigned integers.
+ * If x and y are not in this range, the multiplication
+ * result is implementation-defined.
+ */
+int __ovld __cnfn mul24(int x, int y);
+uint __ovld __cnfn mul24(uint x, uint y);
+int2 __ovld __cnfn mul24(int2 x, int2 y);
+uint2 __ovld __cnfn mul24(uint2 x, uint2 y);
+int3 __ovld __cnfn mul24(int3 x, int3 y);
+uint3 __ovld __cnfn mul24(uint3 x, uint3 y);
+int4 __ovld __cnfn mul24(int4 x, int4 y);
+uint4 __ovld __cnfn mul24(uint4 x, uint4 y);
+int8 __ovld __cnfn mul24(int8 x, int8 y);
+uint8 __ovld __cnfn mul24(uint8 x, uint8 y);
+int16 __ovld __cnfn mul24(int16 x, int16 y);
+uint16 __ovld __cnfn mul24(uint16 x, uint16 y);
+
+// OpenCL v1.1 s6.11.4, v1.2 s6.12.4, v2.0 s6.13.4 - Common Functions
+
+/**
+ * Returns fmin(fmax(x, minval), maxval); for vector x the bounds may be
+ * scalars. Results are undefined if minval > maxval.
+ */
+float __ovld __cnfn clamp(float x, float minval, float maxval);
+float2 __ovld __cnfn clamp(float2 x, float2 minval, float2 maxval);
+float3 __ovld __cnfn clamp(float3 x, float3 minval, float3 maxval);
+float4 __ovld __cnfn clamp(float4 x, float4 minval, float4 maxval);
+float8 __ovld __cnfn clamp(float8 x, float8 minval, float8 maxval);
+float16 __ovld __cnfn clamp(float16 x, float16 minval, float16 maxval);
+float2 __ovld __cnfn clamp(float2 x, float minval, float maxval);
+float3 __ovld __cnfn clamp(float3 x, float minval, float maxval);
+float4 __ovld __cnfn clamp(float4 x, float minval, float maxval);
+float8 __ovld __cnfn clamp(float8 x, float minval, float maxval);
+float16 __ovld __cnfn clamp(float16 x, float minval, float maxval);
+#ifdef cl_khr_fp64
+double __ovld __cnfn clamp(double x, double minval, double maxval);
+double2 __ovld __cnfn clamp(double2 x, double2 minval, double2 maxval);
+double3 __ovld __cnfn clamp(double3 x, double3 minval, double3 maxval);
+double4 __ovld __cnfn clamp(double4 x, double4 minval, double4 maxval);
+double8 __ovld __cnfn clamp(double8 x, double8 minval, double8 maxval);
+double16 __ovld __cnfn clamp(double16 x, double16 minval, double16 maxval);
+double2 __ovld __cnfn clamp(double2 x, double minval, double maxval);
+double3 __ovld __cnfn clamp(double3 x, double minval, double maxval);
+double4 __ovld __cnfn clamp(double4 x, double minval, double maxval);
+double8 __ovld __cnfn clamp(double8 x, double minval, double maxval);
+double16 __ovld __cnfn clamp(double16 x, double minval, double maxval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn clamp(half x, half minval, half maxval);
+half2 __ovld __cnfn clamp(half2 x, half2 minval, half2 maxval);
+half3 __ovld __cnfn clamp(half3 x, half3 minval, half3 maxval);
+half4 __ovld __cnfn clamp(half4 x, half4 minval, half4 maxval);
+half8 __ovld __cnfn clamp(half8 x, half8 minval, half8 maxval);
+half16 __ovld __cnfn clamp(half16 x, half16 minval, half16 maxval);
+half2 __ovld __cnfn clamp(half2 x, half minval, half maxval);
+half3 __ovld __cnfn clamp(half3 x, half minval, half maxval);
+half4 __ovld __cnfn clamp(half4 x, half minval, half maxval);
+half8 __ovld __cnfn clamp(half8 x, half minval, half maxval);
+half16 __ovld __cnfn clamp(half16 x, half minval, half maxval);
+#endif //cl_khr_fp16
+
+/**
+ * Converts radians to degrees,
+ * i.e. (180 / PI) * radians.
+ */
+float __ovld __cnfn degrees(float radians);
+float2 __ovld __cnfn degrees(float2 radians);
+float3 __ovld __cnfn degrees(float3 radians);
+float4 __ovld __cnfn degrees(float4 radians);
+float8 __ovld __cnfn degrees(float8 radians);
+float16 __ovld __cnfn degrees(float16 radians);
+#ifdef cl_khr_fp64
+double __ovld __cnfn degrees(double radians);
+double2 __ovld __cnfn degrees(double2 radians);
+double3 __ovld __cnfn degrees(double3 radians);
+double4 __ovld __cnfn degrees(double4 radians);
+double8 __ovld __cnfn degrees(double8 radians);
+double16 __ovld __cnfn degrees(double16 radians);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn degrees(half radians);
+half2 __ovld __cnfn degrees(half2 radians);
+half3 __ovld __cnfn degrees(half3 radians);
+half4 __ovld __cnfn degrees(half4 radians);
+half8 __ovld __cnfn degrees(half8 radians);
+half16 __ovld __cnfn degrees(half16 radians);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if x < y, otherwise it returns x. If x or y
+ * are infinite or NaN, the return values are undefined.
+ */
+float __ovld __cnfn max(float x, float y);
+float2 __ovld __cnfn max(float2 x, float2 y);
+float3 __ovld __cnfn max(float3 x, float3 y);
+float4 __ovld __cnfn max(float4 x, float4 y);
+float8 __ovld __cnfn max(float8 x, float8 y);
+float16 __ovld __cnfn max(float16 x, float16 y);
+float2 __ovld __cnfn max(float2 x, float y);
+float3 __ovld __cnfn max(float3 x, float y);
+float4 __ovld __cnfn max(float4 x, float y);
+float8 __ovld __cnfn max(float8 x, float y);
+float16 __ovld __cnfn max(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn max(double x, double y);
+double2 __ovld __cnfn max(double2 x, double2 y);
+double3 __ovld __cnfn max(double3 x, double3 y);
+double4 __ovld __cnfn max(double4 x, double4 y);
+double8 __ovld __cnfn max(double8 x, double8 y);
+double16 __ovld __cnfn max(double16 x, double16 y);
+double2 __ovld __cnfn max(double2 x, double y);
+double3 __ovld __cnfn max(double3 x, double y);
+double4 __ovld __cnfn max(double4 x, double y);
+double8 __ovld __cnfn max(double8 x, double y);
+double16 __ovld __cnfn max(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn max(half x, half y);
+half2 __ovld __cnfn max(half2 x, half2 y);
+half3 __ovld __cnfn max(half3 x, half3 y);
+half4 __ovld __cnfn max(half4 x, half4 y);
+half8 __ovld __cnfn max(half8 x, half8 y);
+half16 __ovld __cnfn max(half16 x, half16 y);
+half2 __ovld __cnfn max(half2 x, half y);
+half3 __ovld __cnfn max(half3 x, half y);
+half4 __ovld __cnfn max(half4 x, half y);
+half8 __ovld __cnfn max(half8 x, half y);
+half16 __ovld __cnfn max(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if y < x, otherwise it returns x. If x or y
+ * are infinite or NaN, the return values are undefined.
+ */
+float __ovld __cnfn min(float x, float y);
+float2 __ovld __cnfn min(float2 x, float2 y);
+float3 __ovld __cnfn min(float3 x, float3 y);
+float4 __ovld __cnfn min(float4 x, float4 y);
+float8 __ovld __cnfn min(float8 x, float8 y);
+float16 __ovld __cnfn min(float16 x, float16 y);
+float2 __ovld __cnfn min(float2 x, float y);
+float3 __ovld __cnfn min(float3 x, float y);
+float4 __ovld __cnfn min(float4 x, float y);
+float8 __ovld __cnfn min(float8 x, float y);
+float16 __ovld __cnfn min(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn min(double x, double y);
+double2 __ovld __cnfn min(double2 x, double2 y);
+double3 __ovld __cnfn min(double3 x, double3 y);
+double4 __ovld __cnfn min(double4 x, double4 y);
+double8 __ovld __cnfn min(double8 x, double8 y);
+double16 __ovld __cnfn min(double16 x, double16 y);
+double2 __ovld __cnfn min(double2 x, double y);
+double3 __ovld __cnfn min(double3 x, double y);
+double4 __ovld __cnfn min(double4 x, double y);
+double8 __ovld __cnfn min(double8 x, double y);
+double16 __ovld __cnfn min(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn min(half x, half y);
+half2 __ovld __cnfn min(half2 x, half2 y);
+half3 __ovld __cnfn min(half3 x, half3 y);
+half4 __ovld __cnfn min(half4 x, half4 y);
+half8 __ovld __cnfn min(half8 x, half8 y);
+half16 __ovld __cnfn min(half16 x, half16 y);
+half2 __ovld __cnfn min(half2 x, half y);
+half3 __ovld __cnfn min(half3 x, half y);
+half4 __ovld __cnfn min(half4 x, half y);
+half8 __ovld __cnfn min(half8 x, half y);
+half16 __ovld __cnfn min(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the linear blend of x & y implemented as:
+ * x + (y - x) * a
+ * a must be a value in the range 0.0 ... 1.0. If a is not
+ * in the range 0.0 ... 1.0, the return values are
+ * undefined.
+ */
+float __ovld __cnfn mix(float x, float y, float a);
+float2 __ovld __cnfn mix(float2 x, float2 y, float2 a);
+float3 __ovld __cnfn mix(float3 x, float3 y, float3 a);
+float4 __ovld __cnfn mix(float4 x, float4 y, float4 a);
+float8 __ovld __cnfn mix(float8 x, float8 y, float8 a);
+float16 __ovld __cnfn mix(float16 x, float16 y, float16 a);
+float2 __ovld __cnfn mix(float2 x, float2 y, float a);
+float3 __ovld __cnfn mix(float3 x, float3 y, float a);
+float4 __ovld __cnfn mix(float4 x, float4 y, float a);
+float8 __ovld __cnfn mix(float8 x, float8 y, float a);
+float16 __ovld __cnfn mix(float16 x, float16 y, float a);
+#ifdef cl_khr_fp64
+double __ovld __cnfn mix(double x, double y, double a);
+double2 __ovld __cnfn mix(double2 x, double2 y, double2 a);
+double3 __ovld __cnfn mix(double3 x, double3 y, double3 a);
+double4 __ovld __cnfn mix(double4 x, double4 y, double4 a);
+double8 __ovld __cnfn mix(double8 x, double8 y, double8 a);
+double16 __ovld __cnfn mix(double16 x, double16 y, double16 a);
+double2 __ovld __cnfn mix(double2 x, double2 y, double a);
+double3 __ovld __cnfn mix(double3 x, double3 y, double a);
+double4 __ovld __cnfn mix(double4 x, double4 y, double a);
+double8 __ovld __cnfn mix(double8 x, double8 y, double a);
+double16 __ovld __cnfn mix(double16 x, double16 y, double a);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn mix(half x, half y, half a);
+half2 __ovld __cnfn mix(half2 x, half2 y, half2 a);
+half3 __ovld __cnfn mix(half3 x, half3 y, half3 a);
+half4 __ovld __cnfn mix(half4 x, half4 y, half4 a);
+half8 __ovld __cnfn mix(half8 x, half8 y, half8 a);
+half16 __ovld __cnfn mix(half16 x, half16 y, half16 a);
+half2 __ovld __cnfn mix(half2 x, half2 y, half a);
+half3 __ovld __cnfn mix(half3 x, half3 y, half a);
+half4 __ovld __cnfn mix(half4 x, half4 y, half a);
+half8 __ovld __cnfn mix(half8 x, half8 y, half a);
+half16 __ovld __cnfn mix(half16 x, half16 y, half a);
+#endif //cl_khr_fp16
+
+/**
+ * Converts degrees to radians, i.e. (PI / 180) *
+ * degrees.
+ */
+float __ovld __cnfn radians(float degrees);
+float2 __ovld __cnfn radians(float2 degrees);
+float3 __ovld __cnfn radians(float3 degrees);
+float4 __ovld __cnfn radians(float4 degrees);
+float8 __ovld __cnfn radians(float8 degrees);
+float16 __ovld __cnfn radians(float16 degrees);
+#ifdef cl_khr_fp64
+double __ovld __cnfn radians(double degrees);
+double2 __ovld __cnfn radians(double2 degrees);
+double3 __ovld __cnfn radians(double3 degrees);
+double4 __ovld __cnfn radians(double4 degrees);
+double8 __ovld __cnfn radians(double8 degrees);
+double16 __ovld __cnfn radians(double16 degrees);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn radians(half degrees);
+half2 __ovld __cnfn radians(half2 degrees);
+half3 __ovld __cnfn radians(half3 degrees);
+half4 __ovld __cnfn radians(half4 degrees);
+half8 __ovld __cnfn radians(half8 degrees);
+half16 __ovld __cnfn radians(half16 degrees);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 0.0 if x < edge, otherwise it returns 1.0.
+ */
+float __ovld __cnfn step(float edge, float x);
+float2 __ovld __cnfn step(float2 edge, float2 x);
+float3 __ovld __cnfn step(float3 edge, float3 x);
+float4 __ovld __cnfn step(float4 edge, float4 x);
+float8 __ovld __cnfn step(float8 edge, float8 x);
+float16 __ovld __cnfn step(float16 edge, float16 x);
+float2 __ovld __cnfn step(float edge, float2 x);
+float3 __ovld __cnfn step(float edge, float3 x);
+float4 __ovld __cnfn step(float edge, float4 x);
+float8 __ovld __cnfn step(float edge, float8 x);
+float16 __ovld __cnfn step(float edge, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn step(double edge, double x);
+double2 __ovld __cnfn step(double2 edge, double2 x);
+double3 __ovld __cnfn step(double3 edge, double3 x);
+double4 __ovld __cnfn step(double4 edge, double4 x);
+double8 __ovld __cnfn step(double8 edge, double8 x);
+double16 __ovld __cnfn step(double16 edge, double16 x);
+double2 __ovld __cnfn step(double edge, double2 x);
+double3 __ovld __cnfn step(double edge, double3 x);
+double4 __ovld __cnfn step(double edge, double4 x);
+double8 __ovld __cnfn step(double edge, double8 x);
+double16 __ovld __cnfn step(double edge, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn step(half edge, half x);
+half2 __ovld __cnfn step(half2 edge, half2 x);
+half3 __ovld __cnfn step(half3 edge, half3 x);
+half4 __ovld __cnfn step(half4 edge, half4 x);
+half8 __ovld __cnfn step(half8 edge, half8 x);
+half16 __ovld __cnfn step(half16 edge, half16 x);
+// Overloads taking a scalar edge with a vector x.
+half2 __ovld __cnfn step(half edge, half2 x);
+half3 __ovld __cnfn step(half edge, half3 x);
+half4 __ovld __cnfn step(half edge, half4 x);
+half8 __ovld __cnfn step(half edge, half8 x);
+half16 __ovld __cnfn step(half edge, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 0.0 if x <= edge0 and 1.0 if x >= edge1 and
+ * performs smooth Hermite interpolation between 0
+ * and 1 when edge0 < x < edge1. This is useful in
+ * cases where you would want a threshold function
+ * with a smooth transition.
+ * This is equivalent to:
+ * gentype t;
+ * t = clamp ((x - edge0) / (edge1 - edge0), 0, 1);
+ * return t * t * (3 - 2 * t);
+ * Results are undefined if edge0 >= edge1 or if x,
+ * edge0 or edge1 is a NaN.
+ */
+float __ovld __cnfn smoothstep(float edge0, float edge1, float x);
+float2 __ovld __cnfn smoothstep(float2 edge0, float2 edge1, float2 x);
+float3 __ovld __cnfn smoothstep(float3 edge0, float3 edge1, float3 x);
+float4 __ovld __cnfn smoothstep(float4 edge0, float4 edge1, float4 x);
+float8 __ovld __cnfn smoothstep(float8 edge0, float8 edge1, float8 x);
+float16 __ovld __cnfn smoothstep(float16 edge0, float16 edge1, float16 x);
+float2 __ovld __cnfn smoothstep(float edge0, float edge1, float2 x);
+float3 __ovld __cnfn smoothstep(float edge0, float edge1, float3 x);
+float4 __ovld __cnfn smoothstep(float edge0, float edge1, float4 x);
+float8 __ovld __cnfn smoothstep(float edge0, float edge1, float8 x);
+float16 __ovld __cnfn smoothstep(float edge0, float edge1, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn smoothstep(double edge0, double edge1, double x);
+double2 __ovld __cnfn smoothstep(double2 edge0, double2 edge1, double2 x);
+double3 __ovld __cnfn smoothstep(double3 edge0, double3 edge1, double3 x);
+double4 __ovld __cnfn smoothstep(double4 edge0, double4 edge1, double4 x);
+double8 __ovld __cnfn smoothstep(double8 edge0, double8 edge1, double8 x);
+double16 __ovld __cnfn smoothstep(double16 edge0, double16 edge1, double16 x);
+double2 __ovld __cnfn smoothstep(double edge0, double edge1, double2 x);
+double3 __ovld __cnfn smoothstep(double edge0, double edge1, double3 x);
+double4 __ovld __cnfn smoothstep(double edge0, double edge1, double4 x);
+double8 __ovld __cnfn smoothstep(double edge0, double edge1, double8 x);
+double16 __ovld __cnfn smoothstep(double edge0, double edge1, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
+half2 __ovld __cnfn smoothstep(half2 edge0, half2 edge1, half2 x);
+half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x);
+half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x);
+half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x);
+half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x);
+// Overloads taking scalar edge0 and edge1 with a vector x.
+half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x);
+half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x);
+half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x);
+half8 __ovld __cnfn smoothstep(half edge0, half edge1, half8 x);
+half16 __ovld __cnfn smoothstep(half edge0, half edge1, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 1.0 if x > 0, -0.0 if x = -0.0, +0.0 if x =
+ * +0.0, or -1.0 if x < 0. Returns 0.0 if x is a NaN.
+ */
+float __ovld __cnfn sign(float x);
+float2 __ovld __cnfn sign(float2 x);
+float3 __ovld __cnfn sign(float3 x);
+float4 __ovld __cnfn sign(float4 x);
+float8 __ovld __cnfn sign(float8 x);
+float16 __ovld __cnfn sign(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sign(double x);
+double2 __ovld __cnfn sign(double2 x);
+double3 __ovld __cnfn sign(double3 x);
+double4 __ovld __cnfn sign(double4 x);
+double8 __ovld __cnfn sign(double8 x);
+double16 __ovld __cnfn sign(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sign(half x);
+half2 __ovld __cnfn sign(half2 x);
+half3 __ovld __cnfn sign(half3 x);
+half4 __ovld __cnfn sign(half4 x);
+half8 __ovld __cnfn sign(half8 x);
+half16 __ovld __cnfn sign(half16 x);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.5, v1.2 s6.12.5, v2.0 s6.13.5 - Geometric Functions
+
+/**
+ * Returns the cross product of p0.xyz and p1.xyz. The
+ * w component of float4 result returned will be 0.0.
+ */
+float4 __ovld __cnfn cross(float4 p0, float4 p1);
+float3 __ovld __cnfn cross(float3 p0, float3 p1);
+#ifdef cl_khr_fp64
+double4 __ovld __cnfn cross(double4 p0, double4 p1);
+double3 __ovld __cnfn cross(double3 p0, double3 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half4 __ovld __cnfn cross(half4 p0, half4 p1);
+half3 __ovld __cnfn cross(half3 p0, half3 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Compute dot product.
+ */
+float __ovld __cnfn dot(float p0, float p1);
+float __ovld __cnfn dot(float2 p0, float2 p1);
+float __ovld __cnfn dot(float3 p0, float3 p1);
+float __ovld __cnfn dot(float4 p0, float4 p1);
+#ifdef cl_khr_fp64
+double __ovld __cnfn dot(double p0, double p1);
+double __ovld __cnfn dot(double2 p0, double2 p1);
+double __ovld __cnfn dot(double3 p0, double3 p1);
+double __ovld __cnfn dot(double4 p0, double4 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn dot(half p0, half p1);
+half __ovld __cnfn dot(half2 p0, half2 p1);
+half __ovld __cnfn dot(half3 p0, half3 p1);
+half __ovld __cnfn dot(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the distance between p0 and p1. This is
+ * calculated as length(p0 - p1).
+ */
+float __ovld __cnfn distance(float p0, float p1);
+float __ovld __cnfn distance(float2 p0, float2 p1);
+float __ovld __cnfn distance(float3 p0, float3 p1);
+float __ovld __cnfn distance(float4 p0, float4 p1);
+#ifdef cl_khr_fp64
+double __ovld __cnfn distance(double p0, double p1);
+double __ovld __cnfn distance(double2 p0, double2 p1);
+double __ovld __cnfn distance(double3 p0, double3 p1);
+double __ovld __cnfn distance(double4 p0, double4 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn distance(half p0, half p1);
+half __ovld __cnfn distance(half2 p0, half2 p1);
+half __ovld __cnfn distance(half3 p0, half3 p1);
+half __ovld __cnfn distance(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Return the length of vector p, i.e.,
+ * sqrt(p.x^2 + p.y^2 + ...)
+ */
+float __ovld __cnfn length(float p);
+float __ovld __cnfn length(float2 p);
+float __ovld __cnfn length(float3 p);
+float __ovld __cnfn length(float4 p);
+#ifdef cl_khr_fp64
+double __ovld __cnfn length(double p);
+double __ovld __cnfn length(double2 p);
+double __ovld __cnfn length(double3 p);
+double __ovld __cnfn length(double4 p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn length(half p);
+half __ovld __cnfn length(half2 p);
+half __ovld __cnfn length(half3 p);
+half __ovld __cnfn length(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns a vector in the same direction as p but with a
+ * length of 1.
+ */
+float __ovld __cnfn normalize(float p);
+float2 __ovld __cnfn normalize(float2 p);
+float3 __ovld __cnfn normalize(float3 p);
+float4 __ovld __cnfn normalize(float4 p);
+#ifdef cl_khr_fp64
+double __ovld __cnfn normalize(double p);
+double2 __ovld __cnfn normalize(double2 p);
+double3 __ovld __cnfn normalize(double3 p);
+double4 __ovld __cnfn normalize(double4 p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn normalize(half p);
+half2 __ovld __cnfn normalize(half2 p);
+half3 __ovld __cnfn normalize(half3 p);
+half4 __ovld __cnfn normalize(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns fast_length(p0 - p1).
+ */
+float __ovld __cnfn fast_distance(float p0, float p1);
+float __ovld __cnfn fast_distance(float2 p0, float2 p1);
+float __ovld __cnfn fast_distance(float3 p0, float3 p1);
+float __ovld __cnfn fast_distance(float4 p0, float4 p1);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_distance(half p0, half p1);
+half __ovld __cnfn fast_distance(half2 p0, half2 p1);
+half __ovld __cnfn fast_distance(half3 p0, half3 p1);
+half __ovld __cnfn fast_distance(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the length of vector p computed as:
+ * half_sqrt(p.x^2 + p.y^2 + ...)
+ */
+float __ovld __cnfn fast_length(float p);
+float __ovld __cnfn fast_length(float2 p);
+float __ovld __cnfn fast_length(float3 p);
+float __ovld __cnfn fast_length(float4 p);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_length(half p);
+half __ovld __cnfn fast_length(half2 p);
+half __ovld __cnfn fast_length(half3 p);
+half __ovld __cnfn fast_length(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns a vector in the same direction as p but with a
+ * length of 1. fast_normalize is computed as:
+ * p * half_rsqrt (p.x^2 + p.y^2 + ... )
+ * The result shall be within 8192 ulps error from the
+ * infinitely precise result of
+ * if (all(p == 0.0f))
+ * result = p;
+ * else
+ * result = p / sqrt (p.x^2 + p.y^2 + ...);
+ * with the following exceptions:
+ * 1) If the sum of squares is greater than FLT_MAX
+ * then the value of the floating-point values in the
+ * result vector are undefined.
+ * 2) If the sum of squares is less than FLT_MIN then
+ * the implementation may return back p.
+ * 3) If the device is in "denorms are flushed to zero"
+ * mode, individual operand elements with magnitude
+ * less than sqrt(FLT_MIN) may be flushed to zero
+ * before proceeding with the calculation.
+ */
+float __ovld __cnfn fast_normalize(float p);
+float2 __ovld __cnfn fast_normalize(float2 p);
+float3 __ovld __cnfn fast_normalize(float3 p);
+float4 __ovld __cnfn fast_normalize(float4 p);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_normalize(half p);
+half2 __ovld __cnfn fast_normalize(half2 p);
+half3 __ovld __cnfn fast_normalize(half3 p);
+half4 __ovld __cnfn fast_normalize(half4 p);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.6, v1.2 s6.12.6, v2.0 s6.13.6 - Relational Functions
+
+/**
+ * intn isequal (floatn x, floatn y)
+ * Returns the component-wise compare of x == y.
+ */
+int __ovld __cnfn isequal(float x, float y);
+int2 __ovld __cnfn isequal(float2 x, float2 y);
+int3 __ovld __cnfn isequal(float3 x, float3 y);
+int4 __ovld __cnfn isequal(float4 x, float4 y);
+int8 __ovld __cnfn isequal(float8 x, float8 y);
+int16 __ovld __cnfn isequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isequal(double x, double y);
+long2 __ovld __cnfn isequal(double2 x, double2 y);
+long3 __ovld __cnfn isequal(double3 x, double3 y);
+long4 __ovld __cnfn isequal(double4 x, double4 y);
+long8 __ovld __cnfn isequal(double8 x, double8 y);
+long16 __ovld __cnfn isequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isequal(half x, half y);
+short2 __ovld __cnfn isequal(half2 x, half2 y);
+short3 __ovld __cnfn isequal(half3 x, half3 y);
+short4 __ovld __cnfn isequal(half4 x, half4 y);
+short8 __ovld __cnfn isequal(half8 x, half8 y);
+short16 __ovld __cnfn isequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x != y.
+ */
+int __ovld __cnfn isnotequal(float x, float y);
+int2 __ovld __cnfn isnotequal(float2 x, float2 y);
+int3 __ovld __cnfn isnotequal(float3 x, float3 y);
+int4 __ovld __cnfn isnotequal(float4 x, float4 y);
+int8 __ovld __cnfn isnotequal(float8 x, float8 y);
+int16 __ovld __cnfn isnotequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnotequal(double x, double y);
+long2 __ovld __cnfn isnotequal(double2 x, double2 y);
+long3 __ovld __cnfn isnotequal(double3 x, double3 y);
+long4 __ovld __cnfn isnotequal(double4 x, double4 y);
+long8 __ovld __cnfn isnotequal(double8 x, double8 y);
+long16 __ovld __cnfn isnotequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnotequal(half x, half y);
+short2 __ovld __cnfn isnotequal(half2 x, half2 y);
+short3 __ovld __cnfn isnotequal(half3 x, half3 y);
+short4 __ovld __cnfn isnotequal(half4 x, half4 y);
+short8 __ovld __cnfn isnotequal(half8 x, half8 y);
+short16 __ovld __cnfn isnotequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x > y.
+ */
+int __ovld __cnfn isgreater(float x, float y);
+int2 __ovld __cnfn isgreater(float2 x, float2 y);
+int3 __ovld __cnfn isgreater(float3 x, float3 y);
+int4 __ovld __cnfn isgreater(float4 x, float4 y);
+int8 __ovld __cnfn isgreater(float8 x, float8 y);
+int16 __ovld __cnfn isgreater(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isgreater(double x, double y);
+long2 __ovld __cnfn isgreater(double2 x, double2 y);
+long3 __ovld __cnfn isgreater(double3 x, double3 y);
+long4 __ovld __cnfn isgreater(double4 x, double4 y);
+long8 __ovld __cnfn isgreater(double8 x, double8 y);
+long16 __ovld __cnfn isgreater(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isgreater(half x, half y);
+short2 __ovld __cnfn isgreater(half2 x, half2 y);
+short3 __ovld __cnfn isgreater(half3 x, half3 y);
+short4 __ovld __cnfn isgreater(half4 x, half4 y);
+short8 __ovld __cnfn isgreater(half8 x, half8 y);
+short16 __ovld __cnfn isgreater(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x >= y.
+ */
+int __ovld __cnfn isgreaterequal(float x, float y);
+int2 __ovld __cnfn isgreaterequal(float2 x, float2 y);
+int3 __ovld __cnfn isgreaterequal(float3 x, float3 y);
+int4 __ovld __cnfn isgreaterequal(float4 x, float4 y);
+int8 __ovld __cnfn isgreaterequal(float8 x, float8 y);
+int16 __ovld __cnfn isgreaterequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isgreaterequal(double x, double y);
+long2 __ovld __cnfn isgreaterequal(double2 x, double2 y);
+long3 __ovld __cnfn isgreaterequal(double3 x, double3 y);
+long4 __ovld __cnfn isgreaterequal(double4 x, double4 y);
+long8 __ovld __cnfn isgreaterequal(double8 x, double8 y);
+long16 __ovld __cnfn isgreaterequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isgreaterequal(half x, half y);
+short2 __ovld __cnfn isgreaterequal(half2 x, half2 y);
+short3 __ovld __cnfn isgreaterequal(half3 x, half3 y);
+short4 __ovld __cnfn isgreaterequal(half4 x, half4 y);
+short8 __ovld __cnfn isgreaterequal(half8 x, half8 y);
+short16 __ovld __cnfn isgreaterequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x < y.
+ */
+int __ovld __cnfn isless(float x, float y);
+int2 __ovld __cnfn isless(float2 x, float2 y);
+int3 __ovld __cnfn isless(float3 x, float3 y);
+int4 __ovld __cnfn isless(float4 x, float4 y);
+int8 __ovld __cnfn isless(float8 x, float8 y);
+int16 __ovld __cnfn isless(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isless(double x, double y);
+long2 __ovld __cnfn isless(double2 x, double2 y);
+long3 __ovld __cnfn isless(double3 x, double3 y);
+long4 __ovld __cnfn isless(double4 x, double4 y);
+long8 __ovld __cnfn isless(double8 x, double8 y);
+long16 __ovld __cnfn isless(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isless(half x, half y);
+short2 __ovld __cnfn isless(half2 x, half2 y);
+short3 __ovld __cnfn isless(half3 x, half3 y);
+short4 __ovld __cnfn isless(half4 x, half4 y);
+short8 __ovld __cnfn isless(half8 x, half8 y);
+short16 __ovld __cnfn isless(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x <= y.
+ */
+int __ovld __cnfn islessequal(float x, float y);
+int2 __ovld __cnfn islessequal(float2 x, float2 y);
+int3 __ovld __cnfn islessequal(float3 x, float3 y);
+int4 __ovld __cnfn islessequal(float4 x, float4 y);
+int8 __ovld __cnfn islessequal(float8 x, float8 y);
+int16 __ovld __cnfn islessequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn islessequal(double x, double y);
+long2 __ovld __cnfn islessequal(double2 x, double2 y);
+long3 __ovld __cnfn islessequal(double3 x, double3 y);
+long4 __ovld __cnfn islessequal(double4 x, double4 y);
+long8 __ovld __cnfn islessequal(double8 x, double8 y);
+long16 __ovld __cnfn islessequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn islessequal(half x, half y);
+short2 __ovld __cnfn islessequal(half2 x, half2 y);
+short3 __ovld __cnfn islessequal(half3 x, half3 y);
+short4 __ovld __cnfn islessequal(half4 x, half4 y);
+short8 __ovld __cnfn islessequal(half8 x, half8 y);
+short16 __ovld __cnfn islessequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of
+ * (x < y) || (x > y) .
+ */
+int __ovld __cnfn islessgreater(float x, float y);
+int2 __ovld __cnfn islessgreater(float2 x, float2 y);
+int3 __ovld __cnfn islessgreater(float3 x, float3 y);
+int4 __ovld __cnfn islessgreater(float4 x, float4 y);
+int8 __ovld __cnfn islessgreater(float8 x, float8 y);
+int16 __ovld __cnfn islessgreater(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn islessgreater(double x, double y);
+long2 __ovld __cnfn islessgreater(double2 x, double2 y);
+long3 __ovld __cnfn islessgreater(double3 x, double3 y);
+long4 __ovld __cnfn islessgreater(double4 x, double4 y);
+long8 __ovld __cnfn islessgreater(double8 x, double8 y);
+long16 __ovld __cnfn islessgreater(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn islessgreater(half x, half y);
+short2 __ovld __cnfn islessgreater(half2 x, half2 y);
+short3 __ovld __cnfn islessgreater(half3 x, half3 y);
+short4 __ovld __cnfn islessgreater(half4 x, half4 y);
+short8 __ovld __cnfn islessgreater(half8 x, half8 y);
+short16 __ovld __cnfn islessgreater(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test for finite value.
+ */
+int __ovld __cnfn isfinite(float);
+int2 __ovld __cnfn isfinite(float2);
+int3 __ovld __cnfn isfinite(float3);
+int4 __ovld __cnfn isfinite(float4);
+int8 __ovld __cnfn isfinite(float8);
+int16 __ovld __cnfn isfinite(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isfinite(double);
+long2 __ovld __cnfn isfinite(double2);
+long3 __ovld __cnfn isfinite(double3);
+long4 __ovld __cnfn isfinite(double4);
+long8 __ovld __cnfn isfinite(double8);
+long16 __ovld __cnfn isfinite(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isfinite(half);
+short2 __ovld __cnfn isfinite(half2);
+short3 __ovld __cnfn isfinite(half3);
+short4 __ovld __cnfn isfinite(half4);
+short8 __ovld __cnfn isfinite(half8);
+short16 __ovld __cnfn isfinite(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for an infinite value (positive or negative).
+ */
+int __ovld __cnfn isinf(float);
+int2 __ovld __cnfn isinf(float2);
+int3 __ovld __cnfn isinf(float3);
+int4 __ovld __cnfn isinf(float4);
+int8 __ovld __cnfn isinf(float8);
+int16 __ovld __cnfn isinf(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isinf(double);
+long2 __ovld __cnfn isinf(double2);
+long3 __ovld __cnfn isinf(double3);
+long4 __ovld __cnfn isinf(double4);
+long8 __ovld __cnfn isinf(double8);
+long16 __ovld __cnfn isinf(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isinf(half);
+short2 __ovld __cnfn isinf(half2);
+short3 __ovld __cnfn isinf(half3);
+short4 __ovld __cnfn isinf(half4);
+short8 __ovld __cnfn isinf(half8);
+short16 __ovld __cnfn isinf(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for a NaN.
+ */
+int __ovld __cnfn isnan(float);
+int2 __ovld __cnfn isnan(float2);
+int3 __ovld __cnfn isnan(float3);
+int4 __ovld __cnfn isnan(float4);
+int8 __ovld __cnfn isnan(float8);
+int16 __ovld __cnfn isnan(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnan(double);
+long2 __ovld __cnfn isnan(double2);
+long3 __ovld __cnfn isnan(double3);
+long4 __ovld __cnfn isnan(double4);
+long8 __ovld __cnfn isnan(double8);
+long16 __ovld __cnfn isnan(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnan(half);
+short2 __ovld __cnfn isnan(half2);
+short3 __ovld __cnfn isnan(half3);
+short4 __ovld __cnfn isnan(half4);
+short8 __ovld __cnfn isnan(half8);
+short16 __ovld __cnfn isnan(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for a normal value.
+ */
+int __ovld __cnfn isnormal(float);
+int2 __ovld __cnfn isnormal(float2);
+int3 __ovld __cnfn isnormal(float3);
+int4 __ovld __cnfn isnormal(float4);
+int8 __ovld __cnfn isnormal(float8);
+int16 __ovld __cnfn isnormal(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnormal(double);
+long2 __ovld __cnfn isnormal(double2);
+long3 __ovld __cnfn isnormal(double3);
+long4 __ovld __cnfn isnormal(double4);
+long8 __ovld __cnfn isnormal(double8);
+long16 __ovld __cnfn isnormal(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnormal(half);
+short2 __ovld __cnfn isnormal(half2);
+short3 __ovld __cnfn isnormal(half3);
+short4 __ovld __cnfn isnormal(half4);
+short8 __ovld __cnfn isnormal(half8);
+short16 __ovld __cnfn isnormal(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test if arguments are ordered. isordered() takes
+ * arguments x and y, and returns the result
+ * isequal(x, x) && isequal(y, y).
+ */
+int __ovld __cnfn isordered(float x, float y);
+int2 __ovld __cnfn isordered(float2 x, float2 y);
+int3 __ovld __cnfn isordered(float3 x, float3 y);
+int4 __ovld __cnfn isordered(float4 x, float4 y);
+int8 __ovld __cnfn isordered(float8 x, float8 y);
+int16 __ovld __cnfn isordered(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isordered(double x, double y);
+long2 __ovld __cnfn isordered(double2 x, double2 y);
+long3 __ovld __cnfn isordered(double3 x, double3 y);
+long4 __ovld __cnfn isordered(double4 x, double4 y);
+long8 __ovld __cnfn isordered(double8 x, double8 y);
+long16 __ovld __cnfn isordered(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isordered(half x, half y);
+short2 __ovld __cnfn isordered(half2 x, half2 y);
+short3 __ovld __cnfn isordered(half3 x, half3 y);
+short4 __ovld __cnfn isordered(half4 x, half4 y);
+short8 __ovld __cnfn isordered(half8 x, half8 y);
+short16 __ovld __cnfn isordered(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test if arguments are unordered. isunordered()
+ * takes arguments x and y, returning non-zero if x or y
+ * is NaN, and zero otherwise.
+ */
+int __ovld __cnfn isunordered(float x, float y);
+int2 __ovld __cnfn isunordered(float2 x, float2 y);
+int3 __ovld __cnfn isunordered(float3 x, float3 y);
+int4 __ovld __cnfn isunordered(float4 x, float4 y);
+int8 __ovld __cnfn isunordered(float8 x, float8 y);
+int16 __ovld __cnfn isunordered(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isunordered(double x, double y);
+long2 __ovld __cnfn isunordered(double2 x, double2 y);
+long3 __ovld __cnfn isunordered(double3 x, double3 y);
+long4 __ovld __cnfn isunordered(double4 x, double4 y);
+long8 __ovld __cnfn isunordered(double8 x, double8 y);
+long16 __ovld __cnfn isunordered(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isunordered(half x, half y);
+short2 __ovld __cnfn isunordered(half2 x, half2 y);
+short3 __ovld __cnfn isunordered(half3 x, half3 y);
+short4 __ovld __cnfn isunordered(half4 x, half4 y);
+short8 __ovld __cnfn isunordered(half8 x, half8 y);
+short16 __ovld __cnfn isunordered(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test for sign bit. The scalar version of the function
+ * returns a 1 if the sign bit in the float is set else returns
+ * 0. The vector version of the function returns the
+ * following for each component in floatn: a -1 if the
+ * sign bit in the float is set else returns 0.
+ */
+int __ovld __cnfn signbit(float);
+int2 __ovld __cnfn signbit(float2);
+int3 __ovld __cnfn signbit(float3);
+int4 __ovld __cnfn signbit(float4);
+int8 __ovld __cnfn signbit(float8);
+int16 __ovld __cnfn signbit(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn signbit(double);
+long2 __ovld __cnfn signbit(double2);
+long3 __ovld __cnfn signbit(double3);
+long4 __ovld __cnfn signbit(double4);
+long8 __ovld __cnfn signbit(double8);
+long16 __ovld __cnfn signbit(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn signbit(half);
+short2 __ovld __cnfn signbit(half2);
+short3 __ovld __cnfn signbit(half3);
+short4 __ovld __cnfn signbit(half4);
+short8 __ovld __cnfn signbit(half8);
+short16 __ovld __cnfn signbit(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 1 if the most significant bit in any component
+ * of x is set; otherwise returns 0.
+ */
+int __ovld __cnfn any(char x);
+int __ovld __cnfn any(char2 x);
+int __ovld __cnfn any(char3 x);
+int __ovld __cnfn any(char4 x);
+int __ovld __cnfn any(char8 x);
+int __ovld __cnfn any(char16 x);
+int __ovld __cnfn any(short x);
+int __ovld __cnfn any(short2 x);
+int __ovld __cnfn any(short3 x);
+int __ovld __cnfn any(short4 x);
+int __ovld __cnfn any(short8 x);
+int __ovld __cnfn any(short16 x);
+int __ovld __cnfn any(int x);
+int __ovld __cnfn any(int2 x);
+int __ovld __cnfn any(int3 x);
+int __ovld __cnfn any(int4 x);
+int __ovld __cnfn any(int8 x);
+int __ovld __cnfn any(int16 x);
+int __ovld __cnfn any(long x);
+int __ovld __cnfn any(long2 x);
+int __ovld __cnfn any(long3 x);
+int __ovld __cnfn any(long4 x);
+int __ovld __cnfn any(long8 x);
+int __ovld __cnfn any(long16 x);
+
+/**
+ * Returns 1 if the most significant bit in all components
+ * of x is set; otherwise returns 0.
+ */
+int __ovld __cnfn all(char x);
+int __ovld __cnfn all(char2 x);
+int __ovld __cnfn all(char3 x);
+int __ovld __cnfn all(char4 x);
+int __ovld __cnfn all(char8 x);
+int __ovld __cnfn all(char16 x);
+int __ovld __cnfn all(short x);
+int __ovld __cnfn all(short2 x);
+int __ovld __cnfn all(short3 x);
+int __ovld __cnfn all(short4 x);
+int __ovld __cnfn all(short8 x);
+int __ovld __cnfn all(short16 x);
+int __ovld __cnfn all(int x);
+int __ovld __cnfn all(int2 x);
+int __ovld __cnfn all(int3 x);
+int __ovld __cnfn all(int4 x);
+int __ovld __cnfn all(int8 x);
+int __ovld __cnfn all(int16 x);
+int __ovld __cnfn all(long x);
+int __ovld __cnfn all(long2 x);
+int __ovld __cnfn all(long3 x);
+int __ovld __cnfn all(long4 x);
+int __ovld __cnfn all(long8 x);
+int __ovld __cnfn all(long16 x);
+
+/**
+ * Each bit of the result is the corresponding bit of a if
+ * the corresponding bit of c is 0. Otherwise it is the
+ * corresponding bit of b.
+ */
+char __ovld __cnfn bitselect(char a, char b, char c);
+uchar __ovld __cnfn bitselect(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn bitselect(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn bitselect(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn bitselect(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn bitselect(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn bitselect(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn bitselect(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn bitselect(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn bitselect(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn bitselect(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn bitselect(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn bitselect(short a, short b, short c);
+ushort __ovld __cnfn bitselect(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn bitselect(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn bitselect(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn bitselect(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn bitselect(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn bitselect(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn bitselect(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn bitselect(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn bitselect(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn bitselect(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn bitselect(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn bitselect(int a, int b, int c);
+uint __ovld __cnfn bitselect(uint a, uint b, uint c);
+int2 __ovld __cnfn bitselect(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn bitselect(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn bitselect(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn bitselect(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn bitselect(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn bitselect(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn bitselect(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn bitselect(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn bitselect(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn bitselect(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn bitselect(long a, long b, long c);
+ulong __ovld __cnfn bitselect(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn bitselect(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn bitselect(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn bitselect(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn bitselect(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn bitselect(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn bitselect(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn bitselect(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn bitselect(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn bitselect(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn bitselect(ulong16 a, ulong16 b, ulong16 c);
+float __ovld __cnfn bitselect(float a, float b, float c);
+float2 __ovld __cnfn bitselect(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn bitselect(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn bitselect(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn bitselect(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn bitselect(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn bitselect(double a, double b, double c);
+double2 __ovld __cnfn bitselect(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn bitselect(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn bitselect(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn bitselect(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn bitselect(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn bitselect(half a, half b, half c);
+half2 __ovld __cnfn bitselect(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn bitselect(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn bitselect(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn bitselect(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
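+ * For each vector component, result[i] = (MSB of c[i] is set) ? b[i] : a[i].
+ * For a scalar type, result = c ? b : a.
+ * NOTE(review): the spec ties c's element width to a/b; mixed-width overloads below (e.g. short a, char c) need confirming.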
+ */
+char __ovld __cnfn select(char a, char b, char c);
+uchar __ovld __cnfn select(uchar a, uchar b, char c);
+char2 __ovld __cnfn select(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, char2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, char3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, char4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c);
+short __ovld __cnfn select(short a, short b, char c);
+ushort __ovld __cnfn select(ushort a, ushort b, char c);
+short2 __ovld __cnfn select(short2 a, short2 b, char2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, char3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, char4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, char8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, char16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c);
+int __ovld __cnfn select(int a, int b, char c);
+uint __ovld __cnfn select(uint a, uint b, char c);
+int2 __ovld __cnfn select(int2 a, int2 b, char2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, char3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, char4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, char8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, char16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c);
+long __ovld __cnfn select(long a, long b, char c);
+ulong __ovld __cnfn select(ulong a, ulong b, char c);
+long2 __ovld __cnfn select(long2 a, long2 b, char2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, char3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, char4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, char8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, char16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c);
+float __ovld __cnfn select(float a, float b, char c);
+float2 __ovld __cnfn select(float2 a, float2 b, char2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, char3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, char4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, char8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, char16 c);
+char __ovld __cnfn select(char a, char b, short c);
+uchar __ovld __cnfn select(uchar a, uchar b, short c);
+char2 __ovld __cnfn select(char2 a, char2 b, short2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, short3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, short4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, short8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, short16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c);
+short __ovld __cnfn select(short a, short b, short c);
+ushort __ovld __cnfn select(ushort a, ushort b, short c);
+short2 __ovld __cnfn select(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, short2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, short3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, short4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c);
+int __ovld __cnfn select(int a, int b, short c);
+uint __ovld __cnfn select(uint a, uint b, short c);
+int2 __ovld __cnfn select(int2 a, int2 b, short2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, short3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, short4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, short8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, short16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c);
+long __ovld __cnfn select(long a, long b, short c);
+ulong __ovld __cnfn select(ulong a, ulong b, short c);
+long2 __ovld __cnfn select(long2 a, long2 b, short2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, short3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, short4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, short8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, short16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c);
+float __ovld __cnfn select(float a, float b, short c);
+float2 __ovld __cnfn select(float2 a, float2 b, short2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, short3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, short4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, short8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, short16 c);
+char __ovld __cnfn select(char a, char b, int c);
+uchar __ovld __cnfn select(uchar a, uchar b, int c);
+char2 __ovld __cnfn select(char2 a, char2 b, int2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, int3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, int4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, int8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, int8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, int16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c);
+short __ovld __cnfn select(short a, short b, int c);
+ushort __ovld __cnfn select(ushort a, ushort b, int c);
+short2 __ovld __cnfn select(short2 a, short2 b, int2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, int3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, int4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, int8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, int16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c);
+int __ovld __cnfn select(int a, int b, int c);
+uint __ovld __cnfn select(uint a, uint b, int c);
+int2 __ovld __cnfn select(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, int2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, int3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, int4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c);
+long __ovld __cnfn select(long a, long b, int c);
+ulong __ovld __cnfn select(ulong a, ulong b, int c);
+long2 __ovld __cnfn select(long2 a, long2 b, int2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, int3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, int4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, int8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, int16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c);
+float __ovld __cnfn select(float a, float b, int c);
+float2 __ovld __cnfn select(float2 a, float2 b, int2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, int3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, int4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, int8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, int16 c);
+char __ovld __cnfn select(char a, char b, long c);
+uchar __ovld __cnfn select(uchar a, uchar b, long c);
+char2 __ovld __cnfn select(char2 a, char2 b, long2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, long3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, long4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, long8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, long16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c);
+short __ovld __cnfn select(short a, short b, long c);
+ushort __ovld __cnfn select(ushort a, ushort b, long c);
+short2 __ovld __cnfn select(short2 a, short2 b, long2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, long3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, long4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, long8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, long16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c);
+int __ovld __cnfn select(int a, int b, long c);
+uint __ovld __cnfn select(uint a, uint b, long c);
+int2 __ovld __cnfn select(int2 a, int2 b, long2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, long3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, long4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, long8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, long16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c);
+long __ovld __cnfn select(long a, long b, long c);
+ulong __ovld __cnfn select(ulong a, ulong b, long c);
+long2 __ovld __cnfn select(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, long2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, long3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, long4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c);
+float __ovld __cnfn select(float a, float b, long c);
+float2 __ovld __cnfn select(float2 a, float2 b, long2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, long3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, long4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, long8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, long16 c);
+char __ovld __cnfn select(char a, char b, uchar c);
+uchar __ovld __cnfn select(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, uchar3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, uchar4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn select(short a, short b, uchar c);
+ushort __ovld __cnfn select(ushort a, ushort b, uchar c);
+short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c);
+int __ovld __cnfn select(int a, int b, uchar c);
+uint __ovld __cnfn select(uint a, uint b, uchar c);
+int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c);
+long __ovld __cnfn select(long a, long b, uchar c);
+ulong __ovld __cnfn select(ulong a, ulong b, uchar c);
+long2 __ovld __cnfn select(long2 a, long2 b, uchar2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
+float __ovld __cnfn select(float a, float b, uchar c);
+float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
+char __ovld __cnfn select(char a, char b, ushort c);
+uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
+char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+short __ovld __cnfn select(short a, short b, ushort c);
+ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, ushort3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, ushort4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn select(int a, int b, ushort c);
+uint __ovld __cnfn select(uint a, uint b, ushort c);
+int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
+long __ovld __cnfn select(long a, long b, ushort c);
+ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
+long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
+float __ovld __cnfn select(float a, float b, ushort c);
+float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
+char __ovld __cnfn select(char a, char b, uint c);
+uchar __ovld __cnfn select(uchar a, uchar b, uint c);
+char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
+short __ovld __cnfn select(short a, short b, uint c);
+ushort __ovld __cnfn select(ushort a, ushort b, uint c);
+short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+int __ovld __cnfn select(int a, int b, uint c);
+uint __ovld __cnfn select(uint a, uint b, uint c);
+int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, uint3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, uint4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn select(long a, long b, uint c);
+ulong __ovld __cnfn select(ulong a, ulong b, uint c);
+long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
+float __ovld __cnfn select(float a, float b, uint c);
+float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
+char __ovld __cnfn select(char a, char b, ulong c);
+uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
+char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
+short __ovld __cnfn select(short a, short b, ulong c);
+ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
+short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
+int __ovld __cnfn select(int a, int b, ulong c);
+uint __ovld __cnfn select(uint a, uint b, ulong c);
+int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+long __ovld __cnfn select(long a, long b, ulong c);
+ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, ulong3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, ulong4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
+float __ovld __cnfn select(float a, float b, ulong c);
+float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn select(double a, double b, long c);
+double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
+double3 __ovld __cnfn select(double3 a, double3 b, long3 c);
+double4 __ovld __cnfn select(double4 a, double4 b, long4 c);
+double8 __ovld __cnfn select(double8 a, double8 b, long8 c);
+double16 __ovld __cnfn select(double16 a, double16 b, long16 c);
+double __ovld __cnfn select(double a, double b, ulong c);
+double2 __ovld __cnfn select(double2 a, double2 b, ulong2 c);
+double3 __ovld __cnfn select(double3 a, double3 b, ulong3 c);
+double4 __ovld __cnfn select(double4 a, double4 b, ulong4 c);
+double8 __ovld __cnfn select(double8 a, double8 b, ulong8 c);
+double16 __ovld __cnfn select(double16 a, double16 b, ulong16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn select(half a, half b, short c);
+half2 __ovld __cnfn select(half2 a, half2 b, short2 c);
+half3 __ovld __cnfn select(half3 a, half3 b, short3 c);
+half4 __ovld __cnfn select(half4 a, half4 b, short4 c);
+half8 __ovld __cnfn select(half8 a, half8 b, short8 c);
+half16 __ovld __cnfn select(half16 a, half16 b, short16 c);
+half __ovld __cnfn select(half a, half b, ushort c);
+half2 __ovld __cnfn select(half2 a, half2 b, ushort2 c);
+half3 __ovld __cnfn select(half3 a, half3 b, ushort3 c);
+half4 __ovld __cnfn select(half4 a, half4 b, ushort4 c);
+half8 __ovld __cnfn select(half8 a, half8 b, ushort8 c);
+half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.7, v1.2 s6.12.7, v2.0 s6.13.7 - Vector Data Load and Store Functions
+// OpenCL extensions v1.1 s9.6.6, v1.2 s9.5.6, v2.0 s9.4.6 - Vector Data Load and Store Functions for Half Type
+/**
+ * Use generic type gentype to indicate the built-in data types
+ * char, uchar, short, ushort, int, uint, long, ulong, float,
+ * double or half.
+ *
+ * vloadn returns sizeof (gentypen) bytes of data read from address (p + (offset * n)).
+ *
+ * vstoren writes sizeof (gentypen) bytes given by data to address (p + (offset * n)).
+ *
+ * The address computed as (p + (offset * n)) must be
+ * 8-bit aligned if gentype is char, uchar;
+ * 16-bit aligned if gentype is short, ushort, half;
+ * 32-bit aligned if gentype is int, uint, float;
+ * 64-bit aligned if gentype is long, ulong, double.
+ */
+
+char2 __ovld vload2(size_t offset, const __constant char *p); // __constant overloads start here
+uchar2 __ovld vload2(size_t offset, const __constant uchar *p);
+short2 __ovld vload2(size_t offset, const __constant short *p);
+ushort2 __ovld vload2(size_t offset, const __constant ushort *p);
+int2 __ovld vload2(size_t offset, const __constant int *p);
+uint2 __ovld vload2(size_t offset, const __constant uint *p);
+long2 __ovld vload2(size_t offset, const __constant long *p);
+ulong2 __ovld vload2(size_t offset, const __constant ulong *p);
+float2 __ovld vload2(size_t offset, const __constant float *p);
+char3 __ovld vload3(size_t offset, const __constant char *p);
+uchar3 __ovld vload3(size_t offset, const __constant uchar *p);
+short3 __ovld vload3(size_t offset, const __constant short *p);
+ushort3 __ovld vload3(size_t offset, const __constant ushort *p);
+int3 __ovld vload3(size_t offset, const __constant int *p);
+uint3 __ovld vload3(size_t offset, const __constant uint *p);
+long3 __ovld vload3(size_t offset, const __constant long *p);
+ulong3 __ovld vload3(size_t offset, const __constant ulong *p);
+float3 __ovld vload3(size_t offset, const __constant float *p);
+char4 __ovld vload4(size_t offset, const __constant char *p);
+uchar4 __ovld vload4(size_t offset, const __constant uchar *p);
+short4 __ovld vload4(size_t offset, const __constant short *p);
+ushort4 __ovld vload4(size_t offset, const __constant ushort *p);
+int4 __ovld vload4(size_t offset, const __constant int *p);
+uint4 __ovld vload4(size_t offset, const __constant uint *p);
+long4 __ovld vload4(size_t offset, const __constant long *p);
+ulong4 __ovld vload4(size_t offset, const __constant ulong *p);
+float4 __ovld vload4(size_t offset, const __constant float *p);
+char8 __ovld vload8(size_t offset, const __constant char *p);
+uchar8 __ovld vload8(size_t offset, const __constant uchar *p);
+short8 __ovld vload8(size_t offset, const __constant short *p);
+ushort8 __ovld vload8(size_t offset, const __constant ushort *p);
+int8 __ovld vload8(size_t offset, const __constant int *p);
+uint8 __ovld vload8(size_t offset, const __constant uint *p);
+long8 __ovld vload8(size_t offset, const __constant long *p);
+ulong8 __ovld vload8(size_t offset, const __constant ulong *p);
+float8 __ovld vload8(size_t offset, const __constant float *p);
+char16 __ovld vload16(size_t offset, const __constant char *p);
+uchar16 __ovld vload16(size_t offset, const __constant uchar *p);
+short16 __ovld vload16(size_t offset, const __constant short *p);
+ushort16 __ovld vload16(size_t offset, const __constant ushort *p);
+int16 __ovld vload16(size_t offset, const __constant int *p);
+uint16 __ovld vload16(size_t offset, const __constant uint *p);
+long16 __ovld vload16(size_t offset, const __constant long *p);
+ulong16 __ovld vload16(size_t offset, const __constant ulong *p);
+float16 __ovld vload16(size_t offset, const __constant float *p);
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const __constant double *p);
+double3 __ovld vload3(size_t offset, const __constant double *p);
+double4 __ovld vload4(size_t offset, const __constant double *p);
+double8 __ovld vload8(size_t offset, const __constant double *p);
+double16 __ovld vload16(size_t offset, const __constant double *p);
+#endif // cl_khr_fp64 (__constant double overloads)
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const __constant half *p); // NOTE(review): scalar form with no n suffix, unlike the other overloads here - confirm intended
+half2 __ovld vload2(size_t offset, const __constant half *p);
+half3 __ovld vload3(size_t offset, const __constant half *p);
+half4 __ovld vload4(size_t offset, const __constant half *p);
+half8 __ovld vload8(size_t offset, const __constant half *p);
+half16 __ovld vload16(size_t offset, const __constant half *p);
+#endif // cl_khr_fp16 (__constant half overloads)
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 // 2.0 and later: one overload per type, pointer has no address space qualifier
+char2 __ovld vload2(size_t offset, const char *p);
+uchar2 __ovld vload2(size_t offset, const uchar *p);
+short2 __ovld vload2(size_t offset, const short *p);
+ushort2 __ovld vload2(size_t offset, const ushort *p);
+int2 __ovld vload2(size_t offset, const int *p);
+uint2 __ovld vload2(size_t offset, const uint *p);
+long2 __ovld vload2(size_t offset, const long *p);
+ulong2 __ovld vload2(size_t offset, const ulong *p);
+float2 __ovld vload2(size_t offset, const float *p);
+char3 __ovld vload3(size_t offset, const char *p);
+uchar3 __ovld vload3(size_t offset, const uchar *p);
+short3 __ovld vload3(size_t offset, const short *p);
+ushort3 __ovld vload3(size_t offset, const ushort *p);
+int3 __ovld vload3(size_t offset, const int *p);
+uint3 __ovld vload3(size_t offset, const uint *p);
+long3 __ovld vload3(size_t offset, const long *p);
+ulong3 __ovld vload3(size_t offset, const ulong *p);
+float3 __ovld vload3(size_t offset, const float *p);
+char4 __ovld vload4(size_t offset, const char *p);
+uchar4 __ovld vload4(size_t offset, const uchar *p);
+short4 __ovld vload4(size_t offset, const short *p);
+ushort4 __ovld vload4(size_t offset, const ushort *p);
+int4 __ovld vload4(size_t offset, const int *p);
+uint4 __ovld vload4(size_t offset, const uint *p);
+long4 __ovld vload4(size_t offset, const long *p);
+ulong4 __ovld vload4(size_t offset, const ulong *p);
+float4 __ovld vload4(size_t offset, const float *p);
+char8 __ovld vload8(size_t offset, const char *p);
+uchar8 __ovld vload8(size_t offset, const uchar *p);
+short8 __ovld vload8(size_t offset, const short *p);
+ushort8 __ovld vload8(size_t offset, const ushort *p);
+int8 __ovld vload8(size_t offset, const int *p);
+uint8 __ovld vload8(size_t offset, const uint *p);
+long8 __ovld vload8(size_t offset, const long *p);
+ulong8 __ovld vload8(size_t offset, const ulong *p);
+float8 __ovld vload8(size_t offset, const float *p);
+char16 __ovld vload16(size_t offset, const char *p);
+uchar16 __ovld vload16(size_t offset, const uchar *p);
+short16 __ovld vload16(size_t offset, const short *p);
+ushort16 __ovld vload16(size_t offset, const ushort *p);
+int16 __ovld vload16(size_t offset, const int *p);
+uint16 __ovld vload16(size_t offset, const uint *p);
+long16 __ovld vload16(size_t offset, const long *p);
+ulong16 __ovld vload16(size_t offset, const ulong *p);
+float16 __ovld vload16(size_t offset, const float *p);
+
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const double *p);
+double3 __ovld vload3(size_t offset, const double *p);
+double4 __ovld vload4(size_t offset, const double *p);
+double8 __ovld vload8(size_t offset, const double *p);
+double16 __ovld vload16(size_t offset, const double *p);
+#endif // cl_khr_fp64 (double overloads, 2.0 and later)
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const half *p); // NOTE(review): scalar form with no n suffix, unlike the other overloads here - confirm intended
+half2 __ovld vload2(size_t offset, const half *p);
+half3 __ovld vload3(size_t offset, const half *p);
+half4 __ovld vload4(size_t offset, const half *p);
+half8 __ovld vload8(size_t offset, const half *p);
+half16 __ovld vload16(size_t offset, const half *p);
+#endif // cl_khr_fp16 (half overloads, 2.0 and later)
+#else // before 2.0: separate __global, __local and __private overloads
+char2 __ovld vload2(size_t offset, const __global char *p); // __global overloads start here
+uchar2 __ovld vload2(size_t offset, const __global uchar *p);
+short2 __ovld vload2(size_t offset, const __global short *p);
+ushort2 __ovld vload2(size_t offset, const __global ushort *p);
+int2 __ovld vload2(size_t offset, const __global int *p);
+uint2 __ovld vload2(size_t offset, const __global uint *p);
+long2 __ovld vload2(size_t offset, const __global long *p);
+ulong2 __ovld vload2(size_t offset, const __global ulong *p);
+float2 __ovld vload2(size_t offset, const __global float *p);
+char3 __ovld vload3(size_t offset, const __global char *p);
+uchar3 __ovld vload3(size_t offset, const __global uchar *p);
+short3 __ovld vload3(size_t offset, const __global short *p);
+ushort3 __ovld vload3(size_t offset, const __global ushort *p);
+int3 __ovld vload3(size_t offset, const __global int *p);
+uint3 __ovld vload3(size_t offset, const __global uint *p);
+long3 __ovld vload3(size_t offset, const __global long *p);
+ulong3 __ovld vload3(size_t offset, const __global ulong *p);
+float3 __ovld vload3(size_t offset, const __global float *p);
+char4 __ovld vload4(size_t offset, const __global char *p);
+uchar4 __ovld vload4(size_t offset, const __global uchar *p);
+short4 __ovld vload4(size_t offset, const __global short *p);
+ushort4 __ovld vload4(size_t offset, const __global ushort *p);
+int4 __ovld vload4(size_t offset, const __global int *p);
+uint4 __ovld vload4(size_t offset, const __global uint *p);
+long4 __ovld vload4(size_t offset, const __global long *p);
+ulong4 __ovld vload4(size_t offset, const __global ulong *p);
+float4 __ovld vload4(size_t offset, const __global float *p);
+char8 __ovld vload8(size_t offset, const __global char *p);
+uchar8 __ovld vload8(size_t offset, const __global uchar *p);
+short8 __ovld vload8(size_t offset, const __global short *p);
+ushort8 __ovld vload8(size_t offset, const __global ushort *p);
+int8 __ovld vload8(size_t offset, const __global int *p);
+uint8 __ovld vload8(size_t offset, const __global uint *p);
+long8 __ovld vload8(size_t offset, const __global long *p);
+ulong8 __ovld vload8(size_t offset, const __global ulong *p);
+float8 __ovld vload8(size_t offset, const __global float *p);
+char16 __ovld vload16(size_t offset, const __global char *p);
+uchar16 __ovld vload16(size_t offset, const __global uchar *p);
+short16 __ovld vload16(size_t offset, const __global short *p);
+ushort16 __ovld vload16(size_t offset, const __global ushort *p);
+int16 __ovld vload16(size_t offset, const __global int *p);
+uint16 __ovld vload16(size_t offset, const __global uint *p);
+long16 __ovld vload16(size_t offset, const __global long *p);
+ulong16 __ovld vload16(size_t offset, const __global ulong *p);
+float16 __ovld vload16(size_t offset, const __global float *p);
+char2 __ovld vload2(size_t offset, const __local char *p); // __local overloads start here
+uchar2 __ovld vload2(size_t offset, const __local uchar *p);
+short2 __ovld vload2(size_t offset, const __local short *p);
+ushort2 __ovld vload2(size_t offset, const __local ushort *p);
+int2 __ovld vload2(size_t offset, const __local int *p);
+uint2 __ovld vload2(size_t offset, const __local uint *p);
+long2 __ovld vload2(size_t offset, const __local long *p);
+ulong2 __ovld vload2(size_t offset, const __local ulong *p);
+float2 __ovld vload2(size_t offset, const __local float *p);
+char3 __ovld vload3(size_t offset, const __local char *p);
+uchar3 __ovld vload3(size_t offset, const __local uchar *p);
+short3 __ovld vload3(size_t offset, const __local short *p);
+ushort3 __ovld vload3(size_t offset, const __local ushort *p);
+int3 __ovld vload3(size_t offset, const __local int *p);
+uint3 __ovld vload3(size_t offset, const __local uint *p);
+long3 __ovld vload3(size_t offset, const __local long *p);
+ulong3 __ovld vload3(size_t offset, const __local ulong *p);
+float3 __ovld vload3(size_t offset, const __local float *p);
+char4 __ovld vload4(size_t offset, const __local char *p);
+uchar4 __ovld vload4(size_t offset, const __local uchar *p);
+short4 __ovld vload4(size_t offset, const __local short *p);
+ushort4 __ovld vload4(size_t offset, const __local ushort *p);
+int4 __ovld vload4(size_t offset, const __local int *p);
+uint4 __ovld vload4(size_t offset, const __local uint *p);
+long4 __ovld vload4(size_t offset, const __local long *p);
+ulong4 __ovld vload4(size_t offset, const __local ulong *p);
+float4 __ovld vload4(size_t offset, const __local float *p);
+char8 __ovld vload8(size_t offset, const __local char *p);
+uchar8 __ovld vload8(size_t offset, const __local uchar *p);
+short8 __ovld vload8(size_t offset, const __local short *p);
+ushort8 __ovld vload8(size_t offset, const __local ushort *p);
+int8 __ovld vload8(size_t offset, const __local int *p);
+uint8 __ovld vload8(size_t offset, const __local uint *p);
+long8 __ovld vload8(size_t offset, const __local long *p);
+ulong8 __ovld vload8(size_t offset, const __local ulong *p);
+float8 __ovld vload8(size_t offset, const __local float *p);
+char16 __ovld vload16(size_t offset, const __local char *p);
+uchar16 __ovld vload16(size_t offset, const __local uchar *p);
+short16 __ovld vload16(size_t offset, const __local short *p);
+ushort16 __ovld vload16(size_t offset, const __local ushort *p);
+int16 __ovld vload16(size_t offset, const __local int *p);
+uint16 __ovld vload16(size_t offset, const __local uint *p);
+long16 __ovld vload16(size_t offset, const __local long *p);
+ulong16 __ovld vload16(size_t offset, const __local ulong *p);
+float16 __ovld vload16(size_t offset, const __local float *p);
+char2 __ovld vload2(size_t offset, const __private char *p); // __private overloads start here
+uchar2 __ovld vload2(size_t offset, const __private uchar *p);
+short2 __ovld vload2(size_t offset, const __private short *p);
+ushort2 __ovld vload2(size_t offset, const __private ushort *p);
+int2 __ovld vload2(size_t offset, const __private int *p);
+uint2 __ovld vload2(size_t offset, const __private uint *p);
+long2 __ovld vload2(size_t offset, const __private long *p);
+ulong2 __ovld vload2(size_t offset, const __private ulong *p);
+float2 __ovld vload2(size_t offset, const __private float *p);
+char3 __ovld vload3(size_t offset, const __private char *p);
+uchar3 __ovld vload3(size_t offset, const __private uchar *p);
+short3 __ovld vload3(size_t offset, const __private short *p);
+ushort3 __ovld vload3(size_t offset, const __private ushort *p);
+int3 __ovld vload3(size_t offset, const __private int *p);
+uint3 __ovld vload3(size_t offset, const __private uint *p);
+long3 __ovld vload3(size_t offset, const __private long *p);
+ulong3 __ovld vload3(size_t offset, const __private ulong *p);
+float3 __ovld vload3(size_t offset, const __private float *p);
+char4 __ovld vload4(size_t offset, const __private char *p);
+uchar4 __ovld vload4(size_t offset, const __private uchar *p);
+short4 __ovld vload4(size_t offset, const __private short *p);
+ushort4 __ovld vload4(size_t offset, const __private ushort *p);
+int4 __ovld vload4(size_t offset, const __private int *p);
+uint4 __ovld vload4(size_t offset, const __private uint *p);
+long4 __ovld vload4(size_t offset, const __private long *p);
+ulong4 __ovld vload4(size_t offset, const __private ulong *p);
+float4 __ovld vload4(size_t offset, const __private float *p);
+char8 __ovld vload8(size_t offset, const __private char *p);
+uchar8 __ovld vload8(size_t offset, const __private uchar *p);
+short8 __ovld vload8(size_t offset, const __private short *p);
+ushort8 __ovld vload8(size_t offset, const __private ushort *p);
+int8 __ovld vload8(size_t offset, const __private int *p);
+uint8 __ovld vload8(size_t offset, const __private uint *p);
+long8 __ovld vload8(size_t offset, const __private long *p);
+ulong8 __ovld vload8(size_t offset, const __private ulong *p);
+float8 __ovld vload8(size_t offset, const __private float *p);
+char16 __ovld vload16(size_t offset, const __private char *p);
+uchar16 __ovld vload16(size_t offset, const __private uchar *p);
+short16 __ovld vload16(size_t offset, const __private short *p);
+ushort16 __ovld vload16(size_t offset, const __private ushort *p);
+int16 __ovld vload16(size_t offset, const __private int *p);
+uint16 __ovld vload16(size_t offset, const __private uint *p);
+long16 __ovld vload16(size_t offset, const __private long *p);
+ulong16 __ovld vload16(size_t offset, const __private ulong *p);
+float16 __ovld vload16(size_t offset, const __private float *p);
+
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const __global double *p);
+double3 __ovld vload3(size_t offset, const __global double *p);
+double4 __ovld vload4(size_t offset, const __global double *p);
+double8 __ovld vload8(size_t offset, const __global double *p);
+double16 __ovld vload16(size_t offset, const __global double *p);
+double2 __ovld vload2(size_t offset, const __local double *p);
+double3 __ovld vload3(size_t offset, const __local double *p);
+double4 __ovld vload4(size_t offset, const __local double *p);
+double8 __ovld vload8(size_t offset, const __local double *p);
+double16 __ovld vload16(size_t offset, const __local double *p);
+double2 __ovld vload2(size_t offset, const __private double *p);
+double3 __ovld vload3(size_t offset, const __private double *p);
+double4 __ovld vload4(size_t offset, const __private double *p);
+double8 __ovld vload8(size_t offset, const __private double *p);
+double16 __ovld vload16(size_t offset, const __private double *p);
+#endif // cl_khr_fp64 (double overloads, before 2.0)
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const __global half *p); // NOTE(review): scalar form with no n suffix (also declared for __local and __private below) - confirm intended
+half2 __ovld vload2(size_t offset, const __global half *p);
+half3 __ovld vload3(size_t offset, const __global half *p);
+half4 __ovld vload4(size_t offset, const __global half *p);
+half8 __ovld vload8(size_t offset, const __global half *p);
+half16 __ovld vload16(size_t offset, const __global half *p);
+half __ovld vload(size_t offset, const __local half *p);
+half2 __ovld vload2(size_t offset, const __local half *p);
+half3 __ovld vload3(size_t offset, const __local half *p);
+half4 __ovld vload4(size_t offset, const __local half *p);
+half8 __ovld vload8(size_t offset, const __local half *p);
+half16 __ovld vload16(size_t offset, const __local half *p);
+half __ovld vload(size_t offset, const __private half *p);
+half2 __ovld vload2(size_t offset, const __private half *p);
+half3 __ovld vload3(size_t offset, const __private half *p);
+half4 __ovld vload4(size_t offset, const __private half *p);
+half8 __ovld vload8(size_t offset, const __private half *p);
+half16 __ovld vload16(size_t offset, const __private half *p);
+#endif // cl_khr_fp16 (half overloads, before 2.0)
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 // 2.0 and later: one overload per type, pointer has no address space qualifier
+void __ovld vstore2(char2 data, size_t offset, char *p);
+void __ovld vstore2(uchar2 data, size_t offset, uchar *p);
+void __ovld vstore2(short2 data, size_t offset, short *p);
+void __ovld vstore2(ushort2 data, size_t offset, ushort *p);
+void __ovld vstore2(int2 data, size_t offset, int *p);
+void __ovld vstore2(uint2 data, size_t offset, uint *p);
+void __ovld vstore2(long2 data, size_t offset, long *p);
+void __ovld vstore2(ulong2 data, size_t offset, ulong *p);
+void __ovld vstore2(float2 data, size_t offset, float *p);
+void __ovld vstore3(char3 data, size_t offset, char *p);
+void __ovld vstore3(uchar3 data, size_t offset, uchar *p);
+void __ovld vstore3(short3 data, size_t offset, short *p);
+void __ovld vstore3(ushort3 data, size_t offset, ushort *p);
+void __ovld vstore3(int3 data, size_t offset, int *p);
+void __ovld vstore3(uint3 data, size_t offset, uint *p);
+void __ovld vstore3(long3 data, size_t offset, long *p);
+void __ovld vstore3(ulong3 data, size_t offset, ulong *p);
+void __ovld vstore3(float3 data, size_t offset, float *p);
+void __ovld vstore4(char4 data, size_t offset, char *p);
+void __ovld vstore4(uchar4 data, size_t offset, uchar *p);
+void __ovld vstore4(short4 data, size_t offset, short *p);
+void __ovld vstore4(ushort4 data, size_t offset, ushort *p);
+void __ovld vstore4(int4 data, size_t offset, int *p);
+void __ovld vstore4(uint4 data, size_t offset, uint *p);
+void __ovld vstore4(long4 data, size_t offset, long *p);
+void __ovld vstore4(ulong4 data, size_t offset, ulong *p);
+void __ovld vstore4(float4 data, size_t offset, float *p);
+void __ovld vstore8(char8 data, size_t offset, char *p);
+void __ovld vstore8(uchar8 data, size_t offset, uchar *p);
+void __ovld vstore8(short8 data, size_t offset, short *p);
+void __ovld vstore8(ushort8 data, size_t offset, ushort *p);
+void __ovld vstore8(int8 data, size_t offset, int *p);
+void __ovld vstore8(uint8 data, size_t offset, uint *p);
+void __ovld vstore8(long8 data, size_t offset, long *p);
+void __ovld vstore8(ulong8 data, size_t offset, ulong *p);
+void __ovld vstore8(float8 data, size_t offset, float *p);
+void __ovld vstore16(char16 data, size_t offset, char *p);
+void __ovld vstore16(uchar16 data, size_t offset, uchar *p);
+void __ovld vstore16(short16 data, size_t offset, short *p);
+void __ovld vstore16(ushort16 data, size_t offset, ushort *p);
+void __ovld vstore16(int16 data, size_t offset, int *p);
+void __ovld vstore16(uint16 data, size_t offset, uint *p);
+void __ovld vstore16(long16 data, size_t offset, long *p);
+void __ovld vstore16(ulong16 data, size_t offset, ulong *p);
+void __ovld vstore16(float16 data, size_t offset, float *p);
+#ifdef cl_khr_fp64
+void __ovld vstore2(double2 data, size_t offset, double *p);
+void __ovld vstore3(double3 data, size_t offset, double *p);
+void __ovld vstore4(double4 data, size_t offset, double *p);
+void __ovld vstore8(double8 data, size_t offset, double *p);
+void __ovld vstore16(double16 data, size_t offset, double *p);
+#endif // cl_khr_fp64 (double overloads, 2.0 and later)
+#ifdef cl_khr_fp16
+void __ovld vstore(half data, size_t offset, half *p); // NOTE(review): scalar form with no n suffix, unlike the other overloads here - confirm intended
+void __ovld vstore2(half2 data, size_t offset, half *p);
+void __ovld vstore3(half3 data, size_t offset, half *p);
+void __ovld vstore4(half4 data, size_t offset, half *p);
+void __ovld vstore8(half8 data, size_t offset, half *p);
+void __ovld vstore16(half16 data, size_t offset, half *p);
+#endif // cl_khr_fp16 (half overloads, 2.0 and later)
+#else // before 2.0: separate __global, __local and __private overloads
+void __ovld vstore2(char2 data, size_t offset, __global char *p); // __global overloads start here
+void __ovld vstore2(uchar2 data, size_t offset, __global uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __global short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __global ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __global int *p);
+void __ovld vstore2(uint2 data, size_t offset, __global uint *p);
+void __ovld vstore2(long2 data, size_t offset, __global long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __global ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __global float *p);
+void __ovld vstore3(char3 data, size_t offset, __global char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __global uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __global short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __global ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __global int *p);
+void __ovld vstore3(uint3 data, size_t offset, __global uint *p);
+void __ovld vstore3(long3 data, size_t offset, __global long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __global ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __global float *p);
+void __ovld vstore4(char4 data, size_t offset, __global char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __global uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __global short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __global ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __global int *p);
+void __ovld vstore4(uint4 data, size_t offset, __global uint *p);
+void __ovld vstore4(long4 data, size_t offset, __global long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __global ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __global float *p);
+void __ovld vstore8(char8 data, size_t offset, __global char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __global uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __global short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __global ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __global int *p);
+void __ovld vstore8(uint8 data, size_t offset, __global uint *p);
+void __ovld vstore8(long8 data, size_t offset, __global long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __global ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __global float *p);
+void __ovld vstore16(char16 data, size_t offset, __global char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __global uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __global short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __global ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __global int *p);
+void __ovld vstore16(uint16 data, size_t offset, __global uint *p);
+void __ovld vstore16(long16 data, size_t offset, __global long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __global ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __global float *p);
+void __ovld vstore2(char2 data, size_t offset, __local char *p); // __local overloads start here
+void __ovld vstore2(uchar2 data, size_t offset, __local uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __local short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __local ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __local int *p);
+void __ovld vstore2(uint2 data, size_t offset, __local uint *p);
+void __ovld vstore2(long2 data, size_t offset, __local long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __local ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __local float *p);
+void __ovld vstore3(char3 data, size_t offset, __local char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __local uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __local short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __local ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __local int *p);
+void __ovld vstore3(uint3 data, size_t offset, __local uint *p);
+void __ovld vstore3(long3 data, size_t offset, __local long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __local ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __local float *p);
+void __ovld vstore4(char4 data, size_t offset, __local char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __local uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __local short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __local ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __local int *p);
+void __ovld vstore4(uint4 data, size_t offset, __local uint *p);
+void __ovld vstore4(long4 data, size_t offset, __local long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __local ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __local float *p);
+void __ovld vstore8(char8 data, size_t offset, __local char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __local uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __local short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __local ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __local int *p);
+void __ovld vstore8(uint8 data, size_t offset, __local uint *p);
+void __ovld vstore8(long8 data, size_t offset, __local long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __local ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __local float *p);
+void __ovld vstore16(char16 data, size_t offset, __local char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __local uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __local short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __local ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __local int *p);
+void __ovld vstore16(uint16 data, size_t offset, __local uint *p);
+void __ovld vstore16(long16 data, size_t offset, __local long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __local ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __local float *p);
+void __ovld vstore2(char2 data, size_t offset, __private char *p); // __private overloads start here
+void __ovld vstore2(uchar2 data, size_t offset, __private uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __private short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __private ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __private int *p);
+void __ovld vstore2(uint2 data, size_t offset, __private uint *p);
+void __ovld vstore2(long2 data, size_t offset, __private long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __private ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __private float *p);
+void __ovld vstore3(char3 data, size_t offset, __private char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __private uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __private short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __private ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __private int *p);
+void __ovld vstore3(uint3 data, size_t offset, __private uint *p);
+void __ovld vstore3(long3 data, size_t offset, __private long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __private ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __private float *p);
+void __ovld vstore4(char4 data, size_t offset, __private char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __private uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __private short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __private ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __private int *p);
+void __ovld vstore4(uint4 data, size_t offset, __private uint *p);
+void __ovld vstore4(long4 data, size_t offset, __private long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __private ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __private float *p);
+void __ovld vstore8(char8 data, size_t offset, __private char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __private uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __private short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __private ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __private int *p);
+void __ovld vstore8(uint8 data, size_t offset, __private uint *p);
+void __ovld vstore8(long8 data, size_t offset, __private long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __private ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __private float *p);
+void __ovld vstore16(char16 data, size_t offset, __private char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __private uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __private short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __private ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __private int *p);
+void __ovld vstore16(uint16 data, size_t offset, __private uint *p);
+void __ovld vstore16(long16 data, size_t offset, __private long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __private ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __private float *p);
+#ifdef cl_khr_fp64
+void __ovld vstore2(double2 data, size_t offset, __global double *p);
+void __ovld vstore3(double3 data, size_t offset, __global double *p);
+void __ovld vstore4(double4 data, size_t offset, __global double *p);
+void __ovld vstore8(double8 data, size_t offset, __global double *p);
+void __ovld vstore16(double16 data, size_t offset, __global double *p);
+void __ovld vstore2(double2 data, size_t offset, __local double *p);
+void __ovld vstore3(double3 data, size_t offset, __local double *p);
+void __ovld vstore4(double4 data, size_t offset, __local double *p);
+void __ovld vstore8(double8 data, size_t offset, __local double *p);
+void __ovld vstore16(double16 data, size_t offset, __local double *p);
+void __ovld vstore2(double2 data, size_t offset, __private double *p);
+void __ovld vstore3(double3 data, size_t offset, __private double *p);
+void __ovld vstore4(double4 data, size_t offset, __private double *p);
+void __ovld vstore8(double8 data, size_t offset, __private double *p);
+void __ovld vstore16(double16 data, size_t offset, __private double *p);
+#endif // cl_khr_fp64 (double overloads, before 2.0)
+#ifdef cl_khr_fp16
+void __ovld vstore(half data, size_t offset, __global half *p); // NOTE(review): scalar form with no n suffix (also declared for __local and __private below) - confirm intended
+void __ovld vstore2(half2 data, size_t offset, __global half *p);
+void __ovld vstore3(half3 data, size_t offset, __global half *p);
+void __ovld vstore4(half4 data, size_t offset, __global half *p);
+void __ovld vstore8(half8 data, size_t offset, __global half *p);
+void __ovld vstore16(half16 data, size_t offset, __global half *p);
+void __ovld vstore(half data, size_t offset, __local half *p);
+void __ovld vstore2(half2 data, size_t offset, __local half *p);
+void __ovld vstore3(half3 data, size_t offset, __local half *p);
+void __ovld vstore4(half4 data, size_t offset, __local half *p);
+void __ovld vstore8(half8 data, size_t offset, __local half *p);
+void __ovld vstore16(half16 data, size_t offset, __local half *p);
+void __ovld vstore(half data, size_t offset, __private half *p);
+void __ovld vstore2(half2 data, size_t offset, __private half *p);
+void __ovld vstore3(half3 data, size_t offset, __private half *p);
+void __ovld vstore4(half4 data, size_t offset, __private half *p);
+void __ovld vstore8(half8 data, size_t offset, __private half *p);
+void __ovld vstore16(half16 data, size_t offset, __private half *p);
+#endif // cl_khr_fp16 (half overloads, before 2.0)
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Read sizeof (half) bytes of data from address
+ * (p + offset). The data read is interpreted as a
+ * half value. The half value is converted to a
+ * float value and the float value is returned.
+ * The read address computed as (p + offset)
+ * must be 16-bit aligned.
+ */
+float __ovld vload_half(size_t offset, const __constant half *p); // declared for every language version
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 // 2.0 and later: pointer has no address space qualifier
+float __ovld vload_half(size_t offset, const half *p);
+#else // before 2.0: separate __global, __local and __private overloads
+float __ovld vload_half(size_t offset, const __global half *p);
+float __ovld vload_half(size_t offset, const __local half *p);
+float __ovld vload_half(size_t offset, const __private half *p);
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Read sizeof (halfn) bytes of data from address (p + (offset * n)), where n
+ * (2, 3, 4, 8 or 16) is the vector width in the function name. The data read
+ * is interpreted as a halfn value, converted to a floatn value, and returned.
+ * The read address computed as (p + (offset * n)) must be 16-bit aligned.
+ * Besides __constant, OpenCL C 2.0+ declares unqualified-pointer overloads;
+ * earlier versions declare __global, __local and __private ones instead.
+ */
+float2 __ovld vload_half2(size_t offset, const __constant half *p);
+float3 __ovld vload_half3(size_t offset, const __constant half *p);
+float4 __ovld vload_half4(size_t offset, const __constant half *p);
+float8 __ovld vload_half8(size_t offset, const __constant half *p);
+float16 __ovld vload_half16(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float2 __ovld vload_half2(size_t offset, const half *p);
+float3 __ovld vload_half3(size_t offset, const half *p);
+float4 __ovld vload_half4(size_t offset, const half *p);
+float8 __ovld vload_half8(size_t offset, const half *p);
+float16 __ovld vload_half16(size_t offset, const half *p);
+#else //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+float2 __ovld vload_half2(size_t offset, const __global half *p);
+float3 __ovld vload_half3(size_t offset, const __global half *p);
+float4 __ovld vload_half4(size_t offset, const __global half *p);
+float8 __ovld vload_half8(size_t offset, const __global half *p);
+float16 __ovld vload_half16(size_t offset, const __global half *p);
+float2 __ovld vload_half2(size_t offset, const __local half *p);
+float3 __ovld vload_half3(size_t offset, const __local half *p);
+float4 __ovld vload_half4(size_t offset, const __local half *p);
+float8 __ovld vload_half8(size_t offset, const __local half *p);
+float16 __ovld vload_half16(size_t offset, const __local half *p);
+float2 __ovld vload_half2(size_t offset, const __private half *p);
+float3 __ovld vload_half3(size_t offset, const __private half *p);
+float4 __ovld vload_half4(size_t offset, const __private half *p);
+float8 __ovld vload_half8(size_t offset, const __private half *p);
+float16 __ovld vload_half16(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The float value given by data is first converted to a half value using
+ * the appropriate rounding mode. The half value is then written to the
+ * address computed as (p + offset), which must be 16-bit aligned.
+ * vstore_half uses the current rounding mode. The default current rounding
+ * mode is round to nearest even.
+ * The _rte, _rtz, _rtp and _rtn variants name the rounding mode explicitly
+ * (see the OpenCL C specification for their exact meaning).
+ * Overloads taking double data are declared only when cl_khr_fp64 is
+ * defined.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore_half(float data, size_t offset, half *p);
+void __ovld vstore_half_rte(float data, size_t offset, half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half(double data, size_t offset, half *p);
+void __ovld vstore_half_rte(double data, size_t offset, half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, half *p);
+#endif //cl_khr_fp64
+#else //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+void __ovld vstore_half(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __global half *p);
+void __ovld vstore_half(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __local half *p);
+void __ovld vstore_half(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __private half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __global half *p);
+void __ovld vstore_half(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __local half *p);
+void __ovld vstore_half(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The floatn value given by data is converted to a halfn value using the
+ * appropriate rounding mode, where n (2, 3, 4, 8 or 16) is the vector width
+ * in the function name. The halfn value is then written to the address
+ * computed as (p + (offset * n)), which must be 16-bit aligned.
+ * vstore_halfn uses the current rounding mode. The default current rounding
+ * mode is round to nearest even.
+ * The _rte, _rtz, _rtp and _rtn variants name the rounding mode explicitly
+ * (see the OpenCL C specification for their exact meaning).
+ * Overloads taking doublen data are declared only when cl_khr_fp64 is defined.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore_half2(float2 data, size_t offset, half *p);
+void __ovld vstore_half3(float3 data, size_t offset, half *p);
+void __ovld vstore_half4(float4 data, size_t offset, half *p);
+void __ovld vstore_half8(float8 data, size_t offset, half *p);
+void __ovld vstore_half16(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half2(double2 data, size_t offset, half *p);
+void __ovld vstore_half3(double3 data, size_t offset, half *p);
+void __ovld vstore_half4(double4 data, size_t offset, half *p);
+void __ovld vstore_half8(double8 data, size_t offset, half *p);
+void __ovld vstore_half16(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, half *p);
+#endif //cl_khr_fp64
+#else //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+void __ovld vstore_half2(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __private half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half2(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * For n = 1, 2, 4, 8 and 16, read sizeof (halfn) bytes of data from address
+ * (p + (offset * n)); vloada_half is the n = 1 form. The data read is
+ * interpreted as a halfn value. The halfn value read is converted to a
+ * floatn value and the floatn value is returned.
+ * The address computed as (p + (offset * n)) must be aligned to
+ * sizeof (halfn) bytes.
+ * For n = 3, vloada_half3 reads a half3 from address (p + (offset * 4)) and
+ * returns a float3. The address computed as (p + (offset * 4)) must be
+ * aligned to sizeof (half) * 4 bytes.
+ * Besides __constant, OpenCL C 2.0+ declares unqualified-pointer overloads;
+ * earlier versions declare __global, __local and __private ones instead.
+ */
+float __ovld vloada_half(size_t offset, const __constant half *p);
+float2 __ovld vloada_half2(size_t offset, const __constant half *p);
+float3 __ovld vloada_half3(size_t offset, const __constant half *p);
+float4 __ovld vloada_half4(size_t offset, const __constant half *p);
+float8 __ovld vloada_half8(size_t offset, const __constant half *p);
+float16 __ovld vloada_half16(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld vloada_half(size_t offset, const half *p);
+float2 __ovld vloada_half2(size_t offset, const half *p);
+float3 __ovld vloada_half3(size_t offset, const half *p);
+float4 __ovld vloada_half4(size_t offset, const half *p);
+float8 __ovld vloada_half8(size_t offset, const half *p);
+float16 __ovld vloada_half16(size_t offset, const half *p);
+#else //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+float __ovld vloada_half(size_t offset, const __global half *p);
+float2 __ovld vloada_half2(size_t offset, const __global half *p);
+float3 __ovld vloada_half3(size_t offset, const __global half *p);
+float4 __ovld vloada_half4(size_t offset, const __global half *p);
+float8 __ovld vloada_half8(size_t offset, const __global half *p);
+float16 __ovld vloada_half16(size_t offset, const __global half *p);
+float __ovld vloada_half(size_t offset, const __local half *p);
+float2 __ovld vloada_half2(size_t offset, const __local half *p);
+float3 __ovld vloada_half3(size_t offset, const __local half *p);
+float4 __ovld vloada_half4(size_t offset, const __local half *p);
+float8 __ovld vloada_half8(size_t offset, const __local half *p);
+float16 __ovld vloada_half16(size_t offset, const __local half *p);
+float __ovld vloada_half(size_t offset, const __private half *p);
+float2 __ovld vloada_half2(size_t offset, const __private half *p);
+float3 __ovld vloada_half3(size_t offset, const __private half *p);
+float4 __ovld vloada_half4(size_t offset, const __private half *p);
+float8 __ovld vloada_half8(size_t offset, const __private half *p);
+float16 __ovld vloada_half16(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The floatn value given by data is converted to
+ * a halfn value using the appropriate rounding
+ * mode.
+ * For n = 1, 2, 4, 8 and 16, the halfn value is
+ * written to the address computed as (p + (offset
+ * * n)). The address computed as (p + (offset *
+ * n)) must be aligned to sizeof (halfn) bytes.
+ * For n = 3, the half3 value is written to the
+ * address computed as (p + (offset * 4)). The
+ * address computed as (p + (offset * 4)) must be
+ * aligned to sizeof (half) * 4 bytes.
+ * vstorea_halfn uses the current rounding
+ * mode. The default current rounding mode is
+ * round to nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstorea_half(float data, size_t offset, half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, half *p);
+
+#ifdef cl_khr_fp64
+void __ovld vstorea_half(double data, size_t offset, half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, half *p);
+#endif //cl_khr_fp64
+
+#else
+void __ovld vstorea_half(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __private half *p);
+
+#ifdef cl_khr_fp64
+void __ovld vstorea_half(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
+
+// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
+typedef uint cl_mem_fence_flags;
+
+/**
+ * Queue a memory fence to ensure correct
+ * ordering of memory operations to local memory
+ */
+#define CLK_LOCAL_MEM_FENCE    0x01
+
+/**
+ * Queue a memory fence to ensure correct
+ * ordering of memory operations to global memory
+ */
+#define CLK_GLOBAL_MEM_FENCE   0x02
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+/**
+ * Queue a memory fence to ensure correct ordering of memory
+ * operations between work-items of a work-group to
+ * image memory.
+ */
+#define CLK_IMAGE_MEM_FENCE  0x04
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * All work-items in a work-group executing the kernel
+ * on a processor must execute this function before any
+ * are allowed to continue execution beyond the barrier.
+ * This function must be encountered by all work-items in
+ * a work-group executing the kernel.
+ * If barrier is inside a conditional statement, then all
+ * work-items must enter the conditional if any work-item
+ * enters the conditional statement and executes the
+ * barrier.
+ * If barrier is inside a loop, all work-items must execute
+ * the barrier for each iteration of the loop before any are
+ * allowed to continue execution beyond the barrier.
+ * The barrier function also queues a memory fence
+ * (reads and writes) to ensure correct ordering of
+ * memory operations to local or global memory.
+ * The flags argument specifies the memory address space
+ * and can be set to a combination of the following literal
+ * values.
+ * CLK_LOCAL_MEM_FENCE - The barrier function
+ * will either flush any variables stored in local memory
+ * or queue a memory fence to ensure correct ordering of
+ * memory operations to local memory.
+ * CLK_GLOBAL_MEM_FENCE - The barrier function
+ * will queue a memory fence to ensure correct ordering
+ * of memory operations to global memory. This can be
+ * useful when work-items, for example, write to buffer or
+ * image objects and then want to read the updated data.
+ */
+
+void __ovld barrier(cl_mem_fence_flags flags);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+typedef enum memory_scope // scope argument type taken by work_group_barrier below
+{
+  memory_scope_work_item,       // implicitly 0
+  memory_scope_work_group,      // implicitly 1
+  memory_scope_device,          // implicitly 2
+  memory_scope_all_svm_devices, // implicitly 3
+  memory_scope_sub_group        // implicitly 4
+} memory_scope;
+
+void __ovld work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld work_group_barrier(cl_mem_fence_flags flags);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
+
+/**
+ * Orders loads and stores of a work-item
+ * executing a kernel. This means that loads
+ * and stores preceding the mem_fence will
+ * be committed to memory before any loads
+ * and stores following the mem_fence.
+ * The flags argument specifies the memory
+ * address space and can be set to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld mem_fence(cl_mem_fence_flags flags);
+
+/**
+ * Read memory barrier that orders only
+ * loads.
+ * The flags argument specifies the memory
+ * address space and can be set to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld read_mem_fence(cl_mem_fence_flags flags);
+
+/**
+ * Write memory barrier that orders only
+ * stores.
+ * The flags argument specifies the memory
+ * address space and can be set to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld write_mem_fence(cl_mem_fence_flags flags);
+
+// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+cl_mem_fence_flags __ovld get_fence(const void *ptr);
+cl_mem_fence_flags __ovld get_fence(void *ptr);
+
+/**
+ * Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
+ * and checked in Sema since they should be declared as
+ *   addr gentype* to_addr (gentype*);
+ * where gentype is a builtin type or a user-defined type.
+ */
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.10, v1.2 s6.12.10, v2.0 s6.13.10 - Async Copies from Global to Local Memory, Local to Global Memory, and Prefetch
+
+/**
+ * event_t async_work_group_copy (
+ * __global gentype *dst,
+ * const __local gentype *src,
+ * size_t num_elements,
+ * event_t event)
+ * Perform an async copy of num_elements
+ * gentype elements from src to dst. The async
+ * copy is performed by all work-items in a workgroup
+ * and this built-in function must therefore
+ * be encountered by all work-items in a workgroup
+ * executing the kernel with the same
+ * argument values; otherwise the results are
+ * undefined.
+ * Returns an event object that can be used by
+ * wait_group_events to wait for the async copy
+ * to finish. The event argument can also be used
+ * to associate the async_work_group_copy with
+ * a previous async copy allowing an event to be
+ * shared by multiple async copies; otherwise event
+ * should be zero.
+ * If event argument is non-zero, the event object
+ * supplied in event argument will be returned.
+ * This function does not perform any implicit
+ * synchronization of source data such as using a
+ * barrier before performing the copy.
+ */
+event_t __ovld async_work_group_copy(__local char *dst, const __global char *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short *dst, const __global short *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int *dst, const __global int *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint *dst, const __global uint *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long *dst, const __global long *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float *dst, const __global float *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char *dst, const __local char *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short *dst, const __local short *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int *dst, const __local int *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint *dst, const __local uint *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long *dst, const __local long *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float *dst, const __local float *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, event_t event);
+#ifdef cl_khr_fp64
+event_t __ovld async_work_group_copy(__local double *dst, const __global double *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double *dst, const __local double *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, event_t event);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+event_t __ovld async_work_group_copy(__local half *dst, const __global half *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half *dst, const __local half *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, event_t event);
+#endif //cl_khr_fp16
+
+/**
+ * Perform an async gather of num_elements gentype elements from src to dst.
+ * The overloads that copy from __global src to __local dst take src_stride,
+ * the stride in elements for each gentype element read from src. The
+ * overloads that copy from __local src to __global dst take dst_stride, the
+ * stride in elements for each gentype element written to dst.
+ *
+ * The async gather is performed by all work-items in a work-group. This
+ * built-in function must therefore be encountered by all work-items in a
+ * work-group executing the kernel with the same argument values; otherwise
+ * the results are undefined.
+ *
+ * Returns an event object that can be used by wait_group_events to wait for
+ * the async copy to finish. The event argument can also be used to associate
+ * the async_work_group_strided_copy with a previous async copy, allowing an
+ * event to be shared by multiple async copies; otherwise event should be
+ * zero. If the event argument is non-zero, the event object supplied in the
+ * event argument will be returned.
+ *
+ * This function does not perform any implicit synchronization of source data
+ * such as using a barrier before performing the copy.
+ *
+ * The double and half overloads are declared only when the cl_khr_fp64 and
+ * cl_khr_fp16 macros, respectively, are defined (see the #ifdef guards
+ * below).
+ */
+event_t __ovld async_work_group_strided_copy(__local char *dst, const __global char *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short *dst, const __global short *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int *dst, const __global int *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint *dst, const __global uint *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long *dst, const __global long *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float *dst, const __global float *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char *dst, const __local char *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short *dst, const __local short *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int *dst, const __local int *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint *dst, const __local uint *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long *dst, const __local long *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float *dst, const __local float *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#ifdef cl_khr_fp64
+event_t __ovld async_work_group_strided_copy(__local double *dst, const __global double *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double *dst, const __local double *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+event_t __ovld async_work_group_strided_copy(__local half *dst, const __global half *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half *dst, const __local half *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#endif //cl_khr_fp16
+
+/**
+ * Wait for events that identify the
+ * async_work_group_copy operations to
+ * complete. The event objects specified in
+ * event_list will be released after the wait is
+ * performed.
+ * This function must be encountered by all work-items
+ * in a work-group executing the kernel with
+ * the same num_events and event objects specified
+ * in event_list; otherwise the results are undefined.
+ */
+void __ovld wait_group_events(int num_events, event_t *event_list);
+
+/**
+ * Prefetch num_elements * sizeof(gentype)
+ * bytes into the global cache. The prefetch
+ * instruction is applied to a work-item in a work-group
+ * and does not affect the functional
+ * behavior of the kernel.
+ */
+void __ovld prefetch(const __global char *p, size_t num_elements);
+void __ovld prefetch(const __global uchar *p, size_t num_elements);
+void __ovld prefetch(const __global short *p, size_t num_elements);
+void __ovld prefetch(const __global ushort *p, size_t num_elements);
+void __ovld prefetch(const __global int *p, size_t num_elements);
+void __ovld prefetch(const __global uint *p, size_t num_elements);
+void __ovld prefetch(const __global long *p, size_t num_elements);
+void __ovld prefetch(const __global ulong *p, size_t num_elements);
+void __ovld prefetch(const __global float *p, size_t num_elements);
+void __ovld prefetch(const __global char2 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar2 *p, size_t num_elements);
+void __ovld prefetch(const __global short2 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort2 *p, size_t num_elements);
+void __ovld prefetch(const __global int2 *p, size_t num_elements);
+void __ovld prefetch(const __global uint2 *p, size_t num_elements);
+void __ovld prefetch(const __global long2 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong2 *p, size_t num_elements);
+void __ovld prefetch(const __global float2 *p, size_t num_elements);
+void __ovld prefetch(const __global char3 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar3 *p, size_t num_elements);
+void __ovld prefetch(const __global short3 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort3 *p, size_t num_elements);
+void __ovld prefetch(const __global int3 *p, size_t num_elements);
+void __ovld prefetch(const __global uint3 *p, size_t num_elements);
+void __ovld prefetch(const __global long3 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong3 *p, size_t num_elements);
+void __ovld prefetch(const __global float3 *p, size_t num_elements);
+void __ovld prefetch(const __global char4 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar4 *p, size_t num_elements);
+void __ovld prefetch(const __global short4 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort4 *p, size_t num_elements);
+void __ovld prefetch(const __global int4 *p, size_t num_elements);
+void __ovld prefetch(const __global uint4 *p, size_t num_elements);
+void __ovld prefetch(const __global long4 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong4 *p, size_t num_elements);
+void __ovld prefetch(const __global float4 *p, size_t num_elements);
+void __ovld prefetch(const __global char8 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar8 *p, size_t num_elements);
+void __ovld prefetch(const __global short8 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort8 *p, size_t num_elements);
+void __ovld prefetch(const __global int8 *p, size_t num_elements);
+void __ovld prefetch(const __global uint8 *p, size_t num_elements);
+void __ovld prefetch(const __global long8 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong8 *p, size_t num_elements);
+void __ovld prefetch(const __global float8 *p, size_t num_elements);
+void __ovld prefetch(const __global char16 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar16 *p, size_t num_elements);
+void __ovld prefetch(const __global short16 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort16 *p, size_t num_elements);
+void __ovld prefetch(const __global int16 *p, size_t num_elements);
+void __ovld prefetch(const __global uint16 *p, size_t num_elements);
+void __ovld prefetch(const __global long16 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong16 *p, size_t num_elements);
+void __ovld prefetch(const __global float16 *p, size_t num_elements);
+#ifdef cl_khr_fp64
+void __ovld prefetch(const __global double *p, size_t num_elements);
+void __ovld prefetch(const __global double2 *p, size_t num_elements);
+void __ovld prefetch(const __global double3 *p, size_t num_elements);
+void __ovld prefetch(const __global double4 *p, size_t num_elements);
+void __ovld prefetch(const __global double8 *p, size_t num_elements);
+void __ovld prefetch(const __global double16 *p, size_t num_elements);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld prefetch(const __global half *p, size_t num_elements);
+void __ovld prefetch(const __global half2 *p, size_t num_elements);
+void __ovld prefetch(const __global half3 *p, size_t num_elements);
+void __ovld prefetch(const __global half4 *p, size_t num_elements);
+void __ovld prefetch(const __global half8 *p, size_t num_elements);
+void __ovld prefetch(const __global half16 *p, size_t num_elements);
+#endif // cl_khr_fp16
+
+// OpenCL v1.1 s6.11.1, v1.2 s6.12.11 - Atomic Functions
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif // cl_khr_int64_base_atomics && cl_khr_int64_extended_atomics
+/**
+ * Read the 32-bit value (referred to as old) stored at the location pointed to
+ * by p, compute (old + val), and store the result back at that location. The
+ * function returns old. The long/unsigned long atom_add overloads are declared
+ * only when cl_khr_int64_base_atomics is defined.
+ */
+int __ovld atomic_add(volatile __global int *p, int val);
+unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_add(volatile __local int *p, int val);
+unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_add(volatile __global int *p, int val);
+unsigned int __ovld atom_add(volatile __global unsigned int *p, unsigned int val);
+#endif // cl_khr_global_int32_base_atomics
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_add(volatile __local int *p, int val);
+unsigned int __ovld atom_add(volatile __local unsigned int *p, unsigned int val);
+#endif // cl_khr_local_int32_base_atomics
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_add(volatile __global long *p, long val);
+unsigned long __ovld atom_add(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_add(volatile __local long *p, long val);
+unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long val);
+#endif // cl_khr_int64_base_atomics
+
+/**
+ * Read the 32-bit value (referred to as old) stored at the location pointed to
+ * by p, compute (old - val), and store the result back there. Returns old. The
+ * long atom_sub overloads are declared only under cl_khr_int64_base_atomics.
+ */
+int __ovld atomic_sub(volatile __global int *p, int val);
+unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_sub(volatile __local int *p, int val);
+unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_sub(volatile __global int *p, int val);
+unsigned int __ovld atom_sub(volatile __global unsigned int *p, unsigned int val);
+#endif // cl_khr_global_int32_base_atomics
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_sub(volatile __local int *p, int val);
+unsigned int __ovld atom_sub(volatile __local unsigned int *p, unsigned int val);
+#endif // cl_khr_local_int32_base_atomics
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_sub(volatile __global long *p, long val);
+unsigned long __ovld atom_sub(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_sub(volatile __local long *p, long val);
+unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long val);
+#endif // cl_khr_int64_base_atomics
+
+/**
+ * Swaps the old value stored at location p with the new value given by val and
+ * returns the old value. Each atom_xchg overload is declared only when the
+ * extension macro matching its address space (or 64-bit type) is defined.
+ */
+int __ovld atomic_xchg(volatile __global int *p, int val);
+unsigned int __ovld atomic_xchg(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_xchg(volatile __local int *p, int val);
+unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val);
+float __ovld atomic_xchg(volatile __global float *p, float val);
+float __ovld atomic_xchg(volatile __local float *p, float val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_xchg(volatile __global int *p, int val);
+unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
+#endif // cl_khr_global_int32_base_atomics
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_xchg(volatile __local int *p, int val);
+unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
+#endif // cl_khr_local_int32_base_atomics
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_xchg(volatile __global long *p, long val);
+long __ovld atom_xchg(volatile __local long *p, long val);
+unsigned long __ovld atom_xchg(volatile __global unsigned long *p, unsigned long val);
+unsigned long __ovld atom_xchg(volatile __local unsigned long *p, unsigned long val);
+#endif // cl_khr_int64_base_atomics
+
+/**
+ * Read the 32-bit value (referred to as old) stored at the location pointed to
+ * by p, compute (old + 1), and store the result back at that location. The
+ * function returns old. The long/unsigned long atom_inc overloads are declared
+ * only when cl_khr_int64_base_atomics is defined.
+ */
+int __ovld atomic_inc(volatile __global int *p);
+unsigned int __ovld atomic_inc(volatile __global unsigned int *p);
+int __ovld atomic_inc(volatile __local int *p);
+unsigned int __ovld atomic_inc(volatile __local unsigned int *p);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_inc(volatile __global int *p);
+unsigned int __ovld atom_inc(volatile __global unsigned int *p);
+#endif // cl_khr_global_int32_base_atomics
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_inc(volatile __local int *p);
+unsigned int __ovld atom_inc(volatile __local unsigned int *p);
+#endif // cl_khr_local_int32_base_atomics
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_inc(volatile __global long *p);
+unsigned long __ovld atom_inc(volatile __global unsigned long *p);
+long __ovld atom_inc(volatile __local long *p);
+unsigned long __ovld atom_inc(volatile __local unsigned long *p);
+#endif // cl_khr_int64_base_atomics
+
+/**
+ * Read the 32-bit value (referred to as old) stored at the location pointed to
+ * by p, compute (old - 1), and store the result back at that location. The
+ * function returns old. The long/unsigned long atom_dec overloads are declared
+ * only when cl_khr_int64_base_atomics is defined.
+ */
+int __ovld atomic_dec(volatile __global int *p);
+unsigned int __ovld atomic_dec(volatile __global unsigned int *p);
+int __ovld atomic_dec(volatile __local int *p);
+unsigned int __ovld atomic_dec(volatile __local unsigned int *p);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_dec(volatile __global int *p);
+unsigned int __ovld atom_dec(volatile __global unsigned int *p);
+#endif // cl_khr_global_int32_base_atomics
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_dec(volatile __local int *p);
+unsigned int __ovld atom_dec(volatile __local unsigned int *p);
+#endif // cl_khr_local_int32_base_atomics
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_dec(volatile __global long *p);
+unsigned long __ovld atom_dec(volatile __global unsigned long *p);
+long __ovld atom_dec(volatile __local long *p);
+unsigned long __ovld atom_dec(volatile __local unsigned long *p);
+#endif // cl_khr_int64_base_atomics
+
+/**
+ * Fetch the 32-bit value stored at p (call it old),
+ * write ((old == cmp) ? val : old) back to p,
+ * and return old.
+ */
+int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
+int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
+unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
+unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
+
+// atom_cmpxchg, 32-bit: one guard per address space.
+#ifdef cl_khr_global_int32_base_atomics
+int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
+unsigned int __ovld atom_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
+#endif // cl_khr_global_int32_base_atomics
+#ifdef cl_khr_local_int32_base_atomics
+int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
+unsigned int __ovld atom_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
+#endif // cl_khr_local_int32_base_atomics
+
+// atom_cmpxchg, 64-bit: both address spaces share a single guard.
+#ifdef cl_khr_int64_base_atomics
+long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
+long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
+unsigned long __ovld atom_cmpxchg(volatile __global unsigned long *p, unsigned long cmp, unsigned long val);
+unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned long cmp, unsigned long val);
+#endif // cl_khr_int64_base_atomics
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * min(old, val) and store minimum value at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_min(volatile __global int *p, int val);
+unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_min(volatile __local int *p, int val);
+unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val);
+
+// atom_min, 32-bit: one guard per address space.
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_min(volatile __global int *p, int val);
+unsigned int __ovld atom_min(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_min(volatile __local int *p, int val);
+unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+// atom_min, 64-bit: the __global and __local overloads alike depend only on
+// cl_khr_int64_extended_atomics, matching the atom_max/atom_and/atom_or groups.
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_min(volatile __global long *p, long val);
+unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_min(volatile __local long *p, long val);
+unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Fetch the 32-bit value stored at p (call it old),
+ * write max(old, val) back to p, and return old.
+ */
+int __ovld atomic_max(volatile __global int *p, int val);
+int __ovld atomic_max(volatile __local int *p, int val);
+unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val);
+unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val);
+
+// atom_max, 32-bit: one guard per address space.
+#ifdef cl_khr_global_int32_extended_atomics
+int __ovld atom_max(volatile __global int *p, int val);
+unsigned int __ovld atom_max(volatile __global unsigned int *p, unsigned int val);
+#endif // cl_khr_global_int32_extended_atomics
+#ifdef cl_khr_local_int32_extended_atomics
+int __ovld atom_max(volatile __local int *p, int val);
+unsigned int __ovld atom_max(volatile __local unsigned int *p, unsigned int val);
+#endif // cl_khr_local_int32_extended_atomics
+
+// atom_max, 64-bit: both address spaces share a single guard.
+#ifdef cl_khr_int64_extended_atomics
+long __ovld atom_max(volatile __global long *p, long val);
+long __ovld atom_max(volatile __local long *p, long val);
+unsigned long __ovld atom_max(volatile __global unsigned long *p, unsigned long val);
+unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long val);
+#endif // cl_khr_int64_extended_atomics
+
+/**
+ * Fetch the 32-bit value stored at p (call it old),
+ * write (old & val) back to p, and return old.
+ */
+int __ovld atomic_and(volatile __global int *p, int val);
+int __ovld atomic_and(volatile __local int *p, int val);
+unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val);
+unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val);
+
+// atom_and, 32-bit: one guard per address space.
+#ifdef cl_khr_global_int32_extended_atomics
+int __ovld atom_and(volatile __global int *p, int val);
+unsigned int __ovld atom_and(volatile __global unsigned int *p, unsigned int val);
+#endif // cl_khr_global_int32_extended_atomics
+#ifdef cl_khr_local_int32_extended_atomics
+int __ovld atom_and(volatile __local int *p, int val);
+unsigned int __ovld atom_and(volatile __local unsigned int *p, unsigned int val);
+#endif // cl_khr_local_int32_extended_atomics
+
+// atom_and, 64-bit: both address spaces share a single guard.
+#ifdef cl_khr_int64_extended_atomics
+long __ovld atom_and(volatile __global long *p, long val);
+long __ovld atom_and(volatile __local long *p, long val);
+unsigned long __ovld atom_and(volatile __global unsigned long *p, unsigned long val);
+unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long val);
+#endif // cl_khr_int64_extended_atomics
+
+/**
+ * Fetch the 32-bit value stored at p (call it old),
+ * write (old | val) back to p, and return old.
+ */
+int __ovld atomic_or(volatile __global int *p, int val);
+int __ovld atomic_or(volatile __local int *p, int val);
+unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val);
+unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val);
+
+// atom_or, 32-bit: one guard per address space.
+#ifdef cl_khr_global_int32_extended_atomics
+int __ovld atom_or(volatile __global int *p, int val);
+unsigned int __ovld atom_or(volatile __global unsigned int *p, unsigned int val);
+#endif // cl_khr_global_int32_extended_atomics
+#ifdef cl_khr_local_int32_extended_atomics
+int __ovld atom_or(volatile __local int *p, int val);
+unsigned int __ovld atom_or(volatile __local unsigned int *p, unsigned int val);
+#endif // cl_khr_local_int32_extended_atomics
+
+// atom_or, 64-bit: both address spaces share a single guard.
+#ifdef cl_khr_int64_extended_atomics
+long __ovld atom_or(volatile __global long *p, long val);
+long __ovld atom_or(volatile __local long *p, long val);
+unsigned long __ovld atom_or(volatile __global unsigned long *p, unsigned long val);
+unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long val);
+#endif // cl_khr_int64_extended_atomics
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old ^ val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_xor(volatile __global int *p, int val);
+unsigned int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_xor(volatile __local int *p, int val);
+unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val);
+
+// atom_xor, 32-bit: one guard per address space.
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_xor(volatile __global int *p, int val);
+unsigned int __ovld atom_xor(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_xor(volatile __local int *p, int val);
+unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+// atom_xor, 64-bit: declared under cl_khr_int64_extended_atomics with the same
+// shape as the 64-bit atom_and/atom_or/atom_max overloads.
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_xor(volatile __global long *p, long val);
+unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_xor(volatile __local long *p, long val);
+unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+// Switch both 64-bit atomics extensions off; nothing happens unless both
+// extension macros are defined.
+#ifdef cl_khr_int64_base_atomics
+#ifdef cl_khr_int64_extended_atomics
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
+#endif // cl_khr_int64_extended_atomics
+#endif // cl_khr_int64_base_atomics
+
+// OpenCL v2.0 s6.13.11 - Atomics Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+// ATOMIC_VAR_INIT(x) expands to its parenthesized argument; a definition that
+// already exists is left alone.
+#if !defined(ATOMIC_VAR_INIT)
+#define ATOMIC_VAR_INIT(x) (x)
+#endif // !defined(ATOMIC_VAR_INIT)
+// ATOMIC_FLAG_INIT is the constant 0.
+#define ATOMIC_FLAG_INIT 0
+
+// Memory orderings accepted by the atomic functions declared below.
+// Each enumerator is pinned to the matching compiler-predefined __ATOMIC_*
+// macro so the values stay aligned with what clang uses in EmitAtomicExpr().
+// Implicit 0..4 numbering would not be aligned: the C11 numbering reserves
+// value 1 for the "consume" ordering, which has no enumerator here, so every
+// ordering after relaxed would be off by one.
+typedef enum memory_order
+{
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+// Support for double atomics needs the cl_khr_int64_base_atomics and
+// cl_khr_int64_extended_atomics extensions; switch both on when both
+// extension macros are defined.
+#ifdef cl_khr_int64_base_atomics
+#ifdef cl_khr_int64_extended_atomics
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif // cl_khr_int64_extended_atomics
+#endif // cl_khr_int64_base_atomics
+
+// atomic_init(): one overload per atomic type. The long/ulong overloads need
+// both int64 atomics extensions; the double overload also needs cl_khr_fp64.
+void __ovld atomic_init(volatile atomic_int *object, int value);
+void __ovld atomic_init(volatile atomic_uint *object, uint value);
+void __ovld atomic_init(volatile atomic_float *object, float value);
+#if defined(cl_khr_int64_extended_atomics) && defined(cl_khr_int64_base_atomics)
+void __ovld atomic_init(volatile atomic_long *object, long value);
+void __ovld atomic_init(volatile atomic_ulong *object, ulong value);
+#if defined(cl_khr_fp64)
+void __ovld atomic_init(volatile atomic_double *object, double value);
+#endif // defined(cl_khr_fp64)
+#endif // 64-bit atomics extensions
+
+// atomic_work_item_fence(): takes fence flags, a memory order and a memory scope.
+void __ovld atomic_work_item_fence(cl_mem_fence_flags flags,
+                                   memory_order order,
+                                   memory_scope scope);
+
+// atomic_fetch_<op>() for the 32-bit atomic types, grouped by atomic type.
+// Every operation comes in three forms: implicit order, explicit order, and
+// explicit order plus scope.
+
+// atomic_int object, int operand
+int __ovld atomic_fetch_add(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_sub(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_or(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_xor(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_and(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_min(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_max(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+
+// atomic_uint object, uint operand
+uint __ovld atomic_fetch_add(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_sub(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_or(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_xor(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_and(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+
+// atomic_uint object, int operand (min/max only)
+uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+long __ovld atomic_fetch_add(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_add(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_sub(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_sub(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_or(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_or(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_xor(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_xor(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_and(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_and(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_min(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_max(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
+#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+// OpenCL v2.0 s6.13.11.7.5:
+// add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
+// or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.
+//
+// The overloads below are declared only when both int64 atomics extension
+// macros are defined. Parameter names follow the other atomic_fetch_*
+// declarations (object / operand / order / scope).
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+// atomic_uintptr_t object, ptrdiff_t operand
+uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
+uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_sub(volatile atomic_uintptr_t *object, ptrdiff_t operand);
+uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
+
+// atomic_uintptr_t object, intptr_t operand
+uintptr_t __ovld atomic_fetch_or(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_xor(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_and(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_min(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_max(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+
+// atomic_intptr_t object, uintptr_t operand
+intptr_t __ovld atomic_fetch_or(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_xor(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_and(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_min(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_max(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+// atomic_store(): implicit-order, explicit-order and explicit-order-plus-scope
+// forms for each atomic type. The long/ulong overloads need both int64 atomics
+// extensions; the double overloads also need cl_khr_fp64.
+
+void __ovld atomic_store(volatile atomic_int *object, int desired);
+void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
+
+void __ovld atomic_store(volatile atomic_uint *object, uint desired);
+void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
+
+void __ovld atomic_store(volatile atomic_float *object, float desired);
+void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_int64_extended_atomics) && defined(cl_khr_int64_base_atomics)
+void __ovld atomic_store(volatile atomic_long *object, long desired);
+void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
+
+void __ovld atomic_store(volatile atomic_ulong *object, ulong desired);
+void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_fp64)
+void __ovld atomic_store(volatile atomic_double *object, double desired);
+void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
+#endif // defined(cl_khr_fp64)
+#endif // 64-bit atomics extensions
+
+// atomic_load(): implicit-order, explicit-order and explicit-order-plus-scope
+// forms for each atomic type. The long/ulong overloads need both int64 atomics
+// extensions; the double overloads also need cl_khr_fp64.
+
+int __ovld atomic_load(volatile atomic_int *object);
+int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order);
+int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order, memory_scope scope);
+
+uint __ovld atomic_load(volatile atomic_uint *object);
+uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order);
+uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order, memory_scope scope);
+
+float __ovld atomic_load(volatile atomic_float *object);
+float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order);
+float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_int64_extended_atomics) && defined(cl_khr_int64_base_atomics)
+long __ovld atomic_load(volatile atomic_long *object);
+long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order);
+long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order, memory_scope scope);
+
+ulong __ovld atomic_load(volatile atomic_ulong *object);
+ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order);
+ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_fp64)
+double __ovld atomic_load(volatile atomic_double *object);
+double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order);
+double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order, memory_scope scope);
+#endif // defined(cl_khr_fp64)
+#endif // 64-bit atomics extensions
+
+// atomic_exchange(): implicit-order, explicit-order and explicit-order-plus-scope
+// forms for each atomic type. The long/ulong overloads need both int64 atomics
+// extensions; the double overloads also need cl_khr_fp64.
+
+int __ovld atomic_exchange(volatile atomic_int *object, int desired);
+int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order);
+int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
+
+uint __ovld atomic_exchange(volatile atomic_uint *object, uint desired);
+uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order);
+uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
+
+float __ovld atomic_exchange(volatile atomic_float *object, float desired);
+float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order);
+float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_int64_extended_atomics) && defined(cl_khr_int64_base_atomics)
+long __ovld atomic_exchange(volatile atomic_long *object, long desired);
+long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order);
+long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
+
+ulong __ovld atomic_exchange(volatile atomic_ulong *object, ulong desired);
+ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
+ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_fp64)
+double __ovld atomic_exchange(volatile atomic_double *object, double desired);
+double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order);
+double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
+#endif // defined(cl_khr_fp64)
+#endif // 64-bit atomics extensions
+
+// atomic_compare_exchange_strong() and atomic_compare_exchange_weak() for the
+// int, uint and float atomic types. Each comes in an implicit-order form, a
+// form taking success/failure orders, and a form that also takes a scope.
+
+// atomic_int
+bool __ovld atomic_compare_exchange_strong(volatile atomic_int *object, int *expected, int desired);
+bool __ovld atomic_compare_exchange_strong_explicit(
+    volatile atomic_int *object, int *expected, int desired,
+    memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(
+    volatile atomic_int *object, int *expected, int desired,
+    memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_int *object, int *expected, int desired);
+bool __ovld atomic_compare_exchange_weak_explicit(
+    volatile atomic_int *object, int *expected, int desired,
+    memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(
+    volatile atomic_int *object, int *expected, int desired,
+    memory_order success, memory_order failure, memory_scope scope);
+
+// atomic_uint
+bool __ovld atomic_compare_exchange_strong(volatile atomic_uint *object, uint *expected, uint desired);
+bool __ovld atomic_compare_exchange_strong_explicit(
+    volatile atomic_uint *object, uint *expected, uint desired,
+    memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(
+    volatile atomic_uint *object, uint *expected, uint desired,
+    memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_uint *object, uint *expected, uint desired);
+bool __ovld atomic_compare_exchange_weak_explicit(
+    volatile atomic_uint *object, uint *expected, uint desired,
+    memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(
+    volatile atomic_uint *object, uint *expected, uint desired,
+    memory_order success, memory_order failure, memory_scope scope);
+
+// atomic_float
+bool __ovld atomic_compare_exchange_strong(volatile atomic_float *object, float *expected, float desired);
+bool __ovld atomic_compare_exchange_strong_explicit(
+    volatile atomic_float *object, float *expected, float desired,
+    memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(
+    volatile atomic_float *object, float *expected, float desired,
+    memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_float *object, float *expected, float desired);
+bool __ovld atomic_compare_exchange_weak_explicit(
+    volatile atomic_float *object, float *expected, float desired,
+    memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(
+    volatile atomic_float *object, float *expected, float desired,
+    memory_order success, memory_order failure, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure, memory_scope scope);
+#endif //cl_khr_fp64
+bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure, memory_scope scope);
+#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+// atomic_flag_test_and_set() and atomic_flag_clear(): each has a plain form, an _explicit form taking a memory_order, and an _explicit form taking a memory_order and a memory_scope
+
+bool __ovld atomic_flag_test_and_set(volatile atomic_flag *object);
+bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+void __ovld atomic_flag_clear(volatile atomic_flag *object);
+void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.12, v1.2 s6.12.12, v2.0 s6.13.12 - Miscellaneous Vector Functions
+
+/**
+ * The shuffle and shuffle2 built-in functions construct
+ * a permutation of elements from one or two input
+ * vectors respectively that are of the same type,
+ * returning a vector with the same element type as the
+ * input and length that is the same as the shuffle mask.
+ * The size of each element in the mask must match the
+ * size of each element in the result. For shuffle, only
+ * the ilogb(2m-1) least significant bits of each mask
+ * element are considered. For shuffle2, only the
+ * ilogb(2m-1)+1 least significant bits of each mask
+ * element are considered. Other bits in the mask shall
+ * be ignored.
+ * The elements of the input vectors are numbered from
+ * left to right across one or both of the vectors. For this
+ * purpose, the number of elements in a vector is given
+ * by vec_step(gentypem). The shuffle mask operand
+ * specifies, for each element of the result vector, which
+ * element of the one or two input vectors the result
+ * element gets.
+ * Examples:
+ * uint4 mask = (uint4)(3, 2,
+ * 1, 0);
+ * float4 a;
+ * float4 r = shuffle(a, mask);
+ * // r.s0123 = a.wzyx
+ * uint8 mask = (uint8)(0, 1, 2, 3,
+ * 4, 5, 6, 7);
+ * float4 a, b;
+ * float8 r = shuffle2(a, b, mask);
+ * // r.s0123 = a.xyzw
+ * // r.s4567 = b.xyzw
+ * uint4 mask;
+ * float8 a;
+ * float4 b;
+ * b = shuffle(a, mask);
+ * Examples that are not valid are:
+ * uint8 mask;
+ * short16 a;
+ * short8 b;
+ * b = shuffle(a, mask); <- not valid
+ */
+char2 __ovld __cnfn shuffle(char2 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char4 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char8 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char16 x, uchar2 mask);
+
+uchar2 __ovld __cnfn shuffle(uchar2 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar4 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar8 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar16 x, uchar2 mask);
+
+short2 __ovld __cnfn shuffle(short2 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short4 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short8 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short16 x, ushort2 mask);
+
+ushort2 __ovld __cnfn shuffle(ushort2 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort4 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort8 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort16 x, ushort2 mask);
+
+int2 __ovld __cnfn shuffle(int2 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int4 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int8 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int16 x, uint2 mask);
+
+uint2 __ovld __cnfn shuffle(uint2 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint4 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint8 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint16 x, uint2 mask);
+
+long2 __ovld __cnfn shuffle(long2 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long4 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long8 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long16 x, ulong2 mask);
+
+ulong2 __ovld __cnfn shuffle(ulong2 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong4 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong8 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong16 x, ulong2 mask);
+
+float2 __ovld __cnfn shuffle(float2 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float4 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float8 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float16 x, uint2 mask);
+
+char4 __ovld __cnfn shuffle(char2 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char4 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char8 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char16 x, uchar4 mask);
+
+uchar4 __ovld __cnfn shuffle(uchar2 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar4 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar8 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar16 x, uchar4 mask);
+
+short4 __ovld __cnfn shuffle(short2 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short4 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short8 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short16 x, ushort4 mask);
+
+ushort4 __ovld __cnfn shuffle(ushort2 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort4 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort8 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort16 x, ushort4 mask);
+
+int4 __ovld __cnfn shuffle(int2 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int4 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int8 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int16 x, uint4 mask);
+
+uint4 __ovld __cnfn shuffle(uint2 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint4 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint8 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint16 x, uint4 mask);
+
+long4 __ovld __cnfn shuffle(long2 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long4 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long8 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long16 x, ulong4 mask);
+
+ulong4 __ovld __cnfn shuffle(ulong2 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong4 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong8 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong16 x, ulong4 mask);
+
+float4 __ovld __cnfn shuffle(float2 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float4 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float8 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float16 x, uint4 mask);
+
+char8 __ovld __cnfn shuffle(char2 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char4 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char8 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char16 x, uchar8 mask);
+
+uchar8 __ovld __cnfn shuffle(uchar2 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar4 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar8 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar16 x, uchar8 mask);
+
+short8 __ovld __cnfn shuffle(short2 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short4 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short8 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short16 x, ushort8 mask);
+
+ushort8 __ovld __cnfn shuffle(ushort2 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort4 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort8 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort16 x, ushort8 mask);
+
+int8 __ovld __cnfn shuffle(int2 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int4 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int8 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int16 x, uint8 mask);
+
+uint8 __ovld __cnfn shuffle(uint2 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint4 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint8 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint16 x, uint8 mask);
+
+long8 __ovld __cnfn shuffle(long2 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long4 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long8 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long16 x, ulong8 mask);
+
+ulong8 __ovld __cnfn shuffle(ulong2 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong4 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong8 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong16 x, ulong8 mask);
+
+float8 __ovld __cnfn shuffle(float2 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float4 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float8 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float16 x, uint8 mask);
+
+char16 __ovld __cnfn shuffle(char2 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char4 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char8 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char16 x, uchar16 mask);
+
+uchar16 __ovld __cnfn shuffle(uchar2 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar4 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar8 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar16 x, uchar16 mask);
+
+short16 __ovld __cnfn shuffle(short2 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short4 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short8 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short16 x, ushort16 mask);
+
+ushort16 __ovld __cnfn shuffle(ushort2 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort4 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort8 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort16 x, ushort16 mask);
+
+int16 __ovld __cnfn shuffle(int2 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int4 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int8 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int16 x, uint16 mask);
+
+uint16 __ovld __cnfn shuffle(uint2 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint4 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint8 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint16 x, uint16 mask);
+
+long16 __ovld __cnfn shuffle(long2 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long4 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long8 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long16 x, ulong16 mask);
+
+ulong16 __ovld __cnfn shuffle(ulong2 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong4 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong8 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong16 x, ulong16 mask);
+
+float16 __ovld __cnfn shuffle(float2 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float4 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float8 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float16 x, uint16 mask);
+
+#ifdef cl_khr_fp64
+double2 __ovld __cnfn shuffle(double2 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double4 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double8 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double16 x, ulong2 mask);
+
+double4 __ovld __cnfn shuffle(double2 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double4 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double8 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double16 x, ulong4 mask);
+
+double8 __ovld __cnfn shuffle(double2 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double4 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double8 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double16 x, ulong8 mask);
+
+double16 __ovld __cnfn shuffle(double2 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double4 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double8 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double16 x, ulong16 mask);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half2 __ovld __cnfn shuffle(half2 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half4 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half8 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half16 x, ushort2 mask);
+
+half4 __ovld __cnfn shuffle(half2 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half4 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half8 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half16 x, ushort4 mask);
+
+half8 __ovld __cnfn shuffle(half2 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half4 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half8 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half16 x, ushort8 mask);
+
+half16 __ovld __cnfn shuffle(half2 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half4 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half8 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half16 x, ushort16 mask);
+#endif //cl_khr_fp16
+
+char2 __ovld __cnfn shuffle2(char2 x, char2 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char4 x, char4 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char8 x, char8 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char16 x, char16 y, uchar2 mask);
+
+uchar2 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar2 mask);
+
+short2 __ovld __cnfn shuffle2(short2 x, short2 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short4 x, short4 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short8 x, short8 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short16 x, short16 y, ushort2 mask);
+
+ushort2 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort2 mask);
+
+int2 __ovld __cnfn shuffle2(int2 x, int2 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int4 x, int4 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int8 x, int8 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int16 x, int16 y, uint2 mask);
+
+uint2 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint2 mask);
+
+long2 __ovld __cnfn shuffle2(long2 x, long2 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long4 x, long4 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long8 x, long8 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long16 x, long16 y, ulong2 mask);
+
+ulong2 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong2 mask);
+
+float2 __ovld __cnfn shuffle2(float2 x, float2 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float4 x, float4 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float8 x, float8 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float16 x, float16 y, uint2 mask);
+
+char4 __ovld __cnfn shuffle2(char2 x, char2 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char4 x, char4 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char8 x, char8 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char16 x, char16 y, uchar4 mask);
+
+uchar4 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar4 mask);
+
+short4 __ovld __cnfn shuffle2(short2 x, short2 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short4 x, short4 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short8 x, short8 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short16 x, short16 y, ushort4 mask);
+
+ushort4 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort4 mask);
+
+int4 __ovld __cnfn shuffle2(int2 x, int2 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int4 x, int4 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int8 x, int8 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int16 x, int16 y, uint4 mask);
+
+uint4 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint4 mask);
+
+long4 __ovld __cnfn shuffle2(long2 x, long2 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long4 x, long4 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long8 x, long8 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long16 x, long16 y, ulong4 mask);
+
+ulong4 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong4 mask);
+
+float4 __ovld __cnfn shuffle2(float2 x, float2 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float4 x, float4 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float8 x, float8 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float16 x, float16 y, uint4 mask);
+
+char8 __ovld __cnfn shuffle2(char2 x, char2 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char4 x, char4 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char8 x, char8 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char16 x, char16 y, uchar8 mask);
+
+uchar8 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar8 mask);
+
+short8 __ovld __cnfn shuffle2(short2 x, short2 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short4 x, short4 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short8 x, short8 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short16 x, short16 y, ushort8 mask);
+
+ushort8 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort8 mask);
+
+int8 __ovld __cnfn shuffle2(int2 x, int2 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int4 x, int4 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int8 x, int8 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int16 x, int16 y, uint8 mask);
+
+uint8 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint8 mask);
+
+long8 __ovld __cnfn shuffle2(long2 x, long2 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long4 x, long4 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long8 x, long8 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long16 x, long16 y, ulong8 mask);
+
+ulong8 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong8 mask);
+
+float8 __ovld __cnfn shuffle2(float2 x, float2 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float4 x, float4 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float8 x, float8 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float16 x, float16 y, uint8 mask);
+
+char16 __ovld __cnfn shuffle2(char2 x, char2 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char4 x, char4 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char8 x, char8 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char16 x, char16 y, uchar16 mask);
+
+uchar16 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar16 mask);
+
+short16 __ovld __cnfn shuffle2(short2 x, short2 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short4 x, short4 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short8 x, short8 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short16 x, short16 y, ushort16 mask);
+
+ushort16 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort16 mask);
+
+int16 __ovld __cnfn shuffle2(int2 x, int2 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int4 x, int4 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int8 x, int8 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int16 x, int16 y, uint16 mask);
+
+uint16 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint16 mask);
+
+long16 __ovld __cnfn shuffle2(long2 x, long2 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long4 x, long4 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long8 x, long8 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long16 x, long16 y, ulong16 mask);
+
+ulong16 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong16 mask);
+
+float16 __ovld __cnfn shuffle2(float2 x, float2 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float4 x, float4 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float8 x, float8 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float16 x, float16 y, uint16 mask);
+
+#ifdef cl_khr_fp64
+double2 __ovld __cnfn shuffle2(double2 x, double2 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double4 x, double4 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double8 x, double8 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double16 x, double16 y, ulong2 mask);
+
+double4 __ovld __cnfn shuffle2(double2 x, double2 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double4 x, double4 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double8 x, double8 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double16 x, double16 y, ulong4 mask);
+
+double8 __ovld __cnfn shuffle2(double2 x, double2 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double4 x, double4 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double8 x, double8 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double16 x, double16 y, ulong8 mask);
+
+double16 __ovld __cnfn shuffle2(double2 x, double2 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double4 x, double4 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double8 x, double8 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double16 x, double16 y, ulong16 mask);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half2 __ovld __cnfn shuffle2(half2 x, half2 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half4 x, half4 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half8 x, half8 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half16 x, half16 y, ushort2 mask);
+
+half4 __ovld __cnfn shuffle2(half2 x, half2 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half4 x, half4 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half8 x, half8 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half16 x, half16 y, ushort4 mask);
+
+half8 __ovld __cnfn shuffle2(half2 x, half2 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half4 x, half4 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half8 x, half8 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half16 x, half16 y, ushort8 mask);
+
+half16 __ovld __cnfn shuffle2(half2 x, half2 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half4 x, half4 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
+#endif //cl_khr_fp16
+
+// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
+
+int printf(__constant const char* st, ...);
+
+// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
+
+// These values need to match the runtime equivalent
+//
+// Addressing Mode.
+//
+#define CLK_ADDRESS_NONE                0
+#define CLK_ADDRESS_CLAMP_TO_EDGE       2
+#define CLK_ADDRESS_CLAMP               4
+#define CLK_ADDRESS_REPEAT              6
+#define CLK_ADDRESS_MIRRORED_REPEAT     8
+
+//
+// Coordinate Normalization
+//
+#define CLK_NORMALIZED_COORDS_FALSE     0
+#define CLK_NORMALIZED_COORDS_TRUE      1
+
+//
+// Filtering Mode.
+//
+#define CLK_FILTER_NEAREST              0x10
+#define CLK_FILTER_LINEAR               0x20
+
+/**
+ * Use the coordinate (coord.xy) to do an element lookup in
+ * the 2D image object specified by image.
+ *
+ * Use the coordinate (coord.x, coord.y, coord.z) to do
+ * an element lookup in the 3D image object specified
+ * by image. coord.w is ignored.
+ *
+ * Use the coordinate (coord.z) to index into the
+ * 2D image array object specified by image_array
+ * and (coord.x, coord.y) to do an element lookup in
+ * the 2D image object specified by image.
+ *
+ * Use the coordinate (x) to do an element lookup in
+ * the 1D image object specified by image.
+ *
+ * Use the coordinate (coord.y) to index into the
+ * 1D image array object specified by image_array
+ * and (coord.x) to do an element lookup in
+ * the 1D image object specified by image.
+ *
+ * Use the coordinate (coord.xy) and sample to do an
+ * element lookup in the 2D multi-sample image specified
+ * by image.
+ *
+ * Use coord.xy and sample to do an element
+ * lookup in the 2D multi-sample image layer
+ * identified by index coord.z in the 2D multi-sample
+ * image array specified by image.
+ *
+ * For mipmap images, use the mip-level specified by
+ * the Level-of-Detail (lod) or use gradients for LOD
+ * computation.
+ *
+ * read_imagef returns floating-point values in the
+ * range [0.0 ... 1.0] for image objects created with
+ * image_channel_data_type set to one of the predefined
+ * packed formats or CL_UNORM_INT8, or
+ * CL_UNORM_INT16.
+ *
+ * read_imagef returns floating-point values in the
+ * range [-1.0 ... 1.0] for image objects created with
+ * image_channel_data_type set to CL_SNORM_INT8,
+ * or CL_SNORM_INT16.
+ *
+ * read_imagef returns floating-point values for image
+ * objects created with image_channel_data_type set to
+ * CL_HALF_FLOAT or CL_FLOAT.
+ *
+ * read_imagei and read_imageui return
+ * unnormalized signed integer and unsigned integer
+ * values respectively. Each channel will be stored in a
+ * 32-bit integer.
+ *
+ * read_imagei can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_SIGNED_INT8,
+ * CL_SIGNED_INT16 and
+ * CL_SIGNED_INT32.
+ * If the image_channel_data_type is not one of the
+ * above values, the values returned by read_imagei
+ * are undefined.
+ *
+ * read_imageui can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_UNSIGNED_INT8,
+ * CL_UNSIGNED_INT16 and
+ * CL_UNSIGNED_INT32.
+ * If the image_channel_data_type is not one of the
+ * above values, the values returned by read_imageui
+ * are undefined.
+ *
+ * The read_image{i|ui} calls support a nearest filter
+ * only. The filter_mode specified in sampler
+ * must be set to CLK_FILTER_NEAREST; otherwise
+ * the values returned are undefined.
+ *
+ * The read_image{f|i|ui} calls that take
+ * integer coordinates must use a sampler with
+ * normalized coordinates set to
+ * CLK_NORMALIZED_COORDS_FALSE and
+ * addressing mode set to
+ * CLK_ADDRESS_CLAMP_TO_EDGE,
+ * CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE;
+ * otherwise the values returned are undefined.
+ *
+ * Values returned by read_imagef for image objects
+ * with image_channel_data_type values not specified
+ * in the description above are undefined.
+ */
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, int2 coord);
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord);
+
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, int4 coord);
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord);
+
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, int coord);
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord);
+
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord);
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, int2 coord);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord);
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, int4 coord);
+#endif //cl_khr_depth_images
+
+#if defined(cl_khr_gl_msaa_sharing)
+float4 __purefn __ovld read_imagef(read_only image2d_msaa_t image, int2 coord, int sample);
+int4 __purefn __ovld read_imagei(read_only image2d_msaa_t image, int2 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_only image2d_msaa_t image, int2 coord, int sample);
+
+float __purefn __ovld read_imagef(read_only image2d_msaa_depth_t image, int2 coord, int sample);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_msaa_t image, int4 coord, int sample);
+int4 __purefn __ovld read_imagei(read_only image2d_array_msaa_t image, int4 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_msaa_t image, int4 coord, int sample);
+
+float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, int4 coord, int sample);
+#endif //cl_khr_gl_msaa_sharing
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+// NOTE(review): the lod overloads below repeat, token for token, the lod overloads declared at the top of this cl_khr_mipmap_image section; they look redundant - confirm before dropping.
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+#endif //cl_khr_mipmap_image
+
+/**
+* Sampler-less Image Access
+*/
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_buffer_t image, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_buffer_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_buffer_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image, int4 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, int2 coord);
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, int4 coord);
+#endif //cl_khr_depth_images
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, int4 coord);
+
+// Image read functions returning half4 type
+#ifdef cl_khr_fp16
+half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, int coord);
+half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, float coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, float2 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, float4 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, float4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_t image, int coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord);
+#endif //cl_khr_fp16
+
+// Image read functions for read_write images
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float4 __purefn __ovld read_imagef(read_write image1d_t image, int coord);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_write image1d_buffer_t image, int coord);
+int4 __purefn __ovld read_imagei(read_write image1d_buffer_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_buffer_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image, int4 coord);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, int4 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, int2 coord);
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, int4 coord);
+#endif //cl_khr_depth_images
+
+#if defined(cl_khr_gl_msaa_sharing) // defined() matches the read_only MSAA guard and avoids evaluating an undefined/empty macro
+float4 __purefn __ovld read_imagef(read_write image2d_msaa_t image, int2 coord, int sample);
+int4 __purefn __ovld read_imagei(read_write image2d_msaa_t image, int2 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_write image2d_msaa_t image, int2 coord, int sample);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_msaa_t image, int4 coord, int sample);
+int4 __purefn __ovld read_imagei(read_write image2d_array_msaa_t image, int4 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_msaa_t image, int4 coord, int sample);
+
+float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 coord, int sample);
+float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample);
+#endif //cl_khr_gl_msaa_sharing
+
+#ifdef cl_khr_mipmap_image
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+// NOTE(review): the lod overloads below repeat, token for token, the read_write lod overloads declared at the top of this cl_khr_mipmap_image section; they look redundant - confirm before dropping.
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+#endif //cl_khr_mipmap_image
+
+// Image read functions returning half4 type
+#ifdef cl_khr_fp16
+half4 __purefn __ovld read_imageh(read_write image1d_t image, int coord);
+half4 __purefn __ovld read_imageh(read_write image2d_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_write image3d_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_write image2d_array_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_write image1d_buffer_t image, int coord);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y) in the 2D image object specified by image.
+ * (coord.x, coord.y) are considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1, and 0
+ * ... image height - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y) in the 2D image object specified by index
+ * (coord.z) of the 2D image array object image_array.
+ * (coord.x, coord.y) are considered to be unnormalized
+ * coordinates and must be in the range 0 ... image width
+ * - 1, and 0 ... image height - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord) in the 1D image (buffer) object specified by image.
+ * coord is considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x) in the 1D image object specified by index
+ * (coord.y) of the 1D image array object image_array.
+ * coord.x is considered to be an unnormalized coordinate
+ * and must be in the range 0 ... image width - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y, coord.z) in the 3D image object specified by image.
+ * coord.x, coord.y & coord.z are considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1, 0 ... image height - 1,
+ * and 0 ... image depth - 1.
+ *
+ * For mipmap images, use mip-level specified by lod.
+ *
+ * Appropriate data format conversion to the specified
+ * image format is done before writing the color value.
+ *
+ * write_imagef can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the pre-defined packed formats or set to
+ * CL_SNORM_INT8, CL_UNORM_INT8,
+ * CL_SNORM_INT16, CL_UNORM_INT16,
+ * CL_HALF_FLOAT or CL_FLOAT. Appropriate data
+ * format conversion will be done to convert channel
+ * data from a floating-point value to actual data format
+ * in which the channels are stored.
+ *
+ * write_imagei can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_SIGNED_INT8,
+ * CL_SIGNED_INT16 and
+ * CL_SIGNED_INT32.
+ *
+ * write_imageui can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_UNSIGNED_INT8,
+ * CL_UNSIGNED_INT16 and
+ * CL_UNSIGNED_INT32.
+ *
+ * The behavior of write_imagef, write_imagei and
+ * write_imageui for image objects created with
+ * image_channel_data_type values not specified in
+ * the description above or with (x, y) coordinate
+ * values that are not in the range (0 ... image width -1,
+ * 0 ... image height - 1), respectively, is undefined.
+ */
+void __ovld write_imagef(write_only image2d_t image, int2 coord, float4 color);
+void __ovld write_imagei(write_only image2d_t image, int2 coord, int4 color);
+void __ovld write_imageui(write_only image2d_t image, int2 coord, uint4 color);
+
+void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, float4 color);
+void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int4 color);
+void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_t image, int coord, float4 color);
+void __ovld write_imagei(write_only image1d_t image, int coord, int4 color);
+void __ovld write_imageui(write_only image1d_t image, int coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_buffer_t image, int coord, float4 color);
+void __ovld write_imagei(write_only image1d_buffer_t image, int coord, int4 color);
+void __ovld write_imageui(write_only image1d_buffer_t image, int coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, float4 color);
+void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
+void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
+
+void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
+void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
+void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
+
+#ifdef cl_khr_depth_images
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, float color);
+#endif //cl_khr_depth_images
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
+void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
+void __ovld write_imageui(write_only image1d_t image, int coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_t image, int2 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image2d_t image, int2 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image2d_t image, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
+
+void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
+#endif //cl_khr_mipmap_image
+
+// Image write functions for half4 type
+#ifdef cl_khr_fp16
+void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
+void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
+void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
+void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
+void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
+void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
+#endif //cl_khr_fp16
+
+// Image write functions for read_write images (declared for OpenCL C 2.0 and later)
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld write_imagef(read_write image1d_t image, int coord, float4 color);
+void __ovld write_imagei(read_write image1d_t image, int coord, int4 color);
+void __ovld write_imageui(read_write image1d_t image, int coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_buffer_t image, int coord, float4 color);
+void __ovld write_imagei(read_write image1d_buffer_t image, int coord, int4 color);
+void __ovld write_imageui(read_write image1d_buffer_t image, int coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, float4 color);
+void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
+void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
+
+void __ovld write_imagef(read_write image2d_t image, int2 coord, float4 color);
+void __ovld write_imagei(read_write image2d_t image, int2 coord, int4 color);
+void __ovld write_imageui(read_write image2d_t image, int2 coord, uint4 color);
+
+void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, float4 color);
+void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int4 color);
+void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, uint4 color);
+
+void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
+void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
+void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
+
+#if defined(cl_khr_depth_images)
+void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
+void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color);
+#endif // cl_khr_depth_images
+
+#if defined(cl_khr_mipmap_image)
+void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
+void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
+void __ovld write_imageui(read_write image1d_t image, int coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_t image, int2 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image2d_t image, int2 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image2d_t image, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
+void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
+#endif // cl_khr_mipmap_image
+
+// half4 image writes; declared only when cl_khr_fp16 is defined
+#if defined(cl_khr_fp16)
+void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
+void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
+void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
+void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
+void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
+void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
+#endif // cl_khr_fp16
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// Note: In OpenCL v1.0/1.1/1.2, the image argument of the image query builtin functions has no
+// access qualifier and is treated as read_only by default. Image query builtin functions taking
+// a write_only image argument should also be declared.
+
+/**
+ * Return the image width in pixels.
+ * Overloads are declared per image type and per access qualifier.
+ */
+int __ovld __cnfn get_image_width(read_only image1d_t image);
+int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_width(read_only image1d_array_t image);
+int __ovld __cnfn get_image_width(read_only image2d_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_t image);
+int __ovld __cnfn get_image_width(read_only image3d_t image);
+#if defined(cl_khr_depth_images)
+int __ovld __cnfn get_image_width(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_width(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_width(write_only image1d_t image);
+int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_width(write_only image1d_array_t image);
+int __ovld __cnfn get_image_width(write_only image2d_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_t image);
+int __ovld __cnfn get_image_width(write_only image3d_t image);
+#if defined(cl_khr_depth_images)
+int __ovld __cnfn get_image_width(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_width(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_width(read_write image1d_t image);
+int __ovld __cnfn get_image_width(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_width(read_write image1d_array_t image);
+int __ovld __cnfn get_image_width(read_write image2d_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_t image);
+int __ovld __cnfn get_image_width(read_write image3d_t image);
+#if defined(cl_khr_depth_images)
+int __ovld __cnfn get_image_width(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the height of the image, in pixels.
+ */
+int __ovld __cnfn get_image_height(read_only image2d_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_t image);
+int __ovld __cnfn get_image_height(read_only image3d_t image);
+#if defined(cl_khr_depth_images)
+int __ovld __cnfn get_image_height(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_height(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_height(write_only image2d_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_t image);
+int __ovld __cnfn get_image_height(write_only image3d_t image);
+#if defined(cl_khr_depth_images)
+int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_height(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_height(read_write image2d_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_t image);
+int __ovld __cnfn get_image_height(read_write image3d_t image);
+#if defined(cl_khr_depth_images)
+int __ovld __cnfn get_image_height(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the depth of the image, in pixels.
+ * Declared for image3d_t only, once per access qualifier.
+ */
+int __ovld __cnfn get_image_depth(read_only image3d_t image);
+int __ovld __cnfn get_image_depth(write_only image3d_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_depth(read_write image3d_t image);
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#if defined(cl_khr_mipmap_image)
+/**
+ * Return the number of mip levels of the image.
+ * Declared only when cl_khr_mipmap_image is defined; the read_write
+ * overloads are additionally guarded by OpenCL C 2.0 or later.
+ */
+
+// read_only images
+int __ovld get_image_num_mip_levels(read_only image1d_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_t image);
+int __ovld get_image_num_mip_levels(read_only image3d_t image);
+int __ovld get_image_num_mip_levels(read_only image1d_array_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_array_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_depth_t image);
+
+// write_only images
+int __ovld get_image_num_mip_levels(write_only image1d_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_t image);
+int __ovld get_image_num_mip_levels(write_only image3d_t image);
+int __ovld get_image_num_mip_levels(write_only image1d_array_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_array_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_depth_t image);
+
+// read_write images
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_mip_levels(read_write image1d_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_t image);
+int __ovld get_image_num_mip_levels(read_write image3d_t image);
+int __ovld get_image_num_mip_levels(read_write image1d_array_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_array_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#endif // cl_khr_mipmap_image
+
+/**
+ * Return the channel data type. Valid values are:
+ * CLK_SNORM_INT8
+ * CLK_SNORM_INT16
+ * CLK_UNORM_INT8
+ * CLK_UNORM_INT16
+ * CLK_UNORM_SHORT_565
+ * CLK_UNORM_SHORT_555
+ * CLK_UNORM_INT_101010
+ * CLK_SIGNED_INT8
+ * CLK_SIGNED_INT16
+ * CLK_SIGNED_INT32
+ * CLK_UNSIGNED_INT8
+ * CLK_UNSIGNED_INT16
+ * CLK_UNSIGNED_INT32
+ * CLK_HALF_FLOAT
+ * CLK_FLOAT
+ */
+
+//
+// Channel Datatype.
+//
+#define CLK_SNORM_INT8        0x10D0
+#define CLK_SNORM_INT16       0x10D1
+#define CLK_UNORM_INT8        0x10D2
+#define CLK_UNORM_INT16       0x10D3
+#define CLK_UNORM_SHORT_565   0x10D4
+#define CLK_UNORM_SHORT_555   0x10D5
+#define CLK_UNORM_INT_101010  0x10D6
+#define CLK_SIGNED_INT8       0x10D7
+#define CLK_SIGNED_INT16      0x10D8
+#define CLK_SIGNED_INT32      0x10D9
+#define CLK_UNSIGNED_INT8     0x10DA
+#define CLK_UNSIGNED_INT16    0x10DB
+#define CLK_UNSIGNED_INT32    0x10DC
+#define CLK_HALF_FLOAT        0x10DD
+#define CLK_FLOAT             0x10DE
+#define CLK_UNORM_INT24       0x10DF
+
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image channel order. Valid values are:
+ * CLK_A
+ * CLK_R
+ * CLK_Rx
+ * CLK_RG
+ * CLK_RGx
+ * CLK_RA
+ * CLK_RGB
+ * CLK_RGBx
+ * CLK_RGBA
+ * CLK_ARGB
+ * CLK_BGRA
+ * CLK_INTENSITY
+ * CLK_LUMINANCE
+ */
+// Channel order, numbering must be aligned with cl_channel_order in cl.h
+//
+#define CLK_R         0x10B0
+#define CLK_A         0x10B1
+#define CLK_RG        0x10B2
+#define CLK_RA        0x10B3
+#define CLK_RGB       0x10B4
+#define CLK_RGBA      0x10B5
+#define CLK_BGRA      0x10B6
+#define CLK_ARGB      0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx                0x10BA
+#define CLK_RGx               0x10BB
+#define CLK_RGBx              0x10BC
+#define CLK_DEPTH             0x10BD
+#define CLK_DEPTH_STENCIL     0x10BE
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define CLK_sRGB              0x10BF
+#define CLK_sRGBA             0x10C1
+#define CLK_sRGBx             0x10C0
+#define CLK_sBGRA             0x10C2
+#define CLK_ABGR              0x10C3
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image3d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_channel_order(read_write image1d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image3d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the width and height of a 2D image as an int2:
+ * the x component holds the width and the y component
+ * holds the height.
+ */
+int2 __ovld __cnfn get_image_dim(read_only image2d_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_t image);
+#if defined(cl_khr_depth_images)
+int2 __ovld __cnfn get_image_dim(read_only image2d_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+
+int2 __ovld __cnfn get_image_dim(write_only image2d_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_t image);
+#if defined(cl_khr_depth_images)
+int2 __ovld __cnfn get_image_dim(write_only image2d_depth_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int2 __ovld __cnfn get_image_dim(read_write image2d_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_t image);
+#if defined(cl_khr_depth_images)
+int2 __ovld __cnfn get_image_dim(read_write image2d_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_depth_t image);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image);
+#endif // cl_khr_gl_msaa_sharing
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the width, height, and depth of a 3D image as
+ * an int4: the x component holds the width, the y
+ * component the height, the z component the depth, and
+ * the w component is 0.
+ */
+int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
+int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the array size of an image array.
+ */
+
+size_t __ovld __cnfn get_image_array_size(read_only image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_t image_array);
+#if defined(cl_khr_depth_images)
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_depth_t image_array);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_depth_t image_array);
+#endif // cl_khr_gl_msaa_sharing
+
+size_t __ovld __cnfn get_image_array_size(write_only image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_t image_array);
+#if defined(cl_khr_depth_images)
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_depth_t image_array);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_depth_t image_array);
+#endif // cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+size_t __ovld __cnfn get_image_array_size(read_write image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_t image_array);
+#if defined(cl_khr_depth_images)
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image_array);
+#endif // cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t image_array);
+#endif // cl_khr_gl_msaa_sharing
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the number of samples associated with image.
+ * Declared only when cl_khr_gl_msaa_sharing is defined, with one overload
+ * per MSAA image type and access qualifier.
+ */
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld get_image_num_samples(read_only image2d_msaa_t image);
+int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
+
+int __ovld get_image_num_samples(write_only image2d_msaa_t image);
+int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_samples(read_write image2d_msaa_t image);
+int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#endif //cl_khr_gl_msaa_sharing
+
+// OpenCL v2.0 s6.13.15 - Work-group Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld work_group_all(int predicate);
+int __ovld work_group_any(int predicate);
+
+#ifdef cl_khr_fp16
+half __ovld work_group_broadcast(half a, size_t local_id);
+half __ovld work_group_broadcast(half a, size_t x, size_t y);
+half __ovld work_group_broadcast(half a, size_t x, size_t y, size_t z);
+#endif
+int __ovld work_group_broadcast(int a, size_t local_id);
+int __ovld work_group_broadcast(int a, size_t x, size_t y);
+int __ovld work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld work_group_broadcast(uint a, size_t local_id);
+uint __ovld work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld work_group_broadcast(long a, size_t local_id);
+long __ovld work_group_broadcast(long a, size_t x, size_t y);
+long __ovld work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld work_group_broadcast(float a, size_t local_id);
+float __ovld work_group_broadcast(float a, size_t x, size_t y);
+float __ovld work_group_broadcast(float a, size_t x, size_t y, size_t z);
+#ifdef cl_khr_fp64
+double __ovld work_group_broadcast(double a, size_t local_id);
+double __ovld work_group_broadcast(double a, size_t x, size_t y);
+double __ovld work_group_broadcast(double a, size_t x, size_t y, size_t z);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld work_group_reduce_add(half x);
+half __ovld work_group_reduce_min(half x);
+half __ovld work_group_reduce_max(half x);
+half __ovld work_group_scan_exclusive_add(half x);
+half __ovld work_group_scan_exclusive_min(half x);
+half __ovld work_group_scan_exclusive_max(half x);
+half __ovld work_group_scan_inclusive_add(half x);
+half __ovld work_group_scan_inclusive_min(half x);
+half __ovld work_group_scan_inclusive_max(half x);
+#endif
+int __ovld work_group_reduce_add(int x);
+int __ovld work_group_reduce_min(int x);
+int __ovld work_group_reduce_max(int x);
+int __ovld work_group_scan_exclusive_add(int x);
+int __ovld work_group_scan_exclusive_min(int x);
+int __ovld work_group_scan_exclusive_max(int x);
+int __ovld work_group_scan_inclusive_add(int x);
+int __ovld work_group_scan_inclusive_min(int x);
+int __ovld work_group_scan_inclusive_max(int x);
+uint __ovld work_group_reduce_add(uint x);
+uint __ovld work_group_reduce_min(uint x);
+uint __ovld work_group_reduce_max(uint x);
+uint __ovld work_group_scan_exclusive_add(uint x);
+uint __ovld work_group_scan_exclusive_min(uint x);
+uint __ovld work_group_scan_exclusive_max(uint x);
+uint __ovld work_group_scan_inclusive_add(uint x);
+uint __ovld work_group_scan_inclusive_min(uint x);
+uint __ovld work_group_scan_inclusive_max(uint x);
+long __ovld work_group_reduce_add(long x);
+long __ovld work_group_reduce_min(long x);
+long __ovld work_group_reduce_max(long x);
+long __ovld work_group_scan_exclusive_add(long x);
+long __ovld work_group_scan_exclusive_min(long x);
+long __ovld work_group_scan_exclusive_max(long x);
+long __ovld work_group_scan_inclusive_add(long x);
+long __ovld work_group_scan_inclusive_min(long x);
+long __ovld work_group_scan_inclusive_max(long x);
+ulong __ovld work_group_reduce_add(ulong x);
+ulong __ovld work_group_reduce_min(ulong x);
+ulong __ovld work_group_reduce_max(ulong x);
+ulong __ovld work_group_scan_exclusive_add(ulong x);
+ulong __ovld work_group_scan_exclusive_min(ulong x);
+ulong __ovld work_group_scan_exclusive_max(ulong x);
+ulong __ovld work_group_scan_inclusive_add(ulong x);
+ulong __ovld work_group_scan_inclusive_min(ulong x);
+ulong __ovld work_group_scan_inclusive_max(ulong x);
+float __ovld work_group_reduce_add(float x);
+float __ovld work_group_reduce_min(float x);
+float __ovld work_group_reduce_max(float x);
+float __ovld work_group_scan_exclusive_add(float x);
+float __ovld work_group_scan_exclusive_min(float x);
+float __ovld work_group_scan_exclusive_max(float x);
+float __ovld work_group_scan_inclusive_add(float x);
+float __ovld work_group_scan_inclusive_min(float x);
+float __ovld work_group_scan_inclusive_max(float x);
+#ifdef cl_khr_fp64
+double __ovld work_group_reduce_add(double x);
+double __ovld work_group_reduce_min(double x);
+double __ovld work_group_reduce_max(double x);
+double __ovld work_group_scan_exclusive_add(double x);
+double __ovld work_group_scan_exclusive_min(double x);
+double __ovld work_group_scan_exclusive_max(double x);
+double __ovld work_group_scan_inclusive_add(double x);
+double __ovld work_group_scan_inclusive_min(double x);
+double __ovld work_group_scan_inclusive_max(double x);
+#endif //cl_khr_fp64
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v2.0 s6.13.16 - Pipe Functions
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
+#define PIPE_RESERVE_ID_VALID_BIT (1U << 30)
+bool __ovld is_valid_reserve_id(reserve_id_t reserve_id);
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+
+// OpenCL v2.0 s6.13.17 - Enqueue Kernels
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+
+#define CLK_SUCCESS                                 0
+#define CLK_ENQUEUE_FAILURE                         -101
+#define CLK_INVALID_QUEUE                           -102
+#define CLK_INVALID_NDRANGE                         -160
+#define CLK_INVALID_EVENT_WAIT_LIST                 -57
+#define CLK_DEVICE_QUEUE_FULL                       -161
+#define CLK_INVALID_ARG_SIZE                        -51
+#define CLK_EVENT_ALLOCATION_FAILURE                -100
+#define CLK_OUT_OF_RESOURCES                        -5
+
+#define CLK_NULL_QUEUE                              0
+#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
+
+// execution model related definitions
+#define CLK_ENQUEUE_FLAGS_NO_WAIT                   0x0
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL               0x1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP           0x2
+
+typedef int kernel_enqueue_flags_t;
+typedef int clk_profiling_info;
+
+// Profiling info name (see capture_event_profiling_info)
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
+
+#define MAX_WORK_DIM        3
+
+// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back
+// the following ndrange_t definition.
+#if 0
+typedef struct {
+    unsigned int workDimension;
+    size_t globalWorkOffset[MAX_WORK_DIM];
+    size_t globalWorkSize[MAX_WORK_DIM];
+    size_t localWorkSize[MAX_WORK_DIM];
+} ndrange_t;
+#endif
+
+ndrange_t __ovld ndrange_1D(size_t);
+ndrange_t __ovld ndrange_1D(size_t, size_t);
+ndrange_t __ovld ndrange_1D(size_t, size_t, size_t);
+
+ndrange_t __ovld ndrange_2D(const size_t[2]);
+ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2]);
+ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2], const size_t[2]);
+
+ndrange_t __ovld ndrange_3D(const size_t[3]);
+ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3]);
+ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3], const size_t[3]);
+
+int __ovld enqueue_marker(queue_t, uint, const __private clk_event_t*, __private clk_event_t*);
+
+void __ovld retain_event(clk_event_t);
+
+void __ovld release_event(clk_event_t);
+
+clk_event_t __ovld create_user_event(void);
+
+void __ovld set_user_event_status(clk_event_t e, int state);
+
+bool __ovld is_valid_event (clk_event_t event);
+
+void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
+
+queue_t __ovld get_default_queue(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL Extension v2.0 s9.17 - Sub-groups
+
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+// Shared Sub Group Functions
+uint    __ovld get_sub_group_size(void);
+uint    __ovld get_max_sub_group_size(void);
+uint    __ovld get_num_sub_groups(void);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+uint    __ovld get_enqueued_num_sub_groups(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+uint    __ovld get_sub_group_id(void);
+uint    __ovld get_sub_group_local_id(void);
+
+void    __ovld sub_group_barrier(cl_mem_fence_flags flags);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void    __ovld sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int     __ovld sub_group_all(int predicate);
+int     __ovld sub_group_any(int predicate);
+
+int     __ovld sub_group_broadcast(int   x, uint sub_group_local_id);
+uint    __ovld sub_group_broadcast(uint  x, uint sub_group_local_id);
+long    __ovld sub_group_broadcast(long  x, uint sub_group_local_id);
+ulong   __ovld sub_group_broadcast(ulong x, uint sub_group_local_id);
+float   __ovld sub_group_broadcast(float x, uint sub_group_local_id);
+
+int     __ovld sub_group_reduce_add(int   x);
+uint    __ovld sub_group_reduce_add(uint  x);
+long    __ovld sub_group_reduce_add(long  x);
+ulong   __ovld sub_group_reduce_add(ulong x);
+float   __ovld sub_group_reduce_add(float x);
+int     __ovld sub_group_reduce_min(int   x);
+uint    __ovld sub_group_reduce_min(uint  x);
+long    __ovld sub_group_reduce_min(long  x);
+ulong   __ovld sub_group_reduce_min(ulong x);
+float   __ovld sub_group_reduce_min(float x);
+int     __ovld sub_group_reduce_max(int   x);
+uint    __ovld sub_group_reduce_max(uint  x);
+long    __ovld sub_group_reduce_max(long  x);
+ulong   __ovld sub_group_reduce_max(ulong x);
+float   __ovld sub_group_reduce_max(float x);
+
+int     __ovld sub_group_scan_exclusive_add(int   x);
+uint    __ovld sub_group_scan_exclusive_add(uint  x);
+long    __ovld sub_group_scan_exclusive_add(long  x);
+ulong   __ovld sub_group_scan_exclusive_add(ulong x);
+float   __ovld sub_group_scan_exclusive_add(float x);
+int     __ovld sub_group_scan_exclusive_min(int   x);
+uint    __ovld sub_group_scan_exclusive_min(uint  x);
+long    __ovld sub_group_scan_exclusive_min(long  x);
+ulong   __ovld sub_group_scan_exclusive_min(ulong x);
+float   __ovld sub_group_scan_exclusive_min(float x);
+int     __ovld sub_group_scan_exclusive_max(int   x);
+uint    __ovld sub_group_scan_exclusive_max(uint  x);
+long    __ovld sub_group_scan_exclusive_max(long  x);
+ulong   __ovld sub_group_scan_exclusive_max(ulong x);
+float   __ovld sub_group_scan_exclusive_max(float x);
+
+int     __ovld sub_group_scan_inclusive_add(int   x);
+uint    __ovld sub_group_scan_inclusive_add(uint  x);
+long    __ovld sub_group_scan_inclusive_add(long  x);
+ulong   __ovld sub_group_scan_inclusive_add(ulong x);
+float   __ovld sub_group_scan_inclusive_add(float x);
+int     __ovld sub_group_scan_inclusive_min(int   x);
+uint    __ovld sub_group_scan_inclusive_min(uint  x);
+long    __ovld sub_group_scan_inclusive_min(long  x);
+ulong   __ovld sub_group_scan_inclusive_min(ulong x);
+float   __ovld sub_group_scan_inclusive_min(float x);
+int     __ovld sub_group_scan_inclusive_max(int   x);
+uint    __ovld sub_group_scan_inclusive_max(uint  x);
+long    __ovld sub_group_scan_inclusive_max(long  x);
+ulong   __ovld sub_group_scan_inclusive_max(ulong x);
+float   __ovld sub_group_scan_inclusive_max(float x);
+
+#ifdef cl_khr_fp16
+half    __ovld sub_group_broadcast(half x, uint sub_group_local_id);
+half    __ovld sub_group_reduce_add(half x);
+half    __ovld sub_group_reduce_min(half x);
+half    __ovld sub_group_reduce_max(half x);
+half    __ovld sub_group_scan_exclusive_add(half x);
+half    __ovld sub_group_scan_exclusive_min(half x);
+half    __ovld sub_group_scan_exclusive_max(half x);
+half    __ovld sub_group_scan_inclusive_add(half x);
+half    __ovld sub_group_scan_inclusive_min(half x);
+half    __ovld sub_group_scan_inclusive_max(half x);
+#endif //cl_khr_fp16
+
+#ifdef cl_khr_fp64
+double  __ovld sub_group_broadcast(double x, uint sub_group_local_id);
+double  __ovld sub_group_reduce_add(double x);
+double  __ovld sub_group_reduce_min(double x);
+double  __ovld sub_group_reduce_max(double x);
+double  __ovld sub_group_scan_exclusive_add(double x);
+double  __ovld sub_group_scan_exclusive_min(double x);
+double  __ovld sub_group_scan_exclusive_max(double x);
+double  __ovld sub_group_scan_inclusive_add(double x);
+double  __ovld sub_group_scan_inclusive_min(double x);
+double  __ovld sub_group_scan_inclusive_max(double x);
+#endif //cl_khr_fp64
+
+#endif //cl_khr_subgroups cl_intel_subgroups
+
+#ifdef cl_amd_media_ops
+uint __ovld amd_bitalign(uint a, uint b, uint c);
+uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_bytealign(uint a, uint b, uint c);
+uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_lerp(uint a, uint b, uint c);
+uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_pack(float4 v);
+
+uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
+
+uint __ovld amd_sadhi(uint a, uint b, uint c);
+uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_sad(uint a, uint b, uint c);
+uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
+
+float __ovld amd_unpack0(uint a);
+float2 __ovld amd_unpack0(uint2 a);
+float3 __ovld amd_unpack0(uint3 a);
+float4 __ovld amd_unpack0(uint4 a);
+float8 __ovld amd_unpack0(uint8 a);
+float16 __ovld amd_unpack0(uint16 a);
+
+float __ovld amd_unpack1(uint a);
+float2 __ovld amd_unpack1(uint2 a);
+float3 __ovld amd_unpack1(uint3 a);
+float4 __ovld amd_unpack1(uint4 a);
+float8 __ovld amd_unpack1(uint8 a);
+float16 __ovld amd_unpack1(uint16 a);
+
+float __ovld amd_unpack2(uint a);
+float2 __ovld amd_unpack2(uint2 a);
+float3 __ovld amd_unpack2(uint3 a);
+float4 __ovld amd_unpack2(uint4 a);
+float8 __ovld amd_unpack2(uint8 a);
+float16 __ovld amd_unpack2(uint16 a);
+
+float __ovld amd_unpack3(uint a);
+float2 __ovld amd_unpack3(uint2 a);
+float3 __ovld amd_unpack3(uint3 a);
+float4 __ovld amd_unpack3(uint4 a);
+float8 __ovld amd_unpack3(uint8 a);
+float16 __ovld amd_unpack3(uint16 a);
+#endif // cl_amd_media_ops
+
+#ifdef cl_amd_media_ops2
+int __ovld amd_bfe(int src0, uint src1, uint src2);
+int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
+int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
+int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
+int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
+int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfe(uint src0, uint src1, uint src2);
+uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfm(uint src0, uint src1);
+uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
+uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
+uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
+uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
+uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
+
+float __ovld amd_max3(float src0, float src1, float src2);
+float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_max3(int src0, int src1, int src2);
+int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_max3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_median3(float src0, float src1, float src2);
+float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_median3(int src0, int src1, int src2);
+int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_median3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_min3(float src0, float src1, float src2);
+float2 __ovld amd_min3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_min3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_min3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_min3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_min3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_min3(int src0, int src1, int src2);
+int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_min3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
+
+ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+uint __ovld amd_msad(uint src0, uint src1, uint src2);
+uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadd(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadw(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
+#endif // cl_amd_media_ops2
+
+// Disable any extensions we may have enabled previously.
+#pragma OPENCL EXTENSION all : disable
+
+#undef __cnfn
+#undef __ovld
+#endif //_OPENCL_H_
diff --git a/lib/Headers/pkuintrin.h b/lib/Headers/pkuintrin.h
index ad12348..9e54594 100644
--- a/lib/Headers/pkuintrin.h
+++ b/lib/Headers/pkuintrin.h
@@ -38,9 +38,9 @@
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
-_wrpkru(unsigned int val)
+_wrpkru(unsigned int __val)
 {
-  return __builtin_ia32_wrpkru(val);
+  __builtin_ia32_wrpkru(__val);
 }
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h
index 0ff9409..5b10580 100644
--- a/lib/Headers/pmmintrin.h
+++ b/lib/Headers/pmmintrin.h
@@ -27,68 +27,235 @@
 #include <emmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
 
+/// \brief Loads data from an unaligned memory location to elements in a 128-bit
+///    vector. If the address of the data is not 16-byte aligned, the
+///    instruction may read two adjacent aligned blocks of memory to retrieve
+///    the requested data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VLDDQU instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit integer vector containing integer values.
+/// \returns A 128-bit vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_lddqu_si128(__m128i const *__p)
 {
   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
 }
 
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDSUBPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
+///    differences of both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_addsub_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_addsubps(__a, __b);
+  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHADDPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
+///    both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_hadd_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_haddps(__a, __b);
+  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHSUBPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the lower
+///    bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the upper
+///    bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal
+///    differences of both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_hsub_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_hsubps(__a, __b);
+  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
+///    vector of [4 x float] to float values stored in a 128-bit vector of
+///    [4 x float].
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+///    the destination.
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movehdup_ps(__m128 __a)
 {
-  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
 }
 
+/// \brief Duplicates low-order (even-indexed) values from a 128-bit
+///    vector of [4 x float] to float values stored in a 128-bit vector of
+///    [4 x float].
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination.
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_moveldup_ps(__m128 __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
 }
 
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDSUBPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums
+///    and differences of both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_addsub_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_addsubpd(__a, __b);
+  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
 }
 
+/// \brief Horizontally adds the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHADDPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+///    both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_hadd_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_haddpd(__a, __b);
+  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
 }
 
+/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VHSUBPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal
+///    differences of both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_hsub_pd(__m128d __a, __m128d __b)
 {
-  return __builtin_ia32_hsubpd(__a, __b);
+  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
 }
 
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_loaddup_pd(double const * dp);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+///
+/// \param dp
+///    A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
 #define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
 
+/// \brief Moves and duplicates the double-precision value in the lower bits of
+///    a 128-bit vector of [2 x double] to double-precision values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
+///    [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_movedup_pd(__m128d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 0);
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }
 
 #define _MM_DENORMALS_ZERO_ON   (0x0040)
@@ -99,12 +266,40 @@
 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
 
+/// \brief Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MONITOR instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
 {
   __builtin_ia32_monitor((void *)__p, __extensions, __hints);
 }
 
+/// \brief Used with the MONITOR instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MWAIT instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which may vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which may vary by processor.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwait(unsigned __extensions, unsigned __hints)
 {
diff --git a/lib/Headers/popcntintrin.h b/lib/Headers/popcntintrin.h
index 6fcda65..7e2f167 100644
--- a/lib/Headers/popcntintrin.h
+++ b/lib/Headers/popcntintrin.h
@@ -27,12 +27,32 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
 
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c POPCNT instruction.
+///
+/// \param __A
+///    An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_popcnt_u32(unsigned int __A)
 {
   return __builtin_popcount(__A);
 }
 
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c POPCNT instruction.
+///
+/// \param __A
+///    A signed 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
 static __inline__ int __DEFAULT_FN_ATTRS
 _popcnt32(int __A)
 {
@@ -40,12 +60,32 @@
 }
 
 #ifdef __x86_64__
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c POPCNT instruction.
+///
+/// \param __A
+///    An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_popcnt_u64(unsigned long long __A)
 {
   return __builtin_popcountll(__A);
 }
 
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c POPCNT instruction.
+///
+/// \param __A
+///    A signed 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _popcnt64(long long __A)
 {
diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h
index 69ad07f..e48ab03 100644
--- a/lib/Headers/smmintrin.h
+++ b/lib/Headers/smmintrin.h
@@ -121,7 +121,7 @@
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
 {
-  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
+  return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
 }
 
 static __inline__  __m128i __DEFAULT_FN_ATTRS
@@ -220,16 +220,16 @@
 #define _mm_insert_epi8(X, I, N) (__extension__                           \
                                   ({ __v16qi __a = (__v16qi)(__m128i)(X); \
                                      __a[(N) & 15] = (I);                 \
-                                     __a;}))
+                                     (__m128i)__a;}))
 #define _mm_insert_epi32(X, I, N) (__extension__                         \
                                    ({ __v4si __a = (__v4si)(__m128i)(X); \
                                       __a[(N) & 3] = (I);                \
-                                      __a;}))
+                                      (__m128i)__a;}))
 #ifdef __x86_64__
 #define _mm_insert_epi64(X, I, N) (__extension__                         \
                                    ({ __v2di __a = (__v2di)(__m128i)(X); \
                                       __a[(N) & 1] = (I);                \
-                                      __a;}))
+                                      (__m128i)__a;}))
 #endif /* __x86_64__ */
 
 /* Extract int from packed integer array at index.  This returns the element
@@ -299,7 +299,6 @@
 {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  typedef signed char __v16qs __attribute__((__vector_size__(16)));
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }
 
@@ -325,37 +324,37 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
diff --git a/lib/Headers/tbmintrin.h b/lib/Headers/tbmintrin.h
index 785961c..1d0d746 100644
--- a/lib/Headers/tbmintrin.h
+++ b/lib/Headers/tbmintrin.h
@@ -36,57 +36,57 @@
                                            (unsigned int)(b)))
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcfill_u32(unsigned int a)
+__blcfill_u32(unsigned int __a)
 {
-  return a & (a + 1);
+  return __a & (__a + 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blci_u32(unsigned int a)
+__blci_u32(unsigned int __a)
 {
-  return a | ~(a + 1);
+  return __a | ~(__a + 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcic_u32(unsigned int a)
+__blcic_u32(unsigned int __a)
 {
-  return ~a & (a + 1);
+  return ~__a & (__a + 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcmsk_u32(unsigned int a)
+__blcmsk_u32(unsigned int __a)
 {
-  return a ^ (a + 1);
+  return __a ^ (__a + 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcs_u32(unsigned int a)
+__blcs_u32(unsigned int __a)
 {
-  return a | (a + 1);
+  return __a | (__a + 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsfill_u32(unsigned int a)
+__blsfill_u32(unsigned int __a)
 {
-  return a | (a - 1);
+  return __a | (__a - 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsic_u32(unsigned int a)
+__blsic_u32(unsigned int __a)
 {
-  return ~a | (a - 1);
+  return ~__a | (__a - 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__t1mskc_u32(unsigned int a)
+__t1mskc_u32(unsigned int __a)
 {
-  return ~a | (a + 1);
+  return ~__a | (__a + 1);
 }
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__tzmsk_u32(unsigned int a)
+__tzmsk_u32(unsigned int __a)
 {
-  return ~a & (a - 1);
+  return ~__a & (__a - 1);
 }
 
 #ifdef __x86_64__
@@ -95,57 +95,57 @@
                                                  (unsigned long long)(b)))
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcfill_u64(unsigned long long a)
+__blcfill_u64(unsigned long long __a)
 {
-  return a & (a + 1);
+  return __a & (__a + 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blci_u64(unsigned long long a)
+__blci_u64(unsigned long long __a)
 {
-  return a | ~(a + 1);
+  return __a | ~(__a + 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcic_u64(unsigned long long a)
+__blcic_u64(unsigned long long __a)
 {
-  return ~a & (a + 1);
+  return ~__a & (__a + 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcmsk_u64(unsigned long long a)
+__blcmsk_u64(unsigned long long __a)
 {
-  return a ^ (a + 1);
+  return __a ^ (__a + 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcs_u64(unsigned long long a)
+__blcs_u64(unsigned long long __a)
 {
-  return a | (a + 1);
+  return __a | (__a + 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsfill_u64(unsigned long long a)
+__blsfill_u64(unsigned long long __a)
 {
-  return a | (a - 1);
+  return __a | (__a - 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsic_u64(unsigned long long a)
+__blsic_u64(unsigned long long __a)
 {
-  return ~a | (a - 1);
+  return ~__a | (__a - 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__t1mskc_u64(unsigned long long a)
+__t1mskc_u64(unsigned long long __a)
 {
-  return ~a | (a + 1);
+  return ~__a | (__a + 1);
 }
 
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__tzmsk_u64(unsigned long long a)
+__tzmsk_u64(unsigned long long __a)
 {
-  return ~a & (a - 1);
+  return ~__a & (__a - 1);
 }
 #endif
 
diff --git a/lib/Headers/tmmintrin.h b/lib/Headers/tmmintrin.h
index 0002890..a72796b 100644
--- a/lib/Headers/tmmintrin.h
+++ b/lib/Headers/tmmintrin.h
@@ -29,187 +29,739 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
 
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSB instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSB instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi8(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi16(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi32(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 }
 
+/// \brief Concatenates the two 128-bit integer vector operands, and
+///    right-shifts the result by the number of bytes specified in the immediate
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+///    A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param b
+///    A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param n
+///    An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 128-bit integer vector containing the concatenated right-shifted
+///    value.
 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)); })
 
+/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
+///    the result by the number of bytes specified in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+///    A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param b
+///    A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param n
+///    An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 64-bit integer vector containing the concatenated right-shifted
+///    value.
 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
+///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadd_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
+///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadd_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
+///    operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
+///    operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadds_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
+///    of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsub_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
+///    of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsub_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
+///    of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
+///    of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    differences of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    differences of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums to
+///    the corresponding bits in the destination. For example, bits [7:0] of
+///    both operands are multiplied, bits [15:8] of both operands are
+///    multiplied, and the sum of both results is written to bits [15:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the first source operand.
+/// \param __b
+///    A 128-bit integer vector containing the second source operand.
+/// \returns A 128-bit integer vector containing the sums of products of both
+///    operands:
+///    R0 := (__a0 * __b0) + (__a1 * __b1)
+///    R1 := (__a2 * __b2) + (__a3 * __b3)
+///    R2 := (__a4 * __b4) + (__a5 * __b5)
+///    R3 := (__a6 * __b6) + (__a7 * __b7)
+///    R4 := (__a8 * __b8) + (__a9 * __b9)
+///    R5 := (__a10 * __b10) + (__a11 * __b11)
+///    R6 := (__a12 * __b12) + (__a13 * __b13)
+///    R7 := (__a14 * __b14) + (__a15 * __b15)
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums to
+///    the corresponding bits in the destination. For example, bits [7:0] of
+///    both operands are multiplied, bits [15:8] of both operands are
+///    multiplied, and the sum of both results is written to bits [15:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMADDUBSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the first source operand.
+/// \param __b
+///    A 64-bit integer vector containing the second source operand.
+/// \returns A 64-bit integer vector containing the sums of products of both
+///    operands:
+///    R0 := (__a0 * __b0) + (__a1 * __b1)
+///    R1 := (__a2 * __b2) + (__a3 * __b3)
+///    R2 := (__a4 * __b4) + (__a5 * __b5)
+///    R3 := (__a6 * __b6) + (__a7 * __b7)
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULHRSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
+///    products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULHRSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
+///    products of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSHUFB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes corresponding to
+///    positions in the destination:
+///    Bit 7:
+///    1: Clear the corresponding byte in the destination.
+///    0: Copy the selected source byte to the corresponding byte in the
+///    destination.
+///    Bits [6:4] Reserved.
+///    Bits [3:0] select the source byte to be copied.
+/// \returns A 128-bit integer vector containing the copied or cleared values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSHUFB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes corresponding to
+///    positions in the destination:
+///    Bit 7:
+///    1: Clear the corresponding byte in the destination.
+///    0: Copy the selected source byte to the corresponding byte in the
+///    destination.
+///    Bits [2:0] select the source byte to be copied.
+/// \returns A 64-bit integer vector containing the copied or cleared values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    byte in the second source is negative, calculate the two's complement of
+///    the corresponding byte in the first source, and write that value to the
+///    destination. If the byte in the second source is positive, copy the
+///    corresponding byte from the first source to the destination. If the byte
+///    in the second source is zero, clear the corresponding byte in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi8(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    word in the second source is negative, calculate the two's complement of
+///    the corresponding word in the first source, and write that value to the
+///    destination. If the word in the second source is positive, copy the
+///    corresponding word from the first source to the destination. If the word
+///    in the second source is zero, clear the corresponding word in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control words corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding doubleword in the first source, and
+///    write that value to the destination. If the doubleword in the second
+///    source is positive, copy the corresponding doubleword from the first
+///    source to the destination. If the doubleword in the second source is
+///    zero, clear the corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control doublewords corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    byte in the second source is negative, calculate the two's complement of
+///    the corresponding byte in the first source, and write that value to the
+///    destination. If the byte in the second source is positive, copy the
+///    corresponding byte from the first source to the destination. If the byte
+///    in the second source is zero, clear the corresponding byte in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes corresponding to
+///    positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    word in the second source is negative, calculate the two's complement of
+///    the corresponding word in the first source, and write that value to the
+///    destination. If the word in the second source is positive, copy the
+///    corresponding word from the first source to the destination. If the word
+///    in the second source is zero, clear the corresponding word in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control words corresponding to
+///    positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding doubleword in the first source, and
+///    write that value to the destination. If the doubleword in the second
+///    source is positive, copy the corresponding doubleword from the first
+///    source to the destination. If the doubleword in the second source is
+///    zero, clear the corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGND instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing two control doublewords corresponding
+///    to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
diff --git a/lib/Headers/unwind.h b/lib/Headers/unwind.h
index 303d792..4f74a34 100644
--- a/lib/Headers/unwind.h
+++ b/lib/Headers/unwind.h
@@ -79,6 +79,10 @@
 struct _Unwind_Exception;
 typedef enum {
   _URC_NO_REASON = 0,
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+    !defined(__ARM_DWARF_EH__)
+  _URC_OK = 0, /* used by ARM EHABI */
+#endif
   _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
 
   _URC_FATAL_PHASE2_ERROR = 2,
@@ -88,7 +92,11 @@
   _URC_END_OF_STACK = 5,
   _URC_HANDLER_FOUND = 6,
   _URC_INSTALL_CONTEXT = 7,
-  _URC_CONTINUE_UNWIND = 8
+  _URC_CONTINUE_UNWIND = 8,
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+    !defined(__ARM_DWARF_EH__)
+  _URC_FAILURE = 9 /* used by ARM EHABI */
+#endif
 } _Unwind_Reason_Code;
 
 typedef enum {
@@ -150,6 +158,15 @@
   _UVRSR_FAILED = 2
 } _Unwind_VRS_Result;
 
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
+typedef uint32_t _Unwind_State;
+#define _US_VIRTUAL_UNWIND_FRAME  ((_Unwind_State)0)
+#define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
+#define _US_UNWIND_FRAME_RESUME   ((_Unwind_State)2)
+#define _US_ACTION_MASK           ((_Unwind_State)3)
+#define _US_FORCE_UNWIND          ((_Unwind_State)8)
+#endif
+
 _Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
   _Unwind_VRS_RegClass __regclass,
   uint32_t __regno,
diff --git a/lib/Headers/x86intrin.h b/lib/Headers/x86intrin.h
index 4d8077e..81a404f 100644
--- a/lib/Headers/x86intrin.h
+++ b/lib/Headers/x86intrin.h
@@ -28,29 +28,57 @@
 
 #include <immintrin.h>
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
 #include <mm3dnow.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
 #include <bmiintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
 #include <lzcntintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
 #include <popcntintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
 #include <rdseedintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
 #include <ammintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
 #include <fma4intrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
 #include <xopintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
 #include <tbmintrin.h>
+#endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
 #include <f16cintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
+#include <mwaitxintrin.h>
+#endif
 
 /* FIXME: LWP */
 
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index ae0b2cd..373fc76 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -30,6 +30,9 @@
 typedef float __v4sf __attribute__((__vector_size__(16)));
 typedef float __m128 __attribute__((__vector_size__(16)));
 
+/* Unsigned types */
+typedef unsigned int __v4su __attribute__((__vector_size__(16)));
+
 /* This header should only be included in a hosted environment as it depends on
  * a standard library to provide allocation routines. */
 #if __STDC_HOSTED__
@@ -39,6 +42,21 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
 
+/// \brief Adds the 32-bit float values in the low-order bits of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
+///    of the lower 32 bits of both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_add_ss(__m128 __a, __m128 __b)
 {
@@ -46,12 +64,41 @@
   return __a;
 }
 
+/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
+///    the addition.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the sums of both
+///    operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_add_ps(__m128 __a, __m128 __b)
 {
-  return __a + __b;
+  return (__m128)((__v4sf)__a + (__v4sf)__b);
 }
 
+/// \brief Subtracts the 32-bit float value in the low-order bits of the second
+///    operand from the corresponding value in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
+///    of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
+///    bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    difference of the lower 32 bits of both operands. The upper 96 bits are
+///    copied from the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sub_ss(__m128 __a, __m128 __b)
 {
@@ -59,12 +106,42 @@
   return __a;
 }
 
+/// \brief Subtracts each of the values of the second operand from the first
+///    operand, both of which are 128-bit vectors of [4 x float], and returns
+///    the results of the subtraction.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the differences between
+///    both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sub_ps(__m128 __a, __m128 __b)
 {
-  return __a - __b;
+  return (__m128)((__v4sf)__a - (__v4sf)__b);
 }
 
+/// \brief Multiplies two 32-bit float values in the low-order bits of the
+///    operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the product of the lower
+///    32 bits of both operands. The upper 96 bits are copied from the upper 96
+///    bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mul_ss(__m128 __a, __m128 __b)
 {
@@ -72,12 +149,41 @@
   return __a;
 }
 
+/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
+///    results of the multiplication.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the products of both
+///    operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mul_ps(__m128 __a, __m128 __b)
 {
-  return __a * __b;
+  return (__m128)((__v4sf)__a * (__v4sf)__b);
 }
 
+/// \brief Divides the value in the low-order 32 bits of the first operand by
+///    the corresponding value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
+///    bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
+///    of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the quotient of the
+///    lower 32 bits of both operands. The upper 96 bits are copied from the
+///    upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_div_ss(__m128 __a, __m128 __b)
 {
@@ -85,329 +191,1091 @@
   return __a;
 }
 
+/// \brief Divides two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of both
+///    operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_div_ps(__m128 __a, __m128 __b)
 {
-  return __a / __b;
+  return (__m128)((__v4sf)__a / (__v4sf)__b);
 }
 
+/// \brief Calculates the square root of the value stored in the low-order bits
+///    of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the square root of the
+///    value in the low-order bits of the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ss(__m128 __a)
 {
-  __m128 __c = __builtin_ia32_sqrtss(__a);
+  __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
+/// \brief Calculates the square roots of the values stored in a 128-bit vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the square roots of the
+///    values in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ps(__m128 __a)
 {
-  return __builtin_ia32_sqrtps(__a);
+  return __builtin_ia32_sqrtps((__v4sf)__a);
 }
 
+/// \brief Calculates the approximate reciprocal of the value stored in the
+///    low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocal of the value in the low-order bits of the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ss(__m128 __a)
 {
-  __m128 __c = __builtin_ia32_rcpss(__a);
+  __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
+/// \brief Calculates the approximate reciprocals of the values stored in a
+///    128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocals of the values in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ps(__m128 __a)
 {
-  return __builtin_ia32_rcpps(__a);
+  return __builtin_ia32_rcpps((__v4sf)__a);
 }
 
+/// \brief Calculates the approximate reciprocal of the square root of the value
+///    stored in the low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocal of the square root of the value in the low-order bits of the
+///    operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ss(__m128 __a)
 {
-  __m128 __c = __builtin_ia32_rsqrtss(__a);
+  __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
+/// \brief Calculates the approximate reciprocals of the square roots of the
+///    values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocals of the square roots of the values in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ps(__m128 __a)
 {
-  return __builtin_ia32_rsqrtps(__a);
+  return __builtin_ia32_rsqrtps((__v4sf)__a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the lesser value in the low-order bits of the
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    minimum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_min_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_minss(__a, __b);
+  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the
+///    lesser of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the minimum values
+///    between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_min_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_minps(__a, __b);
+  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the greater value in the low-order bits of
+///    a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    maximum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_max_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_maxss(__a, __b);
+  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the maximum values
+///    between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_max_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_maxps(__a, __b);
+  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    values between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_and_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)((__v4si)__a & (__v4si)__b);
+  return (__m128)((__v4su)__a & (__v4su)__b);
 }
 
+/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
+///    the one's complement of the values contained in the first source
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the first source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the second source operand.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    one's complement of the first operand and the values in the second
+///    operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_andnot_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)(~(__v4si)__a & (__v4si)__b);
+  return (__m128)(~(__v4su)__a & (__v4su)__b);
 }
 
+/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
+///    values between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_or_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)((__v4si)__a | (__v4si)__b);
+  return (__m128)((__v4su)__a | (__v4su)__b);
 }
 
+/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
+///    of the values between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_xor_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)((__v4si)__a ^ (__v4si)__b);
+  return (__m128)((__v4su)__a ^ (__v4su)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality and returns the result of the comparison in the
+///    low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpeq_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpeqss(__a, __b);
+  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpeq_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpeqps(__a, __b);
+  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less than the
+///    corresponding value in the second operand and returns the result of the
+///    comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmplt_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpltss(__a, __b);
+  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmplt_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpltps(__a, __b);
+  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less than or
+///    equal to the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmple_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpless(__a, __b);
+  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmple_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpleps(__a, __b);
+  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater than
+///    the corresponding value in the second operand and returns the result of
+///    the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpgt_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_shufflevector(__a,
-                                         __builtin_ia32_cmpltss(__b, __a),
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpgt_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpltps(__b, __a);
+  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater than
+///    or equal to the corresponding value in the second operand and returns
+///    the result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpge_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_shufflevector(__a,
-                                         __builtin_ia32_cmpless(__b, __a),
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpge_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpleps(__b, __a);
+  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for inequality and returns the result of the comparison in the
+///    low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpneq_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpneqss(__a, __b);
+  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] for inequality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpneq_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpneqps(__a, __b);
+  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less than
+///    the corresponding value in the second operand and returns the result of
+///    the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpnltss(__a, __b);
+  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpnltps(__a, __b);
+  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less than
+///    or equal to the corresponding value in the second operand and returns
+///    the result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnle_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpnless(__a, __b);
+  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnle_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpnleps(__a, __b);
+  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not greater
+///    than the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpngt_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_shufflevector(__a,
-                                         __builtin_ia32_cmpnltss(__b, __a),
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpngt_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpnltps(__b, __a);
+  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not greater
+///    than or equal to the corresponding value in the second operand and
+///    returns the result of the comparison in the low-order bits of a vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnge_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_shufflevector(__a,
-                                         __builtin_ia32_cmpnless(__b, __a),
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnge_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpnleps(__b, __a);
+  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is ordered with
+///    respect to the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpord_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpordss(__a, __b);
+  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpord_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpordps(__a, __b);
+  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is unordered
+///    with respect to the corresponding value in the second operand and
+///    returns the result of the comparison in the low-order bits of a vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpunord_ss(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpunordss(__a, __b);
+  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpunord_ps(__m128 __a, __m128 __b)
 {
-  return (__m128)__builtin_ia32_cmpunordps(__a, __b);
+  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_comieq(__a, __b);
+  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_comilt(__a, __b);
+  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than or equal to the
+///    second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_comile(__a, __b);
+  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_comigt(__a, __b);
+  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than or equal to
+///    the second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_comige(__a, __b);
+  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is not equal to the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_comineq(__a, __b);
+  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine equality and returns
+///    the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_ucomieq(__a, __b);
+  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    less than the second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_ucomilt(__a, __b);
+  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand
+///    is less than or equal to the second operand and returns the result of
+///    the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_ucomile(__a, __b);
+  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand
+///    is greater than the second operand and returns the result of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_ucomigt(__a, __b);
+  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    greater than or equal to the second operand and returns the result of
+///    the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_ucomige(__a, __b);
+  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine inequality and returns
+///    the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_ia32_ucomineq(__a, __b);
+  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtss_si32(__m128 __a)
 {
-  return __builtin_ia32_cvtss2si(__a);
+  return __builtin_ia32_cvtss2si((__v4sf)__a);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvt_ss2si(__m128 __a)
 {
@@ -416,56 +1284,163 @@
 
 #ifdef __x86_64__
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtss_si64(__m128 __a)
 {
-  return __builtin_ia32_cvtss2si64(__a);
+  return __builtin_ia32_cvtss2si64((__v4sf)__a);
 }
 
 #endif
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvtps2pi(__a);
+  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
 }
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttss_si32(__m128 __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttss2si((__v4sf)__a);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtt_ss2si(__m128 __a)
 {
   return _mm_cvttss_si32(__a);
 }
 
+#ifdef __x86_64__
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttss_si64(__m128 __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttss2si64((__v4sf)__a);
 }
+#endif
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
+///    when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvttps2pi(__a);
+  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
 }
 
+/// \brief Converts two low-order float values in a 128-bit vector of [4 x
+///    float] into a 64-bit vector of [2 x i32], truncating the result when it
+///    is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
 }
 
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination vector are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsi32_ss(__m128 __a, int __b)
 {
@@ -473,6 +1448,22 @@
   return __a;
 }
 
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvt_si2ss(__m128 __a, int __b)
 {
@@ -481,6 +1472,22 @@
 
 #ifdef __x86_64__
 
+/// \brief Converts a 64-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsi64_ss(__m128 __a, long long __b)
 {
@@ -490,24 +1497,84 @@
 
 #endif
 
+/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+///    floating point values and writes them to the lower 64 bits of the
+///    destination. The remaining higher order elements of the destination are
+///    copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
+///    and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
-  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
+  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
 }
 
+/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+///    floating point values and writes them to the lower 64 bits of the
+///    destination. The remaining higher order elements of the destination are
+///    copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
+///    and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value from the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
 }
 
+/// \brief Extracts a float value contained in the lower 32 bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSS / MOVSS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the extraction.
+/// \returns A 32-bit float containing the extracted value.
 static __inline__ float __DEFAULT_FN_ATTRS
 _mm_cvtss_f32(__m128 __a)
 {
   return __a[0];
 }
 
+/// \brief Loads two packed float values from the address __p into the
+///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
+///     are copied from the low-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVHPS / MOVHPS instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
+///    of the destination.
+/// \param __p
+///    A pointer to two packed float values. Bits [63:0] are written to bits
+///    [127:64] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadh_pi(__m128 __a, const __m64 *__p)
 {
@@ -520,6 +1587,21 @@
   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
 }
 
+/// \brief Loads two packed float values from the address __p into the low-order
+///    bits of a 128-bit vector of [4 x float]. The high-order bits are copied
+///    from the high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
+///    [127:64] of the destination.
+/// \param __p
+///    A pointer to two packed float values. Bits [63:0] are written to bits
+///    [63:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadl_pi(__m128 __a, const __m64 *__p)
 {
@@ -532,6 +1614,21 @@
   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits of the vector are initialized with the single-precision
+///    floating-point value loaded from a specified memory location. The upper
+///    96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location containing a single-precision
+///    floating-point value.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+///    lower 32 bits contain the value loaded from the memory location. The
+///    upper 96 bits are set to zero.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ss(const float *__p)
 {
@@ -542,6 +1639,18 @@
   return (__m128){ __u, 0, 0, 0 };
 }
 
+/// \brief Loads a 32-bit float value and duplicates it to all four vector
+///    elements of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
+///    instruction.
+///
+/// \param __p
+///    A pointer to a float value to be loaded and duplicated.
+/// \returns A 128-bit vector of [4 x float] containing the loaded
+///    and duplicated values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load1_ps(const float *__p)
 {
@@ -554,12 +1663,34 @@
 
 #define        _mm_load_ps1(p) _mm_load1_ps(p)
 
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ps(const float *__p)
 {
   return *(__m128*)__p;
 }
 
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadu_ps(const float *__p)
 {
@@ -569,25 +1700,72 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
+/// \brief Loads four packed float values, in reverse order, from an aligned
+///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
+///    in reverse order.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadr_ps(const float *__p)
 {
   __m128 __a = _mm_load_ps(__p);
-  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
 }
 
+/// \brief Create a 128-bit vector of [4 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x float] containing undefined values.
+
 static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_undefined_ps()
+_mm_undefined_ps(void)
 {
   return (__m128)__builtin_ia32_undef128();
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits of the vector are initialized with the specified single-precision
+///    floating-point value. The upper 96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize the lower 32
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+///    lower 32 bits contain the value provided in the source operand. The
+///    upper 96 bits are set to zero.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ss(float __w)
 {
   return (__m128){ __w, 0, 0, 0 };
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+///    of the four single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set1_ps(float __w)
 {
@@ -595,42 +1773,139 @@
 }
 
 /* Microsoft specific. */
+/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+///    of the four single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps1(float __w)
 {
     return _mm_set1_ps(__w);
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]
+///    initialized with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __z
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __y
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __x
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __w
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps(float __z, float __y, float __x, float __w)
 {
   return (__m128){ __w, __x, __y, __z };
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float],
+///    initialized in reverse order with the specified 32-bit single-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __z
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \param __y
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __x
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __w
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setr_ps(float __z, float __y, float __x, float __w)
 {
   return (__m128){ __z, __y, __x, __w };
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [4 x float] with
+///    all elements set to zero.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setzero_ps(void)
 {
   return (__m128){ 0, 0, 0, 0 };
 }
 
+/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeh_pi(__m64 *__p, __m128 __a)
 {
-  __builtin_ia32_storehps((__v2si *)__p, __a);
+  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
 }
 
+/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
+///     memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_pi(__m64 *__p, __m128 __a)
 {
-  __builtin_ia32_storelps((__v2si *)__p, __a);
+  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
 }
 
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
+///     memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ss(float *__p, __m128 __a)
 {
@@ -640,35 +1915,101 @@
   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
 }
 
+/// \brief Stores float values from a 128-bit vector of [4 x float] to an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_ps(float *__p, __m128 __a)
 {
-  __builtin_ia32_storeups(__p, __a);
+  struct __storeu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__p)->__v = __a;
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store1_ps(float *__p, __m128 __a)
-{
-  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
-  _mm_storeu_ps(__p, __a);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ps1(float *__p, __m128 __a)
-{
-    return _mm_store1_ps(__p, __a);
-}
-
+/// \brief Stores float values from a 128-bit vector of [4 x float] to an
+///    aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored
+///    to the memory location pointed to by __p.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps(float *__p, __m128 __a)
 {
-  *(__m128 *)__p = __a;
+  *(__m128*)__p = __a;
 }
 
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed to by __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
+  _mm_store_ps(__p, __a);
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit aligned memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float]; bits [31:0] are stored to each element.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps1(float *__p, __m128 __a)
+{
+  _mm_store1_ps(__p, __a);
+}
+
+/// \brief Stores float values from a 128-bit vector of [4 x float] to an
+///    aligned memory location in reverse order.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_ps(float *__p, __m128 __a)
 {
-  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   _mm_store_ps(__p, __a);
 }
 
@@ -681,153 +2022,599 @@
 /* FIXME: We have to #define this because "sel" must be a constant integer, and
    Sema doesn't do any form of constant propagation yet. */
 
+/// \brief Loads one cache line of data from the specified address to a location
+///    closer to the processor.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_prefetch(const void * a, const int sel);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PREFETCHNTA instruction.
+///
+/// \param a
+///    A pointer to a memory location containing a cache line of data.
+/// \param sel
+///    A predefined integer constant specifying the type of prefetch operation:
+///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
+///    The PREFETCHNTA instruction will be generated.
+///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
+///    be generated.
+///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
+///    be generated.
+///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
+///    be generated.
 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
 #endif
 
+/// \brief Stores a 64-bit integer in the specified aligned memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVNTQ instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location used to store the register value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(__m64 *__p, __m64 __a)
 {
   __builtin_ia32_movntq(__p, __a);
 }
 
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+///    128-bit aligned memory location. To minimize caching, the data is flagged
+///    as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit aligned memory location that will receive the
+///    float values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ps(float *__p, __m128 __a)
 {
-  __builtin_ia32_movntps(__p, __a);
+  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
 }
 
+/// \brief Forces strong memory ordering (serialization) between store
+///    instructions preceding this instruction and store instructions following
+///    this instruction, ensuring the system completes all previous stores
+///    before executing subsequent stores.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c SFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_sfence(void)
 {
   __builtin_ia32_sfence();
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_extract_pi16(__m64 __a, int __n)
-{
-  __v4hi __b = (__v4hi)__a;
-  return (unsigned short)__b[__n & 3];
-}
+/// \brief Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
+///    returns it, as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
+///
+/// \param a
+///    A 64-bit vector of [4 x i16].
+/// \param n
+///    An immediate integer operand that determines which bits are extracted:
+///    0: Bits [15:0] are copied to the destination.
+///    1: Bits [31:16] are copied to the destination.
+///    2: Bits [47:32] are copied to the destination.
+///    3: Bits [63:48] are copied to the destination.
+/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
+#define _mm_extract_pi16(a, n) __extension__ ({ \
+  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
 
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_insert_pi16(__m64 __a, int __d, int __n)
-{
-   __v4hi __b = (__v4hi)__a;
-   __b[__n & 3] = __d;
-   return (__m64)__b;
-}
+/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
+///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
+///    specified by the immediate operand n.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
+///
+/// \param a
+///    A 64-bit vector of [4 x i16].
+/// \param d
+///    An integer. The lower 16-bit value from this operand is written to the
+///    destination at the offset specified by operand n.
+/// \param n
+///    An immediate integer operand that determines which bits are to be used
+///    in the destination.
+///    0: Bits [15:0] are copied to the destination.
+///    1: Bits [31:16] are copied to the destination.
+///    2: Bits [47:32] are copied to the destination.
+///    3: Bits [63:48] are copied to the destination.
+///    The remaining bits in the destination are copied from the corresponding
+///    bits in operand a.
+/// \returns A 64-bit integer vector containing the copied packed data from the
+///    operands.
+#define _mm_insert_pi16(a, d, n) __extension__ ({ \
+  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
 
+/// \brief Compares each of the corresponding packed 16-bit integer values of
+///    the 64-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMAXSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMAXUB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit integer values of
+///    the 64-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMINSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMINUB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
+///    integer vector to create an 8-bit mask value. Zero-extends the value to
+///    32-bit integer and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMOVMSKB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bit from each 8-bit element in the operand,
+///    written to bits [7:0].
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pi8(__m64 __a)
 {
   return __builtin_ia32_pmovmskb((__v8qi)__a);
 }
 
+/// \brief Multiplies packed 16-bit unsigned integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULHUW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the products of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+///    destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSHUFW instruction.
+///
+/// \code
+/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
+/// \endcode
+///
+/// \param a
+///    A 64-bit integer vector containing the values to be shuffled.
+/// \param n
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a. The destinations within the 64-bit destination are assigned
+///    values as follows:
+///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
+///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
+///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
+///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
+///    Bit value assignments:
+///    00: assigned from bits [15:0] of a.
+///    01: assigned from bits [31:16] of a.
+///    10: assigned from bits [47:32] of a.
+///    11: assigned from bits [63:48] of a.
+/// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) __extension__ ({ \
   (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
 
+/// \brief Conditionally copies the values from each 8-bit element in the first
+///    64-bit integer vector operand to the specified memory location, as
+///    specified by the most significant bit in the corresponding element in the
+///    second 64-bit integer vector operand. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MASKMOVQ instruction.
+///
+/// \param __d
+///    A 64-bit integer vector containing the values with elements to be copied.
+/// \param __n
+///    A 64-bit integer vector operand. The most significant bit from each 8-bit
+///    element determines whether the corresponding element in operand __d is
+///    copied. If the most significant bit of a given element is 1, the
+///    corresponding element in operand __d is copied.
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the conditionally
+///    copied integer values. The address of the memory location does not have
+///    to be aligned.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
 }
 
+/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PAVGB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PAVGW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
+///    64-bit vector operands and computes the absolute value of each
+///    difference. The sum of the 8 absolute differences is then written to
+///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSADBW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
+///    sets of absolute differences between both operands. The upper bits are
+///    cleared.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
+///    integer value. There are several groups of macros associated with this
+///    intrinsic, including:
+///    * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///      _MM_GET_EXCEPTION_STATE().
+///    * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
+///    * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///      _MM_GET_ROUNDING_MODE().
+///    * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
+///    * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///      _MM_GET_DENORMALS_ZERO_MODE().
+///
+///    For example, the expression below checks if an overflow exception has
+///    occurred:
+///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
+///
+///    The following example gets the current rounding mode:
+///      _MM_GET_ROUNDING_MODE()
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
+///
+/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
+///    register.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_getcsr(void)
 {
   return __builtin_ia32_stmxcsr();
 }
 
+/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
+///    are several groups of macros associated with this intrinsic, including:
+///    * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
+///    * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
+///      of these macros.
+///    * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
+///    * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
+///      one of these macros.
+///    * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+///
+///    For example, the following expression causes subsequent floating-point
+///    operations to round up:
+///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
+///
+///    The following example sets the DAZ and FTZ flags:
+///      void setFlags() {
+///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+///      }
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
+///
+/// \param __i
+///    A 32-bit unsigned integer value to be written to the MXCSR register.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_setcsr(unsigned int __i)
 {
   __builtin_ia32_ldmxcsr(__i);
 }
 
+/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
+///    specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a and b.
+///    Bits [3:0] specify the values copied from operand a.
+///    Bits [7:4] specify the values copied from operand b. The destinations
+///    within the 128-bit destination are assigned values as follows:
+///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
+///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
+///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
+///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
+///    Bit value assignments:
+///    00: Bits [31:0] copied from the specified operand.
+///    01: Bits [63:32] copied from the specified operand.
+///    10: Bits [95:64] copied from the specified operand.
+///    11: Bits [127:96] copied from the specified operand.
+/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
   (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
-                                  (((mask) & 0x30) >> 4) + 4, \
-                                  (((mask) & 0xc0) >> 6) + 4); })
+                                  0 + (((mask) >> 0) & 0x3), \
+                                  0 + (((mask) >> 2) & 0x3), \
+                                  4 + (((mask) >> 4) & 0x3), \
+                                  4 + (((mask) >> 6) & 0x3)); })
 
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+///    Bits [95:64] are written to bits [31:0] of the destination.
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float].
+///    Bits [95:64] are written to bits [63:32] of the destination.
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_unpackhi_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
 }
 
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float].
+///    Bits [31:0] are written to bits [63:32] of the destination.
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_unpacklo_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits are set to the lower 32 bits of the second parameter. The upper
+///    96 bits are set to the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
+///    written to the upper 96 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
+///    written to the lower 32 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_move_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    64 bits are set to the upper 64 bits of the second parameter. The upper
+///    64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+///    written to the upper 64 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+///    written to the lower 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movehl_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    64 bits are set to the lower 64 bits of the first parameter. The upper
+///    64 bits are set to the lower 64 bits of the second parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+///    written to the upper 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movelh_ps(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
 }
 
+/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
+///    from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi16_ps(__m64 __a)
 {
@@ -846,6 +2633,18 @@
   return __r;
 }
 
+/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
+///    128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
+///    destination are copied from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpu16_ps(__m64 __a)
 {
@@ -863,6 +2662,18 @@
   return __r;
 }
 
+/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+///    into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
+///    from the corresponding lower 4 elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi8_ps(__m64 __a)
 {
@@ -875,6 +2686,19 @@
   return _mm_cvtpi16_ps(__b);
 }
 
+/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
+///    vector of [8 x u8] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
+///    destination are copied from the corresponding lower 4 elements in this
+///    operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpu8_ps(__m64 __a)
 {
@@ -886,6 +2710,22 @@
   return _mm_cvtpi16_ps(__b);
 }
 
+/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
+///    operand of [2 x i32] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
+///    copied from the elements in this operand.
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
+///    copied from the elements in this operand.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    copied and converted values from the first operand. The upper 64 bits
+///    contain the copied and converted values from the second operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
@@ -898,6 +2738,22 @@
   return _mm_cvtpi32_ps(__c, __a);
 }
 
+/// \brief Converts each single-precision floating-point element of a 128-bit
+///    floating-point vector of [4 x float] into a 16-bit signed integer, and
+///    packs the results into a 64-bit integer vector of [4 x i16]. If the
+///    floating-point element is NaN or infinity, or if the floating-point
+///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
+///    to 0x8000. Otherwise if the floating-point element is greater
+///    than 0x7FFF, it is converted to 0x7FFF.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi16(__m128 __a)
 {
@@ -910,6 +2766,23 @@
   return _mm_packs_pi32(__b, __c);
 }
 
+/// \brief Converts each single-precision floating-point element of a 128-bit
+///    floating-point vector of [4 x float] into an 8-bit signed integer, and
+///    packs the results into the lower 32 bits of a 64-bit integer vector of
+///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
+///    floating-point element is NaN or infinity, or if the floating-point
+///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
+///    to 0x80. Otherwise if the floating-point element is greater
+///    than 0x7F, it is converted to 0x7F.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+///
+/// \param __a
+///    128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
+///    converted values and the upper 32 bits are set to zero.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi8(__m128 __a)
 {
@@ -921,16 +2794,28 @@
   return _mm_packs_pi16(__b, __c);
 }
 
+/// \brief Extracts the sign bits from each single-precision floating-point
+///    element of a 128-bit floating-point vector of [4 x float] and returns the
+///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
+///    single-precision floating-point element of the parameter. Bits [31:4] are
+///    set to zero.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_ps(__m128 __a)
 {
-  return __builtin_ia32_movmskps(__a);
+  return __builtin_ia32_movmskps((__v4sf)__a);
 }
 
 
-#ifdef _MSC_VER
-#define _MM_ALIGN16 __declspec(align(16))
-#endif
+#define _MM_ALIGN16 __attribute__((aligned(16)))
 
 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
 
@@ -1003,7 +2888,7 @@
 #undef __DEFAULT_FN_ATTRS
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
-#if defined(__SSE2__) && !__has_feature(modules)
+#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
 #include <emmintrin.h>
 #endif
 
diff --git a/lib/Headers/xopintrin.h b/lib/Headers/xopintrin.h
index f07f51c..bdf0cec 100644
--- a/lib/Headers/xopintrin.h
+++ b/lib/Headers/xopintrin.h
@@ -198,13 +198,13 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
 {
-  return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
+  return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
 {
-  return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
+  return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
diff --git a/lib/Index/CMakeLists.txt b/lib/Index/CMakeLists.txt
index 7e39e49..8f51ccb 100644
--- a/lib/Index/CMakeLists.txt
+++ b/lib/Index/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  Core
   Support
   )
 
diff --git a/lib/Index/CodegenNameGenerator.cpp b/lib/Index/CodegenNameGenerator.cpp
index d663cc3..92740b0 100644
--- a/lib/Index/CodegenNameGenerator.cpp
+++ b/lib/Index/CodegenNameGenerator.cpp
@@ -31,7 +31,7 @@
 
   Implementation(ASTContext &Ctx)
     : MC(Ctx.createMangleContext()),
-      DL(Ctx.getTargetInfo().getDataLayoutString()) {}
+      DL(Ctx.getTargetInfo().getDataLayout()) {}
 
   bool writeName(const Decl *D, raw_ostream &OS) {
     // First apply frontend mangling.
@@ -76,8 +76,6 @@
 
     ASTContext &Ctx = ND->getASTContext();
     std::unique_ptr<MangleContext> M(Ctx.createMangleContext());
-    std::unique_ptr<llvm::DataLayout> DL(
-        new llvm::DataLayout(Ctx.getTargetInfo().getDataLayoutString()));
 
     std::vector<std::string> Manglings;
 
diff --git a/lib/Index/CommentToXML.cpp b/lib/Index/CommentToXML.cpp
index 15f1696..c4beef2 100644
--- a/lib/Index/CommentToXML.cpp
+++ b/lib/Index/CommentToXML.cpp
@@ -592,9 +592,8 @@
 
 void CommentASTToXMLConverter::formatTextOfDeclaration(
     const DeclInfo *DI, SmallString<128> &Declaration) {
-  // FIXME. formatting API expects null terminated input string.
-  // There might be more efficient way of doing this.
-  std::string StringDecl = Declaration.str();
+  // Formatting API expects null terminated input string.
+  StringRef StringDecl(Declaration.c_str(), Declaration.size());
 
   // Formatter specific code.
   // Form a unique in memory buffer name.
diff --git a/lib/Index/IndexBody.cpp b/lib/Index/IndexBody.cpp
index 1db1114..62f4e88 100644
--- a/lib/Index/IndexBody.cpp
+++ b/lib/Index/IndexBody.cpp
@@ -147,13 +147,10 @@
   }
 
   bool VisitDesignatedInitExpr(DesignatedInitExpr *E) {
-    for (DesignatedInitExpr::reverse_designators_iterator
-           D = E->designators_rbegin(), DEnd = E->designators_rend();
-           D != DEnd; ++D) {
-      if (D->isFieldDesignator() && D->getField())
-        return IndexCtx.handleReference(D->getField(), D->getFieldLoc(),
-                                        Parent, ParentDC, SymbolRoleSet(),
-                                        {}, E);
+    for (DesignatedInitExpr::Designator &D : llvm::reverse(E->designators())) {
+      if (D.isFieldDesignator() && D.getField())
+        return IndexCtx.handleReference(D.getField(), D.getFieldLoc(), Parent,
+                                        ParentDC, SymbolRoleSet(), {}, E);
     }
     return true;
   }
@@ -311,11 +308,9 @@
       bool shouldWalkTypesOfTypeLocs() const { return false; }
 
       bool VisitDesignatedInitExpr(DesignatedInitExpr *E) {
-        for (DesignatedInitExpr::reverse_designators_iterator
-               D = E->designators_rbegin(), DEnd = E->designators_rend();
-               D != DEnd; ++D) {
-          if (D->isFieldDesignator())
-            return IndexCtx.handleReference(D->getField(), D->getFieldLoc(),
+        for (DesignatedInitExpr::Designator &D : llvm::reverse(E->designators())) {
+          if (D.isFieldDesignator())
+            return IndexCtx.handleReference(D.getField(), D.getFieldLoc(),
                                             Parent, ParentDC, SymbolRoleSet(),
                                             {}, E);
         }
diff --git a/lib/Index/IndexDecl.cpp b/lib/Index/IndexDecl.cpp
index 5f5c49a..eb3e151 100644
--- a/lib/Index/IndexDecl.cpp
+++ b/lib/Index/IndexDecl.cpp
@@ -67,7 +67,7 @@
         }
       } else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
         if (FD->isThisDeclarationADefinition()) {
-          for (auto PI : FD->params()) {
+          for (auto PI : FD->parameters()) {
             IndexCtx.handleDecl(PI);
           }
         }
@@ -79,7 +79,7 @@
     if (!IndexCtx.handleDecl(D, (unsigned)SymbolRole::Dynamic))
       return false;
     IndexCtx.indexTypeSourceInfo(D->getReturnTypeSourceInfo(), D);
-    for (const auto *I : D->params())
+    for (const auto *I : D->parameters())
       handleDeclarator(I, D);
 
     if (D->isThisDeclarationADefinition()) {
diff --git a/lib/Index/Makefile b/lib/Index/Makefile
deleted file mode 100644
index c53fccd..0000000
--- a/lib/Index/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- clang/lib/Index/Makefile ----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangIndex
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Index/USRGeneration.cpp b/lib/Index/USRGeneration.cpp
index 0ec1397..6a114f9 100644
--- a/lib/Index/USRGeneration.cpp
+++ b/lib/Index/USRGeneration.cpp
@@ -12,7 +12,6 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/DeclVisitor.h"
 #include "clang/Lex/PreprocessingRecord.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -90,18 +89,23 @@
   void VisitVarDecl(const VarDecl *D);
   void VisitNonTypeTemplateParmDecl(const NonTypeTemplateParmDecl *D);
   void VisitTemplateTemplateParmDecl(const TemplateTemplateParmDecl *D);
+
   void VisitLinkageSpecDecl(const LinkageSpecDecl *D) {
     IgnoreResults = true;
   }
+
   void VisitUsingDirectiveDecl(const UsingDirectiveDecl *D) {
     IgnoreResults = true;
   }
+
   void VisitUsingDecl(const UsingDecl *D) {
     IgnoreResults = true;
   }
+
   void VisitUnresolvedUsingValueDecl(const UnresolvedUsingValueDecl *D) {
     IgnoreResults = true;
   }
+
   void VisitUnresolvedUsingTypenameDecl(const UnresolvedUsingTypenameDecl *D) {
     IgnoreResults = true;
   }
@@ -126,14 +130,17 @@
   void GenObjCClass(StringRef cls) {
     generateUSRForObjCClass(cls, Out);
   }
+
   /// Generate a USR for an Objective-C class category.
   void GenObjCCategory(StringRef cls, StringRef cat) {
     generateUSRForObjCCategory(cls, cat, Out);
   }
+
   /// Generate a USR fragment for an Objective-C property.
   void GenObjCProperty(StringRef prop, bool isClassProp) {
     generateUSRForObjCProperty(prop, isClassProp, Out);
   }
+
   /// Generate a USR for an Objective-C protocol.
   void GenObjCProtocol(StringRef prot) {
     generateUSRForObjCProtocol(prot, Out);
@@ -148,7 +155,6 @@
   ///  the decl had no name.
   bool EmitDeclName(const NamedDecl *D);
 };
-
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
@@ -226,7 +232,7 @@
   }
 
   // Mangle in type information for the arguments.
-  for (auto PD : D->params()) {
+  for (auto PD : D->parameters()) {
     Out << '#';
     VisitType(PD->getType());
   }
@@ -293,13 +299,11 @@
 void USRGenerator::VisitNonTypeTemplateParmDecl(
                                         const NonTypeTemplateParmDecl *D) {
   GenLoc(D, /*IncludeOffset=*/true);
-  return;
 }
 
 void USRGenerator::VisitTemplateTemplateParmDecl(
                                         const TemplateTemplateParmDecl *D) {
   GenLoc(D, /*IncludeOffset=*/true);
-  return;
 }
 
 void USRGenerator::VisitNamespaceDecl(const NamespaceDecl *D) {
@@ -515,7 +519,6 @@
 
 void USRGenerator::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D) {
   GenLoc(D, /*IncludeOffset=*/true);
-  return;
 }
 
 bool USRGenerator::GenLoc(const Decl *D, bool IncludeOffset) {
@@ -614,24 +617,17 @@
           c = 'd'; break;
         case BuiltinType::LongDouble:
           c = 'D'; break;
+        case BuiltinType::Float128:
+          c = 'Q'; break;
         case BuiltinType::NullPtr:
           c = 'n'; break;
 #define BUILTIN_TYPE(Id, SingletonId)
 #define PLACEHOLDER_TYPE(Id, SingletonId) case BuiltinType::Id:
 #include "clang/AST/BuiltinTypes.def"
         case BuiltinType::Dependent:
-        case BuiltinType::OCLImage1d:
-        case BuiltinType::OCLImage1dArray:
-        case BuiltinType::OCLImage1dBuffer:
-        case BuiltinType::OCLImage2d:
-        case BuiltinType::OCLImage2dArray:
-        case BuiltinType::OCLImage2dDepth:
-        case BuiltinType::OCLImage2dArrayDepth:
-        case BuiltinType::OCLImage2dMSAA:
-        case BuiltinType::OCLImage2dArrayMSAA:
-        case BuiltinType::OCLImage2dMSAADepth:
-        case BuiltinType::OCLImage2dArrayMSAADepth:
-        case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+        case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
         case BuiltinType::OCLEvent:
         case BuiltinType::OCLClkEvent:
         case BuiltinType::OCLQueue:
@@ -908,4 +904,3 @@
   Out << MD->getName()->getName();
   return false;
 }
-
diff --git a/lib/Lex/HeaderSearch.cpp b/lib/Lex/HeaderSearch.cpp
index ea65197..77ced23 100644
--- a/lib/Lex/HeaderSearch.cpp
+++ b/lib/Lex/HeaderSearch.cpp
@@ -14,7 +14,6 @@
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/IdentifierTable.h"
-#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/Lex/ExternalPreprocessorSource.h"
 #include "clang/Lex/HeaderMap.h"
 #include "clang/Lex/HeaderSearchOptions.h"
@@ -27,8 +26,8 @@
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
 #include <cstdio>
+#include <utility>
 #if defined(LLVM_ON_UNIX)
 #include <limits.h>
 #endif
@@ -56,9 +55,9 @@
                            SourceManager &SourceMgr, DiagnosticsEngine &Diags,
                            const LangOptions &LangOpts,
                            const TargetInfo *Target)
-    : HSOpts(HSOpts), Diags(Diags), FileMgr(SourceMgr.getFileManager()),
-      FrameworkMap(64), ModMap(SourceMgr, Diags, LangOpts, Target, *this),
-      LangOpts(LangOpts) {
+    : HSOpts(std::move(HSOpts)), Diags(Diags),
+      FileMgr(SourceMgr.getFileManager()), FrameworkMap(64),
+      ModMap(SourceMgr, Diags, LangOpts, Target, *this) {
   AngledDirIdx = 0;
   SystemDirIdx = 0;
   NoCurDirSearch = false;
@@ -122,29 +121,11 @@
 std::string HeaderSearch::getModuleFileName(Module *Module) {
   const FileEntry *ModuleMap =
       getModuleMap().getModuleMapFileForUniquing(Module);
-  return getModuleFileName(Module->Name, ModuleMap->getName(),
-                           /*UsePrebuiltPath*/false);
+  return getModuleFileName(Module->Name, ModuleMap->getName());
 }
 
 std::string HeaderSearch::getModuleFileName(StringRef ModuleName,
-                                            StringRef ModuleMapPath,
-                                            bool UsePrebuiltPath) {
-  if (UsePrebuiltPath) {
-    if (HSOpts->PrebuiltModulePaths.empty())
-      return std::string();
-
-    // Go though each prebuilt module path and try to find the pcm file.
-    for (const std::string &Dir : HSOpts->PrebuiltModulePaths) {
-      SmallString<256> Result(Dir);
-      llvm::sys::fs::make_absolute(Result);
-
-      llvm::sys::path::append(Result, ModuleName + ".pcm");
-      if (getFileMgr().getFile(Result.str()))
-        return Result.str().str();
-    }
-    return std::string();
-  }
-
+                                            StringRef ModuleMapPath) {
   // If we don't have a module cache path or aren't supposed to use one, we
   // can't do anything.
   if (getModuleCachePath().empty())
@@ -268,8 +249,9 @@
 }
 
 const FileEntry *HeaderSearch::getFileAndSuggestModule(
-    StringRef FileName, const DirectoryEntry *Dir, bool IsSystemHeaderDir,
-    Module *RequestingModule, ModuleMap::KnownHeader *SuggestedModule) {
+    StringRef FileName, SourceLocation IncludeLoc, const DirectoryEntry *Dir,
+    bool IsSystemHeaderDir, Module *RequestingModule,
+    ModuleMap::KnownHeader *SuggestedModule) {
   // If we have a module map that might map this header, load it and
   // check whether we'll have a suggestion for a module.
   const FileEntry *File = getFileMgr().getFile(FileName, /*OpenFile=*/true);
@@ -290,6 +272,7 @@
 const FileEntry *DirectoryLookup::LookupFile(
     StringRef &Filename,
     HeaderSearch &HS,
+    SourceLocation IncludeLoc,
     SmallVectorImpl<char> *SearchPath,
     SmallVectorImpl<char> *RelativePath,
     Module *RequestingModule,
@@ -315,7 +298,7 @@
       RelativePath->append(Filename.begin(), Filename.end());
     }
 
-    return HS.getFileAndSuggestModule(TmpDir, getDir(),
+    return HS.getFileAndSuggestModule(TmpDir, IncludeLoc, getDir(),
                                       isSystemHeaderDirectory(),
                                       RequestingModule, SuggestedModule);
   }
@@ -603,7 +586,7 @@
       RelativePath->append(Filename.begin(), Filename.end());
     }
     // Otherwise, just return the file.
-    return getFileAndSuggestModule(Filename, nullptr,
+    return getFileAndSuggestModule(Filename, IncludeLoc, nullptr,
                                    /*IsSystemHeaderDir*/false,
                                    RequestingModule, SuggestedModule);
   }
@@ -640,7 +623,7 @@
           Includer ? getFileInfo(Includer).DirInfo != SrcMgr::C_User :
           BuildSystemModule;
       if (const FileEntry *FE = getFileAndSuggestModule(
-              TmpDir, IncluderAndDir.second, IncluderIsSystemHeader,
+              TmpDir, IncludeLoc, IncluderAndDir.second, IncluderIsSystemHeader,
               RequestingModule, SuggestedModule)) {
         if (!Includer) {
           assert(First && "only first includer can have no file");
@@ -731,7 +714,7 @@
     bool InUserSpecifiedSystemFramework = false;
     bool HasBeenMapped = false;
     const FileEntry *FE = SearchDirs[i].LookupFile(
-        Filename, *this, SearchPath, RelativePath, RequestingModule,
+        Filename, *this, IncludeLoc, SearchPath, RelativePath, RequestingModule,
         SuggestedModule, InUserSpecifiedSystemFramework, HasBeenMapped,
         MappedName);
     if (HasBeenMapped) {
@@ -1442,3 +1425,54 @@
 
   SearchDir.setSearchedAllModuleMaps(true);
 }
+
+std::string HeaderSearch::suggestPathToFileForDiagnostics(const FileEntry *File,
+                                                          bool *IsSystem) {
+  // FIXME: We assume that the path name currently cached in the FileEntry is
+  // the most appropriate one for this analysis (and that it's spelled the same
+  // way as the corresponding header search path).
+  const char *Name = File->getName();
+
+  unsigned BestPrefixLength = 0;
+  unsigned BestSearchDir;
+
+  for (unsigned I = 0; I != SearchDirs.size(); ++I) {
+    // FIXME: Support this search within frameworks and header maps.
+    if (!SearchDirs[I].isNormalDir())
+      continue;
+
+    const char *Dir = SearchDirs[I].getDir()->getName();
+    for (auto NI = llvm::sys::path::begin(Name),
+              NE = llvm::sys::path::end(Name),
+              DI = llvm::sys::path::begin(Dir),
+              DE = llvm::sys::path::end(Dir);
+         /*termination condition in loop*/; ++NI, ++DI) {
+      // '.' components in Name are ignored.
+      while (NI != NE && *NI == ".")
+        ++NI;
+      if (NI == NE)
+        break;
+
+      // '.' components in Dir are ignored.
+      while (DI != DE && *DI == ".")
+        ++DI;
+      if (DI == DE) {
+        // Dir is a prefix of Name, up to '.' components and choice of path
+        // separators.
+        unsigned PrefixLength = NI - llvm::sys::path::begin(Name);
+        if (PrefixLength > BestPrefixLength) {
+          BestPrefixLength = PrefixLength;
+          BestSearchDir = I;
+        }
+        break;
+      }
+
+      if (*NI != *DI)
+        break;
+    }
+  }
+
+  if (IsSystem)
+    *IsSystem = BestPrefixLength ? BestSearchDir >= SystemDirIdx : false;
+  return Name + BestPrefixLength;
+}
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index a61dbec..9f7638d 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -719,7 +719,9 @@
   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
     if (CharNo == 0)
       return TokStart.getLocWithOffset(PhysOffset);
-    ++TokPtr, --CharNo, ++PhysOffset;
+    ++TokPtr;
+    --CharNo;
+    ++PhysOffset;
   }
   
   // If we have a character that may be a trigraph or escaped newline, use a
@@ -1531,7 +1533,15 @@
     // preprocessor, which may macro expand it or something.
     if (II->isHandleIdentifierCase())
       return PP->HandleIdentifier(Result);
-    
+
+    if (II->getTokenID() == tok::identifier && isCodeCompletionPoint(CurPtr)
+        && II->getPPKeywordID() == tok::pp_not_keyword
+        && II->getObjCKeywordID() == tok::objc_not_keyword) {
+      // Return the code-completion token.
+      Result.setKind(tok::code_completion);
+      cutOffLexing();
+      return true;
+    }
     return true;
   }
 
@@ -1605,14 +1615,15 @@
 
   // If we have a hex FP constant, continue.
   if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
-    // Outside C99, we accept hexadecimal floating point numbers as a
+    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
     // not-quite-conforming extension. Only do so if this looks like it's
     // actually meant to be a hexfloat, and not if it has a ud-suffix.
     bool IsHexFloat = true;
     if (!LangOpts.C99) {
       if (!isHexaLiteral(BufferPtr, LangOpts))
         IsHexFloat = false;
-      else if (std::find(BufferPtr, CurPtr, '_') != CurPtr)
+      else if (!getLangOpts().CPlusPlus1z &&
+               std::find(BufferPtr, CurPtr, '_') != CurPtr)
         IsHexFloat = false;
     }
     if (IsHexFloat)
@@ -2633,8 +2644,8 @@
     return false;
   
   // Check to see if we have <<<<<<< or >>>>.
-  if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") &&
-      (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> "))
+  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
+      !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
     return false;
 
   // If we have a situation where we don't care about conflict markers, ignore
@@ -3505,6 +3516,9 @@
     if (Char == '=') {
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
       Kind = tok::caretequal;
+    } else if (LangOpts.OpenCL && Char == '^') {
+      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
+      Kind = tok::caretcaret;
     } else {
       Kind = tok::caret;
     }
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 5b1c493..23bbace 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -18,6 +18,7 @@
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -522,8 +523,10 @@
   isLong = false;
   isUnsigned = false;
   isLongLong = false;
+  isHalf = false;
   isFloat = false;
   isImaginary = false;
+  isFloat128 = false;
   MicrosoftInteger = 0;
   hadError = false;
 
@@ -536,34 +539,10 @@
     s = SkipDigits(s);
     if (s == ThisTokEnd) {
       // Done.
-    } else if (isHexDigit(*s) && !(*s == 'e' || *s == 'E')) {
-      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
-              diag::err_invalid_digit) << StringRef(s, 1) << 0;
-      hadError = true;
-      return;
-    } else if (*s == '.') {
-      checkSeparator(TokLoc, s, CSK_AfterDigits);
-      s++;
-      saw_period = true;
-      checkSeparator(TokLoc, s, CSK_BeforeDigits);
-      s = SkipDigits(s);
-    }
-    if ((*s == 'e' || *s == 'E')) { // exponent
-      checkSeparator(TokLoc, s, CSK_AfterDigits);
-      const char *Exponent = s;
-      s++;
-      saw_exponent = true;
-      if (*s == '+' || *s == '-')  s++; // sign
-      checkSeparator(TokLoc, s, CSK_BeforeDigits);
-      const char *first_non_digit = SkipDigits(s);
-      if (first_non_digit != s) {
-        s = first_non_digit;
-      } else {
-        PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent - ThisTokBegin),
-                diag::err_exponent_has_no_digits);
-        hadError = true;
+    } else {
+      ParseDecimalOrOctalCommon(TokLoc);
+      if (hadError)
         return;
-      }
     }
   }
 
@@ -579,12 +558,28 @@
   // we break out of the loop.
   for (; s != ThisTokEnd; ++s) {
     switch (*s) {
+    case 'h':      // FP Suffix for "half".
+    case 'H':
+      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
+      if (!PP.getLangOpts().Half) break;
+      if (!isFPConstant) break;  // Error for integer constant.
+      if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid.
+      isHalf = true;
+      continue;  // Success.
     case 'f':      // FP Suffix for "float"
     case 'F':
       if (!isFPConstant) break;  // Error for integer constant.
-      if (isFloat || isLong) break; // FF, LF invalid.
+      if (isHalf || isFloat || isLong || isFloat128)
+        break; // HF, FF, LF, QF invalid.
       isFloat = true;
       continue;  // Success.
+    case 'q':    // FP Suffix for "__float128"
+    case 'Q':
+      if (!isFPConstant) break;  // Error for integer constant.
+      if (isHalf || isFloat || isLong || isFloat128)
+        break; // HQ, FQ, LQ, QQ invalid.
+      isFloat128 = true;
+      continue;  // Success.
     case 'u':
     case 'U':
       if (isFPConstant) break;  // Error for floating constant.
@@ -594,7 +589,7 @@
     case 'l':
     case 'L':
       if (isLong || isLongLong) break;  // Cannot be repeated.
-      if (isFloat) break;               // LF invalid.
+      if (isHalf || isFloat || isFloat128) break;     // LH, LF, LQ invalid.
 
       // Check for long long.  The L's need to be adjacent and the same case.
       if (s[1] == s[0]) {
@@ -671,6 +666,7 @@
       isUnsigned = false;
       isLongLong = false;
       isFloat = false;
+      isHalf = false;
       isImaginary = false;
       MicrosoftInteger = 0;
 
@@ -693,6 +689,49 @@
   }
 }
 
+/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
+/// numbers. It issues an error for illegal digits, and handles floating point
+/// parsing. If it detects a floating point number, the radix is set to 10.
+void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
+  assert((radix == 8 || radix == 10) && "Unexpected radix");
+
+  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
+  // the code is using an incorrect base.
+  if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
+    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
+            diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
+    hadError = true;
+    return;
+  }
+
+  if (*s == '.') {
+    checkSeparator(TokLoc, s, CSK_AfterDigits);
+    s++;
+    radix = 10;
+    saw_period = true;
+    checkSeparator(TokLoc, s, CSK_BeforeDigits);
+    s = SkipDigits(s); // Skip the digits after the period.
+  }
+  if (*s == 'e' || *s == 'E') { // exponent
+    checkSeparator(TokLoc, s, CSK_AfterDigits);
+    const char *Exponent = s;
+    s++;
+    radix = 10;
+    saw_exponent = true;
+    if (*s == '+' || *s == '-')  s++; // sign
+    const char *first_non_digit = SkipDigits(s);
+    if (containsDigits(s, first_non_digit)) {
+      checkSeparator(TokLoc, s, CSK_BeforeDigits);
+      s = first_non_digit;
+    } else {
+      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
+              diag::err_exponent_has_no_digits);
+      hadError = true;
+      return;
+    }
+  }
+}
+
 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
 /// treat it as an invalid suffix.
@@ -752,21 +791,24 @@
     radix = 16;
     DigitsBegin = s;
     s = SkipHexDigits(s);
-    bool noSignificand = (s == DigitsBegin);
+    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
     if (s == ThisTokEnd) {
       // Done.
     } else if (*s == '.') {
       s++;
       saw_period = true;
       const char *floatDigitsBegin = s;
-      checkSeparator(TokLoc, s, CSK_BeforeDigits);
       s = SkipHexDigits(s);
-      noSignificand &= (floatDigitsBegin == s);
+      if (containsDigits(floatDigitsBegin, s))
+        HasSignificandDigits = true;
+      if (HasSignificandDigits)
+        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
     }
 
-    if (noSignificand) {
+    if (!HasSignificandDigits) {
       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
-        diag::err_hexconstant_requires) << 1;
+              diag::err_hex_constant_requires)
+          << PP.getLangOpts().CPlusPlus << 1;
       hadError = true;
       return;
     }
@@ -780,7 +822,7 @@
       saw_exponent = true;
       if (*s == '+' || *s == '-')  s++; // sign
       const char *first_non_digit = SkipDigits(s);
-      if (first_non_digit == s) {
+      if (!containsDigits(s, first_non_digit)) {
         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
                 diag::err_exponent_has_no_digits);
         hadError = true;
@@ -790,10 +832,15 @@
       s = first_non_digit;
 
       if (!PP.getLangOpts().HexFloats)
-        PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
+        PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus
+                            ? diag::ext_hex_literal_invalid
+                            : diag::ext_hex_constant_invalid);
+      else if (PP.getLangOpts().CPlusPlus1z)
+        PP.Diag(TokLoc, diag::warn_cxx1z_hex_literal);
     } else if (saw_period) {
-      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
-              diag::err_hexconstant_requires) << 0;
+      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
+              diag::err_hex_constant_requires)
+          << PP.getLangOpts().CPlusPlus << 0;
       hadError = true;
     }
     return;
@@ -843,40 +890,7 @@
     }
   }
 
-  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
-  // the code is using an incorrect base.
-  if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
-    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
-            diag::err_invalid_digit) << StringRef(s, 1) << 1;
-    hadError = true;
-    return;
-  }
-
-  if (*s == '.') {
-    s++;
-    radix = 10;
-    saw_period = true;
-    checkSeparator(TokLoc, s, CSK_BeforeDigits);
-    s = SkipDigits(s); // Skip suffix.
-  }
-  if (*s == 'e' || *s == 'E') { // exponent
-    checkSeparator(TokLoc, s, CSK_AfterDigits);
-    const char *Exponent = s;
-    s++;
-    radix = 10;
-    saw_exponent = true;
-    if (*s == '+' || *s == '-')  s++; // sign
-    const char *first_non_digit = SkipDigits(s);
-    if (first_non_digit != s) {
-      checkSeparator(TokLoc, s, CSK_BeforeDigits);
-      s = first_non_digit;
-    } else {
-      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
-              diag::err_exponent_has_no_digits);
-      hadError = true;
-      return;
-    }
-  }
+  ParseDecimalOrOctalCommon(TokLoc);
 }
 
 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
diff --git a/lib/Lex/Makefile b/lib/Lex/Makefile
deleted file mode 100644
index d80fb55..0000000
--- a/lib/Lex/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- clang/lib/Lex/Makefile ------------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-#  This implements the Lexer library for the C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-include $(CLANG_LEVEL)/../../Makefile.config
-
-LIBRARYNAME := clangLex
-
-ifeq ($(ARCH),PowerPC)
-CXX.Flags += -maltivec
-endif
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Lex/ModuleMap.cpp b/lib/Lex/ModuleMap.cpp
index deddeb4..9d3cff6 100644
--- a/lib/Lex/ModuleMap.cpp
+++ b/lib/Lex/ModuleMap.cpp
@@ -89,16 +89,13 @@
                      HeaderSearch &HeaderInfo)
     : SourceMgr(SourceMgr), Diags(Diags), LangOpts(LangOpts), Target(Target),
       HeaderInfo(HeaderInfo), BuiltinIncludeDir(nullptr),
-      CompilingModule(nullptr), SourceModule(nullptr), NumCreatedModules(0) {
+      SourceModule(nullptr), NumCreatedModules(0) {
   MMapLangOpts.LineComment = true;
 }
 
 ModuleMap::~ModuleMap() {
-  for (llvm::StringMap<Module *>::iterator I = Modules.begin(), 
-                                        IEnd = Modules.end();
-       I != IEnd; ++I) {
-    delete I->getValue();
-  }
+  for (auto &M : Modules)
+    delete M.getValue();
 }
 
 void ModuleMap::setTarget(const TargetInfo &Target) {
@@ -212,29 +209,25 @@
 
 static bool violatesPrivateInclude(Module *RequestingModule,
                                    const FileEntry *IncFileEnt,
-                                   ModuleMap::ModuleHeaderRole Role,
-                                   Module *RequestedModule) {
-  bool IsPrivateRole = Role & ModuleMap::PrivateHeader;
+                                   ModuleMap::KnownHeader Header) {
 #ifndef NDEBUG
-  if (IsPrivateRole) {
+  if (Header.getRole() & ModuleMap::PrivateHeader) {
     // Check for consistency between the module header role
     // as obtained from the lookup and as obtained from the module.
     // This check is not cheap, so enable it only for debugging.
     bool IsPrivate = false;
     SmallVectorImpl<Module::Header> *HeaderList[] = {
-        &RequestedModule->Headers[Module::HK_Private],
-        &RequestedModule->Headers[Module::HK_PrivateTextual]};
+        &Header.getModule()->Headers[Module::HK_Private],
+        &Header.getModule()->Headers[Module::HK_PrivateTextual]};
     for (auto *Hs : HeaderList)
       IsPrivate |=
           std::find_if(Hs->begin(), Hs->end(), [&](const Module::Header &H) {
             return H.Entry == IncFileEnt;
           }) != Hs->end();
-    assert((!IsPrivateRole || IsPrivate) && "inconsistent headers and roles");
+    assert(IsPrivate && "inconsistent headers and roles");
   }
 #endif
-  return IsPrivateRole && (!RequestingModule ||
-                           RequestedModule->getTopLevelModule() !=
-                               RequestingModule->getTopLevelModule());
+  return !Header.isAccessibleFrom(RequestingModule);
 }
 
 static Module *getTopLevelOrNull(Module *M) {
@@ -242,6 +235,7 @@
 }
 
 void ModuleMap::diagnoseHeaderInclusion(Module *RequestingModule,
+                                        bool RequestingModuleIsModuleInterface,
                                         SourceLocation FilenameLoc,
                                         StringRef Filename,
                                         const FileEntry *File) {
@@ -261,8 +255,7 @@
   if (Known != Headers.end()) {
     for (const KnownHeader &Header : Known->second) {
       // Remember private headers for later printing of a diagnostic.
-      if (violatesPrivateInclude(RequestingModule, File, Header.getRole(),
-                                 Header.getModule())) {
+      if (violatesPrivateInclude(RequestingModule, File, Header)) {
         Private = Header.getModule();
         continue;
       }
@@ -304,7 +297,9 @@
   if (LangOpts.ModulesStrictDeclUse) {
     Diags.Report(FilenameLoc, diag::err_undeclared_use_of_module)
         << RequestingModule->getFullModuleName() << Filename;
-  } else if (RequestingModule) {
+  } else if (RequestingModule && RequestingModuleIsModuleInterface &&
+             LangOpts.CompilingModule) {
+    // Only diagnose non-modular includes while compiling a module.
     diag::kind DiagID = RequestingModule->getTopLevelModule()->IsFramework ?
         diag::warn_non_modular_include_in_framework_module :
         diag::warn_non_modular_include_in_module;
@@ -344,8 +339,8 @@
     ModuleMap::KnownHeader Result;
     // Iterate over all modules that 'File' is part of to find the best fit.
     for (KnownHeader &H : Known->second) {
-      // Prefer a header from the current module over all others.
-      if (H.getModule()->getTopLevelModule() == CompilingModule)
+      // Prefer a header from the source module over all others.
+      if (H.getModule()->getTopLevelModule() == SourceModule)
         return MakeResult(H);
       if (!Result || isBetterKnownHeader(H, Result))
         Result = H;
@@ -557,17 +552,11 @@
   // Create a new module with this name.
   Module *Result = new Module(Name, SourceLocation(), Parent,
                               IsFramework, IsExplicit, NumCreatedModules++);
-  if (LangOpts.CurrentModule == Name) {
-    SourceModule = Result;
-    SourceModuleName = Name;
-  }
   if (!Parent) {
+    if (LangOpts.CurrentModule == Name)
+      SourceModule = Result;
     Modules[Name] = Result;
     ModuleScopeIDs[Result] = CurrentModuleScopeID;
-    if (!LangOpts.CurrentModule.empty() && !CompilingModule &&
-        Name == LangOpts.CurrentModule) {
-      CompilingModule = Result;
-    }
   }
   return std::make_pair(Result, true);
 }
@@ -711,9 +700,10 @@
     ModuleScopeIDs[Result] = CurrentModuleScopeID;
   InferredModuleAllowedBy[Result] = ModuleMapFile;
   Result->IsInferred = true;
-  if (LangOpts.CurrentModule == ModuleName) {
-    SourceModule = Result;
-    SourceModuleName = ModuleName;
+  if (!Parent) {
+    if (LangOpts.CurrentModule == ModuleName)
+      SourceModule = Result;
+    Modules[ModuleName] = Result;
   }
 
   Result->IsSystem |= Attrs.IsSystem;
@@ -721,9 +711,6 @@
   Result->ConfigMacrosExhaustive |= Attrs.IsExhaustive;
   Result->Directory = FrameworkDir;
 
-  if (!Parent)
-    Modules[ModuleName] = Result;
-  
   // umbrella header "umbrella-header-name"
   //
   // The "Headers/" component of the name is implied because this is
@@ -836,7 +823,8 @@
   HeaderList.push_back(KH);
   Mod->Headers[headerRoleToKind(Role)].push_back(std::move(Header));
 
-  bool isCompilingModuleHeader = Mod->getTopLevelModule() == CompilingModule;
+  bool isCompilingModuleHeader =
+      LangOpts.CompilingModule && Mod->getTopLevelModule() == SourceModule;
   if (!Imported || isCompilingModuleHeader) {
     // When we import HeaderFileInfo, the external source is expected to
     // set the isModuleHeader flag itself.
@@ -948,6 +936,9 @@
   if (Loc.isInvalid())
     return nullptr;
 
+  if (UmbrellaDirs.empty() && Headers.empty())
+    return nullptr;
+
   // Use the expansion location to determine which module we're in.
   FullSourceLoc ExpansionLoc = Loc.getExpansionLoc();
   if (!ExpansionLoc.isFileID())
@@ -1439,7 +1430,9 @@
   
   // Parse the optional attribute list.
   Attributes Attrs;
-  parseOptionalAttributes(Attrs);
+  if (parseOptionalAttributes(Attrs))
+    return;
+
   
   // Parse the opening brace.
   if (!Tok.is(MMToken::LBrace)) {
@@ -2119,7 +2112,9 @@
 
   // Parse the optional attributes.
   Attributes Attrs;
-  parseOptionalAttributes(Attrs);
+  if (parseOptionalAttributes(Attrs))
+    return;
+
   if (Attrs.IsExhaustive && !ActiveModule->Parent) {
     ActiveModule->ConfigMacrosExhaustive = true;
   }
@@ -2267,7 +2262,8 @@
 
   // Parse optional attributes.
   Attributes Attrs;
-  parseOptionalAttributes(Attrs);
+  if (parseOptionalAttributes(Attrs))
+    return;
 
   if (ActiveModule) {
     // Note that we have an inferred submodule.
diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index f5e1bc4..031829e 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/lib/Lex/PPDirectives.cpp
@@ -12,7 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "clang/Lex/Preprocessor.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/CodeCompletionHandler.h"
@@ -23,10 +22,15 @@
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/Pragma.h"
-#include "llvm/ADT/APInt.h"
+#include "clang/Lex/Preprocessor.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SaveAndRestore.h"
+
 using namespace clang;
 
 //===----------------------------------------------------------------------===//
@@ -136,6 +140,84 @@
   return MD_NoWarn;
 }
 
+// Return true if we want to issue a diagnostic by default if we
+// encounter this name in a #include with the wrong case. For now,
+// this includes the standard C and C++ headers, POSIX headers,
+// and Boost headers. Improper case for these #includes is a
+// potential portability issue.
+static bool warnByDefaultOnWrongCase(StringRef Include) {
+  // If the first component of the path is "boost", treat this like a standard
+  // header for the purposes of diagnostics.
+  if (::llvm::sys::path::begin(Include)->equals_lower("boost"))
+    return true;
+
+  // "condition_variable" is the longest standard header name at 18 characters.
+  // If the include file name is longer than that, it can't be a standard header.
+  static const size_t MaxStdHeaderNameLen = 18u;
+  if (Include.size() > MaxStdHeaderNameLen)
+    return false;
+
+  // Lowercase and normalize the search string.
+  SmallString<32> LowerInclude{Include};
+  for (char &Ch : LowerInclude) {
+    // In the ASCII range?
+    if (static_cast<unsigned char>(Ch) > 0x7f)
+      return false; // Can't be a standard header
+    // ASCII lowercase:
+    if (Ch >= 'A' && Ch <= 'Z')
+      Ch += 'a' - 'A';
+    // Normalize path separators for comparison purposes.
+    else if (::llvm::sys::path::is_separator(Ch))
+      Ch = '/';
+  }
+
+  // The standard C/C++ and POSIX headers.
+  return llvm::StringSwitch<bool>(LowerInclude)
+    // C library headers
+    .Cases("assert.h", "complex.h", "ctype.h", "errno.h", "fenv.h", true)
+    .Cases("float.h", "inttypes.h", "iso646.h", "limits.h", "locale.h", true)
+    .Cases("math.h", "setjmp.h", "signal.h", "stdalign.h", "stdarg.h", true)
+    .Cases("stdatomic.h", "stdbool.h", "stddef.h", "stdint.h", "stdio.h", true)
+    .Cases("stdlib.h", "stdnoreturn.h", "string.h", "tgmath.h", "threads.h", true)
+    .Cases("time.h", "uchar.h", "wchar.h", "wctype.h", true)
+
+    // C++ headers for C library facilities
+    .Cases("cassert", "ccomplex", "cctype", "cerrno", "cfenv", true)
+    .Cases("cfloat", "cinttypes", "ciso646", "climits", "clocale", true)
+    .Cases("cmath", "csetjmp", "csignal", "cstdalign", "cstdarg", true)
+    .Cases("cstdbool", "cstddef", "cstdint", "cstdio", "cstdlib", true)
+    .Cases("cstring", "ctgmath", "ctime", "cuchar", "cwchar", true)
+    .Case("cwctype", true)
+
+    // C++ library headers
+    .Cases("algorithm", "fstream", "list", "regex", "thread", true)
+    .Cases("array", "functional", "locale", "scoped_allocator", "tuple", true)
+    .Cases("atomic", "future", "map", "set", "type_traits", true)
+    .Cases("bitset", "initializer_list", "memory", "shared_mutex", "typeindex", true)
+    .Cases("chrono", "iomanip", "mutex", "sstream", "typeinfo", true)
+    .Cases("codecvt", "ios", "new", "stack", "unordered_map", true)
+    .Cases("complex", "iosfwd", "numeric", "stdexcept", "unordered_set", true)
+    .Cases("condition_variable", "iostream", "ostream", "streambuf", "utility", true)
+    .Cases("deque", "istream", "queue", "string", "valarray", true)
+    .Cases("exception", "iterator", "random", "strstream", "vector", true)
+    .Cases("forward_list", "limits", "ratio", "system_error", true)
+
+    // POSIX headers (which aren't also C headers)
+    .Cases("aio.h", "arpa/inet.h", "cpio.h", "dirent.h", "dlfcn.h", true)
+    .Cases("fcntl.h", "fmtmsg.h", "fnmatch.h", "ftw.h", "glob.h", true)
+    .Cases("grp.h", "iconv.h", "langinfo.h", "libgen.h", "monetary.h", true)
+    .Cases("mqueue.h", "ndbm.h", "net/if.h", "netdb.h", "netinet/in.h", true)
+    .Cases("netinet/tcp.h", "nl_types.h", "poll.h", "pthread.h", "pwd.h", true)
+    .Cases("regex.h", "sched.h", "search.h", "semaphore.h", "spawn.h", true)
+    .Cases("strings.h", "stropts.h", "sys/ipc.h", "sys/mman.h", "sys/msg.h", true)
+    .Cases("sys/resource.h", "sys/select.h",  "sys/sem.h", "sys/shm.h", "sys/socket.h", true)
+    .Cases("sys/stat.h", "sys/statvfs.h", "sys/time.h", "sys/times.h", "sys/types.h", true)
+    .Cases("sys/uio.h", "sys/un.h", "sys/utsname.h", "sys/wait.h", "syslog.h", true)
+    .Cases("tar.h", "termios.h", "trace.h", "ulimit.h", true)
+    .Cases("unistd.h", "utime.h", "utmpx.h", "wordexp.h", true)
+    .Default(false);
+}
+
 bool Preprocessor::CheckMacroName(Token &MacroNameTok, MacroUse isDefineUndef,
                                   bool *ShadowFlag) {
   // Missing macro name?
@@ -272,8 +354,6 @@
   }
 }
 
-
-
 /// SkipExcludedConditionalBlock - We just read a \#if or related directive and
 /// decided that the subsequent tokens are in the \#if'd out portion of the
 /// file.  Lex the rest of the file, until we see an \#endif.  If
@@ -310,7 +390,7 @@
       setCodeCompletionReached();
       continue;
     }
-    
+
     // If this is the end of the buffer, we have an error.
     if (Tok.is(tok::eof)) {
       // Emit errors for each unterminated conditional on the stack, including
@@ -497,7 +577,6 @@
 }
 
 void Preprocessor::PTHSkipExcludedConditionalBlock() {
-
   while (1) {
     assert(CurPTHLexer);
     assert(CurPTHLexer->LexingRawMode == false);
@@ -571,28 +650,27 @@
     }
 
     // Otherwise, skip this block and go to the next one.
-    continue;
   }
 }
 
 Module *Preprocessor::getModuleForLocation(SourceLocation Loc) {
-  ModuleMap &ModMap = HeaderInfo.getModuleMap();
-  if (SourceMgr.isInMainFile(Loc)) {
-    if (Module *CurMod = getCurrentModule())
-      return CurMod;                               // Compiling a module.
-    return HeaderInfo.getModuleMap().SourceModule; // Compiling a source.
+  if (!SourceMgr.isInMainFile(Loc)) {
+    // Try to determine the module of the include directive.
+    // FIXME: Look into directly passing the FileEntry from LookupFile instead.
+    FileID IDOfIncl = SourceMgr.getFileID(SourceMgr.getExpansionLoc(Loc));
+    if (const FileEntry *EntryOfIncl = SourceMgr.getFileEntryForID(IDOfIncl)) {
+      // The include comes from an included file.
+      return HeaderInfo.getModuleMap()
+          .findModuleForHeader(EntryOfIncl)
+          .getModule();
+    }
   }
-  // Try to determine the module of the include directive.
-  // FIXME: Look into directly passing the FileEntry from LookupFile instead.
-  FileID IDOfIncl = SourceMgr.getFileID(SourceMgr.getExpansionLoc(Loc));
-  if (const FileEntry *EntryOfIncl = SourceMgr.getFileEntryForID(IDOfIncl)) {
-    // The include comes from a file.
-    return ModMap.findModuleForHeader(EntryOfIncl).getModule();
-  } else {
-    // The include does not come from a file,
-    // so it is probably a module compilation.
-    return getCurrentModule();
-  }
+
+  // This is either in the main file or not in a file at all. It belongs
+  // to the current module, if there is one.
+  return getLangOpts().CurrentModule.empty()
+             ? nullptr
+             : HeaderInfo.lookupModule(getLangOpts().CurrentModule);
 }
 
 Module *Preprocessor::getModuleContainingLocation(SourceLocation Loc) {
@@ -600,6 +678,62 @@
       FullSourceLoc(Loc, SourceMgr));
 }
 
+const FileEntry *
+Preprocessor::getModuleHeaderToIncludeForDiagnostics(SourceLocation IncLoc,
+                                                     SourceLocation Loc) {
+  // If we have a module import syntax, we shouldn't include a header to
+  // make a particular module visible.
+  if (getLangOpts().ObjC2)
+    return nullptr;
+
+  // Figure out which module we'd want to import.
+  Module *M = getModuleContainingLocation(Loc);
+  if (!M)
+    return nullptr;
+
+  Module *TopM = M->getTopLevelModule();
+  Module *IncM = getModuleForLocation(IncLoc);
+
+  // Walk up through the include stack, looking through textual headers of M
+  // until we hit a non-textual header that we can #include. (We assume textual
+  // headers of a module with non-textual headers aren't meant to be used to
+  // import entities from the module.)
+  auto &SM = getSourceManager();
+  while (!Loc.isInvalid() && !SM.isInMainFile(Loc)) {
+    auto ID = SM.getFileID(SM.getExpansionLoc(Loc));
+    auto *FE = SM.getFileEntryForID(ID);
+
+    bool InTextualHeader = false;
+    for (auto Header : HeaderInfo.getModuleMap().findAllModulesForHeader(FE)) {
+      if (!Header.getModule()->isSubModuleOf(TopM))
+        continue;
+
+      if (!(Header.getRole() & ModuleMap::TextualHeader)) {
+        // If this is an accessible, non-textual header of M's top-level module
+        // that transitively includes the given location and makes the
+        // corresponding module visible, this is the thing to #include.
+        if (Header.isAccessibleFrom(IncM))
+          return FE;
+
+        // It's in a private header; we can't #include it.
+        // FIXME: If there's a public header in some module that re-exports it,
+        // then we could suggest including that, but it's not clear that's the
+        // expected way to make this entity visible.
+        continue;
+      }
+
+      InTextualHeader = true;
+    }
+
+    if (!InTextualHeader)
+      break;
+
+    Loc = SM.getIncludeLoc(ID);
+  }
+
+  return nullptr;
+}
+
 const FileEntry *Preprocessor::LookupFile(
     SourceLocation FilenameLoc,
     StringRef Filename,
@@ -611,7 +745,8 @@
     SmallVectorImpl<char> *RelativePath,
     ModuleMap::KnownHeader *SuggestedModule,
     bool SkipCache) {
-  Module *RequestingModule = getModuleForLocation(FilenameLoc); 
+  Module *RequestingModule = getModuleForLocation(FilenameLoc);
+  bool RequestingModuleIsModuleInterface = !SourceMgr.isInMainFile(FilenameLoc);
 
   // If the header lookup mechanism may be relative to the current inclusion
   // stack, record the parent #includes.
@@ -689,7 +824,8 @@
   if (FE) {
     if (SuggestedModule && !LangOpts.AsmPreprocessor)
       HeaderInfo.getModuleMap().diagnoseHeaderInclusion(
-          RequestingModule, FilenameLoc, Filename, FE);
+          RequestingModule, RequestingModuleIsModuleInterface, FilenameLoc,
+          Filename, FE);
     return FE;
   }
 
@@ -705,7 +841,8 @@
                                                     SuggestedModule))) {
         if (SuggestedModule && !LangOpts.AsmPreprocessor)
           HeaderInfo.getModuleMap().diagnoseHeaderInclusion(
-              RequestingModule, FilenameLoc, Filename, FE);
+              RequestingModule, RequestingModuleIsModuleInterface, FilenameLoc,
+              Filename, FE);
         return FE;
       }
     }
@@ -720,7 +857,8 @@
                 RequestingModule, SuggestedModule))) {
           if (SuggestedModule && !LangOpts.AsmPreprocessor)
             HeaderInfo.getModuleMap().diagnoseHeaderInclusion(
-                RequestingModule, FilenameLoc, Filename, FE);
+                RequestingModule, RequestingModuleIsModuleInterface,
+                FilenameLoc, Filename, FE);
           return FE;
         }
       }
@@ -731,7 +869,6 @@
   return nullptr;
 }
 
-
 //===----------------------------------------------------------------------===//
 // Preprocessor Directive Handling.
 //===----------------------------------------------------------------------===//
@@ -743,9 +880,11 @@
     if (pp->MacroExpansionInDirectivesOverride)
       pp->DisableMacroExpansion = false;
   }
+
   ~ResetMacroExpansionHelper() {
     PP->DisableMacroExpansion = save;
   }
+
 private:
   Preprocessor *PP;
   bool save;
@@ -852,7 +991,7 @@
       return HandleIncludeDirective(SavedHash.getLocation(), Result);
     case tok::pp___include_macros:
       // Handle -imacros.
-      return HandleIncludeMacrosDirective(SavedHash.getLocation(), Result); 
+      return HandleIncludeMacrosDirective(SavedHash.getLocation(), Result);
 
     // C99 6.10.3 - Macro Replacement.
     case tok::pp_define:
@@ -891,12 +1030,12 @@
     case tok::pp_unassert:
       //isExtension = true;  // FIXME: implement #unassert
       break;
-        
+
     case tok::pp___public_macro:
       if (getLangOpts().Modules)
         return HandleMacroPublicDirective(Result);
       break;
-        
+
     case tok::pp___private_macro:
       if (getLangOpts().Modules)
         return HandleMacroPrivateDirective(Result);
@@ -910,20 +1049,20 @@
   // various pseudo-ops.  Just return the # token and push back the following
   // token to be lexed next time.
   if (getLangOpts().AsmPreprocessor) {
-    Token *Toks = new Token[2];
+    auto Toks = llvm::make_unique<Token[]>(2);
     // Return the # and the token after it.
     Toks[0] = SavedHash;
     Toks[1] = Result;
-    
+
     // If the second token is a hashhash token, then we need to translate it to
     // unknown so the token lexer doesn't try to perform token pasting.
     if (Result.is(tok::hashhash))
       Toks[1].setKind(tok::unknown);
-    
+
     // Enter this token stream so that we re-lex the tokens.  Make sure to
     // enable macro expansion, in case the token after the # is an identifier
     // that is expanded.
-    EnterTokenStream(Toks, 2, false, true);
+    EnterTokenStream(std::move(Toks), 2, false);
     return;
   }
 
@@ -956,7 +1095,7 @@
   unsigned ActualLength = PP.getSpelling(DigitTok, DigitTokBegin, &Invalid);
   if (Invalid)
     return true;
-  
+
   // Verify that we have a simple digit-sequence, and compute the value.  This
   // is always a simple digit string computed in decimal, so we do this manually
   // here.
@@ -1007,7 +1146,7 @@
   unsigned LineNo;
   if (GetLineValue(DigitTok, LineNo, diag::err_pp_line_requires_integer,*this))
     return;
-  
+
   if (LineNo == 0)
     Diag(DigitTok, diag::ext_pp_line_zero);
 
@@ -1090,7 +1229,7 @@
     PresumedLoc PLoc = SM.getPresumedLoc(FlagTok.getLocation());
     if (PLoc.isInvalid())
       return true;
-    
+
     // If there is no include loc (main file) or if the include loc is in a
     // different physical file, then we aren't in a "1" line marker flag region.
     SourceLocation IncLoc = PLoc.getIncludeLoc();
@@ -1213,7 +1352,6 @@
   }
 }
 
-
 /// HandleUserDiagnosticDirective - Handle a #warning or #error directive.
 ///
 void Preprocessor::HandleUserDiagnosticDirective(Token &Tok,
@@ -1279,7 +1417,7 @@
 void Preprocessor::HandleMacroPublicDirective(Token &Tok) {
   Token MacroNameTok;
   ReadMacroName(MacroNameTok, MU_Undef);
-  
+
   // Error reading macro name?  If so, diagnostic already issued.
   if (MacroNameTok.is(tok::eod))
     return;
@@ -1290,13 +1428,13 @@
   IdentifierInfo *II = MacroNameTok.getIdentifierInfo();
   // Okay, we finally have a valid identifier to undef.
   MacroDirective *MD = getLocalMacroDirective(II);
-  
+
   // If the macro is not defined, this is an error.
   if (!MD) {
     Diag(MacroNameTok, diag::err_pp_visibility_non_macro) << II;
     return;
   }
-  
+
   // Note that this macro has now been exported.
   appendMacroDirective(II, AllocateVisibilityMacroDirective(
                                 MacroNameTok.getLocation(), /*IsPublic=*/true));
@@ -1306,24 +1444,24 @@
 void Preprocessor::HandleMacroPrivateDirective(Token &Tok) {
   Token MacroNameTok;
   ReadMacroName(MacroNameTok, MU_Undef);
-  
+
   // Error reading macro name?  If so, diagnostic already issued.
   if (MacroNameTok.is(tok::eod))
     return;
-  
+
   // Check to see if this is the last token on the #__private_macro line.
   CheckEndOfDirective("__private_macro");
-  
+
   IdentifierInfo *II = MacroNameTok.getIdentifierInfo();
   // Okay, we finally have a valid identifier to undef.
   MacroDirective *MD = getLocalMacroDirective(II);
-  
+
   // If the macro is not defined, this is an error.
   if (!MD) {
     Diag(MacroNameTok, diag::err_pp_visibility_non_macro) << II;
     return;
   }
-  
+
   // Note that this macro has now been marked private.
   appendMacroDirective(II, AllocateVisibilityMacroDirective(
                                MacroNameTok.getLocation(), /*IsPublic=*/false));
@@ -1398,7 +1536,7 @@
   Lex(CurTok);
   while (CurTok.isNot(tok::eod)) {
     End = CurTok.getLocation();
-    
+
     // FIXME: Provide code completion for #includes.
     if (CurTok.is(tok::code_completion)) {
       setCodeCompletionReached();
@@ -1445,13 +1583,13 @@
                                  tok::TokenKind Kind, void *AnnotationVal) {
   // FIXME: Produce this as the current token directly, rather than
   // allocating a new token for it.
-  Token *Tok = new Token[1];
+  auto Tok = llvm::make_unique<Token[]>(1);
   Tok[0].startToken();
   Tok[0].setKind(Kind);
   Tok[0].setLocation(Begin);
   Tok[0].setAnnotationEndLoc(End);
   Tok[0].setAnnotationValue(AnnotationVal);
-  PP.EnterTokenStream(Tok, 1, true, true);
+  PP.EnterTokenStream(std::move(Tok), 1, true);
 }
 
 /// \brief Produce a diagnostic informing the user that a #include or similar
@@ -1469,24 +1607,24 @@
     PathString += Path[I].first->getName();
   }
   int IncludeKind = 0;
-  
+
   switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) {
   case tok::pp_include:
     IncludeKind = 0;
     break;
-    
+
   case tok::pp_import:
     IncludeKind = 1;
-    break;        
-      
+    break;
+
   case tok::pp_include_next:
     IncludeKind = 2;
     break;
-      
+
   case tok::pp___include_macros:
     IncludeKind = 3;
     break;
-      
+
   default:
     llvm_unreachable("unknown include directive kind");
   }
@@ -1499,17 +1637,49 @@
                                       ("@import " + PathString + ";").str());
 }
 
+// Compare Components, back to front, against the components of RealPathName
+// (the file's real path) and respell in place each one differing only by case.
+// Returns true iff a component was respelled and none differed beyond case.
+static bool trySimplifyPath(SmallVectorImpl<StringRef> &Components,
+                            StringRef RealPathName) {
+  auto RealPathComponentIter = llvm::sys::path::rbegin(RealPathName);
+  auto RealPathComponentEnd = llvm::sys::path::rend(RealPathName);
+  int Cnt = 0; // ".." seen so far that have not yet cancelled a component.
+  bool SuggestReplacement = false;
+  // Below is a best-effort attempt to handle ".." in paths. It is admittedly
+  // not 100% correct in the presence of symlinks.
+  for (auto &Component : llvm::reverse(Components)) {
+    if ("." == Component) { // Skipped; the real-path iterator stays put.
+    } else if (".." == Component) {
+      ++Cnt;
+    } else if (Cnt) { // Cancelled by a following "..": skip, uncompared.
+      --Cnt;
+    } else if (RealPathComponentIter != RealPathComponentEnd) {
+      if (Component != *RealPathComponentIter) {
+        // If these path components differ by more than just case, then we
+        // may be looking at symlinked paths. Bail out (returning false, even
+        // if earlier components were already respelled) to avoid noise.
+        SuggestReplacement = RealPathComponentIter->equals_lower(Component);
+        if (!SuggestReplacement)
+          break;
+        Component = *RealPathComponentIter;
+      }
+      ++RealPathComponentIter;
+    }
+  }
+  return SuggestReplacement;
+}
+
 /// HandleIncludeDirective - The "\#include" tokens have just been read, read
 /// the file to be included from the lexer, then include it!  This is a common
 /// routine with functionality shared between \#include, \#include_next and
 /// \#import.  LookupFrom is set when this is a \#include_next directive, it
 /// specifies the file to start searching from.
-void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc, 
+void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
                                           Token &IncludeTok,
                                           const DirectoryLookup *LookupFrom,
                                           const FileEntry *LookupFromFile,
                                           bool isImport) {
-
   Token FilenameTok;
   CurPPLexer->LexIncludeFilename(FilenameTok);
 
@@ -1518,7 +1688,7 @@
   StringRef Filename;
   SourceLocation End;
   SourceLocation CharEnd; // the end of this directive, in characters
-  
+
   switch (FilenameTok.getKind()) {
   case tok::eod:
     // If the token kind is EOD, the error has already been diagnosed.
@@ -1589,8 +1759,8 @@
   }
 
   if (HeaderInfo.HasIncludeAliasMap()) {
-    // Map the filename with the brackets still attached.  If the name doesn't 
-    // map to anything, fall back on the filename we've already gotten the 
+    // Map the filename with the brackets still attached.  If the name doesn't
+    // map to anything, fall back on the filename we've already gotten the
     // spelling for.
     StringRef NewName = HeaderInfo.MapHeaderToIncludeAlias(OriginalFilename);
     if (!NewName.empty())
@@ -1627,7 +1797,7 @@
           // Add the recovery path to the list of search paths.
           DirectoryLookup DL(DE, SrcMgr::C_User, false);
           HeaderInfo.AddSearchPath(DL, isAngled);
-          
+
           // Try the lookup again, skipping the cache.
           File = LookupFile(
               FilenameLoc,
@@ -1639,7 +1809,7 @@
     }
 
     if (!SuppressIncludeNotFoundError) {
-      // If the file could not be located and it was included via angle 
+      // If the file could not be located and it was included via angle
       // brackets, we can attempt a lookup as though it were a quoted path to
       // provide the user with a possible fixit.
       if (isAngled) {
@@ -1652,8 +1822,8 @@
             &SuggestedModule);
         if (File) {
           SourceRange Range(FilenameTok.getLocation(), CharEnd);
-          Diag(FilenameTok, diag::err_pp_file_not_found_not_fatal) << 
-            Filename << 
+          Diag(FilenameTok, diag::err_pp_file_not_found_not_fatal) <<
+            Filename <<
             FixItHint::CreateReplacement(Range, "\"" + Filename.str() + "\"");
         }
       }
@@ -1675,13 +1845,15 @@
   // are processing this module textually (because we're building the module).
   if (File && SuggestedModule && getLangOpts().Modules &&
       SuggestedModule.getModule()->getTopLevelModuleName() !=
-          getLangOpts().CurrentModule &&
-      SuggestedModule.getModule()->getTopLevelModuleName() !=
-          getLangOpts().ImplementationOfModule) {
-
+          getLangOpts().CurrentModule) {
     // If this include corresponds to a module but that module is
     // unavailable, diagnose the situation and bail out.
-    if (!SuggestedModule.getModule()->isAvailable()) {
+    // FIXME: Remove this; loadModule does the same check (but produces
+    // slightly worse diagnostics).
+    if (!SuggestedModule.getModule()->isAvailable() &&
+        !SuggestedModule.getModule()
+             ->getTopLevelModule()
+             ->HasIncompatibleModuleFile) {
       clang::Module::Requirement Requirement;
       clang::Module::UnresolvedHeaderDirective MissingHeader;
       clang::Module *ShadowingModule = nullptr;
@@ -1718,7 +1890,7 @@
     // We only do this in Objective-C, where we have a module-import syntax.
     if (getLangOpts().ObjC2)
       diagnoseAutoModuleImport(*this, HashLoc, IncludeTok, Path, CharEnd);
-    
+
     // Load the module to import its macros. We'll make the declarations
     // visible when the parser gets here.
     // FIXME: Pass SuggestedModule in here rather than converting it to a path
@@ -1766,7 +1938,7 @@
 
   if (!File)
     return;
-  
+
   // The #included file will be considered to be a system header if either it is
   // in a system include directory, or if the #includer is a system include
   // header.
@@ -1777,6 +1949,39 @@
   // FIXME: If we have a suggested module, and we've already visited this file,
   // don't bother entering it again. We know it has no further effect.
 
+  // Issue a diagnostic if the name of the file on disk has a different case
+  // than the one we're about to open.
+  const bool CheckIncludePathPortability =
+    File && !File->tryGetRealPathName().empty();
+
+  if (CheckIncludePathPortability) {
+    StringRef Name = LangOpts.MSVCCompat ? NormalizedPath.str() : Filename;
+    StringRef RealPathName = File->tryGetRealPathName();
+    SmallVector<StringRef, 16> Components(llvm::sys::path::begin(Name),
+                                          llvm::sys::path::end(Name));
+
+    if (trySimplifyPath(Components, RealPathName)) {
+      SmallString<128> Path;
+      Path.reserve(Name.size()+2);
+      Path.push_back(isAngled ? '<' : '"');
+      for (auto Component : Components) {
+        Path.append(Component);
+        // Append the separator the user used, or the close quote
+        Path.push_back(
+          Path.size() <= Filename.size() ? Filename[Path.size()-1] :
+            (isAngled ? '>' : '"'));
+      }
+      auto Replacement = Path.str().str();
+      // User files and known standard headers get a diagnostic by default.
+      // Other system headers do not: they use a separately controlled ID.
+      auto DiagId = (FileCharacter == SrcMgr::C_User || warnByDefaultOnWrongCase(Name)) ?
+          diag::pp_nonportable_path : diag::pp_nonportable_system_path;
+      SourceRange Range(FilenameTok.getLocation(), CharEnd);
+      Diag(FilenameTok, DiagId) << Replacement <<
+        FixItHint::CreateReplacement(Range, Replacement);
+    }
+  }
+
   // Ask HeaderInfo if we should enter this #include file.  If not, #including
   // this file will have no effect.
   if (ShouldEnter &&
@@ -1870,7 +2075,7 @@
   // so we can continue processing from there.
   Diag(Tok, diag::err_pp_import_directive_ms );
 
-  // Read tokens until we get to the end of the directive.  Note that the 
+  // Read tokens until we get to the end of the directive.  Note that the
   // directive can be split over multiple lines using the backslash character.
   DiscardUntilEndOfDirective();
 }
@@ -1937,7 +2142,7 @@
       return true;
     case tok::ellipsis:  // #define X(... -> C99 varargs
       if (!LangOpts.C99)
-        Diag(Tok, LangOpts.CPlusPlus11 ? 
+        Diag(Tok, LangOpts.CPlusPlus11 ?
              diag::warn_cxx98_compat_variadic_macro :
              diag::ext_variadic_macro);
 
@@ -2154,7 +2359,6 @@
       // Get the next token of the macro.
       LexUnexpandedToken(Tok);
     }
-
   } else {
     // Otherwise, read the body of a function-like macro.  While we are at it,
     // check C99 6.10.3.2p1: ensure that # operators are followed by macro
@@ -2162,7 +2366,7 @@
     while (Tok.isNot(tok::eod)) {
       LastTok = Tok;
 
-      if (Tok.isNot(tok::hash) && Tok.isNot(tok::hashhash)) {
+      if (!Tok.isOneOf(tok::hash, tok::hashat, tok::hashhash)) {
         MI->AddTokenToBody(Tok);
 
         // Get the next token of the macro.
@@ -2183,11 +2387,10 @@
       }
 
       if (Tok.is(tok::hashhash)) {
-        
         // If we see token pasting, check if it looks like the gcc comma
         // pasting extension.  We'll use this information to suppress
         // diagnostics later on.
-        
+
         // Get the next token of the macro.
         LexUnexpandedToken(Tok);
 
@@ -2222,7 +2425,8 @@
           MI->AddTokenToBody(LastTok);
           continue;
         } else {
-          Diag(Tok, diag::err_pp_stringize_not_parameter);
+          Diag(Tok, diag::err_pp_stringize_not_parameter)
+            << LastTok.is(tok::hashat);
 
           // Disable __VA_ARGS__ again.
           Ident__VA_ARGS__->setIsPoisoned(true);
@@ -2299,7 +2503,7 @@
       if (!OtherMI->isUsed() && OtherMI->isWarnIfUnused())
         Diag(OtherMI->getDefinitionLoc(), diag::pp_macro_not_used);
 
-      // Warn if defining "__LINE__" and other builtins, per C99 6.10.8/4 and 
+      // Warn if defining "__LINE__" and other builtins, per C99 6.10.8/4 and
       // C++ [cpp.predefined]p4, but allow it as an extension.
       if (OtherMI->isBuiltinMacro())
         Diag(MacroNameTok, diag::ext_pp_redef_builtin_macro);
@@ -2372,7 +2576,6 @@
                        AllocateUndefMacroDirective(MacroNameTok.getLocation()));
 }
 
-
 //===----------------------------------------------------------------------===//
 // Preprocessor Conditional Directive Handling.
 //===----------------------------------------------------------------------===//
@@ -2561,7 +2764,7 @@
 
   // If this is a #elif with a #else before it, report the error.
   if (CI.FoundElse) Diag(ElifToken, diag::pp_err_elif_after_else);
-  
+
   if (Callbacks)
     Callbacks->Elif(ElifToken.getLocation(),
                     SourceRange(ConditionalBegin, ConditionalEnd),
diff --git a/lib/Lex/PPExpressions.cpp b/lib/Lex/PPExpressions.cpp
index c40598c..94075ec 100644
--- a/lib/Lex/PPExpressions.cpp
+++ b/lib/Lex/PPExpressions.cpp
@@ -33,12 +33,18 @@
 /// conditional and the source range covered by it.
 class PPValue {
   SourceRange Range;
+  IdentifierInfo *II = nullptr; // Null unless set via setIdentifier().
 public:
   llvm::APSInt Val;
 
   // Default ctor - Construct an 'invalid' PPValue.
   PPValue(unsigned BitWidth) : Val(BitWidth) {}
 
+  // If this value was produced by directly evaluating an identifier, produce
+  // that identifier.
+  IdentifierInfo *getIdentifier() const { return II; }
+  void setIdentifier(IdentifierInfo *II) { this->II = II; }
+
   unsigned getBitWidth() const { return Val.getBitWidth(); }
   bool isUnsigned() const { return Val.isUnsigned(); }
 
@@ -140,6 +146,51 @@
     PP.LexNonComment(PeekTok);
   }
 
+  // [cpp.cond]p4:
+  //   Prior to evaluation, macro invocations in the list of preprocessing
+  //   tokens that will become the controlling constant expression are replaced
+  //   (except for those macro names modified by the 'defined' unary operator),
+  //   just as in normal text. If the token 'defined' is generated as a result
+  //   of this replacement process or use of the 'defined' unary operator does
+  //   not match one of the two specified forms prior to macro replacement, the
+  //   behavior is undefined.
+  // This isn't an idle threat; consider this program:
+  //   #define FOO
+  //   #define BAR defined(FOO)
+  //   #if BAR
+  //   ...
+  //   #else
+  //   ...
+  //   #endif
+  // clang and gcc will pick the #if branch while Visual Studio will take the
+  // #else branch.  Emit a warning about this undefined behavior.
+  if (beginLoc.isMacroID()) {
+    bool IsFunctionTypeMacro =
+        PP.getSourceManager()
+            .getSLocEntry(PP.getSourceManager().getFileID(beginLoc))
+            .getExpansion()
+            .isFunctionMacroExpansion();
+    // For object-type macros, it's easy to replace
+    //   #define FOO defined(BAR)
+    // with
+    //   #if defined(BAR)
+    //   #define FOO 1
+    //   #else
+    //   #define FOO 0
+    //   #endif
+    // and doing so makes sense since compilers handle this differently in
+    // practice (see example further up).  But for function-type macros,
+    // there is no good way to write
+    //   # define FOO(x) (defined(M_ ## x) && M_ ## x)
+    // in a different way, and compilers seem to agree on how to behave here.
+    // So warn by default on object-type macros, but only warn in -pedantic
+    // mode on function-type macros.
+    if (IsFunctionTypeMacro)
+      PP.Diag(beginLoc, diag::warn_defined_in_function_type_macro);
+    else
+      PP.Diag(beginLoc, diag::warn_defined_in_object_type_macro);
+  }
+
   // Invoke the 'defined' callback.
   if (PPCallbacks *Callbacks = PP.getPPCallbacks()) {
     Callbacks->Defined(macroToken, Macro,
@@ -164,6 +215,8 @@
                           bool ValueLive, Preprocessor &PP) {
   DT.State = DefinedTracker::Unknown;
 
+  Result.setIdentifier(nullptr);
+
   if (PeekTok.is(tok::code_completion)) {
     if (PP.getCodeCompletionHandler())
       PP.getCodeCompletionHandler()->CodeCompletePreprocessorExpression();
@@ -177,8 +230,8 @@
   if (IdentifierInfo *II = PeekTok.getIdentifierInfo()) {
     // Handle "defined X" and "defined(X)".
     if (II->isStr("defined"))
-      return(EvaluateDefined(Result, PeekTok, DT, ValueLive, PP));
-    
+      return EvaluateDefined(Result, PeekTok, DT, ValueLive, PP);
+
     // If this identifier isn't 'defined' or one of the special
     // preprocessor keywords and it wasn't macro expanded, it turns
     // into a simple 0, unless it is the C++ keyword "true", in which case it
@@ -189,6 +242,7 @@
       PP.Diag(PeekTok, diag::warn_pp_undef_identifier) << II;
     Result.Val = II->getTokenID() == tok::kw_true;
     Result.Val.setIsUnsigned(false);  // "0" is signed intmax_t 0.
+    Result.setIdentifier(II);
     Result.setRange(PeekTok.getLocation());
     PP.LexNonComment(PeekTok);
     return false;
@@ -347,6 +401,7 @@
       DT.State = DefinedTracker::Unknown;
     }
     Result.setRange(Start, PeekTok.getLocation());
+    Result.setIdentifier(nullptr);
     PP.LexNonComment(PeekTok);  // Eat the ).
     return false;
   }
@@ -356,6 +411,7 @@
     PP.LexNonComment(PeekTok);
     if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
     Result.setBegin(Start);
+    Result.setIdentifier(nullptr);
     return false;
   }
   case tok::minus: {
@@ -363,6 +419,7 @@
     PP.LexNonComment(PeekTok);
     if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
     Result.setBegin(Loc);
+    Result.setIdentifier(nullptr);
 
     // C99 6.5.3.3p3: The sign of the result matches the sign of the operand.
     Result.Val = -Result.Val;
@@ -383,6 +440,7 @@
     PP.LexNonComment(PeekTok);
     if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
     Result.setBegin(Start);
+    Result.setIdentifier(nullptr);
 
     // C99 6.5.3.3p4: The sign of the result matches the sign of the operand.
     Result.Val = ~Result.Val;
@@ -398,6 +456,7 @@
     Result.Val = !Result.Val;
     // C99 6.5.3.3p5: The sign of the result is 'int', aka it is signed.
     Result.Val.setIsUnsigned(false);
+    Result.setIdentifier(nullptr);
 
     if (DT.State == DefinedTracker::DefinedMacro)
       DT.State = DefinedTracker::NotDefinedMacro;
@@ -446,6 +505,15 @@
   }
 }
 
+static void diagnoseUnexpectedOperator(Preprocessor &PP, PPValue &LHS,
+                                       Token &Tok) { // Reports invalid Tok.
+  if (Tok.is(tok::l_paren) && LHS.getIdentifier()) // '(' after identifier.
+    PP.Diag(LHS.getRange().getBegin(), diag::err_pp_expr_bad_token_lparen)
+        << LHS.getIdentifier();
+  else // Any other token: report at Tok, passing the operand's range.
+    PP.Diag(Tok.getLocation(), diag::err_pp_expr_bad_token_binop)
+        << LHS.getRange();
+}
 
 /// EvaluateDirectiveSubExpr - Evaluate the subexpression whose first token is
 /// PeekTok, and whose precedence is PeekPrec.  This returns the result in LHS.
@@ -459,8 +527,7 @@
   unsigned PeekPrec = getPrecedence(PeekTok.getKind());
   // If this token isn't valid, report the error.
   if (PeekPrec == ~0U) {
-    PP.Diag(PeekTok.getLocation(), diag::err_pp_expr_bad_token_binop)
-      << LHS.getRange();
+    diagnoseUnexpectedOperator(PP, LHS, PeekTok);
     return true;
   }
 
@@ -503,8 +570,7 @@
 
     // If this token isn't valid, report the error.
     if (PeekPrec == ~0U) {
-      PP.Diag(PeekTok.getLocation(), diag::err_pp_expr_bad_token_binop)
-        << RHS.getRange();
+      diagnoseUnexpectedOperator(PP, RHS, PeekTok);
       return true;
     }
 
@@ -605,8 +671,10 @@
     case tok::greatergreater: {
       // Determine whether overflow is about to happen.
       unsigned ShAmt = static_cast<unsigned>(RHS.Val.getLimitedValue());
-      if (ShAmt >= LHS.getBitWidth())
-        Overflow = true, ShAmt = LHS.getBitWidth()-1;
+      if (ShAmt >= LHS.getBitWidth()) {
+        Overflow = true;
+        ShAmt = LHS.getBitWidth()-1;
+      }
       Res = LHS.Val >> ShAmt;
       break;
     }
@@ -722,6 +790,7 @@
     // Put the result back into 'LHS' for our next iteration.
     LHS.Val = Res;
     LHS.setEnd(RHS.getRange().getEnd());
+    RHS.setIdentifier(nullptr);
   }
 }
 
diff --git a/lib/Lex/PPLexerChange.cpp b/lib/Lex/PPLexerChange.cpp
index 545388a..e2eceaf 100644
--- a/lib/Lex/PPLexerChange.cpp
+++ b/lib/Lex/PPLexerChange.cpp
@@ -685,7 +685,7 @@
     return true;
   // Otherwise, we only need module macros if we're actually compiling a module
   // interface.
-  return !getLangOpts().CurrentModule.empty();
+  return getLangOpts().CompilingModule;
 }
 
 void Preprocessor::LeaveSubmodule() {
diff --git a/lib/Lex/PPMacroExpansion.cpp b/lib/Lex/PPMacroExpansion.cpp
index f1e230c..5029935 100644
--- a/lib/Lex/PPMacroExpansion.cpp
+++ b/lib/Lex/PPMacroExpansion.cpp
@@ -12,16 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Lex/Preprocessor.h"
 #include "clang/Basic/Attributes.h"
-#include "clang/Basic/FileManager.h"
-#include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/CodeCompletionHandler.h"
+#include "clang/Lex/DirectoryLookup.h"
 #include "clang/Lex/ExternalPreprocessorSource.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/MacroArgs.h"
 #include "clang/Lex/MacroInfo.h"
+#include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -330,18 +329,11 @@
   Ident__is_identifier    = RegisterBuiltinMacro(*this, "__is_identifier");
 
   // Modules.
-  if (LangOpts.Modules) {
-    Ident__building_module  = RegisterBuiltinMacro(*this, "__building_module");
-
-    // __MODULE__
-    if (!LangOpts.CurrentModule.empty())
-      Ident__MODULE__ = RegisterBuiltinMacro(*this, "__MODULE__");
-    else
-      Ident__MODULE__ = nullptr;
-  } else {
-    Ident__building_module = nullptr;
+  Ident__building_module  = RegisterBuiltinMacro(*this, "__building_module");
+  if (!LangOpts.CurrentModule.empty())
+    Ident__MODULE__ = RegisterBuiltinMacro(*this, "__MODULE__");
+  else
     Ident__MODULE__ = nullptr;
-  }
 }
 
 /// isTrivialSingleTokenExpansion - Return true if MI, which has a single token
@@ -730,6 +722,7 @@
   // heap allocations in the common case.
   SmallVector<Token, 64> ArgTokens;
   bool ContainsCodeCompletionTok = false;
+  bool FoundElidedComma = false;
 
   SourceLocation TooManyArgsLoc;
 
@@ -761,17 +754,20 @@
           // Do not lose the EOF/EOD.  Return it to the client.
           MacroName = Tok;
           return nullptr;
-        } else {
-          // Do not lose the EOF/EOD.
-          Token *Toks = new Token[1];
-          Toks[0] = Tok;
-          EnterTokenStream(Toks, 1, true, true);
-          break;
         }
+        // Do not lose the EOF/EOD.
+        auto Toks = llvm::make_unique<Token[]>(1);
+        Toks[0] = Tok;
+        EnterTokenStream(std::move(Toks), 1, true);
+        break;
       } else if (Tok.is(tok::r_paren)) {
         // If we found the ) token, the macro arg list is done.
         if (NumParens-- == 0) {
           MacroEnd = Tok.getLocation();
+          if (!ArgTokens.empty() &&
+              ArgTokens.back().commaAfterElided()) {
+            FoundElidedComma = true;
+          }
           break;
         }
       } else if (Tok.is(tok::l_paren)) {
@@ -916,7 +912,7 @@
       // then we have an empty "()" argument empty list.  This is fine, even if
       // the macro expects one argument (the argument is just empty).
       isVarargsElided = MI->isVariadic();
-    } else if (MI->isVariadic() &&
+    } else if ((FoundElidedComma || MI->isVariadic()) &&
                (NumActuals+1 == MinArgsExpected ||  // A(x, ...) -> A(X)
                 (NumActuals == 0 && MinArgsExpected == 2))) {// A(x,...) -> A()
       // Varargs where the named vararg parameter is missing: OK as extension.
@@ -1049,9 +1045,8 @@
 
 /// HasFeature - Return true if we recognize and implement the feature
 /// specified by the identifier as a standard language feature.
-static bool HasFeature(const Preprocessor &PP, const IdentifierInfo *II) {
+static bool HasFeature(const Preprocessor &PP, StringRef Feature) {
   const LangOptions &LangOpts = PP.getLangOpts();
-  StringRef Feature = II->getName();
 
   // Normalize the feature name, __foo__ becomes foo.
   if (Feature.startswith("__") && Feature.endswith("__") && Feature.size() >= 4)
@@ -1099,6 +1094,8 @@
       .Case("memory_sanitizer", LangOpts.Sanitize.has(SanitizerKind::Memory))
       .Case("thread_sanitizer", LangOpts.Sanitize.has(SanitizerKind::Thread))
       .Case("dataflow_sanitizer", LangOpts.Sanitize.has(SanitizerKind::DataFlow))
+      .Case("efficiency_sanitizer",
+            LangOpts.Sanitize.hasOneOf(SanitizerKind::Efficiency))
       // Objective-C features
       .Case("objc_arr", LangOpts.ObjCAutoRefCount) // FIXME: REMOVE?
       .Case("objc_arc", LangOpts.ObjCAutoRefCount)
@@ -1191,6 +1188,8 @@
       // FIXME: Should this be __has_feature or __has_extension?
       //.Case("raw_invocation_type", LangOpts.CPlusPlus)
       // Type traits
+      // N.B. Additional type traits should not be added to the following list.
+      // Instead, they should be detected by has_extension.
       .Case("has_nothrow_assign", LangOpts.CPlusPlus)
       .Case("has_nothrow_copy", LangOpts.CPlusPlus)
       .Case("has_nothrow_constructor", LangOpts.CPlusPlus)
@@ -1211,7 +1210,7 @@
       .Case("is_standard_layout", LangOpts.CPlusPlus)
       .Case("is_pod", LangOpts.CPlusPlus)
       .Case("is_polymorphic", LangOpts.CPlusPlus)
-      .Case("is_sealed", LangOpts.MicrosoftExt)
+      .Case("is_sealed", LangOpts.CPlusPlus && LangOpts.MicrosoftExt)
       .Case("is_trivial", LangOpts.CPlusPlus)
       .Case("is_trivially_assignable", LangOpts.CPlusPlus)
       .Case("is_trivially_constructible", LangOpts.CPlusPlus)
@@ -1227,8 +1226,8 @@
 /// HasExtension - Return true if we recognize and implement the feature
 /// specified by the identifier, either as an extension or a standard language
 /// feature.
-static bool HasExtension(const Preprocessor &PP, const IdentifierInfo *II) {
-  if (HasFeature(PP, II))
+static bool HasExtension(const Preprocessor &PP, StringRef Extension) {
+  if (HasFeature(PP, Extension))
     return true;
 
   // If the use of an extension results in an error diagnostic, extensions are
@@ -1238,7 +1237,6 @@
     return false;
 
   const LangOptions &LangOpts = PP.getLangOpts();
-  StringRef Extension = II->getName();
 
   // Normalize the extension name, __foo__ becomes foo.
   if (Extension.startswith("__") && Extension.endswith("__") &&
@@ -1422,47 +1420,120 @@
   return EvaluateHasIncludeCommon(Tok, II, PP, Lookup, LookupFromFile);
 }
 
-/// \brief Process __building_module(identifier) expression.
-/// \returns true if we are building the named module, false otherwise.
-static bool EvaluateBuildingModule(Token &Tok,
-                                   IdentifierInfo *II, Preprocessor &PP) {
-  // Get '('.
-  PP.LexNonComment(Tok);
-
-  // Ensure we have a '('.
+/// \brief Process single-argument builtin feature-like macros that return
+/// integer values.
+static void EvaluateFeatureLikeBuiltinMacro(llvm::raw_svector_ostream& OS,
+                                            Token &Tok, IdentifierInfo *II,
+                                            Preprocessor &PP,
+                                            llvm::function_ref<
+                                              int(Token &Tok,
+                                                  bool &HasLexedNextTok)> Op) {
+  // Parse the initial '('.
+  PP.LexUnexpandedToken(Tok);
   if (Tok.isNot(tok::l_paren)) {
     PP.Diag(Tok.getLocation(), diag::err_pp_expected_after) << II
                                                             << tok::l_paren;
-    return false;
+
+    // Provide a dummy '0' value on output stream to elide further errors.
+    if (!Tok.isOneOf(tok::eof, tok::eod)) {
+      OS << 0;
+      Tok.setKind(tok::numeric_constant);
+    }
+    return;
   }
 
-  // Save '(' location for possible missing ')' message.
+  unsigned ParenDepth = 1;
   SourceLocation LParenLoc = Tok.getLocation();
+  llvm::Optional<int> Result;
 
-  // Get the module name.
-  PP.LexNonComment(Tok);
+  Token ResultTok;
+  bool SuppressDiagnostic = false;
+  while (true) {
+    // Parse next token.
+    PP.LexUnexpandedToken(Tok);
 
-  // Ensure that we have an identifier.
-  if (Tok.isNot(tok::identifier)) {
-    PP.Diag(Tok.getLocation(), diag::err_expected_id_building_module);
-    return false;
+already_lexed:
+    switch (Tok.getKind()) {
+      case tok::eof:
+      case tok::eod:
+        // Don't provide even a dummy value if the eod or eof marker is
+        // reached.  Simply provide a diagnostic.
+        PP.Diag(Tok.getLocation(), diag::err_unterm_macro_invoc);
+        return;
+
+      case tok::comma:
+        if (!SuppressDiagnostic) {
+          PP.Diag(Tok.getLocation(), diag::err_too_many_args_in_macro_invoc);
+          SuppressDiagnostic = true;
+        }
+        continue;
+
+      case tok::l_paren:
+        ++ParenDepth;
+        if (Result.hasValue())
+          break;
+        if (!SuppressDiagnostic) {
+          PP.Diag(Tok.getLocation(), diag::err_pp_nested_paren) << II;
+          SuppressDiagnostic = true;
+        }
+        continue;
+
+      case tok::r_paren:
+        if (--ParenDepth > 0)
+          continue;
+
+        // The last ')' has been reached; emit the value if one was found,
+        // otherwise a dummy '0' and, unless already diagnosed, a diagnostic.
+        if (Result.hasValue())
+          OS << Result.getValue();
+        else {
+          OS << 0;
+          if (!SuppressDiagnostic)
+            PP.Diag(Tok.getLocation(), diag::err_too_few_args_in_macro_invoc);
+        }
+        Tok.setKind(tok::numeric_constant);
+        return;
+
+      default: {
+        // Parse the macro argument, if one has not been found so far.
+        if (Result.hasValue())
+          break;
+
+        bool HasLexedNextToken = false;
+        Result = Op(Tok, HasLexedNextToken);
+        ResultTok = Tok;
+        if (HasLexedNextToken)
+          goto already_lexed;
+        continue;
+      }
+    }
+
+    // Diagnose missing ')'.
+    if (!SuppressDiagnostic) {
+      if (auto Diag = PP.Diag(Tok.getLocation(), diag::err_pp_expected_after)) {
+        if (IdentifierInfo *LastII = ResultTok.getIdentifierInfo())
+          Diag << LastII;
+        else
+          Diag << ResultTok.getKind();
+        Diag << tok::r_paren << ResultTok.getLocation();
+      }
+      PP.Diag(LParenLoc, diag::note_matching) << tok::l_paren;
+      SuppressDiagnostic = true;
+    }
   }
+}
 
-  bool Result
-    = Tok.getIdentifierInfo()->getName() == PP.getLangOpts().CurrentModule;
+/// \brief Helper function to return the IdentifierInfo structure of a Token
+/// or generate a diagnostic if none is available.
+static IdentifierInfo *ExpectFeatureIdentifierInfo(Token &Tok,
+                                                   Preprocessor &PP,
+                                                   signed DiagID) {
+  IdentifierInfo *II;
+  if (!Tok.isAnnotation() && (II = Tok.getIdentifierInfo()))
+    return II;
 
-  // Get ')'.
-  PP.LexNonComment(Tok);
-
-  // Ensure we have a trailing ).
-  if (Tok.isNot(tok::r_paren)) {
-    PP.Diag(Tok.getLocation(), diag::err_pp_expected_after) << II
-                                                            << tok::r_paren;
-    PP.Diag(LParenLoc, diag::note_matching) << tok::l_paren;
-    return false;
-  }
-
-  return Result;
+  PP.Diag(Tok.getLocation(), DiagID);
+  return nullptr;
 }
 
 /// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
@@ -1598,84 +1669,82 @@
     // __COUNTER__ expands to a simple numeric value.
     OS << CounterValue++;
     Tok.setKind(tok::numeric_constant);
-  } else if (II == Ident__has_feature   ||
-             II == Ident__has_extension ||
-             II == Ident__has_builtin   ||
-             II == Ident__is_identifier ||
-             II == Ident__has_attribute ||
-             II == Ident__has_declspec  ||
-             II == Ident__has_cpp_attribute) {
-    // The argument to these builtins should be a parenthesized identifier.
-    SourceLocation StartLoc = Tok.getLocation();
-
-    bool IsValid = false;
-    IdentifierInfo *FeatureII = nullptr;
-    IdentifierInfo *ScopeII = nullptr;
-
-    // Read the '('.
-    LexUnexpandedToken(Tok);
-    if (Tok.is(tok::l_paren)) {
-      // Read the identifier
-      LexUnexpandedToken(Tok);
-      if ((FeatureII = Tok.getIdentifierInfo())) {
-        // If we're checking __has_cpp_attribute, it is possible to receive a
-        // scope token. Read the "::", if it's available.
-        LexUnexpandedToken(Tok);
-        bool IsScopeValid = true;
-        if (II == Ident__has_cpp_attribute && Tok.is(tok::coloncolon)) {
-          LexUnexpandedToken(Tok);
-          // The first thing we read was not the feature, it was the scope.
-          ScopeII = FeatureII;
-          if ((FeatureII = Tok.getIdentifierInfo()))
-            LexUnexpandedToken(Tok);
-          else
-            IsScopeValid = false;          
+  } else if (II == Ident__has_feature) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        return II && HasFeature(*this, II->getName());
+      });
+  } else if (II == Ident__has_extension) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        return II && HasExtension(*this, II->getName());
+      });
+  } else if (II == Ident__has_builtin) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        if (!II)
+          return false;
+        else if (II->getBuiltinID() != 0)
+          return true;
+        else {
+          const LangOptions &LangOpts = getLangOpts();
+          return llvm::StringSwitch<bool>(II->getName())
+                      .Case("__make_integer_seq", LangOpts.CPlusPlus)
+                      .Case("__type_pack_element", LangOpts.CPlusPlus)
+                      .Default(false);
         }
-        // Read the closing paren.
-        if (IsScopeValid && Tok.is(tok::r_paren))
-          IsValid = true;
-      }
-      // Eat tokens until ')'.
-      while (Tok.isNot(tok::r_paren) && Tok.isNot(tok::eod) &&
-             Tok.isNot(tok::eof))
+      });
+  } else if (II == Ident__is_identifier) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [](Token &Tok, bool &HasLexedNextToken) -> int {
+        return Tok.is(tok::identifier);
+      });
+  } else if (II == Ident__has_attribute) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        return II ? hasAttribute(AttrSyntax::GNU, nullptr, II,
+                                 getTargetInfo(), getLangOpts()) : 0;
+      });
+  } else if (II == Ident__has_declspec) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        return II ? hasAttribute(AttrSyntax::Declspec, nullptr, II,
+                                 getTargetInfo(), getLangOpts()) : 0;
+      });
+  } else if (II == Ident__has_cpp_attribute) {
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *ScopeII = nullptr;
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        if (!II)
+          return false;
+
+        // It is possible to receive a scope token.  Read the "::", if it is
+        // available, and the subsequent identifier.
         LexUnexpandedToken(Tok);
-    }
+        if (Tok.isNot(tok::coloncolon))
+          HasLexedNextToken = true;
+        else {
+          ScopeII = II;
+          LexUnexpandedToken(Tok);
+          II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                           diag::err_feature_check_malformed);
+        }
 
-    int Value = 0;
-    if (!IsValid)
-      Diag(StartLoc, diag::err_feature_check_malformed);
-    else if (II == Ident__is_identifier)
-      Value = FeatureII->getTokenID() == tok::identifier;
-    else if (II == Ident__has_builtin) {
-      // Check for a builtin is trivial.
-      if (FeatureII->getBuiltinID() != 0) {
-        Value = true;
-      } else {
-        StringRef Feature = FeatureII->getName();
-        Value = llvm::StringSwitch<bool>(Feature)
-                    .Case("__make_integer_seq", getLangOpts().CPlusPlus)
-                    .Default(false);
-      }
-    } else if (II == Ident__has_attribute)
-      Value = hasAttribute(AttrSyntax::GNU, nullptr, FeatureII,
-                           getTargetInfo(), getLangOpts());
-    else if (II == Ident__has_cpp_attribute)
-      Value = hasAttribute(AttrSyntax::CXX, ScopeII, FeatureII,
-                           getTargetInfo(), getLangOpts());
-    else if (II == Ident__has_declspec)
-      Value = hasAttribute(AttrSyntax::Declspec, nullptr, FeatureII,
-                           getTargetInfo(), getLangOpts());
-    else if (II == Ident__has_extension)
-      Value = HasExtension(*this, FeatureII);
-    else {
-      assert(II == Ident__has_feature && "Must be feature check");
-      Value = HasFeature(*this, FeatureII);
-    }
-
-    if (!IsValid)
-      return;
-    OS << Value;
-    Tok.setKind(tok::numeric_constant);
+        return II ? hasAttribute(AttrSyntax::CXX, ScopeII, II,
+                                 getTargetInfo(), getLangOpts()) : 0;
+      });
   } else if (II == Ident__has_include ||
              II == Ident__has_include_next) {
     // The argument to these two builtins should be a parenthesized
@@ -1693,64 +1762,44 @@
     Tok.setKind(tok::numeric_constant);
   } else if (II == Ident__has_warning) {
     // The argument should be a parenthesized string literal.
-    // The argument to these builtins should be a parenthesized identifier.
-    SourceLocation StartLoc = Tok.getLocation();    
-    bool IsValid = false;
-    bool Value = false;
-    // Read the '('.
-    LexUnexpandedToken(Tok);
-    do {
-      if (Tok.isNot(tok::l_paren)) {
-        Diag(StartLoc, diag::err_warning_check_malformed);
-        break;
-      }
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        std::string WarningName;
+        SourceLocation StrStartLoc = Tok.getLocation();
 
-      LexUnexpandedToken(Tok);
-      std::string WarningName;
-      SourceLocation StrStartLoc = Tok.getLocation();
-      if (!FinishLexStringLiteral(Tok, WarningName, "'__has_warning'",
-                                  /*MacroExpansion=*/false)) {
-        // Eat tokens until ')'.
-        while (Tok.isNot(tok::r_paren) && Tok.isNot(tok::eod) &&
-               Tok.isNot(tok::eof))
-          LexUnexpandedToken(Tok);
-        break;
-      }
+        HasLexedNextToken = Tok.is(tok::string_literal);
+        if (!FinishLexStringLiteral(Tok, WarningName, "'__has_warning'",
+                                    /*MacroExpansion=*/false))
+          return false;
 
-      // Is the end a ')'?
-      if (!(IsValid = Tok.is(tok::r_paren))) {
-        Diag(StartLoc, diag::err_warning_check_malformed);
-        break;
-      }
+        // FIXME: Should we accept "-R..." flags here, or should that be
+        // handled by a separate __has_remark?
+        if (WarningName.size() < 3 || WarningName[0] != '-' ||
+            WarningName[1] != 'W') {
+          Diag(StrStartLoc, diag::warn_has_warning_invalid_option);
+          return false;
+        }
 
-      // FIXME: Should we accept "-R..." flags here, or should that be handled
-      // by a separate __has_remark?
-      if (WarningName.size() < 3 || WarningName[0] != '-' ||
-          WarningName[1] != 'W') {
-        Diag(StrStartLoc, diag::warn_has_warning_invalid_option);
-        break;
-      }
-
-      // Finally, check if the warning flags maps to a diagnostic group.
-      // We construct a SmallVector here to talk to getDiagnosticIDs().
-      // Although we don't use the result, this isn't a hot path, and not
-      // worth special casing.
-      SmallVector<diag::kind, 10> Diags;
-      Value = !getDiagnostics().getDiagnosticIDs()->
-        getDiagnosticsInGroup(diag::Flavor::WarningOrError,
-                              WarningName.substr(2), Diags);
-    } while (false);
-
-    if (!IsValid)
-      return;
-    OS << (int)Value;
-    Tok.setKind(tok::numeric_constant);
+        // Finally, check if the warning flag maps to a diagnostic group.
+        // We construct a SmallVector here to talk to getDiagnosticIDs().
+        // Although we don't use the result, this isn't a hot path, and not
+        // worth special casing.
+        SmallVector<diag::kind, 10> Diags;
+        return !getDiagnostics().getDiagnosticIDs()->
+                getDiagnosticsInGroup(diag::Flavor::WarningOrError,
+                                      WarningName.substr(2), Diags);
+      });
   } else if (II == Ident__building_module) {
     // The argument to this builtin should be an identifier. The
     // builtin evaluates to 1 when that identifier names the module we are
     // currently building.
-    OS << (int)EvaluateBuildingModule(Tok, II, *this);
-    Tok.setKind(tok::numeric_constant);
+    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
+      [this](Token &Tok, bool &HasLexedNextToken) -> int {
+        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
+                                       diag::err_expected_id_building_module);
+        return getLangOpts().CompilingModule && II &&
+               (II->getName() == getLangOpts().CurrentModule);
+      });
   } else if (II == Ident__MODULE__) {
     // The current module as an identifier.
     OS << getLangOpts().CurrentModule;
diff --git a/lib/Lex/PTHLexer.cpp b/lib/Lex/PTHLexer.cpp
index 5f63d35..dc678cf 100644
--- a/lib/Lex/PTHLexer.cpp
+++ b/lib/Lex/PTHLexer.cpp
@@ -21,7 +21,6 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/Token.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <memory>
diff --git a/lib/Lex/Pragma.cpp b/lib/Lex/Pragma.cpp
index afb41a2..1ce4b43 100644
--- a/lib/Lex/Pragma.cpp
+++ b/lib/Lex/Pragma.cpp
@@ -28,8 +28,6 @@
 #include <algorithm>
 using namespace clang;
 
-#include "llvm/Support/raw_ostream.h"
-
 // Out-of-line destructor to provide a home for the class.
 PragmaHandler::~PragmaHandler() {
 }
@@ -354,7 +352,9 @@
 /// HandlePragmaOnce - Handle \#pragma once.  OnceTok is the 'once'.
 ///
 void Preprocessor::HandlePragmaOnce(Token &OnceTok) {
-  if (isInPrimaryFile()) {
+  // Don't honor the 'once' when handling the primary source file, unless
+  // this is a prefix to a TU, which indicates we're generating a PCH file.
+  if (isInPrimaryFile() && TUKind != TU_Prefix) {
     Diag(OnceTok, diag::pp_pragma_once_in_main_file);
     return;
   }
@@ -938,13 +938,13 @@
     }
 
     SourceLocation NameLoc = Tok.getLocation();
-    Token *Toks = PP.getPreprocessorAllocator().Allocate<Token>(1);
-    Toks->startToken();
-    Toks->setKind(tok::annot_pragma_captured);
-    Toks->setLocation(NameLoc);
+    MutableArrayRef<Token> Toks(
+        PP.getPreprocessorAllocator().Allocate<Token>(1), 1);
+    Toks[0].startToken();
+    Toks[0].setKind(tok::annot_pragma_captured);
+    Toks[0].setLocation(NameLoc);
 
-    PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                        /*OwnsTokens=*/false);
+    PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
   }
 
 // Disable MSVC warning about runtime stack overflow.
@@ -1024,10 +1024,19 @@
       return;
     }
 
-    if (PP.getDiagnostics().setSeverityForGroup(
-            WarningName[1] == 'W' ? diag::Flavor::WarningOrError
-                                  : diag::Flavor::Remark,
-            WarningName.substr(2), SV, DiagLoc))
+    diag::Flavor Flavor = WarningName[1] == 'W' ? diag::Flavor::WarningOrError
+                                                : diag::Flavor::Remark;
+    StringRef Group = StringRef(WarningName).substr(2);
+    bool unknownDiag = false;
+    if (Group == "everything") {
+      // Special handling for pragma clang diagnostic ... "-Weverything".
+      // There is no formal group named "everything", so there has to be a
+      // special case for it.
+      PP.getDiagnostics().setSeverityForAll(Flavor, SV, DiagLoc);
+    } else
+      unknownDiag = PP.getDiagnostics().setSeverityForGroup(Flavor, Group, SV,
+                                                            DiagLoc);
+    if (unknownDiag)
       PP.Diag(StringLoc, diag::warn_pragma_diagnostic_unknown_warning)
         << WarningName;
     else if (Callbacks)
@@ -1481,6 +1490,13 @@
     AddPragmaHandler(new PragmaRegionHandler("region"));
     AddPragmaHandler(new PragmaRegionHandler("endregion"));
   }
+
+  // Pragmas added by plugins
+  for (PragmaHandlerRegistry::iterator it = PragmaHandlerRegistry::begin(),
+                                       ie = PragmaHandlerRegistry::end();
+       it != ie; ++it) {
+    AddPragmaHandler(it->instantiate().release());
+  }
 }
 
 /// Ignore all pragmas, useful for modes such as -Eonly which would otherwise
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index 142d9ce..f0d6872 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -43,16 +43,19 @@
 #include "clang/Lex/PreprocessingRecord.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Lex/ScratchBuffer.h"
-#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include <utility>
 using namespace clang;
 
+LLVM_INSTANTIATE_REGISTRY(PragmaHandlerRegistry)
+
 //===----------------------------------------------------------------------===//
 ExternalPreprocessorSource::~ExternalPreprocessorSource() { }
 
@@ -62,7 +65,7 @@
                            ModuleLoader &TheModuleLoader,
                            IdentifierInfoLookup *IILookup, bool OwnsHeaders,
                            TranslationUnitKind TUKind)
-    : PPOpts(PPOpts), Diags(&diags), LangOpts(opts), Target(nullptr),
+    : PPOpts(std::move(PPOpts)), Diags(&diags), LangOpts(opts), Target(nullptr),
       AuxTarget(nullptr), FileMgr(Headers.getFileMgr()), SourceMgr(SM),
       ScratchBuf(new ScratchBuffer(SourceMgr)), HeaderInfo(Headers),
       TheModuleLoader(TheModuleLoader), ExternalSource(nullptr),
@@ -71,7 +74,7 @@
       IncrementalProcessing(false), TUKind(TUKind), CodeComplete(nullptr),
       CodeCompletionFile(nullptr), CodeCompletionOffset(0),
       LastTokenWasAt(false), ModuleImportExpectsIdentifier(false),
-      CodeCompletionReached(0), MainFileDir(nullptr),
+      CodeCompletionReached(0), CodeCompletionII(0), MainFileDir(nullptr),
       SkipMainFilePreamble(0, true), CurPPLexer(nullptr), CurDirLookup(nullptr),
       CurLexerKind(CLK_Lexer), CurSubmodule(nullptr), Callbacks(nullptr),
       CurSubmoduleState(&NullSubmoduleState), MacroArgCache(nullptr),
@@ -477,7 +480,7 @@
 }
 
 Module *Preprocessor::getCurrentModule() {
-  if (getLangOpts().CurrentModule.empty())
+  if (!getLangOpts().CompilingModule)
     return nullptr;
 
   return getHeaderSearchInfo().lookupModule(getLangOpts().CurrentModule);
@@ -741,6 +744,9 @@
     }
   } while (!ReturnedToken);
 
+  if (Result.is(tok::code_completion))
+    setCodeCompletionIdentifierInfo(Result.getIdentifierInfo());
+
   LastTokenWasAt = Result.is(tok::at);
 }
 
diff --git a/lib/Lex/TokenLexer.cpp b/lib/Lex/TokenLexer.cpp
index e9155d6..994bae6 100644
--- a/lib/Lex/TokenLexer.cpp
+++ b/lib/Lex/TokenLexer.cpp
@@ -18,8 +18,8 @@
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/SmallString.h"
-using namespace clang;
 
+using namespace clang;
 
 /// Create a TokenLexer for the specified macro with the specified actual
 /// arguments.  Note that this ctor takes ownership of the ActualArgs pointer.
@@ -76,8 +76,6 @@
   Macro->DisableMacro();
 }
 
-
-
 /// Create a TokenLexer for the specified token stream.  This does not
 /// take ownership of the specified token vector.
 void TokenLexer::Init(const Token *TokArray, unsigned NumToks,
@@ -107,7 +105,6 @@
   }
 }
 
-
 void TokenLexer::destroy() {
   // If this was a function-like macro that actually uses its arguments, delete
   // the expanded tokens.
@@ -154,12 +151,17 @@
   // Remove the comma.
   ResultToks.pop_back();
 
-  // If the comma was right after another paste (e.g. "X##,##__VA_ARGS__"),
-  // then removal of the comma should produce a placemarker token (in C99
-  // terms) which we model by popping off the previous ##, giving us a plain
-  // "X" when __VA_ARGS__ is empty.
-  if (!ResultToks.empty() && ResultToks.back().is(tok::hashhash))
-    ResultToks.pop_back();
+  if (!ResultToks.empty()) {
+    // If the comma was right after another paste (e.g. "X##,##__VA_ARGS__"),
+    // then removal of the comma should produce a placemarker token (in C99
+    // terms) which we model by popping off the previous ##, giving us a plain
+    // "X" when __VA_ARGS__ is empty.
+    if (ResultToks.back().is(tok::hashhash))
+      ResultToks.pop_back();
+
+    // Remember that this comma was elided.
+    ResultToks.back().setFlag(Token::CommaAfterElided);
+  }
 
   // Never add a space, even if the comma, ##, or arg had a space.
   NextTokGetsSpace = false;
@@ -169,7 +171,6 @@
 /// Expand the arguments of a function-like macro so that we can quickly
 /// return preexpanded tokens from Tokens.
 void TokenLexer::ExpandFunctionArguments() {
-
   SmallVector<Token, 128> ResultToks;
 
   // Loop through 'Tokens', expanding them into ResultToks.  Keep
@@ -389,8 +390,6 @@
       MaybeRemoveCommaBeforeVaArgs(ResultToks,
                                    /*HasPasteOperator=*/true,
                                    Macro, ArgNo, PP);
-
-    continue;
   }
 
   // If anything changed, install this as the new Tokens list.
@@ -788,9 +787,6 @@
     if (CurLoc.isFileID() != NextLoc.isFileID())
       break; // Token from different kind of FileID.
 
-    if (CurLoc.isMacroID() && !SM.isWrittenInSameFile(CurLoc, NextLoc))
-      break; // Token from a different macro.
-
     int RelOffs;
     if (!SM.isInSameSLocAddrSpace(CurLoc, NextLoc, &RelOffs))
       break; // Token from different local/loaded location.
@@ -798,6 +794,10 @@
     // "characters" away.
     if (RelOffs < 0 || RelOffs > 50)
       break;
+
+    if (CurLoc.isMacroID() && !SM.isWrittenInSameFile(CurLoc, NextLoc))
+      break; // Token from a different macro.
+
     CurLoc = NextLoc;
   }
 
diff --git a/lib/Makefile b/lib/Makefile
deleted file mode 100755
index e627d5a..0000000
--- a/lib/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-##===- lib/Makefile ----------------------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-CLANG_LEVEL := ..
-
-# ARCMigrate and Rewrite are always needed because of libclang.
-PARALLEL_DIRS = Headers Basic APINotes Lex Parse AST Sema CodeGen Analysis \
-                Frontend FrontendTool Tooling Driver Format Edit Rewrite \
-                Serialization Index ASTMatchers
-
-include $(CLANG_LEVEL)/../../Makefile.config
-
-ifeq ($(ENABLE_CLANG_STATIC_ANALYZER),1)
-PARALLEL_DIRS += StaticAnalyzer
-endif
-
-ifeq ($(ENABLE_CLANG_ARCMT),1)
-PARALLEL_DIRS += ARCMigrate
-endif
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Parse/Makefile b/lib/Parse/Makefile
deleted file mode 100644
index 5ec7c33..0000000
--- a/lib/Parse/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/Parse/Makefile ----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-#  This implements the Parser library for the C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangParse
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Parse/ParseAST.cpp b/lib/Parse/ParseAST.cpp
index ccf9479..1fb57a0 100644
--- a/lib/Parse/ParseAST.cpp
+++ b/lib/Parse/ParseAST.cpp
@@ -19,7 +19,6 @@
 #include "clang/Parse/ParseDiagnostic.h"
 #include "clang/Parse/Parser.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
-#include "clang/Sema/ExternalSemaSource.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaConsumer.h"
 #include "llvm/Support/CrashRecoveryContext.h"
diff --git a/lib/Parse/ParseCXXInlineMethods.cpp b/lib/Parse/ParseCXXInlineMethods.cpp
index 2d94619..098e3ac 100644
--- a/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/lib/Parse/ParseCXXInlineMethods.cpp
@@ -53,7 +53,8 @@
     }
   }
 
-  HandleMemberFunctionDeclDelays(D, FnD);
+  if (FnD)
+    HandleMemberFunctionDeclDelays(D, FnD);
 
   D.complete(FnD);
 
@@ -101,6 +102,12 @@
     return FnD;
   }
 
+  if (SkipFunctionBodies && (!FnD || Actions.canSkipFunctionBody(FnD)) &&
+      trySkippingFunctionBody()) {
+    Actions.ActOnSkippedFunctionBody(FnD);
+    return FnD;
+  }
+
   // In delayed template parsing mode, if we are within a class template
   // or if we are about to parse function member template then consume
   // the tokens and store them for parsing at the end of the translation unit.
@@ -326,7 +333,7 @@
 
       // Parse the default argument from its saved token stream.
       Toks->push_back(Tok); // So that the current token doesn't get lost
-      PP.EnterTokenStream(&Toks->front(), Toks->size(), true, false);
+      PP.EnterTokenStream(*Toks, true);
 
       // Consume the previously-pushed token.
       ConsumeAnyToken();
@@ -381,7 +388,7 @@
       assert (!OldParam->hasUnparsedDefaultArg());
       if (OldParam->hasUninstantiatedDefaultArg())
         Param->setUninstantiatedDefaultArg(
-                                      Param->getUninstantiatedDefaultArg());
+            OldParam->getUninstantiatedDefaultArg());
       else
         Param->setDefaultArg(OldParam->getInit());
     }
@@ -400,7 +407,7 @@
 
     // Parse the default argument from its saved token stream.
     Toks->push_back(Tok); // So that the current token doesn't get lost
-    PP.EnterTokenStream(&Toks->front(), Toks->size(), true, false);
+    PP.EnterTokenStream(*Toks, true);
 
     // Consume the previously-pushed token.
     ConsumeAnyToken();
@@ -505,7 +512,7 @@
   // Append the current token at the end of the new token stream so that it
   // doesn't get lost.
   LM.Toks.push_back(Tok);
-  PP.EnterTokenStream(LM.Toks.data(), LM.Toks.size(), true, false);
+  PP.EnterTokenStream(LM.Toks, true);
 
   // Consume the previously pushed token.
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
@@ -564,8 +571,10 @@
   if (Tok.is(tok::eof) && Tok.getEofData() == LM.D)
     ConsumeAnyToken();
 
-  if (CXXMethodDecl *MD = dyn_cast_or_null<CXXMethodDecl>(LM.D))
-    Actions.ActOnFinishInlineMethodDef(MD);
+  if (auto *FD = dyn_cast_or_null<FunctionDecl>(LM.D))
+    if (isa<CXXMethodDecl>(FD) ||
+        FD->isInIdentifierNamespace(Decl::IDNS_OrdinaryFriend))
+      Actions.ActOnFinishInlineFunctionDef(FD);
 }
 
 /// ParseLexedMemberInitializers - We finished parsing the member specification
@@ -618,7 +627,7 @@
   // Append the current token at the end of the new token stream so that it
   // doesn't get lost.
   MI.Toks.push_back(Tok);
-  PP.EnterTokenStream(MI.Toks.data(), MI.Toks.size(), true, false);
+  PP.EnterTokenStream(MI.Toks, true);
 
   // Consume the previously pushed token.
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
@@ -972,10 +981,10 @@
     // Put back the original tokens.
     Self.SkipUntil(EndKind, StopAtSemi | StopBeforeMatch);
     if (Toks.size()) {
-      Token *Buffer = new Token[Toks.size()];
-      std::copy(Toks.begin() + 1, Toks.end(), Buffer);
+      auto Buffer = llvm::make_unique<Token[]>(Toks.size());
+      std::copy(Toks.begin() + 1, Toks.end(), Buffer.get());
       Buffer[Toks.size() - 1] = Self.Tok;
-      Self.PP.EnterTokenStream(Buffer, Toks.size(), true, /*Owned*/true);
+      Self.PP.EnterTokenStream(std::move(Buffer), Toks.size(), true);
 
       Self.Tok = Toks.front();
     }
diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp
index d41a1ac..bc510d0 100644
--- a/lib/Parse/ParseDecl.cpp
+++ b/lib/Parse/ParseDecl.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ScopedPrinter.h"
 
 using namespace clang;
 
@@ -645,7 +646,6 @@
     case tok::kw___ptr64:
     case tok::kw___w64:
     case tok::kw___ptr32:
-    case tok::kw___unaligned:
     case tok::kw___sptr:
     case tok::kw___uptr: {
       IdentifierInfo *AttrName = Tok.getIdentifierInfo();
@@ -706,7 +706,7 @@
   }
 }
 
-void Parser::ParseOpenCLAttributes(ParsedAttributes &attrs) {
+void Parser::ParseOpenCLKernelAttributes(ParsedAttributes &attrs) {
   // Treat these like attributes
   while (Tok.is(tok::kw___kernel)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
@@ -756,7 +756,7 @@
 ///   simple-integer ',' simple-integer
 ///   simple-integer ',' simple-integer ',' simple-integer
 VersionTuple Parser::ParseVersionTuple(SourceRange &Range) {
-  Range = Tok.getLocation();
+  Range = SourceRange(Tok.getLocation(), Tok.getEndLoc());
 
   if (!Tok.is(tok::numeric_constant)) {
     Diag(Tok, diag::err_expected_version);
@@ -884,7 +884,7 @@
 ///
 /// version-arg:
 ///   'introduced' '=' version
-///   'deprecated' '=' version
+///   'deprecated' ['=' version]
 ///   'obsoleted' = version
 ///   'unavailable'
 /// opt-replacement:
@@ -972,6 +972,21 @@
       continue;
     }
 
+    if (Keyword == Ident_deprecated && Platform->Ident &&
+        Platform->Ident->getName() == "swift") {
+      // For swift, we deprecate for all versions.
+      if (!Changes[Deprecated].KeywordLoc.isInvalid()) {
+        Diag(KeywordLoc, diag::err_availability_redundant)
+          << Keyword
+          << SourceRange(Changes[Deprecated].KeywordLoc);
+      }
+
+      Changes[Deprecated].KeywordLoc = KeywordLoc;
+      // Use a fake version here.
+      Changes[Deprecated].Version = VersionTuple(1);
+      continue;
+    }
+
     if (Tok.isNot(tok::equal)) {
       Diag(Tok, diag::err_expected_after) << Keyword << tok::equal;
       SkipUntil(tok::r_paren, StopAtSemi);
@@ -1254,7 +1269,7 @@
   // Append the current token at the end of the new token stream so that it
   // doesn't get lost.
   LA.Toks.push_back(Tok);
-  PP.EnterTokenStream(LA.Toks.data(), LA.Toks.size(), true, false);
+  PP.EnterTokenStream(LA.Toks, true);
   // Consume the previously pushed token.
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
 
@@ -1469,8 +1484,8 @@
   while (AL) {
     AttributeList *Next = AL->getNext();
 
-    // We only consider attributes using the appropriate '__declspec' spelling,
-    // this behavior doesn't extend to any other spellings.
+    // We only consider attributes using the appropriate '__declspec' spelling.
+    // This behavior doesn't extend to any other spellings.
     if (AL->getKind() == AttributeList::AT_Aligned &&
         AL->isDeclspecAttribute()) {
       // Stitch the attribute into the tag's attribute list.
@@ -1597,9 +1612,14 @@
     ProhibitAttributes(Attrs);
     DeclEnd = Tok.getLocation();
     if (RequireSemi) ConsumeToken();
+    RecordDecl *AnonRecord = nullptr;
     Decl *TheDecl = Actions.ParsedFreeStandingDeclSpec(getCurScope(), AS_none,
-                                                       DS);
+                                                       DS, AnonRecord);
     DS.complete(TheDecl);
+    if (AnonRecord) {
+      Decl* decls[] = {AnonRecord, TheDecl};
+      return Actions.BuildDeclaratorGroup(decls, /*TypeMayContainAuto=*/false);
+    }
     return Actions.ConvertDeclToDeclGroup(TheDecl);
   }
 
@@ -2042,7 +2062,7 @@
         TemplateParameterLists FakedParamLists;
         FakedParamLists.push_back(Actions.ActOnTemplateParameterList(
             0, SourceLocation(), TemplateInfo.TemplateLoc, LAngleLoc, None,
-            LAngleLoc));
+            LAngleLoc, nullptr));
 
         ThisDecl =
             Actions.ActOnTemplateDeclarator(getCurScope(), FakedParamLists, D);
@@ -2106,7 +2126,8 @@
       if (Init.isInvalid()) {
         SmallVector<tok::TokenKind, 2> StopTokens;
         StopTokens.push_back(tok::comma);
-        if (D.getContext() == Declarator::ForContext)
+        if (D.getContext() == Declarator::ForContext ||
+            D.getContext() == Declarator::InitStmtContext)
           StopTokens.push_back(tok::r_paren);
         SkipUntil(StopTokens, StopAtSemi | StopBeforeMatch);
         Actions.ActOnInitializerError(ThisDecl);
@@ -2317,6 +2338,24 @@
     return false;
   }
 
+  if (getLangOpts().CPlusPlus && (!SS || SS->isEmpty()) &&
+      getLangOpts().MSVCCompat) {
+    // Lookup of an unqualified type name has failed in MSVC compatibility mode.
+    // Give Sema a chance to recover if we are in a template with dependent base
+    // classes.
+    if (ParsedType T = Actions.ActOnMSVCUnknownTypeName(
+            *Tok.getIdentifierInfo(), Tok.getLocation(),
+            DSC == DSC_template_type_arg)) {
+      const char *PrevSpec;
+      unsigned DiagID;
+      DS.SetTypeSpecType(DeclSpec::TST_typename, Loc, PrevSpec, DiagID, T,
+                         Actions.getASTContext().getPrintingPolicy());
+      DS.SetRangeEnd(Tok.getLocation());
+      ConsumeToken();
+      return false;
+    }
+  }
+
   // Otherwise, if we don't consume this token, we are going to emit an
   // error anyway.  Try to recover from various common problems.  Check
   // to see if this was a reference to a tag name without a tag specified.
@@ -2699,7 +2738,7 @@
   bool AttrsLastTime = false;
   ParsedAttributesWithRange attrs(AttrFactory);
   // We use Sema's policy to get bool macros right.
-  const PrintingPolicy &Policy = Actions.getPrintingPolicy();
+  PrintingPolicy Policy = Actions.getPrintingPolicy();
   while (1) {
     bool isInvalid = false;
     bool isStorageClass = false;
@@ -3024,16 +3063,6 @@
         Actions.getTypeName(*Tok.getIdentifierInfo(),
                             Tok.getLocation(), getCurScope());
 
-      // MSVC: If we weren't able to parse a default template argument, and it's
-      // just a simple identifier, create a DependentNameType.  This will allow
-      // us to defer the name lookup to template instantiation time, as long we
-      // forge a NestedNameSpecifier for the current context.
-      if (!TypeRep && DSContext == DSC_template_type_arg &&
-          getLangOpts().MSVCCompat && getCurScope()->isTemplateParamScope()) {
-        TypeRep = Actions.ActOnDelayedDefaultTemplateArg(
-            *Tok.getIdentifierInfo(), Tok.getLocation());
-      }
-
       // If this is not a typedef name, don't parse it as part of the declspec,
       // it must be an implicit int or an error.
       if (!TypeRep) {
@@ -3125,6 +3154,11 @@
       break;
     }
 
+    case tok::kw___unaligned:
+      isInvalid = DS.SetTypeQual(DeclSpec::TQ_unaligned, Loc, PrevSpec, DiagID,
+                                 getLangOpts());
+      break;
+
     case tok::kw___sptr:
     case tok::kw___uptr:
     case tok::kw___ptr64:
@@ -3135,7 +3169,6 @@
     case tok::kw___fastcall:
     case tok::kw___thiscall:
     case tok::kw___vectorcall:
-    case tok::kw___unaligned:
       ParseMicrosoftTypeAttributes(DS.getAttributes());
       continue;
 
@@ -3146,7 +3179,7 @@
 
     // OpenCL single token adornments.
     case tok::kw___kernel:
-      ParseOpenCLAttributes(DS.getAttributes());
+      ParseOpenCLKernelAttributes(DS.getAttributes());
       continue;
 
     // Nullability type specifiers.
@@ -3344,6 +3377,10 @@
       isInvalid = DS.SetTypeSpecType(DeclSpec::TST_double, Loc, PrevSpec,
                                      DiagID, Policy);
       break;
+    case tok::kw___float128:
+      isInvalid = DS.SetTypeSpecType(DeclSpec::TST_float128, Loc, PrevSpec,
+                                     DiagID, Policy);
+      break;
     case tok::kw_wchar_t:
       isInvalid = DS.SetTypeSpecType(DeclSpec::TST_wchar, Loc, PrevSpec,
                                      DiagID, Policy);
@@ -3401,6 +3438,12 @@
       }
       isInvalid = DS.SetTypePipe(true, Loc, PrevSpec, DiagID, Policy);
       break;
+#define GENERIC_IMAGE_TYPE(ImgType, Id) \
+  case tok::kw_##ImgType##_t: \
+    isInvalid = DS.SetTypeSpecType(DeclSpec::TST_##ImgType##_t, Loc, PrevSpec, \
+                                   DiagID, Policy); \
+    break;
+#include "clang/Basic/OpenCLImageTypes.def"
     case tok::kw___unknown_anytype:
       isInvalid = DS.SetTypeSpecType(TST_unknown_anytype, Loc,
                                      PrevSpec, DiagID, Policy);
@@ -3469,6 +3512,22 @@
       ParseDecltypeSpecifier(DS);
       continue;
 
+    case tok::annot_pragma_pack:
+      HandlePragmaPack();
+      continue;
+
+    case tok::annot_pragma_ms_pragma:
+      HandlePragmaMSPragma();
+      continue;
+
+    case tok::annot_pragma_ms_vtordisp:
+      HandlePragmaMSVtorDisp();
+      continue;
+
+    case tok::annot_pragma_ms_pointers_to_members:
+      HandlePragmaMSPointersToMembers();
+      continue;
+
     case tok::kw___underlying_type:
       ParseUnderlyingTypeSpecifier(DS);
       continue;
@@ -3539,9 +3598,13 @@
       if (DiagID == diag::ext_duplicate_declspec)
         Diag(Tok, DiagID)
           << PrevSpec << FixItHint::CreateRemoval(Tok.getLocation());
-      else if (DiagID == diag::err_opencl_unknown_type_specifier)
-        Diag(Tok, DiagID) << PrevSpec << isStorageClass;
-      else
+      else if (DiagID == diag::err_opencl_unknown_type_specifier) {
+        const int OpenCLVer = getLangOpts().OpenCLVersion;
+        std::string VerSpec = llvm::to_string(OpenCLVer / 100) +
+                              std::string (".") +
+                              llvm::to_string((OpenCLVer % 100) / 10);
+        Diag(Tok, DiagID) << VerSpec << PrevSpec << isStorageClass;
+      } else
         Diag(Tok, DiagID) << PrevSpec;
     }
 
@@ -3587,8 +3650,10 @@
   // If there are no declarators, this is a free-standing declaration
   // specifier. Let the actions module cope with it.
   if (Tok.is(tok::semi)) {
+    RecordDecl *AnonRecord = nullptr;
     Decl *TheDecl = Actions.ParsedFreeStandingDeclSpec(getCurScope(), AS_none,
-                                                       DS);
+                                                       DS, AnonRecord);
+    assert(!AnonRecord && "Did not expect anonymous struct or union here");
     DS.complete(TheDecl);
     return;
   }
@@ -3691,12 +3756,12 @@
 
     if (Tok.is(tok::annot_pragma_openmp)) {
       // Result can be ignored, because it must be always empty.
-      auto Res = ParseOpenMPDeclarativeDirective();
-      assert(!Res);
-      // Silence possible warnings.
-      (void)Res;
+      AccessSpecifier AS = AS_none;
+      ParsedAttributesWithRange Attrs(AttrFactory);
+      (void)ParseOpenMPDeclarativeDirectiveWithExtDecl(AS, Attrs);
       continue;
     }
+
     if (!Tok.is(tok::at)) {
       auto CFieldCallback = [&](ParsingFieldDeclarator &FD) {
         // Install the declarator into the current TagDecl.
@@ -4210,7 +4275,7 @@
 
     if (Tok.is(tok::identifier)) {
       // We're missing a comma between enumerators.
-      SourceLocation Loc = PP.getLocForEndOfToken(PrevTokLocation);
+      SourceLocation Loc = getEndOfPreviousToken();
       Diag(Loc, diag::err_enumerator_list_missing_comma)
         << FixItHint::CreateInsertion(Loc, ", ");
       continue;
@@ -4283,27 +4348,6 @@
   }
 }
 
-/// isTypeSpecifierQualifier - Return true if the current token could be the
-/// start of a type-qualifier-list.
-bool Parser::isTypeQualifier() const {
-  switch (Tok.getKind()) {
-  default: return false;
-  // type-qualifier
-  case tok::kw_const:
-  case tok::kw_volatile:
-  case tok::kw_restrict:
-  case tok::kw___private:
-  case tok::kw___local:
-  case tok::kw___global:
-  case tok::kw___constant:
-  case tok::kw___generic:
-  case tok::kw___read_only:
-  case tok::kw___read_write:
-  case tok::kw___write_only:
-    return true;
-  }
-}
-
 /// isKnownToBeTypeSpecifier - Return true if we know that the specified token
 /// is definitely a type-specifier.  Return false if it isn't part of a type
 /// specifier or if we're not sure.
@@ -4328,12 +4372,15 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_bool:
   case tok::kw__Bool:
   case tok::kw__Decimal32:
   case tok::kw__Decimal64:
   case tok::kw__Decimal128:
   case tok::kw___vector:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case tok::kw_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
 
     // struct-or-union-specifier (C99) or class-specifier (C++)
   case tok::kw_class:
@@ -4400,12 +4447,15 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_bool:
   case tok::kw__Bool:
   case tok::kw__Decimal32:
   case tok::kw__Decimal64:
   case tok::kw__Decimal128:
   case tok::kw___vector:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case tok::kw_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
 
     // struct-or-union-specifier (C99) or class-specifier (C++)
   case tok::kw_class:
@@ -4552,6 +4602,7 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_bool:
   case tok::kw__Bool:
   case tok::kw__Decimal32:
@@ -4642,6 +4693,8 @@
   case tok::kw___read_only:
   case tok::kw___read_write:
   case tok::kw___write_only:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case tok::kw_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
 
     return true;
   }
@@ -4834,6 +4887,10 @@
       ParseOpenCLQualifiers(DS.getAttributes());
       break;
 
+    case tok::kw___unaligned:
+      isInvalid = DS.SetTypeQual(DeclSpec::TQ_unaligned, Loc, PrevSpec, DiagID,
+                                 getLangOpts());
+      break;
     case tok::kw___uptr:
       // GNU libc headers in C mode use '__uptr' as an identifer which conflicts
       // with the MS modifier keyword.
@@ -4851,7 +4908,6 @@
     case tok::kw___fastcall:
     case tok::kw___thiscall:
     case tok::kw___vectorcall:
-    case tok::kw___unaligned:
       if (AttrReqs & AR_DeclspecAttributesParsed) {
         ParseMicrosoftTypeAttributes(DS.getAttributes());
         continue;
@@ -4989,7 +5045,7 @@
   // Member pointers get special handling, since there's no place for the
   // scope spec in the generic path below.
   if (getLangOpts().CPlusPlus &&
-      (Tok.is(tok::coloncolon) ||
+      (Tok.is(tok::coloncolon) || Tok.is(tok::kw_decltype) ||
        (Tok.is(tok::identifier) &&
         (NextToken().is(tok::coloncolon) || NextToken().is(tok::less))) ||
        Tok.is(tok::annot_cxxscope))) {
@@ -5033,7 +5089,8 @@
   tok::TokenKind Kind = Tok.getKind();
 
   if (D.getDeclSpec().isTypeSpecPipe() && !isPipeDeclerator(D)) {
-    DeclSpec &DS = D.getMutableDeclSpec();
+    DeclSpec DS(AttrFactory);
+    ParseTypeQualifierListOpt(DS);
 
     D.AddTypeInfo(
         DeclaratorChunk::getPipe(DS.getTypeQualifiers(), DS.getPipeLoc()),
@@ -5073,7 +5130,8 @@
                                                 DS.getConstSpecLoc(),
                                                 DS.getVolatileSpecLoc(),
                                                 DS.getRestrictSpecLoc(),
-                                                DS.getAtomicSpecLoc()),
+                                                DS.getAtomicSpecLoc(),
+                                                DS.getUnalignedSpecLoc()),
                     DS.getAttributes(),
                     SourceLocation());
     else
@@ -5193,12 +5251,22 @@
 ///          '~' class-name
 ///         template-id
 ///
+/// C++17 adds the following, which we also handle here:
+///
+///       simple-declaration:
+///         <decl-spec> '[' identifier-list ']' brace-or-equal-initializer ';'
+///
 /// Note, any additional constructs added here may need corresponding changes
 /// in isConstructorDeclarator.
 void Parser::ParseDirectDeclarator(Declarator &D) {
   DeclaratorScopeObj DeclScopeObj(*this, D.getCXXScopeSpec());
 
   if (getLangOpts().CPlusPlus && D.mayHaveIdentifier()) {
+    // This might be a C++17 structured binding.
+    if (Tok.is(tok::l_square) && !D.mayOmitIdentifier() &&
+        D.getCXXScopeSpec().isEmpty())
+      return ParseDecompositionDeclarator(D);
+
     // Don't parse FOO:BAR as if it were a typo for FOO::BAR inside a class, in
     // this context it is a bitfield. Also in range-based for statement colon
     // may delimit for-range-declaration.
@@ -5428,6 +5496,70 @@
   }
 }
 
+void Parser::ParseDecompositionDeclarator(Declarator &D) {
+  assert(Tok.is(tok::l_square));
+
+  // Parses '[' identifier-list ']' into D's decomposition bindings. Input not
+  // shaped like '[' identifier (',' | ']') or '[' ']' ('=' | '{') is instead
+  // handed to ParseMisplacedBracketDeclarator (perhaps a misplaced array).
+  // FIXME: Consume the l_square first so we don't need this extra lookahead.
+  if (!(NextToken().is(tok::identifier) &&
+        GetLookAheadToken(2).isOneOf(tok::comma, tok::r_square)) &&
+      !(NextToken().is(tok::r_square) &&
+        GetLookAheadToken(2).isOneOf(tok::equal, tok::l_brace)))
+    return ParseMisplacedBracketDeclarator(D);
+
+  BalancedDelimiterTracker T(*this, tok::l_square);
+  T.consumeOpen();
+
+  SmallVector<DecompositionDeclarator::Binding, 32> Bindings;
+  while (Tok.isNot(tok::r_square)) {
+    if (!Bindings.empty()) {
+      if (Tok.is(tok::comma))
+        ConsumeToken();
+      else {
+        if (Tok.is(tok::identifier)) {
+          SourceLocation EndLoc = getEndOfPreviousToken();
+          Diag(EndLoc, diag::err_expected)
+              << tok::comma << FixItHint::CreateInsertion(EndLoc, ",");
+        } else {
+          Diag(Tok, diag::err_expected_comma_or_rsquare);
+        }
+
+        SkipUntil(tok::r_square, tok::comma, tok::identifier,
+                  StopAtSemi | StopBeforeMatch);
+        if (Tok.is(tok::comma))
+          ConsumeToken();
+        else if (Tok.isNot(tok::identifier))
+          break;
+      }
+    }
+
+    if (Tok.isNot(tok::identifier)) {
+      Diag(Tok, diag::err_expected) << tok::identifier;
+      break;
+    }
+
+    Bindings.push_back({Tok.getIdentifierInfo(), Tok.getLocation()});
+    ConsumeToken();
+  }
+
+  if (Tok.isNot(tok::r_square))
+    // The loop exited via 'break', so a problem was already diagnosed here.
+    T.skipToEnd();
+  else {
+    // C++17 does not allow the identifier-list in a structured binding to be
+    // empty; the (empty) bindings are still recorded after diagnosing this.
+    if (Bindings.empty())
+      Diag(Tok.getLocation(), diag::ext_decomp_decl_empty);
+
+    T.consumeClose();
+  }
+
+  return D.setDecompositionBindings(T.getOpenLocation(), Bindings,
+                                    T.getCloseLocation());
+}
+
 /// ParseParenDeclarator - We parsed the declarator D up to a paren.  This is
 /// only called before the identifier, so these are most likely just grouping
 /// parens for precedence.  If we find that these are actually function
@@ -6074,6 +6206,9 @@
                                             T.getCloseLocation()),
                   attrs, T.getCloseLocation());
     return;
+  } else if (Tok.getKind() == tok::code_completion) {
+    Actions.CodeCompleteBracketDeclarator(getCurScope());
+    return cutOffParsing();
   }
 
   // If valid, this location is the position where we read the 'static' keyword.
diff --git a/lib/Parse/ParseDeclCXX.cpp b/lib/Parse/ParseDeclCXX.cpp
index 5a90b6c..200a42e 100644
--- a/lib/Parse/ParseDeclCXX.cpp
+++ b/lib/Parse/ParseDeclCXX.cpp
@@ -1100,9 +1100,25 @@
   // FIXME: we should emit semantic diagnostic when declaration
   // attribute is in type attribute position.
   case tok::kw___attribute:     // struct foo __attribute__((used)) x;
+  case tok::annot_pragma_pack:  // struct foo {...} _Pragma(pack(pop));
+  // struct foo {...} _Pragma(section(...));
+  case tok::annot_pragma_ms_pragma:
+  // struct foo {...} _Pragma(vtordisp(pop));
+  case tok::annot_pragma_ms_vtordisp:
+  // struct foo {...} _Pragma(pointers_to_members(...));
+  case tok::annot_pragma_ms_pointers_to_members:
     return true;
   case tok::colon:
     return CouldBeBitfield;     // enum E { ... }   :         2;
+  // Microsoft compatibility
+  case tok::kw___cdecl:         // struct foo {...} __cdecl      x;
+  case tok::kw___fastcall:      // struct foo {...} __fastcall   x;
+  case tok::kw___stdcall:       // struct foo {...} __stdcall    x;
+  case tok::kw___thiscall:      // struct foo {...} __thiscall   x;
+  case tok::kw___vectorcall:    // struct foo {...} __vectorcall x;
+    // We will diagnose these calling-convention specifiers on non-function
+    // declarations later, so claim they are valid after a type specifier.
+    return getLangOpts().MicrosoftExt;
   // Type qualifiers
   case tok::kw_const:           // struct foo {...} const     x;
   case tok::kw_volatile:        // struct foo {...} volatile  x;
@@ -1260,6 +1276,7 @@
       Tok.isOneOf(tok::kw___is_abstract,
                   tok::kw___is_arithmetic,
                   tok::kw___is_array,
+                  tok::kw___is_assignable,
                   tok::kw___is_base_of,
                   tok::kw___is_class,
                   tok::kw___is_complete_type,
@@ -1394,7 +1411,7 @@
       // Strip off the last template parameter list if it was empty, since
       // we've removed its template argument list.
       if (TemplateParams && TemplateInfo.LastParameterListWasEmpty) {
-        if (TemplateParams && TemplateParams->size() > 1) {
+        if (TemplateParams->size() > 1) {
           TemplateParams->pop_back();
         } else {
           TemplateParams = nullptr;
@@ -1661,7 +1678,7 @@
           // template specialization.
           FakedParamLists.push_back(Actions.ActOnTemplateParameterList(
               0, SourceLocation(), TemplateInfo.TemplateLoc, LAngleLoc, None,
-              LAngleLoc));
+              LAngleLoc, nullptr));
           TemplateParams = &FakedParamLists;
         }
       }
@@ -1988,6 +2005,7 @@
 ///       virt-specifier:
 ///         override
 ///         final
+///         __final
 VirtSpecifiers::Specifier Parser::isCXX11VirtSpecifier(const Token &Tok) const {
   if (!getLangOpts().CPlusPlus || Tok.isNot(tok::identifier))
     return VirtSpecifiers::VS_None;
@@ -1997,6 +2015,8 @@
   // Initialize the contextual keywords.
   if (!Ident_final) {
     Ident_final = &PP.getIdentifierTable().get("final");
+    if (getLangOpts().GNUKeywords)
+      Ident_GNU_final = &PP.getIdentifierTable().get("__final");
     if (getLangOpts().MicrosoftExt)
       Ident_sealed = &PP.getIdentifierTable().get("sealed");
     Ident_override = &PP.getIdentifierTable().get("override");
@@ -2011,6 +2031,9 @@
   if (II == Ident_final)
     return VirtSpecifiers::VS_Final;
 
+  if (II == Ident_GNU_final)
+    return VirtSpecifiers::VS_GNU_Final;
+
   return VirtSpecifiers::VS_None;
 }
 
@@ -2050,6 +2073,8 @@
         << VirtSpecifiers::getSpecifierName(Specifier);
     } else if (Specifier == VirtSpecifiers::VS_Sealed) {
       Diag(Tok.getLocation(), diag::ext_ms_sealed_keyword);
+    } else if (Specifier == VirtSpecifiers::VS_GNU_Final) {
+      Diag(Tok.getLocation(), diag::ext_warn_gnu_final);
     } else {
       Diag(Tok.getLocation(),
            getLangOpts().CPlusPlus11
@@ -2066,6 +2091,7 @@
 bool Parser::isCXX11FinalKeyword() const {
   VirtSpecifiers::Specifier Specifier = isCXX11VirtSpecifier();
   return Specifier == VirtSpecifiers::VS_Final ||
+         Specifier == VirtSpecifiers::VS_GNU_Final || 
          Specifier == VirtSpecifiers::VS_Sealed;
 }
 
@@ -2400,10 +2426,15 @@
     if (DS.isFriendSpecified())
       ProhibitAttributes(FnAttrs);
 
-    Decl *TheDecl =
-      Actions.ParsedFreeStandingDeclSpec(getCurScope(), AS, DS, TemplateParams);
+    RecordDecl *AnonRecord = nullptr;
+    Decl *TheDecl = Actions.ParsedFreeStandingDeclSpec(
+        getCurScope(), AS, DS, TemplateParams, false, AnonRecord);
     DS.complete(TheDecl);
-    return DeclGroupPtrTy::make(DeclGroupRef(TheDecl));
+    if (AnonRecord) {
+      Decl* decls[] = {AnonRecord, TheDecl};
+      return Actions.BuildDeclaratorGroup(decls, /*TypeMayContainAuto=*/false);
+    }
+    return Actions.ConvertDeclToDeclGroup(TheDecl);
   }
 
   ParsingDeclarator DeclaratorInfo(*this, DS, Declarator::MemberContext);
@@ -2907,7 +2938,8 @@
   }
 
   if (Tok.is(tok::annot_pragma_openmp))
-    return ParseOpenMPDeclarativeDirective();
+    return ParseOpenMPDeclarativeDirectiveWithExtDecl(AS, AccessAttrs, TagType,
+                                                      TagDecl);
 
   // Parse all the comma separated declarators.
   return ParseCXXClassMemberDeclaration(AS, AccessAttrs.getList());
@@ -2975,6 +3007,7 @@
   if (getLangOpts().CPlusPlus && Tok.is(tok::identifier)) {
     VirtSpecifiers::Specifier Specifier = isCXX11VirtSpecifier(Tok);
     assert((Specifier == VirtSpecifiers::VS_Final ||
+            Specifier == VirtSpecifiers::VS_GNU_Final || 
             Specifier == VirtSpecifiers::VS_Sealed) &&
            "not a class definition");
     FinalLoc = ConsumeToken();
@@ -2990,6 +3023,8 @@
         << VirtSpecifiers::getSpecifierName(Specifier);
     else if (Specifier == VirtSpecifiers::VS_Sealed)
       Diag(FinalLoc, diag::ext_ms_sealed_keyword);
+    else if (Specifier == VirtSpecifiers::VS_GNU_Final)
+      Diag(FinalLoc, diag::ext_warn_gnu_final);
 
     // Parse any C++11 attributes after 'final' keyword.
     // These attributes are not allowed to appear here,
@@ -3398,10 +3433,11 @@
     NoexceptExpr = ParseConstantExpression();
     T.consumeClose();
     // The argument must be contextually convertible to bool. We use
-    // ActOnBooleanCondition for this purpose.
+    // CheckBooleanCondition for this purpose.
+    // FIXME: Add a proper Sema entry point for this.
     if (!NoexceptExpr.isInvalid()) {
-      NoexceptExpr = Actions.ActOnBooleanCondition(getCurScope(), KeywordLoc,
-                                                   NoexceptExpr.get());
+      NoexceptExpr =
+          Actions.CheckBooleanCondition(KeywordLoc, NoexceptExpr.get());
       NoexceptRange = SourceRange(KeywordLoc, T.getCloseLocation());
     } else {
       NoexceptType = EST_None;
@@ -3631,7 +3667,10 @@
   case AttributeList::AT_FallThrough:
   case AttributeList::AT_CXX11NoReturn:
     return true;
-
+  case AttributeList::AT_WarnUnusedResult:
+    return !ScopeName && AttrName->getName().equals("nodiscard");
+  case AttributeList::AT_Unused:
+    return !ScopeName && AttrName->getName().equals("maybe_unused");
   default:
     return false;
   }
@@ -3690,6 +3729,7 @@
         // The attribute was allowed to have arguments, but none were provided
         // even though the attribute parsed successfully. This is an error.
         Diag(LParenLoc, diag::err_attribute_requires_arguments) << AttrName;
+        Attr->setInvalid(true);
       } else if (!Attr->getMaxArgs()) {
         // The attribute parsed successfully, but was not allowed to have any
         // arguments. It doesn't matter whether any were provided -- the
@@ -3697,6 +3737,7 @@
         Diag(LParenLoc, diag::err_cxx11_attribute_forbids_arguments)
             << AttrName
             << FixItHint::CreateRemoval(SourceRange(LParenLoc, *EndLoc));
+        Attr->setInvalid(true);
       }
     }
   }
@@ -3743,6 +3784,23 @@
   ConsumeBracket();
   ConsumeBracket();
 
+  SourceLocation CommonScopeLoc;
+  IdentifierInfo *CommonScopeName = nullptr;
+  if (Tok.is(tok::kw_using)) {
+    Diag(Tok.getLocation(), getLangOpts().CPlusPlus1z
+                                ? diag::warn_cxx14_compat_using_attribute_ns
+                                : diag::ext_using_attribute_ns);
+    ConsumeToken();
+
+    CommonScopeName = TryParseCXX11AttributeIdentifier(CommonScopeLoc);
+    if (!CommonScopeName) {
+      Diag(Tok.getLocation(), diag::err_expected) << tok::identifier;
+      SkipUntil(tok::r_square, tok::colon, StopBeforeMatch);
+    }
+    if (!TryConsumeToken(tok::colon) && CommonScopeName)
+      Diag(Tok.getLocation(), diag::err_expected) << tok::colon;
+  }
+
   llvm::SmallDenseMap<IdentifierInfo*, SourceLocation, 4> SeenAttrs;
 
   while (Tok.isNot(tok::r_square)) {
@@ -3771,6 +3829,16 @@
       }
     }
 
+    if (CommonScopeName) {
+      if (ScopeName) {
+        Diag(ScopeLoc, diag::err_using_attribute_ns_conflict)
+            << SourceRange(CommonScopeLoc);
+      } else {
+        ScopeName = CommonScopeName;
+        ScopeLoc = CommonScopeLoc;
+      }
+    }
+
     bool StandardAttr = IsBuiltInOrStandardCXX11Attribute(AttrName, ScopeName);
     bool AttrParsed = false;
 
diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp
index ad23804..3788b18 100644
--- a/lib/Parse/ParseExpr.cpp
+++ b/lib/Parse/ParseExpr.cpp
@@ -21,15 +21,14 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "clang/Parse/Parser.h"
 #include "RAIIObjectsForParser.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/PrettyStackTrace.h"
+#include "clang/Parse/Parser.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/TypoCorrection.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 using namespace clang;
 
@@ -263,6 +262,9 @@
     Token OpToken = Tok;
     ConsumeToken();
 
+    if (OpToken.is(tok::caretcaret)) {
+      return ExprError(Diag(Tok, diag::err_opencl_logical_exclusive_or));
+    }
     // Bail out when encountering a comma followed by a token which can't
     // possibly be the start of an expression. For instance:
     //   int f() { return 1, }
@@ -428,6 +430,7 @@
       }
     }
 
+    ExprResult OrigLHS = LHS;
     if (!LHS.isInvalid()) {
       // Combine the LHS and RHS into the LHS (e.g. build AST).
       if (TernaryMiddle.isInvalid()) {
@@ -442,13 +445,23 @@
 
         LHS = Actions.ActOnBinOp(getCurScope(), OpToken.getLocation(),
                                  OpToken.getKind(), LHS.get(), RHS.get());
-      } else
+
+      } else {
         LHS = Actions.ActOnConditionalOp(OpToken.getLocation(), ColonLoc,
                                          LHS.get(), TernaryMiddle.get(),
                                          RHS.get());
-    } else
-      // Ensure potential typos in the RHS aren't left undiagnosed.
+      }
+      // In this case, ActOnBinOp or ActOnConditionalOp performed the
+      // CorrectDelayedTyposInExpr check.
+      if (!getLangOpts().CPlusPlus)
+        continue;
+    }
+    // Ensure potential typos aren't left undiagnosed.
+    if (LHS.isInvalid()) {
+      Actions.CorrectDelayedTyposInExpr(OrigLHS);
+      Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
       Actions.CorrectDelayedTyposInExpr(RHS);
+    }
   }
 }
 
@@ -513,7 +526,7 @@
 /// \p isAddressOfOperand exists because an id-expression that is the operand
 /// of address-of gets special treatment due to member pointers. NotCastExpr
 /// is set to true if the token is not the start of a cast-expression, and no
-/// diagnostic is emitted in this case.
+/// diagnostic is emitted in this case and no tokens are consumed.
 ///
 /// \verbatim
 ///       cast-expression: [C99 6.5.4]
@@ -787,6 +800,7 @@
           REVERTIBLE_TYPE_TRAIT(__is_abstract);
           REVERTIBLE_TYPE_TRAIT(__is_arithmetic);
           REVERTIBLE_TYPE_TRAIT(__is_array);
+          REVERTIBLE_TYPE_TRAIT(__is_assignable);
           REVERTIBLE_TYPE_TRAIT(__is_base_of);
           REVERTIBLE_TYPE_TRAIT(__is_class);
           REVERTIBLE_TYPE_TRAIT(__is_complete_type);
@@ -995,6 +1009,8 @@
   case tok::kw__Generic:   // primary-expression: generic-selection [C11 6.5.1]
     Res = ParseGenericSelectionExpression();
     break;
+  case tok::kw___builtin_available:
+    return ParseAvailabilityCheckExpr(Tok.getLocation());
   case tok::kw___builtin_va_arg:
   case tok::kw___builtin_offsetof:
   case tok::kw___builtin_choose_expr:
@@ -1010,15 +1026,24 @@
     //   unary-expression:
     //     ++ cast-expression
     //     -- cast-expression
-    SourceLocation SavedLoc = ConsumeToken();
+    Token SavedTok = Tok;
+    ConsumeToken();
     // One special case is implicitly handled here: if the preceding tokens are
     // an ambiguous cast expression, such as "(T())++", then we recurse to
     // determine whether the '++' is prefix or postfix.
     Res = ParseCastExpression(!getLangOpts().CPlusPlus,
                               /*isAddressOfOperand*/false, NotCastExpr,
                               NotTypeCast);
+    if (NotCastExpr) {
+      // If we return with NotCastExpr = true, we must not consume any tokens,
+      // so put the token back where we found it.
+      assert(Res.isInvalid());
+      UnconsumeToken(SavedTok);
+      return ExprError();
+    }
     if (!Res.isInvalid())
-      Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get());
+      Res = Actions.ActOnUnaryOp(getCurScope(), SavedTok.getLocation(),
+                                 SavedKind, Res.get());
     return Res;
   }
   case tok::amp: {         // unary-expression: '&' cast-expression
@@ -1148,10 +1173,14 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_void:
   case tok::kw_typename:
   case tok::kw_typeof:
-  case tok::kw___vector: {
+  case tok::kw___vector:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case tok::kw_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
+  {
     if (!getLangOpts().CPlusPlus) {
       Diag(Tok, diag::err_expected_expression);
       return ExprError();
@@ -1416,8 +1445,10 @@
 
       // Reject array indices starting with a lambda-expression. '[[' is
       // reserved for attributes.
-      if (CheckProhibitedCXX11Attribute())
+      if (CheckProhibitedCXX11Attribute()) {
+        (void)Actions.CorrectDelayedTyposInExpr(LHS);
         return ExprError();
+      }
 
       BalancedDelimiterTracker T(*this, tok::l_square);
       T.consumeOpen();
@@ -1445,6 +1476,7 @@
 
       SourceLocation RLoc = Tok.getLocation();
 
+      ExprResult OrigLHS = LHS;
       if (!LHS.isInvalid() && !Idx.isInvalid() && !Length.isInvalid() &&
           Tok.is(tok::r_square)) {
         if (ColonLoc.isValid()) {
@@ -1455,7 +1487,10 @@
                                                 Idx.get(), RLoc);
         }
       } else {
-        (void)Actions.CorrectDelayedTyposInExpr(LHS);
+        LHS = ExprError();
+      }
+      if (LHS.isInvalid()) {
+        (void)Actions.CorrectDelayedTyposInExpr(OrigLHS);
         (void)Actions.CorrectDelayedTyposInExpr(Idx);
         (void)Actions.CorrectDelayedTyposInExpr(Length);
         LHS = ExprError();
@@ -2836,3 +2871,117 @@
   tok::TokenKind Kind = Tok.getKind();
   return Actions.ActOnObjCBoolLiteral(ConsumeToken(), Kind);
 }
+
+/// Validate a non-empty availability spec list: diagnose a repeated '*', a
+/// repeated platform, or a missing '*' wildcard. Returns true if invalid.
+static bool CheckAvailabilitySpecList(Parser &P,
+                                      ArrayRef<AvailabilitySpec> AvailSpecs) {
+  llvm::SmallSet<StringRef, 4> Platforms;
+  bool HasOtherPlatformSpec = false;
+  bool Valid = true;
+  for (const auto &Spec : AvailSpecs) {
+    if (Spec.isOtherPlatformSpec()) {
+      if (HasOtherPlatformSpec) {
+        P.Diag(Spec.getBeginLoc(), diag::err_availability_query_repeated_star);
+        Valid = false;
+      }
+
+      HasOtherPlatformSpec = true;
+      continue;
+    }
+
+    bool Inserted = Platforms.insert(Spec.getPlatform()).second;
+    if (!Inserted) {
+      // The platform was already seen: reject a second version spec for it.
+      // For example, we emit an error for:
+      // @available(macos 10.10, macos 10.11, *)
+      StringRef Platform = Spec.getPlatform();
+      P.Diag(Spec.getBeginLoc(), diag::err_availability_query_repeated_platform)
+          << Spec.getEndLoc() << Platform;
+      Valid = false;
+    }
+  }
+
+  if (!HasOtherPlatformSpec) {
+    SourceLocation InsertWildcardLoc = AvailSpecs.back().getEndLoc();
+    P.Diag(InsertWildcardLoc, diag::err_availability_query_wildcard_required)
+        << FixItHint::CreateInsertion(InsertWildcardLoc, ", *");
+    return true;
+  }
+
+  return !Valid;
+}
+
+/// Parse one availability query specification; returns None on error.
+///
+///  availability-spec:
+///     '*'
+///     identifier version-tuple
+Optional<AvailabilitySpec> Parser::ParseAvailabilitySpec() {
+  if (Tok.is(tok::star)) {
+    return AvailabilitySpec(ConsumeToken());
+  } else {
+    // Parse the platform name, which must be an identifier.
+    if (Tok.isNot(tok::identifier)) {
+      Diag(Tok, diag::err_avail_query_expected_platform_name);
+      return None;
+    }
+
+    IdentifierLoc *PlatformIdentifier = ParseIdentifierLoc();
+    SourceRange VersionRange;
+    VersionTuple Version = ParseVersionTuple(VersionRange);
+
+    if (Version.empty())
+      return None;
+
+    StringRef Platform = PlatformIdentifier->Ident->getName();
+
+    if (AvailabilityAttr::getPrettyPlatformName(Platform).empty()) {
+      Diag(PlatformIdentifier->Loc,
+           diag::err_avail_query_unrecognized_platform_name)
+          << Platform;
+      return None;
+    }
+
+    return AvailabilitySpec(Version, Platform, PlatformIdentifier->Loc,
+                            VersionRange.getEnd());
+  }
+}
+
+ExprResult Parser::ParseAvailabilityCheckExpr(SourceLocation BeginLoc) {
+  assert(Tok.is(tok::kw___builtin_available) ||
+         Tok.isObjCAtKeyword(tok::objc_available));
+
+  // Eat the 'available' or '__builtin_available' keyword.
+  ConsumeToken();
+
+  BalancedDelimiterTracker Parens(*this, tok::l_paren);
+  if (Parens.expectAndConsume())
+    return ExprError();
+
+  SmallVector<AvailabilitySpec, 4> AvailSpecs;
+  bool HasError = false;
+  while (true) {
+    Optional<AvailabilitySpec> Spec = ParseAvailabilitySpec();
+    if (!Spec)
+      HasError = true;
+    else
+      AvailSpecs.push_back(*Spec);
+
+    if (!TryConsumeToken(tok::comma))
+      break;
+  }
+
+  if (HasError) {
+    SkipUntil(tok::r_paren, StopAtSemi);
+    return ExprError();
+  }
+
+  CheckAvailabilitySpecList(*this, AvailSpecs); // Diagnoses; result unused.
+
+  if (Parens.consumeClose())
+    return ExprError();
+
+  return Actions.ActOnObjCAvailabilityCheckExpr(AvailSpecs, BeginLoc,
+                                                Parens.getCloseLocation());
+}
diff --git a/lib/Parse/ParseExprCXX.cpp b/lib/Parse/ParseExprCXX.cpp
index 8d19ba7..35661d5 100644
--- a/lib/Parse/ParseExprCXX.cpp
+++ b/lib/Parse/ParseExprCXX.cpp
@@ -427,13 +427,13 @@
     //   namespace-name '::'
     //   nested-name-specifier identifier '::'
     Token Next = NextToken();
-    
+    Sema::NestedNameSpecInfo IdInfo(&II, Tok.getLocation(), Next.getLocation(),
+                                    ObjectType);
+
     // If we get foo:bar, this is almost certainly a typo for foo::bar.  Recover
     // and emit a fixit hint for it.
     if (Next.is(tok::colon) && !ColonIsSacred) {
-      if (Actions.IsInvalidUnlessNestedName(getCurScope(), SS, II, 
-                                            Tok.getLocation(), 
-                                            Next.getLocation(), ObjectType,
+      if (Actions.IsInvalidUnlessNestedName(getCurScope(), SS, IdInfo,
                                             EnteringContext) &&
           // If the token after the colon isn't an identifier, it's still an
           // error, but they probably meant something else strange so don't
@@ -459,8 +459,7 @@
 
     if (Next.is(tok::coloncolon)) {
       if (CheckForDestructor && GetLookAheadToken(2).is(tok::tilde) &&
-          !Actions.isNonTypeNestedNameSpecifier(
-              getCurScope(), SS, Tok.getLocation(), II, ObjectType)) {
+          !Actions.isNonTypeNestedNameSpecifier(getCurScope(), SS, IdInfo)) {
         *MayBePseudoDestructor = true;
         return false;
       }
@@ -496,8 +495,8 @@
 
       bool IsCorrectedToColon = false;
       bool *CorrectionFlagPtr = ColonIsSacred ? &IsCorrectedToColon : nullptr;
-      if (Actions.ActOnCXXNestedNameSpecifier(getCurScope(), II, IdLoc, CCLoc,
-                                              ObjectType, EnteringContext, SS,
+      if (Actions.ActOnCXXNestedNameSpecifier(getCurScope(), IdInfo,
+                                              EnteringContext, SS,
                                               false, CorrectionFlagPtr)) {
         // Identifier is not recognized as a nested name, but we can have
         // mistyped '::' instead of ':'.
@@ -849,8 +848,16 @@
     IdentifierInfo *Id = nullptr;
     SourceLocation EllipsisLoc;
     ExprResult Init;
-    
-    if (Tok.is(tok::kw_this)) {
+
+    if (Tok.is(tok::star)) {
+      Loc = ConsumeToken(); 
+      if (Tok.is(tok::kw_this)) {
+        ConsumeToken();     
+        Kind = LCK_StarThis;      
+      } else {
+        return DiagResult(diag::err_expected_star_this_capture);
+      }
+    } else if (Tok.is(tok::kw_this)) {
       Kind = LCK_This;
       Loc = ConsumeToken();
     } else {
@@ -995,6 +1002,7 @@
     //          return y;
     //     }
     //   };
+    // }
     // If x was not const, the second use would require 'L' to capture, and
     // that would be an error.
 
@@ -1045,6 +1053,58 @@
   return false;
 }
 
+static void
+tryConsumeMutableOrConstexprToken(Parser &P, SourceLocation &MutableLoc,
+                                  SourceLocation &ConstexprLoc,
+                                  SourceLocation &DeclEndLoc) {
+  assert(MutableLoc.isInvalid());
+  assert(ConstexprLoc.isInvalid());
+  // Consume 'constexpr' and 'mutable' in any order, setting DeclEndLoc to the
+  // location of the last keyword consumed. A repeated keyword is diagnosed
+  // with a removal fix-it and still consumed, so parsing recovers.
+
+  while (true) {
+    switch (P.getCurToken().getKind()) {
+    case tok::kw_mutable: {
+      if (MutableLoc.isValid()) {
+        P.Diag(P.getCurToken().getLocation(),
+               diag::err_lambda_decl_specifier_repeated)
+            << 0 << FixItHint::CreateRemoval(P.getCurToken().getLocation());
+      }
+      MutableLoc = P.ConsumeToken();
+      DeclEndLoc = MutableLoc;
+      break /*switch*/;
+    }
+    case tok::kw_constexpr:
+      if (ConstexprLoc.isValid()) {
+        P.Diag(P.getCurToken().getLocation(),
+               diag::err_lambda_decl_specifier_repeated)
+            << 1 << FixItHint::CreateRemoval(P.getCurToken().getLocation());
+      }
+      ConstexprLoc = P.ConsumeToken();
+      DeclEndLoc = ConstexprLoc;
+      break /*switch*/;
+    default:
+      return;
+    }
+  }
+}
+
+static void
+addConstexprToLambdaDeclSpecifier(Parser &P, SourceLocation ConstexprLoc,
+                                  DeclSpec &DS) {
+  if (ConstexprLoc.isValid()) { // No-op for an invalid location.
+    P.Diag(ConstexprLoc, !P.getLangOpts().CPlusPlus1z
+                             ? diag::ext_constexpr_on_lambda_cxx1z
+                             : diag::warn_cxx14_compat_constexpr_on_lambda);
+    const char *PrevSpec = nullptr;
+    unsigned DiagID = 0;
+    DS.SetConstexprSpec(ConstexprLoc, PrevSpec, DiagID);
+    assert(PrevSpec == nullptr && DiagID == 0 &&
+           "Constexpr cannot have been set previously!");
+  }
+}
+
 /// ParseLambdaExpressionAfterIntroducer - Parse the rest of a lambda
 /// expression.
 ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
@@ -1103,10 +1163,13 @@
     // compatible with MSVC.
     MaybeParseMicrosoftDeclSpecs(Attr, &DeclEndLoc);
 
-    // Parse 'mutable'[opt].
+    // Parse mutable-opt and/or constexpr-opt, and update the DeclEndLoc.
     SourceLocation MutableLoc;
-    if (TryConsumeToken(tok::kw_mutable, MutableLoc))
-      DeclEndLoc = MutableLoc;
+    SourceLocation ConstexprLoc;
+    tryConsumeMutableOrConstexprToken(*this, MutableLoc, ConstexprLoc,
+                                      DeclEndLoc);
+    
+    addConstexprToLambdaDeclSpecifier(*this, ConstexprLoc, DS);
 
     // Parse exception-specification[opt].
     ExceptionSpecificationType ESpecType = EST_None;
@@ -1164,7 +1227,8 @@
                                            LParenLoc, FunLocalRangeEnd, D,
                                            TrailingReturnType),
                   Attr, DeclEndLoc);
-  } else if (Tok.isOneOf(tok::kw_mutable, tok::arrow, tok::kw___attribute) ||
+  } else if (Tok.isOneOf(tok::kw_mutable, tok::arrow, tok::kw___attribute,
+                         tok::kw_constexpr) ||
              (Tok.is(tok::l_square) && NextToken().is(tok::l_square))) {
     // It's common to forget that one needs '()' before 'mutable', an attribute
     // specifier, or the result type. Deal with this.
@@ -1174,6 +1238,7 @@
     case tok::arrow: TokKind = 1; break;
     case tok::kw___attribute:
     case tok::l_square: TokKind = 2; break;
+    case tok::kw_constexpr: TokKind = 3; break;
     default: llvm_unreachable("Unknown token kind");
     }
 
@@ -1661,46 +1726,58 @@
 /// [GNU]   type-specifier-seq declarator simple-asm-expr[opt] attributes[opt]
 ///             '=' assignment-expression
 ///
-/// \param ExprOut if the condition was parsed as an expression, the parsed
-/// expression.
+/// In C++1z, a condition may in some contexts be preceded by an
+/// optional init-statement. This function will parse that too.
 ///
-/// \param DeclOut if the condition was parsed as a declaration, the parsed
-/// declaration.
+/// \param InitStmt If non-null, an init-statement is permitted, and if present
+/// will be parsed and stored here.
 ///
 /// \param Loc The location of the start of the statement that requires this
 /// condition, e.g., the "for" in a for loop.
 ///
-/// \param ConvertToBoolean Whether the condition expression should be
-/// converted to a boolean value.
-///
-/// \returns true if there was a parsing, false otherwise.
-bool Parser::ParseCXXCondition(ExprResult &ExprOut,
-                               Decl *&DeclOut,
-                               SourceLocation Loc,
-                               bool ConvertToBoolean) {
+/// \returns The parsed condition.
+Sema::ConditionResult Parser::ParseCXXCondition(StmtResult *InitStmt,
+                                                SourceLocation Loc,
+                                                Sema::ConditionKind CK) {
   if (Tok.is(tok::code_completion)) {
     Actions.CodeCompleteOrdinaryName(getCurScope(), Sema::PCC_Condition);
     cutOffParsing();
-    return true;
+    return Sema::ConditionError();
   }
 
   ParsedAttributesWithRange attrs(AttrFactory);
   MaybeParseCXX11Attributes(attrs);
 
-  if (!isCXXConditionDeclaration()) {
+  // Determine what kind of thing we have.
+  switch (isCXXConditionDeclarationOrInitStatement(InitStmt)) {
+  case ConditionOrInitStatement::Expression: {
     ProhibitAttributes(attrs);
 
     // Parse the expression.
-    ExprOut = ParseExpression(); // expression
-    DeclOut = nullptr;
-    if (ExprOut.isInvalid())
-      return true;
+    ExprResult Expr = ParseExpression(); // expression
+    if (Expr.isInvalid())
+      return Sema::ConditionError();
 
-    // If required, convert to a boolean value.
-    if (ConvertToBoolean)
-      ExprOut
-        = Actions.ActOnBooleanCondition(getCurScope(), Loc, ExprOut.get());
-    return ExprOut.isInvalid();
+    if (InitStmt && Tok.is(tok::semi)) {
+      *InitStmt = Actions.ActOnExprStmt(Expr.get());
+      ConsumeToken();
+      return ParseCXXCondition(nullptr, Loc, CK);
+    }
+
+    return Actions.ActOnCondition(getCurScope(), Loc, Expr.get(), CK);
+  }
+
+  case ConditionOrInitStatement::InitStmtDecl: {
+    SourceLocation DeclStart = Tok.getLocation(), DeclEnd;
+    DeclGroupPtrTy DG = ParseSimpleDeclaration(
+        Declarator::InitStmtContext, DeclEnd, attrs, /*RequireSemi=*/true);
+    *InitStmt = Actions.ActOnDeclStmt(DG, DeclStart, DeclEnd);
+    return ParseCXXCondition(nullptr, Loc, CK);
+  }
+
+  case ConditionOrInitStatement::ConditionDecl:
+  case ConditionOrInitStatement::Error:
+    break;
   }
 
   // type-specifier-seq
@@ -1718,7 +1795,7 @@
     ExprResult AsmLabel(ParseSimpleAsm(&Loc));
     if (AsmLabel.isInvalid()) {
       SkipUntil(tok::semi, StopAtSemi);
-      return true;
+      return Sema::ConditionError();
     }
     DeclaratorInfo.setAsmLabel(AsmLabel.get());
     DeclaratorInfo.SetRangeEnd(Loc);
@@ -1730,8 +1807,9 @@
   // Type-check the declaration itself.
   DeclResult Dcl = Actions.ActOnCXXConditionDeclaration(getCurScope(), 
                                                         DeclaratorInfo);
-  DeclOut = Dcl.get();
-  ExprOut = ExprError();
+  if (Dcl.isInvalid())
+    return Sema::ConditionError();
+  Decl *DeclOut = Dcl.get();
 
   // '=' assignment-expression
   // If a '==' or '+=' is found, suggest a fixit to '='.
@@ -1751,12 +1829,11 @@
     SourceLocation LParen = ConsumeParen(), RParen = LParen;
     if (SkipUntil(tok::r_paren, StopAtSemi | StopBeforeMatch))
       RParen = ConsumeParen();
-    Diag(DeclOut ? DeclOut->getLocation() : LParen,
+    Diag(DeclOut->getLocation(),
          diag::err_expected_init_in_condition_lparen)
       << SourceRange(LParen, RParen);
   } else {
-    Diag(DeclOut ? DeclOut->getLocation() : Tok.getLocation(),
-         diag::err_expected_init_in_condition);
+    Diag(DeclOut->getLocation(), diag::err_expected_init_in_condition);
   }
 
   if (!InitExpr.isInvalid())
@@ -1765,12 +1842,8 @@
   else
     Actions.ActOnInitializerError(DeclOut);
 
-  // FIXME: Build a reference to this declaration? Convert it to bool?
-  // (This is currently handled by Sema).
-
   Actions.FinalizeDeclaration(DeclOut);
-  
-  return false;
+  return Actions.ActOnConditionVariable(DeclOut, Loc, CK);
 }
 
 /// ParseCXXSimpleTypeSpecifier - [C++ 7.1.5.2] Simple type specifiers.
@@ -1866,6 +1939,9 @@
   case tok::kw_double:
     DS.SetTypeSpecType(DeclSpec::TST_double, Loc, PrevSpec, DiagID, Policy);
     break;
+  case tok::kw___float128:
+    DS.SetTypeSpecType(DeclSpec::TST_float128, Loc, PrevSpec, DiagID, Policy);
+    break;
   case tok::kw_wchar_t:
     DS.SetTypeSpecType(DeclSpec::TST_wchar, Loc, PrevSpec, DiagID, Policy);
     break;
@@ -3096,8 +3172,7 @@
   Toks.push_back(Tok);
   // Re-enter the stored parenthesized tokens into the token stream, so we may
   // parse them now.
-  PP.EnterTokenStream(Toks.data(), Toks.size(),
-                      true/*DisableMacroExpansion*/, false/*OwnsTokens*/);
+  PP.EnterTokenStream(Toks, true /*DisableMacroExpansion*/);
   // Drop the current token and bring the first cached one. It's the same token
   // as when we entered this function.
   ConsumeAnyToken();
diff --git a/lib/Parse/ParseInit.cpp b/lib/Parse/ParseInit.cpp
index 2cdb9d3..4a68942 100644
--- a/lib/Parse/ParseInit.cpp
+++ b/lib/Parse/ParseInit.cpp
@@ -11,13 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Parse/Parser.h"
 #include "RAIIObjectsForParser.h"
 #include "clang/Parse/ParseDiagnostic.h"
+#include "clang/Parse/Parser.h"
 #include "clang/Sema/Designator.h"
 #include "clang/Sema/Scope.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
 
diff --git a/lib/Parse/ParseObjc.cpp b/lib/Parse/ParseObjc.cpp
index 4f15a00..403688a 100644
--- a/lib/Parse/ParseObjc.cpp
+++ b/lib/Parse/ParseObjc.cpp
@@ -21,6 +21,7 @@
 #include "clang/Sema/Scope.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+
 using namespace clang;
 
 /// Skips attributes after an Objective-C @ directive. Emits a diagnostic.
@@ -99,16 +100,20 @@
   Sema &Actions;
   Scope *S;
   ObjCTypeParamList *Params;
+
 public:
   ObjCTypeParamListScope(Sema &Actions, Scope *S)
       : Actions(Actions), S(S), Params(nullptr) {}
+
   ~ObjCTypeParamListScope() {
     leave();
   }
+
   void enter(ObjCTypeParamList *P) {
     assert(!Params);
     Params = P;
   }
+
   void leave() {
     if (Params)
       Actions.popObjCTypeParamList(S, Params);
@@ -2001,7 +2006,6 @@
   }
   HelperActionsForIvarDeclarations(interfaceDecl, atLoc,
                                    T, AllIvarDecls, false);
-  return;
 }
 
 ///   objc-protocol-declaration:
@@ -2654,6 +2658,12 @@
 /// StashAwayMethodOrFunctionBodyTokens -  Consume the tokens and store them 
 /// for later parsing.
 void Parser::StashAwayMethodOrFunctionBodyTokens(Decl *MDecl) {
+  if (SkipFunctionBodies && (!MDecl || Actions.canSkipFunctionBody(MDecl)) &&
+      trySkippingFunctionBody()) {
+    Actions.ActOnSkippedFunctionBody(MDecl);
+    return;
+  }
+
   LexedMethod* LM = new LexedMethod(this, MDecl);
   CurParsedObjCImpl->LateParsedObjCMethods.push_back(LM);
   CachedTokens &Toks = LM->Toks;
@@ -2850,6 +2860,8 @@
       return ParsePostfixExpressionSuffix(ParseObjCProtocolExpression(AtLoc));
     case tok::objc_selector:
       return ParsePostfixExpressionSuffix(ParseObjCSelectorExpression(AtLoc));
+    case tok::objc_available:
+      return ParseAvailabilityCheckExpr(AtLoc);
       default: {
         const char *str = nullptr;
         if (GetLookAheadToken(1).is(tok::l_brace)) {
@@ -2978,7 +2990,6 @@
       InMessageExpression)
     return false;
   
-  
   ParsedType Type;
 
   if (Tok.is(tok::annot_typename)) 
@@ -3407,6 +3418,7 @@
   ExprVector ElementExprs;                   // array elements.
   ConsumeBracket(); // consume the l_square.
 
+  bool HasInvalidEltExpr = false;
   while (Tok.isNot(tok::r_square)) {
     // Parse list of array element expressions (all must be id types).
     ExprResult Res(ParseAssignmentExpression());
@@ -3418,11 +3430,15 @@
       return Res;
     }    
     
+    Res = Actions.CorrectDelayedTyposInExpr(Res.get());
+    if (Res.isInvalid())
+      HasInvalidEltExpr = true;
+
     // Parse the ellipsis that indicates a pack expansion.
     if (Tok.is(tok::ellipsis))
       Res = Actions.ActOnPackExpansion(Res.get(), ConsumeToken());    
     if (Res.isInvalid())
-      return true;
+      HasInvalidEltExpr = true;
 
     ElementExprs.push_back(Res.get());
 
@@ -3433,6 +3449,10 @@
                                                             << tok::comma);
   }
   SourceLocation EndLoc = ConsumeBracket(); // location of ']'
+
+  if (HasInvalidEltExpr)
+    return ExprError();
+
   MultiExprArg Args(ElementExprs);
   return Actions.BuildObjCArrayLiteral(SourceRange(AtLoc, EndLoc), Args);
 }
@@ -3440,6 +3460,7 @@
 ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
   SmallVector<ObjCDictionaryElement, 4> Elements; // dictionary elements.
   ConsumeBrace(); // consume the l_square.
+  bool HasInvalidEltExpr = false;
   while (Tok.isNot(tok::r_brace)) {
     // Parse the comma separated key : value expressions.
     ExprResult KeyExpr;
@@ -3469,7 +3490,15 @@
       return ValueExpr;
     }
     
-    // Parse the ellipsis that designates this as a pack expansion.
+    // Check the key and value for possible typos
+    KeyExpr = Actions.CorrectDelayedTyposInExpr(KeyExpr.get());
+    ValueExpr = Actions.CorrectDelayedTyposInExpr(ValueExpr.get());
+    if (KeyExpr.isInvalid() || ValueExpr.isInvalid())
+      HasInvalidEltExpr = true;
+
+    // Parse the ellipsis that designates this as a pack expansion. Do not
+    // ActOnPackExpansion here, leave it to template instantiation time where
+    // we can get better diagnostics.
     SourceLocation EllipsisLoc;
     if (getLangOpts().CPlusPlus)
       TryConsumeToken(tok::ellipsis, EllipsisLoc);
@@ -3486,6 +3515,9 @@
                                                             << tok::comma);
   }
   SourceLocation EndLoc = ConsumeBrace();
+
+  if (HasInvalidEltExpr)
+    return ExprError();
   
   // Create the ObjCDictionaryLiteral.
   return Actions.BuildObjCDictionaryLiteral(SourceRange(AtLoc, EndLoc),
@@ -3607,7 +3639,7 @@
                                              T.getOpenLocation(),
                                              T.getCloseLocation(),
                                              !HasOptionalParen);
- }
+}
 
 void Parser::ParseLexedObjCMethodDefs(LexedMethod &LM, bool parseMethod) {
   // MCDecl might be null due to error in method or c-function  prototype, etc.
@@ -3625,8 +3657,8 @@
   // Append the current token at the end of the new token stream so that it
   // doesn't get lost.
   LM.Toks.push_back(Tok);
-  PP.EnterTokenStream(LM.Toks.data(), LM.Toks.size(), true, false);
-  
+  PP.EnterTokenStream(LM.Toks, true);
+
   // Consume the previously pushed token.
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
     
@@ -3665,6 +3697,4 @@
       while (Tok.getLocation() != OrigLoc && Tok.isNot(tok::eof))
         ConsumeAnyToken();
   }
-  
-  return;
 }
diff --git a/lib/Parse/ParseOpenMP.cpp b/lib/Parse/ParseOpenMP.cpp
index 0531847..d4cdc8e 100644
--- a/lib/Parse/ParseOpenMP.cpp
+++ b/lib/Parse/ParseOpenMP.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RAIIObjectsForParser.h"
-#include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/Parse/ParseDiagnostic.h"
@@ -26,78 +25,539 @@
 // OpenMP declarative directives.
 //===----------------------------------------------------------------------===//
 
+namespace {
+enum OpenMPDirectiveKindEx {
+  OMPD_cancellation = OMPD_unknown + 1, // First value past OMPD_unknown.
+  OMPD_data,
+  OMPD_declare,
+  OMPD_end,
+  OMPD_end_declare,
+  OMPD_enter,
+  OMPD_exit,
+  OMPD_point,
+  OMPD_reduction,
+  OMPD_target_enter,
+  OMPD_target_exit,
+  OMPD_update,
+  OMPD_distribute_parallel
+};
+
+class ThreadprivateListParserHelper final {
+  SmallVector<Expr *, 4> Identifiers;
+  Parser *P;
+
+public:
+  ThreadprivateListParserHelper(Parser *P) : P(P) {}
+  void operator()(CXXScopeSpec &SS, DeclarationNameInfo NameInfo) {
+    ExprResult Res =
+        P->getActions().ActOnOpenMPIdExpression(P->getCurScope(), SS, NameInfo);
+    if (Res.isUsable()) // Unusable results are dropped, not recorded.
+      Identifiers.push_back(Res.get());
+  }
+  llvm::ArrayRef<Expr *> getIdentifiers() const { return Identifiers; }
+};
+} // namespace
+
+// Map a token spelling to an extended OpenMP directive kind, i.e. a value
+// from either OpenMPDirectiveKind or OpenMPDirectiveKindEx.
+static unsigned getOpenMPDirectiveKindEx(StringRef S) {
+  auto DKind = getOpenMPDirectiveKind(S);
+  if (DKind != OMPD_unknown)
+    return DKind;
+
+  return llvm::StringSwitch<unsigned>(S)
+      .Case("cancellation", OMPD_cancellation)
+      .Case("data", OMPD_data)
+      .Case("declare", OMPD_declare)
+      .Case("end", OMPD_end)
+      .Case("enter", OMPD_enter)
+      .Case("exit", OMPD_exit)
+      .Case("point", OMPD_point)
+      .Case("reduction", OMPD_reduction)
+      .Case("update", OMPD_update)
+      .Default(OMPD_unknown);
+}
+
 static OpenMPDirectiveKind ParseOpenMPDirectiveKind(Parser &P) {
   // Array of foldings: F[i][0] F[i][1] ===> F[i][2].
   // E.g.: OMPD_for OMPD_simd ===> OMPD_for_simd
   // TODO: add other combined directives in topological order.
-  const OpenMPDirectiveKind F[][3] = {
-      {OMPD_unknown /*cancellation*/, OMPD_unknown /*point*/,
-       OMPD_cancellation_point},
-      {OMPD_target, OMPD_unknown /*data*/, OMPD_target_data},
-      {OMPD_for, OMPD_simd, OMPD_for_simd},
-      {OMPD_parallel, OMPD_for, OMPD_parallel_for},
-      {OMPD_parallel_for, OMPD_simd, OMPD_parallel_for_simd},
-      {OMPD_parallel, OMPD_sections, OMPD_parallel_sections},
-      {OMPD_taskloop, OMPD_simd, OMPD_taskloop_simd}};
+  static const unsigned F[][3] = {
+    { OMPD_cancellation, OMPD_point, OMPD_cancellation_point },
+    { OMPD_declare, OMPD_reduction, OMPD_declare_reduction },
+    { OMPD_declare, OMPD_simd, OMPD_declare_simd },
+    { OMPD_declare, OMPD_target, OMPD_declare_target },
+    { OMPD_distribute, OMPD_parallel, OMPD_distribute_parallel },
+    { OMPD_distribute_parallel, OMPD_for, OMPD_distribute_parallel_for },
+    { OMPD_distribute_parallel_for, OMPD_simd, 
+      OMPD_distribute_parallel_for_simd },
+    { OMPD_distribute, OMPD_simd, OMPD_distribute_simd },
+    { OMPD_end, OMPD_declare, OMPD_end_declare },
+    { OMPD_end_declare, OMPD_target, OMPD_end_declare_target },
+    { OMPD_target, OMPD_data, OMPD_target_data },
+    { OMPD_target, OMPD_enter, OMPD_target_enter },
+    { OMPD_target, OMPD_exit, OMPD_target_exit },
+    { OMPD_target, OMPD_update, OMPD_target_update },
+    { OMPD_target_enter, OMPD_data, OMPD_target_enter_data },
+    { OMPD_target_exit, OMPD_data, OMPD_target_exit_data },
+    { OMPD_for, OMPD_simd, OMPD_for_simd },
+    { OMPD_parallel, OMPD_for, OMPD_parallel_for },
+    { OMPD_parallel_for, OMPD_simd, OMPD_parallel_for_simd },
+    { OMPD_parallel, OMPD_sections, OMPD_parallel_sections },
+    { OMPD_taskloop, OMPD_simd, OMPD_taskloop_simd },
+    { OMPD_target, OMPD_parallel, OMPD_target_parallel },
+    { OMPD_target, OMPD_simd, OMPD_target_simd },
+    { OMPD_target_parallel, OMPD_for, OMPD_target_parallel_for },
+    { OMPD_target_parallel_for, OMPD_simd, OMPD_target_parallel_for_simd },
+    { OMPD_teams, OMPD_distribute, OMPD_teams_distribute }
+  };
+  enum { CancellationPoint = 0, DeclareReduction = 1, TargetData = 2 };
   auto Tok = P.getCurToken();
-  auto DKind =
+  unsigned DKind =
       Tok.isAnnotation()
-          ? OMPD_unknown
-          : getOpenMPDirectiveKind(P.getPreprocessor().getSpelling(Tok));
+          ? static_cast<unsigned>(OMPD_unknown)
+          : getOpenMPDirectiveKindEx(P.getPreprocessor().getSpelling(Tok));
+  if (DKind == OMPD_unknown)
+    return OMPD_unknown;
 
-  bool TokenMatched = false;
   for (unsigned i = 0; i < llvm::array_lengthof(F); ++i) {
-    if (!Tok.isAnnotation() && DKind == OMPD_unknown) {
-      TokenMatched =
-          (i == 0) &&
-          !P.getPreprocessor().getSpelling(Tok).compare("cancellation");
-    } else {
-      TokenMatched = DKind == F[i][0] && DKind != OMPD_unknown;
-    }
+    if (DKind != F[i][0])
+      continue;
 
-    if (TokenMatched) {
-      Tok = P.getPreprocessor().LookAhead(0);
-      auto TokenIsAnnotation = Tok.isAnnotation();
-      auto SDKind =
-          TokenIsAnnotation
-              ? OMPD_unknown
-              : getOpenMPDirectiveKind(P.getPreprocessor().getSpelling(Tok));
+    Tok = P.getPreprocessor().LookAhead(0);
+    unsigned SDKind =
+        Tok.isAnnotation()
+            ? static_cast<unsigned>(OMPD_unknown)
+            : getOpenMPDirectiveKindEx(P.getPreprocessor().getSpelling(Tok));
+    if (SDKind == OMPD_unknown)
+      continue;
 
-      if (!TokenIsAnnotation && SDKind == OMPD_unknown) {
-        TokenMatched =
-            ((i == 0) &&
-             !P.getPreprocessor().getSpelling(Tok).compare("point")) ||
-            ((i == 1) && !P.getPreprocessor().getSpelling(Tok).compare("data"));
-      } else {
-        TokenMatched = SDKind == F[i][1] && SDKind != OMPD_unknown;
-      }
-
-      if (TokenMatched) {
-        P.ConsumeToken();
-        DKind = F[i][2];
-      }
+    if (SDKind == F[i][1]) {
+      P.ConsumeToken();
+      DKind = F[i][2];
     }
   }
-  return DKind;
+  return DKind < OMPD_unknown ? static_cast<OpenMPDirectiveKind>(DKind)
+                              : OMPD_unknown;
+}
+
+static DeclarationName parseOpenMPReductionId(Parser &P) {
+  Token Tok = P.getCurToken();
+  Sema &Actions = P.getActions();
+  OverloadedOperatorKind OOK = OO_None;
+  // Allow the 'operator' keyword before a C++ operator token.
+  bool WithOperator = false;
+  if (Tok.is(tok::kw_operator)) {
+    P.ConsumeToken();
+    Tok = P.getCurToken();
+    WithOperator = true;
+  }
+  switch (Tok.getKind()) {
+  case tok::plus: // '+'
+    OOK = OO_Plus;
+    break;
+  case tok::minus: // '-'
+    OOK = OO_Minus;
+    break;
+  case tok::star: // '*'
+    OOK = OO_Star;
+    break;
+  case tok::amp: // '&'
+    OOK = OO_Amp;
+    break;
+  case tok::pipe: // '|'
+    OOK = OO_Pipe;
+    break;
+  case tok::caret: // '^'
+    OOK = OO_Caret;
+    break;
+  case tok::ampamp: // '&&'
+    OOK = OO_AmpAmp;
+    break;
+  case tok::pipepipe: // '||'
+    OOK = OO_PipePipe;
+    break;
+  case tok::identifier: // identifier; after 'operator' falls through to error
+    if (!WithOperator)
+      break;
+  default:
+    P.Diag(Tok.getLocation(), diag::err_omp_expected_reduction_identifier);
+    P.SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end,
+                Parser::StopBeforeMatch);
+    return DeclarationName();
+  }
+  P.ConsumeToken();
+  auto &DeclNames = Actions.getASTContext().DeclarationNames;
+  return OOK == OO_None ? DeclNames.getIdentifier(Tok.getIdentifierInfo())
+                        : DeclNames.getCXXOperatorName(OOK);
+}
+
+/// \brief Parse 'omp declare reduction' construct.
+///
+///       declare-reduction-directive:
+///        annot_pragma_openmp 'declare' 'reduction'
+///        '(' <reduction_id> ':' <type> {',' <type>} ':' <expression> ')'
+///        ['initializer' '(' ('omp_priv' '=' <expression>)|<function_call> ')']
+///        annot_pragma_openmp_end
+/// <reduction_id> is either a base language identifier or one of the following
+/// operators: '+', '-', '*', '&', '|', '^', '&&' and '||'.
+/// Returns an empty DeclGroupPtrTy on the early error exits below.
+Parser::DeclGroupPtrTy
+Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) {
+  // Parse '('.
+  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+  if (T.expectAndConsume(diag::err_expected_lparen_after,
+                         getOpenMPDirectiveName(OMPD_declare_reduction))) {
+    SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
+    return DeclGroupPtrTy();
+  }
+
+  DeclarationName Name = parseOpenMPReductionId(*this);
+  if (Name.isEmpty() && Tok.is(tok::annot_pragma_openmp_end))
+    return DeclGroupPtrTy();
+
+  // Consume the ':' that follows <reduction_id>.
+  bool IsCorrect = !ExpectAndConsume(tok::colon);
+
+  if (!IsCorrect && Tok.is(tok::annot_pragma_openmp_end))
+    return DeclGroupPtrTy();
+
+  IsCorrect = IsCorrect && !Name.isEmpty();
+
+  if (Tok.is(tok::colon) || Tok.is(tok::annot_pragma_openmp_end)) {
+    Diag(Tok.getLocation(), diag::err_expected_type);
+    IsCorrect = false;
+  }
+
+  if (!IsCorrect && Tok.is(tok::annot_pragma_openmp_end))
+    return DeclGroupPtrTy();
+
+  SmallVector<std::pair<QualType, SourceLocation>, 8> ReductionTypes;
+  // Parse the comma-separated list of types, stopping at the ':' token.
+  do {
+    ColonProtectionRAIIObject ColonRAII(*this);
+    SourceRange Range;
+    TypeResult TR = ParseTypeName(&Range, Declarator::PrototypeContext, AS);
+    if (TR.isUsable()) {
+      auto ReductionType =
+          Actions.ActOnOpenMPDeclareReductionType(Range.getBegin(), TR);
+      if (!ReductionType.isNull()) {
+        ReductionTypes.push_back(
+            std::make_pair(ReductionType, Range.getBegin()));
+      }
+    } else {
+      SkipUntil(tok::comma, tok::colon, tok::annot_pragma_openmp_end,
+                StopBeforeMatch);
+    }
+
+    if (Tok.is(tok::colon) || Tok.is(tok::annot_pragma_openmp_end))
+      break;
+
+    // Consume ','.
+    if (ExpectAndConsume(tok::comma)) {
+      IsCorrect = false;
+      if (Tok.is(tok::annot_pragma_openmp_end)) {
+        Diag(Tok.getLocation(), diag::err_expected_type);
+        return DeclGroupPtrTy();
+      }
+    }
+  } while (Tok.isNot(tok::annot_pragma_openmp_end));
+
+  if (ReductionTypes.empty()) {
+    SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
+    return DeclGroupPtrTy();
+  }
+
+  if (!IsCorrect && Tok.is(tok::annot_pragma_openmp_end))
+    return DeclGroupPtrTy();
+
+  // Consume the ':' that precedes the <combiner> expression.
+  if (ExpectAndConsume(tok::colon))
+    IsCorrect = false;
+
+  if (Tok.is(tok::annot_pragma_openmp_end)) {
+    Diag(Tok.getLocation(), diag::err_expected_expression);
+    return DeclGroupPtrTy();
+  }
+
+  DeclGroupPtrTy DRD = Actions.ActOnOpenMPDeclareReductionDirectiveStart(
+      getCurScope(), Actions.getCurLexicalContext(), Name, ReductionTypes, AS);
+
+  // Parse the <combiner> and the optional initializer once per decl in DRD;
+  // the same tokens are re-parsed by reverting all but the last iteration.
+  unsigned I = 0, E = ReductionTypes.size();
+  for (auto *D : DRD.get()) {
+    TentativeParsingAction TPA(*this);
+    ParseScope OMPDRScope(this, Scope::FnScope | Scope::DeclScope |
+                                    Scope::OpenMPDirectiveScope);
+    // Parse <combiner> expression.
+    Actions.ActOnOpenMPDeclareReductionCombinerStart(getCurScope(), D);
+    ExprResult CombinerResult =
+        Actions.ActOnFinishFullExpr(ParseAssignmentExpression().get(),
+                                    D->getLocation(), /*DiscardedValue=*/true);
+    Actions.ActOnOpenMPDeclareReductionCombinerEnd(D, CombinerResult.get());
+
+    if (CombinerResult.isInvalid() && Tok.isNot(tok::r_paren) &&
+        Tok.isNot(tok::annot_pragma_openmp_end)) {
+      TPA.Commit();
+      IsCorrect = false;
+      break;
+    }
+    IsCorrect = !T.consumeClose() && IsCorrect && CombinerResult.isUsable();
+    ExprResult InitializerResult;
+    if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+      // Parse the 'initializer' keyword, then its parenthesized expression.
+      if (Tok.is(tok::identifier) &&
+          Tok.getIdentifierInfo()->isStr("initializer"))
+        ConsumeToken();
+      else {
+        Diag(Tok.getLocation(), diag::err_expected) << "'initializer'";
+        TPA.Commit();
+        IsCorrect = false;
+        break;
+      }
+      // Parse '('.
+      BalancedDelimiterTracker T(*this, tok::l_paren,
+                                 tok::annot_pragma_openmp_end);
+      IsCorrect =
+          !T.expectAndConsume(diag::err_expected_lparen_after, "initializer") &&
+          IsCorrect;
+      if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+        ParseScope OMPDRScope(this, Scope::FnScope | Scope::DeclScope |
+                                        Scope::OpenMPDirectiveScope);
+        // Parse expression.
+        Actions.ActOnOpenMPDeclareReductionInitializerStart(getCurScope(), D);
+        InitializerResult = Actions.ActOnFinishFullExpr(
+            ParseAssignmentExpression().get(), D->getLocation(),
+            /*DiscardedValue=*/true);
+        Actions.ActOnOpenMPDeclareReductionInitializerEnd(
+            D, InitializerResult.get());
+        if (InitializerResult.isInvalid() && Tok.isNot(tok::r_paren) &&
+            Tok.isNot(tok::annot_pragma_openmp_end)) {
+          TPA.Commit();
+          IsCorrect = false;
+          break;
+        }
+        IsCorrect =
+            !T.consumeClose() && IsCorrect && !InitializerResult.isInvalid();
+      }
+    }
+
+    ++I;
+    // Revert parsing if not the last type, otherwise accept it, we're done with
+    // parsing.
+    if (I != E)
+      TPA.Revert();
+    else
+      TPA.Commit();
+  }
+  return Actions.ActOnOpenMPDeclareReductionDirectiveEnd(getCurScope(), DRD,
+                                                         IsCorrect);
+}
+
+namespace {
+/// RAII that recreates function context for correct parsing of clauses of
+/// 'declare simd' construct.
+/// OpenMP, 2.8.2 declare simd Construct
+/// The expressions appearing in the clauses of this directive are evaluated in
+/// the scope of the arguments of the function declaration or definition.
+class FNContextRAII final {
+  Parser &P;
+  Sema::CXXThisScopeRAII *ThisScope;
+  Parser::ParseScope *TempScope;
+  Parser::ParseScope *FnScope;
+  bool HasTemplateScope = false;
+  bool HasFunScope = false;
+  FNContextRAII() = delete;
+  FNContextRAII(const FNContextRAII &) = delete;
+  FNContextRAII &operator=(const FNContextRAII &) = delete;
+
+public:
+  FNContextRAII(Parser &P, Parser::DeclGroupPtrTy Ptr) : P(P) {
+    Decl *D = *Ptr.get().begin();
+    NamedDecl *ND = dyn_cast<NamedDecl>(D);
+    RecordDecl *RD = dyn_cast_or_null<RecordDecl>(D->getDeclContext());
+    Sema &Actions = P.getActions();
+
+    // Allow 'this' within the clause expressions (for instance members).
+    ThisScope = new Sema::CXXThisScopeRAII(Actions, RD, /*TypeQuals=*/0,
+                                           ND && ND->isCXXInstanceMember());
+
+    // If the Decl is templatized, add template parameters to scope.
+    HasTemplateScope = D->isTemplateDecl();
+    TempScope =
+        new Parser::ParseScope(&P, Scope::TemplateParamScope, HasTemplateScope);
+    if (HasTemplateScope)
+      Actions.ActOnReenterTemplateScope(Actions.getCurScope(), D);
+
+    // If the Decl is on a function, add function parameters to the scope.
+    HasFunScope = D->isFunctionOrFunctionTemplate();
+    FnScope = new Parser::ParseScope(&P, Scope::FnScope | Scope::DeclScope,
+                                     HasFunScope);
+    if (HasFunScope)
+      Actions.ActOnReenterFunctionContext(Actions.getCurScope(), D);
+  }
+  ~FNContextRAII() {
+    if (HasFunScope) {
+      P.getActions().ActOnExitFunctionContext();
+      FnScope->Exit(); // Pop scope, and remove Decls from IdResolver
+    }
+    if (HasTemplateScope)
+      TempScope->Exit();
+    delete FnScope; // Release in reverse order of construction.
+    delete TempScope;
+    delete ThisScope;
+  }
+};
+} // namespace
+
+/// Parses clauses for 'declare simd' directive.
+///    clause:
+///      'inbranch' | 'notinbranch'
+///      'simdlen' '(' <expr> ')'
+///      { 'uniform' '(' <argument_list> ')' }
+///      { 'aligned '(' <argument_list> [ ':' <alignment> ] ')' }
+///      { 'linear '(' <argument_list> [ ':' <step> ] ')' }
+static bool parseDeclareSimdClauses(
+    Parser &P, OMPDeclareSimdDeclAttr::BranchStateTy &BS, ExprResult &SimdLen,
+    SmallVectorImpl<Expr *> &Uniforms, SmallVectorImpl<Expr *> &Aligneds,
+    SmallVectorImpl<Expr *> &Alignments, SmallVectorImpl<Expr *> &Linears,
+    SmallVectorImpl<unsigned> &LinModifiers, SmallVectorImpl<Expr *> &Steps) {
+  SourceRange BSRange;
+  const Token &Tok = P.getCurToken();
+  bool IsError = false;
+  while (Tok.isNot(tok::annot_pragma_openmp_end)) {
+    if (Tok.isNot(tok::identifier))
+      break;
+    OMPDeclareSimdDeclAttr::BranchStateTy Out;
+    IdentifierInfo *II = Tok.getIdentifierInfo();
+    StringRef ClauseName = II->getName();
+    // Parse 'inranch|notinbranch' clauses.
+    if (OMPDeclareSimdDeclAttr::ConvertStrToBranchStateTy(ClauseName, Out)) {
+      if (BS != OMPDeclareSimdDeclAttr::BS_Undefined && BS != Out) {
+        P.Diag(Tok, diag::err_omp_declare_simd_inbranch_notinbranch)
+            << ClauseName
+            << OMPDeclareSimdDeclAttr::ConvertBranchStateTyToStr(BS) << BSRange;
+        IsError = true;
+      }
+      BS = Out;
+      BSRange = SourceRange(Tok.getLocation(), Tok.getEndLoc());
+      P.ConsumeToken();
+    } else if (ClauseName.equals("simdlen")) {
+      if (SimdLen.isUsable()) {
+        P.Diag(Tok, diag::err_omp_more_one_clause)
+            << getOpenMPDirectiveName(OMPD_declare_simd) << ClauseName << 0;
+        IsError = true;
+      }
+      P.ConsumeToken();
+      SourceLocation RLoc;
+      SimdLen = P.ParseOpenMPParensExpr(ClauseName, RLoc);
+      if (SimdLen.isInvalid())
+        IsError = true;
+    } else {
+      OpenMPClauseKind CKind = getOpenMPClauseKind(ClauseName);
+      if (CKind == OMPC_uniform || CKind == OMPC_aligned ||
+          CKind == OMPC_linear) {
+        Parser::OpenMPVarListDataTy Data;
+        auto *Vars = &Uniforms;
+        if (CKind == OMPC_aligned)
+          Vars = &Aligneds;
+        else if (CKind == OMPC_linear)
+          Vars = &Linears;
+
+        P.ConsumeToken();
+        if (P.ParseOpenMPVarList(OMPD_declare_simd,
+                                 getOpenMPClauseKind(ClauseName), *Vars, Data))
+          IsError = true;
+        if (CKind == OMPC_aligned)
+          Alignments.append(Aligneds.size() - Alignments.size(), Data.TailExpr);
+        else if (CKind == OMPC_linear) {
+          if (P.getActions().CheckOpenMPLinearModifier(Data.LinKind,
+                                                       Data.DepLinMapLoc))
+            Data.LinKind = OMPC_LINEAR_val;
+          LinModifiers.append(Linears.size() - LinModifiers.size(),
+                              Data.LinKind);
+          Steps.append(Linears.size() - Steps.size(), Data.TailExpr);
+        }
+      } else
+        // TODO: add parsing of other clauses.
+        break;
+    }
+    // Skip ',' if any.
+    if (Tok.is(tok::comma))
+      P.ConsumeToken();
+  }
+  return IsError;
+}
+
+/// Replay the cached 'declare simd' clause tokens and parse them for \p Ptr.
+Parser::DeclGroupPtrTy
+Parser::ParseOMPDeclareSimdClauses(Parser::DeclGroupPtrTy Ptr,
+                                   CachedTokens &Toks, SourceLocation Loc) {
+  PP.EnterToken(Tok);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
+  // Consume the previously pushed token.
+  ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
+
+  FNContextRAII FnContext(*this, Ptr);
+  OMPDeclareSimdDeclAttr::BranchStateTy BS =
+      OMPDeclareSimdDeclAttr::BS_Undefined;
+  ExprResult Simdlen;
+  SmallVector<Expr *, 4> Uniforms;
+  SmallVector<Expr *, 4> Aligneds;
+  SmallVector<Expr *, 4> Alignments;
+  SmallVector<Expr *, 4> Linears;
+  SmallVector<unsigned, 4> LinModifiers;
+  SmallVector<Expr *, 4> Steps;
+  bool IsError =
+      parseDeclareSimdClauses(*this, BS, Simdlen, Uniforms, Aligneds,
+                              Alignments, Linears, LinModifiers, Steps);
+  // Warn about and skip any extra tokens before annot_pragma_openmp_end.
+  if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+    Diag(Tok, diag::warn_omp_extra_tokens_at_eol)
+        << getOpenMPDirectiveName(OMPD_declare_simd);
+    while (Tok.isNot(tok::annot_pragma_openmp_end))
+      ConsumeAnyToken();
+  }
+  // Skip the last annot_pragma_openmp_end.
+  SourceLocation EndLoc = ConsumeToken();
+  if (!IsError) {
+    return Actions.ActOnOpenMPDeclareSimdDirective(
+        Ptr, BS, Simdlen.get(), Uniforms, Aligneds, Alignments, Linears,
+        LinModifiers, Steps, SourceRange(Loc, EndLoc));
+  }
+  return Ptr; // Clause error: hand back the declaration group unchanged.
 }
 
 /// \brief Parsing of declarative OpenMP directives.
 ///
 ///       threadprivate-directive:
 ///         annot_pragma_openmp 'threadprivate' simple-variable-list
+///         annot_pragma_openmp_end
 ///
-Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirective() {
+///       declare-reduction-directive:
+///        annot_pragma_openmp 'declare' 'reduction' [...]
+///        annot_pragma_openmp_end
+///
+///       declare-simd-directive:
+///         annot_pragma_openmp 'declare simd' {<clause> [,]}
+///         annot_pragma_openmp_end
+///         <function declaration/definition>
+///
+Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl(
+    AccessSpecifier &AS, ParsedAttributesWithRange &Attrs,
+    DeclSpec::TST TagType, Decl *Tag) {
   assert(Tok.is(tok::annot_pragma_openmp) && "Not an OpenMP directive!");
   ParenBraceBracketBalancer BalancerRAIIObj(*this);
 
   SourceLocation Loc = ConsumeToken();
-  SmallVector<Expr *, 5> Identifiers;
   auto DKind = ParseOpenMPDirectiveKind(*this);
 
   switch (DKind) {
-  case OMPD_threadprivate:
+  case OMPD_threadprivate: {
     ConsumeToken();
-    if (!ParseOpenMPSimpleVarList(OMPD_threadprivate, Identifiers, true)) {
+    ThreadprivateListParserHelper Helper(this);
+    if (!ParseOpenMPSimpleVarList(OMPD_threadprivate, Helper, true)) {
       // The last seen token is annot_pragma_openmp_end - need to check for
       // extra tokens.
       if (Tok.isNot(tok::annot_pragma_openmp_end)) {
@@ -107,9 +567,140 @@
       }
       // Skip the last annot_pragma_openmp_end.
       ConsumeToken();
-      return Actions.ActOnOpenMPThreadprivateDirective(Loc, Identifiers);
+      return Actions.ActOnOpenMPThreadprivateDirective(Loc,
+                                                       Helper.getIdentifiers());
     }
     break;
+  }
+  case OMPD_declare_reduction:
+    ConsumeToken();
+    if (auto Res = ParseOpenMPDeclareReductionDirective(AS)) {
+      // The last seen token is annot_pragma_openmp_end - need to check for
+      // extra tokens.
+      if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+        Diag(Tok, diag::warn_omp_extra_tokens_at_eol)
+            << getOpenMPDirectiveName(OMPD_declare_reduction);
+        while (Tok.isNot(tok::annot_pragma_openmp_end))
+          ConsumeAnyToken();
+      }
+      // Skip the last annot_pragma_openmp_end.
+      ConsumeToken();
+      return Res;
+    }
+    break;
+  case OMPD_declare_simd: {
+    // The syntax is:
+    // { #pragma omp declare simd }
+    // <function-declaration-or-definition>
+    //
+    ConsumeToken();
+    CachedTokens Toks;
+    while(Tok.isNot(tok::annot_pragma_openmp_end)) {
+      Toks.push_back(Tok);
+      ConsumeAnyToken();
+    }
+    Toks.push_back(Tok);
+    ConsumeAnyToken();
+
+    DeclGroupPtrTy Ptr;
+    if (Tok.is(tok::annot_pragma_openmp))
+      Ptr = ParseOpenMPDeclarativeDirectiveWithExtDecl(AS, Attrs, TagType, Tag);
+    else if (Tok.isNot(tok::r_brace) && !isEofOrEom()) {
+      // Here we expect to see some function declaration.
+      if (AS == AS_none) {
+        assert(TagType == DeclSpec::TST_unspecified);
+        MaybeParseCXX11Attributes(Attrs);
+        MaybeParseMicrosoftAttributes(Attrs);
+        ParsingDeclSpec PDS(*this);
+        Ptr = ParseExternalDeclaration(Attrs, &PDS);
+      } else {
+        Ptr =
+            ParseCXXClassMemberDeclarationWithPragmas(AS, Attrs, TagType, Tag);
+      }
+    }
+    if (!Ptr) {
+      Diag(Loc, diag::err_omp_decl_in_declare_simd);
+      return DeclGroupPtrTy();
+    }
+    return ParseOMPDeclareSimdClauses(Ptr, Toks, Loc);
+  }
+  case OMPD_declare_target: {
+    SourceLocation DTLoc = ConsumeAnyToken();
+    if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+      // OpenMP 4.5 syntax with list of entities.
+      llvm::SmallSetVector<const NamedDecl*, 16> SameDirectiveDecls;
+      while (Tok.isNot(tok::annot_pragma_openmp_end)) {
+        OMPDeclareTargetDeclAttr::MapTypeTy MT =
+            OMPDeclareTargetDeclAttr::MT_To;
+        if (Tok.is(tok::identifier)) {
+          IdentifierInfo *II = Tok.getIdentifierInfo();
+          StringRef ClauseName = II->getName();
+          // Parse 'to|link' clauses.
+          if (!OMPDeclareTargetDeclAttr::ConvertStrToMapTypeTy(ClauseName,
+                                                               MT)) {
+            Diag(Tok, diag::err_omp_declare_target_unexpected_clause)
+                << ClauseName;
+            break;
+          }
+          ConsumeToken();
+        }
+        auto Callback = [this, MT, &SameDirectiveDecls](
+            CXXScopeSpec &SS, DeclarationNameInfo NameInfo) {
+          Actions.ActOnOpenMPDeclareTargetName(getCurScope(), SS, NameInfo, MT,
+                                               SameDirectiveDecls);
+        };
+        if (ParseOpenMPSimpleVarList(OMPD_declare_target, Callback, true))
+          break;
+
+        // Consume optional ','.
+        if (Tok.is(tok::comma))
+          ConsumeToken();
+      }
+      SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
+      ConsumeAnyToken();
+      return DeclGroupPtrTy();
+    }
+
+    // Skip the last annot_pragma_openmp_end.
+    ConsumeAnyToken();
+
+    if (!Actions.ActOnStartOpenMPDeclareTargetDirective(DTLoc))
+      return DeclGroupPtrTy();
+
+    DKind = ParseOpenMPDirectiveKind(*this);
+    while (DKind != OMPD_end_declare_target && DKind != OMPD_declare_target &&
+           Tok.isNot(tok::eof) && Tok.isNot(tok::r_brace)) {
+      ParsedAttributesWithRange attrs(AttrFactory);
+      MaybeParseCXX11Attributes(attrs);
+      MaybeParseMicrosoftAttributes(attrs);
+      ParseExternalDeclaration(attrs);
+      if (Tok.isAnnotation() && Tok.is(tok::annot_pragma_openmp)) {
+        TentativeParsingAction TPA(*this);
+        ConsumeToken();
+        DKind = ParseOpenMPDirectiveKind(*this);
+        if (DKind != OMPD_end_declare_target)
+          TPA.Revert();
+        else
+          TPA.Commit();
+      }
+    }
+
+    if (DKind == OMPD_end_declare_target) {
+      ConsumeAnyToken();
+      if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+        Diag(Tok, diag::warn_omp_extra_tokens_at_eol)
+            << getOpenMPDirectiveName(OMPD_end_declare_target);
+        SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
+      }
+      // Skip the last annot_pragma_openmp_end.
+      ConsumeAnyToken();
+    } else {
+      Diag(Tok, diag::err_expected_end_declare_target);
+      Diag(DTLoc, diag::note_matching) << "'#pragma omp declare target'";
+    }
+    Actions.ActOnFinishOpenMPDeclareTargetDirective();
+    return DeclGroupPtrTy();
+  }
   case OMPD_unknown:
     Diag(Tok, diag::err_omp_unknown_directive);
     break;
@@ -138,14 +729,28 @@
   case OMPD_cancellation_point:
   case OMPD_cancel:
   case OMPD_target_data:
+  case OMPD_target_enter_data:
+  case OMPD_target_exit_data:
+  case OMPD_target_parallel:
+  case OMPD_target_parallel_for:
   case OMPD_taskloop:
   case OMPD_taskloop_simd:
   case OMPD_distribute:
+  case OMPD_end_declare_target:
+  case OMPD_target_update:
+  case OMPD_distribute_parallel_for:
+  case OMPD_distribute_parallel_for_simd:
+  case OMPD_distribute_simd:
+  case OMPD_target_parallel_for_simd:
+  case OMPD_target_simd:
+  case OMPD_teams_distribute:
     Diag(Tok, diag::err_omp_unexpected_directive)
         << getOpenMPDirectiveName(DKind);
     break;
   }
-  SkipUntil(tok::annot_pragma_openmp_end);
+  while (Tok.isNot(tok::annot_pragma_openmp_end))
+    ConsumeAnyToken();
+  ConsumeAnyToken();
   return nullptr;
 }
 
@@ -155,21 +760,31 @@
 ///         annot_pragma_openmp 'threadprivate' simple-variable-list
 ///         annot_pragma_openmp_end
 ///
+///       declare-reduction-directive:
+///         annot_pragma_openmp 'declare' 'reduction' '(' <reduction_id> ':'
+///         <type> {',' <type>} ':' <expression> ')' ['initializer' '('
+///         ('omp_priv' '=' <expression>|<function_call>) ')']
+///         annot_pragma_openmp_end
+///
 ///       executable-directive:
 ///         annot_pragma_openmp 'parallel' | 'simd' | 'for' | 'sections' |
 ///         'section' | 'single' | 'master' | 'critical' [ '(' <name> ')' ] |
 ///         'parallel for' | 'parallel sections' | 'task' | 'taskyield' |
 ///         'barrier' | 'taskwait' | 'flush' | 'ordered' | 'atomic' |
 ///         'for simd' | 'parallel for simd' | 'target' | 'target data' |
-///         'taskgroup' | 'teams' | 'taskloop' | 'taskloop simd' {clause} |
-///         'distribute'
+///         'taskgroup' | 'teams' | 'taskloop' | 'taskloop simd' |
+///         'distribute' | 'target enter data' | 'target exit data' |
+///         'target parallel' | 'target parallel for' |
+///         'target update' | 'distribute parallel for' |
+///         'distribute parallel for simd' | 'distribute simd' |
+///         'target parallel for simd' | 'target simd' |
+///         'teams distribute' {clause}
 ///         annot_pragma_openmp_end
 ///
 StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
     AllowedContsructsKind Allowed) {
   assert(Tok.is(tok::annot_pragma_openmp) && "Not an OpenMP directive!");
   ParenBraceBracketBalancer BalancerRAIIObj(*this);
-  SmallVector<Expr *, 5> Identifiers;
   SmallVector<OMPClause *, 5> Clauses;
   SmallVector<llvm::PointerIntPair<OMPClause *, 1, bool>, OMPC_unknown + 1>
   FirstClauses(OMPC_unknown + 1);
@@ -185,13 +800,14 @@
   bool FlushHasClause = false;
 
   switch (DKind) {
-  case OMPD_threadprivate:
+  case OMPD_threadprivate: {
     if (Allowed != ACK_Any) {
       Diag(Tok, diag::err_omp_immediate_directive)
           << getOpenMPDirectiveName(DKind) << 0;
     }
     ConsumeToken();
-    if (!ParseOpenMPSimpleVarList(OMPD_threadprivate, Identifiers, false)) {
+    ThreadprivateListParserHelper Helper(this);
+    if (!ParseOpenMPSimpleVarList(OMPD_threadprivate, Helper, false)) {
       // The last seen token is annot_pragma_openmp_end - need to check for
       // extra tokens.
       if (Tok.isNot(tok::annot_pragma_openmp_end)) {
@@ -199,12 +815,29 @@
             << getOpenMPDirectiveName(OMPD_threadprivate);
         SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
       }
-      DeclGroupPtrTy Res =
-          Actions.ActOnOpenMPThreadprivateDirective(Loc, Identifiers);
+      DeclGroupPtrTy Res = Actions.ActOnOpenMPThreadprivateDirective(
+          Loc, Helper.getIdentifiers());
       Directive = Actions.ActOnDeclStmt(Res, Loc, Tok.getLocation());
     }
     SkipUntil(tok::annot_pragma_openmp_end);
     break;
+  }
+  case OMPD_declare_reduction:
+    ConsumeToken();
+    if (auto Res = ParseOpenMPDeclareReductionDirective(/*AS=*/AS_none)) {
+      // The last seen token is annot_pragma_openmp_end - need to check for
+      // extra tokens.
+      if (Tok.isNot(tok::annot_pragma_openmp_end)) {
+        Diag(Tok, diag::warn_omp_extra_tokens_at_eol)
+            << getOpenMPDirectiveName(OMPD_declare_reduction);
+        while (Tok.isNot(tok::annot_pragma_openmp_end))
+          ConsumeAnyToken();
+      }
+      ConsumeAnyToken();
+      Directive = Actions.ActOnDeclStmt(Res, Loc, Tok.getLocation());
+    } else
+      SkipUntil(tok::annot_pragma_openmp_end);
+    break;
   case OMPD_flush:
     if (PP.LookAhead(0).is(tok::l_paren)) {
       FlushHasClause = true;
@@ -217,6 +850,9 @@
   case OMPD_taskwait:
   case OMPD_cancellation_point:
   case OMPD_cancel:
+  case OMPD_target_enter_data:
+  case OMPD_target_exit_data:
+  case OMPD_target_update:
     if (Allowed == ACK_StatementsOpenMPNonStandalone) {
       Diag(Tok, diag::err_omp_immediate_directive)
           << getOpenMPDirectiveName(DKind) << 0;
@@ -242,9 +878,17 @@
   case OMPD_teams:
   case OMPD_taskgroup:
   case OMPD_target_data:
+  case OMPD_target_parallel:
+  case OMPD_target_parallel_for:
   case OMPD_taskloop:
   case OMPD_taskloop_simd:
-  case OMPD_distribute: {
+  case OMPD_distribute:
+  case OMPD_distribute_parallel_for:
+  case OMPD_distribute_parallel_for_simd:
+  case OMPD_distribute_simd:
+  case OMPD_target_parallel_for_simd:
+  case OMPD_target_simd:
+  case OMPD_teams_distribute: {
     ConsumeToken();
     // Parse directive name of the 'critical' directive if any.
     if (DKind == OMPD_critical) {
@@ -331,6 +975,13 @@
     OMPDirectiveScope.Exit();
     break;
   }
+  case OMPD_declare_simd:
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
+    Diag(Tok, diag::err_omp_unexpected_directive)
+        << getOpenMPDirectiveName(DKind);
+    SkipUntil(tok::annot_pragma_openmp_end);
+    break;
   case OMPD_unknown:
     Diag(Tok, diag::err_omp_unknown_directive);
     SkipUntil(tok::annot_pragma_openmp_end);
@@ -339,16 +990,15 @@
   return Directive;
 }
 
-/// \brief Parses list of simple variables for '#pragma omp threadprivate'
-/// directive.
-///
-///   simple-variable-list:
-///         '(' id-expression {, id-expression} ')'
-///
-bool Parser::ParseOpenMPSimpleVarList(OpenMPDirectiveKind Kind,
-                                      SmallVectorImpl<Expr *> &VarList,
-                                      bool AllowScopeSpecifier) {
-  VarList.clear();
+// Parses simple list:
+//   simple-variable-list:
+//         '(' id-expression {, id-expression} ')'
+//
+bool Parser::ParseOpenMPSimpleVarList(
+    OpenMPDirectiveKind Kind,
+    const llvm::function_ref<void(CXXScopeSpec &, DeclarationNameInfo)> &
+        Callback,
+    bool AllowScopeSpecifier) {
   // Parse '('.
   BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
   if (T.expectAndConsume(diag::err_expected_lparen_after,
@@ -385,11 +1035,7 @@
           << tok::identifier
           << SourceRange(PrevTok.getLocation(), PrevTokLocation);
     } else {
-      DeclarationNameInfo NameInfo = Actions.GetNameFromUnqualifiedId(Name);
-      ExprResult Res =
-          Actions.ActOnOpenMPIdExpression(getCurScope(), SS, NameInfo);
-      if (Res.isUsable())
-        VarList.push_back(Res.get());
+      Callback(SS, Actions.GetNameFromUnqualifiedId(Name));
     }
     // Consume ','.
     if (Tok.is(tok::comma)) {
@@ -405,7 +1051,7 @@
   // Parse ')'.
   IsCorrect = !T.consumeClose() && IsCorrect;
 
-  return !IsCorrect && VarList.empty();
+  return !IsCorrect;
 }
 
 /// \brief Parsing of OpenMP clauses.
@@ -420,7 +1066,8 @@
 ///       update-clause | capture-clause | seq_cst-clause | device-clause |
 ///       simdlen-clause | threads-clause | simd-clause | num_teams-clause |
 ///       thread_limit-clause | priority-clause | grainsize-clause |
-///       nogroup-clause | num_tasks-clause | hint-clause
+///       nogroup-clause | num_tasks-clause | hint-clause | to-clause |
+///       from-clause | is_device_ptr-clause
 ///
 OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
                                      OpenMPClauseKind CKind, bool FirstClause) {
@@ -495,8 +1142,11 @@
     break;
   case OMPC_schedule:
   case OMPC_dist_schedule:
+  case OMPC_defaultmap:
     // OpenMP [2.7.1, Restrictions, p. 3]
     //  Only one schedule clause can appear on a loop directive.
+    // OpenMP [2.10.4, Restrictions, p. 106]
+    //  At most one defaultmap clause can appear on the directive.
     if (!FirstClause) {
       Diag(Tok, diag::err_omp_more_one_clause)
           << getOpenMPDirectiveName(DKind) << getOpenMPClauseName(CKind) << 0;
@@ -541,6 +1191,10 @@
   case OMPC_flush:
   case OMPC_depend:
   case OMPC_map:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     Clause = ParseOpenMPVarListClause(DKind, CKind);
     break;
   case OMPC_unknown:
@@ -549,6 +1203,7 @@
     SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
     break;
   case OMPC_threadprivate:
+  case OMPC_uniform:
     Diag(Tok, diag::err_omp_unexpected_clause) << getOpenMPClauseName(CKind)
                                                << getOpenMPDirectiveName(DKind);
     SkipUntil(tok::comma, tok::annot_pragma_openmp_end, StopBeforeMatch);
@@ -557,6 +1212,28 @@
   return ErrorFound ? nullptr : Clause;
 }
 
+/// Parses '(' <expr> ')' for single-expression OpenMP clauses. \p ClauseName
+/// is passed to the diagnostic via data() (NOTE(review): assumes NUL-ended).
+/// \param RLoc Returned location of right paren.
+ExprResult Parser::ParseOpenMPParensExpr(StringRef ClauseName,
+                                         SourceLocation &RLoc) {
+  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+  if (T.expectAndConsume(diag::err_expected_lparen_after, ClauseName.data()))
+    return ExprError();
+
+  SourceLocation ELoc = Tok.getLocation();
+  ExprResult LHS(ParseCastExpression(
+      /*isUnaryExpression=*/false, /*isAddressOfOperand=*/false, NotTypeCast));
+  ExprResult Val(ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+  Val = Actions.ActOnFinishFullExpr(Val.get(), ELoc);
+
+  // Parse ')'.
+  T.consumeClose();
+
+  RLoc = T.getCloseLocation();
+  return Val;
+}
+
 /// \brief Parsing of OpenMP clauses with single expressions like 'final',
 /// 'collapse', 'safelen', 'num_threads', 'simdlen', 'num_teams',
 /// 'thread_limit', 'simdlen', 'priority', 'grainsize', 'num_tasks' or 'hint'.
@@ -590,25 +1267,15 @@
 ///
 OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind) {
   SourceLocation Loc = ConsumeToken();
+  SourceLocation LLoc = Tok.getLocation();
+  SourceLocation RLoc;
 
-  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
-  if (T.expectAndConsume(diag::err_expected_lparen_after,
-                         getOpenMPClauseName(Kind)))
-    return nullptr;
-
-  SourceLocation ELoc = Tok.getLocation();
-  ExprResult LHS(ParseCastExpression(false, false, NotTypeCast));
-  ExprResult Val(ParseRHSOfBinaryExpression(LHS, prec::Conditional));
-  Val = Actions.ActOnFinishFullExpr(Val.get(), ELoc);
-
-  // Parse ')'.
-  T.consumeClose();
+  ExprResult Val = ParseOpenMPParensExpr(getOpenMPClauseName(Kind), RLoc);
 
   if (Val.isInvalid())
     return nullptr;
 
-  return Actions.ActOnOpenMPSingleExprClause(
-      Kind, Val.get(), Loc, T.getOpenLocation(), T.getCloseLocation());
+  return Actions.ActOnOpenMPSingleExprClause(Kind, Val.get(), Loc, LLoc, RLoc);
 }
 
 /// \brief Parsing of simple OpenMP clauses like 'default' or 'proc_bind'.
@@ -686,6 +1353,9 @@
 ///    if-clause:
 ///      'if' '(' [ directive-name-modifier ':' ] expression ')'
 ///
+///    defaultmap:
+///      'defaultmap' '(' modifier ':' kind ')'
+///
 OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind) {
   SourceLocation Loc = ConsumeToken();
   SourceLocation DelimLoc;
@@ -754,6 +1424,26 @@
       ConsumeAnyToken();
     if (Arg.back() == OMPC_DIST_SCHEDULE_static && Tok.is(tok::comma))
       DelimLoc = ConsumeAnyToken();
+  } else if (Kind == OMPC_defaultmap) {
+    // Get a defaultmap modifier
+    Arg.push_back(getOpenMPSimpleClauseType(
+        Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok)));
+    KLoc.push_back(Tok.getLocation());
+    if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
+        Tok.isNot(tok::annot_pragma_openmp_end))
+      ConsumeAnyToken();
+    // Parse ':'
+    if (Tok.is(tok::colon))
+      ConsumeAnyToken();
+    else if (Arg.back() != OMPC_DEFAULTMAP_MODIFIER_unknown)
+      Diag(Tok, diag::warn_pragma_expected_colon) << "defaultmap modifier";
+    // Get a defaultmap kind
+    Arg.push_back(getOpenMPSimpleClauseType(
+        Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok)));
+    KLoc.push_back(Tok.getLocation());
+    if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
+        Tok.isNot(tok::annot_pragma_openmp_end))
+      ConsumeAnyToken();
   } else {
     assert(Kind == OMPC_if);
     KLoc.push_back(Tok.getLocation());
@@ -835,6 +1525,210 @@
                               TemplateKWLoc, ReductionId);
 }
 
+/// Parses a clause's '(' list ')' into Vars and Data. Returns true on failure.
+bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
+                                OpenMPClauseKind Kind,
+                                SmallVectorImpl<Expr *> &Vars,
+                                OpenMPVarListDataTy &Data) {
+  UnqualifiedId UnqualifiedReductionId;
+  bool InvalidReductionId = false;
+  bool MapTypeModifierSpecified = false;
+
+  // Parse '('.
+  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+  if (T.expectAndConsume(diag::err_expected_lparen_after,
+                         getOpenMPClauseName(Kind)))
+    return true;
+
+  bool NeedRParenForLinear = false;
+  BalancedDelimiterTracker LinearT(*this, tok::l_paren,
+                                  tok::annot_pragma_openmp_end);
+  // Handle reduction-identifier for reduction clause.
+  if (Kind == OMPC_reduction) {
+    ColonProtectionRAIIObject ColonRAII(*this);
+    if (getLangOpts().CPlusPlus)
+      ParseOptionalCXXScopeSpecifier(Data.ReductionIdScopeSpec,
+                                     /*ObjectType=*/nullptr,
+                                     /*EnteringContext=*/false);
+    InvalidReductionId = ParseReductionId(*this, Data.ReductionIdScopeSpec,
+                                          UnqualifiedReductionId);
+    if (InvalidReductionId) {
+      SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end,
+                StopBeforeMatch);
+    }
+    if (Tok.is(tok::colon))
+      Data.ColonLoc = ConsumeToken();
+    else
+      Diag(Tok, diag::warn_pragma_expected_colon) << "reduction identifier";
+    if (!InvalidReductionId)
+      Data.ReductionId =
+          Actions.GetNameFromUnqualifiedId(UnqualifiedReductionId);
+  } else if (Kind == OMPC_depend) {
+  // Handle dependency type for depend clause.
+    ColonProtectionRAIIObject ColonRAII(*this);
+    Data.DepKind =
+        static_cast<OpenMPDependClauseKind>(getOpenMPSimpleClauseType(
+            Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""));
+    Data.DepLinMapLoc = Tok.getLocation();
+
+    if (Data.DepKind == OMPC_DEPEND_unknown) {
+      SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end,
+                StopBeforeMatch);
+    } else {
+      ConsumeToken();
+      // Special processing for depend(source) clause.
+      if (DKind == OMPD_ordered && Data.DepKind == OMPC_DEPEND_source) {
+        // Parse ')'.
+        T.consumeClose();
+        return false;
+      }
+    }
+    if (Tok.is(tok::colon))
+      Data.ColonLoc = ConsumeToken();
+    else {
+      Diag(Tok, DKind == OMPD_ordered ? diag::warn_pragma_expected_colon_r_paren
+                                      : diag::warn_pragma_expected_colon)
+          << "dependency type";
+    }
+  } else if (Kind == OMPC_linear) {
+    // Try to parse modifier if any.
+    if (Tok.is(tok::identifier) && PP.LookAhead(0).is(tok::l_paren)) {
+      Data.LinKind = static_cast<OpenMPLinearClauseKind>(
+          getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)));
+      Data.DepLinMapLoc = ConsumeToken();
+      LinearT.consumeOpen();
+      NeedRParenForLinear = true;
+    }
+  } else if (Kind == OMPC_map) {
+    // Handle map type for map clause.
+    ColonProtectionRAIIObject ColonRAII(*this);
+
+    /// The map clause modifier token can be either an identifier or the C++
+    /// delete keyword.
+    auto &&IsMapClauseModifierToken = [](const Token &Tok) -> bool {
+      return Tok.isOneOf(tok::identifier, tok::kw_delete);
+    };
+
+    // The first identifier may be a list item, a map-type or a
+    // map-type-modifier. The map modifier can also be delete which has the same
+    // spelling as the C++ delete keyword.
+    Data.MapType =
+        IsMapClauseModifierToken(Tok)
+            ? static_cast<OpenMPMapClauseKind>(
+                  getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)))
+            : OMPC_MAP_unknown;
+    Data.DepLinMapLoc = Tok.getLocation();
+    bool ColonExpected = false; // NOTE(review): never set to true below.
+
+    if (IsMapClauseModifierToken(Tok)) {
+      if (PP.LookAhead(0).is(tok::colon)) {
+        if (Data.MapType == OMPC_MAP_unknown)
+          Diag(Tok, diag::err_omp_unknown_map_type);
+        else if (Data.MapType == OMPC_MAP_always)
+          Diag(Tok, diag::err_omp_map_type_missing);
+        ConsumeToken();
+      } else if (PP.LookAhead(0).is(tok::comma)) {
+        if (IsMapClauseModifierToken(PP.LookAhead(1)) &&
+            PP.LookAhead(2).is(tok::colon)) {
+          Data.MapTypeModifier = Data.MapType;
+          if (Data.MapTypeModifier != OMPC_MAP_always) {
+            Diag(Tok, diag::err_omp_unknown_map_type_modifier);
+            Data.MapTypeModifier = OMPC_MAP_unknown;
+          } else
+            MapTypeModifierSpecified = true;
+
+          ConsumeToken();
+          ConsumeToken();
+
+          Data.MapType =
+              IsMapClauseModifierToken(Tok)
+                  ? static_cast<OpenMPMapClauseKind>(
+                        getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)))
+                  : OMPC_MAP_unknown;
+          if (Data.MapType == OMPC_MAP_unknown ||
+              Data.MapType == OMPC_MAP_always)
+            Diag(Tok, diag::err_omp_unknown_map_type);
+          ConsumeToken();
+        } else {
+          Data.MapType = OMPC_MAP_tofrom;
+          Data.IsMapTypeImplicit = true;
+        }
+      } else {
+        Data.MapType = OMPC_MAP_tofrom;
+        Data.IsMapTypeImplicit = true;
+      }
+    } else {
+      Data.MapType = OMPC_MAP_tofrom;
+      Data.IsMapTypeImplicit = true;
+    }
+
+    if (Tok.is(tok::colon))
+      Data.ColonLoc = ConsumeToken();
+    else if (ColonExpected)
+      Diag(Tok, diag::warn_pragma_expected_colon) << "map type";
+  }
+
+  bool IsComma =
+      (Kind != OMPC_reduction && Kind != OMPC_depend && Kind != OMPC_map) ||
+      (Kind == OMPC_reduction && !InvalidReductionId) ||
+      (Kind == OMPC_map && Data.MapType != OMPC_MAP_unknown &&
+       (!MapTypeModifierSpecified ||
+        Data.MapTypeModifier == OMPC_MAP_always)) ||
+      (Kind == OMPC_depend && Data.DepKind != OMPC_DEPEND_unknown);
+  const bool MayHaveTail = (Kind == OMPC_linear || Kind == OMPC_aligned);
+  while (IsComma || (Tok.isNot(tok::r_paren) && Tok.isNot(tok::colon) &&
+                     Tok.isNot(tok::annot_pragma_openmp_end))) {
+    ColonProtectionRAIIObject ColonRAII(*this, MayHaveTail);
+    // Parse one list item; on failure skip ahead to the next delimiter.
+    ExprResult VarExpr =
+        Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+    if (VarExpr.isUsable())
+      Vars.push_back(VarExpr.get());
+    else {
+      SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
+                StopBeforeMatch);
+    }
+    // Skip ',' if any; otherwise diagnose an unexpected token.
+    IsComma = Tok.is(tok::comma);
+    if (IsComma)
+      ConsumeToken();
+    else if (Tok.isNot(tok::r_paren) &&
+             Tok.isNot(tok::annot_pragma_openmp_end) &&
+             (!MayHaveTail || Tok.isNot(tok::colon)))
+      Diag(Tok, diag::err_omp_expected_punc)
+          << ((Kind == OMPC_flush) ? getOpenMPDirectiveName(OMPD_flush)
+                                   : getOpenMPClauseName(Kind))
+          << (Kind == OMPC_flush);
+  }
+
+  // Parse ')' for linear clause with modifier.
+  if (NeedRParenForLinear)
+    LinearT.consumeClose();
+
+  // Parse ':' linear-step (or ':' alignment).
+  const bool MustHaveTail = MayHaveTail && Tok.is(tok::colon);
+  if (MustHaveTail) {
+    Data.ColonLoc = Tok.getLocation();
+    SourceLocation ELoc = ConsumeToken();
+    ExprResult Tail = ParseAssignmentExpression();
+    Tail = Actions.ActOnFinishFullExpr(Tail.get(), ELoc);
+    if (Tail.isUsable())
+      Data.TailExpr = Tail.get();
+    else
+      SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
+                StopBeforeMatch);
+  }
+
+  // Parse ')'.
+  T.consumeClose();
+  if ((Kind == OMPC_depend && Data.DepKind != OMPC_DEPEND_unknown &&
+       Vars.empty()) ||
+      (Kind != OMPC_depend && Kind != OMPC_map && Vars.empty()) ||
+      (MustHaveTail && !Data.TailExpr) || InvalidReductionId)
+    return true;
+  return false;
+}
+
 /// \brief Parsing of OpenMP clause 'private', 'firstprivate', 'lastprivate',
 /// 'shared', 'copyin', 'copyprivate', 'flush' or 'reduction'.
 ///
@@ -861,6 +1755,14 @@
 ///    map-clause:
 ///       'map' '(' [ [ always , ]
 ///          to | from | tofrom | alloc | release | delete ':' ] list ')';
+///    to-clause:
+///       'to' '(' list ')'
+///    from-clause:
+///       'from' '(' list ')'
+///    use_device_ptr-clause:
+///       'use_device_ptr' '(' list ')'
+///    is_device_ptr-clause:
+///       'is_device_ptr' '(' list ')'
 ///
 /// For 'linear' clause linear-list may have the following forms:
 ///  list
@@ -870,214 +1772,16 @@
                                             OpenMPClauseKind Kind) {
   SourceLocation Loc = Tok.getLocation();
   SourceLocation LOpen = ConsumeToken();
-  SourceLocation ColonLoc = SourceLocation();
-  // Optional scope specifier and unqualified id for reduction identifier.
-  CXXScopeSpec ReductionIdScopeSpec;
-  UnqualifiedId ReductionId;
-  bool InvalidReductionId = false;
-  OpenMPDependClauseKind DepKind = OMPC_DEPEND_unknown;
-  // OpenMP 4.1 [2.15.3.7, linear Clause]
-  //  If no modifier is specified it is assumed to be val.
-  OpenMPLinearClauseKind LinearModifier = OMPC_LINEAR_val;
-  OpenMPMapClauseKind MapType = OMPC_MAP_unknown;
-  OpenMPMapClauseKind MapTypeModifier = OMPC_MAP_unknown;
-  bool MapTypeModifierSpecified = false;
-  bool UnexpectedId = false;
-  SourceLocation DepLinMapLoc;
+  SmallVector<Expr *, 4> Vars;
+  OpenMPVarListDataTy Data;
 
-  // Parse '('.
-  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
-  if (T.expectAndConsume(diag::err_expected_lparen_after,
-                         getOpenMPClauseName(Kind)))
+  if (ParseOpenMPVarList(DKind, Kind, Vars, Data))
     return nullptr;
 
-  bool NeedRParenForLinear = false;
-  BalancedDelimiterTracker LinearT(*this, tok::l_paren,
-                                  tok::annot_pragma_openmp_end);
-  // Handle reduction-identifier for reduction clause.
-  if (Kind == OMPC_reduction) {
-    ColonProtectionRAIIObject ColonRAII(*this);
-    if (getLangOpts().CPlusPlus) {
-      ParseOptionalCXXScopeSpecifier(ReductionIdScopeSpec, nullptr, false);
-    }
-    InvalidReductionId =
-        ParseReductionId(*this, ReductionIdScopeSpec, ReductionId);
-    if (InvalidReductionId) {
-      SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end,
-                StopBeforeMatch);
-    }
-    if (Tok.is(tok::colon)) {
-      ColonLoc = ConsumeToken();
-    } else {
-      Diag(Tok, diag::warn_pragma_expected_colon) << "reduction identifier";
-    }
-  } else if (Kind == OMPC_depend) {
-  // Handle dependency type for depend clause.
-    ColonProtectionRAIIObject ColonRAII(*this);
-    DepKind = static_cast<OpenMPDependClauseKind>(getOpenMPSimpleClauseType(
-        Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""));
-    DepLinMapLoc = Tok.getLocation();
-
-    if (DepKind == OMPC_DEPEND_unknown) {
-      SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end,
-                StopBeforeMatch);
-    } else {
-      ConsumeToken();
-      // Special processing for depend(source) clause.
-      if (DKind == OMPD_ordered && DepKind == OMPC_DEPEND_source) {
-        // Parse ')'.
-        T.consumeClose();
-        return Actions.ActOnOpenMPVarListClause(
-            Kind, llvm::None, /*TailExpr=*/nullptr, Loc, LOpen,
-            /*ColonLoc=*/SourceLocation(), Tok.getLocation(),
-            ReductionIdScopeSpec, DeclarationNameInfo(), DepKind,
-            LinearModifier, MapTypeModifier, MapType, DepLinMapLoc);
-      }
-    }
-    if (Tok.is(tok::colon)) {
-      ColonLoc = ConsumeToken();
-    } else {
-      Diag(Tok, DKind == OMPD_ordered ? diag::warn_pragma_expected_colon_r_paren
-                                      : diag::warn_pragma_expected_colon)
-          << "dependency type";
-    }
-  } else if (Kind == OMPC_linear) {
-    // Try to parse modifier if any.
-    if (Tok.is(tok::identifier) && PP.LookAhead(0).is(tok::l_paren)) {
-      LinearModifier = static_cast<OpenMPLinearClauseKind>(
-          getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)));
-      DepLinMapLoc = ConsumeToken();
-      LinearT.consumeOpen();
-      NeedRParenForLinear = true;
-    }
-  } else if (Kind == OMPC_map) {
-    // Handle map type for map clause.
-    ColonProtectionRAIIObject ColonRAII(*this);
-
-    // the first identifier may be a list item, a map-type or
-    //   a map-type-modifier
-    MapType = static_cast<OpenMPMapClauseKind>(getOpenMPSimpleClauseType(
-        Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""));
-    DepLinMapLoc = Tok.getLocation();
-    bool ColonExpected = false;
-
-    if (Tok.is(tok::identifier)) {
-      if (PP.LookAhead(0).is(tok::colon)) {
-        MapType = static_cast<OpenMPMapClauseKind>(getOpenMPSimpleClauseType(
-            Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""));
-        if (MapType == OMPC_MAP_unknown) {
-          Diag(Tok, diag::err_omp_unknown_map_type);
-        } else if (MapType == OMPC_MAP_always) {
-          Diag(Tok, diag::err_omp_map_type_missing);
-        }
-        ConsumeToken();
-      } else if (PP.LookAhead(0).is(tok::comma)) {
-        if (PP.LookAhead(1).is(tok::identifier) &&
-            PP.LookAhead(2).is(tok::colon)) {
-          MapTypeModifier =
-              static_cast<OpenMPMapClauseKind>(getOpenMPSimpleClauseType(
-                   Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""));
-          if (MapTypeModifier != OMPC_MAP_always) {
-            Diag(Tok, diag::err_omp_unknown_map_type_modifier);
-            MapTypeModifier = OMPC_MAP_unknown;
-          } else {
-            MapTypeModifierSpecified = true;
-          }
-
-          ConsumeToken();
-          ConsumeToken();
-
-          MapType = static_cast<OpenMPMapClauseKind>(getOpenMPSimpleClauseType(
-              Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""));
-          if (MapType == OMPC_MAP_unknown || MapType == OMPC_MAP_always) {
-            Diag(Tok, diag::err_omp_unknown_map_type);
-          }
-          ConsumeToken();
-        } else {
-          MapType = OMPC_MAP_tofrom;
-        }
-      } else {
-        MapType = OMPC_MAP_tofrom;
-      }
-    } else {
-      UnexpectedId = true;
-    }
-
-    if (Tok.is(tok::colon)) {
-      ColonLoc = ConsumeToken();
-    } else if (ColonExpected) {
-      Diag(Tok, diag::warn_pragma_expected_colon) << "map type";
-    }
-  }
-
-  SmallVector<Expr *, 5> Vars;
-  bool IsComma =
-      ((Kind != OMPC_reduction) && (Kind != OMPC_depend) &&
-       (Kind != OMPC_map)) ||
-      ((Kind == OMPC_reduction) && !InvalidReductionId) ||
-      ((Kind == OMPC_map) && (UnexpectedId || MapType != OMPC_MAP_unknown) &&
-       (!MapTypeModifierSpecified ||
-        (MapTypeModifierSpecified && MapTypeModifier == OMPC_MAP_always))) ||
-      ((Kind == OMPC_depend) && DepKind != OMPC_DEPEND_unknown);
-  const bool MayHaveTail = (Kind == OMPC_linear || Kind == OMPC_aligned);
-  while (IsComma || (Tok.isNot(tok::r_paren) && Tok.isNot(tok::colon) &&
-                     Tok.isNot(tok::annot_pragma_openmp_end))) {
-    ColonProtectionRAIIObject ColonRAII(*this, MayHaveTail);
-    // Parse variable
-    ExprResult VarExpr =
-        Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
-    if (VarExpr.isUsable()) {
-      Vars.push_back(VarExpr.get());
-    } else {
-      SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
-                StopBeforeMatch);
-    }
-    // Skip ',' if any
-    IsComma = Tok.is(tok::comma);
-    if (IsComma)
-      ConsumeToken();
-    else if (Tok.isNot(tok::r_paren) &&
-             Tok.isNot(tok::annot_pragma_openmp_end) &&
-             (!MayHaveTail || Tok.isNot(tok::colon)))
-      Diag(Tok, diag::err_omp_expected_punc)
-          << ((Kind == OMPC_flush) ? getOpenMPDirectiveName(OMPD_flush)
-                                   : getOpenMPClauseName(Kind))
-          << (Kind == OMPC_flush);
-  }
-
-  // Parse ')' for linear clause with modifier.
-  if (NeedRParenForLinear)
-    LinearT.consumeClose();
-
-  // Parse ':' linear-step (or ':' alignment).
-  Expr *TailExpr = nullptr;
-  const bool MustHaveTail = MayHaveTail && Tok.is(tok::colon);
-  if (MustHaveTail) {
-    ColonLoc = Tok.getLocation();
-    SourceLocation ELoc = ConsumeToken();
-    ExprResult Tail = ParseAssignmentExpression();
-    Tail = Actions.ActOnFinishFullExpr(Tail.get(), ELoc);
-    if (Tail.isUsable())
-      TailExpr = Tail.get();
-    else
-      SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
-                StopBeforeMatch);
-  }
-
-  // Parse ')'.
-  T.consumeClose();
-  if ((Kind == OMPC_depend && DepKind != OMPC_DEPEND_unknown && Vars.empty()) ||
-      (Kind != OMPC_depend && Vars.empty()) || (MustHaveTail && !TailExpr) ||
-      (Kind == OMPC_map && MapType == OMPC_MAP_unknown) ||
-      InvalidReductionId) {
-    return nullptr;
-  }
-
   return Actions.ActOnOpenMPVarListClause(
-      Kind, Vars, TailExpr, Loc, LOpen, ColonLoc, Tok.getLocation(),
-      ReductionIdScopeSpec,
-      ReductionId.isValid() ? Actions.GetNameFromUnqualifiedId(ReductionId)
-                            : DeclarationNameInfo(),
-      DepKind, LinearModifier, MapTypeModifier, MapType, DepLinMapLoc);
+      Kind, Vars, Data.TailExpr, Loc, LOpen, Data.ColonLoc, Tok.getLocation(),
+      Data.ReductionIdScopeSpec, Data.ReductionId, Data.DepKind, Data.LinKind,
+      Data.MapTypeModifier, Data.MapType, Data.IsMapTypeImplicit,
+      Data.DepLinMapLoc);
 }
 
diff --git a/lib/Parse/ParsePragma.cpp b/lib/Parse/ParsePragma.cpp
index 5b4f935..bff5d11 100644
--- a/lib/Parse/ParsePragma.cpp
+++ b/lib/Parse/ParsePragma.cpp
@@ -13,6 +13,7 @@
 
 #include "RAIIObjectsForParser.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/Basic/PragmaKinds.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Parse/ParseDiagnostic.h"
@@ -336,11 +337,9 @@
 
 namespace {
 struct PragmaPackInfo {
-  Sema::PragmaPackKind Kind;
-  IdentifierInfo *Name;
+  Sema::PragmaMsStackAction Action;
+  StringRef SlotLabel;
   Token Alignment;
-  SourceLocation LParenLoc;
-  SourceLocation RParenLoc;
 };
 } // end anonymous namespace
 
@@ -355,15 +354,14 @@
     if (Alignment.isInvalid())
       return;
   }
-  Actions.ActOnPragmaPack(Info->Kind, Info->Name, Alignment.get(), PragmaLoc,
-                          Info->LParenLoc, Info->RParenLoc);
+  Actions.ActOnPragmaPack(PragmaLoc, Info->Action, Info->SlotLabel,
+                          Alignment.get());
 }
 
 void Parser::HandlePragmaMSStruct() {
   assert(Tok.is(tok::annot_pragma_msstruct));
-  Sema::PragmaMSStructKind Kind =
-    static_cast<Sema::PragmaMSStructKind>(
-    reinterpret_cast<uintptr_t>(Tok.getAnnotationValue()));
+  PragmaMSStructKind Kind = static_cast<PragmaMSStructKind>(
+      reinterpret_cast<uintptr_t>(Tok.getAnnotationValue()));
   Actions.ActOnPragmaMSStruct(Kind);
   ConsumeToken(); // The annotation token.
 }
@@ -470,14 +468,24 @@
   ConsumeToken(); // The annotation token.
 
   OpenCLOptions &f = Actions.getOpenCLOptions();
+  auto CLVer = getLangOpts().OpenCLVersion;
+  auto &Supp = getTargetInfo().getSupportedOpenCLOpts();
   // OpenCL 1.1 9.1: "The all variant sets the behavior for all extensions,
   // overriding all previously issued extension directives, but only if the
   // behavior is set to disable."
   if (state == 0 && ename->isStr("all")) {
-#define OPENCLEXT(nm)   f.nm = 0;
+#define OPENCLEXT(nm) \
+    if (Supp.is_##nm##_supported_extension(CLVer)) \
+      f.nm = 0;
 #include "clang/Basic/OpenCLExtensions.def"
   }
-#define OPENCLEXT(nm) else if (ename->isStr(#nm)) { f.nm = state; }
+#define OPENCLEXT(nm) else if (ename->isStr(#nm)) \
+   if (Supp.is_##nm##_supported_extension(CLVer)) \
+     f.nm = state; \
+   else if (Supp.is_##nm##_supported_core(CLVer)) \
+     PP.Diag(NameLoc, diag::warn_pragma_extension_is_core) << ename; \
+   else \
+     PP.Diag(NameLoc, diag::warn_pragma_unsupported_extension) << ename;
 #include "clang/Basic/OpenCLExtensions.def"
   else {
     PP.Diag(NameLoc, diag::warn_pragma_unknown_extension) << ename;
@@ -497,18 +505,19 @@
 void Parser::HandlePragmaMSVtorDisp() {
   assert(Tok.is(tok::annot_pragma_ms_vtordisp));
   uintptr_t Value = reinterpret_cast<uintptr_t>(Tok.getAnnotationValue());
-  Sema::PragmaVtorDispKind Kind =
-      static_cast<Sema::PragmaVtorDispKind>((Value >> 16) & 0xFFFF);
+  Sema::PragmaMsStackAction Action =
+      static_cast<Sema::PragmaMsStackAction>((Value >> 16) & 0xFFFF);
   MSVtorDispAttr::Mode Mode = MSVtorDispAttr::Mode(Value & 0xFFFF);
   SourceLocation PragmaLoc = ConsumeToken(); // The annotation token.
-  Actions.ActOnPragmaMSVtorDisp(Kind, PragmaLoc, Mode);
+  Actions.ActOnPragmaMSVtorDisp(Action, PragmaLoc, Mode);
 }
 
 void Parser::HandlePragmaMSPragma() {
   assert(Tok.is(tok::annot_pragma_ms_pragma));
   // Grab the tokens out of the annotation and enter them into the stream.
-  auto TheTokens = (std::pair<Token*, size_t> *)Tok.getAnnotationValue();
-  PP.EnterTokenStream(TheTokens->first, TheTokens->second, true, true);
+  auto TheTokens =
+      (std::pair<std::unique_ptr<Token[]>, size_t> *)Tok.getAnnotationValue();
+  PP.EnterTokenStream(std::move(TheTokens->first), TheTokens->second, true);
   SourceLocation PragmaLocation = ConsumeToken(); // The annotation token.
   assert(Tok.isAnyIdentifier());
   StringRef PragmaName = Tok.getIdentifierInfo()->getName();
@@ -798,14 +807,13 @@
   Hint.OptionLoc = IdentifierLoc::create(
       Actions.Context, Info->Option.getLocation(), OptionInfo);
 
-  const Token *Toks = Info->Toks.data();
-  size_t TokSize = Info->Toks.size();
+  llvm::ArrayRef<Token> Toks = Info->Toks;
 
   // Return a valid hint if pragma unroll or nounroll were specified
   // without an argument.
   bool PragmaUnroll = PragmaNameInfo->getName() == "unroll";
   bool PragmaNoUnroll = PragmaNameInfo->getName() == "nounroll";
-  if (TokSize == 0 && (PragmaUnroll || PragmaNoUnroll)) {
+  if (Toks.empty() && (PragmaUnroll || PragmaNoUnroll)) {
     ConsumeToken(); // The annotation token.
     Hint.Range = Info->PragmaName.getLocation();
     return true;
@@ -813,25 +821,30 @@
 
   // The constant expression is always followed by an eof token, which increases
   // the TokSize by 1.
-  assert(TokSize > 0 &&
+  assert(!Toks.empty() &&
          "PragmaLoopHintInfo::Toks must contain at least one token.");
 
   // If no option is specified the argument is assumed to be a constant expr.
   bool OptionUnroll = false;
+  bool OptionDistribute = false;
   bool StateOption = false;
   if (OptionInfo) { // Pragma Unroll does not specify an option.
     OptionUnroll = OptionInfo->isStr("unroll");
+    OptionDistribute = OptionInfo->isStr("distribute");
     StateOption = llvm::StringSwitch<bool>(OptionInfo->getName())
                       .Case("vectorize", true)
                       .Case("interleave", true)
-                      .Default(false) || OptionUnroll;
+                      .Default(false) ||
+                  OptionUnroll || OptionDistribute;
   }
 
+  bool AssumeSafetyArg = !OptionUnroll && !OptionDistribute;
   // Verify loop hint has an argument.
   if (Toks[0].is(tok::eof)) {
     ConsumeToken(); // The annotation token.
     Diag(Toks[0].getLocation(), diag::err_pragma_loop_missing_argument)
-        << /*StateArgument=*/StateOption << /*FullKeyword=*/OptionUnroll;
+        << /*StateArgument=*/StateOption << /*FullKeyword=*/OptionUnroll
+        << /*AssumeSafetyKeyword=*/AssumeSafetyArg;
     return false;
   }
 
@@ -845,21 +858,21 @@
                  llvm::StringSwitch<bool>(StateInfo->getName())
                      .Cases("enable", "disable", true)
                      .Case("full", OptionUnroll)
-                     .Case("assume_safety", !OptionUnroll)
+                     .Case("assume_safety", AssumeSafetyArg)
                      .Default(false);
     if (!Valid) {
       Diag(Toks[0].getLocation(), diag::err_pragma_invalid_keyword)
-          << /*FullKeyword=*/OptionUnroll;
+          << /*FullKeyword=*/OptionUnroll
+          << /*AssumeSafetyKeyword=*/AssumeSafetyArg;
       return false;
     }
-    if (TokSize > 2)
+    if (Toks.size() > 2)
       Diag(Tok.getLocation(), diag::warn_pragma_extra_tokens_at_eol)
           << PragmaLoopHintString(Info->PragmaName, Info->Option);
     Hint.StateLoc = IdentifierLoc::create(Actions.Context, StateLoc, StateInfo);
   } else {
     // Enter constant expression including eof terminator into token stream.
-    PP.EnterTokenStream(Toks, TokSize, /*DisableMacroExpansion=*/false,
-                        /*OwnsTokens=*/false);
+    PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/false);
     ConsumeToken(); // The annotation token.
 
     ExprResult R = ParseConstantExpression();
@@ -884,7 +897,7 @@
   }
 
   Hint.Range = SourceRange(Info->PragmaName.getLocation(),
-                           Info->Toks[TokSize - 1].getLocation());
+                           Info->Toks.back().getLocation());
   return true;
 }
 
@@ -937,15 +950,14 @@
     return;
   }
 
-  Token *Toks = new Token[1];
+  auto Toks = llvm::make_unique<Token[]>(1);
   Toks[0].startToken();
   Toks[0].setKind(tok::annot_pragma_vis);
   Toks[0].setLocation(VisLoc);
   Toks[0].setAnnotationEndLoc(EndLoc);
   Toks[0].setAnnotationValue(
                           const_cast<void*>(static_cast<const void*>(VisType)));
-  PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                      /*OwnsTokens=*/true);
+  PP.EnterTokenStream(std::move(Toks), 1, /*DisableMacroExpansion=*/true);
 }
 
 // #pragma pack(...) comes in the following delicious flavors:
@@ -964,11 +976,10 @@
     return;
   }
 
-  Sema::PragmaPackKind Kind = Sema::PPK_Default;
-  IdentifierInfo *Name = nullptr;
+  Sema::PragmaMsStackAction Action = Sema::PSK_Reset;
+  StringRef SlotLabel;
   Token Alignment;
   Alignment.startToken();
-  SourceLocation LParenLoc = Tok.getLocation();
   PP.Lex(Tok);
   if (Tok.is(tok::numeric_constant)) {
     Alignment = Tok;
@@ -978,18 +989,18 @@
     // In MSVC/gcc, #pragma pack(4) sets the alignment without affecting
     // the push/pop stack.
     // In Apple gcc, #pragma pack(4) is equivalent to #pragma pack(push, 4)
-    if (PP.getLangOpts().ApplePragmaPack)
-      Kind = Sema::PPK_Push;
+    Action =
+        PP.getLangOpts().ApplePragmaPack ? Sema::PSK_Push_Set : Sema::PSK_Set;
   } else if (Tok.is(tok::identifier)) {
     const IdentifierInfo *II = Tok.getIdentifierInfo();
     if (II->isStr("show")) {
-      Kind = Sema::PPK_Show;
+      Action = Sema::PSK_Show;
       PP.Lex(Tok);
     } else {
       if (II->isStr("push")) {
-        Kind = Sema::PPK_Push;
+        Action = Sema::PSK_Push;
       } else if (II->isStr("pop")) {
-        Kind = Sema::PPK_Pop;
+        Action = Sema::PSK_Pop;
       } else {
         PP.Diag(Tok.getLocation(), diag::warn_pragma_invalid_action) << "pack";
         return;
@@ -1000,11 +1011,12 @@
         PP.Lex(Tok);
 
         if (Tok.is(tok::numeric_constant)) {
+          Action = (Sema::PragmaMsStackAction)(Action | Sema::PSK_Set);
           Alignment = Tok;
 
           PP.Lex(Tok);
         } else if (Tok.is(tok::identifier)) {
-          Name = Tok.getIdentifierInfo();
+          SlotLabel = Tok.getIdentifierInfo()->getName();
           PP.Lex(Tok);
 
           if (Tok.is(tok::comma)) {
@@ -1015,6 +1027,7 @@
               return;
             }
 
+            Action = (Sema::PragmaMsStackAction)(Action | Sema::PSK_Set);
             Alignment = Tok;
 
             PP.Lex(Tok);
@@ -1029,7 +1042,7 @@
     // In MSVC/gcc, #pragma pack() resets the alignment without affecting
     // the push/pop stack.
     // In Apple gcc #pragma pack() is equivalent to #pragma pack(pop).
-    Kind = Sema::PPK_Pop;
+    Action = Sema::PSK_Pop;
   }
 
   if (Tok.isNot(tok::r_paren)) {
@@ -1044,27 +1057,20 @@
     return;
   }
 
-  PragmaPackInfo *Info = 
-    (PragmaPackInfo*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(PragmaPackInfo), llvm::alignOf<PragmaPackInfo>());
-  new (Info) PragmaPackInfo();
-  Info->Kind = Kind;
-  Info->Name = Name;
+  PragmaPackInfo *Info =
+      PP.getPreprocessorAllocator().Allocate<PragmaPackInfo>(1);
+  Info->Action = Action;
+  Info->SlotLabel = SlotLabel;
   Info->Alignment = Alignment;
-  Info->LParenLoc = LParenLoc;
-  Info->RParenLoc = RParenLoc;
 
-  Token *Toks = 
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 1, llvm::alignOf<Token>());
-  new (Toks) Token();
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(1),
+                              1);
   Toks[0].startToken();
   Toks[0].setKind(tok::annot_pragma_pack);
   Toks[0].setLocation(PackLoc);
   Toks[0].setAnnotationEndLoc(RParenLoc);
   Toks[0].setAnnotationValue(static_cast<void*>(Info));
-  PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                      /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 }
 
 // #pragma ms_struct on
@@ -1072,8 +1078,8 @@
 void PragmaMSStructHandler::HandlePragma(Preprocessor &PP, 
                                          PragmaIntroducerKind Introducer,
                                          Token &MSStructTok) {
-  Sema::PragmaMSStructKind Kind = Sema::PMSST_OFF;
-  
+  PragmaMSStructKind Kind = PMSST_OFF;
+
   Token Tok;
   PP.Lex(Tok);
   if (Tok.isNot(tok::identifier)) {
@@ -1083,7 +1089,7 @@
   SourceLocation EndLoc = Tok.getLocation();
   const IdentifierInfo *II = Tok.getIdentifierInfo();
   if (II->isStr("on")) {
-    Kind = Sema::PMSST_ON;
+    Kind = PMSST_ON;
     PP.Lex(Tok);
   }
   else if (II->isStr("off") || II->isStr("reset"))
@@ -1099,18 +1105,15 @@
     return;
   }
 
-  Token *Toks =
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 1, llvm::alignOf<Token>());
-  new (Toks) Token();
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(1),
+                              1);
   Toks[0].startToken();
   Toks[0].setKind(tok::annot_pragma_msstruct);
   Toks[0].setLocation(MSStructTok.getLocation());
   Toks[0].setAnnotationEndLoc(EndLoc);
   Toks[0].setAnnotationValue(reinterpret_cast<void*>(
                              static_cast<uintptr_t>(Kind)));
-  PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                      /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 }
 
 // #pragma 'align' '=' {'native','natural','mac68k','power','reset'}
@@ -1170,18 +1173,15 @@
     return;
   }
 
-  Token *Toks =
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 1, llvm::alignOf<Token>());
-  new (Toks) Token();
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(1),
+                              1);
   Toks[0].startToken();
   Toks[0].setKind(tok::annot_pragma_align);
   Toks[0].setLocation(FirstTok.getLocation());
   Toks[0].setAnnotationEndLoc(EndLoc);
   Toks[0].setAnnotationValue(reinterpret_cast<void*>(
                              static_cast<uintptr_t>(Kind)));
-  PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                      /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 }
 
 void PragmaAlignHandler::HandlePragma(Preprocessor &PP, 
@@ -1263,9 +1263,9 @@
   // This allows us to cache a "#pragma unused" that occurs inside an inline
   // C++ member function.
 
-  Token *Toks = 
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 2 * Identifiers.size(), llvm::alignOf<Token>());
+  MutableArrayRef<Token> Toks(
+      PP.getPreprocessorAllocator().Allocate<Token>(2 * Identifiers.size()),
+      2 * Identifiers.size());
   for (unsigned i=0; i != Identifiers.size(); i++) {
     Token &pragmaUnusedTok = Toks[2*i], &idTok = Toks[2*i+1];
     pragmaUnusedTok.startToken();
@@ -1273,8 +1273,7 @@
     pragmaUnusedTok.setLocation(UnusedLoc);
     idTok = Identifiers[i];
   }
-  PP.EnterTokenStream(Toks, 2*Identifiers.size(),
-                      /*DisableMacroExpansion=*/true, /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 }
 
 // #pragma weak identifier
@@ -1314,9 +1313,8 @@
   }
 
   if (HasAlias) {
-    Token *Toks = 
-      (Token*) PP.getPreprocessorAllocator().Allocate(
-        sizeof(Token) * 3, llvm::alignOf<Token>());
+    MutableArrayRef<Token> Toks(
+        PP.getPreprocessorAllocator().Allocate<Token>(3), 3);
     Token &pragmaUnusedTok = Toks[0];
     pragmaUnusedTok.startToken();
     pragmaUnusedTok.setKind(tok::annot_pragma_weakalias);
@@ -1324,20 +1322,17 @@
     pragmaUnusedTok.setAnnotationEndLoc(AliasName.getLocation());
     Toks[1] = WeakName;
     Toks[2] = AliasName;
-    PP.EnterTokenStream(Toks, 3,
-                        /*DisableMacroExpansion=*/true, /*OwnsTokens=*/false);
+    PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
   } else {
-    Token *Toks = 
-      (Token*) PP.getPreprocessorAllocator().Allocate(
-        sizeof(Token) * 2, llvm::alignOf<Token>());
+    MutableArrayRef<Token> Toks(
+        PP.getPreprocessorAllocator().Allocate<Token>(2), 2);
     Token &pragmaUnusedTok = Toks[0];
     pragmaUnusedTok.startToken();
     pragmaUnusedTok.setKind(tok::annot_pragma_weak);
     pragmaUnusedTok.setLocation(WeakLoc);
     pragmaUnusedTok.setAnnotationEndLoc(WeakLoc);
     Toks[1] = WeakName;
-    PP.EnterTokenStream(Toks, 2,
-                        /*DisableMacroExpansion=*/true, /*OwnsTokens=*/false);
+    PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
   }
 }
 
@@ -1373,9 +1368,8 @@
     return;
   }
 
-  Token *Toks = 
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 3, llvm::alignOf<Token>());
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(3),
+                              3);
   Token &pragmaRedefTok = Toks[0];
   pragmaRedefTok.startToken();
   pragmaRedefTok.setKind(tok::annot_pragma_redefine_extname);
@@ -1383,8 +1377,7 @@
   pragmaRedefTok.setAnnotationEndLoc(AliasName.getLocation());
   Toks[1] = RedefName;
   Toks[2] = AliasName;
-  PP.EnterTokenStream(Toks, 3,
-                      /*DisableMacroExpansion=*/true, /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 }
 
 
@@ -1396,18 +1389,15 @@
   if (PP.LexOnOffSwitch(OOS))
     return;
 
-  Token *Toks =
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 1, llvm::alignOf<Token>());
-  new (Toks) Token();
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(1),
+                              1);
   Toks[0].startToken();
   Toks[0].setKind(tok::annot_pragma_fp_contract);
   Toks[0].setLocation(Tok.getLocation());
   Toks[0].setAnnotationEndLoc(Tok.getLocation());
   Toks[0].setAnnotationValue(reinterpret_cast<void*>(
                              static_cast<uintptr_t>(OOS)));
-  PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                      /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 }
 
 void 
@@ -1455,17 +1445,14 @@
   }
 
   OpenCLExtData data(ename, state);
-  Token *Toks =
-    (Token*) PP.getPreprocessorAllocator().Allocate(
-      sizeof(Token) * 1, llvm::alignOf<Token>());
-  new (Toks) Token();
+  MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(1),
+                              1);
   Toks[0].startToken();
   Toks[0].setKind(tok::annot_pragma_opencl_extension);
   Toks[0].setLocation(NameLoc);
   Toks[0].setAnnotationValue(data.getOpaqueValue());
   Toks[0].setAnnotationEndLoc(StateLoc);
-  PP.EnterTokenStream(Toks, 1, /*DisableMacroExpansion=*/true,
-                      /*OwnsTokens=*/false);
+  PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true);
 
   if (PP.getPPCallbacks())
     PP.getPPCallbacks()->PragmaOpenCLExtension(NameLoc, ename, 
@@ -1509,10 +1496,10 @@
   Tok.setLocation(EodLoc);
   Pragma.push_back(Tok);
 
-  Token *Toks = new Token[Pragma.size()];
-  std::copy(Pragma.begin(), Pragma.end(), Toks);
-  PP.EnterTokenStream(Toks, Pragma.size(),
-                      /*DisableMacroExpansion=*/false, /*OwnsTokens=*/true);
+  auto Toks = llvm::make_unique<Token[]>(Pragma.size());
+  std::copy(Pragma.begin(), Pragma.end(), Toks.get());
+  PP.EnterTokenStream(std::move(Toks), Pragma.size(),
+                      /*DisableMacroExpansion=*/false);
 }
 
 /// \brief Handle '#pragma pointers_to_members'
@@ -1632,7 +1619,7 @@
   }
   PP.Lex(Tok);
 
-  Sema::PragmaVtorDispKind Kind = Sema::PVDK_Set;
+  Sema::PragmaMsStackAction Action = Sema::PSK_Set;
   const IdentifierInfo *II = Tok.getIdentifierInfo();
   if (II) {
     if (II->isStr("push")) {
@@ -1643,24 +1630,24 @@
         return;
       }
       PP.Lex(Tok);
-      Kind = Sema::PVDK_Push;
+      Action = Sema::PSK_Push_Set;
       // not push, could be on/off
     } else if (II->isStr("pop")) {
       // #pragma vtordisp(pop)
       PP.Lex(Tok);
-      Kind = Sema::PVDK_Pop;
+      Action = Sema::PSK_Pop;
     }
     // not push or pop, could be on/off
   } else {
     if (Tok.is(tok::r_paren)) {
       // #pragma vtordisp()
-      Kind = Sema::PVDK_Reset;
+      Action = Sema::PSK_Reset;
     }
   }
 
 
   uint64_t Value = 0;
-  if (Kind == Sema::PVDK_Push || Kind == Sema::PVDK_Set) {
+  if (Action & Sema::PSK_Push || Action & Sema::PSK_Set) {
     const IdentifierInfo *II = Tok.getIdentifierInfo();
     if (II && II->isStr("off")) {
       PP.Lex(Tok);
@@ -1702,7 +1689,7 @@
   AnnotTok.setLocation(VtorDispLoc);
   AnnotTok.setAnnotationEndLoc(EndLoc);
   AnnotTok.setAnnotationValue(reinterpret_cast<void *>(
-      static_cast<uintptr_t>((Kind << 16) | (Value & 0xFFFF))));
+      static_cast<uintptr_t>((Action << 16) | (Value & 0xFFFF))));
   PP.EnterToken(AnnotTok);
 }
 
@@ -1728,10 +1715,11 @@
   TokenVector.push_back(EoF);
   // We must allocate this array with new because EnterTokenStream is going to
   // delete it later.
-  Token *TokenArray = new Token[TokenVector.size()];
-  std::copy(TokenVector.begin(), TokenVector.end(), TokenArray);
+  auto TokenArray = llvm::make_unique<Token[]>(TokenVector.size());
+  std::copy(TokenVector.begin(), TokenVector.end(), TokenArray.get());
   auto Value = new (PP.getPreprocessorAllocator())
-      std::pair<Token*, size_t>(std::make_pair(TokenArray, TokenVector.size()));
+      std::pair<std::unique_ptr<Token[]>, size_t>(std::move(TokenArray),
+                                                  TokenVector.size());
   AnnotTok.setAnnotationValue(Value);
   PP.EnterToken(AnnotTok);
 }
@@ -1749,10 +1737,10 @@
 void PragmaDetectMismatchHandler::HandlePragma(Preprocessor &PP,
                                                PragmaIntroducerKind Introducer,
                                                Token &Tok) {
-  SourceLocation CommentLoc = Tok.getLocation();
+  SourceLocation DetectMismatchLoc = Tok.getLocation();
   PP.Lex(Tok);
   if (Tok.isNot(tok::l_paren)) {
-    PP.Diag(CommentLoc, diag::err_expected) << tok::l_paren;
+    PP.Diag(DetectMismatchLoc, diag::err_expected) << tok::l_paren;
     return;
   }
 
@@ -1787,10 +1775,10 @@
 
   // If the pragma is lexically sound, notify any interested PPCallbacks.
   if (PP.getPPCallbacks())
-    PP.getPPCallbacks()->PragmaDetectMismatch(CommentLoc, NameString,
+    PP.getPPCallbacks()->PragmaDetectMismatch(DetectMismatchLoc, NameString,
                                               ValueString);
 
-  Actions.ActOnPragmaDetectMismatch(NameString, ValueString);
+  Actions.ActOnPragmaDetectMismatch(DetectMismatchLoc, NameString, ValueString);
 }
 
 /// \brief Handle the microsoft \#pragma comment extension.
@@ -1821,22 +1809,22 @@
 
   // Verify that this is one of the 5 whitelisted options.
   IdentifierInfo *II = Tok.getIdentifierInfo();
-  Sema::PragmaMSCommentKind Kind =
-    llvm::StringSwitch<Sema::PragmaMSCommentKind>(II->getName())
-    .Case("linker",   Sema::PCK_Linker)
-    .Case("lib",      Sema::PCK_Lib)
-    .Case("compiler", Sema::PCK_Compiler)
-    .Case("exestr",   Sema::PCK_ExeStr)
-    .Case("user",     Sema::PCK_User)
-    .Default(Sema::PCK_Unknown);
-  if (Kind == Sema::PCK_Unknown) {
+  PragmaMSCommentKind Kind =
+    llvm::StringSwitch<PragmaMSCommentKind>(II->getName())
+    .Case("linker",   PCK_Linker)
+    .Case("lib",      PCK_Lib)
+    .Case("compiler", PCK_Compiler)
+    .Case("exestr",   PCK_ExeStr)
+    .Case("user",     PCK_User)
+    .Default(PCK_Unknown);
+  if (Kind == PCK_Unknown) {
     PP.Diag(Tok.getLocation(), diag::err_pragma_comment_unknown_kind);
     return;
   }
 
   // On PS4, issue a warning about any pragma comments other than
   // #pragma comment lib.
-  if (PP.getTargetInfo().getTriple().isPS4() && Kind != Sema::PCK_Lib) {
+  if (PP.getTargetInfo().getTriple().isPS4() && Kind != PCK_Lib) {
     PP.Diag(Tok.getLocation(), diag::warn_pragma_comment_ignored)
       << II->getName();
     return;
@@ -1872,7 +1860,7 @@
   if (PP.getPPCallbacks())
     PP.getPPCallbacks()->PragmaComment(CommentLoc, II, ArgumentString);
 
-  Actions.ActOnPragmaMSComment(Kind, ArgumentString);
+  Actions.ActOnPragmaMSComment(CommentLoc, Kind, ArgumentString);
 }
 
 // #pragma clang optimize off
@@ -2023,6 +2011,7 @@
                            .Case("vectorize", true)
                            .Case("interleave", true)
                            .Case("unroll", true)
+                           .Case("distribute", true)
                            .Case("vectorize_width", true)
                            .Case("interleave_count", true)
                            .Case("unroll_count", true)
@@ -2062,12 +2051,11 @@
     return;
   }
 
-  Token *TokenArray = new Token[TokenList.size()];
-  std::copy(TokenList.begin(), TokenList.end(), TokenArray);
+  auto TokenArray = llvm::make_unique<Token[]>(TokenList.size());
+  std::copy(TokenList.begin(), TokenList.end(), TokenArray.get());
 
-  PP.EnterTokenStream(TokenArray, TokenList.size(),
-                      /*DisableMacroExpansion=*/false,
-                      /*OwnsTokens=*/true);
+  PP.EnterTokenStream(std::move(TokenArray), TokenList.size(),
+                      /*DisableMacroExpansion=*/false);
 }
 
 /// \brief Handle the loop unroll optimization pragmas.
@@ -2130,12 +2118,12 @@
   }
 
   // Generate the hint token.
-  Token *TokenArray = new Token[1];
+  auto TokenArray = llvm::make_unique<Token[]>(1);
   TokenArray[0].startToken();
   TokenArray[0].setKind(tok::annot_pragma_loop_hint);
   TokenArray[0].setLocation(PragmaName.getLocation());
   TokenArray[0].setAnnotationEndLoc(PragmaName.getLocation());
   TokenArray[0].setAnnotationValue(static_cast<void *>(Info));
-  PP.EnterTokenStream(TokenArray, 1, /*DisableMacroExpansion=*/false,
-                      /*OwnsTokens=*/true);
+  PP.EnterTokenStream(std::move(TokenArray), 1,
+                      /*DisableMacroExpansion=*/false);
 }
diff --git a/lib/Parse/ParseStmt.cpp b/lib/Parse/ParseStmt.cpp
index 6c27b27..d0557b8 100644
--- a/lib/Parse/ParseStmt.cpp
+++ b/lib/Parse/ParseStmt.cpp
@@ -12,18 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Parse/Parser.h"
 #include "RAIIObjectsForParser.h"
-#include "clang/AST/ASTContext.h"
 #include "clang/Basic/Attributes.h"
-#include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/PrettyStackTrace.h"
+#include "clang/Parse/Parser.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/LoopHint.h"
 #include "clang/Sema/PrettyDeclStackTrace.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/TypoCorrection.h"
-#include "llvm/ADT/SmallString.h"
 using namespace clang;
 
 //===----------------------------------------------------------------------===//
@@ -107,6 +104,8 @@
 
   ParsedAttributesWithRange Attrs(AttrFactory);
   MaybeParseCXX11Attributes(Attrs, nullptr, /*MightBeObjCMessageSend*/ true);
+  if (!MaybeParseOpenCLUnrollHintAttribute(Attrs))
+    return StmtError();
 
   StmtResult Res = ParseStatementOrDeclarationAfterAttributes(
       Stmts, Allowed, TrailingElseLoc, Attrs);
@@ -207,7 +206,8 @@
   }
 
   default: {
-    if ((getLangOpts().CPlusPlus || Allowed == ACK_Any) &&
+    if ((getLangOpts().CPlusPlus || getLangOpts().MicrosoftExt ||
+         Allowed == ACK_Any) &&
         isDeclarationStatement()) {
       SourceLocation DeclStart = Tok.getLocation(), DeclEnd;
       DeclGroupPtrTy Decl = ParseDeclaration(Declarator::BlockContext,
@@ -1041,7 +1041,8 @@
 
 /// ParseParenExprOrCondition:
 /// [C  ]     '(' expression ')'
-/// [C++]     '(' condition ')'       [not allowed if OnlyAllowCondition=true]
+/// [C++]     '(' condition ')'
+/// [C++1z]   '(' init-statement[opt] condition ')'
 ///
 /// This function parses and performs error recovery on the specified condition
 /// or expression (depending on whether we're in C++ or C mode).  This function
@@ -1050,29 +1051,29 @@
 /// should try to recover harder.  It returns false if the condition is
 /// successfully parsed.  Note that a successful parse can still have semantic
 /// errors in the condition.
-bool Parser::ParseParenExprOrCondition(ExprResult &ExprResult,
-                                       Decl *&DeclResult,
+bool Parser::ParseParenExprOrCondition(StmtResult *InitStmt,
+                                       Sema::ConditionResult &Cond,
                                        SourceLocation Loc,
-                                       bool ConvertToBoolean) {
+                                       Sema::ConditionKind CK) {
   BalancedDelimiterTracker T(*this, tok::l_paren);
   T.consumeOpen();
 
   if (getLangOpts().CPlusPlus)
-    ParseCXXCondition(ExprResult, DeclResult, Loc, ConvertToBoolean);
+    Cond = ParseCXXCondition(InitStmt, Loc, CK);
   else {
-    ExprResult = ParseExpression();
-    DeclResult = nullptr;
+    ExprResult CondExpr = ParseExpression();
 
     // If required, convert to a boolean value.
-    if (!ExprResult.isInvalid() && ConvertToBoolean)
-      ExprResult
-        = Actions.ActOnBooleanCondition(getCurScope(), Loc, ExprResult.get());
+    if (CondExpr.isInvalid())
+      Cond = Sema::ConditionError();
+    else
+      Cond = Actions.ActOnCondition(getCurScope(), Loc, CondExpr.get(), CK);
   }
 
   // If the parser was confused by the condition and we don't have a ')', try to
   // recover by skipping ahead to a semi and bailing out.  If condexp is
   // semantically invalid but we have well formed code, keep going.
-  if (ExprResult.isInvalid() && !DeclResult && Tok.isNot(tok::r_paren)) {
+  if (Cond.isInvalid() && Tok.isNot(tok::r_paren)) {
     SkipUntil(tok::semi);
     // Skipping may have stopped if it found the containing ')'.  If so, we can
     // continue parsing the if statement.
@@ -1107,6 +1108,14 @@
   assert(Tok.is(tok::kw_if) && "Not an if stmt!");
   SourceLocation IfLoc = ConsumeToken();  // eat the 'if'.
 
+  bool IsConstexpr = false;
+  if (Tok.is(tok::kw_constexpr)) {
+    Diag(Tok, getLangOpts().CPlusPlus1z ? diag::warn_cxx14_compat_constexpr_if
+                                        : diag::ext_constexpr_if);
+    IsConstexpr = true;
+    ConsumeToken();
+  }
+
   if (Tok.isNot(tok::l_paren)) {
     Diag(Tok, diag::err_expected_lparen_after) << "if";
     SkipUntil(tok::semi);
@@ -1130,12 +1139,16 @@
   ParseScope IfScope(this, Scope::DeclScope | Scope::ControlScope, C99orCXX);
 
   // Parse the condition.
-  ExprResult CondExp;
-  Decl *CondVar = nullptr;
-  if (ParseParenExprOrCondition(CondExp, CondVar, IfLoc, true))
+  StmtResult InitStmt;
+  Sema::ConditionResult Cond;
+  if (ParseParenExprOrCondition(&InitStmt, Cond, IfLoc,
+                                IsConstexpr ? Sema::ConditionKind::ConstexprIf
+                                            : Sema::ConditionKind::Boolean))
     return StmtError();
 
-  FullExprArg FullCondExp(Actions.MakeFullExpr(CondExp.get(), IfLoc));
+  llvm::Optional<bool> ConstexprCondition;
+  if (IsConstexpr)
+    ConstexprCondition = Cond.getKnownValue();
 
   // C99 6.8.4p3 - In C99, the body of the if statement is a scope, even if
   // there is no compound stmt.  C90 does not have this clause.  We only do this
@@ -1161,7 +1174,13 @@
   SourceLocation ThenStmtLoc = Tok.getLocation();
 
   SourceLocation InnerStatementTrailingElseLoc;
-  StmtResult ThenStmt(ParseStatement(&InnerStatementTrailingElseLoc));
+  StmtResult ThenStmt;
+  {
+    EnterExpressionEvaluationContext PotentiallyDiscarded(
+        Actions, Sema::DiscardedStatement, nullptr, false,
+        /*ShouldEnter=*/ConstexprCondition && !*ConstexprCondition);
+    ThenStmt = ParseStatement(&InnerStatementTrailingElseLoc);
+  }
 
   // Pop the 'if' scope if needed.
   InnerScope.Exit();
@@ -1187,8 +1206,12 @@
     // The substatement in a selection-statement (each substatement, in the else
     // form of the if statement) implicitly defines a local scope.
     //
-    ParseScope InnerScope(this, Scope::DeclScope, C99orCXX, Tok.is(tok::l_brace));
+    ParseScope InnerScope(this, Scope::DeclScope, C99orCXX,
+                          Tok.is(tok::l_brace));
 
+    EnterExpressionEvaluationContext PotentiallyDiscarded(
+        Actions, Sema::DiscardedStatement, nullptr, false,
+        /*ShouldEnter=*/ConstexprCondition && *ConstexprCondition);
     ElseStmt = ParseStatement();
 
     // Pop the 'else' scope if needed.
@@ -1219,8 +1242,8 @@
   if (ElseStmt.isInvalid())
     ElseStmt = Actions.ActOnNullStmt(ElseStmtLoc);
 
-  return Actions.ActOnIfStmt(IfLoc, FullCondExp, CondVar, ThenStmt.get(),
-                             ElseLoc, ElseStmt.get());
+  return Actions.ActOnIfStmt(IfLoc, IsConstexpr, InitStmt.get(), Cond,
+                             ThenStmt.get(), ElseLoc, ElseStmt.get());
 }
 
 /// ParseSwitchStatement
@@ -1257,13 +1280,14 @@
   ParseScope SwitchScope(this, ScopeFlags);
 
   // Parse the condition.
-  ExprResult Cond;
-  Decl *CondVar = nullptr;
-  if (ParseParenExprOrCondition(Cond, CondVar, SwitchLoc, false))
+  StmtResult InitStmt;
+  Sema::ConditionResult Cond;
+  if (ParseParenExprOrCondition(&InitStmt, Cond, SwitchLoc,
+                                Sema::ConditionKind::Switch))
     return StmtError();
 
-  StmtResult Switch
-    = Actions.ActOnStartOfSwitchStmt(SwitchLoc, Cond.get(), CondVar);
+  StmtResult Switch =
+      Actions.ActOnStartOfSwitchStmt(SwitchLoc, InitStmt.get(), Cond);
 
   if (Switch.isInvalid()) {
     // Skip the switch body.
@@ -1345,13 +1369,11 @@
   ParseScope WhileScope(this, ScopeFlags);
 
   // Parse the condition.
-  ExprResult Cond;
-  Decl *CondVar = nullptr;
-  if (ParseParenExprOrCondition(Cond, CondVar, WhileLoc, true))
+  Sema::ConditionResult Cond;
+  if (ParseParenExprOrCondition(nullptr, Cond, WhileLoc,
+                                Sema::ConditionKind::Boolean))
     return StmtError();
 
-  FullExprArg FullCond(Actions.MakeFullExpr(Cond.get(), WhileLoc));
-
   // C99 6.8.5p5 - In C99, the body of the while statement is a scope, even if
   // there is no compound stmt.  C90 does not have this clause.  We only do this
   // if the body isn't a compound statement to avoid push/pop in common cases.
@@ -1372,10 +1394,10 @@
   InnerScope.Exit();
   WhileScope.Exit();
 
-  if ((Cond.isInvalid() && !CondVar) || Body.isInvalid())
+  if (Cond.isInvalid() || Body.isInvalid())
     return StmtError();
 
-  return Actions.ActOnWhileStmt(WhileLoc, FullCond, CondVar, Body.get());
+  return Actions.ActOnWhileStmt(WhileLoc, Cond, Body.get());
 }
 
 /// ParseDoStatement
@@ -1533,12 +1555,10 @@
 
   bool ForEach = false, ForRange = false;
   StmtResult FirstPart;
-  bool SecondPartIsInvalid = false;
-  FullExprArg SecondPart(Actions);
+  Sema::ConditionResult SecondPart;
   ExprResult Collection;
   ForRangeInit ForRangeInit;
   FullExprArg ThirdPart(Actions);
-  Decl *SecondVar = nullptr;
 
   if (Tok.is(tok::code_completion)) {
     Actions.CodeCompleteOrdinaryName(getCurScope(),
@@ -1643,7 +1663,7 @@
       Diag(Tok, diag::err_for_range_expected_decl)
         << FirstPart.get()->getSourceRange();
       SkipUntil(tok::r_paren, StopBeforeMatch);
-      SecondPartIsInvalid = true;
+      SecondPart = Sema::ConditionError();
     } else {
       if (!Value.isInvalid()) {
         Diag(Tok, diag::err_expected_semi_for);
@@ -1658,29 +1678,29 @@
 
   // Parse the second part of the for specifier.
   getCurScope()->AddFlags(Scope::BreakScope | Scope::ContinueScope);
-  if (!ForEach && !ForRange) {
-    assert(!SecondPart.get() && "Shouldn't have a second expression yet.");
+  if (!ForEach && !ForRange && !SecondPart.isInvalid()) {
     // Parse the second part of the for specifier.
     if (Tok.is(tok::semi)) {  // for (...;;
       // no second part.
     } else if (Tok.is(tok::r_paren)) {
       // missing both semicolons.
     } else {
-      ExprResult Second;
       if (getLangOpts().CPlusPlus)
-        ParseCXXCondition(Second, SecondVar, ForLoc, true);
+        SecondPart =
+            ParseCXXCondition(nullptr, ForLoc, Sema::ConditionKind::Boolean);
       else {
-        Second = ParseExpression();
-        if (!Second.isInvalid())
-          Second = Actions.ActOnBooleanCondition(getCurScope(), ForLoc,
-                                                 Second.get());
+        ExprResult SecondExpr = ParseExpression();
+        if (SecondExpr.isInvalid())
+          SecondPart = Sema::ConditionError();
+        else
+          SecondPart =
+              Actions.ActOnCondition(getCurScope(), ForLoc, SecondExpr.get(),
+                                     Sema::ConditionKind::Boolean);
       }
-      SecondPartIsInvalid = Second.isInvalid();
-      SecondPart = Actions.MakeFullExpr(Second.get(), ForLoc);
     }
 
     if (Tok.isNot(tok::semi)) {
-      if (!SecondPartIsInvalid || SecondVar)
+      if (!SecondPart.isInvalid())
         Diag(Tok, diag::err_expected_semi_for);
       else
         // Skip until semicolon or rparen, don't consume it.
@@ -1716,9 +1736,11 @@
   StmtResult ForEachStmt;
 
   if (ForRange) {
+    ExprResult CorrectedRange =
+        Actions.CorrectDelayedTyposInExpr(ForRangeInit.RangeExpr.get());
     ForRangeStmt = Actions.ActOnCXXForRangeStmt(
         getCurScope(), ForLoc, CoawaitLoc, FirstPart.get(),
-        ForRangeInit.ColonLoc, ForRangeInit.RangeExpr.get(),
+        ForRangeInit.ColonLoc, CorrectedRange.get(),
         T.getCloseLocation(), Sema::BFRK_Build);
 
   // Similarly, we need to do the semantic analysis for a for-range
@@ -1777,8 +1799,8 @@
     return Actions.FinishCXXForRangeStmt(ForRangeStmt.get(), Body.get());
 
   return Actions.ActOnForStmt(ForLoc, T.getOpenLocation(), FirstPart.get(),
-                              SecondPart, SecondVar, ThirdPart,
-                              T.getCloseLocation(), Body.get());
+                              SecondPart, ThirdPart, T.getCloseLocation(),
+                              Body.get());
 }
 
 /// ParseGotoStatement
@@ -1912,19 +1934,14 @@
   assert(Tok.is(tok::l_brace));
   SourceLocation LBraceLoc = Tok.getLocation();
 
-  if (SkipFunctionBodies && (!Decl || Actions.canSkipFunctionBody(Decl)) &&
-      trySkippingFunctionBody()) {
-    BodyScope.Exit();
-    return Actions.ActOnSkippedFunctionBody(Decl);
-  }
-
   PrettyDeclStackTraceEntry CrashInfo(Actions, Decl, LBraceLoc,
                                       "parsing function body");
 
   // Save and reset current vtordisp stack if we have entered a C++ method body.
   bool IsCXXMethod =
       getLangOpts().CPlusPlus && Decl && isa<CXXMethodDecl>(Decl);
-  Sema::VtorDispStackRAII SavedVtorDispStack(Actions, IsCXXMethod);
+  Sema::PragmaStackSentinelRAII
+    PragmaStackSentinel(Actions, "InternalPragmaState", IsCXXMethod);
 
   // Do not enter a scope for the brace, as the arguments are in the same scope
   // (the function body) as the body itself.  Instead, just read the statement
@@ -1959,16 +1976,11 @@
   else
     Actions.ActOnDefaultCtorInitializers(Decl);
 
-  if (SkipFunctionBodies && Actions.canSkipFunctionBody(Decl) &&
-      trySkippingFunctionBody()) {
-    BodyScope.Exit();
-    return Actions.ActOnSkippedFunctionBody(Decl);
-  }
-
   // Save and reset current vtordisp stack if we have entered a C++ method body.
   bool IsCXXMethod =
       getLangOpts().CPlusPlus && Decl && isa<CXXMethodDecl>(Decl);
-  Sema::VtorDispStackRAII SavedVtorDispStack(Actions, IsCXXMethod);
+  Sema::PragmaStackSentinelRAII
+    PragmaStackSentinel(Actions, "InternalPragmaState", IsCXXMethod);
 
   SourceLocation LBraceLoc = Tok.getLocation();
   StmtResult FnBody(ParseCXXTryBlockCommon(TryLoc, /*FnTry*/true));
@@ -1984,27 +1996,43 @@
 }
 
 bool Parser::trySkippingFunctionBody() {
-  assert(Tok.is(tok::l_brace));
   assert(SkipFunctionBodies &&
          "Should only be called when SkipFunctionBodies is enabled");
-
   if (!PP.isCodeCompletionEnabled()) {
-    ConsumeBrace();
-    SkipUntil(tok::r_brace);
+    SkipFunctionBody();
     return true;
   }
 
   // We're in code-completion mode. Skip parsing for all function bodies unless
   // the body contains the code-completion point.
   TentativeParsingAction PA(*this);
-  ConsumeBrace();
-  if (SkipUntil(tok::r_brace, StopAtCodeCompletion)) {
+  bool IsTryCatch = Tok.is(tok::kw_try);
+  CachedTokens Toks;
+  bool ErrorInPrologue = ConsumeAndStoreFunctionPrologue(Toks);
+  if (llvm::any_of(Toks, [](const Token &Tok) {
+        return Tok.is(tok::code_completion);
+      })) {
+    PA.Revert();
+    return false;
+  }
+  if (ErrorInPrologue) {
     PA.Commit();
+    SkipMalformedDecl();
     return true;
   }
-
-  PA.Revert();
-  return false;
+  if (!SkipUntil(tok::r_brace, StopAtCodeCompletion)) {
+    PA.Revert();
+    return false;
+  }
+  while (IsTryCatch && Tok.is(tok::kw_catch)) {
+    if (!SkipUntil(tok::l_brace, StopAtCodeCompletion) ||
+        !SkipUntil(tok::r_brace, StopAtCodeCompletion)) {
+      PA.Revert();
+      return false;
+    }
+  }
+  PA.Commit();
+  return true;
 }
 
 /// ParseCXXTryBlock - Parse a C++ try-block.
@@ -2206,3 +2234,19 @@
   }
   Braces.consumeClose();
 }
+
+bool Parser::ParseOpenCLUnrollHintAttribute(ParsedAttributes &Attrs) {
+  // Collect any GNU-style attributes at the current position into Attrs.
+  MaybeParseGNUAttributes(Attrs);
+
+  // Nothing to validate unless the list starts with an unroll-hint attribute.
+  if (Attrs.empty() ||
+      Attrs.getList()->getKind() != AttributeList::AT_OpenCLUnrollHint)
+    return true;
+
+  // The hint is accepted only when a for, while, or do keyword comes next.
+  if (Tok.isOneOf(tok::kw_for, tok::kw_while, tok::kw_do))
+    return true;
+  Diag(Tok, diag::err_opencl_unroll_hint_on_non_loop);
+  return false;
+}
diff --git a/lib/Parse/ParseStmtAsm.cpp b/lib/Parse/ParseStmtAsm.cpp
index 0cc3d05..1f63dc2 100644
--- a/lib/Parse/ParseStmtAsm.cpp
+++ b/lib/Parse/ParseStmtAsm.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstPrinter.h"
@@ -199,9 +200,7 @@
   // Also copy the current token over.
   LineToks.push_back(Tok);
 
-  PP.EnterTokenStream(LineToks.begin(), LineToks.size(),
-                      /*disable macros*/ true,
-                      /*owns tokens*/ false);
+  PP.EnterTokenStream(LineToks, /*DisableMacroExpansions*/ true);
 
   // Clear the current token and advance to the first token in LineToks.
   ConsumeAnyToken();
@@ -336,6 +335,33 @@
   return false;
 }
 
+/// Report whether \p Tok is a keyword that may begin a type-qualifier-list.
+///
+/// \returns true for exactly the qualifier keywords listed below, and false
+/// for every other token kind.
+static bool isTypeQualifier(const Token &Tok) {
+  // type-qualifier
+  return Tok.isOneOf(
+      tok::kw_const,
+      tok::kw_volatile,
+      tok::kw_restrict,
+      tok::kw___private,
+      tok::kw___local,
+      tok::kw___global,
+      tok::kw___constant,
+      tok::kw___generic,
+      tok::kw___read_only,
+      tok::kw___read_write,
+      tok::kw___write_only
+  );
+}
+
+// True when the token after 'asm' indicates a GCC-style asm statement.
+static bool isGCCAsmStatement(const Token &TokAfterAsm) {
+  return TokAfterAsm.isOneOf(tok::l_paren, tok::kw_goto) ||
+         isTypeQualifier(TokAfterAsm);
+}
+
 /// ParseMicrosoftAsmStatement. When -fms-extensions/-fasm-blocks is enabled,
 /// this routine is called to collect the tokens for an MS asm statement.
 ///
@@ -391,6 +417,7 @@
     if (!InAsmComment && Tok.is(tok::l_brace)) {
       // Consume the opening brace.
       SkippedStartOfLine = Tok.isAtStartOfLine();
+      AsmToks.push_back(Tok);
       EndLoc = ConsumeBrace();
       BraceNesting++;
       LBraceLocs.push_back(EndLoc);
@@ -415,11 +442,11 @@
       if (ExpLoc.first != FID ||
           SrcMgr.getLineNumber(ExpLoc.first, ExpLoc.second) != LineNo) {
         // If this is a single-line __asm, we're done, except if the next
-        // line begins with an __asm too, in which case we finish a comment
+        // line is MS-style asm too, in which case we finish a comment
         // if needed and then keep processing the next line as a single
         // line __asm.
         bool isAsm = Tok.is(tok::kw_asm);
-        if (SingleLineMode && !isAsm)
+        if (SingleLineMode && (!isAsm || isGCCAsmStatement(NextToken())))
           break;
         // We're no longer in a comment.
         InAsmComment = false;
@@ -443,6 +470,11 @@
         BraceCount == (savedBraceCount + BraceNesting)) {
       // Consume the closing brace.
       SkippedStartOfLine = Tok.isAtStartOfLine();
+      // Don't want to add the closing brace of the whole asm block
+      if (SingleLineMode || BraceNesting > 1) {
+        Tok.clearFlag(Token::LeadingSpace);
+        AsmToks.push_back(Tok);
+      }
       EndLoc = ConsumeBrace();
       BraceNesting--;
       // Finish if all of the opened braces in the inline asm section were
@@ -526,18 +558,22 @@
   if (buildMSAsmString(PP, AsmLoc, AsmToks, TokOffsets, AsmString))
     return StmtError();
 
+  TargetOptions TO = Actions.Context.getTargetInfo().getTargetOpts();
+  std::string FeaturesStr =
+      llvm::join(TO.Features.begin(), TO.Features.end(), ",");
+
   std::unique_ptr<llvm::MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TT));
   std::unique_ptr<llvm::MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TT));
   // Get the instruction descriptor.
   std::unique_ptr<llvm::MCInstrInfo> MII(TheTarget->createMCInstrInfo());
   std::unique_ptr<llvm::MCObjectFileInfo> MOFI(new llvm::MCObjectFileInfo());
   std::unique_ptr<llvm::MCSubtargetInfo> STI(
-      TheTarget->createMCSubtargetInfo(TT, "", ""));
+      TheTarget->createMCSubtargetInfo(TT, TO.CPU, FeaturesStr));
 
   llvm::SourceMgr TempSrcMgr;
   llvm::MCContext Ctx(MAI.get(), MRI.get(), MOFI.get(), &TempSrcMgr);
-  MOFI->InitMCObjectFileInfo(TheTriple, llvm::Reloc::Default,
-                             llvm::CodeModel::Default, Ctx);
+  MOFI->InitMCObjectFileInfo(TheTriple, /*PIC*/ false, llvm::CodeModel::Default,
+                             Ctx);
   std::unique_ptr<llvm::MemoryBuffer> Buffer =
       llvm::MemoryBuffer::getMemBuffer(AsmString, "<MS inline asm>");
 
@@ -634,8 +670,7 @@
   assert(Tok.is(tok::kw_asm) && "Not an asm stmt");
   SourceLocation AsmLoc = ConsumeToken();
 
-  if (getLangOpts().AsmBlocks && Tok.isNot(tok::l_paren) &&
-      !isTypeQualifier()) {
+  if (getLangOpts().AsmBlocks && !isGCCAsmStatement(Tok)) {
     msAsm = true;
     return ParseMicrosoftAsmStatement(AsmLoc);
   }
@@ -655,6 +690,14 @@
 
   // Remember if this was a volatile asm.
   bool isVolatile = DS.getTypeQualifiers() & DeclSpec::TQ_volatile;
+
+  // TODO: support "asm goto" constructs (PR#9295).
+  if (Tok.is(tok::kw_goto)) {
+    Diag(Tok, diag::err_asm_goto_not_supported_yet);
+    SkipUntil(tok::r_paren, StopAtSemi);
+    return StmtError();
+  }
+
   if (Tok.isNot(tok::l_paren)) {
     Diag(Tok, diag::err_expected_lparen_after) << "asm";
     SkipUntil(tok::r_paren, StopAtSemi);
diff --git a/lib/Parse/ParseTemplate.cpp b/lib/Parse/ParseTemplate.cpp
index 7cf8504..1f54376 100644
--- a/lib/Parse/ParseTemplate.cpp
+++ b/lib/Parse/ParseTemplate.cpp
@@ -11,12 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Parse/Parser.h"
 #include "RAIIObjectsForParser.h"
-#include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/Parse/ParseDiagnostic.h"
+#include "clang/Parse/Parser.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
@@ -122,20 +121,15 @@
       return nullptr;
     }
 
-    ParamLists.push_back(
-      Actions.ActOnTemplateParameterList(CurTemplateDepthTracker.getDepth(), 
-                                         ExportLoc,
-                                         TemplateLoc, LAngleLoc,
-                                         TemplateParams, RAngleLoc));
-
+    ExprResult OptionalRequiresClauseConstraintER;
     if (!TemplateParams.empty()) {
       isSpecialization = false;
       ++CurTemplateDepthTracker;
 
       if (TryConsumeToken(tok::kw_requires)) {
-        ExprResult ER =
+        OptionalRequiresClauseConstraintER =
             Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression());
-        if (!ER.isUsable()) {
+        if (!OptionalRequiresClauseConstraintER.isUsable()) {
           // Skip until the semi-colon or a '}'.
           SkipUntil(tok::r_brace, StopAtSemi | StopBeforeMatch);
           TryConsumeToken(tok::semi);
@@ -145,6 +139,10 @@
     } else {
       LastParamListWasEmpty = true;
     }
+
+    ParamLists.push_back(Actions.ActOnTemplateParameterList(
+        CurTemplateDepthTracker.getDepth(), ExportLoc, TemplateLoc, LAngleLoc,
+        TemplateParams, RAngleLoc, OptionalRequiresClauseConstraintER.get()));
   } while (Tok.isOneOf(tok::kw_export, tok::kw_template));
 
   unsigned NewFlags = getCurScope()->getFlags() & ~Scope::TemplateParamScope;
@@ -212,11 +210,15 @@
   if (Tok.is(tok::semi)) {
     ProhibitAttributes(prefixAttrs);
     DeclEnd = ConsumeToken();
+    RecordDecl *AnonRecord = nullptr;
     Decl *Decl = Actions.ParsedFreeStandingDeclSpec(
         getCurScope(), AS, DS,
         TemplateInfo.TemplateParams ? *TemplateInfo.TemplateParams
                                     : MultiTemplateParamsArg(),
-        TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation);
+        TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation,
+        AnonRecord);
+    assert(!AnonRecord &&
+           "Anonymous unions/structs should not be valid with template");
     DS.complete(Decl);
     return Decl;
   }
@@ -283,7 +285,7 @@
         TemplateParameterLists FakedParamLists;
         FakedParamLists.push_back(Actions.ActOnTemplateParameterList(
             0, SourceLocation(), TemplateInfo.TemplateLoc, LAngleLoc, None,
-            LAngleLoc));
+            LAngleLoc, nullptr));
 
         return ParseFunctionDefinition(
             DeclaratorInfo, ParsedTemplateInfo(&FakedParamLists,
@@ -634,7 +636,7 @@
     Actions.ActOnTemplateParameterList(Depth, SourceLocation(),
                                        TemplateLoc, LAngleLoc,
                                        TemplateParams,
-                                       RAngleLoc);
+                                       RAngleLoc, nullptr);
 
   // Grab a default argument (if available).
   // Per C++0x [basic.scope.pdecl]p9, we parse the default argument before
@@ -1368,7 +1370,7 @@
   // Append the current token at the end of the new token stream so that it
   // doesn't get lost.
   LPT.Toks.push_back(Tok);
-  PP.EnterTokenStream(LPT.Toks.data(), LPT.Toks.size(), true, false);
+  PP.EnterTokenStream(LPT.Toks, true);
 
   // Consume the previously pushed token.
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
diff --git a/lib/Parse/ParseTentative.cpp b/lib/Parse/ParseTentative.cpp
index 6fbcfd9..556fbf3 100644
--- a/lib/Parse/ParseTentative.cpp
+++ b/lib/Parse/ParseTentative.cpp
@@ -74,11 +74,18 @@
 ///
 /// simple-declaration:
 ///   decl-specifier-seq init-declarator-list[opt] ';'
+///   decl-specifier-seq ref-qualifier[opt] '[' identifier-list ']'
+///                      brace-or-equal-initializer ';'    [C++17]
 ///
 /// (if AllowForRangeDecl specified)
 /// for ( for-range-declaration : for-range-initializer ) statement
+///
 /// for-range-declaration: 
-///    attribute-specifier-seqopt type-specifier-seq declarator
+///    decl-specifier-seq declarator
+///    decl-specifier-seq ref-qualifier[opt] '[' identifier-list ']'
+/// 
+/// In any of the above cases there can be a preceding attribute-specifier-seq,
+/// but the caller is expected to handle that.
 bool Parser::isCXXSimpleDeclaration(bool AllowForRangeDecl) {
   // C++ 6.8p1:
   // There is an ambiguity in the grammar involving expression-statements and
@@ -125,10 +132,11 @@
   // Ok, we have a simple-type-specifier/typename-specifier followed by a '(',
   // or an identifier which doesn't resolve as anything. We need tentative
   // parsing...
-
-  TentativeParsingAction PA(*this);
-  TPR = TryParseSimpleDeclaration(AllowForRangeDecl);
-  PA.Revert();
+ 
+  {
+    RevertingTentativeParsingAction PA(*this);
+    TPR = TryParseSimpleDeclaration(AllowForRangeDecl);
+  }
 
   // In case of an error, let the declaration parsing code handle it.
   if (TPR == TPResult::Error)
@@ -329,10 +337,70 @@
   return TPResult::Ambiguous;
 }
 
-/// isCXXConditionDeclaration - Disambiguates between a declaration or an
-/// expression for a condition of a if/switch/while/for statement.
-/// If during the disambiguation process a parsing error is encountered,
-/// the function returns true to let the declaration parsing code handle it.
+struct Parser::ConditionDeclarationOrInitStatementState {
+  Parser &P;
+  bool CanBeExpression = true;
+  bool CanBeCondition = true;
+  bool CanBeInitStatement;
+
+  ConditionDeclarationOrInitStatementState(Parser &P, bool CanBeInitStatement)
+      : P(P), CanBeInitStatement(CanBeInitStatement) {}
+
+  void markNotExpression() {
+    CanBeExpression = false;
+
+    if (CanBeCondition && CanBeInitStatement) {
+      // FIXME: Unify the parsing codepaths for condition variables and
+      // simple-declarations so that we don't need to eagerly figure out which
+      // kind we have here. (Just parse init-declarators until we reach a
+      // semicolon or right paren.)
+      RevertingTentativeParsingAction PA(P);
+      P.SkipUntil(tok::r_paren, tok::semi, StopBeforeMatch);
+      if (P.Tok.isNot(tok::r_paren))
+        CanBeCondition = false;
+      if (P.Tok.isNot(tok::semi))
+        CanBeInitStatement = false;
+    }
+  }
+
+  bool markNotCondition() {
+    CanBeCondition = false;
+    return !CanBeInitStatement || !CanBeExpression;
+  }
+
+  bool update(TPResult IsDecl) {
+    switch (IsDecl) {
+    case TPResult::True:
+      markNotExpression();
+      return true;
+    case TPResult::False:
+      CanBeCondition = CanBeInitStatement = false;
+      return true;
+    case TPResult::Ambiguous:
+      return false;
+    case TPResult::Error:
+      CanBeExpression = CanBeCondition = CanBeInitStatement = false;
+      return true;
+    }
+    llvm_unreachable("unknown tentative parse result");
+  }
+
+  ConditionOrInitStatement result() const {
+    assert(CanBeExpression + CanBeCondition + CanBeInitStatement < 2 &&
+           "result called but not yet resolved");
+    if (CanBeExpression)
+      return ConditionOrInitStatement::Expression;
+    if (CanBeCondition)
+      return ConditionOrInitStatement::ConditionDecl;
+    if (CanBeInitStatement)
+      return ConditionOrInitStatement::InitStmtDecl;
+    return ConditionOrInitStatement::Error;
+  }
+};
+
+/// \brief Disambiguates between a declaration in a condition, a
+/// simple-declaration in an init-statement, and an expression for
+/// a condition of an if/switch statement.
 ///
 ///       condition:
 ///         expression
@@ -341,47 +409,64 @@
 /// [C++11] type-specifier-seq declarator braced-init-list
 /// [GNU]   type-specifier-seq declarator simple-asm-expr[opt] attributes[opt]
 ///             '=' assignment-expression
+///       simple-declaration:
+///         decl-specifier-seq init-declarator-list[opt] ';'
 ///
-bool Parser::isCXXConditionDeclaration() {
-  TPResult TPR = isCXXDeclarationSpecifier();
-  if (TPR != TPResult::Ambiguous)
-    return TPR != TPResult::False; // Returns true for TPResult::True or
-                                   // TPResult::Error.
+/// Note that, unlike isCXXSimpleDeclaration, we must look ahead all the way
+/// to the ';' to distinguish cases like 'int(x))' (an expression) from
+/// 'int(x);' (a simple-declaration in an init-statement).
+Parser::ConditionOrInitStatement
+Parser::isCXXConditionDeclarationOrInitStatement(bool CanBeInitStatement) {
+  ConditionDeclarationOrInitStatementState State(*this, CanBeInitStatement);
 
-  // FIXME: Add statistics about the number of ambiguous statements encountered
-  // and how they were resolved (number of declarations+number of expressions).
+  if (State.update(isCXXDeclarationSpecifier()))
+    return State.result();
 
-  // Ok, we have a simple-type-specifier/typename-specifier followed by a '('.
-  // We need tentative parsing...
+  // It might be a declaration; we need tentative parsing.
+  RevertingTentativeParsingAction PA(*this);
 
-  TentativeParsingAction PA(*this);
-
-  // type-specifier-seq
-  TryConsumeDeclarationSpecifier();
+  // FIXME: A tag definition unambiguously tells us this is an init-statement.
+  if (State.update(TryConsumeDeclarationSpecifier()))
+    return State.result();
   assert(Tok.is(tok::l_paren) && "Expected '('");
 
-  // declarator
-  TPR = TryParseDeclarator(false/*mayBeAbstract*/);
+  while (true) {
+    // Consume a declarator.
+    if (State.update(TryParseDeclarator(false/*mayBeAbstract*/)))
+      return State.result();
 
-  // In case of an error, let the declaration parsing code handle it.
-  if (TPR == TPResult::Error)
-    TPR = TPResult::True;
+    // Attributes, asm label, or an initializer imply this is not an expression.
+    // FIXME: Disambiguate properly after an = instead of assuming that it's a
+    // valid declaration.
+    if (Tok.isOneOf(tok::equal, tok::kw_asm, tok::kw___attribute) ||
+        (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace))) {
+      State.markNotExpression();
+      return State.result();
+    }
 
-  if (TPR == TPResult::Ambiguous) {
-    // '='
-    // [GNU] simple-asm-expr[opt] attributes[opt]
-    if (Tok.isOneOf(tok::equal, tok::kw_asm, tok::kw___attribute))
-      TPR = TPResult::True;
-    else if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace))
-      TPR = TPResult::True;
-    else
-      TPR = TPResult::False;
+    // At this point, it can't be a condition any more, because a condition
+    // must have a brace-or-equal-initializer.
+    if (State.markNotCondition())
+      return State.result();
+
+    // A parenthesized initializer could be part of an expression or a
+    // simple-declaration.
+    if (Tok.is(tok::l_paren)) {
+      ConsumeParen();
+      SkipUntil(tok::r_paren, StopAtSemi);
+    }
+
+    if (!TryConsumeToken(tok::comma))
+      break;
   }
 
-  PA.Revert();
-
-  assert(TPR == TPResult::True || TPR == TPResult::False);
-  return TPR == TPResult::True;
+  // We reached the end. If it can now be some kind of decl, then it is.
+  if (State.CanBeCondition && Tok.is(tok::r_paren))
+    return ConditionOrInitStatement::ConditionDecl;
+  else if (State.CanBeInitStatement && Tok.is(tok::semi))
+    return ConditionOrInitStatement::InitStmtDecl;
+  else
+    return ConditionOrInitStatement::Expression;
 }
 
   /// \brief Determine whether the next set of tokens contains a type-id.
@@ -423,7 +508,7 @@
   // Ok, we have a simple-type-specifier/typename-specifier followed by a '('.
   // We need tentative parsing...
 
-  TentativeParsingAction PA(*this);
+  RevertingTentativeParsingAction PA(*this);
 
   // type-specifier-seq
   TryConsumeDeclarationSpecifier();
@@ -456,8 +541,6 @@
       TPR = TPResult::False;
   }
 
-  PA.Revert();
-
   assert(TPR == TPResult::True || TPR == TPResult::False);
   return TPR == TPResult::True;
 }
@@ -508,7 +591,7 @@
   if (!Disambiguate && !getLangOpts().ObjC1)
     return CAK_AttributeSpecifier;
 
-  TentativeParsingAction PA(*this);
+  RevertingTentativeParsingAction PA(*this);
 
   // Opening brackets were checked for above.
   ConsumeBracket();
@@ -520,8 +603,6 @@
     bool IsAttribute = SkipUntil(tok::r_square);
     IsAttribute &= Tok.is(tok::r_square);
 
-    PA.Revert();
-
     return IsAttribute ? CAK_AttributeSpecifier : CAK_InvalidAttributeSpecifier;
   }
 
@@ -542,8 +623,6 @@
     // A lambda cannot end with ']]', and an attribute must.
     bool IsAttribute = Tok.is(tok::r_square);
 
-    PA.Revert();
-
     if (IsAttribute)
       // Case 1: C++11 attribute.
       return CAK_AttributeSpecifier;
@@ -564,7 +643,6 @@
   while (Tok.isNot(tok::r_square)) {
     if (Tok.is(tok::comma)) {
       // Case 1: Stray commas can only occur in attributes.
-      PA.Revert();
       return CAK_AttributeSpecifier;
     }
 
@@ -611,8 +689,6 @@
     }
   }
 
-  PA.Revert();
-
   if (IsAttribute)
     // Case 1: C++11 statement attribute.
     return CAK_AttributeSpecifier;
@@ -833,7 +909,7 @@
       // '(' abstract-declarator ')'
       if (Tok.isOneOf(tok::kw___attribute, tok::kw___declspec, tok::kw___cdecl,
                       tok::kw___stdcall, tok::kw___fastcall, tok::kw___thiscall,
-                      tok::kw___vectorcall, tok::kw___unaligned))
+                      tok::kw___vectorcall))
         return TPResult::True; // attributes indicate declaration
       TPResult TPR = TryParseDeclarator(mayBeAbstract, mayHaveIdentifier);
       if (TPR != TPResult::Ambiguous)
@@ -946,6 +1022,7 @@
   case tok::kw_char:
   case tok::kw_const:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_enum:
   case tok::kw_half:
   case tok::kw_float:
@@ -987,6 +1064,8 @@
   case tok::kw___pixel:
   case tok::kw___bool:
   case tok::kw__Atomic:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case tok::kw_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
   case tok::kw___unknown_anytype:
     return TPResult::False;
 
@@ -1317,7 +1396,7 @@
                                                      Tok.getAnnotationRange(),
                                                      SS);
         if (SS.getScopeRep() && SS.getScopeRep()->isDependent()) {
-          TentativeParsingAction PA(*this);
+          RevertingTentativeParsingAction PA(*this);
           ConsumeToken();
           ConsumeToken();
           bool isIdentifier = Tok.is(tok::identifier);
@@ -1325,7 +1404,6 @@
           if (!isIdentifier)
             TPR = isCXXDeclarationSpecifier(BracedCastResult,
                                             HasMissingTypename);
-          PA.Revert();
 
           if (isIdentifier ||
               TPR == TPResult::True || TPR == TPResult::Error)
@@ -1337,6 +1415,8 @@
             *HasMissingTypename = true;
             return TPResult::Ambiguous;
           }
+
+          // FIXME: Fails to either revert or commit the tentative parse!
         } else {
           // Try to resolve the name. If it doesn't exist, assume it was
           // intended to name a type and keep disambiguating.
@@ -1388,15 +1468,13 @@
     // In Objective-C, we might have a protocol-qualified type.
     if (getLangOpts().ObjC1 && NextToken().is(tok::less)) {
       // Tentatively parse the protocol qualifiers.
-      TentativeParsingAction PA(*this);
+      RevertingTentativeParsingAction PA(*this);
       ConsumeToken(); // The type token
       
       TPResult TPR = TryParseProtocolQualifiers();
       bool isFollowedByParen = Tok.is(tok::l_paren);
       bool isFollowedByBrace = Tok.is(tok::l_brace);
       
-      PA.Revert();
-      
       if (TPR == TPResult::Error)
         return TPResult::Error;
       
@@ -1424,6 +1502,7 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_void:
   case tok::annot_decltype:
     if (NextToken().is(tok::l_paren))
@@ -1448,14 +1527,12 @@
     if (NextToken().isNot(tok::l_paren))
       return TPResult::True;
 
-    TentativeParsingAction PA(*this);
+    RevertingTentativeParsingAction PA(*this);
 
     TPResult TPR = TryParseTypeofSpecifier();
     bool isFollowedByParen = Tok.is(tok::l_paren);
     bool isFollowedByBrace = Tok.is(tok::l_brace);
 
-    PA.Revert();
-
     if (TPR == TPResult::Error)
       return TPResult::Error;
 
@@ -1515,6 +1592,7 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_void:
   case tok::kw___unknown_anytype:
   case tok::kw___auto_type:
@@ -1594,7 +1672,7 @@
   // ambiguities mentioned in 6.8, the resolution is to consider any construct
   // that could possibly be a declaration a declaration.
 
-  TentativeParsingAction PA(*this);
+  RevertingTentativeParsingAction PA(*this);
 
   ConsumeParen();
   bool InvalidAsDeclaration = false;
@@ -1618,8 +1696,6 @@
     }
   }
 
-  PA.Revert();
-
   if (IsAmbiguous && TPR == TPResult::Ambiguous)
     *IsAmbiguous = true;
 
diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
index c2c7330..f442bd7 100644
--- a/lib/Parse/Parser.cpp
+++ b/lib/Parse/Parser.cpp
@@ -20,7 +20,6 @@
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
-#include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
 
@@ -474,6 +473,7 @@
   Ident_final = nullptr;
   Ident_sealed = nullptr;
   Ident_override = nullptr;
+  Ident_GNU_final = nullptr;
 
   Ident_super = &PP.getIdentifierTable().get("super");
 
@@ -659,8 +659,10 @@
   case tok::annot_pragma_opencl_extension:
     HandlePragmaOpenCLExtension();
     return nullptr;
-  case tok::annot_pragma_openmp:
-    return ParseOpenMPDeclarativeDirective();
+  case tok::annot_pragma_openmp: {
+    AccessSpecifier AS = AS_none;
+    return ParseOpenMPDeclarativeDirectiveWithExtDecl(AS, attrs);
+  }
   case tok::annot_pragma_ms_pointers_to_members:
     HandlePragmaMSPointersToMembers();
     return nullptr;
@@ -885,8 +887,14 @@
   if (Tok.is(tok::semi)) {
     ProhibitAttributes(attrs);
     ConsumeToken();
-    Decl *TheDecl = Actions.ParsedFreeStandingDeclSpec(getCurScope(), AS, DS);
+    RecordDecl *AnonRecord = nullptr;
+    Decl *TheDecl = Actions.ParsedFreeStandingDeclSpec(getCurScope(), AS_none,
+                                                       DS, AnonRecord);
     DS.complete(TheDecl);
+    if (AnonRecord) {
+      Decl* decls[] = {AnonRecord, TheDecl};
+      return Actions.BuildDeclaratorGroup(decls, /*TypeMayContainAuto=*/false);
+    }
     return Actions.ConvertDeclToDeclGroup(TheDecl);
   }
 
@@ -1036,6 +1044,12 @@
     D.complete(DP);
     D.getMutableDeclSpec().abort();
 
+    if (SkipFunctionBodies && (!DP || Actions.canSkipFunctionBody(DP)) &&
+        trySkippingFunctionBody()) {
+      BodyScope.Exit();
+      return Actions.ActOnSkippedFunctionBody(DP);
+    }
+
     CachedTokens Toks;
     LexTemplateFunctionForLateParsing(Toks);
 
@@ -1128,6 +1142,13 @@
     return Res;
   }
 
+  if (SkipFunctionBodies && (!Res || Actions.canSkipFunctionBody(Res)) &&
+      trySkippingFunctionBody()) {
+    BodyScope.Exit();
+    Actions.ActOnSkippedFunctionBody(Res);
+    return Actions.ActOnFinishFunctionBody(Res, nullptr, false);
+  }
+
   if (Tok.is(tok::kw_try))
     return ParseFunctionTryBlock(Res, BodyScope);
 
diff --git a/lib/Rewrite/HTMLRewrite.cpp b/lib/Rewrite/HTMLRewrite.cpp
index 275fbd0..2d82d8f 100644
--- a/lib/Rewrite/HTMLRewrite.cpp
+++ b/lib/Rewrite/HTMLRewrite.cpp
@@ -502,7 +502,7 @@
 
   // Enter the tokens we just lexed.  This will cause them to be macro expanded
   // but won't enter sub-files (because we removed #'s).
-  TmpPP.EnterTokenStream(&TokenStream[0], TokenStream.size(), false, false);
+  TmpPP.EnterTokenStream(TokenStream, false);
 
   TokenConcatenation ConcatInfo(TmpPP);
 
diff --git a/lib/Rewrite/Makefile b/lib/Rewrite/Makefile
deleted file mode 100644
index 5fef9b2..0000000
--- a/lib/Rewrite/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/Rewrite/Makefile --------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-# This implements code transformation / rewriting facilities.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangRewrite
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Rewrite/RewriteRope.cpp b/lib/Rewrite/RewriteRope.cpp
index 451ad07..030ab77 100644
--- a/lib/Rewrite/RewriteRope.cpp
+++ b/lib/Rewrite/RewriteRope.cpp
@@ -350,8 +350,10 @@
     PieceOffs += getPiece(i).size();
 
   // If we exactly include the last one, include it in the region to delete.
-  if (Offset+NumBytes == PieceOffs+getPiece(i).size())
-    PieceOffs += getPiece(i).size(), ++i;
+  if (Offset+NumBytes == PieceOffs+getPiece(i).size()) {
+    PieceOffs += getPiece(i).size();
+    ++i;
+  }
 
   // If we completely cover some RopePieces, erase them now.
   if (i != StartPiece) {
diff --git a/lib/Sema/AnalysisBasedWarnings.cpp b/lib/Sema/AnalysisBasedWarnings.cpp
index 91ad465..9b3fbd8 100644
--- a/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/lib/Sema/AnalysisBasedWarnings.cpp
@@ -37,12 +37,8 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaInternal.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/ImmutableMap.h"
 #include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -50,7 +46,6 @@
 #include <algorithm>
 #include <deque>
 #include <iterator>
-#include <vector>
 
 using namespace clang;
 
@@ -889,7 +884,7 @@
   // the initializer of that declaration & we didn't already suggest
   // an initialization fixit.
   if (!SuggestInitializationFixit(S, VD))
-    S.Diag(VD->getLocStart(), diag::note_uninit_var_def)
+    S.Diag(VD->getLocStart(), diag::note_var_declared_here)
       << VD->getDeclName();
 
   return true;
@@ -1071,6 +1066,34 @@
   };
 } // anonymous namespace
 
+static StringRef getFallthroughAttrSpelling(Preprocessor &PP,
+                                            SourceLocation Loc) {
+  TokenValue FallthroughTokens[] = {
+    tok::l_square, tok::l_square,
+    PP.getIdentifierInfo("fallthrough"),
+    tok::r_square, tok::r_square
+  };
+
+  TokenValue ClangFallthroughTokens[] = {
+    tok::l_square, tok::l_square, PP.getIdentifierInfo("clang"),
+    tok::coloncolon, PP.getIdentifierInfo("fallthrough"),
+    tok::r_square, tok::r_square
+  };
+
+  bool PreferClangAttr = !PP.getLangOpts().CPlusPlus1z;
+
+  StringRef MacroName;
+  if (PreferClangAttr)
+    MacroName = PP.getLastMacroWithSpelling(Loc, ClangFallthroughTokens);
+  if (MacroName.empty())
+    MacroName = PP.getLastMacroWithSpelling(Loc, FallthroughTokens);
+  if (MacroName.empty() && !PreferClangAttr)
+    MacroName = PP.getLastMacroWithSpelling(Loc, ClangFallthroughTokens);
+  if (MacroName.empty())
+    MacroName = PreferClangAttr ? "[[clang::fallthrough]]" : "[[fallthrough]]";
+  return MacroName;
+}
+
 static void DiagnoseSwitchLabelsFallthrough(Sema &S, AnalysisDeclContext &AC,
                                             bool PerFunction) {
   // Only perform this analysis when using C++11.  There is no good workflow
@@ -1129,15 +1152,7 @@
         }
         if (!(B->empty() && Term && isa<BreakStmt>(Term))) {
           Preprocessor &PP = S.getPreprocessor();
-          TokenValue Tokens[] = {
-            tok::l_square, tok::l_square, PP.getIdentifierInfo("clang"),
-            tok::coloncolon, PP.getIdentifierInfo("fallthrough"),
-            tok::r_square, tok::r_square
-          };
-          StringRef AnnotationSpelling = "[[clang::fallthrough]]";
-          StringRef MacroName = PP.getLastMacroWithSpelling(L, Tokens);
-          if (!MacroName.empty())
-            AnnotationSpelling = MacroName;
+          StringRef AnnotationSpelling = getFallthroughAttrSpelling(PP, L);
           SmallString<64> TextToInsert(AnnotationSpelling);
           TextToInsert += "; ";
           S.Diag(L, diag::note_insert_fallthrough_fixit) <<
@@ -1151,7 +1166,7 @@
   }
 
   for (const auto *F : FM.getFallthroughStmts())
-    S.Diag(F->getLocStart(), diag::warn_fallthrough_attr_invalid_placement);
+    S.Diag(F->getLocStart(), diag::err_fallthrough_attr_invalid_placement);
 }
 
 static bool isInLoop(const ASTContext &Ctx, const ParentMap &PM,
@@ -1302,27 +1317,27 @@
       Ivar
     } ObjectKind;
 
-    const NamedDecl *D = Key.getProperty();
-    if (isa<VarDecl>(D))
+    const NamedDecl *KeyProp = Key.getProperty();
+    if (isa<VarDecl>(KeyProp))
       ObjectKind = Variable;
-    else if (isa<ObjCPropertyDecl>(D))
+    else if (isa<ObjCPropertyDecl>(KeyProp))
       ObjectKind = Property;
-    else if (isa<ObjCMethodDecl>(D))
+    else if (isa<ObjCMethodDecl>(KeyProp))
       ObjectKind = ImplicitProperty;
-    else if (isa<ObjCIvarDecl>(D))
+    else if (isa<ObjCIvarDecl>(KeyProp))
       ObjectKind = Ivar;
     else
       llvm_unreachable("Unexpected weak object kind!");
 
     // Do not warn about IBOutlet weak property receivers being set to null
     // since they are typically only used from the main thread.
-    if (const ObjCPropertyDecl *Prop = dyn_cast<ObjCPropertyDecl>(D))
+    if (const ObjCPropertyDecl *Prop = dyn_cast<ObjCPropertyDecl>(KeyProp))
       if (Prop->hasAttr<IBOutletAttr>())
         continue;
 
     // Show the first time the object was read.
     S.Diag(FirstRead->getLocStart(), DiagKind)
-      << int(ObjectKind) << D << int(FunctionKind)
+      << int(ObjectKind) << KeyProp << int(FunctionKind)
       << FirstRead->getSourceRange();
 
     // Print all the other accesses as notes.
@@ -2044,7 +2059,8 @@
       !Diags.isIgnored(diag::warn_unannotated_fallthrough, D->getLocStart());
   bool FallThroughDiagPerFunction = !Diags.isIgnored(
       diag::warn_unannotated_fallthrough_per_function, D->getLocStart());
-  if (FallThroughDiagFull || FallThroughDiagPerFunction) {
+  if (FallThroughDiagFull || FallThroughDiagPerFunction ||
+      fscope->HasFallthroughStmt) {
     DiagnoseSwitchLabelsFallthrough(S, AC, !FallThroughDiagFull);
   }
 
diff --git a/lib/Sema/AttributeList.cpp b/lib/Sema/AttributeList.cpp
index 3c61c95..41ccdc9 100644
--- a/lib/Sema/AttributeList.cpp
+++ b/lib/Sema/AttributeList.cpp
@@ -20,7 +20,6 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSwitch.h"
 using namespace clang;
 
 IdentifierLoc *IdentifierLoc::create(ASTContext &Ctx, SourceLocation Loc,
@@ -159,6 +158,7 @@
   unsigned HasCustomParsing : 1;
   unsigned IsTargetSpecific : 1;
   unsigned IsType : 1;
+  unsigned IsStmt : 1;
   unsigned IsKnownToGCC : 1;
 
   bool (*DiagAppertainsToDecl)(Sema &S, const AttributeList &Attr,
@@ -204,6 +204,10 @@
   return getInfo(*this).IsType;
 }
 
+bool AttributeList::isStmtAttr() const {
+  return getInfo(*this).IsStmt;
+}
+
 bool AttributeList::existsInTarget(const TargetInfo &Target) const {
   return getInfo(*this).ExistsInTarget(Target);
 }
diff --git a/lib/Sema/CMakeLists.txt b/lib/Sema/CMakeLists.txt
index 1f49e84..c92879a 100644
--- a/lib/Sema/CMakeLists.txt
+++ b/lib/Sema/CMakeLists.txt
@@ -2,6 +2,10 @@
   Support
   )
 
+if (MSVC)
+  set_source_files_properties(SemaExpr.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+endif()
+
 add_clang_library(clangSema
   AnalysisBasedWarnings.cpp
   AttributeList.cpp
diff --git a/lib/Sema/CodeCompleteConsumer.cpp b/lib/Sema/CodeCompleteConsumer.cpp
index 18e9a59..769141e 100644
--- a/lib/Sema/CodeCompleteConsumer.cpp
+++ b/lib/Sema/CodeCompleteConsumer.cpp
@@ -17,6 +17,7 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/Sema.h"
+#include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
@@ -309,7 +310,7 @@
         if (!Interface) {
           // Assign an empty StringRef but with non-null data to distinguish
           // between empty because we didn't process the DeclContext yet.
-          CachedParentName = StringRef((const char *)~0U, 0);
+          CachedParentName = StringRef((const char *)(uintptr_t)~0U, 0);
           return StringRef();
         }
         
@@ -428,6 +429,26 @@
 
 CodeCompleteConsumer::~CodeCompleteConsumer() { }
 
+bool PrintingCodeCompleteConsumer::isResultFilteredOut(StringRef Filter,
+                                                CodeCompletionResult Result) {
+  switch (Result.Kind) {
+  case CodeCompletionResult::RK_Declaration: {
+    return !(Result.Declaration->getIdentifier() &&
+            Result.Declaration->getIdentifier()->getName().startswith(Filter));
+  }
+  case CodeCompletionResult::RK_Keyword: {
+    return !StringRef(Result.Keyword).startswith(Filter);
+  }
+  case CodeCompletionResult::RK_Macro: {
+    return !Result.Macro->getName().startswith(Filter);
+  }
+  case CodeCompletionResult::RK_Pattern: {
+    return !StringRef(Result.Pattern->getAsString()).startswith(Filter);
+  }
+  }
+  llvm_unreachable("Unknown code completion result Kind.");
+}
+
 void 
 PrintingCodeCompleteConsumer::ProcessCodeCompleteResults(Sema &SemaRef,
                                                  CodeCompletionContext Context,
@@ -435,8 +456,12 @@
                                                          unsigned NumResults) {
   std::stable_sort(Results, Results + NumResults);
   
+  StringRef Filter = SemaRef.getPreprocessor().getCodeCompletionFilter();
+
   // Print the results.
   for (unsigned I = 0; I != NumResults; ++I) {
+    if(!Filter.empty() && isResultFilteredOut(Filter, Results[I]))
+      continue;
     OS << "COMPLETION: ";
     switch (Results[I].Kind) {
     case CodeCompletionResult::RK_Declaration:
diff --git a/lib/Sema/DeclSpec.cpp b/lib/Sema/DeclSpec.cpp
index 6f6c4ca..42d4633 100644
--- a/lib/Sema/DeclSpec.cpp
+++ b/lib/Sema/DeclSpec.cpp
@@ -15,10 +15,10 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/LocInfoType.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetInfo.h"
-#include "clang/Sema/LocInfoType.h"
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaDiagnostic.h"
@@ -220,11 +220,11 @@
     // parameter list there (in an effort to avoid new/delete traffic).  If it
     // is already used (consider a function returning a function pointer) or too
     // small (function with too many parameters), go to the heap.
-    if (!TheDeclarator.InlineParamsUsed &&
+    if (!TheDeclarator.InlineStorageUsed &&
         NumParams <= llvm::array_lengthof(TheDeclarator.InlineParams)) {
       I.Fun.Params = TheDeclarator.InlineParams;
       I.Fun.DeleteParams = false;
-      TheDeclarator.InlineParamsUsed = true;
+      TheDeclarator.InlineStorageUsed = true;
     } else {
       I.Fun.Params = new DeclaratorChunk::ParamInfo[NumParams];
       I.Fun.DeleteParams = true;
@@ -258,6 +258,38 @@
   return I;
 }
 
+void Declarator::setDecompositionBindings(
+    SourceLocation LSquareLoc,
+    ArrayRef<DecompositionDeclarator::Binding> Bindings,
+    SourceLocation RSquareLoc) {
+  assert(!hasName() && "declarator given multiple names!");
+  // Record the '[' and ']' locations and the binding count for this group.
+  BindingGroup.LSquareLoc = LSquareLoc;
+  BindingGroup.RSquareLoc = RSquareLoc;
+  BindingGroup.NumBindings = Bindings.size();
+  Range.setEnd(RSquareLoc); // The declarator's range now ends at the ']'.
+
+  // We're now past the identifier: pass a null name located at the '['.
+  SetIdentifier(nullptr, LSquareLoc);
+  Name.EndLocation = RSquareLoc;
+
+  // Allocate storage for bindings (inline if free, else heap) and copy them.
+  if (Bindings.size()) { // BindingGroup.Bindings is not assigned when empty.
+    if (!InlineStorageUsed &&
+        Bindings.size() <= llvm::array_lengthof(InlineBindings)) {
+      BindingGroup.Bindings = InlineBindings;
+      BindingGroup.DeleteBindings = false; // Not allocated with new[].
+      InlineStorageUsed = true; // The same flag also guards InlineParams.
+    } else {
+      BindingGroup.Bindings =
+          new DecompositionDeclarator::Binding[Bindings.size()];
+      BindingGroup.DeleteBindings = true; // Allocated with new[] just above.
+    }
+    std::uninitialized_copy(Bindings.begin(), Bindings.end(),
+                            BindingGroup.Bindings);
+  }
+}
+
 bool Declarator::isDeclarationOfFunction() const {
   for (unsigned i = 0, i_end = DeclTypeInfo.size(); i < i_end; ++i) {
     switch (DeclTypeInfo[i].Kind) {
@@ -289,6 +321,7 @@
     case TST_decimal32:
     case TST_decimal64:
     case TST_double:
+    case TST_float128:
     case TST_enum:
     case TST_error:
     case TST_float:
@@ -302,6 +335,8 @@
     case TST_unspecified:
     case TST_void:
     case TST_wchar:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case TST_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
       return false;
 
     case TST_decltype_auto:
@@ -455,6 +490,7 @@
   case DeclSpec::TST_half:        return "half";
   case DeclSpec::TST_float:       return "float";
   case DeclSpec::TST_double:      return "double";
+  case DeclSpec::TST_float128:    return "__float128";
   case DeclSpec::TST_bool:        return Policy.Bool ? "bool" : "_Bool";
   case DeclSpec::TST_decimal32:   return "_Decimal32";
   case DeclSpec::TST_decimal64:   return "_Decimal64";
@@ -474,6 +510,10 @@
   case DeclSpec::TST_underlyingType: return "__underlying_type";
   case DeclSpec::TST_unknown_anytype: return "__unknown_anytype";
   case DeclSpec::TST_atomic: return "_Atomic";
+#define GENERIC_IMAGE_TYPE(ImgType, Id) \
+  case DeclSpec::TST_##ImgType##_t: \
+    return #ImgType "_t";
+#include "clang/Basic/OpenCLImageTypes.def"
   case DeclSpec::TST_error:       return "(error)";
   }
   llvm_unreachable("Unknown typespec!");
@@ -486,6 +526,7 @@
   case DeclSpec::TQ_restrict:    return "restrict";
   case DeclSpec::TQ_volatile:    return "volatile";
   case DeclSpec::TQ_atomic:      return "_Atomic";
+  case DeclSpec::TQ_unaligned:   return "__unaligned";
   }
   llvm_unreachable("Unknown typespec!");
 }
@@ -787,6 +828,7 @@
   case TQ_const:    TQ_constLoc = Loc; return false;
   case TQ_restrict: TQ_restrictLoc = Loc; return false;
   case TQ_volatile: TQ_volatileLoc = Loc; return false;
+  case TQ_unaligned: TQ_unalignedLoc = Loc; return false;
   case TQ_atomic:   TQ_atomicLoc = Loc; return false;
   }
 
@@ -953,10 +995,10 @@
        TypeSpecSign != TSS_unspecified ||
        TypeAltiVecVector || TypeAltiVecPixel || TypeAltiVecBool ||
        TypeQualifiers)) {
-    const unsigned NumLocs = 8;
+    const unsigned NumLocs = 9;
     SourceLocation ExtraLocs[NumLocs] = {
       TSWLoc, TSCLoc, TSSLoc, AltiVecLoc,
-      TQ_constLoc, TQ_restrictLoc, TQ_volatileLoc, TQ_atomicLoc
+      TQ_constLoc, TQ_restrictLoc, TQ_volatileLoc, TQ_atomicLoc, TQ_unalignedLoc
     };
     FixItHint Hints[NumLocs];
     SourceLocation FirstLoc;
@@ -1257,6 +1299,7 @@
   switch (VS) {
   default: llvm_unreachable("Unknown specifier!");
   case VS_Override: VS_overrideLoc = Loc; break;
+  case VS_GNU_Final:
   case VS_Sealed:
   case VS_Final:    VS_finalLoc = Loc; break;
   }
@@ -1269,6 +1312,7 @@
   default: llvm_unreachable("Unknown specifier");
   case VS_Override: return "override";
   case VS_Final: return "final";
+  case VS_GNU_Final: return "__final";
   case VS_Sealed: return "sealed";
   }
 }
diff --git a/lib/Sema/DelayedDiagnostic.cpp b/lib/Sema/DelayedDiagnostic.cpp
index ceea04f..f695030 100644
--- a/lib/Sema/DelayedDiagnostic.cpp
+++ b/lib/Sema/DelayedDiagnostic.cpp
@@ -20,7 +20,7 @@
 using namespace sema;
 
 DelayedDiagnostic
-DelayedDiagnostic::makeAvailability(Sema::AvailabilityDiagnostic AD,
+DelayedDiagnostic::makeAvailability(AvailabilityResult AD,
                                     SourceLocation Loc,
                                     const NamedDecl *D,
                                     const ObjCInterfaceDecl *UnknownObjCClass,
@@ -29,14 +29,14 @@
                                     bool ObjCPropertyAccess) {
   DelayedDiagnostic DD;
   switch (AD) {
-    case Sema::AD_Deprecation:
-      DD.Kind = Deprecation;
-      break;
-    case Sema::AD_Unavailable:
-      DD.Kind = Unavailable;
-      break;
-    case Sema::AD_Partial:
-      llvm_unreachable("AD_Partial diags should not be delayed");
+  case AR_Deprecated:
+    DD.Kind = Deprecation;
+    break;
+  case AR_Unavailable:
+    DD.Kind = Unavailable;
+    break;
+  default:
+    llvm_unreachable("partial diags should not be delayed");
   }
   DD.Triggered = false;
   DD.Loc = Loc;
diff --git a/lib/Sema/IdentifierResolver.cpp b/lib/Sema/IdentifierResolver.cpp
index 53263ba..0bdb194 100644
--- a/lib/Sema/IdentifierResolver.cpp
+++ b/lib/Sema/IdentifierResolver.cpp
@@ -381,7 +381,7 @@
     PP.getExternalSource()->updateOutOfDateIdentifier(II);
   
   if (II.isFromAST())
-    II.setChangedSinceDeserialization();
+    II.setFETokenInfoChangedSinceDeserialization();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Sema/JumpDiagnostics.cpp b/lib/Sema/JumpDiagnostics.cpp
index c394d24..0d0c27d 100644
--- a/lib/Sema/JumpDiagnostics.cpp
+++ b/lib/Sema/JumpDiagnostics.cpp
@@ -270,7 +270,8 @@
 /// coherent VLA scope with a specified parent node.  Walk through the
 /// statements, adding any labels or gotos to LabelAndGotoScopes and recursively
 /// walking the AST as needed.
-void JumpScopeChecker::BuildScopeInformation(Stmt *S, unsigned &origParentScope) {
+void JumpScopeChecker::BuildScopeInformation(Stmt *S,
+                                             unsigned &origParentScope) {
   // If this is a statement, rather than an expression, scopes within it don't
   // propagate out into the enclosing scope.  Otherwise we have to worry
   // about block literals, which have the lifetime of their enclosing statement.
@@ -278,7 +279,7 @@
   unsigned &ParentScope = ((isa<Expr>(S) && !isa<StmtExpr>(S))
                             ? origParentScope : independentParentScope);
 
-  bool SkipFirstSubStmt = false;
+  unsigned StmtsToSkip = 0u;
 
   // If we found a label, remember that it is in ParentScope scope.
   switch (S->getStmtClass()) {
@@ -303,11 +304,15 @@
     break;
 
   case Stmt::SwitchStmtClass:
-    // Evaluate the condition variable before entering the scope of the switch
-    // statement.
+    // Evaluate the C++17 init stmt and condition variable
+    // before entering the scope of the switch statement.
+    if (Stmt *Init = cast<SwitchStmt>(S)->getInit()) {
+      BuildScopeInformation(Init, ParentScope);
+      ++StmtsToSkip;
+    }
     if (VarDecl *Var = cast<SwitchStmt>(S)->getConditionVariable()) {
       BuildScopeInformation(Var, ParentScope);
-      SkipFirstSubStmt = true;
+      ++StmtsToSkip;
     }
     // Fall through
 
@@ -318,71 +323,228 @@
     Jumps.push_back(S);
     break;
 
+  case Stmt::IfStmtClass: {
+    IfStmt *IS = cast<IfStmt>(S);
+    if (!(IS->isConstexpr() || IS->isObjCAvailabilityCheck()))
+      break;
+
+    unsigned Diag = IS->isConstexpr() ? diag::note_protected_by_constexpr_if
+                                      : diag::note_protected_by_if_available;
+
+    if (VarDecl *Var = IS->getConditionVariable())
+      BuildScopeInformation(Var, ParentScope);
+
+    // Cannot jump into the middle of the condition.
+    unsigned NewParentScope = Scopes.size();
+    Scopes.push_back(GotoScope(ParentScope, Diag, 0, IS->getLocStart()));
+    BuildScopeInformation(IS->getCond(), NewParentScope);
+
+    // Jumps into either arm of an 'if constexpr' are not allowed.
+    NewParentScope = Scopes.size();
+    Scopes.push_back(GotoScope(ParentScope, Diag, 0, IS->getLocStart()));
+    BuildScopeInformation(IS->getThen(), NewParentScope);
+    if (Stmt *Else = IS->getElse()) {
+      NewParentScope = Scopes.size();
+      Scopes.push_back(GotoScope(ParentScope, Diag, 0, IS->getLocStart()));
+      BuildScopeInformation(Else, NewParentScope);
+    }
+    return;
+  }
+
   case Stmt::CXXTryStmtClass: {
     CXXTryStmt *TS = cast<CXXTryStmt>(S);
-    unsigned newParentScope;
-    Scopes.push_back(GotoScope(ParentScope,
-                               diag::note_protected_by_cxx_try,
-                               diag::note_exits_cxx_try,
-                               TS->getSourceRange().getBegin()));
-    if (Stmt *TryBlock = TS->getTryBlock())
-      BuildScopeInformation(TryBlock, (newParentScope = Scopes.size()-1));
+    {
+      unsigned NewParentScope = Scopes.size();
+      Scopes.push_back(GotoScope(ParentScope,
+                                 diag::note_protected_by_cxx_try,
+                                 diag::note_exits_cxx_try,
+                                 TS->getSourceRange().getBegin()));
+      if (Stmt *TryBlock = TS->getTryBlock())
+        BuildScopeInformation(TryBlock, NewParentScope);
+    }
 
     // Jump from the catch into the try is not allowed either.
     for (unsigned I = 0, E = TS->getNumHandlers(); I != E; ++I) {
       CXXCatchStmt *CS = TS->getHandler(I);
+      unsigned NewParentScope = Scopes.size();
       Scopes.push_back(GotoScope(ParentScope,
                                  diag::note_protected_by_cxx_catch,
                                  diag::note_exits_cxx_catch,
                                  CS->getSourceRange().getBegin()));
-      BuildScopeInformation(CS->getHandlerBlock(),
-                            (newParentScope = Scopes.size()-1));
+      BuildScopeInformation(CS->getHandlerBlock(), NewParentScope);
     }
     return;
   }
 
   case Stmt::SEHTryStmtClass: {
     SEHTryStmt *TS = cast<SEHTryStmt>(S);
-    unsigned newParentScope;
-    Scopes.push_back(GotoScope(ParentScope,
-                               diag::note_protected_by_seh_try,
-                               diag::note_exits_seh_try,
-                               TS->getSourceRange().getBegin()));
-    if (Stmt *TryBlock = TS->getTryBlock())
-      BuildScopeInformation(TryBlock, (newParentScope = Scopes.size()-1));
+    {
+      unsigned NewParentScope = Scopes.size();
+      Scopes.push_back(GotoScope(ParentScope,
+                                 diag::note_protected_by_seh_try,
+                                 diag::note_exits_seh_try,
+                                 TS->getSourceRange().getBegin()));
+      if (Stmt *TryBlock = TS->getTryBlock())
+        BuildScopeInformation(TryBlock, NewParentScope);
+    }
 
     // Jump from __except or __finally into the __try are not allowed either.
     if (SEHExceptStmt *Except = TS->getExceptHandler()) {
+      unsigned NewParentScope = Scopes.size();
       Scopes.push_back(GotoScope(ParentScope,
                                  diag::note_protected_by_seh_except,
                                  diag::note_exits_seh_except,
                                  Except->getSourceRange().getBegin()));
-      BuildScopeInformation(Except->getBlock(),
-                            (newParentScope = Scopes.size()-1));
+      BuildScopeInformation(Except->getBlock(), NewParentScope);
     } else if (SEHFinallyStmt *Finally = TS->getFinallyHandler()) {
+      unsigned NewParentScope = Scopes.size();
       Scopes.push_back(GotoScope(ParentScope,
                                  diag::note_protected_by_seh_finally,
                                  diag::note_exits_seh_finally,
                                  Finally->getSourceRange().getBegin()));
-      BuildScopeInformation(Finally->getBlock(),
-                            (newParentScope = Scopes.size()-1));
+      BuildScopeInformation(Finally->getBlock(), NewParentScope);
     }
 
     return;
   }
 
+  case Stmt::DeclStmtClass: {
+    // If this is a declstmt with a VLA definition, it defines a scope from here
+    // to the end of the containing context.
+    DeclStmt *DS = cast<DeclStmt>(S);
+    // The decl statement creates a scope if any of the decls in it are VLAs
+    // or have the cleanup attribute.
+    for (auto *I : DS->decls())
+      BuildScopeInformation(I, origParentScope);
+    return;
+  }
+
+  case Stmt::ObjCAtTryStmtClass: {
+    // Disallow jumps into any part of an @try statement by pushing a scope and
+    // walking all sub-stmts in that scope.
+    ObjCAtTryStmt *AT = cast<ObjCAtTryStmt>(S);
+    // Recursively walk the AST for the @try part.
+    {
+      unsigned NewParentScope = Scopes.size();
+      Scopes.push_back(GotoScope(ParentScope,
+                                 diag::note_protected_by_objc_try,
+                                 diag::note_exits_objc_try,
+                                 AT->getAtTryLoc()));
+      if (Stmt *TryPart = AT->getTryBody())
+        BuildScopeInformation(TryPart, NewParentScope);
+    }
+
+    // Jump from the catch to the finally or try is not valid.
+    for (unsigned I = 0, N = AT->getNumCatchStmts(); I != N; ++I) {
+      ObjCAtCatchStmt *AC = AT->getCatchStmt(I);
+      unsigned NewParentScope = Scopes.size();
+      Scopes.push_back(GotoScope(ParentScope,
+                                 diag::note_protected_by_objc_catch,
+                                 diag::note_exits_objc_catch,
+                                 AC->getAtCatchLoc()));
+      // @catches are nested and it isn't
+      BuildScopeInformation(AC->getCatchBody(), NewParentScope);
+    }
+
+    // Jump from the finally to the try or catch is not valid.
+    if (ObjCAtFinallyStmt *AF = AT->getFinallyStmt()) {
+      unsigned NewParentScope = Scopes.size();
+      Scopes.push_back(GotoScope(ParentScope,
+                                 diag::note_protected_by_objc_finally,
+                                 diag::note_exits_objc_finally,
+                                 AF->getAtFinallyLoc()));
+      BuildScopeInformation(AF, NewParentScope);
+    }
+
+    return;
+  }
+
+  case Stmt::ObjCAtSynchronizedStmtClass: {
+    // Disallow jumps into the protected statement of an @synchronized, but
+    // allow jumps into the object expression it protects.
+    ObjCAtSynchronizedStmt *AS = cast<ObjCAtSynchronizedStmt>(S);
+    // Recursively walk the AST for the @synchronized object expr, it is
+    // evaluated in the normal scope.
+    BuildScopeInformation(AS->getSynchExpr(), ParentScope);
+
+    // Recursively walk the AST for the @synchronized part, protected by a new
+    // scope.
+    unsigned NewParentScope = Scopes.size();
+    Scopes.push_back(GotoScope(ParentScope,
+                               diag::note_protected_by_objc_synchronized,
+                               diag::note_exits_objc_synchronized,
+                               AS->getAtSynchronizedLoc()));
+    BuildScopeInformation(AS->getSynchBody(), NewParentScope);
+    return;
+  }
+
+  case Stmt::ObjCAutoreleasePoolStmtClass: {
+    // Disallow jumps into the protected statement of an @autoreleasepool.
+    ObjCAutoreleasePoolStmt *AS = cast<ObjCAutoreleasePoolStmt>(S);
+    // Recursively walk the AST for the @autoreleasepool part, protected by a
+    // new scope.
+    unsigned NewParentScope = Scopes.size();
+    Scopes.push_back(GotoScope(ParentScope,
+                               diag::note_protected_by_objc_autoreleasepool,
+                               diag::note_exits_objc_autoreleasepool,
+                               AS->getAtLoc()));
+    BuildScopeInformation(AS->getSubStmt(), NewParentScope);
+    return;
+  }
+
+  case Stmt::ExprWithCleanupsClass: {
+    // Disallow jumps past full-expressions that use blocks with
+    // non-trivial cleanups of their captures.  This is theoretically
+    // implementable but a lot of work which we haven't felt up to doing.
+    ExprWithCleanups *EWC = cast<ExprWithCleanups>(S);
+    for (unsigned i = 0, e = EWC->getNumObjects(); i != e; ++i) {
+      const BlockDecl *BDecl = EWC->getObject(i);
+      for (const auto &CI : BDecl->captures()) {
+        VarDecl *variable = CI.getVariable();
+        BuildScopeInformation(variable, BDecl, origParentScope);
+      }
+    }
+    break;
+  }
+
+  case Stmt::MaterializeTemporaryExprClass: {
+    // Disallow jumps out of scopes containing temporaries lifetime-extended to
+    // automatic storage duration.
+    MaterializeTemporaryExpr *MTE = cast<MaterializeTemporaryExpr>(S);
+    if (MTE->getStorageDuration() == SD_Automatic) {
+      SmallVector<const Expr *, 4> CommaLHS;
+      SmallVector<SubobjectAdjustment, 4> Adjustments;
+      const Expr *ExtendedObject =
+          MTE->GetTemporaryExpr()->skipRValueSubobjectAdjustments(
+              CommaLHS, Adjustments);
+      if (ExtendedObject->getType().isDestructedType()) {
+        Scopes.push_back(GotoScope(ParentScope, 0,
+                                   diag::note_exits_temporary_dtor,
+                                   ExtendedObject->getExprLoc()));
+        origParentScope = Scopes.size()-1;
+      }
+    }
+    break;
+  }
+
+  case Stmt::CaseStmtClass:
+  case Stmt::DefaultStmtClass:
+  case Stmt::LabelStmtClass:
+    LabelAndGotoScopes[S] = ParentScope;
+    break;
+
   default:
     break;
   }
 
   for (Stmt *SubStmt : S->children()) {
-    if (SkipFirstSubStmt) {
-      SkipFirstSubStmt = false;
+    if (!SubStmt)
+        continue;
+    if (StmtsToSkip) {
+      --StmtsToSkip;
       continue;
     }
 
-    if (!SubStmt) continue;
-
     // Cases, labels, and defaults aren't "scope parents".  It's also
     // important to handle these iteratively instead of recursively in
     // order to avoid blowing out the stack.
@@ -401,117 +563,6 @@
       SubStmt = Next;
     }
 
-    // If this is a declstmt with a VLA definition, it defines a scope from here
-    // to the end of the containing context.
-    if (DeclStmt *DS = dyn_cast<DeclStmt>(SubStmt)) {
-      // The decl statement creates a scope if any of the decls in it are VLAs
-      // or have the cleanup attribute.
-      for (auto *I : DS->decls())
-        BuildScopeInformation(I, ParentScope);
-      continue;
-    }
-    // Disallow jumps into any part of an @try statement by pushing a scope and
-    // walking all sub-stmts in that scope.
-    if (ObjCAtTryStmt *AT = dyn_cast<ObjCAtTryStmt>(SubStmt)) {
-      unsigned newParentScope;
-      // Recursively walk the AST for the @try part.
-      Scopes.push_back(GotoScope(ParentScope,
-                                 diag::note_protected_by_objc_try,
-                                 diag::note_exits_objc_try,
-                                 AT->getAtTryLoc()));
-      if (Stmt *TryPart = AT->getTryBody())
-        BuildScopeInformation(TryPart, (newParentScope = Scopes.size()-1));
-
-      // Jump from the catch to the finally or try is not valid.
-      for (unsigned I = 0, N = AT->getNumCatchStmts(); I != N; ++I) {
-        ObjCAtCatchStmt *AC = AT->getCatchStmt(I);
-        Scopes.push_back(GotoScope(ParentScope,
-                                   diag::note_protected_by_objc_catch,
-                                   diag::note_exits_objc_catch,
-                                   AC->getAtCatchLoc()));
-        // @catches are nested and it isn't
-        BuildScopeInformation(AC->getCatchBody(),
-                              (newParentScope = Scopes.size()-1));
-      }
-
-      // Jump from the finally to the try or catch is not valid.
-      if (ObjCAtFinallyStmt *AF = AT->getFinallyStmt()) {
-        Scopes.push_back(GotoScope(ParentScope,
-                                   diag::note_protected_by_objc_finally,
-                                   diag::note_exits_objc_finally,
-                                   AF->getAtFinallyLoc()));
-        BuildScopeInformation(AF, (newParentScope = Scopes.size()-1));
-      }
-
-      continue;
-    }
-
-    unsigned newParentScope;
-    // Disallow jumps into the protected statement of an @synchronized, but
-    // allow jumps into the object expression it protects.
-    if (ObjCAtSynchronizedStmt *AS =
-            dyn_cast<ObjCAtSynchronizedStmt>(SubStmt)) {
-      // Recursively walk the AST for the @synchronized object expr, it is
-      // evaluated in the normal scope.
-      BuildScopeInformation(AS->getSynchExpr(), ParentScope);
-
-      // Recursively walk the AST for the @synchronized part, protected by a new
-      // scope.
-      Scopes.push_back(GotoScope(ParentScope,
-                                 diag::note_protected_by_objc_synchronized,
-                                 diag::note_exits_objc_synchronized,
-                                 AS->getAtSynchronizedLoc()));
-      BuildScopeInformation(AS->getSynchBody(),
-                            (newParentScope = Scopes.size()-1));
-      continue;
-    }
-
-    // Disallow jumps into the protected statement of an @autoreleasepool.
-    if (ObjCAutoreleasePoolStmt *AS =
-            dyn_cast<ObjCAutoreleasePoolStmt>(SubStmt)) {
-      // Recursively walk the AST for the @autoreleasepool part, protected by a
-      // new scope.
-      Scopes.push_back(GotoScope(ParentScope,
-                                 diag::note_protected_by_objc_autoreleasepool,
-                                 diag::note_exits_objc_autoreleasepool,
-                                 AS->getAtLoc()));
-      BuildScopeInformation(AS->getSubStmt(),
-                            (newParentScope = Scopes.size() - 1));
-      continue;
-    }
-
-    // Disallow jumps past full-expressions that use blocks with
-    // non-trivial cleanups of their captures.  This is theoretically
-    // implementable but a lot of work which we haven't felt up to doing.
-    if (ExprWithCleanups *EWC = dyn_cast<ExprWithCleanups>(SubStmt)) {
-      for (unsigned i = 0, e = EWC->getNumObjects(); i != e; ++i) {
-        const BlockDecl *BDecl = EWC->getObject(i);
-        for (const auto &CI : BDecl->captures()) {
-          VarDecl *variable = CI.getVariable();
-          BuildScopeInformation(variable, BDecl, ParentScope);
-        }
-      }
-    }
-
-    // Disallow jumps out of scopes containing temporaries lifetime-extended to
-    // automatic storage duration.
-    if (MaterializeTemporaryExpr *MTE =
-            dyn_cast<MaterializeTemporaryExpr>(SubStmt)) {
-      if (MTE->getStorageDuration() == SD_Automatic) {
-        SmallVector<const Expr *, 4> CommaLHS;
-        SmallVector<SubobjectAdjustment, 4> Adjustments;
-        const Expr *ExtendedObject =
-            MTE->GetTemporaryExpr()->skipRValueSubobjectAdjustments(
-                CommaLHS, Adjustments);
-        if (ExtendedObject->getType().isDestructedType()) {
-          Scopes.push_back(GotoScope(ParentScope, 0,
-                                     diag::note_exits_temporary_dtor,
-                                     ExtendedObject->getExprLoc()));
-          ParentScope = Scopes.size()-1;
-        }
-      }
-    }
-
     // Recursively walk the AST.
     BuildScopeInformation(SubStmt, ParentScope);
   }
diff --git a/lib/Sema/Makefile b/lib/Sema/Makefile
deleted file mode 100644
index 2c02739..0000000
--- a/lib/Sema/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-##===- clang/lib/Sema/Makefile -----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-#  This implements the semantic analyzer and AST builder library for the 
-#  C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangSema
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Sema/ScopeInfo.cpp b/lib/Sema/ScopeInfo.cpp
index 8c2c502..76240bc 100644
--- a/lib/Sema/ScopeInfo.cpp
+++ b/lib/Sema/ScopeInfo.cpp
@@ -28,6 +28,8 @@
   HasBranchIntoScope = false;
   HasIndirectGoto = false;
   HasDroppedStmt = false;
+  HasOMPDeclareReductionCombiner = false;
+  HasPotentialAvailabilityViolations = false;
   ObjCShouldCallSuper = false;
   ObjCIsDesignatedInit = false;
   ObjCWarnForNoDesignatedInitChain = false;
@@ -214,7 +216,7 @@
 
   // Has there been a read from the object using this Expr?
   FunctionScopeInfo::WeakUseVector::reverse_iterator ThisUse =
-    std::find(Uses->second.rbegin(), Uses->second.rend(), WeakUseTy(E, true));
+      llvm::find(llvm::reverse(Uses->second), WeakUseTy(E, true));
   if (ThisUse == Uses->second.rend())
     return;
 
diff --git a/lib/Sema/Sema.cpp b/lib/Sema/Sema.cpp
index 33e4c11..cb90791 100644
--- a/lib/Sema/Sema.cpp
+++ b/lib/Sema/Sema.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
 #include "clang/AST/DeclCXX.h"
@@ -22,7 +21,6 @@
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/StmtCXX.h"
 #include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/PartialDiagnostic.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/HeaderSearch.h"
@@ -30,14 +28,15 @@
 #include "clang/Sema/CXXFieldCollector.h"
 #include "clang/Sema/DelayedDiagnostic.h"
 #include "clang/Sema/ExternalSemaSource.h"
+#include "clang/Sema/Initialization.h"
 #include "clang/Sema/MultiplexExternalSemaSource.h"
 #include "clang/Sema/ObjCMethodList.h"
 #include "clang/Sema/PrettyDeclStackTrace.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaConsumer.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/TemplateDeduction.h"
-#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
 using namespace clang;
@@ -52,13 +51,14 @@
 PrintingPolicy Sema::getPrintingPolicy(const ASTContext &Context,
                                        const Preprocessor &PP) {
   PrintingPolicy Policy = Context.getPrintingPolicy();
+  // Our printing policy is copied over the ASTContext printing policy whenever
+  // a diagnostic is emitted, so recompute it.
   Policy.Bool = Context.getLangOpts().Bool;
   if (!Policy.Bool) {
-    if (const MacroInfo *
-          BoolMacro = PP.getMacroInfo(&Context.Idents.get("bool"))) {
+    if (const MacroInfo *BoolMacro = PP.getMacroInfo(Context.getBoolName())) {
       Policy.Bool = BoolMacro->isObjectLike() &&
-        BoolMacro->getNumTokens() == 1 &&
-        BoolMacro->getReplacementToken(0).is(tok::kw__Bool);
+                    BoolMacro->getNumTokens() == 1 &&
+                    BoolMacro->getReplacementToken(0).is(tok::kw__Bool);
     }
   }
 
@@ -80,14 +80,15 @@
     APINotes(SourceMgr, LangOpts), CollectStats(false),
     CodeCompleter(CodeCompleter),
     CurContext(nullptr), OriginalLexicalContext(nullptr),
-    PackContext(nullptr), MSStructPragmaOn(false),
+    MSStructPragmaOn(false),
     MSPointerToMemberRepresentationMethod(
         LangOpts.getMSPointerToMemberRepresentationMethod()),
-    VtorDispModeStack(1, MSVtorDispAttr::Mode(LangOpts.VtorDispMode)),
-    DataSegStack(nullptr), BSSSegStack(nullptr), ConstSegStack(nullptr),
-    CodeSegStack(nullptr), CurInitSeg(nullptr), VisContext(nullptr),
+    VtorDispStack(MSVtorDispAttr::Mode(LangOpts.VtorDispMode)),
+    PackStack(0), DataSegStack(nullptr), BSSSegStack(nullptr),
+    ConstSegStack(nullptr), CodeSegStack(nullptr), CurInitSeg(nullptr),
+    VisContext(nullptr),
     IsBuildingRecoveryCallExpr(false),
-    ExprNeedsCleanups(false), LateTemplateParser(nullptr),
+    Cleanup{}, LateTemplateParser(nullptr),
     LateTemplateParserCleanup(nullptr),
     OpaqueParser(nullptr), IdResolver(pp), StdInitializerList(nullptr),
     CXXTypeInfoDecl(nullptr), MSVCGuidDecl(nullptr),
@@ -123,7 +124,8 @@
   // Tell diagnostics how to render things from the AST library.
   Diags.SetArgToStringFn(&FormatASTNodeDiagnosticArgument, &Context);
 
-  ExprEvalContexts.emplace_back(PotentiallyEvaluated, 0, false, nullptr, false);
+  ExprEvalContexts.emplace_back(PotentiallyEvaluated, 0, CleanupInfo{}, nullptr,
+                                false);
 
   FunctionScopes.push_back(new FunctionScopeInfo(Diags));
 
@@ -207,25 +209,17 @@
     addImplicitTypedef("size_t", Context.getSizeType());
   }
 
-  // Initialize predefined OpenCL types.
+  // Initialize predefined OpenCL types and supported optional core features.
   if (getLangOpts().OpenCL) {
-    addImplicitTypedef("image1d_t", Context.OCLImage1dTy);
-    addImplicitTypedef("image1d_array_t", Context.OCLImage1dArrayTy);
-    addImplicitTypedef("image1d_buffer_t", Context.OCLImage1dBufferTy);
-    addImplicitTypedef("image2d_t", Context.OCLImage2dTy);
-    addImplicitTypedef("image2d_array_t", Context.OCLImage2dArrayTy);
-    addImplicitTypedef("image3d_t", Context.OCLImage3dTy);
+#define OPENCLEXT(Ext) \
+     if (Context.getTargetInfo().getSupportedOpenCLOpts().is_##Ext##_supported_core( \
+         getLangOpts().OpenCLVersion)) \
+       getOpenCLOptions().Ext = 1;
+#include "clang/Basic/OpenCLExtensions.def"
+
     addImplicitTypedef("sampler_t", Context.OCLSamplerTy);
     addImplicitTypedef("event_t", Context.OCLEventTy);
     if (getLangOpts().OpenCLVersion >= 200) {
-      addImplicitTypedef("image2d_depth_t", Context.OCLImage2dDepthTy);
-      addImplicitTypedef("image2d_array_depth_t",
-                         Context.OCLImage2dArrayDepthTy);
-      addImplicitTypedef("image2d_msaa_t", Context.OCLImage2dMSAATy);
-      addImplicitTypedef("image2d_array_msaa_t", Context.OCLImage2dArrayMSAATy);
-      addImplicitTypedef("image2d_msaa_depth_t", Context.OCLImage2dMSAADepthTy);
-      addImplicitTypedef("image2d_array_msaa_depth_t",
-                         Context.OCLImage2dArrayMSAADepthTy);
       addImplicitTypedef("clk_event_t", Context.OCLClkEventTy);
       addImplicitTypedef("queue_t", Context.OCLQueueTy);
       addImplicitTypedef("ndrange_t", Context.OCLNDRangeTy);
@@ -267,7 +261,6 @@
 
 Sema::~Sema() {
   llvm::DeleteContainerSeconds(LateParsedTemplateMap);
-  if (PackContext) FreePackedContext();
   if (VisContext) FreeVisContext();
   // Kill all the active scopes.
   for (unsigned I = 1, E = FunctionScopes.size(); I != E; ++I)
@@ -476,7 +469,8 @@
   return false;
 }
 
-/// Obtains a sorted list of functions that are undefined but ODR-used.
+/// Obtains a sorted list of functions and variables that are undefined but
+/// ODR-used.
 void Sema::getUndefinedButUsed(
     SmallVectorImpl<std::pair<NamedDecl *, SourceLocation> > &Undefined) {
   for (const auto &UndefinedUse : UndefinedButUsed) {
@@ -495,9 +489,10 @@
           !FD->getMostRecentDecl()->isInlined())
         continue;
     } else {
-      if (cast<VarDecl>(ND)->hasDefinition() != VarDecl::DeclarationOnly)
+      auto *VD = cast<VarDecl>(ND);
+      if (VD->hasDefinition() != VarDecl::DeclarationOnly)
         continue;
-      if (ND->isExternallyVisible())
+      if (VD->isExternallyVisible() && !VD->getMostRecentDecl()->isInline())
         continue;
     }
 
@@ -529,10 +524,16 @@
     if (!ND->isExternallyVisible()) {
       S.Diag(ND->getLocation(), diag::warn_undefined_internal)
         << isa<VarDecl>(ND) << ND;
-    } else {
-      assert(cast<FunctionDecl>(ND)->getMostRecentDecl()->isInlined() &&
+    } else if (auto *FD = dyn_cast<FunctionDecl>(ND)) {
+      (void)FD;
+      assert(FD->getMostRecentDecl()->isInlined() &&
              "used object requires definition but isn't inline or internal?");
+      // FIXME: This is ill-formed; we should reject.
       S.Diag(ND->getLocation(), diag::warn_undefined_inline) << ND;
+    } else {
+      assert(cast<VarDecl>(ND)->getMostRecentDecl()->isInline() &&
+             "used var requires definition but isn't inline or internal?");
+      S.Diag(ND->getLocation(), diag::err_undefined_inline_var) << ND;
     }
     if (I->second.isValid())
       S.Diag(I->second, diag::note_used_here);
@@ -810,6 +811,7 @@
                                    diag::err_tentative_def_incomplete_type))
       VD->setInvalidDecl();
 
+    // No initialization is performed for a tentative definition.
     CheckCompleteVariableDeclaration(VD);
 
     // Notify the consumer that we've completed a tentative definition.
@@ -1273,10 +1275,10 @@
   }
   OS << Message;
 
-  if (TheDecl && isa<NamedDecl>(TheDecl)) {
-    std::string Name = cast<NamedDecl>(TheDecl)->getNameAsString();
-    if (!Name.empty())
-      OS << " '" << Name << '\'';
+  if (auto *ND = dyn_cast_or_null<NamedDecl>(TheDecl)) {
+    OS << " '";
+    ND->getNameForDiagnostic(OS, ND->getASTContext().getPrintingPolicy(), true);
+    OS << "'";
   }
 
   OS << '\n';
@@ -1501,7 +1503,8 @@
 void Sema::PushCapturedRegionScope(Scope *S, CapturedDecl *CD, RecordDecl *RD,
                                    CapturedRegionKind K) {
   CapturingScopeInfo *CSI = new CapturedRegionScopeInfo(
-      getDiagnostics(), S, CD, RD, CD->getContextParam(), K);
+      getDiagnostics(), S, CD, RD, CD->getContextParam(), K,
+      (getLangOpts().OpenMP && K == CR_OpenMP) ? getOpenMPNestingLevel() : 0);
   CSI->ReturnType = Context.VoidTy;
   FunctionScopes.push_back(CSI);
 }
diff --git a/lib/Sema/SemaAccess.cpp b/lib/Sema/SemaAccess.cpp
index e9772bc..98a918b 100644
--- a/lib/Sema/SemaAccess.cpp
+++ b/lib/Sema/SemaAccess.cpp
@@ -291,9 +291,10 @@
   SmallVector<const CXXRecordDecl*, 8> Queue; // actually a stack
 
   while (true) {
-    if (Derived->isDependentContext() && !Derived->hasDefinition())
+    if (Derived->isDependentContext() && !Derived->hasDefinition() &&
+        !Derived->isLambda())
       return AR_dependent;
-    
+
     for (const auto &I : Derived->bases()) {
       const CXXRecordDecl *RD;
 
@@ -410,14 +411,8 @@
     return AR_accessible;
 
   if (EC.isDependent()) {
-    CanQualType FriendTy
-      = S.Context.getCanonicalType(S.Context.getTypeDeclType(Friend));
-
-    for (EffectiveContext::record_iterator
-           I = EC.Records.begin(), E = EC.Records.end(); I != E; ++I) {
-      CanQualType ContextTy
-        = S.Context.getCanonicalType(S.Context.getTypeDeclType(*I));
-      if (MightInstantiateTo(S, ContextTy, FriendTy))
+    for (const CXXRecordDecl *Context : EC.Records) {
+      if (MightInstantiateTo(Context, Friend))
         return AR_dependent;
     }
   }
@@ -1615,10 +1610,10 @@
 /// Checks access to a constructor.
 Sema::AccessResult Sema::CheckConstructorAccess(SourceLocation UseLoc,
                                                 CXXConstructorDecl *Constructor,
+                                                DeclAccessPair Found,
                                                 const InitializedEntity &Entity,
-                                                AccessSpecifier Access,
                                                 bool IsCopyBindingRefToTemp) {
-  if (!getLangOpts().AccessControl || Access == AS_public)
+  if (!getLangOpts().AccessControl || Found.getAccess() == AS_public)
     return AR_accessible;
 
   PartialDiagnostic PD(PDiag());
@@ -1652,17 +1647,17 @@
 
   }
 
-  return CheckConstructorAccess(UseLoc, Constructor, Entity, Access, PD);
+  return CheckConstructorAccess(UseLoc, Constructor, Found, Entity, PD);
 }
 
 /// Checks access to a constructor.
 Sema::AccessResult Sema::CheckConstructorAccess(SourceLocation UseLoc,
                                                 CXXConstructorDecl *Constructor,
+                                                DeclAccessPair Found,
                                                 const InitializedEntity &Entity,
-                                                AccessSpecifier Access,
                                                 const PartialDiagnostic &PD) {
   if (!getLangOpts().AccessControl ||
-      Access == AS_public)
+      Found.getAccess() == AS_public)
     return AR_accessible;
 
   CXXRecordDecl *NamingClass = Constructor->getParent();
@@ -1670,16 +1665,28 @@
   // Initializing a base sub-object is an instance method call on an
   // object of the derived class.  Otherwise, we have an instance method
   // call on an object of the constructed type.
+  //
+  // FIXME: If we have a parent, we're initializing the base class subobject
+  // in aggregate initialization. It's not clear whether the object class
+  // should be the base class or the derived class in that case.
   CXXRecordDecl *ObjectClass;
-  if (Entity.getKind() == InitializedEntity::EK_Base) {
+  if ((Entity.getKind() == InitializedEntity::EK_Base ||
+       Entity.getKind() == InitializedEntity::EK_Delegating) &&
+      !Entity.getParent()) {
     ObjectClass = cast<CXXConstructorDecl>(CurContext)->getParent();
+  } else if (auto *Shadow =
+                 dyn_cast<ConstructorUsingShadowDecl>(Found.getDecl())) {
+    // If we're using an inheriting constructor to construct an object,
+    // the object class is the derived class, not the base class.
+    ObjectClass = Shadow->getParent();
   } else {
     ObjectClass = NamingClass;
   }
 
-  AccessTarget AccessEntity(Context, AccessTarget::Member, NamingClass,
-                            DeclAccessPair::make(Constructor, Access),
-                            Context.getTypeDeclType(ObjectClass));
+  AccessTarget AccessEntity(
+      Context, AccessTarget::Member, NamingClass,
+      DeclAccessPair::make(Constructor, Found.getAccess()),
+      Context.getTypeDeclType(ObjectClass));
   AccessEntity.setDiag(PD);
 
   return CheckAccess(*this, UseLoc, AccessEntity);
@@ -1767,9 +1774,9 @@
   // while the ParsingDeclarator is active.
   EffectiveContext EC(CurContext);
   switch (CheckEffectiveAccess(*this, EC, target->getLocation(), entity)) {
-  case AR_accessible: return Sema::AR_accessible;
-  case AR_inaccessible: return Sema::AR_inaccessible;
-  case AR_dependent: return Sema::AR_dependent;
+  case ::AR_accessible: return Sema::AR_accessible;
+  case ::AR_inaccessible: return Sema::AR_inaccessible;
+  case ::AR_dependent: return Sema::AR_dependent;
   }
   llvm_unreachable("invalid access result");
 }
diff --git a/lib/Sema/SemaAttr.cpp b/lib/Sema/SemaAttr.cpp
index f314571..bad9e70 100644
--- a/lib/Sema/SemaAttr.cpp
+++ b/lib/Sema/SemaAttr.cpp
@@ -12,116 +12,50 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/Expr.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/SemaInternal.h"
 using namespace clang;
 
 //===----------------------------------------------------------------------===//
 // Pragma 'pack' and 'options align'
 //===----------------------------------------------------------------------===//
 
-namespace {
-  struct PackStackEntry {
-    // We just use a sentinel to represent when the stack is set to mac68k
-    // alignment.
-    static const unsigned kMac68kAlignmentSentinel = ~0U;
-
-    unsigned Alignment;
-    IdentifierInfo *Name;
-  };
-
-  /// PragmaPackStack - Simple class to wrap the stack used by #pragma
-  /// pack.
-  class PragmaPackStack {
-    typedef std::vector<PackStackEntry> stack_ty;
-
-    /// Alignment - The current user specified alignment.
-    unsigned Alignment;
-
-    /// Stack - Entries in the #pragma pack stack, consisting of saved
-    /// alignments and optional names.
-    stack_ty Stack;
-
-  public:
-    PragmaPackStack() : Alignment(0) {}
-
-    void setAlignment(unsigned A) { Alignment = A; }
-    unsigned getAlignment() { return Alignment; }
-
-    /// push - Push the current alignment onto the stack, optionally
-    /// using the given \arg Name for the record, if non-zero.
-    void push(IdentifierInfo *Name) {
-      PackStackEntry PSE = { Alignment, Name };
-      Stack.push_back(PSE);
-    }
-
-    /// pop - Pop a record from the stack and restore the current
-    /// alignment to the previous value. If \arg Name is non-zero then
-    /// the first such named record is popped, otherwise the top record
-    /// is popped. Returns true if the pop succeeded.
-    bool pop(IdentifierInfo *Name, bool IsReset);
-  };
-}  // end anonymous namespace.
-
-bool PragmaPackStack::pop(IdentifierInfo *Name, bool IsReset) {
-  // If name is empty just pop top.
-  if (!Name) {
-    // An empty stack is a special case...
-    if (Stack.empty()) {
-      // If this isn't a reset, it is always an error.
-      if (!IsReset)
-        return false;
-
-      // Otherwise, it is an error only if some alignment has been set.
-      if (!Alignment)
-        return false;
-
-      // Otherwise, reset to the default alignment.
-      Alignment = 0;
-    } else {
-      Alignment = Stack.back().Alignment;
-      Stack.pop_back();
-    }
-
-    return true;
+Sema::PragmaStackSentinelRAII::PragmaStackSentinelRAII(Sema &S,
+                                                       StringRef SlotLabel,
+                                                       bool ShouldAct)
+    : S(S), SlotLabel(SlotLabel), ShouldAct(ShouldAct) {
+  if (ShouldAct) {
+    S.VtorDispStack.SentinelAction(PSK_Push, SlotLabel);
+    S.DataSegStack.SentinelAction(PSK_Push, SlotLabel);
+    S.BSSSegStack.SentinelAction(PSK_Push, SlotLabel);
+    S.ConstSegStack.SentinelAction(PSK_Push, SlotLabel);
+    S.CodeSegStack.SentinelAction(PSK_Push, SlotLabel);
   }
-
-  // Otherwise, find the named record.
-  for (unsigned i = Stack.size(); i != 0; ) {
-    --i;
-    if (Stack[i].Name == Name) {
-      // Found it, pop up to and including this record.
-      Alignment = Stack[i].Alignment;
-      Stack.erase(Stack.begin() + i, Stack.end());
-      return true;
-    }
-  }
-
-  return false;
 }
 
-
-/// FreePackedContext - Deallocate and null out PackContext.
-void Sema::FreePackedContext() {
-  delete static_cast<PragmaPackStack*>(PackContext);
-  PackContext = nullptr;
+Sema::PragmaStackSentinelRAII::~PragmaStackSentinelRAII() {
+  if (ShouldAct) {
+    S.VtorDispStack.SentinelAction(PSK_Pop, SlotLabel);
+    S.DataSegStack.SentinelAction(PSK_Pop, SlotLabel);
+    S.BSSSegStack.SentinelAction(PSK_Pop, SlotLabel);
+    S.ConstSegStack.SentinelAction(PSK_Pop, SlotLabel);
+    S.CodeSegStack.SentinelAction(PSK_Pop, SlotLabel);
+  }
 }
 
 void Sema::AddAlignmentAttributesForRecord(RecordDecl *RD) {
-  // If there is no pack context, we don't need any attributes.
-  if (!PackContext)
+  // If there is no pack value, we don't need any attributes.
+  if (!PackStack.CurrentValue)
     return;
 
-  PragmaPackStack *Stack = static_cast<PragmaPackStack*>(PackContext);
-
   // Otherwise, check to see if we need a max field alignment attribute.
-  if (unsigned Alignment = Stack->getAlignment()) {
-    if (Alignment == PackStackEntry::kMac68kAlignmentSentinel)
+  if (unsigned Alignment = PackStack.CurrentValue) {
+    if (Alignment == Sema::kMac68kAlignmentSentinel)
       RD->addAttr(AlignMac68kAttr::CreateImplicit(Context));
     else
       RD->addAttr(MaxFieldAlignmentAttr::CreateImplicit(Context,
@@ -136,18 +70,15 @@
   // FIXME: We should merge AddAlignmentAttributesForRecord with
   // AddMsStructLayoutForRecord into AddPragmaAttributesForRecord, which takes
   // all active pragmas and applies them as attributes to class definitions.
-  if (VtorDispModeStack.back() != getLangOpts().VtorDispMode)
+  if (VtorDispStack.CurrentValue != getLangOpts().VtorDispMode)
     RD->addAttr(
-        MSVtorDispAttr::CreateImplicit(Context, VtorDispModeStack.back()));
+        MSVtorDispAttr::CreateImplicit(Context, VtorDispStack.CurrentValue));
 }
 
 void Sema::ActOnPragmaOptionsAlign(PragmaOptionsAlignKind Kind,
                                    SourceLocation PragmaLoc) {
-  if (!PackContext)
-    PackContext = new PragmaPackStack();
-
-  PragmaPackStack *Context = static_cast<PragmaPackStack*>(PackContext);
-
+  PragmaMsStackAction Action = Sema::PSK_Reset;
+  unsigned Alignment = 0;
   switch (Kind) {
     // For all targets we support native and natural are the same.
     //
@@ -155,15 +86,15 @@
   case POAK_Native:
   case POAK_Power:
   case POAK_Natural:
-    Context->push(nullptr);
-    Context->setAlignment(0);
+    Action = Sema::PSK_Push_Set;
+    Alignment = 0;
     break;
 
     // Note that '#pragma options align=packed' is not equivalent to attribute
     // packed, it has a different precedence relative to attribute aligned.
   case POAK_Packed:
-    Context->push(nullptr);
-    Context->setAlignment(1);
+    Action = Sema::PSK_Push_Set;
+    Alignment = 1;
     break;
 
   case POAK_Mac68k:
@@ -172,24 +103,31 @@
       Diag(PragmaLoc, diag::err_pragma_options_align_mac68k_target_unsupported);
       return;
     }
-    Context->push(nullptr);
-    Context->setAlignment(PackStackEntry::kMac68kAlignmentSentinel);
+    Action = Sema::PSK_Push_Set;
+    Alignment = Sema::kMac68kAlignmentSentinel;
     break;
 
   case POAK_Reset:
     // Reset just pops the top of the stack, or resets the current alignment to
     // default.
-    if (!Context->pop(nullptr, /*IsReset=*/true)) {
-      Diag(PragmaLoc, diag::warn_pragma_options_align_reset_failed)
-        << "stack empty";
+    Action = Sema::PSK_Pop;
+    if (PackStack.Stack.empty()) {
+      if (PackStack.CurrentValue) {
+        Action = Sema::PSK_Reset;
+      } else {
+        Diag(PragmaLoc, diag::warn_pragma_options_align_reset_failed)
+            << "stack empty";
+        return;
+      }
     }
     break;
   }
+
+  PackStack.Act(PragmaLoc, Action, StringRef(), Alignment);
 }
 
-void Sema::ActOnPragmaPack(PragmaPackKind Kind, IdentifierInfo *Name,
-                           Expr *alignment, SourceLocation PragmaLoc,
-                           SourceLocation LParenLoc, SourceLocation RParenLoc) {
+void Sema::ActOnPragmaPack(SourceLocation PragmaLoc, PragmaMsStackAction Action,
+                           StringRef SlotLabel, Expr *alignment) {
   Expr *Alignment = static_cast<Expr *>(alignment);
 
   // If specified then alignment must be a "small" power of two.
@@ -210,87 +148,48 @@
 
     AlignmentVal = (unsigned) Val.getZExtValue();
   }
-
-  if (!PackContext)
-    PackContext = new PragmaPackStack();
-
-  PragmaPackStack *Context = static_cast<PragmaPackStack*>(PackContext);
-
-  switch (Kind) {
-  case Sema::PPK_Default: // pack([n])
-    Context->setAlignment(AlignmentVal);
-    break;
-
-  case Sema::PPK_Show: // pack(show)
+  if (Action == Sema::PSK_Show) {
     // Show the current alignment, making sure to show the right value
     // for the default.
-    AlignmentVal = Context->getAlignment();
     // FIXME: This should come from the target.
+    AlignmentVal = PackStack.CurrentValue;
     if (AlignmentVal == 0)
       AlignmentVal = 8;
-    if (AlignmentVal == PackStackEntry::kMac68kAlignmentSentinel)
+    if (AlignmentVal == Sema::kMac68kAlignmentSentinel)
       Diag(PragmaLoc, diag::warn_pragma_pack_show) << "mac68k";
     else
       Diag(PragmaLoc, diag::warn_pragma_pack_show) << AlignmentVal;
-    break;
-
-  case Sema::PPK_Push: // pack(push [, id] [, [n])
-    Context->push(Name);
-    // Set the new alignment if specified.
-    if (Alignment)
-      Context->setAlignment(AlignmentVal);
-    break;
-
-  case Sema::PPK_Pop: // pack(pop [, id] [,  n])
-    // MSDN, C/C++ Preprocessor Reference > Pragma Directives > pack:
-    // "#pragma pack(pop, identifier, n) is undefined"
-    if (Alignment && Name)
-      Diag(PragmaLoc, diag::warn_pragma_pack_pop_identifer_and_alignment);
-
-    // Do the pop.
-    if (!Context->pop(Name, /*IsReset=*/false)) {
-      // If a name was specified then failure indicates the name
-      // wasn't found. Otherwise failure indicates the stack was
-      // empty.
-      Diag(PragmaLoc, diag::warn_pragma_pop_failed)
-          << "pack" << (Name ? "no record matching name" : "stack empty");
-
-      // FIXME: Warn about popping named records as MSVC does.
-    } else {
-      // Pop succeeded, set the new alignment if specified.
-      if (Alignment)
-        Context->setAlignment(AlignmentVal);
-    }
-    break;
   }
+  // MSDN, C/C++ Preprocessor Reference > Pragma Directives > pack:
+  // "#pragma pack(pop, identifier, n) is undefined"
+  if (Action & Sema::PSK_Pop) {
+    if (Alignment && !SlotLabel.empty())
+      Diag(PragmaLoc, diag::warn_pragma_pack_pop_identifer_and_alignment);
+    if (PackStack.Stack.empty())
+      Diag(PragmaLoc, diag::warn_pragma_pop_failed) << "pack" << "stack empty";
+  }
+
+  PackStack.Act(PragmaLoc, Action, SlotLabel, AlignmentVal);
 }
 
 void Sema::ActOnPragmaMSStruct(PragmaMSStructKind Kind) { 
   MSStructPragmaOn = (Kind == PMSST_ON);
 }
 
-void Sema::ActOnPragmaMSComment(PragmaMSCommentKind Kind, StringRef Arg) {
-  // FIXME: Serialize this.
-  switch (Kind) {
-  case PCK_Unknown:
-    llvm_unreachable("unexpected pragma comment kind");
-  case PCK_Linker:
-    Consumer.HandleLinkerOption(Arg);
-    return;
-  case PCK_Lib:
-    Consumer.HandleDependentLibrary(Arg);
-    return;
-  case PCK_Compiler:
-  case PCK_ExeStr:
-  case PCK_User:
-    return;  // We ignore all of these.
-  }
-  llvm_unreachable("invalid pragma comment kind");
+void Sema::ActOnPragmaMSComment(SourceLocation CommentLoc,
+                                PragmaMSCommentKind Kind, StringRef Arg) {
+  auto *PCD = PragmaCommentDecl::Create(
+      Context, Context.getTranslationUnitDecl(), CommentLoc, Kind, Arg);
+  Context.getTranslationUnitDecl()->addDecl(PCD);
+  Consumer.HandleTopLevelDecl(DeclGroupRef(PCD));
 }
 
-void Sema::ActOnPragmaDetectMismatch(StringRef Name, StringRef Value) {
-  // FIXME: Serialize this.
-  Consumer.HandleDetectMismatch(Name, Value);
+void Sema::ActOnPragmaDetectMismatch(SourceLocation Loc, StringRef Name,
+                                     StringRef Value) {
+  auto *PDMD = PragmaDetectMismatchDecl::Create(
+      Context, Context.getTranslationUnitDecl(), Loc, Name, Value);
+  Context.getTranslationUnitDecl()->addDecl(PDMD);
+  Consumer.HandleTopLevelDecl(DeclGroupRef(PDMD));
 }
 
 void Sema::ActOnPragmaMSPointersToMembers(
@@ -300,29 +199,13 @@
   ImplicitMSInheritanceAttrLoc = PragmaLoc;
 }
 
-void Sema::ActOnPragmaMSVtorDisp(PragmaVtorDispKind Kind,
+void Sema::ActOnPragmaMSVtorDisp(PragmaMsStackAction Action,
                                  SourceLocation PragmaLoc,
                                  MSVtorDispAttr::Mode Mode) {
-  switch (Kind) {
-  case PVDK_Set:
-    VtorDispModeStack.back() = Mode;
-    break;
-  case PVDK_Push:
-    VtorDispModeStack.push_back(Mode);
-    break;
-  case PVDK_Reset:
-    VtorDispModeStack.clear();
-    VtorDispModeStack.push_back(MSVtorDispAttr::Mode(LangOpts.VtorDispMode));
-    break;
-  case PVDK_Pop:
-    VtorDispModeStack.pop_back();
-    if (VtorDispModeStack.empty()) {
-      Diag(PragmaLoc, diag::warn_pragma_pop_failed) << "vtordisp"
-                                                    << "stack empty";
-      VtorDispModeStack.push_back(MSVtorDispAttr::Mode(LangOpts.VtorDispMode));
-    }
-    break;
-  }
+  if (Action & PSK_Pop && VtorDispStack.Stack.empty())
+    Diag(PragmaLoc, diag::warn_pragma_pop_failed) << "vtordisp"
+                                                  << "stack empty";
+  VtorDispStack.Act(PragmaLoc, Action, StringRef(), Mode);
 }
 
 template<typename ValueType>
@@ -331,7 +214,7 @@
                                        llvm::StringRef StackSlotLabel,
                                        ValueType Value) {
   if (Action == PSK_Reset) {
-    CurrentValue = nullptr;
+    CurrentValue = DefaultValue;
     return;
   }
   if (Action & PSK_Push)
@@ -339,8 +222,9 @@
   else if (Action & PSK_Pop) {
     if (!StackSlotLabel.empty()) {
       // If we've got a label, try to find it and jump there.
-      auto I = std::find_if(Stack.rbegin(), Stack.rend(),
-        [&](const Slot &x) { return x.StackSlotLabel == StackSlotLabel; });
+      auto I = llvm::find_if(llvm::reverse(Stack), [&](const Slot &x) {
+        return x.StackSlotLabel == StackSlotLabel;
+      });
       // If we found the label so pop from there.
       if (I != Stack.rend()) {
         CurrentValue = I->Value;
@@ -467,7 +351,8 @@
   if (VD->isUsed())
     Diag(PragmaLoc, diag::warn_used_but_marked_unused) << Name;
 
-  VD->addAttr(UnusedAttr::CreateImplicit(Context, IdTok.getLocation()));
+  VD->addAttr(UnusedAttr::CreateImplicit(Context, UnusedAttr::GNU_unused,
+                                         IdTok.getLocation()));
 }
 
 void Sema::AddCFAuditedAttribute(Decl *D) {
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index 61dfdd3..6f94e54 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -11,11 +11,14 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/Sema.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
+#include "clang/AST/ExprCXX.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Sema/Lookup.h"
+#include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaDiagnostic.h"
+#include "clang/Sema/Template.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 using namespace clang;
@@ -67,33 +70,30 @@
 // Ph - preference in host mode
 // Pd - preference in device mode
 // H  - handled in (x)
-// Preferences: b-best, f-fallback, l-last resort, n-never.
+// Preferences: N:native, SS:same side, HD:host-device, WS:wrong side, --:never.
 //
-// | F  | T  | Ph | Pd |  H  |
-// |----+----+----+----+-----+
-// | d  | d  | b  | b  | (b) |
-// | d  | g  | n  | n  | (a) |
-// | d  | h  | l  | l  | (e) |
-// | d  | hd | f  | f  | (c) |
-// | g  | d  | b  | b  | (b) |
-// | g  | g  | n  | n  | (a) |
-// | g  | h  | l  | l  | (e) |
-// | g  | hd | f  | f  | (c) |
-// | h  | d  | l  | l  | (e) |
-// | h  | g  | b  | b  | (b) |
-// | h  | h  | b  | b  | (b) |
-// | h  | hd | f  | f  | (c) |
-// | hd | d  | l  | f  | (d) |
-// | hd | g  | f  | n  |(d/a)|
-// | hd | h  | f  | l  | (d) |
-// | hd | hd | b  | b  | (b) |
+// | F  | T  | Ph  | Pd  |  H  |
+// |----+----+-----+-----+-----+
+// | d  | d  | N   | N   | (c) |
+// | d  | g  | --  | --  | (a) |
+// | d  | h  | --  | --  | (e) |
+// | d  | hd | HD  | HD  | (b) |
+// | g  | d  | N   | N   | (c) |
+// | g  | g  | --  | --  | (a) |
+// | g  | h  | --  | --  | (e) |
+// | g  | hd | HD  | HD  | (b) |
+// | h  | d  | --  | --  | (e) |
+// | h  | g  | N   | N   | (c) |
+// | h  | h  | N   | N   | (c) |
+// | h  | hd | HD  | HD  | (b) |
+// | hd | d  | WS  | SS  | (d) |
+// | hd | g  | SS  | --  |(d/a)|
+// | hd | h  | SS  | WS  | (d) |
+// | hd | hd | HD  | HD  | (b) |
 
 Sema::CUDAFunctionPreference
 Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
                              const FunctionDecl *Callee) {
-  assert(getLangOpts().CUDATargetOverloads &&
-         "Should not be called w/o enabled target overloads.");
-
   assert(Callee && "Callee must be valid.");
   CUDAFunctionTarget CalleeTarget = IdentifyCUDATarget(Callee);
   CUDAFunctionTarget CallerTarget =
@@ -111,130 +111,62 @@
        (CallerTarget == CFT_HostDevice && getLangOpts().CUDAIsDevice)))
     return CFP_Never;
 
-  // (b) Best case scenarios
+  // (b) Calling HostDevice is OK for everyone.
+  if (CalleeTarget == CFT_HostDevice)
+    return CFP_HostDevice;
+
+  // (c) Best case scenarios
   if (CalleeTarget == CallerTarget ||
       (CallerTarget == CFT_Host && CalleeTarget == CFT_Global) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
-    return CFP_Best;
-
-  // (c) Calling HostDevice is OK as a fallback that works for everyone.
-  if (CalleeTarget == CFT_HostDevice)
-    return CFP_Fallback;
-
-  // Figure out what should be returned 'last resort' cases. Normally
-  // those would not be allowed, but we'll consider them if
-  // CUDADisableTargetCallChecks is true.
-  CUDAFunctionPreference QuestionableResult =
-      getLangOpts().CUDADisableTargetCallChecks ? CFP_LastResort : CFP_Never;
+    return CFP_Native;
 
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
-    // Calling a function that matches compilation mode is OK.
-    // Calling a function from the other side is frowned upon.
-    if (getLangOpts().CUDAIsDevice)
-      return CalleeTarget == CFT_Device ? CFP_Fallback : QuestionableResult;
-    else
-      return (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)
-                 ? CFP_Fallback
-                 : QuestionableResult;
+    // It's OK to call a compilation-mode matching function from an HD one.
+    if ((getLangOpts().CUDAIsDevice && CalleeTarget == CFT_Device) ||
+        (!getLangOpts().CUDAIsDevice &&
+         (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)))
+      return CFP_SameSide;
+
+    // Calls from HD to non-mode-matching functions (i.e., to host functions
+    // when compiling in device mode or to device functions when compiling in
+    // host mode) are allowed at the sema level, but eventually rejected if
+    // they're ever codegened.  TODO: Reject said calls earlier.
+    return CFP_WrongSide;
   }
 
   // (e) Calling across device/host boundary is not something you should do.
   if ((CallerTarget == CFT_Host && CalleeTarget == CFT_Device) ||
       (CallerTarget == CFT_Device && CalleeTarget == CFT_Host) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Host))
-    return QuestionableResult;
+    return CFP_Never;
 
   llvm_unreachable("All cases should've been handled by now.");
 }
 
-bool Sema::CheckCUDATarget(const FunctionDecl *Caller,
-                           const FunctionDecl *Callee) {
-  // With target overloads enabled, we only disallow calling
-  // combinations with CFP_Never.
-  if (getLangOpts().CUDATargetOverloads)
-    return IdentifyCUDAPreference(Caller,Callee) == CFP_Never;
-
-  // The CUDADisableTargetCallChecks short-circuits this check: we assume all
-  // cross-target calls are valid.
-  if (getLangOpts().CUDADisableTargetCallChecks)
-    return false;
-
-  CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller),
-                     CalleeTarget = IdentifyCUDATarget(Callee);
-
-  // If one of the targets is invalid, the check always fails, no matter what
-  // the other target is.
-  if (CallerTarget == CFT_InvalidTarget || CalleeTarget == CFT_InvalidTarget)
-    return true;
-
-  // CUDA B.1.1 "The __device__ qualifier declares a function that is [...]
-  // Callable from the device only."
-  if (CallerTarget == CFT_Host && CalleeTarget == CFT_Device)
-    return true;
-
-  // CUDA B.1.2 "The __global__ qualifier declares a function that is [...]
-  // Callable from the host only."
-  // CUDA B.1.3 "The __host__ qualifier declares a function that is [...]
-  // Callable from the host only."
-  if ((CallerTarget == CFT_Device || CallerTarget == CFT_Global) &&
-      (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global))
-    return true;
-
-  // CUDA B.1.3 "The __device__ and __host__ qualifiers can be used together
-  // however, in which case the function is compiled for both the host and the
-  // device. The __CUDA_ARCH__ macro [...] can be used to differentiate code
-  // paths between host and device."
-  if (CallerTarget == CFT_HostDevice && CalleeTarget != CFT_HostDevice) {
-    // If the caller is implicit then the check always passes.
-    if (Caller->isImplicit()) return false;
-
-    bool InDeviceMode = getLangOpts().CUDAIsDevice;
-    if (!InDeviceMode && CalleeTarget != CFT_Host)
-        return true;
-    if (InDeviceMode && CalleeTarget != CFT_Device) {
-      // Allow host device functions to call host functions if explicitly
-      // requested.
-      if (CalleeTarget == CFT_Host &&
-          getLangOpts().CUDAAllowHostCallsFromHostDevice) {
-        Diag(Caller->getLocation(),
-             diag::warn_host_calls_from_host_device)
-            << Callee->getNameAsString() << Caller->getNameAsString();
-        return false;
-      }
-
-      return true;
-    }
-  }
-
-  return false;
-}
-
-template <typename T, typename FetchDeclFn>
-static void EraseUnwantedCUDAMatchesImpl(Sema &S, const FunctionDecl *Caller,
-                                         llvm::SmallVectorImpl<T> &Matches,
-                                         FetchDeclFn FetchDecl) {
-  assert(S.getLangOpts().CUDATargetOverloads &&
-         "Should not be called w/o enabled target overloads.");
+template <typename T>
+static void EraseUnwantedCUDAMatchesImpl(
+    Sema &S, const FunctionDecl *Caller, llvm::SmallVectorImpl<T> &Matches,
+    std::function<const FunctionDecl *(const T &)> FetchDecl) {
   if (Matches.size() <= 1)
     return;
 
+  // Gets the CUDA function preference for a call from Caller to Match.
+  auto GetCFP = [&](const T &Match) {
+    return S.IdentifyCUDAPreference(Caller, FetchDecl(Match));
+  };
+
   // Find the best call preference among the functions in Matches.
-  Sema::CUDAFunctionPreference P, BestCFP = Sema::CFP_Never;
-  for (auto const &Match : Matches) {
-    P = S.IdentifyCUDAPreference(Caller, FetchDecl(Match));
-    if (P > BestCFP)
-      BestCFP = P;
-  }
+  Sema::CUDAFunctionPreference BestCFP = GetCFP(*std::max_element(
+      Matches.begin(), Matches.end(),
+      [&](const T &M1, const T &M2) { return GetCFP(M1) < GetCFP(M2); }));
 
   // Erase all functions with lower priority.
-  for (unsigned I = 0, N = Matches.size(); I != N;)
-    if (S.IdentifyCUDAPreference(Caller, FetchDecl(Matches[I])) < BestCFP) {
-      Matches[I] = Matches[--N];
-      Matches.resize(N);
-    } else {
-      ++I;
-    }
+  Matches.erase(
+      llvm::remove_if(Matches,
+                      [&](const T &Match) { return GetCFP(Match) < BestCFP; }),
+      Matches.end());
 }
 
 void Sema::EraseUnwantedCUDAMatches(const FunctionDecl *Caller,
@@ -273,12 +205,9 @@
 resolveCalleeCUDATargetConflict(Sema::CUDAFunctionTarget Target1,
                                 Sema::CUDAFunctionTarget Target2,
                                 Sema::CUDAFunctionTarget *ResolvedTarget) {
-  if (Target1 == Sema::CFT_Global && Target2 == Sema::CFT_Global) {
-    // TODO: this shouldn't happen, really. Methods cannot be marked __global__.
-    // Clang should detect this earlier and produce an error. Then this
-    // condition can be changed to an assertion.
-    return true;
-  }
+  // Only free functions and static member functions may be global.
+  assert(Target1 != Sema::CFT_Global);
+  assert(Target2 != Sema::CFT_Global);
 
   if (Target1 == Sema::CFT_HostDevice) {
     *ResolvedTarget = Target2;
@@ -422,3 +351,167 @@
 
   return false;
 }
+
+bool Sema::isEmptyCudaConstructor(SourceLocation Loc, CXXConstructorDecl *CD) {
+  if (!CD->isDefined() && CD->isTemplateInstantiation())
+    InstantiateFunctionDefinition(Loc, CD->getFirstDecl());
+
+  // (E.2.3.1, CUDA 7.5) A constructor for a class type is considered
+  // empty at a point in the translation unit, if it is either a
+  // trivial constructor
+  if (CD->isTrivial())
+    return true;
+
+  // ... or it satisfies all of the following conditions:
+  // The constructor function has been defined.
+  // The constructor function has no parameters,
+  // and the function body is an empty compound statement.
+  if (!(CD->hasTrivialBody() && CD->getNumParams() == 0))
+    return false;
+
+  // Its class has no virtual functions and no virtual base classes.
+  if (CD->getParent()->isDynamicClass())
+    return false;
+
+  // The only form of initializer allowed is an empty constructor.
+  // This will recursively check all base classes and member initializers.
+  if (!llvm::all_of(CD->inits(), [&](const CXXCtorInitializer *CI) {
+        if (const CXXConstructExpr *CE =
+                dyn_cast<CXXConstructExpr>(CI->getInit()))
+          return isEmptyCudaConstructor(Loc, CE->getConstructor());
+        return false;
+      }))
+    return false;
+
+  return true;
+}
+
+bool Sema::isEmptyCudaDestructor(SourceLocation Loc, CXXDestructorDecl *DD) {
+  // No destructor -> no problem.
+  if (!DD)
+    return true;
+
+  if (!DD->isDefined() && DD->isTemplateInstantiation())
+    InstantiateFunctionDefinition(Loc, DD->getFirstDecl());
+
+  // (E.2.3.1, CUDA 7.5) A destructor for a class type is considered
+  // empty at a point in the translation unit, if it is either a
+  // trivial destructor
+  if (DD->isTrivial())
+    return true;
+
+  // ... or it satisfies all of the following conditions:
+  // The destructor function has been defined,
+  // and the function body is an empty compound statement.
+  if (!DD->hasTrivialBody())
+    return false;
+
+  const CXXRecordDecl *ClassDecl = DD->getParent();
+
+  // Its class has no virtual functions and no virtual base classes.
+  if (ClassDecl->isDynamicClass())
+    return false;
+
+  // Only empty destructors are allowed. This will recursively check
+  // destructors for all base classes...
+  if (!llvm::all_of(ClassDecl->bases(), [&](const CXXBaseSpecifier &BS) {
+        if (CXXRecordDecl *RD = BS.getType()->getAsCXXRecordDecl())
+          return isEmptyCudaDestructor(Loc, RD->getDestructor());
+        return true;
+      }))
+    return false;
+
+  // ... and member fields.
+  if (!llvm::all_of(ClassDecl->fields(), [&](const FieldDecl *Field) {
+        if (CXXRecordDecl *RD = Field->getType()
+                                    ->getBaseElementTypeUnsafe()
+                                    ->getAsCXXRecordDecl())
+          return isEmptyCudaDestructor(Loc, RD->getDestructor());
+        return true;
+      }))
+    return false;
+
+  return true;
+}
+
+// With -fcuda-host-device-constexpr, an unattributed constexpr function is
+// treated as implicitly __host__ __device__, unless:
+//  * it is a variadic function (device-side variadic functions are not
+//    allowed), or
+//  * a __device__ function with this signature was already declared, in which
+//    case we output an error, unless the __device__ decl is in a system
+//    header, in which case we leave the constexpr function unattributed.
+void Sema::maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *NewD,
+                                       const LookupResult &Previous) {
+  assert(getLangOpts().CUDA && "May be called only for CUDA compilations.");
+  if (!getLangOpts().CUDAHostDeviceConstexpr || !NewD->isConstexpr() ||
+      NewD->isVariadic() || NewD->hasAttr<CUDAHostAttr>() ||
+      NewD->hasAttr<CUDADeviceAttr>() || NewD->hasAttr<CUDAGlobalAttr>())
+    return;
+
+  // Is D a __device__ function with the same signature as NewD, ignoring CUDA
+  // attributes?
+  auto IsMatchingDeviceFn = [&](NamedDecl *D) {
+    if (UsingShadowDecl *Using = dyn_cast<UsingShadowDecl>(D))
+      D = Using->getTargetDecl();
+    FunctionDecl *OldD = D->getAsFunction();
+    return OldD && OldD->hasAttr<CUDADeviceAttr>() &&
+           !OldD->hasAttr<CUDAHostAttr>() &&
+           !IsOverload(NewD, OldD, /* UseMemberUsingDeclRules = */ false,
+                       /* ConsiderCudaAttrs = */ false);
+  };
+  auto It = llvm::find_if(Previous, IsMatchingDeviceFn);
+  if (It != Previous.end()) {
+    // We found a __device__ function with the same name and signature as NewD
+    // (ignoring CUDA attrs).  This is an error unless that function is defined
+    // in a system header, in which case we simply return without making NewD
+    // host+device.
+    NamedDecl *Match = *It;
+    if (!getSourceManager().isInSystemHeader(Match->getLocation())) {
+      Diag(NewD->getLocation(),
+           diag::err_cuda_unattributed_constexpr_cannot_overload_device)
+          << NewD->getName();
+      Diag(Match->getLocation(),
+           diag::note_cuda_conflicting_device_function_declared_here);
+    }
+    return;
+  }
+
+  NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
+  NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+}
+
+bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
+  assert(getLangOpts().CUDA &&
+         "Should only be called during CUDA compilation.");
+  assert(Callee && "Callee may not be null.");
+  FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
+  if (!Caller)
+    return true;
+
+  Sema::CUDAFunctionPreference Pref = IdentifyCUDAPreference(Caller, Callee);
+  if (Pref == Sema::CFP_Never) {
+    Diag(Loc, diag::err_ref_bad_target) << IdentifyCUDATarget(Callee) << Callee
+                                        << IdentifyCUDATarget(Caller);
+    Diag(Callee->getLocation(), diag::note_previous_decl) << Callee;
+    return false;
+  }
+  if (Pref == Sema::CFP_WrongSide) {
+    // We have to do this odd dance to create our PartialDiagnostic because we
+    // want its storage to be allocated with operator new, not in an arena.
+    PartialDiagnostic ErrPD{PartialDiagnostic::NullDiagnostic()};
+    ErrPD.Reset(diag::err_ref_bad_target);
+    ErrPD << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
+    Caller->addDeferredDiag({Loc, std::move(ErrPD)});
+
+    PartialDiagnostic NotePD{PartialDiagnostic::NullDiagnostic()};
+    NotePD.Reset(diag::note_previous_decl);
+    NotePD << Callee;
+    Caller->addDeferredDiag({Callee->getLocation(), std::move(NotePD)});
+
+    // This is not immediately an error, so return true.  The deferred errors
+    // will be emitted if and when Caller is codegen'ed.
+    return true;
+  }
+  return true;
+}
diff --git a/lib/Sema/SemaCXXScopeSpec.cpp b/lib/Sema/SemaCXXScopeSpec.cpp
index f7aace6..637a631 100644
--- a/lib/Sema/SemaCXXScopeSpec.cpp
+++ b/lib/Sema/SemaCXXScopeSpec.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "TypeLocBuilder.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclTemplate.h"
@@ -20,9 +19,9 @@
 #include "clang/Basic/PartialDiagnostic.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
 /// \brief Find the current instantiation that associated with the given type.
@@ -117,8 +116,18 @@
           // specializations, we're entering into the definition of that
           // class template partial specialization.
           if (ClassTemplatePartialSpecializationDecl *PartialSpec
-                = ClassTemplate->findPartialSpecialization(ContextType))
+                = ClassTemplate->findPartialSpecialization(ContextType)) {
+            // A declaration of the partial specialization must be visible.
+            // We can always recover here, because this only happens when we're
+            // entering the context, and that can't happen in a SFINAE context.
+            assert(!isSFINAEContext() &&
+                   "partial specialization scope specifier in SFINAE context?");
+            if (!hasVisibleDeclaration(PartialSpec))
+              diagnoseMissingImport(SS.getLastQualifierNameLoc(), PartialSpec,
+                                    MissingImportKind::PartialSpecialization,
+                                    /*Recover*/true);
             return PartialSpec;
+          }
         }
       } else if (const RecordType *RecordT = NNSType->getAs<RecordType>()) {
         // The nested name specifier refers to a member of a class template.
@@ -195,6 +204,8 @@
   TagDecl *tag = dyn_cast<TagDecl>(DC);
 
   // If this is a dependent type, then we consider it complete.
+  // FIXME: This is wrong; we should require a (visible) definition to
+  // exist in this case too.
   if (!tag || tag->isDependentContext())
     return false;
 
@@ -218,10 +229,23 @@
   // Fixed enum types are complete, but they aren't valid as scopes
   // until we see a definition, so awkwardly pull out this special
   // case.
-  // FIXME: The definition might not be visible; complain if it is not.
   const EnumType *enumType = dyn_cast_or_null<EnumType>(tagType);
-  if (!enumType || enumType->getDecl()->isCompleteDefinition())
+  if (!enumType)
     return false;
+  if (enumType->getDecl()->isCompleteDefinition()) {
+    // If we know about the definition but it is not visible, complain.
+    NamedDecl *SuggestedDef = nullptr;
+    if (!hasVisibleDefinition(enumType->getDecl(), &SuggestedDef,
+                              /*OnlyNeedComplete*/false)) {
+      // If the user is going to see an error here, recover by making the
+      // definition visible.
+      bool TreatAsComplete = !isSFINAEContext();
+      diagnoseMissingImport(loc, SuggestedDef, MissingImportKind::Definition,
+                            /*Recover*/TreatAsComplete);
+      return !TreatAsComplete;
+    }
+    return false;
+  }
 
   // Try to instantiate the definition, if this is a specialization of an
   // enumeration temploid.
@@ -356,12 +380,11 @@
 }
 
 bool Sema::isNonTypeNestedNameSpecifier(Scope *S, CXXScopeSpec &SS,
-                                        SourceLocation IdLoc,
-                                        IdentifierInfo &II,
-                                        ParsedType ObjectTypePtr) {
-  QualType ObjectType = GetTypeFromParser(ObjectTypePtr);
-  LookupResult Found(*this, &II, IdLoc, LookupNestedNameSpecifierName);
-  
+                                        NestedNameSpecInfo &IdInfo) {
+  QualType ObjectType = GetTypeFromParser(IdInfo.ObjectType);
+  LookupResult Found(*this, IdInfo.Identifier, IdInfo.IdentifierLoc,
+                     LookupNestedNameSpecifierName);
+
   // Determine where to perform name lookup
   DeclContext *LookupCtx = nullptr;
   bool isDependent = false;
@@ -424,11 +447,8 @@
 /// by ActOnCXXNestedNameSpecifier.
 ///
 /// \param S Scope in which the nested-name-specifier occurs.
-/// \param Identifier Identifier in the sequence "identifier" "::".
-/// \param IdentifierLoc Location of the \p Identifier.
-/// \param CCLoc Location of "::" following Identifier.
-/// \param ObjectType Type of postfix expression if the nested-name-specifier
-///        occurs in construct like: <tt>ptr->nns::f</tt>.
+/// \param IdInfo Parser information about an identifier in the
+///        nested-name-spec.
 /// \param EnteringContext If true, enter the context specified by the
 ///        nested-name-specifier.
 /// \param SS Optional nested name specifier preceding the identifier.
@@ -454,17 +474,15 @@
 /// dependent context, for example. Nor will it extend \p SS with the scope
 /// specifier.
 bool Sema::BuildCXXNestedNameSpecifier(Scope *S,
-                                       IdentifierInfo &Identifier,
-                                       SourceLocation IdentifierLoc,
-                                       SourceLocation CCLoc,
-                                       QualType ObjectType,
+                                       NestedNameSpecInfo &IdInfo,
                                        bool EnteringContext,
                                        CXXScopeSpec &SS,
                                        NamedDecl *ScopeLookupResult,
                                        bool ErrorRecoveryLookup,
                                        bool *IsCorrectedToColon) {
-  LookupResult Found(*this, &Identifier, IdentifierLoc, 
+  LookupResult Found(*this, IdInfo.Identifier, IdInfo.IdentifierLoc,
                      LookupNestedNameSpecifierName);
+  QualType ObjectType = GetTypeFromParser(IdInfo.ObjectType);
 
   // Determine where to perform name lookup
   DeclContext *LookupCtx = nullptr;
@@ -549,7 +567,7 @@
     // base object type or prior nested-name-specifier, so this
     // nested-name-specifier refers to an unknown specialization. Just build
     // a dependent nested-name-specifier.
-    SS.Extend(Context, &Identifier, IdentifierLoc, CCLoc);
+    SS.Extend(Context, IdInfo.Identifier, IdInfo.IdentifierLoc, IdInfo.CCLoc);
     return false;
   }
 
@@ -568,18 +586,19 @@
       // allowed, suggest replacement to ':'.
       if (IsCorrectedToColon) {
         *IsCorrectedToColon = true;
-        Diag(CCLoc, diag::err_nested_name_spec_is_not_class)
-            << &Identifier << getLangOpts().CPlusPlus
-            << FixItHint::CreateReplacement(CCLoc, ":");
+        Diag(IdInfo.CCLoc, diag::err_nested_name_spec_is_not_class)
+            << IdInfo.Identifier << getLangOpts().CPlusPlus
+            << FixItHint::CreateReplacement(IdInfo.CCLoc, ":");
         if (NamedDecl *ND = R.getAsSingle<NamedDecl>())
           Diag(ND->getLocation(), diag::note_declared_at);
         return true;
       }
       // Replacement '::' -> ':' is not allowed, just issue respective error.
       Diag(R.getNameLoc(), diag::err_expected_class_or_namespace)
-          << &Identifier << getLangOpts().CPlusPlus;
+          << IdInfo.Identifier << getLangOpts().CPlusPlus;
       if (NamedDecl *ND = R.getAsSingle<NamedDecl>())
-        Diag(ND->getLocation(), diag::note_entity_declared_at) << &Identifier;
+        Diag(ND->getLocation(), diag::note_entity_declared_at)
+            << IdInfo.Identifier;
       return true;
     }
   }
@@ -606,11 +625,15 @@
         diagnoseTypo(Corrected, PDiag(diag::err_undeclared_var_use_suggest)
                                   << Name);
 
+      if (Corrected.getCorrectionSpecifier())
+        SS.MakeTrivial(Context, Corrected.getCorrectionSpecifier(),
+                       SourceRange(Found.getNameLoc()));
+
       if (NamedDecl *ND = Corrected.getFoundDecl())
         Found.addDecl(ND);
       Found.setLookupName(Corrected.getCorrection());
     } else {
-      Found.setLookupName(&Identifier);
+      Found.setLookupName(IdInfo.Identifier);
     }
   }
 
@@ -620,7 +643,7 @@
   bool AcceptSpec = isAcceptableNestedNameSpecifier(SD, &IsExtension);
   if (!AcceptSpec && IsExtension) {
     AcceptSpec = true;
-    Diag(IdentifierLoc, diag::ext_nested_name_spec_is_enum);
+    Diag(IdInfo.IdentifierLoc, diag::ext_nested_name_spec_is_enum);
   }
   if (AcceptSpec) {
     if (!ObjectType.isNull() && !ObjectTypeSearchedInScope &&
@@ -637,7 +660,7 @@
       // Note that C++11 does *not* perform this redundant lookup.
       NamedDecl *OuterDecl;
       if (S) {
-        LookupResult FoundOuter(*this, &Identifier, IdentifierLoc, 
+        LookupResult FoundOuter(*this, IdInfo.Identifier, IdInfo.IdentifierLoc,
                                 LookupNestedNameSpecifierName);
         LookupName(FoundOuter, S);
         OuterDecl = FoundOuter.getAsSingle<NamedDecl>();
@@ -653,9 +676,9 @@
         if (ErrorRecoveryLookup)
           return true;
 
-         Diag(IdentifierLoc, 
+         Diag(IdInfo.IdentifierLoc,
               diag::err_nested_name_member_ref_lookup_ambiguous)
-           << &Identifier;
+           << IdInfo.Identifier;
          Diag(SD->getLocation(), diag::note_ambig_member_ref_object_type)
            << ObjectType;
          Diag(OuterDecl->getLocation(), diag::note_ambig_member_ref_scope);
@@ -674,16 +697,15 @@
       return false;
 
     // The use of a nested name specifier may trigger deprecation warnings.
-    DiagnoseUseOfDecl(SD, CCLoc);
+    DiagnoseUseOfDecl(SD, IdInfo.CCLoc);
 
-    
     if (NamespaceDecl *Namespace = dyn_cast<NamespaceDecl>(SD)) {
-      SS.Extend(Context, Namespace, IdentifierLoc, CCLoc);
+      SS.Extend(Context, Namespace, IdInfo.IdentifierLoc, IdInfo.CCLoc);
       return false;
     }
 
     if (NamespaceAliasDecl *Alias = dyn_cast<NamespaceAliasDecl>(SD)) {
-      SS.Extend(Context, Alias, IdentifierLoc, CCLoc);
+      SS.Extend(Context, Alias, IdInfo.IdentifierLoc, IdInfo.CCLoc);
       return false;
     }
 
@@ -693,41 +715,41 @@
     if (isa<InjectedClassNameType>(T)) {
       InjectedClassNameTypeLoc InjectedTL
         = TLB.push<InjectedClassNameTypeLoc>(T);
-      InjectedTL.setNameLoc(IdentifierLoc);
+      InjectedTL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<RecordType>(T)) {
       RecordTypeLoc RecordTL = TLB.push<RecordTypeLoc>(T);
-      RecordTL.setNameLoc(IdentifierLoc);
+      RecordTL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<TypedefType>(T)) {
       TypedefTypeLoc TypedefTL = TLB.push<TypedefTypeLoc>(T);
-      TypedefTL.setNameLoc(IdentifierLoc);
+      TypedefTL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<EnumType>(T)) {
       EnumTypeLoc EnumTL = TLB.push<EnumTypeLoc>(T);
-      EnumTL.setNameLoc(IdentifierLoc);
+      EnumTL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<TemplateTypeParmType>(T)) {
       TemplateTypeParmTypeLoc TemplateTypeTL
         = TLB.push<TemplateTypeParmTypeLoc>(T);
-      TemplateTypeTL.setNameLoc(IdentifierLoc);
+      TemplateTypeTL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<UnresolvedUsingType>(T)) {
       UnresolvedUsingTypeLoc UnresolvedTL
         = TLB.push<UnresolvedUsingTypeLoc>(T);
-      UnresolvedTL.setNameLoc(IdentifierLoc);
+      UnresolvedTL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<SubstTemplateTypeParmType>(T)) {
       SubstTemplateTypeParmTypeLoc TL 
         = TLB.push<SubstTemplateTypeParmTypeLoc>(T);
-      TL.setNameLoc(IdentifierLoc);
+      TL.setNameLoc(IdInfo.IdentifierLoc);
     } else if (isa<SubstTemplateTypeParmPackType>(T)) {
       SubstTemplateTypeParmPackTypeLoc TL
         = TLB.push<SubstTemplateTypeParmPackTypeLoc>(T);
-      TL.setNameLoc(IdentifierLoc);
+      TL.setNameLoc(IdInfo.IdentifierLoc);
     } else {
       llvm_unreachable("Unhandled TypeDecl node in nested-name-specifier");
     }
 
     if (T->isEnumeralType())
-      Diag(IdentifierLoc, diag::warn_cxx98_compat_enum_nested_name_spec);
+      Diag(IdInfo.IdentifierLoc, diag::warn_cxx98_compat_enum_nested_name_spec);
 
     SS.Extend(Context, SourceLocation(), TLB.getTypeLocInContext(Context, T),
-              CCLoc);
+              IdInfo.CCLoc);
     return false;
   }
 
@@ -766,9 +788,11 @@
     if (DC->isDependentContext() && DC->isFunctionOrMethod()) {
       CXXRecordDecl *ContainingClass = dyn_cast<CXXRecordDecl>(DC->getParent());
       if (ContainingClass && ContainingClass->hasAnyDependentBases()) {
-        Diag(IdentifierLoc, diag::ext_undeclared_unqual_id_with_dependent_base)
-            << &Identifier << ContainingClass;
-        SS.Extend(Context, &Identifier, IdentifierLoc, CCLoc);
+        Diag(IdInfo.IdentifierLoc,
+             diag::ext_undeclared_unqual_id_with_dependent_base)
+            << IdInfo.Identifier << ContainingClass;
+        SS.Extend(Context, IdInfo.Identifier, IdInfo.IdentifierLoc,
+                  IdInfo.CCLoc);
         return false;
       }
     }
@@ -776,28 +800,27 @@
 
   if (!Found.empty()) {
     if (TypeDecl *TD = Found.getAsSingle<TypeDecl>())
-      Diag(IdentifierLoc, diag::err_expected_class_or_namespace)
+      Diag(IdInfo.IdentifierLoc, diag::err_expected_class_or_namespace)
           << QualType(TD->getTypeForDecl(), 0) << getLangOpts().CPlusPlus;
     else {
-      Diag(IdentifierLoc, diag::err_expected_class_or_namespace)
-          << &Identifier << getLangOpts().CPlusPlus;
+      Diag(IdInfo.IdentifierLoc, diag::err_expected_class_or_namespace)
+          << IdInfo.Identifier << getLangOpts().CPlusPlus;
       if (NamedDecl *ND = Found.getAsSingle<NamedDecl>())
-        Diag(ND->getLocation(), diag::note_entity_declared_at) << &Identifier;
+        Diag(ND->getLocation(), diag::note_entity_declared_at)
+            << IdInfo.Identifier;
     }
   } else if (SS.isSet())
-    Diag(IdentifierLoc, diag::err_no_member) << &Identifier << LookupCtx
-                                             << SS.getRange();
+    Diag(IdInfo.IdentifierLoc, diag::err_no_member) << IdInfo.Identifier
+        << LookupCtx << SS.getRange();
   else
-    Diag(IdentifierLoc, diag::err_undeclared_var_use) << &Identifier;
+    Diag(IdInfo.IdentifierLoc, diag::err_undeclared_var_use)
+        << IdInfo.Identifier;
 
   return true;
 }
 
 bool Sema::ActOnCXXNestedNameSpecifier(Scope *S,
-                                       IdentifierInfo &Identifier,
-                                       SourceLocation IdentifierLoc,
-                                       SourceLocation CCLoc,
-                                       ParsedType ObjectType,
+                                       NestedNameSpecInfo &IdInfo,
                                        bool EnteringContext,
                                        CXXScopeSpec &SS,
                                        bool ErrorRecoveryLookup,
@@ -805,9 +828,8 @@
   if (SS.isInvalid())
     return true;
 
-  return BuildCXXNestedNameSpecifier(S, Identifier, IdentifierLoc, CCLoc,
-                                     GetTypeFromParser(ObjectType),
-                                     EnteringContext, SS, 
+  return BuildCXXNestedNameSpecifier(S, IdInfo,
+                                     EnteringContext, SS,
                                      /*ScopeLookupResult=*/nullptr, false,
                                      IsCorrectedToColon);
 }
@@ -842,17 +864,12 @@
 ///
 /// The arguments are the same as those passed to ActOnCXXNestedNameSpecifier.
 bool Sema::IsInvalidUnlessNestedName(Scope *S, CXXScopeSpec &SS,
-                                     IdentifierInfo &Identifier, 
-                                     SourceLocation IdentifierLoc,
-                                     SourceLocation ColonLoc,
-                                     ParsedType ObjectType,
+                                     NestedNameSpecInfo &IdInfo,
                                      bool EnteringContext) {
   if (SS.isInvalid())
     return false;
 
-  return !BuildCXXNestedNameSpecifier(S, Identifier, IdentifierLoc, ColonLoc,
-                                      GetTypeFromParser(ObjectType),
-                                      EnteringContext, SS, 
+  return !BuildCXXNestedNameSpecifier(S, IdInfo, EnteringContext, SS,
                                       /*ScopeLookupResult=*/nullptr, true);
 }
 
diff --git a/lib/Sema/SemaCast.cpp b/lib/Sema/SemaCast.cpp
index 7207d04..e19020c 100644
--- a/lib/Sema/SemaCast.cpp
+++ b/lib/Sema/SemaCast.cpp
@@ -22,6 +22,7 @@
 #include "clang/AST/RecordLayout.h"
 #include "clang/Basic/PartialDiagnostic.h"
 #include "clang/Basic/TargetInfo.h"
+#include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Initialization.h"
 #include "llvm/ADT/SmallVector.h"
 #include <set>
@@ -255,6 +256,7 @@
       Op.CheckConstCast();
       if (Op.SrcExpr.isInvalid())
         return ExprError();
+      DiscardMisalignedMemberAddress(DestType.getTypePtr(), E);
     }
     return Op.complete(CXXConstCastExpr::Create(Context, Op.ResultType,
                                   Op.ValueKind, Op.SrcExpr.get(), DestTInfo,
@@ -278,6 +280,7 @@
       Op.CheckReinterpretCast();
       if (Op.SrcExpr.isInvalid())
         return ExprError();
+      DiscardMisalignedMemberAddress(DestType.getTypePtr(), E);
     }
     return Op.complete(CXXReinterpretCastExpr::Create(Context, Op.ResultType,
                                     Op.ValueKind, Op.Kind, Op.SrcExpr.get(),
@@ -290,6 +293,7 @@
       Op.CheckStaticCast();
       if (Op.SrcExpr.isInvalid())
         return ExprError();
+      DiscardMisalignedMemberAddress(DestType.getTypePtr(), E);
     }
     
     return Op.complete(CXXStaticCastExpr::Create(Context, Op.ResultType,
@@ -640,8 +644,8 @@
     // If we're dynamic_casting from a prvalue to an rvalue reference, we need
     // to materialize the prvalue before we bind the reference to it.
     if (SrcExpr.get()->isRValue())
-      SrcExpr = new (Self.Context) MaterializeTemporaryExpr(
-          SrcType, SrcExpr.get(), /*IsLValueReference*/false);
+      SrcExpr = Self.CreateMaterializeTemporaryExpr(
+          SrcType, SrcExpr.get(), /*IsLValueReference*/ false);
     SrcPointee = SrcType;
   }
 
@@ -1313,16 +1317,13 @@
     }
     std::string PathDisplayStr;
     std::set<unsigned> DisplayedPaths;
-    for (CXXBasePaths::paths_iterator PI = Paths.begin(), PE = Paths.end();
-         PI != PE; ++PI) {
-      if (DisplayedPaths.insert(PI->back().SubobjectNumber).second) {
+    for (clang::CXXBasePath &Path : Paths) {
+      if (DisplayedPaths.insert(Path.back().SubobjectNumber).second) {
         // We haven't displayed a path to this particular base
         // class subobject yet.
         PathDisplayStr += "\n    ";
-        for (CXXBasePath::const_reverse_iterator EI = PI->rbegin(),
-                                                 EE = PI->rend();
-             EI != EE; ++EI)
-          PathDisplayStr += EI->Base->getType().getAsString() + " -> ";
+        for (CXXBasePathElement &PE : llvm::reverse(Path))
+          PathDisplayStr += PE.Base->getType().getAsString() + " -> ";
         PathDisplayStr += QualType(DestType).getAsString();
       }
     }
@@ -1402,8 +1403,10 @@
 
   // Lock down the inheritance model right now in MS ABI, whether or not the
   // pointee types are the same.
-  if (Self.Context.getTargetInfo().getCXXABI().isMicrosoft())
+  if (Self.Context.getTargetInfo().getCXXABI().isMicrosoft()) {
     (void)Self.isCompleteType(OpRange.getBegin(), SrcType);
+    (void)Self.isCompleteType(OpRange.getBegin(), DestType);
+  }
 
   // T == T, modulo cv
   if (!Self.Context.hasSameUnqualifiedType(SrcMemPtr->getPointeeType(),
@@ -1646,8 +1649,8 @@
   if (NeedToMaterializeTemporary)
     // This is a const_cast from a class prvalue to an rvalue reference type.
     // Materialize a temporary to store the result of the conversion.
-    SrcExpr = new (Self.Context) MaterializeTemporaryExpr(
-        SrcType, SrcExpr.get(), /*IsLValueReference*/ false);
+    SrcExpr = Self.CreateMaterializeTemporaryExpr(SrcType, SrcExpr.get(),
+                                                  /*IsLValueReference*/ false);
 
   return TC_Success;
 }
@@ -1724,6 +1727,97 @@
     }
 }
 
+/// Diagnose casts that change the calling convention of a pointer to a function
+/// defined in the current TU.
+static void DiagnoseCallingConvCast(Sema &Self, const ExprResult &SrcExpr,
+                                    QualType DstType, SourceRange OpRange) {
+  // Check if this cast would change the calling convention of a function
+  // pointer type.
+  QualType SrcType = SrcExpr.get()->getType();
+  if (Self.Context.hasSameType(SrcType, DstType) ||
+      !SrcType->isFunctionPointerType() || !DstType->isFunctionPointerType())
+    return;
+  const auto *SrcFTy =
+      SrcType->castAs<PointerType>()->getPointeeType()->castAs<FunctionType>();
+  const auto *DstFTy =
+      DstType->castAs<PointerType>()->getPointeeType()->castAs<FunctionType>();
+  CallingConv SrcCC = SrcFTy->getCallConv();
+  CallingConv DstCC = DstFTy->getCallConv();
+  if (SrcCC == DstCC)
+    return;
+
+  // We have a calling convention cast. Check if the source is a pointer to a
+  // known, specific function that has already been defined.
+  Expr *Src = SrcExpr.get()->IgnoreParenImpCasts();
+  if (auto *UO = dyn_cast<UnaryOperator>(Src))
+    if (UO->getOpcode() == UO_AddrOf)
+      Src = UO->getSubExpr()->IgnoreParenImpCasts();
+  auto *DRE = dyn_cast<DeclRefExpr>(Src);
+  if (!DRE)
+    return;
+  auto *FD = dyn_cast<FunctionDecl>(DRE->getDecl());
+  const FunctionDecl *Definition;
+  if (!FD || !FD->hasBody(Definition))
+    return;
+
+  // Only warn if we are casting from the default convention to a non-default
+  // convention. This can happen when the programmer forgot to apply the calling
+  // convention to the function definition and then inserted this cast to
+  // satisfy the type system.
+  CallingConv DefaultCC = Self.getASTContext().getDefaultCallingConvention(
+      FD->isVariadic(), FD->isCXXInstanceMember());
+  if (DstCC == DefaultCC || SrcCC != DefaultCC)
+    return;
+
+  // Diagnose this cast, as it is probably bad.
+  StringRef SrcCCName = FunctionType::getNameForCallConv(SrcCC);
+  StringRef DstCCName = FunctionType::getNameForCallConv(DstCC);
+  Self.Diag(OpRange.getBegin(), diag::warn_cast_calling_conv)
+      << SrcCCName << DstCCName << OpRange;
+
+  // The checks above are cheaper than checking if the diagnostic is enabled.
+  // However, it's worth checking if the warning is enabled before we construct
+  // a fixit.
+  if (Self.Diags.isIgnored(diag::warn_cast_calling_conv, OpRange.getBegin()))
+    return;
+
+  // Try to suggest a fixit to change the calling convention of the function
+  // whose address was taken. Try to use the latest macro for the convention.
+  // For example, users probably want to write "WINAPI" instead of "__stdcall"
+  // to match the Windows header declarations.
+  SourceLocation NameLoc = Definition->getNameInfo().getLoc();
+  Preprocessor &PP = Self.getPreprocessor();
+  SmallVector<TokenValue, 6> AttrTokens;
+  SmallString<64> CCAttrText;
+  llvm::raw_svector_ostream OS(CCAttrText);
+  if (Self.getLangOpts().MicrosoftExt) {
+    // __stdcall or __vectorcall
+    OS << "__" << DstCCName;
+    IdentifierInfo *II = PP.getIdentifierInfo(OS.str());
+    AttrTokens.push_back(II->isKeyword(Self.getLangOpts())
+                             ? TokenValue(II->getTokenID())
+                             : TokenValue(II));
+  } else {
+    // __attribute__((stdcall)) or __attribute__((vectorcall))
+    OS << "__attribute__((" << DstCCName << "))";
+    AttrTokens.push_back(tok::kw___attribute);
+    AttrTokens.push_back(tok::l_paren);
+    AttrTokens.push_back(tok::l_paren);
+    IdentifierInfo *II = PP.getIdentifierInfo(DstCCName);
+    AttrTokens.push_back(II->isKeyword(Self.getLangOpts())
+                             ? TokenValue(II->getTokenID())
+                             : TokenValue(II));
+    AttrTokens.push_back(tok::r_paren);
+    AttrTokens.push_back(tok::r_paren);
+  }
+  StringRef AttrSpelling = PP.getLastMacroWithSpelling(NameLoc, AttrTokens);
+  if (!AttrSpelling.empty())
+    CCAttrText = AttrSpelling;
+  OS << ' ';
+  Self.Diag(NameLoc, diag::note_change_calling_conv_fixit)
+      << FD << DstCCName << FixItHint::CreateInsertion(NameLoc, CCAttrText);
+}
+
 static void checkIntToPointerCast(bool CStyle, SourceLocation Loc,
                                   const Expr *SrcExpr, QualType DestType,
                                   Sema &Self) {
@@ -1768,24 +1862,12 @@
       Result.isUsable())
     return true;
 
-  DeclAccessPair DAP;
-  FunctionDecl *Found = Self.resolveAddressOfOnlyViableOverloadCandidate(E, DAP);
-  if (!Found)
+  // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization
+  // preserves Result.
+  Result = E;
+  if (!Self.resolveAndFixAddressOfOnlyViableOverloadCandidate(Result))
     return false;
-
-  // It seems that if we encounter a call to a function that is both unavailable
-  // and inaccessible, we'll emit multiple diags for said call. Hence, we run
-  // both checks below unconditionally.
-  Self.DiagnoseUseOfDecl(Found, E->getExprLoc());
-  Self.CheckAddressOfMemberAccess(E, DAP);
-
-  Expr *Fixed = Self.FixOverloadedFunctionReference(E, DAP, Found);
-  if (Fixed->getType()->isFunctionType())
-    Result = Self.DefaultFunctionArrayConversion(Fixed, /*Diagnose=*/false);
-  else
-    Result = Fixed;
-
-  return !Result.isInvalid();
+  return Result.isUsable();
 }
 
 static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
@@ -2040,7 +2122,9 @@
   }
   if (CStyle)
     DiagnoseCastOfObjCSEL(Self, SrcExpr, DestType);
-  
+
+  DiagnoseCallingConvCast(Self, SrcExpr, DestType, OpRange);
+
   // Not casting away constness, so the only remaining check is for compatible
   // pointer categories.
 
@@ -2345,6 +2429,22 @@
       return;
     }
 
+    // OpenCL v2.0 s6.13.10 - Allow casts from '0' to event_t type.
+    if (Self.getLangOpts().OpenCL && DestType->isEventT()) {
+      llvm::APSInt CastInt;
+      if (SrcExpr.get()->EvaluateAsInt(CastInt, Self.Context)) {
+        if (0 == CastInt) {
+          Kind = CK_ZeroToOCLEvent;
+          return;
+        }
+        Self.Diag(OpRange.getBegin(),
+                  diag::error_opencl_cast_non_zero_to_event_t)
+                  << CastInt.toString(10) << SrcExpr.get()->getSourceRange();
+        SrcExpr = ExprError();
+        return;
+      }
+    }
+
     // Reject any other conversions to non-scalar types.
     Self.Diag(OpRange.getBegin(), diag::err_typecheck_cond_expect_scalar)
       << DestType << SrcExpr.get()->getSourceRange();
@@ -2459,6 +2559,7 @@
   }
   
   DiagnoseCastOfObjCSEL(Self, SrcExpr, DestType);
+  DiagnoseCallingConvCast(Self, SrcExpr, DestType, OpRange);
   DiagnoseBadFunctionCast(Self, SrcExpr, DestType);
   Kind = Self.PrepareScalarCast(SrcExpr, DestType);
   if (SrcExpr.isInvalid())
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index 90ae3ce..09a1601 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/DeclCXX.h"
@@ -33,14 +32,15 @@
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Sema.h"
+#include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Locale.h"
-#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/raw_ostream.h"
-#include <limits>
+
 using namespace clang;
 using namespace sema;
 
@@ -260,6 +260,459 @@
   return false;
 }
 
+static inline bool isBlockPointer(Expr *Arg) {
+  return isa<BlockPointerType>(Arg->getType().getCanonicalType());
+}
+
+/// OpenCL C v2.0, s6.13.17.2 - Checks that every block parameter is a local
+/// void*, as device side enqueue requires. Returns true if any one is not.
+static bool checkOpenCLBlockArgs(Sema &S, Expr *BlockArg) {
+  const BlockPointerType *BPT =
+      cast<BlockPointerType>(BlockArg->getType().getCanonicalType());
+  ArrayRef<QualType> Params =
+      BPT->getPointeeType()->getAs<FunctionProtoType>()->getParamTypes();
+  unsigned ArgCounter = 0;
+  bool IllegalParams = false;
+  // Diagnose every parameter that is not a local void*; the loop does not stop
+  // at the first offender, so each bad parameter gets its own error.
+  for (QualType Ty : Params) {
+    if (!Ty->isPointerType() || !Ty->getPointeeType()->isVoidType() ||
+        Ty->getPointeeType().getQualifiers().getAddressSpace() !=
+            LangAS::opencl_local) {
+      // Pick the error location. For a block literal (BlockExpr) point
+      // straight at the offending parameter; for anything else (a variable
+      // reference, possibly wrapped in parens or casts) point at the argument.
+      SourceLocation ErrorLoc;
+      if (const auto *BE = dyn_cast<BlockExpr>(BlockArg)) {
+        const BlockDecl *BD = BE->getBlockDecl();
+        ErrorLoc = BD->getParamDecl(ArgCounter)->getLocStart();
+      } else {
+        ErrorLoc = BlockArg->getLocStart();
+      }
+      S.Diag(ErrorLoc,
+             diag::err_opencl_enqueue_kernel_blocks_non_local_void_args);
+      IllegalParams = true;
+    }
+    ++ArgCounter;
+  }
+
+  return IllegalParams;
+}
+
+/// OpenCL C v2.0, s6.13.17.6 - Check the argument to the
+/// get_kernel_work_group_size and
+/// get_kernel_preferred_work_group_size_multiple builtin functions.
+static bool SemaOpenCLBuiltinKernelWorkGroupSize(Sema &S, CallExpr *TheCall) {
+  if (checkArgCount(S, TheCall, 1))
+    return true;
+
+  Expr *Arg0 = TheCall->getArg(0);
+  if (isBlockPointer(Arg0))
+    return checkOpenCLBlockArgs(S, Arg0);
+
+  S.Diag(Arg0->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+      << "block";
+  return true;
+}
+
+static bool checkOpenCLEnqueueLocalSizeArgs(Sema &S, CallExpr *TheCall,
+                                            unsigned Start, unsigned End);
+
+/// OpenCL v2.0, s6.13.17.1 - Check that a size is provided for each 'local
+/// void*' parameter of the passed block.
+static bool checkOpenCLEnqueueVariadicArgs(Sema &S, CallExpr *TheCall,
+                                           Expr *BlockArg,
+                                           unsigned NumNonVarArgs) {
+  const auto *BlockTy =
+      cast<BlockPointerType>(BlockArg->getType().getCanonicalType());
+  const auto *ProtoTy = BlockTy->getPointeeType()->getAs<FunctionProtoType>();
+  const unsigned ExpectedNumArgs = ProtoTy->getNumParams() + NumNonVarArgs;
+  const unsigned ActualNumArgs = TheCall->getNumArgs();
+
+  // Every parameter of the block needs a matching trailing uint argument
+  // that gives the size of the local memory it points to.
+  if (ActualNumArgs != ExpectedNumArgs) {
+    S.Diag(TheCall->getLocStart(),
+           diag::err_opencl_enqueue_kernel_local_size_args);
+    return true;
+  }
+
+  // Check that the sizes of the local memory are specified by integers.
+  const unsigned LastArg = ActualNumArgs - 1;
+  return checkOpenCLEnqueueLocalSizeArgs(S, TheCall, NumNonVarArgs, LastArg);
+}
+
+/// OpenCL C v2.0, s6.13.17 - Enqueue kernel function contains four different
+/// overload formats specified in Table 6.13.17.1.
+/// int enqueue_kernel(queue_t queue,
+///                    kernel_enqueue_flags_t flags,
+///                    const ndrange_t ndrange,
+///                    void (^block)(void))
+/// int enqueue_kernel(queue_t queue,
+///                    kernel_enqueue_flags_t flags,
+///                    const ndrange_t ndrange,
+///                    uint num_events_in_wait_list,
+///                    clk_event_t *event_wait_list,
+///                    clk_event_t *event_ret,
+///                    void (^block)(void))
+/// int enqueue_kernel(queue_t queue,
+///                    kernel_enqueue_flags_t flags,
+///                    const ndrange_t ndrange,
+///                    void (^block)(local void*, ...),
+///                    uint size0, ...)
+/// int enqueue_kernel(queue_t queue,
+///                    kernel_enqueue_flags_t flags,
+///                    const ndrange_t ndrange,
+///                    uint num_events_in_wait_list,
+///                    clk_event_t *event_wait_list,
+///                    clk_event_t *event_ret,
+///                    void (^block)(local void*, ...),
+///                    uint size0, ...)
+/// Returns true if the call was rejected with a diagnostic, false otherwise.
+static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) {
+  unsigned NumArgs = TheCall->getNumArgs();
+
+  // Every form takes at least a queue, flags, an ndrange and a block.
+  if (NumArgs < 4) {
+    S.Diag(TheCall->getLocStart(),
+           diag::err_typecheck_call_too_few_args_at_least)
+        << 0 /*function call*/ << 4 << NumArgs;
+    return true;
+  }
+
+  Expr *Arg0 = TheCall->getArg(0);
+  Expr *Arg1 = TheCall->getArg(1);
+  Expr *Arg2 = TheCall->getArg(2);
+  Expr *Arg3 = TheCall->getArg(3);
+
+  // First argument always needs to be a queue_t type.
+  if (!Arg0->getType()->isQueueT()) {
+    S.Diag(Arg0->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+        << S.Context.OCLQueueTy;
+    return true;
+  }
+
+  // Second argument always needs to be a kernel_enqueue_flags_t enum value.
+  if (!Arg1->getType()->isIntegerType()) {
+    S.Diag(Arg1->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+        << "'kernel_enqueue_flags_t' (i.e. uint)";
+    return true;
+  }
+
+  // Third argument is always an ndrange_t type.
+  if (!Arg2->getType()->isNDRangeT()) {
+    S.Diag(Arg2->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+        << S.Context.OCLNDRangeTy;
+    return true;
+  }
+
+  // With four arguments, there is only one form that the function could be
+  // called in: no events and no variable arguments.
+  if (NumArgs == 4) {
+    // check that the last argument is the right block type.
+    if (!isBlockPointer(Arg3)) {
+      S.Diag(Arg3->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+          << "block";
+      return true;
+    }
+    // we have a block type, check the prototype
+    const BlockPointerType *BPT =
+        cast<BlockPointerType>(Arg3->getType().getCanonicalType());
+    if (BPT->getPointeeType()->getAs<FunctionProtoType>()->getNumParams() > 0) {
+      S.Diag(Arg3->getLocStart(),
+             diag::err_opencl_enqueue_kernel_blocks_no_args);
+      return true;
+    }
+    return false;
+  }
+  // With more than four arguments and a block in fourth position, this is the
+  // form without events: the block takes local void* parameters and one size
+  // argument per parameter follows it.
+  if (isBlockPointer(Arg3))
+    return (checkOpenCLBlockArgs(S, Arg3) ||
+            checkOpenCLEnqueueVariadicArgs(S, TheCall, Arg3, 4));
+  // last two cases with either exactly 7 args or 7 args and varargs.
+  if (NumArgs >= 7) {
+    // check common block argument.
+    Expr *Arg6 = TheCall->getArg(6);
+    if (!isBlockPointer(Arg6)) {
+      S.Diag(Arg6->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+          << "block";
+      return true;
+    }
+    if (checkOpenCLBlockArgs(S, Arg6))
+      return true;
+
+    // Fourth argument has to be any integer type.
+    if (!Arg3->getType()->isIntegerType()) {
+      S.Diag(Arg3->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+          << "integer";
+      return true;
+    }
+    // check remaining common arguments.
+    Expr *Arg4 = TheCall->getArg(4);
+    Expr *Arg5 = TheCall->getArg(5);
+
+    // Fifth argument is always passed as pointers to clk_event_t.
+    if (!Arg4->getType()->getPointeeOrArrayElementType()->isClkEventT()) {
+      S.Diag(Arg4->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+          << S.Context.getPointerType(S.Context.OCLClkEventTy);
+      return true;
+    }
+
+    // Sixth argument is always passed as pointers to clk_event_t.
+    if (!(Arg5->getType()->isPointerType() &&
+          Arg5->getType()->getPointeeType()->isClkEventT())) {
+      S.Diag(Arg5->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type)
+          << S.Context.getPointerType(S.Context.OCLClkEventTy);
+      return true;
+    }
+
+    if (NumArgs == 7)
+      return false;
+
+    return checkOpenCLEnqueueVariadicArgs(S, TheCall, Arg6, 7);
+  }
+
+  // None of the specific cases has been detected, give generic error
+  S.Diag(TheCall->getLocStart(),
+         diag::err_opencl_enqueue_kernel_incorrect_args);
+  return true;
+}
+
+/// Returns the OpenCL access attribute of \p D, or null if it has none.
+static OpenCLAccessAttr *getOpenCLArgAccess(const Decl *D) {
+  return D->getAttr<OpenCLAccessAttr>();
+}
+
+/// Returns true if the first argument is not a pipe with compatible access.
+static bool checkOpenCLPipeArg(Sema &S, CallExpr *Call) {
+  const Expr *Arg0 = Call->getArg(0);
+  // First argument type should always be pipe.
+  if (!Arg0->getType()->isPipeType()) {
+    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_first_arg)
+        << Call->getDirectCallee() << Arg0->getSourceRange();
+    return true;
+  }
+  OpenCLAccessAttr *AccessQual =
+      getOpenCLArgAccess(cast<DeclRefExpr>(Arg0)->getDecl());
+  // Validates the access qualifier is compatible with the call.
+  // OpenCL v2.0 s6.13.16 - The access qualifiers for pipe should only be
+  // read_only and write_only, and assumed to be read_only if no qualifier is
+  // specified.
+  switch (Call->getDirectCallee()->getBuiltinID()) {
+  case Builtin::BIread_pipe:
+  case Builtin::BIreserve_read_pipe:
+  case Builtin::BIcommit_read_pipe:
+  case Builtin::BIwork_group_reserve_read_pipe:
+  case Builtin::BIsub_group_reserve_read_pipe:
+  case Builtin::BIwork_group_commit_read_pipe:
+  case Builtin::BIsub_group_commit_read_pipe:
+    if (!(!AccessQual || AccessQual->isReadOnly())) {
+      S.Diag(Arg0->getLocStart(),
+             diag::err_opencl_builtin_pipe_invalid_access_modifier)
+          << "read_only" << Arg0->getSourceRange();
+      return true;
+    }
+    break;
+  case Builtin::BIwrite_pipe:
+  case Builtin::BIreserve_write_pipe:
+  case Builtin::BIcommit_write_pipe:
+  case Builtin::BIwork_group_reserve_write_pipe:
+  case Builtin::BIsub_group_reserve_write_pipe:
+  case Builtin::BIwork_group_commit_write_pipe:
+  case Builtin::BIsub_group_commit_write_pipe:
+    if (!(AccessQual && AccessQual->isWriteOnly())) {
+      S.Diag(Arg0->getLocStart(),
+             diag::err_opencl_builtin_pipe_invalid_access_modifier)
+          << "write_only" << Arg0->getSourceRange();
+      return true;
+    }
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// Returns true if argument \p Idx is not a pointer to the pipe element type.
+static bool checkOpenCLPipePacketType(Sema &S, CallExpr *Call, unsigned Idx) {
+  const Expr *ArgIdx = Call->getArg(Idx);
+  // Use getAs<> so that a pipe type spelled through sugar (e.g. a typedef) is
+  // still found; cast<> on the sugared type would assert.
+  const PipeType *PipeTy = Call->getArg(0)->getType()->getAs<PipeType>();
+  const QualType EltTy = PipeTy->getElementType();
+  const PointerType *ArgTy = ArgIdx->getType()->getAs<PointerType>();
+  // The Idx argument must be a pointer whose pointee is the pipe element type.
+  if (!ArgTy ||
+      !S.Context.hasSameType(
+          EltTy, ArgTy->getPointeeType()->getCanonicalTypeInternal())) {
+    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
+        << Call->getDirectCallee() << S.Context.getPointerType(EltTy)
+        << ArgIdx->getType() << ArgIdx->getSourceRange();
+    return true;
+  }
+  return false;
+}
+
+/// \brief Performs semantic analysis for the read/write_pipe call.
+/// \param S Reference to the semantic analyzer.
+/// \param Call A pointer to the builtin call.
+/// \return True if a semantic error has been found, false otherwise.
+static bool SemaBuiltinRWPipe(Sema &S, CallExpr *Call) {
+  // OpenCL v2.0 s6.13.16.2 - The built-in read/write
+  // functions have two forms.
+  switch (Call->getNumArgs()) {
+  case 2: {
+    if (checkOpenCLPipeArg(S, Call))
+      return true;
+    // The call with 2 arguments should be
+    // read/write_pipe(pipe T, T*).
+    // Check packet type T.
+    if (checkOpenCLPipePacketType(S, Call, 1))
+      return true;
+  } break;
+
+  case 4: {
+    if (checkOpenCLPipeArg(S, Call))
+      return true;
+    // The call with 4 arguments should be
+    // read/write_pipe(pipe T, reserve_id_t, uint, T*).
+    // Check reserve_id_t.
+    if (!Call->getArg(1)->getType()->isReserveIDT()) {
+      S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
+          << Call->getDirectCallee() << S.Context.OCLReserveIDTy
+          << Call->getArg(1)->getType() << Call->getArg(1)->getSourceRange();
+      return true;
+    }
+
+    // Check the index.
+    const Expr *Arg2 = Call->getArg(2);
+    if (!Arg2->getType()->isIntegerType() &&
+        !Arg2->getType()->isUnsignedIntegerType()) {
+      S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
+          << Call->getDirectCallee() << S.Context.UnsignedIntTy
+          << Arg2->getType() << Arg2->getSourceRange();
+      return true;
+    }
+
+    // Check packet type T.
+    if (checkOpenCLPipePacketType(S, Call, 3))
+      return true;
+  } break;
+  default:
+    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_arg_num)
+        << Call->getDirectCallee() << Call->getSourceRange();
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Performs a semantic analysis on the {work_group_/sub_group_
+///        /_}reserve_{read/write}_pipe
+/// \param S Reference to the semantic analyzer.
+/// \param Call The call to the builtin function to be analyzed.
+/// \return True if a semantic error was found, false otherwise.
+static bool SemaBuiltinReserveRWPipe(Sema &S, CallExpr *Call) {
+  // Exactly two arguments are expected and the first must be a valid pipe.
+  if (checkArgCount(S, Call, 2) || checkOpenCLPipeArg(S, Call))
+    return true;
+
+  // Check the reserve size: the second argument is accepted when it has an
+  // integer type.
+  const Expr *SizeArg = Call->getArg(1);
+  const QualType SizeTy = SizeArg->getType();
+  if (SizeTy->isIntegerType() || SizeTy->isUnsignedIntegerType())
+    return false;
+
+  // Anything else is reported as a mismatch against 'unsigned int'.
+  S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
+      << Call->getDirectCallee() << S.Context.UnsignedIntTy << SizeTy
+      << SizeArg->getSourceRange();
+  return true;
+}
+
+/// \brief Performs a semantic analysis on {work_group_/sub_group_
+///        /_}commit_{read/write}_pipe
+/// \param S Reference to the semantic analyzer.
+/// \param Call The call to the builtin function to be analyzed.
+/// \return True if a semantic error was found, false otherwise.
+static bool SemaBuiltinCommitRWPipe(Sema &S, CallExpr *Call) {
+  // Exactly two arguments are expected and the first must be a valid pipe.
+  if (checkArgCount(S, Call, 2) || checkOpenCLPipeArg(S, Call))
+    return true;
+
+  // Check reserve_id_t.
+  const Expr *ReserveArg = Call->getArg(1);
+  const QualType ReserveTy = ReserveArg->getType();
+  if (ReserveTy->isReserveIDT())
+    return false;
+
+  // The second argument is not a reserve_id_t: report the type mismatch.
+  S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
+      << Call->getDirectCallee() << S.Context.OCLReserveIDTy << ReserveTy
+      << ReserveArg->getSourceRange();
+  return true;
+}
+
+/// \brief Performs a semantic analysis on the call to built-in Pipe
+///        Query Functions.
+/// \param S Reference to the semantic analyzer.
+/// \param Call The call to the builtin function to be analyzed.
+/// \return True if a semantic error was found, false otherwise.
+static bool SemaBuiltinPipePackets(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 1))
+    return true;
+
+  // The single argument must be of pipe type.
+  if (Call->getArg(0)->getType()->isPipeType())
+    return false;
+
+  S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_first_arg)
+      << Call->getDirectCallee() << Call->getArg(0)->getSourceRange();
+  return true;
+}
+/// \brief OpenCL v2.0 s6.13.9 - Address space qualifier functions.
+/// Performs semantic analysis for the to_global/local/private call.
+/// \param S Reference to the semantic analyzer.
+/// \param BuiltinID ID of the builtin function.
+/// \param Call A pointer to the builtin call.
+/// \return True if a semantic error has been found, false otherwise.
+static bool SemaOpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID,
+                                    CallExpr *Call) {
+  if (Call->getNumArgs() != 1) {
+    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_to_addr_arg_num)
+        << Call->getDirectCallee() << Call->getSourceRange();
+    return true;
+  }
+
+  auto RT = Call->getArg(0)->getType();
+  if (!RT->isPointerType() || RT->getPointeeType()
+      .getAddressSpace() == LangAS::opencl_constant) {
+    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_to_addr_invalid_arg)
+        << Call->getArg(0) << Call->getDirectCallee() << Call->getSourceRange();
+    return true;
+  }
+
+  RT = RT->getPointeeType();
+  auto Qual = RT.getQualifiers();
+  switch (BuiltinID) {
+  case Builtin::BIto_global:
+    Qual.setAddressSpace(LangAS::opencl_global);
+    break;
+  case Builtin::BIto_local:
+    Qual.setAddressSpace(LangAS::opencl_local);
+    break;
+  default:
+    Qual.removeAddressSpace();
+  }
+  Call->setType(S.Context.getPointerType(S.Context.getQualifiedType(
+      RT.getUnqualifiedType(), Qual)));
+
+  return false;
+}
+
 ExprResult
 Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
                                CallExpr *TheCall) {
@@ -532,27 +985,22 @@
   case Builtin::BI__builtin___vsnprintf_chk:
     SemaBuiltinMemChkCall(*this, FDecl, TheCall, 1, 3);
     break;
-
   case Builtin::BI__builtin_call_with_static_chain:
     if (SemaBuiltinCallWithStaticChain(*this, TheCall))
       return ExprError();
     break;
-
   case Builtin::BI__exception_code:
-  case Builtin::BI_exception_code: {
+  case Builtin::BI_exception_code:
     if (SemaBuiltinSEHScopeCheck(*this, TheCall, Scope::SEHExceptScope,
                                  diag::err_seh___except_block))
       return ExprError();
     break;
-  }
   case Builtin::BI__exception_info:
-  case Builtin::BI_exception_info: {
+  case Builtin::BI_exception_info:
     if (SemaBuiltinSEHScopeCheck(*this, TheCall, Scope::SEHFilterScope,
                                  diag::err_seh___except_filter))
       return ExprError();
     break;
-  }
-
   case Builtin::BI__GetExceptionInfo:
     if (checkArgCount(*this, TheCall, 1))
       return ExprError();
@@ -565,13 +1013,62 @@
 
     TheCall->setType(Context.VoidPtrTy);
     break;
-
+  // OpenCL v2.0, s6.13.16 - Pipe functions
+  case Builtin::BIread_pipe:
+  case Builtin::BIwrite_pipe:
+    // Since those two functions are declared with var args, we need a semantic
+    // check for the argument.
+    if (SemaBuiltinRWPipe(*this, TheCall))
+      return ExprError();
+    break;
+  case Builtin::BIreserve_read_pipe:
+  case Builtin::BIreserve_write_pipe:
+  case Builtin::BIwork_group_reserve_read_pipe:
+  case Builtin::BIwork_group_reserve_write_pipe:
+  case Builtin::BIsub_group_reserve_read_pipe:
+  case Builtin::BIsub_group_reserve_write_pipe:
+    if (SemaBuiltinReserveRWPipe(*this, TheCall))
+      return ExprError();
+    // Since return type of reserve_read/write_pipe built-in function is
+    // reserve_id_t, which is not defined in the builtin def file, we used int
+    // as return type and need to override the return type of these functions.
+    TheCall->setType(Context.OCLReserveIDTy);
+    break;
+  case Builtin::BIcommit_read_pipe:
+  case Builtin::BIcommit_write_pipe:
+  case Builtin::BIwork_group_commit_read_pipe:
+  case Builtin::BIwork_group_commit_write_pipe:
+  case Builtin::BIsub_group_commit_read_pipe:
+  case Builtin::BIsub_group_commit_write_pipe:
+    if (SemaBuiltinCommitRWPipe(*this, TheCall))
+      return ExprError();
+    break;
+  case Builtin::BIget_pipe_num_packets:
+  case Builtin::BIget_pipe_max_packets:
+    if (SemaBuiltinPipePackets(*this, TheCall))
+      return ExprError();
+    break;
   case Builtin::BI__builtin_os_log_format:
   case Builtin::BI__builtin_os_log_format_buffer_size:
     if (SemaBuiltinOSLogFormat(TheCall)) {
       return ExprError();
     }
     break;
+  case Builtin::BIto_global:
+  case Builtin::BIto_local:
+  case Builtin::BIto_private:
+    if (SemaOpenCLBuiltinToAddr(*this, BuiltinID, TheCall))
+      return ExprError();
+    break;
+  // OpenCL v2.0, s6.13.17 - Enqueue kernel functions.
+  case Builtin::BIenqueue_kernel:
+    if (SemaOpenCLBuiltinEnqueueKernel(*this, TheCall))
+      return ExprError();
+    break;
+  case Builtin::BIget_kernel_work_group_size:
+  case Builtin::BIget_kernel_preferred_work_group_size_multiple:
+    if (SemaOpenCLBuiltinKernelWorkGroupSize(*this, TheCall))
+      return ExprError();
   }
 
   // Since the target specific builtins for each arch overlap, only check those
@@ -851,7 +1348,6 @@
     return true;
   }
 
-
   if (IsLdrex) {
     TheCall->setType(ValType);
     return false;
@@ -1099,19 +1595,108 @@
 }
 
 bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
-  unsigned i = 0, l = 0, u = 0;
+  int i = 0, l = 0, u = 0;
   switch (BuiltinID) {
-  default: return false;
+  default:
+    return false;
   case X86::BI__builtin_cpu_supports:
     return SemaBuiltinCpuSupports(*this, TheCall);
   case X86::BI__builtin_ms_va_start:
     return SemaBuiltinMSVAStart(TheCall);
-  case X86::BI_mm_prefetch: i = 1; l = 0; u = 3; break;
-  case X86::BI__builtin_ia32_sha1rnds4: i = 2, l = 0; u = 3; break;
+  case X86::BI__builtin_ia32_addcarryx_u64:
+  case X86::BI__builtin_ia32_addcarry_u64:
+  case X86::BI__builtin_ia32_subborrow_u64:
+  case X86::BI__builtin_ia32_readeflags_u64:
+  case X86::BI__builtin_ia32_writeeflags_u64:
+  case X86::BI__builtin_ia32_bextr_u64:
+  case X86::BI__builtin_ia32_bextri_u64:
+  case X86::BI__builtin_ia32_bzhi_di:
+  case X86::BI__builtin_ia32_pdep_di:
+  case X86::BI__builtin_ia32_pext_di:
+  case X86::BI__builtin_ia32_crc32di:
+  case X86::BI__builtin_ia32_fxsave64:
+  case X86::BI__builtin_ia32_fxrstor64:
+  case X86::BI__builtin_ia32_xsave64:
+  case X86::BI__builtin_ia32_xrstor64:
+  case X86::BI__builtin_ia32_xsaveopt64:
+  case X86::BI__builtin_ia32_xrstors64:
+  case X86::BI__builtin_ia32_xsavec64:
+  case X86::BI__builtin_ia32_xsaves64:
+  case X86::BI__builtin_ia32_rdfsbase64:
+  case X86::BI__builtin_ia32_rdgsbase64:
+  case X86::BI__builtin_ia32_wrfsbase64:
+  case X86::BI__builtin_ia32_wrgsbase64:
+  case X86::BI__builtin_ia32_pbroadcastq512_gpr_mask:
+  case X86::BI__builtin_ia32_pbroadcastq256_gpr_mask:
+  case X86::BI__builtin_ia32_pbroadcastq128_gpr_mask:
+  case X86::BI__builtin_ia32_vcvtsd2si64:
+  case X86::BI__builtin_ia32_vcvtsd2usi64:
+  case X86::BI__builtin_ia32_vcvtss2si64:
+  case X86::BI__builtin_ia32_vcvtss2usi64:
+  case X86::BI__builtin_ia32_vcvttsd2si64:
+  case X86::BI__builtin_ia32_vcvttsd2usi64:
+  case X86::BI__builtin_ia32_vcvttss2si64:
+  case X86::BI__builtin_ia32_vcvttss2usi64:
+  case X86::BI__builtin_ia32_cvtss2si64:
+  case X86::BI__builtin_ia32_cvttss2si64:
+  case X86::BI__builtin_ia32_cvtsd2si64:
+  case X86::BI__builtin_ia32_cvttsd2si64:
+  case X86::BI__builtin_ia32_cvtsi2sd64:
+  case X86::BI__builtin_ia32_cvtsi2ss64:
+  case X86::BI__builtin_ia32_cvtusi2sd64:
+  case X86::BI__builtin_ia32_cvtusi2ss64:
+  case X86::BI__builtin_ia32_rdseed64_step: {
+    // These builtins only work on x86-64 targets.
+    const llvm::Triple &TT = Context.getTargetInfo().getTriple();
+    if (TT.getArch() != llvm::Triple::x86_64)
+      return Diag(TheCall->getCallee()->getLocStart(),
+                  diag::err_x86_builtin_32_bit_tgt);
+    return false;
+  }
+  case X86::BI__builtin_ia32_extractf64x4_mask:
+  case X86::BI__builtin_ia32_extracti64x4_mask:
+  case X86::BI__builtin_ia32_extractf32x8_mask:
+  case X86::BI__builtin_ia32_extracti32x8_mask:
+  case X86::BI__builtin_ia32_extractf64x2_256_mask:
+  case X86::BI__builtin_ia32_extracti64x2_256_mask:
+  case X86::BI__builtin_ia32_extractf32x4_256_mask:
+  case X86::BI__builtin_ia32_extracti32x4_256_mask:
+    i = 1; l = 0; u = 1;
+    break;
+  case X86::BI_mm_prefetch:
+  case X86::BI__builtin_ia32_extractf32x4_mask:
+  case X86::BI__builtin_ia32_extracti32x4_mask:
+  case X86::BI__builtin_ia32_extractf64x2_512_mask:
+  case X86::BI__builtin_ia32_extracti64x2_512_mask:
+    i = 1; l = 0; u = 3;
+    break;
+  case X86::BI__builtin_ia32_insertf32x8_mask:
+  case X86::BI__builtin_ia32_inserti32x8_mask:
+  case X86::BI__builtin_ia32_insertf64x4_mask:
+  case X86::BI__builtin_ia32_inserti64x4_mask:
+  case X86::BI__builtin_ia32_insertf64x2_256_mask:
+  case X86::BI__builtin_ia32_inserti64x2_256_mask:
+  case X86::BI__builtin_ia32_insertf32x4_256_mask:
+  case X86::BI__builtin_ia32_inserti32x4_256_mask:
+    i = 2; l = 0; u = 1;
+    break;
+  case X86::BI__builtin_ia32_sha1rnds4:
+  case X86::BI__builtin_ia32_shuf_f32x4_256_mask:
+  case X86::BI__builtin_ia32_shuf_f64x2_256_mask:
+  case X86::BI__builtin_ia32_shuf_i32x4_256_mask:
+  case X86::BI__builtin_ia32_shuf_i64x2_256_mask:
+  case X86::BI__builtin_ia32_insertf64x2_512_mask:
+  case X86::BI__builtin_ia32_inserti64x2_512_mask:
+  case X86::BI__builtin_ia32_insertf32x4_mask:
+  case X86::BI__builtin_ia32_inserti32x4_mask:
+    i = 2; l = 0; u = 3;
+    break;
   case X86::BI__builtin_ia32_vpermil2pd:
   case X86::BI__builtin_ia32_vpermil2pd256:
   case X86::BI__builtin_ia32_vpermil2ps:
-  case X86::BI__builtin_ia32_vpermil2ps256: i = 3, l = 0; u = 3; break;
+  case X86::BI__builtin_ia32_vpermil2ps256:
+    i = 3; l = 0; u = 3;
+    break;
   case X86::BI__builtin_ia32_cmpb128_mask:
   case X86::BI__builtin_ia32_cmpw128_mask:
   case X86::BI__builtin_ia32_cmpd128_mask:
@@ -1135,21 +1720,7 @@
   case X86::BI__builtin_ia32_ucmpb512_mask:
   case X86::BI__builtin_ia32_ucmpw512_mask:
   case X86::BI__builtin_ia32_ucmpd512_mask:
-  case X86::BI__builtin_ia32_ucmpq512_mask: i = 2; l = 0; u = 7; break;
-  case X86::BI__builtin_ia32_roundps:
-  case X86::BI__builtin_ia32_roundpd:
-  case X86::BI__builtin_ia32_roundps256:
-  case X86::BI__builtin_ia32_roundpd256: i = 1, l = 0; u = 15; break;
-  case X86::BI__builtin_ia32_roundss:
-  case X86::BI__builtin_ia32_roundsd: i = 2, l = 0; u = 15; break;
-  case X86::BI__builtin_ia32_cmpps:
-  case X86::BI__builtin_ia32_cmpss:
-  case X86::BI__builtin_ia32_cmppd:
-  case X86::BI__builtin_ia32_cmpsd:
-  case X86::BI__builtin_ia32_cmpps256:
-  case X86::BI__builtin_ia32_cmppd256:
-  case X86::BI__builtin_ia32_cmpps512_mask:
-  case X86::BI__builtin_ia32_cmppd512_mask: i = 2; l = 0; u = 31; break;
+  case X86::BI__builtin_ia32_ucmpq512_mask:
   case X86::BI__builtin_ia32_vpcomub:
   case X86::BI__builtin_ia32_vpcomuw:
   case X86::BI__builtin_ia32_vpcomud:
@@ -1157,7 +1728,197 @@
   case X86::BI__builtin_ia32_vpcomb:
   case X86::BI__builtin_ia32_vpcomw:
   case X86::BI__builtin_ia32_vpcomd:
-  case X86::BI__builtin_ia32_vpcomq: i = 2; l = 0; u = 7; break;
+  case X86::BI__builtin_ia32_vpcomq:
+    i = 2; l = 0; u = 7;
+    break;
+  case X86::BI__builtin_ia32_roundps:
+  case X86::BI__builtin_ia32_roundpd:
+  case X86::BI__builtin_ia32_roundps256:
+  case X86::BI__builtin_ia32_roundpd256:
+    i = 1; l = 0; u = 15;
+    break;
+  case X86::BI__builtin_ia32_roundss:
+  case X86::BI__builtin_ia32_roundsd:
+  case X86::BI__builtin_ia32_rangepd128_mask:
+  case X86::BI__builtin_ia32_rangepd256_mask:
+  case X86::BI__builtin_ia32_rangepd512_mask:
+  case X86::BI__builtin_ia32_rangeps128_mask:
+  case X86::BI__builtin_ia32_rangeps256_mask:
+  case X86::BI__builtin_ia32_rangeps512_mask:
+  case X86::BI__builtin_ia32_getmantsd_round_mask:
+  case X86::BI__builtin_ia32_getmantss_round_mask:
+    i = 2; l = 0; u = 15;
+    break;
+  case X86::BI__builtin_ia32_cmpps:
+  case X86::BI__builtin_ia32_cmpss:
+  case X86::BI__builtin_ia32_cmppd:
+  case X86::BI__builtin_ia32_cmpsd:
+  case X86::BI__builtin_ia32_cmpps256:
+  case X86::BI__builtin_ia32_cmppd256:
+  case X86::BI__builtin_ia32_cmpps128_mask:
+  case X86::BI__builtin_ia32_cmppd128_mask:
+  case X86::BI__builtin_ia32_cmpps256_mask:
+  case X86::BI__builtin_ia32_cmppd256_mask:
+  case X86::BI__builtin_ia32_cmpps512_mask:
+  case X86::BI__builtin_ia32_cmppd512_mask:
+  case X86::BI__builtin_ia32_cmpsd_mask:
+  case X86::BI__builtin_ia32_cmpss_mask:
+    i = 2; l = 0; u = 31;
+    break;
+  case X86::BI__builtin_ia32_xabort:
+    i = 0; l = -128; u = 255;
+    break;
+  case X86::BI__builtin_ia32_pshufw:
+  case X86::BI__builtin_ia32_aeskeygenassist128:
+    i = 1; l = -128; u = 255;
+    break;
+  case X86::BI__builtin_ia32_vcvtps2ph:
+  case X86::BI__builtin_ia32_vcvtps2ph256:
+  case X86::BI__builtin_ia32_rndscaleps_128_mask:
+  case X86::BI__builtin_ia32_rndscalepd_128_mask:
+  case X86::BI__builtin_ia32_rndscaleps_256_mask:
+  case X86::BI__builtin_ia32_rndscalepd_256_mask:
+  case X86::BI__builtin_ia32_rndscaleps_mask:
+  case X86::BI__builtin_ia32_rndscalepd_mask:
+  case X86::BI__builtin_ia32_reducepd128_mask:
+  case X86::BI__builtin_ia32_reducepd256_mask:
+  case X86::BI__builtin_ia32_reducepd512_mask:
+  case X86::BI__builtin_ia32_reduceps128_mask:
+  case X86::BI__builtin_ia32_reduceps256_mask:
+  case X86::BI__builtin_ia32_reduceps512_mask:
+  case X86::BI__builtin_ia32_prold512_mask:
+  case X86::BI__builtin_ia32_prolq512_mask:
+  case X86::BI__builtin_ia32_prold128_mask:
+  case X86::BI__builtin_ia32_prold256_mask:
+  case X86::BI__builtin_ia32_prolq128_mask:
+  case X86::BI__builtin_ia32_prolq256_mask:
+  case X86::BI__builtin_ia32_prord128_mask:
+  case X86::BI__builtin_ia32_prord256_mask:
+  case X86::BI__builtin_ia32_prorq128_mask:
+  case X86::BI__builtin_ia32_prorq256_mask:
+  case X86::BI__builtin_ia32_psllwi512_mask:
+  case X86::BI__builtin_ia32_psllwi128_mask:
+  case X86::BI__builtin_ia32_psllwi256_mask:
+  case X86::BI__builtin_ia32_psrldi128_mask:
+  case X86::BI__builtin_ia32_psrldi256_mask:
+  case X86::BI__builtin_ia32_psrldi512_mask:
+  case X86::BI__builtin_ia32_psrlqi128_mask:
+  case X86::BI__builtin_ia32_psrlqi256_mask:
+  case X86::BI__builtin_ia32_psrlqi512_mask:
+  case X86::BI__builtin_ia32_psrawi512_mask:
+  case X86::BI__builtin_ia32_psrawi128_mask:
+  case X86::BI__builtin_ia32_psrawi256_mask:
+  case X86::BI__builtin_ia32_psrlwi512_mask:
+  case X86::BI__builtin_ia32_psrlwi128_mask:
+  case X86::BI__builtin_ia32_psrlwi256_mask:
+  case X86::BI__builtin_ia32_psradi128_mask:
+  case X86::BI__builtin_ia32_psradi256_mask:
+  case X86::BI__builtin_ia32_psradi512_mask:
+  case X86::BI__builtin_ia32_psraqi128_mask:
+  case X86::BI__builtin_ia32_psraqi256_mask:
+  case X86::BI__builtin_ia32_psraqi512_mask:
+  case X86::BI__builtin_ia32_pslldi128_mask:
+  case X86::BI__builtin_ia32_pslldi256_mask:
+  case X86::BI__builtin_ia32_pslldi512_mask:
+  case X86::BI__builtin_ia32_psllqi128_mask:
+  case X86::BI__builtin_ia32_psllqi256_mask:
+  case X86::BI__builtin_ia32_psllqi512_mask:
+  case X86::BI__builtin_ia32_fpclasspd128_mask:
+  case X86::BI__builtin_ia32_fpclasspd256_mask:
+  case X86::BI__builtin_ia32_fpclassps128_mask:
+  case X86::BI__builtin_ia32_fpclassps256_mask:
+  case X86::BI__builtin_ia32_fpclassps512_mask:
+  case X86::BI__builtin_ia32_fpclasspd512_mask:
+  case X86::BI__builtin_ia32_fpclasssd_mask:
+  case X86::BI__builtin_ia32_fpclassss_mask:
+    i = 1; l = 0; u = 255;
+    break;
+  case X86::BI__builtin_ia32_palignr:
+  case X86::BI__builtin_ia32_insertps128:
+  case X86::BI__builtin_ia32_dpps:
+  case X86::BI__builtin_ia32_dppd:
+  case X86::BI__builtin_ia32_dpps256:
+  case X86::BI__builtin_ia32_mpsadbw128:
+  case X86::BI__builtin_ia32_mpsadbw256:
+  case X86::BI__builtin_ia32_pcmpistrm128:
+  case X86::BI__builtin_ia32_pcmpistri128:
+  case X86::BI__builtin_ia32_pcmpistria128:
+  case X86::BI__builtin_ia32_pcmpistric128:
+  case X86::BI__builtin_ia32_pcmpistrio128:
+  case X86::BI__builtin_ia32_pcmpistris128:
+  case X86::BI__builtin_ia32_pcmpistriz128:
+  case X86::BI__builtin_ia32_pclmulqdq128:
+  case X86::BI__builtin_ia32_vperm2f128_pd256:
+  case X86::BI__builtin_ia32_vperm2f128_ps256:
+  case X86::BI__builtin_ia32_vperm2f128_si256:
+  case X86::BI__builtin_ia32_permti256:
+    i = 2; l = -128; u = 255;
+    break;
+  case X86::BI__builtin_ia32_palignr128:
+  case X86::BI__builtin_ia32_palignr256:
+  case X86::BI__builtin_ia32_palignr128_mask:
+  case X86::BI__builtin_ia32_palignr256_mask:
+  case X86::BI__builtin_ia32_palignr512_mask:
+  case X86::BI__builtin_ia32_alignq512_mask:
+  case X86::BI__builtin_ia32_alignd512_mask:
+  case X86::BI__builtin_ia32_alignd128_mask:
+  case X86::BI__builtin_ia32_alignd256_mask:
+  case X86::BI__builtin_ia32_alignq128_mask:
+  case X86::BI__builtin_ia32_alignq256_mask:
+  case X86::BI__builtin_ia32_vcomisd:
+  case X86::BI__builtin_ia32_vcomiss:
+  case X86::BI__builtin_ia32_shuf_f32x4_mask:
+  case X86::BI__builtin_ia32_shuf_f64x2_mask:
+  case X86::BI__builtin_ia32_shuf_i32x4_mask:
+  case X86::BI__builtin_ia32_shuf_i64x2_mask:
+  case X86::BI__builtin_ia32_dbpsadbw128_mask:
+  case X86::BI__builtin_ia32_dbpsadbw256_mask:
+  case X86::BI__builtin_ia32_dbpsadbw512_mask:
+    i = 2; l = 0; u = 255;
+    break;
+  case X86::BI__builtin_ia32_fixupimmpd512_mask:
+  case X86::BI__builtin_ia32_fixupimmpd512_maskz:
+  case X86::BI__builtin_ia32_fixupimmps512_mask:
+  case X86::BI__builtin_ia32_fixupimmps512_maskz:
+  case X86::BI__builtin_ia32_fixupimmsd_mask:
+  case X86::BI__builtin_ia32_fixupimmsd_maskz:
+  case X86::BI__builtin_ia32_fixupimmss_mask:
+  case X86::BI__builtin_ia32_fixupimmss_maskz:
+  case X86::BI__builtin_ia32_fixupimmpd128_mask:
+  case X86::BI__builtin_ia32_fixupimmpd128_maskz:
+  case X86::BI__builtin_ia32_fixupimmpd256_mask:
+  case X86::BI__builtin_ia32_fixupimmpd256_maskz:
+  case X86::BI__builtin_ia32_fixupimmps128_mask:
+  case X86::BI__builtin_ia32_fixupimmps128_maskz:
+  case X86::BI__builtin_ia32_fixupimmps256_mask:
+  case X86::BI__builtin_ia32_fixupimmps256_maskz:
+  case X86::BI__builtin_ia32_pternlogd512_mask:
+  case X86::BI__builtin_ia32_pternlogd512_maskz:
+  case X86::BI__builtin_ia32_pternlogq512_mask:
+  case X86::BI__builtin_ia32_pternlogq512_maskz:
+  case X86::BI__builtin_ia32_pternlogd128_mask:
+  case X86::BI__builtin_ia32_pternlogd128_maskz:
+  case X86::BI__builtin_ia32_pternlogd256_mask:
+  case X86::BI__builtin_ia32_pternlogd256_maskz:
+  case X86::BI__builtin_ia32_pternlogq128_mask:
+  case X86::BI__builtin_ia32_pternlogq128_maskz:
+  case X86::BI__builtin_ia32_pternlogq256_mask:
+  case X86::BI__builtin_ia32_pternlogq256_maskz:
+    i = 3; l = 0; u = 255;
+    break;
+  case X86::BI__builtin_ia32_pcmpestrm128:
+  case X86::BI__builtin_ia32_pcmpestri128:
+  case X86::BI__builtin_ia32_pcmpestria128:
+  case X86::BI__builtin_ia32_pcmpestric128:
+  case X86::BI__builtin_ia32_pcmpestrio128:
+  case X86::BI__builtin_ia32_pcmpestris128:
+  case X86::BI__builtin_ia32_pcmpestriz128:
+    i = 4; l = -128; u = 255;
+    break;
+  case X86::BI__builtin_ia32_rndscalesd_round_mask:
+  case X86::BI__builtin_ia32_rndscaless_round_mask:
+    i = 4; l = 0; u = 255;
+    break;
   }
   return SemaBuiltinConstantArgRange(TheCall, i, l, u);
 }
@@ -1542,10 +2303,10 @@
 }
 
 static bool isValidOrderingForOp(int64_t Ordering, AtomicExpr::AtomicOp Op) {
-  if (Ordering < AtomicExpr::AO_ABI_memory_order_relaxed ||
-      Ordering > AtomicExpr::AO_ABI_memory_order_seq_cst)
+  if (!llvm::isValidAtomicOrderingCABI(Ordering))
     return false;
 
+  auto OrderingCABI = (llvm::AtomicOrderingCABI)Ordering;
   switch (Op) {
   case AtomicExpr::AO__c11_atomic_init:
     llvm_unreachable("There is no ordering argument for an init");
@@ -1553,15 +2314,15 @@
   case AtomicExpr::AO__c11_atomic_load:
   case AtomicExpr::AO__atomic_load_n:
   case AtomicExpr::AO__atomic_load:
-    return Ordering != AtomicExpr::AO_ABI_memory_order_release &&
-           Ordering != AtomicExpr::AO_ABI_memory_order_acq_rel;
+    return OrderingCABI != llvm::AtomicOrderingCABI::release &&
+           OrderingCABI != llvm::AtomicOrderingCABI::acq_rel;
 
   case AtomicExpr::AO__c11_atomic_store:
   case AtomicExpr::AO__atomic_store:
   case AtomicExpr::AO__atomic_store_n:
-    return Ordering != AtomicExpr::AO_ABI_memory_order_consume &&
-           Ordering != AtomicExpr::AO_ABI_memory_order_acquire &&
-           Ordering != AtomicExpr::AO_ABI_memory_order_acq_rel;
+    return OrderingCABI != llvm::AtomicOrderingCABI::consume &&
+           OrderingCABI != llvm::AtomicOrderingCABI::acquire &&
+           OrderingCABI != llvm::AtomicOrderingCABI::acq_rel;
 
   default:
     return true;
@@ -1580,6 +2341,8 @@
     // C    __c11_atomic_load(A *, int)
     Load,
     // void __atomic_load(A *, CP, int)
+    LoadCopy,
+    // void __atomic_store(A *, CP, int)
     Copy,
     // C    __c11_atomic_add(A *, M, int)
     Arithmetic,
@@ -1592,8 +2355,8 @@
     // bool __atomic_compare_exchange(A *, C *, CP, bool, int, int)
     GNUCmpXchg
   } Form = Init;
-  const unsigned NumArgs[] = { 2, 2, 3, 3, 3, 4, 5, 6 };
-  const unsigned NumVals[] = { 1, 0, 1, 1, 1, 2, 2, 3 };
+  const unsigned NumArgs[] = { 2, 2, 3, 3, 3, 3, 4, 5, 6 };
+  const unsigned NumVals[] = { 1, 0, 1, 1, 1, 1, 2, 2, 3 };
   // where:
   //   C is an appropriate type,
   //   A is volatile _Atomic(C) for __c11 builtins and is C for GNU builtins,
@@ -1623,8 +2386,11 @@
     Form = Load;
     break;
 
-  case AtomicExpr::AO__c11_atomic_store:
   case AtomicExpr::AO__atomic_load:
+    Form = LoadCopy;
+    break;
+
+  case AtomicExpr::AO__c11_atomic_store:
   case AtomicExpr::AO__atomic_store:
   case AtomicExpr::AO__atomic_store_n:
     Form = Copy;
@@ -1688,7 +2454,11 @@
 
   // Inspect the first argument of the atomic operation.
   Expr *Ptr = TheCall->getArg(0);
-  Ptr = DefaultFunctionArrayLvalueConversion(Ptr).get();
+  ExprResult ConvertedPtr = DefaultFunctionArrayLvalueConversion(Ptr);
+  if (ConvertedPtr.isInvalid())
+    return ExprError();
+
+  Ptr = ConvertedPtr.get();
   const PointerType *pointerType = Ptr->getType()->getAs<PointerType>();
   if (!pointerType) {
     Diag(DRE->getLocStart(), diag::err_atomic_builtin_must_be_pointer)
@@ -1711,7 +2481,7 @@
       return ExprError();
     }
     ValType = AtomTy->getAs<AtomicType>()->getValueType();
-  } else if (Form != Load && Op != AtomicExpr::AO__atomic_load) {
+  } else if (Form != Load && Form != LoadCopy) {
     if (ValType.isConstQualified()) {
       Diag(DRE->getLocStart(), diag::err_atomic_op_needs_non_const_pointer)
         << Ptr->getType() << Ptr->getSourceRange();
@@ -1772,10 +2542,11 @@
 
   // atomic_fetch_or takes a pointer to a volatile 'A'.  We shouldn't let the
   // volatile-ness of the pointee-type inject itself into the result or the
-  // other operands.
+  // other operands. Similarly atomic_load can take a pointer to a const 'A'.
   ValType.removeLocalVolatile();
+  ValType.removeLocalConst();
   QualType ResultType = ValType;
-  if (Form == Copy || Form == GNUXchg || Form == Init)
+  if (Form == Copy || Form == LoadCopy || Form == GNUXchg || Form == Init)
     ResultType = Context.VoidTy;
   else if (Form == C11CmpXchg || Form == GNUCmpXchg)
     ResultType = Context.BoolTy;
@@ -1786,10 +2557,6 @@
   if (!IsC11 && !IsN)
     ByValType = Ptr->getType();
 
-  // FIXME: __atomic_load allows the first argument to be a a pointer to const
-  // but not the second argument. We need to manually remove possible const
-  // qualifiers.
-
   // The first argument --- the pointer --- has a fixed type; we
   // deduce the types of the rest of the arguments accordingly.  Walk
   // the remaining arguments, converting them to the deduced value type.
@@ -1856,6 +2623,7 @@
   case Load:
     SubExprs.push_back(TheCall->getArg(1)); // Order
     break;
+  case LoadCopy:
   case Copy:
   case Arithmetic:
   case Xchg:
@@ -1905,7 +2673,6 @@
   return AE;
 }
 
-
 /// checkBuiltinArgument - Given a call to a builtin function, perform
 /// normal type-checking on the given argument, updating the call in
 /// place.  This is useful when a builtin function requires custom
@@ -2483,11 +3250,11 @@
       // Get the last formal in the current function.
       const ParmVarDecl *LastArg;
       if (CurBlock)
-        LastArg = *(CurBlock->TheDecl->param_end()-1);
+        LastArg = CurBlock->TheDecl->parameters().back();
       else if (FunctionDecl *FD = getCurFunctionDecl())
-        LastArg = *(FD->param_end()-1);
+        LastArg = FD->parameters().back();
       else
-        LastArg = *(getCurMethodDecl()->param_end()-1);
+        LastArg = getCurMethodDecl()->parameters().back();
       SecondArgIsLastNamedArgument = PV == LastArg;
 
       Type = PV->getType();
@@ -2695,8 +3462,7 @@
 
   // Determine which of the following types of shufflevector we're checking:
   // 1) unary, vector mask: (lhs, mask)
-  // 2) binary, vector mask: (lhs, rhs, mask)
-  // 3) binary, scalar mask: (lhs, rhs, index, ..., index)
+  // 2) binary, scalar mask: (lhs, rhs, index, ..., index)
   QualType resType = TheCall->getArg(0)->getType();
   unsigned numElements = 0;
 
@@ -3115,7 +3881,6 @@
   return false;
 }
 
-
 /// SemaBuiltinSetjmp - Handle __builtin_setjmp(void *env[5]).
 /// This checks that the target supports __builtin_setjmp.
 bool Sema::SemaBuiltinSetjmp(CallExpr *TheCall) {
@@ -3126,12 +3891,68 @@
 }
 
 namespace {
+class UncoveredArgHandler { // Tracks the first uncovered data argument.
+  enum { Unknown = -1, AllCovered = -2 }; // sentinel states, both negative
+  signed FirstUncoveredArg; // an argument index, or one of the sentinels
+  SmallVector<const Expr *, 4> DiagnosticExprs; // strings at that index
+
+public:
+  UncoveredArgHandler() : FirstUncoveredArg(Unknown) { }
+  /// True when a non-negative uncovered argument index is recorded.
+  bool hasUncoveredArg() const {
+    return (FirstUncoveredArg >= 0);
+  }
+  /// The recorded index; asserts that hasUncoveredArg() holds.
+  unsigned getUncoveredArg() const {
+    assert(hasUncoveredArg() && "no uncovered argument");
+    return FirstUncoveredArg;
+  }
+  /// Marks all arguments covered; Update() calls after this are ignored.
+  void setAllCovered() {
+    // A string has been found with all arguments covered, so clear out
+    // the diagnostics.
+    DiagnosticExprs.clear();
+    FirstUncoveredArg = AllCovered;
+  }
+  /// Merges in the first uncovered argument index found for StrExpr.
+  void Update(signed NewFirstUncoveredArg, const Expr *StrExpr) {
+    assert(NewFirstUncoveredArg >= 0 && "Outside range");
+
+    // Don't update if a previous string covers all arguments.
+    if (FirstUncoveredArg == AllCovered)
+      return;
+
+    // UncoveredArgHandler tracks the highest uncovered argument index
+    // and with it all the strings that match this index.
+    if (NewFirstUncoveredArg == FirstUncoveredArg)
+      DiagnosticExprs.push_back(StrExpr);
+    else if (NewFirstUncoveredArg > FirstUncoveredArg) {
+      DiagnosticExprs.clear();
+      DiagnosticExprs.push_back(StrExpr);
+      FirstUncoveredArg = NewFirstUncoveredArg;
+    }
+  }
+  /// Emits warn_printf_data_arg_not_used located at ArgExpr.
+  void Diagnose(Sema &S, bool IsFunctionCall, const Expr *ArgExpr);
+};
+
 enum StringLiteralCheckType {
   SLCT_NotALiteral,
   SLCT_UncheckedLiteral,
   SLCT_CheckedLiteral
 };
-}
+} // end anonymous namespace
+
+static void CheckFormatString(Sema &S, const StringLiteral *FExpr,
+                              const Expr *OrigFormatExpr,
+                              ArrayRef<const Expr *> Args,
+                              bool HasVAListArg, unsigned format_idx,
+                              unsigned firstDataArg,
+                              Sema::FormatStringType Type,
+                              bool inFunctionCall,
+                              Sema::VariadicCallType CallType,
+                              llvm::SmallBitVector &CheckedVarArgs,
+                              UncoveredArgHandler &UncoveredArg);
 
 // Determine if an expression is a string literal or constant string.
 // If this function returns false on the arguments to a function expecting a
@@ -3142,7 +3963,8 @@
                       bool HasVAListArg, unsigned format_idx,
                       unsigned firstDataArg, Sema::FormatStringType Type,
                       Sema::VariadicCallType CallType, bool InFunctionCall,
-                      llvm::SmallBitVector &CheckedVarArgs) {
+                      llvm::SmallBitVector &CheckedVarArgs,
+                      UncoveredArgHandler &UncoveredArg) {
  tryAgain:
   if (E->isTypeDependent() || E->isValueDependent())
     return SLCT_NotALiteral;
@@ -3163,17 +3985,39 @@
     // completely checked only if both sub-expressions were checked.
     const AbstractConditionalOperator *C =
         cast<AbstractConditionalOperator>(E);
-    StringLiteralCheckType Left =
-        checkFormatStringExpr(S, C->getTrueExpr(), Args,
-                              HasVAListArg, format_idx, firstDataArg,
-                              Type, CallType, InFunctionCall, CheckedVarArgs);
-    if (Left == SLCT_NotALiteral)
-      return SLCT_NotALiteral;
+
+    // Determine whether it is necessary to check both sub-expressions, for
+    // example, because the condition expression is a constant that can be
+    // evaluated at compile time.
+    bool CheckLeft = true, CheckRight = true;
+
+    bool Cond;
+    if (C->getCond()->EvaluateAsBooleanCondition(Cond, S.getASTContext())) {
+      if (Cond)
+        CheckRight = false;
+      else
+        CheckLeft = false;
+    }
+
+    StringLiteralCheckType Left;
+    if (!CheckLeft)
+      Left = SLCT_UncheckedLiteral;
+    else {
+      Left = checkFormatStringExpr(S, C->getTrueExpr(), Args,
+                                   HasVAListArg, format_idx, firstDataArg,
+                                   Type, CallType, InFunctionCall,
+                                   CheckedVarArgs, UncoveredArg);
+      if (Left == SLCT_NotALiteral || !CheckRight)
+        return Left;
+    }
+
     StringLiteralCheckType Right =
         checkFormatStringExpr(S, C->getFalseExpr(), Args,
                               HasVAListArg, format_idx, firstDataArg,
-                              Type, CallType, InFunctionCall, CheckedVarArgs);
-    return Left < Right ? Left : Right;
+                              Type, CallType, InFunctionCall, CheckedVarArgs,
+                              UncoveredArg);
+
+    return (CheckLeft && Left < Right) ? Left : Right;
   }
 
   case Stmt::ImplicitCastExprClass: {
@@ -3224,7 +4068,8 @@
           return checkFormatStringExpr(S, Init, Args,
                                        HasVAListArg, format_idx,
                                        firstDataArg, Type, CallType,
-                                       /*InFunctionCall*/false, CheckedVarArgs);
+                                       /*InFunctionCall*/false, CheckedVarArgs,
+                                       UncoveredArg);
         }
       }
 
@@ -3279,7 +4124,7 @@
         return checkFormatStringExpr(S, Arg, Args,
                                      HasVAListArg, format_idx, firstDataArg,
                                      Type, CallType, InFunctionCall,
-                                     CheckedVarArgs);
+                                     CheckedVarArgs, UncoveredArg);
       } else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
         unsigned BuiltinID = FD->getBuiltinID();
         if (BuiltinID == Builtin::BI__builtin___CFStringMakeConstantString ||
@@ -3288,7 +4133,8 @@
           return checkFormatStringExpr(S, Arg, Args,
                                        HasVAListArg, format_idx,
                                        firstDataArg, Type, CallType,
-                                       InFunctionCall, CheckedVarArgs);
+                                       InFunctionCall, CheckedVarArgs,
+                                       UncoveredArg);
         }
       }
     }
@@ -3305,8 +4151,9 @@
       StrE = cast<StringLiteral>(E);
 
     if (StrE) {
-      S.CheckFormatString(StrE, E, Args, HasVAListArg, format_idx, firstDataArg,
-                          Type, InFunctionCall, CallType, CheckedVarArgs);
+      CheckFormatString(S, StrE, E, Args, HasVAListArg, format_idx,
+                        firstDataArg, Type, InFunctionCall, CallType,
+                        CheckedVarArgs, UncoveredArg);
       return SLCT_CheckedLiteral;
     }
 
@@ -3375,10 +4222,20 @@
   // C string (e.g. "%d")
   // ObjC string uses the same format specifiers as C string, so we can use
   // the same format string checking logic for both ObjC and C strings.
+  UncoveredArgHandler UncoveredArg;
   StringLiteralCheckType CT =
       checkFormatStringExpr(*this, OrigFormatExpr, Args, HasVAListArg,
                             format_idx, firstDataArg, Type, CallType,
-                            /*IsFunctionCall*/true, CheckedVarArgs);
+                            /*IsFunctionCall*/true, CheckedVarArgs,
+                            UncoveredArg);
+
+  // Generate a diagnostic where an uncovered argument is detected.
+  if (UncoveredArg.hasUncoveredArg()) {
+    unsigned ArgIdx = UncoveredArg.getUncoveredArg() + firstDataArg;
+    assert(ArgIdx < Args.size() && "ArgIdx outside bounds");
+    UncoveredArg.Diagnose(*this, /*IsFunctionCall*/true, Args[ArgIdx]);
+  }
+
   if (CT != SLCT_NotALiteral)
     // Literal format string found, check done!
     return CT == SLCT_CheckedLiteral;
@@ -3441,6 +4298,8 @@
   bool inFunctionCall;
   Sema::VariadicCallType CallType;
   llvm::SmallBitVector &CheckedVarArgs;
+  UncoveredArgHandler &UncoveredArg;
+
 public:
   CheckFormatHandler(Sema &s, const StringLiteral *fexpr,
                      const Expr *origFormatExpr,
@@ -3449,13 +4308,15 @@
                      ArrayRef<const Expr *> Args,
                      unsigned formatIdx, bool inFunctionCall,
                      Sema::VariadicCallType callType,
-                     llvm::SmallBitVector &CheckedVarArgs)
+                     llvm::SmallBitVector &CheckedVarArgs,
+                     UncoveredArgHandler &UncoveredArg)
     : S(s), FExpr(fexpr), OrigFormatExpr(origFormatExpr), FSType(type),
-      FirstDataArg(firstDataArg), NumDataArgs(numDataArgs), Beg(beg),
-      HasVAListArg(hasVAListArg), Args(Args), FormatIdx(formatIdx),
+      FirstDataArg(firstDataArg), NumDataArgs(numDataArgs),
+      Beg(beg), HasVAListArg(hasVAListArg),
+      Args(Args), FormatIdx(formatIdx),
       usesPositionalArgs(false), atFirstArg(true),
       inFunctionCall(inFunctionCall), CallType(callType),
-      CheckedVarArgs(CheckedVarArgs) {
+      CheckedVarArgs(CheckedVarArgs), UncoveredArg(UncoveredArg) {
     CoveredArgs.resize(numDataArgs);
     CoveredArgs.reset();
   }
@@ -3490,12 +4351,11 @@
   void HandleNullChar(const char *nullCharacter) override;
 
   template <typename Range>
-  static void EmitFormatDiagnostic(Sema &S, bool inFunctionCall,
-                                   const Expr *ArgumentExpr,
-                                   PartialDiagnostic PDiag,
-                                   SourceLocation StringLoc,
-                                   bool IsStringLocation, Range StringRange,
-                                   ArrayRef<FixItHint> Fixit = None);
+  static void
+  EmitFormatDiagnostic(Sema &S, bool inFunctionCall, const Expr *ArgumentExpr,
+                       const PartialDiagnostic &PDiag, SourceLocation StringLoc,
+                       bool IsStringLocation, Range StringRange,
+                       ArrayRef<FixItHint> Fixit = None);
 
 protected:
   bool HandleInvalidConversionSpecifier(unsigned argIndex, SourceLocation Loc,
@@ -3524,7 +4384,7 @@
                             bool IsStringLocation, Range StringRange,
                             ArrayRef<FixItHint> Fixit = None);
 };
-}
+} // end anonymous namespace
 
 SourceRange CheckFormatHandler::getFormatStringRange() {
   return OrigFormatExpr->getSourceRange();
@@ -3686,26 +4546,44 @@
 }
 
 void CheckFormatHandler::DoneProcessing() {
-    // Does the number of data arguments exceed the number of
-    // format conversions in the format string?
+  // Report to UncoveredArg the first data argument that no conversion
+  // covered, or that all were covered. Skipped when HasVAListArg is set.
   if (!HasVAListArg) {
       // Find any arguments that weren't covered.
     CoveredArgs.flip();
     signed notCoveredArg = CoveredArgs.find_first();
     if (notCoveredArg >= 0) {
       assert((unsigned)notCoveredArg < NumDataArgs);
-      if (const Expr *E = getDataArg((unsigned) notCoveredArg)) {
-        SourceLocation Loc = E->getLocStart();
-        if (!S.getSourceManager().isInSystemMacro(Loc)) {
-          EmitFormatDiagnostic(S.PDiag(diag::warn_printf_data_arg_not_used),
-                               Loc, /*IsStringLocation*/false,
-                               getFormatStringRange());
-        }
-      }
+      UncoveredArg.Update(notCoveredArg, OrigFormatExpr);
+    } else {
+      UncoveredArg.setAllCovered();
     }
   }
 }
 
+void UncoveredArgHandler::Diagnose(Sema &S, bool IsFunctionCall,
+                                   const Expr *ArgExpr) {
+  assert(hasUncoveredArg() && DiagnosticExprs.size() > 0 &&
+         "Invalid state");
+  // Report the unused data argument at ArgExpr, if there is one.
+  if (!ArgExpr)
+    return;
+
+  SourceLocation Loc = ArgExpr->getLocStart();
+  // Stay silent when the argument is located in a system macro.
+  if (S.getSourceManager().isInSystemMacro(Loc))
+    return;
+  // Attach the source range of every recorded format string.
+  PartialDiagnostic PDiag = S.PDiag(diag::warn_printf_data_arg_not_used);
+  for (auto E : DiagnosticExprs)
+    PDiag << E->getSourceRange();
+  // The first recorded string supplies ArgumentExpr and StringRange.
+  CheckFormatHandler::EmitFormatDiagnostic(
+                                  S, IsFunctionCall, DiagnosticExprs[0],
+                                  PDiag, Loc, /*IsStringLocation*/false,
+                                  DiagnosticExprs[0]->getSourceRange());
+}
+
 bool
 CheckFormatHandler::HandleInvalidConversionSpecifier(unsigned argIndex,
                                                      SourceLocation Loc,
@@ -3713,7 +4591,6 @@
                                                      unsigned specifierLen,
                                                      const char *csStart,
                                                      unsigned csLen) {
-  
   bool keepGoing = true;
   if (argIndex < NumDataArgs) {
     // Consider the argument coverered, even though the specifier doesn't
@@ -3789,6 +4666,10 @@
     EmitFormatDiagnostic(
       PDiag, getLocationOfByte(CS.getStart()), /*IsStringLocation*/true,
       getSpecifierRange(startSpecifier, specifierLen));
+
+    // Since more arguments than conversion tokens are given, by extension
+    // all arguments are covered, so mark this as so.
+    UncoveredArg.setAllCovered();
     return false;
   }
   return true;
@@ -3831,14 +4712,11 @@
 /// templated so it can accept either a CharSourceRange or a SourceRange.
 ///
 /// \param FixIt optional fix it hint for the format string.
-template<typename Range>
-void CheckFormatHandler::EmitFormatDiagnostic(Sema &S, bool InFunctionCall,
-                                              const Expr *ArgumentExpr,
-                                              PartialDiagnostic PDiag,
-                                              SourceLocation Loc,
-                                              bool IsStringLocation,
-                                              Range StringRange,
-                                              ArrayRef<FixItHint> FixIt) {
+template <typename Range>
+void CheckFormatHandler::EmitFormatDiagnostic(
+    Sema &S, bool InFunctionCall, const Expr *ArgumentExpr,
+    const PartialDiagnostic &PDiag, SourceLocation Loc, bool IsStringLocation,
+    Range StringRange, ArrayRef<FixItHint> FixIt) {
   if (InFunctionCall) {
     const Sema::SemaDiagnosticBuilder &D = S.Diag(Loc, PDiag);
     D << StringRange;
@@ -3869,10 +4747,12 @@
                      ArrayRef<const Expr *> Args,
                      unsigned formatIdx, bool inFunctionCall,
                      Sema::VariadicCallType CallType,
-                     llvm::SmallBitVector &CheckedVarArgs)
+                     llvm::SmallBitVector &CheckedVarArgs,
+                     UncoveredArgHandler &UncoveredArg)
     : CheckFormatHandler(s, fexpr, origFormatExpr, type, firstDataArg,
                          numDataArgs, beg, hasVAListArg, Args,
-                         formatIdx, inFunctionCall, CallType, CheckedVarArgs)
+                         formatIdx, inFunctionCall, CallType, CheckedVarArgs,
+                         UncoveredArg)
   {}
 
   bool isObjCContext() const {
@@ -3925,7 +4805,7 @@
                                            const char *conversionPosition) 
                                              override;
 };
-}
+} // end anonymous namespace
 
 bool CheckPrintfHandler::HandleInvalidPrintfConversionSpecifier(
                                       const analyze_printf::PrintfSpecifier &FS,
@@ -3944,7 +4824,6 @@
                                const analyze_format_string::OptionalAmount &Amt,
                                unsigned k, const char *startSpecifier,
                                unsigned specifierLen) {
-
   if (Amt.hasDataArgument()) {
     if (!HasVAListArg) {
       unsigned argIndex = Amt.getArgIndex();
@@ -4156,7 +5035,6 @@
                                             &FS,
                                           const char *startSpecifier,
                                           unsigned specifierLen) {
-
   using namespace analyze_format_string;
   using namespace analyze_printf;  
   const PrintfConversionSpecifier &CS = FS.getConversionSpecifier();
@@ -4569,7 +5447,6 @@
                            E->getLocStart(),
                            /*IsStringLocation*/ false, SpecRange,
                            FixItHint::CreateReplacement(SpecRange, os.str()));
-
     } else {
       // The canonical type for formatting this value is different from the
       // actual type of the expression. (This occurs, for example, with Darwin's
@@ -4709,10 +5586,12 @@
                     ArrayRef<const Expr *> Args,
                     unsigned formatIdx, bool inFunctionCall,
                     Sema::VariadicCallType CallType,
-                    llvm::SmallBitVector &CheckedVarArgs)
+                    llvm::SmallBitVector &CheckedVarArgs,
+                    UncoveredArgHandler &UncoveredArg)
     : CheckFormatHandler(s, fexpr, origFormatExpr, type, firstDataArg,
-                         numDataArgs, beg, hasVAListArg, Args, formatIdx,
-                         inFunctionCall, CallType, CheckedVarArgs)
+                         numDataArgs, beg, hasVAListArg,
+                         Args, formatIdx, inFunctionCall, CallType,
+                         CheckedVarArgs, UncoveredArg)
   {}
   
   bool HandleScanfSpecifier(const analyze_scanf::ScanfSpecifier &FS,
@@ -4726,7 +5605,7 @@
 
   void HandleIncompleteScanList(const char *start, const char *end) override;
 };
-}
+} // end anonymous namespace
 
 void CheckScanfHandler::HandleIncompleteScanList(const char *start,
                                                  const char *end) {
@@ -4753,7 +5632,6 @@
                                        const analyze_scanf::ScanfSpecifier &FS,
                                        const char *startSpecifier,
                                        unsigned specifierLen) {
-  
   using namespace analyze_scanf;
   using namespace analyze_format_string;  
 
@@ -4873,28 +5751,31 @@
   return true;
 }
 
-void Sema::CheckFormatString(const StringLiteral *FExpr,
-                             const Expr *OrigFormatExpr,
-                             ArrayRef<const Expr *> Args,
-                             bool HasVAListArg, unsigned format_idx,
-                             unsigned firstDataArg, FormatStringType Type,
-                             bool inFunctionCall, VariadicCallType CallType,
-                             llvm::SmallBitVector &CheckedVarArgs) {
-
+static void CheckFormatString(Sema &S, const StringLiteral *FExpr,
+                              const Expr *OrigFormatExpr,
+                              ArrayRef<const Expr *> Args,
+                              bool HasVAListArg, unsigned format_idx,
+                              unsigned firstDataArg,
+                              Sema::FormatStringType Type,
+                              bool inFunctionCall,
+                              Sema::VariadicCallType CallType,
+                              llvm::SmallBitVector &CheckedVarArgs,
+                              UncoveredArgHandler &UncoveredArg) {
   // CHECK: is the format string a wide literal?
   if (!FExpr->isAscii() && !FExpr->isUTF8()) {
     CheckFormatHandler::EmitFormatDiagnostic(
-      *this, inFunctionCall, Args[format_idx],
-      PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),
+      S, inFunctionCall, Args[format_idx],
+      S.PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),
       /*IsStringLocation*/true, OrigFormatExpr->getSourceRange());
     return;
   }
-  
+
   // Str - The format string.  NOTE: this is NOT null-terminated!
   StringRef StrRef = FExpr->getString();
   const char *Str = StrRef.data();
   // Account for cases where the string literal is truncated in a declaration.
-  const ConstantArrayType *T = Context.getAsConstantArrayType(FExpr->getType());
+  const ConstantArrayType *T =
+    S.Context.getAsConstantArrayType(FExpr->getType());
   assert(T && "String literal not of constant array type!");
   size_t TypeSize = T->getSize().getZExtValue();
   size_t StrLen = std::min(std::max(TypeSize, size_t(1)) - 1, StrRef.size());
@@ -4905,8 +5786,8 @@
   if (TypeSize <= StrRef.size() &&
       StrRef.substr(0, TypeSize).find('\0') == StringRef::npos) {
     CheckFormatHandler::EmitFormatDiagnostic(
-        *this, inFunctionCall, Args[format_idx],
-        PDiag(diag::warn_printf_format_string_not_null_terminated),
+        S, inFunctionCall, Args[format_idx],
+        S.PDiag(diag::warn_printf_format_string_not_null_terminated),
         FExpr->getLocStart(),
         /*IsStringLocation=*/true, OrigFormatExpr->getSourceRange());
     return;
@@ -4915,32 +5796,34 @@
   // CHECK: empty format string?
   if (StrLen == 0 && numDataArgs > 0) {
     CheckFormatHandler::EmitFormatDiagnostic(
-      *this, inFunctionCall, Args[format_idx],
-      PDiag(diag::warn_empty_format_string), FExpr->getLocStart(),
+      S, inFunctionCall, Args[format_idx],
+      S.PDiag(diag::warn_empty_format_string), FExpr->getLocStart(),
       /*IsStringLocation*/true, OrigFormatExpr->getSourceRange());
     return;
   }
-  
-  if (Type == FST_Printf || Type == FST_NSString ||
-      Type == FST_FreeBSDKPrintf || Type == FST_OSLog ||
-      Type == FST_OSTrace) {
-    CheckPrintfHandler H(*this, FExpr, OrigFormatExpr, Type, firstDataArg,
+
+  if (Type == Sema::FST_Printf || Type == Sema::FST_NSString ||
+      Type == Sema::FST_FreeBSDKPrintf || Type == Sema::FST_OSLog ||
+      Type == Sema::FST_OSTrace) {
+    CheckPrintfHandler H(S, FExpr, OrigFormatExpr, Type, firstDataArg,
                          numDataArgs, Str, HasVAListArg, Args, format_idx,
-                         inFunctionCall, CallType, CheckedVarArgs);
-  
+                         inFunctionCall, CallType, CheckedVarArgs,
+                         UncoveredArg);
+
     if (!analyze_format_string::ParsePrintfString(H, Str, Str + StrLen,
-                                                  getLangOpts(),
-                                                  Context.getTargetInfo(),
-                                                  Type == FST_FreeBSDKPrintf))
+                                                  S.getLangOpts(),
+                                                  S.Context.getTargetInfo(),
+                                            Type == Sema::FST_FreeBSDKPrintf))
       H.DoneProcessing();
-  } else if (Type == FST_Scanf) {
-    CheckScanfHandler H(*this, FExpr, OrigFormatExpr, Type, firstDataArg, numDataArgs,
+  } else if (Type == Sema::FST_Scanf) {
+    CheckScanfHandler H(S, FExpr, OrigFormatExpr, Type, firstDataArg, numDataArgs,
                         Str, HasVAListArg, Args, format_idx,
-                        inFunctionCall, CallType, CheckedVarArgs);
-    
+                        inFunctionCall, CallType, CheckedVarArgs,
+                        UncoveredArg);
+
     if (!analyze_format_string::ParseScanfString(H, Str, Str + StrLen,
-                                                 getLangOpts(),
-                                                 Context.getTargetInfo()))
+                                                 S.getLangOpts(),
+                                                 S.Context.getTargetInfo()))
       H.DoneProcessing();
   } // TODO: handle other formats
 }
@@ -5353,7 +6236,6 @@
 
   emitReplacement(*this, Call->getExprLoc(),
                   Call->getCallee()->getSourceRange(), NewAbsKind, ArgType);
-  return;
 }
 
 //===--- CHECK: Standard memory functions ---------------------------------===//
@@ -5455,13 +6337,15 @@
 
   // It is possible to have a non-standard definition of memset.  Validate
   // we have enough arguments, and if not, abort further checking.
-  unsigned ExpectedNumArgs = (BId == Builtin::BIstrndup ? 2 : 3);
+  unsigned ExpectedNumArgs =
+      (BId == Builtin::BIstrndup || BId == Builtin::BIbzero ? 2 : 3);
   if (Call->getNumArgs() < ExpectedNumArgs)
     return;
 
-  unsigned LastArg = (BId == Builtin::BImemset ||
+  unsigned LastArg = (BId == Builtin::BImemset || BId == Builtin::BIbzero ||
                       BId == Builtin::BIstrndup ? 1 : 2);
-  unsigned LenArg = (BId == Builtin::BIstrndup ? 1 : 2);
+  unsigned LenArg =
+      (BId == Builtin::BIbzero || BId == Builtin::BIstrndup ? 1 : 2);
   const Expr *LenExpr = Call->getArg(LenArg)->IgnoreParenImpCasts();
 
   if (CheckMemorySizeofForComparison(*this, LenExpr, FnName,
@@ -5473,6 +6357,13 @@
   const Expr *SizeOfArg = getSizeOfExprArg(LenExpr);
   llvm::FoldingSetNodeID SizeOfArgID;
 
+  // Although widely used, 'bzero' is not a standard function. Be more strict
+  // with the argument types before allowing diagnostics and only allow the
+  // form bzero(ptr, sizeof(...)).
+  QualType FirstArgTy = Call->getArg(0)->IgnoreParenImpCasts()->getType();
+  if (BId == Builtin::BIbzero && !FirstArgTy->getAs<PointerType>())
+    return;
+
   for (unsigned ArgIdx = 0; ArgIdx != LastArg; ++ArgIdx) {
     const Expr *Dest = Call->getArg(ArgIdx)->IgnoreParenImpCasts();
     SourceRange ArgRange = Call->getArg(ArgIdx)->getSourceRange();
@@ -5606,7 +6497,6 @@
         << FixItHint::CreateInsertion(ArgRange.getBegin(), "(void*)"));
     break;
   }
-
 }
 
 // A little helper routine: ignore addition and subtraction of integer literals.
@@ -5821,10 +6711,12 @@
 
 //===--- CHECK: Return Address of Stack Variable --------------------------===//
 
-static Expr *EvalVal(Expr *E, SmallVectorImpl<DeclRefExpr *> &refVars,
-                     Decl *ParentDecl);
-static Expr *EvalAddr(Expr* E, SmallVectorImpl<DeclRefExpr *> &refVars,
-                      Decl *ParentDecl);
+static const Expr *EvalVal(const Expr *E,
+                           SmallVectorImpl<const DeclRefExpr *> &refVars,
+                           const Decl *ParentDecl);
+static const Expr *EvalAddr(const Expr *E,
+                            SmallVectorImpl<const DeclRefExpr *> &refVars,
+                            const Decl *ParentDecl);
 
 /// CheckReturnStackAddr - Check if a return statement returns the address
 ///   of a stack variable.
@@ -5832,8 +6724,8 @@
 CheckReturnStackAddr(Sema &S, Expr *RetValExp, QualType lhsType,
                      SourceLocation ReturnLoc) {
 
-  Expr *stackE = nullptr;
-  SmallVector<DeclRefExpr *, 8> refVars;
+  const Expr *stackE = nullptr;
+  SmallVector<const DeclRefExpr *, 8> refVars;
 
   // Perform checking for returned stack addresses, local blocks,
   // label addresses or references to temporaries.
@@ -5847,6 +6739,12 @@
   if (!stackE)
     return; // Nothing suspicious was found.
 
+  // Parameters are initialized in the calling scope, so taking the address
+  // of a parameter reference doesn't need a warning.
+  for (auto *DRE : refVars)
+    if (isa<ParmVarDecl>(DRE->getDecl()))
+      return;
+
   SourceLocation diagLoc;
   SourceRange diagRange;
   if (refVars.empty()) {
@@ -5861,7 +6759,8 @@
     diagRange = refVars[0]->getSourceRange();
   }
 
-  if (DeclRefExpr *DR = dyn_cast<DeclRefExpr>(stackE)) { //address of local var.
+  if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(stackE)) {
+    // address of local var
     S.Diag(diagLoc, diag::warn_ret_stack_addr_ref) << lhsType->isReferenceType()
      << DR->getDecl()->getDeclName() << diagRange;
   } else if (isa<BlockExpr>(stackE)) { // local block.
@@ -5869,6 +6768,13 @@
   } else if (isa<AddrLabelExpr>(stackE)) { // address of label.
     S.Diag(diagLoc, diag::warn_ret_addr_label) << diagRange;
   } else { // local temporary.
+    // If there is an LValue->RValue conversion, then the value of the
+    // reference type is used, not the reference.
+    if (auto *ICE = dyn_cast<ImplicitCastExpr>(RetValExp)) {
+      if (ICE->getCastKind() == CK_LValueToRValue) {
+        return;
+      }
+    }
     S.Diag(diagLoc, diag::warn_ret_local_temp_addr_ref)
      << lhsType->isReferenceType() << diagRange;
   }
@@ -5876,12 +6782,12 @@
   // Display the "trail" of reference variables that we followed until we
   // found the problematic expression using notes.
   for (unsigned i = 0, e = refVars.size(); i != e; ++i) {
-    VarDecl *VD = cast<VarDecl>(refVars[i]->getDecl());
+    const VarDecl *VD = cast<VarDecl>(refVars[i]->getDecl());
     // If this var binds to another reference var, show the range of the next
     // var, otherwise the var binds to the problematic expression, in which case
     // show the range of the expression.
-    SourceRange range = (i < e-1) ? refVars[i+1]->getSourceRange()
-                                  : stackE->getSourceRange();
+    SourceRange range = (i < e - 1) ? refVars[i + 1]->getSourceRange()
+                                    : stackE->getSourceRange();
     S.Diag(VD->getLocation(), diag::note_ref_var_local_bind)
         << VD->getDeclName() << range;
   }
@@ -5913,8 +6819,9 @@
 ///   * arbitrary interplay between "&" and "*" operators
 ///   * pointer arithmetic from an address of a stack variable
 ///   * taking the address of an array element where the array is on the stack
-static Expr *EvalAddr(Expr *E, SmallVectorImpl<DeclRefExpr *> &refVars,
-                      Decl *ParentDecl) {
+static const Expr *EvalAddr(const Expr *E,
+                            SmallVectorImpl<const DeclRefExpr *> &refVars,
+                            const Decl *ParentDecl) {
   if (E->isTypeDependent())
     return nullptr;
 
@@ -5931,13 +6838,13 @@
   // EvalAddr and EvalVal appropriately.
   switch (E->getStmtClass()) {
   case Stmt::DeclRefExprClass: {
-    DeclRefExpr *DR = cast<DeclRefExpr>(E);
+    const DeclRefExpr *DR = cast<DeclRefExpr>(E);
 
     // If we leave the immediate function, the lifetime isn't about to end.
     if (DR->refersToEnclosingVariableOrCapture())
       return nullptr;
 
-    if (VarDecl *V = dyn_cast<VarDecl>(DR->getDecl()))
+    if (const VarDecl *V = dyn_cast<VarDecl>(DR->getDecl()))
       // If this is a reference variable, follow through to the expression that
       // it points to.
       if (V->hasLocalStorage() &&
@@ -5953,44 +6860,44 @@
   case Stmt::UnaryOperatorClass: {
     // The only unary operator that make sense to handle here
     // is AddrOf.  All others don't make sense as pointers.
-    UnaryOperator *U = cast<UnaryOperator>(E);
+    const UnaryOperator *U = cast<UnaryOperator>(E);
 
     if (U->getOpcode() == UO_AddrOf)
       return EvalVal(U->getSubExpr(), refVars, ParentDecl);
-    else
-      return nullptr;
+    return nullptr;
   }
 
   case Stmt::BinaryOperatorClass: {
     // Handle pointer arithmetic.  All other binary operators are not valid
     // in this context.
-    BinaryOperator *B = cast<BinaryOperator>(E);
+    const BinaryOperator *B = cast<BinaryOperator>(E);
     BinaryOperatorKind op = B->getOpcode();
 
     if (op != BO_Add && op != BO_Sub)
       return nullptr;
 
-    Expr *Base = B->getLHS();
+    const Expr *Base = B->getLHS();
 
     // Determine which argument is the real pointer base.  It could be
     // the RHS argument instead of the LHS.
-    if (!Base->getType()->isPointerType()) Base = B->getRHS();
+    if (!Base->getType()->isPointerType())
+      Base = B->getRHS();
 
-    assert (Base->getType()->isPointerType());
+    assert(Base->getType()->isPointerType());
     return EvalAddr(Base, refVars, ParentDecl);
   }
 
   // For conditional operators we need to see if either the LHS or RHS are
   // valid DeclRefExpr*s.  If one of them is valid, we return it.
   case Stmt::ConditionalOperatorClass: {
-    ConditionalOperator *C = cast<ConditionalOperator>(E);
+    const ConditionalOperator *C = cast<ConditionalOperator>(E);
 
     // Handle the GNU extension for missing LHS.
     // FIXME: That isn't a ConditionalOperator, so doesn't get here.
-    if (Expr *LHSExpr = C->getLHS()) {
+    if (const Expr *LHSExpr = C->getLHS()) {
       // In C++, we can have a throw-expression, which has 'void' type.
       if (!LHSExpr->getType()->isVoidType())
-        if (Expr *LHS = EvalAddr(LHSExpr, refVars, ParentDecl))
+        if (const Expr *LHS = EvalAddr(LHSExpr, refVars, ParentDecl))
           return LHS;
     }
 
@@ -6023,7 +6930,7 @@
   case Stmt::CXXDynamicCastExprClass:
   case Stmt::CXXConstCastExprClass:
   case Stmt::CXXReinterpretCastExprClass: {
-    Expr* SubExpr = cast<CastExpr>(E)->getSubExpr();
+    const Expr* SubExpr = cast<CastExpr>(E)->getSubExpr();
     switch (cast<CastExpr>(E)->getCastKind()) {
     case CK_LValueToRValue:
     case CK_NoOp:
@@ -6053,157 +6960,161 @@
   }
 
   case Stmt::MaterializeTemporaryExprClass:
-    if (Expr *Result = EvalAddr(
-                         cast<MaterializeTemporaryExpr>(E)->GetTemporaryExpr(),
-                                refVars, ParentDecl))
+    if (const Expr *Result =
+            EvalAddr(cast<MaterializeTemporaryExpr>(E)->GetTemporaryExpr(),
+                     refVars, ParentDecl))
       return Result;
-      
     return E;
-      
+
   // Everything else: we simply don't reason about them.
   default:
     return nullptr;
   }
 }
 
-
 ///  EvalVal - This function is complements EvalAddr in the mutual recursion.
 ///   See the comments for EvalAddr for more details.
-static Expr *EvalVal(Expr *E, SmallVectorImpl<DeclRefExpr *> &refVars,
-                     Decl *ParentDecl) {
-do {
-  // We should only be called for evaluating non-pointer expressions, or
-  // expressions with a pointer type that are not used as references but instead
-  // are l-values (e.g., DeclRefExpr with a pointer type).
+static const Expr *EvalVal(const Expr *E,
+                           SmallVectorImpl<const DeclRefExpr *> &refVars,
+                           const Decl *ParentDecl) {
+  do {
+    // We should only be called for evaluating non-pointer expressions,
+    // or expressions with a pointer type that are not used as
+    // references but instead are l-values (e.g., DeclRefExpr with a
+    // pointer type).
 
-  // Our "symbolic interpreter" is just a dispatch off the currently
-  // viewed AST node.  We then recursively traverse the AST by calling
-  // EvalAddr and EvalVal appropriately.
+    // Our "symbolic interpreter" is just a dispatch off the currently
+    // viewed AST node.  We then recursively traverse the AST by calling
+    // EvalAddr and EvalVal appropriately.
 
-  E = E->IgnoreParens();
-  switch (E->getStmtClass()) {
-  case Stmt::ImplicitCastExprClass: {
-    ImplicitCastExpr *IE = cast<ImplicitCastExpr>(E);
-    if (IE->getValueKind() == VK_LValue) {
-      E = IE->getSubExpr();
-      continue;
-    }
-    return nullptr;
-  }
-
-  case Stmt::ExprWithCleanupsClass:
-    return EvalVal(cast<ExprWithCleanups>(E)->getSubExpr(), refVars,ParentDecl);
-
-  case Stmt::DeclRefExprClass: {
-    // When we hit a DeclRefExpr we are looking at code that refers to a
-    // variable's name. If it's not a reference variable we check if it has
-    // local storage within the function, and if so, return the expression.
-    DeclRefExpr *DR = cast<DeclRefExpr>(E);
-
-    // If we leave the immediate function, the lifetime isn't about to end.
-    if (DR->refersToEnclosingVariableOrCapture())
+    E = E->IgnoreParens();
+    switch (E->getStmtClass()) {
+    case Stmt::ImplicitCastExprClass: {
+      const ImplicitCastExpr *IE = cast<ImplicitCastExpr>(E);
+      if (IE->getValueKind() == VK_LValue) {
+        E = IE->getSubExpr();
+        continue;
+      }
       return nullptr;
+    }
 
-    if (VarDecl *V = dyn_cast<VarDecl>(DR->getDecl())) {
-      // Check if it refers to itself, e.g. "int& i = i;".
-      if (V == ParentDecl)
-        return DR;
+    case Stmt::ExprWithCleanupsClass:
+      return EvalVal(cast<ExprWithCleanups>(E)->getSubExpr(), refVars,
+                     ParentDecl);
 
-      if (V->hasLocalStorage()) {
-        if (!V->getType()->isReferenceType())
+    case Stmt::DeclRefExprClass: {
+      // When we hit a DeclRefExpr we are looking at code that refers to a
+      // variable's name. If it's not a reference variable we check if it has
+      // local storage within the function, and if so, return the expression.
+      const DeclRefExpr *DR = cast<DeclRefExpr>(E);
+
+      // If we leave the immediate function, the lifetime isn't about to end.
+      if (DR->refersToEnclosingVariableOrCapture())
+        return nullptr;
+
+      if (const VarDecl *V = dyn_cast<VarDecl>(DR->getDecl())) {
+        // Check if it refers to itself, e.g. "int& i = i;".
+        if (V == ParentDecl)
           return DR;
 
-        // Reference variable, follow through to the expression that
-        // it points to.
-        if (V->hasInit()) {
-          // Add the reference variable to the "trail".
-          refVars.push_back(DR);
-          return EvalVal(V->getInit(), refVars, V);
+        if (V->hasLocalStorage()) {
+          if (!V->getType()->isReferenceType())
+            return DR;
+
+          // Reference variable, follow through to the expression that
+          // it points to.
+          if (V->hasInit()) {
+            // Add the reference variable to the "trail".
+            refVars.push_back(DR);
+            return EvalVal(V->getInit(), refVars, V);
+          }
         }
       }
+
+      return nullptr;
     }
 
-    return nullptr;
-  }
+    case Stmt::UnaryOperatorClass: {
+      // The only unary operator that makes sense to handle here
+      // is Deref.  All others don't resolve to a "name."  This includes
+      // handling all sorts of rvalues passed to a unary operator.
+      const UnaryOperator *U = cast<UnaryOperator>(E);
 
-  case Stmt::UnaryOperatorClass: {
-    // The only unary operator that make sense to handle here
-    // is Deref.  All others don't resolve to a "name."  This includes
-    // handling all sorts of rvalues passed to a unary operator.
-    UnaryOperator *U = cast<UnaryOperator>(E);
+      if (U->getOpcode() == UO_Deref)
+        return EvalAddr(U->getSubExpr(), refVars, ParentDecl);
 
-    if (U->getOpcode() == UO_Deref)
-      return EvalAddr(U->getSubExpr(), refVars, ParentDecl);
+      return nullptr;
+    }
 
-    return nullptr;
-  }
+    case Stmt::ArraySubscriptExprClass: {
+      // Array subscripts are potential references to data on the stack.  We
+      // retrieve the DeclRefExpr* for the array variable if it indeed
+      // has local storage.
+      const auto *ASE = cast<ArraySubscriptExpr>(E);
+      if (ASE->isTypeDependent())
+        return nullptr;
+      return EvalAddr(ASE->getBase(), refVars, ParentDecl);
+    }
 
-  case Stmt::ArraySubscriptExprClass: {
-    // Array subscripts are potential references to data on the stack.  We
-    // retrieve the DeclRefExpr* for the array variable if it indeed
-    // has local storage.
-    return EvalAddr(cast<ArraySubscriptExpr>(E)->getBase(), refVars,ParentDecl);
-  }
+    case Stmt::OMPArraySectionExprClass: {
+      return EvalAddr(cast<OMPArraySectionExpr>(E)->getBase(), refVars,
+                      ParentDecl);
+    }
 
-  case Stmt::OMPArraySectionExprClass: {
-    return EvalAddr(cast<OMPArraySectionExpr>(E)->getBase(), refVars,
-                    ParentDecl);
-  }
+    case Stmt::ConditionalOperatorClass: {
+      // For conditional operators we need to see if either the LHS or RHS are
+      // non-NULL Expr's.  If one is non-NULL, we return it.
+      const ConditionalOperator *C = cast<ConditionalOperator>(E);
 
-  case Stmt::ConditionalOperatorClass: {
-    // For conditional operators we need to see if either the LHS or RHS are
-    // non-NULL Expr's.  If one is non-NULL, we return it.
-    ConditionalOperator *C = cast<ConditionalOperator>(E);
+      // Handle the GNU extension for missing LHS.
+      if (const Expr *LHSExpr = C->getLHS()) {
+        // In C++, we can have a throw-expression, which has 'void' type.
+        if (!LHSExpr->getType()->isVoidType())
+          if (const Expr *LHS = EvalVal(LHSExpr, refVars, ParentDecl))
+            return LHS;
+      }
 
-    // Handle the GNU extension for missing LHS.
-    if (Expr *LHSExpr = C->getLHS()) {
       // In C++, we can have a throw-expression, which has 'void' type.
-      if (!LHSExpr->getType()->isVoidType())
-        if (Expr *LHS = EvalVal(LHSExpr, refVars, ParentDecl))
-          return LHS;
+      if (C->getRHS()->getType()->isVoidType())
+        return nullptr;
+
+      return EvalVal(C->getRHS(), refVars, ParentDecl);
     }
 
-    // In C++, we can have a throw-expression, which has 'void' type.
-    if (C->getRHS()->getType()->isVoidType())
-      return nullptr;
+    // Accesses to members are potential references to data on the stack.
+    case Stmt::MemberExprClass: {
+      const MemberExpr *M = cast<MemberExpr>(E);
 
-    return EvalVal(C->getRHS(), refVars, ParentDecl);
-  }
+      // Check for indirect access.  We only want direct field accesses.
+      if (M->isArrow())
+        return nullptr;
 
-  // Accesses to members are potential references to data on the stack.
-  case Stmt::MemberExprClass: {
-    MemberExpr *M = cast<MemberExpr>(E);
+      // Check whether the member type is itself a reference, in which case
+      // we're not going to refer to the member, but to what the member refers
+      // to.
+      if (M->getMemberDecl()->getType()->isReferenceType())
+        return nullptr;
 
-    // Check for indirect access.  We only want direct field accesses.
-    if (M->isArrow())
-      return nullptr;
+      return EvalVal(M->getBase(), refVars, ParentDecl);
+    }
 
-    // Check whether the member type is itself a reference, in which case
-    // we're not going to refer to the member, but to what the member refers to.
-    if (M->getMemberDecl()->getType()->isReferenceType())
-      return nullptr;
-
-    return EvalVal(M->getBase(), refVars, ParentDecl);
-  }
-
-  case Stmt::MaterializeTemporaryExprClass:
-    if (Expr *Result = EvalVal(
-                          cast<MaterializeTemporaryExpr>(E)->GetTemporaryExpr(),
-                               refVars, ParentDecl))
-      return Result;
-      
-    return E;
-
-  default:
-    // Check that we don't return or take the address of a reference to a
-    // temporary. This is only useful in C++.
-    if (!E->isTypeDependent() && E->isRValue())
+    case Stmt::MaterializeTemporaryExprClass:
+      if (const Expr *Result =
+              EvalVal(cast<MaterializeTemporaryExpr>(E)->GetTemporaryExpr(),
+                      refVars, ParentDecl))
+        return Result;
       return E;
 
-    // Everything else: we simply don't reason about them.
-    return nullptr;
-  }
-} while (true);
+    default:
+      // Check that we don't return or take the address of a reference to a
+      // temporary. This is only useful in C++.
+      if (!E->isTypeDependent() && E->isRValue())
+        return E;
+
+      // Everything else: we simply don't reason about them.
+      return nullptr;
+    }
+  } while (true);
 }
 
 void
@@ -6255,7 +7166,6 @@
       if (DRL->getDecl() == DRR->getDecl())
         return;
 
-
   // Special case: check for comparisons against literals that can be exactly
   //  represented by APFloat.  In such cases, do not emit a warning.  This
   //  is a heuristic: often comparison against such literals are used to
@@ -6381,8 +7291,7 @@
   }
 };
 
-static IntRange GetValueRange(ASTContext &C, llvm::APSInt &value,
-                              unsigned MaxWidth) {
+IntRange GetValueRange(ASTContext &C, llvm::APSInt &value, unsigned MaxWidth) {
   if (value.isSigned() && value.isNegative())
     return IntRange(value.getMinSignedBits(), false);
 
@@ -6394,8 +7303,8 @@
   return IntRange(value.getActiveBits(), true);
 }
 
-static IntRange GetValueRange(ASTContext &C, APValue &result, QualType Ty,
-                              unsigned MaxWidth) {
+IntRange GetValueRange(ASTContext &C, APValue &result, QualType Ty,
+                       unsigned MaxWidth) {
   if (result.isInt())
     return GetValueRange(C, result.getInt(), MaxWidth);
 
@@ -6423,7 +7332,7 @@
   return IntRange(MaxWidth, Ty->isUnsignedIntegerOrEnumerationType());
 }
 
-static QualType GetExprType(Expr *E) {
+QualType GetExprType(const Expr *E) {
   QualType Ty = E->getType();
   if (const AtomicType *AtomicRHS = Ty->getAs<AtomicType>())
     Ty = AtomicRHS->getValueType();
@@ -6434,7 +7343,7 @@
 /// range of values it might take.
 ///
 /// \param MaxWidth - the width to which the value will be truncated
-static IntRange GetExprRange(ASTContext &C, Expr *E, unsigned MaxWidth) {
+IntRange GetExprRange(ASTContext &C, const Expr *E, unsigned MaxWidth) {
   E = E->IgnoreParens();
 
   // Try a full evaluation first.
@@ -6445,7 +7354,7 @@
   // I think we only want to look through implicit casts here; if the
   // user has an explicit widening cast, we should treat the value as
   // being of the new, wider type.
-  if (ImplicitCastExpr *CE = dyn_cast<ImplicitCastExpr>(E)) {
+  if (const auto *CE = dyn_cast<ImplicitCastExpr>(E)) {
     if (CE->getCastKind() == CK_NoOp || CE->getCastKind() == CK_LValueToRValue)
       return GetExprRange(C, CE->getSubExpr(), MaxWidth);
 
@@ -6472,7 +7381,7 @@
                     SubRange.NonNegative || OutputTypeRange.NonNegative);
   }
 
-  if (ConditionalOperator *CO = dyn_cast<ConditionalOperator>(E)) {
+  if (const auto *CO = dyn_cast<ConditionalOperator>(E)) {
     // If we can fold the condition, just take that operand.
     bool CondResult;
     if (CO->getCond()->EvaluateAsBooleanCondition(CondResult, C))
@@ -6486,7 +7395,7 @@
     return IntRange::join(L, R);
   }
 
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(E)) {
+  if (const auto *BO = dyn_cast<BinaryOperator>(E)) {
     switch (BO->getOpcode()) {
 
     // Boolean-valued operations are single-bit and positive.
@@ -6626,7 +7535,7 @@
     return IntRange::join(L, R);
   }
 
-  if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E)) {
+  if (const auto *UO = dyn_cast<UnaryOperator>(E)) {
     switch (UO->getOpcode()) {
     // Boolean-valued operations are white-listed.
     case UO_LNot:
@@ -6642,26 +7551,26 @@
     }
   }
 
-  if (OpaqueValueExpr *OVE = dyn_cast<OpaqueValueExpr>(E))
+  if (const auto *OVE = dyn_cast<OpaqueValueExpr>(E))
     return GetExprRange(C, OVE->getSourceExpr(), MaxWidth);
 
-  if (FieldDecl *BitField = E->getSourceBitField())
+  if (const auto *BitField = E->getSourceBitField())
     return IntRange(BitField->getBitWidthValue(C),
                     BitField->getType()->isUnsignedIntegerOrEnumerationType());
 
   return IntRange::forValueOfType(C, GetExprType(E));
 }
 
-static IntRange GetExprRange(ASTContext &C, Expr *E) {
+IntRange GetExprRange(ASTContext &C, const Expr *E) {
   return GetExprRange(C, E, C.getIntWidth(GetExprType(E)));
 }
 
 /// Checks whether the given value, which currently has the given
 /// source semantics, has the same value when coerced through the
 /// target semantics.
-static bool IsSameFloatAfterCast(const llvm::APFloat &value,
-                                 const llvm::fltSemantics &Src,
-                                 const llvm::fltSemantics &Tgt) {
+bool IsSameFloatAfterCast(const llvm::APFloat &value,
+                          const llvm::fltSemantics &Src,
+                          const llvm::fltSemantics &Tgt) {
   llvm::APFloat truncated = value;
 
   bool ignored;
@@ -6676,9 +7585,9 @@
 /// target semantics.
 ///
 /// The value might be a vector of floats (or a complex number).
-static bool IsSameFloatAfterCast(const APValue &value,
-                                 const llvm::fltSemantics &Src,
-                                 const llvm::fltSemantics &Tgt) {
+bool IsSameFloatAfterCast(const APValue &value,
+                          const llvm::fltSemantics &Src,
+                          const llvm::fltSemantics &Tgt) {
   if (value.isFloat())
     return IsSameFloatAfterCast(value.getFloat(), Src, Tgt);
 
@@ -6694,9 +7603,9 @@
           IsSameFloatAfterCast(value.getComplexFloatImag(), Src, Tgt));
 }
 
-static void AnalyzeImplicitConversions(Sema &S, Expr *E, SourceLocation CC);
+void AnalyzeImplicitConversions(Sema &S, Expr *E, SourceLocation CC);
 
-static bool IsZero(Sema &S, Expr *E) {
+bool IsZero(Sema &S, Expr *E) {
   // Suppress cases where we are comparing against an enum constant.
   if (const DeclRefExpr *DR =
       dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts()))
@@ -6711,7 +7620,7 @@
   return E->isIntegerConstantExpr(Value, S.Context) && Value == 0;
 }
 
-static bool HasEnumType(Expr *E) {
+bool HasEnumType(Expr *E) {
   // Strip off implicit integral promotions.
   while (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(E)) {
     if (ICE->getCastKind() != CK_IntegralCast &&
@@ -6723,7 +7632,7 @@
   return E->getType()->isEnumeralType();
 }
 
-static void CheckTrivialUnsignedComparison(Sema &S, BinaryOperator *E) {
+void CheckTrivialUnsignedComparison(Sema &S, BinaryOperator *E) {
   // Disable warning in template instantiations.
   if (!S.ActiveTemplateInstantiations.empty())
     return;
@@ -6751,10 +7660,9 @@
   }
 }
 
-static void DiagnoseOutOfRangeComparison(Sema &S, BinaryOperator *E,
-                                         Expr *Constant, Expr *Other,
-                                         llvm::APSInt Value,
-                                         bool RhsConstant) {
+void DiagnoseOutOfRangeComparison(Sema &S, BinaryOperator *E, Expr *Constant,
+                                  Expr *Other, const llvm::APSInt &Value,
+                                  bool RhsConstant) {
   // Disable warning in template instantiations.
   if (!S.ActiveTemplateInstantiations.empty())
     return;
@@ -6962,7 +7870,7 @@
 
 /// Analyze the operands of the given comparison.  Implements the
 /// fallback case from AnalyzeComparison.
-static void AnalyzeImpConvsInComparison(Sema &S, BinaryOperator *E) {
+void AnalyzeImpConvsInComparison(Sema &S, BinaryOperator *E) {
   AnalyzeImplicitConversions(S, E->getLHS(), E->getOperatorLoc());
   AnalyzeImplicitConversions(S, E->getRHS(), E->getOperatorLoc());
 }
@@ -6970,7 +7878,7 @@
 /// \brief Implements -Wsign-compare.
 ///
 /// \param E the binary operator to check for warnings
-static void AnalyzeComparison(Sema &S, BinaryOperator *E) {
+void AnalyzeComparison(Sema &S, BinaryOperator *E) {
   // The type the comparison is being performed in.
   QualType T = E->getLHS()->getType();
 
@@ -7071,8 +7979,8 @@
 /// Analyzes an attempt to assign the given value to a bitfield.
 ///
 /// Returns true if there was something fishy about the attempt.
-static bool AnalyzeBitFieldAssignment(Sema &S, FieldDecl *Bitfield, Expr *Init,
-                                      SourceLocation InitLoc) {
+bool AnalyzeBitFieldAssignment(Sema &S, FieldDecl *Bitfield, Expr *Init,
+                               SourceLocation InitLoc) {
   assert(Bitfield->isBitField());
   if (Bitfield->isInvalidDecl())
     return false;
@@ -7097,6 +8005,12 @@
   unsigned OriginalWidth = Value.getBitWidth();
   unsigned FieldWidth = Bitfield->getBitWidthValue(S.Context);
 
+  if (Value.isSigned() && Value.isNegative())
+    if (UnaryOperator *UO = dyn_cast<UnaryOperator>(OriginalInit))
+      if (UO->getOpcode() == UO_Minus)
+        if (isa<IntegerLiteral>(UO->getSubExpr()))
+          OriginalWidth = Value.getMinSignedBits();
+
   if (OriginalWidth <= FieldWidth)
     return false;
 
@@ -7126,7 +8040,7 @@
 
 /// Analyze the given simple or compound assignment for warning-worthy
 /// operations.
-static void AnalyzeAssignment(Sema &S, BinaryOperator *E) {
+void AnalyzeAssignment(Sema &S, BinaryOperator *E) {
   // Just recurse on the LHS.
   AnalyzeImplicitConversions(S, E->getLHS(), E->getOperatorLoc());
 
@@ -7145,9 +8059,9 @@
 }
 
 /// Diagnose an implicit cast;  purely a helper for CheckImplicitConversion.
-static void DiagnoseImpCast(Sema &S, Expr *E, QualType SourceType, QualType T, 
-                            SourceLocation CContext, unsigned diag,
-                            bool pruneControlFlow = false) {
+void DiagnoseImpCast(Sema &S, Expr *E, QualType SourceType, QualType T, 
+                     SourceLocation CContext, unsigned diag,
+                     bool pruneControlFlow = false) {
   if (pruneControlFlow) {
     S.DiagRuntimeBehavior(E->getExprLoc(), E,
                           S.PDiag(diag)
@@ -7160,25 +8074,75 @@
 }
 
 /// Diagnose an implicit cast;  purely a helper for CheckImplicitConversion.
-static void DiagnoseImpCast(Sema &S, Expr *E, QualType T,
-                            SourceLocation CContext, unsigned diag,
-                            bool pruneControlFlow = false) {
+void DiagnoseImpCast(Sema &S, Expr *E, QualType T, SourceLocation CContext,
+                     unsigned diag, bool pruneControlFlow = false) {
   DiagnoseImpCast(S, E, E->getType(), T, CContext, diag, pruneControlFlow);
 }
 
-/// Diagnose an implicit cast from a literal expression. Does not warn when the
-/// cast wouldn't lose information.
-void DiagnoseFloatingLiteralImpCast(Sema &S, FloatingLiteral *FL, QualType T,
-                                    SourceLocation CContext) {
-  // Try to convert the literal exactly to an integer. If we can, don't warn.
+
+/// Diagnose an implicit cast from a floating point value to an integer value.
+void DiagnoseFloatingImpCast(Sema &S, Expr *E, QualType T,
+
+                             SourceLocation CContext) {
+  const bool IsBool = T->isSpecificBuiltinType(BuiltinType::Bool);
+  const bool PruneWarnings = !S.ActiveTemplateInstantiations.empty();
+
+  Expr *InnerE = E->IgnoreParenImpCasts();
+  // We also want to warn on, e.g., "int i = -1.234"
+  if (UnaryOperator *UOp = dyn_cast<UnaryOperator>(InnerE))
+    if (UOp->getOpcode() == UO_Minus || UOp->getOpcode() == UO_Plus)
+      InnerE = UOp->getSubExpr()->IgnoreParenImpCasts();
+
+  const bool IsLiteral =
+      isa<FloatingLiteral>(E) || isa<FloatingLiteral>(InnerE);
+
+  llvm::APFloat Value(0.0);
+  bool IsConstant =
+    E->EvaluateAsFloat(Value, S.Context, Expr::SE_AllowSideEffects);
+  if (!IsConstant) {
+    return DiagnoseImpCast(S, E, T, CContext,
+                           diag::warn_impcast_float_integer, PruneWarnings);
+  }
+
   bool isExact = false;
-  const llvm::APFloat &Value = FL->getValue();
+
   llvm::APSInt IntegerValue(S.Context.getIntWidth(T),
                             T->hasUnsignedIntegerRepresentation());
-  if (Value.convertToInteger(IntegerValue,
-                             llvm::APFloat::rmTowardZero, &isExact)
-      == llvm::APFloat::opOK && isExact)
-    return;
+  if (Value.convertToInteger(IntegerValue, llvm::APFloat::rmTowardZero,
+                             &isExact) == llvm::APFloat::opOK &&
+      isExact) {
+    if (IsLiteral) return;
+    return DiagnoseImpCast(S, E, T, CContext, diag::warn_impcast_float_integer,
+                           PruneWarnings);
+  }
+
+  unsigned DiagID = 0;
+  if (IsLiteral) {
+    // Warn on floating point literal to integer.
+    DiagID = diag::warn_impcast_literal_float_to_integer;
+  } else if (IntegerValue == 0) {
+    if (Value.isZero()) {  // Skip -0.0 to 0 conversion.
+      return DiagnoseImpCast(S, E, T, CContext,
+                             diag::warn_impcast_float_integer, PruneWarnings);
+    }
+    // Warn on non-zero to zero conversion.
+    DiagID = diag::warn_impcast_float_to_integer_zero;
+  } else {
+    if (IntegerValue.isUnsigned()) {
+      if (!IntegerValue.isMaxValue()) {
+        return DiagnoseImpCast(S, E, T, CContext,
+                               diag::warn_impcast_float_integer, PruneWarnings);
+      }
+    } else {  // IntegerValue.isSigned()
+      if (!IntegerValue.isMaxSignedValue() &&
+          !IntegerValue.isMinSignedValue()) {
+        return DiagnoseImpCast(S, E, T, CContext,
+                               diag::warn_impcast_float_integer, PruneWarnings);
+      }
+    }
+    // Warn on evaluatable floating point expression to integer conversion.
+    DiagID = diag::warn_impcast_float_to_integer;
+  }
 
   // FIXME: Force the precision of the source value down so we don't print
   // digits which are usually useless (we don't really care here if we
@@ -7191,14 +8155,22 @@
   Value.toString(PrettySourceValue, precision);
 
   SmallString<16> PrettyTargetValue;
-  if (T->isSpecificBuiltinType(BuiltinType::Bool))
+  if (IsBool)
     PrettyTargetValue = Value.isZero() ? "false" : "true";
   else
     IntegerValue.toString(PrettyTargetValue);
 
-  S.Diag(FL->getExprLoc(), diag::warn_impcast_literal_float_to_integer)
-    << FL->getType() << T.getUnqualifiedType() << PrettySourceValue
-    << PrettyTargetValue << FL->getSourceRange() << SourceRange(CContext);
+  if (PruneWarnings) {
+    S.DiagRuntimeBehavior(E->getExprLoc(), E,
+                          S.PDiag(DiagID)
+                              << E->getType() << T.getUnqualifiedType()
+                              << PrettySourceValue << PrettyTargetValue
+                              << E->getSourceRange() << SourceRange(CContext));
+  } else {
+    S.Diag(E->getExprLoc(), DiagID)
+        << E->getType() << T.getUnqualifiedType() << PrettySourceValue
+        << PrettyTargetValue << E->getSourceRange() << SourceRange(CContext);
+  }
 }
 
 std::string PrettyPrintInRange(const llvm::APSInt &Value, IntRange Range) {
@@ -7210,7 +8182,7 @@
   return ValueInRange.toString(10);
 }
 
-static bool IsImplicitBoolFloatConversion(Sema &S, Expr *Ex, bool ToBool) {
+bool IsImplicitBoolFloatConversion(Sema &S, Expr *Ex, bool ToBool) {
   if (!isa<ImplicitCastExpr>(Ex))
     return false;
 
@@ -7250,8 +8222,7 @@
   }
 }
 
-static void DiagnoseNullConversion(Sema &S, Expr *E, QualType T,
-                                   SourceLocation CC) {
+void DiagnoseNullConversion(Sema &S, Expr *E, QualType T, SourceLocation CC) {
   if (S.Diags.isIgnored(diag::warn_impcast_null_pointer_to_integer,
                         E->getExprLoc()))
     return;
@@ -7273,14 +8244,21 @@
 
   SourceLocation Loc = E->getSourceRange().getBegin();
 
+  // Venture through the macro stacks to get to the source of macro arguments.
+  // The new location is a better location than the complete location that was
+  // passed in.
+  while (S.SourceMgr.isMacroArgExpansion(Loc))
+    Loc = S.SourceMgr.getImmediateMacroCallerLoc(Loc);
+
+  while (S.SourceMgr.isMacroArgExpansion(CC))
+    CC = S.SourceMgr.getImmediateMacroCallerLoc(CC);
+
   // __null is usually wrapped in a macro.  Go up a macro if that is the case.
-  if (NullKind == Expr::NPCK_GNUNull) {
-    if (Loc.isMacroID()) {
-      StringRef MacroName = Lexer::getImmediateMacroNameForDiagnostics(
-          Loc, S.SourceMgr, S.getLangOpts());
-      if (MacroName == "NULL")
-        Loc = S.SourceMgr.getImmediateExpansionRange(Loc).first;
-    }
+  if (NullKind == Expr::NPCK_GNUNull && Loc.isMacroID()) {
+    StringRef MacroName = Lexer::getImmediateMacroNameForDiagnostics(
+        Loc, S.SourceMgr, S.getLangOpts());
+    if (MacroName == "NULL")
+      Loc = S.SourceMgr.getImmediateExpansionRange(Loc).first;
   }
 
   // Only warn if the null and context location are in the same macro expansion.
@@ -7293,17 +8271,15 @@
                                       S.getFixItZeroLiteralForType(T, Loc));
 }
 
-static void checkObjCArrayLiteral(Sema &S, QualType TargetType,
-                                  ObjCArrayLiteral *ArrayLiteral);
-static void checkObjCDictionaryLiteral(Sema &S, QualType TargetType,
-                                       ObjCDictionaryLiteral *DictionaryLiteral);
+void checkObjCArrayLiteral(Sema &S, QualType TargetType,
+                           ObjCArrayLiteral *ArrayLiteral);
+void checkObjCDictionaryLiteral(Sema &S, QualType TargetType,
+                                ObjCDictionaryLiteral *DictionaryLiteral);
 
 /// Check a single element within a collection literal against the
 /// target element type.
-static void checkObjCCollectionLiteralElement(Sema &S,
-                                              QualType TargetElementType,
-                                              Expr *Element,
-                                              unsigned ElementKind) {
+void checkObjCCollectionLiteralElement(Sema &S, QualType TargetElementType,
+                                       Expr *Element, unsigned ElementKind) {
   // Skip a bitcast to 'id' or qualified 'id'.
   if (auto ICE = dyn_cast<ImplicitCastExpr>(Element)) {
     if (ICE->getCastKind() == CK_BitCast &&
@@ -7332,8 +8308,8 @@
 
 /// Check an Objective-C array literal being converted to the given
 /// target type.
-static void checkObjCArrayLiteral(Sema &S, QualType TargetType,
-                                  ObjCArrayLiteral *ArrayLiteral) {
+void checkObjCArrayLiteral(Sema &S, QualType TargetType,
+                           ObjCArrayLiteral *ArrayLiteral) {
   if (!S.NSArrayDecl)
     return;
 
@@ -7360,9 +8336,8 @@
 
 /// Check an Objective-C dictionary literal being converted to the given
 /// target type.
-static void checkObjCDictionaryLiteral(
-              Sema &S, QualType TargetType,
-              ObjCDictionaryLiteral *DictionaryLiteral) {
+void checkObjCDictionaryLiteral(Sema &S, QualType TargetType,
+                                ObjCDictionaryLiteral *DictionaryLiteral) {
   if (!S.NSDictionaryDecl)
     return;
 
@@ -7388,6 +8363,32 @@
   }
 }
 
+// Helper function to filter out cases for same-width constant conversion.
+// Don't warn on char array initialization or for non-decimal values.
+bool isSameWidthConstantConversion(Sema &S, Expr *E, QualType T,
+                                   SourceLocation CC) {
+  // If initializing from a constant, and the constant starts with '0',
+  // then it is a binary, octal, or hexadecimal.  Allow these constants
+  // to fill all the bits, even if there is a sign change.
+  if (auto *IntLit = dyn_cast<IntegerLiteral>(E->IgnoreParenImpCasts())) {
+    const char FirstLiteralCharacter =
+        S.getSourceManager().getCharacterData(IntLit->getLocStart())[0];
+    if (FirstLiteralCharacter == '0')
+      return false;
+  }
+
+  // If the CC location points to a '{', and the type is char, then assume
+  // it is an array initialization.
+  if (CC.isValid() && T->isCharType()) {
+    const char FirstContextCharacter =
+        S.getSourceManager().getCharacterData(CC)[0];
+    if (FirstContextCharacter == '{')
+      return false;
+  }
+
+  return true;
+}
+
 void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
                              SourceLocation CC, bool *ICContext = nullptr) {
   if (E->isTypeDependent() || E->isValueDependent()) return;
@@ -7492,7 +8493,6 @@
           return;
 
         DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_float_precision);
-
       }
       // ... or possibly if we're increasing rank, too
       else if (TargetBT->getKind() > SourceBT->getKind()) {
@@ -7504,22 +8504,12 @@
       return;
     }
 
-    // If the target is integral, always warn.    
+    // If the target is integral, always warn.
     if (TargetBT && TargetBT->isInteger()) {
       if (S.SourceMgr.isInSystemMacro(CC))
         return;
-      
-      Expr *InnerE = E->IgnoreParenImpCasts();
-      // We also want to warn on, e.g., "int i = -1.234"
-      if (UnaryOperator *UOp = dyn_cast<UnaryOperator>(InnerE))
-        if (UOp->getOpcode() == UO_Minus || UOp->getOpcode() == UO_Plus)
-          InnerE = UOp->getSubExpr()->IgnoreParenImpCasts();
 
-      if (FloatingLiteral *FL = dyn_cast<FloatingLiteral>(InnerE)) {
-        DiagnoseFloatingLiteralImpCast(S, FL, T, CC);
-      } else {
-        DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_float_integer);
-      }
+      DiagnoseFloatingImpCast(S, E, T, CC);
     }
 
     // Detect the case where a call result is converted from floating-point to
@@ -7551,6 +8541,8 @@
 
   DiagnoseNullConversion(S, E, T, CC);
 
+  S.DiscardMisalignedMemberAddress(Target, E);
+
   if (!Source->isIntegerType() || !Target->isIntegerType())
     return;
 
@@ -7566,7 +8558,7 @@
     // If the source is a constant, use a default-on diagnostic.
     // TODO: this should happen for bitfield stores, too.
     llvm::APSInt Value(32);
-    if (E->isIntegerConstantExpr(Value, S.Context)) {
+    if (E->EvaluateAsInt(Value, S.Context, Expr::SE_AllowSideEffects)) {
       if (S.SourceMgr.isInSystemMacro(CC))
         return;
 
@@ -7591,10 +8583,34 @@
     return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_integer_precision);
   }
 
+  if (TargetRange.Width == SourceRange.Width && !TargetRange.NonNegative &&
+      SourceRange.NonNegative && Source->isSignedIntegerType()) {
+    // When doing a signed to signed conversion, warn if the positive
+    // source value is exactly the width of the target type, which will
+    // cause a negative value to be stored.
+
+    llvm::APSInt Value;
+    if (E->EvaluateAsInt(Value, S.Context, Expr::SE_AllowSideEffects) &&
+        !S.SourceMgr.isInSystemMacro(CC)) {
+      if (isSameWidthConstantConversion(S, E, T, CC)) {
+        std::string PrettySourceValue = Value.toString(10);
+        std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange);
+
+        S.DiagRuntimeBehavior(
+            E->getExprLoc(), E,
+            S.PDiag(diag::warn_impcast_integer_precision_constant)
+                << PrettySourceValue << PrettyTargetValue << E->getType() << T
+                << E->getSourceRange() << clang::SourceRange(CC));
+        return;
+      }
+    }
+
+    // Fall through for non-constants to give a sign conversion warning.
+  }
+
   if ((TargetRange.NonNegative && !SourceRange.NonNegative) ||
       (!TargetRange.NonNegative && SourceRange.NonNegative &&
        SourceRange.Width == TargetRange.Width)) {
-        
     if (S.SourceMgr.isInSystemMacro(CC))
       return;
 
@@ -7637,8 +8653,6 @@
         return DiagnoseImpCast(S, E, SourceType, T, CC, 
                                diag::warn_impcast_different_enum_types);
       }
-  
-  return;
 }
 
 void CheckConditionalOperator(Sema &S, ConditionalOperator *E,
@@ -7654,7 +8668,6 @@
   AnalyzeImplicitConversions(S, E, CC);
   if (E->getType() != T)
     return CheckImplicitConversion(S, E, T, CC, &ICContext);
-  return;
 }
 
 void CheckConditionalOperator(Sema &S, ConditionalOperator *E,
@@ -7687,7 +8700,7 @@
 
 /// CheckBoolLikeConversion - Check conversion of given expression to boolean.
 /// Input argument E is a logical expression.
-static void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) {
+void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) {
   if (S.getLangOpts().Bool)
     return;
   CheckImplicitConversion(S, E->IgnoreParenImpCasts(), S.Context.BoolTy, CC);
@@ -7791,10 +8804,31 @@
 
 } // end anonymous namespace
 
+static bool checkOpenCLEnqueueLocalSizeArgs(Sema &S, CallExpr *TheCall,
+                                            unsigned Start, unsigned End) {
+  bool IllegalParams = false;
+  for (unsigned I = Start; I <= End; ++I) {
+    QualType Ty = TheCall->getArg(I)->getType();
+    // Taking into account implicit conversions,
+    // allow any integer type that fits within 32 bits.
+    if (!Ty->isIntegerType() ||
+        S.Context.getTypeSizeInChars(Ty).getQuantity() > 4) {
+      S.Diag(TheCall->getArg(I)->getLocStart(),
+             diag::err_opencl_enqueue_kernel_invalid_local_size_type);
+      IllegalParams = true;
+    }
+    // Potentially emit standard warnings for implicit conversions if enabled
+    // using -Wconversion.
+    CheckImplicitConversion(S, TheCall->getArg(I), S.Context.UnsignedIntTy,
+                            TheCall->getArg(I)->getLocStart());
+  }
+  return IllegalParams;
+}
+
 // Helper function for Sema::DiagnoseAlwaysNonNullPointer.
 // Returns true when emitting a warning about taking the address of a reference.
 static bool CheckForReference(Sema &SemaRef, const Expr *E,
-                              PartialDiagnostic PD) {
+                              const PartialDiagnostic &PD) {
   E = E->IgnoreParenImpCasts();
 
   const FunctionDecl *FD = nullptr;
@@ -7889,7 +8923,8 @@
     }
   }
 
-  auto ComplainAboutNonnullParamOrCall = [&](bool IsParam) {
+  auto ComplainAboutNonnullParamOrCall = [&](const Attr *NonnullAttr) {
+    bool IsParam = isa<NonNullAttr>(NonnullAttr);
     std::string Str;
     llvm::raw_string_ostream S(Str);
     E->printPretty(S, nullptr, getPrintingPolicy());
@@ -7897,13 +8932,14 @@
                                 : diag::warn_cast_nonnull_to_bool;
     Diag(E->getExprLoc(), DiagID) << IsParam << S.str()
       << E->getSourceRange() << Range << IsEqual;
+    Diag(NonnullAttr->getLocation(), diag::note_declared_nonnull) << IsParam;
   };
 
   // If we have a CallExpr that is tagged with returns_nonnull, we can complain.
   if (auto *Call = dyn_cast<CallExpr>(E->IgnoreParenImpCasts())) {
     if (auto *Callee = Call->getDirectCallee()) {
-      if (Callee->hasAttr<ReturnsNonNullAttr>()) {
-        ComplainAboutNonnullParamOrCall(false);
+      if (const Attr *A = Callee->getAttr<ReturnsNonNullAttr>()) {
+        ComplainAboutNonnullParamOrCall(A);
         return;
       }
     }
@@ -7925,25 +8961,25 @@
   if (const auto* PV = dyn_cast<ParmVarDecl>(D)) {
     if (getCurFunction() &&
         !getCurFunction()->ModifiedNonNullParams.count(PV)) {
-      if (PV->hasAttr<NonNullAttr>()) {
-        ComplainAboutNonnullParamOrCall(true);
+      if (const Attr *A = PV->getAttr<NonNullAttr>()) {
+        ComplainAboutNonnullParamOrCall(A);
         return;
       }
 
       if (const auto *FD = dyn_cast<FunctionDecl>(PV->getDeclContext())) {
-        auto ParamIter = std::find(FD->param_begin(), FD->param_end(), PV);
+        auto ParamIter = llvm::find(FD->parameters(), PV);
         assert(ParamIter != FD->param_end());
         unsigned ParamNo = std::distance(FD->param_begin(), ParamIter);
 
         for (const auto *NonNull : FD->specific_attrs<NonNullAttr>()) {
           if (!NonNull->args_size()) {
-              ComplainAboutNonnullParamOrCall(true);
+              ComplainAboutNonnullParamOrCall(NonNull);
               return;
           }
 
           for (unsigned ArgNo : NonNull->args()) {
             if (ArgNo == ParamNo) {
-              ComplainAboutNonnullParamOrCall(true);
+              ComplainAboutNonnullParamOrCall(NonNull);
               return;
             }
           }
@@ -8025,7 +9061,6 @@
       << FixItHint::CreateInsertion(getLocForEndOfToken(E->getLocEnd()), "()");
 }
 
-
 /// Diagnoses "dangerous" implicit conversions within the given
 /// expression (which is a full expression).  Implements -Wconversion
 /// and -Wsign-compare.
@@ -8091,7 +9126,7 @@
     struct Value {
       explicit Value(unsigned Parent) : Parent(Parent), Merged(false) {}
       unsigned Parent : 31;
-      bool Merged : 1;
+      unsigned Merged : 1;
     };
     SmallVector<Value, 8> Values;
 
@@ -8203,12 +9238,11 @@
       Self.ModAsSideEffect = &ModAsSideEffect;
     }
     ~SequencedSubexpression() {
-      for (auto MI = ModAsSideEffect.rbegin(), ME = ModAsSideEffect.rend();
-           MI != ME; ++MI) {
-        UsageInfo &U = Self.UsageMap[MI->first];
+      for (auto &M : llvm::reverse(ModAsSideEffect)) {
+        UsageInfo &U = Self.UsageMap[M.first];
         auto &SideEffectUsage = U.Uses[UK_ModAsSideEffect];
-        Self.addUsage(U, MI->first, SideEffectUsage.Use, UK_ModAsValue);
-        SideEffectUsage = MI->second;
+        Self.addUsage(U, M.first, SideEffectUsage.Use, UK_ModAsValue);
+        SideEffectUsage = M.second;
       }
       Self.ModAsSideEffect = OldModAsSideEffect;
     }
@@ -8411,6 +9445,7 @@
     notePostMod(O, BO, SemaRef.getLangOpts().CPlusPlus ? UK_ModAsValue
                                                        : UK_ModAsSideEffect);
   }
+
   void VisitCompoundAssignOperator(CompoundAssignOperator *CAO) {
     VisitBinAssign(CAO);
   }
@@ -8560,7 +9595,7 @@
       Tree.merge(Elts[I]);
   }
 };
-}
+} // end anonymous namespace
 
 void Sema::CheckUnsequencedOperations(Expr *E) {
   SmallVector<Expr *, 8> WorkList;
@@ -8574,9 +9609,11 @@
 void Sema::CheckCompletedExpr(Expr *E, SourceLocation CheckLoc,
                               bool IsConstexpr) {
   CheckImplicitConversions(E, CheckLoc);
-  CheckUnsequencedOperations(E);
+  if (!E->isInstantiationDependent())
+    CheckUnsequencedOperations(E);
   if (!IsConstexpr && !E->isValueDependent())
     CheckForIntOverflow(E);
+  DiagnoseMisalignedMembers();
 }
 
 void Sema::CheckBitFieldInitialization(SourceLocation InitLoc,
@@ -8619,13 +9656,10 @@
 /// takes care of any checks that cannot be performed on the
 /// declaration itself, e.g., that the types of each of the function
 /// parameters are complete.
-bool Sema::CheckParmsForFunctionDef(ParmVarDecl *const *P,
-                                    ParmVarDecl *const *PEnd,
+bool Sema::CheckParmsForFunctionDef(ArrayRef<ParmVarDecl *> Parameters,
                                     bool CheckParameterNames) {
   bool HasInvalidParm = false;
-  for (; P != PEnd; ++P) {
-    ParmVarDecl *Param = *P;
-    
+  for (ParmVarDecl *Param : Parameters) {
     // C99 6.7.5.3p4: the parameters in a parameter type list in a
     // function declarator that is part of a function definition of
     // that function shall not have incomplete type.
@@ -8733,21 +9767,12 @@
     << TRange << Op->getSourceRange();
 }
 
-static const Type* getElementType(const Expr *BaseExpr) {
-  const Type* EltType = BaseExpr->getType().getTypePtr();
-  if (EltType->isAnyPointerType())
-    return EltType->getPointeeType().getTypePtr();
-  else if (EltType->isArrayType())
-    return EltType->getBaseElementTypeUnsafe();
-  return EltType;
-}
-
 /// \brief Check whether this array fits the idiom of a size-one tail padded
 /// array member of a struct.
 ///
 /// We avoid emitting out-of-bounds access warnings for such arrays as they are
 /// commonly used to emulate flexible arrays in C89 code.
-static bool IsTailPaddedMemberArray(Sema &S, llvm::APInt Size,
+static bool IsTailPaddedMemberArray(Sema &S, const llvm::APInt &Size,
                                     const NamedDecl *ND) {
   if (Size != 1 || !ND) return false;
 
@@ -8796,7 +9821,8 @@
   if (IndexExpr->isValueDependent())
     return;
 
-  const Type *EffectiveType = getElementType(BaseExpr);
+  const Type *EffectiveType =
+      BaseExpr->getType()->getPointeeOrArrayElementType();
   BaseExpr = BaseExpr->IgnoreParenCasts();
   const ConstantArrayType *ArrayTy =
     Context.getAsConstantArrayType(BaseExpr->getType());
@@ -8820,7 +9846,7 @@
     if (!size.isStrictlyPositive())
       return;
 
-    const Type* BaseType = getElementType(BaseExpr);
+    const Type *BaseType = BaseExpr->getType()->getPointeeOrArrayElementType();
     if (BaseType != EffectiveType) {
       // Make sure we're comparing apples to apples when comparing index to size
       uint64_t ptrarith_typesize = Context.getTypeSize(EffectiveType);
@@ -8970,7 +9996,7 @@
       Range = e->getSourceRange();
     }
   };
-}
+} // end anonymous namespace
 
 /// Consider whether capturing the given variable can possibly lead to
 /// a retain cycle.
@@ -9116,7 +10142,7 @@
       }
     }
   };
-}
+} // end anonymous namespace
 
 /// Check whether the given argument is a block which captures a
 /// variable.
@@ -9352,7 +10378,6 @@
       }
     }
   }
-
 }
 
 /// Check a message send to see if it's likely to cause a retain cycle.
@@ -9556,7 +10581,7 @@
 
   return true;
 }
-} // Unnamed namespace
+} // end anonymous namespace
 
 void Sema::DiagnoseEmptyStmtBody(SourceLocation StmtLoc,
                                  const Stmt *Body,
@@ -9652,7 +10677,6 @@
 /// DiagnoseSelfMove - Emits a warning if a value is moved to itself.
 void Sema::DiagnoseSelfMove(const Expr *LHSExpr, const Expr *RHSExpr,
                              SourceLocation OpLoc) {
-
   if (Diags.isIgnored(diag::warn_sizeof_pointer_expr_memaccess, OpLoc))
     return;
 
@@ -9891,7 +10915,7 @@
 
   return false;
 }
-}
+} // end anonymous namespace
 
 //===--- CHECK: pointer_with_type_tag attribute: datatypes should match ----//
 
@@ -10022,7 +11046,7 @@
   TypeInfo = I->second;
   return true;
 }
-} // unnamed namespace
+} // end anonymous namespace
 
 void Sema::RegisterTypeTagForDatatype(const IdentifierInfo *ArgumentKind,
                                       uint64_t MagicValue, QualType Type,
@@ -10055,7 +11079,7 @@
          (T1Kind == BuiltinType::Char_U && T2Kind == BuiltinType::UChar) ||
          (T1Kind == BuiltinType::Char_S && T2Kind == BuiltinType::SChar);
 }
-} // unnamed namespace
+} // end anonymous namespace
 
 void Sema::CheckArgumentWithTypeTag(const ArgumentWithTypeTagAttr *Attr,
                                     const Expr * const *ExprArgs) {
@@ -10135,3 +11159,67 @@
         << ArgumentExpr->getSourceRange()
         << TypeTagExpr->getSourceRange();
 }
+
+void Sema::AddPotentialMisalignedMembers(Expr *E, RecordDecl *RD, ValueDecl *MD,
+                                         CharUnits Alignment) {
+  MisalignedMembers.emplace_back(E, RD, MD, Alignment);
+}
+
+void Sema::DiagnoseMisalignedMembers() {
+  for (MisalignedMember &m : MisalignedMembers) {
+    Diag(m.E->getLocStart(), diag::warn_taking_address_of_packed_member)
+        << m.MD << m.RD << m.E->getSourceRange();
+  }
+  MisalignedMembers.clear();
+}
+
+void Sema::DiscardMisalignedMemberAddress(const Type *T, Expr *E) {
+  if (!T->isPointerType())
+    return;
+  if (isa<UnaryOperator>(E) &&
+      cast<UnaryOperator>(E)->getOpcode() == UO_AddrOf) {
+    auto *Op = cast<UnaryOperator>(E)->getSubExpr()->IgnoreParens();
+    if (isa<MemberExpr>(Op)) {
+      auto MA = std::find(MisalignedMembers.begin(), MisalignedMembers.end(),
+                          MisalignedMember(Op));
+      if (MA != MisalignedMembers.end() &&
+          Context.getTypeAlignInChars(T->getPointeeType()) <= MA->Alignment)
+        MisalignedMembers.erase(MA);
+    }
+  }
+}
+
+void Sema::RefersToMemberWithReducedAlignment(
+    Expr *E,
+    std::function<void(Expr *, RecordDecl *, ValueDecl *, CharUnits)> Action) {
+  const auto *ME = dyn_cast<MemberExpr>(E);
+  while (ME && isa<FieldDecl>(ME->getMemberDecl())) {
+    QualType BaseType = ME->getBase()->getType();
+    if (ME->isArrow())
+      BaseType = BaseType->getPointeeType();
+    RecordDecl *RD = BaseType->getAs<RecordType>()->getDecl();
+
+    ValueDecl *MD = ME->getMemberDecl();
+    bool ByteAligned = Context.getTypeAlignInChars(MD->getType()).isOne();
+    if (ByteAligned) // Attribute packed does not have any effect.
+      break;
+
+    if (!ByteAligned &&
+        (RD->hasAttr<PackedAttr>() || (MD->hasAttr<PackedAttr>()))) {
+      CharUnits Alignment = std::min(Context.getTypeAlignInChars(MD->getType()),
+                                     Context.getTypeAlignInChars(BaseType));
+      // Notify that this expression designates a member with reduced alignment
+      Action(E, RD, MD, Alignment);
+      break;
+    }
+    ME = dyn_cast<MemberExpr>(ME->getBase());
+  }
+}
+
+void Sema::CheckAddressOfPackedMember(Expr *rhs) {
+  using namespace std::placeholders;
+  RefersToMemberWithReducedAlignment(
+      rhs, std::bind(&Sema::AddPotentialMisalignedMembers, std::ref(*this), _1,
+                     _2, _3, _4));
+}
+
diff --git a/lib/Sema/SemaCodeComplete.cpp b/lib/Sema/SemaCodeComplete.cpp
index e94492e..36babc4 100644
--- a/lib/Sema/SemaCodeComplete.cpp
+++ b/lib/Sema/SemaCodeComplete.cpp
@@ -19,7 +19,6 @@
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
-#include "clang/Sema/ExternalSemaSource.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Overload.h"
 #include "clang/Sema/Scope.h"
@@ -1535,7 +1534,6 @@
                                    ResultBuilder &Results) {
   CodeCompletionAllocator &Allocator = Results.getAllocator();
   CodeCompletionBuilder Builder(Allocator, Results.getCodeCompletionTUInfo());
-  PrintingPolicy Policy = getCompletionPrintingPolicy(SemaRef);
   
   typedef CodeCompletionResult Result;
   switch (CCC) {
@@ -3063,6 +3061,7 @@
     case Decl::ClassTemplatePartialSpecialization:
       return CXCursor_ClassTemplatePartialSpecialization;
     case Decl::UsingDirective:     return CXCursor_UsingDirective;
+    case Decl::StaticAssert:       return CXCursor_StaticAssert;
     case Decl::TranslationUnit:    return CXCursor_TranslationUnit;
       
     case Decl::Using:
@@ -3226,7 +3225,7 @@
   
   // We need to have names for all of the parameters, if we're going to 
   // generate a forwarding call.
-  for (auto P : Method->params())
+  for (auto P : Method->parameters())
     if (!P->getDeclName())
       return;
 
@@ -3258,7 +3257,7 @@
                                          Overridden->getNameAsString()));
     Builder.AddChunk(CodeCompletionString::CK_LeftParen);
     bool FirstParam = true;
-    for (auto P : Method->params()) {
+    for (auto P : Method->parameters()) {
       if (FirstParam)
         FirstParam = false;
       else
@@ -3829,12 +3828,19 @@
   if (getLangOpts().C11 &&
       !(DS.getTypeQualifiers() & DeclSpec::TQ_atomic))
     Results.AddResult("_Atomic");
+  if (getLangOpts().MSVCCompat &&
+      !(DS.getTypeQualifiers() & DeclSpec::TQ_unaligned))
+    Results.AddResult("__unaligned");
   Results.ExitScope();
   HandleCodeCompleteResults(this, CodeCompleter, 
                             Results.getCompletionContext(),
                             Results.data(), Results.size());
 }
 
+void Sema::CodeCompleteBracketDeclarator(Scope *S) {
+  CodeCompleteExpression(S, QualType(getASTContext().getSizeType()));
+}
+
 void Sema::CodeCompleteCase(Scope *S) {
   if (getCurFunction()->SwitchStack.empty() || !CodeCompleter)
     return;
diff --git a/lib/Sema/SemaCoroutine.cpp b/lib/Sema/SemaCoroutine.cpp
index 4b4fd6b..c8715ff 100644
--- a/lib/Sema/SemaCoroutine.cpp
+++ b/lib/Sema/SemaCoroutine.cpp
@@ -244,7 +244,7 @@
   // If the expression is a temporary, materialize it as an lvalue so that we
   // can use it multiple times.
   if (E->getValueKind() == VK_RValue)
-    E = new (Context) MaterializeTemporaryExpr(E->getType(), E, true);
+    E = CreateMaterializeTemporaryExpr(E->getType(), E, true);
 
   // Build the await_ready, await_suspend, await_resume calls.
   ReadySuspendResumeResult RSS = buildCoawaitCalls(*this, Loc, E);
@@ -311,7 +311,7 @@
   // If the expression is a temporary, materialize it as an lvalue so that we
   // can use it multiple times.
   if (E->getValueKind() == VK_RValue)
-    E = new (Context) MaterializeTemporaryExpr(E->getType(), E, true);
+    E = CreateMaterializeTemporaryExpr(E->getType(), E, true);
 
   // Build the await_ready, await_suspend, await_resume calls.
   ReadySuspendResumeResult RSS = buildCoawaitCalls(*this, Loc, E);
diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp
index 6158d33..4d503cd 100644
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "TypeLocBuilder.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
@@ -41,12 +40,14 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Triple.h"
 #include <algorithm>
 #include <cstring>
 #include <functional>
+
 using namespace clang;
 using namespace sema;
 
@@ -88,7 +89,7 @@
   bool AllowClassTemplates;
 };
 
-}
+} // end anonymous namespace
 
 /// \brief Determine whether the token kind starts a simple-type-specifier.
 bool Sema::isSimpleTypeSpecifier(tok::TokenKind Kind) const {
@@ -107,6 +108,7 @@
   case tok::kw_half:
   case tok::kw_float:
   case tok::kw_double:
+  case tok::kw___float128:
   case tok::kw_wchar_t:
   case tok::kw_bool:
   case tok::kw___underlying_type:
@@ -134,7 +136,7 @@
   FoundNonType,
   FoundType
 };
-} // namespace
+} // end anonymous namespace
 
 /// \brief Tries to perform unqualified lookup of the type decls in bases for
 /// dependent class.
@@ -161,11 +163,17 @@
       auto *TD = TST->getTemplateName().getAsTemplateDecl();
       if (!TD)
         continue;
-      auto *BasePrimaryTemplate =
-          dyn_cast_or_null<CXXRecordDecl>(TD->getTemplatedDecl());
-      if (!BasePrimaryTemplate)
-        continue;
-      BaseRD = BasePrimaryTemplate;
+      if (auto *BasePrimaryTemplate =
+          dyn_cast_or_null<CXXRecordDecl>(TD->getTemplatedDecl())) {
+        if (BasePrimaryTemplate->getCanonicalDecl() != RD->getCanonicalDecl())
+          BaseRD = BasePrimaryTemplate;
+        else if (auto *CTD = dyn_cast<ClassTemplateDecl>(TD)) {
+          if (const ClassTemplatePartialSpecializationDecl *PS =
+                  CTD->findPartialSpecialization(Base.getType()))
+            if (PS->getCanonicalDecl() != RD->getCanonicalDecl())
+              BaseRD = PS;
+        }
+      }
     }
     if (BaseRD) {
       for (NamedDecl *ND : BaseRD->lookup(&II)) {
@@ -272,7 +280,7 @@
         // so build a dependent node to describe the type.
         if (WantNontrivialTypeSourceInfo)
           return ActOnTypenameType(S, SourceLocation(), *SS, II, NameLoc).get();
-        
+
         NestedNameSpecifierLoc QualifierLoc = SS->getWithLocInContext(Context);
         QualType T = CheckTypenameType(ETK_None, SourceLocation(), QualifierLoc,
                                        II, NameLoc);
@@ -281,7 +289,7 @@
 
       return nullptr;
     }
-    
+
     if (!LookupCtx->isDependentContext() &&
         RequireCompleteDeclContext(*SS, LookupCtx))
       return nullptr;
@@ -302,7 +310,7 @@
     if (ObjectTypePtr && Result.empty()) {
       // C++ [basic.lookup.classref]p3:
       //   If the unqualified-id is ~type-name, the type-name is looked up
-      //   in the context of the entire postfix-expression. If the type T of 
+      //   in the context of the entire postfix-expression. If the type T of
       //   the object expression is of a class type C, the type-name is also
       //   looked up in the scope of class C. At least one of the lookups shall
       //   find a name that refers to (possibly cv-qualified) T.
@@ -430,7 +438,7 @@
         // Construct a type with type-source information.
         TypeLocBuilder Builder;
         Builder.pushTypeSpec(T).setNameLoc(NameLoc);
-        
+
         T = getElaboratedType(ETK_None, *SS, T);
         ElaboratedTypeLoc ElabTL = Builder.push<ElaboratedTypeLoc>(T);
         ElabTL.setElaboratedKeywordLoc(SourceLocation());
@@ -471,17 +479,53 @@
   llvm_unreachable("something isn't in TU scope?");
 }
 
-ParsedType Sema::ActOnDelayedDefaultTemplateArg(const IdentifierInfo &II,
-                                                SourceLocation NameLoc) {
-  // Accepting an undeclared identifier as a default argument for a template
-  // type parameter is a Microsoft extension.
-  Diag(NameLoc, diag::ext_ms_delayed_template_argument) << &II;
+/// Walk outward from \p DC through dependent contexts and return the class of
+/// the innermost enclosing method whose class has dependent bases, or null if
+/// there is none. Enclosing CXXRecordDecls are not matched directly; that would
+/// allow unqualified dependent type names at class level, which MSVC rejects.
+static const CXXRecordDecl *
+findRecordWithDependentBasesOfEnclosingMethod(const DeclContext *DC) {
+  for (; DC && DC->isDependentContext(); DC = DC->getLookupParent()) {
+    DC = DC->getPrimaryContext();
+    if (const auto *MD = dyn_cast<CXXMethodDecl>(DC))
+      if (MD->getParent()->hasAnyDependentBases())
+        return MD->getParent();
+  }
+  return nullptr;
+}
 
-  // Build a fake DependentNameType that will perform lookup into CurContext at
-  // instantiation time.  The name specifier isn't dependent, so template
-  // instantiation won't transform it.  It will retry the lookup, however.
-  NestedNameSpecifier *NNS =
-      synthesizeCurrentNestedNameSpecifier(Context, CurContext);
+ParsedType Sema::ActOnMSVCUnknownTypeName(const IdentifierInfo &II,
+                                          SourceLocation NameLoc,
+                                          bool IsTemplateTypeArg) {
+  assert(getLangOpts().MSVCCompat && "shouldn't be called in non-MSVC mode");
+
+  NestedNameSpecifier *NNS = nullptr;
+  if (IsTemplateTypeArg && getCurScope()->isTemplateParamScope()) {
+    // If we weren't able to parse a default template argument, delay lookup
+    // until instantiation time by making a non-dependent DependentNameType. We
+    // pretend we saw a NestedNameSpecifier referring to the current scope, and
+    // lookup is retried.
+    // FIXME: This hurts our diagnostic quality, since we get errors like "no
+    // type named 'Foo' in 'current_namespace'" when the user didn't write any
+    // name specifiers.
+    NNS = synthesizeCurrentNestedNameSpecifier(Context, CurContext);
+    Diag(NameLoc, diag::ext_ms_delayed_template_argument) << &II;
+  } else if (const CXXRecordDecl *RD =
+                 findRecordWithDependentBasesOfEnclosingMethod(CurContext)) {
+    // Build a DependentNameType that will perform lookup into RD at
+    // instantiation time.
+    NNS = NestedNameSpecifier::Create(Context, nullptr, RD->isTemplateDecl(),
+                                      RD->getTypeForDecl());
+
+    // Diagnose that this identifier was undeclared, and retry the lookup during
+    // template instantiation.
+    Diag(NameLoc, diag::ext_undeclared_unqual_id_with_dependent_base) << &II
+                                                                      << RD;
+  } else {
+    // This is not a situation that we should recover from.
+    return ParsedType();
+  }
+
   QualType T = Context.getDependentNameType(ETK_None, NNS, &II);
 
   // Build type location information.  We synthesized the qualifier, so we have
@@ -548,7 +592,7 @@
       if (Context.hasSameUnqualifiedType(QualType(Ty, 1), Base.getType()))
         return true;
     return S->isFunctionPrototypeScope();
-  } 
+  }
   return CurContext->isFunctionOrMethod() || S->isFunctionPrototypeScope();
 }
 
@@ -623,11 +667,11 @@
 
   // FIXME: Should we move the logic that tries to recover from a missing tag
   // (struct, union, enum) from Parser::ParseImplicitInt here, instead?
-  
+
   if (!SS || (!SS->isSet() && !SS->isInvalid()))
     Diag(IILoc, diag::err_unknown_typename) << II;
   else if (DeclContext *DC = computeDeclContext(*SS, false))
-    Diag(IILoc, diag::err_typename_nested_not_found) 
+    Diag(IILoc, diag::err_typename_nested_not_found)
       << II << DC << SS->getRange();
   else if (isDependentScopeSpecifier(*SS)) {
     unsigned DiagID = diag::err_typename_missing;
@@ -641,25 +685,25 @@
     SuggestedType = ActOnTypenameType(S, SourceLocation(),
                                       *SS, *II, IILoc).get();
   } else {
-    assert(SS && SS->isInvalid() && 
+    assert(SS && SS->isInvalid() &&
            "Invalid scope specifier has already been diagnosed");
   }
 }
 
 /// \brief Determine whether the given result set contains either a type name
-/// or 
+/// or, in C++ when the next token is '<', a template name.
 static bool isResultTypeOrTemplate(LookupResult &R, const Token &NextToken) {
   bool CheckTemplate = R.getSema().getLangOpts().CPlusPlus &&
                        NextToken.is(tok::less);
-  
+
   for (LookupResult::iterator I = R.begin(), IEnd = R.end(); I != IEnd; ++I) {
     if (isa<TypeDecl>(*I) || isa<ObjCInterfaceDecl>(*I))
       return true;
-    
+
     if (CheckTemplate && isa<TemplateDecl>(*I))
       return true;
   }
-  
+
   return false;
 }
 
@@ -736,8 +780,8 @@
   ObjCMethodDecl *CurMethod = getCurMethodDecl();
 
   if (NextToken.is(tok::coloncolon)) {
-    BuildCXXNestedNameSpecifier(S, *Name, NameLoc, NextToken.getLocation(),
-                                QualType(), false, SS, nullptr, false);
+    NestedNameSpecInfo IdInfo(Name, NameLoc, NextToken.getLocation());
+    BuildCXXNestedNameSpecifier(S, IdInfo, false, SS, nullptr, false);
   }
 
   LookupResult Result(*this, Name, NameLoc, LookupOrdinaryName);
@@ -751,7 +795,7 @@
       return TypeInBase;
   }
 
-  // Perform lookup for Objective-C instance variables (including automatically 
+  // Perform lookup for Objective-C instance variables (including automatically
   // synthesized instance variables), if we're in an Objective-C method.
   // FIXME: This lookup really, really needs to be folded in to the normal
   // unqualified lookup mechanism.
@@ -760,10 +804,10 @@
     if (E.get() || E.isInvalid())
       return E;
   }
-  
+
   bool SecondTry = false;
   bool IsFilteredTemplateName = false;
-  
+
 Corrected:
   switch (Result.getResultKind()) {
   case LookupResult::NotFound:
@@ -774,18 +818,18 @@
       // FIXME: Reference?
       if (getLangOpts().CPlusPlus)
         return BuildDeclarationNameExpr(SS, Result, /*ADL=*/true);
-      
+
       // C90 6.3.2.2:
-      //   If the expression that precedes the parenthesized argument list in a 
-      //   function call consists solely of an identifier, and if no 
-      //   declaration is visible for this identifier, the identifier is 
+      //   If the expression that precedes the parenthesized argument list in a
+      //   function call consists solely of an identifier, and if no
+      //   declaration is visible for this identifier, the identifier is
       //   implicitly declared exactly as if, in the innermost block containing
       //   the function call, the declaration
       //
-      //     extern int identifier (); 
+      //     extern int identifier ();
       //
-      //   appeared. 
-      // 
+      //   appeared.
+      //
       // We also allow this in C99 as an extension.
       if (NamedDecl *D = ImplicitlyDefineFunction(NameLoc, *Name, S)) {
         Result.addDecl(D);
@@ -793,9 +837,9 @@
         return BuildDeclarationNameExpr(SS, Result, /*ADL=*/false);
       }
     }
-    
-    // In C, we first see whether there is a tag type by the same name, in 
-    // which case it's likely that the user just forgot to write "enum", 
+
+    // In C, we first see whether there is a tag type by the same name, in
+    // which case it's likely that the user just forgot to write "enum",
     // "struct", or "union".
     if (!getLangOpts().CPlusPlus && !SecondTry &&
         isTagTypeWithMissingTag(*this, Result, S, SS, Name, NameLoc)) {
@@ -807,7 +851,7 @@
     if (!SecondTry && CCC) {
       SecondTry = true;
       if (TypoCorrection Corrected = CorrectTypo(Result.getLookupNameInfo(),
-                                                 Result.getLookupKind(), S, 
+                                                 Result.getLookupKind(), S,
                                                  &SS, std::move(CCC),
                                                  CTK_ErrorRecovery)) {
         unsigned UnqualifiedDiag = diag::err_undeclared_var_use_suggest;
@@ -819,8 +863,8 @@
             UnderlyingFirstDecl && isa<TemplateDecl>(UnderlyingFirstDecl)) {
           UnqualifiedDiag = diag::err_no_template_suggest;
           QualifiedDiag = diag::err_no_member_template_suggest;
-        } else if (UnderlyingFirstDecl && 
-                   (isa<TypeDecl>(UnderlyingFirstDecl) || 
+        } else if (UnderlyingFirstDecl &&
+                   (isa<TypeDecl>(UnderlyingFirstDecl) ||
                     isa<ObjCInterfaceDecl>(UnderlyingFirstDecl) ||
                     isa<ObjCCompatibleAliasDecl>(UnderlyingFirstDecl))) {
           UnqualifiedDiag = diag::err_unknown_typename_suggest;
@@ -861,28 +905,28 @@
           ExprResult E(LookupInObjCMethod(Result, S, Ivar->getIdentifier()));
           return E;
         }
-        
+
         goto Corrected;
       }
     }
-      
+
     // We failed to correct; just fall through and let the parser deal with it.
     Result.suppressDiagnostics();
     return NameClassification::Unknown();
-      
+
   case LookupResult::NotFoundInCurrentInstantiation: {
-    // We performed name lookup into the current instantiation, and there were 
+    // We performed name lookup into the current instantiation, and there were
     // dependent bases, so we treat this result the same way as any other
     // dependent nested-name-specifier.
-      
+
     // C++ [temp.res]p2:
-    //   A name used in a template declaration or definition and that is 
-    //   dependent on a template-parameter is assumed not to name a type 
-    //   unless the applicable name lookup finds a type name or the name is 
+    //   A name used in a template declaration or definition and that is
+    //   dependent on a template-parameter is assumed not to name a type
+    //   unless the applicable name lookup finds a type name or the name is
     //   qualified by the keyword typename.
     //
     // FIXME: If the next token is '<', we might want to ask the parser to
-    // perform some heroics to see if we actually have a 
+    // perform some heroics to see if we actually have a
     // template-argument-list, which would indicate a missing 'template'
     // keyword here.
     return ActOnDependentIdExpression(SS, /*TemplateKWLoc=*/SourceLocation(),
@@ -894,7 +938,7 @@
   case LookupResult::FoundOverloaded:
   case LookupResult::FoundUnresolvedValue:
     break;
-      
+
   case LookupResult::Ambiguous:
     if (getLangOpts().CPlusPlus && NextToken.is(tok::less) &&
         hasAnyAcceptableTemplateNames(Result)) {
@@ -915,29 +959,29 @@
         break;
       }
     }
-      
+
     // Diagnose the ambiguity and return an error.
     return NameClassification::Error();
   }
-  
+
   if (getLangOpts().CPlusPlus && NextToken.is(tok::less) &&
       (IsFilteredTemplateName || hasAnyAcceptableTemplateNames(Result))) {
     // C++ [temp.names]p3:
     //   After name lookup (3.4) finds that a name is a template-name or that
     //   an operator-function-id or a literal- operator-id refers to a set of
-    //   overloaded functions any member of which is a function template if 
+    //   overloaded functions any member of which is a function template if
     //   this is followed by a <, the < is always taken as the delimiter of a
     //   template-argument-list and never as the less-than operator.
     if (!IsFilteredTemplateName)
       FilterAcceptableTemplateNames(Result);
-    
+
     if (!Result.empty()) {
       bool IsFunctionTemplate;
       bool IsVarTemplate;
       TemplateName Template;
       if (Result.end() - Result.begin() > 1) {
         IsFunctionTemplate = true;
-        Template = Context.getOverloadedTemplateName(Result.begin(), 
+        Template = Context.getOverloadedTemplateName(Result.begin(),
                                                      Result.end());
       } else {
         TemplateDecl *TD
@@ -946,19 +990,19 @@
         IsVarTemplate = isa<VarTemplateDecl>(TD);
 
         if (SS.isSet() && !SS.isInvalid())
-          Template = Context.getQualifiedTemplateName(SS.getScopeRep(), 
+          Template = Context.getQualifiedTemplateName(SS.getScopeRep(),
                                                     /*TemplateKeyword=*/false,
                                                       TD);
         else
           Template = TemplateName(TD);
       }
-      
+
       if (IsFunctionTemplate) {
         // Function templates always go through overload resolution, at which
         // point we'll perform the various checks (e.g., accessibility) we need
         // to based on which function we selected.
         Result.suppressDiagnostics();
-        
+
         return NameClassification::FunctionTemplate(Template);
       }
 
@@ -984,17 +1028,17 @@
             dyn_cast<ObjCCompatibleAliasDecl>(FirstDecl))
       Class = Alias->getClassInterface();
   }
-  
+
   if (Class) {
     DiagnoseUseOfDecl(Class, NameLoc);
-    
+
     if (NextToken.is(tok::period)) {
       // Interface. <something> is parsed as a property reference expression.
       // Just return "unknown" as a fall-through for now.
       Result.suppressDiagnostics();
       return NameClassification::Unknown();
     }
-    
+
     QualType T = Context.getObjCInterfaceType(Class);
     return ParsedType::make(T);
   }
@@ -1018,7 +1062,7 @@
       return buildNestedType(*this, SS, T, NameLoc);
     return ParsedType::make(T);
   }
-  
+
   if (FirstDecl->isCXXClassMember())
     return BuildPossibleImplicitMemberExpr(SS, SourceLocation(), Result,
                                            nullptr, S);
@@ -1035,15 +1079,15 @@
   // Functions defined inline within classes aren't parsed until we've
   // finished parsing the top-level class, so the top-level class is
   // the context we'll need to return to.
-  // A Lambda call operator whose parent is a class must not be treated 
+  // A Lambda call operator whose parent is a class must not be treated
   // as an inline member function.  A Lambda can be used legally
   // either as an in-class member initializer or a default argument.  These
   // are parsed once the class has been marked complete and so the containing
   // context would be the nested class (when the lambda is defined in one);
-  // If the class is not complete, then the lambda is being used in an 
+  // If the class is not complete, then the lambda is being used in an
   // ill-formed fashion (such as to specify the width of a bit-field, or
-  // in an array-bound) - in which case we still want to return the 
-  // lexically containing DC (which could be a nested class). 
+  // in an array-bound) - in which case we still want to return the
+  // lexically containing DC (which could be a nested class).
   if (isa<FunctionDecl>(DC) && !isLambdaCallOperator(DC)) {
     DC = DC->getLexicalParent();
 
@@ -1143,7 +1187,6 @@
   // disappear.
 }
 
-
 void Sema::ActOnReenterFunctionContext(Scope* S, Decl *D) {
   // We assume that the caller has already called
   // ActOnReenterTemplateScope so getTemplatedDecl() works.
@@ -1168,7 +1211,6 @@
   }
 }
 
-
 void Sema::ActOnExitFunctionContext() {
   // Same implementation as PopDeclContext, but returns to the lexical parent,
   // rather than the top-level class.
@@ -1177,7 +1219,6 @@
   assert(CurContext && "Popped translation unit!");
 }
 
-
 /// \brief Determine whether we allow overloading of the function
 /// PrevDecl with another declaration.
 ///
@@ -1226,7 +1267,7 @@
       cast<FunctionDecl>(D)->isFunctionTemplateSpecialization())
     return;
 
-  // If this replaces anything in the current scope, 
+  // If this replaces anything in the current scope,
   IdentifierResolver::iterator I = IdResolver.begin(D->getDeclName()),
                                IEnd = IdResolver.end();
   for (; I != IEnd; ++I) {
@@ -1240,7 +1281,7 @@
   }
 
   S->AddDecl(D);
-  
+
   if (isa<LabelDecl>(D) && !cast<LabelDecl>(D)->isGnuLocal()) {
     // Implicitly-generated labels may end up getting generated in an order that
     // isn't strictly lexical, which breaks name lookup. Be careful to insert
@@ -1253,7 +1294,7 @@
       } else if (IDC->Encloses(CurContext))
         break;
     }
-    
+
     IdResolver.InsertDeclAfter(I, D);
   } else {
     IdResolver.AddDecl(D);
@@ -1416,6 +1457,9 @@
     if (VD->isStaticDataMember() &&
         VD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
       return false;
+
+    if (VD->isInline() && !isMainFileLoc(*this, VD->getLocation()))
+      return false;
   } else {
     return false;
   }
@@ -1469,7 +1513,7 @@
 
   if (isa<TypedefNameDecl>(D))
     return true;
-  
+
   // White-list anything that isn't a local variable.
   if (!isa<VarDecl>(D) || isa<ParmVarDecl>(D) || isa<ImplicitParamDecl>(D))
     return false;
@@ -1487,7 +1531,7 @@
     }
 
     // If we failed to complete the type for some reason, or if the type is
-    // dependent, don't diagnose the variable. 
+    // dependent, don't diagnose the variable.
     if (Ty->isIncompleteType() || Ty->isDependentType())
       return false;
 
@@ -1517,7 +1561,7 @@
 
     // TODO: __attribute__((unused)) templates?
   }
-  
+
   return true;
 }
 
@@ -1531,7 +1575,6 @@
     Hint = FixItHint::CreateRemoval(CharSourceRange::
                                     getCharRange(D->getLocStart(), AfterColon));
   }
-  return;
 }
 
 void Sema::DiagnoseUnusedNestedTypedefs(const RecordDecl *D) {
@@ -1558,7 +1601,7 @@
     UnusedLocalTypedefNameCandidates.insert(TD);
     return;
   }
-  
+
   FixItHint Hint;
   GenerateFixForUnusedDecl(D, Context, Hint);
 
@@ -1608,13 +1651,23 @@
       if (const auto *RD = dyn_cast<RecordDecl>(D))
         DiagnoseUnusedNestedTypedefs(RD);
     }
-    
+
     // If this was a forward reference to a label, verify it was defined.
     if (LabelDecl *LD = dyn_cast<LabelDecl>(D))
       CheckPoppedLabel(LD, *this);
-    
-    // Remove this name from our lexical scope.
+
+    // Remove this name from our lexical scope, and warn if ShadowingDecls still
+    // records it as shadowing a field.
     IdResolver.RemoveDecl(D);
+    auto ShadowI = ShadowingDecls.find(D);
+    if (ShadowI != ShadowingDecls.end()) {
+      if (const auto *FD = dyn_cast<FieldDecl>(ShadowI->second)) {
+        Diag(D->getLocation(), diag::warn_ctor_parm_shadows_field)
+            << D << FD << FD->getParent();
+        Diag(FD->getLocation(), diag::note_previous_declaration);
+      }
+      ShadowingDecls.erase(ShadowI);
+    }
   }
 }
 
@@ -1697,7 +1750,7 @@
   if (!II->isStr("objc_msgSendSuper"))
     return;
   ASTContext &Context = ThisSema.Context;
-    
+
   LookupResult Result(ThisSema, &Context.Idents.get("objc_super"),
                       SourceLocation(), Sema::LookupTagName);
   ThisSema.LookupName(Result, S);
@@ -1748,6 +1801,9 @@
           << Context.BuiltinInfo.getName(ID);
   }
 
+  if (R.isNull())
+    return nullptr;
+
   DeclContext *Parent = Context.getTranslationUnitDecl();
   if (getLangOpts().CPlusPlus) {
     LinkageSpecDecl *CLinkageDecl =
@@ -1855,13 +1911,13 @@
     if (Old->getLocation().isValid())
       Diag(Old->getLocation(), diag::note_previous_definition);
     New->setInvalidDecl();
-    return true;    
+    return true;
   }
-  
+
   if (OldType != NewType &&
       !OldType->isDependentType() &&
       !NewType->isDependentType() &&
-      !Context.hasSameType(OldType, NewType)) { 
+      !Context.hasSameType(OldType, NewType)) {
     int Kind = isa<TypeAliasDecl>(Old) ? 1 : 0;
     Diag(New->getLocation(), diag::err_redefinition_different_typedef)
       << Kind << NewType << OldType;
@@ -2000,7 +2056,7 @@
       return;
 
     // C++0x [dcl.typedef]p4:
-    //   In a given class scope, a typedef specifier can be used to redefine 
+    //   In a given class scope, a typedef specifier can be used to redefine
     //   any class-name declared in that scope that is not also a typedef-name
     //   to refer to the type to which it already refers.
     //
@@ -2032,7 +2088,7 @@
   // Modules always permit redefinition of typedefs, as does C11.
   if (getLangOpts().Modules || getLangOpts().C11)
     return;
-  
+
   // If we have a redefinition of a typedef in C, emit a warning.  This warning
   // is normally mapped to an error, but can be controlled with
   // -Wtypedef-redefinition.  If either the original or the redefinition is
@@ -2260,6 +2316,8 @@
   if (NewAttr) {
     NewAttr->setInherited(true);
     D->addAttr(NewAttr);
+    if (isa<MSInheritanceAttr>(NewAttr))
+      S.Consumer.AssignInheritanceModel(cast<CXXRecordDecl>(D));
     return true;
   }
 
@@ -2275,11 +2333,8 @@
       return Def;
     return VD->getActingDefinition();
   }
-  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
-    const FunctionDecl* Def;
-    if (FD->isDefined(Def))
-      return Def;
-  }
+  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
+    return FD->getDefinition();
   return nullptr;
 }
 
@@ -2304,7 +2359,7 @@
   for (unsigned I = 0, E = NewAttributes.size(); I != E;) {
     const Attr *NewAttribute = NewAttributes[I];
 
-    if (isa<AliasAttr>(NewAttribute)) {
+    if (isa<AliasAttr>(NewAttribute) || isa<IFuncAttr>(NewAttribute)) {
       if (FunctionDecl *FD = dyn_cast<FunctionDecl>(New)) {
         Sema::SkipBodyInfo SkipBody;
         S.CheckForFunctionRedefinition(FD, cast<FunctionDecl>(Def), &SkipBody);
@@ -2347,7 +2402,7 @@
       ++I;
       continue;
     } else if (const AlignedAttr *AA = dyn_cast<AlignedAttr>(NewAttribute)) {
-      if (AA->isAlignas()) { 
+      if (AA->isAlignas()) {
         // C++11 [dcl.align]p6:
         //   if any declaration of an entity has an alignment-specifier,
         //   every defining declaration of that entity shall specify an
@@ -2404,6 +2459,24 @@
     }
   }
 
+  // A redeclaration cannot add abi_tags that the previous declaration lacks.
+  if (const auto *NewAbiTagAttr = New->getAttr<AbiTagAttr>()) {
+    if (const auto *OldAbiTagAttr = Old->getAttr<AbiTagAttr>()) {
+      for (const auto &NewTag : NewAbiTagAttr->tags()) {
+        if (std::find(OldAbiTagAttr->tags_begin(), OldAbiTagAttr->tags_end(),
+                      NewTag) == OldAbiTagAttr->tags_end()) {
+          Diag(NewAbiTagAttr->getLocation(),
+               diag::err_new_abi_tag_on_redeclaration)
+              << NewTag;
+          Diag(OldAbiTagAttr->getLocation(), diag::note_previous_declaration);
+        }
+      }
+    } else {
+      Diag(NewAbiTagAttr->getLocation(), diag::err_abi_tag_on_redeclaration);
+      Diag(Old->getLocation(), diag::note_previous_declaration);
+    }
+  }
+
   if (!Old->hasAttrs())
     return;
 
@@ -2527,7 +2600,7 @@
   QualType PromotedType;
 };
 
-}
+} // end anonymous namespace
 
 /// getSpecialMember - get the special member enum for a method.
 Sema::CXXSpecialMember Sema::getSpecialMember(const CXXMethodDecl *MD) {
@@ -2807,11 +2880,11 @@
       Diag(OldLocation, diag::note_previous_declaration);
       return true;
     }
-    
+
     NewTypeInfo = NewTypeInfo.withProducesResult(true);
     RequiresAdjustment = true;
   }
-  
+
   if (RequiresAdjustment) {
     const FunctionType *AdjustedType = New->getType()->getAs<FunctionType>();
     AdjustedType = Context.adjustFunctionType(AdjustedType, NewTypeInfo);
@@ -2964,11 +3037,11 @@
           NewMethod->setImplicit();
         } else {
           Diag(NewMethod->getLocation(),
-               diag::err_definition_of_implicitly_declared_member) 
+               diag::err_definition_of_implicitly_declared_member)
             << New << getSpecialMember(OldMethod);
           return true;
         }
-      } else if (OldMethod->isExplicitlyDefaulted() && !isFriend) {
+      } else if (OldMethod->getFirstDecl()->isExplicitlyDefaulted() && !isFriend) {
         Diag(NewMethod->getLocation(),
              diag::err_definition_of_explicitly_defaulted_member)
           << getSpecialMember(OldMethod);
@@ -3229,10 +3302,8 @@
   return false;
 }
 
-
 void Sema::mergeObjCMethodDecls(ObjCMethodDecl *newMethod,
                                 ObjCMethodDecl *oldMethod) {
-
   // Merge the attributes, including deprecated/unavailable
   AvailabilityMergeKind MergeKind =
     isa<ObjCProtocolDecl>(oldMethod->getDeclContext())
@@ -3253,6 +3324,22 @@
   CheckObjCMethodOverride(newMethod, oldMethod);
 }
 
+static void diagnoseVarDeclTypeMismatch(Sema &S, VarDecl *New, VarDecl* Old) {
+  assert(!S.Context.hasSameType(New->getType(), Old->getType()));
+  // The types are known to differ: error at New (redefinition or redeclaration).
+  S.Diag(New->getLocation(), New->isThisDeclarationADefinition()
+         ? diag::err_redefinition_different_type
+         : diag::err_redeclaration_different_type)
+    << New->getDeclName() << New->getType() << Old->getType();
+  // Emit the note picked by getNoteDiagForInvalidRedeclaration at its location.
+  diag::kind PrevDiag;
+  SourceLocation OldLocation;
+  std::tie(PrevDiag, OldLocation)
+    = getNoteDiagForInvalidRedeclaration(Old, New);
+  S.Diag(OldLocation, PrevDiag);
+  New->setInvalidDecl();
+}
+
 /// MergeVarDeclTypes - We parsed a variable 'New' which has the same name and
 /// scope as a previous declaration 'Old'.  Figure out how to merge their types,
 /// emitting diagnostics as appropriate.
@@ -3279,21 +3366,40 @@
     //   object or function shall be identical, except that declarations for an
     //   array object can specify array types that differ by the presence or
     //   absence of a major array bound (8.3.4).
-    else if (Old->getType()->isIncompleteArrayType() &&
-             New->getType()->isArrayType()) {
+    else if (Old->getType()->isArrayType() && New->getType()->isArrayType()) {
       const ArrayType *OldArray = Context.getAsArrayType(Old->getType());
       const ArrayType *NewArray = Context.getAsArrayType(New->getType());
-      if (Context.hasSameType(OldArray->getElementType(),
-                              NewArray->getElementType()))
-        MergedT = New->getType();
-    } else if (Old->getType()->isArrayType() &&
-               New->getType()->isIncompleteArrayType()) {
-      const ArrayType *OldArray = Context.getAsArrayType(Old->getType());
-      const ArrayType *NewArray = Context.getAsArrayType(New->getType());
-      if (Context.hasSameType(OldArray->getElementType(),
-                              NewArray->getElementType()))
-        MergedT = Old->getType();
-    } else if (New->getType()->isObjCObjectPointerType() &&
+
+      // We are merging a variable declaration New into Old. If New has an array
+      // bound and its type differs from any declaration in Old's redeclaration
+      // chain that also has a (non-dependent) bound, diagnose the mismatch.
+      if (!NewArray->isIncompleteArrayType() && !NewArray->isDependentType()) {
+        for (VarDecl *PrevVD = Old->getMostRecentDecl(); PrevVD;
+             PrevVD = PrevVD->getPreviousDecl()) {
+          const ArrayType *PrevVDTy = Context.getAsArrayType(PrevVD->getType());
+          if (PrevVDTy->isIncompleteArrayType() || PrevVDTy->isDependentType())
+            continue;
+
+          if (!Context.hasSameType(NewArray, PrevVDTy))
+            return diagnoseVarDeclTypeMismatch(*this, New, PrevVD);
+        }
+      }
+
+      if (OldArray->isIncompleteArrayType() && NewArray->isArrayType()) {
+        if (Context.hasSameType(OldArray->getElementType(),
+                                NewArray->getElementType()))
+          MergedT = New->getType();
+      }
+      // FIXME: Check visibility. New is hidden but has a complete type. If New
+      // has no array bound, it should not inherit one from Old, if Old is not
+      // visible.
+      else if (OldArray->isArrayType() && NewArray->isIncompleteArrayType()) {
+        if (Context.hasSameType(OldArray->getElementType(),
+                                NewArray->getElementType()))
+          MergedT = Old->getType();
+      }
+    }
+    else if (New->getType()->isObjCObjectPointerType() &&
                Old->getType()->isObjCObjectPointerType()) {
       MergedT = Context.mergeObjCGCQualifiers(New->getType(),
                                               Old->getType());
@@ -3319,27 +3425,7 @@
         New->setType(Context.DependentTy);
       return;
     }
-
-    // FIXME: Even if this merging succeeds, some other non-visible declaration
-    // of this variable might have an incompatible type. For instance:
-    //
-    //   extern int arr[];
-    //   void f() { extern int arr[2]; }
-    //   void g() { extern int arr[3]; }
-    //
-    // Neither C nor C++ requires a diagnostic for this, but we should still try
-    // to diagnose it.
-    Diag(New->getLocation(), New->isThisDeclarationADefinition()
-                                 ? diag::err_redefinition_different_type
-                                 : diag::err_redeclaration_different_type)
-        << New->getDeclName() << New->getType() << Old->getType();
-
-    diag::kind PrevDiag;
-    SourceLocation OldLocation;
-    std::tie(PrevDiag, OldLocation) =
-        getNoteDiagForInvalidRedeclaration(Old, New);
-    Diag(OldLocation, PrevDiag);
-    return New->setInvalidDecl();
+    return diagnoseVarDeclTypeMismatch(*this, New, Old);
   }
 
   // Don't actually update the type on the new declaration if the old
@@ -3433,17 +3519,17 @@
 
   // C++ [class.mem]p1:
   //   A member shall not be declared twice in the member-specification [...]
-  // 
+  //
   // Here, we need only consider static data members.
   if (Old->isStaticDataMember() && !New->isOutOfLine()) {
-    Diag(New->getLocation(), diag::err_duplicate_member) 
+    Diag(New->getLocation(), diag::err_duplicate_member)
       << New->getIdentifier();
     Diag(Old->getLocation(), diag::note_previous_declaration);
     New->setInvalidDecl();
   }
-  
+
   mergeDeclAttributes(New, Old);
-  // Warn if an already-declared variable is made a weak_import in a subsequent 
+  // Warn if an already-declared variable is made a weak_import in a subsequent
   // declaration
   if (New->hasAttr<WeakImportAttr>() &&
       Old->getStorageClass() == SC_None &&
@@ -3541,6 +3627,23 @@
     return New->setInvalidDecl();
   }
 
+  if (New->isInline() && !Old->getMostRecentDecl()->isInline()) {
+    if (VarDecl *Def = Old->getDefinition()) {
+      // C++1z [dcl.fcn.spec]p4:
+      //   If the definition of a variable appears in a translation unit before
+      //   its first declaration as inline, the program is ill-formed.
+      Diag(New->getLocation(), diag::err_inline_decl_follows_def) << New;
+      Diag(Def->getLocation(), diag::note_previous_definition);
+    }
+  }
+
+  // If this redeclaration makes the variable inline, we may need to add it to
+  // UndefinedButUsed.
+  if (!Old->isInline() && New->isInline() && Old->isUsed(false) &&
+      !Old->getDefinition() && !New->isThisDeclarationADefinition())
+    UndefinedButUsed.insert(std::make_pair(Old->getCanonicalDecl(),
+                                           SourceLocation()));
+
   if (New->getTLSKind() != Old->getTLSKind()) {
     if (!Old->getTLSKind()) {
       Diag(New->getLocation(), diag::err_thread_non_thread) << New->getDeclName();
@@ -3572,6 +3675,12 @@
          New->getDeclContext()->isDependentContext())) {
       // The previous definition is hidden, and multiple definitions are
       // permitted (in separate TUs). Form another definition of it.
+    } else if (Old->isStaticDataMember() &&
+               Old->getCanonicalDecl()->isInline() &&
+               Old->getCanonicalDecl()->isConstexpr()) {
+      // This definition won't be a definition any more once it's been merged.
+      Diag(New->getLocation(),
+           diag::warn_deprecated_redundant_constexpr_static_def);
     } else {
       Diag(New->getLocation(), diag::err_redefinition) << New;
       Diag(Def->getLocation(), diag::note_previous_definition);
@@ -3600,13 +3709,18 @@
   New->setAccess(Old->getAccess());
   if (NewTemplate)
     NewTemplate->setAccess(New->getAccess());
+
+  if (Old->isInline())
+    New->setImplicitlyInline();
 }
 
 /// ParsedFreeStandingDeclSpec - This method is invoked when a declspec with
 /// no declarator (e.g. "struct foo;") is parsed.
-Decl *Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS,
-                                       DeclSpec &DS) {
-  return ParsedFreeStandingDeclSpec(S, AS, DS, MultiTemplateParamsArg());
+Decl *
+Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS, DeclSpec &DS,
+                                 RecordDecl *&AnonRecord) {
+  return ParsedFreeStandingDeclSpec(S, AS, DS, MultiTemplateParamsArg(), false,
+                                    AnonRecord);
 }
 
 // The MS ABI changed between VS2013 and VS2015 with regard to numbers used to
@@ -3712,10 +3826,11 @@
 /// ParsedFreeStandingDeclSpec - This method is invoked when a declspec with
 /// no declarator (e.g. "struct foo;") is parsed. It also accepts template
 /// parameters to cope with template friend declarations.
-Decl *Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS,
-                                       DeclSpec &DS,
-                                       MultiTemplateParamsArg TemplateParams,
-                                       bool IsExplicitInstantiation) {
+Decl *
+Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS, DeclSpec &DS,
+                                 MultiTemplateParamsArg TemplateParams,
+                                 bool IsExplicitInstantiation,
+                                 RecordDecl *&AnonRecord) {
   Decl *TagD = nullptr;
   TagDecl *Tag = nullptr;
   if (DS.getTypeSpecType() == DeclSpec::TST_class ||
@@ -3753,6 +3868,10 @@
            << DS.getSourceRange();
   }
 
+  if (DS.isInlineSpecified())
+    Diag(DS.getInlineSpecLoc(), diag::err_inline_non_function)
+        << getLangOpts().CPlusPlus1z;
+
   if (DS.isConstexprSpecified()) {
     // C++0x [dcl.constexpr]p1: constexpr can only be applied to declarations
     // and definitions of functions and variables.
@@ -3810,9 +3929,19 @@
     if (!Record->getDeclName() && Record->isCompleteDefinition() &&
         DS.getStorageClassSpec() != DeclSpec::SCS_typedef) {
       if (getLangOpts().CPlusPlus ||
-          Record->getDeclContext()->isRecord())
+          Record->getDeclContext()->isRecord()) {
+        // If CurContext is a DeclContext that can contain statements,
+        // RecursiveASTVisitor won't visit the decls that
+        // BuildAnonymousStructOrUnion() will put into CurContext.
+        // Also store them here so that they can be part of the
+        // DeclStmt that gets created in this case.
+        // FIXME: Also return the IndirectFieldDecls created by
+        // BuildAnonymousStructOrUnion(), for the same reason?
+        if (CurContext->isFunctionOrMethod())
+          AnonRecord = Record;
         return BuildAnonymousStructOrUnion(S, DS, AS, Record,
                                            Context.getPrintingPolicy());
+      }
 
       DeclaresAnything = false;
     }
@@ -3934,6 +4063,8 @@
     // Restrict is covered above.
     if (DS.getTypeQualifiers() & DeclSpec::TQ_atomic)
       Diag(DS.getAtomicSpecLoc(), DiagID) << "_Atomic";
+    if (DS.getTypeQualifiers() & DeclSpec::TQ_unaligned)
+      Diag(DS.getUnalignedSpecLoc(), DiagID) << "__unaligned";
   }
 
   // Warn about ignored type attributes, for example:
@@ -4000,12 +4131,10 @@
 ///
 /// This routine is recursive, injecting the names of nested anonymous
 /// structs/unions into the owning context and scope as well.
-static bool InjectAnonymousStructOrUnionMembers(Sema &SemaRef, Scope *S,
-                                         DeclContext *Owner,
-                                         RecordDecl *AnonRecord,
-                                         AccessSpecifier AS,
-                                         SmallVectorImpl<NamedDecl *> &Chaining,
-                                         bool MSAnonStruct) {
+static bool
+InjectAnonymousStructOrUnionMembers(Sema &SemaRef, Scope *S, DeclContext *Owner,
+                                    RecordDecl *AnonRecord, AccessSpecifier AS,
+                                    SmallVectorImpl<NamedDecl *> &Chaining) {
   bool Invalid = false;
 
   // Look every FieldDecl and IndirectFieldDecl with a name.
@@ -4041,7 +4170,7 @@
 
         IndirectFieldDecl *IndirectField = IndirectFieldDecl::Create(
             SemaRef.Context, Owner, VD->getLocation(), VD->getIdentifier(),
-            VD->getType(), NamedChain, Chaining.size());
+            VD->getType(), {NamedChain, Chaining.size()});
 
         for (const auto *Attr : VD->attrs())
           IndirectField->addAttr(Attr->clone(SemaRef.Context));
@@ -4151,7 +4280,7 @@
             cast<NamespaceDecl>(Owner)->getDeclName()))) {
         Diag(Record->getLocation(), diag::err_anonymous_union_not_static)
           << FixItHint::CreateInsertion(Record->getLocation(), "static ");
-  
+
         // Recover by adding 'static'.
         DS.SetStorageClassSpec(*this, DeclSpec::SCS_static, SourceLocation(),
                                PrevSpec, DiagID, Policy);
@@ -4164,9 +4293,9 @@
         Diag(DS.getStorageClassSpecLoc(),
              diag::err_anonymous_union_with_storage_spec)
           << FixItHint::CreateRemoval(DS.getStorageClassSpecLoc());
-  
+
         // Recover by removing the storage specifier.
-        DS.SetStorageClassSpec(*this, DeclSpec::SCS_unspecified, 
+        DS.SetStorageClassSpec(*this, DeclSpec::SCS_unspecified,
                                SourceLocation(),
                                PrevSpec, DiagID, Context.getPrintingPolicy());
       }
@@ -4193,6 +4322,11 @@
              diag::ext_anonymous_struct_union_qualified)
           << Record->isUnion() << "_Atomic"
           << FixItHint::CreateRemoval(DS.getAtomicSpecLoc());
+      if (DS.getTypeQualifiers() & DeclSpec::TQ_unaligned)
+        Diag(DS.getUnalignedSpecLoc(),
+             diag::ext_anonymous_struct_union_qualified)
+          << Record->isUnion() << "__unaligned"
+          << FixItHint::CreateRemoval(DS.getUnalignedSpecLoc());
 
       DS.ClearTypeQualifiers();
     }
@@ -4262,7 +4396,7 @@
           DK = diag::err_anonymous_record_with_function;
         else if (isa<VarDecl>(Mem))
           DK = diag::err_anonymous_record_with_static;
-        
+
         // Visual C++ allows type definition in anonymous struct or union.
         if (getLangOpts().MicrosoftExt &&
             DK == diag::err_anonymous_record_with_type)
@@ -4348,8 +4482,7 @@
   SmallVector<NamedDecl*, 2> Chain;
   Chain.push_back(Anon);
 
-  if (InjectAnonymousStructOrUnionMembers(*this, S, Owner, Record, AS,
-                                          Chain, false))
+  if (InjectAnonymousStructOrUnionMembers(*this, S, Owner, Record, AS, Chain))
     Invalid = true;
 
   if (VarDecl *NewVD = dyn_cast<VarDecl>(Anon)) {
@@ -4421,7 +4554,7 @@
   if (RequireCompleteType(Anon->getLocation(), RecTy,
                           diag::err_field_incomplete) ||
       InjectAnonymousStructOrUnionMembers(*this, S, CurContext, RecordDef,
-                                          AS_none, Chain, true)) {
+                                          AS_none, Chain)) {
     Anon->setInvalidDecl();
     ParentDecl->setInvalidDecl();
   }
@@ -4670,7 +4803,7 @@
 }
 
 /// DiagnoseClassNameShadow - Implement C++ [class.mem]p13:
-///   If T is the name of a class, then each of the following shall have a 
+///   If T is the name of a class, then each of the following shall have a
 ///   name different from T:
 ///     - every static data member of class T;
 ///     - every member function of class T
@@ -4691,12 +4824,12 @@
   return false;
 }
 
-/// \brief Diagnose a declaration whose declarator-id has the given 
+/// \brief Diagnose a declaration whose declarator-id has the given
 /// nested-name-specifier.
 ///
 /// \param SS The nested-name-specifier of the declarator-id.
 ///
-/// \param DC The declaration context to which the nested-name-specifier 
+/// \param DC The declaration context to which the nested-name-specifier
 /// resolves.
 ///
 /// \param Name The name of the entity being declared.
@@ -4742,15 +4875,15 @@
       Diag(Loc, diag::err_invalid_declarator_global_scope)
         << Name << SS.getRange();
     else if (isa<FunctionDecl>(Cur))
-      Diag(Loc, diag::err_invalid_declarator_in_function) 
+      Diag(Loc, diag::err_invalid_declarator_in_function)
         << Name << SS.getRange();
     else if (isa<BlockDecl>(Cur))
-      Diag(Loc, diag::err_invalid_declarator_in_block) 
+      Diag(Loc, diag::err_invalid_declarator_in_block)
         << Name << SS.getRange();
     else
       Diag(Loc, diag::err_invalid_declarator_scope)
       << Name << cast<NamedDecl>(Cur) << cast<NamedDecl>(DC) << SS.getRange();
-    
+
     return true;
   }
 
@@ -4759,7 +4892,7 @@
     Diag(Loc, diag::err_member_qualification)
       << Name << SS.getRange();
     SS.clear();
-    
+
     // C++ constructors and destructors with incorrect scopes can break
     // our AST invariants by having the wrong underlying types. If
     // that's the case, then drop this declaration entirely.
@@ -4768,10 +4901,10 @@
         !Context.hasSameType(Name.getCXXNameType(),
                              Context.getTypeDeclType(cast<CXXRecordDecl>(Cur))))
       return true;
-    
+
     return false;
   }
-  
+
   // C++11 [dcl.meaning]p1:
   //   [...] "The nested-name-specifier of the qualified declarator-id shall
   //   not begin with a decltype-specifer"
@@ -4794,7 +4927,9 @@
 
   // All of these full declarators require an identifier.  If it doesn't have
   // one, the ParsedFreeStandingDeclSpec action should be used.
-  if (!Name) {
+  if (D.isDecompositionDeclarator()) {
+    return ActOnDecompositionDeclarator(S, D, TemplateParamLists);
+  } else if (!Name) {
     if (!D.isInvalidType())  // Reject this if we think it is valid.
       Diag(D.getDeclSpec().getLocStart(),
            diag::err_declarator_need_ident)
@@ -4813,7 +4948,7 @@
   if (D.getCXXScopeSpec().isInvalid())
     D.setInvalidType();
   else if (D.getCXXScopeSpec().isSet()) {
-    if (DiagnoseUnexpandedParameterPack(D.getCXXScopeSpec(), 
+    if (DiagnoseUnexpandedParameterPack(D.getCXXScopeSpec(),
                                         UPPC_DeclarationQualifier))
       return nullptr;
 
@@ -4832,7 +4967,7 @@
     }
     bool IsDependentContext = DC->isDependentContext();
 
-    if (!IsDependentContext && 
+    if (!IsDependentContext &&
         RequireCompleteDeclContext(D.getCXXScopeSpec(), DC))
       return nullptr;
 
@@ -4912,11 +5047,11 @@
     LookupQualifiedName(Previous, DC);
 
     // C++ [dcl.meaning]p1:
-    //   When the declarator-id is qualified, the declaration shall refer to a 
-    //  previously declared member of the class or namespace to which the 
+    //   When the declarator-id is qualified, the declaration shall refer to a
+    //  previously declared member of the class or namespace to which the
     //  qualifier refers (or, in the case of a namespace, of an element of the
     //  inline namespace set of that namespace (7.3.1)) or to a specialization
-    //  thereof; [...] 
+    //  thereof; [...]
     //
     // Note that we already checked the context above, and that we do not have
     // enough information to make sure that Previous contains the declaration
@@ -4932,10 +5067,10 @@
     // In this case, Previous will point to the overload set
     // containing the two f's declared in X, but neither of them
     // matches.
-    
+
     // C++ [dcl.meaning]p1:
-    //   [...] the member shall not merely have been introduced by a 
-    //   using-declaration in the scope of the class or namespace nominated by 
+    //   [...] the member shall not merely have been introduced by a
+    //   using-declaration in the scope of the class or namespace nominated by
     //   the nested-name-specifier of the declarator-id.
     RemoveUsingDecls(Previous);
   }
@@ -5003,10 +5138,9 @@
   if (!New)
     return nullptr;
 
-  // If this has an identifier and is not an invalid redeclaration or 
-  // function template specialization, add it to the scope stack.
-  if (New->getDeclName() && AddToScope &&
-       !(D.isRedeclaration() && New->isInvalidDecl())) {
+  // If this has an identifier and is not a function template specialization,
+  // add it to the scope stack.
+  if (New->getDeclName() && AddToScope) {
     // Only make a locally-scoped extern declaration visible if it is the first
     // declaration of this entity. Qualified lookup for such an entity should
     // only find this declaration if there is no visible declaration of it.
@@ -5016,6 +5150,9 @@
       CurContext->addHiddenDecl(New);
   }
 
+  if (isInOpenMPDeclareTargetContext())
+    checkDeclIsAllowedInOpenMPTarget(nullptr, New);
+
   return New;
 }
 
@@ -5032,10 +5169,10 @@
   // constant expression folding, like struct {char x[(int)(char*)2];}
   SizeIsNegative = false;
   Oversized = 0;
-  
+
   if (T->isDependentType())
     return QualType();
-  
+
   QualifierCollector Qs;
   const Type *Ty = Qs.strip(T);
 
@@ -5084,7 +5221,7 @@
     Oversized = Res;
     return QualType();
   }
-  
+
   return Context.getConstantArrayType(VLATy->getElementType(),
                                       Res, ArrayType::Normal, 0);
 }
@@ -5162,11 +5299,7 @@
 /// does not identify a function.
 void Sema::DiagnoseFunctionSpecifiers(const DeclSpec &DS) {
   // FIXME: We should probably indicate the identifier in question to avoid
-  // confusion for constructs like "inline int a(), b;"
-  if (DS.isInlineSpecified())
-    Diag(DS.getInlineSpecLoc(),
-         diag::err_inline_non_function);
-
+  // confusion for constructs like "virtual int a(), b;"
   if (DS.isVirtualSpecified())
     Diag(DS.getVirtualSpecLoc(),
          diag::err_virtual_non_function);
@@ -5195,6 +5328,9 @@
 
   DiagnoseFunctionSpecifiers(D.getDeclSpec());
 
+  if (D.getDeclSpec().isInlineSpecified())
+    Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function)
+        << getLangOpts().CPlusPlus1z;
   if (D.getDeclSpec().isConstexprSpecified())
     Diag(D.getDeclSpec().getConstexprSpecLoc(), diag::err_invalid_constexpr)
       << 1;
@@ -5249,7 +5385,7 @@
         else if (T->isVariableArrayType())
           Diag(NewTD->getLocation(), diag::err_vla_decl_in_file_scope);
         else if (Oversized.getBoolValue())
-          Diag(NewTD->getLocation(), diag::err_array_too_large) 
+          Diag(NewTD->getLocation(), diag::err_array_too_large)
             << Oversized.toString(10);
         else
           Diag(NewTD->getLocation(), diag::err_vm_decl_in_file_scope);
@@ -5259,7 +5395,6 @@
   }
 }
 
-
 /// ActOnTypedefNameDecl - Perform semantic checking for a declaration which
 /// declares a typedef-name, either using the 'typedef' type specifier or via
 /// a C++0x [dcl.typedef]p2 alias-declaration: 'using T = A;'.
@@ -5331,12 +5466,12 @@
     if (!OuterContext->isFunctionOrMethod())
       // This rule only applies to block-scope declarations.
       return false;
-    
+
     DeclContext *PrevOuterContext = PrevDecl->getDeclContext();
     if (PrevOuterContext->isRecord())
       // We found a member function: ignore it.
       return false;
-    
+
     // Find the innermost enclosing namespace for the new and
     // previous declarations.
     OuterContext = OuterContext->getEnclosingNamespaceContext();
@@ -5387,7 +5522,7 @@
     type = Context.getLifetimeQualifiedType(type, lifetime);
     decl->setType(type);
   }
-  
+
   if (VarDecl *var = dyn_cast<VarDecl>(decl)) {
     // Thread-local variables cannot have lifetime.
     if (lifetime && lifetime != Qualifiers::OCL_ExplicitNone &&
@@ -5397,7 +5532,7 @@
       return true;
     }
   }
-  
+
   return false;
 }
 
@@ -5426,7 +5561,7 @@
       if (const auto *Attr = VD->getAttr<AliasAttr>()) {
         assert(VD->isThisDeclarationADefinition() &&
                !VD->isExternallyVisible() && "Broken AliasAttr handled late!");
-        S.Diag(Attr->getLocation(), diag::err_alias_is_definition) << VD;
+        S.Diag(Attr->getLocation(), diag::err_alias_is_definition) << VD << 0;
         VD->dropAttr<AliasAttr>();
       }
     }
@@ -5466,9 +5601,13 @@
 
 static void checkDLLAttributeRedeclaration(Sema &S, NamedDecl *OldDecl,
                                            NamedDecl *NewDecl,
-                                           bool IsSpecialization) {
-  if (TemplateDecl *OldTD = dyn_cast<TemplateDecl>(OldDecl))
+                                           bool IsSpecialization,
+                                           bool IsDefinition) {
+  if (TemplateDecl *OldTD = dyn_cast<TemplateDecl>(OldDecl)) {
     OldDecl = OldTD->getTemplatedDecl();
+    if (!IsSpecialization)
+      IsDefinition = false;
+  }
   if (TemplateDecl *NewTD = dyn_cast<TemplateDecl>(NewDecl))
     NewDecl = NewTD->getTemplatedDecl();
 
@@ -5524,14 +5663,17 @@
 
   // A redeclaration is not allowed to drop a dllimport attribute, the only
   // exceptions being inline function definitions, local extern declarations,
-  // and qualified friend declarations.
-  // NB: MSVC converts such a declaration to dllexport.
+  // qualified friend declarations or special MSVC extension: in the last case,
+  // the declaration is treated as if it were marked dllexport.
   bool IsInline = false, IsStaticDataMember = false, IsQualifiedFriend = false;
-  if (const auto *VD = dyn_cast<VarDecl>(NewDecl))
+  bool IsMicrosoft = S.Context.getTargetInfo().getCXXABI().isMicrosoft();
+  if (const auto *VD = dyn_cast<VarDecl>(NewDecl)) {
     // Ignore static data because out-of-line definitions are diagnosed
     // separately.
     IsStaticDataMember = VD->isStaticDataMember();
-  else if (const auto *FD = dyn_cast<FunctionDecl>(NewDecl)) {
+    IsDefinition = VD->isThisDeclarationADefinition(S.Context) !=
+                   VarDecl::DeclarationOnly;
+  } else if (const auto *FD = dyn_cast<FunctionDecl>(NewDecl)) {
     IsInline = FD->isInlined();
     IsQualifiedFriend = FD->getQualifier() &&
                         FD->getFriendObjectKind() == Decl::FOK_Declared;
@@ -5539,15 +5681,25 @@
 
   if (OldImportAttr && !HasNewAttr && !IsInline && !IsStaticDataMember &&
       !NewDecl->isLocalExternDecl() && !IsQualifiedFriend) {
-    S.Diag(NewDecl->getLocation(),
-           diag::warn_redeclaration_without_attribute_prev_attribute_ignored)
-      << NewDecl << OldImportAttr;
-    S.Diag(OldDecl->getLocation(), diag::note_previous_declaration);
-    S.Diag(OldImportAttr->getLocation(), diag::note_previous_attribute);
-    OldDecl->dropAttr<DLLImportAttr>();
-    NewDecl->dropAttr<DLLImportAttr>();
-  } else if (IsInline && OldImportAttr &&
-             !S.Context.getTargetInfo().getCXXABI().isMicrosoft()) {
+    if (IsMicrosoft && IsDefinition) {
+      S.Diag(NewDecl->getLocation(),
+             diag::warn_redeclaration_without_import_attribute)
+          << NewDecl;
+      S.Diag(OldDecl->getLocation(), diag::note_previous_declaration);
+      NewDecl->dropAttr<DLLImportAttr>();
+      NewDecl->addAttr(::new (S.Context) DLLExportAttr(
+          NewImportAttr->getRange(), S.Context,
+          NewImportAttr->getSpellingListIndex()));
+    } else {
+      S.Diag(NewDecl->getLocation(),
+             diag::warn_redeclaration_without_attribute_prev_attribute_ignored)
+          << NewDecl << OldImportAttr;
+      S.Diag(OldDecl->getLocation(), diag::note_previous_declaration);
+      S.Diag(OldImportAttr->getLocation(), diag::note_previous_attribute);
+      OldDecl->dropAttr<DLLImportAttr>();
+      NewDecl->dropAttr<DLLImportAttr>();
+    }
+  } else if (IsInline && OldImportAttr && !IsMicrosoft) {
     // In MinGW, seeing a function declared inline drops the dllimport attribute.
     OldDecl->dropAttr<DLLImportAttr>();
     NewDecl->dropAttr<DLLImportAttr>();
@@ -5613,10 +5765,9 @@
     if (!D->isInExternCContext() || D->template hasAttr<OverloadableAttr>())
       return false;
 
-    // So do CUDA's host/device attributes if overloading is enabled.
-    if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads &&
-        (D->template hasAttr<CUDADeviceAttr>() ||
-         D->template hasAttr<CUDAHostAttr>()))
+    // So do CUDA's host/device attributes.
+    if (S.getLangOpts().CUDA && (D->template hasAttr<CUDADeviceAttr>() ||
+                                 D->template hasAttr<CUDAHostAttr>()))
       return false;
   }
   return D->isExternC();
@@ -5624,7 +5775,7 @@
 
 static bool shouldConsiderLinkage(const VarDecl *VD) {
   const DeclContext *DC = VD->getDeclContext()->getRedeclContext();
-  if (DC->isFunctionOrMethod())
+  if (DC->isFunctionOrMethod() || isa<OMPDeclareReductionDecl>(DC))
     return VD->hasExternalStorage();
   if (DC->isFileContext())
     return true;
@@ -5635,7 +5786,8 @@
 
 static bool shouldConsiderLinkage(const FunctionDecl *FD) {
   const DeclContext *DC = FD->getDeclContext()->getRedeclContext();
-  if (DC->isFileContext() || DC->isFunctionOrMethod())
+  if (DC->isFileContext() || DC->isFunctionOrMethod() ||
+      isa<OMPDeclareReductionDecl>(DC))
     return true;
   if (DC->isRecord())
     return false;
@@ -5701,14 +5853,41 @@
   llvm_unreachable("Unknown type of decl!");
 }
 
-NamedDecl *
-Sema::ActOnVariableDeclarator(Scope *S, Declarator &D, DeclContext *DC,
-                              TypeSourceInfo *TInfo, LookupResult &Previous,
-                              MultiTemplateParamsArg TemplateParamLists,
-                              bool &AddToScope) {
+NamedDecl *Sema::ActOnVariableDeclarator(
+    Scope *S, Declarator &D, DeclContext *DC, TypeSourceInfo *TInfo,
+    LookupResult &Previous, MultiTemplateParamsArg TemplateParamLists,
+    bool &AddToScope, ArrayRef<BindingDecl *> Bindings) {
   QualType R = TInfo->getType();
   DeclarationName Name = GetNameForDeclarator(D).getName();
 
+  IdentifierInfo *II = Name.getAsIdentifierInfo();
+
+  if (D.isDecompositionDeclarator()) {
+    AddToScope = false;
+    // Take the name of the first binding as our name for diagnostic
+    // purposes.
+    auto &Decomp = D.getDecompositionDeclarator();
+    if (!Decomp.bindings().empty()) {
+      II = Decomp.bindings()[0].Name;
+      Name = II;
+    }
+  } else if (!II) {
+    Diag(D.getIdentifierLoc(), diag::err_bad_variable_name)
+      << Name;
+    return nullptr;
+  }
+
+  // OpenCL v2.0 s6.9.b - Image type can only be used as a function argument.
+  // OpenCL v2.0 s6.13.16.1 - Pipe type can only be used as a function
+  // argument.
+  if (getLangOpts().OpenCL && (R->isImageType() || R->isPipeType())) {
+    Diag(D.getIdentifierLoc(),
+         diag::err_opencl_type_can_only_be_used_as_function_parameter)
+        << R;
+    D.setInvalidType();
+    return nullptr;
+  }
+
   DeclSpec::SCS SCSpec = D.getDeclSpec().getStorageClassSpec();
   StorageClass SC = StorageClassSpecToVarDeclStorageClass(D.getDeclSpec());
 
@@ -5765,13 +5944,6 @@
       << FixItHint::CreateRemoval(D.getDeclSpec().getStorageClassSpecLoc());
   }
 
-  IdentifierInfo *II = Name.getAsIdentifierInfo();
-  if (!II) {
-    Diag(D.getIdentifierLoc(), diag::err_bad_variable_name)
-      << Name;
-    return nullptr;
-  }
-
   DiagnoseFunctionSpecifiers(D.getDeclSpec());
 
   if (!DC->isRecord() && S->getFnParent() == nullptr) {
@@ -5855,7 +6027,7 @@
       case SC_PrivateExtern:
         llvm_unreachable("C storage class in c++!");
       }
-    }    
+    }
 
     if (SC == SC_Static && CurContext->isRecord()) {
       if (const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(DC)) {
@@ -5940,6 +6112,10 @@
         return nullptr;
       NewVD = cast<VarDecl>(Res.get());
       AddToScope = false;
+    } else if (D.isDecompositionDeclarator()) {
+      NewVD = DecompositionDecl::Create(Context, DC, D.getLocStart(),
+                                        D.getIdentifierLoc(), R, TInfo, SC,
+                                        Bindings);
     } else
       NewVD = VarDecl::Create(Context, DC, D.getLocStart(),
                               D.getIdentifierLoc(), II, R, TInfo, SC);
@@ -5972,11 +6148,18 @@
       NewVD->setTemplateParameterListsInfo(
           Context, TemplateParamLists.drop_back(VDTemplateParamLists));
 
-    if (D.getDeclSpec().isConstexprSpecified())
+    if (D.getDeclSpec().isConstexprSpecified()) {
       NewVD->setConstexpr(true);
+      // C++1z [dcl.spec.constexpr]p1:
+      //   A static data member declared with the constexpr specifier is
+      //   implicitly an inline variable.
+      if (NewVD->isStaticDataMember() && getLangOpts().CPlusPlus1z)
+        NewVD->setImplicitlyInline();
+    }
 
     if (D.getDeclSpec().isConceptSpecified()) {
-      NewVD->setConcept(true);
+      if (VarTemplateDecl *VTD = NewVD->getDescribedVarTemplate())
+        VTD->setConcept();
 
       // C++ Concepts TS [dcl.spec.concept]p2: A concept definition shall not
       // be declared with the thread_local, inline, friend, or constexpr
@@ -5994,6 +6177,41 @@
             << 0 << 3;
         NewVD->setInvalidDecl(true);
       }
+
+      // C++ Concepts TS [dcl.spec.concept]p1: The concept specifier shall be
+      // applied only to the definition of a function template or variable
+      // template, declared in namespace scope.
+      if (IsVariableTemplateSpecialization) {
+        Diag(D.getDeclSpec().getConceptSpecLoc(),
+             diag::err_concept_specified_specialization)
+            << (IsPartialSpecialization ? 2 : 1);
+      }
+
+      // C++ Concepts TS [dcl.spec.concept]p6: A variable concept has the
+      // following restrictions:
+      // - The declared type shall have the type bool.
+      if (!Context.hasSameType(NewVD->getType(), Context.BoolTy) &&
+          !NewVD->isInvalidDecl()) {
+        Diag(D.getIdentifierLoc(), diag::err_variable_concept_bool_decl);
+        NewVD->setInvalidDecl(true);
+      }
+    }
+  }
+
+  if (D.getDeclSpec().isInlineSpecified()) {
+    if (!getLangOpts().CPlusPlus) {
+      Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function)
+          << 0;
+    } else if (CurContext->isFunctionOrMethod()) {
+      // 'inline' is not allowed on a block-scope variable declaration.
+      Diag(D.getDeclSpec().getInlineSpecLoc(),
+           diag::err_inline_declaration_block_scope) << Name
+        << FixItHint::CreateRemoval(D.getDeclSpec().getInlineSpecLoc());
+    } else {
+      Diag(D.getDeclSpec().getInlineSpecLoc(),
+           getLangOpts().CPlusPlus1z ? diag::warn_cxx14_compat_inline_variable
+                                     : diag::ext_inline_variable);
+      NewVD->setInlineSpecified();
     }
   }
 
@@ -6003,8 +6221,13 @@
   if (NewTemplate)
     NewTemplate->setLexicalDeclContext(CurContext);
 
-  if (IsLocalExternDecl)
-    NewVD->setLocalExternDecl();
+  if (IsLocalExternDecl) {
+    if (D.isDecompositionDeclarator())
+      for (auto *B : Bindings)
+        B->setLocalExternDecl();
+    else
+      NewVD->setLocalExternDecl();
+  }
 
   bool EmitTLSUnsupportedError = false;
   if (DeclSpec::TSCS TSCS = D.getDeclSpec().getThreadStorageClassSpec()) {
@@ -6076,6 +6299,8 @@
       NewVD->setModulePrivate();
       if (NewTemplate)
         NewTemplate->setModulePrivate();
+      for (auto *B : Bindings)
+        B->setModulePrivate();
     }
   }
 
@@ -6217,6 +6442,25 @@
     if (!IsVariableTemplateSpecialization)
       D.setRedeclaration(CheckVariableDeclaration(NewVD, Previous));
 
+    // C++ Concepts TS [dcl.spec.concept]p7: A program shall not declare [...]
+    // an explicit specialization (14.8.3) or a partial specialization of a
+    // concept definition.
+    if (IsVariableTemplateSpecialization &&
+        !D.getDeclSpec().isConceptSpecified() && !Previous.empty() &&
+        Previous.isSingleResult()) {
+      NamedDecl *PreviousDecl = Previous.getFoundDecl();
+      if (VarTemplateDecl *VarTmpl = dyn_cast<VarTemplateDecl>(PreviousDecl)) {
+        if (VarTmpl->isConcept()) {
+          Diag(NewVD->getLocation(), diag::err_concept_specialized)
+              << 1                            /*variable*/
+              << (IsPartialSpecialization ? 2 /*partially specialized*/
+                                          : 1 /*explicitly specialized*/);
+          Diag(VarTmpl->getLocation(), diag::note_previous_declaration);
+          NewVD->setInvalidDecl();
+        }
+      }
+    }
+
     if (NewTemplate) {
       VarTemplateDecl *PrevVarTemplate =
           NewVD->getPreviousDecl()
@@ -6264,7 +6508,7 @@
   }
 
   // Special handling of variable named 'main'.
-  if (Name.isIdentifier() && Name.getAsIdentifierInfo()->isStr("main") &&
+  if (Name.getAsIdentifierInfo() && Name.getAsIdentifierInfo()->isStr("main") &&
       NewVD->getDeclContext()->getRedeclContext()->isTranslationUnit() &&
       !getLangOpts().Freestanding && !NewVD->getDescribedVarTemplate()) {
 
@@ -6282,7 +6526,7 @@
   if (D.isRedeclaration() && !Previous.empty()) {
     checkDLLAttributeRedeclaration(
         *this, dyn_cast<NamedDecl>(Previous.getRepresentativeDecl()), NewVD,
-        IsExplicitSpecialization);
+        IsExplicitSpecialization, D.isFunctionDefinition());
   }
 
   if (NewTemplate) {
@@ -6295,6 +6539,17 @@
   return NewVD;
 }
 
+/// Enumerator values must match the %select in diag::warn_decl_shadow.
+enum ShadowedDeclKind { SDK_Local, SDK_Global, SDK_StaticMember, SDK_Field };
+
+/// Classify \p ShadowedDecl, given context \p OldDC, as a ShadowedDeclKind.
+static ShadowedDeclKind computeShadowedDeclKind(const NamedDecl *ShadowedDecl,
+                                                const DeclContext *OldDC) {
+  if (isa<RecordDecl>(OldDC)) // context is a record: field or static member
+    return isa<FieldDecl>(ShadowedDecl) ? SDK_Field : SDK_StaticMember;
+  return OldDC->isFileContext() ? SDK_Global : SDK_Local;
+}
+
 /// \brief Diagnose variable or built-in function shadowing.  Implements
 /// -Wshadow.
 ///
@@ -6323,12 +6578,23 @@
   if (!isa<VarDecl>(ShadowedDecl) && !isa<FieldDecl>(ShadowedDecl))
     return;
 
-  // Fields are not shadowed by variables in C++ static methods.
-  if (isa<FieldDecl>(ShadowedDecl))
+  if (FieldDecl *FD = dyn_cast<FieldDecl>(ShadowedDecl)) {
+    // Fields are not shadowed by variables in C++ static methods.
     if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(NewDC))
       if (MD->isStatic())
         return;
 
+    // Fields shadowed by constructor parameters are a special case. Usually
+    // the constructor initializes the field with the parameter.
+    if (isa<CXXConstructorDecl>(NewDC) && isa<ParmVarDecl>(D)) {
+      // Remember that this was shadowed so we can either warn about its
+      // modification or its existence depending on warning settings.
+      D = D->getCanonicalDecl();
+      ShadowingDecls.insert({D, FD});
+      return;
+    }
+  }
+
   if (VarDecl *shadowedVar = dyn_cast<VarDecl>(ShadowedDecl))
     if (shadowedVar->isExternC()) {
       // For shadowing external vars, make sure that we point to the global
@@ -6350,29 +6616,19 @@
 
     // TODO: should we warn about static data members shadowing
     // static data members from base classes?
-    
+
     // TODO: don't diagnose for inaccessible shadowed members.
     // This is hard to do perfectly because we might friend the
     // shadowing context, but that's just a false negative.
   }
 
-  // Determine what kind of declaration we're shadowing.
-  unsigned Kind;
-  if (isa<RecordDecl>(OldDC)) {
-    if (isa<FieldDecl>(ShadowedDecl))
-      Kind = 3; // field
-    else
-      Kind = 2; // static data member
-  } else if (OldDC->isFileContext())
-    Kind = 1; // global
-  else
-    Kind = 0; // local
 
   DeclarationName Name = R.getLookupName();
 
   // Emit warning and note.
   if (getSourceManager().isInSystemMacro(R.getNameLoc()))
     return;
+  ShadowedDeclKind Kind = computeShadowedDeclKind(ShadowedDecl, OldDC);
   Diag(R.getNameLoc(), diag::warn_decl_shadow) << Name << Kind << OldDC;
   Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration);
 }
@@ -6388,6 +6644,30 @@
   CheckShadow(S, D, R);
 }
 
+/// Check if 'E', which is an expression that is about to be modified, refers
+/// to a constructor parameter that shadows a field; if so, warn at \p Loc.
+void Sema::CheckShadowingDeclModification(Expr *E, SourceLocation Loc) {
+  // Quickly ignore expressions that can't be shadowing ctor parameters.
+  if (!getLangOpts().CPlusPlus || ShadowingDecls.empty())
+    return;
+  E = E->IgnoreParenImpCasts(); // look through parens and implicit casts
+  auto *DRE = dyn_cast<DeclRefExpr>(E);
+  if (!DRE) // only a direct reference to a declaration can match
+    return;
+  const NamedDecl *D = cast<NamedDecl>(DRE->getDecl()->getCanonicalDecl());
+  auto I = ShadowingDecls.find(D); // looked up by the canonical decl
+  if (I == ShadowingDecls.end()) // D was never recorded as shadowing
+    return;
+  const NamedDecl *ShadowedDecl = I->second; // the declaration D shadows
+  const DeclContext *OldDC = ShadowedDecl->getDeclContext();
+  Diag(Loc, diag::warn_modifying_shadowing_decl) << D << OldDC;
+  Diag(D->getLocation(), diag::note_var_declared_here) << D;
+  Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration);
+
+  // Avoid issuing multiple warnings about the same decl.
+  ShadowingDecls.erase(I);
+}
+
 /// Check for conflict between this global or extern "C" declaration and
 /// previous global or extern "C" declarations. This is only used in C++.
 template<typename T>
@@ -6538,7 +6818,7 @@
     return;
   }
 
-  // OpenCL v1.2 s6.8 -- The static qualifier is valid only in program
+  // OpenCL v1.2 s6.8 - The static qualifier is valid only in program
   // scope.
   if (getLangOpts().OpenCLVersion == 120 &&
       !getOpenCLOptions().cl_clang_storage_class_specifiers &&
@@ -6548,40 +6828,64 @@
     return;
   }
 
-  // OpenCL v1.2 s6.5 - All program scope variables must be declared in the
-  // __constant address space.
-  // OpenCL v2.0 s6.5.1 - Variables defined at program scope and static
-  // variables inside a function can also be declared in the global
-  // address space.
   if (getLangOpts().OpenCL) {
-    if (NewVD->isFileVarDecl()) {
+    // OpenCL v2.0 s6.12.5 - The __block storage type is not supported.
+    if (NewVD->hasAttr<BlocksAttr>()) {
+      Diag(NewVD->getLocation(), diag::err_opencl_block_storage_type);
+      return;
+    }
+
+    if (T->isBlockPointerType()) {
+      // OpenCL v2.0 s6.12.5 - Any block declaration must be const qualified and
+      // can't use 'extern' storage class.
+      if (!T.isConstQualified()) {
+        Diag(NewVD->getLocation(), diag::err_opencl_invalid_block_declaration)
+            << 0 /*const*/;
+        NewVD->setInvalidDecl();
+        return;
+      }
+      if (NewVD->hasExternalStorage()) {
+        Diag(NewVD->getLocation(), diag::err_opencl_extern_block_declaration);
+        NewVD->setInvalidDecl();
+        return;
+      }
+      // OpenCL v2.0 s6.12.5 - Blocks with variadic arguments are not supported.
+      // TODO: this check is not enough as it doesn't diagnose the typedef
+      const BlockPointerType *BlkTy = T->getAs<BlockPointerType>();
+      const FunctionProtoType *FTy =
+          BlkTy->getPointeeType()->getAs<FunctionProtoType>();
+      if (FTy && FTy->isVariadic()) {
+        Diag(NewVD->getLocation(), diag::err_opencl_block_proto_variadic)
+            << T << NewVD->getSourceRange();
+        NewVD->setInvalidDecl();
+        return;
+      }
+    }
+    // OpenCL v1.2 s6.5 - All program scope variables must be declared in the
+    // __constant address space.
+    // OpenCL v2.0 s6.5.1 - Variables defined at program scope and static
+    // variables inside a function can also be declared in the global
+    // address space.
+    if (NewVD->isFileVarDecl() || NewVD->isStaticLocal() ||
+        NewVD->hasExternalStorage()) {
       if (!T->isSamplerT() &&
           !(T.getAddressSpace() == LangAS::opencl_constant ||
             (T.getAddressSpace() == LangAS::opencl_global &&
              getLangOpts().OpenCLVersion == 200))) {
+        int Scope = NewVD->isStaticLocal() | NewVD->hasExternalStorage() << 1;
         if (getLangOpts().OpenCLVersion == 200)
           Diag(NewVD->getLocation(), diag::err_opencl_global_invalid_addr_space)
-              << "global or constant";
+              << Scope << "global or constant";
         else
           Diag(NewVD->getLocation(), diag::err_opencl_global_invalid_addr_space)
-              << "constant";
+              << Scope << "constant";
         NewVD->setInvalidDecl();
         return;
       }
     } else {
-      // OpenCL v2.0 s6.5.1 - Variables defined at program scope and static
-      // variables inside a function can also be declared in the global
-      // address space.
-      if (NewVD->isStaticLocal() &&
-          !(T.getAddressSpace() == LangAS::opencl_constant ||
-            (T.getAddressSpace() == LangAS::opencl_global &&
-             getLangOpts().OpenCLVersion == 200))) {
-        if (getLangOpts().OpenCLVersion == 200)
-          Diag(NewVD->getLocation(), diag::err_opencl_global_invalid_addr_space)
-              << "global or constant";
-        else
-          Diag(NewVD->getLocation(), diag::err_opencl_global_invalid_addr_space)
-              << "constant";
+      if (T.getAddressSpace() == LangAS::opencl_global) {
+        Diag(NewVD->getLocation(), diag::err_opencl_function_variable)
+            << 1 /*is any function*/ << "global";
         NewVD->setInvalidDecl();
         return;
       }
@@ -6592,11 +6896,11 @@
         FunctionDecl *FD = getCurFunctionDecl();
         if (FD && !FD->hasAttr<OpenCLKernelAttr>()) {
           if (T.getAddressSpace() == LangAS::opencl_constant)
-            Diag(NewVD->getLocation(), diag::err_opencl_non_kernel_variable)
-                << "constant";
+            Diag(NewVD->getLocation(), diag::err_opencl_function_variable)
+                << 0 /*non-kernel only*/ << "constant";
           else
-            Diag(NewVD->getLocation(), diag::err_opencl_non_kernel_variable)
-                << "local";
+            Diag(NewVD->getLocation(), diag::err_opencl_function_variable)
+                << 0 /*non-kernel only*/ << "local";
           NewVD->setInvalidDecl();
           return;
         }
@@ -6613,7 +6917,7 @@
       Diag(NewVD->getLocation(), diag::warn_attribute_weak_on_local);
     }
   }
-  
+
   bool isVM = T->isVariablyModifiedType();
   if (isVM || NewVD->hasAttr<CleanupAttr>() ||
       NewVD->hasAttr<BlocksAttr>())
@@ -6829,7 +7133,7 @@
     MultiTemplateParamsArg TemplateParamLists;
     bool AddToScope;
   };
-}
+} // end anonymous namespace
 
 namespace {
 
@@ -6873,7 +7177,7 @@
   CXXRecordDecl *ExpectedParent;
 };
 
-}
+} // end anonymous namespace
 
 /// \brief Generate diagnostics for an invalid function redeclaration.
 ///
@@ -7080,9 +7384,9 @@
       (D.isFunctionDeclarator() && D.getFunctionTypeInfo().hasPrototype) ||
       (!isa<FunctionType>(R.getTypePtr()) && R->isFunctionProtoType());
 
-    NewFD = FunctionDecl::Create(SemaRef.Context, DC, 
-                                 D.getLocStart(), NameInfo, R, 
-                                 TInfo, SC, isInline, 
+    NewFD = FunctionDecl::Create(SemaRef.Context, DC,
+                                 D.getLocStart(), NameInfo, R,
+                                 TInfo, SC, isInline,
                                  HasPrototype, false);
     if (D.isInvalidType())
       NewFD->setInvalidDecl();
@@ -7491,8 +7795,8 @@
           Diag(NewFD->getLocation(), diag::err_destructor_template);
           NewFD->setInvalidDecl();
         }
-        
-        // If we're adding a template to a dependent context, we may need to 
+
+        // If we're adding a template to a dependent context, we may need to
         // rebuilding some of the types used within the template parameter list,
         // now that we know what the current instantiation is.
         if (DC->isDependentContext()) {
@@ -7500,7 +7804,6 @@
           if (RebuildTemplateParamsInCurrentInstantiation(TemplateParams))
             Invalid = true;
         }
-        
 
         FunctionTemplate = FunctionTemplateDecl::Create(Context, DC,
                                                         NewFD->getLocation(),
@@ -7569,7 +7872,7 @@
              diag::err_virtual_non_function);
       } else if (!CurContext->isRecord()) {
         // 'virtual' was specified outside of the class.
-        Diag(D.getDeclSpec().getVirtualSpecLoc(), 
+        Diag(D.getDeclSpec().getVirtualSpecLoc(),
              diag::err_virtual_out_of_class)
           << FixItHint::CreateRemoval(D.getDeclSpec().getVirtualSpecLoc());
       } else if (NewFD->getDescribedFunctionTemplate()) {
@@ -7607,12 +7910,12 @@
     }
 
     // C++ [dcl.fct.spec]p3:
-    //  The inline specifier shall not appear on a block scope function 
+    //  The inline specifier shall not appear on a block scope function
     //  declaration.
     if (isInline && !NewFD->isInvalidDecl()) {
       if (CurContext->isFunctionOrMethod()) {
         // 'inline' is not allowed on block scope function declaration.
-        Diag(D.getDeclSpec().getInlineSpecLoc(), 
+        Diag(D.getDeclSpec().getInlineSpecLoc(),
              diag::err_inline_declaration_block_scope) << Name
           << FixItHint::CreateRemoval(D.getDeclSpec().getInlineSpecLoc());
       }
@@ -7620,22 +7923,22 @@
 
     // C++ [dcl.fct.spec]p6:
     //  The explicit specifier shall be used only in the declaration of a
-    //  constructor or conversion function within its class definition; 
+    //  constructor or conversion function within its class definition;
     //  see 12.3.1 and 12.3.2.
     if (isExplicit && !NewFD->isInvalidDecl()) {
       if (!CurContext->isRecord()) {
         // 'explicit' was specified outside of the class.
-        Diag(D.getDeclSpec().getExplicitSpecLoc(), 
+        Diag(D.getDeclSpec().getExplicitSpecLoc(),
              diag::err_explicit_out_of_class)
           << FixItHint::CreateRemoval(D.getDeclSpec().getExplicitSpecLoc());
-      } else if (!isa<CXXConstructorDecl>(NewFD) && 
+      } else if (!isa<CXXConstructorDecl>(NewFD) &&
                  !isa<CXXConversionDecl>(NewFD)) {
         // 'explicit' was specified on a function that wasn't a constructor
         // or conversion function.
         Diag(D.getDeclSpec().getExplicitSpecLoc(),
              diag::err_explicit_non_ctor_or_conv_function)
           << FixItHint::CreateRemoval(D.getDeclSpec().getExplicitSpecLoc());
-      }      
+      }
     }
 
     if (isConstexpr) {
@@ -7651,6 +7954,10 @@
     }
 
     if (isConcept) {
+      // This is a function concept.
+      if (FunctionTemplateDecl *FTD = NewFD->getDescribedFunctionTemplate())
+        FTD->setConcept();
+
       // C++ Concepts TS [dcl.spec.concept]p1: The concept specifier shall be
       // applied only to the definition of a function template [...]
       if (!D.isFunctionDefinition()) {
@@ -7676,6 +7983,14 @@
 
         // C++ Concepts TS [dcl.spec.concept]p5: A function concept has the
         // following restrictions:
+        // - The declared return type shall have the type bool.
+        if (!Context.hasSameType(FPT->getReturnType(), Context.BoolTy)) {
+          Diag(D.getIdentifierLoc(), diag::err_function_concept_bool_ret);
+          NewFD->setInvalidDecl();
+        }
+
+        // C++ Concepts TS [dcl.spec.concept]p5: A function concept has the
+        // following restrictions:
         // - The declaration's parameter list shall be equivalent to an empty
         //   parameter list.
         if (FPT->getNumParams() > 0 || FPT->isVariadic())
@@ -7709,6 +8024,16 @@
             << 1 << 3;
         NewFD->setInvalidDecl(true);
       }
+
+      // C++ Concepts TS [dcl.spec.concept]p1: The concept specifier shall be
+      // applied only to the definition of a function template or variable
+      // template, declared in namespace scope.
+      if (isFunctionTemplateSpecialization) {
+        Diag(D.getDeclSpec().getConceptSpecLoc(),
+             diag::err_concept_specified_specialization) << 1;
+        NewFD->setInvalidDecl(true);
+        return NewFD;
+      }
     }
 
     // If __module_private__ was specified, mark the function accordingly.
@@ -7742,11 +8067,11 @@
       case FDK_Declaration:
       case FDK_Definition:
         break;
-        
+
       case FDK_Defaulted:
         NewFD->setDefaulted();
         break;
-        
+
       case FDK_Deleted:
         NewFD->setDeletedAsWritten();
         break;
@@ -7755,7 +8080,7 @@
     if (isa<CXXMethodDecl>(NewFD) && DC == CurContext &&
         D.isFunctionDefinition()) {
       // C++ [class.mfct]p2:
-      //   A member function may be defined (8.4) in its class definition, in 
+      //   A member function may be defined (8.4) in its class definition, in
       //   which case it is an inline member function (7.1.2)
       NewFD->setImplicitlyInline();
     }
@@ -7833,7 +8158,6 @@
           NewFD->setInvalidDecl();
       }
     }
-
   } else if (const FunctionProtoType *FT = R->getAs<FunctionProtoType>()) {
     // When we're declaring a function with a typedef, typeof, etc as in the
     // following example, we'll need to synthesize (unnamed)
@@ -7898,6 +8222,9 @@
   // Handle attributes.
   ProcessDeclAttributes(S, NewFD, D);
 
+  if (getLangOpts().CUDA)
+    maybeAddCUDAHostDeviceAttrs(S, NewFD, Previous);
+
   if (getLangOpts().OpenCL) {
     // OpenCL v1.1 s6.5: Using an address space qualifier in a function return
     // type declaration will generate a compilation error.
@@ -7960,7 +8287,7 @@
            diag::ext_operator_new_delete_declared_inline)
         << NewFD->getDeclName();
 
-    // If the declarator is a template-id, translate the parser's template 
+    // If the declarator is a template-id, translate the parser's template
     // argument list into our AST format.
     if (D.getName().getKind() == UnqualifiedId::IK_TemplateId) {
       TemplateIdAnnotation *TemplateId = D.getName().TemplateId;
@@ -7970,9 +8297,9 @@
                                          TemplateId->NumArgs);
       translateTemplateArguments(TemplateArgsPtr,
                                  TemplateArgs);
-    
+
       HasExplicitTemplateArgs = true;
-    
+
       if (NewFD->isInvalidDecl()) {
         HasExplicitTemplateArgs = false;
       } else if (FunctionTemplate) {
@@ -8008,7 +8335,7 @@
     if (isFunctionTemplateSpecialization && isFriend &&
         (NewFD->getType()->isDependentType() || DC->isDependentContext() ||
          TemplateSpecializationType::anyDependentTemplateArguments(
-            TemplateArgs.getArgumentArray(), TemplateArgs.size(),
+            TemplateArgs,
             InstantiationDependent))) {
       assert(HasExplicitTemplateArgs &&
              "friend function specialization without template args");
@@ -8016,10 +8343,10 @@
                                                        Previous))
         NewFD->setInvalidDecl();
     } else if (isFunctionTemplateSpecialization) {
-      if (CurContext->isDependentContext() && CurContext->isRecord() 
+      if (CurContext->isDependentContext() && CurContext->isRecord()
           && !isFriend) {
         isDependentClassScopeExplicitSpecialization = true;
-        Diag(NewFD->getLocation(), getLangOpts().MicrosoftExt ? 
+        Diag(NewFD->getLocation(), getLangOpts().MicrosoftExt ?
           diag::ext_function_specialization_in_class :
           diag::err_function_specialization_in_class)
           << NewFD->getDeclName();
@@ -8028,7 +8355,7 @@
                                                            : nullptr),
                                                      Previous))
         NewFD->setInvalidDecl();
-      
+
       // C++ [dcl.stc]p1:
       //   A storage-class-specifier shall not be specified in an explicit
       //   specialization (14.7.3)
@@ -8041,14 +8368,13 @@
             << SC
             << FixItHint::CreateRemoval(
                                       D.getDeclSpec().getStorageClassSpecLoc());
-            
+
         else
-          Diag(NewFD->getLocation(), 
+          Diag(NewFD->getLocation(),
                diag::ext_explicit_specialization_storage_class)
             << FixItHint::CreateRemoval(
                                       D.getDeclSpec().getStorageClassSpecLoc());
       }
-      
     } else if (isExplicitSpecialization && isa<CXXMethodDecl>(NewFD)) {
       if (CheckMemberSpecialization(NewFD, Previous))
           NewFD->setInvalidDecl();
@@ -8094,7 +8420,7 @@
     // If we have a function template, check the template parameter
     // list. This will check and merge default template arguments.
     if (FunctionTemplate) {
-      FunctionTemplateDecl *PrevTemplate = 
+      FunctionTemplateDecl *PrevTemplate =
                                      FunctionTemplate->getPreviousDecl();
       CheckTemplateParameterList(FunctionTemplate->getTemplateParameters(),
                        PrevTemplate ? PrevTemplate->getTemplateParameters()
@@ -8103,8 +8429,8 @@
                               ? (D.isFunctionDefinition()
                                    ? TPC_FriendFunctionTemplateDefinition
                                    : TPC_FriendFunctionTemplate)
-                              : (D.getCXXScopeSpec().isSet() && 
-                                 DC && DC->isRecord() && 
+                              : (D.getCXXScopeSpec().isSet() &&
+                                 DC && DC->isRecord() &&
                                  DC->isDependentContext())
                                   ? TPC_ClassTemplateMember
                                   : TPC_FunctionTemplate);
@@ -8167,7 +8493,6 @@
           return Result;
         }
       }
-
     } else if (!D.isFunctionDefinition() &&
                isa<CXXMethodDecl>(NewFD) && NewFD->isOutOfLine() &&
                !isFriend && !isFunctionTemplateSpecialization &&
@@ -8176,8 +8501,8 @@
       // definition (C++ [class.mfct]p2).
       // Note that this is not the case for explicit specializations of
       // function templates or member functions of class templates, per
-      // C++ [temp.expl.spec]p2. We also allow these declarations as an 
-      // extension for compatibility with old SWIG code which likes to 
+      // C++ [temp.expl.spec]p2. We also allow these declarations as an
+      // extension for compatibility with old SWIG code which likes to
       // generate them.
       Diag(NewFD->getLocation(), diag::ext_out_of_line_declaration)
         << D.getCXXScopeSpec().getRange();
@@ -8189,7 +8514,7 @@
 
   AddKnownFunctionAttributes(NewFD);
 
-  if (NewFD->hasAttr<OverloadableAttr>() && 
+  if (NewFD->hasAttr<OverloadableAttr>() &&
       !NewFD->getType()->getAs<FunctionProtoType>()) {
     Diag(NewFD->getLocation(),
          diag::err_attribute_overloadable_no_prototype)
@@ -8232,7 +8557,30 @@
   if (D.isRedeclaration() && !Previous.empty()) {
     checkDLLAttributeRedeclaration(
         *this, dyn_cast<NamedDecl>(Previous.getRepresentativeDecl()), NewFD,
-        isExplicitSpecialization || isFunctionTemplateSpecialization);
+        isExplicitSpecialization || isFunctionTemplateSpecialization,
+        D.isFunctionDefinition());
+  }
+
+  if (getLangOpts().CUDA) {
+    IdentifierInfo *II = NewFD->getIdentifier();
+    if (II && II->isStr("cudaConfigureCall") && !NewFD->isInvalidDecl() &&
+        NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+      if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
+        Diag(NewFD->getLocation(), diag::err_config_scalar_return);
+
+      Context.setcudaConfigureCallDecl(NewFD);
+    }
+
+    // Variadic functions, other than a *declaration* of printf, are not allowed
+    // in device-side CUDA code, unless someone passed
+    // -fcuda-allow-variadic-functions.
+    if (!getLangOpts().CUDAAllowVariadicFunctions && NewFD->isVariadic() &&
+        (NewFD->hasAttr<CUDADeviceAttr>() ||
+         NewFD->hasAttr<CUDAGlobalAttr>()) &&
+        !(II && II->isStr("printf") && NewFD->isExternC() &&
+          !D.isFunctionDefinition())) {
+      Diag(NewFD->getLocation(), diag::err_variadic_device_fn);
+    }
   }
 
   if (getLangOpts().CPlusPlus) {
@@ -8250,7 +8598,7 @@
       Diag(D.getIdentifierLoc(), diag::err_static_kernel);
       D.setInvalidType();
     }
-    
+
     // OpenCL v1.2, s6.9 -- Kernels can only have return type void.
     if (!NewFD->getReturnType()->isVoidType()) {
       SourceRange RTRange = NewFD->getReturnTypeSourceRange();
@@ -8261,12 +8609,10 @@
     }
 
     llvm::SmallPtrSet<const Type *, 16> ValidTypes;
-    for (auto Param : NewFD->params())
+    for (auto Param : NewFD->parameters())
       checkIsValidOpenCLKernelParameter(*this, D, Param, ValidTypes);
   }
-  for (FunctionDecl::param_iterator PI = NewFD->param_begin(),
-       PE = NewFD->param_end(); PI != PE; ++PI) {
-    ParmVarDecl *Param = *PI;
+  for (const ParmVarDecl *Param : NewFD->parameters()) {
     QualType PT = Param->getType();
 
     // OpenCL 2.0 pipe restrictions forbids pipe packet types to be non-value
@@ -8284,25 +8630,13 @@
 
   MarkUnusedFileScopedDecl(NewFD);
 
-  if (getLangOpts().CUDA)
-    if (IdentifierInfo *II = NewFD->getIdentifier())
-      if (!NewFD->isInvalidDecl() &&
-          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
-        if (II->isStr("cudaConfigureCall")) {
-          if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
-            Diag(NewFD->getLocation(), diag::err_config_scalar_return);
-
-          Context.setcudaConfigureCallDecl(NewFD);
-        }
-      }
-  
   // Here we have an function template explicit specialization at class scope.
   // The actually specialization will be postponed to template instatiation
   // time via the ClassScopeFunctionSpecializationDecl node.
   if (isDependentClassScopeExplicitSpecialization) {
     ClassScopeFunctionSpecializationDecl *NewSpec =
                          ClassScopeFunctionSpecializationDecl::Create(
-                                Context, CurContext, SourceLocation(), 
+                                Context, CurContext, SourceLocation(),
                                 cast<CXXMethodDecl>(NewFD),
                                 HasExplicitTemplateArgs, TemplateArgs);
     CurContext->addDecl(NewSpec);
@@ -8472,20 +8806,28 @@
       FunctionTemplateDecl *NewTemplateDecl
         = NewFD->getDescribedFunctionTemplate();
       assert(NewTemplateDecl && "Template/non-template mismatch");
-      if (CXXMethodDecl *Method 
+      if (CXXMethodDecl *Method
             = dyn_cast<CXXMethodDecl>(NewTemplateDecl->getTemplatedDecl())) {
         Method->setAccess(OldTemplateDecl->getAccess());
         NewTemplateDecl->setAccess(OldTemplateDecl->getAccess());
       }
-      
+
       // If this is an explicit specialization of a member that is a function
       // template, mark it as a member specialization.
-      if (IsExplicitSpecialization && 
+      if (IsExplicitSpecialization &&
           NewTemplateDecl->getInstantiatedFromMemberTemplate()) {
         NewTemplateDecl->setMemberSpecialization();
         assert(OldTemplateDecl->isMemberSpecialization());
+        // Explicit specializations of a member template do not inherit deleted
+        // status from the parent member template that they are specializing.
+        if (OldTemplateDecl->getTemplatedDecl()->isDeleted()) {
+          FunctionDecl *const OldTemplatedDecl =
+              OldTemplateDecl->getTemplatedDecl();
+          assert(OldTemplatedDecl->getCanonicalDecl() == OldTemplatedDecl);
+          OldTemplatedDecl->setDeletedAsWritten(false);
+        }
       }
-      
+
     } else {
       // This needs to happen first so that 'inline' propagates.
       NewFD->setPreviousDeclaration(cast<FunctionDecl>(OldDecl));
@@ -8501,11 +8843,11 @@
     // C++-specific checks.
     if (CXXConstructorDecl *Constructor = dyn_cast<CXXConstructorDecl>(NewFD)) {
       CheckConstructor(Constructor);
-    } else if (CXXDestructorDecl *Destructor = 
+    } else if (CXXDestructorDecl *Destructor =
                 dyn_cast<CXXDestructorDecl>(NewFD)) {
       CXXRecordDecl *Record = Destructor->getParent();
       QualType ClassType = Context.getTypeDeclType(Record);
-      
+
       // FIXME: Shouldn't we be able to perform this check even when the class
       // type is dependent? Both gcc and edg can handle that.
       if (!ClassType->isDependentType()) {
@@ -8525,7 +8867,7 @@
 
     // Find any virtual functions that this function overrides.
     if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(NewFD)) {
-      if (!Method->isFunctionTemplateSpecialization() && 
+      if (!Method->isFunctionTemplateSpecialization() &&
           !Method->getDescribedFunctionTemplate() &&
           Method->isCanonicalDecl()) {
         if (AddOverriddenMethods(Method->getParent(), Method)) {
@@ -8535,7 +8877,7 @@
           }
         }
       }
-      
+
       if (Method->isStatic())
         checkThisInStaticMemberFunctionType(Method);
     }
@@ -8561,7 +8903,7 @@
       CheckCXXDefaultArguments(NewFD);
 
     // If this function declares a builtin function, check the type of this
-    // declaration against the expected type for the builtin. 
+    // declaration against the expected type for the builtin.
     if (unsigned BuiltinID = NewFD->getBuiltinID()) {
       ASTContext::GetBuiltinTypeError Error;
       LookupPredefedObjCSuperType(*this, S, NewFD->getIdentifier());
@@ -8573,7 +8915,7 @@
       }
     }
 
-    // If this function is declared as being extern "C", then check to see if 
+    // If this function is declared as being extern "C", then check to see if
     // the function returns a UDT (class, struct, or union type) that is not C
     // compatible, and if it does, warn the user.
     // But, issue any diagnostic on the first declaration only.
@@ -8599,11 +8941,11 @@
   // static main is not an error under C99, but we should warn about it.
   // We accept _Noreturn main as an extension.
   if (FD->getStorageClass() == SC_Static)
-    Diag(DS.getStorageClassSpecLoc(), getLangOpts().CPlusPlus 
-         ? diag::err_static_main : diag::warn_static_main) 
+    Diag(DS.getStorageClassSpecLoc(), getLangOpts().CPlusPlus
+         ? diag::err_static_main : diag::warn_static_main)
       << FixItHint::CreateRemoval(DS.getStorageClassSpecLoc());
   if (FD->isInlineSpecified())
-    Diag(DS.getInlineSpecLoc(), diag::err_inline_main) 
+    Diag(DS.getInlineSpecLoc(), diag::err_inline_main)
       << FixItHint::CreateRemoval(DS.getInlineSpecLoc());
   if (DS.isNoreturnSpecified()) {
     SourceLocation NoreturnLoc = DS.getNoreturnSpecLoc();
@@ -8730,7 +9072,7 @@
   if (nparams == 1 && !FD->isInvalidDecl()) {
     Diag(FD->getLocation(), diag::warn_main_one_arg);
   }
-  
+
   if (!FD->isInvalidDecl() && FD->getDescribedFunctionTemplate()) {
     Diag(FD->getLocation(), diag::err_mainlike_template_decl) << FD;
     FD->setInvalidDecl();
@@ -8787,6 +9129,7 @@
 
     bool isInitList;
     llvm::SmallVector<unsigned, 4> InitFieldIndex;
+
   public:
     typedef EvaluatedExprVisitor<SelfReferenceChecker> Inherited;
 
@@ -9008,7 +9351,7 @@
       Inherited::VisitUnaryOperator(E);
     }
 
-    void VisitObjCMessageExpr(ObjCMessageExpr *E) { return; }
+    void VisitObjCMessageExpr(ObjCMessageExpr *E) {}
 
     void VisitCXXConstructExpr(CXXConstructExpr *E) {
       if (E->getConstructor()->isCopyConstructor()) {
@@ -9104,7 +9447,7 @@
 
     SelfReferenceChecker(S, OrigDecl).CheckExpr(E);
   }
-}
+} // end anonymous namespace
 
 QualType Sema::deduceVarTypeFromInitializer(VarDecl *VDecl,
                                             DeclarationName Name, QualType Type,
@@ -9115,6 +9458,9 @@
   assert((!VDecl || !VDecl->isInitCapture()) &&
          "init captures are expected to be deduced prior to initialization");
 
+  // FIXME: Deduction for a decomposition declaration does weird things if the
+  // initializer is an array.
+
   ArrayRef<Expr *> DeduceInits = Init;
   if (DirectInit) {
     if (auto *PL = dyn_cast<ParenListExpr>(Init))
@@ -9224,6 +9570,11 @@
     return;
   }
 
+  // C++1z [dcl.dcl]p1 grammar implies that a parenthesized initializer is not
+  // permitted.
+  if (isa<DecompositionDecl>(VDecl) && DirectInit && isa<ParenListExpr>(Init))
+    Diag(VDecl->getLocation(), diag::err_decomp_decl_paren_init) << VDecl;
+
   // C++11 [decl.spec.auto]p6. Deduce the type which 'auto' stands in for.
   if (TypeMayContainAuto && VDecl->getType()->isUndeducedType()) {
     // Attempt typo correction early so that the type of the init expression can
@@ -9302,7 +9653,7 @@
   VarDecl *Def;
   if ((Def = VDecl->getDefinition()) && Def != VDecl) {
     NamedDecl *Hidden = nullptr;
-    if (!hasVisibleDefinition(Def, &Hidden) && 
+    if (!hasVisibleDefinition(Def, &Hidden) &&
         (VDecl->getFormalLinkage() == InternalLinkage ||
          VDecl->getDescribedVarTemplate() ||
          VDecl->getNumTemplateParameterLists() ||
@@ -9338,7 +9689,7 @@
            diag::note_previous_initializer)
           << 0;
       return;
-    }  
+    }
 
     if (VDecl->hasLocalStorage())
       getCurFunction()->setHasBranchProtectedScope();
@@ -9360,7 +9711,7 @@
   // Get the decls type and save a reference for later, since
   // CheckInitializerTypes may change it.
   QualType DclT = VDecl->getType(), SavT = DclT;
-  
+
   // Expressions default to 'id' when we're in a debugger
   // and we are assigning it to a variable of Objective-C pointer type.
   if (getLangOpts().DebuggerCastResultToId && DclT->isObjCObjectPointerType() &&
@@ -9496,7 +9847,7 @@
              diag::ext_aggregate_init_not_constant)
           << Culprit->getSourceRange();
     }
-  } else if (VDecl->isStaticDataMember() &&
+  } else if (VDecl->isStaticDataMember() && !VDecl->isInline() &&
              VDecl->getLexicalDeclContext()->isRecord()) {
     // This is an in-class initialization for a static data member, e.g.,
     //
@@ -9510,8 +9861,8 @@
     //   const enumeration type, see 9.4.2.
     //
     // C++11 [class.static.data]p3:
-    //   If a non-volatile const static data member is of integral or
-    //   enumeration type, its declaration in the class definition can
+    //   If a non-volatile non-inline const static data member is of integral
+    //   or enumeration type, its declaration in the class definition can
     //   specify a brace-or-equal-initializer in which every initalizer-clause
     //   that is an assignment-expression is a constant expression. A static
     //   data member of literal type can be declared in the class definition
@@ -9641,6 +9992,11 @@
   VarDecl *VD = dyn_cast<VarDecl>(D);
   if (!VD) return;
 
+  // Bindings are not usable if we can't make sense of the initializer.
+  if (auto *DD = dyn_cast<DecompositionDecl>(D))
+    for (auto *BD : DD->bindings())
+      BD->setInvalidDecl();
+
   // Auto types are meaningless if we can't make sense of the initializer.
   if (ParsingInitForAutoVars.count(D)) {
     D->setInvalidDecl();
@@ -9651,7 +10007,7 @@
   if (Ty->isDependentType()) return;
 
   // Require a complete type.
-  if (RequireCompleteType(VD->getLocation(), 
+  if (RequireCompleteType(VD->getLocation(),
                           Context.getBaseElementType(Ty),
                           diag::err_typecheck_decl_incomplete_type)) {
     VD->setInvalidDecl();
@@ -9679,6 +10035,13 @@
   if (VarDecl *Var = dyn_cast<VarDecl>(RealDecl)) {
     QualType Type = Var->getType();
 
+    // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory.
+    if (isa<DecompositionDecl>(RealDecl)) {
+      Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var;
+      Var->setInvalidDecl();
+      return;
+    }
+
     // C++11 [dcl.spec.auto]p3
     if (TypeMayContainAuto && Type->getContainedAutoType()) {
       Diag(Var->getLocation(), diag::err_auto_var_requires_init)
@@ -9694,23 +10057,32 @@
     // the definition of a variable [...] or the declaration of a static data
     // member.
     if (Var->isConstexpr() && !Var->isThisDeclarationADefinition()) {
-      if (Var->isStaticDataMember())
-        Diag(Var->getLocation(),
-             diag::err_constexpr_static_mem_var_requires_init)
-          << Var->getDeclName();
-      else
+      if (Var->isStaticDataMember()) {
+        // C++1z removes the relevant rule; the in-class declaration is always
+        // a definition there.
+        if (!getLangOpts().CPlusPlus1z) {
+          Diag(Var->getLocation(),
+               diag::err_constexpr_static_mem_var_requires_init)
+            << Var->getDeclName();
+          Var->setInvalidDecl();
+          return;
+        }
+      } else {
         Diag(Var->getLocation(), diag::err_invalid_constexpr_var_decl);
-      Var->setInvalidDecl();
-      return;
+        Var->setInvalidDecl();
+        return;
+      }
     }
 
     // C++ Concepts TS [dcl.spec.concept]p1: [...]  A variable template
     // definition having the concept specifier is called a variable concept. A
     // concept definition refers to [...] a variable concept and its initializer.
-    if (Var->isConcept()) {
-      Diag(Var->getLocation(), diag::err_var_concept_not_initialized);
-      Var->setInvalidDecl();
-      return;
+    if (VarTemplateDecl *VTD = Var->getDescribedVarTemplate()) {
+      if (VTD->isConcept()) {
+        Diag(Var->getLocation(), diag::err_var_concept_not_initialized);
+        Var->setInvalidDecl();
+        return;
+      }
     }
 
     // OpenCL v1.1 s6.5.3: variables declared in the constant address space must
@@ -9730,17 +10102,17 @@
 
       // We have an out-of-line definition of a static data member
       // that has an in-class initializer, so we type-check this like
-      // a declaration. 
+      // a declaration.
       //
       // Fall through
-      
+
     case VarDecl::DeclarationOnly:
-      // It's only a declaration. 
+      // It's only a declaration.
 
       // Block scope. C99 6.7p7: If an identifier for an object is
       // declared with no linkage (C99 6.2.2p6), the type for the
       // object shall be complete.
-      if (!Type->isDependentType() && Var->isLocalVarDecl() && 
+      if (!Type->isDependentType() && Var->isLocalVarDecl() &&
           !Var->hasLinkage() && !Var->isInvalidDecl() &&
           RequireCompleteType(Var->getLocation(), Type,
                               diag::err_typecheck_decl_incomplete_type))
@@ -9757,7 +10129,7 @@
         Diag(Var->getLocation(), diag::warn_private_extern);
         Diag(Var->getLocation(), diag::note_private_extern);
       }
-        
+
       return;
 
     case VarDecl::TentativeDefinition:
@@ -9862,7 +10234,7 @@
           getCurFunction()->setHasBranchProtectedScope();
       }
     }
-    
+
     // C++03 [dcl.init]p9:
     //   If no initializer is specified for an object, and the
     //   object is of (possibly cv-qualified) non-POD class type (or
@@ -9896,6 +10268,10 @@
 }
 
 void Sema::ActOnCXXForRangeDecl(Decl *D) {
+  // If there is no declaration, there was an error parsing it. Ignore it.
+  if (!D)
+    return;
+
   VarDecl *VD = dyn_cast<VarDecl>(D);
   if (!VD) {
     Diag(D->getLocation(), diag::err_for_range_decl_must_be_var);
@@ -9967,6 +10343,18 @@
 void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
   if (var->isInvalidDecl()) return;
 
+  if (getLangOpts().OpenCL) {
+    // OpenCL v2.0 s6.12.5 - Every block variable declaration must have an
+    // initializer.
+    if (var->getTypeSourceInfo()->getType()->isBlockPointerType() &&
+        !var->hasInit()) {
+      Diag(var->getLocation(), diag::err_opencl_invalid_block_declaration)
+          << 1 /*Init*/;
+      var->setInvalidDecl();
+      return;
+    }
+  }
+
   // In Objective-C, don't allow jumps past the implicit initialization of a
   // local retaining variable.
   if (getLangOpts().ObjC1 &&
@@ -10024,7 +10412,6 @@
       if (getLangOpts().CPlusPlus11)
         Diag(var->getLocation(), diag::note_use_thread_local);
     }
-
   }
 
   // Apply section attributes and pragmas to global variables.
@@ -10062,6 +10449,9 @@
   // All the following checks are C++ only.
   if (!getLangOpts().CPlusPlus) return;
 
+  if (auto *DD = dyn_cast<DecompositionDecl>(var))
+    CheckCompleteDecompositionDeclaration(DD);
+
   QualType type = var->getType();
   if (type->isDependentType()) return;
 
@@ -10134,6 +10524,11 @@
   // Require the destructor.
   if (const RecordType *recordType = baseType->getAs<RecordType>())
     FinalizeVarWithDestructor(var, recordType);
+
+  // If this variable must be emitted, add it as an initializer for the current
+  // module.
+  if (Context.DeclMustBeEmitted(var) && !ModuleScopes.empty())
+    Context.addModuleInitializer(ModuleScopes.back().Module, var);
 }
 
 /// \brief Determines if a variable's alignment is dependent.
@@ -10157,6 +10552,14 @@
   if (!VD)
     return;
 
+  if (auto *DD = dyn_cast<DecompositionDecl>(ThisDecl)) {
+    for (auto *BD : DD->bindings()) {
+      if (ThisDecl->isInvalidDecl())
+        BD->setInvalidDecl();
+      FinalizeDeclaration(BD);
+    }
+  }
+
   checkAttributesAfterMerging(*this, *VD);
 
   // Perform TLS alignment check here after attributes attached to the variable
@@ -10175,15 +10578,82 @@
     }
   }
 
-  // Static locals inherit dll attributes from their function.
   if (VD->isStaticLocal()) {
     if (FunctionDecl *FD =
             dyn_cast_or_null<FunctionDecl>(VD->getParentFunctionOrMethod())) {
+      // Static locals inherit dll attributes from their function.
       if (Attr *A = getDLLAttr(FD)) {
         auto *NewAttr = cast<InheritableAttr>(A->clone(getASTContext()));
         NewAttr->setInherited(true);
         VD->addAttr(NewAttr);
       }
+      // CUDA E.2.9.4: Within the body of a __device__ or __global__
+      // function, only __shared__ variables may be declared with
+      // static storage class.
+      if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+          (FD->hasAttr<CUDADeviceAttr>() || FD->hasAttr<CUDAGlobalAttr>()) &&
+          !VD->hasAttr<CUDASharedAttr>()) {
+        Diag(VD->getLocation(), diag::err_device_static_local_var);
+        VD->setInvalidDecl();
+      }
+    }
+  }
+
+  // Perform check for initializers of device-side global variables.
+  // CUDA allows empty constructors as initializers (see E.2.3.1, CUDA
+  // 7.5). We must also apply the same checks to all __shared__
+  // variables whether they are local or not. CUDA also allows
+  // constant initializers for __constant__ and __device__ variables.
+  if (getLangOpts().CUDA) {
+    const Expr *Init = VD->getInit();
+    if (Init && VD->hasGlobalStorage()) {
+      if (VD->hasAttr<CUDADeviceAttr>() || VD->hasAttr<CUDAConstantAttr>() ||
+          VD->hasAttr<CUDASharedAttr>()) {
+        assert((!VD->isStaticLocal() || VD->hasAttr<CUDASharedAttr>()));
+        bool AllowedInit = false;
+        if (const CXXConstructExpr *CE = dyn_cast<CXXConstructExpr>(Init))
+          AllowedInit =
+              isEmptyCudaConstructor(VD->getLocation(), CE->getConstructor());
+        // We'll allow constant initializers even if it's a non-empty
+        // constructor according to CUDA rules. This deviates from NVCC,
+        // but allows us to handle things like constexpr constructors.
+        if (!AllowedInit &&
+            (VD->hasAttr<CUDADeviceAttr>() || VD->hasAttr<CUDAConstantAttr>()))
+          AllowedInit = VD->getInit()->isConstantInitializer(
+              Context, VD->getType()->isReferenceType());
+
+        // Also make sure that the destructor, if there is one, is empty.
+        if (AllowedInit)
+          if (CXXRecordDecl *RD = VD->getType()->getAsCXXRecordDecl())
+            AllowedInit =
+                isEmptyCudaDestructor(VD->getLocation(), RD->getDestructor());
+
+        if (!AllowedInit) {
+          Diag(VD->getLocation(), VD->hasAttr<CUDASharedAttr>()
+                                      ? diag::err_shared_var_init
+                                      : diag::err_dynamic_var_init)
+              << Init->getSourceRange();
+          VD->setInvalidDecl();
+        }
+      } else {
+        // This is a host-side global variable.  Check that the initializer is
+        // callable from the host side.
+        const FunctionDecl *InitFn = nullptr;
+        if (const CXXConstructExpr *CE = dyn_cast<CXXConstructExpr>(Init)) {
+          InitFn = CE->getConstructor();
+        } else if (const CallExpr *CE = dyn_cast<CallExpr>(Init)) {
+          InitFn = CE->getDirectCallee();
+        }
+        if (InitFn) {
+          CUDAFunctionTarget InitFnTarget = IdentifyCUDATarget(InitFn);
+          if (InitFnTarget != CFT_Host && InitFnTarget != CFT_HostDevice) {
+            Diag(VD->getLocation(), diag::err_ref_bad_target_global_initializer)
+                << InitFnTarget << InitFn;
+            Diag(InitFn->getLocation(), diag::note_previous_decl) << InitFn;
+            VD->setInvalidDecl();
+          }
+        }
+      }
     }
   }
 
@@ -10287,13 +10757,36 @@
     Decls.push_back(DS.getRepAsDecl());
 
   DeclaratorDecl *FirstDeclaratorInGroup = nullptr;
-  for (unsigned i = 0, e = Group.size(); i != e; ++i)
+  DecompositionDecl *FirstDecompDeclaratorInGroup = nullptr;
+  bool DiagnosedMultipleDecomps = false;
+
+  for (unsigned i = 0, e = Group.size(); i != e; ++i) {
     if (Decl *D = Group[i]) {
-      if (DeclaratorDecl *DD = dyn_cast<DeclaratorDecl>(D))
-        if (!FirstDeclaratorInGroup)
-          FirstDeclaratorInGroup = DD;
+      auto *DD = dyn_cast<DeclaratorDecl>(D);
+      if (DD && !FirstDeclaratorInGroup)
+        FirstDeclaratorInGroup = DD;
+
+      auto *Decomp = dyn_cast<DecompositionDecl>(D);
+      if (Decomp && !FirstDecompDeclaratorInGroup)
+        FirstDecompDeclaratorInGroup = Decomp;
+
+      // A decomposition declaration cannot be combined with any other
+      // declaration in the same group.
+      auto *OtherDD = FirstDeclaratorInGroup;
+      if (OtherDD == FirstDecompDeclaratorInGroup)
+        OtherDD = DD;
+      if (OtherDD && FirstDecompDeclaratorInGroup &&
+          OtherDD != FirstDecompDeclaratorInGroup &&
+          !DiagnosedMultipleDecomps) {
+        Diag(FirstDecompDeclaratorInGroup->getLocation(),
+             diag::err_decomp_decl_not_alone)
+          << OtherDD->getSourceRange();
+        DiagnosedMultipleDecomps = true;
+      }
+
       Decls.push_back(D);
     }
+  }
 
   if (DeclSpec::isDeclRep(DS.getTypeSpecType())) {
     if (TagDecl *Tag = dyn_cast_or_null<TagDecl>(DS.getRepAsDecl())) {
@@ -10426,6 +10919,9 @@
   if (DeclSpec::TSCS TSCS = DS.getThreadStorageClassSpec())
     Diag(DS.getThreadStorageClassSpecLoc(), diag::err_invalid_thread)
       << DeclSpec::getSpecifierName(TSCS);
+  if (DS.isInlineSpecified())
+    Diag(DS.getInlineSpecLoc(), diag::err_inline_non_function)
+        << getLangOpts().CPlusPlus1z;
   if (DS.isConstexprSpecified())
     Diag(DS.getConstexprSpecLoc(), diag::err_invalid_constexpr)
       << 0;
@@ -10441,7 +10937,7 @@
     // Check that there are no default arguments inside the type of this
     // parameter.
     CheckExtraCXXDefaultArguments(D);
-    
+
     // Parameter declarators cannot be qualified (C++ [dcl.meaning]p1).
     if (D.getCXXScopeSpec().isSet()) {
       Diag(D.getIdentifierLoc(), diag::err_qualified_param_declarator)
@@ -10501,7 +10997,7 @@
   assert(S->getFunctionPrototypeDepth() >= 1);
   New->setScopeInfo(S->getFunctionPrototypeDepth() - 1,
                     S->getNextFunctionPrototypeIndex());
-  
+
   // Add the parameter declaration into this scope.
   S->AddDecl(New);
   if (II)
@@ -10536,26 +11032,23 @@
   return Param;
 }
 
-void Sema::DiagnoseUnusedParameters(ParmVarDecl * const *Param,
-                                    ParmVarDecl * const *ParamEnd) {
+void Sema::DiagnoseUnusedParameters(ArrayRef<ParmVarDecl *> Parameters) {
   // Don't diagnose unused-parameter errors in template instantiations; we
   // will already have done so in the template itself.
   if (!ActiveTemplateInstantiations.empty())
     return;
 
-  for (; Param != ParamEnd; ++Param) {
-    if (!(*Param)->isReferenced() && (*Param)->getDeclName() &&
-        !(*Param)->hasAttr<UnusedAttr>()) {
-      Diag((*Param)->getLocation(), diag::warn_unused_parameter)
-        << (*Param)->getDeclName();
+  for (const ParmVarDecl *Parameter : Parameters) {
+    if (!Parameter->isReferenced() && Parameter->getDeclName() &&
+        !Parameter->hasAttr<UnusedAttr>()) {
+      Diag(Parameter->getLocation(), diag::warn_unused_parameter)
+        << Parameter->getDeclName();
     }
   }
 }
 
-void Sema::DiagnoseSizeOfParametersAndReturnValue(ParmVarDecl * const *Param,
-                                                  ParmVarDecl * const *ParamEnd,
-                                                  QualType ReturnTy,
-                                                  NamedDecl *D) {
+void Sema::DiagnoseSizeOfParametersAndReturnValue(
+    ArrayRef<ParmVarDecl *> Parameters, QualType ReturnTy, NamedDecl *D) {
   if (LangOpts.NumLargeByValueCopy == 0) // No check.
     return;
 
@@ -10570,14 +11063,14 @@
 
   // Warn if any parameter is pass-by-value and larger than the specified
   // threshold.
-  for (; Param != ParamEnd; ++Param) {
-    QualType T = (*Param)->getType();
+  for (const ParmVarDecl *Parameter : Parameters) {
+    QualType T = Parameter->getType();
     if (T->isDependentType() || !T.isPODType(Context))
       continue;
     unsigned Size = Context.getTypeSizeInChars(T).getQuantity();
     if (Size > LangOpts.NumLargeByValueCopy)
-      Diag((*Param)->getLocation(), diag::warn_parameter_size)
-          << (*Param)->getDeclName() << Size;
+      Diag(Parameter->getLocation(), diag::warn_parameter_size)
+          << Parameter->getDeclName() << Size;
   }
 }
 
@@ -10609,7 +11102,7 @@
   }
 
   ParmVarDecl *New = ParmVarDecl::Create(Context, DC, StartLoc, NameLoc, Name,
-                                         Context.getAdjustedParameterType(T), 
+                                         Context.getAdjustedParameterType(T),
                                          TSInfo, SC, nullptr);
 
   // Parameters can not be abstract class types.
@@ -10632,7 +11125,7 @@
     New->setType(T);
   }
 
-  // ISO/IEC TR 18037 S6.7.3: "The type of an object with automatic storage 
+  // ISO/IEC TR 18037 S6.7.3: "The type of an object with automatic storage
   // duration shall not be qualified by an address-space qualifier."
   // Since all parameters have automatic store duration, they can not have
   // an address space.
@@ -10643,7 +11136,7 @@
       Diag(NameLoc, diag::err_arg_with_address_space);
       New->setInvalidDecl();
     }
-  }   
+  }
 
   return New;
 }
@@ -10697,11 +11190,11 @@
   return ActOnStartOfFunctionDef(FnBodyScope, DP, SkipBody);
 }
 
-void Sema::ActOnFinishInlineMethodDef(CXXMethodDecl *D) {
-  Consumer.HandleInlineMethodDefinition(D);
+void Sema::ActOnFinishInlineFunctionDef(FunctionDecl *D) {
+  Consumer.HandleInlineFunctionDefinition(D);
 }
 
-static bool ShouldWarnAboutMissingPrototype(const FunctionDecl *FD, 
+static bool ShouldWarnAboutMissingPrototype(const FunctionDecl *FD,
                              const FunctionDecl*& PossibleZeroParamPrototype) {
   // Don't warn about invalid declarations.
   if (FD->isInvalidDecl())
@@ -10797,11 +11290,10 @@
   FD->setInvalidDecl();
 }
 
-
-static void RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator, 
+static void RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator,
                                    Sema &S) {
   CXXRecordDecl *const LambdaClass = CallOperator->getParent();
-  
+
   LambdaScopeInfo *LSI = S.PushLambdaScope();
   LSI->CallOperator = CallOperator;
   LSI->Lambda = LambdaClass;
@@ -10815,12 +11307,12 @@
   else if (LCD == LCD_ByRef)
     LSI->ImpCaptureStyle = CapturingScopeInfo::ImpCap_LambdaByref;
   DeclarationNameInfo DNI = CallOperator->getNameInfo();
-    
-  LSI->IntroducerRange = DNI.getCXXOperatorNameRange(); 
+
+  LSI->IntroducerRange = DNI.getCXXOperatorNameRange();
   LSI->Mutable = !CallOperator->isConst();
 
   // Add the captures to the LSI so they can be noted as already
-  // captured within tryCaptureVar. 
+  // captured within tryCaptureVar.
   auto I = LambdaClass->field_begin();
   for (const auto &C : LambdaClass->captures()) {
     if (C.capturesVariable()) {
@@ -10829,15 +11321,16 @@
         S.CurrentInstantiationScope->InstantiatedLocal(VD, VD);
       QualType CaptureType = VD->getType();
       const bool ByRef = C.getCaptureKind() == LCK_ByRef;
-      LSI->addCapture(VD, /*IsBlock*/false, ByRef, 
+      LSI->addCapture(VD, /*IsBlock*/false, ByRef,
           /*RefersToEnclosingVariableOrCapture*/true, C.getLocation(),
-          /*EllipsisLoc*/C.isPackExpansion() 
+          /*EllipsisLoc*/C.isPackExpansion()
                          ? C.getEllipsisLoc() : SourceLocation(),
           CaptureType, /*Expr*/ nullptr);
 
     } else if (C.capturesThis()) {
-      LSI->addThisCapture(/*Nested*/ false, C.getLocation(), 
-                              S.getCurrentThisType(), /*Expr*/ nullptr);
+      LSI->addThisCapture(/*Nested*/ false, C.getLocation(),
+                              /*Expr*/ nullptr,
+                              C.getCaptureKind() == LCK_StarThis);
     } else {
       LSI->addVLATypeCapture(C.getLocation(), I->getType());
     }
@@ -10849,7 +11342,7 @@
                                     SkipBodyInfo *SkipBody) {
   // Clear the last template instantiation error context.
   LastTemplateInstantiationErrorContext = ActiveTemplateInstantiation();
-  
+
   if (!D)
     return D;
   FunctionDecl *FD = nullptr;
@@ -10870,16 +11363,16 @@
 
   // If we are instantiating a generic lambda call operator, push
   // a LambdaScopeInfo onto the function stack.  But use the information
-  // that's already been calculated (ActOnLambdaExpr) to prime the current 
-  // LambdaScopeInfo.  
+  // that's already been calculated (ActOnLambdaExpr) to prime the current
+  // LambdaScopeInfo.
   // When the template operator is being specialized, the LambdaScopeInfo,
   // has to be properly restored so that tryCaptureVariable doesn't try
   // and capture any new variables. In addition when calculating potential
-  // captures during transformation of nested lambdas, it is necessary to 
-  // have the LSI properly restored. 
+  // captures during transformation of nested lambdas, it is necessary to
+  // have the LSI properly restored.
   if (isGenericLambdaCallOperatorSpecialization(FD)) {
     assert(ActiveTemplateInstantiations.size() &&
-      "There should be an active template instantiation on the stack " 
+      "There should be an active template instantiation on the stack "
       "when instantiating a generic lambda!");
     RebuildLambdaScopeInfo(cast<CXXMethodDecl>(D), *this);
   }
@@ -10909,11 +11402,11 @@
     PushDeclContext(FnBodyScope, FD);
 
   // Check the validity of our function parameters
-  CheckParmsForFunctionDef(FD->param_begin(), FD->param_end(),
+  CheckParmsForFunctionDef(FD->parameters(),
                            /*CheckParameterNames=*/true);
 
   // Introduce our parameters into the function scope
-  for (auto Param : FD->params()) {
+  for (auto Param : FD->parameters()) {
     Param->setOwningFunction(FD);
 
     // If this has an identifier, add it to the scope stack.
@@ -10976,15 +11469,15 @@
       getCurLexicalContext()->getDeclKind() != Decl::ObjCCategoryImpl &&
       getCurLexicalContext()->getDeclKind() != Decl::ObjCImplementation)
     Diag(FD->getLocation(), diag::warn_function_def_in_objc_container);
-    
+
   return D;
 }
 
 /// \brief Given the set of return statements within a function body,
-/// compute the variables that are subject to the named return value 
+/// compute the variables that are subject to the named return value
 /// optimization.
 ///
-/// Each of the variables that is subject to the named return value 
+/// Each of the variables that is subject to the named return value
 /// optimization will be marked as NRVO variables in the AST, and any
 /// return statement that has a marked NRVO variable as its NRVO candidate can
 /// use the named return value optimization.
@@ -11044,7 +11537,7 @@
     FD->setHasSkippedBody();
   else if (ObjCMethodDecl *MD = dyn_cast_or_null<ObjCMethodDecl>(Decl))
     MD->setHasSkippedBody();
-  return ActOnFinishFunctionBody(Decl, nullptr);
+  return Decl;
 }
 
 Decl *Sema::ActOnFinishFunctionBody(Decl *D, Stmt *BodyArg) {
@@ -11127,8 +11620,8 @@
     if (!FD->isInvalidDecl()) {
       // Don't diagnose unused parameters of defaulted or deleted functions.
       if (!FD->isDeleted() && !FD->isDefaulted())
-        DiagnoseUnusedParameters(FD->param_begin(), FD->param_end());
-      DiagnoseSizeOfParametersAndReturnValue(FD->param_begin(), FD->param_end(),
+        DiagnoseUnusedParameters(FD->parameters());
+      DiagnoseSizeOfParametersAndReturnValue(FD->parameters(),
                                              FD->getReturnType(), FD);
 
       // If this is a structor, we need a vtable.
@@ -11136,7 +11629,7 @@
         MarkVTableUsed(FD->getLocation(), Constructor->getParent());
       else if (CXXDestructorDecl *Destructor = dyn_cast<CXXDestructorDecl>(FD))
         MarkVTableUsed(FD->getLocation(), Destructor->getParent());
-      
+
       // Try to apply the named return value optimization. We have to check
       // if we can do this here because lambdas keep return statements around
       // to deduce an implicit return type.
@@ -11199,8 +11692,8 @@
     assert(MD == getCurMethodDecl() && "Method parsing confused");
     MD->setBody(Body);
     if (!MD->isInvalidDecl()) {
-      DiagnoseUnusedParameters(MD->param_begin(), MD->param_end());
-      DiagnoseSizeOfParametersAndReturnValue(MD->param_begin(), MD->param_end(),
+      DiagnoseUnusedParameters(MD->parameters());
+      DiagnoseSizeOfParametersAndReturnValue(MD->parameters(),
                                              MD->getReturnType(), MD);
 
       if (Body)
@@ -11249,6 +11742,9 @@
     return nullptr;
   }
 
+  if (Body && getCurFunction()->HasPotentialAvailabilityViolations)
+    DiagnoseUnguardedAvailabilityViolations(dcl);
+
   assert(!getCurFunction()->ObjCShouldCallSuper &&
          "This should only be set for ObjC methods, which should have been "
          "handled in the block above.");
@@ -11260,7 +11756,7 @@
     // Verify this.
     if (FD && isa<CXXConstructorDecl>(FD) && isa<CXXTryStmt>(Body))
       DiagnoseReturnInConstructorExceptionHandler(cast<CXXTryStmt>(Body));
-    
+
     // Verify that gotos and switch cases don't jump into scopes illegally.
     if (getCurFunction()->NeedsScopeChecking() &&
         !PP.isCodeCompletionEnabled())
@@ -11273,7 +11769,7 @@
       MarkBaseAndMemberDestructorsReferenced(Destructor->getLocation(),
                                              Destructor->getParent());
     }
-    
+
     // If any errors have occurred, clear out any temporaries that may have
     // been leftover. This ensures that these temporaries won't be picked up for
     // deletion in some later function.
@@ -11307,11 +11803,11 @@
     assert(ExprCleanupObjects.size() ==
                ExprEvalContexts.back().NumCleanupObjects &&
            "Leftover temporaries in function");
-    assert(!ExprNeedsCleanups && "Unaccounted cleanups in function");
+    assert(!Cleanup.exprNeedsCleanups() && "Unaccounted cleanups in function");
     assert(MaybeODRUseExprs.empty() &&
            "Leftover expressions for odr-use checking");
   }
-  
+
   if (!IsInstantiation)
     PopDeclContext();
 
@@ -11326,7 +11822,6 @@
   return dcl;
 }
 
-
 /// When we finish delayed parsing of an attribute, we must attach it to the
 /// relevant Decl.
 void Sema::ActOnFinishDelayedAttribute(Scope *S, Decl *D,
@@ -11334,15 +11829,15 @@
   // Always attach attributes to the underlying decl.
   if (TemplateDecl *TD = dyn_cast<TemplateDecl>(D))
     D = TD->getTemplatedDecl();
+
   ProcessDeclAttributeList(S, D, Attrs.getList());  
   ProcessAPINotes(D);
-  
+
   if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(D))
     if (Method->isStatic())
       checkThisInStaticMemberFunctionAttributes(Method);
 }
 
-
 /// ImplicitlyDefineFunction - An undeclared identifier was used in a function
 /// call, forming a call to an implicitly defined function (per C99 6.5.1p2).
 NamedDecl *Sema::ImplicitlyDefineFunction(SourceLocation Loc,
@@ -11489,14 +11984,15 @@
                                          FD->getLocation()));
     if (Context.BuiltinInfo.isNoThrow(BuiltinID) && !FD->hasAttr<NoThrowAttr>())
       FD->addAttr(NoThrowAttr::CreateImplicit(Context, FD->getLocation()));
+    if (Context.BuiltinInfo.isPure(BuiltinID) && !FD->hasAttr<PureAttr>())
+      FD->addAttr(PureAttr::CreateImplicit(Context, FD->getLocation()));
     if (Context.BuiltinInfo.isConst(BuiltinID) && !FD->hasAttr<ConstAttr>())
       FD->addAttr(ConstAttr::CreateImplicit(Context, FD->getLocation()));
-    if (getLangOpts().CUDA && getLangOpts().CUDATargetOverloads &&
-        Context.BuiltinInfo.isTSBuiltin(BuiltinID) &&
+    if (getLangOpts().CUDA && Context.BuiltinInfo.isTSBuiltin(BuiltinID) &&
         !FD->hasAttr<CUDADeviceAttr>() && !FD->hasAttr<CUDAHostAttr>()) {
-      // Assign appropriate attribute depending on CUDA compilation
-      // mode and the target builtin belongs to. E.g. during host
-      // compilation, aux builtins are __device__, the rest are __host__.
+      // Add the appropriate attribute, depending on the CUDA compilation mode
+      // and which target the builtin belongs to. For example, during host
+      // compilation, aux builtins are __device__, while the rest are __host__.
       if (getLangOpts().CUDAIsDevice !=
           Context.BuiltinInfo.isAuxBuiltinID(BuiltinID))
         FD->addAttr(CUDADeviceAttr::CreateImplicit(Context, FD->getLocation()));
@@ -11505,6 +12001,16 @@
     }
   }
 
+  // If C++ exceptions are enabled but we are told extern "C" functions cannot
+  // throw, add an implicit nothrow attribute to any extern "C" function we come
+  // across.
+  if (getLangOpts().CXXExceptions && getLangOpts().ExternCNoUnwind &&
+      FD->isExternC() && !FD->hasAttr<NoThrowAttr>()) {
+    const auto *FPT = FD->getType()->getAs<FunctionProtoType>();
+    if (!FPT || FPT->getExceptionSpecType() == EST_None)
+      FD->addAttr(NoThrowAttr::CreateImplicit(Context, FD->getLocation()));
+  }
+
   IdentifierInfo *Name = FD->getIdentifier();
   if (!Name)
     return;
@@ -11559,7 +12065,7 @@
     NewTD->setInvalidDecl();
     return NewTD;
   }
-  
+
   if (D.getDeclSpec().isModulePrivateSpecified()) {
     if (CurContext->isFunctionOrMethod())
       Diag(NewTD->getLocation(), diag::err_module_private_local)
@@ -11569,7 +12075,7 @@
     else
       NewTD->setModulePrivate();
   }
-  
+
   // C++ [dcl.typedef]p8:
   //   If the typedef declaration defines an unnamed class (or
   //   enum), the first typedef-name declared by the declaration
@@ -11594,7 +12100,6 @@
   return NewTD;
 }
 
-
 /// \brief Check that this is a valid underlying type for an enum declaration.
 bool Sema::CheckEnumUnderlyingType(TypeSourceInfo *TI) {
   SourceLocation UnderlyingLoc = TI->getTypeLoc().getBeginLoc();
@@ -11831,6 +12336,28 @@
   return false;
 }
 
+/// Find the DeclContext in which a tag is implicitly declared if an elaborated
+/// type specifier is seen in \p DC and lookup finds nothing: the nearest
+/// enclosing (or same) context that is a file context or a function or method.
+static DeclContext *getTagInjectionContext(DeclContext *DC) {
+  while (!DC->isFileContext() && !DC->isFunctionOrMethod())
+    DC = DC->getParent();
+  return DC;
+}
+
+/// Find the Scope in which a tag is implicitly declared if an elaborated type
+/// specifier is seen in \p S and lookup finds nothing, skipping class scopes,
+/// C++ prototype scopes, non-DeclScope scopes and transparent-context scopes.
+static Scope *getTagInjectionScope(Scope *S, const LangOptions &LangOpts) {
+  while (S->isClassScope() ||
+         (LangOpts.CPlusPlus &&
+          S->isFunctionPrototypeScope()) ||
+         ((S->getFlags() & Scope::DeclScope) == 0) ||
+         (S->getEntity() && S->getEntity()->isTransparentContext()))
+    S = S->getParent();
+  return S;
+}
+
 /// \brief This is invoked when we see 'struct foo' or 'struct {'.  In the
 /// former case, Name will be non-null.  In the later case, Name will be null.
 /// TagSpec indicates what kind of tag this is. TUK indicates whether this is a
@@ -12000,7 +12527,7 @@
       }
 
       // A tag 'foo::bar' must already exist.
-      Diag(NameLoc, diag::err_not_tag_in_scope) 
+      Diag(NameLoc, diag::err_not_tag_in_scope)
         << Kind << Name << DC << SS.getRange();
       Name = nullptr;
       Invalid = true;
@@ -12024,12 +12551,13 @@
 
     // When declaring or defining a tag, ignore ambiguities introduced
     // by types using'ed into this scope.
-    if (Previous.isAmbiguous() && 
+    if (Previous.isAmbiguous() &&
         (TUK == TUK_Definition || TUK == TUK_Declaration)) {
       LookupResult::Filter F = Previous.makeFilter();
       while (F.hasNext()) {
         NamedDecl *ND = F.next();
-        if (ND->getDeclContext()->getRedeclContext() != SearchDC)
+        if (!ND->getDeclContext()->getRedeclContext()->Equals(
+                SearchDC->getRedeclContext()))
           F.erase();
       }
       F.done();
@@ -12100,10 +12628,10 @@
       DC->Equals(getStdNamespace()) && Name->isStr("bad_alloc")) {
     // This is a declaration of or a reference to "std::bad_alloc".
     isStdBadAlloc = true;
-    
+
     if (Previous.empty() && StdBadAlloc) {
       // std::bad_alloc has been implicitly declared (but made invisible to
-      // name lookup). Fill in this implicit declaration as the previous 
+      // name lookup). Fill in this implicit declaration as the previous
       // declaration, so that the declarations get chained appropriately.
       Previous.addDecl(getStdBadAlloc());
     }
@@ -12147,16 +12675,10 @@
       // Find the context where we'll be declaring the tag.
       // FIXME: We would like to maintain the current DeclContext as the
       // lexical context,
-      while (!SearchDC->isFileContext() && !SearchDC->isFunctionOrMethod())
-        SearchDC = SearchDC->getParent();
+      SearchDC = getTagInjectionContext(SearchDC);
 
       // Find the scope where we'll be declaring the tag.
-      while (S->isClassScope() ||
-             (getLangOpts().CPlusPlus &&
-              S->isFunctionPrototypeScope()) ||
-             ((S->getFlags() & Scope::DeclScope) == 0) ||
-             (S->getEntity() && S->getEntity()->isTransparentContext()))
-        S = S->getParent();
+      S = getTagInjectionScope(S, getLangOpts());
     } else {
       assert(TUK == TUK_Friend);
       // C++ [namespace.memdef]p3:
@@ -12309,16 +12831,34 @@
         if (!Invalid) {
           // If this is a use, just return the declaration we found, unless
           // we have attributes.
-
-          // FIXME: In the future, return a variant or some other clue
-          // for the consumer of this Decl to know it doesn't own it.
-          // For our current ASTs this shouldn't be a problem, but will
-          // need to be changed with DeclGroups.
-          if (!Attr &&
-              ((TUK == TUK_Reference &&
-                (!PrevTagDecl->getFriendObjectKind() || getLangOpts().MicrosoftExt))
-               || TUK == TUK_Friend))
-            return PrevTagDecl;
+          if (TUK == TUK_Reference || TUK == TUK_Friend) {
+            if (Attr) {
+              // FIXME: Diagnose these attributes. For now, we create a new
+              // declaration to hold them.
+            } else if (TUK == TUK_Reference &&
+                       (PrevTagDecl->getFriendObjectKind() ==
+                            Decl::FOK_Undeclared ||
+                        PP.getModuleContainingLocation(
+                            PrevDecl->getLocation()) !=
+                            PP.getModuleContainingLocation(KWLoc)) &&
+                       SS.isEmpty()) {
+              // This declaration is a reference to an existing entity, but
+              // has different visibility from that entity: it either makes
+              // a friend visible or it makes a type visible in a new module.
+              // In either case, create a new declaration. We only do this if
+              // the declaration would have meant the same thing if no prior
+              // declaration were found, that is, if it was found in the same
+              // scope where we would have injected a declaration.
+              if (!getTagInjectionContext(CurContext)->getRedeclContext()
+                       ->Equals(PrevDecl->getDeclContext()->getRedeclContext()))
+                return PrevTagDecl;
+              // This is in the injected scope, create a new declaration in
+              // that scope.
+              S = getTagInjectionScope(S, getLangOpts());
+            } else {
+              return PrevTagDecl;
+            }
+          }
 
           // Diagnose attempts to redefine a tag.
           if (TUK == TUK_Definition) {
@@ -12404,7 +12944,6 @@
       // is non-NULL, it's a definition of the tag declared by
       // PrevDecl. If it's NULL, we have a new definition.
 
-
     // Otherwise, PrevDecl is not a tag, but was found with tag
     // lookup.  This is only actually possible in C++, where a few
     // things like templates still live in the tag namespace.
@@ -12505,8 +13044,8 @@
         else if (getLangOpts().CPlusPlus)
           DiagID = diag::err_forward_ref_enum;
         Diag(Loc, DiagID);
-        
-        // If this is a forward-declared reference to an enumeration, make a 
+
+        // If this is a forward-declared reference to an enumeration, make a
         // note of it; we won't actually be introducing the declaration into
         // the declaration context.
         if (TUK == TUK_Reference)
@@ -12522,7 +13061,6 @@
         ED->setIntegerType(QualType(EnumUnderlying.get<const Type*>(), 0));
       ED->setPromotionType(ED->getIntegerType());
     }
-
   } else {
     // struct/union/class
 
@@ -12551,10 +13089,10 @@
   // Maybe add qualifier info.
   if (SS.isNotEmpty()) {
     if (SS.isSet()) {
-      // If this is either a declaration or a definition, check the 
+      // If this is either a declaration or a definition, check the
       // nested-name-specifier against the current context. We don't do this
       // for explicit specializations, because they have similar checking
-      // (with more specific diagnostics) in the call to 
+      // (with more specific diagnostics) in the call to
       // CheckMemberSpecialization, below.
       if (!isExplicitSpecialization &&
           (TUK == TUK_Definition || TUK == TUK_Declaration) &&
@@ -12616,7 +13154,7 @@
             << Name;
         Invalid = true;
       }
-    } else {
+    } else if (!PrevDecl) {
       Diag(Loc, diag::warn_decl_in_param_list) << Context.getTagDeclType(New);
     }
     DeclsInPrototypeScope.push_back(New);
@@ -12664,7 +13202,6 @@
     PushOnScopeChains(New, S, !IsForwardReference);
     if (IsForwardReference)
       SearchDC->makeDeclVisibleInContext(New);
-
   } else {
     CurContext->addDecl(New);
   }
@@ -12692,7 +13229,7 @@
 void Sema::ActOnTagStartDefinition(Scope *S, Decl *TagD) {
   AdjustDeclIfTemplate(TagD);
   TagDecl *Tag = cast<TagDecl>(TagD);
-  
+
   // Enter the tag context.
   PushDeclContext(S, Tag);
 
@@ -12704,7 +13241,7 @@
 }
 
 Decl *Sema::ActOnObjCContainerStartDefinition(Decl *IDecl) {
-  assert(isa<ObjCContainerDecl>(IDecl) && 
+  assert(isa<ObjCContainerDecl>(IDecl) &&
          "ActOnObjCContainerStartDefinition - Not ObjCContainerDecl");
   DeclContext *OCD = cast<DeclContext>(IDecl);
   assert(getContainingDC(OCD) == CurContext &&
@@ -12809,7 +13346,7 @@
   // ActOnStartCXXMemberDeclarations, so we don't have to mess with
   // the FieldCollector.
 
-  PopDeclContext();  
+  PopDeclContext();
 }
 
 // Note that FieldName may be null for anonymous bitfields.
@@ -12921,6 +13458,13 @@
                              Declarator &D, Expr *BitWidth,
                              InClassInitStyle InitStyle,
                              AccessSpecifier AS) {
+  if (D.isDecompositionDeclarator()) {
+    const DecompositionDeclarator &Decomp = D.getDecompositionDeclarator();
+    Diag(Decomp.getLSquareLoc(), diag::err_decomp_decl_context)
+      << Decomp.getSourceRange();
+    return nullptr;
+  }
+
   IdentifierInfo *II = D.getIdentifier();
   SourceLocation Loc = DeclStart;
   if (II) Loc = D.getIdentifierLoc();
@@ -12944,15 +13488,19 @@
     D.setInvalidType();
   }
 
-  // OpenCL 1.2 spec, s6.9 r:
-  // The event type cannot be used to declare a structure or union field.
-  if (LangOpts.OpenCL && T->isEventT()) {
-    Diag(Loc, diag::err_event_t_struct_field);
+  // OpenCL v1.2 s6.9b,r & OpenCL v2.0 s6.12.5 - The following types cannot be
+  // used as structure or union field: image, sampler, event or block types.
+  if (LangOpts.OpenCL && (T->isEventT() || T->isImageType() ||
+                          T->isSamplerT() || T->isBlockPointerType())) {
+    Diag(Loc, diag::err_opencl_type_struct_or_union_field) << T;
     D.setInvalidType();
   }
 
   DiagnoseFunctionSpecifiers(D.getDeclSpec());
 
+  if (D.getDeclSpec().isInlineSpecified())
+    Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function)
+        << getLangOpts().CPlusPlus1z;
   if (DeclSpec::TSCS TSCS = D.getDeclSpec().getThreadStorageClassSpec())
     Diag(D.getDeclSpec().getThreadStorageClassSpecLoc(),
          diag::err_invalid_thread)
@@ -12967,11 +13515,11 @@
     case LookupResult::FoundUnresolvedValue:
       PrevDecl = Previous.getAsSingle<NamedDecl>();
       break;
-      
+
     case LookupResult::FoundOverloaded:
       PrevDecl = Previous.getRepresentativeDecl();
       break;
-      
+
     case LookupResult::NotFound:
     case LookupResult::NotFoundInCurrentInstantiation:
     case LookupResult::Ambiguous:
@@ -13001,7 +13549,7 @@
 
   if (D.getDeclSpec().isModulePrivateSpecified())
     NewFD->setModulePrivate();
-  
+
   if (NewFD->isInvalidDecl() && PrevDecl) {
     // Don't introduce NewFD into scope; there's already something
     // with the same name in the same scope.
@@ -13233,9 +13781,9 @@
         if (!getLangOpts().CPlusPlus11 &&
             getLangOpts().ObjCAutoRefCount && RDecl->hasObjectMember()) {
           // Objective-C++ ARC: it is an error to have a non-trivial field of
-          // a union. However, system headers in Objective-C programs 
+          // a union. However, system headers in Objective-C programs
           // occasionally have Objective-C lifetime objects within unions,
-          // and rather than cause the program to fail, we make those 
+          // and rather than cause the program to fail, we make those
           // members unavailable.
           SourceLocation Loc = FD->getLocation();
           if (getSourceManager().isInSystemHeader(Loc)) {
@@ -13331,7 +13879,7 @@
     else
       EnclosingContext = EnclosingDecl;
   } else {
-    if (ObjCCategoryDecl *CDecl = 
+    if (ObjCCategoryDecl *CDecl =
         dyn_cast<ObjCCategoryDecl>(EnclosingDecl)) {
       if (LangOpts.ObjCRuntime.isFragile() || !CDecl->IsClassExtension()) {
         Diag(Loc, diag::err_misplaced_ivar) << CDecl->IsClassExtension();
@@ -13369,33 +13917,33 @@
 
   if (D.getDeclSpec().isModulePrivateSpecified())
     NewID->setModulePrivate();
-  
+
   if (II) {
     // FIXME: When interfaces are DeclContexts, we'll need to add
     // these to the interface.
     S->AddDecl(NewID);
     IdResolver.AddDecl(NewID);
   }
-  
+
   if (LangOpts.ObjCRuntime.isNonFragile() &&
       !NewID->isInvalidDecl() && isa<ObjCInterfaceDecl>(EnclosingDecl))
     Diag(Loc, diag::warn_ivars_in_interface);
-  
+
   return NewID;
 }
 
-/// ActOnLastBitfield - This routine handles synthesized bitfields rules for 
-/// class and class extensions. For every class \@interface and class 
-/// extension \@interface, if the last ivar is a bitfield of any type, 
+/// ActOnLastBitfield - This routine handles synthesized bitfields rules for
+/// class and class extensions. For every class \@interface and class
+/// extension \@interface, if the last ivar is a bitfield of any type,
 /// then add an implicit `char :0` ivar to the end of that interface.
 void Sema::ActOnLastBitfield(SourceLocation DeclLoc,
                              SmallVectorImpl<Decl *> &AllIvarDecls) {
   if (LangOpts.ObjCRuntime.isFragile() || AllIvarDecls.empty())
     return;
-  
+
   Decl *ivarDecl = AllIvarDecls[AllIvarDecls.size()-1];
   ObjCIvarDecl *Ivar = cast<ObjCIvarDecl>(ivarDecl);
-  
+
   if (!Ivar->isBitField() || Ivar->getBitWidthValue(Context) == 0)
     return;
   ObjCInterfaceDecl *ID = dyn_cast<ObjCInterfaceDecl>(CurContext);
@@ -13414,7 +13962,7 @@
 
   Ivar = ObjCIvarDecl::Create(Context, cast<ObjCContainerDecl>(CurContext),
                               DeclLoc, DeclLoc, nullptr,
-                              Context.CharTy, 
+                              Context.CharTy,
                               Context.getTrivialTypeSourceInfo(Context.CharTy,
                                                                DeclLoc),
                               ObjCIvarDecl::Private, BW,
@@ -13443,7 +13991,7 @@
       break;
     }
   }
-  
+
   RecordDecl *Record = dyn_cast<RecordDecl>(EnclosingDecl);
 
   // Start counting up the number of named members; make sure to include
@@ -13497,7 +14045,7 @@
       FD->setInvalidDecl();
       EnclosingDecl->setInvalidDecl();
       continue;
-    } else if (FDTy->isIncompleteArrayType() && Record && 
+    } else if (FDTy->isIncompleteArrayType() && Record &&
                ((i + 1 == Fields.end() && !Record->isUnion()) ||
                 ((getLangOpts().MicrosoftExt ||
                   getLangOpts().CPlusPlus) &&
@@ -13513,14 +14061,12 @@
                      : getLangOpts().CPlusPlus
                            ? diag::ext_flexible_array_union_gnu
                            : diag::err_flexible_array_union;
-      else if (Fields.size() == 1)
+      else if (NumNamedMembers < 1)
         DiagID = getLangOpts().MicrosoftExt
                      ? diag::ext_flexible_array_empty_aggregate_ms
                      : getLangOpts().CPlusPlus
                            ? diag::ext_flexible_array_empty_aggregate_gnu
-                           : NumNamedMembers < 1
-                                 ? diag::err_flexible_array_empty_aggregate
-                                 : 0;
+                           : diag::err_flexible_array_empty_aggregate;
 
       if (DiagID)
         Diag(FD->getLocation(), DiagID) << FD->getDeclName()
@@ -13614,7 +14160,7 @@
                           UnavailableAttr::IR_ARCFieldWithOwnership, loc));
           }
         } else {
-          Diag(FD->getLocation(), diag::err_arc_objc_object_in_tag) 
+          Diag(FD->getLocation(), diag::err_arc_objc_object_in_tag)
             << T->isBlockPointerType() << Record->getTagKind();
         }
         ARCErrReported = true;
@@ -13627,7 +14173,7 @@
         Record->setHasObjectMember(true);
       else if (Context.getAsArrayType(FD->getType())) {
         QualType BaseType = Context.getBaseElementType(FD->getType());
-        if (BaseType->isRecordType() && 
+        if (BaseType->isRecordType() &&
             BaseType->getAs<RecordType>()->getDecl()->hasObjectMember())
           Record->setHasObjectMember(true);
         else if (BaseType->isObjCObjectPointerType() ||
@@ -13666,39 +14212,39 @@
           // Add any implicitly-declared members to this class.
           AddImplicitlyDeclaredMembersToClass(CXXRecord);
 
-          // If we have virtual base classes, we may end up finding multiple 
-          // final overriders for a given virtual function. Check for this 
+          // If we have virtual base classes, we may end up finding multiple
+          // final overriders for a given virtual function. Check for this
           // problem now.
           if (CXXRecord->getNumVBases()) {
             CXXFinalOverriderMap FinalOverriders;
             CXXRecord->getFinalOverriders(FinalOverriders);
-            
-            for (CXXFinalOverriderMap::iterator M = FinalOverriders.begin(), 
+
+            for (CXXFinalOverriderMap::iterator M = FinalOverriders.begin(),
                                              MEnd = FinalOverriders.end();
                  M != MEnd; ++M) {
-              for (OverridingMethods::iterator SO = M->second.begin(), 
+              for (OverridingMethods::iterator SO = M->second.begin(),
                                             SOEnd = M->second.end();
                    SO != SOEnd; ++SO) {
-                assert(SO->second.size() > 0 && 
+                assert(SO->second.size() > 0 &&
                        "Virtual function without overridding functions?");
                 if (SO->second.size() == 1)
                   continue;
-                
+
                 // C++ [class.virtual]p2:
                 //   In a derived class, if a virtual member function of a base
                 //   class subobject has more than one final overrider the
                 //   program is ill-formed.
                 Diag(Record->getLocation(), diag::err_multiple_final_overriders)
                   << (const NamedDecl *)M->first << Record;
-                Diag(M->first->getLocation(), 
+                Diag(M->first->getLocation(),
                      diag::note_overridden_virtual_function);
-                for (OverridingMethods::overriding_iterator 
-                          OM = SO->second.begin(), 
+                for (OverridingMethods::overriding_iterator
+                          OM = SO->second.begin(),
                        OMEnd = SO->second.end();
                      OM != OMEnd; ++OM)
                   Diag(OM->Method->getLocation(), diag::note_final_overrider)
                     << (const NamedDecl *)M->first << OM->Method->getParent();
-                
+
                 Record->setInvalidDecl();
               }
             }
@@ -13708,7 +14254,7 @@
         }
       }
     }
-    
+
     if (!Completed)
       Record->completeDefinition();
 
@@ -13797,7 +14343,7 @@
       CheckImplementationIvars(IMPDecl, ClsFields, RecFields.size(), RBrac);
       IMPDecl->setIvarLBraceLoc(LBrac);
       IMPDecl->setIvarRBraceLoc(RBrac);
-    } else if (ObjCCategoryDecl *CDecl = 
+    } else if (ObjCCategoryDecl *CDecl =
                 dyn_cast<ObjCCategoryDecl>(EnclosingDecl)) {
       // case of ivars in class extension; all other cases have been
       // reported as errors elsewhere.
@@ -13808,18 +14354,18 @@
       ObjCInterfaceDecl *IDecl = CDecl->getClassInterface();
       for (unsigned i = 0, e = RecFields.size(); i != e; ++i) {
         if (IDecl) {
-          if (const ObjCIvarDecl *ClsIvar = 
+          if (const ObjCIvarDecl *ClsIvar =
               IDecl->getIvarDecl(ClsFields[i]->getIdentifier())) {
-            Diag(ClsFields[i]->getLocation(), 
-                 diag::err_duplicate_ivar_declaration); 
+            Diag(ClsFields[i]->getLocation(),
+                 diag::err_duplicate_ivar_declaration);
             Diag(ClsIvar->getLocation(), diag::note_previous_definition);
             continue;
           }
           for (const auto *Ext : IDecl->known_extensions()) {
             if (const ObjCIvarDecl *ClsExtIvar
                   = Ext->getIvarDecl(ClsFields[i]->getIdentifier())) {
-              Diag(ClsFields[i]->getLocation(), 
-                   diag::err_duplicate_ivar_declaration); 
+              Diag(ClsFields[i]->getLocation(),
+                   diag::err_duplicate_ivar_declaration);
               Diag(ClsExtIvar->getLocation(), diag::note_previous_definition);
               continue;
             }
@@ -13845,37 +14391,37 @@
                                         QualType T) {
   assert(T->isIntegralType(Context) && "Integral type required!");
   unsigned BitWidth = Context.getIntWidth(T);
-  
+
   if (Value.isUnsigned() || Value.isNonNegative()) {
-    if (T->isSignedIntegerOrEnumerationType()) 
+    if (T->isSignedIntegerOrEnumerationType())
       --BitWidth;
     return Value.getActiveBits() <= BitWidth;
-  }  
+  }
   return Value.getMinSignedBits() <= BitWidth;
 }
 
 // \brief Given an integral type, return the next larger integral type
 // (or a NULL type of no such type exists).
 static QualType getNextLargerIntegralType(ASTContext &Context, QualType T) {
-  // FIXME: Int128/UInt128 support, which also needs to be introduced into 
+  // FIXME: Int128/UInt128 support, which also needs to be introduced into
   // enum checking below.
   assert(T->isIntegralType(Context) && "Integral type required!");
   const unsigned NumTypes = 4;
-  QualType SignedIntegralTypes[NumTypes] = { 
+  QualType SignedIntegralTypes[NumTypes] = {
     Context.ShortTy, Context.IntTy, Context.LongTy, Context.LongLongTy
   };
-  QualType UnsignedIntegralTypes[NumTypes] = { 
-    Context.UnsignedShortTy, Context.UnsignedIntTy, Context.UnsignedLongTy, 
+  QualType UnsignedIntegralTypes[NumTypes] = {
+    Context.UnsignedShortTy, Context.UnsignedIntTy, Context.UnsignedLongTy,
     Context.UnsignedLongLongTy
   };
-  
+
   unsigned BitWidth = Context.getTypeSize(T);
   QualType *Types = T->isSignedIntegerOrEnumerationType()? SignedIntegralTypes
                                                         : UnsignedIntegralTypes;
   for (unsigned I = 0; I != NumTypes; ++I)
     if (Context.getTypeSize(Types[I]) > BitWidth)
       return Types[I];
-  
+
   return QualType();
 }
 
@@ -13939,7 +14485,7 @@
           // C++11 [dcl.enum]p5:
           //   If the underlying type is not fixed, the type of each enumerator
           //   is the type of its initializing value:
-          //     - If an initializer is specified for an enumerator, the 
+          //     - If an initializer is specified for an enumerator, the
           //       initializing value has the same type as the expression.
           EltTy = Val->getType();
         } else {
@@ -13970,10 +14516,10 @@
       // C++0x [dcl.enum]p5:
       //   If the underlying type is not fixed, the type of each enumerator
       //   is the type of its initializing value:
-      //     - If no initializer is specified for the first enumerator, the 
+      //     - If no initializer is specified for the first enumerator, the
       //       initializing value has an unspecified integral type.
       //
-      // GCC uses 'int' for its unspecified integral type, as does 
+      // GCC uses 'int' for its unspecified integral type, as does
       // C99 6.7.2.2p3.
       if (Enum->isFixed()) {
         EltTy = Enum->getIntegerType();
@@ -13996,12 +14542,12 @@
         //     - Otherwise the type of the initializing value is the same as
         //       the type of the initializing value of the preceding enumerator
         //       unless the incremented value is not representable in that type,
-        //       in which case the type is an unspecified integral type 
+        //       in which case the type is an unspecified integral type
         //       sufficient to contain the incremented value. If no such type
         //       exists, the program is ill-formed.
         QualType T = getNextLargerIntegralType(Context, EltTy);
         if (T.isNull() || Enum->isFixed()) {
-          // There is no integral type larger enough to represent this 
+          // There is no integral type large enough to represent this
           // value. Complain, then allow the value to wrap around.
           EnumVal = LastEnumConst->getInitVal();
           EnumVal = EnumVal.zext(EnumVal.getBitWidth() * 2);
@@ -14017,15 +14563,15 @@
         } else {
           EltTy = T;
         }
-        
+
         // Retrieve the last enumerator's value, extent that type to the
         // type that is supposed to be large enough to represent the incremented
         // value, then increment.
         EnumVal = LastEnumConst->getInitVal();
         EnumVal.setIsSigned(EltTy->isSignedIntegerOrEnumerationType());
         EnumVal = EnumVal.zextOrTrunc(Context.getIntWidth(EltTy));
-        ++EnumVal;        
-        
+        ++EnumVal;
+
         // If we're not in C++, diagnose the overflow of enumerator values,
         // which in C99 means that the enumerator value is not representable in
         // an int (C99 6.7.2.2p2). However, we support GCC's extension that
@@ -14043,12 +14589,12 @@
   }
 
   if (!EltTy->isDependentType()) {
-    // Make the enumerator value match the signedness and size of the 
+    // Make the enumerator value match the signedness and size of the
     // enumerator's type.
     EnumVal = EnumVal.extOrTrunc(Context.getIntWidth(EltTy));
     EnumVal.setIsSigned(EltTy->isSignedIntegerOrEnumerationType());
   }
-  
+
   return EnumConstantDecl::Create(Context, Enum, IdLoc, Id, EltTy,
                                   Val, EnumVal);
 }
@@ -14103,14 +14649,14 @@
   }
 
   // C++ [class.mem]p15:
-  // If T is the name of a class, then each of the following shall have a name 
+  // If T is the name of a class, then each of the following shall have a name
   // different from T:
-  // - every enumerator of every member of class T that is an unscoped 
+  // - every enumerator of every member of class T that is an unscoped
   // enumerated type
   if (!TheEnumDecl->isScoped())
     DiagnoseClassNameShadow(TheEnumDecl->getDeclContext(),
                             DeclarationNameInfo(Id, IdLoc));
-  
+
   EnumConstantDecl *New =
     CheckEnumConstant(TheEnumDecl, LastEnumConst, IdLoc, Id, Val);
   if (!New)
@@ -14422,7 +14968,7 @@
   //   int, long long int, or unsigned long long int.
   // C99 6.4.4.3p2:
   //   An identifier declared as an enumeration constant has type int.
-  // The C99 rule is modified by a gcc extension 
+  // The C99 rule is modified by a gcc extension
   QualType BestPromotionType;
 
   bool Packed = Enum->hasAttr<PackedAttr>();
@@ -14638,8 +15184,8 @@
   return checkModuleImportContext(*this, M, ImportLoc, CurContext);
 }
 
-DeclResult Sema::ActOnModuleImport(SourceLocation AtLoc, 
-                                   SourceLocation ImportLoc, 
+DeclResult Sema::ActOnModuleImport(SourceLocation AtLoc,
+                                   SourceLocation ImportLoc,
                                    ModuleIdPath Path) {
   Module *Mod =
       getModuleLoader().loadModule(ImportLoc, Path, Module::AllVisible,
@@ -14655,11 +15201,10 @@
   // of the same top-level module. Until we do, make it an error rather than
   // silently ignoring the import.
   if (Mod->getTopLevelModuleName() == getLangOpts().CurrentModule)
-    Diag(ImportLoc, diag::err_module_self_import)
+    Diag(ImportLoc, getLangOpts().CompilingModule
+                        ? diag::err_module_self_import
+                        : diag::err_module_import_in_implementation)
         << Mod->getFullModuleName() << getLangOpts().CurrentModule;
-  else if (Mod->getTopLevelModuleName() == getLangOpts().ImplementationOfModule)
-    Diag(ImportLoc, diag::err_module_import_in_implementation)
-        << Mod->getFullModuleName() << getLangOpts().ImplementationOfModule;
 
   SmallVector<SourceLocation, 2> IdentifierLocs;
   Module *ModCheck = Mod;
@@ -14669,15 +15214,17 @@
     if (!ModCheck)
       break;
     ModCheck = ModCheck->Parent;
-    
+
     IdentifierLocs.push_back(Path[I].second);
   }
 
-  ImportDecl *Import = ImportDecl::Create(Context, 
-                                          Context.getTranslationUnitDecl(),
-                                          AtLoc.isValid()? AtLoc : ImportLoc, 
+  TranslationUnitDecl *TU = getASTContext().getTranslationUnitDecl();
+  ImportDecl *Import = ImportDecl::Create(Context, TU,
+                                          AtLoc.isValid()? AtLoc : ImportLoc,
                                           Mod, IdentifierLocs);
-  Context.getTranslationUnitDecl()->addDecl(Import);
+  if (!ModuleScopes.empty())
+    Context.addModuleInitializer(ModuleScopes.back().Module, Import);
+  TU->addDecl(Import);
   return Import;
 }
 
@@ -14693,23 +15240,21 @@
       TUKind == TU_Module &&
       getSourceManager().isWrittenInMainFile(DirectiveLoc);
 
-  // Similarly, if this module is specified by -fmodule-implementation-of
-  // don't actually synthesize an illegal module import.
-  bool ShouldAddImport = !IsInModuleIncludes &&
-    (getLangOpts().ImplementationOfModule.empty() ||
-     getLangOpts().ImplementationOfModule != Mod->getTopLevelModuleName());
+  bool ShouldAddImport = !IsInModuleIncludes;
 
-  // If this module import was due to an inclusion directive, create an 
+  // If this module import was due to an inclusion directive, create an
   // implicit import declaration to capture it in the AST.
   if (ShouldAddImport) {
     TranslationUnitDecl *TU = getASTContext().getTranslationUnitDecl();
     ImportDecl *ImportD = ImportDecl::CreateImplicit(getASTContext(), TU,
                                                      DirectiveLoc, Mod,
                                                      DirectiveLoc);
+    if (!ModuleScopes.empty())
+      Context.addModuleInitializer(ModuleScopes.back().Module, ImportD);
     TU->addDecl(ImportD);
     Consumer.HandleImplicitImportDecl(ImportD);
   }
-  
+
   getModuleLoader().makeModuleVisible(Mod, Module::AllVisible, DirectiveLoc);
   VisibleModules.setVisible(Mod, DirectiveLoc);
 }
@@ -14717,8 +15262,11 @@
 void Sema::ActOnModuleBegin(SourceLocation DirectiveLoc, Module *Mod) {
   checkModuleImportContext(*this, Mod, DirectiveLoc, CurContext);
 
+  ModuleScopes.push_back({});
+  ModuleScopes.back().Module = Mod;
   if (getLangOpts().ModulesLocalVisibility)
-    VisibleModulesStack.push_back(std::move(VisibleModules));
+    ModuleScopes.back().OuterVisibleModules = std::move(VisibleModules);
+
   VisibleModules.setVisible(Mod, DirectiveLoc);
 }
 
@@ -14726,9 +15274,15 @@
   checkModuleImportContext(*this, Mod, DirectiveLoc, CurContext);
 
   if (getLangOpts().ModulesLocalVisibility) {
-    VisibleModules = std::move(VisibleModulesStack.back());
-    VisibleModulesStack.pop_back();
+    assert(!ModuleScopes.empty() && ModuleScopes.back().Module == Mod &&
+           "left the wrong module scope");
+    VisibleModules = std::move(ModuleScopes.back().OuterVisibleModules);
+    ModuleScopes.pop_back();
+
     VisibleModules.setVisible(Mod, DirectiveLoc);
+    // Leaving a module hides namespace names, so our visible namespace cache
+    // is now out of date.
+    VisibleNamespaceCache.clear();
   }
 }
 
diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp
index dcc458b..3e3b901 100644
--- a/lib/Sema/SemaDeclAttr.cpp
+++ b/lib/Sema/SemaDeclAttr.cpp
@@ -11,8 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
+#include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/CXXInheritance.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
@@ -20,17 +21,20 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/Mangle.h"
-#include "clang/AST/ASTMutationListener.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/DelayedDiagnostic.h"
+#include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Scope.h"
+#include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/MathExtras.h"
+
 using namespace clang;
 using namespace sema;
 
@@ -40,7 +44,7 @@
     Cpp,
     ObjC
   };
-}
+} // end namespace AttributeLangSupport
 
 //===----------------------------------------------------------------------===//
 //  Helper functions
@@ -52,6 +56,7 @@
 static bool isFunctionOrMethod(const Decl *D) {
   return (D->getFunctionType() != nullptr) || isa<ObjCMethodDecl>(D);
 }
+
 /// \brief Return true if the given decl has function type (function or
 /// function-typed variable) or an Objective-C method or a block.
 static bool isFunctionOrMethodOrBlock(const Decl *D) {
@@ -801,6 +806,8 @@
 }
 
 static void handleEnableIfAttr(Sema &S, Decl *D, const AttributeList &Attr) {
+  S.Diag(Attr.getLoc(), diag::ext_clang_enable_if);
+
   Expr *Cond = Attr.getArgAsExpr(0);
   if (!Cond->isTypeDependent()) {
     ExprResult Converted = S.PerformContextuallyConvertToBool(Cond);
@@ -887,7 +894,6 @@
                             Attr.getAttributeSpellingListIndex()));
 }
 
-
 static bool checkForConsumableClass(Sema &S, const CXXMethodDecl *MD,
                                         const AttributeList &Attr) {
   ASTContext &CurrContext = S.getASTContext();
@@ -905,7 +911,6 @@
   return true;
 }
 
-
 static void handleCallableWhenAttr(Sema &S, Decl *D,
                                    const AttributeList &Attr) {
   if (!checkAttributeAtLeastNumArgs(S, Attr, 1))
@@ -944,7 +949,6 @@
                States.size(), Attr.getAttributeSpellingListIndex()));
 }
 
-
 static void handleParamTypestateAttr(Sema &S, Decl *D,
                                     const AttributeList &Attr) {
   ParamTypestateAttr::ConsumedState ParamState;
@@ -982,7 +986,6 @@
                                 Attr.getAttributeSpellingListIndex()));
 }
 
-
 static void handleReturnTypestateAttr(Sema &S, Decl *D,
                                       const AttributeList &Attr) {
   ReturnTypestateAttr::ConsumedState ReturnState;
@@ -1031,7 +1034,6 @@
                                  Attr.getAttributeSpellingListIndex()));
 }
 
-
 static void handleSetTypestateAttr(Sema &S, Decl *D, const AttributeList &Attr) {
   if (!checkForConsumableClass(S, cast<CXXMethodDecl>(D), Attr))
     return;
@@ -1568,6 +1570,28 @@
                          Attr.getAttributeSpellingListIndex()));
 }
 
+/// Process the ifunc attribute: read its first argument as a string literal,
+/// reject it on function definitions, and ignore it off ELF object formats.
+static void handleIFuncAttr(Sema &S, Decl *D, const AttributeList &Attr) {
+  StringRef Arg;
+  if (!S.checkStringLiteralArgumentAttr(Attr, 0, Arg))
+    return;
+  // Like aliases, ifuncs belong on declarations rather than definitions.
+  const auto *Fn = cast<FunctionDecl>(D);
+  if (Fn->isThisDeclarationADefinition()) {
+    S.Diag(Attr.getLoc(), diag::err_alias_is_definition) << Fn << 1;
+    return;
+  }
+  // FIXME: it should be handled as a target specific attribute.
+  const llvm::Triple &TT = S.Context.getTargetInfo().getTriple();
+  if (TT.getObjectFormat() != llvm::Triple::ELF) {
+    S.Diag(Attr.getLoc(), diag::warn_attribute_ignored) << Attr.getName();
+    return;
+  }
+  D->addAttr(::new (S.Context) IFuncAttr(Attr.getRange(), S.Context, Arg,
+                                         Attr.getAttributeSpellingListIndex()));
+}
+
 static void handleAliasAttr(Sema &S, Decl *D, const AttributeList &Attr) {
   StringRef Str;
   if (!S.checkStringLiteralArgumentAttr(Attr, 0, Str))
@@ -1577,17 +1601,20 @@
     S.Diag(Attr.getLoc(), diag::err_alias_not_supported_on_darwin);
     return;
   }
+  if (S.Context.getTargetInfo().getTriple().isNVPTX()) {
+    S.Diag(Attr.getLoc(), diag::err_alias_not_supported_on_nvptx);
+  }
 
   // Aliases should be on declarations, not definitions.
   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
     if (FD->isThisDeclarationADefinition()) {
-      S.Diag(Attr.getLoc(), diag::err_alias_is_definition) << FD;
+      S.Diag(Attr.getLoc(), diag::err_alias_is_definition) << FD << 0;
       return;
     }
   } else {
     const auto *VD = cast<VarDecl>(D);
     if (VD->isThisDeclarationADefinition() && VD->isExternallyVisible()) {
-      S.Diag(Attr.getLoc(), diag::err_alias_is_definition) << VD;
+      S.Diag(Attr.getLoc(), diag::err_alias_is_definition) << VD << 0;
       return;
     }
   }
@@ -1824,6 +1851,28 @@
                       Attr.getAttributeSpellingListIndex()));
 }
 
+/// Process the unused attribute, applying the extra rules that come with its
+/// unscoped C++11-style (C++1z) spelling before attaching an UnusedAttr.
+static void handleUnusedAttr(Sema &S, Decl *D, const AttributeList &Attr) {
+  const bool StdSpelling = Attr.isCXX11Attribute() && !Attr.getScopeName();
+  if (StdSpelling) {
+    // [dcl.attr.unused]p2: the C++1z spelling may not appertain to a static
+    // data member.
+    const auto *Var = dyn_cast<VarDecl>(D);
+    if (Var && Var->isStaticDataMember()) {
+      S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type)
+          << Attr.getName() << ExpectedForMaybeUnused;
+      return;
+    }
+    // Outside C++1z the standard spelling is diagnosed as an extension.
+    if (!S.getLangOpts().CPlusPlus1z)
+      S.Diag(Attr.getLoc(), diag::ext_cxx1z_attr) << Attr.getName();
+  }
+
+  D->addAttr(::new (S.Context) UnusedAttr(
+      Attr.getRange(), S.Context, Attr.getAttributeSpellingListIndex()));
+}
+
 static void handleConstructorAttr(Sema &S, Decl *D, const AttributeList &Attr) {
   uint32_t priority = ConstructorAttr::DefaultPriority;
   if (Attr.getNumArgs() &&
@@ -2125,9 +2174,10 @@
     Replacement = SE->getString();
 
   if (II->getName() == "swift") {
-    if (Introduced.isValid() || Deprecated.isValid() || Obsoleted.isValid() ||
-        !IsUnavailable) {
-      S.Diag(Attr.getLoc(), diag::warn_availability_swift_unavailable_only);
+    if (Introduced.isValid() || Obsoleted.isValid() ||
+        (!IsUnavailable && !Deprecated.isValid())) {
+      S.Diag(Attr.getLoc(),
+             diag::warn_availability_swift_unavailable_deprecated_only);
       return;
     }
   }
@@ -2502,6 +2552,12 @@
       return;
     }
   
+  // If this is spelled as the standard C++1z attribute, but not in C++1z, warn
+  // about using it as an extension.
+  if (!S.getLangOpts().CPlusPlus1z && Attr.isCXX11Attribute() &&
+      !Attr.getScopeName())
+    S.Diag(Attr.getLoc(), diag::ext_cxx1z_attr) << Attr.getName();
+
   D->addAttr(::new (S.Context) 
              WarnUnusedResultAttr(Attr.getRange(), S.Context,
                                   Attr.getAttributeSpellingListIndex()));
@@ -2658,7 +2714,6 @@
   D->addAttr(NewAttr);
 }
 
-
 static void handleCleanupAttr(Sema &S, Decl *D, const AttributeList &Attr) {
   VarDecl *VD = cast<VarDecl>(D);
   if (!VD->hasLocalStorage()) {
@@ -3117,7 +3172,6 @@
 
   // Save dependent expressions in the AST to be instantiated.
   D->addAttr(::new (Context) AlignValueAttr(TmpAttr));
-  return;
 }
 
 static void handleAlignedAttr(Sema &S, Decl *D, const AttributeList &Attr) {
@@ -3341,6 +3395,8 @@
 /// attribute.
 static void parseModeAttrArg(Sema &S, StringRef Str, unsigned &DestWidth,
                              bool &IntegerMode, bool &ComplexMode) {
+  IntegerMode = true;
+  ComplexMode = false;
   switch (Str.size()) {
   case 2:
     switch (Str[0]) {
@@ -3376,7 +3432,7 @@
     // FIXME: glibc uses 'word' to define register_t; this is narrower than a
     // pointer on PIC16 and other embedded platforms.
     if (Str == "word")
-      DestWidth = S.Context.getTargetInfo().getPointerWidth(0);
+      DestWidth = S.Context.getTargetInfo().getRegisterWidth();
     else if (Str == "byte")
       DestWidth = S.Context.getTargetInfo().getCharWidth();
     break;
@@ -3407,9 +3463,15 @@
   }
 
   IdentifierInfo *Name = Attr.getArgAsIdent(0)->Ident;
-  StringRef Str = Name->getName();
 
+  S.AddModeAttr(Attr.getRange(), D, Name, Attr.getAttributeSpellingListIndex());
+}
+
+void Sema::AddModeAttr(SourceRange AttrRange, Decl *D, IdentifierInfo *Name,
+                       unsigned SpellingListIndex, bool InInstantiation) {
+  StringRef Str = Name->getName();
   normalizeName(Str);
+  SourceLocation AttrLoc = AttrRange.getBegin();
 
   unsigned DestWidth = 0;
   bool IntegerMode = true;
@@ -3425,114 +3487,129 @@
     if (VectorStringLength &&
         !Str.substr(1, VectorStringLength).getAsInteger(10, VectorSize) &&
         VectorSize.isPowerOf2()) {
-      parseModeAttrArg(S, Str.substr(VectorStringLength + 1), DestWidth,
+      parseModeAttrArg(*this, Str.substr(VectorStringLength + 1), DestWidth,
                        IntegerMode, ComplexMode);
-      S.Diag(Attr.getLoc(), diag::warn_vector_mode_deprecated);
+      // Avoid duplicate warning from template instantiation.
+      if (!InInstantiation)
+        Diag(AttrLoc, diag::warn_vector_mode_deprecated);
     } else {
       VectorSize = 0;
     }
   }
 
   if (!VectorSize)
-    parseModeAttrArg(S, Str, DestWidth, IntegerMode, ComplexMode);
+    parseModeAttrArg(*this, Str, DestWidth, IntegerMode, ComplexMode);
+
+  // FIXME: Sync this with InitializePredefinedMacros; we need to match int8_t
+  // and friends, at least with glibc.
+  // FIXME: Make sure floating-point mappings are accurate
+  // FIXME: Support XF and TF types
+  if (!DestWidth) {
+    Diag(AttrLoc, diag::err_machine_mode) << 0 /*Unknown*/ << Name;
+    return;
+  }
 
   QualType OldTy;
   if (TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(D))
     OldTy = TD->getUnderlyingType();
-  else
+  else if (EnumDecl *ED = dyn_cast<EnumDecl>(D)) {
+    // Something like 'typedef enum { X } __attribute__((mode(XX))) T;'.
+    // Try to get type from enum declaration, default to int.
+    OldTy = ED->getIntegerType();
+    if (OldTy.isNull())
+      OldTy = Context.IntTy;
+  } else
     OldTy = cast<ValueDecl>(D)->getType();
 
+  if (OldTy->isDependentType()) {
+    D->addAttr(::new (Context)
+               ModeAttr(AttrRange, Context, Name, SpellingListIndex));
+    return;
+  }
+
   // Base type can also be a vector type (see PR17453).
   // Distinguish between base type and base element type.
   QualType OldElemTy = OldTy;
   if (const VectorType *VT = OldTy->getAs<VectorType>())
     OldElemTy = VT->getElementType();
 
-  if (!OldElemTy->getAs<BuiltinType>() && !OldElemTy->isComplexType())
-    S.Diag(Attr.getLoc(), diag::err_mode_not_primitive);
+  // GCC allows 'mode' attribute on enumeration types (even incomplete), except
+  // for vector modes. So, 'enum X __attribute__((mode(QI)));' forms a complete
+  // type, 'enum { A } __attribute__((mode(V4SI)))' is rejected.
+  if ((isa<EnumDecl>(D) || OldElemTy->getAs<EnumType>()) &&
+      VectorSize.getBoolValue()) {
+    Diag(AttrLoc, diag::err_enum_mode_vector_type) << Name << AttrRange;
+    return;
+  }
+  bool IntegralOrAnyEnumType =
+      OldElemTy->isIntegralOrEnumerationType() || OldElemTy->getAs<EnumType>();
+
+  if (!OldElemTy->getAs<BuiltinType>() && !OldElemTy->isComplexType() &&
+      !IntegralOrAnyEnumType)
+    Diag(AttrLoc, diag::err_mode_not_primitive);
   else if (IntegerMode) {
-    if (!OldElemTy->isIntegralOrEnumerationType())
-      S.Diag(Attr.getLoc(), diag::err_mode_wrong_type);
+    if (!IntegralOrAnyEnumType)
+      Diag(AttrLoc, diag::err_mode_wrong_type);
   } else if (ComplexMode) {
     if (!OldElemTy->isComplexType())
-      S.Diag(Attr.getLoc(), diag::err_mode_wrong_type);
+      Diag(AttrLoc, diag::err_mode_wrong_type);
   } else {
     if (!OldElemTy->isFloatingType())
-      S.Diag(Attr.getLoc(), diag::err_mode_wrong_type);
-  }
-
-  // FIXME: Sync this with InitializePredefinedMacros; we need to match int8_t
-  // and friends, at least with glibc.
-  // FIXME: Make sure floating-point mappings are accurate
-  // FIXME: Support XF and TF types
-  if (!DestWidth) {
-    S.Diag(Attr.getLoc(), diag::err_machine_mode) << 0 /*Unknown*/ << Name;
-    return;
+      Diag(AttrLoc, diag::err_mode_wrong_type);
   }
 
   QualType NewElemTy;
 
   if (IntegerMode)
-    NewElemTy = S.Context.getIntTypeForBitwidth(
-        DestWidth, OldElemTy->isSignedIntegerType());
+    NewElemTy = Context.getIntTypeForBitwidth(DestWidth,
+                                              OldElemTy->isSignedIntegerType());
   else
-    NewElemTy = S.Context.getRealTypeForBitwidth(DestWidth);
+    NewElemTy = Context.getRealTypeForBitwidth(DestWidth);
 
   if (NewElemTy.isNull()) {
-    S.Diag(Attr.getLoc(), diag::err_machine_mode) << 1 /*Unsupported*/ << Name;
+    Diag(AttrLoc, diag::err_machine_mode) << 1 /*Unsupported*/ << Name;
     return;
   }
 
   if (ComplexMode) {
-    NewElemTy = S.Context.getComplexType(NewElemTy);
+    NewElemTy = Context.getComplexType(NewElemTy);
   }
 
   QualType NewTy = NewElemTy;
   if (VectorSize.getBoolValue()) {
-    NewTy = S.Context.getVectorType(NewTy, VectorSize.getZExtValue(),
-                                    VectorType::GenericVector);
+    NewTy = Context.getVectorType(NewTy, VectorSize.getZExtValue(),
+                                  VectorType::GenericVector);
   } else if (const VectorType *OldVT = OldTy->getAs<VectorType>()) {
     // Complex machine mode does not support base vector types.
     if (ComplexMode) {
-      S.Diag(Attr.getLoc(), diag::err_complex_mode_vector_type);
+      Diag(AttrLoc, diag::err_complex_mode_vector_type);
       return;
     }
-    unsigned NumElements = S.Context.getTypeSize(OldElemTy) *
+    unsigned NumElements = Context.getTypeSize(OldElemTy) *
                            OldVT->getNumElements() /
-                           S.Context.getTypeSize(NewElemTy);
+                           Context.getTypeSize(NewElemTy);
     NewTy =
-        S.Context.getVectorType(NewElemTy, NumElements, OldVT->getVectorKind());
+        Context.getVectorType(NewElemTy, NumElements, OldVT->getVectorKind());
   }
 
   if (NewTy.isNull()) {
-    S.Diag(Attr.getLoc(), diag::err_mode_wrong_type);
+    Diag(AttrLoc, diag::err_mode_wrong_type);
     return;
   }
 
   // Install the new type.
   if (TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(D))
     TD->setModedTypeSourceInfo(TD->getTypeSourceInfo(), NewTy);
+  else if (EnumDecl *ED = dyn_cast<EnumDecl>(D))
+    ED->setIntegerType(NewTy);
   else
     cast<ValueDecl>(D)->setType(NewTy);
 
-  D->addAttr(::new (S.Context)
-             ModeAttr(Attr.getRange(), S.Context, Name,
-                      Attr.getAttributeSpellingListIndex()));
+  D->addAttr(::new (Context)
+             ModeAttr(AttrRange, Context, Name, SpellingListIndex));
 }
 
 static void handleNoDebugAttr(Sema &S, Decl *D, const AttributeList &Attr) {
-  if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
-    if (!VD->hasGlobalStorage())
-      S.Diag(Attr.getLoc(),
-             diag::warn_attribute_requires_functions_or_static_globals)
-        << Attr.getName();
-  } else if (!isFunctionOrMethod(D)) {
-    S.Diag(Attr.getLoc(),
-           diag::warn_attribute_requires_functions_or_static_globals)
-      << Attr.getName();
-    return;
-  }
-
   D->addAttr(::new (S.Context)
              NoDebugAttr(Attr.getRange(), S.Context,
                          Attr.getAttributeSpellingListIndex()));
@@ -3686,11 +3763,21 @@
                               : FixItHint());
     return;
   }
+  if (const auto *Method = dyn_cast<CXXMethodDecl>(FD)) {
+    if (Method->isInstance()) {
+      S.Diag(Method->getLocStart(), diag::err_kern_is_nonstatic_method)
+          << Method;
+      return;
+    }
+    S.Diag(Method->getLocStart(), diag::warn_kern_is_method) << Method;
+  }
+  // Only warn for "inline" when compiling for host, to cut down on noise.
+  if (FD->isInlineSpecified() && !S.getLangOpts().CUDAIsDevice)
+    S.Diag(FD->getLocStart(), diag::warn_kern_is_inline) << FD;
 
   D->addAttr(::new (S.Context)
               CUDAGlobalAttr(Attr.getRange(), S.Context,
                              Attr.getAttributeSpellingListIndex()));
-
 }
 
 static void handleGNUInlineAttr(Sema &S, Decl *D, const AttributeList &Attr) {
@@ -3867,11 +3954,12 @@
 
     // This convention is not valid for the target. Use the default function or
     // method calling convention.
-    TargetInfo::CallingConvMethodType MT = TargetInfo::CCMT_Unknown;
-    if (FD)
-      MT = FD->isCXXInstanceMember() ? TargetInfo::CCMT_Member : 
-                                    TargetInfo::CCMT_NonMember;
-    CC = TI.getDefaultCallingConv(MT);
+    bool IsCXXMethod = false, IsVariadic = false;
+    if (FD) {
+      IsCXXMethod = FD->isCXXInstanceMember();
+      IsVariadic = FD->isVariadic();
+    }
+    CC = Context.getDefaultCallingConvention(IsVariadic, IsCXXMethod);
   }
 
   attr.setProcessingCache((unsigned) CC);
@@ -4004,49 +4092,60 @@
   return false;
 }
 
-// Checks whether an argument of launch_bounds attribute is acceptable
-// May output an error.
-static bool checkLaunchBoundsArgument(Sema &S, Expr *E,
-                                      const CUDALaunchBoundsAttr &Attr,
-                                      const unsigned Idx) {
-
+// Validates one launch_bounds argument (Idx is its index, used in
+// diagnostics). Value-dependent arguments are returned unchanged; others
+// must be integer constants fitting in 32 bits and are returned converted
+// to 'const int'. Returns nullptr, possibly after a diagnostic, on failure.
+static Expr *makeLaunchBoundsArgExpr(Sema &S, Expr *E,
+                                     const CUDALaunchBoundsAttr &Attr,
+                                     const unsigned Idx) {
   if (S.DiagnoseUnexpandedParameterPack(E))
-    return false;
+    return nullptr;
 
   // Accept template arguments for now as they depend on something else.
   // We'll get to check them when they eventually get instantiated.
   if (E->isValueDependent())
-    return true;
+    return E;
 
   llvm::APSInt I(64);
   if (!E->isIntegerConstantExpr(I, S.Context)) {
     S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
         << &Attr << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
-    return false;
+    return nullptr;
   }
   // Make sure we can fit it in 32 bits.
   if (!I.isIntN(32)) {
     S.Diag(E->getExprLoc(), diag::err_ice_too_large) << I.toString(10, false)
                                                      << 32 << /* Unsigned */ 1;
-    return false;
+    return nullptr;
   }
   if (I < 0)
     S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
         << &Attr << Idx << E->getSourceRange();
 
-  return true;
+  // Convert the argument as if initializing a 'const int' parameter.
+  InitializedEntity Entity = InitializedEntity::InitializeParameter(
+      S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+  ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+  assert(!ValArg.isInvalid() &&
+         "Unexpected PerformCopyInitialization() failure.");
+
+  return ValArg.getAs<Expr>();
 }
 
 void Sema::AddLaunchBoundsAttr(SourceRange AttrRange, Decl *D, Expr *MaxThreads,
                                Expr *MinBlocks, unsigned SpellingListIndex) {
   CUDALaunchBoundsAttr TmpAttr(AttrRange, Context, MaxThreads, MinBlocks,
                                SpellingListIndex);
-
-  if (!checkLaunchBoundsArgument(*this, MaxThreads, TmpAttr, 0))
+  MaxThreads = makeLaunchBoundsArgExpr(*this, MaxThreads, TmpAttr, 0);
+  if (MaxThreads == nullptr)
     return;
 
-  if (MinBlocks && !checkLaunchBoundsArgument(*this, MinBlocks, TmpAttr, 1))
-    return;
+  if (MinBlocks) {
+    MinBlocks = makeLaunchBoundsArgExpr(*this, MinBlocks, TmpAttr, 1);
+    if (MinBlocks == nullptr)
+      return;
+  }
 
   D->addAttr(::new (Context) CUDALaunchBoundsAttr(
       AttrRange, Context, MaxThreads, MinBlocks, SpellingListIndex));
@@ -4152,6 +4251,7 @@
          type->isObjCObjectPointerType() || 
          S.Context.isObjCNSObjectType(type);
 }
+
 static bool isValidSubjectOfCFAttribute(Sema &S, QualType type) {
   return type->isDependentType() || 
          type->isPointerType() || 
@@ -4202,7 +4302,6 @@
 
 static void handleNSReturnsRetainedAttr(Sema &S, Decl *D,
                                         const AttributeList &Attr) {
-
   QualType returnType;
 
   if (ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D))
@@ -4365,7 +4464,8 @@
 
 static void handleNSErrorDomain(Sema &S, Decl *D, const AttributeList &Attr) {
   if (!isa<TagDecl>(D)) {
-    S.Diag(D->getLocStart(), diag::err_nserrordomain_not_tagdecl);
+    S.Diag(D->getLocStart(), diag::err_nserrordomain_not_tagdecl)
+        << S.getLangOpts().CPlusPlus;
     return;
   }
   IdentifierLoc *identLoc =
@@ -4509,10 +4609,9 @@
                                  Attr.getAttributeSpellingListIndex()));
 }
 
-// when a user wants to use objc_boxable with a union or struct
-// but she doesn't have access to the declaration (legacy/third-party code)
-// then she can 'enable' this feature via trick with a typedef
-// e.g.:
+// When a user wants to use objc_boxable with a union or struct
+// but they don't have access to the declaration (legacy/third-party code)
+// then they can 'enable' this feature with a typedef:
 // typedef struct __attribute((objc_boxable)) legacy_struct legacy_struct;
 static void handleObjCBoxable(Sema &S, Decl *D, const AttributeList &Attr) {
   bool notify = false;
@@ -5060,8 +5159,10 @@
       D, Attr.getRange(), /*BestCase=*/true,
       Attr.getAttributeSpellingListIndex(),
       (MSInheritanceAttr::Spelling)Attr.getSemanticSpelling());
-  if (IA)
+  if (IA) {
     D->addAttr(IA);
+    S.Consumer.AssignInheritanceModel(cast<CXXRecordDecl>(D));
+  }
 }
 
 static void handleDeclspecThreadAttr(Sema &S, Decl *D,
@@ -5083,6 +5184,38 @@
       Attr.getRange(), S.Context, Attr.getAttributeSpellingListIndex()));
 }
 
+/// Process the abi_tag attribute: gather its string-literal arguments and
+/// attach an AbiTagAttr that stores them sorted and de-duplicated.
+static void handleAbiTagAttr(Sema &S, Decl *D, const AttributeList &Attr) {
+  SmallVector<StringRef, 4> Tags;
+  for (unsigned Idx = 0, End = Attr.getNumArgs(); Idx != End; ++Idx) {
+    StringRef Literal;
+    if (!S.checkStringLiteralArgumentAttr(Attr, Idx, Literal))
+      return;
+    Tags.push_back(Literal);
+  }
+
+  const auto *NS = dyn_cast<NamespaceDecl>(D);
+  if (!NS) {
+    // Declarations other than namespaces need at least one argument.
+    if (!checkAttributeAtLeastNumArgs(S, Attr, 1))
+      return;
+  } else if (!NS->isInline() || NS->isAnonymousNamespace()) {
+    // Warn and drop the attribute on non-inline or anonymous namespaces.
+    S.Diag(Attr.getLoc(), diag::warn_attr_abi_tag_namespace)
+        << (NS->isInline() ? 1 : 0);
+    return;
+  } else if (Attr.getNumArgs() == 0) {
+    Tags.push_back(NS->getName()); // No arguments: use the namespace name.
+  }
+  // Store tags sorted and without duplicates.
+  std::sort(Tags.begin(), Tags.end());
+  Tags.erase(std::unique(Tags.begin(), Tags.end()), Tags.end());
+  D->addAttr(::new (S.Context)
+             AbiTagAttr(Attr.getRange(), S.Context, Tags.data(), Tags.size(),
+                        Attr.getAttributeSpellingListIndex()));
+}
+
 static void handleARMInterruptAttr(Sema &S, Decl *D,
                                    const AttributeList &Attr) {
   // Check the attribute arguments.
@@ -5344,6 +5477,24 @@
                                         Attr.getAttributeSpellingListIndex()));
 }
 
+/// Process the LayoutVersion attribute; its argument must be a 32-bit
+/// unsigned value equal to LangOptions::MSVC2015.
+static void handleLayoutVersion(Sema &S, Decl *D, const AttributeList &Attr) {
+  Expr *Arg = Attr.getArgAsExpr(0);
+  uint32_t Requested;
+  if (!checkUInt32Argument(S, Attr, Arg, Requested))
+    return;
+  // TODO: Investigate what happens with the next major version of MSVC.
+  if (Requested != LangOptions::MSVC2015) {
+    S.Diag(Attr.getLoc(), diag::err_attribute_argument_out_of_bounds)
+        << Attr.getName() << Requested << Arg->getSourceRange();
+    return;
+  }
+  D->addAttr(::new (S.Context)
+                 LayoutVersionAttr(Attr.getRange(), S.Context, Requested,
+                                   Attr.getAttributeSpellingListIndex()));
+}
+
 DLLImportAttr *Sema::mergeDLLImportAttr(Decl *D, SourceRange Range,
                                         unsigned AttrSpellingListIndex) {
   if (D->hasAttr<DLLExportAttr>()) {
@@ -5553,7 +5704,7 @@
   if (!S.getLangOpts().CPlusPlus14)
     if (Attr.isCXX11Attribute() &&
         !(Attr.hasScope() && Attr.getScopeName()->isStr("gnu")))
-      S.Diag(Attr.getLoc(), diag::ext_deprecated_attr_is_a_cxx14_extension);
+      S.Diag(Attr.getLoc(), diag::ext_cxx14_attr) << Attr.getName();
 
   D->addAttr(::new (S.Context) DeprecatedAttr(Attr.getRange(), S.Context, Str,
                                    Replacement,
@@ -5564,7 +5715,7 @@
   if (!checkAttributeAtLeastNumArgs(S, Attr, 1))
     return;
 
-  std::vector<std::string> Sanitizers;
+  std::vector<StringRef> Sanitizers;
 
   for (unsigned I = 0, E = Attr.getNumArgs(); I != E; ++I) {
     StringRef SanitizerName;
@@ -5588,8 +5739,8 @@
                                          const AttributeList &Attr) {
   StringRef AttrName = Attr.getName()->getName();
   normalizeName(AttrName);
-  std::string SanitizerName =
-      llvm::StringSwitch<std::string>(AttrName)
+  StringRef SanitizerName =
+      llvm::StringSwitch<StringRef>(AttrName)
           .Case("no_address_safety_analysis", "address")
           .Case("no_sanitize_address", "address")
           .Case("no_sanitize_thread", "thread")
@@ -5607,6 +5758,15 @@
     D->addAttr(Internal);
 }
 
+static void handleOpenCLNoSVMAttr(Sema &S, Decl *D, const AttributeList &Attr) {
+  if (S.LangOpts.OpenCLVersion == 200) // Deprecated and ignored: warn.
+    S.Diag(Attr.getLoc(), diag::warn_opencl_attr_deprecated_ignored)
+        << Attr.getName() << "2.0";
+  else // Every other version is an error; either way D gets no attribute.
+    S.Diag(Attr.getLoc(), diag::err_attribute_requires_opencl_version)
+        << Attr.getName() << "2.0" << 0;
+}
+
 /// Handles semantic checking for features that are common to all attributes,
 /// such as checking whether a parameter was properly specified, or the correct
 /// number of arguments were passed, etc.
@@ -5648,6 +5808,40 @@
   return false;
 }
 
+/// Process an OpenCL access qualifier: only one per declaration, and no
+/// read_write on parameters before OpenCL 2.0 or on pipe parameters.
+static void handleOpenCLAccessAttr(Sema &S, Decl *D,
+                                   const AttributeList &Attr) {
+  if (D->isInvalidDecl())
+    return;
+  // A second access qualifier on the same declaration is an error.
+  if (D->hasAttr<OpenCLAccessAttr>()) {
+    S.Diag(Attr.getLoc(), diag::err_opencl_multiple_access_qualifiers)
+        << D->getSourceRange();
+    D->setInvalidDecl(true);
+    return;
+  }
+
+  // OpenCL v2.0 s6.6 - read_write can be used for image types to specify
+  // that an image object can be read and written.
+  // OpenCL v2.0 s6.13.6 - A kernel cannot read from and write to the same pipe
+  // object. Using the read_write (or __read_write) qualifier with the pipe
+  // qualifier is a compilation error.
+  const auto *Param = dyn_cast<ParmVarDecl>(D);
+  StringRef Spelling = Attr.getName()->getName();
+  if (Param && Spelling.find("read_write") != StringRef::npos) {
+    const Type *ParamTy = Param->getType().getCanonicalType().getTypePtr();
+    if (S.getLangOpts().OpenCLVersion < 200 || ParamTy->isPipeType()) {
+      S.Diag(Attr.getLoc(), diag::err_opencl_invalid_read_write)
+          << Attr.getName() << Param->getType() << ParamTy->isImageType();
+      D->setInvalidDecl(true);
+      return;
+    }
+  }
+  D->addAttr(::new (S.Context) OpenCLAccessAttr(
+      Attr.getRange(), S.Context, Attr.getAttributeSpellingListIndex()));
+}
+
 //===----------------------------------------------------------------------===//
 // Top Level Sema Entry Points
 //===----------------------------------------------------------------------===//
@@ -5683,8 +5877,13 @@
 
   switch (Attr.getKind()) {
   default:
-    // Type attributes are handled elsewhere; silently move on.
-    assert(Attr.isTypeAttr() && "Non-type attribute not handled");
+    if (!Attr.isStmtAttr()) {
+      // Type attributes are handled elsewhere; silently move on.
+      assert(Attr.isTypeAttr() && "Non-type attribute not handled");
+      break;
+    }
+    S.Diag(Attr.getLoc(), diag::err_stmt_attribute_invalid_on_decl)
+        << Attr.getName() << D->getLocation();
     break;
   case AttributeList::AT_Interrupt:
     handleInterruptAttr(S, D, Attr);
@@ -5718,6 +5917,9 @@
   case AttributeList::AT_IBOutletCollection:
     handleIBOutletCollection(S, D, Attr);
     break;
+  case AttributeList::AT_IFunc:
+    handleIFuncAttr(S, D, Attr);
+    break;
   case AttributeList::AT_Alias:
     handleAliasAttr(S, D, Attr);
     break;
@@ -5869,38 +6071,30 @@
   case AttributeList::AT_VecReturn:
     handleVecReturnAttr(S, D, Attr);
     break;
-
   case AttributeList::AT_ObjCOwnership:
     handleObjCOwnershipAttr(S, D, Attr);
     break;
   case AttributeList::AT_ObjCPreciseLifetime:
     handleObjCPreciseLifetimeAttr(S, D, Attr);
     break;
-
   case AttributeList::AT_ObjCReturnsInnerPointer:
     handleObjCReturnsInnerPointerAttr(S, D, Attr);
     break;
-
   case AttributeList::AT_ObjCRequiresSuper:
     handleObjCRequiresSuperAttr(S, D, Attr);
     break;
-
   case AttributeList::AT_ObjCBridge:
     handleObjCBridgeAttr(S, scope, D, Attr);
     break;
-
   case AttributeList::AT_ObjCBridgeMutable:
     handleObjCBridgeMutableAttr(S, scope, D, Attr);
     break;
-
   case AttributeList::AT_ObjCBridgeRelated:
     handleObjCBridgeRelatedAttr(S, scope, D, Attr);
     break;
-
   case AttributeList::AT_ObjCDesignatedInitializer:
     handleObjCDesignatedInitializer(S, D, Attr);
     break;
-
   case AttributeList::AT_ObjCRuntimeName:
     handleObjCRuntimeName(S, D, Attr);
     break;
@@ -5910,18 +6104,15 @@
   case AttributeList::AT_ObjCBoxable:
     handleObjCBoxable(S, D, Attr);
     break;
-
   case AttributeList::AT_NSErrorDomain:
     handleNSErrorDomain(S, D, Attr);
     break;
-          
   case AttributeList::AT_CFAuditedTransfer:
     handleCFAuditedTransferAttr(S, D, Attr);
     break;
   case AttributeList::AT_CFUnknownTransfer:
     handleCFUnknownTransferAttr(S, D, Attr);
     break;
-
   case AttributeList::AT_CFConsumed:
   case AttributeList::AT_NSConsumed:
     handleNSConsumedAttr(S, D, Attr);
@@ -5929,7 +6120,6 @@
   case AttributeList::AT_NSConsumesSelf:
     handleSimpleAttribute<NSConsumesSelfAttr>(S, D, Attr);
     break;
-
   case AttributeList::AT_NSReturnsAutoreleased:
   case AttributeList::AT_NSReturnsNotRetained:
   case AttributeList::AT_CFReturnsNotRetained:
@@ -5946,11 +6136,9 @@
   case AttributeList::AT_VecTypeHint:
     handleVecTypeHint(S, D, Attr);
     break;
-
   case AttributeList::AT_InitPriority:
     handleInitPriorityAttr(S, D, Attr);
     break;
-
   case AttributeList::AT_Packed:
     handlePackedAttr(S, D, Attr);
     break;
@@ -5982,7 +6170,7 @@
     handleSimpleAttribute<ObjCRequiresPropertyDefsAttr>(S, D, Attr);
     break;
   case AttributeList::AT_Unused:
-    handleSimpleAttribute<UnusedAttr>(S, D, Attr);
+    handleUnusedAttr(S, D, Attr);
     break;
   case AttributeList::AT_ReturnsTwice:
     handleSimpleAttribute<ReturnsTwiceAttr>(S, D, Attr);
@@ -6077,8 +6265,11 @@
   case AttributeList::AT_OpenCLKernel:
     handleSimpleAttribute<OpenCLKernelAttr>(S, D, Attr);
     break;
-  case AttributeList::AT_OpenCLImageAccess:
-    handleSimpleAttribute<OpenCLImageAccessAttr>(S, D, Attr);
+  case AttributeList::AT_OpenCLAccess:
+    handleOpenCLAccessAttr(S, D, Attr);
+    break;
+  case AttributeList::AT_OpenCLNoSVM:
+    handleOpenCLNoSVMAttr(S, D, Attr);
     break;
   case AttributeList::AT_SwiftContext:
     handleParameterABIAttr(S, D, Attr, ParameterABI::SwiftContext);
@@ -6092,8 +6283,17 @@
   case AttributeList::AT_InternalLinkage:
     handleInternalLinkageAttr(S, D, Attr);
     break;
+  case AttributeList::AT_LTOVisibilityPublic:
+    handleSimpleAttribute<LTOVisibilityPublicAttr>(S, D, Attr);
+    break;
 
   // Microsoft attributes:
+  case AttributeList::AT_EmptyBases:
+    handleSimpleAttribute<EmptyBasesAttr>(S, D, Attr);
+    break;
+  case AttributeList::AT_LayoutVersion:
+    handleLayoutVersion(S, D, Attr);
+    break;
   case AttributeList::AT_MSNoVTable:
     handleSimpleAttribute<MSNoVTableAttr>(S, D, Attr);
     break;
@@ -6113,6 +6313,10 @@
     handleDeclspecThreadAttr(S, D, Attr);
     break;
 
+  case AttributeList::AT_AbiTag:
+    handleAbiTagAttr(S, D, Attr);
+    break;
+
   // Thread safety attributes:
   case AttributeList::AT_AssertExclusiveLock:
     handleAssertExclusiveLockAttr(S, D, Attr);
@@ -6218,7 +6422,9 @@
   case AttributeList::AT_TypeTagForDatatype:
     handleTypeTagForDatatypeAttr(S, D, Attr);
     break;
-
+  case AttributeList::AT_RenderScriptKernel:
+    handleSimpleAttribute<RenderScriptKernelAttr>(S, D, Attr);
+    break;
   // Swift attributes.
   case AttributeList::AT_SwiftPrivate:
     handleSimpleAttribute<SwiftPrivateAttr>(S, D, Attr);
@@ -6234,6 +6440,9 @@
     break;
   case AttributeList::AT_SwiftNewtype:
     handleSwiftNewtypeAttr(S, D, Attr);
+    break; // Do not fall through from AT_SwiftNewtype into the XRay case.
+  case AttributeList::AT_XRayInstrument: // XRay attributes.
+    handleSimpleAttribute<XRayInstrumentAttr>(S, D, Attr);
     break;
   }
 }
@@ -6516,7 +6725,6 @@
   diag.Triggered = true;
 }
 
-
 static bool isDeclDeprecated(Decl *D) {
   do {
     if (D->isDeprecated())
@@ -6569,7 +6777,7 @@
   return nullptr;
 }
 
-static void DoEmitAvailabilityWarning(Sema &S, Sema::AvailabilityDiagnostic K,
+static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
                                       Decl *Ctx, const NamedDecl *D,
                                       StringRef Message, SourceLocation Loc,
                                       const ObjCInterfaceDecl *UnknownObjCClass,
@@ -6587,7 +6795,7 @@
 
   // Don't warn if our current context is deprecated or unavailable.
   switch (K) {
-  case Sema::AD_Deprecation:
+  case AR_Deprecated:
     if (isDeclDeprecated(Ctx) || isDeclUnavailable(Ctx))
       return;
     diag = !ObjCPropertyAccess ? diag::warn_deprecated
@@ -6598,7 +6806,7 @@
     available_here_select_kind = /* deprecated */ 2;
     break;
 
-  case Sema::AD_Unavailable:
+  case AR_Unavailable:
     if (isDeclUnavailable(Ctx))
       return;
     diag = !ObjCPropertyAccess ? diag::err_unavailable
@@ -6650,21 +6858,26 @@
         }
       }
     }
-
     break;
 
-  case Sema::AD_Partial:
+  case AR_NotYetIntroduced:
+    assert(!S.getCurFunctionOrMethodDecl() &&
+           "Function-level partial availablity should not be diagnosed here!");
+
     diag = diag::warn_partial_availability;
     diag_message = diag::warn_partial_message;
     diag_fwdclass_message = diag::warn_partial_fwdclass_message;
     property_note_select = /* partial */ 2;
     available_here_select_kind = /* partial */ 3;
     break;
+
+  case AR_Available:
+    llvm_unreachable("Warning for availability of available declaration?");
   }
 
   CharSourceRange UseRange;
   StringRef Replacement;
-  if (K == Sema::AD_Deprecation) {
+  if (K == AR_Deprecated) {
     if (auto attr = D->getAttr<DeprecatedAttr>())
       Replacement = attr->getReplacement();
     if (auto attr = getAttrForPlatform(S.Context, D))
@@ -6717,7 +6930,7 @@
     S.Diag(D->getLocation(), diag_available_here)
         << D << available_here_select_kind;
 
-  if (K == Sema::AD_Partial)
+  if (K == AR_NotYetIntroduced)
     S.Diag(Loc, diag::note_partial_availability_silence) << D;
 }
 
@@ -6725,12 +6938,12 @@
                                            Decl *Ctx) {
   assert(DD.Kind == DelayedDiagnostic::Deprecation ||
          DD.Kind == DelayedDiagnostic::Unavailable);
-  Sema::AvailabilityDiagnostic AD = DD.Kind == DelayedDiagnostic::Deprecation
-                                        ? Sema::AD_Deprecation
-                                        : Sema::AD_Unavailable;
+  AvailabilityResult AR = DD.Kind == DelayedDiagnostic::Deprecation
+                              ? AR_Deprecated
+                              : AR_Unavailable;
   DD.Triggered = true;
   DoEmitAvailabilityWarning(
-      S, AD, Ctx, DD.getDeprecationDecl(), DD.getDeprecationMessage(), DD.Loc,
+      S, AR, Ctx, DD.getDeprecationDecl(), DD.getDeprecationMessage(), DD.Loc,
       DD.getUnknownObjCClass(), DD.getObjCProperty(), false);
 }
 
@@ -6790,21 +7003,188 @@
   curPool->steal(pool);
 }
 
-void Sema::EmitAvailabilityWarning(AvailabilityDiagnostic AD,
+void Sema::EmitAvailabilityWarning(AvailabilityResult AR,
                                    NamedDecl *D, StringRef Message,
                                    SourceLocation Loc,
                                    const ObjCInterfaceDecl *UnknownObjCClass,
                                    const ObjCPropertyDecl  *ObjCProperty,
                                    bool ObjCPropertyAccess) {
   // Delay if we're currently parsing a declaration.
-  if (DelayedDiagnostics.shouldDelayDiagnostics() && AD != AD_Partial) {
+  if (DelayedDiagnostics.shouldDelayDiagnostics() &&
+      AR != AR_NotYetIntroduced) {
     DelayedDiagnostics.add(DelayedDiagnostic::makeAvailability(
-        AD, Loc, D, UnknownObjCClass, ObjCProperty, Message,
+        AR, Loc, D, UnknownObjCClass, ObjCProperty, Message,
         ObjCPropertyAccess));
     return;
   }
 
   Decl *Ctx = cast<Decl>(getCurLexicalContext());
-  DoEmitAvailabilityWarning(*this, AD, Ctx, D, Message, Loc, UnknownObjCClass,
+  DoEmitAvailabilityWarning(*this, AR, Ctx, D, Message, Loc, UnknownObjCClass,
                             ObjCProperty, ObjCPropertyAccess);
 }
+
+VersionTuple Sema::getVersionForDecl(const Decl *D) const {
+  assert(D && "Expected a declaration here!");
+  // Start from the version in which D itself was introduced, if annotated.
+  VersionTuple Introduced;
+  if (const auto *DeclAttr = getAttrForPlatform(getASTContext(), D))
+    Introduced = DeclAttr->getIntroduced();
+
+  // Methods and implementations also take their class interface into account.
+  const ObjCInterfaceDecl *Class = nullptr;
+  if (const auto *Method = dyn_cast<ObjCMethodDecl>(D))
+    Class = Method->getClassInterface();
+  else if (const auto *Impl = dyn_cast<ObjCImplementationDecl>(D))
+    Class = Impl->getClassInterface();
+
+  // Use the later of the declaration's and its class's introduced version.
+  if (Class)
+    if (const auto *ClassAttr = getAttrForPlatform(getASTContext(), Class))
+      Introduced = std::max(Introduced, ClassAttr->getIntroduced());
+
+  // Never report anything older than the target's platform minimum version.
+  return std::max(Introduced, Context.getTargetInfo().getPlatformMinVersion());
+}
+
+namespace {
+
+/// \brief This class implements -Wunguarded-availability.
+///
+/// This is done with a traversal of the AST of a function that makes reference
+/// to a partially available declaration. Whenever we encounter an \c if of the
+/// form: \c if(@available(...)), we use the version from the condition to visit
+/// the then statement.
+class DiagnoseUnguardedAvailability
+    : public RecursiveASTVisitor<DiagnoseUnguardedAvailability> {
+  typedef RecursiveASTVisitor<DiagnoseUnguardedAvailability> Base;
+
+  Sema &SemaRef;
+
+  /// Stack of potentially nested 'if (@available(...))'s.
+  SmallVector<VersionTuple, 8> AvailabilityStack;
+
+  void DiagnoseDeclAvailability(NamedDecl *D, SourceRange Range);
+
+public:
+  DiagnoseUnguardedAvailability(Sema &SemaRef, VersionTuple BaseVersion)
+      : SemaRef(SemaRef) {
+    AvailabilityStack.push_back(BaseVersion);
+  }
+
+  void IssueDiagnostics(Stmt *S) { TraverseStmt(S); }
+
+  bool TraverseIfStmt(IfStmt *If);
+
+  bool VisitObjCMessageExpr(ObjCMessageExpr *Msg) {
+    if (ObjCMethodDecl *D = Msg->getMethodDecl())
+      DiagnoseDeclAvailability(
+          D, SourceRange(Msg->getSelectorStartLoc(), Msg->getLocEnd()));
+    return true;
+  }
+
+  bool VisitDeclRefExpr(DeclRefExpr *DRE) {
+    DiagnoseDeclAvailability(DRE->getDecl(),
+                             SourceRange(DRE->getLocStart(), DRE->getLocEnd()));
+    return true;
+  }
+
+  bool VisitMemberExpr(MemberExpr *ME) {
+    DiagnoseDeclAvailability(ME->getMemberDecl(),
+                             SourceRange(ME->getLocStart(), ME->getLocEnd()));
+    return true;
+  }
+
+  bool VisitTypeLoc(TypeLoc Ty);
+};
+
+void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
+    NamedDecl *D, SourceRange Range) {
+  // Check D against the version currently on top of the availability stack.
+  VersionTuple ContextVersion = AvailabilityStack.back();
+  if (AvailabilityResult Result = SemaRef.ShouldDiagnoseAvailabilityOfDecl(
+          D, ContextVersion, nullptr)) {
+    // All other diagnostic kinds have already been handled in
+    // DiagnoseAvailabilityOfDecl.
+    if (Result != AR_NotYetIntroduced)
+      return;
+    // NOTE(review): AA is dereferenced unchecked; confirm D always carries one.
+    const AvailabilityAttr *AA = getAttrForPlatform(SemaRef.getASTContext(), D);
+    VersionTuple Introduced = AA->getIntroduced();
+    // Warn at the start of Range, naming the platform and introduced version.
+    SemaRef.Diag(Range.getBegin(), diag::warn_unguarded_availability)
+        << Range << D
+        << AvailabilityAttr::getPrettyPlatformName(
+               SemaRef.getASTContext().getTargetInfo().getPlatformName())
+        << Introduced.getAsString();
+    // Attach a note at D's own location.
+    SemaRef.Diag(D->getLocation(), diag::note_availability_specified_here)
+        << D << /* partial */ 3;
+
+    // FIXME: Replace this with a fixit diagnostic.
+    SemaRef.Diag(Range.getBegin(), diag::note_unguarded_available_silence)
+        << Range << D;
+  }
+}
+
+bool DiagnoseUnguardedAvailability::VisitTypeLoc(TypeLoc Ty) {
+  // Diagnose the declaration named by a tag type, a typedef type, or an
+  // Objective-C object type; other kinds of type are ignored here.
+  const Type *Underlying = Ty.getTypePtr();
+  SourceRange TypeRange(Ty.getBeginLoc(), Ty.getEndLoc());
+
+  if (const auto *Tag = dyn_cast<TagType>(Underlying)) {
+    DiagnoseDeclAvailability(Tag->getDecl(), TypeRange);
+  } else if (const auto *Alias = dyn_cast<TypedefType>(Underlying)) {
+    DiagnoseDeclAvailability(Alias->getDecl(), TypeRange);
+  } else if (const auto *Object = dyn_cast<ObjCObjectType>(Underlying)) {
+    // An object type without an interface has no declaration to check.
+    if (NamedDecl *Interface = Object->getInterface())
+      DiagnoseDeclAvailability(Interface, TypeRange);
+  }
+
+  // Always keep traversing.
+  return true;
+}
+
+bool DiagnoseUnguardedAvailability::TraverseIfStmt(IfStmt *If) {
+  // This isn't an availability checking 'if', we can just continue.
+  auto *Check = dyn_cast<ObjCAvailabilityCheckExpr>(If->getCond());
+  if (!Check)
+    return Base::TraverseIfStmt(If);
+
+  // If we're using the '*' case here or if this check is redundant, then we
+  // use the enclosing version to check both branches.
+  VersionTuple Guarded = Check->getVersion();
+  if (Guarded.empty() || Guarded <= AvailabilityStack.back())
+    return Base::TraverseStmt(If->getThen()) &&
+           Base::TraverseStmt(If->getElse());
+
+  // Only the 'then' branch is visited with the guarded version on the stack;
+  // it is popped again before the 'else' branch is visited.
+  AvailabilityStack.push_back(Guarded);
+  bool ThenOK = TraverseStmt(If->getThen());
+  AvailabilityStack.pop_back();
+
+  return ThenOK && TraverseStmt(If->getElse());
+}
+
+} // end anonymous namespace
+
+void Sema::DiagnoseUnguardedAvailabilityViolations(Decl *D) {
+  // Find the statement to walk: a function body or an ObjC method body.
+  Stmt *Body = nullptr;
+  if (auto *Function = D->getAsFunction()) {
+    // FIXME: We only examine the pattern decl for availability violations now,
+    // but we should also examine instantiated templates.
+    if (Function->isTemplateInstantiation())
+      return;
+    Body = Function->getBody();
+  } else if (auto *Method = dyn_cast<ObjCMethodDecl>(D)) {
+    Body = Method->getBody();
+  }
+  assert(Body && "Need a body here!");
+
+  // Diagnose relative to the version returned by getVersionForDecl(D).
+  DiagnoseUnguardedAvailability Checker(*this, getVersionForDecl(D));
+  Checker.IssueDiagnostics(Body);
+}
diff --git a/lib/Sema/SemaDeclCXX.cpp b/lib/Sema/SemaDeclCXX.cpp
index 5d1dda0..f6b975e 100644
--- a/lib/Sema/SemaDeclCXX.cpp
+++ b/lib/Sema/SemaDeclCXX.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTLambda.h"
@@ -36,9 +35,11 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
 #include <map>
 #include <set>
 
@@ -471,7 +472,7 @@
       continue;
     }
 
-    // We found our guy.
+    // We found the right previous declaration.
     break;
   }
 
@@ -664,6 +665,771 @@
   return Invalid;
 }
 
+NamedDecl *
+Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
+                                   MultiTemplateParamsArg TemplateParamLists) {
+  assert(D.isDecompositionDeclarator());
+  const DecompositionDeclarator &Decomp = D.getDecompositionDeclarator();
+
+  // The syntax only allows a decomposition declarator as a simple-declaration
+  // or a for-range-declaration, but we parse it in more cases than that.
+  if (!D.mayHaveDecompositionDeclarator()) {
+    Diag(Decomp.getLSquareLoc(), diag::err_decomp_decl_context)
+      << Decomp.getSourceRange();
+    return nullptr;
+  }
+
+  if (!TemplateParamLists.empty()) {
+    // FIXME: There's no rule against this, but there are also no rules that
+    // would actually make it usable, so we reject it for now.
+    Diag(TemplateParamLists.front()->getTemplateLoc(),
+         diag::err_decomp_decl_template);
+    return nullptr;
+  }
+
+  Diag(Decomp.getLSquareLoc(), getLangOpts().CPlusPlus1z
+                                   ? diag::warn_cxx14_compat_decomp_decl
+                                   : diag::ext_decomp_decl)
+      << Decomp.getSourceRange();
+
+  // The semantic context is always just the current context.
+  DeclContext *const DC = CurContext;
+
+  // C++1z [dcl.dcl]/8:
+  //   The decl-specifier-seq shall contain only the type-specifier auto
+  //   and cv-qualifiers.
+  auto &DS = D.getDeclSpec();
+  {
+    SmallVector<StringRef, 8> BadSpecifiers;
+    SmallVector<SourceLocation, 8> BadSpecifierLocs;
+    if (auto SCS = DS.getStorageClassSpec()) {
+      BadSpecifiers.push_back(DeclSpec::getSpecifierName(SCS));
+      BadSpecifierLocs.push_back(DS.getStorageClassSpecLoc());
+    }
+    if (auto TSCS = DS.getThreadStorageClassSpec()) {
+      BadSpecifiers.push_back(DeclSpec::getSpecifierName(TSCS));
+      BadSpecifierLocs.push_back(DS.getThreadStorageClassSpecLoc());
+    }
+    if (DS.isConstexprSpecified()) {
+      BadSpecifiers.push_back("constexpr");
+      BadSpecifierLocs.push_back(DS.getConstexprSpecLoc());
+    }
+    if (DS.isInlineSpecified()) {
+      BadSpecifiers.push_back("inline");
+      BadSpecifierLocs.push_back(DS.getInlineSpecLoc());
+    }
+    if (!BadSpecifiers.empty()) {
+      auto &&Err = Diag(BadSpecifierLocs.front(), diag::err_decomp_decl_spec);
+      Err << (int)BadSpecifiers.size()
+          << llvm::join(BadSpecifiers.begin(), BadSpecifiers.end(), " ");
+      // Don't add FixItHints to remove the specifiers; we do still respect
+      // them when building the underlying variable.
+      for (auto Loc : BadSpecifierLocs)
+        Err << SourceRange(Loc, Loc);
+    }
+    // We can't recover from it being declared as a typedef.
+    if (DS.getStorageClassSpec() == DeclSpec::SCS_typedef)
+      return nullptr;
+  }
+
+  TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
+  QualType R = TInfo->getType();
+
+  if (DiagnoseUnexpandedParameterPack(D.getIdentifierLoc(), TInfo,
+                                      UPPC_DeclarationType))
+    D.setInvalidType();
+
+  // The syntax only allows a single ref-qualifier prior to the decomposition
+  // declarator. No other declarator chunks are permitted. Also check the type
+  // specifier here.
+  if (DS.getTypeSpecType() != DeclSpec::TST_auto ||
+      D.hasGroupingParens() || D.getNumTypeObjects() > 1 ||
+      (D.getNumTypeObjects() == 1 &&
+       D.getTypeObject(0).Kind != DeclaratorChunk::Reference)) {
+    Diag(Decomp.getLSquareLoc(),
+         (D.hasGroupingParens() ||
+          (D.getNumTypeObjects() &&
+           D.getTypeObject(0).Kind == DeclaratorChunk::Paren))
+             ? diag::err_decomp_decl_parens
+             : diag::err_decomp_decl_type)
+        << R;
+
+    // In most cases, there's no actual problem with an explicitly-specified
+    // type, but a function type won't work here, and ActOnVariableDeclarator
+    // shouldn't be called for such a type.
+    if (R->isFunctionType())
+      D.setInvalidType();
+  }
+
+  // Storage for the BindingDecls created by the loop below.
+  SmallVector<BindingDecl*, 8> Bindings;
+
+  // Build the BindingDecls.
+  for (auto &B : D.getDecompositionDeclarator().bindings()) {
+    // Check for name conflicts.
+    DeclarationNameInfo NameInfo(B.Name, B.NameLoc);
+    LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
+                          ForRedeclaration);
+    LookupName(Previous, S,
+               /*CreateBuiltins*/DC->getRedeclContext()->isTranslationUnit());
+
+    // It's not permitted to shadow a template parameter name.
+    if (Previous.isSingleResult() &&
+        Previous.getFoundDecl()->isTemplateParameter()) {
+      DiagnoseTemplateParameterShadow(D.getIdentifierLoc(),
+                                      Previous.getFoundDecl());
+      Previous.clear();
+    }
+
+    bool ConsiderLinkage = DC->isFunctionOrMethod() &&
+                           DS.getStorageClassSpec() == DeclSpec::SCS_extern;
+    FilterLookupForScope(Previous, DC, S, ConsiderLinkage,
+                         /*AllowInlineNamespace*/false);
+    if (!Previous.empty()) {
+      auto *Old = Previous.getRepresentativeDecl();
+      Diag(B.NameLoc, diag::err_redefinition) << B.Name;
+      Diag(Old->getLocation(), diag::note_previous_definition);
+    }
+
+    auto *BD = BindingDecl::Create(Context, DC, B.NameLoc, B.Name);
+    PushOnScopeChains(BD, S, true);
+    Bindings.push_back(BD);
+    ParsingInitForAutoVars.insert(BD);
+  }
+
+  // There are no prior lookup results for the variable itself, because it
+  // is unnamed.
+  DeclarationNameInfo NameInfo((IdentifierInfo *)nullptr,
+                               Decomp.getLSquareLoc());
+  LookupResult Previous(*this, NameInfo, LookupOrdinaryName, ForRedeclaration);
+
+  // Build the variable that holds the non-decomposed object.
+  bool AddToScope = true;
+  NamedDecl *New =
+      ActOnVariableDeclarator(S, D, DC, TInfo, Previous,
+                              MultiTemplateParamsArg(), AddToScope, Bindings);
+  CurContext->addHiddenDecl(New);
+
+  if (isInOpenMPDeclareTargetContext())
+    checkDeclIsAllowedInOpenMPTarget(nullptr, New);
+
+  return New;
+}
+
+static bool checkSimpleDecomposition(
+    Sema &S, ArrayRef<BindingDecl *> Bindings, ValueDecl *Src,
+    QualType DecompType, llvm::APSInt NumElems, QualType ElemType,
+    llvm::function_ref<ExprResult(SourceLocation, Expr *, unsigned)> GetInit) {
+  // The number of bindings must equal the number of elements.
+  if ((int64_t)Bindings.size() != NumElems) {
+    S.Diag(Src->getLocation(), diag::err_decomp_decl_wrong_number_bindings)
+        << DecompType << (unsigned)Bindings.size() << NumElems.toString(10)
+        << (NumElems < Bindings.size());
+    return true;
+  }
+  // Bind each name to the initializer GetInit builds for its element index.
+  for (unsigned Index = 0, Count = Bindings.size(); Index != Count; ++Index) {
+    BindingDecl *Binding = Bindings[Index];
+    SourceLocation Loc = Binding->getLocation();
+    ExprResult SrcRef = S.BuildDeclRefExpr(Src, DecompType, VK_LValue, Loc);
+    if (SrcRef.isInvalid())
+      return true;
+    ExprResult Init = GetInit(Loc, SrcRef.get(), Index);
+    if (Init.isInvalid())
+      return true;
+    Binding->setBinding(ElemType, Init.get());
+  }
+  return false;
+}
+
+static bool checkArrayLikeDecomposition(Sema &S,
+                                        ArrayRef<BindingDecl *> Bindings,
+                                        ValueDecl *Src, QualType DecompType,
+                                        llvm::APSInt NumElems,
+                                        QualType ElemType) {
+  return checkSimpleDecomposition(
+      S, Bindings, Src, DecompType, NumElems, ElemType,
+      [&](SourceLocation Loc, Expr *Base, unsigned I) -> ExprResult {
+        ExprResult E = S.ActOnIntegerConstant(Loc, I);
+        if (E.isInvalid())
+          return ExprError();
+        return S.CreateBuiltinArraySubscriptExpr(Base, Loc, E.get(), Loc);
+      });
+}
+
+static bool checkArrayDecomposition(Sema &S, ArrayRef<BindingDecl*> Bindings,
+                                    ValueDecl *Src, QualType DecompType,
+                                    const ConstantArrayType *CAT) {
+  return checkArrayLikeDecomposition(S, Bindings, Src, DecompType,
+                                     llvm::APSInt(CAT->getSize()),
+                                     CAT->getElementType());
+}
+
+static bool checkVectorDecomposition(Sema &S, ArrayRef<BindingDecl*> Bindings,
+                                     ValueDecl *Src, QualType DecompType,
+                                     const VectorType *VT) {
+  return checkArrayLikeDecomposition(
+      S, Bindings, Src, DecompType, llvm::APSInt::get(VT->getNumElements()),
+      S.Context.getQualifiedType(VT->getElementType(),
+                                 DecompType.getQualifiers()));
+}
+
+static bool checkComplexDecomposition(Sema &S,
+                                      ArrayRef<BindingDecl *> Bindings,
+                                      ValueDecl *Src, QualType DecompType,
+                                      const ComplexType *CT) {
+  return checkSimpleDecomposition(
+      S, Bindings, Src, DecompType, llvm::APSInt::get(2),
+      S.Context.getQualifiedType(CT->getElementType(),
+                                 DecompType.getQualifiers()),
+      [&](SourceLocation Loc, Expr *Base, unsigned I) -> ExprResult {
+        return S.CreateBuiltinUnaryOp(Loc, I ? UO_Imag : UO_Real, Base);
+      });
+}
+
+static std::string printTemplateArgs(const PrintingPolicy &PrintingPolicy,
+                                     TemplateArgumentListInfo &Args) {
+  // Print every argument in order, joined with ", ".
+  SmallString<128> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  const char *Separator = "";
+  for (auto &ArgLoc : Args.arguments()) {
+    Out << Separator;
+    ArgLoc.getArgument().print(PrintingPolicy, Out);
+    Separator = ", ";
+  }
+  return Out.str();
+}
+
+static bool lookupStdTypeTraitMember(Sema &S, LookupResult &TraitMemberLookup,
+                                     SourceLocation Loc, StringRef Trait,
+                                     TemplateArgumentListInfo &Args,
+                                     unsigned DiagID) {
+  auto DiagnoseMissing = [&] {
+    if (DiagID)
+      S.Diag(Loc, DiagID) << printTemplateArgs(S.Context.getPrintingPolicy(),
+                                               Args);
+    return true;
+  };
+
+  // FIXME: Factor out duplication with lookupPromiseType in SemaCoroutine.
+  NamespaceDecl *Std = S.getStdNamespace();
+  if (!Std)
+    return DiagnoseMissing();
+
+  // Look up the trait itself, within namespace std. We can diagnose various
+  // problems with this lookup even if we've been asked to not diagnose a
+  // missing specialization, because this can only fail if the user has been
+  // declaring their own names in namespace std or we don't support the
+  // standard library implementation in use.
+  LookupResult Result(S, &S.PP.getIdentifierTable().get(Trait),
+                      Loc, Sema::LookupOrdinaryName);
+  if (!S.LookupQualifiedName(Result, Std))
+    return DiagnoseMissing();
+  if (Result.isAmbiguous())
+    return true;
+
+  ClassTemplateDecl *TraitTD = Result.getAsSingle<ClassTemplateDecl>();
+  if (!TraitTD) {
+    Result.suppressDiagnostics();
+    NamedDecl *Found = *Result.begin();
+    S.Diag(Loc, diag::err_std_type_trait_not_class_template) << Trait;
+    S.Diag(Found->getLocation(), diag::note_declared_at);
+    return true;
+  }
+
+  // Build the template-id.
+  QualType TraitTy = S.CheckTemplateIdType(TemplateName(TraitTD), Loc, Args);
+  if (TraitTy.isNull())
+    return true;
+  if (!S.isCompleteType(Loc, TraitTy)) {
+    if (DiagID)
+      S.RequireCompleteType(
+          Loc, TraitTy, DiagID,
+          printTemplateArgs(S.Context.getPrintingPolicy(), Args));
+    return true;
+  }
+
+  CXXRecordDecl *RD = TraitTy->getAsCXXRecordDecl();
+  assert(RD && "specialization of class template is not a class?");
+
+  // Look up the member of the trait type.
+  S.LookupQualifiedName(TraitMemberLookup, RD);
+  return TraitMemberLookup.isAmbiguous();
+}
+
+static TemplateArgumentLoc
+getTrivialIntegralTemplateArgument(Sema &S, SourceLocation Loc, QualType T,
+                                   uint64_t I) {
+  TemplateArgument Arg(S.Context, S.Context.MakeIntValue(I, T), T);
+  return S.getTrivialTemplateArgumentLoc(Arg, T, Loc);
+}
+
+static TemplateArgumentLoc
+getTrivialTypeTemplateArgument(Sema &S, SourceLocation Loc, QualType T) {
+  return S.getTrivialTemplateArgumentLoc(TemplateArgument(T), QualType(), Loc);
+}
+
+namespace { enum class IsTupleLike { TupleLike, NotTupleLike, Error }; }
+
+static IsTupleLike isTupleLike(Sema &S, SourceLocation Loc, QualType T,
+                               llvm::APSInt &Size) {
+  EnterExpressionEvaluationContext ContextRAII(S, Sema::ConstantEvaluated);
+
+  DeclarationName Value = S.PP.getIdentifierInfo("value");
+  LookupResult R(S, Value, Loc, Sema::LookupOrdinaryName);
+
+  // Form template argument list for tuple_size<T>.
+  TemplateArgumentListInfo Args(Loc, Loc);
+  Args.addArgument(getTrivialTypeTemplateArgument(S, Loc, T));
+
+  // If there's no tuple_size specialization, it's not tuple-like.
+  if (lookupStdTypeTraitMember(S, R, Loc, "tuple_size", Args, /*DiagID*/0))
+    return IsTupleLike::NotTupleLike;
+
+  // FIXME: According to the standard, we're not supposed to diagnose if any
+  // of the steps below fail (or if lookup for ::value is ambiguous or otherwise
+  // results in an error), but this is subject to a pending CWG issue / NB
+  // comment, which says we do diagnose if tuple_size<T> is complete but
+  // tuple_size<T>::value is not an ICE.
+
+  struct ICEDiagnoser : Sema::VerifyICEDiagnoser { // For a non-ICE ::value.
+    LookupResult &R;
+    TemplateArgumentListInfo &Args; // Printed in the diagnostic below.
+    ICEDiagnoser(LookupResult &R, TemplateArgumentListInfo &Args)
+        : R(R), Args(Args) {}
+    void diagnoseNotICE(Sema &S, SourceLocation Loc, SourceRange SR) override {
+      S.Diag(Loc, diag::err_decomp_decl_std_tuple_size_not_constant)
+          << printTemplateArgs(S.Context.getPrintingPolicy(), Args);
+    }
+  } Diagnoser(R, Args);
+
+  if (R.empty()) {
+    Diagnoser.diagnoseNotICE(S, Loc, SourceRange());
+    return IsTupleLike::Error;
+  }
+
+  ExprResult E =
+      S.BuildDeclarationNameExpr(CXXScopeSpec(), R, /*NeedsADL*/false);
+  if (E.isInvalid())
+    return IsTupleLike::Error;
+
+  E = S.VerifyIntegerConstantExpression(E.get(), &Size, Diagnoser, false);
+  if (E.isInvalid())
+    return IsTupleLike::Error;
+
+  return IsTupleLike::TupleLike;
+}
+
+/// \return std::tuple_element<I, T>::type.
+static QualType getTupleLikeElementType(Sema &S, SourceLocation Loc,
+                                        unsigned I, QualType T) {
+  // Form template argument list for tuple_element<I, T>.
+  TemplateArgumentListInfo Args(Loc, Loc);
+  Args.addArgument(
+      getTrivialIntegralTemplateArgument(S, Loc, S.Context.getSizeType(), I));
+  Args.addArgument(getTrivialTypeTemplateArgument(S, Loc, T));
+
+  DeclarationName TypeDN = S.PP.getIdentifierInfo("type");
+  LookupResult R(S, TypeDN, Loc, Sema::LookupOrdinaryName);
+  if (lookupStdTypeTraitMember(
+          S, R, Loc, "tuple_element", Args,
+          diag::err_decomp_decl_std_tuple_element_not_specialized))
+    return QualType();
+
+  auto *TD = R.getAsSingle<TypeDecl>();
+  if (!TD) {
+    R.suppressDiagnostics();
+    S.Diag(Loc, diag::err_decomp_decl_std_tuple_element_not_specialized)
+      << printTemplateArgs(S.Context.getPrintingPolicy(), Args);
+    if (!R.empty())
+      S.Diag(R.getRepresentativeDecl()->getLocation(), diag::note_declared_at);
+    return QualType();
+  }
+
+  return S.Context.getTypeDeclType(TD);
+}
+
+namespace {
+struct BindingDiagnosticTrap {
+  Sema &S;
+  DiagnosticErrorTrap Trap;
+  BindingDecl *BD;
+
+  BindingDiagnosticTrap(Sema &S, BindingDecl *BD)
+      : S(S), Trap(S.Diags), BD(BD) {}
+  ~BindingDiagnosticTrap() {
+    if (Trap.hasErrorOccurred())
+      S.Diag(BD->getLocation(), diag::note_in_binding_decl_init) << BD;
+  }
+};
+}
+
+static bool checkTupleLikeDecomposition(Sema &S,
+                                        ArrayRef<BindingDecl *> Bindings,
+                                        VarDecl *Src, QualType DecompType,
+                                        llvm::APSInt TupleSize) {
+  if ((int64_t)Bindings.size() != TupleSize) {
+    S.Diag(Src->getLocation(), diag::err_decomp_decl_wrong_number_bindings)
+        << DecompType << (unsigned)Bindings.size() << TupleSize.toString(10)
+        << (TupleSize < Bindings.size());
+    return true;
+  }
+
+  if (Bindings.empty())
+    return false;
+
+  DeclarationName GetDN = S.PP.getIdentifierInfo("get");
+
+  // [dcl.decomp]p3:
+  //   The unqualified-id get is looked up in the scope of E by class member
+  //   access lookup
+  LookupResult MemberGet(S, GetDN, Src->getLocation(), Sema::LookupMemberName);
+  bool UseMemberGet = false;
+  if (S.isCompleteType(Src->getLocation(), DecompType)) {
+    if (auto *RD = DecompType->getAsCXXRecordDecl())
+      S.LookupQualifiedName(MemberGet, RD);
+    if (MemberGet.isAmbiguous())
+      return true;
+    UseMemberGet = !MemberGet.empty();
+    S.FilterAcceptableTemplateNames(MemberGet);
+  }
+
+  unsigned I = 0;
+  for (auto *B : Bindings) {
+    BindingDiagnosticTrap Trap(S, B);
+    SourceLocation Loc = B->getLocation();
+
+    ExprResult E = S.BuildDeclRefExpr(Src, DecompType, VK_LValue, Loc);
+    if (E.isInvalid())
+      return true;
+
+    //   e is an lvalue if the type of the entity is an lvalue reference and
+    //   an xvalue otherwise
+    if (!Src->getType()->isLValueReferenceType())
+      E = ImplicitCastExpr::Create(S.Context, E.get()->getType(), CK_NoOp,
+                                   E.get(), nullptr, VK_XValue);
+
+    TemplateArgumentListInfo Args(Loc, Loc);
+    Args.addArgument(
+        getTrivialIntegralTemplateArgument(S, Loc, S.Context.getSizeType(), I));
+
+    if (UseMemberGet) {
+      //   if [lookup of member get] finds at least one declaration, the
+      //   initializer is e.get<i-1>().
+      E = S.BuildMemberReferenceExpr(E.get(), DecompType, Loc, false,
+                                     CXXScopeSpec(), SourceLocation(), nullptr,
+                                     MemberGet, &Args, nullptr);
+      if (E.isInvalid())
+        return true;
+
+      E = S.ActOnCallExpr(nullptr, E.get(), Loc, None, Loc);
+    } else {
+      //   Otherwise, the initializer is get<i-1>(e), where get is looked up
+      //   in the associated namespaces.
+      Expr *Get = UnresolvedLookupExpr::Create(
+          S.Context, nullptr, NestedNameSpecifierLoc(), SourceLocation(),
+          DeclarationNameInfo(GetDN, Loc), /*RequiresADL*/true, &Args,
+          UnresolvedSetIterator(), UnresolvedSetIterator());
+
+      Expr *Arg = E.get();
+      E = S.ActOnCallExpr(nullptr, Get, Loc, Arg, Loc);
+    }
+    if (E.isInvalid())
+      return true;
+    Expr *Init = E.get();
+
+    //   Given the type T designated by std::tuple_element<i - 1, E>::type,
+    QualType T = getTupleLikeElementType(S, Loc, I, DecompType);
+    if (T.isNull())
+      return true;
+
+    //   each vi is a variable of type "reference to T" initialized with the
+    //   initializer, where the reference is an lvalue reference if the
+    //   initializer is an lvalue and an rvalue reference otherwise
+    QualType RefType =
+        S.BuildReferenceType(T, E.get()->isLValue(), Loc, B->getDeclName());
+    if (RefType.isNull())
+      return true;
+    auto *RefVD = VarDecl::Create(
+        S.Context, Src->getDeclContext(), Loc, Loc,
+        B->getDeclName().getAsIdentifierInfo(), RefType,
+        S.Context.getTrivialTypeSourceInfo(T, Loc), Src->getStorageClass());
+    RefVD->setLexicalDeclContext(Src->getLexicalDeclContext());
+    RefVD->setTSCSpec(Src->getTSCSpec());
+    RefVD->setImplicit();
+    if (Src->isInlineSpecified())
+      RefVD->setInlineSpecified();
+    RefVD->getLexicalDeclContext()->addHiddenDecl(RefVD);
+
+    InitializedEntity Entity = InitializedEntity::InitializeBinding(RefVD);
+    InitializationKind Kind = InitializationKind::CreateCopy(Loc, Loc);
+    InitializationSequence Seq(S, Entity, Kind, Init);
+    E = Seq.Perform(S, Entity, Kind, Init);
+    if (E.isInvalid())
+      return true;
+    E = S.ActOnFinishFullExpr(E.get(), Loc);
+    if (E.isInvalid())
+      return true;
+    RefVD->setInit(E.get());
+    RefVD->checkInitIsICE();
+
+    E = S.BuildDeclarationNameExpr(CXXScopeSpec(),
+                                   DeclarationNameInfo(B->getDeclName(), Loc),
+                                   RefVD);
+    if (E.isInvalid())
+      return true;
+
+    B->setBinding(T, E.get());
+    I++;
+  }
+
+  return false;
+}
+
+/// Find the class whose fields a built-in decomposition of RD binds: RD itself
+/// or one base class of RD. This search is unlike any other we perform in C++.
+/// Diagnoses and returns null on error; fills BasePath when a base is selected.
+static const CXXRecordDecl *findDecomposableBaseClass(Sema &S,
+                                                      SourceLocation Loc,
+                                                      const CXXRecordDecl *RD,
+                                                      CXXCastPath &BasePath) {
+  auto BaseHasFields = [](const CXXBaseSpecifier *Specifier,
+                          CXXBasePath &Path) {
+    return Specifier->getType()->getAsCXXRecordDecl()->hasDirectFields();
+  }; // lookupInBases predicate: true when the base hasDirectFields().
+
+  const CXXRecordDecl *ClassWithFields = nullptr;
+  if (RD->hasDirectFields())
+    // [dcl.decomp]p4:
+    //   Otherwise, all of E's non-static data members shall be public direct
+    //   members of E ...
+    ClassWithFields = RD;
+  else {
+    //   ... or of ...
+    CXXBasePaths Paths;
+    Paths.setOrigin(const_cast<CXXRecordDecl*>(RD));
+    if (!RD->lookupInBases(BaseHasFields, Paths)) {
+      // If no classes have fields, just decompose RD itself. (This will work
+      // if and only if zero bindings were provided.)
+      return RD;
+    }
+    // All paths must reach the same base type; keep the lowest-Access path.
+    CXXBasePath *BestPath = nullptr;
+    for (auto &P : Paths) {
+      if (!BestPath)
+        BestPath = &P;
+      else if (!S.Context.hasSameType(P.back().Base->getType(),
+                                      BestPath->back().Base->getType())) {
+        //   ... the same ...
+        S.Diag(Loc, diag::err_decomp_decl_multiple_bases_with_members)
+          << false << RD << BestPath->back().Base->getType()
+          << P.back().Base->getType();
+        return nullptr;
+      } else if (P.Access < BestPath->Access) {
+        BestPath = &P;
+      }
+    }
+
+    //   ... unambiguous ...
+    QualType BaseType = BestPath->back().Base->getType();
+    if (Paths.isAmbiguous(S.Context.getCanonicalType(BaseType))) {
+      S.Diag(Loc, diag::err_decomp_decl_ambiguous_base)
+        << RD << BaseType << S.getAmbiguousPathsDisplayString(Paths);
+      return nullptr;
+    }
+
+    //   ... public base class of E.
+    if (BestPath->Access != AS_public) {
+      S.Diag(Loc, diag::err_decomp_decl_non_public_base)
+        << RD << BaseType;
+      for (auto &BS : *BestPath) {
+        if (BS.Base->getAccessSpecifier() != AS_public) {
+          S.Diag(BS.Base->getLocStart(), diag::note_access_constrained_by_path)
+            << (BS.Base->getAccessSpecifier() == AS_protected)
+            << (BS.Base->getAccessSpecifierAsWritten() == AS_none);
+          break; // Only the first non-public base on the path gets a note.
+        }
+      }
+      return nullptr;
+    }
+
+    ClassWithFields = BaseType->getAsCXXRecordDecl();
+    S.BuildBasePathArray(Paths, BasePath); // Caller casts along this path.
+  }
+
+  // The above search did not check whether the selected class itself has base
+  // classes with fields, so check that now.
+  CXXBasePaths Paths;
+  if (ClassWithFields->lookupInBases(BaseHasFields, Paths)) {
+    S.Diag(Loc, diag::err_decomp_decl_multiple_bases_with_members)
+      << (ClassWithFields == RD) << RD << ClassWithFields
+      << Paths.front().back().Base->getType();
+    return nullptr;
+  }
+
+  return ClassWithFields;
+}
+
+/// Bind each of Bindings, in order, to the corresponding named field of RD
+/// (or of the base class of RD that holds the fields). Returns true on error.
+static bool checkMemberDecomposition(Sema &S, ArrayRef<BindingDecl*> Bindings,
+                                     ValueDecl *Src, QualType DecompType,
+                                     const CXXRecordDecl *RD) {
+  CXXCastPath BasePath;
+  RD = findDecomposableBaseClass(S, Src->getLocation(), RD, BasePath);
+  if (!RD)
+    return true;
+  QualType BaseType = S.Context.getQualifiedType(S.Context.getRecordType(RD),
+                                                 DecompType.getQualifiers());
+
+  auto DiagnoseBadNumberOfBindings = [&]() -> bool {
+    unsigned NumFields = 0; // Counts bindable fields: no unnamed bit-fields.
+    for (const auto *Field : RD->fields())
+      if (!Field->isUnnamedBitfield())
+        ++NumFields;
+    assert(Bindings.size() != NumFields);
+    S.Diag(Src->getLocation(), diag::err_decomp_decl_wrong_number_bindings)
+        << DecompType << (unsigned)Bindings.size() << NumFields
+        << (NumFields < Bindings.size());
+    return true;
+  };
+
+  //   all of E's non-static data members shall be public [...] members,
+  //   E shall not have an anonymous union member, ...
+  unsigned I = 0;
+  for (auto *FD : RD->fields()) {
+    if (FD->isUnnamedBitfield())
+      continue;
+
+    if (FD->isAnonymousStructOrUnion()) {
+      S.Diag(Src->getLocation(), diag::err_decomp_decl_anon_union_member)
+        << DecompType << FD->getType()->isUnionType();
+      S.Diag(FD->getLocation(), diag::note_declared_at);
+      return true;
+    }
+
+    // We have a real field to bind.
+    if (I >= Bindings.size())
+      return DiagnoseBadNumberOfBindings();
+    auto *B = Bindings[I++];
+    SourceLocation Loc = B->getLocation();
+    if (FD->getAccess() != AS_public) {
+      S.Diag(Loc, diag::err_decomp_decl_non_public_member) << FD << DecompType;
+      // Determine whether the access specifier was explicit.
+      bool Implicit = true;
+      for (const auto *D : RD->decls()) {
+        if (declaresSameEntity(D, FD))
+          break;
+        if (isa<AccessSpecDecl>(D)) {
+          Implicit = false;
+          break;
+        }
+      }
+      S.Diag(FD->getLocation(), diag::note_access_natural)
+        << (FD->getAccess() == AS_protected) << Implicit;
+      return true;
+    }
+
+    // Initialize the binding to Src.FD.
+    ExprResult E = S.BuildDeclRefExpr(Src, DecompType, VK_LValue, Loc);
+    if (E.isInvalid())
+      return true;
+    E = S.ImpCastExprToType(E.get(), BaseType, CK_UncheckedDerivedToBase,
+                            VK_LValue, &BasePath);
+    if (E.isInvalid())
+      return true;
+    E = S.BuildFieldReferenceExpr(E.get(), /*IsArrow*/ false, Loc,
+                                  CXXScopeSpec(), FD,
+                                  DeclAccessPair::make(FD, FD->getAccess()),
+                                  DeclarationNameInfo(FD->getDeclName(), Loc));
+    if (E.isInvalid())
+      return true;
+
+    // The referenced type is cv T, where T is the member's type and cv is the
+    // cv-qualification of the decomposition expression. FIXME: We resolve a
+    // defect here: if the field is mutable, we do not add 'const' to its type.
+    Qualifiers Q = DecompType.getQualifiers();
+    if (FD->isMutable())
+      Q.removeConst();
+    B->setBinding(S.BuildQualifiedType(FD->getType(), Loc, Q), E.get());
+  }
+
+  if (I != Bindings.size())
+    return DiagnoseBadNumberOfBindings();
+
+  return false;
+}
+
+void Sema::CheckCompleteDecompositionDeclaration(DecompositionDecl *DD) {
+  QualType DecompType = DD->getType();
+  // Overview: try array, vector and complex types, then tuple-like types, then
+  // class members; a failed check marks DD invalid. If the decomposition's type
+  // is dependent, then so is the type of each binding, so stop there.
+  if (DecompType->isDependentType()) {
+    for (auto *B : DD->bindings())
+      B->setType(Context.DependentTy);
+    return;
+  }
+
+  DecompType = DecompType.getNonReferenceType(); // Strip any reference.
+  ArrayRef<BindingDecl*> Bindings = DD->bindings();
+
+  // C++1z [dcl.decomp]/2:
+  //   If E is an array type [...]
+  // As an extension, we also support decomposition of built-in complex and
+  // vector types.
+  if (auto *CAT = Context.getAsConstantArrayType(DecompType)) {
+    if (checkArrayDecomposition(*this, Bindings, DD, DecompType, CAT))
+      DD->setInvalidDecl();
+    return;
+  }
+  if (auto *VT = DecompType->getAs<VectorType>()) {
+    if (checkVectorDecomposition(*this, Bindings, DD, DecompType, VT))
+      DD->setInvalidDecl();
+    return;
+  }
+  if (auto *CT = DecompType->getAs<ComplexType>()) {
+    if (checkComplexDecomposition(*this, Bindings, DD, DecompType, CT))
+      DD->setInvalidDecl();
+    return;
+  }
+
+  // C++1z [dcl.decomp]/3:
+  //   if the expression std::tuple_size<E>::value is a well-formed integral
+  //   constant expression, [...]
+  llvm::APSInt TupleSize(32); // Presumably filled in by isTupleLike below.
+  switch (isTupleLike(*this, DD->getLocation(), DecompType, TupleSize)) {
+  case IsTupleLike::Error:
+    DD->setInvalidDecl();
+    return;
+
+  case IsTupleLike::TupleLike:
+    if (checkTupleLikeDecomposition(*this, Bindings, DD, DecompType, TupleSize))
+      DD->setInvalidDecl();
+    return;
+
+  case IsTupleLike::NotTupleLike:
+    break; // Fall back to the class-member rules below.
+  }
+
+  // C++1z [dcl.dcl]/8:
+  //   [E shall be of array or non-union class type]
+  CXXRecordDecl *RD = DecompType->getAsCXXRecordDecl();
+  if (!RD || RD->isUnion()) {
+    Diag(DD->getLocation(), diag::err_decomp_decl_unbindable_type)
+        << DD << !RD << DecompType;
+    DD->setInvalidDecl();
+    return;
+  }
+
+  // C++1z [dcl.decomp]/4:
+  //   all of E's non-static data members shall be [...] direct members of
+  //   E or of the same unambiguous public base class of E, ...
+  if (checkMemberDecomposition(*this, Bindings, DD, DecompType, RD))
+    DD->setInvalidDecl();
+}
+
 /// \brief Merge the exception specifications of two variable declarations.
 ///
 /// This is called when there's a redeclaration of a VarDecl. The function
@@ -912,7 +1678,8 @@
       // C++11 and permitted in C++1y, so ignore them.
       continue;
 
-    case Decl::Var: {
+    case Decl::Var:
+    case Decl::Decomposition: {
       // C++1y [dcl.constexpr]p3 allows anything except:
       //   a definition of a variable of non-literal type or of static or
       //   thread storage duration or for which no initialization is performed.
@@ -2192,7 +2959,8 @@
     } else {
       Member = HandleField(S, cast<CXXRecordDecl>(CurContext), Loc, D,
                                 BitWidth, InitStyle, AS);
-      assert(Member && "HandleField never returns null");
+      if (!Member)
+        return nullptr;
     }
   } else {
     Member = HandleDeclarator(S, D, TemplateParameterLists);
@@ -3356,34 +4124,7 @@
   ExprResult BaseInit;
   
   switch (ImplicitInitKind) {
-  case IIK_Inherit: {
-    const CXXRecordDecl *Inherited =
-        Constructor->getInheritedConstructor()->getParent();
-    const CXXRecordDecl *Base = BaseSpec->getType()->getAsCXXRecordDecl();
-    if (Base && Inherited->getCanonicalDecl() == Base->getCanonicalDecl()) {
-      // C++11 [class.inhctor]p8:
-      //   Each expression in the expression-list is of the form
-      //   static_cast<T&&>(p), where p is the name of the corresponding
-      //   constructor parameter and T is the declared type of p.
-      SmallVector<Expr*, 16> Args;
-      for (unsigned I = 0, E = Constructor->getNumParams(); I != E; ++I) {
-        ParmVarDecl *PD = Constructor->getParamDecl(I);
-        ExprResult ArgExpr =
-            SemaRef.BuildDeclRefExpr(PD, PD->getType().getNonReferenceType(),
-                                     VK_LValue, SourceLocation());
-        if (ArgExpr.isInvalid())
-          return true;
-        Args.push_back(CastForMoving(SemaRef, ArgExpr.get(), PD->getType()));
-      }
-
-      InitializationKind InitKind = InitializationKind::CreateDirect(
-          Constructor->getLocation(), SourceLocation(), SourceLocation());
-      InitializationSequence InitSeq(SemaRef, InitEntity, InitKind, Args);
-      BaseInit = InitSeq.Perform(SemaRef, InitEntity, InitKind, Args);
-      break;
-    }
-  }
-  // Fall through.
+  case IIK_Inherit:
   case IIK_Default: {
     InitializationKind InitKind
       = InitializationKind::CreateDefault(Constructor->getLocation());
@@ -3694,12 +4435,12 @@
   BaseAndFieldInfo(Sema &S, CXXConstructorDecl *Ctor, bool ErrorsInInits)
     : S(S), Ctor(Ctor), AnyErrorsInInits(ErrorsInInits) {
     bool Generated = Ctor->isImplicit() || Ctor->isDefaulted();
-    if (Generated && Ctor->isCopyConstructor())
+    if (Ctor->getInheritedConstructor())
+      IIK = IIK_Inherit;
+    else if (Generated && Ctor->isCopyConstructor())
       IIK = IIK_Copy;
     else if (Generated && Ctor->isMoveConstructor())
       IIK = IIK_Move;
-    else if (Ctor->getInheritedConstructor())
-      IIK = IIK_Inherit;
     else
       IIK = IIK_Default;
   }
@@ -4774,7 +5515,6 @@
 
   // The class is either imported or exported.
   const bool ClassExported = ClassAttr->getKind() == attr::DLLExport;
-  const bool ClassImported = !ClassExported;
 
   TemplateSpecializationKind TSK = Class->getTemplateSpecializationKind();
 
@@ -4809,11 +5549,20 @@
         if (!Context.getTargetInfo().getCXXABI().isMicrosoft())
           continue;
 
-        // MSVC versions before 2015 don't export the move assignment operators,
-        // so don't attempt to import them if we have a definition.
-        if (ClassImported && MD->isMoveAssignmentOperator() &&
+        // MSVC versions before 2015 don't export the move assignment operators
+        // and move constructor, so don't attempt to import/export them if
+        // we have a definition.
+        auto *Ctor = dyn_cast<CXXConstructorDecl>(MD);
+        if ((MD->isMoveAssignmentOperator() ||
+             (Ctor && Ctor->isMoveConstructor())) &&
             !getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015))
           continue;
+
+        // MSVC2015 doesn't export trivial defaulted x-tor but copy assign
+        // operator is exported anyway.
+        if (getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015) &&
+            (Ctor || isa<CXXDestructorDecl>(MD)) && MD->isTrivial())
+          continue;
       }
     }
 
@@ -4887,6 +5636,33 @@
   }
 }
 
+static void DefineImplicitSpecialMember(Sema &S, CXXMethodDecl *MD,
+                                        SourceLocation DefaultLoc) {
+  switch (S.getSpecialMember(MD)) { // Call the DefineImplicit* matching MD.
+  case Sema::CXXDestructor:
+    S.DefineImplicitDestructor(DefaultLoc, cast<CXXDestructorDecl>(MD));
+    break;
+  case Sema::CXXCopyAssignment:
+    S.DefineImplicitCopyAssignment(DefaultLoc, MD);
+    break;
+  case Sema::CXXMoveAssignment:
+    S.DefineImplicitMoveAssignment(DefaultLoc, MD);
+    break;
+  case Sema::CXXDefaultConstructor:
+    S.DefineImplicitDefaultConstructor(DefaultLoc,
+                                       cast<CXXConstructorDecl>(MD));
+    break;
+  case Sema::CXXCopyConstructor:
+    S.DefineImplicitCopyConstructor(DefaultLoc, cast<CXXConstructorDecl>(MD));
+    break;
+  case Sema::CXXMoveConstructor:
+    S.DefineImplicitMoveConstructor(DefaultLoc, cast<CXXConstructorDecl>(MD));
+    break;
+  case Sema::CXXInvalid: // Callers must pass a special member function.
+    llvm_unreachable("Invalid special member.");
+  }
+}
+
 /// \brief Perform semantic checks on a class definition that has been
 /// completing, introducing implicitly-declared members, checking for
 /// abstract types, etc.
@@ -4982,8 +5758,8 @@
 
       // For an explicitly defaulted or deleted special member, we defer
       // determining triviality until the class is complete. That time is now!
+      CXXSpecialMember CSM = getSpecialMember(M);
       if (!M->isImplicit() && !M->isUserProvided()) {
-        CXXSpecialMember CSM = getSpecialMember(M);
         if (CSM != CXXInvalid) {
           M->setTrivial(SpecialMemberIsTrivial(M, CSM));
 
@@ -4991,6 +5767,20 @@
           Record->finishedDefaultedOrDeletedMember(M);
         }
       }
+
+      if (!M->isInvalidDecl() && M->isExplicitlyDefaulted() &&
+          M->hasAttr<DLLExportAttr>()) {
+        if (getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015) &&
+            M->isTrivial() &&
+            (CSM == CXXDefaultConstructor || CSM == CXXCopyConstructor ||
+             CSM == CXXDestructor))
+          M->dropAttr<DLLExportAttr>();
+
+        if (M->hasAttr<DLLExportAttr>()) {
+          DefineImplicitSpecialMember(*this, M, M->getLocation());
+          ActOnFinishInlineFunctionDef(M);
+        }
+      }
     }
   }
 
@@ -5016,15 +5806,6 @@
     Diag(Record->getLocation(), diag::warn_cxx_ms_struct);
   }
 
-  // Declare inheriting constructors. We do this eagerly here because:
-  // - The standard requires an eager diagnostic for conflicting inheriting
-  //   constructors from different classes.
-  // - The lazy declaration of the other implicit constructors is so as to not
-  //   waste space and performance on classes that are not meant to be
-  //   instantiated (e.g. meta-functions). This doesn't apply to classes that
-  //   have inheriting constructors.
-  DeclareInheritingConstructors(Record);
-
   checkClassLevelDLLAttribute(Record);
 }
 
@@ -5058,11 +5839,108 @@
                                LHSQuals & Qualifiers::Volatile);
 }
 
+class Sema::InheritedConstructorInfo { // Bases an inherited ctor came through.
+  Sema &S; // Emits diagnostics and finds inheriting constructors.
+  SourceLocation UseLoc; // Passed to S.Diag and findInheritingConstructor.
+
+  /// A mapping from the base classes through which the constructor was
+  /// inherited to the using shadow declaration in that base class (or a null
+  /// pointer if the constructor was declared in that base class).
+  llvm::DenseMap<CXXRecordDecl *, ConstructorUsingShadowDecl *>
+      InheritedFromBases;
+
+public:
+  InheritedConstructorInfo(Sema &S, SourceLocation UseLoc,
+                           ConstructorUsingShadowDecl *Shadow)
+      : S(S), UseLoc(UseLoc) {
+    bool DiagnosedMultipleConstructedBases = false;
+    CXXRecordDecl *ConstructedBase = nullptr;
+    UsingDecl *ConstructedBaseUsing = nullptr;
+    // Walk every redeclaration of Shadow, recording the nominated (and, for
+    // virtual bases, constructed) base classes, and check that there's a
+    // unique constructed subobject.
+    for (auto *D : Shadow->redecls()) {
+      auto *DShadow = cast<ConstructorUsingShadowDecl>(D);
+      auto *DNominatedBase = DShadow->getNominatedBaseClass();
+      auto *DConstructedBase = DShadow->getConstructedBaseClass();
+
+      InheritedFromBases.insert(
+          std::make_pair(DNominatedBase->getCanonicalDecl(),
+                         DShadow->getNominatedBaseClassShadowDecl()));
+      if (DShadow->constructsVirtualBase())
+        InheritedFromBases.insert(
+            std::make_pair(DConstructedBase->getCanonicalDecl(),
+                           DShadow->getConstructedBaseClassShadowDecl()));
+      else
+        assert(DNominatedBase == DConstructedBase);
+
+      // [class.inhctor.init]p2:
+      //   If the constructor was inherited from multiple base class subobjects
+      //   of type B, the program is ill-formed.
+      if (!ConstructedBase) {
+        ConstructedBase = DConstructedBase;
+        ConstructedBaseUsing = D->getUsingDecl();
+      } else if (ConstructedBase != DConstructedBase &&
+                 !Shadow->isInvalidDecl()) {
+        if (!DiagnosedMultipleConstructedBases) {
+          S.Diag(UseLoc, diag::err_ambiguous_inherited_constructor)
+              << Shadow->getTargetDecl();
+          S.Diag(ConstructedBaseUsing->getLocation(),
+               diag::note_ambiguous_inherited_constructor_using)
+              << ConstructedBase;
+          DiagnosedMultipleConstructedBases = true;
+        }
+        S.Diag(D->getUsingDecl()->getLocation(),
+               diag::note_ambiguous_inherited_constructor_using)
+            << DConstructedBase;
+      }
+    }
+
+    if (DiagnosedMultipleConstructedBases)
+      Shadow->setInvalidDecl(); // Later uses skip the diagnostic above.
+  }
+
+  /// Find the constructor to use for inherited construction of a base class,
+  /// and whether that base class constructor inherits the constructor from a
+  /// virtual base class (in which case it won't actually invoke it).
+  std::pair<CXXConstructorDecl *, bool>
+  findConstructorForBase(CXXRecordDecl *Base, CXXConstructorDecl *Ctor) const {
+    auto It = InheritedFromBases.find(Base->getCanonicalDecl());
+    if (It == InheritedFromBases.end())
+      return std::make_pair(nullptr, false); // Ctor not inherited via Base.
+
+    // This is an intermediary class.
+    if (It->second)
+      return std::make_pair(
+          S.findInheritingConstructor(UseLoc, Ctor, It->second),
+          It->second->constructsVirtualBase());
+
+    // This is the base class from which the constructor was inherited.
+    return std::make_pair(Ctor, false);
+  }
+};
+
 /// Is the special member function which would be selected to perform the
 /// specified operation on the specified class type a constexpr constructor?
-static bool specialMemberIsConstexpr(Sema &S, CXXRecordDecl *ClassDecl,
-                                     Sema::CXXSpecialMember CSM,
-                                     unsigned Quals, bool ConstRHS) {
+static bool
+specialMemberIsConstexpr(Sema &S, CXXRecordDecl *ClassDecl,
+                         Sema::CXXSpecialMember CSM, unsigned Quals,
+                         bool ConstRHS,
+                         CXXConstructorDecl *InheritedCtor = nullptr,
+                         Sema::InheritedConstructorInfo *Inherited = nullptr) {
+  // If we're inheriting a constructor, see if we need to call it for this base
+  // class.
+  if (InheritedCtor) {
+    assert(CSM == Sema::CXXDefaultConstructor);
+    auto BaseCtor =
+        Inherited->findConstructorForBase(ClassDecl, InheritedCtor).first;
+    if (BaseCtor)
+      return BaseCtor->isConstexpr();
+  }
+
+  if (CSM == Sema::CXXDefaultConstructor)
+    return ClassDecl->hasConstexprDefaultConstructor();
+
   Sema::SpecialMemberOverloadResult *SMOR =
       lookupCallFromSpecialMember(S, ClassDecl, CSM, Quals, ConstRHS);
   if (!SMOR || !SMOR->getMethod())
@@ -5074,9 +5952,10 @@
 
 /// Determine whether the specified special member function would be constexpr
 /// if it were implicitly defined.
-static bool defaultedSpecialMemberIsConstexpr(Sema &S, CXXRecordDecl *ClassDecl,
-                                              Sema::CXXSpecialMember CSM,
-                                              bool ConstArg) {
+static bool defaultedSpecialMemberIsConstexpr(
+    Sema &S, CXXRecordDecl *ClassDecl, Sema::CXXSpecialMember CSM,
+    bool ConstArg, CXXConstructorDecl *InheritedCtor = nullptr,
+    Sema::InheritedConstructorInfo *Inherited = nullptr) {
   if (!S.getLangOpts().CPlusPlus11)
     return false;
 
@@ -5085,6 +5964,8 @@
   bool Ctor = true;
   switch (CSM) {
   case Sema::CXXDefaultConstructor:
+    if (Inherited)
+      break;
     // Since default constructor lookup is essentially trivial (and cannot
     // involve, for instance, template instantiation), we compute whether a
     // defaulted default constructor is constexpr directly within CXXRecordDecl.
@@ -5119,7 +6000,10 @@
   // will be initialized (if the constructor isn't deleted), we just don't know
   // which one.
   if (Ctor && ClassDecl->isUnion())
-    return true;
+    return CSM == Sema::CXXDefaultConstructor
+               ? ClassDecl->hasInClassInitializer() ||
+                     !ClassDecl->hasVariantMembers()
+               : true;
 
   //   -- the class shall not have any virtual base classes;
   if (Ctor && ClassDecl->getNumVBases())
@@ -5139,7 +6023,8 @@
     if (!BaseType) continue;
 
     CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
-    if (!specialMemberIsConstexpr(S, BaseClassDecl, CSM, 0, ConstArg))
+    if (!specialMemberIsConstexpr(S, BaseClassDecl, CSM, 0, ConstArg,
+                                  InheritedCtor, Inherited))
       return false;
   }
 
@@ -5153,6 +6038,8 @@
   for (const auto *F : ClassDecl->fields()) {
     if (F->isInvalidDecl())
       continue;
+    if (CSM == Sema::CXXDefaultConstructor && F->hasInClassInitializer())
+      continue;
     QualType BaseType = S.Context.getBaseElementType(F->getType());
     if (const RecordType *RecordTy = BaseType->getAs<RecordType>()) {
       CXXRecordDecl *FieldRecDecl = cast<CXXRecordDecl>(RecordTy->getDecl());
@@ -5160,6 +6047,8 @@
                                     BaseType.getCVRQualifiers(),
                                     ConstArg && !F->isMutable()))
         return false;
+    } else if (CSM == Sema::CXXDefaultConstructor) {
+      return false;
     }
   }
 
@@ -5187,7 +6076,8 @@
   }
   assert(cast<CXXConstructorDecl>(MD)->getInheritedConstructor() &&
          "only special members have implicit exception specs");
-  return S.ComputeInheritingCtorExceptionSpec(cast<CXXConstructorDecl>(MD));
+  return S.ComputeInheritingCtorExceptionSpec(Loc,
+                                              cast<CXXConstructorDecl>(MD));
 }
 
 static FunctionProtoType::ExtProtoInfo getImplicitMethodEPI(Sema &S,
@@ -5384,7 +6274,7 @@
       //   [For a] user-provided explicitly-defaulted function [...] if such a
       //   function is implicitly defined as deleted, the program is ill-formed.
       Diag(MD->getLocation(), diag::err_out_of_line_default_deletes) << CSM;
-      ShouldDeleteSpecialMember(MD, CSM, /*Diagnose*/true);
+      ShouldDeleteSpecialMember(MD, CSM, nullptr, /*Diagnose*/true);
       HadError = true;
     }
   }
@@ -5445,6 +6335,7 @@
   Sema &S;
   CXXMethodDecl *MD;
   Sema::CXXSpecialMember CSM;
+  Sema::InheritedConstructorInfo *ICI;
   bool Diagnose;
 
   // Properties of the special member, computed for convenience.
@@ -5454,11 +6345,11 @@
   bool AllFieldsAreConst;
 
   SpecialMemberDeletionInfo(Sema &S, CXXMethodDecl *MD,
-                            Sema::CXXSpecialMember CSM, bool Diagnose)
-    : S(S), MD(MD), CSM(CSM), Diagnose(Diagnose),
-      IsConstructor(false), IsAssignment(false), IsMove(false),
-      ConstArg(false), Loc(MD->getLocation()),
-      AllFieldsAreConst(true) {
+                            Sema::CXXSpecialMember CSM,
+                            Sema::InheritedConstructorInfo *ICI, bool Diagnose)
+      : S(S), MD(MD), CSM(CSM), ICI(ICI), Diagnose(Diagnose),
+        IsConstructor(false), IsAssignment(false), IsMove(false),
+        ConstArg(false), Loc(MD->getLocation()), AllFieldsAreConst(true) {
     switch (CSM) {
       case Sema::CXXDefaultConstructor:
       case Sema::CXXCopyConstructor:
@@ -5490,6 +6381,10 @@
 
   bool inUnion() const { return MD->getParent()->isUnion(); }
 
+  Sema::CXXSpecialMember getEffectiveCSM() { // Kind streamed into diagnostics.
+    return ICI ? Sema::CXXInvalid : CSM; // CXXInvalid whenever ICI is set.
+  }
+
   /// Look up the corresponding special member in the given class.
   Sema::SpecialMemberOverloadResult *lookupIn(CXXRecordDecl *Class,
                                               unsigned Quals, bool IsMutable) {
@@ -5566,13 +6461,13 @@
     if (Field) {
       S.Diag(Field->getLocation(),
              diag::note_deleted_special_member_class_subobject)
-        << CSM << MD->getParent() << /*IsField*/true
+        << getEffectiveCSM() << MD->getParent() << /*IsField*/true
         << Field << DiagKind << IsDtorCallInCtor;
     } else {
       CXXBaseSpecifier *Base = Subobj.get<CXXBaseSpecifier*>();
       S.Diag(Base->getLocStart(),
              diag::note_deleted_special_member_class_subobject)
-        << CSM << MD->getParent() << /*IsField*/false
+        << getEffectiveCSM() << MD->getParent() << /*IsField*/false
         << Base->getType() << DiagKind << IsDtorCallInCtor;
     }
 
@@ -5631,7 +6526,29 @@
   CXXRecordDecl *BaseClass = Base->getType()->getAsCXXRecordDecl();
   // If program is correct, BaseClass cannot be null, but if it is, the error
   // must be reported elsewhere.
-  return BaseClass && shouldDeleteForClassSubobject(BaseClass, Base, 0);
+  if (!BaseClass)
+    return false;
+  // If we have an inheriting constructor, check whether we're calling an
+  // inherited constructor instead of a default constructor.
+  if (ICI) {
+    assert(CSM == Sema::CXXDefaultConstructor);
+    auto *BaseCtor =
+        ICI->findConstructorForBase(BaseClass, cast<CXXConstructorDecl>(MD)
+                                                   ->getInheritedConstructor()
+                                                   .getConstructor())
+            .first;
+    if (BaseCtor) {
+      if (BaseCtor->isDeleted() && Diagnose) {
+        S.Diag(Base->getLocStart(),
+               diag::note_deleted_special_member_class_subobject)
+          << getEffectiveCSM() << MD->getParent() << /*IsField*/false
+          << Base->getType() << /*Deleted*/1 << /*IsDtorCallInCtor*/false;
+        S.NoteDeletedFunction(BaseCtor);
+      }
+      return BaseCtor->isDeleted();
+    }
+  }
+  return shouldDeleteForClassSubobject(BaseClass, Base, 0);
 }
 
 /// Check whether we should delete a special member function due to the class
@@ -5646,7 +6563,7 @@
     if (FieldType->isReferenceType() && !FD->hasInClassInitializer()) {
       if (Diagnose)
         S.Diag(FD->getLocation(), diag::note_deleted_default_ctor_uninit_field)
-          << MD->getParent() << FD << FieldType << /*Reference*/0;
+          << !!ICI << MD->getParent() << FD << FieldType << /*Reference*/0;
       return true;
     }
     // C++11 [class.ctor]p5: any non-variant non-static data member of
@@ -5658,7 +6575,7 @@
         (!FieldRecord || !FieldRecord->hasUserProvidedDefaultConstructor())) {
       if (Diagnose)
         S.Diag(FD->getLocation(), diag::note_deleted_default_ctor_uninit_field)
-          << MD->getParent() << FD << FD->getType() << /*Const*/1;
+          << !!ICI << MD->getParent() << FD << FD->getType() << /*Const*/1;
       return true;
     }
 
@@ -5717,7 +6634,7 @@
         if (Diagnose)
           S.Diag(FieldRecord->getLocation(),
                  diag::note_deleted_default_ctor_all_const)
-            << MD->getParent() << /*anonymous union*/1;
+            << !!ICI << MD->getParent() << /*anonymous union*/1;
         return true;
       }
 
@@ -5745,7 +6662,7 @@
     if (Diagnose)
       S.Diag(MD->getParent()->getLocation(),
              diag::note_deleted_default_ctor_all_const)
-        << MD->getParent() << /*not anonymous union*/0;
+        << !!ICI << MD->getParent() << /*not anonymous union*/0;
     return true;
   }
   return false;
@@ -5755,6 +6672,7 @@
 /// deleted, as specified in C++11 [class.ctor]p5, C++11 [class.copy]p11,
 /// C++11 [class.copy]p23, and C++11 [class.dtor]p5.
 bool Sema::ShouldDeleteSpecialMember(CXXMethodDecl *MD, CXXSpecialMember CSM,
+                                     InheritedConstructorInfo *ICI,
                                      bool Diagnose) {
   if (MD->isInvalidDecl())
     return false;
@@ -5844,7 +6762,7 @@
     }
   }
 
-  SpecialMemberDeletionInfo SMI(*this, MD, CSM, Diagnose);
+  SpecialMemberDeletionInfo SMI(*this, MD, CSM, ICI, Diagnose);
 
   for (auto &BI : RD->bases())
     if (!BI.isVirtual() &&
@@ -6452,27 +7370,33 @@
 /// [special]p1).  This routine can only be executed just before the
 /// definition of the class is complete.
 void Sema::AddImplicitlyDeclaredMembersToClass(CXXRecordDecl *ClassDecl) {
-  if (!ClassDecl->hasUserDeclaredConstructor())
+  if (ClassDecl->needsImplicitDefaultConstructor()) {
     ++ASTContext::NumImplicitDefaultConstructors;
 
-  if (!ClassDecl->hasUserDeclaredCopyConstructor()) {
+    if (ClassDecl->hasInheritedConstructor())
+      DeclareImplicitDefaultConstructor(ClassDecl);
+  }
+
+  if (ClassDecl->needsImplicitCopyConstructor()) {
     ++ASTContext::NumImplicitCopyConstructors;
 
     // If the properties or semantics of the copy constructor couldn't be
     // determined while the class was being declared, force a declaration
     // of it now.
-    if (ClassDecl->needsOverloadResolutionForCopyConstructor())
+    if (ClassDecl->needsOverloadResolutionForCopyConstructor() ||
+        ClassDecl->hasInheritedConstructor())
       DeclareImplicitCopyConstructor(ClassDecl);
   }
 
   if (getLangOpts().CPlusPlus11 && ClassDecl->needsImplicitMoveConstructor()) {
     ++ASTContext::NumImplicitMoveConstructors;
 
-    if (ClassDecl->needsOverloadResolutionForMoveConstructor())
+    if (ClassDecl->needsOverloadResolutionForMoveConstructor() ||
+        ClassDecl->hasInheritedConstructor())
       DeclareImplicitMoveConstructor(ClassDecl);
   }
 
-  if (!ClassDecl->hasUserDeclaredCopyAssignment()) {
+  if (ClassDecl->needsImplicitCopyAssignment()) {
     ++ASTContext::NumImplicitCopyAssignmentOperators;
 
     // If we have a dynamic class, then the copy assignment operator may be
@@ -6480,7 +7404,8 @@
     // it shows up in the right place in the vtable and that we diagnose
     // problems with the implicit exception specification.
     if (ClassDecl->isDynamicClass() ||
-        ClassDecl->needsOverloadResolutionForCopyAssignment())
+        ClassDecl->needsOverloadResolutionForCopyAssignment() ||
+        ClassDecl->hasInheritedAssignment())
       DeclareImplicitCopyAssignment(ClassDecl);
   }
 
@@ -6489,11 +7414,12 @@
 
     // Likewise for the move assignment operator.
     if (ClassDecl->isDynamicClass() ||
-        ClassDecl->needsOverloadResolutionForMoveAssignment())
+        ClassDecl->needsOverloadResolutionForMoveAssignment() ||
+        ClassDecl->hasInheritedAssignment())
       DeclareImplicitMoveAssignment(ClassDecl);
   }
 
-  if (!ClassDecl->hasUserDeclaredDestructor()) {
+  if (ClassDecl->needsImplicitDestructor()) {
     ++ASTContext::NumImplicitDestructors;
 
     // If we have a dynamic class, then the destructor may be virtual, so we
@@ -7740,7 +8666,7 @@
   // function will silently decide not to build a shadow decl, which
   // will pre-empt further diagnostics.
   //
-  // We don't need to do this in C++0x because we do the check once on
+  // We don't need to do this in C++11 because we do the check once on
   // the qualifier.
   //
   // FIXME: diagnose the following if we care enough:
@@ -7871,12 +8797,21 @@
   return true;
 }
 
+/// Determine whether a direct base class is a virtual base class.
+static bool isVirtualDirectBase(CXXRecordDecl *Derived, CXXRecordDecl *Base) {
+  if (!Derived->getNumVBases())
+    return false;
+  for (auto &B : Derived->bases())
+    if (B.getType()->getAsCXXRecordDecl() == Base)
+      return B.isVirtual();
+  llvm_unreachable("not a direct base class");
+}
+
 /// Builds a shadow declaration corresponding to a 'using' declaration.
 UsingShadowDecl *Sema::BuildUsingShadowDecl(Scope *S,
                                             UsingDecl *UD,
                                             NamedDecl *Orig,
                                             UsingShadowDecl *PrevDecl) {
-
   // If we resolved to another shadow declaration, just coalesce them.
   NamedDecl *Target = Orig;
   if (isa<UsingShadowDecl>(Target)) {
@@ -7884,9 +8819,21 @@
     assert(!isa<UsingShadowDecl>(Target) && "nested shadow declaration");
   }
 
-  UsingShadowDecl *Shadow
-    = UsingShadowDecl::Create(Context, CurContext,
-                              UD->getLocation(), UD, Target);
+  NamedDecl *NonTemplateTarget = Target;
+  if (auto *TargetTD = dyn_cast<TemplateDecl>(Target))
+    NonTemplateTarget = TargetTD->getTemplatedDecl();
+
+  UsingShadowDecl *Shadow;
+  if (isa<CXXConstructorDecl>(NonTemplateTarget)) {
+    bool IsVirtualBase =
+        isVirtualDirectBase(cast<CXXRecordDecl>(CurContext),
+                            UD->getQualifier()->getAsRecordDecl());
+    Shadow = ConstructorUsingShadowDecl::Create(
+        Context, CurContext, UD->getLocation(), UD, Orig, IsVirtualBase);
+  } else {
+    Shadow = UsingShadowDecl::Create(Context, CurContext, UD->getLocation(), UD,
+                                     Target);
+  }
   UD->addShadowDecl(Shadow);
 
   Shadow->setAccess(UD->getAccess());
@@ -7988,6 +8935,9 @@
     if (Candidate.WillReplaceSpecifier() && !Candidate.getCorrectionSpecifier())
       return false;
 
+    // FIXME: Don't correct to a name that CheckUsingDeclRedeclaration would
+    // reject.
+
     if (RequireMemberOf) {
       auto *FoundRecord = dyn_cast<CXXRecordDecl>(ND);
       if (FoundRecord && FoundRecord->isInjectedClassName()) {
@@ -8068,8 +9018,17 @@
     return nullptr;
   }
 
+  // For an inheriting constructor declaration, the name of the using
+  // declaration is the name of a constructor in this class, not in the
+  // base class.
+  DeclarationNameInfo UsingName = NameInfo;
+  if (UsingName.getName().getNameKind() == DeclarationName::CXXConstructorName)
+    if (auto *RD = dyn_cast<CXXRecordDecl>(CurContext))
+      UsingName.setName(Context.DeclarationNames.getCXXConstructorName(
+          Context.getCanonicalType(Context.getRecordType(RD))));
+
   // Do the redeclaration lookup in the current scope.
-  LookupResult Previous(*this, NameInfo, LookupUsingDeclName,
+  LookupResult Previous(*this, UsingName, LookupUsingDeclName,
                         ForRedeclaration);
   Previous.setHideTags(false);
   if (S) {
@@ -8126,8 +9085,8 @@
 
   auto Build = [&](bool Invalid) {
     UsingDecl *UD =
-        UsingDecl::Create(Context, CurContext, UsingLoc, QualifierLoc, NameInfo,
-                          HasTypenameKeyword);
+        UsingDecl::Create(Context, CurContext, UsingLoc, QualifierLoc,
+                          UsingName, HasTypenameKeyword);
     UD->setAccess(AS);
     CurContext->addDecl(UD);
     UD->setInvalidDecl(Invalid);
@@ -8182,6 +9141,9 @@
       // If we corrected to an inheriting constructor, handle it as one.
       auto *RD = dyn_cast<CXXRecordDecl>(ND);
       if (RD && RD->isInjectedClassName()) {
+        // The parent of the injected class name is the class itself.
+        RD = cast<CXXRecordDecl>(RD->getParent());
+
         // Fix up the information we'll use to build the using declaration.
         if (Corrected.WillReplaceSpecifier()) {
           NestedNameSpecifierLocBuilder Builder;
@@ -8190,13 +9152,19 @@
           QualifierLoc = Builder.getWithLocInContext(Context);
         }
 
-        NameInfo.setName(Context.DeclarationNames.getCXXConstructorName(
-            Context.getCanonicalType(Context.getRecordType(RD))));
-        NameInfo.setNamedTypeInfo(nullptr);
+        // In this case, the name we introduce is the name of a derived class
+        // constructor.
+        auto *CurClass = cast<CXXRecordDecl>(CurContext);
+        UsingName.setName(Context.DeclarationNames.getCXXConstructorName(
+            Context.getCanonicalType(Context.getRecordType(CurClass))));
+        UsingName.setNamedTypeInfo(nullptr);
         for (auto *Ctor : LookupConstructors(RD))
           R.addDecl(Ctor);
+        R.resolveKind();
       } else {
-        // FIXME: Pick up all the declarations if we found an overloaded function.
+        // FIXME: Pick up all the declarations if we found an overloaded
+        // function.
+        UsingName.setName(ND->getDeclName());
         R.addDecl(ND);
       }
     } else {
@@ -8229,7 +9197,7 @@
     }
   }
 
-  // C++0x N2914 [namespace.udecl]p6:
+  // C++14 [namespace.udecl]p6:
   // A using-declaration shall not name a namespace.
   if (R.getAsSingle<NamespaceDecl>()) {
     Diag(IdentLoc, diag::err_using_decl_can_not_refer_to_namespace)
@@ -8237,19 +9205,28 @@
     return BuildInvalid();
   }
 
+  // C++14 [namespace.udecl]p7:
+  // A using-declaration shall not name a scoped enumerator.
+  if (auto *ED = R.getAsSingle<EnumConstantDecl>()) {
+    if (cast<EnumDecl>(ED->getDeclContext())->isScoped()) {
+      Diag(IdentLoc, diag::err_using_decl_can_not_refer_to_scoped_enum)
+        << SS.getRange();
+      return BuildInvalid();
+    }
+  }
+
   UsingDecl *UD = BuildValid();
 
-  // The normal rules do not apply to inheriting constructor declarations.
-  if (NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName) {
+  // Some additional rules apply to inheriting constructors.
+  if (UsingName.getName().getNameKind() ==
+        DeclarationName::CXXConstructorName) {
     // Suppress access diagnostics; the access check is instead performed at the
     // point of use for an inheriting constructor.
     R.suppressDiagnostics();
-    CheckInheritingConstructorUsingDecl(UD);
-    return UD;
+    if (CheckInheritingConstructorUsingDecl(UD))
+      return UD;
   }
 
-  // Otherwise, look up the target name.
-
   for (LookupResult::iterator I = R.begin(), E = R.end(); I != E; ++I) {
     UsingShadowDecl *PrevDecl = nullptr;
     if (!CheckUsingShadowDecl(UD, *I, Previous, PrevDecl))
@@ -8361,8 +9338,10 @@
 
     // If we weren't able to compute a valid scope, it must be a
     // dependent class scope.
-    if (!NamedContext || NamedContext->isRecord()) {
-      auto *RD = dyn_cast_or_null<CXXRecordDecl>(NamedContext);
+    if (!NamedContext || NamedContext->getRedeclContext()->isRecord()) {
+      auto *RD = NamedContext
+                     ? cast<CXXRecordDecl>(NamedContext->getRedeclContext())
+                     : nullptr;
       if (RD && RequireCompleteDeclContext(const_cast<CXXScopeSpec&>(SS), RD))
         RD = nullptr;
 
@@ -8411,6 +9390,20 @@
         Diag(UsingLoc, diag::note_using_decl_class_member_workaround)
           << 2 // reference declaration
           << FixIt;
+      } else if (R.getAsSingle<EnumConstantDecl>()) {
+        // Don't provide a fixit outside C++11 mode; we don't want to suggest
+        // repeating the type of the enumeration here, and we can't do so if
+        // the type is anonymous.
+        FixItHint FixIt;
+        if (getLangOpts().CPlusPlus11) {
+          // Convert 'using X::Y;' to 'constexpr auto Y = X::Y;'.
+          FixIt = FixItHint::CreateReplacement(
+              UsingLoc, "constexpr auto " + NameInfo.getName().getAsString() + " = ");
+        }
+
+        Diag(UsingLoc, diag::note_using_decl_class_member_workaround)
+          << (getLangOpts().CPlusPlus11 ? 4 : 3) // const[expr] variable
+          << FixIt;
       }
       return true;
     }
@@ -8446,7 +9439,7 @@
     return true;
 
   if (getLangOpts().CPlusPlus11) {
-    // C++0x [namespace.udecl]p3:
+    // C++11 [namespace.udecl]p3:
     //   In a using-declaration used as a member-declaration, the
     //   nested-name-specifier shall name a base class of the class
     //   being defined.
@@ -8588,6 +9581,10 @@
     }
     TemplateParameterList *TemplateParams = TemplateParamLists[0];
 
+    // Check that we can declare a template here.
+    if (CheckTemplateDeclScope(S, TemplateParams))
+      return nullptr;
+
     // Only consider previous declarations in the same scope.
     FilterLookupForScope(Previous, CurContext, S, /*ConsiderLinkage*/false,
                          /*ExplicitInstantiationOrSpecialization*/false);
@@ -8659,9 +9656,7 @@
     NewND = NewTD;
   }
 
-  if (!Redeclaration)
-    PushOnScopeChains(NewND, S);
-
+  PushOnScopeChains(NewND, S);
   ActOnDocumentableDecl(NewND);
   return NewND;
 }
@@ -8805,7 +9800,8 @@
 }
 
 Sema::ImplicitExceptionSpecification
-Sema::ComputeInheritingCtorExceptionSpec(CXXConstructorDecl *CD) {
+Sema::ComputeInheritingCtorExceptionSpec(SourceLocation Loc,
+                                         CXXConstructorDecl *CD) {
   CXXRecordDecl *ClassDecl = CD->getParent();
 
   // C++ [except.spec]p14:
@@ -8814,36 +9810,26 @@
   if (ClassDecl->isInvalidDecl())
     return ExceptSpec;
 
-  // Inherited constructor.
-  const CXXConstructorDecl *InheritedCD = CD->getInheritedConstructor();
-  const CXXRecordDecl *InheritedDecl = InheritedCD->getParent();
-  // FIXME: Copying or moving the parameters could add extra exceptions to the
-  // set, as could the default arguments for the inherited constructor. This
-  // will be addressed when we implement the resolution of core issue 1351.
-  ExceptSpec.CalledDecl(CD->getLocStart(), InheritedCD);
+  auto Inherited = CD->getInheritedConstructor();
+  InheritedConstructorInfo ICI(*this, Loc, Inherited.getShadowDecl());
 
-  // Direct base-class constructors.
-  for (const auto &B : ClassDecl->bases()) {
-    if (B.isVirtual()) // Handled below.
-      continue;
-
-    if (const RecordType *BaseType = B.getType()->getAs<RecordType>()) {
-      CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
-      if (BaseClassDecl == InheritedDecl)
+  // Direct and virtual base-class constructors.
+  for (bool VBase : {false, true}) {
+    for (CXXBaseSpecifier &B :
+         VBase ? ClassDecl->vbases() : ClassDecl->bases()) {
+      // Don't visit direct vbases twice.
+      if (B.isVirtual() != VBase)
         continue;
-      CXXConstructorDecl *Constructor = LookupDefaultConstructor(BaseClassDecl);
-      if (Constructor)
-        ExceptSpec.CalledDecl(B.getLocStart(), Constructor);
-    }
-  }
 
-  // Virtual base-class constructors.
-  for (const auto &B : ClassDecl->vbases()) {
-    if (const RecordType *BaseType = B.getType()->getAs<RecordType>()) {
-      CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
-      if (BaseClassDecl == InheritedDecl)
+      CXXRecordDecl *BaseClass = B.getType()->getAsCXXRecordDecl();
+      if (!BaseClass)
         continue;
-      CXXConstructorDecl *Constructor = LookupDefaultConstructor(BaseClassDecl);
+
+      CXXConstructorDecl *Constructor =
+          ICI.findConstructorForBase(BaseClass, Inherited.getConstructor())
+              .first;
+      if (!Constructor)
+        Constructor = LookupDefaultConstructor(BaseClass);
       if (Constructor)
         ExceptSpec.CalledDecl(B.getLocStart(), Constructor);
     }
@@ -8871,10 +9857,11 @@
 struct DeclaringSpecialMember {
   Sema &S;
   Sema::SpecialMemberDecl D;
+  Sema::ContextRAII SavedContext;
   bool WasAlreadyBeingDeclared;
 
   DeclaringSpecialMember(Sema &S, CXXRecordDecl *RD, Sema::CXXSpecialMember CSM)
-    : S(S), D(RD, CSM) {
+    : S(S), D(RD, CSM), SavedContext(S, RD) {
     WasAlreadyBeingDeclared = !S.SpecialMembersBeingDeclared.insert(D).second;
     if (WasAlreadyBeingDeclared)
       // This almost never happens, but if it does, ensure that our cache
@@ -8896,6 +9883,21 @@
 };
 }
 
+void Sema::CheckImplicitSpecialMemberDeclaration(Scope *S, FunctionDecl *FD) {
+  // Look up any existing declarations, but don't trigger declaration of all
+  // implicit special members with this name.
+  DeclarationName Name = FD->getDeclName();
+  LookupResult R(*this, Name, SourceLocation(), LookupOrdinaryName,
+                 ForRedeclaration);
+  for (auto *D : FD->getParent()->lookup(Name))
+    if (auto *Acceptable = R.getAcceptableDecl(D))
+      R.addDecl(Acceptable);
+  R.resolveKind();
+  R.suppressDiagnostics();
+
+  CheckFunctionDeclaration(S, FD, R, /*IsExplicitSpecialization*/false);
+}
+
 CXXConstructorDecl *Sema::DeclareImplicitDefaultConstructor(
                                                      CXXRecordDecl *ClassDecl) {
   // C++ [class.ctor]p5:
@@ -8944,13 +9946,16 @@
   // constructors is easy to compute.
   DefaultCon->setTrivial(ClassDecl->hasTrivialDefaultConstructor());
 
-  if (ShouldDeleteSpecialMember(DefaultCon, CXXDefaultConstructor))
-    SetDeclDeleted(DefaultCon, ClassLoc);
-
   // Note that we have declared this constructor.
   ++ASTContext::NumImplicitDefaultConstructorsDeclared;
 
-  if (Scope *S = getScopeForContext(ClassDecl))
+  Scope *S = getScopeForContext(ClassDecl);
+  CheckImplicitSpecialMemberDeclaration(S, DefaultCon);
+
+  if (ShouldDeleteSpecialMember(DefaultCon, CXXDefaultConstructor))
+    SetDeclDeleted(DefaultCon, ClassLoc);
+
+  if (S)
     PushOnScopeChains(DefaultCon, S, false);
   ClassDecl->addDecl(DefaultCon);
 
@@ -9002,304 +10007,94 @@
   CheckDelayedMemberExceptionSpecs();
 }
 
-namespace {
-/// Information on inheriting constructors to declare.
-class InheritingConstructorInfo {
-public:
-  InheritingConstructorInfo(Sema &SemaRef, CXXRecordDecl *Derived)
-      : SemaRef(SemaRef), Derived(Derived) {
-    // Mark the constructors that we already have in the derived class.
-    //
-    // C++11 [class.inhctor]p3: [...] a constructor is implicitly declared [...]
-    //   unless there is a user-declared constructor with the same signature in
-    //   the class where the using-declaration appears.
-    visitAll(Derived, &InheritingConstructorInfo::noteDeclaredInDerived);
-  }
+/// Find or create the fake constructor we synthesize to model constructing an
+/// object of a derived class via a constructor of a base class.
+CXXConstructorDecl *
+Sema::findInheritingConstructor(SourceLocation Loc,
+                                CXXConstructorDecl *BaseCtor,
+                                ConstructorUsingShadowDecl *Shadow) {
+  CXXRecordDecl *Derived = Shadow->getParent();
+  SourceLocation UsingLoc = Shadow->getLocation();
 
-  void inheritAll(CXXRecordDecl *RD) {
-    visitAll(RD, &InheritingConstructorInfo::inherit);
-  }
+  // FIXME: Add a new kind of DeclarationName for an inherited constructor.
+  // For now we use the name of the base class constructor as a member of the
+  // derived class to indicate a (fake) inherited constructor name.
+  DeclarationName Name = BaseCtor->getDeclName();
 
-private:
-  /// Information about an inheriting constructor.
-  struct InheritingConstructor {
-    InheritingConstructor()
-      : DeclaredInDerived(false), BaseCtor(nullptr), DerivedCtor(nullptr) {}
+  // Check to see if we already have a fake constructor for this inherited
+  // constructor call.
+  for (NamedDecl *Ctor : Derived->lookup(Name))
+    if (declaresSameEntity(cast<CXXConstructorDecl>(Ctor)
+                               ->getInheritedConstructor()
+                               .getConstructor(),
+                           BaseCtor))
+      return cast<CXXConstructorDecl>(Ctor);
 
-    /// If \c true, a constructor with this signature is already declared
-    /// in the derived class.
-    bool DeclaredInDerived;
+  DeclarationNameInfo NameInfo(Name, UsingLoc);
+  TypeSourceInfo *TInfo =
+      Context.getTrivialTypeSourceInfo(BaseCtor->getType(), UsingLoc);
+  FunctionProtoTypeLoc ProtoLoc =
+      TInfo->getTypeLoc().IgnoreParens().castAs<FunctionProtoTypeLoc>();
 
-    /// The constructor which is inherited.
-    const CXXConstructorDecl *BaseCtor;
+  // Check the inherited constructor is valid and find the list of base classes
+  // from which it was inherited.
+  InheritedConstructorInfo ICI(*this, Loc, Shadow);
 
-    /// The derived constructor we declared.
-    CXXConstructorDecl *DerivedCtor;
-  };
+  bool Constexpr =
+      BaseCtor->isConstexpr() &&
+      defaultedSpecialMemberIsConstexpr(*this, Derived, CXXDefaultConstructor,
+                                        false, BaseCtor, &ICI);
 
-  /// Inheriting constructors with a given canonical type. There can be at
-  /// most one such non-template constructor, and any number of templated
-  /// constructors.
-  struct InheritingConstructorsForType {
-    InheritingConstructor NonTemplate;
-    SmallVector<std::pair<TemplateParameterList *, InheritingConstructor>, 4>
-        Templates;
+  CXXConstructorDecl *DerivedCtor = CXXConstructorDecl::Create(
+      Context, Derived, UsingLoc, NameInfo, TInfo->getType(), TInfo,
+      BaseCtor->isExplicit(), /*Inline=*/true,
+      /*ImplicitlyDeclared=*/true, Constexpr,
+      InheritedConstructor(Shadow, BaseCtor));
+  if (Shadow->isInvalidDecl())
+    DerivedCtor->setInvalidDecl();
 
-    InheritingConstructor &getEntry(Sema &S, const CXXConstructorDecl *Ctor) {
-      if (FunctionTemplateDecl *FTD = Ctor->getDescribedFunctionTemplate()) {
-        TemplateParameterList *ParamList = FTD->getTemplateParameters();
-        for (unsigned I = 0, N = Templates.size(); I != N; ++I)
-          if (S.TemplateParameterListsAreEqual(ParamList, Templates[I].first,
-                                               false, S.TPL_TemplateMatch))
-            return Templates[I].second;
-        Templates.push_back(std::make_pair(ParamList, InheritingConstructor()));
-        return Templates.back().second;
-      }
+  // Build an unevaluated exception specification for this fake constructor.
+  const FunctionProtoType *FPT = TInfo->getType()->castAs<FunctionProtoType>();
+  FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
+  EPI.ExceptionSpec.Type = EST_Unevaluated;
+  EPI.ExceptionSpec.SourceDecl = DerivedCtor;
+  DerivedCtor->setType(Context.getFunctionType(FPT->getReturnType(),
+                                               FPT->getParamTypes(), EPI));
 
-      return NonTemplate;
-    }
-  };
-
-  /// Get or create the inheriting constructor record for a constructor.
-  InheritingConstructor &getEntry(const CXXConstructorDecl *Ctor,
-                                  QualType CtorType) {
-    return Map[CtorType.getCanonicalType()->castAs<FunctionProtoType>()]
-        .getEntry(SemaRef, Ctor);
-  }
-
-  typedef void (InheritingConstructorInfo::*VisitFn)(const CXXConstructorDecl*);
-
-  /// Process all constructors for a class.
-  void visitAll(const CXXRecordDecl *RD, VisitFn Callback) {
-    for (const auto *Ctor : RD->ctors())
-      (this->*Callback)(Ctor);
-    for (CXXRecordDecl::specific_decl_iterator<FunctionTemplateDecl>
-             I(RD->decls_begin()), E(RD->decls_end());
-         I != E; ++I) {
-      const FunctionDecl *FD = (*I)->getTemplatedDecl();
-      if (const CXXConstructorDecl *CD = dyn_cast<CXXConstructorDecl>(FD))
-        (this->*Callback)(CD);
-    }
-  }
-
-  /// Note that a constructor (or constructor template) was declared in Derived.
-  void noteDeclaredInDerived(const CXXConstructorDecl *Ctor) {
-    getEntry(Ctor, Ctor->getType()).DeclaredInDerived = true;
-  }
-
-  /// Inherit a single constructor.
-  void inherit(const CXXConstructorDecl *Ctor) {
-    const FunctionProtoType *CtorType =
-        Ctor->getType()->castAs<FunctionProtoType>();
-    ArrayRef<QualType> ArgTypes = CtorType->getParamTypes();
-    FunctionProtoType::ExtProtoInfo EPI = CtorType->getExtProtoInfo();
-
-    SourceLocation UsingLoc = getUsingLoc(Ctor->getParent());
-
-    // Core issue (no number yet): the ellipsis is always discarded.
-    if (EPI.Variadic) {
-      SemaRef.Diag(UsingLoc, diag::warn_using_decl_constructor_ellipsis);
-      SemaRef.Diag(Ctor->getLocation(),
-                   diag::note_using_decl_constructor_ellipsis);
-      EPI.Variadic = false;
-    }
-
-    // Declare a constructor for each number of parameters.
-    //
-    // C++11 [class.inhctor]p1:
-    //   The candidate set of inherited constructors from the class X named in
-    //   the using-declaration consists of [... modulo defects ...] for each
-    //   constructor or constructor template of X, the set of constructors or
-    //   constructor templates that results from omitting any ellipsis parameter
-    //   specification and successively omitting parameters with a default
-    //   argument from the end of the parameter-type-list
-    unsigned MinParams = minParamsToInherit(Ctor);
-    unsigned Params = Ctor->getNumParams();
-    if (Params >= MinParams) {
-      do
-        declareCtor(UsingLoc, Ctor,
-                    SemaRef.Context.getFunctionType(
-                        Ctor->getReturnType(), ArgTypes.slice(0, Params), EPI));
-      while (Params > MinParams &&
-             Ctor->getParamDecl(--Params)->hasDefaultArg());
-    }
-  }
-
-  /// Find the using-declaration which specified that we should inherit the
-  /// constructors of \p Base.
-  SourceLocation getUsingLoc(const CXXRecordDecl *Base) {
-    // No fancy lookup required; just look for the base constructor name
-    // directly within the derived class.
-    ASTContext &Context = SemaRef.Context;
-    DeclarationName Name = Context.DeclarationNames.getCXXConstructorName(
-        Context.getCanonicalType(Context.getRecordType(Base)));
-    DeclContext::lookup_result Decls = Derived->lookup(Name);
-    return Decls.empty() ? Derived->getLocation() : Decls[0]->getLocation();
-  }
-
-  unsigned minParamsToInherit(const CXXConstructorDecl *Ctor) {
-    // C++11 [class.inhctor]p3:
-    //   [F]or each constructor template in the candidate set of inherited
-    //   constructors, a constructor template is implicitly declared
-    if (Ctor->getDescribedFunctionTemplate())
-      return 0;
-
-    //   For each non-template constructor in the candidate set of inherited
-    //   constructors other than a constructor having no parameters or a
-    //   copy/move constructor having a single parameter, a constructor is
-    //   implicitly declared [...]
-    if (Ctor->getNumParams() == 0)
-      return 1;
-    if (Ctor->isCopyOrMoveConstructor())
-      return 2;
-
-    // Per discussion on core reflector, never inherit a constructor which
-    // would become a default, copy, or move constructor of Derived either.
-    const ParmVarDecl *PD = Ctor->getParamDecl(0);
-    const ReferenceType *RT = PD->getType()->getAs<ReferenceType>();
-    return (RT && RT->getPointeeCXXRecordDecl() == Derived) ? 2 : 1;
-  }
-
-  /// Declare a single inheriting constructor, inheriting the specified
-  /// constructor, with the given type.
-  void declareCtor(SourceLocation UsingLoc, const CXXConstructorDecl *BaseCtor,
-                   QualType DerivedType) {
-    InheritingConstructor &Entry = getEntry(BaseCtor, DerivedType);
-
-    // C++11 [class.inhctor]p3:
-    //   ... a constructor is implicitly declared with the same constructor
-    //   characteristics unless there is a user-declared constructor with
-    //   the same signature in the class where the using-declaration appears
-    if (Entry.DeclaredInDerived)
-      return;
-
-    // C++11 [class.inhctor]p7:
-    //   If two using-declarations declare inheriting constructors with the
-    //   same signature, the program is ill-formed
-    if (Entry.DerivedCtor) {
-      if (BaseCtor->getParent() != Entry.BaseCtor->getParent()) {
-        // Only diagnose this once per constructor.
-        if (Entry.DerivedCtor->isInvalidDecl())
-          return;
-        Entry.DerivedCtor->setInvalidDecl();
-
-        SemaRef.Diag(UsingLoc, diag::err_using_decl_constructor_conflict);
-        SemaRef.Diag(BaseCtor->getLocation(),
-                     diag::note_using_decl_constructor_conflict_current_ctor);
-        SemaRef.Diag(Entry.BaseCtor->getLocation(),
-                     diag::note_using_decl_constructor_conflict_previous_ctor);
-        SemaRef.Diag(Entry.DerivedCtor->getLocation(),
-                     diag::note_using_decl_constructor_conflict_previous_using);
-      } else {
-        // Core issue (no number): if the same inheriting constructor is
-        // produced by multiple base class constructors from the same base
-        // class, the inheriting constructor is defined as deleted.
-        SemaRef.SetDeclDeleted(Entry.DerivedCtor, UsingLoc);
-      }
-
-      return;
-    }
-
-    ASTContext &Context = SemaRef.Context;
-    DeclarationName Name = Context.DeclarationNames.getCXXConstructorName(
-        Context.getCanonicalType(Context.getRecordType(Derived)));
-    DeclarationNameInfo NameInfo(Name, UsingLoc);
-
-    TemplateParameterList *TemplateParams = nullptr;
-    if (const FunctionTemplateDecl *FTD =
-            BaseCtor->getDescribedFunctionTemplate()) {
-      TemplateParams = FTD->getTemplateParameters();
-      // We're reusing template parameters from a different DeclContext. This
-      // is questionable at best, but works out because the template depth in
-      // both places is guaranteed to be 0.
-      // FIXME: Rebuild the template parameters in the new context, and
-      // transform the function type to refer to them.
-    }
-
-    // Build type source info pointing at the using-declaration. This is
-    // required by template instantiation.
+  // Build the parameter declarations.
+  SmallVector<ParmVarDecl *, 16> ParamDecls;
+  for (unsigned I = 0, N = FPT->getNumParams(); I != N; ++I) {
     TypeSourceInfo *TInfo =
-        Context.getTrivialTypeSourceInfo(DerivedType, UsingLoc);
-    FunctionProtoTypeLoc ProtoLoc =
-        TInfo->getTypeLoc().IgnoreParens().castAs<FunctionProtoTypeLoc>();
-
-    CXXConstructorDecl *DerivedCtor = CXXConstructorDecl::Create(
-        Context, Derived, UsingLoc, NameInfo, DerivedType,
-        TInfo, BaseCtor->isExplicit(), /*Inline=*/true,
-        /*ImplicitlyDeclared=*/true, /*Constexpr=*/BaseCtor->isConstexpr());
-
-    // Build an unevaluated exception specification for this constructor.
-    const FunctionProtoType *FPT = DerivedType->castAs<FunctionProtoType>();
-    FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
-    EPI.ExceptionSpec.Type = EST_Unevaluated;
-    EPI.ExceptionSpec.SourceDecl = DerivedCtor;
-    DerivedCtor->setType(Context.getFunctionType(FPT->getReturnType(),
-                                                 FPT->getParamTypes(), EPI));
-
-    // Build the parameter declarations.
-    SmallVector<ParmVarDecl *, 16> ParamDecls;
-    for (unsigned I = 0, N = FPT->getNumParams(); I != N; ++I) {
-      TypeSourceInfo *TInfo =
-          Context.getTrivialTypeSourceInfo(FPT->getParamType(I), UsingLoc);
-      ParmVarDecl *PD = ParmVarDecl::Create(
-          Context, DerivedCtor, UsingLoc, UsingLoc, /*IdentifierInfo=*/nullptr,
-          FPT->getParamType(I), TInfo, SC_None, /*DefaultArg=*/nullptr);
-      PD->setScopeInfo(0, I);
-      PD->setImplicit();
-      ParamDecls.push_back(PD);
-      ProtoLoc.setParam(I, PD);
-    }
-
-    // Set up the new constructor.
-    DerivedCtor->setAccess(BaseCtor->getAccess());
-    DerivedCtor->setParams(ParamDecls);
-    DerivedCtor->setInheritedConstructor(BaseCtor);
-    if (BaseCtor->isDeleted())
-      SemaRef.SetDeclDeleted(DerivedCtor, UsingLoc);
-
-    // If this is a constructor template, build the template declaration.
-    if (TemplateParams) {
-      FunctionTemplateDecl *DerivedTemplate =
-          FunctionTemplateDecl::Create(SemaRef.Context, Derived, UsingLoc, Name,
-                                       TemplateParams, DerivedCtor);
-      DerivedTemplate->setAccess(BaseCtor->getAccess());
-      DerivedCtor->setDescribedFunctionTemplate(DerivedTemplate);
-      Derived->addDecl(DerivedTemplate);
-    } else {
-      Derived->addDecl(DerivedCtor);
-    }
-
-    Entry.BaseCtor = BaseCtor;
-    Entry.DerivedCtor = DerivedCtor;
+        Context.getTrivialTypeSourceInfo(FPT->getParamType(I), UsingLoc);
+    ParmVarDecl *PD = ParmVarDecl::Create(
+        Context, DerivedCtor, UsingLoc, UsingLoc, /*IdentifierInfo=*/nullptr,
+        FPT->getParamType(I), TInfo, SC_None, /*DefaultArg=*/nullptr);
+    PD->setScopeInfo(0, I);
+    PD->setImplicit();
+    // Ensure attributes are propagated onto parameters (this matters for
+    // format, pass_object_size, ...).
+    mergeDeclAttributes(PD, BaseCtor->getParamDecl(I));
+    ParamDecls.push_back(PD);
+    ProtoLoc.setParam(I, PD);
   }
 
-  Sema &SemaRef;
-  CXXRecordDecl *Derived;
-  typedef llvm::DenseMap<const Type *, InheritingConstructorsForType> MapType;
-  MapType Map;
-};
+  // Set up the new constructor.
+  assert(!BaseCtor->isDeleted() && "should not use deleted constructor");
+  DerivedCtor->setAccess(BaseCtor->getAccess());
+  DerivedCtor->setParams(ParamDecls);
+  Derived->addDecl(DerivedCtor);
+
+  if (ShouldDeleteSpecialMember(DerivedCtor, CXXDefaultConstructor, &ICI))
+    SetDeclDeleted(DerivedCtor, UsingLoc);
+
+  return DerivedCtor;
 }
 
-void Sema::DeclareInheritingConstructors(CXXRecordDecl *ClassDecl) {
-  // Defer declaring the inheriting constructors until the class is
-  // instantiated.
-  if (ClassDecl->isDependentContext())
-    return;
-
-  // Find base classes from which we might inherit constructors.
-  SmallVector<CXXRecordDecl*, 4> InheritedBases;
-  for (const auto &BaseIt : ClassDecl->bases())
-    if (BaseIt.getInheritConstructors())
-      InheritedBases.push_back(BaseIt.getType()->getAsCXXRecordDecl());
-
-  // Go no further if we're not inheriting any constructors.
-  if (InheritedBases.empty())
-    return;
-
-  // Declare the inherited constructors.
-  InheritingConstructorInfo ICI(*this, ClassDecl);
-  for (unsigned I = 0, N = InheritedBases.size(); I != N; ++I)
-    ICI.inheritAll(InheritedBases[I]);
+void Sema::NoteDeletedInheritingConstructor(CXXConstructorDecl *Ctor) {
+  InheritedConstructorInfo ICI(*this, Ctor->getLocation(),
+                               Ctor->getInheritedConstructor().getShadowDecl());
+  ShouldDeleteSpecialMember(Ctor, CXXDefaultConstructor, &ICI,
+                            /*Diagnose*/true);
 }
 
 void Sema::DefineInheritingConstructor(SourceLocation CurrentLocation,
@@ -9308,19 +10103,71 @@
   assert(Constructor->getInheritedConstructor() &&
          !Constructor->doesThisDeclarationHaveABody() &&
          !Constructor->isDeleted());
+  if (Constructor->isInvalidDecl())
+    return;
 
+  ConstructorUsingShadowDecl *Shadow =
+      Constructor->getInheritedConstructor().getShadowDecl();
+  CXXConstructorDecl *InheritedCtor =
+      Constructor->getInheritedConstructor().getConstructor();
+
+  // [class.inhctor.init]p1:
+  //   initialization proceeds as if a defaulted default constructor is used to
+  //   initialize the D object and each base class subobject from which the
+  //   constructor was inherited
+
+  InheritedConstructorInfo ICI(*this, CurrentLocation, Shadow);
+  CXXRecordDecl *RD = Shadow->getParent();
+  SourceLocation InitLoc = Shadow->getLocation();
+
+  // Initializations are performed "as if by a defaulted default constructor",
+  // so enter the appropriate scope.
   SynthesizedFunctionScope Scope(*this, Constructor);
   DiagnosticErrorTrap Trap(Diags);
-  if (SetCtorInitializers(Constructor, /*AnyErrors=*/false) ||
-      Trap.hasErrorOccurred()) {
-    Diag(CurrentLocation, diag::note_inhctor_synthesized_at)
-      << Context.getTagDeclType(ClassDecl);
+
+  // Build explicit initializers for all base classes from which the
+  // constructor was inherited.
+  SmallVector<CXXCtorInitializer*, 8> Inits;
+  for (bool VBase : {false, true}) {
+    for (CXXBaseSpecifier &B : VBase ? RD->vbases() : RD->bases()) {
+      if (B.isVirtual() != VBase)
+        continue;
+
+      auto *BaseRD = B.getType()->getAsCXXRecordDecl();
+      if (!BaseRD)
+        continue;
+
+      auto BaseCtor = ICI.findConstructorForBase(BaseRD, InheritedCtor);
+      if (!BaseCtor.first)
+        continue;
+
+      MarkFunctionReferenced(CurrentLocation, BaseCtor.first);
+      ExprResult Init = new (Context) CXXInheritedCtorInitExpr(
+          InitLoc, B.getType(), BaseCtor.first, VBase, BaseCtor.second);
+
+      auto *TInfo = Context.getTrivialTypeSourceInfo(B.getType(), InitLoc);
+      Inits.push_back(new (Context) CXXCtorInitializer(
+          Context, TInfo, VBase, InitLoc, Init.get(), InitLoc,
+          SourceLocation()));
+    }
+  }
+
+  // We now proceed as if for a defaulted default constructor, with the relevant
+  // initializers replaced.
+
+  bool HadError = SetCtorInitializers(Constructor, /*AnyErrors*/false, Inits);
+  if (HadError || Trap.hasErrorOccurred()) {
+    Diag(CurrentLocation, diag::note_inhctor_synthesized_at) << RD;
     Constructor->setInvalidDecl();
     return;
   }
 
-  SourceLocation Loc = Constructor->getLocation();
-  Constructor->setBody(new (Context) CompoundStmt(Loc));
+  // The exception specification is needed because we are defining the
+  // function.
+  ResolveExceptionSpec(CurrentLocation,
+                       Constructor->getType()->castAs<FunctionProtoType>());
+
+  Constructor->setBody(new (Context) CompoundStmt(InitLoc));
 
   Constructor->markUsed(Context);
   MarkVTableUsed(CurrentLocation, ClassDecl);
@@ -9328,8 +10175,9 @@
   if (ASTMutationListener *L = getASTMutationListener()) {
     L->CompletedImplicitDefinition(Constructor);
   }
-}
 
+  DiagnoseUninitializedFields(*this, Constructor);
+}
 
 Sema::ImplicitExceptionSpecification
 Sema::ComputeDefaultedDtorExceptionSpec(CXXMethodDecl *MD) {
@@ -9406,20 +10254,21 @@
   FunctionProtoType::ExtProtoInfo EPI = getImplicitMethodEPI(*this, Destructor);
   Destructor->setType(Context.getFunctionType(Context.VoidTy, None, EPI));
 
-  AddOverriddenMethods(ClassDecl, Destructor);
-
   // We don't need to use SpecialMemberIsTrivial here; triviality for
   // destructors is easy to compute.
   Destructor->setTrivial(ClassDecl->hasTrivialDestructor());
 
-  if (ShouldDeleteSpecialMember(Destructor, CXXDestructor))
-    SetDeclDeleted(Destructor, ClassLoc);
-
   // Note that we have declared this destructor.
   ++ASTContext::NumImplicitDestructorsDeclared;
 
+  Scope *S = getScopeForContext(ClassDecl);
+  CheckImplicitSpecialMemberDeclaration(S, Destructor);
+
+  if (ShouldDeleteSpecialMember(Destructor, CXXDestructor))
+    SetDeclDeleted(Destructor, ClassLoc);
+
   // Introduce this destructor into its scope.
-  if (Scope *S = getScopeForContext(ClassDecl))
+  if (S)
     PushOnScopeChains(Destructor, S, false);
   ClassDecl->addDecl(Destructor);
 
@@ -9542,6 +10391,10 @@
   if (RD && Context.getTargetInfo().getCXXABI().isMicrosoft())
     getDefaultArgExprsForConstructors(*this, RD);
 
+  referenceDLLExportedClassMethods();
+}
+
+void Sema::referenceDLLExportedClassMethods() {
   if (!DelayedDllExportClasses.empty()) {
     // Calling ReferenceDllExportedMethods might cause the current function to
     // be called again, so use a local copy of DelayedDllExportClasses.
@@ -9978,10 +10831,10 @@
                                     SizeType, VK_LValue, OK_Ordinary, Loc);
 
   // Construct the loop that copies all elements of this array.
-  return S.ActOnForStmt(Loc, Loc, InitStmt, 
-                        S.MakeFullExpr(Comparison),
-                        nullptr, S.MakeFullDiscardedValueExpr(Increment),
-                        Loc, Copy.get());
+  return S.ActOnForStmt(
+      Loc, Loc, InitStmt,
+      S.ActOnCondition(nullptr, Loc, Comparison, Sema::ConditionKind::Boolean),
+      S.MakeFullDiscardedValueExpr(Increment), Loc, Copy.get());
 }
 
 static StmtResult
@@ -10116,20 +10969,21 @@
                                                nullptr);
   CopyAssignment->setParams(FromParam);
 
-  AddOverriddenMethods(ClassDecl, CopyAssignment);
-
   CopyAssignment->setTrivial(
     ClassDecl->needsOverloadResolutionForCopyAssignment()
       ? SpecialMemberIsTrivial(CopyAssignment, CXXCopyAssignment)
       : ClassDecl->hasTrivialCopyAssignment());
 
-  if (ShouldDeleteSpecialMember(CopyAssignment, CXXCopyAssignment))
-    SetDeclDeleted(CopyAssignment, ClassLoc);
-
   // Note that we have added this copy-assignment operator.
   ++ASTContext::NumImplicitCopyAssignmentOperatorsDeclared;
 
-  if (Scope *S = getScopeForContext(ClassDecl))
+  Scope *S = getScopeForContext(ClassDecl);
+  CheckImplicitSpecialMemberDeclaration(S, CopyAssignment);
+
+  if (ShouldDeleteSpecialMember(CopyAssignment, CXXCopyAssignment))
+    SetDeclDeleted(CopyAssignment, ClassLoc);
+
+  if (S)
     PushOnScopeChains(CopyAssignment, S, false);
   ClassDecl->addDecl(CopyAssignment);
 
@@ -10507,22 +11361,23 @@
                                                nullptr);
   MoveAssignment->setParams(FromParam);
 
-  AddOverriddenMethods(ClassDecl, MoveAssignment);
-
   MoveAssignment->setTrivial(
     ClassDecl->needsOverloadResolutionForMoveAssignment()
       ? SpecialMemberIsTrivial(MoveAssignment, CXXMoveAssignment)
       : ClassDecl->hasTrivialMoveAssignment());
 
+  // Note that we have added this move-assignment operator.
+  ++ASTContext::NumImplicitMoveAssignmentOperatorsDeclared;
+
+  Scope *S = getScopeForContext(ClassDecl);
+  CheckImplicitSpecialMemberDeclaration(S, MoveAssignment);
+
   if (ShouldDeleteSpecialMember(MoveAssignment, CXXMoveAssignment)) {
     ClassDecl->setImplicitMoveAssignmentIsDeleted();
     SetDeclDeleted(MoveAssignment, ClassLoc);
   }
 
-  // Note that we have added this copy-assignment operator.
-  ++ASTContext::NumImplicitMoveAssignmentOperatorsDeclared;
-
-  if (Scope *S = getScopeForContext(ClassDecl))
+  if (S)
     PushOnScopeChains(MoveAssignment, S, false);
   ClassDecl->addDecl(MoveAssignment);
 
@@ -10948,13 +11803,16 @@
       ? SpecialMemberIsTrivial(CopyConstructor, CXXCopyConstructor)
       : ClassDecl->hasTrivialCopyConstructor());
 
-  if (ShouldDeleteSpecialMember(CopyConstructor, CXXCopyConstructor))
-    SetDeclDeleted(CopyConstructor, ClassLoc);
-
   // Note that we have declared this constructor.
   ++ASTContext::NumImplicitCopyConstructorsDeclared;
 
-  if (Scope *S = getScopeForContext(ClassDecl))
+  Scope *S = getScopeForContext(ClassDecl);
+  CheckImplicitSpecialMemberDeclaration(S, CopyConstructor);
+
+  if (ShouldDeleteSpecialMember(CopyConstructor, CXXCopyConstructor))
+    SetDeclDeleted(CopyConstructor, ClassLoc);
+
+  if (S)
     PushOnScopeChains(CopyConstructor, S, false);
   ClassDecl->addDecl(CopyConstructor);
 
@@ -11125,15 +11983,18 @@
       ? SpecialMemberIsTrivial(MoveConstructor, CXXMoveConstructor)
       : ClassDecl->hasTrivialMoveConstructor());
 
+  // Note that we have declared this constructor.
+  ++ASTContext::NumImplicitMoveConstructorsDeclared;
+
+  Scope *S = getScopeForContext(ClassDecl);
+  CheckImplicitSpecialMemberDeclaration(S, MoveConstructor);
+
   if (ShouldDeleteSpecialMember(MoveConstructor, CXXMoveConstructor)) {
     ClassDecl->setImplicitMoveConstructorIsDeleted();
     SetDeclDeleted(MoveConstructor, ClassLoc);
   }
 
-  // Note that we have declared this constructor.
-  ++ASTContext::NumImplicitMoveConstructorsDeclared;
-
-  if (Scope *S = getScopeForContext(ClassDecl))
+  if (S)
     PushOnScopeChains(MoveConstructor, S, false);
   ClassDecl->addDecl(MoveConstructor);
 
@@ -11338,6 +12199,7 @@
 
 ExprResult
 Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
+                            NamedDecl *FoundDecl,
                             CXXConstructorDecl *Constructor,
                             MultiExprArg ExprArgs,
                             bool HadMultipleCandidates,
@@ -11358,24 +12220,26 @@
   //       with the same cv-unqualified type, the copy/move operation
   //       can be omitted by constructing the temporary object
   //       directly into the target of the omitted copy/move
-  if (ConstructKind == CXXConstructExpr::CK_Complete &&
+  if (ConstructKind == CXXConstructExpr::CK_Complete && Constructor &&
       Constructor->isCopyOrMoveConstructor() && hasOneRealArgument(ExprArgs)) {
     Expr *SubExpr = ExprArgs[0];
-    Elidable = SubExpr->isTemporaryObject(Context, Constructor->getParent());
+    Elidable = SubExpr->isTemporaryObject(
+        Context, cast<CXXRecordDecl>(FoundDecl->getDeclContext()));
   }
 
-  return BuildCXXConstructExpr(ConstructLoc, DeclInitType, Constructor,
+  return BuildCXXConstructExpr(ConstructLoc, DeclInitType,
+                               FoundDecl, Constructor,
                                Elidable, ExprArgs, HadMultipleCandidates,
                                IsListInitialization,
                                IsStdInitListInitialization, RequiresZeroInit,
                                ConstructKind, ParenRange);
 }
 
-/// BuildCXXConstructExpr - Creates a complete call to a constructor,
-/// including handling of its default argument expressions.
 ExprResult
 Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
-                            CXXConstructorDecl *Constructor, bool Elidable,
+                            NamedDecl *FoundDecl,
+                            CXXConstructorDecl *Constructor,
+                            bool Elidable,
                             MultiExprArg ExprArgs,
                             bool HadMultipleCandidates,
                             bool IsListInitialization,
@@ -11383,11 +12247,43 @@
                             bool RequiresZeroInit,
                             unsigned ConstructKind,
                             SourceRange ParenRange) {
-  MarkFunctionReferenced(ConstructLoc, Constructor);
-  return CXXConstructExpr::Create(
-      Context, DeclInitType, ConstructLoc, Constructor, Elidable, ExprArgs,
+  if (auto *Shadow = dyn_cast<ConstructorUsingShadowDecl>(FoundDecl)) {
+    Constructor = findInheritingConstructor(ConstructLoc, Constructor, Shadow);
+    if (DiagnoseUseOfDecl(Constructor, ConstructLoc))
+      return ExprError(); 
+  }
+
+  return BuildCXXConstructExpr(
+      ConstructLoc, DeclInitType, Constructor, Elidable, ExprArgs,
       HadMultipleCandidates, IsListInitialization, IsStdInitListInitialization,
-      RequiresZeroInit,
+      RequiresZeroInit, ConstructKind, ParenRange);
+}
+
+/// BuildCXXConstructExpr - Creates a complete call to a constructor,
+/// including handling of its default argument expressions.
+ExprResult
+Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
+                            CXXConstructorDecl *Constructor,
+                            bool Elidable,
+                            MultiExprArg ExprArgs,
+                            bool HadMultipleCandidates,
+                            bool IsListInitialization,
+                            bool IsStdInitListInitialization,
+                            bool RequiresZeroInit,
+                            unsigned ConstructKind,
+                            SourceRange ParenRange) {
+  assert(declaresSameEntity(
+             Constructor->getParent(),
+             DeclInitType->getBaseElementTypeUnsafe()->getAsCXXRecordDecl()) &&
+         "given constructor for wrong type");
+  MarkFunctionReferenced(ConstructLoc, Constructor);
+  if (getLangOpts().CUDA && !CheckCUDACall(ConstructLoc, Constructor))
+    return ExprError();
+
+  return CXXConstructExpr::Create(
+      Context, DeclInitType, ConstructLoc, Constructor, Elidable,
+      ExprArgs, HadMultipleCandidates, IsListInitialization,
+      IsStdInitListInitialization, RequiresZeroInit,
       static_cast<CXXConstructExpr::ConstructionKind>(ConstructKind),
       ParenRange);
 }
@@ -11407,8 +12303,19 @@
     CXXRecordDecl *ClassPattern = ParentRD->getTemplateInstantiationPattern();
     DeclContext::lookup_result Lookup =
         ClassPattern->lookup(Field->getDeclName());
-    assert(Lookup.size() == 1);
-    FieldDecl *Pattern = cast<FieldDecl>(Lookup[0]);
+
+    // Lookup can return at most two results: the pattern for the field, or the
+    // injected class name of the parent record. No other member can have the
+    // same name as the field.
+    assert(!Lookup.empty() && Lookup.size() <= 2 &&
+           "more than two lookup results for field name");
+    FieldDecl *Pattern = dyn_cast<FieldDecl>(Lookup[0]);
+    if (!Pattern) {
+      assert(isa<CXXRecordDecl>(Lookup[0]) &&
+             "cannot have other non-field member with same name");
+      Pattern = cast<FieldDecl>(Lookup[1]);
+    }
+
     if (InstantiateInClassInitializer(Loc, Field, Pattern,
                                       getTemplateInstantiationArgs(Field)))
       return ExprError();
@@ -11669,7 +12576,7 @@
                   diag::err_operator_overload_static) << FnDecl->getDeclName();
   } else {
     bool ClassOrEnumParam = false;
-    for (auto Param : FnDecl->params()) {
+    for (auto Param : FnDecl->parameters()) {
       QualType ParamType = Param->getType().getNonReferenceType();
       if (ParamType->isDependentType() || ParamType->isRecordType() ||
           ParamType->isEnumeralType()) {
@@ -11691,7 +12598,7 @@
   // Only the function-call operator allows default arguments
   // (C++ [over.call]p1).
   if (Op != OO_Call) {
-    for (auto Param : FnDecl->params()) {
+    for (auto Param : FnDecl->parameters()) {
       if (Param->hasDefaultArg())
         return Diag(Param->getLocation(),
                     diag::err_operator_overload_default_arg)
@@ -11774,6 +12681,49 @@
   return false;
 }
 
+static bool
+checkLiteralOperatorTemplateParameterList(Sema &SemaRef,
+                                          FunctionTemplateDecl *TpDecl) {
+  TemplateParameterList *TemplateParams = TpDecl->getTemplateParameters();
+
+  // Must have one or two template parameters.
+  if (TemplateParams->size() == 1) {
+    NonTypeTemplateParmDecl *PmDecl =
+        dyn_cast<NonTypeTemplateParmDecl>(TemplateParams->getParam(0));
+
+    // The template parameter must be a char parameter pack.
+    if (PmDecl && PmDecl->isTemplateParameterPack() &&
+        SemaRef.Context.hasSameType(PmDecl->getType(), SemaRef.Context.CharTy))
+      return false;
+
+  } else if (TemplateParams->size() == 2) {
+    TemplateTypeParmDecl *PmType =
+        dyn_cast<TemplateTypeParmDecl>(TemplateParams->getParam(0));
+    NonTypeTemplateParmDecl *PmArgs =
+        dyn_cast<NonTypeTemplateParmDecl>(TemplateParams->getParam(1));
+
+    // The second template parameter must be a parameter pack with the
+    // first template parameter as its type.
+    if (PmType && PmArgs && !PmType->isTemplateParameterPack() &&
+        PmArgs->isTemplateParameterPack()) {
+      const TemplateTypeParmType *TArgs =
+          PmArgs->getType()->getAs<TemplateTypeParmType>();
+      if (TArgs && TArgs->getDepth() == PmType->getDepth() &&
+          TArgs->getIndex() == PmType->getIndex()) {
+        if (SemaRef.ActiveTemplateInstantiations.empty())
+          SemaRef.Diag(TpDecl->getLocation(),
+                       diag::ext_string_literal_operator_template);
+        return false;
+      }
+    }
+  }
+
+  SemaRef.Diag(TpDecl->getTemplateParameters()->getSourceRange().getBegin(),
+               diag::err_literal_operator_template)
+      << TpDecl->getTemplateParameters()->getSourceRange();
+  return true;
+}
+
 /// CheckLiteralOperatorDeclaration - Check whether the declaration
 /// of this literal operator function is well-formed. If so, returns
 /// false; otherwise, emits appropriate diagnostics and returns true.
@@ -11789,10 +12739,9 @@
     return true;
   }
 
-  bool Valid = false;
-
   // This might be the definition of a literal operator template.
   FunctionTemplateDecl *TpDecl = FnDecl->getDescribedFunctionTemplate();
+
   // This might be a specialization of a literal operator template.
   if (!TpDecl)
     TpDecl = FnDecl->getPrimaryTemplate();
@@ -11801,104 +12750,120 @@
   // template <class T, T...> type operator "" name() are the only valid
   // template signatures, and the only valid signatures with no parameters.
   if (TpDecl) {
-    if (FnDecl->param_size() == 0) {
-      // Must have one or two template parameters
-      TemplateParameterList *Params = TpDecl->getTemplateParameters();
-      if (Params->size() == 1) {
-        NonTypeTemplateParmDecl *PmDecl =
-          dyn_cast<NonTypeTemplateParmDecl>(Params->getParam(0));
-
-        // The template parameter must be a char parameter pack.
-        if (PmDecl && PmDecl->isTemplateParameterPack() &&
-            Context.hasSameType(PmDecl->getType(), Context.CharTy))
-          Valid = true;
-      } else if (Params->size() == 2) {
-        TemplateTypeParmDecl *PmType =
-          dyn_cast<TemplateTypeParmDecl>(Params->getParam(0));
-        NonTypeTemplateParmDecl *PmArgs =
-          dyn_cast<NonTypeTemplateParmDecl>(Params->getParam(1));
-
-        // The second template parameter must be a parameter pack with the
-        // first template parameter as its type.
-        if (PmType && PmArgs &&
-            !PmType->isTemplateParameterPack() &&
-            PmArgs->isTemplateParameterPack()) {
-          const TemplateTypeParmType *TArgs =
-            PmArgs->getType()->getAs<TemplateTypeParmType>();
-          if (TArgs && TArgs->getDepth() == PmType->getDepth() &&
-              TArgs->getIndex() == PmType->getIndex()) {
-            Valid = true;
-            if (ActiveTemplateInstantiations.empty())
-              Diag(FnDecl->getLocation(),
-                   diag::ext_string_literal_operator_template);
-          }
-        }
-      }
+    if (FnDecl->param_size() != 0) {
+      Diag(FnDecl->getLocation(),
+           diag::err_literal_operator_template_with_params);
+      return true;
     }
-  } else if (FnDecl->param_size()) {
-    // Check the first parameter
+
+    if (checkLiteralOperatorTemplateParameterList(*this, TpDecl))
+      return true;
+
+  } else if (FnDecl->param_size() == 1) {
+    const ParmVarDecl *Param = FnDecl->getParamDecl(0);
+
+    QualType ParamType = Param->getType().getUnqualifiedType();
+
+    // Only unsigned long long int, long double, any character type, and const
+    // char * are allowed as the only parameters.
+    if (ParamType->isSpecificBuiltinType(BuiltinType::ULongLong) ||
+        ParamType->isSpecificBuiltinType(BuiltinType::LongDouble) ||
+        Context.hasSameType(ParamType, Context.CharTy) ||
+        Context.hasSameType(ParamType, Context.WideCharTy) ||
+        Context.hasSameType(ParamType, Context.Char16Ty) ||
+        Context.hasSameType(ParamType, Context.Char32Ty)) {
+    } else if (const PointerType *Ptr = ParamType->getAs<PointerType>()) {
+      QualType InnerType = Ptr->getPointeeType();
+
+      // Pointer parameter must be a const char *.
+      if (!(Context.hasSameType(InnerType.getUnqualifiedType(),
+                                Context.CharTy) &&
+            InnerType.isConstQualified() && !InnerType.isVolatileQualified())) {
+        Diag(Param->getSourceRange().getBegin(),
+             diag::err_literal_operator_param)
+            << ParamType << "'const char *'" << Param->getSourceRange();
+        return true;
+      }
+
+    } else if (ParamType->isRealFloatingType()) {
+      Diag(Param->getSourceRange().getBegin(), diag::err_literal_operator_param)
+          << ParamType << Context.LongDoubleTy << Param->getSourceRange();
+      return true;
+
+    } else if (ParamType->isIntegerType()) {
+      Diag(Param->getSourceRange().getBegin(), diag::err_literal_operator_param)
+          << ParamType << Context.UnsignedLongLongTy << Param->getSourceRange();
+      return true;
+
+    } else {
+      Diag(Param->getSourceRange().getBegin(),
+           diag::err_literal_operator_invalid_param)
+          << ParamType << Param->getSourceRange();
+      return true;
+    }
+
+  } else if (FnDecl->param_size() == 2) {
     FunctionDecl::param_iterator Param = FnDecl->param_begin();
 
-    QualType T = (*Param)->getType().getUnqualifiedType();
+    // First, verify that the first parameter is correct.
 
-    // unsigned long long int, long double, and any character type are allowed
-    // as the only parameters.
-    if (Context.hasSameType(T, Context.UnsignedLongLongTy) ||
-        Context.hasSameType(T, Context.LongDoubleTy) ||
-        Context.hasSameType(T, Context.CharTy) ||
-        Context.hasSameType(T, Context.WideCharTy) ||
-        Context.hasSameType(T, Context.Char16Ty) ||
-        Context.hasSameType(T, Context.Char32Ty)) {
-      if (++Param == FnDecl->param_end())
-        Valid = true;
-      goto FinishedParams;
+    QualType FirstParamType = (*Param)->getType().getUnqualifiedType();
+
+    // A two-parameter function must take a pointer to const as its first
+    // parameter; let's strip those qualifiers.
+    const PointerType *PT = FirstParamType->getAs<PointerType>();
+
+    if (!PT) {
+      Diag((*Param)->getSourceRange().getBegin(),
+           diag::err_literal_operator_param)
+          << FirstParamType << "'const char *'" << (*Param)->getSourceRange();
+      return true;
     }
 
-    // Otherwise it must be a pointer to const; let's strip those qualifiers.
-    const PointerType *PT = T->getAs<PointerType>();
-    if (!PT)
-      goto FinishedParams;
-    T = PT->getPointeeType();
-    if (!T.isConstQualified() || T.isVolatileQualified())
-      goto FinishedParams;
-    T = T.getUnqualifiedType();
+    QualType PointeeType = PT->getPointeeType();
+    // The first parameter's pointee must be const and must not be volatile.
+    if (!PointeeType.isConstQualified() || PointeeType.isVolatileQualified()) {
+      Diag((*Param)->getSourceRange().getBegin(),
+           diag::err_literal_operator_param)
+          << FirstParamType << "'const char *'" << (*Param)->getSourceRange();
+      return true;
+    }
 
-    // Move on to the second parameter;
+    QualType InnerType = PointeeType.getUnqualifiedType();
+    // Only const char *, const wchar_t*, const char16_t*, and const char32_t*
+    // are allowed as the first parameter to a two-parameter function
+    if (!(Context.hasSameType(InnerType, Context.CharTy) ||
+          Context.hasSameType(InnerType, Context.WideCharTy) ||
+          Context.hasSameType(InnerType, Context.Char16Ty) ||
+          Context.hasSameType(InnerType, Context.Char32Ty))) {
+      Diag((*Param)->getSourceRange().getBegin(),
+           diag::err_literal_operator_param)
+          << FirstParamType << "'const char *'" << (*Param)->getSourceRange();
+      return true;
+    }
+
+    // Move on to the second and final parameter.
     ++Param;
 
-    // If there is no second parameter, the first must be a const char *
-    if (Param == FnDecl->param_end()) {
-      if (Context.hasSameType(T, Context.CharTy))
-        Valid = true;
-      goto FinishedParams;
+    // The second parameter must be a std::size_t.
+    QualType SecondParamType = (*Param)->getType().getUnqualifiedType();
+    if (!Context.hasSameType(SecondParamType, Context.getSizeType())) {
+      Diag((*Param)->getSourceRange().getBegin(),
+           diag::err_literal_operator_param)
+          << SecondParamType << Context.getSizeType()
+          << (*Param)->getSourceRange();
+      return true;
     }
-
-    // const char *, const wchar_t*, const char16_t*, and const char32_t*
-    // are allowed as the first parameter to a two-parameter function
-    if (!(Context.hasSameType(T, Context.CharTy) ||
-          Context.hasSameType(T, Context.WideCharTy) ||
-          Context.hasSameType(T, Context.Char16Ty) ||
-          Context.hasSameType(T, Context.Char32Ty)))
-      goto FinishedParams;
-
-    // The second and final parameter must be an std::size_t
-    T = (*Param)->getType().getUnqualifiedType();
-    if (Context.hasSameType(T, Context.getSizeType()) &&
-        ++Param == FnDecl->param_end())
-      Valid = true;
-  }
-
-  // FIXME: This diagnostic is absolutely terrible.
-FinishedParams:
-  if (!Valid) {
-    Diag(FnDecl->getLocation(), diag::err_literal_operator_params)
-      << FnDecl->getDeclName();
+  } else {
+    Diag(FnDecl->getLocation(), diag::err_literal_operator_bad_param_count);
     return true;
   }
 
+  // Parameters are good.
+
   // A parameter-declaration-clause containing a default argument is not
   // equivalent to any of the permitted forms.
-  for (auto Param : FnDecl->params()) {
+  for (auto Param : FnDecl->parameters()) {
     if (Param->hasDefaultArg()) {
       Diag(Param->getDefaultArgRange().getBegin(),
            diag::err_literal_operator_default_argument)
@@ -12012,6 +12977,11 @@
     Invalid = true;
   }
 
+  if (ExDeclType->isVariablyModifiedType()) {
+    Diag(Loc, diag::err_catch_variably_modified) << ExDeclType;
+    Invalid = true;
+  }
+
   QualType BaseType = ExDeclType;
   int Mode = 0; // 0 for direct type, 1 for pointer, 2 for reference
   unsigned DK = diag::err_catch_incomplete;
@@ -12477,10 +13447,9 @@
   // friend a member of an arbitrary specialization of your template).
 
   Decl *D;
-  if (unsigned NumTempParamLists = TempParams.size())
+  if (!TempParams.empty())
     D = FriendTemplateDecl::Create(Context, CurContext, Loc,
-                                   NumTempParamLists,
-                                   TempParams.data(),
+                                   TempParams,
                                    TSI,
                                    DS.getFriendSpecLoc());
   else
@@ -12903,44 +13872,20 @@
     // the record is complete.
     const FunctionDecl *Primary = MD;
     if (const FunctionDecl *Pattern = MD->getTemplateInstantiationPattern())
-      // Find the uninstantiated declaration that actually had the '= default'
-      // on it.
-      Pattern->isDefined(Primary);
+      // Ask the template instantiation pattern that actually had the
+      // '= default' on it.
+      Primary = Pattern;
 
     // If the method was defaulted on its first declaration, we will have
     // already performed the checking in CheckCompletedCXXClass. Such a
     // declaration doesn't trigger an implicit definition.
-    if (Primary == Primary->getCanonicalDecl())
+    if (Primary->getCanonicalDecl()->isDefaulted())
       return;
 
     CheckExplicitlyDefaultedSpecialMember(MD);
 
-    if (MD->isInvalidDecl())
-      return;
-
-    switch (Member) {
-    case CXXDefaultConstructor:
-      DefineImplicitDefaultConstructor(DefaultLoc,
-                                       cast<CXXConstructorDecl>(MD));
-      break;
-    case CXXCopyConstructor:
-      DefineImplicitCopyConstructor(DefaultLoc, cast<CXXConstructorDecl>(MD));
-      break;
-    case CXXCopyAssignment:
-      DefineImplicitCopyAssignment(DefaultLoc, MD);
-      break;
-    case CXXDestructor:
-      DefineImplicitDestructor(DefaultLoc, cast<CXXDestructorDecl>(MD));
-      break;
-    case CXXMoveConstructor:
-      DefineImplicitMoveConstructor(DefaultLoc, cast<CXXConstructorDecl>(MD));
-      break;
-    case CXXMoveAssignment:
-      DefineImplicitMoveAssignment(DefaultLoc, MD);
-      break;
-    case CXXInvalid:
-      llvm_unreachable("Invalid special member.");
-    }
+    if (!MD->isInvalidDecl())
+      DefineImplicitSpecialMember(*this, MD, DefaultLoc);
   } else {
     Diag(DefaultLoc, diag::err_default_special_members);
   }
@@ -13250,14 +14195,19 @@
     // checks (i.e. operator delete() lookup) when the vtable is marked used, as
     // the deleting destructor is emitted with the vtable, not with the
     // destructor definition as in the Itanium ABI.
-    // If it has a definition, we do the check at that point instead.
-    if (Context.getTargetInfo().getCXXABI().isMicrosoft() &&
-        Class->hasUserDeclaredDestructor() &&
-        !Class->getDestructor()->isDefined() &&
-        !Class->getDestructor()->isDeleted()) {
+    if (Context.getTargetInfo().getCXXABI().isMicrosoft()) {
       CXXDestructorDecl *DD = Class->getDestructor();
-      ContextRAII SavedContext(*this, DD);
-      CheckDestructor(DD);
+      if (DD && DD->isVirtual() && !DD->isDeleted()) {
+        if (Class->hasUserDeclaredDestructor() && !DD->isDefined()) {
+          // If this is an out-of-line declaration, marking it referenced will
+          // not do anything. Manually call CheckDestructor to look up operator
+          // delete().
+          ContextRAII SavedContext(*this, DD);
+          CheckDestructor(DD);
+        } else {
+          MarkFunctionReferenced(Loc, Class->getDestructor());
+        }
+      }
     }
   }
 
@@ -13795,6 +14745,9 @@
 
   DiagnoseFunctionSpecifiers(D.getDeclSpec());
 
+  if (D.getDeclSpec().isInlineSpecified())
+    Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function)
+        << getLangOpts().CPlusPlus1z;
   if (DeclSpec::TSCS TSCS = D.getDeclSpec().getThreadStorageClassSpec())
     Diag(D.getDeclSpec().getThreadStorageClassSpecLoc(),
          diag::err_invalid_thread)
diff --git a/lib/Sema/SemaDeclObjC.cpp b/lib/Sema/SemaDeclObjC.cpp
index be910b9..a872353 100644
--- a/lib/Sema/SemaDeclObjC.cpp
+++ b/lib/Sema/SemaDeclObjC.cpp
@@ -11,23 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
+#include "TypeLocBuilder.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTMutationListener.h"
-#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprObjC.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Sema/DeclSpec.h"
-#include "clang/Sema/ExternalSemaSource.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
-#include "TypeLocBuilder.h"
 
 using namespace clang;
 
@@ -320,11 +319,11 @@
   PushOnScopeChains(MDecl->getCmdDecl(), FnBodyScope);
 
   // The ObjC parser requires parameter names so there's no need to check.
-  CheckParmsForFunctionDef(MDecl->param_begin(), MDecl->param_end(),
+  CheckParmsForFunctionDef(MDecl->parameters(),
                            /*CheckParameterNames=*/false);
 
   // Introduce all of the other parameters into this scope.
-  for (auto *Param : MDecl->params()) {
+  for (auto *Param : MDecl->parameters()) {
     if (!Param->isInvalidDecl() &&
         getLangOpts().ObjCAutoRefCount &&
         !HasExplicitOwnershipAttr(*this, Param))
@@ -1506,6 +1505,7 @@
                                                 SourceLocation(),
                                                 SourceLocation(),
                                                 SourceLocation(),
+                                                SourceLocation(),
                                                 SourceLocation()),
                                                 parsedAttrs,
                                                 starLoc);
@@ -3223,7 +3223,7 @@
   ObjCMethodList *ListWithSameDeclaration = nullptr;
   for (; List; Previous = List, List = List->getNext()) {
     // If we are building a module, keep all of the methods.
-    if (getLangOpts().Modules && !getLangOpts().CurrentModule.empty())
+    if (getLangOpts().CompilingModule)
       continue;
 
     bool SameDeclaration = MatchTwoMethodDeclarations(Method,
@@ -4627,6 +4627,9 @@
     Diag(DS.getStorageClassSpecLoc(), diag::err_storage_spec_on_catch_parm)
       << DeclSpec::getSpecifierName(SCS);
   }
+  if (DS.isInlineSpecified())
+    Diag(DS.getInlineSpecLoc(), diag::err_inline_non_function)
+        << getLangOpts().CPlusPlus1z;
   if (DeclSpec::TSCS TSCS = D.getDeclSpec().getThreadStorageClassSpec())
     Diag(D.getDeclSpec().getThreadStorageClassSpecLoc(),
          diag::err_invalid_thread)
diff --git a/lib/Sema/SemaExceptionSpec.cpp b/lib/Sema/SemaExceptionSpec.cpp
index f12bf24..4a21eb3 100644
--- a/lib/Sema/SemaExceptionSpec.cpp
+++ b/lib/Sema/SemaExceptionSpec.cpp
@@ -110,11 +110,17 @@
   //   A type denoted in an exception-specification shall not denote a
   //   pointer or reference to an incomplete type, other than (cv) void* or a
   //   pointer or reference to a class currently being defined.
+  // In Microsoft mode, downgrade this to a warning.
+  unsigned DiagID = diag::err_incomplete_in_exception_spec;
+  bool ReturnValueOnError = true;
+  if (getLangOpts().MicrosoftExt) {
+    DiagID = diag::ext_incomplete_in_exception_spec;
+    ReturnValueOnError = false;
+  }
   if (!(PointeeT->isRecordType() &&
         PointeeT->getAs<RecordType>()->isBeingDefined()) &&
-      RequireCompleteType(Range.getBegin(), PointeeT,
-                          diag::err_incomplete_in_exception_spec, Kind, Range))
-    return true;
+      RequireCompleteType(Range.getBegin(), PointeeT, DiagID, Kind, Range))
+    return ReturnValueOnError;
 
   return false;
 }
@@ -995,6 +1001,10 @@
     return mergeCanThrow(CT, canSubExprsThrow(*this, E));
   }
 
+  case Expr::CXXInheritedCtorInitExprClass:
+    return canCalleeThrow(*this, E,
+                          cast<CXXInheritedCtorInitExpr>(E)->getConstructor());
+
   case Expr::LambdaExprClass: {
     const LambdaExpr *Lambda = cast<LambdaExpr>(E);
     CanThrowResult CT = CT_Cannot;
@@ -1136,6 +1146,7 @@
   case Expr::ObjCIndirectCopyRestoreExprClass:
   case Expr::ObjCProtocolExprClass:
   case Expr::ObjCSelectorExprClass:
+  case Expr::ObjCAvailabilityCheckExprClass:
   case Expr::OffsetOfExprClass:
   case Expr::PackExpansionExprClass:
   case Expr::PseudoObjectExprClass:
diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index 4ff4f78..4bf17a6 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "TreeTransform.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
@@ -42,6 +41,7 @@
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaFixItUtils.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/Support/ConvertUTF.h"
 using namespace clang;
@@ -76,10 +76,14 @@
 
 static void DiagnoseUnusedOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc) {
   // Warn if this is used but marked unused.
-  if (D->hasAttr<UnusedAttr>()) {
-    const Decl *DC = cast_or_null<Decl>(S.getCurObjCLexicalContext());
-    if (DC && !DC->hasAttr<UnusedAttr>())
-      S.Diag(Loc, diag::warn_used_but_marked_unused) << D->getDeclName();
+  if (const auto *A = D->getAttr<UnusedAttr>()) {
+    // [[maybe_unused]] should not diagnose uses, but __attribute__((unused))
+    // should diagnose them.
+    if (A->getSemanticSpelling() != UnusedAttr::CXX11_maybe_unused) {
+      const Decl *DC = cast_or_null<Decl>(S.getCurObjCLexicalContext());
+      if (DC && !DC->hasAttr<UnusedAttr>())
+        S.Diag(Loc, diag::warn_used_but_marked_unused) << D->getDeclName();
+    }
   }
 }
 
@@ -99,13 +103,9 @@
   return false;
 }
 
-static AvailabilityResult
-DiagnoseAvailabilityOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc,
-                           const ObjCInterfaceDecl *UnknownObjCClass,
-                           bool ObjCPropertyAccess) {
-  // See if this declaration is unavailable or deprecated.
-  std::string Message;
-  AvailabilityResult Result = D->getAvailability(&Message);
+AvailabilityResult Sema::ShouldDiagnoseAvailabilityOfDecl(
+    NamedDecl *&D, VersionTuple ContextVersion, std::string *Message) {
+  AvailabilityResult Result = D->getAvailability(Message, ContextVersion);
 
   // For typedefs, if the typedef declaration appears available look
   // to the underlying type to see if it is more restrictive.
@@ -113,18 +113,18 @@
     if (Result == AR_Available) {
       if (const TagType *TT = TD->getUnderlyingType()->getAs<TagType>()) {
         D = TT->getDecl();
-        Result = D->getAvailability(&Message);
+        Result = D->getAvailability(Message, ContextVersion);
         continue;
       }
     }
     break;
   }
-    
+
   // Forward class declarations get their attributes from their definition.
   if (ObjCInterfaceDecl *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) {
     if (IDecl->getDefinition()) {
       D = IDecl->getDefinition();
-      Result = D->getAvailability(&Message);
+      Result = D->getAvailability(Message, ContextVersion);
     }
   }
 
@@ -132,69 +132,76 @@
     if (Result == AR_Available) {
       const DeclContext *DC = ECD->getDeclContext();
       if (const EnumDecl *TheEnumDecl = dyn_cast<EnumDecl>(DC))
-        Result = TheEnumDecl->getAvailability(&Message);
+        Result = TheEnumDecl->getAvailability(Message, ContextVersion);
     }
 
-  const ObjCPropertyDecl *ObjCPDecl = nullptr;
-  if (Result == AR_Deprecated || Result == AR_Unavailable ||
-      Result == AR_NotYetIntroduced) {
+  switch (Result) {
+  case AR_Available:
+    return Result;
+
+  case AR_Unavailable:
+  case AR_Deprecated:
+    return getCurContextAvailability() != Result ? Result : AR_Available;
+
+  case AR_NotYetIntroduced: {
+    // Don't do this for enums, they can't be redeclared.
+    if (isa<EnumConstantDecl>(D) || isa<EnumDecl>(D))
+      return AR_Available;
+
+    bool Warn = !D->getAttr<AvailabilityAttr>()->isInherited();
+    // Objective-C method declarations in categories are not modelled as
+    // redeclarations, so manually look for a redeclaration in a category
+    // if necessary.
+    if (Warn && HasRedeclarationWithoutAvailabilityInCategory(D))
+      Warn = false;
+    // In general, D will point to the most recent redeclaration. However,
+    // for `@class A;` decls, this isn't true -- manually go through the
+    // redecl chain in that case.
+    if (Warn && isa<ObjCInterfaceDecl>(D))
+      for (Decl *Redecl = D->getMostRecentDecl(); Redecl && Warn;
+           Redecl = Redecl->getPreviousDecl())
+        if (!Redecl->hasAttr<AvailabilityAttr>() ||
+            Redecl->getAttr<AvailabilityAttr>()->isInherited())
+          Warn = false;
+
+    return Warn ? AR_NotYetIntroduced : AR_Available;
+  }
+  }
+  llvm_unreachable("Unknown availability result!");
+}
+
+static void
+DiagnoseAvailabilityOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc,
+                           const ObjCInterfaceDecl *UnknownObjCClass,
+                           bool ObjCPropertyAccess) {
+  VersionTuple ContextVersion;
+  if (const DeclContext *DC = S.getCurObjCLexicalContext())
+    ContextVersion = S.getVersionForDecl(cast<Decl>(DC));
+
+  std::string Message;
+  // See if this declaration is unavailable, deprecated, or partial in the
+  // current context.
+  if (AvailabilityResult Result =
+          S.ShouldDiagnoseAvailabilityOfDecl(D, ContextVersion, &Message)) {
+
+    if (Result == AR_NotYetIntroduced && S.getCurFunctionOrMethodDecl()) {
+      S.getEnclosingFunction()->HasPotentialAvailabilityViolations = true;
+      return;
+    }
+
+    const ObjCPropertyDecl *ObjCPDecl = nullptr;
     if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) {
       if (const ObjCPropertyDecl *PD = MD->findPropertyDecl()) {
-        AvailabilityResult PDeclResult = PD->getAvailability(nullptr);
+        AvailabilityResult PDeclResult =
+            PD->getAvailability(nullptr, ContextVersion);
         if (PDeclResult == Result)
           ObjCPDecl = PD;
       }
     }
+
+    S.EmitAvailabilityWarning(Result, D, Message, Loc, UnknownObjCClass,
+                              ObjCPDecl, ObjCPropertyAccess);
   }
-  
-  switch (Result) {
-    case AR_Available:
-      break;
-
-    case AR_Deprecated:
-      if (S.getCurContextAvailability() != AR_Deprecated)
-        S.EmitAvailabilityWarning(Sema::AD_Deprecation,
-                                  D, Message, Loc, UnknownObjCClass, ObjCPDecl,
-                                  ObjCPropertyAccess);
-      break;
-
-    case AR_NotYetIntroduced: {
-      // Don't do this for enums, they can't be redeclared.
-      if (isa<EnumConstantDecl>(D) || isa<EnumDecl>(D))
-        break;
- 
-      bool Warn = !D->getAttr<AvailabilityAttr>()->isInherited();
-      // Objective-C method declarations in categories are not modelled as
-      // redeclarations, so manually look for a redeclaration in a category
-      // if necessary.
-      if (Warn && HasRedeclarationWithoutAvailabilityInCategory(D))
-        Warn = false;
-      // In general, D will point to the most recent redeclaration. However,
-      // for `@class A;` decls, this isn't true -- manually go through the
-      // redecl chain in that case.
-      if (Warn && isa<ObjCInterfaceDecl>(D))
-        for (Decl *Redecl = D->getMostRecentDecl(); Redecl && Warn;
-             Redecl = Redecl->getPreviousDecl())
-          if (!Redecl->hasAttr<AvailabilityAttr>() ||
-              Redecl->getAttr<AvailabilityAttr>()->isInherited())
-            Warn = false;
- 
-      if (Warn)
-        S.EmitAvailabilityWarning(Sema::AD_Partial, D, Message, Loc,
-                                  UnknownObjCClass, ObjCPDecl,
-                                  ObjCPropertyAccess);
-      break;
-    }
-
-    case AR_Unavailable:
-      if (S.getCurContextAvailability() != AR_Unavailable)
-        S.EmitAvailabilityWarning(Sema::AD_Unavailable,
-                                  D, Message, Loc, UnknownObjCClass, ObjCPDecl,
-                                  ObjCPropertyAccess);
-      break;
-
-    }
-    return Result;
 }
 
 /// \brief Emit a note explaining that this function is deleted.
@@ -212,25 +219,14 @@
     // deleted. This might fail, if that reason no longer applies.
     CXXSpecialMember CSM = getSpecialMember(Method);
     if (CSM != CXXInvalid)
-      ShouldDeleteSpecialMember(Method, CSM, /*Diagnose=*/true);
+      ShouldDeleteSpecialMember(Method, CSM, nullptr, /*Diagnose=*/true);
 
     return;
   }
 
-  if (CXXConstructorDecl *CD = dyn_cast<CXXConstructorDecl>(Decl)) {
-    if (CXXConstructorDecl *BaseCD =
-            const_cast<CXXConstructorDecl*>(CD->getInheritedConstructor())) {
-      Diag(Decl->getLocation(), diag::note_inherited_deleted_here);
-      if (BaseCD->isDeleted()) {
-        NoteDeletedFunction(BaseCD);
-      } else {
-        // FIXME: An explanation of why exactly it can't be inherited
-        // would be nice.
-        Diag(BaseCD->getLocation(), diag::note_cannot_inherit);
-      }
-      return;
-    }
-  }
+  auto *Ctor = dyn_cast<CXXConstructorDecl>(Decl);
+  if (Ctor && Ctor->isInheritingConstructor())
+    return NoteDeletedInheritingConstructor(Ctor);
 
   Diag(Decl->getLocation(), diag::note_availability_specified_here)
     << Decl << true;
@@ -347,17 +343,28 @@
 
   // See if this is an auto-typed variable whose initializer we are parsing.
   if (ParsingInitForAutoVars.count(D)) {
-    const AutoType *AT = cast<VarDecl>(D)->getType()->getContainedAutoType();
+    if (isa<BindingDecl>(D)) {
+      Diag(Loc, diag::err_binding_cannot_appear_in_own_initializer)
+        << D->getDeclName();
+    } else {
+      const AutoType *AT = cast<VarDecl>(D)->getType()->getContainedAutoType();
 
-    Diag(Loc, diag::err_auto_variable_cannot_appear_in_own_initializer)
-      << D->getDeclName() << (unsigned)AT->getKeyword();
+      Diag(Loc, diag::err_auto_variable_cannot_appear_in_own_initializer)
+        << D->getDeclName() << (unsigned)AT->getKeyword();
+    }
     return true;
   }
 
   // See if this is a deleted function.
   if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
     if (FD->isDeleted()) {
-      Diag(Loc, diag::err_deleted_function_use);
+      auto *Ctor = dyn_cast<CXXConstructorDecl>(FD);
+      if (Ctor && Ctor->isInheritingConstructor())
+        Diag(Loc, diag::err_deleted_inherited_ctor_use)
+            << Ctor->getParent()
+            << Ctor->getInheritedConstructor().getConstructor()->getParent();
+      else 
+        Diag(Loc, diag::err_deleted_function_use);
       NoteDeletedFunction(FD);
       return true;
     }
@@ -368,6 +375,19 @@
         DeduceReturnType(FD, Loc))
       return true;
   }
+
+  // [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions
+  // Only the variables omp_in and omp_out are allowed in the combiner.
+  // Only the variables omp_priv and omp_orig are allowed in the
+  // initializer-clause.
+  auto *DRD = dyn_cast<OMPDeclareReductionDecl>(CurContext);
+  if (LangOpts.OpenMP && DRD && !CurContext->containsDecl(D) &&
+      isa<VarDecl>(D)) {
+    Diag(Loc, diag::err_omp_wrong_var_in_declare_reduction)
+        << getCurFunction()->HasOMPDeclareReductionCombiner;
+    Diag(D->getLocation(), diag::note_entity_declared_at) << D;
+    return true;
+  }
   DiagnoseAvailabilityOfDecl(*this, D, Loc, UnknownObjCClass,
                              ObjCPropertyAccess);
 
@@ -695,7 +715,7 @@
   // balance that.
   if (getLangOpts().ObjCAutoRefCount &&
       E->getType().getObjCLifetime() == Qualifiers::OCL_Weak)
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
 
   ExprResult Res = ImplicitCastExpr::Create(Context, T, CK_LValueToRValue, E,
                                             nullptr, VK_RValue);
@@ -1138,6 +1158,48 @@
                                     /*convertFloat=*/!IsCompAssign);
 }
 
+/// \brief Check whether a conversion between __float128 and long double is
+/// unsupported for the given operand types; returns true if so and emits no
+/// diagnostic itself. Helper function of UsualArithmeticConversions().
+static bool unsupportedTypeConversion(const Sema &S, QualType LHSType,
+                                      QualType RHSType) {
+  /*  No issue converting if at least one of the types is not a floating point
+      type or the two types have the same rank.
+  */
+  if (!LHSType->isFloatingType() || !RHSType->isFloatingType() ||
+      S.Context.getFloatingTypeOrder(LHSType, RHSType) == 0)
+    return false;
+
+  assert(LHSType->isFloatingType() && RHSType->isFloatingType() &&
+         "The remaining types must be floating point types.");
+
+  auto *LHSComplex = LHSType->getAs<ComplexType>();
+  auto *RHSComplex = RHSType->getAs<ComplexType>();
+
+  QualType LHSElemType = LHSComplex ?
+    LHSComplex->getElementType() : LHSType;
+  QualType RHSElemType = RHSComplex ?
+    RHSComplex->getElementType() : RHSType;
+
+  // No issue if both element types use the same float semantics object.
+  if (&S.Context.getFloatTypeSemantics(LHSElemType) ==
+      &S.Context.getFloatTypeSemantics(RHSElemType))
+    return false;
+
+  bool Float128AndLongDouble = (LHSElemType == S.Context.Float128Ty &&
+                                RHSElemType == S.Context.LongDoubleTy);
+  Float128AndLongDouble |= (LHSElemType == S.Context.LongDoubleTy &&
+                            RHSElemType == S.Context.Float128Ty);
+
+  /* We've handled the situation where __float128 and long double have the same
+     representation. The only other allowable conversion is if long double is
+     really just double (IEEEdouble semantics); anything else is unsupported.
+  */
+  return Float128AndLongDouble &&
+    (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) !=
+     &llvm::APFloat::IEEEdouble);
+
 typedef ExprResult PerformCastFn(Sema &S, Expr *operand, QualType toType);
 
 namespace {
@@ -1301,6 +1363,11 @@
 
   // At this point, we have two different arithmetic types.
 
+  // Diagnose attempts to convert between __float128 and long double where
+  // such conversions currently can't be handled.
+  if (unsupportedTypeConversion(*this, LHSType, RHSType))
+    return QualType();
+
   // Handle complex types first (C99 6.3.1.8p1).
   if (LHSType->isComplexType() || RHSType->isComplexType())
     return handleComplexFloatConversion(*this, LHS, RHS, LHSType, RHSType,
@@ -1677,17 +1744,9 @@
                        const CXXScopeSpec *SS, NamedDecl *FoundD,
                        const TemplateArgumentListInfo *TemplateArgs) {
   if (getLangOpts().CUDA)
-    if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext))
-      if (const FunctionDecl *Callee = dyn_cast<FunctionDecl>(D)) {
-        if (CheckCUDATarget(Caller, Callee)) {
-          Diag(NameInfo.getLoc(), diag::err_ref_bad_target)
-            << IdentifyCUDATarget(Callee) << D->getIdentifier()
-            << IdentifyCUDATarget(Caller);
-          Diag(D->getLocation(), diag::note_previous_decl)
-            << D->getIdentifier();
-          return ExprError();
-        }
-      }
+    if (FunctionDecl *Callee = dyn_cast<FunctionDecl>(D))
+      if (!CheckCUDACall(NameInfo.getLoc(), Callee))
+        return ExprError();
 
   bool RefersToCapturedVariable =
       isa<VarDecl>(D) &&
@@ -1719,10 +1778,18 @@
       !Diags.isIgnored(diag::warn_arc_repeated_use_of_weak, E->getLocStart()))
       recordUseOfEvaluatedWeak(E);
 
-  // Just in case we're building an illegal pointer-to-member.
-  FieldDecl *FD = dyn_cast<FieldDecl>(D);
-  if (FD && FD->isBitField())
-    E->setObjectKind(OK_BitField);
+  if (FieldDecl *FD = dyn_cast<FieldDecl>(D)) {
+    UnusedPrivateFields.remove(FD);
+    // Just in case we're building an illegal pointer-to-member.
+    if (FD->isBitField())
+      E->setObjectKind(OK_BitField);
+  }
+
+  // C++ [expr.prim]/8: The expression [...] is a bit-field if the identifier
+  // designates a bit-field.
+  if (auto *BD = dyn_cast<BindingDecl>(D))
+    if (auto *BE = BD->getBinding())
+      E->setObjectKind(BE->getObjectKind());
 
   return E;
 }
@@ -2768,6 +2835,10 @@
   return ULE;
 }
 
+static void
+diagnoseUncapturableValueReference(Sema &S, SourceLocation loc,
+                                   ValueDecl *var, DeclContext *DC);
+
 /// \brief Complete semantic analysis for a reference to the given declaration.
 ExprResult Sema::BuildDeclarationNameExpr(
     const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, NamedDecl *D,
@@ -2840,6 +2911,7 @@
     // Unresolved using declarations are dependent.
     case Decl::EnumConstant:
     case Decl::UnresolvedUsingValue:
+    case Decl::OMPDeclareReduction:
       valueKind = VK_RValue;
       break;
 
@@ -2877,6 +2949,8 @@
     case Decl::Var:
     case Decl::VarTemplateSpecialization:
     case Decl::VarTemplatePartialSpecialization:
+    case Decl::Decomposition:
+    case Decl::OMPCapturedExpr:
       // In C, "extern void blah;" is valid and is an r-value.
       if (!getLangOpts().CPlusPlus &&
           !type.hasQualifiers() &&
@@ -2903,6 +2977,19 @@
       
       break;
     }
+
+    case Decl::Binding: {
+      // These are always lvalues.
+      valueKind = VK_LValue;
+      type = type.getNonReferenceType();
+      // FIXME: Support lambda-capture of BindingDecls, once CWG actually
+      // decides how that's supposed to work.
+      auto *BD = cast<BindingDecl>(VD);
+      if (BD->getDeclContext()->isFunctionOrMethod() &&
+          BD->getDeclContext() != CurContext)
+        diagnoseUncapturableValueReference(*this, Loc, BD, CurContext);
+      break;
+    }
         
     case Decl::Function: {
       if (unsigned BID = cast<FunctionDecl>(VD)->getBuiltinID()) {
@@ -3297,12 +3384,21 @@
 
   if (Literal.isFloatingLiteral()) {
     QualType Ty;
-    if (Literal.isFloat)
+    if (Literal.isHalf){
+      if (getOpenCLOptions().cl_khr_fp16)
+        Ty = Context.HalfTy;
+      else {
+        Diag(Tok.getLocation(), diag::err_half_const_requires_fp16);
+        return ExprError();
+      }
+    } else if (Literal.isFloat)
       Ty = Context.FloatTy;
-    else if (!Literal.isLong)
-      Ty = Context.DoubleTy;
-    else
+    else if (Literal.isLong)
       Ty = Context.LongDoubleTy;
+    else if (Literal.isFloat128)
+      Ty = Context.Float128Ty;
+    else
+      Ty = Context.DoubleTy;
 
     Res = BuildFloatingLiteral(*this, Literal, Ty, Tok.getLocation());
 
@@ -4151,12 +4247,18 @@
     ExprResult Result = CheckPlaceholderExpr(LowerBound);
     if (Result.isInvalid())
       return ExprError();
+    Result = DefaultLvalueConversion(Result.get());
+    if (Result.isInvalid())
+      return ExprError();
     LowerBound = Result.get();
   }
   if (Length && Length->getType()->isNonOverloadPlaceholderType()) {
     ExprResult Result = CheckPlaceholderExpr(Length);
     if (Result.isInvalid())
       return ExprError();
+    Result = DefaultLvalueConversion(Result.get());
+    if (Result.isInvalid())
+      return ExprError();
     Length = Result.get();
   }
 
@@ -4226,14 +4328,13 @@
                           diag::err_omp_section_incomplete_type, Base))
     return ExprError();
 
-  if (LowerBound) {
+  if (LowerBound && !OriginalTy->isAnyPointerType()) {
     llvm::APSInt LowerBoundValue;
     if (LowerBound->EvaluateAsInt(LowerBoundValue, Context)) {
-      // OpenMP 4.0, [2.4 Array Sections]
-      // The lower-bound and length must evaluate to non-negative integers.
+      // OpenMP 4.5, [2.4 Array Sections]
+      // The array section must be a subset of the original array.
       if (LowerBoundValue.isNegative()) {
-        Diag(LowerBound->getExprLoc(), diag::err_omp_section_negative)
-            << 0 << LowerBoundValue.toString(/*Radix=*/10, /*Signed=*/true)
+        Diag(LowerBound->getExprLoc(), diag::err_omp_section_not_subset_of_array)
             << LowerBound->getSourceRange();
         return ExprError();
       }
@@ -4243,11 +4344,11 @@
   if (Length) {
     llvm::APSInt LengthValue;
     if (Length->EvaluateAsInt(LengthValue, Context)) {
-      // OpenMP 4.0, [2.4 Array Sections]
-      // The lower-bound and length must evaluate to non-negative integers.
+      // OpenMP 4.5, [2.4 Array Sections]
+      // The length must evaluate to a non-negative integer.
       if (LengthValue.isNegative()) {
-        Diag(Length->getExprLoc(), diag::err_omp_section_negative)
-            << 1 << LengthValue.toString(/*Radix=*/10, /*Signed=*/true)
+        Diag(Length->getExprLoc(), diag::err_omp_section_length_negative)
+            << LengthValue.toString(/*Radix=*/10, /*Signed=*/true)
             << Length->getSourceRange();
         return ExprError();
       }
@@ -4255,7 +4356,7 @@
   } else if (ColonLoc.isValid() &&
              (OriginalTy.isNull() || (!OriginalTy->isConstantArrayType() &&
                                       !OriginalTy->isVariableArrayType()))) {
-    // OpenMP 4.0, [2.4 Array Sections]
+    // OpenMP 4.5, [2.4 Array Sections]
     // When the size of the array dimension is not known, the length must be
     // specified explicitly.
     Diag(ColonLoc, diag::err_omp_section_length_undefined)
@@ -4263,6 +4364,13 @@
     return ExprError();
   }
 
+  if (!Base->getType()->isSpecificPlaceholderType(
+          BuiltinType::OMPArraySection)) {
+    ExprResult Result = DefaultFunctionArrayLvalueConversion(Base);
+    if (Result.isInvalid())
+      return ExprError();
+    Base = Result.get();
+  }
   return new (Context)
       OMPArraySectionExpr(Base, LowerBound, Length, Context.OMPArraySectionTy,
                           VK_LValue, OK_Ordinary, ColonLoc, RBLoc);
@@ -4476,6 +4584,13 @@
     }
   }
 
+  // If the default argument expression is not set yet, we are building it now.
+  if (!Param->hasInit()) {
+    Diag(Param->getLocStart(), diag::err_recursive_default_argument) << FD;
+    Param->setInvalidDecl();
+    return ExprError();
+  }
+
   // If the default expression creates temporaries, we need to
   // push them to the current stack of expression temporaries so they'll
   // be properly destroyed.
@@ -4483,15 +4598,15 @@
   // bound temporaries; see the comment in PR5810.
   // We don't need to do that with block decls, though, because
   // blocks in default argument expression can never capture anything.
-  if (isa<ExprWithCleanups>(Param->getInit())) {
+  if (auto Init = dyn_cast<ExprWithCleanups>(Param->getInit())) {
     // Set the "needs cleanups" bit regardless of whether there are
     // any explicit objects.
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(Init->cleanupsHaveSideEffects());
 
     // Append all the objects to the cleanup list.  Right now, this
     // should always be a no-op, because blocks in default argument
     // expressions should never be able to capture anything.
-    assert(!cast<ExprWithCleanups>(Param->getInit())->getNumObjects() &&
+    assert(!Init->getNumObjects() &&
            "default argument expression has capturing blocks?");
   }
 
@@ -4876,6 +4991,9 @@
 
   switch (placeholder->getKind()) {
   // Ignore all the non-placeholder types.
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
 #define PLACEHOLDER_TYPE(ID, SINGLETON_ID)
 #define BUILTIN_TYPE(ID, SINGLETON_ID) case BuiltinType::ID:
 #include "clang/AST/BuiltinTypes.def"
@@ -4957,7 +5075,11 @@
   for (QualType ParamType : FT->param_types()) {
 
     // Convert array arguments to pointer to simplify type lookup.
-    Expr *Arg = Sema->DefaultFunctionArrayLvalueConversion(ArgExprs[i++]).get();
+    ExprResult ArgRes =
+        Sema->DefaultFunctionArrayLvalueConversion(ArgExprs[i++]);
+    if (ArgRes.isInvalid())
+      return nullptr;
+    Expr *Arg = ArgRes.get();
     QualType ArgType = Arg->getType();
     if (!ParamType->isPointerType() ||
         ParamType.getQualifiers().hasAddressSpace() ||
@@ -5013,45 +5135,41 @@
   return Callee->getMinRequiredArguments() <= NumArgs;
 }
 
-/// ActOnCallExpr - Handle a call to Fn with the specified array of arguments.
-/// This provides the location of the left/right parens and a list of comma
-/// locations.
-ExprResult
-Sema::ActOnCallExpr(Scope *S, Expr *Fn, SourceLocation LParenLoc,
-                    MultiExprArg ArgExprs, SourceLocation RParenLoc,
-                    Expr *ExecConfig, bool IsExecConfig) {
+static ExprResult ActOnCallExprImpl(Sema &S, Scope *Scope, Expr *Fn,
+                                    SourceLocation LParenLoc,
+                                    MultiExprArg ArgExprs,
+                                    SourceLocation RParenLoc, Expr *ExecConfig,
+                                    bool IsExecConfig) {
   // Since this might be a postfix expression, get rid of ParenListExprs.
-  ExprResult Result = MaybeConvertParenListExprToParenExpr(S, Fn);
+  ExprResult Result = S.MaybeConvertParenListExprToParenExpr(Scope, Fn);
   if (Result.isInvalid()) return ExprError();
   Fn = Result.get();
 
-  if (checkArgsForPlaceholders(*this, ArgExprs))
+  if (checkArgsForPlaceholders(S, ArgExprs))
     return ExprError();
 
-  if (getLangOpts().CPlusPlus) {
+  if (S.getLangOpts().CPlusPlus) {
     // If this is a pseudo-destructor expression, build the call immediately.
     if (isa<CXXPseudoDestructorExpr>(Fn)) {
       if (!ArgExprs.empty()) {
         // Pseudo-destructor calls should not have any arguments.
-        Diag(Fn->getLocStart(), diag::err_pseudo_dtor_call_with_args)
-          << FixItHint::CreateRemoval(
-                                    SourceRange(ArgExprs.front()->getLocStart(),
-                                                ArgExprs.back()->getLocEnd()));
+        S.Diag(Fn->getLocStart(), diag::err_pseudo_dtor_call_with_args)
+            << FixItHint::CreateRemoval(
+                   SourceRange(ArgExprs.front()->getLocStart(),
+                               ArgExprs.back()->getLocEnd()));
       }
 
-      return new (Context)
-          CallExpr(Context, Fn, None, Context.VoidTy, VK_RValue, RParenLoc);
+      return new (S.Context)
+          CallExpr(S.Context, Fn, None, S.Context.VoidTy, VK_RValue, RParenLoc);
     }
-    if (Fn->getType() == Context.PseudoObjectTy) {
-      ExprResult result = CheckPlaceholderExpr(Fn);
+    if (Fn->getType() == S.Context.PseudoObjectTy) {
+      ExprResult result = S.CheckPlaceholderExpr(Fn);
       if (result.isInvalid()) return ExprError();
       Fn = result.get();
     }
 
     // Determine whether this is a dependent call inside a C++ template,
     // in which case we won't do any semantic analysis now.
-    // FIXME: Will need to cache the results of name lookup (including ADL) in
-    // Fn.
     bool Dependent = false;
     if (Fn->isTypeDependent())
       Dependent = true;
@@ -5060,50 +5178,53 @@
 
     if (Dependent) {
       if (ExecConfig) {
-        return new (Context) CUDAKernelCallExpr(
-            Context, Fn, cast<CallExpr>(ExecConfig), ArgExprs,
-            Context.DependentTy, VK_RValue, RParenLoc);
+        return new (S.Context) CUDAKernelCallExpr(
+            S.Context, Fn, cast<CallExpr>(ExecConfig), ArgExprs,
+            S.Context.DependentTy, VK_RValue, RParenLoc);
       } else {
-        return new (Context) CallExpr(
-            Context, Fn, ArgExprs, Context.DependentTy, VK_RValue, RParenLoc);
+        return new (S.Context)
+            CallExpr(S.Context, Fn, ArgExprs, S.Context.DependentTy, VK_RValue,
+                     RParenLoc);
       }
     }
 
     // Determine whether this is a call to an object (C++ [over.call.object]).
     if (Fn->getType()->isRecordType())
-      return BuildCallToObjectOfClassType(S, Fn, LParenLoc, ArgExprs,
-                                          RParenLoc);
+      return S.BuildCallToObjectOfClassType(Scope, Fn, LParenLoc, ArgExprs,
+                                            RParenLoc);
 
-    if (Fn->getType() == Context.UnknownAnyTy) {
-      ExprResult result = rebuildUnknownAnyFunction(*this, Fn);
+    if (Fn->getType() == S.Context.UnknownAnyTy) {
+      ExprResult result = rebuildUnknownAnyFunction(S, Fn);
       if (result.isInvalid()) return ExprError();
       Fn = result.get();
     }
 
-    if (Fn->getType() == Context.BoundMemberTy) {
-      return BuildCallToMemberFunction(S, Fn, LParenLoc, ArgExprs, RParenLoc);
+    if (Fn->getType() == S.Context.BoundMemberTy) {
+      return S.BuildCallToMemberFunction(Scope, Fn, LParenLoc, ArgExprs,
+                                         RParenLoc);
     }
   }
 
   // Check for overloaded calls.  This can happen even in C due to extensions.
-  if (Fn->getType() == Context.OverloadTy) {
+  if (Fn->getType() == S.Context.OverloadTy) {
     OverloadExpr::FindResult find = OverloadExpr::find(Fn);
 
-    // We aren't supposed to apply this logic for if there's an '&' involved.
+    // We aren't supposed to apply this logic if there's an '&'
+    // involved.
     if (!find.HasFormOfMemberPointer) {
       OverloadExpr *ovl = find.Expression;
       if (UnresolvedLookupExpr *ULE = dyn_cast<UnresolvedLookupExpr>(ovl))
-        return BuildOverloadedCallExpr(S, Fn, ULE, LParenLoc, ArgExprs,
-                                       RParenLoc, ExecConfig,
-                                       /*AllowTypoCorrection=*/true,
-                                       find.IsAddressOfOperand);
-      return BuildCallToMemberFunction(S, Fn, LParenLoc, ArgExprs, RParenLoc);
+        return S.BuildOverloadedCallExpr(
+            Scope, Fn, ULE, LParenLoc, ArgExprs, RParenLoc, ExecConfig,
+            /*AllowTypoCorrection=*/true, find.IsAddressOfOperand);
+      return S.BuildCallToMemberFunction(Scope, Fn, LParenLoc, ArgExprs,
+                                         RParenLoc);
     }
   }
 
   // If we're directly calling a function, get the appropriate declaration.
-  if (Fn->getType() == Context.UnknownAnyTy) {
-    ExprResult result = rebuildUnknownAnyFunction(*this, Fn);
+  if (Fn->getType() == S.Context.UnknownAnyTy) {
+    ExprResult result = rebuildUnknownAnyFunction(S, Fn);
     if (result.isInvalid()) return ExprError();
     Fn = result.get();
   }
@@ -5127,12 +5248,12 @@
       // Rewrite the function decl for this builtin by replacing parameters
       // with no explicit address space with the address space of the arguments
       // in ArgExprs.
-      if ((FDecl = rewriteBuiltinFunctionDecl(this, Context, FDecl, ArgExprs))) {
+      if ((FDecl =
+               rewriteBuiltinFunctionDecl(&S, S.Context, FDecl, ArgExprs))) {
         NDecl = FDecl;
-        Fn = DeclRefExpr::Create(Context, FDecl->getQualifierLoc(),
-                           SourceLocation(), FDecl, false,
-                           SourceLocation(), FDecl->getType(),
-                           Fn->getValueKind(), FDecl);
+        Fn = DeclRefExpr::Create(
+            S.Context, FDecl->getQualifierLoc(), SourceLocation(), FDecl, false,
+            SourceLocation(), FDecl->getType(), Fn->getValueKind(), FDecl);
       }
     }
   } else if (isa<MemberExpr>(NakedFn))
@@ -5140,8 +5261,8 @@
 
   if (FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(NDecl)) {
     if (CallingNDeclIndirectly &&
-        !checkAddressOfFunctionIsAvailable(FD, /*Complain=*/true,
-                                           Fn->getLocStart()))
+        !S.checkAddressOfFunctionIsAvailable(FD, /*Complain=*/true,
+                                             Fn->getLocStart()))
       return ExprError();
 
     // CheckEnableIf assumes that the we're passing in a sane number of args for
@@ -5151,22 +5272,42 @@
     // number of args looks incorrect, don't do enable_if checks; we should've
     // already emitted an error about the bad call.
     if (FD->hasAttr<EnableIfAttr>() &&
-        isNumberOfArgsValidForCall(*this, FD, ArgExprs.size())) {
-      if (const EnableIfAttr *Attr = CheckEnableIf(FD, ArgExprs, true)) {
-        Diag(Fn->getLocStart(),
-             isa<CXXMethodDecl>(FD) ?
-                 diag::err_ovl_no_viable_member_function_in_call :
-                 diag::err_ovl_no_viable_function_in_call)
-          << FD << FD->getSourceRange();
-        Diag(FD->getLocation(),
-             diag::note_ovl_candidate_disabled_by_enable_if_attr)
+        isNumberOfArgsValidForCall(S, FD, ArgExprs.size())) {
+      if (const EnableIfAttr *Attr = S.CheckEnableIf(FD, ArgExprs, true)) {
+        S.Diag(Fn->getLocStart(),
+               isa<CXXMethodDecl>(FD)
+                   ? diag::err_ovl_no_viable_member_function_in_call
+                   : diag::err_ovl_no_viable_function_in_call)
+            << FD << FD->getSourceRange();
+        S.Diag(FD->getLocation(),
+               diag::note_ovl_candidate_disabled_by_enable_if_attr)
             << Attr->getCond()->getSourceRange() << Attr->getMessage();
       }
     }
   }
 
-  return BuildResolvedCallExpr(Fn, NDecl, LParenLoc, ArgExprs, RParenLoc,
-                               ExecConfig, IsExecConfig);
+  return S.BuildResolvedCallExpr(Fn, NDecl, LParenLoc, ArgExprs, RParenLoc,
+                                 ExecConfig, IsExecConfig);
+}
+
+/// ActOnCallExpr - Handle a call to Fn with the specified array of arguments.
+/// This provides the location of the left/right parens and a list of comma
+/// locations.
+ExprResult Sema::ActOnCallExpr(Scope *S, Expr *Fn, SourceLocation LParenLoc,
+                               MultiExprArg ArgExprs, SourceLocation RParenLoc,
+                               Expr *ExecConfig, bool IsExecConfig) {
+  ExprResult Ret = ActOnCallExprImpl(*this, S, Fn, LParenLoc, ArgExprs,
+                                     RParenLoc, ExecConfig, IsExecConfig);
+
+  // If appropriate, check that this is a valid CUDA call (and emit an error if
+  // the call is not allowed).
+  if (getLangOpts().CUDA && Ret.isUsable())
+    if (auto *Call = dyn_cast<CallExpr>(Ret.get()))
+      if (auto *FD = Call->getDirectCallee())
+        if (!CheckCUDACall(Call->getLocStart(), FD))
+          return ExprError();
+
+  return Ret;
 }
 
 /// ActOnAsTypeExpr - create a new asType (bitcast) from the arguments.
@@ -5505,7 +5646,7 @@
   E = ImplicitCastExpr::Create(Context, E.get()->getType(),
                                CK_ARCExtendBlockObject, E.get(),
                                /*base path*/ nullptr, VK_RValue);
-  ExprNeedsCleanups = true;
+  Cleanup.setExprNeedsCleanups(true);
 }
 
 /// Prepare a conversion of the given expression to an ObjC object
@@ -5908,7 +6049,9 @@
   CheckTollFreeBridgeCast(castType, CastExpr);
   
   CheckObjCBridgeRelatedCast(castType, CastExpr);
-  
+
+  DiscardMisalignedMemberAddress(castType.getTypePtr(), CastExpr);
+
   return BuildCStyleCastExpr(LParenLoc, castTInfo, RParenLoc, CastExpr);
 }
 
@@ -6153,30 +6296,87 @@
   lhptee = S.Context.getQualifiedType(lhptee.getUnqualifiedType(), lhQual);
   rhptee = S.Context.getQualifiedType(rhptee.getUnqualifiedType(), rhQual);
 
+  // For OpenCL:
+  // 1. If LHS and RHS types match exactly and:
+  //  (a) AS match => use standard C rules, no bitcast or addrspacecast
+  //  (b) AS overlap => generate addrspacecast
+  //  (c) AS don't overlap => give an error
+  // 2. if LHS and RHS types don't match:
+  //  (a) AS match => use standard C rules, generate bitcast
+  //  (b) AS overlap => generate addrspacecast instead of bitcast
+  //  (c) AS don't overlap => give an error
+
+  // For OpenCL, non-null composite type is returned only for cases 1a and 1b.
   QualType CompositeTy = S.Context.mergeTypes(lhptee, rhptee);
 
+  // OpenCL cases 1c, 2a, 2b, and 2c.
   if (CompositeTy.isNull()) {
-    S.Diag(Loc, diag::ext_typecheck_cond_incompatible_pointers)
-      << LHSTy << RHSTy << LHS.get()->getSourceRange()
-      << RHS.get()->getSourceRange();
     // In this situation, we assume void* type. No especially good
     // reason, but this is what gcc does, and we do have to pick
     // to get a consistent AST.
-    QualType incompatTy = S.Context.getPointerType(S.Context.VoidTy);
-    LHS = S.ImpCastExprToType(LHS.get(), incompatTy, CK_BitCast);
-    RHS = S.ImpCastExprToType(RHS.get(), incompatTy, CK_BitCast);
+    QualType incompatTy;
+    if (S.getLangOpts().OpenCL) {
+      // OpenCL v1.1 s6.5 - Conversion between pointers to distinct address
+      // spaces is disallowed.
+      unsigned ResultAddrSpace;
+      if (lhQual.isAddressSpaceSupersetOf(rhQual)) {
+        // Cases 2a and 2b.
+        ResultAddrSpace = lhQual.getAddressSpace();
+      } else if (rhQual.isAddressSpaceSupersetOf(lhQual)) {
+        // Cases 2a and 2b.
+        ResultAddrSpace = rhQual.getAddressSpace();
+      } else {
+        // Cases 1c and 2c.
+        S.Diag(Loc,
+               diag::err_typecheck_op_on_nonoverlapping_address_space_pointers)
+            << LHSTy << RHSTy << 2 << LHS.get()->getSourceRange()
+            << RHS.get()->getSourceRange();
+        return QualType();
+      }
+
+      // Continue handling cases 2a and 2b.
+      incompatTy = S.Context.getPointerType(
+          S.Context.getAddrSpaceQualType(S.Context.VoidTy, ResultAddrSpace));
+      LHS = S.ImpCastExprToType(LHS.get(), incompatTy,
+                                (lhQual.getAddressSpace() != ResultAddrSpace)
+                                    ? CK_AddressSpaceConversion /* 2b */
+                                    : CK_BitCast /* 2a */);
+      RHS = S.ImpCastExprToType(RHS.get(), incompatTy,
+                                (rhQual.getAddressSpace() != ResultAddrSpace)
+                                    ? CK_AddressSpaceConversion /* 2b */
+                                    : CK_BitCast /* 2a */);
+    } else {
+      S.Diag(Loc, diag::ext_typecheck_cond_incompatible_pointers)
+          << LHSTy << RHSTy << LHS.get()->getSourceRange()
+          << RHS.get()->getSourceRange();
+      incompatTy = S.Context.getPointerType(S.Context.VoidTy);
+      LHS = S.ImpCastExprToType(LHS.get(), incompatTy, CK_BitCast);
+      RHS = S.ImpCastExprToType(RHS.get(), incompatTy, CK_BitCast);
+    }
     return incompatTy;
   }
 
   // The pointer types are compatible.
   QualType ResultTy = CompositeTy.withCVRQualifiers(MergedCVRQual);
+  auto LHSCastKind = CK_BitCast, RHSCastKind = CK_BitCast;
   if (IsBlockPointer)
     ResultTy = S.Context.getBlockPointerType(ResultTy);
-  else
+  else {
+    // Cases 1a and 1b for OpenCL.
+    auto ResultAddrSpace = ResultTy.getQualifiers().getAddressSpace();
+    LHSCastKind = lhQual.getAddressSpace() == ResultAddrSpace
+                      ? CK_BitCast /* 1a */
+                      : CK_AddressSpaceConversion /* 1b */;
+    RHSCastKind = rhQual.getAddressSpace() == ResultAddrSpace
+                      ? CK_BitCast /* 1a */
+                      : CK_AddressSpaceConversion /* 1b */;
     ResultTy = S.Context.getPointerType(ResultTy);
+  }
 
-  LHS = S.ImpCastExprToType(LHS.get(), ResultTy, CK_BitCast);
-  RHS = S.ImpCastExprToType(RHS.get(), ResultTy, CK_BitCast);
+  // For case 1a of OpenCL, S.ImpCastExprToType will not insert bitcast
+  // if the target type does not change.
+  LHS = S.ImpCastExprToType(LHS.get(), ResultTy, LHSCastKind);
+  RHS = S.ImpCastExprToType(RHS.get(), ResultTy, RHSCastKind);
   return ResultTy;
 }
 
@@ -6444,6 +6644,18 @@
   return OpenCLConvertScalarsToVectors(S, LHS, RHS, CondTy, QuestionLoc);
 }
 
+/// \brief Diagnose and return true if the Expr is a call to a block.
+static bool checkBlockType(Sema &S, const Expr *E) {
+  if (const CallExpr *CE = dyn_cast<CallExpr>(E)) {
+    QualType Ty = CE->getCallee()->getType();
+    if (Ty->isBlockPointerType()) {
+      S.Diag(E->getExprLoc(), diag::err_opencl_ternary_with_block);
+      return true;
+    }
+  }
+  return false;
+}
+
 /// Note that LHS is not null here, even if this is the gnu "x ?: y" extension.
 /// In that case, LHS = cond.
 /// C99 6.5.15
@@ -6493,6 +6705,22 @@
   QualType LHSTy = LHS.get()->getType();
   QualType RHSTy = RHS.get()->getType();
 
+  // Diagnose attempts to convert between __float128 and long double where
+  // such conversions currently can't be handled.
+  if (unsupportedTypeConversion(*this, LHSTy, RHSTy)) {
+    Diag(QuestionLoc,
+         diag::err_typecheck_cond_incompatible_operands) << LHSTy << RHSTy
+      << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
+    return QualType();
+  }
+
+  // OpenCL v2.0 s6.12.5 - Blocks cannot be used as expressions of the ternary
+  // selection operator (?:).
+  if (getLangOpts().OpenCL &&
+      (checkBlockType(*this, LHS.get()) | checkBlockType(*this, RHS.get()))) {
+    return QualType();
+  }
+
   // If both operands have arithmetic type, do the usual arithmetic conversions
   // to find a common type: C99 6.5.15p3,5.
   if (LHSTy->isArithmeticType() && RHSTy->isArithmeticType()) {
@@ -6824,6 +7052,55 @@
     SourceRange(CondRHS->getLocStart(), RHSExpr->getLocEnd()));
 }
 
+/// Compute the nullability of a conditional expression.
+static QualType computeConditionalNullability(QualType ResTy, bool IsBin,
+                                              QualType LHSTy, QualType RHSTy,
+                                              ASTContext &Ctx) {
+  if (!ResTy->isAnyPointerType())
+    return ResTy;
+
+  auto GetNullability = [&Ctx](QualType Ty) {
+    Optional<NullabilityKind> Kind = Ty->getNullability(Ctx);
+    if (Kind)
+      return *Kind;
+    return NullabilityKind::Unspecified;
+  };
+
+  auto LHSKind = GetNullability(LHSTy), RHSKind = GetNullability(RHSTy);
+  NullabilityKind MergedKind;
+
+  // Compute nullability of a binary conditional expression.
+  if (IsBin) {
+    if (LHSKind == NullabilityKind::NonNull)
+      MergedKind = NullabilityKind::NonNull;
+    else
+      MergedKind = RHSKind;
+  // Compute nullability of a normal conditional expression.
+  } else {
+    if (LHSKind == NullabilityKind::Nullable ||
+        RHSKind == NullabilityKind::Nullable)
+      MergedKind = NullabilityKind::Nullable;
+    else if (LHSKind == NullabilityKind::NonNull)
+      MergedKind = RHSKind;
+    else if (RHSKind == NullabilityKind::NonNull)
+      MergedKind = LHSKind;
+    else
+      MergedKind = NullabilityKind::Unspecified;
+  }
+
+  // Return if ResTy already has the correct nullability.
+  if (GetNullability(ResTy) == MergedKind)
+    return ResTy;
+
+  // Strip all nullability from ResTy.
+  while (ResTy->getNullability(Ctx))
+    ResTy = ResTy.getSingleStepDesugaredType(Ctx);
+
+  // Create a new AttributedType with the new nullability kind.
+  auto NewAttr = AttributedType::getNullabilityAttrKind(MergedKind);
+  return Ctx.getAttributedType(NewAttr, ResTy, ResTy);
+}
+
 /// ActOnConditionalOp - Parse a ?: operation.  Note that 'LHS' may be null
 /// in the case of a the GNU conditional expr extension.
 ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc,
@@ -6835,8 +7112,23 @@
     // doesn't handle dependent types properly, so make sure any TypoExprs have
     // been dealt with before checking the operands.
     ExprResult CondResult = CorrectDelayedTyposInExpr(CondExpr);
-    if (!CondResult.isUsable()) return ExprError();
+    ExprResult LHSResult = CorrectDelayedTyposInExpr(LHSExpr);
+    ExprResult RHSResult = CorrectDelayedTyposInExpr(RHSExpr);
+
+    if (!CondResult.isUsable())
+      return ExprError();
+
+    if (LHSExpr) {
+      if (!LHSResult.isUsable())
+        return ExprError();
+    }
+
+    if (!RHSResult.isUsable())
+      return ExprError();
+
     CondExpr = CondResult.get();
+    LHSExpr = LHSResult.get();
+    RHSExpr = RHSResult.get();
   }
 
   // If this is the gnu "x ?: y" extension, analyze the types as though the LHS
@@ -6876,6 +7168,7 @@
     LHSExpr = CondExpr = opaqueValue;
   }
 
+  QualType LHSTy = LHSExpr->getType(), RHSTy = RHSExpr->getType();
   ExprValueKind VK = VK_RValue;
   ExprObjectKind OK = OK_Ordinary;
   ExprResult Cond = CondExpr, LHS = LHSExpr, RHS = RHSExpr;
@@ -6890,6 +7183,9 @@
 
   CheckBoolLikeConversion(Cond.get(), QuestionLoc);
 
+  result = computeConditionalNullability(result, commonExpr, LHSTy, RHSTy,
+                                         Context);
+
   if (!commonExpr)
     return new (Context)
         ConditionalOperator(Cond.get(), QuestionLoc, LHS.get(), ColonLoc,
@@ -6949,7 +7245,7 @@
     else if (lhq.getObjCLifetime() != rhq.getObjCLifetime())
       ConvTy = Sema::IncompatiblePointerDiscardsQualifiers;
     
-    // For GCC compatibility, other qualifier mismatches are treated
+    // For GCC/MS compatibility, other qualifier mismatches are treated
     // as still compatible in C.
     else ConvTy = Sema::CompatiblePointerDiscardsQualifiers;
   }
@@ -7201,9 +7497,30 @@
         return IncompatibleVectors;
       }
     }
+
+    // When the RHS comes from another lax conversion (e.g. binops between
+    // scalars and vectors) the result is canonicalized as a vector. When the
+    // LHS is also a vector, the lax conversion is allowed by the condition
+    // above. Handle the case where LHS is a scalar.
+    if (LHSType->isScalarType()) {
+      const VectorType *VecType = RHSType->getAs<VectorType>();
+      if (VecType && VecType->getNumElements() == 1 &&
+          isLaxVectorConversion(RHSType, LHSType)) {
+        ExprResult *VecExpr = &RHS;
+        *VecExpr = ImpCastExprToType(VecExpr->get(), LHSType, CK_BitCast);
+        Kind = CK_BitCast;
+        return Compatible;
+      }
+    }
+
     return Incompatible;
   }
 
+  // Diagnose attempts to convert between __float128 and long double where
+  // such conversions currently can't be handled.
+  if (unsupportedTypeConversion(*this, LHSType, RHSType))
+    return Incompatible;
+
   // Arithmetic conversions.
   if (LHSType->isArithmeticType() && RHSType->isArithmeticType() &&
       !(getLangOpts().CPlusPlus && LHSType->isEnumeralType())) {
@@ -7384,6 +7701,11 @@
     }
   }
 
+  if (LHSType->isSamplerT() && RHSType->isIntegerType()) {
+    Kind = CK_IntToOCLSampler;
+    return Compatible;
+  }
+
   return Incompatible;
 }
 
@@ -7735,14 +8057,16 @@
       return RHSType;
   }
 
-  // If we're allowing lax vector conversions, only the total (data) size
-  // needs to be the same.
-  // FIXME: Should we really be allowing this?
-  // FIXME: We really just pick the LHS type arbitrarily?
-  if (isLaxVectorConversion(RHSType, LHSType)) {
-    QualType resultType = LHSType;
-    RHS = ImpCastExprToType(RHS.get(), resultType, CK_BitCast);
-    return resultType;
+  // If we're allowing lax vector conversions, only the total (data) size needs
+  // to be the same. If one of the types is scalar, the result is always the
+  // vector type. Don't allow this if the scalar operand is an lvalue.
+  QualType VecType = LHSVecType ? LHSType : RHSType;
+  QualType ScalarType = LHSVecType ? RHSType : LHSType;
+  ExprResult *ScalarExpr = LHSVecType ? &RHS : &LHS;
+  if (isLaxVectorConversion(ScalarType, VecType) &&
+      !ScalarExpr->get()->isLValue()) {
+    *ScalarExpr = ImpCastExprToType(ScalarExpr->get(), VecType, CK_BitCast);
+    return VecType;
   }
 
   // Okay, the expression is invalid.
@@ -8351,7 +8675,7 @@
 
   // If LHS does not have a signed type and non-negative value
   // then, the behavior is undefined. Warn about it.
-  if (Left.isNegative()) {
+  if (Left.isNegative() && !S.getLangOpts().isSignedOverflowDefined()) {
     S.DiagRuntimeBehavior(Loc, LHS.get(),
                           S.PDiag(diag::warn_shift_lhs_negative)
                             << LHS.get()->getSourceRange());
@@ -8387,11 +8711,10 @@
     << RHS.get()->getSourceRange();
 }
 
-/// \brief Return the resulting type when an OpenCL vector is shifted
+/// \brief Return the resulting type when a vector is shifted
 ///        by a scalar or vector shift amount.
-static QualType checkOpenCLVectorShift(Sema &S,
-                                       ExprResult &LHS, ExprResult &RHS,
-                                       SourceLocation Loc, bool IsCompAssign) {
+static QualType checkVectorShift(Sema &S, ExprResult &LHS, ExprResult &RHS,
+                                 SourceLocation Loc, bool IsCompAssign) {
   // OpenCL v1.1 s6.3.j says RHS can be a vector only if LHS is a vector.
   if (!LHS.get()->getType()->isVectorType()) {
     S.Diag(Loc, diag::err_shift_rhs_only_vector)
@@ -8459,11 +8782,9 @@
   // Vector shifts promote their scalar inputs to vector type.
   if (LHS.get()->getType()->isVectorType() ||
       RHS.get()->getType()->isVectorType()) {
-    if (LangOpts.OpenCL)
-      return checkOpenCLVectorShift(*this, LHS, RHS, Loc, IsCompAssign);
     if (LangOpts.ZVector) {
       // The shift operators for the z vector extensions work basically
-      // like OpenCL shifts, except that neither the LHS nor the RHS is
+      // like general shifts, except that neither the LHS nor the RHS is
       // allowed to be a "vector bool".
       if (auto LHSVecType = LHS.get()->getType()->getAs<VectorType>())
         if (LHSVecType->getVectorKind() == VectorType::AltiVecBool)
@@ -8471,11 +8792,8 @@
       if (auto RHSVecType = RHS.get()->getType()->getAs<VectorType>())
         if (RHSVecType->getVectorKind() == VectorType::AltiVecBool)
           return InvalidOperands(Loc, LHS, RHS);
-      return checkOpenCLVectorShift(*this, LHS, RHS, Loc, IsCompAssign);
     }
-    return CheckVectorOperands(LHS, RHS, Loc, IsCompAssign,
-                               /*AllowBothBool*/true,
-                               /*AllowBoolConversions*/false);
+    return checkVectorShift(*this, LHS, RHS, Loc, IsCompAssign);
   }
 
   // Shifts don't perform usual arithmetic conversions, they just do integer
@@ -9286,7 +9604,7 @@
   }
   
   // Return a signed type for the vector.
-  return GetSignedVectorType(LHSType);
+  return GetSignedVectorType(vType);
 }
 
 QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
@@ -9453,7 +9771,16 @@
 
   // Decide whether the first capture was for a block or a lambda.
   DeclContext *DC = S.CurContext, *Prev = nullptr;
-  while (DC != var->getDeclContext()) {
+  // Walk up the enclosing contexts until we reach the one that declares var.
+  while (DC) {
+    // For init-capture, it is possible that the variable belongs to the
+    // template pattern of the current context.
+    if (auto *FD = dyn_cast<FunctionDecl>(DC))
+      if (var->isInitCapture() &&
+          FD->getTemplateInstantiationPattern() == var->getDeclContext())
+        break;
+    if (DC == var->getDeclContext())
+      break;
     Prev = DC;
     DC = DC->getParent();
   }
@@ -9600,6 +9927,9 @@
 /// emit an error and return true.  If so, return false.
 static bool CheckForModifiableLvalue(Expr *E, SourceLocation Loc, Sema &S) {
   assert(!E->hasPlaceholderType(BuiltinType::PseudoObject));
+
+  S.CheckShadowingDeclModification(E, Loc);
+
   SourceLocation OrigLoc = Loc;
   Expr::isModifiableLvalueResult IsLV = E->isModifiableLvalue(S.Context,
                                                               &Loc);
@@ -9840,6 +10170,67 @@
           ? LHSType : LHSType.getUnqualifiedType());
 }
 
+// Only ignore explicit casts to void.
+static bool IgnoreCommaOperand(const Expr *E) {
+  E = E->IgnoreParens();
+
+  if (const CastExpr *CE = dyn_cast<CastExpr>(E)) {
+    if (CE->getCastKind() == CK_ToVoid) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Look for instances where it is likely the comma operator is confused with
+// another operator.  There is a whitelist of acceptable expressions for the
+// left hand side of the comma operator, otherwise emit a warning.
+void Sema::DiagnoseCommaOperator(const Expr *LHS, SourceLocation Loc) {
+  // No warnings in macros
+  if (Loc.isMacroID())
+    return;
+
+  // Don't warn in template instantiations.
+  if (!ActiveTemplateInstantiations.empty())
+    return;
+
+  // Scope isn't fine-grained enough to whitelist the specific cases, so
+  // instead, skip more than needed, then call back into here with the
+  // CommaVisitor in SemaStmt.cpp.
+  // The whitelisted locations are the initialization and increment portions
+  // of a for loop.  The additional checks are on the condition of
+  // if statements, do/while loops, and for loops.
+  const unsigned ForIncrementFlags =
+      Scope::ControlScope | Scope::ContinueScope | Scope::BreakScope;
+  const unsigned ForInitFlags = Scope::ControlScope | Scope::DeclScope;
+  const unsigned ScopeFlags = getCurScope()->getFlags();
+  if ((ScopeFlags & ForIncrementFlags) == ForIncrementFlags ||
+      (ScopeFlags & ForInitFlags) == ForInitFlags)
+    return;
+
+  // If there are multiple comma operators used together, get the RHS of the
+  // comma operator as the LHS.
+  while (const BinaryOperator *BO = dyn_cast<BinaryOperator>(LHS)) {
+    if (BO->getOpcode() != BO_Comma)
+      break;
+    LHS = BO->getRHS();
+  }
+
+  // Only allow some expressions on LHS to not warn.
+  if (IgnoreCommaOperand(LHS))
+    return;
+
+  Diag(Loc, diag::warn_comma_operator);
+  Diag(LHS->getLocStart(), diag::note_cast_to_void)
+      << LHS->getSourceRange()
+      << FixItHint::CreateInsertion(LHS->getLocStart(),
+                                    LangOpts.CPlusPlus ? "static_cast<void>("
+                                                       : "(void)(")
+      << FixItHint::CreateInsertion(PP.getLocForEndOfToken(LHS->getLocEnd()),
+                                    ")");
+}
+
 // C99 6.5.17
 static QualType CheckCommaOperands(Sema &S, ExprResult &LHS, ExprResult &RHS,
                                    SourceLocation Loc) {
@@ -9869,6 +10260,9 @@
                             diag::err_incomplete_type);
   }
 
+  if (!S.getDiagnostics().isIgnored(diag::warn_comma_operator, Loc))
+    S.DiagnoseCommaOperator(LHS.get(), Loc);
+
   return RHS.get()->getType();
 }
 
@@ -10117,8 +10511,8 @@
     if (sfinae)
       return QualType();
     // Materialize the temporary as an lvalue so that we can take its address.
-    OrigOp = op = new (Context)
-        MaterializeTemporaryExpr(op->getType(), OrigOp.get(), true);
+    OrigOp = op =
+        CreateMaterializeTemporaryExpr(op->getType(), OrigOp.get(), true);
   } else if (isa<ObjCSelectorExpr>(op)) {
     return Context.getPointerType(op->getType());
   } else if (lval == Expr::LV_MemberFunction) {
@@ -10222,7 +10616,8 @@
           return MPTy;
         }
       }
-    } else if (!isa<FunctionDecl>(dcl) && !isa<NonTypeTemplateParmDecl>(dcl))
+    } else if (!isa<FunctionDecl>(dcl) && !isa<NonTypeTemplateParmDecl>(dcl) &&
+               !isa<BindingDecl>(dcl))
       llvm_unreachable("Unknown/unexpected decl type");
   }
 
@@ -10241,6 +10636,9 @@
   // If the operand has type "type", the result has type "pointer to type".
   if (op->getType()->isObjCObjectType())
     return Context.getObjCObjectPointerType(op->getType());
+
+  CheckAddressOfPackedMember(op);
+
   return Context.getPointerType(op->getType());
 }
 
@@ -10282,7 +10680,9 @@
   }
 
   if (const PointerType *PT = OpTy->getAs<PointerType>())
+  {
     Result = PT->getPointeeType();
+  }
   else if (const ObjCObjectPointerType *OPT =
              OpTy->getAs<ObjCObjectPointerType>())
     Result = OPT->getPointeeType();
@@ -10520,10 +10920,11 @@
   }
 
   if (getLangOpts().OpenCL) {
+    QualType LHSTy = LHSExpr->getType();
+    QualType RHSTy = RHSExpr->getType();
     // OpenCLC v2.0 s6.13.11.1 allows atomic variables to be initialized by
     // the ATOMIC_VAR_INIT macro.
-    if (LHSExpr->getType()->isAtomicType() ||
-        RHSExpr->getType()->isAtomicType()) {
+    if (LHSTy->isAtomicType() || RHSTy->isAtomicType()) {
       SourceRange SR(LHSExpr->getLocStart(), RHSExpr->getLocEnd());
       if (BO_Assign == Opc)
         Diag(OpLoc, diag::err_atomic_init_constant) << SR;
@@ -10531,6 +10932,16 @@
         ResultTy = InvalidOperands(OpLoc, LHS, RHS);
       return ExprError();
     }
+
+    // OpenCL special types - image, sampler, pipe, and blocks are to be used
+    // only with builtin functions and therefore should be disallowed here.
+    if (LHSTy->isImageType() || RHSTy->isImageType() ||
+        LHSTy->isSamplerT() || RHSTy->isSamplerT() ||
+        LHSTy->isPipeType() || RHSTy->isPipeType() ||
+        LHSTy->isBlockPointerType() || RHSTy->isBlockPointerType()) {
+      ResultTy = InvalidOperands(OpLoc, LHS, RHS);
+      return ExprError();
+    }
   }
 
   switch (Opc) {
@@ -11001,8 +11412,13 @@
   ExprObjectKind OK = OK_Ordinary;
   QualType resultType;
   if (getLangOpts().OpenCL) {
+    QualType Ty = InputExpr->getType();
     // The only legal unary operation for atomics is '&'.
-    if (Opc != UO_AddrOf && InputExpr->getType()->isAtomicType()) {
+    if ((Opc != UO_AddrOf && Ty->isAtomicType()) ||
+    // OpenCL special types - image, sampler, pipe, and blocks are to be used
+    // only with builtin functions and therefore should be disallowed here.
+        (Ty->isImageType() || Ty->isSamplerT() || Ty->isPipeType()
+        || Ty->isBlockPointerType())) {
       return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
                        << InputExpr->getType()
                        << Input.get()->getSourceRange());
@@ -11315,7 +11731,8 @@
 
   if (hasAnyUnrecoverableErrorsInThisFunction())
     DiscardCleanupsInEvaluationContext();
-  assert(!ExprNeedsCleanups && "cleanups within StmtExpr not correctly bound!");
+  assert(!Cleanup.exprNeedsCleanups() &&
+         "cleanups within StmtExpr not correctly bound!");
   PopExpressionEvaluationContext();
 
   // FIXME: there are a variety of strange constraints to enforce here, for
@@ -11739,8 +12156,7 @@
   // Set the parameters on the block decl.
   if (!Params.empty()) {
     CurBlock->TheDecl->setParams(Params);
-    CheckParmsForFunctionDef(CurBlock->TheDecl->param_begin(),
-                             CurBlock->TheDecl->param_end(),
+    CheckParmsForFunctionDef(CurBlock->TheDecl->parameters(),
                              /*CheckParameterNames=*/false);
   }
   
@@ -11748,7 +12164,7 @@
   ProcessDeclAttributes(CurScope, CurBlock->TheDecl, ParamInfo);
 
   // Put the parameter variables in scope.
-  for (auto AI : CurBlock->TheDecl->params()) {
+  for (auto AI : CurBlock->TheDecl->parameters()) {
     AI->setOwningFunction(CurBlock->TheDecl);
 
     // If this has an identifier, add it to the scope stack.
@@ -11778,12 +12194,13 @@
                                     Stmt *Body, Scope *CurScope) {
   // If blocks are disabled, emit an error.
   if (!LangOpts.Blocks)
-    Diag(CaretLoc, diag::err_blocks_disable);
+    Diag(CaretLoc, diag::err_blocks_disable) << LangOpts.OpenCL;
 
   // Leave the expression-evaluation context.
   if (hasAnyUnrecoverableErrorsInThisFunction())
     DiscardCleanupsInEvaluationContext();
-  assert(!ExprNeedsCleanups && "cleanups within block not correctly bound!");
+  assert(!Cleanup.exprNeedsCleanups() &&
+         "cleanups within block not correctly bound!");
   PopExpressionEvaluationContext();
 
   BlockScopeInfo *BSI = cast<BlockScopeInfo>(FunctionScopes.back());
@@ -11847,8 +12264,7 @@
     BlockTy = Context.getFunctionType(RetTy, None, EPI);
   }
 
-  DiagnoseUnusedParameters(BSI->TheDecl->param_begin(),
-                           BSI->TheDecl->param_end());
+  DiagnoseUnusedParameters(BSI->TheDecl->parameters());
   BlockTy = Context.getBlockPointerType(BlockTy);
 
   // If needed, diagnose invalid gotos and switches in the block.
@@ -11874,7 +12290,7 @@
   if (Result->getBlockDecl()->hasCaptures()) {
     // First, this expression has a new cleanup object.
     ExprCleanupObjects.push_back(Result->getBlockDecl());
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
 
     // It also gets a branch-protected scope if any of the captured
     // variables needs destruction.
@@ -11890,9 +12306,8 @@
   return Result;
 }
 
-ExprResult Sema::ActOnVAArg(SourceLocation BuiltinLoc,
-                                        Expr *E, ParsedType Ty,
-                                        SourceLocation RPLoc) {
+ExprResult Sema::ActOnVAArg(SourceLocation BuiltinLoc, Expr *E, ParsedType Ty,
+                            SourceLocation RPLoc) {
   TypeSourceInfo *TInfo;
   GetTypeFromParser(Ty, &TInfo);
   return BuildVAArgExpr(BuiltinLoc, E, TInfo, RPLoc);
@@ -11904,6 +12319,15 @@
   Expr *OrigExpr = E;
   bool IsMS = false;
 
+  // CUDA device code does not support varargs.
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
+    if (const FunctionDecl *F = dyn_cast<FunctionDecl>(CurContext)) {
+      CUDAFunctionTarget T = IdentifyCUDATarget(F);
+      if (T == CFT_Global || T == CFT_Device || T == CFT_HostDevice)
+        return ExprError(Diag(E->getLocStart(), diag::err_va_arg_in_device));
+    }
+  }
+
   // It might be a __builtin_ms_va_list. (But don't ever mark a va_arg()
   // as Microsoft ABI on an actual Microsoft platform, where
   // __builtin_ms_va_list and __builtin_va_list are the same.)
@@ -12104,10 +12528,14 @@
     MayHaveConvFixit = true;
     break;
   case IncompatiblePointer:
-      DiagKind =
-        (Action == AA_Passing_CFAudited ?
-          diag::err_arc_typecheck_convert_incompatible_pointer :
-          diag::ext_typecheck_convert_incompatible_pointer);
+    if (Action == AA_Passing_CFAudited)
+      DiagKind = diag::err_arc_typecheck_convert_incompatible_pointer;
+    else if (SrcType->isFunctionPointerType() &&
+             DstType->isFunctionPointerType())
+      DiagKind = diag::ext_typecheck_convert_incompatible_function_pointer;
+    else
+      DiagKind = diag::ext_typecheck_convert_incompatible_pointer;
+
     CheckInferredResultType = DstType->isObjCObjectPointerType() &&
       SrcType->isObjCObjectPointerType();
     if (Hint.isNull() && !CheckInferredResultType) {
@@ -12503,10 +12931,9 @@
 Sema::PushExpressionEvaluationContext(ExpressionEvaluationContext NewContext,
                                       Decl *LambdaContextDecl,
                                       bool IsDecltype) {
-  ExprEvalContexts.emplace_back(NewContext, ExprCleanupObjects.size(),
-                                ExprNeedsCleanups, LambdaContextDecl,
-                                IsDecltype);
-  ExprNeedsCleanups = false;
+  ExprEvalContexts.emplace_back(NewContext, ExprCleanupObjects.size(), Cleanup,
+                                LambdaContextDecl, IsDecltype);
+  Cleanup.reset();
   if (!MaybeODRUseExprs.empty())
     std::swap(MaybeODRUseExprs, ExprEvalContexts.back().SavedMaybeODRUseExprs);
 }
@@ -12557,12 +12984,12 @@
   if (Rec.isUnevaluated() || Rec.Context == ConstantEvaluated) {
     ExprCleanupObjects.erase(ExprCleanupObjects.begin() + Rec.NumCleanupObjects,
                              ExprCleanupObjects.end());
-    ExprNeedsCleanups = Rec.ParentNeedsCleanups;
+    Cleanup = Rec.ParentCleanup;
     CleanupVarDeclMarking();
     std::swap(MaybeODRUseExprs, Rec.SavedMaybeODRUseExprs);
   // Otherwise, merge the contexts together.
   } else {
-    ExprNeedsCleanups |= Rec.ParentNeedsCleanups;
+    Cleanup.mergeFrom(Rec.ParentCleanup);
     MaybeODRUseExprs.insert(Rec.SavedMaybeODRUseExprs.begin(),
                             Rec.SavedMaybeODRUseExprs.end());
   }
@@ -12581,7 +13008,7 @@
   ExprCleanupObjects.erase(
          ExprCleanupObjects.begin() + ExprEvalContexts.back().NumCleanupObjects,
          ExprCleanupObjects.end());
-  ExprNeedsCleanups = false;
+  Cleanup.reset();
   MaybeODRUseExprs.clear();
 }
 
@@ -12606,6 +13033,11 @@
       // definition of a null pointer constant is completely crazy.)
       return false;
 
+    case Sema::DiscardedStatement:
+      // These are technically potentially evaluated, but they have the effect
+      // of suppressing use marking.
+      return false;
+
     case Sema::ConstantEvaluated:
     case Sema::PotentiallyEvaluated:
       // We are in a potentially evaluated expression (or a constant-expression
@@ -12635,39 +13067,53 @@
   //   set of overloaded functions [...].
   //
   // We (incorrectly) mark overload resolution as an unevaluated context, so we
-  // can just check that here. Skip the rest of this function if we've already
-  // marked the function as used.
+  // can just check that here.
   bool OdrUse = MightBeOdrUse && IsPotentiallyEvaluatedContext(*this);
-  if (Func->isUsed(/*CheckUsedAttr=*/false) || !OdrUse) {
-    // C++11 [temp.inst]p3:
-    //   Unless a function template specialization has been explicitly
-    //   instantiated or explicitly specialized, the function template
-    //   specialization is implicitly instantiated when the specialization is
-    //   referenced in a context that requires a function definition to exist.
-    //
-    // We consider constexpr function templates to be referenced in a context
-    // that requires a definition to exist whenever they are referenced.
-    //
-    // FIXME: This instantiates constexpr functions too frequently. If this is
-    // really an unevaluated context (and we're not just in the definition of a
-    // function template or overload resolution or other cases which we
-    // incorrectly consider to be unevaluated contexts), and we're not in a
-    // subexpression which we actually need to evaluate (for instance, a
-    // template argument, array bound or an expression in a braced-init-list),
-    // we are not permitted to instantiate this constexpr function definition.
-    //
-    // FIXME: This also implicitly defines special members too frequently. They
-    // are only supposed to be implicitly defined if they are odr-used, but they
-    // are not odr-used from constant expressions in unevaluated contexts.
-    // However, they cannot be referenced if they are deleted, and they are
-    // deleted whenever the implicit definition of the special member would
-    // fail.
-    if (!Func->isConstexpr() || Func->getBody())
-      return;
-    CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(Func);
-    if (!Func->isImplicitlyInstantiable() && (!MD || MD->isUserProvided()))
-      return;
-  }
+
+  // Determine whether we require a function definition to exist, per
+  // C++11 [temp.inst]p3:
+  //   Unless a function template specialization has been explicitly
+  //   instantiated or explicitly specialized, the function template
+  //   specialization is implicitly instantiated when the specialization is
+  //   referenced in a context that requires a function definition to exist.
+  //
+  // We consider constexpr function templates to be referenced in a context
+  // that requires a definition to exist whenever they are referenced.
+  //
+  // FIXME: This instantiates constexpr functions too frequently. If this is
+  // really an unevaluated context (and we're not just in the definition of a
+  // function template or overload resolution or other cases which we
+  // incorrectly consider to be unevaluated contexts), and we're not in a
+  // subexpression which we actually need to evaluate (for instance, a
+  // template argument, array bound or an expression in a braced-init-list),
+  // we are not permitted to instantiate this constexpr function definition.
+  //
+  // FIXME: This also implicitly defines special members too frequently. They
+  // are only supposed to be implicitly defined if they are odr-used, but they
+  // are not odr-used from constant expressions in unevaluated contexts.
+  // However, they cannot be referenced if they are deleted, and they are
+  // deleted whenever the implicit definition of the special member would
+  // fail (with very few exceptions).
+  CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(Func);
+  bool NeedDefinition =
+      OdrUse || (Func->isConstexpr() && (Func->isImplicitlyInstantiable() ||
+                                         (MD && !MD->isUserProvided())));
+
+  // C++14 [temp.expl.spec]p6:
+  //   If a template [...] is explicitly specialized then that specialization
+  //   shall be declared before the first use of that specialization that would
+  //   cause an implicit instantiation to take place, in every translation unit
+  //   in which such a use occurs
+  if (NeedDefinition &&
+      (Func->getTemplateSpecializationKind() != TSK_Undeclared ||
+       Func->getMemberSpecializationInfo()))
+    checkSpecializationVisibility(Loc, Func);
+
+  // If we don't need to mark the function as used, and we don't need to
+  // try to provide a definition, there's nothing more to do.
+  if ((Func->isUsed(/*CheckUsedAttr=*/false) || !OdrUse) &&
+      (!NeedDefinition || Func->getBody()))
+    return;
 
   // Note that this declaration has been used.
   if (CXXConstructorDecl *Constructor = dyn_cast<CXXConstructorDecl>(Func)) {
@@ -12702,7 +13148,7 @@
       if (MethodDecl->isDefaulted() && !MethodDecl->isDeleted()) {
         if (MethodDecl->isCopyAssignmentOperator())
           DefineImplicitCopyAssignment(Loc, MethodDecl);
-        else
+        else if (MethodDecl->isMoveAssignmentOperator())
           DefineImplicitMoveAssignment(Loc, MethodDecl);
       }
     } else if (isa<CXXConversionDecl>(MethodDecl) &&
@@ -12790,22 +13236,12 @@
       UndefinedButUsed.insert(std::make_pair(Func->getCanonicalDecl(), Loc));
   }
 
-  // Normally the most current decl is marked used while processing the use and
-  // any subsequent decls are marked used by decl merging. This fails with
-  // template instantiation since marking can happen at the end of the file
-  // and, because of the two phase lookup, this function is called with at
-  // decl in the middle of a decl chain. We loop to maintain the invariant
-  // that once a decl is used, all decls after it are also used.
-  for (FunctionDecl *F = Func->getMostRecentDecl();; F = F->getPreviousDecl()) {
-    F->markUsed(Context);
-    if (F == Func)
-      break;
-  }
+  Func->markUsed(Context);
 }
 
 static void
 diagnoseUncapturableValueReference(Sema &S, SourceLocation loc,
-                                   VarDecl *var, DeclContext *DC) {
+                                   ValueDecl *var, DeclContext *DC) {
   DeclContext *VarDC = var->getDeclContext();
 
   //  If the parameter still belongs to the translation unit, then
@@ -12825,25 +13261,21 @@
   if (!S.getLangOpts().CPlusPlus && !S.CurContext->isFunctionOrMethod())
     return;
 
+  unsigned ValueKind = isa<BindingDecl>(var) ? 1 : 0;
+  unsigned ContextKind = 3; // unknown
   if (isa<CXXMethodDecl>(VarDC) &&
       cast<CXXRecordDecl>(VarDC->getParent())->isLambda()) {
-    S.Diag(loc, diag::err_reference_to_local_var_in_enclosing_lambda)
-      << var->getIdentifier();
-  } else if (FunctionDecl *fn = dyn_cast<FunctionDecl>(VarDC)) {
-    S.Diag(loc, diag::err_reference_to_local_var_in_enclosing_function)
-      << var->getIdentifier() << fn->getDeclName();
+    ContextKind = 2;
+  } else if (isa<FunctionDecl>(VarDC)) {
+    ContextKind = 0;
   } else if (isa<BlockDecl>(VarDC)) {
-    S.Diag(loc, diag::err_reference_to_local_var_in_enclosing_block)
-      << var->getIdentifier();
-  } else {
-    // FIXME: Is there any other context where a local variable can be
-    // declared?
-    S.Diag(loc, diag::err_reference_to_local_var_in_enclosing_context)
-      << var->getIdentifier();
+    ContextKind = 1;
   }
 
+  S.Diag(loc, diag::err_reference_to_local_in_enclosing_context)
+    << var << ValueKind << ContextKind << VarDC;
   S.Diag(var->getLocation(), diag::note_entity_declared_at)
-      << var->getIdentifier();
+      << var;
 
   // FIXME: Add additional diagnostic info about class etc. which prevents
   // capture.
@@ -12988,7 +13420,8 @@
     return false;
   }
   const bool HasBlocksAttr = Var->hasAttr<BlocksAttr>();
-  if (HasBlocksAttr || CaptureType->isReferenceType()) {
+  if (HasBlocksAttr || CaptureType->isReferenceType() ||
+      (S.getLangOpts().OpenMP && S.IsOpenMPCapturedDecl(Var))) {
     // Block capture by reference does not change the capture or
     // declaration reference types.
     ByRef = true;
@@ -13056,14 +13489,13 @@
                                     QualType &DeclRefType, 
                                     const bool RefersToCapturedVariable,
                                     Sema &S) {
-  
   // By default, capture variables by reference.
   bool ByRef = true;
   // Using an LValue reference type is consistent with Lambdas (see below).
-  if (S.getLangOpts().OpenMP) {
-    ByRef = S.IsOpenMPCapturedByRef(Var, RSI);
-    if (S.IsOpenMPCapturedVar(Var))
+  if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) {
+    if (S.IsOpenMPCapturedDecl(Var))
       DeclRefType = DeclRefType.getUnqualifiedType();
+    ByRef = S.IsOpenMPCapturedByRef(Var, RSI->OpenMPLevel);
   }
 
   if (ByRef)
@@ -13103,7 +13535,7 @@
 
 /// \brief Create a field within the lambda class for the variable
 /// being captured.
-static void addAsFieldToClosureType(Sema &S, LambdaScopeInfo *LSI, VarDecl *Var,
+static void addAsFieldToClosureType(Sema &S, LambdaScopeInfo *LSI, 
                                     QualType FieldType, QualType DeclRefType,
                                     SourceLocation Loc,
                                     bool RefersToCapturedVariable) {
@@ -13197,7 +13629,7 @@
 
   // Capture this variable in the lambda.
   if (BuildAndDiagnose)
-    addAsFieldToClosureType(S, LSI, Var, CaptureType, DeclRefType, Loc,
+    addAsFieldToClosureType(S, LSI, CaptureType, DeclRefType, Loc,
                             RefersToCapturedVariable);
     
   // Compute the type of a reference to this captured variable.
@@ -13253,7 +13685,7 @@
   // Capture global variables if it is required to use private copy of this
   // variable.
   bool IsGlobal = !Var->hasLocalStorage();
-  if (IsGlobal && !(LangOpts.OpenMP && IsOpenMPCapturedVar(Var)))
+  if (IsGlobal && !(LangOpts.OpenMP && IsOpenMPCapturedDecl(Var)))
     return true;
 
   // Walk up the stack to determine whether we can capture the variable,
@@ -13269,7 +13701,6 @@
   bool Nested = false;
   bool Explicit = (Kind != TryCapture_Implicit);
   unsigned FunctionScopesIndex = MaxFunctionScopesIndex;
-  unsigned OpenMPLevel = 0;
   do {
     // Only block literals, captured statements, and lambda expressions can
     // capture; other scopes don't work.
@@ -13335,20 +13766,19 @@
         // just break here. Similarly, global variables that are captured in a
         // target region should not be captured outside the scope of the region.
         if (RSI->CapRegionKind == CR_OpenMP) {
-          auto isTargetCap = isOpenMPTargetCapturedVar(Var, OpenMPLevel);
+          auto IsTargetCap = isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel);
           // When we detect target captures we are looking from inside the
           // target region, therefore we need to propagate the capture from the
           // enclosing region. Therefore, the capture is not initially nested.
-          if (isTargetCap)
+          if (IsTargetCap)
             FunctionScopesIndex--;
 
-          if (isTargetCap || isOpenMPPrivateVar(Var, OpenMPLevel)) {
-            Nested = !isTargetCap;
+          if (IsTargetCap || isOpenMPPrivateDecl(Var, RSI->OpenMPLevel)) {
+            Nested = !IsTargetCap;
             DeclRefType = DeclRefType.getUnqualifiedType();
             CaptureType = Context.getLValueReferenceType(DeclRefType);
             break;
           }
-          ++OpenMPLevel;
         }
       }
     }
@@ -13583,6 +14013,12 @@
   assert(!isa<VarTemplatePartialSpecializationDecl>(Var) &&
          "Can't instantiate a partial template specialization.");
 
+  // If this might be a member specialization of a static data member, check
+  // the specialization is visible. We already did the checks for variable
+  // template specializations when we created them.
+  if (TSK != TSK_Undeclared && !isa<VarTemplateSpecializationDecl>(Var))
+    SemaRef.checkSpecializationVisibility(Loc, Var);
+
   // Perform implicit instantiation of static data members, static data member
   // templates of class templates, and variable template specializations. Delay
   // instantiations of variable templates, except for those that could be used
@@ -13626,7 +14062,8 @@
     }
   }
 
-  if(!MarkODRUsed) return;
+  if (!MarkODRUsed)
+    return;
 
   // Per C++11 [basic.def.odr], a variable is odr-used "unless it satisfies
   // the requirements for appearing in a constant expression (5.19) and, if
@@ -13655,6 +14092,9 @@
 
 static void MarkExprReferenced(Sema &SemaRef, SourceLocation Loc,
                                Decl *D, Expr *E, bool MightBeOdrUse) {
+  if (SemaRef.isInOpenMPDeclareTargetContext())
+    SemaRef.checkDeclIsAllowedInOpenMPTarget(E, D);
+
   if (VarDecl *Var = dyn_cast<VarDecl>(D)) {
     DoMarkVarDeclReferenced(SemaRef, Loc, Var, E);
     return;
@@ -13883,6 +14323,7 @@
   switch (ExprEvalContexts.back().Context) {
   case Unevaluated:
   case UnevaluatedAbstract:
+  case DiscardedStatement:
     // The argument will never be evaluated, so don't complain.
     break;
 
@@ -14032,7 +14473,8 @@
     }
 }
 
-ExprResult Sema::CheckBooleanCondition(Expr *E, SourceLocation Loc) {
+ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
+                                       bool IsConstexpr) {
   DiagnoseAssignmentAsCondition(E);
   if (ParenExpr *parenE = dyn_cast<ParenExpr>(E))
     DiagnoseEqualityWithExtraParens(parenE);
@@ -14043,7 +14485,7 @@
 
   if (!E->isTypeDependent()) {
     if (getLangOpts().CPlusPlus)
-      return CheckCXXBooleanCondition(E); // C++ 6.4p4
+      return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4
 
     ExprResult ERes = DefaultFunctionArrayLvalueConversion(E);
     if (ERes.isInvalid())
@@ -14062,12 +14504,36 @@
   return E;
 }
 
-ExprResult Sema::ActOnBooleanCondition(Scope *S, SourceLocation Loc,
-                                       Expr *SubExpr) {
+Sema::ConditionResult Sema::ActOnCondition(Scope *S, SourceLocation Loc,
+                                           Expr *SubExpr, ConditionKind CK) {
+  // Empty conditions are valid in for-statements.
   if (!SubExpr)
-    return ExprError();
+    return ConditionResult();
 
-  return CheckBooleanCondition(SubExpr, Loc);
+  ExprResult Cond;
+  switch (CK) {
+  case ConditionKind::Boolean:
+    Cond = CheckBooleanCondition(Loc, SubExpr);
+    break;
+
+  case ConditionKind::ConstexprIf:
+    Cond = CheckBooleanCondition(Loc, SubExpr, true);
+    break;
+
+  case ConditionKind::Switch:
+    Cond = CheckSwitchCondition(Loc, SubExpr);
+    break;
+  }
+  if (Cond.isInvalid())
+    return ConditionError();
+
+  // FIXME: FullExprArg doesn't have an invalid bit, so check nullness instead.
+  FullExprArg FullExpr = MakeFullExpr(Cond.get(), Loc);
+  if (!FullExpr.get())
+    return ConditionError();
+
+  return ConditionResult(*this, nullptr, FullExpr,
+                         CK == ConditionKind::ConstexprIf);
 }
 
 namespace {
@@ -14610,16 +15076,20 @@
   case BuiltinType::Overload: {
     // Try to resolve a single function template specialization.
     // This is obligatory.
-    ExprResult result = E;
-    if (ResolveAndFixSingleFunctionTemplateSpecialization(result, false)) {
-      return result;
+    ExprResult Result = E;
+    if (ResolveAndFixSingleFunctionTemplateSpecialization(Result, false))
+      return Result;
+
+    // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization
+    // leaves Result unchanged on failure.
+    Result = E;
+    if (resolveAndFixAddressOfOnlyViableOverloadCandidate(Result))
+      return Result;
 
     // If that failed, try to recover with a call.
-    } else {
-      tryToRecoverWithCall(result, PDiag(diag::err_ovl_unresolvable),
-                           /*complain*/ true);
-      return result;
-    }
+    tryToRecoverWithCall(Result, PDiag(diag::err_ovl_unresolvable),
+                         /*complain*/ true);
+    return Result;
   }
 
   // Bound member functions.
@@ -14678,8 +15148,10 @@
     return ExprError();
 
   // Everything else should be impossible.
-#define BUILTIN_TYPE(Id, SingletonId) \
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
   case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
+#define BUILTIN_TYPE(Id, SingletonId) case BuiltinType::Id:
 #define PLACEHOLDER_TYPE(Id, SingletonId)
 #include "clang/AST/BuiltinTypes.def"
     break;
@@ -14716,3 +15188,22 @@
   return new (Context)
       ObjCBoolLiteralExpr(Kind == tok::kw___objc_yes, BoolT, OpLoc);
 }
+
+ExprResult Sema::ActOnObjCAvailabilityCheckExpr(
+    llvm::ArrayRef<AvailabilitySpec> AvailSpecs, SourceLocation AtLoc,
+    SourceLocation RParen) {
+
+  StringRef Platform = getASTContext().getTargetInfo().getPlatformName();
+
+  auto Spec = std::find_if(AvailSpecs.begin(), AvailSpecs.end(),
+                           [&](const AvailabilitySpec &Spec) {
+                             return Spec.getPlatform() == Platform;
+                           });
+
+  VersionTuple Version;
+  if (Spec != AvailSpecs.end())
+    Version = Spec->getVersion();
+
+  return new (Context)
+      ObjCAvailabilityCheckExpr(Version, AtLoc, RParen, Context.BoolTy);
+}
diff --git a/lib/Sema/SemaExprCXX.cpp b/lib/Sema/SemaExprCXX.cpp
index f578d2c..b7a968e 100644
--- a/lib/Sema/SemaExprCXX.cpp
+++ b/lib/Sema/SemaExprCXX.cpp
@@ -508,23 +508,60 @@
   return BuildCXXTypeId(TypeInfoType, OpLoc, (Expr*)TyOrExpr, RParenLoc);
 }
 
+/// Collects the __declspec(uuid()) attributes reachable from a type into
+/// UuidAttrs; an empty or multi-entry result means no single GUID was found.
+static void
+getUuidAttrOfType(Sema &SemaRef, QualType QT,
+                  llvm::SmallSetVector<const UuidAttr *, 1> &UuidAttrs) {
+  // Optionally remove one level of pointer, reference or array indirection.
+  const Type *Ty = QT.getTypePtr();
+  if (QT->isPointerType() || QT->isReferenceType())
+    Ty = QT->getPointeeType().getTypePtr();
+  else if (QT->isArrayType())
+    Ty = Ty->getBaseElementTypeUnsafe();
+
+  const auto *RD = Ty->getAsCXXRecordDecl();
+  if (!RD)
+    return;
+
+  if (const auto *Uuid = RD->getMostRecentDecl()->getAttr<UuidAttr>()) {
+    UuidAttrs.insert(Uuid);
+    return;
+  }
+
+  // __uuidof can grab UUIDs from template arguments.
+  if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(RD)) {
+    const TemplateArgumentList &TAL = CTSD->getTemplateArgs();
+    for (const TemplateArgument &TA : TAL.asArray()) {
+      const UuidAttr *UuidForTA = nullptr; // NOTE(review): never reassigned
+      if (TA.getKind() == TemplateArgument::Type)
+        getUuidAttrOfType(SemaRef, TA.getAsType(), UuidAttrs);
+      else if (TA.getKind() == TemplateArgument::Declaration)
+        getUuidAttrOfType(SemaRef, TA.getAsDecl()->getType(), UuidAttrs);
+
+      if (UuidForTA)
+        UuidAttrs.insert(UuidForTA);
+    }
+  }
+}
+
 /// \brief Build a Microsoft __uuidof expression with a type operand.
 ExprResult Sema::BuildCXXUuidof(QualType TypeInfoType,
                                 SourceLocation TypeidLoc,
                                 TypeSourceInfo *Operand,
                                 SourceLocation RParenLoc) {
+  StringRef UuidStr;
   if (!Operand->getType()->isDependentType()) {
-    bool HasMultipleGUIDs = false;
-    if (!CXXUuidofExpr::GetUuidAttrOfType(Operand->getType(),
-                                          &HasMultipleGUIDs)) {
-      if (HasMultipleGUIDs)
-        return ExprError(Diag(TypeidLoc, diag::err_uuidof_with_multiple_guids));
-      else
-        return ExprError(Diag(TypeidLoc, diag::err_uuidof_without_guid));
-    }
+    llvm::SmallSetVector<const UuidAttr *, 1> UuidAttrs;
+    getUuidAttrOfType(*this, Operand->getType(), UuidAttrs);
+    if (UuidAttrs.empty())
+      return ExprError(Diag(TypeidLoc, diag::err_uuidof_without_guid));
+    if (UuidAttrs.size() > 1)
+      return ExprError(Diag(TypeidLoc, diag::err_uuidof_with_multiple_guids));
+    UuidStr = UuidAttrs.back()->getGuid();
   }
 
-  return new (Context) CXXUuidofExpr(TypeInfoType.withConst(), Operand,
+  return new (Context) CXXUuidofExpr(TypeInfoType.withConst(), Operand, UuidStr,
                                      SourceRange(TypeidLoc, RParenLoc));
 }
 
@@ -533,18 +570,22 @@
                                 SourceLocation TypeidLoc,
                                 Expr *E,
                                 SourceLocation RParenLoc) {
+  StringRef UuidStr;
   if (!E->getType()->isDependentType()) {
-    bool HasMultipleGUIDs = false;
-    if (!CXXUuidofExpr::GetUuidAttrOfType(E->getType(), &HasMultipleGUIDs) &&
-        !E->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull)) {
-      if (HasMultipleGUIDs)
-        return ExprError(Diag(TypeidLoc, diag::err_uuidof_with_multiple_guids));
-      else
+    if (E->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull)) {
+      UuidStr = "00000000-0000-0000-0000-000000000000";
+    } else {
+      llvm::SmallSetVector<const UuidAttr *, 1> UuidAttrs;
+      getUuidAttrOfType(*this, E->getType(), UuidAttrs);
+      if (UuidAttrs.empty())
         return ExprError(Diag(TypeidLoc, diag::err_uuidof_without_guid));
+      if (UuidAttrs.size() > 1)
+        return ExprError(Diag(TypeidLoc, diag::err_uuidof_with_multiple_guids));
+      UuidStr = UuidAttrs.back()->getGuid();
     }
   }
 
-  return new (Context) CXXUuidofExpr(TypeInfoType.withConst(), E,
+  return new (Context) CXXUuidofExpr(TypeInfoType.withConst(), E, UuidStr,
                                      SourceRange(TypeidLoc, RParenLoc));
 }
 
@@ -831,27 +872,123 @@
   return false;
 }
 
+static QualType adjustCVQualifiersForCXXThisWithinLambda(
+    ArrayRef<FunctionScopeInfo *> FunctionScopes, QualType ThisTy,
+    DeclContext *CurSemaContext, ASTContext &ASTCtx) {
+
+  QualType ClassType = ThisTy->getPointeeType();
+  LambdaScopeInfo *CurLSI = nullptr;
+  DeclContext *CurDC = CurSemaContext;
+
+  // Iterate through the stack of lambdas starting from the innermost lambda to
+  // the outermost lambda, checking if '*this' is ever captured by copy - since
+  // that could change the cv-qualifiers of the '*this' object.
+  // The object referred to by '*this' starts out with the cv-qualifiers of its
+  // member function.  We then start with the innermost lambda and iterate
+  // outward checking to see if any lambda performs a by-copy capture of '*this'
+  // - and if so, any nested lambda must respect the 'constness' of that
+  // capturing lambda's call operator.
+  //
+
+  // The issue is that we cannot rely entirely on the FunctionScopeInfo stack
+  // since ScopeInfos are pushed on during parsing and treetransforming. But
+  // since a generic lambda's call operator can be instantiated anywhere (even
+  // end of the TU) we need to be able to examine its enclosing lambdas and so
+  // we use the DeclContext to get a hold of the closure-class and query it for
+  // capture information.  The reason we don't just resort to always using the
+  // DeclContext chain is that it is only mature for lambda expressions
+  // enclosing generic lambda's call operators that are being instantiated.
+
+  for (int I = FunctionScopes.size();
+       I-- && isa<LambdaScopeInfo>(FunctionScopes[I]);
+       CurDC = getLambdaAwareParentOfDeclContext(CurDC)) {
+    CurLSI = cast<LambdaScopeInfo>(FunctionScopes[I]);
+    
+    if (!CurLSI->isCXXThisCaptured()) 
+        continue;
+      
+    auto C = CurLSI->getCXXThisCapture();
+
+    if (C.isCopyCapture()) {
+      ClassType.removeLocalCVRQualifiers(Qualifiers::CVRMask);
+      if (CurLSI->CallOperator->isConst())
+        ClassType.addConst();
+      return ASTCtx.getPointerType(ClassType);
+    }
+  }
+  // We've run out of ScopeInfos but check if CurDC is a lambda (which can
+  // happen during instantiation of generic lambdas)
+  if (isLambdaCallOperator(CurDC)) {
+    assert(CurLSI);
+    assert(isGenericLambdaCallOperatorSpecialization(CurLSI->CallOperator));
+    assert(CurDC == getLambdaAwareParentOfDeclContext(CurLSI->CallOperator));
+    
+    auto IsThisCaptured =
+        [](CXXRecordDecl *Closure, bool &IsByCopy, bool &IsConst) {
+      IsConst = false;
+      IsByCopy = false;
+      for (auto &&C : Closure->captures()) {
+        if (C.capturesThis()) {
+          if (C.getCaptureKind() == LCK_StarThis)
+            IsByCopy = true;
+          if (Closure->getLambdaCallOperator()->isConst())
+            IsConst = true;
+          return true;
+        }
+      }
+      return false;
+    };
+
+    bool IsByCopyCapture = false;
+    bool IsConstCapture = false;
+    CXXRecordDecl *Closure = cast<CXXRecordDecl>(CurDC->getParent());
+    while (Closure &&
+           IsThisCaptured(Closure, IsByCopyCapture, IsConstCapture)) {
+      if (IsByCopyCapture) {
+        ClassType.removeLocalCVRQualifiers(Qualifiers::CVRMask);
+        if (IsConstCapture)
+          ClassType.addConst();
+        return ASTCtx.getPointerType(ClassType);
+      }
+      Closure = isLambdaCallOperator(Closure->getParent())
+                    ? cast<CXXRecordDecl>(Closure->getParent()->getParent())
+                    : nullptr;
+    }
+  }
+  return ASTCtx.getPointerType(ClassType);
+}
+
 QualType Sema::getCurrentThisType() {
   DeclContext *DC = getFunctionLevelDeclContext();
   QualType ThisTy = CXXThisTypeOverride;
+
   if (CXXMethodDecl *method = dyn_cast<CXXMethodDecl>(DC)) {
     if (method && method->isInstance())
       ThisTy = method->getThisType(Context);
   }
-  if (ThisTy.isNull()) {
-    if (isGenericLambdaCallOperatorSpecialization(CurContext) &&
-        CurContext->getParent()->getParent()->isRecord()) {
-      // This is a generic lambda call operator that is being instantiated
-      // within a default initializer - so use the enclosing class as 'this'.
-      // There is no enclosing member function to retrieve the 'this' pointer
-      // from.
-      QualType ClassTy = Context.getTypeDeclType(
-          cast<CXXRecordDecl>(CurContext->getParent()->getParent()));
-      // There are no cv-qualifiers for 'this' within default initializers, 
-      // per [expr.prim.general]p4.
-      return Context.getPointerType(ClassTy);
-    }
+
+  if (ThisTy.isNull() && isLambdaCallOperator(CurContext) &&
+      !ActiveTemplateInstantiations.empty()) {
+
+    assert(isa<CXXRecordDecl>(DC) &&
+           "Trying to get 'this' type from static method?");
+
+    // This is a lambda call operator that is being instantiated as a default
+    // initializer. DC must point to the enclosing class type, so we can recover
+    // the 'this' type from it.
+
+    QualType ClassTy = Context.getTypeDeclType(cast<CXXRecordDecl>(DC));
+    // There are no cv-qualifiers for 'this' within default initializers,
+    // per [expr.prim.general]p4.
+    ThisTy = Context.getPointerType(ClassTy);
   }
+
+  // If we are within a lambda's call operator, the cv-qualifiers of 'this'
+  // might need to be adjusted if the lambda or any of its enclosing lambdas
+  // captures '*this' by copy.
+  if (!ThisTy.isNull() && isLambdaCallOperator(CurContext))
+    return adjustCVQualifiersForCXXThisWithinLambda(FunctionScopes, ThisTy,
+                                                    CurContext, Context);
   return ThisTy;
 }
 
@@ -870,6 +1007,8 @@
   else
     Record = cast<CXXRecordDecl>(ContextDecl);
     
+  // We care only for CVR qualifiers here, so cut everything else.
+  CXXThisTypeQuals &= Qualifiers::FastMask;
   S.CXXThisTypeOverride
     = S.Context.getPointerType(
         S.Context.getRecordType(Record).withCVRQualifiers(CXXThisTypeQuals));
@@ -884,28 +1023,84 @@
   }
 }
 
-static Expr *captureThis(ASTContext &Context, RecordDecl *RD,
-                         QualType ThisTy, SourceLocation Loc) {
-  FieldDecl *Field
-    = FieldDecl::Create(Context, RD, Loc, Loc, nullptr, ThisTy,
-                        Context.getTrivialTypeSourceInfo(ThisTy, Loc),
-                        nullptr, false, ICIS_NoInit);
+static Expr *captureThis(Sema &S, ASTContext &Context, RecordDecl *RD,
+                         QualType ThisTy, SourceLocation Loc,
+                         const bool ByCopy) {
+ 
+  QualType AdjustedThisTy = ThisTy;
+  // The type of the corresponding data member (not a 'this' pointer if 'by
+  // copy').
+  QualType CaptureThisFieldTy = ThisTy;
+  if (ByCopy) {
+    // If we are capturing the object referred to by '*this' by copy, ignore any
+    // cv qualifiers inherited from the type of the member function for the type
+    // of the closure-type's corresponding data member and any use of 'this'.
+    CaptureThisFieldTy = ThisTy->getPointeeType();
+    CaptureThisFieldTy.removeLocalCVRQualifiers(Qualifiers::CVRMask);
+    AdjustedThisTy = Context.getPointerType(CaptureThisFieldTy);
+  }
+  
+  FieldDecl *Field = FieldDecl::Create(
+      Context, RD, Loc, Loc, nullptr, CaptureThisFieldTy,
+      Context.getTrivialTypeSourceInfo(CaptureThisFieldTy, Loc), nullptr, false,
+      ICIS_NoInit);
+
   Field->setImplicit(true);
   Field->setAccess(AS_private);
   RD->addDecl(Field);
-  return new (Context) CXXThisExpr(Loc, ThisTy, /*isImplicit*/true);
+  Expr *This =
+      new (Context) CXXThisExpr(Loc, ThisTy, /*isImplicit*/ true);
+  if (ByCopy) {
+    Expr *StarThis =  S.CreateBuiltinUnaryOp(Loc,
+                                      UO_Deref,
+                                      This).get();
+    InitializedEntity Entity = InitializedEntity::InitializeLambdaCapture(
+      nullptr, CaptureThisFieldTy, Loc);
+    InitializationKind InitKind = InitializationKind::CreateDirect(Loc, Loc, Loc);
+    InitializationSequence Init(S, Entity, InitKind, StarThis);
+    ExprResult ER = Init.Perform(S, Entity, InitKind, StarThis);
+    if (ER.isInvalid()) return nullptr;
+    return ER.get();
+  }
+  return This;
 }
 
-bool Sema::CheckCXXThisCapture(SourceLocation Loc, bool Explicit, 
-    bool BuildAndDiagnose, const unsigned *const FunctionScopeIndexToStopAt) {
+bool Sema::CheckCXXThisCapture(SourceLocation Loc, const bool Explicit, 
+    bool BuildAndDiagnose, const unsigned *const FunctionScopeIndexToStopAt,
+    const bool ByCopy) {
   // We don't need to capture this in an unevaluated context.
   if (isUnevaluatedContext() && !Explicit)
     return true;
+  
+  assert((!ByCopy || Explicit) && "cannot implicitly capture *this by value");
 
   const unsigned MaxFunctionScopesIndex = FunctionScopeIndexToStopAt ?
-    *FunctionScopeIndexToStopAt : FunctionScopes.size() - 1;  
- // Otherwise, check that we can capture 'this'.
-  unsigned NumClosures = 0;
+    *FunctionScopeIndexToStopAt : FunctionScopes.size() - 1;
+  
+  // Check that we can capture the *enclosing object* (referred to by '*this')
+  // by the capturing-entity/closure (lambda/block/etc) at 
+  // MaxFunctionScopesIndex-deep on the FunctionScopes stack.  
+
+  // Note: The *enclosing object* can only be captured by-value by a 
+  // closure that is a lambda, using the explicit notation: 
+  //    [*this] { ... }.
+  // Every other capture of the *enclosing object* results in its by-reference
+  // capture.
+
+  // For a closure 'L' (at MaxFunctionScopesIndex in the FunctionScopes
+  // stack), we can capture the *enclosing object* only if:
+  // - 'L' has an explicit byref or byval capture of the *enclosing object*
+  // -  or, 'L' has an implicit capture.
+  // AND 
+  //   -- there is no enclosing closure
+  //   -- or, there is some enclosing closure 'E' that has already captured the 
+  //      *enclosing object*, and every intervening closure (if any) between 'E' 
+  //      and 'L' can implicitly capture the *enclosing object*.
+  //   -- or, every enclosing closure can implicitly capture the 
+  //      *enclosing object*
+  
+  
+  unsigned NumCapturingClosures = 0;
   for (unsigned idx = MaxFunctionScopesIndex; idx != 0; idx--) {
     if (CapturingScopeInfo *CSI =
             dyn_cast<CapturingScopeInfo>(FunctionScopes[idx])) {
@@ -917,44 +1112,69 @@
       if (LSI && isGenericLambdaCallOperatorSpecialization(LSI->CallOperator)) {
         // This context can't implicitly capture 'this'; fail out.
         if (BuildAndDiagnose)
-          Diag(Loc, diag::err_this_capture) << Explicit;
+          Diag(Loc, diag::err_this_capture)
+              << (Explicit && idx == MaxFunctionScopesIndex);
         return true;
       }
       if (CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_LambdaByref ||
           CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_LambdaByval ||
           CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_Block ||
           CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_CapturedRegion ||
-          Explicit) {
+          (Explicit && idx == MaxFunctionScopesIndex)) {
+        // Regarding (Explicit && idx == MaxFunctionScopesIndex): only the first
+        // iteration through can be an explicit capture; all enclosing closures,
+        // if any, must perform implicit captures.
+
         // This closure can capture 'this'; continue looking upwards.
-        NumClosures++;
-        Explicit = false;
+        NumCapturingClosures++;
         continue;
       }
       // This context can't implicitly capture 'this'; fail out.
       if (BuildAndDiagnose)
-        Diag(Loc, diag::err_this_capture) << Explicit;
+        Diag(Loc, diag::err_this_capture)
+            << (Explicit && idx == MaxFunctionScopesIndex);
       return true;
     }
     break;
   }
   if (!BuildAndDiagnose) return false;
-  // Mark that we're implicitly capturing 'this' in all the scopes we skipped.
+
+  // If we got here, then the closure at MaxFunctionScopesIndex on the
+  // FunctionScopes stack can capture the *enclosing object*, so capture it
+  // (including implicit by-reference captures in any enclosing closures).
+
+  // In the loop below, respect the ByCopy flag only for the closure requesting
+  // the capture (i.e. first iteration through the loop below).  Ignore it for
+  // all enclosing closures up to NumCapturingClosures (since they must be
+  // implicitly capturing the *enclosing object* by reference (see loop
+  // above)).
+  assert((!ByCopy ||
+          dyn_cast<LambdaScopeInfo>(FunctionScopes[MaxFunctionScopesIndex])) &&
+         "Only a lambda can capture the enclosing object (referred to by "
+         "*this) by copy");
   // FIXME: We need to delay this marking in PotentiallyPotentiallyEvaluated
   // contexts.
-  for (unsigned idx = MaxFunctionScopesIndex; NumClosures; 
-      --idx, --NumClosures) {
+  QualType ThisTy = getCurrentThisType();
+  for (unsigned idx = MaxFunctionScopesIndex; NumCapturingClosures; 
+      --idx, --NumCapturingClosures) {
     CapturingScopeInfo *CSI = cast<CapturingScopeInfo>(FunctionScopes[idx]);
     Expr *ThisExpr = nullptr;
-    QualType ThisTy = getCurrentThisType();
-    if (LambdaScopeInfo *LSI = dyn_cast<LambdaScopeInfo>(CSI))
-      // For lambda expressions, build a field and an initializing expression.
-      ThisExpr = captureThis(Context, LSI->Lambda, ThisTy, Loc);
-    else if (CapturedRegionScopeInfo *RSI
+    
+    if (LambdaScopeInfo *LSI = dyn_cast<LambdaScopeInfo>(CSI)) {
+      // For lambda expressions, build a field and an initializing expression,
+      // and capture the *enclosing object* by copy only if this is the first
+      // iteration.
+      ThisExpr = captureThis(*this, Context, LSI->Lambda, ThisTy, Loc,
+                             ByCopy && idx == MaxFunctionScopesIndex);
+      
+    } else if (CapturedRegionScopeInfo *RSI
         = dyn_cast<CapturedRegionScopeInfo>(FunctionScopes[idx]))
-      ThisExpr = captureThis(Context, RSI->TheRecordDecl, ThisTy, Loc);
+      ThisExpr =
+          captureThis(*this, Context, RSI->TheRecordDecl, ThisTy, Loc,
+                      false/*ByCopy*/);
 
-    bool isNested = NumClosures > 1;
-    CSI->addThisCapture(isNested, Loc, ThisTy, ThisExpr);
+    bool isNested = NumCapturingClosures > 1;
+    CSI->addThisCapture(isNested, Loc, ThisExpr, ByCopy);
   }
   return false;
 }
@@ -996,7 +1216,14 @@
   if (!TInfo)
     TInfo = Context.getTrivialTypeSourceInfo(Ty, SourceLocation());
 
-  return BuildCXXTypeConstructExpr(TInfo, LParenLoc, exprs, RParenLoc);
+  auto Result = BuildCXXTypeConstructExpr(TInfo, LParenLoc, exprs, RParenLoc);
+  // Avoid creating a non-type-dependent expression that contains typos.
+  // Non-type-dependent expressions are liable to be discarded without
+  // checking for embedded typos.
+  if (!Result.isInvalid() && Result.get()->isInstantiationDependent() &&
+      !Result.get()->isTypeDependent())
+    Result = CorrectDelayedTyposInExpr(Result.get());
+  return Result;
 }
 
 /// ActOnCXXTypeConstructExpr - Parse construction of a specified type.
@@ -2114,14 +2341,13 @@
 
   QualType VoidPtr = Context.getPointerType(Context.VoidTy);
   QualType SizeT = Context.getSizeType();
-  bool AssumeSaneOperatorNew = getLangOpts().AssumeSaneOperatorNew;
 
   DeclareGlobalAllocationFunction(
       Context.DeclarationNames.getCXXOperatorName(OO_New),
-      VoidPtr, SizeT, QualType(), AssumeSaneOperatorNew);
+      VoidPtr, SizeT, QualType());
   DeclareGlobalAllocationFunction(
       Context.DeclarationNames.getCXXOperatorName(OO_Array_New),
-      VoidPtr, SizeT, QualType(), AssumeSaneOperatorNew);
+      VoidPtr, SizeT, QualType());
   DeclareGlobalAllocationFunction(
       Context.DeclarationNames.getCXXOperatorName(OO_Delete),
       Context.VoidTy, VoidPtr);
@@ -2142,8 +2368,7 @@
 /// allocation function if it doesn't already exist.
 void Sema::DeclareGlobalAllocationFunction(DeclarationName Name,
                                            QualType Return,
-                                           QualType Param1, QualType Param2,
-                                           bool AddRestrictAttr) {
+                                           QualType Param1, QualType Param2) {
   DeclContext *GlobalCtx = Context.getTranslationUnitDecl();
   unsigned NumParams = Param2.isNull() ? 1 : 2;
 
@@ -2166,9 +2391,6 @@
         // FIXME: Do we need to check for default arguments here?
         if (InitialParam1Type == Param1 &&
             (NumParams == 1 || InitialParam2Type == Param2)) {
-          if (AddRestrictAttr && !Func->hasAttr<RestrictAttr>())
-            Func->addAttr(RestrictAttr::CreateImplicit(
-                Context, RestrictAttr::GNU_malloc));
           // Make the function visible to name lookup, even if we found it in
           // an unimported module. It either is an implicitly-declared global
           // allocation function, or is suppressing that function.
@@ -2211,10 +2433,6 @@
   Alloc->addAttr(VisibilityAttr::CreateImplicit(Context,
                                                 VisibilityAttr::Default));
 
-  if (AddRestrictAttr)
-    Alloc->addAttr(
-        RestrictAttr::CreateImplicit(Context, RestrictAttr::GNU_malloc));
-
   ParmVarDecl *ParamDecls[2];
   for (unsigned I = 0; I != NumParams; ++I) {
     ParamDecls[I] = ParmVarDecl::Create(Context, Alloc, SourceLocation(),
@@ -2266,7 +2484,7 @@
            "found an unexpected usual deallocation function");
   }
 
-  if (getLangOpts().CUDA && getLangOpts().CUDATargetOverloads)
+  if (getLangOpts().CUDA)
     EraseUnwantedCUDAMatches(dyn_cast<FunctionDecl>(CurContext), Matches);
 
   assert(Matches.size() == 1 &&
@@ -2300,7 +2518,7 @@
       Matches.push_back(F.getPair());
   }
 
-  if (getLangOpts().CUDA && getLangOpts().CUDATargetOverloads)
+  if (getLangOpts().CUDA)
     EraseUnwantedCUDAMatches(dyn_cast<FunctionDecl>(CurContext), Matches);
 
   // There's exactly one suitable operator;  pick it.
@@ -2837,11 +3055,22 @@
   }
 }
 
+Sema::ConditionResult Sema::ActOnConditionVariable(Decl *ConditionVar,
+                                                   SourceLocation StmtLoc,
+                                                   ConditionKind CK) {
+  ExprResult E =
+      CheckConditionVariable(cast<VarDecl>(ConditionVar), StmtLoc, CK);
+  if (E.isInvalid())
+    return ConditionError();
+  return ConditionResult(*this, ConditionVar, MakeFullExpr(E.get(), StmtLoc),
+                         CK == ConditionKind::ConstexprIf);
+}
+
 /// \brief Check the use of the given variable as a C++ condition in an if,
 /// while, do-while, or switch statement.
 ExprResult Sema::CheckConditionVariable(VarDecl *ConditionVar,
                                         SourceLocation StmtLoc,
-                                        bool ConvertToBoolean) {
+                                        ConditionKind CK) {
   if (ConditionVar->isInvalidDecl())
     return ExprError();
 
@@ -2865,17 +3094,22 @@
 
   MarkDeclRefReferenced(cast<DeclRefExpr>(Condition.get()));
 
-  if (ConvertToBoolean) {
-    Condition = CheckBooleanCondition(Condition.get(), StmtLoc);
-    if (Condition.isInvalid())
-      return ExprError();
+  switch (CK) {
+  case ConditionKind::Boolean:
+    return CheckBooleanCondition(StmtLoc, Condition.get());
+
+  case ConditionKind::ConstexprIf:
+    return CheckBooleanCondition(StmtLoc, Condition.get(), true);
+
+  case ConditionKind::Switch:
+    return CheckSwitchCondition(StmtLoc, Condition.get());
   }
 
-  return Condition;
+  llvm_unreachable("unexpected condition kind");
 }
 
 /// CheckCXXBooleanCondition - Returns true if a conversion to bool is invalid.
-ExprResult Sema::CheckCXXBooleanCondition(Expr *CondExpr) {
+ExprResult Sema::CheckCXXBooleanCondition(Expr *CondExpr, bool IsConstexpr) {
   // C++ 6.4p4:
   // The value of a condition that is an initialized declaration in a statement
   // other than a switch statement is the value of the declared variable
@@ -2884,7 +3118,12 @@
   // The value of a condition that is an expression is the value of the
   // expression, implicitly converted to bool.
   //
-  return PerformContextuallyConvertToBool(CondExpr);
+  // FIXME: Return this value to the caller so they don't need to recompute it.
+  llvm::APSInt Value(/*BitWidth*/1);
+  return (IsConstexpr && !CondExpr->isValueDependent())
+             ? CheckConvertedConstantExpression(CondExpr, Context.BoolTy, Value,
+                                                CCEK_ConstexprIf)
+             : PerformContextuallyConvertToBool(CondExpr);
 }
 
 /// Helper function to determine whether this is the (deprecated) C++
@@ -2918,7 +3157,8 @@
               return (ToPointeeType->getKind() == BuiltinType::Char_U ||
                       ToPointeeType->getKind() == BuiltinType::Char_S);
             case StringLiteral::Wide:
-              return ToPointeeType->isWideCharType();
+              return Context.typesAreCompatible(Context.getWideCharType(),
+                                                QualType(ToPointeeType, 0));
           }
         }
       }
@@ -2947,14 +3187,13 @@
     if (S.CompleteConstructorCall(Constructor, From, CastLoc, ConstructorArgs))
       return ExprError();
 
-    S.CheckConstructorAccess(CastLoc, Constructor,
-                             InitializedEntity::InitializeTemporary(Ty),
-                             Constructor->getAccess());
+    S.CheckConstructorAccess(CastLoc, Constructor, FoundDecl,
+                             InitializedEntity::InitializeTemporary(Ty));
     if (S.DiagnoseUseOfDecl(Method, CastLoc))
       return ExprError();
 
     ExprResult Result = S.BuildCXXConstructExpr(
-        CastLoc, Ty, cast<CXXConstructorDecl>(Method),
+        CastLoc, Ty, FoundDecl, cast<CXXConstructorDecl>(Method),
         ConstructorArgs, HadMultipleCandidates,
         /*ListInit*/ false, /*StdInitListInit*/ false, /*ZeroInit*/ false,
         CXXConstructExpr::CK_Complete, SourceRange());
@@ -3105,13 +3344,15 @@
                                   ConstructorArgs))
         return ExprError();
       return BuildCXXConstructExpr(
-          /*FIXME:ConstructLoc*/ SourceLocation(), ToType, SCS.CopyConstructor,
+          /*FIXME:ConstructLoc*/ SourceLocation(), ToType,
+          SCS.FoundCopyConstructor, SCS.CopyConstructor,
           ConstructorArgs, /*HadMultipleCandidates*/ false,
           /*ListInit*/ false, /*StdInitListInit*/ false, /*ZeroInit*/ false,
           CXXConstructExpr::CK_Complete, SourceRange());
     }
     return BuildCXXConstructExpr(
-        /*FIXME:ConstructLoc*/ SourceLocation(), ToType, SCS.CopyConstructor,
+        /*FIXME:ConstructLoc*/ SourceLocation(), ToType,
+        SCS.FoundCopyConstructor, SCS.CopyConstructor,
         From, /*HadMultipleCandidates*/ false,
         /*ListInit*/ false, /*StdInitListInit*/ false, /*ZeroInit*/ false,
         CXXConstructExpr::CK_Complete, SourceRange());
@@ -4334,6 +4575,7 @@
     return !Result.isInvalid() && !SFINAE.hasErrorOccurred();
   }
 
+  case BTT_IsAssignable:
   case BTT_IsNothrowAssignable:
   case BTT_IsTriviallyAssignable: {
     // C++11 [meta.unary.prop]p3:
@@ -4381,6 +4623,9 @@
     if (Result.isInvalid() || SFINAE.hasErrorOccurred())
       return false;
 
+    if (BTT == BTT_IsAssignable)
+      return true;
+
     if (BTT == BTT_IsNothrowAssignable)
       return Self.canThrow(Result.get()) == CT_Cannot;
 
@@ -4672,7 +4917,7 @@
   return Result;
 }
 
-/// \brief Try to convert a type to another according to C++0x 5.16p3.
+/// \brief Try to convert a type to another according to C++11 5.16p3.
 ///
 /// This is part of the parameter validation for the ? operator. If either
 /// value operand is a class type, the two operands are attempted to be
@@ -4688,17 +4933,21 @@
 
   InitializationKind Kind = InitializationKind::CreateCopy(To->getLocStart(),
                                                            SourceLocation());
-  // C++0x 5.16p3
+  // C++11 5.16p3
   //   The process for determining whether an operand expression E1 of type T1
   //   can be converted to match an operand expression E2 of type T2 is defined
   //   as follows:
-  //   -- If E2 is an lvalue:
-  bool ToIsLvalue = To->isLValue();
-  if (ToIsLvalue) {
-    //   E1 can be converted to match E2 if E1 can be implicitly converted to
-    //   type "lvalue reference to T2", subject to the constraint that in the
-    //   conversion the reference must bind directly to E1.
-    QualType T = Self.Context.getLValueReferenceType(ToType);
+  //   -- If E2 is an lvalue: E1 can be converted to match E2 if E1 can be
+  //      implicitly converted to type "lvalue reference to T2", subject to the
+  //      constraint that in the conversion the reference must bind directly to
+  //      an lvalue.
+  //   -- If E2 is an xvalue: E1 can be converted to match E2 if E1 can be
+  //      implicitly converted to the type "rvalue reference to T2", subject to
+  //      the constraint that the reference must bind directly.
+  if (To->isLValue() || To->isXValue()) {
+    QualType T = To->isLValue() ? Self.Context.getLValueReferenceType(ToType)
+                                : Self.Context.getRValueReferenceType(ToType);
+
     InitializedEntity Entity = InitializedEntity::InitializeTemporary(T);
 
     InitializationSequence InitSeq(Self, Entity, Kind, From);
@@ -5049,6 +5298,12 @@
     QualType ResTy = UsualArithmeticConversions(LHS, RHS);
     if (LHS.isInvalid() || RHS.isInvalid())
       return QualType();
+    if (ResTy.isNull()) {
+      Diag(QuestionLoc,
+           diag::err_typecheck_cond_incompatible_operands) << LTy << RTy
+        << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
+      return QualType();
+    }
 
     LHS = ImpCastExprToType(LHS.get(), ResTy, PrepareScalarCast(LHS, ResTy));
     RHS = ImpCastExprToType(RHS.get(), ResTy, PrepareScalarCast(RHS, ResTy));
@@ -5410,7 +5665,7 @@
     if (!ReturnsRetained && E->getType()->isObjCARCImplicitlyUnretainedType())
       return E;
 
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
 
     CastKind ck = (ReturnsRetained ? CK_ARCConsumeObject
                                    : CK_ARCReclaimReturnedObject);
@@ -5463,7 +5718,7 @@
       return E;
 
     // We need a cleanup, but we don't need to remember the temporary.
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
   }
 
   CXXTemporary *Temp = CXXTemporary::Create(Context, Destructor);
@@ -5490,14 +5745,16 @@
 
   unsigned FirstCleanup = ExprEvalContexts.back().NumCleanupObjects;
   assert(ExprCleanupObjects.size() >= FirstCleanup);
-  assert(ExprNeedsCleanups || ExprCleanupObjects.size() == FirstCleanup);
-  if (!ExprNeedsCleanups)
+  assert(Cleanup.exprNeedsCleanups() ||
+         ExprCleanupObjects.size() == FirstCleanup);
+  if (!Cleanup.exprNeedsCleanups())
     return SubExpr;
 
   auto Cleanups = llvm::makeArrayRef(ExprCleanupObjects.begin() + FirstCleanup,
                                      ExprCleanupObjects.size() - FirstCleanup);
 
-  Expr *E = ExprWithCleanups::Create(Context, SubExpr, Cleanups);
+  auto *E = ExprWithCleanups::Create(
+      Context, SubExpr, Cleanup.cleanupsHaveSideEffects(), Cleanups);
   DiscardCleanupsInEvaluationContext();
 
   return E;
@@ -5508,7 +5765,7 @@
 
   CleanupVarDeclMarking();
 
-  if (!ExprNeedsCleanups)
+  if (!Cleanup.exprNeedsCleanups())
     return SubStmt;
 
   // FIXME: In order to attach the temporaries, wrap the statement into
@@ -5614,7 +5871,7 @@
       return ExprError();
 
     // We need a cleanup, but we don't need to remember the temporary.
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
   }
 
   // Possibly strip off the top CXXBindTemporaryExpr.
@@ -5809,7 +6066,7 @@
     if (const PointerType *Ptr = ObjectType->getAs<PointerType>()) {
       ObjectType = Ptr->getPointeeType();
     } else if (!Base->isTypeDependent()) {
-      // The user wrote "p->" when she probably meant "p."; fix it.
+      // The user wrote "p->" when they probably meant "p."; fix it.
       S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
         << ObjectType << true
         << FixItHint::CreateReplacement(OpLoc, ".");
@@ -6413,7 +6670,7 @@
 
 static ExprResult attemptRecovery(Sema &SemaRef,
                                   const TypoCorrectionConsumer &Consumer,
-                                  TypoCorrection TC) {
+                                  const TypoCorrection &TC) {
   LookupResult R(SemaRef, Consumer.getLookupResult().getLookupNameInfo(),
                  Consumer.getLookupResult().getLookupKind());
   const CXXScopeSpec *SS = Consumer.getSS();
@@ -6590,6 +6847,14 @@
 
   ExprResult TransformBlockExpr(BlockExpr *E) { return Owned(E); }
 
+  ExprResult TransformObjCPropertyRefExpr(ObjCPropertyRefExpr *E) {
+    return Owned(E);
+  }
+
+  ExprResult TransformObjCIvarRefExpr(ObjCIvarRefExpr *E) {
+    return Owned(E);
+  }
+
   ExprResult Transform(Expr *E) {
     ExprResult Res;
     while (true) {
diff --git a/lib/Sema/SemaExprMember.cpp b/lib/Sema/SemaExprMember.cpp
index d94f754..e25de10 100644
--- a/lib/Sema/SemaExprMember.cpp
+++ b/lib/Sema/SemaExprMember.cpp
@@ -142,6 +142,7 @@
     AbstractInstanceResult = IMA_Abstract;
     break;
 
+  case Sema::DiscardedStatement:
   case Sema::ConstantEvaluated:
   case Sema::PotentiallyEvaluated:
   case Sema::PotentiallyEvaluatedIfUsed:
@@ -338,7 +339,7 @@
       compStr++;
 
     while (*compStr) {
-      if (!vecType->isAccessorWithinNumElements(*compStr++)) {
+      if (!vecType->isAccessorWithinNumElements(*compStr++, HexSwizzle)) {
         S.Diag(OpLoc, diag::err_ext_vector_component_exceeds_length)
           << baseType << SourceRange(CompLoc);
         return QualType();
@@ -742,12 +743,6 @@
                                   false, ExtraArgs);
 }
 
-static ExprResult
-BuildFieldReferenceExpr(Sema &S, Expr *BaseExpr, bool IsArrow,
-                        SourceLocation OpLoc, const CXXScopeSpec &SS,
-                        FieldDecl *Field, DeclAccessPair FoundDecl,
-                        const DeclarationNameInfo &MemberNameInfo);
-
 ExprResult
 Sema::BuildAnonymousStructUnionMemberReference(const CXXScopeSpec &SS,
                                                SourceLocation loc,
@@ -833,7 +828,7 @@
     // Make a nameInfo that properly uses the anonymous name.
     DeclarationNameInfo memberNameInfo(field->getDeclName(), loc);
 
-    result = BuildFieldReferenceExpr(*this, result, baseObjectIsPointer,
+    result = BuildFieldReferenceExpr(result, baseObjectIsPointer,
                                      SourceLocation(), EmptySS, field,
                                      foundDecl, memberNameInfo).get();
     if (!result)
@@ -854,9 +849,10 @@
         DeclAccessPair::make(field, field->getAccess());
 
     result =
-        BuildFieldReferenceExpr(*this, result, /*isarrow*/ false,
-                                SourceLocation(), (FI == FEnd ? SS : EmptySS),
-                                field, fakeFoundDecl, memberNameInfo).get();
+        BuildFieldReferenceExpr(result, /*isarrow*/ false, SourceLocation(),
+                                (FI == FEnd ? SS : EmptySS), field,
+                                fakeFoundDecl, memberNameInfo)
+            .get();
   }
   
   return result;
@@ -1124,8 +1120,8 @@
     return ExprError();
 
   if (FieldDecl *FD = dyn_cast<FieldDecl>(MemberDecl))
-    return BuildFieldReferenceExpr(*this, BaseExpr, IsArrow, OpLoc, SS, FD,
-                                   FoundDecl, MemberNameInfo);
+    return BuildFieldReferenceExpr(BaseExpr, IsArrow, OpLoc, SS, FD, FoundDecl,
+                                   MemberNameInfo);
 
   if (MSPropertyDecl *PD = dyn_cast<MSPropertyDecl>(MemberDecl))
     return BuildMSPropertyRefExpr(*this, BaseExpr, IsArrow, SS, PD,
@@ -1728,11 +1724,11 @@
                                   NameInfo, TemplateArgs, S, &ExtraArgs);
 }
 
-static ExprResult
-BuildFieldReferenceExpr(Sema &S, Expr *BaseExpr, bool IsArrow,
-                        SourceLocation OpLoc, const CXXScopeSpec &SS,
-                        FieldDecl *Field, DeclAccessPair FoundDecl,
-                        const DeclarationNameInfo &MemberNameInfo) {
+ExprResult
+Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow,
+                              SourceLocation OpLoc, const CXXScopeSpec &SS,
+                              FieldDecl *Field, DeclAccessPair FoundDecl,
+                              const DeclarationNameInfo &MemberNameInfo) {
   // x.a is an l-value if 'a' has a reference type. Otherwise:
   // x.a is an l-value/x-value/pr-value if the base is (and note
   //   that *x is always an l-value), except that if the base isn't
@@ -1766,27 +1762,36 @@
     // except that 'mutable' members don't pick up 'const'.
     if (Field->isMutable()) BaseQuals.removeConst();
 
-    Qualifiers MemberQuals
-    = S.Context.getCanonicalType(MemberType).getQualifiers();
+    Qualifiers MemberQuals =
+        Context.getCanonicalType(MemberType).getQualifiers();
 
     assert(!MemberQuals.hasAddressSpace());
 
-
     Qualifiers Combined = BaseQuals + MemberQuals;
     if (Combined != MemberQuals)
-      MemberType = S.Context.getQualifiedType(MemberType, Combined);
+      MemberType = Context.getQualifiedType(MemberType, Combined);
   }
 
-  S.UnusedPrivateFields.remove(Field);
+  UnusedPrivateFields.remove(Field);
 
-  ExprResult Base =
-  S.PerformObjectMemberConversion(BaseExpr, SS.getScopeRep(),
-                                  FoundDecl, Field);
+  ExprResult Base = PerformObjectMemberConversion(BaseExpr, SS.getScopeRep(),
+                                                  FoundDecl, Field);
   if (Base.isInvalid())
     return ExprError();
-  return BuildMemberExpr(S, S.Context, Base.get(), IsArrow, OpLoc, SS,
-                         /*TemplateKWLoc=*/SourceLocation(), Field, FoundDecl,
-                         MemberNameInfo, MemberType, VK, OK);
+  MemberExpr *ME =
+      BuildMemberExpr(*this, Context, Base.get(), IsArrow, OpLoc, SS,
+                      /*TemplateKWLoc=*/SourceLocation(), Field, FoundDecl,
+                      MemberNameInfo, MemberType, VK, OK);
+
+  // Build a reference to a private copy for non-static data members in
+  // non-static member functions, privatized by OpenMP constructs.
+  if (getLangOpts().OpenMP && IsArrow &&
+      !CurContext->isDependentContext() &&
+      isa<CXXThisExpr>(Base.get()->IgnoreParenImpCasts())) {
+    if (auto *PrivateCopy = IsOpenMPCapturedDecl(Field))
+      return getOpenMPCapturedExpr(PrivateCopy, VK, OK, OpLoc);
+  }
+  return ME;
 }
 
 /// Builds an implicit member access expression.  The current context
diff --git a/lib/Sema/SemaExprObjC.cpp b/lib/Sema/SemaExprObjC.cpp
index b8af158..8f0d4ff 100644
--- a/lib/Sema/SemaExprObjC.cpp
+++ b/lib/Sema/SemaExprObjC.cpp
@@ -1035,7 +1035,6 @@
     
     HasPackExpansions = true;
   }
-
   
   QualType Ty
     = Context.getObjCObjectPointerType(
@@ -1818,7 +1817,7 @@
   Selector Sel = PP.getSelectorTable().getNullarySelector(Member);
   ObjCMethodDecl *Getter = IFace->lookupInstanceMethod(Sel);
   
-  // May be founf in property's qualified list.
+  // May be found in property's qualified list.
   if (!Getter)
     Getter = LookupMethodInQualifiedType(Sel, OPT, true);
 
@@ -1838,7 +1837,7 @@
                                            PP.getSelectorTable(), Member);
   ObjCMethodDecl *Setter = IFace->lookupInstanceMethod(SetterSel);
       
-  // May be founf in property's qualified list.
+  // May be found in property's qualified list.
   if (!Setter)
     Setter = LookupMethodInQualifiedType(SetterSel, OPT, true);
   
@@ -1886,12 +1885,29 @@
                       LookupOrdinaryName, nullptr, nullptr,
                       llvm::make_unique<DeclFilterCCC<ObjCPropertyDecl>>(),
                       CTK_ErrorRecovery, IFace, false, OPT)) {
-    diagnoseTypo(Corrected, PDiag(diag::err_property_not_found_suggest)
-                              << MemberName << QualType(OPT, 0));
     DeclarationName TypoResult = Corrected.getCorrection();
-    return HandleExprPropertyRefExpr(OPT, BaseExpr, OpLoc,
-                                     TypoResult, MemberLoc,
-                                     SuperLoc, SuperType, Super);
+    if (TypoResult.isIdentifier() &&
+        TypoResult.getAsIdentifierInfo() == Member) {
+      // There is no need to try the correction if it is the same.
+      NamedDecl *ChosenDecl =
+        Corrected.isKeyword() ? nullptr : Corrected.getFoundDecl();
+      if (ChosenDecl && isa<ObjCPropertyDecl>(ChosenDecl))
+        if (cast<ObjCPropertyDecl>(ChosenDecl)->isClassProperty()) {
+          // This is a class property, we should not use the instance to
+          // access it.
+          Diag(MemberLoc, diag::err_class_property_found) << MemberName
+          << OPT->getInterfaceDecl()->getName()
+          << FixItHint::CreateReplacement(BaseExpr->getSourceRange(),
+                                          OPT->getInterfaceDecl()->getName());
+          return ExprError();
+        }
+    } else {
+      diagnoseTypo(Corrected, PDiag(diag::err_property_not_found_suggest)
+                                << MemberName << QualType(OPT, 0));
+      return HandleExprPropertyRefExpr(OPT, BaseExpr, OpLoc,
+                                       TypoResult, MemberLoc,
+                                       SuperLoc, SuperType, Super);
+    }
   }
   ObjCInterfaceDecl *ClassDeclared;
   if (ObjCIvarDecl *Ivar = 
@@ -1919,8 +1935,6 @@
   return ExprError();
 }
 
-
-
 ExprResult Sema::
 ActOnClassPropertyRefExpr(IdentifierInfo &receiverName,
                           IdentifierInfo &propertyName,
@@ -2035,7 +2049,7 @@
   }
 };
 
-}
+} // end anonymous namespace
 
 Sema::ObjCMessageKind Sema::getObjCMessageKind(Scope *S,
                                                IdentifierInfo *Name,
@@ -2186,7 +2200,6 @@
                            LBracLoc, SelectorLocs, RBracLoc, Args);
 }
 
-
 ExprResult Sema::BuildClassMessageImplicit(QualType ReceiverType,
                                            bool isSuperReceiver,
                                            SourceLocation Loc,
@@ -2201,7 +2214,6 @@
                           /*SuperLoc=*/isSuperReceiver ? Loc : SourceLocation(),
                            Sel, Method, Loc, Loc, Loc, Args,
                            /*isImplicit=*/true);
-
 }
 
 static void applyCocoaAPICheck(Sema &S, const ObjCMessageExpr *Msg,
@@ -2468,7 +2480,6 @@
   if (ReceiverType.isNull())
     return ExprError();
 
-
   if (!ReceiverTypeInfo)
     ReceiverTypeInfo = Context.getTrivialTypeSourceInfo(ReceiverType, LBracLoc);
 
@@ -3070,11 +3081,13 @@
   /// struct A*
   ACTC_coreFoundation
 };
+
 static bool isAnyRetainable(ARCConversionTypeClass ACTC) {
   return (ACTC == ACTC_retainable ||
           ACTC == ACTC_coreFoundation ||
           ACTC == ACTC_voidPtr);
 }
+
 static bool isAnyCLike(ARCConversionTypeClass ACTC) {
   return ACTC == ACTC_none ||
          ACTC == ACTC_voidPtr ||
@@ -3346,7 +3359,7 @@
       }
     }
   };
-}
+} // end anonymous namespace
 
 bool Sema::isKnownName(StringRef name) {
   if (name.empty())
@@ -3800,7 +3813,6 @@
     else if (PRE->isImplicitProperty()) {
       if (ObjCMethodDecl *Getter = PRE->getImplicitPropertyGetter())
         SrcType = Getter->getReturnType();
-      
     }
   }
   
@@ -3810,7 +3822,6 @@
     return;
   CheckObjCBridgeRelatedConversions(castExpr->getLocStart(),
                                     castType, SrcType, castExpr);
-  return;
 }
 
 bool Sema::CheckTollFreeBridgeStaticCast(QualType castType, Expr *castExpr,
@@ -4073,7 +4084,7 @@
     castExpr = ImplicitCastExpr::Create(Context, castExpr->getType(),
                                         CK_ARCConsumeObject, castExpr,
                                         nullptr, VK_RValue);
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
     return ACR_okay;
   }
 
@@ -4316,7 +4327,7 @@
                                                    TSInfo, SubExpr);
   
   if (MustConsume) {
-    ExprNeedsCleanups = true;
+    Cleanup.setExprNeedsCleanups(true);
     Result = ImplicitCastExpr::Create(Context, T, CK_ARCConsumeObject, Result, 
                                       nullptr, VK_RValue);
   }
diff --git a/lib/Sema/SemaInit.cpp b/lib/Sema/SemaInit.cpp
index c0c57f3..386c7ab 100644
--- a/lib/Sema/SemaInit.cpp
+++ b/lib/Sema/SemaInit.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/Initialization.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/ExprCXX.h"
@@ -19,13 +18,14 @@
 #include "clang/AST/TypeLoc.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Designator.h"
+#include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include <map>
+
 using namespace clang;
 
 //===----------------------------------------------------------------------===//
@@ -204,6 +204,8 @@
 // Semantic checking for initializer lists.
 //===----------------------------------------------------------------------===//
 
+namespace {
+
 /// @brief Semantic checking for initializer lists.
 ///
 /// The InitListChecker class contains a set of routines that each
@@ -231,7 +233,6 @@
 /// point. CheckDesignatedInitializer() recursively steps into the
 /// designated subobject and manages backing out the recursion to
 /// initialize the subobjects after the one designated.
-namespace {
 class InitListChecker {
   Sema &SemaRef;
   bool hadError;
@@ -281,6 +282,7 @@
                        unsigned &StructuredIndex);
   void CheckStructUnionTypes(const InitializedEntity &Entity,
                              InitListExpr *IList, QualType DeclType,
+                             CXXRecordDecl::base_class_range Bases,
                              RecordDecl::field_iterator Field,
                              bool SubobjectIsDesignatorContext, unsigned &Index,
                              InitListExpr *StructuredList,
@@ -340,6 +342,10 @@
   // in the InitListExpr, the "holes" in Case#1 are filled not with empty
   // initializers but with special "NoInitExpr" place holders, which tells the
   // CodeGen not to generate any initializers for these parts.
+  void FillInEmptyInitForBase(unsigned Init, const CXXBaseSpecifier &Base,
+                              const InitializedEntity &ParentEntity,
+                              InitListExpr *ILE, bool &RequiresSecondPass,
+                              bool FillWithNoInit);
   void FillInEmptyInitForField(unsigned Init, FieldDecl *Field,
                                const InitializedEntity &ParentEntity,
                                InitListExpr *ILE, bool &RequiresSecondPass,
@@ -363,6 +369,7 @@
   // semantic analysis and code generation.
   InitListExpr *getFullyStructuredList() const { return FullyStructuredList; }
 };
+
 } // end anonymous namespace
 
 ExprResult InitListChecker::PerformEmptyInit(Sema &SemaRef,
@@ -423,8 +430,6 @@
     if (CtorDecl->getMinRequiredArguments() == 0 &&
         CtorDecl->isExplicit() && R->getDeclName() &&
         SemaRef.SourceMgr.isInSystemHeader(CtorDecl->getLocation())) {
-
-
       bool IsInStd = false;
       for (NamespaceDecl *ND = dyn_cast<NamespaceDecl>(R->getDeclContext());
            ND && !IsInStd; ND = dyn_cast<NamespaceDecl>(ND->getParent())) {
@@ -484,6 +489,38 @@
     hadError = true;
 }
 
+void InitListChecker::FillInEmptyInitForBase(
+    unsigned Init, const CXXBaseSpecifier &Base,
+    const InitializedEntity &ParentEntity, InitListExpr *ILE,
+    bool &RequiresSecondPass, bool FillWithNoInit) {
+  assert(Init < ILE->getNumInits() && "should have been expanded");
+  // Ensures slot Init of ILE (the initializer for Base) is fully filled in.
+  InitializedEntity BaseEntity = InitializedEntity::InitializeBase(
+      SemaRef.Context, &Base, /*IsInheritedVirtualBase=*/false, &ParentEntity);
+  // Empty slot: synthesize an init; existing nested lists are recursed into.
+  if (!ILE->getInit(Init)) {
+    ExprResult BaseInit =
+        FillWithNoInit ? new (SemaRef.Context) NoInitExpr(Base.getType())
+                       : PerformEmptyInit(SemaRef, ILE->getLocEnd(), BaseEntity,
+                                          /*VerifyOnly*/ false,
+                                          TreatUnavailableAsInvalid);
+    if (BaseInit.isInvalid()) {
+      hadError = true;
+      return;
+    }
+    // Store the synthesized initializer in this base's slot.
+    ILE->setInit(Init, BaseInit.getAs<Expr>());
+  } else if (InitListExpr *InnerILE =
+                 dyn_cast<InitListExpr>(ILE->getInit(Init))) {
+    FillInEmptyInitializations(BaseEntity, InnerILE,
+                               RequiresSecondPass, FillWithNoInit);
+  } else if (DesignatedInitUpdateExpr *InnerDIUE =
+               dyn_cast<DesignatedInitUpdateExpr>(ILE->getInit(Init))) {
+    FillInEmptyInitializations(BaseEntity, InnerDIUE->getUpdater(),
+                               RequiresSecondPass, /*FillWithNoInit =*/true);
+  }
+}
+
 void InitListChecker::FillInEmptyInitForField(unsigned Init, FieldDecl *Field,
                                         const InitializedEntity &ParentEntity,
                                               InitListExpr *ILE,
@@ -599,14 +636,25 @@
       // The fields beyond ILE->getNumInits() are default initialized, so in
       // order to leave them uninitialized, the ILE is expanded and the extra
       // fields are then filled with NoInitExpr.
-      unsigned NumFields = 0;
-      for (auto *Field : RDecl->fields())
-        if (!Field->isUnnamedBitfield())
-          ++NumFields;
-      if (ILE->getNumInits() < NumFields)
-        ILE->resizeInits(SemaRef.Context, NumFields);
+      unsigned NumElems = numStructUnionElements(ILE->getType());
+      if (RDecl->hasFlexibleArrayMember())
+        ++NumElems;
+      if (ILE->getNumInits() < NumElems)
+        ILE->resizeInits(SemaRef.Context, NumElems);
 
       unsigned Init = 0;
+
+      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RDecl)) {
+        for (auto &Base : CXXRD->bases()) {
+          if (hadError)
+            return;
+
+          FillInEmptyInitForBase(Init, Base, Entity, ILE, RequiresSecondPass,
+                                 FillWithNoInit);
+          ++Init;
+        }
+      }
+
       for (auto *Field : RDecl->fields()) {
         if (Field->isUnnamedBitfield())
           continue;
@@ -715,7 +763,6 @@
   }
 }
 
-
 InitListChecker::InitListChecker(Sema &S, const InitializedEntity &Entity,
                                  InitListExpr *IL, QualType &T,
                                  bool VerifyOnly,
@@ -754,6 +801,8 @@
 int InitListChecker::numStructUnionElements(QualType DeclType) {
   RecordDecl *structDecl = DeclType->getAs<RecordType>()->getDecl();
   int InitializableMembers = 0;
+  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(structDecl))
+    InitializableMembers += CXXRD->getNumBases();
   for (const auto *Field : structDecl->fields())
     if (!Field->isUnnamedBitfield())
       ++InitializableMembers;
@@ -887,6 +936,7 @@
   case InitializedEntity::EK_Base:
   case InitializedEntity::EK_Delegating:
   case InitializedEntity::EK_BlockElement:
+  case InitializedEntity::EK_Binding:
     llvm_unreachable("unexpected braced scalar init");
   }
 
@@ -898,7 +948,6 @@
   }
 }
 
-
 /// Check whether the initializer \p IList (that was written with explicit
 /// braces) can be used to initialize an object of type \p T.
 ///
@@ -1002,10 +1051,14 @@
     assert(DeclType->isAggregateType() &&
            "non-aggregate records should be handed in CheckSubElementType");
     RecordDecl *RD = DeclType->getAs<RecordType>()->getDecl();
-    CheckStructUnionTypes(Entity, IList, DeclType, RD->field_begin(),
-                          SubobjectIsDesignatorContext, Index,
-                          StructuredList, StructuredIndex,
-                          TopLevelObject);
+    auto Bases =
+        CXXRecordDecl::base_class_range(CXXRecordDecl::base_class_iterator(),
+                                        CXXRecordDecl::base_class_iterator());
+    if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
+      Bases = CXXRD->bases();
+    CheckStructUnionTypes(Entity, IList, DeclType, Bases, RD->field_begin(),
+                          SubobjectIsDesignatorContext, Index, StructuredList,
+                          StructuredIndex, TopLevelObject);
   } else if (DeclType->isArrayType()) {
     llvm::APSInt Zero(
                     SemaRef.Context.getTypeSize(SemaRef.Context.getSizeType()),
@@ -1140,8 +1193,8 @@
     // Fall through for subaggregate initialization.
 
   } else {
-    assert((ElemType->isRecordType() || ElemType->isVectorType()) &&
-           "Unexpected type");
+    assert((ElemType->isRecordType() || ElemType->isVectorType() ||
+            ElemType->isClkEventT()) && "Unexpected type");
 
     // C99 6.7.8p13:
     //
@@ -1230,7 +1283,6 @@
   }
 }
 
-
 void InitListChecker::CheckScalarType(const InitializedEntity &Entity,
                                       InitListExpr *IList, QualType DeclType,
                                       unsigned &Index,
@@ -1682,16 +1734,13 @@
   return FlexArrayDiag != diag::ext_flexible_array_init;
 }
 
-void InitListChecker::CheckStructUnionTypes(const InitializedEntity &Entity,
-                                            InitListExpr *IList,
-                                            QualType DeclType,
-                                            RecordDecl::field_iterator Field,
-                                            bool SubobjectIsDesignatorContext,
-                                            unsigned &Index,
-                                            InitListExpr *StructuredList,
-                                            unsigned &StructuredIndex,
-                                            bool TopLevelObject) {
-  RecordDecl* structDecl = DeclType->getAs<RecordType>()->getDecl();
+void InitListChecker::CheckStructUnionTypes(
+    const InitializedEntity &Entity, InitListExpr *IList, QualType DeclType,
+    CXXRecordDecl::base_class_range Bases, RecordDecl::field_iterator Field,
+    bool SubobjectIsDesignatorContext, unsigned &Index,
+    InitListExpr *StructuredList, unsigned &StructuredIndex,
+    bool TopLevelObject) {
+  RecordDecl *structDecl = DeclType->getAs<RecordType>()->getDecl();
 
   // If the record is invalid, some of it's members are invalid. To avoid
   // confusion, we forgo checking the intializer for the entire record.
@@ -1736,13 +1785,35 @@
     return;
   }
 
+  bool InitializedSomething = false;
+
+  // If we have any base classes, they are initialized prior to the fields.
+  for (auto &Base : Bases) {
+    Expr *Init = Index < IList->getNumInits() ? IList->getInit(Index) : nullptr;
+    SourceLocation InitLoc = Init ? Init->getLocStart() : IList->getLocEnd();
+
+    // Designated inits always initialize fields, so if we see one, all
+    // remaining base classes have no explicit initializer.
+    if (Init && isa<DesignatedInitExpr>(Init))
+      Init = nullptr;
+
+    InitializedEntity BaseEntity = InitializedEntity::InitializeBase(
+        SemaRef.Context, &Base, false, &Entity);
+    if (Init) {
+      CheckSubElementType(BaseEntity, IList, Base.getType(), Index,
+                          StructuredList, StructuredIndex);
+      InitializedSomething = true;
+    } else if (VerifyOnly) {
+      CheckEmptyInitializable(BaseEntity, InitLoc);
+    }
+  }
+
   // If structDecl is a forward declaration, this loop won't do
   // anything except look at designated initializers; That's okay,
   // because an error should get printed out elsewhere. It might be
   // worthwhile to skip over the rest of the initializer, though.
   RecordDecl *RD = DeclType->getAs<RecordType>()->getDecl();
   RecordDecl::field_iterator FieldEnd = RD->field_end();
-  bool InitializedSomething = false;
   bool CheckForMissingFields = true;
   while (Index < IList->getNumInits()) {
     Expr *Init = IList->getInit(Index);
@@ -1905,8 +1976,8 @@
   SmallVector<Expr*, 4> IndexExprs(NumIndexExprs);
   for (unsigned I = 0; I < NumIndexExprs; ++I)
     IndexExprs[I] = DIE->getSubExpr(I + 1);
-  return DesignatedInitExpr::Create(SemaRef.Context, DIE->designators_begin(),
-                                    DIE->size(), IndexExprs,
+  return DesignatedInitExpr::Create(SemaRef.Context, DIE->designators(),
+                                    IndexExprs,
                                     DIE->getEqualOrColonLoc(),
                                     DIE->usesGNUSyntax(), DIE->getInit());
 }
@@ -1929,7 +2000,7 @@
   RecordDecl *Record;
 };
 
-}
+} // end anonymous namespace
 
 /// @brief Check the well-formedness of a C99 designated initializer.
 ///
@@ -2156,8 +2227,10 @@
     for (auto *FI : RT->getDecl()->fields()) {
       if (FI->isUnnamedBitfield())
         continue;
-      if (KnownField == FI)
+      if (declaresSameEntity(KnownField, FI)) {
+        KnownField = FI;
         break;
+      }
       ++FieldIndex;
     }
 
@@ -2170,11 +2243,11 @@
       FieldIndex = 0;
       if (!VerifyOnly) {
         FieldDecl *CurrentField = StructuredList->getInitializedFieldInUnion();
-        if (CurrentField && CurrentField != *Field) {
+        if (CurrentField && !declaresSameEntity(CurrentField, *Field)) {
           assert(StructuredList->getNumInits() == 1
                  && "A union should never have more than one initializer!");
 
-          // we're about to throw away an initializer, emit warning
+          // We're about to throw away an initializer, emit warning.
           SemaRef.Diag(D->getFieldLoc(),
                        diag::warn_initializer_overrides)
             << D->getSourceRange();
@@ -2314,8 +2387,11 @@
     // Check the remaining fields within this class/struct/union subobject.
     bool prevHadError = hadError;
 
-    CheckStructUnionTypes(Entity, IList, CurrentObjectType, Field, false, Index,
-                          StructuredList, FieldIndex);
+    auto NoBases =
+        CXXRecordDecl::base_class_range(CXXRecordDecl::base_class_iterator(),
+                                        CXXRecordDecl::base_class_iterator());
+    CheckStructUnionTypes(Entity, IList, CurrentObjectType, NoBases, Field,
+                          false, Index, StructuredList, FieldIndex);
     return hadError && !prevHadError;
   }
 
@@ -2761,7 +2837,7 @@
 
   DesignatedInitExpr *DIE
     = DesignatedInitExpr::Create(Context,
-                                 Designators.data(), Designators.size(),
+                                 Designators,
                                  InitExpressions, Loc, GNUSyntax,
                                  Init.getAs<Expr>());
 
@@ -2797,10 +2873,11 @@
 InitializedEntity
 InitializedEntity::InitializeBase(ASTContext &Context,
                                   const CXXBaseSpecifier *Base,
-                                  bool IsInheritedVirtualBase) {
+                                  bool IsInheritedVirtualBase,
+                                  const InitializedEntity *Parent) {
   InitializedEntity Result;
   Result.Kind = EK_Base;
-  Result.Parent = nullptr;
+  Result.Parent = Parent;
   Result.Base = reinterpret_cast<uintptr_t>(Base);
   if (IsInheritedVirtualBase)
     Result.Base |= 0x01;
@@ -2819,6 +2896,7 @@
 
   case EK_Variable:
   case EK_Member:
+  case EK_Binding:
     return VariableOrMember->getDeclName();
 
   case EK_LambdaCapture:
@@ -2842,10 +2920,11 @@
   llvm_unreachable("Invalid EntityKind!");
 }
 
-DeclaratorDecl *InitializedEntity::getDecl() const {
+ValueDecl *InitializedEntity::getDecl() const {
   switch (getKind()) {
   case EK_Variable:
   case EK_Member:
+  case EK_Binding:
     return VariableOrMember;
 
   case EK_Parameter:
@@ -2881,6 +2960,7 @@
   case EK_Parameter:
   case EK_Parameter_CF_Audited:
   case EK_Member:
+  case EK_Binding:
   case EK_New:
   case EK_Temporary:
   case EK_CompoundLiteralInit:
@@ -2912,6 +2992,7 @@
   case EK_Result: OS << "Result"; break;
   case EK_Exception: OS << "Exception"; break;
   case EK_Member: OS << "Member"; break;
+  case EK_Binding: OS << "Binding"; break;
   case EK_New: OS << "New"; break;
   case EK_Temporary: OS << "Temporary"; break;
   case EK_CompoundLiteralInit: OS << "CompoundLiteral";break;
@@ -2928,9 +3009,9 @@
     break;
   }
 
-  if (Decl *D = getDecl()) {
+  if (auto *D = getDecl()) {
     OS << " ";
-    cast<NamedDecl>(D)->printQualifiedName(OS);
+    D->printQualifiedName(OS);
   }
 
   OS << " '" << getType().getAsString() << "'\n";
@@ -3147,13 +3228,9 @@
   Steps.push_back(S);
 }
 
-void
-InitializationSequence
-::AddConstructorInitializationStep(CXXConstructorDecl *Constructor,
-                                   AccessSpecifier Access,
-                                   QualType T,
-                                   bool HadMultipleCandidates,
-                                   bool FromInitList, bool AsInitList) {
+void InitializationSequence::AddConstructorInitializationStep(
+    DeclAccessPair FoundDecl, CXXConstructorDecl *Constructor, QualType T,
+    bool HadMultipleCandidates, bool FromInitList, bool AsInitList) {
   Step S;
   S.Kind = FromInitList ? AsInitList ? SK_StdInitializerListConstructorCall
                                      : SK_ConstructorInitializationFromList
@@ -3161,7 +3238,7 @@
   S.Type = T;
   S.Function.HadMultipleCandidates = HadMultipleCandidates;
   S.Function.Function = Constructor;
-  S.Function.FoundDecl = DeclAccessPair::make(Constructor, Access);
+  S.Function.FoundDecl = FoundDecl;
   Steps.push_back(S);
 }
 
@@ -3372,18 +3449,13 @@
   CandidateSet.clear();
 
   for (NamedDecl *D : Ctors) {
-    DeclAccessPair FoundDecl = DeclAccessPair::make(D, D->getAccess());
+    auto Info = getConstructorInfo(D);
+    if (!Info.Constructor)
+      continue;
+
     bool SuppressUserConversions = false;
 
-    // Find the constructor (which may be a template).
-    CXXConstructorDecl *Constructor = nullptr;
-    FunctionTemplateDecl *ConstructorTmpl = dyn_cast<FunctionTemplateDecl>(D);
-    if (ConstructorTmpl)
-      Constructor = cast<CXXConstructorDecl>(
-                                           ConstructorTmpl->getTemplatedDecl());
-    else {
-      Constructor = cast<CXXConstructorDecl>(D);
-
+    if (!Info.ConstructorTmpl) {
       // C++11 [over.best.ics]p4:
       //   ... and the constructor or user-defined conversion function is a
       //   candidate by
@@ -3400,15 +3472,15 @@
       //     parameter of a constructor of X.
       if ((CopyInitializing ||
            (IsListInit && Args.size() == 1 && isa<InitListExpr>(Args[0]))) &&
-          Constructor->isCopyOrMoveConstructor())
+          Info.Constructor->isCopyOrMoveConstructor())
         SuppressUserConversions = true;
     }
 
-    if (!Constructor->isInvalidDecl() &&
-        (AllowExplicit || !Constructor->isExplicit()) &&
-        (!OnlyListConstructors || S.isInitListConstructor(Constructor))) {
-      if (ConstructorTmpl)
-        S.AddTemplateOverloadCandidate(ConstructorTmpl, FoundDecl,
+    if (!Info.Constructor->isInvalidDecl() &&
+        (AllowExplicit || !Info.Constructor->isExplicit()) &&
+        (!OnlyListConstructors || S.isInitListConstructor(Info.Constructor))) {
+      if (Info.ConstructorTmpl)
+        S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
                                        /*ExplicitArgs*/ nullptr, Args,
                                        CandidateSet, SuppressUserConversions);
       else {
@@ -3420,9 +3492,9 @@
         //     are also considered.
         bool AllowExplicitConv = AllowExplicit && !CopyInitializing && 
                                  Args.size() == 1 &&
-                                 Constructor->isCopyOrMoveConstructor();
-        S.AddOverloadCandidate(Constructor, FoundDecl, Args, CandidateSet,
-                               SuppressUserConversions,
+                                 Info.Constructor->isCopyOrMoveConstructor();
+        S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl, Args,
+                               CandidateSet, SuppressUserConversions,
                                /*PartialOverloading=*/false,
                                /*AllowExplicit=*/AllowExplicitConv);
       }
@@ -3530,18 +3602,23 @@
   //   If a program calls for the default initialization of an object
   //   of a const-qualified type T, T shall be a class type with a
   //   user-provided default constructor.
+  // C++ core issue 253 proposal:
+  //   If the implicit default constructor initializes all subobjects, no
+  //   initializer should be required.
+  // The 253 proposal is for example needed to process libstdc++ headers in 5.x.
+  CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function);
   if (Kind.getKind() == InitializationKind::IK_Default &&
-      Entity.getType().isConstQualified() &&
-      !cast<CXXConstructorDecl>(Best->Function)->isUserProvided()) {
-    if (!maybeRecoverWithZeroInitialization(S, Sequence, Entity))
-      Sequence.SetFailed(InitializationSequence::FK_DefaultInitOfConst);
-    return;
+      Entity.getType().isConstQualified()) {
+    if (!CtorDecl->getParent()->allowConstDefaultInit()) {
+      if (!maybeRecoverWithZeroInitialization(S, Sequence, Entity))
+        Sequence.SetFailed(InitializationSequence::FK_DefaultInitOfConst);
+      return;
+    }
   }
 
   // C++11 [over.match.list]p1:
   //   In copy-list-initialization, if an explicit constructor is chosen, the
   //   initializer is ill-formed.
-  CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function);
   if (IsListInit && !Kind.AllowExplicit() && CtorDecl->isExplicit()) {
     Sequence.SetFailed(InitializationSequence::FK_ExplicitConstructor);
     return;
@@ -3551,7 +3628,7 @@
   // subsumed by the initialization.
   bool HadMultipleCandidates = (CandidateSet.size() > 1);
   Sequence.AddConstructorInitializationStep(
-      CtorDecl, Best->FoundDecl.getAccess(), DestType, HadMultipleCandidates,
+      Best->FoundDecl, CtorDecl, DestType, HadMultipleCandidates,
       IsListInit | IsInitListCopy, AsInitializerList);
 }
 
@@ -3782,8 +3859,48 @@
   }
 
   if (S.getLangOpts().CPlusPlus && !DestType->isAggregateType() &&
-      InitList->getNumInits() == 1 &&
-      InitList->getInit(0)->getType()->isRecordType()) {
+      InitList->getNumInits() == 1) {
+    Expr *E = InitList->getInit(0);
+
+    //   - Otherwise, if T is an enumeration with a fixed underlying type,
+    //     the initializer-list has a single element v, and the initialization
+    //     is direct-list-initialization, the object is initialized with the
+    //     value T(v); if a narrowing conversion is required to convert v to
+    //     the underlying type of T, the program is ill-formed.
+    auto *ET = DestType->getAs<EnumType>();
+    if (S.getLangOpts().CPlusPlus1z &&
+        Kind.getKind() == InitializationKind::IK_DirectList &&
+        ET && ET->getDecl()->isFixed() &&
+        !S.Context.hasSameUnqualifiedType(E->getType(), DestType) &&
+        (E->getType()->isIntegralOrEnumerationType() ||
+         E->getType()->isFloatingType())) {
+      // There are two ways that T(v) can work when T is an enumeration type.
+      // If there is either an implicit conversion sequence from v to T or
+      // a conversion function that can convert from v to T, then we use that.
+      // Otherwise, if v is of integral, enumeration, or floating-point type,
+      // it is converted to the enumeration type via its underlying type.
+      // There is no overlap possible between these two cases (except when the
+      // source value is already of the destination type), and the first
+      // case is handled by the general case for single-element lists below.
+      ImplicitConversionSequence ICS;
+      ICS.setStandard();
+      ICS.Standard.setAsIdentityConversion();
+      // If E is of a floating-point type, then the conversion is ill-formed
+      // due to narrowing, but go through the motions in order to produce the
+      // right diagnostic.
+      ICS.Standard.Second = E->getType()->isFloatingType()
+                                ? ICK_Floating_Integral
+                                : ICK_Integral_Conversion;
+      ICS.Standard.setFromType(E->getType());
+      ICS.Standard.setToType(0, E->getType());
+      ICS.Standard.setToType(1, DestType);
+      ICS.Standard.setToType(2, DestType);
+      Sequence.AddConversionSequenceStep(ICS, ICS.Standard.getToType(2),
+                                         /*TopLevelOfInitList*/true);
+      Sequence.RewrapReferenceInitList(Entity.getType(), InitList);
+      return;
+    }
+
     //   - Otherwise, if the initializer list has a single element of type E
     //     [...references are handled above...], the object or reference is
     //     initialized from that element (by copy-initialization for
@@ -3797,19 +3914,21 @@
     // copy-initialization. This only matters if we might use an 'explicit'
     // conversion operator, so we only need to handle the cases where the source
     // is of record type.
-    InitializationKind SubKind =
-        Kind.getKind() == InitializationKind::IK_DirectList
-            ? InitializationKind::CreateDirect(Kind.getLocation(),
-                                               InitList->getLBraceLoc(),
-                                               InitList->getRBraceLoc())
-            : Kind;
-    Expr *SubInit[1] = { InitList->getInit(0) };
-    Sequence.InitializeFrom(S, Entity, SubKind, SubInit,
-                            /*TopLevelOfInitList*/true,
-                            TreatUnavailableAsInvalid);
-    if (Sequence)
-      Sequence.RewrapReferenceInitList(Entity.getType(), InitList);
-    return;
+    if (InitList->getInit(0)->getType()->isRecordType()) {
+      InitializationKind SubKind =
+          Kind.getKind() == InitializationKind::IK_DirectList
+              ? InitializationKind::CreateDirect(Kind.getLocation(),
+                                                 InitList->getLBraceLoc(),
+                                                 InitList->getRBraceLoc())
+              : Kind;
+      Expr *SubInit[1] = { InitList->getInit(0) };
+      Sequence.InitializeFrom(S, Entity, SubKind, SubInit,
+                              /*TopLevelOfInitList*/true,
+                              TreatUnavailableAsInvalid);
+      if (Sequence)
+        Sequence.RewrapReferenceInitList(Entity.getType(), InitList);
+      return;
+    }
   }
 
   InitListChecker CheckInitList(S, Entity, InitList,
@@ -3867,26 +3986,19 @@
     CXXRecordDecl *T1RecordDecl = cast<CXXRecordDecl>(T1RecordType->getDecl());
 
     for (NamedDecl *D : S.LookupConstructors(T1RecordDecl)) {
-      DeclAccessPair FoundDecl = DeclAccessPair::make(D, D->getAccess());
+      auto Info = getConstructorInfo(D);
+      if (!Info.Constructor)
+        continue;
 
-      // Find the constructor (which may be a template).
-      CXXConstructorDecl *Constructor = nullptr;
-      FunctionTemplateDecl *ConstructorTmpl = dyn_cast<FunctionTemplateDecl>(D);
-      if (ConstructorTmpl)
-        Constructor = cast<CXXConstructorDecl>(
-                                         ConstructorTmpl->getTemplatedDecl());
-      else
-        Constructor = cast<CXXConstructorDecl>(D);
-
-      if (!Constructor->isInvalidDecl() &&
-          Constructor->isConvertingConstructor(AllowExplicit)) {
-        if (ConstructorTmpl)
-          S.AddTemplateOverloadCandidate(ConstructorTmpl, FoundDecl,
+      if (!Info.Constructor->isInvalidDecl() &&
+          Info.Constructor->isConvertingConstructor(AllowExplicit)) {
+        if (Info.ConstructorTmpl)
+          S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
                                          /*ExplicitArgs*/ nullptr,
                                          Initializer, CandidateSet,
                                          /*SuppressUserConversions=*/true);
         else
-          S.AddOverloadCandidate(Constructor, FoundDecl,
+          S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl,
                                  Initializer, CandidateSet,
                                  /*SuppressUserConversions=*/true);
       }
@@ -4088,7 +4200,6 @@
   return Initializer->getValueKind();
 }
 
-
 /// \brief Reference initialization without resolving overloaded functions.
 static void TryReferenceInitializationCore(Sema &S,
                                            const InitializedEntity &Entity,
@@ -4323,7 +4434,6 @@
   }
 
   Sequence.AddReferenceBindingStep(cv1T1, /*bindingTemporary=*/true);
-  return;
 }
 
 /// \brief Attempt character array initialization from a string literal
@@ -4492,27 +4602,19 @@
              Con = CopyOfCon.begin(), ConEnd = CopyOfCon.end();
            Con != ConEnd; ++Con) {
         NamedDecl *D = *Con;
-        DeclAccessPair FoundDecl = DeclAccessPair::make(D, D->getAccess());
+        auto Info = getConstructorInfo(D);
+        if (!Info.Constructor)
+          continue;
 
-        // Find the constructor (which may be a template).
-        CXXConstructorDecl *Constructor = nullptr;
-        FunctionTemplateDecl *ConstructorTmpl
-          = dyn_cast<FunctionTemplateDecl>(D);
-        if (ConstructorTmpl)
-          Constructor = cast<CXXConstructorDecl>(
-                                           ConstructorTmpl->getTemplatedDecl());
-        else
-          Constructor = cast<CXXConstructorDecl>(D);
-
-        if (!Constructor->isInvalidDecl() &&
-            Constructor->isConvertingConstructor(AllowExplicit)) {
-          if (ConstructorTmpl)
-            S.AddTemplateOverloadCandidate(ConstructorTmpl, FoundDecl,
+        if (!Info.Constructor->isInvalidDecl() &&
+            Info.Constructor->isConvertingConstructor(AllowExplicit)) {
+          if (Info.ConstructorTmpl)
+            S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
                                            /*ExplicitArgs*/ nullptr,
                                            Initializer, CandidateSet,
                                            /*SuppressUserConversions=*/true);
           else
-            S.AddOverloadCandidate(Constructor, FoundDecl,
+            S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl,
                                    Initializer, CandidateSet,
                                    /*SuppressUserConversions=*/true);
         }
@@ -4709,8 +4811,8 @@
   // If isWeakAccess to true, there will be an implicit 
   // load which requires a cleanup.
   if (S.getLangOpts().ObjCAutoRefCount && isWeakAccess)
-    S.ExprNeedsCleanups = true;
-  
+    S.Cleanup.setExprNeedsCleanups(true);
+
   if (iik == IIK_okay) return;
 
   S.Diag(src->getExprLoc(), diag::err_arc_nonlocal_writeback)
@@ -4788,7 +4890,8 @@
                                         QualType DestType,
                                         Expr *Initializer) {
   if (!S.getLangOpts().OpenCL || !DestType->isSamplerT() ||
-    !Initializer->isIntegerConstantExpr(S.getASTContext()))
+      (!Initializer->isIntegerConstantExpr(S.Context) &&
+      !Initializer->getType()->isSamplerT()))
     return false;
 
   Sequence.AddOCLSamplerInitStep(DestType);
@@ -5172,6 +5275,7 @@
     return Sema::AA_Casting;
 
   case InitializedEntity::EK_Member:
+  case InitializedEntity::EK_Binding:
   case InitializedEntity::EK_ArrayElement:
   case InitializedEntity::EK_VectorElement:
   case InitializedEntity::EK_ComplexElement:
@@ -5207,6 +5311,7 @@
   case InitializedEntity::EK_Parameter_CF_Audited:
   case InitializedEntity::EK_Temporary:
   case InitializedEntity::EK_RelatedResult:
+  case InitializedEntity::EK_Binding:
     return true;
   }
 
@@ -5228,6 +5333,7 @@
       return false;
 
     case InitializedEntity::EK_Member:
+    case InitializedEntity::EK_Binding:
     case InitializedEntity::EK_Variable:
     case InitializedEntity::EK_Parameter:
     case InitializedEntity::EK_Parameter_CF_Audited:
@@ -5256,38 +5362,33 @@
   for (SmallVectorImpl<NamedDecl *>::iterator
          CI = Ctors.begin(), CE = Ctors.end(); CI != CE; ++CI) {
     NamedDecl *D = *CI;
-    CXXConstructorDecl *Constructor = nullptr;
+    auto Info = getConstructorInfo(D);
+    if (!Info.Constructor)
+      continue;
 
-    if ((Constructor = dyn_cast<CXXConstructorDecl>(D))) {
-      // Handle copy/moveconstructors, only.
-      if (!Constructor || Constructor->isInvalidDecl() ||
-          !Constructor->isCopyOrMoveConstructor() ||
-          !Constructor->isConvertingConstructor(/*AllowExplicit=*/true))
+    if (!Info.ConstructorTmpl) {
+      // Handle copy/move constructors, only.
+      if (Info.Constructor->isInvalidDecl() ||
+          !Info.Constructor->isCopyOrMoveConstructor() ||
+          !Info.Constructor->isConvertingConstructor(/*AllowExplicit=*/true))
         continue;
 
-      DeclAccessPair FoundDecl
-        = DeclAccessPair::make(Constructor, Constructor->getAccess());
-      S.AddOverloadCandidate(Constructor, FoundDecl,
+      S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl,
                              CurInitExpr, CandidateSet);
       continue;
     }
 
     // Handle constructor templates.
-    FunctionTemplateDecl *ConstructorTmpl = cast<FunctionTemplateDecl>(D);
-    if (ConstructorTmpl->isInvalidDecl())
+    if (Info.ConstructorTmpl->isInvalidDecl())
       continue;
 
-    Constructor = cast<CXXConstructorDecl>(
-                                         ConstructorTmpl->getTemplatedDecl());
-    if (!Constructor->isConvertingConstructor(/*AllowExplicit=*/true))
+    if (!Info.Constructor->isConvertingConstructor(/*AllowExplicit=*/true))
       continue;
 
     // FIXME: Do we need to limit this to copy-constructor-like
     // candidates?
-    DeclAccessPair FoundDecl
-      = DeclAccessPair::make(ConstructorTmpl, ConstructorTmpl->getAccess());
-    S.AddTemplateOverloadCandidate(ConstructorTmpl, FoundDecl, nullptr,
-                                   CurInitExpr, CandidateSet, true);
+    S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
+                                   nullptr, CurInitExpr, CandidateSet, true);
   }
 }
 
@@ -5302,6 +5403,7 @@
     return Entity.getThrowLoc();
 
   case InitializedEntity::EK_Variable:
+  case InitializedEntity::EK_Binding:
     return Entity.getDecl()->getLocation();
 
   case InitializedEntity::EK_LambdaCapture:
@@ -5426,8 +5528,8 @@
   SmallVector<Expr*, 8> ConstructorArgs;
   CurInit.get(); // Ownership transferred into MultiExprArg, below.
 
-  S.CheckConstructorAccess(Loc, Constructor, Entity,
-                           Best->FoundDecl.getAccess(), IsExtraneousCopy);
+  S.CheckConstructorAccess(Loc, Constructor, Best->FoundDecl, Entity,
+                           IsExtraneousCopy);
 
   if (IsExtraneousCopy) {
     // If this is a totally extraneous copy for C++03 reference
@@ -5462,7 +5564,8 @@
     return ExprError();
 
   // Actually perform the constructor call.
-  CurInit = S.BuildCXXConstructExpr(Loc, T, Constructor, Elidable,
+  CurInit = S.BuildCXXConstructExpr(Loc, T, Best->FoundDecl, Constructor,
+                                    Elidable,
                                     ConstructorArgs,
                                     HadMultipleCandidates,
                                     /*ListInit*/ false,
@@ -5509,7 +5612,7 @@
   switch (OR) {
   case OR_Success:
     S.CheckConstructorAccess(Loc, cast<CXXConstructorDecl>(Best->Function),
-                             Entity, Best->FoundDecl.getAccess(), Diag);
+                             Best->FoundDecl, Entity, Diag);
     // FIXME: Check default arguments as far as that's possible.
     break;
 
@@ -5635,7 +5738,6 @@
 
   if (isExplicitTemporary(Entity, Kind, NumArgs)) {
     // An explicitly-constructed temporary, e.g., X(1, 2).
-    S.MarkFunctionReferenced(Loc, Constructor);
     if (S.DiagnoseUseOfDecl(Constructor, Loc))
       return ExprError();
 
@@ -5647,10 +5749,19 @@
       ? SourceRange(LBraceLoc, RBraceLoc)
       : Kind.getParenRange();
 
+    if (auto *Shadow = dyn_cast<ConstructorUsingShadowDecl>(
+            Step.Function.FoundDecl.getDecl())) {
+      Constructor = S.findInheritingConstructor(Loc, Constructor, Shadow);
+      if (S.DiagnoseUseOfDecl(Constructor, Loc))
+        return ExprError();
+    }
+    S.MarkFunctionReferenced(Loc, Constructor);
+
     CurInit = new (S.Context) CXXTemporaryObjectExpr(
-        S.Context, Constructor, TSInfo, ConstructorArgs, ParenOrBraceRange,
-        HadMultipleCandidates, IsListInitialization,
-        IsStdInitListInitialization, ConstructorInitRequiresZeroInit);
+        S.Context, Constructor, TSInfo,
+        ConstructorArgs, ParenOrBraceRange, HadMultipleCandidates,
+        IsListInitialization, IsStdInitListInitialization,
+        ConstructorInitRequiresZeroInit);
   } else {
     CXXConstructExpr::ConstructionKind ConstructKind =
       CXXConstructExpr::CK_Complete;
@@ -5675,6 +5786,7 @@
     // unconditionally.
     if (Entity.allowsNRVO())
       CurInit = S.BuildCXXConstructExpr(Loc, Entity.getType(),
+                                        Step.Function.FoundDecl,
                                         Constructor, /*Elidable=*/true,
                                         ConstructorArgs,
                                         HadMultipleCandidates,
@@ -5685,6 +5797,7 @@
                                         ParenOrBraceRange);
     else
       CurInit = S.BuildCXXConstructExpr(Loc, Entity.getType(),
+                                        Step.Function.FoundDecl,
                                         Constructor,
                                         ConstructorArgs,
                                         HadMultipleCandidates,
@@ -5698,8 +5811,7 @@
     return ExprError();
 
   // Only check access if all of that succeeded.
-  S.CheckConstructorAccess(Loc, Constructor, Entity,
-                           Step.Function.FoundDecl.getAccess());
+  S.CheckConstructorAccess(Loc, Constructor, Step.Function.FoundDecl, Entity);
   if (S.DiagnoseUseOfDecl(Step.Function.FoundDecl, Loc))
     return ExprError();
 
@@ -5723,6 +5835,7 @@
   case InitializedEntity::EK_Result:
   case InitializedEntity::EK_Exception:
   case InitializedEntity::EK_Member:
+  case InitializedEntity::EK_Binding:
   case InitializedEntity::EK_New:
   case InitializedEntity::EK_Base:
   case InitializedEntity::EK_Delegating:
@@ -5772,6 +5885,11 @@
     //      ctor-initializer persists until the constructor exits.
     return Entity;
 
+  case InitializedEntity::EK_Binding:
+    // Per [dcl.decomp]p3, the binding is treated as a variable of reference
+    // type.
+    return Entity;
+
   case InitializedEntity::EK_Parameter:
   case InitializedEntity::EK_Parameter_CF_Audited:
     //   -- A temporary bound to a reference parameter in a function call
@@ -5801,6 +5919,11 @@
                                                   FallbackDecl);
 
   case InitializedEntity::EK_Base:
+    // For subobjects, we look at the complete object.
+    if (Entity->getParent())
+      return getEntityForTemporaryLifetimeExtension(Entity->getParent(),
+                                                    Entity);
+    // Fall through.
   case InitializedEntity::EK_Delegating:
     // We can reach this case for aggregate initialization in a constructor:
     //   struct A { int &&r; };
@@ -6066,6 +6189,36 @@
       << FixItHint::CreateRemoval(SourceRange(RParen, RParen));
 }
 
+static void CheckForNullPointerDereference(Sema &S, const Expr *E) {
+  // Check to see if we are dereferencing a null pointer.  If so, this is
+  // undefined behavior, so warn about it.  This only handles the pattern
+  // "*null", which is a very syntactic check.
+  if (const UnaryOperator *UO = dyn_cast<UnaryOperator>(E->IgnoreParenCasts()))
+    if (UO->getOpcode() == UO_Deref &&
+        UO->getSubExpr()->IgnoreParenCasts()->
+        isNullPointerConstant(S.Context, Expr::NPC_ValueDependentIsNotNull)) {
+    S.DiagRuntimeBehavior(UO->getOperatorLoc(), UO,
+                          S.PDiag(diag::warn_binding_null_to_reference)
+                            << UO->getSubExpr()->getSourceRange());
+  }
+}
+
+MaterializeTemporaryExpr *
+Sema::CreateMaterializeTemporaryExpr(QualType T, Expr *Temporary,
+                                     bool BoundToLvalueReference) {
+  auto MTE = new (Context)
+      MaterializeTemporaryExpr(T, Temporary, BoundToLvalueReference);
+
+  // Order an ExprWithCleanups for lifetime marks.
+  //
+  // TODO: It'll be good to have a single place to check the access of the
+  // destructor and generate ExprWithCleanups for various uses. Currently these
+  // are done in both CreateMaterializeTemporaryExpr and MaybeBindToTemporary,
+  // but there may be a chance to merge them.
+  Cleanup.setExprNeedsCleanups(false);
+  return MTE;
+}
+
 ExprResult
 InitializationSequence::Perform(Sema &S,
                                 const InitializedEntity &Entity,
@@ -6112,7 +6265,7 @@
           SourceRange Brackets;
 
           // Scavange the location of the brackets from the entity, if we can.
-          if (DeclaratorDecl *DD = Entity.getDecl()) {
+          if (auto *DD = dyn_cast_or_null<DeclaratorDecl>(Entity.getDecl())) {
             if (TypeSourceInfo *TInfo = DD->getTypeSourceInfo()) {
               TypeLoc TL = TInfo->getTypeLoc();
               if (IncompleteArrayTypeLoc ArrayLoc =
@@ -6318,6 +6471,7 @@
                                   /*IsInitializerList=*/false,
                                   ExtendingEntity->getDecl());
 
+      CheckForNullPointerDereference(S, CurInit.get());
       break;
 
     case SK_BindReferenceToTemporary: {
@@ -6329,7 +6483,7 @@
         return ExprError();
 
       // Materialize the temporary into memory.
-      MaterializeTemporaryExpr *MTE = new (S.Context) MaterializeTemporaryExpr(
+      MaterializeTemporaryExpr *MTE = S.CreateMaterializeTemporaryExpr(
           Entity.getType().getNonReferenceType(), CurInit.get(),
           Entity.getType()->isLValueReferenceType());
 
@@ -6349,7 +6503,7 @@
            MTE->getType()->isObjCLifetimeType()) ||
           (MTE->getStorageDuration() == SD_Automatic &&
            MTE->getType().isDestructedType()))
-        S.ExprNeedsCleanups = true;
+        S.Cleanup.setExprNeedsCleanups(true);
 
       CurInit = MTE;
       break;
@@ -6384,7 +6538,8 @@
           return ExprError();
 
         // Build an expression that constructs a temporary.
-        CurInit = S.BuildCXXConstructExpr(Loc, Step->Type, Constructor,
+        CurInit = S.BuildCXXConstructExpr(Loc, Step->Type,
+                                          FoundFn, Constructor,
                                           ConstructorArgs,
                                           HadMultipleCandidates,
                                           /*ListInit*/ false,
@@ -6395,8 +6550,8 @@
         if (CurInit.isInvalid())
           return ExprError();
 
-        S.CheckConstructorAccess(Kind.getLocation(), Constructor, Entity,
-                                 FoundFn.getAccess());
+        S.CheckConstructorAccess(Kind.getLocation(), Constructor, FoundFn,
+                                 Entity);
         if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation()))
           return ExprError();
 
@@ -6505,12 +6660,16 @@
                                     getAssignmentAction(Entity), CCK);
       if (CurInitExprRes.isInvalid())
         return ExprError();
+
+      S.DiscardMisalignedMemberAddress(Step->Type.getTypePtr(), CurInit.get());
+
       CurInit = CurInitExprRes;
 
       if (Step->Kind == SK_ConversionSequenceNoNarrowing &&
           S.getLangOpts().CPlusPlus && !CurInit.get()->isValueDependent())
         DiagnoseNarrowingInInitList(S, *Step->ICS, SourceType, Entity.getType(),
                                     CurInit.get());
+
       break;
     }
 
@@ -6740,9 +6899,9 @@
         << CurInit.get()->getSourceRange();
 
       // Materialize the temporary into memory.
-      MaterializeTemporaryExpr *MTE = new (S.Context)
-          MaterializeTemporaryExpr(CurInit.get()->getType(), CurInit.get(),
-                                   /*BoundToLvalueReference=*/false);
+      MaterializeTemporaryExpr *MTE = S.CreateMaterializeTemporaryExpr(
+          CurInit.get()->getType(), CurInit.get(),
+          /*BoundToLvalueReference=*/false);
 
       // Maybe lifetime-extend the array temporary's subobjects to match the
       // entity's lifetime.
@@ -6764,19 +6923,93 @@
     }
 
     case SK_OCLSamplerInit: {
-      assert(Step->Type->isSamplerT() && 
+      // Sampler initialization has 5 cases:
+      //   1. function argument passing
+      //      1a. argument is a file-scope variable
+      //      1b. argument is a function-scope variable
+      //      1c. argument is one of caller function's parameters
+      //   2. variable initialization
+      //      2a. initializing a file-scope variable
+      //      2b. initializing a function-scope variable
+      //
+      // For file-scope variables, since they cannot be initialized by function
+      // call of __translate_sampler_initializer in LLVM IR, their references
+      // need to be replaced by a cast from their literal initializers to
+      // sampler type. Since sampler variables can only be used in function
+      // calls as arguments, we only need to replace them when handling the
+      // argument passing.
+      assert(Step->Type->isSamplerT() &&
              "Sampler initialization on non-sampler type.");
-
-      QualType SourceType = CurInit.get()->getType();
-
+      Expr *Init = CurInit.get();
+      QualType SourceType = Init->getType();
+      // Case 1
       if (Entity.isParameterKind()) {
-        if (!SourceType->isSamplerT())
+        if (!SourceType->isSamplerT()) {
           S.Diag(Kind.getLocation(), diag::err_sampler_argument_required)
             << SourceType;
-      } else if (Entity.getKind() != InitializedEntity::EK_Variable) {
-        llvm_unreachable("Invalid EntityKind!");
+          break;
+        } else if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(Init)) {
+          auto Var = cast<VarDecl>(DRE->getDecl());
+          // Case 1b and 1c
+          // No cast from integer to sampler is needed.
+          if (!Var->hasGlobalStorage()) {
+            CurInit = ImplicitCastExpr::Create(S.Context, Step->Type,
+                                               CK_LValueToRValue, Init,
+                                               /*BasePath=*/nullptr, VK_RValue);
+            break;
+          }
+          // Case 1a
+          // For function call with a file-scope sampler variable as argument,
+          // get the integer literal.
+          // Do not diagnose if the file-scope variable does not have initializer
+          // since this has already been diagnosed when parsing the variable
+          // declaration.
+          if (!Var->getInit() || !isa<ImplicitCastExpr>(Var->getInit()))
+            break;
+          Init = cast<ImplicitCastExpr>(const_cast<Expr*>(
+            Var->getInit()))->getSubExpr();
+          SourceType = Init->getType();
+        }
+      } else {
+        // Case 2
+        // Check initializer is 32 bit integer constant.
+        // If the initializer is taken from global variable, do not diagnose since
+        // this has already been done when parsing the variable declaration.
+        if (!Init->isConstantInitializer(S.Context, false))
+          break;
+        
+        if (!SourceType->isIntegerType() ||
+            32 != S.Context.getIntWidth(SourceType)) {
+          S.Diag(Kind.getLocation(), diag::err_sampler_initializer_not_integer)
+            << SourceType;
+          break;
+        }
+
+        llvm::APSInt Result;
+        Init->EvaluateAsInt(Result, S.Context);
+        const uint64_t SamplerValue = Result.getLimitedValue();
+        // 32-bit value of sampler's initializer is interpreted as
+        // bit-field with the following structure:
+        // |unspecified|Filter|Addressing Mode| Normalized Coords|
+        // |31        6|5    4|3             1|                 0|
+        // This structure corresponds to enum values of sampler properties
+        // defined in SPIR spec v1.2 and also opencl-c.h
+        unsigned AddressingMode  = (0x0E & SamplerValue) >> 1;
+        unsigned FilterMode      = (0x30 & SamplerValue) >> 4;
+        if (FilterMode != 1 && FilterMode != 2)
+          S.Diag(Kind.getLocation(),
+                 diag::warn_sampler_initializer_invalid_bits)
+                 << "Filter Mode";
+        if (AddressingMode > 4)
+          S.Diag(Kind.getLocation(),
+                 diag::warn_sampler_initializer_invalid_bits)
+                 << "Addressing Mode";
       }
 
+      // Cases 1a, 2a and 2b
+      // Insert cast from integer to sampler.
+      CurInit = S.ImpCastExprToType(Init, S.Context.OCLSamplerTy,
+                                      CK_IntToOCLSampler);
       break;
     }
     case SK_OCLZeroEvent: {
@@ -7158,17 +7391,20 @@
             isa<CXXConstructorDecl>(S.CurContext)) {
           // This is implicit default initialization of a member or
           // base within a constructor. If no viable function was
-          // found, notify the user that she needs to explicitly
+          // found, notify the user that they need to explicitly
           // initialize this base/member.
           CXXConstructorDecl *Constructor
             = cast<CXXConstructorDecl>(S.CurContext);
+          const CXXRecordDecl *InheritedFrom = nullptr;
+          if (auto Inherited = Constructor->getInheritedConstructor())
+            InheritedFrom = Inherited.getShadowDecl()->getNominatedBaseClass();
           if (Entity.getKind() == InitializedEntity::EK_Base) {
             S.Diag(Kind.getLocation(), diag::err_missing_default_ctor)
-              << (Constructor->getInheritedConstructor() ? 2 :
-                  Constructor->isImplicit() ? 1 : 0)
+              << (InheritedFrom ? 2 : Constructor->isImplicit() ? 1 : 0)
               << S.Context.getTypeDeclType(Constructor->getParent())
               << /*base=*/0
-              << Entity.getType();
+              << Entity.getType()
+              << InheritedFrom;
 
             RecordDecl *BaseDecl
               = Entity.getBaseSpecifier()->getType()->getAs<RecordType>()
@@ -7177,11 +7413,11 @@
               << S.Context.getTagDeclType(BaseDecl);
           } else {
             S.Diag(Kind.getLocation(), diag::err_missing_default_ctor)
-              << (Constructor->getInheritedConstructor() ? 2 :
-                  Constructor->isImplicit() ? 1 : 0)
+              << (InheritedFrom ? 2 : Constructor->isImplicit() ? 1 : 0)
               << S.Context.getTypeDeclType(Constructor->getParent())
               << /*member=*/1
-              << Entity.getName();
+              << Entity.getName()
+              << InheritedFrom;
             S.Diag(Entity.getDecl()->getLocation(),
                    diag::note_member_declared_at);
 
diff --git a/lib/Sema/SemaLambda.cpp b/lib/Sema/SemaLambda.cpp
index c70b506..a946222 100644
--- a/lib/Sema/SemaLambda.cpp
+++ b/lib/Sema/SemaLambda.cpp
@@ -235,7 +235,7 @@
         /*Template kw loc*/ SourceLocation(), LAngleLoc,
         llvm::makeArrayRef((NamedDecl *const *)LSI->AutoTemplateParams.data(),
                            LSI->AutoTemplateParams.size()),
-        RAngleLoc);
+        RAngleLoc, nullptr);
   }
   return LSI->GLTemplateParameterList;
 }
@@ -355,7 +355,8 @@
                                            SourceRange IntroducerRange,
                                            TypeSourceInfo *MethodTypeInfo,
                                            SourceLocation EndLoc,
-                                           ArrayRef<ParmVarDecl *> Params) {
+                                           ArrayRef<ParmVarDecl *> Params,
+                                           const bool IsConstexprSpecified) {
   QualType MethodType = MethodTypeInfo->getType();
   TemplateParameterList *TemplateParams = 
             getGenericLambdaTemplateParameterList(getCurLambda(), *this);
@@ -392,7 +393,7 @@
                             MethodType, MethodTypeInfo,
                             SC_None,
                             /*isInline=*/true,
-                            /*isConstExpr=*/false,
+                            IsConstexprSpecified,
                             EndLoc);
   Method->setAccess(AS_public);
   
@@ -414,11 +415,10 @@
   // Add parameters.
   if (!Params.empty()) {
     Method->setParams(Params);
-    CheckParmsForFunctionDef(const_cast<ParmVarDecl **>(Params.begin()),
-                             const_cast<ParmVarDecl **>(Params.end()),
+    CheckParmsForFunctionDef(Params,
                              /*CheckParameterNames=*/false);
-    
-    for (auto P : Method->params())
+
+    for (auto P : Method->parameters())
       P->setOwningFunction(Method);
   }
 
@@ -878,8 +878,9 @@
   CXXRecordDecl *Class = createLambdaClosureType(Intro.Range, MethodTyInfo,
                                                  KnownDependent, Intro.Default);
 
-  CXXMethodDecl *Method = startLambdaDefinition(Class, Intro.Range,
-                                                MethodTyInfo, EndLoc, Params);
+  CXXMethodDecl *Method =
+      startLambdaDefinition(Class, Intro.Range, MethodTyInfo, EndLoc, Params,
+                            ParamInfo.getDeclSpec().isConstexprSpecified());
   if (ExplicitParams)
     CheckCXXDefaultArguments(Method);
   
@@ -918,7 +919,12 @@
     = Intro.Default == LCD_None? Intro.Range.getBegin() : Intro.DefaultLoc;
   for (auto C = Intro.Captures.begin(), E = Intro.Captures.end(); C != E;
        PrevCaptureLoc = C->Loc, ++C) {
-    if (C->Kind == LCK_This) {
+    if (C->Kind == LCK_This || C->Kind == LCK_StarThis) {
+      if (C->Kind == LCK_StarThis) 
+        Diag(C->Loc, !getLangOpts().CPlusPlus1z
+                             ? diag::ext_star_this_lambda_capture_cxx1z
+                             : diag::warn_cxx14_compat_star_this_lambda_capture);
+
       // C++11 [expr.prim.lambda]p8:
       //   An identifier or this shall not appear more than once in a 
       //   lambda-capture.
@@ -930,10 +936,12 @@
         continue;
       }
 
-      // C++11 [expr.prim.lambda]p8:
-      //   If a lambda-capture includes a capture-default that is =, the 
-      //   lambda-capture shall not contain this [...].
-      if (Intro.Default == LCD_ByCopy) {
+      // C++1z [expr.prim.lambda]p8:
+      //  If a lambda-capture includes a capture-default that is =, each
+      //  simple-capture of that lambda-capture shall be of the form "&
+      //  identifier" or "* this". [ Note: The form [&,this] is redundant but
+      //  accepted for compatibility with ISO C++14. --end note ]
+      if (Intro.Default == LCD_ByCopy && C->Kind != LCK_StarThis) {
         Diag(C->Loc, diag::err_this_capture_with_copy_default)
             << FixItHint::CreateRemoval(
                 SourceRange(getLocForEndOfToken(PrevCaptureLoc), C->Loc));
@@ -949,7 +957,9 @@
         continue;
       }
       
-      CheckCXXThisCapture(C->Loc, /*Explicit=*/true);
+      CheckCXXThisCapture(C->Loc, /*Explicit=*/true, /*BuildAndDiagnose*/ true,
+                          /*FunctionScopeIndexToStopAtPtr*/ nullptr,
+                          C->Kind == LCK_StarThis);
       continue;
     }
 
@@ -1134,14 +1144,16 @@
 
 /// \brief Add a lambda's conversion to function pointer, as described in
 /// C++11 [expr.prim.lambda]p6.
-static void addFunctionPointerConversion(Sema &S, 
+static void addFunctionPointerConversion(Sema &S,
                                          SourceRange IntroducerRange,
                                          CXXRecordDecl *Class,
                                          CXXMethodDecl *CallOperator) {
   // This conversion is explicitly disabled if the lambda's function has
   // pass_object_size attributes on any of its parameters.
-  if (std::any_of(CallOperator->param_begin(), CallOperator->param_end(),
-                  std::mem_fn(&ParmVarDecl::hasAttr<PassObjectSizeAttr>)))
+  auto HasPassObjectSizeAttr = [](const ParmVarDecl *P) {
+    return P->hasAttr<PassObjectSizeAttr>();
+  };
+  if (llvm::any_of(CallOperator->parameters(), HasPassObjectSizeAttr))
     return;
 
   // Add the conversion to function pointer.
@@ -1489,7 +1501,7 @@
   SourceRange IntroducerRange;
   bool ExplicitParams;
   bool ExplicitResultType;
-  bool LambdaExprNeedsCleanups;
+  CleanupInfo LambdaCleanup;
   bool ContainsUnexpandedParameterPack;
   SmallVector<VarDecl *, 4> ArrayIndexVars;
   SmallVector<unsigned, 4> ArrayIndexStarts;
@@ -1499,7 +1511,7 @@
     IntroducerRange = LSI->IntroducerRange;
     ExplicitParams = LSI->ExplicitParams;
     ExplicitResultType = !LSI->HasImplicitReturnType;
-    LambdaExprNeedsCleanups = LSI->ExprNeedsCleanups;
+    LambdaCleanup = LSI->Cleanup;
     ContainsUnexpandedParameterPack = LSI->ContainsUnexpandedParameterPack;
     
     CallOperator->setLexicalDeclContext(Class);
@@ -1523,10 +1535,9 @@
       // Handle 'this' capture.
       if (From.isThisCapture()) {
         Captures.push_back(
-            LambdaCapture(From.getLocation(), IsImplicit, LCK_This));
-        CaptureInits.push_back(new (Context) CXXThisExpr(From.getLocation(),
-                                                         getCurrentThisType(),
-                                                         /*isImplicit=*/true));
+            LambdaCapture(From.getLocation(), IsImplicit,
+                          From.isCopyCapture() ? LCK_StarThis : LCK_This));
+        CaptureInits.push_back(From.getInitExpr());
         ArrayIndexStarts.push_back(ArrayIndexVars.size());
         continue;
       }
@@ -1581,9 +1592,8 @@
     CheckCompletedCXXClass(Class);
   }
 
-  if (LambdaExprNeedsCleanups)
-    ExprNeedsCleanups = true;
-  
+  Cleanup.mergeFrom(LambdaCleanup);
+
   LambdaExpr *Lambda = LambdaExpr::Create(Context, Class, IntroducerRange, 
                                           CaptureDefault, CaptureDefaultLoc,
                                           Captures, 
@@ -1591,6 +1601,17 @@
                                           CaptureInits, ArrayIndexVars, 
                                           ArrayIndexStarts, EndLoc,
                                           ContainsUnexpandedParameterPack);
+  // If the lambda expression's call operator is not explicitly marked constexpr
+  // and we are not in a dependent context, analyze the call operator to infer
+  // its constexpr-ness, suppressing diagnostics while doing so.
+  if (getLangOpts().CPlusPlus1z && !CallOperator->isInvalidDecl() &&
+      !CallOperator->isConstexpr() &&
+      !Class->getDeclContext()->isDependentContext()) {
+    TentativeAnalysisScope DiagnosticScopeGuard(*this);
+    CallOperator->setConstexpr(
+        CheckConstexprFunctionDecl(CallOperator) &&
+        CheckConstexprFunctionBody(CallOperator, CallOperator->getBody()));
+  }
 
   if (!CurContext->isDependentContext()) {
     switch (ExprEvalContexts.back().Context) {
@@ -1615,6 +1636,7 @@
       ExprEvalContexts.back().Lambdas.push_back(Lambda);
       break;
 
+    case DiscardedStatement:
     case PotentiallyEvaluated:
     case PotentiallyEvaluatedIfUsed:
       break;
@@ -1693,7 +1715,7 @@
   // Create the block literal expression.
   Expr *BuildBlock = new (Context) BlockExpr(Block, Conv->getConversionType());
   ExprCleanupObjects.push_back(Block);
-  ExprNeedsCleanups = true;
+  Cleanup.setExprNeedsCleanups(true);
 
   return BuildBlock;
 }
diff --git a/lib/Sema/SemaLookup.cpp b/lib/Sema/SemaLookup.cpp
index 686db72..19df1e3 100644
--- a/lib/Sema/SemaLookup.cpp
+++ b/lib/Sema/SemaLookup.cpp
@@ -12,9 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/Lookup.h"
 #include "clang/AST/ASTContext.h"
-#include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/CXXInheritance.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
@@ -29,7 +27,7 @@
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/DeclSpec.h"
-#include "clang/Sema/ExternalSemaSource.h"
+#include "clang/Sema/Lookup.h"
 #include "clang/Sema/Overload.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
@@ -38,17 +36,13 @@
 #include "clang/Sema/TemplateDeduction.h"
 #include "clang/Sema/TypoCorrection.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/ADT/edit_distance.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <algorithm>
 #include <iterator>
-#include <limits>
 #include <list>
-#include <map>
 #include <set>
 #include <utility>
 #include <vector>
@@ -280,6 +274,10 @@
     IDNS = Decl::IDNS_ObjCProtocol;
     break;
 
+  case Sema::LookupOMPReductionName:
+    IDNS = Decl::IDNS_OMPReduction;
+    break;
+
   case Sema::LookupAnyName:
     IDNS = Decl::IDNS_Ordinary | Decl::IDNS_Tag | Decl::IDNS_Member
       | Decl::IDNS_Using | Decl::IDNS_Namespace | Decl::IDNS_ObjCProtocol
@@ -419,6 +417,18 @@
     }
   }
 
+  // VarDecl can have incomplete array types, prefer the one with more complete
+  // array type.
+  if (VarDecl *DVD = dyn_cast<VarDecl>(DUnderlying)) {
+    VarDecl *EVD = cast<VarDecl>(EUnderlying);
+    if (EVD->getType()->isIncompleteType() &&
+        !DVD->getType()->isIncompleteType()) {
+      // Prefer the decl with a more complete type if visible.
+      return S.isVisible(DVD);
+    }
+    return false; // Avoid picking up a newer decl, just because it was newer.
+  }
+
   // For most kinds of declaration, it doesn't really matter which one we pick.
   if (!isa<FunctionDecl>(DUnderlying) && !isa<VarDecl>(DUnderlying)) {
     // If the existing declaration is hidden, prefer the new one. Otherwise,
@@ -432,10 +442,6 @@
     if (Prev == EUnderlying)
       return true;
   return false;
-
-  // If the existing declaration is hidden, prefer the new one. Otherwise,
-  // keep what we've got.
-  return !S.isVisible(Existing);
 }
 
 /// Determine whether \p D can hide a tag declaration.
@@ -669,24 +675,21 @@
       NameKind == Sema::LookupRedeclarationWithLinkage) {
     IdentifierInfo *II = R.getLookupName().getAsIdentifierInfo();
     if (II) {
-      if (S.getLangOpts().CPlusPlus11 && S.getLangOpts().GNUMode &&
-          II == S.getFloat128Identifier()) {
-        // libstdc++4.7's type_traits expects type __float128 to exist, so
-        // insert a dummy type to make that header build in gnu++11 mode.
-        R.addDecl(S.getASTContext().getFloat128StubType());
-        return true;
-      }
-      if (S.getLangOpts().CPlusPlus && NameKind == Sema::LookupOrdinaryName &&
-          II == S.getASTContext().getMakeIntegerSeqName()) {
-        R.addDecl(S.getASTContext().getMakeIntegerSeqDecl());
-        return true;
+      if (S.getLangOpts().CPlusPlus && NameKind == Sema::LookupOrdinaryName) {
+        if (II == S.getASTContext().getMakeIntegerSeqName()) {
+          R.addDecl(S.getASTContext().getMakeIntegerSeqDecl());
+          return true;
+        } else if (II == S.getASTContext().getTypePackElementName()) {
+          R.addDecl(S.getASTContext().getTypePackElementDecl());
+          return true;
+        }
       }
 
       // If this is a builtin on this (or all) targets, create the decl.
       if (unsigned BuiltinID = II->getBuiltinID()) {
-        // In C++, we don't have any predefined library functions like
-        // 'malloc'. Instead, we'll just error.
-        if (S.getLangOpts().CPlusPlus &&
+        // In C++ and OpenCL (spec v1.2 s6.9.f), we don't have any predefined
+        // library functions like 'malloc'. Instead, we'll just error.
+        if ((S.getLangOpts().CPlusPlus || S.getLangOpts().OpenCL) &&
             S.Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
           return false;
 
@@ -734,11 +737,11 @@
   if (getLangOpts().CPlusPlus11) {
     // If the move constructor has not yet been declared, do so now.
     if (Class->needsImplicitMoveConstructor())
-      DeclareImplicitMoveConstructor(Class); // might not actually do it
+      DeclareImplicitMoveConstructor(Class);
 
     // If the move assignment operator has not yet been declared, do so now.
     if (Class->needsImplicitMoveAssignment())
-      DeclareImplicitMoveAssignment(Class); // might not actually do it
+      DeclareImplicitMoveAssignment(Class);
   }
 
   // If the destructor has not yet been declared, do so now.
@@ -1074,32 +1077,35 @@
 
   for (; S && !isNamespaceOrTranslationUnitScope(S); S = S->getParent()) {
     DeclContext *Ctx = S->getEntity();
-
+    bool SearchNamespaceScope = true;
     // Check whether the IdResolver has anything in this scope.
-    bool Found = false;
     for (; I != IEnd && S->isDeclScope(*I); ++I) {
       if (NamedDecl *ND = R.getAcceptableDecl(*I)) {
-        if (NameKind == LookupRedeclarationWithLinkage) {
+        if (NameKind == LookupRedeclarationWithLinkage &&
+            !(*I)->isTemplateParameter()) {
+          // If it's a template parameter, we still find it, so we can diagnose
+          // the invalid redeclaration.
+
           // Determine whether this (or a previous) declaration is
           // out-of-scope.
           if (!LeftStartingScope && !Initial->isDeclScope(*I))
             LeftStartingScope = true;
 
           // If we found something outside of our starting scope that
-          // does not have linkage, skip it. If it's a template parameter,
-          // we still find it, so we can diagnose the invalid redeclaration.
-          if (LeftStartingScope && !((*I)->hasLinkage()) &&
-              !(*I)->isTemplateParameter()) {
+          // does not have linkage, skip it.
+          if (LeftStartingScope && !((*I)->hasLinkage())) {
             R.setShadowed();
             continue;
           }
+        } else {
+          // We found something in this scope, we should not look at the
+          // namespace scope
+          SearchNamespaceScope = false;
         }
-
-        Found = true;
         R.addDecl(ND);
       }
     }
-    if (Found) {
+    if (!SearchNamespaceScope) {
       R.resolveKind();
       if (S->isClassScope())
         if (CXXRecordDecl *Record = dyn_cast_or_null<CXXRecordDecl>(Ctx))
@@ -1361,8 +1367,9 @@
       auto &SrcMgr = PP.getSourceManager();
       SourceLocation StartLoc =
           SrcMgr.getLocForStartOfFile(SrcMgr.getMainFileID());
-      auto &TopLevel =
-          VisibleModulesStack.empty() ? VisibleModules : VisibleModulesStack[0];
+      auto &TopLevel = ModuleScopes.empty()
+                           ? VisibleModules
+                           : ModuleScopes[0].OuterVisibleModules;
       TopLevel.setVisible(CachedFakeTopLevelModule, StartLoc);
     }
 
@@ -1470,6 +1477,35 @@
                                      Modules);
 }
 
+bool Sema::hasVisibleMemberSpecialization(
+    const NamedDecl *D, llvm::SmallVectorImpl<Module *> *Modules) {
+  assert(isa<CXXRecordDecl>(D->getDeclContext()) &&
+         "not a member specialization");
+  for (auto *Redecl : D->redecls()) {
+    // If the specialization is declared at namespace scope, then it's a member
+    // specialization declaration. If it's lexically inside the class
+    // definition then it was instantiated.
+    //
+    // FIXME: This is a hack. There should be a better way to determine this.
+    // FIXME: What about MS-style explicit specializations declared within a
+    //        class definition?
+    if (Redecl->getLexicalDeclContext()->isFileContext()) {
+      auto *NonConstR = const_cast<NamedDecl*>(cast<NamedDecl>(Redecl));
+
+      if (isVisible(NonConstR))
+        return true;
+
+      if (Modules) {
+        Modules->push_back(getOwningModule(NonConstR));
+        const auto &Merged = Context.getModulesWithMergedDefinition(NonConstR);
+        Modules->insert(Modules->end(), Merged.begin(), Merged.end());
+      }
+    }
+  }
+
+  return false;
+}
+
 /// \brief Determine whether a declaration is visible to name lookup.
 ///
 /// This routine determines whether the declaration D is visible in the current
@@ -1570,19 +1606,58 @@
   assert(!LookupResult::isVisible(SemaRef, D) && "not in slow case");
 
   for (auto RD : D->redecls()) {
-    if (auto ND = dyn_cast<NamedDecl>(RD)) {
-      // FIXME: This is wrong in the case where the previous declaration is not
-      // visible in the same scope as D. This needs to be done much more
-      // carefully.
-      if (LookupResult::isVisible(SemaRef, ND))
-        return ND;
-    }
+    // Don't bother with extra checks if we already know this one isn't visible.
+    if (RD == D)
+      continue;
+
+    auto ND = cast<NamedDecl>(RD);
+    // FIXME: This is wrong in the case where the previous declaration is not
+    // visible in the same scope as D. This needs to be done much more
+    // carefully.
+    if (LookupResult::isVisible(SemaRef, ND))
+      return ND;
   }
 
   return nullptr;
 }
 
+bool Sema::hasVisibleDeclarationSlow(const NamedDecl *D,
+                                     llvm::SmallVectorImpl<Module *> *Modules) {
+  assert(!isVisible(D) && "not in slow case");
+
+  for (auto *Redecl : D->redecls()) {
+    auto *NonConstR = const_cast<NamedDecl*>(cast<NamedDecl>(Redecl));
+    if (isVisible(NonConstR))
+      return true;
+
+    if (Modules) {
+      Modules->push_back(getOwningModule(NonConstR));
+      const auto &Merged = Context.getModulesWithMergedDefinition(NonConstR);
+      Modules->insert(Modules->end(), Merged.begin(), Merged.end());
+    }
+  }
+
+  return false;
+}
+
 NamedDecl *LookupResult::getAcceptableDeclSlow(NamedDecl *D) const {
+  if (auto *ND = dyn_cast<NamespaceDecl>(D)) {
+    // Namespaces are a bit of a special case: we expect there to be a lot of
+    // redeclarations of some namespaces, all declarations of a namespace are
+    // essentially interchangeable, all declarations are found by name lookup
+    // if any is, and namespaces are never looked up during template
+    // instantiation. So we benefit from caching the check in this case, and
+    // it is correct to do so.
+    auto *Key = ND->getCanonicalDecl();
+    if (auto *Acceptable = getSema().VisibleNamespaceCache.lookup(Key))
+      return Acceptable;
+    auto *Acceptable =
+        isVisible(getSema(), Key) ? Key : findAcceptableDecl(getSema(), Key);
+    if (Acceptable)
+      getSema().VisibleNamespaceCache.insert(std::make_pair(Key, Acceptable));
+    return Acceptable;
+  }
+
   return findAcceptableDecl(getSema(), D);
 }
 
@@ -1986,6 +2061,10 @@
       BaseCallback = &LookupAnyMember;
       break;
 
+    case LookupOMPReductionName:
+      BaseCallback = &CXXRecordDecl::FindOMPReductionMember;
+      break;
+
     case LookupUsingDeclName:
       // This lookup is for redeclarations only.
 
@@ -2864,42 +2943,38 @@
   // from an external source and invalidate lookup_result.
   SmallVector<NamedDecl *, 8> Candidates(R.begin(), R.end());
 
-  for (auto *Cand : Candidates) {
-    if (Cand->isInvalidDecl())
+  for (NamedDecl *CandDecl : Candidates) {
+    if (CandDecl->isInvalidDecl())
       continue;
 
-    if (UsingShadowDecl *U = dyn_cast<UsingShadowDecl>(Cand)) {
-      // FIXME: [namespace.udecl]p15 says that we should only consider a
-      // using declaration here if it does not match a declaration in the
-      // derived class. We do not implement this correctly in other cases
-      // either.
-      Cand = U->getTargetDecl();
-
-      if (Cand->isInvalidDecl())
-        continue;
-    }
-
-    if (CXXMethodDecl *M = dyn_cast<CXXMethodDecl>(Cand)) {
+    DeclAccessPair Cand = DeclAccessPair::make(CandDecl, AS_public);
+    auto CtorInfo = getConstructorInfo(Cand);
+    if (CXXMethodDecl *M = dyn_cast<CXXMethodDecl>(Cand->getUnderlyingDecl())) {
       if (SM == CXXCopyAssignment || SM == CXXMoveAssignment)
-        AddMethodCandidate(M, DeclAccessPair::make(M, AS_public), RD, ThisTy,
-                           Classification, llvm::makeArrayRef(&Arg, NumArgs),
-                           OCS, true);
-      else
-        AddOverloadCandidate(M, DeclAccessPair::make(M, AS_public),
+        AddMethodCandidate(M, Cand, RD, ThisTy, Classification,
+                           llvm::makeArrayRef(&Arg, NumArgs), OCS, true);
+      else if (CtorInfo)
+        AddOverloadCandidate(CtorInfo.Constructor, CtorInfo.FoundDecl,
                              llvm::makeArrayRef(&Arg, NumArgs), OCS, true);
-    } else if (FunctionTemplateDecl *Tmpl =
-                 dyn_cast<FunctionTemplateDecl>(Cand)) {
-      if (SM == CXXCopyAssignment || SM == CXXMoveAssignment)
-        AddMethodTemplateCandidate(Tmpl, DeclAccessPair::make(Tmpl, AS_public),
-                                   RD, nullptr, ThisTy, Classification,
-                                   llvm::makeArrayRef(&Arg, NumArgs),
-                                   OCS, true);
       else
-        AddTemplateOverloadCandidate(Tmpl, DeclAccessPair::make(Tmpl, AS_public),
-                                     nullptr, llvm::makeArrayRef(&Arg, NumArgs),
-                                     OCS, true);
+        AddOverloadCandidate(M, Cand, llvm::makeArrayRef(&Arg, NumArgs), OCS,
+                             true);
+    } else if (FunctionTemplateDecl *Tmpl =
+                 dyn_cast<FunctionTemplateDecl>(Cand->getUnderlyingDecl())) {
+      if (SM == CXXCopyAssignment || SM == CXXMoveAssignment)
+        AddMethodTemplateCandidate(
+            Tmpl, Cand, RD, nullptr, ThisTy, Classification,
+            llvm::makeArrayRef(&Arg, NumArgs), OCS, true);
+      else if (CtorInfo)
+        AddTemplateOverloadCandidate(
+            CtorInfo.ConstructorTmpl, CtorInfo.FoundDecl, nullptr,
+            llvm::makeArrayRef(&Arg, NumArgs), OCS, true);
+      else
+        AddTemplateOverloadCandidate(
+            Tmpl, Cand, nullptr, llvm::makeArrayRef(&Arg, NumArgs), OCS, true);
     } else {
-      assert(isa<UsingDecl>(Cand) && "illegal Kind of operator = Decl");
+      assert(isa<UsingDecl>(Cand.getDecl()) &&
+             "illegal Kind of operator = Decl");
     }
   }
 
@@ -3119,7 +3194,7 @@
   if (FoundRaw && FoundTemplate) {
     Diag(R.getNameLoc(), diag::err_ovl_ambiguous_call) << R.getLookupName();
     for (LookupResult::iterator I = R.begin(), E = R.end(); I != E; ++I)
-      NoteOverloadCandidate((*I)->getUnderlyingDecl()->getAsFunction());
+      NoteOverloadCandidate(*I, (*I)->getUnderlyingDecl()->getAsFunction());
     return LOLR_Error;
   }
 
@@ -3984,8 +4059,8 @@
 
 void TypoCorrectionConsumer::performQualifiedLookups() {
   unsigned TypoLen = Typo->getName().size();
-  for (auto QR : QualifiedResults) {
-    for (auto NSI : Namespaces) {
+  for (const TypoCorrection &QR : QualifiedResults) {
+    for (const auto &NSI : Namespaces) {
       DeclContext *Ctx = NSI.DeclCtx;
       const Type *NSType = NSI.NameSpecifier->getAsType();
 
@@ -4073,10 +4148,8 @@
   // Build the list of identifiers that would be used for an absolute
   // (from the global context) NestedNameSpecifier referring to the current
   // context.
-  for (DeclContextList::reverse_iterator C = CurContextChain.rbegin(),
-                                         CEnd = CurContextChain.rend();
-       C != CEnd; ++C) {
-    if (NamespaceDecl *ND = dyn_cast_or_null<NamespaceDecl>(*C))
+  for (DeclContext *C : llvm::reverse(CurContextChain)) {
+    if (auto *ND = dyn_cast_or_null<NamespaceDecl>(C))
       CurContextIdentifiers.push_back(ND->getIdentifier());
   }
 
@@ -4104,13 +4177,11 @@
 TypoCorrectionConsumer::NamespaceSpecifierSet::buildNestedNameSpecifier(
     DeclContextList &DeclChain, NestedNameSpecifier *&NNS) {
   unsigned NumSpecifiers = 0;
-  for (DeclContextList::reverse_iterator C = DeclChain.rbegin(),
-                                      CEnd = DeclChain.rend();
-       C != CEnd; ++C) {
-    if (NamespaceDecl *ND = dyn_cast_or_null<NamespaceDecl>(*C)) {
+  for (DeclContext *C : llvm::reverse(DeclChain)) {
+    if (auto *ND = dyn_cast_or_null<NamespaceDecl>(C)) {
       NNS = NestedNameSpecifier::Create(Context, NNS, ND);
       ++NumSpecifiers;
-    } else if (RecordDecl *RD = dyn_cast_or_null<RecordDecl>(*C)) {
+    } else if (auto *RD = dyn_cast_or_null<RecordDecl>(C)) {
       NNS = NestedNameSpecifier::Create(Context, NNS, RD->isTemplateDecl(),
                                         RD->getTypeForDecl());
       ++NumSpecifiers;
@@ -4127,10 +4198,9 @@
   DeclContextList FullNamespaceDeclChain(NamespaceDeclChain);
 
   // Eliminate common elements from the two DeclContext chains.
-  for (DeclContextList::reverse_iterator C = CurContextChain.rbegin(),
-                                      CEnd = CurContextChain.rend();
-       C != CEnd && !NamespaceDeclChain.empty() &&
-       NamespaceDeclChain.back() == *C; ++C) {
+  for (DeclContext *C : llvm::reverse(CurContextChain)) {
+    if (NamespaceDeclChain.empty() || NamespaceDeclChain.back() != C)
+      break;
     NamespaceDeclChain.pop_back();
   }
 
@@ -4705,11 +4775,20 @@
     const ObjCObjectPointerType *OPT) {
   assert(CCC && "CorrectTypoDelayed requires a CorrectionCandidateCallback");
 
-  TypoCorrection Empty;
   auto Consumer = makeTypoCorrectionConsumer(
       TypoName, LookupKind, S, SS, std::move(CCC), MemberContext,
       EnteringContext, OPT, Mode == CTK_ErrorRecovery);
 
+  // Give the external sema source a chance to correct the typo.
+  TypoCorrection ExternalTypo;
+  if (ExternalSource && Consumer) {
+    ExternalTypo = ExternalSource->CorrectTypo(
+        TypoName, LookupKind, S, SS, *Consumer->getCorrectionValidator(),
+        MemberContext, EnteringContext, OPT);
+    if (ExternalTypo)
+      Consumer->addCorrection(ExternalTypo);
+  }
+
   if (!Consumer || Consumer->empty())
     return nullptr;
 
@@ -4717,7 +4796,7 @@
   // is not more that about a third of the length of the typo's identifier.
   unsigned ED = Consumer->getBestEditDistance(true);
   IdentifierInfo *Typo = TypoName.getName().getAsIdentifierInfo();
-  if (ED > 0 && Typo->getName().size() / ED < 3)
+  if (!ExternalTypo && ED > 0 && Typo->getName().size() / ED < 3)
     return nullptr;
 
   ExprEvalContexts.back().NumTypos++;
@@ -4853,8 +4932,8 @@
 static NamedDecl *getDefinitionToImport(NamedDecl *D) {
   if (VarDecl *VD = dyn_cast<VarDecl>(D))
     return VD->getDefinition();
-  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
-    return FD->isDefined(FD) ? const_cast<FunctionDecl*>(FD) : nullptr;
+  if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
+    return FD->getDefinition();
   if (TagDecl *TD = dyn_cast<TagDecl>(D))
     return TD->getDefinition();
   if (ObjCInterfaceDecl *ID = dyn_cast<ObjCInterfaceDecl>(D))
@@ -4867,7 +4946,7 @@
 }
 
 void Sema::diagnoseMissingImport(SourceLocation Loc, NamedDecl *Decl,
-                                 bool NeedDefinition, bool Recover) {
+                                 MissingImportKind MIK, bool Recover) {
   assert(!isVisible(Decl) && "missing import for non-hidden decl?");
 
   // Suggest importing a module providing the definition of this entity, if
@@ -4876,8 +4955,6 @@
   if (!Def)
     Def = Decl;
 
-  // FIXME: Add a Fix-It that imports the corresponding module or includes
-  // the header.
   Module *Owner = getOwningModule(Decl);
   assert(Owner && "definition of hidden declaration is not in a module");
 
@@ -4886,12 +4963,20 @@
   auto Merged = Context.getModulesWithMergedDefinition(Decl);
   OwningModules.insert(OwningModules.end(), Merged.begin(), Merged.end());
 
-  diagnoseMissingImport(Loc, Decl, Decl->getLocation(), OwningModules,
-                        NeedDefinition ? MissingImportKind::Definition
-                                       : MissingImportKind::Declaration,
+  diagnoseMissingImport(Loc, Decl, Decl->getLocation(), OwningModules, MIK,
                         Recover);
 }
 
+/// \brief Get a "quoted.h" or <angled.h> include path to use in a diagnostic
+/// suggesting the addition of a #include of the specified file.
+static std::string getIncludeStringForHeader(Preprocessor &PP,
+                                             const FileEntry *E) {
+  bool IsSystem;
+  auto Path =
+      PP.getHeaderSearchInfo().suggestPathToFileForDiagnostics(E, &IsSystem);
+  return (IsSystem ? '<' : '"') + Path + (IsSystem ? '>' : '"');
+}
+
 void Sema::diagnoseMissingImport(SourceLocation UseLoc, NamedDecl *Decl,
                                  SourceLocation DeclLoc,
                                  ArrayRef<Module *> Modules,
@@ -4912,7 +4997,18 @@
 
     Diag(UseLoc, diag::err_module_unimported_use_multiple)
       << (int)MIK << Decl << ModuleList;
+  } else if (const FileEntry *E =
+                 PP.getModuleHeaderToIncludeForDiagnostics(UseLoc, DeclLoc)) {
+    // The right way to make the declaration visible is to include a header;
+    // suggest doing so.
+    //
+    // FIXME: Find a smart place to suggest inserting a #include, and add
+    // a FixItHint there.
+    Diag(UseLoc, diag::err_module_unimported_use_header)
+      << (int)MIK << Decl << Modules[0]->getFullModuleName()
+      << getIncludeStringForHeader(PP, E);
   } else {
+    // FIXME: Add a FixItHint that imports the corresponding module.
     Diag(UseLoc, diag::err_module_unimported_use)
       << (int)MIK << Decl << Modules[0]->getFullModuleName();
   }
@@ -4928,6 +5024,12 @@
   case MissingImportKind::DefaultArgument:
     DiagID = diag::note_default_argument_declared_here;
     break;
+  case MissingImportKind::ExplicitSpecialization:
+    DiagID = diag::note_explicit_specialization_declared_here;
+    break;
+  case MissingImportKind::PartialSpecialization:
+    DiagID = diag::note_partial_specialization_declared_here;
+    break;
   }
   Diag(DeclLoc, DiagID);
 
@@ -4963,7 +5065,7 @@
     assert(Decl && "import required but no declaration to import");
 
     diagnoseMissingImport(Correction.getCorrectionRange().getBegin(), Decl,
-                          /*NeedDefinition*/ false, ErrorRecovery);
+                          MissingImportKind::Declaration, ErrorRecovery);
     return;
   }
 
diff --git a/lib/Sema/SemaObjCProperty.cpp b/lib/Sema/SemaObjCProperty.cpp
index 527b4a3..5e38751 100644
--- a/lib/Sema/SemaObjCProperty.cpp
+++ b/lib/Sema/SemaObjCProperty.cpp
@@ -336,7 +336,6 @@
     }
   } while (Tok.isNot(tok::r_paren));
   return false;
-  
 }
 
 /// Check for a mismatch in the atomicity of the given properties.
@@ -805,7 +804,6 @@
     property->setPropertyAttributes(ObjCPropertyDecl::OBJC_PR_strong);
   else if (ivarLifetime == Qualifiers::OCL_Weak)
     property->setPropertyAttributes(ObjCPropertyDecl::OBJC_PR_weak);
-  return;
 }
 
 /// DiagnosePropertyMismatchDeclInProtocols - diagnose properties declared
@@ -1670,7 +1668,6 @@
 /// in class's \@implementation.
 void Sema::DefaultSynthesizeProperties(Scope *S, ObjCImplDecl* IMPDecl,
                                        ObjCInterfaceDecl *IDecl) {
-  
   ObjCInterfaceDecl::PropertyMap PropMap;
   ObjCInterfaceDecl::PropertyDeclOrder PropertyOrder;
   IDecl->collectPropertiesToImplement(PropMap, PropertyOrder);
@@ -1768,19 +1765,23 @@
       DefaultSynthesizeProperties(S, IC, IDecl);
 }
 
-static void DiagnoseUnimplementedAccessor(Sema &S,
-                                          ObjCInterfaceDecl *PrimaryClass,
-                                          Selector Method,
-                                          ObjCImplDecl* IMPDecl,
-                                          ObjCContainerDecl *CDecl,
-                                          ObjCCategoryDecl *C,
-                                          ObjCPropertyDecl *Prop,
-                                          Sema::SelectorSet &SMap) {
+static void DiagnoseUnimplementedAccessor(
+    Sema &S, ObjCInterfaceDecl *PrimaryClass, Selector Method,
+    ObjCImplDecl *IMPDecl, ObjCContainerDecl *CDecl, ObjCCategoryDecl *C,
+    ObjCPropertyDecl *Prop,
+    llvm::SmallPtrSet<const ObjCMethodDecl *, 8> &SMap) {
+  // Check whether SMap contains a method with a matching selector and the
+  // right method kind (class vs. instance).
+  auto I = std::find_if(SMap.begin(), SMap.end(),
+    [&](const ObjCMethodDecl *x) {
+      return x->getSelector() == Method &&
+             x->isClassMethod() == Prop->isClassProperty();
+    });
   // When reporting on missing property setter/getter implementation in
   // categories, do not report when they are declared in primary class,
   // class's protocol, or one of it super classes. This is because,
   // the class is going to implement them.
-  if (!SMap.count(Method) &&
+  if (I == SMap.end() &&
       (PrimaryClass == nullptr ||
        !PrimaryClass->lookupPropertyAccessor(Method, C,
                                              Prop->isClassProperty()))) {
@@ -1872,10 +1873,10 @@
   for (const auto *I : IMPDecl->property_impls())
     PropImplMap.insert(I->getPropertyDecl());
 
-  SelectorSet InsMap;
+  llvm::SmallPtrSet<const ObjCMethodDecl *, 8> InsMap;
   // Collect property accessors implemented in current implementation.
   for (const auto *I : IMPDecl->methods())
-    InsMap.insert(I->getSelector());
+    InsMap.insert(I);
   
   ObjCCategoryDecl *C = dyn_cast<ObjCCategoryDecl>(CDecl);
   ObjCInterfaceDecl *PrimaryClass = nullptr;
@@ -1887,7 +1888,7 @@
         // setter/getter is implemented in category's primary class
         // implementation.
         for (const auto *I : IMP->methods())
-          InsMap.insert(I->getSelector());
+          InsMap.insert(I);
       }
 
   for (ObjCContainerDecl::PropertyMap::iterator
@@ -2531,5 +2532,4 @@
   if ((Attributes & ObjCDeclSpec::DQ_PR_readonly) &&
       (Attributes & ObjCDeclSpec::DQ_PR_setter))
     Diag(Loc, diag::warn_objc_readonly_property_has_setter);
-      
 }
diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp
index 4d0d313..5cbcea4 100644
--- a/lib/Sema/SemaOpenMP.cpp
+++ b/lib/Sema/SemaOpenMP.cpp
@@ -15,12 +15,14 @@
 #include "TreeTransform.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTMutationListener.h"
+#include "clang/AST/CXXInheritance.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/AST/TypeOrdering.h"
 #include "clang/Basic/OpenMPKinds.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
@@ -43,111 +45,96 @@
   DSA_shared = 1 << 1  /// \brief Default data sharing attribute 'shared'.
 };
 
-template <class T> struct MatchesAny {
-  explicit MatchesAny(ArrayRef<T> Arr) : Arr(std::move(Arr)) {}
-  bool operator()(T Kind) {
-    for (auto KindEl : Arr)
-      if (KindEl == Kind)
-        return true;
-    return false;
-  }
-
-private:
-  ArrayRef<T> Arr;
-};
-struct MatchesAlways {
-  MatchesAlways() {}
-  template <class T> bool operator()(T) { return true; }
-};
-
-typedef MatchesAny<OpenMPClauseKind> MatchesAnyClause;
-typedef MatchesAny<OpenMPDirectiveKind> MatchesAnyDirective;
-
 /// \brief Stack for tracking declarations used in OpenMP directives and
 /// clauses and their data-sharing attributes.
-class DSAStackTy {
+class DSAStackTy final {
 public:
-  struct DSAVarData {
-    OpenMPDirectiveKind DKind;
-    OpenMPClauseKind CKind;
-    DeclRefExpr *RefExpr;
+  struct DSAVarData final {
+    OpenMPDirectiveKind DKind = OMPD_unknown;
+    OpenMPClauseKind CKind = OMPC_unknown;
+    Expr *RefExpr = nullptr;
+    DeclRefExpr *PrivateCopy = nullptr;
     SourceLocation ImplicitDSALoc;
-    DSAVarData()
-        : DKind(OMPD_unknown), CKind(OMPC_unknown), RefExpr(nullptr),
-          ImplicitDSALoc() {}
+    DSAVarData() {}
   };
-
-public:
-  struct MapInfo {
-    Expr *RefExpr;
-  };
+  typedef llvm::SmallVector<std::pair<Expr *, OverloadedOperatorKind>, 4>
+      OperatorOffsetTy;
 
 private:
-  struct DSAInfo {
-    OpenMPClauseKind Attributes;
-    DeclRefExpr *RefExpr;
+  struct DSAInfo final {
+    OpenMPClauseKind Attributes = OMPC_unknown;
+    /// Pointer to a reference expression and a flag that shows whether the
+    /// variable is marked as lastprivate (true) or not (false).
+    llvm::PointerIntPair<Expr *, 1, bool> RefExpr;
+    DeclRefExpr *PrivateCopy = nullptr;
   };
-  typedef llvm::SmallDenseMap<VarDecl *, DSAInfo, 64> DeclSAMapTy;
-  typedef llvm::SmallDenseMap<VarDecl *, DeclRefExpr *, 64> AlignedMapTy;
-  typedef llvm::DenseMap<VarDecl *, unsigned> LoopControlVariablesMapTy;
-  typedef llvm::SmallDenseMap<VarDecl *, MapInfo, 64> MappedDeclsTy;
+  typedef llvm::DenseMap<ValueDecl *, DSAInfo> DeclSAMapTy;
+  typedef llvm::DenseMap<ValueDecl *, Expr *> AlignedMapTy;
+  typedef std::pair<unsigned, VarDecl *> LCDeclInfo;
+  typedef llvm::DenseMap<ValueDecl *, LCDeclInfo> LoopControlVariablesMapTy;
+  /// Struct that associates component lists with the clause kind where they
+  /// are found.
+  struct MappedExprComponentTy {
+    OMPClauseMappableExprCommon::MappableExprComponentLists Components;
+    OpenMPClauseKind Kind = OMPC_unknown;
+  };
+  typedef llvm::DenseMap<ValueDecl *, MappedExprComponentTy>
+      MappedExprComponentsTy;
   typedef llvm::StringMap<std::pair<OMPCriticalDirective *, llvm::APSInt>>
       CriticalsWithHintsTy;
+  typedef llvm::DenseMap<OMPDependClause *, OperatorOffsetTy>
+      DoacrossDependMapTy;
 
-  struct SharingMapTy {
+  struct SharingMapTy final {
     DeclSAMapTy SharingMap;
     AlignedMapTy AlignedMap;
-    MappedDeclsTy MappedDecls;
+    MappedExprComponentsTy MappedExprComponents;
     LoopControlVariablesMapTy LCVMap;
-    DefaultDataSharingAttributes DefaultAttr;
+    DefaultDataSharingAttributes DefaultAttr = DSA_unspecified;
     SourceLocation DefaultAttrLoc;
-    OpenMPDirectiveKind Directive;
+    OpenMPDirectiveKind Directive = OMPD_unknown;
     DeclarationNameInfo DirectiveName;
-    Scope *CurScope;
+    Scope *CurScope = nullptr;
     SourceLocation ConstructLoc;
+    /// Set of 'depend' clauses with 'sink|source' dependence kind. Required to
+    /// get the data (loop counters etc.) about enclosing loop-based construct.
+    /// This data is required during codegen.
+    DoacrossDependMapTy DoacrossDepends;
     /// \brief first argument (Expr *) contains optional argument of the
     /// 'ordered' clause, the second one is true if the regions has 'ordered'
     /// clause, false otherwise.
     llvm::PointerIntPair<Expr *, 1, bool> OrderedRegion;
-    bool NowaitRegion;
-    bool CancelRegion;
-    unsigned AssociatedLoops;
+    bool NowaitRegion = false;
+    bool CancelRegion = false;
+    unsigned AssociatedLoops = 1;
     SourceLocation InnerTeamsRegionLoc;
     SharingMapTy(OpenMPDirectiveKind DKind, DeclarationNameInfo Name,
                  Scope *CurScope, SourceLocation Loc)
-        : SharingMap(), AlignedMap(), LCVMap(), DefaultAttr(DSA_unspecified),
-          Directive(DKind), DirectiveName(std::move(Name)), CurScope(CurScope),
-          ConstructLoc(Loc), OrderedRegion(), NowaitRegion(false),
-          CancelRegion(false), AssociatedLoops(1), InnerTeamsRegionLoc() {}
-    SharingMapTy()
-        : SharingMap(), AlignedMap(), LCVMap(), DefaultAttr(DSA_unspecified),
-          Directive(OMPD_unknown), DirectiveName(), CurScope(nullptr),
-          ConstructLoc(), OrderedRegion(), NowaitRegion(false),
-          CancelRegion(false), AssociatedLoops(1), InnerTeamsRegionLoc() {}
+        : Directive(DKind), DirectiveName(Name), CurScope(CurScope),
+          ConstructLoc(Loc) {}
+    SharingMapTy() {}
   };
 
-  typedef SmallVector<SharingMapTy, 64> StackTy;
+  typedef SmallVector<SharingMapTy, 4> StackTy;
 
   /// \brief Stack of used declaration and their data-sharing attributes.
   StackTy Stack;
   /// \brief true, if check for DSA must be from parent directive, false, if
   /// from current directive.
-  OpenMPClauseKind ClauseKindMode;
+  OpenMPClauseKind ClauseKindMode = OMPC_unknown;
   Sema &SemaRef;
-  bool ForceCapturing;
+  bool ForceCapturing = false;
   CriticalsWithHintsTy Criticals;
 
   typedef SmallVector<SharingMapTy, 8>::reverse_iterator reverse_iterator;
 
-  DSAVarData getDSA(StackTy::reverse_iterator Iter, VarDecl *D);
+  DSAVarData getDSA(StackTy::reverse_iterator &Iter, ValueDecl *D);
 
   /// \brief Checks if the variable is a local for OpenMP region.
   bool isOpenMPLocal(VarDecl *D, StackTy::reverse_iterator Iter);
 
 public:
-  explicit DSAStackTy(Sema &S)
-      : Stack(1), ClauseKindMode(OMPC_unknown), SemaRef(S),
-        ForceCapturing(false) {}
+  explicit DSAStackTy(Sema &S) : Stack(1), SemaRef(S) {}
 
   bool isClauseParsingMode() const { return ClauseKindMode != OMPC_unknown; }
   void setClauseParsingMode(OpenMPClauseKind K) { ClauseKindMode = K; }
@@ -179,51 +166,54 @@
   /// \brief If 'aligned' declaration for given variable \a D was not seen yet,
   /// add it and return NULL; otherwise return previous occurrence's expression
   /// for diagnostics.
-  DeclRefExpr *addUniqueAligned(VarDecl *D, DeclRefExpr *NewDE);
+  Expr *addUniqueAligned(ValueDecl *D, Expr *NewDE);
 
   /// \brief Register specified variable as loop control variable.
-  void addLoopControlVariable(VarDecl *D);
+  void addLoopControlVariable(ValueDecl *D, VarDecl *Capture);
   /// \brief Check if the specified variable is a loop control variable for
   /// current region.
   /// \return The index of the loop control variable in the list of associated
   /// for-loops (from outer to inner).
-  unsigned isLoopControlVariable(VarDecl *D);
+  LCDeclInfo isLoopControlVariable(ValueDecl *D);
   /// \brief Check if the specified variable is a loop control variable for
   /// parent region.
   /// \return The index of the loop control variable in the list of associated
   /// for-loops (from outer to inner).
-  unsigned isParentLoopControlVariable(VarDecl *D);
+  LCDeclInfo isParentLoopControlVariable(ValueDecl *D);
   /// \brief Get the loop control variable for the I-th loop (or nullptr) in
   /// parent directive.
-  VarDecl *getParentLoopControlVariable(unsigned I);
+  ValueDecl *getParentLoopControlVariable(unsigned I);
 
   /// \brief Adds explicit data sharing attribute to the specified declaration.
-  void addDSA(VarDecl *D, DeclRefExpr *E, OpenMPClauseKind A);
+  void addDSA(ValueDecl *D, Expr *E, OpenMPClauseKind A,
+              DeclRefExpr *PrivateCopy = nullptr);
 
   /// \brief Returns data sharing attributes from top of the stack for the
   /// specified declaration.
-  DSAVarData getTopDSA(VarDecl *D, bool FromParent);
+  DSAVarData getTopDSA(ValueDecl *D, bool FromParent);
   /// \brief Returns data-sharing attributes for the specified declaration.
-  DSAVarData getImplicitDSA(VarDecl *D, bool FromParent);
+  DSAVarData getImplicitDSA(ValueDecl *D, bool FromParent);
   /// \brief Checks if the specified variables has data-sharing attributes which
   /// match specified \a CPred predicate in any directive which matches \a DPred
   /// predicate.
-  template <class ClausesPredicate, class DirectivesPredicate>
-  DSAVarData hasDSA(VarDecl *D, ClausesPredicate CPred,
-                    DirectivesPredicate DPred, bool FromParent);
+  DSAVarData hasDSA(ValueDecl *D,
+                    const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
+                    const llvm::function_ref<bool(OpenMPDirectiveKind)> &DPred,
+                    bool FromParent);
   /// \brief Checks if the specified variables has data-sharing attributes which
   /// match specified \a CPred predicate in any innermost directive which
   /// matches \a DPred predicate.
-  template <class ClausesPredicate, class DirectivesPredicate>
-  DSAVarData hasInnermostDSA(VarDecl *D, ClausesPredicate CPred,
-                             DirectivesPredicate DPred,
-                             bool FromParent);
+  DSAVarData
+  hasInnermostDSA(ValueDecl *D,
+                  const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
+                  const llvm::function_ref<bool(OpenMPDirectiveKind)> &DPred,
+                  bool FromParent);
   /// \brief Checks if the specified variables has explicit data-sharing
   /// attributes which match specified \a CPred predicate at the specified
   /// OpenMP region.
-  bool hasExplicitDSA(VarDecl *D,
+  bool hasExplicitDSA(ValueDecl *D,
                       const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
-                      unsigned Level);
+                      unsigned Level, bool NotLastprivate = false);
 
   /// \brief Returns true if the directive at level \Level matches in the
   /// specified \a DPred predicate.
@@ -232,8 +222,10 @@
       unsigned Level);
 
   /// \brief Finds a directive which matches specified \a DPred predicate.
-  template <class NamedDirectivesPredicate>
-  bool hasDirective(NamedDirectivesPredicate DPred, bool FromParent);
+  bool hasDirective(const llvm::function_ref<bool(OpenMPDirectiveKind,
+                                                  const DeclarationNameInfo &,
+                                                  SourceLocation)> &DPred,
+                    bool FromParent);
 
   /// \brief Returns currently analyzed directive.
   OpenMPDirectiveKind getCurrentDirective() const {
@@ -245,8 +237,6 @@
       return Stack[Stack.size() - 2].Directive;
     return OMPD_unknown;
   }
-  /// \brief Return the directive associated with the provided scope.
-  OpenMPDirectiveKind getDirectiveForScope(const Scope *S) const;
 
   /// \brief Set default data sharing attribute to none.
   void setDefaultDSANone(SourceLocation Loc) {
@@ -308,9 +298,7 @@
           Stack[Stack.size() - 2].CancelRegion || Cancel;
   }
   /// \brief Return true if current region has inner cancel construct.
-  bool isCancelRegion() const {
-    return Stack.back().CancelRegion;
-  }
+  bool isCancelRegion() const { return Stack.back().CancelRegion; }
 
   /// \brief Set collapse value for the region.
   void setAssociatedLoops(unsigned Val) { Stack.back().AssociatedLoops = Val; }
@@ -338,42 +326,95 @@
   Scope *getCurScope() { return Stack.back().CurScope; }
   SourceLocation getConstructLoc() { return Stack.back().ConstructLoc; }
 
-  MapInfo getMapInfoForVar(VarDecl *VD) {
-    MapInfo VarMI = {0};
-    for (auto Cnt = Stack.size() - 1; Cnt > 0; --Cnt) {
-      if (Stack[Cnt].MappedDecls.count(VD)) {
-        VarMI = Stack[Cnt].MappedDecls[VD];
-        break;
-      }
+  // Do the check specified in \a Check to all component lists and return true
+  // if any issue is found.
+  bool checkMappableExprComponentListsForDecl(
+      ValueDecl *VD, bool CurrentRegionOnly,
+      const llvm::function_ref<
+          bool(OMPClauseMappableExprCommon::MappableExprComponentListRef,
+               OpenMPClauseKind)> &Check) {
+    auto SI = Stack.rbegin();
+    auto SE = Stack.rend();
+
+    if (SI == SE)
+      return false;
+
+    if (CurrentRegionOnly) {
+      SE = std::next(SI);
+    } else {
+      ++SI;
     }
-    return VarMI;
+
+    for (; SI != SE; ++SI) {
+      auto MI = SI->MappedExprComponents.find(VD);
+      if (MI != SI->MappedExprComponents.end())
+        for (auto &L : MI->second.Components)
+          if (Check(L, MI->second.Kind))
+            return true;
+    }
+    return false;
   }
 
-  void addMapInfoForVar(VarDecl *VD, MapInfo MI) {
-    if (Stack.size() > 1) {
-      Stack.back().MappedDecls[VD] = MI;
-    }
+  // Record one more mappable expression component list for declaration VD in
+  // the innermost region, copying Components and tagging the clause kind.
+  void addMappableExpressionComponents(
+      ValueDecl *VD,
+      OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
+      OpenMPClauseKind WhereFoundClauseKind) {
+    assert(Stack.size() > 1 &&
+           "Not expecting to add components to an empty stack!");
+    auto &MEC = Stack.back().MappedExprComponents[VD];
+    // Append a fresh list and fill it; Kind is overwritten on every call.
+    MEC.Components.resize(MEC.Components.size() + 1);
+    MEC.Components.back().append(Components.begin(), Components.end());
+    MEC.Kind = WhereFoundClauseKind;
   }
 
-  MapInfo IsMappedInCurrentRegion(VarDecl *VD) {
-    assert(Stack.size() > 1 && "Target level is 0");
-    MapInfo VarMI = {0};
-    if (Stack.size() > 1 && Stack.back().MappedDecls.count(VD)) {
-      VarMI = Stack.back().MappedDecls[VD];
+  unsigned getNestingLevel() const {
+    assert(Stack.size() > 1);
+    return Stack.size() - 2;
+  }
+  void addDoacrossDependClause(OMPDependClause *C, OperatorOffsetTy &OpsOffs) {
+    assert(Stack.size() > 2);
+    assert(isOpenMPWorksharingDirective(Stack[Stack.size() - 2].Directive));
+    Stack[Stack.size() - 2].DoacrossDepends.insert({C, OpsOffs});
+  }
+  llvm::iterator_range<DoacrossDependMapTy::const_iterator>
+  getDoacrossDependClauses() const {
+    assert(Stack.size() > 1);
+    if (isOpenMPWorksharingDirective(Stack[Stack.size() - 1].Directive)) {
+      auto &Ref = Stack[Stack.size() - 1].DoacrossDepends;
+      return llvm::make_range(Ref.begin(), Ref.end());
     }
-    return VarMI;
+    return llvm::make_range(Stack[0].DoacrossDepends.end(),
+                            Stack[0].DoacrossDepends.end());
   }
 };
 bool isParallelOrTaskRegion(OpenMPDirectiveKind DKind) {
-  return isOpenMPParallelDirective(DKind) || DKind == OMPD_task ||
-         isOpenMPTeamsDirective(DKind) || DKind == OMPD_unknown ||
-         isOpenMPTaskLoopDirective(DKind);
+  return isOpenMPParallelDirective(DKind) || isOpenMPTaskingDirective(DKind) ||
+         isOpenMPTeamsDirective(DKind) || DKind == OMPD_unknown;
 }
 } // namespace
 
-DSAStackTy::DSAVarData DSAStackTy::getDSA(StackTy::reverse_iterator Iter,
-                                          VarDecl *D) {
-  D = D->getCanonicalDecl();
+/// Returns the canonical declaration for \a D.
+///
+/// \a D is expected to be a VarDecl or a FieldDecl; any other kind trips the
+/// assertion below.
+static ValueDecl *getCanonicalDecl(ValueDecl *D) {
+  if (auto *Var = dyn_cast<VarDecl>(D))
+    return Var->getCanonicalDecl();
+
+  // Not a variable, so it has to be a field.
+  auto *Field = dyn_cast<FieldDecl>(D);
+  assert(Field);
+  return Field->getCanonicalDecl();
+}
+
+DSAStackTy::DSAVarData DSAStackTy::getDSA(StackTy::reverse_iterator &Iter,
+                                          ValueDecl *D) {
+  D = getCanonicalDecl(D);
+  auto *VD = dyn_cast<VarDecl>(D);
+  auto *FD = dyn_cast<FieldDecl>(D);
   DSAVarData DVar;
   if (Iter == std::prev(Stack.rend())) {
     // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
@@ -381,14 +422,18 @@
     //  File-scope or namespace-scope variables referenced in called routines
     //  in the region are shared unless they appear in a threadprivate
     //  directive.
-    if (!D->isFunctionOrMethodVarDecl() && !isa<ParmVarDecl>(D))
+    if (VD && !VD->isFunctionOrMethodVarDecl() && !isa<ParmVarDecl>(D))
       DVar.CKind = OMPC_shared;
 
     // OpenMP [2.9.1.2, Data-sharing Attribute Rules for Variables Referenced
     // in a region but not in construct]
     //  Variables with static storage duration that are declared in called
     //  routines in the region are shared.
-    if (D->hasGlobalStorage())
+    if (VD && VD->hasGlobalStorage())
+      DVar.CKind = OMPC_shared;
+
+    // Non-static data members are shared by default.
+    if (FD)
       DVar.CKind = OMPC_shared;
 
     return DVar;
@@ -399,8 +444,8 @@
   // in a Construct, C/C++, predetermined, p.1]
   // Variables with automatic storage duration that are declared in a scope
   // inside the construct are private.
-  if (isOpenMPLocal(D, Iter) && D->isLocalVarDecl() &&
-      (D->getStorageClass() == SC_Auto || D->getStorageClass() == SC_None)) {
+  if (VD && isOpenMPLocal(VD, Iter) && VD->isLocalVarDecl() &&
+      (VD->getStorageClass() == SC_Auto || VD->getStorageClass() == SC_None)) {
     DVar.CKind = OMPC_private;
     return DVar;
   }
@@ -408,7 +453,8 @@
   // Explicitly specified attributes and local variables with predetermined
   // attributes.
   if (Iter->SharingMap.count(D)) {
-    DVar.RefExpr = Iter->SharingMap[D].RefExpr;
+    DVar.RefExpr = Iter->SharingMap[D].RefExpr.getPointer();
+    DVar.PrivateCopy = Iter->SharingMap[D].PrivateCopy;
     DVar.CKind = Iter->SharingMap[D].Attributes;
     DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
     return DVar;
@@ -442,27 +488,24 @@
     //  In a task construct, if no default clause is present, a variable that in
     //  the enclosing context is determined to be shared by all implicit tasks
     //  bound to the current team is shared.
-    if (DVar.DKind == OMPD_task) {
+    if (isOpenMPTaskingDirective(DVar.DKind)) {
       DSAVarData DVarTemp;
       for (StackTy::reverse_iterator I = std::next(Iter), EE = Stack.rend();
            I != EE; ++I) {
         // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables
-        // Referenced
-        // in a Construct, implicitly determined, p.6]
+        // Referenced in a Construct, implicitly determined, p.6]
         //  In a task construct, if no default clause is present, a variable
         //  whose data-sharing attribute is not determined by the rules above is
         //  firstprivate.
         DVarTemp = getDSA(I, D);
         if (DVarTemp.CKind != OMPC_shared) {
           DVar.RefExpr = nullptr;
-          DVar.DKind = OMPD_task;
           DVar.CKind = OMPC_firstprivate;
           return DVar;
         }
         if (isParallelOrTaskRegion(I->Directive))
           break;
       }
-      DVar.DKind = OMPD_task;
       DVar.CKind =
           (DVarTemp.CKind == OMPC_unknown) ? OMPC_firstprivate : OMPC_shared;
       return DVar;
@@ -473,12 +516,12 @@
   //  For constructs other than task, if no default clause is present, these
   //  variables inherit their data-sharing attributes from the enclosing
   //  context.
-  return getDSA(std::next(Iter), D);
+  return getDSA(++Iter, D);
 }
 
-DeclRefExpr *DSAStackTy::addUniqueAligned(VarDecl *D, DeclRefExpr *NewDE) {
+Expr *DSAStackTy::addUniqueAligned(ValueDecl *D, Expr *NewDE) {
   assert(Stack.size() > 1 && "Data sharing attributes stack is empty");
-  D = D->getCanonicalDecl();
+  D = getCanonicalDecl(D);
   auto It = Stack.back().AlignedMap.find(D);
   if (It == Stack.back().AlignedMap.end()) {
     assert(NewDE && "Unexpected nullptr expr to be added into aligned map");
@@ -491,46 +534,69 @@
   return nullptr;
 }
 
-void DSAStackTy::addLoopControlVariable(VarDecl *D) {
+void DSAStackTy::addLoopControlVariable(ValueDecl *D, VarDecl *Capture) {
   assert(Stack.size() > 1 && "Data-sharing attributes stack is empty");
-  D = D->getCanonicalDecl();
-  Stack.back().LCVMap.insert(std::make_pair(D, Stack.back().LCVMap.size() + 1));
+  D = getCanonicalDecl(D);
+  Stack.back().LCVMap.insert(
+      std::make_pair(D, LCDeclInfo(Stack.back().LCVMap.size() + 1, Capture)));
 }
 
-unsigned DSAStackTy::isLoopControlVariable(VarDecl *D) {
+DSAStackTy::LCDeclInfo DSAStackTy::isLoopControlVariable(ValueDecl *D) {
   assert(Stack.size() > 1 && "Data-sharing attributes stack is empty");
-  D = D->getCanonicalDecl();
-  return Stack.back().LCVMap.count(D) > 0 ? Stack.back().LCVMap[D] : 0;
+  D = getCanonicalDecl(D);
+  return Stack.back().LCVMap.count(D) > 0 ? Stack.back().LCVMap[D]
+                                          : LCDeclInfo(0, nullptr);
 }
 
-unsigned DSAStackTy::isParentLoopControlVariable(VarDecl *D) {
+DSAStackTy::LCDeclInfo DSAStackTy::isParentLoopControlVariable(ValueDecl *D) {
   assert(Stack.size() > 2 && "Data-sharing attributes stack is empty");
-  D = D->getCanonicalDecl();
+  D = getCanonicalDecl(D);
   return Stack[Stack.size() - 2].LCVMap.count(D) > 0
              ? Stack[Stack.size() - 2].LCVMap[D]
-             : 0;
+             : LCDeclInfo(0, nullptr);
 }
 
-VarDecl *DSAStackTy::getParentLoopControlVariable(unsigned I) {
+ValueDecl *DSAStackTy::getParentLoopControlVariable(unsigned I) {
   assert(Stack.size() > 2 && "Data-sharing attributes stack is empty");
   if (Stack[Stack.size() - 2].LCVMap.size() < I)
     return nullptr;
   for (auto &Pair : Stack[Stack.size() - 2].LCVMap) {
-    if (Pair.second == I)
+    if (Pair.second.first == I)
       return Pair.first;
   }
   return nullptr;
 }
 
-void DSAStackTy::addDSA(VarDecl *D, DeclRefExpr *E, OpenMPClauseKind A) {
-  D = D->getCanonicalDecl();
+void DSAStackTy::addDSA(ValueDecl *D, Expr *E, OpenMPClauseKind A,
+                        DeclRefExpr *PrivateCopy) {
+  D = getCanonicalDecl(D);
   if (A == OMPC_threadprivate) {
-    Stack[0].SharingMap[D].Attributes = A;
-    Stack[0].SharingMap[D].RefExpr = E;
+    auto &Data = Stack[0].SharingMap[D];
+    Data.Attributes = A;
+    Data.RefExpr.setPointer(E);
+    Data.PrivateCopy = nullptr;
   } else {
     assert(Stack.size() > 1 && "Data-sharing attributes stack is empty");
-    Stack.back().SharingMap[D].Attributes = A;
-    Stack.back().SharingMap[D].RefExpr = E;
+    auto &Data = Stack.back().SharingMap[D];
+    assert(Data.Attributes == OMPC_unknown || (A == Data.Attributes) ||
+           (A == OMPC_firstprivate && Data.Attributes == OMPC_lastprivate) ||
+           (A == OMPC_lastprivate && Data.Attributes == OMPC_firstprivate) ||
+           (isLoopControlVariable(D).first && A == OMPC_private));
+    if (A == OMPC_lastprivate && Data.Attributes == OMPC_firstprivate) {
+      Data.RefExpr.setInt(/*IntVal=*/true);
+      return;
+    }
+    const bool IsLastprivate =
+        A == OMPC_lastprivate || Data.Attributes == OMPC_lastprivate;
+    Data.Attributes = A;
+    Data.RefExpr.setPointerAndInt(E, IsLastprivate);
+    Data.PrivateCopy = PrivateCopy;
+    if (PrivateCopy) {
+      auto &Data = Stack.back().SharingMap[PrivateCopy->getDecl()];
+      Data.Attributes = A;
+      Data.RefExpr.setPointerAndInt(PrivateCopy, IsLastprivate);
+      Data.PrivateCopy = nullptr;
+    }
   }
 }
 
@@ -581,29 +647,35 @@
                              VK_LValue);
 }
 
-DSAStackTy::DSAVarData DSAStackTy::getTopDSA(VarDecl *D, bool FromParent) {
-  D = D->getCanonicalDecl();
+DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D, bool FromParent) {
+  D = getCanonicalDecl(D);
   DSAVarData DVar;
 
   // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
   // in a Construct, C/C++, predetermined, p.1]
   //  Variables appearing in threadprivate directives are threadprivate.
-  if ((D->getTLSKind() != VarDecl::TLS_None &&
-       !(D->hasAttr<OMPThreadPrivateDeclAttr>() &&
+  auto *VD = dyn_cast<VarDecl>(D);
+  if ((VD && VD->getTLSKind() != VarDecl::TLS_None &&
+       !(VD->hasAttr<OMPThreadPrivateDeclAttr>() &&
          SemaRef.getLangOpts().OpenMPUseTLS &&
          SemaRef.getASTContext().getTargetInfo().isTLSSupported())) ||
-      (D->getStorageClass() == SC_Register && D->hasAttr<AsmLabelAttr>() &&
-       !D->isLocalVarDecl())) {
-    addDSA(D, buildDeclRefExpr(SemaRef, D, D->getType().getNonReferenceType(),
+      (VD && VD->getStorageClass() == SC_Register &&
+       VD->hasAttr<AsmLabelAttr>() && !VD->isLocalVarDecl())) {
+    addDSA(D, buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(),
                                D->getLocation()),
            OMPC_threadprivate);
   }
   if (Stack[0].SharingMap.count(D)) {
-    DVar.RefExpr = Stack[0].SharingMap[D].RefExpr;
+    DVar.RefExpr = Stack[0].SharingMap[D].RefExpr.getPointer();
     DVar.CKind = OMPC_threadprivate;
     return DVar;
   }
 
+  if (Stack.size() == 1) {
+    // Not in OpenMP execution region and top scope was already checked.
+    return DVar;
+  }
+
   // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
   // in a Construct, C/C++, predetermined, p.4]
   //  Static data members are shared.
@@ -611,9 +683,9 @@
   // in a Construct, C/C++, predetermined, p.7]
   //  Variables with static storage duration that are declared in a scope
   //  inside the construct are shared.
-  if (D->isStaticDataMember()) {
-    DSAVarData DVarTemp =
-        hasDSA(D, isOpenMPPrivate, MatchesAlways(), FromParent);
+  auto &&MatchesAlways = [](OpenMPDirectiveKind) -> bool { return true; };
+  if (VD && VD->isStaticDataMember()) {
+    DSAVarData DVarTemp = hasDSA(D, isOpenMPPrivate, MatchesAlways, FromParent);
     if (DVarTemp.CKind != OMPC_unknown && DVarTemp.RefExpr)
       return DVar;
 
@@ -634,11 +706,13 @@
     if (auto *CTD = CTSD->getSpecializedTemplate())
       RD = CTD->getTemplatedDecl();
   if (IsConstant &&
-      !(SemaRef.getLangOpts().CPlusPlus && RD && RD->hasMutableFields())) {
+      !(SemaRef.getLangOpts().CPlusPlus && RD && RD->hasDefinition() &&
+        RD->hasMutableFields())) {
     // Variables with const-qualified type having no mutable member may be
     // listed in a firstprivate clause, even if they are static data members.
-    DSAVarData DVarTemp = hasDSA(D, MatchesAnyClause(OMPC_firstprivate),
-                                 MatchesAlways(), FromParent);
+    DSAVarData DVarTemp = hasDSA(
+        D, [](OpenMPClauseKind C) -> bool { return C == OMPC_firstprivate; },
+        MatchesAlways, FromParent);
     if (DVarTemp.CKind == OMPC_firstprivate && DVarTemp.RefExpr)
       return DVar;
 
@@ -655,7 +729,8 @@
   }
   auto I = std::prev(StartI);
   if (I->SharingMap.count(D)) {
-    DVar.RefExpr = I->SharingMap[D].RefExpr;
+    DVar.RefExpr = I->SharingMap[D].RefExpr.getPointer();
+    DVar.PrivateCopy = I->SharingMap[D].PrivateCopy;
     DVar.CKind = I->SharingMap[D].Attributes;
     DVar.ImplicitDSALoc = I->DefaultAttrLoc;
   }
@@ -663,8 +738,9 @@
   return DVar;
 }
 
-DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(VarDecl *D, bool FromParent) {
-  D = D->getCanonicalDecl();
+DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(ValueDecl *D,
+                                                  bool FromParent) {
+  D = getCanonicalDecl(D);
   auto StartI = Stack.rbegin();
   auto EndI = std::prev(Stack.rend());
   if (FromParent && StartI != EndI) {
@@ -673,13 +749,14 @@
   return getDSA(StartI, D);
 }
 
-template <class ClausesPredicate, class DirectivesPredicate>
-DSAStackTy::DSAVarData DSAStackTy::hasDSA(VarDecl *D, ClausesPredicate CPred,
-                                          DirectivesPredicate DPred,
-                                          bool FromParent) {
-  D = D->getCanonicalDecl();
+DSAStackTy::DSAVarData
+DSAStackTy::hasDSA(ValueDecl *D,
+                   const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
+                   const llvm::function_ref<bool(OpenMPDirectiveKind)> &DPred,
+                   bool FromParent) {
+  D = getCanonicalDecl(D);
   auto StartI = std::next(Stack.rbegin());
-  auto EndI = std::prev(Stack.rend());
+  auto EndI = Stack.rend();
   if (FromParent && StartI != EndI) {
     StartI = std::next(StartI);
   }
@@ -693,59 +770,57 @@
   return DSAVarData();
 }
 
-template <class ClausesPredicate, class DirectivesPredicate>
-DSAStackTy::DSAVarData
-DSAStackTy::hasInnermostDSA(VarDecl *D, ClausesPredicate CPred,
-                            DirectivesPredicate DPred, bool FromParent) {
-  D = D->getCanonicalDecl();
+DSAStackTy::DSAVarData DSAStackTy::hasInnermostDSA(
+    ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
+    const llvm::function_ref<bool(OpenMPDirectiveKind)> &DPred,
+    bool FromParent) {
+  D = getCanonicalDecl(D);
   auto StartI = std::next(Stack.rbegin());
-  auto EndI = std::prev(Stack.rend());
-  if (FromParent && StartI != EndI) {
+  auto EndI = Stack.rend();
+  if (FromParent && StartI != EndI)
     StartI = std::next(StartI);
-  }
-  for (auto I = StartI, EE = EndI; I != EE; ++I) {
-    if (!DPred(I->Directive))
-      break;
-    DSAVarData DVar = getDSA(I, D);
-    if (CPred(DVar.CKind))
-      return DVar;
+  if (StartI == EndI || !DPred(StartI->Directive))
     return DSAVarData();
-  }
-  return DSAVarData();
+  DSAVarData DVar = getDSA(StartI, D);
+  return CPred(DVar.CKind) ? DVar : DSAVarData();
 }
 
 bool DSAStackTy::hasExplicitDSA(
-    VarDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
-    unsigned Level) {
+    ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> &CPred,
+    unsigned Level, bool NotLastprivate) {
   if (CPred(ClauseKindMode))
     return true;
-  if (isClauseParsingMode())
-    ++Level;
-  D = D->getCanonicalDecl();
-  auto StartI = Stack.rbegin();
-  auto EndI = std::prev(Stack.rend());
+  D = getCanonicalDecl(D);
+  auto StartI = std::next(Stack.begin());
+  auto EndI = Stack.end();
   if (std::distance(StartI, EndI) <= (int)Level)
     return false;
   std::advance(StartI, Level);
-  return (StartI->SharingMap.count(D) > 0) && StartI->SharingMap[D].RefExpr &&
-         CPred(StartI->SharingMap[D].Attributes);
+  return (StartI->SharingMap.count(D) > 0) &&
+         StartI->SharingMap[D].RefExpr.getPointer() &&
+         CPred(StartI->SharingMap[D].Attributes) &&
+         (!NotLastprivate || !StartI->SharingMap[D].RefExpr.getInt());
 }
 
 bool DSAStackTy::hasExplicitDirective(
     const llvm::function_ref<bool(OpenMPDirectiveKind)> &DPred,
     unsigned Level) {
-  if (isClauseParsingMode())
-    ++Level;
-  auto StartI = Stack.rbegin();
-  auto EndI = std::prev(Stack.rend());
+  auto StartI = std::next(Stack.begin());
+  auto EndI = Stack.end();
   if (std::distance(StartI, EndI) <= (int)Level)
     return false;
   std::advance(StartI, Level);
   return DPred(StartI->Directive);
 }
 
-template <class NamedDirectivesPredicate>
-bool DSAStackTy::hasDirective(NamedDirectivesPredicate DPred, bool FromParent) {
+bool DSAStackTy::hasDirective(
+    const llvm::function_ref<bool(OpenMPDirectiveKind,
+                                  const DeclarationNameInfo &, SourceLocation)>
+        &DPred,
+    bool FromParent) {
+  // We look only in the enclosing region.
+  if (Stack.size() < 2)
+    return false;
   auto StartI = std::next(Stack.rbegin());
   auto EndI = std::prev(Stack.rend());
   if (FromParent && StartI != EndI) {
@@ -758,31 +833,22 @@
   return false;
 }
 
-OpenMPDirectiveKind DSAStackTy::getDirectiveForScope(const Scope *S) const {
-  for (auto I = Stack.rbegin(), EE = Stack.rend(); I != EE; ++I)
-    if (I->CurScope == S)
-      return I->Directive;
-  return OMPD_unknown;
-}
-
 void Sema::InitDataSharingAttributesStack() {
   VarDataSharingAttributesStack = new DSAStackTy(*this);
 }
 
 #define DSAStack static_cast<DSAStackTy *>(VarDataSharingAttributesStack)
 
-bool Sema::IsOpenMPCapturedByRef(VarDecl *VD,
-                                 const CapturedRegionScopeInfo *RSI) {
+bool Sema::IsOpenMPCapturedByRef(ValueDecl *D, unsigned Level) {
   assert(LangOpts.OpenMP && "OpenMP is not allowed");
 
   auto &Ctx = getASTContext();
   bool IsByRef = true;
 
   // Find the directive that is associated with the provided scope.
-  auto DKind = DSAStack->getDirectiveForScope(RSI->TheScope);
-  auto Ty = VD->getType();
+  auto Ty = D->getType();
 
-  if (isOpenMPTargetDirective(DKind)) {
+  if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective, Level)) {
     // This table summarizes how a given variable should be passed to the device
     // given its type and the clauses where it appears. This table is based on
     // the description in OpenMP 4.5 [2.10.4, target Construct] and
@@ -837,31 +903,88 @@
     //    array section, the runtime library may pass the NULL value to the
     //    device instead of the value passed to it by the compiler.
 
-    // FIXME: Right now, only implicit maps are implemented. Properly mapping
-    // values requires having the map, private, and firstprivate clauses SEMA
-    // and parsing in place, which we don't yet.
-
     if (Ty->isReferenceType())
       Ty = Ty->castAs<ReferenceType>()->getPointeeType();
-    IsByRef = !Ty->isScalarType();
+
+    // Locate map clauses and see if the variable being captured is referred to
+    // in any of those clauses. Here we only care about variables, not fields,
+    // because fields are part of aggregates.
+    bool IsVariableUsedInMapClause = false;
+    bool IsVariableAssociatedWithSection = false;
+
+    DSAStack->checkMappableExprComponentListsForDecl(
+        D, /*CurrentRegionOnly=*/true,
+        [&](OMPClauseMappableExprCommon::MappableExprComponentListRef
+                MapExprComponents,
+            OpenMPClauseKind WhereFoundClauseKind) {
+          // Only the map clause information influences how a variable is
+          // captured. E.g. is_device_ptr does not require changing the default
+          // behaviour.
+          if (WhereFoundClauseKind != OMPC_map)
+            return false;
+
+          auto EI = MapExprComponents.rbegin();
+          auto EE = MapExprComponents.rend();
+
+          assert(EI != EE && "Invalid map expression!");
+
+          if (isa<DeclRefExpr>(EI->getAssociatedExpression()))
+            IsVariableUsedInMapClause |= EI->getAssociatedDeclaration() == D;
+
+          ++EI;
+          if (EI == EE)
+            return false;
+
+          if (isa<ArraySubscriptExpr>(EI->getAssociatedExpression()) ||
+              isa<OMPArraySectionExpr>(EI->getAssociatedExpression()) ||
+              isa<MemberExpr>(EI->getAssociatedExpression())) {
+            IsVariableAssociatedWithSection = true;
+            // There is nothing more we need to know about this variable.
+            return true;
+          }
+
+          // Keep looking for more map info.
+          return false;
+        });
+
+    if (IsVariableUsedInMapClause) {
+      // If variable is identified in a map clause it is always captured by
+      // reference except if it is a pointer that is dereferenced somehow.
+      IsByRef = !(Ty->isPointerType() && IsVariableAssociatedWithSection);
+    } else {
+      // By default, all the data that has a scalar type is mapped by copy.
+      IsByRef = !Ty->isScalarType();
+    }
   }
 
-  // When passing data by value, we need to make sure it fits the uintptr size
+  if (IsByRef && Ty.getNonReferenceType()->isScalarType()) {
+    IsByRef = !DSAStack->hasExplicitDSA(
+        D, [](OpenMPClauseKind K) -> bool { return K == OMPC_firstprivate; },
+        Level, /*NotLastprivate=*/true);
+  }
+
+  // When passing data by copy, we need to make sure it fits the uintptr size
   // and alignment, because the runtime library only deals with uintptr types.
   // If it does not fit the uintptr size, we need to pass the data by reference
   // instead.
   if (!IsByRef &&
       (Ctx.getTypeSizeInChars(Ty) >
            Ctx.getTypeSizeInChars(Ctx.getUIntPtrType()) ||
-       Ctx.getDeclAlign(VD) > Ctx.getTypeAlignInChars(Ctx.getUIntPtrType())))
+       Ctx.getDeclAlign(D) > Ctx.getTypeAlignInChars(Ctx.getUIntPtrType()))) {
     IsByRef = true;
+  }
 
   return IsByRef;
 }
 
-bool Sema::IsOpenMPCapturedVar(VarDecl *VD) {
+unsigned Sema::getOpenMPNestingLevel() const {
+  assert(getLangOpts().OpenMP);
+  return DSAStack->getNestingLevel();
+}
+
+VarDecl *Sema::IsOpenMPCapturedDecl(ValueDecl *D) {
   assert(LangOpts.OpenMP && "OpenMP is not allowed");
-  VD = VD->getCanonicalDecl();
+  D = getCanonicalDecl(D);
 
   // If we are attempting to capture a global variable in a directive with
   // 'target' we return true so that this global is also mapped to the device.
@@ -870,52 +993,55 @@
   // then it should not be captured. Therefore, an extra check has to be
   // inserted here once support for 'declare target' is added.
   //
-  if (!VD->hasLocalStorage()) {
+  auto *VD = dyn_cast<VarDecl>(D);
+  if (VD && !VD->hasLocalStorage()) {
     if (DSAStack->getCurrentDirective() == OMPD_target &&
-        !DSAStack->isClauseParsingMode()) {
-      return true;
-    }
-    if (DSAStack->getCurScope() &&
-        DSAStack->hasDirective(
-            [](OpenMPDirectiveKind K, const DeclarationNameInfo &DNI,
-               SourceLocation Loc) -> bool {
-              return isOpenMPTargetDirective(K);
+        !DSAStack->isClauseParsingMode())
+      return VD;
+    if (DSAStack->hasDirective(
+            [](OpenMPDirectiveKind K, const DeclarationNameInfo &,
+               SourceLocation) -> bool {
+              return isOpenMPTargetExecutionDirective(K);
             },
-            false)) {
-      return true;
-    }
+            false))
+      return VD;
   }
 
   if (DSAStack->getCurrentDirective() != OMPD_unknown &&
       (!DSAStack->isClauseParsingMode() ||
        DSAStack->getParentDirective() != OMPD_unknown)) {
-    if (DSAStack->isLoopControlVariable(VD) ||
-        (VD->hasLocalStorage() &&
+    auto &&Info = DSAStack->isLoopControlVariable(D);
+    if (Info.first ||
+        (VD && VD->hasLocalStorage() &&
          isParallelOrTaskRegion(DSAStack->getCurrentDirective())) ||
-        DSAStack->isForceVarCapturing())
-      return true;
-    auto DVarPrivate = DSAStack->getTopDSA(VD, DSAStack->isClauseParsingMode());
+        (VD && DSAStack->isForceVarCapturing()))
+      return VD ? VD : Info.second;
+    auto DVarPrivate = DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode());
     if (DVarPrivate.CKind != OMPC_unknown && isOpenMPPrivate(DVarPrivate.CKind))
-      return true;
-    DVarPrivate = DSAStack->hasDSA(VD, isOpenMPPrivate, MatchesAlways(),
-                                   DSAStack->isClauseParsingMode());
-    return DVarPrivate.CKind != OMPC_unknown;
+      return VD ? VD : cast<VarDecl>(DVarPrivate.PrivateCopy->getDecl());
+    DVarPrivate = DSAStack->hasDSA(
+        D, isOpenMPPrivate, [](OpenMPDirectiveKind) -> bool { return true; },
+        DSAStack->isClauseParsingMode());
+    if (DVarPrivate.CKind != OMPC_unknown)
+      return VD ? VD : cast<VarDecl>(DVarPrivate.PrivateCopy->getDecl());
   }
-  return false;
+  return nullptr;
 }
 
-bool Sema::isOpenMPPrivateVar(VarDecl *VD, unsigned Level) {
+bool Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level) {
   assert(LangOpts.OpenMP && "OpenMP is not allowed");
   return DSAStack->hasExplicitDSA(
-      VD, [](OpenMPClauseKind K) -> bool { return K == OMPC_private; }, Level);
+      D, [](OpenMPClauseKind K) -> bool { return K == OMPC_private; }, Level);
 }
 
-bool Sema::isOpenMPTargetCapturedVar(VarDecl *VD, unsigned Level) {
+bool Sema::isOpenMPTargetCapturedDecl(ValueDecl *D, unsigned Level) {
   assert(LangOpts.OpenMP && "OpenMP is not allowed");
   // Return true if the current level is no longer enclosed in a target region.
 
-  return !VD->hasLocalStorage() &&
-         DSAStack->hasExplicitDirective(isOpenMPTargetDirective, Level);
+  auto *VD = dyn_cast<VarDecl>(D);
+  return VD && !VD->hasLocalStorage() &&
+         DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective,
+                                        Level);
 }
 
 void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; }
@@ -941,7 +1067,7 @@
   //  clause requires an accessible, unambiguous default constructor for the
   //  class type, unless the list item is also specified in a firstprivate
   //  clause.
-  if (auto D = dyn_cast_or_null<OMPExecutableDirective>(CurDirective)) {
+  if (auto *D = dyn_cast_or_null<OMPExecutableDirective>(CurDirective)) {
     for (auto *C : D->clauses()) {
       if (auto *Clause = dyn_cast<OMPLastprivateClause>(C)) {
         SmallVector<Expr *, 8> PrivateCopies;
@@ -950,7 +1076,8 @@
             PrivateCopies.push_back(nullptr);
             continue;
           }
-          auto *VD = cast<VarDecl>(cast<DeclRefExpr>(DE)->getDecl());
+          auto *DRE = cast<DeclRefExpr>(DE->IgnoreParens());
+          VarDecl *VD = cast<VarDecl>(DRE->getDecl());
           QualType Type = VD->getType().getNonReferenceType();
           auto DVar = DSAStack->getTopDSA(VD, false);
           if (DVar.CKind == OMPC_lastprivate) {
@@ -974,9 +1101,8 @@
           }
         }
         // Set initializers to private copies if no errors were found.
-        if (PrivateCopies.size() == Clause->varlist_size()) {
+        if (PrivateCopies.size() == Clause->varlist_size())
           Clause->setPrivateCopies(PrivateCopies);
-        }
       }
     }
   }
@@ -988,7 +1114,7 @@
 
 static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
                                      Expr *NumIterations, Sema &SemaRef,
-                                     Scope *S);
+                                     Scope *S, DSAStackTy *Stack);
 
 namespace {
 
@@ -1000,7 +1126,7 @@
   explicit VarDeclFilterCCC(Sema &S) : SemaRef(S) {}
   bool ValidateCandidate(const TypoCorrection &Candidate) override {
     NamedDecl *ND = Candidate.getCorrectionDecl();
-    if (VarDecl *VD = dyn_cast_or_null<VarDecl>(ND)) {
+    if (auto *VD = dyn_cast_or_null<VarDecl>(ND)) {
       return VD->hasGlobalStorage() &&
              SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(),
                                    SemaRef.getCurScope());
@@ -1008,6 +1134,23 @@
     return false;
   }
 };
+
+/// Typo-correction filter accepting only in-scope variables and functions.
+class VarOrFuncDeclFilterCCC : public CorrectionCandidateCallback {
+private:
+  Sema &SemaRef;
+
+public:
+  explicit VarOrFuncDeclFilterCCC(Sema &S) : SemaRef(S) {}
+  bool ValidateCandidate(const TypoCorrection &Candidate) override {
+    NamedDecl *ND = Candidate.getCorrectionDecl();
+    // Guard against a null ND as VarDeclFilterCCC does; isa<> rejects null.
+    return ND && (isa<VarDecl>(ND) || isa<FunctionDecl>(ND)) &&
+           SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(),
+                                 SemaRef.getCurScope());
+  }
+};
+
 } // namespace
 
 ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope,
@@ -1130,8 +1273,10 @@
   }
 
   QualType ExprType = VD->getType().getNonReferenceType();
-  ExprResult DE = buildDeclRefExpr(*this, VD, ExprType, Id.getLoc());
-  return DE;
+  return DeclRefExpr::Create(Context, NestedNameSpecifierLoc(),
+                             SourceLocation(), VD,
+                             /*RefersToEnclosingVariableOrCapture=*/false,
+                             Id.getLoc(), ExprType, VK_LValue);
 }
 
 Sema::DeclGroupPtrTy
@@ -1150,7 +1295,7 @@
 
 public:
   bool VisitDeclRefExpr(const DeclRefExpr *E) {
-    if (auto VD = dyn_cast<VarDecl>(E->getDecl())) {
+    if (auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
       if (VD->hasLocalStorage()) {
         SemaRef.Diag(E->getLocStart(),
                      diag::err_omp_local_var_in_threadprivate_init)
@@ -1181,6 +1326,10 @@
     VarDecl *VD = cast<VarDecl>(DE->getDecl());
     SourceLocation ILoc = DE->getExprLoc();
 
+    // Mark variable as used.
+    VD->setReferenced();
+    VD->markUsed(Context);
+
     QualType QType = VD->getType();
     if (QType->isDependentType() || QType->isInstantiationDependentType()) {
       // It will be analyzed later.
@@ -1251,7 +1400,7 @@
 }
 
 static void ReportOriginalDSA(Sema &SemaRef, DSAStackTy *Stack,
-                              const VarDecl *VD, DSAStackTy::DSAVarData DVar,
+                              const ValueDecl *D, DSAStackTy::DSAVarData DVar,
                               bool IsLoopIterVar = false) {
   if (DVar.RefExpr) {
     SemaRef.Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_explicit_dsa)
@@ -1271,7 +1420,8 @@
     PDSA_Implicit
   } Reason = PDSA_Implicit;
   bool ReportHint = false;
-  auto ReportLoc = VD->getLocation();
+  auto ReportLoc = D->getLocation();
+  auto *VD = dyn_cast<VarDecl>(D);
   if (IsLoopIterVar) {
     if (DVar.CKind == OMPC_private)
       Reason = PDSA_LoopIterVarPrivate;
@@ -1279,18 +1429,19 @@
       Reason = PDSA_LoopIterVarLastprivate;
     else
       Reason = PDSA_LoopIterVarLinear;
-  } else if (DVar.DKind == OMPD_task && DVar.CKind == OMPC_firstprivate) {
+  } else if (isOpenMPTaskingDirective(DVar.DKind) &&
+             DVar.CKind == OMPC_firstprivate) {
     Reason = PDSA_TaskVarFirstprivate;
     ReportLoc = DVar.ImplicitDSALoc;
-  } else if (VD->isStaticLocal())
+  } else if (VD && VD->isStaticLocal())
     Reason = PDSA_StaticLocalVarShared;
-  else if (VD->isStaticDataMember())
+  else if (VD && VD->isStaticDataMember())
     Reason = PDSA_StaticMemberShared;
-  else if (VD->isFileVarDecl())
+  else if (VD && VD->isFileVarDecl())
     Reason = PDSA_GlobalVarShared;
-  else if (VD->getType().isConstant(SemaRef.getASTContext()))
+  else if (D->getType().isConstant(SemaRef.getASTContext()))
     Reason = PDSA_ConstVarShared;
-  else if (VD->isLocalVarDecl() && DVar.CKind == OMPC_private) {
+  else if (VD && VD->isLocalVarDecl() && DVar.CKind == OMPC_private) {
     ReportHint = true;
     Reason = PDSA_LocalVarPrivate;
   }
@@ -1311,10 +1462,13 @@
   bool ErrorFound;
   CapturedStmt *CS;
   llvm::SmallVector<Expr *, 8> ImplicitFirstprivate;
-  llvm::DenseMap<VarDecl *, Expr *> VarsWithInheritedDSA;
+  llvm::DenseMap<ValueDecl *, Expr *> VarsWithInheritedDSA;
 
 public:
   void VisitDeclRefExpr(DeclRefExpr *E) {
+    if (E->isTypeDependent() || E->isValueDependent() ||
+        E->containsUnexpandedParameterPack() || E->isInstantiationDependent())
+      return;
     if (auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
       // Skip internally declared variables.
       if (VD->isLocalVarDecl() && !CS->capturesVariable(VD))
@@ -1322,7 +1476,8 @@
 
       auto DVar = Stack->getTopDSA(VD, false);
       // Check if the variable has explicit DSA set and stop analysis if it so.
-      if (DVar.RefExpr) return;
+      if (DVar.RefExpr)
+        return;
 
       auto ELoc = E->getExprLoc();
       auto DKind = Stack->getCurrentDirective();
@@ -1341,14 +1496,14 @@
       //  A list item that appears in a reduction clause of the innermost
       //  enclosing worksharing or parallel construct may not be accessed in an
       //  explicit task.
-      DVar = Stack->hasInnermostDSA(VD, MatchesAnyClause(OMPC_reduction),
-                                    [](OpenMPDirectiveKind K) -> bool {
-                                      return isOpenMPParallelDirective(K) ||
-                                             isOpenMPWorksharingDirective(K) ||
-                                             isOpenMPTeamsDirective(K);
-                                    },
-                                    false);
-      if (DKind == OMPD_task && DVar.CKind == OMPC_reduction) {
+      DVar = Stack->hasInnermostDSA(
+          VD, [](OpenMPClauseKind C) -> bool { return C == OMPC_reduction; },
+          [](OpenMPDirectiveKind K) -> bool {
+            return isOpenMPParallelDirective(K) ||
+                   isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K);
+          },
+          false);
+      if (isOpenMPTaskingDirective(DKind) && DVar.CKind == OMPC_reduction) {
         ErrorFound = true;
         SemaRef.Diag(ELoc, diag::err_omp_reduction_in_task);
         ReportOriginalDSA(SemaRef, Stack, VD, DVar);
@@ -1357,10 +1512,52 @@
 
       // Define implicit data-sharing attributes for task.
       DVar = Stack->getImplicitDSA(VD, false);
-      if (DKind == OMPD_task && DVar.CKind != OMPC_shared)
+      if (isOpenMPTaskingDirective(DKind) && DVar.CKind != OMPC_shared &&
+          !Stack->isLoopControlVariable(VD).first)
         ImplicitFirstprivate.push_back(E);
     }
   }
+  void VisitMemberExpr(MemberExpr *E) { // DSA analysis for this->field uses.
+    if (E->isTypeDependent() || E->isValueDependent() ||
+        E->containsUnexpandedParameterPack() || E->isInstantiationDependent())
+      return; // Skip dependent or unexpanded-pack expressions.
+    if (isa<CXXThisExpr>(E->getBase()->IgnoreParens())) {
+      if (auto *FD = dyn_cast<FieldDecl>(E->getMemberDecl())) {
+        auto DVar = Stack->getTopDSA(FD, false);
+        // A non-null RefExpr means the field already has an explicit DSA, so
+        // there is nothing left to analyze.
+        if (DVar.RefExpr)
+          return;
+
+        auto ELoc = E->getExprLoc();
+        auto DKind = Stack->getCurrentDirective();
+        // OpenMP [2.9.3.6, Restrictions, p.2]
+        //  A list item that appears in a reduction clause of the innermost
+        //  enclosing worksharing or parallel construct may not be accessed in
+        //  an explicit task.
+        DVar = Stack->hasInnermostDSA(
+            FD, [](OpenMPClauseKind C) -> bool { return C == OMPC_reduction; },
+            [](OpenMPDirectiveKind K) -> bool {
+              return isOpenMPParallelDirective(K) ||
+                     isOpenMPWorksharingDirective(K) ||
+                     isOpenMPTeamsDirective(K);
+            },
+            false);
+        if (isOpenMPTaskingDirective(DKind) && DVar.CKind == OMPC_reduction) {
+          ErrorFound = true;
+          SemaRef.Diag(ELoc, diag::err_omp_reduction_in_task);
+          ReportOriginalDSA(SemaRef, Stack, FD, DVar);
+          return;
+        }
+
+        // In tasks, record non-shared, non-loop-control fields as firstprivate.
+        DVar = Stack->getImplicitDSA(FD, false);
+        if (isOpenMPTaskingDirective(DKind) && DVar.CKind != OMPC_shared &&
+            !Stack->isLoopControlVariable(FD).first)
+          ImplicitFirstprivate.push_back(E);
+      }
+    }
+  }
   void VisitOMPExecutableDirective(OMPExecutableDirective *S) {
     for (auto *C : S->clauses()) {
       // Skip analysis of arguments of implicitly defined firstprivate clause
@@ -1381,7 +1578,7 @@
 
   bool isErrorFound() { return ErrorFound; }
   ArrayRef<Expr *> getImplicitFirstprivate() { return ImplicitFirstprivate; }
-  llvm::DenseMap<VarDecl *, Expr *> &getVarsWithInheritedDSA() {
+  llvm::DenseMap<ValueDecl *, Expr *> &getVarsWithInheritedDSA() {
     return VarsWithInheritedDSA;
   }
 
@@ -1392,7 +1589,11 @@
 
 void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
   switch (DKind) {
-  case OMPD_parallel: {
+  case OMPD_parallel:
+  case OMPD_parallel_for:
+  case OMPD_parallel_for_simd:
+  case OMPD_parallel_sections:
+  case OMPD_teams: {
     QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
     QualType KmpInt32PtrTy =
         Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
@@ -1405,7 +1606,24 @@
                              Params);
     break;
   }
-  case OMPD_simd: {
+  case OMPD_simd:
+  case OMPD_for:
+  case OMPD_for_simd:
+  case OMPD_sections:
+  case OMPD_section:
+  case OMPD_single:
+  case OMPD_master:
+  case OMPD_critical:
+  case OMPD_taskgroup:
+  case OMPD_distribute:
+  case OMPD_ordered:
+  case OMPD_atomic:
+  case OMPD_target_data:
+  case OMPD_target:
+  case OMPD_target_parallel:
+  case OMPD_target_parallel_for:
+  case OMPD_target_parallel_for_simd:
+  case OMPD_target_simd: {
     Sema::CapturedParamNameType Params[] = {
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
@@ -1413,101 +1631,6 @@
                              Params);
     break;
   }
-  case OMPD_for: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_for_simd: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_sections: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_section: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_single: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_master: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_critical: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_parallel_for: {
-    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
-    QualType KmpInt32PtrTy =
-        Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(".global_tid.", KmpInt32PtrTy),
-        std::make_pair(".bound_tid.", KmpInt32PtrTy),
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_parallel_for_simd: {
-    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
-    QualType KmpInt32PtrTy =
-        Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(".global_tid.", KmpInt32PtrTy),
-        std::make_pair(".bound_tid.", KmpInt32PtrTy),
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_parallel_sections: {
-    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
-    QualType KmpInt32PtrTy =
-        Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(".global_tid.", KmpInt32PtrTy),
-        std::make_pair(".bound_tid.", KmpInt32PtrTy),
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
   case OMPD_task: {
     QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
     QualType Args[] = {Context.VoidPtrTy.withConst().withRestrict()};
@@ -1516,12 +1639,11 @@
     QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
     Sema::CapturedParamNameType Params[] = {
         std::make_pair(".global_tid.", KmpInt32Ty),
-        std::make_pair(".part_id.", KmpInt32Ty),
-        std::make_pair(".privates.",
-                       Context.VoidPtrTy.withConst().withRestrict()),
-        std::make_pair(
-            ".copy_fn.",
-            Context.getPointerType(CopyFnType).withConst().withRestrict()),
+        std::make_pair(".part_id.", Context.getPointerType(KmpInt32Ty)),
+        std::make_pair(".privates.", Context.VoidPtrTy.withConst()),
+        std::make_pair(".copy_fn.",
+                       Context.getPointerType(CopyFnType).withConst()),
+        std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
     ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
@@ -1533,70 +1655,53 @@
             Context, AlwaysInlineAttr::Keyword_forceinline, SourceRange()));
     break;
   }
-  case OMPD_ordered: {
+  case OMPD_taskloop:
+  case OMPD_taskloop_simd: {
+    QualType KmpInt32Ty =
+        Context.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
+    QualType KmpUInt64Ty =
+        Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
+    QualType KmpInt64Ty =
+        Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
+    QualType Args[] = {Context.VoidPtrTy.withConst().withRestrict()};
+    FunctionProtoType::ExtProtoInfo EPI;
+    EPI.Variadic = true;
+    QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
     Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".global_tid.", KmpInt32Ty),
+        std::make_pair(".part_id.", Context.getPointerType(KmpInt32Ty)),
+        std::make_pair(".privates.",
+                       Context.VoidPtrTy.withConst().withRestrict()),
+        std::make_pair(
+            ".copy_fn.",
+            Context.getPointerType(CopyFnType).withConst().withRestrict()),
+        std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
+        std::make_pair(".lb.", KmpUInt64Ty),
+        std::make_pair(".ub.", KmpUInt64Ty), std::make_pair(".st.", KmpInt64Ty),
+        std::make_pair(".liter.", KmpInt32Ty),
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
     ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
                              Params);
+    // Mark this captured region as inlined, because we don't use outlined
+    // function directly.
+    getCurCapturedRegion()->TheCapturedDecl->addAttr(
+        AlwaysInlineAttr::CreateImplicit(
+            Context, AlwaysInlineAttr::Keyword_forceinline, SourceRange()));
     break;
   }
-  case OMPD_atomic: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_target_data:
-  case OMPD_target: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_teams: {
+  case OMPD_distribute_parallel_for_simd:
+  case OMPD_distribute_simd:
+  case OMPD_distribute_parallel_for:
+  case OMPD_teams_distribute: {
     QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
     QualType KmpInt32PtrTy =
         Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
     Sema::CapturedParamNameType Params[] = {
         std::make_pair(".global_tid.", KmpInt32PtrTy),
         std::make_pair(".bound_tid.", KmpInt32PtrTy),
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_taskgroup: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_taskloop: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_taskloop_simd: {
-    Sema::CapturedParamNameType Params[] = {
-        std::make_pair(StringRef(), QualType()) // __context with shared vars
-    };
-    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
-    break;
-  }
-  case OMPD_distribute: {
-    Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".previous.lb.", Context.getSizeType()),
+        std::make_pair(".previous.ub.", Context.getSizeType()),
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
     ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
@@ -1610,12 +1715,78 @@
   case OMPD_cancellation_point:
   case OMPD_cancel:
   case OMPD_flush:
+  case OMPD_target_enter_data:
+  case OMPD_target_exit_data:
+  case OMPD_declare_reduction:
+  case OMPD_declare_simd:
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
+  case OMPD_target_update:
     llvm_unreachable("OpenMP Directive is not allowed");
   case OMPD_unknown:
     llvm_unreachable("Unknown OpenMP directive");
   }
 }
 
+static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id,
+                                             Expr *CaptureExpr, bool WithInit,
+                                             bool AsExpression) {
+  assert(CaptureExpr); // Builds a hidden decl named Id capturing CaptureExpr.
+  ASTContext &C = S.getASTContext();
+  Expr *Init = AsExpression ? CaptureExpr : CaptureExpr->IgnoreImpCasts();
+  QualType Ty = Init->getType();
+  if (CaptureExpr->getObjectKind() == OK_Ordinary && CaptureExpr->isGLValue()) {
+    if (S.getLangOpts().CPlusPlus) // C++: capture the glvalue by reference.
+      Ty = C.getLValueReferenceType(Ty);
+    else { // Otherwise: capture a pointer, initialized with &Init.
+      Ty = C.getPointerType(Ty);
+      ExprResult Res =
+          S.CreateBuiltinUnaryOp(CaptureExpr->getExprLoc(), UO_AddrOf, Init);
+      if (!Res.isUsable())
+        return nullptr; // Address-of failed; no declaration is created.
+      Init = Res.get();
+    }
+    WithInit = true; // Glvalue captures never get the no-init attribute.
+  }
+  auto *CED = OMPCapturedExprDecl::Create(C, S.CurContext, Id, Ty);
+  if (!WithInit) // Tag the decl so later stages can tell no init was asked.
+    CED->addAttr(OMPCaptureNoInitAttr::CreateImplicit(C, SourceRange()));
+  S.CurContext->addHiddenDecl(CED);
+  S.AddInitializerToDecl(CED, Init, /*DirectInit=*/false,
+                         /*TypeMayContainAuto=*/true);
+  return CED;
+}
+
+static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
+                                 bool WithInit) { // Ref to D's capture decl.
+  OMPCapturedExprDecl *CD;
+  if (auto *VD = S.IsOpenMPCapturedDecl(D))
+    CD = cast<OMPCapturedExprDecl>(VD); // Reuse what the lookup found.
+  else
+    CD = buildCaptureDecl(S, D->getIdentifier(), CaptureExpr, WithInit,
+                          /*AsExpression=*/false); // NOTE(review): can be null
+  return buildDeclRefExpr(S, CD, CD->getType().getNonReferenceType(),
+                          CaptureExpr->getExprLoc());
+}
+
+static ExprResult buildCapture(Sema &S, Expr *CaptureExpr, DeclRefExpr *&Ref) {
+  if (!Ref) { // No cached reference yet: build the ".capture_expr." decl.
+    auto *CD = // NOTE(review): buildCaptureDecl can return null - confirm.
+        buildCaptureDecl(S, &S.getASTContext().Idents.get(".capture_expr."),
+                         CaptureExpr, /*WithInit=*/true, /*AsExpression=*/true);
+    Ref = buildDeclRefExpr(S, CD, CD->getType().getNonReferenceType(),
+                           CaptureExpr->getExprLoc());
+  }
+  ExprResult Res = Ref;
+  if (!S.getLangOpts().CPlusPlus &&
+      CaptureExpr->getObjectKind() == OK_Ordinary && CaptureExpr->isGLValue() &&
+      Ref->getType()->isPointerType()) // Non-C++ glvalue held as a pointer.
+    Res = S.CreateBuiltinUnaryOp(CaptureExpr->getExprLoc(), UO_Deref, Ref);
+  if (!Res.isUsable())
+    return ExprError(); // The dereference could not be built.
+  return CaptureExpr->isGLValue() ? Res : S.DefaultLvalueConversion(Res.get());
+}
+
 StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S,
                                       ArrayRef<OMPClause *> Clauses) {
   if (!S.isUsable()) {
@@ -1641,14 +1812,20 @@
         }
       }
       DSAStack->setForceVarCapturing(/*V=*/false);
-    } else if (isParallelOrTaskRegion(DSAStack->getCurrentDirective()) &&
-               Clause->getClauseKind() == OMPC_schedule) {
+    } else if (isParallelOrTaskRegion(DSAStack->getCurrentDirective())) {
       // Mark all variables in private list clauses as used in inner region.
       // Required for proper codegen of combined directives.
       // TODO: add processing for other clauses.
-      if (auto *E = cast_or_null<Expr>(
-              cast<OMPScheduleClause>(Clause)->getHelperChunkSize()))
-        MarkDeclarationsReferencedInExpr(E);
+      if (auto *C = OMPClauseWithPreInit::get(Clause)) {
+        if (auto *DS = cast_or_null<DeclStmt>(C->getPreInitStmt())) {
+          for (auto *D : DS->decls())
+            MarkVariableReferenced(D->getLocation(), cast<VarDecl>(D));
+        }
+      }
+      if (auto *C = OMPClauseWithPostUpdate::get(Clause)) {
+        if (auto *E = C->getPostUpdateExpr())
+          MarkDeclarationsReferencedInExpr(E);
+      }
     }
     if (Clause->getClauseKind() == OMPC_schedule)
       SC = cast<OMPScheduleClause>(Clause);
@@ -1724,13 +1901,27 @@
   // | parallel         | ordered         | +                                  |
   // | parallel         | atomic          | *                                  |
   // | parallel         | target          | *                                  |
+  // | parallel         | target parallel | *                                  |
+  // | parallel         | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | parallel         | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | parallel         | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | parallel         | teams           | +                                  |
   // | parallel         | cancellation    |                                    |
   // |                  | point           | !                                  |
   // | parallel         | cancel          | !                                  |
   // | parallel         | taskloop        | *                                  |
   // | parallel         | taskloop simd   | *                                  |
-  // | parallel         | distribute      |                                    |  
+  // | parallel         | distribute      | +                                  |
+  // | parallel         | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | parallel         | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | parallel         | distribute simd | +                                  |
+  // | parallel         | target simd     | *                                  |
+  // | parallel         | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | for              | parallel        | *                                  |
   // | for              | for             | +                                  |
@@ -1753,13 +1944,29 @@
   // | for              | ordered         | * (if construct is ordered)        |
   // | for              | atomic          | *                                  |
   // | for              | target          | *                                  |
+  // | for              | target parallel | *                                  |
+  // | for              | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | for              | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | for              | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | for              | teams           | +                                  |
   // | for              | cancellation    |                                    |
   // |                  | point           | !                                  |
   // | for              | cancel          | !                                  |
   // | for              | taskloop        | *                                  |
   // | for              | taskloop simd   | *                                  |
-  // | for              | distribute      |                                    |
+  // | for              | distribute      | +                                  |
+  // | for              | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | for              | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | for              | distribute simd | +                                  |
+  // | for              | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | for              | target simd     | *                                  |
+  // | for              | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | master           | parallel        | *                                  |
   // | master           | for             | +                                  |
@@ -1782,13 +1989,29 @@
   // | master           | ordered         | +                                  |
   // | master           | atomic          | *                                  |
   // | master           | target          | *                                  |
+  // | master           | target parallel | *                                  |
+  // | master           | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | master           | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | master           | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | master           | teams           | +                                  |
   // | master           | cancellation    |                                    |
   // |                  | point           |                                    |
   // | master           | cancel          |                                    |
   // | master           | taskloop        | *                                  |
   // | master           | taskloop simd   | *                                  |
-  // | master           | distribute      |                                    |
+  // | master           | distribute      | +                                  |
+  // | master           | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | master           | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | master           | distribute simd | +                                  |
+  // | master           | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | master           | target simd     | *                                  |
+  // | master           | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | critical         | parallel        | *                                  |
   // | critical         | for             | +                                  |
@@ -1810,20 +2033,36 @@
   // | critical         | ordered         | +                                  |
   // | critical         | atomic          | *                                  |
   // | critical         | target          | *                                  |
+  // | critical         | target parallel | *                                  |
+  // | critical         | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | critical         | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | critical         | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | critical         | teams           | +                                  |
   // | critical         | cancellation    |                                    |
   // |                  | point           |                                    |
   // | critical         | cancel          |                                    |
   // | critical         | taskloop        | *                                  |
   // | critical         | taskloop simd   | *                                  |
-  // | critical         | distribute      |                                    |
+  // | critical         | distribute      | +                                  |
+  // | critical         | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | critical         | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | critical         | distribute simd | +                                  |
+  // | critical         | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | critical         | target simd     | *                                  |
+  // | critical         | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | simd             | parallel        |                                    |
   // | simd             | for             |                                    |
   // | simd             | for simd        |                                    |
   // | simd             | master          |                                    |
   // | simd             | critical        |                                    |
-  // | simd             | simd            |                                    |
+  // | simd             | simd            | *                                  |
   // | simd             | sections        |                                    |
   // | simd             | section         |                                    |
   // | simd             | single          |                                    |
@@ -1839,6 +2078,13 @@
   // | simd             | ordered         | + (with simd clause)               |
   // | simd             | atomic          |                                    |
   // | simd             | target          |                                    |
+  // | simd             | target parallel |                                    |
+  // | simd             | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | simd             | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | simd             | target exit     |                                    |
+  // |                  | data            |                                    |
   // | simd             | teams           |                                    |
   // | simd             | cancellation    |                                    |
   // |                  | point           |                                    |
@@ -1846,13 +2092,22 @@
   // | simd             | taskloop        |                                    |
   // | simd             | taskloop simd   |                                    |
   // | simd             | distribute      |                                    |
+  // | simd             | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | simd             | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | simd             | distribute simd |                                    |
+  // | simd             | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | simd             | target simd     |                                    |
+  // | simd             | teams distribute|                                    |
   // +------------------+-----------------+------------------------------------+
   // | for simd         | parallel        |                                    |
   // | for simd         | for             |                                    |
   // | for simd         | for simd        |                                    |
   // | for simd         | master          |                                    |
   // | for simd         | critical        |                                    |
-  // | for simd         | simd            |                                    |
+  // | for simd         | simd            | *                                  |
   // | for simd         | sections        |                                    |
   // | for simd         | section         |                                    |
   // | for simd         | single          |                                    |
@@ -1868,6 +2123,13 @@
   // | for simd         | ordered         | + (with simd clause)               |
   // | for simd         | atomic          |                                    |
   // | for simd         | target          |                                    |
+  // | for simd         | target parallel |                                    |
+  // | for simd         | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | for simd         | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | for simd         | target exit     |                                    |
+  // |                  | data            |                                    |
   // | for simd         | teams           |                                    |
   // | for simd         | cancellation    |                                    |
   // |                  | point           |                                    |
@@ -1875,13 +2137,22 @@
   // | for simd         | taskloop        |                                    |
   // | for simd         | taskloop simd   |                                    |
   // | for simd         | distribute      |                                    |
+  // | for simd         | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | for simd         | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | for simd         | distribute simd |                                    |
+  // | for simd         | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | for simd         | target simd     |                                    |
+  // | for simd         | teams distribute|                                    |
   // +------------------+-----------------+------------------------------------+
   // | parallel for simd| parallel        |                                    |
   // | parallel for simd| for             |                                    |
   // | parallel for simd| for simd        |                                    |
   // | parallel for simd| master          |                                    |
   // | parallel for simd| critical        |                                    |
-  // | parallel for simd| simd            |                                    |
+  // | parallel for simd| simd            | *                                  |
   // | parallel for simd| sections        |                                    |
   // | parallel for simd| section         |                                    |
   // | parallel for simd| single          |                                    |
@@ -1897,6 +2168,13 @@
   // | parallel for simd| ordered         | + (with simd clause)               |
   // | parallel for simd| atomic          |                                    |
   // | parallel for simd| target          |                                    |
+  // | parallel for simd| target parallel |                                    |
+  // | parallel for simd| target parallel |                                    |
+  // |                  | for             |                                    |
+  // | parallel for simd| target enter    |                                    |
+  // |                  | data            |                                    |
+  // | parallel for simd| target exit     |                                    |
+  // |                  | data            |                                    |
   // | parallel for simd| teams           |                                    |
   // | parallel for simd| cancellation    |                                    |
   // |                  | point           |                                    |
@@ -1904,6 +2182,14 @@
   // | parallel for simd| taskloop        |                                    |
   // | parallel for simd| taskloop simd   |                                    |
   // | parallel for simd| distribute      |                                    |
+  // | parallel for simd| distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | parallel for simd| distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | parallel for simd| distribute simd |                                    |
+  // |                  | for simd        |                                    |
+  // | parallel for simd| target simd     |                                    |
+  // | parallel for simd| teams distribute|                                    |
   // +------------------+-----------------+------------------------------------+
   // | sections         | parallel        | *                                  |
   // | sections         | for             | +                                  |
@@ -1926,13 +2212,28 @@
   // | sections         | ordered         | +                                  |
   // | sections         | atomic          | *                                  |
   // | sections         | target          | *                                  |
+  // | sections         | target parallel | *                                  |
+  // | sections         | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | sections         | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | sections         | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | sections         | teams           | +                                  |
   // | sections         | cancellation    |                                    |
   // |                  | point           | !                                  |
   // | sections         | cancel          | !                                  |
   // | sections         | taskloop        | *                                  |
   // | sections         | taskloop simd   | *                                  |
-  // | sections         | distribute      |                                    |
+  // | sections         | distribute      | +                                  |
+  // | sections         | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | sections         | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | sections         | distribute simd | +                                  |
+  // | sections         | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | sections         | target simd     | *                                  |
   // +------------------+-----------------+------------------------------------+
   // | section          | parallel        | *                                  |
   // | section          | for             | +                                  |
@@ -1955,13 +2256,29 @@
   // | section          | ordered         | +                                  |
   // | section          | atomic          | *                                  |
   // | section          | target          | *                                  |
+  // | section          | target parallel | *                                  |
+  // | section          | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | section          | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | section          | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | section          | teams           | +                                  |
   // | section          | cancellation    |                                    |
   // |                  | point           | !                                  |
   // | section          | cancel          | !                                  |
   // | section          | taskloop        | *                                  |
   // | section          | taskloop simd   | *                                  |
-  // | section          | distribute      |                                    |
+  // | section          | distribute      | +                                  |
+  // | section          | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | section          | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | section          | distribute simd | +                                  |
+  // | section          | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | section          | target simd     | *                                  |
+  // | section          | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | single           | parallel        | *                                  |
   // | single           | for             | +                                  |
@@ -1984,13 +2301,29 @@
   // | single           | ordered         | +                                  |
   // | single           | atomic          | *                                  |
   // | single           | target          | *                                  |
+  // | single           | target parallel | *                                  |
+  // | single           | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | single           | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | single           | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | single           | teams           | +                                  |
   // | single           | cancellation    |                                    |
   // |                  | point           |                                    |
   // | single           | cancel          |                                    |
   // | single           | taskloop        | *                                  |
   // | single           | taskloop simd   | *                                  |
-  // | single           | distribute      |                                    |
+  // | single           | distribute      | +                                  |
+  // | single           | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | single           | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | single           | distribute simd | +                                  |
+  // | single           | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | single           | target simd     | *                                  |
+  // | single           | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | parallel for     | parallel        | *                                  |
   // | parallel for     | for             | +                                  |
@@ -2013,13 +2346,29 @@
   // | parallel for     | ordered         | * (if construct is ordered)        |
   // | parallel for     | atomic          | *                                  |
   // | parallel for     | target          | *                                  |
+  // | parallel for     | target parallel | *                                  |
+  // | parallel for     | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | parallel for     | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | parallel for     | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | parallel for     | teams           | +                                  |
   // | parallel for     | cancellation    |                                    |
   // |                  | point           | !                                  |
   // | parallel for     | cancel          | !                                  |
   // | parallel for     | taskloop        | *                                  |
   // | parallel for     | taskloop simd   | *                                  |
-  // | parallel for     | distribute      |                                    |
+  // | parallel for     | distribute      | +                                  |
+  // | parallel for     | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | parallel for     | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | parallel for     | distribute simd | +                                  |
+  // | parallel for     | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | parallel for     | target simd     | *                                  |
+  // | parallel for     | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | parallel sections| parallel        | *                                  |
   // | parallel sections| for             | +                                  |
@@ -2042,13 +2391,29 @@
   // | parallel sections| ordered         | +                                  |
   // | parallel sections| atomic          | *                                  |
   // | parallel sections| target          | *                                  |
+  // | parallel sections| target parallel | *                                  |
+  // | parallel sections| target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | parallel sections| target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | parallel sections| target exit     | *                                  |
+  // |                  | data            |                                    |
   // | parallel sections| teams           | +                                  |
   // | parallel sections| cancellation    |                                    |
   // |                  | point           | !                                  |
   // | parallel sections| cancel          | !                                  |
   // | parallel sections| taskloop        | *                                  |
   // | parallel sections| taskloop simd   | *                                  |
-  // | parallel sections| distribute      |                                    | 
+  // | parallel sections| distribute      | +                                  |
+  // | parallel sections| distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | parallel sections| distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | parallel sections| distribute simd | +                                  |
+  // | parallel sections| target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | parallel sections| target simd     | *                                  |
+  // | parallel sections| teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | task             | parallel        | *                                  |
   // | task             | for             | +                                  |
@@ -2071,13 +2436,29 @@
   // | task             | ordered         | +                                  |
   // | task             | atomic          | *                                  |
   // | task             | target          | *                                  |
+  // | task             | target parallel | *                                  |
+  // | task             | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | task             | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | task             | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | task             | teams           | +                                  |
   // | task             | cancellation    |                                    |
   // |                  | point           | !                                  |
   // | task             | cancel          | !                                  |
   // | task             | taskloop        | *                                  |
   // | task             | taskloop simd   | *                                  |
-  // | task             | distribute      |                                    |
+  // | task             | distribute      | +                                  |
+  // | task             | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | task             | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | task             | distribute simd | +                                  |
+  // | task             | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | task             | target simd     | *                                  |
+  // | task             | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | ordered          | parallel        | *                                  |
   // | ordered          | for             | +                                  |
@@ -2100,13 +2481,29 @@
   // | ordered          | ordered         | +                                  |
   // | ordered          | atomic          | *                                  |
   // | ordered          | target          | *                                  |
+  // | ordered          | target parallel | *                                  |
+  // | ordered          | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | ordered          | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | ordered          | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | ordered          | teams           | +                                  |
   // | ordered          | cancellation    |                                    |
   // |                  | point           |                                    |
   // | ordered          | cancel          |                                    |
   // | ordered          | taskloop        | *                                  |
   // | ordered          | taskloop simd   | *                                  |
-  // | ordered          | distribute      |                                    |
+  // | ordered          | distribute      | +                                  |
+  // | ordered          | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | ordered          | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | ordered          | distribute simd | +                                  |
+  // | ordered          | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | ordered          | target simd     | *                                  |
+  // | ordered          | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | atomic           | parallel        |                                    |
   // | atomic           | for             |                                    |
@@ -2129,13 +2526,29 @@
   // | atomic           | ordered         |                                    |
   // | atomic           | atomic          |                                    |
   // | atomic           | target          |                                    |
+  // | atomic           | target parallel |                                    |
+  // | atomic           | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | atomic           | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | atomic           | target exit     |                                    |
+  // |                  | data            |                                    |
   // | atomic           | teams           |                                    |
   // | atomic           | cancellation    |                                    |
   // |                  | point           |                                    |
   // | atomic           | cancel          |                                    |
   // | atomic           | taskloop        |                                    |
   // | atomic           | taskloop simd   |                                    |
-  // | atomic           | distribute      |                                    | 
+  // | atomic           | distribute      |                                    |
+  // | atomic           | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | atomic           | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | atomic           | distribute simd |                                    |
+  // | atomic           | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | atomic           | target simd     |                                    |
+  // | atomic           | teams distribute|                                    |
   // +------------------+-----------------+------------------------------------+
   // | target           | parallel        | *                                  |
   // | target           | for             | *                                  |
@@ -2157,14 +2570,150 @@
   // | target           | flush           | *                                  |
   // | target           | ordered         | *                                  |
   // | target           | atomic          | *                                  |
-  // | target           | target          | *                                  |
+  // | target           | target          |                                    |
+  // | target           | target parallel |                                    |
+  // | target           | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | target           | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | target           | target exit     |                                    |
+  // |                  | data            |                                    |
   // | target           | teams           | *                                  |
   // | target           | cancellation    |                                    |
   // |                  | point           |                                    |
   // | target           | cancel          |                                    |
   // | target           | taskloop        | *                                  |
   // | target           | taskloop simd   | *                                  |
-  // | target           | distribute      |                                    |
+  // | target           | distribute      | +                                  |
+  // | target           | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | target           | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | target           | distribute simd | +                                  |
+  // | target           | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | target           | target simd     |                                    |
+  // | target           | teams distribute|                                    |
+  // +------------------+-----------------+------------------------------------+
+  // | target parallel  | parallel        | *                                  |
+  // | target parallel  | for             | *                                  |
+  // | target parallel  | for simd        | *                                  |
+  // | target parallel  | master          | *                                  |
+  // | target parallel  | critical        | *                                  |
+  // | target parallel  | simd            | *                                  |
+  // | target parallel  | sections        | *                                  |
+  // | target parallel  | section         | *                                  |
+  // | target parallel  | single          | *                                  |
+  // | target parallel  | parallel for    | *                                  |
+  // | target parallel  |parallel for simd| *                                  |
+  // | target parallel  |parallel sections| *                                  |
+  // | target parallel  | task            | *                                  |
+  // | target parallel  | taskyield       | *                                  |
+  // | target parallel  | barrier         | *                                  |
+  // | target parallel  | taskwait        | *                                  |
+  // | target parallel  | taskgroup       | *                                  |
+  // | target parallel  | flush           | *                                  |
+  // | target parallel  | ordered         | *                                  |
+  // | target parallel  | atomic          | *                                  |
+  // | target parallel  | target          |                                    |
+  // | target parallel  | target parallel |                                    |
+  // | target parallel  | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | target parallel  | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | target parallel  | target exit     |                                    |
+  // |                  | data            |                                    |
+  // | target parallel  | teams           |                                    |
+  // | target parallel  | cancellation    |                                    |
+  // |                  | point           | !                                  |
+  // | target parallel  | cancel          | !                                  |
+  // | target parallel  | taskloop        | *                                  |
+  // | target parallel  | taskloop simd   | *                                  |
+  // | target parallel  | distribute      |                                    |
+  // | target parallel  | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | target parallel  | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | target parallel  | distribute simd |                                    |
+  // | target parallel  | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | target parallel  | target simd     |                                    |
+  // | target parallel  | teams distribute| +                                  |
+  // +------------------+-----------------+------------------------------------+
+  // | target parallel  | parallel        | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | for             | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | for simd        | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | master          | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | critical        | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | simd            | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | sections        | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | section         | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | single          | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | parallel for    | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  |parallel for simd| *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  |parallel sections| *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | task            | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | taskyield       | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | barrier         | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | taskwait        | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | taskgroup       | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | flush           | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | ordered         | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | atomic          | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | target          |                                    |
+  // | for              |                 |                                    |
+  // | target parallel  | target parallel |                                    |
+  // | for              |                 |                                    |
+  // | target parallel  | target parallel |                                    |
+  // | for              | for             |                                    |
+  // | target parallel  | target enter    |                                    |
+  // | for              | data            |                                    |
+  // | target parallel  | target exit     |                                    |
+  // | for              | data            |                                    |
+  // | target parallel  | teams           |                                    |
+  // | for              |                 |                                    |
+  // | target parallel  | cancellation    |                                    |
+  // | for              | point           | !                                  |
+  // | target parallel  | cancel          | !                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | taskloop        | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | taskloop simd   | *                                  |
+  // | for              |                 |                                    |
+  // | target parallel  | distribute      |                                    |
+  // | for              |                 |                                    |
+  // | target parallel  | distribute      |                                    |
+  // | for              | parallel for    |                                    |
+  // | target parallel  | distribute      |                                    |
+  // | for              |parallel for simd|                                    |
+  // | target parallel  | distribute simd |                                    |
+  // | for              |                 |                                    |
+  // | target parallel  | target parallel |                                    |
+  // | for              | for simd        |                                    |
+  // | target parallel  | target simd     |                                    |
+  // | for              |                 |                                    |
+  // | target parallel  | teams distribute|                                    |
+  // | for              |                 |                                    |
   // +------------------+-----------------+------------------------------------+
   // | teams            | parallel        | *                                  |
   // | teams            | for             | +                                  |
@@ -2187,6 +2736,13 @@
   // | teams            | ordered         | +                                  |
   // | teams            | atomic          | +                                  |
   // | teams            | target          | +                                  |
+  // | teams            | target parallel | +                                  |
+  // | teams            | target parallel | +                                  |
+  // |                  | for             |                                    |
+  // | teams            | target enter    | +                                  |
+  // |                  | data            |                                    |
+  // | teams            | target exit     | +                                  |
+  // |                  | data            |                                    |
   // | teams            | teams           | +                                  |
   // | teams            | cancellation    |                                    |
   // |                  | point           |                                    |
@@ -2194,6 +2750,15 @@
   // | teams            | taskloop        | +                                  |
   // | teams            | taskloop simd   | +                                  |
   // | teams            | distribute      | !                                  |
+  // | teams            | distribute      | !                                  |
+  // |                  | parallel for    |                                    |
+  // | teams            | distribute      | !                                  |
+  // |                  |parallel for simd|                                    |
+  // | teams            | distribute simd | !                                  |
+  // | teams            | target parallel | +                                  |
+  // |                  | for simd        |                                    |
+  // | teams            | target simd     | +                                  |
+  // | teams            | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | taskloop         | parallel        | *                                  |
   // | taskloop         | for             | +                                  |
@@ -2216,19 +2781,35 @@
   // | taskloop         | ordered         | +                                  |
   // | taskloop         | atomic          | *                                  |
   // | taskloop         | target          | *                                  |
+  // | taskloop         | target parallel | *                                  |
+  // | taskloop         | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | taskloop         | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | taskloop         | target exit     | *                                  |
+  // |                  | data            |                                    |
   // | taskloop         | teams           | +                                  |
   // | taskloop         | cancellation    |                                    |
   // |                  | point           |                                    |
   // | taskloop         | cancel          |                                    |
   // | taskloop         | taskloop        | *                                  |
-  // | taskloop         | distribute      |                                    |
+  // | taskloop         | distribute      | +                                  |
+  // | taskloop         | distribute      | +                                  |
+  // |                  | parallel for    |                                    |
+  // | taskloop         | distribute      | +                                  |
+  // |                  |parallel for simd|                                    |
+  // | taskloop         | distribute simd | +                                  |
+  // | taskloop         | target parallel | *                                  |
+  // |                  | for simd        |                                    |
+  // | taskloop         | target simd     | *                                  |
+  // | taskloop         | teams distribute| +                                  |
   // +------------------+-----------------+------------------------------------+
   // | taskloop simd    | parallel        |                                    |
   // | taskloop simd    | for             |                                    |
   // | taskloop simd    | for simd        |                                    |
   // | taskloop simd    | master          |                                    |
   // | taskloop simd    | critical        |                                    |
-  // | taskloop simd    | simd            |                                    |
+  // | taskloop simd    | simd            | *                                  |
   // | taskloop simd    | sections        |                                    |
   // | taskloop simd    | section         |                                    |
   // | taskloop simd    | single          |                                    |
@@ -2244,6 +2825,13 @@
   // | taskloop simd    | ordered         | + (with simd clause)               |
   // | taskloop simd    | atomic          |                                    |
   // | taskloop simd    | target          |                                    |
+  // | taskloop simd    | target parallel |                                    |
+  // | taskloop simd    | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | taskloop simd    | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | taskloop simd    | target exit     |                                    |
+  // |                  | data            |                                    |
   // | taskloop simd    | teams           |                                    |
   // | taskloop simd    | cancellation    |                                    |
   // |                  | point           |                                    |
@@ -2251,6 +2839,15 @@
   // | taskloop simd    | taskloop        |                                    |
   // | taskloop simd    | taskloop simd   |                                    |
   // | taskloop simd    | distribute      |                                    |
+  // | taskloop simd    | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | taskloop simd    | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | taskloop simd    | distribute simd |                                    |
+  // | taskloop simd    | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | taskloop simd    | target simd     |                                    |
+  // | taskloop simd    | teams distribute|                                    |
   // +------------------+-----------------+------------------------------------+
   // | distribute       | parallel        | *                                  |
   // | distribute       | for             | *                                  |
@@ -2273,6 +2870,13 @@
   // | distribute       | ordered         | +                                  |
   // | distribute       | atomic          | *                                  |
   // | distribute       | target          |                                    |
+  // | distribute       | target parallel |                                    |
+  // | distribute       | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | distribute       | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | distribute       | target exit     |                                    |
+  // |                  | data            |                                    |
   // | distribute       | teams           |                                    |
   // | distribute       | cancellation    | +                                  |
   // |                  | point           |                                    |
@@ -2280,11 +2884,382 @@
   // | distribute       | taskloop        | *                                  |
   // | distribute       | taskloop simd   | *                                  |
   // | distribute       | distribute      |                                    |
+  // | distribute       | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | distribute       | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | distribute       | distribute simd |                                    |
+  // | distribute       | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | distribute       | target simd     |                                    |
+  // | distribute       | teams distribute|                                    |
+  // +------------------+-----------------+------------------------------------+
+  // | distribute       | parallel        | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | for             | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | for simd        | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | master          | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | critical        | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | simd            | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | sections        | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | section         | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | single          | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | parallel for    | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       |parallel for simd| *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       |parallel sections| *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | task            | *                                  |
+  // | parallel for     |                 |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | taskyield       | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | barrier         | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | taskwait        | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | taskgroup       | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | flush           | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | ordered         | +                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | atomic          | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | target          |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | target parallel |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | target parallel |                                    |
+  // | parallel for     | for             |                                    |
+  // | distribute       | target enter    |                                    |
+  // | parallel for     | data            |                                    |
+  // | distribute       | target exit     |                                    |
+  // | parallel for     | data            |                                    |
+  // | distribute       | teams           |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | cancellation    | +                                  |
+  // | parallel for     | point           |                                    |
+  // | distribute       | cancel          | +                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | taskloop        | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | taskloop simd   | *                                  |
+  // | parallel for     |                 |                                    |
+  // | distribute       | distribute      |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | distribute      |                                    |
+  // | parallel for     | parallel for    |                                    |
+  // | distribute       | distribute      |                                    |
+  // | parallel for     |parallel for simd|                                    |
+  // | distribute       | distribute simd |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | target parallel |                                    |
+  // | parallel for     | for simd        |                                    |
+  // | distribute       | target simd     |                                    |
+  // | parallel for     |                 |                                    |
+  // | distribute       | teams distribute|                                    |
+  // | parallel for     |                 |                                    |
+  // +------------------+-----------------+------------------------------------+
+  // | distribute       | parallel        | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | for             | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | for simd        | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | master          | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | critical        | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | simd            | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | sections        | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | section         | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | single          | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | parallel for    | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       |parallel for simd| *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       |parallel sections| *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | task            | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | taskyield       | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | barrier         | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | taskwait        | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | taskgroup       | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | flush           | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | ordered         | +                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | atomic          | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | target          |                                    |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | target parallel |                                    |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | target parallel |                                    |
+  // | parallel for simd| for             |                                    |
+  // | distribute       | target enter    |                                    |
+  // | parallel for simd| data            |                                    |
+  // | distribute       | target exit     |                                    |
+  // | parallel for simd| data            |                                    |
+  // | distribute       | teams           |                                    |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | cancellation    | +                                  |
+  // | parallel for simd| point           |                                    |
+  // | distribute       | cancel          | +                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | taskloop        | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | taskloop simd   | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | distribute      |                                    |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | distribute      | *                                  |
+  // | parallel for simd| parallel for    |                                    |
+  // | distribute       | distribute      | *                                  |
+  // | parallel for simd|parallel for simd|                                    |
+  // | distribute       | distribute simd | *                                  |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | target parallel |                                    |
+  // | parallel for simd| for simd        |                                    |
+  // | distribute       | target simd     |                                    |
+  // | parallel for simd|                 |                                    |
+  // | distribute       | teams distribute|                                    |
+  // | parallel for simd|                 |                                    |
+  // +------------------+-----------------+------------------------------------+
+  // | distribute simd  | parallel        | *                                  |
+  // | distribute simd  | for             | *                                  |
+  // | distribute simd  | for simd        | *                                  |
+  // | distribute simd  | master          | *                                  |
+  // | distribute simd  | critical        | *                                  |
+  // | distribute simd  | simd            | *                                  |
+  // | distribute simd  | sections        | *                                  |
+  // | distribute simd  | section         | *                                  |
+  // | distribute simd  | single          | *                                  |
+  // | distribute simd  | parallel for    | *                                  |
+  // | distribute simd  |parallel for simd| *                                  |
+  // | distribute simd  |parallel sections| *                                  |
+  // | distribute simd  | task            | *                                  |
+  // | distribute simd  | taskyield       | *                                  |
+  // | distribute simd  | barrier         | *                                  |
+  // | distribute simd  | taskwait        | *                                  |
+  // | distribute simd  | taskgroup       | *                                  |
+  // | distribute simd  | flush           | *                                  |
+  // | distribute simd  | ordered         | +                                  |
+  // | distribute simd  | atomic          | *                                  |
+  // | distribute simd  | target          | *                                  |
+  // | distribute simd  | target parallel | *                                  |
+  // | distribute simd  | target parallel | *                                  |
+  // |                  | for             |                                    |
+  // | distribute simd  | target enter    | *                                  |
+  // |                  | data            |                                    |
+  // | distribute simd  | target exit     | *                                  |
+  // |                  | data            |                                    |
+  // | distribute simd  | teams           | *                                  |
+  // | distribute simd  | cancellation    | +                                  |
+  // |                  | point           |                                    |
+  // | distribute simd  | cancel          | +                                  |
+  // | distribute simd  | taskloop        | *                                  |
+  // | distribute simd  | taskloop simd   | *                                  |
+  // | distribute simd  | distribute      |                                    |
+  // | distribute simd  | distribute      | *                                  |
+  // |                  | parallel for    |                                    |
+  // | distribute simd  | distribute      | *                                  |
+  // |                  |parallel for simd|                                    |
+  // | distribute simd  | distribute simd | *                                  |
+  // | distribute simd  | target parallel | *                                  |
+  // |                  | for simd        |                                    |
+  // | distribute simd  | target simd     | *                                  |
+  // | distribute simd  | teams distribute| *                                  |
+  // +------------------+-----------------+------------------------------------+
+  // | target parallel  | parallel        | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | for             | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | for simd        | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | master          | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | critical        | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | simd            | !                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | sections        | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | section         | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | single          | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | parallel for    | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  |parallel for simd| *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  |parallel sections| *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | task            | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | taskyield       | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | barrier         | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | taskwait        | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | taskgroup       | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | flush           | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | ordered         | + (with simd clause)               |
+  // | for simd         |                 |                                    |
+  // | target parallel  | atomic          | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | target          | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | target parallel | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | target parallel | *                                  |
+  // | for simd         | for             |                                    |
+  // | target parallel  | target enter    | *                                  |
+  // | for simd         | data            |                                    |
+  // | target parallel  | target exit     | *                                  |
+  // | for simd         | data            |                                    |
+  // | target parallel  | teams           | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | cancellation    | *                                  |
+  // | for simd         | point           |                                    |
+  // | target parallel  | cancel          | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | taskloop        | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | taskloop simd   | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | distribute      | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | distribute      | *                                  |
+  // | for simd         | parallel for    |                                    |
+  // | target parallel  | distribute      | *                                  |
+  // | for simd         |parallel for simd|                                    |
+  // | target parallel  | distribute simd | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | target parallel | *                                  |
+  // | for simd         | for simd        |                                    |
+  // | target parallel  | target simd     | *                                  |
+  // | for simd         |                 |                                    |
+  // | target parallel  | teams distribute| *                                  |
+  // | for simd         |                 |                                    |
+  // +------------------+-----------------+------------------------------------+
+  // | target simd      | parallel        |                                    |
+  // | target simd      | for             |                                    |
+  // | target simd      | for simd        |                                    |
+  // | target simd      | master          |                                    |
+  // | target simd      | critical        |                                    |
+  // | target simd      | simd            |                                    |
+  // | target simd      | sections        |                                    |
+  // | target simd      | section         |                                    |
+  // | target simd      | single          |                                    |
+  // | target simd      | parallel for    |                                    |
+  // | target simd      |parallel for simd|                                    |
+  // | target simd      |parallel sections|                                    |
+  // | target simd      | task            |                                    |
+  // | target simd      | taskyield       |                                    |
+  // | target simd      | barrier         |                                    |
+  // | target simd      | taskwait        |                                    |
+  // | target simd      | taskgroup       |                                    |
+  // | target simd      | flush           |                                    |
+  // | target simd      | ordered         | + (with simd clause)               |
+  // | target simd      | atomic          |                                    |
+  // | target simd      | target          |                                    |
+  // | target simd      | target parallel |                                    |
+  // | target simd      | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | target simd      | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | target simd      | target exit     |                                    |
+  // |                  | data            |                                    |
+  // | target simd      | teams           |                                    |
+  // | target simd      | cancellation    |                                    |
+  // |                  | point           |                                    |
+  // | target simd      | cancel          |                                    |
+  // | target simd      | taskloop        |                                    |
+  // | target simd      | taskloop simd   |                                    |
+  // | target simd      | distribute      |                                    |
+  // | target simd      | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | target simd      | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | target simd      | distribute simd |                                    |
+  // | target simd      | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | target simd      | target simd     |                                    |
+  // | target simd      | teams distribute|                                    |
+  // +------------------+-----------------+------------------------------------+
+  // | teams distribute | parallel        |                                    |
+  // | teams distribute | for             |                                    |
+  // | teams distribute | for simd        |                                    |
+  // | teams distribute | master          |                                    |
+  // | teams distribute | critical        |                                    |
+  // | teams distribute | simd            |                                    |
+  // | teams distribute | sections        |                                    |
+  // | teams distribute | section         |                                    |
+  // | teams distribute | single          |                                    |
+  // | teams distribute | parallel for    |                                    |
+  // | teams distribute |parallel for simd|                                    |
+  // | teams distribute |parallel sections|                                    |
+  // | teams distribute | task            |                                    |
+  // | teams distribute | taskyield       |                                    |
+  // | teams distribute | barrier         |                                    |
+  // | teams distribute | taskwait        |                                    |
+  // | teams distribute | taskgroup       |                                    |
+  // | teams distribute | flush           |                                    |
+  // | teams distribute | ordered         | + (with simd clause)               |
+  // | teams distribute | atomic          |                                    |
+  // | teams distribute | target          |                                    |
+  // | teams distribute | target parallel |                                    |
+  // | teams distribute | target parallel |                                    |
+  // |                  | for             |                                    |
+  // | teams distribute | target enter    |                                    |
+  // |                  | data            |                                    |
+  // | teams distribute | target exit     |                                    |
+  // |                  | data            |                                    |
+  // | teams distribute | teams           |                                    |
+  // | teams distribute | cancellation    |                                    |
+  // |                  | point           |                                    |
+  // | teams distribute | cancel          |                                    |
+  // | teams distribute | taskloop        |                                    |
+  // | teams distribute | taskloop simd   |                                    |
+  // | teams distribute | distribute      |                                    |
+  // | teams distribute | distribute      |                                    |
+  // |                  | parallel for    |                                    |
+  // | teams distribute | distribute      |                                    |
+  // |                  |parallel for simd|                                    |
+  // | teams distribute | distribute simd |                                    |
+  // | teams distribute | target parallel |                                    |
+  // |                  | for simd        |                                    |
+  // | teams distribute | teams distribute|                                    |
   // +------------------+-----------------+------------------------------------+
   if (Stack->getCurScope()) {
     auto ParentRegion = Stack->getParentDirective();
+    auto OffendingRegion = ParentRegion;
     bool NestingProhibited = false;
     bool CloseNesting = true;
+    bool OrphanSeen = false;
     enum {
       NoRecommend,
       ShouldBeInParallelRegion,
@@ -2296,10 +3271,15 @@
       // OpenMP [2.16, Nesting of Regions]
       // OpenMP constructs may not be nested inside a simd region.
       // OpenMP [2.8.1,simd Construct, Restrictions]
-      // An ordered construct with the simd clause is the only OpenMP construct
-      // that can appear in the simd region.
-      SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region_simd);
-      return true;
+      // An ordered construct with the simd clause is the only OpenMP
+      // construct that can appear in the simd region.
+      // As an extension, a simd construct nested in another simd construct is
+      // accepted even though OpenMP 4.5 does not allow it; a warning is issued
+      // instead of an error in that case.
+      SemaRef.Diag(StartLoc, (CurrentRegion != OMPD_simd)
+                                 ? diag::err_omp_prohibited_region_simd
+                                 : diag::warn_omp_nesting_simd);
+      return CurrentRegion != OMPD_simd;
     }
     if (ParentRegion == OMPD_atomic) {
       // OpenMP [2.16, Nesting of Regions]
@@ -2321,9 +3301,10 @@
       }
       return false;
     }
-    // Allow some constructs to be orphaned (they could be used in functions,
-    // called from OpenMP regions with the required preconditions).
-    if (ParentRegion == OMPD_unknown)
+    // Allow some constructs (except teams) to be orphaned (they could be
+    // used in functions, called from OpenMP regions with the required
+    // preconditions).
+    if (ParentRegion == OMPD_unknown && !isOpenMPTeamsDirective(CurrentRegion))
       return false;
     if (CurrentRegion == OMPD_cancellation_point ||
         CurrentRegion == OMPD_cancel) {
@@ -2339,9 +3320,12 @@
       // OpenMP construct that matches the type specified in
       // construct-type-clause.
       NestingProhibited =
-          !((CancelRegion == OMPD_parallel && ParentRegion == OMPD_parallel) ||
+          !((CancelRegion == OMPD_parallel &&
+             (ParentRegion == OMPD_parallel ||
+              ParentRegion == OMPD_target_parallel)) ||
             (CancelRegion == OMPD_for &&
-             (ParentRegion == OMPD_for || ParentRegion == OMPD_parallel_for)) ||
+             (ParentRegion == OMPD_for || ParentRegion == OMPD_parallel_for ||
+              ParentRegion == OMPD_target_parallel_for)) ||
             (CancelRegion == OMPD_taskgroup && ParentRegion == OMPD_task) ||
             (CancelRegion == OMPD_sections &&
              (ParentRegion == OMPD_section || ParentRegion == OMPD_sections ||
@@ -2351,28 +3335,24 @@
       // A master region may not be closely nested inside a worksharing,
       // atomic, or explicit task region.
       NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
-                          ParentRegion == OMPD_task ||
-                          isOpenMPTaskLoopDirective(ParentRegion);
+                          isOpenMPTaskingDirective(ParentRegion);
     } else if (CurrentRegion == OMPD_critical && CurrentName.getName()) {
       // OpenMP [2.16, Nesting of Regions]
       // A critical region may not be nested (closely or otherwise) inside a
       // critical region with the same name. Note that this restriction is not
       // sufficient to prevent deadlock.
       SourceLocation PreviousCriticalLoc;
-      bool DeadLock =
-          Stack->hasDirective([CurrentName, &PreviousCriticalLoc](
-                                  OpenMPDirectiveKind K,
-                                  const DeclarationNameInfo &DNI,
-                                  SourceLocation Loc)
-                                  ->bool {
-                                if (K == OMPD_critical &&
-                                    DNI.getName() == CurrentName.getName()) {
-                                  PreviousCriticalLoc = Loc;
-                                  return true;
-                                } else
-                                  return false;
-                              },
-                              false /* skip top directive */);
+      bool DeadLock = Stack->hasDirective(
+          [CurrentName, &PreviousCriticalLoc](OpenMPDirectiveKind K,
+                                              const DeclarationNameInfo &DNI,
+                                              SourceLocation Loc) -> bool {
+            if (K == OMPD_critical && DNI.getName() == CurrentName.getName()) {
+              PreviousCriticalLoc = Loc;
+              return true;
+            } else
+              return false;
+          },
+          false /* skip top directive */);
       if (DeadLock) {
         SemaRef.Diag(StartLoc,
                      diag::err_omp_prohibited_region_critical_same_name)
@@ -2386,21 +3366,21 @@
       // OpenMP [2.16, Nesting of Regions]
       // A barrier region may not be closely nested inside a worksharing,
       // explicit task, critical, ordered, atomic, or master region.
-      NestingProhibited =
-          isOpenMPWorksharingDirective(ParentRegion) ||
-          ParentRegion == OMPD_task || ParentRegion == OMPD_master ||
-          ParentRegion == OMPD_critical || ParentRegion == OMPD_ordered ||
-          isOpenMPTaskLoopDirective(ParentRegion);
+      NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
+                          isOpenMPTaskingDirective(ParentRegion) ||
+                          ParentRegion == OMPD_master ||
+                          ParentRegion == OMPD_critical ||
+                          ParentRegion == OMPD_ordered;
     } else if (isOpenMPWorksharingDirective(CurrentRegion) &&
                !isOpenMPParallelDirective(CurrentRegion)) {
       // OpenMP [2.16, Nesting of Regions]
       // A worksharing region may not be closely nested inside a worksharing,
       // explicit task, critical, ordered, atomic, or master region.
-      NestingProhibited =
-          isOpenMPWorksharingDirective(ParentRegion) ||
-          ParentRegion == OMPD_task || ParentRegion == OMPD_master ||
-          ParentRegion == OMPD_critical || ParentRegion == OMPD_ordered ||
-          isOpenMPTaskLoopDirective(ParentRegion);
+      NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
+                          isOpenMPTaskingDirective(ParentRegion) ||
+                          ParentRegion == OMPD_master ||
+                          ParentRegion == OMPD_critical ||
+                          ParentRegion == OMPD_ordered;
       Recommend = ShouldBeInParallelRegion;
     } else if (CurrentRegion == OMPD_ordered) {
       // OpenMP [2.16, Nesting of Regions]
@@ -2412,8 +3392,7 @@
       // An ordered construct with the simd clause is the only OpenMP construct
       // that can appear in the simd region.
       NestingProhibited = ParentRegion == OMPD_critical ||
-                          ParentRegion == OMPD_task ||
-                          isOpenMPTaskLoopDirective(ParentRegion) ||
+                          isOpenMPTaskingDirective(ParentRegion) ||
                           !(isOpenMPSimdDirective(ParentRegion) ||
                             Stack->isParentOrderedRegion());
       Recommend = ShouldBeInOrderedRegion;
@@ -2422,10 +3401,11 @@
       // If specified, a teams construct must be contained within a target
       // construct.
       NestingProhibited = ParentRegion != OMPD_target;
+      OrphanSeen = ParentRegion == OMPD_unknown;
       Recommend = ShouldBeInTargetRegion;
       Stack->setParentTeamsRegionLoc(Stack->getConstructLoc());
     }
-    if (!NestingProhibited && isOpenMPTeamsDirective(ParentRegion)) {
+    if (!NestingProhibited && ParentRegion == OMPD_teams) {
       // OpenMP [2.16, Nesting of Regions]
       // distribute, parallel, parallel sections, parallel workshare, and the
       // parallel loop and parallel loop SIMD constructs are the only OpenMP
@@ -2434,17 +3414,42 @@
                           !isOpenMPDistributeDirective(CurrentRegion);
       Recommend = ShouldBeInParallelRegion;
     }
-    if (!NestingProhibited && isOpenMPDistributeDirective(CurrentRegion)) {
+    if (!NestingProhibited &&
+        isOpenMPNestingDistributeDirective(CurrentRegion)) {
       // OpenMP 4.5 [2.17 Nesting of Regions]
       // The region associated with the distribute construct must be strictly
       // nested inside a teams region
-      NestingProhibited = !isOpenMPTeamsDirective(ParentRegion);
+      NestingProhibited = ParentRegion != OMPD_teams;
       Recommend = ShouldBeInTeamsRegion;
     }
+    if (!NestingProhibited &&
+        (isOpenMPTargetExecutionDirective(CurrentRegion) ||
+         isOpenMPTargetDataManagementDirective(CurrentRegion))) {
+      // OpenMP 4.5 [2.17 Nesting of Regions]
+      // If a target, target update, target data, target enter data, or
+      // target exit data construct is encountered during execution of a
+      // target region, the behavior is unspecified.
+      NestingProhibited = Stack->hasDirective(
+          [&OffendingRegion](OpenMPDirectiveKind K, const DeclarationNameInfo &,
+                             SourceLocation) -> bool {
+            if (isOpenMPTargetExecutionDirective(K)) {
+              OffendingRegion = K;
+              return true;
+            } else
+              return false;
+          },
+          false /* don't skip top directive */);
+      CloseNesting = false;
+    }
     if (NestingProhibited) {
-      SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region)
-          << CloseNesting << getOpenMPDirectiveName(ParentRegion) << Recommend
-          << getOpenMPDirectiveName(CurrentRegion);
+      if (OrphanSeen) {
+        SemaRef.Diag(StartLoc, diag::err_omp_orphaned_device_directive)
+            << getOpenMPDirectiveName(CurrentRegion) << Recommend;
+      } else {
+        SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region)
+            << CloseNesting << getOpenMPDirectiveName(OffendingRegion)
+            << Recommend << getOpenMPDirectiveName(CurrentRegion);
+      }
       return true;
     }
   }
@@ -2543,7 +3548,7 @@
     return StmtError();
 
   llvm::SmallVector<OMPClause *, 8> ClausesWithImplicit;
-  llvm::DenseMap<VarDecl *, Expr *> VarsWithInheritedDSA;
+  llvm::DenseMap<ValueDecl *, Expr *> VarsWithInheritedDSA;
   bool ErrorFound = false;
   ClausesWithImplicit.append(Clauses.begin(), Clauses.end());
   if (AStmt) {
@@ -2678,6 +3683,18 @@
                                      EndLoc);
     AllowedNameModifiers.push_back(OMPD_target);
     break;
+  case OMPD_target_parallel:
+    Res = ActOnOpenMPTargetParallelDirective(ClausesWithImplicit, AStmt,
+                                             StartLoc, EndLoc);
+    AllowedNameModifiers.push_back(OMPD_target);
+    AllowedNameModifiers.push_back(OMPD_parallel);
+    break;
+  case OMPD_target_parallel_for:
+    Res = ActOnOpenMPTargetParallelForDirective(
+        ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
+    AllowedNameModifiers.push_back(OMPD_target);
+    AllowedNameModifiers.push_back(OMPD_parallel);
+    break;
   case OMPD_cancellation_point:
     assert(ClausesWithImplicit.empty() &&
            "No clauses are allowed for 'omp cancellation point' directive");
@@ -2697,6 +3714,16 @@
                                          EndLoc);
     AllowedNameModifiers.push_back(OMPD_target_data);
     break;
+  case OMPD_target_enter_data:
+    Res = ActOnOpenMPTargetEnterDataDirective(ClausesWithImplicit, StartLoc,
+                                              EndLoc);
+    AllowedNameModifiers.push_back(OMPD_target_enter_data);
+    break;
+  case OMPD_target_exit_data:
+    Res = ActOnOpenMPTargetExitDataDirective(ClausesWithImplicit, StartLoc,
+                                             EndLoc);
+    AllowedNameModifiers.push_back(OMPD_target_exit_data);
+    break;
   case OMPD_taskloop:
     Res = ActOnOpenMPTaskLoopDirective(ClausesWithImplicit, AStmt, StartLoc,
                                        EndLoc, VarsWithInheritedDSA);
@@ -2711,7 +3738,46 @@
     Res = ActOnOpenMPDistributeDirective(ClausesWithImplicit, AStmt, StartLoc,
                                          EndLoc, VarsWithInheritedDSA);
     break;
+  case OMPD_target_update:
+    assert(!AStmt && "Statement is not allowed for target update");
+    Res =
+        ActOnOpenMPTargetUpdateDirective(ClausesWithImplicit, StartLoc, EndLoc);
+    AllowedNameModifiers.push_back(OMPD_target_update);
+    break;
+  case OMPD_distribute_parallel_for:
+    Res = ActOnOpenMPDistributeParallelForDirective(
+        ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
+    AllowedNameModifiers.push_back(OMPD_parallel);
+    break;
+  case OMPD_distribute_parallel_for_simd:
+    Res = ActOnOpenMPDistributeParallelForSimdDirective(
+        ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
+    AllowedNameModifiers.push_back(OMPD_parallel);
+    break;
+  case OMPD_distribute_simd:
+    Res = ActOnOpenMPDistributeSimdDirective(
+        ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
+    break;
+  case OMPD_target_parallel_for_simd:
+    Res = ActOnOpenMPTargetParallelForSimdDirective(
+        ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
+    AllowedNameModifiers.push_back(OMPD_target);
+    AllowedNameModifiers.push_back(OMPD_parallel);
+    break;
+  case OMPD_target_simd:
+    Res = ActOnOpenMPTargetSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
+                                         EndLoc, VarsWithInheritedDSA);
+    AllowedNameModifiers.push_back(OMPD_target);
+    break;
+  case OMPD_teams_distribute:
+    Res = ActOnOpenMPTeamsDistributeDirective(
+        ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
+    break;
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
   case OMPD_threadprivate:
+  case OMPD_declare_reduction:
+  case OMPD_declare_simd:
     llvm_unreachable("OpenMP Directive is not allowed");
   case OMPD_unknown:
     llvm_unreachable("Unknown OpenMP directive");
@@ -2732,6 +3798,252 @@
   return Res;
 }
 
+Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareSimdDirective(
+    DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, Expr *Simdlen,
+    ArrayRef<Expr *> Uniforms, ArrayRef<Expr *> Aligneds,
+    ArrayRef<Expr *> Alignments, ArrayRef<Expr *> Linears,
+    ArrayRef<unsigned> LinModifiers, ArrayRef<Expr *> Steps, SourceRange SR) {
+  assert(Aligneds.size() == Alignments.size());
+  assert(Linears.size() == LinModifiers.size());
+  assert(Linears.size() == Steps.size());
+  if (!DG || DG.get().isNull())
+    return DeclGroupPtrTy();
+
+  if (!DG.get().isSingleDecl()) {
+    Diag(SR.getBegin(), diag::err_omp_single_decl_in_declare_simd);
+    return DG;
+  }
+  auto *ADecl = DG.get().getSingleDecl();
+  if (auto *FTD = dyn_cast<FunctionTemplateDecl>(ADecl))
+    ADecl = FTD->getTemplatedDecl();
+
+  auto *FD = dyn_cast<FunctionDecl>(ADecl);
+  if (!FD) {
+    Diag(ADecl->getLocation(), diag::err_omp_function_expected);
+    return DeclGroupPtrTy();
+  }
+
+  // OpenMP [2.8.2, declare simd construct, Description]
+  // The parameter of the simdlen clause must be a constant positive integer
+  // expression.
+  ExprResult SL;
+  if (Simdlen)
+    SL = VerifyPositiveIntegerConstantInClause(Simdlen, OMPC_simdlen);
+  // OpenMP [2.8.2, declare simd construct, Description]
+  // The special this pointer can be used as if it was one of the arguments to
+  // the function in any of the linear, aligned, or uniform clauses.
+  // The uniform clause declares one or more arguments to have an invariant
+  // value for all concurrent invocations of the function in the execution of a
+  // single SIMD loop.
+  llvm::DenseMap<Decl *, Expr *> UniformedArgs;
+  Expr *UniformedLinearThis = nullptr;
+  for (auto *E : Uniforms) {
+    E = E->IgnoreParenImpCasts();
+    if (auto *DRE = dyn_cast<DeclRefExpr>(E))
+      if (auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl()))
+        if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
+            FD->getParamDecl(PVD->getFunctionScopeIndex())
+                    ->getCanonicalDecl() == PVD->getCanonicalDecl()) {
+          UniformedArgs.insert(std::make_pair(PVD->getCanonicalDecl(), E));
+          continue;
+        }
+    if (isa<CXXThisExpr>(E)) {
+      UniformedLinearThis = E;
+      continue;
+    }
+    Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
+        << FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
+  }
+  // OpenMP [2.8.2, declare simd construct, Description]
+  // The aligned clause declares that the object to which each list item points
+  // is aligned to the number of bytes expressed in the optional parameter of
+  // the aligned clause.
+  // The special this pointer can be used as if it was one of the arguments to
+  // the function in any of the linear, aligned, or uniform clauses.
+  // The type of list items appearing in the aligned clause must be array,
+  // pointer, reference to array, or reference to pointer.
+  llvm::DenseMap<Decl *, Expr *> AlignedArgs;
+  Expr *AlignedThis = nullptr;
+  for (auto *E : Aligneds) {
+    E = E->IgnoreParenImpCasts();
+    if (auto *DRE = dyn_cast<DeclRefExpr>(E))
+      if (auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
+        auto *CanonPVD = PVD->getCanonicalDecl();
+        if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
+            FD->getParamDecl(PVD->getFunctionScopeIndex())
+                    ->getCanonicalDecl() == CanonPVD) {
+          // OpenMP  [2.8.1, simd construct, Restrictions]
+          // A list-item cannot appear in more than one aligned clause.
+          if (AlignedArgs.count(CanonPVD) > 0) {
+            Diag(E->getExprLoc(), diag::err_omp_aligned_twice)
+                << 1 << E->getSourceRange();
+            Diag(AlignedArgs[CanonPVD]->getExprLoc(),
+                 diag::note_omp_explicit_dsa)
+                << getOpenMPClauseName(OMPC_aligned);
+            continue;
+          }
+          AlignedArgs[CanonPVD] = E;
+          QualType QTy = PVD->getType()
+                             .getNonReferenceType()
+                             .getUnqualifiedType()
+                             .getCanonicalType();
+          const Type *Ty = QTy.getTypePtrOrNull();
+          if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) {
+            Diag(E->getExprLoc(), diag::err_omp_aligned_expected_array_or_ptr)
+                << QTy << getLangOpts().CPlusPlus << E->getSourceRange();
+            Diag(PVD->getLocation(), diag::note_previous_decl) << PVD;
+          }
+          continue;
+        }
+      }
+    if (isa<CXXThisExpr>(E)) {
+      if (AlignedThis) {
+        Diag(E->getExprLoc(), diag::err_omp_aligned_twice)
+            << 2 << E->getSourceRange();
+        Diag(AlignedThis->getExprLoc(), diag::note_omp_explicit_dsa)
+            << getOpenMPClauseName(OMPC_aligned);
+      }
+      AlignedThis = E;
+      continue;
+    }
+    Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
+        << FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
+  }
+  // The optional parameter of the aligned clause, alignment, must be a constant
+  // positive integer expression. If no optional parameter is specified,
+  // implementation-defined default alignments for SIMD instructions on the
+  // target platforms are assumed.
+  SmallVector<Expr *, 4> NewAligns;
+  for (auto *E : Alignments) {
+    ExprResult Align;
+    if (E)
+      Align = VerifyPositiveIntegerConstantInClause(E, OMPC_aligned);
+    NewAligns.push_back(Align.get());
+  }
+  // OpenMP [2.8.2, declare simd construct, Description]
+  // The linear clause declares one or more list items to be private to a SIMD
+  // lane and to have a linear relationship with respect to the iteration space
+  // of a loop.
+  // The special this pointer can be used as if it was one of the arguments to
+  // the function in any of the linear, aligned, or uniform clauses.
+  // When a linear-step expression is specified in a linear clause it must be
+  // either a constant integer expression or an integer-typed parameter that is
+  // specified in a uniform clause on the directive.
+  llvm::DenseMap<Decl *, Expr *> LinearArgs;
+  const bool IsUniformedThis = UniformedLinearThis != nullptr;
+  auto MI = LinModifiers.begin();
+  for (auto *E : Linears) {
+    auto LinKind = static_cast<OpenMPLinearClauseKind>(*MI);
+    ++MI;
+    E = E->IgnoreParenImpCasts();
+    if (auto *DRE = dyn_cast<DeclRefExpr>(E))
+      if (auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
+        auto *CanonPVD = PVD->getCanonicalDecl();
+        if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
+            FD->getParamDecl(PVD->getFunctionScopeIndex())
+                    ->getCanonicalDecl() == CanonPVD) {
+          // OpenMP  [2.15.3.7, linear Clause, Restrictions]
+          // A list-item cannot appear in more than one linear clause.
+          if (LinearArgs.count(CanonPVD) > 0) {
+            Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
+                << getOpenMPClauseName(OMPC_linear)
+                << getOpenMPClauseName(OMPC_linear) << E->getSourceRange();
+            Diag(LinearArgs[CanonPVD]->getExprLoc(),
+                 diag::note_omp_explicit_dsa)
+                << getOpenMPClauseName(OMPC_linear);
+            continue;
+          }
+          // Each argument can appear in at most one uniform or linear clause.
+          if (UniformedArgs.count(CanonPVD) > 0) {
+            Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
+                << getOpenMPClauseName(OMPC_linear)
+                << getOpenMPClauseName(OMPC_uniform) << E->getSourceRange();
+            Diag(UniformedArgs[CanonPVD]->getExprLoc(),
+                 diag::note_omp_explicit_dsa)
+                << getOpenMPClauseName(OMPC_uniform);
+            continue;
+          }
+          LinearArgs[CanonPVD] = E;
+          if (E->isValueDependent() || E->isTypeDependent() ||
+              E->isInstantiationDependent() ||
+              E->containsUnexpandedParameterPack())
+            continue;
+          (void)CheckOpenMPLinearDecl(CanonPVD, E->getExprLoc(), LinKind,
+                                      PVD->getOriginalType());
+          continue;
+        }
+      }
+    if (isa<CXXThisExpr>(E)) {
+      if (UniformedLinearThis) {
+        Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
+            << getOpenMPClauseName(OMPC_linear)
+            << getOpenMPClauseName(IsUniformedThis ? OMPC_uniform : OMPC_linear)
+            << E->getSourceRange();
+        Diag(UniformedLinearThis->getExprLoc(), diag::note_omp_explicit_dsa)
+            << getOpenMPClauseName(IsUniformedThis ? OMPC_uniform
+                                                   : OMPC_linear);
+        continue;
+      }
+      UniformedLinearThis = E;
+      if (E->isValueDependent() || E->isTypeDependent() ||
+          E->isInstantiationDependent() || E->containsUnexpandedParameterPack())
+        continue;
+      (void)CheckOpenMPLinearDecl(/*D=*/nullptr, E->getExprLoc(), LinKind,
+                                  E->getType());
+      continue;
+    }
+    Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
+        << FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
+  }
+  Expr *Step = nullptr;
+  Expr *NewStep = nullptr;
+  SmallVector<Expr *, 4> NewSteps;
+  for (auto *E : Steps) {
+    // Skip the same step expression, it was checked already.
+    if (Step == E || !E) {
+      NewSteps.push_back(E ? NewStep : nullptr);
+      continue;
+    }
+    Step = E;
+    if (auto *DRE = dyn_cast<DeclRefExpr>(Step))
+      if (auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
+        auto *CanonPVD = PVD->getCanonicalDecl();
+        if (UniformedArgs.count(CanonPVD) == 0) {
+          Diag(Step->getExprLoc(), diag::err_omp_expected_uniform_param)
+              << Step->getSourceRange();
+        } else if (E->isValueDependent() || E->isTypeDependent() ||
+                   E->isInstantiationDependent() ||
+                   E->containsUnexpandedParameterPack() ||
+                   CanonPVD->getType()->hasIntegerRepresentation())
+          NewSteps.push_back(Step);
+        else {
+          Diag(Step->getExprLoc(), diag::err_omp_expected_int_param)
+              << Step->getSourceRange();
+        }
+        continue;
+      }
+    NewStep = Step;
+    if (Step && !Step->isValueDependent() && !Step->isTypeDependent() &&
+        !Step->isInstantiationDependent() &&
+        !Step->containsUnexpandedParameterPack()) {
+      NewStep = PerformOpenMPImplicitIntegerConversion(Step->getExprLoc(), Step)
+                    .get();
+      if (NewStep)
+        NewStep = VerifyIntegerConstantExpression(NewStep).get();
+    }
+    NewSteps.push_back(NewStep);
+  }
+  auto *NewAttr = OMPDeclareSimdDeclAttr::CreateImplicit(
+      Context, BS, SL.get(), const_cast<Expr **>(Uniforms.data()),
+      Uniforms.size(), const_cast<Expr **>(Aligneds.data()), Aligneds.size(),
+      const_cast<Expr **>(NewAligns.data()), NewAligns.size(),
+      const_cast<Expr **>(Linears.data()), Linears.size(),
+      const_cast<unsigned *>(LinModifiers.data()), LinModifiers.size(),
+      NewSteps.data(), NewSteps.size(), SR);
+  ADecl->addAttr(NewAttr);
+  return ConvertDeclToDeclGroup(ADecl);
+}
+
 StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef<OMPClause *> Clauses,
                                               Stmt *AStmt,
                                               SourceLocation StartLoc,
@@ -2771,33 +4083,29 @@
   /// \brief A source location for referring to increment later.
   SourceRange IncrementSrcRange;
   /// \brief Loop variable.
-  VarDecl *Var;
+  ValueDecl *LCDecl = nullptr;
   /// \brief Reference to loop variable.
-  DeclRefExpr *VarRef;
+  Expr *LCRef = nullptr;
   /// \brief Lower bound (initializer for the var).
-  Expr *LB;
+  Expr *LB = nullptr;
   /// \brief Upper bound.
-  Expr *UB;
+  Expr *UB = nullptr;
   /// \brief Loop step (increment).
-  Expr *Step;
+  Expr *Step = nullptr;
   /// \brief This flag is true when condition is one of:
   ///   Var <  UB
   ///   Var <= UB
   ///   UB  >  Var
   ///   UB  >= Var
-  bool TestIsLessOp;
+  bool TestIsLessOp = false;
   /// \brief This flag is true when condition is strict ( < or > ).
-  bool TestIsStrictOp;
+  bool TestIsStrictOp = false;
   /// \brief This flag is true when step is subtracted on each iteration.
-  bool SubtractStep;
+  bool SubtractStep = false;
 
 public:
   OpenMPIterationSpaceChecker(Sema &SemaRef, SourceLocation DefaultLoc)
-      : SemaRef(SemaRef), DefaultLoc(DefaultLoc), ConditionLoc(DefaultLoc),
-        InitSrcRange(SourceRange()), ConditionSrcRange(SourceRange()),
-        IncrementSrcRange(SourceRange()), Var(nullptr), VarRef(nullptr),
-        LB(nullptr), UB(nullptr), Step(nullptr), TestIsLessOp(false),
-        TestIsStrictOp(false), SubtractStep(false) {}
+      : SemaRef(SemaRef), DefaultLoc(DefaultLoc), ConditionLoc(DefaultLoc) {}
   /// \brief Check init-expr for canonical loop form and save loop counter
   /// variable - #Var and its initialization value - #LB.
   bool CheckInit(Stmt *S, bool EmitDiags = true);
@@ -2808,9 +4116,9 @@
   /// does not conform, otherwise save loop step (#Step).
   bool CheckInc(Expr *S);
   /// \brief Return the loop counter variable.
-  VarDecl *GetLoopVar() const { return Var; }
+  ValueDecl *GetLoopDecl() const { return LCDecl; }
   /// \brief Return the reference expression to loop counter variable.
-  DeclRefExpr *GetLoopVarRefExpr() const { return VarRef; }
+  Expr *GetLoopDeclRefExpr() const { return LCRef; }
   /// \brief Source range of the loop init.
   SourceRange GetInitSrcRange() const { return InitSrcRange; }
   /// \brief Source range of the loop condition.
@@ -2820,15 +4128,19 @@
   /// \brief True if the step should be subtracted.
   bool ShouldSubtractStep() const { return SubtractStep; }
   /// \brief Build the expression to calculate the number of iterations.
-  Expr *BuildNumIterations(Scope *S, const bool LimitedType) const;
+  Expr *
+  BuildNumIterations(Scope *S, const bool LimitedType,
+                     llvm::MapVector<Expr *, DeclRefExpr *> &Captures) const;
   /// \brief Build the precondition expression for the loops.
-  Expr *BuildPreCond(Scope *S, Expr *Cond) const;
+  Expr *BuildPreCond(Scope *S, Expr *Cond,
+                     llvm::MapVector<Expr *, DeclRefExpr *> &Captures) const;
   /// \brief Build reference expression to the counter be used for codegen.
-  Expr *BuildCounterVar() const;
+  DeclRefExpr *BuildCounterVar(llvm::MapVector<Expr *, DeclRefExpr *> &Captures,
+                               DSAStackTy &DSA) const;
   /// \brief Build reference expression to the private counter be used for
   /// codegen.
   Expr *BuildPrivateCounterVar() const;
-  /// \brief Build initization of the counter be used for codegen.
+  /// \brief Build initialization of the counter to be used for codegen.
   Expr *BuildCounterInit() const;
   /// \brief Build step of the counter be used for codegen.
   Expr *BuildCounterStep() const;
@@ -2840,7 +4152,7 @@
   /// expression.
   bool CheckIncRHS(Expr *RHS);
   /// \brief Helper to set loop counter variable and its initializer.
-  bool SetVarAndLB(VarDecl *NewVar, DeclRefExpr *NewVarRefExpr, Expr *NewLB);
+  bool SetLCDeclAndLB(ValueDecl *NewLCDecl, Expr *NewDeclRefExpr, Expr *NewLB);
   /// \brief Helper to set upper bound.
   bool SetUB(Expr *NewUB, bool LessOp, bool StrictOp, SourceRange SR,
              SourceLocation SL);
@@ -2849,16 +4161,16 @@
 };
 
 bool OpenMPIterationSpaceChecker::Dependent() const {
-  if (!Var) {
+  if (!LCDecl) {
     assert(!LB && !UB && !Step);
     return false;
   }
-  return Var->getType()->isDependentType() || (LB && LB->isValueDependent()) ||
-         (UB && UB->isValueDependent()) || (Step && Step->isValueDependent());
+  return LCDecl->getType()->isDependentType() ||
+         (LB && LB->isValueDependent()) || (UB && UB->isValueDependent()) ||
+         (Step && Step->isValueDependent());
 }
 
-template <typename T>
-static T *getExprAsWritten(T *E) {
+static Expr *getExprAsWritten(Expr *E) {
   if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(E))
     E = ExprTemp->getSubExpr();
 
@@ -2873,16 +4185,16 @@
   return E->IgnoreParens();
 }
 
-bool OpenMPIterationSpaceChecker::SetVarAndLB(VarDecl *NewVar,
-                                              DeclRefExpr *NewVarRefExpr,
-                                              Expr *NewLB) {
+bool OpenMPIterationSpaceChecker::SetLCDeclAndLB(ValueDecl *NewLCDecl,
+                                                 Expr *NewLCRefExpr,
+                                                 Expr *NewLB) {
   // State consistency checking to ensure correct usage.
-  assert(Var == nullptr && LB == nullptr && VarRef == nullptr &&
+  assert(LCDecl == nullptr && LB == nullptr && LCRef == nullptr &&
          UB == nullptr && Step == nullptr && !TestIsLessOp && !TestIsStrictOp);
-  if (!NewVar || !NewLB)
+  if (!NewLCDecl || !NewLB)
     return true;
-  Var = NewVar;
-  VarRef = NewVarRefExpr;
+  LCDecl = getCanonicalDecl(NewLCDecl);
+  LCRef = NewLCRefExpr;
   if (auto *CE = dyn_cast_or_null<CXXConstructExpr>(NewLB))
     if (const CXXConstructorDecl *Ctor = CE->getConstructor())
       if ((Ctor->isCopyOrMoveConstructor() ||
@@ -2896,8 +4208,8 @@
 bool OpenMPIterationSpaceChecker::SetUB(Expr *NewUB, bool LessOp, bool StrictOp,
                                         SourceRange SR, SourceLocation SL) {
   // State consistency checking to ensure correct usage.
-  assert(Var != nullptr && LB != nullptr && UB == nullptr && Step == nullptr &&
-         !TestIsLessOp && !TestIsStrictOp);
+  assert(LCDecl != nullptr && LB != nullptr && UB == nullptr &&
+         Step == nullptr && !TestIsLessOp && !TestIsStrictOp);
   if (!NewUB)
     return true;
   UB = NewUB;
@@ -2910,7 +4222,7 @@
 
 bool OpenMPIterationSpaceChecker::SetStep(Expr *NewStep, bool Subtract) {
   // State consistency checking to ensure correct usage.
-  assert(Var != nullptr && LB != nullptr && Step == nullptr);
+  assert(LCDecl != nullptr && LB != nullptr && Step == nullptr);
   if (!NewStep)
     return true;
   if (!NewStep->isValueDependent()) {
@@ -2946,15 +4258,16 @@
                              : (IsConstPos || (IsUnsigned && !Subtract))))) {
       SemaRef.Diag(NewStep->getExprLoc(),
                    diag::err_omp_loop_incr_not_compatible)
-          << Var << TestIsLessOp << NewStep->getSourceRange();
+          << LCDecl << TestIsLessOp << NewStep->getSourceRange();
       SemaRef.Diag(ConditionLoc,
                    diag::note_omp_loop_cond_requres_compatible_incr)
           << TestIsLessOp << ConditionSrcRange;
       return true;
     }
     if (TestIsLessOp == Subtract) {
-      NewStep = SemaRef.CreateBuiltinUnaryOp(NewStep->getExprLoc(), UO_Minus,
-                                             NewStep).get();
+      NewStep =
+          SemaRef.CreateBuiltinUnaryOp(NewStep->getExprLoc(), UO_Minus, NewStep)
+              .get();
       Subtract = !Subtract;
     }
   }
@@ -2979,33 +4292,60 @@
     }
     return true;
   }
+  if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(S))
+    if (!ExprTemp->cleanupsHaveSideEffects())
+      S = ExprTemp->getSubExpr();
+
   InitSrcRange = S->getSourceRange();
   if (Expr *E = dyn_cast<Expr>(S))
     S = E->IgnoreParens();
-  if (auto BO = dyn_cast<BinaryOperator>(S)) {
-    if (BO->getOpcode() == BO_Assign)
-      if (auto DRE = dyn_cast<DeclRefExpr>(BO->getLHS()->IgnoreParens()))
-        return SetVarAndLB(dyn_cast<VarDecl>(DRE->getDecl()), DRE,
-                           BO->getRHS());
-  } else if (auto DS = dyn_cast<DeclStmt>(S)) {
+  if (auto *BO = dyn_cast<BinaryOperator>(S)) {
+    if (BO->getOpcode() == BO_Assign) {
+      auto *LHS = BO->getLHS()->IgnoreParens();
+      if (auto *DRE = dyn_cast<DeclRefExpr>(LHS)) {
+        if (auto *CED = dyn_cast<OMPCapturedExprDecl>(DRE->getDecl()))
+          if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
+            return SetLCDeclAndLB(ME->getMemberDecl(), ME, BO->getRHS());
+        return SetLCDeclAndLB(DRE->getDecl(), DRE, BO->getRHS());
+      }
+      if (auto *ME = dyn_cast<MemberExpr>(LHS)) {
+        if (ME->isArrow() &&
+            isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
+          return SetLCDeclAndLB(ME->getMemberDecl(), ME, BO->getRHS());
+      }
+    }
+  } else if (auto *DS = dyn_cast<DeclStmt>(S)) {
     if (DS->isSingleDecl()) {
-      if (auto Var = dyn_cast_or_null<VarDecl>(DS->getSingleDecl())) {
+      if (auto *Var = dyn_cast_or_null<VarDecl>(DS->getSingleDecl())) {
         if (Var->hasInit() && !Var->getType()->isReferenceType()) {
           // Accept non-canonical init form here but emit ext. warning.
           if (Var->getInitStyle() != VarDecl::CInit && EmitDiags)
             SemaRef.Diag(S->getLocStart(),
                          diag::ext_omp_loop_not_canonical_init)
                 << S->getSourceRange();
-          return SetVarAndLB(Var, nullptr, Var->getInit());
+          return SetLCDeclAndLB(Var, nullptr, Var->getInit());
         }
       }
     }
-  } else if (auto CE = dyn_cast<CXXOperatorCallExpr>(S))
-    if (CE->getOperator() == OO_Equal)
-      if (auto DRE = dyn_cast<DeclRefExpr>(CE->getArg(0)))
-        return SetVarAndLB(dyn_cast<VarDecl>(DRE->getDecl()), DRE,
-                           CE->getArg(1));
+  } else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
+    if (CE->getOperator() == OO_Equal) {
+      auto *LHS = CE->getArg(0);
+      if (auto *DRE = dyn_cast<DeclRefExpr>(LHS)) {
+        if (auto *CED = dyn_cast<OMPCapturedExprDecl>(DRE->getDecl()))
+          if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
+            return SetLCDeclAndLB(ME->getMemberDecl(), ME, CE->getArg(1));
+        return SetLCDeclAndLB(DRE->getDecl(), DRE, CE->getArg(1));
+      }
+      if (auto *ME = dyn_cast<MemberExpr>(LHS)) {
+        if (ME->isArrow() &&
+            isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
+          return SetLCDeclAndLB(ME->getMemberDecl(), ME, CE->getArg(1));
+      }
+    }
+  }
 
+  if (Dependent() || SemaRef.CurContext->isDependentContext())
+    return false;
   if (EmitDiags) {
     SemaRef.Diag(S->getLocStart(), diag::err_omp_loop_not_canonical_init)
         << S->getSourceRange();
@@ -3015,7 +4355,7 @@
 
 /// \brief Ignore parenthesizes, implicit casts, copy constructor and return the
 /// variable (which may be the loop variable) if possible.
-static const VarDecl *GetInitVarDecl(const Expr *E) {
+static const ValueDecl *GetInitLCDecl(Expr *E) {
   if (!E)
     return nullptr;
   E = getExprAsWritten(E);
@@ -3025,10 +4365,18 @@
            Ctor->isConvertingConstructor(/*AllowExplicit=*/false)) &&
           CE->getNumArgs() > 0 && CE->getArg(0) != nullptr)
         E = CE->getArg(0)->IgnoreParenImpCasts();
-  auto DRE = dyn_cast_or_null<DeclRefExpr>(E);
-  if (!DRE)
-    return nullptr;
-  return dyn_cast<VarDecl>(DRE->getDecl());
+  if (auto *DRE = dyn_cast_or_null<DeclRefExpr>(E)) {
+    if (auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+      if (auto *CED = dyn_cast<OMPCapturedExprDecl>(VD))
+        if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
+          return getCanonicalDecl(ME->getMemberDecl());
+      return getCanonicalDecl(VD);
+    }
+  }
+  if (auto *ME = dyn_cast_or_null<MemberExpr>(E))
+    if (ME->isArrow() && isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
+      return getCanonicalDecl(ME->getMemberDecl());
+  return nullptr;
 }
 
 bool OpenMPIterationSpaceChecker::CheckCond(Expr *S) {
@@ -3039,25 +4387,25 @@
   //   b relational-op var
   //
   if (!S) {
-    SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_cond) << Var;
+    SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_cond) << LCDecl;
     return true;
   }
   S = getExprAsWritten(S);
   SourceLocation CondLoc = S->getLocStart();
-  if (auto BO = dyn_cast<BinaryOperator>(S)) {
+  if (auto *BO = dyn_cast<BinaryOperator>(S)) {
     if (BO->isRelationalOp()) {
-      if (GetInitVarDecl(BO->getLHS()) == Var)
+      if (GetInitLCDecl(BO->getLHS()) == LCDecl)
         return SetUB(BO->getRHS(),
                      (BO->getOpcode() == BO_LT || BO->getOpcode() == BO_LE),
                      (BO->getOpcode() == BO_LT || BO->getOpcode() == BO_GT),
                      BO->getSourceRange(), BO->getOperatorLoc());
-      if (GetInitVarDecl(BO->getRHS()) == Var)
+      if (GetInitLCDecl(BO->getRHS()) == LCDecl)
         return SetUB(BO->getLHS(),
                      (BO->getOpcode() == BO_GT || BO->getOpcode() == BO_GE),
                      (BO->getOpcode() == BO_LT || BO->getOpcode() == BO_GT),
                      BO->getSourceRange(), BO->getOperatorLoc());
     }
-  } else if (auto CE = dyn_cast<CXXOperatorCallExpr>(S)) {
+  } else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
     if (CE->getNumArgs() == 2) {
       auto Op = CE->getOperator();
       switch (Op) {
@@ -3065,11 +4413,11 @@
       case OO_GreaterEqual:
       case OO_Less:
       case OO_LessEqual:
-        if (GetInitVarDecl(CE->getArg(0)) == Var)
+        if (GetInitLCDecl(CE->getArg(0)) == LCDecl)
           return SetUB(CE->getArg(1), Op == OO_Less || Op == OO_LessEqual,
                        Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
                        CE->getOperatorLoc());
-        if (GetInitVarDecl(CE->getArg(1)) == Var)
+        if (GetInitLCDecl(CE->getArg(1)) == LCDecl)
           return SetUB(CE->getArg(0), Op == OO_Greater || Op == OO_GreaterEqual,
                        Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
                        CE->getOperatorLoc());
@@ -3079,8 +4427,10 @@
       }
     }
   }
+  if (Dependent() || SemaRef.CurContext->isDependentContext())
+    return false;
   SemaRef.Diag(CondLoc, diag::err_omp_loop_not_canonical_cond)
-      << S->getSourceRange() << Var;
+      << S->getSourceRange() << LCDecl;
   return true;
 }
 
@@ -3091,25 +4441,27 @@
   //   var - incr
   //
   RHS = RHS->IgnoreParenImpCasts();
-  if (auto BO = dyn_cast<BinaryOperator>(RHS)) {
+  if (auto *BO = dyn_cast<BinaryOperator>(RHS)) {
     if (BO->isAdditiveOp()) {
       bool IsAdd = BO->getOpcode() == BO_Add;
-      if (GetInitVarDecl(BO->getLHS()) == Var)
+      if (GetInitLCDecl(BO->getLHS()) == LCDecl)
         return SetStep(BO->getRHS(), !IsAdd);
-      if (IsAdd && GetInitVarDecl(BO->getRHS()) == Var)
+      if (IsAdd && GetInitLCDecl(BO->getRHS()) == LCDecl)
         return SetStep(BO->getLHS(), false);
     }
-  } else if (auto CE = dyn_cast<CXXOperatorCallExpr>(RHS)) {
+  } else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(RHS)) {
     bool IsAdd = CE->getOperator() == OO_Plus;
     if ((IsAdd || CE->getOperator() == OO_Minus) && CE->getNumArgs() == 2) {
-      if (GetInitVarDecl(CE->getArg(0)) == Var)
+      if (GetInitLCDecl(CE->getArg(0)) == LCDecl)
         return SetStep(CE->getArg(1), !IsAdd);
-      if (IsAdd && GetInitVarDecl(CE->getArg(1)) == Var)
+      if (IsAdd && GetInitLCDecl(CE->getArg(1)) == LCDecl)
         return SetStep(CE->getArg(0), false);
     }
   }
+  if (Dependent() || SemaRef.CurContext->isDependentContext())
+    return false;
   SemaRef.Diag(RHS->getLocStart(), diag::err_omp_loop_not_canonical_incr)
-      << RHS->getSourceRange() << Var;
+      << RHS->getSourceRange() << LCDecl;
   return true;
 }
 
@@ -3128,135 +4480,100 @@
   //   var = var - incr
   //
   if (!S) {
-    SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_incr) << Var;
+    SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_incr) << LCDecl;
     return true;
   }
+  if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(S))
+    if (!ExprTemp->cleanupsHaveSideEffects())
+      S = ExprTemp->getSubExpr();
+
   IncrementSrcRange = S->getSourceRange();
   S = S->IgnoreParens();
-  if (auto UO = dyn_cast<UnaryOperator>(S)) {
-    if (UO->isIncrementDecrementOp() && GetInitVarDecl(UO->getSubExpr()) == Var)
-      return SetStep(
-          SemaRef.ActOnIntegerConstant(UO->getLocStart(),
-                                       (UO->isDecrementOp() ? -1 : 1)).get(),
-          false);
-  } else if (auto BO = dyn_cast<BinaryOperator>(S)) {
+  if (auto *UO = dyn_cast<UnaryOperator>(S)) {
+    if (UO->isIncrementDecrementOp() &&
+        GetInitLCDecl(UO->getSubExpr()) == LCDecl)
+      return SetStep(SemaRef
+                         .ActOnIntegerConstant(UO->getLocStart(),
+                                               (UO->isDecrementOp() ? -1 : 1))
+                         .get(),
+                     false);
+  } else if (auto *BO = dyn_cast<BinaryOperator>(S)) {
     switch (BO->getOpcode()) {
     case BO_AddAssign:
     case BO_SubAssign:
-      if (GetInitVarDecl(BO->getLHS()) == Var)
+      if (GetInitLCDecl(BO->getLHS()) == LCDecl)
         return SetStep(BO->getRHS(), BO->getOpcode() == BO_SubAssign);
       break;
     case BO_Assign:
-      if (GetInitVarDecl(BO->getLHS()) == Var)
+      if (GetInitLCDecl(BO->getLHS()) == LCDecl)
         return CheckIncRHS(BO->getRHS());
       break;
     default:
       break;
     }
-  } else if (auto CE = dyn_cast<CXXOperatorCallExpr>(S)) {
+  } else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
     switch (CE->getOperator()) {
     case OO_PlusPlus:
     case OO_MinusMinus:
-      if (GetInitVarDecl(CE->getArg(0)) == Var)
-        return SetStep(
-            SemaRef.ActOnIntegerConstant(
-                        CE->getLocStart(),
-                        ((CE->getOperator() == OO_MinusMinus) ? -1 : 1)).get(),
-            false);
+      if (GetInitLCDecl(CE->getArg(0)) == LCDecl)
+        return SetStep(SemaRef
+                           .ActOnIntegerConstant(
+                               CE->getLocStart(),
+                               ((CE->getOperator() == OO_MinusMinus) ? -1 : 1))
+                           .get(),
+                       false);
       break;
     case OO_PlusEqual:
     case OO_MinusEqual:
-      if (GetInitVarDecl(CE->getArg(0)) == Var)
+      if (GetInitLCDecl(CE->getArg(0)) == LCDecl)
         return SetStep(CE->getArg(1), CE->getOperator() == OO_MinusEqual);
       break;
     case OO_Equal:
-      if (GetInitVarDecl(CE->getArg(0)) == Var)
+      if (GetInitLCDecl(CE->getArg(0)) == LCDecl)
         return CheckIncRHS(CE->getArg(1));
       break;
     default:
       break;
     }
   }
+  if (Dependent() || SemaRef.CurContext->isDependentContext())
+    return false;
   SemaRef.Diag(S->getLocStart(), diag::err_omp_loop_not_canonical_incr)
-      << S->getSourceRange() << Var;
+      << S->getSourceRange() << LCDecl;
   return true;
 }
 
-namespace {
-// Transform variables declared in GNU statement expressions to new ones to
-// avoid crash on codegen.
-class TransformToNewDefs : public TreeTransform<TransformToNewDefs> {
-  typedef TreeTransform<TransformToNewDefs> BaseTransform;
-
-public:
-  TransformToNewDefs(Sema &SemaRef) : BaseTransform(SemaRef) {}
-
-  Decl *TransformDefinition(SourceLocation Loc, Decl *D) {
-    if (auto *VD = cast<VarDecl>(D))
-      if (!isa<ParmVarDecl>(D) && !isa<VarTemplateSpecializationDecl>(D) &&
-          !isa<ImplicitParamDecl>(D)) {
-        auto *NewVD = VarDecl::Create(
-            SemaRef.Context, VD->getDeclContext(), VD->getLocStart(),
-            VD->getLocation(), VD->getIdentifier(), VD->getType(),
-            VD->getTypeSourceInfo(), VD->getStorageClass());
-        NewVD->setTSCSpec(VD->getTSCSpec());
-        NewVD->setInit(VD->getInit());
-        NewVD->setInitStyle(VD->getInitStyle());
-        NewVD->setExceptionVariable(VD->isExceptionVariable());
-        NewVD->setNRVOVariable(VD->isNRVOVariable());
-        NewVD->setCXXForRangeDecl(VD->isInExternCXXContext());
-        NewVD->setConstexpr(VD->isConstexpr());
-        NewVD->setInitCapture(VD->isInitCapture());
-        NewVD->setPreviousDeclInSameBlockScope(
-            VD->isPreviousDeclInSameBlockScope());
-        VD->getDeclContext()->addHiddenDecl(NewVD);
-        if (VD->hasAttrs())
-          NewVD->setAttrs(VD->getAttrs());
-        transformedLocalDecl(VD, NewVD);
-        return NewVD;
-      }
-    return BaseTransform::TransformDefinition(Loc, D);
-  }
-
-  ExprResult TransformDeclRefExpr(DeclRefExpr *E) {
-    if (auto *NewD = TransformDecl(E->getExprLoc(), E->getDecl()))
-      if (E->getDecl() != NewD) {
-        NewD->setReferenced();
-        NewD->markUsed(SemaRef.Context);
-        return DeclRefExpr::Create(
-            SemaRef.Context, E->getQualifierLoc(), E->getTemplateKeywordLoc(),
-            cast<ValueDecl>(NewD), E->refersToEnclosingVariableOrCapture(),
-            E->getNameInfo(), E->getType(), E->getValueKind());
-      }
-    return BaseTransform::TransformDeclRefExpr(E);
-  }
-};
+static ExprResult
+tryBuildCapture(Sema &SemaRef, Expr *Capture,
+                llvm::MapVector<Expr *, DeclRefExpr *> &Captures) {
+  if (SemaRef.CurContext->isDependentContext())
+    return ExprResult(Capture);
+  if (Capture->isEvaluatable(SemaRef.Context, Expr::SE_AllowSideEffects))
+    return SemaRef.PerformImplicitConversion(
+        Capture->IgnoreImpCasts(), Capture->getType(), Sema::AA_Converting,
+        /*AllowExplicit=*/true);
+  auto I = Captures.find(Capture);
+  if (I != Captures.end())
+    return buildCapture(SemaRef, Capture, I->second);
+  DeclRefExpr *Ref = nullptr;
+  ExprResult Res = buildCapture(SemaRef, Capture, Ref);
+  Captures[Capture] = Ref;
+  return Res;
 }
 
 /// \brief Build the expression to calculate the number of iterations.
-Expr *
-OpenMPIterationSpaceChecker::BuildNumIterations(Scope *S,
-                                                const bool LimitedType) const {
-  TransformToNewDefs Transform(SemaRef);
+Expr *OpenMPIterationSpaceChecker::BuildNumIterations(
+    Scope *S, const bool LimitedType,
+    llvm::MapVector<Expr *, DeclRefExpr *> &Captures) const {
   ExprResult Diff;
-  auto VarType = Var->getType().getNonReferenceType();
+  auto VarType = LCDecl->getType().getNonReferenceType();
   if (VarType->isIntegerType() || VarType->isPointerType() ||
       SemaRef.getLangOpts().CPlusPlus) {
     // Upper - Lower
     auto *UBExpr = TestIsLessOp ? UB : LB;
     auto *LBExpr = TestIsLessOp ? LB : UB;
-    Expr *Upper = Transform.TransformExpr(UBExpr).get();
-    Expr *Lower = Transform.TransformExpr(LBExpr).get();
-    if (!Upper || !Lower)
-      return nullptr;
-    Upper = SemaRef.PerformImplicitConversion(Upper, UBExpr->getType(),
-                                                    Sema::AA_Converting,
-                                                    /*AllowExplicit=*/true)
-                      .get();
-    Lower = SemaRef.PerformImplicitConversion(Lower, LBExpr->getType(),
-                                              Sema::AA_Converting,
-                                              /*AllowExplicit=*/true)
-                .get();
+    Expr *Upper = tryBuildCapture(SemaRef, UBExpr, Captures).get();
+    Expr *Lower = tryBuildCapture(SemaRef, LBExpr, Captures).get();
     if (!Upper || !Lower)
       return nullptr;
 
@@ -3283,13 +4600,8 @@
     return nullptr;
 
   // Upper - Lower [- 1] + Step
-  auto NewStep = Transform.TransformExpr(Step->IgnoreImplicit());
-  if (NewStep.isInvalid())
-    return nullptr;
-  NewStep = SemaRef.PerformImplicitConversion(
-      NewStep.get(), Step->IgnoreImplicit()->getType(), Sema::AA_Converting,
-      /*AllowExplicit=*/true);
-  if (NewStep.isInvalid())
+  auto NewStep = tryBuildCapture(SemaRef, Step, Captures);
+  if (!NewStep.isUsable())
     return nullptr;
   Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Add, Diff.get(), NewStep.get());
   if (!Diff.isUsable())
@@ -3301,14 +4613,6 @@
     return nullptr;
 
   // (Upper - Lower [- 1] + Step) / Step
-  NewStep = Transform.TransformExpr(Step->IgnoreImplicit());
-  if (NewStep.isInvalid())
-    return nullptr;
-  NewStep = SemaRef.PerformImplicitConversion(
-      NewStep.get(), Step->IgnoreImplicit()->getType(), Sema::AA_Converting,
-      /*AllowExplicit=*/true);
-  if (NewStep.isInvalid())
-    return nullptr;
   Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Div, Diff.get(), NewStep.get());
   if (!Diff.isUsable())
     return nullptr;
@@ -3324,10 +4628,12 @@
     bool IsSigned = UseVarType ? VarType->hasSignedIntegerRepresentation()
                                : Type->hasSignedIntegerRepresentation();
     Type = C.getIntTypeForBitwidth(NewSize, IsSigned);
-    Diff = SemaRef.PerformImplicitConversion(
-        Diff.get(), Type, Sema::AA_Converting, /*AllowExplicit=*/true);
-    if (!Diff.isUsable())
-      return nullptr;
+    if (!SemaRef.Context.hasSameType(Diff.get()->getType(), Type)) {
+      Diff = SemaRef.PerformImplicitConversion(
+          Diff.get(), Type, Sema::AA_Converting, /*AllowExplicit=*/true);
+      if (!Diff.isUsable())
+        return nullptr;
+    }
   }
   if (LimitedType) {
     unsigned NewSize = (C.getTypeSize(Type) > 32) ? 64 : 32;
@@ -3340,42 +4646,40 @@
       QualType NewType = C.getIntTypeForBitwidth(
           NewSize, Type->hasSignedIntegerRepresentation() ||
                        C.getTypeSize(Type) < NewSize);
-      Diff = SemaRef.PerformImplicitConversion(Diff.get(), NewType,
-                                               Sema::AA_Converting, true);
-      if (!Diff.isUsable())
-        return nullptr;
+      if (!SemaRef.Context.hasSameType(Diff.get()->getType(), NewType)) {
+        Diff = SemaRef.PerformImplicitConversion(Diff.get(), NewType,
+                                                 Sema::AA_Converting, true);
+        if (!Diff.isUsable())
+          return nullptr;
+      }
     }
   }
 
   return Diff.get();
 }
 
-Expr *OpenMPIterationSpaceChecker::BuildPreCond(Scope *S, Expr *Cond) const {
+Expr *OpenMPIterationSpaceChecker::BuildPreCond(
+    Scope *S, Expr *Cond,
+    llvm::MapVector<Expr *, DeclRefExpr *> &Captures) const {
   // Try to build LB <op> UB, where <op> is <, >, <=, or >=.
   bool Suppress = SemaRef.getDiagnostics().getSuppressAllDiagnostics();
   SemaRef.getDiagnostics().setSuppressAllDiagnostics(/*Val=*/true);
-  TransformToNewDefs Transform(SemaRef);
 
-  auto NewLB = Transform.TransformExpr(LB);
-  auto NewUB = Transform.TransformExpr(UB);
-  if (NewLB.isInvalid() || NewUB.isInvalid())
-    return Cond;
-  NewLB = SemaRef.PerformImplicitConversion(NewLB.get(), LB->getType(),
-                                            Sema::AA_Converting,
-                                            /*AllowExplicit=*/true);
-  NewUB = SemaRef.PerformImplicitConversion(NewUB.get(), UB->getType(),
-                                            Sema::AA_Converting,
-                                            /*AllowExplicit=*/true);
-  if (NewLB.isInvalid() || NewUB.isInvalid())
-    return Cond;
+  auto NewLB = tryBuildCapture(SemaRef, LB, Captures);
+  auto NewUB = tryBuildCapture(SemaRef, UB, Captures);
+  if (!NewLB.isUsable() || !NewUB.isUsable())
+    return nullptr;
+
   auto CondExpr = SemaRef.BuildBinOp(
       S, DefaultLoc, TestIsLessOp ? (TestIsStrictOp ? BO_LT : BO_LE)
                                   : (TestIsStrictOp ? BO_GT : BO_GE),
       NewLB.get(), NewUB.get());
   if (CondExpr.isUsable()) {
-    CondExpr = SemaRef.PerformImplicitConversion(
-        CondExpr.get(), SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting,
-        /*AllowExplicit=*/true);
+    if (!SemaRef.Context.hasSameUnqualifiedType(CondExpr.get()->getType(),
+                                                SemaRef.Context.BoolTy))
+      CondExpr = SemaRef.PerformImplicitConversion(
+          CondExpr.get(), SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting,
+          /*AllowExplicit=*/true);
   }
   SemaRef.getDiagnostics().setSuppressAllDiagnostics(Suppress);
   // Otherwise use original loop conditon and evaluate it in runtime.
@@ -3383,17 +4687,30 @@
 }
 
 /// \brief Build reference expression to the counter be used for codegen.
-Expr *OpenMPIterationSpaceChecker::BuildCounterVar() const {
-  return buildDeclRefExpr(SemaRef, Var, Var->getType().getNonReferenceType(),
+DeclRefExpr *OpenMPIterationSpaceChecker::BuildCounterVar(
+    llvm::MapVector<Expr *, DeclRefExpr *> &Captures, DSAStackTy &DSA) const {
+  auto *VD = dyn_cast<VarDecl>(LCDecl);
+  if (!VD) {
+    VD = SemaRef.IsOpenMPCapturedDecl(LCDecl);
+    auto *Ref = buildDeclRefExpr(
+        SemaRef, VD, VD->getType().getNonReferenceType(), DefaultLoc);
+    DSAStackTy::DSAVarData Data = DSA.getTopDSA(LCDecl, /*FromParent=*/false);
+    // If the loop control decl is explicitly marked as private, do not mark it
+    // as captured again.
+    if (!isOpenMPPrivate(Data.CKind) || !Data.RefExpr)
+      Captures.insert(std::make_pair(LCRef, Ref));
+    return Ref;
+  }
+  return buildDeclRefExpr(SemaRef, VD, VD->getType().getNonReferenceType(),
                           DefaultLoc);
 }
 
 Expr *OpenMPIterationSpaceChecker::BuildPrivateCounterVar() const {
-  if (Var && !Var->isInvalidDecl()) {
-    auto Type = Var->getType().getNonReferenceType();
+  if (LCDecl && !LCDecl->isInvalidDecl()) {
+    auto Type = LCDecl->getType().getNonReferenceType();
     auto *PrivateVar =
-        buildVarDecl(SemaRef, DefaultLoc, Type, Var->getName(),
-                     Var->hasAttrs() ? &Var->getAttrs() : nullptr);
+        buildVarDecl(SemaRef, DefaultLoc, Type, LCDecl->getName(),
+                     LCDecl->hasAttrs() ? &LCDecl->getAttrs() : nullptr);
     if (PrivateVar->isInvalidDecl())
       return nullptr;
     return buildDeclRefExpr(SemaRef, PrivateVar, Type, DefaultLoc);
@@ -3401,30 +4718,30 @@
   return nullptr;
 }
 
-/// \brief Build initization of the counter be used for codegen.
+/// \brief Build initialization of the counter to be used for codegen.
 Expr *OpenMPIterationSpaceChecker::BuildCounterInit() const { return LB; }
 
 /// \brief Build step of the counter be used for codegen.
 Expr *OpenMPIterationSpaceChecker::BuildCounterStep() const { return Step; }
 
 /// \brief Iteration space of a single for loop.
-struct LoopIterationSpace {
+struct LoopIterationSpace final {
   /// \brief Condition of the loop.
-  Expr *PreCond;
+  Expr *PreCond = nullptr;
   /// \brief This expression calculates the number of iterations in the loop.
   /// It is always possible to calculate it before starting the loop.
-  Expr *NumIterations;
+  Expr *NumIterations = nullptr;
   /// \brief The loop counter variable.
-  Expr *CounterVar;
+  Expr *CounterVar = nullptr;
   /// \brief Private loop counter variable.
-  Expr *PrivateCounterVar;
+  Expr *PrivateCounterVar = nullptr;
   /// \brief This is initializer for the initial value of #CounterVar.
-  Expr *CounterInit;
+  Expr *CounterInit = nullptr;
   /// \brief This is step for the #CounterVar used to generate its update:
   /// #CounterVar = #CounterInit + #CounterStep * CurrentIteration.
-  Expr *CounterStep;
+  Expr *CounterStep = nullptr;
   /// \brief Should step be subtracted?
-  bool Subtract;
+  bool Subtract = false;
   /// \brief Source range of the loop init.
   SourceRange InitSrcRange;
   /// \brief Source range of the loop condition.
@@ -3442,8 +4759,21 @@
   if (AssociatedLoops > 0 &&
       isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
     OpenMPIterationSpaceChecker ISC(*this, ForLoc);
-    if (!ISC.CheckInit(Init, /*EmitDiags=*/false))
-      DSAStack->addLoopControlVariable(ISC.GetLoopVar());
+    if (!ISC.CheckInit(Init, /*EmitDiags=*/false)) {
+      if (auto *D = ISC.GetLoopDecl()) {
+        auto *VD = dyn_cast<VarDecl>(D);
+        if (!VD) {
+          if (auto *Private = IsOpenMPCapturedDecl(D))
+            VD = Private;
+          else {
+            auto *Ref = buildCapture(*this, D, ISC.GetLoopDeclRefExpr(),
+                                     /*WithInit=*/false);
+            VD = cast<VarDecl>(Ref->getDecl());
+          }
+        }
+        DSAStack->addLoopControlVariable(D, VD);
+      }
+    }
     DSAStack->setAssociatedLoops(AssociatedLoops - 1);
   }
 }
@@ -3454,11 +4784,12 @@
     OpenMPDirectiveKind DKind, Stmt *S, Sema &SemaRef, DSAStackTy &DSA,
     unsigned CurrentNestedLoopCount, unsigned NestedLoopCount,
     Expr *CollapseLoopCountExpr, Expr *OrderedLoopCountExpr,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA,
-    LoopIterationSpace &ResultIterSpace) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA,
+    LoopIterationSpace &ResultIterSpace,
+    llvm::MapVector<Expr *, DeclRefExpr *> &Captures) {
   // OpenMP [2.6, Canonical Loop Form]
   //   for (init-expr; test-expr; incr-expr) structured-block
-  auto For = dyn_cast_or_null<ForStmt>(S);
+  auto *For = dyn_cast_or_null<ForStmt>(S);
   if (!For) {
     SemaRef.Diag(S->getLocStart(), diag::err_omp_not_for)
         << (CollapseLoopCountExpr != nullptr || OrderedLoopCountExpr != nullptr)
@@ -3487,98 +4818,102 @@
 
   // Check init.
   auto Init = For->getInit();
-  if (ISC.CheckInit(Init)) {
+  if (ISC.CheckInit(Init))
     return true;
-  }
 
   bool HasErrors = false;
 
   // Check loop variable's type.
-  auto Var = ISC.GetLoopVar();
+  if (auto *LCDecl = ISC.GetLoopDecl()) {
+    auto *LoopDeclRefExpr = ISC.GetLoopDeclRefExpr();
 
-  // OpenMP [2.6, Canonical Loop Form]
-  // Var is one of the following:
-  //   A variable of signed or unsigned integer type.
-  //   For C++, a variable of a random access iterator type.
-  //   For C, a variable of a pointer type.
-  auto VarType = Var->getType().getNonReferenceType();
-  if (!VarType->isDependentType() && !VarType->isIntegerType() &&
-      !VarType->isPointerType() &&
-      !(SemaRef.getLangOpts().CPlusPlus && VarType->isOverloadableType())) {
-    SemaRef.Diag(Init->getLocStart(), diag::err_omp_loop_variable_type)
-        << SemaRef.getLangOpts().CPlusPlus;
-    HasErrors = true;
+    // OpenMP [2.6, Canonical Loop Form]
+    // Var is one of the following:
+    //   A variable of signed or unsigned integer type.
+    //   For C++, a variable of a random access iterator type.
+    //   For C, a variable of a pointer type.
+    auto VarType = LCDecl->getType().getNonReferenceType();
+    if (!VarType->isDependentType() && !VarType->isIntegerType() &&
+        !VarType->isPointerType() &&
+        !(SemaRef.getLangOpts().CPlusPlus && VarType->isOverloadableType())) {
+      SemaRef.Diag(Init->getLocStart(), diag::err_omp_loop_variable_type)
+          << SemaRef.getLangOpts().CPlusPlus;
+      HasErrors = true;
+    }
+
+    // OpenMP, 2.14.1.1 Data-sharing Attribute Rules for Variables Referenced in
+    // a Construct
+    // The loop iteration variable(s) in the associated for-loop(s) of a for or
+    // parallel for construct is (are) private.
+    // The loop iteration variable in the associated for-loop of a simd
+    // construct with just one associated for-loop is linear with a
+    // constant-linear-step that is the increment of the associated for-loop.
+    // Exclude loop var from the list of variables with implicitly defined data
+    // sharing attributes.
+    VarsWithImplicitDSA.erase(LCDecl);
+
+    // OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables Referenced
+    // in a Construct, C/C++].
+    // The loop iteration variable in the associated for-loop of a simd
+    // construct with just one associated for-loop may be listed in a linear
+    // clause with a constant-linear-step that is the increment of the
+    // associated for-loop.
+    // The loop iteration variable(s) in the associated for-loop(s) of a for or
+    // parallel for construct may be listed in a private or lastprivate clause.
+    DSAStackTy::DSAVarData DVar = DSA.getTopDSA(LCDecl, false);
+    // If LoopDeclRefExpr is nullptr it means the corresponding loop variable is
+    // declared in the loop and it is predetermined as a private.
+    auto PredeterminedCKind =
+        isOpenMPSimdDirective(DKind)
+            ? ((NestedLoopCount == 1) ? OMPC_linear : OMPC_lastprivate)
+            : OMPC_private;
+    if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
+          DVar.CKind != PredeterminedCKind) ||
+         ((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop ||
+           isOpenMPDistributeDirective(DKind)) &&
+          !isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
+          DVar.CKind != OMPC_private && DVar.CKind != OMPC_lastprivate)) &&
+        (DVar.CKind != OMPC_private || DVar.RefExpr != nullptr)) {
+      SemaRef.Diag(Init->getLocStart(), diag::err_omp_loop_var_dsa)
+          << getOpenMPClauseName(DVar.CKind) << getOpenMPDirectiveName(DKind)
+          << getOpenMPClauseName(PredeterminedCKind);
+      if (DVar.RefExpr == nullptr)
+        DVar.CKind = PredeterminedCKind;
+      ReportOriginalDSA(SemaRef, &DSA, LCDecl, DVar, /*IsLoopIterVar=*/true);
+      HasErrors = true;
+    } else if (LoopDeclRefExpr != nullptr) {
+      // Make the loop iteration variable private (for worksharing constructs),
+      // linear (for simd directives with the only one associated loop) or
+      // lastprivate (for simd directives with several collapsed or ordered
+      // loops).
+      if (DVar.CKind == OMPC_unknown)
+        DVar = DSA.hasDSA(LCDecl, isOpenMPPrivate,
+                          [](OpenMPDirectiveKind) -> bool { return true; },
+                          /*FromParent=*/false);
+      DSA.addDSA(LCDecl, LoopDeclRefExpr, PredeterminedCKind);
+    }
+
+    assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
+
+    // Check test-expr.
+    HasErrors |= ISC.CheckCond(For->getCond());
+
+    // Check incr-expr.
+    HasErrors |= ISC.CheckInc(For->getInc());
   }
 
-  // OpenMP, 2.14.1.1 Data-sharing Attribute Rules for Variables Referenced in a
-  // Construct
-  // The loop iteration variable(s) in the associated for-loop(s) of a for or
-  // parallel for construct is (are) private.
-  // The loop iteration variable in the associated for-loop of a simd construct
-  // with just one associated for-loop is linear with a constant-linear-step
-  // that is the increment of the associated for-loop.
-  // Exclude loop var from the list of variables with implicitly defined data
-  // sharing attributes.
-  VarsWithImplicitDSA.erase(Var);
-
-  // OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables Referenced in
-  // a Construct, C/C++].
-  // The loop iteration variable in the associated for-loop of a simd construct
-  // with just one associated for-loop may be listed in a linear clause with a
-  // constant-linear-step that is the increment of the associated for-loop.
-  // The loop iteration variable(s) in the associated for-loop(s) of a for or
-  // parallel for construct may be listed in a private or lastprivate clause.
-  DSAStackTy::DSAVarData DVar = DSA.getTopDSA(Var, false);
-  auto LoopVarRefExpr = ISC.GetLoopVarRefExpr();
-  // If LoopVarRefExpr is nullptr it means the corresponding loop variable is
-  // declared in the loop and it is predetermined as a private.
-  auto PredeterminedCKind =
-      isOpenMPSimdDirective(DKind)
-          ? ((NestedLoopCount == 1) ? OMPC_linear : OMPC_lastprivate)
-          : OMPC_private;
-  if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
-        DVar.CKind != PredeterminedCKind) ||
-       ((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop ||
-         isOpenMPDistributeDirective(DKind)) &&
-        !isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
-        DVar.CKind != OMPC_private && DVar.CKind != OMPC_lastprivate)) &&
-      (DVar.CKind != OMPC_private || DVar.RefExpr != nullptr)) {
-    SemaRef.Diag(Init->getLocStart(), diag::err_omp_loop_var_dsa)
-        << getOpenMPClauseName(DVar.CKind) << getOpenMPDirectiveName(DKind)
-        << getOpenMPClauseName(PredeterminedCKind);
-    if (DVar.RefExpr == nullptr)
-      DVar.CKind = PredeterminedCKind;
-    ReportOriginalDSA(SemaRef, &DSA, Var, DVar, /*IsLoopIterVar=*/true);
-    HasErrors = true;
-  } else if (LoopVarRefExpr != nullptr) {
-    // Make the loop iteration variable private (for worksharing constructs),
-    // linear (for simd directives with the only one associated loop) or
-    // lastprivate (for simd directives with several collapsed or ordered
-    // loops).
-    if (DVar.CKind == OMPC_unknown)
-      DVar = DSA.hasDSA(Var, isOpenMPPrivate, MatchesAlways(),
-                        /*FromParent=*/false);
-    DSA.addDSA(Var, LoopVarRefExpr, PredeterminedCKind);
-  }
-
-  assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
-
-  // Check test-expr.
-  HasErrors |= ISC.CheckCond(For->getCond());
-
-  // Check incr-expr.
-  HasErrors |= ISC.CheckInc(For->getInc());
-
   if (ISC.Dependent() || SemaRef.CurContext->isDependentContext() || HasErrors)
     return HasErrors;
 
   // Build the loop's iteration space representation.
-  ResultIterSpace.PreCond = ISC.BuildPreCond(DSA.getCurScope(), For->getCond());
+  ResultIterSpace.PreCond =
+      ISC.BuildPreCond(DSA.getCurScope(), For->getCond(), Captures);
   ResultIterSpace.NumIterations = ISC.BuildNumIterations(
-      DSA.getCurScope(), (isOpenMPWorksharingDirective(DKind) ||
-                          isOpenMPTaskLoopDirective(DKind) ||
-                          isOpenMPDistributeDirective(DKind)));
-  ResultIterSpace.CounterVar = ISC.BuildCounterVar();
+      DSA.getCurScope(),
+      (isOpenMPWorksharingDirective(DKind) ||
+       isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind)),
+      Captures);
+  ResultIterSpace.CounterVar = ISC.BuildCounterVar(Captures, DSA);
   ResultIterSpace.PrivateCounterVar = ISC.BuildPrivateCounterVar();
   ResultIterSpace.CounterInit = ISC.BuildCounterInit();
   ResultIterSpace.CounterStep = ISC.BuildCounterStep();
@@ -3598,24 +4933,22 @@
 }
 
 /// \brief Build 'VarRef = Start.
-static ExprResult BuildCounterInit(Sema &SemaRef, Scope *S, SourceLocation Loc,
-                                   ExprResult VarRef, ExprResult Start) {
-  TransformToNewDefs Transform(SemaRef);
+static ExprResult
+BuildCounterInit(Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef,
+                 ExprResult Start,
+                 llvm::MapVector<Expr *, DeclRefExpr *> &Captures) {
   // Build 'VarRef = Start.
-  auto NewStart = Transform.TransformExpr(Start.get()->IgnoreImplicit());
-  if (NewStart.isInvalid())
-    return ExprError();
-  NewStart = SemaRef.PerformImplicitConversion(
-      NewStart.get(), Start.get()->IgnoreImplicit()->getType(),
-      Sema::AA_Converting,
-      /*AllowExplicit=*/true);
-  if (NewStart.isInvalid())
-    return ExprError();
-  NewStart = SemaRef.PerformImplicitConversion(
-      NewStart.get(), VarRef.get()->getType(), Sema::AA_Converting,
-      /*AllowExplicit=*/true);
+  auto NewStart = tryBuildCapture(SemaRef, Start.get(), Captures);
   if (!NewStart.isUsable())
     return ExprError();
+  if (!SemaRef.Context.hasSameType(NewStart.get()->getType(),
+                                   VarRef.get()->getType())) {
+    NewStart = SemaRef.PerformImplicitConversion(
+        NewStart.get(), VarRef.get()->getType(), Sema::AA_Converting,
+        /*AllowExplicit=*/true);
+    if (!NewStart.isUsable())
+      return ExprError();
+  }
 
   auto Init =
       SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), NewStart.get());
@@ -3623,24 +4956,20 @@
 }
 
 /// \brief Build 'VarRef = Start + Iter * Step'.
-static ExprResult BuildCounterUpdate(Sema &SemaRef, Scope *S,
-                                     SourceLocation Loc, ExprResult VarRef,
-                                     ExprResult Start, ExprResult Iter,
-                                     ExprResult Step, bool Subtract) {
+static ExprResult
+BuildCounterUpdate(Sema &SemaRef, Scope *S, SourceLocation Loc,
+                   ExprResult VarRef, ExprResult Start, ExprResult Iter,
+                   ExprResult Step, bool Subtract,
+                   llvm::MapVector<Expr *, DeclRefExpr *> *Captures = nullptr) {
   // Add parentheses (for debugging purposes only).
   Iter = SemaRef.ActOnParenExpr(Loc, Loc, Iter.get());
   if (!VarRef.isUsable() || !Start.isUsable() || !Iter.isUsable() ||
       !Step.isUsable())
     return ExprError();
 
-  TransformToNewDefs Transform(SemaRef);
-  auto NewStep = Transform.TransformExpr(Step.get()->IgnoreImplicit());
-  if (NewStep.isInvalid())
-    return ExprError();
-  NewStep = SemaRef.PerformImplicitConversion(
-      NewStep.get(), Step.get()->IgnoreImplicit()->getType(),
-      Sema::AA_Converting,
-      /*AllowExplicit=*/true);
+  ExprResult NewStep = Step;
+  if (Captures)
+    NewStep = tryBuildCapture(SemaRef, Step.get(), *Captures);
   if (NewStep.isInvalid())
     return ExprError();
   ExprResult Update =
@@ -3648,34 +4977,59 @@
   if (!Update.isUsable())
     return ExprError();
 
-  // Build 'VarRef = Start + Iter * Step'.
-  auto NewStart = Transform.TransformExpr(Start.get()->IgnoreImplicit());
+  // Try to build 'VarRef = Start, VarRef (+|-)= Iter * Step' or
+  // 'VarRef = Start (+|-) Iter * Step'.
+  ExprResult NewStart = Start;
+  if (Captures)
+    NewStart = tryBuildCapture(SemaRef, Start.get(), *Captures);
   if (NewStart.isInvalid())
     return ExprError();
-  NewStart = SemaRef.PerformImplicitConversion(
-      NewStart.get(), Start.get()->IgnoreImplicit()->getType(),
-      Sema::AA_Converting,
-      /*AllowExplicit=*/true);
-  if (NewStart.isInvalid())
-    return ExprError();
-  Update = SemaRef.BuildBinOp(S, Loc, (Subtract ? BO_Sub : BO_Add),
-                              NewStart.get(), Update.get());
-  if (!Update.isUsable())
-    return ExprError();
 
-  Update = SemaRef.PerformImplicitConversion(
-      Update.get(), VarRef.get()->getType(), Sema::AA_Converting, true);
-  if (!Update.isUsable())
-    return ExprError();
+  // First attempt: try to build 'VarRef = Start, VarRef += Iter * Step'.
+  ExprResult SavedUpdate = Update;
+  ExprResult UpdateVal;
+  if (VarRef.get()->getType()->isOverloadableType() ||
+      NewStart.get()->getType()->isOverloadableType() ||
+      Update.get()->getType()->isOverloadableType()) {
+    bool Suppress = SemaRef.getDiagnostics().getSuppressAllDiagnostics();
+    SemaRef.getDiagnostics().setSuppressAllDiagnostics(/*Val=*/true);
+    Update =
+        SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), NewStart.get());
+    if (Update.isUsable()) {
+      UpdateVal =
+          SemaRef.BuildBinOp(S, Loc, Subtract ? BO_SubAssign : BO_AddAssign,
+                             VarRef.get(), SavedUpdate.get());
+      if (UpdateVal.isUsable()) {
+        Update = SemaRef.CreateBuiltinBinOp(Loc, BO_Comma, Update.get(),
+                                            UpdateVal.get());
+      }
+    }
+    SemaRef.getDiagnostics().setSuppressAllDiagnostics(Suppress);
+  }
 
-  Update = SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), Update.get());
+  // Second attempt: try to build 'VarRef = Start (+|-) Iter * Step'.
+  if (!Update.isUsable() || !UpdateVal.isUsable()) {
+    Update = SemaRef.BuildBinOp(S, Loc, Subtract ? BO_Sub : BO_Add,
+                                NewStart.get(), SavedUpdate.get());
+    if (!Update.isUsable())
+      return ExprError();
+
+    if (!SemaRef.Context.hasSameType(Update.get()->getType(),
+                                     VarRef.get()->getType())) {
+      Update = SemaRef.PerformImplicitConversion(
+          Update.get(), VarRef.get()->getType(), Sema::AA_Converting, true);
+      if (!Update.isUsable())
+        return ExprError();
+    }
+
+    Update = SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), Update.get());
+  }
   return Update;
 }
 
 /// \brief Convert integer expression \a E to make it have at least \a Bits
 /// bits.
-static ExprResult WidenIterationCount(unsigned Bits, Expr *E,
-                                      Sema &SemaRef) {
+static ExprResult WidenIterationCount(unsigned Bits, Expr *E, Sema &SemaRef) {
   if (E == nullptr)
     return ExprError();
   auto &C = SemaRef.Context;
@@ -3700,6 +5054,49 @@
   return false;
 }
 
+/// Build a DeclStmt grouping the given preinit declarations (null if none).
+static Stmt *buildPreInits(ASTContext &Context,
+                           SmallVectorImpl<Decl *> &PreInits) {
+  if (!PreInits.empty()) {
+    return new (Context) DeclStmt(
+        DeclGroupRef::Create(Context, PreInits.begin(), PreInits.size()),
+        SourceLocation(), SourceLocation());
+  }
+  return nullptr; // No declarations, so there is no statement to build.
+}
+
+/// Build preinits statement for the declarations referenced by \a Captures.
+static Stmt *buildPreInits(ASTContext &Context,
+                           llvm::MapVector<Expr *, DeclRefExpr *> &Captures) {
+  if (!Captures.empty()) {
+    SmallVector<Decl *, 16> PreInits;
+    for (auto &Pair : Captures)
+      PreInits.push_back(Pair.second->getDecl());
+    return buildPreInits(Context, PreInits);
+  }
+  return nullptr; // Nothing was captured, so no preinits are needed.
+}
+
+/// Build a comma chain of the given postupdate expressions, each cast to void.
+static Expr *buildPostUpdate(Sema &S, ArrayRef<Expr *> PostUpdates) {
+  Expr *PostUpdate = nullptr;
+  if (!PostUpdates.empty()) {
+    for (auto *E : PostUpdates) {
+      Expr *ConvE = S.BuildCStyleCastExpr(
+                         E->getExprLoc(),
+                         S.Context.getTrivialTypeSourceInfo(S.Context.VoidTy),
+                         E->getExprLoc(), E)
+                        .get();
+      PostUpdate = PostUpdate
+                       ? S.CreateBuiltinBinOp(ConvE->getExprLoc(), BO_Comma,
+                                              PostUpdate, ConvE)
+                             .get()
+                       : ConvE;
+    }
+  }
+  return PostUpdate; // Still null when PostUpdates was empty.
+}
+
 /// \brief Called on a for stmt to check itself and nested loops (if any).
 /// \return Returns 0 if one of the collapsed stmts is not canonical for loop,
 /// number of collapsed loops otherwise.
@@ -3707,7 +5104,7 @@
 CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
                 Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef,
                 DSAStackTy &DSA,
-                llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA,
+                llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA,
                 OMPLoopDirective::HelperExprs &Built) {
   unsigned NestedLoopCount = 1;
   if (CollapseLoopCountExpr) {
@@ -3733,6 +5130,7 @@
   }
   // This is helper routine for loop directives (e.g., 'for', 'simd',
   // 'for simd', etc.).
+  llvm::MapVector<Expr *, DeclRefExpr *> Captures;
   SmallVector<LoopIterationSpace, 4> IterSpaces;
   IterSpaces.resize(NestedLoopCount);
   Stmt *CurStmt = AStmt->IgnoreContainers(/* IgnoreCaptured */ true);
@@ -3740,7 +5138,7 @@
     if (CheckOpenMPIterationSpace(DKind, CurStmt, SemaRef, DSA, Cnt,
                                   NestedLoopCount, CollapseLoopCountExpr,
                                   OrderedLoopCountExpr, VarsWithImplicitDSA,
-                                  IterSpaces[Cnt]))
+                                  IterSpaces[Cnt], Captures))
       return 0;
     // Move on to the next nested for loop, or to the loop body.
     // OpenMP [2.8.1, simd construct, Restrictions]
@@ -3789,15 +5187,17 @@
   auto PreCond = ExprResult(IterSpaces[0].PreCond);
   auto N0 = IterSpaces[0].NumIterations;
   ExprResult LastIteration32 = WidenIterationCount(
-      32 /* Bits */, SemaRef.PerformImplicitConversion(
-                                N0->IgnoreImpCasts(), N0->getType(),
-                                Sema::AA_Converting, /*AllowExplicit=*/true)
+      32 /* Bits */, SemaRef
+                         .PerformImplicitConversion(
+                             N0->IgnoreImpCasts(), N0->getType(),
+                             Sema::AA_Converting, /*AllowExplicit=*/true)
                          .get(),
       SemaRef);
   ExprResult LastIteration64 = WidenIterationCount(
-      64 /* Bits */, SemaRef.PerformImplicitConversion(
-                                N0->IgnoreImpCasts(), N0->getType(),
-                                Sema::AA_Converting, /*AllowExplicit=*/true)
+      64 /* Bits */, SemaRef
+                         .PerformImplicitConversion(
+                             N0->IgnoreImpCasts(), N0->getType(),
+                             Sema::AA_Converting, /*AllowExplicit=*/true)
                          .get(),
       SemaRef);
 
@@ -3818,16 +5218,18 @@
     if (LastIteration32.isUsable())
       LastIteration32 = SemaRef.BuildBinOp(
           CurScope, SourceLocation(), BO_Mul, LastIteration32.get(),
-          SemaRef.PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
-                                            Sema::AA_Converting,
-                                            /*AllowExplicit=*/true)
+          SemaRef
+              .PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
+                                         Sema::AA_Converting,
+                                         /*AllowExplicit=*/true)
               .get());
     if (LastIteration64.isUsable())
       LastIteration64 = SemaRef.BuildBinOp(
           CurScope, SourceLocation(), BO_Mul, LastIteration64.get(),
-          SemaRef.PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
-                                            Sema::AA_Converting,
-                                            /*AllowExplicit=*/true)
+          SemaRef
+              .PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
+                                         Sema::AA_Converting,
+                                         /*AllowExplicit=*/true)
               .get());
   }
 
@@ -3841,6 +5243,15 @@
            LastIteration32.get()->getType()->hasSignedIntegerRepresentation(),
            LastIteration64.get(), SemaRef)))
     LastIteration = LastIteration32;
+  QualType VType = LastIteration.get()->getType();
+  QualType RealVType = VType;
+  QualType StrideVType = VType;
+  if (isOpenMPTaskLoopDirective(DKind)) {
+    VType =
+        SemaRef.Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
+    StrideVType =
+        SemaRef.Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
+  }
 
   if (!LastIteration.isUsable())
     return 0;
@@ -3862,19 +5273,13 @@
       LastIteration.get()->isIntegerConstantExpr(Result, SemaRef.Context);
   ExprResult CalcLastIteration;
   if (!IsConstant) {
-    SourceLocation SaveLoc;
-    VarDecl *SaveVar =
-        buildVarDecl(SemaRef, SaveLoc, LastIteration.get()->getType(),
-                     ".omp.last.iteration");
-    ExprResult SaveRef = buildDeclRefExpr(
-        SemaRef, SaveVar, LastIteration.get()->getType(), SaveLoc);
-    CalcLastIteration = SemaRef.BuildBinOp(CurScope, SaveLoc, BO_Assign,
-                                           SaveRef.get(), LastIteration.get());
+    ExprResult SaveRef =
+        tryBuildCapture(SemaRef, LastIteration.get(), Captures);
     LastIteration = SaveRef;
 
     // Prepare SaveRef + 1.
     NumIterations = SemaRef.BuildBinOp(
-        CurScope, SaveLoc, BO_Add, SaveRef.get(),
+        CurScope, SourceLocation(), BO_Add, SaveRef.get(),
         SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
     if (!NumIterations.isUsable())
       return 0;
@@ -3882,9 +5287,8 @@
 
   SourceLocation InitLoc = IterSpaces[0].InitSrcRange.getBegin();
 
-  QualType VType = LastIteration.get()->getType();
-  // Build variables passed into runtime, nesessary for worksharing directives.
-  ExprResult LB, UB, IL, ST, EUB;
+  // Build variables passed into runtime, necessary for worksharing directives.
+  ExprResult LB, UB, IL, ST, EUB, PrevLB, PrevUB;
   if (isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
       isOpenMPDistributeDirective(DKind)) {
     // Lower bound variable, initialized with zero.
@@ -3911,14 +5315,15 @@
         /*DirectInit*/ false, /*TypeMayContainAuto*/ false);
 
     // Stride variable returned by runtime (we initialize it to 1 by default).
-    VarDecl *STDecl = buildVarDecl(SemaRef, InitLoc, VType, ".omp.stride");
-    ST = buildDeclRefExpr(SemaRef, STDecl, VType, InitLoc);
+    VarDecl *STDecl =
+        buildVarDecl(SemaRef, InitLoc, StrideVType, ".omp.stride");
+    ST = buildDeclRefExpr(SemaRef, STDecl, StrideVType, InitLoc);
     SemaRef.AddInitializerToDecl(
         STDecl, SemaRef.ActOnIntegerConstant(InitLoc, 1).get(),
         /*DirectInit*/ false, /*TypeMayContainAuto*/ false);
 
     // Build expression: UB = min(UB, LastIteration)
-    // It is nesessary for CodeGen of directives with static scheduling.
+    // It is necessary for CodeGen of directives with static scheduling.
     ExprResult IsUBGreater = SemaRef.BuildBinOp(CurScope, InitLoc, BO_GT,
                                                 UB.get(), LastIteration.get());
     ExprResult CondOp = SemaRef.ActOnConditionalOp(
@@ -3926,19 +5331,44 @@
     EUB = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, UB.get(),
                              CondOp.get());
     EUB = SemaRef.ActOnFinishFullExpr(EUB.get());
+
+    // If we have a combined directive that combines 'distribute', 'for' or
+    // 'simd' we need to be able to access the bounds of the schedule of the
+    // enclosing region. E.g. in 'distribute parallel for' the bounds obtained
+    // by scheduling 'distribute' have to be passed to the schedule of 'for'.
+    if (isOpenMPLoopBoundSharingDirective(DKind)) {
+      auto *CD = cast<CapturedStmt>(AStmt)->getCapturedDecl();
+
+      // We expect to have at least 2 more parameters than the 'parallel'
+      // directive does - the lower and upper bounds of the previous schedule.
+      assert(CD->getNumParams() >= 4 &&
+             "Unexpected number of parameters in loop combined directive");
+
+      // Fetch the parameters of the captured region that carry the lower and
+      // upper bounds of the previous schedule.
+      auto *PrevLBDecl = CD->getParam(/*PrevLB=*/2);
+      auto *PrevUBDecl = CD->getParam(/*PrevUB=*/3);
+
+      // Previous lower and upper bounds are obtained from the region
+      // parameters.
+      PrevLB =
+          buildDeclRefExpr(SemaRef, PrevLBDecl, PrevLBDecl->getType(), InitLoc);
+      PrevUB =
+          buildDeclRefExpr(SemaRef, PrevUBDecl, PrevUBDecl->getType(), InitLoc);
+    }
   }
 
   // Build the iteration variable and its initialization before loop.
   ExprResult IV;
   ExprResult Init;
   {
-    VarDecl *IVDecl = buildVarDecl(SemaRef, InitLoc, VType, ".omp.iv");
-    IV = buildDeclRefExpr(SemaRef, IVDecl, VType, InitLoc);
-    Expr *RHS = (isOpenMPWorksharingDirective(DKind) ||
-                 isOpenMPTaskLoopDirective(DKind) ||
-                 isOpenMPDistributeDirective(DKind))
-                    ? LB.get()
-                    : SemaRef.ActOnIntegerConstant(SourceLocation(), 0).get();
+    VarDecl *IVDecl = buildVarDecl(SemaRef, InitLoc, RealVType, ".omp.iv");
+    IV = buildDeclRefExpr(SemaRef, IVDecl, RealVType, InitLoc);
+    Expr *RHS =
+        (isOpenMPWorksharingDirective(DKind) ||
+         isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
+            ? LB.get()
+            : SemaRef.ActOnIntegerConstant(SourceLocation(), 0).get();
     Init = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, IV.get(), RHS);
     Init = SemaRef.ActOnFinishFullExpr(Init.get());
   }
@@ -3997,6 +5427,7 @@
   Built.Inits.resize(NestedLoopCount);
   Built.Updates.resize(NestedLoopCount);
   Built.Finals.resize(NestedLoopCount);
+  SmallVector<Expr *, 4> LoopMultipliers;
   {
     ExprResult Div;
     // Go from inner nested loop to outer.
@@ -4024,19 +5455,19 @@
       }
 
       // Build update: IS.CounterVar(Private) = IS.Start + Iter * IS.Step
-      auto *CounterVar = buildDeclRefExpr(
-          SemaRef, cast<VarDecl>(cast<DeclRefExpr>(IS.CounterVar)->getDecl()),
-          IS.CounterVar->getType(), IS.CounterVar->getExprLoc(),
-          /*RefersToCapture=*/true);
+      auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IS.CounterVar)->getDecl());
+      auto *CounterVar = buildDeclRefExpr(SemaRef, VD, IS.CounterVar->getType(),
+                                          IS.CounterVar->getExprLoc(),
+                                          /*RefersToCapture=*/true);
       ExprResult Init = BuildCounterInit(SemaRef, CurScope, UpdLoc, CounterVar,
-                                         IS.CounterInit);
+                                         IS.CounterInit, Captures);
       if (!Init.isUsable()) {
         HasErrors = true;
         break;
       }
-      ExprResult Update =
-          BuildCounterUpdate(SemaRef, CurScope, UpdLoc, CounterVar,
-                             IS.CounterInit, Iter, IS.CounterStep, IS.Subtract);
+      ExprResult Update = BuildCounterUpdate(
+          SemaRef, CurScope, UpdLoc, CounterVar, IS.CounterInit, Iter,
+          IS.CounterStep, IS.Subtract, &Captures);
       if (!Update.isUsable()) {
         HasErrors = true;
         break;
@@ -4045,7 +5476,7 @@
       // Build final: IS.CounterVar = IS.Start + IS.NumIters * IS.Step
       ExprResult Final = BuildCounterUpdate(
           SemaRef, CurScope, UpdLoc, CounterVar, IS.CounterInit,
-          IS.NumIterations, IS.CounterStep, IS.Subtract);
+          IS.NumIterations, IS.CounterStep, IS.Subtract, &Captures);
       if (!Final.isUsable()) {
         HasErrors = true;
         break;
@@ -4061,11 +5492,12 @@
 
         // Add parentheses (for debugging purposes only).
         if (Div.isUsable())
-          Div = SemaRef.ActOnParenExpr(UpdLoc, UpdLoc, Div.get());
+          Div = tryBuildCapture(SemaRef, Div.get(), Captures);
         if (!Div.isUsable()) {
           HasErrors = true;
           break;
         }
+        LoopMultipliers.push_back(Div.get());
       }
       if (!Update.isUsable() || !Final.isUsable()) {
         HasErrors = true;
@@ -4090,6 +5522,7 @@
   Built.CalcLastIteration =
       SemaRef.ActOnFinishFullExpr(CalcLastIteration.get()).get();
   Built.PreCond = PreCond.get();
+  Built.PreInits = buildPreInits(C, Captures);
   Built.Cond = Cond.get();
   Built.Init = Init.get();
   Built.Inc = Inc.get();
@@ -4100,6 +5533,57 @@
   Built.EUB = EUB.get();
   Built.NLB = NextLB.get();
   Built.NUB = NextUB.get();
+  Built.PrevLB = PrevLB.get();
+  Built.PrevUB = PrevUB.get();
+
+  Expr *CounterVal = SemaRef.DefaultLvalueConversion(IV.get()).get();
+  // Fill data for doacross depend clauses.
+  for (auto Pair : DSA.getDoacrossDependClauses()) {
+    if (Pair.first->getDependencyKind() == OMPC_DEPEND_source)
+      Pair.first->setCounterValue(CounterVal);
+    else {
+      if (NestedLoopCount != Pair.second.size() ||
+          NestedLoopCount != LoopMultipliers.size() + 1) {
+        // Erroneous case - clause has some problems.
+        Pair.first->setCounterValue(CounterVal);
+        continue;
+      }
+      assert(Pair.first->getDependencyKind() == OMPC_DEPEND_sink);
+      auto I = Pair.second.rbegin();
+      auto IS = IterSpaces.rbegin();
+      auto ILM = LoopMultipliers.rbegin();
+      Expr *UpCounterVal = CounterVal;
+      Expr *Multiplier = nullptr;
+      for (int Cnt = NestedLoopCount - 1; Cnt >= 0; --Cnt) {
+        if (I->first) {
+          assert(IS->CounterStep);
+          Expr *NormalizedOffset =
+              SemaRef
+                  .BuildBinOp(CurScope, I->first->getExprLoc(), BO_Div,
+                              I->first, IS->CounterStep)
+                  .get();
+          if (Multiplier) {
+            NormalizedOffset =
+                SemaRef
+                    .BuildBinOp(CurScope, I->first->getExprLoc(), BO_Mul,
+                                NormalizedOffset, Multiplier)
+                    .get();
+          }
+          assert(I->second == OO_Plus || I->second == OO_Minus);
+          BinaryOperatorKind BOK = (I->second == OO_Plus) ? BO_Add : BO_Sub;
+          UpCounterVal = SemaRef
+                             .BuildBinOp(CurScope, I->first->getExprLoc(), BOK,
+                                         UpCounterVal, NormalizedOffset)
+                             .get();
+        }
+        Multiplier = *ILM;
+        ++I;
+        ++IS;
+        ++ILM;
+      }
+      Pair.first->setCounterValue(UpCounterVal);
+    }
+  }
 
   return NestedLoopCount;
 }
@@ -4120,26 +5604,44 @@
   return nullptr;
 }
 
-static bool checkSimdlenSafelenValues(Sema &S, const Expr *Simdlen,
-                                      const Expr *Safelen) {
-  llvm::APSInt SimdlenRes, SafelenRes;
-  if (Simdlen->isValueDependent() || Simdlen->isTypeDependent() ||
-      Simdlen->isInstantiationDependent() ||
-      Simdlen->containsUnexpandedParameterPack())
-    return false;
-  if (Safelen->isValueDependent() || Safelen->isTypeDependent() ||
-      Safelen->isInstantiationDependent() ||
-      Safelen->containsUnexpandedParameterPack())
-    return false;
-  Simdlen->EvaluateAsInt(SimdlenRes, S.Context);
-  Safelen->EvaluateAsInt(SafelenRes, S.Context);
-  // OpenMP 4.1 [2.8.1, simd Construct, Restrictions]
-  // If both simdlen and safelen clauses are specified, the value of the simdlen
-  // parameter must be less than or equal to the value of the safelen parameter.
-  if (SimdlenRes > SafelenRes) {
-    S.Diag(Simdlen->getExprLoc(), diag::err_omp_wrong_simdlen_safelen_values)
-        << Simdlen->getSourceRange() << Safelen->getSourceRange();
-    return true;
+static bool checkSimdlenSafelenSpecified(Sema &S,
+                                         const ArrayRef<OMPClause *> Clauses) {
+  OMPSafelenClause *Safelen = nullptr;
+  OMPSimdlenClause *Simdlen = nullptr;
+
+  for (auto *Clause : Clauses) {
+    if (Clause->getClauseKind() == OMPC_safelen)
+      Safelen = cast<OMPSafelenClause>(Clause);
+    else if (Clause->getClauseKind() == OMPC_simdlen)
+      Simdlen = cast<OMPSimdlenClause>(Clause);
+    if (Safelen && Simdlen)
+      break;
+  }
+
+  if (Simdlen && Safelen) {
+    llvm::APSInt SimdlenRes, SafelenRes;
+    auto SimdlenLength = Simdlen->getSimdlen();
+    auto SafelenLength = Safelen->getSafelen();
+    if (SimdlenLength->isValueDependent() || SimdlenLength->isTypeDependent() ||
+        SimdlenLength->isInstantiationDependent() ||
+        SimdlenLength->containsUnexpandedParameterPack())
+      return false;
+    if (SafelenLength->isValueDependent() || SafelenLength->isTypeDependent() ||
+        SafelenLength->isInstantiationDependent() ||
+        SafelenLength->containsUnexpandedParameterPack())
+      return false;
+    SimdlenLength->EvaluateAsInt(SimdlenRes, S.Context);
+    SafelenLength->EvaluateAsInt(SafelenRes, S.Context);
+    // OpenMP 4.5 [2.8.1, simd Construct, Restrictions]
+    // If both simdlen and safelen clauses are specified, the value of the
+    // simdlen parameter must be less than or equal to the value of the safelen
+    // parameter.
+    if (SimdlenRes > SafelenRes) {
+      S.Diag(SimdlenLength->getExprLoc(),
+             diag::err_omp_wrong_simdlen_safelen_values)
+          << SimdlenLength->getSourceRange() << SafelenLength->getSourceRange();
+      return true;
+    }
   }
   return false;
 }
@@ -4147,7 +5649,7 @@
 StmtResult Sema::ActOnOpenMPSimdDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -4167,29 +5669,15 @@
   if (!CurContext->isDependentContext()) {
     // Finalize the clauses that need pre-built expressions for CodeGen.
     for (auto C : Clauses) {
-      if (auto LC = dyn_cast<OMPLinearClause>(C))
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
         if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
-                                     B.NumIterations, *this, CurScope))
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
           return StmtError();
     }
   }
 
-  // OpenMP 4.1 [2.8.1, simd Construct, Restrictions]
-  // If both simdlen and safelen clauses are specified, the value of the simdlen
-  // parameter must be less than or equal to the value of the safelen parameter.
-  OMPSafelenClause *Safelen = nullptr;
-  OMPSimdlenClause *Simdlen = nullptr;
-  for (auto *Clause : Clauses) {
-    if (Clause->getClauseKind() == OMPC_safelen)
-      Safelen = cast<OMPSafelenClause>(Clause);
-    else if (Clause->getClauseKind() == OMPC_simdlen)
-      Simdlen = cast<OMPSimdlenClause>(Clause);
-    if (Safelen && Simdlen)
-      break;
-  }
-  if (Simdlen && Safelen &&
-      checkSimdlenSafelenValues(*this, Simdlen->getSimdlen(),
-                                Safelen->getSafelen()))
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
     return StmtError();
 
   getCurFunction()->setHasBranchProtectedScope();
@@ -4200,7 +5688,7 @@
 StmtResult Sema::ActOnOpenMPForDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -4220,9 +5708,10 @@
   if (!CurContext->isDependentContext()) {
     // Finalize the clauses that need pre-built expressions for CodeGen.
     for (auto C : Clauses) {
-      if (auto LC = dyn_cast<OMPLinearClause>(C))
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
         if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
-                                     B.NumIterations, *this, CurScope))
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
           return StmtError();
     }
   }
@@ -4235,7 +5724,7 @@
 StmtResult Sema::ActOnOpenMPForSimdDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -4256,29 +5745,15 @@
   if (!CurContext->isDependentContext()) {
     // Finalize the clauses that need pre-built expressions for CodeGen.
     for (auto C : Clauses) {
-      if (auto LC = dyn_cast<OMPLinearClause>(C))
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
         if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
-                                     B.NumIterations, *this, CurScope))
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
           return StmtError();
     }
   }
 
-  // OpenMP 4.1 [2.8.1, simd Construct, Restrictions]
-  // If both simdlen and safelen clauses are specified, the value of the simdlen
-  // parameter must be less than or equal to the value of the safelen parameter.
-  OMPSafelenClause *Safelen = nullptr;
-  OMPSimdlenClause *Simdlen = nullptr;
-  for (auto *Clause : Clauses) {
-    if (Clause->getClauseKind() == OMPC_safelen)
-      Safelen = cast<OMPSafelenClause>(Clause);
-    else if (Clause->getClauseKind() == OMPC_simdlen)
-      Simdlen = cast<OMPSimdlenClause>(Clause);
-    if (Safelen && Simdlen)
-      break;
-  }
-  if (Simdlen && Safelen &&
-      checkSimdlenSafelenValues(*this, Simdlen->getSimdlen(),
-                                Safelen->getSafelen()))
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
     return StmtError();
 
   getCurFunction()->setHasBranchProtectedScope();
@@ -4295,9 +5770,9 @@
 
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   auto BaseStmt = AStmt;
-  while (CapturedStmt *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
+  while (auto *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
     BaseStmt = CS->getCapturedStmt();
-  if (auto C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
+  if (auto *C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
     auto S = C->children();
     if (S.begin() == S.end())
       return StmtError();
@@ -4444,7 +5919,7 @@
 StmtResult Sema::ActOnOpenMPParallelForDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -4472,9 +5947,10 @@
   if (!CurContext->isDependentContext()) {
     // Finalize the clauses that need pre-built expressions for CodeGen.
     for (auto C : Clauses) {
-      if (auto LC = dyn_cast<OMPLinearClause>(C))
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
         if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
-                                     B.NumIterations, *this, CurScope))
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
           return StmtError();
     }
   }
@@ -4488,7 +5964,7 @@
 StmtResult Sema::ActOnOpenMPParallelForSimdDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -4513,29 +5989,15 @@
   if (!CurContext->isDependentContext()) {
     // Finalize the clauses that need pre-built expressions for CodeGen.
     for (auto C : Clauses) {
-      if (auto LC = dyn_cast<OMPLinearClause>(C))
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
         if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
-                                     B.NumIterations, *this, CurScope))
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
           return StmtError();
     }
   }
 
-  // OpenMP 4.1 [2.8.1, simd Construct, Restrictions]
-  // If both simdlen and safelen clauses are specified, the value of the simdlen
-  // parameter must be less than or equal to the value of the safelen parameter.
-  OMPSafelenClause *Safelen = nullptr;
-  OMPSimdlenClause *Simdlen = nullptr;
-  for (auto *Clause : Clauses) {
-    if (Clause->getClauseKind() == OMPC_safelen)
-      Safelen = cast<OMPSafelenClause>(Clause);
-    else if (Clause->getClauseKind() == OMPC_simdlen)
-      Simdlen = cast<OMPSimdlenClause>(Clause);
-    if (Safelen && Simdlen)
-      break;
-  }
-  if (Simdlen && Safelen &&
-      checkSimdlenSafelenValues(*this, Simdlen->getSimdlen(),
-                                Safelen->getSafelen()))
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
     return StmtError();
 
   getCurFunction()->setHasBranchProtectedScope();
@@ -4552,9 +6014,9 @@
 
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
   auto BaseStmt = AStmt;
-  while (CapturedStmt *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
+  while (auto *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
     BaseStmt = CS->getCapturedStmt();
-  if (auto C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
+  if (auto *C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
     auto S = C->children();
     if (S.begin() == S.end())
       return StmtError();
@@ -4588,7 +6050,7 @@
   if (!AStmt)
     return StmtError();
 
-  CapturedStmt *CS = cast<CapturedStmt>(AStmt);
+  auto *CS = cast<CapturedStmt>(AStmt);
   // 1.2.2 OpenMP Language Terminology
   // Structured block - An executable statement with a single entry at the
   // top and a single exit at the bottom.
@@ -4885,21 +6347,21 @@
             AtomicCompAssignOp->getOpcode());
         OpLoc = AtomicCompAssignOp->getOperatorLoc();
         E = AtomicCompAssignOp->getRHS();
-        X = AtomicCompAssignOp->getLHS();
+        X = AtomicCompAssignOp->getLHS()->IgnoreParens();
         IsXLHSInRHSPart = true;
       } else if (auto *AtomicBinOp = dyn_cast<BinaryOperator>(
                      AtomicBody->IgnoreParenImpCasts())) {
         // Check for Binary Operation
-        if(checkBinaryOperation(AtomicBinOp, DiagId, NoteId))
+        if (checkBinaryOperation(AtomicBinOp, DiagId, NoteId))
           return true;
-      } else if (auto *AtomicUnaryOp =
-                 dyn_cast<UnaryOperator>(AtomicBody->IgnoreParenImpCasts())) {
+      } else if (auto *AtomicUnaryOp = dyn_cast<UnaryOperator>(
+                     AtomicBody->IgnoreParenImpCasts())) {
         // Check for Unary Operation
         if (AtomicUnaryOp->isIncrementDecrementOp()) {
           IsPostfixUpdate = AtomicUnaryOp->isPostfix();
           Op = AtomicUnaryOp->isIncrementOp() ? BO_Add : BO_Sub;
           OpLoc = AtomicUnaryOp->getOperatorLoc();
-          X = AtomicUnaryOp->getSubExpr();
+          X = AtomicUnaryOp->getSubExpr()->IgnoreParens();
           E = SemaRef.ActOnIntegerConstant(OpLoc, /*uint64_t Val=*/1).get();
           IsXLHSInRHSPart = true;
         } else {
@@ -4959,7 +6421,7 @@
   if (!AStmt)
     return StmtError();
 
-  auto CS = cast<CapturedStmt>(AStmt);
+  auto *CS = cast<CapturedStmt>(AStmt);
   // 1.2.2 OpenMP Language Terminology
   // Structured block - An executable statement with a single entry at the
   // top and a single exit at the bottom.
@@ -5027,8 +6489,8 @@
     SourceRange ErrorRange, NoteRange;
     // If clause is read:
     //  v = x;
-    if (auto AtomicBody = dyn_cast<Expr>(Body)) {
-      auto AtomicBinOp =
+    if (auto *AtomicBody = dyn_cast<Expr>(Body)) {
+      auto *AtomicBinOp =
           dyn_cast<BinaryOperator>(AtomicBody->IgnoreParenImpCasts());
       if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
         X = AtomicBinOp->getRHS()->IgnoreParenImpCasts();
@@ -5089,8 +6551,8 @@
     SourceRange ErrorRange, NoteRange;
     // If clause is write:
     //  x = expr;
-    if (auto AtomicBody = dyn_cast<Expr>(Body)) {
-      auto AtomicBinOp =
+    if (auto *AtomicBody = dyn_cast<Expr>(Body)) {
+      auto *AtomicBinOp =
           dyn_cast<BinaryOperator>(AtomicBody->IgnoreParenImpCasts());
       if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
         X = AtomicBinOp->getLHS();
@@ -5408,7 +6870,7 @@
     if (auto *CS = dyn_cast<CompoundStmt>(S)) {
       auto I = CS->body_begin();
       while (I != CS->body_end()) {
-        auto OED = dyn_cast<OMPExecutableDirective>(*I);
+        auto *OED = dyn_cast<OMPExecutableDirective>(*I);
         if (!OED || !isOpenMPTeamsDirective(OED->getDirectiveKind())) {
           OMPTeamsFound = false;
           break;
@@ -5417,6 +6879,9 @@
       }
       assert(I != CS->body_end() && "Not found statement");
       S = *I;
+    } else {
+      auto *OED = dyn_cast<OMPExecutableDirective>(S);
+      OMPTeamsFound = OED && isOpenMPTeamsDirective(OED->getDirectiveKind());
     }
     if (!OMPTeamsFound) {
       Diag(StartLoc, diag::err_omp_target_contains_not_only_teams);
@@ -5433,6 +6898,84 @@
   return OMPTargetDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
 }
 
+StmtResult
+Sema::ActOnOpenMPTargetParallelDirective(ArrayRef<OMPClause *> Clauses,
+                                         Stmt *AStmt, SourceLocation StartLoc,
+                                         SourceLocation EndLoc) {
+  // A null associated statement yields an invalid result.
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology) defines a structured block as an
+  // executable statement with a single entry at the top and a single exit at
+  // the bottom. The point of exit cannot be a branch out of the structured
+  // block, and longjmp() and throw() must not violate the entry/exit
+  // criteria, so the captured declaration is marked nothrow.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPTargetParallelDirective::Create(Context, StartLoc, EndLoc, Clauses,
+                                            AStmt);
+}
+
+StmtResult Sema::ActOnOpenMPTargetParallelForDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // In the presence of a 'collapse' or 'ordered' clause with a number of
+  // loops, that number defines how many nested loops are associated.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses),
+                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+                      VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp target parallel for loop exprs were not built");
+
+  // Finalize the clauses that need pre-built expressions for CodeGen.
+  if (!CurContext->isDependentContext()) {
+    for (OMPClause *Clause : Clauses) {
+      auto *Linear = dyn_cast<OMPLinearClause>(Clause);
+      if (Linear &&
+          FinishOpenMPLinearClause(*Linear,
+                                   cast<DeclRefExpr>(B.IterationVarRef),
+                                   B.NumIterations, *this, CurScope, DSAStack))
+        return StmtError();
+    }
+  }
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPTargetParallelForDirective::Create(Context, StartLoc, EndLoc,
+                                               NestedLoopCount, Clauses, AStmt,
+                                               B, DSAStack->isCancelRegion());
+}
+
+/// \brief Report whether \p Clauses contains at least one 'map' clause.
+/// Null entries in the list are tolerated and skipped.
+static bool HasMapClause(ArrayRef<OMPClause *> Clauses) {
+  for (OMPClause *Clause : Clauses) {
+    if (!Clause)
+      continue;
+    if (Clause->getClauseKind() == OMPC_map)
+      return true;
+  }
+  return false;
+}
+
 StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
                                                 Stmt *AStmt,
                                                 SourceLocation StartLoc,
@@ -5442,12 +6985,66 @@
 
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
 
+  // OpenMP [2.10.1, Restrictions, p. 97]
+  // At least one map clause must appear on the directive.
+  if (!HasMapClause(Clauses)) {
+    Diag(StartLoc, diag::err_omp_no_map_for_directive)
+        << getOpenMPDirectiveName(OMPD_target_data);
+    return StmtError();
+  }
+
   getCurFunction()->setHasBranchProtectedScope();
 
   return OMPTargetDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
                                         AStmt);
 }
 
+StmtResult
+Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
+                                          SourceLocation StartLoc,
+                                          SourceLocation EndLoc) {
+  // Build the directive node when the clause list satisfies the restriction
+  // below; otherwise report the problem at the start of the directive.
+  // OpenMP [2.10.2, Restrictions, p. 99]
+  // At least one map clause must appear on the directive.
+  if (HasMapClause(Clauses))
+    return OMPTargetEnterDataDirective::Create(Context, StartLoc, EndLoc,
+                                               Clauses);
+  Diag(StartLoc, diag::err_omp_no_map_for_directive)
+      << getOpenMPDirectiveName(OMPD_target_enter_data);
+  return StmtError();
+}
+
+StmtResult
+Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
+                                         SourceLocation StartLoc,
+                                         SourceLocation EndLoc) {
+  // OpenMP [2.10.3, Restrictions, p. 102]
+  // At least one map clause must appear on the directive.
+  const bool ClauseListHasMap = HasMapClause(Clauses);
+  if (ClauseListHasMap)
+    return OMPTargetExitDataDirective::Create(Context, StartLoc, EndLoc,
+                                              Clauses);
+  Diag(StartLoc, diag::err_omp_no_map_for_directive)
+      << getOpenMPDirectiveName(OMPD_target_exit_data);
+  return StmtError();
+}
+
+StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation EndLoc) {
+  // Accept the directive only if a 'to' or 'from' clause is present.
+  for (OMPClause *Clause : Clauses) {
+    const OpenMPClauseKind Kind = Clause->getClauseKind();
+    if (Kind == OMPC_to || Kind == OMPC_from)
+      return OMPTargetUpdateDirective::Create(Context, StartLoc, EndLoc,
+                                              Clauses);
+  }
+  // No motion clause was found: diagnose at the directive start.
+  Diag(StartLoc, diag::err_omp_at_least_one_motion_clause_required);
+  return StmtError();
+}
+
 StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef<OMPClause *> Clauses,
                                            Stmt *AStmt, SourceLocation StartLoc,
                                            SourceLocation EndLoc) {
@@ -5539,7 +7136,7 @@
 StmtResult Sema::ActOnOpenMPTaskLoopDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -5571,7 +7168,7 @@
 StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -5589,6 +7186,17 @@
   assert((CurContext->isDependentContext() || B.builtAll()) &&
          "omp for loop exprs were not built");
 
+  if (!CurContext->isDependentContext()) {
+    // Finalize the clauses that need pre-built expressions for CodeGen.
+    for (auto C : Clauses) {
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
+        if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
+          return StmtError();
+    }
+  }
+
   // OpenMP, [2.9.2 taskloop Construct, Restrictions]
   // The grainsize clause and num_tasks clause are mutually exclusive and may
   // not appear on the same taskloop directive.
@@ -5603,7 +7211,7 @@
 StmtResult Sema::ActOnOpenMPDistributeDirective(
     ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
     SourceLocation EndLoc,
-    llvm::DenseMap<VarDecl *, Expr *> &VarsWithImplicitDSA) {
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
   if (!AStmt)
     return StmtError();
 
@@ -5626,6 +7234,237 @@
                                         NestedLoopCount, Clauses, AStmt, B);
 }
 
+StmtResult Sema::ActOnOpenMPDistributeParallelForDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  // A null associated statement yields an invalid result.
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // In the presence of a 'collapse' clause with a number of loops, that
+  // number defines how many nested loops are associated.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses),
+      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp for loop exprs were not built");
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPDistributeParallelForDirective::Create(
+      Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
+}
+
+StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  // A null associated statement yields an invalid result.
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // In the presence of a 'collapse' clause with a number of loops, that
+  // number defines how many nested loops are associated.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
+      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp for loop exprs were not built");
+
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
+    return StmtError();
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPDistributeParallelForSimdDirective::Create(
+      Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
+}
+
+StmtResult Sema::ActOnOpenMPDistributeSimdDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  // A null associated statement yields an invalid result.
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // In the presence of a 'collapse' clause with a number of loops, that
+  // number defines how many nested loops are associated.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp for loop exprs were not built");
+
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
+    return StmtError();
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPDistributeSimdDirective::Create(Context, StartLoc, EndLoc,
+                                            NestedLoopCount, Clauses, AStmt, B);
+}
+
+StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // In the presence of a 'collapse' or 'ordered' clause with a number of
+  // loops, that number defines how many nested loops are associated.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses),
+      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp target parallel for simd loop exprs were not built");
+
+  // Finalize the clauses that need pre-built expressions for CodeGen.
+  if (!CurContext->isDependentContext()) {
+    for (OMPClause *Clause : Clauses) {
+      auto *Linear = dyn_cast<OMPLinearClause>(Clause);
+      if (Linear &&
+          FinishOpenMPLinearClause(*Linear,
+                                   cast<DeclRefExpr>(B.IterationVarRef),
+                                   B.NumIterations, *this, CurScope, DSAStack))
+        return StmtError();
+    }
+  }
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
+    return StmtError();
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPTargetParallelForSimdDirective::Create(
+      Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
+}
+
+StmtResult Sema::ActOnOpenMPTargetSimdDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // The 'collapse' and 'ordered' number expressions taken from the clauses
+  // are both handed to the loop checker, which yields the nested loop count.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses),
+                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+                      VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp target simd loop exprs were not built");
+
+  // Finalize the clauses that need pre-built expressions for CodeGen.
+  if (!CurContext->isDependentContext()) {
+    for (OMPClause *Clause : Clauses) {
+      auto *Linear = dyn_cast<OMPLinearClause>(Clause);
+      if (Linear &&
+          FinishOpenMPLinearClause(*Linear,
+                                   cast<DeclRefExpr>(B.IterationVarRef),
+                                   B.NumIterations, *this, CurScope, DSAStack))
+        return StmtError();
+    }
+  }
+
+  if (checkSimdlenSafelenSpecified(*this, Clauses))
+    return StmtError();
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPTargetSimdDirective::Create(Context, StartLoc, EndLoc,
+                                        NestedLoopCount, Clauses, AStmt, B);
+}
+
+StmtResult Sema::ActOnOpenMPTeamsDistributeDirective(
+    ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc,
+    llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA) {
+  // A null associated statement yields an invalid result.
+  if (AStmt == nullptr)
+    return StmtError();
+
+  // OpenMP 1.2.2 (Language Terminology): a structured block is an executable
+  // statement with a single entry at the top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block, and
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  auto *Captured = cast<CapturedStmt>(AStmt);
+  Captured->getCapturedDecl()->setNothrow();
+
+  // In the presence of a 'collapse' clause with a number of loops, that
+  // number defines how many nested loops are associated.
+  OMPLoopDirective::HelperExprs B;
+  const unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
+  if (NestedLoopCount == 0)
+    return StmtError();
+
+  assert((CurContext->isDependentContext() || B.builtAll()) &&
+         "omp teams distribute loop exprs were not built");
+
+  getCurFunction()->setHasBranchProtectedScope();
+  return OMPTeamsDistributeDirective::Create(
+      Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
+}
+
 OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr,
                                              SourceLocation StartLoc,
                                              SourceLocation LParenLoc,
@@ -5700,7 +7539,13 @@
   case OMPC_map:
   case OMPC_nogroup:
   case OMPC_dist_schedule:
+  case OMPC_defaultmap:
   case OMPC_unknown:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     llvm_unreachable("Clause is not allowed.");
   }
   return Res;
@@ -5716,12 +7561,11 @@
   if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
       !Condition->isInstantiationDependent() &&
       !Condition->containsUnexpandedParameterPack()) {
-    ExprResult Val = ActOnBooleanCondition(DSAStack->getCurScope(),
-                                           Condition->getExprLoc(), Condition);
+    ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
     if (Val.isInvalid())
       return nullptr;
 
-    ValExpr = Val.get();
+    ValExpr = MakeFullExpr(Val.get()).get();
   }
 
   return new (Context) OMPIfClause(NameModifier, ValExpr, StartLoc, LParenLoc,
@@ -5736,12 +7580,11 @@
   if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
       !Condition->isInstantiationDependent() &&
       !Condition->containsUnexpandedParameterPack()) {
-    ExprResult Val = ActOnBooleanCondition(DSAStack->getCurScope(),
-                                           Condition->getExprLoc(), Condition);
+    ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
     if (Val.isInvalid())
       return nullptr;
 
-    ValExpr = Val.get();
+    ValExpr = MakeFullExpr(Val.get()).get();
   }
 
   return new (Context) OMPFinalClause(ValExpr, StartLoc, LParenLoc, EndLoc);
@@ -5984,7 +7827,13 @@
   case OMPC_num_tasks:
   case OMPC_hint:
   case OMPC_dist_schedule:
+  case OMPC_defaultmap:
   case OMPC_unknown:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     llvm_unreachable("Clause is not allowed.");
   }
   return Res;
@@ -6087,6 +7936,14 @@
         static_cast<OpenMPDistScheduleClauseKind>(Argument.back()), Expr,
         StartLoc, LParenLoc, ArgumentLoc.back(), DelimLoc, EndLoc);
     break;
+  case OMPC_defaultmap:
+    enum { Modifier, DefaultmapKind };
+    Res = ActOnOpenMPDefaultmapClause(
+        static_cast<OpenMPDefaultmapClauseModifier>(Argument[Modifier]),
+        static_cast<OpenMPDefaultmapClauseKind>(Argument[DefaultmapKind]),
+        StartLoc, LParenLoc, ArgumentLoc[Modifier], ArgumentLoc[DefaultmapKind],
+        EndLoc);
+    break;
   case OMPC_final:
   case OMPC_num_threads:
   case OMPC_safelen:
@@ -6127,6 +7984,11 @@
   case OMPC_num_tasks:
   case OMPC_hint:
   case OMPC_unknown:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     llvm_unreachable("Clause is not allowed.");
   }
   return Res;
@@ -6201,7 +8063,7 @@
     return nullptr;
   }
   Expr *ValExpr = ChunkSize;
-  Expr *HelperValExpr = nullptr;
+  Stmt *HelperValStmt = nullptr;
   if (ChunkSize) {
     if (!ChunkSize->isValueDependent() && !ChunkSize->isTypeDependent() &&
         !ChunkSize->isInstantiationDependent() &&
@@ -6224,20 +8086,18 @@
               << "schedule" << 1 << ChunkSize->getSourceRange();
           return nullptr;
         }
-      } else if (isParallelOrTaskRegion(DSAStack->getCurrentDirective())) {
-        auto *ImpVar = buildVarDecl(*this, ChunkSize->getExprLoc(),
-                                    ChunkSize->getType(), ".chunk.");
-        auto *ImpVarRef = buildDeclRefExpr(*this, ImpVar, ChunkSize->getType(),
-                                           ChunkSize->getExprLoc(),
-                                           /*RefersToCapture=*/true);
-        HelperValExpr = ImpVarRef;
+      } else if (isParallelOrTaskRegion(DSAStack->getCurrentDirective()) &&
+                 !CurContext->isDependentContext()) {
+        llvm::MapVector<Expr *, DeclRefExpr *> Captures;
+        ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
+        HelperValStmt = buildPreInits(Context, Captures);
       }
     }
   }
 
   return new (Context)
       OMPScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind,
-                        ValExpr, HelperValExpr, M1, M1Loc, M2, M2Loc);
+                        ValExpr, HelperValStmt, M1, M1Loc, M2, M2Loc);
 }
 
 OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
@@ -6311,7 +8171,13 @@
   case OMPC_num_tasks:
   case OMPC_hint:
   case OMPC_dist_schedule:
+  case OMPC_defaultmap:
   case OMPC_unknown:
+  case OMPC_uniform:
+  case OMPC_to:
+  case OMPC_from:
+  case OMPC_use_device_ptr:
+  case OMPC_is_device_ptr:
     llvm_unreachable("Clause is not allowed.");
   }
   return Res;
@@ -6378,8 +8244,9 @@
     SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc,
     SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec,
     const DeclarationNameInfo &ReductionId, OpenMPDependClauseKind DepKind,
-    OpenMPLinearClauseKind LinKind, OpenMPMapClauseKind MapTypeModifier, 
-    OpenMPMapClauseKind MapType, SourceLocation DepLinMapLoc) {
+    OpenMPLinearClauseKind LinKind, OpenMPMapClauseKind MapTypeModifier,
+    OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
+    SourceLocation DepLinMapLoc) {
   OMPClause *Res = nullptr;
   switch (Kind) {
   case OMPC_private:
@@ -6416,12 +8283,25 @@
     Res = ActOnOpenMPFlushClause(VarList, StartLoc, LParenLoc, EndLoc);
     break;
   case OMPC_depend:
-    Res = ActOnOpenMPDependClause(DepKind, DepLinMapLoc, ColonLoc, VarList, 
+    Res = ActOnOpenMPDependClause(DepKind, DepLinMapLoc, ColonLoc, VarList,
                                   StartLoc, LParenLoc, EndLoc);
     break;
   case OMPC_map:
-    Res = ActOnOpenMPMapClause(MapTypeModifier, MapType, DepLinMapLoc, ColonLoc,
-                               VarList, StartLoc, LParenLoc, EndLoc);
+    Res = ActOnOpenMPMapClause(MapTypeModifier, MapType, IsMapTypeImplicit,
+                               DepLinMapLoc, ColonLoc, VarList, StartLoc,
+                               LParenLoc, EndLoc);
+    break;
+  case OMPC_to:
+    Res = ActOnOpenMPToClause(VarList, StartLoc, LParenLoc, EndLoc);
+    break;
+  case OMPC_from:
+    Res = ActOnOpenMPFromClause(VarList, StartLoc, LParenLoc, EndLoc);
+    break;
+  case OMPC_use_device_ptr:
+    Res = ActOnOpenMPUseDevicePtrClause(VarList, StartLoc, LParenLoc, EndLoc);
+    break;
+  case OMPC_is_device_ptr:
+    Res = ActOnOpenMPIsDevicePtrClause(VarList, StartLoc, LParenLoc, EndLoc);
     break;
   case OMPC_if:
   case OMPC_final:
@@ -6453,12 +8333,92 @@
   case OMPC_num_tasks:
   case OMPC_hint:
   case OMPC_dist_schedule:
+  case OMPC_defaultmap:
   case OMPC_unknown:
+  case OMPC_uniform:
     llvm_unreachable("Clause is not allowed.");
   }
   return Res;
 }
 
+ExprResult Sema::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
+                                       ExprObjectKind OK, SourceLocation Loc) {
+  ExprResult Res = BuildDeclRefExpr(
+      Capture, Capture->getType().getNonReferenceType(), VK_LValue, Loc);
+  if (!Res.isUsable())
+    return ExprError(); // could not build the reference to Capture
+  if (OK == OK_Ordinary && !getLangOpts().CPlusPlus) { // non-C++ only
+    Res = CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get()); // apply unary '*'
+    if (!Res.isUsable())
+      return ExprError();
+  }
+  if (VK != VK_LValue && Res.get()->isGLValue()) { // non-lvalue requested
+    Res = DefaultLvalueConversion(Res.get());
+    if (!Res.isUsable())
+      return ExprError();
+  }
+  return Res; // usable expression referring to Capture at Loc
+}
+
+/// Resolve \p RefExpr to the variable or 'this'-member field it names.
+static std::pair<ValueDecl *, bool>
+getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
+               SourceRange &ERange, bool AllowArraySection = false) {
+  if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() ||
+      RefExpr->containsUnexpandedParameterPack())
+    return std::make_pair(nullptr, true); // dependent: no decl, flag set
+
+  // OpenMP [3.1, C/C++]: A list item is a variable name.
+  // OpenMP [2.9.3.3, Restrictions, p.1]: A variable that is part of another
+  // variable (as an array or structure element) cannot appear in a private
+  // clause.
+  RefExpr = RefExpr->IgnoreParens();
+  enum {
+    NoArrayExpr = -1,
+    ArraySubscript = 0,
+    OMPArraySection = 1
+  } IsArrayExpr = NoArrayExpr; // value is streamed into the diagnostic below
+  if (AllowArraySection) {
+    if (auto *ASE = dyn_cast_or_null<ArraySubscriptExpr>(RefExpr)) {
+      auto *Base = ASE->getBase()->IgnoreParenImpCasts();
+      while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+        Base = TempASE->getBase()->IgnoreParenImpCasts();
+      RefExpr = Base; // continue with the innermost subscript base
+      IsArrayExpr = ArraySubscript;
+    } else if (auto *OASE = dyn_cast_or_null<OMPArraySectionExpr>(RefExpr)) {
+      auto *Base = OASE->getBase()->IgnoreParenImpCasts();
+      while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+        Base = TempOASE->getBase()->IgnoreParenImpCasts();
+      while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+        Base = TempASE->getBase()->IgnoreParenImpCasts();
+      RefExpr = Base; // sections are peeled first, then subscripts
+      IsArrayExpr = OMPArraySection;
+    }
+  }
+  ELoc = RefExpr->getExprLoc(); // out-params describe the (base) expression
+  ERange = RefExpr->getSourceRange();
+  RefExpr = RefExpr->IgnoreParenImpCasts();
+  auto *DE = dyn_cast_or_null<DeclRefExpr>(RefExpr);
+  auto *ME = dyn_cast_or_null<MemberExpr>(RefExpr);
+  if ((!DE || !isa<VarDecl>(DE->getDecl())) &&
+      (S.getCurrentThisType().isNull() || !ME ||
+       !isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()) ||
+       !isa<FieldDecl>(ME->getMemberDecl()))) {
+    if (IsArrayExpr != NoArrayExpr)
+      S.Diag(ELoc, diag::err_omp_expected_base_var_name) << IsArrayExpr
+                                                         << ERange;
+    else {
+      S.Diag(ELoc,
+             AllowArraySection
+                 ? diag::err_omp_expected_var_name_member_expr_or_array_item
+                 : diag::err_omp_expected_var_name_member_expr)
+          << (S.getCurrentThisType().isNull() ? 0 : 1) << ERange;
+    }
+    return std::make_pair(nullptr, false); // diagnosed above
+  }
+  return std::make_pair(DE ? DE->getDecl() : ME->getMemberDecl(), false);
+}
+
 OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
                                           SourceLocation StartLoc,
                                           SourceLocation LParenLoc,
@@ -6467,42 +8427,27 @@
   SmallVector<Expr *, 8> PrivateCopies;
   for (auto &RefExpr : VarList) {
     assert(RefExpr && "NULL expr in OpenMP private clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
       PrivateCopies.push_back(nullptr);
-      continue;
     }
+    ValueDecl *D = Res.first;
+    if (!D)
+      continue;
 
-    SourceLocation ELoc = RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    // OpenMP  [2.9.3.3, Restrictions, p.1]
-    //  A variable that is part of another variable (as an array or
-    //  structure element) cannot appear in a private clause.
-    DeclRefExpr *DE = dyn_cast_or_null<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
-      continue;
-    }
-    Decl *D = DE->getDecl();
-    VarDecl *VD = cast<VarDecl>(D);
-
-    QualType Type = VD->getType();
-    if (Type->isDependentType() || Type->isInstantiationDependentType()) {
-      // It will be analyzed later.
-      Vars.push_back(DE);
-      PrivateCopies.push_back(nullptr);
-      continue;
-    }
+    QualType Type = D->getType();
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
     //  A variable that appears in a private clause must not have an incomplete
     //  type or a reference type.
-    if (RequireCompleteType(ELoc, Type,
-                            diag::err_omp_private_incomplete_type)) {
+    if (RequireCompleteType(ELoc, Type, diag::err_omp_private_incomplete_type))
       continue;
-    }
     Type = Type.getNonReferenceType();
 
     // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
@@ -6512,28 +8457,50 @@
     //  listed below. For these exceptions only, listing a predetermined
     //  variable in a data-sharing attribute clause is allowed and overrides
     //  the variable's predetermined data-sharing attributes.
-    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, false);
+    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, false);
     if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_private) {
       Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
                                           << getOpenMPClauseName(OMPC_private);
-      ReportOriginalDSA(*this, DSAStack, VD, DVar);
+      ReportOriginalDSA(*this, DSAStack, D, DVar);
       continue;
     }
 
     // Variably modified types are not supported for tasks.
     if (!Type->isAnyPointerType() && Type->isVariablyModifiedType() &&
-        DSAStack->getCurrentDirective() == OMPD_task) {
+        isOpenMPTaskingDirective(DSAStack->getCurrentDirective())) {
       Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
           << getOpenMPClauseName(OMPC_private) << Type
           << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
       bool IsDecl =
+          !VD ||
           VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
-      Diag(VD->getLocation(),
+      Diag(D->getLocation(),
            IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-          << VD;
+          << D;
       continue;
     }
 
+    // OpenMP 4.5 [2.15.5.1, Restrictions, p.3]
+    // A list item cannot appear in both a map clause and a data-sharing
+    // attribute clause on the same construct
+    if (DSAStack->getCurrentDirective() == OMPD_target) {
+      OpenMPClauseKind ConflictKind;
+      if (DSAStack->checkMappableExprComponentListsForDecl(
+              VD, /*CurrentRegionOnly=*/true,
+              [&](OMPClauseMappableExprCommon::MappableExprComponentListRef,
+                  OpenMPClauseKind WhereFoundClauseKind) -> bool {
+                ConflictKind = WhereFoundClauseKind;
+                return true;
+              })) {
+        Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
+            << getOpenMPClauseName(OMPC_private)
+            << getOpenMPClauseName(ConflictKind)
+            << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
+        continue;
+      }
+    }
+
     // OpenMP [2.9.3.3, Restrictions, C/C++, p.1]
     //  A variable of class type (or array thereof) that appears in a private
     //  clause requires an accessible, unambiguous default constructor for the
@@ -6544,16 +8511,21 @@
     // IdResolver, so the code in the OpenMP region uses original variable for
     // proper diagnostics.
     Type = Type.getUnqualifiedType();
-    auto VDPrivate = buildVarDecl(*this, DE->getExprLoc(), Type, VD->getName(),
-                                  VD->hasAttrs() ? &VD->getAttrs() : nullptr);
+    auto VDPrivate = buildVarDecl(*this, ELoc, Type, D->getName(),
+                                  D->hasAttrs() ? &D->getAttrs() : nullptr);
     ActOnUninitializedDecl(VDPrivate, /*TypeMayContainAuto=*/false);
     if (VDPrivate->isInvalidDecl())
       continue;
     auto VDPrivateRefExpr = buildDeclRefExpr(
-        *this, VDPrivate, DE->getType().getUnqualifiedType(), DE->getExprLoc());
+        *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
 
-    DSAStack->addDSA(VD, DE, OMPC_private);
-    Vars.push_back(DE);
+    DeclRefExpr *Ref = nullptr;
+    if (!VD && !CurContext->isDependentContext())
+      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_private, Ref);
+    Vars.push_back((VD || CurContext->isDependentContext())
+                       ? RefExpr->IgnoreParens()
+                       : Ref);
     PrivateCopies.push_back(VDPrivateRefExpr);
   }
 
@@ -6594,51 +8566,37 @@
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> PrivateCopies;
   SmallVector<Expr *, 8> Inits;
+  SmallVector<Decl *, 4> ExprCaptures;
   bool IsImplicitClause =
       StartLoc.isInvalid() && LParenLoc.isInvalid() && EndLoc.isInvalid();
   auto ImplicitClauseLoc = DSAStack->getConstructLoc();
 
   for (auto &RefExpr : VarList) {
     assert(RefExpr && "NULL expr in OpenMP firstprivate clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
       PrivateCopies.push_back(nullptr);
       Inits.push_back(nullptr);
-      continue;
     }
+    ValueDecl *D = Res.first;
+    if (!D)
+      continue;
 
-    SourceLocation ELoc =
-        IsImplicitClause ? ImplicitClauseLoc : RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    // OpenMP  [2.9.3.3, Restrictions, p.1]
-    //  A variable that is part of another variable (as an array or
-    //  structure element) cannot appear in a private clause.
-    DeclRefExpr *DE = dyn_cast_or_null<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
-      continue;
-    }
-    Decl *D = DE->getDecl();
-    VarDecl *VD = cast<VarDecl>(D);
-
-    QualType Type = VD->getType();
-    if (Type->isDependentType() || Type->isInstantiationDependentType()) {
-      // It will be analyzed later.
-      Vars.push_back(DE);
-      PrivateCopies.push_back(nullptr);
-      Inits.push_back(nullptr);
-      continue;
-    }
+    ELoc = IsImplicitClause ? ImplicitClauseLoc : ELoc;
+    QualType Type = D->getType();
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
     //  A variable that appears in a private clause must not have an incomplete
     //  type or a reference type.
     if (RequireCompleteType(ELoc, Type,
-                            diag::err_omp_firstprivate_incomplete_type)) {
+                            diag::err_omp_firstprivate_incomplete_type))
       continue;
-    }
     Type = Type.getNonReferenceType();
 
     // OpenMP [2.9.3.4, Restrictions, C/C++, p.1]
@@ -6648,8 +8606,10 @@
     auto ElemType = Context.getBaseElementType(Type).getNonReferenceType();
 
     // If an implicit firstprivate variable found it was checked already.
+    DSAStackTy::DSAVarData TopDVar;
     if (!IsImplicitClause) {
-      DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, false);
+      DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, false);
+      TopDVar = DVar;
       bool IsConstant = ElemType.isConstant(Context);
       // OpenMP [2.4.13, Data-sharing Attribute Clauses]
       //  A list item that specifies a given variable may not appear in more
@@ -6660,7 +8620,7 @@
         Diag(ELoc, diag::err_omp_wrong_dsa)
             << getOpenMPClauseName(DVar.CKind)
             << getOpenMPClauseName(OMPC_firstprivate);
-        ReportOriginalDSA(*this, DSAStack, VD, DVar);
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
         continue;
       }
 
@@ -6675,12 +8635,12 @@
       // in a Construct, C/C++, p.2]
       //  Variables with const-qualified type having no mutable member may be
       //  listed in a firstprivate clause, even if they are static data members.
-      if (!(IsConstant || VD->isStaticDataMember()) && !DVar.RefExpr &&
+      if (!(IsConstant || (VD && VD->isStaticDataMember())) && !DVar.RefExpr &&
           DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_shared) {
         Diag(ELoc, diag::err_omp_wrong_dsa)
             << getOpenMPClauseName(DVar.CKind)
             << getOpenMPClauseName(OMPC_firstprivate);
-        ReportOriginalDSA(*this, DSAStack, VD, DVar);
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
         continue;
       }
 
@@ -6692,14 +8652,14 @@
       //  to any of the parallel regions arising from the parallel construct.
       if (isOpenMPWorksharingDirective(CurrDir) &&
           !isOpenMPParallelDirective(CurrDir)) {
-        DVar = DSAStack->getImplicitDSA(VD, true);
+        DVar = DSAStack->getImplicitDSA(D, true);
         if (DVar.CKind != OMPC_shared &&
             (isOpenMPParallelDirective(DVar.DKind) ||
              DVar.DKind == OMPD_unknown)) {
           Diag(ELoc, diag::err_omp_required_access)
               << getOpenMPClauseName(OMPC_firstprivate)
               << getOpenMPClauseName(OMPC_shared);
-          ReportOriginalDSA(*this, DSAStack, VD, DVar);
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
           continue;
         }
       }
@@ -6714,20 +8674,20 @@
       //  construct must not appear in a firstprivate clause in a task construct
       //  encountered during execution of any of the worksharing regions arising
       //  from the worksharing construct.
-      if (CurrDir == OMPD_task) {
-        DVar =
-            DSAStack->hasInnermostDSA(VD, MatchesAnyClause(OMPC_reduction),
-                                      [](OpenMPDirectiveKind K) -> bool {
-                                        return isOpenMPParallelDirective(K) ||
-                                               isOpenMPWorksharingDirective(K);
-                                      },
-                                      false);
+      if (isOpenMPTaskingDirective(CurrDir)) {
+        DVar = DSAStack->hasInnermostDSA(
+            D, [](OpenMPClauseKind C) -> bool { return C == OMPC_reduction; },
+            [](OpenMPDirectiveKind K) -> bool {
+              return isOpenMPParallelDirective(K) ||
+                     isOpenMPWorksharingDirective(K);
+            },
+            false);
         if (DVar.CKind == OMPC_reduction &&
             (isOpenMPParallelDirective(DVar.DKind) ||
              isOpenMPWorksharingDirective(DVar.DKind))) {
           Diag(ELoc, diag::err_omp_parallel_reduction_in_task_firstprivate)
               << getOpenMPDirectiveName(DVar.DKind);
-          ReportOriginalDSA(*this, DSAStack, VD, DVar);
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
           continue;
         }
       }
@@ -6746,31 +8706,53 @@
       // A list item may appear in a firstprivate or lastprivate clause but not
       // both.
       if (CurrDir == OMPD_distribute) {
-        DVar = DSAStack->hasInnermostDSA(VD, MatchesAnyClause(OMPC_private),
-                                         [](OpenMPDirectiveKind K) -> bool {
-                                           return isOpenMPTeamsDirective(K);
-                                         },
-                                         false);
+        DVar = DSAStack->hasInnermostDSA(
+            D, [](OpenMPClauseKind C) -> bool { return C == OMPC_private; },
+            [](OpenMPDirectiveKind K) -> bool {
+              return isOpenMPTeamsDirective(K);
+            },
+            false);
         if (DVar.CKind == OMPC_private && isOpenMPTeamsDirective(DVar.DKind)) {
           Diag(ELoc, diag::err_omp_firstprivate_distribute_private_teams);
-          ReportOriginalDSA(*this, DSAStack, VD, DVar);
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
           continue;
         }
-        DVar = DSAStack->hasInnermostDSA(VD, MatchesAnyClause(OMPC_reduction),
-                                         [](OpenMPDirectiveKind K) -> bool {
-                                           return isOpenMPTeamsDirective(K);
-                                         },
-                                         false);
+        DVar = DSAStack->hasInnermostDSA(
+            D, [](OpenMPClauseKind C) -> bool { return C == OMPC_reduction; },
+            [](OpenMPDirectiveKind K) -> bool {
+              return isOpenMPTeamsDirective(K);
+            },
+            false);
         if (DVar.CKind == OMPC_reduction &&
             isOpenMPTeamsDirective(DVar.DKind)) {
           Diag(ELoc, diag::err_omp_firstprivate_distribute_in_teams_reduction);
-          ReportOriginalDSA(*this, DSAStack, VD, DVar);
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
           continue;
         }
-        DVar = DSAStack->getTopDSA(VD, false);
+        DVar = DSAStack->getTopDSA(D, false);
         if (DVar.CKind == OMPC_lastprivate) {
           Diag(ELoc, diag::err_omp_firstprivate_and_lastprivate_in_distribute);
-          ReportOriginalDSA(*this, DSAStack, VD, DVar);
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
+          continue;
+        }
+      }
+      // OpenMP 4.5 [2.15.5.1, Restrictions, p.3]
+      // A list item cannot appear in both a map clause and a data-sharing
+      // attribute clause on the same construct
+      if (CurrDir == OMPD_target) {
+        OpenMPClauseKind ConflictKind;
+        if (DSAStack->checkMappableExprComponentListsForDecl(
+                VD, /*CurrentRegionOnly=*/true,
+                [&](OMPClauseMappableExprCommon::MappableExprComponentListRef,
+                    OpenMPClauseKind WhereFoundClauseKind) -> bool {
+                  ConflictKind = WhereFoundClauseKind;
+                  return true;
+                })) {
+          Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
+              << getOpenMPClauseName(OMPC_firstprivate)
+              << getOpenMPClauseName(ConflictKind)
+              << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
           continue;
         }
       }
@@ -6778,21 +8760,22 @@
 
     // Variably modified types are not supported for tasks.
     if (!Type->isAnyPointerType() && Type->isVariablyModifiedType() &&
-        DSAStack->getCurrentDirective() == OMPD_task) {
+        isOpenMPTaskingDirective(DSAStack->getCurrentDirective())) {
       Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
           << getOpenMPClauseName(OMPC_firstprivate) << Type
           << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
       bool IsDecl =
+          !VD ||
           VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
-      Diag(VD->getLocation(),
+      Diag(D->getLocation(),
            IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-          << VD;
+          << D;
       continue;
     }
 
     Type = Type.getUnqualifiedType();
-    auto VDPrivate = buildVarDecl(*this, ELoc, Type, VD->getName(),
-                                  VD->hasAttrs() ? &VD->getAttrs() : nullptr);
+    auto VDPrivate = buildVarDecl(*this, ELoc, Type, D->getName(),
+                                  D->hasAttrs() ? &D->getAttrs() : nullptr);
     // Generate helper private variable and initialize it with the value of the
     // original variable. The address of the original variable is replaced by
     // the address of the new private variable in the CodeGen. This new variable
@@ -6803,11 +8786,11 @@
     // original array element in CodeGen.
     if (Type->isArrayType()) {
       auto VDInit =
-          buildVarDecl(*this, DE->getExprLoc(), ElemType, VD->getName());
+          buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, D->getName());
       VDInitRefExpr = buildDeclRefExpr(*this, VDInit, ElemType, ELoc);
       auto Init = DefaultLvalueConversion(VDInitRefExpr).get();
       ElemType = ElemType.getUnqualifiedType();
-      auto *VDInitTemp = buildVarDecl(*this, DE->getLocStart(), ElemType,
+      auto *VDInitTemp = buildVarDecl(*this, RefExpr->getExprLoc(), ElemType,
                                       ".firstprivate.temp");
       InitializedEntity Entity =
           InitializedEntity::InitializeVariable(VDInitTemp);
@@ -6822,26 +8805,39 @@
       // Remove temp variable declaration.
       Context.Deallocate(VDInitTemp);
     } else {
-      auto *VDInit =
-          buildVarDecl(*this, DE->getLocStart(), Type, ".firstprivate.temp");
-      VDInitRefExpr =
-          buildDeclRefExpr(*this, VDInit, DE->getType(), DE->getExprLoc());
+      auto *VDInit = buildVarDecl(*this, RefExpr->getExprLoc(), Type,
+                                  ".firstprivate.temp");
+      VDInitRefExpr = buildDeclRefExpr(*this, VDInit, RefExpr->getType(),
+                                       RefExpr->getExprLoc());
       AddInitializerToDecl(VDPrivate,
                            DefaultLvalueConversion(VDInitRefExpr).get(),
                            /*DirectInit=*/false, /*TypeMayContainAuto=*/false);
     }
     if (VDPrivate->isInvalidDecl()) {
       if (IsImplicitClause) {
-        Diag(DE->getExprLoc(),
+        Diag(RefExpr->getExprLoc(),
              diag::note_omp_task_predetermined_firstprivate_here);
       }
       continue;
     }
     CurContext->addDecl(VDPrivate);
     auto VDPrivateRefExpr = buildDeclRefExpr(
-        *this, VDPrivate, DE->getType().getUnqualifiedType(), DE->getExprLoc());
-    DSAStack->addDSA(VD, DE, OMPC_firstprivate);
-    Vars.push_back(DE);
+        *this, VDPrivate, RefExpr->getType().getUnqualifiedType(),
+        RefExpr->getExprLoc());
+    DeclRefExpr *Ref = nullptr;
+    if (!VD && !CurContext->isDependentContext()) {
+      if (TopDVar.CKind == OMPC_lastprivate)
+        Ref = TopDVar.PrivateCopy;
+      else {
+        Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
+        if (!IsOpenMPCapturedDecl(D))
+          ExprCaptures.push_back(Ref->getDecl());
+      }
+    }
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
+    Vars.push_back((VD || CurContext->isDependentContext())
+                       ? RefExpr->IgnoreParens()
+                       : Ref);
     PrivateCopies.push_back(VDPrivateRefExpr);
     Inits.push_back(VDInitRefExpr);
   }
@@ -6850,7 +8846,8 @@
     return nullptr;
 
   return OMPFirstprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
-                                       Vars, PrivateCopies, Inits);
+                                       Vars, PrivateCopies, Inits,
+                                       buildPreInits(Context, ExprCaptures));
 }
 
 OMPClause *Sema::ActOnOpenMPLastprivateClause(ArrayRef<Expr *> VarList,
@@ -6861,48 +8858,34 @@
   SmallVector<Expr *, 8> SrcExprs;
   SmallVector<Expr *, 8> DstExprs;
   SmallVector<Expr *, 8> AssignmentOps;
+  SmallVector<Decl *, 4> ExprCaptures;
+  SmallVector<Expr *, 4> ExprPostUpdates;
   for (auto &RefExpr : VarList) {
     assert(RefExpr && "NULL expr in OpenMP lastprivate clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
       SrcExprs.push_back(nullptr);
       DstExprs.push_back(nullptr);
       AssignmentOps.push_back(nullptr);
-      continue;
     }
+    ValueDecl *D = Res.first;
+    if (!D)
+      continue;
 
-    SourceLocation ELoc = RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    // OpenMP  [2.14.3.5, Restrictions, p.1]
-    //  A variable that is part of another variable (as an array or structure
-    //  element) cannot appear in a lastprivate clause.
-    DeclRefExpr *DE = dyn_cast_or_null<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
-      continue;
-    }
-    Decl *D = DE->getDecl();
-    VarDecl *VD = cast<VarDecl>(D);
-
-    QualType Type = VD->getType();
-    if (Type->isDependentType() || Type->isInstantiationDependentType()) {
-      // It will be analyzed later.
-      Vars.push_back(DE);
-      SrcExprs.push_back(nullptr);
-      DstExprs.push_back(nullptr);
-      AssignmentOps.push_back(nullptr);
-      continue;
-    }
+    QualType Type = D->getType();
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP [2.14.3.5, Restrictions, C/C++, p.2]
     //  A variable that appears in a lastprivate clause must not have an
     //  incomplete type or a reference type.
     if (RequireCompleteType(ELoc, Type,
-                            diag::err_omp_lastprivate_incomplete_type)) {
+                            diag::err_omp_lastprivate_incomplete_type))
       continue;
-    }
     Type = Type.getNonReferenceType();
 
     // OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables Referenced
@@ -6910,14 +8893,14 @@
     //  Variables with the predetermined data-sharing attributes may not be
     //  listed in data-sharing attributes clauses, except for the cases
     //  listed below.
-    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, false);
+    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, false);
     if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_lastprivate &&
         DVar.CKind != OMPC_firstprivate &&
         (DVar.CKind != OMPC_private || DVar.RefExpr != nullptr)) {
       Diag(ELoc, diag::err_omp_wrong_dsa)
           << getOpenMPClauseName(DVar.CKind)
           << getOpenMPClauseName(OMPC_lastprivate);
-      ReportOriginalDSA(*this, DSAStack, VD, DVar);
+      ReportOriginalDSA(*this, DSAStack, D, DVar);
       continue;
     }
 
@@ -6931,15 +8914,28 @@
     DSAStackTy::DSAVarData TopDVar = DVar;
     if (isOpenMPWorksharingDirective(CurrDir) &&
         !isOpenMPParallelDirective(CurrDir)) {
-      DVar = DSAStack->getImplicitDSA(VD, true);
+      DVar = DSAStack->getImplicitDSA(D, true);
       if (DVar.CKind != OMPC_shared) {
         Diag(ELoc, diag::err_omp_required_access)
             << getOpenMPClauseName(OMPC_lastprivate)
             << getOpenMPClauseName(OMPC_shared);
-        ReportOriginalDSA(*this, DSAStack, VD, DVar);
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
         continue;
       }
     }
+
+    // OpenMP 4.5 [2.10.8, Distribute Construct, p.3]
+    // A list item may appear in a firstprivate or lastprivate clause but not
+    // both.
+    if (CurrDir == OMPD_distribute) {
+      DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, false);
+      if (DVar.CKind == OMPC_firstprivate) {
+        Diag(ELoc, diag::err_omp_firstprivate_and_lastprivate_in_distribute);
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
+        continue;
+      }
+    }
+
     // OpenMP [2.14.3.5, Restrictions, C++, p.1,2]
     //  A variable of class type (or array thereof) that appears in a
     //  lastprivate clause requires an accessible, unambiguous default
@@ -6949,42 +8945,54 @@
     //  lastprivate clause requires an accessible, unambiguous copy assignment
     //  operator for the class type.
     Type = Context.getBaseElementType(Type).getNonReferenceType();
-    auto *SrcVD = buildVarDecl(*this, DE->getLocStart(),
+    auto *SrcVD = buildVarDecl(*this, ERange.getBegin(),
                                Type.getUnqualifiedType(), ".lastprivate.src",
-                               VD->hasAttrs() ? &VD->getAttrs() : nullptr);
-    auto *PseudoSrcExpr = buildDeclRefExpr(
-        *this, SrcVD, Type.getUnqualifiedType(), DE->getExprLoc());
+                               D->hasAttrs() ? &D->getAttrs() : nullptr);
+    auto *PseudoSrcExpr =
+        buildDeclRefExpr(*this, SrcVD, Type.getUnqualifiedType(), ELoc);
     auto *DstVD =
-        buildVarDecl(*this, DE->getLocStart(), Type, ".lastprivate.dst",
-                     VD->hasAttrs() ? &VD->getAttrs() : nullptr);
-    auto *PseudoDstExpr =
-        buildDeclRefExpr(*this, DstVD, Type, DE->getExprLoc());
+        buildVarDecl(*this, ERange.getBegin(), Type, ".lastprivate.dst",
+                     D->hasAttrs() ? &D->getAttrs() : nullptr);
+    auto *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc);
     // For arrays generate assignment operation for single element and replace
     // it by the original array element in CodeGen.
-    auto AssignmentOp = BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign,
+    auto AssignmentOp = BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign,
                                    PseudoDstExpr, PseudoSrcExpr);
     if (AssignmentOp.isInvalid())
       continue;
-    AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(),
+    AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), ELoc,
                                        /*DiscardedValue=*/true);
     if (AssignmentOp.isInvalid())
       continue;
 
-    // OpenMP 4.5 [2.10.8, Distribute Construct, p.3]
-    // A list item may appear in a firstprivate or lastprivate clause but not
-    // both.
-    if (CurrDir == OMPD_distribute) {
-      DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, false);
-      if (DVar.CKind == OMPC_firstprivate) {
-        Diag(ELoc, diag::err_omp_firstprivate_and_lastprivate_in_distribute);
-        ReportOriginalDSA(*this, DSAStack, VD, DVar);
-        continue;
+    DeclRefExpr *Ref = nullptr;
+    if (!VD && !CurContext->isDependentContext()) {
+      if (TopDVar.CKind == OMPC_firstprivate)
+        Ref = TopDVar.PrivateCopy;
+      else {
+        Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
+        if (!IsOpenMPCapturedDecl(D))
+          ExprCaptures.push_back(Ref->getDecl());
+      }
+      if (TopDVar.CKind == OMPC_firstprivate ||
+          (!IsOpenMPCapturedDecl(D) &&
+           Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>())) {
+        ExprResult RefRes = DefaultLvalueConversion(Ref);
+        if (!RefRes.isUsable())
+          continue;
+        ExprResult PostUpdateRes =
+            BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr,
+                       RefRes.get());
+        if (!PostUpdateRes.isUsable())
+          continue;
+        ExprPostUpdates.push_back(
+            IgnoredValueConversions(PostUpdateRes.get()).get());
       }
     }
-
-    if (TopDVar.CKind != OMPC_firstprivate)
-      DSAStack->addDSA(VD, DE, OMPC_lastprivate);
-    Vars.push_back(DE);
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_lastprivate, Ref);
+    Vars.push_back((VD || CurContext->isDependentContext())
+                       ? RefExpr->IgnoreParens()
+                       : Ref);
     SrcExprs.push_back(PseudoSrcExpr);
     DstExprs.push_back(PseudoDstExpr);
     AssignmentOps.push_back(AssignmentOp.get());
@@ -6994,7 +9002,9 @@
     return nullptr;
 
   return OMPLastprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
-                                      Vars, SrcExprs, DstExprs, AssignmentOps);
+                                      Vars, SrcExprs, DstExprs, AssignmentOps,
+                                      buildPreInits(Context, ExprCaptures),
+                                      buildPostUpdate(*this, ExprPostUpdates));
 }
 
 OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList,
@@ -7003,35 +9013,20 @@
                                          SourceLocation EndLoc) {
   SmallVector<Expr *, 8> Vars;
   for (auto &RefExpr : VarList) {
-    assert(RefExpr && "NULL expr in OpenMP shared clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    assert(RefExpr && "NULL expr in OpenMP shared clause.");
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
-      continue;
     }
-
-    SourceLocation ELoc = RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    // OpenMP  [2.14.3.2, Restrictions, p.1]
-    //  A variable that is part of another variable (as an array or structure
-    //  element) cannot appear in a shared unless it is a static data member
-    //  of a C++ class.
-    DeclRefExpr *DE = dyn_cast<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
+    ValueDecl *D = Res.first;
+    if (!D)
       continue;
-    }
-    Decl *D = DE->getDecl();
-    VarDecl *VD = cast<VarDecl>(D);
 
-    QualType Type = VD->getType();
-    if (Type->isDependentType() || Type->isInstantiationDependentType()) {
-      // It will be analyzed later.
-      Vars.push_back(DE);
-      continue;
-    }
-
+    auto *VD = dyn_cast<VarDecl>(D);
     // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
     // in a Construct]
     //  Variables with the predetermined data-sharing attributes may not be
@@ -7039,17 +9034,22 @@
     //  listed below. For these exceptions only, listing a predetermined
     //  variable in a data-sharing attribute clause is allowed and overrides
     //  the variable's predetermined data-sharing attributes.
-    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, false);
+    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, false);
     if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_shared &&
         DVar.RefExpr) {
       Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
                                           << getOpenMPClauseName(OMPC_shared);
-      ReportOriginalDSA(*this, DSAStack, VD, DVar);
+      ReportOriginalDSA(*this, DSAStack, D, DVar);
       continue;
     }
 
-    DSAStack->addDSA(VD, DE, OMPC_shared);
-    Vars.push_back(DE);
+    DeclRefExpr *Ref = nullptr;
+    if (!VD && IsOpenMPCapturedDecl(D) && !CurContext->isDependentContext())
+      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_shared, Ref);
+    Vars.push_back((VD || !Ref || CurContext->isDependentContext())
+                       ? RefExpr->IgnoreParens()
+                       : Ref);
   }
 
   if (Vars.empty())
@@ -7070,8 +9070,9 @@
         return false;
       if (DVar.CKind != OMPC_unknown)
         return true;
-      DSAStackTy::DSAVarData DVarPrivate =
-          Stack->hasDSA(VD, isOpenMPPrivate, MatchesAlways(), false);
+      DSAStackTy::DSAVarData DVarPrivate = Stack->hasDSA(
+          VD, isOpenMPPrivate, [](OpenMPDirectiveKind) -> bool { return true; },
+          false);
       if (DVarPrivate.CKind != OMPC_unknown)
         return true;
       return false;
@@ -7089,16 +9090,137 @@
 };
 } // namespace
 
+namespace {
+// Transform MemberExpression for specified FieldDecl of current class to
+// DeclRefExpr to specified OMPCapturedExprDecl.
+class TransformExprToCaptures : public TreeTransform<TransformExprToCaptures> {
+  typedef TreeTransform<TransformExprToCaptures> BaseTransform;
+  ValueDecl *Field;
+  DeclRefExpr *CapturedExpr;
+
+public:
+  TransformExprToCaptures(Sema &SemaRef, ValueDecl *FieldDecl)
+      : BaseTransform(SemaRef), Field(FieldDecl), CapturedExpr(nullptr) {}
+
+  ExprResult TransformMemberExpr(MemberExpr *E) {
+    if (isa<CXXThisExpr>(E->getBase()->IgnoreParenImpCasts()) &&
+        E->getMemberDecl() == Field) {
+      CapturedExpr = buildCapture(SemaRef, Field, E, /*WithInit=*/false);
+      return CapturedExpr;
+    }
+    return BaseTransform::TransformMemberExpr(E);
+  }
+  DeclRefExpr *getCapturedExpr() { return CapturedExpr; }
+};
+} // namespace
+
+template <typename T>
+static T filterLookupForUDR(SmallVectorImpl<UnresolvedSet<8>> &Lookups,
+                            const llvm::function_ref<T(ValueDecl *)> &Gen) {
+  for (auto &Set : Lookups) {
+    for (auto *D : Set) {
+      if (auto Res = Gen(cast<ValueDecl>(D)))
+        return Res;
+    }
+  }
+  return T();
+}
+
+static ExprResult
+buildDeclareReductionRef(Sema &SemaRef, SourceLocation Loc, SourceRange Range,
+                         Scope *S, CXXScopeSpec &ReductionIdScopeSpec,
+                         const DeclarationNameInfo &ReductionId, QualType Ty,
+                         CXXCastPath &BasePath, Expr *UnresolvedReduction) {
+  if (ReductionIdScopeSpec.isInvalid())
+    return ExprError();
+  SmallVector<UnresolvedSet<8>, 4> Lookups;
+  if (S) {
+    LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
+    Lookup.suppressDiagnostics();
+    while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec)) {
+      auto *D = Lookup.getRepresentativeDecl();
+      do {
+        S = S->getParent();
+      } while (S && !S->isDeclScope(D));
+      if (S)
+        S = S->getParent();
+      Lookups.push_back(UnresolvedSet<8>());
+      Lookups.back().append(Lookup.begin(), Lookup.end());
+      Lookup.clear();
+    }
+  } else if (auto *ULE =
+                 cast_or_null<UnresolvedLookupExpr>(UnresolvedReduction)) {
+    Lookups.push_back(UnresolvedSet<8>());
+    Decl *PrevD = nullptr;
+    for (auto *D : ULE->decls()) {
+      if (D == PrevD)
+        Lookups.push_back(UnresolvedSet<8>());
+      else if (auto *DRD = cast<OMPDeclareReductionDecl>(D))
+        Lookups.back().addDecl(DRD);
+      PrevD = D;
+    }
+  }
+  if (Ty->isDependentType() || Ty->isInstantiationDependentType() ||
+      Ty->containsUnexpandedParameterPack() ||
+      filterLookupForUDR<bool>(Lookups, [](ValueDecl *D) -> bool {
+        return !D->isInvalidDecl() &&
+               (D->getType()->isDependentType() ||
+                D->getType()->isInstantiationDependentType() ||
+                D->getType()->containsUnexpandedParameterPack());
+      })) {
+    UnresolvedSet<8> ResSet;
+    for (auto &Set : Lookups) {
+      ResSet.append(Set.begin(), Set.end());
+      // The last item marks the end of all declarations at the specified scope.
+      ResSet.addDecl(Set[Set.size() - 1]);
+    }
+    return UnresolvedLookupExpr::Create(
+        SemaRef.Context, /*NamingClass=*/nullptr,
+        ReductionIdScopeSpec.getWithLocInContext(SemaRef.Context), ReductionId,
+        /*ADL=*/true, /*Overloaded=*/true, ResSet.begin(), ResSet.end());
+  }
+  if (auto *VD = filterLookupForUDR<ValueDecl *>(
+          Lookups, [&SemaRef, Ty](ValueDecl *D) -> ValueDecl * {
+            if (!D->isInvalidDecl() &&
+                SemaRef.Context.hasSameType(D->getType(), Ty))
+              return D;
+            return nullptr;
+          }))
+    return SemaRef.BuildDeclRefExpr(VD, Ty, VK_LValue, Loc);
+  if (auto *VD = filterLookupForUDR<ValueDecl *>(
+          Lookups, [&SemaRef, Ty, Loc](ValueDecl *D) -> ValueDecl * {
+            if (!D->isInvalidDecl() &&
+                SemaRef.IsDerivedFrom(Loc, Ty, D->getType()) &&
+                !Ty.isMoreQualifiedThan(D->getType()))
+              return D;
+            return nullptr;
+          })) {
+    CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
+                       /*DetectVirtual=*/false);
+    if (SemaRef.IsDerivedFrom(Loc, Ty, VD->getType(), Paths)) {
+      if (!Paths.isAmbiguous(SemaRef.Context.getCanonicalType(
+              VD->getType().getUnqualifiedType()))) {
+        if (SemaRef.CheckBaseClassAccess(Loc, VD->getType(), Ty, Paths.front(),
+                                         /*DiagID=*/0) !=
+            Sema::AR_inaccessible) {
+          SemaRef.BuildBasePathArray(Paths, BasePath);
+          return SemaRef.BuildDeclRefExpr(VD, Ty, VK_LValue, Loc);
+        }
+      }
+    }
+  }
+  if (ReductionIdScopeSpec.isSet()) {
+    SemaRef.Diag(Loc, diag::err_omp_not_resolved_reduction_identifier) << Range;
+    return ExprError();
+  }
+  return ExprEmpty();
+}
+
 OMPClause *Sema::ActOnOpenMPReductionClause(
     ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
     SourceLocation ColonLoc, SourceLocation EndLoc,
-    CXXScopeSpec &ReductionIdScopeSpec,
-    const DeclarationNameInfo &ReductionId) {
-  // TODO: Allow scope specification search when 'declare reduction' is
-  // supported.
-  assert(ReductionIdScopeSpec.isEmpty() &&
-         "No support for scoped reduction identifiers yet.");
-
+    CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
+    ArrayRef<Expr *> UnresolvedReductions) {
   auto DN = ReductionId.getName();
   auto OOK = DN.getCXXOverloadedOperator();
   BinaryOperatorKind BOK = BO_Comma;
@@ -7182,48 +9304,21 @@
     break;
   }
   SourceRange ReductionIdRange;
-  if (ReductionIdScopeSpec.isValid()) {
+  if (ReductionIdScopeSpec.isValid())
     ReductionIdRange.setBegin(ReductionIdScopeSpec.getBeginLoc());
-  }
   ReductionIdRange.setEnd(ReductionId.getEndLoc());
-  if (BOK == BO_Comma) {
-    // Not allowed reduction identifier is found.
-    Diag(ReductionId.getLocStart(), diag::err_omp_unknown_reduction_identifier)
-        << ReductionIdRange;
-    return nullptr;
-  }
 
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> Privates;
   SmallVector<Expr *, 8> LHSs;
   SmallVector<Expr *, 8> RHSs;
   SmallVector<Expr *, 8> ReductionOps;
+  SmallVector<Decl *, 4> ExprCaptures;
+  SmallVector<Expr *, 4> ExprPostUpdates;
+  auto IR = UnresolvedReductions.begin(), ER = UnresolvedReductions.end();
+  bool FirstIter = true;
   for (auto RefExpr : VarList) {
     assert(RefExpr && "nullptr expr in OpenMP reduction clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
-      // It will be analyzed later.
-      Vars.push_back(RefExpr);
-      Privates.push_back(nullptr);
-      LHSs.push_back(nullptr);
-      RHSs.push_back(nullptr);
-      ReductionOps.push_back(nullptr);
-      continue;
-    }
-
-    if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() ||
-        RefExpr->isInstantiationDependent() ||
-        RefExpr->containsUnexpandedParameterPack()) {
-      // It will be analyzed later.
-      Vars.push_back(RefExpr);
-      Privates.push_back(nullptr);
-      LHSs.push_back(nullptr);
-      RHSs.push_back(nullptr);
-      ReductionOps.push_back(nullptr);
-      continue;
-    }
-
-    auto ELoc = RefExpr->getExprLoc();
-    auto ERange = RefExpr->getSourceRange();
     // OpenMP [2.1, C/C++]
     //  A list item is a variable or array section, subject to the restrictions
     //  specified in Section 2.4 on page 42 and in each of the sections
@@ -7231,52 +9326,53 @@
     // OpenMP  [2.14.3.3, Restrictions, p.1]
     //  A variable that is part of another variable (as an array or
     //  structure element) cannot appear in a private clause.
-    auto *DE = dyn_cast<DeclRefExpr>(RefExpr);
-    auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr);
-    auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr);
-    if (!ASE && !OASE && (!DE || !isa<VarDecl>(DE->getDecl()))) {
-      Diag(ELoc, diag::err_omp_expected_var_name_or_array_item) << ERange;
-      continue;
+    if (!FirstIter && IR != ER)
+      ++IR;
+    FirstIter = false;
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
+                              /*AllowArraySection=*/true);
+    if (Res.second) {
+      // It will be analyzed later.
+      Vars.push_back(RefExpr);
+      Privates.push_back(nullptr);
+      LHSs.push_back(nullptr);
+      RHSs.push_back(nullptr);
+      // Try to find 'declare reduction' corresponding construct before using
+      // builtin/overloaded operators.
+      QualType Type = Context.DependentTy;
+      CXXCastPath BasePath;
+      ExprResult DeclareReductionRef = buildDeclareReductionRef(
+          *this, ELoc, ERange, DSAStack->getCurScope(), ReductionIdScopeSpec,
+          ReductionId, Type, BasePath, IR == ER ? nullptr : *IR);
+      if (CurContext->isDependentContext() &&
+          (DeclareReductionRef.isUnset() ||
+           isa<UnresolvedLookupExpr>(DeclareReductionRef.get())))
+        ReductionOps.push_back(DeclareReductionRef.get());
+      else
+        ReductionOps.push_back(nullptr);
     }
+    ValueDecl *D = Res.first;
+    if (!D)
+      continue;
+
     QualType Type;
-    VarDecl *VD = nullptr;
-    if (DE) {
-      auto D = DE->getDecl();
-      VD = cast<VarDecl>(D);
-      Type = VD->getType();
-    } else if (ASE) {
-      Type = ASE->getType();
-      auto *Base = ASE->getBase()->IgnoreParenImpCasts();
-      while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
-        Base = TempASE->getBase()->IgnoreParenImpCasts();
-      DE = dyn_cast<DeclRefExpr>(Base);
-      if (DE)
-        VD = dyn_cast<VarDecl>(DE->getDecl());
-      if (!VD) {
-        Diag(Base->getExprLoc(), diag::err_omp_expected_base_var_name)
-            << 0 << Base->getSourceRange();
-        continue;
-      }
-    } else if (OASE) {
+    auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr->IgnoreParens());
+    auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr->IgnoreParens());
+    if (ASE)
+      Type = ASE->getType().getNonReferenceType();
+    else if (OASE) {
       auto BaseType = OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
       if (auto *ATy = BaseType->getAsArrayTypeUnsafe())
         Type = ATy->getElementType();
       else
         Type = BaseType->getPointeeType();
-      auto *Base = OASE->getBase()->IgnoreParenImpCasts();
-      while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
-        Base = TempOASE->getBase()->IgnoreParenImpCasts();
-      while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
-        Base = TempASE->getBase()->IgnoreParenImpCasts();
-      DE = dyn_cast<DeclRefExpr>(Base);
-      if (DE)
-        VD = dyn_cast<VarDecl>(DE->getDecl());
-      if (!VD) {
-        Diag(Base->getExprLoc(), diag::err_omp_expected_base_var_name)
-            << 1 << Base->getSourceRange();
-        continue;
-      }
-    }
+      Type = Type.getNonReferenceType();
+    } else
+      Type = Context.getBaseElementType(D->getType().getNonReferenceType());
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
     //  A variable that appears in a private clause must not have an incomplete
@@ -7285,39 +9381,27 @@
                             diag::err_omp_reduction_incomplete_type))
       continue;
     // OpenMP [2.14.3.6, reduction clause, Restrictions]
-    // Arrays may not appear in a reduction clause.
-    if (Type.getNonReferenceType()->isArrayType()) {
-      Diag(ELoc, diag::err_omp_reduction_type_array) << Type << ERange;
-      if (!ASE && !OASE) {
-        bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
-                      VarDecl::DeclarationOnly;
-        Diag(VD->getLocation(),
-             IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-            << VD;
-      }
-      continue;
-    }
-    // OpenMP [2.14.3.6, reduction clause, Restrictions]
     // A list item that appears in a reduction clause must not be
     // const-qualified.
     if (Type.getNonReferenceType().isConstant(Context)) {
       Diag(ELoc, diag::err_omp_const_reduction_list_item)
           << getOpenMPClauseName(OMPC_reduction) << Type << ERange;
       if (!ASE && !OASE) {
-        bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
-                      VarDecl::DeclarationOnly;
-        Diag(VD->getLocation(),
+        bool IsDecl = !VD ||
+                      VD->isThisDeclarationADefinition(Context) ==
+                          VarDecl::DeclarationOnly;
+        Diag(D->getLocation(),
              IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-            << VD;
+            << D;
       }
       continue;
     }
     // OpenMP [2.9.3.6, Restrictions, C/C++, p.4]
     //  If a list-item is a reference type then it must bind to the same object
     //  for all threads of the team.
-    if (!ASE && !OASE) {
+    if (!ASE && !OASE && VD) {
       VarDecl *VDDef = VD->getDefinition();
-      if (Type->isReferenceType() && VDDef) {
+      if (VD->getType()->isReferenceType() && VDDef) {
         DSARefChecker Check(DSAStack);
         if (Check.Visit(VDDef->getInit())) {
           Diag(ELoc, diag::err_omp_reduction_ref_type_arg) << ERange;
@@ -7326,40 +9410,7 @@
         }
       }
     }
-    // OpenMP [2.14.3.6, reduction clause, Restrictions]
-    // The type of a list item that appears in a reduction clause must be valid
-    // for the reduction-identifier. For a max or min reduction in C, the type
-    // of the list item must be an allowed arithmetic data type: char, int,
-    // float, double, or _Bool, possibly modified with long, short, signed, or
-    // unsigned. For a max or min reduction in C++, the type of the list item
-    // must be an allowed arithmetic data type: char, wchar_t, int, float,
-    // double, or bool, possibly modified with long, short, signed, or unsigned.
-    if ((BOK == BO_GT || BOK == BO_LT) &&
-        !(Type->isScalarType() ||
-          (getLangOpts().CPlusPlus && Type->isArithmeticType()))) {
-      Diag(ELoc, diag::err_omp_clause_not_arithmetic_type_arg)
-          << getLangOpts().CPlusPlus;
-      if (!ASE && !OASE) {
-        bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
-                      VarDecl::DeclarationOnly;
-        Diag(VD->getLocation(),
-             IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-            << VD;
-      }
-      continue;
-    }
-    if ((BOK == BO_OrAssign || BOK == BO_AndAssign || BOK == BO_XorAssign) &&
-        !getLangOpts().CPlusPlus && Type->isFloatingType()) {
-      Diag(ELoc, diag::err_omp_clause_floating_type_arg);
-      if (!ASE && !OASE) {
-        bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
-                      VarDecl::DeclarationOnly;
-        Diag(VD->getLocation(),
-             IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-            << VD;
-      }
-      continue;
-    }
+
     // OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables Referenced
     // in a Construct]
     //  Variables with the predetermined data-sharing attributes may not be
@@ -7372,18 +9423,17 @@
     //  but a list item can appear only once in the reduction clauses for that
     //  directive.
     DSAStackTy::DSAVarData DVar;
-    DVar = DSAStack->getTopDSA(VD, false);
+    DVar = DSAStack->getTopDSA(D, false);
     if (DVar.CKind == OMPC_reduction) {
       Diag(ELoc, diag::err_omp_once_referenced)
           << getOpenMPClauseName(OMPC_reduction);
-      if (DVar.RefExpr) {
+      if (DVar.RefExpr)
         Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_referenced);
-      }
     } else if (DVar.CKind != OMPC_unknown) {
       Diag(ELoc, diag::err_omp_wrong_dsa)
           << getOpenMPClauseName(DVar.CKind)
           << getOpenMPClauseName(OMPC_reduction);
-      ReportOriginalDSA(*this, DSAStack, VD, DVar);
+      ReportOriginalDSA(*this, DSAStack, D, DVar);
       continue;
     }
 
@@ -7394,24 +9444,91 @@
     OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
     if (isOpenMPWorksharingDirective(CurrDir) &&
         !isOpenMPParallelDirective(CurrDir)) {
-      DVar = DSAStack->getImplicitDSA(VD, true);
+      DVar = DSAStack->getImplicitDSA(D, true);
       if (DVar.CKind != OMPC_shared) {
         Diag(ELoc, diag::err_omp_required_access)
             << getOpenMPClauseName(OMPC_reduction)
             << getOpenMPClauseName(OMPC_shared);
-        ReportOriginalDSA(*this, DSAStack, VD, DVar);
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
+        continue;
+      }
+    }
+
+    // Try to find 'declare reduction' corresponding construct before using
+    // builtin/overloaded operators.
+    CXXCastPath BasePath;
+    ExprResult DeclareReductionRef = buildDeclareReductionRef(
+        *this, ELoc, ERange, DSAStack->getCurScope(), ReductionIdScopeSpec,
+        ReductionId, Type, BasePath, IR == ER ? nullptr : *IR);
+    if (DeclareReductionRef.isInvalid())
+      continue;
+    if (CurContext->isDependentContext() &&
+        (DeclareReductionRef.isUnset() ||
+         isa<UnresolvedLookupExpr>(DeclareReductionRef.get()))) {
+      Vars.push_back(RefExpr);
+      Privates.push_back(nullptr);
+      LHSs.push_back(nullptr);
+      RHSs.push_back(nullptr);
+      ReductionOps.push_back(DeclareReductionRef.get());
+      continue;
+    }
+    if (BOK == BO_Comma && DeclareReductionRef.isUnset()) {
+      // Not allowed reduction identifier is found.
+      Diag(ReductionId.getLocStart(),
+           diag::err_omp_unknown_reduction_identifier)
+          << Type << ReductionIdRange;
+      continue;
+    }
+
+    // OpenMP [2.14.3.6, reduction clause, Restrictions]
+    // The type of a list item that appears in a reduction clause must be valid
+    // for the reduction-identifier. For a max or min reduction in C, the type
+    // of the list item must be an allowed arithmetic data type: char, int,
+    // float, double, or _Bool, possibly modified with long, short, signed, or
+    // unsigned. For a max or min reduction in C++, the type of the list item
+    // must be an allowed arithmetic data type: char, wchar_t, int, float,
+    // double, or bool, possibly modified with long, short, signed, or unsigned.
+    if (DeclareReductionRef.isUnset()) {
+      if ((BOK == BO_GT || BOK == BO_LT) &&
+          !(Type->isScalarType() ||
+            (getLangOpts().CPlusPlus && Type->isArithmeticType()))) {
+        Diag(ELoc, diag::err_omp_clause_not_arithmetic_type_arg)
+            << getLangOpts().CPlusPlus;
+        if (!ASE && !OASE) {
+          bool IsDecl = !VD ||
+                        VD->isThisDeclarationADefinition(Context) ==
+                            VarDecl::DeclarationOnly;
+          Diag(D->getLocation(),
+               IsDecl ? diag::note_previous_decl : diag::note_defined_here)
+              << D;
+        }
+        continue;
+      }
+      if ((BOK == BO_OrAssign || BOK == BO_AndAssign || BOK == BO_XorAssign) &&
+          !getLangOpts().CPlusPlus && Type->isFloatingType()) {
+        Diag(ELoc, diag::err_omp_clause_floating_type_arg);
+        if (!ASE && !OASE) {
+          bool IsDecl = !VD ||
+                        VD->isThisDeclarationADefinition(Context) ==
+                            VarDecl::DeclarationOnly;
+          Diag(D->getLocation(),
+               IsDecl ? diag::note_previous_decl : diag::note_defined_here)
+              << D;
+        }
         continue;
       }
     }
 
     Type = Type.getNonLValueExprType(Context).getUnqualifiedType();
     auto *LHSVD = buildVarDecl(*this, ELoc, Type, ".reduction.lhs",
-                               VD->hasAttrs() ? &VD->getAttrs() : nullptr);
-    auto *RHSVD = buildVarDecl(*this, ELoc, Type, VD->getName(),
-                               VD->hasAttrs() ? &VD->getAttrs() : nullptr);
+                               D->hasAttrs() ? &D->getAttrs() : nullptr);
+    auto *RHSVD = buildVarDecl(*this, ELoc, Type, D->getName(),
+                               D->hasAttrs() ? &D->getAttrs() : nullptr);
     auto PrivateTy = Type;
-    if (OASE) {
-      // For array sections only:
+    if (OASE ||
+        (!ASE &&
+         D->getType().getNonReferenceType()->isVariablyModifiedType())) {
+      // For arrays/array sections only:
       // Create pseudo array type for private copy. The size for this array will
       // be generated during codegen.
       // For array subscripts or single variables Private Ty is the same as Type
@@ -7420,162 +9537,227 @@
           Type, new (Context) OpaqueValueExpr(SourceLocation(),
                                               Context.getSizeType(), VK_RValue),
           ArrayType::Normal, /*IndexTypeQuals=*/0, SourceRange());
-    }
+    } else if (!ASE && !OASE &&
+               Context.getAsArrayType(D->getType().getNonReferenceType()))
+      PrivateTy = D->getType().getNonReferenceType();
     // Private copy.
-    auto *PrivateVD = buildVarDecl(*this, ELoc, PrivateTy, VD->getName(),
-                                   VD->hasAttrs() ? &VD->getAttrs() : nullptr);
+    auto *PrivateVD = buildVarDecl(*this, ELoc, PrivateTy, D->getName(),
+                                   D->hasAttrs() ? &D->getAttrs() : nullptr);
     // Add initializer for private variable.
     Expr *Init = nullptr;
-    switch (BOK) {
-    case BO_Add:
-    case BO_Xor:
-    case BO_Or:
-    case BO_LOr:
-      // '+', '-', '^', '|', '||' reduction ops - initializer is '0'.
-      if (Type->isScalarType() || Type->isAnyComplexType()) {
-        Init = ActOnIntegerConstant(ELoc, /*Val=*/0).get();
+    auto *LHSDRE = buildDeclRefExpr(*this, LHSVD, Type, ELoc);
+    auto *RHSDRE = buildDeclRefExpr(*this, RHSVD, Type, ELoc);
+    if (DeclareReductionRef.isUsable()) {
+      auto *DRDRef = DeclareReductionRef.getAs<DeclRefExpr>();
+      auto *DRD = cast<OMPDeclareReductionDecl>(DRDRef->getDecl());
+      if (DRD->getInitializer()) {
+        Init = DRDRef;
+        RHSVD->setInit(DRDRef);
+        RHSVD->setInitStyle(VarDecl::CallInit);
       }
-      break;
-    case BO_Mul:
-    case BO_LAnd:
-      if (Type->isScalarType() || Type->isAnyComplexType()) {
-        // '*' and '&&' reduction ops - initializer is '1'.
-        Init = ActOnIntegerConstant(ELoc, /*Val=*/1).get();
-      }
-      break;
-    case BO_And: {
-      // '&' reduction op - initializer is '~0'.
-      QualType OrigType = Type;
-      if (auto *ComplexTy = OrigType->getAs<ComplexType>()) {
-        Type = ComplexTy->getElementType();
-      }
-      if (Type->isRealFloatingType()) {
-        llvm::APFloat InitValue =
-            llvm::APFloat::getAllOnesValue(Context.getTypeSize(Type),
-                                           /*isIEEE=*/true);
-        Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
-                                       Type, ELoc);
-      } else if (Type->isScalarType()) {
-        auto Size = Context.getTypeSize(Type);
-        QualType IntTy = Context.getIntTypeForBitwidth(Size, /*Signed=*/0);
-        llvm::APInt InitValue = llvm::APInt::getAllOnesValue(Size);
-        Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
-      }
-      if (Init && OrigType->isAnyComplexType()) {
-        // Init = 0xFFFF + 0xFFFFi;
-        auto *Im = new (Context) ImaginaryLiteral(Init, OrigType);
-        Init = CreateBuiltinBinOp(ELoc, BO_Add, Init, Im).get();
-      }
-      Type = OrigType;
-      break;
-    }
-    case BO_LT:
-    case BO_GT: {
-      // 'min' reduction op - initializer is 'Largest representable number in
-      // the reduction list item type'.
-      // 'max' reduction op - initializer is 'Least representable number in
-      // the reduction list item type'.
-      if (Type->isIntegerType() || Type->isPointerType()) {
-        bool IsSigned = Type->hasSignedIntegerRepresentation();
-        auto Size = Context.getTypeSize(Type);
-        QualType IntTy =
-            Context.getIntTypeForBitwidth(Size, /*Signed=*/IsSigned);
-        llvm::APInt InitValue =
-            (BOK != BO_LT)
-                ? IsSigned ? llvm::APInt::getSignedMinValue(Size)
-                           : llvm::APInt::getMinValue(Size)
-                : IsSigned ? llvm::APInt::getSignedMaxValue(Size)
-                           : llvm::APInt::getMaxValue(Size);
-        Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
-        if (Type->isPointerType()) {
-          // Cast to pointer type.
-          auto CastExpr = BuildCStyleCastExpr(
-              SourceLocation(), Context.getTrivialTypeSourceInfo(Type, ELoc),
-              SourceLocation(), Init);
-          if (CastExpr.isInvalid())
-            continue;
-          Init = CastExpr.get();
+    } else {
+      switch (BOK) {
+      case BO_Add:
+      case BO_Xor:
+      case BO_Or:
+      case BO_LOr:
+        // '+', '-', '^', '|', '||' reduction ops - initializer is '0'.
+        if (Type->isScalarType() || Type->isAnyComplexType())
+          Init = ActOnIntegerConstant(ELoc, /*Val=*/0).get();
+        break;
+      case BO_Mul:
+      case BO_LAnd:
+        if (Type->isScalarType() || Type->isAnyComplexType()) {
+          // '*' and '&&' reduction ops - initializer is '1'.
+          Init = ActOnIntegerConstant(ELoc, /*Val=*/1).get();
         }
-      } else if (Type->isRealFloatingType()) {
-        llvm::APFloat InitValue = llvm::APFloat::getLargest(
-            Context.getFloatTypeSemantics(Type), BOK != BO_LT);
-        Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
-                                       Type, ELoc);
+        break;
+      case BO_And: {
+        // '&' reduction op - initializer is '~0'.
+        QualType OrigType = Type;
+        if (auto *ComplexTy = OrigType->getAs<ComplexType>())
+          Type = ComplexTy->getElementType();
+        if (Type->isRealFloatingType()) {
+          llvm::APFloat InitValue =
+              llvm::APFloat::getAllOnesValue(Context.getTypeSize(Type),
+                                             /*isIEEE=*/true);
+          Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
+                                         Type, ELoc);
+        } else if (Type->isScalarType()) {
+          auto Size = Context.getTypeSize(Type);
+          QualType IntTy = Context.getIntTypeForBitwidth(Size, /*Signed=*/0);
+          llvm::APInt InitValue = llvm::APInt::getAllOnesValue(Size);
+          Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
+        }
+        if (Init && OrigType->isAnyComplexType()) {
+          // Init = 0xFFFF + 0xFFFFi;
+          auto *Im = new (Context) ImaginaryLiteral(Init, OrigType);
+          Init = CreateBuiltinBinOp(ELoc, BO_Add, Init, Im).get();
+        }
+        Type = OrigType;
+        break;
       }
-      break;
+      case BO_LT:
+      case BO_GT: {
+        // 'min' reduction op - initializer is 'Largest representable number in
+        // the reduction list item type'.
+        // 'max' reduction op - initializer is 'Least representable number in
+        // the reduction list item type'.
+        if (Type->isIntegerType() || Type->isPointerType()) {
+          bool IsSigned = Type->hasSignedIntegerRepresentation();
+          auto Size = Context.getTypeSize(Type);
+          QualType IntTy =
+              Context.getIntTypeForBitwidth(Size, /*Signed=*/IsSigned);
+          llvm::APInt InitValue =
+              (BOK != BO_LT)
+                  ? IsSigned ? llvm::APInt::getSignedMinValue(Size)
+                             : llvm::APInt::getMinValue(Size)
+                  : IsSigned ? llvm::APInt::getSignedMaxValue(Size)
+                             : llvm::APInt::getMaxValue(Size);
+          Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
+          if (Type->isPointerType()) {
+            // Cast to pointer type.
+            auto CastExpr = BuildCStyleCastExpr(
+                SourceLocation(), Context.getTrivialTypeSourceInfo(Type, ELoc),
+                SourceLocation(), Init);
+            if (CastExpr.isInvalid())
+              continue;
+            Init = CastExpr.get();
+          }
+        } else if (Type->isRealFloatingType()) {
+          llvm::APFloat InitValue = llvm::APFloat::getLargest(
+              Context.getFloatTypeSemantics(Type), BOK != BO_LT);
+          Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
+                                         Type, ELoc);
+        }
+        break;
+      }
+      case BO_PtrMemD:
+      case BO_PtrMemI:
+      case BO_MulAssign:
+      case BO_Div:
+      case BO_Rem:
+      case BO_Sub:
+      case BO_Shl:
+      case BO_Shr:
+      case BO_LE:
+      case BO_GE:
+      case BO_EQ:
+      case BO_NE:
+      case BO_AndAssign:
+      case BO_XorAssign:
+      case BO_OrAssign:
+      case BO_Assign:
+      case BO_AddAssign:
+      case BO_SubAssign:
+      case BO_DivAssign:
+      case BO_RemAssign:
+      case BO_ShlAssign:
+      case BO_ShrAssign:
+      case BO_Comma:
+        llvm_unreachable("Unexpected reduction operation");
+      }
     }
-    case BO_PtrMemD:
-    case BO_PtrMemI:
-    case BO_MulAssign:
-    case BO_Div:
-    case BO_Rem:
-    case BO_Sub:
-    case BO_Shl:
-    case BO_Shr:
-    case BO_LE:
-    case BO_GE:
-    case BO_EQ:
-    case BO_NE:
-    case BO_AndAssign:
-    case BO_XorAssign:
-    case BO_OrAssign:
-    case BO_Assign:
-    case BO_AddAssign:
-    case BO_SubAssign:
-    case BO_DivAssign:
-    case BO_RemAssign:
-    case BO_ShlAssign:
-    case BO_ShrAssign:
-    case BO_Comma:
-      llvm_unreachable("Unexpected reduction operation");
-    }
-    if (Init) {
+    if (Init && DeclareReductionRef.isUnset()) {
       AddInitializerToDecl(RHSVD, Init, /*DirectInit=*/false,
                            /*TypeMayContainAuto=*/false);
-    } else
+    } else if (!Init)
       ActOnUninitializedDecl(RHSVD, /*TypeMayContainAuto=*/false);
-    if (!RHSVD->hasInit()) {
+    if (RHSVD->isInvalidDecl())
+      continue;
+    if (!RHSVD->hasInit() && DeclareReductionRef.isUnset()) {
       Diag(ELoc, diag::err_omp_reduction_id_not_compatible) << Type
                                                             << ReductionIdRange;
-      if (VD) {
-        bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
-                      VarDecl::DeclarationOnly;
-        Diag(VD->getLocation(),
-             IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-            << VD;
-      }
+      bool IsDecl =
+          !VD ||
+          VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
+      Diag(D->getLocation(),
+           IsDecl ? diag::note_previous_decl : diag::note_defined_here)
+          << D;
       continue;
     }
     // Store initializer for single element in private copy. Will be used during
     // codegen.
     PrivateVD->setInit(RHSVD->getInit());
     PrivateVD->setInitStyle(RHSVD->getInitStyle());
-    auto *LHSDRE = buildDeclRefExpr(*this, LHSVD, Type, ELoc);
-    auto *RHSDRE = buildDeclRefExpr(*this, RHSVD, Type, ELoc);
     auto *PrivateDRE = buildDeclRefExpr(*this, PrivateVD, PrivateTy, ELoc);
-    ExprResult ReductionOp =
-        BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(), BOK,
-                   LHSDRE, RHSDRE);
-    if (ReductionOp.isUsable()) {
-      if (BOK != BO_LT && BOK != BO_GT) {
-        ReductionOp =
-            BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(),
-                       BO_Assign, LHSDRE, ReductionOp.get());
-      } else {
-        auto *ConditionalOp = new (Context) ConditionalOperator(
-            ReductionOp.get(), SourceLocation(), LHSDRE, SourceLocation(),
-            RHSDRE, Type, VK_LValue, OK_Ordinary);
-        ReductionOp =
-            BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(),
-                       BO_Assign, LHSDRE, ConditionalOp);
+    ExprResult ReductionOp;
+    if (DeclareReductionRef.isUsable()) {
+      QualType RedTy = DeclareReductionRef.get()->getType();
+      QualType PtrRedTy = Context.getPointerType(RedTy);
+      ExprResult LHS = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, LHSDRE);
+      ExprResult RHS = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, RHSDRE);
+      if (!BasePath.empty()) {
+        LHS = DefaultLvalueConversion(LHS.get());
+        RHS = DefaultLvalueConversion(RHS.get());
+        LHS = ImplicitCastExpr::Create(Context, PtrRedTy,
+                                       CK_UncheckedDerivedToBase, LHS.get(),
+                                       &BasePath, LHS.get()->getValueKind());
+        RHS = ImplicitCastExpr::Create(Context, PtrRedTy,
+                                       CK_UncheckedDerivedToBase, RHS.get(),
+                                       &BasePath, RHS.get()->getValueKind());
       }
-      ReductionOp = ActOnFinishFullExpr(ReductionOp.get());
+      FunctionProtoType::ExtProtoInfo EPI;
+      QualType Params[] = {PtrRedTy, PtrRedTy};
+      QualType FnTy = Context.getFunctionType(Context.VoidTy, Params, EPI);
+      auto *OVE = new (Context) OpaqueValueExpr(
+          ELoc, Context.getPointerType(FnTy), VK_RValue, OK_Ordinary,
+          DefaultLvalueConversion(DeclareReductionRef.get()).get());
+      Expr *Args[] = {LHS.get(), RHS.get()};
+      ReductionOp = new (Context)
+          CallExpr(Context, OVE, Args, Context.VoidTy, VK_RValue, ELoc);
+    } else {
+      ReductionOp = BuildBinOp(DSAStack->getCurScope(),
+                               ReductionId.getLocStart(), BOK, LHSDRE, RHSDRE);
+      if (ReductionOp.isUsable()) {
+        if (BOK != BO_LT && BOK != BO_GT) {
+          ReductionOp =
+              BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(),
+                         BO_Assign, LHSDRE, ReductionOp.get());
+        } else {
+          auto *ConditionalOp = new (Context) ConditionalOperator(
+              ReductionOp.get(), SourceLocation(), LHSDRE, SourceLocation(),
+              RHSDRE, Type, VK_LValue, OK_Ordinary);
+          ReductionOp =
+              BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(),
+                         BO_Assign, LHSDRE, ConditionalOp);
+        }
+        ReductionOp = ActOnFinishFullExpr(ReductionOp.get());
+      }
+      if (ReductionOp.isInvalid())
+        continue;
     }
-    if (ReductionOp.isInvalid())
-      continue;
 
-    DSAStack->addDSA(VD, DE, OMPC_reduction);
-    Vars.push_back(RefExpr);
+    DeclRefExpr *Ref = nullptr;
+    Expr *VarsExpr = RefExpr->IgnoreParens();
+    if (!VD && !CurContext->isDependentContext()) {
+      if (ASE || OASE) {
+        TransformExprToCaptures RebuildToCapture(*this, D);
+        VarsExpr =
+            RebuildToCapture.TransformExpr(RefExpr->IgnoreParens()).get();
+        Ref = RebuildToCapture.getCapturedExpr();
+      } else {
+        VarsExpr = Ref =
+            buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
+      }
+      if (!IsOpenMPCapturedDecl(D)) {
+        ExprCaptures.push_back(Ref->getDecl());
+        if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) {
+          ExprResult RefRes = DefaultLvalueConversion(Ref);
+          if (!RefRes.isUsable())
+            continue;
+          ExprResult PostUpdateRes =
+              BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign,
+                         SimpleRefExpr, RefRes.get());
+          if (!PostUpdateRes.isUsable())
+            continue;
+          ExprPostUpdates.push_back(
+              IgnoredValueConversions(PostUpdateRes.get()).get());
+        }
+      }
+    }
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref);
+    Vars.push_back(VarsExpr);
     Privates.push_back(PrivateDRE);
     LHSs.push_back(LHSDRE);
     RHSs.push_back(RHSDRE);
@@ -7588,7 +9770,67 @@
   return OMPReductionClause::Create(
       Context, StartLoc, LParenLoc, ColonLoc, EndLoc, Vars,
       ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, Privates,
-      LHSs, RHSs, ReductionOps);
+      LHSs, RHSs, ReductionOps, buildPreInits(Context, ExprCaptures),
+      buildPostUpdate(*this, ExprPostUpdates));
+}
+
+bool Sema::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
+                                     SourceLocation LinLoc) {
+  // Outside C++ only 'val' is accepted; 'unknown' is rejected everywhere.
+  if (LinKind != OMPC_LINEAR_unknown &&
+      (LangOpts.CPlusPlus || LinKind == OMPC_LINEAR_val))
+    return false;
+  Diag(LinLoc, diag::err_omp_wrong_linear_modifier) << LangOpts.CPlusPlus;
+  return true;
+}
+
+bool Sema::CheckOpenMPLinearDecl(ValueDecl *D, SourceLocation ELoc,
+                                 OpenMPLinearClauseKind LinKind,
+                                 QualType Type) {
+  // Points at D's declaration/definition after an error; no-op for null D.
+  auto &&NoteDecl = [this, D]() {
+    if (!D)
+      return;
+    auto *VD = dyn_cast<VarDecl>(D);
+    bool IsDecl =
+        !VD ||
+        VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
+    Diag(D->getLocation(),
+         IsDecl ? diag::note_previous_decl : diag::note_defined_here)
+        << D;
+  };
+  // The list item's type must be complete.
+  if (RequireCompleteType(ELoc, Type, diag::err_omp_linear_incomplete_type))
+    return true;
+  // The 'uval' and 'ref' modifiers are only valid on reference types.
+  const bool NeedsReference =
+      LinKind == OMPC_LINEAR_uval || LinKind == OMPC_LINEAR_ref;
+  if (NeedsReference && !Type->isReferenceType()) {
+    Diag(ELoc, diag::err_omp_wrong_linear_modifier_non_reference)
+        << Type << getOpenMPSimpleClauseTypeName(OMPC_linear, LinKind);
+    return true;
+  }
+  Type = Type.getNonReferenceType();
+
+  // A list item must not be const-qualified.
+  if (Type.isConstant(Context)) {
+    Diag(ELoc, diag::err_omp_const_variable)
+        << getOpenMPClauseName(OMPC_linear);
+    NoteDecl();
+    return true;
+  }
+
+  // A list item must be of integral or pointer type.
+  Type = Type.getUnqualifiedType().getCanonicalType();
+  const auto *Ty = Type.getTypePtrOrNull();
+  bool IsAccepted = Ty && (Ty->isDependentType() ||
+                           Ty->isIntegralType(Context) || Ty->isPointerType());
+  if (!IsAccepted) {
+    Diag(ELoc, diag::err_omp_linear_expected_int_or_ptr) << Type;
+    NoteDecl();
+    return true;
+  }
+  return false;
 }
 
 OMPClause *Sema::ActOnOpenMPLinearClause(
@@ -7598,121 +9840,84 @@
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> Privates;
   SmallVector<Expr *, 8> Inits;
-  if ((!LangOpts.CPlusPlus && LinKind != OMPC_LINEAR_val) ||
-      LinKind == OMPC_LINEAR_unknown) {
-    Diag(LinLoc, diag::err_omp_wrong_linear_modifier) << LangOpts.CPlusPlus;
+  SmallVector<Decl *, 4> ExprCaptures;
+  SmallVector<Expr *, 4> ExprPostUpdates;
+  if (CheckOpenMPLinearModifier(LinKind, LinLoc))
     LinKind = OMPC_LINEAR_val;
-  }
   for (auto &RefExpr : VarList) {
     assert(RefExpr && "NULL expr in OpenMP linear clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
+                              /*AllowArraySection=*/false);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
       Privates.push_back(nullptr);
       Inits.push_back(nullptr);
-      continue;
     }
-
-    // OpenMP [2.14.3.7, linear clause]
-    // A list item that appears in a linear clause is subject to the private
-    // clause semantics described in Section 2.14.3.3 on page 159 except as
-    // noted. In addition, the value of the new list item on each iteration
-    // of the associated loop(s) corresponds to the value of the original
-    // list item before entering the construct plus the logical number of
-    // the iteration times linear-step.
-
-    SourceLocation ELoc = RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    // OpenMP  [2.14.3.3, Restrictions, p.1]
-    //  A variable that is part of another variable (as an array or
-    //  structure element) cannot appear in a private clause.
-    DeclRefExpr *DE = dyn_cast<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
+    ValueDecl *D = Res.first;
+    if (!D)
       continue;
-    }
 
-    VarDecl *VD = cast<VarDecl>(DE->getDecl());
+    QualType Type = D->getType();
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP [2.14.3.7, linear clause]
     //  A list-item cannot appear in more than one linear clause.
     //  A list-item that appears in a linear clause cannot appear in any
     //  other data-sharing attribute clause.
-    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, false);
+    DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, false);
     if (DVar.RefExpr) {
       Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
                                           << getOpenMPClauseName(OMPC_linear);
-      ReportOriginalDSA(*this, DSAStack, VD, DVar);
+      ReportOriginalDSA(*this, DSAStack, D, DVar);
       continue;
     }
 
-    QualType QType = VD->getType();
-    if (QType->isDependentType() || QType->isInstantiationDependentType()) {
-      // It will be analyzed later.
-      Vars.push_back(DE);
-      Privates.push_back(nullptr);
-      Inits.push_back(nullptr);
+    if (CheckOpenMPLinearDecl(D, ELoc, LinKind, Type))
       continue;
-    }
-
-    // A variable must not have an incomplete type or a reference type.
-    if (RequireCompleteType(ELoc, QType,
-                            diag::err_omp_linear_incomplete_type)) {
-      continue;
-    }
-    if ((LinKind == OMPC_LINEAR_uval || LinKind == OMPC_LINEAR_ref) &&
-        !QType->isReferenceType()) {
-      Diag(ELoc, diag::err_omp_wrong_linear_modifier_non_reference)
-          << QType << getOpenMPSimpleClauseTypeName(OMPC_linear, LinKind);
-      continue;
-    }
-    QType = QType.getNonReferenceType();
-
-    // A list item must not be const-qualified.
-    if (QType.isConstant(Context)) {
-      Diag(ELoc, diag::err_omp_const_variable)
-          << getOpenMPClauseName(OMPC_linear);
-      bool IsDecl =
-          VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
-      Diag(VD->getLocation(),
-           IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-          << VD;
-      continue;
-    }
-
-    // A list item must be of integral or pointer type.
-    QType = QType.getUnqualifiedType().getCanonicalType();
-    const Type *Ty = QType.getTypePtrOrNull();
-    if (!Ty || (!Ty->isDependentType() && !Ty->isIntegralType(Context) &&
-                !Ty->isPointerType())) {
-      Diag(ELoc, diag::err_omp_linear_expected_int_or_ptr) << QType;
-      bool IsDecl =
-          VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
-      Diag(VD->getLocation(),
-           IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-          << VD;
-      continue;
-    }
+    Type = Type.getNonReferenceType().getUnqualifiedType().getCanonicalType();
 
     // Build private copy of original var.
-    auto *Private = buildVarDecl(*this, ELoc, QType, VD->getName(),
-                                 VD->hasAttrs() ? &VD->getAttrs() : nullptr);
-    auto *PrivateRef = buildDeclRefExpr(
-        *this, Private, DE->getType().getUnqualifiedType(), DE->getExprLoc());
+    auto *Private = buildVarDecl(*this, ELoc, Type, D->getName(),
+                                 D->hasAttrs() ? &D->getAttrs() : nullptr);
+    auto *PrivateRef = buildDeclRefExpr(*this, Private, Type, ELoc);
     // Build var to save initial value.
-    VarDecl *Init = buildVarDecl(*this, ELoc, QType, ".linear.start");
+    VarDecl *Init = buildVarDecl(*this, ELoc, Type, ".linear.start");
     Expr *InitExpr;
+    DeclRefExpr *Ref = nullptr;
+    if (!VD && !CurContext->isDependentContext()) {
+      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
+      if (!IsOpenMPCapturedDecl(D)) {
+        ExprCaptures.push_back(Ref->getDecl());
+        if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) {
+          ExprResult RefRes = DefaultLvalueConversion(Ref);
+          if (!RefRes.isUsable())
+            continue;
+          ExprResult PostUpdateRes =
+              BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign,
+                         SimpleRefExpr, RefRes.get());
+          if (!PostUpdateRes.isUsable())
+            continue;
+          ExprPostUpdates.push_back(
+              IgnoredValueConversions(PostUpdateRes.get()).get());
+        }
+      }
+    }
     if (LinKind == OMPC_LINEAR_uval)
-      InitExpr = VD->getInit();
+      InitExpr = VD ? VD->getInit() : SimpleRefExpr;
     else
-      InitExpr = DE;
+      InitExpr = VD ? SimpleRefExpr : Ref;
     AddInitializerToDecl(Init, DefaultLvalueConversion(InitExpr).get(),
-                         /*DirectInit*/ false, /*TypeMayContainAuto*/ false);
-    auto InitRef = buildDeclRefExpr(
-        *this, Init, DE->getType().getUnqualifiedType(), DE->getExprLoc());
-    DSAStack->addDSA(VD, DE, OMPC_linear);
-    Vars.push_back(DE);
+                         /*DirectInit=*/false, /*TypeMayContainAuto=*/false);
+    auto InitRef = buildDeclRefExpr(*this, Init, Type, ELoc);
+
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_linear, Ref);
+    Vars.push_back((VD || CurContext->isDependentContext())
+                       ? RefExpr->IgnoreParens()
+                       : Ref);
     Privates.push_back(PrivateRef);
     Inits.push_back(InitRef);
   }
@@ -7756,12 +9961,14 @@
 
   return OMPLinearClause::Create(Context, StartLoc, LParenLoc, LinKind, LinLoc,
                                  ColonLoc, EndLoc, Vars, Privates, Inits,
-                                 StepExpr, CalcStepExpr);
+                                 StepExpr, CalcStepExpr,
+                                 buildPreInits(Context, ExprCaptures),
+                                 buildPostUpdate(*this, ExprPostUpdates));
 }
 
 static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
                                      Expr *NumIterations, Sema &SemaRef,
-                                     Scope *S) {
+                                     Scope *S, DSAStackTy *Stack) {
   // Walk the vars and build update/final expressions for the CodeGen.
   SmallVector<Expr *, 8> Updates;
   SmallVector<Expr *, 8> Finals;
@@ -7771,17 +9978,35 @@
   // If linear-step is not specified it is assumed to be 1.
   if (Step == nullptr)
     Step = SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get();
-  else if (CalcStep)
+  else if (CalcStep) {
     Step = cast<BinaryOperator>(CalcStep)->getLHS();
+  }
   bool HasErrors = false;
   auto CurInit = Clause.inits().begin();
   auto CurPrivate = Clause.privates().begin();
   auto LinKind = Clause.getModifier();
   for (auto &RefExpr : Clause.varlists()) {
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange,
+                              /*AllowArraySection=*/false);
+    ValueDecl *D = Res.first;
+    if (Res.second || !D) {
+      Updates.push_back(nullptr);
+      Finals.push_back(nullptr);
+      HasErrors = true;
+      continue;
+    }
+    if (auto *CED = dyn_cast<OMPCapturedExprDecl>(D)) {
+      D = cast<MemberExpr>(CED->getInit()->IgnoreParenImpCasts())
+              ->getMemberDecl();
+    }
+    auto &&Info = Stack->isLoopControlVariable(D);
     Expr *InitExpr = *CurInit;
 
     // Build privatized reference to the current linear var.
-    auto DE = cast<DeclRefExpr>(RefExpr);
+    auto *DE = cast<DeclRefExpr>(SimpleRefExpr);
     Expr *CapturedRef;
     if (LinKind == OMPC_LINEAR_uval)
       CapturedRef = cast<VarDecl>(DE->getDecl())->getInit();
@@ -7792,18 +10017,27 @@
                            /*RefersToCapture=*/true);
 
     // Build update: Var = InitExpr + IV * Step
-    ExprResult Update =
-        BuildCounterUpdate(SemaRef, S, RefExpr->getExprLoc(), *CurPrivate,
-                           InitExpr, IV, Step, /* Subtract */ false);
+    ExprResult Update;
+    if (!Info.first) {
+      Update =
+          BuildCounterUpdate(SemaRef, S, RefExpr->getExprLoc(), *CurPrivate,
+                             InitExpr, IV, Step, /* Subtract */ false);
+    } else
+      Update = *CurPrivate;
     Update = SemaRef.ActOnFinishFullExpr(Update.get(), DE->getLocStart(),
                                          /*DiscardedValue=*/true);
 
     // Build final: Var = InitExpr + NumIterations * Step
-    ExprResult Final =
-        BuildCounterUpdate(SemaRef, S, RefExpr->getExprLoc(), CapturedRef,
-                           InitExpr, NumIterations, Step, /* Subtract */ false);
+    ExprResult Final;
+    if (!Info.first) {
+      Final = BuildCounterUpdate(SemaRef, S, RefExpr->getExprLoc(), CapturedRef,
+                                 InitExpr, NumIterations, Step,
+                                 /* Subtract */ false);
+    } else
+      Final = *CurPrivate;
     Final = SemaRef.ActOnFinishFullExpr(Final.get(), DE->getLocStart(),
                                         /*DiscardedValue=*/true);
+
     if (!Update.isUsable() || !Final.isUsable()) {
       Updates.push_back(nullptr);
       Finals.push_back(nullptr);
@@ -7812,7 +10046,8 @@
       Updates.push_back(Update.get());
       Finals.push_back(Final.get());
     }
-    ++CurInit, ++CurPrivate;
+    ++CurInit;
+    ++CurPrivate;
   }
   Clause.setUpdates(Updates);
   Clause.setFinals(Finals);
@@ -7825,52 +10060,55 @@
 
   SmallVector<Expr *, 8> Vars;
   for (auto &RefExpr : VarList) {
-    assert(RefExpr && "NULL expr in OpenMP aligned clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    assert(RefExpr && "NULL expr in OpenMP aligned clause.");
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
+                              /*AllowArraySection=*/false);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
-      continue;
     }
-
-    SourceLocation ELoc = RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    DeclRefExpr *DE = dyn_cast<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
+    ValueDecl *D = Res.first;
+    if (!D)
       continue;
-    }
 
-    VarDecl *VD = cast<VarDecl>(DE->getDecl());
+    QualType QType = D->getType();
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP  [2.8.1, simd construct, Restrictions]
     // The type of list items appearing in the aligned clause must be
     // array, pointer, reference to array, or reference to pointer.
-    QualType QType = VD->getType();
     QType = QType.getNonReferenceType().getUnqualifiedType().getCanonicalType();
     const Type *Ty = QType.getTypePtrOrNull();
-    if (!Ty || (!Ty->isDependentType() && !Ty->isArrayType() &&
-                !Ty->isPointerType())) {
+    if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) {
       Diag(ELoc, diag::err_omp_aligned_expected_array_or_ptr)
-          << QType << getLangOpts().CPlusPlus << RefExpr->getSourceRange();
+          << QType << getLangOpts().CPlusPlus << ERange;
       bool IsDecl =
+          !VD ||
           VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
-      Diag(VD->getLocation(),
+      Diag(D->getLocation(),
            IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-          << VD;
+          << D;
       continue;
     }
 
     // OpenMP  [2.8.1, simd construct, Restrictions]
     // A list-item cannot appear in more than one aligned clause.
-    if (DeclRefExpr *PrevRef = DSAStack->addUniqueAligned(VD, DE)) {
-      Diag(ELoc, diag::err_omp_aligned_twice) << RefExpr->getSourceRange();
+    if (Expr *PrevRef = DSAStack->addUniqueAligned(D, SimpleRefExpr)) {
+      Diag(ELoc, diag::err_omp_aligned_twice) << 0 << ERange;
       Diag(PrevRef->getExprLoc(), diag::note_omp_explicit_dsa)
           << getOpenMPClauseName(OMPC_aligned);
       continue;
     }
 
-    Vars.push_back(DE);
+    DeclRefExpr *Ref = nullptr;
+    if (!VD && IsOpenMPCapturedDecl(D))
+      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
+    Vars.push_back(DefaultFunctionArrayConversion(
+                       (VD || !Ref) ? RefExpr->IgnoreParens() : Ref)
+                       .get());
   }
 
   // OpenMP [2.8.1, simd construct, Description]
@@ -7918,7 +10156,8 @@
     //  A list item that appears in a copyin clause must be threadprivate.
     DeclRefExpr *DE = dyn_cast<DeclRefExpr>(RefExpr);
     if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
+      Diag(ELoc, diag::err_omp_expected_var_name_member_expr)
+          << 0 << RefExpr->getSourceRange();
       continue;
     }
 
@@ -7993,51 +10232,37 @@
   SmallVector<Expr *, 8> DstExprs;
   SmallVector<Expr *, 8> AssignmentOps;
   for (auto &RefExpr : VarList) {
-    assert(RefExpr && "NULL expr in OpenMP copyprivate clause.");
-    if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
+    assert(RefExpr && "NULL expr in OpenMP copyprivate clause.");
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
+                              /*AllowArraySection=*/false);
+    if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
       SrcExprs.push_back(nullptr);
       DstExprs.push_back(nullptr);
       AssignmentOps.push_back(nullptr);
-      continue;
     }
-
-    SourceLocation ELoc = RefExpr->getExprLoc();
-    // OpenMP [2.1, C/C++]
-    //  A list item is a variable name.
-    // OpenMP  [2.14.4.1, Restrictions, p.1]
-    //  A list item that appears in a copyin clause must be threadprivate.
-    DeclRefExpr *DE = dyn_cast<DeclRefExpr>(RefExpr);
-    if (!DE || !isa<VarDecl>(DE->getDecl())) {
-      Diag(ELoc, diag::err_omp_expected_var_name) << RefExpr->getSourceRange();
+    ValueDecl *D = Res.first;
+    if (!D)
       continue;
-    }
 
-    Decl *D = DE->getDecl();
-    VarDecl *VD = cast<VarDecl>(D);
-
-    QualType Type = VD->getType();
-    if (Type->isDependentType() || Type->isInstantiationDependentType()) {
-      // It will be analyzed later.
-      Vars.push_back(DE);
-      SrcExprs.push_back(nullptr);
-      DstExprs.push_back(nullptr);
-      AssignmentOps.push_back(nullptr);
-      continue;
-    }
+    QualType Type = D->getType();
+    auto *VD = dyn_cast<VarDecl>(D);
 
     // OpenMP [2.14.4.2, Restrictions, p.2]
     //  A list item that appears in a copyprivate clause may not appear in a
     //  private or firstprivate clause on the single construct.
-    if (!DSAStack->isThreadPrivate(VD)) {
-      auto DVar = DSAStack->getTopDSA(VD, false);
+    if (!VD || !DSAStack->isThreadPrivate(VD)) {
+      auto DVar = DSAStack->getTopDSA(D, false);
       if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_copyprivate &&
           DVar.RefExpr) {
         Diag(ELoc, diag::err_omp_wrong_dsa)
             << getOpenMPClauseName(DVar.CKind)
             << getOpenMPClauseName(OMPC_copyprivate);
-        ReportOriginalDSA(*this, DSAStack, VD, DVar);
+        ReportOriginalDSA(*this, DSAStack, D, DVar);
         continue;
       }
 
@@ -8045,12 +10270,12 @@
       //  All list items that appear in a copyprivate clause must be either
       //  threadprivate or private in the enclosing context.
       if (DVar.CKind == OMPC_unknown) {
-        DVar = DSAStack->getImplicitDSA(VD, false);
+        DVar = DSAStack->getImplicitDSA(D, false);
         if (DVar.CKind == OMPC_shared) {
           Diag(ELoc, diag::err_omp_required_access)
               << getOpenMPClauseName(OMPC_copyprivate)
               << "threadprivate or private in the enclosing context";
-          ReportOriginalDSA(*this, DSAStack, VD, DVar);
+          ReportOriginalDSA(*this, DSAStack, D, DVar);
           continue;
         }
       }
@@ -8062,10 +10287,11 @@
           << getOpenMPClauseName(OMPC_copyprivate) << Type
           << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
       bool IsDecl =
+          !VD ||
           VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
-      Diag(VD->getLocation(),
+      Diag(D->getLocation(),
            IsDecl ? diag::note_previous_decl : diag::note_defined_here)
-          << VD;
+          << D;
       continue;
     }
 
@@ -8076,27 +10302,28 @@
     Type = Context.getBaseElementType(Type.getNonReferenceType())
                .getUnqualifiedType();
     auto *SrcVD =
-        buildVarDecl(*this, DE->getLocStart(), Type, ".copyprivate.src",
-                     VD->hasAttrs() ? &VD->getAttrs() : nullptr);
-    auto *PseudoSrcExpr =
-        buildDeclRefExpr(*this, SrcVD, Type, DE->getExprLoc());
+        buildVarDecl(*this, RefExpr->getLocStart(), Type, ".copyprivate.src",
+                     D->hasAttrs() ? &D->getAttrs() : nullptr);
+    auto *PseudoSrcExpr = buildDeclRefExpr(*this, SrcVD, Type, ELoc);
     auto *DstVD =
-        buildVarDecl(*this, DE->getLocStart(), Type, ".copyprivate.dst",
-                     VD->hasAttrs() ? &VD->getAttrs() : nullptr);
-    auto *PseudoDstExpr =
-        buildDeclRefExpr(*this, DstVD, Type, DE->getExprLoc());
-    auto AssignmentOp = BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign,
+        buildVarDecl(*this, RefExpr->getLocStart(), Type, ".copyprivate.dst",
+                     D->hasAttrs() ? &D->getAttrs() : nullptr);
+    auto *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc);
+    auto AssignmentOp = BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign,
                                    PseudoDstExpr, PseudoSrcExpr);
     if (AssignmentOp.isInvalid())
       continue;
-    AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(),
+    AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), ELoc,
                                        /*DiscardedValue=*/true);
     if (AssignmentOp.isInvalid())
       continue;
 
     // No need to mark vars as copyprivate, they are already threadprivate or
     // implicitly private.
-    Vars.push_back(DE);
+    assert(VD || IsOpenMPCapturedDecl(D));
+    Vars.push_back(
+        VD ? RefExpr->IgnoreParens()
+           : buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false));
     SrcExprs.push_back(PseudoSrcExpr);
     DstExprs.push_back(PseudoDstExpr);
     AssignmentOps.push_back(AssignmentOp.get());
@@ -8141,6 +10368,7 @@
     return nullptr;
   }
   SmallVector<Expr *, 8> Vars;
+  DSAStackTy::OperatorOffsetTy OpsOffs;
   llvm::APSInt DepCounter(/*BitWidth=*/32);
   llvm::APSInt TotalDepCount(/*BitWidth=*/32);
   if (DepKind == OMPC_DEPEND_sink) {
@@ -8153,8 +10381,7 @@
       DSAStack->getParentOrderedRegionParam()) {
     for (auto &RefExpr : VarList) {
       assert(RefExpr && "NULL expr in OpenMP shared clause.");
-      if (isa<DependentScopeDeclRefExpr>(RefExpr) ||
-          (DepKind == OMPC_DEPEND_sink && CurContext->isDependentContext())) {
+      if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
         // It will be analyzed later.
         Vars.push_back(RefExpr);
         continue;
@@ -8176,61 +10403,66 @@
         // directive, xi denotes the loop iteration variable of the i-th nested
         // loop associated with the loop directive, and di is a constant
         // non-negative integer.
-        SimpleExpr = SimpleExpr->IgnoreImplicit();
-        auto *DE = dyn_cast<DeclRefExpr>(SimpleExpr);
-        if (!DE) {
-          OverloadedOperatorKind OOK = OO_None;
-          SourceLocation OOLoc;
-          Expr *LHS, *RHS;
-          if (auto *BO = dyn_cast<BinaryOperator>(SimpleExpr)) {
-            OOK = BinaryOperator::getOverloadedOperator(BO->getOpcode());
-            OOLoc = BO->getOperatorLoc();
-            LHS = BO->getLHS()->IgnoreParenImpCasts();
-            RHS = BO->getRHS()->IgnoreParenImpCasts();
-          } else if (auto *OCE = dyn_cast<CXXOperatorCallExpr>(SimpleExpr)) {
-            OOK = OCE->getOperator();
-            OOLoc = OCE->getOperatorLoc();
-            LHS = OCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
-            RHS = OCE->getArg(/*Arg=*/1)->IgnoreParenImpCasts();
-          } else if (auto *MCE = dyn_cast<CXXMemberCallExpr>(SimpleExpr)) {
-            OOK = MCE->getMethodDecl()
-                      ->getNameInfo()
-                      .getName()
-                      .getCXXOverloadedOperator();
-            OOLoc = MCE->getCallee()->getExprLoc();
-            LHS = MCE->getImplicitObjectArgument()->IgnoreParenImpCasts();
-            RHS = MCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
-          } else {
-            Diag(ELoc, diag::err_omp_depend_sink_wrong_expr);
-            continue;
-          }
-          DE = dyn_cast<DeclRefExpr>(LHS);
-          if (!DE) {
-            Diag(LHS->getExprLoc(),
-                 diag::err_omp_depend_sink_expected_loop_iteration)
-                << DSAStack->getParentLoopControlVariable(
-                    DepCounter.getZExtValue());
-            continue;
-          }
-          if (OOK != OO_Plus && OOK != OO_Minus) {
-            Diag(OOLoc, diag::err_omp_depend_sink_expected_plus_minus);
-            continue;
-          }
-          ExprResult Res = VerifyPositiveIntegerConstantInClause(
-              RHS, OMPC_depend, /*StrictlyPositive=*/false);
-          if (Res.isInvalid())
-            continue;
-        }
-        auto *VD = dyn_cast<VarDecl>(DE->getDecl());
-        if (!CurContext->isDependentContext() &&
-            DSAStack->getParentOrderedRegionParam() &&
-            (!VD || DepCounter != DSAStack->isParentLoopControlVariable(VD))) {
-          Diag(DE->getExprLoc(),
-               diag::err_omp_depend_sink_expected_loop_iteration)
-              << DSAStack->getParentLoopControlVariable(
-                  DepCounter.getZExtValue());
+        if (CurContext->isDependentContext()) {
+          // It will be analyzed later.
+          Vars.push_back(RefExpr);
           continue;
         }
+        SimpleExpr = SimpleExpr->IgnoreImplicit();
+        OverloadedOperatorKind OOK = OO_None;
+        SourceLocation OOLoc;
+        Expr *LHS = SimpleExpr;
+        Expr *RHS = nullptr;
+        if (auto *BO = dyn_cast<BinaryOperator>(SimpleExpr)) {
+          OOK = BinaryOperator::getOverloadedOperator(BO->getOpcode());
+          OOLoc = BO->getOperatorLoc();
+          LHS = BO->getLHS()->IgnoreParenImpCasts();
+          RHS = BO->getRHS()->IgnoreParenImpCasts();
+        } else if (auto *OCE = dyn_cast<CXXOperatorCallExpr>(SimpleExpr)) {
+          OOK = OCE->getOperator();
+          OOLoc = OCE->getOperatorLoc();
+          LHS = OCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
+          RHS = OCE->getArg(/*Arg=*/1)->IgnoreParenImpCasts();
+        } else if (auto *MCE = dyn_cast<CXXMemberCallExpr>(SimpleExpr)) {
+          OOK = MCE->getMethodDecl()
+                    ->getNameInfo()
+                    .getName()
+                    .getCXXOverloadedOperator();
+          OOLoc = MCE->getCallee()->getExprLoc();
+          LHS = MCE->getImplicitObjectArgument()->IgnoreParenImpCasts();
+          RHS = MCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
+        }
+        SourceLocation ELoc;
+        SourceRange ERange;
+        auto Res = getPrivateItem(*this, LHS, ELoc, ERange,
+                                  /*AllowArraySection=*/false);
+        if (Res.second) {
+          // It will be analyzed later.
+          Vars.push_back(RefExpr);
+        }
+        ValueDecl *D = Res.first;
+        if (!D)
+          continue;
+
+        if (OOK != OO_Plus && OOK != OO_Minus && (RHS || OOK != OO_None)) {
+          Diag(OOLoc, diag::err_omp_depend_sink_expected_plus_minus);
+          continue;
+        }
+        if (RHS) {
+          ExprResult RHSRes = VerifyPositiveIntegerConstantInClause(
+              RHS, OMPC_depend, /*StrictlyPositive=*/false);
+          if (RHSRes.isInvalid())
+            continue;
+        }
+        if (!CurContext->isDependentContext() &&
+            DSAStack->getParentOrderedRegionParam() &&
+            DepCounter != DSAStack->isParentLoopControlVariable(D).first) {
+          Diag(ELoc, diag::err_omp_depend_sink_expected_loop_iteration)
+              << DSAStack->getParentLoopControlVariable(
+                     DepCounter.getZExtValue());
+          continue;
+        }
+        OpsOffs.push_back({RHS, OOK});
       } else {
         // OpenMP  [2.11.1.1, Restrictions, p.3]
         //  A variable that is part of another variable (such as a field of a
@@ -8241,14 +10473,17 @@
         auto *OASE = dyn_cast<OMPArraySectionExpr>(SimpleExpr);
         if (!RefExpr->IgnoreParenImpCasts()->isLValue() ||
             (!ASE && !DE && !OASE) || (DE && !isa<VarDecl>(DE->getDecl())) ||
-            (ASE && !ASE->getBase()->getType()->isAnyPointerType() &&
-             !ASE->getBase()->getType()->isArrayType())) {
-          Diag(ELoc, diag::err_omp_expected_var_name_or_array_item)
-              << RefExpr->getSourceRange();
+            (ASE &&
+             !ASE->getBase()
+                  ->getType()
+                  .getNonReferenceType()
+                  ->isPointerType() &&
+             !ASE->getBase()->getType().getNonReferenceType()->isArrayType())) {
+          Diag(ELoc, diag::err_omp_expected_var_name_member_expr_or_array_item)
+              << 0 << RefExpr->getSourceRange();
           continue;
         }
       }
-
       Vars.push_back(RefExpr->IgnoreParenImpCasts());
     }
 
@@ -8262,9 +10497,11 @@
         Vars.empty())
       return nullptr;
   }
-
-  return OMPDependClause::Create(Context, StartLoc, LParenLoc, EndLoc, DepKind,
-                                 DepLoc, ColonLoc, Vars);
+  auto *C = OMPDependClause::Create(Context, StartLoc, LParenLoc, EndLoc,
+                                    DepKind, DepLoc, ColonLoc, Vars);
+  if (DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source)
+    DSAStack->addDoacrossDependClause(C, OpsOffs);
+  return C;
 }
 
 OMPClause *Sema::ActOnOpenMPDeviceClause(Expr *Device, SourceLocation StartLoc,
@@ -8332,135 +10569,981 @@
     SemaRef.Diag(SL, diag::err_incomplete_type) << QTy << SR;
     return false;
   } else if (CXXRecordDecl *RD = dyn_cast_or_null<CXXRecordDecl>(ND)) {
-    if (!RD->isInvalidDecl() &&
-        !IsCXXRecordForMappable(SemaRef, SL, Stack, RD))
+    if (!RD->isInvalidDecl() && !IsCXXRecordForMappable(SemaRef, SL, Stack, RD))
       return false;
   }
   return true;
 }
 
-OMPClause *Sema::ActOnOpenMPMapClause(
-    OpenMPMapClauseKind MapTypeModifier, OpenMPMapClauseKind MapType,
-    SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VarList,
-    SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
-  SmallVector<Expr *, 4> Vars;
+/// \brief Return true if it can be proven that the provided array expression
+/// (array section or array subscript) does NOT specify the whole size of the
+/// array whose base type is \a BaseQTy.
+static bool CheckArrayExpressionDoesNotReferToWholeSize(Sema &SemaRef,
+                                                        const Expr *E,
+                                                        QualType BaseQTy) {
+  auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
 
-  for (auto &RE : VarList) {
-    assert(RE && "Null expr in omp map");
-    if (isa<DependentScopeDeclRefExpr>(RE)) {
-      // It will be analyzed later.
-      Vars.push_back(RE);
+  // If this is an array subscript, it refers to the whole size if the size of
+  // the dimension is constant and equals 1. Also, an array section assumes the
+  // format of an array subscript if no colon is used.
+  if (isa<ArraySubscriptExpr>(E) || (OASE && OASE->getColonLoc().isInvalid())) {
+    if (auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
+      return ATy->getSize().getSExtValue() != 1;
+    // Size can't be evaluated statically.
+    return false;
+  }
+
+  assert(OASE && "Expecting array section if not an array subscript.");
+  auto *LowerBound = OASE->getLowerBound();
+  auto *Length = OASE->getLength();
+
+  // If there is a lower bound that does not evaluate to zero, we are not
+  // covering the whole dimension.
+  if (LowerBound) {
+    llvm::APSInt ConstLowerBound;
+    if (!LowerBound->EvaluateAsInt(ConstLowerBound, SemaRef.getASTContext()))
+      return false; // Can't get the integer value as a constant.
+    if (ConstLowerBound.getSExtValue())
+      return true;
+  }
+
+  // If we don't have a length we are covering the whole dimension.
+  if (!Length)
+    return false;
+
+  // If the base is a pointer, we don't have a way to get the size of the
+  // pointee.
+  if (BaseQTy->isPointerType())
+    return false;
+
+  // We can only check if the length is the same as the size of the dimension
+  // if we have a constant array.
+  auto *CATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr());
+  if (!CATy)
+    return false;
+
+  llvm::APSInt ConstLength;
+  if (!Length->EvaluateAsInt(ConstLength, SemaRef.getASTContext()))
+    return false; // Can't get the integer value as a constant.
+
+  return CATy->getSize().getSExtValue() != ConstLength.getSExtValue();
+}
+
+// Return true if it can be proven that the provided array expression (array
+// section or array subscript) does NOT specify a single element of the array
+// whose base type is \a BaseQTy.
+static bool CheckArrayExpressionDoesNotReferToUnitySize(Sema &SemaRef,
+                                                        const Expr *E,
+                                                        QualType BaseQTy) {
+  auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
+
+  // An array subscript always refers to a single element. Also, an array
+  // section assumes the format of an array subscript if no colon is used.
+  if (isa<ArraySubscriptExpr>(E) || (OASE && OASE->getColonLoc().isInvalid()))
+    return false;
+
+  assert(OASE && "Expecting array section if not an array subscript.");
+  auto *Length = OASE->getLength();
+
+  // If we don't have a length we have to check if the array has unitary size
+  // for this dimension. Also, we should always expect a length if the base type
+  // is pointer.
+  if (!Length) {
+    if (auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
+      return ATy->getSize().getSExtValue() != 1;
+    // We cannot assume anything.
+    return false;
+  }
+
+  // Check if the length evaluates to 1.
+  llvm::APSInt ConstLength;
+  if (!Length->EvaluateAsInt(ConstLength, SemaRef.getASTContext()))
+    return false; // Can't get the integer value as a constant.
+
+  return ConstLength.getSExtValue() != 1;
+}
+
+// Return the expression of the base of the mappable expression or null if it
+// cannot be determined and do all the necessary checks to see if the expression
+// is valid as a standalone mappable expression. In the process, record all the
+// components of the expression.
+static Expr *CheckMapClauseExpressionBase(
+    Sema &SemaRef, Expr *E,
+    OMPClauseMappableExprCommon::MappableExprComponentList &CurComponents,
+    OpenMPClauseKind CKind) {
+  SourceLocation ELoc = E->getExprLoc();
+  SourceRange ERange = E->getSourceRange();
+
+  // The base of a list item in a map clause has to be either:
+  //  - a reference to variable or field.
+  //  - a member expression.
+  //  - an array expression.
+  //
+  // E.g. if we have the expression 'r.S.Arr[:12]', we want to retrieve the
+  // reference to 'r'.
+  //
+  // If we have:
+  //
+  // struct SS {
+  //   Bla S;
+  //   foo() {
+  //     #pragma omp target map (S.Arr[:12]);
+  //   }
+  // }
+  //
+  // We want to retrieve the member expression 'this->S';
+
+  Expr *RelevantExpr = nullptr;
+
+  // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.2]
+  //  If a list item is an array section, it must specify contiguous storage.
+  //
+  // For this restriction it is sufficient to make sure that the expression
+  // contains only references to variables or fields and array expressions, and
+  // that no array sections exist except in the rightmost expression (unless
+  // they cover the whole dimension of the array). E.g. these would be invalid:
+  //
+  //   r.ArrS[3:5].Arr[6:7]
+  //
+  //   r.ArrS[3:5].x
+  //
+  // but these would be valid:
+  //   r.ArrS[3].Arr[6:7]
+  //
+  //   r.ArrS[3].x
+
+  bool AllowUnitySizeArraySection = true;
+  bool AllowWholeSizeArraySection = true;
+
+  while (!RelevantExpr) {
+    E = E->IgnoreParenImpCasts();
+
+    if (auto *CurE = dyn_cast<DeclRefExpr>(E)) {
+      if (!isa<VarDecl>(CurE->getDecl()))
+        break;
+
+      RelevantExpr = CurE;
+
+      // If we got a reference to a declaration, we should not expect any array
+      // section before that.
+      AllowUnitySizeArraySection = false;
+      AllowWholeSizeArraySection = false;
+
+      // Record the component.
+      CurComponents.push_back(OMPClauseMappableExprCommon::MappableComponent(
+          CurE, CurE->getDecl()));
       continue;
     }
+
+    if (auto *CurE = dyn_cast<MemberExpr>(E)) {
+      auto *BaseE = CurE->getBase()->IgnoreParenImpCasts();
+
+      if (isa<CXXThisExpr>(BaseE))
+        // We found a base expression: this->Val.
+        RelevantExpr = CurE;
+      else
+        E = BaseE;
+
+      if (!isa<FieldDecl>(CurE->getMemberDecl())) {
+        SemaRef.Diag(ELoc, diag::err_omp_expected_access_to_data_field)
+            << CurE->getSourceRange();
+        break;
+      }
+
+      auto *FD = cast<FieldDecl>(CurE->getMemberDecl());
+
+      // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.3]
+      //  A bit-field cannot appear in a map clause.
+      //
+      if (FD->isBitField()) {
+        SemaRef.Diag(ELoc, diag::err_omp_bit_fields_forbidden_in_clause)
+            << CurE->getSourceRange() << getOpenMPClauseName(CKind);
+        break;
+      }
+
+      // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
+      //  If the type of a list item is a reference to a type T then the type
+      //  will be considered to be T for all purposes of this clause.
+      QualType CurType = BaseE->getType().getNonReferenceType();
+
+      // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.2]
+      //  A list item cannot be a variable that is a member of a structure with
+      //  a union type.
+      //
+      if (auto *RT = CurType->getAs<RecordType>())
+        if (RT->isUnionType()) {
+          SemaRef.Diag(ELoc, diag::err_omp_union_type_not_allowed)
+              << CurE->getSourceRange();
+          break;
+        }
+
+      // If we got a member expression, we should not expect any array section
+      // before that:
+      //
+      // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.7]
+      //  If a list item is an element of a structure, only the rightmost symbol
+      //  of the variable reference can be an array section.
+      //
+      AllowUnitySizeArraySection = false;
+      AllowWholeSizeArraySection = false;
+
+      // Record the component.
+      CurComponents.push_back(
+          OMPClauseMappableExprCommon::MappableComponent(CurE, FD));
+      continue;
+    }
+
+    if (auto *CurE = dyn_cast<ArraySubscriptExpr>(E)) {
+      E = CurE->getBase()->IgnoreParenImpCasts();
+
+      if (!E->getType()->isAnyPointerType() && !E->getType()->isArrayType()) {
+        SemaRef.Diag(ELoc, diag::err_omp_expected_base_var_name)
+            << 0 << CurE->getSourceRange();
+        break;
+      }
+
+      // If we got an array subscript that expresses the whole dimension we
+      // can have any array expressions before. If it is only expressing part
+      // of the dimension, we can only have unitary-size array expressions.
+      if (CheckArrayExpressionDoesNotReferToWholeSize(SemaRef, CurE,
+                                                      E->getType()))
+        AllowWholeSizeArraySection = false;
+
+      // Record the component - we don't have any declaration associated.
+      CurComponents.push_back(
+          OMPClauseMappableExprCommon::MappableComponent(CurE, nullptr));
+      continue;
+    }
+
+    if (auto *CurE = dyn_cast<OMPArraySectionExpr>(E)) {
+      E = CurE->getBase()->IgnoreParenImpCasts();
+
+      auto CurType =
+          OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
+
+      // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
+      //  If the type of a list item is a reference to a type T then the type
+      //  will be considered to be T for all purposes of this clause.
+      if (CurType->isReferenceType())
+        CurType = CurType->getPointeeType();
+
+      bool IsPointer = CurType->isAnyPointerType();
+
+      if (!IsPointer && !CurType->isArrayType()) {
+        SemaRef.Diag(ELoc, diag::err_omp_expected_base_var_name)
+            << 0 << CurE->getSourceRange();
+        break;
+      }
+
+      bool NotWhole =
+          CheckArrayExpressionDoesNotReferToWholeSize(SemaRef, CurE, CurType);
+      bool NotUnity =
+          CheckArrayExpressionDoesNotReferToUnitySize(SemaRef, CurE, CurType);
+
+      if (AllowWholeSizeArraySection) {
+        // Any array section is currently allowed. Allowing a whole size array
+        // section implies allowing a unity array section as well.
+        //
+        // If this array section refers to the whole dimension we can still
+        // accept other array sections before this one, except if the base is a
+        // pointer. Otherwise, only unitary sections are accepted.
+        if (NotWhole || IsPointer)
+          AllowWholeSizeArraySection = false;
+      } else if (AllowUnitySizeArraySection && NotUnity) {
+        // A unity or whole array section is not allowed and that is not
+        // compatible with the properties of the current array section.
+        SemaRef.Diag(
+            ELoc, diag::err_array_section_does_not_specify_contiguous_storage)
+            << CurE->getSourceRange();
+        break;
+      }
+
+      // Record the component - we don't have any declaration associated.
+      CurComponents.push_back(
+          OMPClauseMappableExprCommon::MappableComponent(CurE, nullptr));
+      continue;
+    }
+
+    // If nothing else worked, this is not a valid map clause expression.
+    SemaRef.Diag(ELoc,
+                 diag::err_omp_expected_named_var_member_or_array_expression)
+        << ERange;
+    break;
+  }
+
+  return RelevantExpr;
+}
+
+// Return true if expression E associated with value VD has conflicts with other
+// map information.
+static bool CheckMapConflicts(
+    Sema &SemaRef, DSAStackTy *DSAS, ValueDecl *VD, Expr *E,
+    bool CurrentRegionOnly,
+    OMPClauseMappableExprCommon::MappableExprComponentListRef CurComponents,
+    OpenMPClauseKind CKind) {
+  assert(VD && E);
+  SourceLocation ELoc = E->getExprLoc();
+  SourceRange ERange = E->getSourceRange();
+
+  // In order to easily check the conflicts we need to match each component of
+  // the expression under test with the components of the expressions that are
+  // already in the stack.
+
+  assert(!CurComponents.empty() && "Map clause expression with no components!");
+  assert(CurComponents.back().getAssociatedDeclaration() == VD &&
+         "Map clause expression with unexpected base!");
+
+  // Variables to help detect enclosing problems in data environment nests.
+  bool IsEnclosedByDataEnvironmentExpr = false;
+  const Expr *EnclosingExpr = nullptr;
+
+  bool FoundError = DSAS->checkMappableExprComponentListsForDecl(
+      VD, CurrentRegionOnly,
+      [&](OMPClauseMappableExprCommon::MappableExprComponentListRef
+              StackComponents,
+          OpenMPClauseKind) -> bool {
+
+        assert(!StackComponents.empty() &&
+               "Map clause expression with no components!");
+        assert(StackComponents.back().getAssociatedDeclaration() == VD &&
+               "Map clause expression with unexpected base!");
+
+        // The whole expression in the stack.
+        auto *RE = StackComponents.front().getAssociatedExpression();
+
+        // Expressions must start from the same base. Here we detect at which
+        // point both expressions diverge from each other and see if we can
+        // detect if the memory referred to by both expressions is contiguous
+        // and does not overlap.
+        auto CI = CurComponents.rbegin();
+        auto CE = CurComponents.rend();
+        auto SI = StackComponents.rbegin();
+        auto SE = StackComponents.rend();
+        for (; CI != CE && SI != SE; ++CI, ++SI) {
+
+          // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.3]
+          //  At most one list item can be an array item derived from a given
+          //  variable in map clauses of the same construct.
+          if (CurrentRegionOnly &&
+              (isa<ArraySubscriptExpr>(CI->getAssociatedExpression()) ||
+               isa<OMPArraySectionExpr>(CI->getAssociatedExpression())) &&
+              (isa<ArraySubscriptExpr>(SI->getAssociatedExpression()) ||
+               isa<OMPArraySectionExpr>(SI->getAssociatedExpression()))) {
+            SemaRef.Diag(CI->getAssociatedExpression()->getExprLoc(),
+                         diag::err_omp_multiple_array_items_in_map_clause)
+                << CI->getAssociatedExpression()->getSourceRange();
+            SemaRef.Diag(SI->getAssociatedExpression()->getExprLoc(),
+                         diag::note_used_here)
+                << SI->getAssociatedExpression()->getSourceRange();
+            return true;
+          }
+
+          // Do both expressions have the same kind?
+          if (CI->getAssociatedExpression()->getStmtClass() !=
+              SI->getAssociatedExpression()->getStmtClass())
+            break;
+
+          // Are we dealing with different variables/fields?
+          if (CI->getAssociatedDeclaration() != SI->getAssociatedDeclaration())
+            break;
+        }
+        // Check if the extra components of the expressions in the enclosing
+        // data environment are redundant for the current base declaration.
+        // If they are, the maps completely overlap, which is legal.
+        for (; SI != SE; ++SI) {
+          QualType Type;
+          if (auto *ASE =
+                  dyn_cast<ArraySubscriptExpr>(SI->getAssociatedExpression())) {
+            Type = ASE->getBase()->IgnoreParenImpCasts()->getType();
+          } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(
+                         SI->getAssociatedExpression())) {
+            auto *E = OASE->getBase()->IgnoreParenImpCasts();
+            Type =
+                OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
+          }
+          if (Type.isNull() || Type->isAnyPointerType() ||
+              CheckArrayExpressionDoesNotReferToWholeSize(
+                  SemaRef, SI->getAssociatedExpression(), Type))
+            break;
+        }
+
+        // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.4]
+        //  List items of map clauses in the same construct must not share
+        //  original storage.
+        //
+        // If the expressions are exactly the same or one is a subset of the
+        // other, it means they are sharing storage.
+        if (CI == CE && SI == SE) {
+          if (CurrentRegionOnly) {
+            if (CKind == OMPC_map)
+              SemaRef.Diag(ELoc, diag::err_omp_map_shared_storage) << ERange;
+            else {
+              assert(CKind == OMPC_to || CKind == OMPC_from);
+              SemaRef.Diag(ELoc, diag::err_omp_once_referenced_in_target_update)
+                  << ERange;
+            }
+            SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
+                << RE->getSourceRange();
+            return true;
+          } else {
+            // If we find the same expression in the enclosing data environment,
+            // that is legal.
+            IsEnclosedByDataEnvironmentExpr = true;
+            return false;
+          }
+        }
+
+        QualType DerivedType =
+            std::prev(CI)->getAssociatedDeclaration()->getType();
+        SourceLocation DerivedLoc =
+            std::prev(CI)->getAssociatedExpression()->getExprLoc();
+
+        // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
+        //  If the type of a list item is a reference to a type T then the type
+        //  will be considered to be T for all purposes of this clause.
+        DerivedType = DerivedType.getNonReferenceType();
+
+        // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.1]
+        //  A variable for which the type is pointer and an array section
+        //  derived from that variable must not appear as list items of map
+        //  clauses of the same construct.
+        //
+        // Also, cover one of the cases in:
+        // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.5]
+        //  If any part of the original storage of a list item has corresponding
+        //  storage in the device data environment, all of the original storage
+        //  must have corresponding storage in the device data environment.
+        //
+        if (DerivedType->isAnyPointerType()) {
+          if (CI == CE || SI == SE) {
+            SemaRef.Diag(
+                DerivedLoc,
+                diag::err_omp_pointer_mapped_along_with_derived_section)
+                << DerivedLoc;
+          } else {
+            assert(CI != CE && SI != SE);
+            SemaRef.Diag(DerivedLoc, diag::err_omp_same_pointer_derreferenced)
+                << DerivedLoc;
+          }
+          SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
+              << RE->getSourceRange();
+          return true;
+        }
+
+        // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.4]
+        //  List items of map clauses in the same construct must not share
+        //  original storage.
+        //
+        // An expression is a subset of the other.
+        if (CurrentRegionOnly && (CI == CE || SI == SE)) {
+          if (CKind == OMPC_map)
+            SemaRef.Diag(ELoc, diag::err_omp_map_shared_storage) << ERange;
+          else {
+            assert(CKind == OMPC_to || CKind == OMPC_from);
+            SemaRef.Diag(ELoc, diag::err_omp_once_referenced_in_target_update)
+                << ERange;
+          }
+          SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
+              << RE->getSourceRange();
+          return true;
+        }
+
+        // The current expression uses the same base as other expression in the
+        // data environment but does not contain it completely.
+        if (!CurrentRegionOnly && SI != SE)
+          EnclosingExpr = RE;
+
+        // The current expression is a subset of the expression in the data
+        // environment.
+        IsEnclosedByDataEnvironmentExpr |=
+            (!CurrentRegionOnly && CI != CE && SI == SE);
+
+        return false;
+      });
+
+  if (CurrentRegionOnly)
+    return FoundError;
+
+  // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.5]
+  //  If any part of the original storage of a list item has corresponding
+  //  storage in the device data environment, all of the original storage must
+  //  have corresponding storage in the device data environment.
+  // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.6]
+  //  If a list item is an element of a structure, and a different element of
+  //  the structure has a corresponding list item in the device data environment
+  //  prior to a task encountering the construct associated with the map clause,
+  //  then the list item must also have a corresponding list item in the device
+  //  data environment prior to the task encountering the construct.
+  //
+  if (EnclosingExpr && !IsEnclosedByDataEnvironmentExpr) {
+    SemaRef.Diag(ELoc,
+                 diag::err_omp_original_storage_is_shared_and_does_not_contain)
+        << ERange;
+    SemaRef.Diag(EnclosingExpr->getExprLoc(), diag::note_used_here)
+        << EnclosingExpr->getSourceRange();
+    return true;
+  }
+
+  return FoundError;
+}
+
+namespace {
+// Utility struct that gathers all the related lists associated with a mappable
+// expression.
+struct MappableVarListInfo final {
+  // The list of expressions.
+  ArrayRef<Expr *> VarList;
+  // The list of processed expressions.
+  SmallVector<Expr *, 16> ProcessedVarList;
+  // The mappable components for each expression.
+  OMPClauseMappableExprCommon::MappableExprComponentLists VarComponents;
+  // The base declaration of the variable.
+  SmallVector<ValueDecl *, 16> VarBaseDeclarations;
+
+  MappableVarListInfo(ArrayRef<Expr *> VarList) : VarList(VarList) {
+    // We have a list of components and base declarations for each entry in the
+    // variable list.
+    VarComponents.reserve(VarList.size());
+    VarBaseDeclarations.reserve(VarList.size());
+  }
+};
+}
+
+// Check the validity of the provided variable list for the provided clause kind
+// \a CKind. In the check process the valid expressions, and mappable expression
+// components and variables are extracted and used to fill the corresponding
+// lists of \a MVLI. \a MapType and \a IsMapTypeImplicit are expected to be
+// valid if the clause kind is 'map'.
+static void
+checkMappableExpressionList(Sema &SemaRef, DSAStackTy *DSAS,
+                            OpenMPClauseKind CKind, MappableVarListInfo &MVLI,
+                            SourceLocation StartLoc,
+                            OpenMPMapClauseKind MapType = OMPC_MAP_unknown,
+                            bool IsMapTypeImplicit = false) {
+  // We only expect mappable expressions in 'to', 'from', and 'map' clauses.
+  assert((CKind == OMPC_map || CKind == OMPC_to || CKind == OMPC_from) &&
+         "Unexpected clause kind with mappable expressions!");
+
+  // Keep track of the mappable components and base declarations in this clause.
+  // Each entry in the list is going to have a list of components associated. We
+  // record each set of the components so that we can build the clause later on.
+  // In the end we should have the same amount of declarations and component
+  // lists.
+
+  for (auto &RE : MVLI.VarList) {
+    assert(RE && "Null expr in omp to/from/map clause");
     SourceLocation ELoc = RE->getExprLoc();
 
-    // OpenMP [2.14.5, Restrictions]
-    //  A variable that is part of another variable (such as field of a
-    //  structure) but is not an array element or an array section cannot appear
-    //  in a map clause.
     auto *VE = RE->IgnoreParenLValueCasts();
 
     if (VE->isValueDependent() || VE->isTypeDependent() ||
         VE->isInstantiationDependent() ||
         VE->containsUnexpandedParameterPack()) {
-      // It will be analyzed later.
-      Vars.push_back(RE);
+      // We can only analyze this information once the missing information is
+      // resolved.
+      MVLI.ProcessedVarList.push_back(RE);
       continue;
     }
 
     auto *SimpleExpr = RE->IgnoreParenCasts();
-    auto *DE = dyn_cast<DeclRefExpr>(SimpleExpr);
-    auto *ASE = dyn_cast<ArraySubscriptExpr>(SimpleExpr);
-    auto *OASE = dyn_cast<OMPArraySectionExpr>(SimpleExpr);
 
-    if (!RE->IgnoreParenImpCasts()->isLValue() ||
-        (!OASE && !ASE && !DE) ||
-        (DE && !isa<VarDecl>(DE->getDecl())) ||
-        (ASE && !ASE->getBase()->getType()->isAnyPointerType() &&
-         !ASE->getBase()->getType()->isArrayType())) {
-      Diag(ELoc, diag::err_omp_expected_var_name_or_array_item)
-        << RE->getSourceRange();
+    if (!RE->IgnoreParenImpCasts()->isLValue()) {
+      SemaRef.Diag(ELoc,
+                   diag::err_omp_expected_named_var_member_or_array_expression)
+          << RE->getSourceRange();
       continue;
     }
 
-    Decl *D = nullptr;
-    if (DE) {
-      D = DE->getDecl();
-    } else if (ASE) {
-      auto *B = ASE->getBase()->IgnoreParenCasts();
-      D = dyn_cast<DeclRefExpr>(B)->getDecl();
-    } else if (OASE) {
-      auto *B = OASE->getBase();
-      D = dyn_cast<DeclRefExpr>(B)->getDecl();
-    }
-    assert(D && "Null decl on map clause.");
-    auto *VD = cast<VarDecl>(D);
+    OMPClauseMappableExprCommon::MappableExprComponentList CurComponents;
+    ValueDecl *CurDeclaration = nullptr;
 
-    // OpenMP [2.14.5, Restrictions, p.8]
+    // Obtain the array or member expression bases if required. Also, fill the
+    // components array with all the components identified in the process.
+    auto *BE =
+        CheckMapClauseExpressionBase(SemaRef, SimpleExpr, CurComponents, CKind);
+    if (!BE)
+      continue;
+
+    assert(!CurComponents.empty() &&
+           "Invalid mappable expression information.");
+
+    // For the following checks, we rely on the base declaration which is
+    // expected to be associated with the last component. The declaration is
+    // expected to be a variable or a field (if 'this' is being mapped).
+    CurDeclaration = CurComponents.back().getAssociatedDeclaration();
+    assert(CurDeclaration && "Null decl on map clause.");
+    assert(
+        CurDeclaration->isCanonicalDecl() &&
+        "Expecting components to have associated only canonical declarations.");
+
+    auto *VD = dyn_cast<VarDecl>(CurDeclaration);
+    auto *FD = dyn_cast<FieldDecl>(CurDeclaration);
+
+    assert((VD || FD) && "Only variables or fields are expected here!");
+    (void)FD;
+
+    // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.10]
     // threadprivate variables cannot appear in a map clause.
-    if (DSAStack->isThreadPrivate(VD)) {
-      auto DVar = DSAStack->getTopDSA(VD, false);
-      Diag(ELoc, diag::err_omp_threadprivate_in_map);
-      ReportOriginalDSA(*this, DSAStack, VD, DVar);
+    // OpenMP 4.5 [2.10.5, target update Construct]
+    // threadprivate variables cannot appear in a from clause.
+    if (VD && DSAS->isThreadPrivate(VD)) {
+      auto DVar = DSAS->getTopDSA(VD, false);
+      SemaRef.Diag(ELoc, diag::err_omp_threadprivate_in_clause)
+          << getOpenMPClauseName(CKind);
+      ReportOriginalDSA(SemaRef, DSAS, VD, DVar);
       continue;
     }
 
-    // OpenMP [2.14.5, Restrictions, p.2]
-    //  At most one list item can be an array item derived from a given variable
-    //  in map clauses of the same construct.
-    // OpenMP [2.14.5, Restrictions, p.3]
-    //  List items of map clauses in the same construct must not share original
-    //  storage.
-    // OpenMP [2.14.5, Restrictions, C/C++, p.2]
-    //  A variable for which the type is pointer, reference to array, or
-    //  reference to pointer and an array section derived from that variable
-    //  must not appear as list items of map clauses of the same construct.
-    DSAStackTy::MapInfo MI = DSAStack->IsMappedInCurrentRegion(VD);
-    if (MI.RefExpr) {
-      Diag(ELoc, diag::err_omp_map_shared_storage) << ELoc;
-      Diag(MI.RefExpr->getExprLoc(), diag::note_used_here)
-          << MI.RefExpr->getSourceRange();
-      continue;
-    }
+    // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.9]
+    //  A list item cannot appear in both a map clause and a data-sharing
+    //  attribute clause on the same construct.
 
-    // OpenMP [2.14.5, Restrictions, C/C++, p.3,4]
-    //  A variable for which the type is pointer, reference to array, or
-    //  reference to pointer must not appear as a list item if the enclosing
-    //  device data environment already contains an array section derived from
-    //  that variable.
-    //  An array section derived from a variable for which the type is pointer,
-    //  reference to array, or reference to pointer must not appear as a list
-    //  item if the enclosing device data environment already contains that
-    //  variable.
-    QualType Type = VD->getType();
-    MI = DSAStack->getMapInfoForVar(VD);
-    if (MI.RefExpr && (isa<DeclRefExpr>(MI.RefExpr->IgnoreParenLValueCasts()) !=
-                       isa<DeclRefExpr>(VE)) &&
-        (Type->isPointerType() || Type->isReferenceType())) {
-      Diag(ELoc, diag::err_omp_map_shared_storage) << ELoc;
-      Diag(MI.RefExpr->getExprLoc(), diag::note_used_here)
-          << MI.RefExpr->getSourceRange();
-      continue;
-    }
+    // Check conflicts with other map clause expressions. We check the conflicts
+    // with the current construct separately from the enclosing data
+    // environment, because the restrictions are different. We only have to
+    // check conflicts across regions for the map clauses.
+    if (CheckMapConflicts(SemaRef, DSAS, CurDeclaration, SimpleExpr,
+                          /*CurrentRegionOnly=*/true, CurComponents, CKind))
+      break;
+    if (CKind == OMPC_map &&
+        CheckMapConflicts(SemaRef, DSAS, CurDeclaration, SimpleExpr,
+                          /*CurrentRegionOnly=*/false, CurComponents, CKind))
+      break;
 
-    // OpenMP [2.14.5, Restrictions, C/C++, p.7]
+    // OpenMP 4.5 [2.10.5, target update Construct]
+    // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
+    //  If the type of a list item is a reference to a type T then the type will
+    //  be considered to be T for all purposes of this clause.
+    QualType Type = CurDeclaration->getType().getNonReferenceType();
+
+    // OpenMP 4.5 [2.10.5, target update Construct, Restrictions, p.4]
+    // A list item in a to or from clause must have a mappable type.
+    // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.9]
     //  A list item must have a mappable type.
-    if (!CheckTypeMappable(VE->getExprLoc(), VE->getSourceRange(), *this,
-                           DSAStack, Type))
+    if (!CheckTypeMappable(VE->getExprLoc(), VE->getSourceRange(), SemaRef,
+                           DSAS, Type))
       continue;
 
-    Vars.push_back(RE);
-    MI.RefExpr = RE;
-    DSAStack->addMapInfoForVar(VD, MI);
-  }
-  if (Vars.empty())
-    return nullptr;
+    if (CKind == OMPC_map) {
+      // target enter data
+      // OpenMP [2.10.2, Restrictions, p. 99]
+      // A map-type must be specified in all map clauses and must be either
+      // to or alloc.
+      OpenMPDirectiveKind DKind = DSAS->getCurrentDirective();
+      if (DKind == OMPD_target_enter_data &&
+          !(MapType == OMPC_MAP_to || MapType == OMPC_MAP_alloc)) {
+        SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
+            << (IsMapTypeImplicit ? 1 : 0)
+            << getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
+            << getOpenMPDirectiveName(DKind);
+        continue;
+      }
 
-  return OMPMapClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars,
-                              MapTypeModifier, MapType, MapLoc);
+      // target exit_data
+      // OpenMP [2.10.3, Restrictions, p. 102]
+      // A map-type must be specified in all map clauses and must be either
+      // from, release, or delete.
+      if (DKind == OMPD_target_exit_data &&
+          !(MapType == OMPC_MAP_from || MapType == OMPC_MAP_release ||
+            MapType == OMPC_MAP_delete)) {
+        SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
+            << (IsMapTypeImplicit ? 1 : 0)
+            << getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
+            << getOpenMPDirectiveName(DKind);
+        continue;
+      }
+
+      // OpenMP 4.5 [2.15.5.1, Restrictions, p.3]
+      // A list item cannot appear in both a map clause and a data-sharing
+      // attribute clause on the same construct
+      if (DKind == OMPD_target && VD) {
+        auto DVar = DSAS->getTopDSA(VD, false);
+        if (isOpenMPPrivate(DVar.CKind)) {
+          SemaRef.Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
+              << getOpenMPClauseName(DVar.CKind)
+              << getOpenMPClauseName(OMPC_map)
+              << getOpenMPDirectiveName(DSAS->getCurrentDirective());
+          ReportOriginalDSA(SemaRef, DSAS, CurDeclaration, DVar);
+          continue;
+        }
+      }
+    }
+
+    // Save the current expression.
+    MVLI.ProcessedVarList.push_back(RE);
+
+    // Store the components in the stack so that they can be used to check
+    // against other clauses later on.
+    DSAS->addMappableExpressionComponents(CurDeclaration, CurComponents,
+                                          /*WhereFoundClauseKind=*/OMPC_map);
+
+    // Save the components and declaration to create the clause. For purposes of
+    // the clause creation, any component list that has base 'this' uses
+    // null as base declaration.
+    MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
+    MVLI.VarComponents.back().append(CurComponents.begin(),
+                                     CurComponents.end());
+    MVLI.VarBaseDeclarations.push_back(isa<MemberExpr>(BE) ? nullptr
+                                                           : CurDeclaration);
+  }
 }
 
-OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams, 
+OMPClause *
+Sema::ActOnOpenMPMapClause(OpenMPMapClauseKind MapTypeModifier,
+                           OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
+                           SourceLocation MapLoc, SourceLocation ColonLoc,
+                           ArrayRef<Expr *> VarList, SourceLocation StartLoc,
+                           SourceLocation LParenLoc, SourceLocation EndLoc) {
+  MappableVarListInfo MVLI(VarList);
+  checkMappableExpressionList(*this, DSAStack, OMPC_map, MVLI, StartLoc,
+                              MapType, IsMapTypeImplicit);
+
+  // We need to produce a map clause even if we don't have variables so that
+  // other diagnostics related to nonexistent map clauses are accurate.
+  return OMPMapClause::Create(Context, StartLoc, LParenLoc, EndLoc,
+                              MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
+                              MVLI.VarComponents, MapTypeModifier, MapType,
+                              IsMapTypeImplicit, MapLoc);
+}
+
+QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc,
+                                               TypeResult ParsedType) {
+  assert(ParsedType.isUsable());
+
+  QualType ReductionType = GetTypeFromParser(ParsedType.get());
+  if (ReductionType.isNull())
+    return QualType();
+
+  // [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions, C/C++
+  // A type name in a declare reduction directive cannot be a function type, an
+  // array type, a reference type, or a type qualified with const, volatile or
+  // restrict.
+  if (ReductionType.hasQualifiers()) {
+    Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 0;
+    return QualType();
+  }
+
+  if (ReductionType->isFunctionType()) {
+    Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 1;
+    return QualType();
+  }
+  if (ReductionType->isReferenceType()) {
+    Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 2;
+    return QualType();
+  }
+  if (ReductionType->isArrayType()) {
+    Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 3;
+    return QualType();
+  }
+  return ReductionType;
+}
+
+Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart(
+    Scope *S, DeclContext *DC, DeclarationName Name,
+    ArrayRef<std::pair<QualType, SourceLocation>> ReductionTypes,
+    AccessSpecifier AS, Decl *PrevDeclInScope) {
+  SmallVector<Decl *, 8> Decls;
+  Decls.reserve(ReductionTypes.size());
+
+  LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPReductionName,
+                      ForRedeclaration);
+  // [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions
+  // A reduction-identifier may not be re-declared in the current scope for the
+  // same type or for a type that is compatible according to the base language
+  // rules.
+  llvm::DenseMap<QualType, SourceLocation> PreviousRedeclTypes;
+  OMPDeclareReductionDecl *PrevDRD = nullptr;
+  bool InCompoundScope = true;
+  if (S != nullptr) {
+    // Find previous declaration with the same name not referenced in other
+    // declarations.
+    FunctionScopeInfo *ParentFn = getEnclosingFunction();
+    InCompoundScope =
+        (ParentFn != nullptr) && !ParentFn->CompoundScopes.empty();
+    LookupName(Lookup, S);
+    FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false,
+                         /*AllowInlineNamespace=*/false);
+    llvm::DenseMap<OMPDeclareReductionDecl *, bool> UsedAsPrevious;
+    auto Filter = Lookup.makeFilter();
+    while (Filter.hasNext()) {
+      auto *PrevDecl = cast<OMPDeclareReductionDecl>(Filter.next());
+      if (InCompoundScope) {
+        auto I = UsedAsPrevious.find(PrevDecl);
+        if (I == UsedAsPrevious.end())
+          UsedAsPrevious[PrevDecl] = false;
+        if (auto *D = PrevDecl->getPrevDeclInScope())
+          UsedAsPrevious[D] = true;
+      }
+      PreviousRedeclTypes[PrevDecl->getType().getCanonicalType()] =
+          PrevDecl->getLocation();
+    }
+    Filter.done();
+    if (InCompoundScope) {
+      for (auto &PrevData : UsedAsPrevious) {
+        if (!PrevData.second) {
+          PrevDRD = PrevData.first;
+          break;
+        }
+      }
+    }
+  } else if (PrevDeclInScope != nullptr) {
+    auto *PrevDRDInScope = PrevDRD =
+        cast<OMPDeclareReductionDecl>(PrevDeclInScope);
+    do {
+      PreviousRedeclTypes[PrevDRDInScope->getType().getCanonicalType()] =
+          PrevDRDInScope->getLocation();
+      PrevDRDInScope = PrevDRDInScope->getPrevDeclInScope();
+    } while (PrevDRDInScope != nullptr);
+  }
+  for (auto &TyData : ReductionTypes) {
+    auto I = PreviousRedeclTypes.find(TyData.first.getCanonicalType());
+    bool Invalid = false;
+    if (I != PreviousRedeclTypes.end()) {
+      Diag(TyData.second, diag::err_omp_declare_reduction_redefinition)
+          << TyData.first;
+      Diag(I->second, diag::note_previous_definition);
+      Invalid = true;
+    }
+    PreviousRedeclTypes[TyData.first.getCanonicalType()] = TyData.second;
+    auto *DRD = OMPDeclareReductionDecl::Create(Context, DC, TyData.second,
+                                                Name, TyData.first, PrevDRD);
+    DC->addDecl(DRD);
+    DRD->setAccess(AS);
+    Decls.push_back(DRD);
+    if (Invalid)
+      DRD->setInvalidDecl();
+    else
+      PrevDRD = DRD;
+  }
+
+  return DeclGroupPtrTy::make(
+      DeclGroupRef::Create(Context, Decls.begin(), Decls.size()));
+}
+
+void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) {
+  auto *DRD = cast<OMPDeclareReductionDecl>(D);
+
+  // Enter new function scope.
+  PushFunctionScope();
+  getCurFunction()->setHasBranchProtectedScope();
+  getCurFunction()->setHasOMPDeclareReductionCombiner();
+
+  if (S != nullptr)
+    PushDeclContext(S, DRD);
+  else
+    CurContext = DRD;
+
+  PushExpressionEvaluationContext(PotentiallyEvaluated);
+
+  QualType ReductionType = DRD->getType();
+  // Create 'T* omp_parm;T omp_in;'. All references to 'omp_in' will
+  // be replaced by '*omp_parm' during codegen. This is required because
+  // 'omp_in' has the semantics of an argument passed by value, but it should
+  // be passed by reference. C does not support references, so all parameters
+  // are passed as pointers.
+  // Create 'T omp_in;' variable.
+  auto *OmpInParm =
+      buildVarDecl(*this, D->getLocation(), ReductionType, "omp_in");
+  // Create 'T* omp_parm;T omp_out;'. All references to 'omp_out' will
+  // be replaced by '*omp_parm' during codegen. This is required because
+  // 'omp_out' has the semantics of an argument passed by value, but it should
+  // be passed by reference. C does not support references, so all parameters
+  // are passed as pointers.
+  // Create 'T omp_out;' variable.
+  auto *OmpOutParm =
+      buildVarDecl(*this, D->getLocation(), ReductionType, "omp_out");
+  if (S != nullptr) {
+    PushOnScopeChains(OmpInParm, S);
+    PushOnScopeChains(OmpOutParm, S);
+  } else {
+    DRD->addDecl(OmpInParm);
+    DRD->addDecl(OmpOutParm);
+  }
+}
+
+void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) {
+  auto *DRD = cast<OMPDeclareReductionDecl>(D);
+  DiscardCleanupsInEvaluationContext();
+  PopExpressionEvaluationContext();
+
+  PopDeclContext();
+  PopFunctionScopeInfo();
+
+  if (Combiner != nullptr)
+    DRD->setCombiner(Combiner);
+  else
+    DRD->setInvalidDecl();
+}
+
+void Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) {
+  auto *DRD = cast<OMPDeclareReductionDecl>(D);
+
+  // Enter new function scope.
+  PushFunctionScope();
+  getCurFunction()->setHasBranchProtectedScope();
+
+  if (S != nullptr)
+    PushDeclContext(S, DRD);
+  else
+    CurContext = DRD;
+
+  PushExpressionEvaluationContext(PotentiallyEvaluated);
+
+  QualType ReductionType = DRD->getType();
+  // Create 'T* omp_parm;T omp_priv;'. All references to 'omp_priv' will
+  // be replaced by '*omp_parm' during codegen. This is required because
+  // 'omp_priv' has the semantics of an argument passed by value, but it should
+  // be passed by reference. C does not support references, so all parameters
+  // are passed as pointers.
+  // Create 'T omp_priv;' variable.
+  auto *OmpPrivParm =
+      buildVarDecl(*this, D->getLocation(), ReductionType, "omp_priv");
+  // Create 'T* omp_parm;T omp_orig;'. All references to 'omp_orig' will
+  // be replaced by '*omp_parm' during codegen. This is required because
+  // 'omp_orig' has the semantics of an argument passed by value, but it should
+  // be passed by reference. C does not support references, so all parameters
+  // are passed as pointers.
+  // Create 'T omp_orig;' variable.
+  auto *OmpOrigParm =
+      buildVarDecl(*this, D->getLocation(), ReductionType, "omp_orig");
+  if (S != nullptr) {
+    PushOnScopeChains(OmpPrivParm, S);
+    PushOnScopeChains(OmpOrigParm, S);
+  } else {
+    DRD->addDecl(OmpPrivParm);
+    DRD->addDecl(OmpOrigParm);
+  }
+}
+
+void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D,
+                                                     Expr *Initializer) {
+  auto *DRD = cast<OMPDeclareReductionDecl>(D);
+  DiscardCleanupsInEvaluationContext();
+  PopExpressionEvaluationContext();
+
+  PopDeclContext();
+  PopFunctionScopeInfo();
+
+  if (Initializer != nullptr)
+    DRD->setInitializer(Initializer);
+  else
+    DRD->setInvalidDecl();
+}
+
+Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd(
+    Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid) {
+  for (auto *D : DeclReductions.get()) {
+    if (IsValid) {
+      auto *DRD = cast<OMPDeclareReductionDecl>(D);
+      if (S != nullptr)
+        PushOnScopeChains(DRD, S, /*AddToContext=*/false);
+    } else
+      D->setInvalidDecl();
+  }
+  return DeclReductions;
+}
+
+OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams,
                                            SourceLocation StartLoc,
                                            SourceLocation LParenLoc,
                                            SourceLocation EndLoc) {
@@ -8487,8 +11570,8 @@
                                  /*StrictlyPositive=*/true))
     return nullptr;
 
-  return new (Context) OMPThreadLimitClause(ValExpr, StartLoc, LParenLoc,
-                                            EndLoc);
+  return new (Context)
+      OMPThreadLimitClause(ValExpr, StartLoc, LParenLoc, EndLoc);
 }
 
 OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority,
@@ -8565,7 +11648,7 @@
     return nullptr;
   }
   Expr *ValExpr = ChunkSize;
-  Expr *HelperValExpr = nullptr;
+  Stmt *HelperValStmt = nullptr;
   if (ChunkSize) {
     if (!ChunkSize->isValueDependent() && !ChunkSize->isTypeDependent() &&
         !ChunkSize->isInstantiationDependent() &&
@@ -8588,18 +11671,435 @@
               << "dist_schedule" << ChunkSize->getSourceRange();
           return nullptr;
         }
-      } else if (isParallelOrTaskRegion(DSAStack->getCurrentDirective())) {
-        auto *ImpVar = buildVarDecl(*this, ChunkSize->getExprLoc(),
-                                    ChunkSize->getType(), ".chunk.");
-        auto *ImpVarRef = buildDeclRefExpr(*this, ImpVar, ChunkSize->getType(),
-                                           ChunkSize->getExprLoc(),
-                                           /*RefersToCapture=*/true);
-        HelperValExpr = ImpVarRef;
+      } else if (isParallelOrTaskRegion(DSAStack->getCurrentDirective()) &&
+                 !CurContext->isDependentContext()) {
+        llvm::MapVector<Expr *, DeclRefExpr *> Captures;
+        ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
+        HelperValStmt = buildPreInits(Context, Captures);
       }
     }
   }
 
   return new (Context)
       OMPDistScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc,
-                            Kind, ValExpr, HelperValExpr);
+                            Kind, ValExpr, HelperValStmt);
+}
+
+OMPClause *Sema::ActOnOpenMPDefaultmapClause(
+    OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind,
+    SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc,
+    SourceLocation KindLoc, SourceLocation EndLoc) {
+  // OpenMP 4.5 only supports 'defaultmap(tofrom: scalar)'
+  if (M != OMPC_DEFAULTMAP_MODIFIER_tofrom || Kind != OMPC_DEFAULTMAP_scalar) {
+    std::string Value;
+    SourceLocation Loc;
+    Value += "'";
+    if (M != OMPC_DEFAULTMAP_MODIFIER_tofrom) {
+      Value += getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
+                                             OMPC_DEFAULTMAP_MODIFIER_tofrom);
+      Loc = MLoc;
+    } else {
+      Value += getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
+                                             OMPC_DEFAULTMAP_scalar);
+      Loc = KindLoc;
+    }
+    Value += "'";
+    Diag(Loc, diag::err_omp_unexpected_clause_value)
+        << Value << getOpenMPClauseName(OMPC_defaultmap);
+    return nullptr;
+  }
+
+  return new (Context)
+      OMPDefaultmapClause(StartLoc, LParenLoc, MLoc, KindLoc, EndLoc, Kind, M);
+}
+
+bool Sema::ActOnStartOpenMPDeclareTargetDirective(SourceLocation Loc) {
+  DeclContext *CurLexicalContext = getCurLexicalContext();
+  if (!CurLexicalContext->isFileContext() &&
+      !CurLexicalContext->isExternCContext() &&
+      !CurLexicalContext->isExternCXXContext()) {
+    Diag(Loc, diag::err_omp_region_not_file_context);
+    return false;
+  }
+  if (IsInOpenMPDeclareTargetContext) {
+    Diag(Loc, diag::err_omp_enclosed_declare_target);
+    return false;
+  }
+
+  IsInOpenMPDeclareTargetContext = true;
+  return true;
+}
+
+void Sema::ActOnFinishOpenMPDeclareTargetDirective() {
+  assert(IsInOpenMPDeclareTargetContext &&
+         "Unexpected ActOnFinishOpenMPDeclareTargetDirective");
+
+  IsInOpenMPDeclareTargetContext = false;
+}
+
+void Sema::ActOnOpenMPDeclareTargetName(Scope *CurScope,
+                                        CXXScopeSpec &ScopeSpec,
+                                        const DeclarationNameInfo &Id,
+                                        OMPDeclareTargetDeclAttr::MapTypeTy MT,
+                                        NamedDeclSetType &SameDirectiveDecls) {
+  LookupResult Lookup(*this, Id, LookupOrdinaryName);
+  LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
+
+  if (Lookup.isAmbiguous())
+    return;
+  Lookup.suppressDiagnostics();
+
+  if (!Lookup.isSingleResult()) {
+    if (TypoCorrection Corrected =
+            CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr,
+                        llvm::make_unique<VarOrFuncDeclFilterCCC>(*this),
+                        CTK_ErrorRecovery)) {
+      diagnoseTypo(Corrected, PDiag(diag::err_undeclared_var_use_suggest)
+                                  << Id.getName());
+      checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl());
+      return;
+    }
+
+    Diag(Id.getLoc(), diag::err_undeclared_var_use) << Id.getName();
+    return;
+  }
+
+  NamedDecl *ND = Lookup.getAsSingle<NamedDecl>();
+  if (isa<VarDecl>(ND) || isa<FunctionDecl>(ND)) {
+    if (!SameDirectiveDecls.insert(cast<NamedDecl>(ND->getCanonicalDecl())))
+      Diag(Id.getLoc(), diag::err_omp_declare_target_multiple) << Id.getName();
+
+    if (!ND->hasAttr<OMPDeclareTargetDeclAttr>()) {
+      Attr *A = OMPDeclareTargetDeclAttr::CreateImplicit(Context, MT);
+      ND->addAttr(A);
+      if (ASTMutationListener *ML = Context.getASTMutationListener())
+        ML->DeclarationMarkedOpenMPDeclareTarget(ND, A);
+      checkDeclIsAllowedInOpenMPTarget(nullptr, ND);
+    } else if (ND->getAttr<OMPDeclareTargetDeclAttr>()->getMapType() != MT) {
+      Diag(Id.getLoc(), diag::err_omp_declare_target_to_and_link)
+          << Id.getName();
+    }
+  } else
+    Diag(Id.getLoc(), diag::err_omp_invalid_target_decl) << Id.getName();
+}
+
+static void checkDeclInTargetContext(SourceLocation SL, SourceRange SR,
+                                     Sema &SemaRef, Decl *D) {
+  if (!D)
+    return;
+  Decl *LD = nullptr;
+  if (isa<TagDecl>(D)) {
+    LD = cast<TagDecl>(D)->getDefinition();
+  } else if (isa<VarDecl>(D)) {
+    LD = cast<VarDecl>(D)->getDefinition();
+
+    // If this is an implicit variable, that is legal and we do not need to do
+    // anything else.
+    if (cast<VarDecl>(D)->isImplicit()) {
+      Attr *A = OMPDeclareTargetDeclAttr::CreateImplicit(
+          SemaRef.Context, OMPDeclareTargetDeclAttr::MT_To);
+      D->addAttr(A);
+      if (ASTMutationListener *ML = SemaRef.Context.getASTMutationListener())
+        ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
+      return;
+    }
+
+  } else if (isa<FunctionDecl>(D)) {
+    const FunctionDecl *FD = nullptr;
+    if (cast<FunctionDecl>(D)->hasBody(FD))
+      LD = const_cast<FunctionDecl *>(FD);
+
+    // If the definition is associated with the current declaration in the
+    // target region (it can be e.g. a lambda) that is legal and we do not need
+    // to do anything else.
+    if (LD == D) {
+      Attr *A = OMPDeclareTargetDeclAttr::CreateImplicit(
+          SemaRef.Context, OMPDeclareTargetDeclAttr::MT_To);
+      D->addAttr(A);
+      if (ASTMutationListener *ML = SemaRef.Context.getASTMutationListener())
+        ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
+      return;
+    }
+  }
+  if (!LD)
+    LD = D;
+  if (LD && !LD->hasAttr<OMPDeclareTargetDeclAttr>() &&
+      (isa<VarDecl>(LD) || isa<FunctionDecl>(LD))) {
+    // Outlined declaration is not declared target.
+    if (LD->isOutOfLine()) {
+      SemaRef.Diag(LD->getLocation(), diag::warn_omp_not_in_target_context);
+      SemaRef.Diag(SL, diag::note_used_here) << SR;
+    } else {
+      DeclContext *DC = LD->getDeclContext();
+      while (DC) {
+        if (isa<FunctionDecl>(DC) &&
+            cast<FunctionDecl>(DC)->hasAttr<OMPDeclareTargetDeclAttr>())
+          break;
+        DC = DC->getParent();
+      }
+      if (DC)
+        return;
+
+      // Is not declared in target context.
+      SemaRef.Diag(LD->getLocation(), diag::warn_omp_not_in_target_context);
+      SemaRef.Diag(SL, diag::note_used_here) << SR;
+    }
+    // Mark decl as declared target to prevent further diagnostic.
+    Attr *A = OMPDeclareTargetDeclAttr::CreateImplicit(
+        SemaRef.Context, OMPDeclareTargetDeclAttr::MT_To);
+    D->addAttr(A);
+    if (ASTMutationListener *ML = SemaRef.Context.getASTMutationListener())
+      ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
+  }
+}
+
+static bool checkValueDeclInTarget(SourceLocation SL, SourceRange SR,
+                                   Sema &SemaRef, DSAStackTy *Stack,
+                                   ValueDecl *VD) {
+  if (VD->hasAttr<OMPDeclareTargetDeclAttr>())
+    return true;
+  if (!CheckTypeMappable(SL, SR, SemaRef, Stack, VD->getType()))
+    return false;
+  return true;
+}
+
+void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D) {
+  if (!D || D->isInvalidDecl())
+    return;
+  SourceRange SR = E ? E->getSourceRange() : D->getSourceRange();
+  SourceLocation SL = E ? E->getLocStart() : D->getLocation();
+  // 2.10.6: threadprivate variable cannot appear in a declare target directive.
+  if (VarDecl *VD = dyn_cast<VarDecl>(D)) {
+    if (DSAStack->isThreadPrivate(VD)) {
+      Diag(SL, diag::err_omp_threadprivate_in_target);
+      ReportOriginalDSA(*this, DSAStack, VD, DSAStack->getTopDSA(VD, false));
+      return;
+    }
+  }
+  if (ValueDecl *VD = dyn_cast<ValueDecl>(D)) {
+    // Any problem with a variable declared with an incomplete type will be
+    // reported as usual, so there is no need to check it here.
+    if ((E || !VD->getType()->isIncompleteType()) &&
+        !checkValueDeclInTarget(SL, SR, *this, DSAStack, VD)) {
+      // Mark decl as declared target to prevent further diagnostic.
+      if (isa<VarDecl>(VD) || isa<FunctionDecl>(VD)) {
+        Attr *A = OMPDeclareTargetDeclAttr::CreateImplicit(
+            Context, OMPDeclareTargetDeclAttr::MT_To);
+        VD->addAttr(A);
+        if (ASTMutationListener *ML = Context.getASTMutationListener())
+          ML->DeclarationMarkedOpenMPDeclareTarget(VD, A);
+      }
+      return;
+    }
+  }
+  if (!E) {
+    // Checking declaration inside declare target region.
+    if (!D->hasAttr<OMPDeclareTargetDeclAttr>() &&
+        (isa<VarDecl>(D) || isa<FunctionDecl>(D))) {
+      Attr *A = OMPDeclareTargetDeclAttr::CreateImplicit(
+          Context, OMPDeclareTargetDeclAttr::MT_To);
+      D->addAttr(A);
+      if (ASTMutationListener *ML = Context.getASTMutationListener())
+        ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
+    }
+    return;
+  }
+  checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), *this, D);
+}
+
+OMPClause *Sema::ActOnOpenMPToClause(ArrayRef<Expr *> VarList,
+                                     SourceLocation StartLoc,
+                                     SourceLocation LParenLoc,
+                                     SourceLocation EndLoc) {
+  MappableVarListInfo MVLI(VarList);
+  checkMappableExpressionList(*this, DSAStack, OMPC_to, MVLI, StartLoc);
+  if (MVLI.ProcessedVarList.empty())
+    return nullptr;
+
+  return OMPToClause::Create(Context, StartLoc, LParenLoc, EndLoc,
+                             MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
+                             MVLI.VarComponents);
+}
+
+OMPClause *Sema::ActOnOpenMPFromClause(ArrayRef<Expr *> VarList,
+                                       SourceLocation StartLoc,
+                                       SourceLocation LParenLoc,
+                                       SourceLocation EndLoc) {
+  MappableVarListInfo MVLI(VarList);
+  checkMappableExpressionList(*this, DSAStack, OMPC_from, MVLI, StartLoc);
+  if (MVLI.ProcessedVarList.empty())
+    return nullptr;
+
+  return OMPFromClause::Create(Context, StartLoc, LParenLoc, EndLoc,
+                               MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
+                               MVLI.VarComponents);
+}
+
+OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
+                                               SourceLocation StartLoc,
+                                               SourceLocation LParenLoc,
+                                               SourceLocation EndLoc) {
+  MappableVarListInfo MVLI(VarList);
+  SmallVector<Expr *, 8> PrivateCopies;
+  SmallVector<Expr *, 8> Inits;
+
+  for (auto &RefExpr : VarList) {
+    assert(RefExpr && "NULL expr in OpenMP use_device_ptr clause.");
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    if (Res.second) {
+      // It will be analyzed later.
+      MVLI.ProcessedVarList.push_back(RefExpr);
+      PrivateCopies.push_back(nullptr);
+      Inits.push_back(nullptr);
+    }
+    ValueDecl *D = Res.first;
+    if (!D)
+      continue;
+
+    QualType Type = D->getType();
+    Type = Type.getNonReferenceType().getUnqualifiedType();
+
+    auto *VD = dyn_cast<VarDecl>(D);
+
+    // Item should be a pointer or reference to pointer.
+    if (!Type->isPointerType()) {
+      Diag(ELoc, diag::err_omp_usedeviceptr_not_a_pointer)
+          << 0 << RefExpr->getSourceRange();
+      continue;
+    }
+
+    // Build the private variable and the expression that refers to it.
+    auto VDPrivate = buildVarDecl(*this, ELoc, Type, D->getName(),
+                                  D->hasAttrs() ? &D->getAttrs() : nullptr);
+    if (VDPrivate->isInvalidDecl())
+      continue;
+
+    CurContext->addDecl(VDPrivate);
+    auto VDPrivateRefExpr = buildDeclRefExpr(
+        *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
+
+    // Add temporary variable to initialize the private copy of the pointer.
+    auto *VDInit =
+        buildVarDecl(*this, RefExpr->getExprLoc(), Type, ".devptr.temp");
+    auto *VDInitRefExpr = buildDeclRefExpr(*this, VDInit, RefExpr->getType(),
+                                           RefExpr->getExprLoc());
+    AddInitializerToDecl(VDPrivate,
+                         DefaultLvalueConversion(VDInitRefExpr).get(),
+                         /*DirectInit=*/false, /*TypeMayContainAuto=*/false);
+
+    // If required, build a capture to implement the privatization initialized
+    // with the current list item value.
+    DeclRefExpr *Ref = nullptr;
+    if (!VD)
+      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
+    MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref);
+    PrivateCopies.push_back(VDPrivateRefExpr);
+    Inits.push_back(VDInitRefExpr);
+
+    // We need to add a data sharing attribute for this variable to make sure it
+    // is correctly captured. A variable that shows up in a use_device_ptr
+    // clause has properties similar to those of a firstprivate variable.
+    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
+
+    // Create a mappable component for the list item. List items in this clause
+    // only need a component.
+    MVLI.VarBaseDeclarations.push_back(D);
+    MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
+    MVLI.VarComponents.back().push_back(
+        OMPClauseMappableExprCommon::MappableComponent(SimpleRefExpr, D));
+  }
+
+  if (MVLI.ProcessedVarList.empty())
+    return nullptr;
+
+  return OMPUseDevicePtrClause::Create(
+      Context, StartLoc, LParenLoc, EndLoc, MVLI.ProcessedVarList,
+      PrivateCopies, Inits, MVLI.VarBaseDeclarations, MVLI.VarComponents);
+}
+
+OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
+                                              SourceLocation StartLoc,
+                                              SourceLocation LParenLoc,
+                                              SourceLocation EndLoc) {
+  MappableVarListInfo MVLI(VarList);
+  for (auto &RefExpr : VarList) {
+    assert(RefExpr && "NULL expr in OpenMP is_device_ptr clause.");
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    if (Res.second) {
+      // It will be analyzed later.
+      MVLI.ProcessedVarList.push_back(RefExpr);
+    }
+    ValueDecl *D = Res.first;
+    if (!D)
+      continue;
+
+    QualType Type = D->getType();
+    // Item must be a pointer or array, or a reference to a pointer or array.
+    if (!Type.getNonReferenceType()->isPointerType() &&
+        !Type.getNonReferenceType()->isArrayType()) {
+      Diag(ELoc, diag::err_omp_argument_type_isdeviceptr)
+          << 0 << RefExpr->getSourceRange();
+      continue;
+    }
+
+    // Check if the declaration in the clause does not show up in any data
+    // sharing attribute.
+    auto DVar = DSAStack->getTopDSA(D, false);
+    if (isOpenMPPrivate(DVar.CKind)) {
+      Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
+          << getOpenMPClauseName(DVar.CKind)
+          << getOpenMPClauseName(OMPC_is_device_ptr)
+          << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
+      ReportOriginalDSA(*this, DSAStack, D, DVar);
+      continue;
+    }
+
+    Expr *ConflictExpr;
+    if (DSAStack->checkMappableExprComponentListsForDecl(
+            D, /*CurrentRegionOnly=*/true,
+            [&ConflictExpr](
+                OMPClauseMappableExprCommon::MappableExprComponentListRef R,
+                OpenMPClauseKind) -> bool {
+              ConflictExpr = R.front().getAssociatedExpression();
+              return true;
+            })) {
+      Diag(ELoc, diag::err_omp_map_shared_storage) << RefExpr->getSourceRange();
+      Diag(ConflictExpr->getExprLoc(), diag::note_used_here)
+          << ConflictExpr->getSourceRange();
+      continue;
+    }
+
+    // Store the components in the stack so that they can be used to check
+    // against other clauses later on.
+    OMPClauseMappableExprCommon::MappableComponent MC(SimpleRefExpr, D);
+    DSAStack->addMappableExpressionComponents(
+        D, MC, /*WhereFoundClauseKind=*/OMPC_is_device_ptr);
+
+    // Record the expression we've just processed.
+    MVLI.ProcessedVarList.push_back(SimpleRefExpr);
+
+    // Create a mappable component for the list item. List items in this clause
+    // only need a component. We use a null declaration to signal fields in
+    // 'this'.
+    assert((isa<DeclRefExpr>(SimpleRefExpr) ||
+            isa<CXXThisExpr>(cast<MemberExpr>(SimpleRefExpr)->getBase())) &&
+           "Unexpected device pointer expression!");
+    MVLI.VarBaseDeclarations.push_back(
+        isa<DeclRefExpr>(SimpleRefExpr) ? D : nullptr);
+    MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
+    MVLI.VarComponents.back().push_back(MC);
+  }
+
+  if (MVLI.ProcessedVarList.empty())
+    return nullptr;
+
+  return OMPIsDevicePtrClause::Create(
+      Context, StartLoc, LParenLoc, EndLoc, MVLI.ProcessedVarList,
+      MVLI.VarBaseDeclarations, MVLI.VarComponents);
 }
diff --git a/lib/Sema/SemaOverload.cpp b/lib/Sema/SemaOverload.cpp
index 299cfdd..72ad9a4 100644
--- a/lib/Sema/SemaOverload.cpp
+++ b/lib/Sema/SemaOverload.cpp
@@ -39,8 +39,9 @@
 using namespace sema;
 
 static bool functionHasPassObjectSizeParams(const FunctionDecl *FD) {
-  return std::any_of(FD->param_begin(), FD->param_end(),
-                     std::mem_fn(&ParmVarDecl::hasAttr<PassObjectSizeAttr>));
+  return llvm::any_of(FD->parameters(), [](const ParmVarDecl *P) {
+    return P->hasAttr<PassObjectSizeAttr>();
+  });
 }
 
 /// A convenience routine for creating a decayed reference to a function.
@@ -293,6 +294,13 @@
   //   A narrowing conversion is an implicit conversion ...
   QualType FromType = getToType(0);
   QualType ToType = getToType(1);
+
+  // A conversion to an enumeration type is narrowing if the conversion to
+  // the underlying type is narrowing. This only arises for expressions of
+  // the form 'Enum{init}'.
+  if (auto *ET = ToType->getAs<EnumType>())
+    ToType = ET->getDecl()->getIntegerType();
+
   switch (Second) {
   // 'bool' is an integral type; dispatch to the right place to handle it.
   case ICK_Boolean_Conversion:
@@ -985,7 +993,7 @@
 }
 
 bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
-                      bool UseUsingDeclRules) {
+                      bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs) {
   // C++ [basic.start.main]p2: This function shall not be overloaded.
   if (New->isMain())
     return false;
@@ -1041,7 +1049,7 @@
   //
   // However, we don't consider either of these when deciding whether
   // a member introduced by a shadow declaration is hidden.
-  if (!UseUsingDeclRules && NewTemplate &&
+  if (!UseMemberUsingDeclRules && NewTemplate &&
       (!TemplateParameterListsAreEqual(NewTemplate->getTemplateParameters(),
                                        OldTemplate->getTemplateParameters(),
                                        false, TPL_TemplateMatch) ||
@@ -1061,7 +1069,7 @@
   if (OldMethod && NewMethod &&
       !OldMethod->isStatic() && !NewMethod->isStatic()) {
     if (OldMethod->getRefQualifier() != NewMethod->getRefQualifier()) {
-      if (!UseUsingDeclRules &&
+      if (!UseMemberUsingDeclRules &&
           (OldMethod->getRefQualifier() == RQ_None ||
            NewMethod->getRefQualifier() == RQ_None)) {
         // C++0x [over.load]p2:
@@ -1118,7 +1126,7 @@
       return true;
   }
 
-  if (getLangOpts().CUDA && getLangOpts().CUDATargetOverloads) {
+  if (getLangOpts().CUDA && ConsiderCudaAttrs) {
     CUDAFunctionTarget NewTarget = IdentifyCUDATarget(New),
                        OldTarget = IdentifyCUDATarget(Old);
     if (NewTarget == CFT_InvalidTarget || NewTarget == CFT_Global)
@@ -1129,7 +1137,10 @@
     // Don't allow mixing of HD with other kinds. This guarantees that
     // we have only one viable function with this signature on any
     // side of CUDA compilation .
-    if ((NewTarget == CFT_HostDevice) || (OldTarget == CFT_HostDevice))
+    // __global__ functions can't be overloaded based on attribute
+    // difference because, like HD, they also exist on both sides.
+    if ((NewTarget == CFT_HostDevice) || (OldTarget == CFT_HostDevice) ||
+        (NewTarget == CFT_Global) || (OldTarget == CFT_Global))
       return false;
 
     // Allow overloading of functions with same signature, but
@@ -1189,7 +1200,6 @@
   case OR_Success:
   case OR_Deleted:
     ICS.setUserDefined();
-    ICS.UserDefined.Before.setAsIdentityConversion();
     // C++ [over.ics.user]p4:
     //   A conversion of an expression of class type to the same class
     //   type is given Exact Match rank, and a conversion of an
@@ -1208,11 +1218,13 @@
            S.IsDerivedFrom(From->getLocStart(), FromCanon, ToCanon))) {
         // Turn this into a "standard" conversion sequence, so that it
         // gets ranked with standard conversion sequences.
+        DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction;
         ICS.setStandard();
         ICS.Standard.setAsIdentityConversion();
         ICS.Standard.setFromType(From->getType());
         ICS.Standard.setAllToTypes(ToType);
         ICS.Standard.CopyConstructor = Constructor;
+        ICS.Standard.FoundCopyConstructor = Found;
         if (ToCanon != FromCanon)
           ICS.Standard.Second = ICK_Derived_To_Base;
       }
@@ -1226,7 +1238,7 @@
     for (OverloadCandidateSet::iterator Cand = Conversions.begin();
          Cand != Conversions.end(); ++Cand)
       if (Cand->Viable)
-        ICS.Ambiguous.addConversion(Cand->Function);
+        ICS.Ambiguous.addConversion(Cand->FoundDecl, Cand->Function);
     break;
 
     // Fall through.
@@ -1661,6 +1673,20 @@
     SCS.Second = ICK_Complex_Real;
     FromType = ToType.getUnqualifiedType();
   } else if (FromType->isRealFloatingType() && ToType->isRealFloatingType()) {
+    // FIXME: Disable conversions between long double and __float128 if
+    // their representations differ, until there is back-end support.
+    // We of course allow this conversion if long double is really double.
+    if (&S.Context.getFloatTypeSemantics(FromType) !=
+        &S.Context.getFloatTypeSemantics(ToType)) {
+      bool Float128AndLongDouble = ((FromType == S.Context.Float128Ty &&
+                                    ToType == S.Context.LongDoubleTy) ||
+                                   (FromType == S.Context.LongDoubleTy &&
+                                    ToType == S.Context.Float128Ty));
+      if (Float128AndLongDouble &&
+          (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) !=
+           &llvm::APFloat::IEEEdouble))
+        return false;
+    }
     // Floating point conversions (C++ 4.8).
     SCS.Second = ICK_Floating_Conversion;
     FromType = ToType.getUnqualifiedType();
@@ -1818,8 +1844,7 @@
         (FromType->isSignedIntegerType() ||
          // We can promote any unsigned integer type whose size is
          // less than int to an int.
-         (!FromType->isSignedIntegerType() &&
-          Context.getTypeSize(FromType) < Context.getTypeSize(ToType)))) {
+         Context.getTypeSize(FromType) < Context.getTypeSize(ToType))) {
       return To->getKind() == BuiltinType::Int;
     }
 
@@ -1964,7 +1989,8 @@
       if (!getLangOpts().CPlusPlus &&
           (FromBuiltin->getKind() == BuiltinType::Float ||
            FromBuiltin->getKind() == BuiltinType::Double) &&
-          (ToBuiltin->getKind() == BuiltinType::LongDouble))
+          (ToBuiltin->getKind() == BuiltinType::LongDouble ||
+           ToBuiltin->getKind() == BuiltinType::Float128))
         return true;
 
       // Half can be promoted to float.
@@ -2927,6 +2953,10 @@
 
     Qualifiers FromQuals = FromType.getQualifiers();
     Qualifiers ToQuals = ToType.getQualifiers();
+
+    // Ignore __unaligned qualifier if this type is void.
+    if (ToType.getUnqualifiedType()->isVoidType())
+      FromQuals.removeUnaligned();
     
     // Objective-C ARC:
     //   Check Objective-C lifetime conversions.
@@ -3023,39 +3053,26 @@
                                        UserDefinedConversionSequence &User,
                                        OverloadCandidateSet &CandidateSet,
                                        bool AllowExplicit) {
-  DeclContext::lookup_result R = S.LookupConstructors(To);
-  for (DeclContext::lookup_iterator Con = R.begin(), ConEnd = R.end();
-       Con != ConEnd; ++Con) {
-    NamedDecl *D = *Con;
-    DeclAccessPair FoundDecl = DeclAccessPair::make(D, D->getAccess());
+  for (auto *D : S.LookupConstructors(To)) {
+    auto Info = getConstructorInfo(D);
+    if (!Info)
+      continue;
 
-    // Find the constructor (which may be a template).
-    CXXConstructorDecl *Constructor = nullptr;
-    FunctionTemplateDecl *ConstructorTmpl
-      = dyn_cast<FunctionTemplateDecl>(D);
-    if (ConstructorTmpl)
-      Constructor
-        = cast<CXXConstructorDecl>(ConstructorTmpl->getTemplatedDecl());
-    else
-      Constructor = cast<CXXConstructorDecl>(D);
-
-    bool Usable = !Constructor->isInvalidDecl() &&
-                  S.isInitListConstructor(Constructor) &&
-                  (AllowExplicit || !Constructor->isExplicit());
+    bool Usable = !Info.Constructor->isInvalidDecl() &&
+                  S.isInitListConstructor(Info.Constructor) &&
+                  (AllowExplicit || !Info.Constructor->isExplicit());
     if (Usable) {
       // If the first argument is (a reference to) the target type,
       // suppress conversions.
-      bool SuppressUserConversions =
-          isFirstArgumentCompatibleWithType(S.Context, Constructor, ToType);
-      if (ConstructorTmpl)
-        S.AddTemplateOverloadCandidate(ConstructorTmpl, FoundDecl,
-                                       /*ExplicitArgs*/ nullptr,
-                                       From, CandidateSet,
-                                       SuppressUserConversions);
+      bool SuppressUserConversions = isFirstArgumentCompatibleWithType(
+          S.Context, Info.Constructor, ToType);
+      if (Info.ConstructorTmpl)
+        S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
+                                       /*ExplicitArgs*/ nullptr, From,
+                                       CandidateSet, SuppressUserConversions);
       else
-        S.AddOverloadCandidate(Constructor, FoundDecl,
-                               From, CandidateSet,
-                               SuppressUserConversions);
+        S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl, From,
+                               CandidateSet, SuppressUserConversions);
     }
   }
 
@@ -3155,27 +3172,17 @@
         ListInitializing = true;
       }
 
-      DeclContext::lookup_result R = S.LookupConstructors(ToRecordDecl);
-      for (DeclContext::lookup_iterator Con = R.begin(), ConEnd = R.end();
-           Con != ConEnd; ++Con) {
-        NamedDecl *D = *Con;
-        DeclAccessPair FoundDecl = DeclAccessPair::make(D, D->getAccess());
+      for (auto *D : S.LookupConstructors(ToRecordDecl)) {
+        auto Info = getConstructorInfo(D);
+        if (!Info)
+          continue;
 
-        // Find the constructor (which may be a template).
-        CXXConstructorDecl *Constructor = nullptr;
-        FunctionTemplateDecl *ConstructorTmpl
-          = dyn_cast<FunctionTemplateDecl>(D);
-        if (ConstructorTmpl)
-          Constructor
-            = cast<CXXConstructorDecl>(ConstructorTmpl->getTemplatedDecl());
-        else
-          Constructor = cast<CXXConstructorDecl>(D);
-
-        bool Usable = !Constructor->isInvalidDecl();
+        bool Usable = !Info.Constructor->isInvalidDecl();
         if (ListInitializing)
-          Usable = Usable && (AllowExplicit || !Constructor->isExplicit());
+          Usable = Usable && (AllowExplicit || !Info.Constructor->isExplicit());
         else
-          Usable = Usable &&Constructor->isConvertingConstructor(AllowExplicit);
+          Usable = Usable &&
+                   Info.Constructor->isConvertingConstructor(AllowExplicit);
         if (Usable) {
           bool SuppressUserConversions = !ConstructorsOnly;
           if (SuppressUserConversions && ListInitializing) {
@@ -3184,18 +3191,18 @@
               // If the first argument is (a reference to) the target type,
               // suppress conversions.
               SuppressUserConversions = isFirstArgumentCompatibleWithType(
-                                                S.Context, Constructor, ToType);
+                  S.Context, Info.Constructor, ToType);
             }
           }
-          if (ConstructorTmpl)
-            S.AddTemplateOverloadCandidate(ConstructorTmpl, FoundDecl,
-                                           /*ExplicitArgs*/ nullptr,
-                                           llvm::makeArrayRef(Args, NumArgs),
-                                           CandidateSet, SuppressUserConversions);
+          if (Info.ConstructorTmpl)
+            S.AddTemplateOverloadCandidate(
+                Info.ConstructorTmpl, Info.FoundDecl,
+                /*ExplicitArgs*/ nullptr, llvm::makeArrayRef(Args, NumArgs),
+                CandidateSet, SuppressUserConversions);
           else
             // Allow one user-defined conversion when user specifies a
             // From->ToType conversion via an static cast (c-style, etc).
-            S.AddOverloadCandidate(Constructor, FoundDecl,
+            S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl,
                                    llvm::makeArrayRef(Args, NumArgs),
                                    CandidateSet, SuppressUserConversions);
         }
@@ -4135,6 +4142,10 @@
     T2Quals.removeObjCLifetime();    
   }
     
+  // MS compiler ignores __unaligned qualifier for references; do the same.
+  T1Quals.removeUnaligned();
+  T2Quals.removeUnaligned();
+
   if (T1Quals == T2Quals)
     return Ref_Compatible;
   else if (T1Quals.compatiblyIncludes(T2Quals))
@@ -4256,7 +4267,7 @@
     for (OverloadCandidateSet::iterator Cand = CandidateSet.begin();
          Cand != CandidateSet.end(); ++Cand)
       if (Cand->Viable)
-        ICS.Ambiguous.addConversion(Cand->Function);
+        ICS.Ambiguous.addConversion(Cand->FoundDecl, Cand->Function);
     return true;
 
   case OR_No_Viable_Function:
@@ -4456,13 +4467,16 @@
     // initialization fails.
     //
     // Note that we only want to check address spaces and cvr-qualifiers here.
-    // ObjC GC and lifetime qualifiers aren't important.
+    // ObjC GC, lifetime and unaligned qualifiers aren't important.
     Qualifiers T1Quals = T1.getQualifiers();
     Qualifiers T2Quals = T2.getQualifiers();
     T1Quals.removeObjCGCAttr();
     T1Quals.removeObjCLifetime();
     T2Quals.removeObjCGCAttr();
     T2Quals.removeObjCLifetime();
+    // MS compiler ignores __unaligned qualifier for references; do the same.
+    T1Quals.removeUnaligned();
+    T2Quals.removeUnaligned();
     if (!T1Quals.compatiblyIncludes(T2Quals))
       return ICS;
   }
@@ -4526,7 +4540,6 @@
       return ICS;
     }
 
-    ICS.UserDefined.Before.setAsIdentityConversion();
     ICS.UserDefined.After.ReferenceBinding = true;
     ICS.UserDefined.After.IsLvalueReference = !isRValRef;
     ICS.UserDefined.After.BindsToFunctionLvalue = false;
@@ -5803,7 +5816,7 @@
       // case we may not yet know what the member's target is; the target is
       // inferred for the member automatically, based on the bases and fields of
       // the class.
-      if (!Caller->isImplicit() && CheckCUDATarget(Caller, Function)) {
+      if (!Caller->isImplicit() && !IsAllowedCUDACall(Caller, Function)) {
         Candidate.Viable = false;
         Candidate.FailureKind = ovl_fail_bad_target;
         return;
@@ -5960,37 +5973,32 @@
   SFINAETrap Trap(*this);
   SmallVector<Expr *, 16> ConvertedArgs;
   bool InitializationFailed = false;
-  bool ContainsValueDependentExpr = false;
+
+  // Ignore any variadic arguments. Converting them is pointless, since the
+  // user can't refer to them in the enable_if condition.
+  unsigned ArgSizeNoVarargs = std::min(Function->param_size(), Args.size());
 
   // Convert the arguments.
-  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
-    if (i == 0 && !MissingImplicitThis && isa<CXXMethodDecl>(Function) &&
+  for (unsigned I = 0; I != ArgSizeNoVarargs; ++I) {
+    ExprResult R;
+    if (I == 0 && !MissingImplicitThis && isa<CXXMethodDecl>(Function) &&
         !cast<CXXMethodDecl>(Function)->isStatic() &&
         !isa<CXXConstructorDecl>(Function)) {
       CXXMethodDecl *Method = cast<CXXMethodDecl>(Function);
-      ExprResult R =
-        PerformObjectArgumentInitialization(Args[0], /*Qualifier=*/nullptr,
-                                            Method, Method);
-      if (R.isInvalid()) {
-        InitializationFailed = true;
-        break;
-      }
-      ContainsValueDependentExpr |= R.get()->isValueDependent();
-      ConvertedArgs.push_back(R.get());
+      R = PerformObjectArgumentInitialization(Args[0], /*Qualifier=*/nullptr,
+                                              Method, Method);
     } else {
-      ExprResult R =
-        PerformCopyInitialization(InitializedEntity::InitializeParameter(
-                                                Context,
-                                                Function->getParamDecl(i)),
-                                  SourceLocation(),
-                                  Args[i]);
-      if (R.isInvalid()) {
-        InitializationFailed = true;
-        break;
-      }
-      ContainsValueDependentExpr |= R.get()->isValueDependent();
-      ConvertedArgs.push_back(R.get());
+      R = PerformCopyInitialization(InitializedEntity::InitializeParameter(
+                                        Context, Function->getParamDecl(I)),
+                                    SourceLocation(), Args[I]);
     }
+
+    if (R.isInvalid()) {
+      InitializationFailed = true;
+      break;
+    }
+
+    ConvertedArgs.push_back(R.get());
   }
 
   if (InitializationFailed || Trap.hasErrorOccurred())
@@ -6010,7 +6018,6 @@
         InitializationFailed = true;
         break;
       }
-      ContainsValueDependentExpr |= R.get()->isValueDependent();
       ConvertedArgs.push_back(R.get());
     }
 
@@ -6020,18 +6027,14 @@
 
   for (auto *EIA : EnableIfAttrs) {
     APValue Result;
-    if (EIA->getCond()->isValueDependent()) {
-      // Don't even try now, we'll examine it after instantiation.
-      continue;
-    }
-
+    // FIXME: This doesn't consider value-dependent cases, because doing so is
+    // very difficult. Ideally, we should handle them more gracefully.
     if (!EIA->getCond()->EvaluateWithSubstitution(
-            Result, Context, Function, llvm::makeArrayRef(ConvertedArgs))) {
-      if (!ContainsValueDependentExpr)
-        return EIA;
-    } else if (!Result.isInt() || !Result.getInt().getBoolValue()) {
+            Result, Context, Function, llvm::makeArrayRef(ConvertedArgs)))
       return EIA;
-    }
+
+    if (!Result.isInt() || !Result.getInt().getBoolValue())
+      return EIA;
   }
   return nullptr;
 }
@@ -6194,7 +6197,7 @@
   // (CUDA B.1): Check for invalid calls between targets.
   if (getLangOpts().CUDA)
     if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext))
-      if (CheckCUDATarget(Caller, Method)) {
+      if (!IsAllowedCUDACall(Caller, Method)) {
         Candidate.Viable = false;
         Candidate.FailureKind = ovl_fail_bad_target;
         return;
@@ -7196,13 +7199,13 @@
   // provided via the getArithmeticType() method below.
   // The "promoted arithmetic types" are the arithmetic
   // types are that preserved by promotion (C++ [over.built]p2).
-  static const unsigned FirstIntegralType = 3;
-  static const unsigned LastIntegralType = 20;
-  static const unsigned FirstPromotedIntegralType = 3,
-                        LastPromotedIntegralType = 11;
+  static const unsigned FirstIntegralType = 4;
+  static const unsigned LastIntegralType = 21;
+  static const unsigned FirstPromotedIntegralType = 4,
+                        LastPromotedIntegralType = 12;
   static const unsigned FirstPromotedArithmeticType = 0,
-                        LastPromotedArithmeticType = 11;
-  static const unsigned NumArithmeticTypes = 20;
+                        LastPromotedArithmeticType = 12;
+  static const unsigned NumArithmeticTypes = 21;
 
   /// \brief Get the canonical type for a given arithmetic type index.
   CanQualType getArithmeticType(unsigned index) {
@@ -7213,6 +7216,7 @@
       &ASTContext::FloatTy,
       &ASTContext::DoubleTy,
       &ASTContext::LongDoubleTy,
+      &ASTContext::Float128Ty,
 
       // Start of integral types.
       &ASTContext::IntTy,
@@ -7255,7 +7259,7 @@
     // (we could precompute SLL x UI for all known platforms, but it's
     // better not to make any assumptions).
     // We assume that int128 has a higher rank than long long on all platforms.
-    enum PromotedType {
+    enum PromotedType : int8_t {
             Dep=-1,
             Flt,  Dbl, LDbl,   SI,   SL,  SLL, S128,   UI,   UL,  ULL, U128
     };
@@ -8485,16 +8489,31 @@
   }
 }
 
-// Determines whether Cand1 is "better" in terms of its enable_if attrs than
-// Cand2 for overloading. This function assumes that all of the enable_if attrs
-// on Cand1 and Cand2 have conditions that evaluate to true.
-//
-// Cand1's set of enable_if attributes are said to be "better" than Cand2's iff
-// Cand1's first N enable_if attributes have precisely the same conditions as
-// Cand2's first N enable_if attributes (where N = the number of enable_if
-// attributes on Cand2), and Cand1 has more than N enable_if attributes.
-static bool hasBetterEnableIfAttrs(Sema &S, const FunctionDecl *Cand1,
-                                   const FunctionDecl *Cand2) {
+namespace {
+enum class Comparison { Equal, Better, Worse };
+}
+
+/// Compares the enable_if attributes of two FunctionDecls, for the purposes of
+/// overload resolution.
+///
+/// Cand1's set of enable_if attributes is said to be "better" than Cand2's iff
+/// Cand1's first N enable_if attributes have precisely the same conditions as
+/// Cand2's first N enable_if attributes (where N = the number of enable_if
+/// attributes on Cand2), and Cand1 has more than N enable_if attributes.
+///
+/// Note that you can have a pair of candidates such that Cand1's enable_if
+/// attributes are worse than Cand2's, and Cand2's enable_if attributes are
+/// worse than Cand1's.
+static Comparison compareEnableIfAttrs(const Sema &S, const FunctionDecl *Cand1,
+                                       const FunctionDecl *Cand2) {
+  // Common case: One (or both) decls don't have enable_if attrs.
+  bool Cand1Attr = Cand1->hasAttr<EnableIfAttr>();
+  bool Cand2Attr = Cand2->hasAttr<EnableIfAttr>();
+  if (!Cand1Attr || !Cand2Attr) {
+    if (Cand1Attr == Cand2Attr)
+      return Comparison::Equal;
+    return Cand1Attr ? Comparison::Better : Comparison::Worse;
+  }
 
   // FIXME: The next several lines are just
   // specific_attr_iterator<EnableIfAttr> but going in declaration order,
@@ -8502,10 +8521,10 @@
   auto Cand1Attrs = getOrderedEnableIfAttrs(Cand1);
   auto Cand2Attrs = getOrderedEnableIfAttrs(Cand2);
 
-  // Candidate 1 is better if it has strictly more attributes and
-  // the common sequence is identical.
-  if (Cand1Attrs.size() <= Cand2Attrs.size())
-    return false;
+  // It's impossible for Cand1 to be better than (or equal to) Cand2 if Cand1
+  // has fewer enable_if attributes than Cand2.
+  if (Cand1Attrs.size() < Cand2Attrs.size())
+    return Comparison::Worse;
 
   auto Cand1I = Cand1Attrs.begin();
   llvm::FoldingSetNodeID Cand1ID, Cand2ID;
@@ -8517,10 +8536,10 @@
     Cand1A->getCond()->Profile(Cand1ID, S.getASTContext(), true);
     Cand2A->getCond()->Profile(Cand2ID, S.getASTContext(), true);
     if (Cand1ID != Cand2ID)
-      return false;
+      return Comparison::Worse;
   }
 
-  return true;
+  return Cand1I == Cand1Attrs.end() ? Comparison::Equal : Comparison::Better;
 }
 
 /// isBetterOverloadCandidate - Determines whether the first overload
@@ -8630,14 +8649,33 @@
       return BetterTemplate == Cand1.Function->getPrimaryTemplate();
   }
 
-  // Check for enable_if value-based overload resolution.
-  if (Cand1.Function && Cand2.Function &&
-      (Cand1.Function->hasAttr<EnableIfAttr>() ||
-       Cand2.Function->hasAttr<EnableIfAttr>()))
-    return hasBetterEnableIfAttrs(S, Cand1.Function, Cand2.Function);
+  // FIXME: Work around a defect in the C++17 inheriting constructor wording.
+  // A derived-class constructor beats an (inherited) base class constructor.
+  bool Cand1IsInherited =
+      dyn_cast_or_null<ConstructorUsingShadowDecl>(Cand1.FoundDecl.getDecl());
+  bool Cand2IsInherited =
+      dyn_cast_or_null<ConstructorUsingShadowDecl>(Cand2.FoundDecl.getDecl());
+  if (Cand1IsInherited != Cand2IsInherited)
+    return Cand2IsInherited;
+  else if (Cand1IsInherited) {
+    assert(Cand2IsInherited);
+    auto *Cand1Class = cast<CXXRecordDecl>(Cand1.Function->getDeclContext());
+    auto *Cand2Class = cast<CXXRecordDecl>(Cand2.Function->getDeclContext());
+    if (Cand1Class->isDerivedFrom(Cand2Class))
+      return true;
+    if (Cand2Class->isDerivedFrom(Cand1Class))
+      return false;
+    // Inherited from sibling base classes: still ambiguous.
+  }
 
-  if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads &&
-      Cand1.Function && Cand2.Function) {
+  // Check for enable_if value-based overload resolution.
+  if (Cand1.Function && Cand2.Function) {
+    Comparison Cmp = compareEnableIfAttrs(S, Cand1.Function, Cand2.Function);
+    if (Cmp != Comparison::Equal)
+      return Cmp == Comparison::Better;
+  }
+
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
     FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
     return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
            S.IdentifyCUDAPreference(Caller, Cand2.Function);
@@ -8731,14 +8769,44 @@
 OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
                                          iterator &Best,
                                          bool UserDefinedConversion) {
+  llvm::SmallVector<OverloadCandidate *, 16> Candidates;
+  std::transform(begin(), end(), std::back_inserter(Candidates),
+                 [](OverloadCandidate &Cand) { return &Cand; });
+
+  // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but
+  // are accepted by both clang and NVCC. However, during a particular
+  // compilation mode only one call variant is viable. We need to
+  // exclude non-viable overload candidates from consideration based
+  // only on their host/device attributes. Specifically, if one
+  // candidate call is WrongSide and the other is SameSide, we ignore
+  // the WrongSide candidate.
+  if (S.getLangOpts().CUDA) {
+    const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    bool ContainsSameSideCandidate =
+        llvm::any_of(Candidates, [&](OverloadCandidate *Cand) {
+          return Cand->Function &&
+                 S.IdentifyCUDAPreference(Caller, Cand->Function) ==
+                     Sema::CFP_SameSide;
+        });
+    if (ContainsSameSideCandidate) {
+      auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) {
+        return Cand->Function &&
+               S.IdentifyCUDAPreference(Caller, Cand->Function) ==
+                   Sema::CFP_WrongSide;
+      };
+      Candidates.erase(std::remove_if(Candidates.begin(), Candidates.end(),
+                                      IsWrongSideCandidate),
+                       Candidates.end());
+    }
+  }
+
   // Find the best viable function.
   Best = end();
-  for (iterator Cand = begin(); Cand != end(); ++Cand) {
+  for (auto *Cand : Candidates)
     if (Cand->Viable)
       if (Best == end() || isBetterOverloadCandidate(S, *Cand, *Best, Loc,
                                                      UserDefinedConversion))
         Best = Cand;
-  }
 
   // If we didn't find any viable functions, abort.
   if (Best == end())
@@ -8748,7 +8816,7 @@
 
   // Make sure that this function is better than every other viable
   // function. If not, we have an ambiguity.
-  for (iterator Cand = begin(); Cand != end(); ++Cand) {
+  for (auto *Cand : Candidates) {
     if (Cand->Viable &&
         Cand != Best &&
         !isBetterOverloadCandidate(S, *Best, *Cand, Loc,
@@ -8791,10 +8859,12 @@
   oc_implicit_move_constructor,
   oc_implicit_copy_assignment,
   oc_implicit_move_assignment,
-  oc_implicit_inherited_constructor
+  oc_inherited_constructor,
+  oc_inherited_constructor_template
 };
 
 OverloadCandidateKind ClassifyOverloadCandidate(Sema &S,
+                                                NamedDecl *Found,
                                                 FunctionDecl *Fn,
                                                 std::string &Description) {
   bool isTemplate = false;
@@ -8806,11 +8876,13 @@
   }
 
   if (CXXConstructorDecl *Ctor = dyn_cast<CXXConstructorDecl>(Fn)) {
-    if (!Ctor->isImplicit())
-      return isTemplate ? oc_constructor_template : oc_constructor;
-
-    if (Ctor->getInheritedConstructor())
-      return oc_implicit_inherited_constructor;
+    if (!Ctor->isImplicit()) {
+      if (isa<ConstructorUsingShadowDecl>(Found))
+        return isTemplate ? oc_inherited_constructor_template
+                          : oc_inherited_constructor;
+      else
+        return isTemplate ? oc_constructor_template : oc_constructor;
+    }
 
     if (Ctor->isDefaultConstructor())
       return oc_implicit_default_constructor;
@@ -8842,14 +8914,13 @@
   return isTemplate ? oc_function_template : oc_function;
 }
 
-void MaybeEmitInheritedConstructorNote(Sema &S, Decl *Fn) {
-  const CXXConstructorDecl *Ctor = dyn_cast<CXXConstructorDecl>(Fn);
-  if (!Ctor) return;
-
-  Ctor = Ctor->getInheritedConstructor();
-  if (!Ctor) return;
-
-  S.Diag(Ctor->getLocation(), diag::note_ovl_candidate_inherited_constructor);
+void MaybeEmitInheritedConstructorNote(Sema &S, Decl *FoundDecl) {
+  // FIXME: It'd be nice to only emit a note once per using-decl per overload
+  // set.
+  if (auto *Shadow = dyn_cast<ConstructorUsingShadowDecl>(FoundDecl))
+    S.Diag(FoundDecl->getLocation(),
+           diag::note_ovl_candidate_inherited_constructor)
+      << Shadow->getNominatedBaseClass();
 }
 
 } // end anonymous namespace
@@ -8888,8 +8959,9 @@
     return false;
   }
 
-  auto I = std::find_if(FD->param_begin(), FD->param_end(),
-                        std::mem_fn(&ParmVarDecl::hasAttr<PassObjectSizeAttr>));
+  auto I = llvm::find_if(FD->parameters(), [](const ParmVarDecl *P) {
+    return P->hasAttr<PassObjectSizeAttr>();
+  });
   if (I == FD->param_end())
     return true;
 
@@ -8923,19 +8995,19 @@
 }
 
 // Notes the location of an overload candidate.
-void Sema::NoteOverloadCandidate(FunctionDecl *Fn, QualType DestType,
-                                 bool TakingAddress) {
+void Sema::NoteOverloadCandidate(NamedDecl *Found, FunctionDecl *Fn,
+                                 QualType DestType, bool TakingAddress) {
   if (TakingAddress && !checkAddressOfCandidateIsAvailable(*this, Fn))
     return;
 
   std::string FnDesc;
-  OverloadCandidateKind K = ClassifyOverloadCandidate(*this, Fn, FnDesc);
+  OverloadCandidateKind K = ClassifyOverloadCandidate(*this, Found, Fn, FnDesc);
   PartialDiagnostic PD = PDiag(diag::note_ovl_candidate)
                              << (unsigned) K << FnDesc;
 
   HandleFunctionTypeMismatch(PD, Fn->getType(), DestType);
   Diag(Fn->getLocation(), PD);
-  MaybeEmitInheritedConstructorNote(*this, Fn);
+  MaybeEmitInheritedConstructorNote(*this, Found);
 }
 
 // Notes the location of all overload candidates designated through
@@ -8952,11 +9024,11 @@
        I != IEnd; ++I) {
     if (FunctionTemplateDecl *FunTmpl = 
                 dyn_cast<FunctionTemplateDecl>((*I)->getUnderlyingDecl()) ) {
-      NoteOverloadCandidate(FunTmpl->getTemplatedDecl(), DestType,
+      NoteOverloadCandidate(*I, FunTmpl->getTemplatedDecl(), DestType,
                             TakingAddress);
     } else if (FunctionDecl *Fun 
                       = dyn_cast<FunctionDecl>((*I)->getUnderlyingDecl()) ) {
-      NoteOverloadCandidate(Fun, DestType, TakingAddress);
+      NoteOverloadCandidate(*I, Fun, DestType, TakingAddress);
     }
   }
 }
@@ -8980,7 +9052,7 @@
     if (CandsShown >= 4 && ShowOverloads == Ovl_Best)
       break;
     ++CandsShown;
-    S.NoteOverloadCandidate(*I);
+    S.NoteOverloadCandidate(I->first, I->second);
   }
   if (I != E)
     S.Diag(SourceLocation(), diag::note_ovl_too_many_candidates) << int(E - I);
@@ -9005,7 +9077,8 @@
   }
 
   std::string FnDesc;
-  OverloadCandidateKind FnKind = ClassifyOverloadCandidate(S, Fn, FnDesc);
+  OverloadCandidateKind FnKind =
+      ClassifyOverloadCandidate(S, Cand->FoundDecl, Fn, FnDesc);
 
   Expr *FromExpr = Conv.Bad.FromExpr;
   QualType FromTy = Conv.Bad.getFromType();
@@ -9022,7 +9095,7 @@
       << (unsigned) FnKind << FnDesc
       << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
       << ToTy << Name << I+1;
-    MaybeEmitInheritedConstructorNote(S, Fn);
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
@@ -9035,8 +9108,10 @@
   else {
     // TODO: detect and diagnose the full richness of const mismatches.
     if (CanQual<PointerType> FromPT = CFromTy->getAs<PointerType>())
-      if (CanQual<PointerType> ToPT = CToTy->getAs<PointerType>())
-        CFromTy = FromPT->getPointeeType(), CToTy = ToPT->getPointeeType();
+      if (CanQual<PointerType> ToPT = CToTy->getAs<PointerType>()) {
+        CFromTy = FromPT->getPointeeType();
+        CToTy = ToPT->getPointeeType();
+      }
   }
 
   if (CToTy.getUnqualifiedType() == CFromTy.getUnqualifiedType() &&
@@ -9051,7 +9126,7 @@
         << FromTy
         << FromQs.getAddressSpace() << ToQs.getAddressSpace()
         << (unsigned) isObjectArgument << I+1;
-      MaybeEmitInheritedConstructorNote(S, Fn);
+      MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
       return;
     }
 
@@ -9062,7 +9137,7 @@
         << FromTy
         << FromQs.getObjCLifetime() << ToQs.getObjCLifetime()
         << (unsigned) isObjectArgument << I+1;
-      MaybeEmitInheritedConstructorNote(S, Fn);
+      MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
       return;
     }
 
@@ -9073,7 +9148,16 @@
       << FromTy
       << FromQs.getObjCGCAttr() << ToQs.getObjCGCAttr()
       << (unsigned) isObjectArgument << I+1;
-      MaybeEmitInheritedConstructorNote(S, Fn);
+      MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
+      return;
+    }
+
+    if (FromQs.hasUnaligned() != ToQs.hasUnaligned()) {
+      S.Diag(Fn->getLocation(), diag::note_ovl_candidate_bad_unaligned)
+        << (unsigned) FnKind << FnDesc
+        << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
+        << FromTy << FromQs.hasUnaligned() << I+1;
+      MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
       return;
     }
 
@@ -9091,7 +9175,7 @@
         << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
         << FromTy << (CVR - 1) << I+1;
     }
-    MaybeEmitInheritedConstructorNote(S, Fn);
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
@@ -9102,7 +9186,7 @@
       << (unsigned) FnKind << FnDesc
       << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
       << FromTy << ToTy << (unsigned) isObjectArgument << I+1;
-    MaybeEmitInheritedConstructorNote(S, Fn);
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
@@ -9113,11 +9197,14 @@
   if (const PointerType *PTy = TempFromTy->getAs<PointerType>())
     TempFromTy = PTy->getPointeeType();
   if (TempFromTy->isIncompleteType()) {
+    // Emit the generic diagnostic and, optionally, add the hints to it.
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_bad_conv_incomplete)
       << (unsigned) FnKind << FnDesc
       << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
-      << FromTy << ToTy << (unsigned) isObjectArgument << I+1;
-    MaybeEmitInheritedConstructorNote(S, Fn);
+      << FromTy << ToTy << (unsigned) isObjectArgument << I+1
+      << (unsigned) (Cand->Fix.Kind);
+      
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
@@ -9156,7 +9243,7 @@
         << (unsigned) FnKind << FnDesc
         << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
         << (unsigned) isObjectArgument << I + 1;
-      MaybeEmitInheritedConstructorNote(S, Fn);
+      MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
       return;
     }
   }
@@ -9168,7 +9255,7 @@
       << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
       << (BaseToDerivedConversion - 1)
       << FromTy << ToTy << I+1;
-    MaybeEmitInheritedConstructorNote(S, Fn);
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
@@ -9181,7 +9268,7 @@
         << (unsigned) FnKind << FnDesc
         << (FromExpr ? FromExpr->getSourceRange() : SourceRange())
         << FromTy << ToTy << (unsigned) isObjectArgument << I+1;
-        MaybeEmitInheritedConstructorNote(S, Fn);
+        MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
         return;
       }
   }
@@ -9203,7 +9290,7 @@
     FDiag << *HI;
   S.Diag(Fn->getLocation(), FDiag);
 
-  MaybeEmitInheritedConstructorNote(S, Fn);
+  MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
 }
 
 /// Additional arity mismatch diagnosis specific to a function overload
@@ -9237,7 +9324,8 @@
 }
 
 /// General arity mismatch diagnosis over a candidate in a candidate set.
-static void DiagnoseArityMismatch(Sema &S, Decl *D, unsigned NumFormalArgs) {
+static void DiagnoseArityMismatch(Sema &S, NamedDecl *Found, Decl *D,
+                                  unsigned NumFormalArgs) {
   assert(isa<FunctionDecl>(D) &&
       "The templated declaration should at least be a function"
       " when diagnosing bad template argument deduction due to too many"
@@ -9267,7 +9355,8 @@
   }
 
   std::string Description;
-  OverloadCandidateKind FnKind = ClassifyOverloadCandidate(S, Fn, Description);
+  OverloadCandidateKind FnKind =
+      ClassifyOverloadCandidate(S, Found, Fn, Description);
 
   if (modeCount == 1 && Fn->getParamDecl(0)->getDeclName())
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_arity_one)
@@ -9277,28 +9366,25 @@
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_arity)
       << (unsigned) FnKind << (Fn->getDescribedFunctionTemplate() != nullptr)
       << mode << modeCount << NumFormalArgs;
-  MaybeEmitInheritedConstructorNote(S, Fn);
+  MaybeEmitInheritedConstructorNote(S, Found);
 }
 
 /// Arity mismatch diagnosis specific to a function overload candidate.
 static void DiagnoseArityMismatch(Sema &S, OverloadCandidate *Cand,
                                   unsigned NumFormalArgs) {
   if (!CheckArityMismatch(S, Cand, NumFormalArgs))
-    DiagnoseArityMismatch(S, Cand->Function, NumFormalArgs);
+    DiagnoseArityMismatch(S, Cand->FoundDecl, Cand->Function, NumFormalArgs);
 }
 
 static TemplateDecl *getDescribedTemplate(Decl *Templated) {
-  if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Templated))
-    return FD->getDescribedFunctionTemplate();
-  else if (CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(Templated))
-    return RD->getDescribedClassTemplate();
-
+  if (TemplateDecl *TD = Templated->getDescribedTemplate())
+    return TD;
   llvm_unreachable("Unsupported: Getting the described template declaration"
                    " for bad deduction diagnosis");
 }
 
 /// Diagnose a failed template-argument deduction.
-static void DiagnoseBadDeduction(Sema &S, Decl *Templated,
+static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
                                  DeductionFailureInfo &DeductionFailure,
                                  unsigned NumArgs,
                                  bool TakingCandidateAddress) {
@@ -9316,7 +9402,7 @@
     S.Diag(Templated->getLocation(),
            diag::note_ovl_candidate_incomplete_deduction)
         << ParamD->getDeclName();
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
   }
 
@@ -9341,7 +9427,7 @@
 
     S.Diag(Templated->getLocation(), diag::note_ovl_candidate_underqualified)
         << ParamD->getDeclName() << Arg << NonCanonParam;
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
   }
 
@@ -9360,7 +9446,7 @@
            diag::note_ovl_candidate_inconsistent_deduction)
         << which << ParamD->getDeclName() << *DeductionFailure.getFirstArg()
         << *DeductionFailure.getSecondArg();
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
   }
 
@@ -9383,18 +9469,18 @@
              diag::note_ovl_candidate_explicit_arg_mismatch_unnamed)
           << (index + 1);
     }
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
 
   case Sema::TDK_TooManyArguments:
   case Sema::TDK_TooFewArguments:
-    DiagnoseArityMismatch(S, Templated, NumArgs);
+    DiagnoseArityMismatch(S, Found, Templated, NumArgs);
     return;
 
   case Sema::TDK_InstantiationDepth:
     S.Diag(Templated->getLocation(),
            diag::note_ovl_candidate_instantiation_depth);
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
 
   case Sema::TDK_SubstitutionFailure: {
@@ -9432,7 +9518,7 @@
     S.Diag(Templated->getLocation(),
            diag::note_ovl_candidate_substitution_failure)
         << TemplateArgString << SFINAEArgString << R;
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
   }
 
@@ -9504,7 +9590,7 @@
   // note_ovl_candidate_bad_deduction, which is uselessly vague.
   case Sema::TDK_MiscellaneousDeductionFailure:
     S.Diag(Templated->getLocation(), diag::note_ovl_candidate_bad_deduction);
-    MaybeEmitInheritedConstructorNote(S, Templated);
+    MaybeEmitInheritedConstructorNote(S, Found);
     return;
   }
 }
@@ -9518,7 +9604,7 @@
     if (CheckArityMismatch(S, Cand, NumArgs))
       return;
   }
-  DiagnoseBadDeduction(S, Cand->Function, // pattern
+  DiagnoseBadDeduction(S, Cand->FoundDecl, Cand->Function, // pattern
                        Cand->DeductionFailure, NumArgs, TakingCandidateAddress);
 }
 
@@ -9531,7 +9617,8 @@
                            CalleeTarget = S.IdentifyCUDATarget(Callee);
 
   std::string FnDesc;
-  OverloadCandidateKind FnKind = ClassifyOverloadCandidate(S, Callee, FnDesc);
+  OverloadCandidateKind FnKind =
+      ClassifyOverloadCandidate(S, Cand->FoundDecl, Callee, FnDesc);
 
   S.Diag(Callee->getLocation(), diag::note_ovl_candidate_bad_target)
       << (unsigned)FnKind << CalleeTarget << CallerTarget;
@@ -9608,18 +9695,19 @@
   if (Cand->Viable && (Fn->isDeleted() ||
       S.isFunctionConsideredUnavailable(Fn))) {
     std::string FnDesc;
-    OverloadCandidateKind FnKind = ClassifyOverloadCandidate(S, Fn, FnDesc);
+    OverloadCandidateKind FnKind =
+        ClassifyOverloadCandidate(S, Cand->FoundDecl, Fn, FnDesc);
 
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_deleted)
       << FnKind << FnDesc
       << (Fn->isDeleted() ? (Fn->isDeletedAsWritten() ? 1 : 2) : 0);
-    MaybeEmitInheritedConstructorNote(S, Fn);
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
   // We don't really have anything else to say about viable candidates.
   if (Cand->Viable) {
-    S.NoteOverloadCandidate(Fn);
+    S.NoteOverloadCandidate(Cand->FoundDecl, Fn);
     return;
   }
 
@@ -9629,19 +9717,20 @@
     return DiagnoseArityMismatch(S, Cand, NumArgs);
 
   case ovl_fail_bad_deduction:
-    return DiagnoseBadDeduction(S, Cand, NumArgs, TakingCandidateAddress);
+    return DiagnoseBadDeduction(S, Cand, NumArgs,
+                                TakingCandidateAddress);
 
   case ovl_fail_illegal_constructor: {
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_illegal_constructor)
       << (Fn->getPrimaryTemplate() ? 1 : 0);
-    MaybeEmitInheritedConstructorNote(S, Fn);
+    MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
     return;
   }
 
   case ovl_fail_trivial_conversion:
   case ovl_fail_bad_final_conversion:
   case ovl_fail_final_conversion_not_exact:
-    return S.NoteOverloadCandidate(Fn);
+    return S.NoteOverloadCandidate(Cand->FoundDecl, Fn);
 
   case ovl_fail_bad_conversion: {
     unsigned I = (Cand->IgnoreObjectArgument ? 1 : 0);
@@ -9652,7 +9741,7 @@
     // FIXME: this currently happens when we're called from SemaInit
     // when user-conversion overload fails.  Figure out how to handle
     // those conditions and diagnose them well.
-    return S.NoteOverloadCandidate(Fn);
+    return S.NoteOverloadCandidate(Cand->FoundDecl, Fn);
   }
 
   case ovl_fail_bad_target:
@@ -9700,7 +9789,6 @@
 
   S.Diag(Cand->Surrogate->getLocation(), diag::note_ovl_surrogate_cand)
     << FnType;
-  MaybeEmitInheritedConstructorNote(S, Cand->Surrogate);
 }
 
 static void NoteBuiltinOperatorCandidate(Sema &S, StringRef Opc,
@@ -9730,8 +9818,8 @@
     if (ICS.isBad()) break; // all meaningless after first invalid
     if (!ICS.isAmbiguous()) continue;
 
-    ICS.DiagnoseAmbiguousConversion(S, OpLoc,
-                              S.PDiag(diag::note_ambiguous_type_conversion));
+    ICS.DiagnoseAmbiguousConversion(
+        S, OpLoc, S.PDiag(diag::note_ambiguous_type_conversion));
   }
 }
 
@@ -10098,7 +10186,7 @@
 /// deductions.
 void TemplateSpecCandidate::NoteDeductionFailure(Sema &S,
                                                  bool ForTakingAddress) {
-  DiagnoseBadDeduction(S, Specialization, // pattern
+  DiagnoseBadDeduction(S, FoundDecl, Specialization, // pattern
                        DeductionFailure, /*NumArgs=*/0, ForTakingAddress);
 }
 
@@ -10261,21 +10349,32 @@
       }
     }
 
-    if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads &&
-        Matches.size() > 1)
+    if (S.getLangOpts().CUDA && Matches.size() > 1)
       EliminateSuboptimalCudaMatches();
   }
 
   bool hasComplained() const { return HasComplained; }
 
 private:
-  // Is A considered a better overload candidate for the desired type than B?
-  bool isBetterCandidate(const FunctionDecl *A, const FunctionDecl *B) {
-    return hasBetterEnableIfAttrs(S, A, B);
+  bool candidateHasExactlyCorrectType(const FunctionDecl *FD) {
+    QualType Discard;
+    return Context.hasSameUnqualifiedType(TargetFunctionType, FD->getType()) ||
+           S.IsNoReturnConversion(FD->getType(), TargetFunctionType, Discard);
   }
 
-  // Returns true if we've eliminated any (read: all but one) candidates, false
-  // otherwise.
+  /// \return true if A is considered a better overload candidate for the
+  /// desired type than B.
+  bool isBetterCandidate(const FunctionDecl *A, const FunctionDecl *B) {
+    // If A doesn't have exactly the correct type, we don't want to classify it
+    // as "better" than anything else. This way, the user is required to
+    // disambiguate for us if there are multiple candidates and no exact match.
+    return candidateHasExactlyCorrectType(A) &&
+           (!candidateHasExactlyCorrectType(B) ||
+            compareEnableIfAttrs(S, A, B) == Comparison::Better);
+  }
+
+  /// \return true if we were able to eliminate all but one overload candidate,
+  /// false otherwise.
   bool eliminiateSuboptimalOverloadCandidates() {
     // Same algorithm as overload resolution -- one pass to pick the "best",
     // another pass to be sure that nothing is better than the best.
@@ -10340,7 +10439,7 @@
                                       Info, /*InOverloadResolution=*/true)) {
       // Make a note of the failed deduction for diagnostics.
       FailedCandidates.addCandidate()
-          .set(FunctionTemplate->getTemplatedDecl(),
+          .set(CurAccessFunPair, FunctionTemplate->getTemplatedDecl(),
                MakeDeductionFailureInfo(Context, Result, Info));
       return false;
     } 
@@ -10348,7 +10447,6 @@
     // Template argument deduction ensures that we have an exact match or
     // compatible pointer-to-function arguments that would be adjusted by ICS.
     // This function template specicalization works.
-    Specialization = cast<FunctionDecl>(Specialization->getCanonicalDecl());
     assert(S.isSameOrCompatibleFunctionType(
               Context.getCanonicalType(Specialization->getType()),
               Context.getCanonicalType(TargetFunctionType)));
@@ -10374,7 +10472,7 @@
     if (FunctionDecl *FunDecl = dyn_cast<FunctionDecl>(Fn)) {
       if (S.getLangOpts().CUDA)
         if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext))
-          if (!Caller->isImplicit() && S.CheckCUDATarget(Caller, FunDecl))
+          if (!Caller->isImplicit() && !S.IsAllowedCUDACall(Caller, FunDecl))
             return false;
 
       // If any candidate has a placeholder return type, trigger its deduction
@@ -10389,12 +10487,9 @@
       if (!S.checkAddressOfFunctionIsAvailable(FunDecl))
         return false;
 
-      QualType ResultTy;
-      if (Context.hasSameUnqualifiedType(TargetFunctionType,
-                                         FunDecl->getType()) ||
-          S.IsNoReturnConversion(FunDecl->getType(), TargetFunctionType,
-                                 ResultTy) ||
-          (!S.getLangOpts().CPlusPlus && TargetType->isVoidPointerType())) {
+      // If we're in C, we need to support types that aren't exactly identical.
+      if (!S.getLangOpts().CPlusPlus ||
+          candidateHasExactlyCorrectType(FunDecl)) {
         Matches.push_back(std::make_pair(
             CurAccessFunPair, cast<FunctionDecl>(FunDecl->getCanonicalDecl())));
         FoundNonTemplateFunction = true;
@@ -10460,9 +10555,10 @@
     UnresolvedSetIterator Result = S.getMostSpecialized(
         MatchesCopy.begin(), MatchesCopy.end(), FailedCandidates,
         SourceExpr->getLocStart(), S.PDiag(),
-        S.PDiag(diag::err_addr_ovl_ambiguous) << Matches[0]
-                                                     .second->getDeclName(),
-        S.PDiag(diag::note_ovl_candidate) << (unsigned)oc_function_template,
+        S.PDiag(diag::err_addr_ovl_ambiguous)
+          << Matches[0].second->getDeclName(),
+        S.PDiag(diag::note_ovl_candidate)
+          << (unsigned)oc_function_template,
         Complain, TargetFunctionType);
 
     if (Result != MatchesCopy.end()) {
@@ -10510,7 +10606,7 @@
         if (FunctionDecl *Fun =
                 dyn_cast<FunctionDecl>((*I)->getUnderlyingDecl()))
           if (!functionHasPassObjectSizeParams(Fun))
-            S.NoteOverloadCandidate(Fun, TargetFunctionType,
+            S.NoteOverloadCandidate(*I, Fun, TargetFunctionType,
                                     /*TakingAddress=*/true);
       FailedCandidates.NoteCandidates(S, OvlExpr->getLocStart());
     }
@@ -10658,6 +10754,36 @@
   return Result;
 }
 
+/// \brief Given an overloaded function, tries to turn it into a non-overloaded
+/// function reference using resolveAddressOfOnlyViableOverloadCandidate. This
+/// will perform access checks, diagnose the use of the resultant decl, and, if
+/// necessary, perform a function-to-pointer decay.
+///
+/// Returns false if resolveAddressOfOnlyViableOverloadCandidate fails.
+/// Otherwise, returns true. This may emit diagnostics and return true.
+bool Sema::resolveAndFixAddressOfOnlyViableOverloadCandidate(
+    ExprResult &SrcExpr) {
+  Expr *E = SrcExpr.get();
+  assert(E->getType() == Context.OverloadTy && "SrcExpr must be an overload");
+
+  DeclAccessPair DAP;
+  FunctionDecl *Found = resolveAddressOfOnlyViableOverloadCandidate(E, DAP);
+  if (!Found)
+    return false;
+
+  // Emitting multiple diagnostics for a function that is both inaccessible and
+  // unavailable is consistent with our behavior elsewhere. So, always check
+  // for both.
+  DiagnoseUseOfDecl(Found, E->getExprLoc());
+  CheckAddressOfMemberAccess(E, DAP);
+  Expr *Fixed = FixOverloadedFunctionReference(E, DAP, Found);
+  if (Fixed->getType()->isFunctionType())
+    SrcExpr = DefaultFunctionArrayConversion(Fixed, /*Diagnose=*/false);
+  else
+    SrcExpr = Fixed;
+  return true;
+}
+
 /// \brief Given an expression that refers to an overloaded function, try to
 /// resolve that overloaded function expression down to a single function.
 ///
@@ -10716,7 +10842,7 @@
       // Make a note of the failed deduction for diagnostics.
       // TODO: Actually use the failed-deduction info?
       FailedCandidates.addCandidate()
-          .set(FunctionTemplate->getTemplatedDecl(),
+          .set(I.getPair(), FunctionTemplate->getTemplatedDecl(),
                MakeDeductionFailureInfo(Context, Result, Info));
       continue;
     }
@@ -12205,18 +12331,6 @@
     new (Context) CXXMemberCallExpr(Context, MemExprE, Args,
                                     ResultType, VK, RParenLoc);
 
-  // (CUDA B.1): Check for invalid calls between targets.
-  if (getLangOpts().CUDA) {
-    if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext)) {
-      if (CheckCUDATarget(Caller, Method)) {
-        Diag(MemExpr->getMemberLoc(), diag::err_ref_bad_target)
-            << IdentifyCUDATarget(Method) << Method->getIdentifier()
-            << IdentifyCUDATarget(Caller);
-        return ExprError();
-      }
-    }
-  }
-
   // Check for a valid return type.
   if (CheckCallReturnType(Method->getReturnType(), MemExpr->getMemberLoc(),
                           TheCall, Method))
@@ -12285,8 +12399,7 @@
   if (CXXDestructorDecl *DD =
           dyn_cast<CXXDestructorDecl>(TheCall->getMethodDecl())) {
     // a->A::f() doesn't go through the vtable, except in AppleKext mode.
-    bool CallCanBeVirtual = !cast<MemberExpr>(NakedMemExpr)->hasQualifier() ||
-                            getLangOpts().AppleKext;
+    bool CallCanBeVirtual = !MemExpr->hasQualifier() || getLangOpts().AppleKext;
     CheckVirtualDtorCall(DD, MemExpr->getLocStart(), /*IsDelete=*/false,
                          CallCanBeVirtual, /*WarnOnNonAbstractTypes=*/true,
                          MemExpr->getMemberLoc());
@@ -12899,6 +13012,9 @@
           = Context.getTypeDeclType(cast<RecordDecl>(Method->getDeclContext()));
         QualType MemPtrType
           = Context.getMemberPointerType(Fn->getType(), ClassType.getTypePtr());
+        // Under the MS ABI, lock down the inheritance model now.
+        if (Context.getTargetInfo().getCXXABI().isMicrosoft())
+          (void)isCompleteType(UnOp->getOperatorLoc(), MemPtrType);
 
         return new (Context) UnaryOperator(SubExpr, UO_AddrOf, MemPtrType,
                                            VK_RValue, OK_Ordinary,
diff --git a/lib/Sema/SemaStmt.cpp b/lib/Sema/SemaStmt.cpp
index a6921b3..0ba1e6b 100644
--- a/lib/Sema/SemaStmt.cpp
+++ b/lib/Sema/SemaStmt.cpp
@@ -37,6 +37,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+
 using namespace clang;
 using namespace sema;
 
@@ -249,10 +250,10 @@
     // is written in a macro body, only warn if it has the warn_unused_result
     // attribute.
     if (const Decl *FD = CE->getCalleeDecl()) {
-      const FunctionDecl *Func = dyn_cast<FunctionDecl>(FD);
-      if (Func ? Func->hasUnusedResultAttr()
-               : FD->hasAttr<WarnUnusedResultAttr>()) {
-        Diag(Loc, diag::warn_unused_result) << R1 << R2;
+      if (const Attr *A = isa<FunctionDecl>(FD)
+                              ? cast<FunctionDecl>(FD)->getUnusedResultAttr()
+                              : FD->getAttr<WarnUnusedResultAttr>()) {
+        Diag(Loc, diag::warn_unused_result) << A << R1 << R2;
         return;
       }
       if (ShouldSuppress)
@@ -276,8 +277,8 @@
     }
     const ObjCMethodDecl *MD = ME->getMethodDecl();
     if (MD) {
-      if (MD->hasAttr<WarnUnusedResultAttr>()) {
-        Diag(Loc, diag::warn_unused_result) << R1 << R2;
+      if (const auto *A = MD->getAttr<WarnUnusedResultAttr>()) {
+        Diag(Loc, diag::warn_unused_result) << A << R1 << R2;
         return;
       }
     }
@@ -488,36 +489,62 @@
   return LS;
 }
 
+namespace {
+class CommaVisitor : public EvaluatedExprVisitor<CommaVisitor> {
+  typedef EvaluatedExprVisitor<CommaVisitor> Inherited;
+  Sema &SemaRef;
+public:
+  CommaVisitor(Sema &SemaRef) : Inherited(SemaRef.Context), SemaRef(SemaRef) {}
+  void VisitBinaryOperator(BinaryOperator *E) {
+    if (E->getOpcode() == BO_Comma)
+      SemaRef.DiagnoseCommaOperator(E->getLHS(), E->getExprLoc());
+    EvaluatedExprVisitor<CommaVisitor>::VisitBinaryOperator(E);
+  }
+};
+}
+
 StmtResult
-Sema::ActOnIfStmt(SourceLocation IfLoc, FullExprArg CondVal, Decl *CondVar,
+Sema::ActOnIfStmt(SourceLocation IfLoc, bool IsConstexpr, Stmt *InitStmt,
+                  ConditionResult Cond,
                   Stmt *thenStmt, SourceLocation ElseLoc,
                   Stmt *elseStmt) {
-  ExprResult CondResult(CondVal.release());
+  if (Cond.isInvalid())
+    Cond = ConditionResult(
+        *this, nullptr,
+        MakeFullExpr(new (Context) OpaqueValueExpr(SourceLocation(),
+                                                   Context.BoolTy, VK_RValue),
+                     IfLoc),
+        false);
 
-  VarDecl *ConditionVar = nullptr;
-  if (CondVar) {
-    ConditionVar = cast<VarDecl>(CondVar);
-    CondResult = CheckConditionVariable(ConditionVar, IfLoc, true);
-    CondResult = ActOnFinishFullExpr(CondResult.get(), IfLoc);
-  }
-  Expr *ConditionExpr = CondResult.getAs<Expr>();
-  if (ConditionExpr) {
-    DiagnoseUnusedExprResult(thenStmt);
+  Expr *CondExpr = Cond.get().second;
+  if (!Diags.isIgnored(diag::warn_comma_operator,
+                       CondExpr->getExprLoc()))
+    CommaVisitor(*this).Visit(CondExpr);
 
-    if (!elseStmt) {
-      DiagnoseEmptyStmtBody(ConditionExpr->getLocEnd(), thenStmt,
-                            diag::warn_empty_if_body);
-    }
+  if (!elseStmt)
+    DiagnoseEmptyStmtBody(CondExpr->getLocEnd(), thenStmt,
+                          diag::warn_empty_if_body);
 
-    DiagnoseUnusedExprResult(elseStmt);
-  } else {
-    // Create a dummy Expr for the condition for error recovery
-    ConditionExpr = new (Context) OpaqueValueExpr(SourceLocation(),
-                                                  Context.BoolTy, VK_RValue);
-  }
+  return BuildIfStmt(IfLoc, IsConstexpr, InitStmt, Cond, thenStmt, ElseLoc,
+                     elseStmt);
+}
 
-  return new (Context) IfStmt(Context, IfLoc, ConditionVar, ConditionExpr,
-                              thenStmt, ElseLoc, elseStmt);
+StmtResult Sema::BuildIfStmt(SourceLocation IfLoc, bool IsConstexpr,
+                             Stmt *InitStmt, ConditionResult Cond,
+                             Stmt *thenStmt, SourceLocation ElseLoc,
+                             Stmt *elseStmt) {
+  if (Cond.isInvalid())
+    return StmtError();
+
+  if (IsConstexpr || isa<ObjCAvailabilityCheckExpr>(Cond.get().second))
+    getCurFunction()->setHasBranchProtectedScope();
+
+  DiagnoseUnusedExprResult(thenStmt);
+  DiagnoseUnusedExprResult(elseStmt);
+
+  return new (Context)
+      IfStmt(Context, IfLoc, IsConstexpr, InitStmt, Cond.get().first,
+             Cond.get().second, thenStmt, ElseLoc, elseStmt);
 }
 
 namespace {
@@ -579,24 +606,7 @@
   return expr->getType();
 }
 
-StmtResult
-Sema::ActOnStartOfSwitchStmt(SourceLocation SwitchLoc, Expr *Cond,
-                             Decl *CondVar) {
-  ExprResult CondResult;
-
-  VarDecl *ConditionVar = nullptr;
-  if (CondVar) {
-    ConditionVar = cast<VarDecl>(CondVar);
-    CondResult = CheckConditionVariable(ConditionVar, SourceLocation(), false);
-    if (CondResult.isInvalid())
-      return StmtError();
-
-    Cond = CondResult.get();
-  }
-
-  if (!Cond)
-    return StmtError();
-
+ExprResult Sema::CheckSwitchCondition(SourceLocation SwitchLoc, Expr *Cond) {
   class SwitchConvertDiagnoser : public ICEConvertDiagnoser {
     Expr *Cond;
 
@@ -644,24 +654,24 @@
     }
   } SwitchDiagnoser(Cond);
 
-  CondResult =
+  ExprResult CondResult =
       PerformContextualImplicitConversion(SwitchLoc, Cond, SwitchDiagnoser);
-  if (CondResult.isInvalid()) return StmtError();
-  Cond = CondResult.get();
+  if (CondResult.isInvalid())
+    return ExprError();
 
   // C99 6.8.4.2p5 - Integer promotions are performed on the controlling expr.
-  CondResult = UsualUnaryConversions(Cond);
-  if (CondResult.isInvalid()) return StmtError();
-  Cond = CondResult.get();
+  return UsualUnaryConversions(CondResult.get());
+}
 
-  CondResult = ActOnFinishFullExpr(Cond, SwitchLoc);
-  if (CondResult.isInvalid())
+StmtResult Sema::ActOnStartOfSwitchStmt(SourceLocation SwitchLoc,
+                                        Stmt *InitStmt, ConditionResult Cond) {
+  if (Cond.isInvalid())
     return StmtError();
-  Cond = CondResult.get();
 
   getCurFunction()->setHasBranchIntoScope();
 
-  SwitchStmt *SS = new (Context) SwitchStmt(Context, ConditionVar, Cond);
+  SwitchStmt *SS = new (Context)
+      SwitchStmt(Context, InitStmt, Cond.get().first, Cond.get().second);
   getCurFunction()->SwitchStack.push_back(SS);
   return SS;
 }
@@ -980,7 +990,8 @@
             << SourceRange(CR->getLHS()->getLocStart(),
                            Hi->getLocEnd());
           CaseRanges.erase(CaseRanges.begin()+i);
-          --i, --e;
+          --i;
+          --e;
           continue;
         }
 
@@ -1221,23 +1232,17 @@
     }
 }
 
-StmtResult
-Sema::ActOnWhileStmt(SourceLocation WhileLoc, FullExprArg Cond,
-                     Decl *CondVar, Stmt *Body) {
-  ExprResult CondResult(Cond.release());
-
-  VarDecl *ConditionVar = nullptr;
-  if (CondVar) {
-    ConditionVar = cast<VarDecl>(CondVar);
-    CondResult = CheckConditionVariable(ConditionVar, WhileLoc, true);
-    CondResult = ActOnFinishFullExpr(CondResult.get(), WhileLoc);
-    if (CondResult.isInvalid())
-      return StmtError();
-  }
-  Expr *ConditionExpr = CondResult.get();
-  if (!ConditionExpr)
+StmtResult Sema::ActOnWhileStmt(SourceLocation WhileLoc, ConditionResult Cond,
+                                Stmt *Body) {
+  if (Cond.isInvalid())
     return StmtError();
-  CheckBreakContinueBinding(ConditionExpr);
+
+  auto CondVal = Cond.get();
+  CheckBreakContinueBinding(CondVal.second);
+
+  if (CondVal.second &&
+      !Diags.isIgnored(diag::warn_comma_operator, CondVal.second->getExprLoc()))
+    CommaVisitor(*this).Visit(CondVal.second);
 
   DiagnoseUnusedExprResult(Body);
 
@@ -1245,7 +1250,7 @@
     getCurCompoundScope().setHasEmptyLoopBodies();
 
   return new (Context)
-      WhileStmt(Context, ConditionVar, ConditionExpr, Body, WhileLoc);
+      WhileStmt(Context, CondVal.first, CondVal.second, Body, WhileLoc);
 }
 
 StmtResult
@@ -1255,7 +1260,7 @@
   assert(Cond && "ActOnDoStmt(): missing expression");
 
   CheckBreakContinueBinding(Cond);
-  ExprResult CondResult = CheckBooleanCondition(Cond, DoLoc);
+  ExprResult CondResult = CheckBooleanCondition(DoLoc, Cond);
   if (CondResult.isInvalid())
     return StmtError();
   Cond = CondResult.get();
@@ -1493,6 +1498,10 @@
   // variables Increment and DRE.
   bool ProcessIterationStmt(Sema &S, Stmt* Statement, bool &Increment,
                             DeclRefExpr *&DRE) {
+    if (auto Cleanups = dyn_cast<ExprWithCleanups>(Statement))
+      if (!Cleanups->cleanupsHaveSideEffects())
+        Statement = Cleanups->getSubExpr();
+
     if (UnaryOperator *UO = dyn_cast<UnaryOperator>(Statement)) {
       switch (UO->getOpcode()) {
         default: return false;
@@ -1615,11 +1624,13 @@
   }
 }
 
-StmtResult
-Sema::ActOnForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
-                   Stmt *First, FullExprArg second, Decl *secondVar,
-                   FullExprArg third,
-                   SourceLocation RParenLoc, Stmt *Body) {
+StmtResult Sema::ActOnForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
+                              Stmt *First, ConditionResult Second,
+                              FullExprArg third, SourceLocation RParenLoc,
+                              Stmt *Body) {
+  if (Second.isInvalid())
+    return StmtError();
+
   if (!getLangOpts().CPlusPlus) {
     if (DeclStmt *DS = dyn_cast_or_null<DeclStmt>(First)) {
       // C99 6.8.5p3: The declaration part of a 'for' statement shall only
@@ -1637,21 +1648,18 @@
     }
   }
 
-  CheckBreakContinueBinding(second.get());
+  CheckBreakContinueBinding(Second.get().second);
   CheckBreakContinueBinding(third.get());
 
-  CheckForLoopConditionalStatement(*this, second.get(), third.get(), Body);
+  if (!Second.get().first)
+    CheckForLoopConditionalStatement(*this, Second.get().second, third.get(),
+                                     Body);
   CheckForRedundantIteration(*this, third.get(), Body);
 
-  ExprResult SecondResult(second.release());
-  VarDecl *ConditionVar = nullptr;
-  if (secondVar) {
-    ConditionVar = cast<VarDecl>(secondVar);
-    SecondResult = CheckConditionVariable(ConditionVar, ForLoc, true);
-    SecondResult = ActOnFinishFullExpr(SecondResult.get(), ForLoc);
-    if (SecondResult.isInvalid())
-      return StmtError();
-  }
+  if (Second.get().second &&
+      !Diags.isIgnored(diag::warn_comma_operator,
+                       Second.get().second->getExprLoc()))
+    CommaVisitor(*this).Visit(Second.get().second);
 
   Expr *Third  = third.release().getAs<Expr>();
 
@@ -1662,8 +1670,9 @@
   if (isa<NullStmt>(Body))
     getCurCompoundScope().setHasEmptyLoopBodies();
 
-  return new (Context) ForStmt(Context, First, SecondResult.get(), ConditionVar,
-                               Third, Body, ForLoc, LParenLoc, RParenLoc);
+  return new (Context)
+      ForStmt(Context, First, Second.get().second, Second.get().first, Third,
+              Body, ForLoc, LParenLoc, RParenLoc);
 }
 
 /// In an Objective C collection iteration statement:
@@ -2004,8 +2013,9 @@
   }
 
   return BuildCXXForRangeStmt(ForLoc, CoawaitLoc, ColonLoc, RangeDecl.get(),
-                              /*BeginEndDecl=*/nullptr, /*Cond=*/nullptr,
-                              /*Inc=*/nullptr, DS, RParenLoc, Kind);
+                              /*BeginStmt=*/nullptr, /*EndStmt=*/nullptr,
+                              /*Cond=*/nullptr, /*Inc=*/nullptr,
+                              DS, RParenLoc, Kind);
 }
 
 /// \brief Create the initialization, compare, and increment steps for
@@ -2155,8 +2165,8 @@
 /// BuildCXXForRangeStmt - Build or instantiate a C++11 for-range statement.
 StmtResult
 Sema::BuildCXXForRangeStmt(SourceLocation ForLoc, SourceLocation CoawaitLoc,
-                           SourceLocation ColonLoc,
-                           Stmt *RangeDecl, Stmt *BeginEnd, Expr *Cond,
+                           SourceLocation ColonLoc, Stmt *RangeDecl,
+                           Stmt *Begin, Stmt *End, Expr *Cond,
                            Expr *Inc, Stmt *LoopVarDecl,
                            SourceLocation RParenLoc, BuildForRangeKind Kind) {
   // FIXME: This should not be used during template instantiation. We should
@@ -2182,7 +2192,8 @@
   InvalidateOnErrorScope Invalidate(*this, LoopVar,
                                     LoopVar->getType()->isUndeducedType());
 
-  StmtResult BeginEndDecl = BeginEnd;
+  StmtResult BeginDeclStmt = Begin;
+  StmtResult EndDeclStmt = End;
   ExprResult NotEqExpr = Cond, IncrExpr = Inc;
 
   if (RangeVarType->isDependentType()) {
@@ -2193,7 +2204,7 @@
     // them in properly when we instantiate the loop.
     if (!LoopVar->isInvalidDecl() && Kind != BFRK_Check)
       LoopVar->setType(SubstAutoType(LoopVar->getType(), Context.DependentTy));
-  } else if (!BeginEndDecl.get()) {
+  } else if (!BeginDeclStmt.get()) {
     SourceLocation RangeLoc = RangeVar->getLocation();
 
     const QualType RangeVarNonRefType = RangeVarType.getNonReferenceType();
@@ -2318,20 +2329,21 @@
            "invalid range expression in for loop");
 
     // C++11 [dcl.spec.auto]p7: BeginType and EndType must be the same.
+    // C++1z removes this restriction.
     QualType BeginType = BeginVar->getType(), EndType = EndVar->getType();
     if (!Context.hasSameType(BeginType, EndType)) {
-      Diag(RangeLoc, diag::err_for_range_begin_end_types_differ)
-        << BeginType << EndType;
+      Diag(RangeLoc, getLangOpts().CPlusPlus1z
+                         ? diag::warn_for_range_begin_end_types_differ
+                         : diag::ext_for_range_begin_end_types_differ)
+          << BeginType << EndType;
       NoteForRangeBeginEndFunction(*this, BeginExpr.get(), BEF_begin);
       NoteForRangeBeginEndFunction(*this, EndExpr.get(), BEF_end);
     }
 
-    Decl *BeginEndDecls[] = { BeginVar, EndVar };
-    // Claim the type doesn't contain auto: we've already done the checking.
-    DeclGroupPtrTy BeginEndGroup =
-        BuildDeclaratorGroup(MutableArrayRef<Decl *>(BeginEndDecls, 2),
-                             /*TypeMayContainAuto=*/ false);
-    BeginEndDecl = ActOnDeclStmt(BeginEndGroup, ColonLoc, ColonLoc);
+    BeginDeclStmt =
+        ActOnDeclStmt(ConvertDeclToDeclGroup(BeginVar), ColonLoc, ColonLoc);
+    EndDeclStmt =
+        ActOnDeclStmt(ConvertDeclToDeclGroup(EndVar), ColonLoc, ColonLoc);
 
     const QualType BeginRefNonRefType = BeginType.getNonReferenceType();
     ExprResult BeginRef = BuildDeclRefExpr(BeginVar, BeginRefNonRefType,
@@ -2347,8 +2359,10 @@
     // Build and check __begin != __end expression.
     NotEqExpr = ActOnBinOp(S, ColonLoc, tok::exclaimequal,
                            BeginRef.get(), EndRef.get());
-    NotEqExpr = ActOnBooleanCondition(S, ColonLoc, NotEqExpr.get());
-    NotEqExpr = ActOnFinishFullExpr(NotEqExpr.get());
+    if (!NotEqExpr.isInvalid())
+      NotEqExpr = CheckBooleanCondition(ColonLoc, NotEqExpr.get());
+    if (!NotEqExpr.isInvalid())
+      NotEqExpr = ActOnFinishFullExpr(NotEqExpr.get());
     if (NotEqExpr.isInvalid()) {
       Diag(RangeLoc, diag::note_for_range_invalid_iterator)
         << RangeLoc << 0 << BeginRangeRef.get()->getType();
@@ -2406,7 +2420,8 @@
     return StmtResult();
 
   return new (Context) CXXForRangeStmt(
-      RangeDS, cast_or_null<DeclStmt>(BeginEndDecl.get()), NotEqExpr.get(),
+      RangeDS, cast_or_null<DeclStmt>(BeginDeclStmt.get()),
+      cast_or_null<DeclStmt>(EndDeclStmt.get()), NotEqExpr.get(),
       IncrExpr.get(), LoopVarDS, /*Body=*/nullptr, ForLoc, CoawaitLoc,
       ColonLoc, RParenLoc);
 }
@@ -2438,6 +2453,10 @@
 
   QualType VariableType = VD->getType();
 
+  if (auto Cleanups = dyn_cast<ExprWithCleanups>(InitExpr))
+    if (!Cleanups->cleanupsHaveSideEffects())
+      InitExpr = Cleanups->getSubExpr();
+
   const MaterializeTemporaryExpr *MTE =
       dyn_cast<MaterializeTemporaryExpr>(InitExpr);
 
@@ -2833,8 +2852,21 @@
   CapturingScopeInfo *CurCap = cast<CapturingScopeInfo>(getCurFunction());
   QualType FnRetType = CurCap->ReturnType;
   LambdaScopeInfo *CurLambda = dyn_cast<LambdaScopeInfo>(CurCap);
+  bool HasDeducedReturnType =
+      CurLambda && hasDeducedReturnType(CurLambda->CallOperator);
 
-  if (CurLambda && hasDeducedReturnType(CurLambda->CallOperator)) {
+  if (ExprEvalContexts.back().Context == DiscardedStatement &&
+      (HasDeducedReturnType || CurCap->HasImplicitReturnType)) {
+    if (RetValExp) {
+      ExprResult ER = ActOnFinishFullExpr(RetValExp, ReturnLoc);
+      if (ER.isInvalid())
+        return StmtError();
+      RetValExp = ER.get();
+    }
+    return new (Context) ReturnStmt(ReturnLoc, RetValExp, nullptr);
+  }
+
+  if (HasDeducedReturnType) {
     // In C++1y, the return type may involve 'auto'.
     // FIXME: Blocks might have a return type of 'auto' explicitly specified.
     FunctionDecl *FD = CurLambda->CallOperator;
@@ -3115,9 +3147,8 @@
 Sema::ActOnReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp,
                       Scope *CurScope) {
   StmtResult R = BuildReturnStmt(ReturnLoc, RetValExp);
-  if (R.isInvalid()) {
+  if (R.isInvalid() || ExprEvalContexts.back().Context == DiscardedStatement)
     return R;
-  }
 
   if (VarDecl *VD =
       const_cast<VarDecl*>(cast<ReturnStmt>(R.get())->getNRVOCandidate())) {
@@ -3166,6 +3197,19 @@
   } else // If we don't have a function/method context, bail.
     return StmtError();
 
+  // C++1z: discarded return statements are not considered when deducing a
+  // return type.
+  if (ExprEvalContexts.back().Context == DiscardedStatement &&
+      FnRetType->getContainedAutoType()) {
+    if (RetValExp) {
+      ExprResult ER = ActOnFinishFullExpr(RetValExp, ReturnLoc);
+      if (ER.isInvalid())
+        return StmtError();
+      RetValExp = ER.get();
+    }
+    return new (Context) ReturnStmt(ReturnLoc, RetValExp, nullptr);
+  }
+
   // FIXME: Add a flag to the ScopeInfo to indicate whether we're performing
   // deduction.
   if (getLangOpts().CPlusPlus14) {
@@ -3543,11 +3587,6 @@
     return LHS == RHS;
   }
 };
-
-// It's OK to treat CatchHandlerType as a POD type.
-template <> struct isPodLike<CatchHandlerType> {
-  static const bool value = true;
-};
 }
 
 namespace {
@@ -3572,7 +3611,7 @@
   bool operator()(const CXXBaseSpecifier *S, CXXBasePath &) {
     if (S->getAccessSpecifier() == AccessSpecifier::AS_public) {
       CatchHandlerType Check(S->getType(), CheckAgainstPointer);
-      auto M = TypesToCheck;
+      const auto &M = TypesToCheck;
       auto I = M.find(Check);
       if (I != M.end()) {
         FoundHandler = I->second;
@@ -3934,9 +3973,9 @@
   CapturedDecl *CD = RSI->TheCapturedDecl;
   RecordDecl *RD = RSI->TheRecordDecl;
 
-  CapturedStmt *Res = CapturedStmt::Create(getASTContext(), S,
-                                           RSI->CapRegionKind, Captures,
-                                           CaptureInits, CD, RD);
+  CapturedStmt *Res = CapturedStmt::Create(
+      getASTContext(), S, static_cast<CapturedRegionKind>(RSI->CapRegionKind),
+      Captures, CaptureInits, CD, RD);
 
   CD->setBody(Res->getCapturedStmt());
   RD->completeDefinition();
diff --git a/lib/Sema/SemaStmtAsm.cpp b/lib/Sema/SemaStmtAsm.cpp
index 11a4f8b..b36abb4 100644
--- a/lib/Sema/SemaStmtAsm.cpp
+++ b/lib/Sema/SemaStmtAsm.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/RecordLayout.h"
 #include "clang/AST/TypeLoc.h"
@@ -21,8 +20,8 @@
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 using namespace clang;
 using namespace sema;
@@ -623,16 +622,12 @@
 
   if (!LookupName(BaseResult, getCurScope()))
     return true;
-
-  LookupResult CurrBaseResult(BaseResult);
-
+  
+  if(!BaseResult.isSingleResult())
+    return true;
+  NamedDecl *FoundDecl = BaseResult.getFoundDecl();
   for (StringRef NextMember : Members) {
-
-    if (!CurrBaseResult.isSingleResult())
-      return true;
-
     const RecordType *RT = nullptr;
-    NamedDecl *FoundDecl = CurrBaseResult.getFoundDecl();
     if (VarDecl *VD = dyn_cast<VarDecl>(FoundDecl))
       RT = VD->getType()->getAs<RecordType>();
     else if (TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(FoundDecl)) {
@@ -655,13 +650,15 @@
     if (!LookupQualifiedName(FieldResult, RT->getDecl()))
       return true;
 
+    if (!FieldResult.isSingleResult())
+      return true;
+    FoundDecl = FieldResult.getFoundDecl();
+
     // FIXME: Handle IndirectFieldDecl?
-    FieldDecl *FD = dyn_cast<FieldDecl>(FieldResult.getFoundDecl());
+    FieldDecl *FD = dyn_cast<FieldDecl>(FoundDecl);
     if (!FD)
       return true;
 
-    CurrBaseResult = FieldResult;
-
     const ASTRecordLayout &RL = Context.getASTRecordLayout(RT->getDecl());
     unsigned i = FD->getFieldIndex();
     CharUnits Result = Context.toCharUnitsFromBits(RL.getFieldOffset(i));
diff --git a/lib/Sema/SemaStmtAttr.cpp b/lib/Sema/SemaStmtAttr.cpp
index 984bd07..87fd889 100644
--- a/lib/Sema/SemaStmtAttr.cpp
+++ b/lib/Sema/SemaStmtAttr.cpp
@@ -25,9 +25,11 @@
 
 static Attr *handleFallThroughAttr(Sema &S, Stmt *St, const AttributeList &A,
                                    SourceRange Range) {
+  FallThroughAttr Attr(A.getRange(), S.Context,
+                       A.getAttributeSpellingListIndex());
   if (!isa<NullStmt>(St)) {
     S.Diag(A.getRange().getBegin(), diag::err_fallthrough_attr_wrong_target)
-        << St->getLocStart();
+        << Attr.getSpelling() << St->getLocStart();
     if (isa<SwitchCase>(St)) {
       SourceLocation L = S.getLocForEndOfToken(Range.getEnd());
       S.Diag(L, diag::note_fallthrough_insert_semi_fixit)
@@ -35,12 +37,20 @@
     }
     return nullptr;
   }
-  if (S.getCurFunction()->SwitchStack.empty()) {
+  auto *FnScope = S.getCurFunction();
+  if (FnScope->SwitchStack.empty()) {
     S.Diag(A.getRange().getBegin(), diag::err_fallthrough_attr_outside_switch);
     return nullptr;
   }
-  return ::new (S.Context) FallThroughAttr(A.getRange(), S.Context,
-                                           A.getAttributeSpellingListIndex());
+
+  // If this is spelled as the standard C++1z attribute, but not in C++1z, warn
+  // about using it as an extension.
+  if (!S.getLangOpts().CPlusPlus1z && A.isCXX11Attribute() &&
+      !A.getScopeName())
+    S.Diag(A.getLoc(), diag::ext_cxx1z_attr) << A.getName();
+
+  FnScope->setHasFallthroughStmt();
+  return ::new (S.Context) auto(Attr);
 }
 
 static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const AttributeList &A,
@@ -97,6 +107,7 @@
                  .Case("interleave_count", LoopHintAttr::InterleaveCount)
                  .Case("unroll", LoopHintAttr::Unroll)
                  .Case("unroll_count", LoopHintAttr::UnrollCount)
+                 .Case("distribute", LoopHintAttr::Distribute)
                  .Default(LoopHintAttr::Vectorize);
     if (Option == LoopHintAttr::VectorizeWidth ||
         Option == LoopHintAttr::InterleaveCount ||
@@ -107,7 +118,8 @@
       State = LoopHintAttr::Numeric;
     } else if (Option == LoopHintAttr::Vectorize ||
                Option == LoopHintAttr::Interleave ||
-               Option == LoopHintAttr::Unroll) {
+               Option == LoopHintAttr::Unroll ||
+               Option == LoopHintAttr::Distribute) {
       assert(StateLoc && StateLoc->Ident && "Loop hint must have an argument");
       if (StateLoc->Ident->isStr("disable"))
         State = LoopHintAttr::Disable;
@@ -130,18 +142,21 @@
 static void
 CheckForIncompatibleAttributes(Sema &S,
                                const SmallVectorImpl<const Attr *> &Attrs) {
-  // There are 3 categories of loop hints attributes: vectorize, interleave,
-  // and unroll. Each comes in two variants: a state form and a numeric form.
-  // The state form selectively defaults/enables/disables the transformation
-  // for the loop (for unroll, default indicates full unrolling rather than
-  // enabling the transformation).  The numeric form form provides an integer
-  // hint (for example, unroll count) to the transformer. The following array
-  // accumulates the hints encountered while iterating through the attributes
-  // to check for compatibility.
+  // There are 4 categories of loop hint attributes: vectorize, interleave,
+  // unroll and distribute. Except for distribute they come in two variants: a
+  // state form and a numeric form.  The state form selectively
+  // defaults/enables/disables the transformation for the loop (for unroll,
+  // default indicates full unrolling rather than enabling the transformation).
+  // The numeric form provides an integer hint (for example, unroll count)
+  // to the transformer. The following array accumulates the hints encountered
+  // while iterating through the attributes to check for compatibility.
   struct {
     const LoopHintAttr *StateAttr;
     const LoopHintAttr *NumericAttr;
-  } HintAttrs[] = {{nullptr, nullptr}, {nullptr, nullptr}, {nullptr, nullptr}};
+  } HintAttrs[] = {{nullptr, nullptr},
+                   {nullptr, nullptr},
+                   {nullptr, nullptr},
+                   {nullptr, nullptr}};
 
   for (const auto *I : Attrs) {
     const LoopHintAttr *LH = dyn_cast<LoopHintAttr>(I);
@@ -151,7 +166,7 @@
       continue;
 
     LoopHintAttr::OptionType Option = LH->getOption();
-    enum { Vectorize, Interleave, Unroll } Category;
+    enum { Vectorize, Interleave, Unroll, Distribute } Category;
     switch (Option) {
     case LoopHintAttr::Vectorize:
     case LoopHintAttr::VectorizeWidth:
@@ -165,12 +180,17 @@
     case LoopHintAttr::UnrollCount:
       Category = Unroll;
       break;
+    case LoopHintAttr::Distribute:
+      // Perform the check for duplicated 'distribute' hints.
+      Category = Distribute;
+      break;
     };
 
     auto &CategoryState = HintAttrs[Category];
     const LoopHintAttr *PrevAttr;
     if (Option == LoopHintAttr::Vectorize ||
-        Option == LoopHintAttr::Interleave || Option == LoopHintAttr::Unroll) {
+        Option == LoopHintAttr::Interleave || Option == LoopHintAttr::Unroll ||
+        Option == LoopHintAttr::Distribute) {
       // Enable|Disable|AssumeSafety hint.  For example, vectorize(enable).
       PrevAttr = CategoryState.StateAttr;
       CategoryState.StateAttr = LH;
@@ -203,6 +223,52 @@
   }
 }
 
+static Attr *handleOpenCLUnrollHint(Sema &S, Stmt *St, const AttributeList &A,
+                                    SourceRange Range) {
+  // OpenCL v2.0 s6.11.5 - opencl_unroll_hint can have 0 arguments (compiler
+  // determines unrolling factor) or 1 argument (the unroll factor provided
+  // by the user).
+
+  if (S.getLangOpts().OpenCLVersion < 200) {
+    S.Diag(A.getLoc(), diag::err_attribute_requires_opencl_version)
+        << A.getName() << "2.0" << 1;
+    return nullptr;
+  }
+
+  unsigned NumArgs = A.getNumArgs();
+
+  if (NumArgs > 1) {
+    S.Diag(A.getLoc(), diag::err_attribute_too_many_arguments) << A.getName()
+                                                               << 1;
+    return nullptr;
+  }
+
+  unsigned UnrollFactor = 0;
+
+  if (NumArgs == 1) {
+    Expr *E = A.getArgAsExpr(0);
+    llvm::APSInt ArgVal(32);
+
+    if (!E->isIntegerConstantExpr(ArgVal, S.Context)) {
+      S.Diag(A.getLoc(), diag::err_attribute_argument_type)
+          << A.getName() << AANT_ArgumentIntegerConstant << E->getSourceRange();
+      return nullptr;
+    }
+
+    int Val = ArgVal.getSExtValue();
+
+    if (Val <= 0) {
+      S.Diag(A.getRange().getBegin(),
+             diag::err_attribute_requires_positive_integer)
+          << A.getName();
+      return nullptr;
+    }
+    UnrollFactor = Val;
+  }
+
+  return OpenCLUnrollHintAttr::CreateImplicit(S.Context, UnrollFactor);
+}
+
 static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const AttributeList &A,
                                   SourceRange Range) {
   switch (A.getKind()) {
@@ -215,10 +281,12 @@
     return handleFallThroughAttr(S, St, A, Range);
   case AttributeList::AT_LoopHint:
     return handleLoopHintAttr(S, St, A, Range);
+  case AttributeList::AT_OpenCLUnrollHint:
+    return handleOpenCLUnrollHint(S, St, A, Range);
   default:
     // if we're here, then we parsed a known attribute, but didn't recognize
     // it as a statement attribute => it is declaration attribute
-    S.Diag(A.getRange().getBegin(), diag::err_attribute_invalid_on_stmt)
+    S.Diag(A.getRange().getBegin(), diag::err_decl_attribute_invalid_on_stmt)
         << A.getName() << St->getLocStart();
     return nullptr;
   }
diff --git a/lib/Sema/SemaTemplate.cpp b/lib/Sema/SemaTemplate.cpp
index 0c18faf..b62200c 100644
--- a/lib/Sema/SemaTemplate.cpp
+++ b/lib/Sema/SemaTemplate.cpp
@@ -1,13 +1,13 @@
-//===------- SemaTemplate.cpp - Semantic Analysis for C++ Templates -------===/
+//===------- SemaTemplate.cpp - Semantic Analysis for C++ Templates -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
-//===----------------------------------------------------------------------===/
+//===----------------------------------------------------------------------===//
 //
 //  This file implements semantic analysis for C++ templates.
-//===----------------------------------------------------------------------===/
+//===----------------------------------------------------------------------===//
 
 #include "TreeTransform.h"
 #include "clang/AST/ASTConsumer.h"
@@ -32,6 +32,8 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+
+#include <iterator>
 using namespace clang;
 using namespace sema;
 
@@ -471,7 +473,6 @@
   Diag(Loc, diag::err_template_param_shadow)
     << cast<NamedDecl>(PrevDecl)->getDeclName();
   Diag(PrevDecl->getLocation(), diag::note_template_param_here);
-  return;
 }
 
 /// AdjustDeclIfTemplate - If the given decl happens to be a template, reset
@@ -568,7 +569,6 @@
                                ParsedType DefaultArg) {
   assert(S->isTemplateParamScope() &&
          "Template type parameter not in template parameter scope!");
-  bool Invalid = false;
 
   SourceLocation Loc = ParamNameLoc;
   if (!ParamName)
@@ -580,8 +580,6 @@
                                    KeyLoc, Loc, Depth, Position, ParamName,
                                    Typename, IsParameterPack);
   Param->setAccess(AS_public);
-  if (Invalid)
-    Param->setInvalidDecl();
 
   if (ParamName) {
     maybeDiagnoseTemplateParameterShadow(*this, S, ParamNameLoc, ParamName);
@@ -803,7 +801,7 @@
     // However, it isn't worth doing.
     TemplateArgumentLoc DefaultArg = translateTemplateArgument(*this, Default);
     if (DefaultArg.getArgument().getAsTemplate().isNull()) {
-      Diag(DefaultArg.getLocation(), diag::err_template_arg_not_class_template)
+      Diag(DefaultArg.getLocation(), diag::err_template_arg_not_valid_template)
         << DefaultArg.getSourceRange();
       return Param;
     }
@@ -820,22 +818,24 @@
   return Param;
 }
 
-/// ActOnTemplateParameterList - Builds a TemplateParameterList that
-/// contains the template parameters in Params/NumParams.
+/// ActOnTemplateParameterList - Builds a TemplateParameterList, optionally
+/// constrained by RequiresClause, that contains the template parameters in
+/// Params.
 TemplateParameterList *
 Sema::ActOnTemplateParameterList(unsigned Depth,
                                  SourceLocation ExportLoc,
                                  SourceLocation TemplateLoc,
                                  SourceLocation LAngleLoc,
                                  ArrayRef<Decl *> Params,
-                                 SourceLocation RAngleLoc) {
+                                 SourceLocation RAngleLoc,
+                                 Expr *RequiresClause) {
   if (ExportLoc.isValid())
     Diag(ExportLoc, diag::warn_template_export_unsupported);
 
   return TemplateParameterList::Create(
       Context, TemplateLoc, LAngleLoc,
       llvm::makeArrayRef((NamedDecl *const *)Params.data(), Params.size()),
-      RAngleLoc);
+      RAngleLoc, RequiresClause);
 }
 
 static void SetNestedNameSpecifier(TagDecl *T, const CXXScopeSpec &SS) {
@@ -929,6 +929,13 @@
   if (Previous.begin() != Previous.end())
     PrevDecl = (*Previous.begin())->getUnderlyingDecl();
 
+  if (PrevDecl && PrevDecl->isTemplateParameter()) {
+    // Maybe we will complain about the shadowed template parameter.
+    DiagnoseTemplateParameterShadow(NameLoc, PrevDecl);
+    // Just pretend that we didn't see the previous declaration.
+    PrevDecl = nullptr;
+  }
+
   // If there is a previous declaration with the same name, check
   // whether this is a valid redeclaration.
   ClassTemplateDecl *PrevClassTemplate
@@ -1054,12 +1061,7 @@
         // definition, as part of error recovery?
         return true;
       }
-    }    
-  } else if (PrevDecl && PrevDecl->isTemplateParameter()) {
-    // Maybe we will complain about the shadowed template parameter.
-    DiagnoseTemplateParameterShadow(NameLoc, PrevDecl);
-    // Just pretend that we didn't see the previous declaration.
-    PrevDecl = nullptr;
+    }
   } else if (PrevDecl) {
     // C++ [temp]p5:
     //   A class template shall not have the same name as any other
@@ -1591,7 +1593,7 @@
     return TraverseType(T->getInjectedSpecializationType());
   }
 };
-}
+} // end anonymous namespace
 
 /// Determines whether a given type depends on the given parameter
 /// list.
@@ -1954,7 +1956,7 @@
       // Fabricate an empty template parameter list for the invented header.
       return TemplateParameterList::Create(Context, SourceLocation(),
                                            SourceLocation(), None,
-                                           SourceLocation());
+                                           SourceLocation(), nullptr);
     }
 
     return nullptr;
@@ -2041,7 +2043,7 @@
                            TemplateArgumentListInfo &TemplateArgs) {
   ASTContext &Context = SemaRef.getASTContext();
   switch (BTD->getBuiltinTemplateKind()) {
-  case BTK__make_integer_seq:
+  case BTK__make_integer_seq: {
     // Specializations of __make_integer_seq<S, T, N> are treated like
     // S<T, 0, ..., N-1>.
 
@@ -2072,17 +2074,37 @@
     for (llvm::APSInt I(NumArgs.getBitWidth(), NumArgs.isUnsigned());
          I < NumArgs; ++I) {
       TemplateArgument TA(Context, I, ArgTy);
-      Expr *E = SemaRef.BuildExpressionFromIntegralTemplateArgument(
-                           TA, TemplateArgs[2].getLocation())
-                    .getAs<Expr>();
-      SyntheticTemplateArgs.addArgument(
-          TemplateArgumentLoc(TemplateArgument(E), E));
+      SyntheticTemplateArgs.addArgument(SemaRef.getTrivialTemplateArgumentLoc(
+          TA, ArgTy, TemplateArgs[2].getLocation()));
     }
     // The first template argument will be reused as the template decl that
     // our synthetic template arguments will be applied to.
     return SemaRef.CheckTemplateIdType(Converted[0].getAsTemplate(),
                                        TemplateLoc, SyntheticTemplateArgs);
   }
+
+  case BTK__type_pack_element:
+    // Specializations of
+    //    __type_pack_element<Index, T_1, ..., T_N>
+    // are treated like T_Index.
+    assert(Converted.size() == 2 &&
+      "__type_pack_element should be given an index and a parameter pack");
+
+    // If the Index is out of bounds, the program is ill-formed.
+    TemplateArgument IndexArg = Converted[0], Ts = Converted[1];
+    llvm::APSInt Index = IndexArg.getAsIntegral();
+    assert(Index >= 0 && "the index used with __type_pack_element should be of "
+                         "type std::size_t, and hence be non-negative");
+    if (Index >= Ts.pack_size()) {
+      SemaRef.Diag(TemplateArgs[0].getLocation(),
+                   diag::err_type_pack_element_out_of_bounds);
+      return QualType();
+    }
+
+    // We simply return the type at index `Index`.
+    auto Nth = std::next(Ts.pack_begin(), Index.getExtValue());
+    return Nth->getAsType();
+  }
   llvm_unreachable("unexpected BuiltinTemplateDecl!");
 }
 
@@ -2133,7 +2155,7 @@
       return QualType();
 
     TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                      Converted.data(), Converted.size());
+                                      Converted);
 
     // Only substitute for the innermost template argument list.
     MultiLevelTemplateArgumentList TemplateArgLists;
@@ -2164,8 +2186,7 @@
     //   template<typename T, typename U = T> struct A;
     TemplateName CanonName = Context.getCanonicalTemplateName(Name);
     CanonType = Context.getTemplateSpecializationType(CanonName,
-                                                      Converted.data(),
-                                                      Converted.size());
+                                                      Converted);
 
     // FIXME: CanonType is not actually the canonical type, and unfortunately
     // it is a TemplateSpecializationType that we will never use again.
@@ -2227,8 +2248,7 @@
                             ClassTemplate->getTemplatedDecl()->getLocStart(),
                                                 ClassTemplate->getLocation(),
                                                      ClassTemplate,
-                                                     Converted.data(),
-                                                     Converted.size(), nullptr);
+                                                     Converted, nullptr);
       ClassTemplate->AddSpecialization(Decl, InsertPos);
       if (ClassTemplate->isOutOfLine())
         Decl->setLexicalDeclContext(ClassTemplate->getLexicalDeclContext());
@@ -2552,7 +2572,7 @@
     bool InstantiationDependent;
     if (!Name.isDependent() &&
         !TemplateSpecializationType::anyDependentTemplateArguments(
-            TemplateArgs.getArgumentArray(), TemplateArgs.size(),
+            TemplateArgs.arguments(),
             InstantiationDependent)) {
       Diag(TemplateNameLoc, diag::err_partial_spec_fully_specialized)
           << VarTemplate->getDeclName();
@@ -2609,7 +2629,7 @@
         VarTemplatePartialSpecializationDecl::Create(
             Context, VarTemplate->getDeclContext(), TemplateKWLoc,
             TemplateNameLoc, TemplateParams, VarTemplate, DI->getType(), DI, SC,
-            Converted.data(), Converted.size(), TemplateArgs);
+            Converted, TemplateArgs);
 
     if (!PrevPartial)
       VarTemplate->AddPartialSpecialization(Partial, InsertPos);
@@ -2651,7 +2671,7 @@
     // this explicit specialization or friend declaration.
     Specialization = VarTemplateSpecializationDecl::Create(
         Context, VarTemplate->getDeclContext(), TemplateKWLoc, TemplateNameLoc,
-        VarTemplate, DI->getType(), DI, SC, Converted.data(), Converted.size());
+        VarTemplate, DI->getType(), DI, SC, Converted);
     Specialization->setTemplateArgsInfo(TemplateArgs);
 
     if (!PrevDecl)
@@ -2727,7 +2747,7 @@
   VarTemplatePartialSpecializationDecl *Partial;
   TemplateArgumentList *Args;
 };
-}
+} // end anonymous namespace
 
 DeclResult
 Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc,
@@ -2747,9 +2767,11 @@
   // corresponds to these arguments.
   void *InsertPos = nullptr;
   if (VarTemplateSpecializationDecl *Spec = Template->findSpecialization(
-          Converted, InsertPos))
+          Converted, InsertPos)) {
+    checkSpecializationVisibility(TemplateNameLoc, Spec);
     // If we already have a variable template specialization, return it.
     return Spec;
+  }
 
   // This is the first time we have referenced this variable template
   // specialization. Create the canonical declaration and add it to
@@ -2757,7 +2779,7 @@
   // that it represents. That is,
   VarDecl *InstantiationPattern = Template->getTemplatedDecl();
   TemplateArgumentList TemplateArgList(TemplateArgumentList::OnStack,
-                                       Converted.data(), Converted.size());
+                                       Converted);
   TemplateArgumentList *InstantiationArgs = &TemplateArgList;
   bool AmbiguousPartialSpec = false;
   typedef PartialSpecMatchResult MatchResult;
@@ -2790,8 +2812,9 @@
               DeduceTemplateArguments(Partial, TemplateArgList, Info)) {
         // Store the failed-deduction information for use in diagnostics, later.
         // TODO: Actually use the failed-deduction info?
-        FailedCandidates.addCandidate()
-            .set(Partial, MakeDeductionFailureInfo(Context, Result, Info));
+        FailedCandidates.addCandidate().set(
+            DeclAccessPair::make(Template, AS_public), Partial,
+            MakeDeductionFailureInfo(Context, Result, Info));
         (void)Result;
       } else {
         Matched.push_back(PartialSpecMatchResult());
@@ -2848,8 +2871,8 @@
   }
 
   // 2. Create the canonical declaration.
-  // Note that we do not instantiate the variable just yet, since
-  // instantiation is handled in DoMarkVarDeclReferenced().
+  // Note that we do not instantiate a definition until we see an odr-use
+  // in DoMarkVarDeclReferenced().
   // FIXME: LateAttrs et al.?
   VarTemplateSpecializationDecl *Decl = BuildVarTemplateInstantiation(
       Template, InstantiationPattern, *InstantiationArgs, TemplateArgs,
@@ -2877,6 +2900,8 @@
           dyn_cast<VarTemplatePartialSpecializationDecl>(InstantiationPattern))
     Decl->setInstantiationOf(D, InstantiationArgs);
 
+  checkSpecializationVisibility(TemplateNameLoc, Decl);
+
   assert(Decl && "No variable template specialization?");
   return Decl;
 }
@@ -3233,8 +3258,7 @@
     if (Inst.isInvalid())
       return nullptr;
 
-    TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                      Converted.data(), Converted.size());
+    TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
 
     // Only substitute for the innermost template argument list.
     MultiLevelTemplateArgumentList TemplateArgLists;
@@ -3286,8 +3310,7 @@
   if (Inst.isInvalid())
     return ExprError();
 
-  TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                    Converted.data(), Converted.size());
+  TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
 
   // Only substitute for the innermost template argument list.
   MultiLevelTemplateArgumentList TemplateArgLists;
@@ -3338,8 +3361,7 @@
   if (Inst.isInvalid())
     return TemplateName();
 
-  TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                    Converted.data(), Converted.size());
+  TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
 
   // Only substitute for the innermost template argument list.
   MultiLevelTemplateArgumentList TemplateArgLists;
@@ -3490,7 +3512,7 @@
         return true;
 
       TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                        Converted.data(), Converted.size());
+                                        Converted);
       NTTPType = SubstType(NTTPType,
                            MultiLevelTemplateArgumentList(TemplateArgs),
                            NTTP->getLocation(),
@@ -3630,8 +3652,7 @@
     if (Inst.isInvalid())
       return true;
 
-    TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                      Converted.data(), Converted.size());
+    TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
     TempParm = cast_or_null<TemplateTemplateParmDecl>(
                       SubstDecl(TempParm, CurContext,
                                 MultiLevelTemplateArgumentList(TemplateArgs)));
@@ -3742,7 +3763,7 @@
     S.diagnoseMissingImport(Loc, cast<NamedDecl>(TD),
                             D->getDefaultArgumentLoc(), Modules,
                             Sema::MissingImportKind::DefaultArgument,
-                            /*Recover*/ true);
+                            /*Recover*/true);
     return true;
   }
 
@@ -4028,7 +4049,7 @@
     bool VisitTagDecl(const TagDecl *Tag);
     bool VisitNestedNameSpecifier(NestedNameSpecifier *NNS);
   };
-}
+} // end anonymous namespace
 
 bool UnnamedLocalNoLinkageFinder::VisitBuiltinType(const BuiltinType*) {
   return false;
@@ -4243,7 +4264,6 @@
   llvm_unreachable("Invalid NestedNameSpecifier::Kind!");
 }
 
-
 /// \brief Check a template argument against its corresponding
 /// template type parameter.
 ///
@@ -5354,10 +5374,11 @@
   // partial specializations.
   if (!isa<ClassTemplateDecl>(Template) &&
       !isa<TemplateTemplateParmDecl>(Template) &&
-      !isa<TypeAliasTemplateDecl>(Template)) {
+      !isa<TypeAliasTemplateDecl>(Template) &&
+      !isa<BuiltinTemplateDecl>(Template)) {
     assert(isa<FunctionTemplateDecl>(Template) &&
            "Only function templates are possible here");
-    Diag(Arg.getLocation(), diag::err_template_arg_not_class_template);
+    Diag(Arg.getLocation(), diag::err_template_arg_not_valid_template);
     Diag(Template->getLocation(), diag::note_template_arg_refers_here_func)
       << Template;
   }
@@ -6295,9 +6316,7 @@
     bool InstantiationDependent;
     if (!Name.isDependent() &&
         !TemplateSpecializationType::anyDependentTemplateArguments(
-                                             TemplateArgs.getArgumentArray(),
-                                                         TemplateArgs.size(),
-                                                     InstantiationDependent)) {
+            TemplateArgs.arguments(), InstantiationDependent)) {
       Diag(TemplateNameLoc, diag::err_partial_spec_fully_specialized)
         << ClassTemplate->getDeclName();
       isPartialSpecialization = false;
@@ -6330,8 +6349,7 @@
     // arguments of the class template partial specialization.
     TemplateName CanonTemplate = Context.getCanonicalTemplateName(Name);
     CanonType = Context.getTemplateSpecializationType(CanonTemplate,
-                                                      Converted.data(),
-                                                      Converted.size());
+                                                      Converted);
 
     if (Context.hasSameType(CanonType,
                         ClassTemplate->getInjectedClassNameSpecialization())) {
@@ -6362,8 +6380,7 @@
                                                        KWLoc, TemplateNameLoc,
                                                        TemplateParams,
                                                        ClassTemplate,
-                                                       Converted.data(),
-                                                       Converted.size(),
+                                                       Converted,
                                                        TemplateArgs,
                                                        CanonType,
                                                        PrevPartial);
@@ -6418,8 +6435,7 @@
                                              ClassTemplate->getDeclContext(),
                                                 KWLoc, TemplateNameLoc,
                                                 ClassTemplate,
-                                                Converted.data(),
-                                                Converted.size(),
+                                                Converted,
                                                 PrevDecl);
     SetNestedNameSpecifier(Specialization, SS);
     if (TemplateParameterLists.size() > 0) {
@@ -6437,7 +6453,7 @@
              "Only possible with -fms-extensions!");
       TemplateName CanonTemplate = Context.getCanonicalTemplateName(Name);
       CanonType = Context.getTemplateSpecializationType(
-          CanonTemplate, Converted.data(), Converted.size());
+          CanonTemplate, Converted);
     } else {
       CanonType = Context.getTypeDeclType(Specialization);
     }
@@ -6894,12 +6910,13 @@
       FunctionDecl *Specialization = nullptr;
       if (TemplateDeductionResult TDK = DeduceTemplateArguments(
               cast<FunctionTemplateDecl>(FunTmpl->getFirstDecl()),
-              ExplicitTemplateArgs ? &Args : nullptr, FT, Specialization, Info)) {
+              ExplicitTemplateArgs ? &Args : nullptr, FT, Specialization,
+              Info)) {
         // Template argument deduction failed; record why it failed, so
         // that we can provide nifty diagnostics.
-        FailedCandidates.addCandidate()
-            .set(FunTmpl->getTemplatedDecl(),
-                 MakeDeductionFailureInfo(Context, TDK, Info));
+        FailedCandidates.addCandidate().set(
+            I.getPair(), FunTmpl->getTemplatedDecl(),
+            MakeDeductionFailureInfo(Context, TDK, Info));
         (void)TDK;
         continue;
       }
@@ -6926,6 +6943,15 @@
   // Ignore access information;  it doesn't figure into redeclaration checking.
   FunctionDecl *Specialization = cast<FunctionDecl>(*Result);
 
+  // C++ Concepts TS [dcl.spec.concept]p7: A program shall not declare [...]
+  // an explicit specialization (14.8.3) [...] of a concept definition.
+  if (Specialization->getPrimaryTemplate()->isConcept()) {
+    Diag(FD->getLocation(), diag::err_concept_specialized)
+        << 0 /*function*/ << 1 /*explicitly specialized*/;
+    Diag(Specialization->getLocation(), diag::note_previous_declaration);
+    return true;
+  }
+
   FunctionTemplateSpecializationInfo *SpecInfo
     = Specialization->getTemplateSpecializationInfo();
   assert(SpecInfo && "Function template specialization info missing?");
@@ -6975,6 +7001,21 @@
   // Mark the prior declaration as an explicit specialization, so that later
   // clients know that this is an explicit specialization.
   if (!isFriend) {
+    // Since explicit specializations do not inherit '=delete' from their
+    // primary function template - check if the 'specialization' that was
+    // implicitly generated (during template argument deduction for partial
+    // ordering) from the most specialized of all the function templates that
+    // 'FD' could have been specializing, has a 'deleted' definition.  If so,
+    // first check that it was implicitly generated during template argument
+    // deduction by making sure it wasn't referenced, and then reset the deleted
+    // flag to not-deleted, so that we can inherit that information from 'FD'.
+    if (Specialization->isDeleted() && !SpecInfo->isExplicitSpecialization() &&
+        !Specialization->getCanonicalDecl()->isReferenced()) {
+      assert(
+          Specialization->getCanonicalDecl() == Specialization &&
+          "This must be the only existing declaration of this specialization");
+      Specialization->setDeletedAsWritten(false);
+    }
     SpecInfo->setTemplateSpecializationKind(TSK_ExplicitSpecialization);
     MarkUnusedFileScopedDecl(Specialization);
   }
@@ -7016,6 +7057,7 @@
   assert(!isa<TemplateDecl>(Member) && "Only for non-template members");
 
   // Try to find the member we are instantiating.
+  NamedDecl *FoundInstantiation = nullptr;
   NamedDecl *Instantiation = nullptr;
   NamedDecl *InstantiatedFrom = nullptr;
   MemberSpecializationInfo *MSInfo = nullptr;
@@ -7031,6 +7073,7 @@
         if (!hasExplicitCallingConv(Adjusted))
           Adjusted = adjustCCAndNoReturn(Adjusted, Method->getType());
         if (Context.hasSameType(Adjusted, Method->getType())) {
+          FoundInstantiation = *I;
           Instantiation = Method;
           InstantiatedFrom = Method->getInstantiatedFromMemberFunction();
           MSInfo = Method->getMemberSpecializationInfo();
@@ -7043,6 +7086,7 @@
     if (Previous.isSingleResult() &&
         (PrevVar = dyn_cast<VarDecl>(Previous.getFoundDecl())))
       if (PrevVar->isStaticDataMember()) {
+        FoundInstantiation = Previous.getRepresentativeDecl();
         Instantiation = PrevVar;
         InstantiatedFrom = PrevVar->getInstantiatedFromStaticDataMember();
         MSInfo = PrevVar->getMemberSpecializationInfo();
@@ -7051,6 +7095,7 @@
     CXXRecordDecl *PrevRecord;
     if (Previous.isSingleResult() &&
         (PrevRecord = dyn_cast<CXXRecordDecl>(Previous.getFoundDecl()))) {
+      FoundInstantiation = Previous.getRepresentativeDecl();
       Instantiation = PrevRecord;
       InstantiatedFrom = PrevRecord->getInstantiatedFromMemberClass();
       MSInfo = PrevRecord->getMemberSpecializationInfo();
@@ -7059,6 +7104,7 @@
     EnumDecl *PrevEnum;
     if (Previous.isSingleResult() &&
         (PrevEnum = dyn_cast<EnumDecl>(Previous.getFoundDecl()))) {
+      FoundInstantiation = Previous.getRepresentativeDecl();
       Instantiation = PrevEnum;
       InstantiatedFrom = PrevEnum->getInstantiatedFromMemberEnum();
       MSInfo = PrevEnum->getMemberSpecializationInfo();
@@ -7087,7 +7133,7 @@
     }
 
     Previous.clear();
-    Previous.addDecl(Instantiation);
+    Previous.addDecl(FoundInstantiation);
     return false;
   }
 
@@ -7134,6 +7180,13 @@
       InstantiationFunction->setTemplateSpecializationKind(
                                                   TSK_ExplicitSpecialization);
       InstantiationFunction->setLocation(Member->getLocation());
+      // Explicit specializations of member functions of class templates do not
+      // inherit '=delete' from the member function they are specializing.
+      if (InstantiationFunction->isDeleted()) {
+        assert(InstantiationFunction->getCanonicalDecl() ==
+               InstantiationFunction);
+        InstantiationFunction->setDeletedAsWritten(false);
+      }
     }
 
     cast<FunctionDecl>(Member)->setInstantiationOfMemberFunction(
@@ -7181,7 +7234,7 @@
   // Save the caller the trouble of having to figure out which declaration
   // this specialization matches.
   Previous.clear();
-  Previous.addDecl(Instantiation);
+  Previous.addDecl(FoundInstantiation);
   return false;
 }
 
@@ -7336,6 +7389,29 @@
     }
   }
 
+  // In MSVC mode, dllimported explicit instantiation definitions are treated as
+  // instantiation declarations for most purposes.
+  bool DLLImportExplicitInstantiationDef = false;
+  if (TSK == TSK_ExplicitInstantiationDefinition &&
+      Context.getTargetInfo().getCXXABI().isMicrosoft()) {
+    // Check for dllimport class template instantiation definitions.
+    bool DLLImport =
+        ClassTemplate->getTemplatedDecl()->getAttr<DLLImportAttr>();
+    for (AttributeList *A = Attr; A; A = A->getNext()) {
+      if (A->getKind() == AttributeList::AT_DLLImport)
+        DLLImport = true;
+      if (A->getKind() == AttributeList::AT_DLLExport) {
+        // dllexport trumps dllimport here.
+        DLLImport = false;
+        break;
+      }
+    }
+    if (DLLImport) {
+      TSK = TSK_ExplicitInstantiationDeclaration;
+      DLLImportExplicitInstantiationDef = true;
+    }
+  }
+
   // Translate the parser's template argument list in our AST format.
   TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc);
   translateTemplateArguments(TemplateArgsIn, TemplateArgs);
@@ -7389,6 +7465,12 @@
       Specialization->setLocation(TemplateNameLoc);
       PrevDecl = nullptr;
     }
+
+    if (PrevDecl_TSK == TSK_ExplicitInstantiationDeclaration &&
+        DLLImportExplicitInstantiationDef) {
+      // The new specialization might add a dllimport attribute.
+      HasNoEffect = false;
+    }
   }
 
   if (!Specialization) {
@@ -7399,8 +7481,7 @@
                                              ClassTemplate->getDeclContext(),
                                                 KWLoc, TemplateNameLoc,
                                                 ClassTemplate,
-                                                Converted.data(),
-                                                Converted.size(),
+                                                Converted,
                                                 PrevDecl);
     SetNestedNameSpecifier(Specialization, SS);
 
@@ -7467,11 +7548,11 @@
                                        Specialization->getDefinition());
   if (Def) {
     TemplateSpecializationKind Old_TSK = Def->getTemplateSpecializationKind();
-
     // Fix a TSK_ExplicitInstantiationDeclaration followed by a
     // TSK_ExplicitInstantiationDefinition
     if (Old_TSK == TSK_ExplicitInstantiationDeclaration &&
-        TSK == TSK_ExplicitInstantiationDefinition) {
+        (TSK == TSK_ExplicitInstantiationDefinition ||
+         DLLImportExplicitInstantiationDef)) {
       // FIXME: Need to notify the ASTMutationListener that we did this.
       Def->setTemplateSpecializationKind(TSK);
 
@@ -7484,7 +7565,13 @@
             getDLLAttr(Specialization)->clone(getASTContext()));
         A->setInherited(true);
         Def->addAttr(A);
+
+        // We reject explicit instantiations in class scope, so there should
+        // never be any delayed exported classes to worry about.
+        assert(DelayedDllExportClasses.empty() &&
+               "delayed exports present at explicit instantiation");
         checkClassLevelDLLAttribute(Def);
+        referenceDLLExportedClassMethods();
 
         // Propagate attribute to base class templates.
         for (auto &B : Def->bases()) {
@@ -7695,6 +7782,15 @@
     Diag(D.getDeclSpec().getConstexprSpecLoc(),
          diag::err_explicit_instantiation_constexpr);
 
+  // C++ Concepts TS [dcl.spec.concept]p1: The concept specifier shall be
+  // applied only to the definition of a function template or variable template,
+  // declared in namespace scope.
+  if (D.getDeclSpec().isConceptSpecified()) {
+    Diag(D.getDeclSpec().getConceptSpecLoc(),
+         diag::err_concept_specified_specialization) << 0;
+    return true;
+  }
+
   // C++0x [temp.explicit]p2:
   //   There are two forms of explicit instantiation: an explicit instantiation
   //   definition and an explicit instantiation declaration. An explicit
@@ -7766,6 +7862,15 @@
         return true;
       }
 
+      // C++ Concepts TS [dcl.spec.concept]p7: A program shall not declare an
+      // explicit instantiation (14.8.2) [...] of a concept definition.
+      if (PrevTemplate->isConcept()) {
+        Diag(D.getIdentifierLoc(), diag::err_concept_specialized)
+            << 1 /*variable*/ << 0 /*explicitly instantiated*/;
+        Diag(PrevTemplate->getLocation(), diag::note_previous_declaration);
+        return true;
+      }
+
       // Translate the parser's template argument list into our AST format.
       TemplateArgumentListInfo TemplateArgs =
           makeTemplateArgumentListInfo(*this, *D.getName().TemplateId);
@@ -7879,7 +7984,7 @@
                                     R, Specialization, Info)) {
       // Keep track of almost-matches.
       FailedCandidates.addCandidate()
-          .set(FunTmpl->getTemplatedDecl(),
+          .set(P.getPair(), FunTmpl->getTemplatedDecl(),
                MakeDeductionFailureInfo(Context, TDK, Info));
       (void)TDK;
       continue;
@@ -7982,6 +8087,16 @@
          diag::ext_explicit_instantiation_without_qualified_id)
     << Specialization << D.getCXXScopeSpec().getRange();
 
+  // C++ Concepts TS [dcl.spec.concept]p7: A program shall not declare an
+  // explicit instantiation (14.8.2) [...] of a concept definition.
+  if (FunTmpl && FunTmpl->isConcept() &&
+      !D.getDeclSpec().isConceptSpecified()) {
+    Diag(D.getIdentifierLoc(), diag::err_concept_specialized)
+        << 0 /*function*/ << 0 /*explicitly instantiated*/;
+    Diag(FunTmpl->getLocation(), diag::note_previous_declaration);
+    return true;
+  }
+
   CheckExplicitInstantiationScope(*this,
                    FunTmpl? (NamedDecl *)FunTmpl
                           : Specialization->getInstantiatedFromMemberFunction(),
@@ -8318,7 +8433,7 @@
       return E;
     }
   };
-}
+} // end anonymous namespace
 
 /// \brief Rebuilds a type within the context of the current instantiation.
 ///
@@ -8493,3 +8608,151 @@
   }
   return false;
 }
+
+namespace {
+/// \brief Walk the path from which a declaration was instantiated, and check
+/// that every explicit specialization along that path is visible. This enforces
+/// C++ [temp.expl.spec]/6:
+///
+///   If a template, a member template or a member of a class template is
+///   explicitly specialized then that specialization shall be declared before
+///   the first use of that specialization that would cause an implicit
+///   instantiation to take place, in every translation unit in which such a
+///   use occurs; no diagnostic is required.
+///
+/// and also C++ [temp.class.spec]/1:
+///
+///   A partial specialization shall be declared before the first use of a
+///   class template specialization that would make use of the partial
+///   specialization as the result of an implicit or explicit instantiation
+///   in every translation unit in which such a use occurs; no diagnostic is
+///   required.
+class ExplicitSpecializationVisibilityChecker {
+  Sema &S;            // Used for visibility queries and for diagnostics.
+  SourceLocation Loc; // Location handed to diagnoseMissingImport.
+  llvm::SmallVector<Module *, 8> Modules; // See diagnose() for how it is used.
+
+public:
+  ExplicitSpecializationVisibilityChecker(Sema &S, SourceLocation Loc)
+      : S(S), Loc(Loc) {}
+
+  void check(NamedDecl *ND) { // Dispatch on decl kind; others are ignored.
+    if (auto *FD = dyn_cast<FunctionDecl>(ND))
+      return checkImpl(FD);
+    if (auto *RD = dyn_cast<CXXRecordDecl>(ND))
+      return checkImpl(RD);
+    if (auto *VD = dyn_cast<VarDecl>(ND))
+      return checkImpl(VD);
+    if (auto *ED = dyn_cast<EnumDecl>(ND))
+      return checkImpl(ED);
+  }
+
+private:
+  void diagnose(NamedDecl *D, bool IsPartialSpec) {
+    auto Kind = IsPartialSpec ? Sema::MissingImportKind::PartialSpecialization
+                              : Sema::MissingImportKind::ExplicitSpecialization;
+    const bool Recover = true;
+
+    // If we got a custom set of modules (because only a subset of the
+    // declarations are interesting), use them, otherwise let
+    // diagnoseMissingImport intelligently pick some.
+    if (Modules.empty())
+      S.diagnoseMissingImport(Loc, D, Kind, Recover);
+    else
+      S.diagnoseMissingImport(Loc, D, D->getLocation(), Modules, Kind, Recover);
+  }
+
+  // Check a specific declaration. There are three problematic cases:
+  //
+  //  1) The declaration is an explicit specialization of a template
+  //     specialization.
+  //  2) The declaration is an explicit specialization of a member of a
+  //     templated class.
+  //  3) The declaration is an instantiation of a template, and that template
+  //     is an explicit specialization of a member of a templated class.
+  //
+  // We don't need to go any deeper than that, as the instantiation of the
+  // surrounding class / etc is not triggered by whatever triggered this
+  // instantiation, and thus should be checked elsewhere.
+  template<typename SpecDecl>
+  void checkImpl(SpecDecl *Spec) {
+    bool IsHiddenExplicitSpecialization = false;
+    if (Spec->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) {
+      IsHiddenExplicitSpecialization =
+          Spec->getMemberSpecializationInfo()
+              ? !S.hasVisibleMemberSpecialization(Spec, &Modules)
+              : !S.hasVisibleDeclaration(Spec);
+    } else {
+      checkInstantiated(Spec);
+    }
+
+    if (IsHiddenExplicitSpecialization)
+      diagnose(Spec->getMostRecentDecl(), /*IsPartialSpec*/false);
+  }
+
+  void checkInstantiated(FunctionDecl *FD) {
+    if (auto *TD = FD->getPrimaryTemplate())
+      checkTemplate(TD);
+  }
+
+  void checkInstantiated(CXXRecordDecl *RD) {
+    auto *SD = dyn_cast<ClassTemplateSpecializationDecl>(RD);
+    if (!SD)
+      return;
+
+    auto From = SD->getSpecializedTemplateOrPartial();
+    if (auto *TD = From.dyn_cast<ClassTemplateDecl *>())
+      checkTemplate(TD);
+    else if (auto *TD =
+                 From.dyn_cast<ClassTemplatePartialSpecializationDecl *>()) {
+      if (!S.hasVisibleDeclaration(TD))
+        diagnose(TD, /*IsPartialSpec*/true);
+      checkTemplate(TD);
+    }
+  }
+
+  void checkInstantiated(VarDecl *RD) {
+    auto *SD = dyn_cast<VarTemplateSpecializationDecl>(RD);
+    if (!SD)
+      return;
+
+    auto From = SD->getSpecializedTemplateOrPartial();
+    if (auto *TD = From.dyn_cast<VarTemplateDecl *>())
+      checkTemplate(TD);
+    else if (auto *TD =
+                 From.dyn_cast<VarTemplatePartialSpecializationDecl *>()) {
+      if (!S.hasVisibleDeclaration(TD))
+        diagnose(TD, /*IsPartialSpec*/true);
+      checkTemplate(TD);
+    }
+  }
+
+  void checkInstantiated(EnumDecl *FD) {} // Nothing is checked for enums.
+
+  template<typename TemplDecl>
+  void checkTemplate(TemplDecl *TD) {
+    if (TD->isMemberSpecialization()) {
+      if (!S.hasVisibleMemberSpecialization(TD, &Modules))
+        diagnose(TD->getMostRecentDecl(), /*IsPartialSpec*/false);
+    }
+  }
+};
+} // end anonymous namespace
+
+void Sema::checkSpecializationVisibility(SourceLocation Loc, NamedDecl *Spec) {
+  if (!getLangOpts().Modules)
+    return; // Nothing is checked unless the Modules language option is on.
+
+  ExplicitSpecializationVisibilityChecker(*this, Loc).check(Spec);
+}
+
+/// \brief Check whether a template partial specialization that we've discovered
+/// is hidden, and produce suitable diagnostics if so.
+void Sema::checkPartialSpecializationVisibility(SourceLocation Loc,
+                                                NamedDecl *Spec) {
+  llvm::SmallVector<Module *, 8> Modules; // Used by both calls below.
+  if (!hasVisibleDeclaration(Spec, &Modules))
+    diagnoseMissingImport(Loc, Spec, Spec->getLocation(), Modules,
+                          MissingImportKind::PartialSpecialization,
+                          /*Recover*/true);
+}
diff --git a/lib/Sema/SemaTemplateDeduction.cpp b/lib/Sema/SemaTemplateDeduction.cpp
index a25e548..f8e825f 100644
--- a/lib/Sema/SemaTemplateDeduction.cpp
+++ b/lib/Sema/SemaTemplateDeduction.cpp
@@ -103,12 +103,12 @@
                                    bool PartialOrdering = false);
 
 static Sema::TemplateDeductionResult
-DeduceTemplateArguments(Sema &S,
-                        TemplateParameterList *TemplateParams,
+DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
                         const TemplateArgument *Params, unsigned NumParams,
                         const TemplateArgument *Args, unsigned NumArgs,
                         TemplateDeductionInfo &Info,
-                        SmallVectorImpl<DeducedTemplateArgument> &Deduced);
+                        SmallVectorImpl<DeducedTemplateArgument> &Deduced,
+                        bool NumberOfArgumentsMustMatch);
 
 /// \brief If the given expression is of a form that permits the deduction
 /// of a non-type template parameter, return the declaration of that
@@ -286,13 +286,10 @@
 
 /// \brief Deduce the value of the given non-type template parameter
 /// from the given constant.
-static Sema::TemplateDeductionResult
-DeduceNonTypeTemplateArgument(Sema &S,
-                              NonTypeTemplateParmDecl *NTTP,
-                              llvm::APSInt Value, QualType ValueType,
-                              bool DeducedFromArrayBound,
-                              TemplateDeductionInfo &Info,
-                    SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
+static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
+    Sema &S, NonTypeTemplateParmDecl *NTTP, const llvm::APSInt &Value,
+    QualType ValueType, bool DeducedFromArrayBound, TemplateDeductionInfo &Info,
+    SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
   assert(NTTP->getDepth() == 0 &&
          "Cannot deduce non-type template argument with depth > 0");
 
@@ -456,10 +453,10 @@
     // Perform template argument deduction on each template
     // argument. Ignore any missing/extra arguments, since they could be
     // filled in by default arguments.
-    return DeduceTemplateArguments(S, TemplateParams,
-                                   Param->getArgs(), Param->getNumArgs(),
-                                   SpecArg->getArgs(), SpecArg->getNumArgs(),
-                                   Info, Deduced);
+    return DeduceTemplateArguments(S, TemplateParams, Param->getArgs(),
+                                   Param->getNumArgs(), SpecArg->getArgs(),
+                                   SpecArg->getNumArgs(), Info, Deduced,
+                                   /*NumberOfArgumentsMustMatch=*/false);
   }
 
   // If the argument type is a class template specialization, we
@@ -490,11 +487,10 @@
     return Result;
 
   // Perform template argument deduction for the template arguments.
-  return DeduceTemplateArguments(S, TemplateParams,
-                                 Param->getArgs(), Param->getNumArgs(),
-                                 SpecArg->getTemplateArgs().data(),
-                                 SpecArg->getTemplateArgs().size(),
-                                 Info, Deduced);
+  return DeduceTemplateArguments(
+      S, TemplateParams, Param->getArgs(), Param->getNumArgs(),
+      SpecArg->getTemplateArgs().data(), SpecArg->getTemplateArgs().size(),
+      Info, Deduced, /*NumberOfArgumentsMustMatch=*/true);
 }
 
 /// \brief Determines whether the given type is an opaque type that
@@ -867,12 +863,12 @@
 
   if (ParamQs == ArgQs)
     return false;
-       
+
   // Mismatched (but not missing) Objective-C GC attributes.
-  if (ParamQs.getObjCGCAttr() != ArgQs.getObjCGCAttr() && 
+  if (ParamQs.getObjCGCAttr() != ArgQs.getObjCGCAttr() &&
       ParamQs.hasObjCGCAttr())
     return true;
-  
+
   // Mismatched (but not missing) address spaces.
   if (ParamQs.getAddressSpace() != ArgQs.getAddressSpace() &&
       ParamQs.hasAddressSpace())
@@ -882,7 +878,7 @@
   if (ParamQs.getObjCLifetime() != ArgQs.getObjCLifetime() &&
       ParamQs.hasObjCLifetime())
     return true;
-  
+
   // CVR qualifier superset.
   return (ParamQs.getCVRQualifiers() != ArgQs.getCVRQualifiers()) &&
       ((ParamQs.getCVRQualifiers() | ArgQs.getCVRQualifiers())
@@ -1064,7 +1060,7 @@
     // Just skip any attempts to deduce from a placeholder type.
     if (Arg->isPlaceholderType())
       return Sema::TDK_Success;
-    
+
     unsigned Index = TemplateTypeParm->getIndex();
     bool RecanonicalizeArg = false;
 
@@ -1104,7 +1100,7 @@
       DeducedQs.removeAddressSpace();
     if (ParamQs.hasObjCLifetime())
       DeducedQs.removeObjCLifetime();
-    
+
     // Objective-C ARC:
     //   If template deduction would produce a lifetime qualifier on a type
     //   that is not a lifetime type, template argument deduction fails.
@@ -1113,9 +1109,9 @@
       Info.Param = cast<TemplateTypeParmDecl>(TemplateParams->getParam(Index));
       Info.FirstArg = TemplateArgument(Param);
       Info.SecondArg = TemplateArgument(Arg);
-      return Sema::TDK_Underqualified;      
+      return Sema::TDK_Underqualified;
     }
-    
+
     // Objective-C ARC:
     //   If template deduction would produce an argument type with lifetime type
     //   but no lifetime qualifier, the __strong lifetime qualifier is inferred.
@@ -1123,10 +1119,10 @@
         DeducedType->isObjCLifetimeType() &&
         !DeducedQs.hasObjCLifetime())
       DeducedQs.setObjCLifetime(Qualifiers::OCL_Strong);
-    
+
     DeducedType = S.Context.getQualifiedType(DeducedType.getUnqualifiedType(),
                                              DeducedQs);
-    
+
     if (RecanonicalizeArg)
       DeducedType = S.Context.getCanonicalType(DeducedType);
 
@@ -1167,7 +1163,7 @@
       if (Param.getCVRQualifiers() != Arg.getCVRQualifiers())
         return Sema::TDK_NonDeducedMismatch;
     }
-    
+
     // If the parameter type is not dependent, there is nothing to deduce.
     if (!Param->isDependentType()) {
       if (!(TDF & TDF_SkipNonDependent)) {
@@ -1197,7 +1193,7 @@
   case Type::Class: llvm_unreachable("deducing non-canonical type: " #Class);
 #define TYPE(Class, Base)
 #include "clang/AST/TypeNodes.def"
-      
+
     case Type::TemplateTypeParm:
     case Type::SubstTemplateTypeParmPack:
       llvm_unreachable("Type nodes handled above");
@@ -1215,20 +1211,20 @@
     case Type::ObjCObjectPointer: {
       if (TDF & TDF_SkipNonDependent)
         return Sema::TDK_Success;
-      
+
       if (TDF & TDF_IgnoreQualifiers) {
         Param = Param.getUnqualifiedType();
         Arg = Arg.getUnqualifiedType();
       }
-            
+
       return Param == Arg? Sema::TDK_Success : Sema::TDK_NonDeducedMismatch;
     }
-      
-    //     _Complex T   [placeholder extension]  
+
+    //     _Complex T   [placeholder extension]
     case Type::Complex:
       if (const ComplexType *ComplexArg = Arg->getAs<ComplexType>())
-        return DeduceTemplateArgumentsByTypeMatch(S, TemplateParams, 
-                                    cast<ComplexType>(Param)->getElementType(), 
+        return DeduceTemplateArgumentsByTypeMatch(S, TemplateParams,
+                                    cast<ComplexType>(Param)->getElementType(),
                                     ComplexArg->getElementType(),
                                     Info, Deduced, TDF);
 
@@ -1464,6 +1460,7 @@
       SmallVector<const RecordType *, 8> ToVisit;
       ToVisit.push_back(RecordT);
       bool Successful = false;
+      SmallVector<DeducedTemplateArgument, 8> SuccessfulDeduced;
       while (!ToVisit.empty()) {
         // Retrieve the next class in the inheritance hierarchy.
         const RecordType *NextT = ToVisit.pop_back_val();
@@ -1484,14 +1481,20 @@
           // note that we had some success. Otherwise, ignore any deductions
           // from this base class.
           if (BaseResult == Sema::TDK_Success) {
+            // If we've already seen some success, then deduction fails due to
+            // an ambiguity (temp.deduct.call p5).
+            if (Successful)
+              return Sema::TDK_MiscellaneousDeductionFailure;
+
             Successful = true;
-            DeducedOrig.clear();
-            DeducedOrig.append(Deduced.begin(), Deduced.end());
+            std::swap(SuccessfulDeduced, Deduced);
+
             Info.Param = BaseInfo.Param;
             Info.FirstArg = BaseInfo.FirstArg;
             Info.SecondArg = BaseInfo.SecondArg;
-          } else
-            Deduced = DeducedOrig;
+          }
+
+          Deduced = DeducedOrig;
         }
 
         // Visit base classes
@@ -1503,8 +1506,10 @@
         }
       }
 
-      if (Successful)
+      if (Successful) {
+        std::swap(SuccessfulDeduced, Deduced);
         return Sema::TDK_Success;
+      }
 
       return Result;
     }
@@ -1544,7 +1549,7 @@
       return DeduceTemplateArgumentsByTypeMatch(S, TemplateParams,
                                            QualType(MemPtrParam->getClass(), 0),
                                            QualType(MemPtrArg->getClass(), 0),
-                                           Info, Deduced, 
+                                           Info, Deduced,
                                            TDF & TDF_IgnoreQualifiers);
     }
 
@@ -1575,15 +1580,15 @@
         // Make sure that the vectors have the same number of elements.
         if (VectorParam->getNumElements() != VectorArg->getNumElements())
           return Sema::TDK_NonDeducedMismatch;
-        
+
         // Perform deduction on the element types.
         return DeduceTemplateArgumentsByTypeMatch(S, TemplateParams,
                                                   VectorParam->getElementType(),
                                                   VectorArg->getElementType(),
                                                   Info, Deduced, TDF);
       }
-      
-      if (const DependentSizedExtVectorType *VectorArg 
+
+      if (const DependentSizedExtVectorType *VectorArg
                                 = dyn_cast<DependentSizedExtVectorType>(Arg)) {
         // We can't check the number of elements, since the argument has a
         // dependent number of elements. This can only occur during partial
@@ -1595,10 +1600,10 @@
                                                   VectorArg->getElementType(),
                                                   Info, Deduced, TDF);
       }
-      
+
       return Sema::TDK_NonDeducedMismatch;
     }
-      
+
     //     (clang extension)
     //
     //     T __attribute__(((ext_vector_type(N))))
@@ -1614,7 +1619,7 @@
                                                    VectorArg->getElementType(),
                                                    Info, Deduced, TDF))
           return Result;
-        
+
         // Perform deduction on the vector size, if we can.
         NonTypeTemplateParmDecl *NTTP
           = getDeducedParameterFromExpr(VectorParam->getSizeExpr());
@@ -1626,8 +1631,8 @@
         return DeduceNonTypeTemplateArgument(S, NTTP, ArgSize, S.Context.IntTy,
                                              false, Info, Deduced);
       }
-      
-      if (const DependentSizedExtVectorType *VectorArg 
+
+      if (const DependentSizedExtVectorType *VectorArg
                                 = dyn_cast<DependentSizedExtVectorType>(Arg)) {
         // Perform deduction on the element types.
         if (Sema::TemplateDeductionResult Result
@@ -1636,20 +1641,20 @@
                                                  VectorArg->getElementType(),
                                                  Info, Deduced, TDF))
           return Result;
-        
+
         // Perform deduction on the vector size, if we can.
         NonTypeTemplateParmDecl *NTTP
           = getDeducedParameterFromExpr(VectorParam->getSizeExpr());
         if (!NTTP)
           return Sema::TDK_Success;
-        
+
         return DeduceNonTypeTemplateArgument(S, NTTP, VectorArg->getSizeExpr(),
                                              Info, Deduced);
       }
-      
+
       return Sema::TDK_NonDeducedMismatch;
     }
-      
+
     case Type::TypeOfExpr:
     case Type::TypeOf:
     case Type::DependentName:
@@ -1828,12 +1833,12 @@
 }
 
 static Sema::TemplateDeductionResult
-DeduceTemplateArguments(Sema &S,
-                        TemplateParameterList *TemplateParams,
+DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
                         const TemplateArgument *Params, unsigned NumParams,
                         const TemplateArgument *Args, unsigned NumArgs,
                         TemplateDeductionInfo &Info,
-                        SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
+                        SmallVectorImpl<DeducedTemplateArgument> &Deduced,
+                        bool NumberOfArgumentsMustMatch) {
   // C++0x [temp.deduct.type]p9:
   //   If the template argument list of P contains a pack expansion that is not
   //   the last template argument, the entire template argument list is a
@@ -1853,7 +1858,8 @@
 
       // Check whether we have enough arguments.
       if (!hasTemplateArgumentForDeduction(Args, ArgIdx, NumArgs))
-        return Sema::TDK_Success;
+        return NumberOfArgumentsMustMatch ? Sema::TDK_TooFewArguments
+                                          : Sema::TDK_Success;
 
       if (Args[ArgIdx].isPackExpansion()) {
         // FIXME: We follow the logic of C++0x [temp.deduct.type]p22 here,
@@ -1924,7 +1930,7 @@
   return DeduceTemplateArguments(S, TemplateParams,
                                  ParamList.data(), ParamList.size(),
                                  ArgList.data(), ArgList.size(),
-                                 Info, Deduced);
+                                 Info, Deduced, false);
 }
 
 /// \brief Determine whether two template arguments are the same.
@@ -1985,8 +1991,6 @@
 /// \brief Allocate a TemplateArgumentLoc where all locations have
 /// been initialized to the given location.
 ///
-/// \param S The semantic analysis object.
-///
 /// \param Arg The template argument we are producing template argument
 /// location information for.
 ///
@@ -1996,37 +2000,33 @@
 ///
 /// \param Loc The source location to use for the resulting template
 /// argument.
-static TemplateArgumentLoc
-getTrivialTemplateArgumentLoc(Sema &S,
-                              const TemplateArgument &Arg,
-                              QualType NTTPType,
-                              SourceLocation Loc) {
+TemplateArgumentLoc
+Sema::getTrivialTemplateArgumentLoc(const TemplateArgument &Arg,
+                                    QualType NTTPType, SourceLocation Loc) {
   switch (Arg.getKind()) {
   case TemplateArgument::Null:
     llvm_unreachable("Can't get a NULL template argument here");
 
   case TemplateArgument::Type:
-    return TemplateArgumentLoc(Arg,
-                     S.Context.getTrivialTypeSourceInfo(Arg.getAsType(), Loc));
+    return TemplateArgumentLoc(
+        Arg, Context.getTrivialTypeSourceInfo(Arg.getAsType(), Loc));
 
   case TemplateArgument::Declaration: {
-    Expr *E
-      = S.BuildExpressionFromDeclTemplateArgument(Arg, NTTPType, Loc)
-          .getAs<Expr>();
+    Expr *E = BuildExpressionFromDeclTemplateArgument(Arg, NTTPType, Loc)
+                  .getAs<Expr>();
     return TemplateArgumentLoc(TemplateArgument(E), E);
   }
 
   case TemplateArgument::NullPtr: {
-    Expr *E
-      = S.BuildExpressionFromDeclTemplateArgument(Arg, NTTPType, Loc)
-          .getAs<Expr>();
+    Expr *E = BuildExpressionFromDeclTemplateArgument(Arg, NTTPType, Loc)
+                  .getAs<Expr>();
     return TemplateArgumentLoc(TemplateArgument(NTTPType, /*isNullPtr*/true),
                                E);
   }
 
   case TemplateArgument::Integral: {
-    Expr *E
-      = S.BuildExpressionFromIntegralTemplateArgument(Arg, Loc).getAs<Expr>();
+    Expr *E =
+        BuildExpressionFromIntegralTemplateArgument(Arg, Loc).getAs<Expr>();
     return TemplateArgumentLoc(TemplateArgument(E), E);
   }
 
@@ -2035,18 +2035,16 @@
       NestedNameSpecifierLocBuilder Builder;
       TemplateName Template = Arg.getAsTemplate();
       if (DependentTemplateName *DTN = Template.getAsDependentTemplateName())
-        Builder.MakeTrivial(S.Context, DTN->getQualifier(), Loc);
+        Builder.MakeTrivial(Context, DTN->getQualifier(), Loc);
       else if (QualifiedTemplateName *QTN =
                    Template.getAsQualifiedTemplateName())
-        Builder.MakeTrivial(S.Context, QTN->getQualifier(), Loc);
-      
+        Builder.MakeTrivial(Context, QTN->getQualifier(), Loc);
+
       if (Arg.getKind() == TemplateArgument::Template)
-        return TemplateArgumentLoc(Arg, 
-                                   Builder.getWithLocInContext(S.Context),
+        return TemplateArgumentLoc(Arg, Builder.getWithLocInContext(Context),
                                    Loc);
-      
-      
-      return TemplateArgumentLoc(Arg, Builder.getWithLocInContext(S.Context),
+
+      return TemplateArgumentLoc(Arg, Builder.getWithLocInContext(Context),
                                  Loc, Loc);
     }
 
@@ -2067,11 +2065,45 @@
 ConvertDeducedTemplateArgument(Sema &S, NamedDecl *Param,
                                DeducedTemplateArgument Arg,
                                NamedDecl *Template,
-                               QualType NTTPType,
-                               unsigned ArgumentPackIndex,
                                TemplateDeductionInfo &Info,
                                bool InFunctionTemplate,
                                SmallVectorImpl<TemplateArgument> &Output) {
+  // First, for a non-type template parameter type that is
+  // initialized by a declaration, we need the type of the
+  // corresponding non-type template parameter.
+  QualType NTTPType;
+  if (NonTypeTemplateParmDecl *NTTP =
+          dyn_cast<NonTypeTemplateParmDecl>(Param)) {
+    NTTPType = NTTP->getType();
+    if (NTTPType->isDependentType()) {
+      TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Output);
+      NTTPType = S.SubstType(NTTPType,
+                             MultiLevelTemplateArgumentList(TemplateArgs),
+                             NTTP->getLocation(),
+                             NTTP->getDeclName());
+      if (NTTPType.isNull())
+        return true;
+    }
+  }
+
+  auto ConvertArg = [&](DeducedTemplateArgument Arg,
+                        unsigned ArgumentPackIndex) {
+    // Convert the deduced template argument into a template
+    // argument that we can check, almost as if the user had written
+    // the template argument explicitly.
+    TemplateArgumentLoc ArgLoc =
+        S.getTrivialTemplateArgumentLoc(Arg, NTTPType, Info.getLocation());
+
+    // Check the template argument, converting it as necessary.
+    return S.CheckTemplateArgument(
+        Param, ArgLoc, Template, Template->getLocation(),
+        Template->getSourceRange().getEnd(), ArgumentPackIndex, Output,
+        InFunctionTemplate
+            ? (Arg.wasDeducedFromArrayBound() ? Sema::CTAK_DeducedFromArrayBound
+                                              : Sema::CTAK_Deduced)
+            : Sema::CTAK_Specified);
+  };
+
   if (Arg.getKind() == TemplateArgument::Pack) {
     // This is a template argument pack, so check each of its arguments against
     // the template parameter.
@@ -2082,39 +2114,41 @@
       // checking logic has all of the prior template arguments available.
       DeducedTemplateArgument InnerArg(P);
       InnerArg.setDeducedFromArrayBound(Arg.wasDeducedFromArrayBound());
-      if (ConvertDeducedTemplateArgument(S, Param, InnerArg, Template,
-                                         NTTPType, PackedArgsBuilder.size(),
-                                         Info, InFunctionTemplate, Output))
+      assert(InnerArg.getKind() != TemplateArgument::Pack &&
+             "deduced nested pack");
+      if (ConvertArg(InnerArg, PackedArgsBuilder.size()))
         return true;
 
       // Move the converted template argument into our argument pack.
       PackedArgsBuilder.push_back(Output.pop_back_val());
     }
 
+    // If the pack is empty, we still need to substitute into the parameter
+    // itself, in case that substitution fails. For non-type parameters, we did
+    // this above. For type parameters, no substitution is ever required.
+    auto *TTP = dyn_cast<TemplateTemplateParmDecl>(Param);
+    if (TTP && PackedArgsBuilder.empty()) {
+      // Set up a template instantiation context.
+      LocalInstantiationScope Scope(S);
+      Sema::InstantiatingTemplate Inst(S, Template->getLocation(), Template,
+                                       TTP, Output,
+                                       Template->getSourceRange());
+      if (Inst.isInvalid())
+        return true;
+
+      TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Output);
+      if (!S.SubstDecl(TTP, S.CurContext,
+                       MultiLevelTemplateArgumentList(TemplateArgs)))
+        return true;
+    }
+
     // Create the resulting argument pack.
     Output.push_back(
         TemplateArgument::CreatePackCopy(S.Context, PackedArgsBuilder));
     return false;
   }
 
-  // Convert the deduced template argument into a template
-  // argument that we can check, almost as if the user had written
-  // the template argument explicitly.
-  TemplateArgumentLoc ArgLoc = getTrivialTemplateArgumentLoc(S, Arg, NTTPType,
-                                                             Info.getLocation());
-
-  // Check the template argument, converting it as necessary.
-  return S.CheckTemplateArgument(Param, ArgLoc,
-                                 Template,
-                                 Template->getLocation(),
-                                 Template->getSourceRange().getEnd(),
-                                 ArgumentPackIndex,
-                                 Output,
-                                 InFunctionTemplate
-                                  ? (Arg.wasDeducedFromArrayBound()
-                                       ? Sema::CTAK_DeducedFromArrayBound
-                                       : Sema::CTAK_Deduced)
-                                 : Sema::CTAK_Specified);
+  return ConvertArg(Arg, 0);
 }
 
 /// Complete template argument deduction for a class template partial
@@ -2145,47 +2179,19 @@
 
     // We have deduced this argument, so it still needs to be
     // checked and converted.
-
-    // First, for a non-type template parameter type that is
-    // initialized by a declaration, we need the type of the
-    // corresponding non-type template parameter.
-    QualType NTTPType;
-    if (NonTypeTemplateParmDecl *NTTP
-                                  = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
-      NTTPType = NTTP->getType();
-      if (NTTPType->isDependentType()) {
-        TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                          Builder.data(), Builder.size());
-        NTTPType = S.SubstType(NTTPType,
-                               MultiLevelTemplateArgumentList(TemplateArgs),
-                               NTTP->getLocation(),
-                               NTTP->getDeclName());
-        if (NTTPType.isNull()) {
-          Info.Param = makeTemplateParameter(Param);
-          // FIXME: These template arguments are temporary. Free them!
-          Info.reset(TemplateArgumentList::CreateCopy(S.Context,
-                                                      Builder.data(),
-                                                      Builder.size()));
-          return Sema::TDK_SubstitutionFailure;
-        }
-      }
-    }
-
     if (ConvertDeducedTemplateArgument(S, Param, Deduced[I],
-                                       Partial, NTTPType, 0, Info, false,
+                                       Partial, Info, false,
                                        Builder)) {
       Info.Param = makeTemplateParameter(Param);
       // FIXME: These template arguments are temporary. Free them!
-      Info.reset(TemplateArgumentList::CreateCopy(S.Context, Builder.data(),
-                                                  Builder.size()));
+      Info.reset(TemplateArgumentList::CreateCopy(S.Context, Builder));
       return Sema::TDK_SubstitutionFailure;
     }
   }
 
   // Form the template argument list from the deduced template arguments.
   TemplateArgumentList *DeducedArgumentList
-    = TemplateArgumentList::CreateCopy(S.Context, Builder.data(),
-                                       Builder.size());
+    = TemplateArgumentList::CreateCopy(S.Context, Builder);
 
   Info.reset(DeducedArgumentList);
 
@@ -2313,43 +2319,18 @@
 
     // We have deduced this argument, so it still needs to be
     // checked and converted.
-
-    // First, for a non-type template parameter type that is
-    // initialized by a declaration, we need the type of the
-    // corresponding non-type template parameter.
-    QualType NTTPType;
-    if (NonTypeTemplateParmDecl *NTTP =
-            dyn_cast<NonTypeTemplateParmDecl>(Param)) {
-      NTTPType = NTTP->getType();
-      if (NTTPType->isDependentType()) {
-        TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                          Builder.data(), Builder.size());
-        NTTPType =
-            S.SubstType(NTTPType, MultiLevelTemplateArgumentList(TemplateArgs),
-                        NTTP->getLocation(), NTTP->getDeclName());
-        if (NTTPType.isNull()) {
-          Info.Param = makeTemplateParameter(Param);
-          // FIXME: These template arguments are temporary. Free them!
-          Info.reset(TemplateArgumentList::CreateCopy(S.Context, Builder.data(),
-                                                      Builder.size()));
-          return Sema::TDK_SubstitutionFailure;
-        }
-      }
-    }
-
-    if (ConvertDeducedTemplateArgument(S, Param, Deduced[I], Partial, NTTPType,
-                                       0, Info, false, Builder)) {
+    if (ConvertDeducedTemplateArgument(S, Param, Deduced[I], Partial,
+                                       Info, false, Builder)) {
       Info.Param = makeTemplateParameter(Param);
       // FIXME: These template arguments are temporary. Free them!
-      Info.reset(TemplateArgumentList::CreateCopy(S.Context, Builder.data(),
-                                                  Builder.size()));
+      Info.reset(TemplateArgumentList::CreateCopy(S.Context, Builder));
       return Sema::TDK_SubstitutionFailure;
     }
   }
 
   // Form the template argument list from the deduced template arguments.
   TemplateArgumentList *DeducedArgumentList = TemplateArgumentList::CreateCopy(
-      S.Context, Builder.data(), Builder.size());
+      S.Context, Builder);
 
   Info.reset(DeducedArgumentList);
 
@@ -2495,7 +2476,7 @@
   if (ExplicitTemplateArgs.size() == 0) {
     // No arguments to substitute; just copy over the parameter types and
     // fill in the function type.
-    for (auto P : Function->params())
+    for (auto P : Function->parameters())
       ParamTypes.push_back(P->getType());
 
     if (FunctionType)
@@ -2540,7 +2521,7 @@
   // Form the template argument list from the explicitly-specified
   // template arguments.
   TemplateArgumentList *ExplicitArgumentList
-    = TemplateArgumentList::CreateCopy(Context, Builder.data(), Builder.size());
+    = TemplateArgumentList::CreateCopy(Context, Builder);
   Info.reset(ExplicitArgumentList);
 
   // Template argument deduction and the final substitution should be
@@ -2578,22 +2559,21 @@
   // return type, substitute it after the arguments to ensure we substitute
   // in lexical order.
   if (Proto->hasTrailingReturn()) {
-    if (SubstParmTypes(Function->getLocation(),
-                       Function->param_begin(), Function->getNumParams(),
+    if (SubstParmTypes(Function->getLocation(), Function->parameters(),
                        Proto->getExtParameterInfosOrNull(),
                        MultiLevelTemplateArgumentList(*ExplicitArgumentList),
                        ParamTypes, /*params*/ nullptr, ExtParamInfos))
       return TDK_SubstitutionFailure;
   }
-  
+
   // Instantiate the return type.
   QualType ResultType;
   {
     // C++11 [expr.prim.general]p3:
-    //   If a declaration declares a member function or member function 
-    //   template of a class X, the expression this is a prvalue of type 
+    //   If a declaration declares a member function or member function
+    //   template of a class X, the expression this is a prvalue of type
     //   "pointer to cv-qualifier-seq X" between the optional cv-qualifer-seq
-    //   and the end of the function-definition, member-declarator, or 
+    //   and the end of the function-definition, member-declarator, or
     //   declarator.
     unsigned ThisTypeQuals = 0;
     CXXRecordDecl *ThisContext = nullptr;
@@ -2601,7 +2581,7 @@
       ThisContext = Method->getParent();
       ThisTypeQuals = Method->getTypeQualifiers();
     }
-      
+
     CXXThisScopeRAII ThisScope(*this, ThisContext, ThisTypeQuals,
                                getLangOpts().CPlusPlus11);
 
@@ -2616,8 +2596,7 @@
   // Instantiate the types of each of the function parameters given the
   // explicitly-specified template arguments if we didn't do so earlier.
   if (!Proto->hasTrailingReturn() &&
-      SubstParmTypes(Function->getLocation(),
-                     Function->param_begin(), Function->getNumParams(),
+      SubstParmTypes(Function->getLocation(), Function->parameters(),
                      Proto->getExtParameterInfosOrNull(),
                      MultiLevelTemplateArgumentList(*ExplicitArgumentList),
                      ParamTypes, /*params*/ nullptr, ExtParamInfos))
@@ -2658,35 +2637,35 @@
 
 /// \brief Check whether the deduced argument type for a call to a function
 /// template matches the actual argument type per C++ [temp.deduct.call]p4.
-static bool 
-CheckOriginalCallArgDeduction(Sema &S, Sema::OriginalCallArg OriginalArg, 
+static bool
+CheckOriginalCallArgDeduction(Sema &S, Sema::OriginalCallArg OriginalArg,
                               QualType DeducedA) {
   ASTContext &Context = S.Context;
-  
+
   QualType A = OriginalArg.OriginalArgType;
   QualType OriginalParamType = OriginalArg.OriginalParamType;
-  
+
   // Check for type equality (top-level cv-qualifiers are ignored).
   if (Context.hasSameUnqualifiedType(A, DeducedA))
     return false;
-  
+
   // Strip off references on the argument types; they aren't needed for
   // the following checks.
   if (const ReferenceType *DeducedARef = DeducedA->getAs<ReferenceType>())
     DeducedA = DeducedARef->getPointeeType();
   if (const ReferenceType *ARef = A->getAs<ReferenceType>())
     A = ARef->getPointeeType();
-  
+
   // C++ [temp.deduct.call]p4:
   //   [...] However, there are three cases that allow a difference:
-  //     - If the original P is a reference type, the deduced A (i.e., the 
-  //       type referred to by the reference) can be more cv-qualified than 
+  //     - If the original P is a reference type, the deduced A (i.e., the
+  //       type referred to by the reference) can be more cv-qualified than
   //       the transformed A.
   if (const ReferenceType *OriginalParamRef
       = OriginalParamType->getAs<ReferenceType>()) {
     // We don't want to keep the reference around any more.
     OriginalParamType = OriginalParamRef->getPointeeType();
-    
+
     Qualifiers AQuals = A.getQualifiers();
     Qualifiers DeducedAQuals = DeducedA.getQualifiers();
 
@@ -2706,16 +2685,16 @@
       // Qualifiers match; there's nothing to do.
     } else if (!DeducedAQuals.compatiblyIncludes(AQuals)) {
       return true;
-    } else {        
+    } else {
       // Qualifiers are compatible, so have the argument type adopt the
       // deduced argument type's qualifiers as if we had performed the
       // qualification conversion.
       A = Context.getQualifiedType(A.getUnqualifiedType(), DeducedAQuals);
     }
   }
-  
-  //    - The transformed A can be another pointer or pointer to member 
-  //      type that can be converted to the deduced A via a qualification 
+
+  //    - The transformed A can be another pointer or pointer to member
+  //      type that can be converted to the deduced A via a qualification
   //      conversion.
   //
   // Also allow conversions which merely strip [[noreturn]] from function types
@@ -2728,12 +2707,12 @@
                                    ObjCLifetimeConversion) ||
        S.IsNoReturnConversion(A, DeducedA, ResultTy)))
     return false;
-  
-  
-  //    - If P is a class and P has the form simple-template-id, then the 
+
+
+  //    - If P is a class and P has the form simple-template-id, then the
   //      transformed A can be a derived class of the deduced A. [...]
-  //     [...] Likewise, if P is a pointer to a class of the form 
-  //      simple-template-id, the transformed A can be a pointer to a 
+  //     [...] Likewise, if P is a pointer to a class of the form
+  //      simple-template-id, the transformed A can be a pointer to a
   //      derived class pointed to by the deduced A.
   if (const PointerType *OriginalParamPtr
       = OriginalParamType->getAs<PointerType>()) {
@@ -2747,14 +2726,14 @@
       }
     }
   }
-  
+
   if (Context.hasSameUnqualifiedType(A, DeducedA))
     return false;
-  
+
   if (A->isRecordType() && isSimpleTemplateIdType(OriginalParamType) &&
       S.IsDerivedFrom(SourceLocation(), A, DeducedA))
     return false;
-  
+
   return true;
 }
 
@@ -2817,41 +2796,15 @@
         }
         continue;
       }
+
       // We have deduced this argument, so it still needs to be
       // checked and converted.
-
-      // First, for a non-type template parameter type that is
-      // initialized by a declaration, we need the type of the
-      // corresponding non-type template parameter.
-      QualType NTTPType;
-      if (NonTypeTemplateParmDecl *NTTP
-                                = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
-        NTTPType = NTTP->getType();
-        if (NTTPType->isDependentType()) {
-          TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                            Builder.data(), Builder.size());
-          NTTPType = SubstType(NTTPType,
-                               MultiLevelTemplateArgumentList(TemplateArgs),
-                               NTTP->getLocation(),
-                               NTTP->getDeclName());
-          if (NTTPType.isNull()) {
-            Info.Param = makeTemplateParameter(Param);
-            // FIXME: These template arguments are temporary. Free them!
-            Info.reset(TemplateArgumentList::CreateCopy(Context,
-                                                        Builder.data(),
-                                                        Builder.size()));
-            return TDK_SubstitutionFailure;
-          }
-        }
-      }
-
       if (ConvertDeducedTemplateArgument(*this, Param, Deduced[I],
-                                         FunctionTemplate, NTTPType, 0, Info,
+                                         FunctionTemplate, Info,
                                          true, Builder)) {
         Info.Param = makeTemplateParameter(Param);
         // FIXME: These template arguments are temporary. Free them!
-        Info.reset(TemplateArgumentList::CreateCopy(Context, Builder.data(),
-                                                    Builder.size()));
+        Info.reset(TemplateArgumentList::CreateCopy(Context, Builder));
         return TDK_SubstitutionFailure;
       }
 
@@ -2875,11 +2828,21 @@
         Builder.push_back(TemplateArgument(
             llvm::makeArrayRef(ExplicitArgs, NumExplicitArgs)));
 
-        // Forget the partially-substituted pack; it's substitution is now
+        // Forget the partially-substituted pack; its substitution is now
         // complete.
         CurrentInstantiationScope->ResetPartiallySubstitutedPack();
       } else {
-        Builder.push_back(TemplateArgument::getEmptyPack());
+        // Go through the motions of checking the empty argument pack against
+        // the parameter pack.
+        DeducedTemplateArgument DeducedPack(TemplateArgument::getEmptyPack());
+        if (ConvertDeducedTemplateArgument(*this, Param, DeducedPack,
+                                           FunctionTemplate, Info, true,
+                                           Builder)) {
+          Info.Param = makeTemplateParameter(Param);
+          // FIXME: These template arguments are temporary. Free them!
+          Info.reset(TemplateArgumentList::CreateCopy(Context, Builder));
+          return TDK_SubstitutionFailure;
+        }
       }
       continue;
     }
@@ -2897,8 +2860,7 @@
     if (DefArg.getArgument().isNull()) {
       Info.Param = makeTemplateParameter(
                          const_cast<NamedDecl *>(TemplateParams->getParam(I)));
-      Info.reset(TemplateArgumentList::CreateCopy(Context, Builder.data(),
-                                                  Builder.size()));
+      Info.reset(TemplateArgumentList::CreateCopy(Context, Builder));
       if (PartialOverloading) break;
 
       return HasDefaultArg ? TDK_SubstitutionFailure : TDK_Incomplete;
@@ -2914,8 +2876,7 @@
       Info.Param = makeTemplateParameter(
                          const_cast<NamedDecl *>(TemplateParams->getParam(I)));
       // FIXME: These template arguments are temporary. Free them!
-      Info.reset(TemplateArgumentList::CreateCopy(Context, Builder.data(),
-                                                  Builder.size()));
+      Info.reset(TemplateArgumentList::CreateCopy(Context, Builder));
       return TDK_SubstitutionFailure;
     }
 
@@ -2924,7 +2885,7 @@
 
   // Form the template argument list from the deduced template arguments.
   TemplateArgumentList *DeducedArgumentList
-    = TemplateArgumentList::CreateCopy(Context, Builder.data(), Builder.size());
+    = TemplateArgumentList::CreateCopy(Context, Builder);
   Info.reset(DeducedArgumentList);
 
   // Substitute the deduced template arguments into the function template
@@ -2958,15 +2919,15 @@
   if (OriginalCallArgs) {
     // C++ [temp.deduct.call]p4:
     //   In general, the deduction process attempts to find template argument
-    //   values that will make the deduced A identical to A (after the type A 
+    //   values that will make the deduced A identical to A (after the type A
     //   is transformed as described above). [...]
     for (unsigned I = 0, N = OriginalCallArgs->size(); I != N; ++I) {
       OriginalCallArg OriginalArg = (*OriginalCallArgs)[I];
       unsigned ParamIdx = OriginalArg.ArgIdx;
-      
+
       if (ParamIdx >= Specialization->getNumParams())
         continue;
-      
+
       QualType DeducedA = Specialization->getParamDecl(ParamIdx)->getType();
       if (CheckOriginalCallArgDeduction(*this, OriginalArg, DeducedA)) {
         Info.FirstArg = TemplateArgument(DeducedA);
@@ -2976,7 +2937,7 @@
       }
     }
   }
-  
+
   // If we suppressed any diagnostics while performing template argument
   // deduction, and if we haven't already instantiated this declaration,
   // keep track of these diagnostics. They'll be emitted if this specialization
@@ -3049,9 +3010,14 @@
         return GetTypeOfFunction(S, R, ExplicitSpec);
     }
 
+    DeclAccessPair DAP;
+    if (FunctionDecl *Viable =
+            S.resolveAddressOfOnlyViableOverloadCandidate(Arg, DAP))
+      return GetTypeOfFunction(S, R, Viable);
+
     return QualType();
   }
-  
+
   // Gather the explicit template arguments, if any.
   TemplateArgumentListInfo ExplicitTemplateArgs;
   if (Ovl->hasExplicitTemplateArgs())
@@ -3067,14 +3033,14 @@
       //     non-deduced context.
       if (!Ovl->hasExplicitTemplateArgs())
         return QualType();
-      
-      // Otherwise, see if we can resolve a function type 
+
+      // Otherwise, see if we can resolve a function type
       FunctionDecl *Specialization = nullptr;
       TemplateDeductionInfo Info(Ovl->getNameLoc());
       if (S.DeduceTemplateArguments(FunTmpl, &ExplicitTemplateArgs,
                                     Specialization, Info))
         continue;
-      
+
       D = Specialization;
     }
 
@@ -3317,7 +3283,7 @@
 
   // For all other cases, just match by type.
   QualType ArgType = Arg->getType();
-  if (AdjustFunctionParmAndArgTypesForDeduction(S, TemplateParams, ParamType, 
+  if (AdjustFunctionParmAndArgTypesForDeduction(S, TemplateParams, ParamType,
                                                 ArgType, Arg, TDF)) {
     Info.Expression = Arg;
     return Sema::TDK_FailedOverloadResolution;
@@ -3408,7 +3374,7 @@
        ParamIdx != NumParamTypes; ++ParamIdx) {
     QualType OrigParamType = ParamTypes[ParamIdx];
     QualType ParamType = OrigParamType;
-    
+
     const PackExpansionType *ParamExpansion
       = dyn_cast<PackExpansionType>(ParamType);
     if (!ParamExpansion) {
@@ -3418,7 +3384,7 @@
 
       Expr *Arg = Args[ArgIdx++];
       QualType ArgType = Arg->getType();
-      
+
       unsigned TDF = 0;
       if (AdjustFunctionParmAndArgTypesForDeduction(*this, TemplateParams,
                                                     ParamType, ArgType, Arg,
@@ -3445,7 +3411,7 @@
 
       // Keep track of the argument type and corresponding parameter index,
       // so we can check for compatibility between the deduced A and A.
-      OriginalCallArgs.push_back(OriginalCallArg(OrigParamType, ArgIdx-1, 
+      OriginalCallArgs.push_back(OriginalCallArg(OrigParamType, ArgIdx-1,
                                                  ArgType));
 
       if (TemplateDeductionResult Result
@@ -3508,7 +3474,7 @@
         // Keep track of the argument type and corresponding argument index,
         // so we can check for compatibility between the deduced A and A.
         if (hasDeducibleTemplateParameters(*this, FunctionTemplate, ParamType))
-          OriginalCallArgs.push_back(OriginalCallArg(OrigParamType, ArgIdx, 
+          OriginalCallArgs.push_back(OriginalCallArg(OrigParamType, ArgIdx,
                                                      ArgType));
 
         if (TemplateDeductionResult Result
@@ -3669,70 +3635,70 @@
   return TDK_Success;
 }
 
-/// \brief Given a function declaration (e.g. a generic lambda conversion 
-///  function) that contains an 'auto' in its result type, substitute it 
+/// \brief Given a function declaration (e.g. a generic lambda conversion
+///  function) that contains an 'auto' in its result type, substitute it
 ///  with TypeToReplaceAutoWith.  Be careful to pass in the type you want
 ///  to replace 'auto' with and not the actual result type you want
 ///  to set the function to.
-static inline void 
-SubstAutoWithinFunctionReturnType(FunctionDecl *F, 
+static inline void
+SubstAutoWithinFunctionReturnType(FunctionDecl *F,
                                     QualType TypeToReplaceAutoWith, Sema &S) {
   assert(!TypeToReplaceAutoWith->getContainedAutoType());
   QualType AutoResultType = F->getReturnType();
-  assert(AutoResultType->getContainedAutoType()); 
-  QualType DeducedResultType = S.SubstAutoType(AutoResultType, 
+  assert(AutoResultType->getContainedAutoType());
+  QualType DeducedResultType = S.SubstAutoType(AutoResultType,
                                                TypeToReplaceAutoWith);
   S.Context.adjustDeducedFunctionResultType(F, DeducedResultType);
 }
 
-/// \brief Given a specialized conversion operator of a generic lambda 
-/// create the corresponding specializations of the call operator and 
-/// the static-invoker. If the return type of the call operator is auto, 
-/// deduce its return type and check if that matches the 
+/// \brief Given a specialized conversion operator of a generic lambda
+/// create the corresponding specializations of the call operator and
+/// the static-invoker. If the return type of the call operator is auto,
+/// deduce its return type and check if that matches the
 /// return type of the destination function ptr.
 
-static inline Sema::TemplateDeductionResult 
+static inline Sema::TemplateDeductionResult
 SpecializeCorrespondingLambdaCallOperatorAndInvoker(
     CXXConversionDecl *ConversionSpecialized,
     SmallVectorImpl<DeducedTemplateArgument> &DeducedArguments,
     QualType ReturnTypeOfDestFunctionPtr,
     TemplateDeductionInfo &TDInfo,
     Sema &S) {
-  
+
   CXXRecordDecl *LambdaClass = ConversionSpecialized->getParent();
-  assert(LambdaClass && LambdaClass->isGenericLambda()); 
-  
+  assert(LambdaClass && LambdaClass->isGenericLambda());
+
   CXXMethodDecl *CallOpGeneric = LambdaClass->getLambdaCallOperator();
   QualType CallOpResultType = CallOpGeneric->getReturnType();
-  const bool GenericLambdaCallOperatorHasDeducedReturnType = 
+  const bool GenericLambdaCallOperatorHasDeducedReturnType =
       CallOpResultType->getContainedAutoType();
-  
-  FunctionTemplateDecl *CallOpTemplate = 
+
+  FunctionTemplateDecl *CallOpTemplate =
       CallOpGeneric->getDescribedFunctionTemplate();
 
   FunctionDecl *CallOpSpecialized = nullptr;
-  // Use the deduced arguments of the conversion function, to specialize our 
+  // Use the deduced arguments of the conversion function, to specialize our
   // generic lambda's call operator.
   if (Sema::TemplateDeductionResult Result
-      = S.FinishTemplateArgumentDeduction(CallOpTemplate, 
-                                          DeducedArguments, 
+      = S.FinishTemplateArgumentDeduction(CallOpTemplate,
+                                          DeducedArguments,
                                           0, CallOpSpecialized, TDInfo))
     return Result;
- 
+
   // If we need to deduce the return type, do so (instantiates the callop).
   if (GenericLambdaCallOperatorHasDeducedReturnType &&
       CallOpSpecialized->getReturnType()->isUndeducedType())
-    S.DeduceReturnType(CallOpSpecialized, 
+    S.DeduceReturnType(CallOpSpecialized,
                        CallOpSpecialized->getPointOfInstantiation(),
                        /*Diagnose*/ true);
-    
+
   // Check to see if the return type of the destination ptr-to-function
   // matches the return type of the call operator.
   if (!S.Context.hasSameType(CallOpSpecialized->getReturnType(),
                              ReturnTypeOfDestFunctionPtr))
     return Sema::TDK_NonDeducedMismatch;
   // Since we have succeeded in matching the source and destination
-  // ptr-to-functions (now including return type), and have successfully 
+  // ptr-to-functions (now including return type), and have successfully
   // specialized our corresponding call operator, we are ready to
   // specialize the static invoker with the deduced arguments of our
   // ptr-to-function.
@@ -3743,16 +3709,16 @@
 #ifndef NDEBUG
   Sema::TemplateDeductionResult LLVM_ATTRIBUTE_UNUSED Result =
 #endif
-    S.FinishTemplateArgumentDeduction(InvokerTemplate, DeducedArguments, 0, 
+    S.FinishTemplateArgumentDeduction(InvokerTemplate, DeducedArguments, 0,
           InvokerSpecialized, TDInfo);
-  assert(Result == Sema::TDK_Success && 
+  assert(Result == Sema::TDK_Success &&
     "If the call operator succeeded so should the invoker!");
   // Set the result type to match the corresponding call operator
   // specialization's result type.
   if (GenericLambdaCallOperatorHasDeducedReturnType &&
       InvokerSpecialized->getReturnType()->isUndeducedType()) {
     // Be sure to get the type to replace 'auto' with and not
-    // the full result type of the call op specialization 
+    // the full result type of the call op specialization
     // to substitute into the 'auto' of the invoker and conversion
     // function.
     // For e.g.
@@ -3764,14 +3730,14 @@
                                          ->getDeducedType();
     SubstAutoWithinFunctionReturnType(InvokerSpecialized,
         TypeToReplaceAutoWith, S);
-    SubstAutoWithinFunctionReturnType(ConversionSpecialized, 
+    SubstAutoWithinFunctionReturnType(ConversionSpecialized,
         TypeToReplaceAutoWith, S);
   }
-    
+
   // Ensure that static invoker doesn't have a const qualifier.
-  // FIXME: When creating the InvokerTemplate in SemaLambda.cpp 
+  // FIXME: When creating the InvokerTemplate in SemaLambda.cpp
   // do not use the CallOperator's TypeSourceInfo which allows
-  // the const qualifier to leak through. 
+  // the const qualifier to leak through.
   const FunctionProtoType *InvokerFPT = InvokerSpecialized->
                   getType().getTypePtr()->castAs<FunctionProtoType>();
   FunctionProtoType::ExtProtoInfo EPI = InvokerFPT->getExtProtoInfo();
@@ -3883,7 +3849,7 @@
   // Finish template argument deduction.
   FunctionDecl *ConversionSpecialized = nullptr;
   TemplateDeductionResult Result
-      = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0, 
+      = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0,
                                         ConversionSpecialized, Info);
   Specialization = cast_or_null<CXXConversionDecl>(ConversionSpecialized);
 
@@ -3892,19 +3858,19 @@
   // function to specialize the corresponding call operator.
   //   e.g., int (*fp)(int) = [](auto a) { return a; };
   if (Result == TDK_Success && isLambdaConversionOperator(ConversionGeneric)) {
-    
+
     // Get the return type of the destination ptr-to-function we are converting
-    // to.  This is necessary for matching the lambda call operator's return 
+    // to.  This is necessary for matching the lambda call operator's return
     // type to that of the destination ptr-to-function's return type.
-    assert(A->isPointerType() && 
+    assert(A->isPointerType() &&
         "Can only convert from lambda to ptr-to-function");
-    const FunctionType *ToFunType = 
+    const FunctionType *ToFunType =
         A->getPointeeType().getTypePtr()->getAs<FunctionType>();
     const QualType DestFunctionPtrReturnType = ToFunType->getReturnType();
 
-    // Create the corresponding specializations of the call operator and 
-    // the static-invoker; and if the return type is auto, 
-    // deduce the return type and check if it matches the 
+    // Create the corresponding specializations of the call operator and
+    // the static-invoker; and if the return type is auto,
+    // deduce the return type and check if it matches the
     // DestFunctionPtrReturnType.
     // For instance:
     //   auto L = [](auto a) { return f(a); };
@@ -3912,7 +3878,7 @@
     //   char (*fp2)(int) = L; <-- Not OK.
 
     Result = SpecializeCorrespondingLambdaCallOperatorAndInvoker(
-        Specialization, Deduced, DestFunctionPtrReturnType, 
+        Specialization, Deduced, DestFunctionPtrReturnType,
         Info, *this);
   }
   return Result;
@@ -4062,8 +4028,8 @@
                                  nullptr, false, false);
   QualType TemplArg = QualType(TemplParam->getTypeForDecl(), 0);
   NamedDecl *TemplParamPtr = TemplParam;
-  FixedSizeTemplateParameterListStorage<1> TemplateParamsSt(
-      Loc, Loc, TemplParamPtr, Loc);
+  FixedSizeTemplateParameterListStorage<1, false> TemplateParamsSt(
+      Loc, Loc, TemplParamPtr, Loc, nullptr);
 
   QualType FuncParam = SubstituteAutoTransform(*this, TemplArg).Apply(Type);
   assert(!FuncParam.isNull() &&
@@ -4129,13 +4095,13 @@
   return DAR_Succeeded;
 }
 
-QualType Sema::SubstAutoType(QualType TypeWithAuto, 
+QualType Sema::SubstAutoType(QualType TypeWithAuto,
                              QualType TypeToReplaceAuto) {
   return SubstituteAutoTransform(*this, TypeToReplaceAuto).
                TransformType(TypeWithAuto);
 }
 
-TypeSourceInfo* Sema::SubstAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto, 
+TypeSourceInfo* Sema::SubstAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto,
                              QualType TypeToReplaceAuto) {
     return SubstituteAutoTransform(*this, TypeToReplaceAuto).
                TransformType(TypeWithAuto);
@@ -4622,11 +4588,9 @@
   TemplateName Name(PS1->getSpecializedTemplate());
   TemplateName CanonTemplate = Context.getCanonicalTemplateName(Name);
   QualType PT1 = Context.getTemplateSpecializationType(
-      CanonTemplate, PS1->getTemplateArgs().data(),
-      PS1->getTemplateArgs().size());
+      CanonTemplate, PS1->getTemplateArgs().asArray());
   QualType PT2 = Context.getTemplateSpecializationType(
-      CanonTemplate, PS2->getTemplateArgs().data(),
-      PS2->getTemplateArgs().size());
+      CanonTemplate, PS2->getTemplateArgs().asArray());
 
   // Determine whether PS1 is at least as specialized as PS2
   Deduced.resize(PS2->getTemplateParameters()->size());
@@ -5082,7 +5046,7 @@
   TemplateParameterList *TemplateParams
     = FunctionTemplate->getTemplateParameters();
   llvm::SmallBitVector Deduced(TemplateParams->size());
-  ::MarkUsedTemplateParameters(S.Context, T, true, TemplateParams->getDepth(), 
+  ::MarkUsedTemplateParameters(S.Context, T, true, TemplateParams->getDepth(),
                                Deduced);
 
   return Deduced.any();
diff --git a/lib/Sema/SemaTemplateInstantiate.cpp b/lib/Sema/SemaTemplateInstantiate.cpp
index caf8693..da947fc 100644
--- a/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/lib/Sema/SemaTemplateInstantiate.cpp
@@ -15,12 +15,14 @@
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTLambda.h"
+#include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/PrettyDeclStackTrace.h"
 #include "clang/Sema/Template.h"
 #include "clang/Sema/TemplateDeduction.h"
 
@@ -446,10 +448,8 @@
       SmallVector<char, 128> TemplateArgsStr;
       llvm::raw_svector_ostream OS(TemplateArgsStr);
       Template->printName(OS);
-      TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                                         Active->TemplateArgs,
-                                                      Active->NumTemplateArgs,
-                                                      getPrintingPolicy());
+      TemplateSpecializationType::PrintTemplateArgumentList(
+          OS, Active->template_arguments(), getPrintingPolicy());
       Diags.Report(Active->PointOfInstantiation,
                    diag::note_default_arg_instantiation_here)
         << OS.str()
@@ -500,10 +500,8 @@
       SmallVector<char, 128> TemplateArgsStr;
       llvm::raw_svector_ostream OS(TemplateArgsStr);
       FD->printName(OS);
-      TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                                         Active->TemplateArgs,
-                                                      Active->NumTemplateArgs,
-                                                      getPrintingPolicy());
+      TemplateSpecializationType::PrintTemplateArgumentList(
+          OS, Active->template_arguments(), getPrintingPolicy());
       Diags.Report(Active->PointOfInstantiation,
                    diag::note_default_function_arg_instantiation_here)
         << OS.str()
@@ -1517,7 +1515,7 @@
 }
 
 static bool NeedsInstantiationAsFunctionType(TypeSourceInfo *T) {
-  if (T->getType()->isInstantiationDependentType() || 
+  if (T->getType()->isInstantiationDependentType() ||
       T->getType()->isVariablyModifiedType())
     return true;
 
@@ -1526,23 +1524,13 @@
     return false;
 
   FunctionProtoTypeLoc FP = TL.castAs<FunctionProtoTypeLoc>();
-  for (unsigned I = 0, E = FP.getNumParams(); I != E; ++I) {
-    ParmVarDecl *P = FP.getParam(I);
-
+  for (ParmVarDecl *P : FP.getParams()) {
     // This must be synthesized from a typedef.
     if (!P) continue;
 
-    // The parameter's type as written might be dependent even if the
-    // decayed type was not dependent.
-    if (TypeSourceInfo *TSInfo = P->getTypeSourceInfo())
-      if (TSInfo->getType()->isInstantiationDependentType())
-        return true;
-
-    // TODO: currently we always rebuild expressions.  When we
-    // properly get lazier about this, we should use the same
-    // logic to avoid rebuilding prototypes here.
-    if (P->hasDefaultArg())
-      return true;
+    // If there are any parameters, a new TypeSourceInfo that refers to the
+    // instantiated parameters must be built.
+    return true;
   }
 
   return false;
@@ -1561,7 +1549,7 @@
   assert(!ActiveTemplateInstantiations.empty() &&
          "Cannot perform an instantiation without some context on the "
          "instantiation stack");
-  
+
   if (!NeedsInstantiationAsFunctionType(T))
     return T;
 
@@ -1723,23 +1711,21 @@
 /// \brief Substitute the given template arguments into the given set of
 /// parameters, producing the set of parameter types that would be generated
 /// from such a substitution.
-bool Sema::SubstParmTypes(SourceLocation Loc, 
-                          ParmVarDecl **Params, unsigned NumParams,
-                    const FunctionProtoType::ExtParameterInfo *ExtParamInfos,
-                          const MultiLevelTemplateArgumentList &TemplateArgs,
-                          SmallVectorImpl<QualType> &ParamTypes,
-                          SmallVectorImpl<ParmVarDecl *> *OutParams,
-                          ExtParameterInfoBuilder &ParamInfos) {
+bool Sema::SubstParmTypes(
+    SourceLocation Loc, ArrayRef<ParmVarDecl *> Params,
+    const FunctionProtoType::ExtParameterInfo *ExtParamInfos,
+    const MultiLevelTemplateArgumentList &TemplateArgs,
+    SmallVectorImpl<QualType> &ParamTypes,
+    SmallVectorImpl<ParmVarDecl *> *OutParams,
+    ExtParameterInfoBuilder &ParamInfos) {
   assert(!ActiveTemplateInstantiations.empty() &&
          "Cannot perform an instantiation without some context on the "
          "instantiation stack");
   
   TemplateInstantiator Instantiator(*this, TemplateArgs, Loc, 
                                     DeclarationName());
-  return Instantiator.TransformFunctionTypeParams(Loc, Params, NumParams,
-                                                  nullptr, ExtParamInfos,
-                                                  ParamTypes, OutParams,
-                                                  ParamInfos);
+  return Instantiator.TransformFunctionTypeParams(
+      Loc, Params, nullptr, ExtParamInfos, ParamTypes, OutParams, ParamInfos);
 }
 
 /// \brief Perform substitution on the base class specifiers of the
@@ -1869,8 +1855,19 @@
                                            TagDecl *PatternDef,
                                            TemplateSpecializationKind TSK,
                                            bool Complain = true) {
-  if (PatternDef && !PatternDef->isBeingDefined())
+  if (PatternDef && !PatternDef->isBeingDefined()) {
+    NamedDecl *SuggestedDef = nullptr;
+    if (!S.hasVisibleDefinition(PatternDef, &SuggestedDef,
+                                /*OnlyNeedComplete*/false)) {
+      // If we're allowed to diagnose this and recover, do so.
+      bool Recover = Complain && !S.isSFINAEContext();
+      if (Complain)
+        S.diagnoseMissingImport(PointOfInstantiation, SuggestedDef,
+                                Sema::MissingImportKind::Definition, Recover);
+      return !Recover;
+    }
     return false;
+  }
 
   if (!Complain || (PatternDef && PatternDef->isInvalidDecl())) {
     // Say nothing
@@ -1954,6 +1951,8 @@
   InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
   if (Inst.isInvalid())
     return true;
+  PrettyDeclStackTraceEntry CrashInfo(*this, Instantiation, SourceLocation(),
+                                      "instantiating class definition");
 
   // Enter the scope of this instantiation. We don't use
   // PushDeclContext because we don't have a scope.
@@ -1967,6 +1966,13 @@
   bool MergeWithParentScope = !Instantiation->isDefinedOutsideFunctionOrMethod();
   LocalInstantiationScope Scope(*this, MergeWithParentScope);
 
+  // All dllexported classes created during instantiation should be fully
+  // emitted after instantiation completes. We may not be ready to emit any
+  // delayed classes already on the stack, so save them away and put them back
+  // later.
+  decltype(DelayedDllExportClasses) ExportedClasses;
+  std::swap(ExportedClasses, DelayedDllExportClasses);
+
   // Pull attributes from the pattern onto the instantiation.
   InstantiateAttrs(TemplateArgs, Pattern, Instantiation);
 
@@ -2052,6 +2058,9 @@
   // default arg exprs for default constructors if necessary now.
   ActOnFinishCXXNonNestedClass(Instantiation);
 
+  // Put back the delayed exported classes that we moved out of the way.
+  std::swap(ExportedClasses, DelayedDllExportClasses);
+
   // Instantiate late parsed attributes, and attach them to their decls.
   // See Sema::InstantiateAttrs
   for (LateInstantiatedAttrVec::iterator I = LateAttrs.begin(),
@@ -2167,6 +2176,8 @@
   InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
   if (Inst.isInvalid())
     return true;
+  PrettyDeclStackTraceEntry CrashInfo(*this, Instantiation, SourceLocation(),
+                                      "instantiating enum definition");
 
   // The instantiation is visible here, even if it was first declared in an
   // unimported module.
@@ -2239,6 +2250,8 @@
   InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
   if (Inst.isInvalid())
     return true;
+  PrettyDeclStackTraceEntry CrashInfo(*this, Instantiation, SourceLocation(),
+                                      "instantiating default member init");
 
   // Enter the scope of this instantiation. We don't use PushDeclContext because
   // we don't have a scope.
@@ -2259,6 +2272,9 @@
   ActOnFinishCXXInClassMemberInitializer(
       Instantiation, Init ? Init->getLocStart() : SourceLocation(), Init);
 
+  if (auto *L = getASTMutationListener())
+    L->DefaultMemberInitializerInstantiated(Instantiation);
+
   // Exit the scope of this instantiation.
   SavedContext.pop();
 
@@ -2310,8 +2326,9 @@
                                     Info)) {
       // Store the failed-deduction information for use in diagnostics, later.
       // TODO: Actually use the failed-deduction info?
-      FailedCandidates.addCandidate()
-          .set(Partial, MakeDeductionFailureInfo(Context, Result, Info));
+      FailedCandidates.addCandidate().set(
+          DeclAccessPair::make(Template, AS_public), Partial,
+          MakeDeductionFailureInfo(Context, Result, Info));
       (void)Result;
     } else {
       Matched.push_back(PartialSpecMatchResult());
@@ -2503,8 +2520,7 @@
           //   specialization and is only an explicit instantiation definition 
           //   of members whose definition is visible at the point of 
           //   instantiation.
-          if (!Var->getInstantiatedFromStaticDataMember()
-                                                     ->getOutOfLineDefinition())
+          if (!Var->getInstantiatedFromStaticDataMember()->getDefinition())
             continue;
           
           Var->setTemplateSpecializationKind(TSK, PointOfInstantiation);
@@ -2530,6 +2546,13 @@
                                                 == TSK_ExplicitSpecialization)
         continue;
 
+      if (Context.getTargetInfo().getCXXABI().isMicrosoft() &&
+          TSK == TSK_ExplicitInstantiationDeclaration) {
+        // In MSVC mode, explicit instantiation decl of the outer class doesn't
+        // affect the inner class.
+        continue;
+      }
+
       if (CheckSpecializationInstantiationRedecl(PointOfInstantiation, TSK, 
                                                  Record, 
                                         MSInfo->getTemplateSpecializationKind(),
@@ -2591,7 +2614,7 @@
       if (Enum->getDefinition())
         continue;
 
-      EnumDecl *Pattern = Enum->getInstantiatedFromMemberEnum();
+      EnumDecl *Pattern = Enum->getTemplateInstantiationPattern();
       assert(Pattern && "Missing instantiated-from-template information");
 
       if (TSK == TSK_ExplicitInstantiationDefinition) {
@@ -2611,8 +2634,7 @@
             Instantiation->getTemplateInstantiationPattern();
         DeclContext::lookup_result Lookup =
             ClassPattern->lookup(Field->getDeclName());
-        assert(Lookup.size() == 1);
-        FieldDecl *Pattern = cast<FieldDecl>(Lookup[0]);
+        FieldDecl *Pattern = cast<FieldDecl>(Lookup.front());
         InstantiateInClassInitializer(PointOfInstantiation, Field, Pattern,
                                       TemplateArgs);
       }
diff --git a/lib/Sema/SemaTemplateInstantiateDecl.cpp b/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 8adc88f..1e28b35 100644
--- a/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/TypeLoc.h"
+#include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/PrettyDeclStackTrace.h"
 #include "clang/Sema/Template.h"
@@ -227,6 +228,86 @@
                         Attr.getSpellingListIndex());
 }
 
+static void
+instantiateDependentModeAttr(Sema &S,
+                             const MultiLevelTemplateArgumentList &TemplateArgs,
+                             const ModeAttr &Attr, Decl *New) {
+  S.AddModeAttr(Attr.getRange(), New, Attr.getMode(),
+                Attr.getSpellingListIndex(), /*InInstantiation=*/true);
+}
+
+/// Instantiation of 'declare simd' attribute and its arguments.
+static void instantiateOMPDeclareSimdDeclAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const OMPDeclareSimdDeclAttr &Attr, Decl *New) {
+  // Allow 'this' in clauses with varlists.
+  if (auto *FTD = dyn_cast<FunctionTemplateDecl>(New))
+    New = FTD->getTemplatedDecl();
+  auto *FD = cast<FunctionDecl>(New);
+  auto *ThisContext = dyn_cast_or_null<CXXRecordDecl>(FD->getDeclContext());
+  SmallVector<Expr *, 4> Uniforms, Aligneds, Alignments, Linears, Steps;
+  SmallVector<unsigned, 4> LinModifiers;
+
+  auto &&Subst = [&](Expr *E) -> ExprResult {
+    if (auto *DRE = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts()))
+      if (auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
+        Sema::ContextRAII SavedContext(S, FD);
+        LocalInstantiationScope Local(S);
+        if (FD->getNumParams() > PVD->getFunctionScopeIndex())
+          Local.InstantiatedLocal(
+              PVD, FD->getParamDecl(PVD->getFunctionScopeIndex()));
+        return S.SubstExpr(E, TemplateArgs);
+      }
+    Sema::CXXThisScopeRAII ThisScope(S, ThisContext, /*TypeQuals=*/0,
+                                     FD->isCXXInstanceMember());
+    return S.SubstExpr(E, TemplateArgs);
+  };
+
+  ExprResult Simdlen;
+  if (auto *E = Attr.getSimdlen())
+    Simdlen = Subst(E);
+
+  if (Attr.uniforms_size() > 0) {
+    for(auto *E : Attr.uniforms()) {
+      ExprResult Inst = Subst(E);
+      if (Inst.isInvalid())
+        continue;
+      Uniforms.push_back(Inst.get());
+    }
+  }
+
+  auto AI = Attr.alignments_begin();
+  for (auto *E : Attr.aligneds()) {
+    ExprResult Inst = Subst(E);
+    if (Inst.isInvalid())
+      continue;
+    Aligneds.push_back(Inst.get());
+    Inst = ExprEmpty();
+    if (*AI)
+      Inst = S.SubstExpr(*AI, TemplateArgs);
+    Alignments.push_back(Inst.get());
+    ++AI;
+  }
+
+  auto SI = Attr.steps_begin();
+  for (auto *E : Attr.linears()) {
+    ExprResult Inst = Subst(E);
+    if (Inst.isInvalid())
+      continue;
+    Linears.push_back(Inst.get());
+    Inst = ExprEmpty();
+    if (*SI)
+      Inst = S.SubstExpr(*SI, TemplateArgs);
+    Steps.push_back(Inst.get());
+    ++SI;
+  }
+  LinModifiers.append(Attr.modifiers_begin(), Attr.modifiers_end());
+  (void)S.ActOnOpenMPDeclareSimdDirective(
+      S.ConvertDeclToDeclGroup(New), Attr.getBranchState(), Simdlen.get(),
+      Uniforms, Aligneds, Alignments, Linears, LinModifiers, Steps,
+      Attr.getRange());
+}
+
 void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
                             const Decl *Tmpl, Decl *New,
                             LateInstantiatedAttrVec *LateAttrs,
@@ -265,6 +346,16 @@
       continue;
     }
 
+    if (const ModeAttr *Mode = dyn_cast<ModeAttr>(TmplAttr)) {
+      instantiateDependentModeAttr(*this, TemplateArgs, *Mode, New);
+      continue;
+    }
+
+    if (const auto *OMPAttr = dyn_cast<OMPDeclareSimdDeclAttr>(TmplAttr)) {
+      instantiateOMPDeclareSimdDeclAttr(*this, TemplateArgs, *OMPAttr, New);
+      continue;
+    }
+
     // Existing DLL attribute on the instantiation takes precedence.
     if (TmplAttr->getKind() == attr::DLLExport ||
         TmplAttr->getKind() == attr::DLLImport) {
@@ -335,6 +426,16 @@
 }
 
 Decl *
+TemplateDeclInstantiator::VisitPragmaCommentDecl(PragmaCommentDecl *D) {
+  llvm_unreachable("pragma comment cannot be instantiated");
+}
+
+Decl *TemplateDeclInstantiator::VisitPragmaDetectMismatchDecl(
+    PragmaDetectMismatchDecl *D) {
+  llvm_unreachable("pragma comment cannot be instantiated");
+}
+
+Decl *
 TemplateDeclInstantiator::VisitExternCContextDecl(ExternCContextDecl *D) {
   llvm_unreachable("extern \"C\" context cannot be instantiated");
 }
@@ -498,19 +599,37 @@
   return Inst;
 }
 
+Decl *TemplateDeclInstantiator::VisitBindingDecl(BindingDecl *D) {
+  auto *NewBD = BindingDecl::Create(SemaRef.Context, Owner, D->getLocation(),
+                                    D->getIdentifier());
+  SemaRef.CurrentInstantiationScope->InstantiatedLocal(D, NewBD);
+  return NewBD;
+}
+
+Decl *TemplateDeclInstantiator::VisitDecompositionDecl(DecompositionDecl *D) {
+  // Transform the bindings first.
+  SmallVector<BindingDecl*, 16> NewBindings;
+  for (auto *OldBD : D->bindings())
+    NewBindings.push_back(cast<BindingDecl>(VisitBindingDecl(OldBD)));
+  ArrayRef<BindingDecl*> NewBindingArray = NewBindings;
+
+  auto *NewDD = cast_or_null<DecompositionDecl>(
+      VisitVarDecl(D, /*InstantiatingVarTemplate=*/false, &NewBindingArray));
+
+  if (!NewDD || NewDD->isInvalidDecl())
+    for (auto *NewBD : NewBindings)
+      NewBD->setInvalidDecl();
+
+  return NewDD;
+}
+
 Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D) {
   return VisitVarDecl(D, /*InstantiatingVarTemplate=*/false);
 }
 
 Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D,
-                                             bool InstantiatingVarTemplate) {
-
-  // If this is the variable for an anonymous struct or union,
-  // instantiate the anonymous struct/union type first.
-  if (const RecordType *RecordTy = D->getType()->getAs<RecordType>())
-    if (RecordTy->getDecl()->isAnonymousStructOrUnion())
-      if (!VisitCXXRecordDecl(cast<CXXRecordDecl>(RecordTy->getDecl())))
-        return nullptr;
+                                             bool InstantiatingVarTemplate,
+                                             ArrayRef<BindingDecl*> *Bindings) {
 
   // Do substitution on the type of the declaration
   TypeSourceInfo *DI = SemaRef.SubstType(D->getTypeSourceInfo(),
@@ -531,9 +650,15 @@
     SemaRef.adjustContextForLocalExternDecl(DC);
 
   // Build the instantiated declaration.
-  VarDecl *Var = VarDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
-                                 D->getLocation(), D->getIdentifier(),
-                                 DI->getType(), DI, D->getStorageClass());
+  VarDecl *Var;
+  if (Bindings)
+    Var = DecompositionDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
+                                    D->getLocation(), DI->getType(), DI,
+                                    D->getStorageClass(), *Bindings);
+  else
+    Var = VarDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
+                          D->getLocation(), D->getIdentifier(), DI->getType(),
+                          DI, D->getStorageClass());
 
   // In ARC, infer 'retaining' for variables of retainable type.
   if (SemaRef.getLangOpts().ObjCAutoRefCount && 
@@ -710,7 +835,7 @@
   QualType T = cast<FieldDecl>(NamedChain[i-1])->getType();
   IndirectFieldDecl *IndirectField = IndirectFieldDecl::Create(
       SemaRef.Context, Owner, D->getLocation(), D->getIdentifier(), T,
-      NamedChain, D->getChainingSize());
+      {NamedChain, D->getChainingSize()});
 
   for (const auto *Attr : D->attrs())
     IndirectField->addAttr(Attr->clone(SemaRef.Context));
@@ -1511,8 +1636,7 @@
     ArrayRef<TemplateArgument> Innermost = TemplateArgs.getInnermost();
     Function->setFunctionTemplateSpecialization(FunctionTemplate,
                             TemplateArgumentList::CreateCopy(SemaRef.Context,
-                                                             Innermost.begin(),
-                                                             Innermost.size()),
+                                                             Innermost),
                                                 /*InsertPos=*/nullptr);
   } else if (isFriend) {
     // Note, we need this connection even if the friend doesn't have a body.
@@ -1748,36 +1872,6 @@
                                         Constructor->isExplicit(),
                                         Constructor->isInlineSpecified(),
                                         false, Constructor->isConstexpr());
-
-    // Claim that the instantiation of a constructor or constructor template
-    // inherits the same constructor that the template does.
-    if (CXXConstructorDecl *Inh = const_cast<CXXConstructorDecl *>(
-            Constructor->getInheritedConstructor())) {
-      // If we're instantiating a specialization of a function template, our
-      // "inherited constructor" will actually itself be a function template.
-      // Instantiate a declaration of it, too.
-      if (FunctionTemplate) {
-        assert(!TemplateParams && Inh->getDescribedFunctionTemplate() &&
-               !Inh->getParent()->isDependentContext() &&
-               "inheriting constructor template in dependent context?");
-        Sema::InstantiatingTemplate Inst(SemaRef, Constructor->getLocation(),
-                                         Inh);
-        if (Inst.isInvalid())
-          return nullptr;
-        Sema::ContextRAII SavedContext(SemaRef, Inh->getDeclContext());
-        LocalInstantiationScope LocalScope(SemaRef);
-
-        // Use the same template arguments that we deduced for the inheriting
-        // constructor. There's no way they could be deduced differently.
-        MultiLevelTemplateArgumentList InheritedArgs;
-        InheritedArgs.addOuterTemplateArguments(TemplateArgs.getInnermost());
-        Inh = cast_or_null<CXXConstructorDecl>(
-            SemaRef.SubstDecl(Inh, Inh->getDeclContext(), InheritedArgs));
-        if (!Inh)
-          return nullptr;
-      }
-      cast<CXXConstructorDecl>(Method)->setInheritedConstructor(Inh);
-    }
   } else if (CXXDestructorDecl *Destructor = dyn_cast<CXXDestructorDecl>(D)) {
     Method = CXXDestructorDecl::Create(SemaRef.Context, Record,
                                        StartLoc, NameInfo, T, TInfo,
@@ -1833,8 +1927,7 @@
     ArrayRef<TemplateArgument> Innermost = TemplateArgs.getInnermost();
     Method->setFunctionTemplateSpecialization(FunctionTemplate,
                          TemplateArgumentList::CreateCopy(SemaRef.Context,
-                                                          Innermost.begin(),
-                                                          Innermost.size()),
+                                                          Innermost),
                                               /*InsertPos=*/nullptr);
   } else if (!isFriend) {
     // Record that this is an instantiation of a member function.
@@ -2092,16 +2185,11 @@
 
   NonTypeTemplateParmDecl *Param;
   if (IsExpandedParameterPack)
-    Param = NonTypeTemplateParmDecl::Create(SemaRef.Context, Owner,
-                                            D->getInnerLocStart(),
-                                            D->getLocation(),
-                                    D->getDepth() - TemplateArgs.getNumLevels(),
-                                            D->getPosition(),
-                                            D->getIdentifier(), T,
-                                            DI,
-                                            ExpandedParameterPackTypes.data(),
-                                            ExpandedParameterPackTypes.size(),
-                                    ExpandedParameterPackTypesAsWritten.data());
+    Param = NonTypeTemplateParmDecl::Create(
+        SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
+        D->getDepth() - TemplateArgs.getNumLevels(), D->getPosition(),
+        D->getIdentifier(), T, DI, ExpandedParameterPackTypes,
+        ExpandedParameterPackTypesAsWritten);
   else
     Param = NonTypeTemplateParmDecl::Create(SemaRef.Context, Owner,
                                             D->getInnerLocStart(),
@@ -2303,9 +2391,14 @@
   if (!QualifierLoc)
     return nullptr;
 
-  // The name info is non-dependent, so no transformation
-  // is required.
+  // For an inheriting constructor declaration, the name of the using
+  // declaration is the name of a constructor in this class, not in the
+  // base class.
   DeclarationNameInfo NameInfo = D->getNameInfo();
+  if (NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName)
+    if (auto *RD = dyn_cast<CXXRecordDecl>(SemaRef.CurContext))
+      NameInfo.setName(SemaRef.Context.DeclarationNames.getCXXConstructorName(
+          SemaRef.Context.getCanonicalType(SemaRef.Context.getRecordType(RD))));
 
   // We only need to do redeclaration lookups if we're in a class
   // scope (in fact, it's not really even possible in non-class
@@ -2348,18 +2441,23 @@
   if (NewUD->isInvalidDecl())
     return NewUD;
 
-  if (NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName) {
+  if (NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName)
     SemaRef.CheckInheritingConstructorUsingDecl(NewUD);
-    return NewUD;
-  }
 
   bool isFunctionScope = Owner->isFunctionOrMethod();
 
   // Process the shadow decls.
   for (auto *Shadow : D->shadows()) {
+    // FIXME: UsingShadowDecl doesn't preserve its immediate target, so
+    // reconstruct it in the case where it matters.
+    NamedDecl *OldTarget = Shadow->getTargetDecl();
+    if (auto *CUSD = dyn_cast<ConstructorUsingShadowDecl>(Shadow))
+      if (auto *BaseShadow = CUSD->getNominatedBaseClassShadowDecl())
+        OldTarget = BaseShadow;
+
     NamedDecl *InstTarget =
         cast_or_null<NamedDecl>(SemaRef.FindInstantiatedDecl(
-            Shadow->getLocation(), Shadow->getTargetDecl(), TemplateArgs));
+            Shadow->getLocation(), OldTarget, TemplateArgs));
     if (!InstTarget)
       return nullptr;
 
@@ -2390,6 +2488,12 @@
   return nullptr;
 }
 
+Decl *TemplateDeclInstantiator::VisitConstructorUsingShadowDecl(
+    ConstructorUsingShadowDecl *D) {
+  // Ignore these; they are handled in bulk when the UsingDecl is processed.
+  return nullptr;
+}
+
 Decl * TemplateDeclInstantiator
     ::VisitUnresolvedUsingTypenameDecl(UnresolvedUsingTypenameDecl *D) {
   NestedNameSpecifierLoc QualifierLoc
@@ -2491,6 +2595,86 @@
   return TD;
 }
 
+Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl(
+    OMPDeclareReductionDecl *D) {
+  // Substitute the reduction type and validate it; bail out if that fails.
+  QualType SubstReductionType = SemaRef.ActOnOpenMPDeclareReductionType(
+      D->getLocation(),
+      ParsedType::make(SemaRef.SubstType(D->getType(), TemplateArgs,
+                                         D->getLocation(), DeclarationName())));
+  if (SubstReductionType.isNull())
+    return nullptr;
+  bool IsCorrect = !SubstReductionType.isNull(); // Null case returned above.
+  // Create the instantiated copy, chained to the instantiated previous decl.
+  std::pair<QualType, SourceLocation> ReductionTypes[] = {
+      std::make_pair(SubstReductionType, D->getLocation())};
+  auto *PrevDeclInScope = D->getPrevDeclInScope();
+  if (PrevDeclInScope && !PrevDeclInScope->isInvalidDecl()) {
+    PrevDeclInScope = cast<OMPDeclareReductionDecl>(
+        SemaRef.CurrentInstantiationScope->findInstantiationOf(PrevDeclInScope)
+            ->get<Decl *>());
+  }
+  auto DRD = SemaRef.ActOnOpenMPDeclareReductionDirectiveStart(
+      /*S=*/nullptr, Owner, D->getDeclName(), ReductionTypes, D->getAccess(),
+      PrevDeclInScope);
+  auto *NewDRD = cast<OMPDeclareReductionDecl>(DRD.get().getSingleDecl());
+  if (isDeclWithinFunction(NewDRD))
+    SemaRef.CurrentInstantiationScope->InstantiatedLocal(D, NewDRD);
+  Expr *SubstCombiner = nullptr;
+  Expr *SubstInitializer = nullptr;
+  // Combiner: map omp_in/omp_out to their new decls, then substitute.
+  if (D->getCombiner()) {
+    SemaRef.ActOnOpenMPDeclareReductionCombinerStart(
+        /*S=*/nullptr, NewDRD);
+    const char *Names[] = {"omp_in", "omp_out"};
+    for (auto &Name : Names) {
+      DeclarationName DN(&SemaRef.Context.Idents.get(Name));
+      auto OldLookup = D->lookup(DN);
+      auto Lookup = NewDRD->lookup(DN);
+      if (!OldLookup.empty() && !Lookup.empty()) {
+        assert(Lookup.size() == 1 && OldLookup.size() == 1);
+        SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldLookup.front(),
+                                                             Lookup.front());
+      }
+    }
+    SubstCombiner = SemaRef.SubstExpr(D->getCombiner(), TemplateArgs).get();
+    SemaRef.ActOnOpenMPDeclareReductionCombinerEnd(NewDRD, SubstCombiner);
+    // Initializer: map omp_orig/omp_priv to their new decls, then substitute.
+    if (D->getInitializer()) {
+      SemaRef.ActOnOpenMPDeclareReductionInitializerStart(
+          /*S=*/nullptr, NewDRD);
+      const char *Names[] = {"omp_orig", "omp_priv"};
+      for (auto &Name : Names) {
+        DeclarationName DN(&SemaRef.Context.Idents.get(Name));
+        auto OldLookup = D->lookup(DN);
+        auto Lookup = NewDRD->lookup(DN);
+        if (!OldLookup.empty() && !Lookup.empty()) {
+          assert(Lookup.size() == 1 && OldLookup.size() == 1);
+          SemaRef.CurrentInstantiationScope->InstantiatedLocal(
+              OldLookup.front(), Lookup.front());
+        }
+      }
+      SubstInitializer =
+          SemaRef.SubstExpr(D->getInitializer(), TemplateArgs).get();
+      SemaRef.ActOnOpenMPDeclareReductionInitializerEnd(NewDRD,
+                                                        SubstInitializer);
+    }
+    IsCorrect = IsCorrect && SubstCombiner &&
+                (!D->getInitializer() || SubstInitializer);
+  } else
+    IsCorrect = false;
+
+  (void)SemaRef.ActOnOpenMPDeclareReductionDirectiveEnd(/*S=*/nullptr, DRD,
+                                                        IsCorrect);
+
+  return NewDRD;
+}
+
+Decl *TemplateDeclInstantiator::VisitOMPCapturedExprDecl(
+    OMPCapturedExprDecl * /*D*/) {
+  llvm_unreachable("Should not be met in templates");
+}
+
 Decl *TemplateDeclInstantiator::VisitFunctionDecl(FunctionDecl *D) {
   return VisitFunctionDecl(D, nullptr);
 }
@@ -2594,8 +2778,7 @@
                                               D->getLocStart(),
                                               D->getLocation(),
                                               InstClassTemplate,
-                                              Converted.data(),
-                                              Converted.size(),
+                                              Converted,
                                               PrevDecl);
 
   // Add this partial specialization to the set of class template partial
@@ -2610,7 +2793,7 @@
   // Build the canonical type that describes the converted template
   // arguments of the class template explicit specialization.
   QualType CanonType = SemaRef.Context.getTemplateSpecializationType(
-      TemplateName(InstClassTemplate), Converted.data(), Converted.size(),
+      TemplateName(InstClassTemplate), Converted,
       SemaRef.Context.getRecordType(InstD));
 
   // Build the fully-sugared type for this class template
@@ -2687,13 +2870,6 @@
     const TemplateArgumentListInfo &TemplateArgsInfo,
     ArrayRef<TemplateArgument> Converted) {
 
-  // If this is the variable for an anonymous struct or union,
-  // instantiate the anonymous struct/union type first.
-  if (const RecordType *RecordTy = D->getType()->getAs<RecordType>())
-    if (RecordTy->getDecl()->isAnonymousStructOrUnion())
-      if (!VisitCXXRecordDecl(cast<CXXRecordDecl>(RecordTy->getDecl())))
-        return nullptr;
-
   // Do substitution on the type of the declaration
   TypeSourceInfo *DI =
       SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
@@ -2710,8 +2886,7 @@
   // Build the instantiated declaration
   VarTemplateSpecializationDecl *Var = VarTemplateSpecializationDecl::Create(
       SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
-      VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted.data(),
-      Converted.size());
+      VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted);
   Var->setTemplateArgsInfo(TemplateArgsInfo);
   if (InsertPos)
     VarTemplate->AddSpecialization(Var, InsertPos);
@@ -2779,10 +2954,14 @@
   if (Invalid)
     return nullptr;
 
+  // Note: we substitute into associated constraints later
+  Expr *const UninstantiatedRequiresClause = L->getRequiresClause();
+
   TemplateParameterList *InstL
     = TemplateParameterList::Create(SemaRef.Context, L->getTemplateLoc(),
                                     L->getLAngleLoc(), Params,
-                                    L->getRAngleLoc());
+                                    L->getRAngleLoc(),
+                                    UninstantiatedRequiresClause);
   return InstL;
 }
 
@@ -2844,8 +3023,7 @@
   // arguments of the class template partial specialization.
   QualType CanonType
     = SemaRef.Context.getTemplateSpecializationType(TemplateName(ClassTemplate),
-                                                    Converted.data(),
-                                                    Converted.size());
+                                                    Converted);
 
   // Build the fully-sugared type for this class template
   // specialization as the user wrote in the specialization
@@ -2894,8 +3072,7 @@
                                                      PartialSpec->getLocation(),
                                                      InstParams,
                                                      ClassTemplate,
-                                                     Converted.data(),
-                                                     Converted.size(),
+                                                     Converted,
                                                      InstTemplateArgs,
                                                      CanonType,
                                                      nullptr);
@@ -2967,7 +3144,7 @@
   // Build the canonical type that describes the converted template
   // arguments of the variable template partial specialization.
   QualType CanonType = SemaRef.Context.getTemplateSpecializationType(
-      TemplateName(VarTemplate), Converted.data(), Converted.size());
+      TemplateName(VarTemplate), Converted);
 
   // Build the fully-sugared type for this variable template
   // specialization as the user wrote in the specialization
@@ -3023,8 +3200,7 @@
       VarTemplatePartialSpecializationDecl::Create(
           SemaRef.Context, Owner, PartialSpec->getInnerLocStart(),
           PartialSpec->getLocation(), InstParams, VarTemplate, DI->getType(),
-          DI, PartialSpec->getStorageClass(), Converted.data(),
-          Converted.size(), InstTemplateArgs);
+          DI, PartialSpec->getStorageClass(), Converted, InstTemplateArgs);
 
   // Substitute the nested name specifier, if any.
   if (SubstQualifier(PartialSpec, InstPartialSpec))
@@ -3133,9 +3309,9 @@
     // synthesized in the method declaration.
     SmallVector<QualType, 4> ParamTypes;
     Sema::ExtParameterInfoBuilder ExtParamInfos;
-    if (SemaRef.SubstParmTypes(D->getLocation(), D->param_begin(),
-                               D->getNumParams(), nullptr, TemplateArgs,
-                               ParamTypes, &Params, ExtParamInfos))
+    if (SemaRef.SubstParmTypes(D->getLocation(), D->parameters(), nullptr,
+                               TemplateArgs, ParamTypes, &Params,
+                               ExtParamInfos))
       return nullptr;
   }
 
@@ -3362,7 +3538,8 @@
 void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
                                          FunctionDecl *Function,
                                          bool Recursive,
-                                         bool DefinitionRequired) {
+                                         bool DefinitionRequired,
+                                         bool AtEndOfTU) {
   if (Function->isInvalidDecl() || Function->isDefined())
     return;
 
@@ -3416,6 +3593,10 @@
     Pattern = PatternDecl->getBody(PatternDecl);
   }
 
+  // FIXME: Check that the definition is visible before trying to instantiate
+  // it. This requires us to track the instantiation stack in order to know
+  // which definitions should be visible.
+
   if (!Pattern && !PatternDecl->isDefaulted()) {
     if (DefinitionRequired) {
       if (Function->getPrimaryTemplate())
@@ -3436,6 +3617,16 @@
       assert(!Recursive);
       PendingInstantiations.push_back(
         std::make_pair(Function, PointOfInstantiation));
+    } else if (Function->getTemplateSpecializationKind()
+                 == TSK_ImplicitInstantiation) {
+      if (AtEndOfTU && !getDiagnostics().hasErrorOccurred()) {
+        Diag(PointOfInstantiation, diag::warn_func_template_missing)
+          << Function;
+        Diag(PatternDecl->getLocation(), diag::note_forward_template_decl);
+        if (getLangOpts().CPlusPlus11)
+          Diag(PointOfInstantiation, diag::note_inst_declaration_hint)
+            << Function;
+      }
     }
 
     return;
@@ -3466,6 +3657,8 @@
   InstantiatingTemplate Inst(*this, PointOfInstantiation, Function);
   if (Inst.isInvalid())
     return;
+  PrettyDeclStackTraceEntry CrashInfo(*this, Function, SourceLocation(),
+                                      "instantiating function definition");
 
   // Copy the inner loc start from the pattern.
   Function->setInnerLocStart(PatternDecl->getInnerLocStart());
@@ -3696,11 +3889,12 @@
   Context.setManglingNumber(NewVar, Context.getManglingNumber(OldVar));
   Context.setStaticLocalNumber(NewVar, Context.getStaticLocalNumber(OldVar));
 
-  // Delay instantiation of the initializer for variable templates until a
-  // definition of the variable is needed. We need it right away if the type
-  // contains 'auto'.
+  // Delay instantiation of the initializer for variable templates or inline
+  // static data members until a definition of the variable is needed. We need
+  // it right away if the type contains 'auto'.
   if ((!isa<VarTemplateSpecializationDecl>(NewVar) &&
-       !InstantiatingVarTemplate) ||
+       !InstantiatingVarTemplate &&
+       !(OldVar->isInline() && OldVar->isThisDeclarationADefinition())) ||
       NewVar->getType()->isUndeducedType())
     InstantiateVariableInitializer(NewVar, OldVar, TemplateArgs);
 
@@ -3716,6 +3910,13 @@
 void Sema::InstantiateVariableInitializer(
     VarDecl *Var, VarDecl *OldVar,
     const MultiLevelTemplateArgumentList &TemplateArgs) {
+  // We propagate the 'inline' flag with the initializer, because it
+  // would otherwise imply that the variable is a definition for a
+  // non-static data member.
+  if (OldVar->isInlineSpecified())
+    Var->setInlineSpecified();
+  else if (OldVar->isInline())
+    Var->setImplicitlyInline();
 
   if (Var->getAnyInitializer())
     // We already have an initializer in the class.
@@ -3788,7 +3989,7 @@
 
 void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
                                          VarDecl *Var, bool Recursive,
-                                         bool DefinitionRequired) {
+                                      bool DefinitionRequired, bool AtEndOfTU) {
   if (Var->isInvalidDecl())
     return;
 
@@ -3850,6 +4051,8 @@
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
       if (Inst.isInvalid())
         return;
+      PrettyDeclStackTraceEntry CrashInfo(*this, Var, SourceLocation(),
+                                          "instantiating variable initializer");
 
       // If we're performing recursive template instantiation, create our own
       // queue of pending implicit instantiations that we will instantiate
@@ -3896,9 +4099,13 @@
 
     assert(PatternDecl && "data member was not instantiated from a template?");
     assert(PatternDecl->isStaticDataMember() && "not a static data member?");
-    Def = PatternDecl->getOutOfLineDefinition();
+    Def = PatternDecl->getDefinition();
   }
 
+  // FIXME: Check that the definition is visible before trying to instantiate
+  // it. This requires us to track the instantiation stack in order to know
+  // which definitions should be visible.
+
   // If we don't have a definition of the variable template, we won't perform
   // any instantiation. Rather, we rely on the user to instantiate this
   // definition (or provide a specialization for it) in another translation
@@ -3920,6 +4127,16 @@
                  == TSK_ExplicitInstantiationDefinition) {
       PendingInstantiations.push_back(
         std::make_pair(Var, PointOfInstantiation));
+    } else if (Var->getTemplateSpecializationKind()
+                 == TSK_ImplicitInstantiation) {
+      // Warn about missing definition at the end of translation unit.
+      if (AtEndOfTU && !getDiagnostics().hasErrorOccurred()) {
+        Diag(PointOfInstantiation, diag::warn_var_template_missing)
+          << Var;
+        Diag(PatternDecl->getLocation(), diag::note_forward_template_decl);
+        if (getLangOpts().CPlusPlus11)
+          Diag(PointOfInstantiation, diag::note_inst_declaration_hint) << Var;
+      }
     }
 
     return;
@@ -3963,6 +4180,8 @@
   InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
   if (Inst.isInvalid())
     return;
+  PrettyDeclStackTraceEntry CrashInfo(*this, Var, SourceLocation(),
+                                      "instantiating variable definition");
 
   // If we're performing recursive template instantiation, create our own
   // queue of pending implicit instantiations that we will instantiate later,
@@ -3978,11 +4197,16 @@
   LocalInstantiationScope Local(*this);
 
   VarDecl *OldVar = Var;
-  if (!VarSpec)
+  if (Def->isStaticDataMember() && !Def->isOutOfLine()) {
+    // We're instantiating an inline static data member whose definition was
+    // provided inside the class.
+    // FIXME: Update record?
+    InstantiateVariableInitializer(Var, Def, TemplateArgs);
+  } else if (!VarSpec) {
     Var = cast_or_null<VarDecl>(SubstDecl(Def, Var->getDeclContext(),
                                           TemplateArgs));
-  else if (Var->isStaticDataMember() &&
-           Var->getLexicalDeclContext()->isRecord()) {
+  } else if (Var->isStaticDataMember() &&
+             Var->getLexicalDeclContext()->isRecord()) {
     // We need to instantiate the definition of a static data member template,
     // and all we have is the in-class declaration of it. Instantiate a separate
     // declaration of the definition.
@@ -4684,12 +4908,10 @@
 
     // Instantiate function definitions
     if (FunctionDecl *Function = dyn_cast<FunctionDecl>(Inst.first)) {
-      PrettyDeclStackTraceEntry CrashInfo(*this, Function, SourceLocation(),
-                                          "instantiating function definition");
       bool DefinitionRequired = Function->getTemplateSpecializationKind() ==
                                 TSK_ExplicitInstantiationDefinition;
       InstantiateFunctionDefinition(/*FIXME:*/Inst.second, Function, true,
-                                    DefinitionRequired);
+                                    DefinitionRequired, true);
       continue;
     }
 
@@ -4730,7 +4952,7 @@
     // Instantiate static data member definitions or variable template
     // specializations.
     InstantiateVariableDefinition(/*FIXME:*/ Inst.second, Var, true,
-                                  DefinitionRequired);
+                                  DefinitionRequired, true);
   }
 }
 
diff --git a/lib/Sema/SemaTemplateVariadic.cpp b/lib/Sema/SemaTemplateVariadic.cpp
index cb67d71..06afe87 100644
--- a/lib/Sema/SemaTemplateVariadic.cpp
+++ b/lib/Sema/SemaTemplateVariadic.cpp
@@ -604,7 +604,7 @@
     //   Template argument deduction can extend the sequence of template 
     //   arguments corresponding to a template parameter pack, even when the
     //   sequence contains explicitly specified template arguments.
-    if (!IsFunctionParameterPack) {
+    if (!IsFunctionParameterPack && CurrentInstantiationScope) {
       if (NamedDecl *PartialPack 
                     = CurrentInstantiationScope->getPartiallySubstitutedPack()){
         unsigned PartialDepth, PartialIndex;
@@ -727,6 +727,7 @@
   case TST_half:
   case TST_float:
   case TST_double:
+  case TST_float128:
   case TST_bool:
   case TST_decimal32:
   case TST_decimal64:
@@ -739,6 +740,8 @@
   case TST_auto:
   case TST_auto_type:
   case TST_decltype_auto:
+#define GENERIC_IMAGE_TYPE(ImgType, Id) case TST_##ImgType##_t:
+#include "clang/Basic/OpenCLImageTypes.def"
   case TST_unknown_anytype:
   case TST_error:
     break;
@@ -996,10 +999,6 @@
                                        BinaryOperatorKind Operator) {
   // [temp.variadic]p9:
   //   If N is zero for a unary fold-expression, the value of the expression is
-  //       *   ->  1
-  //       +   ->  int()
-  //       &   ->  -1
-  //       |   ->  int()
   //       &&  ->  true
   //       ||  ->  false
   //       ,   ->  void()
@@ -1009,17 +1008,6 @@
   // prevent the result from being a null pointer constant.
   QualType ScalarType;
   switch (Operator) {
-  case BO_Add:
-    ScalarType = Context.IntTy;
-    break;
-  case BO_Mul:
-    return ActOnIntegerConstant(EllipsisLoc, 1);
-  case BO_Or:
-    ScalarType = Context.IntTy;
-    break;
-  case BO_And:
-    return CreateBuiltinUnaryOp(EllipsisLoc, UO_Minus,
-                                ActOnIntegerConstant(EllipsisLoc, 1).get());
   case BO_LOr:
     return ActOnCXXBoolLiteral(EllipsisLoc, tok::kw_false);
   case BO_LAnd:
diff --git a/lib/Sema/SemaType.cpp b/lib/Sema/SemaType.cpp
index 6ff4adf..7b78ff6 100644
--- a/lib/Sema/SemaType.cpp
+++ b/lib/Sema/SemaType.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Sema/SemaInternal.h"
 #include "TypeLocBuilder.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
@@ -22,7 +21,6 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/AST/TypeLocVisitor.h"
-#include "clang/Lex/Preprocessor.h"
 #include "clang/Basic/PartialDiagnostic.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
@@ -30,9 +28,11 @@
 #include "clang/Sema/DelayedDiagnostic.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
@@ -246,7 +246,7 @@
       savedAttrs.back()->setNext(nullptr);
     }
   };
-}
+} // end anonymous namespace
 
 static void spliceAttrIntoList(AttributeList &attr, AttributeList *&head) {
   attr.setNext(head);
@@ -1220,6 +1220,21 @@
   return CreateParsedType(Result, ResultTInfo);
 }
 
+/// Return the name of the first OpenCL access attribute (AT_OpenCLAccess) in
+/// the list headed by \p attrs, or an empty string when the list is null or
+/// contains no such attribute.
+static StringRef getImageAccessAttrStr(AttributeList *attrs) {
+  // Walk the list through getNext(). The cursor must advance on every
+  // iteration: re-examining the head node each time would never terminate
+  // when the head is not an access attribute but has a successor.
+  for (AttributeList *Attr = attrs; Attr; Attr = Attr->getNext()) {
+    if (Attr->getKind() == AttributeList::AT_OpenCLAccess)
+      return Attr->getName()->getName();
+  }
+
+  return "";
+}
+
 /// \brief Convert the specified declspec to the appropriate type
 /// object.
 /// \param state Specifies the declarator containing the declaration specifier
@@ -1382,7 +1397,8 @@
   }
   case DeclSpec::TST_int128:
     if (!S.Context.getTargetInfo().hasInt128Type())
-      S.Diag(DS.getTypeSpecTypeLoc(), diag::err_int128_unsupported);
+      S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported)
+        << "__int128";
     if (DS.getTypeSpecSign() == DeclSpec::TSS_unsigned)
       Result = Context.UnsignedInt128Ty;
     else
@@ -1404,7 +1420,14 @@
       declarator.setInvalidType(true);
     }
     break;
+  case DeclSpec::TST_float128:
+    if (!S.Context.getTargetInfo().hasFloat128Type())
+      S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported)
+        << "__float128";
+    Result = Context.Float128Ty;
+    break;
   case DeclSpec::TST_bool: Result = Context.BoolTy; break; // _Bool or bool
+    break;
   case DeclSpec::TST_decimal32:    // _Decimal32
   case DeclSpec::TST_decimal64:    // _Decimal64
   case DeclSpec::TST_decimal128:   // _Decimal128
@@ -1473,9 +1496,18 @@
           declarator.setInvalidType(true);
         }
       } else if (!S.getOpenCLOptions().cl_khr_gl_msaa_sharing &&
-                 (Result->isImage2dMSAAT() || Result->isImage2dArrayMSAAT() ||
-                  Result->isImage2dArrayMSAATDepth() ||
-                  Result->isImage2dMSAATDepth())) {
+                 (Result->isOCLImage2dArrayMSAADepthROType() ||
+                  Result->isOCLImage2dArrayMSAADepthWOType() ||
+                  Result->isOCLImage2dArrayMSAADepthRWType() ||
+                  Result->isOCLImage2dArrayMSAAROType() ||
+                  Result->isOCLImage2dArrayMSAARWType() ||
+                  Result->isOCLImage2dArrayMSAAWOType() ||
+                  Result->isOCLImage2dMSAADepthROType() ||
+                  Result->isOCLImage2dMSAADepthRWType() ||
+                  Result->isOCLImage2dMSAADepthWOType() ||
+                  Result->isOCLImage2dMSAAROType() ||
+                  Result->isOCLImage2dMSAARWType() ||
+                  Result->isOCLImage2dMSAAWOType())) {
         S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_requires_extension)
             << Result << "cl_khr_gl_msaa_sharing";
         declarator.setInvalidType(true);
@@ -1589,6 +1621,16 @@
     }
     break;
 
+#define GENERIC_IMAGE_TYPE(ImgType, Id) \
+  case DeclSpec::TST_##ImgType##_t: \
+    Result = llvm::StringSwitch<QualType>( \
+                 getImageAccessAttrStr(DS.getAttributes().getList())) \
+                 .Cases("write_only", "__write_only", Context.Id##WOTy) \
+                 .Cases("read_write", "__read_write", Context.Id##RWTy) \
+                 .Default(Context.Id##ROTy); \
+    break;
+#include "clang/Basic/OpenCLImageTypes.def"
+
   case DeclSpec::TST_error:
     Result = Context.IntTy;
     declarator.setInvalidType(true);
@@ -1706,6 +1748,12 @@
   if (T.isNull())
     return QualType();
 
+  // Ignore any attempt to form a cv-qualified reference.
+  if (T->isReferenceType()) {
+    Qs.removeConst();
+    Qs.removeVolatile();
+  }
+
   // Enforce C99 6.7.3p2: "Types other than pointer types derived from
   // object or incomplete types shall not be restrict-qualified."
   if (Qs.hasRestrict()) {
@@ -1743,12 +1791,18 @@
 }
 
 QualType Sema::BuildQualifiedType(QualType T, SourceLocation Loc,
-                                  unsigned CVRA, const DeclSpec *DS) {
+                                  unsigned CVRAU, const DeclSpec *DS) {
   if (T.isNull())
     return QualType();
 
-  // Convert from DeclSpec::TQ to Qualifiers::TQ by just dropping TQ_atomic.
-  unsigned CVR = CVRA & ~DeclSpec::TQ_atomic;
+  // Ignore any attempt to form a cv-qualified reference.
+  if (T->isReferenceType())
+    CVRAU &=
+        ~(DeclSpec::TQ_const | DeclSpec::TQ_volatile | DeclSpec::TQ_atomic);
+
+  // Convert from DeclSpec::TQ to Qualifiers::TQ by just dropping TQ_atomic and
+  // TQ_unaligned;
+  unsigned CVR = CVRAU & ~(DeclSpec::TQ_atomic | DeclSpec::TQ_unaligned);
 
   // C11 6.7.3/5:
   //   If the same qualifier appears more than once in the same
@@ -1758,7 +1812,7 @@
   // It's not specified what happens when the _Atomic qualifier is applied to
   // a type specified with the _Atomic specifier, but we assume that this
   // should be treated as if the _Atomic qualifier appeared multiple times.
-  if (CVRA & DeclSpec::TQ_atomic && !T->isAtomicType()) {
+  if (CVRAU & DeclSpec::TQ_atomic && !T->isAtomicType()) {
     // C11 6.7.3/5:
     //   If other qualifiers appear along with the _Atomic qualifier in a
     //   specifier-qualifier-list, the resulting type is the so-qualified
@@ -1775,7 +1829,9 @@
     return BuildQualifiedType(T, Loc, Split.Quals);
   }
 
-  return BuildQualifiedType(T, Loc, Qualifiers::fromCVRMask(CVR), DS);
+  Qualifiers Q = Qualifiers::fromCVRMask(CVR);
+  Q.setUnaligned(CVRAU & DeclSpec::TQ_unaligned);
+  return BuildQualifiedType(T, Loc, Q, DS);
 }
 
 /// \brief Build a paren type including \p T.
@@ -1871,7 +1927,7 @@
 ///
 /// The values of this enum are used in diagnostics.
 enum QualifiedFunctionKind { QFK_BlockPointer, QFK_Pointer, QFK_Reference };
-}
+} // end anonymous namespace
 
 /// Check whether the type T is a qualified function type, and if it is,
 /// diagnose that it cannot be contained within the given kind of declarator.
@@ -2018,10 +2074,10 @@
   } Diagnoser;
 
   return S.VerifyIntegerConstantExpression(ArraySize, &SizeVal, Diagnoser,
-                                           S.LangOpts.GNUMode).isInvalid();
+                                           S.LangOpts.GNUMode ||
+                                           S.LangOpts.OpenCL).isInvalid();
 }
 
-
 /// \brief Build an array type.
 ///
 /// \param T The type of each element in the array.
@@ -2200,15 +2256,8 @@
   // If this is not C99, extwarn about VLA's and C99 array size modifiers.
   if (!getLangOpts().C99) {
     if (T->isVariableArrayType()) {
-      // Prohibit the use of non-POD types in VLAs.
-      QualType BaseT = Context.getBaseElementType(T);
-      if (!T->isDependentType() && isCompleteType(Loc, BaseT) &&
-          !BaseT.isPODType(Context) && !BaseT->isObjCLifetimeType()) {
-        Diag(Loc, diag::err_vla_non_pod) << BaseT;
-        return QualType();
-      }
       // Prohibit the use of VLAs during template argument deduction.
-      else if (isSFINAEContext()) {
+      if (isSFINAEContext()) {
         Diag(Loc, diag::err_vla_in_sfinae);
         return QualType();
       }
@@ -2226,6 +2275,18 @@
     Diag(Loc, diag::warn_vla_used);
   }
 
+  // OpenCL v2.0 s6.12.5 - Arrays of blocks are not supported.
+  // OpenCL v2.0 s6.13.16.1 - Arrays of pipe type are not supported.
+  // OpenCL v2.0 s6.9.b - Arrays of image/sampler type are not supported.
+  if (getLangOpts().OpenCL) {
+    const QualType ArrType = Context.getBaseElementType(T);
+    if (ArrType->isBlockPointerType() || ArrType->isPipeType() ||
+        ArrType->isSamplerT() || ArrType->isImageType()) {
+      Diag(Loc, diag::err_opencl_invalid_type_array) << ArrType;
+      return QualType();
+    }
+  }
+
   return T;
 }
 
@@ -2606,7 +2667,8 @@
                                      SourceLocation ConstQualLoc,
                                      SourceLocation VolatileQualLoc,
                                      SourceLocation RestrictQualLoc,
-                                     SourceLocation AtomicQualLoc) {
+                                     SourceLocation AtomicQualLoc,
+                                     SourceLocation UnalignedQualLoc) {
   if (!Quals)
     return;
 
@@ -2614,26 +2676,27 @@
     const char *Name;
     unsigned Mask;
     SourceLocation Loc;
-  } const QualKinds[4] = {
+  } const QualKinds[5] = {
     { "const", DeclSpec::TQ_const, ConstQualLoc },
     { "volatile", DeclSpec::TQ_volatile, VolatileQualLoc },
     { "restrict", DeclSpec::TQ_restrict, RestrictQualLoc },
+    { "__unaligned", DeclSpec::TQ_unaligned, UnalignedQualLoc },
     { "_Atomic", DeclSpec::TQ_atomic, AtomicQualLoc }
   };
 
   SmallString<32> QualStr;
   unsigned NumQuals = 0;
   SourceLocation Loc;
-  FixItHint FixIts[4];
+  FixItHint FixIts[5];
 
   // Build a string naming the redundant qualifiers.
-  for (unsigned I = 0; I != 4; ++I) {
-    if (Quals & QualKinds[I].Mask) {
+  for (auto &E : QualKinds) {
+    if (Quals & E.Mask) {
       if (!QualStr.empty()) QualStr += ' ';
-      QualStr += QualKinds[I].Name;
+      QualStr += E.Name;
 
       // If we have a location for the qualifier, offer a fixit.
-      SourceLocation QualLoc = QualKinds[I].Loc;
+      SourceLocation QualLoc = E.Loc;
       if (QualLoc.isValid()) {
         FixIts[NumQuals] = FixItHint::CreateRemoval(QualLoc);
         if (Loc.isInvalid() ||
@@ -2679,7 +2742,8 @@
           SourceLocation::getFromRawEncoding(PTI.ConstQualLoc),
           SourceLocation::getFromRawEncoding(PTI.VolatileQualLoc),
           SourceLocation::getFromRawEncoding(PTI.RestrictQualLoc),
-          SourceLocation::getFromRawEncoding(PTI.AtomicQualLoc));
+          SourceLocation::getFromRawEncoding(PTI.AtomicQualLoc),
+          SourceLocation::getFromRawEncoding(PTI.UnalignedQualLoc));
       return;
     }
 
@@ -2715,7 +2779,8 @@
                               D.getDeclSpec().getConstSpecLoc(),
                               D.getDeclSpec().getVolatileSpecLoc(),
                               D.getDeclSpec().getRestrictSpecLoc(),
-                              D.getDeclSpec().getAtomicSpecLoc());
+                              D.getDeclSpec().getAtomicSpecLoc(),
+                              D.getDeclSpec().getUnalignedSpecLoc());
 }
 
 static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
@@ -2829,6 +2894,7 @@
     case Declarator::FileContext:
     case Declarator::BlockContext:
     case Declarator::ForContext:
+    case Declarator::InitStmtContext:
     case Declarator::ConditionContext:
       break;
     case Declarator::CXXNewContext:
@@ -2914,6 +2980,7 @@
     case Declarator::MemberContext:
     case Declarator::BlockContext:
     case Declarator::ForContext:
+    case Declarator::InitStmtContext:
     case Declarator::BlockLiteralContext:
     case Declarator::LambdaExprContext:
       // C++11 [dcl.type]p3:
@@ -3128,15 +3195,19 @@
   CallingConv CC = S.Context.getDefaultCallingConvention(FTI.isVariadic,
                                                          IsCXXInstanceMethod);
 
-  // Attribute AT_OpenCLKernel affects the calling convention only on
-  // the SPIR target, hence it cannot be treated as a calling
+  // Attribute AT_OpenCLKernel affects the calling convention for SPIR
+  // and AMDGPU targets, hence it cannot be treated as a calling
   // convention attribute. This is the simplest place to infer
-  // "spir_kernel" for OpenCL kernels on SPIR.
-  if (CC == CC_SpirFunction) {
+  // calling convention for OpenCL kernels.
+  if (S.getLangOpts().OpenCL) {
     for (const AttributeList *Attr = D.getDeclSpec().getAttributes().getList();
          Attr; Attr = Attr->getNext()) {
       if (Attr->getKind() == AttributeList::AT_OpenCLKernel) {
-        CC = CC_SpirKernel;
+        llvm::Triple::ArchType arch = S.Context.getTargetInfo().getTriple().getArch();
+        if (arch == llvm::Triple::spir || arch == llvm::Triple::spir64 ||
+            arch == llvm::Triple::amdgcn) {
+          CC = CC_OpenCLKernel;
+        }
         break;
       }
     }
@@ -3153,7 +3224,7 @@
     BlockPointer,
     MemberPointer,
   };
-}
+} // end anonymous namespace
 
 IdentifierInfo *Sema::getNullabilityKeyword(NullabilityKind nullability) {
   switch (nullability) {
@@ -3213,7 +3284,7 @@
     // NSError**
     NSErrorPointerPointer,
   };
-}
+} // end anonymous namespace
 
 /// Classify the given declarator, whose type-specified is \c type, based on
 /// what kind of pointer it refers to.
@@ -3325,7 +3396,6 @@
     break;
   } while (true);
 
-
   switch (numNormalPointers) {
   case 0:
     return PointerDeclaratorKind::NonPointer;
@@ -3662,6 +3732,7 @@
     case Declarator::CXXCatchContext:
     case Declarator::CXXNewContext:
     case Declarator::ForContext:
+    case Declarator::InitStmtContext:
     case Declarator::LambdaExprContext:
     case Declarator::LambdaExprParameterContext:
     case Declarator::ObjCCatchContext:
@@ -3762,15 +3833,20 @@
     case DeclaratorChunk::BlockPointer:
       // If blocks are disabled, emit an error.
       if (!LangOpts.Blocks)
-        S.Diag(DeclType.Loc, diag::err_blocks_disable);
+        S.Diag(DeclType.Loc, diag::err_blocks_disable) << LangOpts.OpenCL;
 
       // Handle pointer nullability.
       inferPointerNullability(SimplePointerKind::BlockPointer,
                               DeclType.Loc, DeclType.getAttrListRef());
 
       T = S.BuildBlockPointerType(T, D.getIdentifierLoc(), Name);
-      if (DeclType.Cls.TypeQuals)
+      if (DeclType.Cls.TypeQuals || LangOpts.OpenCL) {
+        // OpenCL v2.0, s6.12.5 - Block variable declarations are implicitly
+        // qualified with const.
+        if (LangOpts.OpenCL)
+          DeclType.Cls.TypeQuals |= DeclSpec::TQ_const;
         T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Cls.TypeQuals);
+      }
       break;
     case DeclaratorChunk::Pointer:
       // Verify that we're not building a pointer to pointer to function with
@@ -3791,10 +3867,21 @@
           T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Ptr.TypeQuals);
         break;
       }
+
+      // OpenCL v2.0 s6.9b - Pointer to image/sampler cannot be used.
+      // OpenCL v2.0 s6.13.16.1 - Pointer to pipe cannot be used.
+      // OpenCL v2.0 s6.12.5 - Pointers to Blocks are not allowed.
+      if (LangOpts.OpenCL) {
+        if (T->isImageType() || T->isSamplerT() || T->isPipeType() ||
+            T->isBlockPointerType()) {
+          S.Diag(D.getIdentifierLoc(), diag::err_opencl_pointer_to_type) << T;
+          D.setInvalidType(true);
+        }
+      }
+
       T = S.BuildPointerType(T, DeclType.Loc, Name);
       if (DeclType.Ptr.TypeQuals)
         T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Ptr.TypeQuals);
-
       break;
     case DeclaratorChunk::Reference: {
       // Verify that we're not building a reference to pointer to function with
@@ -3961,7 +4048,8 @@
       if (T->isHalfType()) {
         if (S.getLangOpts().OpenCL) {
           if (!S.getOpenCLOptions().cl_khr_fp16) {
-            S.Diag(D.getIdentifierLoc(), diag::err_opencl_half_return) << T;
+            S.Diag(D.getIdentifierLoc(), diag::err_opencl_invalid_return)
+                << T << 0 /*pointer hint*/;
             D.setInvalidType(true);
           } 
         } else if (!S.getLangOpts().HalfArgsAndReturns) {
@@ -3971,6 +4059,15 @@
         }
       }
 
+      // OpenCL v2.0 s6.12.5 - A block cannot be the return value of a
+      // function. Image, sampler and pipe types are rejected here as well.
+      if (LangOpts.OpenCL && (T->isBlockPointerType() || T->isImageType() ||
+                              T->isSamplerT() || T->isPipeType())) {
+        S.Diag(D.getIdentifierLoc(), diag::err_opencl_invalid_return)
+            << T << 1 /*hint off*/;
+        D.setInvalidType(true);
+      }
+
       // Methods cannot return interface types. All ObjC objects are
       // passed by reference.
       if (T->isObjCObjectType()) {
@@ -4229,7 +4326,6 @@
 
         T = Context.getFunctionType(T, ParamTys, EPI);
       }
-
       break;
     }
     case DeclaratorChunk::MemberPointer: {
@@ -4467,6 +4563,7 @@
     case Declarator::MemberContext:
     case Declarator::BlockContext:
     case Declarator::ForContext:
+    case Declarator::InitStmtContext:
     case Declarator::ConditionContext:
     case Declarator::CXXCatchContext:
     case Declarator::ObjCCatchContext:
@@ -4892,7 +4989,7 @@
     void VisitPipeTypeLoc(PipeTypeLoc TL) {
       TL.setKWLoc(DS.getTypeSpecTypeLoc());
 
-      TypeSourceInfo *TInfo = 0;
+      TypeSourceInfo *TInfo = nullptr;
       Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
       TL.getValueLoc().initializeFullCopy(TInfo->getTypeLoc());
     }
@@ -5026,7 +5123,7 @@
       llvm_unreachable("unsupported TypeLoc kind in declarator!");
     }
   };
-}
+} // end anonymous namespace
 
 static void fillAtomicQualLoc(AtomicTypeLoc ATL, const DeclaratorChunk &Chunk) {
   SourceLocation Loc;
@@ -5162,7 +5259,6 @@
   return CreateParsedType(T, TInfo);
 }
 
-
 //===----------------------------------------------------------------------===//
 // Type Attribute Processing
 //===----------------------------------------------------------------------===//
@@ -5538,6 +5634,7 @@
   struct FunctionTypeUnwrapper {
     enum WrapKind {
       Desugar,
+      Attributed,
       Parens,
       Pointer,
       BlockPointer,
@@ -5570,6 +5667,9 @@
         } else if (isa<ReferenceType>(Ty)) {
           T = cast<ReferenceType>(Ty)->getPointeeType();
           Stack.push_back(Reference);
+        } else if (isa<AttributedType>(Ty)) {
+          T = cast<AttributedType>(Ty)->getEquivalentType();
+          Stack.push_back(Attributed);
         } else {
           const Type *DTy = Ty->getUnqualifiedDesugaredType();
           if (Ty == DTy) {
@@ -5618,6 +5718,9 @@
         // information.
         return wrap(C, Old->getUnqualifiedDesugaredType(), I);
 
+      case Attributed:
+        return wrap(C, cast<AttributedType>(Old)->getEquivalentType(), I);
+
       case Parens: {
         QualType New = wrap(C, cast<ParenType>(Old)->getInnerType(), I);
         return C.getParenType(New);
@@ -5652,7 +5755,7 @@
       llvm_unreachable("unknown wrapping kind");
     }
   };
-}
+} // end anonymous namespace
 
 static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State,
                                              AttributeList &Attr,
@@ -6416,6 +6519,36 @@
   CurType = S.Context.getVectorType(CurType, numElts, VecKind);
 }
 
+/// Diagnose invalid uses of an OpenCL access qualifier on \p CurType.
+static void HandleOpenCLAccessAttr(QualType &CurType, const AttributeList &Attr,
+                                   Sema &S) {
+  // OpenCL v2.0 s6.6 - Access qualifiers apply only to image and pipe types.
+  if (!(CurType->isImageType() || CurType->isPipeType())) {
+    S.Diag(Attr.getLoc(), diag::err_opencl_invalid_access_qualifier);
+    Attr.setInvalid();
+    return;
+  }
+
+  if (const TypedefType* TypedefTy = CurType->getAs<TypedefType>()) {
+    QualType PointeeTy = TypedefTy->desugar();
+    S.Diag(Attr.getLoc(), diag::err_opencl_multiple_access_qualifiers);
+
+    std::string PrevAccessQual;
+    switch (cast<BuiltinType>(PointeeTy.getTypePtr())->getKind()) {
+      #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case BuiltinType::Id:                                          \
+      PrevAccessQual = #Access;                                    \
+      break;
+      #include "clang/Basic/OpenCLImageTypes.def"
+    default:
+      assert(0 && "Unable to find corresponding image type.");
+    }
+
+    S.Diag(TypedefTy->getDecl()->getLocStart(),
+       diag::note_opencl_typedef_access_qualifier) << PrevAccessQual;
+  }
+}
+
 static void processTypeAttrs(TypeProcessingState &state, QualType &type,
                              TypeAttrLocation TAL, AttributeList *attrs) {
   // Scan through and apply attributes to this type where it makes sense.  Some
@@ -6511,9 +6644,8 @@
                                VectorType::NeonPolyVector);
       attr.setUsedAsTypeAttr();
       break;
-    case AttributeList::AT_OpenCLImageAccess:
-      // FIXME: there should be some type checking happening here, I would
-      // imagine, but the original handler's checking was entirely superfluous.
+    case AttributeList::AT_OpenCLAccess:
+      HandleOpenCLAccessAttr(type, attr, state.getSema());
       attr.setUsedAsTypeAttr();
       break;
 
@@ -6754,8 +6886,8 @@
       RD = Pattern;
     D = RD->getDefinition();
   } else if (auto *ED = dyn_cast<EnumDecl>(D)) {
-    while (auto *NewED = ED->getInstantiatedFromMemberEnum())
-      ED = NewED;
+    if (auto *Pattern = ED->getTemplateInstantiationPattern())
+      ED = Pattern;
     if (OnlyNeedComplete && ED->isFixed()) {
       // If the enum has a fixed underlying type, and we're only looking for a
       // complete type (not a definition), any visible declaration of it will
@@ -6816,6 +6948,7 @@
         S.ImplicitMSInheritanceAttrLoc.isValid()
             ? S.ImplicitMSInheritanceAttrLoc
             : RD->getSourceRange()));
+    S.Consumer.AssignInheritanceModel(RD);
   }
 }
 
@@ -6841,9 +6974,16 @@
     }
   }
 
-  // If we have a complete type, we're done.
   NamedDecl *Def = nullptr;
-  if (!T->isIncompleteType(&Def)) {
+  bool Incomplete = T->isIncompleteType(&Def);
+
+  // Check that any necessary explicit specializations are visible. For an
+  // enum, we just need the declaration, so don't check this.
+  if (Def && !isa<EnumDecl>(Def))
+    checkSpecializationVisibility(Loc, Def);
+
+  // If we have a complete type, we're done.
+  if (!Incomplete) {
     // If we know about the definition but it is not visible, complain.
     NamedDecl *SuggestedDef = nullptr;
     if (Def &&
@@ -6852,7 +6992,7 @@
       // definition visible.
       bool TreatAsComplete = Diagnoser && !isSFINAEContext();
       if (Diagnoser)
-        diagnoseMissingImport(Loc, SuggestedDef, /*NeedDefinition*/true,
+        diagnoseMissingImport(Loc, SuggestedDef, MissingImportKind::Definition,
                               /*Recover*/TreatAsComplete);
       return !TreatAsComplete;
     }
@@ -6945,16 +7085,12 @@
     }
   }
 
+  // FIXME: If we didn't instantiate a definition because of an explicit
+  // specialization declaration, check that it's visible.
+
   if (!Diagnoser)
     return true;
 
-  // We have an incomplete type. Produce a diagnostic.
-  if (Ident___float128 &&
-      T == Context.getTypeDeclType(Context.getFloat128StubType())) {
-    Diag(Loc, diag::err_typecheck_decl_incomplete_type___float128);
-    return true;
-  }
-
   Diagnoser->diagnose(*this, Loc, T);
 
   // If the type was a forward declaration of a class/struct/union
diff --git a/lib/Sema/TreeTransform.h b/lib/Sema/TreeTransform.h
index d533778..c53505d 100644
--- a/lib/Sema/TreeTransform.h
+++ b/lib/Sema/TreeTransform.h
@@ -410,6 +410,14 @@
     return D;
   }
 
+  /// \brief Transform the specified condition.
+  ///
+  /// By default, this transforms the variable and expression and rebuilds
+  /// the condition.
+  Sema::ConditionResult TransformCondition(SourceLocation Loc, VarDecl *Var,
+                                           Expr *Expr,
+                                           Sema::ConditionKind Kind);
+
   /// \brief Transform the attributes associated with the given declaration and
   /// place them on the new declaration.
   ///
@@ -604,13 +612,12 @@
   /// variables vector are acceptable.
   ///
   /// Return true on error.
-  bool TransformFunctionTypeParams(SourceLocation Loc,
-                                   ParmVarDecl **Params, unsigned NumParams,
-                                   const QualType *ParamTypes,
-                     const FunctionProtoType::ExtParameterInfo *ParamInfos,
-                                   SmallVectorImpl<QualType> &PTypes,
-                                   SmallVectorImpl<ParmVarDecl*> *PVars,
-                                   Sema::ExtParameterInfoBuilder &PInfos);
+  bool TransformFunctionTypeParams(
+      SourceLocation Loc, ArrayRef<ParmVarDecl *> Params,
+      const QualType *ParamTypes,
+      const FunctionProtoType::ExtParameterInfo *ParamInfos,
+      SmallVectorImpl<QualType> &PTypes, SmallVectorImpl<ParmVarDecl *> *PVars,
+      Sema::ExtParameterInfoBuilder &PInfos);
 
   /// \brief Transforms a single function-type parameter.  Return null
   /// on error.
@@ -1166,20 +1173,20 @@
   ///
   /// By default, performs semantic analysis to build the new statement.
   /// Subclasses may override this routine to provide different behavior.
-  StmtResult RebuildIfStmt(SourceLocation IfLoc, Sema::FullExprArg Cond,
-                           VarDecl *CondVar, Stmt *Then,
+  StmtResult RebuildIfStmt(SourceLocation IfLoc, bool IsConstexpr,
+                           Sema::ConditionResult Cond, Stmt *Init, Stmt *Then,
                            SourceLocation ElseLoc, Stmt *Else) {
-    return getSema().ActOnIfStmt(IfLoc, Cond, CondVar, Then, ElseLoc, Else);
+    return getSema().ActOnIfStmt(IfLoc, IsConstexpr, Init, Cond, Then,
+                                 ElseLoc, Else);
   }
 
   /// \brief Start building a new switch statement.
   ///
   /// By default, performs semantic analysis to build the new statement.
   /// Subclasses may override this routine to provide different behavior.
-  StmtResult RebuildSwitchStmtStart(SourceLocation SwitchLoc,
-                                    Expr *Cond, VarDecl *CondVar) {
-    return getSema().ActOnStartOfSwitchStmt(SwitchLoc, Cond,
-                                            CondVar);
+  StmtResult RebuildSwitchStmtStart(SourceLocation SwitchLoc, Stmt *Init,
+                                    Sema::ConditionResult Cond) {
+    return getSema().ActOnStartOfSwitchStmt(SwitchLoc, Init, Cond);
   }
 
   /// \brief Attach the body to the switch statement.
@@ -1195,9 +1202,9 @@
   ///
   /// By default, performs semantic analysis to build the new statement.
   /// Subclasses may override this routine to provide different behavior.
-  StmtResult RebuildWhileStmt(SourceLocation WhileLoc, Sema::FullExprArg Cond,
-                              VarDecl *CondVar, Stmt *Body) {
-    return getSema().ActOnWhileStmt(WhileLoc, Cond, CondVar, Body);
+  StmtResult RebuildWhileStmt(SourceLocation WhileLoc,
+                              Sema::ConditionResult Cond, Stmt *Body) {
+    return getSema().ActOnWhileStmt(WhileLoc, Cond, Body);
   }
 
   /// \brief Build a new do-while statement.
@@ -1216,11 +1223,11 @@
   /// By default, performs semantic analysis to build the new statement.
   /// Subclasses may override this routine to provide different behavior.
   StmtResult RebuildForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
-                            Stmt *Init, Sema::FullExprArg Cond,
-                            VarDecl *CondVar, Sema::FullExprArg Inc,
-                            SourceLocation RParenLoc, Stmt *Body) {
+                            Stmt *Init, Sema::ConditionResult Cond,
+                            Sema::FullExprArg Inc, SourceLocation RParenLoc,
+                            Stmt *Body) {
     return getSema().ActOnForStmt(ForLoc, LParenLoc, Init, Cond,
-                                  CondVar, Inc, RParenLoc, Body);
+                                  Inc, RParenLoc, Body);
   }
 
   /// \brief Build a new goto statement.
@@ -1561,10 +1568,11 @@
                                        SourceLocation ColonLoc,
                                        SourceLocation EndLoc,
                                        CXXScopeSpec &ReductionIdScopeSpec,
-                                       const DeclarationNameInfo &ReductionId) {
+                                       const DeclarationNameInfo &ReductionId,
+                                       ArrayRef<Expr *> UnresolvedReductions) {
     return getSema().ActOnOpenMPReductionClause(
         VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec,
-        ReductionId);
+        ReductionId, UnresolvedReductions);
   }
 
   /// \brief Build a new OpenMP 'linear' clause.
@@ -1660,14 +1668,15 @@
   ///
   /// By default, performs semantic analysis to build the new OpenMP clause.
   /// Subclasses may override this routine to provide different behavior.
-  OMPClause *RebuildOMPMapClause(
-      OpenMPMapClauseKind MapTypeModifier, OpenMPMapClauseKind MapType,
-      SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VarList,
-      SourceLocation StartLoc, SourceLocation LParenLoc,
-      SourceLocation EndLoc) {
-    return getSema().ActOnOpenMPMapClause(MapTypeModifier, MapType, MapLoc,
-                                          ColonLoc, VarList,StartLoc,
-                                          LParenLoc, EndLoc);
+  OMPClause *
+  RebuildOMPMapClause(OpenMPMapClauseKind MapTypeModifier,
+                      OpenMPMapClauseKind MapType, bool IsMapTypeImplicit,
+                      SourceLocation MapLoc, SourceLocation ColonLoc,
+                      ArrayRef<Expr *> VarList, SourceLocation StartLoc,
+                      SourceLocation LParenLoc, SourceLocation EndLoc) {
+    return getSema().ActOnOpenMPMapClause(MapTypeModifier, MapType,
+                                          IsMapTypeImplicit, MapLoc, ColonLoc,
+                                          VarList, StartLoc, LParenLoc, EndLoc);
   }
 
   /// \brief Build a new OpenMP 'num_teams' clause.
@@ -1749,6 +1758,53 @@
         Kind, ChunkSize, StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc);
   }
 
+  /// \brief Build a new OpenMP 'to' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPToClause(ArrayRef<Expr *> VarList,
+                                SourceLocation StartLoc,
+                                SourceLocation LParenLoc,
+                                SourceLocation EndLoc) {
+    return getSema().ActOnOpenMPToClause(VarList, StartLoc, LParenLoc, EndLoc);
+  }
+
+  /// \brief Build a new OpenMP 'from' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPFromClause(ArrayRef<Expr *> VarList,
+                                  SourceLocation StartLoc,
+                                  SourceLocation LParenLoc,
+                                  SourceLocation EndLoc) {
+    return getSema().ActOnOpenMPFromClause(VarList, StartLoc, LParenLoc,
+                                           EndLoc);
+  }
+
+  /// Build a new OpenMP 'use_device_ptr' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
+                                          SourceLocation StartLoc,
+                                          SourceLocation LParenLoc,
+                                          SourceLocation EndLoc) {
+    return getSema().ActOnOpenMPUseDevicePtrClause(VarList, StartLoc, LParenLoc,
+                                                   EndLoc);
+  }
+
+  /// Build a new OpenMP 'is_device_ptr' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
+                                         SourceLocation StartLoc,
+                                         SourceLocation LParenLoc,
+                                         SourceLocation EndLoc) {
+    return getSema().ActOnOpenMPIsDevicePtrClause(VarList, StartLoc, LParenLoc,
+                                                  EndLoc);
+  }
+
   /// \brief Rebuild the operand to an Objective-C \@synchronized statement.
   ///
   /// By default, performs semantic analysis to build the new statement.
@@ -1838,7 +1894,7 @@
   StmtResult RebuildCXXForRangeStmt(SourceLocation ForLoc,
                                     SourceLocation CoawaitLoc,
                                     SourceLocation ColonLoc,
-                                    Stmt *Range, Stmt *BeginEnd,
+                                    Stmt *Range, Stmt *Begin, Stmt *End,
                                     Expr *Cond, Expr *Inc,
                                     Stmt *LoopVar,
                                     SourceLocation RParenLoc) {
@@ -1860,7 +1916,7 @@
     }
 
     return getSema().BuildCXXForRangeStmt(ForLoc, CoawaitLoc, ColonLoc,
-                                          Range, BeginEnd,
+                                          Range, Begin, End,
                                           Cond, Inc, LoopVar, RParenLoc,
                                           Sema::BFRK_Rebuild);
   }
@@ -2649,7 +2705,8 @@
                                           ConvertedArgs))
       return ExprError();
 
-    return getSema().BuildCXXConstructExpr(Loc, T, Constructor, IsElidable,
+    return getSema().BuildCXXConstructExpr(Loc, T, Constructor,
+                                           IsElidable,
                                            ConvertedArgs,
                                            HadMultipleCandidates,
                                            ListInitialization,
@@ -2658,6 +2715,16 @@
                                            ParenRange);
   }
 
+  /// \brief Build a new implicit construction via inherited constructor
+  /// expression.
+  ExprResult RebuildCXXInheritedCtorInitExpr(QualType T, SourceLocation Loc,
+                                             CXXConstructorDecl *Constructor,
+                                             bool ConstructsVBase,
+                                             bool InheritedFromVBase) {
+    return new (getSema().Context) CXXInheritedCtorInitExpr(
+        Loc, T, Constructor, ConstructsVBase, InheritedFromVBase);
+  }
+
   /// \brief Build a new object-construction expression.
   ///
   /// By default, performs semantic analysis to build the new expression.
@@ -3284,8 +3351,6 @@
         if (Out.isInvalid())
           return true;
 
-        // FIXME: Can this happen? We should not try to expand the pack
-        // in this case.
         if (Out.get()->containsUnexpandedParameterPack()) {
           Out = getDerived().RebuildPackExpansion(
               Out.get(), Expansion->getEllipsisLoc(), OrigNumExpansions);
@@ -3331,6 +3396,31 @@
   return false;
 }
 
+template <typename Derived>
+Sema::ConditionResult TreeTransform<Derived>::TransformCondition(
+    SourceLocation Loc, VarDecl *Var, Expr *Expr, Sema::ConditionKind Kind) {
+  if (Var) {
+    VarDecl *ConditionVar = cast_or_null<VarDecl>(
+        getDerived().TransformDefinition(Var->getLocation(), Var));
+
+    if (!ConditionVar)
+      return Sema::ConditionError();
+
+    return getSema().ActOnConditionVariable(ConditionVar, Loc, Kind);
+  }
+
+  if (Expr) {
+    ExprResult CondExpr = getDerived().TransformExpr(Expr);
+
+    if (CondExpr.isInvalid())
+      return Sema::ConditionError();
+
+    return getSema().ActOnCondition(nullptr, Loc, CondExpr.get(), Kind);
+  }
+
+  return Sema::ConditionResult();
+}
+
 template<typename Derived>
 NestedNameSpecifierLoc
 TreeTransform<Derived>::TransformNestedNameSpecifierLoc(
@@ -3348,15 +3438,13 @@
     NestedNameSpecifier *QNNS = Q.getNestedNameSpecifier();
 
     switch (QNNS->getKind()) {
-    case NestedNameSpecifier::Identifier:
-      if (SemaRef.BuildCXXNestedNameSpecifier(/*Scope=*/nullptr,
-                                              *QNNS->getAsIdentifier(),
-                                              Q.getLocalBeginLoc(),
-                                              Q.getLocalEndLoc(),
-                                              ObjectType, false, SS,
-                                              FirstQualifierInScope, false))
+    case NestedNameSpecifier::Identifier: {
+      Sema::NestedNameSpecInfo IdInfo(QNNS->getAsIdentifier(),
+                          Q.getLocalBeginLoc(), Q.getLocalEndLoc(), ObjectType);
+      if (SemaRef.BuildCXXNestedNameSpecifier(/*Scope=*/nullptr, IdInfo, false,
+                                              SS, FirstQualifierInScope, false))
         return NestedNameSpecifierLoc();
-
+    }
       break;
 
     case NestedNameSpecifier::Namespace: {
@@ -4605,17 +4693,17 @@
   return newParm;
 }
 
-template<typename Derived>
-bool TreeTransform<Derived>::
-  TransformFunctionTypeParams(SourceLocation Loc,
-                              ParmVarDecl **Params, unsigned NumParams,
-                              const QualType *ParamTypes,
-                        const FunctionProtoType::ExtParameterInfo *ParamInfos,
-                              SmallVectorImpl<QualType> &OutParamTypes,
-                              SmallVectorImpl<ParmVarDecl*> *PVars,
-                              Sema::ExtParameterInfoBuilder &PInfos) {
+template <typename Derived>
+bool TreeTransform<Derived>::TransformFunctionTypeParams(
+    SourceLocation Loc, ArrayRef<ParmVarDecl *> Params,
+    const QualType *ParamTypes,
+    const FunctionProtoType::ExtParameterInfo *ParamInfos,
+    SmallVectorImpl<QualType> &OutParamTypes,
+    SmallVectorImpl<ParmVarDecl *> *PVars,
+    Sema::ExtParameterInfoBuilder &PInfos) {
   int indexAdjustment = 0;
 
+  unsigned NumParams = Params.size();
   for (unsigned i = 0; i != NumParams; ++i) {
     if (ParmVarDecl *OldParm = Params[i]) {
       assert(OldParm->getFunctionScopeIndex() == i);
@@ -4754,6 +4842,14 @@
           if (NewType.isNull())
             return true;
 
+          if (NewType->containsUnexpandedParameterPack()) {
+            NewType =
+                getSema().getASTContext().getPackExpansionType(NewType, None);
+
+            if (NewType.isNull())
+              return true;
+          }
+
           if (ParamInfos)
             PInfos.set(OutParamTypes.size(), ParamInfos[i]);
           OutParamTypes.push_back(NewType);
@@ -4850,7 +4946,7 @@
 
   if (T->hasTrailingReturn()) {
     if (getDerived().TransformFunctionTypeParams(
-            TL.getBeginLoc(), TL.getParmArray(), TL.getNumParams(),
+            TL.getBeginLoc(), TL.getParams(),
             TL.getTypePtr()->param_type_begin(),
             T->getExtParameterInfosOrNull(),
             ParamTypes, &ParamDecls, ExtParamInfos))
@@ -4876,7 +4972,7 @@
       return QualType();
 
     if (getDerived().TransformFunctionTypeParams(
-            TL.getBeginLoc(), TL.getParmArray(), TL.getNumParams(),
+            TL.getBeginLoc(), TL.getParams(),
             TL.getTypePtr()->param_type_begin(),
             T->getExtParameterInfosOrNull(),
             ParamTypes, &ParamDecls, ExtParamInfos))
@@ -4936,8 +5032,8 @@
     if (NoexceptExpr.isInvalid())
       return true;
 
-    NoexceptExpr = getSema().CheckBooleanCondition(
-        NoexceptExpr.get(), NoexceptExpr.get()->getLocStart());
+    // FIXME: This is bogus, a noexcept expression is not a condition.
+    NoexceptExpr = getSema().CheckBooleanCondition(Loc, NoexceptExpr.get());
     if (NoexceptExpr.isInvalid())
       return true;
 
@@ -6168,85 +6264,73 @@
 template<typename Derived>
 StmtResult
 TreeTransform<Derived>::TransformIfStmt(IfStmt *S) {
-  // Transform the condition
-  ExprResult Cond;
-  VarDecl *ConditionVar = nullptr;
-  if (S->getConditionVariable()) {
-    ConditionVar
-      = cast_or_null<VarDecl>(
-                   getDerived().TransformDefinition(
-                                      S->getConditionVariable()->getLocation(),
-                                                    S->getConditionVariable()));
-    if (!ConditionVar)
-      return StmtError();
-  } else {
-    Cond = getDerived().TransformExpr(S->getCond());
-
-    if (Cond.isInvalid())
-      return StmtError();
-
-    // Convert the condition to a boolean value.
-    if (S->getCond()) {
-      ExprResult CondE = getSema().ActOnBooleanCondition(nullptr, S->getIfLoc(),
-                                                         Cond.get());
-      if (CondE.isInvalid())
-        return StmtError();
-
-      Cond = CondE.get();
-    }
-  }
-
-  Sema::FullExprArg FullCond(getSema().MakeFullExpr(Cond.get(), S->getIfLoc()));
-  if (!S->getConditionVariable() && S->getCond() && !FullCond.get())
+  // Transform the initialization statement; an invalid result aborts.
+  StmtResult Init = getDerived().TransformStmt(S->getInit());
+  if (Init.isInvalid())
     return StmtError();
 
+  // Transform the condition; a constexpr if uses the ConstexprIf kind.
+  Sema::ConditionResult Cond = getDerived().TransformCondition(
+      S->getIfLoc(), S->getConditionVariable(), S->getCond(),
+      S->isConstexpr() ? Sema::ConditionKind::ConstexprIf
+                       : Sema::ConditionKind::Boolean);
+  if (Cond.isInvalid())
+    return StmtError();
+
+  // If this is a constexpr if, determine which arm we should instantiate.
+  llvm::Optional<bool> ConstexprConditionValue;
+  if (S->isConstexpr())
+    ConstexprConditionValue = Cond.getKnownValue();
+
   // Transform the "then" branch.
-  StmtResult Then = getDerived().TransformStmt(S->getThen());
-  if (Then.isInvalid())
-    return StmtError();
+  StmtResult Then;
+  if (!ConstexprConditionValue || *ConstexprConditionValue) {
+    Then = getDerived().TransformStmt(S->getThen());
+    if (Then.isInvalid())
+      return StmtError();
+  } else { // Arm not instantiated: use a NullStmt at its start location.
+    Then = new (getSema().Context) NullStmt(S->getThen()->getLocStart());
+  }
 
   // Transform the "else" branch.
-  StmtResult Else = getDerived().TransformStmt(S->getElse());
-  if (Else.isInvalid())
-    return StmtError();
+  StmtResult Else;
+  if (!ConstexprConditionValue || !*ConstexprConditionValue) {
+    Else = getDerived().TransformStmt(S->getElse());
+    if (Else.isInvalid())
+      return StmtError();
+  }
 
   if (!getDerived().AlwaysRebuild() &&
-      FullCond.get() == S->getCond() &&
-      ConditionVar == S->getConditionVariable() &&
+      Init.get() == S->getInit() &&
+      Cond.get() == std::make_pair(S->getConditionVariable(), S->getCond()) &&
       Then.get() == S->getThen() &&
       Else.get() == S->getElse())
     return S;
 
-  return getDerived().RebuildIfStmt(S->getIfLoc(), FullCond, ConditionVar,
-                                    Then.get(),
-                                    S->getElseLoc(), Else.get());
+  return getDerived().RebuildIfStmt(S->getIfLoc(), S->isConstexpr(), Cond,
+                                    Init.get(), Then.get(), S->getElseLoc(),
+                                    Else.get());
 }
 
 template<typename Derived>
 StmtResult
 TreeTransform<Derived>::TransformSwitchStmt(SwitchStmt *S) {
-  // Transform the condition.
-  ExprResult Cond;
-  VarDecl *ConditionVar = nullptr;
-  if (S->getConditionVariable()) {
-    ConditionVar
-      = cast_or_null<VarDecl>(
-                   getDerived().TransformDefinition(
-                                      S->getConditionVariable()->getLocation(),
-                                                    S->getConditionVariable()));
-    if (!ConditionVar)
-      return StmtError();
-  } else {
-    Cond = getDerived().TransformExpr(S->getCond());
+  // Transform the initialization statement; the rebuild must use its result.
+  StmtResult Init = getDerived().TransformStmt(S->getInit());
+  if (Init.isInvalid())
+    return StmtError();
 
-    if (Cond.isInvalid())
-      return StmtError();
-  }
+  // Transform the condition (variable or expression) as a Switch condition.
+  Sema::ConditionResult Cond = getDerived().TransformCondition(
+      S->getSwitchLoc(), S->getConditionVariable(), S->getCond(),
+      Sema::ConditionKind::Switch);
+  if (Cond.isInvalid())
+    return StmtError();
 
   // Rebuild the switch statement.
   StmtResult Switch
-    = getDerived().RebuildSwitchStmtStart(S->getSwitchLoc(), Cond.get(),
-                                          ConditionVar);
+    = getDerived().RebuildSwitchStmtStart(S->getSwitchLoc(),
+                                          Init.get(), Cond);
   if (Switch.isInvalid())
     return StmtError();
 
@@ -6264,36 +6348,10 @@
 StmtResult
 TreeTransform<Derived>::TransformWhileStmt(WhileStmt *S) {
   // Transform the condition
-  ExprResult Cond;
-  VarDecl *ConditionVar = nullptr;
-  if (S->getConditionVariable()) {
-    ConditionVar
-      = cast_or_null<VarDecl>(
-                   getDerived().TransformDefinition(
-                                      S->getConditionVariable()->getLocation(),
-                                                    S->getConditionVariable()));
-    if (!ConditionVar)
-      return StmtError();
-  } else {
-    Cond = getDerived().TransformExpr(S->getCond());
-
-    if (Cond.isInvalid())
-      return StmtError();
-
-    if (S->getCond()) {
-      // Convert the condition to a boolean value.
-      ExprResult CondE = getSema().ActOnBooleanCondition(nullptr,
-                                                         S->getWhileLoc(),
-                                                         Cond.get());
-      if (CondE.isInvalid())
-        return StmtError();
-      Cond = CondE;
-    }
-  }
-
-  Sema::FullExprArg FullCond(
-      getSema().MakeFullExpr(Cond.get(), S->getWhileLoc()));
-  if (!S->getConditionVariable() && S->getCond() && !FullCond.get())
+  Sema::ConditionResult Cond = getDerived().TransformCondition(
+      S->getWhileLoc(), S->getConditionVariable(), S->getCond(),
+      Sema::ConditionKind::Boolean);
+  if (Cond.isInvalid())
     return StmtError();
 
   // Transform the body
@@ -6302,13 +6360,11 @@
     return StmtError();
 
   if (!getDerived().AlwaysRebuild() &&
-      FullCond.get() == S->getCond() &&
-      ConditionVar == S->getConditionVariable() &&
+      Cond.get() == std::make_pair(S->getConditionVariable(), S->getCond()) &&
       Body.get() == S->getBody())
     return Owned(S);
 
-  return getDerived().RebuildWhileStmt(S->getWhileLoc(), FullCond,
-                                       ConditionVar, Body.get());
+  return getDerived().RebuildWhileStmt(S->getWhileLoc(), Cond, Body.get());
 }
 
 template<typename Derived>
@@ -6348,37 +6404,10 @@
     getSema().ActOnOpenMPLoopInitialization(S->getForLoc(), Init.get());
 
   // Transform the condition
-  ExprResult Cond;
-  VarDecl *ConditionVar = nullptr;
-  if (S->getConditionVariable()) {
-    ConditionVar
-      = cast_or_null<VarDecl>(
-                   getDerived().TransformDefinition(
-                                      S->getConditionVariable()->getLocation(),
-                                                    S->getConditionVariable()));
-    if (!ConditionVar)
-      return StmtError();
-  } else {
-    Cond = getDerived().TransformExpr(S->getCond());
-
-    if (Cond.isInvalid())
-      return StmtError();
-
-    if (S->getCond()) {
-      // Convert the condition to a boolean value.
-      ExprResult CondE = getSema().ActOnBooleanCondition(nullptr,
-                                                         S->getForLoc(),
-                                                         Cond.get());
-      if (CondE.isInvalid())
-        return StmtError();
-
-      Cond = CondE.get();
-    }
-  }
-
-  Sema::FullExprArg FullCond(
-      getSema().MakeFullExpr(Cond.get(), S->getForLoc()));
-  if (!S->getConditionVariable() && S->getCond() && !FullCond.get())
+  Sema::ConditionResult Cond = getDerived().TransformCondition(
+      S->getForLoc(), S->getConditionVariable(), S->getCond(),
+      Sema::ConditionKind::Boolean);
+  if (Cond.isInvalid())
     return StmtError();
 
   // Transform the increment
@@ -6397,14 +6426,14 @@
 
   if (!getDerived().AlwaysRebuild() &&
       Init.get() == S->getInit() &&
-      FullCond.get() == S->getCond() &&
+      Cond.get() == std::make_pair(S->getConditionVariable(), S->getCond()) &&
       Inc.get() == S->getInc() &&
       Body.get() == S->getBody())
     return S;
 
   return getDerived().RebuildForStmt(S->getForLoc(), S->getLParenLoc(),
-                                     Init.get(), FullCond, ConditionVar,
-                                     FullInc, S->getRParenLoc(), Body.get());
+                                     Init.get(), Cond, FullInc,
+                                     S->getRParenLoc(), Body.get());
 }
 
 template<typename Derived>
@@ -6887,15 +6916,18 @@
   if (Range.isInvalid())
     return StmtError();
 
-  StmtResult BeginEnd = getDerived().TransformStmt(S->getBeginEndStmt());
-  if (BeginEnd.isInvalid())
+  StmtResult Begin = getDerived().TransformStmt(S->getBeginStmt());
+  if (Begin.isInvalid())
+    return StmtError();
+  StmtResult End = getDerived().TransformStmt(S->getEndStmt());
+  if (End.isInvalid())
     return StmtError();
 
   ExprResult Cond = getDerived().TransformExpr(S->getCond());
   if (Cond.isInvalid())
     return StmtError();
   if (Cond.get())
-    Cond = SemaRef.CheckBooleanCondition(Cond.get(), S->getColonLoc());
+    Cond = SemaRef.CheckBooleanCondition(S->getColonLoc(), Cond.get());
   if (Cond.isInvalid())
     return StmtError();
   if (Cond.get())
@@ -6914,14 +6946,16 @@
   StmtResult NewStmt = S;
   if (getDerived().AlwaysRebuild() ||
       Range.get() != S->getRangeStmt() ||
-      BeginEnd.get() != S->getBeginEndStmt() ||
+      Begin.get() != S->getBeginStmt() ||
+      End.get() != S->getEndStmt() ||
       Cond.get() != S->getCond() ||
       Inc.get() != S->getInc() ||
       LoopVar.get() != S->getLoopVarStmt()) {
     NewStmt = getDerived().RebuildCXXForRangeStmt(S->getForLoc(),
                                                   S->getCoawaitLoc(),
                                                   S->getColonLoc(), Range.get(),
-                                                  BeginEnd.get(), Cond.get(),
+                                                  Begin.get(), End.get(),
+                                                  Cond.get(),
                                                   Inc.get(), LoopVar.get(),
                                                   S->getRParenLoc());
     if (NewStmt.isInvalid())
@@ -6938,7 +6972,8 @@
     NewStmt = getDerived().RebuildCXXForRangeStmt(S->getForLoc(),
                                                   S->getCoawaitLoc(),
                                                   S->getColonLoc(), Range.get(),
-                                                  BeginEnd.get(), Cond.get(),
+                                                  Begin.get(), End.get(),
+                                                  Cond.get(),
                                                   Inc.get(), LoopVar.get(),
                                                   S->getRParenLoc());
     if (NewStmt.isInvalid())
@@ -7423,6 +7458,61 @@
 }
 
 template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetEnterDataDirective(
+    OMPTargetEnterDataDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_target_enter_data, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetExitDataDirective(
+    OMPTargetExitDataDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_target_exit_data, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetParallelDirective(
+    OMPTargetParallelDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_target_parallel, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetParallelForDirective(
+    OMPTargetParallelForDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_target_parallel_for, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetUpdateDirective(
+    OMPTargetUpdateDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_target_update, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
 StmtResult
 TreeTransform<Derived>::TransformOMPTeamsDirective(OMPTeamsDirective *D) {
   DeclarationNameInfo DirName;
@@ -7488,6 +7578,74 @@
   return Res;
 }
 
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPDistributeParallelForDirective(
+    OMPDistributeParallelForDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_distribute_parallel_for, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOMPDistributeParallelForSimdDirective(
+    OMPDistributeParallelForSimdDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_distribute_parallel_for_simd,
+                        DeclarationNameInfo(), nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPDistributeSimdDirective(
+    OMPDistributeSimdDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_distribute_simd, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetParallelForSimdDirective(
+    OMPTargetParallelForSimdDirective *D) {
+  // Run the generic directive transform inside an OpenMP DSA block.
+  Sema &S = getDerived().getSema();
+  S.StartOpenMPDSABlock(OMPD_target_parallel_for_simd, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTargetSimdDirective(
+    OMPTargetSimdDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_target_simd, DeclarationNameInfo(), nullptr,
+                        D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTeamsDistributeDirective(
+    OMPTeamsDistributeDirective *D) {
+  Sema &S = getDerived().getSema(); // Opens and closes the DSA block below.
+  S.StartOpenMPDSABlock(OMPD_teams_distribute, DeclarationNameInfo(),
+                        nullptr, D->getLocStart());
+  StmtResult Transformed = getDerived().TransformOMPExecutableDirective(D);
+  S.EndOpenMPDSABlock(Transformed.get());
+  return Transformed;
+}
+
 //===----------------------------------------------------------------------===//
 // OpenMP clause transformation
 //===----------------------------------------------------------------------===//
@@ -7746,9 +7904,31 @@
     if (!NameInfo.getName())
       return nullptr;
   }
+  // Build a list of all UDR decls with the same names ranged by the Scopes.
+  // The Scope boundary is a duplication of the previous decl.
+  llvm::SmallVector<Expr *, 16> UnresolvedReductions;
+  for (auto *E : C->reduction_ops()) {
+    // Transform all the decls.
+    if (E) {
+      auto *ULE = cast<UnresolvedLookupExpr>(E);
+      UnresolvedSet<8> Decls;
+      for (auto *D : ULE->decls()) {
+        NamedDecl *InstD =
+            cast<NamedDecl>(getDerived().TransformDecl(E->getExprLoc(), D));
+        Decls.addDecl(InstD, InstD->getAccess());
+      }
+      UnresolvedReductions.push_back(
+       UnresolvedLookupExpr::Create(
+          SemaRef.Context, /*NamingClass=*/nullptr,
+          ReductionIdScopeSpec.getWithLocInContext(SemaRef.Context),
+          NameInfo, /*ADL=*/true, ULE->isOverloaded(),
+          Decls.begin(), Decls.end()));
+    } else
+      UnresolvedReductions.push_back(nullptr);
+  }
   return getDerived().RebuildOMPReductionClause(
       Vars, C->getLocStart(), C->getLParenLoc(), C->getColonLoc(),
-      C->getLocEnd(), ReductionIdScopeSpec, NameInfo);
+      C->getLocEnd(), ReductionIdScopeSpec, NameInfo, UnresolvedReductions);
 }
 
 template <typename Derived>
@@ -7870,9 +8050,9 @@
     Vars.push_back(EVar.get());
   }
   return getDerived().RebuildOMPMapClause(
-      C->getMapTypeModifier(), C->getMapType(), C->getMapLoc(),
-      C->getColonLoc(), Vars, C->getLocStart(), C->getLParenLoc(),
-      C->getLocEnd());
+      C->getMapTypeModifier(), C->getMapType(), C->isImplicitMapType(),
+      C->getMapLoc(), C->getColonLoc(), Vars, C->getLocStart(),
+      C->getLParenLoc(), C->getLocEnd());
 }
 
 template <typename Derived>
@@ -7945,6 +8125,70 @@
       C->getDistScheduleKindLoc(), C->getCommaLoc(), C->getLocEnd());
 }
 
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPDefaultmapClause(OMPDefaultmapClause *C) {
+  return C; // No transformation is applied; the same clause is returned.
+}
+
+template <typename Derived>
+OMPClause *TreeTransform<Derived>::TransformOMPToClause(OMPToClause *C) {
+  llvm::SmallVector<Expr *, 16> Vars;
+  Vars.reserve(C->varlist_size());
+  for (auto *VE : C->varlists()) { // Transform each listed expression.
+    ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
+    if (EVar.isInvalid())
+      return nullptr; // A failed item invalidates the whole clause.
+    Vars.push_back(EVar.get());
+  }
+  return getDerived().RebuildOMPToClause(Vars, C->getLocStart(),
+                                         C->getLParenLoc(), C->getLocEnd());
+}
+
+template <typename Derived>
+OMPClause *TreeTransform<Derived>::TransformOMPFromClause(OMPFromClause *C) {
+  llvm::SmallVector<Expr *, 16> Vars;
+  Vars.reserve(C->varlist_size());
+  for (auto *VE : C->varlists()) { // Transform each listed expression.
+    ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
+    if (EVar.isInvalid())
+      return nullptr; // A failed item invalidates the whole clause.
+    Vars.push_back(EVar.get());
+  }
+  return getDerived().RebuildOMPFromClause(Vars, C->getLocStart(),
+                                           C->getLParenLoc(), C->getLocEnd());
+}
+
+template <typename Derived>
+OMPClause *TreeTransform<Derived>::TransformOMPUseDevicePtrClause(
+    OMPUseDevicePtrClause *C) {
+  llvm::SmallVector<Expr *, 16> NewVars;
+  NewVars.reserve(C->varlist_size());
+  for (auto *OldVar : C->varlists()) { // First failure yields a null clause.
+    ExprResult NewVar = getDerived().TransformExpr(cast<Expr>(OldVar));
+    if (NewVar.isInvalid())
+      return nullptr;
+    NewVars.push_back(NewVar.get());
+  }
+  return getDerived().RebuildOMPUseDevicePtrClause(
+      NewVars, C->getLocStart(), C->getLParenLoc(), C->getLocEnd());
+}
+
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
+  llvm::SmallVector<Expr *, 16> NewVars;
+  NewVars.reserve(C->varlist_size());
+  for (auto *OldVar : C->varlists()) { // First failure yields a null clause.
+    ExprResult NewVar = getDerived().TransformExpr(cast<Expr>(OldVar));
+    if (NewVar.isInvalid())
+      return nullptr;
+    NewVars.push_back(NewVar.get());
+  }
+  return getDerived().RebuildOMPIsDevicePtrClause(
+      NewVars, C->getLocStart(), C->getLParenLoc(), C->getLocEnd());
+}
+
 //===----------------------------------------------------------------------===//
 // Expression transformation
 //===----------------------------------------------------------------------===//
@@ -8637,46 +8881,44 @@
   // transform the designators.
   SmallVector<Expr*, 4> ArrayExprs;
   bool ExprChanged = false;
-  for (DesignatedInitExpr::designators_iterator D = E->designators_begin(),
-                                             DEnd = E->designators_end();
-       D != DEnd; ++D) {
-    if (D->isFieldDesignator()) {
-      Desig.AddDesignator(Designator::getField(D->getFieldName(),
-                                               D->getDotLoc(),
-                                               D->getFieldLoc()));
+  for (const DesignatedInitExpr::Designator &D : E->designators()) {
+    if (D.isFieldDesignator()) {
+      Desig.AddDesignator(Designator::getField(D.getFieldName(),
+                                               D.getDotLoc(),
+                                               D.getFieldLoc()));
       continue;
     }
 
-    if (D->isArrayDesignator()) {
-      ExprResult Index = getDerived().TransformExpr(E->getArrayIndex(*D));
+    if (D.isArrayDesignator()) { // Track whether the index itself changed.
+      ExprResult Index = getDerived().TransformExpr(E->getArrayIndex(D));
       if (Index.isInvalid())
         return ExprError();
 
-      Desig.AddDesignator(Designator::getArray(Index.get(),
-                                               D->getLBracketLoc()));
+      Desig.AddDesignator(
+          Designator::getArray(Index.get(), D.getLBracketLoc()));
 
-      ExprChanged = ExprChanged || Init.get() != E->getArrayIndex(*D);
+      ExprChanged = ExprChanged || Index.get() != E->getArrayIndex(D);
       ArrayExprs.push_back(Index.get());
       continue;
     }
 
-    assert(D->isArrayRangeDesignator() && "New kind of designator?");
+    assert(D.isArrayRangeDesignator() && "New kind of designator?");
     ExprResult Start
-      = getDerived().TransformExpr(E->getArrayRangeStart(*D));
+      = getDerived().TransformExpr(E->getArrayRangeStart(D));
     if (Start.isInvalid())
       return ExprError();
 
-    ExprResult End = getDerived().TransformExpr(E->getArrayRangeEnd(*D));
+    ExprResult End = getDerived().TransformExpr(E->getArrayRangeEnd(D));
     if (End.isInvalid())
       return ExprError();
 
     Desig.AddDesignator(Designator::getArrayRange(Start.get(),
                                                   End.get(),
-                                                  D->getLBracketLoc(),
-                                                  D->getEllipsisLoc()));
+                                                  D.getLBracketLoc(),
+                                                  D.getEllipsisLoc()));
 
-    ExprChanged = ExprChanged || Start.get() != E->getArrayRangeStart(*D) ||
-      End.get() != E->getArrayRangeEnd(*D);
+    ExprChanged = ExprChanged || Start.get() != E->getArrayRangeStart(D) ||
+                  End.get() != E->getArrayRangeEnd(D);
 
     ArrayExprs.push_back(Start.get());
     ArrayExprs.push_back(End.get());
@@ -9824,8 +10066,8 @@
   }
 
   return getDerived().RebuildCXXConstructExpr(T, /*FIXME:*/E->getLocStart(),
-                                              Constructor, E->isElidable(),
-                                              Args,
+                                              Constructor,
+                                              E->isElidable(), Args,
                                               E->hadMultipleCandidates(),
                                               E->isListInitialization(),
                                               E->isStdInitListInitialization(),
@@ -9834,6 +10076,32 @@
                                               E->getParenOrBraceRange());
 }
 
+/// Transform the type and constructor of \p E; reuse \p E if both are kept.
+template<typename Derived>
+ExprResult TreeTransform<Derived>::TransformCXXInheritedCtorInitExpr(
+    CXXInheritedCtorInitExpr *E) {
+  QualType NewType = getDerived().TransformType(E->getType());
+  if (NewType.isNull())
+    return ExprError();
+
+  auto *NewCtor = cast_or_null<CXXConstructorDecl>(
+      getDerived().TransformDecl(E->getLocStart(), E->getConstructor()));
+  if (!NewCtor)
+    return ExprError();
+
+  bool Unchanged = NewType == E->getType() && NewCtor == E->getConstructor();
+  if (!getDerived().AlwaysRebuild() && Unchanged) {
+    // Mark the constructor as referenced.
+    // FIXME: Instantiation-specific
+    SemaRef.MarkFunctionReferenced(E->getLocStart(), NewCtor);
+    return E;
+  }
+
+  return getDerived().RebuildCXXInheritedCtorInitExpr(
+      NewType, E->getLocation(), NewCtor, E->constructsVBase(),
+      E->inheritedFromVBase());
+}
+
 /// \brief Transform a C++ temporary-binding expression.
 ///
 /// Since CXXBindTemporaryExpr nodes are implicitly generated, we just
@@ -9974,7 +10242,9 @@
   CXXMethodDecl *NewCallOperator = getSema().startLambdaDefinition(
       Class, E->getIntroducerRange(), NewCallOpTSI,
       E->getCallOperator()->getLocEnd(),
-      NewCallOpTSI->getTypeLoc().castAs<FunctionProtoTypeLoc>().getParams());
+      NewCallOpTSI->getTypeLoc().castAs<FunctionProtoTypeLoc>().getParams(),
+      E->getCallOperator()->isConstexpr());
+
   LSI->CallOperator = NewCallOperator;
 
   getDerived().transformAttrs(E->getCallOperator(), NewCallOperator);
@@ -10009,7 +10279,9 @@
 
     // Capturing 'this' is trivial.
     if (C->capturesThis()) {
-      getSema().CheckCXXThisCapture(C->getLocation(), C->isExplicit());
+      getSema().CheckCXXThisCapture(C->getLocation(), C->isExplicit(),
+                                    /*BuildAndDiagnose*/ true, nullptr,
+                                    C->getCaptureKind() == LCK_StarThis);
       continue;
     }
     // Captured expression will be recaptured during captured variables
@@ -10860,6 +11132,12 @@
                                       Result.get());
 }
 
+template <typename Derived>
+ExprResult TreeTransform<Derived>::TransformObjCAvailabilityCheckExpr(
+    ObjCAvailabilityCheckExpr *E) {
+  return E; // No transformation is applied; the expression is returned as-is.
+}
+
 template<typename Derived>
 ExprResult
 TreeTransform<Derived>::TransformObjCMessageExpr(ObjCMessageExpr *E) {
@@ -10896,6 +11174,9 @@
   }
   else if (E->getReceiverKind() == ObjCMessageExpr::SuperClass ||
            E->getReceiverKind() == ObjCMessageExpr::SuperInstance) {
+    if (!E->getMethodDecl())
+      return ExprError();
+
     // Build a new class message send to 'super'.
     SmallVector<SourceLocation, 16> SelLocs;
     E->getSelectorLocs(SelLocs);
@@ -11099,13 +11380,10 @@
 
   // Parameter substitution.
   Sema::ExtParameterInfoBuilder extParamInfos;
-  if (getDerived().TransformFunctionTypeParams(E->getCaretLocation(),
-                                               oldBlock->param_begin(),
-                                               oldBlock->param_size(),
-                                               nullptr,
-                             exprFunctionType->getExtParameterInfosOrNull(),
-                                               paramTypes, &params,
-                                               extParamInfos)) {
+  if (getDerived().TransformFunctionTypeParams(
+          E->getCaretLocation(), oldBlock->parameters(), nullptr,
+          exprFunctionType->getExtParameterInfosOrNull(), paramTypes, &params,
+          extParamInfos)) {
     getSema().ActOnBlockError(E->getCaretLocation(), /*Scope=*/nullptr);
     return ExprError();
   }
diff --git a/lib/Sema/TypeLocBuilder.h b/lib/Sema/TypeLocBuilder.h
index 82844b3..3828218 100644
--- a/lib/Sema/TypeLocBuilder.h
+++ b/lib/Sema/TypeLocBuilder.h
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This files defines TypeLocBuilder, a class for building TypeLocs
+//  This file defines TypeLocBuilder, a class for building TypeLocs
 //  bottom-up.
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Serialization/ASTCommon.cpp b/lib/Serialization/ASTCommon.cpp
index 2b78d74..07d0c1c 100644
--- a/lib/Serialization/ASTCommon.cpp
+++ b/lib/Serialization/ASTCommon.cpp
@@ -91,6 +91,9 @@
   case BuiltinType::LongDouble:
     ID = PREDEF_TYPE_LONGDOUBLE_ID;
     break;
+  case BuiltinType::Float128:
+    ID = PREDEF_TYPE_FLOAT128_ID;
+    break;
   case BuiltinType::NullPtr:
     ID = PREDEF_TYPE_NULLPTR_ID;
     break;
@@ -127,42 +130,11 @@
   case BuiltinType::ObjCSel:
     ID = PREDEF_TYPE_OBJC_SEL;
     break;
-  case BuiltinType::OCLImage1d:
-    ID = PREDEF_TYPE_IMAGE1D_ID;
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id: \
+    ID = PREDEF_TYPE_##Id##_ID; \
     break;
-  case BuiltinType::OCLImage1dArray:
-    ID = PREDEF_TYPE_IMAGE1D_ARR_ID;
-    break;
-  case BuiltinType::OCLImage1dBuffer:
-    ID = PREDEF_TYPE_IMAGE1D_BUFF_ID;
-    break;
-  case BuiltinType::OCLImage2d:
-    ID = PREDEF_TYPE_IMAGE2D_ID;
-    break;
-  case BuiltinType::OCLImage2dArray:
-    ID = PREDEF_TYPE_IMAGE2D_ARR_ID;
-    break;
-  case BuiltinType::OCLImage2dDepth:
-    ID = PREDEF_TYPE_IMAGE2D_DEP_ID;
-    break;
-  case BuiltinType::OCLImage2dArrayDepth:
-    ID = PREDEF_TYPE_IMAGE2D_ARR_DEP_ID;
-    break;
-  case BuiltinType::OCLImage2dMSAA:
-    ID = PREDEF_TYPE_IMAGE2D_MSAA_ID;
-    break;
-  case BuiltinType::OCLImage2dArrayMSAA:
-    ID = PREDEF_TYPE_IMAGE2D_ARR_MSAA_ID;
-    break;
-  case BuiltinType::OCLImage2dMSAADepth:
-    ID = PREDEF_TYPE_IMAGE2D_MSAA_DEP_ID;
-    break;
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-    ID = PREDEF_TYPE_IMAGE2D_ARR_MSAA_DEPTH_ID;
-    break;
-  case BuiltinType::OCLImage3d:
-    ID = PREDEF_TYPE_IMAGE3D_ID;
-    break;
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
     ID = PREDEF_TYPE_SAMPLER_ID;
     break;
@@ -286,6 +258,7 @@
   case Decl::CXXDestructor:
   case Decl::CXXConversion:
   case Decl::UsingShadow:
+  case Decl::ConstructorUsingShadow:
   case Decl::Var:
   case Decl::FunctionTemplate:
   case Decl::ClassTemplate:
@@ -319,6 +292,8 @@
   case Decl::ObjCCompatibleAlias:
   case Decl::LinkageSpec:
   case Decl::ObjCPropertyImpl:
+  case Decl::PragmaComment:
+  case Decl::PragmaDetectMismatch:
   case Decl::FileScopeAsm:
   case Decl::AccessSpec:
   case Decl::Friend:
@@ -329,7 +304,11 @@
   case Decl::ClassScopeFunctionSpecialization:
   case Decl::Import:
   case Decl::OMPThreadPrivate:
+  case Decl::OMPCapturedExpr:
+  case Decl::OMPDeclareReduction:
   case Decl::BuiltinTemplate:
+  case Decl::Decomposition:
+  case Decl::Binding:
     return false;
 
   // These indirectly derive from Redeclarable<T> but are not actually
diff --git a/lib/Serialization/ASTCommon.h b/lib/Serialization/ASTCommon.h
index 64f583c..cbc5f04 100644
--- a/lib/Serialization/ASTCommon.h
+++ b/lib/Serialization/ASTCommon.h
@@ -30,6 +30,7 @@
   UPD_CXX_INSTANTIATED_STATIC_DATA_MEMBER,
   UPD_CXX_INSTANTIATED_CLASS_DEFINITION,
   UPD_CXX_INSTANTIATED_DEFAULT_ARGUMENT,
+  UPD_CXX_INSTANTIATED_DEFAULT_MEMBER_INITIALIZER,
   UPD_CXX_RESOLVED_DTOR_DELETE,
   UPD_CXX_RESOLVED_EXCEPTION_SPEC,
   UPD_CXX_DEDUCED_RETURN_TYPE,
@@ -37,6 +38,7 @@
   UPD_MANGLING_NUMBER,
   UPD_STATIC_LOCAL_NUMBER,
   UPD_DECL_MARKED_OPENMP_THREADPRIVATE,
+  UPD_DECL_MARKED_OPENMP_DECLARETARGET,
   UPD_DECL_EXPORTED,
   UPD_ADDED_ATTR_TO_RECORD
 };
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index 6a1ae65..7d04e8c 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -48,6 +48,7 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitcode/BitstreamReader.h"
+#include "llvm/Support/Compression.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -216,8 +217,13 @@
   if (!AllowCompatibleDifferences)                                 \
     ENUM_LANGOPT(Name, Bits, Default, Description)
 
+#define COMPATIBLE_VALUE_LANGOPT(Name, Bits, Default, Description) \
+  if (!AllowCompatibleDifferences)                                 \
+    VALUE_LANGOPT(Name, Bits, Default, Description)
+
 #define BENIGN_LANGOPT(Name, Bits, Default, Description)
 #define BENIGN_ENUM_LANGOPT(Name, Type, Bits, Default, Description)
+#define BENIGN_VALUE_LANGOPT(Name, Type, Bits, Default, Description)
 #include "clang/Basic/LangOptions.def"
 
   if (ExistingLangOpts.ModuleFeatures != LangOpts.ModuleFeatures) {
@@ -768,6 +774,15 @@
   return Reader.getGlobalIdentifierID(F, RawID >> 1);
 }
 
+static void markIdentifierFromAST(ASTReader &Reader, IdentifierInfo &II) {
+  if (!II.isFromAST()) { // No-op when II is already flagged as from an AST.
+    II.setIsFromAST();
+    bool IsModule = Reader.getPreprocessor().getCurrentModule() != nullptr;
+    if (isInterestingIdentifier(Reader, II, IsModule))
+      II.setChangedSinceDeserialization(); // Interesting: flag as changed.
+  }
+}
+
 IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k,
                                                    const unsigned char* d,
                                                    unsigned DataLen) {
@@ -784,12 +799,7 @@
     II = &Reader.getIdentifierTable().getOwn(k);
     KnownII = II;
   }
-  if (!II->isFromAST()) {
-    II->setIsFromAST();
-    bool IsModule = Reader.PP.getCurrentModule() != nullptr;
-    if (isInterestingIdentifier(Reader, *II, IsModule))
-      II->setChangedSinceDeserialization();
-  }
+  markIdentifierFromAST(Reader, *II);
   Reader.markIdentifierUpToDate(II);
 
   IdentID ID = Reader.getGlobalIdentifierID(F, RawID);
@@ -1199,6 +1209,32 @@
     return true;
   }
 
+  // Local helper: read the buffer blob (raw or zlib-compressed) following the
+  // entry record; reports an error and returns null on failure.
+  auto ReadBuffer = [this](
+      BitstreamCursor &SLocEntryCursor,
+      StringRef Name) -> std::unique_ptr<llvm::MemoryBuffer> {
+    RecordData Record;
+    StringRef Blob;
+    unsigned Code = SLocEntryCursor.ReadCode();
+    unsigned RecCode = SLocEntryCursor.readRecord(Code, Record, &Blob);
+
+    if (RecCode == SM_SLOC_BUFFER_BLOB_COMPRESSED) {
+      SmallString<0> Uncompressed; // Record[0] is presumably its size; confirm.
+      if (llvm::zlib::uncompress(Blob, Uncompressed, Record[0]) !=
+          llvm::zlib::StatusOK) {
+        Error("could not decompress embedded file contents");
+        return nullptr;
+      }
+      return llvm::MemoryBuffer::getMemBufferCopy(Uncompressed, Name);
+    } else if (RecCode == SM_SLOC_BUFFER_BLOB) { // Uncompressed blob.
+      return llvm::MemoryBuffer::getMemBuffer(Blob.drop_back(1), Name, true);
+    } else { // Any other record code is rejected.
+      Error("AST record has invalid code");
+      return nullptr;
+    }
+  };
+
   ModuleFile *F = GlobalSLocEntryMap.find(-ID)->second;
   F->SLocEntryCursor.JumpToBit(F->SLocEntryOffsets[ID - F->SLocEntryBaseID]);
   BitstreamCursor &SLocEntryCursor = F->SLocEntryCursor;
@@ -1254,24 +1290,16 @@
       FileDeclIDs[FID] = FileDeclsInfo(F, llvm::makeArrayRef(FirstDecl,
                                                              NumFileDecls));
     }
-    
+
     const SrcMgr::ContentCache *ContentCache
       = SourceMgr.getOrCreateContentCache(File,
                               /*isSystemFile=*/FileCharacter != SrcMgr::C_User);
     if (OverriddenBuffer && !ContentCache->BufferOverridden &&
         ContentCache->ContentsEntry == ContentCache->OrigEntry &&
         !ContentCache->getRawBuffer()) {
-      unsigned Code = SLocEntryCursor.ReadCode();
-      Record.clear();
-      unsigned RecCode = SLocEntryCursor.readRecord(Code, Record, &Blob);
-      
-      if (RecCode != SM_SLOC_BUFFER_BLOB) {
-        Error("AST record has invalid code");
+      auto Buffer = ReadBuffer(SLocEntryCursor, File->getName());
+      if (!Buffer)
         return true;
-      }
-      
-      std::unique_ptr<llvm::MemoryBuffer> Buffer
-        = llvm::MemoryBuffer::getMemBuffer(Blob.drop_back(1), File->getName());
       SourceMgr.overrideFileContents(File, std::move(Buffer));
     }
 
@@ -1284,21 +1312,14 @@
     SrcMgr::CharacteristicKind
       FileCharacter = (SrcMgr::CharacteristicKind)Record[2];
     SourceLocation IncludeLoc = ReadSourceLocation(*F, Record[1]);
-    if (IncludeLoc.isInvalid() && F->isModule()) {
+    if (IncludeLoc.isInvalid() &&
+        (F->Kind == MK_ImplicitModule || F->Kind == MK_ExplicitModule)) {
       IncludeLoc = getImportLocation(F);
     }
-    unsigned Code = SLocEntryCursor.ReadCode();
-    Record.clear();
-    unsigned RecCode
-      = SLocEntryCursor.readRecord(Code, Record, &Blob);
 
-    if (RecCode != SM_SLOC_BUFFER_BLOB) {
-      Error("AST record has invalid code");
+    auto Buffer = ReadBuffer(SLocEntryCursor, Name);
+    if (!Buffer)
       return true;
-    }
-
-    std::unique_ptr<llvm::MemoryBuffer> Buffer =
-        llvm::MemoryBuffer::getMemBuffer(Blob.drop_back(1), Name);
     SourceMgr.createFileID(std::move(Buffer), FileCharacter, ID,
                            BaseOffset + Offset, IncludeLoc);
     break;
@@ -1330,7 +1351,7 @@
 
   // Find which module file this entry lands in.
   ModuleFile *M = GlobalSLocEntryMap.find(-ID)->second;
-  if (!M->isModule())
+  if (M->Kind != MK_ImplicitModule && M->Kind != MK_ExplicitModule)
     return std::make_pair(SourceLocation(), "");
 
   // FIXME: Can we map this down to a particular submodule? That would be
@@ -1840,7 +1861,7 @@
 
   // Don't read the directive history for a module; we don't have anywhere
   // to put it.
-  if (M.isModule())
+  if (M.Kind == MK_ImplicitModule || M.Kind == MK_ExplicitModule)
     return;
 
   // Deserialize the macro directives history in reverse source-order.
@@ -1989,17 +2010,8 @@
   // For an overridden file, there is nothing to validate.
   if (!Overridden && //
       (StoredSize != File->getSize() ||
-#if defined(LLVM_ON_WIN32)
-       false
-#else
-       // In our regression testing, the Windows file system seems to
-       // have inconsistent modification times that sometimes
-       // erroneously trigger this error-handling path.
-       //
-       // FIXME: This probably also breaks HeaderFileInfo lookups on Windows.
        (StoredTime && StoredTime != File->getModificationTime() &&
         !DisableValidation)
-#endif
        )) {
     if (Complain) {
       // Build a list of the PCH imports that got us here (in reverse).
@@ -2182,8 +2194,7 @@
       // All user input files reside at the index range [0, NumUserInputs), and
       // system input files reside at [NumUserInputs, NumInputs). For explicitly
       // loaded module files, ignore missing inputs.
-      if (!DisableValidation && F.Kind != MK_ExplicitModule &&
-          F.Kind != MK_PrebuiltModule) {
+      if (!DisableValidation && F.Kind != MK_ExplicitModule) {
         bool Complain = (ClientLoadCapabilities & ARR_OutOfDate) == 0;
 
         // If we are reading a module, we will create a verification timestamp,
@@ -2214,8 +2225,7 @@
           bool IsSystem = I >= NumUserInputs;
           InputFileInfo FI = readInputFileInfo(F, I+1);
           Listener->visitInputFile(FI.Filename, IsSystem, FI.Overridden,
-                                   F.Kind == MK_ExplicitModule ||
-                                   F.Kind == MK_PrebuiltModule);
+                                   F.Kind == MK_ExplicitModule);
         }
       }
 
@@ -2245,7 +2255,7 @@
           //
           // FIXME: Allow this for files explicitly specified with -include-pch.
           bool AllowCompatibleConfigurationMismatch =
-              F.Kind == MK_ExplicitModule || F.Kind == MK_PrebuiltModule;
+              F.Kind == MK_ExplicitModule;
           const HeaderSearchOptions &HSOpts =
               PP.getHeaderSearchInfo().getHeaderSearchOpts();
 
@@ -2339,9 +2349,9 @@
         ModuleKind ImportedKind = (ModuleKind)Record[Idx++];
         // The import location will be the local one for now; we will adjust
         // all import locations of module imports after the global source
-        // location info are setup.
+        // location info are setup, in ReadAST.
         SourceLocation ImportLoc =
-            SourceLocation::getFromRawEncoding(Record[Idx++]);
+            ReadUntranslatedSourceLocation(Record[Idx++]);
         off_t StoredSize = (off_t)Record[Idx++];
         time_t StoredModTime = (time_t)Record[Idx++];
         ASTFileSignature StoredSignature = Record[Idx++];
@@ -2407,7 +2417,7 @@
       if (M && M->Directory) {
         // If we're implicitly loading a module, the base directory can't
         // change between the build and use.
-        if (F.Kind != MK_ExplicitModule && F.Kind != MK_PrebuiltModule) {
+        if (F.Kind != MK_ExplicitModule) {
           const DirectoryEntry *BuildDir =
               PP.getFileManager().getDirectory(Blob);
           if (!BuildDir || BuildDir != M->Directory) {
@@ -3031,17 +3041,6 @@
       break;
     }
 
-    case DECL_REPLACEMENTS: {
-      if (Record.size() % 3 != 0) {
-        Error("invalid DECL_REPLACEMENTS block in AST file");
-        return Failure;
-      }
-      for (unsigned I = 0, N = Record.size(); I != N; I += 3)
-        ReplacedDecls[getGlobalDeclID(F, Record[I])]
-          = ReplacedDeclInfo(&F, Record[I+1], Record[I+2]);
-      break;
-    }
-
     case OBJC_CATEGORIES_MAP: {
       if (F.LocalNumObjCCategoriesInMap != 0) {
         Error("duplicate OBJC_CATEGORIES_MAP record in AST file");
@@ -3057,28 +3056,6 @@
       F.ObjCCategories.swap(Record);
       break;
 
-    case CXX_BASE_SPECIFIER_OFFSETS: {
-      if (F.LocalNumCXXBaseSpecifiers != 0) {
-        Error("duplicate CXX_BASE_SPECIFIER_OFFSETS record in AST file");
-        return Failure;
-      }
-
-      F.LocalNumCXXBaseSpecifiers = Record[0];
-      F.CXXBaseSpecifiersOffsets = (const uint32_t *)Blob.data();
-      break;
-    }
-
-    case CXX_CTOR_INITIALIZERS_OFFSETS: {
-      if (F.LocalNumCXXCtorInitializers != 0) {
-        Error("duplicate CXX_CTOR_INITIALIZERS_OFFSETS record in AST file");
-        return Failure;
-      }
-
-      F.LocalNumCXXCtorInitializers = Record[0];
-      F.CXXCtorInitializersOffsets = (const uint32_t *)Blob.data();
-      break;
-    }
-
     case DIAG_PRAGMA_MAPPINGS:
       if (F.PragmaDiagMappings.empty())
         F.PragmaDiagMappings.swap(Record);
@@ -3164,7 +3141,7 @@
       break;
 
     case IMPORTED_MODULES: {
-      if (!F.isModule()) {
+      if (F.Kind != MK_ImplicitModule && F.Kind != MK_ExplicitModule) {
         // If we aren't loading a module (which has its own exports), make
         // all of the imported modules visible.
         // FIXME: Deal with macros-only imports.
@@ -3215,6 +3192,23 @@
       OptimizeOffPragmaLocation = ReadSourceLocation(F, Record[0]);
       break;
 
+    case MSSTRUCT_PRAGMA_OPTIONS:
+      if (Record.size() != 1) {
+        Error("invalid pragma ms_struct record");
+        return Failure;
+      }
+      PragmaMSStructState = Record[0];
+      break;
+
+    case POINTERS_TO_MEMBERS_PRAGMA_OPTIONS: // Record: [state, pragma loc]
+      if (Record.size() != 2) {
+        Error("invalid pragma pointers to members record");
+        return Failure;
+      }
+      PragmaMSPointersToMembersState = Record[0];
+      PointersToMembersPragmaLocation = ReadSourceLocation(F, Record[1]);
+      break;
+
     case UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES:
       for (unsigned I = 0, N = Record.size(); I != N; ++I)
         UnusedLocalTypedefNameCandidates.push_back(
@@ -3231,7 +3225,7 @@
   unsigned Idx = 0;
   F.ModuleMapPath = ReadPath(F, Record, Idx);
 
-  if (F.Kind == MK_ExplicitModule || F.Kind == MK_PrebuiltModule) {
+  if (F.Kind == MK_ExplicitModule) {
     // For an explicitly-loaded module, we don't care whether the original
     // module map file exists or matches.
     return Success;
@@ -3481,7 +3475,7 @@
   }
 }
 
-ASTReader::ASTReadResult ASTReader::ReadAST(const std::string &FileName,
+ASTReader::ASTReadResult ASTReader::ReadAST(StringRef FileName,
                                             ModuleKind Type,
                                             SourceLocation ImportLoc,
                                             unsigned ClientLoadCapabilities) {
@@ -3574,12 +3568,7 @@
 
       // Mark this identifier as being from an AST file so that we can track
       // whether we need to serialize it.
-      if (!II.isFromAST()) {
-        II.setIsFromAST();
-        bool IsModule = PP.getCurrentModule() != nullptr;
-        if (isInterestingIdentifier(*this, II, IsModule))
-          II.setChangedSinceDeserialization();
-      }
+      markIdentifierFromAST(*this, II);
 
       // Associate the ID with the identifier so that the writer can reuse it.
       auto ID = Trait.ReadIdentifierID(Data + KeyDataLen.first);
@@ -3598,16 +3587,16 @@
 
     // Set the import location.
     F.DirectImportLoc = ImportLoc;
+    // FIXME: We assume that locations from PCH / preamble do not need
+    // any translation.
     if (!M->ImportedBy)
       F.ImportLoc = M->ImportLoc;
     else
-      F.ImportLoc = ReadSourceLocation(*M->ImportedBy,
-                                       M->ImportLoc.getRawEncoding());
+      F.ImportLoc = TranslateSourceLocation(*M->ImportedBy, M->ImportLoc);
   }
 
   if (!Context.getLangOpts().CPlusPlus ||
-      (Type != MK_ImplicitModule && Type != MK_ExplicitModule &&
-       Type != MK_PrebuiltModule)) {
+      (Type != MK_ImplicitModule && Type != MK_ExplicitModule)) {
     // Mark all of the identifiers in the identifier table as being out of date,
     // so that various accessors know to check the loaded modules when the
     // identifier is used.
@@ -3724,7 +3713,6 @@
     return 0; // PCH
   case MK_ImplicitModule:
   case MK_ExplicitModule:
-  case MK_PrebuiltModule:
     return 1; // module
   case MK_MainFile:
   case MK_Preamble:
@@ -3830,8 +3818,7 @@
         //
         // FIXME: Should we also perform the converse check? Loading a module as
         // a PCH file sort of works, but it's a bit wonky.
-        if ((Type == MK_ImplicitModule || Type == MK_ExplicitModule ||
-             Type == MK_PrebuiltModule) &&
+        if ((Type == MK_ImplicitModule || Type == MK_ExplicitModule) &&
             F.ModuleName.empty()) {
           auto Result = (Type == MK_ImplicitModule) ? OutOfDate : Failure;
           if (Result != OutOfDate ||
@@ -4547,14 +4534,25 @@
       
       SubmodulesLoaded[GlobalIndex] = CurrentModule;
 
-      // Clear out data that will be replaced by what is the module file.
+      // Clear out data that will be replaced by what is in the module file.
       CurrentModule->LinkLibraries.clear();
       CurrentModule->ConfigMacros.clear();
       CurrentModule->UnresolvedConflicts.clear();
       CurrentModule->Conflicts.clear();
+
+      // The module is available unless it's missing a requirement; relevant
+      // requirements will be (re-)added by SUBMODULE_REQUIRES records.
+      // Missing headers that were present when the module was built do not
+      // make it unavailable -- if we got this far, this must be an explicitly
+      // imported module file.
+      CurrentModule->Requirements.clear();
+      CurrentModule->MissingHeaders.clear();
+      CurrentModule->IsMissingRequirement =
+          ParentModule && ParentModule->IsMissingRequirement;
+      CurrentModule->IsAvailable = !CurrentModule->IsMissingRequirement;
       break;
     }
-        
+
     case SUBMODULE_UMBRELLA_HEADER: {
       std::string Filename = Blob;
       ResolveImportedPath(F, Filename);
@@ -4683,6 +4681,13 @@
       UnresolvedModuleRefs.push_back(Unresolved);
       break;
     }
+
+    case SUBMODULE_INITIALIZERS:
+      SmallVector<uint32_t, 16> Inits;
+      for (auto &ID : Record)
+        Inits.push_back(getGlobalDeclID(F, ID));
+      Context.addLazyModuleInitializers(CurrentModule, Inits);
+      break;
     }
   }
 }
@@ -4904,8 +4909,8 @@
     return nullptr;
 
   // Read the record.
-  SourceRange Range(ReadSourceLocation(M, PPOffs.Begin),
-                    ReadSourceLocation(M, PPOffs.End));
+  SourceRange Range(TranslateSourceLocation(M, PPOffs.getBegin()),
+                    TranslateSourceLocation(M, PPOffs.getEnd()));
   PreprocessingRecord &PPRec = *PP.getPreprocessingRecord();
   StringRef Blob;
   RecordData Record;
@@ -4989,7 +4994,6 @@
 
 namespace {
 
-template <unsigned PPEntityOffset::*PPLoc>
 struct PPEntityComp {
   const ASTReader &Reader;
   ModuleFile &M;
@@ -5013,7 +5017,7 @@
   }
 
   SourceLocation getLoc(const PPEntityOffset &PPE) const {
-    return Reader.ReadSourceLocation(M, PPE.*PPLoc);
+    return Reader.TranslateSourceLocation(M, PPE.getBegin());
   }
 };
 
@@ -5044,7 +5048,7 @@
 
   if (EndsAfter) {
     PPI = std::upper_bound(pp_begin, pp_end, Loc,
-                           PPEntityComp<&PPEntityOffset::Begin>(*this, M));
+                           PPEntityComp(*this, M));
   } else {
     // Do a binary search manually instead of using std::lower_bound because
     // The end locations of entities may be unordered (when a macro expansion
@@ -5054,8 +5058,8 @@
       Half = Count / 2;
       PPI = First;
       std::advance(PPI, Half);
-      if (SourceMgr.isBeforeInTranslationUnit(ReadSourceLocation(M, PPI->End),
-                                              Loc)) {
+      if (SourceMgr.isBeforeInTranslationUnit(
+              TranslateSourceLocation(M, PPI->getEnd()), Loc)) {
         First = PPI;
         ++First;
         Count = Count - Half - 1;
@@ -5096,7 +5100,7 @@
   unsigned LocalIndex = PPInfo.second;
   const PPEntityOffset &PPOffs = M.PreprocessedEntityOffsets[LocalIndex];
   
-  SourceLocation Loc = ReadSourceLocation(M, PPOffs.Begin);
+  SourceLocation Loc = TranslateSourceLocation(M, PPOffs.getBegin());
   if (Loc.isInvalid())
     return false;
   
@@ -5631,7 +5635,7 @@
     while (NumArgs--)
       Args.push_back(ReadTemplateArgument(*Loc.F, Record, Idx));
     return Context.getDependentTemplateSpecializationType(Keyword, NNS, Name,
-                                                      Args.size(), Args.data());
+                                                          Args);
   }
 
   case TYPE_DEPENDENT_SIZED_ARRAY: {
@@ -5660,11 +5664,9 @@
     QualType Underlying = readType(*Loc.F, Record, Idx);
     QualType T;
     if (Underlying.isNull())
-      T = Context.getCanonicalTemplateSpecializationType(Name, Args.data(),
-                                                          Args.size());
+      T = Context.getCanonicalTemplateSpecializationType(Name, Args);
     else
-      T = Context.getTemplateSpecializationType(Name, Args.data(),
-                                                 Args.size(), Underlying);
+      T = Context.getTemplateSpecializationType(Name, Args, Underlying);
     const_cast<Type*>(T.getTypePtr())->setDependent(IsDependent);
     return T;
   }
@@ -6050,6 +6052,9 @@
     case PREDEF_TYPE_LONGDOUBLE_ID:
       T = Context.LongDoubleTy;
       break;
+    case PREDEF_TYPE_FLOAT128_ID:
+      T = Context.Float128Ty;
+      break;
     case PREDEF_TYPE_OVERLOAD_ID:
       T = Context.OverloadTy;
       break;
@@ -6083,42 +6088,11 @@
     case PREDEF_TYPE_OBJC_SEL:
       T = Context.ObjCBuiltinSelTy;
       break;
-    case PREDEF_TYPE_IMAGE1D_ID:
-      T = Context.OCLImage1dTy;
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+    case PREDEF_TYPE_##Id##_ID: \
+      T = Context.SingletonId; \
       break;
-    case PREDEF_TYPE_IMAGE1D_ARR_ID:
-      T = Context.OCLImage1dArrayTy;
-      break;
-    case PREDEF_TYPE_IMAGE1D_BUFF_ID:
-      T = Context.OCLImage1dBufferTy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_ID:
-      T = Context.OCLImage2dTy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_ARR_ID:
-      T = Context.OCLImage2dArrayTy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_DEP_ID:
-      T = Context.OCLImage2dDepthTy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_ARR_DEP_ID:
-      T = Context.OCLImage2dArrayDepthTy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_MSAA_ID:
-      T = Context.OCLImage2dMSAATy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_ARR_MSAA_ID:
-      T = Context.OCLImage2dArrayMSAATy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_MSAA_DEP_ID:
-      T = Context.OCLImage2dMSAADepthTy;
-      break;
-    case PREDEF_TYPE_IMAGE2D_ARR_MSAA_DEPTH_ID:
-      T = Context.OCLImage2dArrayMSAADepthTy;
-      break;
-    case PREDEF_TYPE_IMAGE3D_ID:
-      T = Context.OCLImage3dTy;
-      break;
+#include "clang/Basic/OpenCLImageTypes.def"
     case PREDEF_TYPE_SAMPLER_ID:
       T = Context.OCLSamplerTy;
       break;
@@ -6322,18 +6296,6 @@
   }
 }
 
-uint64_t ASTReader::ReadCXXCtorInitializersRef(ModuleFile &M,
-                                               const RecordData &Record,
-                                               unsigned &Idx) {
-  if (Idx >= Record.size() || Record[Idx] > M.LocalNumCXXCtorInitializers) {
-    Error("malformed AST file: missing C++ ctor initializers");
-    return 0;
-  }
-
-  unsigned LocalID = Record[Idx++];
-  return getGlobalBitOffset(M, M.CXXCtorInitializersOffsets[LocalID - 1]);
-}
-
 CXXCtorInitializer **
 ASTReader::GetExternalCXXCtorInitializers(uint64_t Offset) {
   RecordLocation Loc = getLocalBitOffset(Offset);
@@ -6354,18 +6316,6 @@
   return ReadCXXCtorInitializers(*Loc.F, Record, Idx);
 }
 
-uint64_t ASTReader::readCXXBaseSpecifiers(ModuleFile &M,
-                                          const RecordData &Record,
-                                          unsigned &Idx) {
-  if (Idx >= Record.size() || Record[Idx] > M.LocalNumCXXBaseSpecifiers) {
-    Error("malformed AST file: missing C++ base specifier");
-    return 0;
-  }
-
-  unsigned LocalID = Record[Idx++];
-  return getGlobalBitOffset(M, M.CXXBaseSpecifiersOffsets[LocalID - 1]);
-}
-
 CXXBaseSpecifier *ASTReader::GetExternalCXXBaseSpecifiers(uint64_t Offset) {
   RecordLocation Loc = getLocalBitOffset(Offset);
   BitstreamCursor &Cursor = Loc.F->DeclsCursor;
@@ -6433,9 +6383,9 @@
   if (Decl *D = DeclsLoaded[Index])
     return D->getLocation();
 
-  unsigned RawLocation = 0;
-  RecordLocation Rec = DeclCursorForID(ID, RawLocation);
-  return ReadSourceLocation(*Rec.F, RawLocation);
+  SourceLocation Loc;
+  DeclCursorForID(ID, Loc);
+  return Loc;
 }
 
 static Decl *getPredefinedDecl(ASTContext &Context, PredefinedDeclIDs ID) {
@@ -6487,6 +6437,9 @@
 
   case PREDEF_DECL_CF_CONSTANT_STRING_TAG_ID:
     return Context.getCFConstantStringTagDecl();
+
+  case PREDEF_DECL_TYPE_PACK_ELEMENT_ID:
+    return Context.getTypePackElementDecl();
   }
   llvm_unreachable("PredefinedDeclIDs unknown enum value");
 }
@@ -7011,10 +6964,18 @@
     SemaDeclRefs.clear();
   }
 
-  // Update the state of 'pragma clang optimize'. Use the same API as if we had
-  // encountered the pragma in the source.
+  // Update the state of pragmas. Use the same API as if we had encountered the
+  // pragma in the source.
   if(OptimizeOffPragmaLocation.isValid())
     SemaObj->ActOnPragmaOptimize(/* IsOn = */ false, OptimizeOffPragmaLocation);
+  if (PragmaMSStructState != -1)
+    SemaObj->ActOnPragmaMSStruct((PragmaMSStructKind)PragmaMSStructState);
+  if (PointersToMembersPragmaLocation.isValid()) {
+    SemaObj->ActOnPragmaMSPointersToMembers(
+        (LangOptions::PragmaMSPointersToMembersKind)
+            PragmaMSPointersToMembersState,
+        PointersToMembersPragmaLocation);
+  }
 }
 
 IdentifierInfo *ASTReader::get(StringRef Name) {
@@ -7535,10 +7496,11 @@
     const unsigned char *StrLenPtr = (const unsigned char*) Str - 2;
     unsigned StrLen = (((unsigned) StrLenPtr[0])
                        | (((unsigned) StrLenPtr[1]) << 8)) - 1;
-    IdentifiersLoaded[ID]
-      = &PP.getIdentifierTable().get(StringRef(Str, StrLen));
+    auto &II = PP.getIdentifierTable().get(StringRef(Str, StrLen));
+    IdentifiersLoaded[ID] = &II;
+    markIdentifierFromAST(*this,  II);
     if (DeserializationListener)
-      DeserializationListener->IdentifierRead(ID + 1, IdentifiersLoaded[ID]);
+      DeserializationListener->IdentifierRead(ID + 1, &II);
   }
 
   return IdentifiersLoaded[ID];
@@ -7945,9 +7907,10 @@
   while (NumParams--)
     Params.push_back(ReadDeclAs<NamedDecl>(F, Record, Idx));
 
+  // TODO: Concepts
   TemplateParameterList* TemplateParams =
     TemplateParameterList::Create(Context, TemplateLoc, LAngleLoc,
-                                  Params, RAngleLoc);
+                                  Params, RAngleLoc, nullptr);
   return TemplateParams;
 }
 
@@ -8403,14 +8366,16 @@
       for (unsigned IDIdx = 0, NumIDs = GlobalIDs.size(); IDIdx != NumIDs;
            ++IDIdx) {
         const PendingMacroInfo &Info = GlobalIDs[IDIdx];
-        if (!Info.M->isModule())
+        if (Info.M->Kind != MK_ImplicitModule &&
+            Info.M->Kind != MK_ExplicitModule)
           resolvePendingMacro(II, Info);
       }
       // Handle module imports.
       for (unsigned IDIdx = 0, NumIDs = GlobalIDs.size(); IDIdx != NumIDs;
            ++IDIdx) {
         const PendingMacroInfo &Info = GlobalIDs[IDIdx];
-        if (Info.M->isModule())
+        if (Info.M->Kind == MK_ImplicitModule ||
+            Info.M->Kind == MK_ExplicitModule)
           resolvePendingMacro(II, Info);
       }
     }
@@ -8688,6 +8653,7 @@
       auto Updates = std::move(PendingExceptionSpecUpdates);
       PendingExceptionSpecUpdates.clear();
       for (auto Update : Updates) {
+        ProcessingUpdatesRAIIObj ProcessingUpdates(*this);
         auto *FPT = Update.second->getType()->castAs<FunctionProtoType>();
         auto ESI = FPT->getExtProtoInfo().ExceptionSpec;
         if (auto *Listener = Context.getASTMutationListener())
@@ -8751,11 +8717,14 @@
       Consumer(nullptr), ModuleMgr(PP.getFileManager(), PCHContainerRdr),
       DummyIdResolver(PP),
       ReadTimer(std::move(ReadTimer)),
+      PragmaMSStructState(-1),
+      PragmaMSPointersToMembersState(-1),
       isysroot(isysroot), DisableValidation(DisableValidation),
       AllowASTWithCompilerErrors(AllowASTWithCompilerErrors),
       AllowConfigurationMismatch(AllowConfigurationMismatch),
       ValidateSystemInputs(ValidateSystemInputs),
       UseGlobalIndex(UseGlobalIndex), TriedLoadingGlobalIndex(false),
+      ProcessingUpdateRecords(false),
       CurrSwitchCaseStmts(&SwitchCaseStmts), NumSLocEntriesRead(0),
       TotalNumSLocEntries(0), NumStatementsRead(0), TotalNumStatements(0),
       NumMacrosRead(0), TotalNumMacros(0), NumIdentifierLookups(0),
diff --git a/lib/Serialization/ASTReaderDecl.cpp b/lib/Serialization/ASTReaderDecl.cpp
index 243db02..7ca117f 100644
--- a/lib/Serialization/ASTReaderDecl.cpp
+++ b/lib/Serialization/ASTReaderDecl.cpp
@@ -12,10 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Serialization/ASTReader.h"
 #include "ASTCommon.h"
 #include "ASTReaderInternals.h"
-#include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclGroup.h"
@@ -24,6 +22,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/Sema/IdentifierResolver.h"
 #include "clang/Sema/SemaDiagnostic.h"
+#include "clang/Serialization/ASTReader.h"
 #include "llvm/Support/SaveAndRestore.h"
 
 using namespace clang;
@@ -37,8 +36,9 @@
   class ASTDeclReader : public DeclVisitor<ASTDeclReader, void> {
     ASTReader &Reader;
     ModuleFile &F;
+    uint64_t Offset;
     const DeclID ThisDeclID;
-    const unsigned RawLocation;
+    const SourceLocation ThisDeclLoc;
     typedef ASTReader::RecordData RecordData;
     const RecordData &Record;
     unsigned &Idx;
@@ -46,27 +46,47 @@
     unsigned AnonymousDeclNumber;
     GlobalDeclID NamedDeclForTagDecl;
     IdentifierInfo *TypedefNameForLinkage;
-    
+
     bool HasPendingBody;
 
+    /// \brief Records whether the declaration being read is marked as used.
+    /// We use it to delay marking the canonical decl as used until the
+    /// entire declaration is deserialized and merged.
+    bool IsDeclMarkedUsed;
+
     uint64_t GetCurrentCursorOffset();
-    
+
+    uint64_t ReadLocalOffset(const RecordData &R, unsigned &I) {
+      uint64_t LocalOffset = R[I++];
+      assert(LocalOffset < Offset && "offset point after current record");
+      return LocalOffset ? Offset - LocalOffset : 0;
+    }
+
+    uint64_t ReadGlobalOffset(ModuleFile &F, const RecordData &R, unsigned &I) {
+      uint64_t Local = ReadLocalOffset(R, I);
+      return Local ? Reader.getGlobalBitOffset(F, Local) : 0;
+    }
+
     SourceLocation ReadSourceLocation(const RecordData &R, unsigned &I) {
       return Reader.ReadSourceLocation(F, R, I);
     }
-    
+
     SourceRange ReadSourceRange(const RecordData &R, unsigned &I) {
       return Reader.ReadSourceRange(F, R, I);
     }
-    
+
     TypeSourceInfo *GetTypeSourceInfo(const RecordData &R, unsigned &I) {
       return Reader.GetTypeSourceInfo(F, R, I);
     }
-    
+
     serialization::DeclID ReadDeclID(const RecordData &R, unsigned &I) {
       return Reader.ReadDeclID(F, R, I);
     }
 
+    std::string ReadString(const RecordData &R, unsigned &I) {
+      return Reader.ReadString(R, I);
+    }
+
     void ReadDeclIDList(SmallVectorImpl<DeclID> &IDs) {
       for (unsigned I = 0, Size = Record[Idx++]; I != Size; ++I)
         IDs.push_back(ReadDeclID(Record, Idx));
@@ -150,12 +170,12 @@
       ASTReader &Reader;
       NamedDecl *New;
       NamedDecl *Existing;
-      mutable bool AddResult;
+      bool AddResult;
 
       unsigned AnonymousDeclNumber;
       IdentifierInfo *TypedefNameForLinkage;
 
-      void operator=(FindExistingResult&) = delete;
+      void operator=(FindExistingResult &&) = delete;
 
     public:
       FindExistingResult(ASTReader &Reader)
@@ -169,7 +189,7 @@
             AnonymousDeclNumber(AnonymousDeclNumber),
             TypedefNameForLinkage(TypedefNameForLinkage) {}
 
-      FindExistingResult(const FindExistingResult &Other)
+      FindExistingResult(FindExistingResult &&Other)
           : Reader(Other.Reader), New(Other.New), Existing(Other.Existing),
             AddResult(Other.AddResult),
             AnonymousDeclNumber(Other.AnonymousDeclNumber),
@@ -194,12 +214,14 @@
     FindExistingResult findExisting(NamedDecl *D);
 
   public:
-    ASTDeclReader(ASTReader &Reader, ModuleFile &F, DeclID thisDeclID,
-                  unsigned RawLocation, const RecordData &Record, unsigned &Idx)
-        : Reader(Reader), F(F), ThisDeclID(thisDeclID),
-          RawLocation(RawLocation), Record(Record), Idx(Idx),
+    ASTDeclReader(ASTReader &Reader, ASTReader::RecordLocation Loc,
+                  DeclID thisDeclID, SourceLocation ThisDeclLoc,
+                  const RecordData &Record, unsigned &Idx)
+        : Reader(Reader), F(*Loc.F), Offset(Loc.Offset), ThisDeclID(thisDeclID),
+          ThisDeclLoc(ThisDeclLoc), Record(Record), Idx(Idx),
           TypeIDForTypeDecl(0), NamedDeclForTagDecl(0),
-          TypedefNameForLinkage(nullptr), HasPendingBody(false) {}
+          TypedefNameForLinkage(nullptr), HasPendingBody(false),
+          IsDeclMarkedUsed(false) {}
 
     template <typename DeclT>
     static Decl *getMostRecentDeclImpl(Redeclarable<DeclT> *D);
@@ -237,6 +259,8 @@
     }
 
     void VisitDecl(Decl *D);
+    void VisitPragmaCommentDecl(PragmaCommentDecl *D);
+    void VisitPragmaDetectMismatchDecl(PragmaDetectMismatchDecl *D);
     void VisitTranslationUnitDecl(TranslationUnitDecl *TU);
     void VisitNamedDecl(NamedDecl *ND);
     void VisitLabelDecl(LabelDecl *LD);
@@ -288,6 +312,8 @@
     void VisitVarDecl(VarDecl *VD) { VisitVarDeclImpl(VD); }
     void VisitImplicitParamDecl(ImplicitParamDecl *PD);
     void VisitParmVarDecl(ParmVarDecl *PD);
+    void VisitDecompositionDecl(DecompositionDecl *DD);
+    void VisitBindingDecl(BindingDecl *BD);
     void VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D);
     DeclID VisitTemplateDecl(TemplateDecl *D);
     RedeclarableResult VisitRedeclarableTemplateDecl(RedeclarableTemplateDecl *D);
@@ -299,6 +325,7 @@
     void VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *D);
     void VisitUsingDecl(UsingDecl *D);
     void VisitUsingShadowDecl(UsingShadowDecl *D);
+    void VisitConstructorUsingShadowDecl(ConstructorUsingShadowDecl *D);
     void VisitLinkageSpecDecl(LinkageSpecDecl *D);
     void VisitFileScopeAsmDecl(FileScopeAsmDecl *AD);
     void VisitImportDecl(ImportDecl *D);
@@ -349,6 +376,8 @@
     void VisitObjCPropertyDecl(ObjCPropertyDecl *D);
     void VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D);
     void VisitOMPThreadPrivateDecl(OMPThreadPrivateDecl *D);
+    void VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D);
+    void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
 
     /// We've merged the definition \p MergedDef into the existing definition
     /// \p Def. Ensure that \p Def is made visible whenever \p MergedDef is made
@@ -409,8 +438,9 @@
 };
 } // end anonymous namespace
 
-template<typename DeclT>
-llvm::iterator_range<MergedRedeclIterator<DeclT>> merged_redecls(DeclT *D) {
+template <typename DeclT>
+static llvm::iterator_range<MergedRedeclIterator<DeclT>>
+merged_redecls(DeclT *D) {
   return llvm::make_range(MergedRedeclIterator<DeclT>(D),
                           MergedRedeclIterator<DeclT>());
 }
@@ -422,6 +452,11 @@
 void ASTDeclReader::Visit(Decl *D) {
   DeclVisitor<ASTDeclReader, void>::Visit(D);
 
+  // At this point we have deserialized and merged the decl and it is safe to
+  // update its canonical decl to signal that the entire entity is used.
+  D->getCanonicalDecl()->Used |= IsDeclMarkedUsed;
+  IsDeclMarkedUsed = false;
+
   if (DeclaratorDecl *DD = dyn_cast<DeclaratorDecl>(D)) {
     if (DD->DeclInfo) {
       DeclaratorDecl::ExtInfo *Info =
@@ -455,8 +490,7 @@
       if (auto *CD = dyn_cast<CXXConstructorDecl>(FD)) {
         CD->NumCtorInitializers = Record[Idx++];
         if (CD->NumCtorInitializers)
-          CD->CtorInitializers =
-              Reader.ReadCXXCtorInitializersRef(F, Record, Idx);
+          CD->CtorInitializers = ReadGlobalOffset(F, Record, Idx);
       }
       Reader.PendingBodies[FD] = GetCurrentCursorOffset();
       HasPendingBody = true;
@@ -492,7 +526,7 @@
     D->setDeclContextsImpl(MergedSemaDC ? MergedSemaDC : SemaDC, LexicalDC,
                            Reader.getContext());
   }
-  D->setLocation(Reader.ReadSourceLocation(F, RawLocation));
+  D->setLocation(ThisDeclLoc);
   D->setInvalidDecl(Record[Idx++]);
   if (Record[Idx++]) { // hasAttrs
     AttrVec Attrs;
@@ -503,6 +537,7 @@
   }
   D->setImplicit(Record[Idx++]);
   D->Used = Record[Idx++];
+  IsDeclMarkedUsed |= D->Used;
   D->setReferenced(Record[Idx++]);
   D->setTopLevelDeclInObjCContainer(Record[Idx++]);
   D->setAccess((AccessSpecifier)Record[Idx++]);
@@ -527,7 +562,7 @@
       if (Owner->NameVisibility != Module::AllVisible) {
         // The owning module is not visible. Mark this declaration as hidden.
         D->Hidden = true;
-        
+
         // Note that this declaration was hidden because its owning module is 
         // not yet visible.
         Reader.HiddenNamesMap[Owner].push_back(D);
@@ -536,6 +571,29 @@
   }
 }
 
+void ASTDeclReader::VisitPragmaCommentDecl(PragmaCommentDecl *D) {
+  VisitDecl(D);
+  D->setLocation(ReadSourceLocation(Record, Idx));
+  D->CommentKind = (PragmaMSCommentKind)Record[Idx++];
+  std::string Arg = ReadString(Record, Idx);
+  memcpy(D->getTrailingObjects<char>(), Arg.data(), Arg.size());
+  D->getTrailingObjects<char>()[Arg.size()] = '\0';
+}
+
+void ASTDeclReader::VisitPragmaDetectMismatchDecl(PragmaDetectMismatchDecl *D) {
+  VisitDecl(D);
+  D->setLocation(ReadSourceLocation(Record, Idx));
+  std::string Name = ReadString(Record, Idx);
+  memcpy(D->getTrailingObjects<char>(), Name.data(), Name.size());
+  D->getTrailingObjects<char>()[Name.size()] = '\0';
+
+  D->ValueStart = Name.size() + 1;
+  std::string Value = ReadString(Record, Idx);
+  memcpy(D->getTrailingObjects<char>() + D->ValueStart, Value.data(),
+         Value.size());
+  D->getTrailingObjects<char>()[D->ValueStart + Value.size()] = '\0';
+}
+
 void ASTDeclReader::VisitTranslationUnitDecl(TranslationUnitDecl *TU) {
   llvm_unreachable("Translation units are not serialized");
 }
@@ -770,7 +828,7 @@
 
     ASTContext &C = Reader.getContext();
     TemplateArgumentList *TemplArgList
-      = TemplateArgumentList::CreateCopy(C, TemplArgs.data(), TemplArgs.size());
+      = TemplateArgumentList::CreateCopy(C, TemplArgs);
     TemplateArgumentListInfo TemplArgsInfo(LAngleLoc, RAngleLoc);
     for (unsigned i=0, e = TemplArgLocs.size(); i != e; ++i)
       TemplArgsInfo.addArgument(TemplArgLocs[i]);
@@ -1097,7 +1155,7 @@
   D->setHasDestructors(Record[Idx++]);
   D->NumIvarInitializers = Record[Idx++];
   if (D->NumIvarInitializers)
-    D->IvarInitializers = Reader.ReadCXXCtorInitializersRef(F, Record, Idx);
+    D->IvarInitializers = ReadGlobalOffset(F, Record, Idx);
 }
 
 void ASTDeclReader::VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D) {
@@ -1162,6 +1220,8 @@
     VD->NonParmVarDeclBits.NRVOVariable = Record[Idx++];
     VD->NonParmVarDeclBits.CXXForRangeDecl = Record[Idx++];
     VD->NonParmVarDeclBits.ARCPseudoStrong = Record[Idx++];
+    VD->NonParmVarDeclBits.IsInline = Record[Idx++];
+    VD->NonParmVarDeclBits.IsInlineSpecified = Record[Idx++];
     VD->NonParmVarDeclBits.IsConstexpr = Record[Idx++];
     VD->NonParmVarDeclBits.IsInitCapture = Record[Idx++];
     VD->NonParmVarDeclBits.PreviousDeclInSameBlockScope = Record[Idx++];
@@ -1237,6 +1297,18 @@
   // inheritance of default arguments.
 }
 
+void ASTDeclReader::VisitDecompositionDecl(DecompositionDecl *DD) {
+  VisitVarDecl(DD);
+  BindingDecl **BDs = DD->getTrailingObjects<BindingDecl*>();
+  for (unsigned I = 0; I != DD->NumBindings; ++I)
+    BDs[I] = ReadDeclAs<BindingDecl>(Record, Idx);
+}
+
+void ASTDeclReader::VisitBindingDecl(BindingDecl *BD) {
+  VisitValueDecl(BD);
+  BD->Binding = Reader.ReadExpr(F);
+}
+
 void ASTDeclReader::VisitFileScopeAsmDecl(FileScopeAsmDecl *AD) {
   VisitDecl(AD);
   AD->setAsmString(cast<StringLiteral>(Reader.ReadExpr(F)));
@@ -1326,7 +1398,7 @@
     // any other module's anonymous namespaces, so don't attach the anonymous
     // namespace at all.
     NamespaceDecl *Anon = cast<NamespaceDecl>(Reader.GetDecl(AnonNamespace));
-    if (!F.isModule())
+    if (F.Kind != MK_ImplicitModule && F.Kind != MK_ExplicitModule)
       D->setAnonymousNamespace(Anon);
   }
 }
@@ -1364,6 +1436,16 @@
   mergeRedeclarable(D, Redecl);
 }
 
+void ASTDeclReader::VisitConstructorUsingShadowDecl(
+    ConstructorUsingShadowDecl *D) {
+  VisitUsingShadowDecl(D);
+  D->NominatedBaseClassShadowDecl =
+      ReadDeclAs<ConstructorUsingShadowDecl>(Record, Idx);
+  D->ConstructedBaseClassShadowDecl =
+      ReadDeclAs<ConstructorUsingShadowDecl>(Record, Idx);
+  D->IsVirtual = Record[Idx++];
+}
+
 void ASTDeclReader::VisitUsingDirectiveDecl(UsingDirectiveDecl *D) {
   VisitNamedDecl(D);
   D->UsingLoc = ReadSourceLocation(Record, Idx);
@@ -1410,6 +1492,9 @@
   Data.HasOnlyCMembers = Record[Idx++];
   Data.HasInClassInitializer = Record[Idx++];
   Data.HasUninitializedReferenceMember = Record[Idx++];
+  Data.HasUninitializedFields = Record[Idx++];
+  Data.HasInheritedConstructor = Record[Idx++];
+  Data.HasInheritedAssignment = Record[Idx++];
   Data.NeedOverloadResolutionForMoveConstructor = Record[Idx++];
   Data.NeedOverloadResolutionForMoveAssignment = Record[Idx++];
   Data.NeedOverloadResolutionForDestructor = Record[Idx++];
@@ -1420,6 +1505,7 @@
   Data.DeclaredNonTrivialSpecialMembers = Record[Idx++];
   Data.HasIrrelevantDestructor = Record[Idx++];
   Data.HasConstexprNonCopyMoveConstructor = Record[Idx++];
+  Data.HasDefaultedDefaultConstructor = Record[Idx++];
   Data.DefaultedDefaultConstructorIsConstexpr = Record[Idx++];
   Data.HasConstexprDefaultConstructor = Record[Idx++];
   Data.HasNonLiteralTypeFieldsOrBases = Record[Idx++];
@@ -1433,10 +1519,10 @@
 
   Data.NumBases = Record[Idx++];
   if (Data.NumBases)
-    Data.Bases = Reader.readCXXBaseSpecifiers(F, Record, Idx);
+    Data.Bases = ReadGlobalOffset(F, Record, Idx);
   Data.NumVBases = Record[Idx++];
   if (Data.NumVBases)
-    Data.VBases = Reader.readCXXBaseSpecifiers(F, Record, Idx);
+    Data.VBases = ReadGlobalOffset(F, Record, Idx);
   
   Reader.ReadUnresolvedSet(F, Data.Conversions, Record, Idx);
   Reader.ReadUnresolvedSet(F, Data.VisibleConversions, Record, Idx);
@@ -1463,6 +1549,7 @@
       bool IsImplicit = Record[Idx++];
       LambdaCaptureKind Kind = static_cast<LambdaCaptureKind>(Record[Idx++]);
       switch (Kind) {
+      case LCK_StarThis: 
       case LCK_This:
       case LCK_VLAType:
         *ToCapture++ = Capture(Loc, IsImplicit, Kind, nullptr,SourceLocation());
@@ -1480,9 +1567,9 @@
 
 void ASTDeclReader::MergeDefinitionData(
     CXXRecordDecl *D, struct CXXRecordDecl::DefinitionData &&MergeDD) {
-  assert(D->DefinitionData.getNotUpdated() &&
+  assert(D->DefinitionData &&
          "merging class definition into non-definition");
-  auto &DD = *D->DefinitionData.getNotUpdated();
+  auto &DD = *D->DefinitionData;
 
   if (DD.Definition != MergeDD.Definition) {
     // Track that we merged the definitions.
@@ -1534,6 +1621,9 @@
   MATCH_FIELD(HasOnlyCMembers)
   MATCH_FIELD(HasInClassInitializer)
   MATCH_FIELD(HasUninitializedReferenceMember)
+  MATCH_FIELD(HasUninitializedFields)
+  MATCH_FIELD(HasInheritedConstructor)
+  MATCH_FIELD(HasInheritedAssignment)
   MATCH_FIELD(NeedOverloadResolutionForMoveConstructor)
   MATCH_FIELD(NeedOverloadResolutionForMoveAssignment)
   MATCH_FIELD(NeedOverloadResolutionForDestructor)
@@ -1544,6 +1634,7 @@
   OR_FIELD(DeclaredNonTrivialSpecialMembers)
   MATCH_FIELD(HasIrrelevantDestructor)
   OR_FIELD(HasConstexprNonCopyMoveConstructor)
+  OR_FIELD(HasDefaultedDefaultConstructor)
   MATCH_FIELD(DefaultedDefaultConstructorIsConstexpr)
   OR_FIELD(HasConstexprDefaultConstructor)
   MATCH_FIELD(HasNonLiteralTypeFieldsOrBases)
@@ -1601,7 +1692,7 @@
   // because we're reading an update record, or because we've already done some
   // merging. Either way, just merge into it.
   CXXRecordDecl *Canon = D->getCanonicalDecl();
-  if (Canon->DefinitionData.getNotUpdated()) {
+  if (Canon->DefinitionData) {
     MergeDefinitionData(Canon, std::move(*DD));
     D->DefinitionData = Canon->DefinitionData;
     return;
@@ -1702,11 +1793,17 @@
 }
 
 void ASTDeclReader::VisitCXXConstructorDecl(CXXConstructorDecl *D) {
+  // We need the inherited constructor information to merge the declaration,
+  // so we have to read it before we call VisitCXXMethodDecl.
+  if (D->isInheritingConstructor()) {
+    auto *Shadow = ReadDeclAs<ConstructorUsingShadowDecl>(Record, Idx);
+    auto *Ctor = ReadDeclAs<CXXConstructorDecl>(Record, Idx);
+    *D->getTrailingObjects<InheritedConstructor>() =
+        InheritedConstructor(Shadow, Ctor);
+  }
+
   VisitCXXMethodDecl(D);
 
-  if (auto *CD = ReadDeclAs<CXXConstructorDecl>(Record, Idx))
-    if (D->isCanonicalDecl())
-      D->setInheritedConstructor(CD->getCanonicalDecl());
   D->IsExplicitSpecified = Record[Idx++];
 }
 
@@ -1897,8 +1994,7 @@
       SmallVector<TemplateArgument, 8> TemplArgs;
       Reader.ReadTemplateArgumentList(TemplArgs, F, Record, Idx);
       TemplateArgumentList *ArgList
-        = TemplateArgumentList::CreateCopy(C, TemplArgs.data(), 
-                                           TemplArgs.size());
+        = TemplateArgumentList::CreateCopy(C, TemplArgs);
       ClassTemplateSpecializationDecl::SpecializedPartialSpecialization *PS
           = new (C) ClassTemplateSpecializationDecl::
                                              SpecializedPartialSpecialization();
@@ -1912,8 +2008,7 @@
   SmallVector<TemplateArgument, 8> TemplArgs;
   Reader.ReadTemplateArgumentList(TemplArgs, F, Record, Idx,
                                   /*Canonicalize*/ true);
-  D->TemplateArgs = TemplateArgumentList::CreateCopy(C, TemplArgs.data(), 
-                                                     TemplArgs.size());
+  D->TemplateArgs = TemplateArgumentList::CreateCopy(C, TemplArgs);
   D->PointOfInstantiation = ReadSourceLocation(Record, Idx);
   D->SpecializationKind = (TemplateSpecializationKind)Record[Idx++];
 
@@ -1937,8 +2032,8 @@
 
         // This declaration might be a definition. Merge with any existing
         // definition.
-        if (auto *DDD = D->DefinitionData.getNotUpdated()) {
-          if (CanonSpec->DefinitionData.getNotUpdated())
+        if (auto *DDD = D->DefinitionData) {
+          if (CanonSpec->DefinitionData)
             MergeDefinitionData(CanonSpec, std::move(*DDD));
           else
             CanonSpec->DefinitionData = D->DefinitionData;
@@ -2016,7 +2111,7 @@
       SmallVector<TemplateArgument, 8> TemplArgs;
       Reader.ReadTemplateArgumentList(TemplArgs, F, Record, Idx);
       TemplateArgumentList *ArgList = TemplateArgumentList::CreateCopy(
-          C, TemplArgs.data(), TemplArgs.size());
+          C, TemplArgs);
       VarTemplateSpecializationDecl::SpecializedPartialSpecialization *PS =
           new (C)
           VarTemplateSpecializationDecl::SpecializedPartialSpecialization();
@@ -2040,8 +2135,7 @@
   SmallVector<TemplateArgument, 8> TemplArgs;
   Reader.ReadTemplateArgumentList(TemplArgs, F, Record, Idx,
                                   /*Canonicalize*/ true);
-  D->TemplateArgs =
-      TemplateArgumentList::CreateCopy(C, TemplArgs.data(), TemplArgs.size());
+  D->TemplateArgs = TemplateArgumentList::CreateCopy(C, TemplArgs);
   D->PointOfInstantiation = ReadSourceLocation(Record, Idx);
   D->SpecializationKind = (TemplateSpecializationKind)Record[Idx++];
 
@@ -2150,8 +2244,8 @@
 
 std::pair<uint64_t, uint64_t>
 ASTDeclReader::VisitDeclContext(DeclContext *DC) {
-  uint64_t LexicalOffset = Record[Idx++];
-  uint64_t VisibleOffset = Record[Idx++];
+  uint64_t LexicalOffset = ReadLocalOffset(Record, Idx);
+  uint64_t VisibleOffset = ReadLocalOffset(Record, Idx);
   return std::make_pair(LexicalOffset, VisibleOffset);
 }
 
@@ -2186,7 +2280,7 @@
     for (unsigned I = 0; I != N - 1; ++I)
       MergeWith = ReadDecl(Record, Idx/*, MergeWith*/);
 
-    RedeclOffset = Record[Idx++];
+    RedeclOffset = ReadLocalOffset(Record, Idx);
   } else {
     // This declaration was not the first local declaration. Read the first
     // local declaration now, to trigger the import of other redeclarations.
@@ -2262,8 +2356,8 @@
     // FIXME: This is duplicated in several places. Refactor.
     auto *ExistingClass =
         cast<CXXRecordDecl>(ExistingPattern)->getCanonicalDecl();
-    if (auto *DDD = DClass->DefinitionData.getNotUpdated()) {
-      if (ExistingClass->DefinitionData.getNotUpdated()) {
+    if (auto *DDD = DClass->DefinitionData) {
+      if (ExistingClass->DefinitionData) {
         MergeDefinitionData(ExistingClass, std::move(*DDD));
       } else {
         ExistingClass->DefinitionData = DClass->DefinitionData;
@@ -2306,6 +2400,8 @@
     // appropriate canonical declaration.
     D->RedeclLink = Redeclarable<T>::PreviousDeclLink(ExistingCanon);
     D->First = ExistingCanon;
+    ExistingCanon->Used |= D->Used;
+    D->Used = false;
 
     // When we merge a namespace, update its pointer to the first namespace.
     // We cannot have loaded any redeclarations of this declaration yet, so
@@ -2359,6 +2455,18 @@
   D->setVars(Vars);
 }
 
+void ASTDeclReader::VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D) {
+  VisitValueDecl(D);
+  D->setLocation(Reader.ReadSourceLocation(F, Record, Idx));
+  D->setCombiner(Reader.ReadExpr(F));
+  D->setInitializer(Reader.ReadExpr(F));
+  D->PrevDeclInScope = Reader.ReadDeclID(F, Record, Idx);
+}
+
+void ASTDeclReader::VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D) {
+  VisitVarDecl(D);
+}
+
 //===----------------------------------------------------------------------===//
 // Attribute Reading
 //===----------------------------------------------------------------------===//
@@ -2400,16 +2508,25 @@
 /// This routine should return true for anything that might affect
 /// code generation, e.g., inline function definitions, Objective-C
 /// declarations with metadata, etc.
-static bool isConsumerInterestedIn(Decl *D, bool HasBody) {
+static bool isConsumerInterestedIn(ASTContext &Ctx, Decl *D, bool HasBody) {
   // An ObjCMethodDecl is never considered as "interesting" because its
   // implementation container always is.
 
+  // An ImportDecl or VarDecl imported from a module will get emitted when
+  // we import the relevant module.
+  if ((isa<ImportDecl>(D) || isa<VarDecl>(D)) && Ctx.DeclMustBeEmitted(D) &&
+      D->getImportedOwningModule())
+    return false;
+
   if (isa<FileScopeAsmDecl>(D) || 
       isa<ObjCProtocolDecl>(D) || 
       isa<ObjCImplDecl>(D) ||
       isa<ImportDecl>(D) ||
-      isa<OMPThreadPrivateDecl>(D))
+      isa<PragmaCommentDecl>(D) ||
+      isa<PragmaDetectMismatchDecl>(D))
     return true;
+  if (isa<OMPThreadPrivateDecl>(D) || isa<OMPDeclareReductionDecl>(D))
+    return !D->getDeclContext()->isFunctionOrMethod();
   if (VarDecl *Var = dyn_cast<VarDecl>(D))
     return Var->isFileVarDecl() &&
            Var->isThisDeclarationADefinition() == VarDecl::Definition;
@@ -2421,20 +2538,13 @@
 
 /// \brief Get the correct cursor and offset for loading a declaration.
 ASTReader::RecordLocation
-ASTReader::DeclCursorForID(DeclID ID, unsigned &RawLocation) {
-  // See if there's an override.
-  DeclReplacementMap::iterator It = ReplacedDecls.find(ID);
-  if (It != ReplacedDecls.end()) {
-    RawLocation = It->second.RawLoc;
-    return RecordLocation(It->second.Mod, It->second.Offset);
-  }
-
+ASTReader::DeclCursorForID(DeclID ID, SourceLocation &Loc) {
   GlobalDeclMapType::iterator I = GlobalDeclMap.find(ID);
   assert(I != GlobalDeclMap.end() && "Corrupted global declaration map");
   ModuleFile *M = I->second;
-  const DeclOffset &
-    DOffs =  M->DeclOffsets[ID - M->BaseDeclID - NUM_PREDEF_DECL_IDS];
-  RawLocation = DOffs.Loc;
+  const DeclOffset &DOffs =
+      M->DeclOffsets[ID - M->BaseDeclID - NUM_PREDEF_DECL_IDS];
+  Loc = TranslateSourceLocation(*M, DOffs.getLocation());
   return RecordLocation(M, DOffs.BitOffset);
 }
 
@@ -2587,6 +2697,13 @@
   // functions, etc.
   if (FunctionDecl *FuncX = dyn_cast<FunctionDecl>(X)) {
     FunctionDecl *FuncY = cast<FunctionDecl>(Y);
+    if (CXXConstructorDecl *CtorX = dyn_cast<CXXConstructorDecl>(X)) {
+      CXXConstructorDecl *CtorY = cast<CXXConstructorDecl>(Y);
+      if (CtorX->getInheritedConstructor() &&
+          !isSameEntity(CtorX->getInheritedConstructor().getConstructor(),
+                        CtorY->getInheritedConstructor().getConstructor()))
+        return false;
+    }
     return (FuncX->getLinkageInternal() == FuncY->getLinkageInternal()) &&
       FuncX->getASTContext().hasSameType(FuncX->getType(), FuncY->getType());
   }
@@ -2594,8 +2711,24 @@
   // Variables with the same type and linkage match.
   if (VarDecl *VarX = dyn_cast<VarDecl>(X)) {
     VarDecl *VarY = cast<VarDecl>(Y);
-    return (VarX->getLinkageInternal() == VarY->getLinkageInternal()) &&
-      VarX->getASTContext().hasSameType(VarX->getType(), VarY->getType());
+    if (VarX->getLinkageInternal() == VarY->getLinkageInternal()) {
+      ASTContext &C = VarX->getASTContext();
+      if (C.hasSameType(VarX->getType(), VarY->getType()))
+        return true;
+
+      // We can get decls with different types on the redecl chain, e.g.:
+      // template <typename T> struct S { static T Var[]; }; // #1
+      // template <typename T> T S<T>::Var[sizeof(T)]; // #2
+      // This seems to happen only when completing an incomplete array type.
+      // In that case, when comparing #1 and #2, compare their element types.
+      const ArrayType *VarXTy = C.getAsArrayType(VarX->getType());
+      const ArrayType *VarYTy = C.getAsArrayType(VarY->getType());
+      if (!VarXTy || !VarYTy)
+        return false;
+      if (VarXTy->isIncompleteArrayType() || VarYTy->isIncompleteArrayType())
+        return C.hasSameType(VarXTy->getElementType(), VarYTy->getElementType());
+    }
+    return false;
   }
 
   // Namespaces with the same name and inlinedness match.
@@ -2675,9 +2808,9 @@
 
   if (CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(DC)) {
     // Try to dig out the definition.
-    auto *DD = RD->DefinitionData.getNotUpdated();
+    auto *DD = RD->DefinitionData;
     if (!DD)
-      DD = RD->getCanonicalDecl()->DefinitionData.getNotUpdated();
+      DD = RD->getCanonicalDecl()->DefinitionData;
 
     // If there's no definition yet, then DC's definition is added by an update
     // record, but we've not yet loaded that update record. In this case, we
@@ -3039,11 +3172,6 @@
       Previous->IdentifierNamespace &
       (Decl::IDNS_Ordinary | Decl::IDNS_Tag | Decl::IDNS_Type);
 
-  // If the previous declaration is marked as used, then this declaration should
-  // be too.
-  if (Previous->Used)
-    D->Used = true;
-
   // If the declaration declares a template, it may inherit default arguments
   // from the previous declaration.
   if (TemplateDecl *TD = dyn_cast<TemplateDecl>(D))
@@ -3094,8 +3222,8 @@
 /// \brief Read the declaration at the given offset from the AST file.
 Decl *ASTReader::ReadDeclRecord(DeclID ID) {
   unsigned Index = ID - NUM_PREDEF_DECL_IDS;
-  unsigned RawLocation = 0;
-  RecordLocation Loc = DeclCursorForID(ID, RawLocation);
+  SourceLocation DeclLoc;
+  RecordLocation Loc = DeclCursorForID(ID, DeclLoc);
   llvm::BitstreamCursor &DeclsCursor = Loc.F->DeclsCursor;
   // Keep track of where we are in the stream, then jump back there
   // after reading this declaration.
@@ -3110,7 +3238,7 @@
   RecordData Record;
   unsigned Code = DeclsCursor.ReadCode();
   unsigned Idx = 0;
-  ASTDeclReader Reader(*this, *Loc.F, ID, RawLocation, Record,Idx);
+  ASTDeclReader Reader(*this, Loc, ID, DeclLoc, Record,Idx);
 
   Decl *D = nullptr;
   switch ((DeclCode)DeclsCursor.readRecord(Code, Record)) {
@@ -3153,6 +3281,9 @@
   case DECL_USING_SHADOW:
     D = UsingShadowDecl::CreateDeserialized(Context, ID);
     break;
+  case DECL_CONSTRUCTOR_USING_SHADOW:
+    D = ConstructorUsingShadowDecl::CreateDeserialized(Context, ID);
+    break;
   case DECL_USING_DIRECTIVE:
     D = UsingDirectiveDecl::CreateDeserialized(Context, ID);
     break;
@@ -3169,7 +3300,10 @@
     D = CXXMethodDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CXX_CONSTRUCTOR:
-    D = CXXConstructorDecl::CreateDeserialized(Context, ID);
+    D = CXXConstructorDecl::CreateDeserialized(Context, ID, false);
+    break;
+  case DECL_CXX_INHERITED_CONSTRUCTOR:
+    D = CXXConstructorDecl::CreateDeserialized(Context, ID, true);
     break;
   case DECL_CXX_DESTRUCTOR:
     D = CXXDestructorDecl::CreateDeserialized(Context, ID);
@@ -3280,6 +3414,12 @@
   case DECL_PARM_VAR:
     D = ParmVarDecl::CreateDeserialized(Context, ID);
     break;
+  case DECL_DECOMPOSITION:
+    D = DecompositionDecl::CreateDeserialized(Context, ID, Record[Idx++]);
+    break;
+  case DECL_BINDING:
+    D = BindingDecl::CreateDeserialized(Context, ID);
+    break;
   case DECL_FILE_SCOPE_ASM:
     D = FileScopeAsmDecl::CreateDeserialized(Context, ID);
     break;
@@ -3306,6 +3446,19 @@
   case DECL_OMP_THREADPRIVATE:
     D = OMPThreadPrivateDecl::CreateDeserialized(Context, ID, Record[Idx++]);
     break;
+  case DECL_OMP_DECLARE_REDUCTION:
+    D = OMPDeclareReductionDecl::CreateDeserialized(Context, ID);
+    break;
+  case DECL_OMP_CAPTUREDEXPR:
+    D = OMPCapturedExprDecl::CreateDeserialized(Context, ID);
+    break;
+  case DECL_PRAGMA_COMMENT:
+    D = PragmaCommentDecl::CreateDeserialized(Context, ID, Record[Idx++]);
+    break;
+  case DECL_PRAGMA_DETECT_MISMATCH:
+    D = PragmaDetectMismatchDecl::CreateDeserialized(Context, ID,
+                                                     Record[Idx++]);
+    break;
   case DECL_EMPTY:
     D = EmptyDecl::CreateDeserialized(Context, ID);
     break;
@@ -3347,7 +3500,7 @@
   // AST consumer might need to know about, queue it.
   // We don't pass it to the consumer immediately because we may be in recursive
   // loading, and some declarations may still be initializing.
-  if (isConsumerInterestedIn(D, Reader.hasPendingBody()))
+  if (isConsumerInterestedIn(Context, D, Reader.hasPendingBody()))
     InterestingDecls.push_back(D);
 
   return D;
@@ -3357,12 +3510,13 @@
   // The declaration may have been modified by files later in the chain.
   // If this is the case, read the record containing the updates from each file
   // and pass it to ASTDeclReader to make the modifications.
+  ProcessingUpdatesRAIIObj ProcessingUpdates(*this);
   DeclUpdateOffsetsMap::iterator UpdI = DeclUpdateOffsets.find(ID);
   if (UpdI != DeclUpdateOffsets.end()) {
     auto UpdateOffsets = std::move(UpdI->second);
     DeclUpdateOffsets.erase(UpdI);
 
-    bool WasInteresting = isConsumerInterestedIn(D, false);
+    bool WasInteresting = isConsumerInterestedIn(Context, D, false);
     for (auto &FileAndOffset : UpdateOffsets) {
       ModuleFile *F = FileAndOffset.first;
       uint64_t Offset = FileAndOffset.second;
@@ -3376,13 +3530,14 @@
       assert(RecCode == DECL_UPDATES && "Expected DECL_UPDATES record!");
 
       unsigned Idx = 0;
-      ASTDeclReader Reader(*this, *F, ID, 0, Record, Idx);
+      ASTDeclReader Reader(*this, RecordLocation(F, Offset), ID,
+                           SourceLocation(), Record, Idx);
       Reader.UpdateDecl(D, *F, Record);
 
       // We might have made this declaration interesting. If so, remember that
       // we need to hand it off to the consumer.
       if (!WasInteresting &&
-          isConsumerInterestedIn(D, Reader.hasPendingBody())) {
+          isConsumerInterestedIn(Context, D, Reader.hasPendingBody())) {
         InterestingDecls.push_back(D);
         WasInteresting = true;
       }
@@ -3612,7 +3767,8 @@
       // Each module has its own anonymous namespace, which is disjoint from
       // any other module's anonymous namespaces, so don't attach the anonymous
       // namespace at all.
-      if (!ModuleFile.isModule()) {
+      if (ModuleFile.Kind != MK_ImplicitModule &&
+          ModuleFile.Kind != MK_ExplicitModule) {
         if (TranslationUnitDecl *TU = dyn_cast<TranslationUnitDecl>(D))
           TU->setAnonymousNamespace(Anon);
         else
@@ -3641,6 +3797,23 @@
       break;
     }
 
+    case UPD_CXX_INSTANTIATED_DEFAULT_MEMBER_INITIALIZER: {
+      auto FD = cast<FieldDecl>(D);
+      auto DefaultInit = Reader.ReadExpr(F);
+
+      // Only apply the update if the field still has an uninstantiated
+      // default member initializer.
+      if (FD->hasInClassInitializer() && !FD->getInClassInitializer()) {
+        if (DefaultInit)
+          FD->setInClassInitializer(DefaultInit);
+        else
+          // Instantiation failed. We can get here if we serialized an AST for
+          // an invalid program.
+          FD->removeInClassInitializer();
+      }
+      break;
+    }
+
     case UPD_CXX_ADDED_FUNCTION_DEFINITION: {
       FunctionDecl *FD = cast<FunctionDecl>(D);
       if (Reader.PendingBodies[FD]) {
@@ -3661,8 +3834,7 @@
       if (auto *CD = dyn_cast<CXXConstructorDecl>(FD)) {
         CD->NumCtorInitializers = Record[Idx++];
         if (CD->NumCtorInitializers)
-          CD->CtorInitializers =
-              Reader.ReadCXXCtorInitializersRef(F, Record, Idx);
+          CD->CtorInitializers = ReadGlobalOffset(F, Record, Idx);
       }
       // Store the offset of the body so we can lazily load it later.
       Reader.PendingBodies[FD] = GetCurrentCursorOffset();
@@ -3673,14 +3845,14 @@
 
     case UPD_CXX_INSTANTIATED_CLASS_DEFINITION: {
       auto *RD = cast<CXXRecordDecl>(D);
-      auto *OldDD = RD->getCanonicalDecl()->DefinitionData.getNotUpdated();
+      auto *OldDD = RD->getCanonicalDecl()->DefinitionData;
       bool HadRealDefinition =
           OldDD && (OldDD->Definition != RD ||
                     !Reader.PendingFakeDefinitionData.count(OldDD));
       ReadCXXRecordDefinition(RD, /*Update*/true);
 
       // Visible update is handled separately.
-      uint64_t LexicalOffset = Record[Idx++];
+      uint64_t LexicalOffset = ReadLocalOffset(Record, Idx);
       if (!HadRealDefinition && LexicalOffset) {
         Reader.ReadLexicalDeclContextStorage(ModuleFile, ModuleFile.DeclsCursor,
                                              LexicalOffset, RD);
@@ -3705,7 +3877,7 @@
           SmallVector<TemplateArgument, 8> TemplArgs;
           Reader.ReadTemplateArgumentList(TemplArgs, F, Record, Idx);
           auto *TemplArgList = TemplateArgumentList::CreateCopy(
-              Reader.getContext(), TemplArgs.data(), TemplArgs.size());
+              Reader.getContext(), TemplArgs);
 
           // FIXME: If we already have a partial specialization set,
           // check that it matches.
@@ -3774,11 +3946,8 @@
     }
 
     case UPD_DECL_MARKED_USED: {
-      // FIXME: This doesn't send the right notifications if there are
-      // ASTMutationListeners other than an ASTWriter.
-
       // Maintain AST consistency: any later redeclarations are used too.
-      forAllLaterRedecls(D, [](Decl *D) { D->Used = true; });
+      D->markUsed(Reader.Context);
       break;
     }
 
@@ -3802,11 +3971,8 @@
         Exported = TD->getDefinition();
       Module *Owner = SubmoduleID ? Reader.getSubmodule(SubmoduleID) : nullptr;
       if (Reader.getContext().getLangOpts().ModulesLocalVisibility) {
-        // FIXME: This doesn't send the right notifications if there are
-        // ASTMutationListeners other than an ASTWriter.
-        Reader.getContext().mergeDefinitionIntoModule(
-            cast<NamedDecl>(Exported), Owner,
-            /*NotifyListeners*/ false);
+        Reader.getContext().mergeDefinitionIntoModule(cast<NamedDecl>(Exported),
+                                                      Owner);
         Reader.PendingMergedDefinitionsToDeduplicate.insert(
             cast<NamedDecl>(Exported));
       } else if (Owner && Owner->NameVisibility != Module::AllVisible) {
@@ -3820,6 +3986,7 @@
       break;
     }
 
+    case UPD_DECL_MARKED_OPENMP_DECLARETARGET:
     case UPD_ADDED_ATTR_TO_RECORD:
       AttrVec Attrs;
       Reader.ReadAttributes(F, Attrs, Record, Idx);
diff --git a/lib/Serialization/ASTReaderInternals.h b/lib/Serialization/ASTReaderInternals.h
index d392364..250c8b1 100644
--- a/lib/Serialization/ASTReaderInternals.h
+++ b/lib/Serialization/ASTReaderInternals.h
@@ -13,14 +13,12 @@
 #ifndef LLVM_CLANG_LIB_SERIALIZATION_ASTREADERINTERNALS_H
 #define LLVM_CLANG_LIB_SERIALIZATION_ASTREADERINTERNALS_H
 
+#include "MultiOnDiskHashTable.h"
 #include "clang/AST/DeclarationName.h"
 #include "clang/Serialization/ASTBitCodes.h"
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/PointerUnion.h"
-#include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/OnDiskHashTable.h"
-#include "MultiOnDiskHashTable.h"
 #include <utility>
 
 namespace clang {
diff --git a/lib/Serialization/ASTReaderStmt.cpp b/lib/Serialization/ASTReaderStmt.cpp
index 3c2a98b..331932e 100644
--- a/lib/Serialization/ASTReaderStmt.cpp
+++ b/lib/Serialization/ASTReaderStmt.cpp
@@ -184,6 +184,8 @@
 
 void ASTStmtReader::VisitIfStmt(IfStmt *S) {
   VisitStmt(S);
+  S->setConstexpr(Record[Idx++]);
+  S->setInit(Reader.ReadSubStmt());
   S->setConditionVariable(Reader.getContext(),
                           ReadDeclAs<VarDecl>(Record, Idx));
   S->setCond(Reader.ReadSubExpr());
@@ -195,6 +197,7 @@
 
 void ASTStmtReader::VisitSwitchStmt(SwitchStmt *S) {
   VisitStmt(S);
+  S->setInit(Reader.ReadSubStmt());
   S->setConditionVariable(Reader.getContext(),
                           ReadDeclAs<VarDecl>(Record, Idx));
   S->setCond(Reader.ReadSubExpr());
@@ -1179,6 +1182,14 @@
   E->setLocation(ReadSourceLocation(Record, Idx));
 }
 
+void ASTStmtReader::VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *E) {
+  VisitExpr(E);
+  SourceRange R = Reader.ReadSourceRange(F, Record, Idx);
+  E->AtLoc = R.getBegin();
+  E->RParen = R.getEnd();
+  E->VersionToCheck = Reader.ReadVersionTuple(Record, Idx);
+}
+
 //===----------------------------------------------------------------------===//
 // C++ Expressions and Statements
 //===----------------------------------------------------------------------===//
@@ -1207,7 +1218,8 @@
   S->ColonLoc = ReadSourceLocation(Record, Idx);
   S->RParenLoc = ReadSourceLocation(Record, Idx);
   S->setRangeStmt(Reader.ReadSubStmt());
-  S->setBeginEndStmt(Reader.ReadSubStmt());
+  S->setBeginStmt(Reader.ReadSubStmt());
+  S->setEndStmt(Reader.ReadSubStmt());
   S->setCond(Reader.ReadSubExpr());
   S->setInc(Reader.ReadSubExpr());
   S->setLoopVarStmt(Reader.ReadSubStmt());
@@ -1248,6 +1260,14 @@
   E->ParenOrBraceRange = ReadSourceRange(Record, Idx);
 }
 
+void ASTStmtReader::VisitCXXInheritedCtorInitExpr(CXXInheritedCtorInitExpr *E) {
+  VisitExpr(E);
+  E->Constructor = ReadDeclAs<CXXConstructorDecl>(Record, Idx);
+  E->Loc = ReadSourceLocation(Record, Idx);
+  E->ConstructsVirtualBase = Record[Idx++];
+  E->InheritedFromVirtualBase = Record[Idx++];
+}
+
 void ASTStmtReader::VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *E) {
   VisitCXXConstructExpr(E);
   E->Type = GetTypeSourceInfo(Record, Idx);
@@ -1447,6 +1467,7 @@
     E->getTrailingObjects<BlockDecl *>()[i] =
         ReadDeclAs<BlockDecl>(Record, Idx);
 
+  E->ExprWithCleanupsBits.CleanupsHaveSideEffects = Record[Idx++];
   E->SubExpr = Reader.ReadSubExpr();
 }
 
@@ -1679,6 +1700,8 @@
 void ASTStmtReader::VisitCXXUuidofExpr(CXXUuidofExpr *E) {
   VisitExpr(E);
   E->setSourceRange(ReadSourceRange(Record, Idx));
+  std::string UuidStr = ReadString(Record, Idx);
+  E->setUuidStr(StringRef(UuidStr).copy(Reader.getContext()));
   if (E->isTypeOperand()) { // __uuidof(ComType)
     E->setTypeOperandSourceInfo(
         GetTypeSourceInfo(Record, Idx));
@@ -1748,10 +1771,11 @@
   OMPClauseReader(ASTStmtReader *R, ASTContext &C,
                   const ASTReader::RecordData &Record, unsigned &Idx)
     : Reader(R), Context(C), Record(Record), Idx(Idx) { }
-#define OPENMP_CLAUSE(Name, Class)    \
-  void Visit##Class(Class *S);
+#define OPENMP_CLAUSE(Name, Class) void Visit##Class(Class *C);
 #include "clang/Basic/OpenMPKinds.def"
   OMPClause *readClause();
+  void VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C);
+  void VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C);
 };
 }
 
@@ -1857,9 +1881,15 @@
   case OMPC_device:
     C = new (Context) OMPDeviceClause();
     break;
-  case OMPC_map:
-    C = OMPMapClause::CreateEmpty(Context, Record[Idx++]);
+  case OMPC_map: {
+    unsigned NumVars = Record[Idx++];
+    unsigned NumDeclarations = Record[Idx++];
+    unsigned NumLists = Record[Idx++];
+    unsigned NumComponents = Record[Idx++];
+    C = OMPMapClause::CreateEmpty(Context, NumVars, NumDeclarations, NumLists,
+                                  NumComponents);
     break;
+  }
   case OMPC_num_teams:
     C = new (Context) OMPNumTeamsClause();
     break;
@@ -1881,6 +1911,45 @@
   case OMPC_dist_schedule:
     C = new (Context) OMPDistScheduleClause();
     break;
+  case OMPC_defaultmap:
+    C = new (Context) OMPDefaultmapClause();
+    break;
+  case OMPC_to: {
+    unsigned NumVars = Record[Idx++];
+    unsigned NumDeclarations = Record[Idx++];
+    unsigned NumLists = Record[Idx++];
+    unsigned NumComponents = Record[Idx++];
+    C = OMPToClause::CreateEmpty(Context, NumVars, NumDeclarations, NumLists,
+                                 NumComponents);
+    break;
+  }
+  case OMPC_from: {
+    unsigned NumVars = Record[Idx++];
+    unsigned NumDeclarations = Record[Idx++];
+    unsigned NumLists = Record[Idx++];
+    unsigned NumComponents = Record[Idx++];
+    C = OMPFromClause::CreateEmpty(Context, NumVars, NumDeclarations, NumLists,
+                                   NumComponents);
+    break;
+  }
+  case OMPC_use_device_ptr: {
+    unsigned NumVars = Record[Idx++];
+    unsigned NumDeclarations = Record[Idx++];
+    unsigned NumLists = Record[Idx++];
+    unsigned NumComponents = Record[Idx++];
+    C = OMPUseDevicePtrClause::CreateEmpty(Context, NumVars, NumDeclarations,
+                                           NumLists, NumComponents);
+    break;
+  }
+  case OMPC_is_device_ptr: {
+    unsigned NumVars = Record[Idx++];
+    unsigned NumDeclarations = Record[Idx++];
+    unsigned NumLists = Record[Idx++];
+    unsigned NumComponents = Record[Idx++];
+    C = OMPIsDevicePtrClause::CreateEmpty(Context, NumVars, NumDeclarations,
+                                          NumLists, NumComponents);
+    break;
+  }
   }
   Visit(C);
   C->setLocStart(Reader->ReadSourceLocation(Record, Idx));
@@ -1889,6 +1958,15 @@
   return C;
 }
 
+void OMPClauseReader::VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C) {
+  C->setPreInitStmt(Reader->Reader.ReadSubStmt());
+}
+
+void OMPClauseReader::VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C) {
+  VisitOMPClauseWithPreInit(C);
+  C->setPostUpdateExpr(Reader->Reader.ReadSubExpr());
+}
+
 void OMPClauseReader::VisitOMPIfClause(OMPIfClause *C) {
   C->setNameModifier(static_cast<OpenMPDirectiveKind>(Record[Idx++]));
   C->setNameModifierLoc(Reader->ReadSourceLocation(Record, Idx));
@@ -1937,6 +2015,7 @@
 }
 
 void OMPClauseReader::VisitOMPScheduleClause(OMPScheduleClause *C) {
+  VisitOMPClauseWithPreInit(C);
   C->setScheduleKind(
        static_cast<OpenMPScheduleClauseKind>(Record[Idx++]));
   C->setFirstScheduleModifier(
@@ -1944,7 +2023,6 @@
   C->setSecondScheduleModifier(
       static_cast<OpenMPScheduleClauseModifier>(Record[Idx++]));
   C->setChunkSize(Reader->Reader.ReadSubExpr());
-  C->setHelperChunkSize(Reader->Reader.ReadSubExpr());
   C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setFirstScheduleModifierLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setSecondScheduleModifierLoc(Reader->ReadSourceLocation(Record, Idx));
@@ -1994,6 +2072,7 @@
 }
 
 void OMPClauseReader::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
+  VisitOMPClauseWithPreInit(C);
   C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
   unsigned NumVars = C->varlist_size();
   SmallVector<Expr *, 16> Vars;
@@ -2012,6 +2091,7 @@
 }
 
 void OMPClauseReader::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
+  VisitOMPClauseWithPostUpdate(C);
   C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
   unsigned NumVars = C->varlist_size();
   SmallVector<Expr *, 16> Vars;
@@ -2048,6 +2128,7 @@
 }
 
 void OMPClauseReader::VisitOMPReductionClause(OMPReductionClause *C) {
+  VisitOMPClauseWithPostUpdate(C);
   C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setColonLoc(Reader->ReadSourceLocation(Record, Idx));
   NestedNameSpecifierLoc NNSL =
@@ -2082,6 +2163,7 @@
 }
 
 void OMPClauseReader::VisitOMPLinearClause(OMPLinearClause *C) {
+  VisitOMPClauseWithPostUpdate(C);
   C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setColonLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setModifier(static_cast<OpenMPLinearClauseKind>(Record[Idx++]));
@@ -2189,6 +2271,7 @@
   for (unsigned i = 0; i != NumVars; ++i)
     Vars.push_back(Reader->Reader.ReadSubExpr());
   C->setVarRefs(Vars);
+  C->setCounterValue(Reader->Reader.ReadSubExpr());
 }
 
 void OMPClauseReader::VisitOMPDeviceClause(OMPDeviceClause *C) {
@@ -2205,12 +2288,45 @@
   C->setMapLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setColonLoc(Reader->ReadSourceLocation(Record, Idx));
   auto NumVars = C->varlist_size();
+  auto UniqueDecls = C->getUniqueDeclarationsNum();
+  auto TotalLists = C->getTotalComponentListNum();
+  auto TotalComponents = C->getTotalComponentsNum();
+
   SmallVector<Expr *, 16> Vars;
   Vars.reserve(NumVars);
-  for (unsigned i = 0; i != NumVars; ++i) {
+  for (unsigned i = 0; i != NumVars; ++i)
     Vars.push_back(Reader->Reader.ReadSubExpr());
-  }
   C->setVarRefs(Vars);
+
+  SmallVector<ValueDecl *, 16> Decls;
+  Decls.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    Decls.push_back(
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx));
+  C->setUniqueDecls(Decls);
+
+  SmallVector<unsigned, 16> ListsPerDecl;
+  ListsPerDecl.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    ListsPerDecl.push_back(Record[Idx++]);
+  C->setDeclNumLists(ListsPerDecl);
+
+  SmallVector<unsigned, 32> ListSizes;
+  ListSizes.reserve(TotalLists);
+  for (unsigned i = 0; i < TotalLists; ++i)
+    ListSizes.push_back(Record[Idx++]);
+  C->setComponentListSizes(ListSizes);
+
+  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
+  Components.reserve(TotalComponents);
+  for (unsigned i = 0; i < TotalComponents; ++i) {
+    Expr *AssociatedExpr = Reader->Reader.ReadSubExpr();
+    ValueDecl *AssociatedDecl =
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx);
+    Components.push_back(OMPClauseMappableExprCommon::MappableComponent(
+        AssociatedExpr, AssociatedDecl));
+  }
+  C->setComponents(Components, ListSizes);
 }
 
 void OMPClauseReader::VisitOMPNumTeamsClause(OMPNumTeamsClause *C) {
@@ -2244,15 +2360,210 @@
 }
 
 void OMPClauseReader::VisitOMPDistScheduleClause(OMPDistScheduleClause *C) {
+  VisitOMPClauseWithPreInit(C);
   C->setDistScheduleKind(
       static_cast<OpenMPDistScheduleClauseKind>(Record[Idx++]));
   C->setChunkSize(Reader->Reader.ReadSubExpr());
-  C->setHelperChunkSize(Reader->Reader.ReadSubExpr());
   C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setDistScheduleKindLoc(Reader->ReadSourceLocation(Record, Idx));
   C->setCommaLoc(Reader->ReadSourceLocation(Record, Idx));
 }
 
+void OMPClauseReader::VisitOMPDefaultmapClause(OMPDefaultmapClause *C) {
+  C->setDefaultmapKind(
+       static_cast<OpenMPDefaultmapClauseKind>(Record[Idx++]));
+  C->setDefaultmapModifier(
+      static_cast<OpenMPDefaultmapClauseModifier>(Record[Idx++]));
+  C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
+  C->setDefaultmapModifierLoc(Reader->ReadSourceLocation(Record, Idx));
+  C->setDefaultmapKindLoc(Reader->ReadSourceLocation(Record, Idx));
+}
+
+void OMPClauseReader::VisitOMPToClause(OMPToClause *C) {
+  C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
+  auto NumVars = C->varlist_size();
+  auto UniqueDecls = C->getUniqueDeclarationsNum();
+  auto TotalLists = C->getTotalComponentListNum();
+  auto TotalComponents = C->getTotalComponentsNum();
+
+  SmallVector<Expr *, 16> Vars;
+  Vars.reserve(NumVars);
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Reader->Reader.ReadSubExpr());
+  C->setVarRefs(Vars);
+
+  SmallVector<ValueDecl *, 16> Decls;
+  Decls.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    Decls.push_back(
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx));
+  C->setUniqueDecls(Decls);
+
+  SmallVector<unsigned, 16> ListsPerDecl;
+  ListsPerDecl.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    ListsPerDecl.push_back(Record[Idx++]);
+  C->setDeclNumLists(ListsPerDecl);
+
+  SmallVector<unsigned, 32> ListSizes;
+  ListSizes.reserve(TotalLists);
+  for (unsigned i = 0; i < TotalLists; ++i)
+    ListSizes.push_back(Record[Idx++]);
+  C->setComponentListSizes(ListSizes);
+
+  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
+  Components.reserve(TotalComponents);
+  for (unsigned i = 0; i < TotalComponents; ++i) {
+    Expr *AssociatedExpr = Reader->Reader.ReadSubExpr();
+    ValueDecl *AssociatedDecl =
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx);
+    Components.push_back(OMPClauseMappableExprCommon::MappableComponent(
+        AssociatedExpr, AssociatedDecl));
+  }
+  C->setComponents(Components, ListSizes);
+}
+
+void OMPClauseReader::VisitOMPFromClause(OMPFromClause *C) {
+  C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
+  auto NumVars = C->varlist_size();
+  auto UniqueDecls = C->getUniqueDeclarationsNum();
+  auto TotalLists = C->getTotalComponentListNum();
+  auto TotalComponents = C->getTotalComponentsNum();
+
+  SmallVector<Expr *, 16> Vars;
+  Vars.reserve(NumVars);
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Reader->Reader.ReadSubExpr());
+  C->setVarRefs(Vars);
+
+  SmallVector<ValueDecl *, 16> Decls;
+  Decls.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    Decls.push_back(
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx));
+  C->setUniqueDecls(Decls);
+
+  SmallVector<unsigned, 16> ListsPerDecl;
+  ListsPerDecl.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    ListsPerDecl.push_back(Record[Idx++]);
+  C->setDeclNumLists(ListsPerDecl);
+
+  SmallVector<unsigned, 32> ListSizes;
+  ListSizes.reserve(TotalLists);
+  for (unsigned i = 0; i < TotalLists; ++i)
+    ListSizes.push_back(Record[Idx++]);
+  C->setComponentListSizes(ListSizes);
+
+  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
+  Components.reserve(TotalComponents);
+  for (unsigned i = 0; i < TotalComponents; ++i) {
+    Expr *AssociatedExpr = Reader->Reader.ReadSubExpr();
+    ValueDecl *AssociatedDecl =
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx);
+    Components.push_back(OMPClauseMappableExprCommon::MappableComponent(
+        AssociatedExpr, AssociatedDecl));
+  }
+  C->setComponents(Components, ListSizes);
+}
+
+void OMPClauseReader::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
+  C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
+  auto NumVars = C->varlist_size();
+  auto UniqueDecls = C->getUniqueDeclarationsNum();
+  auto TotalLists = C->getTotalComponentListNum();
+  auto TotalComponents = C->getTotalComponentsNum();
+
+  SmallVector<Expr *, 16> Vars;
+  Vars.reserve(NumVars);
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Reader->Reader.ReadSubExpr());
+  C->setVarRefs(Vars);
+  Vars.clear();
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Reader->Reader.ReadSubExpr());
+  C->setPrivateCopies(Vars);
+  Vars.clear();
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Reader->Reader.ReadSubExpr());
+  C->setInits(Vars);
+
+  SmallVector<ValueDecl *, 16> Decls;
+  Decls.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    Decls.push_back(
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx));
+  C->setUniqueDecls(Decls);
+
+  SmallVector<unsigned, 16> ListsPerDecl;
+  ListsPerDecl.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    ListsPerDecl.push_back(Record[Idx++]);
+  C->setDeclNumLists(ListsPerDecl);
+
+  SmallVector<unsigned, 32> ListSizes;
+  ListSizes.reserve(TotalLists);
+  for (unsigned i = 0; i < TotalLists; ++i)
+    ListSizes.push_back(Record[Idx++]);
+  C->setComponentListSizes(ListSizes);
+
+  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
+  Components.reserve(TotalComponents);
+  for (unsigned i = 0; i < TotalComponents; ++i) {
+    Expr *AssociatedExpr = Reader->Reader.ReadSubExpr();
+    ValueDecl *AssociatedDecl =
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx);
+    Components.push_back(OMPClauseMappableExprCommon::MappableComponent(
+        AssociatedExpr, AssociatedDecl));
+  }
+  C->setComponents(Components, ListSizes);
+}
+
+void OMPClauseReader::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
+  C->setLParenLoc(Reader->ReadSourceLocation(Record, Idx));
+  auto NumVars = C->varlist_size();
+  auto UniqueDecls = C->getUniqueDeclarationsNum();
+  auto TotalLists = C->getTotalComponentListNum();
+  auto TotalComponents = C->getTotalComponentsNum();
+
+  SmallVector<Expr *, 16> Vars;
+  Vars.reserve(NumVars);
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Reader->Reader.ReadSubExpr());
+  C->setVarRefs(Vars);
+  Vars.clear();
+
+  SmallVector<ValueDecl *, 16> Decls;
+  Decls.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    Decls.push_back(
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx));
+  C->setUniqueDecls(Decls);
+
+  SmallVector<unsigned, 16> ListsPerDecl;
+  ListsPerDecl.reserve(UniqueDecls);
+  for (unsigned i = 0; i < UniqueDecls; ++i)
+    ListsPerDecl.push_back(Record[Idx++]);
+  C->setDeclNumLists(ListsPerDecl);
+
+  SmallVector<unsigned, 32> ListSizes;
+  ListSizes.reserve(TotalLists);
+  for (unsigned i = 0; i < TotalLists; ++i)
+    ListSizes.push_back(Record[Idx++]);
+  C->setComponentListSizes(ListSizes);
+
+  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
+  Components.reserve(TotalComponents);
+  for (unsigned i = 0; i < TotalComponents; ++i) {
+    Expr *AssociatedExpr = Reader->Reader.ReadSubExpr();
+    ValueDecl *AssociatedDecl =
+        Reader->Reader.ReadDeclAs<ValueDecl>(Reader->F, Record, Idx);
+    Components.push_back(OMPClauseMappableExprCommon::MappableComponent(
+        AssociatedExpr, AssociatedDecl));
+  }
+  C->setComponents(Components, ListSizes);
+}
+
 //===----------------------------------------------------------------------===//
 // OpenMP Directives.
 //===----------------------------------------------------------------------===//
@@ -2280,7 +2591,10 @@
   D->setCond(Reader.ReadSubExpr());
   D->setInit(Reader.ReadSubExpr());
   D->setInc(Reader.ReadSubExpr());
-  if (isOpenMPWorksharingDirective(D->getDirectiveKind())) {
+  D->setPreInits(Reader.ReadSubStmt());
+  if (isOpenMPWorksharingDirective(D->getDirectiveKind()) ||
+      isOpenMPTaskLoopDirective(D->getDirectiveKind()) ||
+      isOpenMPDistributeDirective(D->getDirectiveKind())) {
     D->setIsLastIterVariable(Reader.ReadSubExpr());
     D->setLowerBoundVariable(Reader.ReadSubExpr());
     D->setUpperBoundVariable(Reader.ReadSubExpr());
@@ -2288,6 +2602,11 @@
     D->setEnsureUpperBound(Reader.ReadSubExpr());
     D->setNextLowerBound(Reader.ReadSubExpr());
     D->setNextUpperBound(Reader.ReadSubExpr());
+    D->setNumIterations(Reader.ReadSubExpr());
+  }
+  if (isOpenMPLoopBoundSharingDirective(D->getDirectiveKind())) {
+    D->setPrevLowerBoundVariable(Reader.ReadSubExpr());
+    D->setPrevUpperBoundVariable(Reader.ReadSubExpr());
   }
   SmallVector<Expr *, 4> Sub;
   unsigned CollapsedNum = D->getCollapsedNumber();
@@ -2455,6 +2774,33 @@
   VisitOMPExecutableDirective(D);
 }
 
+void ASTStmtReader::VisitOMPTargetEnterDataDirective(
+    OMPTargetEnterDataDirective *D) {
+  VisitStmt(D);
+  ++Idx;
+  VisitOMPExecutableDirective(D);
+}
+
+void ASTStmtReader::VisitOMPTargetExitDataDirective(
+    OMPTargetExitDataDirective *D) {
+  VisitStmt(D);
+  ++Idx;
+  VisitOMPExecutableDirective(D);
+}
+
+void ASTStmtReader::VisitOMPTargetParallelDirective(
+    OMPTargetParallelDirective *D) {
+  VisitStmt(D);
+  ++Idx;
+  VisitOMPExecutableDirective(D);
+}
+
+void ASTStmtReader::VisitOMPTargetParallelForDirective(
+    OMPTargetParallelForDirective *D) {
+  VisitOMPLoopDirective(D);
+  D->setHasCancel(Record[Idx++]);
+}
+
 void ASTStmtReader::VisitOMPTeamsDirective(OMPTeamsDirective *D) {
   VisitStmt(D);
   // The NumClauses field was read in ReadStmtFromStream.
@@ -2489,6 +2835,40 @@
   VisitOMPLoopDirective(D);
 }
 
+void ASTStmtReader::VisitOMPTargetUpdateDirective(OMPTargetUpdateDirective *D) {
+  VisitStmt(D);
+  ++Idx;
+  VisitOMPExecutableDirective(D);
+}
+void ASTStmtReader::VisitOMPDistributeParallelForDirective(
+    OMPDistributeParallelForDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void ASTStmtReader::VisitOMPDistributeParallelForSimdDirective(
+    OMPDistributeParallelForSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void ASTStmtReader::VisitOMPDistributeSimdDirective(
+    OMPDistributeSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void ASTStmtReader::VisitOMPTargetParallelForSimdDirective(
+    OMPTargetParallelForSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void ASTStmtReader::VisitOMPTargetSimdDirective(OMPTargetSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void ASTStmtReader::VisitOMPTeamsDistributeDirective(
+    OMPTeamsDistributeDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
 //===----------------------------------------------------------------------===//
 // ASTReader Implementation
 //===----------------------------------------------------------------------===//
@@ -2945,6 +3325,9 @@
     case EXPR_OBJC_BOOL_LITERAL:
       S = new (Context) ObjCBoolLiteralExpr(Empty);
       break;
+    case EXPR_OBJC_AVAILABILITY_CHECK:
+      S = new (Context) ObjCAvailabilityCheckExpr(Empty);
+      break;
     case STMT_SEH_LEAVE:
       S = new (Context) SEHLeaveStmt(Empty);
       break;
@@ -3098,6 +3481,34 @@
           Context, Record[ASTStmtReader::NumStmtFields], Empty);
       break;
 
+    case STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE:
+      S = OMPTargetEnterDataDirective::CreateEmpty(
+          Context, Record[ASTStmtReader::NumStmtFields], Empty);
+      break;
+
+    case STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE:
+      S = OMPTargetExitDataDirective::CreateEmpty(
+          Context, Record[ASTStmtReader::NumStmtFields], Empty);
+      break;
+
+    case STMT_OMP_TARGET_PARALLEL_DIRECTIVE:
+      S = OMPTargetParallelDirective::CreateEmpty(
+          Context, Record[ASTStmtReader::NumStmtFields], Empty);
+      break;
+
+    case STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPTargetParallelForDirective::CreateEmpty(Context, NumClauses,
+                                                     CollapsedNum, Empty);
+      break;
+    }
+
+    case STMT_OMP_TARGET_UPDATE_DIRECTIVE:
+      S = OMPTargetUpdateDirective::CreateEmpty(
+          Context, Record[ASTStmtReader::NumStmtFields], Empty);
+      break;
+
     case STMT_OMP_TEAMS_DIRECTIVE:
       S = OMPTeamsDirective::CreateEmpty(
           Context, Record[ASTStmtReader::NumStmtFields], Empty);
@@ -3136,6 +3547,55 @@
       break;
     }
 
+    case STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPDistributeParallelForDirective::CreateEmpty(Context, NumClauses,
+                                                         CollapsedNum, Empty);
+      break;
+    }
+
+    case STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPDistributeParallelForSimdDirective::CreateEmpty(Context, NumClauses,
+                                                             CollapsedNum,
+                                                             Empty);
+      break;
+    }
+
+    case STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPDistributeSimdDirective::CreateEmpty(Context, NumClauses,
+                                                  CollapsedNum, Empty);
+      break;
+    }
+
+    case STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPTargetParallelForSimdDirective::CreateEmpty(Context, NumClauses,
+                                                         CollapsedNum, Empty);
+      break;
+    }
+
+    case STMT_OMP_TARGET_SIMD_DIRECTIVE: {
+      auto NumClauses = Record[ASTStmtReader::NumStmtFields];
+      auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPTargetSimdDirective::CreateEmpty(Context, NumClauses, CollapsedNum,
+                                              Empty);
+      break;
+    }
+
+    case STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE: {
+      auto NumClauses = Record[ASTStmtReader::NumStmtFields];
+      auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
+      S = OMPTeamsDistributeDirective::CreateEmpty(Context, NumClauses,
+                                                   CollapsedNum, Empty);
+      break;
+    }
+
     case EXPR_CXX_OPERATOR_CALL:
       S = new (Context) CXXOperatorCallExpr(Context, Empty);
       break;
@@ -3148,6 +3608,10 @@
       S = new (Context) CXXConstructExpr(Empty);
       break;
 
+    case EXPR_CXX_INHERITED_CTOR_INIT:
+      S = new (Context) CXXInheritedCtorInitExpr(Empty);
+      break;
+
     case EXPR_CXX_TEMPORARY_OBJECT:
       S = new (Context) CXXTemporaryObjectExpr(Empty);
       break;
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index cd0d62f..e1506ed 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Serialization/ASTWriter.h"
-#include "clang/Serialization/ModuleFileExtension.h"
 #include "ASTCommon.h"
 #include "ASTReaderInternals.h"
 #include "MultiOnDiskHashTable.h"
@@ -44,14 +43,15 @@
 #include "clang/Sema/IdentifierResolver.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Serialization/ASTReader.h"
+#include "clang/Serialization/ModuleFileExtension.h"
 #include "clang/Serialization/SerializationDiagnostic.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitcode/BitstreamWriter.h"
+#include "llvm/Support/Compression.h"
 #include "llvm/Support/EndianStream.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/OnDiskHashTable.h"
 #include "llvm/Support/Path.h"
@@ -81,19 +81,42 @@
 // Type serialization
 //===----------------------------------------------------------------------===//
 
-namespace {
+namespace clang {
   class ASTTypeWriter {
     ASTWriter &Writer;
-    ASTWriter::RecordDataImpl &Record;
+    ASTRecordWriter Record;
 
-  public:
     /// \brief Type code that corresponds to the record generated.
     TypeCode Code;
     /// \brief Abbreviation to use for the record, if any.
     unsigned AbbrevToUse;
 
+  public:
     ASTTypeWriter(ASTWriter &Writer, ASTWriter::RecordDataImpl &Record)
-      : Writer(Writer), Record(Record), Code(TYPE_EXT_QUAL) { }
+      : Writer(Writer), Record(Writer, Record), Code((TypeCode)0), AbbrevToUse(0) { }
+
+    uint64_t Emit() {
+      return Record.Emit(Code, AbbrevToUse);
+    }
+
+    void Visit(QualType T) {
+      if (T.hasLocalNonFastQualifiers()) {
+        Qualifiers Qs = T.getLocalQualifiers();
+        Record.AddTypeRef(T.getLocalUnqualifiedType());
+        Record.push_back(Qs.getAsOpaqueValue());
+        Code = TYPE_EXT_QUAL;
+        AbbrevToUse = Writer.TypeExtQualAbbrev;
+      } else {
+        switch (T->getTypeClass()) {
+          // For all of the concrete, non-dependent types, call the
+          // appropriate visitor function.
+#define TYPE(Class, Base) \
+        case Type::Class: Visit##Class##Type(cast<Class##Type>(T)); break;
+#define ABSTRACT_TYPE(Class, Base)
+#include "clang/AST/TypeNodes.def"
+        }
+      }
+    }
 
     void VisitArrayType(const ArrayType *T);
     void VisitFunctionType(const FunctionType *T);
@@ -103,64 +126,64 @@
 #define ABSTRACT_TYPE(Class, Base)
 #include "clang/AST/TypeNodes.def"
   };
-} // end anonymous namespace
+} // end namespace clang
 
 void ASTTypeWriter::VisitBuiltinType(const BuiltinType *T) {
   llvm_unreachable("Built-in types are never serialized");
 }
 
 void ASTTypeWriter::VisitComplexType(const ComplexType *T) {
-  Writer.AddTypeRef(T->getElementType(), Record);
+  Record.AddTypeRef(T->getElementType());
   Code = TYPE_COMPLEX;
 }
 
 void ASTTypeWriter::VisitPointerType(const PointerType *T) {
-  Writer.AddTypeRef(T->getPointeeType(), Record);
+  Record.AddTypeRef(T->getPointeeType());
   Code = TYPE_POINTER;
 }
 
 void ASTTypeWriter::VisitDecayedType(const DecayedType *T) {
-  Writer.AddTypeRef(T->getOriginalType(), Record);
+  Record.AddTypeRef(T->getOriginalType());
   Code = TYPE_DECAYED;
 }
 
 void ASTTypeWriter::VisitAdjustedType(const AdjustedType *T) {
-  Writer.AddTypeRef(T->getOriginalType(), Record);
-  Writer.AddTypeRef(T->getAdjustedType(), Record);
+  Record.AddTypeRef(T->getOriginalType());
+  Record.AddTypeRef(T->getAdjustedType());
   Code = TYPE_ADJUSTED;
 }
 
 void ASTTypeWriter::VisitBlockPointerType(const BlockPointerType *T) {
-  Writer.AddTypeRef(T->getPointeeType(), Record);
+  Record.AddTypeRef(T->getPointeeType());
   Code = TYPE_BLOCK_POINTER;
 }
 
 void ASTTypeWriter::VisitLValueReferenceType(const LValueReferenceType *T) {
-  Writer.AddTypeRef(T->getPointeeTypeAsWritten(), Record);
+  Record.AddTypeRef(T->getPointeeTypeAsWritten());
   Record.push_back(T->isSpelledAsLValue());
   Code = TYPE_LVALUE_REFERENCE;
 }
 
 void ASTTypeWriter::VisitRValueReferenceType(const RValueReferenceType *T) {
-  Writer.AddTypeRef(T->getPointeeTypeAsWritten(), Record);
+  Record.AddTypeRef(T->getPointeeTypeAsWritten());
   Code = TYPE_RVALUE_REFERENCE;
 }
 
 void ASTTypeWriter::VisitMemberPointerType(const MemberPointerType *T) {
-  Writer.AddTypeRef(T->getPointeeType(), Record);
-  Writer.AddTypeRef(QualType(T->getClass(), 0), Record);
+  Record.AddTypeRef(T->getPointeeType());
+  Record.AddTypeRef(QualType(T->getClass(), 0));
   Code = TYPE_MEMBER_POINTER;
 }
 
 void ASTTypeWriter::VisitArrayType(const ArrayType *T) {
-  Writer.AddTypeRef(T->getElementType(), Record);
+  Record.AddTypeRef(T->getElementType());
   Record.push_back(T->getSizeModifier()); // FIXME: stable values
   Record.push_back(T->getIndexTypeCVRQualifiers()); // FIXME: stable values
 }
 
 void ASTTypeWriter::VisitConstantArrayType(const ConstantArrayType *T) {
   VisitArrayType(T);
-  Writer.AddAPInt(T->getSize(), Record);
+  Record.AddAPInt(T->getSize());
   Code = TYPE_CONSTANT_ARRAY;
 }
 
@@ -171,14 +194,14 @@
 
 void ASTTypeWriter::VisitVariableArrayType(const VariableArrayType *T) {
   VisitArrayType(T);
-  Writer.AddSourceLocation(T->getLBracketLoc(), Record);
-  Writer.AddSourceLocation(T->getRBracketLoc(), Record);
-  Writer.AddStmt(T->getSizeExpr());
+  Record.AddSourceLocation(T->getLBracketLoc());
+  Record.AddSourceLocation(T->getRBracketLoc());
+  Record.AddStmt(T->getSizeExpr());
   Code = TYPE_VARIABLE_ARRAY;
 }
 
 void ASTTypeWriter::VisitVectorType(const VectorType *T) {
-  Writer.AddTypeRef(T->getElementType(), Record);
+  Record.AddTypeRef(T->getElementType());
   Record.push_back(T->getNumElements());
   Record.push_back(T->getVectorKind());
   Code = TYPE_VECTOR;
@@ -190,7 +213,7 @@
 }
 
 void ASTTypeWriter::VisitFunctionType(const FunctionType *T) {
-  Writer.AddTypeRef(T->getReturnType(), Record);
+  Record.AddTypeRef(T->getReturnType());
   FunctionType::ExtInfo C = T->getExtInfo();
   Record.push_back(C.getNoReturn());
   Record.push_back(C.getHasRegParm());
@@ -208,20 +231,20 @@
   Code = TYPE_FUNCTION_NO_PROTO;
 }
 
-static void addExceptionSpec(ASTWriter &Writer, const FunctionProtoType *T,
-                             ASTWriter::RecordDataImpl &Record) {
+static void addExceptionSpec(const FunctionProtoType *T,
+                             ASTRecordWriter &Record) {
   Record.push_back(T->getExceptionSpecType());
   if (T->getExceptionSpecType() == EST_Dynamic) {
     Record.push_back(T->getNumExceptions());
     for (unsigned I = 0, N = T->getNumExceptions(); I != N; ++I)
-      Writer.AddTypeRef(T->getExceptionType(I), Record);
+      Record.AddTypeRef(T->getExceptionType(I));
   } else if (T->getExceptionSpecType() == EST_ComputedNoexcept) {
-    Writer.AddStmt(T->getNoexceptExpr());
+    Record.AddStmt(T->getNoexceptExpr());
   } else if (T->getExceptionSpecType() == EST_Uninstantiated) {
-    Writer.AddDeclRef(T->getExceptionSpecDecl(), Record);
-    Writer.AddDeclRef(T->getExceptionSpecTemplate(), Record);
+    Record.AddDeclRef(T->getExceptionSpecDecl());
+    Record.AddDeclRef(T->getExceptionSpecTemplate());
   } else if (T->getExceptionSpecType() == EST_Unevaluated) {
-    Writer.AddDeclRef(T->getExceptionSpecDecl(), Record);
+    Record.AddDeclRef(T->getExceptionSpecDecl());
   }
 }
 
@@ -232,11 +255,11 @@
   Record.push_back(T->hasTrailingReturn());
   Record.push_back(T->getTypeQuals());
   Record.push_back(static_cast<unsigned>(T->getRefQualifier()));
-  addExceptionSpec(Writer, T, Record);
+  addExceptionSpec(T, Record);
 
   Record.push_back(T->getNumParams());
   for (unsigned I = 0, N = T->getNumParams(); I != N; ++I)
-    Writer.AddTypeRef(T->getParamType(I), Record);
+    Record.AddTypeRef(T->getParamType(I));
 
   if (T->hasExtParameterInfos()) {
     for (unsigned I = 0, N = T->getNumParams(); I != N; ++I)
@@ -252,42 +275,42 @@
 }
 
 void ASTTypeWriter::VisitUnresolvedUsingType(const UnresolvedUsingType *T) {
-  Writer.AddDeclRef(T->getDecl(), Record);
+  Record.AddDeclRef(T->getDecl());
   Code = TYPE_UNRESOLVED_USING;
 }
 
 void ASTTypeWriter::VisitTypedefType(const TypedefType *T) {
-  Writer.AddDeclRef(T->getDecl(), Record);
+  Record.AddDeclRef(T->getDecl());
   assert(!T->isCanonicalUnqualified() && "Invalid typedef ?");
-  Writer.AddTypeRef(T->getCanonicalTypeInternal(), Record);
+  Record.AddTypeRef(T->getCanonicalTypeInternal());
   Code = TYPE_TYPEDEF;
 }
 
 void ASTTypeWriter::VisitTypeOfExprType(const TypeOfExprType *T) {
-  Writer.AddStmt(T->getUnderlyingExpr());
+  Record.AddStmt(T->getUnderlyingExpr());
   Code = TYPE_TYPEOF_EXPR;
 }
 
 void ASTTypeWriter::VisitTypeOfType(const TypeOfType *T) {
-  Writer.AddTypeRef(T->getUnderlyingType(), Record);
+  Record.AddTypeRef(T->getUnderlyingType());
   Code = TYPE_TYPEOF;
 }
 
 void ASTTypeWriter::VisitDecltypeType(const DecltypeType *T) {
-  Writer.AddTypeRef(T->getUnderlyingType(), Record);
-  Writer.AddStmt(T->getUnderlyingExpr());
+  Record.AddTypeRef(T->getUnderlyingType());
+  Record.AddStmt(T->getUnderlyingExpr());
   Code = TYPE_DECLTYPE;
 }
 
 void ASTTypeWriter::VisitUnaryTransformType(const UnaryTransformType *T) {
-  Writer.AddTypeRef(T->getBaseType(), Record);
-  Writer.AddTypeRef(T->getUnderlyingType(), Record);
+  Record.AddTypeRef(T->getBaseType());
+  Record.AddTypeRef(T->getUnderlyingType());
   Record.push_back(T->getUTTKind());
   Code = TYPE_UNARY_TRANSFORM;
 }
 
 void ASTTypeWriter::VisitAutoType(const AutoType *T) {
-  Writer.AddTypeRef(T->getDeducedType(), Record);
+  Record.AddTypeRef(T->getDeducedType());
   Record.push_back((unsigned)T->getKeyword());
   if (T->getDeducedType().isNull())
     Record.push_back(T->isDependentType());
@@ -296,7 +319,7 @@
 
 void ASTTypeWriter::VisitTagType(const TagType *T) {
   Record.push_back(T->isDependentType());
-  Writer.AddDeclRef(T->getDecl()->getCanonicalDecl(), Record);
+  Record.AddDeclRef(T->getDecl()->getCanonicalDecl());
   assert(!T->isBeingDefined() &&
          "Cannot serialize in the middle of a type definition");
 }
@@ -312,8 +335,8 @@
 }
 
 void ASTTypeWriter::VisitAttributedType(const AttributedType *T) {
-  Writer.AddTypeRef(T->getModifiedType(), Record);
-  Writer.AddTypeRef(T->getEquivalentType(), Record);
+  Record.AddTypeRef(T->getModifiedType());
+  Record.AddTypeRef(T->getEquivalentType());
   Record.push_back(T->getAttrKind());
   Code = TYPE_ATTRIBUTED;
 }
@@ -321,16 +344,16 @@
 void
 ASTTypeWriter::VisitSubstTemplateTypeParmType(
                                         const SubstTemplateTypeParmType *T) {
-  Writer.AddTypeRef(QualType(T->getReplacedParameter(), 0), Record);
-  Writer.AddTypeRef(T->getReplacementType(), Record);
+  Record.AddTypeRef(QualType(T->getReplacedParameter(), 0));
+  Record.AddTypeRef(T->getReplacementType());
   Code = TYPE_SUBST_TEMPLATE_TYPE_PARM;
 }
 
 void
 ASTTypeWriter::VisitSubstTemplateTypeParmPackType(
                                       const SubstTemplateTypeParmPackType *T) {
-  Writer.AddTypeRef(QualType(T->getReplacedParameter(), 0), Record);
-  Writer.AddTemplateArgument(T->getArgumentPack(), Record);
+  Record.AddTypeRef(QualType(T->getReplacedParameter(), 0));
+  Record.AddTemplateArgument(T->getArgumentPack());
   Code = TYPE_SUBST_TEMPLATE_TYPE_PARM_PACK;
 }
 
@@ -338,22 +361,22 @@
 ASTTypeWriter::VisitTemplateSpecializationType(
                                        const TemplateSpecializationType *T) {
   Record.push_back(T->isDependentType());
-  Writer.AddTemplateName(T->getTemplateName(), Record);
+  Record.AddTemplateName(T->getTemplateName());
   Record.push_back(T->getNumArgs());
   for (const auto &ArgI : *T)
-    Writer.AddTemplateArgument(ArgI, Record);
-  Writer.AddTypeRef(T->isTypeAlias() ? T->getAliasedType() :
-                    T->isCanonicalUnqualified() ? QualType()
-                                                : T->getCanonicalTypeInternal(),
-                    Record);
+    Record.AddTemplateArgument(ArgI);
+  Record.AddTypeRef(T->isTypeAlias() ? T->getAliasedType()
+                                     : T->isCanonicalUnqualified()
+                                           ? QualType()
+                                           : T->getCanonicalTypeInternal());
   Code = TYPE_TEMPLATE_SPECIALIZATION;
 }
 
 void
 ASTTypeWriter::VisitDependentSizedArrayType(const DependentSizedArrayType *T) {
   VisitArrayType(T);
-  Writer.AddStmt(T->getSizeExpr());
-  Writer.AddSourceRange(T->getBracketsRange(), Record);
+  Record.AddStmt(T->getSizeExpr());
+  Record.AddSourceRange(T->getBracketsRange());
   Code = TYPE_DEPENDENT_SIZED_ARRAY;
 }
 
@@ -369,18 +392,17 @@
   Record.push_back(T->getDepth());
   Record.push_back(T->getIndex());
   Record.push_back(T->isParameterPack());
-  Writer.AddDeclRef(T->getDecl(), Record);
+  Record.AddDeclRef(T->getDecl());
   Code = TYPE_TEMPLATE_TYPE_PARM;
 }
 
 void
 ASTTypeWriter::VisitDependentNameType(const DependentNameType *T) {
   Record.push_back(T->getKeyword());
-  Writer.AddNestedNameSpecifier(T->getQualifier(), Record);
-  Writer.AddIdentifierRef(T->getIdentifier(), Record);
-  Writer.AddTypeRef(T->isCanonicalUnqualified() ? QualType()
-                                                : T->getCanonicalTypeInternal(),
-                    Record);
+  Record.AddNestedNameSpecifier(T->getQualifier());
+  Record.AddIdentifierRef(T->getIdentifier());
+  Record.AddTypeRef(
+      T->isCanonicalUnqualified() ? QualType() : T->getCanonicalTypeInternal());
   Code = TYPE_DEPENDENT_NAME;
 }
 
@@ -388,16 +410,16 @@
 ASTTypeWriter::VisitDependentTemplateSpecializationType(
                                 const DependentTemplateSpecializationType *T) {
   Record.push_back(T->getKeyword());
-  Writer.AddNestedNameSpecifier(T->getQualifier(), Record);
-  Writer.AddIdentifierRef(T->getIdentifier(), Record);
+  Record.AddNestedNameSpecifier(T->getQualifier());
+  Record.AddIdentifierRef(T->getIdentifier());
   Record.push_back(T->getNumArgs());
   for (const auto &I : *T)
-    Writer.AddTemplateArgument(I, Record);
+    Record.AddTemplateArgument(I);
   Code = TYPE_DEPENDENT_TEMPLATE_SPECIALIZATION;
 }
 
 void ASTTypeWriter::VisitPackExpansionType(const PackExpansionType *T) {
-  Writer.AddTypeRef(T->getPattern(), Record);
+  Record.AddTypeRef(T->getPattern());
   if (Optional<unsigned> NumExpansions = T->getNumExpansions())
     Record.push_back(*NumExpansions + 1);
   else
@@ -406,67 +428,66 @@
 }
 
 void ASTTypeWriter::VisitParenType(const ParenType *T) {
-  Writer.AddTypeRef(T->getInnerType(), Record);
+  Record.AddTypeRef(T->getInnerType());
   Code = TYPE_PAREN;
 }
 
 void ASTTypeWriter::VisitElaboratedType(const ElaboratedType *T) {
   Record.push_back(T->getKeyword());
-  Writer.AddNestedNameSpecifier(T->getQualifier(), Record);
-  Writer.AddTypeRef(T->getNamedType(), Record);
+  Record.AddNestedNameSpecifier(T->getQualifier());
+  Record.AddTypeRef(T->getNamedType());
   Code = TYPE_ELABORATED;
 }
 
 void ASTTypeWriter::VisitInjectedClassNameType(const InjectedClassNameType *T) {
-  Writer.AddDeclRef(T->getDecl()->getCanonicalDecl(), Record);
-  Writer.AddTypeRef(T->getInjectedSpecializationType(), Record);
+  Record.AddDeclRef(T->getDecl()->getCanonicalDecl());
+  Record.AddTypeRef(T->getInjectedSpecializationType());
   Code = TYPE_INJECTED_CLASS_NAME;
 }
 
 void ASTTypeWriter::VisitObjCInterfaceType(const ObjCInterfaceType *T) {
-  Writer.AddDeclRef(T->getDecl()->getCanonicalDecl(), Record);
+  Record.AddDeclRef(T->getDecl()->getCanonicalDecl());
   Code = TYPE_OBJC_INTERFACE;
 }
 
 void ASTTypeWriter::VisitObjCObjectType(const ObjCObjectType *T) {
-  Writer.AddTypeRef(T->getBaseType(), Record);
+  Record.AddTypeRef(T->getBaseType());
   Record.push_back(T->getTypeArgsAsWritten().size());
   for (auto TypeArg : T->getTypeArgsAsWritten())
-    Writer.AddTypeRef(TypeArg, Record);
+    Record.AddTypeRef(TypeArg);
   Record.push_back(T->getNumProtocols());
   for (const auto *I : T->quals())
-    Writer.AddDeclRef(I, Record);
+    Record.AddDeclRef(I);
   Record.push_back(T->isKindOfTypeAsWritten());
   Code = TYPE_OBJC_OBJECT;
 }
 
 void
 ASTTypeWriter::VisitObjCObjectPointerType(const ObjCObjectPointerType *T) {
-  Writer.AddTypeRef(T->getPointeeType(), Record);
+  Record.AddTypeRef(T->getPointeeType());
   Code = TYPE_OBJC_OBJECT_POINTER;
 }
 
 void
 ASTTypeWriter::VisitAtomicType(const AtomicType *T) {
-  Writer.AddTypeRef(T->getValueType(), Record);
+  Record.AddTypeRef(T->getValueType());
   Code = TYPE_ATOMIC;
 }
 
 void
 ASTTypeWriter::VisitPipeType(const PipeType *T) {
-  Writer.AddTypeRef(T->getElementType(), Record);
+  Record.AddTypeRef(T->getElementType());
   Code = TYPE_PIPE;
 }
 
 namespace {
 
 class TypeLocWriter : public TypeLocVisitor<TypeLocWriter> {
-  ASTWriter &Writer;
-  ASTWriter::RecordDataImpl &Record;
+  ASTRecordWriter &Record;
 
 public:
-  TypeLocWriter(ASTWriter &Writer, ASTWriter::RecordDataImpl &Record)
-    : Writer(Writer), Record(Record) { }
+  TypeLocWriter(ASTRecordWriter &Record)
+    : Record(Record) { }
 
 #define ABSTRACT_TYPELOC(CLASS, PARENT)
 #define TYPELOC(CLASS, PARENT) \
@@ -483,7 +504,7 @@
   // nothing to do
 }
 void TypeLocWriter::VisitBuiltinTypeLoc(BuiltinTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getBuiltinLoc(), Record);
+  Record.AddSourceLocation(TL.getBuiltinLoc());
   if (TL.needsExtraLocalData()) {
     Record.push_back(TL.getWrittenTypeSpec());
     Record.push_back(TL.getWrittenSignSpec());
@@ -492,10 +513,10 @@
   }
 }
 void TypeLocWriter::VisitComplexTypeLoc(ComplexTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitPointerTypeLoc(PointerTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getStarLoc(), Record);
+  Record.AddSourceLocation(TL.getStarLoc());
 }
 void TypeLocWriter::VisitDecayedTypeLoc(DecayedTypeLoc TL) {
   // nothing to do
@@ -504,24 +525,24 @@
   // nothing to do
 }
 void TypeLocWriter::VisitBlockPointerTypeLoc(BlockPointerTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getCaretLoc(), Record);
+  Record.AddSourceLocation(TL.getCaretLoc());
 }
 void TypeLocWriter::VisitLValueReferenceTypeLoc(LValueReferenceTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getAmpLoc(), Record);
+  Record.AddSourceLocation(TL.getAmpLoc());
 }
 void TypeLocWriter::VisitRValueReferenceTypeLoc(RValueReferenceTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getAmpAmpLoc(), Record);
+  Record.AddSourceLocation(TL.getAmpAmpLoc());
 }
 void TypeLocWriter::VisitMemberPointerTypeLoc(MemberPointerTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getStarLoc(), Record);
-  Writer.AddTypeSourceInfo(TL.getClassTInfo(), Record);
+  Record.AddSourceLocation(TL.getStarLoc());
+  Record.AddTypeSourceInfo(TL.getClassTInfo());
 }
 void TypeLocWriter::VisitArrayTypeLoc(ArrayTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getLBracketLoc(), Record);
-  Writer.AddSourceLocation(TL.getRBracketLoc(), Record);
+  Record.AddSourceLocation(TL.getLBracketLoc());
+  Record.AddSourceLocation(TL.getRBracketLoc());
   Record.push_back(TL.getSizeExpr() ? 1 : 0);
   if (TL.getSizeExpr())
-    Writer.AddStmt(TL.getSizeExpr());
+    Record.AddStmt(TL.getSizeExpr());
 }
 void TypeLocWriter::VisitConstantArrayTypeLoc(ConstantArrayTypeLoc TL) {
   VisitArrayTypeLoc(TL);
@@ -538,21 +559,21 @@
 }
 void TypeLocWriter::VisitDependentSizedExtVectorTypeLoc(
                                         DependentSizedExtVectorTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitVectorTypeLoc(VectorTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitExtVectorTypeLoc(ExtVectorTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitFunctionTypeLoc(FunctionTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getLocalRangeBegin(), Record);
-  Writer.AddSourceLocation(TL.getLParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getRParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getLocalRangeEnd(), Record);
+  Record.AddSourceLocation(TL.getLocalRangeBegin());
+  Record.AddSourceLocation(TL.getLParenLoc());
+  Record.AddSourceLocation(TL.getRParenLoc());
+  Record.AddSourceLocation(TL.getLocalRangeEnd());
   for (unsigned i = 0, e = TL.getNumParams(); i != e; ++i)
-    Writer.AddDeclRef(TL.getParam(i), Record);
+    Record.AddDeclRef(TL.getParam(i));
 }
 void TypeLocWriter::VisitFunctionProtoTypeLoc(FunctionProtoTypeLoc TL) {
   VisitFunctionTypeLoc(TL);
@@ -561,131 +582,131 @@
   VisitFunctionTypeLoc(TL);
 }
 void TypeLocWriter::VisitUnresolvedUsingTypeLoc(UnresolvedUsingTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitTypedefTypeLoc(TypedefTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitTypeOfExprTypeLoc(TypeOfExprTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getTypeofLoc(), Record);
-  Writer.AddSourceLocation(TL.getLParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getRParenLoc(), Record);
+  Record.AddSourceLocation(TL.getTypeofLoc());
+  Record.AddSourceLocation(TL.getLParenLoc());
+  Record.AddSourceLocation(TL.getRParenLoc());
 }
 void TypeLocWriter::VisitTypeOfTypeLoc(TypeOfTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getTypeofLoc(), Record);
-  Writer.AddSourceLocation(TL.getLParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getRParenLoc(), Record);
-  Writer.AddTypeSourceInfo(TL.getUnderlyingTInfo(), Record);
+  Record.AddSourceLocation(TL.getTypeofLoc());
+  Record.AddSourceLocation(TL.getLParenLoc());
+  Record.AddSourceLocation(TL.getRParenLoc());
+  Record.AddTypeSourceInfo(TL.getUnderlyingTInfo());
 }
 void TypeLocWriter::VisitDecltypeTypeLoc(DecltypeTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitUnaryTransformTypeLoc(UnaryTransformTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getKWLoc(), Record);
-  Writer.AddSourceLocation(TL.getLParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getRParenLoc(), Record);
-  Writer.AddTypeSourceInfo(TL.getUnderlyingTInfo(), Record);
+  Record.AddSourceLocation(TL.getKWLoc());
+  Record.AddSourceLocation(TL.getLParenLoc());
+  Record.AddSourceLocation(TL.getRParenLoc());
+  Record.AddTypeSourceInfo(TL.getUnderlyingTInfo());
 }
 void TypeLocWriter::VisitAutoTypeLoc(AutoTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitRecordTypeLoc(RecordTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitEnumTypeLoc(EnumTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getAttrNameLoc(), Record);
+  Record.AddSourceLocation(TL.getAttrNameLoc());
   if (TL.hasAttrOperand()) {
     SourceRange range = TL.getAttrOperandParensRange();
-    Writer.AddSourceLocation(range.getBegin(), Record);
-    Writer.AddSourceLocation(range.getEnd(), Record);
+    Record.AddSourceLocation(range.getBegin());
+    Record.AddSourceLocation(range.getEnd());
   }
   if (TL.hasAttrExprOperand()) {
     Expr *operand = TL.getAttrExprOperand();
     Record.push_back(operand ? 1 : 0);
-    if (operand) Writer.AddStmt(operand);
+    if (operand) Record.AddStmt(operand);
   } else if (TL.hasAttrEnumOperand()) {
-    Writer.AddSourceLocation(TL.getAttrEnumOperandLoc(), Record);
+    Record.AddSourceLocation(TL.getAttrEnumOperandLoc());
   }
 }
 void TypeLocWriter::VisitTemplateTypeParmTypeLoc(TemplateTypeParmTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitSubstTemplateTypeParmTypeLoc(
                                             SubstTemplateTypeParmTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitSubstTemplateTypeParmPackTypeLoc(
                                           SubstTemplateTypeParmPackTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitTemplateSpecializationTypeLoc(
                                            TemplateSpecializationTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getTemplateKeywordLoc(), Record);
-  Writer.AddSourceLocation(TL.getTemplateNameLoc(), Record);
-  Writer.AddSourceLocation(TL.getLAngleLoc(), Record);
-  Writer.AddSourceLocation(TL.getRAngleLoc(), Record);
+  Record.AddSourceLocation(TL.getTemplateKeywordLoc());
+  Record.AddSourceLocation(TL.getTemplateNameLoc());
+  Record.AddSourceLocation(TL.getLAngleLoc());
+  Record.AddSourceLocation(TL.getRAngleLoc());
   for (unsigned i = 0, e = TL.getNumArgs(); i != e; ++i)
-    Writer.AddTemplateArgumentLocInfo(TL.getArgLoc(i).getArgument().getKind(),
-                                      TL.getArgLoc(i).getLocInfo(), Record);
+    Record.AddTemplateArgumentLocInfo(TL.getArgLoc(i).getArgument().getKind(),
+                                      TL.getArgLoc(i).getLocInfo());
 }
 void TypeLocWriter::VisitParenTypeLoc(ParenTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getLParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getRParenLoc(), Record);
+  Record.AddSourceLocation(TL.getLParenLoc());
+  Record.AddSourceLocation(TL.getRParenLoc());
 }
 void TypeLocWriter::VisitElaboratedTypeLoc(ElaboratedTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getElaboratedKeywordLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(TL.getQualifierLoc(), Record);
+  Record.AddSourceLocation(TL.getElaboratedKeywordLoc());
+  Record.AddNestedNameSpecifierLoc(TL.getQualifierLoc());
 }
 void TypeLocWriter::VisitInjectedClassNameTypeLoc(InjectedClassNameTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitDependentNameTypeLoc(DependentNameTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getElaboratedKeywordLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(TL.getQualifierLoc(), Record);
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getElaboratedKeywordLoc());
+  Record.AddNestedNameSpecifierLoc(TL.getQualifierLoc());
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitDependentTemplateSpecializationTypeLoc(
        DependentTemplateSpecializationTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getElaboratedKeywordLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(TL.getQualifierLoc(), Record);
-  Writer.AddSourceLocation(TL.getTemplateKeywordLoc(), Record);
-  Writer.AddSourceLocation(TL.getTemplateNameLoc(), Record);
-  Writer.AddSourceLocation(TL.getLAngleLoc(), Record);
-  Writer.AddSourceLocation(TL.getRAngleLoc(), Record);
+  Record.AddSourceLocation(TL.getElaboratedKeywordLoc());
+  Record.AddNestedNameSpecifierLoc(TL.getQualifierLoc());
+  Record.AddSourceLocation(TL.getTemplateKeywordLoc());
+  Record.AddSourceLocation(TL.getTemplateNameLoc());
+  Record.AddSourceLocation(TL.getLAngleLoc());
+  Record.AddSourceLocation(TL.getRAngleLoc());
   for (unsigned I = 0, E = TL.getNumArgs(); I != E; ++I)
-    Writer.AddTemplateArgumentLocInfo(TL.getArgLoc(I).getArgument().getKind(),
-                                      TL.getArgLoc(I).getLocInfo(), Record);
+    Record.AddTemplateArgumentLocInfo(TL.getArgLoc(I).getArgument().getKind(),
+                                      TL.getArgLoc(I).getLocInfo());
 }
 void TypeLocWriter::VisitPackExpansionTypeLoc(PackExpansionTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getEllipsisLoc(), Record);
+  Record.AddSourceLocation(TL.getEllipsisLoc());
 }
 void TypeLocWriter::VisitObjCInterfaceTypeLoc(ObjCInterfaceTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getNameLoc(), Record);
+  Record.AddSourceLocation(TL.getNameLoc());
 }
 void TypeLocWriter::VisitObjCObjectTypeLoc(ObjCObjectTypeLoc TL) {
   Record.push_back(TL.hasBaseTypeAsWritten());
-  Writer.AddSourceLocation(TL.getTypeArgsLAngleLoc(), Record);
-  Writer.AddSourceLocation(TL.getTypeArgsRAngleLoc(), Record);
+  Record.AddSourceLocation(TL.getTypeArgsLAngleLoc());
+  Record.AddSourceLocation(TL.getTypeArgsRAngleLoc());
   for (unsigned i = 0, e = TL.getNumTypeArgs(); i != e; ++i)
-    Writer.AddTypeSourceInfo(TL.getTypeArgTInfo(i), Record);
-  Writer.AddSourceLocation(TL.getProtocolLAngleLoc(), Record);
-  Writer.AddSourceLocation(TL.getProtocolRAngleLoc(), Record);
+    Record.AddTypeSourceInfo(TL.getTypeArgTInfo(i));
+  Record.AddSourceLocation(TL.getProtocolLAngleLoc());
+  Record.AddSourceLocation(TL.getProtocolRAngleLoc());
   for (unsigned i = 0, e = TL.getNumProtocols(); i != e; ++i)
-    Writer.AddSourceLocation(TL.getProtocolLoc(i), Record);
+    Record.AddSourceLocation(TL.getProtocolLoc(i));
 }
 void TypeLocWriter::VisitObjCObjectPointerTypeLoc(ObjCObjectPointerTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getStarLoc(), Record);
+  Record.AddSourceLocation(TL.getStarLoc());
 }
 void TypeLocWriter::VisitAtomicTypeLoc(AtomicTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getKWLoc(), Record);
-  Writer.AddSourceLocation(TL.getLParenLoc(), Record);
-  Writer.AddSourceLocation(TL.getRParenLoc(), Record);
+  Record.AddSourceLocation(TL.getKWLoc());
+  Record.AddSourceLocation(TL.getLParenLoc());
+  Record.AddSourceLocation(TL.getRParenLoc());
 }
 void TypeLocWriter::VisitPipeTypeLoc(PipeTypeLoc TL) {
-  Writer.AddSourceLocation(TL.getKWLoc(), Record);
+  Record.AddSourceLocation(TL.getKWLoc());
 }
 
 void ASTWriter::WriteTypeAbbrevs() {
@@ -936,11 +957,9 @@
   RECORD(SEMA_DECL_REFS);
   RECORD(WEAK_UNDECLARED_IDENTIFIERS);
   RECORD(PENDING_IMPLICIT_INSTANTIATIONS);
-  RECORD(DECL_REPLACEMENTS);
   RECORD(UPDATE_VISIBLE);
   RECORD(DECL_UPDATE_OFFSETS);
   RECORD(DECL_UPDATES);
-  RECORD(CXX_BASE_SPECIFIER_OFFSETS);
   RECORD(DIAG_PRAGMA_MAPPINGS);
   RECORD(CUDA_SPECIAL_DECL_REFS);
   RECORD(HEADER_SEARCH_TABLE);
@@ -959,8 +978,9 @@
   RECORD(UNDEFINED_BUT_USED);
   RECORD(LATE_PARSED_TEMPLATE);
   RECORD(OPTIMIZE_PRAGMA_OPTIONS);
+  RECORD(MSSTRUCT_PRAGMA_OPTIONS);
+  RECORD(POINTERS_TO_MEMBERS_PRAGMA_OPTIONS);
   RECORD(UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES);
-  RECORD(CXX_CTOR_INITIALIZERS_OFFSETS);
   RECORD(DELETE_EXPRS_TO_ANALYZE);
 
   // SourceManager Block.
@@ -968,6 +988,7 @@
   RECORD(SM_SLOC_FILE_ENTRY);
   RECORD(SM_SLOC_BUFFER_ENTRY);
   RECORD(SM_SLOC_BUFFER_BLOB);
+  RECORD(SM_SLOC_BUFFER_BLOB_COMPRESSED);
   RECORD(SM_SLOC_EXPANSION_ENTRY);
 
   // Preprocessor Block.
@@ -996,6 +1017,7 @@
   RECORD(SUBMODULE_PRIVATE_HEADER);
   RECORD(SUBMODULE_TEXTUAL_HEADER);
   RECORD(SUBMODULE_PRIVATE_TEXTUAL_HEADER);
+  RECORD(SUBMODULE_INITIALIZERS);
 
   // Comments Block.
   BLOCK(COMMENTS_BLOCK);
@@ -1082,6 +1104,7 @@
   RECORD(DECL_CXX_RECORD);
   RECORD(DECL_CXX_METHOD);
   RECORD(DECL_CXX_CONSTRUCTOR);
+  RECORD(DECL_CXX_INHERITED_CONSTRUCTOR);
   RECORD(DECL_CXX_DESTRUCTOR);
   RECORD(DECL_CXX_CONVERSION);
   RECORD(DECL_ACCESS_SPEC);
@@ -1097,10 +1120,22 @@
   RECORD(DECL_TEMPLATE_TYPE_PARM);
   RECORD(DECL_NON_TYPE_TEMPLATE_PARM);
   RECORD(DECL_TEMPLATE_TEMPLATE_PARM);
+  RECORD(DECL_TYPE_ALIAS_TEMPLATE);
   RECORD(DECL_STATIC_ASSERT);
   RECORD(DECL_CXX_BASE_SPECIFIERS);
+  RECORD(DECL_CXX_CTOR_INITIALIZERS);
   RECORD(DECL_INDIRECTFIELD);
   RECORD(DECL_EXPANDED_NON_TYPE_TEMPLATE_PARM_PACK);
+  RECORD(DECL_EXPANDED_TEMPLATE_TEMPLATE_PARM_PACK);
+  RECORD(DECL_CLASS_SCOPE_FUNCTION_SPECIALIZATION);
+  RECORD(DECL_IMPORT);
+  RECORD(DECL_OMP_THREADPRIVATE);
+  RECORD(DECL_EMPTY);
+  RECORD(DECL_OBJC_TYPE_PARAM);
+  RECORD(DECL_OMP_CAPTUREDEXPR);
+  RECORD(DECL_PRAGMA_COMMENT);
+  RECORD(DECL_PRAGMA_DETECT_MISMATCH);
+  RECORD(DECL_OMP_DECLARE_REDUCTION);
   
   // Statements and Exprs can occur in the Decls and Types block.
   AddStmtsExprs(Stream, Record);
@@ -1637,11 +1672,15 @@
 
 /// \brief Create an abbreviation for the SLocEntry that refers to a
 /// buffer's blob.
-static unsigned CreateSLocBufferBlobAbbrev(llvm::BitstreamWriter &Stream) {
+static unsigned CreateSLocBufferBlobAbbrev(llvm::BitstreamWriter &Stream,
+                                           bool Compressed) {
   using namespace llvm;
 
   auto *Abbrev = new BitCodeAbbrev();
-  Abbrev->Add(BitCodeAbbrevOp(SM_SLOC_BUFFER_BLOB));
+  Abbrev->Add(BitCodeAbbrevOp(Compressed ? SM_SLOC_BUFFER_BLOB_COMPRESSED
+                                         : SM_SLOC_BUFFER_BLOB));
+  if (Compressed)
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Uncompressed size
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Blob
   return Stream.EmitAbbrev(Abbrev);
 }
@@ -1863,12 +1902,14 @@
   RecordData Record;
 
   // Enter the source manager block.
-  Stream.EnterSubblock(SOURCE_MANAGER_BLOCK_ID, 3);
+  Stream.EnterSubblock(SOURCE_MANAGER_BLOCK_ID, 4);
 
   // Abbreviations for the various kinds of source-location entries.
   unsigned SLocFileAbbrv = CreateSLocFileAbbrev(Stream);
   unsigned SLocBufferAbbrv = CreateSLocBufferAbbrev(Stream);
-  unsigned SLocBufferBlobAbbrv = CreateSLocBufferBlobAbbrev(Stream);
+  unsigned SLocBufferBlobAbbrv = CreateSLocBufferBlobAbbrev(Stream, false);
+  unsigned SLocBufferBlobCompressedAbbrv =
+      CreateSLocBufferBlobAbbrev(Stream, true);
   unsigned SLocExpansionAbbrv = CreateSLocExpansionAbbrev(Stream);
 
   // Write out the source location entry table. We skip the first
@@ -1903,11 +1944,12 @@
     Record.push_back(SLoc->getOffset() - 2);
     if (SLoc->isFile()) {
       const SrcMgr::FileInfo &File = SLoc->getFile();
-      Record.push_back(File.getIncludeLoc().getRawEncoding());
+      AddSourceLocation(File.getIncludeLoc(), Record);
       Record.push_back(File.getFileCharacteristic()); // FIXME: stable encoding
       Record.push_back(File.hasLineDirectives());
 
       const SrcMgr::ContentCache *Content = File.getContentCache();
+      bool EmitBlob = false;
       if (Content->OrigEntry) {
         assert(Content->OrigEntry == Content->ContentsEntry &&
                "Writing to AST an overridden file is not supported");
@@ -1929,14 +1971,8 @@
         
         Stream.EmitRecordWithAbbrev(SLocFileAbbrv, Record);
         
-        if (Content->BufferOverridden || Content->IsTransient) {
-          RecordData::value_type Record[] = {SM_SLOC_BUFFER_BLOB};
-          const llvm::MemoryBuffer *Buffer
-            = Content->getBuffer(PP.getDiagnostics(), PP.getSourceManager());
-          Stream.EmitRecordWithBlob(SLocBufferBlobAbbrv, Record,
-                                    StringRef(Buffer->getBufferStart(),
-                                              Buffer->getBufferSize() + 1));          
-        }
+        if (Content->BufferOverridden || Content->IsTransient)
+          EmitBlob = true;
       } else {
         // The source location entry is a buffer. The blob associated
         // with this entry contains the contents of the buffer.
@@ -1949,22 +1985,43 @@
         const char *Name = Buffer->getBufferIdentifier();
         Stream.EmitRecordWithBlob(SLocBufferAbbrv, Record,
                                   StringRef(Name, strlen(Name) + 1));
-        RecordData::value_type Record[] = {SM_SLOC_BUFFER_BLOB};
-        Stream.EmitRecordWithBlob(SLocBufferBlobAbbrv, Record,
-                                  StringRef(Buffer->getBufferStart(),
-                                                  Buffer->getBufferSize() + 1));
+        EmitBlob = true;
 
         if (strcmp(Name, "<built-in>") == 0) {
           PreloadSLocs.push_back(SLocEntryOffsets.size());
         }
       }
+
+      if (EmitBlob) {
+        // Include the implicit terminating null character in the on-disk buffer
+        // if we're writing it uncompressed.
+        const llvm::MemoryBuffer *Buffer =
+            Content->getBuffer(PP.getDiagnostics(), PP.getSourceManager());
+        StringRef Blob(Buffer->getBufferStart(), Buffer->getBufferSize() + 1);
+
+        // Compress the buffer if possible. We expect that almost all PCM
+        // consumers will not want its contents.
+        SmallString<0> CompressedBuffer;
+        if (llvm::zlib::compress(Blob.drop_back(1), CompressedBuffer) ==
+            llvm::zlib::StatusOK) {
+          RecordData::value_type Record[] = {SM_SLOC_BUFFER_BLOB_COMPRESSED,
+                                             Blob.size() - 1};
+          Stream.EmitRecordWithBlob(SLocBufferBlobCompressedAbbrv, Record,
+                                    CompressedBuffer);
+        } else {
+          RecordData::value_type Record[] = {SM_SLOC_BUFFER_BLOB};
+          Stream.EmitRecordWithBlob(SLocBufferBlobAbbrv, Record, Blob);
+        }
+      }
     } else {
       // The source location entry is a macro expansion.
       const SrcMgr::ExpansionInfo &Expansion = SLoc->getExpansion();
-      Record.push_back(Expansion.getSpellingLoc().getRawEncoding());
-      Record.push_back(Expansion.getExpansionLocStart().getRawEncoding());
-      Record.push_back(Expansion.isMacroArgExpansion() ? 0
-                             : Expansion.getExpansionLocEnd().getRawEncoding());
+      AddSourceLocation(Expansion.getSpellingLoc(), Record);
+      AddSourceLocation(Expansion.getExpansionLocStart(), Record);
+      AddSourceLocation(Expansion.isMacroArgExpansion()
+                            ? SourceLocation()
+                            : Expansion.getExpansionLocEnd(),
+                        Record);
 
       // Compute the token length for this macro expansion.
       unsigned NextOffset = SourceMgr.getNextLocalOffset();
@@ -2361,7 +2418,9 @@
   if (Known != SubmoduleIDs.end())
     return Known->second;
 
-  if (Mod->getTopLevelModule() != WritingModule)
+  auto *Top = Mod->getTopLevelModule();
+  if (Top != WritingModule &&
+      !Top->fullModuleNameIs(StringRef(getLangOpts().CurrentModule)))
     return 0;
 
   return SubmoduleIDs[Mod] = NextSubmoduleID++;
@@ -2594,6 +2653,13 @@
       Stream.EmitRecordWithBlob(ConfigMacroAbbrev, Record, CM);
     }
 
+    // Emit the initializers, if any.
+    RecordData Inits;
+    for (Decl *D : Context->getModuleInitializers(Mod))
+      Inits.push_back(GetDeclRef(D));
+    if (!Inits.empty())
+      Stream.EmitRecord(SUBMODULE_INITIALIZERS, Inits);
+
     // Queue up the submodules of this module.
     for (auto *M : Mod->submodules())
       Q.push(M);
@@ -2646,7 +2712,7 @@
     if (point.Loc.isInvalid())
       continue;
 
-    Record.push_back(point.Loc.getRawEncoding());
+    AddSourceLocation(point.Loc, Record);
     unsigned &DiagStateID = DiagStateIDMap[point.State];
     Record.push_back(DiagStateID);
     
@@ -2667,95 +2733,36 @@
     Stream.EmitRecord(DIAG_PRAGMA_MAPPINGS, Record);
 }
 
-void ASTWriter::WriteCXXCtorInitializersOffsets() {
-  if (CXXCtorInitializersOffsets.empty())
-    return;
-
-  // Create a blob abbreviation for the C++ ctor initializer offsets.
-  using namespace llvm;
-
-  auto *Abbrev = new BitCodeAbbrev();
-  Abbrev->Add(BitCodeAbbrevOp(CXX_CTOR_INITIALIZERS_OFFSETS));
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // size
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned CtorInitializersOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
-
-  // Write the base specifier offsets table.
-  RecordData::value_type Record[] = {CXX_CTOR_INITIALIZERS_OFFSETS,
-                                     CXXCtorInitializersOffsets.size()};
-  Stream.EmitRecordWithBlob(CtorInitializersOffsetAbbrev, Record,
-                            bytes(CXXCtorInitializersOffsets));
-}
-
-void ASTWriter::WriteCXXBaseSpecifiersOffsets() {
-  if (CXXBaseSpecifiersOffsets.empty())
-    return;
-
-  // Create a blob abbreviation for the C++ base specifiers offsets.
-  using namespace llvm;
-    
-  auto *Abbrev = new BitCodeAbbrev();
-  Abbrev->Add(BitCodeAbbrevOp(CXX_BASE_SPECIFIER_OFFSETS));
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // size
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned BaseSpecifierOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
-  
-  // Write the base specifier offsets table.
-  RecordData::value_type Record[] = {CXX_BASE_SPECIFIER_OFFSETS,
-                                     CXXBaseSpecifiersOffsets.size()};
-  Stream.EmitRecordWithBlob(BaseSpecifierOffsetAbbrev, Record,
-                            bytes(CXXBaseSpecifiersOffsets));
-}
-
 //===----------------------------------------------------------------------===//
 // Type Serialization
 //===----------------------------------------------------------------------===//
 
 /// \brief Write the representation of a type to the AST stream.
 void ASTWriter::WriteType(QualType T) {
-  TypeIdx &Idx = TypeIdxs[T];
-  if (Idx.getIndex() == 0) // we haven't seen this type before.
-    Idx = TypeIdx(NextTypeID++);
+  TypeIdx &IdxRef = TypeIdxs[T];
+  if (IdxRef.getIndex() == 0) // we haven't seen this type before.
+    IdxRef = TypeIdx(NextTypeID++);
+  TypeIdx Idx = IdxRef;
 
   assert(Idx.getIndex() >= FirstTypeID && "Re-writing a type from a prior AST");
 
-  // Record the offset for this type.
-  unsigned Index = Idx.getIndex() - FirstTypeID;
-  if (TypeOffsets.size() == Index)
-    TypeOffsets.push_back(Stream.GetCurrentBitNo());
-  else if (TypeOffsets.size() < Index) {
-    TypeOffsets.resize(Index + 1);
-    TypeOffsets[Index] = Stream.GetCurrentBitNo();
-  }
-
   RecordData Record;
 
   // Emit the type's representation.
   ASTTypeWriter W(*this, Record);
-  W.AbbrevToUse = 0;
+  W.Visit(T);
+  uint64_t Offset = W.Emit();
 
-  if (T.hasLocalNonFastQualifiers()) {
-    Qualifiers Qs = T.getLocalQualifiers();
-    AddTypeRef(T.getLocalUnqualifiedType(), Record);
-    Record.push_back(Qs.getAsOpaqueValue());
-    W.Code = TYPE_EXT_QUAL;
-    W.AbbrevToUse = TypeExtQualAbbrev;
+  // Record the offset for this type.
+  unsigned Index = Idx.getIndex() - FirstTypeID;
+  if (TypeOffsets.size() == Index)
+    TypeOffsets.push_back(Offset);
+  else if (TypeOffsets.size() < Index) {
+    TypeOffsets.resize(Index + 1);
+    TypeOffsets[Index] = Offset;
   } else {
-    switch (T->getTypeClass()) {
-      // For all of the concrete, non-dependent types, call the
-      // appropriate visitor function.
-#define TYPE(Class, Base) \
-    case Type::Class: W.Visit##Class##Type(cast<Class##Type>(T)); break;
-#define ABSTRACT_TYPE(Class, Base)
-#include "clang/AST/TypeNodes.def"
-    }
+    llvm_unreachable("Types emitted in wrong order");
   }
-
-  // Emit the serialized record.
-  Stream.EmitRecord(W.Code, Record, W.AbbrevToUse);
-
-  // Flush any expressions that were written as part of this type.
-  FlushStmts();
 }
 
 //===----------------------------------------------------------------------===//
@@ -3079,6 +3086,7 @@
     return;
 
   RecordData Record;
+  ASTRecordWriter Writer(*this, Record);
 
   // Note: this writes out all references even for a dependent AST. But it is
   // very tricky to fix, and given that @selector shouldn't really appear in
@@ -3086,10 +3094,10 @@
   for (auto &SelectorAndLocation : SemaRef.ReferencedSelectors) {
     Selector Sel = SelectorAndLocation.first;
     SourceLocation Loc = SelectorAndLocation.second;
-    AddSelectorRef(Sel, Record);
-    AddSourceLocation(Loc, Record);
+    Writer.AddSelectorRef(Sel);
+    Writer.AddSourceLocation(Loc);
   }
-  Stream.EmitRecord(REFERENCED_SELECTOR_POOL, Record);
+  Writer.Emit(REFERENCED_SELECTOR_POOL);
 }
 
 //===----------------------------------------------------------------------===//
@@ -3109,11 +3117,20 @@
   if (Decl *Redecl = D->getPreviousDecl()) {
     // For Redeclarable decls, a prior declaration might be local.
     for (; Redecl; Redecl = Redecl->getPreviousDecl()) {
-      if (!Redecl->isFromASTFile())
+      // If we find a local decl, we're done.
+      if (!Redecl->isFromASTFile()) {
+        // Exception: in very rare cases (for injected-class-names), not all
+        // redeclarations are in the same semantic context. Skip ones in a
+        // different context. They don't go in this lookup table at all.
+        if (!Redecl->getDeclContext()->getRedeclContext()->Equals(
+                D->getDeclContext()->getRedeclContext()))
+          continue;
         return cast<NamedDecl>(Redecl);
+      }
+
       // If we find a decl from a (chained-)PCH stop since we won't find a
       // local one.
-      if (D->getOwningModuleID() == 0)
+      if (Redecl->getOwningModuleID() == 0)
         break;
     }
   } else if (Decl *First = D->getCanonicalDecl()) {
@@ -3168,6 +3185,8 @@
         NeedDecls(!IsModule || !Writer.getLangOpts().CPlusPlus),
         InterestingIdentifierOffsets(InterestingIdentifierOffsets) {}
 
+  bool needDecls() const { return NeedDecls; }
+
   static hash_value_type ComputeHash(const IdentifierInfo* II) {
     return llvm::HashString(II->getName());
   }
@@ -3313,7 +3332,12 @@
       auto *II = const_cast<IdentifierInfo *>(IdentIDPair.first);
       IdentID ID = IdentIDPair.second;
       assert(II && "NULL identifier in identifier table");
-      if (!Chain || !II->isFromAST() || II->hasChangedSinceDeserialization())
+      // Write out identifiers if either the ID is local or the identifier has
+      // changed since it was loaded.
+      if (ID >= FirstIdentID || !Chain || !II->isFromAST()
+          || II->hasChangedSinceDeserialization() ||
+          (Trait.needDecls() &&
+           II->hasFETokenInfoChangedSinceDeserialization()))
         Generator.insert(II, ID, Trait);
     }
 
@@ -3902,6 +3926,22 @@
   Stream.EmitRecord(OPTIMIZE_PRAGMA_OPTIONS, Record);
 }
 
+/// \brief Write the state of 'pragma ms_struct' at the end of the module.
+void ASTWriter::WriteMSStructPragmaOptions(Sema &SemaRef) {
+  RecordData Record;
+  Record.push_back(SemaRef.MSStructPragmaOn ? PMSST_ON : PMSST_OFF);
+  Stream.EmitRecord(MSSTRUCT_PRAGMA_OPTIONS, Record);
+}
+
+/// \brief Write the state of 'pragma pointers_to_members' at the end of the
+/// module.
+void ASTWriter::WriteMSPointersToMembersPragmaOptions(Sema &SemaRef) {
+  RecordData Record;
+  Record.push_back(SemaRef.MSPointerToMemberRepresentationMethod);
+  AddSourceLocation(SemaRef.ImplicitMSInheritanceAttrLoc, Record);
+  Stream.EmitRecord(POINTERS_TO_MEMBERS_PRAGMA_OPTIONS, Record);
+}
+
 void ASTWriter::WriteModuleFileExtension(Sema &SemaRef,
                                          ModuleFileExtensionWriter &Writer) {
   // Enter the extension block.
@@ -3941,13 +3981,13 @@
 // General Serialization Routines
 //===----------------------------------------------------------------------===//
 
-/// \brief Write a record containing the given attributes.
-void ASTWriter::WriteAttributes(ArrayRef<const Attr*> Attrs,
-                                RecordDataImpl &Record) {
+/// \brief Emit the list of attributes to the specified record.
+void ASTRecordWriter::AddAttributes(ArrayRef<const Attr *> Attrs) {
+  auto &Record = *this;
   Record.push_back(Attrs.size());
   for (const auto *A : Attrs) {
     Record.push_back(A->getKind()); // FIXME: stable encoding, target attrs
-    AddSourceRange(A->getRange(), Record);
+    Record.AddSourceRange(A->getRange());
 
 #include "clang/Serialization/AttrPCHWrite.inc"
 
@@ -4052,9 +4092,8 @@
       NextMacroID(FirstMacroID), FirstSubmoduleID(NUM_PREDEF_SUBMODULE_IDS),
       NextSubmoduleID(FirstSubmoduleID),
       FirstSelectorID(NUM_PREDEF_SELECTOR_IDS), NextSelectorID(FirstSelectorID),
-      CollectedStmts(&StmtsToEmit), NumStatements(0), NumMacros(0),
+      NumStatements(0), NumMacros(0),
       NumLexicalDeclContexts(0), NumVisibleDeclContexts(0),
-      NextCXXBaseSpecifiersID(1), NextCXXCtorInitializersID(1),
       TypeExtQualAbbrev(0), TypeFunctionProtoAbbrev(0), DeclParmVarAbbrev(0),
       DeclContextLexicalAbbrev(0), DeclContextVisibleLookupAbbrev(0),
       UpdateVisibleAbbrev(0), DeclRecordAbbrev(0), DeclTypedefAbbrev(0),
@@ -4162,6 +4201,8 @@
                      PREDEF_DECL_CF_CONSTANT_STRING_ID);
   RegisterPredefDecl(Context.CFConstantStringTagDecl,
                      PREDEF_DECL_CF_CONSTANT_STRING_TAG_ID);
+  RegisterPredefDecl(Context.TypePackElementDecl,
+                     PREDEF_DECL_TYPE_PACK_ELEMENT_ID);
 
   // Build a record containing all of the tentative definitions in this file, in
   // TentativeDefinitions order.  Generally, this record will be empty for
@@ -4468,8 +4509,6 @@
   WriteTypeDeclOffsets();
   if (!DeclUpdatesOffsetsRecord.empty())
     Stream.EmitRecord(DECL_UPDATE_OFFSETS, DeclUpdatesOffsetsRecord);
-  WriteCXXBaseSpecifiersOffsets();
-  WriteCXXCtorInitializersOffsets();
   WriteFileDeclIDsMap();
   WriteSourceManagerBlock(Context.getSourceManager(), PP);
   WriteComments();
@@ -4486,6 +4525,17 @@
   // If we're emitting a module, write out the submodule information.  
   if (WritingModule)
     WriteSubmodules(WritingModule);
+  else if (!getLangOpts().CurrentModule.empty()) {
+    // If we're building a PCH in the implementation of a module, we may need
+    // the description of the current module.
+    //
+    // FIXME: We may need other modules that we did not load from an AST file,
+    // such as if a module declares a 'conflicts' on a different module.
+    Module *M = PP.getHeaderSearchInfo().getModuleMap().findModule(
+        getLangOpts().CurrentModule);
+    if (M && !M->IsFromModuleFile)
+      WriteSubmodules(M);
+  }
 
   Stream.EmitRecord(SPECIAL_TYPES, SpecialTypes);
 
@@ -4590,10 +4640,12 @@
     }
   }
 
-  WriteDeclReplacementsBlock();
   WriteObjCCategories();
-  if(!WritingModule)
+  if(!WritingModule) {
     WriteOptimizePragmaOptions(SemaRef);
+    WriteMSStructPragmaOptions(SemaRef);
+    WriteMSPointersToMembersPragmaOptions(SemaRef);
+  }
 
   // Some simple statistics
   RecordData::value_type Record[] = {
@@ -4619,11 +4671,18 @@
     const Decl *D = DeclUpdate.first;
 
     bool HasUpdatedBody = false;
-    RecordData Record;
+    RecordData RecordData;
+    ASTRecordWriter Record(*this, RecordData);
     for (auto &Update : DeclUpdate.second) {
       DeclUpdateKind Kind = (DeclUpdateKind)Update.getKind();
 
-      Record.push_back(Kind);
+      // An updated body is emitted last, so that the reader doesn't need
+      // to skip over the lazy body to reach statements for other records.
+      if (Kind == UPD_CXX_ADDED_FUNCTION_DEFINITION)
+        HasUpdatedBody = true;
+      else
+        Record.push_back(Kind);
+
       switch (Kind) {
       case UPD_CXX_ADDED_IMPLICIT_MEMBER:
       case UPD_CXX_ADDED_TEMPLATE_SPECIALIZATION:
@@ -4633,26 +4692,27 @@
         break;
 
       case UPD_CXX_ADDED_FUNCTION_DEFINITION:
-        // An updated body is emitted last, so that the reader doesn't need
-        // to skip over the lazy body to reach statements for other records.
-        Record.pop_back();
-        HasUpdatedBody = true;
         break;
 
       case UPD_CXX_INSTANTIATED_STATIC_DATA_MEMBER:
-        AddSourceLocation(Update.getLoc(), Record);
+        Record.AddSourceLocation(Update.getLoc());
         break;
 
       case UPD_CXX_INSTANTIATED_DEFAULT_ARGUMENT:
-        AddStmt(const_cast<Expr*>(
-                  cast<ParmVarDecl>(Update.getDecl())->getDefaultArg()));
+        Record.AddStmt(const_cast<Expr *>(
+            cast<ParmVarDecl>(Update.getDecl())->getDefaultArg()));
+        break;
+
+      case UPD_CXX_INSTANTIATED_DEFAULT_MEMBER_INITIALIZER:
+        Record.AddStmt(
+            cast<FieldDecl>(Update.getDecl())->getInClassInitializer());
         break;
 
       case UPD_CXX_INSTANTIATED_CLASS_DEFINITION: {
         auto *RD = cast<CXXRecordDecl>(D);
         UpdatedDeclContexts.insert(RD->getPrimaryContext());
-        AddCXXDefinitionData(RD, Record);
-        Record.push_back(WriteDeclContextLexicalBlock(
+        Record.AddCXXDefinitionData(RD);
+        Record.AddOffset(WriteDeclContextLexicalBlock(
             *Context, const_cast<CXXRecordDecl *>(RD)));
 
         // This state is sometimes updated by template instantiation, when we
@@ -4660,11 +4720,11 @@
         // to it referring to the template definition.
         if (auto *MSInfo = RD->getMemberSpecializationInfo()) {
           Record.push_back(MSInfo->getTemplateSpecializationKind());
-          AddSourceLocation(MSInfo->getPointOfInstantiation(), Record);
+          Record.AddSourceLocation(MSInfo->getPointOfInstantiation());
         } else {
           auto *Spec = cast<ClassTemplateSpecializationDecl>(RD);
           Record.push_back(Spec->getTemplateSpecializationKind());
-          AddSourceLocation(Spec->getPointOfInstantiation(), Record);
+          Record.AddSourceLocation(Spec->getPointOfInstantiation());
 
           // The instantiation might have been resolved to a partial
           // specialization. If so, record which one.
@@ -4672,35 +4732,33 @@
           if (auto PartialSpec =
                 From.dyn_cast<ClassTemplatePartialSpecializationDecl*>()) {
             Record.push_back(true);
-            AddDeclRef(PartialSpec, Record);
-            AddTemplateArgumentList(&Spec->getTemplateInstantiationArgs(),
-                                    Record);
+            Record.AddDeclRef(PartialSpec);
+            Record.AddTemplateArgumentList(
+                &Spec->getTemplateInstantiationArgs());
           } else {
             Record.push_back(false);
           }
         }
         Record.push_back(RD->getTagKind());
-        AddSourceLocation(RD->getLocation(), Record);
-        AddSourceLocation(RD->getLocStart(), Record);
-        AddSourceRange(RD->getBraceRange(), Record);
+        Record.AddSourceLocation(RD->getLocation());
+        Record.AddSourceLocation(RD->getLocStart());
+        Record.AddSourceRange(RD->getBraceRange());
 
         // Instantiation may change attributes; write them all out afresh.
         Record.push_back(D->hasAttrs());
-        if (Record.back())
-          WriteAttributes(llvm::makeArrayRef(D->getAttrs().begin(),
-                                             D->getAttrs().size()), Record);
+        if (D->hasAttrs())
+          Record.AddAttributes(D->getAttrs());
 
         // FIXME: Ensure we don't get here for explicit instantiations.
         break;
       }
 
       case UPD_CXX_RESOLVED_DTOR_DELETE:
-        AddDeclRef(Update.getDecl(), Record);
+        Record.AddDeclRef(Update.getDecl());
         break;
 
       case UPD_CXX_RESOLVED_EXCEPTION_SPEC:
         addExceptionSpec(
-            *this,
             cast<FunctionDecl>(D)->getType()->castAs<FunctionProtoType>(),
             Record);
         break;
@@ -4718,8 +4776,13 @@
         break;
 
       case UPD_DECL_MARKED_OPENMP_THREADPRIVATE:
-        AddSourceRange(D->getAttr<OMPThreadPrivateDeclAttr>()->getRange(),
-                       Record);
+        Record.AddSourceRange(
+            D->getAttr<OMPThreadPrivateDeclAttr>()->getRange());
+        break;
+
+      case UPD_DECL_MARKED_OPENMP_DECLARETARGET:
+        Record.AddSourceRange(
+            D->getAttr<OMPDeclareTargetDeclAttr>()->getRange());
         break;
 
       case UPD_DECL_EXPORTED:
@@ -4727,7 +4790,7 @@
         break;
 
       case UPD_ADDED_ATTR_TO_RECORD:
-        WriteAttributes(llvm::makeArrayRef(Update.getAttr()), Record);
+        Record.AddAttributes(llvm::makeArrayRef(Update.getAttr()));
         break;
       }
     }
@@ -4736,34 +4799,18 @@
       const auto *Def = cast<FunctionDecl>(D);
       Record.push_back(UPD_CXX_ADDED_FUNCTION_DEFINITION);
       Record.push_back(Def->isInlined());
-      AddSourceLocation(Def->getInnerLocStart(), Record);
-      AddFunctionDefinition(Def, Record);
+      Record.AddSourceLocation(Def->getInnerLocStart());
+      Record.AddFunctionDefinition(Def);
     }
 
     OffsetsRecord.push_back(GetDeclRef(D));
-    OffsetsRecord.push_back(Stream.GetCurrentBitNo());
-
-    Stream.EmitRecord(DECL_UPDATES, Record);
-
-    FlushPendingAfterDecl();
+    OffsetsRecord.push_back(Record.Emit(DECL_UPDATES));
   }
 }
 
-void ASTWriter::WriteDeclReplacementsBlock() {
-  if (ReplacedDecls.empty())
-    return;
-
-  RecordData Record;
-  for (const auto &I : ReplacedDecls) {
-    Record.push_back(I.ID);
-    Record.push_back(I.Offset);
-    Record.push_back(I.Loc);
-  }
-  Stream.EmitRecord(DECL_REPLACEMENTS, Record);
-}
-
 void ASTWriter::AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record) {
-  Record.push_back(Loc.getRawEncoding());
+  uint32_t Raw = Loc.getRawEncoding();
+  Record.push_back((Raw << 1) | (Raw >> 31)); // Rotate left by one bit.
 }
 
 void ASTWriter::AddSourceRange(SourceRange Range, RecordDataImpl &Record) {
@@ -4771,19 +4818,19 @@
   AddSourceLocation(Range.getEnd(), Record);
 }
 
-void ASTWriter::AddAPInt(const llvm::APInt &Value, RecordDataImpl &Record) {
-  Record.push_back(Value.getBitWidth());
+void ASTRecordWriter::AddAPInt(const llvm::APInt &Value) {
+  Record->push_back(Value.getBitWidth());
   const uint64_t *Words = Value.getRawData();
-  Record.append(Words, Words + Value.getNumWords());
+  Record->append(Words, Words + Value.getNumWords());
 }
 
-void ASTWriter::AddAPSInt(const llvm::APSInt &Value, RecordDataImpl &Record) {
-  Record.push_back(Value.isUnsigned());
-  AddAPInt(Value, Record);
+void ASTRecordWriter::AddAPSInt(const llvm::APSInt &Value) {
+  Record->push_back(Value.isUnsigned());
+  AddAPInt(Value);
 }
 
-void ASTWriter::AddAPFloat(const llvm::APFloat &Value, RecordDataImpl &Record) {
-  AddAPInt(Value.bitcastToAPInt(), Record);
+void ASTRecordWriter::AddAPFloat(const llvm::APFloat &Value) {
+  AddAPInt(Value.bitcastToAPInt());
 }
 
 void ASTWriter::AddIdentifierRef(const IdentifierInfo *II, RecordDataImpl &Record) {
@@ -4828,8 +4875,8 @@
   return IdentMacroDirectivesOffsetMap.lookup(Name);
 }
 
-void ASTWriter::AddSelectorRef(const Selector SelRef, RecordDataImpl &Record) {
-  Record.push_back(getSelectorRef(SelRef));
+void ASTRecordWriter::AddSelectorRef(const Selector SelRef) {
+  Record->push_back(Writer->getSelectorRef(SelRef));
 }
 
 SelectorID ASTWriter::getSelectorRef(Selector Sel) {
@@ -4851,46 +4898,27 @@
   return SID;
 }
 
-void ASTWriter::AddCXXTemporary(const CXXTemporary *Temp, RecordDataImpl &Record) {
-  AddDeclRef(Temp->getDestructor(), Record);
+void ASTRecordWriter::AddCXXTemporary(const CXXTemporary *Temp) {
+  AddDeclRef(Temp->getDestructor());
 }
 
-void ASTWriter::AddCXXCtorInitializersRef(ArrayRef<CXXCtorInitializer *> Inits,
-                                          RecordDataImpl &Record) {
-  assert(!Inits.empty() && "Empty ctor initializer sets are not recorded");
-  CXXCtorInitializersToWrite.push_back(
-      QueuedCXXCtorInitializers(NextCXXCtorInitializersID, Inits));
-  Record.push_back(NextCXXCtorInitializersID++);
-}
-
-void ASTWriter::AddCXXBaseSpecifiersRef(CXXBaseSpecifier const *Bases,
-                                        CXXBaseSpecifier const *BasesEnd,
-                                        RecordDataImpl &Record) {
-  assert(Bases != BasesEnd && "Empty base-specifier sets are not recorded");
-  CXXBaseSpecifiersToWrite.push_back(
-                                QueuedCXXBaseSpecifiers(NextCXXBaseSpecifiersID,
-                                                        Bases, BasesEnd));
-  Record.push_back(NextCXXBaseSpecifiersID++);
-}
-
-void ASTWriter::AddTemplateArgumentLocInfo(TemplateArgument::ArgKind Kind,
-                                           const TemplateArgumentLocInfo &Arg,
-                                           RecordDataImpl &Record) {
+void ASTRecordWriter::AddTemplateArgumentLocInfo(
+    TemplateArgument::ArgKind Kind, const TemplateArgumentLocInfo &Arg) {
   switch (Kind) {
   case TemplateArgument::Expression:
     AddStmt(Arg.getAsExpr());
     break;
   case TemplateArgument::Type:
-    AddTypeSourceInfo(Arg.getAsTypeSourceInfo(), Record);
+    AddTypeSourceInfo(Arg.getAsTypeSourceInfo());
     break;
   case TemplateArgument::Template:
-    AddNestedNameSpecifierLoc(Arg.getTemplateQualifierLoc(), Record);
-    AddSourceLocation(Arg.getTemplateNameLoc(), Record);
+    AddNestedNameSpecifierLoc(Arg.getTemplateQualifierLoc());
+    AddSourceLocation(Arg.getTemplateNameLoc());
     break;
   case TemplateArgument::TemplateExpansion:
-    AddNestedNameSpecifierLoc(Arg.getTemplateQualifierLoc(), Record);
-    AddSourceLocation(Arg.getTemplateNameLoc(), Record);
-    AddSourceLocation(Arg.getTemplateEllipsisLoc(), Record);
+    AddNestedNameSpecifierLoc(Arg.getTemplateQualifierLoc());
+    AddSourceLocation(Arg.getTemplateNameLoc());
+    AddSourceLocation(Arg.getTemplateEllipsisLoc());
     break;
   case TemplateArgument::Null:
   case TemplateArgument::Integral:
@@ -4902,35 +4930,32 @@
   }
 }
 
-void ASTWriter::AddTemplateArgumentLoc(const TemplateArgumentLoc &Arg,
-                                       RecordDataImpl &Record) {
-  AddTemplateArgument(Arg.getArgument(), Record);
+void ASTRecordWriter::AddTemplateArgumentLoc(const TemplateArgumentLoc &Arg) {
+  AddTemplateArgument(Arg.getArgument());
 
   if (Arg.getArgument().getKind() == TemplateArgument::Expression) {
     bool InfoHasSameExpr
       = Arg.getArgument().getAsExpr() == Arg.getLocInfo().getAsExpr();
-    Record.push_back(InfoHasSameExpr);
+    Record->push_back(InfoHasSameExpr);
     if (InfoHasSameExpr)
       return; // Avoid storing the same expr twice.
   }
-  AddTemplateArgumentLocInfo(Arg.getArgument().getKind(), Arg.getLocInfo(),
-                             Record);
+  AddTemplateArgumentLocInfo(Arg.getArgument().getKind(), Arg.getLocInfo());
 }
 
-void ASTWriter::AddTypeSourceInfo(TypeSourceInfo *TInfo, 
-                                  RecordDataImpl &Record) {
+void ASTRecordWriter::AddTypeSourceInfo(TypeSourceInfo *TInfo) {
   if (!TInfo) {
-    AddTypeRef(QualType(), Record);
+    AddTypeRef(QualType());
     return;
   }
 
-  AddTypeLoc(TInfo->getTypeLoc(), Record);
+  AddTypeLoc(TInfo->getTypeLoc());
 }
 
-void ASTWriter::AddTypeLoc(TypeLoc TL, RecordDataImpl &Record) {
-  AddTypeRef(TL.getType(), Record);
+void ASTRecordWriter::AddTypeLoc(TypeLoc TL) {
+  AddTypeRef(TL.getType());
 
-  TypeLocWriter TLW(*this, Record);
+  TypeLocWriter TLW(*this);
   for (; !TL.isNull(); TL = TL.getNextTypeLoc())
     TLW.Visit(TL);
 }
@@ -5065,32 +5090,32 @@
   Decls.insert(I, LocDecl);
 }
 
-void ASTWriter::AddDeclarationName(DeclarationName Name, RecordDataImpl &Record) {
+void ASTRecordWriter::AddDeclarationName(DeclarationName Name) {
   // FIXME: Emit a stable enum for NameKind.  0 = Identifier etc.
-  Record.push_back(Name.getNameKind());
+  Record->push_back(Name.getNameKind());
   switch (Name.getNameKind()) {
   case DeclarationName::Identifier:
-    AddIdentifierRef(Name.getAsIdentifierInfo(), Record);
+    AddIdentifierRef(Name.getAsIdentifierInfo());
     break;
 
   case DeclarationName::ObjCZeroArgSelector:
   case DeclarationName::ObjCOneArgSelector:
   case DeclarationName::ObjCMultiArgSelector:
-    AddSelectorRef(Name.getObjCSelector(), Record);
+    AddSelectorRef(Name.getObjCSelector());
     break;
 
   case DeclarationName::CXXConstructorName:
   case DeclarationName::CXXDestructorName:
   case DeclarationName::CXXConversionFunctionName:
-    AddTypeRef(Name.getCXXNameType(), Record);
+    AddTypeRef(Name.getCXXNameType());
     break;
 
   case DeclarationName::CXXOperatorName:
-    Record.push_back(Name.getCXXOverloadedOperator());
+    Record->push_back(Name.getCXXOverloadedOperator());
     break;
 
   case DeclarationName::CXXLiteralOperatorName:
-    AddIdentifierRef(Name.getCXXLiteralIdentifier(), Record);
+    AddIdentifierRef(Name.getCXXLiteralIdentifier());
     break;
 
   case DeclarationName::CXXUsingDirective:
@@ -5120,28 +5145,25 @@
   return It->second;
 }
 
-void ASTWriter::AddDeclarationNameLoc(const DeclarationNameLoc &DNLoc,
-                                     DeclarationName Name, RecordDataImpl &Record) {
+void ASTRecordWriter::AddDeclarationNameLoc(const DeclarationNameLoc &DNLoc,
+                                            DeclarationName Name) {
   switch (Name.getNameKind()) {
   case DeclarationName::CXXConstructorName:
   case DeclarationName::CXXDestructorName:
   case DeclarationName::CXXConversionFunctionName:
-    AddTypeSourceInfo(DNLoc.NamedType.TInfo, Record);
+    AddTypeSourceInfo(DNLoc.NamedType.TInfo);
     break;
 
   case DeclarationName::CXXOperatorName:
+    AddSourceLocation(SourceLocation::getFromRawEncoding(
+        DNLoc.CXXOperatorName.BeginOpNameLoc));
     AddSourceLocation(
-       SourceLocation::getFromRawEncoding(DNLoc.CXXOperatorName.BeginOpNameLoc),
-       Record);
-    AddSourceLocation(
-        SourceLocation::getFromRawEncoding(DNLoc.CXXOperatorName.EndOpNameLoc),
-        Record);
+        SourceLocation::getFromRawEncoding(DNLoc.CXXOperatorName.EndOpNameLoc));
     break;
 
   case DeclarationName::CXXLiteralOperatorName:
-    AddSourceLocation(
-     SourceLocation::getFromRawEncoding(DNLoc.CXXLiteralOperatorName.OpNameLoc),
-     Record);
+    AddSourceLocation(SourceLocation::getFromRawEncoding(
+        DNLoc.CXXLiteralOperatorName.OpNameLoc));
     break;
 
   case DeclarationName::Identifier:
@@ -5153,23 +5175,21 @@
   }
 }
 
-void ASTWriter::AddDeclarationNameInfo(const DeclarationNameInfo &NameInfo,
-                                       RecordDataImpl &Record) {
-  AddDeclarationName(NameInfo.getName(), Record);
-  AddSourceLocation(NameInfo.getLoc(), Record);
-  AddDeclarationNameLoc(NameInfo.getInfo(), NameInfo.getName(), Record);
+void ASTRecordWriter::AddDeclarationNameInfo(
+    const DeclarationNameInfo &NameInfo) {
+  AddDeclarationName(NameInfo.getName());
+  AddSourceLocation(NameInfo.getLoc());
+  AddDeclarationNameLoc(NameInfo.getInfo(), NameInfo.getName());
 }
 
-void ASTWriter::AddQualifierInfo(const QualifierInfo &Info,
-                                 RecordDataImpl &Record) {
-  AddNestedNameSpecifierLoc(Info.QualifierLoc, Record);
-  Record.push_back(Info.NumTemplParamLists);
+void ASTRecordWriter::AddQualifierInfo(const QualifierInfo &Info) {
+  AddNestedNameSpecifierLoc(Info.QualifierLoc);
+  Record->push_back(Info.NumTemplParamLists);
   for (unsigned i=0, e=Info.NumTemplParamLists; i != e; ++i)
-    AddTemplateParameterList(Info.TemplParamLists[i], Record);
+    AddTemplateParameterList(Info.TemplParamLists[i]);
 }
 
-void ASTWriter::AddNestedNameSpecifier(NestedNameSpecifier *NNS,
-                                       RecordDataImpl &Record) {
+void ASTRecordWriter::AddNestedNameSpecifier(NestedNameSpecifier *NNS) {
   // Nested name specifiers usually aren't too long. I think that 8 would
   // typically accommodate the vast majority.
   SmallVector<NestedNameSpecifier *, 8> NestedNames;
@@ -5180,28 +5200,28 @@
     NNS = NNS->getPrefix();
   }
 
-  Record.push_back(NestedNames.size());
+  Record->push_back(NestedNames.size());
   while(!NestedNames.empty()) {
     NNS = NestedNames.pop_back_val();
     NestedNameSpecifier::SpecifierKind Kind = NNS->getKind();
-    Record.push_back(Kind);
+    Record->push_back(Kind);
     switch (Kind) {
     case NestedNameSpecifier::Identifier:
-      AddIdentifierRef(NNS->getAsIdentifier(), Record);
+      AddIdentifierRef(NNS->getAsIdentifier());
       break;
 
     case NestedNameSpecifier::Namespace:
-      AddDeclRef(NNS->getAsNamespace(), Record);
+      AddDeclRef(NNS->getAsNamespace());
       break;
 
     case NestedNameSpecifier::NamespaceAlias:
-      AddDeclRef(NNS->getAsNamespaceAlias(), Record);
+      AddDeclRef(NNS->getAsNamespaceAlias());
       break;
 
     case NestedNameSpecifier::TypeSpec:
     case NestedNameSpecifier::TypeSpecWithTemplate:
-      AddTypeRef(QualType(NNS->getAsType(), 0), Record);
-      Record.push_back(Kind == NestedNameSpecifier::TypeSpecWithTemplate);
+      AddTypeRef(QualType(NNS->getAsType(), 0));
+      Record->push_back(Kind == NestedNameSpecifier::TypeSpecWithTemplate);
       break;
 
     case NestedNameSpecifier::Global:
@@ -5209,14 +5229,13 @@
       break;
 
     case NestedNameSpecifier::Super:
-      AddDeclRef(NNS->getAsRecordDecl(), Record);
+      AddDeclRef(NNS->getAsRecordDecl());
       break;
     }
   }
 }
 
-void ASTWriter::AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS,
-                                          RecordDataImpl &Record) {
+void ASTRecordWriter::AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) {
   // Nested name specifiers usually aren't too long. I think that 8 would
   // typically accommodate the vast majority.
   SmallVector<NestedNameSpecifierLoc , 8> NestedNames;
@@ -5228,373 +5247,334 @@
     NNS = NNS.getPrefix();
   }
 
-  Record.push_back(NestedNames.size());
+  Record->push_back(NestedNames.size());
   while(!NestedNames.empty()) {
     NNS = NestedNames.pop_back_val();
     NestedNameSpecifier::SpecifierKind Kind
       = NNS.getNestedNameSpecifier()->getKind();
-    Record.push_back(Kind);
+    Record->push_back(Kind);
     switch (Kind) {
     case NestedNameSpecifier::Identifier:
-      AddIdentifierRef(NNS.getNestedNameSpecifier()->getAsIdentifier(), Record);
-      AddSourceRange(NNS.getLocalSourceRange(), Record);
+      AddIdentifierRef(NNS.getNestedNameSpecifier()->getAsIdentifier());
+      AddSourceRange(NNS.getLocalSourceRange());
       break;
 
     case NestedNameSpecifier::Namespace:
-      AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespace(), Record);
-      AddSourceRange(NNS.getLocalSourceRange(), Record);
+      AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespace());
+      AddSourceRange(NNS.getLocalSourceRange());
       break;
 
     case NestedNameSpecifier::NamespaceAlias:
-      AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespaceAlias(), Record);
-      AddSourceRange(NNS.getLocalSourceRange(), Record);
+      AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespaceAlias());
+      AddSourceRange(NNS.getLocalSourceRange());
       break;
 
     case NestedNameSpecifier::TypeSpec:
     case NestedNameSpecifier::TypeSpecWithTemplate:
-      Record.push_back(Kind == NestedNameSpecifier::TypeSpecWithTemplate);
-      AddTypeLoc(NNS.getTypeLoc(), Record);
-      AddSourceLocation(NNS.getLocalSourceRange().getEnd(), Record);
+      Record->push_back(Kind == NestedNameSpecifier::TypeSpecWithTemplate);
+      AddTypeLoc(NNS.getTypeLoc());
+      AddSourceLocation(NNS.getLocalSourceRange().getEnd());
       break;
 
     case NestedNameSpecifier::Global:
-      AddSourceLocation(NNS.getLocalSourceRange().getEnd(), Record);
+      AddSourceLocation(NNS.getLocalSourceRange().getEnd());
       break;
 
     case NestedNameSpecifier::Super:
-      AddDeclRef(NNS.getNestedNameSpecifier()->getAsRecordDecl(), Record);
-      AddSourceRange(NNS.getLocalSourceRange(), Record);
+      AddDeclRef(NNS.getNestedNameSpecifier()->getAsRecordDecl());
+      AddSourceRange(NNS.getLocalSourceRange());
       break;
     }
   }
 }
 
-void ASTWriter::AddTemplateName(TemplateName Name, RecordDataImpl &Record) {
+void ASTRecordWriter::AddTemplateName(TemplateName Name) {
   TemplateName::NameKind Kind = Name.getKind();
-  Record.push_back(Kind);
+  Record->push_back(Kind);
   switch (Kind) {
   case TemplateName::Template:
-    AddDeclRef(Name.getAsTemplateDecl(), Record);
+    AddDeclRef(Name.getAsTemplateDecl());
     break;
 
   case TemplateName::OverloadedTemplate: {
     OverloadedTemplateStorage *OvT = Name.getAsOverloadedTemplate();
-    Record.push_back(OvT->size());
+    Record->push_back(OvT->size());
     for (const auto &I : *OvT)
-      AddDeclRef(I, Record);
+      AddDeclRef(I);
     break;
   }
 
   case TemplateName::QualifiedTemplate: {
     QualifiedTemplateName *QualT = Name.getAsQualifiedTemplateName();
-    AddNestedNameSpecifier(QualT->getQualifier(), Record);
-    Record.push_back(QualT->hasTemplateKeyword());
-    AddDeclRef(QualT->getTemplateDecl(), Record);
+    AddNestedNameSpecifier(QualT->getQualifier());
+    Record->push_back(QualT->hasTemplateKeyword());
+    AddDeclRef(QualT->getTemplateDecl());
     break;
   }
 
   case TemplateName::DependentTemplate: {
     DependentTemplateName *DepT = Name.getAsDependentTemplateName();
-    AddNestedNameSpecifier(DepT->getQualifier(), Record);
-    Record.push_back(DepT->isIdentifier());
+    AddNestedNameSpecifier(DepT->getQualifier());
+    Record->push_back(DepT->isIdentifier());
     if (DepT->isIdentifier())
-      AddIdentifierRef(DepT->getIdentifier(), Record);
+      AddIdentifierRef(DepT->getIdentifier());
     else
-      Record.push_back(DepT->getOperator());
+      Record->push_back(DepT->getOperator());
     break;
   }
 
   case TemplateName::SubstTemplateTemplateParm: {
     SubstTemplateTemplateParmStorage *subst
       = Name.getAsSubstTemplateTemplateParm();
-    AddDeclRef(subst->getParameter(), Record);
-    AddTemplateName(subst->getReplacement(), Record);
+    AddDeclRef(subst->getParameter());
+    AddTemplateName(subst->getReplacement());
     break;
   }
       
   case TemplateName::SubstTemplateTemplateParmPack: {
     SubstTemplateTemplateParmPackStorage *SubstPack
       = Name.getAsSubstTemplateTemplateParmPack();
-    AddDeclRef(SubstPack->getParameterPack(), Record);
-    AddTemplateArgument(SubstPack->getArgumentPack(), Record);
+    AddDeclRef(SubstPack->getParameterPack());
+    AddTemplateArgument(SubstPack->getArgumentPack());
     break;
   }
   }
 }
 
-void ASTWriter::AddTemplateArgument(const TemplateArgument &Arg,
-                                    RecordDataImpl &Record) {
-  Record.push_back(Arg.getKind());
+void ASTRecordWriter::AddTemplateArgument(const TemplateArgument &Arg) {
+  Record->push_back(Arg.getKind());
   switch (Arg.getKind()) {
   case TemplateArgument::Null:
     break;
   case TemplateArgument::Type:
-    AddTypeRef(Arg.getAsType(), Record);
+    AddTypeRef(Arg.getAsType());
     break;
   case TemplateArgument::Declaration:
-    AddDeclRef(Arg.getAsDecl(), Record);
-    AddTypeRef(Arg.getParamTypeForDecl(), Record);
+    AddDeclRef(Arg.getAsDecl());
+    AddTypeRef(Arg.getParamTypeForDecl());
     break;
   case TemplateArgument::NullPtr:
-    AddTypeRef(Arg.getNullPtrType(), Record);
+    AddTypeRef(Arg.getNullPtrType());
     break;
   case TemplateArgument::Integral:
-    AddAPSInt(Arg.getAsIntegral(), Record);
-    AddTypeRef(Arg.getIntegralType(), Record);
+    AddAPSInt(Arg.getAsIntegral());
+    AddTypeRef(Arg.getIntegralType());
     break;
   case TemplateArgument::Template:
-    AddTemplateName(Arg.getAsTemplateOrTemplatePattern(), Record);
+    AddTemplateName(Arg.getAsTemplateOrTemplatePattern());
     break;
   case TemplateArgument::TemplateExpansion:
-    AddTemplateName(Arg.getAsTemplateOrTemplatePattern(), Record);
+    AddTemplateName(Arg.getAsTemplateOrTemplatePattern());
     if (Optional<unsigned> NumExpansions = Arg.getNumTemplateExpansions())
-      Record.push_back(*NumExpansions + 1);
+      Record->push_back(*NumExpansions + 1);
     else
-      Record.push_back(0);
+      Record->push_back(0);
     break;
   case TemplateArgument::Expression:
     AddStmt(Arg.getAsExpr());
     break;
   case TemplateArgument::Pack:
-    Record.push_back(Arg.pack_size());
+    Record->push_back(Arg.pack_size());
     for (const auto &P : Arg.pack_elements())
-      AddTemplateArgument(P, Record);
+      AddTemplateArgument(P);
     break;
   }
 }
 
-void
-ASTWriter::AddTemplateParameterList(const TemplateParameterList *TemplateParams,
-                                    RecordDataImpl &Record) {
+void ASTRecordWriter::AddTemplateParameterList(
+    const TemplateParameterList *TemplateParams) {
   assert(TemplateParams && "No TemplateParams!");
-  AddSourceLocation(TemplateParams->getTemplateLoc(), Record);
-  AddSourceLocation(TemplateParams->getLAngleLoc(), Record);
-  AddSourceLocation(TemplateParams->getRAngleLoc(), Record);
-  Record.push_back(TemplateParams->size());
+  AddSourceLocation(TemplateParams->getTemplateLoc());
+  AddSourceLocation(TemplateParams->getLAngleLoc());
+  AddSourceLocation(TemplateParams->getRAngleLoc());
+  // TODO: Concepts
+  Record->push_back(TemplateParams->size());
   for (const auto &P : *TemplateParams)
-    AddDeclRef(P, Record);
+    AddDeclRef(P);
 }
 
 /// \brief Emit a template argument list.
-void
-ASTWriter::AddTemplateArgumentList(const TemplateArgumentList *TemplateArgs,
-                                   RecordDataImpl &Record) {
+void ASTRecordWriter::AddTemplateArgumentList(
+    const TemplateArgumentList *TemplateArgs) {
   assert(TemplateArgs && "No TemplateArgs!");
-  Record.push_back(TemplateArgs->size());
+  Record->push_back(TemplateArgs->size());
   for (int i=0, e = TemplateArgs->size(); i != e; ++i)
-    AddTemplateArgument(TemplateArgs->get(i), Record);
+    AddTemplateArgument(TemplateArgs->get(i));
 }
 
-void
-ASTWriter::AddASTTemplateArgumentListInfo
-(const ASTTemplateArgumentListInfo *ASTTemplArgList, RecordDataImpl &Record) {
+void ASTRecordWriter::AddASTTemplateArgumentListInfo(
+    const ASTTemplateArgumentListInfo *ASTTemplArgList) {
   assert(ASTTemplArgList && "No ASTTemplArgList!");
-  AddSourceLocation(ASTTemplArgList->LAngleLoc, Record);
-  AddSourceLocation(ASTTemplArgList->RAngleLoc, Record);
-  Record.push_back(ASTTemplArgList->NumTemplateArgs);
+  AddSourceLocation(ASTTemplArgList->LAngleLoc);
+  AddSourceLocation(ASTTemplArgList->RAngleLoc);
+  Record->push_back(ASTTemplArgList->NumTemplateArgs);
   const TemplateArgumentLoc *TemplArgs = ASTTemplArgList->getTemplateArgs();
   for (int i=0, e = ASTTemplArgList->NumTemplateArgs; i != e; ++i)
-    AddTemplateArgumentLoc(TemplArgs[i], Record);
+    AddTemplateArgumentLoc(TemplArgs[i]);
 }
 
-void
-ASTWriter::AddUnresolvedSet(const ASTUnresolvedSet &Set, RecordDataImpl &Record) {
-  Record.push_back(Set.size());
+void ASTRecordWriter::AddUnresolvedSet(const ASTUnresolvedSet &Set) {
+  Record->push_back(Set.size());
   for (ASTUnresolvedSet::const_iterator
          I = Set.begin(), E = Set.end(); I != E; ++I) {
-    AddDeclRef(I.getDecl(), Record);
-    Record.push_back(I.getAccess());
+    AddDeclRef(I.getDecl());
+    Record->push_back(I.getAccess());
   }
 }
 
-void ASTWriter::AddCXXBaseSpecifier(const CXXBaseSpecifier &Base,
-                                    RecordDataImpl &Record) {
-  Record.push_back(Base.isVirtual());
-  Record.push_back(Base.isBaseOfClass());
-  Record.push_back(Base.getAccessSpecifierAsWritten());
-  Record.push_back(Base.getInheritConstructors());
-  AddTypeSourceInfo(Base.getTypeSourceInfo(), Record);
-  AddSourceRange(Base.getSourceRange(), Record);
+// FIXME: Move this out of the main ASTRecordWriter interface.
+void ASTRecordWriter::AddCXXBaseSpecifier(const CXXBaseSpecifier &Base) {
+  Record->push_back(Base.isVirtual());
+  Record->push_back(Base.isBaseOfClass());
+  Record->push_back(Base.getAccessSpecifierAsWritten());
+  Record->push_back(Base.getInheritConstructors());
+  AddTypeSourceInfo(Base.getTypeSourceInfo());
+  AddSourceRange(Base.getSourceRange());
   AddSourceLocation(Base.isPackExpansion()? Base.getEllipsisLoc() 
-                                          : SourceLocation(),
-                    Record);
+                                          : SourceLocation());
 }
 
-void ASTWriter::FlushCXXBaseSpecifiers() {
-  RecordData Record;
-  unsigned N = CXXBaseSpecifiersToWrite.size();
-  for (unsigned I = 0; I != N; ++I) {
-    Record.clear();
-    
-    // Record the offset of this base-specifier set.
-    unsigned Index = CXXBaseSpecifiersToWrite[I].ID - 1;
-    if (Index == CXXBaseSpecifiersOffsets.size())
-      CXXBaseSpecifiersOffsets.push_back(Stream.GetCurrentBitNo());
-    else {
-      if (Index > CXXBaseSpecifiersOffsets.size())
-        CXXBaseSpecifiersOffsets.resize(Index + 1);
-      CXXBaseSpecifiersOffsets[Index] = Stream.GetCurrentBitNo();
-    }
+static uint64_t EmitCXXBaseSpecifiers(ASTWriter &W,
+                                      ArrayRef<CXXBaseSpecifier> Bases) {
+  ASTWriter::RecordData Record;
+  ASTRecordWriter Writer(W, Record);
+  Writer.push_back(Bases.size());
 
-    const CXXBaseSpecifier *B = CXXBaseSpecifiersToWrite[I].Bases,
-                        *BEnd = CXXBaseSpecifiersToWrite[I].BasesEnd;
-    Record.push_back(BEnd - B);
-    for (; B != BEnd; ++B)
-      AddCXXBaseSpecifier(*B, Record);
-    Stream.EmitRecord(serialization::DECL_CXX_BASE_SPECIFIERS, Record);
-    
-    // Flush any expressions that were written as part of the base specifiers.
-    FlushStmts();
-  }
+  for (auto &Base : Bases)
+    Writer.AddCXXBaseSpecifier(Base);
 
-  assert(N == CXXBaseSpecifiersToWrite.size() &&
-         "added more base specifiers while writing base specifiers");
-  CXXBaseSpecifiersToWrite.clear();
+  return Writer.Emit(serialization::DECL_CXX_BASE_SPECIFIERS);
 }
 
-void ASTWriter::AddCXXCtorInitializers(
-                             const CXXCtorInitializer * const *CtorInitializers,
-                             unsigned NumCtorInitializers,
-                             RecordDataImpl &Record) {
-  Record.push_back(NumCtorInitializers);
-  for (unsigned i=0; i != NumCtorInitializers; ++i) {
-    const CXXCtorInitializer *Init = CtorInitializers[i];
+// FIXME: Move this out of the main ASTRecordWriter interface.
+void ASTRecordWriter::AddCXXBaseSpecifiers(ArrayRef<CXXBaseSpecifier> Bases) {
+  AddOffset(EmitCXXBaseSpecifiers(*Writer, Bases));
+}
 
+static uint64_t
+EmitCXXCtorInitializers(ASTWriter &W,
+                        ArrayRef<CXXCtorInitializer *> CtorInits) {
+  ASTWriter::RecordData Record;
+  ASTRecordWriter Writer(W, Record);
+  Writer.push_back(CtorInits.size());
+
+  for (auto *Init : CtorInits) {
     if (Init->isBaseInitializer()) {
-      Record.push_back(CTOR_INITIALIZER_BASE);
-      AddTypeSourceInfo(Init->getTypeSourceInfo(), Record);
-      Record.push_back(Init->isBaseVirtual());
+      Writer.push_back(CTOR_INITIALIZER_BASE);
+      Writer.AddTypeSourceInfo(Init->getTypeSourceInfo());
+      Writer.push_back(Init->isBaseVirtual());
     } else if (Init->isDelegatingInitializer()) {
-      Record.push_back(CTOR_INITIALIZER_DELEGATING);
-      AddTypeSourceInfo(Init->getTypeSourceInfo(), Record);
+      Writer.push_back(CTOR_INITIALIZER_DELEGATING);
+      Writer.AddTypeSourceInfo(Init->getTypeSourceInfo());
     } else if (Init->isMemberInitializer()){
-      Record.push_back(CTOR_INITIALIZER_MEMBER);
-      AddDeclRef(Init->getMember(), Record);
+      Writer.push_back(CTOR_INITIALIZER_MEMBER);
+      Writer.AddDeclRef(Init->getMember());
     } else {
-      Record.push_back(CTOR_INITIALIZER_INDIRECT_MEMBER);
-      AddDeclRef(Init->getIndirectMember(), Record);
+      Writer.push_back(CTOR_INITIALIZER_INDIRECT_MEMBER);
+      Writer.AddDeclRef(Init->getIndirectMember());
     }
 
-    AddSourceLocation(Init->getMemberLocation(), Record);
-    AddStmt(Init->getInit());
-    AddSourceLocation(Init->getLParenLoc(), Record);
-    AddSourceLocation(Init->getRParenLoc(), Record);
-    Record.push_back(Init->isWritten());
+    Writer.AddSourceLocation(Init->getMemberLocation());
+    Writer.AddStmt(Init->getInit());
+    Writer.AddSourceLocation(Init->getLParenLoc());
+    Writer.AddSourceLocation(Init->getRParenLoc());
+    Writer.push_back(Init->isWritten());
     if (Init->isWritten()) {
-      Record.push_back(Init->getSourceOrder());
+      Writer.push_back(Init->getSourceOrder());
     } else {
-      Record.push_back(Init->getNumArrayIndices());
-      for (unsigned i=0, e=Init->getNumArrayIndices(); i != e; ++i)
-        AddDeclRef(Init->getArrayIndex(i), Record);
+      Writer.push_back(Init->getNumArrayIndices());
+      for (auto *VD : Init->getArrayIndices())
+        Writer.AddDeclRef(VD);
     }
   }
+
+  return Writer.Emit(serialization::DECL_CXX_CTOR_INITIALIZERS);
 }
 
-void ASTWriter::FlushCXXCtorInitializers() {
-  RecordData Record;
-
-  unsigned N = CXXCtorInitializersToWrite.size();
-  (void)N; // Silence unused warning in non-assert builds.
-  for (auto &Init : CXXCtorInitializersToWrite) {
-    Record.clear();
-
-    // Record the offset of this mem-initializer list.
-    unsigned Index = Init.ID - 1;
-    if (Index == CXXCtorInitializersOffsets.size())
-      CXXCtorInitializersOffsets.push_back(Stream.GetCurrentBitNo());
-    else {
-      if (Index > CXXCtorInitializersOffsets.size())
-        CXXCtorInitializersOffsets.resize(Index + 1);
-      CXXCtorInitializersOffsets[Index] = Stream.GetCurrentBitNo();
-    }
-
-    AddCXXCtorInitializers(Init.Inits.data(), Init.Inits.size(), Record);
-    Stream.EmitRecord(serialization::DECL_CXX_CTOR_INITIALIZERS, Record);
-
-    // Flush any expressions that were written as part of the initializers.
-    FlushStmts();
-  }
-
-  assert(N == CXXCtorInitializersToWrite.size() &&
-         "added more ctor initializers while writing ctor initializers");
-  CXXCtorInitializersToWrite.clear();
+// FIXME: Move this out of the main ASTRecordWriter interface.
+void ASTRecordWriter::AddCXXCtorInitializers(
+    ArrayRef<CXXCtorInitializer *> CtorInits) {
+  AddOffset(EmitCXXCtorInitializers(*Writer, CtorInits));
 }
 
-void ASTWriter::AddCXXDefinitionData(const CXXRecordDecl *D, RecordDataImpl &Record) {
+void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
   auto &Data = D->data();
-  Record.push_back(Data.IsLambda);
-  Record.push_back(Data.UserDeclaredConstructor);
-  Record.push_back(Data.UserDeclaredSpecialMembers);
-  Record.push_back(Data.Aggregate);
-  Record.push_back(Data.PlainOldData);
-  Record.push_back(Data.Empty);
-  Record.push_back(Data.Polymorphic);
-  Record.push_back(Data.Abstract);
-  Record.push_back(Data.IsStandardLayout);
-  Record.push_back(Data.HasNoNonEmptyBases);
-  Record.push_back(Data.HasPrivateFields);
-  Record.push_back(Data.HasProtectedFields);
-  Record.push_back(Data.HasPublicFields);
-  Record.push_back(Data.HasMutableFields);
-  Record.push_back(Data.HasVariantMembers);
-  Record.push_back(Data.HasOnlyCMembers);
-  Record.push_back(Data.HasInClassInitializer);
-  Record.push_back(Data.HasUninitializedReferenceMember);
-  Record.push_back(Data.NeedOverloadResolutionForMoveConstructor);
-  Record.push_back(Data.NeedOverloadResolutionForMoveAssignment);
-  Record.push_back(Data.NeedOverloadResolutionForDestructor);
-  Record.push_back(Data.DefaultedMoveConstructorIsDeleted);
-  Record.push_back(Data.DefaultedMoveAssignmentIsDeleted);
-  Record.push_back(Data.DefaultedDestructorIsDeleted);
-  Record.push_back(Data.HasTrivialSpecialMembers);
-  Record.push_back(Data.DeclaredNonTrivialSpecialMembers);
-  Record.push_back(Data.HasIrrelevantDestructor);
-  Record.push_back(Data.HasConstexprNonCopyMoveConstructor);
-  Record.push_back(Data.DefaultedDefaultConstructorIsConstexpr);
-  Record.push_back(Data.HasConstexprDefaultConstructor);
-  Record.push_back(Data.HasNonLiteralTypeFieldsOrBases);
-  Record.push_back(Data.ComputedVisibleConversions);
-  Record.push_back(Data.UserProvidedDefaultConstructor);
-  Record.push_back(Data.DeclaredSpecialMembers);
-  Record.push_back(Data.ImplicitCopyConstructorHasConstParam);
-  Record.push_back(Data.ImplicitCopyAssignmentHasConstParam);
-  Record.push_back(Data.HasDeclaredCopyConstructorWithConstParam);
-  Record.push_back(Data.HasDeclaredCopyAssignmentWithConstParam);
+  Record->push_back(Data.IsLambda);
+  Record->push_back(Data.UserDeclaredConstructor);
+  Record->push_back(Data.UserDeclaredSpecialMembers);
+  Record->push_back(Data.Aggregate);
+  Record->push_back(Data.PlainOldData);
+  Record->push_back(Data.Empty);
+  Record->push_back(Data.Polymorphic);
+  Record->push_back(Data.Abstract);
+  Record->push_back(Data.IsStandardLayout);
+  Record->push_back(Data.HasNoNonEmptyBases);
+  Record->push_back(Data.HasPrivateFields);
+  Record->push_back(Data.HasProtectedFields);
+  Record->push_back(Data.HasPublicFields);
+  Record->push_back(Data.HasMutableFields);
+  Record->push_back(Data.HasVariantMembers);
+  Record->push_back(Data.HasOnlyCMembers);
+  Record->push_back(Data.HasInClassInitializer);
+  Record->push_back(Data.HasUninitializedReferenceMember);
+  Record->push_back(Data.HasUninitializedFields);
+  Record->push_back(Data.HasInheritedConstructor);
+  Record->push_back(Data.HasInheritedAssignment);
+  Record->push_back(Data.NeedOverloadResolutionForMoveConstructor);
+  Record->push_back(Data.NeedOverloadResolutionForMoveAssignment);
+  Record->push_back(Data.NeedOverloadResolutionForDestructor);
+  Record->push_back(Data.DefaultedMoveConstructorIsDeleted);
+  Record->push_back(Data.DefaultedMoveAssignmentIsDeleted);
+  Record->push_back(Data.DefaultedDestructorIsDeleted);
+  Record->push_back(Data.HasTrivialSpecialMembers);
+  Record->push_back(Data.DeclaredNonTrivialSpecialMembers);
+  Record->push_back(Data.HasIrrelevantDestructor);
+  Record->push_back(Data.HasConstexprNonCopyMoveConstructor);
+  Record->push_back(Data.HasDefaultedDefaultConstructor);
+  Record->push_back(Data.DefaultedDefaultConstructorIsConstexpr);
+  Record->push_back(Data.HasConstexprDefaultConstructor);
+  Record->push_back(Data.HasNonLiteralTypeFieldsOrBases);
+  Record->push_back(Data.ComputedVisibleConversions);
+  Record->push_back(Data.UserProvidedDefaultConstructor);
+  Record->push_back(Data.DeclaredSpecialMembers);
+  Record->push_back(Data.ImplicitCopyConstructorHasConstParam);
+  Record->push_back(Data.ImplicitCopyAssignmentHasConstParam);
+  Record->push_back(Data.HasDeclaredCopyConstructorWithConstParam);
+  Record->push_back(Data.HasDeclaredCopyAssignmentWithConstParam);
   // IsLambda bit is already saved.
 
-  Record.push_back(Data.NumBases);
+  Record->push_back(Data.NumBases);
   if (Data.NumBases > 0)
-    AddCXXBaseSpecifiersRef(Data.getBases(), Data.getBases() + Data.NumBases, 
-                            Record);
-  
-  // FIXME: Make VBases lazily computed when needed to avoid storing them.
-  Record.push_back(Data.NumVBases);
-  if (Data.NumVBases > 0)
-    AddCXXBaseSpecifiersRef(Data.getVBases(), Data.getVBases() + Data.NumVBases, 
-                            Record);
+    AddCXXBaseSpecifiers(Data.bases());
 
-  AddUnresolvedSet(Data.Conversions.get(*Context), Record);
-  AddUnresolvedSet(Data.VisibleConversions.get(*Context), Record);
+  // FIXME: Make VBases lazily computed when needed to avoid storing them.
+  Record->push_back(Data.NumVBases);
+  if (Data.NumVBases > 0)
+    AddCXXBaseSpecifiers(Data.vbases());
+
+  AddUnresolvedSet(Data.Conversions.get(*Writer->Context));
+  AddUnresolvedSet(Data.VisibleConversions.get(*Writer->Context));
   // Data.Definition is the owning decl, no need to write it. 
-  AddDeclRef(D->getFirstFriend(), Record);
+  AddDeclRef(D->getFirstFriend());
   
   // Add lambda-specific data.
   if (Data.IsLambda) {
     auto &Lambda = D->getLambdaData();
-    Record.push_back(Lambda.Dependent);
-    Record.push_back(Lambda.IsGenericLambda);
-    Record.push_back(Lambda.CaptureDefault);
-    Record.push_back(Lambda.NumCaptures);
-    Record.push_back(Lambda.NumExplicitCaptures);
-    Record.push_back(Lambda.ManglingNumber);
-    AddDeclRef(Lambda.ContextDecl, Record);
-    AddTypeSourceInfo(Lambda.MethodTyInfo, Record);
+    Record->push_back(Lambda.Dependent);
+    Record->push_back(Lambda.IsGenericLambda);
+    Record->push_back(Lambda.CaptureDefault);
+    Record->push_back(Lambda.NumCaptures);
+    Record->push_back(Lambda.NumExplicitCaptures);
+    Record->push_back(Lambda.ManglingNumber);
+    AddDeclRef(Lambda.ContextDecl);
+    AddTypeSourceInfo(Lambda.MethodTyInfo);
     for (unsigned I = 0, N = Lambda.NumCaptures; I != N; ++I) {
       const LambdaCapture &Capture = Lambda.Captures[I];
-      AddSourceLocation(Capture.getLocation(), Record);
-      Record.push_back(Capture.isImplicit());
-      Record.push_back(Capture.getCaptureKind());
+      AddSourceLocation(Capture.getLocation());
+      Record->push_back(Capture.isImplicit());
+      Record->push_back(Capture.getCaptureKind());
       switch (Capture.getCaptureKind()) {
+      case LCK_StarThis:
       case LCK_This:
       case LCK_VLAType:
         break;
@@ -5602,10 +5582,9 @@
       case LCK_ByRef:
         VarDecl *Var =
             Capture.capturesVariable() ? Capture.getCapturedVar() : nullptr;
-        AddDeclRef(Var, Record);
+        AddDeclRef(Var);
         AddSourceLocation(Capture.isPackExpansion() ? Capture.getEllipsisLoc()
-                                                    : SourceLocation(),
-                          Record);
+                                                    : SourceLocation());
         break;
       }
     }
@@ -5685,6 +5664,7 @@
 }
 
 void ASTWriter::CompletedTagDefinition(const TagDecl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(D->isCompleteDefinition());
   assert(!WritingAST && "Already writing the AST!");
   if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
@@ -5705,17 +5685,14 @@
   if (D->isFromASTFile())
     return true;
 
-  // If we've not loaded any modules, this can't be imported.
-  if (!Chain || !Chain->getModuleManager().size())
-    return false;
-
   // The predefined __va_list_tag struct is imported if we imported any decls.
   // FIXME: This is a gross hack.
   return D == D->getASTContext().getVaListTagDecl();
 }
 
 void ASTWriter::AddedVisibleDecl(const DeclContext *DC, const Decl *D) {
-   assert(DC->isLookupContext() &&
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
+  assert(DC->isLookupContext() &&
           "Should not add lookup results to non-lookup contexts!");
 
   // TU is handled elsewhere.
@@ -5749,6 +5726,7 @@
 }
 
 void ASTWriter::AddedCXXImplicitMember(const CXXRecordDecl *RD, const Decl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(D->isImplicit());
 
   // We're only interested in cases where a local declaration is added to an
@@ -5766,6 +5744,7 @@
 }
 
 void ASTWriter::ResolvedExceptionSpec(const FunctionDecl *FD) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!DoneWritingDeclsAndTypes && "Already done writing updates!");
   if (!Chain) return;
   Chain->forEachImportedKeyDecl(FD, [&](const Decl *D) {
@@ -5780,6 +5759,7 @@
 }
 
 void ASTWriter::DeducedReturnType(const FunctionDecl *FD, QualType ReturnType) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!Chain) return;
   Chain->forEachImportedKeyDecl(FD, [&](const Decl *D) {
@@ -5790,6 +5770,7 @@
 
 void ASTWriter::ResolvedOperatorDelete(const CXXDestructorDecl *DD,
                                        const FunctionDecl *Delete) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   assert(Delete && "Not given an operator delete");
   if (!Chain) return;
@@ -5799,6 +5780,7 @@
 }
 
 void ASTWriter::CompletedImplicitDefinition(const FunctionDecl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!D->isFromASTFile())
     return; // Declaration not imported from PCH.
@@ -5808,6 +5790,7 @@
 }
 
 void ASTWriter::FunctionDefinitionInstantiated(const FunctionDecl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!D->isFromASTFile())
     return;
@@ -5816,6 +5799,7 @@
 }
 
 void ASTWriter::StaticDataMemberInstantiated(const VarDecl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!D->isFromASTFile())
     return;
@@ -5828,6 +5812,7 @@
 }
 
 void ASTWriter::DefaultArgumentInstantiated(const ParmVarDecl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!D->isFromASTFile())
     return;
@@ -5836,8 +5821,18 @@
       DeclUpdate(UPD_CXX_INSTANTIATED_DEFAULT_ARGUMENT, D));
 }
 
+void ASTWriter::DefaultMemberInitializerInstantiated(const FieldDecl *D) {
+  assert(!WritingAST && "Already writing the AST!");
+  if (!D->isFromASTFile())
+    return;
+
+  DeclUpdates[D].push_back(
+      DeclUpdate(UPD_CXX_INSTANTIATED_DEFAULT_MEMBER_INITIALIZER, D));
+}
+
 void ASTWriter::AddedObjCCategoryToInterface(const ObjCCategoryDecl *CatD,
                                              const ObjCInterfaceDecl *IFD) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!IFD->isFromASTFile())
     return; // Declaration not imported from PCH.
@@ -5848,14 +5843,21 @@
 }
 
 void ASTWriter::DeclarationMarkedUsed(const Decl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
-  if (!D->isFromASTFile())
-    return;
+
+  // If there is *any* declaration of the entity that's not from an AST file,
+  // we can skip writing the update record. We make sure that isUsed() triggers
+  // completion of the redeclaration chain of the entity.
+  for (auto Prev = D->getMostRecentDecl(); Prev; Prev = Prev->getPreviousDecl())
+    if (IsLocalDecl(Prev))
+      return;
 
   DeclUpdates[D].push_back(DeclUpdate(UPD_DECL_MARKED_USED));
 }
 
 void ASTWriter::DeclarationMarkedOpenMPThreadPrivate(const Decl *D) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!D->isFromASTFile())
     return;
@@ -5863,7 +5865,19 @@
   DeclUpdates[D].push_back(DeclUpdate(UPD_DECL_MARKED_OPENMP_THREADPRIVATE));
 }
 
+void ASTWriter::DeclarationMarkedOpenMPDeclareTarget(const Decl *D,
+                                                     const Attr *Attr) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
+  assert(!WritingAST && "Already writing the AST!");
+  if (!D->isFromASTFile())
+    return;
+
+  DeclUpdates[D].push_back(
+      DeclUpdate(UPD_DECL_MARKED_OPENMP_DECLARETARGET, Attr));
+}
+
 void ASTWriter::RedefinedHiddenDefinition(const NamedDecl *D, Module *M) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   assert(D->isHidden() && "expected a hidden declaration");
   DeclUpdates[D].push_back(DeclUpdate(UPD_DECL_EXPORTED, M));
@@ -5871,6 +5885,7 @@
 
 void ASTWriter::AddedAttributeToRecord(const Attr *Attr,
                                        const RecordDecl *Record) {
+  if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
   if (!Record->isFromASTFile())
     return;
diff --git a/lib/Serialization/ASTWriterDecl.cpp b/lib/Serialization/ASTWriterDecl.cpp
index 49893dc..158123b 100644
--- a/lib/Serialization/ASTWriterDecl.cpp
+++ b/lib/Serialization/ASTWriterDecl.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Serialization/ASTWriter.h"
 #include "ASTCommon.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclContextInternals.h"
@@ -20,7 +19,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Serialization/ASTReader.h"
-#include "llvm/ADT/Twine.h"
+#include "clang/Serialization/ASTWriter.h"
 #include "llvm/Bitcode/BitstreamWriter.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace clang;
@@ -32,23 +31,31 @@
 
 namespace clang {
   class ASTDeclWriter : public DeclVisitor<ASTDeclWriter, void> {
-
     ASTWriter &Writer;
     ASTContext &Context;
-    typedef ASTWriter::RecordData RecordData;
-    RecordData &Record;
+    ASTRecordWriter Record;
 
-  public:
     serialization::DeclCode Code;
     unsigned AbbrevToUse;
 
-    ASTDeclWriter(ASTWriter &Writer, ASTContext &Context, RecordData &Record)
-      : Writer(Writer), Context(Context), Record(Record) {
+  public:
+    ASTDeclWriter(ASTWriter &Writer, ASTContext &Context,
+                  ASTWriter::RecordDataImpl &Record)
+        : Writer(Writer), Context(Context), Record(Writer, Record),
+          Code((serialization::DeclCode)0), AbbrevToUse(0) {}
+
+    uint64_t Emit(Decl *D) {
+      if (!Code)
+        llvm::report_fatal_error(StringRef("unexpected declaration kind '") +
+            D->getDeclKindName() + "'");
+      return Record.Emit(Code, AbbrevToUse);
     }
 
     void Visit(Decl *D);
 
     void VisitDecl(Decl *D);
+    void VisitPragmaCommentDecl(PragmaCommentDecl *D);
+    void VisitPragmaDetectMismatchDecl(PragmaDetectMismatchDecl *D);
     void VisitTranslationUnitDecl(TranslationUnitDecl *D);
     void VisitNamedDecl(NamedDecl *D);
     void VisitLabelDecl(LabelDecl *LD);
@@ -89,6 +96,8 @@
     void VisitVarDecl(VarDecl *D);
     void VisitImplicitParamDecl(ImplicitParamDecl *D);
     void VisitParmVarDecl(ParmVarDecl *D);
+    void VisitDecompositionDecl(DecompositionDecl *D);
+    void VisitBindingDecl(BindingDecl *D);
     void VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D);
     void VisitTemplateDecl(TemplateDecl *D);
     void VisitRedeclarableTemplateDecl(RedeclarableTemplateDecl *D);
@@ -99,6 +108,7 @@
     void VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *D);
     void VisitUsingDecl(UsingDecl *D);
     void VisitUsingShadowDecl(UsingShadowDecl *D);
+    void VisitConstructorUsingShadowDecl(ConstructorUsingShadowDecl *D);
     void VisitLinkageSpecDecl(LinkageSpecDecl *D);
     void VisitFileScopeAsmDecl(FileScopeAsmDecl *D);
     void VisitImportDecl(ImportDecl *D);
@@ -110,8 +120,7 @@
     void VisitCapturedDecl(CapturedDecl *D);
     void VisitEmptyDecl(EmptyDecl *D);
 
-    void VisitDeclContext(DeclContext *DC, uint64_t LexicalOffset,
-                          uint64_t VisibleOffset);
+    void VisitDeclContext(DeclContext *DC);
     template <typename T> void VisitRedeclarable(Redeclarable<T> *D);
 
 
@@ -131,6 +140,8 @@
     void VisitObjCPropertyDecl(ObjCPropertyDecl *D);
     void VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D);
     void VisitOMPThreadPrivateDecl(OMPThreadPrivateDecl *D);
+    void VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D);
+    void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
 
     /// Add an Objective-C type parameter list to the given record.
     void AddObjCTypeParamList(ObjCTypeParamList *typeParams) {
@@ -142,21 +153,10 @@
 
       Record.push_back(typeParams->size());
       for (auto typeParam : *typeParams) {
-        Writer.AddDeclRef(typeParam, Record);
+        Record.AddDeclRef(typeParam);
       }
-      Writer.AddSourceLocation(typeParams->getLAngleLoc(), Record);
-      Writer.AddSourceLocation(typeParams->getRAngleLoc(), Record);
-    }
-
-    void AddFunctionDefinition(const FunctionDecl *FD) {
-      assert(FD->doesThisDeclarationHaveABody());
-      if (auto *CD = dyn_cast<CXXConstructorDecl>(FD)) {
-        Record.push_back(CD->NumCtorInitializers);
-        if (CD->NumCtorInitializers)
-          Writer.AddCXXCtorInitializersRef(
-              llvm::makeArrayRef(CD->init_begin(), CD->init_end()), Record);
-      }
-      Writer.AddStmt(FD->getBody());
+      Record.AddSourceLocation(typeParams->getLAngleLoc());
+      Record.AddSourceLocation(typeParams->getRAngleLoc());
     }
 
     /// Add to the record the first declaration from each module file that
@@ -172,7 +172,7 @@
           Firsts[nullptr] = R;
       }
       for (const auto &F : Firsts)
-        Writer.AddDeclRef(F.second, Record);
+        Record.AddDeclRef(F.second);
     }
 
     /// Get the specialization decl from an entry in the specialization list.
@@ -261,7 +261,7 @@
   // abbreviation infrastructure requires that arrays are encoded last, so
   // we handle it here in the case of those classes derived from DeclaratorDecl
   if (DeclaratorDecl *DD = dyn_cast<DeclaratorDecl>(D)) {
-    Writer.AddTypeSourceInfo(DD->getTypeSourceInfo(), Record);
+    Record.AddTypeSourceInfo(DD->getTypeSourceInfo());
   }
 
   // Handle FunctionDecl's body here and write it after all other Stmts/Exprs
@@ -270,21 +270,26 @@
   if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
     Record.push_back(FD->doesThisDeclarationHaveABody());
     if (FD->doesThisDeclarationHaveABody())
-      AddFunctionDefinition(FD);
+      Record.AddFunctionDefinition(FD);
   }
+
+  // If this declaration is also a DeclContext, write blocks for the
+  // declarations that are lexically stored inside its context and those
+  // declarations that are visible from its context.
+  if (DeclContext *DC = dyn_cast<DeclContext>(D))
+    VisitDeclContext(DC);
 }
 
 void ASTDeclWriter::VisitDecl(Decl *D) {
-  Writer.AddDeclRef(cast_or_null<Decl>(D->getDeclContext()), Record);
+  Record.AddDeclRef(cast_or_null<Decl>(D->getDeclContext()));
   if (D->getDeclContext() != D->getLexicalDeclContext())
-    Writer.AddDeclRef(cast_or_null<Decl>(D->getLexicalDeclContext()), Record);
+    Record.AddDeclRef(cast_or_null<Decl>(D->getLexicalDeclContext()));
   else
     Record.push_back(0);
   Record.push_back(D->isInvalidDecl());
   Record.push_back(D->hasAttrs());
   if (D->hasAttrs())
-    Writer.WriteAttributes(llvm::makeArrayRef(D->getAttrs().begin(),
-                                              D->getAttrs().size()), Record);
+    Record.AddAttributes(D->getAttrs());
   Record.push_back(D->isImplicit());
   Record.push_back(D->isUsed(false));
   Record.push_back(D->isReferenced());
@@ -314,13 +319,35 @@
   }
 }
 
+void ASTDeclWriter::VisitPragmaCommentDecl(PragmaCommentDecl *D) {
+  StringRef Arg = D->getArg();
+  Record.push_back(Arg.size());
+  VisitDecl(D);
+  Record.AddSourceLocation(D->getLocStart());
+  Record.push_back(D->getCommentKind());
+  Record.AddString(Arg);
+  Code = serialization::DECL_PRAGMA_COMMENT;
+}
+
+void ASTDeclWriter::VisitPragmaDetectMismatchDecl(
+    PragmaDetectMismatchDecl *D) {
+  StringRef Name = D->getName();
+  StringRef Value = D->getValue();
+  Record.push_back(Name.size() + 1 + Value.size());
+  VisitDecl(D);
+  Record.AddSourceLocation(D->getLocStart());
+  Record.AddString(Name);
+  Record.AddString(Value);
+  Code = serialization::DECL_PRAGMA_DETECT_MISMATCH;
+}
+
 void ASTDeclWriter::VisitTranslationUnitDecl(TranslationUnitDecl *D) {
   llvm_unreachable("Translation units aren't directly serialized");
 }
 
 void ASTDeclWriter::VisitNamedDecl(NamedDecl *D) {
   VisitDecl(D);
-  Writer.AddDeclarationName(D->getDeclName(), Record);
+  Record.AddDeclarationName(D->getDeclName());
   Record.push_back(needsAnonymousDeclarationNumber(D)
                        ? Writer.getAnonymousDeclarationNumber(D)
                        : 0);
@@ -328,17 +355,17 @@
 
 void ASTDeclWriter::VisitTypeDecl(TypeDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getLocStart(), Record);
-  Writer.AddTypeRef(QualType(D->getTypeForDecl(), 0), Record);
+  Record.AddSourceLocation(D->getLocStart());
+  Record.AddTypeRef(QualType(D->getTypeForDecl(), 0));
 }
 
 void ASTDeclWriter::VisitTypedefNameDecl(TypedefNameDecl *D) {
   VisitRedeclarable(D);
   VisitTypeDecl(D);
-  Writer.AddTypeSourceInfo(D->getTypeSourceInfo(), Record);
+  Record.AddTypeSourceInfo(D->getTypeSourceInfo());
   Record.push_back(D->isModed());
   if (D->isModed())
-    Writer.AddTypeRef(D->getUnderlyingType(), Record);
+    Record.AddTypeRef(D->getUnderlyingType());
 }
 
 void ASTDeclWriter::VisitTypedefDecl(TypedefDecl *D) {
@@ -359,7 +386,7 @@
 
 void ASTDeclWriter::VisitTypeAliasDecl(TypeAliasDecl *D) {
   VisitTypedefNameDecl(D);
-  Writer.AddDeclRef(D->getDescribedAliasTemplate(), Record);
+  Record.AddDeclRef(D->getDescribedAliasTemplate());
   Code = serialization::DECL_TYPEALIAS;
 }
 
@@ -373,15 +400,15 @@
   Record.push_back(D->isEmbeddedInDeclarator());
   Record.push_back(D->isFreeStanding());
   Record.push_back(D->isCompleteDefinitionRequired());
-  Writer.AddSourceRange(D->getBraceRange(), Record);
+  Record.AddSourceRange(D->getBraceRange());
 
   if (D->hasExtInfo()) {
     Record.push_back(1);
-    Writer.AddQualifierInfo(*D->getExtInfo(), Record);
+    Record.AddQualifierInfo(*D->getExtInfo());
   } else if (auto *TD = D->getTypedefNameForAnonDecl()) {
     Record.push_back(2);
-    Writer.AddDeclRef(TD, Record);
-    Writer.AddIdentifierRef(TD->getDeclName().getAsIdentifierInfo(), Record);
+    Record.AddDeclRef(TD);
+    Record.AddIdentifierRef(TD->getDeclName().getAsIdentifierInfo());
   } else {
     Record.push_back(0);
   }
@@ -389,21 +416,21 @@
 
 void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) {
   VisitTagDecl(D);
-  Writer.AddTypeSourceInfo(D->getIntegerTypeSourceInfo(), Record);
+  Record.AddTypeSourceInfo(D->getIntegerTypeSourceInfo());
   if (!D->getIntegerTypeSourceInfo())
-    Writer.AddTypeRef(D->getIntegerType(), Record);
-  Writer.AddTypeRef(D->getPromotionType(), Record);
+    Record.AddTypeRef(D->getIntegerType());
+  Record.AddTypeRef(D->getPromotionType());
   Record.push_back(D->getNumPositiveBits());
   Record.push_back(D->getNumNegativeBits());
   Record.push_back(D->isScoped());
   Record.push_back(D->isScopedUsingClassTag());
   Record.push_back(D->isFixed());
   if (MemberSpecializationInfo *MemberInfo = D->getMemberSpecializationInfo()) {
-    Writer.AddDeclRef(MemberInfo->getInstantiatedFrom(), Record);
+    Record.AddDeclRef(MemberInfo->getInstantiatedFrom());
     Record.push_back(MemberInfo->getTemplateSpecializationKind());
-    Writer.AddSourceLocation(MemberInfo->getPointOfInstantiation(), Record);
+    Record.AddSourceLocation(MemberInfo->getPointOfInstantiation());
   } else {
-    Writer.AddDeclRef(nullptr, Record);
+    Record.AddDeclRef(nullptr);
   }
 
   if (D->getDeclContext() == D->getLexicalDeclContext() &&
@@ -457,31 +484,31 @@
 
 void ASTDeclWriter::VisitValueDecl(ValueDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddTypeRef(D->getType(), Record);
+  Record.AddTypeRef(D->getType());
 }
 
 void ASTDeclWriter::VisitEnumConstantDecl(EnumConstantDecl *D) {
   VisitValueDecl(D);
   Record.push_back(D->getInitExpr()? 1 : 0);
   if (D->getInitExpr())
-    Writer.AddStmt(D->getInitExpr());
-  Writer.AddAPSInt(D->getInitVal(), Record);
+    Record.AddStmt(D->getInitExpr());
+  Record.AddAPSInt(D->getInitVal());
 
   Code = serialization::DECL_ENUM_CONSTANT;
 }
 
 void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) {
   VisitValueDecl(D);
-  Writer.AddSourceLocation(D->getInnerLocStart(), Record);
+  Record.AddSourceLocation(D->getInnerLocStart());
   Record.push_back(D->hasExtInfo());
   if (D->hasExtInfo())
-    Writer.AddQualifierInfo(*D->getExtInfo(), Record);
+    Record.AddQualifierInfo(*D->getExtInfo());
 }
 
 void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
   VisitRedeclarable(D);
   VisitDeclaratorDecl(D);
-  Writer.AddDeclarationNameLoc(D->DNLoc, D->getDeclName(), Record);
+  Record.AddDeclarationNameLoc(D->DNLoc, D->getDeclName());
   Record.push_back(D->getIdentifierNamespace());
   
   // FunctionDecl's body is handled last at ASTWriterDecl::Visit,
@@ -503,20 +530,20 @@
   Record.push_back(D->HasSkippedBody);
   Record.push_back(D->IsLateTemplateParsed);
   Record.push_back(D->getLinkageInternal());
-  Writer.AddSourceLocation(D->getLocEnd(), Record);
+  Record.AddSourceLocation(D->getLocEnd());
 
   Record.push_back(D->getTemplatedKind());
   switch (D->getTemplatedKind()) {
   case FunctionDecl::TK_NonTemplate:
     break;
   case FunctionDecl::TK_FunctionTemplate:
-    Writer.AddDeclRef(D->getDescribedFunctionTemplate(), Record);
+    Record.AddDeclRef(D->getDescribedFunctionTemplate());
     break;
   case FunctionDecl::TK_MemberSpecialization: {
     MemberSpecializationInfo *MemberInfo = D->getMemberSpecializationInfo();
-    Writer.AddDeclRef(MemberInfo->getInstantiatedFrom(), Record);
+    Record.AddDeclRef(MemberInfo->getInstantiatedFrom());
     Record.push_back(MemberInfo->getTemplateSpecializationKind());
-    Writer.AddSourceLocation(MemberInfo->getPointOfInstantiation(), Record);
+    Record.AddSourceLocation(MemberInfo->getPointOfInstantiation());
     break;
   }
   case FunctionDecl::TK_FunctionTemplateSpecialization: {
@@ -525,11 +552,11 @@
 
     RegisterTemplateSpecialization(FTSInfo->getTemplate(), D);
 
-    Writer.AddDeclRef(FTSInfo->getTemplate(), Record);
+    Record.AddDeclRef(FTSInfo->getTemplate());
     Record.push_back(FTSInfo->getTemplateSpecializationKind());
     
     // Template arguments.
-    Writer.AddTemplateArgumentList(FTSInfo->TemplateArguments, Record);
+    Record.AddTemplateArgumentList(FTSInfo->TemplateArguments);
     
     // Template args as written.
     Record.push_back(FTSInfo->TemplateArgumentsAsWritten != nullptr);
@@ -537,20 +564,18 @@
       Record.push_back(FTSInfo->TemplateArgumentsAsWritten->NumTemplateArgs);
       for (int i=0, e = FTSInfo->TemplateArgumentsAsWritten->NumTemplateArgs;
              i!=e; ++i)
-        Writer.AddTemplateArgumentLoc((*FTSInfo->TemplateArgumentsAsWritten)[i],
-                                      Record);
-      Writer.AddSourceLocation(FTSInfo->TemplateArgumentsAsWritten->LAngleLoc,
-                               Record);
-      Writer.AddSourceLocation(FTSInfo->TemplateArgumentsAsWritten->RAngleLoc,
-                               Record);
+        Record.AddTemplateArgumentLoc(
+            (*FTSInfo->TemplateArgumentsAsWritten)[i]);
+      Record.AddSourceLocation(FTSInfo->TemplateArgumentsAsWritten->LAngleLoc);
+      Record.AddSourceLocation(FTSInfo->TemplateArgumentsAsWritten->RAngleLoc);
     }
     
-    Writer.AddSourceLocation(FTSInfo->getPointOfInstantiation(), Record);
+    Record.AddSourceLocation(FTSInfo->getPointOfInstantiation());
 
     if (D->isCanonicalDecl()) {
       // Write the template that contains the specializations set. We will
       // add a FunctionTemplateSpecializationInfo to it when reading.
-      Writer.AddDeclRef(FTSInfo->getTemplate()->getCanonicalDecl(), Record);
+      Record.AddDeclRef(FTSInfo->getTemplate()->getCanonicalDecl());
     }
     break;
   }
@@ -561,21 +586,21 @@
     // Templates.
     Record.push_back(DFTSInfo->getNumTemplates());
     for (int i=0, e = DFTSInfo->getNumTemplates(); i != e; ++i)
-      Writer.AddDeclRef(DFTSInfo->getTemplate(i), Record);
+      Record.AddDeclRef(DFTSInfo->getTemplate(i));
     
     // Templates args.
     Record.push_back(DFTSInfo->getNumTemplateArgs());
     for (int i=0, e = DFTSInfo->getNumTemplateArgs(); i != e; ++i)
-      Writer.AddTemplateArgumentLoc(DFTSInfo->getTemplateArg(i), Record);
-    Writer.AddSourceLocation(DFTSInfo->getLAngleLoc(), Record);
-    Writer.AddSourceLocation(DFTSInfo->getRAngleLoc(), Record);
+      Record.AddTemplateArgumentLoc(DFTSInfo->getTemplateArg(i));
+    Record.AddSourceLocation(DFTSInfo->getLAngleLoc());
+    Record.AddSourceLocation(DFTSInfo->getRAngleLoc());
     break;
   }
   }
 
   Record.push_back(D->param_size());
-  for (auto P : D->params())
-    Writer.AddDeclRef(P, Record);
+  for (auto P : D->parameters())
+    Record.AddDeclRef(P);
   Code = serialization::DECL_FUNCTION;
 }
 
@@ -587,9 +612,9 @@
                       D->getSelfDecl() != nullptr || D->getCmdDecl() != nullptr;
   Record.push_back(HasBodyStuff);
   if (HasBodyStuff) {
-    Writer.AddStmt(D->getBody());
-    Writer.AddDeclRef(D->getSelfDecl(), Record);
-    Writer.AddDeclRef(D->getCmdDecl(), Record);
+    Record.AddStmt(D->getBody());
+    Record.AddDeclRef(D->getSelfDecl());
+    Record.AddDeclRef(D->getCmdDecl());
   }
   Record.push_back(D->isInstanceMethod());
   Record.push_back(D->isVariadic());
@@ -602,7 +627,7 @@
   Record.push_back(D->HasRedeclaration);
   if (D->HasRedeclaration) {
     assert(Context.getObjCMethodRedeclaration(D));
-    Writer.AddDeclRef(Context.getObjCMethodRedeclaration(D), Record);
+    Record.AddDeclRef(Context.getObjCMethodRedeclaration(D));
   }
 
   // FIXME: stable encoding for @required/@optional
@@ -610,19 +635,19 @@
   // FIXME: stable encoding for in/out/inout/bycopy/byref/oneway/nullability
   Record.push_back(D->getObjCDeclQualifier());
   Record.push_back(D->hasRelatedResultType());
-  Writer.AddTypeRef(D->getReturnType(), Record);
-  Writer.AddTypeSourceInfo(D->getReturnTypeSourceInfo(), Record);
-  Writer.AddSourceLocation(D->getLocEnd(), Record);
+  Record.AddTypeRef(D->getReturnType());
+  Record.AddTypeSourceInfo(D->getReturnTypeSourceInfo());
+  Record.AddSourceLocation(D->getLocEnd());
   Record.push_back(D->param_size());
-  for (const auto *P : D->params())
-    Writer.AddDeclRef(P, Record);
+  for (const auto *P : D->parameters())
+    Record.AddDeclRef(P);
 
   Record.push_back(D->SelLocsKind);
   unsigned NumStoredSelLocs = D->getNumStoredSelLocs();
   SourceLocation *SelLocs = D->getStoredSelLocs();
   Record.push_back(NumStoredSelLocs);
   for (unsigned i = 0; i != NumStoredSelLocs; ++i)
-    Writer.AddSourceLocation(SelLocs[i], Record);
+    Record.AddSourceLocation(SelLocs[i]);
 
   Code = serialization::DECL_OBJC_METHOD;
 }
@@ -631,23 +656,23 @@
   VisitTypedefNameDecl(D);
   Record.push_back(D->Variance);
   Record.push_back(D->Index);
-  Writer.AddSourceLocation(D->VarianceLoc, Record);
-  Writer.AddSourceLocation(D->ColonLoc, Record);
+  Record.AddSourceLocation(D->VarianceLoc);
+  Record.AddSourceLocation(D->ColonLoc);
 
   Code = serialization::DECL_OBJC_TYPE_PARAM;
 }
 
 void ASTDeclWriter::VisitObjCContainerDecl(ObjCContainerDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getAtStartLoc(), Record);
-  Writer.AddSourceRange(D->getAtEndRange(), Record);
+  Record.AddSourceLocation(D->getAtStartLoc());
+  Record.AddSourceRange(D->getAtEndRange());
   // Abstract class (no need to define a stable serialization::DECL code).
 }
 
 void ASTDeclWriter::VisitObjCInterfaceDecl(ObjCInterfaceDecl *D) {
   VisitRedeclarable(D);
   VisitObjCContainerDecl(D);
-  Writer.AddTypeRef(QualType(D->getTypeForDecl(), 0), Record);
+  Record.AddTypeRef(QualType(D->getTypeForDecl(), 0));
   AddObjCTypeParamList(D->TypeParamList);
 
   Record.push_back(D->isThisDeclarationADefinition());
@@ -655,16 +680,16 @@
     // Write the DefinitionData
     ObjCInterfaceDecl::DefinitionData &Data = D->data();
     
-    Writer.AddTypeSourceInfo(D->getSuperClassTInfo(), Record);
-    Writer.AddSourceLocation(D->getEndOfDefinitionLoc(), Record);
+    Record.AddTypeSourceInfo(D->getSuperClassTInfo());
+    Record.AddSourceLocation(D->getEndOfDefinitionLoc());
     Record.push_back(Data.HasDesignatedInitializers);
 
     // Write out the protocols that are directly referenced by the @interface.
     Record.push_back(Data.ReferencedProtocols.size());
     for (const auto *P : D->protocols())
-      Writer.AddDeclRef(P, Record);
+      Record.AddDeclRef(P);
     for (const auto &PL : D->protocol_locs())
-      Writer.AddSourceLocation(PL, Record);
+      Record.AddSourceLocation(PL);
     
     // Write out the protocols that are transitively referenced.
     Record.push_back(Data.AllReferencedProtocols.size());
@@ -672,7 +697,7 @@
               P = Data.AllReferencedProtocols.begin(),
            PEnd = Data.AllReferencedProtocols.end();
          P != PEnd; ++P)
-      Writer.AddDeclRef(*P, Record);
+      Record.AddDeclRef(*P);
 
     
     if (ObjCCategoryDecl *Cat = D->getCategoryListRaw()) {
@@ -717,9 +742,9 @@
   if (D->isThisDeclarationADefinition()) {
     Record.push_back(D->protocol_size());
     for (const auto *I : D->protocols())
-      Writer.AddDeclRef(I, Record);
+      Record.AddDeclRef(I);
     for (const auto &PL : D->protocol_locs())
-      Writer.AddSourceLocation(PL, Record);
+      Record.AddSourceLocation(PL);
   }
   
   Code = serialization::DECL_OBJC_PROTOCOL;
@@ -732,80 +757,80 @@
 
 void ASTDeclWriter::VisitObjCCategoryDecl(ObjCCategoryDecl *D) {
   VisitObjCContainerDecl(D);
-  Writer.AddSourceLocation(D->getCategoryNameLoc(), Record);
-  Writer.AddSourceLocation(D->getIvarLBraceLoc(), Record);
-  Writer.AddSourceLocation(D->getIvarRBraceLoc(), Record);
-  Writer.AddDeclRef(D->getClassInterface(), Record);
+  Record.AddSourceLocation(D->getCategoryNameLoc());
+  Record.AddSourceLocation(D->getIvarLBraceLoc());
+  Record.AddSourceLocation(D->getIvarRBraceLoc());
+  Record.AddDeclRef(D->getClassInterface());
   AddObjCTypeParamList(D->TypeParamList);
   Record.push_back(D->protocol_size());
   for (const auto *I : D->protocols())
-    Writer.AddDeclRef(I, Record);
+    Record.AddDeclRef(I);
   for (const auto &PL : D->protocol_locs())
-    Writer.AddSourceLocation(PL, Record);
+    Record.AddSourceLocation(PL);
   Code = serialization::DECL_OBJC_CATEGORY;
 }
 
 void ASTDeclWriter::VisitObjCCompatibleAliasDecl(ObjCCompatibleAliasDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddDeclRef(D->getClassInterface(), Record);
+  Record.AddDeclRef(D->getClassInterface());
   Code = serialization::DECL_OBJC_COMPATIBLE_ALIAS;
 }
 
 void ASTDeclWriter::VisitObjCPropertyDecl(ObjCPropertyDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getAtLoc(), Record);
-  Writer.AddSourceLocation(D->getLParenLoc(), Record);
-  Writer.AddTypeRef(D->getType(), Record);
-  Writer.AddTypeSourceInfo(D->getTypeSourceInfo(), Record);
+  Record.AddSourceLocation(D->getAtLoc());
+  Record.AddSourceLocation(D->getLParenLoc());
+  Record.AddTypeRef(D->getType());
+  Record.AddTypeSourceInfo(D->getTypeSourceInfo());
   // FIXME: stable encoding
   Record.push_back((unsigned)D->getPropertyAttributes());
   Record.push_back((unsigned)D->getPropertyAttributesAsWritten());
   // FIXME: stable encoding
   Record.push_back((unsigned)D->getPropertyImplementation());
-  Writer.AddDeclarationName(D->getGetterName(), Record);
-  Writer.AddDeclarationName(D->getSetterName(), Record);
-  Writer.AddDeclRef(D->getGetterMethodDecl(), Record);
-  Writer.AddDeclRef(D->getSetterMethodDecl(), Record);
-  Writer.AddDeclRef(D->getPropertyIvarDecl(), Record);
+  Record.AddDeclarationName(D->getGetterName());
+  Record.AddDeclarationName(D->getSetterName());
+  Record.AddDeclRef(D->getGetterMethodDecl());
+  Record.AddDeclRef(D->getSetterMethodDecl());
+  Record.AddDeclRef(D->getPropertyIvarDecl());
   Code = serialization::DECL_OBJC_PROPERTY;
 }
 
 void ASTDeclWriter::VisitObjCImplDecl(ObjCImplDecl *D) {
   VisitObjCContainerDecl(D);
-  Writer.AddDeclRef(D->getClassInterface(), Record);
+  Record.AddDeclRef(D->getClassInterface());
   // Abstract class (no need to define a stable serialization::DECL code).
 }
 
 void ASTDeclWriter::VisitObjCCategoryImplDecl(ObjCCategoryImplDecl *D) {
   VisitObjCImplDecl(D);
-  Writer.AddIdentifierRef(D->getIdentifier(), Record);
-  Writer.AddSourceLocation(D->getCategoryNameLoc(), Record);
+  Record.AddIdentifierRef(D->getIdentifier());
+  Record.AddSourceLocation(D->getCategoryNameLoc());
   Code = serialization::DECL_OBJC_CATEGORY_IMPL;
 }
 
 void ASTDeclWriter::VisitObjCImplementationDecl(ObjCImplementationDecl *D) {
   VisitObjCImplDecl(D);
-  Writer.AddDeclRef(D->getSuperClass(), Record);
-  Writer.AddSourceLocation(D->getSuperClassLoc(), Record);
-  Writer.AddSourceLocation(D->getIvarLBraceLoc(), Record);
-  Writer.AddSourceLocation(D->getIvarRBraceLoc(), Record);
+  Record.AddDeclRef(D->getSuperClass());
+  Record.AddSourceLocation(D->getSuperClassLoc());
+  Record.AddSourceLocation(D->getIvarLBraceLoc());
+  Record.AddSourceLocation(D->getIvarRBraceLoc());
   Record.push_back(D->hasNonZeroConstructors());
   Record.push_back(D->hasDestructors());
   Record.push_back(D->NumIvarInitializers);
   if (D->NumIvarInitializers)
-    Writer.AddCXXCtorInitializersRef(
-        llvm::makeArrayRef(D->init_begin(), D->init_end()), Record);
+    Record.AddCXXCtorInitializers(
+        llvm::makeArrayRef(D->init_begin(), D->init_end()));
   Code = serialization::DECL_OBJC_IMPLEMENTATION;
 }
 
 void ASTDeclWriter::VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D) {
   VisitDecl(D);
-  Writer.AddSourceLocation(D->getLocStart(), Record);
-  Writer.AddDeclRef(D->getPropertyDecl(), Record);
-  Writer.AddDeclRef(D->getPropertyIvarDecl(), Record);
-  Writer.AddSourceLocation(D->getPropertyIvarDeclLoc(), Record);
-  Writer.AddStmt(D->getGetterCXXConstructor());
-  Writer.AddStmt(D->getSetterCXXAssignment());
+  Record.AddSourceLocation(D->getLocStart());
+  Record.AddDeclRef(D->getPropertyDecl());
+  Record.AddDeclRef(D->getPropertyIvarDecl());
+  Record.AddSourceLocation(D->getPropertyIvarDeclLoc());
+  Record.AddStmt(D->getGetterCXXConstructor());
+  Record.AddStmt(D->getSetterCXXAssignment());
   Code = serialization::DECL_OBJC_PROPERTY_IMPL;
 }
 
@@ -817,15 +842,14 @@
     Record.push_back(0);
   } else if (D->InitStorage.getInt() == FieldDecl::ISK_CapturedVLAType) {
     Record.push_back(D->InitStorage.getInt() + 1);
-    Writer.AddTypeRef(
-        QualType(static_cast<Type *>(D->InitStorage.getPointer()), 0),
-        Record);
+    Record.AddTypeRef(
+        QualType(static_cast<Type *>(D->InitStorage.getPointer()), 0));
   } else {
     Record.push_back(D->InitStorage.getInt() + 1);
-    Writer.AddStmt(static_cast<Expr *>(D->InitStorage.getPointer()));
+    Record.AddStmt(static_cast<Expr *>(D->InitStorage.getPointer()));
   }
   if (!D->getDeclName())
-    Writer.AddDeclRef(Context.getInstantiatedFromUnnamedFieldDecl(D), Record);
+    Record.AddDeclRef(Context.getInstantiatedFromUnnamedFieldDecl(D));
 
   if (D->getDeclContext() == D->getLexicalDeclContext() &&
       !D->hasAttrs() &&
@@ -848,8 +872,8 @@
 
 void ASTDeclWriter::VisitMSPropertyDecl(MSPropertyDecl *D) {
   VisitDeclaratorDecl(D);
-  Writer.AddIdentifierRef(D->getGetterId(), Record);
-  Writer.AddIdentifierRef(D->getSetterId(), Record);
+  Record.AddIdentifierRef(D->getGetterId());
+  Record.AddIdentifierRef(D->getSetterId());
   Code = serialization::DECL_MS_PROPERTY;
 }
 
@@ -858,7 +882,7 @@
   Record.push_back(D->getChainingSize());
 
   for (const auto *P : D->chain())
-    Writer.AddDeclRef(P, Record);
+    Record.AddDeclRef(P);
   Code = serialization::DECL_INDIRECTFIELD;
 }
 
@@ -873,6 +897,8 @@
     Record.push_back(D->isNRVOVariable());
     Record.push_back(D->isCXXForRangeDecl());
     Record.push_back(D->isARCPseudoStrong());
+    Record.push_back(D->isInline());
+    Record.push_back(D->isInlineSpecified());
     Record.push_back(D->isConstexpr());
     Record.push_back(D->isInitCapture());
     Record.push_back(D->isPreviousDeclInSameBlockScope());
@@ -881,7 +907,7 @@
 
   if (D->getInit()) {
     Record.push_back(!D->isInitKnownICE() ? 1 : (D->isInitICE() ? 3 : 2));
-    Writer.AddStmt(D->getInit());
+    Record.AddStmt(D->getInit());
   } else {
     Record.push_back(0);
   }
@@ -891,13 +917,13 @@
   };
   if (VarTemplateDecl *TemplD = D->getDescribedVarTemplate()) {
     Record.push_back(VarTemplate);
-    Writer.AddDeclRef(TemplD, Record);
+    Record.AddDeclRef(TemplD);
   } else if (MemberSpecializationInfo *SpecInfo
                = D->getMemberSpecializationInfo()) {
     Record.push_back(StaticDataMemberSpecialization);
-    Writer.AddDeclRef(SpecInfo->getInstantiatedFrom(), Record);
+    Record.AddDeclRef(SpecInfo->getInstantiatedFrom());
     Record.push_back(SpecInfo->getTemplateSpecializationKind());
-    Writer.AddSourceLocation(SpecInfo->getPointOfInstantiation(), Record);
+    Record.AddSourceLocation(SpecInfo->getPointOfInstantiation());
   } else {
     Record.push_back(VarNotTemplate);
   }
@@ -917,8 +943,8 @@
       D->getFirstDecl() == D->getMostRecentDecl() &&
       D->getInitStyle() == VarDecl::CInit &&
       D->getInit() == nullptr &&
-      !isa<ParmVarDecl>(D) &&
-      !isa<VarTemplateSpecializationDecl>(D) &&
+      D->getKind() == Decl::Var &&
+      !D->isInline() &&
       !D->isConstexpr() &&
       !D->isInitCapture() &&
       !D->isPreviousDeclInSameBlockScope() &&
@@ -943,7 +969,7 @@
   Record.push_back(D->hasInheritedDefaultArg());
   Record.push_back(D->hasUninstantiatedDefaultArg());
   if (D->hasUninstantiatedDefaultArg())
-    Writer.AddStmt(D->getUninstantiatedDefaultArg());
+    Record.AddStmt(D->getUninstantiatedDefaultArg());
   Code = serialization::DECL_PARM_VAR;
 
   assert(!D->isARCPseudoStrong()); // can be true of ImplicitParamDecl
@@ -980,10 +1006,26 @@
          "PARM_VAR_DECL can't be static data member");
 }
 
+void ASTDeclWriter::VisitDecompositionDecl(DecompositionDecl *D) {
+  // Record the number of bindings first to simplify deserialization.
+  Record.push_back(D->bindings().size());
+
+  VisitVarDecl(D);
+  for (auto *B : D->bindings())
+    Record.AddDeclRef(B);
+  Code = serialization::DECL_DECOMPOSITION;
+}
+
+void ASTDeclWriter::VisitBindingDecl(BindingDecl *D) {
+  VisitValueDecl(D);
+  Record.AddStmt(D->getBinding());
+  Code = serialization::DECL_BINDING;
+}
+
 void ASTDeclWriter::VisitFileScopeAsmDecl(FileScopeAsmDecl *D) {
   VisitDecl(D);
-  Writer.AddStmt(D->getAsmString());
-  Writer.AddSourceLocation(D->getRParenLoc(), Record);
+  Record.AddStmt(D->getAsmString());
+  Record.AddSourceLocation(D->getRParenLoc());
   Code = serialization::DECL_FILE_SCOPE_ASM;
 }
 
@@ -994,19 +1036,18 @@
 
 void ASTDeclWriter::VisitBlockDecl(BlockDecl *D) {
   VisitDecl(D);
-  Writer.AddStmt(D->getBody());
-  Writer.AddTypeSourceInfo(D->getSignatureAsWritten(), Record);
+  Record.AddStmt(D->getBody());
+  Record.AddTypeSourceInfo(D->getSignatureAsWritten());
   Record.push_back(D->param_size());
-  for (FunctionDecl::param_iterator P = D->param_begin(), PEnd = D->param_end();
-       P != PEnd; ++P)
-    Writer.AddDeclRef(*P, Record);
+  for (ParmVarDecl *P : D->parameters())
+    Record.AddDeclRef(P);
   Record.push_back(D->isVariadic());
   Record.push_back(D->blockMissingReturnType());
   Record.push_back(D->isConversionFromLambda());
   Record.push_back(D->capturesCXXThis());
   Record.push_back(D->getNumCaptures());
   for (const auto &capture : D->captures()) {
-    Writer.AddDeclRef(capture.getVariable(), Record);
+    Record.AddDeclRef(capture.getVariable());
 
     unsigned flags = 0;
     if (capture.isByRef()) flags |= 1;
@@ -1014,7 +1055,7 @@
     if (capture.hasCopyExpr()) flags |= 4;
     Record.push_back(flags);
 
-    if (capture.hasCopyExpr()) Writer.AddStmt(capture.getCopyExpr());
+    if (capture.hasCopyExpr()) Record.AddStmt(capture.getCopyExpr());
   }
 
   Code = serialization::DECL_BLOCK;
@@ -1027,21 +1068,21 @@
   Record.push_back(CD->isNothrow() ? 1 : 0);
   // Body is stored by VisitCapturedStmt.
   for (unsigned I = 0; I < CD->getNumParams(); ++I)
-    Writer.AddDeclRef(CD->getParam(I), Record);
+    Record.AddDeclRef(CD->getParam(I));
   Code = serialization::DECL_CAPTURED;
 }
 
 void ASTDeclWriter::VisitLinkageSpecDecl(LinkageSpecDecl *D) {
   VisitDecl(D);
   Record.push_back(D->getLanguage());
-  Writer.AddSourceLocation(D->getExternLoc(), Record);
-  Writer.AddSourceLocation(D->getRBraceLoc(), Record);
+  Record.AddSourceLocation(D->getExternLoc());
+  Record.AddSourceLocation(D->getRBraceLoc());
   Code = serialization::DECL_LINKAGE_SPEC;
 }
 
 void ASTDeclWriter::VisitLabelDecl(LabelDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getLocStart(), Record);
+  Record.AddSourceLocation(D->getLocStart());
   Code = serialization::DECL_LABEL;
 }
 
@@ -1050,11 +1091,11 @@
   VisitRedeclarable(D);
   VisitNamedDecl(D);
   Record.push_back(D->isInline());
-  Writer.AddSourceLocation(D->getLocStart(), Record);
-  Writer.AddSourceLocation(D->getRBraceLoc(), Record);
+  Record.AddSourceLocation(D->getLocStart());
+  Record.AddSourceLocation(D->getRBraceLoc());
 
   if (D->isOriginalNamespace())
-    Writer.AddDeclRef(D->getAnonymousNamespace(), Record);
+    Record.AddDeclRef(D->getAnonymousNamespace());
   Code = serialization::DECL_NAMESPACE;
 
   if (Writer.hasChain() && D->isAnonymousNamespace() && 
@@ -1075,56 +1116,65 @@
 void ASTDeclWriter::VisitNamespaceAliasDecl(NamespaceAliasDecl *D) {
   VisitRedeclarable(D);
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getNamespaceLoc(), Record);
-  Writer.AddSourceLocation(D->getTargetNameLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(D->getQualifierLoc(), Record);
-  Writer.AddDeclRef(D->getNamespace(), Record);
+  Record.AddSourceLocation(D->getNamespaceLoc());
+  Record.AddSourceLocation(D->getTargetNameLoc());
+  Record.AddNestedNameSpecifierLoc(D->getQualifierLoc());
+  Record.AddDeclRef(D->getNamespace());
   Code = serialization::DECL_NAMESPACE_ALIAS;
 }
 
 void ASTDeclWriter::VisitUsingDecl(UsingDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getUsingLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(D->getQualifierLoc(), Record);
-  Writer.AddDeclarationNameLoc(D->DNLoc, D->getDeclName(), Record);
-  Writer.AddDeclRef(D->FirstUsingShadow.getPointer(), Record);
+  Record.AddSourceLocation(D->getUsingLoc());
+  Record.AddNestedNameSpecifierLoc(D->getQualifierLoc());
+  Record.AddDeclarationNameLoc(D->DNLoc, D->getDeclName());
+  Record.AddDeclRef(D->FirstUsingShadow.getPointer());
   Record.push_back(D->hasTypename());
-  Writer.AddDeclRef(Context.getInstantiatedFromUsingDecl(D), Record);
+  Record.AddDeclRef(Context.getInstantiatedFromUsingDecl(D));
   Code = serialization::DECL_USING;
 }
 
 void ASTDeclWriter::VisitUsingShadowDecl(UsingShadowDecl *D) {
   VisitRedeclarable(D);
   VisitNamedDecl(D);
-  Writer.AddDeclRef(D->getTargetDecl(), Record);
-  Writer.AddDeclRef(D->UsingOrNextShadow, Record);
-  Writer.AddDeclRef(Context.getInstantiatedFromUsingShadowDecl(D), Record);
+  Record.AddDeclRef(D->getTargetDecl());
+  Record.AddDeclRef(D->UsingOrNextShadow);
+  Record.AddDeclRef(Context.getInstantiatedFromUsingShadowDecl(D));
   Code = serialization::DECL_USING_SHADOW;
 }
 
+void ASTDeclWriter::VisitConstructorUsingShadowDecl(
+    ConstructorUsingShadowDecl *D) {
+  VisitUsingShadowDecl(D);
+  Record.AddDeclRef(D->NominatedBaseClassShadowDecl);
+  Record.AddDeclRef(D->ConstructedBaseClassShadowDecl);
+  Record.push_back(D->IsVirtual);
+  Code = serialization::DECL_CONSTRUCTOR_USING_SHADOW;
+}
+
 void ASTDeclWriter::VisitUsingDirectiveDecl(UsingDirectiveDecl *D) {
   VisitNamedDecl(D);
-  Writer.AddSourceLocation(D->getUsingLoc(), Record);
-  Writer.AddSourceLocation(D->getNamespaceKeyLocation(), Record);
-  Writer.AddNestedNameSpecifierLoc(D->getQualifierLoc(), Record);
-  Writer.AddDeclRef(D->getNominatedNamespace(), Record);
-  Writer.AddDeclRef(dyn_cast<Decl>(D->getCommonAncestor()), Record);
+  Record.AddSourceLocation(D->getUsingLoc());
+  Record.AddSourceLocation(D->getNamespaceKeyLocation());
+  Record.AddNestedNameSpecifierLoc(D->getQualifierLoc());
+  Record.AddDeclRef(D->getNominatedNamespace());
+  Record.AddDeclRef(dyn_cast<Decl>(D->getCommonAncestor()));
   Code = serialization::DECL_USING_DIRECTIVE;
 }
 
 void ASTDeclWriter::VisitUnresolvedUsingValueDecl(UnresolvedUsingValueDecl *D) {
   VisitValueDecl(D);
-  Writer.AddSourceLocation(D->getUsingLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(D->getQualifierLoc(), Record);
-  Writer.AddDeclarationNameLoc(D->DNLoc, D->getDeclName(), Record);
+  Record.AddSourceLocation(D->getUsingLoc());
+  Record.AddNestedNameSpecifierLoc(D->getQualifierLoc());
+  Record.AddDeclarationNameLoc(D->DNLoc, D->getDeclName());
   Code = serialization::DECL_UNRESOLVED_USING_VALUE;
 }
 
 void ASTDeclWriter::VisitUnresolvedUsingTypenameDecl(
                                                UnresolvedUsingTypenameDecl *D) {
   VisitTypeDecl(D);
-  Writer.AddSourceLocation(D->getTypenameLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(D->getQualifierLoc(), Record);
+  Record.AddSourceLocation(D->getTypenameLoc());
+  Record.AddNestedNameSpecifierLoc(D->getQualifierLoc());
   Code = serialization::DECL_UNRESOLVED_USING_TYPENAME;
 }
 
@@ -1136,25 +1186,25 @@
   };
   if (ClassTemplateDecl *TemplD = D->getDescribedClassTemplate()) {
     Record.push_back(CXXRecTemplate);
-    Writer.AddDeclRef(TemplD, Record);
+    Record.AddDeclRef(TemplD);
   } else if (MemberSpecializationInfo *MSInfo
                = D->getMemberSpecializationInfo()) {
     Record.push_back(CXXRecMemberSpecialization);
-    Writer.AddDeclRef(MSInfo->getInstantiatedFrom(), Record);
+    Record.AddDeclRef(MSInfo->getInstantiatedFrom());
     Record.push_back(MSInfo->getTemplateSpecializationKind());
-    Writer.AddSourceLocation(MSInfo->getPointOfInstantiation(), Record);
+    Record.AddSourceLocation(MSInfo->getPointOfInstantiation());
   } else {
     Record.push_back(CXXRecNotTemplate);
   }
 
   Record.push_back(D->isThisDeclarationADefinition());
   if (D->isThisDeclarationADefinition())
-    Writer.AddCXXDefinitionData(D, Record);
+    Record.AddCXXDefinitionData(D);
 
   // Store (what we currently believe to be) the key function to avoid
   // deserializing every method so we can compute it.
   if (D->IsCompleteDefinition)
-    Writer.AddDeclRef(Context.getCurrentKeyFunction(D), Record);
+    Record.AddDeclRef(Context.getCurrentKeyFunction(D));
 
   Code = serialization::DECL_CXX_RECORD;
 }
@@ -1166,7 +1216,7 @@
     for (CXXMethodDecl::method_iterator
            I = D->begin_overridden_methods(), E = D->end_overridden_methods();
            I != E; ++I)
-      Writer.AddDeclRef(*I, Record);
+      Record.AddDeclRef(*I);
   } else {
     // We only need to record overridden methods once for the canonical decl.
     Record.push_back(0);
@@ -1187,18 +1237,27 @@
 }
 
 void ASTDeclWriter::VisitCXXConstructorDecl(CXXConstructorDecl *D) {
+  if (auto Inherited = D->getInheritedConstructor()) {
+    Record.AddDeclRef(Inherited.getShadowDecl());
+    Record.AddDeclRef(Inherited.getConstructor());
+    Code = serialization::DECL_CXX_INHERITED_CONSTRUCTOR;
+  } else {
+    Code = serialization::DECL_CXX_CONSTRUCTOR;
+  }
+
   VisitCXXMethodDecl(D);
 
-  Writer.AddDeclRef(D->getInheritedConstructor(), Record);
   Record.push_back(D->IsExplicitSpecified);
 
-  Code = serialization::DECL_CXX_CONSTRUCTOR;
+  Code = D->isInheritingConstructor()
+             ? serialization::DECL_CXX_INHERITED_CONSTRUCTOR
+             : serialization::DECL_CXX_CONSTRUCTOR;
 }
 
 void ASTDeclWriter::VisitCXXDestructorDecl(CXXDestructorDecl *D) {
   VisitCXXMethodDecl(D);
 
-  Writer.AddDeclRef(D->getOperatorDelete(), Record);
+  Record.AddDeclRef(D->getOperatorDelete());
 
   Code = serialization::DECL_CXX_DESTRUCTOR;
 }
@@ -1215,11 +1274,11 @@
   ArrayRef<SourceLocation> IdentifierLocs = D->getIdentifierLocs();
   Record.push_back(!IdentifierLocs.empty());
   if (IdentifierLocs.empty()) {
-    Writer.AddSourceLocation(D->getLocEnd(), Record);
+    Record.AddSourceLocation(D->getLocEnd());
     Record.push_back(1);
   } else {
     for (unsigned I = 0, N = IdentifierLocs.size(); I != N; ++I)
-      Writer.AddSourceLocation(IdentifierLocs[I], Record);
+      Record.AddSourceLocation(IdentifierLocs[I]);
     Record.push_back(IdentifierLocs.size());
   }
   // Note: the number of source locations must always be the last element in
@@ -1229,7 +1288,7 @@
 
 void ASTDeclWriter::VisitAccessSpecDecl(AccessSpecDecl *D) {
   VisitDecl(D);
-  Writer.AddSourceLocation(D->getColonLoc(), Record);
+  Record.AddSourceLocation(D->getColonLoc());
   Code = serialization::DECL_ACCESS_SPEC;
 }
 
@@ -1241,15 +1300,14 @@
   bool hasFriendDecl = D->Friend.is<NamedDecl*>();
   Record.push_back(hasFriendDecl);
   if (hasFriendDecl)
-    Writer.AddDeclRef(D->getFriendDecl(), Record);
+    Record.AddDeclRef(D->getFriendDecl());
   else
-    Writer.AddTypeSourceInfo(D->getFriendType(), Record);
+    Record.AddTypeSourceInfo(D->getFriendType());
   for (unsigned i = 0; i < D->NumTPLists; ++i)
-    Writer.AddTemplateParameterList(D->getFriendTypeTemplateParameterList(i),
-                                    Record);
-  Writer.AddDeclRef(D->getNextFriend(), Record);
+    Record.AddTemplateParameterList(D->getFriendTypeTemplateParameterList(i));
+  Record.AddDeclRef(D->getNextFriend());
   Record.push_back(D->UnsupportedFriend);
-  Writer.AddSourceLocation(D->FriendLoc, Record);
+  Record.AddSourceLocation(D->FriendLoc);
   Code = serialization::DECL_FRIEND;
 }
 
@@ -1257,21 +1315,21 @@
   VisitDecl(D);
   Record.push_back(D->getNumTemplateParameters());
   for (unsigned i = 0, e = D->getNumTemplateParameters(); i != e; ++i)
-    Writer.AddTemplateParameterList(D->getTemplateParameterList(i), Record);
+    Record.AddTemplateParameterList(D->getTemplateParameterList(i));
   Record.push_back(D->getFriendDecl() != nullptr);
   if (D->getFriendDecl())
-    Writer.AddDeclRef(D->getFriendDecl(), Record);
+    Record.AddDeclRef(D->getFriendDecl());
   else
-    Writer.AddTypeSourceInfo(D->getFriendType(), Record);
-  Writer.AddSourceLocation(D->getFriendLoc(), Record);
+    Record.AddTypeSourceInfo(D->getFriendType());
+  Record.AddSourceLocation(D->getFriendLoc());
   Code = serialization::DECL_FRIEND_TEMPLATE;
 }
 
 void ASTDeclWriter::VisitTemplateDecl(TemplateDecl *D) {
   VisitNamedDecl(D);
 
-  Writer.AddDeclRef(D->getTemplatedDecl(), Record);
-  Writer.AddTemplateParameterList(D->getTemplateParameters(), Record);
+  Record.AddDeclRef(D->getTemplatedDecl());
+  Record.AddTemplateParameterList(D->getTemplateParameters());
 }
 
 void ASTDeclWriter::VisitRedeclarableTemplateDecl(RedeclarableTemplateDecl *D) {
@@ -1281,7 +1339,7 @@
   // getCommonPtr() can be used while this is still initializing.
   if (D->isFirstDecl()) {
     // This declaration owns the 'common' pointer, so serialize that data now.
-    Writer.AddDeclRef(D->getInstantiatedFromMemberTemplate(), Record);
+    Record.AddDeclRef(D->getInstantiatedFromMemberTemplate());
     if (D->getInstantiatedFromMemberTemplate())
       Record.push_back(D->isMemberSpecialization());
   }
@@ -1308,28 +1366,27 @@
                      ClassTemplatePartialSpecializationDecl *> InstFrom
     = D->getSpecializedTemplateOrPartial();
   if (Decl *InstFromD = InstFrom.dyn_cast<ClassTemplateDecl *>()) {
-    Writer.AddDeclRef(InstFromD, Record);
+    Record.AddDeclRef(InstFromD);
   } else {
-    Writer.AddDeclRef(InstFrom.get<ClassTemplatePartialSpecializationDecl *>(),
-                      Record);
-    Writer.AddTemplateArgumentList(&D->getTemplateInstantiationArgs(), Record);
+    Record.AddDeclRef(InstFrom.get<ClassTemplatePartialSpecializationDecl *>());
+    Record.AddTemplateArgumentList(&D->getTemplateInstantiationArgs());
   }
 
-  Writer.AddTemplateArgumentList(&D->getTemplateArgs(), Record);
-  Writer.AddSourceLocation(D->getPointOfInstantiation(), Record);
+  Record.AddTemplateArgumentList(&D->getTemplateArgs());
+  Record.AddSourceLocation(D->getPointOfInstantiation());
   Record.push_back(D->getSpecializationKind());
   Record.push_back(D->isCanonicalDecl());
 
   if (D->isCanonicalDecl()) {
     // When reading, we'll add it to the folding set of the following template. 
-    Writer.AddDeclRef(D->getSpecializedTemplate()->getCanonicalDecl(), Record);
+    Record.AddDeclRef(D->getSpecializedTemplate()->getCanonicalDecl());
   }
 
   // Explicit info.
-  Writer.AddTypeSourceInfo(D->getTypeAsWritten(), Record);
+  Record.AddTypeSourceInfo(D->getTypeAsWritten());
   if (D->getTypeAsWritten()) {
-    Writer.AddSourceLocation(D->getExternLoc(), Record);
-    Writer.AddSourceLocation(D->getTemplateKeywordLoc(), Record);
+    Record.AddSourceLocation(D->getExternLoc());
+    Record.AddSourceLocation(D->getTemplateKeywordLoc());
   }
 
   Code = serialization::DECL_CLASS_TEMPLATE_SPECIALIZATION;
@@ -1339,12 +1396,12 @@
                                     ClassTemplatePartialSpecializationDecl *D) {
   VisitClassTemplateSpecializationDecl(D);
 
-  Writer.AddTemplateParameterList(D->getTemplateParameters(), Record);
-  Writer.AddASTTemplateArgumentListInfo(D->getTemplateArgsAsWritten(), Record);
+  Record.AddTemplateParameterList(D->getTemplateParameters());
+  Record.AddASTTemplateArgumentListInfo(D->getTemplateArgsAsWritten());
 
   // These are read/set from/to the first declaration.
   if (D->getPreviousDecl() == nullptr) {
-    Writer.AddDeclRef(D->getInstantiatedFromMember(), Record);
+    Record.AddDeclRef(D->getInstantiatedFromMember());
     Record.push_back(D->isMemberSpecialization());
   }
 
@@ -1368,28 +1425,27 @@
   llvm::PointerUnion<VarTemplateDecl *, VarTemplatePartialSpecializationDecl *>
   InstFrom = D->getSpecializedTemplateOrPartial();
   if (Decl *InstFromD = InstFrom.dyn_cast<VarTemplateDecl *>()) {
-    Writer.AddDeclRef(InstFromD, Record);
+    Record.AddDeclRef(InstFromD);
   } else {
-    Writer.AddDeclRef(InstFrom.get<VarTemplatePartialSpecializationDecl *>(),
-                      Record);
-    Writer.AddTemplateArgumentList(&D->getTemplateInstantiationArgs(), Record);
+    Record.AddDeclRef(InstFrom.get<VarTemplatePartialSpecializationDecl *>());
+    Record.AddTemplateArgumentList(&D->getTemplateInstantiationArgs());
   }
 
   // Explicit info.
-  Writer.AddTypeSourceInfo(D->getTypeAsWritten(), Record);
+  Record.AddTypeSourceInfo(D->getTypeAsWritten());
   if (D->getTypeAsWritten()) {
-    Writer.AddSourceLocation(D->getExternLoc(), Record);
-    Writer.AddSourceLocation(D->getTemplateKeywordLoc(), Record);
+    Record.AddSourceLocation(D->getExternLoc());
+    Record.AddSourceLocation(D->getTemplateKeywordLoc());
   }
 
-  Writer.AddTemplateArgumentList(&D->getTemplateArgs(), Record);
-  Writer.AddSourceLocation(D->getPointOfInstantiation(), Record);
+  Record.AddTemplateArgumentList(&D->getTemplateArgs());
+  Record.AddSourceLocation(D->getPointOfInstantiation());
   Record.push_back(D->getSpecializationKind());
   Record.push_back(D->isCanonicalDecl());
 
   if (D->isCanonicalDecl()) {
     // When reading, we'll add it to the folding set of the following template.
-    Writer.AddDeclRef(D->getSpecializedTemplate()->getCanonicalDecl(), Record);
+    Record.AddDeclRef(D->getSpecializedTemplate()->getCanonicalDecl());
   }
 
   Code = serialization::DECL_VAR_TEMPLATE_SPECIALIZATION;
@@ -1399,12 +1455,12 @@
     VarTemplatePartialSpecializationDecl *D) {
   VisitVarTemplateSpecializationDecl(D);
 
-  Writer.AddTemplateParameterList(D->getTemplateParameters(), Record);
-  Writer.AddASTTemplateArgumentListInfo(D->getTemplateArgsAsWritten(), Record);
+  Record.AddTemplateParameterList(D->getTemplateParameters());
+  Record.AddASTTemplateArgumentListInfo(D->getTemplateArgsAsWritten());
 
   // These are read/set from/to the first declaration.
   if (D->getPreviousDecl() == nullptr) {
-    Writer.AddDeclRef(D->getInstantiatedFromMember(), Record);
+    Record.AddDeclRef(D->getInstantiatedFromMember());
     Record.push_back(D->isMemberSpecialization());
   }
 
@@ -1414,7 +1470,7 @@
 void ASTDeclWriter::VisitClassScopeFunctionSpecializationDecl(
                                     ClassScopeFunctionSpecializationDecl *D) {
   VisitDecl(D);
-  Writer.AddDeclRef(D->getSpecialization(), Record);
+  Record.AddDeclRef(D->getSpecialization());
   Code = serialization::DECL_CLASS_SCOPE_FUNCTION_SPECIALIZATION;
 }
 
@@ -1436,7 +1492,7 @@
                         !D->defaultArgumentWasInherited();
   Record.push_back(OwnsDefaultArg);
   if (OwnsDefaultArg)
-    Writer.AddTypeSourceInfo(D->getDefaultArgumentInfo(), Record);
+    Record.AddTypeSourceInfo(D->getDefaultArgumentInfo());
 
   Code = serialization::DECL_TEMPLATE_TYPE_PARM;
 }
@@ -1455,8 +1511,8 @@
   
   if (D->isExpandedParameterPack()) {
     for (unsigned I = 0, N = D->getNumExpansionTypes(); I != N; ++I) {
-      Writer.AddTypeRef(D->getExpansionType(I), Record);
-      Writer.AddTypeSourceInfo(D->getExpansionTypeSourceInfo(I), Record);
+      Record.AddTypeRef(D->getExpansionType(I));
+      Record.AddTypeSourceInfo(D->getExpansionTypeSourceInfo(I));
     }
       
     Code = serialization::DECL_EXPANDED_NON_TYPE_TEMPLATE_PARM_PACK;
@@ -1467,7 +1523,7 @@
                           !D->defaultArgumentWasInherited();
     Record.push_back(OwnsDefaultArg);
     if (OwnsDefaultArg)
-      Writer.AddStmt(D->getDefaultArgument());
+      Record.AddStmt(D->getDefaultArgument());
     Code = serialization::DECL_NON_TYPE_TEMPLATE_PARM;
   }
 }
@@ -1487,8 +1543,7 @@
   if (D->isExpandedParameterPack()) {
     for (unsigned I = 0, N = D->getNumExpansionTemplateParameters();
          I != N; ++I)
-      Writer.AddTemplateParameterList(D->getExpansionTemplateParameters(I),
-                                      Record);
+      Record.AddTemplateParameterList(D->getExpansionTemplateParameters(I));
     Code = serialization::DECL_EXPANDED_TEMPLATE_TEMPLATE_PARM_PACK;
   } else {
     // Rest of TemplateTemplateParmDecl.
@@ -1497,7 +1552,7 @@
                           !D->defaultArgumentWasInherited();
     Record.push_back(OwnsDefaultArg);
     if (OwnsDefaultArg)
-      Writer.AddTemplateArgumentLoc(D->getDefaultArgument(), Record);
+      Record.AddTemplateArgumentLoc(D->getDefaultArgument());
     Code = serialization::DECL_TEMPLATE_TEMPLATE_PARM;
   }
 }
@@ -1509,41 +1564,20 @@
 
 void ASTDeclWriter::VisitStaticAssertDecl(StaticAssertDecl *D) {
   VisitDecl(D);
-  Writer.AddStmt(D->getAssertExpr());
+  Record.AddStmt(D->getAssertExpr());
   Record.push_back(D->isFailed());
-  Writer.AddStmt(D->getMessage());
-  Writer.AddSourceLocation(D->getRParenLoc(), Record);
+  Record.AddStmt(D->getMessage());
+  Record.AddSourceLocation(D->getRParenLoc());
   Code = serialization::DECL_STATIC_ASSERT;
 }
 
 /// \brief Emit the DeclContext part of a declaration context decl.
-///
-/// \param LexicalOffset the offset at which the DECL_CONTEXT_LEXICAL
-/// block for this declaration context is stored. May be 0 to indicate
-/// that there are no declarations stored within this context.
-///
-/// \param VisibleOffset the offset at which the DECL_CONTEXT_VISIBLE
-/// block for this declaration context is stored. May be 0 to indicate
-/// that there are no declarations visible from this context. Note
-/// that this value will not be emitted for non-primary declaration
-/// contexts.
-void ASTDeclWriter::VisitDeclContext(DeclContext *DC, uint64_t LexicalOffset,
-                                     uint64_t VisibleOffset) {
-  Record.push_back(LexicalOffset);
-  Record.push_back(VisibleOffset);
+void ASTDeclWriter::VisitDeclContext(DeclContext *DC) {
+  Record.AddOffset(Writer.WriteDeclContextLexicalBlock(Context, DC));
+  Record.AddOffset(Writer.WriteDeclContextVisibleBlock(Context, DC));
 }
 
 const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) {
-  /// \brief Is this a local declaration (that is, one that will be written to
-  /// our AST file)? This is the case for declarations that are neither imported
-  /// from another AST file nor predefined.
-  auto IsLocalDecl = [&](const Decl *D) -> bool {
-    if (D->isFromASTFile())
-      return false;
-    auto I = DeclIDs.find(D);
-    return (I == DeclIDs.end() || I->second >= NUM_PREDEF_DECL_IDS);
-  };
-
   assert(IsLocalDecl(D) && "expected a local declaration");
 
   const Decl *Canon = D->getCanonicalDecl();
@@ -1569,7 +1603,7 @@
     assert(isRedeclarableDeclKind(DAsT->getKind()) &&
            "Not considered redeclarable?");
 
-    Writer.AddDeclRef(First, Record);
+    Record.AddDeclRef(First);
 
     // Write out a list of local redeclarations of this declaration if it's the
     // first local declaration in the chain.
@@ -1587,23 +1621,22 @@
 
       // Collect the set of local redeclarations of this declaration, from
       // newest to oldest.
-      RecordData LocalRedecls;
+      ASTWriter::RecordData LocalRedecls;
+      ASTRecordWriter LocalRedeclWriter(Record, LocalRedecls);
       for (const Decl *Prev = FirstLocal->getMostRecentDecl();
            Prev != FirstLocal; Prev = Prev->getPreviousDecl())
         if (!Prev->isFromASTFile())
-          Writer.AddDeclRef(Prev, LocalRedecls);
+          LocalRedeclWriter.AddDeclRef(Prev);
 
       // If we have any redecls, write them now as a separate record preceding
       // the declaration itself.
       if (LocalRedecls.empty())
         Record.push_back(0);
-      else {
-        Record.push_back(Writer.Stream.GetCurrentBitNo());
-        Writer.Stream.EmitRecord(LOCAL_REDECLARATIONS, LocalRedecls);
-      }
+      else
+        Record.AddOffset(LocalRedeclWriter.Emit(LOCAL_REDECLARATIONS));
     } else {
       Record.push_back(0);
-      Writer.AddDeclRef(FirstLocal, Record);
+      Record.AddDeclRef(FirstLocal);
     }
 
     // Make sure that we serialize both the previous and the most-recent 
@@ -1624,10 +1657,24 @@
   Record.push_back(D->varlist_size());
   VisitDecl(D);
   for (auto *I : D->varlists())
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
   Code = serialization::DECL_OMP_THREADPRIVATE;
 }
 
+void ASTDeclWriter::VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D) {
+  VisitValueDecl(D);
+  Record.AddSourceLocation(D->getLocStart());
+  Record.AddStmt(D->getCombiner());
+  Record.AddStmt(D->getInitializer());
+  Record.AddDeclRef(D->getPrevDeclInScope());
+  Code = serialization::DECL_OMP_DECLARE_REDUCTION;
+}
+
+void ASTDeclWriter::VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D) {
+  VisitVarDecl(D);
+  Code = serialization::DECL_OMP_CAPTUREDEXPR;
+}
+
 //===----------------------------------------------------------------------===//
 // ASTWriter Implementation
 //===----------------------------------------------------------------------===//
@@ -1909,6 +1956,8 @@
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isNRVOVariable
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isCXXForRangeDecl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isARCPseudoStrong
+  Abv->Add(BitCodeAbbrevOp(0));                         // isInline
+  Abv->Add(BitCodeAbbrevOp(0));                         // isInlineSpecified
   Abv->Add(BitCodeAbbrevOp(0));                         // isConstexpr
   Abv->Add(BitCodeAbbrevOp(0));                         // isInitCapture
   Abv->Add(BitCodeAbbrevOp(0));                         // isPrevDeclInSameScope
@@ -2084,26 +2133,22 @@
   // An ObjCMethodDecl is never considered as "required" because its
   // implementation container always is.
 
-  // File scoped assembly or obj-c implementation must be seen.
-  if (isa<FileScopeAsmDecl>(D) || isa<ObjCImplDecl>(D))
+  // File scoped assembly or obj-c or OMP declare target implementation must be
+  // seen.
+  if (isa<FileScopeAsmDecl>(D) || isa<ObjCImplDecl>(D) ||
+      D->hasAttr<OMPDeclareTargetDeclAttr>())
     return true;
 
-  // ImportDecl is used by codegen to determine the set of imported modules to
-  // search for inputs for automatic linking; include it if it has a semantic
-  // effect.
-  if (isa<ImportDecl>(D) && !WritingModule)
-    return true;
+  if (WritingModule && (isa<VarDecl>(D) || isa<ImportDecl>(D))) {
+    // These declarations are part of the module initializer, and are emitted
+    // if and when the module is imported, rather than being emitted eagerly.
+    return false;
+  }
 
   return Context.DeclMustBeEmitted(D);
 }
 
 void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
-  // Switch case IDs are per Decl.
-  ClearSwitchCaseIDs();
-
-  RecordData Record;
-  ASTDeclWriter W(*this, Context, Record);
-
   // Determine the ID for this declaration.
   serialization::DeclID ID;
   assert(!D->isFromASTFile() && "should not be emitting imported decl");
@@ -2113,66 +2158,34 @@
     
   ID = IDR;
 
-  bool isReplacingADecl = ID < FirstDeclID;
-
-  // If this declaration is also a DeclContext, write blocks for the
-  // declarations that lexically stored inside its context and those
-  // declarations that are visible from its context. These blocks
-  // are written before the declaration itself so that we can put
-  // their offsets into the record for the declaration.
-  uint64_t LexicalOffset = 0;
-  uint64_t VisibleOffset = 0;
-  DeclContext *DC = dyn_cast<DeclContext>(D);
-  if (DC) {
-    if (isReplacingADecl) {
-      // It is replacing a decl from a chained PCH; make sure that the
-      // DeclContext is fully loaded.
-      if (DC->hasExternalLexicalStorage())
-        DC->LoadLexicalDeclsFromExternalStorage();
-      if (DC->hasExternalVisibleStorage())
-        Chain->completeVisibleDeclsMap(DC);
-    }
-    LexicalOffset = WriteDeclContextLexicalBlock(Context, DC);
-    VisibleOffset = WriteDeclContextVisibleBlock(Context, DC);
-  }
+  assert(ID >= FirstDeclID && "invalid decl ID");
   
+  RecordData Record;
+  ASTDeclWriter W(*this, Context, Record);
+
   // Build a record for this declaration
-  Record.clear();
-  W.Code = (serialization::DeclCode)0;
-  W.AbbrevToUse = 0;
   W.Visit(D);
-  if (DC) W.VisitDeclContext(DC, LexicalOffset, VisibleOffset);
 
-  if (isReplacingADecl) {
-    // We're replacing a decl in a previous file.
-    ReplacedDecls.push_back(ReplacedDeclInfo(ID, Stream.GetCurrentBitNo(),
-                                             D->getLocation()));
+  // Emit this declaration to the bitstream.
+  uint64_t Offset = W.Emit(D);
+
+  // Record the offset for this declaration
+  SourceLocation Loc = D->getLocation();
+  unsigned Index = ID - FirstDeclID;
+  if (DeclOffsets.size() == Index)
+    DeclOffsets.push_back(DeclOffset(Loc, Offset));
+  else if (DeclOffsets.size() < Index) {
+    // FIXME: Can/should this happen?
+    DeclOffsets.resize(Index+1);
+    DeclOffsets[Index].setLocation(Loc);
+    DeclOffsets[Index].BitOffset = Offset;
   } else {
-    unsigned Index = ID - FirstDeclID;
-
-    // Record the offset for this declaration
-    SourceLocation Loc = D->getLocation();
-    if (DeclOffsets.size() == Index)
-      DeclOffsets.push_back(DeclOffset(Loc, Stream.GetCurrentBitNo()));
-    else if (DeclOffsets.size() < Index) {
-      DeclOffsets.resize(Index+1);
-      DeclOffsets[Index].setLocation(Loc);
-      DeclOffsets[Index].BitOffset = Stream.GetCurrentBitNo();
-    }
-
-    SourceManager &SM = Context.getSourceManager();
-    if (Loc.isValid() && SM.isLocalSourceLocation(Loc))
-      associateDeclWithFile(D, ID);
+    llvm_unreachable("declarations should be emitted in ID order");
   }
 
-  if (!W.Code)
-    llvm::report_fatal_error(StringRef("unexpected declaration kind '") +
-                            D->getDeclKindName() + "'");
-  Stream.EmitRecord(W.Code, Record, W.AbbrevToUse);
-
-  // Flush any expressions, base specifiers, and ctor initializers that
-  // were written as part of this declaration.
-  FlushPendingAfterDecl();
+  SourceManager &SM = Context.getSourceManager();
+  if (Loc.isValid() && SM.isLocalSourceLocation(Loc))
+    associateDeclWithFile(D, ID);
 
   // Note declarations that should be deserialized eagerly so that we can add
   // them to a record in the AST file later.
@@ -2180,10 +2193,16 @@
     EagerlyDeserializedDecls.push_back(ID);
 }
 
-void ASTWriter::AddFunctionDefinition(const FunctionDecl *FD,
-                                      RecordData &Record) {
-  ClearSwitchCaseIDs();
+void ASTRecordWriter::AddFunctionDefinition(const FunctionDecl *FD) {
+  // Switch case IDs are per function body.
+  Writer->ClearSwitchCaseIDs();
 
-  ASTDeclWriter W(*this, FD->getASTContext(), Record);
-  W.AddFunctionDefinition(FD);
+  assert(FD->doesThisDeclarationHaveABody());
+  if (auto *CD = dyn_cast<CXXConstructorDecl>(FD)) {
+    Record->push_back(CD->getNumCtorInitializers());
+    if (CD->getNumCtorInitializers())
+      AddCXXCtorInitializers(
+          llvm::makeArrayRef(CD->init_begin(), CD->init_end()));
+  }
+  AddStmt(FD->getBody());
 }
diff --git a/lib/Serialization/ASTWriterStmt.cpp b/lib/Serialization/ASTWriterStmt.cpp
index 7b5440b..19b1e20 100644
--- a/lib/Serialization/ASTWriterStmt.cpp
+++ b/lib/Serialization/ASTWriterStmt.cpp
@@ -29,16 +29,24 @@
 namespace clang {
 
   class ASTStmtWriter : public StmtVisitor<ASTStmtWriter, void> {
-    friend class OMPClauseWriter;
     ASTWriter &Writer;
-    ASTWriter::RecordData &Record;
+    ASTRecordWriter Record;
 
-  public:
     serialization::StmtCode Code;
     unsigned AbbrevToUse;
 
+  public:
     ASTStmtWriter(ASTWriter &Writer, ASTWriter::RecordData &Record)
-      : Writer(Writer), Record(Record) { }
+        : Writer(Writer), Record(Writer, Record),
+          Code(serialization::STMT_NULL_PTR), AbbrevToUse(0) {}
+
+    ASTStmtWriter(const ASTStmtWriter&) = delete;
+
+    uint64_t Emit() {
+      assert(Code != serialization::STMT_NULL_PTR &&
+             "unhandled sub-statement writing AST file");
+      return Record.EmitStmt(Code, AbbrevToUse);
+    }
 
     void AddTemplateKWAndArgsInfo(const ASTTemplateKWAndArgsInfo &ArgInfo,
                                   const TemplateArgumentLoc *Args);
@@ -52,11 +60,11 @@
 
 void ASTStmtWriter::AddTemplateKWAndArgsInfo(
     const ASTTemplateKWAndArgsInfo &ArgInfo, const TemplateArgumentLoc *Args) {
-  Writer.AddSourceLocation(ArgInfo.TemplateKWLoc, Record);
-  Writer.AddSourceLocation(ArgInfo.LAngleLoc, Record);
-  Writer.AddSourceLocation(ArgInfo.RAngleLoc, Record);
+  Record.AddSourceLocation(ArgInfo.TemplateKWLoc);
+  Record.AddSourceLocation(ArgInfo.LAngleLoc);
+  Record.AddSourceLocation(ArgInfo.RAngleLoc);
   for (unsigned i = 0; i != ArgInfo.NumTemplateArgs; ++i)
-    Writer.AddTemplateArgumentLoc(Args[i], Record);
+    Record.AddTemplateArgumentLoc(Args[i]);
 }
 
 void ASTStmtWriter::VisitStmt(Stmt *S) {
@@ -64,7 +72,7 @@
 
 void ASTStmtWriter::VisitNullStmt(NullStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getSemiLoc(), Record);
+  Record.AddSourceLocation(S->getSemiLoc());
   Record.push_back(S->HasLeadingEmptyMacro);
   Code = serialization::STMT_NULL;
 }
@@ -73,68 +81,71 @@
   VisitStmt(S);
   Record.push_back(S->size());
   for (auto *CS : S->body())
-    Writer.AddStmt(CS);
-  Writer.AddSourceLocation(S->getLBracLoc(), Record);
-  Writer.AddSourceLocation(S->getRBracLoc(), Record);
+    Record.AddStmt(CS);
+  Record.AddSourceLocation(S->getLBracLoc());
+  Record.AddSourceLocation(S->getRBracLoc());
   Code = serialization::STMT_COMPOUND;
 }
 
 void ASTStmtWriter::VisitSwitchCase(SwitchCase *S) {
   VisitStmt(S);
   Record.push_back(Writer.getSwitchCaseID(S));
-  Writer.AddSourceLocation(S->getKeywordLoc(), Record);
-  Writer.AddSourceLocation(S->getColonLoc(), Record);
+  Record.AddSourceLocation(S->getKeywordLoc());
+  Record.AddSourceLocation(S->getColonLoc());
 }
 
 void ASTStmtWriter::VisitCaseStmt(CaseStmt *S) {
   VisitSwitchCase(S);
-  Writer.AddStmt(S->getLHS());
-  Writer.AddStmt(S->getRHS());
-  Writer.AddStmt(S->getSubStmt());
-  Writer.AddSourceLocation(S->getEllipsisLoc(), Record);
+  Record.AddStmt(S->getLHS());
+  Record.AddStmt(S->getRHS());
+  Record.AddStmt(S->getSubStmt());
+  Record.AddSourceLocation(S->getEllipsisLoc());
   Code = serialization::STMT_CASE;
 }
 
 void ASTStmtWriter::VisitDefaultStmt(DefaultStmt *S) {
   VisitSwitchCase(S);
-  Writer.AddStmt(S->getSubStmt());
+  Record.AddStmt(S->getSubStmt());
   Code = serialization::STMT_DEFAULT;
 }
 
 void ASTStmtWriter::VisitLabelStmt(LabelStmt *S) {
   VisitStmt(S);
-  Writer.AddDeclRef(S->getDecl(), Record);
-  Writer.AddStmt(S->getSubStmt());
-  Writer.AddSourceLocation(S->getIdentLoc(), Record);
+  Record.AddDeclRef(S->getDecl());
+  Record.AddStmt(S->getSubStmt());
+  Record.AddSourceLocation(S->getIdentLoc());
   Code = serialization::STMT_LABEL;
 }
 
 void ASTStmtWriter::VisitAttributedStmt(AttributedStmt *S) {
   VisitStmt(S);
   Record.push_back(S->getAttrs().size());
-  Writer.WriteAttributes(S->getAttrs(), Record);
-  Writer.AddStmt(S->getSubStmt());
-  Writer.AddSourceLocation(S->getAttrLoc(), Record);
+  Record.AddAttributes(S->getAttrs());
+  Record.AddStmt(S->getSubStmt());
+  Record.AddSourceLocation(S->getAttrLoc());
   Code = serialization::STMT_ATTRIBUTED;
 }
 
 void ASTStmtWriter::VisitIfStmt(IfStmt *S) {
   VisitStmt(S);
-  Writer.AddDeclRef(S->getConditionVariable(), Record);
-  Writer.AddStmt(S->getCond());
-  Writer.AddStmt(S->getThen());
-  Writer.AddStmt(S->getElse());
-  Writer.AddSourceLocation(S->getIfLoc(), Record);
-  Writer.AddSourceLocation(S->getElseLoc(), Record);
+  Record.push_back(S->isConstexpr());
+  Record.AddStmt(S->getInit());
+  Record.AddDeclRef(S->getConditionVariable());
+  Record.AddStmt(S->getCond());
+  Record.AddStmt(S->getThen());
+  Record.AddStmt(S->getElse());
+  Record.AddSourceLocation(S->getIfLoc());
+  Record.AddSourceLocation(S->getElseLoc());
   Code = serialization::STMT_IF;
 }
 
 void ASTStmtWriter::VisitSwitchStmt(SwitchStmt *S) {
   VisitStmt(S);
-  Writer.AddDeclRef(S->getConditionVariable(), Record);
-  Writer.AddStmt(S->getCond());
-  Writer.AddStmt(S->getBody());
-  Writer.AddSourceLocation(S->getSwitchLoc(), Record);
+  Record.AddStmt(S->getInit());
+  Record.AddDeclRef(S->getConditionVariable());
+  Record.AddStmt(S->getCond());
+  Record.AddStmt(S->getBody());
+  Record.AddSourceLocation(S->getSwitchLoc());
   Record.push_back(S->isAllEnumCasesCovered());
   for (SwitchCase *SC = S->getSwitchCaseList(); SC;
        SC = SC->getNextSwitchCase())
@@ -144,79 +155,79 @@
 
 void ASTStmtWriter::VisitWhileStmt(WhileStmt *S) {
   VisitStmt(S);
-  Writer.AddDeclRef(S->getConditionVariable(), Record);
-  Writer.AddStmt(S->getCond());
-  Writer.AddStmt(S->getBody());
-  Writer.AddSourceLocation(S->getWhileLoc(), Record);
+  Record.AddDeclRef(S->getConditionVariable());
+  Record.AddStmt(S->getCond());
+  Record.AddStmt(S->getBody());
+  Record.AddSourceLocation(S->getWhileLoc());
   Code = serialization::STMT_WHILE;
 }
 
 void ASTStmtWriter::VisitDoStmt(DoStmt *S) {
   VisitStmt(S);
-  Writer.AddStmt(S->getCond());
-  Writer.AddStmt(S->getBody());
-  Writer.AddSourceLocation(S->getDoLoc(), Record);
-  Writer.AddSourceLocation(S->getWhileLoc(), Record);
-  Writer.AddSourceLocation(S->getRParenLoc(), Record);
+  Record.AddStmt(S->getCond());
+  Record.AddStmt(S->getBody());
+  Record.AddSourceLocation(S->getDoLoc());
+  Record.AddSourceLocation(S->getWhileLoc());
+  Record.AddSourceLocation(S->getRParenLoc());
   Code = serialization::STMT_DO;
 }
 
 void ASTStmtWriter::VisitForStmt(ForStmt *S) {
   VisitStmt(S);
-  Writer.AddStmt(S->getInit());
-  Writer.AddStmt(S->getCond());
-  Writer.AddDeclRef(S->getConditionVariable(), Record);
-  Writer.AddStmt(S->getInc());
-  Writer.AddStmt(S->getBody());
-  Writer.AddSourceLocation(S->getForLoc(), Record);
-  Writer.AddSourceLocation(S->getLParenLoc(), Record);
-  Writer.AddSourceLocation(S->getRParenLoc(), Record);
+  Record.AddStmt(S->getInit());
+  Record.AddStmt(S->getCond());
+  Record.AddDeclRef(S->getConditionVariable());
+  Record.AddStmt(S->getInc());
+  Record.AddStmt(S->getBody());
+  Record.AddSourceLocation(S->getForLoc());
+  Record.AddSourceLocation(S->getLParenLoc());
+  Record.AddSourceLocation(S->getRParenLoc());
   Code = serialization::STMT_FOR;
 }
 
 void ASTStmtWriter::VisitGotoStmt(GotoStmt *S) {
   VisitStmt(S);
-  Writer.AddDeclRef(S->getLabel(), Record);
-  Writer.AddSourceLocation(S->getGotoLoc(), Record);
-  Writer.AddSourceLocation(S->getLabelLoc(), Record);
+  Record.AddDeclRef(S->getLabel());
+  Record.AddSourceLocation(S->getGotoLoc());
+  Record.AddSourceLocation(S->getLabelLoc());
   Code = serialization::STMT_GOTO;
 }
 
 void ASTStmtWriter::VisitIndirectGotoStmt(IndirectGotoStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getGotoLoc(), Record);
-  Writer.AddSourceLocation(S->getStarLoc(), Record);
-  Writer.AddStmt(S->getTarget());
+  Record.AddSourceLocation(S->getGotoLoc());
+  Record.AddSourceLocation(S->getStarLoc());
+  Record.AddStmt(S->getTarget());
   Code = serialization::STMT_INDIRECT_GOTO;
 }
 
 void ASTStmtWriter::VisitContinueStmt(ContinueStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getContinueLoc(), Record);
+  Record.AddSourceLocation(S->getContinueLoc());
   Code = serialization::STMT_CONTINUE;
 }
 
 void ASTStmtWriter::VisitBreakStmt(BreakStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getBreakLoc(), Record);
+  Record.AddSourceLocation(S->getBreakLoc());
   Code = serialization::STMT_BREAK;
 }
 
 void ASTStmtWriter::VisitReturnStmt(ReturnStmt *S) {
   VisitStmt(S);
-  Writer.AddStmt(S->getRetValue());
-  Writer.AddSourceLocation(S->getReturnLoc(), Record);
-  Writer.AddDeclRef(S->getNRVOCandidate(), Record);
+  Record.AddStmt(S->getRetValue());
+  Record.AddSourceLocation(S->getReturnLoc());
+  Record.AddDeclRef(S->getNRVOCandidate());
   Code = serialization::STMT_RETURN;
 }
 
 void ASTStmtWriter::VisitDeclStmt(DeclStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getStartLoc(), Record);
-  Writer.AddSourceLocation(S->getEndLoc(), Record);
+  Record.AddSourceLocation(S->getStartLoc());
+  Record.AddSourceLocation(S->getEndLoc());
   DeclGroupRef DG = S->getDeclGroup();
   for (DeclGroupRef::iterator D = DG.begin(), DEnd = DG.end(); D != DEnd; ++D)
-    Writer.AddDeclRef(*D, Record);
+    Record.AddDeclRef(*D);
   Code = serialization::STMT_DECL;
 }
 
@@ -225,64 +236,65 @@
   Record.push_back(S->getNumOutputs());
   Record.push_back(S->getNumInputs());
   Record.push_back(S->getNumClobbers());
-  Writer.AddSourceLocation(S->getAsmLoc(), Record);
+  Record.AddSourceLocation(S->getAsmLoc());
   Record.push_back(S->isVolatile());
   Record.push_back(S->isSimple());
 }
 
 void ASTStmtWriter::VisitGCCAsmStmt(GCCAsmStmt *S) {
   VisitAsmStmt(S);
-  Writer.AddSourceLocation(S->getRParenLoc(), Record);
-  Writer.AddStmt(S->getAsmString());
+  Record.AddSourceLocation(S->getRParenLoc());
+  Record.AddStmt(S->getAsmString());
 
   // Outputs
   for (unsigned I = 0, N = S->getNumOutputs(); I != N; ++I) {      
-    Writer.AddIdentifierRef(S->getOutputIdentifier(I), Record);
-    Writer.AddStmt(S->getOutputConstraintLiteral(I));
-    Writer.AddStmt(S->getOutputExpr(I));
+    Record.AddIdentifierRef(S->getOutputIdentifier(I));
+    Record.AddStmt(S->getOutputConstraintLiteral(I));
+    Record.AddStmt(S->getOutputExpr(I));
   }
 
   // Inputs
   for (unsigned I = 0, N = S->getNumInputs(); I != N; ++I) {
-    Writer.AddIdentifierRef(S->getInputIdentifier(I), Record);
-    Writer.AddStmt(S->getInputConstraintLiteral(I));
-    Writer.AddStmt(S->getInputExpr(I));
+    Record.AddIdentifierRef(S->getInputIdentifier(I));
+    Record.AddStmt(S->getInputConstraintLiteral(I));
+    Record.AddStmt(S->getInputExpr(I));
   }
 
   // Clobbers
   for (unsigned I = 0, N = S->getNumClobbers(); I != N; ++I)
-    Writer.AddStmt(S->getClobberStringLiteral(I));
+    Record.AddStmt(S->getClobberStringLiteral(I));
 
   Code = serialization::STMT_GCCASM;
 }
 
 void ASTStmtWriter::VisitMSAsmStmt(MSAsmStmt *S) {
   VisitAsmStmt(S);
-  Writer.AddSourceLocation(S->getLBraceLoc(), Record);
-  Writer.AddSourceLocation(S->getEndLoc(), Record);
+  Record.AddSourceLocation(S->getLBraceLoc());
+  Record.AddSourceLocation(S->getEndLoc());
   Record.push_back(S->getNumAsmToks());
-  Writer.AddString(S->getAsmString(), Record);
+  Record.AddString(S->getAsmString());
 
   // Tokens
   for (unsigned I = 0, N = S->getNumAsmToks(); I != N; ++I) {
-    Writer.AddToken(S->getAsmToks()[I], Record);
+    // FIXME: Move this to ASTRecordWriter?
+    Writer.AddToken(S->getAsmToks()[I], Record.getRecordData());
   }
 
   // Clobbers
   for (unsigned I = 0, N = S->getNumClobbers(); I != N; ++I) {
-    Writer.AddString(S->getClobber(I), Record);
+    Record.AddString(S->getClobber(I));
   }
 
   // Outputs
   for (unsigned I = 0, N = S->getNumOutputs(); I != N; ++I) {      
-    Writer.AddStmt(S->getOutputExpr(I));
-    Writer.AddString(S->getOutputConstraint(I), Record);
+    Record.AddStmt(S->getOutputExpr(I));
+    Record.AddString(S->getOutputConstraint(I));
   }
 
   // Inputs
   for (unsigned I = 0, N = S->getNumInputs(); I != N; ++I) {
-    Writer.AddStmt(S->getInputExpr(I));
-    Writer.AddString(S->getInputConstraint(I), Record);
+    Record.AddStmt(S->getInputExpr(I));
+    Record.AddString(S->getInputConstraint(I));
   }
 
   Code = serialization::STMT_MSASM;
@@ -314,26 +326,26 @@
   Record.push_back(std::distance(S->capture_begin(), S->capture_end()));
 
   // CapturedDecl and captured region kind
-  Writer.AddDeclRef(S->getCapturedDecl(), Record);
+  Record.AddDeclRef(S->getCapturedDecl());
   Record.push_back(S->getCapturedRegionKind());
 
-  Writer.AddDeclRef(S->getCapturedRecordDecl(), Record);
+  Record.AddDeclRef(S->getCapturedRecordDecl());
 
   // Capture inits
   for (auto *I : S->capture_inits())
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
 
   // Body
-  Writer.AddStmt(S->getCapturedStmt());
+  Record.AddStmt(S->getCapturedStmt());
 
   // Captures
   for (const auto &I : S->captures()) {
     if (I.capturesThis() || I.capturesVariableArrayType())
-      Writer.AddDeclRef(nullptr, Record);
+      Record.AddDeclRef(nullptr);
     else
-      Writer.AddDeclRef(I.getCapturedVar(), Record);
+      Record.AddDeclRef(I.getCapturedVar());
     Record.push_back(I.getCaptureKind());
-    Writer.AddSourceLocation(I.getLocation(), Record);
+    Record.AddSourceLocation(I.getLocation());
   }
 
   Code = serialization::STMT_CAPTURED;
@@ -341,7 +353,7 @@
 
 void ASTStmtWriter::VisitExpr(Expr *E) {
   VisitStmt(E);
-  Writer.AddTypeRef(E->getType(), Record);
+  Record.AddTypeRef(E->getType());
   Record.push_back(E->isTypeDependent());
   Record.push_back(E->isValueDependent());
   Record.push_back(E->isInstantiationDependent());
@@ -352,9 +364,9 @@
 
 void ASTStmtWriter::VisitPredefinedExpr(PredefinedExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
   Record.push_back(E->getIdentType()); // FIXME: stable encoding
-  Writer.AddStmt(E->getFunctionName());
+  Record.AddStmt(E->getFunctionName());
   Code = serialization::EXPR_PREDEFINED;
 }
 
@@ -381,25 +393,25 @@
   }
 
   if (E->hasQualifier())
-    Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
+    Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
 
   if (E->getDecl() != E->getFoundDecl())
-    Writer.AddDeclRef(E->getFoundDecl(), Record);
+    Record.AddDeclRef(E->getFoundDecl());
 
   if (E->hasTemplateKWAndArgsInfo())
     AddTemplateKWAndArgsInfo(*E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(),
                              E->getTrailingObjects<TemplateArgumentLoc>());
 
-  Writer.AddDeclRef(E->getDecl(), Record);
-  Writer.AddSourceLocation(E->getLocation(), Record);
-  Writer.AddDeclarationNameLoc(E->DNLoc, E->getDecl()->getDeclName(), Record);
+  Record.AddDeclRef(E->getDecl());
+  Record.AddSourceLocation(E->getLocation());
+  Record.AddDeclarationNameLoc(E->DNLoc, E->getDecl()->getDeclName());
   Code = serialization::EXPR_DECL_REF;
 }
 
 void ASTStmtWriter::VisitIntegerLiteral(IntegerLiteral *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getLocation(), Record);
-  Writer.AddAPInt(E->getValue(), Record);
+  Record.AddSourceLocation(E->getLocation());
+  Record.AddAPInt(E->getValue());
 
   if (E->getValue().getBitWidth() == 32) {
     AbbrevToUse = Writer.getIntegerLiteralAbbrev();
@@ -412,14 +424,14 @@
   VisitExpr(E);
   Record.push_back(E->getRawSemantics());
   Record.push_back(E->isExact());
-  Writer.AddAPFloat(E->getValue(), Record);
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddAPFloat(E->getValue());
+  Record.AddSourceLocation(E->getLocation());
   Code = serialization::EXPR_FLOATING_LITERAL;
 }
 
 void ASTStmtWriter::VisitImaginaryLiteral(ImaginaryLiteral *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddStmt(E->getSubExpr());
   Code = serialization::EXPR_IMAGINARY_LITERAL;
 }
 
@@ -435,14 +447,14 @@
   // the AST file during deserialization.
   Record.append(E->getBytes().begin(), E->getBytes().end());
   for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I)
-    Writer.AddSourceLocation(E->getStrTokenLoc(I), Record);
+    Record.AddSourceLocation(E->getStrTokenLoc(I));
   Code = serialization::EXPR_STRING_LITERAL;
 }
 
 void ASTStmtWriter::VisitCharacterLiteral(CharacterLiteral *E) {
   VisitExpr(E);
   Record.push_back(E->getValue());
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
   Record.push_back(E->getKind());
 
   AbbrevToUse = Writer.getCharacterLiteralAbbrev();
@@ -452,9 +464,9 @@
 
 void ASTStmtWriter::VisitParenExpr(ParenExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getLParen(), Record);
-  Writer.AddSourceLocation(E->getRParen(), Record);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddSourceLocation(E->getLParen());
+  Record.AddSourceLocation(E->getRParen());
+  Record.AddStmt(E->getSubExpr());
   Code = serialization::EXPR_PAREN;
 }
 
@@ -462,17 +474,17 @@
   VisitExpr(E);
   Record.push_back(E->NumExprs);
   for (unsigned i=0; i != E->NumExprs; ++i)
-    Writer.AddStmt(E->Exprs[i]);
-  Writer.AddSourceLocation(E->LParenLoc, Record);
-  Writer.AddSourceLocation(E->RParenLoc, Record);
+    Record.AddStmt(E->Exprs[i]);
+  Record.AddSourceLocation(E->LParenLoc);
+  Record.AddSourceLocation(E->RParenLoc);
   Code = serialization::EXPR_PAREN_LIST;
 }
 
 void ASTStmtWriter::VisitUnaryOperator(UnaryOperator *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddStmt(E->getSubExpr());
   Record.push_back(E->getOpcode()); // FIXME: stable encoding
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
   Code = serialization::EXPR_UNARY_OPERATOR;
 }
 
@@ -480,34 +492,34 @@
   VisitExpr(E);
   Record.push_back(E->getNumComponents());
   Record.push_back(E->getNumExpressions());
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
-  Writer.AddTypeSourceInfo(E->getTypeSourceInfo(), Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
+  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
   for (unsigned I = 0, N = E->getNumComponents(); I != N; ++I) {
     const OffsetOfNode &ON = E->getComponent(I);
     Record.push_back(ON.getKind()); // FIXME: Stable encoding
-    Writer.AddSourceLocation(ON.getSourceRange().getBegin(), Record);
-    Writer.AddSourceLocation(ON.getSourceRange().getEnd(), Record);
+    Record.AddSourceLocation(ON.getSourceRange().getBegin());
+    Record.AddSourceLocation(ON.getSourceRange().getEnd());
     switch (ON.getKind()) {
     case OffsetOfNode::Array:
       Record.push_back(ON.getArrayExprIndex());
       break;
 
     case OffsetOfNode::Field:
-      Writer.AddDeclRef(ON.getField(), Record);
+      Record.AddDeclRef(ON.getField());
       break;
 
     case OffsetOfNode::Identifier:
-      Writer.AddIdentifierRef(ON.getFieldName(), Record);
+      Record.AddIdentifierRef(ON.getFieldName());
       break;
 
     case OffsetOfNode::Base:
-      Writer.AddCXXBaseSpecifier(*ON.getBase(), Record);
+      Record.AddCXXBaseSpecifier(*ON.getBase());
       break;
     }
   }
   for (unsigned I = 0, N = E->getNumExpressions(); I != N; ++I)
-    Writer.AddStmt(E->getIndexExpr(I));
+    Record.AddStmt(E->getIndexExpr(I));
   Code = serialization::EXPR_OFFSETOF;
 }
 
@@ -515,42 +527,42 @@
   VisitExpr(E);
   Record.push_back(E->getKind());
   if (E->isArgumentType())
-    Writer.AddTypeSourceInfo(E->getArgumentTypeInfo(), Record);
+    Record.AddTypeSourceInfo(E->getArgumentTypeInfo());
   else {
     Record.push_back(0);
-    Writer.AddStmt(E->getArgumentExpr());
+    Record.AddStmt(E->getArgumentExpr());
   }
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_SIZEOF_ALIGN_OF;
 }
 
 void ASTStmtWriter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getLHS());
-  Writer.AddStmt(E->getRHS());
-  Writer.AddSourceLocation(E->getRBracketLoc(), Record);
+  Record.AddStmt(E->getLHS());
+  Record.AddStmt(E->getRHS());
+  Record.AddSourceLocation(E->getRBracketLoc());
   Code = serialization::EXPR_ARRAY_SUBSCRIPT;
 }
 
 void ASTStmtWriter::VisitOMPArraySectionExpr(OMPArraySectionExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getBase());
-  Writer.AddStmt(E->getLowerBound());
-  Writer.AddStmt(E->getLength());
-  Writer.AddSourceLocation(E->getColonLoc(), Record);
-  Writer.AddSourceLocation(E->getRBracketLoc(), Record);
+  Record.AddStmt(E->getBase());
+  Record.AddStmt(E->getLowerBound());
+  Record.AddStmt(E->getLength());
+  Record.AddSourceLocation(E->getColonLoc());
+  Record.AddSourceLocation(E->getRBracketLoc());
   Code = serialization::EXPR_OMP_ARRAY_SECTION;
 }
 
 void ASTStmtWriter::VisitCallExpr(CallExpr *E) {
   VisitExpr(E);
   Record.push_back(E->getNumArgs());
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
-  Writer.AddStmt(E->getCallee());
+  Record.AddSourceLocation(E->getRParenLoc());
+  Record.AddStmt(E->getCallee());
   for (CallExpr::arg_iterator Arg = E->arg_begin(), ArgEnd = E->arg_end();
        Arg != ArgEnd; ++Arg)
-    Writer.AddStmt(*Arg);
+    Record.AddStmt(*Arg);
   Code = serialization::EXPR_CALL;
 }
 
@@ -559,43 +571,43 @@
 
   Record.push_back(E->hasQualifier());
   if (E->hasQualifier())
-    Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
+    Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
 
   Record.push_back(E->HasTemplateKWAndArgsInfo);
   if (E->HasTemplateKWAndArgsInfo) {
-    Writer.AddSourceLocation(E->getTemplateKeywordLoc(), Record);
+    Record.AddSourceLocation(E->getTemplateKeywordLoc());
     unsigned NumTemplateArgs = E->getNumTemplateArgs();
     Record.push_back(NumTemplateArgs);
-    Writer.AddSourceLocation(E->getLAngleLoc(), Record);
-    Writer.AddSourceLocation(E->getRAngleLoc(), Record);
+    Record.AddSourceLocation(E->getLAngleLoc());
+    Record.AddSourceLocation(E->getRAngleLoc());
     for (unsigned i=0; i != NumTemplateArgs; ++i)
-      Writer.AddTemplateArgumentLoc(E->getTemplateArgs()[i], Record);
+      Record.AddTemplateArgumentLoc(E->getTemplateArgs()[i]);
   }
 
   Record.push_back(E->hadMultipleCandidates());
 
   DeclAccessPair FoundDecl = E->getFoundDecl();
-  Writer.AddDeclRef(FoundDecl.getDecl(), Record);
+  Record.AddDeclRef(FoundDecl.getDecl());
   Record.push_back(FoundDecl.getAccess());
 
-  Writer.AddTypeRef(E->getType(), Record);
+  Record.AddTypeRef(E->getType());
   Record.push_back(E->getValueKind());
   Record.push_back(E->getObjectKind());
-  Writer.AddStmt(E->getBase());
-  Writer.AddDeclRef(E->getMemberDecl(), Record);
-  Writer.AddSourceLocation(E->getMemberLoc(), Record);
+  Record.AddStmt(E->getBase());
+  Record.AddDeclRef(E->getMemberDecl());
+  Record.AddSourceLocation(E->getMemberLoc());
   Record.push_back(E->isArrow());
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
-  Writer.AddDeclarationNameLoc(E->MemberDNLoc,
-                               E->getMemberDecl()->getDeclName(), Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
+  Record.AddDeclarationNameLoc(E->MemberDNLoc,
+                               E->getMemberDecl()->getDeclName());
   Code = serialization::EXPR_MEMBER;
 }
 
 void ASTStmtWriter::VisitObjCIsaExpr(ObjCIsaExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getBase());
-  Writer.AddSourceLocation(E->getIsaMemberLoc(), Record);
-  Writer.AddSourceLocation(E->getOpLoc(), Record);
+  Record.AddStmt(E->getBase());
+  Record.AddSourceLocation(E->getIsaMemberLoc());
+  Record.AddSourceLocation(E->getOpLoc());
   Record.push_back(E->isArrow());
   Code = serialization::EXPR_OBJC_ISA;
 }
@@ -603,15 +615,15 @@
 void ASTStmtWriter::
 VisitObjCIndirectCopyRestoreExpr(ObjCIndirectCopyRestoreExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddStmt(E->getSubExpr());
   Record.push_back(E->shouldCopy());
   Code = serialization::EXPR_OBJC_INDIRECT_COPY_RESTORE;
 }
 
 void ASTStmtWriter::VisitObjCBridgedCastExpr(ObjCBridgedCastExpr *E) {
   VisitExplicitCastExpr(E);
-  Writer.AddSourceLocation(E->getLParenLoc(), Record);
-  Writer.AddSourceLocation(E->getBridgeKeywordLoc(), Record);
+  Record.AddSourceLocation(E->getLParenLoc());
+  Record.AddSourceLocation(E->getBridgeKeywordLoc());
   Record.push_back(E->getBridgeKind()); // FIXME: Stable encoding
   Code = serialization::EXPR_OBJC_BRIDGED_CAST;
 }
@@ -619,51 +631,51 @@
 void ASTStmtWriter::VisitCastExpr(CastExpr *E) {
   VisitExpr(E);
   Record.push_back(E->path_size());
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddStmt(E->getSubExpr());
   Record.push_back(E->getCastKind()); // FIXME: stable encoding
 
   for (CastExpr::path_iterator
          PI = E->path_begin(), PE = E->path_end(); PI != PE; ++PI)
-    Writer.AddCXXBaseSpecifier(**PI, Record);
+    Record.AddCXXBaseSpecifier(**PI);
 }
 
 void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getLHS());
-  Writer.AddStmt(E->getRHS());
+  Record.AddStmt(E->getLHS());
+  Record.AddStmt(E->getRHS());
   Record.push_back(E->getOpcode()); // FIXME: stable encoding
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
   Record.push_back(E->isFPContractable());
   Code = serialization::EXPR_BINARY_OPERATOR;
 }
 
 void ASTStmtWriter::VisitCompoundAssignOperator(CompoundAssignOperator *E) {
   VisitBinaryOperator(E);
-  Writer.AddTypeRef(E->getComputationLHSType(), Record);
-  Writer.AddTypeRef(E->getComputationResultType(), Record);
+  Record.AddTypeRef(E->getComputationLHSType());
+  Record.AddTypeRef(E->getComputationResultType());
   Code = serialization::EXPR_COMPOUND_ASSIGN_OPERATOR;
 }
 
 void ASTStmtWriter::VisitConditionalOperator(ConditionalOperator *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getCond());
-  Writer.AddStmt(E->getLHS());
-  Writer.AddStmt(E->getRHS());
-  Writer.AddSourceLocation(E->getQuestionLoc(), Record);
-  Writer.AddSourceLocation(E->getColonLoc(), Record);
+  Record.AddStmt(E->getCond());
+  Record.AddStmt(E->getLHS());
+  Record.AddStmt(E->getRHS());
+  Record.AddSourceLocation(E->getQuestionLoc());
+  Record.AddSourceLocation(E->getColonLoc());
   Code = serialization::EXPR_CONDITIONAL_OPERATOR;
 }
 
 void
 ASTStmtWriter::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getOpaqueValue());
-  Writer.AddStmt(E->getCommon());
-  Writer.AddStmt(E->getCond());
-  Writer.AddStmt(E->getTrueExpr());
-  Writer.AddStmt(E->getFalseExpr());
-  Writer.AddSourceLocation(E->getQuestionLoc(), Record);
-  Writer.AddSourceLocation(E->getColonLoc(), Record);
+  Record.AddStmt(E->getOpaqueValue());
+  Record.AddStmt(E->getCommon());
+  Record.AddStmt(E->getCond());
+  Record.AddStmt(E->getTrueExpr());
+  Record.AddStmt(E->getFalseExpr());
+  Record.AddSourceLocation(E->getQuestionLoc());
+  Record.AddSourceLocation(E->getColonLoc());
   Code = serialization::EXPR_BINARY_CONDITIONAL_OPERATOR;
 }
 
@@ -678,30 +690,30 @@
 
 void ASTStmtWriter::VisitExplicitCastExpr(ExplicitCastExpr *E) {
   VisitCastExpr(E);
-  Writer.AddTypeSourceInfo(E->getTypeInfoAsWritten(), Record);
+  Record.AddTypeSourceInfo(E->getTypeInfoAsWritten());
 }
 
 void ASTStmtWriter::VisitCStyleCastExpr(CStyleCastExpr *E) {
   VisitExplicitCastExpr(E);
-  Writer.AddSourceLocation(E->getLParenLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddSourceLocation(E->getLParenLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_CSTYLE_CAST;
 }
 
 void ASTStmtWriter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getLParenLoc(), Record);
-  Writer.AddTypeSourceInfo(E->getTypeSourceInfo(), Record);
-  Writer.AddStmt(E->getInitializer());
+  Record.AddSourceLocation(E->getLParenLoc());
+  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
+  Record.AddStmt(E->getInitializer());
   Record.push_back(E->isFileScope());
   Code = serialization::EXPR_COMPOUND_LITERAL;
 }
 
 void ASTStmtWriter::VisitExtVectorElementExpr(ExtVectorElementExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getBase());
-  Writer.AddIdentifierRef(&E->getAccessor(), Record);
-  Writer.AddSourceLocation(E->getAccessorLoc(), Record);
+  Record.AddStmt(E->getBase());
+  Record.AddIdentifierRef(&E->getAccessor());
+  Record.AddSourceLocation(E->getAccessorLoc());
   Code = serialization::EXPR_EXT_VECTOR_ELEMENT;
 }
 
@@ -709,15 +721,15 @@
   VisitExpr(E);
   // NOTE: only add the (possibly null) syntactic form.
   // No need to serialize the isSemanticForm flag and the semantic form.
-  Writer.AddStmt(E->getSyntacticForm());
-  Writer.AddSourceLocation(E->getLBraceLoc(), Record);
-  Writer.AddSourceLocation(E->getRBraceLoc(), Record);
+  Record.AddStmt(E->getSyntacticForm());
+  Record.AddSourceLocation(E->getLBraceLoc());
+  Record.AddSourceLocation(E->getRBraceLoc());
   bool isArrayFiller = E->ArrayFillerOrUnionFieldInit.is<Expr*>();
   Record.push_back(isArrayFiller);
   if (isArrayFiller)
-    Writer.AddStmt(E->getArrayFiller());
+    Record.AddStmt(E->getArrayFiller());
   else
-    Writer.AddDeclRef(E->getInitializedFieldInUnion(), Record);
+    Record.AddDeclRef(E->getInitializedFieldInUnion());
   Record.push_back(E->hadArrayRangeDesignator());
   Record.push_back(E->getNumInits());
   if (isArrayFiller) {
@@ -725,10 +737,10 @@
     // Replace them by 0 to indicate that the filler goes in that place.
     Expr *filler = E->getArrayFiller();
     for (unsigned I = 0, N = E->getNumInits(); I != N; ++I)
-      Writer.AddStmt(E->getInit(I) != filler ? E->getInit(I) : nullptr);
+      Record.AddStmt(E->getInit(I) != filler ? E->getInit(I) : nullptr);
   } else {
     for (unsigned I = 0, N = E->getNumInits(); I != N; ++I)
-      Writer.AddStmt(E->getInit(I));
+      Record.AddStmt(E->getInit(I));
   }
   Code = serialization::EXPR_INIT_LIST;
 }
@@ -737,34 +749,32 @@
   VisitExpr(E);
   Record.push_back(E->getNumSubExprs());
   for (unsigned I = 0, N = E->getNumSubExprs(); I != N; ++I)
-    Writer.AddStmt(E->getSubExpr(I));
-  Writer.AddSourceLocation(E->getEqualOrColonLoc(), Record);
+    Record.AddStmt(E->getSubExpr(I));
+  Record.AddSourceLocation(E->getEqualOrColonLoc());
   Record.push_back(E->usesGNUSyntax());
-  for (DesignatedInitExpr::designators_iterator D = E->designators_begin(),
-                                             DEnd = E->designators_end();
-       D != DEnd; ++D) {
-    if (D->isFieldDesignator()) {
-      if (FieldDecl *Field = D->getField()) {
+  for (const DesignatedInitExpr::Designator &D : E->designators()) {
+    if (D.isFieldDesignator()) {
+      if (FieldDecl *Field = D.getField()) {
         Record.push_back(serialization::DESIG_FIELD_DECL);
-        Writer.AddDeclRef(Field, Record);
+        Record.AddDeclRef(Field);
       } else {
         Record.push_back(serialization::DESIG_FIELD_NAME);
-        Writer.AddIdentifierRef(D->getFieldName(), Record);
+        Record.AddIdentifierRef(D.getFieldName());
       }
-      Writer.AddSourceLocation(D->getDotLoc(), Record);
-      Writer.AddSourceLocation(D->getFieldLoc(), Record);
-    } else if (D->isArrayDesignator()) {
+      Record.AddSourceLocation(D.getDotLoc());
+      Record.AddSourceLocation(D.getFieldLoc());
+    } else if (D.isArrayDesignator()) {
       Record.push_back(serialization::DESIG_ARRAY);
-      Record.push_back(D->getFirstExprIndex());
-      Writer.AddSourceLocation(D->getLBracketLoc(), Record);
-      Writer.AddSourceLocation(D->getRBracketLoc(), Record);
+      Record.push_back(D.getFirstExprIndex());
+      Record.AddSourceLocation(D.getLBracketLoc());
+      Record.AddSourceLocation(D.getRBracketLoc());
     } else {
-      assert(D->isArrayRangeDesignator() && "Unknown designator");
+      assert(D.isArrayRangeDesignator() && "Unknown designator");
       Record.push_back(serialization::DESIG_ARRAY_RANGE);
-      Record.push_back(D->getFirstExprIndex());
-      Writer.AddSourceLocation(D->getLBracketLoc(), Record);
-      Writer.AddSourceLocation(D->getEllipsisLoc(), Record);
-      Writer.AddSourceLocation(D->getRBracketLoc(), Record);
+      Record.push_back(D.getFirstExprIndex());
+      Record.AddSourceLocation(D.getLBracketLoc());
+      Record.AddSourceLocation(D.getEllipsisLoc());
+      Record.AddSourceLocation(D.getRBracketLoc());
     }
   }
   Code = serialization::EXPR_DESIGNATED_INIT;
@@ -772,8 +782,8 @@
 
 void ASTStmtWriter::VisitDesignatedInitUpdateExpr(DesignatedInitUpdateExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getBase());
-  Writer.AddStmt(E->getUpdater());
+  Record.AddStmt(E->getBase());
+  Record.AddStmt(E->getUpdater());
   Code = serialization::EXPR_DESIGNATED_INIT_UPDATE;
 }
 
@@ -789,44 +799,44 @@
 
 void ASTStmtWriter::VisitVAArgExpr(VAArgExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubExpr());
-  Writer.AddTypeSourceInfo(E->getWrittenTypeInfo(), Record);
-  Writer.AddSourceLocation(E->getBuiltinLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddStmt(E->getSubExpr());
+  Record.AddTypeSourceInfo(E->getWrittenTypeInfo());
+  Record.AddSourceLocation(E->getBuiltinLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Record.push_back(E->isMicrosoftABI());
   Code = serialization::EXPR_VA_ARG;
 }
 
 void ASTStmtWriter::VisitAddrLabelExpr(AddrLabelExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getAmpAmpLoc(), Record);
-  Writer.AddSourceLocation(E->getLabelLoc(), Record);
-  Writer.AddDeclRef(E->getLabel(), Record);
+  Record.AddSourceLocation(E->getAmpAmpLoc());
+  Record.AddSourceLocation(E->getLabelLoc());
+  Record.AddDeclRef(E->getLabel());
   Code = serialization::EXPR_ADDR_LABEL;
 }
 
 void ASTStmtWriter::VisitStmtExpr(StmtExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubStmt());
-  Writer.AddSourceLocation(E->getLParenLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddStmt(E->getSubStmt());
+  Record.AddSourceLocation(E->getLParenLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_STMT;
 }
 
 void ASTStmtWriter::VisitChooseExpr(ChooseExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getCond());
-  Writer.AddStmt(E->getLHS());
-  Writer.AddStmt(E->getRHS());
-  Writer.AddSourceLocation(E->getBuiltinLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddStmt(E->getCond());
+  Record.AddStmt(E->getLHS());
+  Record.AddStmt(E->getRHS());
+  Record.AddSourceLocation(E->getBuiltinLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Record.push_back(E->isConditionDependent() ? false : E->isConditionTrue());
   Code = serialization::EXPR_CHOOSE;
 }
 
 void ASTStmtWriter::VisitGNUNullExpr(GNUNullExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getTokenLocation(), Record);
+  Record.AddSourceLocation(E->getTokenLocation());
   Code = serialization::EXPR_GNU_NULL;
 }
 
@@ -834,24 +844,24 @@
   VisitExpr(E);
   Record.push_back(E->getNumSubExprs());
   for (unsigned I = 0, N = E->getNumSubExprs(); I != N; ++I)
-    Writer.AddStmt(E->getExpr(I));
-  Writer.AddSourceLocation(E->getBuiltinLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+    Record.AddStmt(E->getExpr(I));
+  Record.AddSourceLocation(E->getBuiltinLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_SHUFFLE_VECTOR;
 }
 
 void ASTStmtWriter::VisitConvertVectorExpr(ConvertVectorExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getBuiltinLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
-  Writer.AddTypeSourceInfo(E->getTypeSourceInfo(), Record);
-  Writer.AddStmt(E->getSrcExpr());
+  Record.AddSourceLocation(E->getBuiltinLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
+  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
+  Record.AddStmt(E->getSrcExpr());
   Code = serialization::EXPR_CONVERT_VECTOR;
 }
 
 void ASTStmtWriter::VisitBlockExpr(BlockExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getBlockDecl(), Record);
+  Record.AddDeclRef(E->getBlockDecl());
   Code = serialization::EXPR_BLOCK;
 }
 
@@ -859,16 +869,16 @@
   VisitExpr(E);
   Record.push_back(E->getNumAssocs());
 
-  Writer.AddStmt(E->getControllingExpr());
+  Record.AddStmt(E->getControllingExpr());
   for (unsigned I = 0, N = E->getNumAssocs(); I != N; ++I) {
-    Writer.AddTypeSourceInfo(E->getAssocTypeSourceInfo(I), Record);
-    Writer.AddStmt(E->getAssocExpr(I));
+    Record.AddTypeSourceInfo(E->getAssocTypeSourceInfo(I));
+    Record.AddStmt(E->getAssocExpr(I));
   }
   Record.push_back(E->isResultDependent() ? -1U : E->getResultIndex());
 
-  Writer.AddSourceLocation(E->getGenericLoc(), Record);
-  Writer.AddSourceLocation(E->getDefaultLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddSourceLocation(E->getGenericLoc());
+  Record.AddSourceLocation(E->getDefaultLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_GENERIC_SELECTION;
 }
 
@@ -882,10 +892,10 @@
   result = (result == PseudoObjectExpr::NoResult ? 0 : result + 1);
   Record.push_back(result);
 
-  Writer.AddStmt(E->getSyntacticForm());
+  Record.AddStmt(E->getSyntacticForm());
   for (PseudoObjectExpr::semantics_iterator
          i = E->semantics_begin(), e = E->semantics_end(); i != e; ++i) {
-    Writer.AddStmt(*i);
+    Record.AddStmt(*i);
   }
   Code = serialization::EXPR_PSEUDO_OBJECT;
 }
@@ -894,9 +904,9 @@
   VisitExpr(E);
   Record.push_back(E->getOp());
   for (unsigned I = 0, N = E->getNumSubExprs(); I != N; ++I)
-    Writer.AddStmt(E->getSubExprs()[I]);
-  Writer.AddSourceLocation(E->getBuiltinLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+    Record.AddStmt(E->getSubExprs()[I]);
+  Record.AddSourceLocation(E->getBuiltinLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_ATOMIC;
 }
 
@@ -906,16 +916,16 @@
 
 void ASTStmtWriter::VisitObjCStringLiteral(ObjCStringLiteral *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getString());
-  Writer.AddSourceLocation(E->getAtLoc(), Record);
+  Record.AddStmt(E->getString());
+  Record.AddSourceLocation(E->getAtLoc());
   Code = serialization::EXPR_OBJC_STRING_LITERAL;
 }
 
 void ASTStmtWriter::VisitObjCBoxedExpr(ObjCBoxedExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubExpr());
-  Writer.AddDeclRef(E->getBoxingMethod(), Record);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
+  Record.AddStmt(E->getSubExpr());
+  Record.AddDeclRef(E->getBoxingMethod());
+  Record.AddSourceRange(E->getSourceRange());
   Code = serialization::EXPR_OBJC_BOXED_EXPRESSION;
 }
 
@@ -923,9 +933,9 @@
   VisitExpr(E);
   Record.push_back(E->getNumElements());
   for (unsigned i = 0; i < E->getNumElements(); i++)
-    Writer.AddStmt(E->getElement(i));
-  Writer.AddDeclRef(E->getArrayWithObjectsMethod(), Record);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
+    Record.AddStmt(E->getElement(i));
+  Record.AddDeclRef(E->getArrayWithObjectsMethod());
+  Record.AddSourceRange(E->getSourceRange());
   Code = serialization::EXPR_OBJC_ARRAY_LITERAL;
 }
 
@@ -935,10 +945,10 @@
   Record.push_back(E->HasPackExpansions);
   for (unsigned i = 0; i < E->getNumElements(); i++) {
     ObjCDictionaryElement Element = E->getKeyValueElement(i);
-    Writer.AddStmt(Element.Key);
-    Writer.AddStmt(Element.Value);
+    Record.AddStmt(Element.Key);
+    Record.AddStmt(Element.Value);
     if (E->HasPackExpansions) {
-      Writer.AddSourceLocation(Element.EllipsisLoc, Record);
+      Record.AddSourceLocation(Element.EllipsisLoc);
       unsigned NumExpansions = 0;
       if (Element.NumExpansions)
         NumExpansions = *Element.NumExpansions + 1;
@@ -946,42 +956,42 @@
     }
   }
     
-  Writer.AddDeclRef(E->getDictWithObjectsMethod(), Record);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
+  Record.AddDeclRef(E->getDictWithObjectsMethod());
+  Record.AddSourceRange(E->getSourceRange());
   Code = serialization::EXPR_OBJC_DICTIONARY_LITERAL;
 }
 
 void ASTStmtWriter::VisitObjCEncodeExpr(ObjCEncodeExpr *E) {
   VisitExpr(E);
-  Writer.AddTypeSourceInfo(E->getEncodedTypeSourceInfo(), Record);
-  Writer.AddSourceLocation(E->getAtLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddTypeSourceInfo(E->getEncodedTypeSourceInfo());
+  Record.AddSourceLocation(E->getAtLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_OBJC_ENCODE;
 }
 
 void ASTStmtWriter::VisitObjCSelectorExpr(ObjCSelectorExpr *E) {
   VisitExpr(E);
-  Writer.AddSelectorRef(E->getSelector(), Record);
-  Writer.AddSourceLocation(E->getAtLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddSelectorRef(E->getSelector());
+  Record.AddSourceLocation(E->getAtLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_OBJC_SELECTOR_EXPR;
 }
 
 void ASTStmtWriter::VisitObjCProtocolExpr(ObjCProtocolExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getProtocol(), Record);
-  Writer.AddSourceLocation(E->getAtLoc(), Record);
-  Writer.AddSourceLocation(E->ProtoLoc, Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddDeclRef(E->getProtocol());
+  Record.AddSourceLocation(E->getAtLoc());
+  Record.AddSourceLocation(E->ProtoLoc);
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_OBJC_PROTOCOL_EXPR;
 }
 
 void ASTStmtWriter::VisitObjCIvarRefExpr(ObjCIvarRefExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getDecl(), Record);
-  Writer.AddSourceLocation(E->getLocation(), Record);
-  Writer.AddSourceLocation(E->getOpLoc(), Record);
-  Writer.AddStmt(E->getBase());
+  Record.AddDeclRef(E->getDecl());
+  Record.AddSourceLocation(E->getLocation());
+  Record.AddSourceLocation(E->getOpLoc());
+  Record.AddStmt(E->getBase());
   Record.push_back(E->isArrow());
   Record.push_back(E->isFreeIvar());
   Code = serialization::EXPR_OBJC_IVAR_REF_EXPR;
@@ -992,22 +1002,22 @@
   Record.push_back(E->SetterAndMethodRefFlags.getInt());
   Record.push_back(E->isImplicitProperty());
   if (E->isImplicitProperty()) {
-    Writer.AddDeclRef(E->getImplicitPropertyGetter(), Record);
-    Writer.AddDeclRef(E->getImplicitPropertySetter(), Record);
+    Record.AddDeclRef(E->getImplicitPropertyGetter());
+    Record.AddDeclRef(E->getImplicitPropertySetter());
   } else {
-    Writer.AddDeclRef(E->getExplicitProperty(), Record);
+    Record.AddDeclRef(E->getExplicitProperty());
   }
-  Writer.AddSourceLocation(E->getLocation(), Record);
-  Writer.AddSourceLocation(E->getReceiverLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
+  Record.AddSourceLocation(E->getReceiverLocation());
   if (E->isObjectReceiver()) {
     Record.push_back(0);
-    Writer.AddStmt(E->getBase());
+    Record.AddStmt(E->getBase());
   } else if (E->isSuperReceiver()) {
     Record.push_back(1);
-    Writer.AddTypeRef(E->getSuperReceiverType(), Record);
+    Record.AddTypeRef(E->getSuperReceiverType());
   } else {
     Record.push_back(2);
-    Writer.AddDeclRef(E->getClassReceiver(), Record);
+    Record.AddDeclRef(E->getClassReceiver());
   }
   
   Code = serialization::EXPR_OBJC_PROPERTY_REF_EXPR;
@@ -1015,11 +1025,11 @@
 
 void ASTStmtWriter::VisitObjCSubscriptRefExpr(ObjCSubscriptRefExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getRBracket(), Record);
-  Writer.AddStmt(E->getBaseExpr());
-  Writer.AddStmt(E->getKeyExpr());
-  Writer.AddDeclRef(E->getAtIndexMethodDecl(), Record);
-  Writer.AddDeclRef(E->setAtIndexMethodDecl(), Record);
+  Record.AddSourceLocation(E->getRBracket());
+  Record.AddStmt(E->getBaseExpr());
+  Record.AddStmt(E->getKeyExpr());
+  Record.AddDeclRef(E->getAtIndexMethodDecl());
+  Record.AddDeclRef(E->setAtIndexMethodDecl());
   
   Code = serialization::EXPR_OBJC_SUBSCRIPT_REF_EXPR;
 }
@@ -1034,155 +1044,163 @@
   Record.push_back((unsigned)E->getReceiverKind()); // FIXME: stable encoding
   switch (E->getReceiverKind()) {
   case ObjCMessageExpr::Instance:
-    Writer.AddStmt(E->getInstanceReceiver());
+    Record.AddStmt(E->getInstanceReceiver());
     break;
 
   case ObjCMessageExpr::Class:
-    Writer.AddTypeSourceInfo(E->getClassReceiverTypeInfo(), Record);
+    Record.AddTypeSourceInfo(E->getClassReceiverTypeInfo());
     break;
 
   case ObjCMessageExpr::SuperClass:
   case ObjCMessageExpr::SuperInstance:
-    Writer.AddTypeRef(E->getSuperType(), Record);
-    Writer.AddSourceLocation(E->getSuperLoc(), Record);
+    Record.AddTypeRef(E->getSuperType());
+    Record.AddSourceLocation(E->getSuperLoc());
     break;
   }
 
   if (E->getMethodDecl()) {
     Record.push_back(1);
-    Writer.AddDeclRef(E->getMethodDecl(), Record);
+    Record.AddDeclRef(E->getMethodDecl());
   } else {
     Record.push_back(0);
-    Writer.AddSelectorRef(E->getSelector(), Record);    
+    Record.AddSelectorRef(E->getSelector());    
   }
     
-  Writer.AddSourceLocation(E->getLeftLoc(), Record);
-  Writer.AddSourceLocation(E->getRightLoc(), Record);
+  Record.AddSourceLocation(E->getLeftLoc());
+  Record.AddSourceLocation(E->getRightLoc());
 
   for (CallExpr::arg_iterator Arg = E->arg_begin(), ArgEnd = E->arg_end();
        Arg != ArgEnd; ++Arg)
-    Writer.AddStmt(*Arg);
+    Record.AddStmt(*Arg);
 
   SourceLocation *Locs = E->getStoredSelLocs();
   for (unsigned i = 0, e = E->getNumStoredSelLocs(); i != e; ++i)
-    Writer.AddSourceLocation(Locs[i], Record);
+    Record.AddSourceLocation(Locs[i]);
 
   Code = serialization::EXPR_OBJC_MESSAGE_EXPR;
 }
 
 void ASTStmtWriter::VisitObjCForCollectionStmt(ObjCForCollectionStmt *S) {
   VisitStmt(S);
-  Writer.AddStmt(S->getElement());
-  Writer.AddStmt(S->getCollection());
-  Writer.AddStmt(S->getBody());
-  Writer.AddSourceLocation(S->getForLoc(), Record);
-  Writer.AddSourceLocation(S->getRParenLoc(), Record);
+  Record.AddStmt(S->getElement());
+  Record.AddStmt(S->getCollection());
+  Record.AddStmt(S->getBody());
+  Record.AddSourceLocation(S->getForLoc());
+  Record.AddSourceLocation(S->getRParenLoc());
   Code = serialization::STMT_OBJC_FOR_COLLECTION;
 }
 
 void ASTStmtWriter::VisitObjCAtCatchStmt(ObjCAtCatchStmt *S) {
-  Writer.AddStmt(S->getCatchBody());
-  Writer.AddDeclRef(S->getCatchParamDecl(), Record);
-  Writer.AddSourceLocation(S->getAtCatchLoc(), Record);
-  Writer.AddSourceLocation(S->getRParenLoc(), Record);
+  Record.AddStmt(S->getCatchBody());
+  Record.AddDeclRef(S->getCatchParamDecl());
+  Record.AddSourceLocation(S->getAtCatchLoc());
+  Record.AddSourceLocation(S->getRParenLoc());
   Code = serialization::STMT_OBJC_CATCH;
 }
 
 void ASTStmtWriter::VisitObjCAtFinallyStmt(ObjCAtFinallyStmt *S) {
-  Writer.AddStmt(S->getFinallyBody());
-  Writer.AddSourceLocation(S->getAtFinallyLoc(), Record);
+  Record.AddStmt(S->getFinallyBody());
+  Record.AddSourceLocation(S->getAtFinallyLoc());
   Code = serialization::STMT_OBJC_FINALLY;
 }
 
 void ASTStmtWriter::VisitObjCAutoreleasePoolStmt(ObjCAutoreleasePoolStmt *S) {
-  Writer.AddStmt(S->getSubStmt());
-  Writer.AddSourceLocation(S->getAtLoc(), Record);
+  Record.AddStmt(S->getSubStmt());
+  Record.AddSourceLocation(S->getAtLoc());
   Code = serialization::STMT_OBJC_AUTORELEASE_POOL;
 }
 
 void ASTStmtWriter::VisitObjCAtTryStmt(ObjCAtTryStmt *S) {
   Record.push_back(S->getNumCatchStmts());
   Record.push_back(S->getFinallyStmt() != nullptr);
-  Writer.AddStmt(S->getTryBody());
+  Record.AddStmt(S->getTryBody());
   for (unsigned I = 0, N = S->getNumCatchStmts(); I != N; ++I)
-    Writer.AddStmt(S->getCatchStmt(I));
+    Record.AddStmt(S->getCatchStmt(I));
   if (S->getFinallyStmt())
-    Writer.AddStmt(S->getFinallyStmt());
-  Writer.AddSourceLocation(S->getAtTryLoc(), Record);
+    Record.AddStmt(S->getFinallyStmt());
+  Record.AddSourceLocation(S->getAtTryLoc());
   Code = serialization::STMT_OBJC_AT_TRY;
 }
 
 void ASTStmtWriter::VisitObjCAtSynchronizedStmt(ObjCAtSynchronizedStmt *S) {
-  Writer.AddStmt(S->getSynchExpr());
-  Writer.AddStmt(S->getSynchBody());
-  Writer.AddSourceLocation(S->getAtSynchronizedLoc(), Record);
+  Record.AddStmt(S->getSynchExpr());
+  Record.AddStmt(S->getSynchBody());
+  Record.AddSourceLocation(S->getAtSynchronizedLoc());
   Code = serialization::STMT_OBJC_AT_SYNCHRONIZED;
 }
 
 void ASTStmtWriter::VisitObjCAtThrowStmt(ObjCAtThrowStmt *S) {
-  Writer.AddStmt(S->getThrowExpr());
-  Writer.AddSourceLocation(S->getThrowLoc(), Record);
+  Record.AddStmt(S->getThrowExpr());
+  Record.AddSourceLocation(S->getThrowLoc());
   Code = serialization::STMT_OBJC_AT_THROW;
 }
 
 void ASTStmtWriter::VisitObjCBoolLiteralExpr(ObjCBoolLiteralExpr *E) {
   VisitExpr(E);
   Record.push_back(E->getValue());
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
   Code = serialization::EXPR_OBJC_BOOL_LITERAL;
 }
 
+void ASTStmtWriter::VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *E) {
+  VisitExpr(E);
+  Record.AddSourceRange(E->getSourceRange());
+  Record.AddVersionTuple(E->getVersion());
+  Code = serialization::EXPR_OBJC_AVAILABILITY_CHECK;
+}
+
 //===----------------------------------------------------------------------===//
 // C++ Expressions and Statements.
 //===----------------------------------------------------------------------===//
 
 void ASTStmtWriter::VisitCXXCatchStmt(CXXCatchStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getCatchLoc(), Record);
-  Writer.AddDeclRef(S->getExceptionDecl(), Record);
-  Writer.AddStmt(S->getHandlerBlock());
+  Record.AddSourceLocation(S->getCatchLoc());
+  Record.AddDeclRef(S->getExceptionDecl());
+  Record.AddStmt(S->getHandlerBlock());
   Code = serialization::STMT_CXX_CATCH;
 }
 
 void ASTStmtWriter::VisitCXXTryStmt(CXXTryStmt *S) {
   VisitStmt(S);
   Record.push_back(S->getNumHandlers());
-  Writer.AddSourceLocation(S->getTryLoc(), Record);
-  Writer.AddStmt(S->getTryBlock());
+  Record.AddSourceLocation(S->getTryLoc());
+  Record.AddStmt(S->getTryBlock());
   for (unsigned i = 0, e = S->getNumHandlers(); i != e; ++i)
-    Writer.AddStmt(S->getHandler(i));
+    Record.AddStmt(S->getHandler(i));
   Code = serialization::STMT_CXX_TRY;
 }
 
 void ASTStmtWriter::VisitCXXForRangeStmt(CXXForRangeStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getForLoc(), Record);
-  Writer.AddSourceLocation(S->getCoawaitLoc(), Record);
-  Writer.AddSourceLocation(S->getColonLoc(), Record);
-  Writer.AddSourceLocation(S->getRParenLoc(), Record);
-  Writer.AddStmt(S->getRangeStmt());
-  Writer.AddStmt(S->getBeginEndStmt());
-  Writer.AddStmt(S->getCond());
-  Writer.AddStmt(S->getInc());
-  Writer.AddStmt(S->getLoopVarStmt());
-  Writer.AddStmt(S->getBody());
+  Record.AddSourceLocation(S->getForLoc());
+  Record.AddSourceLocation(S->getCoawaitLoc());
+  Record.AddSourceLocation(S->getColonLoc());
+  Record.AddSourceLocation(S->getRParenLoc());
+  Record.AddStmt(S->getRangeStmt());
+  Record.AddStmt(S->getBeginStmt());
+  Record.AddStmt(S->getEndStmt());
+  Record.AddStmt(S->getCond());
+  Record.AddStmt(S->getInc());
+  Record.AddStmt(S->getLoopVarStmt());
+  Record.AddStmt(S->getBody());
   Code = serialization::STMT_CXX_FOR_RANGE;
 }
 
 void ASTStmtWriter::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getKeywordLoc(), Record);
+  Record.AddSourceLocation(S->getKeywordLoc());
   Record.push_back(S->isIfExists());
-  Writer.AddNestedNameSpecifierLoc(S->getQualifierLoc(), Record);
-  Writer.AddDeclarationNameInfo(S->getNameInfo(), Record);
-  Writer.AddStmt(S->getSubStmt());
+  Record.AddNestedNameSpecifierLoc(S->getQualifierLoc());
+  Record.AddDeclarationNameInfo(S->getNameInfo());
+  Record.AddStmt(S->getSubStmt());
   Code = serialization::STMT_MS_DEPENDENT_EXISTS;
 }
 
 void ASTStmtWriter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) {
   VisitCallExpr(E);
   Record.push_back(E->getOperator());
-  Writer.AddSourceRange(E->Range, Record);
+  Record.AddSourceRange(E->Range);
   Record.push_back(E->isFPContractable());
   Code = serialization::EXPR_CXX_OPERATOR_CALL;
 }
@@ -1196,22 +1214,31 @@
   VisitExpr(E);
   Record.push_back(E->getNumArgs());
   for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
-    Writer.AddStmt(E->getArg(I));
-  Writer.AddDeclRef(E->getConstructor(), Record);
-  Writer.AddSourceLocation(E->getLocation(), Record);
+    Record.AddStmt(E->getArg(I));
+  Record.AddDeclRef(E->getConstructor());
+  Record.AddSourceLocation(E->getLocation());
   Record.push_back(E->isElidable());
   Record.push_back(E->hadMultipleCandidates());
   Record.push_back(E->isListInitialization());
   Record.push_back(E->isStdInitListInitialization());
   Record.push_back(E->requiresZeroInitialization());
   Record.push_back(E->getConstructionKind()); // FIXME: stable encoding
-  Writer.AddSourceRange(E->getParenOrBraceRange(), Record);
+  Record.AddSourceRange(E->getParenOrBraceRange());
   Code = serialization::EXPR_CXX_CONSTRUCT;
 }
 
+void ASTStmtWriter::VisitCXXInheritedCtorInitExpr(CXXInheritedCtorInitExpr *E) {
+  VisitExpr(E);
+  Record.AddDeclRef(E->getConstructor());
+  Record.AddSourceLocation(E->getLocation());
+  Record.push_back(E->constructsVBase());
+  Record.push_back(E->inheritedFromVBase());
+  Code = serialization::EXPR_CXX_INHERITED_CTOR_INIT;
+}
+
 void ASTStmtWriter::VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *E) {
   VisitCXXConstructExpr(E);
-  Writer.AddTypeSourceInfo(E->getTypeSourceInfo(), Record);
+  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
   Code = serialization::EXPR_CXX_TEMPORARY_OBJECT;
 }
 
@@ -1222,18 +1249,18 @@
   if (E->HasArrayIndexVars)
     NumArrayIndexVars = E->getArrayIndexStarts()[E->NumCaptures];
   Record.push_back(NumArrayIndexVars);
-  Writer.AddSourceRange(E->IntroducerRange, Record);
+  Record.AddSourceRange(E->IntroducerRange);
   Record.push_back(E->CaptureDefault); // FIXME: stable encoding
-  Writer.AddSourceLocation(E->CaptureDefaultLoc, Record);
+  Record.AddSourceLocation(E->CaptureDefaultLoc);
   Record.push_back(E->ExplicitParams);
   Record.push_back(E->ExplicitResultType);
-  Writer.AddSourceLocation(E->ClosingBrace, Record);
+  Record.AddSourceLocation(E->ClosingBrace);
   
   // Add capture initializers.
   for (LambdaExpr::capture_init_iterator C = E->capture_init_begin(),
                                       CEnd = E->capture_init_end();
        C != CEnd; ++C) {
-    Writer.AddStmt(*C);
+    Record.AddStmt(*C);
   }
   
   // Add array index variables, if any.
@@ -1242,7 +1269,7 @@
                   E->getArrayIndexStarts() + E->NumCaptures + 1);
     VarDecl **ArrayIndexVars = E->getArrayIndexVars();
     for (unsigned I = 0; I != NumArrayIndexVars; ++I)
-      Writer.AddDeclRef(ArrayIndexVars[I], Record);
+      Record.AddDeclRef(ArrayIndexVars[I]);
   }
   
   Code = serialization::EXPR_LAMBDA;
@@ -1250,15 +1277,14 @@
 
 void ASTStmtWriter::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddStmt(E->getSubExpr());
   Code = serialization::EXPR_CXX_STD_INITIALIZER_LIST;
 }
 
 void ASTStmtWriter::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) {
   VisitExplicitCastExpr(E);
-  Writer.AddSourceRange(SourceRange(E->getOperatorLoc(), E->getRParenLoc()),
-                        Record);
-  Writer.AddSourceRange(E->getAngleBrackets(), Record);
+  Record.AddSourceRange(SourceRange(E->getOperatorLoc(), E->getRParenLoc()));
+  Record.AddSourceRange(E->getAngleBrackets());
 }
 
 void ASTStmtWriter::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) {
@@ -1283,82 +1309,82 @@
 
 void ASTStmtWriter::VisitCXXFunctionalCastExpr(CXXFunctionalCastExpr *E) {
   VisitExplicitCastExpr(E);
-  Writer.AddSourceLocation(E->getLParenLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddSourceLocation(E->getLParenLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_CXX_FUNCTIONAL_CAST;
 }
 
 void ASTStmtWriter::VisitUserDefinedLiteral(UserDefinedLiteral *E) {
   VisitCallExpr(E);
-  Writer.AddSourceLocation(E->UDSuffixLoc, Record);
+  Record.AddSourceLocation(E->UDSuffixLoc);
   Code = serialization::EXPR_USER_DEFINED_LITERAL;
 }
 
 void ASTStmtWriter::VisitCXXBoolLiteralExpr(CXXBoolLiteralExpr *E) {
   VisitExpr(E);
   Record.push_back(E->getValue());
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
   Code = serialization::EXPR_CXX_BOOL_LITERAL;
 }
 
 void ASTStmtWriter::VisitCXXNullPtrLiteralExpr(CXXNullPtrLiteralExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
   Code = serialization::EXPR_CXX_NULL_PTR_LITERAL;
 }
 
 void ASTStmtWriter::VisitCXXTypeidExpr(CXXTypeidExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
+  Record.AddSourceRange(E->getSourceRange());
   if (E->isTypeOperand()) {
-    Writer.AddTypeSourceInfo(E->getTypeOperandSourceInfo(), Record);
+    Record.AddTypeSourceInfo(E->getTypeOperandSourceInfo());
     Code = serialization::EXPR_CXX_TYPEID_TYPE;
   } else {
-    Writer.AddStmt(E->getExprOperand());
+    Record.AddStmt(E->getExprOperand());
     Code = serialization::EXPR_CXX_TYPEID_EXPR;
   }
 }
 
 void ASTStmtWriter::VisitCXXThisExpr(CXXThisExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddSourceLocation(E->getLocation());
   Record.push_back(E->isImplicit());
   Code = serialization::EXPR_CXX_THIS;
 }
 
 void ASTStmtWriter::VisitCXXThrowExpr(CXXThrowExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getThrowLoc(), Record);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddSourceLocation(E->getThrowLoc());
+  Record.AddStmt(E->getSubExpr());
   Record.push_back(E->isThrownVariableInScope());
   Code = serialization::EXPR_CXX_THROW;
 }
 
 void ASTStmtWriter::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getParam(), Record);
-  Writer.AddSourceLocation(E->getUsedLocation(), Record);
+  Record.AddDeclRef(E->getParam());
+  Record.AddSourceLocation(E->getUsedLocation());
   Code = serialization::EXPR_CXX_DEFAULT_ARG;
 }
 
 void ASTStmtWriter::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getField(), Record);
-  Writer.AddSourceLocation(E->getExprLoc(), Record);
+  Record.AddDeclRef(E->getField());
+  Record.AddSourceLocation(E->getExprLoc());
   Code = serialization::EXPR_CXX_DEFAULT_INIT;
 }
 
 void ASTStmtWriter::VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
   VisitExpr(E);
-  Writer.AddCXXTemporary(E->getTemporary(), Record);
-  Writer.AddStmt(E->getSubExpr());
+  Record.AddCXXTemporary(E->getTemporary());
+  Record.AddStmt(E->getSubExpr());
   Code = serialization::EXPR_CXX_BIND_TEMPORARY;
 }
 
 void ASTStmtWriter::VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *E) {
   VisitExpr(E);
-  Writer.AddTypeSourceInfo(E->getTypeSourceInfo(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_CXX_SCALAR_VALUE_INIT;
 }
 
@@ -1369,15 +1395,15 @@
   Record.push_back(E->doesUsualArrayDeleteWantSize());
   Record.push_back(E->getNumPlacementArgs());
   Record.push_back(E->StoredInitializationStyle);
-  Writer.AddDeclRef(E->getOperatorNew(), Record);
-  Writer.AddDeclRef(E->getOperatorDelete(), Record);
-  Writer.AddTypeSourceInfo(E->getAllocatedTypeSourceInfo(), Record);
-  Writer.AddSourceRange(E->getTypeIdParens(), Record);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
-  Writer.AddSourceRange(E->getDirectInitRange(), Record);
+  Record.AddDeclRef(E->getOperatorNew());
+  Record.AddDeclRef(E->getOperatorDelete());
+  Record.AddTypeSourceInfo(E->getAllocatedTypeSourceInfo());
+  Record.AddSourceRange(E->getTypeIdParens());
+  Record.AddSourceRange(E->getSourceRange());
+  Record.AddSourceRange(E->getDirectInitRange());
   for (CXXNewExpr::arg_iterator I = E->raw_arg_begin(), e = E->raw_arg_end();
        I != e; ++I)
-    Writer.AddStmt(*I);
+    Record.AddStmt(*I);
 
   Code = serialization::EXPR_CXX_NEW;
 }
@@ -1388,9 +1414,9 @@
   Record.push_back(E->isArrayForm());
   Record.push_back(E->isArrayFormAsWritten());
   Record.push_back(E->doesUsualArrayDeleteWantSize());
-  Writer.AddDeclRef(E->getOperatorDelete(), Record);
-  Writer.AddStmt(E->getArgument());
-  Writer.AddSourceLocation(E->getSourceRange().getBegin(), Record);
+  Record.AddDeclRef(E->getOperatorDelete());
+  Record.AddStmt(E->getArgument());
+  Record.AddSourceLocation(E->getSourceRange().getBegin());
   
   Code = serialization::EXPR_CXX_DELETE;
 }
@@ -1398,20 +1424,20 @@
 void ASTStmtWriter::VisitCXXPseudoDestructorExpr(CXXPseudoDestructorExpr *E) {
   VisitExpr(E);
 
-  Writer.AddStmt(E->getBase());
+  Record.AddStmt(E->getBase());
   Record.push_back(E->isArrow());
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
-  Writer.AddTypeSourceInfo(E->getScopeTypeInfo(), Record);
-  Writer.AddSourceLocation(E->getColonColonLoc(), Record);
-  Writer.AddSourceLocation(E->getTildeLoc(), Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
+  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
+  Record.AddTypeSourceInfo(E->getScopeTypeInfo());
+  Record.AddSourceLocation(E->getColonColonLoc());
+  Record.AddSourceLocation(E->getTildeLoc());
 
   // PseudoDestructorTypeStorage.
-  Writer.AddIdentifierRef(E->getDestroyedTypeIdentifier(), Record);
+  Record.AddIdentifierRef(E->getDestroyedTypeIdentifier());
   if (E->getDestroyedTypeIdentifier())
-    Writer.AddSourceLocation(E->getDestroyedTypeLoc(), Record);
+    Record.AddSourceLocation(E->getDestroyedTypeLoc());
   else
-    Writer.AddTypeSourceInfo(E->getDestroyedTypeInfo(), Record);
+    Record.AddTypeSourceInfo(E->getDestroyedTypeInfo());
 
   Code = serialization::EXPR_CXX_PSEUDO_DESTRUCTOR;
 }
@@ -1420,9 +1446,10 @@
   VisitExpr(E);
   Record.push_back(E->getNumObjects());
   for (unsigned i = 0, e = E->getNumObjects(); i != e; ++i)
-    Writer.AddDeclRef(E->getObject(i), Record);
-  
-  Writer.AddStmt(E->getSubExpr());
+    Record.AddDeclRef(E->getObject(i));
+
+  Record.push_back(E->cleanupsHaveSideEffects());
+  Record.AddStmt(E->getSubExpr());
   Code = serialization::EXPR_EXPR_WITH_CLEANUPS;
 }
 
@@ -1443,15 +1470,15 @@
   }
 
   if (!E->isImplicitAccess())
-    Writer.AddStmt(E->getBase());
+    Record.AddStmt(E->getBase());
   else
-    Writer.AddStmt(nullptr);
-  Writer.AddTypeRef(E->getBaseType(), Record);
+    Record.AddStmt(nullptr);
+  Record.AddTypeRef(E->getBaseType());
   Record.push_back(E->isArrow());
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
-  Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
-  Writer.AddDeclRef(E->getFirstQualifierFoundInScope(), Record);
-  Writer.AddDeclarationNameInfo(E->MemberNameInfo, Record);
+  Record.AddSourceLocation(E->getOperatorLoc());
+  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
+  Record.AddDeclRef(E->getFirstQualifierFoundInScope());
+  Record.AddDeclarationNameInfo(E->MemberNameInfo);
   Code = serialization::EXPR_CXX_DEPENDENT_SCOPE_MEMBER;
 }
 
@@ -1471,8 +1498,8 @@
                              E->getTrailingObjects<TemplateArgumentLoc>());
   }
 
-  Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
-  Writer.AddDeclarationNameInfo(E->NameInfo, Record);
+  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
+  Record.AddDeclarationNameInfo(E->NameInfo);
   Code = serialization::EXPR_CXX_DEPENDENT_SCOPE_DECL_REF;
 }
 
@@ -1482,10 +1509,10 @@
   Record.push_back(E->arg_size());
   for (CXXUnresolvedConstructExpr::arg_iterator
          ArgI = E->arg_begin(), ArgE = E->arg_end(); ArgI != ArgE; ++ArgI)
-    Writer.AddStmt(*ArgI);
-  Writer.AddTypeSourceInfo(E->getTypeSourceInfo(), Record);
-  Writer.AddSourceLocation(E->getLParenLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
+    Record.AddStmt(*ArgI);
+  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
+  Record.AddSourceLocation(E->getLParenLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
   Code = serialization::EXPR_CXX_UNRESOLVED_CONSTRUCT;
 }
 
@@ -1506,21 +1533,21 @@
   Record.push_back(E->getNumDecls());
   for (OverloadExpr::decls_iterator
          OvI = E->decls_begin(), OvE = E->decls_end(); OvI != OvE; ++OvI) {
-    Writer.AddDeclRef(OvI.getDecl(), Record);
+    Record.AddDeclRef(OvI.getDecl());
     Record.push_back(OvI.getAccess());
   }
 
-  Writer.AddDeclarationNameInfo(E->NameInfo, Record);
-  Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
+  Record.AddDeclarationNameInfo(E->NameInfo);
+  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
 }
 
 void ASTStmtWriter::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) {
   VisitOverloadExpr(E);
   Record.push_back(E->isArrow());
   Record.push_back(E->hasUnresolvedUsing());
-  Writer.AddStmt(!E->isImplicitAccess() ? E->getBase() : nullptr);
-  Writer.AddTypeRef(E->getBaseType(), Record);
-  Writer.AddSourceLocation(E->getOperatorLoc(), Record);
+  Record.AddStmt(!E->isImplicitAccess() ? E->getBase() : nullptr);
+  Record.AddTypeRef(E->getBaseType());
+  Record.AddSourceLocation(E->getOperatorLoc());
   Code = serialization::EXPR_CXX_UNRESOLVED_MEMBER;
 }
 
@@ -1528,7 +1555,7 @@
   VisitOverloadExpr(E);
   Record.push_back(E->requiresADL());
   Record.push_back(E->isOverloaded());
-  Writer.AddDeclRef(E->getNamingClass(), Record);
+  Record.AddDeclRef(E->getNamingClass());
   Code = serialization::EXPR_CXX_UNRESOLVED_LOOKUP;
 }
 
@@ -1537,9 +1564,9 @@
   Record.push_back(E->TypeTraitExprBits.NumArgs);
   Record.push_back(E->TypeTraitExprBits.Kind); // FIXME: Stable encoding
   Record.push_back(E->TypeTraitExprBits.Value);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
+  Record.AddSourceRange(E->getSourceRange());
   for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
-    Writer.AddTypeSourceInfo(E->getArg(I), Record);
+    Record.AddTypeSourceInfo(E->getArg(I));
   Code = serialization::EXPR_TYPE_TRAIT;
 }
 
@@ -1547,8 +1574,8 @@
   VisitExpr(E);
   Record.push_back(E->getTrait());
   Record.push_back(E->getValue());
-  Writer.AddSourceRange(E->getSourceRange(), Record);
-  Writer.AddTypeSourceInfo(E->getQueriedTypeSourceInfo(), Record);
+  Record.AddSourceRange(E->getSourceRange());
+  Record.AddTypeSourceInfo(E->getQueriedTypeSourceInfo());
   Code = serialization::EXPR_ARRAY_TYPE_TRAIT;
 }
 
@@ -1556,24 +1583,24 @@
   VisitExpr(E);
   Record.push_back(E->getTrait());
   Record.push_back(E->getValue());
-  Writer.AddSourceRange(E->getSourceRange(), Record);
-  Writer.AddStmt(E->getQueriedExpression());
+  Record.AddSourceRange(E->getSourceRange());
+  Record.AddStmt(E->getQueriedExpression());
   Code = serialization::EXPR_CXX_EXPRESSION_TRAIT;
 }
 
 void ASTStmtWriter::VisitCXXNoexceptExpr(CXXNoexceptExpr *E) {
   VisitExpr(E);
   Record.push_back(E->getValue());
-  Writer.AddSourceRange(E->getSourceRange(), Record);
-  Writer.AddStmt(E->getOperand());
+  Record.AddSourceRange(E->getSourceRange());
+  Record.AddStmt(E->getOperand());
   Code = serialization::EXPR_CXX_NOEXCEPT;
 }
 
 void ASTStmtWriter::VisitPackExpansionExpr(PackExpansionExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getEllipsisLoc(), Record);
+  Record.AddSourceLocation(E->getEllipsisLoc());
   Record.push_back(E->NumExpansions);
-  Writer.AddStmt(E->getPattern());
+  Record.AddStmt(E->getPattern());
   Code = serialization::EXPR_PACK_EXPANSION;
 }
 
@@ -1581,13 +1608,13 @@
   VisitExpr(E);
   Record.push_back(E->isPartiallySubstituted() ? E->getPartialArguments().size()
                                                : 0);
-  Writer.AddSourceLocation(E->OperatorLoc, Record);
-  Writer.AddSourceLocation(E->PackLoc, Record);
-  Writer.AddSourceLocation(E->RParenLoc, Record);
-  Writer.AddDeclRef(E->Pack, Record);
+  Record.AddSourceLocation(E->OperatorLoc);
+  Record.AddSourceLocation(E->PackLoc);
+  Record.AddSourceLocation(E->RParenLoc);
+  Record.AddDeclRef(E->Pack);
   if (E->isPartiallySubstituted()) {
     for (const auto &TA : E->getPartialArguments())
-      Writer.AddTemplateArgument(TA, Record);
+      Record.AddTemplateArgument(TA);
   } else if (!E->isValueDependent()) {
     Record.push_back(E->getPackLength());
   }
@@ -1597,62 +1624,62 @@
 void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr(
                                               SubstNonTypeTemplateParmExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getParameter(), Record);
-  Writer.AddSourceLocation(E->getNameLoc(), Record);
-  Writer.AddStmt(E->getReplacement());
+  Record.AddDeclRef(E->getParameter());
+  Record.AddSourceLocation(E->getNameLoc());
+  Record.AddStmt(E->getReplacement());
   Code = serialization::EXPR_SUBST_NON_TYPE_TEMPLATE_PARM;
 }
 
 void ASTStmtWriter::VisitSubstNonTypeTemplateParmPackExpr(
                                           SubstNonTypeTemplateParmPackExpr *E) {
   VisitExpr(E);
-  Writer.AddDeclRef(E->getParameterPack(), Record);
-  Writer.AddTemplateArgument(E->getArgumentPack(), Record);
-  Writer.AddSourceLocation(E->getParameterPackLocation(), Record);
+  Record.AddDeclRef(E->getParameterPack());
+  Record.AddTemplateArgument(E->getArgumentPack());
+  Record.AddSourceLocation(E->getParameterPackLocation());
   Code = serialization::EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK;
 }
 
 void ASTStmtWriter::VisitFunctionParmPackExpr(FunctionParmPackExpr *E) {
   VisitExpr(E);
   Record.push_back(E->getNumExpansions());
-  Writer.AddDeclRef(E->getParameterPack(), Record);
-  Writer.AddSourceLocation(E->getParameterPackLocation(), Record);
+  Record.AddDeclRef(E->getParameterPack());
+  Record.AddSourceLocation(E->getParameterPackLocation());
   for (FunctionParmPackExpr::iterator I = E->begin(), End = E->end();
        I != End; ++I)
-    Writer.AddDeclRef(*I, Record);
+    Record.AddDeclRef(*I);
   Code = serialization::EXPR_FUNCTION_PARM_PACK;
 }
 
 void ASTStmtWriter::VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getTemporary());
-  Writer.AddDeclRef(E->getExtendingDecl(), Record);
+  Record.AddStmt(E->getTemporary());
+  Record.AddDeclRef(E->getExtendingDecl());
   Record.push_back(E->getManglingNumber());
   Code = serialization::EXPR_MATERIALIZE_TEMPORARY;
 }
 
 void ASTStmtWriter::VisitCXXFoldExpr(CXXFoldExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->LParenLoc, Record);
-  Writer.AddSourceLocation(E->EllipsisLoc, Record);
-  Writer.AddSourceLocation(E->RParenLoc, Record);
-  Writer.AddStmt(E->SubExprs[0]);
-  Writer.AddStmt(E->SubExprs[1]);
+  Record.AddSourceLocation(E->LParenLoc);
+  Record.AddSourceLocation(E->EllipsisLoc);
+  Record.AddSourceLocation(E->RParenLoc);
+  Record.AddStmt(E->SubExprs[0]);
+  Record.AddStmt(E->SubExprs[1]);
   Record.push_back(E->Opcode);
   Code = serialization::EXPR_CXX_FOLD;
 }
 
 void ASTStmtWriter::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getSourceExpr());
-  Writer.AddSourceLocation(E->getLocation(), Record);
+  Record.AddStmt(E->getSourceExpr());
+  Record.AddSourceLocation(E->getLocation());
   Code = serialization::EXPR_OPAQUE_VALUE;
 }
 
 void ASTStmtWriter::VisitTypoExpr(TypoExpr *E) {
   VisitExpr(E);
   // TODO: Figure out sane writer behavior for a TypoExpr, if necessary
-  assert(false && "Cannot write TypoExpr nodes");
+  llvm_unreachable("Cannot write TypoExpr nodes");
 }
 
 //===----------------------------------------------------------------------===//
@@ -1661,7 +1688,7 @@
 
 void ASTStmtWriter::VisitCUDAKernelCallExpr(CUDAKernelCallExpr *E) {
   VisitCallExpr(E);
-  Writer.AddStmt(E->getConfig());
+  Record.AddStmt(E->getConfig());
   Code = serialization::EXPR_CUDA_KERNEL_CALL;
 }
 
@@ -1670,9 +1697,9 @@
 //===----------------------------------------------------------------------===//
 void ASTStmtWriter::VisitAsTypeExpr(AsTypeExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceLocation(E->getBuiltinLoc(), Record);
-  Writer.AddSourceLocation(E->getRParenLoc(), Record);
-  Writer.AddStmt(E->getSrcExpr());
+  Record.AddSourceLocation(E->getBuiltinLoc());
+  Record.AddSourceLocation(E->getRParenLoc());
+  Record.AddStmt(E->getSrcExpr());
   Code = serialization::EXPR_ASTYPE;
 }
 
@@ -1682,60 +1709,61 @@
 void ASTStmtWriter::VisitMSPropertyRefExpr(MSPropertyRefExpr *E) {
   VisitExpr(E);
   Record.push_back(E->isArrow());
-  Writer.AddStmt(E->getBaseExpr());
-  Writer.AddNestedNameSpecifierLoc(E->getQualifierLoc(), Record);
-  Writer.AddSourceLocation(E->getMemberLoc(), Record);
-  Writer.AddDeclRef(E->getPropertyDecl(), Record);
+  Record.AddStmt(E->getBaseExpr());
+  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
+  Record.AddSourceLocation(E->getMemberLoc());
+  Record.AddDeclRef(E->getPropertyDecl());
   Code = serialization::EXPR_CXX_PROPERTY_REF_EXPR;
 }
 
 void ASTStmtWriter::VisitMSPropertySubscriptExpr(MSPropertySubscriptExpr *E) {
   VisitExpr(E);
-  Writer.AddStmt(E->getBase());
-  Writer.AddStmt(E->getIdx());
-  Writer.AddSourceLocation(E->getRBracketLoc(), Record);
+  Record.AddStmt(E->getBase());
+  Record.AddStmt(E->getIdx());
+  Record.AddSourceLocation(E->getRBracketLoc());
   Code = serialization::EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR;
 }
 
 void ASTStmtWriter::VisitCXXUuidofExpr(CXXUuidofExpr *E) {
   VisitExpr(E);
-  Writer.AddSourceRange(E->getSourceRange(), Record);
+  Record.AddSourceRange(E->getSourceRange());
+  Record.AddString(E->getUuidStr());
   if (E->isTypeOperand()) {
-    Writer.AddTypeSourceInfo(E->getTypeOperandSourceInfo(), Record);
+    Record.AddTypeSourceInfo(E->getTypeOperandSourceInfo());
     Code = serialization::EXPR_CXX_UUIDOF_TYPE;
   } else {
-    Writer.AddStmt(E->getExprOperand());
+    Record.AddStmt(E->getExprOperand());
     Code = serialization::EXPR_CXX_UUIDOF_EXPR;
   }
 }
 
 void ASTStmtWriter::VisitSEHExceptStmt(SEHExceptStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getExceptLoc(), Record);
-  Writer.AddStmt(S->getFilterExpr());
-  Writer.AddStmt(S->getBlock());
+  Record.AddSourceLocation(S->getExceptLoc());
+  Record.AddStmt(S->getFilterExpr());
+  Record.AddStmt(S->getBlock());
   Code = serialization::STMT_SEH_EXCEPT;
 }
 
 void ASTStmtWriter::VisitSEHFinallyStmt(SEHFinallyStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getFinallyLoc(), Record);
-  Writer.AddStmt(S->getBlock());
+  Record.AddSourceLocation(S->getFinallyLoc());
+  Record.AddStmt(S->getBlock());
   Code = serialization::STMT_SEH_FINALLY;
 }
 
 void ASTStmtWriter::VisitSEHTryStmt(SEHTryStmt *S) {
   VisitStmt(S);
   Record.push_back(S->getIsCXXTry());
-  Writer.AddSourceLocation(S->getTryLoc(), Record);
-  Writer.AddStmt(S->getTryBlock());
-  Writer.AddStmt(S->getHandler());
+  Record.AddSourceLocation(S->getTryLoc());
+  Record.AddStmt(S->getTryBlock());
+  Record.AddStmt(S->getHandler());
   Code = serialization::STMT_SEH_TRY;
 }
 
 void ASTStmtWriter::VisitSEHLeaveStmt(SEHLeaveStmt *S) {
   VisitStmt(S);
-  Writer.AddSourceLocation(S->getLeaveLoc(), Record);
+  Record.AddSourceLocation(S->getLeaveLoc());
   Code = serialization::STMT_SEH_LEAVE;
 }
 
@@ -1745,86 +1773,95 @@
 
 namespace clang {
 class OMPClauseWriter : public OMPClauseVisitor<OMPClauseWriter> {
-  ASTStmtWriter *Writer;
-  ASTWriter::RecordData &Record;
+  ASTRecordWriter &Record;
 public:
-  OMPClauseWriter(ASTStmtWriter *W, ASTWriter::RecordData &Record)
-    : Writer(W), Record(Record) { }
+  OMPClauseWriter(ASTRecordWriter &Record) : Record(Record) {}
 #define OPENMP_CLAUSE(Name, Class)    \
   void Visit##Class(Class *S);
 #include "clang/Basic/OpenMPKinds.def"
   void writeClause(OMPClause *C);
+  void VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C);
+  void VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C);
 };
 }
 
 void OMPClauseWriter::writeClause(OMPClause *C) {
   Record.push_back(C->getClauseKind());
   Visit(C);
-  Writer->Writer.AddSourceLocation(C->getLocStart(), Record);
-  Writer->Writer.AddSourceLocation(C->getLocEnd(), Record);
+  Record.AddSourceLocation(C->getLocStart());
+  Record.AddSourceLocation(C->getLocEnd());
+}
+
+void OMPClauseWriter::VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C) {
+  Record.AddStmt(C->getPreInitStmt());
+}
+
+void OMPClauseWriter::VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C) {
+  VisitOMPClauseWithPreInit(C);
+  Record.AddStmt(C->getPostUpdateExpr());
 }
 
 void OMPClauseWriter::VisitOMPIfClause(OMPIfClause *C) {
   Record.push_back(C->getNameModifier());
-  Writer->Writer.AddSourceLocation(C->getNameModifierLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getColonLoc(), Record);
-  Writer->Writer.AddStmt(C->getCondition());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getNameModifierLoc());
+  Record.AddSourceLocation(C->getColonLoc());
+  Record.AddStmt(C->getCondition());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPFinalClause(OMPFinalClause *C) {
-  Writer->Writer.AddStmt(C->getCondition());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getCondition());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPNumThreadsClause(OMPNumThreadsClause *C) {
-  Writer->Writer.AddStmt(C->getNumThreads());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getNumThreads());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPSafelenClause(OMPSafelenClause *C) {
-  Writer->Writer.AddStmt(C->getSafelen());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getSafelen());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPSimdlenClause(OMPSimdlenClause *C) {
-  Writer->Writer.AddStmt(C->getSimdlen());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getSimdlen());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPCollapseClause(OMPCollapseClause *C) {
-  Writer->Writer.AddStmt(C->getNumForLoops());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getNumForLoops());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPDefaultClause(OMPDefaultClause *C) {
   Record.push_back(C->getDefaultKind());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getDefaultKindKwLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getDefaultKindKwLoc());
 }
 
 void OMPClauseWriter::VisitOMPProcBindClause(OMPProcBindClause *C) {
   Record.push_back(C->getProcBindKind());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getProcBindKindKwLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getProcBindKindKwLoc());
 }
 
 void OMPClauseWriter::VisitOMPScheduleClause(OMPScheduleClause *C) {
+  VisitOMPClauseWithPreInit(C);
   Record.push_back(C->getScheduleKind());
   Record.push_back(C->getFirstScheduleModifier());
   Record.push_back(C->getSecondScheduleModifier());
-  Writer->Writer.AddStmt(C->getChunkSize());
-  Writer->Writer.AddStmt(C->getHelperChunkSize());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getFirstScheduleModifierLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getSecondScheduleModifierLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getScheduleKindLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getCommaLoc(), Record);
+  Record.AddStmt(C->getChunkSize());
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getFirstScheduleModifierLoc());
+  Record.AddSourceLocation(C->getSecondScheduleModifierLoc());
+  Record.AddSourceLocation(C->getScheduleKindLoc());
+  Record.AddSourceLocation(C->getCommaLoc());
 }
 
 void OMPClauseWriter::VisitOMPOrderedClause(OMPOrderedClause *C) {
-  Writer->Writer.AddStmt(C->getNumForLoops());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getNumForLoops());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPNowaitClause(OMPNowaitClause *) {}
@@ -1851,213 +1888,323 @@
 
 void OMPClauseWriter::VisitOMPPrivateClause(OMPPrivateClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->private_copies()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
 }
 
 void OMPClauseWriter::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  VisitOMPClauseWithPreInit(C);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->private_copies()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->inits()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
 }
 
 void OMPClauseWriter::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  VisitOMPClauseWithPostUpdate(C);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   for (auto *E : C->private_copies())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->source_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->destination_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->assignment_ops())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
 }
 
 void OMPClauseWriter::VisitOMPSharedClause(OMPSharedClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
 }
 
 void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getColonLoc(), Record);
-  Writer->Writer.AddNestedNameSpecifierLoc(C->getQualifierLoc(), Record);
-  Writer->Writer.AddDeclarationNameInfo(C->getNameInfo(), Record);
+  VisitOMPClauseWithPostUpdate(C);
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getColonLoc());
+  Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
+  Record.AddDeclarationNameInfo(C->getNameInfo());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   for (auto *VE : C->privates())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   for (auto *E : C->lhs_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->rhs_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->reduction_ops())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
 }
 
 void OMPClauseWriter::VisitOMPLinearClause(OMPLinearClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getColonLoc(), Record);
+  VisitOMPClauseWithPostUpdate(C);
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getColonLoc());
   Record.push_back(C->getModifier());
-  Writer->Writer.AddSourceLocation(C->getModifierLoc(), Record);
+  Record.AddSourceLocation(C->getModifierLoc());
   for (auto *VE : C->varlists()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->privates()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->inits()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->updates()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
   for (auto *VE : C->finals()) {
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   }
-  Writer->Writer.AddStmt(C->getStep());
-  Writer->Writer.AddStmt(C->getCalcStep());
+  Record.AddStmt(C->getStep());
+  Record.AddStmt(C->getCalcStep());
 }
 
 void OMPClauseWriter::VisitOMPAlignedClause(OMPAlignedClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getColonLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getColonLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
-  Writer->Writer.AddStmt(C->getAlignment());
+    Record.AddStmt(VE);
+  Record.AddStmt(C->getAlignment());
 }
 
 void OMPClauseWriter::VisitOMPCopyinClause(OMPCopyinClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   for (auto *E : C->source_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->destination_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->assignment_ops())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
 }
 
 void OMPClauseWriter::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
   for (auto *E : C->source_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->destination_exprs())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
   for (auto *E : C->assignment_ops())
-    Writer->Writer.AddStmt(E);
+    Record.AddStmt(E);
 }
 
 void OMPClauseWriter::VisitOMPFlushClause(OMPFlushClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
 }
 
 void OMPClauseWriter::VisitOMPDependClause(OMPDependClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddSourceLocation(C->getLParenLoc());
   Record.push_back(C->getDependencyKind());
-  Writer->Writer.AddSourceLocation(C->getDependencyLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getColonLoc(), Record);
+  Record.AddSourceLocation(C->getDependencyLoc());
+  Record.AddSourceLocation(C->getColonLoc());
   for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+    Record.AddStmt(VE);
+  Record.AddStmt(C->getCounterValue());
 }
 
 void OMPClauseWriter::VisitOMPDeviceClause(OMPDeviceClause *C) {
-  Writer->Writer.AddStmt(C->getDevice());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getDevice());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) {
   Record.push_back(C->varlist_size());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.push_back(C->getUniqueDeclarationsNum());
+  Record.push_back(C->getTotalComponentListNum());
+  Record.push_back(C->getTotalComponentsNum());
+  Record.AddSourceLocation(C->getLParenLoc());
   Record.push_back(C->getMapTypeModifier());
   Record.push_back(C->getMapType());
-  Writer->Writer.AddSourceLocation(C->getMapLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getColonLoc(), Record);
-  for (auto *VE : C->varlists())
-    Writer->Writer.AddStmt(VE);
+  Record.AddSourceLocation(C->getMapLoc());
+  Record.AddSourceLocation(C->getColonLoc());
+  for (auto *E : C->varlists())
+    Record.AddStmt(E);
+  for (auto *D : C->all_decls())
+    Record.AddDeclRef(D);
+  for (auto N : C->all_num_lists())
+    Record.push_back(N);
+  for (auto N : C->all_lists_sizes())
+    Record.push_back(N);
+  for (auto &M : C->all_components()) {
+    Record.AddStmt(M.getAssociatedExpression());
+    Record.AddDeclRef(M.getAssociatedDeclaration());
+  }
 }
 
 void OMPClauseWriter::VisitOMPNumTeamsClause(OMPNumTeamsClause *C) {
-  Writer->Writer.AddStmt(C->getNumTeams());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getNumTeams());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPThreadLimitClause(OMPThreadLimitClause *C) {
-  Writer->Writer.AddStmt(C->getThreadLimit());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getThreadLimit());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPPriorityClause(OMPPriorityClause *C) {
-  Writer->Writer.AddStmt(C->getPriority());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getPriority());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPGrainsizeClause(OMPGrainsizeClause *C) {
-  Writer->Writer.AddStmt(C->getGrainsize());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getGrainsize());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPNumTasksClause(OMPNumTasksClause *C) {
-  Writer->Writer.AddStmt(C->getNumTasks());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getNumTasks());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPHintClause(OMPHintClause *C) {
-  Writer->Writer.AddStmt(C->getHint());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
+  Record.AddStmt(C->getHint());
+  Record.AddSourceLocation(C->getLParenLoc());
 }
 
 void OMPClauseWriter::VisitOMPDistScheduleClause(OMPDistScheduleClause *C) {
+  VisitOMPClauseWithPreInit(C);
   Record.push_back(C->getDistScheduleKind());
-  Writer->Writer.AddStmt(C->getChunkSize());
-  Writer->Writer.AddStmt(C->getHelperChunkSize());
-  Writer->Writer.AddSourceLocation(C->getLParenLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getDistScheduleKindLoc(), Record);
-  Writer->Writer.AddSourceLocation(C->getCommaLoc(), Record);
+  Record.AddStmt(C->getChunkSize());
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getDistScheduleKindLoc());
+  Record.AddSourceLocation(C->getCommaLoc());
+}
+
+void OMPClauseWriter::VisitOMPDefaultmapClause(OMPDefaultmapClause *C) {
+  Record.push_back(C->getDefaultmapKind());
+  Record.push_back(C->getDefaultmapModifier());
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getDefaultmapModifierLoc());
+  Record.AddSourceLocation(C->getDefaultmapKindLoc());
+}
+
+void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) {
+  Record.push_back(C->varlist_size());
+  Record.push_back(C->getUniqueDeclarationsNum());
+  Record.push_back(C->getTotalComponentListNum());
+  Record.push_back(C->getTotalComponentsNum());
+  Record.AddSourceLocation(C->getLParenLoc());
+  for (auto *E : C->varlists())
+    Record.AddStmt(E);
+  for (auto *D : C->all_decls())
+    Record.AddDeclRef(D);
+  for (auto N : C->all_num_lists())
+    Record.push_back(N);
+  for (auto N : C->all_lists_sizes())
+    Record.push_back(N);
+  for (auto &M : C->all_components()) {
+    Record.AddStmt(M.getAssociatedExpression());
+    Record.AddDeclRef(M.getAssociatedDeclaration());
+  }
+}
+
+void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) {
+  Record.push_back(C->varlist_size());
+  Record.push_back(C->getUniqueDeclarationsNum());
+  Record.push_back(C->getTotalComponentListNum());
+  Record.push_back(C->getTotalComponentsNum());
+  Record.AddSourceLocation(C->getLParenLoc());
+  for (auto *E : C->varlists())
+    Record.AddStmt(E);
+  for (auto *D : C->all_decls())
+    Record.AddDeclRef(D);
+  for (auto N : C->all_num_lists())
+    Record.push_back(N);
+  for (auto N : C->all_lists_sizes())
+    Record.push_back(N);
+  for (auto &M : C->all_components()) {
+    Record.AddStmt(M.getAssociatedExpression());
+    Record.AddDeclRef(M.getAssociatedDeclaration());
+  }
+}
+
+void OMPClauseWriter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
+  Record.push_back(C->varlist_size());
+  Record.push_back(C->getUniqueDeclarationsNum());
+  Record.push_back(C->getTotalComponentListNum());
+  Record.push_back(C->getTotalComponentsNum());
+  Record.AddSourceLocation(C->getLParenLoc());
+  for (auto *E : C->varlists())
+    Record.AddStmt(E);
+  for (auto *VE : C->private_copies())
+    Record.AddStmt(VE);
+  for (auto *VE : C->inits())
+    Record.AddStmt(VE);
+  for (auto *D : C->all_decls())
+    Record.AddDeclRef(D);
+  for (auto N : C->all_num_lists())
+    Record.push_back(N);
+  for (auto N : C->all_lists_sizes())
+    Record.push_back(N);
+  for (auto &M : C->all_components()) {
+    Record.AddStmt(M.getAssociatedExpression());
+    Record.AddDeclRef(M.getAssociatedDeclaration());
+  }
+}
+
+void OMPClauseWriter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
+  Record.push_back(C->varlist_size());
+  Record.push_back(C->getUniqueDeclarationsNum());
+  Record.push_back(C->getTotalComponentListNum());
+  Record.push_back(C->getTotalComponentsNum());
+  Record.AddSourceLocation(C->getLParenLoc());
+  for (auto *E : C->varlists())
+    Record.AddStmt(E);
+  for (auto *D : C->all_decls())
+    Record.AddDeclRef(D);
+  for (auto N : C->all_num_lists())
+    Record.push_back(N);
+  for (auto N : C->all_lists_sizes())
+    Record.push_back(N);
+  for (auto &M : C->all_components()) {
+    Record.AddStmt(M.getAssociatedExpression());
+    Record.AddDeclRef(M.getAssociatedDeclaration());
+  }
 }
 
 //===----------------------------------------------------------------------===//
 // OpenMP Directives.
 //===----------------------------------------------------------------------===//
 void ASTStmtWriter::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
-  Writer.AddSourceLocation(E->getLocStart(), Record);
-  Writer.AddSourceLocation(E->getLocEnd(), Record);
-  OMPClauseWriter ClauseWriter(this, Record);
+  Record.AddSourceLocation(E->getLocStart());
+  Record.AddSourceLocation(E->getLocEnd());
+  OMPClauseWriter ClauseWriter(Record);
   for (unsigned i = 0; i < E->getNumClauses(); ++i) {
     ClauseWriter.writeClause(E->getClause(i));
   }
   if (E->hasAssociatedStmt())
-    Writer.AddStmt(E->getAssociatedStmt());
+    Record.AddStmt(E->getAssociatedStmt());
 }
 
 void ASTStmtWriter::VisitOMPLoopDirective(OMPLoopDirective *D) {
@@ -2065,36 +2212,44 @@
   Record.push_back(D->getNumClauses());
   Record.push_back(D->getCollapsedNumber());
   VisitOMPExecutableDirective(D);
-  Writer.AddStmt(D->getIterationVariable());
-  Writer.AddStmt(D->getLastIteration());
-  Writer.AddStmt(D->getCalcLastIteration());
-  Writer.AddStmt(D->getPreCond());
-  Writer.AddStmt(D->getCond());
-  Writer.AddStmt(D->getInit());
-  Writer.AddStmt(D->getInc());
-  if (isOpenMPWorksharingDirective(D->getDirectiveKind())) {
-    Writer.AddStmt(D->getIsLastIterVariable());
-    Writer.AddStmt(D->getLowerBoundVariable());
-    Writer.AddStmt(D->getUpperBoundVariable());
-    Writer.AddStmt(D->getStrideVariable());
-    Writer.AddStmt(D->getEnsureUpperBound());
-    Writer.AddStmt(D->getNextLowerBound());
-    Writer.AddStmt(D->getNextUpperBound());
+  Record.AddStmt(D->getIterationVariable());
+  Record.AddStmt(D->getLastIteration());
+  Record.AddStmt(D->getCalcLastIteration());
+  Record.AddStmt(D->getPreCond());
+  Record.AddStmt(D->getCond());
+  Record.AddStmt(D->getInit());
+  Record.AddStmt(D->getInc());
+  Record.AddStmt(D->getPreInits());
+  if (isOpenMPWorksharingDirective(D->getDirectiveKind()) ||
+      isOpenMPTaskLoopDirective(D->getDirectiveKind()) ||
+      isOpenMPDistributeDirective(D->getDirectiveKind())) {
+    Record.AddStmt(D->getIsLastIterVariable());
+    Record.AddStmt(D->getLowerBoundVariable());
+    Record.AddStmt(D->getUpperBoundVariable());
+    Record.AddStmt(D->getStrideVariable());
+    Record.AddStmt(D->getEnsureUpperBound());
+    Record.AddStmt(D->getNextLowerBound());
+    Record.AddStmt(D->getNextUpperBound());
+    Record.AddStmt(D->getNumIterations());
+  }
+  if (isOpenMPLoopBoundSharingDirective(D->getDirectiveKind())) {
+    Record.AddStmt(D->getPrevLowerBoundVariable());
+    Record.AddStmt(D->getPrevUpperBoundVariable());
   }
   for (auto I : D->counters()) {
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
   }
   for (auto I : D->private_counters()) {
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
   }
   for (auto I : D->inits()) {
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
   }
   for (auto I : D->updates()) {
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
   }
   for (auto I : D->finals()) {
-    Writer.AddStmt(I);
+    Record.AddStmt(I);
   }
 }
 
@@ -2154,7 +2309,7 @@
   VisitStmt(D);
   Record.push_back(D->getNumClauses());
   VisitOMPExecutableDirective(D);
-  Writer.AddDeclarationNameInfo(D->getDirectiveName(), Record);
+  Record.AddDeclarationNameInfo(D->getDirectiveName());
   Code = serialization::STMT_OMP_CRITICAL_DIRECTIVE;
 }
 
@@ -2191,10 +2346,10 @@
   VisitStmt(D);
   Record.push_back(D->getNumClauses());
   VisitOMPExecutableDirective(D);
-  Writer.AddStmt(D->getX());
-  Writer.AddStmt(D->getV());
-  Writer.AddStmt(D->getExpr());
-  Writer.AddStmt(D->getUpdateExpr());
+  Record.AddStmt(D->getX());
+  Record.AddStmt(D->getV());
+  Record.AddStmt(D->getExpr());
+  Record.AddStmt(D->getUpdateExpr());
   Record.push_back(D->isXLHSInRHSPart() ? 1 : 0);
   Record.push_back(D->isPostfixUpdate() ? 1 : 0);
   Code = serialization::STMT_OMP_ATOMIC_DIRECTIVE;
@@ -2214,6 +2369,37 @@
   Code = serialization::STMT_OMP_TARGET_DATA_DIRECTIVE;
 }
 
+void ASTStmtWriter::VisitOMPTargetEnterDataDirective(
+    OMPTargetEnterDataDirective *D) {
+  VisitStmt(D);
+  Record.push_back(D->getNumClauses());
+  VisitOMPExecutableDirective(D);
+  Code = serialization::STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPTargetExitDataDirective(
+    OMPTargetExitDataDirective *D) {
+  VisitStmt(D);
+  Record.push_back(D->getNumClauses());
+  VisitOMPExecutableDirective(D);
+  Code = serialization::STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPTargetParallelDirective(
+    OMPTargetParallelDirective *D) {
+  VisitStmt(D);
+  Record.push_back(D->getNumClauses());
+  VisitOMPExecutableDirective(D);
+  Code = serialization::STMT_OMP_TARGET_PARALLEL_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPTargetParallelForDirective(
+    OMPTargetParallelForDirective *D) {
+  VisitOMPLoopDirective(D);
+  Record.push_back(D->hasCancel() ? 1 : 0);
+  Code = serialization::STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE;
+}
+
 void ASTStmtWriter::VisitOMPTaskyieldDirective(OMPTaskyieldDirective *D) {
   VisitStmt(D);
   VisitOMPExecutableDirective(D);
@@ -2290,6 +2476,48 @@
   Code = serialization::STMT_OMP_DISTRIBUTE_DIRECTIVE;
 }
 
+void ASTStmtWriter::VisitOMPTargetUpdateDirective(OMPTargetUpdateDirective *D) {
+  VisitStmt(D);
+  Record.push_back(D->getNumClauses());
+  VisitOMPExecutableDirective(D);
+  Code = serialization::STMT_OMP_TARGET_UPDATE_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPDistributeParallelForDirective(
+    OMPDistributeParallelForDirective *D) {
+  VisitOMPLoopDirective(D);
+  Code = serialization::STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPDistributeParallelForSimdDirective(
+    OMPDistributeParallelForSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+  Code = serialization::STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPDistributeSimdDirective(
+    OMPDistributeSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+  Code = serialization::STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPTargetParallelForSimdDirective(
+    OMPTargetParallelForSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+  Code = serialization::STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPTargetSimdDirective(OMPTargetSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+  Code = serialization::STMT_OMP_TARGET_SIMD_DIRECTIVE;
+}
+
+void ASTStmtWriter::VisitOMPTeamsDistributeDirective(
+    OMPTeamsDistributeDirective *D) {
+  VisitOMPLoopDirective(D);
+  Code = serialization::STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE;
+}
+
 //===----------------------------------------------------------------------===//
 // ASTWriter Implementation
 //===----------------------------------------------------------------------===//
@@ -2314,9 +2542,7 @@
 
 /// \brief Write the given substatement or subexpression to the
 /// bitstream.
-void ASTWriter::WriteSubStmt(Stmt *S,
-                             llvm::DenseMap<Stmt *, uint64_t> &SubStmtEntries,
-                             llvm::DenseSet<Stmt *> &ParentStmts) {
+void ASTWriter::WriteSubStmt(Stmt *S) {
   RecordData Record;
   ASTStmtWriter Writer(*this, Record);
   ++NumStatements;
@@ -2352,61 +2578,44 @@
   ParentStmtInserterRAII ParentStmtInserter(S, ParentStmts);
 #endif
 
-  // Redirect ASTWriter::AddStmt to collect sub-stmts.
-  SmallVector<Stmt *, 16> SubStmts;
-  CollectedStmts = &SubStmts;
-
-  Writer.Code = serialization::STMT_NULL_PTR;
-  Writer.AbbrevToUse = 0;
   Writer.Visit(S);
   
-#ifndef NDEBUG
-  if (Writer.Code == serialization::STMT_NULL_PTR) {
-    SourceManager &SrcMgr
-      = DeclIDs.begin()->first->getASTContext().getSourceManager();
-    S->dump(SrcMgr);
-    llvm_unreachable("Unhandled sub-statement writing AST file");
-  }
-#endif
-
-  // Revert ASTWriter::AddStmt.
-  CollectedStmts = &StmtsToEmit;
-
-  // Write the sub-stmts in reverse order, last to first. When reading them back
-  // we will read them in correct order by "pop"ing them from the Stmts stack.
-  // This simplifies reading and allows to store a variable number of sub-stmts
-  // without knowing it in advance.
-  while (!SubStmts.empty())
-    WriteSubStmt(SubStmts.pop_back_val(), SubStmtEntries, ParentStmts);
-  
-  Stream.EmitRecord(Writer.Code, Record, Writer.AbbrevToUse);
- 
-  SubStmtEntries[S] = Stream.GetCurrentBitNo();
+  uint64_t Offset = Writer.Emit();
+  SubStmtEntries[S] = Offset;
 }
 
 /// \brief Flush all of the statements that have been added to the
 /// queue via AddStmt().
-void ASTWriter::FlushStmts() {
-  RecordData Record;
-
+void ASTRecordWriter::FlushStmts() {
   // We expect to be the only consumer of the two temporary statement maps,
   // assert that they are empty.
-  assert(SubStmtEntries.empty() && "unexpected entries in sub-stmt map");
-  assert(ParentStmts.empty() && "unexpected entries in parent stmt map");
+  assert(Writer->SubStmtEntries.empty() && "unexpected entries in sub-stmt map");
+  assert(Writer->ParentStmts.empty() && "unexpected entries in parent stmt map");
 
   for (unsigned I = 0, N = StmtsToEmit.size(); I != N; ++I) {
-    WriteSubStmt(StmtsToEmit[I], SubStmtEntries, ParentStmts);
+    Writer->WriteSubStmt(StmtsToEmit[I]);
     
-    assert(N == StmtsToEmit.size() &&
-           "Substatement written via AddStmt rather than WriteSubStmt!");
+    assert(N == StmtsToEmit.size() && "record modified while being written!");
 
     // Note that we are at the end of a full expression. Any
     // expression records that follow this one are part of a different
     // expression.
-    Stream.EmitRecord(serialization::STMT_STOP, Record);
+    Writer->Stream.EmitRecord(serialization::STMT_STOP, ArrayRef<uint32_t>());
 
-    SubStmtEntries.clear();
-    ParentStmts.clear();
+    Writer->SubStmtEntries.clear();
+    Writer->ParentStmts.clear();
+  }
+
+  StmtsToEmit.clear();
+}
+
+void ASTRecordWriter::FlushSubStmts() {
+  // For a nested statement, write out the substatements in reverse order (so
+  // that a simple stack machine can be used when loading), and don't emit a
+  // STMT_STOP after each one.
+  for (unsigned I = 0, N = StmtsToEmit.size(); I != N; ++I) {
+    Writer->WriteSubStmt(StmtsToEmit[N - I - 1]);
+    assert(N == StmtsToEmit.size() && "record modified while being written!");
   }
 
   StmtsToEmit.clear();
diff --git a/lib/Serialization/GeneratePCH.cpp b/lib/Serialization/GeneratePCH.cpp
index 308fde8..47dce37 100644
--- a/lib/Serialization/GeneratePCH.cpp
+++ b/lib/Serialization/GeneratePCH.cpp
@@ -12,14 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Serialization/ASTWriter.h"
-#include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/SemaConsumer.h"
+#include "clang/Serialization/ASTWriter.h"
 #include "llvm/Bitcode/BitstreamWriter.h"
-#include <string>
 
 using namespace clang;
 
diff --git a/lib/Serialization/Makefile b/lib/Serialization/Makefile
deleted file mode 100644
index e89ddc3..0000000
--- a/lib/Serialization/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-##===- clang/lib/Serialization/Makefile --------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-#  This implements the semantic analyzer and AST builder library for the 
-#  C-Language front-end.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangSerialization
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/lib/Serialization/Module.cpp b/lib/Serialization/Module.cpp
index ca033b4..72b0861 100644
--- a/lib/Serialization/Module.cpp
+++ b/lib/Serialization/Module.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 #include "clang/Serialization/Module.h"
 #include "ASTReaderInternals.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -37,8 +36,6 @@
     LocalNumSelectors(0), SelectorOffsets(nullptr), BaseSelectorID(0),
     SelectorLookupTableData(nullptr), SelectorLookupTable(nullptr),
     LocalNumDecls(0), DeclOffsets(nullptr), BaseDeclID(0),
-    LocalNumCXXBaseSpecifiers(0), CXXBaseSpecifiersOffsets(nullptr),
-    LocalNumCXXCtorInitializers(0), CXXCtorInitializersOffsets(nullptr),
     FileSortedDecls(nullptr), NumFileSortedDecls(0),
     ObjCCategoriesMap(nullptr), LocalNumObjCCategoriesInMap(0),
     LocalNumTypes(0), TypeOffsets(nullptr), BaseTypeIndex(0)
diff --git a/lib/Serialization/ModuleFileExtension.cpp b/lib/Serialization/ModuleFileExtension.cpp
index 81dcfd6..5bd0a1c 100644
--- a/lib/Serialization/ModuleFileExtension.cpp
+++ b/lib/Serialization/ModuleFileExtension.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 #include "clang/Serialization/ModuleFileExtension.h"
 #include "llvm/ADT/Hashing.h"
-#include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
 ModuleFileExtension::~ModuleFileExtension() { }
diff --git a/lib/Serialization/ModuleManager.cpp b/lib/Serialization/ModuleManager.cpp
index 03a0266..8692f9e 100644
--- a/lib/Serialization/ModuleManager.cpp
+++ b/lib/Serialization/ModuleManager.cpp
@@ -11,14 +11,13 @@
 //  modules for the ASTReader.
 //
 //===----------------------------------------------------------------------===//
+#include "clang/Serialization/ModuleManager.h"
 #include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/ModuleMap.h"
 #include "clang/Serialization/GlobalModuleIndex.h"
-#include "clang/Serialization/ModuleManager.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
 #include <system_error>
 
 #ifndef NDEBUG
@@ -67,7 +66,7 @@
   // Look for the file entry. This only fails if the expected size or
   // modification time differ.
   const FileEntry *Entry;
-  if (Type == MK_ExplicitModule || Type == MK_PrebuiltModule) {
+  if (Type == MK_ExplicitModule) {
     // If we're not expecting to pull this file out of the module cache, it
     // might have a different mtime due to being moved across filesystems in
     // a distributed build. The size must still match, though. (As must the
@@ -320,11 +319,11 @@
     Queue.reserve(N);
     llvm::SmallVector<unsigned, 4> UnusedIncomingEdges;
     UnusedIncomingEdges.resize(size());
-    for (auto M = rbegin(), MEnd = rend(); M != MEnd; ++M) {
-      unsigned Size = (*M)->ImportedBy.size();
-      UnusedIncomingEdges[(*M)->Index] = Size;
+    for (ModuleFile *M : llvm::reverse(*this)) {
+      unsigned Size = M->ImportedBy.size();
+      UnusedIncomingEdges[M->Index] = Size;
       if (!Size)
-        Queue.push_back(*M);
+        Queue.push_back(M);
     }
 
     // Traverse the graph, making sure to visit a module before visiting any
diff --git a/lib/Serialization/MultiOnDiskHashTable.h b/lib/Serialization/MultiOnDiskHashTable.h
index 04dea83..fdbbb60 100644
--- a/lib/Serialization/MultiOnDiskHashTable.h
+++ b/lib/Serialization/MultiOnDiskHashTable.h
@@ -18,7 +18,11 @@
 #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
 #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/OnDiskHashTable.h"
 
diff --git a/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp b/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
index 26d42ba..6239c55 100644
--- a/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
+++ b/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
@@ -132,7 +132,7 @@
     void checkPostStmt(const ObjCArrayLiteral *AL,
                        CheckerContext &C) const;
   };
-}
+} // end anonymous namespace
 
 void NilArgChecker::warnIfNilExpr(const Expr *E,
                                   const char *Msg,
@@ -143,7 +143,6 @@
     if (ExplodedNode *N = C.generateErrorNode()) {
       generateBugReport(N, Msg, E->getSourceRange(), E, C);
     }
-
   }
 }
 
@@ -530,6 +529,7 @@
 class CFRetainReleaseChecker : public Checker< check::PreStmt<CallExpr> > {
   mutable std::unique_ptr<APIMisuse> BT;
   mutable IdentifierInfo *Retain, *Release, *MakeCollectable, *Autorelease;
+
 public:
   CFRetainReleaseChecker()
       : Retain(nullptr), Release(nullptr), MakeCollectable(nullptr),
@@ -538,7 +538,6 @@
 };
 } // end anonymous namespace
 
-
 void CFRetainReleaseChecker::checkPreStmt(const CallExpr *CE,
                                           CheckerContext &C) const {
   // If the CallExpr doesn't have exactly 1 argument just give up checking.
@@ -631,11 +630,10 @@
 public:
   void checkPreObjCMessage(const ObjCMethodCall &msg, CheckerContext &C) const;
 };
-}
+} // end anonymous namespace
 
 void ClassReleaseChecker::checkPreObjCMessage(const ObjCMethodCall &msg,
                                               CheckerContext &C) const {
-
   if (!BT) {
     BT.reset(new APIMisuse(
         this, "message incorrectly sent to class instead of class instance"));
@@ -692,7 +690,7 @@
 public:
   void checkPreObjCMessage(const ObjCMethodCall &msg, CheckerContext &C) const;
 };
-}
+} // end anonymous namespace
 
 /// isVariadicMessage - Returns whether the given message is a variadic message,
 /// where all arguments must be Objective-C types.
@@ -855,7 +853,7 @@
                                      const CallEvent *Call,
                                      PointerEscapeKind Kind) const;
 };
-}
+} // end anonymous namespace
 
 static bool isKnownNonNilCollectionType(QualType T) {
   const ObjCObjectPointerType *PT = T->getAs<ObjCObjectPointerType>();
@@ -983,7 +981,6 @@
   return assumeCollectionNonEmpty(C, State, CollectionS, Assumption);
 }
 
-
 /// If the fist block edge is a back edge, we are reentering the loop.
 static bool alreadyExecutedAtLeastOneLoopIteration(const ExplodedNode *N,
                                              const ObjCForCollectionStmt *FCS) {
@@ -1080,7 +1077,6 @@
 
     C.addTransition(State);
   }
-  return;
 }
 
 static SymbolRef getMethodReceiverIfKnownImmutable(const CallEvent *Call) {
@@ -1203,7 +1199,7 @@
 
   void checkPostObjCMessage(const ObjCMethodCall &M, CheckerContext &C) const;
 };
-}
+} // end anonymous namespace
 
 ProgramStateRef
 ObjCNonNilReturnValueChecker::assumeExprIsNonNull(const Expr *NonNullExpr,
diff --git a/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/lib/StaticAnalyzer/Checkers/CMakeLists.txt
index e1d98cf..639de00 100644
--- a/lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ b/lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -1,8 +1,3 @@
-clang_tablegen(Checkers.inc -gen-clang-sa-checkers
-  -I ${CMAKE_CURRENT_SOURCE_DIR}/../../../include
-  SOURCE Checkers.td
-  TARGET ClangSACheckers)
-
 set(LLVM_LINK_COMPONENTS
   Support
   )
@@ -27,6 +22,8 @@
   CheckerDocumentation.cpp
   ChrootChecker.cpp
   ClangCheckers.cpp
+  CloneChecker.cpp
+  CXXSelfAssignmentChecker.cpp
   DeadStoresChecker.cpp
   DebugCheckers.cpp
   DereferenceChecker.cpp
@@ -46,6 +43,9 @@
   MallocChecker.cpp
   MallocOverflowSecurityChecker.cpp
   MallocSizeofChecker.cpp
+  MPI-Checker/MPIBugReporter.cpp
+  MPI-Checker/MPIChecker.cpp
+  MPI-Checker/MPIFunctionClassifier.cpp
   NSAutoreleasePoolChecker.cpp
   NSErrorChecker.cpp
   NoReturnFunctionChecker.cpp
diff --git a/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 5130dd6..1f13dba 100644
--- a/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -22,7 +22,6 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -920,7 +919,7 @@
     // Invalidate and escape only indirect regions accessible through the source
     // buffer.
     if (IsSourceBuffer) {
-      ITraits.setTrait(R,
+      ITraits.setTrait(R->getBaseRegion(),
                        RegionAndSymbolInvalidationTraits::TK_PreserveContents);
       ITraits.setTrait(R, RegionAndSymbolInvalidationTraits::TK_SuppressEscape);
       CausesPointerEscape = true;
@@ -1837,6 +1836,8 @@
   const StringLiteral *s1StrLiteral = getCStringLiteral(C, state, s1, s1Val);
   const StringLiteral *s2StrLiteral = getCStringLiteral(C, state, s2, s2Val);
   bool canComputeResult = false;
+  SVal resultVal = svalBuilder.conjureSymbolVal(nullptr, CE, LCtx,
+                                                C.blockCount());
 
   if (s1StrLiteral && s2StrLiteral) {
     StringRef s1StrRef = s1StrLiteral->getString();
@@ -1870,28 +1871,29 @@
         s2StrRef = s2StrRef.substr(0, s2Term);
 
       // Use StringRef's comparison methods to compute the actual result.
-      int result;
+      int compareRes = ignoreCase ? s1StrRef.compare_lower(s2StrRef)
+                                  : s1StrRef.compare(s2StrRef);
 
-      if (ignoreCase) {
-        // Compare string 1 to string 2 the same way strcasecmp() does.
-        result = s1StrRef.compare_lower(s2StrRef);
-      } else {
-        // Compare string 1 to string 2 the same way strcmp() does.
-        result = s1StrRef.compare(s2StrRef);
+      // The strcmp function returns an integer greater than, equal to, or less
+      // than zero, [c11, p7.24.4.2].
+      if (compareRes == 0) {
+        resultVal = svalBuilder.makeIntVal(compareRes, CE->getType());
       }
-
-      // Build the SVal of the comparison and bind the return value.
-      SVal resultVal = svalBuilder.makeIntVal(result, CE->getType());
-      state = state->BindExpr(CE, LCtx, resultVal);
+      else {
+        DefinedSVal zeroVal = svalBuilder.makeIntVal(0, CE->getType());
+        // Constrain strcmp's result range based on the result of StringRef's
+        // comparison methods.
+        BinaryOperatorKind op = (compareRes == 1) ? BO_GT : BO_LT;
+        SVal compareWithZero =
+          svalBuilder.evalBinOp(state, op, resultVal, zeroVal,
+                                svalBuilder.getConditionType());
+        DefinedSVal compareWithZeroVal = compareWithZero.castAs<DefinedSVal>();
+        state = state->assume(compareWithZeroVal, true);
+      }
     }
   }
 
-  if (!canComputeResult) {
-    // Conjure a symbolic value. It's the best we can do.
-    SVal resultVal = svalBuilder.conjureSymbolVal(nullptr, CE, LCtx,
-                                                  C.blockCount());
-    state = state->BindExpr(CE, LCtx, resultVal);
-  }
+  state = state->BindExpr(CE, LCtx, resultVal);
 
   // Record this as a possible path.
   C.addTransition(state);
diff --git a/lib/StaticAnalyzer/Checkers/CXXSelfAssignmentChecker.cpp b/lib/StaticAnalyzer/Checkers/CXXSelfAssignmentChecker.cpp
new file mode 100644
index 0000000..7631322
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/CXXSelfAssignmentChecker.cpp
@@ -0,0 +1,62 @@
+//=== CXXSelfAssignmentChecker.cpp -----------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines CXXSelfAssignmentChecker, which tests all user-defined
+// copy and move assignment operators for the case of self-assignment, i.e.
+// where the parameter refers to the same object that the 'this' pointer
+// points to. The checker itself does not do any checks at all, but it
+// causes the analyzer to check every copy and move assignment operator twice:
+// once for when 'this' aliases with the parameter and once for when it may not.
+// It is the task of the other enabled checkers to find the bugs in these two
+// different cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+class CXXSelfAssignmentChecker : public Checker<check::BeginFunction> {
+public:
+  CXXSelfAssignmentChecker();
+  void checkBeginFunction(CheckerContext &C) const;
+};
+} // end anonymous namespace
+
+CXXSelfAssignmentChecker::CXXSelfAssignmentChecker() {}
+
+void CXXSelfAssignmentChecker::checkBeginFunction(CheckerContext &C) const {
+  if (!C.inTopFrame()) // Nothing to do unless in the top frame.
+    return;
+  const auto *LCtx = C.getLocationContext();
+  const auto *MD = dyn_cast<CXXMethodDecl>(LCtx->getDecl());
+  if (!MD) // The declaration being entered is not a C++ method.
+    return;
+  if (!MD->isCopyAssignmentOperator() && !MD->isMoveAssignmentOperator())
+    return; // Neither a copy nor a move assignment operator.
+  auto &State = C.getState();
+  auto &SVB = C.getSValBuilder();
+  auto ThisVal =
+      State->getSVal(SVB.getCXXThis(MD, LCtx->getCurrentStackFrame()));
+  auto Param = SVB.makeLoc(State->getRegion(MD->getParamDecl(0), LCtx));
+  auto ParamVal = State->getSVal(Param); // Value currently bound to Param.
+  ProgramStateRef SelfAssignState = State->bindLoc(Param, ThisVal);
+  C.addTransition(SelfAssignState); // Path where the parameter aliases 'this'.
+  ProgramStateRef NonSelfAssignState = State->bindLoc(Param, ParamVal);
+  C.addTransition(NonSelfAssignState); // Path keeping the parameter's value.
+}
+
+void ento::registerCXXSelfAssignmentChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<CXXSelfAssignmentChecker>();
+}
diff --git a/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
index 50915f3..ffb0adc 100644
--- a/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
+++ b/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
@@ -315,15 +315,7 @@
 /// Returns nullptr if the instance symbol cannot be found.
 const ObjCIvarRegion *
 ObjCDeallocChecker::getIvarRegionForIvarSymbol(SymbolRef IvarSym) const {
-  const MemRegion *RegionLoadedFrom = nullptr;
-  if (auto *DerivedSym = dyn_cast<SymbolDerived>(IvarSym))
-    RegionLoadedFrom = DerivedSym->getRegion();
-  else if (auto *RegionSym = dyn_cast<SymbolRegionValue>(IvarSym))
-    RegionLoadedFrom = RegionSym->getRegion();
-  else
-    return nullptr;
-
-  return dyn_cast<ObjCIvarRegion>(RegionLoadedFrom);
+  return dyn_cast_or_null<ObjCIvarRegion>(IvarSym->getOriginRegion());
 }
 
 /// Given a symbol for an ivar, return a symbol for the instance containing
@@ -533,7 +525,7 @@
     if (SelfRegion != IvarRegion->getSuperRegion())
       continue;
 
-      const ObjCIvarDecl *IvarDecl = IvarRegion->getDecl();
+    const ObjCIvarDecl *IvarDecl = IvarRegion->getDecl();
     // Prevent an inlined call to -dealloc in a super class from warning
     // about the values the subclass's -dealloc should release.
     if (IvarDecl->getContainingInterface() !=
diff --git a/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp b/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
index efa8139..86764c9 100644
--- a/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
+++ b/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
@@ -1,4 +1,4 @@
-//= CheckerDocumentation.cpp - Documentation checker ---------------*- C++ -*-//
+//===- CheckerDocumentation.cpp - Documentation checker ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -45,6 +45,7 @@
                                        check::Location,
                                        check::Bind,
                                        check::DeadSymbols,
+                                       check::BeginFunction,
                                        check::EndFunction,
                                        check::EndAnalysis,
                                        check::EndOfTranslationUnit,
@@ -57,7 +58,6 @@
                                        check::Event<ImplicitNullDerefEvent>,
                                        check::ASTDecl<FunctionDecl> > {
 public:
-
   /// \brief Pre-visit the Statement.
   ///
   /// The method will be called before the analyzer core processes the
@@ -147,7 +147,6 @@
   /// check::Bind
   void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &) const {}
 
-
   /// \brief Called whenever a symbol becomes dead.
   ///
   /// This callback should be used by the checkers to aggressively clean
@@ -198,7 +197,6 @@
                                  AnalysisManager &Mgr,
                                  BugReporter &BR) const {}
 
-
   /// \brief Evaluates function call.
   ///
   /// The analysis core threats all function calls in the same way. However, some
@@ -318,12 +316,10 @@
   void checkASTDecl(const FunctionDecl *D,
                     AnalysisManager &Mgr,
                     BugReporter &BR) const {}
-
 };
 
 void CheckerDocumentation::checkPostStmt(const DeclStmt *DS,
                                          CheckerContext &C) const {
-  return;
 }
 
 } // end namespace ento
diff --git a/lib/StaticAnalyzer/Checkers/Checkers.td b/lib/StaticAnalyzer/Checkers/Checkers.td
deleted file mode 100644
index a674af7..0000000
--- a/lib/StaticAnalyzer/Checkers/Checkers.td
+++ /dev/null
@@ -1,651 +0,0 @@
-//===--- Checkers.td - Static Analyzer Checkers -===-----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-include "clang/StaticAnalyzer/Checkers/CheckerBase.td"
-
-//===----------------------------------------------------------------------===//
-// Packages.
-//===----------------------------------------------------------------------===//
-
-// The Alpha package is for checkers that have too many false positives to be
-// turned on by default. The hierarchy under Alpha should be organized in the
-// hierarchy checkers would have had if they were truly at the top level.
-// (For example, a Cocoa-specific checker that is alpha should be in
-// alpha.osx.cocoa).
-def Alpha : Package<"alpha">;
-
-def Core : Package<"core">;
-def CoreBuiltin : Package<"builtin">, InPackage<Core>;
-def CoreUninitialized  : Package<"uninitialized">, InPackage<Core>;
-def CoreAlpha : Package<"core">, InPackage<Alpha>, Hidden;
-
-// The OptIn package is for checkers that are not alpha and that would normally
-// be on by default but where the driver does not have enough information to
-// determine when they are applicable. For example, localizability checkers fit
-// this criterion because the driver cannot determine whether a project is
-// localized or not -- this is best determined at the IDE or build-system level.
-//
-// The checker hierarchy under OptIn should mirror that in Alpha: checkers
-// should be organized as if they were at the top level.
-//
-// Note: OptIn is *not* intended for checkers that are too noisy to be on by
-// default. Such checkers belong in the alpha package.
-def OptIn : Package<"optin">;
-
-def Nullability : Package<"nullability">;
-
-def Cplusplus : Package<"cplusplus">;
-def CplusplusAlpha : Package<"cplusplus">, InPackage<Alpha>, Hidden;
-
-def DeadCode : Package<"deadcode">;
-def DeadCodeAlpha : Package<"deadcode">, InPackage<Alpha>, Hidden;
-
-def Performance : Package<"performance">, InPackage<OptIn>;
-
-def Security : Package <"security">;
-def InsecureAPI : Package<"insecureAPI">, InPackage<Security>;
-def SecurityAlpha : Package<"security">, InPackage<Alpha>, Hidden;
-def Taint : Package<"taint">, InPackage<SecurityAlpha>, Hidden;
-
-def Unix : Package<"unix">;
-def UnixAlpha : Package<"unix">, InPackage<Alpha>, Hidden;
-def CString : Package<"cstring">, InPackage<Unix>, Hidden;
-def CStringAlpha : Package<"cstring">, InPackage<UnixAlpha>, Hidden;
-
-def OSX : Package<"osx">;
-def OSXAlpha : Package<"osx">, InPackage<Alpha>, Hidden;
-def OSXOptIn : Package<"osx">, InPackage<OptIn>;
-
-def Cocoa : Package<"cocoa">, InPackage<OSX>;
-def CocoaAlpha : Package<"cocoa">, InPackage<OSXAlpha>, Hidden;
-def CocoaOptIn : Package<"cocoa">, InPackage<OSXOptIn>;
-
-def CoreFoundation : Package<"coreFoundation">, InPackage<OSX>;
-def Containers : Package<"containers">, InPackage<CoreFoundation>;
-
-def LocalizabilityAlpha : Package<"localizability">, InPackage<CocoaAlpha>;
-def LocalizabilityOptIn : Package<"localizability">, InPackage<CocoaOptIn>;
-
-def LLVM : Package<"llvm">;
-def Debug : Package<"debug">;
-
-//===----------------------------------------------------------------------===//
-// Core Checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Core in {
-
-def DereferenceChecker : Checker<"NullDereference">,
-  HelpText<"Check for dereferences of null pointers">,
-  DescFile<"DereferenceChecker.cpp">;
-
-def CallAndMessageChecker : Checker<"CallAndMessage">,
-  HelpText<"Check for logical errors for function calls and Objective-C message expressions (e.g., uninitialized arguments, null function pointers)">,
-  DescFile<"CallAndMessageChecker.cpp">;
-
-def NonNullParamChecker : Checker<"NonNullParamChecker">,
-  HelpText<"Check for null pointers passed as arguments to a function whose arguments are references or marked with the 'nonnull' attribute">,
-  DescFile<"NonNullParamChecker.cpp">;
-
-def VLASizeChecker : Checker<"VLASize">,
-  HelpText<"Check for declarations of VLA of undefined or zero size">,
-  DescFile<"VLASizeChecker.cpp">;
-
-def DivZeroChecker : Checker<"DivideZero">,
-  HelpText<"Check for division by zero">,
-  DescFile<"DivZeroChecker.cpp">;
-
-def UndefResultChecker : Checker<"UndefinedBinaryOperatorResult">,
-  HelpText<"Check for undefined results of binary operators">,
-  DescFile<"UndefResultChecker.cpp">;
-
-def StackAddrEscapeChecker : Checker<"StackAddressEscape">,
-  HelpText<"Check that addresses to stack memory do not escape the function">,
-  DescFile<"StackAddrEscapeChecker.cpp">;
-
-def DynamicTypePropagation : Checker<"DynamicTypePropagation">,
-  HelpText<"Generate dynamic type information">,
-  DescFile<"DynamicTypePropagation.cpp">;
-
-} // end "core"
-
-let ParentPackage = CoreAlpha in {
-
-def BoolAssignmentChecker : Checker<"BoolAssignment">,
-  HelpText<"Warn about assigning non-{0,1} values to Boolean variables">,
-  DescFile<"BoolAssignmentChecker.cpp">;
-
-def CastSizeChecker : Checker<"CastSize">,
-  HelpText<"Check when casting a malloc'ed type T, whether the size is a multiple of the size of T">,
-  DescFile<"CastSizeChecker.cpp">;
-
-def CastToStructChecker : Checker<"CastToStruct">,
-  HelpText<"Check for cast from non-struct pointer to struct pointer">,
-  DescFile<"CastToStructChecker.cpp">;
-
-def IdenticalExprChecker : Checker<"IdenticalExpr">,
-  HelpText<"Warn about unintended use of identical expressions in operators">,
-  DescFile<"IdenticalExprChecker.cpp">;
-
-def FixedAddressChecker : Checker<"FixedAddr">,
-  HelpText<"Check for assignment of a fixed address to a pointer">,
-  DescFile<"FixedAddressChecker.cpp">;
-
-def PointerArithChecker : Checker<"PointerArithm">,
-  HelpText<"Check for pointer arithmetic on locations other than array elements">,
-  DescFile<"PointerArithChecker">;
-
-def PointerSubChecker : Checker<"PointerSub">,
-  HelpText<"Check for pointer subtractions on two pointers pointing to different memory chunks">,
-  DescFile<"PointerSubChecker">;
-
-def SizeofPointerChecker : Checker<"SizeofPtr">,
-  HelpText<"Warn about unintended use of sizeof() on pointer expressions">,
-  DescFile<"CheckSizeofPointer.cpp">;
-
-def CallAndMessageUnInitRefArg : Checker<"CallAndMessageUnInitRefArg">,
-  HelpText<"Check for logical errors for function calls and Objective-C message expressions (e.g., uninitialized arguments, null function pointers, and pointer to undefined variables)">,
-  DescFile<"CallAndMessageChecker.cpp">;
-
-def TestAfterDivZeroChecker : Checker<"TestAfterDivZero">,
-  HelpText<"Check for division by variable that is later compared against 0. Either the comparison is useless or there is division by zero.">,
-  DescFile<"TestAfterDivZeroChecker.cpp">;
-
-def DynamicTypeChecker : Checker<"DynamicTypeChecker">,
-  HelpText<"Check for cases where the dynamic and the static type of an object are unrelated.">,
-  DescFile<"DynamicTypeChecker.cpp">;
-
-} // end "alpha.core"
-
-let ParentPackage = Nullability in {
-
-def NullPassedToNonnullChecker : Checker<"NullPassedToNonnull">,
-  HelpText<"Warns when a null pointer is passed to a pointer which has a _Nonnull type.">,
-  DescFile<"NullabilityChecker.cpp">;
-
-def NullReturnedFromNonnullChecker : Checker<"NullReturnedFromNonnull">,
-  HelpText<"Warns when a null pointer is returned from a function that has _Nonnull return type.">,
-  DescFile<"NullabilityChecker.cpp">;
-
-def NullableDereferencedChecker : Checker<"NullableDereferenced">,
-  HelpText<"Warns when a nullable pointer is dereferenced.">,
-  DescFile<"NullabilityChecker.cpp">;
-
-def NullablePassedToNonnullChecker : Checker<"NullablePassedToNonnull">,
-  HelpText<"Warns when a nullable pointer is passed to a pointer which has a _Nonnull type.">,
-  DescFile<"NullabilityChecker.cpp">;
-
-def NullableReturnedFromNonnullChecker : Checker<"NullablePassedToNonnull">,
-  HelpText<"Warns when a nullable pointer is returned from a function that has _Nonnull return type.">,
-  DescFile<"NullabilityChecker.cpp">;
-
-} // end "nullability"
-
-//===----------------------------------------------------------------------===//
-// Evaluate "builtin" functions.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = CoreBuiltin in {
-
-def NoReturnFunctionChecker : Checker<"NoReturnFunctions">,
-  HelpText<"Evaluate \"panic\" functions that are known to not return to the caller">,
-  DescFile<"NoReturnFunctionChecker.cpp">;
-
-def BuiltinFunctionChecker : Checker<"BuiltinFunctions">,
-  HelpText<"Evaluate compiler builtin functions (e.g., alloca())">,
-  DescFile<"BuiltinFunctionChecker.cpp">;
-
-} // end "core.builtin"
-
-//===----------------------------------------------------------------------===//
-// Uninitialized values checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = CoreUninitialized in {
-
-def UndefinedArraySubscriptChecker : Checker<"ArraySubscript">,
-  HelpText<"Check for uninitialized values used as array subscripts">,
-  DescFile<"UndefinedArraySubscriptChecker.cpp">;
-
-def UndefinedAssignmentChecker : Checker<"Assign">,
-  HelpText<"Check for assigning uninitialized values">,
-  DescFile<"UndefinedAssignmentChecker.cpp">;
-
-def UndefBranchChecker : Checker<"Branch">,
-  HelpText<"Check for uninitialized values used as branch conditions">,
-  DescFile<"UndefBranchChecker.cpp">;
-
-def UndefCapturedBlockVarChecker : Checker<"CapturedBlockVariable">,
-  HelpText<"Check for blocks that capture uninitialized values">,
-  DescFile<"UndefCapturedBlockVarChecker.cpp">;
-
-def ReturnUndefChecker : Checker<"UndefReturn">,
-  HelpText<"Check for uninitialized values being returned to the caller">,
-  DescFile<"ReturnUndefChecker.cpp">;
-
-} // end "core.uninitialized"
-
-//===----------------------------------------------------------------------===//
-// C++ checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Cplusplus in {
-
-def NewDeleteChecker : Checker<"NewDelete">,
-  HelpText<"Check for double-free and use-after-free problems. Traces memory managed by new/delete.">,
-  DescFile<"MallocChecker.cpp">;
-
-def NewDeleteLeaksChecker : Checker<"NewDeleteLeaks">,
-  HelpText<"Check for memory leaks. Traces memory managed by new/delete.">,
-  DescFile<"MallocChecker.cpp">;
-
-} // end: "cplusplus"
-
-let ParentPackage = CplusplusAlpha in {
-
-def VirtualCallChecker : Checker<"VirtualCall">,
-  HelpText<"Check virtual function calls during construction or destruction">,
-  DescFile<"VirtualCallChecker.cpp">;
-
-} // end: "alpha.cplusplus"
-
-//===----------------------------------------------------------------------===//
-// Deadcode checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = DeadCode in {
-
-def DeadStoresChecker : Checker<"DeadStores">,
-  HelpText<"Check for values stored to variables that are never read afterwards">,
-  DescFile<"DeadStoresChecker.cpp">;
-} // end DeadCode
-
-let ParentPackage = DeadCodeAlpha in {
-
-def UnreachableCodeChecker : Checker<"UnreachableCode">,
-  HelpText<"Check unreachable code">,
-  DescFile<"UnreachableCodeChecker.cpp">;
-
-} // end "alpha.deadcode"
-
-//===----------------------------------------------------------------------===//
-// Performance checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Performance in {
-
-def PaddingChecker : Checker<"Padding">,
-  HelpText<"Check for excessively padded structs.">,
-  DescFile<"PaddingChecker.cpp">;
-
-} // end: "padding"
-
-//===----------------------------------------------------------------------===//
-// Security checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = InsecureAPI in {
-  def gets : Checker<"gets">,
-    HelpText<"Warn on uses of the 'gets' function">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def getpw : Checker<"getpw">,
-    HelpText<"Warn on uses of the 'getpw' function">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def mktemp : Checker<"mktemp">,
-    HelpText<"Warn on uses of the 'mktemp' function">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def mkstemp : Checker<"mkstemp">,
-    HelpText<"Warn when 'mkstemp' is passed fewer than 6 X's in the format string">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def rand : Checker<"rand">,
-    HelpText<"Warn on uses of the 'rand', 'random', and related functions">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def strcpy : Checker<"strcpy">,
-    HelpText<"Warn on uses of the 'strcpy' and 'strcat' functions">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def vfork : Checker<"vfork">,
-    HelpText<"Warn on uses of the 'vfork' function">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-  def UncheckedReturn : Checker<"UncheckedReturn">,
-    HelpText<"Warn on uses of functions whose return values must be always checked">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-}
-let ParentPackage = Security in {
-  def FloatLoopCounter : Checker<"FloatLoopCounter">,
-    HelpText<"Warn on using a floating point value as a loop counter (CERT: FLP30-C, FLP30-CPP)">,
-    DescFile<"CheckSecuritySyntaxOnly.cpp">;
-}
-
-let ParentPackage = SecurityAlpha in {
-
-def ArrayBoundChecker : Checker<"ArrayBound">,
-  HelpText<"Warn about buffer overflows (older checker)">,
-  DescFile<"ArrayBoundChecker.cpp">;
-
-def ArrayBoundCheckerV2 : Checker<"ArrayBoundV2">,
-  HelpText<"Warn about buffer overflows (newer checker)">,
-  DescFile<"ArrayBoundCheckerV2.cpp">;
-
-def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
-  HelpText<"Check for an out-of-bound pointer being returned to callers">,
-  DescFile<"ReturnPointerRangeChecker.cpp">;
-
-def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
-  HelpText<"Check for overflows in the arguments to malloc()">,
-  DescFile<"MallocOverflowSecurityChecker.cpp">;
-
-} // end "alpha.security"
-
-//===----------------------------------------------------------------------===//
-// Taint checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Taint in {
-
-def GenericTaintChecker : Checker<"TaintPropagation">,
-  HelpText<"Generate taint information used by other checkers">,
-  DescFile<"GenericTaintChecker.cpp">;
-
-} // end "alpha.security.taint"
-
-//===----------------------------------------------------------------------===//
-// Unix API checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Unix in {
-
-def UnixAPIChecker : Checker<"API">,
-  HelpText<"Check calls to various UNIX/Posix functions">,
-  DescFile<"UnixAPIChecker.cpp">;
-
-def MallocChecker: Checker<"Malloc">,
-  HelpText<"Check for memory leaks, double free, and use-after-free problems. Traces memory managed by malloc()/free().">,
-  DescFile<"MallocChecker.cpp">;
-
-def MallocSizeofChecker : Checker<"MallocSizeof">,
-  HelpText<"Check for dubious malloc arguments involving sizeof">,
-  DescFile<"MallocSizeofChecker.cpp">;
-
-def MismatchedDeallocatorChecker : Checker<"MismatchedDeallocator">,
-  HelpText<"Check for mismatched deallocators.">,
-  DescFile<"MallocChecker.cpp">;
-
-def VforkChecker : Checker<"Vfork">,
-  HelpText<"Check for proper usage of vfork">,
-  DescFile<"VforkChecker.cpp">;
-
-} // end "unix"
-
-let ParentPackage = UnixAlpha in {
-
-def ChrootChecker : Checker<"Chroot">,
-  HelpText<"Check improper use of chroot">,
-  DescFile<"ChrootChecker.cpp">;
-
-def PthreadLockChecker : Checker<"PthreadLock">,
-  HelpText<"Simple lock -> unlock checker">,
-  DescFile<"PthreadLockChecker.cpp">;
-
-def StreamChecker : Checker<"Stream">,
-  HelpText<"Check stream handling functions">,
-  DescFile<"StreamChecker.cpp">;
-
-def SimpleStreamChecker : Checker<"SimpleStream">,
-  HelpText<"Check for misuses of stream APIs">,
-  DescFile<"SimpleStreamChecker.cpp">;
-
-} // end "alpha.unix"
-
-let ParentPackage = CString in {
-
-def CStringNullArg : Checker<"NullArg">,
-  HelpText<"Check for null pointers being passed as arguments to C string functions">,
-  DescFile<"CStringChecker.cpp">;
-
-def CStringSyntaxChecker : Checker<"BadSizeArg">,
-  HelpText<"Check the size argument passed into C string functions for common erroneous patterns">,
-  DescFile<"CStringSyntaxChecker.cpp">;
-}
-
-let ParentPackage = CStringAlpha in {
-
-def CStringOutOfBounds : Checker<"OutOfBounds">,
-  HelpText<"Check for out-of-bounds access in string functions">,
-  DescFile<"CStringChecker.cpp">;
-
-def CStringBufferOverlap : Checker<"BufferOverlap">,
-  HelpText<"Checks for overlap in two buffer arguments">,
-  DescFile<"CStringChecker.cpp">;
-
-def CStringNotNullTerm : Checker<"NotNullTerminated">,
-  HelpText<"Check for arguments which are not null-terminating strings">,
-  DescFile<"CStringChecker.cpp">;
-}
-
-//===----------------------------------------------------------------------===//
-// Mac OS X, Cocoa, and Core Foundation checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = OSX in {
-
-def MacOSXAPIChecker : Checker<"API">,
-  InPackage<OSX>,
-  HelpText<"Check for proper uses of various Apple APIs">,
-  DescFile<"MacOSXAPIChecker.cpp">;
-
-def MacOSKeychainAPIChecker : Checker<"SecKeychainAPI">,
-  InPackage<OSX>,
-  HelpText<"Check for proper uses of Secure Keychain APIs">,
-  DescFile<"MacOSKeychainAPIChecker.cpp">;
-
-} // end "osx"
-
-let ParentPackage = Cocoa in {
-
-def ObjCAtSyncChecker : Checker<"AtSync">,
-  HelpText<"Check for nil pointers used as mutexes for @synchronized">,
-  DescFile<"ObjCAtSyncChecker.cpp">;
-
-def NilArgChecker : Checker<"NilArg">,
-  HelpText<"Check for prohibited nil arguments to ObjC method calls">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def ClassReleaseChecker : Checker<"ClassRelease">,
-  HelpText<"Check for sending 'retain', 'release', or 'autorelease' directly to a Class">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def VariadicMethodTypeChecker : Checker<"VariadicMethodTypes">,
-  HelpText<"Check for passing non-Objective-C types to variadic collection "
-           "initialization methods that expect only Objective-C types">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def NSAutoreleasePoolChecker : Checker<"NSAutoreleasePool">,
-  HelpText<"Warn for suboptimal uses of NSAutoreleasePool in Objective-C GC mode">,
-  DescFile<"NSAutoreleasePoolChecker.cpp">;
-
-def ObjCMethSigsChecker : Checker<"IncompatibleMethodTypes">,
-  HelpText<"Warn about Objective-C method signatures with type incompatibilities">,
-  DescFile<"CheckObjCInstMethSignature.cpp">;
-
-def ObjCUnusedIvarsChecker : Checker<"UnusedIvars">,
-  HelpText<"Warn about private ivars that are never used">,
-  DescFile<"ObjCUnusedIVarsChecker.cpp">;
-
-def ObjCSelfInitChecker : Checker<"SelfInit">,
-  HelpText<"Check that 'self' is properly initialized inside an initializer method">,
-  DescFile<"ObjCSelfInitChecker.cpp">;
-
-def ObjCLoopChecker : Checker<"Loops">,
-  HelpText<"Improved modeling of loops using Cocoa collection types">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def ObjCNonNilReturnValueChecker : Checker<"NonNilReturnValue">,
-  HelpText<"Model the APIs that are guaranteed to return a non-nil value">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def ObjCSuperCallChecker : Checker<"MissingSuperCall">,
-  HelpText<"Warn about Objective-C methods that lack a necessary call to super">,
-  DescFile<"ObjCMissingSuperCallChecker.cpp">;
-
-def NSErrorChecker : Checker<"NSError">,
-  HelpText<"Check usage of NSError** parameters">,
-  DescFile<"NSErrorChecker.cpp">;
-
-def RetainCountChecker : Checker<"RetainCount">,
-  HelpText<"Check for leaks and improper reference count management">,
-  DescFile<"RetainCountChecker.cpp">;
-
-def ObjCGenericsChecker : Checker<"ObjCGenerics">,
-  HelpText<"Check for type errors when using Objective-C generics">,
-  DescFile<"DynamicTypePropagation.cpp">;
-
-def ObjCDeallocChecker : Checker<"Dealloc">,
-  HelpText<"Warn about Objective-C classes that lack a correct implementation of -dealloc">,
-  DescFile<"CheckObjCDealloc.cpp">;
-
-def ObjCSuperDeallocChecker : Checker<"SuperDealloc">,
-  HelpText<"Warn about improper use of '[super dealloc]' in Objective-C">,
-  DescFile<"ObjCSuperDeallocChecker.cpp">;
-
-} // end "osx.cocoa"
-
-let ParentPackage = CocoaAlpha in {
-
-def InstanceVariableInvalidation : Checker<"InstanceVariableInvalidation">,
-  HelpText<"Check that the invalidatable instance variables are invalidated in the methods annotated with objc_instance_variable_invalidator">,
-  DescFile<"IvarInvalidationChecker.cpp">;
-
-def MissingInvalidationMethod : Checker<"MissingInvalidationMethod">,
-  HelpText<"Check that the invalidation methods are present in classes that contain invalidatable instance variables">,
-  DescFile<"IvarInvalidationChecker.cpp">;
-
-def DirectIvarAssignment : Checker<"DirectIvarAssignment">,
-  HelpText<"Check for direct assignments to instance variables">,
-  DescFile<"DirectIvarAssignment.cpp">;
-
-def DirectIvarAssignmentForAnnotatedFunctions : Checker<"DirectIvarAssignmentForAnnotatedFunctions">,
-  HelpText<"Check for direct assignments to instance variables in the methods annotated with objc_no_direct_instance_variable_assignment">,
-  DescFile<"DirectIvarAssignment.cpp">;
-
-} // end "alpha.osx.cocoa"
-
-let ParentPackage = CoreFoundation in {
-
-def CFNumberCreateChecker : Checker<"CFNumber">,
-  HelpText<"Check for proper uses of CFNumberCreate">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def CFRetainReleaseChecker : Checker<"CFRetainRelease">,
-  HelpText<"Check for null arguments to CFRetain/CFRelease/CFMakeCollectable">,
-  DescFile<"BasicObjCFoundationChecks.cpp">;
-
-def CFErrorChecker : Checker<"CFError">,
-  HelpText<"Check usage of CFErrorRef* parameters">,
-  DescFile<"NSErrorChecker.cpp">;
-}
-
-let ParentPackage = Containers in {
-def ObjCContainersASTChecker : Checker<"PointerSizedValues">,
-  HelpText<"Warns if 'CFArray', 'CFDictionary', 'CFSet' are created with non-pointer-size values">,
-  DescFile<"ObjCContainersASTChecker.cpp">;
-
-def ObjCContainersChecker : Checker<"OutOfBounds">,
-  HelpText<"Checks for index out-of-bounds when using 'CFArray' API">,
-  DescFile<"ObjCContainersChecker.cpp">;
-
-}
-
-let ParentPackage = LocalizabilityOptIn in {
-def NonLocalizedStringChecker : Checker<"NonLocalizedStringChecker">,
-  HelpText<"Warns about uses of non-localized NSStrings passed to UI methods expecting localized NSStrings">,
-  DescFile<"LocalizationChecker.cpp">;
-
-def EmptyLocalizationContextChecker : Checker<"EmptyLocalizationContextChecker">,
-  HelpText<"Check that NSLocalizedString macros include a comment for context">,
-  DescFile<"LocalizationChecker.cpp">;
-}
-
-let ParentPackage = LocalizabilityAlpha in {
-def PluralMisuseChecker : Checker<"PluralMisuseChecker">,
-  HelpText<"Warns against using one vs. many plural pattern in code when generating localized strings.">,
-  DescFile<"LocalizationChecker.cpp">;
-}
-
-//===----------------------------------------------------------------------===//
-// Checkers for LLVM development.
-//===----------------------------------------------------------------------===//
-
-def LLVMConventionsChecker : Checker<"Conventions">,
-  InPackage<LLVM>,
-  HelpText<"Check code for LLVM codebase conventions">,
-  DescFile<"LLVMConventionsChecker.cpp">;
-
-//===----------------------------------------------------------------------===//
-// Debugging checkers (for analyzer development).
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Debug in {
-
-def DominatorsTreeDumper : Checker<"DumpDominators">,
-  HelpText<"Print the dominance tree for a given CFG">,
-  DescFile<"DebugCheckers.cpp">;
-
-def LiveVariablesDumper : Checker<"DumpLiveVars">,
-  HelpText<"Print results of live variable analysis">,
-  DescFile<"DebugCheckers.cpp">;
-
-def CFGViewer : Checker<"ViewCFG">,
-  HelpText<"View Control-Flow Graphs using GraphViz">,
-  DescFile<"DebugCheckers.cpp">;
-
-def CFGDumper : Checker<"DumpCFG">,
-  HelpText<"Display Control-Flow Graphs">,
-  DescFile<"DebugCheckers.cpp">;
-
-def CallGraphViewer : Checker<"ViewCallGraph">,
-  HelpText<"View Call Graph using GraphViz">,
-  DescFile<"DebugCheckers.cpp">;
-
-def CallGraphDumper : Checker<"DumpCallGraph">,
-  HelpText<"Display Call Graph">,
-  DescFile<"DebugCheckers.cpp">;
-
-def ConfigDumper : Checker<"ConfigDumper">,
-  HelpText<"Dump config table">,
-  DescFile<"DebugCheckers.cpp">;
-
-def TraversalDumper : Checker<"DumpTraversal">,
-  HelpText<"Print branch conditions as they are traversed by the engine">,
-  DescFile<"TraversalChecker.cpp">;
-
-def CallDumper : Checker<"DumpCalls">,
-  HelpText<"Print calls as they are traversed by the engine">,
-  DescFile<"TraversalChecker.cpp">;
-
-def AnalyzerStatsChecker : Checker<"Stats">,
-  HelpText<"Emit warnings with analyzer statistics">,
-  DescFile<"AnalyzerStatsChecker.cpp">;
-
-def TaintTesterChecker : Checker<"TaintTest">,
-  HelpText<"Mark tainted symbols as such.">,
-  DescFile<"TaintTesterChecker.cpp">;
-
-def ExprInspectionChecker : Checker<"ExprInspection">,
-  HelpText<"Check the analyzer's understanding of expressions">,
-  DescFile<"ExprInspectionChecker.cpp">;
-
-def ExplodedGraphViewer : Checker<"ViewExplodedGraph">,
-  HelpText<"View Exploded Graphs using GraphViz">,
-  DescFile<"DebugCheckers.cpp">;
-
-def BugHashDumper : Checker<"DumpBugHash">,
-  HelpText<"Dump the bug hash for all statements.">,
-  DescFile<"DebugCheckers.cpp">;
-
-} // end "debug"
diff --git a/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp b/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
index 3ad1996..9e9939a 100644
--- a/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
@@ -1,4 +1,4 @@
-//===- Chrootchecker.cpp -------- Basic security checks ----------*- C++ -*-==//
+//===- ChrootChecker.cpp -------- Basic security checks ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -19,7 +19,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
-#include "llvm/ADT/ImmutableMap.h"
+
 using namespace clang;
 using namespace ento;
 
@@ -148,8 +148,6 @@
         C.emitReport(llvm::make_unique<BugReport>(
             *BT_BreakJail, BT_BreakJail->getDescription(), N));
       }
-
-  return;
 }
 
 void ento::registerChrootChecker(CheckerManager &mgr) {
diff --git a/lib/StaticAnalyzer/Checkers/ClangCheckers.cpp b/lib/StaticAnalyzer/Checkers/ClangCheckers.cpp
index 77a5a72..fb9e366 100644
--- a/lib/StaticAnalyzer/Checkers/ClangCheckers.cpp
+++ b/lib/StaticAnalyzer/Checkers/ClangCheckers.cpp
@@ -27,6 +27,6 @@
 #define GET_CHECKERS
 #define CHECKER(FULLNAME,CLASS,DESCFILE,HELPTEXT,GROUPINDEX,HIDDEN)    \
   registry.addChecker(register##CLASS, FULLNAME, HELPTEXT);
-#include "Checkers.inc"
+#include "clang/StaticAnalyzer/Checkers/Checkers.inc"
 #undef GET_CHECKERS
 }
diff --git a/lib/StaticAnalyzer/Checkers/ClangSACheckers.h b/lib/StaticAnalyzer/Checkers/ClangSACheckers.h
index 05b4a61..d6e96f2 100644
--- a/lib/StaticAnalyzer/Checkers/ClangSACheckers.h
+++ b/lib/StaticAnalyzer/Checkers/ClangSACheckers.h
@@ -26,7 +26,7 @@
 #define GET_CHECKERS
 #define CHECKER(FULLNAME,CLASS,CXXFILE,HELPTEXT,GROUPINDEX,HIDDEN)    \
   void register##CLASS(CheckerManager &mgr);
-#include "Checkers.inc"
+#include "clang/StaticAnalyzer/Checkers/Checkers.inc"
 #undef CHECKER
 #undef GET_CHECKERS
 
diff --git a/lib/StaticAnalyzer/Checkers/CloneChecker.cpp b/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
new file mode 100644
index 0000000..87c813d
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -0,0 +1,96 @@
+//===--- CloneChecker.cpp - Clone detection checker -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// CloneChecker is a checker that reports clones in the current translation
+/// unit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/Analysis/CloneDetection.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+class CloneChecker
+    : public Checker<check::ASTCodeBody, check::EndOfTranslationUnit> {
+  mutable CloneDetector Detector; // mutable: used by const callbacks.
+
+public:
+  void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
+                        BugReporter &BR) const;
+
+  void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                 AnalysisManager &Mgr, BugReporter &BR) const;
+};
+} // end anonymous namespace
+
+void CloneChecker::checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
+                                    BugReporter &BR) const {
+  // Every statement that should be included in the search for clones needs to
+  // be passed to the CloneDetector; Mgr and BR are unused here.
+  Detector.analyzeCodeBody(D);
+}
+
+/// Reports the clone groups found in the translation unit: per group, the
+/// first clone in source order is a warning and every other clone is a note.
+void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                             AnalysisManager &Mgr,
+                                             BugReporter &BR) const {
+  // At this point, every statement in the translation unit has been analyzed by
+  // the CloneDetector. The only thing left to do is to report the found clones.
+  int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
+      "MinimumCloneComplexity", 10, this);
+  assert(MinComplexity >= 0);
+
+  SourceManager &SM = BR.getSourceManager();
+
+  std::vector<CloneDetector::CloneGroup> CloneGroups;
+  Detector.findClones(CloneGroups, MinComplexity);
+
+  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+  unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
+                                               "Detected code clone.");
+  unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
+                                               "Related code clone is here.");
+
+  for (CloneDetector::CloneGroup &Group : CloneGroups) {
+    // For readability reasons we sort the clones by source position: start
+    // location first, end location as tie-breaker. Comparing the keys one
+    // after the other keeps this a strict weak ordering, as std::sort needs.
+    std::sort(Group.Sequences.begin(), Group.Sequences.end(),
+              [&SM](const StmtSequence &LHS, const StmtSequence &RHS) {
+                SourceLocation LS = LHS.getStartLoc(), RS = RHS.getStartLoc();
+                SourceLocation LE = LHS.getEndLoc(), RE = RHS.getEndLoc();
+                return SM.isBeforeInTranslationUnit(LS, RS) ||
+                       (!SM.isBeforeInTranslationUnit(RS, LS) &&
+                        SM.isBeforeInTranslationUnit(LE, RE));
+              });
+
+    // We group the clones by printing the first as a warning and all others
+    // as a note.
+    DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);
+    for (unsigned i = 1; i < Group.Sequences.size(); ++i)
+      DiagEngine.Report(Group.Sequences[i].getStartLoc(), NoteID);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Register CloneChecker with the CheckerManager
+//===----------------------------------------------------------------------===//
+
+void ento::registerCloneChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<CloneChecker>();
+}
diff --git a/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp b/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp
index f2a269a..8ca2a24 100644
--- a/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp
@@ -278,6 +278,8 @@
           RHS = RHS->IgnoreParenCasts();
 
           QualType T = VD->getType();
+          if (T.isVolatileQualified())
+            return;
           if (T->isPointerType() || T->isObjCObjectPointerType()) {
             if (RHS->isNullPointerConstant(Ctx, Expr::NPC_ValueDependentIsNull))
               return;
diff --git a/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp b/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
index aa904e7..b8e4332 100644
--- a/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
+++ b/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
@@ -1,4 +1,4 @@
-//== DynamicTypePropagation.cpp -------------------------------- -*- C++ -*--=//
+//===- DynamicTypePropagation.cpp -------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -97,6 +97,7 @@
                          const ObjCObjectPointerType *To, ExplodedNode *N,
                          SymbolRef Sym, CheckerContext &C,
                          const Stmt *ReportedNode = nullptr) const;
+
 public:
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
@@ -109,7 +110,7 @@
   /// This value is set to true, when the Generics checker is turned on.
   DefaultBool CheckGenerics;
 };
-}
+} // end anonymous namespace
 
 void DynamicTypePropagation::checkDeadSymbols(SymbolReaper &SR,
                                               CheckerContext &C) const {
@@ -151,7 +152,6 @@
   ProgramStateRef State = C.getState();
   State = setDynamicTypeInfo(State, Region, Ty, /*CanBeSubclass=*/false);
   C.addTransition(State);
-  return;
 }
 
 void DynamicTypePropagation::checkPreCall(const CallEvent &Call,
@@ -797,7 +797,6 @@
   // class. This method is provided by the runtime and available on all classes.
   if (MessageExpr->getReceiverKind() == ObjCMessageExpr::Class &&
       Sel.getAsString() == "class") {
-
     QualType ReceiverType = MessageExpr->getClassReceiver();
     const auto *ReceiverClassType = ReceiverType->getAs<ObjCObjectType>();
     QualType ReceiverClassPointerType =
diff --git a/lib/StaticAnalyzer/Checkers/IvarInvalidationChecker.cpp b/lib/StaticAnalyzer/Checkers/IvarInvalidationChecker.cpp
index 153c05b..8076ca0 100644
--- a/lib/StaticAnalyzer/Checkers/IvarInvalidationChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/IvarInvalidationChecker.cpp
@@ -1,4 +1,4 @@
-//=- IvarInvalidationChecker.cpp - -*- C++ -------------------------------*-==//
+//===- IvarInvalidationChecker.cpp ------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -43,7 +43,6 @@
 using namespace ento;
 
 namespace {
-
 struct ChecksFilter {
   /// Check for missing invalidation method declarations.
   DefaultBool check_MissingInvalidationMethod;
@@ -55,7 +54,6 @@
 };
 
 class IvarInvalidationCheckerImpl {
-
   typedef llvm::SmallSetVector<const ObjCMethodDecl*, 2> MethodSet;
   typedef llvm::DenseMap<const ObjCMethodDecl*,
                          const ObjCIvarDecl*> MethToIvarMapTy;
@@ -64,7 +62,6 @@
   typedef llvm::DenseMap<const ObjCIvarDecl*,
                          const ObjCPropertyDecl*> IvarToPropMapTy;
 
-
   struct InvalidationInfo {
     /// Has the ivar been invalidated?
     bool IsInvalidated;
@@ -167,7 +164,7 @@
     void VisitObjCMessageExpr(const ObjCMessageExpr *ME);
 
     void VisitChildren(const Stmt *S) {
-      for (const Stmt *Child : S->children()) {
+      for (const auto *Child : S->children()) {
         if (Child)
           this->Visit(Child);
         if (CalledAnotherInvalidationMethod)
@@ -208,6 +205,7 @@
                                   const IvarToPropMapTy &IvarToPopertyMap,
                                   const ObjCInterfaceDecl *InterfaceD,
                                   bool MissingDeclaration) const;
+
   void reportIvarNeedsInvalidation(const ObjCIvarDecl *IvarD,
                                    const IvarToPropMapTy &IvarToPopertyMap,
                                    const ObjCMethodDecl *MethodD) const;
@@ -276,8 +274,6 @@
     }
     return;
   }
-
-  return;
 }
 
 bool IvarInvalidationCheckerImpl::trackIvar(const ObjCIvarDecl *Iv,
@@ -586,8 +582,7 @@
     // If InvalidationMethod is present, we are processing the message send and
     // should ensure we are invalidating with the appropriate method,
     // otherwise, we are processing setting to 'nil'.
-    if (!InvalidationMethod ||
-        (InvalidationMethod && I->second.hasMethod(InvalidationMethod)))
+    if (!InvalidationMethod || I->second.hasMethod(InvalidationMethod))
       IVars.erase(I);
   }
 }
@@ -724,11 +719,10 @@
 
   VisitStmt(ME);
 }
-}
+} // end anonymous namespace
 
 // Register the checkers.
 namespace {
-
 class IvarInvalidationChecker :
   public Checker<check::ASTDecl<ObjCImplementationDecl> > {
 public:
@@ -740,7 +734,7 @@
     Walker.visit(D);
   }
 };
-}
+} // end anonymous namespace
 
 #define REGISTER_CHECKER(name)                                                 \
   void ento::register##name(CheckerManager &mgr) {                             \
@@ -752,4 +746,3 @@
 
 REGISTER_CHECKER(InstanceVariableInvalidation)
 REGISTER_CHECKER(MissingInvalidationMethod)
-
diff --git a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
index ca6567c..1386f97 100644
--- a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
@@ -19,6 +19,9 @@
 #include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Lex/Lexer.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
@@ -26,11 +29,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
-#include "clang/Lex/Lexer.h"
-#include "clang/AST/RecursiveASTVisitor.h"
-#include "clang/AST/StmtVisitor.h"
 #include "llvm/Support/Unicode.h"
-#include "llvm/ADT/StringSet.h"
 
 using namespace clang;
 using namespace ento;
@@ -189,6 +188,22 @@
   NEW_RECEIVER(NSButton)
   ADD_UNARY_METHOD(NSButton, setTitle, 0)
   ADD_UNARY_METHOD(NSButton, setAlternateTitle, 0)
+  IdentifierInfo *radioButtonWithTitleNSButton[] = {
+      &Ctx.Idents.get("radioButtonWithTitle"), &Ctx.Idents.get("target"),
+      &Ctx.Idents.get("action")};
+  ADD_METHOD(NSButton, radioButtonWithTitleNSButton, 3, 0)
+  IdentifierInfo *buttonWithTitleNSButtonImage[] = {
+      &Ctx.Idents.get("buttonWithTitle"), &Ctx.Idents.get("image"),
+      &Ctx.Idents.get("target"), &Ctx.Idents.get("action")};
+  ADD_METHOD(NSButton, buttonWithTitleNSButtonImage, 4, 0)
+  IdentifierInfo *checkboxWithTitleNSButton[] = {
+      &Ctx.Idents.get("checkboxWithTitle"), &Ctx.Idents.get("target"),
+      &Ctx.Idents.get("action")};
+  ADD_METHOD(NSButton, checkboxWithTitleNSButton, 3, 0)
+  IdentifierInfo *buttonWithTitleNSButtonTarget[] = {
+      &Ctx.Idents.get("buttonWithTitle"), &Ctx.Idents.get("target"),
+      &Ctx.Idents.get("action")};
+  ADD_METHOD(NSButton, buttonWithTitleNSButtonTarget, 3, 0)
 
   NEW_RECEIVER(NSSavePanel)
   ADD_UNARY_METHOD(NSSavePanel, setPrompt, 0)
@@ -271,6 +286,9 @@
   ADD_UNARY_METHOD(NSButtonCell, setTitle, 0)
   ADD_UNARY_METHOD(NSButtonCell, setAlternateTitle, 0)
 
+  NEW_RECEIVER(NSDatePickerCell)
+  ADD_UNARY_METHOD(NSDatePickerCell, initTextCell, 0)
+
   NEW_RECEIVER(NSSliderCell)
   ADD_UNARY_METHOD(NSSliderCell, setTitle, 0)
 
@@ -336,9 +354,6 @@
   ADD_UNARY_METHOD(UIActionSheet, addButtonWithTitle, 0)
   ADD_UNARY_METHOD(UIActionSheet, setTitle, 0)
 
-  NEW_RECEIVER(NSURLSessionTask)
-  ADD_UNARY_METHOD(NSURLSessionTask, setTaskDescription, 0)
-
   NEW_RECEIVER(UIAccessibilityCustomAction)
   IdentifierInfo *initWithNameUIAccessibilityCustomAction[] = {
       &Ctx.Idents.get("initWithName"), &Ctx.Idents.get("target"),
@@ -363,6 +378,9 @@
 
   NEW_RECEIVER(NSTextField)
   ADD_UNARY_METHOD(NSTextField, setPlaceholderString, 0)
+  ADD_UNARY_METHOD(NSTextField, textFieldWithString, 0)
+  ADD_UNARY_METHOD(NSTextField, wrappingLabelWithString, 0)
+  ADD_UNARY_METHOD(NSTextField, labelWithString, 0)
 
   NEW_RECEIVER(NSAttributedString)
   ADD_UNARY_METHOD(NSAttributedString, initWithString, 0)
@@ -523,9 +541,6 @@
   ADD_METHOD(NSUserNotificationAction,
              actionWithIdentifierNSUserNotificationAction, 2, 1)
 
-  NEW_RECEIVER(NSURLSession)
-  ADD_UNARY_METHOD(NSURLSession, setSessionDescription, 0)
-
   NEW_RECEIVER(UITextField)
   ADD_UNARY_METHOD(UITextField, setText, 0)
   ADD_UNARY_METHOD(UITextField, setPlaceholder, 0)
@@ -1001,8 +1016,6 @@
 void EmptyLocalizationContextChecker::MethodCrawler::VisitObjCMessageExpr(
     const ObjCMessageExpr *ME) {
 
-  // FIXME: We may be able to use PPCallbacks to check for empy context
-  // comments as part of preprocessing and avoid this re-lexing hack.
   const ObjCInterfaceDecl *OD = ME->getReceiverInterface();
   if (!OD)
     return;
@@ -1037,12 +1050,7 @@
     SE = Mgr.getSourceManager().getSLocEntry(SLInfo.first);
   }
 
-  bool Invalid = false;
-  llvm::MemoryBuffer *BF =
-      Mgr.getSourceManager().getBuffer(SLInfo.first, SL, &Invalid);
-  if (Invalid)
-    return;
-
+  llvm::MemoryBuffer *BF = SE.getFile().getContentCache()->getRawBuffer();
   Lexer TheLexer(SL, LangOptions(), BF->getBufferStart(),
                  BF->getBufferStart() + SLInfo.second, BF->getBufferEnd());
 
diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp
new file mode 100644
index 0000000..d56ea6d
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp
@@ -0,0 +1,115 @@
+//===-- MPIBugReporter.cpp - bug reporter -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines prefabricated reports which are emitted in
+/// case of MPI related bugs, detected by path-sensitive analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MPIBugReporter.h"
+#include "MPIChecker.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+void MPIBugReporter::reportDoubleNonblocking(
+    const CallEvent &MPICallEvent, const ento::mpi::Request &Req,
+    const MemRegion *const RequestRegion,
+    const ExplodedNode *const ExplNode,
+    BugReporter &BReporter) const {
+  // Req is not consulted here; the message is built from the region alone.
+  const std::string Message = "Double nonblocking on request " +
+                              RequestRegion->getDescriptiveName() + ". ";
+
+  std::unique_ptr<BugReport> Diag = llvm::make_unique<BugReport>(
+      *DoubleNonblockingBugType, Message, ExplNode);
+
+  // Highlight the offending call and, if its range is valid, the request.
+  Diag->addRange(MPICallEvent.getSourceRange());
+  const SourceRange RequestRange = RequestRegion->sourceRange();
+  if (RequestRange.isValid())
+    Diag->addRange(RequestRange);
+
+  // The visitor adds a note at the earlier use of the same request region.
+  Diag->addVisitor(llvm::make_unique<RequestNodeVisitor>(
+      RequestRegion, "Request is previously used by nonblocking call here. "));
+  Diag->markInteresting(RequestRegion);
+
+  BReporter.emitReport(std::move(Diag));
+}
+
+void MPIBugReporter::reportMissingWait(
+    const ento::mpi::Request &Req, const MemRegion *const RequestRegion,
+    const ExplodedNode *const ExplNode,
+    BugReporter &BReporter) const {
+  const std::string Message = "Request " +
+                              RequestRegion->getDescriptiveName() +
+                              " has no matching wait. ";
+  std::unique_ptr<BugReport> Diag =
+      llvm::make_unique<BugReport>(*MissingWaitBugType, Message, ExplNode);
+
+  // Mark the request in the source when its region has a valid range.
+  const SourceRange RequestRange = RequestRegion->sourceRange();
+  if (RequestRange.isValid())
+    Diag->addRange(RequestRange);
+  Diag->addVisitor(llvm::make_unique<RequestNodeVisitor>(
+      RequestRegion, "Request is previously used by nonblocking call here. "));
+  Diag->markInteresting(RequestRegion);
+  BReporter.emitReport(std::move(Diag));
+}
+
+void MPIBugReporter::reportUnmatchedWait(
+    const CallEvent &CE, const clang::ento::MemRegion *const RequestRegion,
+    const ExplodedNode *const ExplNode,
+    BugReporter &BReporter) const {
+  const std::string Message = "Request " +
+                              RequestRegion->getDescriptiveName() +
+                              " has no matching nonblocking call. ";
+  std::unique_ptr<BugReport> Diag =
+      llvm::make_unique<BugReport>(*UnmatchedWaitBugType, Message, ExplNode);
+
+  // Highlight the wait call, plus the request if its range is valid.
+  Diag->addRange(CE.getSourceRange());
+  const SourceRange RequestRange = RequestRegion->sourceRange();
+  if (RequestRange.isValid())
+    Diag->addRange(RequestRange);
+  BReporter.emitReport(std::move(Diag));
+}
+
+PathDiagnosticPiece *MPIBugReporter::RequestNodeVisitor::VisitNode(
+    const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC,
+    BugReport &BR) {
+  if (IsNodeFound)
+    return nullptr;
+
+  const Request *const Req = N->getState()->get<RequestMap>(RequestRegion);
+  const Request *const PrevReq =
+      PrevN->getState()->get<RequestMap>(RequestRegion);
+
+  // Nothing to report while the request is untracked at N. Testing Req first
+  // also keeps the state comparison below from dereferencing a null pointer.
+  if (!Req)
+    return nullptr;
+
+  // Check if request was previously unused or in a different state.
+  if (!PrevReq || Req->CurrentState != PrevReq->CurrentState) {
+    IsNodeFound = true;
+    const PathDiagnosticLocation L = PathDiagnosticLocation::create(
+        PrevN->getLocation(), BRC.getSourceManager());
+    return new PathDiagnosticEventPiece(L, ErrorText);
+  }
+  return nullptr;
+}
+
+} // end of namespace: mpi
+} // end of namespace: ento
+} // end of namespace: clang
diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h
new file mode 100644
index 0000000..8474d2d
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h
@@ -0,0 +1,109 @@
+//===-- MPIBugReporter.h - bug reporter -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines prefabricated reports which are emitted in
+/// case of MPI related bugs, detected by path-sensitive analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPIBUGREPORTER_H
+#define LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPIBUGREPORTER_H
+
+#include "MPITypes.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+class MPIBugReporter {
+public:
+  MPIBugReporter(const CheckerBase &CB) {
+    UnmatchedWaitBugType.reset(new BugType(&CB, "Unmatched wait", MPIError));
+    DoubleNonblockingBugType.reset(
+        new BugType(&CB, "Double nonblocking", MPIError));
+    MissingWaitBugType.reset(new BugType(&CB, "Missing wait", MPIError));
+  }
+
+  /// Report duplicate request use by nonblocking calls without intermediate
+  /// wait.
+  ///
+  /// \param MPICallEvent MPI call that caused the double nonblocking
+  /// \param Req request that was used by two nonblocking calls in sequence
+  /// \param RequestRegion memory region of the request
+  /// \param ExplNode node in the graph the bug appeared at
+  /// \param BReporter bug reporter for current context
+  void reportDoubleNonblocking(const CallEvent &MPICallEvent,
+                               const Request &Req,
+                               const MemRegion *const RequestRegion,
+                               const ExplodedNode *const ExplNode,
+                              BugReporter &BReporter) const;
+
+  /// Report a missing wait for a nonblocking call.
+  ///
+  /// \param Req request that is not matched by a wait
+  /// \param RequestRegion memory region of the request
+  /// \param ExplNode node in the graph the bug appeared at
+  /// \param BReporter bug reporter for current context
+  void reportMissingWait(const Request &Req,
+                         const MemRegion *const RequestRegion,
+                         const ExplodedNode *const ExplNode,
+                         BugReporter &BReporter) const;
+
+  /// Report a wait on a request that has not been used at all before.
+  ///
+  /// \param CE wait call that uses the request
+  /// \param RequestRegion memory region of the request
+  /// \param ExplNode node in the graph the bug appeared at
+  /// \param BReporter bug reporter for current context
+  void reportUnmatchedWait(const CallEvent &CE,
+                           const MemRegion *const RequestRegion,
+                           const ExplodedNode *const ExplNode,
+                           BugReporter &BReporter) const;
+
+private:
+  const std::string MPIError = "MPI Error"; // Category passed to each BugType.
+
+  // Bug types of the path-sensitive checks; all are created in the constructor.
+  std::unique_ptr<BugType> UnmatchedWaitBugType;
+  std::unique_ptr<BugType> MissingWaitBugType;
+  std::unique_ptr<BugType> DoubleNonblockingBugType;
+
+  /// Bug visitor class to find the node where the request region was previously
+  /// used in order to include it into the BugReport path.
+  class RequestNodeVisitor : public BugReporterVisitorImpl<RequestNodeVisitor> {
+  public:
+    RequestNodeVisitor(const MemRegion *const MemoryRegion,
+                       const std::string &ErrText)
+        : RequestRegion(MemoryRegion), ErrorText(ErrText) {}
+
+    void Profile(llvm::FoldingSetNodeID &ID) const override {
+      static int X = 0; // Only the address of X is used; it is added to ID.
+      ID.AddPointer(&X);
+      ID.AddPointer(RequestRegion);
+    }
+
+    PathDiagnosticPiece *VisitNode(const ExplodedNode *N,
+                                   const ExplodedNode *PrevN,
+                                   BugReporterContext &BRC,
+                                   BugReport &BR) override;
+
+  private:
+    const MemRegion *const RequestRegion;
+    bool IsNodeFound = false;
+    std::string ErrorText;
+  };
+};
+
+} // end of namespace: mpi
+} // end of namespace: ento
+} // end of namespace: clang
+
+#endif
diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIChecker.cpp b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIChecker.cpp
new file mode 100644
index 0000000..c667b9e
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIChecker.cpp
@@ -0,0 +1,193 @@
+//===-- MPIChecker.cpp - Checker Entry Point Class --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the main class of MPI-Checker which serves as an entry
+/// point. It is created once for each translation unit analysed.
+/// The checker defines path-sensitive checks, to verify correct usage of the
+/// MPI API.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MPIChecker.h"
+#include "../ClangSACheckers.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+void MPIChecker::checkDoubleNonblocking(const CallEvent &PreCallEvent,
+                                        CheckerContext &Ctx) const {
+  // The request is the last argument, so at least one argument is required.
+  if (!FuncClassifier->isNonBlockingType(PreCallEvent.getCalleeIdentifier()) ||
+      PreCallEvent.getNumArgs() == 0)
+    return;
+  const MemRegion *const MR =
+      PreCallEvent.getArgSVal(PreCallEvent.getNumArgs() - 1).getAsRegion();
+  if (!MR)
+    return;
+  const ElementRegion *const ER = dyn_cast<ElementRegion>(MR);
+
+  // The region must be typed, in order to reason about it.
+  if (!isa<TypedRegion>(MR) || (ER && !isa<TypedRegion>(ER->getSuperRegion())))
+    return;
+
+  ProgramStateRef State = Ctx.getState();
+  const Request *const Req = State->get<RequestMap>(MR);
+
+  // No error: record that the request is now used by a nonblocking call.
+  if (!Req || Req->CurrentState != Request::State::Nonblocking) {
+    Ctx.addTransition(State->set<RequestMap>(MR, Request::State::Nonblocking));
+    return;
+  }
+  // Double nonblocking detected; node generation may fail and return null.
+  if (ExplodedNode *ErrorNode = Ctx.generateNonFatalErrorNode()) {
+    BReporter.reportDoubleNonblocking(PreCallEvent, *Req, MR, ErrorNode,
+                                      Ctx.getBugReporter());
+    Ctx.addTransition(ErrorNode->getState(), ErrorNode);
+  }
+}
+
+void MPIChecker::checkUnmatchedWaits(const CallEvent &PreCallEvent,
+                                     CheckerContext &Ctx) const {
+  if (!FuncClassifier->isWaitType(PreCallEvent.getCalleeIdentifier()))
+    return;
+  const MemRegion *const MR = topRegionUsedByWait(PreCallEvent);
+  if (!MR)
+    return;
+  const ElementRegion *const ER = dyn_cast<ElementRegion>(MR);
+
+  // The region must be typed, in order to reason about it.
+  if (!isa<TypedRegion>(MR) || (ER && !isa<TypedRegion>(ER->getSuperRegion())))
+    return;
+
+  llvm::SmallVector<const MemRegion *, 2> ReqRegions;
+  allRegionsUsedByWait(ReqRegions, MR, PreCallEvent, Ctx);
+  if (ReqRegions.empty())
+    return;
+
+  ProgramStateRef State = Ctx.getState();
+  static CheckerProgramPointTag Tag("MPI-Checker", "UnmatchedWait");
+  ExplodedNode *ErrorNode{nullptr};
+  // Check all request regions used by the wait function.
+  for (const auto &ReqRegion : ReqRegions) {
+    const Request *const Req = State->get<RequestMap>(ReqRegion);
+    State = State->set<RequestMap>(ReqRegion, Request::State::Wait);
+    if (Req)
+      continue;
+    if (!ErrorNode) {
+      ErrorNode = Ctx.generateNonFatalErrorNode(State, &Tag);
+      // Node generation may fail; without a node nothing can be reported.
+      if (!ErrorNode)
+        return;
+      State = ErrorNode->getState();
+    }
+    // A wait has no matching nonblocking call.
+    BReporter.reportUnmatchedWait(PreCallEvent, ReqRegion, ErrorNode,
+                                  Ctx.getBugReporter());
+  }
+  if (ErrorNode)
+    Ctx.addTransition(State, ErrorNode);
+  else
+    Ctx.addTransition(State);
+}
+
+void MPIChecker::checkMissingWaits(SymbolReaper &SymReaper,
+                                   CheckerContext &Ctx) const {
+  if (!SymReaper.hasDeadSymbols())
+    return;
+
+  ProgramStateRef State = Ctx.getState();
+  const auto ReqMap = State->get<RequestMap>();
+  if (ReqMap.isEmpty())
+    return;
+
+  static CheckerProgramPointTag Tag("MPI-Checker", "MissingWait");
+  ExplodedNode *ErrorNode{nullptr};
+
+  for (const auto &Req : ReqMap) {
+    if (SymReaper.isLiveRegion(Req.first))
+      continue;
+    if (Req.second.CurrentState == Request::State::Nonblocking) {
+      if (!ErrorNode) {
+        ErrorNode = Ctx.generateNonFatalErrorNode(State, &Tag);
+        // Node generation may fail; without a node nothing can be reported.
+        if (!ErrorNode)
+          return;
+        State = ErrorNode->getState();
+      }
+      BReporter.reportMissingWait(Req.second, Req.first, ErrorNode,
+                                  Ctx.getBugReporter());
+    }
+    State = State->remove<RequestMap>(Req.first);
+  }
+
+  // Transition to update the state regarding removed requests.
+  if (ErrorNode)
+    Ctx.addTransition(State, ErrorNode);
+  else
+    Ctx.addTransition(State);
+}
+
+const MemRegion *MPIChecker::topRegionUsedByWait(const CallEvent &CE) const {
+  const IdentifierInfo *const Callee = CE.getCalleeIdentifier();
+  // MPI_Wait: the request is argument 0. Give up if it was not passed.
+  if (FuncClassifier->isMPI_Wait(Callee))
+    return CE.getNumArgs() > 0 ? CE.getArgSVal(0).getAsRegion() : nullptr;
+  // MPI_Waitall: the requests are argument 1. Give up if it was not passed.
+  if (FuncClassifier->isMPI_Waitall(Callee))
+    return CE.getNumArgs() > 1 ? CE.getArgSVal(1).getAsRegion() : nullptr;
+  return nullptr; // Neither MPI_Wait nor MPI_Waitall.
+}
+
+void MPIChecker::allRegionsUsedByWait(
+    llvm::SmallVector<const MemRegion *, 2> &ReqRegions,
+    const MemRegion *const MR, const CallEvent &CE, CheckerContext &Ctx) const {
+  const IdentifierInfo *const Callee = CE.getCalleeIdentifier();
+  // MPI_Wait uses a single request: the region itself.
+  if (FuncClassifier->isMPI_Wait(Callee)) {
+    ReqRegions.push_back(MR);
+    return;
+  }
+  if (!FuncClassifier->isMPI_Waitall(Callee))
+    return;
+
+  const ElementRegion *const FirstElem = MR->getAs<ElementRegion>();
+  const MemRegion *const SuperRegion =
+      FirstElem ? FirstElem->getSuperRegion() : nullptr;
+  // A single request is passed to MPI_Waitall.
+  if (!SuperRegion) {
+    ReqRegions.push_back(MR);
+    return;
+  }
+
+  const QualType ElemType = CE.getArgExpr(1)->getType()->getPointeeType();
+  // An element count that is not a concrete integer cannot be enumerated.
+  const auto Size = Ctx.getStoreManager().getSizeInElements(
+      Ctx.getState(), SuperRegion, ElemType);
+  const auto ArrSize = Size.getAs<nonloc::ConcreteInt>();
+  if (!ArrSize)
+    return;
+
+  MemRegionManager *const RegionManager = MR->getMemRegionManager();
+  for (size_t i = 0; i < ArrSize->getValue(); ++i) {
+    const NonLoc Idx = Ctx.getSValBuilder().makeArrayIndex(i);
+    ReqRegions.push_back(RegionManager->getElementRegion(
+        ElemType, Idx, SuperRegion, Ctx.getASTContext()));
+  }
+}
+
+} // end of namespace: mpi
+} // end of namespace: ento
+} // end of namespace: clang
+
+// Registers the checker for static analysis.
+void clang::ento::registerMPIChecker(CheckerManager &MGR) {
+  MGR.registerChecker<mpi::MPIChecker>();
+}
diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIChecker.h b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIChecker.h
new file mode 100644
index 0000000..6b1c062
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIChecker.h
@@ -0,0 +1,105 @@
+//===-- MPIChecker.h - Verify MPI API usage- --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the main class of MPI-Checker which serves as an entry
+/// point. It is created once for each translation unit analysed.
+/// The checker defines path-sensitive checks, to verify correct usage of the
+/// MPI API.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPICHECKER_H
+#define LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPICHECKER_H
+
+#include "MPIBugReporter.h"
+#include "MPITypes.h"
+#include "clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+class MPIChecker : public Checker<check::PreCall, check::DeadSymbols> {
+public:
+  MPIChecker() : BReporter(*this) {}
+
+  // path-sensitive callbacks
+  void checkPreCall(const CallEvent &CE, CheckerContext &Ctx) const {
+    dynamicInit(Ctx);
+    checkUnmatchedWaits(CE, Ctx);
+    checkDoubleNonblocking(CE, Ctx);
+  }
+
+  void checkDeadSymbols(SymbolReaper &SymReaper, CheckerContext &Ctx) const {
+    dynamicInit(Ctx);
+    checkMissingWaits(SymReaper, Ctx);
+  }
+
+  /// Creates the function classifier on the first callback that needs it.
+  void dynamicInit(CheckerContext &Ctx) const {
+    if (!FuncClassifier)
+      FuncClassifier.reset(new MPIFunctionClassifier{Ctx.getASTContext()});
+  }
+
+  /// Checks if a request is used by nonblocking calls multiple times
+  /// in sequence without intermediate wait. The check contains a guard,
+  /// in order to only inspect nonblocking functions.
+  ///
+  /// \param PreCallEvent MPI call to verify
+  void checkDoubleNonblocking(const clang::ento::CallEvent &PreCallEvent,
+                              clang::ento::CheckerContext &Ctx) const;
+
+  /// Checks if the request used by the wait function was not used at all
+  /// before. The check contains a guard, in order to only inspect wait
+  /// functions.
+  ///
+  /// \param PreCallEvent MPI call to verify
+  void checkUnmatchedWaits(const clang::ento::CallEvent &PreCallEvent,
+                           clang::ento::CheckerContext &Ctx) const;
+
+  /// Check if a nonblocking call is not matched by a wait.
+  /// If a memory region is not alive and the last function using the
+  /// request was a nonblocking call, this is rated as a missing wait.
+  void checkMissingWaits(clang::ento::SymbolReaper &SymReaper,
+                         clang::ento::CheckerContext &Ctx) const;
+
+private:
+  /// Collects all memory regions of a request(array) used by a wait
+  /// function. If the wait function uses a single request, this is a single
+  /// region. For wait functions using multiple requests, multiple regions
+  /// representing elements in the array are collected.
+  ///
+  /// \param ReqRegions vector the regions get pushed into
+  /// \param MR top most region to iterate
+  /// \param CE MPI wait call using the request(s)
+  void allRegionsUsedByWait(
+      llvm::SmallVector<const clang::ento::MemRegion *, 2> &ReqRegions,
+      const clang::ento::MemRegion *const MR, const clang::ento::CallEvent &CE,
+      clang::ento::CheckerContext &Ctx) const;
+
+  /// Returns the memory region used by a wait function.
+  /// Distinguishes between MPI_Wait and MPI_Waitall.
+  ///
+  /// \param CE MPI wait call
+  const clang::ento::MemRegion *
+  topRegionUsedByWait(const clang::ento::CallEvent &CE) const;
+
+  // Mutable because it is created lazily from the const checker callbacks.
+  mutable std::unique_ptr<MPIFunctionClassifier> FuncClassifier;
+  MPIBugReporter BReporter;
+};
+
+} // end of namespace: mpi
+} // end of namespace: ento
+} // end of namespace: clang
+
+#endif
diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIFunctionClassifier.cpp b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIFunctionClassifier.cpp
new file mode 100644
index 0000000..12760ab
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIFunctionClassifier.cpp
@@ -0,0 +1,284 @@
+//===-- MPIFunctionClassifier.cpp - classifies MPI functions ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functionality to identify and classify MPI functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h"
+#include "llvm/ADT/STLExtras.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+void MPIFunctionClassifier::identifierInit(ASTContext &ASTCtx) {
+  // Look up and classify the identifiers of all MPI functions handled here.
+  initPointToPointIdentifiers(ASTCtx);
+  initCollectiveIdentifiers(ASTCtx);
+  initAdditionalIdentifiers(ASTCtx);
+}
+
+void MPIFunctionClassifier::initPointToPointIdentifiers(ASTContext &ASTCtx) {
+  // Copy identifiers into the correct classification containers. A blocking
+  // function and its nonblocking counterpart (MPI_I*) are listed together;
+  // every nonblocking function, including MPI_Irsend, must also be added to
+  // MPINonBlockingTypes so that isNonBlockingType() recognizes it.
+
+  IdentInfo_MPI_Send = &ASTCtx.Idents.get("MPI_Send");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Send);
+  MPIType.push_back(IdentInfo_MPI_Send);
+  assert(IdentInfo_MPI_Send);
+  IdentInfo_MPI_Isend = &ASTCtx.Idents.get("MPI_Isend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Isend);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Isend);
+  MPIType.push_back(IdentInfo_MPI_Isend);
+  assert(IdentInfo_MPI_Isend);
+
+  IdentInfo_MPI_Ssend = &ASTCtx.Idents.get("MPI_Ssend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Ssend);
+  MPIType.push_back(IdentInfo_MPI_Ssend);
+  assert(IdentInfo_MPI_Ssend);
+  IdentInfo_MPI_Issend = &ASTCtx.Idents.get("MPI_Issend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Issend);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Issend);
+  MPIType.push_back(IdentInfo_MPI_Issend);
+  assert(IdentInfo_MPI_Issend);
+
+  IdentInfo_MPI_Bsend = &ASTCtx.Idents.get("MPI_Bsend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Bsend);
+  MPIType.push_back(IdentInfo_MPI_Bsend);
+  assert(IdentInfo_MPI_Bsend);
+  IdentInfo_MPI_Ibsend = &ASTCtx.Idents.get("MPI_Ibsend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Ibsend);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Ibsend);
+  MPIType.push_back(IdentInfo_MPI_Ibsend);
+  assert(IdentInfo_MPI_Ibsend);
+
+  IdentInfo_MPI_Rsend = &ASTCtx.Idents.get("MPI_Rsend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Rsend);
+  MPIType.push_back(IdentInfo_MPI_Rsend);
+  assert(IdentInfo_MPI_Rsend);
+  IdentInfo_MPI_Irsend = &ASTCtx.Idents.get("MPI_Irsend");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Irsend);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Irsend);
+  MPIType.push_back(IdentInfo_MPI_Irsend);
+  assert(IdentInfo_MPI_Irsend);
+
+  IdentInfo_MPI_Recv = &ASTCtx.Idents.get("MPI_Recv");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Recv);
+  MPIType.push_back(IdentInfo_MPI_Recv);
+  assert(IdentInfo_MPI_Recv);
+  IdentInfo_MPI_Irecv = &ASTCtx.Idents.get("MPI_Irecv");
+  MPIPointToPointTypes.push_back(IdentInfo_MPI_Irecv);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Irecv);
+  MPIType.push_back(IdentInfo_MPI_Irecv);
+  assert(IdentInfo_MPI_Irecv);
+}
+
+void MPIFunctionClassifier::initCollectiveIdentifiers(ASTContext &ASTCtx) {
+  // Look up each collective identifier and add it to every matching container.
+  IdentInfo_MPI_Scatter = &ASTCtx.Idents.get("MPI_Scatter");
+  assert(IdentInfo_MPI_Scatter);
+  MPIType.push_back(IdentInfo_MPI_Scatter);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Scatter);
+  MPIPointToCollTypes.push_back(IdentInfo_MPI_Scatter);
+
+  IdentInfo_MPI_Iscatter = &ASTCtx.Idents.get("MPI_Iscatter");
+  assert(IdentInfo_MPI_Iscatter);
+  MPIType.push_back(IdentInfo_MPI_Iscatter);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Iscatter);
+  MPIPointToCollTypes.push_back(IdentInfo_MPI_Iscatter);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Iscatter);
+
+  IdentInfo_MPI_Gather = &ASTCtx.Idents.get("MPI_Gather");
+  assert(IdentInfo_MPI_Gather);
+  MPIType.push_back(IdentInfo_MPI_Gather);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Gather);
+  MPICollToPointTypes.push_back(IdentInfo_MPI_Gather);
+
+  IdentInfo_MPI_Igather = &ASTCtx.Idents.get("MPI_Igather");
+  assert(IdentInfo_MPI_Igather);
+  MPIType.push_back(IdentInfo_MPI_Igather);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Igather);
+  MPICollToPointTypes.push_back(IdentInfo_MPI_Igather);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Igather);
+
+  IdentInfo_MPI_Allgather = &ASTCtx.Idents.get("MPI_Allgather");
+  assert(IdentInfo_MPI_Allgather);
+  MPIType.push_back(IdentInfo_MPI_Allgather);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Allgather);
+  MPICollToCollTypes.push_back(IdentInfo_MPI_Allgather);
+
+  IdentInfo_MPI_Iallgather = &ASTCtx.Idents.get("MPI_Iallgather");
+  assert(IdentInfo_MPI_Iallgather);
+  MPIType.push_back(IdentInfo_MPI_Iallgather);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Iallgather);
+  MPICollToCollTypes.push_back(IdentInfo_MPI_Iallgather);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Iallgather);
+
+  IdentInfo_MPI_Bcast = &ASTCtx.Idents.get("MPI_Bcast");
+  assert(IdentInfo_MPI_Bcast);
+  MPIType.push_back(IdentInfo_MPI_Bcast);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Bcast);
+  MPIPointToCollTypes.push_back(IdentInfo_MPI_Bcast);
+
+  IdentInfo_MPI_Ibcast = &ASTCtx.Idents.get("MPI_Ibcast");
+  assert(IdentInfo_MPI_Ibcast);
+  MPIType.push_back(IdentInfo_MPI_Ibcast);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Ibcast);
+  MPIPointToCollTypes.push_back(IdentInfo_MPI_Ibcast);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Ibcast);
+
+  IdentInfo_MPI_Reduce = &ASTCtx.Idents.get("MPI_Reduce");
+  assert(IdentInfo_MPI_Reduce);
+  MPIType.push_back(IdentInfo_MPI_Reduce);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Reduce);
+  MPICollToPointTypes.push_back(IdentInfo_MPI_Reduce);
+
+  IdentInfo_MPI_Ireduce = &ASTCtx.Idents.get("MPI_Ireduce");
+  assert(IdentInfo_MPI_Ireduce);
+  MPIType.push_back(IdentInfo_MPI_Ireduce);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Ireduce);
+  MPICollToPointTypes.push_back(IdentInfo_MPI_Ireduce);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Ireduce);
+
+  IdentInfo_MPI_Allreduce = &ASTCtx.Idents.get("MPI_Allreduce");
+  assert(IdentInfo_MPI_Allreduce);
+  MPIType.push_back(IdentInfo_MPI_Allreduce);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Allreduce);
+  MPICollToCollTypes.push_back(IdentInfo_MPI_Allreduce);
+
+  IdentInfo_MPI_Iallreduce = &ASTCtx.Idents.get("MPI_Iallreduce");
+  assert(IdentInfo_MPI_Iallreduce);
+  MPIType.push_back(IdentInfo_MPI_Iallreduce);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Iallreduce);
+  MPICollToCollTypes.push_back(IdentInfo_MPI_Iallreduce);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Iallreduce);
+
+  IdentInfo_MPI_Alltoall = &ASTCtx.Idents.get("MPI_Alltoall");
+  assert(IdentInfo_MPI_Alltoall);
+  MPIType.push_back(IdentInfo_MPI_Alltoall);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Alltoall);
+  MPICollToCollTypes.push_back(IdentInfo_MPI_Alltoall);
+
+  IdentInfo_MPI_Ialltoall = &ASTCtx.Idents.get("MPI_Ialltoall");
+  assert(IdentInfo_MPI_Ialltoall);
+  MPIType.push_back(IdentInfo_MPI_Ialltoall);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Ialltoall);
+  MPICollToCollTypes.push_back(IdentInfo_MPI_Ialltoall);
+  MPINonBlockingTypes.push_back(IdentInfo_MPI_Ialltoall);
+}
+
+void MPIFunctionClassifier::initAdditionalIdentifiers(ASTContext &ASTCtx) {
+  // Rank/size queries, the wait functions and the barrier.
+  IdentInfo_MPI_Comm_rank = &ASTCtx.Idents.get("MPI_Comm_rank");
+  assert(IdentInfo_MPI_Comm_rank);
+  MPIType.push_back(IdentInfo_MPI_Comm_rank);
+
+  IdentInfo_MPI_Comm_size = &ASTCtx.Idents.get("MPI_Comm_size");
+  assert(IdentInfo_MPI_Comm_size);
+  MPIType.push_back(IdentInfo_MPI_Comm_size);
+
+  IdentInfo_MPI_Wait = &ASTCtx.Idents.get("MPI_Wait");
+  assert(IdentInfo_MPI_Wait);
+  MPIType.push_back(IdentInfo_MPI_Wait);
+  IdentInfo_MPI_Waitall = &ASTCtx.Idents.get("MPI_Waitall");
+  assert(IdentInfo_MPI_Waitall);
+  MPIType.push_back(IdentInfo_MPI_Waitall);
+
+  IdentInfo_MPI_Barrier = &ASTCtx.Idents.get("MPI_Barrier");
+  assert(IdentInfo_MPI_Barrier);
+  MPIType.push_back(IdentInfo_MPI_Barrier);
+  MPICollectiveTypes.push_back(IdentInfo_MPI_Barrier);
+}
+
+// general identifiers
+bool MPIFunctionClassifier::isMPIType(const IdentifierInfo *II) const {
+  return llvm::is_contained(MPIType, II);
+}
+
+// True if II is one of the identifiers stored in MPINonBlockingTypes.
+bool MPIFunctionClassifier::isNonBlockingType(const IdentifierInfo *II) const {
+  return llvm::is_contained(MPINonBlockingTypes, II);
+}
+
+// point-to-point identifiers
+// True if II is one of the identifiers stored in MPIPointToPointTypes.
+bool MPIFunctionClassifier::isPointToPointType(const IdentifierInfo *II) const {
+  return llvm::is_contained(MPIPointToPointTypes, II);
+}
+
+// collective identifiers
+// True if II is one of the identifiers stored in MPICollectiveTypes.
+bool MPIFunctionClassifier::isCollectiveType(const IdentifierInfo *II) const {
+  return llvm::is_contained(MPICollectiveTypes, II);
+}
+
+// True if II is one of the identifiers stored in MPICollToCollTypes.
+bool MPIFunctionClassifier::isCollToColl(const IdentifierInfo *II) const {
+  return llvm::is_contained(MPICollToCollTypes, II);
+}
+
+// True for MPI_Scatter and MPI_Iscatter.
+bool MPIFunctionClassifier::isScatterType(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Scatter ||
+         II == IdentInfo_MPI_Iscatter;
+}
+
+// True for MPI_Gather, MPI_Igather, MPI_Allgather and MPI_Iallgather.
+bool MPIFunctionClassifier::isGatherType(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Gather ||
+         II == IdentInfo_MPI_Igather ||
+         II == IdentInfo_MPI_Allgather ||
+         II == IdentInfo_MPI_Iallgather;
+}
+
+// True for MPI_Allgather and MPI_Iallgather.
+bool MPIFunctionClassifier::isAllgatherType(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Allgather ||
+         II == IdentInfo_MPI_Iallgather;
+}
+
+// True for MPI_Alltoall and MPI_Ialltoall.
+bool MPIFunctionClassifier::isAlltoallType(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Alltoall ||
+         II == IdentInfo_MPI_Ialltoall;
+}
+
+bool MPIFunctionClassifier::isBcastType(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Bcast || II == IdentInfo_MPI_Ibcast;
+}
+
+// True for MPI_Reduce, MPI_Ireduce, MPI_Allreduce and MPI_Iallreduce.
+bool MPIFunctionClassifier::isReduceType(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Reduce ||
+         II == IdentInfo_MPI_Ireduce ||
+         II == IdentInfo_MPI_Allreduce ||
+         II == IdentInfo_MPI_Iallreduce;
+}
+
+// additional identifiers
+bool MPIFunctionClassifier::isMPI_Wait(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Wait;
+}
+
+// True only for the MPI_Waitall identifier.
+bool MPIFunctionClassifier::isMPI_Waitall(const IdentifierInfo *II) const {
+  return II == IdentInfo_MPI_Waitall;
+}
+
+bool MPIFunctionClassifier::isWaitType(const IdentifierInfo *II) const {
+  return isMPI_Wait(II) || isMPI_Waitall(II);
+}
+
+} // end of namespace: mpi
+} // end of namespace: ento
+} // end of namespace: clang
diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPITypes.h b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPITypes.h
new file mode 100644
index 0000000..2e7140c
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPITypes.h
@@ -0,0 +1,67 @@
+//===-- MPITypes.h - Functionality to model MPI concepts --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides definitions to model concepts of MPI. The mpi::Request
+/// class defines a wrapper class, in order to make MPI requests trackable for
+/// path-sensitive analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPITYPES_H
+#define LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPITYPES_H
+
+#include "clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+#include "llvm/ADT/SmallSet.h"
+
+namespace clang {
+namespace ento {
+namespace mpi {
+
+/// Wrapper that records the state tracked for one MPI request.
+class Request {
+public:
+  enum State : unsigned char { Nonblocking, Wait };
+
+  Request(State S) : CurrentState(S) {}
+
+  void Profile(llvm::FoldingSetNodeID &Id) const {
+    Id.AddInteger(CurrentState);
+  }
+  bool operator==(const Request &ToCompare) const {
+    return ToCompare.CurrentState == CurrentState;
+  }
+
+  const State CurrentState;
+};
+
+// The RequestMap stores MPI requests which are identified by their memory
+// region. Requests are used in MPI to complete nonblocking operations with wait
+// operations. A custom map implementation is used, in order to make it
+// available in an arbitrary number of translation units.
+struct RequestMap {};
+using RequestMapImpl =
+    llvm::ImmutableMap<const clang::ento::MemRegion *,
+                       clang::ento::mpi::Request>;
+
+} // end of namespace: mpi
+
+template <>
+struct ProgramStateTrait<mpi::RequestMap>
+    : ProgramStatePartialTrait<mpi::RequestMapImpl> {
+  static void *GDMIndex() {
+    static int Index = 0; // Only this variable's address is handed out.
+    return &Index;
+  }
+};
+
+} // end of namespace: ento
+} // end of namespace: clang
+#endif
diff --git a/lib/StaticAnalyzer/Checkers/Makefile b/lib/StaticAnalyzer/Checkers/Makefile
deleted file mode 100644
index 7c8f7bf..0000000
--- a/lib/StaticAnalyzer/Checkers/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- clang/lib/Checker/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# This implements analyses built on top of source-level CFGs.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../../..
-LIBRARYNAME := clangStaticAnalyzerCheckers
-
-BUILT_SOURCES = Checkers.inc
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(CLANG_LEVEL)/Makefile
-
-$(ObjDir)/Checkers.inc.tmp : Checkers.td $(PROJ_SRC_DIR)/$(CLANG_LEVEL)/include/clang/StaticAnalyzer/Checkers/CheckerBase.td $(CLANG_TBLGEN) $(ObjDir)/.dir
-	$(Echo) "Building Clang SA Checkers tables with tblgen"
-	$(Verb) $(ClangTableGen) -gen-clang-sa-checkers -I $(PROJ_SRC_DIR)/$(CLANG_LEVEL)/include -o $(call SYSPATH, $@) $<
diff --git a/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index f365e53..c6eb30c 100644
--- a/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -26,11 +26,11 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
-#include "llvm/ADT/ImmutableMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include <climits>
+#include <utility>
 
 using namespace clang;
 using namespace ento;
@@ -520,7 +520,7 @@
 class StopTrackingCallback final : public SymbolVisitor {
   ProgramStateRef state;
 public:
-  StopTrackingCallback(ProgramStateRef st) : state(st) {}
+  StopTrackingCallback(ProgramStateRef st) : state(std::move(st)) {}
   ProgramStateRef getState() const { return state; }
 
   bool VisitSymbol(SymbolRef sym) override {
@@ -943,7 +943,7 @@
   const CXXConstructorDecl *CtorD = ConstructE->getConstructor();
 
   // Iterate over the constructor parameters.
-  for (const auto *CtorParam : CtorD->params()) {
+  for (const auto *CtorParam : CtorD->parameters()) {
 
     QualType CtorParamPointeeT = CtorParam->getType()->getPointeeType();
     if (CtorParamPointeeT.isNull())
diff --git a/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp b/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp
index 99ba90d..fc2ab1d 100644
--- a/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp
@@ -25,10 +25,10 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/SmallVector.h"
+#include <utility>
 
 using namespace clang;
 using namespace ento;
-using llvm::APInt;
 using llvm::APSInt;
 
 namespace {
@@ -38,7 +38,7 @@
   APSInt maxVal;
 
   MallocOverflowCheck(const BinaryOperator *m, const Expr *v, APSInt val)
-      : mulop(m), variable(v), maxVal(val) {}
+      : mulop(m), variable(v), maxVal(std::move(val)) {}
 };
 
 class MallocOverflowSecurityChecker : public Checker<check::ASTCodeBody> {
@@ -141,25 +141,25 @@
       return false;
     }
 
-    const Decl *getDecl(const DeclRefExpr *DR) { return DR->getDecl(); }
-
-    const Decl *getDecl(const MemberExpr *ME) { return ME->getMemberDecl(); }
+    static const Decl *getDecl(const DeclRefExpr *DR) { return DR->getDecl(); }
+    static const Decl *getDecl(const MemberExpr *ME) {
+      return ME->getMemberDecl();
+    }
 
     template <typename T1>
-    void Erase(const T1 *DR, std::function<bool(theVecType::iterator)> pred) {
-      theVecType::iterator i = toScanFor.end();
-      theVecType::iterator e = toScanFor.begin();
-      while (i != e) {
-        --i;
-        if (const T1 *DR_i = dyn_cast<T1>(i->variable)) {
-          if ((getDecl(DR_i) == getDecl(DR)) && pred(i))
-            i = toScanFor.erase(i);
-        }
-      }
+    void Erase(const T1 *DR,
+               llvm::function_ref<bool(const MallocOverflowCheck &)> Pred) {
+      auto P = [DR, Pred](const MallocOverflowCheck &Check) {
+        if (const auto *CheckDR = dyn_cast<T1>(Check.variable))
+          return getDecl(CheckDR) == getDecl(DR) && Pred(Check);
+        return false;
+      };
+      toScanFor.erase(std::remove_if(toScanFor.begin(), toScanFor.end(), P),
+                      toScanFor.end());
     }
 
     void CheckExpr(const Expr *E_p) {
-      auto PredTrue = [](theVecType::iterator) -> bool { return true; };
+      auto PredTrue = [](const MallocOverflowCheck &) { return true; };
       const Expr *E = E_p->IgnoreParenImpCasts();
       if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(E))
         Erase<DeclRefExpr>(DR, PredTrue);
@@ -210,9 +210,9 @@
       const Expr *E = lhs->IgnoreParenImpCasts();
 
       auto pred = [assignKnown, numeratorKnown,
-                   denomExtVal](theVecType::iterator i) {
+                   denomExtVal](const MallocOverflowCheck &Check) {
         return assignKnown ||
-               (numeratorKnown && (denomExtVal >= i->maxVal.getExtValue()));
+               (numeratorKnown && (denomExtVal >= Check.maxVal.getExtValue()));
       };
 
       if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(E))
diff --git a/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp b/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp
index dab068b..559c75d 100644
--- a/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp
@@ -61,7 +61,7 @@
     II = &D->getASTContext().Idents.get("NSError");
 
   bool hasNSError = false;
-  for (const auto *I : D->params())  {
+  for (const auto *I : D->parameters())  {
     if (IsNSError(I->getType(), II)) {
       hasNSError = true;
       break;
@@ -108,7 +108,7 @@
     II = &D->getASTContext().Idents.get("CFErrorRef");
 
   bool hasCFError = false;
-  for (auto I : D->params())  {
+  for (auto I : D->parameters())  {
     if (IsCFError(I->getType(), II)) {
       hasCFError = true;
       break;
diff --git a/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp b/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
index 0203d79..58ebf72 100644
--- a/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
@@ -79,7 +79,6 @@
 
   C.addTransition(
       State->set<ArraySizeMap>(ArraySym, SizeV.castAs<DefinedSVal>()));
-  return;
 }
 
 void ObjCContainersChecker::checkPostStmt(const CallExpr *CE,
@@ -156,10 +155,7 @@
                                           const InvalidatedSymbols &Escaped,
                                           const CallEvent *Call,
                                           PointerEscapeKind Kind) const {
-  for (InvalidatedSymbols::const_iterator I = Escaped.begin(),
-                                          E = Escaped.end();
-                                          I != E; ++I) {
-    SymbolRef Sym = *I;
+  for (const auto &Sym : Escaped) {
     // When a symbol for a mutable array escapes, we can't reason precisely
     // about its size any more -- so remove it from the map.
     // Note that we aren't notified here when a CFMutableArrayRef escapes as a
@@ -169,6 +165,7 @@
   }
   return State;
 }
+
 /// Register checker.
 void ento::registerObjCContainersChecker(CheckerManager &mgr) {
   mgr.registerChecker<ObjCContainersChecker>();
diff --git a/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp b/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
index 0640d2f..a51dda6 100644
--- a/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
@@ -82,7 +82,11 @@
     CharUnits BaselinePad = calculateBaselinePad(RD, ASTContext, RL);
     if (BaselinePad.isZero())
       return;
-    CharUnits OptimalPad = calculateOptimalPad(RD, ASTContext, RL);
+
+    CharUnits OptimalPad;
+    SmallVector<const FieldDecl *, 20> OptimalFieldsOrder;
+    std::tie(OptimalPad, OptimalFieldsOrder) =
+        calculateOptimalPad(RD, ASTContext, RL);
 
     CharUnits DiffPad = PadMultiplier * (BaselinePad - OptimalPad);
     if (DiffPad.getQuantity() <= AllowedPad) {
@@ -90,7 +94,7 @@
       // There is not enough excess padding to trigger a warning.
       return;
     }
-    reportRecord(RD, BaselinePad, OptimalPad);
+    reportRecord(RD, BaselinePad, OptimalPad, OptimalFieldsOrder);
   }
 
   /// \brief Look for arrays of overly padded types. If the padding of the
@@ -199,22 +203,30 @@
   /// 7.  Add tail padding by rounding the current offset up to the structure
   ///     alignment. Track the amount of padding added.
 
-  static CharUnits calculateOptimalPad(const RecordDecl *RD,
-                                       const ASTContext &ASTContext,
-                                       const ASTRecordLayout &RL) {
-    struct CharUnitPair {
+  static std::pair<CharUnits, SmallVector<const FieldDecl *, 20>>
+  calculateOptimalPad(const RecordDecl *RD, const ASTContext &ASTContext,
+                      const ASTRecordLayout &RL) {
+    struct FieldInfo {
       CharUnits Align;
       CharUnits Size;
-      bool operator<(const CharUnitPair &RHS) const {
+      const FieldDecl *Field;
+      bool operator<(const FieldInfo &RHS) const {
         // Order from small alignments to large alignments,
         // then large sizes to small sizes.
-        return std::make_pair(Align, -Size) <
-               std::make_pair(RHS.Align, -RHS.Size);
+        // Remaining ties go from large field indices to small field indices.
+        return std::make_tuple(Align, -Size,
+                               Field ? -static_cast<int>(Field->getFieldIndex())
+                                     : 0) <
+               std::make_tuple(
+                   RHS.Align, -RHS.Size,
+                   RHS.Field ? -static_cast<int>(RHS.Field->getFieldIndex())
+                             : 0);
       }
     };
-    SmallVector<CharUnitPair, 20> Fields;
+    SmallVector<FieldInfo, 20> Fields;
     auto GatherSizesAndAlignments = [](const FieldDecl *FD) {
-      CharUnitPair RetVal;
+      FieldInfo RetVal;
+      RetVal.Field = FD;
       auto &Ctx = FD->getASTContext();
       std::tie(RetVal.Size, RetVal.Align) =
           Ctx.getTypeInfoInChars(FD->getType());
@@ -226,14 +238,13 @@
     std::transform(RD->field_begin(), RD->field_end(),
                    std::back_inserter(Fields), GatherSizesAndAlignments);
     std::sort(Fields.begin(), Fields.end());
-
     // This lets us skip over vptrs and non-virtual bases,
     // so that we can just worry about the fields in our object.
     // Note that this does cause us to miss some cases where we
     // could pack more bytes in to a base class's tail padding.
     CharUnits NewOffset = ASTContext.toCharUnitsFromBits(RL.getFieldOffset(0));
     CharUnits NewPad;
-
+    SmallVector<const FieldDecl *, 20> OptimalFieldsOrder;
     while (!Fields.empty()) {
       unsigned TrailingZeros =
           llvm::countTrailingZeros((unsigned long long)NewOffset.getQuantity());
@@ -242,7 +253,7 @@
       // our long long (and CharUnits internal type) negative. So shift 62.
       long long CurAlignmentBits = 1ull << (std::min)(TrailingZeros, 62u);
       CharUnits CurAlignment = CharUnits::fromQuantity(CurAlignmentBits);
-      CharUnitPair InsertPoint = {CurAlignment, CharUnits::Zero()};
+      FieldInfo InsertPoint = {CurAlignment, CharUnits::Zero(), nullptr};
       auto CurBegin = Fields.begin();
       auto CurEnd = Fields.end();
 
@@ -255,6 +266,7 @@
         // We found a field that we can layout with the current alignment.
         --Iter;
         NewOffset += Iter->Size;
+        OptimalFieldsOrder.push_back(Iter->Field);
         Fields.erase(Iter);
       } else {
         // We are poorly aligned, and we need to pad in order to layout another
@@ -268,18 +280,18 @@
     // Calculate tail padding.
     CharUnits NewSize = NewOffset.alignTo(RL.getAlignment());
     NewPad += NewSize - NewOffset;
-    return NewPad;
+    return {NewPad, std::move(OptimalFieldsOrder)};
   }
 
-  void reportRecord(const RecordDecl *RD, CharUnits BaselinePad,
-                    CharUnits TargetPad) const {
+  void reportRecord(
+      const RecordDecl *RD, CharUnits BaselinePad, CharUnits OptimalPad,
+      const SmallVector<const FieldDecl *, 20> &OptimalFieldsOrder) const {
     if (!PaddingBug)
       PaddingBug =
           llvm::make_unique<BugType>(this, "Excessive Padding", "Performance");
 
     SmallString<100> Buf;
     llvm::raw_svector_ostream Os(Buf);
-
     Os << "Excessive padding in '";
     Os << QualType::getAsString(RD->getTypeForDecl(), Qualifiers()) << "'";
 
@@ -294,16 +306,18 @@
     }
 
     Os << " (" << BaselinePad.getQuantity() << " padding bytes, where "
-       << TargetPad.getQuantity() << " is optimal). Consider reordering "
-       << "the fields or adding explicit padding members.";
+       << OptimalPad.getQuantity() << " is optimal). \n"
+       << "Optimal fields order: \n";
+    for (const auto *FD : OptimalFieldsOrder)
+      Os << FD->getName() << ", \n";
+    Os << "consider reordering the fields or adding explicit padding "
+          "members.";
 
     PathDiagnosticLocation CELoc =
         PathDiagnosticLocation::create(RD, BR->getSourceManager());
-
     auto Report = llvm::make_unique<BugReport>(*PaddingBug, Os.str(), CELoc);
     Report->setDeclWithIssue(RD);
     Report->addRange(RD->getSourceRange());
-
     BR->emitReport(std::move(Report));
   }
 };
diff --git a/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp b/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
index df51188..8caf6df 100644
--- a/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
@@ -19,7 +19,6 @@
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
-#include "llvm/ADT/SmallVector.h"
 
 using namespace clang;
 using namespace ento;
diff --git a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
index 28a4a08..7ef79c6 100644
--- a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
@@ -18,7 +18,6 @@
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
-#include "llvm/ADT/ImmutableList.h"
 
 using namespace clang;
 using namespace ento;
diff --git a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp
index f983c30..6bb0727 100644
--- a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ClangSACheckers.h"
 #include "AllocationDiagnostics.h"
+#include "ClangSACheckers.h"
 #include "SelectorExtras.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
@@ -39,6 +39,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include <cstdarg>
+#include <utility>
 
 using namespace clang;
 using namespace ento;
@@ -1170,8 +1171,9 @@
         break;
       }
 
-      // For CoreGraphics ('CG') types.
-      if (cocoa::isRefType(RetTy, "CG", FName)) {
+      // For CoreGraphics ('CG') and CoreVideo ('CV') types.
+      if (cocoa::isRefType(RetTy, "CG", FName) ||
+          cocoa::isRefType(RetTy, "CV", FName)) {
         if (isRetain(FD, FName))
           S = getUnarySummary(FT, cfretain);
         else
@@ -2683,7 +2685,7 @@
 class StopTrackingCallback final : public SymbolVisitor {
   ProgramStateRef state;
 public:
-  StopTrackingCallback(ProgramStateRef st) : state(st) {}
+  StopTrackingCallback(ProgramStateRef st) : state(std::move(st)) {}
   ProgramStateRef getState() const { return state; }
 
   bool VisitSymbol(SymbolRef sym) override {
@@ -2832,14 +2834,6 @@
   C.addTransition(State);
 }
 
-static bool wasLoadedFromIvar(SymbolRef Sym) {
-  if (auto DerivedVal = dyn_cast<SymbolDerived>(Sym))
-    return isa<ObjCIvarRegion>(DerivedVal->getRegion());
-  if (auto RegionVal = dyn_cast<SymbolRegionValue>(Sym))
-    return isa<ObjCIvarRegion>(RegionVal->getRegion());
-  return false;
-}
-
 void RetainCountChecker::checkPostStmt(const ObjCIvarRefExpr *IRE,
                                        CheckerContext &C) const {
   Optional<Loc> IVarLoc = C.getSVal(IRE).getAs<Loc>();
@@ -2848,7 +2842,7 @@
 
   ProgramStateRef State = C.getState();
   SymbolRef Sym = State->getSVal(*IVarLoc).getAsSymbol();
-  if (!Sym || !wasLoadedFromIvar(Sym))
+  if (!Sym || !dyn_cast_or_null<ObjCIvarRegion>(Sym->getOriginRegion()))
     return;
 
   // Accessing an ivar directly is unusual. If we've done that, be more
@@ -3379,12 +3373,13 @@
     // Handle: id NSMakeCollectable(CFTypeRef)
     canEval = II->isStr("NSMakeCollectable");
   } else if (ResultTy->isPointerType()) {
-    // Handle: (CF|CG)Retain
+    // Handle: (CF|CG|CV)Retain
     //         CFAutorelease
     //         CFMakeCollectable
     // It's okay to be a little sloppy here (CGMakeCollectable doesn't exist).
     if (cocoa::isRefType(ResultTy, "CF", FName) ||
-        cocoa::isRefType(ResultTy, "CG", FName)) {
+        cocoa::isRefType(ResultTy, "CG", FName) ||
+        cocoa::isRefType(ResultTy, "CV", FName)) {
       canEval = isRetain(FD, FName) || isAutorelease(FD, FName) ||
                 isMakeCollectable(FD, FName);
     }
diff --git a/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp b/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp
index 62e6f02..ab4b4d3 100644
--- a/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp
@@ -20,6 +20,7 @@
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include <utility>
 
 using namespace clang;
 using namespace ento;
@@ -92,7 +93,7 @@
 class StopTrackingCallback final : public SymbolVisitor {
   ProgramStateRef state;
 public:
-  StopTrackingCallback(ProgramStateRef st) : state(st) {}
+  StopTrackingCallback(ProgramStateRef st) : state(std::move(st)) {}
   ProgramStateRef getState() const { return state; }
 
   bool VisitSymbol(SymbolRef sym) override {
diff --git a/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
index 79fc701..556274d 100644
--- a/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
@@ -236,7 +236,12 @@
     SmallString<512> buf;
     llvm::raw_svector_ostream os(buf);
     SourceRange range = genName(os, cb.V[i].second, Ctx.getASTContext());
-    os << " is still referred to by the global variable '";
+    os << " is still referred to by the ";
+    if (isa<StaticGlobalSpaceRegion>(cb.V[i].first->getMemorySpace()))
+      os << "static";
+    else
+      os << "global";
+    os << " variable '";
     const VarRegion *VR = cast<VarRegion>(cb.V[i].first->getBaseRegion());
     os << *VR->getDecl()
        << "' upon returning to the caller.  This will be a dangling reference";
diff --git a/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 82b01fe..915514b 100644
--- a/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -19,7 +19,6 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
-#include "llvm/ADT/ImmutableMap.h"
 
 using namespace clang;
 using namespace ento;
diff --git a/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp b/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
index ed17610..0a27429 100644
--- a/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
@@ -17,6 +17,7 @@
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include <utility>
 
 using namespace clang;
 using namespace ento;
@@ -31,7 +32,7 @@
     const LocationContext *LCtx;
 
     FindUndefExpr(ProgramStateRef S, const LocationContext *L)
-      : St(S), LCtx(L) {}
+        : St(std::move(S)), LCtx(L) {}
 
     const Expr *FindExpr(const Expr *Ex) {
       if (!MatchesCriteria(Ex))
diff --git a/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
index e3b2ed2..40217bd 100644
--- a/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
@@ -76,7 +76,6 @@
   report->addRange(SizeE->getSourceRange());
   bugreporter::trackNullOrUndefValue(N, SizeE, *report);
   C.emitReport(std::move(report));
-  return;
 }
 
 void VLASizeChecker::checkPreStmt(const DeclStmt *DS, CheckerContext &C) const {
diff --git a/lib/StaticAnalyzer/Checkers/VforkChecker.cpp b/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
index 26ffee8..75aefc0 100644
--- a/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
@@ -54,10 +54,10 @@
   bool isCallWhitelisted(const IdentifierInfo *II, CheckerContext &C) const;
 
   void reportBug(const char *What, CheckerContext &C,
-                 const char *Details = 0) const;
+                 const char *Details = nullptr) const;
 
 public:
-  VforkChecker() : II_vfork(0) {}
+  VforkChecker() : II_vfork(nullptr) {}
 
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
@@ -107,7 +107,7 @@
       "execv",
       "execvp",
       "execvpe",
-      0,
+      nullptr
     };
 
     ASTContext &AC = C.getASTContext();
diff --git a/lib/StaticAnalyzer/Core/BugReporter.cpp b/lib/StaticAnalyzer/Core/BugReporter.cpp
index f546a66..e04aa39 100644
--- a/lib/StaticAnalyzer/Core/BugReporter.cpp
+++ b/lib/StaticAnalyzer/Core/BugReporter.cpp
@@ -2922,7 +2922,7 @@
   while (true) {
     // Create the equivalent node in the new graph with the same state
     // and location.
-    ExplodedNode *NewN = GNew->getNode(OrigN->getLocation(), OrigN->getState(),
+    ExplodedNode *NewN = GNew->createUncachedNode(OrigN->getLocation(), OrigN->getState(),
                                        OrigN->isSink());
 
     // Store the mapping to the original node.
@@ -3104,6 +3104,7 @@
     R->addVisitor(llvm::make_unique<NilReceiverBRVisitor>());
     R->addVisitor(llvm::make_unique<ConditionBRVisitor>());
     R->addVisitor(llvm::make_unique<LikelyFalsePositiveSuppressionBRVisitor>());
+    R->addVisitor(llvm::make_unique<CXXSelfAssignmentBRVisitor>());
 
     BugReport::VisitorList visitors;
     unsigned origReportConfigToken, finalReportConfigToken;
diff --git a/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 360ae23..3b72244 100644
--- a/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -916,7 +916,7 @@
     if (PropRef && PropRef->isMessagingGetter()) {
       const Expr *GetterMessageSend =
           POE->getSemanticExpr(POE->getNumSemanticExprs() - 1);
-      assert(isa<ObjCMessageExpr>(GetterMessageSend->IgnoreParenCasts()));
+      assert(isa<ObjCMessageExpr>(GetterMessageSend));
       return peelOffOuterExpr(GetterMessageSend, N);
     }
   }
@@ -1693,3 +1693,56 @@
   }
   return nullptr;
 }
+
+/// Emits an "Assuming <param> ==/!= *this" event for the first block edge
+/// tagged "cplusplus.SelfAssignment"; returns nullptr in every other case.
+PathDiagnosticPiece *
+CXXSelfAssignmentBRVisitor::VisitNode(const ExplodedNode *Succ,
+                                      const ExplodedNode *Pred,
+                                      BugReporterContext &BRC, BugReport &BR) {
+  // After the first tagged edge has been seen, later nodes are ignored.
+  if (Satisfied)
+    return nullptr;
+
+  // Only block edges carrying the self-assignment tag are of interest.
+  auto Edge = Succ->getLocation().getAs<BlockEdge>();
+  if (!Edge.hasValue())
+    return nullptr;
+
+  const auto *Tag = Edge->getTag();
+  if (!Tag || Tag->getTagDescription() != "cplusplus.SelfAssignment")
+    return nullptr;
+
+  Satisfied = true;
+
+  // The tagged edge is expected to lie in a copy or move assignment operator.
+  const auto *Met =
+      dyn_cast<CXXMethodDecl>(Succ->getCodeDecl().getAsFunction());
+  assert(Met && "Not a C++ method.");
+  assert((Met->isCopyAssignmentOperator() || Met->isMoveAssignmentOperator()) &&
+         "Not a copy/move assignment operator.");
+
+  const auto *LCtx = Edge->getLocationContext();
+  const auto &State = Succ->getState();
+  auto &SVB = State->getStateManager().getSValBuilder();
+
+  // Values bound to the operator's first parameter and to 'this'.
+  const auto *Arg = Met->getParamDecl(0);
+  const auto Param = State->getSVal(State->getRegion(Arg, LCtx));
+  const auto This =
+      State->getSVal(SVB.getCXXThis(Met, LCtx->getCurrentStackFrame()));
+
+  // Without a valid source location for the method no piece is produced.
+  auto L = PathDiagnosticLocation::create(Met, BRC.getSourceManager());
+  if (!L.isValid() || !L.asLocation().isValid())
+    return nullptr;
+
+  SmallString<256> Buf;
+  llvm::raw_svector_ostream Out(Buf);
+  Out << "Assuming " << Arg->getName()
+      << ((Param == This) ? " == " : " != ") << "*this";
+
+  auto *Piece = new PathDiagnosticEventPiece(L, Out.str());
+  Piece->addRange(Met->getSourceRange());
+  return Piece;
+}
diff --git a/lib/StaticAnalyzer/Core/CallEvent.cpp b/lib/StaticAnalyzer/Core/CallEvent.cpp
index 6267758..bd47e89 100644
--- a/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -177,7 +177,7 @@
     // below for efficiency.
     if (PreserveArgs.count(Idx))
       if (const MemRegion *MR = getArgSVal(Idx).getAsRegion())
-        ETraits.setTrait(MR->StripCasts(),
+        ETraits.setTrait(MR->getBaseRegion(),
                         RegionAndSymbolInvalidationTraits::TK_PreserveContents);
         // TODO: Factor this out + handle the lower level const pointers.
 
@@ -552,7 +552,7 @@
 
       // FIXME: CallEvent maybe shouldn't be directly accessing StoreManager.
       bool Failed;
-      ThisVal = StateMgr.getStoreManager().evalDynamicCast(ThisVal, Ty, Failed);
+      ThisVal = StateMgr.getStoreManager().attemptDownCast(ThisVal, Ty, Failed);
       assert(!Failed && "Calling an incorrectly devirtualized method");
     }
 
diff --git a/lib/StaticAnalyzer/Core/CheckerHelpers.cpp b/lib/StaticAnalyzer/Core/CheckerHelpers.cpp
index d6aeceb..ed41914 100644
--- a/lib/StaticAnalyzer/Core/CheckerHelpers.cpp
+++ b/lib/StaticAnalyzer/Core/CheckerHelpers.cpp
@@ -75,8 +75,8 @@
 // Extract lhs and rhs from assignment statement
 std::pair<const clang::VarDecl *, const clang::Expr *>
 clang::ento::parseAssignment(const Stmt *S) {
-  const VarDecl *VD = 0;
-  const Expr *RHS = 0;
+  const VarDecl *VD = nullptr;
+  const Expr *RHS = nullptr;
 
   if (auto Assign = dyn_cast_or_null<BinaryOperator>(S)) {
     if (Assign->isAssignmentOp()) {
diff --git a/lib/StaticAnalyzer/Core/CheckerRegistry.cpp b/lib/StaticAnalyzer/Core/CheckerRegistry.cpp
index ba03e2f..c9cb189 100644
--- a/lib/StaticAnalyzer/Core/CheckerRegistry.cpp
+++ b/lib/StaticAnalyzer/Core/CheckerRegistry.cpp
@@ -175,3 +175,22 @@
     out << '\n';
   }
 }
+
+/// Prints the checkers selected by \p opts.
+///
+/// Each selected checker's FullName is written to \p out on its own line.
+void CheckerRegistry::printList(
+    raw_ostream &out, SmallVectorImpl<CheckerOptInfo> &opts) const {
+  // Sort the registered checkers with checkerNameLT before they are handed to
+  // collectCheckers() below.
+  std::sort(Checkers.begin(), Checkers.end(), checkerNameLT);
+
+  // Collect checkers enabled by the options.
+  CheckerInfoSet enabledCheckers;
+  for (auto &opt : opts)
+    collectCheckers(Checkers, Packages, opt, enabledCheckers);
+
+  // Emit them in the set's iteration order.
+  for (const auto &checker : enabledCheckers)
+    out << checker->FullName << '\n';
+}
diff --git a/lib/StaticAnalyzer/Core/CoreEngine.cpp b/lib/StaticAnalyzer/Core/CoreEngine.cpp
index c75fb2e..a0f994e 100644
--- a/lib/StaticAnalyzer/Core/CoreEngine.cpp
+++ b/lib/StaticAnalyzer/Core/CoreEngine.cpp
@@ -18,7 +18,6 @@
 #include "clang/AST/StmtCXX.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/Casting.h"
 
@@ -208,6 +207,11 @@
 
   // Check if we have a steps limit
   bool UnlimitedSteps = Steps == 0;
+  // Cap our pre-reservation in the event that the user specifies
+  // a very large number of maximum steps.
+  const unsigned PreReservationCap = 4000000;
+  if(!UnlimitedSteps)
+    G.reserve(std::min(Steps,PreReservationCap));
 
   while (WList->hasWork()) {
     if (!UnlimitedSteps) {
diff --git a/lib/StaticAnalyzer/Core/ExplodedGraph.cpp b/lib/StaticAnalyzer/Core/ExplodedGraph.cpp
index 8a09720..3bc8e09 100644
--- a/lib/StaticAnalyzer/Core/ExplodedGraph.cpp
+++ b/lib/StaticAnalyzer/Core/ExplodedGraph.cpp
@@ -17,11 +17,9 @@
 #include "clang/AST/Stmt.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include <vector>
 
 using namespace clang;
 using namespace ento;
@@ -336,6 +334,14 @@
   return V;
 }
 
+// Constructs a new node in memory obtained from getAllocator(). Nothing is
+// looked up or registered here; the node is simply built and returned.
+ExplodedNode *ExplodedGraph::createUncachedNode(const ProgramPoint &L,
+                                                ProgramStateRef State,
+                                                bool IsSink) {
+  return new (getAllocator().Allocate<NodeTy>()) NodeTy(L, State, IsSink);
+}
+
 std::unique_ptr<ExplodedGraph>
 ExplodedGraph::trim(ArrayRef<const NodeTy *> Sinks,
                     InterExplodedGraphMap *ForwardMap,
@@ -395,8 +401,7 @@
 
     // Create the corresponding node in the new graph and record the mapping
     // from the old node to the new node.
-    ExplodedNode *NewN = G->getNode(N->getLocation(), N->State, N->isSink(),
-                                    nullptr);
+    ExplodedNode *NewN = G->createUncachedNode(N->getLocation(), N->State, N->isSink());
     Pass2[N] = NewN;
 
     // Also record the reverse mapping from the new node to the old node.
diff --git a/lib/StaticAnalyzer/Core/ExprEngine.cpp b/lib/StaticAnalyzer/Core/ExprEngine.cpp
index c9db5ba..6ca24c7 100644
--- a/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -27,10 +27,9 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/LoopWidening.h"
-#include "llvm/ADT/ImmutableList.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/raw_ostream.h"
 
 #ifndef NDEBUG
 #include "llvm/Support/GraphWriter.h"
@@ -755,6 +754,7 @@
     // C++ and ARC stuff we don't support yet.
     case Expr::ObjCIndirectCopyRestoreExprClass:
     case Stmt::CXXDependentScopeMemberExprClass:
+    case Stmt::CXXInheritedCtorInitExprClass:
     case Stmt::CXXTryStmtClass:
     case Stmt::CXXTypeidExprClass:
     case Stmt::CXXUuidofExprClass:
@@ -831,12 +831,23 @@
     case Stmt::OMPAtomicDirectiveClass:
     case Stmt::OMPTargetDirectiveClass:
     case Stmt::OMPTargetDataDirectiveClass:
+    case Stmt::OMPTargetEnterDataDirectiveClass:
+    case Stmt::OMPTargetExitDataDirectiveClass:
+    case Stmt::OMPTargetParallelDirectiveClass:
+    case Stmt::OMPTargetParallelForDirectiveClass:
+    case Stmt::OMPTargetUpdateDirectiveClass:
     case Stmt::OMPTeamsDirectiveClass:
     case Stmt::OMPCancellationPointDirectiveClass:
     case Stmt::OMPCancelDirectiveClass:
     case Stmt::OMPTaskLoopDirectiveClass:
     case Stmt::OMPTaskLoopSimdDirectiveClass:
     case Stmt::OMPDistributeDirectiveClass:
+    case Stmt::OMPDistributeParallelForDirectiveClass:
+    case Stmt::OMPDistributeParallelForSimdDirectiveClass:
+    case Stmt::OMPDistributeSimdDirectiveClass:
+    case Stmt::OMPTargetParallelForSimdDirectiveClass:
+    case Stmt::OMPTargetSimdDirectiveClass:
+    case Stmt::OMPTeamsDistributeDirectiveClass:
       llvm_unreachable("Stmt should not be in analyzer evaluation loop");
 
     case Stmt::ObjCSubscriptRefExprClass:
@@ -906,6 +917,7 @@
     case Stmt::CXXScalarValueInitExprClass:
     case Stmt::CXXBoolLiteralExprClass:
     case Stmt::ObjCBoolLiteralExprClass:
+    case Stmt::ObjCAvailabilityCheckExprClass:
     case Stmt::FloatingLiteralClass:
     case Stmt::NoInitExprClass:
     case Stmt::SizeOfPackExprClass:
@@ -2510,26 +2522,10 @@
   // FIXME: Since we do not cache error nodes in ExprEngine now, this does not
   // work.
   static std::string getNodeAttributes(const ExplodedNode *N, void*) {
-
-#if 0
-      // FIXME: Replace with a general scheme to tell if the node is
-      // an error node.
-    if (GraphPrintCheckerState->isImplicitNullDeref(N) ||
-        GraphPrintCheckerState->isExplicitNullDeref(N) ||
-        GraphPrintCheckerState->isUndefDeref(N) ||
-        GraphPrintCheckerState->isUndefStore(N) ||
-        GraphPrintCheckerState->isUndefControlFlow(N) ||
-        GraphPrintCheckerState->isUndefResult(N) ||
-        GraphPrintCheckerState->isBadCall(N) ||
-        GraphPrintCheckerState->isUndefArg(N))
-      return "color=\"red\",style=\"filled\"";
-
-    if (GraphPrintCheckerState->isNoReturnCall(N))
-      return "color=\"blue\",style=\"filled\"";
-#endif
     return "";
   }
 
+  // De-duplicate some source location pretty-printing.
   static void printLocation(raw_ostream &Out, SourceLocation SLoc) {
     if (SLoc.isFileID()) {
       Out << "\\lline="
@@ -2539,6 +2535,12 @@
         << "\\l";
     }
   }
+  // Writes "line N" for file locations in the main file, else SLoc.print().
+  static void printLocation2(raw_ostream &Out, SourceLocation SLoc) {
+    if (!SLoc.isFileID() || !GraphPrintSourceManager->isInMainFile(SLoc))
+      return SLoc.print(Out, *GraphPrintSourceManager);
+    Out << "line " << GraphPrintSourceManager->getExpansionLineNumber(SLoc);
+  }
 
   static std::string getNodeLabel(const ExplodedNode *N, void*){
 
@@ -2552,12 +2554,6 @@
       case ProgramPoint::BlockEntranceKind: {
         Out << "Block Entrance: B"
             << Loc.castAs<BlockEntrance>().getBlock()->getBlockID();
-        if (const NamedDecl *ND =
-                    dyn_cast<NamedDecl>(Loc.getLocationContext()->getDecl())) {
-          Out << " (";
-          ND->printName(Out);
-          Out << ")";
-        }
         break;
       }
 
@@ -2682,13 +2678,6 @@
           Out << "\\l";
         }
 
-#if 0
-          // FIXME: Replace with a general scheme to determine
-          // the name of the check.
-        if (GraphPrintCheckerState->isUndefControlFlow(N)) {
-          Out << "\\|Control-flow based on\\lUndefined value.\\l";
-        }
-#endif
         break;
       }
 
@@ -2710,27 +2699,6 @@
         else if (Loc.getAs<PostLValue>())
           Out << "\\lPostLValue\\l";
 
-#if 0
-          // FIXME: Replace with a general scheme to determine
-          // the name of the check.
-        if (GraphPrintCheckerState->isImplicitNullDeref(N))
-          Out << "\\|Implicit-Null Dereference.\\l";
-        else if (GraphPrintCheckerState->isExplicitNullDeref(N))
-          Out << "\\|Explicit-Null Dereference.\\l";
-        else if (GraphPrintCheckerState->isUndefDeref(N))
-          Out << "\\|Dereference of undefialied value.\\l";
-        else if (GraphPrintCheckerState->isUndefStore(N))
-          Out << "\\|Store to Undefined Loc.";
-        else if (GraphPrintCheckerState->isUndefResult(N))
-          Out << "\\|Result of operation is undefined.";
-        else if (GraphPrintCheckerState->isNoReturnCall(N))
-          Out << "\\|Call to function marked \"noreturn\".";
-        else if (GraphPrintCheckerState->isBadCall(N))
-          Out << "\\|Call to NULL/Undefined.";
-        else if (GraphPrintCheckerState->isUndefArg(N))
-          Out << "\\|Argument in call is undefined";
-#endif
-
         break;
       }
     }
@@ -2738,6 +2706,40 @@
     ProgramStateRef state = N->getState();
     Out << "\\|StateID: " << (const void*) state.get()
         << " NodeID: " << (const void*) N << "\\|";
+
+    // Analysis stack backtrace.
+    Out << "Location context stack (from current to outer):\\l";
+    const LocationContext *LC = Loc.getLocationContext();
+    unsigned Idx = 0;
+    for (; LC; LC = LC->getParent(), ++Idx) {
+      Out << Idx << ". (" << (const void *)LC << ") ";
+      switch (LC->getKind()) {
+      case LocationContext::StackFrame:
+        if (const NamedDecl *D = dyn_cast<NamedDecl>(LC->getDecl()))
+          Out << "Calling " << D->getQualifiedNameAsString();
+        else
+          Out << "Calling anonymous code";
+        if (const Stmt *S = cast<StackFrameContext>(LC)->getCallSite()) {
+          Out << " at ";
+          printLocation2(Out, S->getLocStart());
+        }
+        break;
+      case LocationContext::Block:
+        Out << "Invoking block";
+        if (const Decl *D = cast<BlockInvocationContext>(LC)->getBlockDecl()) {
+          Out << " defined at ";
+          printLocation2(Out, D->getLocStart());
+        }
+        break;
+      case LocationContext::Scope:
+        Out << "Entering scope";
+        // FIXME: Add more info once ScopeContext is activated.
+        break;
+      }
+      Out << "\\l";
+    }
+    Out << "\\l";
+
     state->printDOT(Out);
 
     Out << "\\l";
diff --git a/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/lib/StaticAnalyzer/Core/ExprEngineC.cpp
index 175225b..0f40739 100644
--- a/lib/StaticAnalyzer/Core/ExprEngineC.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngineC.cpp
@@ -341,6 +341,7 @@
       case CK_AnyPointerToBlockPointerCast:
       case CK_ObjCObjectLValueCast:
       case CK_ZeroToOCLEvent:
+      case CK_IntToOCLSampler:
       case CK_LValueBitCast: {
         // Delegate to SValBuilder to process.
         SVal V = state->getSVal(Ex, LCtx);
@@ -385,7 +386,7 @@
           Failed = true;
         // Else, evaluate the cast.
         else
-          val = getStoreManager().evalDynamicCast(val, T, Failed);
+          val = getStoreManager().attemptDownCast(val, T, Failed);
 
         if (Failed) {
           if (T->isReferenceType()) {
@@ -411,6 +412,28 @@
         Bldr.generateNode(CastE, Pred, state);
         continue;
       }
+      case CK_BaseToDerived: {
+        SVal val = state->getSVal(Ex, LCtx);
+        QualType resultType = CastE->getType();
+        if (CastE->isGLValue())
+          resultType = getContext().getPointerType(resultType);
+
+        bool Failed = false;
+
+        if (!val.isConstant()) {
+          val = getStoreManager().attemptDownCast(val, T, Failed);
+        }
+
+        // Failed to cast or the result is unknown, fall back to conservative.
+        if (Failed || val.isUnknown()) {
+          val =
+            svalBuilder.conjureSymbolVal(nullptr, CastE, LCtx, resultType,
+                                         currBldrCtx->blockCount());
+        }
+        state = state->BindExpr(CastE, LCtx, val);
+        Bldr.generateNode(CastE, Pred, state);
+        continue;
+      }
       case CK_NullToMemberPointer: {
         // FIXME: For now, member pointers are represented by void *.
         SVal V = svalBuilder.makeNull();
@@ -420,7 +443,6 @@
       }
       // Various C++ casts that are not handled yet.
       case CK_ToUnion:
-      case CK_BaseToDerived:
       case CK_BaseToDerivedMemberPointer:
       case CK_DerivedToBaseMemberPointer:
       case CK_ReinterpretMemberPointer:
diff --git a/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp b/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
index 3b504bb..39d88bf 100644
--- a/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
@@ -382,7 +382,6 @@
     }
     LCtx = LCtx->getParent();
   }
-
 }
 
 // The GDM component containing the dynamic dispatch bifurcation info. When
@@ -396,7 +395,8 @@
     DynamicDispatchModeInlined = 1,
     DynamicDispatchModeConservative
   };
-}
+} // end anonymous namespace
+
 REGISTER_TRAIT_WITH_PROGRAMSTATE(DynamicDispatchBifurcationMap,
                                  CLANG_ENTO_PROGRAMSTATE_MAP(const MemRegion *,
                                                              unsigned))
@@ -429,7 +429,6 @@
                              currBldrCtx->getBlock(),
                              currStmtIdx);
 
-
   CallEnter Loc(CallE, CalleeSFC, CurLC);
 
   // Construct a new state which contains the mapping from actual to
@@ -766,7 +765,6 @@
       if (!Opts.mayInlineCXXSharedPtrDtor())
         if (isCXXSharedPtrDtor(FD))
           return false;
-
     }
   }
 
@@ -976,13 +974,10 @@
   conservativeEvalCall(Call, Bldr, Pred, NoIState);
 
   NumOfDynamicDispatchPathSplits++;
-  return;
 }
 
-
 void ExprEngine::VisitReturnStmt(const ReturnStmt *RS, ExplodedNode *Pred,
                                  ExplodedNodeSet &Dst) {
-
   ExplodedNodeSet dstPreVisit;
   getCheckerManager().runCheckersForPreStmt(dstPreVisit, Pred, RS, *this);
 
diff --git a/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
index b3edb85..3a18956 100644
--- a/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
+++ b/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
@@ -412,13 +412,13 @@
   // Output a maximum size.
   if (!isa<PathDiagnosticMacroPiece>(P)) {
     // Get the string and determining its maximum substring.
-    const std::string& Msg = P.getString();
+    const auto &Msg = P.getString();
     unsigned max_token = 0;
     unsigned cnt = 0;
     unsigned len = Msg.size();
 
-    for (std::string::const_iterator I=Msg.begin(), E=Msg.end(); I!=E; ++I)
-      switch (*I) {
+    for (char C : Msg)
+      switch (C) {
       default:
         ++cnt;
         continue;
diff --git a/lib/StaticAnalyzer/Core/IssueHash.cpp b/lib/StaticAnalyzer/Core/IssueHash.cpp
index bd5c811..abdea88 100644
--- a/lib/StaticAnalyzer/Core/IssueHash.cpp
+++ b/lib/StaticAnalyzer/Core/IssueHash.cpp
@@ -13,7 +13,6 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Specifiers.h"
 #include "clang/Lex/Lexer.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
diff --git a/lib/StaticAnalyzer/Core/Makefile b/lib/StaticAnalyzer/Core/Makefile
deleted file mode 100644
index c3e00fa..0000000
--- a/lib/StaticAnalyzer/Core/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-##===- clang/lib/StaticAnalyzer/Core/Makefile --------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# This implements analyses built on top of source-level CFGs.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../../..
-LIBRARYNAME := clangStaticAnalyzerCore
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/StaticAnalyzer/Core/MemRegion.cpp b/lib/StaticAnalyzer/Core/MemRegion.cpp
index 11dfb9d..c4ba2ae 100644
--- a/lib/StaticAnalyzer/Core/MemRegion.cpp
+++ b/lib/StaticAnalyzer/Core/MemRegion.cpp
@@ -31,29 +31,6 @@
 // MemRegion Construction.
 //===----------------------------------------------------------------------===//
 
-template<typename RegionTy> struct MemRegionManagerTrait;
-
-template <typename RegionTy, typename A1>
-RegionTy* MemRegionManager::getRegion(const A1 a1) {
-
-  const typename MemRegionManagerTrait<RegionTy>::SuperRegionTy *superRegion =
-  MemRegionManagerTrait<RegionTy>::getSuperRegion(*this, a1);
-
-  llvm::FoldingSetNodeID ID;
-  RegionTy::ProfileRegion(ID, a1, superRegion);
-  void *InsertPos;
-  RegionTy* R = cast_or_null<RegionTy>(Regions.FindNodeOrInsertPos(ID,
-                                                                   InsertPos));
-
-  if (!R) {
-    R = A.Allocate<RegionTy>();
-    new (R) RegionTy(a1, superRegion);
-    Regions.InsertNode(R, InsertPos);
-  }
-
-  return R;
-}
-
 template <typename RegionTy, typename A1>
 RegionTy* MemRegionManager::getSubRegion(const A1 a1,
                                          const MemRegion *superRegion) {
@@ -73,30 +50,8 @@
 }
 
 template <typename RegionTy, typename A1, typename A2>
-RegionTy* MemRegionManager::getRegion(const A1 a1, const A2 a2) {
-
-  const typename MemRegionManagerTrait<RegionTy>::SuperRegionTy *superRegion =
-  MemRegionManagerTrait<RegionTy>::getSuperRegion(*this, a1, a2);
-
-  llvm::FoldingSetNodeID ID;
-  RegionTy::ProfileRegion(ID, a1, a2, superRegion);
-  void *InsertPos;
-  RegionTy* R = cast_or_null<RegionTy>(Regions.FindNodeOrInsertPos(ID,
-                                                                   InsertPos));
-
-  if (!R) {
-    R = A.Allocate<RegionTy>();
-    new (R) RegionTy(a1, a2, superRegion);
-    Regions.InsertNode(R, InsertPos);
-  }
-
-  return R;
-}
-
-template <typename RegionTy, typename A1, typename A2>
 RegionTy* MemRegionManager::getSubRegion(const A1 a1, const A2 a2,
                                          const MemRegion *superRegion) {
-
   llvm::FoldingSetNodeID ID;
   RegionTy::ProfileRegion(ID, a1, a2, superRegion);
   void *InsertPos;
@@ -115,7 +70,6 @@
 template <typename RegionTy, typename A1, typename A2, typename A3>
 RegionTy* MemRegionManager::getSubRegion(const A1 a1, const A2 a2, const A3 a3,
                                          const MemRegion *superRegion) {
-
   llvm::FoldingSetNodeID ID;
   RegionTy::ProfileRegion(ID, a1, a2, a3, superRegion);
   void *InsertPos;
@@ -582,12 +536,10 @@
   os << "'";
   printPrettyAsExpr(os);
   os << "'";
-  return;
 }
 
 void MemRegion::printPrettyAsExpr(raw_ostream &os) const {
   llvm_unreachable("This region cannot be printed pretty.");
-  return;
 }
 
 bool VarRegion::canPrintPrettyAsExpr() const {
@@ -628,7 +580,6 @@
   } else {
     os << "field " << "\'" << getDecl()->getName() << "'";
   }
-  return;
 }
 
 bool CXXBaseObjectRegion::canPrintPrettyAsExpr() const {
@@ -639,6 +590,65 @@
   superRegion->printPrettyAsExpr(os);
 }
 
+// Builds a printable name for this region: the pretty-printed expression
+// of the first region in the super-region chain that is not an element
+// region, followed by the concrete array indices collected on the way (the
+// index of this region itself comes last). With UseQuotes the result is
+// wrapped in single quotes. Returns an empty string when the base region
+// cannot be pretty-printed or when an array index is not a concrete integer.
+std::string MemRegion::getDescriptiveName(bool UseQuotes) const {
+  std::string ArrayIndices;
+  const MemRegion *R = this;
+  SmallString<50> buf;
+  llvm::raw_svector_ostream os(buf);
+
+  // Walk outwards through the element regions, prepending each index so
+  // that indices of enclosing regions appear before those of nested ones.
+  const ElementRegion *ER = nullptr;
+  while ((ER = R->getAs<ElementRegion>())) {
+    auto CI = ER->getIndex().getAs<nonloc::ConcreteInt>();
+    // A non-concrete index cannot be spelled out here. Asking ER itself
+    // for its descriptive name re-enters this very branch with the same
+    // region and never terminates, so report that no name is available.
+    if (!CI)
+      return std::string();
+
+    llvm::SmallString<2> Idx;
+    CI->getValue().toString(Idx);
+    ArrayIndices = (llvm::Twine("[") + Idx.str() + "]" + ArrayIndices).str();
+    R = ER->getSuperRegion();
+  }
+
+  // Without a printable base region there is no name to report.
+  if (!R || !R->canPrintPrettyAsExpr())
+    return std::string();
+
+  // Emit the base expression, then append the collected indices.
+  R->printPrettyAsExpr(os);
+  if (UseQuotes)
+    return (llvm::Twine("'") + os.str() + ArrayIndices + "'").str();
+  return (llvm::Twine(os.str()) + ArrayIndices).str();
+}
+
+// Returns the source range of the declaration this region refers to: the
+// field declaration when this region is a FieldRegion, otherwise the
+// variable declaration when the base region is a VarRegion. Any other
+// region yields a default-constructed SourceRange (intended as an invalid
+// range that callers can check for).
+SourceRange MemRegion::sourceRange() const {
+  // Resolve the variable at the base of this region first.
+  const auto *BaseVar = dyn_cast<VarRegion>(getBaseRegion());
+
+  // A field is the more specific match, so it takes precedence.
+  if (const auto *Field = dyn_cast<FieldRegion>(this))
+    return Field->getDecl()->getSourceRange();
+
+  if (BaseVar)
+    return BaseVar->getDecl()->getSourceRange();
+
+  return SourceRange{};
+}
+
 //===----------------------------------------------------------------------===//
 // MemRegionManager methods.
 //===----------------------------------------------------------------------===//
@@ -892,7 +902,6 @@
 const CompoundLiteralRegion*
 MemRegionManager::getCompoundLiteralRegion(const CompoundLiteralExpr *CL,
                                            const LocationContext *LC) {
-
   const MemRegion *sReg = nullptr;
 
   if (CL->isFileScope())
@@ -910,7 +919,6 @@
 MemRegionManager::getElementRegion(QualType elementType, NonLoc Idx,
                                    const MemRegion* superRegion,
                                    ASTContext &Ctx){
-
   QualType T = Ctx.getCanonicalType(elementType).getUnqualifiedType();
 
   llvm::FoldingSetNodeID ID;
diff --git a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
index 504df30..217d628 100644
--- a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
+++ b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
@@ -61,7 +61,6 @@
 PathDiagnosticControlFlowPiece::~PathDiagnosticControlFlowPiece() {}
 PathDiagnosticMacroPiece::~PathDiagnosticMacroPiece() {}
 
-
 void PathPieces::flattenTo(PathPieces &Primary, PathPieces &Current,
                            bool ShouldFlattenMacros) const {
   for (PathPieces::const_iterator I = begin(), E = end(); I != E; ++I) {
@@ -102,7 +101,6 @@
   }
 }
 
-
 PathDiagnostic::~PathDiagnostic() {}
 
 PathDiagnostic::PathDiagnostic(StringRef CheckName, const Decl *declWithIssue,
@@ -278,6 +276,7 @@
 }
 
 static Optional<bool> comparePath(const PathPieces &X, const PathPieces &Y);
+
 static Optional<bool>
 compareControlFlow(const PathDiagnosticControlFlowPiece &X,
                    const PathDiagnosticControlFlowPiece &Y) {
@@ -505,7 +504,6 @@
   // S might be a temporary statement that does not have a location in the
   // source code, so find an enclosing statement and use its location.
   if (!L.isValid()) {
-
     AnalysisDeclContext *ADC;
     if (LAC.is<const LocationContext*>())
       ADC = LAC.get<const LocationContext*>()->getAnalysisDeclContext();
@@ -578,22 +576,20 @@
   llvm_unreachable("Unknown CFGElement kind");
 }
 
-
 PathDiagnosticLocation
-  PathDiagnosticLocation::createBegin(const Decl *D,
-                                      const SourceManager &SM) {
+PathDiagnosticLocation::createBegin(const Decl *D,
+                                    const SourceManager &SM) {
   return PathDiagnosticLocation(D->getLocStart(), SM, SingleLocK);
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createBegin(const Stmt *S,
-                                      const SourceManager &SM,
-                                      LocationOrAnalysisDeclContext LAC) {
+PathDiagnosticLocation::createBegin(const Stmt *S,
+                                    const SourceManager &SM,
+                                    LocationOrAnalysisDeclContext LAC) {
   return PathDiagnosticLocation(getValidSourceLocation(S, LAC),
                                 SM, SingleLocK);
 }
 
-
 PathDiagnosticLocation
 PathDiagnosticLocation::createEnd(const Stmt *S,
                                   const SourceManager &SM,
@@ -605,13 +601,13 @@
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createOperatorLoc(const BinaryOperator *BO,
-                                            const SourceManager &SM) {
+PathDiagnosticLocation::createOperatorLoc(const BinaryOperator *BO,
+                                          const SourceManager &SM) {
   return PathDiagnosticLocation(BO->getOperatorLoc(), SM, SingleLocK);
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createConditionalColonLoc(
+PathDiagnosticLocation::createConditionalColonLoc(
                                             const ConditionalOperator *CO,
                                             const SourceManager &SM) {
   return PathDiagnosticLocation(CO->getColonLoc(), SM, SingleLocK);
@@ -619,28 +615,28 @@
 
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createMemberLoc(const MemberExpr *ME,
-                                          const SourceManager &SM) {
+PathDiagnosticLocation::createMemberLoc(const MemberExpr *ME,
+                                        const SourceManager &SM) {
   return PathDiagnosticLocation(ME->getMemberLoc(), SM, SingleLocK);
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createBeginBrace(const CompoundStmt *CS,
-                                           const SourceManager &SM) {
+PathDiagnosticLocation::createBeginBrace(const CompoundStmt *CS,
+                                         const SourceManager &SM) {
   SourceLocation L = CS->getLBracLoc();
   return PathDiagnosticLocation(L, SM, SingleLocK);
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createEndBrace(const CompoundStmt *CS,
-                                         const SourceManager &SM) {
+PathDiagnosticLocation::createEndBrace(const CompoundStmt *CS,
+                                       const SourceManager &SM) {
   SourceLocation L = CS->getRBracLoc();
   return PathDiagnosticLocation(L, SM, SingleLocK);
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createDeclBegin(const LocationContext *LC,
-                                          const SourceManager &SM) {
+PathDiagnosticLocation::createDeclBegin(const LocationContext *LC,
+                                        const SourceManager &SM) {
   // FIXME: Should handle CXXTryStmt if analyser starts supporting C++.
   if (const CompoundStmt *CS =
         dyn_cast_or_null<CompoundStmt>(LC->getDecl()->getBody()))
@@ -653,16 +649,15 @@
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::createDeclEnd(const LocationContext *LC,
-                                        const SourceManager &SM) {
+PathDiagnosticLocation::createDeclEnd(const LocationContext *LC,
+                                      const SourceManager &SM) {
   SourceLocation L = LC->getDecl()->getBodyRBrace();
   return PathDiagnosticLocation(L, SM, SingleLocK);
 }
 
 PathDiagnosticLocation
-  PathDiagnosticLocation::create(const ProgramPoint& P,
-                                 const SourceManager &SMng) {
-
+PathDiagnosticLocation::create(const ProgramPoint& P,
+                               const SourceManager &SMng) {
   const Stmt* S = nullptr;
   if (Optional<BlockEdge> BE = P.getAs<BlockEdge>()) {
     const CFGBlock *BSrc = BE->getSrc();
@@ -1062,7 +1057,6 @@
   ID.AddInteger(Range.getBegin().getRawEncoding());
   ID.AddInteger(Range.getEnd().getRawEncoding());
   ID.AddInteger(Loc.getRawEncoding());
-  return;
 }
 
 void PathDiagnosticPiece::Profile(llvm::FoldingSetNodeID &ID) const {
diff --git a/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp b/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp
index 55e1222..8f5337e 100644
--- a/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp
+++ b/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp
@@ -19,7 +19,6 @@
 #include "clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h"
 #include "clang/StaticAnalyzer/Core/IssueHash.h"
 #include "clang/StaticAnalyzer/Core/PathDiagnosticConsumers.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 using namespace clang;
@@ -124,7 +123,7 @@
   --indent;
 
   // Output any helper text.
-  const std::string& s = P.getString();
+  const auto &s = P.getString();
   if (!s.empty()) {
     Indent(o, indent) << "<key>alternate</key>";
     EmitString(o, s) << '\n';
diff --git a/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 77b0ad3..4c98f25 100644
--- a/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -18,7 +18,6 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ImmutableSet.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
diff --git a/lib/StaticAnalyzer/Core/RegionStore.cpp b/lib/StaticAnalyzer/Core/RegionStore.cpp
index a63f6e4..5de3af9 100644
--- a/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -14,6 +14,7 @@
 // parameters are created lazily.
 //
 //===----------------------------------------------------------------------===//
+
 #include "clang/AST/Attr.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/Analysis/Analyses/LiveVariables.h"
@@ -25,10 +26,10 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
-#include "llvm/ADT/ImmutableList.h"
 #include "llvm/ADT/ImmutableMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/raw_ostream.h"
+#include <utility>
 
 using namespace clang;
 using namespace ento;
@@ -665,10 +666,9 @@
 
 public:
   ClusterAnalysis(RegionStoreManager &rm, ProgramStateManager &StateMgr,
-                  RegionBindingsRef b )
-    : RM(rm), Ctx(StateMgr.getContext()),
-      svalBuilder(StateMgr.getSValBuilder()),
-      B(b) {}
+                  RegionBindingsRef b)
+      : RM(rm), Ctx(StateMgr.getContext()),
+        svalBuilder(StateMgr.getSValBuilder()), B(std::move(b)) {}
 
   RegionBindingsRef getRegionBindings() const { return B; }
 
@@ -1130,11 +1130,10 @@
         // Check offset is not symbolic and within array's boundaries.
         // Handles arrays of 0 elements and of 0-sized elements as well.
         if (!ROffset ||
-            (ROffset &&
-             ((*ROffset >= LowerOffset && *ROffset < UpperOffset) ||
-              (UpperOverflow &&
-               (*ROffset >= LowerOffset || *ROffset < UpperOffset)) ||
-              (LowerOffset == UpperOffset && *ROffset == LowerOffset)))) {
+            ((*ROffset >= LowerOffset && *ROffset < UpperOffset) ||
+             (UpperOverflow &&
+              (*ROffset >= LowerOffset || *ROffset < UpperOffset)) ||
+             (LowerOffset == UpperOffset && *ROffset == LowerOffset))) {
           B = B.removeBinding(I.getKey());
           // Bound symbolic regions need to be invalidated for dead symbol
           // detection.
diff --git a/lib/StaticAnalyzer/Core/Store.cpp b/lib/StaticAnalyzer/Core/Store.cpp
index de29f0e..aca6e3b 100644
--- a/lib/StaticAnalyzer/Core/Store.cpp
+++ b/lib/StaticAnalyzer/Core/Store.cpp
@@ -292,7 +292,7 @@
   return nullptr;
 }
 
-SVal StoreManager::evalDynamicCast(SVal Base, QualType TargetType,
+SVal StoreManager::attemptDownCast(SVal Base, QualType TargetType,
                                    bool &Failed) {
   Failed = false;
 
diff --git a/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
index 083ee51..2faf62e 100644
--- a/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
+++ b/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
@@ -13,17 +13,14 @@
 
 #include "clang/StaticAnalyzer/Frontend/AnalysisConsumer.h"
 #include "ModelInjector.h"
-#include "clang/AST/ASTConsumer.h"
-#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
-#include "clang/AST/ParentMap.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Analysis/Analyses/LiveVariables.h"
 #include "clang/Analysis/CFG.h"
 #include "clang/Analysis/CallGraph.h"
 #include "clang/Analysis/CodeInjector.h"
-#include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Lex/Preprocessor.h"
@@ -36,9 +33,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
 #include "clang/StaticAnalyzer/Frontend/CheckerRegistration.h"
-#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -47,10 +42,10 @@
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 #include <queue>
+#include <utility>
 
 using namespace clang;
 using namespace ento;
-using llvm::SmallPtrSet;
 
 #define DEBUG_TYPE "AnalysisConsumer"
 
@@ -185,13 +180,12 @@
   /// translation unit.
   FunctionSummariesTy FunctionSummaries;
 
-  AnalysisConsumer(const Preprocessor& pp,
-                   const std::string& outdir,
-                   AnalyzerOptionsRef opts,
-                   ArrayRef<std::string> plugins,
+  AnalysisConsumer(const Preprocessor &pp, const std::string &outdir,
+                   AnalyzerOptionsRef opts, ArrayRef<std::string> plugins,
                    CodeInjector *injector)
-    : RecVisitorMode(0), RecVisitorBR(nullptr), Ctx(nullptr), PP(pp),
-      OutDir(outdir), Opts(opts), Plugins(plugins), Injector(injector) {
+      : RecVisitorMode(0), RecVisitorBR(nullptr), Ctx(nullptr), PP(pp),
+        OutDir(outdir), Opts(std::move(opts)), Plugins(plugins),
+        Injector(injector) {
     DigestAnalyzerOptions();
     if (Opts->PrintStats) {
       llvm::EnableStatistics();
@@ -271,19 +265,8 @@
       else
         assert(Mode == (AM_Syntax | AM_Path) && "Unexpected mode!");
 
-      llvm::errs() << ": " << Loc.getFilename();
-      if (isa<FunctionDecl>(D) || isa<ObjCMethodDecl>(D)) {
-        const NamedDecl *ND = cast<NamedDecl>(D);
-        llvm::errs() << ' ' << ND->getQualifiedNameAsString() << '\n';
-      }
-      else if (isa<BlockDecl>(D)) {
-        llvm::errs() << ' ' << "block(line:" << Loc.getLine() << ",col:"
-                     << Loc.getColumn() << '\n';
-      }
-      else if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) {
-        Selector S = MD->getSelector();
-        llvm::errs() << ' ' << S.getAsString();
-      }
+      llvm::errs() << ": " << Loc.getFilename() << ' '
+                           << getFunctionName(D) << '\n';
     }
   }
 
@@ -383,6 +366,7 @@
 
 private:
   void storeTopLevelDecls(DeclGroupRef DG);
+  std::string getFunctionName(const Decl *D);
 
   /// \brief Check if we should skip (not analyze) the given function.
   AnalysisMode getModeForDecl(Decl *D, AnalysisMode Mode);
@@ -432,6 +416,13 @@
   //   Count naming convention errors more aggressively.
   if (isa<ObjCMethodDecl>(D))
     return false;
+  // We also want to reanalyze all C++ copy and move assignment operators to
+  // separately check the two cases where 'this' aliases with the parameter and
+  // where it may not. (cplusplus.SelfAssignmentChecker)
+  if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) {
+    if (MD->isCopyAssignmentOperator() || MD->isMoveAssignmentOperator())
+      return false;
+  }
 
   // Otherwise, if we visited the function before, do not reanalyze it.
   return Visited.count(D);
@@ -443,9 +434,7 @@
   // We want to reanalyze all ObjC methods as top level to report Retain
   // Count naming convention errors more aggressively. But we should tune down
   // inlining when reanalyzing an already inlined function.
-  if (Visited.count(D)) {
-    assert(isa<ObjCMethodDecl>(D) &&
-           "We are only reanalyzing ObjCMethods.");
+  if (Visited.count(D) && isa<ObjCMethodDecl>(D)) {
     const ObjCMethodDecl *ObjCM = cast<ObjCMethodDecl>(D);
     if (ObjCM->getMethodFamily() != OMF_init)
       return ExprEngine::Inline_Minimal;
@@ -569,16 +558,64 @@
 
 }
 
-static std::string getFunctionName(const Decl *D) {
-  if (const ObjCMethodDecl *ID = dyn_cast<ObjCMethodDecl>(D)) {
-    return ID->getSelector().getAsString();
+std::string AnalysisConsumer::getFunctionName(const Decl *D) {
+  std::string Str;
+  llvm::raw_string_ostream OS(Str);
+
+  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
+    OS << FD->getQualifiedNameAsString();
+
+    // In C++, there are overloads.
+    if (Ctx->getLangOpts().CPlusPlus) {
+      OS << '(';
+      for (const auto &P : FD->parameters()) {
+        if (P != *FD->param_begin())
+          OS << ", ";
+        OS << P->getType().getAsString();
+      }
+      OS << ')';
+    }
+
+  } else if (isa<BlockDecl>(D)) {
+    PresumedLoc Loc = Ctx->getSourceManager().getPresumedLoc(D->getLocation());
+
+    if (Loc.isValid()) {
+      OS << "block (line: " << Loc.getLine() << ", col: " << Loc.getColumn()
+         << ')';
+    }
+
+  } else if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D)) {
+
+    // FIXME: copy-pasted from CGDebugInfo.cpp.
+    OS << (OMD->isInstanceMethod() ? '-' : '+') << '[';
+    const DeclContext *DC = OMD->getDeclContext();
+    if (const auto *OID = dyn_cast<ObjCImplementationDecl>(DC)) {
+      OS << OID->getName();
+    } else if (const auto *OID = dyn_cast<ObjCInterfaceDecl>(DC)) {
+      OS << OID->getName();
+    } else if (const auto *OC = dyn_cast<ObjCCategoryDecl>(DC)) {
+      if (OC->IsClassExtension()) {
+        OS << OC->getClassInterface()->getName();
+      } else {
+        OS << OC->getIdentifier()->getNameStart() << '('
+           << OC->getIdentifier()->getNameStart() << ')';
+      }
+    } else if (const auto *OCD = dyn_cast<ObjCCategoryImplDecl>(DC)) {
+      OS << ((const NamedDecl *)OCD)->getIdentifier()->getNameStart() << '('
+         << OCD->getIdentifier()->getNameStart() << ')';
+    } else if (isa<ObjCProtocolDecl>(DC)) {
+      // We can extract the type of the class from the self pointer.
+      if (ImplicitParamDecl *SelfDecl = OMD->getSelfDecl()) {
+        QualType ClassTy =
+            cast<ObjCObjectPointerType>(SelfDecl->getType())->getPointeeType();
+        ClassTy.print(OS, PrintingPolicy(LangOptions()));
+      }
+    }
+    OS << ' ' << OMD->getSelector().getAsString() << ']';
+
   }
-  if (const FunctionDecl *ND = dyn_cast<FunctionDecl>(D)) {
-    IdentifierInfo *II = ND->getIdentifier();
-    if (II)
-      return II->getName();
-  }
-  return "";
+
+  return OS.str();
 }
 
 AnalysisConsumer::AnalysisMode
@@ -799,10 +836,7 @@
   std::string Ubiviz;
   if (auto Path = llvm::sys::findProgramByName("ubiviz"))
     Ubiviz = *Path;
-  std::vector<const char*> args;
-  args.push_back(Ubiviz.c_str());
-  args.push_back(Filename.c_str());
-  args.push_back(nullptr);
+  const char *args[] = {Ubiviz.c_str(), Filename.c_str(), nullptr};
 
   if (llvm::sys::ExecuteAndWait(Ubiviz, &args[0], nullptr, nullptr, 0, 0,
                                 &ErrMsg)) {
diff --git a/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp b/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp
index 75fa4c6..1668aeb 100644
--- a/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp
+++ b/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp
@@ -101,6 +101,16 @@
       << pluginAPIVersion;
 }
 
+static SmallVector<CheckerOptInfo, 8>
+getCheckerOptList(const AnalyzerOptions &opts) {
+  SmallVector<CheckerOptInfo, 8> checkerOpts;
+  for (unsigned i = 0, e = opts.CheckersControlList.size(); i != e; ++i) {
+    const std::pair<std::string, bool> &opt = opts.CheckersControlList[i];
+    checkerOpts.push_back(CheckerOptInfo(opt.first.c_str(), opt.second));
+  }
+  return checkerOpts;
+}
+
 std::unique_ptr<CheckerManager>
 ento::createCheckerManager(AnalyzerOptions &opts, const LangOptions &langOpts,
                            ArrayRef<std::string> plugins,
@@ -108,11 +118,7 @@
   std::unique_ptr<CheckerManager> checkerMgr(
       new CheckerManager(langOpts, &opts));
 
-  SmallVector<CheckerOptInfo, 8> checkerOpts;
-  for (unsigned i = 0, e = opts.CheckersControlList.size(); i != e; ++i) {
-    const std::pair<std::string, bool> &opt = opts.CheckersControlList[i];
-    checkerOpts.push_back(CheckerOptInfo(opt.first.c_str(), opt.second));
-  }
+  SmallVector<CheckerOptInfo, 8> checkerOpts = getCheckerOptList(opts);
 
   ClangCheckerRegistry allCheckers(plugins, &diags);
   allCheckers.initializeManager(*checkerMgr, checkerOpts);
@@ -137,3 +143,12 @@
 
   ClangCheckerRegistry(plugins).printHelp(out);
 }
+
+void ento::printEnabledCheckerList(raw_ostream &out,
+                                   ArrayRef<std::string> plugins,
+                                   const AnalyzerOptions &opts) {
+  out << "OVERVIEW: Clang Static Analyzer Enabled Checkers List\n\n";
+
+  SmallVector<CheckerOptInfo, 8> checkerOpts = getCheckerOptList(opts);
+  ClangCheckerRegistry(plugins).printList(out, checkerOpts);
+}
diff --git a/lib/StaticAnalyzer/Frontend/Makefile b/lib/StaticAnalyzer/Frontend/Makefile
deleted file mode 100644
index 3f15988..0000000
--- a/lib/StaticAnalyzer/Frontend/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-##===- clang/lib/StaticAnalyzer/Frontend/Makefile ----------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# Starting point into the static analyzer land for the driver.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../../..
-LIBRARYNAME := clangStaticAnalyzerFrontend
-
-CPP.Flags += -I${PROJ_OBJ_DIR}/../Checkers
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/StaticAnalyzer/Frontend/ModelInjector.cpp b/lib/StaticAnalyzer/Frontend/ModelInjector.cpp
index ee2c3f5..0a28485 100644
--- a/lib/StaticAnalyzer/Frontend/ModelInjector.cpp
+++ b/lib/StaticAnalyzer/Frontend/ModelInjector.cpp
@@ -19,7 +19,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/FileSystem.h"
-#include <string>
 #include <utility>
 
 using namespace clang;
diff --git a/lib/StaticAnalyzer/Frontend/ModelInjector.h b/lib/StaticAnalyzer/Frontend/ModelInjector.h
index e23bf8a..98a5f69 100644
--- a/lib/StaticAnalyzer/Frontend/ModelInjector.h
+++ b/lib/StaticAnalyzer/Frontend/ModelInjector.h
@@ -25,11 +25,7 @@
 #define LLVM_CLANG_SA_FRONTEND_MODELINJECTOR_H
 
 #include "clang/Analysis/CodeInjector.h"
-#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/StringMap.h"
-#include <map>
-#include <memory>
-#include <vector>
 
 namespace clang {
 
diff --git a/lib/StaticAnalyzer/Makefile b/lib/StaticAnalyzer/Makefile
deleted file mode 100644
index c166f06..0000000
--- a/lib/StaticAnalyzer/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- clang/lib/StaticAnalyzer/Makefile -------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-#
-# This implements analyses built on top of source-level CFGs. 
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-DIRS := Checkers Frontend
-PARALLEL_DIRS := Core
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Tooling/CMakeLists.txt b/lib/Tooling/CMakeLists.txt
index b5c3d54..56134c1 100644
--- a/lib/Tooling/CMakeLists.txt
+++ b/lib/Tooling/CMakeLists.txt
@@ -7,6 +7,7 @@
   CommonOptionsParser.cpp
   CompilationDatabase.cpp
   FileMatchTrie.cpp
+  FixIt.cpp
   JSONCompilationDatabase.cpp
   Refactoring.cpp
   RefactoringCallbacks.cpp
@@ -17,6 +18,7 @@
   clangASTMatchers
   clangBasic
   clangDriver
+  clangFormat
   clangFrontend
   clangLex
   clangRewrite
diff --git a/lib/Tooling/CommonOptionsParser.cpp b/lib/Tooling/CommonOptionsParser.cpp
index 82f5601..5a44061 100644
--- a/lib/Tooling/CommonOptionsParser.cpp
+++ b/lib/Tooling/CommonOptionsParser.cpp
@@ -62,7 +62,7 @@
       : Compilations(std::move(Compilations)) {}
 
   void appendArgumentsAdjuster(ArgumentsAdjuster Adjuster) {
-    Adjusters.push_back(Adjuster);
+    Adjusters.push_back(std::move(Adjuster));
   }
 
   std::vector<CompileCommand>
@@ -118,6 +118,8 @@
 
   Compilations.reset(FixedCompilationDatabase::loadFromCommandLine(argc, argv));
   cl::ParseCommandLineOptions(argc, argv, Overview);
+  cl::PrintOptionValues();
+
   SourcePathList = SourcePaths;
   if ((OccurrencesFlag == cl::ZeroOrMore || OccurrencesFlag == cl::Optional) &&
       SourcePathList.empty())
diff --git a/lib/Tooling/CompilationDatabase.cpp b/lib/Tooling/CompilationDatabase.cpp
index 957e401..6f95bf0 100644
--- a/lib/Tooling/CompilationDatabase.cpp
+++ b/lib/Tooling/CompilationDatabase.cpp
@@ -32,6 +32,8 @@
 using namespace clang;
 using namespace tooling;
 
+LLVM_INSTANTIATE_REGISTRY(CompilationDatabasePluginRegistry)
+
 CompilationDatabase::~CompilationDatabase() {}
 
 std::unique_ptr<CompilationDatabase>
@@ -139,9 +141,8 @@
       ;
     }
 
-    for (driver::ActionList::const_iterator I = A->begin(), E = A->end();
-         I != E; ++I)
-      runImpl(*I, CollectChildren);
+    for (const driver::Action *AI : A->inputs())
+      runImpl(AI, CollectChildren);
   }
 };
 
diff --git a/lib/Tooling/Core/CMakeLists.txt b/lib/Tooling/Core/CMakeLists.txt
index b88e1f8..f6348cb 100644
--- a/lib/Tooling/Core/CMakeLists.txt
+++ b/lib/Tooling/Core/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_clang_library(clangToolingCore
   Lookup.cpp
   Replacement.cpp
+  QualTypeNames.cpp
 
   LINK_LIBS
   clangAST
diff --git a/lib/Tooling/Core/Lookup.cpp b/lib/Tooling/Core/Lookup.cpp
index 697eeb4..84135f4 100644
--- a/lib/Tooling/Core/Lookup.cpp
+++ b/lib/Tooling/Core/Lookup.cpp
@@ -16,33 +16,46 @@
 using namespace clang;
 using namespace clang::tooling;
 
-static bool isInsideDifferentNamespaceWithSameName(const DeclContext *DeclA,
-                                                   const DeclContext *DeclB) {
+// Returns true if the context in which the type is used and the context in
+// which the type is declared are the same semantic namespace but different
+// lexical namespaces.
+static bool
+usingFromDifferentCanonicalNamespace(const DeclContext *FromContext,
+                                     const DeclContext *UseContext) {
   while (true) {
-    // Look past non-namespaces on DeclA.
-    while (DeclA && !isa<NamespaceDecl>(DeclA))
-      DeclA = DeclA->getParent();
+    // Look past non-namespaces and anonymous namespaces on FromContext.
+    // We can skip anonymous namespace because:
+    // 1. `FromContext` and `UseContext` must be in the same anonymous
+    // namespaces since referencing across anonymous namespaces is not possible.
+    // 2. If `FromContext` and `UseContext` are in the same anonymous namespace,
+    // the function will still return `false` as expected.
+    while (FromContext &&
+           (!isa<NamespaceDecl>(FromContext) ||
+            cast<NamespaceDecl>(FromContext)->isAnonymousNamespace()))
+      FromContext = FromContext->getParent();
 
-    // Look past non-namespaces on DeclB.
-    while (DeclB && !isa<NamespaceDecl>(DeclB))
-      DeclB = DeclB->getParent();
+    // Look past non-namespaces and anonymous namespaces on UseContext.
+    while (UseContext &&
+           (!isa<NamespaceDecl>(UseContext) ||
+            cast<NamespaceDecl>(UseContext)->isAnonymousNamespace()))
+      UseContext = UseContext->getParent();
 
     // We hit the root, no namespace collision.
-    if (!DeclA || !DeclB)
+    if (!FromContext || !UseContext)
       return false;
 
     // Literally the same namespace, not a collision.
-    if (DeclA == DeclB)
+    if (FromContext == UseContext)
       return false;
 
     // Now check the names. If they match we have a different namespace with the
     // same name.
-    if (cast<NamespaceDecl>(DeclA)->getDeclName() ==
-        cast<NamespaceDecl>(DeclB)->getDeclName())
+    if (cast<NamespaceDecl>(FromContext)->getDeclName() ==
+        cast<NamespaceDecl>(UseContext)->getDeclName())
       return true;
 
-    DeclA = DeclA->getParent();
-    DeclB = DeclB->getParent();
+    FromContext = FromContext->getParent();
+    UseContext = UseContext->getParent();
   }
 }
 
@@ -98,8 +111,8 @@
   const bool in_global_namespace =
       isa<TranslationUnitDecl>(FromDecl->getDeclContext());
   if (class_name_only && !in_global_namespace &&
-      !isInsideDifferentNamespaceWithSameName(FromDecl->getDeclContext(),
-                                              UseContext)) {
+      !usingFromDifferentCanonicalNamespace(FromDecl->getDeclContext(),
+                                            UseContext)) {
     auto Pos = ReplacementString.rfind("::");
     return Pos != StringRef::npos ? ReplacementString.substr(Pos + 2)
                                   : ReplacementString;
diff --git a/lib/Tooling/Core/Makefile b/lib/Tooling/Core/Makefile
deleted file mode 100644
index 366466c..0000000
--- a/lib/Tooling/Core/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- clang/lib/Tooling/Core/Makefile ---------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../../..
-LIBRARYNAME := clangToolingCore
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Tooling/Core/QualTypeNames.cpp b/lib/Tooling/Core/QualTypeNames.cpp
new file mode 100644
index 0000000..721c2c9
--- /dev/null
+++ b/lib/Tooling/Core/QualTypeNames.cpp
@@ -0,0 +1,477 @@
+//===------- QualTypeNames.cpp - Generate Complete QualType Names ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Core/QualTypeNames.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/DeclarationName.h"
+#include "clang/AST/GlobalDecl.h"
+#include "clang/AST/Mangle.h"
+
+#include <stdio.h>
+#include <memory>
+
+namespace clang {
+
+namespace TypeName {
+/// \brief Generates a QualType that can be used to name the same type
+/// if used at the end of the current translation unit. This ignores
+/// issues such as type shadowing.
+///
+/// \param[in] QT - the type for which the fully qualified type will be
+/// returned.
+/// \param[in] Ctx - the ASTContext to be used.
+/// \param[in] WithGlobalNsPrefix - Indicate whether the global namespace
+/// specifier "::" should be prepended or not.
+static QualType getFullyQualifiedType(QualType QT, const ASTContext &Ctx,
+                                      bool WithGlobalNsPrefix);
+
+/// \brief Create a NestedNameSpecifier for Namesp and its enclosing
+/// scopes.
+///
+/// \param[in] Ctx - the AST Context to be used.
+/// \param[in] Namesp - the NamespaceDecl for which a NestedNameSpecifier
+/// is requested.
+/// \param[in] WithGlobalNsPrefix - Indicate whether the global namespace
+/// specifier "::" should be prepended or not.
+static NestedNameSpecifier *createNestedNameSpecifier(
+    const ASTContext &Ctx,
+    const NamespaceDecl *Namesp,
+    bool WithGlobalNsPrefix);
+
+/// \brief Create a NestedNameSpecifier for a TypeDecl and its enclosing
+/// scopes.
+///
+/// \param[in] Ctx - the AST Context to be used.
+/// \param[in] TD - the TypeDecl for which a NestedNameSpecifier is
+/// requested.
+/// \param[in] FullyQualify - Convert all template arguments into fully
+/// qualified names.
+/// \param[in] WithGlobalNsPrefix - Indicate whether the global namespace
+/// specifier "::" should be prepended or not.
+static NestedNameSpecifier *createNestedNameSpecifier(
+    const ASTContext &Ctx, const TypeDecl *TD,
+    bool FullyQualify, bool WithGlobalNsPrefix);
+
+static NestedNameSpecifier *createNestedNameSpecifierForScopeOf(
+    const ASTContext &Ctx, const Decl *decl,
+    bool FullyQualified, bool WithGlobalNsPrefix);
+
+static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier(
+    const ASTContext &Ctx, NestedNameSpecifier *scope, bool WithGlobalNsPrefix);
+
+static bool getFullyQualifiedTemplateName(const ASTContext &Ctx,
+                                          TemplateName &TName,
+                                          bool WithGlobalNsPrefix) {
+  bool Changed = false;
+  NestedNameSpecifier *NNS = nullptr;
+
+  TemplateDecl *ArgTDecl = TName.getAsTemplateDecl();
+  // ArgTDecl won't be NULL because we asserted that this isn't a
+  // dependent context very early in the call chain.
+  assert(ArgTDecl != nullptr);
+  QualifiedTemplateName *QTName = TName.getAsQualifiedTemplateName();
+
+  if (QTName && !QTName->hasTemplateKeyword()) {
+    NNS = QTName->getQualifier();
+    NestedNameSpecifier *QNNS = getFullyQualifiedNestedNameSpecifier(
+        Ctx, NNS, WithGlobalNsPrefix);
+    if (QNNS != NNS) {
+      Changed = true;
+      NNS = QNNS;
+    } else {
+      NNS = nullptr;
+    }
+  } else {
+    NNS = createNestedNameSpecifierForScopeOf(
+        Ctx, ArgTDecl, true, WithGlobalNsPrefix);
+  }
+  if (NNS) {
+    TName = Ctx.getQualifiedTemplateName(NNS,
+                                         /*TemplateKeyword=*/false, ArgTDecl);
+    Changed = true;
+  }
+  return Changed;
+}
+
+static bool getFullyQualifiedTemplateArgument(const ASTContext &Ctx,
+                                              TemplateArgument &Arg,
+                                              bool WithGlobalNsPrefix) {
+  bool Changed = false;
+
+  // Note: we do not handle TemplateArgument::Expression, to replace it
+  // we need the information for the template instance decl.
+
+  if (Arg.getKind() == TemplateArgument::Template) {
+    TemplateName TName = Arg.getAsTemplate();
+    Changed = getFullyQualifiedTemplateName(Ctx, TName, WithGlobalNsPrefix);
+    if (Changed) {
+      Arg = TemplateArgument(TName);
+    }
+  } else if (Arg.getKind() == TemplateArgument::Type) {
+    QualType SubTy = Arg.getAsType();
+    // Check if the type needs more desugaring and recurse.
+    QualType QTFQ = getFullyQualifiedType(SubTy, Ctx, WithGlobalNsPrefix);
+    if (QTFQ != SubTy) {
+      Arg = TemplateArgument(QTFQ);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+static const Type *getFullyQualifiedTemplateType(const ASTContext &Ctx,
+                                                 const Type *TypePtr,
+                                                 bool WithGlobalNsPrefix) {
+  // DependentTemplateTypes exist within template declarations and
+  // definitions. Therefore we shouldn't encounter them at the end of
+  // a translation unit. If we do, the caller has made an error.
+  assert(!isa<DependentTemplateSpecializationType>(TypePtr));
+  // In case of template specializations, iterate over the arguments
+  // and fully qualify them as well.
+  if (const auto *TST = dyn_cast<const TemplateSpecializationType>(TypePtr)) {
+    bool MightHaveChanged = false;
+    SmallVector<TemplateArgument, 4> FQArgs;
+    for (TemplateSpecializationType::iterator I = TST->begin(), E = TST->end();
+         I != E; ++I) {
+      // Cheap to copy and potentially modified by
+      // getFullyQualifiedTemplateArgument.
+      TemplateArgument Arg(*I);
+      MightHaveChanged |= getFullyQualifiedTemplateArgument(
+          Ctx, Arg, WithGlobalNsPrefix);
+      FQArgs.push_back(Arg);
+    }
+
+    // If a fully qualified arg is different from the unqualified arg,
+    // allocate new type in the AST.
+    if (MightHaveChanged) {
+      QualType QT = Ctx.getTemplateSpecializationType(
+          TST->getTemplateName(), FQArgs,
+          TST->getCanonicalTypeInternal());
+      // getTemplateSpecializationType returns a fully qualified
+      // version of the specialization itself, so no need to qualify
+      // it.
+      return QT.getTypePtr();
+    }
+  } else if (const auto *TSTRecord = dyn_cast<const RecordType>(TypePtr)) {
+    // We are asked to fully qualify and we have a Record Type,
+    // which can point to a template instantiation with no sugar in any of
+    // its template argument, however we still need to fully qualify them.
+
+    if (const auto *TSTDecl =
+        dyn_cast<ClassTemplateSpecializationDecl>(TSTRecord->getDecl())) {
+      const TemplateArgumentList &TemplateArgs = TSTDecl->getTemplateArgs();
+
+      bool MightHaveChanged = false;
+      SmallVector<TemplateArgument, 4> FQArgs;
+      for (unsigned int I = 0, E = TemplateArgs.size(); I != E; ++I) {
+        // Cheap to copy and potentially modified by
+        // getFullyQualifiedTemplateArgument.
+        TemplateArgument Arg(TemplateArgs[I]);
+        MightHaveChanged |= getFullyQualifiedTemplateArgument(
+            Ctx, Arg, WithGlobalNsPrefix);
+        FQArgs.push_back(Arg);
+      }
+
+      // If a fully qualified arg is different from the unqualified arg,
+      // allocate new type in the AST.
+      if (MightHaveChanged) {
+        TemplateName TN(TSTDecl->getSpecializedTemplate());
+        QualType QT = Ctx.getTemplateSpecializationType(
+            TN, FQArgs,
+            TSTRecord->getCanonicalTypeInternal());
+        // getTemplateSpecializationType returns a fully qualified
+        // version of the specialization itself, so no need to qualify
+        // it.
+        return QT.getTypePtr();
+      }
+    }
+  }
+  return TypePtr;
+}
+
+static NestedNameSpecifier *createOuterNNS(const ASTContext &Ctx, const Decl *D,
+                                           bool FullyQualify,
+                                           bool WithGlobalNsPrefix) {
+  const DeclContext *DC = D->getDeclContext();
+  if (const auto *NS = dyn_cast<NamespaceDecl>(DC)) {
+    while (NS && NS->isInline()) {
+      // Skip inline namespaces; NS becomes null if the parent is no namespace.
+      NS = dyn_cast<NamespaceDecl>(NS->getDeclContext());
+    }
+    if (NS && NS->getDeclName()) {
+      return createNestedNameSpecifier(Ctx, NS, WithGlobalNsPrefix);
+    }
+    return nullptr;  // no starting '::', no anonymous
+  } else if (const auto *TD = dyn_cast<TagDecl>(DC)) {
+    return createNestedNameSpecifier(Ctx, TD, FullyQualify, WithGlobalNsPrefix);
+  } else if (const auto *TDD = dyn_cast<TypedefNameDecl>(DC)) {
+    return createNestedNameSpecifier(
+        Ctx, TDD, FullyQualify, WithGlobalNsPrefix);
+  } else if (WithGlobalNsPrefix && DC->isTranslationUnit()) {
+    return NestedNameSpecifier::GlobalSpecifier(Ctx);
+  }
+  return nullptr;  // no starting '::' if |WithGlobalNsPrefix| is false
+}
+
+/// \brief Return a fully qualified version of this name specifier.
+static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier(
+    const ASTContext &Ctx, NestedNameSpecifier *Scope,
+    bool WithGlobalNsPrefix) {
+  switch (Scope->getKind()) {
+    case NestedNameSpecifier::Global:
+      // Already fully qualified
+      return Scope;
+    case NestedNameSpecifier::Namespace:
+      return TypeName::createNestedNameSpecifier(
+          Ctx, Scope->getAsNamespace(), WithGlobalNsPrefix);
+    case NestedNameSpecifier::NamespaceAlias:
+      // Namespace aliases are only valid for the duration of the
+      // scope where they were introduced, and therefore are often
+      // invalid at the end of the TU.  So use the namespace name more
+      // likely to be valid at the end of the TU.
+      return TypeName::createNestedNameSpecifier(
+          Ctx,
+          Scope->getAsNamespaceAlias()->getNamespace()->getCanonicalDecl(),
+          WithGlobalNsPrefix);
+    case NestedNameSpecifier::Identifier:
+      // A function or some other construct that makes it un-namable
+      // at the end of the TU. Skip the current component of the name,
+      // but use the name of its prefix.
+      return getFullyQualifiedNestedNameSpecifier(
+          Ctx, Scope->getPrefix(), WithGlobalNsPrefix);
+    case NestedNameSpecifier::Super:
+    case NestedNameSpecifier::TypeSpec:
+    case NestedNameSpecifier::TypeSpecWithTemplate: {
+      const Type *Type = Scope->getAsType();
+      // Find decl context.
+      const TagDecl *TD = nullptr;
+      if (const TagType *TagDeclType = Type->getAs<TagType>()) {
+        TD = TagDeclType->getDecl();
+      } else {
+        TD = Type->getAsCXXRecordDecl();
+      }
+      if (TD) {
+        return TypeName::createNestedNameSpecifier(Ctx, TD,
+                                                   true /*FullyQualified*/,
+                                                   WithGlobalNsPrefix);
+      } else if (const auto *TDD = dyn_cast<TypedefType>(Type)) {
+        return TypeName::createNestedNameSpecifier(Ctx, TDD->getDecl(),
+                                                   true /*FullyQualified*/,
+                                                   WithGlobalNsPrefix);
+      }
+      return Scope;
+    }
+  }
+  llvm_unreachable("bad NNS kind");
+}
+
+/// \brief Create a nested name specifier for the declaring context of
+/// the type.
+static NestedNameSpecifier *createNestedNameSpecifierForScopeOf(
+    const ASTContext &Ctx, const Decl *Decl,
+    bool FullyQualified, bool WithGlobalNsPrefix) {
+  assert(Decl);
+
+  const DeclContext *DC = Decl->getDeclContext()->getRedeclContext();
+  const auto *Outer = dyn_cast_or_null<NamedDecl>(DC);
+  const auto *OuterNS = dyn_cast_or_null<NamespaceDecl>(DC);
+  if (Outer && !(OuterNS && OuterNS->isAnonymousNamespace())) {
+    if (const auto *CxxDecl = dyn_cast<CXXRecordDecl>(DC)) {
+      if (ClassTemplateDecl *ClassTempl =
+              CxxDecl->getDescribedClassTemplate()) {
+        // We are in the case of a type(def) that was declared in a
+        // class template but is *not* type dependent.  In clang, it
+        // gets attached to the class template declaration rather than
+        // any specific class template instantiation.  This results in
+        // an 'odd' fully qualified typename:
+        //
+        //    vector<_Tp,_Alloc>::size_type
+        //
+        // Make the result usable, if still a bit odd-looking, by using
+        // the first known specialization as the declaring context.
+        if (ClassTempl->spec_begin() != ClassTempl->spec_end()) {
+          Decl = *(ClassTempl->spec_begin());
+          Outer = dyn_cast<NamedDecl>(Decl);
+          OuterNS = dyn_cast<NamespaceDecl>(Decl);
+        }
+      }
+    }
+
+    if (OuterNS) {
+      return createNestedNameSpecifier(Ctx, OuterNS, WithGlobalNsPrefix);
+    } else if (const auto *TD = dyn_cast<TagDecl>(Outer)) {
+      return createNestedNameSpecifier(
+          Ctx, TD, FullyQualified, WithGlobalNsPrefix);
+    } else if (dyn_cast<TranslationUnitDecl>(Outer)) {
+      // Context is the TU. Nothing needs to be done.
+      return nullptr;
+    } else {
+      // Decl's context was neither the TU, a namespace, nor a
+      // TagDecl, which means it is a type local to a scope, and not
+      // accessible at the end of the TU.
+      return nullptr;
+    }
+  } else if (WithGlobalNsPrefix && DC->isTranslationUnit()) {
+    return NestedNameSpecifier::GlobalSpecifier(Ctx);
+  }
+  return nullptr;
+}
+
+/// \brief Create a nested name specifier for the declaring context of
+/// the type.
+static NestedNameSpecifier *createNestedNameSpecifierForScopeOf(
+    const ASTContext &Ctx, const Type *TypePtr,
+    bool FullyQualified, bool WithGlobalNsPrefix) {
+  if (!TypePtr) return nullptr;
+
+  Decl *Decl = nullptr;
+  // There are probably other cases ...
+  if (const auto *TDT = dyn_cast<TypedefType>(TypePtr)) {
+    Decl = TDT->getDecl();
+  } else if (const auto *TagDeclType = dyn_cast<TagType>(TypePtr)) {
+    Decl = TagDeclType->getDecl();
+  } else if (const auto *TST = dyn_cast<TemplateSpecializationType>(TypePtr)) {
+    Decl = TST->getTemplateName().getAsTemplateDecl();
+  } else {
+    Decl = TypePtr->getAsCXXRecordDecl();
+  }
+
+  if (!Decl) return nullptr;
+
+  return createNestedNameSpecifierForScopeOf(
+      Ctx, Decl, FullyQualified, WithGlobalNsPrefix);
+}
+
+NestedNameSpecifier *createNestedNameSpecifier(const ASTContext &Ctx,
+                                               const NamespaceDecl *Namespace,
+                                               bool WithGlobalNsPrefix) {
+  while (Namespace && Namespace->isInline()) {
+    // Ignore inline namespace;
+    Namespace = dyn_cast<NamespaceDecl>(Namespace->getDeclContext());
+  }
+  if (!Namespace) return nullptr;
+
+  bool FullyQualified = true;  // doesn't matter, DeclContexts are namespaces
+  return NestedNameSpecifier::Create(
+      Ctx,
+      createOuterNNS(Ctx, Namespace, FullyQualified, WithGlobalNsPrefix),
+      Namespace);
+}
+
+NestedNameSpecifier *createNestedNameSpecifier(const ASTContext &Ctx,
+                                               const TypeDecl *TD,
+                                               bool FullyQualify,
+                                               bool WithGlobalNsPrefix) {
+  return NestedNameSpecifier::Create(
+      Ctx,
+      createOuterNNS(Ctx, TD, FullyQualify, WithGlobalNsPrefix),
+      false /*No TemplateKeyword*/,
+      TD->getTypeForDecl());
+}
+
+/// \brief Return the fully qualified type, including fully-qualified
+/// versions of any template parameters.
+QualType getFullyQualifiedType(QualType QT, const ASTContext &Ctx,
+                               bool WithGlobalNsPrefix) {
+  // In case of myType* we need to strip the pointer first, fully
+  // qualify and attach the pointer once again.
+  if (isa<PointerType>(QT.getTypePtr())) {
+    // Get the qualifiers.
+    Qualifiers Quals = QT.getQualifiers();
+    QT = getFullyQualifiedType(QT->getPointeeType(), Ctx, WithGlobalNsPrefix);
+    QT = Ctx.getPointerType(QT);
+    // Add back the qualifiers.
+    QT = Ctx.getQualifiedType(QT, Quals);
+    return QT;
+  }
+
+  // In case of myType& we need to strip the reference first, fully
+  // qualify and attach the reference once again.
+  if (isa<ReferenceType>(QT.getTypePtr())) {
+    // Get the qualifiers.
+    bool IsLValueRefTy = isa<LValueReferenceType>(QT.getTypePtr());
+    Qualifiers Quals = QT.getQualifiers();
+    QT = getFullyQualifiedType(QT->getPointeeType(), Ctx, WithGlobalNsPrefix);
+    // Add the r- or l-value reference type back to the fully
+    // qualified one.
+    if (IsLValueRefTy)
+      QT = Ctx.getLValueReferenceType(QT);
+    else
+      QT = Ctx.getRValueReferenceType(QT);
+    // Add back the qualifiers.
+    QT = Ctx.getQualifiedType(QT, Quals);
+    return QT;
+  }
+
+  // Remove the part of the type related to the type being a template
+  // parameter (we won't report it as part of the 'type name', and
+  // keeping it would only make the code below more complex, since it
+  // would have to handle those nodes as well).
+  while (isa<SubstTemplateTypeParmType>(QT.getTypePtr())) {
+    // Get the qualifiers.
+    Qualifiers Quals = QT.getQualifiers();
+
+    QT = cast<SubstTemplateTypeParmType>(QT.getTypePtr())->desugar();
+
+    // Add back the qualifiers.
+    QT = Ctx.getQualifiedType(QT, Quals);
+  }
+
+  NestedNameSpecifier *Prefix = nullptr;
+  // Local qualifiers are attached to the QualType outside of the
+  // elaborated type.  Retrieve them before descending into the
+  // elaborated type.
+  Qualifiers PrefixQualifiers = QT.getLocalQualifiers();
+  QT = QualType(QT.getTypePtr(), 0);
+  ElaboratedTypeKeyword Keyword = ETK_None;
+  if (const auto *ETypeInput = dyn_cast<ElaboratedType>(QT.getTypePtr())) {
+    QT = ETypeInput->getNamedType();
+    assert(!QT.hasLocalQualifiers());
+    Keyword = ETypeInput->getKeyword();
+  }
+  // Create a nested name specifier if needed.
+  Prefix = createNestedNameSpecifierForScopeOf(Ctx, QT.getTypePtr(),
+                                               true /*FullyQualified*/,
+                                               WithGlobalNsPrefix);
+
+  // In case of template specializations iterate over the arguments and
+  // fully qualify them as well.
+  if (isa<const TemplateSpecializationType>(QT.getTypePtr()) ||
+      isa<const RecordType>(QT.getTypePtr())) {
+    // We are asked to fully qualify and we have a Record Type (which
+    // may point to a template specialization) or Template
+    // Specialization Type. We need to fully qualify their arguments.
+
+    const Type *TypePtr = getFullyQualifiedTemplateType(
+        Ctx, QT.getTypePtr(), WithGlobalNsPrefix);
+    QT = QualType(TypePtr, 0);
+  }
+  if (Prefix || Keyword != ETK_None) {
+    QT = Ctx.getElaboratedType(Keyword, Prefix, QT);
+  }
+  QT = Ctx.getQualifiedType(QT, PrefixQualifiers);
+  return QT;
+}
+
+std::string getFullyQualifiedName(QualType QT,
+                                  const ASTContext &Ctx,
+                                  bool WithGlobalNsPrefix) {
+  PrintingPolicy Policy(Ctx.getPrintingPolicy());
+  Policy.SuppressScope = false;
+  Policy.AnonymousTagLocations = false;
+  Policy.PolishForDeclaration = true;
+  Policy.SuppressUnwrittenScope = true;
+  QualType FQQT = getFullyQualifiedType(QT, Ctx, WithGlobalNsPrefix);
+  return FQQT.getAsString(Policy);
+}
+
+}  // end namespace TypeName
+}  // end namespace clang
diff --git a/lib/Tooling/Core/Replacement.cpp b/lib/Tooling/Core/Replacement.cpp
index 47bbdeb..b257f0f 100644
--- a/lib/Tooling/Core/Replacement.cpp
+++ b/lib/Tooling/Core/Replacement.cpp
@@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Tooling/Core/Replacement.h"
+
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticIDs.h"
 #include "clang/Basic/DiagnosticOptions.h"
@@ -18,9 +20,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Rewrite/Core/Rewriter.h"
-#include "clang/Tooling/Core/Replacement.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/raw_os_ostream.h"
 
 namespace clang {
@@ -57,14 +57,8 @@
   const FileEntry *Entry = SM.getFileManager().getFile(FilePath);
   if (!Entry)
     return false;
-  FileID ID;
-  // FIXME: Use SM.translateFile directly.
-  SourceLocation Location = SM.translateFileLineCol(Entry, 1, 1);
-  ID = Location.isValid() ?
-    SM.getFileID(Location) :
-    SM.createFileID(Entry, SourceLocation(), SrcMgr::C_User);
-  // FIXME: We cannot check whether Offset + Length is in the file, as
-  // the remapping API is not public in the RewriteBuffer.
+
+  FileID ID = SM.getOrCreateFileID(Entry, SrcMgr::C_User);
   const SourceLocation Start =
     SM.getLocForStartOfFile(ID).
     getLocWithOffset(ReplacementRange.getOffset());
@@ -143,145 +137,61 @@
                         ReplacementText);
 }
 
-template <typename T>
-unsigned shiftedCodePositionInternal(const T &Replaces, unsigned Position) {
-  unsigned Offset = 0;
-  for (const auto& R : Replaces) {
-    if (R.getOffset() + R.getLength() <= Position) {
-      Offset += R.getReplacementText().size() - R.getLength();
-      continue;
-    }
-    if (R.getOffset() < Position &&
-        R.getOffset() + R.getReplacementText().size() <= Position) {
-      Position = R.getOffset() + R.getReplacementText().size() - 1;
-    }
-    break;
-  }
-  return Position + Offset;
-}
+llvm::Error Replacements::add(const Replacement &R) {
+  // Check the file path.
+  if (!Replaces.empty() && R.getFilePath() != Replaces.begin()->getFilePath())
+    return llvm::make_error<llvm::StringError>(
+        "All replacements must have the same file path. New replacement: " +
+            R.getFilePath() + ", existing replacements: " +
+            Replaces.begin()->getFilePath() + "\n",
+        llvm::inconvertibleErrorCode());
 
-unsigned shiftedCodePosition(const Replacements &Replaces, unsigned Position) {
-  return shiftedCodePositionInternal(Replaces, Position);
-}
-
-// FIXME: Remove this function when Replacements is implemented as std::vector
-// instead of std::set.
-unsigned shiftedCodePosition(const std::vector<Replacement> &Replaces,
-                             unsigned Position) {
-  return shiftedCodePositionInternal(Replaces, Position);
-}
-
-void deduplicate(std::vector<Replacement> &Replaces,
-                 std::vector<Range> &Conflicts) {
-  if (Replaces.empty())
-    return;
-
-  auto LessNoPath = [](const Replacement &LHS, const Replacement &RHS) {
-    if (LHS.getOffset() != RHS.getOffset())
-      return LHS.getOffset() < RHS.getOffset();
-    if (LHS.getLength() != RHS.getLength())
-      return LHS.getLength() < RHS.getLength();
-    return LHS.getReplacementText() < RHS.getReplacementText();
-  };
-
-  auto EqualNoPath = [](const Replacement &LHS, const Replacement &RHS) {
-    return LHS.getOffset() == RHS.getOffset() &&
-           LHS.getLength() == RHS.getLength() &&
-           LHS.getReplacementText() == RHS.getReplacementText();
-  };
-
-  // Deduplicate. We don't want to deduplicate based on the path as we assume
-  // that all replacements refer to the same file (or are symlinks).
-  std::sort(Replaces.begin(), Replaces.end(), LessNoPath);
-  Replaces.erase(std::unique(Replaces.begin(), Replaces.end(), EqualNoPath),
-                 Replaces.end());
-
-  // Detect conflicts
-  Range ConflictRange(Replaces.front().getOffset(),
-                      Replaces.front().getLength());
-  unsigned ConflictStart = 0;
-  unsigned ConflictLength = 1;
-  for (unsigned i = 1; i < Replaces.size(); ++i) {
-    Range Current(Replaces[i].getOffset(), Replaces[i].getLength());
-    if (ConflictRange.overlapsWith(Current)) {
-      // Extend conflicted range
-      ConflictRange = Range(ConflictRange.getOffset(),
-                            std::max(ConflictRange.getLength(),
-                                     Current.getOffset() + Current.getLength() -
-                                         ConflictRange.getOffset()));
-      ++ConflictLength;
-    } else {
-      if (ConflictLength > 1)
-        Conflicts.push_back(Range(ConflictStart, ConflictLength));
-      ConflictRange = Current;
-      ConflictStart = i;
-      ConflictLength = 1;
-    }
+  // Special-case header insertions.
+  if (R.getOffset() == UINT_MAX) {
+    Replaces.insert(R);
+    return llvm::Error::success();
   }
 
-  if (ConflictLength > 1)
-    Conflicts.push_back(Range(ConflictStart, ConflictLength));
-}
+  // This replacement cannot conflict with replacements that end before
+  // this replacement starts or start after this replacement ends.
+  // We also know that there currently are no overlapping replacements.
+  // Thus, we know that all replacements that start after the end of the current
+  // replacement cannot overlap.
+  Replacement AtEnd(R.getFilePath(), R.getOffset() + R.getLength(), 0, "");
 
-bool applyAllReplacements(const Replacements &Replaces, Rewriter &Rewrite) {
-  bool Result = true;
-  for (Replacements::const_iterator I = Replaces.begin(),
-                                    E = Replaces.end();
-       I != E; ++I) {
-    if (I->isApplicable()) {
-      Result = I->apply(Rewrite) && Result;
-    } else {
-      Result = false;
-    }
-  }
-  return Result;
-}
+  // Find the first entry that starts after or at the end of R. Note that
+  // entries that start at the end can still be conflicting if R is an
+  // insertion.
+  auto I = Replaces.lower_bound(AtEnd);
+  // If it starts at the same offset as R (can only happen if R is an
+  // insertion), we have a conflict.  In that case, increase I to fall through
+  // to the conflict check.
+  if (I != Replaces.end() && R.getOffset() == I->getOffset())
+    ++I;
 
-// FIXME: Remove this function when Replacements is implemented as std::vector
-// instead of std::set.
-bool applyAllReplacements(const std::vector<Replacement> &Replaces,
-                          Rewriter &Rewrite) {
-  bool Result = true;
-  for (std::vector<Replacement>::const_iterator I = Replaces.begin(),
-                                                E = Replaces.end();
-       I != E; ++I) {
-    if (I->isApplicable()) {
-      Result = I->apply(Rewrite) && Result;
-    } else {
-      Result = false;
-    }
+  // I is the smallest iterator whose entry cannot overlap.
+  // If that is begin(), there are no overlaps.
+  if (I == Replaces.begin()) {
+    Replaces.insert(R);
+    return llvm::Error::success();
   }
-  return Result;
-}
-
-std::string applyAllReplacements(StringRef Code, const Replacements &Replaces) {
-  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
-      new vfs::InMemoryFileSystem);
-  FileManager Files(FileSystemOptions(), InMemoryFileSystem);
-  DiagnosticsEngine Diagnostics(
-      IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
-      new DiagnosticOptions);
-  SourceManager SourceMgr(Diagnostics, Files);
-  Rewriter Rewrite(SourceMgr, LangOptions());
-  InMemoryFileSystem->addFile(
-      "<stdin>", 0, llvm::MemoryBuffer::getMemBuffer(Code, "<stdin>"));
-  FileID ID = SourceMgr.createFileID(Files.getFile("<stdin>"), SourceLocation(),
-                                     clang::SrcMgr::C_User);
-  for (Replacements::const_iterator I = Replaces.begin(), E = Replaces.end();
-       I != E; ++I) {
-    Replacement Replace("<stdin>", I->getOffset(), I->getLength(),
-                        I->getReplacementText());
-    if (!Replace.apply(Rewrite))
-      return "";
+  --I;
+  // If the previous entry does not overlap, we know that entries before it
+  // can also not overlap.
+  if (R.getOffset() != I->getOffset() &&
+      !Range(R.getOffset(), R.getLength())
+           .overlapsWith(Range(I->getOffset(), I->getLength()))) {
+    Replaces.insert(R);
+    return llvm::Error::success();
   }
-  std::string Result;
-  llvm::raw_string_ostream OS(Result);
-  Rewrite.getEditBuffer(ID).write(OS);
-  OS.flush();
-  return Result;
+  return llvm::make_error<llvm::StringError>(
+      "New replacement:\n" + R.toString() +
+          "\nconflicts with existing replacement:\n" + I->toString(),
+      llvm::inconvertibleErrorCode());
 }
 
 namespace {
+
 // Represents a merged replacement, i.e. a replacement consisting of multiple
 // overlapping replacements from 'First' and 'Second' in mergeReplacements.
 //
@@ -314,7 +224,7 @@
 
   // Merges the next element 'R' into this merged element. As we always merge
   // from 'First' into 'Second' or vice versa, the MergedReplacement knows what
-  // set the next element is coming from. 
+  // set the next element is coming from.
   void merge(const Replacement &R) {
     if (MergeSecond) {
       unsigned REnd = R.getOffset() + Delta + R.getLength();
@@ -375,17 +285,19 @@
   unsigned Length;
   std::string Text;
 };
+
 } // namespace
 
-Replacements mergeReplacements(const Replacements &First,
-                               const Replacements &Second) {
-  if (First.empty() || Second.empty())
-    return First.empty() ? Second : First;
+Replacements Replacements::merge(const Replacements &ReplacesToMerge) const {
+  if (empty() || ReplacesToMerge.empty())
+    return empty() ? ReplacesToMerge : *this;
 
+  auto &First = Replaces;
+  auto &Second = ReplacesToMerge.Replaces;
   // Delta is the amount of characters that replacements from 'Second' need to
   // be shifted so that their offsets refer to the original text.
   int Delta = 0;
-  Replacements Result;
+  ReplacementsImpl Result;
 
   // Iterate over both sets and always add the next element (smallest total
   // Offset) from either 'First' or 'Second'. Merge that element with
@@ -411,9 +323,143 @@
     Delta -= Merged.deltaFirst();
     Result.insert(Merged.asReplacement());
   }
+  return Replacements(Result.begin(), Result.end());
+}
+
+// Combines overlapping ranges in \p Ranges and sorts the combined ranges.
+// Returns a set of non-overlapping and sorted ranges that is equivalent to
+// \p Ranges.
+static std::vector<Range> combineAndSortRanges(std::vector<Range> Ranges) {
+  std::sort(Ranges.begin(), Ranges.end(),
+            [](const Range &LHS, const Range &RHS) {
+              if (LHS.getOffset() != RHS.getOffset())
+                return LHS.getOffset() < RHS.getOffset();
+              return LHS.getLength() < RHS.getLength();
+            });
+  std::vector<Range> Result;
+  for (const auto &R : Ranges) {
+    if (Result.empty() ||
+        Result.back().getOffset() + Result.back().getLength() < R.getOffset()) {
+      Result.push_back(R);
+    } else {
+      unsigned NewEnd =
+          std::max(Result.back().getOffset() + Result.back().getLength(),
+                   R.getOffset() + R.getLength());
+      Result[Result.size() - 1] =
+          Range(Result.back().getOffset(), NewEnd - Result.back().getOffset());
+    }
+  }
   return Result;
 }
 
+std::vector<Range>
+calculateRangesAfterReplacements(const Replacements &Replaces,
+                                 const std::vector<Range> &Ranges) {
+  // To calculate the new ranges:
+  //   - Turn \p Ranges into Replacements whose text is "length" spaces.
+  //   - Merge with \p Replaces.
+  //   - The new ranges are the affected ranges of the merged replacements.
+  auto MergedRanges = combineAndSortRanges(Ranges);
+  if (Replaces.empty())
+    return MergedRanges;
+  tooling::Replacements FakeReplaces;
+  for (const auto &R : MergedRanges) {
+    auto Err = FakeReplaces.add(Replacement(Replaces.begin()->getFilePath(),
+                                            R.getOffset(), R.getLength(),
+                                            std::string(R.getLength(), ' ')));
+    assert(!Err &&
+           "Replacements must not conflict since ranges have been merged.");
+    // Explicitly discard the Error so it is also handled when asserts are off.
+    llvm::consumeError(std::move(Err));
+  }
+  return FakeReplaces.merge(Replaces).getAffectedRanges();
+}
+
+std::vector<Range> Replacements::getAffectedRanges() const {
+  std::vector<Range> ChangedRanges;
+  int Shift = 0;
+  for (const Replacement &R : Replaces) {
+    unsigned Offset = R.getOffset() + Shift;
+    unsigned Length = R.getReplacementText().size();
+    Shift += Length - R.getLength();
+    ChangedRanges.push_back(Range(Offset, Length));
+  }
+  return combineAndSortRanges(ChangedRanges);
+}
+
+unsigned Replacements::getShiftedCodePosition(unsigned Position) const {
+  unsigned Offset = 0;
+  for (const auto& R : Replaces) {
+    if (R.getOffset() + R.getLength() <= Position) {
+      Offset += R.getReplacementText().size() - R.getLength();
+      continue;
+    }
+    if (R.getOffset() < Position &&
+        R.getOffset() + R.getReplacementText().size() <= Position) {
+      Position = R.getOffset() + R.getReplacementText().size();
+      if (R.getReplacementText().size() > 0)
+        Position--;
+    }
+    break;
+  }
+  return Position + Offset;
+}
+
+bool applyAllReplacements(const Replacements &Replaces, Rewriter &Rewrite) {
+  bool Result = true;
+  for (Replacements::const_iterator I = Replaces.begin(),
+                                    E = Replaces.end();
+       I != E; ++I) {
+    if (I->isApplicable()) {
+      Result = I->apply(Rewrite) && Result;
+    } else {
+      Result = false;
+    }
+  }
+  return Result;
+}
+
+llvm::Expected<std::string> applyAllReplacements(StringRef Code,
+                                                const Replacements &Replaces) {
+  if (Replaces.empty())
+    return Code.str();
+
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
+      new vfs::InMemoryFileSystem);
+  FileManager Files(FileSystemOptions(), InMemoryFileSystem);
+  DiagnosticsEngine Diagnostics(
+      IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
+      new DiagnosticOptions);
+  SourceManager SourceMgr(Diagnostics, Files);
+  Rewriter Rewrite(SourceMgr, LangOptions());
+  InMemoryFileSystem->addFile(
+      "<stdin>", 0, llvm::MemoryBuffer::getMemBuffer(Code, "<stdin>"));
+  FileID ID = SourceMgr.createFileID(Files.getFile("<stdin>"), SourceLocation(),
+                                     clang::SrcMgr::C_User);
+  for (Replacements::const_iterator I = Replaces.begin(), E = Replaces.end();
+       I != E; ++I) {
+    Replacement Replace("<stdin>", I->getOffset(), I->getLength(),
+                        I->getReplacementText());
+    if (!Replace.apply(Rewrite))
+      return llvm::make_error<llvm::StringError>(
+          "Failed to apply replacement: " + Replace.toString(),
+          llvm::inconvertibleErrorCode());
+  }
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  Rewrite.getEditBuffer(ID).write(OS);
+  OS.flush();
+  return Result;
+}
+
+std::map<std::string, Replacements>
+groupReplacementsByFile(const Replacements &Replaces) {
+  std::map<std::string, Replacements> FileToReplaces;
+  for (const auto &Replace : Replaces)
+    // \p Replaces is already conflict-free, so the Error can be discarded.
+    llvm::consumeError(FileToReplaces[Replace.getFilePath()].add(Replace));
+  return FileToReplaces;
+}
+
 } // end namespace tooling
 } // end namespace clang
-
diff --git a/lib/Tooling/FixIt.cpp b/lib/Tooling/FixIt.cpp
new file mode 100644
index 0000000..70942c5
--- /dev/null
+++ b/lib/Tooling/FixIt.cpp
@@ -0,0 +1,31 @@
+//===--- FixIt.cpp - FixIt Hint utilities -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains implementations of utilities to ease source code rewriting
+// by providing helper functions related to FixItHint.
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/FixIt.h"
+#include "clang/Lex/Lexer.h"
+
+namespace clang {
+namespace tooling {
+namespace fixit {
+
+namespace internal {
+StringRef getText(SourceRange Range, const ASTContext &Context) {
+  return Lexer::getSourceText(CharSourceRange::getTokenRange(Range),
+                              Context.getSourceManager(),
+                              Context.getLangOpts());
+}
+} // end namespace internal
+
+} // end namespace fixit
+} // end namespace tooling
+} // end namespace clang
diff --git a/lib/Tooling/Makefile b/lib/Tooling/Makefile
deleted file mode 100644
index 7ea85a8..0000000
--- a/lib/Tooling/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- clang/lib/Tooling/Makefile ---------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME := clangTooling
-PARALLEL_DIRS := Core
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/lib/Tooling/Refactoring.cpp b/lib/Tooling/Refactoring.cpp
index d32452f..5565b54 100644
--- a/lib/Tooling/Refactoring.cpp
+++ b/lib/Tooling/Refactoring.cpp
@@ -11,14 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Tooling/Refactoring.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Rewrite/Core/Rewriter.h"
-#include "clang/Tooling/Refactoring.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_os_ostream.h"
 
@@ -30,7 +30,9 @@
     std::shared_ptr<PCHContainerOperations> PCHContainerOps)
     : ClangTool(Compilations, SourcePaths, PCHContainerOps) {}
 
-Replacements &RefactoringTool::getReplacements() { return Replace; }
+std::map<std::string, Replacements> &RefactoringTool::getReplacements() {
+  return FileToReplaces;
+}
 
 int RefactoringTool::runAndSave(FrontendActionFactory *ActionFactory) {
   if (int Result = run(ActionFactory)) {
@@ -54,12 +56,42 @@
 }
 
 bool RefactoringTool::applyAllReplacements(Rewriter &Rewrite) {
-  return tooling::applyAllReplacements(Replace, Rewrite);
+  bool Result = true;
+  for (const auto &Entry : FileToReplaces)
+    Result = tooling::applyAllReplacements(Entry.second, Rewrite) && Result;
+  return Result;
 }
 
 int RefactoringTool::saveRewrittenFiles(Rewriter &Rewrite) {
   return Rewrite.overwriteChangedFiles() ? 1 : 0;
 }
 
+bool formatAndApplyAllReplacements(
+    const std::map<std::string, Replacements> &FileToReplaces, Rewriter &Rewrite,
+    StringRef Style) {
+  SourceManager &SM = Rewrite.getSourceMgr();
+  FileManager &Files = SM.getFileManager();
+
+  bool Result = true;
+  for (const auto &FileAndReplaces : FileToReplaces) {
+    const std::string &FilePath = FileAndReplaces.first;
+    auto &CurReplaces = FileAndReplaces.second;
+
+    const FileEntry *Entry = Files.getFile(FilePath);
+    FileID ID = SM.getOrCreateFileID(Entry, SrcMgr::C_User);
+    StringRef Code = SM.getBufferData(ID);
+
+    format::FormatStyle CurStyle = format::getStyle(Style, FilePath, "LLVM");
+    auto NewReplacements =
+        format::formatReplacements(Code, CurReplaces, CurStyle);
+    if (!NewReplacements) {
+      llvm::errs() << llvm::toString(NewReplacements.takeError()) << "\n";
+      return false;
+    }
+    Result = applyAllReplacements(*NewReplacements, Rewrite) && Result;
+  }
+  return Result;
+}
+
 } // end namespace tooling
 } // end namespace clang
diff --git a/lib/Tooling/RefactoringCallbacks.cpp b/lib/Tooling/RefactoringCallbacks.cpp
index 4de125e..af25fd8 100644
--- a/lib/Tooling/RefactoringCallbacks.cpp
+++ b/lib/Tooling/RefactoringCallbacks.cpp
@@ -40,10 +40,14 @@
 void ReplaceStmtWithText::run(
     const ast_matchers::MatchFinder::MatchResult &Result) {
   if (const Stmt *FromMatch = Result.Nodes.getStmtAs<Stmt>(FromId)) {
-    Replace.insert(tooling::Replacement(
+    auto Err = Replace.add(tooling::Replacement(
         *Result.SourceManager,
-        CharSourceRange::getTokenRange(FromMatch->getSourceRange()),
-        ToText));
+        CharSourceRange::getTokenRange(FromMatch->getSourceRange()), ToText));
+    // FIXME: better error handling. For now, just print error message in the
+    // release version.
+    if (Err)
+      llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+    assert(!Err);
   }
 }
 
@@ -54,9 +58,15 @@
     const ast_matchers::MatchFinder::MatchResult &Result) {
   const Stmt *FromMatch = Result.Nodes.getStmtAs<Stmt>(FromId);
   const Stmt *ToMatch = Result.Nodes.getStmtAs<Stmt>(ToId);
-  if (FromMatch && ToMatch)
-    Replace.insert(replaceStmtWithStmt(
-        *Result.SourceManager, *FromMatch, *ToMatch));
+  if (FromMatch && ToMatch) {
+    auto Err = Replace.add(
+        replaceStmtWithStmt(*Result.SourceManager, *FromMatch, *ToMatch));
+    // FIXME: better error handling. For now, just print error message in the
+    // release version.
+    if (Err)
+      llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+    assert(!Err);
+  }
 }
 
 ReplaceIfStmtWithItsBody::ReplaceIfStmtWithItsBody(StringRef Id,
@@ -68,11 +78,23 @@
   if (const IfStmt *Node = Result.Nodes.getStmtAs<IfStmt>(Id)) {
     const Stmt *Body = PickTrueBranch ? Node->getThen() : Node->getElse();
     if (Body) {
-      Replace.insert(replaceStmtWithStmt(*Result.SourceManager, *Node, *Body));
+      auto Err =
+          Replace.add(replaceStmtWithStmt(*Result.SourceManager, *Node, *Body));
+      // FIXME: better error handling. For now, just print error message in the
+      // release version.
+      if (Err)
+        llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+      assert(!Err);
     } else if (!PickTrueBranch) {
       // If we want to use the 'else'-branch, but it doesn't exist, delete
       // the whole 'if'.
-      Replace.insert(replaceStmtWithText(*Result.SourceManager, *Node, ""));
+      auto Err =
+          Replace.add(replaceStmtWithText(*Result.SourceManager, *Node, ""));
+      // FIXME: better error handling. For now, just print error message in the
+      // release version.
+      if (Err)
+        llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+      assert(!Err);
     }
   }
 }
diff --git a/lib/Tooling/Tooling.cpp b/lib/Tooling/Tooling.cpp
index fd5596e..9611871 100644
--- a/lib/Tooling/Tooling.cpp
+++ b/lib/Tooling/Tooling.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/Tooling.h"
-#include "clang/AST/ASTConsumer.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Tool.h"
@@ -22,6 +21,7 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Tooling/ArgumentsAdjusters.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/STLExtras.h"
@@ -30,7 +30,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include <utility>
 
 #define DEBUG_TYPE "clang-tooling"
 
@@ -49,8 +51,9 @@
 static clang::driver::Driver *newDriver(
     clang::DiagnosticsEngine *Diagnostics, const char *BinaryName,
     IntrusiveRefCntPtr<vfs::FileSystem> VFS) {
-  clang::driver::Driver *CompilerDriver = new clang::driver::Driver(
-      BinaryName, llvm::sys::getDefaultTargetTriple(), *Diagnostics, VFS);
+  clang::driver::Driver *CompilerDriver =
+      new clang::driver::Driver(BinaryName, llvm::sys::getDefaultTargetTriple(),
+                                *Diagnostics, std::move(VFS));
   CompilerDriver->setTitle("clang_based_tool");
   return CompilerDriver;
 }
@@ -103,14 +106,16 @@
                    const Twine &FileName,
                    std::shared_ptr<PCHContainerOperations> PCHContainerOps) {
   return runToolOnCodeWithArgs(ToolAction, Code, std::vector<std::string>(),
-                               FileName, PCHContainerOps);
+                               FileName, "clang-tool",
+                               std::move(PCHContainerOps));
 }
 
 static std::vector<std::string>
-getSyntaxOnlyToolArgs(const std::vector<std::string> &ExtraArgs,
+getSyntaxOnlyToolArgs(const Twine &ToolName,
+                      const std::vector<std::string> &ExtraArgs,
                       StringRef FileName) {
   std::vector<std::string> Args;
-  Args.push_back("clang-tool");
+  Args.push_back(ToolName.str());
   Args.push_back("-fsyntax-only");
   Args.insert(Args.end(), ExtraArgs.begin(), ExtraArgs.end());
   Args.push_back(FileName.str());
@@ -120,6 +125,7 @@
 bool runToolOnCodeWithArgs(
     clang::FrontendAction *ToolAction, const Twine &Code,
     const std::vector<std::string> &Args, const Twine &FileName,
+    const Twine &ToolName,
     std::shared_ptr<PCHContainerOperations> PCHContainerOps,
     const FileContentMappings &VirtualMappedFiles) {
 
@@ -132,8 +138,9 @@
   OverlayFileSystem->pushOverlay(InMemoryFileSystem);
   llvm::IntrusiveRefCntPtr<FileManager> Files(
       new FileManager(FileSystemOptions(), OverlayFileSystem));
-  ToolInvocation Invocation(getSyntaxOnlyToolArgs(Args, FileNameRef),
-                            ToolAction, Files.get(), PCHContainerOps);
+  ToolInvocation Invocation(getSyntaxOnlyToolArgs(ToolName, Args, FileNameRef),
+                            ToolAction, Files.get(),
+                            std::move(PCHContainerOps));
 
   SmallString<1024> CodeStorage;
   InMemoryFileSystem->addFile(FileNameRef, 0,
@@ -206,14 +213,16 @@
     std::vector<std::string> CommandLine, ToolAction *Action,
     FileManager *Files, std::shared_ptr<PCHContainerOperations> PCHContainerOps)
     : CommandLine(std::move(CommandLine)), Action(Action), OwnsAction(false),
-      Files(Files), PCHContainerOps(PCHContainerOps), DiagConsumer(nullptr) {}
+      Files(Files), PCHContainerOps(std::move(PCHContainerOps)),
+      DiagConsumer(nullptr) {}
 
 ToolInvocation::ToolInvocation(
     std::vector<std::string> CommandLine, FrontendAction *FAction,
     FileManager *Files, std::shared_ptr<PCHContainerOperations> PCHContainerOps)
     : CommandLine(std::move(CommandLine)),
       Action(new SingleFrontendActionFactory(FAction)), OwnsAction(true),
-      Files(Files), PCHContainerOps(PCHContainerOps), DiagConsumer(nullptr) {}
+      Files(Files), PCHContainerOps(std::move(PCHContainerOps)),
+      DiagConsumer(nullptr) {}
 
 ToolInvocation::~ToolInvocation() {
   if (OwnsAction)
@@ -260,7 +269,7 @@
                                                       Input.release());
   }
   return runInvocation(BinaryName, Compilation.get(), Invocation.release(),
-                       PCHContainerOps);
+                       std::move(PCHContainerOps));
 }
 
 bool ToolInvocation::runInvocation(
@@ -274,7 +283,7 @@
     llvm::errs() << "\n";
   }
 
-  return Action->runInvocation(Invocation, Files, PCHContainerOps,
+  return Action->runInvocation(Invocation, Files, std::move(PCHContainerOps),
                                DiagConsumer);
 }
 
@@ -283,7 +292,7 @@
     std::shared_ptr<PCHContainerOperations> PCHContainerOps,
     DiagnosticConsumer *DiagConsumer) {
   // Create a compiler instance to handle the actual work.
-  clang::CompilerInstance Compiler(PCHContainerOps);
+  clang::CompilerInstance Compiler(std::move(PCHContainerOps));
   Compiler.setInvocation(Invocation);
   Compiler.setFileManager(Files);
 
@@ -309,7 +318,7 @@
                      ArrayRef<std::string> SourcePaths,
                      std::shared_ptr<PCHContainerOperations> PCHContainerOps)
     : Compilations(Compilations), SourcePaths(SourcePaths),
-      PCHContainerOps(PCHContainerOps),
+      PCHContainerOps(std::move(PCHContainerOps)),
       OverlayFileSystem(new vfs::OverlayFileSystem(vfs::getRealFileSystem())),
       InMemoryFileSystem(new vfs::InMemoryFileSystem),
       Files(new FileManager(FileSystemOptions(), OverlayFileSystem)),
@@ -327,26 +336,32 @@
 
 void ClangTool::appendArgumentsAdjuster(ArgumentsAdjuster Adjuster) {
   if (ArgsAdjuster)
-    ArgsAdjuster = combineAdjusters(ArgsAdjuster, Adjuster);
+    ArgsAdjuster =
+        combineAdjusters(std::move(ArgsAdjuster), std::move(Adjuster));
   else
-    ArgsAdjuster = Adjuster;
+    ArgsAdjuster = std::move(Adjuster);
 }
 
 void ClangTool::clearArgumentsAdjusters() {
   ArgsAdjuster = nullptr;
 }
 
+static void injectResourceDir(CommandLineArguments &Args, const char *Argv0,
+                              void *MainAddr) {
+  // Allow users to override the resource dir.
+  for (StringRef Arg : Args)
+    if (Arg.startswith("-resource-dir"))
+      return;
+
+  // If there's no override in place add our resource dir.
+  Args.push_back("-resource-dir=" +
+                 CompilerInvocation::GetResourcesPath(Argv0, MainAddr));
+}
+
 int ClangTool::run(ToolAction *Action) {
   // Exists solely for the purpose of lookup of the resource path.
   // This just needs to be some symbol in the binary.
   static int StaticSymbol;
-  // The driver detects the builtin header path based on the path of the
-  // executable.
-  // FIXME: On linux, GetMainExecutable is independent of the value of the
-  // first argument, thus allowing ClangTool and runToolOnCode to just
-  // pass in made-up names here. Make sure this works on other platforms.
-  std::string MainExecutable =
-      llvm::sys::fs::getMainExecutable("clang_tool", &StaticSymbol);
 
   llvm::SmallString<128> InitialDirectory;
   if (std::error_code EC = llvm::sys::fs::current_path(InitialDirectory))
@@ -411,7 +426,17 @@
       if (ArgsAdjuster)
         CommandLine = ArgsAdjuster(CommandLine, CompileCommand.Filename);
       assert(!CommandLine.empty());
-      CommandLine[0] = MainExecutable;
+
+      // Add the resource dir based on the binary of this tool. argv[0] in the
+      // compilation database may refer to a different compiler and we want to
+      // pick up the very same standard library that compiler is using. The
+      // builtin headers in the resource dir need to match the exact clang
+      // version the tool is using.
+      // FIXME: On linux, GetMainExecutable is independent of the value of the
+      // first argument, thus allowing ClangTool and runToolOnCode to just
+      // pass in made-up names here. Make sure this works on other platforms.
+      injectResourceDir(CommandLine, "clang_tool", &StaticSymbol);
+
       // FIXME: We need a callback mechanism for the tool writer to output a
       // customized message for each file.
       DEBUG({ llvm::dbgs() << "Processing: " << File << ".\n"; });
@@ -446,7 +471,7 @@
                      std::shared_ptr<PCHContainerOperations> PCHContainerOps,
                      DiagnosticConsumer *DiagConsumer) override {
     std::unique_ptr<ASTUnit> AST = ASTUnit::LoadFromCompilerInvocation(
-        Invocation, PCHContainerOps,
+        Invocation, std::move(PCHContainerOps),
         CompilerInstance::createDiagnostics(&Invocation->getDiagnosticOpts(),
                                             DiagConsumer,
                                             /*ShouldOwnClient=*/false),
@@ -458,7 +483,6 @@
     return true;
   }
 };
-
 }
 
 int ClangTool::buildASTs(std::vector<std::unique_ptr<ASTUnit>> &ASTs) {
@@ -470,12 +494,12 @@
 buildASTFromCode(const Twine &Code, const Twine &FileName,
                  std::shared_ptr<PCHContainerOperations> PCHContainerOps) {
   return buildASTFromCodeWithArgs(Code, std::vector<std::string>(), FileName,
-                                  PCHContainerOps);
+                                  "clang-tool", std::move(PCHContainerOps));
 }
 
 std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs(
     const Twine &Code, const std::vector<std::string> &Args,
-    const Twine &FileName,
+    const Twine &FileName, const Twine &ToolName,
     std::shared_ptr<PCHContainerOperations> PCHContainerOps) {
   SmallString<16> FileNameStorage;
   StringRef FileNameRef = FileName.toNullTerminatedStringRef(FileNameStorage);
@@ -489,8 +513,8 @@
   OverlayFileSystem->pushOverlay(InMemoryFileSystem);
   llvm::IntrusiveRefCntPtr<FileManager> Files(
       new FileManager(FileSystemOptions(), OverlayFileSystem));
-  ToolInvocation Invocation(getSyntaxOnlyToolArgs(Args, FileNameRef), &Action,
-                            Files.get(), PCHContainerOps);
+  ToolInvocation Invocation(getSyntaxOnlyToolArgs(ToolName, Args, FileNameRef),
+                            &Action, Files.get(), std::move(PCHContainerOps));
 
   SmallString<1024> CodeStorage;
   InMemoryFileSystem->addFile(FileNameRef, 0,
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 43e57b6..1f0b2d9 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -23,18 +23,17 @@
 endfunction()
 
 set(COMPILER_RT_SRC_ROOT ${LLVM_MAIN_SRC_DIR}/projects/compiler-rt)
-if(LLVM_BUILD_EXTERNAL_COMPILER_RT AND EXISTS ${COMPILER_RT_SRC_ROOT}/)
-  if(CMAKE_VERSION VERSION_GREATER 3.3.20150708)
-    set(cmake_3_4_USES_TERMINAL_OPTIONS
-      USES_TERMINAL_CONFIGURE 1
-      USES_TERMINAL_BUILD 1
-      USES_TERMINAL_INSTALL 1
-      )
+# Fall back to the external source path if the in-tree one isn't available.
+# This is the same behavior (try "internal", then check the LLVM_EXTERNAL_...
+# variable) as in add_llvm_external_project
+if(NOT EXISTS ${COMPILER_RT_SRC_ROOT})
+  # Test the variable by name (not its expansion): unset or "" is false.
+  if(LLVM_EXTERNAL_COMPILER_RT_SOURCE_DIR)
+    set(COMPILER_RT_SRC_ROOT ${LLVM_EXTERNAL_COMPILER_RT_SOURCE_DIR})
   endif()
+endif()
 
-  if(CMAKE_VERSION VERSION_GREATER 3.1.20141117)
-    set(cmake_3_2_USES_TERMINAL USES_TERMINAL)
-  endif()
+if(LLVM_BUILD_EXTERNAL_COMPILER_RT AND EXISTS ${COMPILER_RT_SRC_ROOT}/)
 
   # Add compiler-rt as an external project.
   set(COMPILER_RT_PREFIX ${CMAKE_BINARY_DIR}/projects/compiler-rt)
@@ -77,10 +76,13 @@
                -DCOMPILER_RT_INSTALL_PATH:STRING=lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}
                -DCOMPILER_RT_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS}
                -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+               -DLLVM_LIBDIR_SUFFIX=${LLVM_LIBDIR_SUFFIX}
                ${COMPILER_RT_PASSTHROUGH_VARIABLES}
     INSTALL_COMMAND ""
     STEP_TARGETS configure build
-    ${cmake_3_4_USES_TERMINAL_OPTIONS}
+    USES_TERMINAL_CONFIGURE 1
+    USES_TERMINAL_BUILD 1
+    USES_TERMINAL_INSTALL 1
     )
 
   get_ext_project_build_command(run_clean_compiler_rt clean)
@@ -101,7 +103,7 @@
                     COMMAND "${CMAKE_COMMAND}"
                              -DCMAKE_INSTALL_COMPONENT=compiler-rt
                              -P "${CMAKE_BINARY_DIR}/cmake_install.cmake"
-                    ${cmake_3_2_USES_TERMINAL})
+                    USES_TERMINAL)
 
   # Add top-level targets that build specific compiler-rt runtimes.
   set(COMPILER_RT_RUNTIMES asan builtins dfsan lsan msan profile tsan ubsan)
@@ -111,7 +113,7 @@
       COMMAND ${build_runtime_cmd}
       DEPENDS compiler-rt-configure
       WORKING_DIRECTORY ${BINARY_DIR}
-      VERBATIM ${cmake_3_2_USES_TERMINAL})
+      VERBATIM USES_TERMINAL)
   endforeach()
 
   if(LLVM_INCLUDE_TESTS)
@@ -129,7 +131,9 @@
         COMMAND ${run_test_suite}
         DEPENDS compiler-rt-build ${COMPILER_RT_TEST_DEPENDENCIES}
         WORKING_DIRECTORY ${BINARY_DIR}
-        VERBATIM ${cmake_3_2_USES_TERMINAL})
+        VERBATIM
+        USES_TERMINAL
+        )
     endforeach()
 
     # Add special target to run all compiler-rt test suites.
@@ -138,7 +142,16 @@
       COMMAND ${run_check_compiler_rt}
       DEPENDS compiler-rt-build ${COMPILER_RT_TEST_DEPENDENCIES}
       WORKING_DIRECTORY ${BINARY_DIR}
-      VERBATIM ${cmake_3_2_USES_TERMINAL})
-    set_property(GLOBAL APPEND PROPERTY LLVM_LIT_DEPENDS check-compiler-rt)
+      VERBATIM USES_TERMINAL)
+
+    # Add special target for the compiler-rt test dependencies (not the suites).
+    get_ext_project_build_command(run_check_compiler_rt compiler-rt-test-depends)
+    add_custom_target(compiler-rt-test-depends
+      COMMAND ${run_check_compiler_rt}
+      DEPENDS compiler-rt-build ${COMPILER_RT_TEST_DEPENDENCIES}
+      WORKING_DIRECTORY ${BINARY_DIR}
+      VERBATIM USES_TERMINAL)
+    set_property(GLOBAL APPEND PROPERTY LLVM_ADDITIONAL_TEST_DEPENDS compiler-rt-test-depends)
+    set_property(GLOBAL APPEND PROPERTY LLVM_ADDITIONAL_TEST_TARGETS check-compiler-rt)
   endif()
 endif()
diff --git a/runtime/Makefile b/runtime/Makefile
deleted file mode 100644
index 4b0625d..0000000
--- a/runtime/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-##===- runtime/Makefile ------------------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ..
-include $(CLANG_LEVEL)/../../Makefile.config
-
-ifndef NO_RUNTIME_LIBS
-
-PARALLEL_DIRS  := compiler-rt libcxx
-
-endif
-
-include $(CLANG_LEVEL)/Makefile
-
-install::
-
diff --git a/runtime/compiler-rt/Makefile b/runtime/compiler-rt/Makefile
deleted file mode 100644
index 55642a6..0000000
--- a/runtime/compiler-rt/Makefile
+++ /dev/null
@@ -1,259 +0,0 @@
-##===- clang/runtime/compiler-rt/Makefile ------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# This file defines support for building the Clang runtime libraries (which are
-# implemented by compiler-rt) and placing them in the proper locations in the
-# Clang resources directory (i.e., where the driver expects them).
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-include $(CLANG_LEVEL)/Makefile
-
-CLANG_VERSION := $(word 3,$(shell grep "CLANG_VERSION " \
-	$(PROJ_OBJ_DIR)/$(CLANG_LEVEL)/include/clang/Basic/Version.inc))
-
-ResourceDir := $(PROJ_OBJ_ROOT)/$(BuildMode)/lib/clang/$(CLANG_VERSION)
-PROJ_resources := $(DESTDIR)$(PROJ_prefix)/lib/clang/$(CLANG_VERSION)
-
-ResourceLibDir := $(ResourceDir)/lib
-ResourceIncludeDir := $(ResourceDir)/include
-PROJ_resources_lib := $(PROJ_resources)/lib
-PROJ_resources_include := $(PROJ_resources)/include
-
-# Expect compiler-rt to be in llvm/projects/compiler-rt
-COMPILERRT_SRC_ROOT := $(LLVM_SRC_ROOT)/projects/compiler-rt
-
-# We don't currently support building runtime libraries when we are
-# cross-compiling. The issue is that we really want to be set up so that the
-# available compiler targets are independent of the current build.
-#
-# Since we have to build the runtime libraries for the target, it requires we
-# have a cross compiler from the build machine to the target. Although in the
-# case where for the current build (host == target), we do have such a cross
-# compiler, but not defined in a way that is easy for us to reuse. Regardless,
-# that also wouldn't help for other possible compiler configurations.
-#
-# Thus, the simple set up we currently use is to assume that we will be using
-# the just built Clang to compile the compiler-rt libraries. As we grow better
-# cross compilation support inside Clang and tool support in LLVM, this makes it
-# easier for us to achieve the goal of having the compiler targets be easily
-# selected at configure time. However, this design does currently preclude the
-# building of compiler-rt libraries when the Clang itself is being cross
-# compiled.
-#
-# There are three possible solutions:
-#  1. Require building a build-target version of Clang when cross compiling. This
-#     is simplest, but als greatly increases the build time of cross builds.
-#
-#  2. Require cross builds have a build-target version of Clang available for
-#     use. This is a reasonable compromise on #1, as the compiler-rt libraries
-#     are simple enough that there is not a strong desire to ensure they are
-#     built with the exact version of Clang being used. Similarly, as Clang
-#     becomes a better cross compiler it is also increasingly more likely that
-#     the cross compiler being used will already be a version of Clang.
-#
-#  3. Come up with an alternate mechanism to define all the toolchain
-#     information that compiler-rt would need to build libraries for all the
-#     requested targets. This might be a simple short term solution, but is
-#     likely to be unwieldy and irritating to maintain in the long term.
-ifneq ($(LLVM_CROSS_COMPILING),1)
-ifneq ($(CLANG_NO_RUNTIME),1)
-ifeq ($(shell test -d $(COMPILERRT_SRC_ROOT) && echo OK),OK)
-
-# Select the compiler-rt configuration to use, and install directory.
-#
-# FIXME: Eventually, we want some kind of configure support for this. We want to
-# build/install runtime libraries for as many targets as clang was configured to
-# support.
-RuntimeDirs :=
-ifeq ($(OS),Darwin)
-RuntimeDirs += darwin macho_embedded
-RuntimeLibrary.darwin.Configs := \
-	eprintf.a 10.4.a osx.a cc_kext.a \
-	asan_osx_dynamic.dylib \
-	profile_osx.a \
-	ubsan_osx_dynamic.dylib
-
-IOS_SDK := $(shell xcrun --show-sdk-path -sdk iphoneos 2> /dev/null)
-IOSSIM_SDK := $(shell xcrun --show-sdk-path -sdk iphonesimulator 2> /dev/null)
-
-ifneq ($(IOS_SDK)$(IOSSIM_SDK),)
-RuntimeLibrary.darwin.Configs += ios.a profile_ios.a
-endif
-
-ifneq ($(IOS_SDK),)
-ifneq (,$(filter ARM AARCH64,$(TARGETS_TO_BUILD)))
-RuntimeLibrary.darwin.Configs += cc_kext_ios.a
-endif
-endif
-
-ifneq ($(IOSSIM_SDK),)
-RuntimeLibrary.darwin.Configs += asan_iossim_dynamic.dylib \
-                                 ubsan_iossim_dynamic.dylib
-endif
-
-RuntimeLibrary.macho_embedded.Configs := \
-	hard_static.a hard_pic.a
-ifneq (,$(findstring ARM,$(TARGETS_TO_BUILD)))
-RuntimeLibrary.macho_embedded.Configs += \
-	soft_static.a soft_pic.a
-endif
-endif
-
-# On Linux, include a library which has all the runtime functions.
-ifeq ($(OS),Linux)
-RuntimeDirs += linux
-RuntimeLibrary.linux.Configs :=
-
-# TryCompile compiler source flags
-# Returns exit code of running a compiler invocation.
-TryCompile = \
-  $(shell \
-    cflags=""; \
-    for flag in $(3); do \
-      cflags="$$cflags $$flag"; \
-    done; \
-    $(1) $$cflags $(2) -o /dev/null > /dev/null 2> /dev/null ; \
-    echo $$?)
-
-# We try to build 32-bit runtimes both on 32-bit hosts and 64-bit hosts.
-Runtime32BitConfigs = \
-	builtins-i386.a profile-i386.a
-
-# We currently only try to generate runtime libraries on x86.
-ifeq ($(ARCH),x86)
-RuntimeLibrary.linux.Configs += $(Runtime32BitConfigs)
-endif
-
-ifeq ($(ARCH),x86_64)
-RuntimeLibrary.linux.Configs += \
-	builtins-x86_64.a profile-x86_64.a
-# We need to build 32-bit libraries on 64-bit platform, and add them
-# to the list of runtime libraries to make "clang -m32" work.
-# We check that Clang can produce working 32-bit binaries by compiling a simple
-# executable.
-test_source = $(LLVM_SRC_ROOT)/tools/clang/runtime/compiler-rt/clang_linux_test_input.c
-ifeq ($(call TryCompile,$(ToolDir)/clang,$(test_source),-m32),0)
-RuntimeLibrary.linux.Configs += $(Runtime32BitConfigs)
-endif
-endif
-
-endif
-
-####
-# The build rules below are designed to be generic and should only need to be
-# modified based on changes in the compiler-rt layout or build system.
-####
-
-# Rule to build the compiler-rt libraries we need.
-#
-# We build all the libraries in a single shot to avoid recursive make as much as
-# possible.
-BuildRuntimeLibraries:
-	$(Verb) $(MAKE) -C $(COMPILERRT_SRC_ROOT) \
-	  ProjSrcRoot=$(COMPILERRT_SRC_ROOT) \
-	  ProjObjRoot=$(PROJ_OBJ_DIR) \
-	  CC="$(ToolDir)/clang" \
-	  VERBOSE=$(VERBOSE) \
-	  $(RuntimeDirs:%=clang_%)
-.PHONY: BuildRuntimeLibraries
-CleanRuntimeLibraries:
-	$(Verb) $(MAKE) -C $(COMPILERRT_SRC_ROOT) \
-	  ProjSrcRoot=$(COMPILERRT_SRC_ROOT) \
-	  ProjObjRoot=$(PROJ_OBJ_DIR) \
-	  VERBOSE=$(VERBOSE) \
-	  clean
-.PHONY: CleanRuntimeLibraries
-RuntimeHeader: $(ResourceIncludeDir)/sanitizer
-
-$(PROJ_resources_lib):
-	$(Verb) $(MKDIR) $@
-
-$(ResourceIncludeDir):
-	$(Verb) $(MKDIR) $@
-
-$(ResourceIncludeDir)/sanitizer: $(ResourceIncludeDir)
-	$(Verb) $(MKDIR) $@
-	$(Verb) cp $(COMPILERRT_SRC_ROOT)/include/sanitizer/*.h $@
-
-# Expand rules for copying/installing each individual library. We can't use
-# implicit rules here because we need to match against multiple things.
-define RuntimeLibraryTemplate
-$(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.a: BuildRuntimeLibraries
-	@true
-$(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.so: BuildRuntimeLibraries
-	@true
-$(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.dylib: BuildRuntimeLibraries
-	@true
-.PRECIOUS: $(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.a
-
-# Rule to copy the libraries to their resource directory location.
-$(ResourceLibDir)/$1/libclang_rt.%.a: \
-		$(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.a \
-		$(ResourceLibDir)/$1/.dir
-	$(Echo) Copying runtime library $1/$$* to build dir
-	$(Verb) cp $(PROJ_OBJ_DIR)/clang_$1/$$*/libcompiler_rt.a $$@
-$(ResourceLibDir)/$1/libclang_rt.%.so: \
-		$(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.so \
-		$(ResourceLibDir)/$1/.dir
-	$(Echo) Copying runtime library $1/$$* to build dir
-	$(Verb) cp $(PROJ_OBJ_DIR)/clang_$1/$$*/libcompiler_rt.so $$@
-$(ResourceLibDir)/$1/libclang_rt.%.dylib: \
-		$(PROJ_OBJ_DIR)/clang_$1/%/libcompiler_rt.dylib \
-		$(ResourceLibDir)/$1/.dir
-	$(Echo) Copying runtime library $1/$$* to build dir
-	$(Verb) cp $(PROJ_OBJ_DIR)/clang_$1/$$*/libcompiler_rt.dylib $$@
-RuntimeLibrary.$1: \
-		$(RuntimeLibrary.$1.Configs:%=$(ResourceLibDir)/$1/libclang_rt.%)
-.PHONY: RuntimeLibrary.$1
-
-$(PROJ_resources_lib)/$1: $(PROJ_resources_lib)
-	$(Verb) $(MKDIR) $$@
-
-$(PROJ_resources_lib)/$1/libclang_rt.%.a: \
-		$(ResourceLibDir)/$1/libclang_rt.%.a | $(PROJ_resources_lib)/$1
-	$(Echo) Installing compiler runtime library: $1/$$*
-	$(Verb) $(DataInstall) $$< $(PROJ_resources_lib)/$1
-$(PROJ_resources_lib)/$1/libclang_rt.%.so: \
-		$(ResourceLibDir)/$1/libclang_rt.%.so | $(PROJ_resources_lib)/$1
-	$(Echo) Installing compiler runtime library: $1/$$*
-	$(Verb) $(DataInstall) $$< $(PROJ_resources_lib)/$1
-$(PROJ_resources_lib)/$1/libclang_rt.%.dylib: \
-		$(ResourceLibDir)/$1/libclang_rt.%.dylib | $(PROJ_resources_lib)/$1
-	$(Echo) Installing compiler runtime library: $1/$$*
-	$(Verb) $(DataInstall) $$< $(PROJ_resources_lib)/$1
-
-# Rule to install runtime libraries.
-RuntimeLibraryInstall.$1: \
-		$(RuntimeLibrary.$1.Configs:%=$(PROJ_resources_lib)/$1/libclang_rt.%)
-.PHONY: RuntimeLibraryInstall.$1
-endef
-$(foreach lib,$(RuntimeDirs), $(eval $(call RuntimeLibraryTemplate,$(lib))))
-
-$(PROJ_resources_include):
-	$(Verb) $(MKDIR) $@
-
-$(PROJ_resources_include)/sanitizer: $(ResourceIncludeDir)/sanitizer $(PROJ_resources_include)
-	$(Verb) $(MKDIR) $@
-	$(Echo) Installing compiler runtime headers
-	$(Verb) $(DataInstall) $(ResourceIncludeDir)/sanitizer/* \
-                               $(PROJ_resources_include)/sanitizer
-
-RuntimeHeaderInstall: $(PROJ_resources_include)/sanitizer
-.PHONY: RuntimeHeaderInstall
-
-# Hook into the standard Makefile rules.
-all-local:: $(RuntimeDirs:%=RuntimeLibrary.%) RuntimeHeader
-install-local:: $(RuntimeDirs:%=RuntimeLibraryInstall.%) RuntimeHeaderInstall
-clean-local:: CleanRuntimeLibraries
-
-endif
-endif
-endif
diff --git a/runtime/libcxx/Makefile b/runtime/libcxx/Makefile
deleted file mode 100644
index a7df868..0000000
--- a/runtime/libcxx/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-##===- clang/runtime/libcxx/Makefile -----------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# This file defines support for installing a copy of the libcxx headers where
-# the driver expects them.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-include $(CLANG_LEVEL)/Makefile
-
-PROJ_libcxx_hdrs := $(DESTDIR)$(PROJ_prefix)/include
-
-# Expect libcxx to be in llvm/projects/libcxx
-LIBCXX_SRC_ROOT := $(LLVM_SRC_ROOT)/projects/libcxx
-
-ifneq ($(CLANG_NO_RUNTIME),1)
-ifeq ($(shell test -d $(LIBCXX_SRC_ROOT) && echo OK),OK)
-
-install-local::
-	$(MAKE) -C $(LIBCXX_SRC_ROOT) \
-	  HEADER_DIR=$(PROJ_libcxx_hdrs) installheaders
-
-endif
-endif
diff --git a/test/ARCMT/with space/test.h b/test/ARCMT/Inputs/with space/test.h
similarity index 100%
rename from test/ARCMT/with space/test.h
rename to test/ARCMT/Inputs/with space/test.h
diff --git a/test/ARCMT/with space/test.h.result b/test/ARCMT/Inputs/with space/test.h.result
similarity index 100%
rename from test/ARCMT/with space/test.h.result
rename to test/ARCMT/Inputs/with space/test.h.result
diff --git a/test/ARCMT/with space/test1.m.in b/test/ARCMT/Inputs/with space/test1.m.in
similarity index 100%
rename from test/ARCMT/with space/test1.m.in
rename to test/ARCMT/Inputs/with space/test1.m.in
diff --git a/test/ARCMT/with space/test1.m.in.result b/test/ARCMT/Inputs/with space/test1.m.in.result
similarity index 100%
rename from test/ARCMT/with space/test1.m.in.result
rename to test/ARCMT/Inputs/with space/test1.m.in.result
diff --git a/test/ARCMT/with space/test2.m.in b/test/ARCMT/Inputs/with space/test2.m.in
similarity index 100%
rename from test/ARCMT/with space/test2.m.in
rename to test/ARCMT/Inputs/with space/test2.m.in
diff --git a/test/ARCMT/with space/test2.m.in.result b/test/ARCMT/Inputs/with space/test2.m.in.result
similarity index 100%
rename from test/ARCMT/with space/test2.m.in.result
rename to test/ARCMT/Inputs/with space/test2.m.in.result
diff --git a/test/ARCMT/migrate-space-in-path.m b/test/ARCMT/migrate-space-in-path.m
index a797e6d..d060485 100644
--- a/test/ARCMT/migrate-space-in-path.m
+++ b/test/ARCMT/migrate-space-in-path.m
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t.migrate
-// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/"with space"/test1.m.in -x objective-c 
-// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/"with space"/test2.m.in -x objective-c 
-// RUN: c-arcmt-test -mt-migrate-directory %t.migrate | arcmt-test -verify-transformed-files %S/"with space"/test1.m.in.result %S/"with space"/test2.m.in.result %S/"with space"/test.h.result
+// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/Inputs/"with space"/test1.m.in -x objective-c
+// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/Inputs/"with space"/test2.m.in -x objective-c
+// RUN: c-arcmt-test -mt-migrate-directory %t.migrate | arcmt-test -verify-transformed-files %S/Inputs/"with space"/test1.m.in.result %S/Inputs/"with space"/test2.m.in.result %S/Inputs/"with space"/test.h.result
 // RUN: rm -rf %t.migrate
diff --git a/test/ASTMerge/Inputs/class1.cpp b/test/ASTMerge/Inputs/class1.cpp
index 0cd6565..b0a7645 100644
--- a/test/ASTMerge/Inputs/class1.cpp
+++ b/test/ASTMerge/Inputs/class1.cpp
@@ -1,5 +1,6 @@
 struct A {
-  int x;
+  public:
+    int x;
 };
 
 struct B : A {
diff --git a/test/ASTMerge/Inputs/class2.cpp b/test/ASTMerge/Inputs/class2.cpp
index 5d5d9ca..2bed6d7 100644
--- a/test/ASTMerge/Inputs/class2.cpp
+++ b/test/ASTMerge/Inputs/class2.cpp
@@ -1,5 +1,6 @@
 struct A {
-  int x;
+  public:
+    int x;
 };
 
 struct B : A {
diff --git a/test/ASTMerge/anonymous-fields.cpp b/test/ASTMerge/anonymous-fields.cpp
index 6210142..67afc29 100644
--- a/test/ASTMerge/anonymous-fields.cpp
+++ b/test/ASTMerge/anonymous-fields.cpp
@@ -1,4 +1,3 @@
-// XFAIL: *
 // RUN: %clang_cc1 -emit-pch -o %t.1.ast %S/Inputs/anonymous-fields1.cpp
 // RUN: %clang_cc1 -emit-pch -o %t.2.ast %S/Inputs/anonymous-fields2.cpp
 // RUN: %clang_cc1 -emit-obj -o /dev/null -ast-merge %t.1.ast -ast-merge %t.2.ast %s
diff --git a/test/ASTMerge/class.cpp b/test/ASTMerge/class.cpp
index 7b31187..a68a2d1 100644
--- a/test/ASTMerge/class.cpp
+++ b/test/ASTMerge/class.cpp
@@ -3,12 +3,12 @@
 // RUN: %clang_cc1 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 | FileCheck %s
 // RUN: %clang_cc1 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 -Wno-odr -Werror
 
-// CHECK: class1.cpp:5:8: warning: type 'B' has incompatible definitions in different translation units
-// CHECK: class1.cpp:6:9: note: field 'y' has type 'float' here
-// CHECK: class2.cpp:6:7: note: field 'y' has type 'int' here
+// CHECK: class1.cpp:6:8: warning: type 'B' has incompatible definitions in different translation units
+// CHECK: class1.cpp:7:9: note: field 'y' has type 'float' here
+// CHECK: class2.cpp:7:7: note: field 'y' has type 'int' here
 
 // FIXME: we should also complain about mismatched types on the method
 
-// CHECK: class1.cpp:17:6: warning: type 'E' has incompatible definitions in different translation units
-// CHECK: class1.cpp:18:3: note: enumerator 'b' with value 1 here
-// CHECK: class2.cpp:11:3: note: enumerator 'a' with value 0 here
+// CHECK: class1.cpp:18:6: warning: type 'E' has incompatible definitions in different translation units
+// CHECK: class1.cpp:19:3: note: enumerator 'b' with value 1 here
+// CHECK: class2.cpp:12:3: note: enumerator 'a' with value 0 here
diff --git a/test/Analysis/Inputs/localization-pch.h b/test/Analysis/Inputs/localization-pch.h
deleted file mode 100644
index 973270e..0000000
--- a/test/Analysis/Inputs/localization-pch.h
+++ /dev/null
@@ -1,5 +0,0 @@
-// Used to test missing checker for missing localization context comments
-// in precompiled headers.
-
-#define MyLocalizedStringInPCH(key) NSLocalizedString((key), @"")
-
diff --git a/test/Analysis/MPIMock.h b/test/Analysis/MPIMock.h
new file mode 100644
index 0000000..01d2d42
--- /dev/null
+++ b/test/Analysis/MPIMock.h
@@ -0,0 +1,55 @@
+// Message Passing Interface mock header. Mocks MPI constants and functions, in
+// order to make them available in distinct integration test files.
+
+#define NULL 0
+
+// mock types
+typedef int MPI_Datatype;
+typedef int MPI_Comm;
+typedef int MPI_Request;
+typedef int MPI_Status;
+typedef int MPI_Op;
+typedef int int8_t;
+typedef int uint8_t;
+typedef int uint16_t;
+typedef int int64_t;
+namespace std { template<class T> struct complex { T real; T imag; }; }
+
+// mock constants
+#define MPI_DATATYPE_NULL 0
+#define MPI_CHAR 0
+#define MPI_BYTE 0
+#define MPI_INT 0
+#define MPI_LONG 0
+#define MPI_LONG_DOUBLE 0
+#define MPI_UNSIGNED 0
+#define MPI_INT8_T 0
+#define MPI_UINT8_T 0
+#define MPI_UINT16_T 0
+#define MPI_C_LONG_DOUBLE_COMPLEX 0
+#define MPI_FLOAT 0
+#define MPI_DOUBLE 0
+#define MPI_CXX_BOOL 0
+#define MPI_CXX_FLOAT_COMPLEX 0
+#define MPI_CXX_DOUBLE_COMPLEX 0
+#define MPI_CXX_LONG_DOUBLE_COMPLEX 0
+#define MPI_IN_PLACE 0
+#define MPI_COMM_WORLD 0
+#define MPI_STATUS_IGNORE 0
+#define MPI_STATUSES_IGNORE 0
+#define MPI_SUM 0
+
+// mock functions
+int MPI_Comm_size(MPI_Comm, int *);
+int MPI_Comm_rank(MPI_Comm, int *);
+int MPI_Send(const void *, int, MPI_Datatype, int, int, MPI_Comm);
+int MPI_Recv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Status *);
+int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm,
+    MPI_Request *);
+int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Request *);
+int MPI_Wait(MPI_Request *, MPI_Status *);
+int MPI_Waitall(int, MPI_Request[], MPI_Status[]);
+int MPI_Reduce(const void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm);
+int MPI_Ireduce(const void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm,
+    MPI_Request *);
+int MPI_Bcast(void *, int count, MPI_Datatype, int, MPI_Comm);
diff --git a/test/Analysis/MemRegion.cpp b/test/Analysis/MemRegion.cpp
new file mode 100644
index 0000000..992b7f1
--- /dev/null
+++ b/test/Analysis/MemRegion.cpp
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=optin.mpi.MPI-Checker -verify %s
+
+#include "MPIMock.h"
+
+// Use MPI-Checker to test 'getDescriptiveName', as the checker uses the
+// function for diagnostics.
+void testGetDescriptiveName() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq1;
+  MPI_Wait(&sendReq1, MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq1' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName2() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq1[10][10][10];
+  MPI_Wait(&sendReq1[1][7][9], MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq1[1][7][9]' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName3() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req;
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName4() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req[2][2]; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req[0][1];
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req[0][1]' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName5() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req; } ReqStructInner;
+  typedef struct  { ReqStructInner req; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req.req;
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req.req' has no matching nonblocking call.}}
+}
diff --git a/test/Analysis/NewDelete-checker-test.cpp b/test/Analysis/NewDelete-checker-test.cpp
index 443cb2e..78a0015 100644
--- a/test/Analysis/NewDelete-checker-test.cpp
+++ b/test/Analysis/NewDelete-checker-test.cpp
@@ -377,3 +377,19 @@
   delete foo;
   delete foo;  // expected-warning {{Attempt to delete released memory}}
 }
+
+struct Base {
+  virtual ~Base() {}
+};
+
+struct Derived : Base {
+};
+
+Base *allocate() {
+  return new Derived;
+}
+
+void shouldNotReportLeak() {
+  Derived *p = (Derived *)allocate();
+  delete p;
+}
diff --git a/test/Analysis/analyzeOneFunction.m b/test/Analysis/analyzeOneFunction.m
index 1ff2fc8..e70b2d7 100644
--- a/test/Analysis/analyzeOneFunction.m
+++ b/test/Analysis/analyzeOneFunction.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -analyze -analyze-function="myMethodWithY:withX:" -analyzer-checker=core,osx.cocoa.RetainCount -analyzer-store=region -verify %s
+// RUN: %clang_cc1 -analyze -analyze-function="-[Test1 myMethodWithY:withX:]" -analyzer-checker=core,osx.cocoa.RetainCount -analyzer-store=region -verify %s
 
 typedef signed char BOOL;
 typedef unsigned int NSUInteger;
diff --git a/test/Analysis/analyze_display_progress.cpp b/test/Analysis/analyze_display_progress.cpp
deleted file mode 100644
index c84ab63..0000000
--- a/test/Analysis/analyze_display_progress.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: %clang_cc1 -analyze -analyzer-display-progress %s 2>&1 | FileCheck %s
-
-void f() {};
-void g() {};
-void h() {}
-
-struct SomeStruct {
-  void f() {}
-};
-
-struct SomeOtherStruct {
-  void f() {}
-};
-
-namespace ns {
-  struct SomeStruct {
-    void f() {}
-  };
-}
-
-// CHECK: analyze_display_progress.cpp f
-// CHECK: analyze_display_progress.cpp g
-// CHECK: analyze_display_progress.cpp h
-// CHECK: analyze_display_progress.cpp SomeStruct::f
-// CHECK: analyze_display_progress.cpp SomeOtherStruct::f
-// CHECK: analyze_display_progress.cpp ns::SomeStruct::f
diff --git a/test/Analysis/analyzer-display-progress.cpp b/test/Analysis/analyzer-display-progress.cpp
new file mode 100644
index 0000000..5d9f5e5
--- /dev/null
+++ b/test/Analysis/analyzer-display-progress.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -analyze -analyzer-display-progress %s 2>&1 | FileCheck %s
+
+void f() {};
+void g() {};
+void h() {}
+
+struct SomeStruct {
+  void f() {}
+};
+
+struct SomeOtherStruct {
+  void f() {}
+};
+
+namespace ns {
+  struct SomeStruct {
+    void f(int) {}
+    void f(float, ::SomeStruct) {}
+    void f(float, SomeStruct) {}
+  };
+}
+
+// CHECK: analyzer-display-progress.cpp f()
+// CHECK: analyzer-display-progress.cpp g()
+// CHECK: analyzer-display-progress.cpp h()
+// CHECK: analyzer-display-progress.cpp SomeStruct::f()
+// CHECK: analyzer-display-progress.cpp SomeOtherStruct::f()
+// CHECK: analyzer-display-progress.cpp ns::SomeStruct::f(int)
+// CHECK: analyzer-display-progress.cpp ns::SomeStruct::f(float, ::SomeStruct)
+// CHECK: analyzer-display-progress.cpp ns::SomeStruct::f(float, struct ns::SomeStruct)
diff --git a/test/Analysis/analyzer-display-progress.m b/test/Analysis/analyzer-display-progress.m
new file mode 100644
index 0000000..cc43cf3
--- /dev/null
+++ b/test/Analysis/analyzer-display-progress.m
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -fblocks -analyze -analyzer-display-progress %s 2>&1 | FileCheck %s
+
+#include "Inputs/system-header-simulator-objc.h"
+
+static void f() {}
+
+@interface I: NSObject
+-(void)instanceMethod:(int)arg1 with:(int)arg2;
++(void)classMethod;
+@end
+
+@implementation I
+-(void)instanceMethod:(int)arg1 with:(int)arg2 {}
++(void)classMethod {}
+@end
+
+void g(I *i, int x, int y) {
+  [I classMethod];
+  [i instanceMethod: x with: y];
+
+  void (^block)(void);
+  block = ^{};
+  block();
+}
+
+// CHECK: analyzer-display-progress.m f
+// CHECK: analyzer-display-progress.m -[I instanceMethod:with:]
+// CHECK: analyzer-display-progress.m +[I classMethod]
+// CHECK: analyzer-display-progress.m g
+// CHECK: analyzer-display-progress.m block (line: 22, col: 11)
diff --git a/test/Analysis/analyzer-enabled-checkers.c b/test/Analysis/analyzer-enabled-checkers.c
new file mode 100644
index 0000000..e60de05
--- /dev/null
+++ b/test/Analysis/analyzer-enabled-checkers.c
@@ -0,0 +1,20 @@
+// RUN: %clang -target x86_64-apple-darwin10 --analyze %s -o /dev/null -Xclang -analyzer-checker=core -Xclang -analyzer-list-enabled-checkers > %t 2>&1
+// RUN: FileCheck --input-file=%t %s
+
+// CHECK: OVERVIEW: Clang Static Analyzer Enabled Checkers List
+// CHECK: core.CallAndMessage
+// CHECK: core.DivideZero
+// CHECK: core.DynamicTypePropagation
+// CHECK: core.NonNullParamChecker
+// CHECK: core.NullDereference
+// CHECK: core.StackAddressEscape
+// CHECK: core.UndefinedBinaryOperatorResult
+// CHECK: core.VLASize
+// CHECK: core.builtin.BuiltinFunctions
+// CHECK: core.builtin.NoReturnFunctions
+// CHECK: core.uninitialized.ArraySubscript
+// CHECK: core.uninitialized.Assign
+// CHECK: core.uninitialized.Branch
+// CHECK: core.uninitialized.CapturedBlockVariable
+// CHECK: core.uninitialized.UndefReturn
+
diff --git a/test/Analysis/call-invalidation.cpp b/test/Analysis/call-invalidation.cpp
index 7297d1e..80323ff 100644
--- a/test/Analysis/call-invalidation.cpp
+++ b/test/Analysis/call-invalidation.cpp
@@ -118,3 +118,50 @@
 }
 
 
+struct PlainStruct {
+  int x, y;
+  mutable int z;
+};
+
+PlainStruct glob;
+
+void useAnything(void *);
+void useAnythingConst(const void *);
+
+void testInvalidationThroughBaseRegionPointer() {
+  PlainStruct s1;
+  s1.x = 1;
+  s1.z = 1;
+  clang_analyzer_eval(s1.x == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(s1.z == 1); // expected-warning{{TRUE}}
+  // Not only passing a structure pointer through const pointer parameter,
+  // but also passing a field pointer through const pointer parameter
+  // should preserve the contents of the structure.
+  useAnythingConst(&(s1.y));
+  clang_analyzer_eval(s1.x == 1); // expected-warning{{TRUE}}
+  // FIXME: Should say "UNKNOWN", because it is not uncommon to
+  // modify a mutable member variable through const pointer.
+  clang_analyzer_eval(s1.z == 1); // expected-warning{{TRUE}}
+  useAnything(&(s1.y));
+  clang_analyzer_eval(s1.x == 1); // expected-warning{{UNKNOWN}}
+}
+
+
+void useFirstConstSecondNonConst(const void *x, void *y);
+void useFirstNonConstSecondConst(void *x, const void *y);
+
+void testMixedConstNonConstCalls() {
+  PlainStruct s2;
+  s2.x = 1;
+  useFirstConstSecondNonConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.x == 1); // expected-warning{{UNKNOWN}}
+  s2.x = 1;
+  useFirstNonConstSecondConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.x == 1); // expected-warning{{UNKNOWN}}
+  s2.y = 1;
+  useFirstConstSecondNonConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.y == 1); // expected-warning{{UNKNOWN}}
+  s2.y = 1;
+  useFirstNonConstSecondConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.y == 1); // expected-warning{{UNKNOWN}}
+}
diff --git a/test/Analysis/copypaste/asm.cpp b/test/Analysis/copypaste/asm.cpp
new file mode 100644
index 0000000..e93f119
--- /dev/null
+++ b/test/Analysis/copypaste/asm.cpp
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -analyze -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+int foo1(int src) {
+  int dst = src;
+  if (src < 100 && src > 0) {
+
+    asm ("mov %1, %0\n\t"
+         "add $1, %0"
+         : "=r" (dst)
+         : "r" (src));
+
+  }
+  return dst;
+}
+
+// Identical to foo1 except that it adds two instead of one, so it's no clone.
+int foo2(int src) {
+  int dst = src;
+  if (src < 100 && src > 0) {
+
+    asm ("mov %1, %0\n\t"
+         "add $2, %0"
+         : "=r" (dst)
+         : "r" (src));
+
+  }
+  return dst;
+}
+
+// Identical to foo1 except that it's a volatile asm statement, so it's no clone.
+int foo3(int src) {
+  int dst = src;
+  if (src < 100 && src > 0) {
+
+    asm volatile ("mov %1, %0\n\t"
+         "add $1, %0"
+         : "=r" (dst)
+         : "r" (src));
+
+  }
+  return dst;
+}
diff --git a/test/Analysis/copypaste/attributes.cpp b/test/Analysis/copypaste/attributes.cpp
new file mode 100644
index 0000000..72d654c
--- /dev/null
+++ b/test/Analysis/copypaste/attributes.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -analyze -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+int foo1(int n) {
+  int result = 0;
+  switch (n) {
+  case 33:
+    result += 33;
+    [[clang::fallthrough]];
+  case 44:
+    result += 44;
+  }
+  return result;
+}
+
+// Identical to foo1 except for the missing attribute.
+int foo2(int n) {
+  int result = 0;
+  switch (n) {
+  case 33:
+    result += 33;
+    ;
+  case 44:
+    result += 44;
+  }
+  return result;
+}
diff --git a/test/Analysis/copypaste/blocks.cpp b/test/Analysis/copypaste/blocks.cpp
new file mode 100644
index 0000000..0bd9812
--- /dev/null
+++ b/test/Analysis/copypaste/blocks.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -analyze -fblocks -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// This tests if we search for clones in blocks.
+
+void log();
+
+auto BlockA = ^(int a, int b){ // expected-warning{{Detected code clone.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+};
+
+auto BlockB = ^(int a, int b){ // expected-note{{Related code clone is here.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+};
diff --git a/test/Analysis/copypaste/call.cpp b/test/Analysis/copypaste/call.cpp
new file mode 100644
index 0000000..0c10262
--- /dev/null
+++ b/test/Analysis/copypaste/call.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -analyze -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+bool a();
+bool b();
+
+// Calls function a with some extra code to pass the minimum complexity.
+bool foo1(int x) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    return a();
+  return true;
+}
+
+// Calls function b with some extra code to pass the minimum complexity.
+bool foo2(int x) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    return b();
+  return true;
+}
+
+// Test that we don't crash on function pointer calls
+
+bool (*funcPtr)(int);
+
+bool fooPtr1(int x) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    return funcPtr(1);
+  return true;
+}
diff --git a/test/Analysis/copypaste/catch.cpp b/test/Analysis/copypaste/catch.cpp
new file mode 100644
index 0000000..590ce8f
--- /dev/null
+++ b/test/Analysis/copypaste/catch.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -analyze -fcxx-exceptions -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+bool foo1(int x) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    try { x--; } catch (int i) {}
+  return true;
+}
+
+// Uses a catch-all ellipsis instead of a type
+bool foo2(int x) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    try { x--; } catch (...) {}
+  return true;
+}
+
+// Catches a different type (long instead of int)
+bool foo3(int x) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    try { x--; } catch (long i) {}
+  return true;
+}
diff --git a/test/Analysis/copypaste/delete.cpp b/test/Analysis/copypaste/delete.cpp
new file mode 100644
index 0000000..dc42c9c
--- /dev/null
+++ b/test/Analysis/copypaste/delete.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -analyze -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+bool foo1(int x, int* a) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    delete a;
+  return true;
+}
+
+// Explicit global delete
+bool foo2(int x, int* a) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    ::delete a;
+  return true;
+}
+
+// Array delete
+bool foo3(int x, int* a) {
+  if (x > 0)
+    return false;
+  else if (x < 0)
+    delete[] a;
+  return true;
+}
diff --git a/test/Analysis/copypaste/dependent-exist.cpp b/test/Analysis/copypaste/dependent-exist.cpp
new file mode 100644
index 0000000..5182ba6
--- /dev/null
+++ b/test/Analysis/copypaste/dependent-exist.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -analyze -fms-extensions -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+bool foo1(int x) {
+  if (x < 0) {
+    __if_exists(x) { return false; }
+  }
+  return true;
+}
+
+// Same as above, but __if_not_exists
+bool foo2(int x) {
+  if (x < 0) {
+    __if_not_exists(x) { return false; }
+  }
+  return true;
+}
diff --git a/test/Analysis/copypaste/expr-types.cpp b/test/Analysis/copypaste/expr-types.cpp
new file mode 100644
index 0000000..14eef6e
--- /dev/null
+++ b/test/Analysis/copypaste/expr-types.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+
+int foo1(int a, int b) {
+  if (a > b)
+    return a;
+  return b;
+}
+
+// Different types, so not a clone
+int foo2(long a, long b) {
+  if (a > b)
+    return a;
+  return b;
+}
diff --git a/test/Analysis/copypaste/fold.cpp b/test/Analysis/copypaste/fold.cpp
new file mode 100644
index 0000000..548dfb1
--- /dev/null
+++ b/test/Analysis/copypaste/fold.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -analyze -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+int global = 0;
+
+template<typename ...Args>
+int foo1(Args&&... args) {
+  if (global > 0)
+    return 0;
+  else if (global < 0)
+    return (args + ...);
+  return 1;
+}
+
+// Different operator in fold expression.
+template<typename ...Args>
+int foo2(Args&&... args) {
+  if (global > 0)
+    return 0;
+  else if (global < 0)
+    return (args - ...);
+  return 1;
+}
+
+// Parameter pack on a different side
+template<typename ...Args>
+int foo3(Args&&... args) {
+  if (global > 0)
+    return 0;
+  else if (global < 0)
+    return -1;
+  return (... + args);
+return 1;
+}
diff --git a/test/Analysis/copypaste/function-try-block.cpp b/test/Analysis/copypaste/function-try-block.cpp
new file mode 100644
index 0000000..b13096d
--- /dev/null
+++ b/test/Analysis/copypaste/function-try-block.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -analyze -fcxx-exceptions -std=c++1z -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// Tests if function try blocks are correctly handled.
+
+void nonCompoundStmt1(int& x)
+  try { x += 1; } catch(...) { x -= 1; } // expected-warning{{Detected code clone.}}
+
+void nonCompoundStmt2(int& x)
+  try { x += 1; } catch(...) { x -= 1; } // expected-note{{Related code clone is here.}}
diff --git a/test/Analysis/copypaste/functions.cpp b/test/Analysis/copypaste/functions.cpp
new file mode 100644
index 0000000..2a871f7
--- /dev/null
+++ b/test/Analysis/copypaste/functions.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// This tests if we search for clones in functions.
+
+void log();
+
+int max(int a, int b) { // expected-warning{{Detected code clone.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int x, int y) { // expected-note{{Related code clone is here.}}
+  log();
+  if (x > y)
+    return x;
+  return y;
+}
+
+// Functions below are not clones and should not be reported.
+
+// The next two functions test that statement classes are still respected when
+// checking for clones in expressions. This will show that the statement
+// specific data of all base classes is collected, and not just the data of the
+// first base class.
+int testBaseClass(int a, int b) { // no-warning
+  log();
+  if (a > b)
+    return true ? a : b;
+  return b;
+}
+int testBaseClass2(int a, int b) { // no-warning
+  log();
+  if (a > b)
+    return __builtin_choose_expr(true, a, b);
+  return b;
+}
+
+// No clone because of the different comparison operator.
+int min1(int a, int b) { // no-warning
+  log();
+  if (a < b)
+    return a;
+  return b;
+}
+
+// No clone because of the different pattern in which the variables are used.
+int min2(int a, int b) { // no-warning
+  log();
+  if (a > b)
+    return b;
+  return a;
+}
+
+int foo(int a, int b) { // no-warning
+  return a + b;
+}
diff --git a/test/Analysis/copypaste/generic.c b/test/Analysis/copypaste/generic.c
new file mode 100644
index 0000000..9d83921
--- /dev/null
+++ b/test/Analysis/copypaste/generic.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -analyze -std=c11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+int global;
+
+int foo1() {
+  if (global > 0)
+    return 0;
+  else if (global < 0)
+    return _Generic(global, double: 1, float: 2, default: 3);
+  return 1;
+}
+
+// Different associated type (int instead of float)
+int foo2() {
+  if (global > 0)
+    return 0;
+  else if (global < 0)
+    return _Generic(global, double: 1, int: 2, default: 4);
+  return 1;
+}
+
+// Different number of associated types.
+int foo3() {
+  if (global > 0)
+    return 0;
+  else if (global < 0)
+    return _Generic(global, double: 1, default: 4);
+  return 1;
+}
diff --git a/test/Analysis/copypaste/labels.cpp b/test/Analysis/copypaste/labels.cpp
new file mode 100644
index 0000000..26318ac
--- /dev/null
+++ b/test/Analysis/copypaste/labels.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -analyze -std=gnu++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+
+bool foo1(int x) {
+  start:
+  if (x != 3) {
+    ++x;
+    void *ptr = &&start;
+    goto start;
+  }
+  end:
+  return false;
+}
+
+// Targeting a different label with the address-of-label operator.
+bool foo2(int x) {
+  start:
+  if (x != 3) {
+    ++x;
+    void *ptr = &&end;
+    goto start;
+  }
+  end:
+  return false;
+}
+
+// Different target label in goto
+bool foo3(int x) {
+  start:
+  if (x != 3) {
+    ++x;
+    void *ptr = &&start;
+    goto end;
+  }
+  end:
+  return false;
+}
+
+// FIXME: Can't detect same algorithm as in foo1 but with different label names.
+bool foo4(int x) {
+  foo:
+  if (x != 3) {
+    ++x;
+    void *ptr = &&foo;
+    goto foo;
+  }
+  end:
+  return false;
+}
diff --git a/test/Analysis/copypaste/lambda.cpp b/test/Analysis/copypaste/lambda.cpp
new file mode 100644
index 0000000..c13c56f
--- /dev/null
+++ b/test/Analysis/copypaste/lambda.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// expected-no-diagnostics
+
+void foo1(int a, long b) {
+  auto l = [a, b](){};
+}
+
+void foo2(int a, long b) {
+  auto l = [&a, b](){};
+}
+
+void foo3(int a, long b) {
+  auto l = [a](){};
+}
+
+void foo4(int a, long b) {
+  auto l = [=](){};
+}
+
+void foo5(int a, long b) {
+  auto l = [&](){};
+}
+
diff --git a/test/Analysis/copypaste/objc-methods.m b/test/Analysis/copypaste/objc-methods.m
new file mode 100644
index 0000000..0636447
--- /dev/null
+++ b/test/Analysis/copypaste/objc-methods.m
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -analyze -Wno-objc-root-class -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// This tests if we search for clones in Objective-C methods.
+
+@interface A
+- (int) setOk : (int) a : (int) b;
+@end
+
+@implementation A
+- (int) setOk : (int) a : (int) b {  // expected-warning{{Detected code clone.}}
+  if (a > b)
+    return a;
+  return b;
+}
+@end
+
+@interface B
+- (int) setOk : (int) a : (int) b;
+@end
+
+@implementation B
+- (int) setOk : (int) a : (int) b { // expected-note{{Related code clone is here.}}
+  if (a > b)
+    return a;
+  return b;
+}
+@end
diff --git a/test/Analysis/copypaste/sub-sequences.cpp b/test/Analysis/copypaste/sub-sequences.cpp
new file mode 100644
index 0000000..59dc464
--- /dev/null
+++ b/test/Analysis/copypaste/sub-sequences.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// This tests if sub-sequences can match with normal sequences.
+
+void log2(int a);
+void log();
+
+int max(int a, int b) {
+  log2(a);
+  log(); // expected-warning{{Detected code clone.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) {
+  log(); // expected-note{{Related code clone is here.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+  return a + b;
+}
diff --git a/test/Analysis/cxx11-crashes.cpp b/test/Analysis/cxx11-crashes.cpp
index 3c33de3..c6034e6 100644
--- a/test/Analysis/cxx11-crashes.cpp
+++ b/test/Analysis/cxx11-crashes.cpp
@@ -1,5 +1,4 @@
 // RUN: %clang_cc1 -analyze -analyzer-checker=core -std=c++11 -verify %s
-// expected-no-diagnostics
 
 // radar://11485149, PR12871
 class PlotPoint {
@@ -91,6 +90,6 @@
 void fallthrough() {
   switch (1) {
     case 1:
-      [[clang::fallthrough]];
+      [[clang::fallthrough]]; // expected-error {{does not directly precede}}
   }
 }
diff --git a/test/Analysis/dead-stores.c b/test/Analysis/dead-stores.c
index da8e8bd..cddb6c6 100644
--- a/test/Analysis/dead-stores.c
+++ b/test/Analysis/dead-stores.c
@@ -569,3 +569,7 @@
 
 }
 
+void testVolatile() {
+    volatile int v;
+    v = 0; // no warning
+}
diff --git a/test/Analysis/initializers-cfg-output.cpp b/test/Analysis/initializers-cfg-output.cpp
index db3c0fb..deefbef 100644
--- a/test/Analysis/initializers-cfg-output.cpp
+++ b/test/Analysis/initializers-cfg-output.cpp
@@ -61,7 +61,7 @@
 // CHECK:    6: B([B1.5]) (Base initializer)
 // CHECK:    7:  (CXXConstructExpr, class A)
 // CHECK:    8: A([B1.7]) (Base initializer)
-// CHECK:    9: /*implicit*/int()
+// CHECK:    9: /*implicit*/(int)0
 // CHECK:   10: i([B1.9]) (Member initializer)
 // CHECK:   11: this
 // CHECK:   12: [B1.11]->i
diff --git a/test/Analysis/localization-aggressive.m b/test/Analysis/localization-aggressive.m
index 89950d4..79c9c13 100644
--- a/test/Analysis/localization-aggressive.m
+++ b/test/Analysis/localization-aggressive.m
@@ -1,6 +1,4 @@
-// RUN: %clang_cc1 -fblocks -x objective-c-header -emit-pch -o %t.pch %S/Inputs/localization-pch.h
-
-// RUN: %clang_cc1 -analyze -fblocks -analyzer-store=region  -analyzer-checker=optin.osx.cocoa.localizability.NonLocalizedStringChecker -analyzer-checker=optin.osx.cocoa.localizability.EmptyLocalizationContextChecker -include-pch %t.pch -verify  -analyzer-config AggressiveReport=true %s
+// RUN: %clang_cc1 -analyze -fblocks -analyzer-store=region  -analyzer-checker=optin.osx.cocoa.localizability.NonLocalizedStringChecker -analyzer-checker=optin.osx.cocoa.localizability.EmptyLocalizationContextChecker -verify  -analyzer-config AggressiveReport=true %s
 
 // These declarations were reduced using Delta-Debugging from Foundation.h
 // on Mac OS X.
@@ -251,10 +249,6 @@
   NSString *string3 = NSLocalizedString((0 ? @"Critical" : @"Current"),nil); // expected-warning {{Localized string macro should include a non-empty comment for translators}}
 }
 
-- (void)testMacroExpansionDefinedInPCH {
-  NSString *string = MyLocalizedStringInPCH(@"Hello"); // expected-warning {{Localized string macro should include a non-empty comment for translators}}
-}
-
 #define KCLocalizedString(x,comment) NSLocalizedString(x, comment)
 #define POSSIBLE_FALSE_POSITIVE(s,other) KCLocalizedString(s,@"Comment")
 
diff --git a/test/Analysis/malloc.c b/test/Analysis/malloc.c
index 30d7269..51e2cd6 100644
--- a/test/Analysis/malloc.c
+++ b/test/Analysis/malloc.c
@@ -1750,6 +1750,19 @@
   fake_rb_tree_insert_node(rbt, data); // no warning
 }
 
+struct IntAndPtr {
+  int x;
+  int *p;
+};
+
+void constEscape(const void *ptr);
+
+void testConstEscapeThroughAnotherField() {
+  struct IntAndPtr s;
+  s.p = malloc(sizeof(int));
+  constEscape(&(s.x)); // could free s.p!
+} // no-warning
+
 // ----------------------------------------------------------------------------
 // False negatives.
 
@@ -1769,3 +1782,9 @@
   // FIXME: This is a leak: if we think a system function won't free p, it
   // won't free (p-1) either.
 }
+
+void testMallocIntoMalloc() {
+  StructWithPtr *s = malloc(sizeof(StructWithPtr));
+  s->memP = malloc(sizeof(int));
+  free(s);
+} // FIXME: should warn here
diff --git a/test/Analysis/mpichecker.cpp b/test/Analysis/mpichecker.cpp
new file mode 100644
index 0000000..b7a1e00
--- /dev/null
+++ b/test/Analysis/mpichecker.cpp
@@ -0,0 +1,342 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=optin.mpi.MPI-Checker -verify %s
+
+#include "MPIMock.h"
+
+void matchedWait1() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank >= 0) {
+    MPI_Request sendReq1, recvReq1;
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1);
+
+    MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+    MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+  }
+} // no error
+
+void matchedWait2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank >= 0) {
+    MPI_Request sendReq1, recvReq1;
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1);
+    MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+    MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+  }
+} // no error
+
+void matchedWait3() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank >= 0) {
+    MPI_Request sendReq1, recvReq1;
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1);
+
+    if (rank > 1000) {
+      MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+      MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+    } else {
+      MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+      MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+    }
+  }
+} // no error
+
+void missingWait1() { // Check missing wait for dead region.
+  double buf = 0;
+  MPI_Request sendReq1;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &sendReq1);
+} // expected-warning{{Request 'sendReq1' has no matching wait.}}
+
+void missingWait2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank == 0) {
+  } else {
+    MPI_Request sendReq1, recvReq1;
+
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1); // expected-warning{{Request 'sendReq1' has no matching wait.}}
+    MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+  }
+}
+
+void doubleNonblocking() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank == 1) {
+  } else {
+    MPI_Request sendReq1;
+
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &sendReq1); // expected-warning{{Double nonblocking on request 'sendReq1'.}}
+    MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+  }
+}
+
+void doubleNonblocking2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Request req;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req); // expected-warning{{Double nonblocking on request 'req'.}}
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+}
+
+void doubleNonblocking3() {
+  typedef struct { MPI_Request req; } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &rs.req);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &rs.req); // expected-warning{{Double nonblocking on request 'rs.req'.}}
+  MPI_Wait(&rs.req, MPI_STATUS_IGNORE);
+}
+
+void doubleNonblocking4() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Request req;
+  for (int i = 0; i < 2; ++i) {
+    MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req); // expected-warning{{Double nonblocking on request 'req'.}}
+  }
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+}
+
+void tripleNonblocking() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq);
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}}
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}}
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
+}
+
+void missingNonBlocking() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq1[10][10][10];
+  MPI_Wait(&sendReq1[1][7][9], MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq1[1][7][9]' has no matching nonblocking call.}}
+}
+
+void missingNonBlocking2() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req[2][2]; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req[0][1];
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req[0][1]' has no matching nonblocking call.}}
+}
+
+void missingNonBlocking3() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq;
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingMultiple() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq[4];
+  for (int i = 0; i < 4; ++i) {
+    MPI_Wait(&sendReq[i], MPI_STATUS_IGNORE); // expected-warning-re 1+{{Request {{.*}} has no matching nonblocking call.}}
+  }
+}
+
+void missingNonBlockingWaitall() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[1]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[3]);
+
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning{{Request 'req[2]' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingWaitall2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[3]);
+
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning-re 2{{Request '{{(.*)[[1-2]](.*)}}' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingWaitall3() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[2]);
+
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning-re 2{{Request '{{(.*)[[1,3]](.*)}}' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingWaitall4() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning-re 4{{Request '{{(.*)[[0-3]](.*)}}' has no matching nonblocking call.}}
+}
+
+void noDoubleRequestUsage() {
+  typedef struct {
+    MPI_Request req;
+    MPI_Request req2;
+  } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req2);
+  MPI_Wait(&rs.req, MPI_STATUS_IGNORE);
+  MPI_Wait(&rs.req2, MPI_STATUS_IGNORE);
+} // no error
+
+void noDoubleRequestUsage2() {
+  typedef struct {
+    MPI_Request req[2];
+    MPI_Request req2;
+  } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[1]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req2);
+  MPI_Wait(&rs.req[0], MPI_STATUS_IGNORE);
+  MPI_Wait(&rs.req[1], MPI_STATUS_IGNORE);
+  MPI_Wait(&rs.req2, MPI_STATUS_IGNORE);
+} // no error
+
+void nestedRequest() {
+  typedef struct {
+    MPI_Request req[2];
+    MPI_Request req2;
+  } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[1]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req2);
+  MPI_Waitall(2, rs.req, MPI_STATUSES_IGNORE);
+  MPI_Wait(&rs.req2, MPI_STATUS_IGNORE);
+} // no error
+
+void singleRequestInWaitall() {
+  MPI_Request r;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &r);
+  MPI_Waitall(1, &r, MPI_STATUSES_IGNORE);
+} // no error
+
+void multiRequestUsage() {
+  double buf = 0;
+  MPI_Request req;
+
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+} // no error
+
+void multiRequestUsage2() {
+  double buf = 0;
+  MPI_Request req;
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+} // no error
+
+// wrapper function
+void callNonblocking(MPI_Request *req) {
+  double buf = 0;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+             req);
+}
+
+// wrapper function
+void callWait(MPI_Request *req) {
+  MPI_Wait(req, MPI_STATUS_IGNORE);
+}
+
+// Call nonblocking, wait wrapper functions.
+void callWrapperFunctions() {
+  MPI_Request req;
+  callNonblocking(&req);
+  callWait(&req);
+} // no error
+
+void externFunctions1() {
+  double buf = 0;
+  MPI_Request req;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &req);
+  void callWaitExtern(MPI_Request *req);
+  callWaitExtern(&req);
+} // expected-warning{{Request 'req' has no matching wait.}}
+
+void externFunctions2() {
+  MPI_Request req;
+  void callNonblockingExtern(MPI_Request *req);
+  callNonblockingExtern(&req);
+}
diff --git a/test/Analysis/mpicheckernotes.cpp b/test/Analysis/mpicheckernotes.cpp
new file mode 100644
index 0000000..be312fd
--- /dev/null
+++ b/test/Analysis/mpicheckernotes.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=optin.mpi.MPI-Checker -analyzer-output=text -verify %s
+
+// MPI-Checker test file to test note diagnostics.
+
+#include "MPIMock.h"
+
+void doubleNonblocking() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-note{{Request is previously used by nonblocking call here.}}
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}} expected-note{{Double nonblocking on request 'sendReq'.}}
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
+}
+
+void missingWait() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &sendReq); // expected-note{{Request is previously used by nonblocking call here.}}
+} // expected-warning{{Request 'sendReq' has no matching wait.}} expected-note{{Request 'sendReq' has no matching wait.}}
+
+// If more than 2 nonblocking calls are using a request in a sequence, they all
+// point to the first call as the 'previous' call. This is because the
+// BugReporterVisitor only checks for differences in state or existence of an
+// entity.
+void tripleNonblocking() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-note 2{{Request is previously used by nonblocking call here.}}
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}} expected-note{{Double nonblocking on request 'sendReq'.}}
+
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}} expected-note{{Double nonblocking on request 'sendReq'.}}
+
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
+}
diff --git a/test/Analysis/ns_error_enum.m b/test/Analysis/ns_error_enum.m
index 79e3c25..bf61629 100644
--- a/test/Analysis/ns_error_enum.m
+++ b/test/Analysis/ns_error_enum.m
@@ -26,7 +26,7 @@
 struct __attribute__((ns_error_domain(MyErrorDomain))) MyStructErrorDomain {};
 
 typedef NS_ERROR_ENUM(unsigned char, MyErrorEnumInvalid, InvalidDomain) {
-	// expected-error@-1{{domain argument 'InvalidDomain' not valid top-level declaration}}
+	// expected-error@-1{{domain argument 'InvalidDomain' does not refer to global constant}}
 	MyErrFirstInvalid,
 	MyErrSecondInvalid,
 };
@@ -35,8 +35,8 @@
   // expected-error@-1{{domain argument must be an identifier}}
 
 int __attribute__((ns_error_domain(MyErrorDomain))) NotTagDecl;
-  // expected-error@-1{{ns_error_domain attribute only valid on enum/struct/union/class}}
+  // expected-error@-1{{ns_error_domain attribute only valid on enums, structs, and unions}}
 
 void foo() {}
 typedef NS_ERROR_ENUM(unsigned char, MyErrorEnumInvalidFunction, foo);
-  // expected-error@-1{{domain argument 'foo' not valid top-level declaration}}
+  // expected-error@-1{{domain argument 'foo' does not refer to global constant}}
diff --git a/test/Analysis/padding_message.cpp b/test/Analysis/padding_message.cpp
index f73a11a..bbfa453 100644
--- a/test/Analysis/padding_message.cpp
+++ b/test/Analysis/padding_message.cpp
@@ -1,13 +1,25 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c++14 -analyze -analyzer-checker=optin.performance -analyzer-config optin.performance.Padding:AllowedPad=2 -verify %s
 
-// expected-warning@+1{{Excessive padding in 'struct IntSandwich' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct IntSandwich' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
 struct IntSandwich {
   char c1;
   int i;
   char c2;
 };
 
-// expected-warning@+1{{Excessive padding in 'struct TurDuckHen' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct TurDuckHen' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
 struct TurDuckHen {
   char c1;
   struct IntSandwich i;
@@ -16,7 +28,17 @@
 
 #pragma pack(push)
 #pragma pack(2)
-// expected-warning@+1{{Excessive padding in 'struct SmallIntSandwich' (4 padding bytes, where 0 is optimal)}}
+// expected-warning@+11{{\
+Excessive padding in 'struct SmallIntSandwich' (4 padding bytes, where 0 is optimal). \
+Optimal fields order: \
+i1, \
+i2, \
+i3, \
+c1, \
+c2, \
+c3, \
+c4, \
+}}
 struct SmallIntSandwich {
   char c1;
   int i1;
@@ -34,7 +56,13 @@
   int i;
 };
 
-// expected-warning@+1{{Excessive padding in 'struct HoldsAUnion' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct HoldsAUnion' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+u, \
+c1, \
+c2, \
+}}
 struct HoldsAUnion {
   char c1;
   union SomeUnion u;
@@ -49,28 +77,53 @@
   int i[5];
 };
 
-// expected-warning@+1{{Excessive padding in 'struct StructSandwich' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct StructSandwich' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+m, \
+s, \
+s2, \
+}}
 struct StructSandwich {
   struct SmallCharArray s;
   struct MediumIntArray m;
   struct SmallCharArray s2;
 };
 
-// expected-warning@+1{{Excessive padding in 'TypedefSandwich' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'TypedefSandwich' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
 typedef struct {
   char c1;
   int i;
   char c2;
 } TypedefSandwich;
 
-// expected-warning@+1{{Excessive padding in 'struct StructAttrAlign' (10 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct StructAttrAlign' (10 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
 struct StructAttrAlign {
   char c1;
   int i;
   char c2;
 } __attribute__((aligned(8)));
 
-// expected-warning@+1{{Excessive padding in 'struct OverlyAlignedChar' (8185 padding bytes, where 4089 is optimal)}}
+// expected-warning@+8{{\
+Excessive padding in 'struct OverlyAlignedChar' (8185 padding bytes, where 4089 is optimal). \
+Optimal fields order: \
+c, \
+c1, \
+c2, \
+x, \
+}}
 struct OverlyAlignedChar {
   char c1;
   int x;
@@ -78,7 +131,13 @@
   char c __attribute__((aligned(4096)));
 };
 
-// expected-warning@+1{{Excessive padding in 'struct HoldsOverlyAlignedChar' (8190 padding bytes, where 4094 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct HoldsOverlyAlignedChar' (8190 padding bytes, where 4094 is optimal). \
+Optimal fields order: \
+o, \
+c1, \
+c2, \
+}}
 struct HoldsOverlyAlignedChar {
   char c1;
   struct OverlyAlignedChar o;
@@ -86,7 +145,13 @@
 };
 
 void internalStructFunc() {
-  // expected-warning@+1{{Excessive padding in 'struct X' (6 padding bytes, where 2 is optimal)}}
+  // expected-warning@+7{{\
+Excessive padding in 'struct X' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+t, \
+c1, \
+c2, \
+}}
   struct X {
     char c1;
     int t;
@@ -96,7 +161,13 @@
 }
 
 void typedefStructFunc() {
-  // expected-warning@+1{{Excessive padding in 'S' (6 padding bytes, where 2 is optimal)}}
+  // expected-warning@+7{{\
+Excessive padding in 'S' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+t, \
+c1, \
+c2, \
+}}
   typedef struct {
     char c1;
     int t;
@@ -105,21 +176,39 @@
   S obj;
 }
 
-// expected-warning@+1{{Excessive padding in 'struct DefaultAttrAlign' (22 padding bytes, where 6 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct DefaultAttrAlign' (22 padding bytes, where 6 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
 struct DefaultAttrAlign {
   char c1;
   long long i;
   char c2;
 } __attribute__((aligned));
 
-// expected-warning@+1{{Excessive padding in 'struct SmallArrayShortSandwich' (2 padding bytes, where 0 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct SmallArrayShortSandwich' (2 padding bytes, where 0 is optimal). \
+Optimal fields order: \
+s, \
+c1, \
+c2, \
+}}
 struct SmallArrayShortSandwich {
   char c1;
   short s;
   char c2;
 } ShortArray[20];
 
-// expected-warning@+1{{Excessive padding in 'struct SmallArrayInFunc' (2 padding bytes, where 0 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'struct SmallArrayInFunc' (2 padding bytes, where 0 is optimal). \
+Optimal fields order: \
+s, \
+c1, \
+c2, \
+}}
 struct SmallArrayInFunc {
   char c1;
   short s;
@@ -130,7 +219,13 @@
   struct SmallArrayInFunc Arr[15];
 }
 
-// expected-warning@+1{{Excessive padding in 'class VirtualIntSandwich' (10 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'class VirtualIntSandwich' (10 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
 class VirtualIntSandwich {
   virtual void foo() {}
   char c1;
@@ -139,7 +234,14 @@
 };
 
 // constructed so as not to have tail padding
-// expected-warning@+1{{Excessive padding in 'class InnerPaddedB' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+8{{\
+Excessive padding in 'class InnerPaddedB' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i1, \
+i2, \
+c1, \
+c2, \
+}}
 class InnerPaddedB {
   char c1;
   int i1;
@@ -149,17 +251,35 @@
 
 class Empty {}; // no-warning
 
-// expected-warning@+1{{Excessive padding in 'class LotsOfSpace' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'class LotsOfSpace' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+e1, \
+e2, \
+}}
 class LotsOfSpace {
   Empty e1;
   int i;
   Empty e2;
 };
 
-// expected-warning@+1{{Excessive padding in 'TypedefSandwich2' (6 padding bytes, where 2 is optimal)}}
+// expected-warning@+7{{\
+Excessive padding in 'TypedefSandwich2' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+t, \
+c1, \
+c2, \
+}}
 typedef struct {
   char c1;
-  // expected-warning@+1{{Excessive padding in 'TypedefSandwich2::NestedTypedef' (6 padding bytes, where 2 is optimal)}}
+  // expected-warning@+7{{\
+Excessive padding in 'TypedefSandwich2::NestedTypedef' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+i, \
+c1, \
+c2, \
+}}
   typedef struct {
     char c1;
     int i;
@@ -171,7 +291,13 @@
 
 template <typename T>
 struct Foo {
-  // expected-warning@+1{{Excessive padding in 'struct Foo<int>::Nested' (6 padding bytes, where 2 is optimal)}}
+  // expected-warning@+7{{\
+Excessive padding in 'struct Foo<int>::Nested' (6 padding bytes, where 2 is optimal). \
+Optimal fields order: \
+t, \
+c1, \
+c2, \
+}}
   struct Nested {
     char c1;
     T t;
diff --git a/test/Analysis/retain-release.m b/test/Analysis/retain-release.m
index f0d91e3..3acf86c 100644
--- a/test/Analysis/retain-release.m
+++ b/test/Analysis/retain-release.m
@@ -1247,6 +1247,26 @@
               pixelBufferAttributes, pixelBufferOut) ;
 }
 
+#pragma clang arc_cf_code_audited begin
+typedef struct SomeOpaqueStruct *CMSampleBufferRef;
+CVImageBufferRef _Nonnull CMSampleBufferGetImageBuffer(CMSampleBufferRef _Nonnull sbuf);
+#pragma clang arc_cf_code_audited end
+
+CVBufferRef _Nullable CVBufferRetain(CVBufferRef _Nullable buffer);
+void CVBufferRelease(CF_CONSUMED CVBufferRef _Nullable buffer);
+
+void testCVPrefixRetain(CMSampleBufferRef sbuf) {
+  // Make sure RetainCountChecker treats CVFooRetain() as a CF-style retain.
+  CVPixelBufferRef pixelBuf = CMSampleBufferGetImageBuffer(sbuf);
+  CVBufferRetain(pixelBuf);
+  CVBufferRelease(pixelBuf); // no-warning
+
+
+  // Make sure result of CVFooRetain() is the same as its argument.
+  CVPixelBufferRef pixelBufAlias = CVBufferRetain(pixelBuf);
+  CVBufferRelease(pixelBufAlias); // no-warning
+}
+
 //===----------------------------------------------------------------------===//
 // <rdar://problem/7358899> False leak associated with 
 //  CGBitmapContextCreateWithData
diff --git a/test/Analysis/self-assign.cpp b/test/Analysis/self-assign.cpp
new file mode 100644
index 0000000..74fb0fe
--- /dev/null
+++ b/test/Analysis/self-assign.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -std=c++11 -analyze -analyzer-checker=core,cplusplus,unix.Malloc,debug.ExprInspection %s -verify -analyzer-output=text
+
+extern "C" char *strdup(const char* s);
+extern "C" void free(void* ptr);
+
+namespace std {
+template<class T> struct remove_reference      { typedef T type; };
+template<class T> struct remove_reference<T&>  { typedef T type; };
+template<class T> struct remove_reference<T&&> { typedef T type; };
+template<class T> typename remove_reference<T>::type&& move(T&& t);
+}
+
+void clang_analyzer_eval(int);
+
+class StringUsed {
+public:
+  StringUsed(const char *s = "") : str(strdup(s)) {}
+  StringUsed(const StringUsed &rhs) : str(strdup(rhs.str)) {}
+  ~StringUsed();
+  StringUsed& operator=(const StringUsed &rhs);
+  StringUsed& operator=(StringUsed &&rhs);
+  operator const char*() const;
+private:
+  char *str;
+};
+
+StringUsed::~StringUsed() {
+  free(str);
+}
+
+StringUsed& StringUsed::operator=(const StringUsed &rhs) { // expected-note{{Assuming rhs == *this}} expected-note{{Assuming rhs == *this}} expected-note{{Assuming rhs != *this}}
+  clang_analyzer_eval(*this == rhs); // expected-warning{{TRUE}} expected-warning{{UNKNOWN}} expected-note{{TRUE}} expected-note{{UNKNOWN}}
+  free(str); // expected-note{{Memory is released}}
+  str = strdup(rhs.str); // expected-warning{{Use of memory after it is freed}}  expected-note{{Use of memory after it is freed}}
+  return *this;
+}
+
+StringUsed& StringUsed::operator=(StringUsed &&rhs) { // expected-note{{Assuming rhs == *this}} expected-note{{Assuming rhs != *this}}
+  clang_analyzer_eval(*this == rhs); // expected-warning{{TRUE}} expected-warning{{UNKNOWN}} expected-note{{TRUE}} expected-note{{UNKNOWN}}
+  str = rhs.str;
+  rhs.str = nullptr; // FIXME: An improved leak checker should warn here
+  return *this;
+}
+
+StringUsed::operator const char*() const {
+  return str;
+}
+
+class StringUnused {
+public:
+  StringUnused(const char *s = "") : str(strdup(s)) {}
+  StringUnused(const StringUnused &rhs) : str(strdup(rhs.str)) {}
+  ~StringUnused();
+  StringUnused& operator=(const StringUnused &rhs);
+  StringUnused& operator=(StringUnused &&rhs);
+  operator const char*() const;
+private:
+  char *str;
+};
+
+StringUnused::~StringUnused() {
+  free(str);
+}
+
+StringUnused& StringUnused::operator=(const StringUnused &rhs) { // expected-note{{Assuming rhs == *this}} expected-note{{Assuming rhs == *this}} expected-note{{Assuming rhs != *this}}
+  clang_analyzer_eval(*this == rhs); // expected-warning{{TRUE}} expected-warning{{UNKNOWN}} expected-note{{TRUE}} expected-note{{UNKNOWN}}
+  free(str); // expected-note{{Memory is released}}
+  str = strdup(rhs.str); // expected-warning{{Use of memory after it is freed}}  expected-note{{Use of memory after it is freed}}
+  return *this;
+}
+
+StringUnused& StringUnused::operator=(StringUnused &&rhs) { // expected-note{{Assuming rhs == *this}} expected-note{{Assuming rhs != *this}}
+  clang_analyzer_eval(*this == rhs); // expected-warning{{TRUE}} expected-warning{{UNKNOWN}} expected-note{{TRUE}} expected-note{{UNKNOWN}}
+  str = rhs.str;
+  rhs.str = nullptr; // FIXME: An improved leak checker should warn here
+  return *this;
+}
+
+StringUnused::operator const char*() const {
+  return str;
+}
+
+
+int main() {
+  StringUsed s1 ("test"), s2;
+  s2 = s1;
+  s2 = std::move(s1);
+  return 0;
+}
diff --git a/test/Analysis/stackaddrleak.c b/test/Analysis/stackaddrleak.c
index 21a15d7..717f309 100644
--- a/test/Analysis/stackaddrleak.c
+++ b/test/Analysis/stackaddrleak.c
@@ -19,7 +19,7 @@
   p = (const char *) __builtin_alloca(12);
 } // expected-warning{{Address of stack memory allocated by call to alloca() on line 19 is still referred to by the global variable 'p' upon returning to the caller.  This will be a dangling reference}}
 
-// PR 7383 - previosly the stack address checker would crash on this example
+// PR 7383 - previously the stack address checker would crash on this example
 //  because it would attempt to do a direct load from 'pr7383_list'. 
 static int pr7383(__const char *__)
 {
@@ -33,7 +33,7 @@
   int x;
   a = &x;
   b = &x;
-} // expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the global variable 'a' upon returning}} expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the global variable 'b' upon returning}}
+} // expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the static variable 'a' upon returning}} expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the static variable 'b' upon returning}}
 
 intptr_t returnAsNonLoc() {
   int x;
diff --git a/test/Analysis/string.c b/test/Analysis/string.c
index c65d2be..2803362 100644
--- a/test/Analysis/string.c
+++ b/test/Analysis/string.c
@@ -680,6 +680,18 @@
 #define strcmp BUILTIN(strcmp)
 int strcmp(const char * s1, const char * s2);
 
+void strcmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strcmp(x, y) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcmp(x, y) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strcmp(y, x) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(y, x) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcmp(y, x) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strcmp_constant0() {
   clang_analyzer_eval(strcmp("123", "123") == 0); // expected-warning{{TRUE}}
 }
@@ -703,13 +715,13 @@
 void strcmp_1() {
   char *x = "234";
   char *y = "123";
-  clang_analyzer_eval(strcmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_2() {
   char *x = "123";
   char *y = "234";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_null_0() {
@@ -727,25 +739,25 @@
 void strcmp_diff_length_0() {
   char *x = "12345";
   char *y = "234";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_diff_length_1() {
   char *x = "123";
   char *y = "23456";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_diff_length_2() {
   char *x = "12345";
   char *y = "123";
-  clang_analyzer_eval(strcmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_diff_length_3() {
   char *x = "123";
   char *y = "12345";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_embedded_null () {
@@ -777,6 +789,18 @@
 #define strncmp BUILTIN(strncmp)
 int strncmp(const char *s1, const char *s2, size_t n);
 
+void strncmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strncmp(x, y, 2) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 2) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncmp(x, y, 2) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strncmp(y, x, 2) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(y, x, 2) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncmp(y, x, 2) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strncmp_constant0() {
   clang_analyzer_eval(strncmp("123", "123", 3) == 0); // expected-warning{{TRUE}}
 }
@@ -800,13 +824,13 @@
 void strncmp_1() {
   char *x = "234";
   char *y = "123";
-  clang_analyzer_eval(strncmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_2() {
   char *x = "123";
   char *y = "234";
-  clang_analyzer_eval(strncmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_null_0() {
@@ -824,25 +848,25 @@
 void strncmp_diff_length_0() {
   char *x = "12345";
   char *y = "234";
-  clang_analyzer_eval(strncmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_1() {
   char *x = "123";
   char *y = "23456";
-  clang_analyzer_eval(strncmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_2() {
   char *x = "12345";
   char *y = "123";
-  clang_analyzer_eval(strncmp(x, y, 5) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) > 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_3() {
   char *x = "123";
   char *y = "12345";
-  clang_analyzer_eval(strncmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_4() {
@@ -854,13 +878,13 @@
 void strncmp_diff_length_5() {
   char *x = "012";
   char *y = "12345";
-  clang_analyzer_eval(strncmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_6() {
   char *x = "234";
   char *y = "12345";
-  clang_analyzer_eval(strncmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_embedded_null () {
@@ -874,6 +898,18 @@
 #define strcasecmp BUILTIN(strcasecmp)
 int strcasecmp(const char *s1, const char *s2);
 
+void strcasecmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strcasecmp(x, y) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcasecmp(x, y) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strcasecmp(y, x) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(y, x) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcasecmp(y, x) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strcasecmp_constant0() {
   clang_analyzer_eval(strcasecmp("abc", "Abc") == 0); // expected-warning{{TRUE}}
 }
@@ -897,13 +933,13 @@
 void strcasecmp_1() {
   char *x = "Bcd";
   char *y = "abc";
-  clang_analyzer_eval(strcasecmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_2() {
   char *x = "abc";
   char *y = "Bcd";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_null_0() {
@@ -921,25 +957,25 @@
 void strcasecmp_diff_length_0() {
   char *x = "abcde";
   char *y = "aBd";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_diff_length_1() {
   char *x = "abc";
   char *y = "aBdef";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_diff_length_2() {
   char *x = "aBcDe";
   char *y = "abc";
-  clang_analyzer_eval(strcasecmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_diff_length_3() {
   char *x = "aBc";
   char *y = "abcde";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_embedded_null () {
@@ -953,6 +989,18 @@
 #define strncasecmp BUILTIN(strncasecmp)
 int strncasecmp(const char *s1, const char *s2, size_t n);
 
+void strncasecmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strncasecmp(x, y, 2) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 2) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncasecmp(x, y, 2) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strncasecmp(y, x, 2) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(y, x, 2) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncasecmp(y, x, 2) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strncasecmp_constant0() {
   clang_analyzer_eval(strncasecmp("abc", "Abc", 3) == 0); // expected-warning{{TRUE}}
 }
@@ -976,13 +1024,13 @@
 void strncasecmp_1() {
   char *x = "Bcd";
   char *y = "abc";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_2() {
   char *x = "abc";
   char *y = "Bcd";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_null_0() {
@@ -1000,25 +1048,25 @@
 void strncasecmp_diff_length_0() {
   char *x = "abcde";
   char *y = "aBd";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_1() {
   char *x = "abc";
   char *y = "aBdef";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_2() {
   char *x = "aBcDe";
   char *y = "abc";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) > 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_3() {
   char *x = "aBc";
   char *y = "abcde";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_4() {
@@ -1030,13 +1078,13 @@
 void strncasecmp_diff_length_5() {
   char *x = "abcde";
   char *y = "aBd";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_6() {
   char *x = "aBDe";
   char *y = "abc";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_embedded_null () {
diff --git a/test/Analysis/temp-obj-dtors-cfg-output.cpp b/test/Analysis/temp-obj-dtors-cfg-output.cpp
index dc10e87..b425d91 100644
--- a/test/Analysis/temp-obj-dtors-cfg-output.cpp
+++ b/test/Analysis/temp-obj-dtors-cfg-output.cpp
@@ -1077,7 +1077,7 @@
 // CHECK:    14: a([B1.13]) (Member initializer)
 // CHECK:    15: ~B() (Temporary object destructor)
 // CHECK:    16: ~A() (Temporary object destructor)
-// CHECK:    17: /*implicit*/int()
+// CHECK:    17: /*implicit*/(int)0
 // CHECK:    18: b([B1.17]) (Member initializer)
 // CHECK:     Preds (1): B2
 // CHECK:     Succs (1): B0
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f1a5838..f4be0ad 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -26,10 +26,16 @@
 
 list(APPEND CLANG_TEST_DEPS
   clang clang-headers
-  clang-check clang-format
+  clang-format
   c-index-test diagtool
   clang-tblgen
   )
+
+if(CLANG_ENABLE_STATIC_ANALYZER)
+  list(APPEND CLANG_TEST_DEPS
+    clang-check
+    )
+endif()
 
 if (CLANG_ENABLE_ARCMT)
   list(APPEND CLANG_TEST_DEPS
@@ -40,6 +46,7 @@
 
 if (ENABLE_CLANG_EXAMPLES)
   list(APPEND CLANG_TEST_DEPS
+    AnnotateFunctions
     clang-interpreter
     PrintFunctionNames
     )
@@ -61,17 +68,21 @@
     FileCheck count not
     llc
     llvm-bcanalyzer
-    llvm-lto
+    llvm-nm
     llvm-objdump
     llvm-profdata
     llvm-readobj
     llvm-symbolizer
-    LTO
     opt
     )
+
+  if(TARGET llvm-lto)
+    list(APPEND CLANG_TEST_DEPS llvm-lto)
+  endif()
 endif()
 
 add_custom_target(clang-test-depends DEPENDS ${CLANG_TEST_DEPS})
+set_target_properties(clang-test-depends PROPERTIES FOLDER "Clang tests")
 
 add_lit_testsuite(check-clang "Running the Clang regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
@@ -82,6 +93,11 @@
   )
 set_target_properties(check-clang PROPERTIES FOLDER "Clang tests")
 
+add_lit_testsuites(CLANG ${CMAKE_CURRENT_SOURCE_DIR}
+  PARAMS ${CLANG_TEST_PARAMS}
+  DEPENDS ${CLANG_TEST_DEPS}
+)
+
 # Add a legacy target spelling: clang-test
 add_custom_target(clang-test)
 add_dependencies(clang-test check-clang)
diff --git a/test/CXX/basic/basic.def/p2.cpp b/test/CXX/basic/basic.def/p2.cpp
new file mode 100644
index 0000000..598a79a
--- /dev/null
+++ b/test/CXX/basic/basic.def/p2.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -Wdeprecated
+
+namespace {
+  struct A {
+    static constexpr int n = 0;
+  };
+  const int A::n; // expected-warning {{deprecated}}
+}
diff --git a/test/CXX/basic/basic.def/p4.cpp b/test/CXX/basic/basic.def/p4.cpp
new file mode 100644
index 0000000..c391915
--- /dev/null
+++ b/test/CXX/basic/basic.def/p4.cpp
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+inline int f(); // expected-warning {{inline function 'f' is not defined}}
+extern inline int n; // expected-error {{inline variable 'n' is not defined}}
+
+int use = f() + n; // expected-note 2{{used here}}
diff --git a/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp b/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp
index c207283..bb6bb73 100644
--- a/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp
+++ b/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify -std=c++11 %s
 
 // C++98 [basic.lookup.classref]p1:
 //   In a class member access expression (5.2.5), if the . or -> token is
@@ -21,10 +23,16 @@
 
 // From PR 7247
 template<typename T>
-struct set{};  // expected-note{{lookup from the current scope refers here}}
+struct set{};
+#if __cplusplus <= 199711L
+// expected-note@-2 {{lookup from the current scope refers here}}
+#endif
 struct Value {
   template<typename T>
-  void set(T value) {}  // expected-note{{lookup in the object type 'Value' refers here}}
+  void set(T value) {}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{lookup in the object type 'Value' refers here}}
+#endif
 
   void resolves_to_same() {
     Value v;
@@ -36,7 +44,10 @@
     Value v;
     // The fact that the next line is a warning rather than an error is an
     // extension.
-    v.set<double>(3.2);  // expected-warning{{lookup of 'set' in member access expression is ambiguous; using member of 'Value'}}
+    v.set<double>(3.2);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{lookup of 'set' in member access expression is ambiguous; using member of 'Value'}}
+#endif
   }
   {
     int set;  // Non-template.
diff --git a/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp b/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp
index d1562d4..f32b239 100644
--- a/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp
+++ b/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp
@@ -53,16 +53,17 @@
   int n = b.T(); // expected-error {{'T' is a protected member of 'InhCtor::A'}}
                  // expected-note@-15 {{declared protected here}}
 
+  // FIXME: EDG and GCC reject this too, but it's not clear why it would be
+  // ill-formed.
   template<typename T>
   struct S : T {
-    struct U : S {
+    struct U : S { // expected-note 6{{candidate}}
       using S::S;
     };
     using T::T;
   };
-
-  S<A>::U ua(0);
-  S<B>::U ub(0);
+  S<A>::U ua(0); // expected-error {{no match}}
+  S<B>::U ub(0); // expected-error {{no match}}
 
   template<typename T>
   struct X : T {
diff --git a/test/CXX/basic/basic.types/p10.cpp b/test/CXX/basic/basic.types/p10.cpp
index 19258f8..31ef6b6 100644
--- a/test/CXX/basic/basic.types/p10.cpp
+++ b/test/CXX/basic/basic.types/p10.cpp
@@ -141,3 +141,45 @@
 }
 constexpr long Overflow[ // expected-error {{constexpr variable cannot have non-literal type 'long const[(1 << 30) << 2]'}}
     (1 << 30) << 2]{};   // expected-warning {{requires 34 bits to represent}}
+
+namespace inherited_ctor {
+  struct A { constexpr A(int); };
+  struct B : A {
+    B();
+    using A::A;
+  };
+  constexpr int f(B) { return 0; } // ok
+
+  struct C { constexpr C(int); };
+  struct D : C { // expected-note {{because}}
+    D(int);
+    using C::C;
+  };
+  constexpr int f(D) { return 0; } // expected-error {{not a literal type}}
+
+  // This one is a bit odd: F inherits E's default constructor, which is
+  // constexpr. Because F has a constructor of its own, it doesn't declare a
+  // default constructor hiding E's one.
+  struct E {};
+  struct F : E {
+    F(int);
+    using E::E;
+  };
+  constexpr int f(F) { return 0; }
+
+  // FIXME: Is this really the right behavior? We presumably should be checking
+  // whether the inherited constructor would be a copy or move constructor for
+  // the derived class, not for the base class.
+  struct G { constexpr G(const G&); };
+  struct H : G { // expected-note {{because}}
+    using G::G;
+  };
+  constexpr int f(H) { return 0; } // expected-error {{not a literal type}}
+
+  struct J;
+  struct I { constexpr I(const J&); };
+  struct J : I {
+    using I::I;
+  };
+  constexpr int f(J) { return 0; }
+}
diff --git a/test/CXX/class.access/class.access.dcl/p1.cpp b/test/CXX/class.access/class.access.dcl/p1.cpp
index aab5fff..118ab9e 100644
--- a/test/CXX/class.access/class.access.dcl/p1.cpp
+++ b/test/CXX/class.access/class.access.dcl/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // This is just the test for [namespace.udecl]p4 with 'using'
 // uniformly stripped out.
@@ -24,10 +26,33 @@
   }
 
   class Test0 {
-    NonClass::type; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    NonClass::hiding; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    NonClass::union_member; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    NonClass::enumerator; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
+    NonClass::type; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    NonClass::hiding; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    NonClass::union_member; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    NonClass::enumerator; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
   };
 }
 
@@ -43,11 +68,39 @@
   };
 
   struct B : A {
-    A::type; // expected-warning {{access declarations are deprecated}}
-    A::hiding; // expected-warning {{access declarations are deprecated}}
-    A::union_member; // expected-warning {{access declarations are deprecated}}
-    A::enumerator; // expected-warning {{access declarations are deprecated}}
-    A::tagname; // expected-warning {{access declarations are deprecated}}
+    A::type;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+    A::hiding;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::union_member;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::enumerator;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::tagname;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
     void test0() {
       type t = 0;
@@ -86,11 +139,40 @@
   };
 
   template <class T> struct B : A {
-    A::type; // expected-warning {{access declarations are deprecated}}
-    A::hiding; // expected-warning {{access declarations are deprecated}}
-    A::union_member; // expected-warning {{access declarations are deprecated}}
-    A::enumerator; // expected-warning {{access declarations are deprecated}}
-    A::tagname; // expected-warning {{access declarations are deprecated}}
+    A::type;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::hiding;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::union_member;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::enumerator;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::tagname;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
     void test0() {
       type t = 0;
@@ -131,11 +213,40 @@
   };
 
   template <class T> struct B : A<T> {
-    A<T>::type; // expected-error {{dependent using declaration resolved to type without 'typename'}} // expected-warning {{access declarations are deprecated}}
-    A<T>::hiding; // expected-warning {{access declarations are deprecated}}
-    A<T>::union_member; // expected-warning {{access declarations are deprecated}}
-    A<T>::enumerator; // expected-warning {{access declarations are deprecated}}
-    A<T>::tagname; // expected-error {{dependent using declaration resolved to type without 'typename'}} // expected-warning {{access declarations are deprecated}}
+    A<T>::type; // expected-error {{dependent using declaration resolved to type without 'typename'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::hiding;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::union_member;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::enumerator;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::tagname; // expected-error {{dependent using declaration resolved to type without 'typename'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
     // FIXME: re-enable these when the various bugs involving tags are fixed
 #if 0
@@ -186,14 +297,54 @@
 
   // We should be able to diagnose these without instantiation.
   template <class T> struct C : Base {
-    InnerNS::foo; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    Base::bar; // expected-error {{no member named 'bar'}} expected-warning {{access declarations are deprecated}}
-    Unrelated::foo; // expected-error {{not a base class}} expected-warning {{access declarations are deprecated}}
-    C::foo; // legal in C++03 // expected-warning {{access declarations are deprecated}}
-    Subclass::foo; // legal in C++03 // expected-warning {{access declarations are deprecated}}
+    InnerNS::foo; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
-    int bar(); //expected-note {{target of using declaration}}
-    C::bar; // expected-error {{refers to its own class}} expected-warning {{access declarations are deprecated}}
+    Base::bar; // expected-error {{no member named 'bar'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    Unrelated::foo; // expected-error {{not a base class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    C::foo; // legal in C++03
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+    // expected-error@-5 {{using declaration refers to its own class}}
+#endif
+
+    Subclass::foo; // legal in C++03
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+    // expected-error@-5 {{using declaration refers into 'Subclass::', which is not a base class of 'C'}}
+#endif
+
+    int bar();
+#if __cplusplus <= 199711L
+    //expected-note@-2 {{target of using declaration}}
+#endif
+    C::bar;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+    // expected-error@-6 {{using declaration refers to its own class}}
   };
 }
 
diff --git a/test/CXX/class/class.friend/p1.cpp b/test/CXX/class/class.friend/p1.cpp
index b83dfa3..037fc3d 100644
--- a/test/CXX/class/class.friend/p1.cpp
+++ b/test/CXX/class/class.friend/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 struct Outer {
   struct Inner {
@@ -41,7 +43,10 @@
   UndeclaredSoFar x; // expected-error {{unknown type name 'UndeclaredSoFar'}}
 
   void a_member();
-  friend void A::a_member(); // expected-error {{friends cannot be members of the declaring class}}
+  friend void A::a_member();
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{friends cannot be members of the declaring class}}
+#endif
   friend void a_member(); // okay (because we ignore class scopes when looking up friends)
   friend class A::AInner; // this is okay as an extension
   friend class AInner; // okay, refers to ::AInner
diff --git a/test/CXX/class/class.friend/p2.cpp b/test/CXX/class/class.friend/p2.cpp
index fb3cd19..e4a46b3 100644
--- a/test/CXX/class/class.friend/p2.cpp
+++ b/test/CXX/class/class.friend/p2.cpp
@@ -1,10 +1,18 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 struct B0;
 
 class A {
   friend class B {}; // expected-error {{cannot define a type in a friend declaration}}
-  friend int; // expected-warning {{non-class friend type 'int' is a C++11 extension}}
-  friend B0; // expected-warning {{specify 'struct' to befriend 'B0'}}
+  friend int;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{non-class friend type 'int' is a C++11 extension}}
+#endif
+  friend B0;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{unelaborated friend declaration is a C++11 extension; specify 'struct' to befriend 'B0'}}
+#endif
   friend class C; // okay
 };
diff --git a/test/CXX/class/class.static/class.static.data/p2.cpp b/test/CXX/class/class.static/class.static.data/p2.cpp
new file mode 100644
index 0000000..8c38276
--- /dev/null
+++ b/test/CXX/class/class.static/class.static.data/p2.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+struct X {
+  static struct A a;
+  static inline struct B b; // expected-error {{incomplete type}} expected-note {{forward decl}}
+  static inline struct C c = {}; // expected-error {{incomplete type}} expected-note {{forward decl}}
+};
diff --git a/test/CXX/class/class.static/class.static.data/p3.cpp b/test/CXX/class/class.static/class.static.data/p3.cpp
index 1607bac..413017d 100644
--- a/test/CXX/class/class.static/class.static.data/p3.cpp
+++ b/test/CXX/class/class.static/class.static.data/p3.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
 
 struct NonLit { // expected-note 3{{no constexpr constructors}}
   NonLit();
@@ -6,7 +7,7 @@
 
 struct S {
   static constexpr int a = 0;
-  static constexpr int b; // expected-error {{declaration of constexpr static data member 'b' requires an initializer}}
+  static constexpr int b; // expected-error {{initializ}} expected-note 0-1{{previous}}
 
   static constexpr int c = 0;
   static const int d;
@@ -16,19 +17,27 @@
   static const double f = 0.0; // expected-error {{requires 'constexpr' specifier}} expected-note {{add 'constexpr'}}
   static char *const g = 0; // expected-error {{requires 'constexpr' specifier}}
   static const NonLit h = NonLit(); // expected-error {{must be initialized out of line}}
+
+  static inline int i; // expected-note {{previous}} expected-warning 0-1{{extension}}
+  static inline int j; // expected-note {{previous}} expected-warning 0-1{{extension}}
+  static constexpr int k = 0;
 };
 
 constexpr int S::a;
-constexpr int S::b = 0;
+constexpr int S::b = 0; // expected-error 0-1{{redefinition}}
 
 const int S::c;
 constexpr int S::d = 0;
 constexpr int S::d2;
 
+int S::i; // expected-error {{redefinition}}
+int S::j; // expected-error {{redefinition}}
+const int S::k; // ok (deprecated)
+
 template<typename T>
 struct U {
   static constexpr int a = 0;
-  static constexpr int b; // expected-error {{declaration of constexpr static data member 'b' requires an initializer}}
+  static constexpr int b; // expected-error {{initializ}}
   static constexpr NonLit h = NonLit(); // expected-error {{cannot have non-literal type 'const NonLit'}}
   static constexpr T c = T(); // expected-error {{cannot have non-literal type}}
   static const T d;
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp
index ded6ed0..3baf238 100644
--- a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp
@@ -41,3 +41,20 @@
 void fpc(concept int i) {} // expected-error {{'concept' can only appear on the definition of a function template or variable template}}
 
 concept bool; // expected-error {{'concept' can only appear on the definition of a function template or variable template}}
+
+template <typename T> concept bool VCEI{ true };
+template concept bool VCEI<int>; // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+extern template concept bool VCEI<int>; // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+
+template <typename T> concept bool VCPS{ true };
+template <typename T> concept bool VCPS<T *>{ true }; // expected-error {{'concept' cannot be applied on an partial specialization}}
+
+template <typename T> concept bool VCES{ true };
+template <> concept bool VCES<int>{ true }; // expected-error {{'concept' cannot be applied on an explicit specialization}}
+
+template <typename T> concept bool FCEI() { return true; }
+template concept bool FCEI<int>(); // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+extern template concept bool FCEI<int>(); // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+
+template <typename T> concept bool FCES() { return true; }
+template <> concept bool FCES<bool>() { return true; } // expected-error {{'concept' cannot be applied on an explicit specialization}}
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp
index 38593bc..69672ca 100644
--- a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp
@@ -11,3 +11,15 @@
 
 template<typename T>
 concept bool fcpva(...) { return true; } // expected-error {{function concept cannot have any parameters}}
+
+template<typename T>
+concept const bool fcrtc() { return true; } // expected-error {{declared return type of function concept must be 'bool'}}
+
+template<typename T>
+concept int fcrti() { return 5; } // expected-error {{declared return type of function concept must be 'bool'}}
+
+template<typename T>
+concept float fcrtf() { return 5.5; } // expected-error {{declared return type of function concept must be 'bool'}}
+
+template<typename T>
+concept decltype(auto) fcrtd(void) { return true; } // expected-error {{declared return type of function concept must be 'bool'}}
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p6.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p6.cpp
new file mode 100644
index 0000000..f8a1bb7
--- /dev/null
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p6.cpp
@@ -0,0 +1,25 @@
+// RUN:  %clang_cc1 -std=c++14 -fconcepts-ts -x c++ -verify %s
+
+template<typename T>
+concept bool vc { true };
+
+template<typename T>
+struct B { typedef bool Boolean; };
+
+template<int N>
+B<void>::Boolean concept vctb(!0);
+
+template<typename T>
+concept const bool vctc { true }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept int vcti { 5 }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept float vctf { 5.5 }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept auto vcta { true }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept decltype(auto) vctd { true }; // expected-error {{declared type of variable concept must be 'bool'}}
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p7.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p7.cpp
new file mode 100644
index 0000000..1bad6bb
--- /dev/null
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p7.cpp
@@ -0,0 +1,18 @@
+// RUN:  %clang_cc1 -std=c++14 -fconcepts-ts -x c++ -verify %s
+
+template <typename T> concept bool FCEI() { return true; } // expected-note {{previous declaration is here}} expected-note {{previous declaration is here}}
+template bool FCEI<int>(); // expected-error {{function concept cannot be explicitly instantiated}}
+extern template bool FCEI<double>(); // expected-error {{function concept cannot be explicitly instantiated}}
+
+template <typename T> concept bool FCES() { return true; } // expected-note {{previous declaration is here}}
+template <> bool FCES<int>() { return true; } // expected-error {{function concept cannot be explicitly specialized}}
+
+template <typename T> concept bool VC { true }; // expected-note {{previous declaration is here}} expected-note {{previous declaration is here}}
+template bool VC<int>; // expected-error {{variable concept cannot be explicitly instantiated}}
+extern template bool VC<double>; // expected-error {{variable concept cannot be explicitly instantiated}}
+
+template <typename T> concept bool VCES { true }; // expected-note {{previous declaration is here}}
+template <> bool VCES<int> { true }; // expected-error {{variable concept cannot be explicitly specialized}}
+
+template <typename T> concept bool VCPS { true }; // expected-note {{previous declaration is here}}
+template <typename T> bool VCPS<T *> { true }; // expected-error {{variable concept cannot be partially specialized}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp
index cc28bf6..ce43720 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp
@@ -1,3 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 
 // C++03 [namespace.udecl]p12:
@@ -161,3 +163,33 @@
     d.bar<int>(3); // expected-error {{'bar' is a protected member}}
   }
 }
+
+namespace test5 {
+  struct Derived;
+  struct Base {
+    void operator=(const Derived&);
+  };
+  struct Derived : Base {
+    // Hidden by implicit derived class operator.
+    using Base::operator=;
+  };
+  void f(Derived d) {
+    d = d;
+  }
+}
+
+#if __cplusplus >= 201103L
+namespace test6 {
+  struct Derived;
+  struct Base {
+    void operator=(Derived&&);
+  };
+  struct Derived : Base {
+    // Hidden by implicit derived class operator.
+    using Base::operator=;
+  };
+  void f(Derived d) {
+    d = Derived();
+  }
+}
+#endif
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p15.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p15.cpp
new file mode 100644
index 0000000..3e04d50
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p15.cpp
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+struct B1 { // expected-note 2{{candidate}}
+  B1(int); // expected-note {{candidate}}
+};
+
+struct B2 { // expected-note 2{{candidate}}
+  B2(int); // expected-note {{candidate}}
+};
+
+struct D1 : B1, B2 { // expected-note 2{{candidate}}
+  using B1::B1; // expected-note 3{{inherited here}}
+  using B2::B2; // expected-note 3{{inherited here}}
+};
+D1 d1(0); // expected-error {{ambiguous}}
+
+struct D2 : B1, B2 {
+  using B1::B1;
+  using B2::B2;
+  D2(int);
+};
+D2 d2(0); // ok
+
+
+// The emergent behavior of implicit special members is a bit odd when
+// inheriting from multiple base classes.
+namespace default_ctor {
+  struct C;
+  struct D;
+
+  struct A { // expected-note 4{{candidate}}
+    A(); // expected-note {{candidate}}
+
+    A(C &&); // expected-note {{candidate}}
+    C &operator=(C&&); // expected-note {{candidate}}
+
+    A(D &&); // expected-note {{candidate}}
+    D &operator=(D&&); // expected-note {{candidate}}
+  };
+
+  struct B { // expected-note 4{{candidate}}
+    B(); // expected-note {{candidate}}
+
+    B(C &&); // expected-note {{candidate}}
+    C &operator=(C&&); // expected-note {{candidate}}
+
+    B(D &&); // expected-note {{candidate}}
+    D &operator=(D&&); // expected-note {{candidate}}
+  };
+
+  struct C : A, B {
+    using A::A;
+    using A::operator=;
+    using B::B;
+    using B::operator=;
+  };
+  struct D : A, B {
+    using A::A; // expected-note 5{{inherited here}}
+    using A::operator=;
+    using B::B; // expected-note 5{{inherited here}}
+    using B::operator=;
+
+    D(int);
+    D(const D&); // expected-note {{candidate}}
+    D &operator=(const D&); // expected-note {{candidate}}
+  };
+
+  C c;
+  void f(C c) {
+    C c2(static_cast<C&&>(c));
+    c = static_cast<C&&>(c);
+  }
+
+  // D does not declare D(), D(D&&), nor operator=(D&&), so the base class
+  // versions are inherited.
+  D d; // expected-error {{ambiguous}}
+  void f(D d) {
+    D d2(static_cast<D&&>(d)); // expected-error {{ambiguous}}
+    d = static_cast<D&&>(d); // expected-error {{ambiguous}}
+  }
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p18.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p18.cpp
new file mode 100644
index 0000000..b9fca4b
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p18.cpp
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+struct Public {} public_;
+struct Protected {} protected_;
+struct Private {} private_;
+
+class A {
+public:
+  A(Public);
+  void f(Public);
+
+protected:
+  A(Protected); // expected-note {{protected here}}
+  void f(Protected);
+
+private:
+  A(Private); // expected-note 4{{private here}}
+  void f(Private); // expected-note {{private here}}
+
+  friend void Friend();
+};
+
+class B : private A {
+  using A::A; // ok
+  using A::f; // expected-error {{private member}}
+
+  void f() {
+    B a(public_);
+    B b(protected_);
+    B c(private_); // expected-error {{private}}
+  }
+
+  B(Public p, int) : B(p) {}
+  B(Protected p, int) : B(p) {}
+  B(Private p, int) : B(p) {} // expected-error {{private}}
+};
+
+class C : public B {
+  C(Public p) : B(p) {}
+  // There is no access check on the conversion from derived to base here;
+  // protected constructors of A act like protected constructors of B.
+  C(Protected p) : B(p) {}
+  C(Private p) : B(p) {} // expected-error {{private}}
+};
+
+void Friend() {
+  // There is no access check on the conversion from derived to base here.
+  B a(public_);
+  B b(protected_);
+  B c(private_);
+}
+
+void NonFriend() {
+  B a(public_);
+  B b(protected_); // expected-error {{protected}}
+  B c(private_); // expected-error {{private}}
+}
+
+namespace ProtectedAccessFromMember {
+namespace a {
+  struct ES {
+  private:
+    ES(const ES &) = delete;
+  protected:
+    ES(const char *);
+  };
+}
+namespace b {
+  struct DES : a::ES {
+    DES *f();
+  private:
+    using a::ES::ES;
+  };
+}
+b::DES *b::DES::f() { return new b::DES("foo"); }
+
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3-cxx0x.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3-cxx0x.cpp
deleted file mode 100644
index f61437e..0000000
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3-cxx0x.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
-// C++0x N2914.
-
-struct B {
-  void f(char);
-  void g(char);
-  enum E { e };
-  union { int x; };
-};
-
-class C {
-  int g();
-};
-
-class D2 : public B {
-  using B::f;
-  using B::e;
-  using B::x;
-  using C::g; // expected-error{{using declaration refers into 'C::', which is not a base class of 'D2'}}
-};
-
-namespace test1 {
-  struct Base {
-    int foo();
-  };
-
-  struct Unrelated {
-    int foo();
-  };
-
-  struct Subclass : Base {
-  };
-
-  namespace InnerNS {
-    int foo();
-  }
-
-  // We should be able to diagnose these without instantiation.
-  template <class T> struct C : Base {
-    using InnerNS::foo; // expected-error {{not a class}}
-    using Base::bar; // expected-error {{no member named 'bar'}}
-    using Unrelated::foo; // expected-error {{not a base class}}
-    using C::foo; // expected-error {{refers to its own class}}
-    using Subclass::foo; // expected-error {{not a base class}}
-  };
-}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3.cpp
new file mode 100644
index 0000000..6c505a5
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3.cpp
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+struct B {
+  void f(char);
+  void g(char);
+  enum E { e };
+  union { int x; };
+
+  enum class EC { ec }; // expected-warning 0-1 {{C++11}}
+
+  void f2(char);
+  void g2(char);
+  enum E2 { e2 };
+  union { int x2; };
+};
+
+class C {
+  int g();
+};
+
+struct D : B {};
+
+class D2 : public B {
+  using B::f;
+  using B::E;
+  using B::e;
+  using B::x;
+  using C::g; // expected-error{{using declaration refers into 'C::', which is not a base class of 'D2'}}
+
+  // These are valid in C++98 but not in C++11.
+  using D::f2;
+  using D::E2;
+  using D::e2;
+  using D::x2;
+#if __cplusplus >= 201103L
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+#endif
+
+  using B::EC;
+  using B::EC::ec; // expected-error {{not a class}} expected-warning 0-1 {{C++11}}
+};
+
+namespace test1 {
+  struct Base {
+    int foo();
+  };
+
+  struct Unrelated {
+    int foo();
+  };
+
+  struct Subclass : Base {
+  };
+
+  namespace InnerNS {
+    int foo();
+  }
+
+  struct B : Base {
+  };
+
+  // We should be able to diagnose these without instantiation.
+  template <class T> struct C : Base {
+    using InnerNS::foo; // expected-error {{not a class}}
+    using Base::bar; // expected-error {{no member named 'bar'}}
+    using Unrelated::foo; // expected-error {{not a base class}}
+
+    // In C++98, it's hard to see that these are invalid, because indirect
+    // references to base class members are permitted.
+    using C::foo;
+    using Subclass::foo;
+#if __cplusplus >= 201103L
+    // expected-error@-3 {{refers to its own class}}
+    // expected-error@-3 {{not a base class}}
+#endif
+  };
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp
index a43d9e0..781a1a1 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // C++03 [namespace.udecl]p4:
 //   A using-declaration used as a member-declaration shall refer to a
@@ -206,8 +207,33 @@
     using Unrelated::foo; // expected-error {{not a base class}}
     using C::foo; // legal in C++03
     using Subclass::foo; // legal in C++03
+#if __cplusplus >= 201103L
+    // expected-error@-3 {{refers to its own class}}
+    // expected-error@-3 {{refers into 'Subclass::', which is not a base class}}
+#endif
 
-    int bar(); //expected-note {{target of using declaration}}
+    int bar();
+#if __cplusplus < 201103L
+    // expected-note@-2 {{target of using declaration}}
+#endif
     using C::bar; // expected-error {{refers to its own class}}
   };
 }
+
+namespace test5 {
+  struct B;
+  struct A {
+    A(const B&);
+    B &operator=(const B&);
+  };
+  struct B : A {
+#if __cplusplus >= 201103L
+    using A::A;
+#endif
+    using A::operator=;
+  };
+  void test(B b) {
+    B b2(b);
+    b2 = b;
+  }
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx0x.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx0x.cpp
deleted file mode 100644
index c2fb959..0000000
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx0x.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// C++0x N2914.
-
-namespace A {
-  namespace B { }
-}
-
-using A::B; // expected-error{{using declaration cannot refer to namespace}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx11.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx11.cpp
new file mode 100644
index 0000000..97b2953
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx11.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace A {
+  namespace B { }
+}
+
+using A::B; // expected-error{{using declaration cannot refer to a namespace}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p7.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p7.cpp
new file mode 100644
index 0000000..6c9379f
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p7.cpp
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+enum class EC { ec };
+using EC::ec; // expected-error {{using declaration cannot refer to a scoped enumerator}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp
index ebe5388..6c63f06 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp
@@ -7,14 +7,41 @@
 struct X {
   int i;
   static int a;
+  enum E { e };
 };
 
 using X::i; // expected-error{{using declaration cannot refer to class member}}
 using X::s; // expected-error{{using declaration cannot refer to class member}}
+using X::e; // expected-error{{using declaration cannot refer to class member}}
+using X::E::e; // expected-error{{using declaration cannot refer to class member}} expected-warning 0-1{{C++11}}
+#if __cplusplus < 201103L
+// expected-note@-3 {{use a const variable}}
+// expected-note@-3 {{use a const variable}}
+// CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+// CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+#else
+// expected-note@-8 {{use a constexpr variable}}
+// expected-note@-8 {{use a constexpr variable}}
+// CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:1-[[@LINE-10]]:6}:"constexpr auto e = "
+// CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:1-[[@LINE-10]]:6}:"constexpr auto e = "
+#endif
 
 void f() {
   using X::i; // expected-error{{using declaration cannot refer to class member}}
   using X::s; // expected-error{{using declaration cannot refer to class member}}
+  using X::e; // expected-error{{using declaration cannot refer to class member}}
+  using X::E::e; // expected-error{{using declaration cannot refer to class member}} expected-warning 0-1{{C++11}}
+#if __cplusplus < 201103L
+  // expected-note@-3 {{use a const variable}}
+  // expected-note@-3 {{use a const variable}}
+  // CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+  // CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+#else
+  // expected-note@-8 {{use a constexpr variable}}
+  // expected-note@-8 {{use a constexpr variable}}
+  // CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:3-[[@LINE-10]]:8}:"constexpr auto e = "
+  // CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:3-[[@LINE-10]]:8}:"constexpr auto e = "
+#endif
 }
 
 template <typename T>
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
new file mode 100644
index 0000000..e7c9033
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void f(int n) {
+  switch (n) {
+  case 0:
+    n += 1;
+    [[fallthrough]]; // ok
+  case 1:
+    if (n) {
+      [[fallthrough]]; // ok
+    } else {
+      return;
+    }
+  case 2:
+    for (int n = 0; n != 10; ++n)
+      [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+  case 3:
+    while (true)
+      [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+  case 4:
+    while (false)
+      [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+  case 5:
+    do [[fallthrough]]; while (true); // expected-error {{does not directly precede switch label}}
+  case 6:
+    do [[fallthrough]]; while (false); // expected-error {{does not directly precede switch label}}
+  case 7:
+    switch (n) {
+    case 0:
+      // FIXME: This should be an error, even though the next thing we do is to
+      // fall through in an outer switch statement.
+      [[fallthrough]];
+    }
+  case 8:
+    [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+    goto label;
+  label:
+  case 9:
+    n += 1;
+  case 10: // no warning, -Wimplicit-fallthrough is not enabled in this test, and does not need to
+           // be enabled for these diagnostics to be produced.
+    break;
+  }
+}
+
+[[fallthrough]] typedef int n; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+typedef int [[fallthrough]] n; // expected-error {{'fallthrough' attribute cannot be applied to types}}
+typedef int n [[fallthrough]]; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+
+enum [[fallthrough]] E {}; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+class [[fallthrough]] C {}; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+
+[[fallthrough]] // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+void g() {
+  [[fallthrough]] int n; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+  [[fallthrough]] ++n; // expected-error-re {{{{^}}fallthrough attribute is only allowed on empty statements}}
+
+  switch (n) {
+    // FIXME: This should be an error.
+    [[fallthrough]];
+    return;
+
+  case 0:
+    [[fallthrough, fallthrough]]; // expected-error {{multiple times}}
+  case 1:
+    [[fallthrough(0)]]; // expected-error {{argument list}}
+  case 2:
+    break;
+  }
+}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.grammar/p2-1z.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.grammar/p2-1z.cpp
new file mode 100644
index 0000000..192fa12
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.grammar/p2-1z.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+[[disable_tail_calls, noduplicate]] void f() {} // expected-warning {{unknown attribute 'disable_tail_calls'}} expected-warning {{unknown attribute 'noduplicate'}}
+
+[[using clang: disable_tail_calls, noduplicate]] void g() {} // ok
+
+[[using]] extern int n; // expected-error {{expected identifier}}
+[[using foo
+] // expected-error {{expected ':'}}
+] extern int n;
+[[using 42:]] extern int n; // expected-error {{expected identifier}}
+[[using clang:]] extern int n; // ok
+[[using blah: clang::optnone]] extern int n; // expected-error {{attribute with scope specifier cannot follow}} expected-warning {{only applies to functions}}
+
+[[using clang: unknown_attr]] extern int n; // expected-warning {{unknown attribute}}
+[[using unknown_ns: something]] extern int n; // expected-warning {{unknown attribute}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p1.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p1.cpp
new file mode 100644
index 0000000..e7a2382
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p1.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z -verify %s
+
+struct [[nodiscard]] S1 {}; // ok
+struct [[nodiscard nodiscard]] S2 {}; // expected-error {{attribute 'nodiscard' cannot appear multiple times in an attribute specifier}}
+struct [[nodiscard("Wrong")]] S3 {}; // expected-error {{'nodiscard' cannot have an argument list}}
+
+[[nodiscard]] int f();
+enum [[nodiscard]] E {};
+
+namespace [[nodiscard]] N {} // expected-warning {{'nodiscard' attribute only applies to functions, methods, enums, and classes}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
new file mode 100644
index 0000000..3d4b925
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z -verify -Wc++1z-extensions %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -DEXT -Wc++1z-extensions %s
+
+struct [[nodiscard]] S {};
+S get_s();
+S& get_s_ref();
+
+enum [[nodiscard]] E {};
+E get_e();
+
+[[nodiscard]] int get_i();
+
+void f() {
+  get_s(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_i(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_e(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  // Okay, warnings are not encouraged
+  get_s_ref();
+  (void)get_s();
+  (void)get_i();
+  (void)get_e();
+}
+
+#ifdef EXT
+// expected-warning@4 {{use of the 'nodiscard' attribute is a C++1z extension}}
+// expected-warning@8 {{use of the 'nodiscard' attribute is a C++1z extension}}
+// expected-warning@11 {{use of the 'nodiscard' attribute is a C++1z extension}}
+#endif
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
new file mode 100644
index 0000000..a3543cf
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+namespace std_example {
+  struct [[nodiscard]] error_info{
+    // ...
+  };
+
+  error_info enable_missile_safety_mode();
+  void launch_missiles();
+  void test_missiles() {
+    enable_missile_safety_mode(); // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+    launch_missiles();
+  }
+
+  error_info &foo();
+  void f() { foo(); } // no warning
+}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p1.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p1.cpp
new file mode 100644
index 0000000..8da2ca7
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p1.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -std=c++1z -verify %s
+
+struct [[maybe_unused]] S1 {}; // ok
+struct [[maybe_unused maybe_unused]] S2 {}; // expected-error {{attribute 'maybe_unused' cannot appear multiple times in an attribute specifier}}
+struct [[maybe_unused("Wrong")]] S3 {}; // expected-error {{'maybe_unused' cannot have an argument list}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p2.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p2.cpp
new file mode 100644
index 0000000..b539ca4
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p2.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -std=c++1z -verify %s
+
+struct [[maybe_unused]] S {
+  int I [[maybe_unused]];
+  static int SI [[maybe_unused]]; // expected-warning {{'maybe_unused' attribute only applies to variables, functions, methods, types, enumerations, enumerators, labels, and non-static data members}}
+};
+
+enum [[maybe_unused]] E1 {
+  EnumVal [[maybe_unused]]
+};
+
+[[maybe_unused]] void unused_func([[maybe_unused]] int parm) {
+  typedef int maybe_unused_int [[maybe_unused]];
+  [[maybe_unused]] int I;
+}
+
+namespace [[maybe_unused]] N {} // expected-warning {{'maybe_unused' attribute only applies to}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p3.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p3.cpp
new file mode 100644
index 0000000..a627d83
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p3.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -Wused-but-marked-unused -std=c++1z -Wc++1z-extensions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wunused -Wused-but-marked-unused -std=c++11 -Wc++1z-extensions -verify -DEXT %s
+
+static_assert(__has_cpp_attribute(maybe_unused) == 201603, "");
+
+struct [[maybe_unused]] S {};
+
+void f() {
+  int x; // expected-warning {{unused variable}}
+  typedef int I; // expected-warning {{unused typedef 'I'}}
+
+  // Should not warn about these due to not being used.
+  [[maybe_unused]] int y;
+  typedef int maybe_unused_int [[maybe_unused]];
+
+  // Should not warn about these uses.
+  S s;
+  maybe_unused_int test;
+  y = 12;
+}
+
+#ifdef EXT
+// expected-warning@6 {{use of the 'maybe_unused' attribute is a C++1z extension}}
+// expected-warning@13 {{use of the 'maybe_unused' attribute is a C++1z extension}}
+// expected-warning@14 {{use of the 'maybe_unused' attribute is a C++1z extension}}
+#endif
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p4.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p4.cpp
new file mode 100644
index 0000000..d4a2759
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p4.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -std=c++1z -verify %s
+// expected-no-diagnostics
+
+void f();
+[[maybe_unused]] void f();
+
+void f() {
+}
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp
index 35dbec9..5a4c5c9 100644
--- a/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
 
 struct notlit { // expected-note {{not literal because}}
   notlit() {}
@@ -26,7 +28,12 @@
 // non-static member
 struct s2 {
   constexpr int mi1; // expected-error {{non-static data member cannot be constexpr; did you intend to make it const?}}
-  static constexpr int mi2; // expected-error {{requires an initializer}}
+  static constexpr int mi2;
+#if __cplusplus <= 201402L
+  // expected-error@-2 {{requires an initializer}}
+#else
+  // expected-error@-4 {{default initialization of an object of const}}
+#endif
   mutable constexpr int mi3 = 3; // expected-error-re {{non-static data member cannot be constexpr{{$}}}} expected-error {{'mutable' and 'const' cannot be mixed}}
 };
 // typedef
@@ -71,7 +78,7 @@
 template <typename T> constexpr T ft(T t) { return t; }
 template <typename T> T gt(T t) { return t; }
 struct S {
-  template<typename T> constexpr T f(); // expected-warning {{C++14}}
+  template<typename T> constexpr T f(); // expected-warning 0-1{{C++14}} expected-note 0-1{{candidate}}
   template <typename T>
   T g() const; // expected-note-re {{candidate template ignored: could not match 'T (){{( __attribute__\(\(thiscall\)\))?}} const' against 'char (){{( __attribute__\(\(thiscall\)\))?}}'}}
 };
@@ -82,7 +89,15 @@
 template <> constexpr char ft(char nl); // expected-error {{constexpr declaration of 'ft<char>' follows non-constexpr declaration}}
 template <> constexpr int gt(int nl) { return nl; }
 template <> notlit S::f() const { return notlit(); }
-template <> constexpr int S::g() { return 0; } // expected-note {{previous}} expected-warning {{C++14}}
+#if __cplusplus >= 201402L
+// expected-error@-2 {{no function template matches}}
+#endif
+template <> constexpr int S::g() { return 0; } // expected-note {{previous}}
+#if __cplusplus < 201402L
+// expected-warning@-2 {{C++14}}
+#else
+// expected-error@-4 {{does not match any declaration in 'S'}}
+#endif
 template <> int S::g() const; // expected-error {{non-constexpr declaration of 'g<int>' follows constexpr declaration}}
 // specializations can drop the 'constexpr' but not the implied 'const'.
 template <> char S::g() { return 0; } // expected-error {{no function template matches}}
@@ -123,3 +138,11 @@
 }
 
 extern constexpr int memsz; // expected-error {{constexpr variable declaration must be a definition}}
+
+namespace {
+  struct A {
+    static constexpr int n = 0;
+  };
+  // FIXME: We should diagnose this prior to C++17.
+  const int &r = A::n;
+}
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p1.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p1.cpp
new file mode 100644
index 0000000..6db0b04
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p1.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+inline int f(); // ok
+inline int n; // ok
+
+inline typedef int t; // expected-error {{'inline' can only appear on functions and non-local variables}}
+inline struct S {}; // expected-error {{'inline' can only appear on functions and non-local variables}}
+inline struct T {} s; // ok
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p5.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p5.cpp
new file mode 100644
index 0000000..0ca7bbc
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p5.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void x() {
+  inline int f(int); // expected-error {{inline declaration of 'f' not allowed in block scope}}
+  inline int n; // expected-error {{inline declaration of 'n' not allowed in block scope}}
+  static inline int m; // expected-error {{inline declaration of 'm' not allowed in block scope}}
+}
+
+inline void g();
+struct X {
+  inline void f();
+  // FIXME: This is ill-formed per [dcl.inline]p5.
+  inline void g();
+  inline void h() {}
+};
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.type/dcl.spec.auto/p2-1z.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.type/dcl.spec.auto/p2-1z.cpp
new file mode 100644
index 0000000..e41270e
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.type/dcl.spec.auto/p2-1z.cpp
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+template<typename T, typename U> constexpr bool same = false;
+template<typename T> constexpr bool same<T, T> = true;
+
+auto a() {
+  if constexpr (false)
+    return 0;
+}
+static_assert(same<decltype(a()), void>);
+
+auto b() {
+  if constexpr (false)
+    return 0;
+  else
+    return 0.0;
+}
+static_assert(same<decltype(b()), double>);
+
+auto c() {
+  if constexpr (true)
+    return "foo";
+  else
+    return 'x';
+  if constexpr (false)
+    return 7.6;
+  else
+    return 5; // expected-error {{deduced as 'int' here but deduced as 'const char *' in earlier}}
+}
+
+template<int k> auto d() {
+  if constexpr(k == 0)
+    return 0;
+  if constexpr(k == 1)
+    return "foo";
+  else if constexpr (k == 2)
+    return 1.0;
+}
+static_assert(same<decltype(d<0>()), int>);
+static_assert(same<decltype(d<1>()), const char *>);
+static_assert(same<decltype(d<2>()), double>);
+static_assert(same<decltype(d<3>()), void>);
+
+auto e = []{ if constexpr (false) return 0; }(); // expected-error {{variable has incomplete type 'void'}}
+
+auto f = []{ if constexpr (true) return 0; }(); // ok: the taken branch's 'return 0' is the only return considered
+static_assert(same<decltype(f), int>); // check 'f' (deduced as int), not the ill-formed 'e' declared above
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp
index 39d6e70..447f7c5 100644
--- a/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp
@@ -18,6 +18,9 @@
   for (struct S { S(int) {} } s : arr) { // expected-error {{types may not be defined in a for range declaration}}
   }
 
+  for (struct S { S(int) {} } s : Undeclared); // expected-error{{types may not be defined in a for range declaration}}
+                                               // expected-error@-1{{use of undeclared identifier 'Undeclared'}}
+
   new struct T {}; // expected-error {{'T' cannot be defined in a type specifier}}
   new struct A {}; // expected-error {{'A' cannot be defined in a type specifier}}
 
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp
index 20b5104..8c6f6e5 100644
--- a/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp
@@ -38,8 +38,8 @@
   using T = int[n]; // expected-error {{variable length array declaration not allowed at file scope}}
 
   const int m = 42;
-  using U = int[m]; // expected-note {{previous definition}}
-  using U = int[42]; // ok
+  using U = int[m];
+  using U = int[42]; // expected-note {{previous definition}}
   using U = int; // expected-error {{type alias redefinition with different types ('int' vs 'int [42]')}}
 
   void f() {
diff --git a/test/CXX/dcl.decl/dcl.decomp/p2.cpp b/test/CXX/dcl.decl/dcl.decomp/p2.cpp
new file mode 100644
index 0000000..639aff6
--- /dev/null
+++ b/test/CXX/dcl.decl/dcl.decomp/p2.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+int array() {
+  static int arr[3] = {};
+  // FIXME: We are supposed to create an array object here and perform elementwise initialization.
+  auto [a, b, c] = arr; // expected-error {{cannot decompose non-class, non-array}}
+
+  auto &[d, e] = arr; // expected-error {{type 'int [3]' decomposes into 3 elements, but only 2 names were provided}}
+  auto &[f, g, h, i] = arr; // expected-error {{type 'int [3]' decomposes into 3 elements, but 4 names were provided}}
+
+  auto &[r0, r1, r2] = arr;
+  const auto &[cr0, cr1, cr2] = arr;
+
+  static_assert(&arr[0] == &r0);
+  static_assert(&arr[0] == &cr0);
+
+  using T = int;
+  using T = decltype(r0);
+  using U = const int;
+  using U = decltype(cr0);
+
+  return r1 + cr2;
+}
diff --git a/test/CXX/dcl.decl/dcl.decomp/p3.cpp b/test/CXX/dcl.decl/dcl.decomp/p3.cpp
new file mode 100644
index 0000000..e4a7a6c
--- /dev/null
+++ b/test/CXX/dcl.decl/dcl.decomp/p3.cpp
@@ -0,0 +1,232 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+using size_t = decltype(sizeof(0));
+
+struct A { int x, y; };
+struct B { int x, y; };
+
+void no_tuple_size_1() { auto [x, y] = A(); } // ok, decompose elementwise
+
+namespace std { template<typename T> struct tuple_size; }
+void no_tuple_size_2() { auto [x, y] = A(); } // ok, decompose elementwise
+
+struct Bad1 {};
+template<> struct std::tuple_size<Bad1> {};
+void no_tuple_size_3() { auto [x, y] = Bad1(); } // expected-error {{cannot decompose this type; 'std::tuple_size<Bad1>::value' is not a valid integral constant expression}}
+
+struct Bad2 {};
+template<> struct std::tuple_size<Bad2> { const int value = 5; };
+void no_tuple_size_4() { auto [x, y] = Bad2(); } // expected-error {{cannot decompose this type; 'std::tuple_size<Bad2>::value' is not a valid integral constant expression}}
+
+template<> struct std::tuple_size<A> { static const int value = 3; };
+template<> struct std::tuple_size<B> { enum { value = 3 }; };
+
+void no_get_1() {
+  {
+    auto [a0, a1] = A(); // expected-error {{decomposes into 3 elements}}
+    auto [b0, b1] = B(); // expected-error {{decomposes into 3 elements}}
+  }
+  auto [a0, a1, a2] = A(); // expected-error {{undeclared identifier 'get'}} expected-note {{in implicit initialization of binding declaration 'a0'}}
+}
+
+int get(A);
+
+void no_get_2() {
+  // FIXME: This diagnostic is not great.
+  auto [a0, a1, a2] = A(); // expected-error {{undeclared identifier 'get'}} expected-note {{in implicit initialization of binding declaration 'a0'}}
+}
+
+template<int> float &get(A);
+
+void no_tuple_element_1() {
+  auto [a0, a1, a2] = A(); // expected-error-re {{'std::tuple_element<0U{{L*}}, A>::type' does not name a type}} expected-note {{in implicit}}
+}
+
+namespace std { template<size_t, typename> struct tuple_element; } // expected-note 2{{here}}
+
+void no_tuple_element_2() {
+  auto [a0, a1, a2] = A(); // expected-error {{implicit instantiation of undefined template 'std::tuple_element<0, A>'}} expected-note {{in implicit}}
+}
+
+template<> struct std::tuple_element<0, A> { typedef float type; };
+
+void no_tuple_element_3() {
+  auto [a0, a1, a2] = A(); // expected-error {{implicit instantiation of undefined template 'std::tuple_element<1, A>'}} expected-note {{in implicit}}
+}
+
+template<> struct std::tuple_element<1, A> { typedef float &type; };
+template<> struct std::tuple_element<2, A> { typedef const float &type; };
+
+template<int N> auto get(B) -> int (&)[N + 1];
+template<int N> struct std::tuple_element<N, B> { typedef int type[N +1 ]; };
+
+template<typename T> struct std::tuple_size<const T> : std::tuple_size<T> {};
+template<size_t N, typename T> struct std::tuple_element<N, const T> {
+  typedef const typename std::tuple_element<N, T>::type type;
+};
+
+void referenced_type() {
+  auto [a0, a1, a2] = A();
+  auto [b0, b1, b2] = B();
+
+  A a;
+  B b;
+  auto &[ar0, ar1, ar2] = a;
+  auto &[br0, br1, br2] = b;
+
+  auto &&[arr0, arr1, arr2] = A();
+  auto &&[brr0, brr1, brr2] = B();
+
+  const auto &[acr0, acr1, acr2] = A();
+  const auto &[bcr0, bcr1, bcr2] = B();
+
+
+  using Float = float;
+  using Float = decltype(a0);
+  using Float = decltype(ar0);
+  using Float = decltype(arr0);
+
+  using ConstFloat = const float;
+  using ConstFloat = decltype(acr0);
+
+  using FloatRef = float&;
+  using FloatRef = decltype(a1);
+  using FloatRef = decltype(ar1);
+  using FloatRef = decltype(arr1);
+  using FloatRef = decltype(acr1);
+
+  using ConstFloatRef = const float&;
+  using ConstFloatRef = decltype(a2);
+  using ConstFloatRef = decltype(ar2);
+  using ConstFloatRef = decltype(arr2);
+  using ConstFloatRef = decltype(acr2);
+
+
+  using Int1 = int[1];
+  using Int1 = decltype(b0);
+  using Int1 = decltype(br0);
+  using Int1 = decltype(brr0);
+
+  using ConstInt1 = const int[1];
+  using ConstInt1 = decltype(bcr0);
+
+  using Int2 = int[2];
+  using Int2 = decltype(b1);
+  using Int2 = decltype(br1);
+  using Int2 = decltype(brr1);
+
+  using ConstInt2 = const int[2];
+  using ConstInt2 = decltype(bcr1);
+
+  using Int3 = int[3];
+  using Int3 = decltype(b2);
+  using Int3 = decltype(br2);
+  using Int3 = decltype(brr2);
+
+  using ConstInt3 = const int[3];
+  using ConstInt3 = decltype(bcr2);
+}
+
+struct C { template<int> int get(); };
+template<> struct std::tuple_size<C> { static const int value = 1; };
+template<> struct std::tuple_element<0, C> { typedef int type; };
+
+int member_get() {
+  auto [c] = C();
+  using T = int;
+  using T = decltype(c);
+  return c;
+}
+
+struct D { template<int> struct get {}; }; // expected-note {{declared here}}
+template<> struct std::tuple_size<D> { static const int value = 1; };
+template<> struct std::tuple_element<0, D> { typedef D::get<0> type; };
+void member_get_class_template() {
+  auto [d] = D(); // expected-error {{cannot refer to member 'get' in 'D' with '.'}} expected-note {{in implicit init}}
+}
+
+struct E { int get(); };
+template<> struct std::tuple_size<E> { static const int value = 1; };
+template<> struct std::tuple_element<0, E> { typedef int type; };
+void member_get_non_template() {
+  // FIXME: This diagnostic is not very good.
+  auto [e] = E(); // expected-error {{no member named 'get'}} expected-note {{in implicit init}}
+}
+
+namespace ADL {
+  struct X {};
+};
+template<int> int get(ADL::X);
+template<> struct std::tuple_size<ADL::X> { static const int value = 1; };
+template<> struct std::tuple_element<0, ADL::X> { typedef int type; };
+void adl_only_bad() {
+  auto [x] = ADL::X(); // expected-error {{undeclared identifier 'get'}} expected-note {{in implicit init}}
+}
+
+template<typename ElemType, typename GetTypeLV, typename GetTypeRV>
+struct wrap {
+  template<size_t> GetTypeLV get() &;
+  template<size_t> GetTypeRV get() &&;
+};
+template<typename ET, typename GTL, typename GTR>
+struct std::tuple_size<wrap<ET, GTL, GTR>> {
+  static const int value = 1;
+};
+template<typename ET, typename GTL, typename GTR>
+struct std::tuple_element<0, wrap<ET, GTL, GTR>> {
+  using type = ET;
+};
+
+template<typename T> T &lvalue();
+
+void test_value_category() {
+  // If the declared variable is an lvalue reference, the operand to get is an
+  // lvalue. Otherwise it's an xvalue.
+  { auto [a] = wrap<int, void, int>(); }
+  { auto &[a] = lvalue<wrap<int, int, void>>(); }
+  { auto &&[a] = wrap<int, void, int>(); }
+  // If the initializer (call to get) is an lvalue, the binding is an lvalue
+  // reference to the element type. Otherwise it's an rvalue reference to the
+  // element type.
+  { auto [a] = wrap<int, void, int&>(); }
+  { auto [a] = wrap<int&, void, int&>(); }
+  { auto [a] = wrap<int&&, void, int&>(); } // ok, reference collapse to int&
+
+  { auto [a] = wrap<int, void, int&&>(); }
+  { auto [a] = wrap<int&, void, int&&>(); } // expected-error {{non-const lvalue reference to type 'int' cannot bind}} expected-note {{in implicit}}
+  { auto [a] = wrap<const int&, void, int&&>(); }
+  { auto [a] = wrap<int&&, void, int&&>(); }
+
+  { auto [a] = wrap<int, void, float&>(); } // expected-error {{cannot bind}} expected-note {{implicit}}
+  { auto [a] = wrap<const int, void, float&>(); } // ok, const int &a can bind to float
+  { auto [a] = wrap<int, void, float>(); } // ok, int &&a can bind to float
+}
+
+namespace constant {
+  struct Q {};
+  template<int N> constexpr int get(Q &&) { return N * N; }
+}
+template<> struct std::tuple_size<constant::Q> { static const int value = 3; };
+template<int N> struct std::tuple_element<N, constant::Q> { typedef int type; };
+namespace constant {
+  Q q;
+  // This creates and lifetime-extends a temporary to hold the result of each get() call.
+  auto [a, b, c] = q;    // expected-note {{temporary}}
+  static_assert(a == 0); // expected-error {{constant expression}} expected-note {{temporary}}
+
+  constexpr bool f() {
+    auto [a, b, c] = q;
+    return a == 0 && b == 1 && c == 4;
+  }
+  static_assert(f());
+
+  constexpr int g() {
+    int *p = nullptr;
+    {
+      auto [a, b, c] = q;
+      p = &c;
+    }
+    return *p; // expected-note {{read of object outside its lifetime}}
+  }
+  static_assert(g() == 4); // expected-error {{constant}} expected-note {{in call to 'g()'}}
+}
diff --git a/test/CXX/dcl.decl/dcl.decomp/p4.cpp b/test/CXX/dcl.decl/dcl.decomp/p4.cpp
new file mode 100644
index 0000000..c461eb6
--- /dev/null
+++ b/test/CXX/dcl.decl/dcl.decomp/p4.cpp
@@ -0,0 +1,200 @@
+// RUN: %clang_cc1 -std=c++1z -verify -triple i686-linux-gnu %s
+
+template<typename T, typename U> struct same;
+template<typename T> struct same<T, T> { ~same(); };
+
+struct Empty {};
+
+struct A {
+  int a;
+};
+
+namespace NonPublicMembers {
+  struct NonPublic1 {
+  protected:
+    int a; // expected-note {{declared protected here}}
+  };
+
+  struct NonPublic2 {
+  private:
+    int a; // expected-note 2{{declared private here}}
+  };
+
+  struct NonPublic3 : private A {}; // expected-note {{constrained by private inheritance}}
+
+  struct NonPublic4 : NonPublic2 {};
+
+  void test() {
+    auto [a1] = NonPublic1(); // expected-error {{cannot decompose non-public member 'a' of 'NonPublicMembers::NonPublic1'}}
+    auto [a2] = NonPublic2(); // expected-error {{cannot decompose non-public member 'a' of 'NonPublicMembers::NonPublic2'}}
+    auto [a3] = NonPublic3(); // expected-error {{cannot decompose members of non-public base class 'A' of 'NonPublic3'}}
+    auto [a4] = NonPublic4(); // expected-error {{cannot decompose non-public member 'a' of 'NonPublicMembers::NonPublic4'}}
+  }
+}
+
+namespace AnonymousMember {
+  struct Struct {
+    struct { // expected-note {{declared here}}
+      int i;
+    };
+  };
+
+  struct Union {
+    union { // expected-note {{declared here}}
+      int i;
+    };
+  };
+
+  void test() {
+    auto [a1] = Struct(); // expected-error {{cannot decompose class type 'AnonymousMember::Struct' because it has an anonymous struct member}}
+    auto [a2] = Union(); // expected-error {{cannot decompose class type 'AnonymousMember::Union' because it has an anonymous union member}}
+  }
+}
+
+namespace MultipleClasses {
+  struct B : A {
+    int a;
+  };
+
+  struct C { int a; };
+  struct D : A, C {};
+
+  struct E : virtual A {};
+  struct F : A, E {}; // expected-warning {{direct base 'A' is inaccessible due to ambiguity}}
+
+  struct G : virtual A {};
+  struct H : E, G {};
+
+  struct I { int i; };
+  struct J : I {};
+  struct K : I, virtual J {}; // expected-warning {{direct base 'MultipleClasses::I' is inaccessible due to ambiguity}}
+
+  struct L : virtual J {};
+  struct M : virtual J, L {};
+
+  void test() {
+    auto [b] = B(); // expected-error {{cannot decompose class type 'B': both it and its base class 'A' have non-static data members}}
+    auto [d] = D(); // expected-error {{cannot decompose class type 'D': its base classes 'A' and 'MultipleClasses::C' have non-static data members}}
+    auto [e] = E();
+    auto [f] = F(); // expected-error-re {{cannot decompose members of ambiguous base class 'A' of 'F':{{.*}}struct MultipleClasses::F -> struct A{{.*}}struct MultipleClasses::F -> struct MultipleClasses::E -> struct A}}
+    auto [h] = H(); // ok, only one (virtual) base subobject even though there are two paths to it
+    auto [k] = K(); // expected-error {{cannot decompose members of ambiguous base class 'MultipleClasses::I'}}
+    auto [m] = M(); // ok, all paths to I are through the same virtual base subobject J
+
+    same<decltype(m), int>();
+  }
+}
+
+namespace BindingTypes {
+  struct A {
+    int i = 0;
+    int &r = i;
+    const float f = i;
+    mutable volatile int mvi;
+  };
+  void e() {
+    auto [i,r,f,mvi] = A();
+
+    same<decltype(i), int>();
+    same<decltype(r), int&>();
+    same<decltype(f), const float>();
+    same<decltype(mvi), volatile int>();
+
+    same<decltype((i)), int&>();
+    same<decltype((r)), int&>();
+    same<decltype((f)), const float&>();
+    same<decltype((mvi)), volatile int&>();
+  }
+  void f() {
+    auto &&[i,r,f,mvi] = A();
+
+    same<decltype(i), int>();
+    same<decltype(r), int&>();
+    same<decltype(f), const float>();
+    same<decltype(mvi), volatile int>();
+
+    same<decltype((i)), int&>();
+    same<decltype((r)), int&>();
+    same<decltype((f)), const float&>();
+    same<decltype((mvi)), volatile int&>();
+  }
+  void g() {
+    const auto [i,r,f,mvi] = A();
+
+    same<decltype(i), const int>();
+    same<decltype(r), int&>();
+    same<decltype(f), const float>();
+    same<decltype(mvi), volatile int>(); // not 'const volatile int', per expected resolution of DRxxx
+
+    same<decltype((i)), const int&>();
+    same<decltype((r)), int&>();
+    same<decltype((f)), const float&>();
+    same<decltype((mvi)), volatile int&>(); // not 'const volatile int&', per expected resolution of DRxxx
+  }
+  void h() {
+    typedef const A CA;
+    auto &[i,r,f,mvi] = CA(); // type of var is 'const A &'
+
+    same<decltype(i), const int>(); // not 'int', per expected resolution of DRxxx
+    same<decltype(r), int&>();
+    same<decltype(f), const float>();
+    same<decltype(mvi), volatile int>(); // not 'const volatile int', per expected resolution of DRxxx
+
+    same<decltype((i)), const int&>(); // not 'int&', per expected resolution of DRxxx
+    same<decltype((r)), int&>();
+    same<decltype((f)), const float&>();
+    same<decltype((mvi)), volatile int&>(); // not 'const volatile int&', per expected resolution of DRxxx
+  }
+  struct B {
+    mutable int i;
+  };
+  void mut() {
+    auto [i] = B();
+    const auto [ci] = B();
+    volatile auto [vi] = B();
+    same<decltype(i), int>();
+    same<decltype(ci), int>();
+    same<decltype(vi), volatile int>();
+  }
+}
+
+namespace Bitfield {
+  struct S { unsigned long long x : 4, y : 32; int z; }; // expected-note 2{{here}}
+  int f(S s) {
+    auto [a, b, c] = s;
+    unsigned long long &ra = a; // expected-error {{bit-field 'x'}}
+    unsigned long long &rb = b; // expected-error {{bit-field 'y'}}
+    int &rc = c;
+
+    // the type of the binding is the type of the field
+    same<decltype(a), unsigned long long>();
+    same<decltype(b), unsigned long long>();
+
+    // the type of the expression is an lvalue of the field type
+    // (even though a reference can't bind to the field)
+    same<decltype((a)), unsigned long long&>();
+    same<decltype((b)), unsigned long long&>();
+
+    // the expression promotes to a type large enough to hold the result
+    same<decltype(+a), int>();
+    same<decltype(+b), unsigned int>();
+    return rc;
+  }
+}
+
+namespace Constexpr {
+  struct Q { int a, b; constexpr Q() : a(1), b(2) {} };
+  constexpr Q q;
+  auto &[qa, qb] = q;
+  static_assert(&qa == &q.a && &qb == &q.b);
+  static_assert(qa == 1 && qb == 2);
+}
+
+namespace std_example {
+  struct S { int x1 : 2; volatile double y1; };
+  S f();
+  const auto [x, y] = f();
+
+  same<decltype((x)), const int&> same1;
+  same<decltype((y)), const volatile double&> same2;
+}
diff --git a/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp b/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
index 5cf281c..c2f3b5a 100644
--- a/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
+++ b/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
@@ -116,6 +116,7 @@
 namespace PR13492 {
   struct B {
     B() = default;
+    int field;
   };
 
   void f() {
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1-0x.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1-0x.cpp
deleted file mode 100644
index 8767678..0000000
--- a/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1-0x.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1y %s -DCXX1Y
-
-// An aggregate is an array or a class...
-struct Aggr {
-private:
-  static const int n;
-  void f();
-protected:
-  struct Inner { int m; };
-public:
-  bool &br; // expected-note {{default constructor of 'Aggr' is implicitly deleted because field 'br' of reference type 'bool &' would not be initialized}}
-};
-bool b;
-Aggr ag = { b };
-
-// with no user-provided constructors, ...
-struct NonAggr1a { // expected-note 2 {{candidate constructor}}
-  NonAggr1a(int, int); // expected-note {{candidate constructor}}
-  int k;
-};
-NonAggr1a na1a = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1a'}}
-
-struct NonAggr1b {
-  NonAggr1b(const NonAggr1b &); // expected-note {{candidate constructor}}
-  int k;
-};
-NonAggr1b na1b = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1b'}}
-
-// no brace-or-equal-initializers for non-static data members, ...
-// Note, this bullet was removed in C++1y.
-struct NonAggr2 {
-  int m = { 123 };
-};
-NonAggr2 na2 = { 42 };
-#ifndef CXX1Y
-// expected-error@-2 {{no matching constructor for initialization of 'NonAggr2'}}
-// expected-note@-6 3 {{candidate constructor}}
-#endif
-
-// no private...
-struct NonAggr3 { // expected-note 3 {{candidate constructor}}
-private:
-  int n;
-};
-NonAggr3 na3 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr3'}}
-
-// or protected non-static data members, ...
-struct NonAggr4 { // expected-note 3 {{candidate constructor}}
-protected:
-  int n;
-};
-NonAggr4 na4 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr4'}}
-
-// no base classes, ...
-struct NonAggr5 : Aggr { // expected-note 3 {{candidate constructor}}
-};
-NonAggr5 na5 = { b }; // expected-error {{no matching constructor for initialization of 'NonAggr5'}}
-template<typename...BaseList>
-struct MaybeAggr5a : BaseList... {}; // expected-note {{default constructor of 'MaybeAggr5a<Aggr>' is implicitly deleted because base class 'Aggr' has a deleted default constructor}}
-MaybeAggr5a<> ma5a0 = {}; // ok
-MaybeAggr5a<Aggr> ma5a1 = {}; // expected-error {{call to implicitly-deleted default constructor of 'MaybeAggr5a<Aggr>'}}
-
-// and no virtual functions.
-struct NonAggr6 { // expected-note 3 {{candidate constructor}}
-  virtual void f();
-  int n;
-};
-NonAggr6 na6 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr6'}}
-
-struct DefaultedAggr {
-  int n;
-
-  DefaultedAggr() = default;
-  DefaultedAggr(const DefaultedAggr &) = default;
-  DefaultedAggr(DefaultedAggr &&) = default;
-  DefaultedAggr &operator=(const DefaultedAggr &) = default;
-  DefaultedAggr &operator=(DefaultedAggr &&) = default;
-  ~DefaultedAggr() = default;
-};
-DefaultedAggr da = { 42 } ;
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1.cpp
new file mode 100644
index 0000000..40f6431
--- /dev/null
+++ b/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
+
+// An aggregate is an array or a class...
+struct Aggr {
+private:
+  static const int n;
+  void f();
+protected:
+  struct Inner { int m; };
+public:
+  bool &br;
+};
+bool b;
+Aggr ag = { b };
+
+// with no user-provided constructors, ...
+struct NonAggr1a { // expected-note 2 {{candidate constructor}}
+  NonAggr1a(int, int); // expected-note {{candidate constructor}}
+  int k;
+};
+NonAggr1a na1a = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1a'}}
+
+struct NonAggr1b {
+  NonAggr1b(const NonAggr1b &); // expected-note {{candidate constructor}}
+  int k;
+};
+NonAggr1b na1b = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1b'}}
+
+// no brace-or-equal-initializers for non-static data members, ...
+// Note: this bullet was removed in C++14.
+struct NonAggr2 {
+  int m = { 123 };
+};
+NonAggr2 na2 = { 42 };
+#if __cplusplus < 201402L
+// expected-error@-2 {{no matching constructor for initialization of 'NonAggr2'}}
+// expected-note@-6 3 {{candidate constructor}}
+#endif
+
+// no private...
+struct NonAggr3 { // expected-note 3 {{candidate constructor}}
+private:
+  int n;
+};
+NonAggr3 na3 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr3'}}
+
+// or protected non-static data members, ...
+struct NonAggr4 { // expected-note 3 {{candidate constructor}}
+protected:
+  int n;
+};
+NonAggr4 na4 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr4'}}
+
+// [pre-C++1z] no base classes, ...
+struct NonAggr5 : Aggr {
+};
+NonAggr5 na5 = { b };
+#if __cplusplus <= 201402L
+// expected-error@-2 {{no matching constructor for initialization of 'NonAggr5'}}
+// expected-note@-5 3 {{candidate constructor}}
+#endif
+template<typename...BaseList>
+struct MaybeAggr5a : BaseList... {};
+MaybeAggr5a<> ma5a0 = {}; // ok
+MaybeAggr5a<Aggr> ma5a1 = {}; // ok in C++17
+MaybeAggr5a<NonAggr2> m5a2 = {}; // ok, aggregate init in C++17, default ctor in C++11 and C++14
+MaybeAggr5a<NonAggr2> m5a3 = {0}; // ok in C++17, overrides default member initializer in base class
+#if __cplusplus <= 201402L
+// expected-error@-4 {{call to implicitly-deleted default constructor of 'MaybeAggr5a<Aggr>'}}
+// expected-note@-7 {{default constructor of 'MaybeAggr5a<Aggr>' is implicitly deleted because base class 'Aggr' has a deleted default constructor}}
+// expected-note@13 {{default constructor of 'Aggr' is implicitly deleted because field 'br' of reference type 'bool &' would not be initialized}}
+// expected-error@-5 {{no matching constructor}} expected-note@-9 3{{candidate}}
+#else
+// expected-error@-9 {{reference member of type 'bool &' uninitialized}}
+// expected-note@13 {{uninitialized reference member is here}}
+#endif
+
+// [C++1z] no virtual, protected, or private base classes, ...
+struct NonAggr5b : virtual Aggr {}; // expected-note 3{{candidate}}
+NonAggr5b na5b = { b }; // expected-error {{no matching constructor}}
+struct NonAggr5c : NonAggr5b {}; // expected-note 3{{candidate}}
+NonAggr5c na5c = { b }; // expected-error {{no matching constructor}}
+struct NonAggr5d : protected Aggr {}; // expected-note 3{{candidate}}
+NonAggr5d na5d = { b }; // expected-error {{no matching constructor}}
+struct NonAggr5e : private Aggr {}; // expected-note 3{{candidate}}
+NonAggr5e na5e = { b }; // expected-error {{no matching constructor}}
+class NonAggr5f : Aggr {}; // expected-note 3{{candidate}}
+NonAggr5f na5f = { b }; // expected-error {{no matching constructor}}
+
+// [C++1z] (the base class need not itself be an aggregate)
+struct MaybeAggr5g : NonAggr1a {};
+MaybeAggr5g ma5g1 = { 1 };
+MaybeAggr5g ma5g2 = { {1, 2} };
+MaybeAggr5g ma5g3 = {};
+#if __cplusplus <= 201402L
+// expected-error@-4 {{no matching constructor}} // expected-note@-5 3{{candidate}}
+// expected-error@-4 {{no matching constructor}} // expected-note@-6 3{{candidate}}
+// expected-error@-4 {{implicitly-deleted default constructor}} expected-note@-7 {{no default constructor}}
+#else
+// expected-error@-8 {{no viable conversion from 'int' to 'NonAggr1a'}} expected-note@19 2{{candidate}}
+// (ok)
+// expected-error@-8 {{no matching constructor}} expected-note@19 2{{candidate}} expected-note@20 {{candidate}}
+#endif
+
+// and no virtual functions.
+struct NonAggr6 { // expected-note 3 {{candidate constructor}}
+  virtual void f();
+  int n;
+};
+NonAggr6 na6 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr6'}}
+
+struct DefaultedAggr {
+  int n;
+
+  DefaultedAggr() = default;
+  DefaultedAggr(const DefaultedAggr &) = default;
+  DefaultedAggr(DefaultedAggr &&) = default;
+  DefaultedAggr &operator=(const DefaultedAggr &) = default;
+  DefaultedAggr &operator=(DefaultedAggr &&) = default;
+  ~DefaultedAggr() = default;
+};
+DefaultedAggr da = { 42 } ;
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3-0x.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3-0x.cpp
deleted file mode 100644
index d7ffd07..0000000
--- a/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3-0x.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
-
-namespace std {
-  typedef decltype(sizeof(int)) size_t;
-
-  template <typename E>
-  struct initializer_list
-  {
-    const E *p;
-    size_t n;
-    initializer_list(const E *p, size_t n) : p(p), n(n) {}
-  };
-
-  struct string {
-    string(const char *);
-  };
-
-  template<typename A, typename B>
-  struct pair {
-    pair(const A&, const B&);
-  };
-}
-
-namespace bullet1 {
-  double ad[] = { 1, 2.0 };
-  int ai[] = { 1, 2.0 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
-
-  struct S2 {
-    int m1;
-    double m2, m3;
-  };
-
-  S2 s21 = { 1, 2, 3.0 };
-  S2 s22 { 1.0, 2, 3 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
-  S2 s23 { };
-}
-
-namespace bullet4_example1 {
-  struct S {
-    S(std::initializer_list<double> d) {}
-    S(std::initializer_list<int> i) {}
-    S() {}
-  };
-
-  S s1 = { 1.0, 2.0, 3.0 };
-  S s2 = { 1, 2, 3 };
-  S s3 = { };
-}
-
-namespace bullet4_example2 {
-  struct Map {
-    Map(std::initializer_list<std::pair<std::string,int>>) {}
-  };
-
-  Map ship = {{"Sophie",14}, {"Surprise",28}};
-}
-
-namespace bullet4_example3 {
-  struct S {
-    S(int, double, double) {}
-    S() {}
-  };
-
-  S s1 = { 1, 2, 3.0 };
-  S s2 { 1.0, 2, 3 }; // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
-  S s3 {};
-}
-
-namespace bullet5 {
-  int x1 {2};
-  int x2 {2.0};  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
-}
-
-namespace bullet6 {
-  struct S {
-    S(std::initializer_list<double>) {}
-    S(const std::string &) {}
-  };
-
-  const S& r1 = { 1, 2, 3.0 };
-  const S& r2 = { "Spinach" };
-  S& r3 = { 1, 2, 3 };  // expected-error {{non-const lvalue reference to type 'bullet6::S' cannot bind to an initializer list temporary}}
-  const int& i1 = { 1 };
-  const int& i2 = { 1.1 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}} expected-warning {{implicit conversion}}
-  const int (&iar)[2] = { 1, 2 };
-}
-
-namespace bullet7 {
-  int** pp {};
-}
-
-namespace bullet8 {
-  struct A { int i; int j; };
-  A a1 { 1, 2 };
-  A a2 { 1.2 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}} expected-warning {{implicit conversion}}
-
-  struct B {
-    B(std::initializer_list<int> i) {}
-  };
-  B b1 { 1, 2 };
-  B b2 { 1, 2.0 }; // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
-
-  struct C {
-    C(int i, double j) {}
-  };
-  C c1 = { 1, 2.2 };
-  // FIXME: Suppress the narrowing warning in the cases where we issue a narrowing error.
-  C c2 = { 1.1, 2 }; // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}} expected-warning {{implicit conversion}}
-
-  int j { 1 };
-  int k { };
-}
-
-namespace rdar13395022 {
-  struct MoveOnly { // expected-note {{candidate}}
-    MoveOnly(MoveOnly&&); // expected-note 2{{copy constructor is implicitly deleted because}} expected-note {{candidate}}
-  };
-
-  void test(MoveOnly mo) {
-    auto &&list1 = {mo}; // expected-error {{call to implicitly-deleted copy constructor}} expected-note {{in initialization of temporary of type 'std::initializer_list}}
-    MoveOnly (&&list2)[1] = {mo}; // expected-error {{call to implicitly-deleted copy constructor}} expected-note {{in initialization of temporary of type 'rdar13395022::MoveOnly [1]'}}
-    std::initializer_list<MoveOnly> &&list3 = {};
-    MoveOnly (&&list4)[1] = {}; // expected-error {{no matching constructor}}
-    // expected-note@-1 {{in implicit initialization of array element 0 with omitted initializer}}
-    // expected-note@-2 {{in initialization of temporary of type 'rdar13395022::MoveOnly [1]' created to list-initialize this reference}}
-  }
-}
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3.cpp
new file mode 100644
index 0000000..f381ed7
--- /dev/null
+++ b/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3.cpp
@@ -0,0 +1,255 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
+// FIXME: Remove the triple when PR27098 is fixed.
+// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify %s -triple %itanium_abi_triple
+
+namespace std {
+  typedef decltype(sizeof(int)) size_t;
+
+  template <typename E>
+  struct initializer_list
+  {
+    const E *p;
+    size_t n;
+    initializer_list(const E *p, size_t n) : p(p), n(n) {}
+  };
+
+  struct string {
+    string(const char *);
+  };
+
+  template<typename A, typename B>
+  struct pair {
+    pair(const A&, const B&);
+  };
+}
+
+namespace bullet1 {
+  double ad[] = { 1, 2.0 };
+  int ai[] = { 1, 2.0 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
+
+  struct S2 {
+    int m1;
+    double m2, m3;
+  };
+
+  S2 s21 = { 1, 2, 3.0 };
+  S2 s22 { 1.0, 2, 3 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
+  S2 s23 { };
+}
+
+namespace bullet4_example1 {
+  struct S {
+    S(std::initializer_list<double> d) {}
+    S(std::initializer_list<int> i) {}
+    S() {}
+  };
+
+  S s1 = { 1.0, 2.0, 3.0 };
+  S s2 = { 1, 2, 3 };
+  S s3 = { };
+}
+
+namespace bullet4_example2 {
+  struct Map {
+    Map(std::initializer_list<std::pair<std::string,int>>) {}
+  };
+
+  Map ship = {{"Sophie",14}, {"Surprise",28}};
+}
+
+namespace bullet4_example3 {
+  struct S {
+    S(int, double, double) {}
+    S() {}
+  };
+
+  S s1 = { 1, 2, 3.0 };
+  S s2 { 1.0, 2, 3 }; // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
+  S s3 {};
+}
+
+namespace bullet5 {
+  int x1 {2};
+  int x2 {2.0};  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
+}
+
+namespace bullet6 {
+  struct S {
+    S(std::initializer_list<double>) {}
+    S(const std::string &) {}
+  };
+
+  const S& r1 = { 1, 2, 3.0 };
+  const S& r2 = { "Spinach" };
+  S& r3 = { 1, 2, 3 };  // expected-error {{non-const lvalue reference to type 'bullet6::S' cannot bind to an initializer list temporary}}
+  const int& i1 = { 1 };
+  const int& i2 = { 1.1 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}} expected-warning {{implicit conversion}}
+  const int (&iar)[2] = { 1, 2 };
+}
+
+namespace bullet7 {
+  int** pp {};
+}
+
+namespace bullet8 {
+  struct A { int i; int j; };
+  A a1 { 1, 2 };
+  A a2 { 1.2 };  // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}} expected-warning {{implicit conversion}}
+
+  struct B {
+    B(std::initializer_list<int> i) {}
+  };
+  B b1 { 1, 2 };
+  B b2 { 1, 2.0 }; // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}}
+
+  struct C {
+    C(int i, double j) {}
+  };
+  C c1 = { 1, 2.2 };
+  // FIXME: Suppress the narrowing warning in the cases where we issue a narrowing error.
+  C c2 = { 1.1, 2 }; // expected-error {{type 'double' cannot be narrowed to 'int' in initializer list}} expected-note {{silence}} expected-warning {{implicit conversion}}
+
+  int j { 1 };
+  int k { };
+}
+
+namespace rdar13395022 {
+  struct MoveOnly { // expected-note {{candidate}}
+    MoveOnly(MoveOnly&&); // expected-note 2{{copy constructor is implicitly deleted because}} expected-note {{candidate}}
+  };
+
+  void test(MoveOnly mo) {
+    auto &&list1 = {mo}; // expected-error {{call to implicitly-deleted copy constructor}} expected-note {{in initialization of temporary of type 'std::initializer_list}}
+    MoveOnly (&&list2)[1] = {mo}; // expected-error {{call to implicitly-deleted copy constructor}} expected-note {{in initialization of temporary of type 'rdar13395022::MoveOnly [1]'}}
+    std::initializer_list<MoveOnly> &&list3 = {};
+    MoveOnly (&&list4)[1] = {}; // expected-error {{no matching constructor}}
+    // expected-note@-1 {{in implicit initialization of array element 0 with omitted initializer}}
+    // expected-note@-2 {{in initialization of temporary of type 'rdar13395022::MoveOnly [1]' created to list-initialize this reference}}
+  }
+}
+
+namespace cxx1z_direct_enum_init {
+  enum A {};
+  enum B : char {};
+  enum class C {};
+  enum class D : char {};
+  enum class E : char { k = 5 };
+
+  template<typename T> void good() {
+    (void)T{0};
+    T t1{0};
+    T t2 = T{0};
+
+    struct S { T t; };
+    S s{T{0}};
+
+    struct U { T t{0}; } u; // expected-note 0+{{instantiation of}}
+
+    struct V { T t; V() : t{0} {} }; // expected-note 0+{{instantiation of}}
+
+    void f(T);
+    f(T{0});
+  }
+#if __cplusplus <= 201402L
+  // expected-error@-15 5{{cannot initialize}}
+  // expected-error@-15 5{{cannot initialize}}
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-15 5{{cannot initialize}}
+#else
+  // expected-error@-29 {{cannot initialize}}
+  // expected-error@-29 {{cannot initialize}}
+  // expected-error@-29 {{cannot initialize}}
+  //
+  //
+  // expected-error@-29 {{cannot initialize}}
+  //
+  // expected-error@-29 {{cannot initialize}}
+  //
+  // expected-error@-29 {{cannot initialize}}
+  //
+  //
+  // expected-error@-29 {{cannot initialize}}
+#endif
+
+  template<typename T> void bad() {
+    T t = {0};
+
+    struct S { T t; };
+    S s1{0};
+    S s2{{0}};
+
+    struct U { T t = {0}; } u; // expected-note 0+{{instantiation of}}
+
+    struct V { T t; V() : t({0}) {} }; // expected-note 0+{{instantiation of}}
+
+    void f(T); // expected-note 0+{{passing argument}}
+    f({0});
+  }
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-13 5{{cannot initialize}}
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-13 5{{cannot initialize}}
+
+  template<typename T> void ugly() {
+    extern char c;
+    T t1{char('0' + c)};
+    T t2{'0' + c};
+    T t3{1234};
+  }
+#if __cplusplus <= 201402L
+  // expected-error@-5 4{{cannot initialize}}
+  // expected-error@-5 4{{cannot initialize}}
+  // expected-error@-5 4{{cannot initialize}}
+#else
+  // expected-error@-8 3{{non-constant-expression cannot be narrowed}}
+  // expected-error@-8 3{{constant expression evaluates to 1234 which cannot be narrowed}} expected-warning@-8 {{changes value}}
+#endif
+
+  void test() {
+    good<A>(); // expected-note 4{{instantiation of}}
+    good<B>();
+    good<C>();
+    good<D>();
+    good<E>();
+#if __cplusplus <= 201402L
+    // expected-note@-5 4{{instantiation of}}
+    // expected-note@-5 4{{instantiation of}}
+    // expected-note@-5 4{{instantiation of}}
+    // expected-note@-5 4{{instantiation of}}
+#endif
+
+    bad<A>(); // expected-note 4{{instantiation of}}
+    bad<B>(); // expected-note 4{{instantiation of}}
+    bad<C>(); // expected-note 4{{instantiation of}}
+    bad<D>(); // expected-note 4{{instantiation of}}
+    bad<E>(); // expected-note 4{{instantiation of}}
+
+    ugly<B>(); // expected-note {{instantiation of}}
+    ugly<C>(); // ok
+    ugly<D>(); // expected-note {{instantiation of}}
+    ugly<E>(); // expected-note {{instantiation of}}
+#if __cplusplus <= 201402L
+    // expected-note@-4 {{instantiation of}}
+#else
+    (void)B{0.0}; // expected-error {{type 'double' cannot be narrowed}}
+#endif
+  }
+}
diff --git a/test/CXX/dcl.decl/dcl.init/p6.cpp b/test/CXX/dcl.decl/dcl.init/p6.cpp
index e404a1e..b646ba7 100644
--- a/test/CXX/dcl.decl/dcl.init/p6.cpp
+++ b/test/CXX/dcl.decl/dcl.init/p6.cpp
@@ -4,9 +4,9 @@
 
 // If a program calls for the default initialization of an object of a
 // const-qualified type T, T shall be a class type with a
-// user-provided default constructor.
+// user-provided default constructor, except if T has no uninitialized fields.
 struct MakeNonPOD { MakeNonPOD(); };
-struct NoUserDefault : public MakeNonPOD { };
+struct NoUserDefault : public MakeNonPOD { int field; };
 struct HasUserDefault { HasUserDefault(); };
 
 void test_const_default_init() {
@@ -16,7 +16,7 @@
 }
 
 // rdar://8501008
-struct s0 {};
+struct s0 { int field; };
 struct s1 { static const s0 foo; };
 const struct s0 s1::foo; // expected-error{{default initialization of an object of const type 'const struct s0' without a user-provided default constructor}}
 
diff --git a/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp b/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp
index 4686b1c..188a0a2 100644
--- a/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp
+++ b/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp
@@ -207,3 +207,7 @@
     int j() { return sizeof(d); }
   }
 }
+
+extern int arr[];
+void f1() { extern int arr[2]; } // expected-note {{previous}}
+void f2() { extern int arr[3]; } // expected-error {{different type: 'int [3]' vs 'int [2]'}}
diff --git a/test/CXX/drs/dr12xx.cpp b/test/CXX/drs/dr12xx.cpp
new file mode 100644
index 0000000..048c21a
--- /dev/null
+++ b/test/CXX/drs/dr12xx.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+
+// expected-no-diagnostics
+
+namespace dr1250 {  // dr1250: 3.9
+struct Incomplete;
+
+struct Base {
+  virtual const Incomplete *meow() = 0;
+};
+
+struct Derived : Base {
+  virtual Incomplete *meow();
+};
+} // dr1250
diff --git a/test/CXX/drs/dr13xx.cpp b/test/CXX/drs/dr13xx.cpp
index 37c144e..8c3e7f2 100644
--- a/test/CXX/drs/dr13xx.cpp
+++ b/test/CXX/drs/dr13xx.cpp
@@ -28,3 +28,19 @@
   }
 #endif
 }
+
+namespace dr1359 { // dr1359: 3.5
+#if __cplusplus >= 201103L
+  union A { constexpr A() = default; };
+  union B { constexpr B() = default; int a; }; // expected-error {{not constexpr}} expected-note 2{{candidate}}
+  union C { constexpr C() = default; int a, b; }; // expected-error {{not constexpr}} expected-note 2{{candidate}}
+  struct X { constexpr X() = default; union {}; };
+  struct Y { constexpr Y() = default; union { int a; }; }; // expected-error {{not constexpr}} expected-note 2{{candidate}}
+
+  constexpr A a = A();
+  constexpr B b = B(); // expected-error {{no matching}}
+  constexpr C c = C(); // expected-error {{no matching}}
+  constexpr X x = X();
+  constexpr Y y = Y(); // expected-error {{no matching}}
+#endif
+}
diff --git a/test/CXX/drs/dr15xx.cpp b/test/CXX/drs/dr15xx.cpp
index 7472be7..4398234 100644
--- a/test/CXX/drs/dr15xx.cpp
+++ b/test/CXX/drs/dr15xx.cpp
@@ -22,6 +22,31 @@
   const X &x = true ? get() : throw 0;
 }
 
+namespace dr1573 { // dr1573: 3.9
+#if __cplusplus >= 201103L
+  // ellipsis is inherited (p0136r1 supersedes this part).
+  struct A { A(); A(int, char, ...); };
+  struct B : A { using A::A; };
+  B b(1, 'x', 4.0, "hello"); // ok
+
+  // inherited constructor is effectively constexpr if the user-written constructor would be
+  struct C { C(); constexpr C(int) {} };
+  struct D : C { using C::C; };
+  constexpr D d = D(0); // ok
+  struct E : C { using C::C; A a; }; // expected-note {{non-literal type}}
+  constexpr E e = E(0); // expected-error {{non-literal type}}
+  // FIXME: This diagnostic is pretty bad; we should explain that the problem
+  // is that F::c would be initialized by a non-constexpr constructor.
+  struct F : C { using C::C; C c; }; // expected-note {{here}}
+  constexpr F f = F(0); // expected-error {{constant expression}} expected-note {{constructor inherited from base class 'C'}}
+
+  // inherited constructor is effectively deleted if the user-written constructor would be
+  struct G { G(int); };
+  struct H : G { using G::G; G g; }; // expected-note {{constructor inherited by 'H' is implicitly deleted because field 'g' has no default constructor}}
+  H h(0); // expected-error {{constructor inherited by 'H' from base class 'G' is implicitly deleted}}
+#endif
+}
+
 #if __cplusplus >= 201103L
 namespace std {
   typedef decltype(sizeof(int)) size_t;
diff --git a/test/CXX/drs/dr16xx.cpp b/test/CXX/drs/dr16xx.cpp
index ddb7d16..65467e3 100644
--- a/test/CXX/drs/dr16xx.cpp
+++ b/test/CXX/drs/dr16xx.cpp
@@ -18,8 +18,8 @@
 #endif
 }
 
+namespace dr1631 {  // dr1631: 3.7
 #if __cplusplus >= 201103L
-namespace dr1631 {  // dr1631: 3.7 c++11
   // Incorrect overload resolution for single-element initializer-list
 
   struct A { int a[1]; };
@@ -41,5 +41,22 @@
       f({0}, {{1}});        // expected-error{{call to 'f' is ambiguous}}
     }
   }
-} // dr1631
 #endif
+}
+
+namespace dr1645 { // dr1645: 3.9
+#if __cplusplus >= 201103L
+  struct A { // expected-note 2{{candidate}}
+    constexpr A(int, float = 0); // expected-note 2{{candidate}}
+    explicit A(int, int = 0); // expected-note 2{{candidate}}
+    A(int, int, int = 0) = delete; // expected-note {{candidate}}
+  };
+
+  struct B : A { // expected-note 2{{candidate}}
+    using A::A; // expected-note 7{{inherited here}}
+  };
+
+  constexpr B a(0); // expected-error {{ambiguous}}
+  constexpr B b(0, 0); // expected-error {{ambiguous}}
+#endif
+}
diff --git a/test/CXX/drs/dr17xx.cpp b/test/CXX/drs/dr17xx.cpp
index 1ab8c40..a917412 100644
--- a/test/CXX/drs/dr17xx.cpp
+++ b/test/CXX/drs/dr17xx.cpp
@@ -3,19 +3,63 @@
 // RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 
+#if __cplusplus < 201103L
 // expected-no-diagnostics
+#endif
 
+namespace dr1715 { // dr1715: 3.9
 #if __cplusplus >= 201103L
-namespace dr1756 {  // dr1756: 3.7 c++11
+  struct B {
+    template<class T> B(T, typename T::Q);
+  };
+
+  class S {
+    using Q = int;
+    template<class T> friend B::B(T, typename T::Q);
+  };
+
+  struct D : B {
+    using B::B;
+  };
+  struct E : B { // expected-note 2{{candidate}}
+    template<class T> E(T t, typename T::Q q) : B(t, q) {} // expected-note {{'Q' is a private member}}
+  };
+
+  B b(S(), 1);
+  D d(S(), 2);
+  E e(S(), 3); // expected-error {{no match}}
+#endif
+}
+
+namespace dr1736 { // dr1736: 3.9
+#if __cplusplus >= 201103L
+struct S {
+  template <class T> S(T t) {
+    struct L : S {
+      using S::S;
+    };
+    typename T::type value; // expected-error {{no member}}
+    L l(value); // expected-note {{instantiation of}}
+  }
+};
+struct Q { typedef int type; } q;
+S s(q); // expected-note {{instantiation of}}
+#endif
+}
+
+namespace dr1756 { // dr1756: 3.7
+#if __cplusplus >= 201103L
   // Direct-list-initialization of a non-class object
   
   int a{0};
   
   struct X { operator int(); } x;
   int b{x};
-} // dr1756
+#endif
+}
 
-namespace dr1758 {  // dr1758: 3.7 c++11
+namespace dr1758 { // dr1758: 3.7
+#if __cplusplus >= 201103L
   // Explicit conversion in copy/move list initialization
 
   struct X { X(); };
@@ -30,5 +74,5 @@
     operator A() { return A(); }
   } b;
   A a{b};
-} // dr1758
 #endif
+}
diff --git a/test/CXX/drs/dr19xx.cpp b/test/CXX/drs/dr19xx.cpp
index 368e7b3..5b626dd 100644
--- a/test/CXX/drs/dr19xx.cpp
+++ b/test/CXX/drs/dr19xx.cpp
@@ -39,6 +39,31 @@
 #endif
 }
 
+namespace dr1903 {
+  namespace A {
+    struct a {};
+    int a;
+    namespace B {
+      int b;
+    }
+    using namespace B;
+    namespace {
+      int c;
+    }
+    namespace D {
+      int d;
+    }
+    using D::d;
+  }
+  namespace X {
+    using A::a;
+    using A::b;
+    using A::c;
+    using A::d;
+    struct a *p;
+  }
+}
+
 namespace dr1909 { // dr1909: yes
   struct A {
     template<typename T> struct A {}; // expected-error {{member 'A' has the same name as its class}}
@@ -54,22 +79,52 @@
   };
 }
 
-#if __cplusplus >= 201103L
 namespace dr1940 { // dr1940: yes
+#if __cplusplus >= 201103L
 static union {
   static_assert(true, "");  // ok
   static_assert(false, ""); // expected-error {{static_assert failed}}
 };
-}
 #endif
+}
 
+namespace dr1941 { // dr1941: 3.9
 #if __cplusplus >= 201402L
+template<typename X>
+struct base {
+  template<typename T>
+  base(T a, T b, decltype(void(*T()), 0) = 0) {
+    while (a != b) (void)*a++;
+  }
+
+  template<typename T>
+  base(T a, X x, decltype(void(T(0) * 1), 0) = 0) {
+    for (T n = 0; n != a; ++n) (void)X(x);
+  }
+};
+
+struct derived : base<int> {
+  using base::base;
+};
+
+struct iter {
+  iter operator++(int);
+  int operator*();
+  friend bool operator!=(iter, iter);
+} it, end;
+
+derived d1(it, end);
+derived d2(42, 9);
+#endif
+}
+
 namespace dr1947 { // dr1947: yes
+#if __cplusplus >= 201402L
 unsigned o = 0'01;  // ok
 unsigned b = 0b'01; // expected-error {{invalid digit 'b' in octal constant}}
 unsigned x = 0x'01; // expected-error {{invalid suffix 'x'01' on integer constant}}
-}
 #endif
+}
 
 #if __cplusplus >= 201103L
 // dr1948: yes
@@ -77,10 +132,58 @@
 void *operator new(__SIZE_TYPE__) noexcept { return nullptr; } // expected-error{{exception specification in declaration does not match previous declaration}}
 #endif
 
+namespace dr1959 { // dr1959: 3.9
 #if __cplusplus >= 201103L
-namespace dr1968 { // dr1968: yes
-static_assert(&typeid(int) == &typeid(int), ""); // expected-error{{not an integral constant expression}}
-}
+  struct b;
+  struct c;
+  struct a {
+    a() = default;
+    a(const a &) = delete; // expected-note 2{{deleted}}
+    a(const b &) = delete; // not inherited
+    a(c &&) = delete; // expected-note {{deleted}}
+    template<typename T> a(T) = delete;
+  };
+
+  struct b : a { // expected-note {{copy constructor of 'b' is implicitly deleted because base class 'dr1959::a' has a deleted copy constructor}}
+    using a::a;
+  };
+
+  a x;
+  b y = x; // expected-error {{deleted}}
+  b z = z; // expected-error {{deleted}}
+
+  // FIXME: It's not really clear that this matches the intent, but it's
+  // consistent with the behavior for assignment operators.
+  struct c : a {
+    using a::a;
+    c(const c &);
+  };
+  c q(static_cast<c&&>(q)); // expected-error {{call to deleted}}
 #endif
+}
+
+namespace dr1968 { // dr1968: yes
+#if __cplusplus >= 201103L
+  static_assert(&typeid(int) == &typeid(int), ""); // expected-error{{not an integral constant expression}}
+#endif
+}
+
+namespace dr1991 { // dr1991: 3.9
+#if __cplusplus >= 201103L
+  struct A {
+    A(int, int) = delete;
+  };
+
+  struct B : A {
+    using A::A;
+    B(int, int, int = 0);
+  };
+
+  // FIXME: As a resolution to an open DR against P0136R1, we treat derived
+  // class constructors as better than base class constructors in the presence
+  // of ambiguity.
+  B b(0, 0); // ok, calls B constructor
+#endif
+}
 
 // dr1994: dup 529
diff --git a/test/CXX/drs/dr4xx.cpp b/test/CXX/drs/dr4xx.cpp
index bceea79..b1c21f8 100644
--- a/test/CXX/drs/dr4xx.cpp
+++ b/test/CXX/drs/dr4xx.cpp
@@ -702,8 +702,8 @@
   namespace X { namespace Q { int n; } }
   namespace Y {
     using X; // expected-error {{requires a qualified name}}
-    using dr460::X; // expected-error {{cannot refer to namespace}}
-    using X::Q; // expected-error {{cannot refer to namespace}}
+    using dr460::X; // expected-error {{cannot refer to a namespace}}
+    using X::Q; // expected-error {{cannot refer to a namespace}}
   }
 }
 
@@ -1197,12 +1197,12 @@
   int check6[ __is_trivially_assignable(B, const B&) ? 1 : -1];
 }
 
-namespace dr497 { // dr497: yes
+namespace dr497 { // dr497: sup 253
   void before() {
     struct S {
       mutable int i;
     };
-    const S cs; // expected-error {{default initialization}}
+    const S cs;
     int S::*pm = &S::i;
     cs.*pm = 88; // expected-error {{not assignable}}
   }
diff --git a/test/CXX/drs/dr5xx.cpp b/test/CXX/drs/dr5xx.cpp
index 96d3494..e0bab57 100644
--- a/test/CXX/drs/dr5xx.cpp
+++ b/test/CXX/drs/dr5xx.cpp
@@ -814,7 +814,7 @@
   }
 }
 
-namespace dr580 { // dr580: no
+namespace dr580 { // dr580: partial
   class C;
   struct A { static C c; };
   struct B { static C c; };
@@ -822,7 +822,7 @@
     C(); // expected-note {{here}}
     ~C(); // expected-note {{here}}
 
-    typedef int I; // expected-note {{here}}
+    typedef int I; // expected-note 2{{here}}
     template<int> struct X;
     template<int> friend struct Y;
     template<int> void f();
@@ -832,7 +832,20 @@
 
   template<C::I> struct C::X {};
   template<C::I> struct Y {};
-  template<C::I> struct Z {}; // FIXME: should reject, accepted because C befriends A!
+  template<C::I> struct Z {}; // expected-error {{private}}
+
+  struct C2 {
+    class X {
+      struct A;
+      typedef int I;
+      friend struct A;
+    };
+    class Y {
+      template<X::I> struct A {}; // FIXME: We incorrectly accept this
+                                  // because we think C2::Y::A<...> might
+                                  // instantiate to C2::X::A
+    };
+  };
 
   template<C::I> void C::f() {}
   template<C::I> void g() {}
diff --git a/test/CXX/drs/dr6xx.cpp b/test/CXX/drs/dr6xx.cpp
index 988c8f4..1d37a6d 100644
--- a/test/CXX/drs/dr6xx.cpp
+++ b/test/CXX/drs/dr6xx.cpp
@@ -146,9 +146,9 @@
 #if __cplusplus >= 201103L
   struct S { int n; } s;
   // FIXME: These should all be 'int &&'
-  using T = decltype(S().n); // expected-note 2{{previous}}
+  using T = decltype(S().n);
   using T = decltype(static_cast<S&&>(s).n);
-  using T = decltype(S().*&S::n);
+  using T = decltype(S().*&S::n); // expected-note 2{{previous}}
   using T = decltype(static_cast<S&&>(s).*&S::n); // expected-error {{different type}}
   using T = int&&; // expected-error {{different type}}
 #endif
diff --git a/test/CXX/except/except.spec/p14.cpp b/test/CXX/except/except.spec/p14.cpp
index 945a767..c717d97 100644
--- a/test/CXX/except/except.spec/p14.cpp
+++ b/test/CXX/except/except.spec/p14.cpp
@@ -124,14 +124,20 @@
   template<typename T> struct Throw {
     Throw() throw(T);
   };
-  struct Derived : Base, Throw<X<3>> {
+  struct Derived1 : Base, X<5> {
+    using Base::Base;
+    int n;
+  };
+  struct Derived2 : Base, Throw<X<3>> {
+    using Base::Base;
+  };
+  struct Derived3 : Base {
     using Base::Base;
     Throw<X<4>> x;
   };
-  struct Test {
-    friend Derived::Derived(X<0>) throw(X<3>, X<4>);
-    friend Derived::Derived(X<1>) noexcept(false);
-    friend Derived::Derived(X<2>) throw(X<2>, X<3>, X<4>);
-  };
-  static_assert(!noexcept(Derived{X<5>{}}), "");
+  static_assert(noexcept(Derived1(X<0>())), "");
+  static_assert(!noexcept(Derived1(X<1>())), "");
+  static_assert(!noexcept(Derived1(X<2>())), "");
+  static_assert(!noexcept(Derived2(X<0>())), "");
+  static_assert(!noexcept(Derived3(X<0>())), "");
 }
diff --git a/test/CXX/expr/expr.const/p2-0x.cpp b/test/CXX/expr/expr.const/p2-0x.cpp
index c519ecb..fd15960 100644
--- a/test/CXX/expr/expr.const/p2-0x.cpp
+++ b/test/CXX/expr/expr.const/p2-0x.cpp
@@ -242,8 +242,8 @@
     constexpr int n13 = n5 + n5; // expected-error {{constant expression}} expected-note {{value -4294967296 is outside the range of }}
     constexpr int n14 = n3 - n5; // expected-error {{constant expression}} expected-note {{value 4294967295 is outside the range of }}
     constexpr int n15 = n5 * n5; // expected-error {{constant expression}} expected-note {{value 4611686018427387904 is outside the range of }}
-    constexpr signed char c1 = 100 * 2; // ok
-    constexpr signed char c2 = '\x64' * '\2'; // also ok
+    constexpr signed char c1 = 100 * 2; // ok expected-warning{{changes value}}
+    constexpr signed char c2 = '\x64' * '\2'; // also ok  expected-warning{{changes value}}
     constexpr long long ll1 = 0x7fffffffffffffff; // ok
     constexpr long long ll2 = ll1 + 1; // expected-error {{constant}} expected-note {{ 9223372036854775808 }}
     constexpr long long ll3 = -ll1 - 1; // ok
diff --git a/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp b/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
new file mode 100644
index 0000000..bae1e25
--- /dev/null
+++ b/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z %s -verify
+
+class NonCopyable {
+  NonCopyable(const NonCopyable&) = delete; //expected-note3{{explicitly marked deleted here}}
+  int x = 10;
+  void foo() {
+    auto L = [this] { return x; };
+    const auto &M = [*this] { return x; };//expected-error{{call to deleted}}
+    const auto &M2 = [this] () -> auto&& {
+      ++x;
+      return [*this] {  //expected-error{{call to deleted}} expected-warning{{reference to local}}
+         return ++x; //expected-error{{read-only}}
+      }; 
+    };
+    const auto &M3 = [*this] () mutable -> auto&& { //expected-error{{call to deleted}} 
+      ++x;
+      return [this] {  // expected-warning{{reference to local}}
+         return x;
+      }; 
+    };
+  }  
+};
diff --git a/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp b/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp
index dc2c209..b8504d4 100644
--- a/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp
+++ b/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp
@@ -11,7 +11,7 @@
 
 template<typename T>
 struct bogus_override_if_virtual : public T {
-  bogus_override_if_virtual() : T(*(T*)0) { }
+  bogus_override_if_virtual() : T(*(T*)0) { } // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   int operator()() const;
 };
 
@@ -36,7 +36,7 @@
   lv(); // expected-error{{no matching function for call to object of type}}
   mlv(); // expected-error{{no matching function for call to object of type}}
 
-  bogus_override_if_virtual<decltype(l)> bogus;
+  bogus_override_if_virtual<decltype(l)> bogus; // expected-note{{in instantiation of member function 'bogus_override_if_virtual<(lambda}}
 }
 
 // Core issue 974: default arguments (8.3.6) may be specified in the
diff --git a/test/CXX/over/over.oper/over.literal/p5.cpp b/test/CXX/over/over.oper/over.literal/p5.cpp
index 66f3f97..bfad5f0 100644
--- a/test/CXX/over/over.oper/over.literal/p5.cpp
+++ b/test/CXX/over/over.oper/over.literal/p5.cpp
@@ -12,11 +12,11 @@
   friend U operator "" _a(const T *, size_t); // expected-error {{parameter}}
 };
 template<char...> struct V {
-  friend void operator "" _b(); // expected-error {{parameter}}
+  friend void operator "" _b(); // expected-error {{parameters}}
 };
 
-template<char... C, int N = 0> void operator "" _b(); // expected-error {{parameter}}
-template<char... C> void operator "" _b(int N = 0); // expected-error {{parameter}}
-template<char, char...> void operator "" _b(); // expected-error {{parameter}}
-template<typename T> T operator "" _b(const char *); // expected-error {{parameter}}
-template<typename T> int operator "" _b(const T *, size_t); // expected-error {{parameter}}
+template<char... C, int N = 0> void operator "" _b(); // expected-error {{template}}
+template<char... C> void operator "" _b(int N = 0); // expected-error {{template}}
+template<char, char...> void operator "" _b(); // expected-error {{template}}
+template<typename T> T operator "" _b(const char *); // expected-error {{template}}
+template<typename T> int operator "" _b(const T *, size_t); // expected-error {{template}}
diff --git a/test/CXX/over/over.oper/over.literal/p8.cpp b/test/CXX/over/over.oper/over.literal/p8.cpp
index 70a1843..6644bae 100644
--- a/test/CXX/over/over.oper/over.literal/p8.cpp
+++ b/test/CXX/over/over.oper/over.literal/p8.cpp
@@ -12,6 +12,6 @@
 float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
 string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
 double operator "" _miles(double); // expected-error {{parameter}}
-template<char...> int operator "" j(const char*); // expected-error {{parameter}}
+template<char...> int operator "" j(const char*); // expected-error {{template}}
 
 float operator ""_E(const char *);
diff --git a/test/CXX/special/class.copy/implicit-move-def.cpp b/test/CXX/special/class.copy/implicit-move-def.cpp
index 880268d..f344b0c 100644
--- a/test/CXX/special/class.copy/implicit-move-def.cpp
+++ b/test/CXX/special/class.copy/implicit-move-def.cpp
@@ -1,4 +1,4 @@
-// FIXME: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck -check-prefix=CHECK %s
+// FIXME: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck %s
 // RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck -check-prefix=CHECK-ASSIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck -check-prefix=CHECK-CTOR %s
 
diff --git a/test/CXX/special/class.copy/p11.0x.move.cpp b/test/CXX/special/class.copy/p11.0x.move.cpp
index 514817d..ab42595 100644
--- a/test/CXX/special/class.copy/p11.0x.move.cpp
+++ b/test/CXX/special/class.copy/p11.0x.move.cpp
@@ -4,6 +4,9 @@
 struct NonTrivial {
   NonTrivial(NonTrivial&&); // expected-note{{copy constructor is implicitly deleted}}
 };
+struct DeletedCopy {
+  DeletedCopy(const DeletedCopy&) = delete;
+};
 
 // A defaulted move constructor for a class X is defined as deleted if X has:
 
@@ -22,6 +25,15 @@
 };
 DeletedNTVariant2::DeletedNTVariant2(DeletedNTVariant2&&) = default; // expected-error{{would delete}}
 
+// Note, move constructor is not a candidate because it is deleted.
+template<typename T> struct DeletedNTVariant3 { // expected-note 2{{default}} expected-note 2{{copy}}
+  union {
+    T NT;
+  };
+};
+extern DeletedNTVariant3<NonTrivial> dntv3a(0); // expected-error {{no matching}}
+extern DeletedNTVariant3<DeletedCopy> dntv3a(0); // expected-error {{no matching}}
+
 // -- a non-static data member of class type M (or array thereof) that cannot be
 //    copied because overload resolution results in an ambiguity or a function
 //    that is deleted or inaccessible
diff --git a/test/CXX/special/class.inhctor/p1.cpp b/test/CXX/special/class.inhctor/p1.cpp
index fa0416e..c006abe 100644
--- a/test/CXX/special/class.inhctor/p1.cpp
+++ b/test/CXX/special/class.inhctor/p1.cpp
@@ -1,53 +1,55 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
-// Per a core issue (no number yet), an ellipsis is always dropped.
-struct A {
-  A(...); // expected-note {{here}}
-  A(int = 0, int = 0, int = 0, int = 0, ...); // expected-note 9{{here}} expected-note 2{{constructor cannot be inherited}}
-  A(int = 0, int = 0, ...); // expected-note {{here}}
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
-  template<typename T> A(T, int = 0, ...); // expected-note 5{{here}}
+struct A { // expected-note 8{{candidate is the implicit}}
+  A(...); // expected-note 4{{candidate constructor}} expected-note 4{{candidate inherited constructor}}
+  A(int = 0, int = 0, int = 0, int = 0, ...); // expected-note 3{{candidate constructor}} expected-note 3{{candidate inherited constructor}}
+  A(int = 0, int = 0, ...); // expected-note 3{{candidate constructor}} expected-note 3{{candidate inherited constructor}}
 
-  template<typename T, int N> A(const T (&)[N]); // expected-note 2{{here}} expected-note {{constructor cannot be inherited}}
-  template<typename T, int N> A(const T (&)[N], int = 0); // expected-note 2{{here}}
+  template<typename T> A(T, int = 0, ...); // expected-note 3{{candidate constructor}} expected-note 3{{candidate inherited constructor}}
+
+  template<typename T, int N> A(const T (&)[N]); // expected-note {{candidate constructor}} expected-note {{candidate inherited constructor}}
+  template<typename T, int N> A(const T (&)[N], int = 0); // expected-note {{candidate constructor}} expected-note {{candidate inherited constructor}}
 };
 
-struct B : A { // expected-note 6{{candidate}}
-  using A::A; // expected-warning 4{{inheriting constructor does not inherit ellipsis}} expected-note 16{{candidate}} expected-note 3{{deleted constructor was inherited here}}
+struct B : A { // expected-note 4{{candidate is the implicit}}
+  using A::A; // expected-note 19{{inherited here}}
+  B(void*);
 };
 
 struct C {} c;
 
-B b0{};
-// expected-error@-1 {{call to implicitly-deleted default constructor of 'B'}}
-// expected-note@-8 {{default constructor of 'B' is implicitly deleted because base class 'A' has multiple default constructors}}
+A a0{}; // expected-error {{ambiguous}}
+B b0{}; // expected-error {{ambiguous}}
 
-B b1{1};
-// expected-error@-1 {{call to deleted constructor of 'B'}}
+A a1{1}; // expected-error {{ambiguous}}
+B b1{1}; // expected-error {{ambiguous}}
 
-B b2{1,2};
-// expected-error@-1 {{call to deleted constructor of 'B'}}
+A a2{1,2}; // expected-error {{ambiguous}}
+B b2{1,2}; // expected-error {{ambiguous}}
 
-B b3{1,2,3};
-// ok
+A a3{1,2,3}; // ok
+B b3{1,2,3}; // ok
 
-B b4{1,2,3,4};
-// ok
+A a4{1,2,3,4}; // ok
+B b4{1,2,3,4}; // ok
 
-B b5{1,2,3,4,5};
-// expected-error@-1 {{no matching constructor for initialization of 'B'}}
+A a5{1,2,3,4,5}; // ok
+B b5{1,2,3,4,5}; // ok
 
-B b6{c};
-// ok
+A a6{c}; // ok
+B b6{c}; // ok
 
-B b7{c,0};
-// ok
+A a7{c,0}; // ok
+B b7{c,0}; // ok
 
-B b8{c,0,1};
-// expected-error@-1 {{no matching constructor}}
+A a8{c,0,1}; // ok
+B b8{c,0,1}; // ok
 
-B b9{"foo"};
-// FIXME: explain why the inheriting constructor was deleted
-// expected-error@-2 {{call to deleted constructor of 'B'}}
+A a9{"foo"}; // expected-error {{ambiguous}}
+B b9{"foo"}; // expected-error {{ambiguous}}
 
 namespace PR15755 {
   struct X {
diff --git a/test/CXX/special/class.inhctor/p2.cpp b/test/CXX/special/class.inhctor/p2.cpp
index d1c16ff..f84dc64 100644
--- a/test/CXX/special/class.inhctor/p2.cpp
+++ b/test/CXX/special/class.inhctor/p2.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 template<int> struct X {};
 
@@ -8,10 +11,10 @@
 //   - absence or presence of explicit
 //   - absence or presence of constexpr
 struct A {
-  A(X<0>) {} // expected-note 2{{here}}
+  A(X<0>) {} // expected-note 4{{here}}
   constexpr A(X<1>) {}
-  explicit A(X<2>) {} // expected-note 3{{here}}
-  explicit constexpr A(X<3>) {} // expected-note 2{{here}}
+  explicit A(X<2>) {} // expected-note 6{{here}}
+  explicit constexpr A(X<3>) {} // expected-note 4{{here}}
 };
 
 A a0 { X<0>{} };
@@ -36,7 +39,7 @@
 
 
 struct B : A {
-  using A::A; // expected-note 7{{here}}
+  using A::A;
 };
 
 B b0 { X<0>{} };
@@ -62,14 +65,19 @@
 
 // 'constexpr' is OK even if the constructor doesn't obey the constraints.
 struct NonLiteral { NonLiteral(); };
-struct NonConstexpr { NonConstexpr(); constexpr NonConstexpr(int); }; // expected-note {{here}}
+struct NonConstexpr { NonConstexpr(); constexpr NonConstexpr(int); };
 struct Constexpr { constexpr Constexpr(int) {} };
 
 struct BothNonLiteral : NonLiteral, Constexpr { using Constexpr::Constexpr; }; // expected-note {{base class 'NonLiteral' of non-literal type}}
 constexpr BothNonLiteral bothNL{42}; // expected-error {{constexpr variable cannot have non-literal type 'const BothNonLiteral'}}
 
-struct BothNonConstexpr : NonConstexpr, Constexpr { using Constexpr::Constexpr; }; // expected-note {{non-constexpr constructor 'NonConstexpr}}
-constexpr BothNonConstexpr bothNC{42}; // expected-error {{must be initialized by a constant expression}} expected-note {{in call to 'BothNonConstexpr(42)'}}
+// FIXME: This diagnostic is not very good. We should explain that the problem is that base class NonConstexpr cannot be initialized.
+struct BothNonConstexpr
+    : NonConstexpr,
+      Constexpr {
+  using Constexpr::Constexpr; // expected-note {{here}}
+};
+constexpr BothNonConstexpr bothNC{42}; // expected-error {{must be initialized by a constant expression}} expected-note {{inherited from base class 'Constexpr'}}
 
 
 struct ConstexprEval {
@@ -87,25 +95,25 @@
 static_assert(ce.k2 == 'x', "");
 
 
-struct TemplateCtors {
-  constexpr TemplateCtors() {}
-  template<template<int> class T> TemplateCtors(X<0>, T<0>);
-  template<int N> TemplateCtors(X<1>, X<N>);
-  template<typename T> TemplateCtors(X<2>, T);
+struct TemplateCtors { // expected-note 2{{candidate constructor (the implicit}}
+  constexpr TemplateCtors() {} // expected-note {{candidate inherited constructor}}
+  template<template<int> class T> TemplateCtors(X<0>, T<0>); // expected-note {{here}} expected-note {{candidate inherited constructor}}
+  template<int N> TemplateCtors(X<1>, X<N>); // expected-note {{here}} expected-note {{candidate inherited constructor}}
+  template<typename T> TemplateCtors(X<2>, T); // expected-note {{here}} expected-note {{candidate inherited constructor}}
 
-  template<typename T = int> TemplateCtors(int, int = 0, int = 0); // expected-note {{inherited from here}}
+  template<typename T = int> TemplateCtors(int, int = 0, int = 0);
 };
 
-struct UsingTemplateCtors : TemplateCtors {  // expected-note 2{{candidate is the implicit}}
-  using TemplateCtors::TemplateCtors; // expected-note 4{{here}} expected-note {{candidate}}
+struct UsingTemplateCtors : TemplateCtors { // expected-note 2{{candidate constructor (the implicit}}
+  using TemplateCtors::TemplateCtors; // expected-note 6{{inherited here}}
 
-  constexpr UsingTemplateCtors(X<0>, X<0>) {}
-  constexpr UsingTemplateCtors(X<1>, X<1>) {}
-  constexpr UsingTemplateCtors(X<2>, X<2>) {}
+  constexpr UsingTemplateCtors(X<0>, X<0>) {} // expected-note {{not viable}}
+  constexpr UsingTemplateCtors(X<1>, X<1>) {} // expected-note {{not viable}}
+  constexpr UsingTemplateCtors(X<2>, X<2>) {} // expected-note {{not viable}}
 
-  template<int = 0> constexpr UsingTemplateCtors(int) {} // expected-note {{candidate}}
-  template<typename T = void> constexpr UsingTemplateCtors(int, int) {}
-  template<typename T, typename U> constexpr UsingTemplateCtors(int, int, int) {}
+  template<int = 0> constexpr UsingTemplateCtors(int) {} // expected-note {{not viable}}
+  template<typename T = void> constexpr UsingTemplateCtors(int, int) {} // expected-note {{not viable}}
+  template<typename T, typename U> constexpr UsingTemplateCtors(int, int, int) {} // expected-note {{couldn't infer}}
 };
 
 template<int> struct Y {};
@@ -116,6 +124,10 @@
 constexpr UsingTemplateCtors uct5{ X<2>{}, 0 }; // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr}}
 constexpr UsingTemplateCtors uct6{ X<2>{}, X<2>{} };
 
-constexpr UsingTemplateCtors utc7{ 0 }; // expected-error {{ambiguous}}
+constexpr UsingTemplateCtors utc7{ 0 }; // ok
 constexpr UsingTemplateCtors utc8{ 0, 0 }; // ok
-constexpr UsingTemplateCtors utc9{ 0, 0, 0 }; // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr}}
+// FIXME: The standard says that UsingTemplateCtors' (int, int, int) constructor
+// hides the one from TemplateCtors, even though the template parameter lists
+// don't match. It's not clear that that's *really* the intent, and it's not
+// what other compilers do.
+constexpr UsingTemplateCtors utc9{ 0, 0, 0 }; // expected-error {{no matching constructor}}
diff --git a/test/CXX/special/class.inhctor/p3.cpp b/test/CXX/special/class.inhctor/p3.cpp
index 7aaaa7a..7f05487 100644
--- a/test/CXX/special/class.inhctor/p3.cpp
+++ b/test/CXX/special/class.inhctor/p3.cpp
@@ -1,8 +1,11 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 struct B1 {
-  B1(int);
-  B1(int, int);
+  B1(int); // expected-note 3{{target of using}}
+  B1(int, int); // expected-note 3{{target of using}}
 };
 struct D1 : B1 {
   using B1::B1;
@@ -11,48 +14,56 @@
 
 D1 fd1() { return 1; }
 
-struct B2 {
+struct B2 { // expected-note 2{{candidate}}
   explicit B2(int, int = 0, int = 0);
 };
-struct D2 : B2 { // expected-note 2 {{candidate constructor}}
-  using B2::B2;
+struct D2 : B2 { // expected-note 2{{candidate constructor}}
+  using B2::B2; // expected-note 2{{inherited here}}
 };
 D2 d2a(1), d2b(1, 1), d2c(1, 1, 1);
 
 D2 fd2() { return 1; } // expected-error {{no viable conversion}}
 
-struct B3 {
-  B3(void*); // expected-note {{inherited from here}}
+struct B3 { // expected-note 2{{candidate}}
+  B3(void*); // expected-note {{candidate}}
 };
-struct D3 : B3 { // expected-note 2 {{candidate constructor}}
-  using B3::B3; // expected-note {{candidate constructor (inherited)}}
+struct D3 : B3 { // expected-note 2{{candidate constructor}}
+  using B3::B3; // expected-note 3{{inherited here}}
 };
 D3 fd3() { return 1; } // expected-error {{no viable conversion}}
 
 template<typename T> struct T1 : B1 {
-  using B1::B1;
+  using B1::B1; // expected-note 2{{using declaration}}
 };
 template<typename T> struct T2 : T1<T> {
-  using T1<int>::T1;
+  using T1<int>::T1; // expected-note 2{{using declaration}}
 };
 template<typename T> struct T3 : T1<int> {
-  using T1<T>::T1;
+  using T1<T>::T1; // expected-note 2{{using declaration}}
 };
 struct U {
-  friend T1<int>::T1(int);
-  friend T1<int>::T1(int, int);
-  friend T2<int>::T2(int);
-  friend T2<int>::T2(int, int);
-  friend T3<int>::T3(int);
-  friend T3<int>::T3(int, int);
+  // [dcl.meaning]p1: "the member shall not merely have been introduced by a
+  // using-declaration in the scope of the class [...] nominated by the
+  // nested-name-specifier of the declarator-id"
+  friend T1<int>::T1(int); // expected-error {{cannot befriend target of using declaration}}
+  friend T1<int>::T1(int, int); // expected-error {{cannot befriend target of using declaration}}
+  friend T2<int>::T2(int); // expected-error {{cannot befriend target of using declaration}}
+  friend T2<int>::T2(int, int); // expected-error {{cannot befriend target of using declaration}}
+  friend T3<int>::T3(int); // expected-error {{cannot befriend target of using declaration}}
+  friend T3<int>::T3(int, int); // expected-error {{cannot befriend target of using declaration}}
 };
 
 struct B4 {
-  template<typename T> explicit B4(T, int = 0);
+  template<typename T> explicit B4(T, int = 0); // expected-note 2{{here}}
 };
 template<typename T> struct T4 : B4 {
-  using B4::B4; // expected-note {{here}}
+  using B4::B4;
   template<typename U> T4(U);
 };
+template<typename T> struct U4 : T4<T> {
+  using T4<T>::T4;
+};
 T4<void> t4a = {0};
 T4<void> t4b = {0, 0}; // expected-error {{chosen constructor is explicit}}
+U4<void> u4a = {0};
+U4<void> u4b = {0, 0}; // expected-error {{chosen constructor is explicit}}
diff --git a/test/CXX/special/class.inhctor/p4.cpp b/test/CXX/special/class.inhctor/p4.cpp
index ae1f7a5..69fbea3 100644
--- a/test/CXX/special/class.inhctor/p4.cpp
+++ b/test/CXX/special/class.inhctor/p4.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 template<int> struct X {};
 
@@ -8,20 +11,20 @@
 public:
   A(X<0>) {}
 protected:
-  A(X<1>) {}
+  A(X<1>) {} // expected-note 2{{declared protected here}}
 private:
-  A(X<2>) {} // expected-note {{declared private here}}
+  A(X<2>) {} // expected-note 2{{declared private here}}
   friend class FA;
 };
 
 struct B : A {
-  using A::A; // expected-error {{private constructor}} expected-note {{implicitly declared protected here}}
+  using A::A;
   friend class FB;
 };
 
 B b0{X<0>{}};
 B b1{X<1>{}}; // expected-error {{calling a protected constructor}}
-B b2{X<2>{}}; // expected-note {{first required here}}
+B b2{X<2>{}}; // expected-error {{calling a private constructor}}
 
 struct C : B {
   C(X<0> x) : B(x) {}
@@ -34,7 +37,7 @@
 };
 
 struct FA : A {
-  using A::A; // expected-note 2{{here}}
+  using A::A;
 };
 FA fa0{X<0>{}};
 FA fa1{X<1>{}}; // expected-error {{calling a protected constructor}}
@@ -47,7 +50,7 @@
   template<typename T> G(T*) = delete; // expected-note {{'G<const char>' has been explicitly marked deleted here}}
 };
 struct H : G {
-  using G::G; // expected-note 2{{deleted constructor was inherited here}}
+  using G::G;
 };
 H h1(5); // expected-error {{call to deleted constructor of 'H'}}
 H h2("foo"); // expected-error {{call to deleted constructor of 'H'}}
@@ -57,15 +60,15 @@
 // same signature.
 namespace DRnnnn {
   struct A {
-    constexpr A(int, float = 0) {}
-    explicit A(int, int = 0) {} // expected-note {{constructor cannot be inherited}}
+    constexpr A(int, float = 0) {} // expected-note {{candidate}}
+    explicit A(int, int = 0) {} // expected-note {{candidate}}
 
-    A(int, int, int = 0) = delete;
+    A(int, int, int = 0) = delete; // expected-note {{deleted}}
   };
   struct B : A {
-    using A::A; // expected-note {{here}}
+    using A::A; // expected-note 3{{inherited here}}
   };
 
   constexpr B b0(0, 0.0f); // ok, constexpr
-  B b1(0, 1); // expected-error {{call to deleted constructor of 'DRnnnn::B'}}
+  B b1(0, 1); // expected-error {{call to constructor of 'DRnnnn::B' is ambiguous}}
 }
diff --git a/test/CXX/special/class.inhctor/p7.cpp b/test/CXX/special/class.inhctor/p7.cpp
index a57e855..c22a43a 100644
--- a/test/CXX/special/class.inhctor/p7.cpp
+++ b/test/CXX/special/class.inhctor/p7.cpp
@@ -1,47 +1,48 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
-// Straight from the standard
-struct B1 {
-  B1(int); // expected-note {{previous constructor}} expected-note {{conflicting constructor}}
+struct B1 { // expected-note 2{{candidate}}
+  B1(int); // expected-note {{candidate}}
 };
-struct B2 {
-  B2(int); // expected-note {{conflicting constructor}}
+struct B2 { // expected-note 2{{candidate}}
+  B2(int); // expected-note {{candidate}}
 };
-struct D1 : B1, B2 {
-  using B1::B1; // expected-note {{inherited here}}
-  using B2::B2; // expected-error {{already inherited constructor with the same signature}}
+struct D1 : B1, B2 { // expected-note 2{{candidate}}
+  using B1::B1; // expected-note 3{{inherited here}}
+  using B2::B2; // expected-note 3{{inherited here}}
 };
 struct D2 : B1, B2 {
   using B1::B1;
   using B2::B2;
   D2(int);
 };
+D1 d1(0); // expected-error {{ambiguous}}
+D2 d2(0);
 
 template<typename T> struct B3 {
-  B3(T); // expected-note {{previous constructor}}
+  B3(T);
 };
 template<typename T> struct B4 : B3<T>, B1 {
   B4();
-  using B3<T>::B3; // expected-note {{inherited here}}
-  using B1::B1; // expected-error {{already inherited}}
+  using B3<T>::B3;
+  using B1::B1;
 };
 B4<char> b4c;
-B4<int> b4i; // expected-note {{here}}
+B4<int> b4i;
 
 struct B5 {
-  template<typename T> B5(T); // expected-note {{previous constructor}}
+  template<typename T> B5(T);
 };
-struct B6 {
-  template<typename T> B6(T); // expected-note {{conflicting constructor}}
-};
-struct B7 {
-  template<typename T, int> B7(T);
-};
-struct D56 : B5, B6, B7 {
-  using B5::B5; // expected-note {{inherited here}}
-  using B6::B6; // expected-error {{already inherited}}
-};
-struct D57 : B5, B6, B7 {
+struct D6 : B5 {
   using B5::B5;
-  using B7::B7; // ok, not the same signature
+  template<typename T> D6(T);
 };
+D6 d6(0);
+struct D7 : B5 {
+  using B5::B5;
+  template<typename T> D7(T, ...);
+};
+// DRxxx (no number yet): derived class ctor beats base class ctor.
+D7 d7(0);
diff --git a/test/CXX/special/class.inhctor/p8.cpp b/test/CXX/special/class.inhctor/p8.cpp
index effc2c3..58c01d2 100644
--- a/test/CXX/special/class.inhctor/p8.cpp
+++ b/test/CXX/special/class.inhctor/p8.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 struct A {
   constexpr A(const int&) : rval(false) {}
@@ -13,8 +16,6 @@
 constexpr A a0{0};
 constexpr A a1{k};
 constexpr B b0{0};
-// This performs static_cast<(const int&)&&>(k), so calls the A(const int&)
-// constructor.
 constexpr B b1{k};
 
 static_assert(a0.rval && !a1.rval && b0.rval && !b1.rval, "");
@@ -28,5 +29,4 @@
 };
 static_assert(D(123).v == 123, "");
 
-// FIXME: This diagnostic sucks.
-template<typename T> constexpr D::D(T t) : C(t) {} // expected-error {{definition of implicitly declared function}}
+template<typename T> constexpr D::D(T t) : C(t) {} // expected-error {{does not match any declaration in 'D'}}
diff --git a/test/CXX/special/class.init/class.inhctor.init/p1.cpp b/test/CXX/special/class.init/class.inhctor.init/p1.cpp
new file mode 100644
index 0000000..e07d879
--- /dev/null
+++ b/test/CXX/special/class.init/class.inhctor.init/p1.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+namespace std_example {
+  struct B1 {
+    B1(int, ...) {}
+  };
+
+  struct B2 {
+    B2(double) {}
+  };
+
+  int get();
+
+  struct D1 : B1 { // expected-note {{no default constructor}}
+    using B1::B1; // inherits B1(int, ...)
+    int x;
+    int y = get();
+  };
+
+  void test() {
+    D1 d(2, 3, 4); // OK: B1 is initialized by calling B1(2, 3, 4),
+    // then d.x is default-initialized (no initialization is performed),
+    // then d.y is initialized by calling get()
+    D1 e; // expected-error {{implicitly-deleted}}
+  }
+
+  struct D2 : B2 {
+    using B2::B2;
+    B1 b; // expected-note {{constructor inherited by 'D2' is implicitly deleted because field 'b' has no default constructor}}
+  };
+
+  D2 f(1.0); // expected-error {{constructor inherited by 'D2' from base class 'B2' is implicitly deleted}}
+
+  struct W {
+    W(int);
+  };
+  struct X : virtual W {
+    using W::W;
+    X() = delete;
+  };
+  struct Y : X {
+    using X::X;
+  };
+  struct Z : Y, virtual W {
+    using Y::Y;
+  };
+  Z z(0); // OK: initialization of Y does not invoke default constructor of X
+
+  template <class T> struct Log : T {
+    using T::T; // inherits all constructors from class T
+    ~Log() { /* ... */ }
+  };
+}
+
+namespace vbase {
+  struct V {
+    V(int);
+  };
+
+  struct A : virtual V {
+    A() = delete; // expected-note 2{{deleted here}} expected-note {{deleted}}
+    using V::V;
+  };
+  struct B : virtual V { // expected-note {{no default constructor}}
+    B() = delete; // expected-note 2{{deleted here}}
+    B(int, int);
+    using V::V;
+  };
+  struct C : B { // expected-note {{deleted default constructor}}
+    using B::B;
+  };
+  struct D : A, C { // expected-note {{deleted default constructor}} expected-note {{deleted corresponding constructor}}
+    using A::A;
+    using C::C;
+  };
+
+  A a0; // expected-error {{deleted}}
+  A a1(0);
+  B b0; // expected-error {{deleted}}
+  B b1(0);
+  B b2(0, 0);
+  C c0; // expected-error {{deleted}}
+  C c1(0);
+  C c2(0, 0); // expected-error {{deleted}}
+  D d0; // expected-error {{deleted}}
+  D d1(0);
+  D d2(0, 0); // expected-error {{deleted}}
+}
+
+namespace constexpr_init_order {
+  struct Param;
+  struct A {
+    constexpr A(Param);
+    int a;
+  };
+
+  struct B : A { B(); using A::A; int b = 2; };
+  extern const B b;
+
+  struct Param {
+    constexpr Param(int c) : n(4 * b.a + b.b + c) {}
+    int n;
+  };
+
+  constexpr A::A(Param p) : a(p.n) {}
+
+  constexpr B b(1);
+  constexpr B c(1);
+  static_assert(b.a == 1, "p should be initialized before B() is executed");
+  static_assert(c.a == 7, "b not initialized properly");
+}
+
+namespace default_args {
+  // We work around a defect in P0136R1 where it would reject reasonable
+  // code like the following:
+  struct Base {
+    Base(int = 0);
+  };
+  struct Derived : Base {
+    using Base::Base;
+  };
+  Derived d;
+  // FIXME: Once a fix is standardized, implement it.
+}
diff --git a/test/CXX/special/class.init/class.inhctor.init/p2.cpp b/test/CXX/special/class.init/class.inhctor.init/p2.cpp
new file mode 100644
index 0000000..7ea2ccc
--- /dev/null
+++ b/test/CXX/special/class.init/class.inhctor.init/p2.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+namespace std_example {
+  struct A { A(int); };
+  struct B : A { using A::A; };
+
+  struct C1 : B { using B::B; };
+  struct C2 : B { using B::B; };
+
+  struct D1 : C1, C2 {
+    using C1::C1; // expected-note {{inherited from base class 'C1' here}}
+    using C2::C2; // expected-note {{inherited from base class 'C2' here}}
+  };
+
+  struct V1 : virtual B { using B::B; };
+  struct V2 : virtual B { using B::B; };
+
+  struct D2 : V1, V2 {
+    using V1::V1;
+    using V2::V2;
+  };
+
+  D1 d1(0); // expected-error {{constructor of 'A' inherited from multiple base class subobjects}}
+  D2 d2(0); // OK: initializes virtual B base class, which initializes the A base class
+            // then initializes the V1 and V2 base classes as if by a defaulted default constructor
+
+  struct M { M(); M(int); };
+  struct N : M { using M::M; };
+  struct O : M {};
+  struct P : N, O { using N::N; using O::O; };
+  P p(0); // OK: use M(0) to initialize N's base class,
+          // use M() to initialize O's base class
+}
diff --git a/test/CXX/stmt.stmt/stmt.dcl/p3.cpp b/test/CXX/stmt.stmt/stmt.dcl/p3.cpp
index 4bcc648..03c835b 100644
--- a/test/CXX/stmt.stmt/stmt.dcl/p3.cpp
+++ b/test/CXX/stmt.stmt/stmt.dcl/p3.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // PR10034
 struct X {};
@@ -40,8 +42,16 @@
 };
 
 void test_Z() {
-  goto end; // expected-error{{cannot jump from this goto statement to its label}}
-  Z z; // expected-note{{jump bypasses initialization of non-POD variable}}
+  goto end;
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{cannot jump from this goto statement to its label}}
+#endif
+
+  Z z;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{jump bypasses initialization of non-POD variable}}
+#endif
+
  end:
   return;
 }
diff --git a/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
index 7d689ae..8c4f36c 100644
--- a/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
+++ b/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify %s
 
 struct pr12960 {
   int begin;
@@ -118,10 +120,15 @@
     ;
 
   struct Differ {
-    int *begin(); // expected-note {{selected 'begin' function with iterator type 'int *'}}
-    null_t end(); // expected-note {{selected 'end' function with iterator type 'null_t'}}
+    int *begin();
+    null_t end();
   };
-  for (auto a : Differ()) // expected-error {{'begin' and 'end' must return the same type (got 'int *' and 'null_t')}}
+  for (auto a : Differ())
+#if __cplusplus <= 201402L
+    // expected-warning@-2 {{'begin' and 'end' returning different types ('int *' and 'null_t') is a C++1z extension}}
+    // expected-note@-6 {{selected 'begin' function with iterator type 'int *'}}
+    // expected-note@-6 {{selected 'end' function with iterator type 'null_t'}}
+#endif
     ;
 
   for (void f() : "error") // expected-error {{for range declaration must declare a variable}}
@@ -129,7 +136,7 @@
 
   for (extern int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'extern'}}
   for (static int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'static'}}
-  for (register int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'register'}} expected-warning {{deprecated}}
+  for (register int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'register'}} expected-warning 0-1{{register}} expected-error 0-1{{register}}
   for (constexpr int a : X::C()) {} // OK per CWG issue #1204.
 
   for (auto u : X::NoBeginADL()) { // expected-error {{invalid range expression of type 'X::NoBeginADL'; no viable 'begin' function available}}
diff --git a/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp b/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
new file mode 100644
index 0000000..d6a2169
--- /dev/null
+++ b/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
@@ -0,0 +1,137 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+// RUN: %clang_cc1 -std=c++1z -verify %s -DUNDEFINED
+
+#ifdef UNDEFINED
+// "used but not defined" errors don't get produced if we have more interesting
+// errors.
+namespace std_example {
+  template <typename T, typename... Rest> void g(T &&p, Rest &&... rs) {
+    // use p
+    if constexpr(sizeof...(rs) > 0)
+      g(rs...);
+  }
+  void use_g() {
+    g(1, 2, 3);
+  }
+
+  static int x(); // no definition of x required
+  int f() {
+    if constexpr (true)
+      return 0;
+    else if (x())
+      return x();
+    else
+      return -x();
+  }
+}
+
+namespace odr_use_in_selected_arm {
+  static int x(); // expected-warning {{is not defined}}
+  int f() {
+    if constexpr (false)
+      return 0;
+    else if (x()) // expected-note {{here}}
+      return x();
+    else
+      return -x();
+  }
+}
+#else
+namespace ccce {
+  void f() {
+    if (5) {}
+    if constexpr (5) {} // expected-error {{cannot be narrowed}}
+  }
+  template<int N> void g() {
+    if constexpr (N) {} // expected-error {{cannot be narrowed}}
+  }
+  template void g<5>(); // expected-note {{instantiation of}}
+}
+
+namespace generic_lambda {
+  // Substituting for T produces a hard error here, even if substituting for
+  // the type of x would remove the error.
+  template<typename T> void f() {
+    [](auto x) {
+      if constexpr (sizeof(T) == 1 && sizeof(x) == 1)
+        T::error(); // expected-error 2{{'::'}}
+    } (0);
+  }
+
+  template<typename T> void g() {
+    [](auto x) {
+      if constexpr (sizeof(T) == 1)
+        if constexpr (sizeof(x) == 1)
+          T::error(); // expected-error {{'::'}}
+    } (0);
+  }
+
+  void use() {
+    f<int>(); // expected-note {{instantiation of}}
+    f<char>(); // expected-note {{instantiation of}}
+    g<int>(); // ok
+    g<char>(); // expected-note {{instantiation of}}
+  }
+}
+
+namespace potentially_discarded_branch_target {
+  void in_switch(int n) {
+    switch (n)
+      case 4: if constexpr(sizeof(n) == 4) return;
+    if constexpr(sizeof(n) == 4)
+      switch (n) case 4: return;
+    switch (n) {
+      if constexpr (sizeof(n) == 4) // expected-note 2{{constexpr if}}
+        case 4: return; // expected-error {{cannot jump}}
+      else
+        default: break; // expected-error {{cannot jump}}
+    }
+  }
+
+  template<typename T>
+  void in_switch_tmpl(int n) {
+    switch (n) {
+      if constexpr (sizeof(T) == 4) // expected-note 2{{constexpr if}}
+        case 4: return; // expected-error {{cannot jump}}
+      else
+        default: break; // expected-error {{cannot jump}}
+    }
+  }
+
+  void goto_scope(int n) {
+    goto foo; // expected-error {{cannot jump}}
+    if constexpr(sizeof(n) == 4) // expected-note {{constexpr if}}
+      foo: return;
+bar:
+    if constexpr(sizeof(n) == 4)
+      goto bar; // ok
+  }
+
+  template<typename T>
+  void goto_scope(int n) {
+    goto foo; // expected-error {{cannot jump}}
+    if constexpr(sizeof(n) == 4) // expected-note {{constexpr if}}
+      foo: return;
+bar:
+    if constexpr(sizeof(n) == 4)
+      goto bar; // ok
+  }
+
+  void goto_redef(int n) {
+a:  if constexpr(sizeof(n) == 4) // expected-error {{redefinition}} expected-note {{constexpr if}}
+      a: goto a; // expected-note 2{{previous}}
+    else
+      a: goto a; // expected-error {{redefinition}} expected-error {{cannot jump}}
+  }
+
+  void evil_things() {
+    goto evil_label; // expected-error {{cannot jump}}
+    if constexpr (true || ({evil_label: false;})) {} // expected-note {{constexpr if}}
+
+    if constexpr (true) // expected-note {{constexpr if}}
+      goto surprise; // expected-error {{cannot jump}}
+    else
+      surprise: {}
+  }
+}
+#endif
diff --git a/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp b/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp
index 0719f78..58290ac 100644
--- a/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp
+++ b/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -triple=x86_64-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple=x86_64-linux-gnu -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple=x86_64-linux-gnu -std=c++11 %s
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -triple=x86_64-linux-gnu %s -DCPP11ONLY
 
 // C++11 [temp.arg.nontype]p1:
@@ -31,43 +33,103 @@
 //      if the corresopnding template-parameter is a reference; or
 namespace addr_of_obj_or_func {
   template <int* p> struct X0 { }; // expected-note 5{{here}}
+#if __cplusplus >= 201103L
+  // expected-note@-2 2{{template parameter is declared here}}
+#endif
+
   template <int (*fp)(int)> struct X1 { };
   template <int &p> struct X2 { }; // expected-note 4{{here}}
   template <const int &p> struct X2k { }; // expected-note {{here}}
   template <int (&fp)(int)> struct X3 { }; // expected-note 4{{here}}
 
   int i = 42;
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{declared here}}
+#endif
+
   int iarr[10];
   int f(int i);
-  const int ki = 9; // expected-note 5{{here}}
-  __thread int ti = 100; // expected-note 2{{here}}
-  static int f_internal(int); // expected-note 4{{here}}
+  const int ki = 9;
+#if __cplusplus <= 199711L
+  // expected-note@-2 5{{non-type template argument refers to object here}}
+#endif
+
+  __thread int ti = 100; // expected-note {{here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{here}}
+#endif
+
+  static int f_internal(int);
+#if __cplusplus <= 199711L
+  // expected-note@-2 4{{non-type template argument refers to function here}}
+#endif
+
   template <typename T> T f_tmpl(T t);
   struct S { union { int NonStaticMember; }; };
 
   void test() {
-    X0<i> x0a; // expected-error {{must have its address taken}}
+    X0<i> x0a;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
+#else
+    // expected-error@-4 {{non-type template argument of type 'int' is not a constant expression}}
+    // expected-note@-5 {{read of non-const variable 'i' is not allowed in a constant expression}}
+#endif
     X0<&i> x0a_addr;
     X0<iarr> x0b;
     X0<&iarr> x0b_addr; // expected-error {{cannot be converted to a value of type 'int *'}}
-    X0<ki> x0c; // expected-error {{must have its address taken}} expected-warning {{internal linkage is a C++11 extension}}
-    X0<&ki> x0c_addr; // expected-error {{cannot be converted to a value of type 'int *'}} expected-warning {{internal linkage is a C++11 extension}}
-    X0<&ti> x0d_addr; // expected-error {{refers to thread-local object}}
+    X0<ki> x0c; // expected-error {{must have its address taken}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X0<&ki> x0c_addr; // expected-error {{cannot be converted to a value of type 'int *'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X0<&ti> x0d_addr;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{non-type template argument refers to thread-local object}}
+#else
+    // expected-error@-4 {{non-type template argument of type 'int *' is not a constant expression}}
+#endif
+
     X1<f> x1a;
     X1<&f> x1a_addr;
     X1<f_tmpl> x1b;
     X1<&f_tmpl> x1b_addr;
     X1<f_tmpl<int> > x1c;
     X1<&f_tmpl<int> > x1c_addr;
-    X1<f_internal> x1d; // expected-warning {{internal linkage is a C++11 extension}}
-    X1<&f_internal> x1d_addr; // expected-warning {{internal linkage is a C++11 extension}}
+    X1<f_internal> x1d;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X1<&f_internal> x1d_addr;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
     X2<i> x2a;
     X2<&i> x2a_addr; // expected-error {{address taken}}
     X2<iarr> x2b; // expected-error {{cannot bind to template argument of type 'int [10]'}}
     X2<&iarr> x2b_addr; // expected-error {{address taken}}
-    X2<ki> x2c; // expected-error {{ignores qualifiers}} expected-warning {{internal linkage is a C++11 extension}}
-    X2k<ki> x2kc; // expected-warning {{internal linkage is a C++11 extension}}
-    X2k<&ki> x2kc_addr; // expected-error {{address taken}} expected-warning {{internal linkage is a C++11 extension}}
+    X2<ki> x2c; // expected-error {{ignores qualifiers}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X2k<ki> x2kc;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X2k<&ki> x2kc_addr; // expected-error {{address taken}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
     X2<ti> x2d_addr; // expected-error {{refers to thread-local object}}
     X3<f> x3a;
     X3<&f> x3a_addr; // expected-error {{address taken}}
@@ -75,11 +137,31 @@
     X3<&f_tmpl> x3b_addr; // expected-error {{address taken}}
     X3<f_tmpl<int> > x3c;
     X3<&f_tmpl<int> > x3c_addr; // expected-error {{address taken}}
-    X3<f_internal> x3d; // expected-warning {{internal linkage is a C++11 extension}}
-    X3<&f_internal> x3d_addr; // expected-error {{address taken}} expected-warning {{internal linkage is a C++11 extension}}
+    X3<f_internal> x3d;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
 
-    int n; // expected-note {{here}}
-    X0<&n> x0_no_linkage; // expected-error {{non-type template argument refers to object 'n' that does not have linkage}}
+    X3<&f_internal> x3d_addr; // expected-error {{address taken}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    int n;
+#if __cplusplus <= 199711L
+    // expected-note@-2 {{non-type template argument refers to object here}}
+#else
+    // expected-note@-4 {{declared here}}
+#endif
+
+    X0<&n> x0_no_linkage;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{non-type template argument refers to object 'n' that does not have linkage}}
+#else
+    // expected-error@-4 {{non-type template argument of type 'int *' is not a constant expression}}
+    // expected-note@-5 {{pointer to 'n' is not a constant expression}}
+#endif
+
     struct Local { static int f() {} }; // expected-note {{here}}
     X1<&Local::f> x1_no_linkage; // expected-error {{non-type template argument refers to function 'f' that does not have linkage}}
     X0<&S::NonStaticMember> x0_non_static; // expected-error {{non-static data member}}
@@ -96,7 +178,17 @@
   int i = 42;
   X0<&i + 2> x0a; // expected-error{{non-type template argument does not refer to any declaration}}
   int* iptr = &i;
-  X0<iptr> x0b; // expected-error{{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{declared here}}
+#endif
+
+  X0<iptr> x0b;
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
+#else
+  // expected-error@-4 {{non-type template argument of type 'int *' is not a constant expression}}
+  // expected-note@-5 {{read of non-constexpr variable 'iptr' is not allowed in a constant expression}}
+#endif
 }
 #endif // CPP11ONLY
 
@@ -108,4 +200,4 @@
 }
 #endif // CPP11ONLY
 
-}
\ No newline at end of file
+}
diff --git a/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp b/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp
index 0fd9a7e..539baec 100644
--- a/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp
+++ b/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 template<class T> struct A {
   static T t; // expected-error{{static data member instantiated with function type 'int ()'}}
 };
@@ -11,10 +14,17 @@
 B<function> b; // expected-note{{instantiation of}}
 
 template <typename T> int f0(void *, const T&); // expected-note{{candidate template ignored: substitution failure}}
-enum {e}; // expected-note{{unnamed type used in template argument was declared here}}
+enum {e};
+#if __cplusplus <= 199711L
+// expected-note@-2 {{unnamed type used in template argument was declared here}}
+#endif
 
 void test_f0(int n) {
-  int i = f0(0, e); // expected-warning{{template argument uses unnamed type}}
+  int i = f0(0, e);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
   int vla[n];
   f0(0, vla); // expected-error{{no matching function for call to 'f0'}}
 }
@@ -23,20 +33,50 @@
   template <typename R, typename A1> void f0(R (*)(A1));
   template <typename T> int f1(T);
   template <typename T, typename U> int f1(T, U);
-  enum {e1}; // expected-note 2{{unnamed type used in template argument was declared here}}
-  enum {e2}; // expected-note 2{{unnamed type used in template argument was declared here}}
-  enum {e3}; // expected-note{{unnamed type used in template argument was declared here}}
+  enum {e1};
+#if __cplusplus <= 199711L
+  // expected-note@-2 2{{unnamed type used in template argument was declared here}}
+#endif
+
+  enum {e2};
+#if __cplusplus <= 199711L
+  // expected-note@-2 2{{unnamed type used in template argument was declared here}}
+#endif
+
+  enum {e3};
+#if __cplusplus <= 199711L
+ // expected-note@-2 {{unnamed type used in template argument was declared here}}
+#endif
 
   template<typename T> struct X;
   template<typename T> struct X<T*> { };
 
   void f() {
-    f0( // expected-warning{{template argument uses unnamed type}}
-       &f1<__typeof__(e1)>); // expected-warning{{template argument uses unnamed type}}
-    int (*fp1)(int, __typeof__(e2)) = f1; // expected-warning{{template argument uses unnamed type}}
-    f1(e2); // expected-warning{{template argument uses unnamed type}}
+    f0(
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
+       &f1<__typeof__(e1)>);
+#if __cplusplus <= 199711L
+ // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
+    int (*fp1)(int, __typeof__(e2)) = f1;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
+    f1(e2);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
     f1(e2);
 
-    X<__typeof__(e3)*> x; // expected-warning{{template argument uses unnamed type}}
+    X<__typeof__(e3)*> x;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
   }
 }
diff --git a/test/CXX/temp/temp.decls/temp.mem/p1.cpp b/test/CXX/temp/temp.decls/temp.mem/p1.cpp
index 01eab24..b48e145 100644
--- a/test/CXX/temp/temp.decls/temp.mem/p1.cpp
+++ b/test/CXX/temp/temp.decls/temp.mem/p1.cpp
@@ -10,6 +10,7 @@
     }
   };
 };
+extern template bool A<bool>::cond;
 
 int foo() {
   A<bool>::cond = true;
diff --git a/test/CXX/temp/temp.decls/temp.mem/p2.cpp b/test/CXX/temp/temp.decls/temp.mem/p2.cpp
index c24d5a9..feeb362 100644
--- a/test/CXX/temp/temp.decls/temp.mem/p2.cpp
+++ b/test/CXX/temp/temp.decls/temp.mem/p2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
 
 template <typename>
 void quux();
@@ -8,5 +8,7 @@
     template <typename> struct bar {};  // expected-error{{templates cannot be declared inside of a local class}}
     template <typename> void baz() {}   // expected-error{{templates cannot be declared inside of a local class}}
     template <typename> void qux();     // expected-error{{templates cannot be declared inside of a local class}}
+    template <typename> using corge = int; // expected-error{{templates cannot be declared inside of a local class}}
+    template <typename T> static T grault; // expected-error{{static data member}} expected-error{{templates cannot be declared inside of a local class}}
   };
 }
diff --git a/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
index 4f9368f..206e9f7 100644
--- a/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
+++ b/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
@@ -437,3 +437,35 @@
   template void g<>();
   template void g<1, 2, 3>();
 }
+
+template <class... Ts>
+int var_expr(Ts... ts);
+
+template <class... Ts>
+auto a_function(Ts... ts) -> decltype(var_expr(ts...));
+
+template <class T>
+using partial = decltype(a_function<int, T>);
+
+int use_partial() { partial<char> n; }
+
+namespace PR26017 {
+template <class T>
+struct Foo {};
+template <class... Ts>
+using FooAlias = Foo<void(Ts...)>;
+
+template <class... Ts>
+using FooAliasAlias = FooAlias<Ts..., Ts...>;
+
+template <class... Ts>
+void bar(const FooAlias<Ts...> &) {}
+
+int fn() {
+  FooAlias<> a;
+  bar(a);
+
+  FooAlias<int> b;
+  bar(b);
+}
+}
diff --git a/test/CXX/temp/temp.fct.spec/temp.deduct/p7.cpp b/test/CXX/temp/temp.fct.spec/temp.deduct/p7.cpp
new file mode 100644
index 0000000..bc074ba
--- /dev/null
+++ b/test/CXX/temp/temp.fct.spec/temp.deduct/p7.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+struct Q { typedef int type; };
+
+// "The substitution occurs in all types and expressions that are used in [...]
+// template parameter declarations." In particular, we must substitute into the
+// type of a parameter pack that is not a pack expansion, even if we know the
+// corresponding argument pack is empty.
+template<typename T, typename T::type...> void a(T);
+int &a(...);
+int &a_disabled = a(0);
+int &a_enabled = a(Q()); // expected-error {{cannot bind to a temporary of type 'void'}}
+
+template<typename T, template<typename T::type> class ...X> void b(T);
+int &b(...);
+int &b_disabled = b(0);
+int &b_enabled = b(Q()); // expected-error {{cannot bind to a temporary of type 'void'}}
+
+template<typename T, template<typename T::type...> class ...X> void c(T);
+int &c(...);
+int &c_disabled = c(0);
+int &c_enabled = c(Q()); // expected-error {{cannot bind to a temporary of type 'void'}}
diff --git a/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp b/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp
index c27261c..9fd3df5 100644
--- a/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp
+++ b/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp
@@ -1,9 +1,22 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
-template <int> int f(int);  // expected-note 2{{candidate}}
-template <signed char> int f(int); // expected-note 2{{candidate}}
-int i1 = f<1>(0); // expected-error{{ambiguous}}
-int i2 = f<1000>(0); // expected-error{{ambiguous}}
+template <int> int f(int);  // expected-note {{candidate function}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{candidate function}}
+#endif
+
+template <signed char> int f(int); // expected-note {{candidate function}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{candidate function}}
+#endif
+
+int i1 = f<1>(0); // expected-error{{call to 'f' is ambiguous}}
+int i2 = f<1000>(0);
+#if __cplusplus <= 199711L
+// expected-error@-2{{call to 'f' is ambiguous}}
+#endif
 
 namespace PR6707 {
   template<typename T, T Value>
diff --git a/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp b/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp
index c573b9c..ff8178f 100644
--- a/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp
+++ b/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp
@@ -160,3 +160,16 @@
 }
 
 }
+
+namespace PR28195 {
+
+template<int N> struct B {};
+struct D : B<0>, B<1> {};
+
+template<int N> int callee(B<N>); // expected-note{{failed template argument deduction}}
+
+int caller() {
+  callee(D()); // expected-error{{no matching function}}
+}
+
+}
diff --git a/test/CXX/temp/temp.param/p15-cxx0x.cpp b/test/CXX/temp/temp.param/p15-cxx0x.cpp
index ade192b..667152d 100644
--- a/test/CXX/temp/temp.param/p15-cxx0x.cpp
+++ b/test/CXX/temp/temp.param/p15-cxx0x.cpp
@@ -102,10 +102,10 @@
 using D1 = types<long>;
 
 using T2 = take<4, int, char, double, long>::type; // expected-note {{previous}}
-using T2 = types<int, char, double, long>;
 // FIXME: Desguar the types on the RHS in this diagnostic.
 // desired-error {{'types<void, void, void, void>' vs 'types<int, char, double, long>'}}
 using T2 = types<void, void, void, void>; // expected-error {{'types<void, void, void, void>' vs 'types<typename inner<_>::type, typename inner<_>::type, typename inner<_>::type, typename inner<_>::type>'}}
+using T2 = types<int, char, double, long>;
 using D2 = drop<4, int, char, double, long>::type;
 using D2 = types<>;
 
diff --git a/test/CXX/temp/temp.res/temp.local/p6.cpp b/test/CXX/temp/temp.res/temp.local/p6.cpp
index 843b455..e2aa0ff 100644
--- a/test/CXX/temp/temp.res/temp.local/p6.cpp
+++ b/test/CXX/temp/temp.res/temp.local/p6.cpp
@@ -5,11 +5,11 @@
 template<typename T, // expected-note {{declared here}}
          typename T> struct X {}; // expected-error {{declaration of 'T' shadows template parameter}}
 
-template<typename T> struct Y { // expected-note 17{{declared here}}
+template<typename T> struct Y { // expected-note 18{{declared here}}
   template<typename T> struct A {}; // expected-error {{declaration of 'T' shadows template parameter}}
 
   struct B {
-    template<typename> struct T {}; // FIXME: desired-error {{declaration of 'T' shadows template parameter}}
+    template<typename> struct T {}; // expected-error {{declaration of 'T' shadows template parameter}}
   };
   struct C {
     template<typename> void T(); // expected-error {{declaration of 'T' shadows template parameter}}
@@ -65,11 +65,11 @@
   friend struct T; // expected-error {{declaration of 'T' shadows template parameter}}
 };
 
-template<int T> struct Z { // expected-note 15{{declared here}}
+template<int T> struct Z { // expected-note 16{{declared here}}
   template<typename T> struct A {}; // expected-error {{declaration of 'T' shadows template parameter}}
 
   struct B {
-    template<typename> struct T {}; // FIXME: desired-error {{declaration of 'T' shadows template parameter}}
+    template<typename> struct T {}; // expected-error {{declaration of 'T' shadows template parameter}}
   };
   struct C {
     template<typename> void T(); // expected-error {{declaration of 'T' shadows template parameter}}
@@ -129,7 +129,8 @@
 
 // FIXME: These are ill-formed: a template-parameter shall not have the same name as the template name.
 namespace A {
-  template<typename T> struct T {};
+  template<typename T> struct T {};  // expected-error{{declaration of 'T' shadows template parameter}}
+                                     // expected-note@-1{{template parameter is declared here}}
 }
 namespace B {
   template<typename T> void T() {}
@@ -137,3 +138,13 @@
 namespace C {
   template<typename T> int T;
 }
+
+namespace PR28023 {
+template<int V>  // expected-note{{template parameter is declared here}}
+struct A {
+  struct B {
+    template <int> friend struct V;  // expected-error{{declaration of 'V' shadows template parameter}}
+  };
+};
+A<0>::B a;
+}
diff --git a/test/CXX/temp/temp.spec/no-body.cpp b/test/CXX/temp/temp.spec/no-body.cpp
index 61d285b..4ec18fd 100644
--- a/test/CXX/temp/temp.spec/no-body.cpp
+++ b/test/CXX/temp/temp.spec/no-body.cpp
@@ -1,17 +1,44 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 // RUN: cp %s %t
 // RUN: not %clang_cc1 -x c++ -fixit %t -DFIXING
 // RUN: %clang_cc1 -x c++ %t -DFIXING
 
 template<typename T> void f(T) { }
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{explicit instantiation refers here}}
+#endif
+
 template<typename T> void g(T) { }
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{explicit instantiation refers here}}
+#endif
+
 template<typename T> struct x { };
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{explicit instantiation refers here}}
+#endif
+
 template<typename T> struct y { };  // expected-note {{declared here}}
 
-namespace good {
+namespace good { // Only good in C++98/03
+#ifndef FIXING
   template void f<int>(int);
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{explicit instantiation of 'f' must occur at global scope}}
+#endif
+
   template void g(int);
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{explicit instantiation of 'g' must occur at global scope}}
+#endif
+
   template struct x<int>;
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{explicit instantiation of 'x' must occur at global scope}}
+#endif
+#endif
 }
 
 namespace unsupported {
diff --git a/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp b/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp
index 4fbc45a..21399b6 100644
--- a/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp
+++ b/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 
 // This test creates cases where implicit instantiations of various entities
 // would cause a diagnostic, but provides expliict specializations for those
@@ -16,7 +19,10 @@
 
 //     -- function template
 namespace N0 {
-  template<typename T> void f0(T) { // expected-note{{here}}
+  template<typename T> void f0(T) {
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
     T t;
   }
 
@@ -36,7 +42,11 @@
   template<> void N0::f0(long) { } // expected-error{{does not enclose namespace}}
 }
 
-template<> void N0::f0(double); // expected-warning{{C++11 extension}}
+template<> void N0::f0(double);
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of function template specialization of 'f0' outside namespace 'N0' is a C++11 extension}}
+#endif
+
 template<> void N0::f0(double) { }
 
 struct X1 {
@@ -49,21 +59,39 @@
 namespace N0 {
   
 template<typename T>
-struct X0 { // expected-note 2{{here}}
-  static T member; // expected-note{{here}}
+struct X0 { // expected-note {{explicitly specialized declaration is here}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
+  static T member;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
   
-  void f1(T t) { // expected-note{{explicitly specialized declaration is here}}
+  void f1(T t) {
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
     t = 17;
   }
   
-  struct Inner : public T { }; // expected-note 3{{here}}
+  struct Inner : public T { }; // expected-note 2{{explicitly specialized declaration is here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
   
   template<typename U>
-  struct InnerTemplate : public T { }; // expected-note 2{{explicitly specialized}} \
-   // expected-error{{base specifier}}
+  struct InnerTemplate : public T { }; // expected-note {{explicitly specialized declaration is here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
+  // expected-error@-4 {{base specifier must name a class}}
   
   template<typename U>
-  void ft1(T t, U u); // expected-note{{explicitly specialized}}
+  void ft1(T t, U u);
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
 };
 
 }
@@ -76,7 +104,10 @@
 
 template<typename T> T N0::X0<T>::member;
 
-template<> struct N0::X0<void> { }; // expected-warning{{C++11 extension}}
+template<> struct N0::X0<void> { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'X0' outside namespace 'N0' is a C++11 extension}}
+#endif
 N0::X0<void> test_X0;
 
 namespace N1 {
@@ -92,7 +123,10 @@
 };
 
 //     -- member function of a class template
-template<> void N0::X0<void*>::f1(void *) { } // expected-warning{{member function specialization}}
+template<> void N0::X0<void*>::f1(void *) { }
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of member function specialization of 'f1' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 void test_spec(N0::X0<void*> xvp, void *vp) {
   xvp.f1(vp);
@@ -125,7 +159,10 @@
   return N0::X0<NonDefaultConstructible>::member;
 }
 
-template<> int N0::X0<int>::member;  // expected-warning{{C++11 extension}}
+template<> int N0::X0<int>::member;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of static data member specialization of 'member' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 template<> float N0::X0<float>::member = 3.14f;
 
@@ -153,7 +190,10 @@
 }
 
 template<>
-struct N0::X0<long>::Inner { }; // expected-warning{{C++11 extension}}
+struct N0::X0<long>::Inner { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of member class specialization of 'Inner' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 template<>
 struct N0::X0<float>::Inner { };
@@ -192,7 +232,10 @@
 struct N0::X0<int>::InnerTemplate<long> { }; // okay
 
 template<> template<>
-struct N0::X0<int>::InnerTemplate<float> { }; // expected-warning{{class template specialization}}
+struct N0::X0<int>::InnerTemplate<float> { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'InnerTemplate' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 namespace N1 {
   template<> template<>
@@ -224,7 +267,10 @@
 void N0::X0<void*>::ft1(void *, unsigned) { } // okay
 
 template<> template<>
-void N0::X0<void*>::ft1(void *, float) { } // expected-warning{{function template specialization}}
+void N0::X0<void*>::ft1(void *, float) { }
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of function template specialization of 'ft1' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 namespace N1 {
   template<> template<>
diff --git a/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp b/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp
index c8b7def..d82691c 100644
--- a/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp
+++ b/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp
@@ -1,14 +1,20 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 namespace N {
-  template<class T> class X; // expected-note {{'N::X' declared here}} \
-                             // expected-note {{explicitly specialized declaration is here}}
+  template<class T> class X; // expected-note {{'N::X' declared here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
 }
 
 // TODO: Don't add a namespace qualifier to the template if it would trigger
 // the warning about the specialization being outside of the namespace.
-template<> class X<int> { /* ... */ };	// expected-error {{no template named 'X'; did you mean 'N::X'?}} \
-                                        // expected-warning {{first declaration of class template specialization of 'X' outside namespace 'N' is a C++11 extension}}
+template<> class X<int> { /* ... */ };	// expected-error {{no template named 'X'; did you mean 'N::X'?}}
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'X' outside namespace 'N' is a C++11 extension}}
+#endif
 
 namespace N {
   
diff --git a/test/CXX/temp/temp.spec/temp.explicit/p2.cpp b/test/CXX/temp/temp.spec/temp.explicit/p2.cpp
index 1dfcf0c..0270221 100644
--- a/test/CXX/temp/temp.spec/temp.explicit/p2.cpp
+++ b/test/CXX/temp/temp.spec/temp.explicit/p2.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -std=c++98 -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -std=c++11 %s
 
 // Example from the standard
 template<class T> class Array { void mf() { } }; 
@@ -39,5 +41,16 @@
 }
 using namespace N;
 
-template struct X1<int>; // expected-warning{{must occur in}}
-template void f1(int); // expected-warning{{must occur in}}
+template struct X1<int>;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{explicit instantiation of 'N::X1' must occur in namespace 'N'}}
+#else
+// expected-error@-4 {{explicit instantiation of 'N::X1' must occur in namespace 'N'}}
+#endif
+
+template void f1(int);
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{explicit instantiation of 'N::f1' must occur in namespace 'N'}}
+#else
+// expected-error@-4 {{explicit instantiation of 'N::f1' must occur in namespace 'N'}}
+#endif
diff --git a/test/CXX/temp/temp.spec/temp.explicit/p5.cpp b/test/CXX/temp/temp.spec/temp.explicit/p5.cpp
index 8422c51..ca1f9a3 100644
--- a/test/CXX/temp/temp.spec/temp.explicit/p5.cpp
+++ b/test/CXX/temp/temp.spec/temp.explicit/p5.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 namespace N {
   template<class T> class Y { // expected-note{{explicit instantiation refers here}}
@@ -11,7 +13,12 @@
 // FIXME: This example from the standard is wrong; note posted to CWG reflector
 // on 10/27/2009
 using N::Y; 
-template class Y<int>; // expected-warning{{must occur in}}
+template class Y<int>;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{explicit instantiation of 'N::Y' must occur in namespace 'N'}}
+#else
+// expected-error@-4 {{explicit instantiation of 'N::Y' must occur in namespace 'N'}}
+#endif
 
 template class N::Y<char*>; 
 template void N::Y<double>::mf();
diff --git a/test/CXX/temp/temp.spec/temp.inst/p1.cpp b/test/CXX/temp/temp.spec/temp.inst/p1.cpp
index adf812b..3d2d6d7 100644
--- a/test/CXX/temp/temp.spec/temp.inst/p1.cpp
+++ b/test/CXX/temp/temp.spec/temp.inst/p1.cpp
@@ -54,6 +54,16 @@
   int test2 = g<int>(); // expected-note {{here}}
 }
 
+// - static data members
+namespace StaticDataMembers {
+  template<typename T>
+  struct A {
+    static const int n = T::error; // expected-error {{has no members}}
+    static inline int m = T::error; // expected-warning {{extension}}
+  };
+  A<int> ai; // expected-note {{here}}
+}
+
 // And it cases the implicit instantiations of the definitions of:
 
 // - unscoped member enumerations
diff --git a/test/CodeCompletion/bracket-decl.c b/test/CodeCompletion/bracket-decl.c
new file mode 100644
index 0000000..cf80b42
--- /dev/null
+++ b/test/CodeCompletion/bracket-decl.c
@@ -0,0 +1,9 @@
+#define PATHSIZE 256
+
+static const int len = 1234;
+
+void foo() {
+  char arr[
+// RUN: %clang_cc1 -fsyntax-only -code-completion-macros -code-completion-at=%s:6:12 %s -o - | FileCheck %s
+// CHECK: COMPLETION: len
+// CHECK: COMPLETION: PATHSIZE
diff --git a/test/CodeCompletion/ctor-initializer.cpp b/test/CodeCompletion/ctor-initializer.cpp
new file mode 100644
index 0000000..00af64d
--- /dev/null
+++ b/test/CodeCompletion/ctor-initializer.cpp
@@ -0,0 +1,41 @@
+struct Base1 {
+  Base1() : {}
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:2:12 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+  // CHECK-CC1: COMPLETION: Pattern : member1(<#args#>)
+  // CHECK-CC1: COMPLETION: Pattern : member2(<#args#>
+
+  Base1(int) : member1(123), {}
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:7:30 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
+  // CHECK-CC2-NOT: COMPLETION: Pattern : member1(<#args#>)
+  // CHECK-CC2: COMPLETION: Pattern : member2(<#args#>
+
+  int member1;
+  float member2;
+};
+
+struct Derived : public Base1 {
+  Derived();
+  Derived(int);
+  Derived(float);
+  int deriv1;
+};
+
+Derived::Derived() : {}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:23:22 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
+// CHECK-CC3: COMPLETION: Pattern : Base1(<#args#>)
+// CHECK-CC3: COMPLETION: Pattern : deriv1(<#args#>)
+
+Derived::Derived(int) try : {
+} catch (...) {
+}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:28:29 %s -o - | FileCheck -check-prefix=CHECK-CC4 %s
+// CHECK-CC4: COMPLETION: Pattern : Base1(<#args#>)
+// CHECK-CC4: COMPLETION: Pattern : deriv1(<#args#>)
+
+Derived::Derived(float) try : Base1(),
+{
+} catch (...) {
+}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:35:39 %s -o - | FileCheck -check-prefix=CHECK-CC5 %s
+// CHECK-CC5-NOT: COMPLETION: Pattern : Base1(<#args#>)
+// CHECK-CC5: COMPLETION: Pattern : deriv1(<#args#>)
diff --git a/test/CodeCompletion/objc-message.mm b/test/CodeCompletion/objc-message.mm
index 352a18e..7a50309 100644
--- a/test/CodeCompletion/objc-message.mm
+++ b/test/CodeCompletion/objc-message.mm
@@ -38,9 +38,9 @@
   [ptr instanceMethod1];
 }
 
-// RUN: %clang_cc1 -fsyntax-only -std=c++11 -code-completion-at=%s:33:7 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -code-completion-at=%s:33:8 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
 // CHECK-CC1: categoryInstanceMethod : [#id#]categoryInstanceMethod
 // CHECK-CC1: instanceMethod1 : [#id#]instanceMethod1
 // CHECK-CC1: protocolInstanceMethod : [#id#]protocolInstanceMethod
-// RUN: %clang_cc1 -fsyntax-only -std=c++11 -code-completion-at=%s:38:7 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -code-completion-at=%s:38:8 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
 // CHECK-CC2: protocolInstanceMethod : [#id#]protocolInstanceMethod
diff --git a/test/CodeGen/3dnow-builtins.c b/test/CodeGen/3dnow-builtins.c
index d534349..50e0e5d 100644
--- a/test/CodeGen/3dnow-builtins.c
+++ b/test/CodeGen/3dnow-builtins.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=GCC -check-prefix=CHECK
+// RUN: %clang_cc1 %s -triple=x86_64-scei-ps4 -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -6,151 +7,176 @@
 #include <x86intrin.h>
 
 __m64 test_m_pavgusb(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pavgusb
+  // PS4-LABEL: define i64 @test_m_pavgusb
+  // GCC-LABEL: define double @test_m_pavgusb
   // CHECK: @llvm.x86.3dnow.pavgusb
   return _m_pavgusb(m1, m2);
 }
 
 __m64 test_m_pf2id(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pf2id
+  // PS4-LABEL: define i64 @test_m_pf2id
+  // GCC-LABEL: define double @test_m_pf2id
   // CHECK: @llvm.x86.3dnow.pf2id
   return _m_pf2id(m);
 }
 
 __m64 test_m_pfacc(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfacc
+  // PS4-LABEL: define i64 @test_m_pfacc
+  // GCC-LABEL: define double @test_m_pfacc
   // CHECK: @llvm.x86.3dnow.pfacc
   return _m_pfacc(m1, m2);
 }
 
 __m64 test_m_pfadd(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfadd
+  // PS4-LABEL: define i64 @test_m_pfadd
+  // GCC-LABEL: define double @test_m_pfadd
   // CHECK: @llvm.x86.3dnow.pfadd
   return _m_pfadd(m1, m2);
 }
 
 __m64 test_m_pfcmpeq(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfcmpeq
+  // PS4-LABEL: define i64 @test_m_pfcmpeq
+  // GCC-LABEL: define double @test_m_pfcmpeq
   // CHECK: @llvm.x86.3dnow.pfcmpeq
   return _m_pfcmpeq(m1, m2);
 }
 
 __m64 test_m_pfcmpge(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfcmpge
+  // PS4-LABEL: define i64 @test_m_pfcmpge
+  // GCC-LABEL: define double @test_m_pfcmpge
   // CHECK: @llvm.x86.3dnow.pfcmpge
   return _m_pfcmpge(m1, m2);
 }
 
 __m64 test_m_pfcmpgt(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfcmpgt
+  // PS4-LABEL: define i64 @test_m_pfcmpgt
+  // GCC-LABEL: define double @test_m_pfcmpgt
   // CHECK: @llvm.x86.3dnow.pfcmpgt
   return _m_pfcmpgt(m1, m2);
 }
 
 __m64 test_m_pfmax(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfmax
+  // PS4-LABEL: define i64 @test_m_pfmax
+  // GCC-LABEL: define double @test_m_pfmax
   // CHECK: @llvm.x86.3dnow.pfmax
   return _m_pfmax(m1, m2);
 }
 
 __m64 test_m_pfmin(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfmin
+  // PS4-LABEL: define i64 @test_m_pfmin
+  // GCC-LABEL: define double @test_m_pfmin
   // CHECK: @llvm.x86.3dnow.pfmin
   return _m_pfmin(m1, m2);
 }
 
 __m64 test_m_pfmul(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfmul
+  // PS4-LABEL: define i64 @test_m_pfmul
+  // GCC-LABEL: define double @test_m_pfmul
   // CHECK: @llvm.x86.3dnow.pfmul
   return _m_pfmul(m1, m2);
 }
 
 __m64 test_m_pfrcp(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pfrcp
+  // PS4-LABEL: define i64 @test_m_pfrcp
+  // GCC-LABEL: define double @test_m_pfrcp
   // CHECK: @llvm.x86.3dnow.pfrcp
   return _m_pfrcp(m);
 }
 
 __m64 test_m_pfrcpit1(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfrcpit1
+  // PS4-LABEL: define i64 @test_m_pfrcpit1
+  // GCC-LABEL: define double @test_m_pfrcpit1
   // CHECK: @llvm.x86.3dnow.pfrcpit1
   return _m_pfrcpit1(m1, m2);
 }
 
 __m64 test_m_pfrcpit2(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfrcpit2
+  // PS4-LABEL: define i64 @test_m_pfrcpit2
+  // GCC-LABEL: define double @test_m_pfrcpit2
   // CHECK: @llvm.x86.3dnow.pfrcpit2
   return _m_pfrcpit2(m1, m2);
 }
 
 __m64 test_m_pfrsqrt(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pfrsqrt
+  // PS4-LABEL: define i64 @test_m_pfrsqrt
+  // GCC-LABEL: define double @test_m_pfrsqrt
   // CHECK: @llvm.x86.3dnow.pfrsqrt
   return _m_pfrsqrt(m);
 }
 
 __m64 test_m_pfrsqrtit1(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfrsqrtit1
+  // PS4-LABEL: define i64 @test_m_pfrsqrtit1
+  // GCC-LABEL: define double @test_m_pfrsqrtit1
   // CHECK: @llvm.x86.3dnow.pfrsqit1
   return _m_pfrsqrtit1(m1, m2);
 }
 
 __m64 test_m_pfsub(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfsub
+  // PS4-LABEL: define i64 @test_m_pfsub
+  // GCC-LABEL: define double @test_m_pfsub
   // CHECK: @llvm.x86.3dnow.pfsub
   return _m_pfsub(m1, m2);
 }
 
 __m64 test_m_pfsubr(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfsubr
+  // PS4-LABEL: define i64 @test_m_pfsubr
+  // GCC-LABEL: define double @test_m_pfsubr
   // CHECK: @llvm.x86.3dnow.pfsubr
   return _m_pfsubr(m1, m2);
 }
 
 __m64 test_m_pi2fd(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pi2fd
+  // PS4-LABEL: define i64 @test_m_pi2fd
+  // GCC-LABEL: define double @test_m_pi2fd
   // CHECK: @llvm.x86.3dnow.pi2fd
   return _m_pi2fd(m);
 }
 
 __m64 test_m_pmulhrw(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pmulhrw
+  // PS4-LABEL: define i64 @test_m_pmulhrw
+  // GCC-LABEL: define double @test_m_pmulhrw
   // CHECK: @llvm.x86.3dnow.pmulhrw
   return _m_pmulhrw(m1, m2);
 }
 
 __m64 test_m_pf2iw(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pf2iw
+  // PS4-LABEL: define i64 @test_m_pf2iw
+  // GCC-LABEL: define double @test_m_pf2iw
   // CHECK: @llvm.x86.3dnowa.pf2iw
   return _m_pf2iw(m);
 }
 
 __m64 test_m_pfnacc(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfnacc
+  // PS4-LABEL: define i64 @test_m_pfnacc
+  // GCC-LABEL: define double @test_m_pfnacc
   // CHECK: @llvm.x86.3dnowa.pfnacc
   return _m_pfnacc(m1, m2);
 }
 
 __m64 test_m_pfpnacc(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfpnacc
+  // PS4-LABEL: define i64 @test_m_pfpnacc
+  // GCC-LABEL: define double @test_m_pfpnacc
   // CHECK: @llvm.x86.3dnowa.pfpnacc
   return _m_pfpnacc(m1, m2);
 }
 
 __m64 test_m_pi2fw(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pi2fw
+  // PS4-LABEL: define i64 @test_m_pi2fw
+  // GCC-LABEL: define double @test_m_pi2fw
   // CHECK: @llvm.x86.3dnowa.pi2fw
   return _m_pi2fw(m);
 }
 
 __m64 test_m_pswapdsf(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pswapdsf
+  // PS4-LABEL: define i64 @test_m_pswapdsf
+  // GCC-LABEL: define double @test_m_pswapdsf
   // CHECK: @llvm.x86.3dnowa.pswapd
   return _m_pswapdsf(m);
 }
 
 __m64 test_m_pswapdsi(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pswapdsi
+  // PS4-LABEL: define i64 @test_m_pswapdsi
+  // GCC-LABEL: define double @test_m_pswapdsi
   // CHECK: @llvm.x86.3dnowa.pswapd
   return _m_pswapdsi(m);
 }
diff --git a/test/CodeGen/CFStrings.c b/test/CodeGen/CFStrings.c
new file mode 100644
index 0000000..4edb5ff
--- /dev/null
+++ b/test/CodeGen/CFStrings.c
@@ -0,0 +1,59 @@
+// REQUIRES: arm-registered-target,x86-registered-target
+
+// RUN: %clang_cc1 -triple thumbv7-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple i686-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple x86_64-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+
+// RUN: %clang_cc1 -triple armv7-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
+// RUN: %clang_cc1 -triple i686-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
+// RUN: %clang_cc1 -triple x86_64-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF64
+// RUN: %clang_cc1 -triple armv7-elf -S %s -o - | FileCheck %s -check-prefix CHECK-ELF-DATA-SECTION
+
+// RUN: %clang_cc1 -triple armv7-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
+// RUN: %clang_cc1 -triple i386-apple-macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
+// RUN: %clang_cc1 -triple x86_64-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO64
+
+// RUN: %clang_cc1 -triple thumbv7-windows -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-COFF
+// RUN: %clang_cc1 -triple thumbv7-elf -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-ELF
+// RUN: %clang_cc1 -triple thumbv7-macho -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-MACHO
+
+typedef struct __CFString *CFStringRef;
+const CFStringRef one = (CFStringRef)__builtin___CFStringMakeConstantString("one");
+const CFStringRef two = (CFStringRef)__builtin___CFStringMakeConstantString("\xef\xbf\xbd\x74\xef\xbf\xbd\x77\xef\xbf\xbd\x6f");
+
+// CHECK-COFF: @.str = private unnamed_addr constant [4 x i8] c"one\00", align 1
+// CHECK-ELF: @.str = private unnamed_addr constant [4 x i8] c"one\00", align 1
+// CHECK-MACHO: @.str = private unnamed_addr constant [4 x i8] c"one\00", section "__TEXT,__cstring,cstring_literals", align 1
+
+// CHECK-COFF: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align {{[48]}}
+// CHECK-ELF32: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align 4
+// CHECK-ELF64: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "cfstring", align 8
+// CHECK-MACHO32: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "__DATA,__cfstring", align 4
+// CHECK-MACHO64: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "__DATA,__cfstring", align 8
+
+// CHECK-COFF: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], align 2
+// CHECK-ELF: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], align 2
+// CHECK-MACHO: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], section "__TEXT,__ustring", align 2
+
+// CHECK-COFF: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align {{[48]}}
+// CHECK-ELF32: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align 4
+// CHECK-ELF64: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "cfstring", align 8
+// CHECK-MACHO32: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "__DATA,__cfstring", align 4
+// CHECK-MACHO64: @_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "__DATA,__cfstring", align 8
+
+// CHECK-ELF-DATA-SECTION: .section .rodata.str1.1
+// CHECK-ELF-DATA-SECTION: .asciz "one"
+
+// CHECK-ELF-DATA-SECTION: .section .rodata.str2.2
+// CHECK-ELF-DATA-SECTION: .short 65533
+// CHECK-ELF-DATA-SECTION: .short 116
+// CHECK-ELF-DATA-SECTION: .short 65533
+// CHECK-ELF-DATA-SECTION: .short 119
+// CHECK-ELF-DATA-SECTION: .short 65533
+// CHECK-ELF-DATA-SECTION: .short 111
+// CHECK-ELF-DATA-SECTION: .short 0
+
+// CHECK-ASM-COFF: .section cfstring,"dw"
+// CHECK-ASM-ELF: .section cfstring,"aw"
+// CHECK-ASM-MACHO: .section __DATA,__cfstring
+
diff --git a/test/CodeGen/Inputs/pgo-sample.prof b/test/CodeGen/Inputs/pgo-sample.prof
new file mode 100644
index 0000000..c5b8d9e
--- /dev/null
+++ b/test/CodeGen/Inputs/pgo-sample.prof
@@ -0,0 +1,2 @@
+bar:100:100
+ 1: 2000
diff --git a/test/CodeGen/Inputs/pgotestclang.profraw b/test/CodeGen/Inputs/pgotestclang.profraw
new file mode 100644
index 0000000..401ba07
--- /dev/null
+++ b/test/CodeGen/Inputs/pgotestclang.profraw
@@ -0,0 +1 @@
+:fe
diff --git a/test/CodeGen/Inputs/pgotestir.profraw b/test/CodeGen/Inputs/pgotestir.profraw
new file mode 100644
index 0000000..04a7c1c
--- /dev/null
+++ b/test/CodeGen/Inputs/pgotestir.profraw
@@ -0,0 +1 @@
+:ir
diff --git a/test/CodeGen/Inputs/thinlto_backend.ll b/test/CodeGen/Inputs/thinlto_backend.ll
new file mode 100644
index 0000000..78678c0
--- /dev/null
+++ b/test/CodeGen/Inputs/thinlto_backend.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f2() {
+  ret void
+}
diff --git a/test/CodeGen/aarch64-fix-cortex-a53-835769.c b/test/CodeGen/aarch64-fix-cortex-a53-835769.c
index 7ad1240..c6a38b2 100644
--- a/test/CodeGen/aarch64-fix-cortex-a53-835769.c
+++ b/test/CodeGen/aarch64-fix-cortex-a53-835769.c
@@ -23,5 +23,5 @@
 
 // CHECK: ldr
 // CHECK-YES-NEXT: nop
-// CHECK-NO-NEXT-NOT: nop
+// CHECK-NO-NOT: nop
 // CHECK-NEXT: madd
diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c
index fa910ff..2866990 100644
--- a/test/CodeGen/aarch64-neon-2velem.c
+++ b/test/CodeGen/aarch64-neon-2velem.c
@@ -1,2452 +1,4432 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: @test_vmla_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16
   return vmla_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16
   return vmlaq_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmla_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32
   return vmla_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32
   return vmlaq_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16
   return vmla_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16
   return vmlaq_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32
   return vmla_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32
   return vmlaq_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmls_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16
   return vmls_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16
   return vmlsq_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmls_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32
   return vmls_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32
   return vmlsq_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16
   return vmls_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16
   return vmlsq_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32
   return vmls_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32
   return vmlsq_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmul_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16
   return vmul_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16
   return vmulq_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmul_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32
   return vmul_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32
   return vmulq_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmul_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16
   return vmul_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16
   return vmulq_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmul_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32
   return vmul_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32
   return vmulq_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16
   return vmul_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16
   return vmulq_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32
   return vmul_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32
   return vmulq_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16
   return vmul_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16
   return vmulq_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32
   return vmul_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32
   return vmulq_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfma_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32
   return vfma_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vfmaq_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32
   return vfmaq_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vfma_laneq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32
   return vfma_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfmaq_laneq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32
   return vfmaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfms_lane_f32(
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32
   return vfms_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vfmsq_lane_f32(
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32
   return vfmsq_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vfms_laneq_f32(
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32
   return vfms_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfmsq_laneq_f32(
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32
   return vfmsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfmaq_lane_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK:   ret <2 x double> [[FMLA2]]
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f64
   return vfmaq_lane_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vfmaq_laneq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64
   return vfmaq_laneq_f64(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: @test_vfmsq_lane_f64(
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK:   ret <2 x double> [[FMLA2]]
 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f64
   return vfmsq_lane_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vfmsq_laneq_f64(
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64
   return vfmsq_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: @test_vfmas_laneq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmas_laneq_f32
   return vfmas_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfmsd_lane_f64(
+// CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsd_lane_f64
   return vfmsd_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+\.d\[0\]|fmsub d[0-9]+, d[0-9]+, d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vfmss_laneq_f32(
+// CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmss_laneq_f32
   return vfmss_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vfmsd_laneq_f64(
+// CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsd_laneq_f64
   return vfmsd_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16
   return vmlal_lane_s16(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s32
   return vmlal_lane_s32(a, b, v, 1);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s16
   return vmlal_laneq_s16(a, b, v, 7);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s32
   return vmlal_laneq_s32(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s16
   return vmlal_high_lane_s16(a, b, v, 3);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s32
   return vmlal_high_lane_s32(a, b, v, 1);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s16
   return vmlal_high_laneq_s16(a, b, v, 7);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s32
   return vmlal_high_laneq_s32(a, b, v, 3);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s16
   return vmlsl_lane_s16(a, b, v, 3);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s32
   return vmlsl_lane_s32(a, b, v, 1);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s16
   return vmlsl_laneq_s16(a, b, v, 7);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s32
   return vmlsl_laneq_s32(a, b, v, 3);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s16
   return vmlsl_high_lane_s16(a, b, v, 3);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s32
   return vmlsl_high_lane_s32(a, b, v, 1);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s16
   return vmlsl_high_laneq_s16(a, b, v, 7);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s32
   return vmlsl_high_laneq_s32(a, b, v, 3);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u16
   return vmlal_lane_u16(a, b, v, 3);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u32
   return vmlal_lane_u32(a, b, v, 1);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u16
   return vmlal_laneq_u16(a, b, v, 7);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u32
   return vmlal_laneq_u32(a, b, v, 3);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u16
   return vmlal_high_lane_u16(a, b, v, 3);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u32
   return vmlal_high_lane_u32(a, b, v, 1);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u16
   return vmlal_high_laneq_u16(a, b, v, 7);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u32
   return vmlal_high_laneq_u32(a, b, v, 3);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u16
   return vmlsl_lane_u16(a, b, v, 3);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u32
   return vmlsl_lane_u32(a, b, v, 1);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u16
   return vmlsl_laneq_u16(a, b, v, 7);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u32
   return vmlsl_laneq_u32(a, b, v, 3);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u16
   return vmlsl_high_lane_u16(a, b, v, 3);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u32
   return vmlsl_high_lane_u32(a, b, v, 1);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u16
   return vmlsl_high_laneq_u16(a, b, v, 7);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u32
   return vmlsl_high_laneq_u32(a, b, v, 3);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmull_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_s16
   return vmull_lane_s16(a, v, 3);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmull_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_s32
   return vmull_lane_s32(a, v, 1);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmull_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_u16
   return vmull_lane_u16(a, v, 3);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmull_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_u32
   return vmull_lane_u32(a, v, 1);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s16
   return vmull_high_lane_s16(a, v, 3);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s32
   return vmull_high_lane_s32(a, v, 1);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u16
   return vmull_high_lane_u16(a, v, 3);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u32
   return vmull_high_lane_u32(a, v, 1);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s16
   return vmull_laneq_s16(a, v, 7);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s32
   return vmull_laneq_s32(a, v, 3);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u16
   return vmull_laneq_u16(a, v, 7);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u32
   return vmull_laneq_u32(a, v, 3);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s16
   return vmull_high_laneq_s16(a, v, 7);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s32
   return vmull_high_laneq_s32(a, v, 3);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u16
   return vmull_high_laneq_u16(a, v, 7);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u32
   return vmull_high_laneq_u32(a, v, 3);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmlal_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s16
   return vqdmlal_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmlal_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s32
   return vqdmlal_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s16
   return vqdmlal_high_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s32
   return vqdmlal_high_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s16
   return vqdmlsl_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s32
   return vqdmlsl_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s16
   return vqdmlsl_high_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s32
   return vqdmlsl_high_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmull_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s16
   return vqdmull_lane_s16(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmull_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s32
   return vqdmull_lane_s32(a, v, 1);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmull_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s16
   return vqdmull_laneq_s16(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmull_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s32
   return vqdmull_laneq_s32(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_lane_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s16
   return vqdmull_high_lane_s16(a, v, 3);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_lane_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s32
   return vqdmull_high_lane_s32(a, v, 1);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s16
   return vqdmull_high_laneq_s16(a, v, 7);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s32
   return vqdmull_high_laneq_s32(a, v, 3);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmulh_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s16
   return vqdmulh_lane_s16(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s16
   return vqdmulhq_lane_s16(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqdmulh_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s32
   return vqdmulh_lane_s32(a, v, 1);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s32
   return vqdmulhq_lane_s32(a, v, 1);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s16
   return vqrdmulh_lane_s16(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s16
   return vqrdmulhq_lane_s16(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s32
   return vqrdmulh_lane_s32(a, v, 1);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s32
   return vqrdmulhq_lane_s32(a, v, 1);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmul_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_f32
   return vmul_lane_f32(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmul_lane_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 
 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmul_lane_f64
   return vmul_lane_f64(a, v, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmulq_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f32
   return vmulq_lane_f32(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_f64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f64
   return vmulq_lane_f64(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f32
   return vmul_laneq_f32(a, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f64
   return vmul_laneq_f64(a, v, 1);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 
 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f32
   return vmulq_laneq_f32(a, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_f64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f64
   return vmulq_laneq_f64(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: @test_vmulx_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulx_lane_f32
   return vmulx_lane_f32(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmulxq_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f32
   return vmulxq_lane_f32(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmulxq_lane_f64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f64
   return vmulxq_lane_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmulx_laneq_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulx_laneq_f32
   return vmulx_laneq_f32(a, v, 3);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmulxq_laneq_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f32
   return vmulxq_laneq_f32(a, v, 3);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmulxq_laneq_f64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f64
   return vmulxq_laneq_f64(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: @test_vmla_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16_0
   return vmla_lane_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16_0
   return vmlaq_lane_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmla_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32_0
   return vmla_lane_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32_0
   return vmlaq_lane_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16_0
   return vmla_laneq_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16_0
   return vmlaq_laneq_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32_0
   return vmla_laneq_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32_0
   return vmlaq_laneq_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmls_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16_0
   return vmls_lane_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16_0
   return vmlsq_lane_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmls_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32_0
   return vmls_lane_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32_0
   return vmlsq_lane_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16_0
   return vmls_laneq_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16_0
   return vmlsq_laneq_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32_0
   return vmls_laneq_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32_0
   return vmlsq_laneq_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16_0
   return vmul_lane_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16_0
   return vmulq_lane_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmul_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32_0
   return vmul_lane_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32_0
   return vmulq_lane_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16_0
   return vmul_lane_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16_0
   return vmulq_lane_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmul_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32_0
   return vmul_lane_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32_0
   return vmulq_lane_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16_0
   return vmul_laneq_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16_0
   return vmulq_laneq_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32_0
   return vmul_laneq_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32_0
   return vmulq_laneq_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16_0
   return vmul_laneq_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16_0
   return vmulq_laneq_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32_0
   return vmul_laneq_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32_0
   return vmulq_laneq_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfma_lane_f32_0(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32_0
   return vfma_lane_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmaq_lane_f32_0(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32_0
   return vfmaq_lane_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfma_laneq_f32_0(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32_0
   return vfma_laneq_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32_0
   return vfmaq_laneq_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfms_lane_f32_0(
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32_0
   return vfms_lane_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmsq_lane_f32_0(
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32_0
   return vfmsq_lane_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfms_laneq_f32_0(
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32_0
   return vfms_laneq_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32_0
   return vfmsq_laneq_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64_0
   return vfmaq_laneq_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64_0
   return vfmsq_laneq_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16_0
   return vmlal_lane_s16(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s32_0
   return vmlal_lane_s32(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s16_0
   return vmlal_laneq_s16(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s32_0
   return vmlal_laneq_s32(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s16_0
   return vmlal_high_lane_s16(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s32_0
   return vmlal_high_lane_s32(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s16_0
   return vmlal_high_laneq_s16(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s32_0
   return vmlal_high_laneq_s32(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s16_0
   return vmlsl_lane_s16(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s32_0
   return vmlsl_lane_s32(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s16_0
   return vmlsl_laneq_s16(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s32_0
   return vmlsl_laneq_s32(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s16_0
   return vmlsl_high_lane_s16(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s32_0
   return vmlsl_high_lane_s32(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s16_0
   return vmlsl_high_laneq_s16(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s32_0
   return vmlsl_high_laneq_s32(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u16_0
   return vmlal_lane_u16(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u32_0
   return vmlal_lane_u32(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u16_0
   return vmlal_laneq_u16(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u32_0
   return vmlal_laneq_u32(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u16_0
   return vmlal_high_lane_u16(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u32_0
   return vmlal_high_lane_u32(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u16_0
   return vmlal_high_laneq_u16(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u32_0
   return vmlal_high_laneq_u32(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u16_0
   return vmlsl_lane_u16(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u32_0
   return vmlsl_lane_u32(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u16_0
   return vmlsl_laneq_u16(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u32_0
   return vmlsl_laneq_u32(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u16_0
   return vmlsl_high_lane_u16(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u32_0
   return vmlsl_high_lane_u32(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u16_0
   return vmlsl_high_laneq_u16(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u32_0
   return vmlsl_high_laneq_u32(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_s16_0
   return vmull_lane_s16(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_s32_0
   return vmull_lane_s32(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_u16_0
   return vmull_lane_u16(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_u32_0
   return vmull_lane_u32(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s16_0
   return vmull_high_lane_s16(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s32_0
   return vmull_high_lane_s32(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_u16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u16_0
   return vmull_high_lane_u16(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_lane_u32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u32_0
   return vmull_high_lane_u32(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s16_0
   return vmull_laneq_s16(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s32_0
   return vmull_laneq_s32(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u16_0
   return vmull_laneq_u16(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u32_0
   return vmull_laneq_u32(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s16_0
   return vmull_high_laneq_s16(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s32_0
   return vmull_high_laneq_s32(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_u16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u16_0
   return vmull_high_laneq_u16(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_laneq_u32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u32_0
   return vmull_high_laneq_u32(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s16_0
   return vqdmlal_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s32_0
   return vqdmlal_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s16_0
   return vqdmlal_high_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s32_0
   return vqdmlal_high_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s16_0
   return vqdmlsl_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s32_0
   return vqdmlsl_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s16_0
   return vqdmlsl_high_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s32_0
   return vqdmlsl_high_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s16_0
   return vqdmull_lane_s16(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s32_0
   return vqdmull_lane_s32(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s16_0
   return vqdmull_laneq_s16(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s32_0
   return vqdmull_laneq_s32(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s16_0
   return vqdmull_high_lane_s16(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s32_0
   return vqdmull_high_lane_s32(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s16_0
   return vqdmull_high_laneq_s16(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s32_0
   return vqdmull_high_laneq_s32(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s16_0
   return vqdmulh_lane_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s16_0
   return vqdmulhq_lane_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s32_0
   return vqdmulh_lane_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s32_0
   return vqdmulhq_lane_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s16_0
   return vqrdmulh_lane_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s16_0
   return vqrdmulhq_lane_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s32_0
   return vqrdmulh_lane_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s32_0
   return vqrdmulhq_lane_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_lane_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_f32_0
   return vmul_lane_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_lane_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f32_0
   return vmulq_lane_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f32_0
   return vmul_laneq_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_laneq_f64_0(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f64_0
   return vmul_laneq_f64(a, v, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f32_0
   return vmulq_laneq_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_laneq_f64_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f64_0
   return vmulq_laneq_f64(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmulx_lane_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulx_lane_f32_0
   return vmulx_lane_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulxq_lane_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f32_0
   return vmulxq_lane_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulxq_lane_f64_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f64_0
   return vmulxq_lane_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmulx_laneq_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulx_laneq_f32_0
   return vmulx_laneq_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulxq_laneq_f32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f32_0
   return vmulxq_laneq_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulxq_laneq_f64_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f64_0
   return vmulxq_laneq_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vmull_high_n_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vmull_high_n_s16
   return vmull_high_n_s16(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vmull_high_n_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vmull_high_n_s32
   return vmull_high_n_s32(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vmull_high_n_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmull_high_n_u16
   return vmull_high_n_u16(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vmull_high_n_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmull_high_n_u32
   return vmull_high_n_u32(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vqdmull_high_n_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V5_I_I]]
 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmull_high_n_s16
   return vqdmull_high_n_s16(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vqdmull_high_n_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V3_I_I]]
 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmull_high_n_s32
   return vqdmull_high_n_s32(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vmlal_high_n_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_s16
   return vmlal_high_n_s16(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vmlal_high_n_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_s32
   return vmlal_high_n_s32(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vmlal_high_n_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_u16
   return vmlal_high_n_u16(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vmlal_high_n_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_u32
   return vmlal_high_n_u32(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_n_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_n_s16
   return vqdmlal_high_n_s16(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_n_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_n_s32
   return vqdmlal_high_n_s32(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vmlsl_high_n_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_s16
   return vmlsl_high_n_s16(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vmlsl_high_n_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_s32
   return vmlsl_high_n_s32(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vmlsl_high_n_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_u16
   return vmlsl_high_n_u16(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vmlsl_high_n_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_u32
   return vmlsl_high_n_u32(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
+// CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_n_s16
   return vqdmlsl_high_n_s16(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
+// CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_n_s32
   return vqdmlsl_high_n_s32(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: @test_vmul_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
-  // CHECK-LABEL: test_vmul_n_f32
   return vmul_n_f32(a, b);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
-  // CHECK-LABEL: test_vmulq_n_f32
   return vmulq_n_f32(a, b);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmulq_n_f64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x double> [[MUL_I]]
 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
-  // CHECK-LABEL: test_vmulq_n_f64
   return vmulq_n_f64(a, b);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: @test_vfma_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a) #2
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
-  // CHECK-LABEL: test_vfma_n_f32
   return vfma_n_f32(a, b, n);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmaq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a) #2
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
-  // CHECK-LABEL: test_vfmaq_n_f32
   return vfmaq_n_f32(a, b, n);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfms_n_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> [[VECINIT1_I]], <2 x float> %a) #2
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
-  // CHECK-LABEL: test_vfms_n_f32
   return vfms_n_f32(a, b, n);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vfmsq_n_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> [[VECINIT3_I]], <4 x float> %a) #2
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
-  // CHECK-LABEL: test_vfmsq_n_f32
   return vfmsq_n_f32(a, b, n);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmul_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vmul_n_s16
   return vmul_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmulq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vmulq_n_s16
   return vmulq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmul_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vmul_n_s32
   return vmul_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmulq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vmulq_n_s32
   return vmulq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmul_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmul_n_u16
   return vmul_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmulq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmulq_n_u16
   return vmulq_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmul_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmul_n_u32
   return vmul_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmulq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmulq_n_u32
   return vmulq_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmull_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vmull_n_s16
   return vmull_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmull_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vmull_n_s32
   return vmull_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmull_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmull_n_u16
   return vmull_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmull_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmull_n_u32
   return vmull_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmull_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V5_I]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmull_n_s16
   return vqdmull_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmull_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V3_I]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmull_n_s32
   return vqdmull_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmulh_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V5_I]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmulh_n_s16
   return vqdmulh_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmulhq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #2
+// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V9_I]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmulhq_n_s16
   return vqdmulhq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqdmulh_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V3_I]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmulh_n_s32
   return vqdmulh_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmulhq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #2
+// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V5_I]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmulhq_n_s32
   return vqdmulhq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqrdmulh_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V5_I]]
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqrdmulh_n_s16
   return vqrdmulh_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #2
+// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V9_I]]
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqrdmulhq_n_s16
   return vqrdmulhq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqrdmulh_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V3_I]]
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqrdmulh_n_s32
   return vqrdmulh_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #2
+// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V5_I]]
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqrdmulhq_n_s32
   return vqrdmulhq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmla_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmla_n_s16
   return vmla_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlaq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlaq_n_s16
   return vmlaq_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmla_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmla_n_s32
   return vmla_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlaq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_s32
   return vmlaq_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmla_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmla_n_u16
   return vmla_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlaq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlaq_n_u16
   return vmlaq_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmla_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmla_n_u32
   return vmla_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlaq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_u32
   return vmlaq_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlal_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlal_n_s16
   return vmlal_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlal_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlal_n_s32
   return vmlal_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlal_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlal_n_u16
   return vmlal_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlal_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlal_n_u32
   return vmlal_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmlal_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlal_n_s16
   return vqdmlal_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmlal_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlal_n_s32
   return vqdmlal_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmls_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmls_n_s16
   return vmls_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlsq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsq_n_s16
   return vmlsq_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmls_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmls_n_s32
   return vmls_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlsq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_s32
   return vmlsq_n_s32(a, b, c);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmls_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmls_n_u16
   return vmls_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlsq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsq_n_u16
   return vmlsq_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmls_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmls_n_u32
   return vmls_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlsq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_u32
   return vmlsq_n_u32(a, b, c);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlsl_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsl_n_s16
   return vmlsl_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlsl_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsl_n_s32
   return vmlsl_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlsl_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsl_n_u16
   return vmlsl_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmlsl_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsl_n_u32
   return vmlsl_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmlsl_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
+// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlsl_n_s16
   return vqdmlsl_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmlsl_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
+// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlsl_n_s32
   return vqdmlsl_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmla_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_u16_0
   return vmla_lane_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u16_0
   return vmlaq_lane_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmla_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_u32_0
   return vmla_lane_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u32_0
   return vmlaq_lane_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u16_0
   return vmla_laneq_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u16_0
   return vmlaq_laneq_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u32_0
   return vmla_laneq_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u32_0
   return vmlaq_laneq_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s16_0
   return vqdmlal_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s32_0
   return vqdmlal_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s16_0
   return vqdmlal_high_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s32_0
   return vqdmlal_high_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmls_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_u16_0
   return vmls_lane_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u16_0
   return vmlsq_lane_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmls_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_u32_0
   return vmls_lane_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u32_0
   return vmlsq_lane_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u16_0
   return vmls_laneq_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_u16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u16_0
   return vmlsq_laneq_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u32_0
   return vmls_laneq_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_u32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u32_0
   return vmlsq_laneq_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s16_0
   return vqdmlsl_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s32_0
   return vqdmlsl_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s16_0
   return vqdmlsl_high_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s32_0
   return vqdmlsl_high_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s16_0
   return vqdmulh_laneq_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s16_0
   return vqdmulhq_laneq_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s32_0
   return vqdmulh_laneq_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s32_0
   return vqdmulhq_laneq_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s16_0
   return vqrdmulh_laneq_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s16_0
   return vqrdmulhq_laneq_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s32_0
   return vqrdmulh_laneq_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s32_0
   return vqrdmulhq_laneq_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: @test_vmla_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_u16
   return vmla_lane_u16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u16
   return vmlaq_lane_u16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmla_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_u32
   return vmla_lane_u32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlaq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u32
   return vmlaq_lane_u32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u16
   return vmla_laneq_u16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u16
   return vmlaq_laneq_u16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmla_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u32
   return vmla_laneq_u32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlaq_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u32
   return vmlaq_laneq_u32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmlal_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s16
   return vqdmlal_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmlal_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s32
   return vqdmlal_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s16
   return vqdmlal_high_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s32
   return vqdmlal_high_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmls_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_u16
   return vmls_lane_u16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u16
   return vmlsq_lane_u16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: @test_vmls_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_u32
   return vmls_lane_u32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmlsq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u32
   return vmlsq_lane_u32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u16
   return vmls_laneq_u16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u16
   return vmlsq_laneq_u16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vmls_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u32
   return vmls_laneq_u32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vmlsq_laneq_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u32
   return vmlsq_laneq_u32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s16
   return vqdmlsl_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s32
   return vqdmlsl_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s16
   return vqdmlsl_high_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s32
   return vqdmlsl_high_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmulh_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s16
   return vqdmulh_laneq_s16(a, v, 7);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s16
   return vqdmulhq_laneq_s16(a, v, 7);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqdmulh_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s32
   return vqdmulh_laneq_s32(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s32
   return vqdmulhq_laneq_s32(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s16
   return vqrdmulh_laneq_s16(a, v, 7);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s16
   return vqrdmulhq_laneq_s16(a, v, 7);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s32
   return vqrdmulh_laneq_s32(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s32
   return vqrdmulhq_laneq_s32(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
-
diff --git a/test/CodeGen/aarch64-neon-3v.c b/test/CodeGen/aarch64-neon-3v.c
index ca32652..3581f78 100644
--- a/test/CodeGen/aarch64-neon-3v.c
+++ b/test/CodeGen/aarch64-neon-3v.c
@@ -1,486 +1,597 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vand_s8
   return vand_s8(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vandq_s8
   return vandq_s8(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vand_s16
   return vand_s16(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vandq_s16
   return vandq_s16(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vand_s32
   return vand_s32(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vandq_s32
   return vandq_s32(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vand_s64
   return vand_s64(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vandq_s64
   return vandq_s64(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vand_u8
   return vand_u8(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vandq_u8
   return vandq_u8(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vand_u16
   return vand_u16(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vandq_u16
   return vandq_u16(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vand_u32
   return vand_u32(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vandq_u32
   return vandq_u32(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vand_u64
   return vand_u64(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vandq_u64
   return vandq_u64(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vorr_s8
   return vorr_s8(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vorrq_s8
   return vorrq_s8(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vorr_s16
   return vorr_s16(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vorrq_s16
   return vorrq_s16(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vorr_s32
   return vorr_s32(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vorrq_s32
   return vorrq_s32(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vorr_s64
   return vorr_s64(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vorrq_s64
   return vorrq_s64(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vorr_u8
   return vorr_u8(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vorrq_u8
   return vorrq_u8(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vorr_u16
   return vorr_u16(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vorrq_u16
   return vorrq_u16(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vorr_u32
   return vorr_u32(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vorrq_u32
   return vorrq_u32(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vorr_u64
   return vorr_u64(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vorrq_u64
   return vorrq_u64(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_veor_s8
   return veor_s8(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_veorq_s8
   return veorq_s8(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_veor_s16
   return veor_s16(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_veorq_s16
   return veorq_s16(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_veor_s32
   return veor_s32(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_veorq_s32
   return veorq_s32(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_veor_s64
   return veor_s64(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_veorq_s64
   return veorq_s64(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_veor_u8
   return veor_u8(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_veorq_u8
   return veorq_u8(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_veor_u16
   return veor_u16(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_veorq_u16
   return veorq_u16(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_veor_u32
   return veor_u32(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_veorq_u32
   return veorq_u32(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_veor_u64
   return veor_u64(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_veorq_u64
   return veorq_u64(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vbic_s8
   return vbic_s8(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vbicq_s8
   return vbicq_s8(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vbic_s16
   return vbic_s16(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vbicq_s16
   return vbicq_s16(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vbic_s32
   return vbic_s32(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vbicq_s32
   return vbicq_s32(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vbic_s64
   return vbic_s64(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vbicq_s64
   return vbicq_s64(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vbic_u8
   return vbic_u8(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vbicq_u8
   return vbicq_u8(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vbic_u16
   return vbic_u16(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vbicq_u16
   return vbicq_u16(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vbic_u32
   return vbic_u32(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vbicq_u32
   return vbicq_u32(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vbic_u64
   return vbic_u64(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vbicq_u64
   return vbicq_u64(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vorn_s8
   return vorn_s8(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vornq_s8
   return vornq_s8(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vorn_s16
   return vorn_s16(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vornq_s16
   return vornq_s16(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vorn_s32
   return vorn_s32(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vornq_s32
   return vornq_s32(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vorn_s64
   return vorn_s64(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vornq_s64
   return vornq_s64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vorn_u8
   return vorn_u8(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vornq_u8
   return vornq_u8(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vorn_u16
   return vorn_u16(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vornq_u16
   return vornq_u16(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vorn_u32
   return vorn_u32(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vornq_u32
   return vornq_u32(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vorn_u64
   return vorn_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vornq_u64
   return vornq_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
diff --git a/test/CodeGen/aarch64-neon-across.c b/test/CodeGen/aarch64-neon-across.c
index 00eb2e4..6d7a0d5 100644
--- a/test/CodeGen/aarch64-neon-across.c
+++ b/test/CodeGen/aarch64-neon-across.c
@@ -1,271 +1,342 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define i16 @test_vaddlv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 int16_t test_vaddlv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_s8
   return vaddlv_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 int32_t test_vaddlv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_s16
   return vaddlv_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i16 @test_vaddlv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 uint16_t test_vaddlv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_u8
   return vaddlv_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 uint32_t test_vaddlv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_u16
   return vaddlv_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i16 @test_vaddlvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 int16_t test_vaddlvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_s8
   return vaddlvq_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 int32_t test_vaddlvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_s16
   return vaddlvq_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i64 [[VADDLVQ_S32_I]]
 int64_t test_vaddlvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_s32
   return vaddlvq_s32(a);
-  // CHECK: saddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i16 @test_vaddlvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 uint16_t test_vaddlvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_u8
   return vaddlvq_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 uint32_t test_vaddlvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_u16
   return vaddlvq_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i64 [[VADDLVQ_U32_I]]
 uint64_t test_vaddlvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_u32
   return vaddlvq_u32(a);
-  // CHECK: uaddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vmaxv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vmaxv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_s8
   return vmaxv_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vmaxv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_s16
   return vmaxv_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vmaxv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vmaxv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_u8
   return vmaxv_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vmaxv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_u16
   return vmaxv_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vmaxvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vmaxvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmaxvq_s8
   return vmaxvq_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vmaxvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmaxvq_s16
   return vmaxvq_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i32 [[VMAXVQ_S32_I]]
 int32_t test_vmaxvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_s32
   return vmaxvq_s32(a);
-  // CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vmaxvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vmaxvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmaxvq_u8
   return vmaxvq_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vmaxvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmaxvq_u16
   return vmaxvq_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i32 [[VMAXVQ_U32_I]]
 uint32_t test_vmaxvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_u32
   return vmaxvq_u32(a);
-  // CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vminv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vminv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vminv_s8
   return vminv_s8(a);
-  // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vminv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vminv_s16
   return vminv_s16(a);
-  // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vminv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vminv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vminv_u8
   return vminv_u8(a);
-  // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vminv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vminv_u16
   return vminv_u16(a);
-  // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vminvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vminvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vminvq_s8
   return vminvq_s8(a);
-  // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vminvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vminvq_s16
   return vminvq_s16(a);
-  // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i32 [[VMINVQ_S32_I]]
 int32_t test_vminvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vminvq_s32
   return vminvq_s32(a);
-  // CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vminvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vminvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vminvq_u8
   return vminvq_u8(a);
-  // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vminvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vminvq_u16
   return vminvq_u16(a);
-  // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i32 [[VMINVQ_U32_I]]
 uint32_t test_vminvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vminvq_u32
   return vminvq_u32(a);
-  // CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vaddv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vaddv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddv_s8
   return vaddv_s8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vaddv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddv_s16
   return vaddv_s16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vaddv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vaddv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddv_u8
   return vaddv_u8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vaddv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddv_u16
   return vaddv_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vaddvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vaddvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddvq_s8
   return vaddvq_s8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vaddvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_s16
   return vaddvq_s16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i32 [[VADDVQ_S32_I]]
 int32_t test_vaddvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_s32
   return vaddvq_s32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vaddvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vaddvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddvq_u8
   return vaddvq_u8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vaddvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_u16
   return vaddvq_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a) #2
+// CHECK:   ret i32 [[VADDVQ_U32_I]]
 uint32_t test_vaddvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_u32
   return vaddvq_u32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a) #2
+// CHECK:   ret float [[VMAXVQ_F32_I]]
 float32_t test_vmaxvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_f32
   return vmaxvq_f32(a);
-  // CHECK: fmaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a) #2
+// CHECK:   ret float [[VMINVQ_F32_I]]
 float32_t test_vminvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminvq_f32
   return vminvq_f32(a);
-  // CHECK: fminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a) #2
+// CHECK:   ret float [[VMAXNMVQ_F32_I]]
 float32_t test_vmaxnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f32
   return vmaxnmvq_f32(a);
-  // CHECK: fmaxnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a) #2
+// CHECK:   ret float [[VMINNMVQ_F32_I]]
 float32_t test_vminnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminnmvq_f32
   return vminnmvq_f32(a);
-  // CHECK: fminnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
diff --git a/test/CodeGen/aarch64-neon-extract.c b/test/CodeGen/aarch64-neon-extract.c
index cc654cc..c84c861 100644
--- a/test/CodeGen/aarch64-neon-extract.c
+++ b/test/CodeGen/aarch64-neon-extract.c
@@ -1,148 +1,247 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vext_s8
   return vext_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vext_s16
   return vext_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vext_s32
   return vext_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vext_s64
   return vext_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vextq_s8
   return vextq_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vextq_s16
   return vextq_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x i32> [[VEXT]]
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vextq_s32
   return vextq_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vextq_s64
   return vextq_s64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vext_u8
   return vext_u8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vext_u16
   return vext_u16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vext_u32
   return vext_u32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vext_u64
   return vext_u64(a, b, 0);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vextq_u8
   return vextq_u8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vextq_u16
   return vextq_u16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x i32> [[VEXT]]
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vextq_u32
   return vextq_u32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vextq_u64
   return vextq_u64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x float> [[VEXT]]
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vext_f32
   return vext_f32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[VEXT]]
 float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vext_f64
   return vext_f64(a, b, 0);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x float> [[VEXT]]
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vextq_f32
   return vextq_f32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x double> [[VEXT]]
 float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vextq_f64
   return vextq_f64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vext_p8
   return vext_p8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vext_p16
   return vext_p16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vextq_p8
   return vextq_p8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vextq_p16
   return vextq_p16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
diff --git a/test/CodeGen/aarch64-neon-fcvt-intrinsics.c b/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
index d1b9996..f2c238e 100644
--- a/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
@@ -1,133 +1,153 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define float @test_vcvtxd_f32_f64(double %a) #0 {
+// CHECK:   [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2
+// CHECK:   ret float [[VCVTXD_F32_F64_I]]
 float32_t test_vcvtxd_f32_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtxd_f32_f64
-// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
   return (float32_t)vcvtxd_f32_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtas_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTAS_S32_F32_I]]
 int32_t test_vcvtas_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtas_s32_f32
-// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtas_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_test_vcvtad_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTAD_S64_F64_I]]
 int64_t test_test_vcvtad_s64_f64(float64_t a) {
-// CHECK-LABEL: test_test_vcvtad_s64_f64
-// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtad_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtas_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTAS_U32_F32_I]]
 uint32_t test_vcvtas_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtas_u32_f32
-// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtas_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtad_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTAD_U64_F64_I]]
 uint64_t test_vcvtad_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtad_u64_f64
-// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtad_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtms_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTMS_S32_F32_I]]
 int32_t test_vcvtms_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtms_s32_f32
-// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtms_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtmd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTMD_S64_F64_I]]
 int64_t test_vcvtmd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtmd_s64_f64
-// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtmd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtms_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTMS_U32_F32_I]]
 uint32_t test_vcvtms_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtms_u32_f32
-// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtms_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtmd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTMD_U64_F64_I]]
 uint64_t test_vcvtmd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtmd_u64_f64
-// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtmd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtns_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTNS_S32_F32_I]]
 int32_t test_vcvtns_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtns_s32_f32
-// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtns_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtnd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTND_S64_F64_I]]
 int64_t test_vcvtnd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtnd_s64_f64
-// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtnd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtns_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTNS_U32_F32_I]]
 uint32_t test_vcvtns_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtns_u32_f32
-// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtns_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtnd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTND_U64_F64_I]]
 uint64_t test_vcvtnd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtnd_u64_f64
-// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtnd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtps_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTPS_S32_F32_I]]
 int32_t test_vcvtps_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtps_s32_f32
-// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtps_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtpd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTPD_S64_F64_I]]
 int64_t test_vcvtpd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtpd_s64_f64
-// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtpd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtps_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTPS_U32_F32_I]]
 uint32_t test_vcvtps_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtps_u32_f32
-// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtps_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtpd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTPD_U64_F64_I]]
 uint64_t test_vcvtpd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtpd_u64_f64
-// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtpd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptosi float %a to i32
+// CHECK:   ret i32 [[TMP0]]
 int32_t test_vcvts_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_s32_f32
-// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvts_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptosi double %a to i64
+// CHECK:   ret i64 [[TMP0]]
 int64_t test_vcvtd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_s64_f64
-// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptoui float %a to i32
+// CHECK:   ret i32 [[TMP0]]
 uint32_t test_vcvts_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_u32_f32
-// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvts_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptoui double %a to i64
+// CHECK:   ret i64 [[TMP0]]
 uint64_t test_vcvtd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_u64_f64
-// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtd_u64_f64(a);
 }
diff --git a/test/CodeGen/aarch64-neon-fma.c b/test/CodeGen/aarch64-neon-fma.c
index ac80833..6ada533 100644
--- a/test/CodeGen/aarch64-neon-fma.c
+++ b/test/CodeGen/aarch64-neon-fma.c
@@ -1,199 +1,231 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
-  // CHECK-LABEL: test_vmla_n_f32
   return vmla_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_f32
   return vmlaq_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %a, [[MUL_I]]
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vmlaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vmlaq_n_f64
   return vmlaq_n_f64(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_f32
   return vmlsq_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
-  // CHECK-LABEL: test_vmls_n_f32
   return vmls_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %a, [[MUL_I]]
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vmlsq_n_f64
   return vmlsq_n_f64(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_f32_0
   return vmla_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_f32_0
   return vmlaq_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_f32_0
   return vmla_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_f32_0
   return vmlaq_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_f32_0
   return vmls_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_f32_0
   return vmlsq_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_f32_0
   return vmls_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_f32_0
   return vmlsq_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_f32
   return vmla_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_f32
   return vmlaq_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_f32
   return vmla_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_f32
   return vmlaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_f32
   return vmls_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_f32
   return vmlsq_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
+// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_f32
   return vmls_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_f32
   return vmlsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> [[VECINIT1_I]], <2 x double> %a)
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vfmaq_n_f64:
   return vfmaq_n_f64(a, b, c);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> [[VECINIT1_I]], <2 x double> %a) #2
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vfmsq_n_f64:
   return vfmsq_n_f64(a, b, c);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
 }
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index e3ea237..2ffbcdc 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -1,11751 +1,21541 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+// RUN:     -fallow-half-arguments-and-returns -ffp-contract=fast -S -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: @test_vadd_s8(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
-   // CHECK-LABEL: test_vadd_s8
   return vadd_s8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vadd_s16(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
-   // CHECK-LABEL: test_vadd_s16
   return vadd_s16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vadd_s32(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
-   // CHECK-LABEL: test_vadd_s32
   return vadd_s32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vadd_s64(
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) {
-  // CHECK-LABEL: test_vadd_s64
   return vadd_s64(v1, v2);
-  // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vadd_f32(
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vadd_f32
   return vadd_f32(v1, v2);
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vadd_u8(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
-   // CHECK-LABEL: test_vadd_u8
   return vadd_u8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vadd_u16(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
-   // CHECK-LABEL: test_vadd_u16
   return vadd_u16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vadd_u32(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) {
-   // CHECK-LABEL: test_vadd_u32
   return vadd_u32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vadd_u64(
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) {
-   // CHECK-LABEL: test_vadd_u64
   return vadd_u64(v1, v2);
-  // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vaddq_s8(
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
-   // CHECK-LABEL: test_vaddq_s8
   return vaddq_s8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddq_s16(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
-   // CHECK-LABEL: test_vaddq_s16
   return vaddq_s16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
-int32x4_t test_vaddq_s32(int32x4_t v1,int32x4_t  v2) {
-   // CHECK-LABEL: test_vaddq_s32
+// CHECK-LABEL: @test_vaddq_s32(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[ADD_I]]
+int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) {
   return vaddq_s32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddq_s64(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
-   // CHECK-LABEL: test_vaddq_s64
   return vaddq_s64(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddq_f32(
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vaddq_f32
   return vaddq_f32(v1, v2);
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddq_f64(
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vaddq_f64
   return vaddq_f64(v1, v2);
-  // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddq_u8(
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-   // CHECK-LABEL: test_vaddq_u8
   return vaddq_u8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddq_u16(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-   // CHECK-LABEL: test_vaddq_u16
   return vaddq_u16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddq_u32(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-   // CHECK: vaddq_u32
   return vaddq_u32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddq_u64(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) {
-   // CHECK-LABEL: test_vaddq_u64
   return vaddq_u64(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsub_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) {
-   // CHECK-LABEL: test_vsub_s8
   return vsub_s8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vsub_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) {
-   // CHECK-LABEL: test_vsub_s16
   return vsub_s16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vsub_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) {
-   // CHECK-LABEL: test_vsub_s32
   return vsub_s32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsub_s64(
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[SUB_I]]
 int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) {
-   // CHECK-LABEL: test_vsub_s64
   return vsub_s64(v1, v2);
-  // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsub_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vsub_f32
   return vsub_f32(v1, v2);
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsub_u8(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) {
-   // CHECK-LABEL: test_vsub_u8
   return vsub_u8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vsub_u16(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) {
-   // CHECK-LABEL: test_vsub_u16
   return vsub_u16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vsub_u32(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) {
-   // CHECK-LABEL: test_vsub_u32
   return vsub_u32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsub_u64(
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[SUB_I]]
 uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) {
-   // CHECK-LABEL: test_vsub_u64
   return vsub_u64(v1, v2);
-  // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsubq_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) {
-   // CHECK-LABEL: test_vsubq_s8
   return vsubq_s8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsubq_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) {
-   // CHECK-LABEL: test_vsubq_s16
   return vsubq_s16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
-int32x4_t test_vsubq_s32(int32x4_t v1,int32x4_t  v2) {
-   // CHECK-LABEL: test_vsubq_s32
+// CHECK-LABEL: @test_vsubq_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[SUB_I]]
+int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) {
   return vsubq_s32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubq_s64(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) {
-   // CHECK-LABEL: test_vsubq_s64
   return vsubq_s64(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsubq_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vsubq_f32
   return vsubq_f32(v1, v2);
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubq_f64(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vsubq_f64
   return vsubq_f64(v1, v2);
-  // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsubq_u8(
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) {
-   // CHECK-LABEL: test_vsubq_u8
   return vsubq_u8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsubq_u16(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) {
-   // CHECK-LABEL: test_vsubq_u16
   return vsubq_u16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubq_u32(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) {
-   // CHECK: vsubq_u32
   return vsubq_u32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubq_u64(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) {
-   // CHECK-LABEL: test_vsubq_u64
   return vsubq_u64(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmul_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[MUL_I]]
 int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vmul_s8
   return vmul_s8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmul_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vmul_s16
   return vmul_s16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmul_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vmul_s32
   return vmul_s32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmul_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vmul_f32
   return vmul_f32(v1, v2);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
-
+// CHECK-LABEL: @test_vmul_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[MUL_I]]
 uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vmul_u8
   return vmul_u8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmul_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vmul_u16
   return vmul_u16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmul_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vmul_u32
   return vmul_u32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmulq_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[MUL_I]]
 int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vmulq_s8
   return vmulq_s8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmulq_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vmulq_s16
   return vmulq_s16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmulq_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vmulq_s32
   return vmulq_s32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
-    
+
+// CHECK-LABEL: @test_vmulq_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[MUL_I]]
 uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vmulq_u8
   return vmulq_u8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmulq_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vmulq_u16
   return vmulq_u16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmulq_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vmulq_u32
   return vmulq_u32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmulq_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vmulq_f32
   return vmulq_f32(v1, v2);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmulq_f64(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[MUL_I]]
 float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vmulq_f64
   return vmulq_f64(v1, v2);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmul_p8(
+// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VMUL_V_I]]
 poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) {
-  //  test_vmul_p8
   return vmul_p8(v1, v2);
-  //  pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmulq_p8(
+// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
 poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
-  // test_vmulq_p8
   return vmulq_p8(v1, v2);
-  // pmul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
-
+// CHECK-LABEL: @test_vmla_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vmla_s8
   return vmla_s8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmla_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vmla_s16
   return vmla_s16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmla_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vmla_s32
   return vmla_s32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmla_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vmla_f32
   return vmla_f32(v1, v2, v3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmla_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vmla_u8
   return vmla_u8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmla_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vmla_u16
   return vmla_u16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmla_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vmla_u32
   return vmla_u32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlaq_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vmlaq_s8
   return vmlaq_s8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmlaq_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vmlaq_s16
   return vmlaq_s16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmlaq_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vmlaq_s32
   return vmlaq_s32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-} 
+}
 
+// CHECK-LABEL: @test_vmlaq_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vmlaq_f32
   return vmlaq_f32(v1, v2, v3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlaq_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-   // CHECK-LABEL: test_vmlaq_u8
   return vmlaq_u8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmlaq_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vmlaq_u16
   return vmlaq_u16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmlaq_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
-  // CHECK-LABEL: test_vmlaq_u32
   return vmlaq_u32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlaq_f64(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vmlaq_f64
   return vmlaq_f64(v1, v2, v3);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmls_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vmls_s8
   return vmls_s8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmls_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vmls_s16
   return vmls_s16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmls_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vmls_s32
   return vmls_s32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmls_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vmls_f32
   return vmls_f32(v1, v2, v3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmls_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vmls_u8
   return vmls_u8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmls_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vmls_u16
   return vmls_u16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmls_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vmls_u32
   return vmls_u32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+
+// CHECK-LABEL: @test_vmlsq_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vmlsq_s8
   return vmlsq_s8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmlsq_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vmlsq_s16
   return vmlsq_s16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmlsq_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vmlsq_s32
   return vmlsq_s32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlsq_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vmlsq_f32
   return vmlsq_f32(v1, v2, v3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vmlsq_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-  // CHECK-LABEL: test_vmlsq_u8
   return vmlsq_u8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmlsq_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vmlsq_u16
   return vmlsq_u16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmlsq_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
-  // CHECK-LABEL: test_vmlsq_u32
   return vmlsq_u32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlsq_f64(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vmlsq_f64
   return vmlsq_f64(v1, v2, v3);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+
+// CHECK-LABEL: @test_vfma_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1) #4
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vfma_f32
   return vfma_f32(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vfmaq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1) #4
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vfmaq_f32
   return vfmaq_f32(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vfmaq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1) #4
+// CHECK:   ret <2 x double> [[TMP3]]
 float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vfmaq_f64
   return vfmaq_f64(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+
+// CHECK-LABEL: @test_vfms_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v2
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1) #4
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vfms_f32
   return vfms_f32(v1, v2, v3);
-  // CHECK: fmls v0.2s, {{v1.2s, v2.2s|v2.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vfmsq_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v2
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1) #4
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vfmsq_f32
   return vfmsq_f32(v1, v2, v3);
-  // CHECK: fmls v0.4s, {{v1.4s, v2.4s|v2.4s, v1.4s}}
 }
 
+// CHECK-LABEL: @test_vfmsq_f64(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v2
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1) #4
+// CHECK:   ret <2 x double> [[TMP3]]
 float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK: vfmsq_f64
   return vfmsq_f64(v1, v2, v3);
-  // CHECK: fmls v0.2d, {{v1.2d, v2.2d|v2.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vdivq_f64(
+// CHECK:   [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[DIV_I]]
 float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vdivq_f64
   return vdivq_f64(v1, v2);
-  // CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vdivq_f32(
+// CHECK:   [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[DIV_I]]
 float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vdivq_f32
   return vdivq_f32(v1, v2);
-  // CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vdiv_f32(
+// CHECK:   [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[DIV_I]]
 float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vdiv_f32
   return vdiv_f32(v1, v2);
-  // CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaba_s8(
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vaba_s8
   return vaba_s8(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vaba_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v2, <4 x i16> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vaba_s16
   return vaba_s16(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vaba_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v2, <2 x i32> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vaba_s32
   return vaba_s32(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaba_u8(
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vaba_u8
   return vaba_u8(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vaba_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v2, <4 x i16> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vaba_u16
   return vaba_u16(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vaba_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v2, <2 x i32> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vaba_u32
   return vaba_u32(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabaq_s8(
+// CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vabaq_s8
   return vabaq_s8(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vabaq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v2, <8 x i16> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vabaq_s16
   return vabaq_s16(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vabaq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v2, <4 x i32> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vabaq_s32
   return vabaq_s32(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabaq_u8(
+// CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-  // CHECK-LABEL: test_vabaq_u8
   return vabaq_u8(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vabaq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v2, <8 x i16> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vabaq_u16
   return vabaq_u16(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vabaq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v2, <4 x i32> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
-  // CHECK-LABEL: test_vabaq_u32
   return vabaq_u32(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabd_s8(
+// CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VABD_I]]
 int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vabd_s8
   return vabd_s8(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vabd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   ret <4 x i16> [[VABD2_I]]
 int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vabd_s16
   return vabd_s16(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vabd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   ret <2 x i32> [[VABD2_I]]
 int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vabd_s32
   return vabd_s32(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabd_u8(
+// CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VABD_I]]
 uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vabd_u8
   return vabd_u8(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vabd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   ret <4 x i16> [[VABD2_I]]
 uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vabd_u16
   return vabd_u16(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vabd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   ret <2 x i32> [[VABD2_I]]
 uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vabd_u32
   return vabd_u32(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabd_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %v1, <2 x float> %v2) #4
+// CHECK:   ret <2 x float> [[VABD2_I]]
 float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vabd_f32
   return vabd_f32(v1, v2);
-  // CHECK: fabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabdq_s8(
+// CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VABD_I]]
 int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vabdq_s8
   return vabdq_s8(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vabdq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   ret <8 x i16> [[VABD2_I]]
 int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vabdq_s16
   return vabdq_s16(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vabdq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   ret <4 x i32> [[VABD2_I]]
 int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vabdq_s32
   return vabdq_s32(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabdq_u8(
+// CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VABD_I]]
 uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vabdq_u8
   return vabdq_u8(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vabdq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   ret <8 x i16> [[VABD2_I]]
 uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vabdq_u16
   return vabdq_u16(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vabdq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   ret <4 x i32> [[VABD2_I]]
 uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vabdq_u32
   return vabdq_u32(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabdq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %v1, <4 x float> %v2) #4
+// CHECK:   ret <4 x float> [[VABD2_I]]
 float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vabdq_f32
   return vabdq_f32(v1, v2);
-  // CHECK: fabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabdq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %v1, <2 x double> %v2) #4
+// CHECK:   ret <2 x double> [[VABD2_I]]
 float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vabdq_f64
   return vabdq_f64(v1, v2);
-  // CHECK: fabd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vbsl_s8(
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vbsl_s8
   return vbsl_s8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP4]]
 int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vbsl_s16
   return vbsl_s16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> %v1, <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i32> [[VBSL5_I]]
 int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vbsl_s32
   return vbsl_s32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> %v1, <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 uint64x1_t test_vbsl_s64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
-  // CHECK-LABEL: test_vbsl_s64
   return vbsl_s64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_u8(
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vbsl_u8
   return vbsl_u8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vbsl_u16
   return vbsl_u16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> %v1, <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i32> [[VBSL5_I]]
 uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vbsl_u32
   return vbsl_u32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> %v1, <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
-  // CHECK-LABEL: test_vbsl_u64
   return vbsl_u64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[TMP0]], [[VBSL1_I]]
+// CHECK:   [[TMP4:%.*]] = xor <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP5]]
 float32x2_t test_vbsl_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vbsl_f32
   return vbsl_f32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> %v1, [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> %v1, <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP4]]
 float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) {
-  // CHECK-LABEL: test_vbsl_f64
   return vbsl_f64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_p8(
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
-  // CHECK-LABEL: test_vbsl_p8
   return vbsl_p8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbsl_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
-  // CHECK-LABEL: test_vbsl_p16
   return vbsl_p16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vbslq_s8(
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vbslq_s8
   return vbslq_s8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vbslq_s16
   return vbslq_s16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i32> [[VBSL5_I]]
 int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vbslq_s32
   return vbslq_s32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> %v1, <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) {
-  // CHECK-LABEL: test_vbslq_s64
   return vbslq_s64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_u8(
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-  // CHECK-LABEL: test_vbslq_u8
   return vbslq_u8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vbslq_u16
   return vbslq_u16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i32> [[VBSL5_I]]
 int32x4_t test_vbslq_u32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vbslq_u32
   return vbslq_s32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> %v1, <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
-  // CHECK-LABEL: test_vbslq_u64
   return vbslq_u64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> %v1, [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP4]]
 float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vbslq_f32
   return vbslq_f32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_p8(
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) {
-  // CHECK-LABEL: test_vbslq_p8
   return vbslq_p8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) {
-  // CHECK-LABEL: test_vbslq_p16
   return vbslq_p16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vbslq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> %v1, [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> %v1, <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double>
+// CHECK:   ret <2 x double> [[TMP4]]
 float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vbslq_f64
   return vbslq_f64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrecps_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %v1, <2 x float> %v2) #4
+// CHECK:   ret <2 x float> [[VRECPS_V2_I]]
 float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vrecps_f32
-   return vrecps_f32(v1, v2);
-   // CHECK: frecps {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  return vrecps_f32(v1, v2);
 }
 
+// CHECK-LABEL: @test_vrecpsq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %v1, <4 x float> %v2) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VRECPSQ_V2_I]]
 float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vrecpsq_f32
-   return vrecpsq_f32(v1, v2);
-   // CHECK: frecps {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+  return vrecpsq_f32(v1, v2);
 }
 
+// CHECK-LABEL: @test_vrecpsq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %v1, <2 x double> %v2) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x double> [[VRECPSQ_V2_I]]
 float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) {
-   // CHECK-LABEL: test_vrecpsq_f64
   return vrecpsq_f64(v1, v2);
-  // CHECK: frecps {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrsqrts_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v1, <2 x float> %v2) #4
+// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VRSQRTS_V2_I]]
 float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vrsqrts_f32
   return vrsqrts_f32(v1, v2);
-  // CHECK: frsqrts {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrsqrtsq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v1, <4 x float> %v2) #4
+// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VRSQRTSQ_V2_I]]
 float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vrsqrtsq_f32
   return vrsqrtsq_f32(v1, v2);
-  // CHECK: frsqrts {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrsqrtsq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %v1, <2 x double> %v2) #4
+// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x double> [[VRSQRTSQ_V2_I]]
 float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) {
-   // CHECK-LABEL: test_vrsqrtsq_f64
   return vrsqrtsq_f64(v1, v2);
-  // CHECK: frsqrts {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcage_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) #4
+// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
 uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcage_f32
   return vcage_f32(v1, v2);
-  // CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcage_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x i64> [[VCAGE_V2_I]]
 uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcage_f64
   return vcage_f64(a, b);
-  // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcageq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) #4
+// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
 uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcageq_f32
   return vcageq_f32(v1, v2);
-  // CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcageq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) #4
+// CHECK:   ret <2 x i64> [[VCAGEQ_V2_I]]
 uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcageq_f64
   return vcageq_f64(v1, v2);
-  // CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcagt_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) #4
+// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
 uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcagt_f32
   return vcagt_f32(v1, v2);
-  // CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcagt_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x i64> [[VCAGT_V2_I]]
 uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcagt_f64
   return vcagt_f64(a, b);
-  // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcagtq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) #4
+// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
 uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcagtq_f32
   return vcagtq_f32(v1, v2);
-  // CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcagtq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) #4
+// CHECK:   ret <2 x i64> [[VCAGTQ_V2_I]]
 uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcagtq_f64
   return vcagtq_f64(v1, v2);
-  // CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcale_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) #4
+// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
 uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcale_f32
   return vcale_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facge {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vcale_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCALE_V2_I]]
 uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcale_f64
   return vcale_f64(a, b);
-  // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcaleq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) #4
+// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
 uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcaleq_f32
   return vcaleq_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facge {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcaleq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) #4
+// CHECK:   ret <2 x i64> [[VCALEQ_V2_I]]
 uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcaleq_f64
   return vcaleq_f64(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facge {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: @test_vcalt_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) #4
+// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
 uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcalt_f32
   return vcalt_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vcalt_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCALT_V2_I]]
 uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcalt_f64
   return vcalt_f64(a, b);
-  // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcaltq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) #4
+// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
 uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcaltq_f32
   return vcaltq_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcaltq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) #4
+// CHECK:   ret <2 x i64> [[VCALTQ_V2_I]]
 uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcaltq_f64
   return vcaltq_f64(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: @test_vtst_s8(
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) {
-   // CHECK-LABEL: test_vtst_s8
   return vtst_s8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtst_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) {
-   // CHECK-LABEL: test_vtst_s16
   return vtst_s16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtst_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <2 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) {
-   // CHECK-LABEL: test_vtst_s32
   return vtst_s32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vtst_u8(
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) {
-   // CHECK-LABEL: test_vtst_u8
   return vtst_u8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtst_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) {
-   // CHECK-LABEL: test_vtst_u16
   return vtst_u16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtst_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <2 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) {
-   // CHECK-LABEL: test_vtst_u32
   return vtst_u32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vtstq_s8(
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) {
-   // CHECK-LABEL: test_vtstq_s8
   return vtstq_s8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtstq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) {
-   // CHECK-LABEL: test_vtstq_s16
   return vtstq_s16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtstq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) {
-   // CHECK-LABEL: test_vtstq_s32
   return vtstq_s32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtstq_u8(
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) {
-   // CHECK-LABEL: test_vtstq_u8
   return vtstq_u8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtstq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) {
-   // CHECK-LABEL: test_vtstq_u16
   return vtstq_u16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtstq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i32> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) {
-   // CHECK-LABEL: test_vtstq_u32
   return vtstq_u32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtstq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <2 x i64> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VTST_I]]
 uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) {
-   // CHECK-LABEL: test_vtstq_s64
   return vtstq_s64(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vtstq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <2 x i64> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VTST_I]]
 uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) {
-   // CHECK-LABEL: test_vtstq_u64
   return vtstq_u64(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vtst_p8(
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) {
-   // CHECK-LABEL: test_vtst_p8
   return vtst_p8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtst_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) {
-   // CHECK-LABEL: test_vtst_p16
   return vtst_p16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtstq_p8(
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) {
-   // CHECK-LABEL: test_vtstq_p8
   return vtstq_p8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtstq_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i16> %v1, %v2
+// CHECK:   [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) {
-   // CHECK-LABEL: test_vtstq_p16
   return vtstq_p16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtst_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <1 x i64> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VTST_I]]
 uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vtst_s64
   return vtst_s64(a, b);
-  // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vtst_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <1 x i64> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VTST_I]]
 uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vtst_u64
   return vtst_u64(a, b);
-  // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vceq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vceq_s8
   return vceq_s8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vceq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vceq_s16
   return vceq_s16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vceq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vceq_s32
   return vceq_s32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vceq_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vceq_s64
   return vceq_s64(a, b);
-  // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vceq_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vceq_u64
   return vceq_u64(a, b);
-  // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vceq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vceq_f32
   return vceq_f32(v1, v2);
-  // CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vceq_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vceq_f64
   return vceq_f64(a, b);
-  // CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vceq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vceq_u8
   return vceq_u8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vceq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vceq_u16
   return vceq_u16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vceq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vceq_u32
   return vceq_u32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vceq_p8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) {
-  // CHECK-LABEL: test_vceq_p8
   return vceq_p8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vceqq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vceqq_s8
   return vceqq_s8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vceqq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vceqq_s16
   return vceqq_s16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vceqq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vceqq_s32
   return vceqq_s32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vceqq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vceqq_f32
   return vceqq_f32(v1, v2);
-  // CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vceqq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vceqq_u8
   return vceqq_u8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vceqq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vceqq_u16
   return vceqq_u16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vceqq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vceqq_u32
   return vceqq_u32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vceqq_p8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) {
-  // CHECK-LABEL: test_vceqq_p8
   return vceqq_p8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
-
+// CHECK-LABEL: @test_vceqq_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vceqq_s64
   return vceqq_s64(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vceqq_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vceqq_u64
   return vceqq_u64(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vceqq_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vceqq_f64
   return vceqq_f64(v1, v2);
-  // CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+
+// CHECK-LABEL: @test_vcge_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vcge_s8
   return vcge_s8(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcge_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vcge_s16
   return vcge_s16(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vcge_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vcge_s32
   return vcge_s32(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcge_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vcge_s64
   return vcge_s64(a, b);
-  // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcge_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vcge_u64
   return vcge_u64(a, b);
-  // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcge_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
-// CHECK-LABEL: test_vcge_f32
   return vcge_f32(v1, v2);
-// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcge_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcge_f64
   return vcge_f64(a, b);
-  // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcge_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vcge_u8
   return vcge_u8(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcge_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vcge_u16
   return vcge_u16(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vcge_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vcge_u32
   return vcge_u32(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcgeq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vcgeq_s8
   return vcgeq_s8(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcgeq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vcgeq_s16
   return vcgeq_s16(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vcgeq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vcgeq_s32
   return vcgeq_s32(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcgeq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
-// CHECK-LABEL: test_vcgeq_f32
   return vcgeq_f32(v1, v2);
-// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcgeq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vcgeq_u8
   return vcgeq_u8(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcgeq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vcgeq_u16
   return vcgeq_u16(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vcgeq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vcgeq_u32
   return vcgeq_u32(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcgeq_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) {
-// CHECK-LABEL: test_vcgeq_s64
   return vcgeq_s64(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcgeq_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
-// CHECK-LABEL: test_vcgeq_u64
   return vcgeq_u64(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcgeq_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
-// CHECK-LABEL: test_vcgeq_f64
   return vcgeq_f64(v1, v2);
-// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcle_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 // Notes about vcle:
 // LE condition predicate implemented as GE, so check reversed operands.
 // Using registers other than v0, v1 are possible, but would be odd.
 uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vcle_s8
   return vcle_s8(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: @test_vcle_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vcle_s16
   return vcle_s16(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: @test_vcle_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vcle_s32
   return vcle_s32(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vcle_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vcle_s64
   return vcle_s64(a, b);
-  // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcle_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vcle_u64
   return vcle_u64(a, b);
-  // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcle_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcle_f32
   return vcle_f32(v1, v2);
-  // CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vcle_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcle_f64
   return vcle_f64(a, b);
-  // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcle_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vcle_u8
   return vcle_u8(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: @test_vcle_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vcle_u16
   return vcle_u16(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: @test_vcle_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vcle_u32
   return vcle_u32(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vcleq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vcleq_s8
   return vcleq_s8(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: @test_vcleq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vcleq_s16
   return vcleq_s16(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: @test_vcleq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vcleq_s32
   return vcleq_s32(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcleq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcleq_f32
   return vcleq_f32(v1, v2);
-  // CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcleq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vcleq_u8
   return vcleq_u8(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: @test_vcleq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vcleq_u16
   return vcleq_u16(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: @test_vcleq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vcleq_u32
   return vcleq_u32(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcleq_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vcleq_s64
   return vcleq_s64(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: @test_vcleq_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vcleq_u64
   return vcleq_u64(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: @test_vcleq_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcleq_f64
   return vcleq_f64(v1, v2);
-  // CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
-
+// CHECK-LABEL: @test_vcgt_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vcgt_s8
   return vcgt_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcgt_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vcgt_s16
   return vcgt_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vcgt_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vcgt_s32
   return vcgt_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcgt_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vcgt_s64
   return vcgt_s64(a, b);
-  // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcgt_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vcgt_u64
   return vcgt_u64(a, b);
-  // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcgt_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcgt_f32
   return vcgt_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcgt_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcgt_f64
   return vcgt_f64(a, b);
-  // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcgt_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vcgt_u8
   return vcgt_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcgt_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vcgt_u16
   return vcgt_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vcgt_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vcgt_u32
   return vcgt_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcgtq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vcgtq_s8
   return vcgtq_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcgtq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vcgtq_s16
   return vcgtq_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vcgtq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vcgtq_s32
   return vcgtq_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcgtq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcgtq_f32
   return vcgtq_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcgtq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vcgtq_u8
   return vcgtq_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcgtq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vcgtq_u16
   return vcgtq_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vcgtq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vcgtq_u32
   return vcgtq_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcgtq_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vcgtq_s64
   return vcgtq_s64(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcgtq_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vcgtq_u64
   return vcgtq_u64(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcgtq_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcgtq_f64
   return vcgtq_f64(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vclt_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 // Notes about vclt:
 // LT condition predicate implemented as GT, so check reversed operands.
 // Using registers other than v0, v1 are possible, but would be odd.
-
 uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vclt_s8
   return vclt_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: @test_vclt_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vclt_s16
   return vclt_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: @test_vclt_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vclt_s32
   return vclt_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vclt_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vclt_s64
   return vclt_s64(a, b);
-  // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vclt_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vclt_u64
   return vclt_u64(a, b);
-  // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vclt_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vclt_f32
   return vclt_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vclt_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vclt_f64
   return vclt_f64(a, b);
-  // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vclt_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vclt_u8
   return vclt_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: @test_vclt_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vclt_u16
   return vclt_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: @test_vclt_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vclt_u32
   return vclt_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: @test_vcltq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vcltq_s8
   return vcltq_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: @test_vcltq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vcltq_s16
   return vcltq_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: @test_vcltq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vcltq_s32
   return vcltq_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcltq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcltq_f32
   return vcltq_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcltq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vcltq_u8
   return vcltq_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: @test_vcltq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vcltq_u16
   return vcltq_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: @test_vcltq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vcltq_u32
   return vcltq_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: @test_vcltq_s64(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vcltq_s64
   return vcltq_s64(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: @test_vcltq_u64(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vcltq_u64
   return vcltq_u64(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: @test_vcltq_f64(
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcltq_f64
   return vcltq_f64(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
-
+// CHECK-LABEL: @test_vhadd_s8(
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vhadd_s8
   return vhadd_s8(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vhadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHADD_V2_I]]
 int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vhadd_s16
   return vhadd_s16(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vhadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHADD_V2_I]]
 int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vhadd_s32
   return vhadd_s32(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vhadd_u8(
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vhadd_u8
   return vhadd_u8(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vhadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHADD_V2_I]]
 uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vhadd_u16
   return vhadd_u16(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vhadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHADD_V2_I]]
 uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vhadd_u32
   return vhadd_u32(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vhaddq_s8(
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vhaddq_s8
   return vhaddq_s8(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vhaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHADDQ_V2_I]]
 int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vhaddq_s16
   return vhaddq_s16(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vhaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHADDQ_V2_I]]
 int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vhaddq_s32
   return vhaddq_s32(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vhaddq_u8(
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vhaddq_u8
   return vhaddq_u8(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vhaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHADDQ_V2_I]]
 uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vhaddq_u16
   return vhaddq_u16(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vhaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHADDQ_V2_I]]
 uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vhaddq_u32
   return vhaddq_u32(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
-
+// CHECK-LABEL: @test_vhsub_s8(
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vhsub_s8
   return vhsub_s8(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vhsub_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHSUB_V2_I]]
 int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vhsub_s16
   return vhsub_s16(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vhsub_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHSUB_V2_I]]
 int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vhsub_s32
   return vhsub_s32(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vhsub_u8(
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vhsub_u8
   return vhsub_u8(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vhsub_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHSUB_V2_I]]
 uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vhsub_u16
   return vhsub_u16(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vhsub_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHSUB_V2_I]]
 uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vhsub_u32
   return vhsub_u32(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vhsubq_s8(
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_s8
   return vhsubq_s8(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vhsubq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHSUBQ_V2_I]]
 int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_s16
   return vhsubq_s16(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vhsubq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHSUBQ_V2_I]]
 int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_s32
   return vhsubq_s32(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vhsubq_u8(
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_u8
   return vhsubq_u8(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vhsubq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHSUBQ_V2_I]]
 uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_u16
   return vhsubq_u16(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vhsubq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHSUBQ_V2_I]]
 uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_u32
   return vhsubq_u32(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
-
+// CHECK-LABEL: @test_vrhadd_s8(
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_s8
   return vrhadd_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrhadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRHADD_V2_I]]
 int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_s16
   return vrhadd_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vrhadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRHADD_V2_I]]
 int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_s32
   return vrhadd_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrhadd_u8(
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_u8
   return vrhadd_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrhadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRHADD_V2_I]]
 uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_u16
   return vrhadd_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vrhadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRHADD_V2_I]]
 uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_u32
   return vrhadd_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrhaddq_s8(
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_s8
   return vrhaddq_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrhaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRHADDQ_V2_I]]
 int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_s16
   return vrhaddq_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrhaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRHADDQ_V2_I]]
 int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_s32
   return vrhaddq_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrhaddq_u8(
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_u8
   return vrhaddq_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrhaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRHADDQ_V2_I]]
 uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_u16
   return vrhaddq_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrhaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRHADDQ_V2_I]]
 uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_u32
   return vrhaddq_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vqadd_s8(
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqadd_s8
   return vqadd_s8(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQADD_V2_I]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqadd_s16
   return vqadd_s16(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQADD_V2_I]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqadd_s32
   return vqadd_s32(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqadd_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQADD_V2_I]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqadd_s64
   return vqadd_s64(a, b);
-// CHECK:  sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqadd_u8(
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqadd_u8
   return vqadd_u8(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQADD_V2_I]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqadd_u16
   return vqadd_u16(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQADD_V2_I]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqadd_u32
   return vqadd_u32(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqadd_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQADD_V2_I]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK:  test_vqadd_u64
   return vqadd_u64(a, b);
-// CHECK:  uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqaddq_s8(
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqaddq_s8
   return vqaddq_s8(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQADDQ_V2_I]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqaddq_s16
   return vqaddq_s16(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQADDQ_V2_I]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqaddq_s32
   return vqaddq_s32(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqaddq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQADDQ_V2_I]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqaddq_s64
   return vqaddq_s64(a, b);
-// CHECK: sqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqaddq_u8(
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqaddq_u8
   return vqaddq_u8(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQADDQ_V2_I]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqaddq_u16
   return vqaddq_u16(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQADDQ_V2_I]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqaddq_u32
   return vqaddq_u32(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqaddq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQADDQ_V2_I]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqaddq_u64
   return vqaddq_u64(a, b);
-// CHECK: uqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vqsub_s8(
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqsub_s8
   return vqsub_s8(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqsub_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSUB_V2_I]]
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqsub_s16
   return vqsub_s16(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqsub_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSUB_V2_I]]
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqsub_s32
   return vqsub_s32(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqsub_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSUB_V2_I]]
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqsub_s64
   return vqsub_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqsub_u8(
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqsub_u8
   return vqsub_u8(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqsub_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSUB_V2_I]]
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqsub_u16
   return vqsub_u16(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqsub_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSUB_V2_I]]
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqsub_u32
   return vqsub_u32(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqsub_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSUB_V2_I]]
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vqsub_u64
   return vqsub_u64(a, b);
-// CHECK:  uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqsubq_s8(
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqsubq_s8
   return vqsubq_s8(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqsubq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSUBQ_V2_I]]
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqsubq_s16
   return vqsubq_s16(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqsubq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSUBQ_V2_I]]
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqsubq_s32
   return vqsubq_s32(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqsubq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSUBQ_V2_I]]
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqsubq_s64
   return vqsubq_s64(a, b);
-// CHECK: sqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqsubq_u8(
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqsubq_u8
   return vqsubq_u8(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqsubq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSUBQ_V2_I]]
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqsubq_u16
   return vqsubq_u16(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqsubq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSUBQ_V2_I]]
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqsubq_u32
   return vqsubq_u32(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqsubq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSUBQ_V2_I]]
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqsubq_u64
   return vqsubq_u64(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vshl_s8(
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_s8
   return vshl_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VSHL_V2_I]]
 int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_s16
   return vshl_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VSHL_V2_I]]
 int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_s32
   return vshl_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VSHL_V2_I]]
 int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_s64
   return vshl_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vshl_u8(
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_u8
   return vshl_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VSHL_V2_I]]
 uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_u16
   return vshl_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VSHL_V2_I]]
 uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_u32
   return vshl_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VSHL_V2_I]]
 uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_u64
   return vshl_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vshlq_s8(
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_s8
   return vshlq_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VSHLQ_V2_I]]
 int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_s16
   return vshlq_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VSHLQ_V2_I]]
 int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_s32
   return vshlq_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VSHLQ_V2_I]]
 int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_s64
   return vshlq_s64(a, b);
-// CHECK: sshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vshlq_u8(
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_u8
   return vshlq_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VSHLQ_V2_I]]
 uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_u16
   return vshlq_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VSHLQ_V2_I]]
 uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_u32
   return vshlq_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VSHLQ_V2_I]]
 uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_u64
   return vshlq_u64(a, b);
-// CHECK: ushl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vqshl_s8(
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_s8
   return vqshl_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSHL_V2_I]]
 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_s16
   return vqshl_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSHL_V2_I]]
 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_s32
   return vqshl_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSHL_V2_I]]
 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_s64
   return vqshl_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqshl_u8(
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_u8
   return vqshl_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSHL_V2_I]]
 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_u16
   return vqshl_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSHL_V2_I]]
 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_u32
   return vqshl_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSHL_V2_I]]
 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_u64
   return vqshl_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqshlq_s8(
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_s8
   return vqshlq_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSHLQ_V2_I]]
 int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_s16
   return vqshlq_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSHLQ_V2_I]]
 int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_s32
   return vqshlq_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSHLQ_V2_I]]
 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_s64
   return vqshlq_s64(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqshlq_u8(
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_u8
   return vqshlq_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSHLQ_V2_I]]
 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_u16
   return vqshlq_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSHLQ_V2_I]]
 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_u32
   return vqshlq_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSHLQ_V2_I]]
 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_u64
   return vqshlq_u64(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrshl_s8(
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_s8
   return vrshl_s8(a, b);
-// CHECK: srshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSHL_V2_I]]
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_s16
   return vrshl_s16(a, b);
-// CHECK: srshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vrshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSHL_V2_I]]
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_s32
   return vrshl_s32(a, b);
-// CHECK: srshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VRSHL_V2_I]]
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrshl_s64
   return vrshl_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrshl_u8(
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_u8
   return vrshl_u8(a, b);
-// CHECK: urshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSHL_V2_I]]
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_u16
   return vrshl_u16(a, b);
-// CHECK: urshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vrshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSHL_V2_I]]
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_u32
   return vrshl_u32(a, b);
-// CHECK: urshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VRSHL_V2_I]]
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrshl_u64
   return vrshl_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrshlq_s8(
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vrshlq_s8
   return vrshlq_s8(a, b);
-// CHECK: srshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRSHLQ_V2_I]]
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vrshlq_s16
   return vrshlq_s16(a, b);
-// CHECK: srshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRSHLQ_V2_I]]
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vrshlq_s32
   return vrshlq_s32(a, b);
-// CHECK: srshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VRSHLQ_V2_I]]
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vrshlq_s64
   return vrshlq_s64(a, b);
-// CHECK: srshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrshlq_u8(
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vrshlq_u8
   return vrshlq_u8(a, b);
-// CHECK: urshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRSHLQ_V2_I]]
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vrshlq_u16
   return vrshlq_u16(a, b);
-// CHECK: urshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRSHLQ_V2_I]]
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vrshlq_u32
   return vrshlq_u32(a, b);
-// CHECK: urshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VRSHLQ_V2_I]]
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vrshlq_u64
   return vrshlq_u64(a, b);
-// CHECK: urshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vqrshl_s8(
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqrshl_s8
   return vqrshl_s8(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqrshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRSHL_V2_I]]
 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqrshl_s16
   return vqrshl_s16(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqrshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRSHL_V2_I]]
 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqrshl_s32
   return vqrshl_s32(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqrshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQRSHL_V2_I]]
 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqrshl_s64
   return vqrshl_s64(a, b);
-// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqrshl_u8(
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqrshl_u8
   return vqrshl_u8(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqrshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRSHL_V2_I]]
 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqrshl_u16
   return vqrshl_u16(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqrshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRSHL_V2_I]]
 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqrshl_u32
   return vqrshl_u32(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqrshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQRSHL_V2_I]]
 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqrshl_u64
   return vqrshl_u64(a, b);
-// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqrshlq_s8(
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqrshlq_s8
   return vqrshlq_s8(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqrshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRSHLQ_V2_I]]
 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqrshlq_s16
   return vqrshlq_s16(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqrshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRSHLQ_V2_I]]
 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqrshlq_s32
   return vqrshlq_s32(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqrshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQRSHLQ_V2_I]]
 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqrshlq_s64
   return vqrshlq_s64(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vqrshlq_u8
+// CHECK-LABEL: @test_vqrshlq_u8(
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqrshlq_u8(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqrshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRSHLQ_V2_I]]
 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqrshlq_u16
   return vqrshlq_u16(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqrshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRSHLQ_V2_I]]
 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqrshlq_u32
   return vqrshlq_u32(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqrshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQRSHLQ_V2_I]]
 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqrshlq_u64
   return vqrshlq_u64(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsli_n_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
-// CHECK-LABEL: test_vsli_n_p64
-  return vsli_n_p64(a, b, 0); 
-// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0
+  return vsli_n_p64(a, b, 0);
 }
 
+// CHECK-LABEL: @test_vsliq_n_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) {
-// CHECK-LABEL: test_vsliq_n_p64
-  return vsliq_n_p64(a, b, 0); 
-// CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+  return vsliq_n_p64(a, b, 0);
 }
 
+// CHECK-LABEL: @test_vmax_s8(
+// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_I]]
 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vmax_s8
   return vmax_s8(a, b);
-// CHECK: smax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmax_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VMAX2_I]]
 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vmax_s16
   return vmax_s16(a, b);
-// CHECK: smax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmax_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VMAX2_I]]
 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vmax_s32
   return vmax_s32(a, b);
-// CHECK: smax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmax_u8(
+// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_I]]
 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vmax_u8
   return vmax_u8(a, b);
-// CHECK: umax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmax_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VMAX2_I]]
 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vmax_u16
   return vmax_u16(a, b);
-// CHECK: umax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmax_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VMAX2_I]]
 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vmax_u32
   return vmax_u32(a, b);
-// CHECK: umax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmax_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VMAX2_I]]
 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmax_f32
   return vmax_f32(a, b);
-// CHECK: fmax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmaxq_s8(
+// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAX_I]]
 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vmaxq_s8
   return vmaxq_s8(a, b);
-// CHECK: smax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmaxq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VMAX2_I]]
 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vmaxq_s16
   return vmaxq_s16(a, b);
-// CHECK: smax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmaxq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VMAX2_I]]
 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vmaxq_s32
   return vmaxq_s32(a, b);
-// CHECK: smax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmaxq_u8(
+// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAX_I]]
 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vmaxq_u8
   return vmaxq_u8(a, b);
-// CHECK: umax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmaxq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VMAX2_I]]
 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vmaxq_u16
   return vmaxq_u16(a, b);
-// CHECK: umax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmaxq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VMAX2_I]]
 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vmaxq_u32
   return vmaxq_u32(a, b);
-// CHECK: umax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmaxq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VMAX2_I]]
 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmaxq_f32
   return vmaxq_f32(a, b);
-// CHECK: fmax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmaxq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VMAX2_I]]
 float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmaxq_f64
   return vmaxq_f64(a, b);
-// CHECK: fmax {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-
+// CHECK-LABEL: @test_vmin_s8(
+// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_I]]
 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vmin_s8
   return vmin_s8(a, b);
-// CHECK: smin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmin_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VMIN2_I]]
 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vmin_s16
   return vmin_s16(a, b);
-// CHECK: smin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmin_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VMIN2_I]]
 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vmin_s32
   return vmin_s32(a, b);
-// CHECK: smin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmin_u8(
+// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_I]]
 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vmin_u8
   return vmin_u8(a, b);
-// CHECK: umin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmin_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VMIN2_I]]
 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vmin_u16
   return vmin_u16(a, b);
-// CHECK: umin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vmin_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VMIN2_I]]
 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vmin_u32
   return vmin_u32(a, b);
-// CHECK: umin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmin_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VMIN2_I]]
 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmin_f32
   return vmin_f32(a, b);
-// CHECK: fmin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vminq_s8(
+// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMIN_I]]
 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vminq_s8
   return vminq_s8(a, b);
-// CHECK: smin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vminq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VMIN2_I]]
 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vminq_s16
   return vminq_s16(a, b);
-// CHECK: smin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vminq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VMIN2_I]]
 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vminq_s32
   return vminq_s32(a, b);
-// CHECK: smin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vminq_u8(
+// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMIN_I]]
 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vminq_u8
   return vminq_u8(a, b);
-// CHECK: umin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vminq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VMIN2_I]]
 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vminq_u16
   return vminq_u16(a, b);
-// CHECK: umin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vminq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VMIN2_I]]
 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vminq_u32
   return vminq_u32(a, b);
-// CHECK: umin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vminq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VMIN2_I]]
 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vminq_f32
   return vminq_f32(a, b);
-// CHECK: fmin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vminq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VMIN2_I]]
 float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vminq_f64
   return vminq_f64(a, b);
-// CHECK: fmin {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmaxnm_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VMAXNM2_I]]
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmaxnm_f32
   return vmaxnm_f32(a, b);
-// CHECK: fmaxnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmaxnmq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VMAXNM2_I]]
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmaxnmq_f32
   return vmaxnmq_f32(a, b);
-// CHECK: fmaxnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmaxnmq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VMAXNM2_I]]
 float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmaxnmq_f64
   return vmaxnmq_f64(a, b);
-// CHECK: fmaxnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vminnm_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VMINNM2_I]]
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vminnm_f32
   return vminnm_f32(a, b);
-// CHECK: fminnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vminnmq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VMINNM2_I]]
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vminnmq_f32
   return vminnmq_f32(a, b);
-// CHECK: fminnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vminnmq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VMINNM2_I]]
 float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vminnmq_f64
   return vminnmq_f64(a, b);
-// CHECK: fminnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpmax_s8(
+// CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_I]]
 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vpmax_s8
   return vpmax_s8(a, b);
-// CHECK: smaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpmax_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VPMAX2_I]]
 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vpmax_s16
   return vpmax_s16(a, b);
-// CHECK: smaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpmax_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VPMAX2_I]]
 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vpmax_s32
   return vpmax_s32(a, b);
-// CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpmax_u8(
+// CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_I]]
 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vpmax_u8
   return vpmax_u8(a, b);
-// CHECK: umaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpmax_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VPMAX2_I]]
 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vpmax_u16
   return vpmax_u16(a, b);
-// CHECK: umaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpmax_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VPMAX2_I]]
 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vpmax_u32
   return vpmax_u32(a, b);
-// CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpmax_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VPMAX2_I]]
 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpmax_f32
   return vpmax_f32(a, b);
-// CHECK: fmaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpmaxq_s8(
+// CHECK:   [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMAX_I]]
 int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vpmaxq_s8
   return vpmaxq_s8(a, b);
-// CHECK: smaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpmaxq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VPMAX2_I]]
 int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vpmaxq_s16
   return vpmaxq_s16(a, b);
-// CHECK: smaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpmaxq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VPMAX2_I]]
 int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vpmaxq_s32
   return vpmaxq_s32(a, b);
-// CHECK: smaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpmaxq_u8(
+// CHECK:   [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMAX_I]]
 uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vpmaxq_u8
   return vpmaxq_u8(a, b);
-// CHECK: umaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpmaxq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VPMAX2_I]]
 uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vpmaxq_u16
   return vpmaxq_u16(a, b);
-// CHECK: umaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpmaxq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VPMAX2_I]]
 uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vpmaxq_u32
   return vpmaxq_u32(a, b);
-// CHECK: umaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpmaxq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VPMAX2_I]]
 float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpmaxq_f32
   return vpmaxq_f32(a, b);
-// CHECK: fmaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpmaxq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VPMAX2_I]]
 float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpmaxq_f64
   return vpmaxq_f64(a, b);
-// CHECK: fmaxp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpmin_s8(
+// CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_I]]
 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vpmin_s8
   return vpmin_s8(a, b);
-// CHECK: sminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpmin_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VPMIN2_I]]
 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vpmin_s16
   return vpmin_s16(a, b);
-// CHECK: sminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpmin_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VPMIN2_I]]
 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vpmin_s32
   return vpmin_s32(a, b);
-// CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpmin_u8(
+// CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_I]]
 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vpmin_u8
   return vpmin_u8(a, b);
-// CHECK: uminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpmin_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VPMIN2_I]]
 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vpmin_u16
   return vpmin_u16(a, b);
-// CHECK: uminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpmin_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VPMIN2_I]]
 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vpmin_u32
   return vpmin_u32(a, b);
-// CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpmin_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VPMIN2_I]]
 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpmin_f32
   return vpmin_f32(a, b);
-// CHECK: fminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpminq_s8(
+// CHECK:   [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMIN_I]]
 int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vpminq_s8
   return vpminq_s8(a, b);
-// CHECK: sminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpminq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VPMIN2_I]]
 int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vpminq_s16
   return vpminq_s16(a, b);
-// CHECK: sminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpminq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VPMIN2_I]]
 int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vpminq_s32
   return vpminq_s32(a, b);
-// CHECK: sminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpminq_u8(
+// CHECK:   [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMIN_I]]
 uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vpminq_u8
   return vpminq_u8(a, b);
-// CHECK: uminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpminq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VPMIN2_I]]
 uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vpminq_u16
   return vpminq_u16(a, b);
-// CHECK: uminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpminq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VPMIN2_I]]
 uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vpminq_u32
   return vpminq_u32(a, b);
-// CHECK: uminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpminq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VPMIN2_I]]
 float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpminq_f32
   return vpminq_f32(a, b);
-// CHECK: fminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpminq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VPMIN2_I]]
 float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpminq_f64
   return vpminq_f64(a, b);
-// CHECK: fminp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpmaxnm_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VPMAXNM2_I]]
 float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpmaxnm_f32
   return vpmaxnm_f32(a, b);
-// CHECK: fmaxnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpmaxnmq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VPMAXNM2_I]]
 float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpmaxnmq_f32
   return vpmaxnmq_f32(a, b);
-// CHECK: fmaxnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpmaxnmq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VPMAXNM2_I]]
 float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpmaxnmq_f64
   return vpmaxnmq_f64(a, b);
-// CHECK: fmaxnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpminnm_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VPMINNM2_I]]
 float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpminnm_f32
   return vpminnm_f32(a, b);
-// CHECK: fminnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpminnmq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VPMINNM2_I]]
 float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpminnmq_f32
   return vpminnmq_f32(a, b);
-// CHECK: fminnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpminnmq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VPMINNM2_I]]
 float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpminnmq_f64
   return vpminnmq_f64(a, b);
-// CHECK: fminnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpadd_s8(
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vpadd_s8
   return vpadd_s8(a, b);
-// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPADD_V2_I]]
 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vpadd_s16
   return vpadd_s16(a, b);
-// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPADD_V2_I]]
 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vpadd_s32
   return vpadd_s32(a, b);
-// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpadd_u8(
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vpadd_u8
   return vpadd_u8(a, b);
-// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPADD_V2_I]]
 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vpadd_u16
   return vpadd_u16(a, b);
-// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPADD_V2_I]]
 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vpadd_u32
   return vpadd_u32(a, b);
-// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpadd_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VPADD_V2_I]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpadd_f32
   return vpadd_f32(a, b);
-// CHECK: faddp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpaddq_s8(
+// CHECK:   [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPADDQ_V_I]]
 int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vpaddq_s8
   return vpaddq_s8(a, b);
-// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VPADDQ_V2_I]]
 int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vpaddq_s16
   return vpaddq_s16(a, b);
-// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VPADDQ_V2_I]]
 int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vpaddq_s32
   return vpaddq_s32(a, b);
-// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpaddq_u8(
+// CHECK:   [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPADDQ_V_I]]
 uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vpaddq_u8
   return vpaddq_u8(a, b);
-// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VPADDQ_V2_I]]
 uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vpaddq_u16
   return vpaddq_u16(a, b);
-// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VPADDQ_V2_I]]
 uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vpaddq_u32
   return vpaddq_u32(a, b);
-// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpaddq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VPADDQ_V2_I]]
 float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpaddq_f32
   return vpaddq_f32(a, b);
-// CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpaddq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x double> [[VPADDQ_V2_I]]
 float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpaddq_f64
   return vpaddq_f64(a, b);
-// CHECK: faddp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqdmulh_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqdmulh_s16
   return vqdmulh_s16(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmulh_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqdmulh_s32
   return vqdmulh_s32(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmulhq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqdmulhq_s16
   return vqdmulhq_s16(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqdmulhq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqdmulhq_s32
   return vqdmulhq_s32(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqrdmulh_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqrdmulh_s16
   return vqrdmulh_s16(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqrdmulh_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqrdmulh_s32
   return vqrdmulh_s32(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqrdmulhq_s16
   return vqrdmulhq_s16(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqrdmulhq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqrdmulhq_s32
   return vqrdmulhq_s32(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmulx_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmulx_f32
   return vmulx_f32(a, b);
-// CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmulxq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmulxq_f32
   return vmulxq_f32(a, b);
-// CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmulxq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %b) #4
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmulxq_f64
   return vmulxq_f64(a, b);
-// CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vshl_n_s8(
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 int8x8_t test_vshl_n_s8(int8x8_t a) {
-// CHECK-LABEL: test_vshl_n_s8
   return vshl_n_s8(a, 3);
-// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vshl_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 int16x4_t test_vshl_n_s16(int16x4_t a) {
-// CHECK-LABEL: test_vshl_n_s16
   return vshl_n_s16(a, 3);
-// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vshl_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 int32x2_t test_vshl_n_s32(int32x2_t a) {
-// CHECK-LABEL: test_vshl_n_s32
   return vshl_n_s32(a, 3);
-// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_s8(
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 int8x16_t test_vshlq_n_s8(int8x16_t a) {
-// CHECK-LABEL: test_vshlq_n_s8
   return vshlq_n_s8(a, 3);
-// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 int16x8_t test_vshlq_n_s16(int16x8_t a) {
-// CHECK-LABEL: test_vshlq_n_s16
   return vshlq_n_s16(a, 3);
-// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 int32x4_t test_vshlq_n_s32(int32x4_t a) {
-// CHECK-LABEL: test_vshlq_n_s32
   return vshlq_n_s32(a, 3);
-// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 int64x2_t test_vshlq_n_s64(int64x2_t a) {
-// CHECK-LABEL: test_vshlq_n_s64
   return vshlq_n_s64(a, 3);
-// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vshl_n_u8(
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 int8x8_t test_vshl_n_u8(int8x8_t a) {
-// CHECK-LABEL: test_vshl_n_u8
   return vshl_n_u8(a, 3);
-// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vshl_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 int16x4_t test_vshl_n_u16(int16x4_t a) {
-// CHECK-LABEL: test_vshl_n_u16
   return vshl_n_u16(a, 3);
-// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vshl_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 int32x2_t test_vshl_n_u32(int32x2_t a) {
-// CHECK-LABEL: test_vshl_n_u32
   return vshl_n_u32(a, 3);
-// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_u8(
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 int8x16_t test_vshlq_n_u8(int8x16_t a) {
-// CHECK-LABEL: test_vshlq_n_u8
   return vshlq_n_u8(a, 3);
-// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 int16x8_t test_vshlq_n_u16(int16x8_t a) {
-// CHECK-LABEL: test_vshlq_n_u16
   return vshlq_n_u16(a, 3);
-// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 int32x4_t test_vshlq_n_u32(int32x4_t a) {
-// CHECK-LABEL: test_vshlq_n_u32
   return vshlq_n_u32(a, 3);
-// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vshlq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 int64x2_t test_vshlq_n_u64(int64x2_t a) {
-// CHECK-LABEL: test_vshlq_n_u64
   return vshlq_n_u64(a, 3);
-// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vshr_n_s8(
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 int8x8_t test_vshr_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vshr_n_s8
   return vshr_n_s8(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vshr_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 int16x4_t test_vshr_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vshr_n_s16
   return vshr_n_s16(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vshr_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 int32x2_t test_vshr_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vshr_n_s32
   return vshr_n_s32(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_s8(
+// CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 int8x16_t test_vshrq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vshrq_n_s8
   return vshrq_n_s8(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 int16x8_t test_vshrq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshrq_n_s16
   return vshrq_n_s16(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 int32x4_t test_vshrq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshrq_n_s32
   return vshrq_n_s32(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 int64x2_t test_vshrq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vshrq_n_s64
   return vshrq_n_s64(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vshr_n_u8(
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 int8x8_t test_vshr_n_u8(int8x8_t a) {
-  // CHECK-LABEL: test_vshr_n_u8
   return vshr_n_u8(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vshr_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 int16x4_t test_vshr_n_u16(int16x4_t a) {
-  // CHECK-LABEL: test_vshr_n_u16
   return vshr_n_u16(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vshr_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 int32x2_t test_vshr_n_u32(int32x2_t a) {
-  // CHECK-LABEL: test_vshr_n_u32
   return vshr_n_u32(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_u8(
+// CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 int8x16_t test_vshrq_n_u8(int8x16_t a) {
-  // CHECK-LABEL: test_vshrq_n_u8
   return vshrq_n_u8(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 int16x8_t test_vshrq_n_u16(int16x8_t a) {
-  // CHECK-LABEL: test_vshrq_n_u16
   return vshrq_n_u16(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 int32x4_t test_vshrq_n_u32(int32x4_t a) {
-  // CHECK-LABEL: test_vshrq_n_u32
   return vshrq_n_u32(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vshrq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 int64x2_t test_vshrq_n_u64(int64x2_t a) {
-  // CHECK-LABEL: test_vshrq_n_u64
   return vshrq_n_u64(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsra_n_s8(
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsra_n_s8
   return vsra_n_s8(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsra_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsra_n_s16
   return vsra_n_s16(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vsra_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsra_n_s32
   return vsra_n_s32(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_s8(
+// CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsraq_n_s8
   return vsraq_n_s8(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsraq_n_s16
   return vsraq_n_s16(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsraq_n_s32
   return vsraq_n_s32(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 3, i64 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsraq_n_s64
   return vsraq_n_s64(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsra_n_u8(
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vsra_n_u8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsra_n_u8
   return vsra_n_u8(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsra_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 int16x4_t test_vsra_n_u16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsra_n_u16
   return vsra_n_u16(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vsra_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 int32x2_t test_vsra_n_u32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsra_n_u32
   return vsra_n_u32(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_u8(
+// CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vsraq_n_u8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsraq_n_u8
   return vsraq_n_u8(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 int16x8_t test_vsraq_n_u16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsraq_n_u16
   return vsraq_n_u16(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 int32x4_t test_vsraq_n_u32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsraq_n_u32
   return vsraq_n_u32(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vsraq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 3, i64 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 int64x2_t test_vsraq_n_u64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsraq_n_u64
   return vsraq_n_u64(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vrshr_n_s8(
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 int8x8_t test_vrshr_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrshr_n_s8
   return vrshr_n_s8(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vrshr_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 int16x4_t test_vrshr_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vrshr_n_s16
   return vrshr_n_s16(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vrshr_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 int32x2_t test_vrshr_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vrshr_n_s32
   return vrshr_n_s32(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_s8(
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s8
   return vrshrq_n_s8(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s16
   return vrshrq_n_s16(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s32
   return vrshrq_n_s32(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s64
   return vrshrq_n_s64(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vrshr_n_u8(
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 int8x8_t test_vrshr_n_u8(int8x8_t a) {
-  // CHECK-LABEL: test_vrshr_n_u8
   return vrshr_n_u8(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vrshr_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 int16x4_t test_vrshr_n_u16(int16x4_t a) {
-  // CHECK-LABEL: test_vrshr_n_u16
   return vrshr_n_u16(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vrshr_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 int32x2_t test_vrshr_n_u32(int32x2_t a) {
-  // CHECK-LABEL: test_vrshr_n_u32
   return vrshr_n_u32(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_u8(
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 int8x16_t test_vrshrq_n_u8(int8x16_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u8
   return vrshrq_n_u8(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 int16x8_t test_vrshrq_n_u16(int16x8_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u16
   return vrshrq_n_u16(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 int32x4_t test_vrshrq_n_u32(int32x4_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u32
   return vrshrq_n_u32(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vrshrq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 int64x2_t test_vrshrq_n_u64(int64x2_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u64
   return vrshrq_n_u64(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vrsra_n_s8(
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vrsra_n_s8
   return vrsra_n_s8(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vrsra_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i16> [[TMP3]]
 int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vrsra_n_s16
   return vrsra_n_s16(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vrsra_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vrsra_n_s32
   return vrsra_n_s32(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_s8(
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s8
   return vrsraq_n_s8(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <8 x i16> [[TMP3]]
 int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s16
   return vrsraq_n_s16(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s32
   return vrsraq_n_s32(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s64
   return vrsraq_n_s64(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vrsra_n_u8(
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vrsra_n_u8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vrsra_n_u8
   return vrsra_n_u8(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vrsra_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i16> [[TMP3]]
 int16x4_t test_vrsra_n_u16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vrsra_n_u16
   return vrsra_n_u16(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vrsra_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vrsra_n_u32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vrsra_n_u32
   return vrsra_n_u32(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_u8(
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vrsraq_n_u8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u8
   return vrsraq_n_u8(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <8 x i16> [[TMP3]]
 int16x8_t test_vrsraq_n_u16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u16
   return vrsraq_n_u16(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vrsraq_n_u32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u32
   return vrsraq_n_u32(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vrsraq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vrsraq_n_u64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u64
   return vrsraq_n_u64(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_s8(
+// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSRI_N]]
 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsri_n_s8
   return vsri_n_s8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSRI_N2]]
 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsri_n_s16
   return vsri_n_s16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSRI_N2]]
 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsri_n_s32
   return vsri_n_s32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_s8(
+// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSRI_N]]
 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsriq_n_s8
   return vsriq_n_s8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSRI_N2]]
 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsriq_n_s16
   return vsriq_n_s16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSRI_N2]]
 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsriq_n_s32
   return vsriq_n_s32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSRI_N2]]
 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsriq_n_s64
   return vsriq_n_s64(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_u8(
+// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSRI_N]]
 int8x8_t test_vsri_n_u8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsri_n_u8
   return vsri_n_u8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSRI_N2]]
 int16x4_t test_vsri_n_u16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsri_n_u16
   return vsri_n_u16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSRI_N2]]
 int32x2_t test_vsri_n_u32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsri_n_u32
   return vsri_n_u32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_u8(
+// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSRI_N]]
 int8x16_t test_vsriq_n_u8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsriq_n_u8
   return vsriq_n_u8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSRI_N2]]
 int16x8_t test_vsriq_n_u16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsriq_n_u16
   return vsriq_n_u16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSRI_N2]]
 int32x4_t test_vsriq_n_u32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsriq_n_u32
   return vsriq_n_u32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSRI_N2]]
 int64x2_t test_vsriq_n_u64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsriq_n_u64
   return vsriq_n_u64(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_p8(
+// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSRI_N]]
 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vsri_n_p8
   return vsri_n_p8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsri_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15)
+// CHECK:   ret <4 x i16> [[VSRI_N2]]
 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vsri_n_p16
   return vsri_n_p16(a, b, 15);
-  // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
 }
 
+// CHECK-LABEL: @test_vsriq_n_p8(
+// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSRI_N]]
 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vsriq_n_p8
   return vsriq_n_p8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsriq_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15)
+// CHECK:   ret <8 x i16> [[VSRI_N2]]
 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vsriq_n_p16
   return vsriq_n_p16(a, b, 15);
-  // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
 }
 
+// CHECK-LABEL: @test_vsli_n_s8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsli_n_s8
   return vsli_n_s8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsli_n_s16
   return vsli_n_s16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsli_n_s32
   return vsli_n_s32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_s8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsliq_n_s8
   return vsliq_n_s8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsliq_n_s16
   return vsliq_n_s16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsliq_n_s32
   return vsliq_n_s32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsliq_n_s64
   return vsliq_n_s64(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_u8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vsli_n_u8
   return vsli_n_u8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vsli_n_u16
   return vsli_n_u16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vsli_n_u32
   return vsli_n_u32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_u8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsliq_n_u8
   return vsliq_n_u8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsliq_n_u16
   return vsliq_n_u16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsliq_n_u32
   return vsliq_n_u32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsliq_n_u64
   return vsliq_n_u64(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_p8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vsli_n_p8
   return vsli_n_p8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vsli_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vsli_n_p16
   return vsli_n_p16(a, b, 15);
-  // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
 }
 
+// CHECK-LABEL: @test_vsliq_n_p8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vsliq_n_p8
   return vsliq_n_p8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vsliq_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vsliq_n_p16
   return vsliq_n_p16(a, b, 15);
-  // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
 }
 
+// CHECK-LABEL: @test_vqshlu_n_s8(
+// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+// CHECK:   ret <8 x i8> [[VQSHLU_N]]
 int8x8_t test_vqshlu_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqshlu_n_s8
   return vqshlu_n_s8(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vqshlu_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
 int16x4_t test_vqshlu_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqshlu_n_s16
   return vqshlu_n_s16(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: @test_vqshlu_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 3, i32 3>)
+// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
 int32x2_t test_vqshlu_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqshlu_n_s32
   return vqshlu_n_s32(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: @test_vqshluq_n_s8(
+// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+// CHECK:   ret <16 x i8> [[VQSHLU_N]]
 int8x16_t test_vqshluq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s8
   return vqshluq_n_s8(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vqshluq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
 int16x8_t test_vqshluq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s16
   return vqshluq_n_s16(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshluq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
 int32x4_t test_vqshluq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s32
   return vqshluq_n_s32(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: @test_vqshluq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 3, i64 3>)
+// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
 int64x2_t test_vqshluq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s64
   return vqshluq_n_s64(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: @test_vshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 int8x8_t test_vshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshrn_n_s16
   return vshrn_n_s16(a, 3);
-  // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 int16x4_t test_vshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshrn_n_s32
   return vshrn_n_s32(a, 9);
-  // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 int32x2_t test_vshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vshrn_n_s64
   return vshrn_n_s64(a, 19);
-  // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vshrn_n_u16
   return vshrn_n_u16(a, 3);
-  // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vshrn_n_u32
   return vshrn_n_u32(a, 9);
-  // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vshrn_n_u64
   return vshrn_n_u64(a, 19);
-  // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vshrn_high_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_s16
   return vshrn_high_n_s16(a, b, 3);
-  // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshrn_high_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_s32
   return vshrn_high_n_s32(a, b, 9);
-  // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vshrn_high_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_s64
   return vshrn_high_n_s64(a, b, 19);
-  // CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vshrn_high_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_u16
   return vshrn_high_n_u16(a, b, 3);
-  // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vshrn_high_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_u32
   return vshrn_high_n_u32(a, b, 9);
-  // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vshrn_high_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_u64
   return vshrn_high_n_u64(a, b, 19);
-  // CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqshrun_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
 int8x8_t test_vqshrun_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshrun_n_s16
   return vqshrun_n_s16(a, 3);
-  // CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshrun_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
 int16x4_t test_vqshrun_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshrun_n_s32
   return vqshrun_n_s32(a, 9);
-  // CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqshrun_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
 int32x2_t test_vqshrun_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshrun_n_s64
   return vqshrun_n_s64(a, 19);
-  // CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqshrun_high_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqshrun_high_n_s16
   return vqshrun_high_n_s16(a, b, 3);
-  // CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshrun_high_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqshrun_high_n_s32
   return vqshrun_high_n_s32(a, b, 9);
-  // CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqshrun_high_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqshrun_high_n_s64
   return vqshrun_high_n_s64(a, b, 19);
-  // CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vrshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vrshrn_n_s16
   return vrshrn_n_s16(a, 3);
-  // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vrshrn_n_s32
   return vrshrn_n_s32(a, 9);
-  // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vrshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vrshrn_n_s64
   return vrshrn_n_s64(a, 19);
-  // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vrshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vrshrn_n_u16
   return vrshrn_n_u16(a, 3);
-  // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vrshrn_n_u32
   return vrshrn_n_u32(a, 9);
-  // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vrshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vrshrn_n_u64
   return vrshrn_n_u64(a, 19);
-  // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vrshrn_high_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_s16
   return vrshrn_high_n_s16(a, b, 3);
-  // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrshrn_high_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_s32
   return vrshrn_high_n_s32(a, b, 9);
-  // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vrshrn_high_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_s64
   return vrshrn_high_n_s64(a, b, 19);
-  // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vrshrn_high_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_u16
   return vrshrn_high_n_u16(a, b, 3);
-  // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vrshrn_high_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_u32
   return vrshrn_high_n_u32(a, b, 9);
-  // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vrshrn_high_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_u64
   return vrshrn_high_n_u64(a, b, 19);
-  // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqrshrun_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
 int8x8_t test_vqrshrun_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqrshrun_n_s16
   return vqrshrun_n_s16(a, 3);
-  // CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqrshrun_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
 int16x4_t test_vqrshrun_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqrshrun_n_s32
   return vqrshrun_n_s32(a, 9);
-  // CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqrshrun_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
 int32x2_t test_vqrshrun_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqrshrun_n_s64
   return vqrshrun_n_s64(a, 19);
-  // CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqrshrun_high_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqrshrun_high_n_s16
   return vqrshrun_high_n_s16(a, b, 3);
-  // CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqrshrun_high_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqrshrun_high_n_s32
   return vqrshrun_high_n_s32(a, b, 9);
-  // CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqrshrun_high_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqrshrun_high_n_s64
   return vqrshrun_high_n_s64(a, b, 19);
-  // CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshrn_n_s16
   return vqshrn_n_s16(a, 3);
-  // CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshrn_n_s32
   return vqshrn_n_s32(a, 9);
-  // CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshrn_n_s64
   return vqshrn_n_s64(a, 19);
-  // CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqshrn_n_u16
   return vqshrn_n_u16(a, 3);
-  // CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqshrn_n_u32
   return vqshrn_n_u32(a, 9);
-  // CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqshrn_n_u64
   return vqshrn_n_u64(a, 19);
-  // CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqshrn_high_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_s16
   return vqshrn_high_n_s16(a, b, 3);
-  // CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshrn_high_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_s32
   return vqshrn_high_n_s32(a, b, 9);
-  // CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqshrn_high_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_s64
   return vqshrn_high_n_s64(a, b, 19);
-  // CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqshrn_high_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_u16
   return vqshrn_high_n_u16(a, b, 3);
-  // CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqshrn_high_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_u32
   return vqshrn_high_n_u32(a, b, 9);
-  // CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqshrn_high_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_u64
   return vqshrn_high_n_u64(a, b, 19);
-  // CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqrshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_s16
   return vqrshrn_n_s16(a, 3);
-  // CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqrshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_s32
   return vqrshrn_n_s32(a, 9);
-  // CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqrshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_s64
   return vqrshrn_n_s64(a, 19);
-  // CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqrshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_u16
   return vqrshrn_n_u16(a, 3);
-  // CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqrshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_u32
   return vqrshrn_n_u32(a, 9);
-  // CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqrshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_u64
   return vqrshrn_n_u64(a, 19);
-  // CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqrshrn_high_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_s16
   return vqrshrn_high_n_s16(a, b, 3);
-  // CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqrshrn_high_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_s32
   return vqrshrn_high_n_s32(a, b, 9);
-  // CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqrshrn_high_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_s64
   return vqrshrn_high_n_s64(a, b, 19);
-  // CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vqrshrn_high_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_u16
   return vqrshrn_high_n_u16(a, b, 3);
-  // CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: @test_vqrshrn_high_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_u32
   return vqrshrn_high_n_u32(a, b, 9);
-  // CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: @test_vqrshrn_high_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_u64
   return vqrshrn_high_n_u64(a, b, 19);
-  // CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: @test_vshll_n_s8(
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
-// CHECK-LABEL: test_vshll_n_s8
   return vshll_n_s8(a, 3);
-// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vshll_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
-// CHECK-LABEL: test_vshll_n_s16
   return vshll_n_s16(a, 9);
-// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
 }
 
+// CHECK-LABEL: @test_vshll_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
-// CHECK-LABEL: test_vshll_n_s32
   return vshll_n_s32(a, 19);
-// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
 }
 
+// CHECK-LABEL: @test_vshll_n_u8(
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
-// CHECK-LABEL: test_vshll_n_u8
   return vshll_n_u8(a, 3);
-// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: @test_vshll_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
-// CHECK-LABEL: test_vshll_n_u16
   return vshll_n_u16(a, 9);
-// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
 }
 
+// CHECK-LABEL: @test_vshll_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vshll_n_u32
   return vshll_n_u32(a, 19);
-// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
 }
 
+// CHECK-LABEL: @test_vshll_high_n_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_high_n_s8(int8x16_t a) {
-// CHECK-LABEL: test_vshll_high_n_s8
   return vshll_high_n_s8(a, 3);
-// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vshll_high_n_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_high_n_s16(int16x8_t a) {
-// CHECK-LABEL: test_vshll_high_n_s16
   return vshll_high_n_s16(a, 9);
-// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
 }
 
+// CHECK-LABEL: @test_vshll_high_n_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_high_n_s32(int32x4_t a) {
-// CHECK-LABEL: test_vshll_high_n_s32
   return vshll_high_n_s32(a, 19);
-// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
 }
 
+// CHECK-LABEL: @test_vshll_high_n_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
-// CHECK-LABEL: test_vshll_high_n_u8
   return vshll_high_n_u8(a, 3);
-// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: @test_vshll_high_n_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 9);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
 }
 
+// CHECK-LABEL: @test_vshll_high_n_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 19);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
 }
 
+// CHECK-LABEL: @test_vmovl_s8(
+// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
-// CHECK-LABEL: test_vmovl_s8
   return vmovl_s8(a);
-// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: @test_vmovl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
-// CHECK-LABEL: test_vmovl_s16
   return vmovl_s16(a);
-// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: @test_vmovl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
-// CHECK-LABEL: test_vmovl_s32
   return vmovl_s32(a);
-// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: @test_vmovl_u8(
+// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
-// CHECK-LABEL: test_vmovl_u8
   return vmovl_u8(a);
-// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: @test_vmovl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
-// CHECK-LABEL: test_vmovl_u16
   return vmovl_u16(a);
-// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: @test_vmovl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vmovl_u32
   return vmovl_u32(a);
-// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: @test_vmovl_high_s8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vmovl_high_s8(int8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_s8
   return vmovl_high_s8(a);
-// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: @test_vmovl_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vmovl_high_s16(int16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_s16
   return vmovl_high_s16(a);
-// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: @test_vmovl_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 int64x2_t test_vmovl_high_s32(int32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_s32
   return vmovl_high_s32(a);
-// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: @test_vmovl_high_u8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_u8
   return vmovl_high_u8(a);
-// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: @test_vmovl_high_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_u16
   return vmovl_high_u16(a);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: @test_vmovl_high_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_u32
   return vmovl_high_u32(a);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: @test_vcvt_n_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_f32_s32
   return vcvt_n_f32_s32(a, 31);
-  // CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f32_s32
   return vcvtq_n_f32_s32(a, 31);
-  // CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_f64_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x double> [[VCVT_N1]]
 float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f64_s64
   return vcvtq_n_f64_s64(a, 50);
-  // CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: @test_vcvt_n_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_f32_u32
   return vcvt_n_f32_u32(a, 31);
-  // CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f32_u32
   return vcvtq_n_f32_u32(a, 31);
-  // CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_f64_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x double> [[VCVT_N1]]
 float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f64_u64
   return vcvtq_n_f64_u64(a, 50);
-  // CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: @test_vcvt_n_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_s32_f32
   return vcvt_n_s32_f32(a, 31);
-  // CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_s32_f32
   return vcvtq_n_s32_f32(a, 31);
-  // CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x i64> [[VCVT_N1]]
 int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_s64_f64
   return vcvtq_n_s64_f64(a, 50);
-  // CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: @test_vcvt_n_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_u32_f32
   return vcvt_n_u32_f32(a, 31);
-  // CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_u32_f32
   return vcvtq_n_u32_f32(a, 31);
-  // CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: @test_vcvtq_n_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x i64> [[VCVT_N1]]
 uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_u64_f64
   return vcvtq_n_u64_f64(a, 50);
-  // CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: @test_vaddl_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vaddl_s8
   return vaddl_s8(a, b);
-  // CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vaddl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vaddl_s16
   return vaddl_s16(a, b);
-  // CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vaddl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vaddl_s32
   return vaddl_s32(a, b);
-  // CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddl_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vaddl_u8
   return vaddl_u8(a, b);
-  // CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vaddl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vaddl_u16
   return vaddl_u16(a, b);
-  // CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vaddl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vaddl_u32
   return vaddl_u32(a, b);
-  // CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddl_high_s8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vaddl_high_s8
   return vaddl_high_s8(a, b);
-  // CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddl_high_s16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddl_high_s16
   return vaddl_high_s16(a, b);
-  // CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddl_high_s32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddl_high_s32
   return vaddl_high_s32(a, b);
-  // CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddl_high_u8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vaddl_high_u8
   return vaddl_high_u8(a, b);
-  // CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddl_high_u16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddl_high_u16
   return vaddl_high_u16(a, b);
-  // CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddl_high_u32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddl_high_u32
   return vaddl_high_u32(a, b);
-  // CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddw_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vaddw_s8
   return vaddw_s8(a, b);
-  // CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vaddw_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vaddw_s16
   return vaddw_s16(a, b);
-  // CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vaddw_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vaddw_s32
   return vaddw_s32(a, b);
-  // CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddw_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vaddw_u8
   return vaddw_u8(a, b);
-  // CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vaddw_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vaddw_u16
   return vaddw_u16(a, b);
-  // CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vaddw_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vaddw_u32
   return vaddw_u32(a, b);
-  // CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddw_high_s8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vaddw_high_s8
   return vaddw_high_s8(a, b);
-  // CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddw_high_s16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddw_high_s16
   return vaddw_high_s16(a, b);
-  // CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddw_high_s32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddw_high_s32
   return vaddw_high_s32(a, b);
-  // CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddw_high_u8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vaddw_high_u8
   return vaddw_high_u8(a, b);
-  // CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddw_high_u16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddw_high_u16
   return vaddw_high_u16(a, b);
-  // CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddw_high_u32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddw_high_u32
   return vaddw_high_u32(a, b);
-  // CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubl_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsubl_s8
   return vsubl_s8(a, b);
-  // CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vsubl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsubl_s16
   return vsubl_s16(a, b);
-  // CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vsubl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsubl_s32
   return vsubl_s32(a, b);
-  // CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsubl_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vsubl_u8
   return vsubl_u8(a, b);
-  // CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vsubl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vsubl_u16
   return vsubl_u16(a, b);
-  // CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vsubl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vsubl_u32
   return vsubl_u32(a, b);
-  // CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsubl_high_s8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsubl_high_s8
   return vsubl_high_s8(a, b);
-  // CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsubl_high_s16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubl_high_s16
   return vsubl_high_s16(a, b);
-  // CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubl_high_s32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubl_high_s32
   return vsubl_high_s32(a, b);
-  // CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubl_high_u8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsubl_high_u8
   return vsubl_high_u8(a, b);
-  // CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsubl_high_u16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubl_high_u16
   return vsubl_high_u16(a, b);
-  // CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubl_high_u32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubl_high_u32
   return vsubl_high_u32(a, b);
-  // CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubw_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsubw_s8
   return vsubw_s8(a, b);
-  // CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vsubw_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsubw_s16
   return vsubw_s16(a, b);
-  // CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vsubw_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsubw_s32
   return vsubw_s32(a, b);
-  // CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsubw_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vsubw_u8
   return vsubw_u8(a, b);
-  // CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vsubw_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vsubw_u16
   return vsubw_u16(a, b);
-  // CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vsubw_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vsubw_u32
   return vsubw_u32(a, b);
-  // CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsubw_high_s8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsubw_high_s8
   return vsubw_high_s8(a, b);
-  // CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsubw_high_s16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubw_high_s16
   return vsubw_high_s16(a, b);
-  // CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubw_high_s32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubw_high_s32
   return vsubw_high_s32(a, b);
-  // CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubw_high_u8(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsubw_high_u8
   return vsubw_high_u8(a, b);
-  // CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsubw_high_u16(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubw_high_u16
   return vsubw_high_u16(a, b);
-  // CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubw_high_u32(
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubw_high_u32
   return vsubw_high_u32(a, b);
-  // CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_s16
   return vaddhn_s16(a, b);
-  // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_s32
   return vaddhn_s32(a, b);
-  // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_s64
   return vaddhn_s64(a, b);
-  // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_u16
   return vaddhn_u16(a, b);
-  // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_u32
   return vaddhn_u32(a, b);
-  // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_u64
   return vaddhn_u64(a, b);
-  // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddhn_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_high_s16
   return vaddhn_high_s16(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddhn_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_high_s32
   return vaddhn_high_s32(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddhn_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_high_s64
   return vaddhn_high_s64(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddhn_high_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_high_u16
   return vaddhn_high_u16(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vaddhn_high_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_high_u32
   return vaddhn_high_u32(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vaddhn_high_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_high_u64
   return vaddhn_high_u64(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vraddhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_s16
   return vraddhn_s16(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vraddhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRADDHN_V2_I]]
 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_s32
   return vraddhn_s32(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vraddhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRADDHN_V2_I]]
 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_s64
   return vraddhn_s64(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vraddhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_u16
   return vraddhn_u16(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vraddhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRADDHN_V2_I]]
 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_u32
   return vraddhn_u32(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vraddhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRADDHN_V2_I]]
 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_u64
   return vraddhn_u64(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vraddhn_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_high_s16
   return vraddhn_high_s16(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vraddhn_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_high_s32
   return vraddhn_high_s32(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vraddhn_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_high_s64
   return vraddhn_high_s64(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vraddhn_high_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_high_u16
   return vraddhn_high_u16(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vraddhn_high_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_high_u32
   return vraddhn_high_u32(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vraddhn_high_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_high_u64
   return vraddhn_high_u64(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsubhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_s16
   return vsubhn_s16(a, b);
-  // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_s32
   return vsubhn_s32(a, b);
-  // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_s64
   return vsubhn_s64(a, b);
-  // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsubhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_u16
   return vsubhn_u16(a, b);
-  // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_u32
   return vsubhn_u32(a, b);
-  // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_u64
   return vsubhn_u64(a, b);
-  // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsubhn_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_high_s16
   return vsubhn_high_s16(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubhn_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_high_s32
   return vsubhn_high_s32(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubhn_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_high_s64
   return vsubhn_high_s64(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vsubhn_high_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_high_u16
   return vsubhn_high_u16(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsubhn_high_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_high_u32
   return vsubhn_high_u32(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsubhn_high_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_high_u64
   return vsubhn_high_u64(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrsubhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_s16
   return vrsubhn_s16(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrsubhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSUBHN_V2_I]]
 int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_s32
   return vrsubhn_s32(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrsubhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSUBHN_V2_I]]
 int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_s64
   return vrsubhn_s64(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrsubhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_u16
   return vrsubhn_u16(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrsubhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSUBHN_V2_I]]
 uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_u32
   return vrsubhn_u32(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrsubhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSUBHN_V2_I]]
 uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_u64
   return vrsubhn_u64(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrsubhn_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_s16
   return vrsubhn_high_s16(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrsubhn_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_s32
   return vrsubhn_high_s32(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrsubhn_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_s64
   return vrsubhn_high_s64(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrsubhn_high_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_u16
   return vrsubhn_high_u16(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vrsubhn_high_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_u32
   return vrsubhn_high_u32(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrsubhn_high_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_u64
   return vrsubhn_high_u64(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vabdl_s8(
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vabdl_s8
   return vabdl_s8(a, b);
-  // CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vabdl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vabdl_s16
   return vabdl_s16(a, b);
-  // CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vabdl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vabdl_s32
   return vabdl_s32(a, b);
-  // CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+
+// CHECK-LABEL: @test_vabdl_u8(
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vabdl_u8
   return vabdl_u8(a, b);
-  // CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vabdl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vabdl_u16
   return vabdl_u16(a, b);
-  // CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vabdl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vabdl_u32
   return vabdl_u32(a, b);
-  // CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabal_s8(
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vabal_s8
   return vabal_s8(a, b, c);
-  // CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vabal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vabal_s16
   return vabal_s16(a, b, c);
-  // CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vabal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vabal_s32
   return vabal_s32(a, b, c);
-  // CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+
+// CHECK-LABEL: @test_vabal_u8(
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vabal_u8
   return vabal_u8(a, b, c);
-  // CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vabal_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
-  // CHECK-LABEL: test_vabal_u16
   return vabal_u16(a, b, c);
-  // CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vabal_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
-  // CHECK-LABEL: test_vabal_u32
   return vabal_u32(a, b, c);
-  // CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabdl_high_s8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I_I]]
 int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vabdl_high_s8
   return vabdl_high_s8(a, b);
-  // CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vabdl_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I_I]]
 int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vabdl_high_s16
   return vabdl_high_s16(a, b);
-  // CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vabdl_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I_I]]
 int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vabdl_high_s32
   return vabdl_high_s32(a, b);
-  // CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vabdl_high_u8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I_I]]
 uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vabdl_high_u8
   return vabdl_high_u8(a, b);
-  // CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vabdl_high_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I_I]]
 uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vabdl_high_u16
   return vabdl_high_u16(a, b);
-  // CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vabdl_high_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I_I]]
 uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vabdl_high_u32
   return vabdl_high_u32(a, b);
-  // CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabal_high_s8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vabal_high_s8
   return vabal_high_s8(a, b, c);
-  // CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vabal_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vabal_high_s16
   return vabal_high_s16(a, b, c);
-  // CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vabal_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vabal_high_s32
   return vabal_high_s32(a, b, c);
-  // CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vabal_high_u8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vabal_high_u8
   return vabal_high_u8(a, b, c);
-  // CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vabal_high_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
-  // CHECK-LABEL: test_vabal_high_u16
   return vabal_high_u16(a, b, c);
-  // CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vabal_high_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
-  // CHECK-LABEL: test_vabal_high_u32
   return vabal_high_u32(a, b, c);
-  // CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmull_s8(
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vmull_s8
   return vmull_s8(a, b);
-  // CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vmull_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vmull_s16
   return vmull_s16(a, b);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vmull_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vmull_s32
   return vmull_s32(a, b);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+
+// CHECK-LABEL: @test_vmull_u8(
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vmull_u8
   return vmull_u8(a, b);
-  // CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vmull_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vmull_u16
   return vmull_u16(a, b);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vmull_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vmull_u32
   return vmull_u32(a, b);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmull_high_s8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <8 x i16> [[VMULL_I_I]]
 int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vmull_high_s8
   return vmull_high_s8(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vmull_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I_I]]
 int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmull_high_s16
   return vmull_high_s16(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vmull_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I_I]]
 int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmull_high_s32
   return vmull_high_s32(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vmull_high_u8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <8 x i16> [[VMULL_I_I]]
 uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vmull_high_u8
   return vmull_high_u8(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vmull_high_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I_I]]
 uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vmull_high_u16
   return vmull_high_u16(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vmull_high_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I_I]]
 uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vmull_high_u32
   return vmull_high_u32(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlal_s8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vmlal_s8
   return vmlal_s8(a, b, c);
-  // CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vmlal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vmlal_s16
   return vmlal_s16(a, b, c);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vmlal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vmlal_s32
   return vmlal_s32(a, b, c);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+
+// CHECK-LABEL: @test_vmlal_u8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vmlal_u8
   return vmlal_u8(a, b, c);
-  // CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vmlal_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
-  // CHECK-LABEL: test_vmlal_u16
   return vmlal_u16(a, b, c);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vmlal_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
-  // CHECK-LABEL: test_vmlal_u32
   return vmlal_u32(a, b, c);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlal_high_s8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vmlal_high_s8
   return vmlal_high_s8(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vmlal_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vmlal_high_s16
   return vmlal_high_s16(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vmlal_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vmlal_high_s32
   return vmlal_high_s32(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vmlal_high_u8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vmlal_high_u8
   return vmlal_high_u8(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vmlal_high_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
-  // CHECK-LABEL: test_vmlal_high_u16
   return vmlal_high_u16(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vmlal_high_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
-  // CHECK-LABEL: test_vmlal_high_u32
   return vmlal_high_u32(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmlsl_s8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vmlsl_s8
   return vmlsl_s8(a, b, c);
-  // CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vmlsl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vmlsl_s16
   return vmlsl_s16(a, b, c);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vmlsl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vmlsl_s32
   return vmlsl_s32(a, b, c);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+
+// CHECK-LABEL: @test_vmlsl_u8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vmlsl_u8
   return vmlsl_u8(a, b, c);
-  // CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vmlsl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
-  // CHECK-LABEL: test_vmlsl_u16
   return vmlsl_u16(a, b, c);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vmlsl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
-  // CHECK-LABEL: test_vmlsl_u32
   return vmlsl_u32(a, b, c);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmlsl_high_s8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I_I]]
 int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_s8
   return vmlsl_high_s8(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vmlsl_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vmlsl_high_s16
   return vmlsl_high_s16(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vmlsl_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vmlsl_high_s32
   return vmlsl_high_s32(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vmlsl_high_u8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I_I]]
 uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_u8
   return vmlsl_high_u8(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vmlsl_high_u16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
-  // CHECK-LABEL: test_vmlsl_high_u16
   return vmlsl_high_u16(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vmlsl_high_u32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
-  // CHECK-LABEL: test_vmlsl_high_u32
   return vmlsl_high_u32(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqdmull_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vqdmull_s16
   return vqdmull_s16(a, b);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vqdmull_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vqdmull_s32
   return vqdmull_s32(a, b);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmlal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vqdmlal_s16
   return vqdmlal_s16(a, b, c);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmlal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vqdmlal_s32
   return vqdmlal_s32(a, b, c);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmlsl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vqdmlsl_s16
   return vqdmlsl_s16(a, b, c);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqdmlsl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vqdmlsl_s32
   return vqdmlsl_s32(a, b, c);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqdmull_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I_I]]
 int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqdmull_high_s16
   return vqdmull_high_s16(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vqdmull_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I_I]]
 int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqdmull_high_s32
   return vqdmull_high_s32(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I_I]]
 int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_s16
   return vqdmlal_high_s16(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqdmlal_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I_I]]
 int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_s32
   return vqdmlal_high_s32(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_s16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I_I]]
 int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_s16
   return vqdmlsl_high_s16(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqdmlsl_high_s32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I_I]]
 int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_s32
   return vqdmlsl_high_s32(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmull_p8(
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vmull_p8
   return vmull_p8(a, b);
-  // CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmull_high_p8(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <8 x i16> [[VMULL_I_I]]
 poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vmull_high_p8
   return vmull_high_p8(a, b);
-  // CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vaddd_s64(
+// CHECK:   [[VADDD_I:%.*]] = add i64 %a, %b
+// CHECK:   ret i64 [[VADDD_I]]
 int64_t test_vaddd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vaddd_s64
   return vaddd_s64(a, b);
-// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: @test_vaddd_u64(
+// CHECK:   [[VADDD_I:%.*]] = add i64 %a, %b
+// CHECK:   ret i64 [[VADDD_I]]
 uint64_t test_vaddd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vaddd_u64
   return vaddd_u64(a, b);
-// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsubd_s64(
+// CHECK:   [[VSUBD_I:%.*]] = sub i64 %a, %b
+// CHECK:   ret i64 [[VSUBD_I]]
 int64_t test_vsubd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsubd_s64
   return vsubd_s64(a, b);
-// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsubd_u64(
+// CHECK:   [[VSUBD_I:%.*]] = sub i64 %a, %b
+// CHECK:   ret i64 [[VSUBD_I]]
 uint64_t test_vsubd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsubd_u64
   return vsubd_u64(a, b);
-// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqaddb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqaddb_s8(int8_t a, int8_t b) {
-// CHECK-LABEL: test_vqaddb_s8
   return vqaddb_s8(a, b);
-// CHECK: sqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: @test_vqaddh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqaddh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqaddh_s16
   return vqaddh_s16(a, b);
-// CHECK: sqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: @test_vqadds_s32(
+// CHECK:   [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQADDS_S32_I]]
 int32_t test_vqadds_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqadds_s32
   return vqadds_s32(a, b);
-// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqaddd_s64(
+// CHECK:   [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQADDD_S64_I]]
 int64_t test_vqaddd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vqaddd_s64
   return vqaddd_s64(a, b);
-// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqaddb_u8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
-// CHECK-LABEL: test_vqaddb_u8
   return vqaddb_u8(a, b);
-// CHECK: uqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: @test_vqaddh_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
-// CHECK-LABEL: test_vqaddh_u16
   return vqaddh_u16(a, b);
-// CHECK: uqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: @test_vqadds_u32(
+// CHECK:   [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQADDS_U32_I]]
 uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
-// CHECK-LABEL: test_vqadds_u32
   return vqadds_u32(a, b);
-// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqaddd_u64(
+// CHECK:   [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQADDD_U64_I]]
 uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vqaddd_u64
   return vqaddd_u64(a, b);
-// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqsubb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqsubb_s8(int8_t a, int8_t b) {
-// CHECK-LABEL: test_vqsubb_s8
   return vqsubb_s8(a, b);
-// CHECK: sqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: @test_vqsubh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqsubh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqsubh_s16
   return vqsubh_s16(a, b);
-// CHECK: sqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: @test_vqsubs_s32(
+// CHECK:   [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSUBS_S32_I]]
 int32_t test_vqsubs_s32(int32_t a, int32_t b) {
-  // CHECK-LABEL: test_vqsubs_s32
   return vqsubs_s32(a, b);
-// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqsubd_s64(
+// CHECK:   [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSUBD_S64_I]]
 int64_t test_vqsubd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vqsubd_s64
   return vqsubd_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqsubb_u8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
-// CHECK-LABEL: test_vqsubb_u8
   return vqsubb_u8(a, b);
-// CHECK: uqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: @test_vqsubh_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
-// CHECK-LABEL: test_vqsubh_u16
   return vqsubh_u16(a, b);
-// CHECK: uqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: @test_vqsubs_u32(
+// CHECK:   [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSUBS_U32_I]]
 uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
-// CHECK-LABEL: test_vqsubs_u32
   return vqsubs_u32(a, b);
-// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqsubd_u64(
+// CHECK:   [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSUBD_U64_I]]
 uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vqsubd_u64
   return vqsubd_u64(a, b);
-// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vshld_s64(
+// CHECK:   [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VSHLD_S64_I]]
 int64_t test_vshld_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vshld_s64
   return vshld_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vshld_u64(
+// CHECK:   [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VSHLD_U64_I]]
 uint64_t test_vshld_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vshld_u64
   return vshld_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshlb_s8
+// CHECK-LABEL: @test_vqshlb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqshlb_s8(int8_t a, int8_t b) {
   return vqshlb_s8(a, b);
-// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqshlh_s16
+// CHECK-LABEL: @test_vqshlh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqshlh_s16(int16_t a, int16_t b) {
   return vqshlh_s16(a, b);
-// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqshls_s32
+// CHECK-LABEL: @test_vqshls_s32(
+// CHECK:   [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSHLS_S32_I]]
 int32_t test_vqshls_s32(int32_t a, int32_t b) {
   return vqshls_s32(a, b);
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshld_s64
+// CHECK-LABEL: @test_vqshld_s64(
+// CHECK:   [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSHLD_S64_I]]
 int64_t test_vqshld_s64(int64_t a, int64_t b) {
   return vqshld_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshlb_u8
+// CHECK-LABEL: @test_vqshlb_u8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) {
   return vqshlb_u8(a, b);
-// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqshlh_u16
+// CHECK-LABEL: @test_vqshlh_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) {
   return vqshlh_u16(a, b);
-// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqshls_u32
+// CHECK-LABEL: @test_vqshls_u32(
+// CHECK:   [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSHLS_U32_I]]
 uint32_t test_vqshls_u32(uint32_t a, uint32_t b) {
   return vqshls_u32(a, b);
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshld_u64
+// CHECK-LABEL: @test_vqshld_u64(
+// CHECK:   [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSHLD_U64_I]]
 uint64_t test_vqshld_u64(uint64_t a, uint64_t b) {
   return vqshld_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vrshld_s64
+// CHECK-LABEL: @test_vrshld_s64(
+// CHECK:   [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VRSHLD_S64_I]]
 int64_t test_vrshld_s64(int64_t a, int64_t b) {
   return vrshld_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-
-// CHECK-LABEL: test_vrshld_u64
+// CHECK-LABEL: @test_vrshld_u64(
+// CHECK:   [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VRSHLD_U64_I]]
 uint64_t test_vrshld_u64(uint64_t a, uint64_t b) {
   return vrshld_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshlb_s8
+// CHECK-LABEL: @test_vqrshlb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
   return vqrshlb_s8(a, b);
-// CHECK: sqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqrshlh_s16
+// CHECK-LABEL: @test_vqrshlh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
   return vqrshlh_s16(a, b);
-// CHECK: sqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqrshls_s32
+// CHECK-LABEL: @test_vqrshls_s32(
+// CHECK:   [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQRSHLS_S32_I]]
 int32_t test_vqrshls_s32(int32_t a, int32_t b) {
   return vqrshls_s32(a, b);
-// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshld_s64
+// CHECK-LABEL: @test_vqrshld_s64(
+// CHECK:   [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQRSHLD_S64_I]]
 int64_t test_vqrshld_s64(int64_t a, int64_t b) {
   return vqrshld_s64(a, b);
-// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshlb_u8
+// CHECK-LABEL: @test_vqrshlb_u8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) {
   return vqrshlb_u8(a, b);
-// CHECK: uqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqrshlh_u16
+// CHECK-LABEL: @test_vqrshlh_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) {
   return vqrshlh_u16(a, b);
-// CHECK: uqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqrshls_u32
+// CHECK-LABEL: @test_vqrshls_u32(
+// CHECK:   [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQRSHLS_U32_I]]
 uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) {
   return vqrshls_u32(a, b);
-// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshld_u64
+// CHECK-LABEL: @test_vqrshld_u64(
+// CHECK:   [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQRSHLD_U64_I]]
 uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) {
   return vqrshld_u64(a, b);
-// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vpaddd_s64
+// CHECK-LABEL: @test_vpaddd_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) #4
+// CHECK:   ret i64 [[VPADDD_S64_I]]
 int64_t test_vpaddd_s64(int64x2_t a) {
   return vpaddd_s64(a);
-// CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpadds_f32
+// CHECK-LABEL: @test_vpadds_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0
+// CHECK:   [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1
+// CHECK:   [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// CHECK:   ret float [[VPADDD_I]]
 float32_t test_vpadds_f32(float32x2_t a) {
   return vpadds_f32(a);
-// CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpaddd_f64
+// CHECK-LABEL: @test_vpaddd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0
+// CHECK:   [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1
+// CHECK:   [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// CHECK:   ret double [[VPADDD_I]]
 float64_t test_vpaddd_f64(float64x2_t a) {
   return vpaddd_f64(a);
-// CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpmaxnms_f32
+// CHECK-LABEL: @test_vpmaxnms_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VPMAXNMS_F32_I]]
 float32_t test_vpmaxnms_f32(float32x2_t a) {
   return vpmaxnms_f32(a);
-// CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpmaxnmqd_f64
+// CHECK-LABEL: @test_vpmaxnmqd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VPMAXNMQD_F64_I]]
 float64_t test_vpmaxnmqd_f64(float64x2_t a) {
   return vpmaxnmqd_f64(a);
-// CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpmaxs_f32
+// CHECK-LABEL: @test_vpmaxs_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VPMAXS_F32_I]]
 float32_t test_vpmaxs_f32(float32x2_t a) {
   return vpmaxs_f32(a);
-// CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpmaxqd_f64
+// CHECK-LABEL: @test_vpmaxqd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VPMAXQD_F64_I]]
 float64_t test_vpmaxqd_f64(float64x2_t a) {
   return vpmaxqd_f64(a);
-// CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpminnms_f32
+// CHECK-LABEL: @test_vpminnms_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VPMINNMS_F32_I]]
 float32_t test_vpminnms_f32(float32x2_t a) {
   return vpminnms_f32(a);
-// CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpminnmqd_f64
+// CHECK-LABEL: @test_vpminnmqd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VPMINNMQD_F64_I]]
 float64_t test_vpminnmqd_f64(float64x2_t a) {
   return vpminnmqd_f64(a);
-// CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpmins_f32
+// CHECK-LABEL: @test_vpmins_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VPMINS_F32_I]]
 float32_t test_vpmins_f32(float32x2_t a) {
   return vpmins_f32(a);
-// CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpminqd_f64
+// CHECK-LABEL: @test_vpminqd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VPMINQD_F64_I]]
 float64_t test_vpminqd_f64(float64x2_t a) {
   return vpminqd_f64(a);
-// CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqdmulhh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqdmulhh_s16
   return vqdmulhh_s16(a, b);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: @test_vqdmulhs_s32(
+// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQDMULHS_S32_I]]
 int32_t test_vqdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqdmulhs_s32
   return vqdmulhs_s32(a, b);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqrdmulhh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqrdmulhh_s16
   return vqrdmulhh_s16(a, b);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: @test_vqrdmulhs_s32(
+// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
 int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqrdmulhs_s32
   return vqrdmulhs_s32(a, b);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmulxs_f32(
+// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) #4
+// CHECK:   ret float [[VMULXS_F32_I]]
 float32_t test_vmulxs_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vmulxs_f32
   return vmulxs_f32(a, b);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmulxd_f64(
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) #4
+// CHECK:   ret double [[VMULXD_F64_I]]
 float64_t test_vmulxd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vmulxd_f64
   return vmulxd_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmulx_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VMULX2_I]]
 float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmulx_f64
   return vmulx_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrecpss_f32(
+// CHECK:   [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) #4
+// CHECK:   ret float [[VRECPS_I]]
 float32_t test_vrecpss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrecpss_f32
   return vrecpss_f32(a, b);
-// CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrecpsd_f64(
+// CHECK:   [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) #4
+// CHECK:   ret double [[VRECPS_I]]
 float64_t test_vrecpsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrecpsd_f64
   return vrecpsd_f64(a, b);
-// CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrsqrtss_f32(
+// CHECK:   [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) #4
+// CHECK:   ret float [[VRSQRTSS_F32_I]]
 float32_t test_vrsqrtss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrsqrtss_f32
   return vrsqrtss_f32(a, b);
-// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrsqrtsd_f64(
+// CHECK:   [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) #4
+// CHECK:   ret double [[VRSQRTSD_F64_I]]
 float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrsqrtsd_f64
   return vrsqrtsd_f64(a, b);
-// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvts_f32_s32(
+// CHECK:   [[TMP0:%.*]] = sitofp i32 %a to float
+// CHECK:   ret float [[TMP0]]
 float32_t test_vcvts_f32_s32(int32_t a) {
-// CHECK-LABEL: test_vcvts_f32_s32
-// CHECK: scvtf {{s[0-9]+}}, {{[ws][0-9]+}}
   return vcvts_f32_s32(a);
 }
 
+// CHECK-LABEL: @test_vcvtd_f64_s64(
+// CHECK:   [[TMP0:%.*]] = sitofp i64 %a to double
+// CHECK:   ret double [[TMP0]]
 float64_t test_vcvtd_f64_s64(int64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_s64
-// CHECK: scvtf {{d[0-9]+}}, {{[dx][0-9]+}}
   return vcvtd_f64_s64(a);
 }
 
+// CHECK-LABEL: @test_vcvts_f32_u32(
+// CHECK:   [[TMP0:%.*]] = uitofp i32 %a to float
+// CHECK:   ret float [[TMP0]]
 float32_t test_vcvts_f32_u32(uint32_t a) {
-// CHECK-LABEL: test_vcvts_f32_u32
-// CHECK: ucvtf {{s[0-9]+}}, {{[ws][0-9]+}}
   return vcvts_f32_u32(a);
 }
 
+// CHECK-LABEL: @test_vcvtd_f64_u64(
+// CHECK:   [[TMP0:%.*]] = uitofp i64 %a to double
+// CHECK:   ret double [[TMP0]]
 float64_t test_vcvtd_f64_u64(uint64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_u64
-// CHECK: ucvtf {{d[0-9]+}}, {{[xd][0-9]+}}
   return vcvtd_f64_u64(a);
 }
 
+// CHECK-LABEL: @test_vrecpes_f32(
+// CHECK:   [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a) #4
+// CHECK:   ret float [[VRECPES_F32_I]]
 float32_t test_vrecpes_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpes_f32
-// CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
   return vrecpes_f32(a);
 }
- 
+
+// CHECK-LABEL: @test_vrecped_f64(
+// CHECK:   [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a) #4
+// CHECK:   ret double [[VRECPED_F64_I]]
 float64_t test_vrecped_f64(float64_t a) {
-// CHECK-LABEL: test_vrecped_f64
-// CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
   return vrecped_f64(a);
 }
- 
+
+// CHECK-LABEL: @test_vrecpxs_f32(
+// CHECK:   [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a) #4
+// CHECK:   ret float [[VRECPXS_F32_I]]
 float32_t test_vrecpxs_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpxs_f32
-// CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
   return vrecpxs_f32(a);
- }
- 
+}
+
+// CHECK-LABEL: @test_vrecpxd_f64(
+// CHECK:   [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a) #4
+// CHECK:   ret double [[VRECPXD_F64_I]]
 float64_t test_vrecpxd_f64(float64_t a) {
-// CHECK-LABEL: test_vrecpxd_f64
-// CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
   return vrecpxd_f64(a);
 }
 
+// CHECK-LABEL: @test_vrsqrte_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %a) #4
+// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
 uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vrsqrte_u32
-// CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   return vrsqrte_u32(a);
 }
 
+// CHECK-LABEL: @test_vrsqrteq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %a) #4
+// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
 uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vrsqrteq_u32
-// CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   return vrsqrteq_u32(a);
 }
 
+// CHECK-LABEL: @test_vrsqrtes_f32(
+// CHECK:   [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) #4
+// CHECK:   ret float [[VRSQRTES_F32_I]]
 float32_t test_vrsqrtes_f32(float32_t a) {
-// CHECK: vrsqrtes_f32
-// CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
   return vrsqrtes_f32(a);
 }
 
+// CHECK-LABEL: @test_vrsqrted_f64(
+// CHECK:   [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) #4
+// CHECK:   ret double [[VRSQRTED_F64_I]]
 float64_t test_vrsqrted_f64(float64_t a) {
-// CHECK: vrsqrted_f64
-// CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
   return vrsqrted_f64(a);
 }
 
+// CHECK-LABEL: @test_vld1q_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 uint8x16_t test_vld1q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8
   return vld1q_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vld1q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16
   return vld1q_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vld1q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32
   return vld1q_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vld1q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64
   return vld1q_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 int8x16_t test_vld1q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8
   return vld1q_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vld1q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16
   return vld1q_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vld1q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32
   return vld1q_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vld1q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64
   return vld1q_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP3]]
 float16x8_t test_vld1q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16
   return vld1q_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vld1q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32
   return vld1q_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
+// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
+// CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vld1q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64
   return vld1q_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 poly8x16_t test_vld1q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8
   return vld1q_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 poly16x8_t test_vld1q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16
   return vld1q_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 uint8x8_t test_vld1_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8
   return vld1_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vld1_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16
   return vld1_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vld1_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32
   return vld1_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vld1_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64
   return vld1_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 int8x8_t test_vld1_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8
   return vld1_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vld1_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16
   return vld1_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vld1_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32
   return vld1_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vld1_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64
   return vld1_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP3]]
 float16x4_t test_vld1_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16
   return vld1_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vld1_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32
   return vld1_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
+// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
+// CHECK:   ret <1 x double> [[TMP2]]
 float64x1_t test_vld1_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64
   return vld1_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 poly8x8_t test_vld1_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8
   return vld1_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 poly16x4_t test_vld1_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16
   return vld1_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_u8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP5]]
 uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2q_u8
   return vld2q_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_u16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2q_u16
   return vld2q_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_u32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2q_u32
   return vld2q_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_u64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2q_u64
   return vld2q_u64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_s8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP5]]
 int8x16x2_t test_vld2q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2q_s8
   return vld2q_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_s16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2q_s16
   return vld2q_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_s32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2q_s32
   return vld2q_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_s64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2q_s64
   return vld2q_s64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_f16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2q_f16
   return vld2q_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_f32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2q_f32
   return vld2q_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_f64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2q_f64
   return vld2q_f64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_p8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP5]]
 poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2q_p8
   return vld2q_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2q_p16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2q_p16
   return vld2q_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_u8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP5]]
 uint8x8x2_t test_vld2_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2_u8
   return vld2_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_u16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2_u16
   return vld2_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_u32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2_u32
   return vld2_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_u64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2_u64
   return vld2_u64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_s8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP5]]
 int8x8x2_t test_vld2_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2_s8
   return vld2_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_s16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2_s16
   return vld2_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_s32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2_s32
   return vld2_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_s64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2_s64
   return vld2_s64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_f16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2_f16
   return vld2_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_f32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2_f32
   return vld2_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_f64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2_f64
   return vld2_f64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_p8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP5]]
 poly8x8x2_t test_vld2_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2_p8
   return vld2_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld2_p16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2_p16
   return vld2_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_u8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP5]]
 uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld3q_u8
   return vld3q_u8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_u16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld3q_u16
   return vld3q_u16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_u32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld3q_u32
   return vld3q_u32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_u64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld3q_u64
   return vld3q_u64(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_s8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP5]]
 int8x16x3_t test_vld3q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld3q_s8
   return vld3q_s8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_s16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld3q_s16
   return vld3q_s16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_s32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld3q_s32
   return vld3q_s32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_s64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld3q_s64
   return vld3q_s64(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_f16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld3q_f16
   return vld3q_f16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_f32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld3q_f32
   return vld3q_f32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_f64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld3q_f64
   return vld3q_f64(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_p8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP5]]
 poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld3q_p8
   return vld3q_p8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3q_p16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld3q_p16
   return vld3q_p16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_u8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP5]]
 uint8x8x3_t test_vld3_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld3_u8
   return vld3_u8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_u16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld3_u16
   return vld3_u16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_u32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld3_u32
   return vld3_u32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_u64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld3_u64
   return vld3_u64(a);
-  // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_s8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP5]]
 int8x8x3_t test_vld3_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld3_s8
   return vld3_s8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_s16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld3_s16
   return vld3_s16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_s32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld3_s32
   return vld3_s32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_s64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld3_s64
   return vld3_s64(a);
-  // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_f16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld3_f16
   return vld3_f16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_f32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld3_f32
   return vld3_f32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_f64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld3_f64
   return vld3_f64(a);
-  // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_p8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP5]]
 poly8x8x3_t test_vld3_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld3_p8
   return vld3_p8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld3_p16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld3_p16
   return vld3_p16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_u8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP5]]
 uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld4q_u8
   return vld4q_u8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_u16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld4q_u16
   return vld4q_u16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_u32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld4q_u32
   return vld4q_u32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_u64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld4q_u64
   return vld4q_u64(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_s8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP5]]
 int8x16x4_t test_vld4q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld4q_s8
   return vld4q_s8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_s16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld4q_s16
   return vld4q_s16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_s32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld4q_s32
   return vld4q_s32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_s64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld4q_s64
   return vld4q_s64(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_f16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld4q_f16
   return vld4q_f16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_f32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld4q_f32
   return vld4q_f32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_f64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld4q_f64
   return vld4q_f64(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_p8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP5]]
 poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld4q_p8
   return vld4q_p8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4q_p16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld4q_p16
   return vld4q_p16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_u8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP5]]
 uint8x8x4_t test_vld4_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld4_u8
   return vld4_u8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_u16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld4_u16
   return vld4_u16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_u32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld4_u32
   return vld4_u32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_u64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld4_u64
   return vld4_u64(a);
-  // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_s8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP5]]
 int8x8x4_t test_vld4_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld4_s8
   return vld4_s8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_s16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld4_s16
   return vld4_s16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_s32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld4_s32
   return vld4_s32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_s64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld4_s64
   return vld4_s64(a);
-  // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_f16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld4_f16
   return vld4_f16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_f32(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld4_f32
   return vld4_f32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_f64(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld4_f64
   return vld4_f64(a);
-  // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_p8(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP5]]
 poly8x8x4_t test_vld4_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld4_p8
   return vld4_p8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld4_p16(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld4_p16
   return vld4_p16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vst1q_u8
   vst1q_u8(a, b);
-  // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vst1q_u16
   vst1q_u16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_u32
   vst1q_u32(a, b);
-  // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_u64
   vst1q_u64(a, b);
-  // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1q_s8(int8_t *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_s8
   vst1q_s8(a, b);
-  // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_s16(int16_t *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_s16
   vst1q_s16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_s32(int32_t *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_s32
   vst1q_s32(a, b);
-  // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_s64(int64_t *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_s64
   vst1q_s64(a, b);
-  // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_f16(float16_t *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_f16
   vst1q_f16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   store <4 x float> [[TMP3]], <4 x float>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_f32(float32_t *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_f32
   vst1q_f32(a, b);
-  // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   store <2 x double> [[TMP3]], <2 x double>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_f64(float64_t *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_f64
   vst1q_f64(a, b);
-  // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_p8
   vst1q_p8(a, b);
-  // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_p16
   vst1q_p16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1_u8(uint8_t *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_u8
   vst1_u8(a, b);
-  // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_u16(uint16_t *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_u16
   vst1_u16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_u32(uint32_t *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_u32
   vst1_u32(a, b);
-  // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_u64(uint64_t *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_u64
   vst1_u64(a, b);
-  // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1_s8(int8_t *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_s8
   vst1_s8(a, b);
-  // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_s16(int16_t *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_s16
   vst1_s16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_s32(int32_t *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_s32
   vst1_s32(a, b);
-  // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_s64(int64_t *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_s64
   vst1_s64(a, b);
-  // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_f16(float16_t *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_f16
   vst1_f16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   store <2 x float> [[TMP3]], <2 x float>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_f32(float32_t *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_f32
   vst1_f32(a, b);
-  // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   store <1 x double> [[TMP3]], <1 x double>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_f64(float64_t *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_f64
   vst1_f64(a, b);
-  // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1_p8(poly8_t *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_p8
   vst1_p8(a, b);
-  // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_p16(poly16_t *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_p16
   vst1_p16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_u8
   vst2q_u8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_u16
   vst2q_u16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_u32
   vst2q_u32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_u64
   vst2q_u64(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_s8
   vst2q_s8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_s16
   vst2q_s16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_s32
   vst2q_s32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_s64
   vst2q_s64(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_f16
   vst2q_f16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_f32
   vst2q_f32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_f64(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_f64
   vst2q_f64(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_p8
   vst2q_p8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2q_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_p16
   vst2q_p16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_u8
   vst2_u8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_u16
   vst2_u16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_u32
   vst2_u32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_u64
   vst2_u64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2_s8(int8_t *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_s8
   vst2_s8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_s16(int16_t *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_s16
   vst2_s16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_s32(int32_t *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_s32
   vst2_s32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_s64(int64_t *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_s64
   vst2_s64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_f16(float16_t *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_f16
   vst2_f16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_f32(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_f32
   vst2_f32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_f64(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_f64(float64_t *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_f64
   vst2_f64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_p8
   vst2_p8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst2_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_p16
   vst2_p16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_u8
   vst3q_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_u16
   vst3q_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_u32
   vst3q_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_u64
   vst3q_u64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_s8
   vst3q_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_s16
   vst3q_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_s32
   vst3q_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_s64
   vst3q_s64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_f16
   vst3q_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_f32
   vst3q_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_f64(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_f64
   vst3q_f64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_p8
   vst3q_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3q_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_p16
   vst3q_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_u8
   vst3_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_u16
   vst3_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_u32
   vst3_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_u64
   vst3_u64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3_s8(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_s8
   vst3_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_s16(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_s16
   vst3_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_s32(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_s32
   vst3_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_s64(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_s64
   vst3_s64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_f16(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_f16
   vst3_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_f32(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_f32
   vst3_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_f64(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_f64(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_f64
   vst3_f64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_p8
   vst3_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst3_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_p16
   vst3_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_u8
   vst4q_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_u16
   vst4q_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_u32
   vst4q_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_u64
   vst4q_u64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_s8
   vst4q_s8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_s16
   vst4q_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_s32
   vst4q_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_s64
   vst4q_s64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_f16
   vst4q_f16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_f32
   vst4q_f32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_f64(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_f64
   vst4q_f64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_p8
   vst4q_p8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4q_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_p16
   vst4q_p16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_u8
   vst4_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_u16
   vst4_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_u32
   vst4_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_u64
   vst4_u64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4_s8(int8_t *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_s8
   vst4_s8(a, b);
-// CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_s16(int16_t *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_s16
   vst4_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_s32(int32_t *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_s32
   vst4_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_s64(int64_t *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_s64
   vst4_s64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_f16(float16_t *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_f16
   vst4_f16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_f32(float32_t *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_f32
   vst4_f32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_f64(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_f64(float64_t *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_f64
   vst4_f64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_p8
   vst4_p8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst4_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_p16
   vst4_p16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u8_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8_x2
   return vld1q_u8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16_x2
   return vld1q_u16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u32_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32_x2
   return vld1q_u32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64_x2
   return vld1q_u64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s8_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8_x2
   return vld1q_s8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16_x2
   return vld1q_s16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s32_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32_x2
   return vld1q_s32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64_x2
   return vld1q_s64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16_x2
   return vld1q_f16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f32_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32_x2
   return vld1q_f32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64_x2
   return vld1q_f64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p8_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8_x2
   return vld1q_p8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16_x2
   return vld1q_p16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1q_p64_x2
   return vld1q_p64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u8_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8_x2
   return vld1_u8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16_x2
   return vld1_u16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u32_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32_x2
   return vld1_u32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64_x2
   return vld1_u64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s8_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8_x2
   return vld1_s8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16_x2
   return vld1_s16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s32_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32_x2
   return vld1_s32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64_x2
   return vld1_s64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16_x2
   return vld1_f16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f32_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32_x2
   return vld1_f32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64_x2
   return vld1_f64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p8_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8_x2
   return vld1_p8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p16_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16_x2
   return vld1_p16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p64_x2(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1_p64_x2
   return vld1_p64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u8_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8_x3
   return vld1q_u8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16_x3
   return vld1q_u16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u32_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32_x3
   return vld1q_u32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64_x3
   return vld1q_u64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s8_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8_x3
   return vld1q_s8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16_x3
   return vld1q_s16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s32_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32_x3
   return vld1q_s32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64_x3
   return vld1q_s64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16_x3
   return vld1q_f16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f32_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32_x3
   return vld1q_f32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64_x3
   return vld1q_f64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p8_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8_x3
   return vld1q_p8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16_x3
   return vld1q_p16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1q_p64_x3
   return vld1q_p64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u8_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8_x3
   return vld1_u8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16_x3
   return vld1_u16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u32_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32_x3
   return vld1_u32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64_x3
   return vld1_u64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s8_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8_x3
   return vld1_s8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16_x3
   return vld1_s16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s32_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32_x3
   return vld1_s32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64_x3
   return vld1_s64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16_x3
   return vld1_f16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f32_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32_x3
   return vld1_f32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64_x3
   return vld1_f64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p8_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8_x3
   return vld1_p8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p16_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16_x3
   return vld1_p16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p64_x3(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1_p64_x3
   return vld1_p64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u8_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8_x4
   return vld1q_u8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16_x4
   return vld1q_u16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u32_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32_x4
   return vld1q_u32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_u64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64_x4
   return vld1q_u64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s8_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8_x4
   return vld1q_s8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16_x4
   return vld1q_s16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s32_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32_x4
   return vld1q_s32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_s64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64_x4
   return vld1q_s64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16_x4
   return vld1q_f16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f32_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32_x4
   return vld1q_f32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_f64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64_x4
   return vld1q_f64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p8_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8_x4
   return vld1q_p8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16_x4
   return vld1q_p16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1q_p64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1q_p64_x4
   return vld1q_p64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u8_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8_x4
   return vld1_u8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16_x4
   return vld1_u16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u32_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32_x4
   return vld1_u32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_u64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64_x4
   return vld1_u64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s8_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8_x4
   return vld1_s8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16_x4
   return vld1_s16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s32_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32_x4
   return vld1_s32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_s64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64_x4
   return vld1_s64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16_x4
   return vld1_f16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f32_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32_x4
   return vld1_f32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_f64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64_x4
   return vld1_f64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p8_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8_x4
   return vld1_p8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p16_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16_x4
   return vld1_p16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vld1_p64_x4(
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1_p64_x4
   return vld1_p64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u8_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst1q_u8_x2
   vst1q_u8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_u16_x2
   vst1q_u16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u32_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst1q_u32_x2
   vst1q_u32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_u64_x2
   vst1q_u64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s8_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst1q_s8_x2
   vst1q_s8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_s16_x2
   vst1q_s16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s32_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst1q_s32_x2
   vst1q_s32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_s64_x2
   vst1q_s64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_f16_x2
   vst1q_f16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f32_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst1q_f32_x2
   vst1q_f32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], double* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_f64_x2
   vst1q_f64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p8_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst1q_p8_x2
   vst1q_p8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_p16_x2
   vst1q_p16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_p64_x2
   vst1q_p64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u8_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst1_u8_x2
   vst1_u8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_u16_x2
   vst1_u16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u32_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst1_u32_x2
   vst1_u32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_u64_x2
   vst1_u64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s8_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst1_s8_x2
   vst1_s8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_s16_x2
   vst1_s16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s32_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst1_s32_x2
   vst1_s32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_s64_x2
   vst1_s64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_f16_x2
   vst1_f16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f32_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst1_f32_x2
   vst1_f32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], double* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_f64_x2
   vst1_f64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p8_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst1_p8_x2
   vst1_p8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p16_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_p16_x2
   vst1_p16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p64_x2(
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_p64_x2
   vst1_p64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u8_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst1q_u8_x3
   vst1q_u8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_u16_x3
   vst1q_u16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u32_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst1q_u32_x3
   vst1q_u32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_u64_x3
   vst1q_u64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s8_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst1q_s8_x3
   vst1q_s8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_s16_x3
   vst1q_s16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s32_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst1q_s32_x3
   vst1q_s32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_s64_x3
   vst1q_s64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_f16_x3
   vst1q_f16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f32_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst1q_f32_x3
   vst1q_f32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], double* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_f64_x3
   vst1q_f64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p8_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst1q_p8_x3
   vst1q_p8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_p16_x3
   vst1q_p16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_p64_x3
   vst1q_p64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u8_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst1_u8_x3
   vst1_u8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_u16_x3
   vst1_u16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u32_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst1_u32_x3
   vst1_u32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_u64_x3
   vst1_u64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s8_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst1_s8_x3
   vst1_s8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_s16_x3
   vst1_s16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s32_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst1_s32_x3
   vst1_s32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_s64_x3
   vst1_s64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_f16_x3
   vst1_f16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f32_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst1_f32_x3
   vst1_f32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], double* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_f64_x3
   vst1_f64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p8_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst1_p8_x3
   vst1_p8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p16_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_p16_x3
   vst1_p16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p64_x3(
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_p64_x3
   vst1_p64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u8_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst1q_u8_x4
   vst1q_u8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_u16_x4
   vst1q_u16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u32_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst1q_u32_x4
   vst1q_u32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_u64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_u64_x4
   vst1q_u64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s8_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst1q_s8_x4
   vst1q_s8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_s16_x4
   vst1q_s16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s32_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst1q_s32_x4
   vst1q_s32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_s64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_s64_x4
   vst1q_s64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_f16_x4
   vst1q_f16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f32_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst1q_f32_x4
   vst1q_f32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_f64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], double* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_f64_x4
   vst1q_f64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p8_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst1q_p8_x4
   vst1q_p8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_p16_x4
   vst1q_p16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1q_p64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_p64_x4
   vst1q_p64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u8_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst1_u8_x4
   vst1_u8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_u16_x4
   vst1_u16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u32_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst1_u32_x4
   vst1_u32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_u64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_u64_x4
   vst1_u64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s8_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst1_s8_x4
   vst1_s8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_s16_x4
   vst1_s16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s32_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst1_s32_x4
   vst1_s32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_s64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_s64_x4
   vst1_s64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_f16_x4
   vst1_f16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f32_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst1_f32_x4
   vst1_f32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_f64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], double* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_f64_x4
   vst1_f64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p8_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst1_p8_x4
   vst1_p8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p16_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_p16_x4
   vst1_p16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vst1_p64_x4(
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_p64_x4
   vst1_p64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: @test_vceqd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vceqd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vceqd_s64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vceqd_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vceqd_u64(
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vceqd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vceqd_u64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vceqd_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vceqzd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, 0
+// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQZ_I]]
 int64_t test_vceqzd_s64(int64_t a) {
-// CHECK-LABEL: test_vceqzd_s64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}}
   return (int64_t)vceqzd_s64(a);
 }
 
+// CHECK-LABEL: @test_vceqzd_u64(
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, 0
+// CHECK:   [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQZD_I]]
 int64_t test_vceqzd_u64(int64_t a) {
-// CHECK-LABEL: test_vceqzd_u64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}}
   return (int64_t)vceqzd_u64(a);
 }
 
+// CHECK-LABEL: @test_vcged_s64(
+// CHECK:   [[TMP0:%.*]] = icmp sge i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcged_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcged_s64
-// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcged_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vcged_u64(
+// CHECK:   [[TMP0:%.*]] = icmp uge i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcged_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcged_u64
-// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
-    return (uint64_t)vcged_u64(a, b);
+  return (uint64_t)vcged_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vcgezd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp sge i64 %a, 0
+// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGEZ_I]]
 int64_t test_vcgezd_s64(int64_t a) {
-// CHECK-LABEL: test_vcgezd_s64
-// CHECK: {{cmge d[0-9]+, d[0-9]+, #0x0|eor x0, x[0-9]+, x0, asr #63}}
   return (int64_t)vcgezd_s64(a);
 }
 
+// CHECK-LABEL: @test_vcgtd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp sgt i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcgtd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcgtd_s64
-// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcgtd_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vcgtd_u64(
+// CHECK:   [[TMP0:%.*]] = icmp ugt i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcgtd_u64
-// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (uint64_t)vcgtd_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vcgtzd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp sgt i64 %a, 0
+// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGTZ_I]]
 int64_t test_vcgtzd_s64(int64_t a) {
-// CHECK-LABEL: test_vcgtzd_s64
-// CHECK: {{cmgt d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}}
   return (int64_t)vcgtzd_s64(a);
 }
 
+// CHECK-LABEL: @test_vcled_s64(
+// CHECK:   [[TMP0:%.*]] = icmp sle i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcled_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcled_s64
-// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcled_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vcled_u64(
+// CHECK:   [[TMP0:%.*]] = icmp ule i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcled_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcled_u64
-// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (uint64_t)vcled_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vclezd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp sle i64 %a, 0
+// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLEZ_I]]
 int64_t test_vclezd_s64(int64_t a) {
-// CHECK-LABEL: test_vclezd_s64
-// CHECK: {{cmle d[0-9]+, d[0-9]+, #0x0|cmp x0, #1}}
   return (int64_t)vclezd_s64(a);
 }
 
+// CHECK-LABEL: @test_vcltd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp slt i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcltd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcltd_s64
-// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcltd_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vcltd_u64(
+// CHECK:   [[TMP0:%.*]] = icmp ult i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcltd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcltd_u64
-// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (uint64_t)vcltd_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vcltzd_s64(
+// CHECK:   [[TMP0:%.*]] = icmp slt i64 %a, 0
+// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLTZ_I]]
 int64_t test_vcltzd_s64(int64_t a) {
-// CHECK-LABEL: test_vcltzd_s64
-// CHECK: {{cmlt d[0-9]+, d[0-9]+, #0x0|asr x0, x0, #63}}
   return (int64_t)vcltzd_s64(a);
 }
 
+// CHECK-LABEL: @test_vtstd_s64(
+// CHECK:   [[TMP0:%.*]] = and i64 %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+// CHECK:   [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
+// CHECK:   ret i64 [[VTSTD_I]]
 int64_t test_vtstd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vtstd_s64
-// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}}
   return (int64_t)vtstd_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vtstd_u64(
+// CHECK:   [[TMP0:%.*]] = and i64 %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+// CHECK:   [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
+// CHECK:   ret i64 [[VTSTD_I]]
 uint64_t test_vtstd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vtstd_u64
-// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}}
   return (uint64_t)vtstd_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vabsd_s64(
+// CHECK:   [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) #4
+// CHECK:   ret i64 [[VABSD_S64_I]]
 int64_t test_vabsd_s64(int64_t a) {
-// CHECK-LABEL: test_vabsd_s64
-// CHECK: abs {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vabsd_s64(a);
 }
 
+// CHECK-LABEL: @test_vqabsb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqabsb_s8(int8_t a) {
-// CHECK-LABEL: test_vqabsb_s8
-// CHECK: sqabs {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (int8_t)vqabsb_s8(a);
 }
 
+// CHECK-LABEL: @test_vqabsh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqabsh_s16(int16_t a) {
-// CHECK-LABEL: test_vqabsh_s16
-// CHECK: sqabs {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int16_t)vqabsh_s16(a);
 }
 
+// CHECK-LABEL: @test_vqabss_s32(
+// CHECK:   [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) #4
+// CHECK:   ret i32 [[VQABSS_S32_I]]
 int32_t test_vqabss_s32(int32_t a) {
-// CHECK-LABEL: test_vqabss_s32
-// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
   return (int32_t)vqabss_s32(a);
 }
 
+// CHECK-LABEL: @test_vqabsd_s64(
+// CHECK:   [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) #4
+// CHECK:   ret i64 [[VQABSD_S64_I]]
 int64_t test_vqabsd_s64(int64_t a) {
-// CHECK-LABEL: test_vqabsd_s64
-// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vqabsd_s64(a);
 }
 
+// CHECK-LABEL: @test_vnegd_s64(
+// CHECK:   [[VNEGD_I:%.*]] = sub i64 0, %a
+// CHECK:   ret i64 [[VNEGD_I]]
 int64_t test_vnegd_s64(int64_t a) {
-// CHECK-LABEL: test_vnegd_s64
-// CHECK: neg {{[xd][0-9]+}}, {{[xd][0-9]+}}
   return (int64_t)vnegd_s64(a);
 }
 
+// CHECK-LABEL: @test_vqnegb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqnegb_s8(int8_t a) {
-// CHECK-LABEL: test_vqnegb_s8
-// CHECK: sqneg {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (int8_t)vqnegb_s8(a);
 }
 
+// CHECK-LABEL: @test_vqnegh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqnegh_s16(int16_t a) {
-// CHECK-LABEL: test_vqnegh_s16
-// CHECK: sqneg {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int16_t)vqnegh_s16(a);
 }
 
+// CHECK-LABEL: @test_vqnegs_s32(
+// CHECK:   [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) #4
+// CHECK:   ret i32 [[VQNEGS_S32_I]]
 int32_t test_vqnegs_s32(int32_t a) {
-// CHECK-LABEL: test_vqnegs_s32
-// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
   return (int32_t)vqnegs_s32(a);
 }
 
+// CHECK-LABEL: @test_vqnegd_s64(
+// CHECK:   [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) #4
+// CHECK:   ret i64 [[VQNEGD_S64_I]]
 int64_t test_vqnegd_s64(int64_t a) {
-// CHECK-LABEL: test_vqnegd_s64
-// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vqnegd_s64(a);
 }
 
+// CHECK-LABEL: @test_vuqaddb_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vuqaddb_s8(int8_t a, int8_t b) {
-// CHECK-LABEL: test_vuqaddb_s8
-// CHECK: suqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (int8_t)vuqaddb_s8(a, b);
 }
 
+// CHECK-LABEL: @test_vuqaddh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vuqaddh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vuqaddh_s16
-// CHECK: suqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int16_t)vuqaddh_s16(a, b);
 }
 
+// CHECK-LABEL: @test_vuqadds_s32(
+// CHECK:   [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VUQADDS_S32_I]]
 int32_t test_vuqadds_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vuqadds_s32
-// CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
   return (int32_t)vuqadds_s32(a, b);
 }
 
+// CHECK-LABEL: @test_vuqaddd_s64(
+// CHECK:   [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VUQADDD_S64_I]]
 int64_t test_vuqaddd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vuqaddd_s64
-// CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vuqaddd_s64(a, b);
 }
 
+// CHECK-LABEL: @test_vsqaddb_u8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vsqaddb_u8(uint8_t a, uint8_t b) {
-// CHECK-LABEL: test_vsqaddb_u8
-// CHECK: usqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (uint8_t)vsqaddb_u8(a, b);
 }
 
+// CHECK-LABEL: @test_vsqaddh_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vsqaddh_u16(uint16_t a, uint16_t b) {
-// CHECK-LABEL: test_vsqaddh_u16
-// CHECK: usqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (uint16_t)vsqaddh_u16(a, b);
 }
 
+// CHECK-LABEL: @test_vsqadds_u32(
+// CHECK:   [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VSQADDS_U32_I]]
 uint32_t test_vsqadds_u32(uint32_t a, uint32_t b) {
-// CHECK-LABEL: test_vsqadds_u32
-// CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vsqadds_u32(a, b);
 }
 
+// CHECK-LABEL: @test_vsqaddd_u64(
+// CHECK:   [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VSQADDD_U64_I]]
 uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsqaddd_u64
-// CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vsqaddd_u64(a, b);
 }
 
+// CHECK-LABEL: @test_vqdmlalh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
+// CHECK:   [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) #4
+// CHECK:   ret i32 [[VQDMLXL1_I]]
 int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) {
-
-// CHECK-ARM64-LABEL: test_vqdmlalh_s16
-// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}}
-// CHECK-ARM64: sqadd {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]]
   return (int32_t)vqdmlalh_s16(a, b, c);
 }
 
+// CHECK-LABEL: @test_vqdmlals_s32(
+// CHECK:   [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
+// CHECK:   ret i64 [[VQDMLXL1_I]]
 int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
-// CHECK-LABEL: test_vqdmlals_s32
-// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return (int64_t)vqdmlals_s32(a, b, c);
 }
 
+// CHECK-LABEL: @test_vqdmlslh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
+// CHECK:   [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) #4
+// CHECK:   ret i32 [[VQDMLXL1_I]]
 int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) {
-
-// CHECK-ARM64-LABEL: test_vqdmlslh_s16
-// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}}
-// CHECK-ARM64: sqsub {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]]
   return (int32_t)vqdmlslh_s16(a, b, c);
 }
 
+// CHECK-LABEL: @test_vqdmlsls_s32(
+// CHECK:   [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
+// CHECK:   ret i64 [[VQDMLXL1_I]]
 int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) {
-// CHECK-LABEL: test_vqdmlsls_s32
-// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return (int64_t)vqdmlsls_s32(a, b, c);
 }
 
+// CHECK-LABEL: @test_vqdmullh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK:   ret i32 [[TMP2]]
 int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqdmullh_s16
-// CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int32_t)vqdmullh_s16(a, b);
 }
 
+// CHECK-LABEL: @test_vqdmulls_s32(
+// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) #4
+// CHECK:   ret i64 [[VQDMULLS_S32_I]]
 int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqdmulls_s32
-// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return (int64_t)vqdmulls_s32(a, b);
 }
 
+// CHECK-LABEL: @test_vqmovunh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqmovunh_s16(int16_t a) {
-// CHECK-LABEL: test_vqmovunh_s16
-// CHECK: sqxtun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}
   return (int8_t)vqmovunh_s16(a);
 }
 
+// CHECK-LABEL: @test_vqmovuns_s32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqmovuns_s32(int32_t a) {
-// CHECK-LABEL: test_vqmovuns_s32
-// CHECK: sqxtun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}
   return (int16_t)vqmovuns_s32(a);
 }
 
+// CHECK-LABEL: @test_vqmovund_s64(
+// CHECK:   [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) #4
+// CHECK:   ret i32 [[VQMOVUND_S64_I]]
 int32_t test_vqmovund_s64(int64_t a) {
-// CHECK-LABEL: test_vqmovund_s64
-// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
   return (int32_t)vqmovund_s64(a);
 }
 
+// CHECK-LABEL: @test_vqmovnh_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqmovnh_s16(int16_t a) {
-// CHECK-LABEL: test_vqmovnh_s16
-// CHECK: sqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}
   return (int8_t)vqmovnh_s16(a);
 }
 
+// CHECK-LABEL: @test_vqmovns_s32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqmovns_s32(int32_t a) {
-// CHECK-LABEL: test_vqmovns_s32
-// CHECK: sqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}
   return (int16_t)vqmovns_s32(a);
 }
 
+// CHECK-LABEL: @test_vqmovnd_s64(
+// CHECK:   [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) #4
+// CHECK:   ret i32 [[VQMOVND_S64_I]]
 int32_t test_vqmovnd_s64(int64_t a) {
-// CHECK-LABEL: test_vqmovnd_s64
-// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
   return (int32_t)vqmovnd_s64(a);
 }
 
+// CHECK-LABEL: @test_vqmovnh_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqmovnh_u16(int16_t a) {
-// CHECK-LABEL: test_vqmovnh_u16
-// CHECK: uqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}
   return (int8_t)vqmovnh_u16(a);
 }
 
+// CHECK-LABEL: @test_vqmovns_u32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqmovns_u32(int32_t a) {
-// CHECK-LABEL: test_vqmovns_u32
-// CHECK: uqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}
   return (int16_t)vqmovns_u32(a);
 }
 
+// CHECK-LABEL: @test_vqmovnd_u64(
+// CHECK:   [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) #4
+// CHECK:   ret i32 [[VQMOVND_U64_I]]
 int32_t test_vqmovnd_u64(int64_t a) {
-// CHECK-LABEL: test_vqmovnd_u64
-// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
   return (int32_t)vqmovnd_u64(a);
 }
 
+// CHECK-LABEL: @test_vceqs_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp oeq float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vceqs_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vceqs_f32
-// CHECK: {{fcmeq s0, s0, s1|fcmp s0, s1}}
   return (uint32_t)vceqs_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vceqd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp oeq double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vceqd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vceqd_f64
-// CHECK: {{fcmeq d0, d0, d1|fcmp d0, d1}}
   return (uint64_t)vceqd_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vceqzs_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
+// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCEQZ_I]]
 uint32_t test_vceqzs_f32(float32_t a) {
-// CHECK-LABEL: test_vceqzs_f32
-// CHECK: {{fcmeq s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vceqzs_f32(a);
 }
 
+// CHECK-LABEL: @test_vceqzd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
+// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQZ_I]]
 uint64_t test_vceqzd_f64(float64_t a) {
-// CHECK-LABEL: test_vceqzd_f64
-// CHECK: {{fcmeq d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vceqzd_f64(a);
 }
 
+// CHECK-LABEL: @test_vcges_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp oge float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vcges_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcges_f32
-// CHECK: {{fcmge s0, s0, s1|fcmp s0, s1}}
   return (uint32_t)vcges_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcged_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp oge double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcged_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcged_f64
-// CHECK: {{fcmge d0, d0, d1|fcmp d0, d1}}
   return (uint64_t)vcged_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vcgezs_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00
+// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCGEZ_I]]
 uint32_t test_vcgezs_f32(float32_t a) {
-// CHECK-LABEL: test_vcgezs_f32
-// CHECK: {{fcmge s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vcgezs_f32(a);
 }
 
+// CHECK-LABEL: @test_vcgezd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00
+// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGEZ_I]]
 uint64_t test_vcgezd_f64(float64_t a) {
-// CHECK-LABEL: test_vcgezd_f64
-// CHECK: {{fcmge d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vcgezd_f64(a);
 }
 
+// CHECK-LABEL: @test_vcgts_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp ogt float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vcgts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcgts_f32
-// CHECK: {{fcmgt s0, s0, s1|fcmp s0, s1}}
   return (uint32_t)vcgts_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcgtd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp ogt double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcgtd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcgtd_f64
-// CHECK: {{fcmgt d0, d0, d1|fcmp d0, d1}}
   return (uint64_t)vcgtd_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vcgtzs_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00
+// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCGTZ_I]]
 uint32_t test_vcgtzs_f32(float32_t a) {
-// CHECK-LABEL: test_vcgtzs_f32
-// CHECK: {{fcmgt s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vcgtzs_f32(a);
 }
 
+// CHECK-LABEL: @test_vcgtzd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00
+// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGTZ_I]]
 uint64_t test_vcgtzd_f64(float64_t a) {
-// CHECK-LABEL: test_vcgtzd_f64
-// CHECK: {{fcmgt d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vcgtzd_f64(a);
 }
 
+// CHECK-LABEL: @test_vcles_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp ole float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vcles_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcles_f32
-// CHECK: {{fcmge s0, s1, s0|fcmp s0, s1}}
   return (uint32_t)vcles_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcled_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp ole double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcled_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcled_f64
-// CHECK: {{fcmge d0, d1, d0|fcmp d0, d1}}
   return (uint64_t)vcled_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vclezs_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
+// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCLEZ_I]]
 uint32_t test_vclezs_f32(float32_t a) {
-// CHECK-LABEL: test_vclezs_f32
-// CHECK: {{fcmle s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vclezs_f32(a);
 }
 
+// CHECK-LABEL: @test_vclezd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
+// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLEZ_I]]
 uint64_t test_vclezd_f64(float64_t a) {
-// CHECK-LABEL: test_vclezd_f64
-// CHECK: {{fcmle d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vclezd_f64(a);
 }
 
+// CHECK-LABEL: @test_vclts_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp olt float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vclts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vclts_f32
-// CHECK: {{fcmgt s0, s1, s0|fcmp s0, s1}}
   return (uint32_t)vclts_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcltd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp olt double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcltd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcltd_f64
-// CHECK: {{fcmgt d0, d1, d0|fcmp d0, d1}}
   return (uint64_t)vcltd_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vcltzs_f32(
+// CHECK:   [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
+// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCLTZ_I]]
 uint32_t test_vcltzs_f32(float32_t a) {
-// CHECK-LABEL: test_vcltzs_f32
-// CHECK: {{fcmlt s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vcltzs_f32(a);
 }
 
+// CHECK-LABEL: @test_vcltzd_f64(
+// CHECK:   [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
+// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLTZ_I]]
 uint64_t test_vcltzd_f64(float64_t a) {
-// CHECK-LABEL: test_vcltzd_f64
-// CHECK: {{fcmlt d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vcltzd_f64(a);
 }
 
+// CHECK-LABEL: @test_vcages_f32(
+// CHECK:   [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) #4
+// CHECK:   ret i32 [[VCAGES_F32_I]]
 uint32_t test_vcages_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcages_f32
-// CHECK: facge s0, s0, s1
   return (uint32_t)vcages_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcaged_f64(
+// CHECK:   [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) #4
+// CHECK:   ret i64 [[VCAGED_F64_I]]
 uint64_t test_vcaged_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaged_f64
-// CHECK: facge d0, d0, d1
   return (uint64_t)vcaged_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vcagts_f32(
+// CHECK:   [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) #4
+// CHECK:   ret i32 [[VCAGTS_F32_I]]
 uint32_t test_vcagts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcagts_f32
-// CHECK: facgt s0, s0, s1
   return (uint32_t)vcagts_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcagtd_f64(
+// CHECK:   [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) #4
+// CHECK:   ret i64 [[VCAGTD_F64_I]]
 uint64_t test_vcagtd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcagtd_f64
-// CHECK: facgt d0, d0, d1
   return (uint64_t)vcagtd_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vcales_f32(
+// CHECK:   [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a) #4
+// CHECK:   ret i32 [[VCALES_F32_I]]
 uint32_t test_vcales_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcales_f32
-// CHECK: facge s0, s1, s0
   return (uint32_t)vcales_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcaled_f64(
+// CHECK:   [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) #4
+// CHECK:   ret i64 [[VCALED_F64_I]]
 uint64_t test_vcaled_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaled_f64
-// CHECK: facge d0, d1, d0
   return (uint64_t)vcaled_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vcalts_f32(
+// CHECK:   [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) #4
+// CHECK:   ret i32 [[VCALTS_F32_I]]
 uint32_t test_vcalts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcalts_f32
-// CHECK: facgt s0, s1, s0
   return (uint32_t)vcalts_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vcaltd_f64(
+// CHECK:   [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) #4
+// CHECK:   ret i64 [[VCALTD_F64_I]]
 uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaltd_f64
-// CHECK: facgt d0, d1, d0
   return (uint64_t)vcaltd_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vshrd_n_s64(
+// CHECK:   [[SHRD_N:%.*]] = ashr i64 %a, 1
+// CHECK:   ret i64 [[SHRD_N]]
 int64_t test_vshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshrd_n_s64
-// CHECK: {{sshr d[0-9]+, d[0-9]+, #1|asr x0, x0, #1}}
   return (int64_t)vshrd_n_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vshr_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 int64x1_t test_vshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshr_n_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #1
   return vshr_n_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vshrd_n_u64(
+// CHECK:   ret i64 0
 uint64_t test_vshrd_n_u64(uint64_t a) {
-
-// CHECK-ARM64-LABEL: test_vshrd_n_u64
-// CHECK-ARM64: mov x0, xzr
   return (uint64_t)vshrd_n_u64(a, 64);
 }
 
+// CHECK-LABEL: @test_vshrd_n_u64_2(
+// CHECK:   ret i64 0
 uint64_t test_vshrd_n_u64_2() {
-
-// CHECK-ARM64-LABEL: test_vshrd_n_u64_2
-// CHECK-ARM64: mov x0, xzr
   uint64_t a = UINT64_C(0xf000000000000000);
   return vshrd_n_u64(a, 64);
 }
 
+// CHECK-LABEL: @test_vshr_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshr_n_u64
-// CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #1
   return vshr_n_u64(a, 1);
 }
 
+// CHECK-LABEL: @test_vrshrd_n_s64(
+// CHECK:   [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
+// CHECK:   ret i64 [[VRSHR_N]]
 int64_t test_vrshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vrshrd_n_s64
-// CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vrshrd_n_s64(a, 63);
 }
 
+// CHECK-LABEL: @test_vrshr_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 int64x1_t test_vrshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_s64
-// CHECK: srshr d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrshr_n_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vrshrd_n_u64(
+// CHECK:   [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
+// CHECK:   ret i64 [[VRSHR_N]]
 uint64_t test_vrshrd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vrshrd_n_u64
-// CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vrshrd_n_u64(a, 63);
 }
 
+// CHECK-LABEL: @test_vrshr_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_u64
-// CHECK: urshr d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrshr_n_u64(a, 1);
 }
 
+// CHECK-LABEL: @test_vsrad_n_s64(
+// CHECK:   [[SHRD_N:%.*]] = ashr i64 %b, 63
+// CHECK:   [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK:   ret i64 [[TMP0]]
 int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrad_n_s64
-// CHECK: {{ssra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, asr #63}}
   return (int64_t)vsrad_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vsra_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsra_n_s64
-// CHECK: ssra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsra_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vsrad_n_u64(
+// CHECK:   [[SHRD_N:%.*]] = lshr i64 %b, 63
+// CHECK:   [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK:   ret i64 [[TMP0]]
 uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsrad_n_u64
-// CHECK: {{usra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, lsr #63}}
   return (uint64_t)vsrad_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vsrad_n_u64_2(
+// CHECK:   ret i64 %a
 uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
-
-// CHECK-ARM64-LABEL: test_vsrad_n_u64_2
-// CHECK-ARM64-NOT: add
   return (uint64_t)vsrad_n_u64(a, b, 64);
 }
 
+// CHECK-LABEL: @test_vsra_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsra_n_u64
-// CHECK: usra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsra_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vrsrad_n_s64(
+// CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
+// CHECK:   [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK:   ret i64 [[TMP1]]
 int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vrsrad_n_s64
-// CHECK: {{srsra d[0-9]+, d[0-9]+, #63}}
   return (int64_t)vrsrad_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vrsra_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_s64
-// CHECK: srsra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrsra_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vrsrad_n_u64(
+// CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
+// CHECK:   [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK:   ret i64 [[TMP1]]
 uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vrsrad_n_u64
-// CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vrsrad_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vrsra_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_u64
-// CHECK: ursra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrsra_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vshld_n_s64(
+// CHECK:   [[SHLD_N:%.*]] = shl i64 %a, 1
+// CHECK:   ret i64 [[SHLD_N]]
 int64_t test_vshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshld_n_s64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #1|lsl x0, x0, #1}}
   return (int64_t)vshld_n_s64(a, 1);
 }
+
+// CHECK-LABEL: @test_vshl_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 int64x1_t test_vshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshl_n_s64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vshl_n_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vshld_n_u64(
+// CHECK:   [[SHLD_N:%.*]] = shl i64 %a, 63
+// CHECK:   ret i64 [[SHLD_N]]
 uint64_t test_vshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vshld_n_u64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #63|lsl x0, x0, #63}}
   return (uint64_t)vshld_n_u64(a, 63);
 }
 
+// CHECK-LABEL: @test_vshl_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshl_n_u64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vshl_n_u64(a, 1);
 }
 
+// CHECK-LABEL: @test_vqshlb_n_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshlb_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlb_n_s8
-// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
   return (int8_t)vqshlb_n_s8(a, 7);
 }
 
+// CHECK-LABEL: @test_vqshlh_n_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshlh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshlh_n_s16
-// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
   return (int16_t)vqshlh_n_s16(a, 15);
 }
 
+// CHECK-LABEL: @test_vqshls_n_s32(
+// CHECK:   [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
+// CHECK:   ret i32 [[VQSHLS_N_S32]]
 int32_t test_vqshls_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshls_n_s32
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
   return (int32_t)vqshls_n_s32(a, 31);
 }
 
+// CHECK-LABEL: @test_vqshld_n_s64(
+// CHECK:   [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
+// CHECK:   ret i64 [[VQSHL_N]]
 int64_t test_vqshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshld_n_s64
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vqshld_n_s64(a, 63);
 }
 
+// CHECK-LABEL: @test_vqshl_n_s8(
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 int8x8_t test_vqshl_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_s8
   return vqshl_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_s8(
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s8
   return vqshlq_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: @test_vqshl_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 int16x4_t test_vqshl_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_s16
   return vqshl_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s16
   return vqshlq_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: @test_vqshl_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 int32x2_t test_vqshl_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_s32
   return vqshl_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s32
   return vqshlq_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s64
   return vqshlq_n_s64(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
 }
 
+// CHECK-LABEL: @test_vqshl_n_u8(
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_u8
   return vqshl_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_u8(
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u8
   return vqshlq_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: @test_vqshl_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_u16
   return vqshl_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u16
   return vqshlq_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: @test_vqshl_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_u32
   return vqshl_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u32
   return vqshlq_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: @test_vqshlq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u64
   return vqshlq_n_u64(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
 }
 
+// CHECK-LABEL: @test_vqshl_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 int64x1_t test_vqshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: sqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vqshl_n_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vqshlb_n_u8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 uint8_t test_vqshlb_n_u8(uint8_t a) {
-// CHECK-LABEL: test_vqshlb_n_u8
-// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
   return (uint8_t)vqshlb_n_u8(a, 7);
 }
 
+// CHECK-LABEL: @test_vqshlh_n_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 uint16_t test_vqshlh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqshlh_n_u16
-// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
   return (uint16_t)vqshlh_n_u16(a, 15);
 }
 
+// CHECK-LABEL: @test_vqshls_n_u32(
+// CHECK:   [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
+// CHECK:   ret i32 [[VQSHLS_N_U32]]
 uint32_t test_vqshls_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqshls_n_u32
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
   return (uint32_t)vqshls_n_u32(a, 31);
 }
 
+// CHECK-LABEL: @test_vqshld_n_u64(
+// CHECK:   [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
+// CHECK:   ret i64 [[VQSHL_N]]
 uint64_t test_vqshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqshld_n_u64
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vqshld_n_u64(a, 63);
 }
 
+// CHECK-LABEL: @test_vqshl_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: uqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vqshl_n_u64(a, 1);
 }
 
+// CHECK-LABEL: @test_vqshlub_n_s8(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshlub_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlub_n_s8
-// CHECK: sqshlu {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
   return (int8_t)vqshlub_n_s8(a, 7);
 }
 
+// CHECK-LABEL: @test_vqshluh_n_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshluh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshluh_n_s16
-// CHECK: sqshlu {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
   return (int16_t)vqshluh_n_s16(a, 15);
 }
 
+// CHECK-LABEL: @test_vqshlus_n_s32(
+// CHECK:   [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
+// CHECK:   ret i32 [[VQSHLUS_N_S32]]
 int32_t test_vqshlus_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshlus_n_s32
-// CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
   return (int32_t)vqshlus_n_s32(a, 31);
 }
 
+// CHECK-LABEL: @test_vqshlud_n_s64(
+// CHECK:   [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
+// CHECK:   ret i64 [[VQSHLU_N]]
 int64_t test_vqshlud_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshlud_n_s64
-// CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vqshlud_n_s64(a, 63);
 }
 
+// CHECK-LABEL: @test_vqshlu_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHLU_N1]]
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: sqshlu d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vqshlu_n_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vsrid_n_s64(
+// CHECK:   [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK:   [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
+// CHECK:   ret i64 [[VSRID_N_S643]]
 int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrid_n_s64
-// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vsrid_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vsri_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSRI_N2]]
 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsri_n_s64
-// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsri_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vsrid_n_u64(
+// CHECK:   [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
+// CHECK:   [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64
+// CHECK:   ret i64 [[VSRID_N_U643]]
 uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsrid_n_u64
-// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vsrid_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vsri_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSRI_N2]]
 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsri_n_u64
-// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsri_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vslid_n_s64(
+// CHECK:   [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
+// CHECK:   [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
+// CHECK:   ret i64 [[VSLID_N_S643]]
 int64_t test_vslid_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vslid_n_s64
-// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vslid_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vsli_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsli_n_s64
-// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsli_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vslid_n_u64(
+// CHECK:   [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
+// CHECK:   [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
+// CHECK:   ret i64 [[VSLID_N_U643]]
 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vslid_n_u64
-// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vslid_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: @test_vsli_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsli_n_u64
-// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsli_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: @test_vqshrnh_n_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshrnh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshrnh_n_s16
-// CHECK: sqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqshrnh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: @test_vqshrns_n_s32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshrns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshrns_n_s32
-// CHECK: sqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqshrns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: @test_vqshrnd_n_s64(
+// CHECK:   [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQSHRND_N_S64]]
 int32_t test_vqshrnd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshrnd_n_s64
-// CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqshrnd_n_s64(a, 32);
 }
 
+// CHECK-LABEL: @test_vqshrnh_n_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 uint8_t test_vqshrnh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqshrnh_n_u16
-// CHECK: uqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (uint8_t)vqshrnh_n_u16(a, 8);
 }
 
+// CHECK-LABEL: @test_vqshrns_n_u32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 uint16_t test_vqshrns_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqshrns_n_u32
-// CHECK: uqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (uint16_t)vqshrns_n_u32(a, 16);
 }
 
+// CHECK-LABEL: @test_vqshrnd_n_u64(
+// CHECK:   [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQSHRND_N_U64]]
 uint32_t test_vqshrnd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqshrnd_n_u64
-// CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (uint32_t)vqshrnd_n_u64(a, 32);
 }
 
+// CHECK-LABEL: @test_vqrshrnh_n_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqrshrnh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqrshrnh_n_s16
-// CHECK: sqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqrshrnh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: @test_vqrshrns_n_s32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqrshrns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqrshrns_n_s32
-// CHECK: sqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqrshrns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: @test_vqrshrnd_n_s64(
+// CHECK:   [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQRSHRND_N_S64]]
 int32_t test_vqrshrnd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqrshrnd_n_s64
-// CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqrshrnd_n_s64(a, 32);
 }
 
+// CHECK-LABEL: @test_vqrshrnh_n_u16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 uint8_t test_vqrshrnh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqrshrnh_n_u16
-// CHECK: uqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (uint8_t)vqrshrnh_n_u16(a, 8);
 }
 
+// CHECK-LABEL: @test_vqrshrns_n_u32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 uint16_t test_vqrshrns_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqrshrns_n_u32
-// CHECK: uqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (uint16_t)vqrshrns_n_u32(a, 16);
 }
 
+// CHECK-LABEL: @test_vqrshrnd_n_u64(
+// CHECK:   [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQRSHRND_N_U64]]
 uint32_t test_vqrshrnd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqrshrnd_n_u64
-// CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (uint32_t)vqrshrnd_n_u64(a, 32);
 }
 
+// CHECK-LABEL: @test_vqshrunh_n_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshrunh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshrunh_n_s16
-// CHECK: sqshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqshrunh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: @test_vqshruns_n_s32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshruns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshruns_n_s32
-// CHECK: sqshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqshruns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: @test_vqshrund_n_s64(
+// CHECK:   [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQSHRUND_N_S64]]
 int32_t test_vqshrund_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshrund_n_s64
-// CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqshrund_n_s64(a, 32);
 }
 
+// CHECK-LABEL: @test_vqrshrunh_n_s16(
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqrshrunh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqrshrunh_n_s16
-// CHECK: sqrshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqrshrunh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: @test_vqrshruns_n_s32(
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqrshruns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqrshruns_n_s32
-// CHECK: sqrshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqrshruns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: @test_vqrshrund_n_s64(
+// CHECK:   [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQRSHRUND_N_S64]]
 int32_t test_vqrshrund_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqrshrund_n_s64
-// CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqrshrund_n_s64(a, 32);
 }
 
+// CHECK-LABEL: @test_vcvts_n_f32_s32(
+// CHECK:   [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1)
+// CHECK:   ret float [[VCVTS_N_F32_S32]]
 float32_t test_vcvts_n_f32_s32(int32_t a) {
-// CHECK-LABEL: test_vcvts_n_f32_s32
-// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
   return vcvts_n_f32_s32(a, 1);
 }
 
+// CHECK-LABEL: @test_vcvtd_n_f64_s64(
+// CHECK:   [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1)
+// CHECK:   ret double [[VCVTD_N_F64_S64]]
 float64_t test_vcvtd_n_f64_s64(int64_t a) {
-// CHECK-LABEL: test_vcvtd_n_f64_s64
-// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
   return vcvtd_n_f64_s64(a, 1);
 }
 
+// CHECK-LABEL: @test_vcvts_n_f32_u32(
+// CHECK:   [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32)
+// CHECK:   ret float [[VCVTS_N_F32_U32]]
 float32_t test_vcvts_n_f32_u32(uint32_t a) {
-// CHECK-LABEL: test_vcvts_n_f32_u32
-// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #32
   return vcvts_n_f32_u32(a, 32);
 }
 
+// CHECK-LABEL: @test_vcvtd_n_f64_u64(
+// CHECK:   [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64)
+// CHECK:   ret double [[VCVTD_N_F64_U64]]
 float64_t test_vcvtd_n_f64_u64(uint64_t a) {
-// CHECK-LABEL: test_vcvtd_n_f64_u64
-// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #64
   return vcvtd_n_f64_u64(a, 64);
 }
 
+// CHECK-LABEL: @test_vcvts_n_s32_f32(
+// CHECK:   [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1)
+// CHECK:   ret i32 [[VCVTS_N_S32_F32]]
 int32_t test_vcvts_n_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_n_s32_f32
-// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
   return (int32_t)vcvts_n_s32_f32(a, 1);
 }
 
+// CHECK-LABEL: @test_vcvtd_n_s64_f64(
+// CHECK:   [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1)
+// CHECK:   ret i64 [[VCVTD_N_S64_F64]]
 int64_t test_vcvtd_n_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_n_s64_f64
-// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
   return (int64_t)vcvtd_n_s64_f64(a, 1);
 }
 
+// CHECK-LABEL: @test_vcvts_n_u32_f32(
+// CHECK:   [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32)
+// CHECK:   ret i32 [[VCVTS_N_U32_F32]]
 uint32_t test_vcvts_n_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_n_u32_f32
-// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
   return (uint32_t)vcvts_n_u32_f32(a, 32);
 }
 
+// CHECK-LABEL: @test_vcvtd_n_u64_f64(
+// CHECK:   [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64)
+// CHECK:   ret i64 [[VCVTD_N_U64_F64]]
 uint64_t test_vcvtd_n_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_n_u64_f64
-// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
   return (uint64_t)vcvtd_n_u64_f64(a, 64);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   return vreinterpret_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   return vreinterpret_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   return vreinterpret_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_u8(
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   return vreinterpret_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   return vreinterpret_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   return vreinterpret_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   return vreinterpret_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   return vreinterpret_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   return vreinterpret_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f64(float64x1_t a) {
   return vreinterpret_s8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_p8(
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   return vreinterpret_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   return vreinterpret_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s8_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
   return vreinterpret_s8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
   return vreinterpret_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
   return vreinterpret_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
   return vreinterpret_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
   return vreinterpret_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_u16(
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
   return vreinterpret_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
   return vreinterpret_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
   return vreinterpret_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
   return vreinterpret_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
   return vreinterpret_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f64(float64x1_t a) {
   return vreinterpret_s16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
   return vreinterpret_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_p16(
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
   return vreinterpret_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) {
   return vreinterpret_s16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
   return vreinterpret_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
   return vreinterpret_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
   return vreinterpret_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
   return vreinterpret_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
   return vreinterpret_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_u32(
+// CHECK:   ret <2 x i32> %a
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
   return vreinterpret_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
   return vreinterpret_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
   return vreinterpret_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
   return vreinterpret_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f64(float64x1_t a) {
   return vreinterpret_s32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
   return vreinterpret_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
   return vreinterpret_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s32_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) {
   return vreinterpret_s32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
   return vreinterpret_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
   return vreinterpret_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
   return vreinterpret_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
   return vreinterpret_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
   return vreinterpret_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
   return vreinterpret_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_u64(
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
   return vreinterpret_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
   return vreinterpret_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
   return vreinterpret_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f64(float64x1_t a) {
   return vreinterpret_s64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
   return vreinterpret_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
   return vreinterpret_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_s64_p64(
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) {
   return vreinterpret_s64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_s8(
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
   return vreinterpret_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
   return vreinterpret_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
   return vreinterpret_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
   return vreinterpret_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
   return vreinterpret_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
   return vreinterpret_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
   return vreinterpret_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
   return vreinterpret_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
   return vreinterpret_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) {
   return vreinterpret_u8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_p8(
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
   return vreinterpret_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
   return vreinterpret_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u8_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) {
   return vreinterpret_u8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
   return vreinterpret_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_s16(
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
   return vreinterpret_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
   return vreinterpret_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
   return vreinterpret_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
   return vreinterpret_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
   return vreinterpret_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
   return vreinterpret_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
   return vreinterpret_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
   return vreinterpret_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) {
   return vreinterpret_u16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
   return vreinterpret_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_p16(
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
   return vreinterpret_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) {
   return vreinterpret_u16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
   return vreinterpret_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
   return vreinterpret_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_s32(
+// CHECK:   ret <2 x i32> %a
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
   return vreinterpret_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
   return vreinterpret_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
   return vreinterpret_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
   return vreinterpret_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
   return vreinterpret_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
   return vreinterpret_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
   return vreinterpret_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) {
   return vreinterpret_u32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
   return vreinterpret_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
   return vreinterpret_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u32_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) {
   return vreinterpret_u32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
   return vreinterpret_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
   return vreinterpret_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
   return vreinterpret_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_s64(
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
   return vreinterpret_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
   return vreinterpret_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
   return vreinterpret_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
   return vreinterpret_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
   return vreinterpret_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
   return vreinterpret_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
   return vreinterpret_u64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
   return vreinterpret_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
   return vreinterpret_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_u64_p64(
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) {
   return vreinterpret_u64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
   return vreinterpret_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
   return vreinterpret_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
   return vreinterpret_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
   return vreinterpret_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
   return vreinterpret_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
   return vreinterpret_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
   return vreinterpret_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
   return vreinterpret_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
   return vreinterpret_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
   return vreinterpret_f16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
   return vreinterpret_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
   return vreinterpret_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
   return vreinterpret_f16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
   return vreinterpret_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
   return vreinterpret_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
   return vreinterpret_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
   return vreinterpret_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
   return vreinterpret_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
   return vreinterpret_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
   return vreinterpret_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
   return vreinterpret_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
   return vreinterpret_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
   return vreinterpret_f32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
   return vreinterpret_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
   return vreinterpret_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f32_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
   return vreinterpret_f32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
   return vreinterpret_f64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
   return vreinterpret_f64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
   return vreinterpret_f64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
   return vreinterpret_f64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
   return vreinterpret_f64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
   return vreinterpret_f64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
   return vreinterpret_f64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
   return vreinterpret_f64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
   return vreinterpret_f64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
   return vreinterpret_f64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
   return vreinterpret_f64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
   return vreinterpret_f64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_f64_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) {
   return vreinterpret_f64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_s8(
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
   return vreinterpret_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
   return vreinterpret_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
   return vreinterpret_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
   return vreinterpret_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_u8(
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
   return vreinterpret_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
   return vreinterpret_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
   return vreinterpret_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
   return vreinterpret_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
   return vreinterpret_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
   return vreinterpret_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
   return vreinterpret_p8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
   return vreinterpret_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p8_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
   return vreinterpret_p8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
   return vreinterpret_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_s16(
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
   return vreinterpret_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
   return vreinterpret_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
   return vreinterpret_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
   return vreinterpret_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_u16(
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
   return vreinterpret_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
   return vreinterpret_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
   return vreinterpret_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
   return vreinterpret_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
   return vreinterpret_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
   return vreinterpret_p16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
   return vreinterpret_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
   return vreinterpret_p16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
   return vreinterpret_p64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
   return vreinterpret_p64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) {
   return vreinterpret_p64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_s64(
+// CHECK:   ret <1 x i64> %a
 poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) {
   return vreinterpret_p64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
   return vreinterpret_p64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) {
   return vreinterpret_p64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) {
   return vreinterpret_p64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_u64(
+// CHECK:   ret <1 x i64> %a
 poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) {
   return vreinterpret_p64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) {
   return vreinterpret_p64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) {
   return vreinterpret_p64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) {
   return vreinterpret_p64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) {
   return vreinterpret_p64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpret_p64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) {
   return vreinterpret_p64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
   return vreinterpretq_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
   return vreinterpretq_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
   return vreinterpretq_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_u8(
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
   return vreinterpretq_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
   return vreinterpretq_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
   return vreinterpretq_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
   return vreinterpretq_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
   return vreinterpretq_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
   return vreinterpretq_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) {
   return vreinterpretq_s8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_p8(
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
   return vreinterpretq_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
   return vreinterpretq_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s8_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) {
   return vreinterpretq_s8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
   return vreinterpretq_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
   return vreinterpretq_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
   return vreinterpretq_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
   return vreinterpretq_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_u16(
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
   return vreinterpretq_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
   return vreinterpretq_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
   return vreinterpretq_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
   return vreinterpretq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
   return vreinterpretq_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) {
   return vreinterpretq_s16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
   return vreinterpretq_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_p16(
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
   return vreinterpretq_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) {
   return vreinterpretq_s16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
   return vreinterpretq_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
   return vreinterpretq_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
   return vreinterpretq_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
   return vreinterpretq_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
   return vreinterpretq_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_u32(
+// CHECK:   ret <4 x i32> %a
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
   return vreinterpretq_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
   return vreinterpretq_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
   return vreinterpretq_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
   return vreinterpretq_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) {
   return vreinterpretq_s32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
   return vreinterpretq_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
   return vreinterpretq_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s32_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) {
   return vreinterpretq_s32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
   return vreinterpretq_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
   return vreinterpretq_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
   return vreinterpretq_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
   return vreinterpretq_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
   return vreinterpretq_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
   return vreinterpretq_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_u64(
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
   return vreinterpretq_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
   return vreinterpretq_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
   return vreinterpretq_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) {
   return vreinterpretq_s64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
   return vreinterpretq_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
   return vreinterpretq_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_s64_p64(
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) {
   return vreinterpretq_s64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_s8(
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   return vreinterpretq_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   return vreinterpretq_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   return vreinterpretq_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   return vreinterpretq_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   return vreinterpretq_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   return vreinterpretq_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   return vreinterpretq_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   return vreinterpretq_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   return vreinterpretq_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) {
   return vreinterpretq_u8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_p8(
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   return vreinterpretq_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   return vreinterpretq_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u8_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
   return vreinterpretq_u8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   return vreinterpretq_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_s16(
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   return vreinterpretq_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   return vreinterpretq_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   return vreinterpretq_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   return vreinterpretq_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   return vreinterpretq_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   return vreinterpretq_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   return vreinterpretq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   return vreinterpretq_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
   return vreinterpretq_u16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   return vreinterpretq_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_p16(
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   return vreinterpretq_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
   return vreinterpretq_u16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   return vreinterpretq_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   return vreinterpretq_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_s32(
+// CHECK:   ret <4 x i32> %a
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   return vreinterpretq_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   return vreinterpretq_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   return vreinterpretq_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   return vreinterpretq_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   return vreinterpretq_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   return vreinterpretq_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   return vreinterpretq_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
   return vreinterpretq_u32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   return vreinterpretq_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   return vreinterpretq_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u32_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
   return vreinterpretq_u32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   return vreinterpretq_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   return vreinterpretq_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   return vreinterpretq_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_s64(
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   return vreinterpretq_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   return vreinterpretq_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   return vreinterpretq_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   return vreinterpretq_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   return vreinterpretq_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   return vreinterpretq_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
   return vreinterpretq_u64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   return vreinterpretq_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   return vreinterpretq_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_u64_p64(
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) {
   return vreinterpretq_u64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   return vreinterpretq_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   return vreinterpretq_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   return vreinterpretq_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   return vreinterpretq_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   return vreinterpretq_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   return vreinterpretq_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   return vreinterpretq_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   return vreinterpretq_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   return vreinterpretq_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
   return vreinterpretq_f16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   return vreinterpretq_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   return vreinterpretq_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
   return vreinterpretq_f16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   return vreinterpretq_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   return vreinterpretq_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   return vreinterpretq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   return vreinterpretq_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   return vreinterpretq_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   return vreinterpretq_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   return vreinterpretq_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   return vreinterpretq_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   return vreinterpretq_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
   return vreinterpretq_f32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   return vreinterpretq_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   return vreinterpretq_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f32_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
   return vreinterpretq_f32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
   return vreinterpretq_f64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
   return vreinterpretq_f64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
   return vreinterpretq_f64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
   return vreinterpretq_f64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
   return vreinterpretq_f64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
   return vreinterpretq_f64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
   return vreinterpretq_f64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
   return vreinterpretq_f64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
   return vreinterpretq_f64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
   return vreinterpretq_f64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
   return vreinterpretq_f64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
   return vreinterpretq_f64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_f64_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) {
   return vreinterpretq_f64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_s8(
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   return vreinterpretq_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   return vreinterpretq_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   return vreinterpretq_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   return vreinterpretq_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_u8(
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   return vreinterpretq_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   return vreinterpretq_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   return vreinterpretq_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   return vreinterpretq_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   return vreinterpretq_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   return vreinterpretq_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
   return vreinterpretq_p8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   return vreinterpretq_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p8_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
   return vreinterpretq_p8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   return vreinterpretq_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_s16(
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   return vreinterpretq_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   return vreinterpretq_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   return vreinterpretq_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   return vreinterpretq_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_u16(
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   return vreinterpretq_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   return vreinterpretq_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   return vreinterpretq_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   return vreinterpretq_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   return vreinterpretq_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
   return vreinterpretq_p16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   return vreinterpretq_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p16_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
   return vreinterpretq_p16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
   return vreinterpretq_p64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
   return vreinterpretq_p64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
   return vreinterpretq_p64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_s64(
+// CHECK:   ret <2 x i64> %a
 poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
   return vreinterpretq_p64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
   return vreinterpretq_p64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
   return vreinterpretq_p64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
   return vreinterpretq_p64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_u64(
+// CHECK:   ret <2 x i64> %a
 poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
   return vreinterpretq_p64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
   return vreinterpretq_p64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
   return vreinterpretq_p64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
   return vreinterpretq_p64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
   return vreinterpretq_p64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: @test_vreinterpretq_p64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
   return vreinterpretq_p64_p16(a);
 }
 
+// CHECK-LABEL: @test_vabds_f32(
+// CHECK:   [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) #4
+// CHECK:   ret float [[VABDS_F32_I]]
 float32_t test_vabds_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vabds_f32
-// CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return vabds_f32(a, b);
 }
 
+// CHECK-LABEL: @test_vabdd_f64(
+// CHECK:   [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) #4
+// CHECK:   ret double [[VABDD_F64_I]]
 float64_t test_vabdd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vabdd_f64
-// CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   return vabdd_f64(a, b);
 }
 
+// CHECK-LABEL: @test_vuqadd_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   ret <1 x i64> [[VUQADD2_I]]
 int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vuqadd_s64
   return vuqadd_s64(a, b);
-  // CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsqadd_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   ret <1 x i64> [[VSQADD2_I]]
 uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vsqadd_u64
   return vsqadd_u64(a, b);
-  // CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsqadd_u8(
+// CHECK:   [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSQADD_I]]
 uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsqadd_u8
   return vsqadd_u8(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vsqaddq_u8(
+// CHECK:   [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSQADD_I]]
 uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsqaddq_u8
   return vsqaddq_u8(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vsqadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i16> [[VSQADD2_I]]
 uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsqadd_u16
   return vsqadd_u16(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vsqaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i16> [[VSQADD2_I]]
 uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsqaddq_u16
   return vsqaddq_u16(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vsqadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i32> [[VSQADD2_I]]
 uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsqadd_u32
   return vsqadd_u32(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsqaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   ret <4 x i32> [[VSQADD2_I]]
 uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsqaddq_u32
   return vsqaddq_u32(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsqaddq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   ret <2 x i64> [[VSQADD2_I]]
 uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsqaddq_u64
   return vsqaddq_u64(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vabs_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %a) #4
+// CHECK:   ret <1 x i64> [[VABS1_I]]
 int64x1_t test_vabs_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vabs_s64
   return vabs_s64(a);
-  // CHECK: abs d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqabs_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> %a) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQABS_V1_I]]
 int64x1_t test_vqabs_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vqabs_s64
   return vqabs_s64(a);
-  // CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vqneg_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> %a) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQNEG_V1_I]]
 int64x1_t test_vqneg_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vqneg_s64
   return vqneg_s64(a);
-  // CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vneg_s64(
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a
+// CHECK:   ret <1 x i64> [[SUB_I]]
 int64x1_t test_vneg_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vneg_s64
   return vneg_s64(a);
-  // CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vaddv_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VADDV_F32_I]]
 float32_t test_vaddv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vaddv_f32
   return vaddv_f32(a);
-  // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddvq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a) #4
+// CHECK:   ret float [[VADDVQ_F32_I]]
 float32_t test_vaddvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_f32
   return vaddvq_f32(a);
-  // CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddvq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VADDVQ_F64_I]]
 float64_t test_vaddvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vaddvq_f64
   return vaddvq_f64(a);
-  // CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmaxv_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VMAXV_F32_I]]
 float32_t test_vmaxv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vmaxv_f32
   return vmaxv_f32(a);
-  // CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmaxvq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VMAXVQ_F64_I]]
 float64_t test_vmaxvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vmaxvq_f64
   return vmaxvq_f64(a);
-  // CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vminv_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VMINV_F32_I]]
 float32_t test_vminv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vminv_f32
   return vminv_f32(a);
-  // CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vminvq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VMINVQ_F64_I]]
 float64_t test_vminvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vminvq_f64
   return vminvq_f64(a);
-  // CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmaxnmvq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VMAXNMVQ_F64_I]]
 float64_t test_vmaxnmvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f64
   return vmaxnmvq_f64(a);
-  // CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmaxnmv_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VMAXNMV_F32_I]]
 float32_t test_vmaxnmv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vmaxnmv_f32
   return vmaxnmv_f32(a);
-  // CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vminnmvq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a) #4
+// CHECK:   ret double [[VMINNMVQ_F64_I]]
 float64_t test_vminnmvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vminnmvq_f64
   return vminnmvq_f64(a);
-  // CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vminnmv_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a) #4
+// CHECK:   ret float [[VMINNMV_F32_I]]
 float32_t test_vminnmv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vminnmv_f32
   return vminnmv_f32(a);
-  // CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpaddq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VPADDQ_V2_I]]
 int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vpaddq_s64
   return vpaddq_s64(a, b);
-  // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpaddq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VPADDQ_V2_I]]
 uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vpaddq_u64
   return vpaddq_u64(a, b);
-  // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vpaddd_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) #4
+// CHECK:   ret i64 [[VPADDD_U64_I]]
 uint64_t test_vpaddd_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vpaddd_u64
   return vpaddd_u64(a);
-  // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddvq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a) #4
+// CHECK:   ret i64 [[VADDVQ_S64_I]]
 int64_t test_vaddvq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vaddvq_s64
   return vaddvq_s64(a);
-  // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vaddvq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) #4
+// CHECK:   ret i64 [[VADDVQ_U64_I]]
 uint64_t test_vaddvq_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vaddvq_u64
   return vaddvq_u64(a);
-  // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vadd_f64(
+// CHECK:   [[ADD_I:%.*]] = fadd <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[ADD_I]]
 float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vadd_f64
   return vadd_f64(a, b);
-  // CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmul_f64(
+// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[MUL_I]]
 float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vmul_f64
   return vmul_f64(a, b);
-  // CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vdiv_f64(
+// CHECK:   [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[DIV_I]]
 float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vdiv_f64
   return vdiv_f64(a, b);
-  // CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmla_f64(
+// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
+// CHECK:   ret <1 x double> [[ADD_I]]
 float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vmla_f64
   return vmla_f64(a, b, c);
-  // CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  // CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmls_f64(
+// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
+// CHECK:   ret <1 x double> [[SUB_I]]
 float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vmls_f64
   return vmls_f64(a, b, c);
-  // CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  // CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vfma_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a) #4
+// CHECK:   ret <1 x double> [[TMP3]]
 float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vfma_f64
   return vfma_f64(a, b, c);
-  // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vfms_f64(
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a) #4
+// CHECK:   ret <1 x double> [[TMP3]]
 float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vfms_f64
   return vfms_f64(a, b, c);
-  // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsub_f64(
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[SUB_I]]
 float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vsub_f64
   return vsub_f64(a, b);
-  // CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vabd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VABD2_I]]
 float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vabd_f64
   return vabd_f64(a, b);
-  // CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmax_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VMAX2_I]]
 float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmax_f64
   return vmax_f64(a, b);
-// CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmin_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VMIN2_I]]
 float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmin_f64
   return vmin_f64(a, b);
-// CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vmaxnm_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VMAXNM2_I]]
 float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmaxnm_f64
   return vmaxnm_f64(a, b);
-// CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vminnm_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VMINNM2_I]]
 float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vminnm_f64
   return vminnm_f64(a, b);
-// CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vabs_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VABS1_I]]
 float64x1_t test_vabs_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vabs_f64
   return vabs_f64(a);
-  // CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vneg_f64(
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %a
+// CHECK:   ret <1 x double> [[SUB_I]]
 float64x1_t test_vneg_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vneg_f64
   return vneg_f64(a);
-  // CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvt_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fptosi <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP1]]
 int64x1_t test_vcvt_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_s64_f64
   return vcvt_s64_f64(a);
-  // CHECK: fcvtzs {{[xd][0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvt_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fptoui <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP1]]
 uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_u64_f64
   return vcvt_u64_f64(a);
-  // CHECK: fcvtzu {{[xd][0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvtn_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTN1_I]]
 int64x1_t test_vcvtn_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtn_s64_f64
   return vcvtn_s64_f64(a);
-  // CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvtn_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTN1_I]]
 uint64x1_t test_vcvtn_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtn_u64_f64
   return vcvtn_u64_f64(a);
-  // CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvtp_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTP1_I]]
 int64x1_t test_vcvtp_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtp_s64_f64
   return vcvtp_s64_f64(a);
-  // CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvtp_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTP1_I]]
 uint64x1_t test_vcvtp_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtp_u64_f64
   return vcvtp_u64_f64(a);
-  // CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvtm_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTM1_I]]
 int64x1_t test_vcvtm_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtm_s64_f64
   return vcvtm_s64_f64(a);
-  // CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvtm_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTM1_I]]
 uint64x1_t test_vcvtm_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtm_u64_f64
   return vcvtm_u64_f64(a);
-  // CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvta_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTA1_I]]
 int64x1_t test_vcvta_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvta_s64_f64
   return vcvta_s64_f64(a);
-  // CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvta_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x i64> [[VCVTA1_I]]
 uint64x1_t test_vcvta_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvta_u64_f64
   return vcvta_u64_f64(a);
-  // CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvt_f64_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[VCVT_I]]
 float64x1_t test_vcvt_f64_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vcvt_f64_s64
   return vcvt_f64_s64(a);
-  // CHECK: scvtf d{{[0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvt_f64_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[VCVT_I]]
 float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
-  // CHECK-LABEL: test_vcvt_f64_u64
   return vcvt_f64_u64(a);
-  // CHECK: ucvtf d{{[0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: @test_vcvt_n_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x i64> [[VCVT_N1]]
 int64x1_t test_vcvt_n_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_s64_f64
   return vcvt_n_s64_f64(a, 64);
-  // CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: @test_vcvt_n_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x i64> [[VCVT_N1]]
 uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_u64_f64
   return vcvt_n_u64_f64(a, 64);
-  // CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: @test_vcvt_n_f64_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x double> [[VCVT_N1]]
 float64x1_t test_vcvt_n_f64_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_f64_s64
   return vcvt_n_f64_s64(a, 64);
-  // CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: @test_vcvt_n_f64_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x double> [[VCVT_N1]]
 float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_f64_u64
   return vcvt_n_f64_u64(a, 64);
-  // CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: @test_vrndn_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDN1_I]]
 float64x1_t test_vrndn_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndn_f64
   return vrndn_f64(a);
-  // CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrnda_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDA1_I]]
 float64x1_t test_vrnda_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrnda_f64
   return vrnda_f64(a);
-  // CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrndp_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDP1_I]]
 float64x1_t test_vrndp_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndp_f64
   return vrndp_f64(a);
-  // CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrndm_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDM1_I]]
 float64x1_t test_vrndm_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndm_f64
   return vrndm_f64(a);
-  // CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrndx_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDX1_I]]
 float64x1_t test_vrndx_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndx_f64
   return vrndx_f64(a);
-  // CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrnd_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDZ1_I]]
 float64x1_t test_vrnd_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrnd_f64
   return vrnd_f64(a);
-  // CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrndi_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRNDI1_I]]
 float64x1_t test_vrndi_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndi_f64
   return vrndi_f64(a);
-  // CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrsqrte_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRSQRTE_V1_I]]
 float64x1_t test_vrsqrte_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrsqrte_f64
   return vrsqrte_f64(a);
-  // CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrecpe_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VRECPE_V1_I]]
 float64x1_t test_vrecpe_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrecpe_f64
   return vrecpe_f64(a);
-  // CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vsqrt_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a) #4
+// CHECK:   ret <1 x double> [[VSQRT_I]]
 float64x1_t test_vsqrt_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vsqrt_f64
   return vsqrt_f64(a);
-  // CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrecps_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   ret <1 x double> [[VRECPS_V2_I]]
 float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vrecps_f64
   return vrecps_f64(a, b);
-  // CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vrsqrts_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> %a, <1 x double> %b) #4
+// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x double> [[VRSQRTS_V2_I]]
 float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vrsqrts_f64
   return vrsqrts_f64(a, b);
-  // CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: @test_vminv_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i32 [[VMINV_S32_I]]
 int32_t test_vminv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vminv_s32
   return vminv_s32(a);
-  // CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vminv_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i32 [[VMINV_U32_I]]
 uint32_t test_vminv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vminv_u32
   return vminv_u32(a);
-  // CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmaxv_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i32 [[VMAXV_S32_I]]
 int32_t test_vmaxv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmaxv_s32
   return vmaxv_s32(a);
-  // CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vmaxv_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i32 [[VMAXV_U32_I]]
 uint32_t test_vmaxv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmaxv_u32
   return vmaxv_u32(a);
-  // CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddv_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i32 [[VADDV_S32_I]]
 int32_t test_vaddv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vaddv_s32
   return vaddv_s32(a);
-  // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddv_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i32 [[VADDV_U32_I]]
 uint32_t test_vaddv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vaddv_u32
   return vaddv_u32(a);
-  // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddlv_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i64 [[VADDLV_S32_I]]
 int64_t test_vaddlv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vaddlv_s32
   return vaddlv_s32(a);
-  // CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vaddlv_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a) #4
+// CHECK:   ret i64 [[VADDLV_U32_I]]
 uint64_t test_vaddlv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vaddlv_u32
   return vaddlv_u32(a);
-  // CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
 }
diff --git a/test/CodeGen/aarch64-neon-ldst-one.c b/test/CodeGen/aarch64-neon-ldst-one.c
index dc888c2..25bd797 100644
--- a/test/CodeGen/aarch64-neon-ldst-one.c
+++ b/test/CodeGen/aarch64-neon-ldst-one.c
@@ -1,2049 +1,7977 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 uint8x16_t test_vld1q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u8
   return vld1q_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u16
   return vld1q_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u32
   return vld1q_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u64
   return vld1q_dup_u64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s8
   return vld1q_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s16
   return vld1q_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s32
   return vld1q_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s64
   return vld1q_dup_s64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f16
   return vld1q_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f32
   return vld1q_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x double> [[LANE]]
 float64x2_t test_vld1q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f64
   return vld1q_dup_f64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p8
   return vld1q_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p16
   return vld1q_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 poly64x2_t test_vld1q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p64
   return vld1q_dup_p64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u8
   return vld1_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u16
   return vld1_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u32
   return vld1_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u64
   return vld1_dup_u64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s8
   return vld1_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s16
   return vld1_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s32
   return vld1_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s64
   return vld1_dup_s64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f16
   return vld1_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f32
   return vld1_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_dup_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[LANE]]
 float64x1_t test_vld1_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f64
   return vld1_dup_f64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p8
   return vld1_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p16
   return vld1_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_p64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 poly64x1_t test_vld1_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p64
   return vld1_dup_p64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld2q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u8
   return vld2q_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u16
   return vld2q_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u32
   return vld2q_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u64
   return vld2q_dup_u64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld2q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s8
   return vld2q_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s16
   return vld2q_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s32
   return vld2q_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s64
   return vld2q_dup_s64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f16
   return vld2q_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f32
   return vld2q_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f64
   return vld2q_dup_f64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld2q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p8
   return vld2q_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p16
   return vld2q_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p64
   return vld2q_dup_p64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u8
   return vld2_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u16
   return vld2_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u32
   return vld2_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u64
   return vld2_dup_u64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s8
   return vld2_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s16
   return vld2_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s32
   return vld2_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s64
   return vld2_dup_s64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f16
   return vld2_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f32
   return vld2_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f64
   return vld2_dup_f64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p8
   return vld2_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p16
   return vld2_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p64
   return vld2_dup_p64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld3q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u8
   return vld3q_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u16
   return vld3q_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u32
   return vld3q_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u64
   return vld3q_dup_u64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld3q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s8
   return vld3q_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s16
   return vld3q_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s32
   return vld3q_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s64
   return vld3q_dup_s64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f16
   return vld3q_dup_f16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f32
   return vld3q_dup_f32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f64
   return vld3q_dup_f64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld3q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p8
   return vld3q_dup_p8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p16
   return vld3q_dup_p16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p64
   return vld3q_dup_p64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld3_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u8
   return vld3_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u16
   return vld3_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u32
   return vld3_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u64
   return vld3_dup_u64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld3_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s8
   return vld3_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s16
   return vld3_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s32
   return vld3_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s64
   return vld3_dup_s64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f16
   return vld3_dup_f16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f32
   return vld3_dup_f32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f64
   return vld3_dup_f64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld3_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p8
   return vld3_dup_p8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p16
   return vld3_dup_p16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p64
   return vld3_dup_p64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld4q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u8
   return vld4q_dup_u8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u16
   return vld4q_dup_u16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u32
   return vld4q_dup_u32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u64
   return vld4q_dup_u64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld4q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s8
   return vld4q_dup_s8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s16
   return vld4q_dup_s16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s32
   return vld4q_dup_s32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s64
   return vld4q_dup_s64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f16
   return vld4q_dup_f16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f32
   return vld4q_dup_f32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f64
   return vld4q_dup_f64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld4q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p8
   return vld4q_dup_p8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p16
   return vld4q_dup_p16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p64
   return vld4q_dup_p64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld4_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u8
   return vld4_dup_u8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u16
   return vld4_dup_u16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u32
   return vld4_dup_u32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u64
   return vld4_dup_u64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld4_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s8
   return vld4_dup_s8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s16
   return vld4_dup_s16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s32
   return vld4_dup_s32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s64
   return vld4_dup_s64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f16
   return vld4_dup_f16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f32
   return vld4_dup_f32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f64
   return vld4_dup_f64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld4_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p8
   return vld4_dup_p8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p16
   return vld4_dup_p16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p64
   return vld4_dup_p64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 uint8x16_t test_vld1q_lane_u8(uint8_t  *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u8
   return vld1q_lane_u8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 uint16x8_t test_vld1q_lane_u16(uint16_t  *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u16
   return vld1q_lane_u16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 uint32x4_t test_vld1q_lane_u32(uint32_t  *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u32
   return vld1q_lane_u32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 uint64x2_t test_vld1q_lane_u64(uint64_t  *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u64
   return vld1q_lane_u64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 int8x16_t test_vld1q_lane_s8(int8_t  *a, int8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s8
   return vld1q_lane_s8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 int16x8_t test_vld1q_lane_s16(int16_t  *a, int16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s16
   return vld1q_lane_s16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 int32x4_t test_vld1q_lane_s32(int32_t  *a, int32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s32
   return vld1q_lane_s32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 int64x2_t test_vld1q_lane_s64(int64_t  *a, int64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s64
   return vld1q_lane_s64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t  *a, float16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f16
   return vld1q_lane_f16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK:   ret <4 x float> [[VLD1_LANE]]
 float32x4_t test_vld1q_lane_f32(float32_t  *a, float32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f32
   return vld1q_lane_f32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
+// CHECK:   ret <2 x double> [[VLD1_LANE]]
 float64x2_t test_vld1q_lane_f64(float64_t  *a, float64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f64
   return vld1q_lane_f64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 poly8x16_t test_vld1q_lane_p8(poly8_t  *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p8
   return vld1q_lane_p8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 poly16x8_t test_vld1q_lane_p16(poly16_t  *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p16
   return vld1q_lane_p16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 poly64x2_t test_vld1q_lane_p64(poly64_t  *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p64
   return vld1q_lane_p64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 uint8x8_t test_vld1_lane_u8(uint8_t  *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_u8
   return vld1_lane_u8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 uint16x4_t test_vld1_lane_u16(uint16_t  *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_u16
   return vld1_lane_u16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 uint32x2_t test_vld1_lane_u32(uint32_t  *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_u32
   return vld1_lane_u32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 uint64x1_t test_vld1_lane_u64(uint64_t  *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_u64
   return vld1_lane_u64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 int8x8_t test_vld1_lane_s8(int8_t  *a, int8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_s8
   return vld1_lane_s8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 int16x4_t test_vld1_lane_s16(int16_t  *a, int16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_s16
   return vld1_lane_s16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 int32x2_t test_vld1_lane_s32(int32_t  *a, int32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_s32
   return vld1_lane_s32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 int64x1_t test_vld1_lane_s64(int64_t  *a, int64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_s64
   return vld1_lane_s64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t  *a, float16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_f16
   return vld1_lane_f16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK:   ret <2 x float> [[VLD1_LANE]]
 float32x2_t test_vld1_lane_f32(float32_t  *a, float32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_f32
   return vld1_lane_f32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
+// CHECK:   ret <1 x double> [[VLD1_LANE]]
 float64x1_t test_vld1_lane_f64(float64_t  *a, float64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_f64
   return vld1_lane_f64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 poly8x8_t test_vld1_lane_p8(poly8_t  *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_p8
   return vld1_lane_p8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 poly16x4_t test_vld1_lane_p16(poly16_t  *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_p16
   return vld1_lane_p16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_p64
   return vld1_lane_p64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_s8
   return vld2q_lane_s8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b,  v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_u8
   return vld2q_lane_u8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_p8
   return vld2q_lane_p8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP9]]
 int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
-  // CHECK-LABEL: test_vld3q_lane_s8
   return vld3q_lane_s8(ptr, src, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP9]]
 uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
-  // CHECK-LABEL: test_vld3q_lane_u8
   return vld3q_lane_u8(ptr, src, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP13]]
 uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u16
   return vld2q_lane_u16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP13]]
 uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u32
   return vld2q_lane_u32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP13]]
 uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u64
   return vld2q_lane_u64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP13]]
 int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s16
   return vld2q_lane_s16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP13]]
 int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s32
   return vld2q_lane_s32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP13]]
 int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s64
   return vld2q_lane_s64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP13]]
 float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f16
   return vld2q_lane_f16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP13]]
 float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f32
   return vld2q_lane_f32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP13]]
 float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f64
   return vld2q_lane_f64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP13]]
 poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_p16
   return vld2q_lane_p16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP13]]
 poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_p64
   return vld2q_lane_p64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u8
   return vld2_lane_u8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP13]]
 uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u16
   return vld2_lane_u16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP13]]
 uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u32
   return vld2_lane_u32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP13]]
 uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u64
   return vld2_lane_u64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s8
   return vld2_lane_s8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP13]]
 int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s16
   return vld2_lane_s16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP13]]
 int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s32
   return vld2_lane_s32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP13]]
 int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s64
   return vld2_lane_s64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP13]]
 float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f16
   return vld2_lane_f16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP13]]
 float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f32
   return vld2_lane_f32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP13]]
 float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f64
   return vld2_lane_f64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p8
   return vld2_lane_p8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP13]]
 poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p16
   return vld2_lane_p16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP13]]
 poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p64
   return vld2_lane_p64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP16]]
 uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u16
   return vld3q_lane_u16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP16]]
 uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u32
   return vld3q_lane_u32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP16]]
 uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u64
   return vld3q_lane_u64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP16]]
 int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s16
   return vld3q_lane_s16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP16]]
 int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s32
   return vld3q_lane_s32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP16]]
 int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s64
   return vld3q_lane_s64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP16]]
 float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f16
   return vld3q_lane_f16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP16]]
 float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f32
   return vld3q_lane_f32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP16]]
 float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f64
   return vld3q_lane_f64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP9]]
 poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p8
   return vld3q_lane_p8(a, b, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP16]]
 poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p16
   return vld3q_lane_p16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP16]]
 poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p64
   return vld3q_lane_p64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
 uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u8
   return vld3_lane_u8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP16]]
 uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u16
   return vld3_lane_u16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP16]]
 uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u32
   return vld3_lane_u32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP16]]
 uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u64
   return vld3_lane_u64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP9]]
 int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s8
   return vld3_lane_s8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP16]]
 int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s16
   return vld3_lane_s16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP16]]
 int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s32
   return vld3_lane_s32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP16]]
 int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s64
   return vld3_lane_s64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP16]]
 float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f16
   return vld3_lane_f16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP16]]
 float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f32
   return vld3_lane_f32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP16]]
 float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f64
   return vld3_lane_f64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
 poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p8
   return vld3_lane_p8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP16]]
 poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p16
   return vld3_lane_p16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP16]]
 poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p64
   return vld3_lane_p64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP10]]
 uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u8
   return vld4q_lane_u8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP19]]
 uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u16
   return vld4q_lane_u16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP19]]
 uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u32
   return vld4q_lane_u32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP19]]
 uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u64
   return vld4q_lane_u64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP10]]
 int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s8
   return vld4q_lane_s8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP19]]
 int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s16
   return vld4q_lane_s16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP19]]
 int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s32
   return vld4q_lane_s32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP19]]
 int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s64
   return vld4q_lane_s64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP19]]
 float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f16
   return vld4q_lane_f16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP19]]
 float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f32
   return vld4q_lane_f32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP19]]
 float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f64
   return vld4q_lane_f64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP10]]
 poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p8
   return vld4q_lane_p8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP19]]
 poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p16
   return vld4q_lane_p16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP19]]
 poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p64
   return vld4q_lane_p64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP10]]
 uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u8
   return vld4_lane_u8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP19]]
 uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u16
   return vld4_lane_u16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP19]]
 uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u32
   return vld4_lane_u32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP19]]
 uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u64
   return vld4_lane_u64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP10]]
 int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s8
   return vld4_lane_s8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP19]]
 int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s16
   return vld4_lane_s16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP19]]
 int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s32
   return vld4_lane_s32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP19]]
 int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s64
   return vld4_lane_s64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP19]]
 float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f16
   return vld4_lane_f16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP19]]
 float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f32
   return vld4_lane_f32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP19]]
 float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f64
   return vld4_lane_f64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP10]]
 poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p8
   return vld4_lane_p8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP19]]
 poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p16
   return vld4_lane_p16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP19]]
 poly64x1x4_t test_vld4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p64
   return vld4_lane_p64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_u8(uint8_t  *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u8
   vst1q_lane_u8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u16(uint16_t  *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u16
   vst1q_lane_u16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u32(uint32_t  *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u32
   vst1q_lane_u32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u64(uint64_t  *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u64
   vst1q_lane_u64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_s8(int8_t  *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s8
   vst1q_lane_s8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s16(int16_t  *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s16
   vst1q_lane_s16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s32(int32_t  *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s32
   vst1q_lane_s32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s64(int64_t  *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s64
   vst1q_lane_s64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f16(float16_t  *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f16
   vst1q_lane_f16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f32(float32_t  *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f32
   vst1q_lane_f32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   store double [[TMP3]], double* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f64(float64_t  *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f64
   vst1q_lane_f64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_p8(poly8_t  *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p8
   vst1q_lane_p8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_p16(poly16_t  *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p16
   vst1q_lane_p16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_p64(poly64_t  *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p64
   vst1q_lane_p64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_u8(uint8_t  *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_u8
   vst1_lane_u8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u16(uint16_t  *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_u16
   vst1_lane_u16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u32(uint32_t  *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_u32
   vst1_lane_u32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u64(uint64_t  *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_u64
   vst1_lane_u64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_s8(int8_t  *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_s8
   vst1_lane_s8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s16(int16_t  *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_s16
   vst1_lane_s16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s32(int32_t  *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_s32
   vst1_lane_s32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s64(int64_t  *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_s64
   vst1_lane_s64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f16(float16_t  *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_f16
   vst1_lane_f16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f32(float32_t  *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_f32
   vst1_lane_f32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   store double [[TMP3]], double* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f64(float64_t  *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_f64
   vst1_lane_f64(a, b, 0);
-  // CHECK: {{st1 { v[0-9]+.d }\[0]|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_p8(poly8_t  *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_p8
   vst1_lane_p8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_p16(poly16_t  *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_p16
   vst1_lane_p16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_p64(poly64_t  *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_p64
   vst1_lane_p64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_u8(uint8_t  *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u8
   vst2q_lane_u8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u16
   vst2q_lane_u16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u32
   vst2q_lane_u32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u64
   vst2q_lane_u64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_s8(int8_t  *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s8
   vst2q_lane_s8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s16(int16_t  *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s16
   vst2q_lane_s16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s32(int32_t  *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s32
   vst2q_lane_s32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s64
   vst2q_lane_s64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f16(float16_t  *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f16
   vst2q_lane_f16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f32(float32_t  *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f32
   vst2q_lane_f32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f64(float64_t  *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f64
   vst2q_lane_f64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_p8(poly8_t  *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p8
   vst2q_lane_p8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p16
   vst2q_lane_p16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p64
   vst2q_lane_p64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u8
   vst2_lane_u8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u16
   vst2_lane_u16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u32
   vst2_lane_u32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u64
   vst2_lane_u64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_s8(int8_t  *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s8
   vst2_lane_s8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s16(int16_t  *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s16
   vst2_lane_s16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s32(int32_t  *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s32
   vst2_lane_s32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s64
   vst2_lane_s64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f16(float16_t  *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f16
   vst2_lane_f16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f32(float32_t  *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f32
   vst2_lane_f32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f64(float64_t  *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f64
   vst2_lane_f64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p8
   vst2_lane_p8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p16
   vst2_lane_p16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p64
   vst2_lane_p64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_u8(uint8_t  *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u8
   vst3q_lane_u8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u16
   vst3q_lane_u16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u32
   vst3q_lane_u32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u64
   vst3q_lane_u64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_s8(int8_t  *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s8
   vst3q_lane_s8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s16(int16_t  *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s16
   vst3q_lane_s16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s32(int32_t  *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s32
   vst3q_lane_s32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s64
   vst3q_lane_s64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f16(float16_t  *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f16
   vst3q_lane_f16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f32(float32_t  *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f32
   vst3q_lane_f32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f64(float64_t  *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f64
   vst3q_lane_f64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p8
   vst3q_lane_p8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p16
   vst3q_lane_p16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p64
   vst3q_lane_p64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u8
   vst3_lane_u8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u16
   vst3_lane_u16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u32
   vst3_lane_u32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u64
   vst3_lane_u64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_s8(int8_t  *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s8
   vst3_lane_s8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s16(int16_t  *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s16
   vst3_lane_s16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s32(int32_t  *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s32
   vst3_lane_s32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s64
   vst3_lane_s64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f16(float16_t  *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f16
   vst3_lane_f16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f32(float32_t  *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f32
   vst3_lane_f32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f64(float64_t  *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f64
   vst3_lane_f64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p8
   vst3_lane_p8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p16
   vst3_lane_p16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p64
   vst3_lane_p64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u8
   vst4q_lane_u8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u16
   vst4q_lane_u16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u32
   vst4q_lane_u32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u64
   vst4q_lane_u64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_s8(int8_t  *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s8
   vst4q_lane_s8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s16(int16_t  *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s16
   vst4q_lane_s16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s32(int32_t  *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s32
   vst4q_lane_s32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s64
   vst4q_lane_s64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f16(float16_t  *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f16
   vst4q_lane_f16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f32(float32_t  *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f32
   vst4q_lane_f32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f64(float64_t  *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f64
   vst4q_lane_f64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p8
   vst4q_lane_p8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p16
   vst4q_lane_p16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p64
   vst4q_lane_p64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u8
   vst4_lane_u8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u16
   vst4_lane_u16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u32
   vst4_lane_u32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u64
   vst4_lane_u64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_s8(int8_t  *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s8
   vst4_lane_s8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s16(int16_t  *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s16
   vst4_lane_s16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s32(int32_t  *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s32
   vst4_lane_s32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s64
   vst4_lane_s64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f16(float16_t  *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f16
   vst4_lane_f16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f32(float32_t  *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f32
   vst4_lane_f32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f64(float64_t  *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f64
   vst4_lane_f64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p8
   vst4_lane_p8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p16
   vst4_lane_p16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p64
   vst4_lane_p64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
diff --git a/test/CodeGen/aarch64-neon-misc.c b/test/CodeGen/aarch64-neon-misc.c
index a251197..1342bbb 100644
--- a/test/CodeGen/aarch64-neon-misc.c
+++ b/test/CodeGen/aarch64-neon-misc.c
@@ -1,2041 +1,2798 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vceqz_s8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_s8(
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_s8(int8x8_t a) {
   return vceqz_s8(a);
 }
 
-// CHECK-LABEL: test_vceqz_s16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_s16(int16x4_t a) {
   return vceqz_s16(a);
 }
 
-// CHECK-LABEL: test_vceqz_s32
-// CHECK: cmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_s32(int32x2_t a) {
   return vceqz_s32(a);
 }
 
-// CHECK-LABEL: test_vceqz_s64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_s64(int64x1_t a) {
   return vceqz_s64(a);
 }
 
-// CHECK-LABEL: test_vceqz_u64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_u64(uint64x1_t a) {
   return vceqz_u64(a);
 }
 
-// CHECK-LABEL: test_vceqz_p64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_p64(poly64x1_t a) {
   return vceqz_p64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_s8(
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_s8(int8x16_t a) {
   return vceqzq_s8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_s16(int16x8_t a) {
   return vceqzq_s16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s32
-// CHECK: cmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_s32(int32x4_t a) {
   return vceqzq_s32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_s64(int64x2_t a) {
   return vceqzq_s64(a);
 }
 
-// CHECK-LABEL: test_vceqz_u8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_u8(
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_u8(uint8x8_t a) {
   return vceqz_u8(a);
 }
 
-// CHECK-LABEL: test_vceqz_u16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_u16(uint16x4_t a) {
   return vceqz_u16(a);
 }
 
-// CHECK-LABEL: test_vceqz_u32
-// CHECK: cmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_u32(uint32x2_t a) {
   return vceqz_u32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_u8(
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_u8(uint8x16_t a) {
   return vceqzq_u8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_u16(uint16x8_t a) {
   return vceqzq_u16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u32
-// CHECK: cmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_u32(uint32x4_t a) {
   return vceqzq_u32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_u64(uint64x2_t a) {
   return vceqzq_u64(a);
 }
 
-// CHECK-LABEL: test_vceqz_f32
-// CHECK: fcmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: @test_vceqz_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oeq <2 x float> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_f32(float32x2_t a) {
   return vceqz_f32(a);
 }
 
-// CHECK-LABEL: test_vceqz_f64
-// CHECK: fcmeq  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: @test_vceqz_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oeq <1 x double> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_f64(float64x1_t a) {
   return vceqz_f64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f32
-// CHECK: fcmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: @test_vceqzq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oeq <4 x float> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_f32(float32x4_t a) {
   return vceqzq_f32(a);
 }
 
-// CHECK-LABEL: test_vceqz_p8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_p8(
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_p8(poly8x8_t a) {
   return vceqz_p8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_p8(
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_p8(poly8x16_t a) {
   return vceqzq_p8(a);
 }
 
-// CHECK-LABEL: test_vceqz_p16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqz_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_p16(poly16x4_t a) {
   return vceqz_p16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: @test_vceqzq_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_p16(poly16x8_t a) {
   return vceqzq_p16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f64
-// CHECK: fcmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: @test_vceqzq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oeq <2 x double> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_f64(float64x2_t a) {
   return vceqzq_f64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: @test_vceqzq_p64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_p64(poly64x2_t a) {
   return vceqzq_p64(a);
 }
 
-// CHECK-LABEL: test_vcgez_s8
-// CHECK: cmge  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgez_s8(
+// CHECK:   [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCGEZ_I]]
 uint8x8_t test_vcgez_s8(int8x8_t a) {
   return vcgez_s8(a);
 }
 
-// CHECK-LABEL: test_vcgez_s16
-// CHECK: cmge  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgez_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sge <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCGEZ_I]]
 uint16x4_t test_vcgez_s16(int16x4_t a) {
   return vcgez_s16(a);
 }
 
-// CHECK-LABEL: test_vcgez_s32
-// CHECK: cmge  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgez_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sge <2 x i32> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGEZ_I]]
 uint32x2_t test_vcgez_s32(int32x2_t a) {
   return vcgez_s32(a);
 }
 
-// CHECK-LABEL: test_vcgez_s64
-// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgez_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sge <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGEZ_I]]
 uint64x1_t test_vcgez_s64(int64x1_t a) {
   return vcgez_s64(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s8
-// CHECK: cmge  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgezq_s8(
+// CHECK:   [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCGEZ_I]]
 uint8x16_t test_vcgezq_s8(int8x16_t a) {
   return vcgezq_s8(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s16
-// CHECK: cmge  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgezq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sge <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCGEZ_I]]
 uint16x8_t test_vcgezq_s16(int16x8_t a) {
   return vcgezq_s16(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s32
-// CHECK: cmge  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgezq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sge <4 x i32> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGEZ_I]]
 uint32x4_t test_vcgezq_s32(int32x4_t a) {
   return vcgezq_s32(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s64
-// CHECK: cmge  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgezq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sge <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGEZ_I]]
 uint64x2_t test_vcgezq_s64(int64x2_t a) {
   return vcgezq_s64(a);
 }
 
-// CHECK-LABEL: test_vcgez_f32
-// CHECK: fcmge  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: @test_vcgez_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oge <2 x float> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGEZ_I]]
 uint32x2_t test_vcgez_f32(float32x2_t a) {
   return vcgez_f32(a);
 }
 
-// CHECK-LABEL: test_vcgez_f64
-// CHECK: fcmge  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: @test_vcgez_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oge <1 x double> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGEZ_I]]
 uint64x1_t test_vcgez_f64(float64x1_t a) {
   return vcgez_f64(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f32
-// CHECK: fcmge  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: @test_vcgezq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oge <4 x float> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGEZ_I]]
 uint32x4_t test_vcgezq_f32(float32x4_t a) {
   return vcgezq_f32(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f64
-// CHECK: fcmge  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: @test_vcgezq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp oge <2 x double> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGEZ_I]]
 uint64x2_t test_vcgezq_f64(float64x2_t a) {
   return vcgezq_f64(a);
 }
 
-// CHECK-LABEL: test_vclez_s8
-// CHECK: cmle  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: @test_vclez_s8(
+// CHECK:   [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCLEZ_I]]
 uint8x8_t test_vclez_s8(int8x8_t a) {
   return vclez_s8(a);
 }
 
-// CHECK-LABEL: test_vclez_s16
-// CHECK: cmle  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: @test_vclez_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sle <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCLEZ_I]]
 uint16x4_t test_vclez_s16(int16x4_t a) {
   return vclez_s16(a);
 }
 
-// CHECK-LABEL: test_vclez_s32
-// CHECK: cmle  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: @test_vclez_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sle <2 x i32> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLEZ_I]]
 uint32x2_t test_vclez_s32(int32x2_t a) {
   return vclez_s32(a);
 }
 
-// CHECK-LABEL: test_vclez_s64
-// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: @test_vclez_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sle <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLEZ_I]]
 uint64x1_t test_vclez_s64(int64x1_t a) {
   return vclez_s64(a);
 }
 
-// CHECK-LABEL: test_vclezq_s8
-// CHECK: cmle  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: @test_vclezq_s8(
+// CHECK:   [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCLEZ_I]]
 uint8x16_t test_vclezq_s8(int8x16_t a) {
   return vclezq_s8(a);
 }
 
-// CHECK-LABEL: test_vclezq_s16
-// CHECK: cmle  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: @test_vclezq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sle <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCLEZ_I]]
 uint16x8_t test_vclezq_s16(int16x8_t a) {
   return vclezq_s16(a);
 }
 
-// CHECK-LABEL: test_vclezq_s32
-// CHECK: cmle  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: @test_vclezq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sle <4 x i32> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLEZ_I]]
 uint32x4_t test_vclezq_s32(int32x4_t a) {
   return vclezq_s32(a);
 }
 
-// CHECK-LABEL: test_vclezq_s64
-// CHECK: cmle  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: @test_vclezq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sle <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLEZ_I]]
 uint64x2_t test_vclezq_s64(int64x2_t a) {
   return vclezq_s64(a);
 }
 
-// CHECK-LABEL: test_vclez_f32
-// CHECK: fcmle  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: @test_vclez_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ole <2 x float> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLEZ_I]]
 uint32x2_t test_vclez_f32(float32x2_t a) {
   return vclez_f32(a);
 }
 
-// CHECK-LABEL: test_vclez_f64
-// CHECK: fcmle  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: @test_vclez_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ole <1 x double> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLEZ_I]]
 uint64x1_t test_vclez_f64(float64x1_t a) {
   return vclez_f64(a);
 }
 
-// CHECK-LABEL: test_vclezq_f32
-// CHECK: fcmle  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: @test_vclezq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ole <4 x float> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLEZ_I]]
 uint32x4_t test_vclezq_f32(float32x4_t a) {
   return vclezq_f32(a);
 }
 
-// CHECK-LABEL: test_vclezq_f64
-// CHECK: fcmle  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: @test_vclezq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ole <2 x double> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLEZ_I]]
 uint64x2_t test_vclezq_f64(float64x2_t a) {
   return vclezq_f64(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s8
-// CHECK: cmgt  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtz_s8(
+// CHECK:   [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCGTZ_I]]
 uint8x8_t test_vcgtz_s8(int8x8_t a) {
   return vcgtz_s8(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s16
-// CHECK: cmgt  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtz_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sgt <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCGTZ_I]]
 uint16x4_t test_vcgtz_s16(int16x4_t a) {
   return vcgtz_s16(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s32
-// CHECK: cmgt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtz_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sgt <2 x i32> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGTZ_I]]
 uint32x2_t test_vcgtz_s32(int32x2_t a) {
   return vcgtz_s32(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s64
-// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtz_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sgt <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGTZ_I]]
 uint64x1_t test_vcgtz_s64(int64x1_t a) {
   return vcgtz_s64(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s8
-// CHECK: cmgt  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtzq_s8(
+// CHECK:   [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCGTZ_I]]
 uint8x16_t test_vcgtzq_s8(int8x16_t a) {
   return vcgtzq_s8(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s16
-// CHECK: cmgt  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtzq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sgt <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCGTZ_I]]
 uint16x8_t test_vcgtzq_s16(int16x8_t a) {
   return vcgtzq_s16(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s32
-// CHECK: cmgt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtzq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sgt <4 x i32> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGTZ_I]]
 uint32x4_t test_vcgtzq_s32(int32x4_t a) {
   return vcgtzq_s32(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s64
-// CHECK: cmgt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: @test_vcgtzq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp sgt <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGTZ_I]]
 uint64x2_t test_vcgtzq_s64(int64x2_t a) {
   return vcgtzq_s64(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f32
-// CHECK: fcmgt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: @test_vcgtz_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ogt <2 x float> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGTZ_I]]
 uint32x2_t test_vcgtz_f32(float32x2_t a) {
   return vcgtz_f32(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f64
-// CHECK: fcmgt  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: @test_vcgtz_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ogt <1 x double> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGTZ_I]]
 uint64x1_t test_vcgtz_f64(float64x1_t a) {
   return vcgtz_f64(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f32
-// CHECK: fcmgt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: @test_vcgtzq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ogt <4 x float> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGTZ_I]]
 uint32x4_t test_vcgtzq_f32(float32x4_t a) {
   return vcgtzq_f32(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f64
-// CHECK: fcmgt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: @test_vcgtzq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp ogt <2 x double> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGTZ_I]]
 uint64x2_t test_vcgtzq_f64(float64x2_t a) {
   return vcgtzq_f64(a);
 }
 
-// CHECK-LABEL: test_vcltz_s8
-// CHECK: sshr  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7
+// CHECK-LABEL: @test_vcltz_s8(
+// CHECK:   [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCLTZ_I]]
 uint8x8_t test_vcltz_s8(int8x8_t a) {
   return vcltz_s8(a);
 }
 
-// CHECK-LABEL: test_vcltz_s16
-// CHECK: sshr  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+// CHECK-LABEL: @test_vcltz_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp slt <4 x i16> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCLTZ_I]]
 uint16x4_t test_vcltz_s16(int16x4_t a) {
   return vcltz_s16(a);
 }
 
-// CHECK-LABEL: test_vcltz_s32
-// CHECK: sshr  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+// CHECK-LABEL: @test_vcltz_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp slt <2 x i32> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLTZ_I]]
 uint32x2_t test_vcltz_s32(int32x2_t a) {
   return vcltz_s32(a);
 }
 
-// CHECK-LABEL: test_vcltz_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+// CHECK-LABEL: @test_vcltz_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp slt <1 x i64> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLTZ_I]]
 uint64x1_t test_vcltz_s64(int64x1_t a) {
   return vcltz_s64(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s8
-// CHECK: sshr  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7
+// CHECK-LABEL: @test_vcltzq_s8(
+// CHECK:   [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCLTZ_I]]
 uint8x16_t test_vcltzq_s8(int8x16_t a) {
   return vcltzq_s8(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s16
-// CHECK: sshr  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+// CHECK-LABEL: @test_vcltzq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp slt <8 x i16> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCLTZ_I]]
 uint16x8_t test_vcltzq_s16(int16x8_t a) {
   return vcltzq_s16(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s32
-// CHECK: sshr  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+// CHECK-LABEL: @test_vcltzq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp slt <4 x i32> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLTZ_I]]
 uint32x4_t test_vcltzq_s32(int32x4_t a) {
   return vcltzq_s32(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s64
-// CHECK: sshr  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
+// CHECK-LABEL: @test_vcltzq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = icmp slt <2 x i64> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLTZ_I]]
 uint64x2_t test_vcltzq_s64(int64x2_t a) {
   return vcltzq_s64(a);
 }
 
-// CHECK-LABEL: test_vcltz_f32
-// CHECK: fcmlt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: @test_vcltz_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp olt <2 x float> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLTZ_I]]
 uint32x2_t test_vcltz_f32(float32x2_t a) {
   return vcltz_f32(a);
 }
- 
-// CHECK-LABEL: test_vcltz_f64
-// CHECK: fcmlt  {{d[0-9]+}}, {{d[0-9]+}}, #0
+
+// CHECK-LABEL: @test_vcltz_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp olt <1 x double> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLTZ_I]]
 uint64x1_t test_vcltz_f64(float64x1_t a) {
   return vcltz_f64(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f32
-// CHECK: fcmlt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: @test_vcltzq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp olt <4 x float> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLTZ_I]]
 uint32x4_t test_vcltzq_f32(float32x4_t a) {
   return vcltzq_f32(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f64
-// CHECK: fcmlt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: @test_vcltzq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fcmp olt <2 x double> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLTZ_I]]
 uint64x2_t test_vcltzq_f64(float64x2_t a) {
   return vcltzq_f64(a);
 }
 
-// CHECK-LABEL: test_vrev16_s8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev16_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev16_s8(int8x8_t a) {
   return vrev16_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16_u8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev16_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev16_u8(uint8x8_t a) {
   return vrev16_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16_p8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev16_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev16_p8(poly8x8_t a) {
   return vrev16_p8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_s8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev16q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev16q_s8(int8x16_t a) {
   return vrev16q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_u8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev16q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev16q_u8(uint8x16_t a) {
   return vrev16q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_p8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev16q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev16q_p8(poly8x16_t a) {
   return vrev16q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev32_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev32_s8(int8x8_t a) {
   return vrev32_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: @test_vrev32_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev32_s16(int16x4_t a) {
   return vrev32_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32_u8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev32_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev32_u8(uint8x8_t a) {
   return vrev32_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32_u16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: @test_vrev32_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev32_u16(uint16x4_t a) {
   return vrev32_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32_p8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev32_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev32_p8(poly8x8_t a) {
   return vrev32_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_p16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: @test_vrev32_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev32_p16(poly16x4_t a) {
   return vrev32_p16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev32q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev32q_s8(int8x16_t a) {
   return vrev32q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: @test_vrev32q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev32q_s16(int16x8_t a) {
   return vrev32q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev32q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev32q_u8(uint8x16_t a) {
   return vrev32q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: @test_vrev32q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev32q_u16(uint16x8_t a) {
   return vrev32q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev32q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev32q_p8(poly8x16_t a) {
   return vrev32q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: @test_vrev32q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev32q_p16(poly16x8_t a) {
   return vrev32q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev64_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev64_s8(int8x8_t a) {
   return vrev64_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64_s16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: @test_vrev64_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev64_s16(int16x4_t a) {
   return vrev64_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: @test_vrev64_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vrev64_s32(int32x2_t a) {
   return vrev64_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64_u8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev64_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev64_u8(uint8x8_t a) {
   return vrev64_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64_u16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: @test_vrev64_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev64_u16(uint16x4_t a) {
   return vrev64_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64_u32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: @test_vrev64_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vrev64_u32(uint32x2_t a) {
   return vrev64_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64_p8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: @test_vrev64_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev64_p8(poly8x8_t a) {
   return vrev64_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64_p16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: @test_vrev64_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev64_p16(poly16x4_t a) {
   return vrev64_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_f32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: @test_vrev64_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vrev64_f32(float32x2_t a) {
   return vrev64_f32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev64q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev64q_s8(int8x16_t a) {
   return vrev64q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: @test_vrev64q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev64q_s16(int16x8_t a) {
   return vrev64q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: @test_vrev64q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrev64q_s32(int32x4_t a) {
   return vrev64q_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev64q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev64q_u8(uint8x16_t a) {
   return vrev64q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: @test_vrev64q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev64q_u16(uint16x8_t a) {
   return vrev64q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: @test_vrev64q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrev64q_u32(uint32x4_t a) {
   return vrev64q_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: @test_vrev64q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev64q_p8(poly8x16_t a) {
   return vrev64q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: @test_vrev64q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev64q_p16(poly16x8_t a) {
   return vrev64q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_f32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: @test_vrev64q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vrev64q_f32(float32x4_t a) {
   return vrev64q_f32(a);
 }
 
+// CHECK-LABEL: @test_vpaddl_s8(
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 int16x4_t test_vpaddl_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vpaddl_s8
   return vpaddl_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpaddl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %a) #2
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 int32x2_t test_vpaddl_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vpaddl_s16
   return vpaddl_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpaddl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %a) #2
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 int64x1_t test_vpaddl_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vpaddl_s32
   return vpaddl_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpaddl_u8(
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 uint16x4_t test_vpaddl_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vpaddl_u8
   return vpaddl_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpaddl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %a) #2
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 uint32x2_t test_vpaddl_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vpaddl_u16
   return vpaddl_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpaddl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %a) #2
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 uint64x1_t test_vpaddl_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vpaddl_u32
   return vpaddl_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpaddlq_s8(
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_s8
   return vpaddlq_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpaddlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %a) #2
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_s16
   return vpaddlq_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpaddlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %a) #2
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_s32
   return vpaddlq_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpaddlq_u8(
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_u8
   return vpaddlq_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpaddlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %a) #2
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_u16
   return vpaddlq_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpaddlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %a) #2
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_u32
   return vpaddlq_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpadal_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vpadal_s8
   return vpadal_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpadal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vpadal_s16
   return vpadal_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpadal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vpadal_s32
   return vpadal_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpadal_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vpadal_u8
   return vpadal_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vpadal_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vpadal_u16
   return vpadal_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vpadal_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vpadal_u32
   return vpadal_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vpadalq_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_s8
   return vpadalq_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpadalq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_s16
   return vpadalq_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpadalq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_s32
   return vpadalq_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vpadalq_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
+// CHECK:   ret <8 x i16> [[TMP1]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_u8
   return vpadalq_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vpadalq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_u16
   return vpadalq_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vpadalq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %b) #2
+// CHECK:   [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_u32
   return vpadalq_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqabs_s8(
+// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqabs_s8
   return vqabs_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqabsq_s8(
+// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqabsq_s8
   return vqabsq_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqabs_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %a) #2
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQABS_V1_I]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqabs_s16
   return vqabs_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqabsq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %a) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQABSQ_V1_I]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqabsq_s16
   return vqabsq_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqabs_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %a) #2
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQABS_V1_I]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqabs_s32
   return vqabs_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqabsq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %a) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQABSQ_V1_I]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqabsq_s32
   return vqabsq_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqabsq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQABSQ_V1_I]]
 int64x2_t test_vqabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqabsq_s64
   return vqabsq_s64(a);
-  // CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqneg_s8(
+// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqneg_s8
   return vqneg_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vqnegq_s8(
+// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqnegq_s8
   return vqnegq_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vqneg_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %a) #2
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQNEG_V1_I]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqneg_s16
   return vqneg_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vqnegq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %a) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQNEGQ_V1_I]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqnegq_s16
   return vqnegq_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqneg_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %a) #2
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQNEG_V1_I]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqneg_s32
   return vqneg_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vqnegq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %a) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQNEGQ_V1_I]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqnegq_s32
   return vqnegq_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqnegq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> %a) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQNEGQ_V1_I]]
 int64x2_t test_vqnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqnegq_s64
   return vqnegq_s64(a);
-  // CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vneg_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vneg_s8
   return vneg_s8(a);
-  // CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vnegq_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vnegq_s8
   return vnegq_s8(a);
-  // CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vneg_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vneg_s16
   return vneg_s16(a);
-  // CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vnegq_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vnegq_s16
   return vnegq_s16(a);
-  // CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vneg_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vneg_s32
   return vneg_s32(a);
-  // CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vnegq_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vnegq_s32
   return vnegq_s32(a);
-  // CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vnegq_s64(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vnegq_s64
   return vnegq_s64(a);
-  // CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vneg_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vneg_f32
   return vneg_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vnegq_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vnegq_f32
   return vnegq_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vnegq_f64(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vnegq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vnegq_f64
   return vnegq_f64(a);
-  // CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vabs_s8(
+// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vabs_s8
   return vabs_s8(a);
-  // CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vabsq_s8(
+// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vabsq_s8
   return vabsq_s8(a);
-  // CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vabs_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %a) #2
+// CHECK:   ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vabs_s16
   return vabs_s16(a);
-  // CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vabsq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %a) #2
+// CHECK:   ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vabsq_s16
   return vabsq_s16(a);
-  // CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vabs_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %a) #2
+// CHECK:   ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vabs_s32
   return vabs_s32(a);
-  // CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabsq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %a) #2
+// CHECK:   ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vabsq_s32
   return vabsq_s32(a);
-  // CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabsq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> %a) #2
+// CHECK:   ret <2 x i64> [[VABS1_I]]
 int64x2_t test_vabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vabsq_s64
   return vabsq_s64(a);
-  // CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vabs_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vabs_f32
   return vabs_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vabsq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vabsq_f32
   return vabsq_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vabsq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VABS1_I]]
 float64x2_t test_vabsq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vabsq_f64
   return vabsq_f64(a);
-  // CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vuqadd_s8(
+// CHECK:   [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VUQADD_I]]
 int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuqadd_s8
   return vuqadd_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuqaddq_s8(
+// CHECK:   [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VUQADD_I]]
 int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuqaddq_s8
   return vuqaddq_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuqadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #2
+// CHECK:   ret <4 x i16> [[VUQADD2_I]]
 int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuqadd_s16
   return vuqadd_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuqaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #2
+// CHECK:   ret <8 x i16> [[VUQADD2_I]]
 int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuqaddq_s16
   return vuqaddq_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuqadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #2
+// CHECK:   ret <2 x i32> [[VUQADD2_I]]
 int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuqadd_s32
   return vuqadd_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vuqaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #2
+// CHECK:   ret <4 x i32> [[VUQADD2_I]]
 int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuqaddq_s32
   return vuqaddq_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuqaddq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #2
+// CHECK:   ret <2 x i64> [[VUQADD2_I]]
 int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuqaddq_s64
   return vuqaddq_s64(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcls_s8(
+// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcls_s8
   return vcls_s8(a);
-  // CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vclsq_s8(
+// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclsq_s8
   return vclsq_s8(a);
-  // CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcls_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a) #2
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLS_V1_I]]
 int16x4_t test_vcls_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vcls_s16
   return vcls_s16(a);
-  // CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vclsq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a) #2
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VCLSQ_V1_I]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclsq_s16
   return vclsq_s16(a);
-  // CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vcls_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a) #2
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLS_V1_I]]
 int32x2_t test_vcls_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcls_s32
   return vcls_s32(a);
-  // CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vclsq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a) #2
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VCLSQ_V1_I]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclsq_s32
   return vclsq_s32(a);
-  // CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vclz_s8(
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vclz_s8
   return vclz_s8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vclzq_s8(
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclzq_s8
   return vclzq_s8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vclz_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLZ_V1_I]]
 int16x4_t test_vclz_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vclz_s16
   return vclz_s16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vclzq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VCLZQ_V1_I]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclzq_s16
   return vclzq_s16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vclz_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLZ_V1_I]]
 int32x2_t test_vclz_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vclz_s32
   return vclz_s32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vclzq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VCLZQ_V1_I]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclzq_s32
   return vclzq_s32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vclz_u8(
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vclz_u8
   return vclz_u8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vclzq_u8(
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vclzq_u8
   return vclzq_u8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vclz_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLZ_V1_I]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vclz_u16
   return vclz_u16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vclzq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VCLZQ_V1_I]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vclzq_u16
   return vclzq_u16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vclz_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLZ_V1_I]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vclz_u32
   return vclz_u32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vclzq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VCLZQ_V1_I]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vclzq_u32
   return vclzq_u32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcnt_s8(
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcnt_s8
   return vcnt_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcntq_s8(
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vcntq_s8
   return vcntq_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcnt_u8(
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vcnt_u8
   return vcnt_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcntq_u8(
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vcntq_u8
   return vcntq_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vcnt_p8(
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vcnt_p8
   return vcnt_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vcntq_p8(
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vcntq_p8
   return vcntq_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmvn_s8
   return vmvn_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_s8
   return vmvnq_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmvn_s16
   return vmvn_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_s16
   return vmvnq_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmvn_s32
   return vmvn_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_s32
   return vmvnq_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmvn_u8
   return vmvn_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_u8
   return vmvnq_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmvn_u16
   return vmvn_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_u16
   return vmvnq_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmvn_u32
   return vmvn_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_u32
   return vmvnq_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmvn_p8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vmvn_p8
   return vmvn_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vmvnq_p8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_p8
   return vmvnq_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrbit_s8(
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 int8x8_t test_vrbit_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrbit_s8
   return vrbit_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrbitq_s8(
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 int8x16_t test_vrbitq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_s8
   return vrbitq_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrbit_u8(
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 uint8x8_t test_vrbit_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vrbit_u8
   return vrbit_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrbitq_u8(
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 uint8x16_t test_vrbitq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_u8
   return vrbitq_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vrbit_p8(
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 poly8x8_t test_vrbit_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vrbit_p8
   return vrbit_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vrbitq_p8(
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 poly8x16_t test_vrbitq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_p8
   return vrbitq_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vmovn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmovn_s16
   return vmovn_s16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmovn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmovn_s32
   return vmovn_s32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmovn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vmovn_s64
   return vmovn_s64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmovn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmovn_u16
   return vmovn_u16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmovn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmovn_u32
   return vmovn_u32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmovn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vmovn_u64
   return vmovn_u64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmovn_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_s16
   return vmovn_high_s16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmovn_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_s32
   return vmovn_high_s32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmovn_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_s64
   return vmovn_high_s64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vmovn_high_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_u16
   return vmovn_high_u16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vmovn_high_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_u32
   return vmovn_high_u32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vmovn_high_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_u64
   return vmovn_high_u64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqmovun_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %a) #2
+// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
 int8x8_t test_vqmovun_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovun_s16
   return vqmovun_s16(a);
-  // CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqmovun_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a) #2
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQMOVUN_V1_I]]
 int16x4_t test_vqmovun_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovun_s32
   return vqmovun_s32(a);
-  // CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqmovun_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %a) #2
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQMOVUN_V1_I]]
 int32x2_t test_vqmovun_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovun_s64
   return vqmovun_s64(a);
-  // CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqmovun_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %b) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovun_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s16
   return vqmovun_high_s16(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqmovun_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %b) #2
+// CHECK:   [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovun_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s32
   return vqmovun_high_s32(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqmovun_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %b) #2
+// CHECK:   [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovun_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s64
   return vqmovun_high_s64(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqmovn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %a) #2
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_s16
   return vqmovn_s16(a);
-  // CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqmovn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQMOVN_V1_I]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_s32
   return vqmovn_s32(a);
-  // CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqmovn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %a) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQMOVN_V1_I]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_s64
   return vqmovn_s64(a);
-  // CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqmovn_high_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %b) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s16
   return vqmovn_high_s16(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqmovn_high_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %b) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s32
   return vqmovn_high_s32(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqmovn_high_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %b) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s64
   return vqmovn_high_s64(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqmovn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a) #2
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_u16
   return vqmovn_u16(a);
-  // CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqmovn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %a) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQMOVN_V1_I]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_u32
   return vqmovn_u32(a);
-  // CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqmovn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %a) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQMOVN_V1_I]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_u64
   return vqmovn_u64(a);
-  // CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vqmovn_high_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %b) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u16
   return vqmovn_high_u16(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vqmovn_high_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %b) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u32
   return vqmovn_high_u32(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vqmovn_high_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %b) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u64
   return vqmovn_high_u64(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vshll_n_s8(
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vshll_n_s8
   return vshll_n_s8(a, 8);
-  // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
 }
 
+// CHECK-LABEL: @test_vshll_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vshll_n_s16
   return vshll_n_s16(a, 16);
-  // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
 }
 
+// CHECK-LABEL: @test_vshll_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vshll_n_s32
   return vshll_n_s32(a, 32);
-  // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
 }
 
+// CHECK-LABEL: @test_vshll_n_u8(
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vshll_n_u8
   return vshll_n_u8(a, 8);
-  // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
 }
 
+// CHECK-LABEL: @test_vshll_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vshll_n_u16
   return vshll_n_u16(a, 16);
-  // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
 }
 
+// CHECK-LABEL: @test_vshll_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vshll_n_u32
   return vshll_n_u32(a, 32);
-  // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
 }
 
+// CHECK-LABEL: @test_vshll_high_n_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_high_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s8
   return vshll_high_n_s8(a, 8);
-  // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
 }
 
+// CHECK-LABEL: @test_vshll_high_n_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_high_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s16
   return vshll_high_n_s16(a, 16);
-  // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
 }
 
+// CHECK-LABEL: @test_vshll_high_n_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_high_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s32
   return vshll_high_n_s32(a, 32);
-  // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
 }
 
+// CHECK-LABEL: @test_vshll_high_n_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u8
   return vshll_high_n_u8(a, 8);
-  // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
 }
 
+// CHECK-LABEL: @test_vshll_high_n_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 16);
-  // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
 }
 
+// CHECK-LABEL: @test_vshll_high_n_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 32);
-  // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
 }
 
+// CHECK-LABEL: @test_vcvt_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) #2
+// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vcvt_f16_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvt_f16_f32
   return vcvt_f16_f32(a);
-  // CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvt_high_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %b) #2
+// CHECK:   [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I_I]]
 float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) {
-  //CHECK-LABEL: test_vcvt_high_f16_f32
   return vcvt_high_f16_f32(a, b);
-  // CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvt_f32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = fptrunc <2 x double> %a to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_f64
   return vcvt_f32_f64(a);
-  // CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvt_high_f32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VCVT_I_I:%.*]] = fptrunc <2 x double> %b to <2 x float>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I_I]]
 float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) {
-  //CHECK-LABEL: test_vcvt_high_f32_f64
   return vcvt_high_f32_f64(a, b);
-  // CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtx_f32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x float> [[VCVTX_F32_V1_I]]
 float32x2_t test_vcvtx_f32_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtx_f32_f64
   return vcvtx_f32_f64(a);
-  // CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtx_high_f32_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I_I]]
 float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) {
-  //CHECK-LABEL: test_vcvtx_high_f32_f64
   return vcvtx_high_f32_f64(a, b);
-  // CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvt_f32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #2
+// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VCVT_F32_F161_I]]
 float32x4_t test_vcvt_f32_f16(float16x4_t a) {
-  //CHECK-LABEL: test_vcvt_f32_f16
   return vcvt_f32_f16(a);
-  // CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vcvt_high_f32_f16(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) #2
+// CHECK:   [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VCVT_F32_F161_I_I]]
 float32x4_t test_vcvt_high_f32_f16(float16x8_t a) {
-  //CHECK-LABEL: test_vcvt_high_f32_f16
   return vcvt_high_f32_f16(a);
-  // CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vcvt_f64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = fpext <2 x float> %a to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvt_f64_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f64_f32
   return vcvt_f64_f32(a);
-  // CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvt_high_f64_f32(
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I_I]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I_I]]
 float64x2_t test_vcvt_high_f64_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvt_high_f64_f32
   return vcvt_high_f64_f32(a);
-  // CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndn_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDN1_I]]
 float32x2_t test_vrndn_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndn_f32
   return vrndn_f32(a);
-  // CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndnq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDN1_I]]
 float32x4_t test_vrndnq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndnq_f32
   return vrndnq_f32(a);
-  // CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndnq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDN1_I]]
 float64x2_t test_vrndnq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndnq_f64
   return vrndnq_f64(a);
-  // CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrnda_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDA1_I]]
 float32x2_t test_vrnda_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrnda_f32
   return vrnda_f32(a);
-  // CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndaq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDA1_I]]
 float32x4_t test_vrndaq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndaq_f32
   return vrndaq_f32(a);
-  // CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndaq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDA1_I]]
 float64x2_t test_vrndaq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndaq_f64
   return vrndaq_f64(a);
-  // CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrndp_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDP1_I]]
 float32x2_t test_vrndp_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndp_f32
   return vrndp_f32(a);
-  // CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndpq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDP1_I]]
 float32x4_t test_vrndpq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndpq_f32
   return vrndpq_f32(a);
-  // CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndpq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDP1_I]]
 float64x2_t test_vrndpq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndpq_f64
   return vrndpq_f64(a);
-  // CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrndm_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDM1_I]]
 float32x2_t test_vrndm_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndm_f32
   return vrndm_f32(a);
-  // CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndmq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDM1_I]]
 float32x4_t test_vrndmq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndmq_f32
   return vrndmq_f32(a);
-  // CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndmq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDM1_I]]
 float64x2_t test_vrndmq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndmq_f64
   return vrndmq_f64(a);
-  // CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrndx_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDX1_I]]
 float32x2_t test_vrndx_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndx_f32
   return vrndx_f32(a);
-  // CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndxq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDX1_I]]
 float32x4_t test_vrndxq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndxq_f32
   return vrndxq_f32(a);
-  // CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndxq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDX1_I]]
 float64x2_t test_vrndxq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndxq_f64
   return vrndxq_f64(a);
-  // CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrnd_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDZ1_I]]
 float32x2_t test_vrnd_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrnd_f32
   return vrnd_f32(a);
-  // CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDZ1_I]]
 float32x4_t test_vrndq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndq_f32
   return vrndq_f32(a);
-  // CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDZ1_I]]
 float64x2_t test_vrndq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndq_f64
   return vrndq_f64(a);
-  // CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrndi_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDI1_I]]
 float32x2_t test_vrndi_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndi_f32
   return vrndi_f32(a);
-  // CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrndiq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDI1_I]]
 float32x4_t test_vrndiq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndiq_f32
   return vrndiq_f32(a);
-  // CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrndiq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRNDI1_I]]
 float64x2_t test_vrndiq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndiq_f64
   return vrndiq_f64(a);
-  // CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvt_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fptosi <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_s32_f32
   return vcvt_s32_f32(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fptosi <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_s32_f32
   return vcvtq_s32_f32(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtq_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fptosi <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_s64_f64
   return vcvtq_s64_f64(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvt_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = fptoui <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_u32_f32
   return vcvt_u32_f32(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fptoui <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_u32_f32
   return vcvtq_u32_f32(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtq_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = fptoui <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 uint64x2_t test_vcvtq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_u64_f64
   return vcvtq_u64_f64(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtn_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTN1_I]]
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtn_s32_f32
   return vcvtn_s32_f32(a);
-  // CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtnq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTN1_I]]
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtnq_s32_f32
   return vcvtnq_s32_f32(a);
-  // CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtnq_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTN1_I]]
 int64x2_t test_vcvtnq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtnq_s64_f64
   return vcvtnq_s64_f64(a);
-  // CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtn_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTN1_I]]
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtn_u32_f32
   return vcvtn_u32_f32(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtnq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTN1_I]]
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtnq_u32_f32
   return vcvtnq_u32_f32(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtnq_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTN1_I]]
 uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtnq_u64_f64
   return vcvtnq_u64_f64(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtp_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTP1_I]]
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtp_s32_f32
   return vcvtp_s32_f32(a);
-  // CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtpq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTP1_I]]
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtpq_s32_f32
   return vcvtpq_s32_f32(a);
-  // CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtpq_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTP1_I]]
 int64x2_t test_vcvtpq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtpq_s64_f64
   return vcvtpq_s64_f64(a);
-  // CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtp_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTP1_I]]
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtp_u32_f32
   return vcvtp_u32_f32(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtpq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTP1_I]]
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtpq_u32_f32
   return vcvtpq_u32_f32(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtpq_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTP1_I]]
 uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtpq_u64_f64
   return vcvtpq_u64_f64(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtm_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTM1_I]]
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtm_s32_f32
   return vcvtm_s32_f32(a);
-  // CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtmq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTM1_I]]
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtmq_s32_f32
   return vcvtmq_s32_f32(a);
-  // CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtmq_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTM1_I]]
 int64x2_t test_vcvtmq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtmq_s64_f64
   return vcvtmq_s64_f64(a);
-  // CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtm_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTM1_I]]
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtm_u32_f32
   return vcvtm_u32_f32(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtmq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTM1_I]]
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtmq_u32_f32
   return vcvtmq_u32_f32(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtmq_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTM1_I]]
 uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtmq_u64_f64
   return vcvtmq_u64_f64(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvta_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTA1_I]]
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvta_s32_f32
   return vcvta_s32_f32(a);
-  // CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtaq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTA1_I]]
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtaq_s32_f32
   return vcvtaq_s32_f32(a);
-  // CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtaq_s64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTA1_I]]
 int64x2_t test_vcvtaq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtaq_s64_f64
   return vcvtaq_s64_f64(a);
-  // CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvta_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTA1_I]]
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvta_u32_f32
   return vcvta_u32_f32(a);
-  // CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtaq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTA1_I]]
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtaq_u32_f32
   return vcvtaq_u32_f32(a);
-  // CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtaq_u64_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x i64> [[VCVTA1_I]]
 uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtaq_u64_f64
   return vcvtaq_u64_f64(a);
-  // CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrsqrte_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrsqrte_f32
   return vrsqrte_f32(a);
-  // CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrsqrteq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrsqrteq_f32
   return vrsqrteq_f32(a);
-  // CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrsqrteq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRSQRTEQ_V1_I]]
 float64x2_t test_vrsqrteq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrsqrteq_f64
   return vrsqrteq_f64(a);
-  // CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrecpe_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
 float32x2_t test_vrecpe_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrecpe_f32
   return vrecpe_f32(a);
-  // CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrecpeq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrecpeq_f32
   return vrecpeq_f32(a);
-  // CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vrecpeq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VRECPEQ_V1_I]]
 float64x2_t test_vrecpeq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrecpeq_f64
   return vrecpeq_f64(a);
-  // CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vrecpe_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %a) #2
+// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
-  //CHECK-LABEL: test_vrecpe_u32
   return vrecpe_u32(a);
-  // CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vrecpeq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %a) #2
+// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
-  //CHECK-LABEL: test_vrecpeq_u32
   return vrecpeq_u32(a);
-  // CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsqrt_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VSQRT_I]]
 float32x2_t test_vsqrt_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vsqrt_f32
   return vsqrt_f32(a);
-  // CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vsqrtq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VSQRT_I]]
 float32x4_t test_vsqrtq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vsqrtq_f32
   return vsqrtq_f32(a);
-  // CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vsqrtq_f64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+// CHECK:   ret <2 x double> [[VSQRT_I]]
 float64x2_t test_vsqrtq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vsqrtq_f64
   return vsqrtq_f64(a);
-  // CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvt_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_s32
   return vcvt_f32_s32(a);
-  //CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvt_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_u32
   return vcvt_f32_u32(a);
-  //CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: @test_vcvtq_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_f32_s32
   return vcvtq_f32_s32(a);
-  //CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtq_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_f32_u32
   return vcvtq_f32_u32(a);
-  //CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vcvtq_f64_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvtq_f64_s64(int64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_f64_s64
   return vcvtq_f64_s64(a);
-  //CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: @test_vcvtq_f64_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvtq_f64_u64(uint64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_f64_u64
   return vcvtq_f64_u64(a);
-  //CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
diff --git a/test/CodeGen/aarch64-neon-perm.c b/test/CodeGen/aarch64-neon-perm.c
index 07edc11..5b8a99c 100644
--- a/test/CodeGen/aarch64-neon-perm.c
+++ b/test/CodeGen/aarch64-neon-perm.c
@@ -1,1092 +1,2255 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 #include <arm_neon.h>
 
+// CHECK-LABEL: @test_vuzp1_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_s8
   return vuzp1_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp1q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_s8
   return vuzp1q_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuzp1_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_s16
   return vuzp1_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuzp1q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_s16
   return vuzp1q_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuzp1_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_s32
   return vuzp1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vuzp1q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_s32
   return vuzp1q_s32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuzp1q_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_s64
   return vuzp1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vuzp1_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_u8
   return vuzp1_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp1q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_u8
   return vuzp1q_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuzp1_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_u16
   return vuzp1_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuzp1q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_u16
   return vuzp1q_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuzp1_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_u32
   return vuzp1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vuzp1q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_u32
   return vuzp1q_u32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuzp1q_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_u64
   return vuzp1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vuzp1_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_f32
   return vuzp1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vuzp1q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_f32
   return vuzp1q_f32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuzp1q_f64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_f64
   return vuzp1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vuzp1_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_p8
   return vuzp1_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp1q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_p8
   return vuzp1q_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuzp1_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_p16
   return vuzp1_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuzp1q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_p16
   return vuzp1q_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuzp2_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_s8
   return vuzp2_s8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp2q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_s8
   return vuzp2q_s8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuzp2_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_s16
   return vuzp2_s16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuzp2q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_s16
   return vuzp2q_s16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuzp2_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_s32
   return vuzp2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vuzp2q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_s32
   return vuzp2q_s32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuzp2q_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_s64
   return vuzp2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vuzp2_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_u8
   return vuzp2_u8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp2q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_u8
   return vuzp2q_u8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuzp2_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_u16
   return vuzp2_u16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuzp2q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_u16
   return vuzp2q_u16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuzp2_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_u32
   return vuzp2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vuzp2q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_u32
   return vuzp2q_u32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuzp2q_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_u64
   return vuzp2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vuzp2_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_f32
   return vuzp2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vuzp2q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_f32
   return vuzp2q_f32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vuzp2q_f64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_f64
   return vuzp2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vuzp2_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_p8
   return vuzp2_p8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp2q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_p8
   return vuzp2q_p8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vuzp2_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_p16
   return vuzp2_p16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vuzp2q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_p16
   return vuzp2q_p16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip1_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip1_s8
   return vzip1_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip1q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_s8
   return vzip1q_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vzip1_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip1_s16
   return vzip1_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vzip1q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_s16
   return vzip1q_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip1_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip1_s32
   return vzip1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vzip1q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_s32
   return vzip1q_s32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vzip1q_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_s64
   return vzip1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vzip1_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip1_u8
   return vzip1_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip1q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_u8
   return vzip1q_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vzip1_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip1_u16
   return vzip1_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vzip1q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_u16
   return vzip1q_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip1_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip1_u32
   return vzip1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vzip1q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_u32
   return vzip1q_u32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vzip1q_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_u64
   return vzip1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vzip1_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip1_f32
   return vzip1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vzip1q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_f32
   return vzip1q_f32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vzip1q_f64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_f64
   return vzip1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vzip1_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip1_p8
   return vzip1_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip1q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_p8
   return vzip1q_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vzip1_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip1_p16
   return vzip1_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vzip1q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_p16
   return vzip1q_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip2_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip2_s8
   return vzip2_s8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip2q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_s8
   return vzip2q_s8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vzip2_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip2_s16
   return vzip2_s16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vzip2q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_s16
   return vzip2q_s16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip2_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip2_s32
   return vzip2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vzip2q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_s32
   return vzip2q_s32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vzip2q_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_s64
   return vzip2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vzip2_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip2_u8
   return vzip2_u8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip2q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_u8
   return vzip2q_u8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vzip2_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip2_u16
   return vzip2_u16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vzip2q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_u16
   return vzip2q_u16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip2_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip2_u32
   return vzip2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vzip2q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_u32
   return vzip2q_u32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vzip2q_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_u64
   return vzip2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vzip2_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip2_f32
   return vzip2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vzip2q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_f32
   return vzip2q_f32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vzip2q_f64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_f64
   return vzip2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vzip2_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip2_p8
   return vzip2_p8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip2q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_p8
   return vzip2q_p8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vzip2_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip2_p16
   return vzip2_p16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vzip2q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_p16
   return vzip2q_p16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn1_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_s8
   return vtrn1_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn1q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_s8
   return vtrn1q_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtrn1_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_s16
   return vtrn1_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtrn1q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_s16
   return vtrn1q_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn1_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_s32
   return vtrn1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vtrn1q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_s32
   return vtrn1q_s32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtrn1q_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_s64
   return vtrn1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vtrn1_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_u8
   return vtrn1_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn1q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_u8
   return vtrn1q_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtrn1_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_u16
   return vtrn1_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtrn1q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_u16
   return vtrn1q_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn1_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_u32
   return vtrn1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vtrn1q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_u32
   return vtrn1q_u32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtrn1q_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_u64
   return vtrn1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vtrn1_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_f32
   return vtrn1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vtrn1q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_f32
   return vtrn1q_f32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtrn1q_f64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_f64
   return vtrn1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vtrn1_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_p8
   return vtrn1_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn1q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_p8
   return vtrn1q_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtrn1_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_p16
   return vtrn1_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtrn1q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_p16
   return vtrn1q_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn2_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_s8
   return vtrn2_s8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn2q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_s8
   return vtrn2q_s8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtrn2_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_s16
   return vtrn2_s16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtrn2q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_s16
   return vtrn2q_s16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn2_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_s32
   return vtrn2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vtrn2q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_s32
   return vtrn2q_s32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtrn2q_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_s64
   return vtrn2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vtrn2_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_u8
   return vtrn2_u8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn2q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_u8
   return vtrn2q_u8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtrn2_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_u16
   return vtrn2_u16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtrn2q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_u16
   return vtrn2q_u16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn2_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_u32
   return vtrn2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vtrn2q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_u32
   return vtrn2q_u32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtrn2q_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_u64
   return vtrn2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vtrn2_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_f32
   return vtrn2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: @test_vtrn2q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_f32
   return vtrn2q_f32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: @test_vtrn2q_f64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_f64
   return vtrn2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: @test_vtrn2_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_p8
   return vtrn2_p8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn2q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_p8
   return vtrn2q_p8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: @test_vtrn2_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_p16
   return vtrn2_p16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: @test_vtrn2q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_p16
   return vtrn2q_p16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vuzp_s8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp_s8
   return vuzp_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vuzp_s16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP10]]
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp_s16
   return vuzp_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vuzp_s32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP10]]
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp_s32
   return vuzp_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vuzp_u8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp_u8
   return vuzp_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vuzp_u16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP10]]
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp_u16
   return vuzp_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vuzp_u32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP10]]
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp_u32
   return vuzp_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vuzp_f32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP9]], [2 x <2 x float>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP10]]
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp_f32
   return vuzp_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vuzp_p8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp_p8
   return vuzp_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vuzp_p16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP10]]
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp_p16
   return vuzp_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vuzpq_s8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_s8
   return vuzpq_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vuzpq_s16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP10]]
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_s16
   return vuzpq_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vuzpq_s32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP10]]
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_s32
   return vuzpq_s32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vuzpq_u8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_u8
   return vuzpq_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vuzpq_u16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP10]]
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_u16
   return vuzpq_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vuzpq_u32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP10]]
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_u32
   return vuzpq_u32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vuzpq_f32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP9]], [2 x <4 x float>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP10]]
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_f32
   return vuzpq_f32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vuzpq_p8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_p8
   return vuzpq_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vuzpq_p16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP10]]
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_p16
   return vuzpq_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vzip_s8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip_s8
   return vzip_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vzip_s16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP10]]
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip_s16
   return vzip_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vzip_s32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP10]]
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip_s32
   return vzip_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vzip_u8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip_u8
   return vzip_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vzip_u16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP10]]
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip_u16
   return vzip_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vzip_u32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP10]]
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip_u32
   return vzip_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vzip_f32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP9]], [2 x <2 x float>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP10]]
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip_f32
   return vzip_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vzip_p8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip_p8
   return vzip_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vzip_p16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP10]]
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip_p16
   return vzip_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vzipq_s8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzipq_s8
   return vzipq_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vzipq_s16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP10]]
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzipq_s16
   return vzipq_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vzipq_s32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP10]]
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzipq_s32
   return vzipq_s32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vzipq_u8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzipq_u8
   return vzipq_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vzipq_u16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP10]]
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzipq_u16
   return vzipq_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vzipq_u32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP10]]
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzipq_u32
   return vzipq_u32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vzipq_f32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP9]], [2 x <4 x float>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP10]]
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzipq_f32
   return vzipq_f32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vzipq_p8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzipq_p8
   return vzipq_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vzipq_p16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP10]]
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzipq_p16
   return vzipq_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: @test_vtrn_s8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn_s8
   return vtrn_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: @test_vtrn_s16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP10]]
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn_s16
   return vtrn_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vtrn_s32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP10]]
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn_s32
   return vtrn_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vtrn_u8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn_u8
   return vtrn_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vtrn_u16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP10]]
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn_u16
   return vtrn_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vtrn_u32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP9]], [2 x <2 x i32>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP10]]
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn_u32
   return vtrn_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vtrn_f32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP9]], [2 x <2 x float>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP10]]
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn_f32
   return vtrn_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+
+// CHECK-LABEL: @test_vtrn_p8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn_p8
   return vtrn_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+
+// CHECK-LABEL: @test_vtrn_p16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP9]], [2 x <4 x i16>]* [[TMP8]], align 8
+// CHECK:   [[TMP10:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP10]]
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn_p16
   return vtrn_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+
+// CHECK-LABEL: @test_vtrnq_s8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_s8
   return vtrnq_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vtrnq_s16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP10]]
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_s16
   return vtrnq_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vtrnq_s32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP10]]
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_s32
   return vtrnq_s32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vtrnq_u8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_u8
   return vtrnq_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vtrnq_u16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP10]]
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_u16
   return vtrnq_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+
+// CHECK-LABEL: @test_vtrnq_u32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP9]], [2 x <4 x i32>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP10]]
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_u32
   return vtrnq_u32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vtrnq_f32(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP9]], [2 x <4 x float>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP10]]
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_f32
   return vtrnq_f32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+
+// CHECK-LABEL: @test_vtrnq_p8(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_p8
   return vtrnq_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+
+// CHECK-LABEL: @test_vtrnq_p16(
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP9]], [2 x <8 x i16>]* [[TMP8]], align 16
+// CHECK:   [[TMP10:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP10]]
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_p16
   return vtrnq_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
diff --git a/test/CodeGen/aarch64-neon-scalar-copy.c b/test/CodeGen/aarch64-neon-scalar-copy.c
index a50a0b9..90fceb4 100644
--- a/test/CodeGen/aarch64-neon-scalar-copy.c
+++ b/test/CodeGen/aarch64-neon-scalar-copy.c
@@ -1,173 +1,228 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vdups_lane_f32
+// CHECK-LABEL: define float @test_vdups_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VDUPS_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VDUPS_LANE]]
 float32_t test_vdups_lane_f32(float32x2_t a) {
   return vdups_lane_f32(a, 1);
-// CHECK: ret
-// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_f64
+// CHECK-LABEL: define double @test_vdupd_lane_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VDUPD_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   ret double [[VDUPD_LANE]]
 float64_t test_vdupd_lane_f64(float64x1_t a) {
   return vdupd_lane_f64(a, 0);
-// CHECK: ret
-// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_f32
+// CHECK-LABEL: define float @test_vdups_laneq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGETQ_LANE]]
 float32_t test_vdups_laneq_f32(float32x4_t a) {
   return vdups_laneq_f32(a, 3);
-// CHECK: ret
-// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_f64
+// CHECK-LABEL: define double @test_vdupd_laneq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   ret double [[VGETQ_LANE]]
 float64_t test_vdupd_laneq_f64(float64x2_t a) {
   return vdupd_laneq_f64(a, 1);
-// CHECK: ret
-// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vdupb_lane_s8
+// CHECK-LABEL: define i8 @test_vdupb_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vdupb_lane_s8(int8x8_t a) {
   return vdupb_lane_s8(a, 7);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
 
-// CHECK-LABEL: test_vduph_lane_s16
+// CHECK-LABEL: define i16 @test_vduph_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vduph_lane_s16(int16x4_t a) {
   return vduph_lane_s16(a, 3);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
 
-// CHECK-LABEL: test_vdups_lane_s32
+// CHECK-LABEL: define i32 @test_vdups_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vdups_lane_s32(int32x2_t a) {
   return vdups_lane_s32(a, 1);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_s64
+// CHECK-LABEL: define i64 @test_vdupd_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vdupd_lane_s64(int64x1_t a) {
   return vdupd_lane_s64(a, 0);
-// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
 }
 
 
-// CHECK-LABEL: test_vdupb_lane_u8
+// CHECK-LABEL: define i8 @test_vdupb_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vdupb_lane_u8(uint8x8_t a) {
   return vdupb_lane_u8(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
 
-// CHECK-LABEL: test_vduph_lane_u16
+// CHECK-LABEL: define i16 @test_vduph_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vduph_lane_u16(uint16x4_t a) {
   return vduph_lane_u16(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
 
-// CHECK-LABEL: test_vdups_lane_u32
+// CHECK-LABEL: define i32 @test_vdups_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vdups_lane_u32(uint32x2_t a) {
   return vdups_lane_u32(a, 1);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_u64
+// CHECK-LABEL: define i64 @test_vdupd_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vdupd_lane_u64(uint64x1_t a) {
   return vdupd_lane_u64(a, 0);
-// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vdupb_laneq_s8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 int8_t test_vdupb_laneq_s8(int8x16_t a) {
   return vdupb_laneq_s8(a, 15);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
 
-// CHECK-LABEL: test_vduph_laneq_s16
+// CHECK-LABEL: define i16 @test_vduph_laneq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 int16_t test_vduph_laneq_s16(int16x8_t a) {
   return vduph_laneq_s16(a, 7);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_s32
+// CHECK-LABEL: define i32 @test_vdups_laneq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 int32_t test_vdups_laneq_s32(int32x4_t a) {
   return vdups_laneq_s32(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_s64
+// CHECK-LABEL: define i64 @test_vdupd_laneq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 int64_t test_vdupd_laneq_s64(int64x2_t a) {
   return vdupd_laneq_s64(a, 1);
-// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vdupb_laneq_u8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 uint8_t test_vdupb_laneq_u8(uint8x16_t a) {
   return vdupb_laneq_u8(a, 15);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
 
-// CHECK-LABEL: test_vduph_laneq_u16
+// CHECK-LABEL: define i16 @test_vduph_laneq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 uint16_t test_vduph_laneq_u16(uint16x8_t a) {
   return vduph_laneq_u16(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_u32
+// CHECK-LABEL: define i32 @test_vdups_laneq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 uint32_t test_vdups_laneq_u32(uint32x4_t a) {
   return vdups_laneq_u32(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_u64
+// CHECK-LABEL: define i64 @test_vdupd_laneq_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 uint64_t test_vdupd_laneq_u64(uint64x2_t a) {
   return vdupd_laneq_u64(a, 1);
-// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vdupb_lane_p8
+// CHECK-LABEL: define i8 @test_vdupb_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vdupb_lane_p8(poly8x8_t a) {
   return vdupb_lane_p8(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
-// CHECK-LABEL: test_vduph_lane_p16
+// CHECK-LABEL: define i16 @test_vduph_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vduph_lane_p16(poly16x4_t a) {
   return vduph_lane_p16(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vdupb_laneq_p8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 poly8_t test_vdupb_laneq_p8(poly8x16_t a) {
   return vdupb_laneq_p8(a, 15);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
-// CHECK-LABEL: test_vduph_laneq_p16
+// CHECK-LABEL: define i16 @test_vduph_laneq_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 poly16_t test_vduph_laneq_p16(poly16x8_t a) {
   return vduph_laneq_p16(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
diff --git a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
index a9d46cd..ac5a090 100644
--- a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
+++ b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
@@ -1,256 +1,507 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
 
+// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
+// CHECK:   ret float [[MUL]]
 float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vmuls_lane_f32
   return vmuls_lane_f32(a, b, 1);
-  // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
+// CHECK:   ret double [[MUL]]
 float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vmuld_lane_f64
   return vmuld_lane_f64(a, b, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
+// CHECK:   ret float [[MUL]]
 float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vmuls_laneq_f32
   return vmuls_laneq_f32(a, b, 3);
-  // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
+// CHECK:   ret double [[MUL]]
 float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vmuld_laneq_f64
   return vmuld_laneq_f64(a, b, 1);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %a to double
+// CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
+// CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP4]]
 float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
-  // CHECK-LABEL: test_vmul_n_f64
   return vmul_n_f64(a, b);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2
+// CHECK:   ret float [[VMULXS_F32_I]]
 float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmulxs_lane_f32
   return vmulxs_lane_f32(a, b, 1);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2
+// CHECK:   ret float [[VMULXS_F32_I]]
 float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmulxs_laneq_f32
   return vmulxs_laneq_f32(a, b, 3);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2
+// CHECK:   ret double [[VMULXD_F64_I]]
 float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmulxd_lane_f64
   return vmulxd_lane_f64(a, b, 0);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2
+// CHECK:   ret double [[VMULXD_F64_I]]
 float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmulxd_laneq_f64
   return vmulxd_laneq_f64(a, b, 1);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vmulx_lane_f64
+// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
   return vmulx_lane_f64(a, b, 0);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
 
-// CHECK-LABEL: test_vmulx_laneq_f64_0
+// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 0);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
-// CHECK-LABEL: test_vmulx_laneq_f64_1
+// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vfmas_lane_f32
+// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmas_lane_f32(a, b, c, 1);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vfmad_lane_f64
+// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
-  // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vfmad_laneq_f64
+// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
   return vfmad_laneq_f64(a, b, c, 1);
-  // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+// CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vfma_lane_f64
+// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK:   ret <1 x double> [[FMLA2]]
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
-  // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK:   ret <1 x double> [[FMLA2]]
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vfma_laneq_f64
+// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP7]]
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
-  // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP7]]
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
-// CHECK-LABEL: test_vqdmullh_lane_s16
+// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK:   ret i32 [[TMP4]]
 int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmullh_lane_s16(a, b, 3);
-  // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9].4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmulls_lane_s32
+// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2
+// CHECK:   ret i64 [[VQDMULLS_S32_I]]
 int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulls_lane_s32(a, b, 1);
-  // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vqdmullh_laneq_s16
+// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK:   ret i32 [[TMP4]]
 int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmullh_laneq_s16(a, b, 7);
-  // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
-// CHECK-LABEL: test_vqdmulls_laneq_s32
+// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2
+// CHECK:   ret i64 [[VQDMULLS_S32_I]]
 int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulls_laneq_s32(a, b, 3);
-  // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqdmulhh_lane_s16
+// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmulhh_lane_s16(a, b, 3);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmulhs_lane_s32
+// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
+// CHECK:   ret i32 [[VQDMULHS_S32_I]]
 int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulhs_lane_s32(a, b, 1);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vqdmulhh_laneq_s16
+// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmulhh_laneq_s16(a, b, 7);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vqdmulhs_laneq_s32
+// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
+// CHECK:   ret i32 [[VQDMULHS_S32_I]]
 int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulhs_laneq_s32(a, b, 3);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqrdmulhh_lane_s16
+// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqrdmulhh_lane_s16(a, b, 3);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqrdmulhs_lane_s32
+// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
+// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
 int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqrdmulhs_lane_s32(a, b, 1);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vqrdmulhh_laneq_s16
+// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqrdmulhh_laneq_s16(a, b, 7);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vqrdmulhs_laneq_s32
+// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
+// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
 int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqrdmulhs_laneq_s32(a, b, 3);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqdmlalh_lane_s16
+// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlalh_lane_s16(a, b, c, 3);
-// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmlals_lane_s32
+// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlals_lane_s32(a, b, c, 1);
-// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vqdmlalh_laneq_s16
+// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlalh_laneq_s16(a, b, c, 7);
-// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
-// CHECK-LABEL: test_vqdmlals_laneq_s32
+// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlals_laneq_s32(a, b, c, 3);
-// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqdmlslh_lane_s16
+// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlslh_lane_s16(a, b, c, 3);
-// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmlsls_lane_s32
+// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlsls_lane_s32(a, b, c, 1);
-// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vqdmlslh_laneq_s16
+// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlslh_laneq_s16(a, b, c, 7);
-// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
-// CHECK-LABEL: test_vqdmlsls_laneq_s32
+// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlsls_laneq_s32(a, b, c, 3);
-// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vmulx_lane_f64_0:
+// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_lane_f64_0() {
       float64x1_t arg1;
       float64x1_t arg2;
@@ -259,15 +510,24 @@
       arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
       arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
       result = vmulx_lane_f64(arg1, arg2, 0);
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d0, [x[[ADDRLO]],
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d1, [x[[ADDRLO]],
-// CHECK: fmulx d0, d1, d0
       return result;
 }
 
-// CHECK-LABEL: test_vmulx_laneq_f64_2:
+// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_laneq_f64_2() {
       float64x1_t arg1;
       float64x1_t arg2;
@@ -278,10 +538,5 @@
       arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
       arg3 = vcombine_f64(arg1, arg2);
       result = vmulx_laneq_f64(arg1, arg3, 1);
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d0, [x[[ADDRLO]],
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d1, [x[[ADDRLO]],
-// CHECK: fmulx d0, d1, d0
       return result;
 }
diff --git a/test/CodeGen/aarch64-neon-shifts.c b/test/CodeGen/aarch64-neon-shifts.c
index 02d8ca1..66449f7 100644
--- a/test/CodeGen/aarch64-neon-shifts.c
+++ b/test/CodeGen/aarch64-neon-shifts.c
@@ -1,6 +1,5 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s
+// RUN:   -ffp-contract=fast -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
@@ -25,19 +24,20 @@
 uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra
   // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_u8(a, b, 5);
 }
 
 int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_smax
   // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_s8(a, b, 8);
 }
 
 uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_umax
-  // CHECK: ret <8 x i8> %a
+  // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer
+  // CHECK: ret <8 x i8> [[RES]]
   return vsra_n_u8(a, b, 8);
 }
diff --git a/test/CodeGen/aarch64-neon-tbl.c b/test/CodeGen/aarch64-neon-tbl.c
index 902fc45..0cc6645 100644
--- a/test/CodeGen/aarch64-neon-tbl.c
+++ b/test/CodeGen/aarch64-neon-tbl.c
@@ -1,463 +1,1500 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL11_I]]
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl1_s8
   return vtbl1_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 int8x8_t test_vqtbl1_s8(int8x16_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl1_s8
   return vqtbl1_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL13_I]]
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl2_s8
   return vtbl2_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 int8x8_t test_vqtbl2_s8(int8x16x2_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl2_s8
   return vqtbl2_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL26_I]]
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl3_s8
   return vtbl3_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 int8x8_t test_vqtbl3_s8(int8x16x3_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl3_s8
   return vqtbl3_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL28_I]]
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl4_s8
   return vtbl4_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 int8x8_t test_vqtbl4_s8(int8x16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl4_s8
   return vqtbl4_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL1_I]]
 int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl1q_s8
   return vqtbl1q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL2_I]]
 int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl2q_s8
   return vqtbl2q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL3_I]]
 int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl3q_s8
   return vqtbl3q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL4_I]]
 int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl4q_s8
   return vqtbl4q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx1_s8
   return vtbx1_s8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX13_I]]
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx2_s8
   return vtbx2_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx3_s8
   return vtbx3_s8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX28_I]]
 int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx4_s8
   return vtbx4_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx1_s8
   return vqtbx1_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx2_s8
   return vqtbx2_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx3_s8
   return vqtbx3_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx4_s8
   return vqtbx4_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX1_I]]
 int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx1q_s8
   return vqtbx1q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX2_I]]
 int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx2q_s8
   return vqtbx2q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX3_I]]
 int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx3q_s8
   return vqtbx3q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX4_I]]
 int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx4q_s8
   return vqtbx4q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL11_I]]
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl1_u8
   return vtbl1_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl1_u8
   return vqtbl1_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL13_I]]
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl2_u8
   return vtbl2_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl2_u8
   return vqtbl2_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL26_I]]
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl3_u8
   return vtbl3_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl3_u8
   return vqtbl3_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL28_I]]
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl4_u8
   return vtbl4_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl4_u8
   return vqtbl4_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL1_I]]
 uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl1q_u8
   return vqtbl1q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL2_I]]
 uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl2q_u8
   return vqtbl2q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL3_I]]
 uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl3q_u8
   return vqtbl3q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL4_I]]
 uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl4q_u8
   return vqtbl4q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx1_u8
   return vtbx1_u8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX13_I]]
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx2_u8
   return vtbx2_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx3_u8
   return vtbx3_u8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX28_I]]
 uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx4_u8
   return vtbx4_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx1_u8
   return vqtbx1_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx2_u8
   return vqtbx2_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx3_u8
   return vqtbx3_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx4_u8
   return vqtbx4_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX1_I]]
 uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx1q_u8
   return vqtbx1q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX2_I]]
 uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx2q_u8
   return vqtbx2q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX3_I]]
 uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx3q_u8
   return vqtbx3q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX4_I]]
 uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx4q_u8
   return vqtbx4q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL11_I]]
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl1_p8
   return vtbl1_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl1_p8
   return vqtbl1_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL13_I]]
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl2_p8
   return vtbl2_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl2_p8
   return vqtbl2_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL26_I]]
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl3_p8
   return vtbl3_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl3_p8
   return vqtbl3_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL28_I]]
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl4_p8
   return vtbl4_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl4_p8
   return vqtbl4_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL1_I]]
 poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl1q_p8
   return vqtbl1q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL2_I]]
 poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl2q_p8
   return vqtbl2q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL3_I]]
 poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl3q_p8
   return vqtbl3q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL4_I]]
 poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl4q_p8
   return vqtbl4q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx1_p8
   return vtbx1_p8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX13_I]]
 poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx2_p8
   return vtbx2_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx3_p8
   return vtbx3_p8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX28_I]]
 poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx4_p8
   return vtbx4_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx1_p8
   return vqtbx1_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx2_p8
   return vqtbx2_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx3_p8
   return vqtbx3_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx4_p8
   return vqtbx4_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX1_I]]
 poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx1q_p8
   return vqtbx1q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX2_I]]
 poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx2q_p8
   return vqtbx2q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX3_I]]
 poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx3q_p8
   return vqtbx3q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX4_I]]
 poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx4q_p8
   return vqtbx4q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
diff --git a/test/CodeGen/aarch64-neon-vcombine.c b/test/CodeGen/aarch64-neon-vcombine.c
index a750b8e..482463c 100644
--- a/test/CodeGen/aarch64-neon-vcombine.c
+++ b/test/CodeGen/aarch64-neon-vcombine.c
@@ -1,90 +1,103 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %low, <8 x i8> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vcombine_s8(int8x8_t low, int8x8_t high) {
-  // CHECK-LABEL: test_vcombine_s8:
   return vcombine_s8(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %low, <4 x i16> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vcombine_s16(int16x4_t low, int16x4_t high) {
-  // CHECK-LABEL: test_vcombine_s16:
   return vcombine_s16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %low, <2 x i32> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vcombine_s32(int32x2_t low, int32x2_t high) {
-  // CHECK-LABEL: test_vcombine_s32:
   return vcombine_s32(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vcombine_s64(int64x1_t low, int64x1_t high) {
-  // CHECK-LABEL: test_vcombine_s64:
   return vcombine_s64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %low, <8 x i8> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vcombine_u8(uint8x8_t low, uint8x8_t high) {
-  // CHECK-LABEL: test_vcombine_u8:
   return vcombine_u8(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %low, <4 x i16> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vcombine_u16(uint16x4_t low, uint16x4_t high) {
-  // CHECK-LABEL: test_vcombine_u16:
   return vcombine_u16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %low, <2 x i32> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vcombine_u32(uint32x2_t low, uint32x2_t high) {
-  // CHECK-LABEL: test_vcombine_u32:
   return vcombine_u32(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vcombine_u64(uint64x1_t low, uint64x1_t high) {
-  // CHECK-LABEL: test_vcombine_u64:
   return vcombine_u64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
-  // CHECK-LABEL: test_vcombine_p64:
   return vcombine_p64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %low, <4 x half> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %low, <4 x half> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I]]
 float16x8_t test_vcombine_f16(float16x4_t low, float16x4_t high) {
-  // CHECK-LABEL: test_vcombine_f16:
   return vcombine_f16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %low, <2 x float> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %low, <2 x float> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vcombine_f32(float32x2_t low, float32x2_t high) {
-  // CHECK-LABEL: test_vcombine_f32:
   return vcombine_f32(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %low, <8 x i8> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vcombine_p8(poly8x8_t low, poly8x8_t high) {
-  // CHECK-LABEL: test_vcombine_p8:
   return vcombine_p8(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %low, <4 x i16> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vcombine_p16(poly16x4_t low, poly16x4_t high) {
-  // CHECK-LABEL: test_vcombine_p16:
   return vcombine_p16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcombine_f64(<1 x double> %low, <1 x double> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> %low, <1 x double> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vcombine_f64(float64x1_t low, float64x1_t high) {
-  // CHECK-LABEL: test_vcombine_f64:
   return vcombine_f64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
diff --git a/test/CodeGen/aarch64-neon-vget-hilo.c b/test/CodeGen/aarch64-neon-vget-hilo.c
index 0959d09..f66bac6 100644
--- a/test/CodeGen/aarch64-neon-vget-hilo.c
+++ b/test/CodeGen/aarch64-neon-vget-hilo.c
@@ -1,176 +1,203 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-ARM64
-
+// RUN:  -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_high_s8(int8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s8:
   return vget_high_s8(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_high_s16(int16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s16:
   return vget_high_s16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_high_s32(int32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s32:
   return vget_high_s32(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_high_s64(int64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s64:
   return vget_high_s64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_high_u8(uint8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u8:
   return vget_high_u8(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_high_u16(uint16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u16:
   return vget_high_u16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_high_u32(uint32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u32:
   return vget_high_u32(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_high_u64(uint64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u64:
   return vget_high_u64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_high_p64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 poly64x1_t test_vget_high_p64(poly64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_p64:
   return vget_high_p64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_high_f16(float16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_f16:
   return vget_high_f16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_high_f32(float32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_f32:
   return vget_high_f32(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_high_p8(poly8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_p8:
   return vget_high_p8(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_high_p16(poly16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_p16
   return vget_high_p16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x double> @test_vget_high_f64(<2 x double> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x double> [[SHUFFLE_I]]
 float64x1_t test_vget_high_f64(float64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_f64
   return vget_high_f64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_low_s8(int8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s8:
   return vget_low_s8(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_low_s16(int16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s16:
   return vget_low_s16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_low_s32(int32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s32:
   return vget_low_s32(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_low_s64(int64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s64:
   return vget_low_s64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_low_u8(uint8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u8:
   return vget_low_u8(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_low_u16(uint16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u16:
   return vget_low_u16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_low_u32(uint32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u32:
   return vget_low_u32(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_low_u64(uint64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u64:
   return vget_low_u64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_low_p64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 poly64x1_t test_vget_low_p64(poly64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_p64:
   return vget_low_p64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_low_f16(float16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_f16:
   return vget_low_f16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_low_f32(float32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_f32:
   return vget_low_f32(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_low_p8(poly8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_p8:
   return vget_low_p8(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_low_p16(poly16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_p16:
   return vget_low_p16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x double> @test_vget_low_f64(<2 x double> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[SHUFFLE_I]]
 float64x1_t test_vget_low_f64(float64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_f64:
   return vget_low_f64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
diff --git a/test/CodeGen/aarch64-neon-vget.c b/test/CodeGen/aarch64-neon-vget.c
index 83c6494..87afcee 100644
--- a/test/CodeGen/aarch64-neon-vget.c
+++ b/test/CodeGen/aarch64-neon-vget.c
@@ -1,348 +1,458 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vget_lane_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vget_lane_u8:
-  // CHECK-NEXT:  umov.b w0, v0[7]
-  // CHECK-NEXT:  ret
   return vget_lane_u8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vget_lane_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_u16:
-  // CHECK-NEXT:  umov.h w0, v0[3]
-  // CHECK-NEXT:  ret
   return vget_lane_u16(a, 3);
 }
 
+// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vget_lane_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vget_lane_u32:
-  // CHECK-NEXT:  mov.s  w0, v0[1]
-  // CHECK-NEXT:  ret
   return vget_lane_u32(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vget_lane_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vget_lane_s8:
-  // CHECK-NEXT:  umov.b w0, v0[7]
-  // CHECK-NEXT:  ret
   return vget_lane_s8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vget_lane_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_s16:
-  // CHECK-NEXT:  umov.h w0, v0[3]
-  // CHECK-NEXT:  ret
   return vget_lane_s16(a, 3);
 }
 
+// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vget_lane_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vget_lane_s32:
-  // CHECK-NEXT:  mov.s  w0, v0[1]
-  // CHECK-NEXT:  ret
   return vget_lane_s32(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vget_lane_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vget_lane_p8:
-  // CHECK-NEXT:  umov.b w0, v0[7]
-  // CHECK-NEXT:  ret
   return vget_lane_p8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vget_lane_p16(poly16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_p16:
-  // CHECK-NEXT:  umov.h w0, v0[3]
-  // CHECK-NEXT:  ret
   return vget_lane_p16(a, 3);
 }
 
+// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vget_lane_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vget_lane_f32:
-  // CHECK-NEXT:  mov s0, v0[1]
-  // CHECK-NEXT:  ret
   return vget_lane_f32(a, 1);
 }
 
+// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
+// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
+// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vget_lane_f16(float16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_f16:
-  // CHECK-NEXT:  umov.h w8, v0[1]
-  // CHECK-NEXT:  fmov s0, w8
-  // CHECK-NEXT:  fcvt s0, h0
-  // CHECK-NEXT:  ret
   return vget_lane_f16(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u8:
-  // CHECK-NEXT:  umov.b w0, v0[15]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u8(a, 15);
 }
 
+// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u16:
-  // CHECK-NEXT:  umov.h w0, v0[7]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u16(a, 7);
 }
 
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u32:
-  // CHECK-NEXT:  mov.s  w0, v0[3]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u32(a, 3);
 }
 
+// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 int8_t test_vgetq_lane_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s8:
-  // CHECK-NEXT:  umov.b w0, v0[15]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s8(a, 15);
 }
 
+// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 int16_t test_vgetq_lane_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s16:
-  // CHECK-NEXT:  umov.h w0, v0[7]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s16(a, 7);
 }
 
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 int32_t test_vgetq_lane_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s32:
-  // CHECK-NEXT:  mov.s  w0, v0[3]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s32(a, 3);
 }
 
+// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vgetq_lane_p8:
-  // CHECK-NEXT:  umov.b w0, v0[15]
-  // CHECK-NEXT:  ret
   return vgetq_lane_p8(a, 15);
 }
 
+// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_p16:
-  // CHECK-NEXT:  umov.h w0, v0[7]
-  // CHECK-NEXT:  ret
   return vgetq_lane_p16(a, 7);
 }
 
+// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGETQ_LANE]]
 float32_t test_vgetq_lane_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vgetq_lane_f32:
-  // CHECK-NEXT:  mov s0, v0[3]
-  // CHECK-NEXT:  ret
   return vgetq_lane_f32(a, 3);
 }
 
+// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+// CHECK:   store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vgetq_lane_f16(float16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_f16:
-  // CHECK-NEXT:  umov.h w8, v0[3]
-  // CHECK-NEXT:  fmov s0, w8
-  // CHECK-NEXT:  fcvt s0, h0
-  // CHECK-NEXT:  ret
   return vgetq_lane_f16(a, 3);
 }
 
+// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vget_lane_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vget_lane_s64:
-  // CHECK-NEXT:  fmov x0, d0
-  // CHECK-NEXT:  ret
   return vget_lane_s64(a, 0);
 }
 
+// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vget_lane_u64(uint64x1_t a) {
-  // CHECK-LABEL: test_vget_lane_u64:
-  // CHECK-NEXT:  fmov x0, d0
-  // CHECK-NEXT:  ret
   return vget_lane_u64(a, 0);
 }
 
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 int64_t test_vgetq_lane_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s64:
-  // CHECK-NEXT:  mov.d  x0, v0[1]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u64:
-  // CHECK-NEXT:  mov.d  x0, v0[1]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u64(a, 1);
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vset_lane_u8:
-  // CHECK-NEXT:  ins.b v0[7], w0
-  // CHECK-NEXT:  ret
   return vset_lane_u8(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_u16:
-  // CHECK-NEXT:  ins.h v0[3], w0
-  // CHECK-NEXT:  ret
   return vset_lane_u16(a, b, 3);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vset_lane_u32:
-  // CHECK-NEXT:  ins.s v0[1], w0
-  // CHECK-NEXT:  ret
   return vset_lane_u32(a, b, 1);
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vset_lane_s8:
-  // CHECK-NEXT:  ins.b v0[7], w0
-  // CHECK-NEXT:  ret
   return vset_lane_s8(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_s16:
-  // CHECK-NEXT:  ins.h v0[3], w0
-  // CHECK-NEXT:  ret
   return vset_lane_s16(a, b, 3);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vset_lane_s32:
-  // CHECK-NEXT:  ins.s v0[1], w0
-  // CHECK-NEXT:  ret
   return vset_lane_s32(a, b, 1);
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vset_lane_p8:
-  // CHECK-NEXT:  ins.b v0[7], w0
-  // CHECK-NEXT:  ret
   return vset_lane_p8(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_p16:
-  // CHECK-NEXT:  ins.h v0[3], w0
-  // CHECK-NEXT:  ret
   return vset_lane_p16(a, b, 3);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VSET_LANE]]
 float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vset_lane_f32:
-  // CHECK-NEXT:  ins.s v1[1], v0[0]
-  // CHECK-NEXT:  mov.16b  v0, v1
-  // CHECK-NEXT:  ret
   return vset_lane_f32(a, b, 1);
 }
 
+// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
+// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 3
+// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
+// CHECK:   ret <4 x half> [[TMP8]]
 float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_f16:
-  // CHECK-NEXT:  ld1.h { v0 }[3], [x0]
-  // CHECK-NEXT:  ret
   return vset_lane_f16(*a, b, 3);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u8:
-  // CHECK-NEXT:  ins.b v0[15], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u8(a, b, 15);
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u16:
-  // CHECK-NEXT:  ins.h v0[7], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u16(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u32:
-  // CHECK-NEXT:  ins.s v0[3], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u32(a, b, 3);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s8:
-  // CHECK-NEXT:  ins.b v0[15], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s8(a, b, 15);
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s16:
-  // CHECK-NEXT:  ins.h v0[7], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s16(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s32:
-  // CHECK-NEXT:  ins.s v0[3], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s32(a, b, 3);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vsetq_lane_p8:
-  // CHECK-NEXT:  ins.b v0[15], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_p8(a, b, 15);
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_p16:
-  // CHECK-NEXT:  ins.h v0[7], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_p16(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VSET_LANE]]
 float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vsetq_lane_f32:
-  // CHECK-NEXT:  ins.s v1[3], v0[0]
-  // CHECK-NEXT:  mov.16b  v0, v1
-  // CHECK-NEXT:  ret
   return vsetq_lane_f32(a, b, 3);
 }
 
+// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
+// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 7
+// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
+// CHECK:   ret <8 x half> [[TMP8]]
 float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_f16:
-  // CHECK-NEXT:  ld1.h { v0 }[7], [x0]
-  // CHECK-NEXT:  ret
   return vsetq_lane_f16(*a, b, 7);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vset_lane_s64:
-  // CHECK-NEXT:  fmov d0, x0
-  // CHECK-NEXT:  ret
   return vset_lane_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vset_lane_u64:
-  // CHECK-NEXT:  fmov d0, x0
-  // CHECK-NEXT:  ret
   return vset_lane_u64(a, b, 0);
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s64:
-  // CHECK-NEXT:  ins.d v0[1], x0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u64:
-  // CHECK-NEXT:  ins.d v0[1], x0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u64(a, b, 1);
 }
diff --git a/test/CodeGen/aarch64-poly128.c b/test/CodeGen/aarch64-poly128.c
index eebecf7..01c5090 100644
--- a/test/CodeGen/aarch64-poly128.c
+++ b/test/CodeGen/aarch64-poly128.c
@@ -1,7 +1,7 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \
-// RUN:  --check-prefix=CHECK-ARM64
+// RUN:  -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
+// RUN:  | FileCheck %s
 
 // Test new aarch64 intrinsics with poly128
 // FIXME: Currently, poly128_t equals to uint128, which will be spilt into
@@ -12,192 +12,238 @@
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK:   store i128 %val, i128* [[TMP1]]
+// CHECK:   ret void
 void test_vstrq_p128(poly128_t * ptr, poly128_t val) {
-  // CHECK-LABEL: test_vstrq_p128
   vstrq_p128(ptr, val);
 
-  // CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 }
 
+// CHECK-LABEL: define i128 @test_vldrq_p128(i128* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
+// CHECK:   ret i128 [[TMP2]]
 poly128_t test_vldrq_p128(poly128_t * ptr) {
-  // CHECK-LABEL: test_vldrq_p128
   return vldrq_p128(ptr);
 
-  // CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 }
 
+// CHECK-LABEL: define void @test_ld_st_p128(i128* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
+// CHECK:   [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1
+// CHECK:   [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
+// CHECK:   store i128 [[TMP2]], i128* [[TMP4]]
+// CHECK:   ret void
 void test_ld_st_p128(poly128_t * ptr) {
-  // CHECK-LABEL: test_ld_st_p128
    vstrq_p128(ptr+1, vldrq_p128(ptr));
 
- // CHECK-ARM64: ldp [[PLO:x[0-9]+]], [[PHI:x[0-9]+]], [{{x[0-9]+}}]
- // CHECK-ARM64-NEXT: stp [[PLO]], [[PHI]], [{{x[0-9]+}}, #16]
 }
 
+// CHECK-LABEL: define i128 @test_vmull_p64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #2
+// CHECK:   [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
+// CHECK:   ret i128 [[VMULL_P641_I]]
 poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
-  // CHECK-LABEL: test_vmull_p64
   return vmull_p64(a, b);
-  // CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
 }
 
+// CHECK-LABEL: define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64
+// CHECK:   [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #2
+// CHECK:   [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
+// CHECK:   ret i128 [[VMULL_P641_I_I]]
 poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vmull_high_p64
   return vmull_high_p64(a, b);
-  // CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s8
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
   return vreinterpretq_p128_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s16
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
   return vreinterpretq_p128_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s32
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
   return vreinterpretq_p128_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
   return vreinterpretq_p128_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u8
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
   return vreinterpretq_p128_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u16
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
   return vreinterpretq_p128_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u32
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
   return vreinterpretq_p128_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
   return vreinterpretq_p128_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_f32
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
   return vreinterpretq_p128_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_f64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
   return vreinterpretq_p128_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_p8
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
   return vreinterpretq_p128_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_p16
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
   return vreinterpretq_p128_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_p64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
   return vreinterpretq_p128_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p128
-// CHECK: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
   return vreinterpretq_s8_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p128
-// CHECK: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p128(poly128_t  a) {
   return vreinterpretq_s16_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p128
-// CHECK: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
   return vreinterpretq_s32_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p128(poly128_t  a) {
   return vreinterpretq_s64_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p128
-// CHECK: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p128(poly128_t  a) {
   return vreinterpretq_u8_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p128
-// CHECK: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p128(poly128_t  a) {
   return vreinterpretq_u16_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p128
-// CHECK: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p128(poly128_t  a) {
   return vreinterpretq_u32_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p128(poly128_t  a) {
   return vreinterpretq_u64_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p128
-// CHECK: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p128(poly128_t  a) {
   return vreinterpretq_f32_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p128(poly128_t  a) {
   return vreinterpretq_f64_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p128
-// CHECK: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p128(poly128_t  a) {
   return vreinterpretq_p8_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p128
-// CHECK: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p128(poly128_t  a) {
   return vreinterpretq_p16_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_p128(poly128_t  a) {
   return vreinterpretq_p64_p128(a);
 }
diff --git a/test/CodeGen/aarch64-poly64.c b/test/CodeGen/aarch64-poly64.c
index 6ea3a2c..eadeda6 100644
--- a/test/CodeGen/aarch64-poly64.c
+++ b/test/CodeGen/aarch64-poly64.c
@@ -1,299 +1,614 @@
-// FIXME: This is a front-end test that depends on LLVM optimizations (-O3). 
-// It should be split into separate files for front/middle/back-end testing.
-
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \
-// RUN:  --check-prefix=CHECK-ARM64
+// RUN:  -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
+// RUN:  | FileCheck %s
 
 // Test new aarch64 intrinsics with poly64
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vceq_p64
   return vceq_p64(a, b);
-  // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vceqq_p64
   return vceqq_p64(a, b);
-  // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP4:%.*]] = and <1 x i64> %a, %b
+// CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VTST_I]]
 uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vtst_p64
   return vtst_p64(a, b);
-  // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP4:%.*]] = and <2 x i64> %a, %b
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VTST_I]]
 uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vtstq_p64
   return vtstq_p64(a, b);
-  // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> %a, <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %c
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
-  // CHECK-LABEL: test_vbsl_p64
   return vbsl_p64(a, b, c);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> %a, <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %c
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
-  // CHECK-LABEL: test_vbslq_p64
   return vbslq_p64(a, b, c);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 poly64_t test_vget_lane_p64(poly64x1_t v) {
-  // CHECK-LABEL: test_vget_lane_p64
   return vget_lane_p64(v, 0);
-  // CHECK: fmov  {{x[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 poly64_t test_vgetq_lane_p64(poly64x2_t v) {
-  // CHECK-LABEL: test_vgetq_lane_p64
   return vgetq_lane_p64(v, 1);
-  // CHECK: {{mov|umov}}  {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
-  // CHECK-LABEL: test_vset_lane_p64
   return vset_lane_p64(a, v, 0);
-  // CHECK: fmov  {{d[0-9]+}}, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
-  // CHECK-LABEL: test_vsetq_lane_p64
   return vsetq_lane_p64(a, v, 1);
-  // CHECK: ins  {{v[0-9]+}}.d[1], {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vcopy_lane_p64
   return vcopy_lane_p64(a, 0, b, 0);
 
-  // CHECK-ARM64: mov v0.16b, v1.16b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vcopyq_lane_p64
   return vcopyq_lane_p64(a, 1, b, 0);
-  // CHECK: zip1 v0.2d, v0.2d, v1.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vcopyq_laneq_p64
   return vcopyq_laneq_p64(a, 1, b, 1);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vcreate_p64(uint64_t a) {
-  // CHECK-LABEL: test_vcreate_p64
   return vcreate_p64(a);
-  // CHECK: fmov  {{d[0-9]+}}, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VECINIT_I]]
 poly64x1_t test_vdup_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vdup_n_p64
   return vdup_n_p64(a);
-  // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
 }
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 poly64x2_t test_vdupq_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vdupq_n_p64
   return vdupq_n_p64(a);
-  // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VECINIT_I]]
 poly64x1_t test_vmov_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vmov_n_p64
   return vmov_n_p64(a);
-  // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 poly64x2_t test_vmovq_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vmovq_n_p64
   return vmovq_n_p64(a);
-  // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
-  // CHECK-LABEL: test_vdup_lane_p64
   return vdup_lane_p64(vec, 0);
-  // CHECK: ret
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
-  // CHECK-LABEL: test_vdupq_lane_p64
   return vdupq_lane_p64(vec, 0);
-  // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
-  // CHECK-LABEL: test_vdupq_laneq_p64
   return vdupq_laneq_p64(vec, 1);
-  // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
-  // CHECK-LABEL: test_vcombine_p64
   return vcombine_p64(low, high);
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 poly64x1_t test_vld1_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld1_p64
   return vld1_p64(ptr);
-  // CHECK-ARM64: ldr {{d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld1q_p64
   return vld1q_p64(ptr);
-  // CHECK-ARM64: ldr {{q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
-  // CHECK-LABEL: test_vst1_p64
   return vst1_p64(ptr, val);
-  // CHECK-ARM64: str {{d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
-  // CHECK-LABEL: test_vst1q_p64
   return vst1q_p64(ptr, val);
-  // CHECK-ARM64: str {{q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld2_p64
   return vld2_p64(ptr);
-  // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld2q_p64
   return vld2q_p64(ptr);
-  // CHECK: ld2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld3_p64
   return vld3_p64(ptr);
-  // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld3q_p64
   return vld3q_p64(ptr);
-  // CHECK: ld3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld4_p64
   return vld4_p64(ptr);
-  // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld4q_p64
   return vld4q_p64(ptr);
-  // CHECK: ld4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
-  // CHECK-LABEL: test_vst2_p64
   return vst2_p64(ptr, val);
-  // CHECK:  st1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
-  // CHECK-LABEL: test_vst2q_p64
   return vst2q_p64(ptr, val);
-  // CHECK:  st2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
-  // CHECK-LABEL: test_vst3_p64
   return vst3_p64(ptr, val);
-  // CHECK:  st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
-  // CHECK-LABEL: test_vst3q_p64
   return vst3q_p64(ptr, val);
-  // CHECK:  st3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL6]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX7]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
-  // CHECK-LABEL: test_vst4_p64
   return vst4_p64(ptr, val);
-  // CHECK:  st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL6]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX7]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
-  // CHECK-LABEL: test_vst4q_p64
   return vst4q_p64(ptr, val);
-  // CHECK:  st4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vext_p64
   return vext_u64(a, b, 0);
 
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vextq_p64
   return vextq_p64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{#0x8|#8}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_p64
   return vzip1q_p64(a, b);
-  // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_p64
   return vzip2q_u64(a, b);
-  // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_p64
   return vuzp1q_p64(a, b);
-  // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_p64
   return vuzp2q_u64(a, b);
-  // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_p64
   return vtrn1q_p64(a, b);
-  // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_p64
   return vtrn2q_u64(a, b);
-  // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
+// CHECK:   ret <1 x i64> [[VSRI_N2]]
 poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vsri_n_p64
   return vsri_n_p64(a, b, 33);
-  // CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #33
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
+// CHECK:   ret <2 x i64> [[VSRI_N2]]
 poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vsriq_n_p64
   return vsriq_n_p64(a, b, 64);
-  // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #64
 }
 
diff --git a/test/CodeGen/aarch64-type-sizes.c b/test/CodeGen/aarch64-type-sizes.c
index 3ff8c4f..ce8b51f 100644
--- a/test/CodeGen/aarch64-type-sizes.c
+++ b/test/CodeGen/aarch64-type-sizes.c
@@ -1,8 +1,7 @@
-// RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+// RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck --check-prefix=CHECK %s
 // char by definition has size 1
 
-// CHECK-LE: target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-// CHECK-BE: target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
+// CHECK: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 int check_short() {
   return sizeof(short);
@@ -89,4 +88,3 @@
   return sizeof(enum Small);
 // CHECK: ret i32 4
 }
-
diff --git a/test/CodeGen/adc-builtins.c b/test/CodeGen/adc-builtins.c
index 5e58905..0d8d6fa 100644
--- a/test/CodeGen/adc-builtins.c
+++ b/test/CodeGen/adc-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +adx -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 #define __MM_MALLOC_H
 
diff --git a/test/CodeGen/alias.c b/test/CodeGen/alias.c
index a14bc0e..c34dcf5 100644
--- a/test/CodeGen/alias.c
+++ b/test/CodeGen/alias.c
@@ -24,20 +24,20 @@
 // CHECKBASIC-DAG: @__mod_usb_device_table = alias i32, getelementptr inbounds ([8 x i32], [8 x i32]* @wacom_usb_ids, i32 0, i32 0)
 // CHECKASM-DAG: .globl __mod_usb_device_table
 // CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids
-// CHECKASM-DAG-NOT: .size __mod_usb_device_table
+// CHECKASM-NOT: .size __mod_usb_device_table
 
 extern int g1;
 extern int g1 __attribute((alias("g0")));
 // CHECKBASIC-DAG: @g1 = alias i32, i32* @g0
 // CHECKASM-DAG: .globl g1
 // CHECKASM-DAG: g1 = g0
-// CHECKASM-DAG-NOT: .size g1
+// CHECKASM-NOT: .size g1
 
 extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS")));
 // CHECKBASIC-DAG: @__libc_errno = thread_local alias i32, i32* @TL_WITH_ALIAS
 // CHECKASM-DAG: .globl __libc_errno
 // CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS
-// CHECKASM-DAG-NOT: .size __libc_errno
+// CHECKASM-NOT: .size __libc_errno
 
 void f0(void) { }
 extern void f1(void);
diff --git a/test/CodeGen/arm-bitfield-alignment.c b/test/CodeGen/arm-bitfield-alignment.c
index 66bbdae..1c453b2 100644
--- a/test/CodeGen/arm-bitfield-alignment.c
+++ b/test/CodeGen/arm-bitfield-alignment.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - %s | FileCheck %s
 
 extern struct T {
   int b0 : 8;
diff --git a/test/CodeGen/arm-cc.c b/test/CodeGen/arm-cc.c
index 8e6aae7..b506834 100644
--- a/test/CodeGen/arm-cc.c
+++ b/test/CodeGen/arm-cc.c
@@ -3,6 +3,9 @@
 // RUN: %clang_cc1 -triple armv7-apple-darwin9 -target-abi aapcs  -emit-llvm -w -o - %s | FileCheck -check-prefix=DARWIN-AAPCS %s
 // RUN: %clang_cc1 -triple arm-none-linux-gnueabi -target-abi apcs-gnu -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-APCS %s
 // RUN: %clang_cc1 -triple arm-none-linux-gnueabi -target-abi aapcs  -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-AAPCS %s
+// RUN: %clang_cc1 -triple arm-none-linux-musleabi -target-abi apcs-gnu -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-APCS %s
+// RUN: %clang_cc1 -triple arm-none-linux-musleabi -target-abi aapcs  -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-AAPCS %s
+// RUN: %clang_cc1 -triple armv7-none-eabihf -target-abi aapcs-vfp -emit-llvm -w -o - %s | FileCheck -check-prefix=BAREMETAL-AAPCS_VFP %s
 
 
 // DARWIN-APCS-LABEL: define void @f()
@@ -13,6 +16,9 @@
 // LINUX-APCS: call arm_apcscc void @g
 // LINUX-AAPCS-LABEL: define void @f()
 // LINUX-AAPCS: call void @g
+// BAREMETAL-AAPCS_VFP-LABEL: define void @f()
+// BAREMETAL-AAPCS_VFP: call void @g
+// BAREMETAL-AAPCS_VFP: declare void @g()
 void g(void);
 void f(void) {
   g();
diff --git a/test/CodeGen/arm-crc32.c b/test/CodeGen/arm-crc32.c
index d49f20e..8a70d8c 100644
--- a/test/CodeGen/arm-crc32.c
+++ b/test/CodeGen/arm-crc32.c
@@ -1,6 +1,5 @@
-// REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi \
-// RUN:   -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 int crc32b(int a, char b)
 {
@@ -48,7 +47,7 @@
 // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32
 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32
 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32
-// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]])
+// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]])
 // CHECK: call i32 @llvm.arm.crc32w(i32 [[T3]], i32 [[T2]])
 }
 
@@ -58,6 +57,6 @@
 // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32
 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32
 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32
-// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]])
+// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]])
 // CHECK: call i32 @llvm.arm.crc32cw(i32 [[T3]], i32 [[T2]])
 }
diff --git a/test/CodeGen/arm-eabi.c b/test/CodeGen/arm-eabi.c
index 0dc04f5..3a651fe 100644
--- a/test/CodeGen/arm-eabi.c
+++ b/test/CodeGen/arm-eabi.c
@@ -7,6 +7,14 @@
 // RUN: %clang -target arm-none-gnueabi -S -meabi 5 -o - %s | FileCheck -check-prefix=CHECK-EABI %s
 // RUN: %clang -target arm-none-gnueabihf -S -o - %s | FileCheck -check-prefix=CHECK-GNUEABI %s
 // RUN: %clang -target arm-none-gnueabihf -S -meabi 5 -o - %s | FileCheck -check-prefix=CHECK-EABI %s
+// RUN: %clang -target arm-none-musleabi -S -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-GNUEABI %s
+// RUN: %clang -target arm-none-musleabi -S -o - %s -meabi 5 \
+// RUN:   | FileCheck -check-prefix=CHECK-EABI %s
+// RUN: %clang -target arm-none-musleabihf -S -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-GNUEABI %s
+// RUN: %clang -target arm-none-musleabihf -S -o - %s -meabi 5 \
+// RUN:   | FileCheck -check-prefix=CHECK-EABI %s
 
 struct my_s {
   unsigned long a[18];
diff --git a/test/CodeGen/arm-fp16-arguments.c b/test/CodeGen/arm-fp16-arguments.c
index 15a9ceb..65f076a 100644
--- a/test/CodeGen/arm-fp16-arguments.c
+++ b/test/CodeGen/arm-fp16-arguments.c
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=NATIVE
 
 __fp16 g;
 
@@ -10,12 +11,17 @@
 // HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
 // HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
 // CHECK: store i16 [[TRUNC]], i16* bitcast (half* @g to i16*)
+// NATIVE: define void @t1(half [[PARAM:%.*]])
+// NATIVE: store half [[PARAM]], half* @g
 
 __fp16 t2() { return g; }
 // SOFT: define i32 @t2()
 // HARD: define arm_aapcs_vfpcc float @t2()
+// NATIVE: define half @t2()
 // CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @g to i16*)
 // CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
 // SOFT: ret i32 [[ZEXT]]
 // HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
 // HARD: ret float [[BITCAST]]
+// NATIVE: [[LOAD:%.*]] = load half, half* @g
+// NATIVE: ret half [[LOAD]]
diff --git a/test/CodeGen/arm-neon-directed-rounding.c b/test/CodeGen/arm-neon-directed-rounding.c
index 8402931..7471b1c 100644
--- a/test/CodeGen/arm-neon-directed-rounding.c
+++ b/test/CodeGen/arm-neon-directed-rounding.c
@@ -1,75 +1,87 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+// CHECK:   [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDA_V1_I]]
 float32x2_t test_vrnda_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrnda_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
   return vrnda_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDAQ_V1_I]]
 float32x4_t test_vrndaq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndaq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a)
   return vrndaq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+// CHECK:   [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDM_V1_I]]
 float32x2_t test_vrndm_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndm_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a)
   return vrndm_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDMQ_V1_I]]
 float32x4_t test_vrndmq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndmq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a)
   return vrndmq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+// CHECK:   [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDN_V1_I]]
 float32x2_t test_vrndn_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndn_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a)
   return vrndn_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDNQ_V1_I]]
 float32x4_t test_vrndnq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndnq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a)
   return vrndnq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+// CHECK:   [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDP_V1_I]]
 float32x2_t test_vrndp_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndp_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a)
   return vrndp_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDPQ_V1_I]]
 float32x4_t test_vrndpq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndpq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a)
   return vrndpq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+// CHECK:   [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRNDX_V1_I]]
 float32x2_t test_vrndx_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndx_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a)
   return vrndx_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDXQ_V1_I]]
 float32x4_t test_vrndxq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndxq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a)
   return vrndxq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+// CHECK:   [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x float> [[VRND_V1_I]]
 float32x2_t test_vrnd_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrnd_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a)
   return vrnd_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+// CHECK:   [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x float> [[VRNDQ_V1_I]]
 float32x4_t test_vrndq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a)
   return vrndq_f32(a);
 }
diff --git a/test/CodeGen/arm-neon-fma.c b/test/CodeGen/arm-neon-fma.c
index 994702d..9311f6b 100644
--- a/test/CodeGen/arm-neon-fma.c
+++ b/test/CodeGen/arm-neon-fma.c
@@ -1,19 +1,22 @@
-// REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple thumbv7-none-linux-gnueabihf \
 // RUN:   -target-abi aapcs \
-// RUN:   -target-cpu cortex-a8 \
+// RUN:   -target-cpu cortex-a7 \
 // RUN:   -mfloat-abi hard \
 // RUN:   -ffreestanding \
-// RUN:   -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_fma_order(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) #0 {
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum) #2
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
   return vfma_f32(accum, lhs, rhs);
-// CHECK: call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum)
 }
 
+// CHECK-LABEL: define <4 x float> @test_fmaq_order(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) #0 {
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum) #2
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
   return vfmaq_f32(accum, lhs, rhs);
-// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum)
 }
diff --git a/test/CodeGen/arm-neon-numeric-maxmin.c b/test/CodeGen/arm-neon-numeric-maxmin.c
index 615a854..38f020a 100644
--- a/test/CodeGen/arm-neon-numeric-maxmin.c
+++ b/test/CodeGen/arm-neon-numeric-maxmin.c
@@ -1,27 +1,31 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b) #2
+// CHECK:   ret <2 x float> [[VMAXNM_V2_I]]
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vmaxnm_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b)
   return vmaxnm_f32(a, b);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b) #2
+// CHECK:   ret <4 x float> [[VMAXNMQ_V2_I]]
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vmaxnmq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b)
   return vmaxnmq_f32(a, b);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b) #2
+// CHECK:   ret <2 x float> [[VMINNM_V2_I]]
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vminnm_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b)
   return vminnm_f32(a, b);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b) #2
+// CHECK:   ret <4 x float> [[VMINNMQ_V2_I]]
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vminnmq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b)
   return vminnmq_f32(a, b);
 }
diff --git a/test/CodeGen/arm-neon-shifts.c b/test/CodeGen/arm-neon-shifts.c
index 7acfb89..ebaa97f 100644
--- a/test/CodeGen/arm-neon-shifts.c
+++ b/test/CodeGen/arm-neon-shifts.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple thumbv7-apple-darwin \
 // RUN:   -target-cpu cortex-a8 \
 // RUN:   -ffreestanding \
-// RUN:   -emit-llvm -w -O1 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
@@ -27,19 +27,20 @@
 uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra
   // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_u8(a, b, 5);
 }
 
 int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_smax
   // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_s8(a, b, 8);
 }
 
 uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_umax
-  // CHECK: ret <8 x i8> %a
+  // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer
+  // CHECK: ret <8 x i8> [[RES]]
   return vsra_n_u8(a, b, 8);
 }
diff --git a/test/CodeGen/arm-neon-vcvtX.c b/test/CodeGen/arm-neon-vcvtX.c
index ff8ce7e..4ea8fa8 100644
--- a/test/CodeGen/arm-neon-vcvtX.c
+++ b/test/CodeGen/arm-neon-vcvtX.c
@@ -1,99 +1,115 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTA_S32_V1_I]]
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvta_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a)
   return vcvta_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTA_U32_V1_I]]
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvta_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a)
   return vcvta_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTAQ_S32_V1_I]]
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtaq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a)
   return vcvtaq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTAQ_U32_V1_I]]
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtaq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a)
   return vcvtaq_u32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTN_S32_V1_I]]
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtn_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a)
   return vcvtn_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTN_U32_V1_I]]
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtn_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a)
   return vcvtn_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTNQ_S32_V1_I]]
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtnq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a)
   return vcvtnq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTNQ_U32_V1_I]]
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtnq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a)
   return vcvtnq_u32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTP_S32_V1_I]]
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtp_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a)
   return vcvtp_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTP_U32_V1_I]]
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtp_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a)
   return vcvtp_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTPQ_S32_V1_I]]
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtpq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a)
   return vcvtpq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTPQ_U32_V1_I]]
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtpq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a)
   return vcvtpq_u32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTM_S32_V1_I]]
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtm_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a)
   return vcvtm_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) #2
+// CHECK:   ret <2 x i32> [[VCVTM_U32_V1_I]]
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtm_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a)
   return vcvtm_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTMQ_S32_V1_I]]
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtmq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a)
   return vcvtmq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) #2
+// CHECK:   ret <4 x i32> [[VCVTMQ_U32_V1_I]]
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtmq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a)
   return vcvtmq_u32_f32(a);
 }
diff --git a/test/CodeGen/arm-neon-vget.c b/test/CodeGen/arm-neon-vget.c
index 4a710a2..3bf8905 100644
--- a/test/CodeGen/arm-neon-vget.c
+++ b/test/CodeGen/arm-neon-vget.c
@@ -1,124 +1,123 @@
-// REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple thumbv7-apple-darwin \
 // RUN:   -target-abi apcs-gnu \
 // RUN:   -target-cpu cortex-a8 \
 // RUN:   -mfloat-abi soft \
 // RUN:   -target-feature +soft-float-abi \
 // RUN:   -ffreestanding \
-// RUN:   -emit-llvm -w -O1 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
 // Check that the vget_low/vget_high intrinsics generate a single shuffle
 // without any bitcasting.
 int8x8_t low_s8(int8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return vget_low_s8(a);
 }
 
 uint8x8_t low_u8 (uint8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return vget_low_u8(a);
 }
 
 int16x4_t low_s16( int16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return vget_low_s16(a);
 }
 
 uint16x4_t low_u16(uint16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return vget_low_u16(a);
 }
 
 int32x2_t low_s32( int32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
   return vget_low_s32(a);
 }
 
 uint32x2_t low_u32(uint32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
   return vget_low_u32(a);
 }
 
 int64x1_t low_s64( int64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
   return vget_low_s64(a);
 }
 
 uint64x1_t low_u64(uint64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
   return vget_low_u64(a);
 }
 
 poly8x8_t low_p8 (poly8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return vget_low_p8(a);
 }
 
 poly16x4_t low_p16(poly16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return vget_low_p16(a);
 }
 
 float32x2_t low_f32(float32x4_t a) {
-// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
   return vget_low_f32(a);
 }
 
 
 int8x8_t high_s8(int8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return vget_high_s8(a);
 }
 
 uint8x8_t high_u8 (uint8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return vget_high_u8(a);
 }
 
 int16x4_t high_s16( int16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return vget_high_s16(a);
 }
 
 uint16x4_t high_u16(uint16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return vget_high_u16(a);
 }
 
 int32x2_t high_s32( int32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   return vget_high_s32(a);
 }
 
 uint32x2_t high_u32(uint32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   return vget_high_u32(a);
 }
 
 int64x1_t high_s64( int64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
   return vget_high_s64(a);
 }
 
 uint64x1_t high_u64(uint64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
   return vget_high_u64(a);
 }
 
 poly8x8_t high_p8 (poly8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return vget_high_p8(a);
 }
 
 poly16x4_t high_p16(poly16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return vget_high_p16(a);
 }
 
 float32x2_t high_f32(float32x4_t a) {
-// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
   return vget_high_f32(a);
 }
 
diff --git a/test/CodeGen/arm-target-features.c b/test/CodeGen/arm-target-features.c
index 7829edf..2670dfe 100644
--- a/test/CodeGen/arm-target-features.c
+++ b/test/CodeGen/arm-target-features.c
@@ -22,11 +22,14 @@
 
 
 // RUN: %clang_cc1 -triple thumbv7s-apple-ios7.0 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a35 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple armv8-linux-gnueabi -target-cpu cortex-a53 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a72 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a73 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m2 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // CHECK-BASIC-V8: "target-features"="+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon"
 
 
@@ -39,6 +42,7 @@
 
 
 // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV
+// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV
 // CHECK-VFP3-D16-FP16-DIV: "target-features"="+d16,+dsp,+fp16,+hwdiv,+hwdiv-arm,+vfp3"
 
 
diff --git a/test/CodeGen/arm-vfp-asm-constraint.c b/test/CodeGen/arm-vfp-asm-constraint.c
new file mode 100644
index 0000000..21f7362
--- /dev/null
+++ b/test/CodeGen/arm-vfp-asm-constraint.c
@@ -0,0 +1,36 @@
+// REQUIRES: arm-registered-target
+// RUN: %clang_cc1 -triple armv7-unknown-unknown -mfpmath vfp -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-NOT: error:
+
+double fabs(double x) { // CHECK-LABEL: @fabs(
+  // CHECK: call double asm "vabs.f64 ${0:P}, ${1:P}", "=w,w"(double
+  __asm__("vabs.f64 %P0, %P1"
+          : "=w"(x)
+          : "w"(x));
+  return x;
+}
+
+float fabsf(float x) { // CHECK-LABEL: @fabsf(
+  // CHECK: call float asm "vabs.f32 $0, $1", "=t,t"(float
+  __asm__("vabs.f32 %0, %1"
+          : "=t"(x)
+          : "t"(x));
+  return x;
+}
+
+double sqrt(double x) { // CHECK-LABEL: @sqrt(
+  // CHECK: call double asm "vsqrt.f64 ${0:P}, ${1:P}", "=w,w"(double
+  __asm__("vsqrt.f64 %P0, %P1"
+          : "=w"(x)
+          : "w"(x));
+  return x;
+}
+
+float sqrtf(float x) { // CHECK-LABEL: @sqrtf(
+  // CHECK: call float asm "vsqrt.f32 $0, $1", "=t,t"(float
+  __asm__("vsqrt.f32 %0, %1"
+          : "=t"(x)
+          : "t"(x));
+  return x;
+}
diff --git a/test/CodeGen/arm64-abi-vector.c b/test/CodeGen/arm64-abi-vector.c
index 29aeadb..fd828d9 100644
--- a/test/CodeGen/arm64-abi-vector.c
+++ b/test/CodeGen/arm64-abi-vector.c
@@ -1,7 +1,9 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios7 -target-abi darwinpcs -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-android -emit-llvm -o - %s | FileCheck -check-prefix=ANDROID %s
 
 #include <stdarg.h>
 
+typedef __attribute__(( ext_vector_type(2) ))  char __char2;
 typedef __attribute__(( ext_vector_type(3) ))  char __char3;
 typedef __attribute__(( ext_vector_type(4) ))  char __char4;
 typedef __attribute__(( ext_vector_type(5) ))  char __char5;
@@ -13,6 +15,26 @@
 typedef __attribute__(( ext_vector_type(5) ))  int __int5;
 typedef __attribute__(( ext_vector_type(3) ))  double __double3;
 
+// Passing legal vector types as varargs. Check that we've allocated the appropriate size
+double varargs_vec_2c(int fixed, ...) {
+// ANDROID: varargs_vec_2c
+// ANDROID: [[VAR:%.*]] = alloca <2 x i8>, align 2
+// ANDROID: [[AP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[AP_CUR:%.*]], i64 8
+// ANDROID: bitcast i8* [[AP_CUR]] to <2 x i8>*
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __char2 c3 = va_arg(ap, __char2);
+  sum = sum + c3.x + c3.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_2c(__char2 *in) {
+// ANDROID: call double (i32, ...) @varargs_vec_2c(i32 3, i16 {{%.*}})
+  return varargs_vec_2c(3, *in);
+}
+
 double varargs_vec_3c(int fixed, ...) {
 // CHECK: varargs_vec_3c
 // CHECK: alloca <3 x i8>, align 4
diff --git a/test/CodeGen/arm64-be-bitfield.c b/test/CodeGen/arm64-be-bitfield.c
index 132239a..081eab8 100644
--- a/test/CodeGen/arm64-be-bitfield.c
+++ b/test/CodeGen/arm64-be-bitfield.c
@@ -1,6 +1,4 @@
-// REQUIRES: aarch64-registered-target
 // RUN:  %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o - %s | FileCheck --check-prefix IR %s
-// RUN:  %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -S -O1 -o - %s | FileCheck --check-prefix ARM %s
 
 struct bt3 { signed b2:10; signed b3:10; } b16;
 
@@ -10,6 +8,5 @@
 // IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8
 // IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8*
 // IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4
-// ARM: asr x0, x0, #54
   return bp11.b2;
 }
diff --git a/test/CodeGen/arm64-crc32.c b/test/CodeGen/arm64-crc32.c
index 37ced18..efb51ed 100644
--- a/test/CodeGen/arm64-crc32.c
+++ b/test/CodeGen/arm64-crc32.c
@@ -1,6 +1,6 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu \
-// RUN:   -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN:   -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 int crc32b(int a, char b)
 {
diff --git a/test/CodeGen/arm64-lanes.c b/test/CodeGen/arm64-lanes.c
index 4e80df9..ea47bae 100644
--- a/test/CodeGen/arm64-lanes.c
+++ b/test/CodeGen/arm64-lanes.c
@@ -1,74 +1,127 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -O3 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-BE
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix CHECK-BE
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: @test_vdupb_lane_s8
 int8_t test_vdupb_lane_s8(int8x8_t src) {
   return vdupb_lane_s8(src, 2);
+  // CHECK-LABEL: @test_vdupb_lane_s8
   // CHECK: extractelement <8 x i8> %src, i32 2
-  // CHECK-BE: extractelement <8 x i8> %src, i32 5
+
+  // CHECK-BE-LABEL: @test_vdupb_lane_s8
+  // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2
 }
 
-// CHECK-LABEL: @test_vdupb_lane_u8
 uint8_t test_vdupb_lane_u8(uint8x8_t src) {
   return vdupb_lane_u8(src, 2);
+  // CHECK-LABEL: @test_vdupb_lane_u8
   // CHECK: extractelement <8 x i8> %src, i32 2
-  // CHECK-BE: extractelement <8 x i8> %src, i32 5
+
+  // CHECK-BE-LABEL: @test_vdupb_lane_u8
+  // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2
 }
 
-// CHECK-LABEL: @test_vduph_lane_s16
 int16_t test_vduph_lane_s16(int16x4_t src) {
   return vduph_lane_s16(src, 2);
-  // CHECK: extractelement <4 x i16> %src, i32 2
-  // CHECK-BE: extractelement <4 x i16> %src, i32 1
+  // CHECK-LABEL: @test_vduph_lane_s16
+  // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK: extractelement <4 x i16> [[TMP2]], i32 2
+
+  // CHECK-BE-LABEL: @test_vduph_lane_s16
+  // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2
 }
 
-// CHECK-LABEL: @test_vduph_lane_u16
 uint16_t test_vduph_lane_u16(uint16x4_t src) {
   return vduph_lane_u16(src, 2);
-  // CHECK: extractelement <4 x i16> %src, i32 2
-  // CHECK-BE: extractelement <4 x i16> %src, i32 1
+  // CHECK-LABEL: @test_vduph_lane_u16
+  // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK: extractelement <4 x i16> [[TMP2]], i32 2
+
+  // CHECK-BE-LABEL: @test_vduph_lane_u16
+  // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2
 }
 
-// CHECK-LABEL: @test_vdups_lane_s32
 int32_t test_vdups_lane_s32(int32x2_t src) {
   return vdups_lane_s32(src, 0);
-  // CHECK: extractelement <2 x i32> %src, i32 0
-  // CHECK-BE: extractelement <2 x i32> %src, i32 1
+  // CHECK-LABEL: @test_vdups_lane_s32
+  // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK: extractelement <2 x i32> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdups_lane_s32
+  // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0
 }
 
-// CHECK-LABEL: @test_vdups_lane_u32
 uint32_t test_vdups_lane_u32(uint32x2_t src) {
   return vdups_lane_u32(src, 0);
-  // CHECK: extractelement <2 x i32> %src, i32 0
-  // CHECK-BE: extractelement <2 x i32> %src, i32 1
+  // CHECK-LABEL: @test_vdups_lane_u32
+  // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK: extractelement <2 x i32> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdups_lane_u32
+  // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0
 }
 
-// CHECK-LABEL: @test_vdups_lane_f32
 float32_t test_vdups_lane_f32(float32x2_t src) {
   return vdups_lane_f32(src, 0);
-  // CHECK: extractelement <2 x float> %src, i32 0
-  // CHECK-BE: extractelement <2 x float> %src, i32 1
+  // CHECK-LABEL: @test_vdups_lane_f32
+  // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float>
+  // CHECK: extractelement <2 x float> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdups_lane_f32
+  // CHECK-BE: [[REV:%.*]] = shufflevector <2 x float> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x float> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float>
+  // CHECK-BE: extractelement <2 x float> [[TMP2]], i32 0
 }
 
-// CHECK-LABEL: @test_vdupd_lane_s64
 int64_t test_vdupd_lane_s64(int64x1_t src) {
   return vdupd_lane_s64(src, 0);
-  // CHECK: extractelement <1 x i64> %src, i32 0
-  // CHECK-BE: extractelement <1 x i64> %src, i32 0
+  // CHECK-LABEL: @test_vdupd_lane_s64
+  // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64>
+  // CHECK: extractelement <1 x i64> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdupd_lane_s64
+  // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0
 }
 
-// CHECK-LABEL: @test_vdupd_lane_u64
 uint64_t test_vdupd_lane_u64(uint64x1_t src) {
   return vdupd_lane_u64(src, 0);
-  // CHECK: extractelement <1 x i64> %src, i32 0
-  // CHECK-BE: extractelement <1 x i64> %src, i32 0
+  // CHECK-LABEL: @test_vdupd_lane_u64
+  // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64>
+  // CHECK: extractelement <1 x i64> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdupd_lane_u64
+  // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0
 }
 
-// CHECK-LABEL: @test_vdupd_lane_f64
 float64_t test_vdupd_lane_f64(float64x1_t src) {
   return vdupd_lane_f64(src, 0);
-  // CHECK: extractelement <1 x double> %src, i32 0
-  // CHECK-BE: extractelement <1 x double> %src, i32 0
+  // CHECK-LABEL: @test_vdupd_lane_f64
+  // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x double>
+  // CHECK: extractelement <1 x double> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdupd_lane_f64
+  // CHECK-BE: extractelement <1 x double> {{.*}}, i32 0
 }
diff --git a/test/CodeGen/arm64-scalar-test.c b/test/CodeGen/arm64-scalar-test.c
deleted file mode 100644
index e2328b1..0000000
--- a/test/CodeGen/arm64-scalar-test.c
+++ /dev/null
@@ -1,547 +0,0 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon  \
-// RUN:   -S -O1 -o - -ffreestanding %s | FileCheck %s
-
-// We're explicitly using arm_neon.h here: some types probably don't match
-// the ACLE definitions, but we want to check current codegen.
-#include <arm_neon.h>
-
-float test_vrsqrtss_f32(float a, float b) {
-// CHECK: test_vrsqrtss_f32
-  return vrsqrtss_f32(a, b);
-// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-double test_vrsqrtsd_f64(double a, double b) {
-// CHECK: test_vrsqrtsd_f64
-  return vrsqrtsd_f64(a, b);
-// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK: test_vrshl_s64
-  return vrshl_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK: test_vrshl_u64
-  return vrshl_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vrshld_s64
-int64_t test_vrshld_s64(int64_t a, int64_t b) {
-  return vrshld_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vrshld_u64
-uint64_t test_vrshld_u64(uint64_t a, uint64_t b) {
-  return vrshld_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqrshlb_s8
-int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
-  return vqrshlb_s8(a, b);
-// CHECK: sqrshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshlh_s16
-int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
-  return vqrshlh_s16(a, b);
-// CHECK: sqrshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshls_s32
-int32_t test_vqrshls_s32(int32_t a, int32_t b) {
-  return vqrshls_s32(a, b);
-// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqrshld_s64
-int64_t test_vqrshld_s64(int64_t a, int64_t b) {
-  return vqrshld_s64(a, b);
-// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqrshlb_u8
-uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) {
-  return vqrshlb_u8(a, b);
-// CHECK: uqrshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshlh_u16
-uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) {
-  return vqrshlh_u16(a, b);
-// CHECK: uqrshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshls_u32
-uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) {
-  return vqrshls_u32(a, b);
-// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqrshld_u64
-uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) {
-  return vqrshld_u64(a, b);
-// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqshlb_s8
-int8_t test_vqshlb_s8(int8_t a, int8_t b) {
-  return vqshlb_s8(a, b);
-// CHECK: sqshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshlh_s16
-int16_t test_vqshlh_s16(int16_t a, int16_t b) {
-  return vqshlh_s16(a, b);
-// CHECK: sqshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshls_s32
-int32_t test_vqshls_s32(int32_t a, int32_t b) {
-  return vqshls_s32(a, b);
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqshld_s64
-int64_t test_vqshld_s64(int64_t a, int64_t b) {
-  return vqshld_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqshld_s64_i
-int64_t test_vqshld_s64_i(int64_t a) {
-  return vqshld_s64(a, 36);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
-}
-
-// CHECK: test_vqshlb_u8
-uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) {
-  return vqshlb_u8(a, b);
-// CHECK: uqshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshlh_u16
-uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) {
-  return vqshlh_u16(a, b);
-// CHECK: uqshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshls_u32
-uint32_t test_vqshls_u32(uint32_t a, uint32_t b) {
-  return vqshls_u32(a, b);
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqshld_u64
-uint64_t test_vqshld_u64(uint64_t a, uint64_t b) {
-  return vqshld_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqshld_u64_i
-uint64_t test_vqshld_u64_i(uint64_t a) {
-  return vqshld_u64(a, 36);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
-}
-
-// CHECK: test_vshld_u64
-uint64_t test_vshld_u64(uint64_t a, uint64_t b) {
-  return vshld_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vshld_s64
-int64_t test_vshld_s64(int64_t a, int64_t b) {
-  return vshld_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqdmullh_s16
-int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
-  return vqdmullh_s16(a, b);
-// CHECK: sqdmull.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqdmulls_s32
-int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
-  return vqdmulls_s32(a, b);
-// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqaddb_s8
-int8_t test_vqaddb_s8(int8_t a, int8_t b) {
-  return vqaddb_s8(a, b);
-// CHECK: sqadd.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqaddh_s16
-int16_t test_vqaddh_s16(int16_t a, int16_t b) {
-  return vqaddh_s16(a, b);
-// CHECK: sqadd.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqadds_s32
-int32_t test_vqadds_s32(int32_t a, int32_t b) {
-  return vqadds_s32(a, b);
-// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqaddd_s64
-int64_t test_vqaddd_s64(int64_t a, int64_t b) {
-  return vqaddd_s64(a, b);
-// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqaddb_u8
-uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
-  return vqaddb_u8(a, b);
-// CHECK: uqadd.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqaddh_u16
-uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
-  return vqaddh_u16(a, b);
-// CHECK: uqadd.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqadds_u32
-uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
-  return vqadds_u32(a, b);
-// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqaddd_u64
-uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
-  return vqaddd_u64(a, b);
-// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqsubb_s8
-int8_t test_vqsubb_s8(int8_t a, int8_t b) {
-  return vqsubb_s8(a, b);
-// CHECK: sqsub.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubh_s16
-int16_t test_vqsubh_s16(int16_t a, int16_t b) {
-  return vqsubh_s16(a, b);
-// CHECK: sqsub.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubs_s32
-int32_t test_vqsubs_s32(int32_t a, int32_t b) {
-  return vqsubs_s32(a, b);
-// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqsubd_s64
-int64_t test_vqsubd_s64(int64_t a, int64_t b) {
-  return vqsubd_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqsubb_u8
-uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
-  return vqsubb_u8(a, b);
-// CHECK: uqsub.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubh_u16
-uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
-  return vqsubh_u16(a, b);
-// CHECK: uqsub.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubs_u32
-uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
-  return vqsubs_u32(a, b);
-// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqsubd_u64
-uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
-  return vqsubd_u64(a, b);
-// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqmovnh_s16
-int8_t test_vqmovnh_s16(int16_t a) {
-  return vqmovnh_s16(a);
-// CHECK: sqxtn.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovnh_u16
-uint8_t test_vqmovnh_u16(uint16_t a) {
-  return vqmovnh_u16(a);
-// CHECK: uqxtn.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovns_s32
-int16_t test_vqmovns_s32(int32_t a) {
-  return vqmovns_s32(a);
-// CHECK: sqxtn.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovns_u32
-uint16_t test_vqmovns_u32(uint32_t a) {
-  return vqmovns_u32(a);
-// CHECK: uqxtn.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovnd_s64
-int32_t test_vqmovnd_s64(int64_t a) {
-  return vqmovnd_s64(a);
-// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqmovnd_u64
-uint32_t test_vqmovnd_u64(uint64_t a) {
-  return vqmovnd_u64(a);
-// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqmovunh_s16
-int8_t test_vqmovunh_s16(int16_t a) {
-  return vqmovunh_s16(a);
-// CHECK: sqxtun.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovuns_s32
-int16_t test_vqmovuns_s32(int32_t a) {
-  return vqmovuns_s32(a);
-// CHECK: sqxtun.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovund_s64
-int32_t test_vqmovund_s64(int64_t a) {
-  return vqmovund_s64(a);
-// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqabsb_s8
-int8_t test_vqabsb_s8(int8_t a) {
-  return vqabsb_s8(a);
-// CHECK: sqabs.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqabsh_s16
-int16_t test_vqabsh_s16(int16_t a) {
-  return vqabsh_s16(a);
-// CHECK: sqabs.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqabss_s32
-int32_t test_vqabss_s32(int32_t a) {
-  return vqabss_s32(a);
-// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqabsd_s64
-int64_t test_vqabsd_s64(int64_t a) {
-  return vqabsd_s64(a);
-// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqnegb_s8
-int8_t test_vqnegb_s8(int8_t a) {
-  return vqnegb_s8(a);
-// CHECK: sqneg.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqnegh_s16
-int16_t test_vqnegh_s16(int16_t a) {
-  return vqnegh_s16(a);
-// CHECK: sqneg.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqnegs_s32
-int32_t test_vqnegs_s32(int32_t a) {
-  return vqnegs_s32(a);
-// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqnegd_s64
-int64_t test_vqnegd_s64(int64_t a) {
-  return vqnegd_s64(a);
-// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvts_n_f32_s32
-float32_t test_vcvts_n_f32_s32(int32_t a) {
-  return vcvts_n_f32_s32(a, 3);
-// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvts_n_f32_u32
-float32_t test_vcvts_n_f32_u32(uint32_t a) {
-  return vcvts_n_f32_u32(a, 3);
-// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_f64_s64
-float64_t test_vcvtd_n_f64_s64(int64_t a) {
-  return vcvtd_n_f64_s64(a, 3);
-// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_f64_u64
-float64_t test_vcvtd_n_f64_u64(uint64_t a) {
-  return vcvtd_n_f64_u64(a, 3);
-// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvts_n_s32_f32
-int32_t test_vcvts_n_s32_f32(float32_t a) {
-  return vcvts_n_s32_f32(a, 3);
-// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvts_n_u32_f32
-uint32_t test_vcvts_n_u32_f32(float32_t a) {
-  return vcvts_n_u32_f32(a, 3);
-// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_s64_f64
-int64_t test_vcvtd_n_s64_f64(float64_t a) {
-  return vcvtd_n_s64_f64(a, 3);
-// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_u64_f64
-uint64_t test_vcvtd_n_u64_f64(float64_t a) {
-  return vcvtd_n_u64_f64(a, 3);
-// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtas_s32_f32
-int32_t test_vcvtas_s32_f32(float32_t a) {
-  return vcvtas_s32_f32(a);
-// CHECK: fcvtas {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtas_u32_f32
-uint32_t test_vcvtas_u32_f32(float32_t a) {
-  return vcvtas_u32_f32(a);
-// CHECK: fcvtau {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtad_s64_f64
-int64_t test_vcvtad_s64_f64(float64_t a) {
-  return vcvtad_s64_f64(a);
-// CHECK: fcvtas {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtad_u64_f64
-uint64_t test_vcvtad_u64_f64(float64_t a) {
-  return vcvtad_u64_f64(a);
-// CHECK: fcvtau {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtms_s32_f32
-int32_t test_vcvtms_s32_f32(float32_t a) {
-  return vcvtms_s32_f32(a);
-// CHECK: fcvtms {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtms_u32_f32
-uint32_t test_vcvtms_u32_f32(float32_t a) {
-  return vcvtms_u32_f32(a);
-// CHECK: fcvtmu {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtmd_s64_f64
-int64_t test_vcvtmd_s64_f64(float64_t a) {
-  return vcvtmd_s64_f64(a);
-// CHECK: fcvtms {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtmd_u64_f64
-uint64_t test_vcvtmd_u64_f64(float64_t a) {
-  return vcvtmd_u64_f64(a);
-// CHECK: fcvtmu {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtns_s32_f32
-int32_t test_vcvtns_s32_f32(float32_t a) {
-  return vcvtns_s32_f32(a);
-// CHECK: fcvtns {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtns_u32_f32
-uint32_t test_vcvtns_u32_f32(float32_t a) {
-  return vcvtns_u32_f32(a);
-// CHECK: fcvtnu {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtnd_s64_f64
-int64_t test_vcvtnd_s64_f64(float64_t a) {
-  return vcvtnd_s64_f64(a);
-// CHECK: fcvtns {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtnd_u64_f64
-uint64_t test_vcvtnd_u64_f64(float64_t a) {
-  return vcvtnd_u64_f64(a);
-// CHECK: fcvtnu {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtps_s32_f32
-int32_t test_vcvtps_s32_f32(float32_t a) {
-  return vcvtps_s32_f32(a);
-// CHECK: fcvtps {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtps_u32_f32
-uint32_t test_vcvtps_u32_f32(float32_t a) {
-  return vcvtps_u32_f32(a);
-// CHECK: fcvtpu {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtpd_s64_f64
-int64_t test_vcvtpd_s64_f64(float64_t a) {
-  return vcvtpd_s64_f64(a);
-// CHECK: fcvtps {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtpd_u64_f64
-uint64_t test_vcvtpd_u64_f64(float64_t a) {
-  return vcvtpd_u64_f64(a);
-// CHECK: fcvtpu {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtxd_f32_f64
-float32_t test_vcvtxd_f32_f64(float64_t a) {
-  return vcvtxd_f32_f64(a);
-// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vabds_f32
-float32_t test_vabds_f32(float32_t a, float32_t b) {
-  return vabds_f32(a, b);
-  // CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vabdd_f64
-float64_t test_vabdd_f64(float64_t a, float64_t b) {
-  return vabdd_f64(a, b);
-  // CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vmulxs_f32
-float32_t test_vmulxs_f32(float32_t a, float32_t b) {
-  return vmulxs_f32(a, b);
-  // CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vmulxd_f64
-float64_t test_vmulxd_f64(float64_t a, float64_t b) {
-  return vmulxd_f64(a, b);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}
-}
diff --git a/test/CodeGen/arm64-vrsqrt.c b/test/CodeGen/arm64-vrsqrt.c
deleted file mode 100644
index 821c23c..0000000
--- a/test/CodeGen/arm64-vrsqrt.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -ffreestanding -emit-llvm -O1 -o - %s | FileCheck %s
-
-#include <arm_neon.h>
-
-uint32x2_t test_vrsqrte_u32(uint32x2_t in) {
-  // CHECK-LABEL: @test_vrsqrte_u32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %in)
-  return vrsqrte_u32(in);
-}
-
-float32x2_t test_vrsqrte_f32(float32x2_t in) {
-  // CHECK-LABEL: @test_vrsqrte_f32
-  // CHECK: call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %in)
-  return vrsqrte_f32(in);
-}
-
-
-uint32x4_t test_vrsqrteq_u32(uint32x4_t in) {
-  // CHECK-LABEL: @test_vrsqrteq_u32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %in)
-  return vrsqrteq_u32(in);
-}
-
-float32x4_t test_vrsqrteq_f32(float32x4_t in) {
-  // CHECK-LABEL: @test_vrsqrteq_f32
-  // CHECK: call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %in)
-  return vrsqrteq_f32(in);
-}
-
-
-float32x2_t test_vrsqrts_f32(float32x2_t est, float32x2_t val) {
-  // CHECK-LABEL: @test_vrsqrts_f32
-  // CHECK: call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %est, <2 x float> %val)
-  return vrsqrts_f32(est, val);
-}
-
-
-float32x4_t test_vrsqrtsq_f32(float32x4_t est, float32x4_t val) {
-  // CHECK-LABEL: @test_vrsqrtsq_f32
-  // CHECK: call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %est, <4 x float> %val)
-  return vrsqrtsq_f32(est, val);
-}
-
diff --git a/test/CodeGen/arm64_neon_high_half.c b/test/CodeGen/arm64_neon_high_half.c
deleted file mode 100644
index 6008ba5..0000000
--- a/test/CodeGen/arm64_neon_high_half.c
+++ /dev/null
@@ -1,559 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -ffreestanding -Os -S -o - %s | FileCheck %s
-// REQUIRES: aarch64-registered-target
-
-#include <arm_neon.h>
-
-int16x8_t test_vaddw_high_s8(int16x8_t lhs, int8x16_t rhs) {
-  // CHECK: saddw2.8h
-  return vaddw_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vaddw_high_s16(int32x4_t lhs, int16x8_t rhs) {
-  // CHECK: saddw2.4s
-  return vaddw_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vaddw_high_s32(int64x2_t lhs, int32x4_t rhs) {
-  // CHECK: saddw2.2d
-  return vaddw_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vaddw_high_u8(uint16x8_t lhs, uint8x16_t rhs) {
-  // CHECK: uaddw2.8h
-  return vaddw_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vaddw_high_u16(uint32x4_t lhs, uint16x8_t rhs) {
-  // CHECK: uaddw2.4s
-  return vaddw_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vaddw_high_u32(uint64x2_t lhs, uint32x4_t rhs) {
-  // CHECK: uaddw2.2d
-  return vaddw_high_u32(lhs, rhs);
-}
-
-int16x8_t test_vsubw_high_s8(int16x8_t lhs, int8x16_t rhs) {
-  // CHECK: ssubw2.8h
-  return vsubw_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vsubw_high_s16(int32x4_t lhs, int16x8_t rhs) {
-  // CHECK: ssubw2.4s
-  return vsubw_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vsubw_high_s32(int64x2_t lhs, int32x4_t rhs) {
-  // CHECK: ssubw2.2d
-  return vsubw_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vsubw_high_u8(uint16x8_t lhs, uint8x16_t rhs) {
-  // CHECK: usubw2.8h
-  return vsubw_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vsubw_high_u16(uint32x4_t lhs, uint16x8_t rhs) {
-  // CHECK: usubw2.4s
-  return vsubw_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vsubw_high_u32(uint64x2_t lhs, uint32x4_t rhs) {
-  // CHECK: usubw2.2d
-  return vsubw_high_u32(lhs, rhs);
-}
-
-int16x8_t test_vabdl_high_s8(int8x16_t lhs, int8x16_t rhs) {
-  // CHECK: sabdl2.8h
-  return vabdl_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vabdl_high_s16(int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sabdl2.4s
-  return vabdl_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vabdl_high_s32(int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sabdl2.2d
-  return vabdl_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vabdl_high_u8(uint8x16_t lhs, uint8x16_t rhs) {
-  // CHECK: uabdl2.8h
-  return vabdl_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vabdl_high_u16(uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: uabdl2.4s
-  return vabdl_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vabdl_high_u32(uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: uabdl2.2d
-  return vabdl_high_u32(lhs, rhs);
-}
-
-int16x8_t test_vabal_high_s8(int16x8_t accum, int8x16_t lhs, int8x16_t rhs) {
-  // CHECK: sabal2.8h
-  return vabal_high_s8(accum, lhs, rhs);
-}
-
-int32x4_t test_vabal_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sabal2.4s
-  return vabal_high_s16(accum, lhs, rhs);
-}
-
-int64x2_t test_vabal_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sabal2.2d
-  return vabal_high_s32(accum, lhs, rhs);
-}
-
-uint16x8_t test_vabal_high_u8(uint16x8_t accum, uint8x16_t lhs, uint8x16_t rhs) {
-  // CHECK: uabal2.8h
-  return vabal_high_u8(accum, lhs, rhs);
-}
-
-uint32x4_t test_vabal_high_u16(uint32x4_t accum, uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: uabal2.4s
-  return vabal_high_u16(accum, lhs, rhs);
-}
-
-uint64x2_t test_vabal_high_u32(uint64x2_t accum, uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: uabal2.2d
-  return vabal_high_u32(accum, lhs, rhs);
-}
-
-int32x4_t test_vqdmlal_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sqdmlal2.4s
-  return vqdmlal_high_s16(accum, lhs, rhs);
-}
-
-int64x2_t test_vqdmlal_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sqdmlal2.2d
-  return vqdmlal_high_s32(accum, lhs, rhs);
-}
-
-int32x4_t test_vqdmlsl_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sqdmlsl2.4s
-  return vqdmlsl_high_s16(accum, lhs, rhs);
-}
-
-int64x2_t test_vqdmlsl_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sqdmlsl2.2d
-  return vqdmlsl_high_s32(accum, lhs, rhs);
-}
-
-int32x4_t test_vqdmull_high_s16(int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sqdmull2.4s
-  return vqdmull_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vqdmull_high_s32(int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sqdmull2.2d
-  return vqdmull_high_s32(lhs, rhs);
-}
-
-int16x8_t test_vshll_high_n_s8(int8x16_t in) {
-  // CHECK: sshll2.8h
-  return vshll_high_n_s8(in, 7);
-}
-
-int32x4_t test_vshll_high_n_s16(int16x8_t in) {
-  // CHECK: sshll2.4s
-  return vshll_high_n_s16(in, 15);
-}
-
-int64x2_t test_vshll_high_n_s32(int32x4_t in) {
-  // CHECK: sshll2.2d
-  return vshll_high_n_s32(in, 31);
-}
-
-int16x8_t test_vshll_high_n_u8(int8x16_t in) {
-  // CHECK: ushll2.8h
-  return vshll_high_n_u8(in, 7);
-}
-
-int32x4_t test_vshll_high_n_u16(int16x8_t in) {
-  // CHECK: ushll2.4s
-  return vshll_high_n_u16(in, 15);
-}
-
-int64x2_t test_vshll_high_n_u32(int32x4_t in) {
-  // CHECK: ushll2.2d
-  return vshll_high_n_u32(in, 31);
-}
-
-int16x8_t test_vshll_high_n_s8_max(int8x16_t in) {
-  // CHECK: shll2.8h
-  return vshll_high_n_s8(in, 8);
-}
-
-int32x4_t test_vshll_high_n_s16_max(int16x8_t in) {
-  // CHECK: shll2.4s
-  return vshll_high_n_s16(in, 16);
-}
-
-int64x2_t test_vshll_high_n_s32_max(int32x4_t in) {
-  // CHECK: shll2.2d
-  return vshll_high_n_s32(in, 32);
-}
-
-int16x8_t test_vshll_high_n_u8_max(int8x16_t in) {
-  // CHECK: shll2.8h
-  return vshll_high_n_u8(in, 8);
-}
-
-int32x4_t test_vshll_high_n_u16_max(int16x8_t in) {
-  // CHECK: shll2.4s
-  return vshll_high_n_u16(in, 16);
-}
-
-int64x2_t test_vshll_high_n_u32_max(int32x4_t in) {
-  // CHECK: shll2.2d
-  return vshll_high_n_u32(in, 32);
-}
-
-int16x8_t test_vsubl_high_s8(int8x16_t lhs, int8x16_t rhs) {
-  // CHECK: ssubl2.8h
-  return vsubl_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vsubl_high_s16(int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: ssubl2.4s
-  return vsubl_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vsubl_high_s32(int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: ssubl2.2d
-  return vsubl_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vsubl_high_u8(uint8x16_t lhs, uint8x16_t rhs) {
-  // CHECK: usubl2.8h
-  return vsubl_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vsubl_high_u16(uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: usubl2.4s
-  return vsubl_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vsubl_high_u32(uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: usubl2.2d
-  return vsubl_high_u32(lhs, rhs);
-}
-
-int8x16_t test_vrshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: rshrn2.16b
-  return vrshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vrshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: rshrn2.8h
-  return vrshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vrshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: shrn2.4s
-  return vrshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vrshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: rshrn2.16b
-  return vrshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vrshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: rshrn2.8h
-  return vrshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vrshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: rshrn2.4s
-  return vrshrn_high_n_u64(lowpart, input, 2);
-}
-
-int8x16_t test_vshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: shrn2.16b
-  return vshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: shrn2.8h
-  return vshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: shrn2.4s
-  return vshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: shrn2.16b
-  return vshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: shrn2.8h
-  return vshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: shrn2.4s
-  return vshrn_high_n_u64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqshrun_high_n_s16(uint8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqshrun2.16b
-  return vqshrun_high_n_s16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqshrun_high_n_s32(uint16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqshrun2.8h
-  return vqshrun_high_n_s32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqshrun_high_n_s64(uint32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqshrun2.4s
-  return vqshrun_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqrshrun2.16b
-  return vqrshrun_high_n_s16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqrshrun2.8h
-  return vqrshrun_high_n_s32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqrshrun2.4s
-  return vqrshrun_high_n_s64(lowpart, input, 2);
-}
-
-int8x16_t test_vqshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqshrn2.16b
-  return vqshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vqshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqshrn2.8h
-  return vqshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vqshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqshrn2.4s
-  return vqshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: uqshrn2.16b
-  return vqshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: uqshrn2.8h
-  return vqshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: uqshrn2.4s
-  return vqshrn_high_n_u64(lowpart, input, 2);
-}
-
-int8x16_t test_vqrshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqrshrn2.16b
-  return vqrshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vqrshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqrshrn2.8h
-  return vqrshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vqrshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqrshrn2.4s
-  return vqrshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: uqrshrn2.16b
-  return vqrshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: uqrshrn2.8h
-  return vqrshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: uqrshrn2.4s
-  return vqrshrn_high_n_u64(lowpart, input, 2);
-}
-
-int8x16_t test_vaddhn_high_s16(int8x8_t lowpart, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: addhn2.16b v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s16(lowpart, lhs, rhs);
-}
-
-int16x8_t test_vaddhn_high_s32(int16x4_t lowpart, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: addhn2.8h v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s32(lowpart, lhs, rhs);
-}
-
-int32x4_t test_vaddhn_high_s64(int32x2_t lowpart, int64x2_t lhs, int64x2_t rhs) {
-  // CHECK: addhn2.4s v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s64(lowpart, lhs, rhs);
-}
-
-uint8x16_t test_vaddhn_high_u16(uint8x8_t lowpart, uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: addhn2.16b v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s16(lowpart, lhs, rhs);
-}
-
-uint16x8_t test_vaddhn_high_u32(uint16x4_t lowpart, uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: addhn2.8h v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s32(lowpart, lhs, rhs);
-}
-
-uint32x4_t test_vaddhn_high_u64(uint32x2_t lowpart, uint64x2_t lhs, uint64x2_t rhs) {
-  // CHECK: addhn2.4s v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s64(lowpart, lhs, rhs);
-}
-
-int8x16_t test_vraddhn_high_s16(int8x8_t lowpart, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: raddhn2.16b v0, v1, v2
-  return vraddhn_high_s16(lowpart, lhs, rhs);
-}
-
-int16x8_t test_vraddhn_high_s32(int16x4_t lowpart, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: raddhn2.8h v0, v1, v2
-  return vraddhn_high_s32(lowpart, lhs, rhs);
-}
-
-int32x4_t test_vraddhn_high_s64(int32x2_t lowpart, int64x2_t lhs, int64x2_t rhs) {
-  // CHECK: raddhn2.4s v0, v1, v2
-  return vraddhn_high_s64(lowpart, lhs, rhs);
-}
-
-uint8x16_t test_vraddhn_high_u16(uint8x8_t lowpart, uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: raddhn2.16b v0, v1, v2
-  return vraddhn_high_s16(lowpart, lhs, rhs);
-}
-
-uint16x8_t test_vraddhn_high_u32(uint16x4_t lowpart, uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: raddhn2.8h v0, v1, v2
-  return vraddhn_high_s32(lowpart, lhs, rhs);
-}
-
-uint32x4_t test_vraddhn_high_u64(uint32x2_t lowpart, uint64x2_t lhs, uint64x2_t rhs) {
-  // CHECK: raddhn2.4s v0, v1, v2
-  return vraddhn_high_s64(lowpart, lhs, rhs);
-}
-
-int8x16_t test_vmovn_high_s16(int8x8_t lowpart, int16x8_t wide) {
-  // CHECK: xtn2.16b v0, v1
-  return vmovn_high_s16(lowpart, wide);
-}
-
-int16x8_t test_vmovn_high_s32(int16x4_t lowpart, int32x4_t wide) {
-  // CHECK: xtn2.8h v0, v1
-  return vmovn_high_s32(lowpart, wide);
-}
-
-int32x4_t test_vmovn_high_s64(int32x2_t lowpart, int64x2_t wide) {
-  // CHECK: xtn2.4s v0, v1
-  return vmovn_high_s64(lowpart, wide);
-}
-
-uint8x16_t test_vmovn_high_u16(uint8x8_t lowpart, uint16x8_t wide) {
-  // CHECK: xtn2.16b v0, v1
-  return vmovn_high_u16(lowpart, wide);
-}
-
-uint16x8_t test_vmovn_high_u32(uint16x4_t lowpart, uint32x4_t wide) {
-  // CHECK: xtn2.8h v0, v1
-  return vmovn_high_u32(lowpart, wide);
-}
-
-uint32x4_t test_vmovn_high_u64(uint32x2_t lowpart, uint64x2_t wide) {
-  // CHECK: xtn2.4s v0, v1
-  return vmovn_high_u64(lowpart, wide);
-}
-
-int8x16_t test_vqmovn_high_s16(int8x8_t lowpart, int16x8_t wide) {
-  // CHECK: sqxtn2.16b v0, v1
-  return vqmovn_high_s16(lowpart, wide);
-}
-
-int16x8_t test_vqmovn_high_s32(int16x4_t lowpart, int32x4_t wide) {
-  // CHECK: sqxtn2.8h v0, v1
-  return vqmovn_high_s32(lowpart, wide);
-}
-
-int32x4_t test_vqmovn_high_s64(int32x2_t lowpart, int64x2_t wide) {
-  // CHECK: sqxtn2.4s v0, v1
-  return vqmovn_high_s64(lowpart, wide);
-}
-
-uint8x16_t test_vqmovn_high_u16(uint8x8_t lowpart, int16x8_t wide) {
-  // CHECK: uqxtn2.16b v0, v1
-  return vqmovn_high_u16(lowpart, wide);
-}
-
-uint16x8_t test_vqmovn_high_u32(uint16x4_t lowpart, int32x4_t wide) {
-  // CHECK: uqxtn2.8h v0, v1
-  return vqmovn_high_u32(lowpart, wide);
-}
-
-uint32x4_t test_vqmovn_high_u64(uint32x2_t lowpart, int64x2_t wide) {
-  // CHECK: uqxtn2.4s v0, v1
-  return vqmovn_high_u64(lowpart, wide);
-}
-
-uint8x16_t test_vqmovun_high_s16(uint8x8_t lowpart, int16x8_t wide) {
-  // CHECK: sqxtun2.16b v0, v1
-  return vqmovun_high_s16(lowpart, wide);
-}
-
-uint16x8_t test_vqmovun_high_s32(uint16x4_t lowpart, int32x4_t wide) {
-  // CHECK: sqxtun2.8h v0, v1
-  return vqmovun_high_s32(lowpart, wide);
-}
-
-uint32x4_t test_vqmovun_high_s64(uint32x2_t lowpart, int64x2_t wide) {
-  // CHECK: sqxtun2.4s v0, v1
-  return vqmovun_high_s64(lowpart, wide);
-}
-
-float32x4_t test_vcvtx_high_f32_f64(float32x2_t lowpart, float64x2_t wide) {
-  // CHECK: fcvtxn2 v0.4s, v1.2d
-  return vcvtx_high_f32_f64(lowpart, wide);
-}
-
-float64x2_t test_vcvt_f64_f32(float32x2_t x) {
-  // CHECK: fcvtl v0.2d, v0.2s
-  return vcvt_f64_f32(x);
-}
-
-float64x2_t test_vcvt_high_f64_f32(float32x4_t x) {
-  // CHECK: fcvtl2 v0.2d, v0.4s
-  return vcvt_high_f64_f32(x);
-}
-
-float32x2_t test_vcvt_f32_f64(float64x2_t v) {
-  // CHECK: fcvtn v0.2s, v0.2d
-  return vcvt_f32_f64(v);
-}
-
-float32x4_t test_vcvt_high_f32_f64(float32x2_t x, float64x2_t v) {
-  // CHECK: fcvtn2 v0.4s, v1.2d
-  return vcvt_high_f32_f64(x, v);
-}
-
-float32x2_t test_vcvtx_f32_f64(float64x2_t v) {
-  // CHECK: fcvtxn v0.2s, v0.2d
-  return vcvtx_f32_f64(v);
-}
diff --git a/test/CodeGen/arm64_vCMP.c b/test/CodeGen/arm64_vCMP.c
deleted file mode 100644
index a302128..0000000
--- a/test/CodeGen/arm64_vCMP.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-
-// Test ARM64 SIMD fused multiply add intrinsics
-
-#include <arm_neon.h>
-
-int64x2_t test_vabsq_s64(int64x2_t a1) {
-  // CHECK: test_vabsq_s64
-  return vabsq_s64(a1);
-  // CHECK: llvm.aarch64.neon.abs.v2i64
-  // CHECK-NEXT: ret
-}
-
-int64_t test_vceqd_s64(int64_t a1, int64_t a2) {
-  // CHECK: test_vceqd_s64
-  return vceqd_s64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp eq i64 %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-int64_t test_vceqd_f64(float64_t a1, float64_t a2) {
-  // CHECK: test_vceqd_f64
-  return vceqd_f64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = fcmp oeq double %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-uint64_t test_vcgtd_u64(uint64_t a1, uint64_t a2) {
-  // CHECK: test_vcgtd_u64
-  return vcgtd_u64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp ugt i64 %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-uint64_t test_vcled_u64(uint64_t a1, uint64_t a2) {
-  // CHECK: test_vcled_u64
-  return vcled_u64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp ule i64 %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-int64_t test_vceqzd_s64(int64_t a1) {
-  // CHECK: test_vceqzd_s64
-  return vceqzd_s64(a1);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp eq i64 %a1, 0
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-uint64x2_t test_vceqq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vceqq_u64
-  return vceqq_u64(a1, a2);
-  // CHECK:  icmp eq <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgeq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcgeq_s64
-  return vcgeq_s64(a1, a2);
-  // CHECK:  icmp sge <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgeq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcgeq_u64
-  return vcgeq_u64(a1, a2);
-  // CHECK:  icmp uge <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgtq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcgtq_s64
-  return vcgtq_s64(a1, a2);
-  // CHECK: icmp sgt <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgtq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcgtq_u64
-  return vcgtq_u64(a1, a2);
-  // CHECK: icmp ugt <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcleq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcleq_s64
-  return vcleq_s64(a1, a2);
-  // CHECK: icmp sle <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcleq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcleq_u64
-  return vcleq_u64(a1, a2);
-  // CHECK: icmp ule <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcltq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcltq_s64
-  return vcltq_s64(a1, a2);
-  // CHECK: icmp slt <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcltq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcltq_u64
-  return vcltq_u64(a1, a2);
-  // CHECK: icmp ult <2 x i64> %a1, %a2
-}
-
-int64x2_t test_vqabsq_s64(int64x2_t a1) {
-  // CHECK: test_vqabsq_s64
-  return vqabsq_s64(a1);
-  // CHECK: llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a1)
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vLdStNum_lane.c b/test/CodeGen/arm64_vLdStNum_lane.c
deleted file mode 100644
index 85229d5..0000000
--- a/test/CodeGen/arm64_vLdStNum_lane.c
+++ /dev/null
@@ -1,141 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD load and stores of an N-element structure  intrinsics
-
-#include <arm_neon.h>
-
-int64x2x2_t test_vld2q_lane_s64(const void * a1, int64x2x2_t a2) {
-  // CHECK: test_vld2q_lane_s64
-  return vld2q_lane_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld2lane.v2i64.p0i8
-}
-
-uint64x2x2_t test_vld2q_lane_u64(const void * a1, uint64x2x2_t a2) {
-  // CHECK: test_vld2q_lane_u64
-  return vld2q_lane_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld2lane.v2i64.p0i8
-}
-
-int64x1x2_t test_vld2_lane_s64(const void * a1, int64x1x2_t a2) {
-  // CHECK: test_vld2_lane_s64
-  return vld2_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld2lane.v1i64.p0i8
-}
-
-uint64x1x2_t test_vld2_lane_u64(const void * a1, uint64x1x2_t a2) {
-  // CHECK: test_vld2_lane_u64
-  return vld2_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld2lane.v1i64.p0i8
-}
-
-poly8x16x2_t test_vld2q_lane_p8(const void * a1, poly8x16x2_t a2) {
-  // CHECK: test_vld2q_lane_p8
-  return vld2q_lane_p8(a1, a2, 0);
-  // CHECK: extractvalue {{.*}} 0{{ *$}}
-  // CHECK: extractvalue {{.*}} 1{{ *$}}
-}
-
-uint8x16x2_t test_vld2q_lane_u8(const void * a1, uint8x16x2_t a2) {
-  // CHECK: test_vld2q_lane_u8
-  return vld2q_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld2lane.v16i8.p0i8
-}
-
-int64x2x3_t test_vld3q_lane_s64(const void * a1, int64x2x3_t a2) {
-  // CHECK: test_vld3q_lane_s64
-  return vld3q_lane_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld3lane.v2i64.p0i8
-}
-
-uint64x2x3_t test_vld3q_lane_u64(const void * a1, uint64x2x3_t a2) {
-  // CHECK: test_vld3q_lane_u64
-  return vld3q_lane_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld3lane.v2i64.p0i8
-}
-
-int64x1x3_t test_vld3_lane_s64(const void * a1, int64x1x3_t a2) {
-  // CHECK: test_vld3_lane_s64
-  return vld3_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v1i64.p0i8
-}
-
-uint64x1x3_t test_vld3_lane_u64(const void * a1, uint64x1x3_t a2) {
-  // CHECK: test_vld3_lane_u64
-  return vld3_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v1i64.p0i8
-}
-
-int8x8x3_t test_vld3_lane_s8(const void * a1, int8x8x3_t a2) {
-  // CHECK: test_vld3_lane_s8
-  return vld3_lane_s8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v8i8.p0i8
-}
-
-poly8x16x3_t test_vld3q_lane_p8(const void * a1, poly8x16x3_t a2) {
-  // CHECK: test_vld3q_lane_p8
-  return vld3q_lane_p8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v16i8.p0i8
-}
-
-uint8x16x3_t test_vld3q_lane_u8(const void * a1, uint8x16x3_t a2) {
-  // CHECK: test_vld3q_lane_u8
-  return vld3q_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v16i8.p0i8
-}
-
-int64x2x4_t test_vld4q_lane_s64(const void * a1, int64x2x4_t a2) {
-  // CHECK: test_vld4q_lane_s64
-  return vld4q_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v2i64.p0i8
-}
-
-uint64x2x4_t test_vld4q_lane_u64(const void * a1, uint64x2x4_t a2) {
-  // CHECK: test_vld4q_lane_u64
-  return vld4q_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v2i64.p0i8
-}
-
-int64x1x4_t test_vld4_lane_s64(const void * a1, int64x1x4_t a2) {
-  // CHECK: test_vld4_lane_s64
-  return vld4_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v1i64.p0i8
-}
-
-uint64x1x4_t test_vld4_lane_u64(const void * a1, uint64x1x4_t a2) {
-  // CHECK: test_vld4_lane_u64
-  return vld4_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v1i64.p0i8
-}
-
-int8x8x4_t test_vld4_lane_s8(const void * a1, int8x8x4_t a2) {
-  // CHECK: test_vld4_lane_s8
-  return vld4_lane_s8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v8i8.p0i8
-}
-
-uint8x8x4_t test_vld4_lane_u8(const void * a1, uint8x8x4_t a2) {
-  // CHECK: test_vld4_lane_u8
-  return vld4_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v8i8.p0i8
-}
-
-poly8x16x4_t test_vld4q_lane_p8(const void * a1, poly8x16x4_t a2) {
-  // CHECK: test_vld4q_lane_p8
-  return vld4q_lane_p8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v16i8.p0i8
-}
-
-int8x16x4_t test_vld4q_lane_s8(const void * a1, int8x16x4_t a2) {
-  // CHECK: test_vld4q_lane_s8
-  return vld4q_lane_s8(a1, a2, 0);
-  // CHECK: extractvalue {{.*}} 0{{ *$}}
-  // CHECK: extractvalue {{.*}} 1{{ *$}}
-  // CHECK: extractvalue {{.*}} 2{{ *$}}
-  // CHECK: extractvalue {{.*}} 3{{ *$}}
-}
-
-uint8x16x4_t test_vld4q_lane_u8(const void * a1, uint8x16x4_t a2) {
-  // CHECK: test_vld4q_lane_u8
-  return vld4q_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v16i8.p0i8
-}
-
diff --git a/test/CodeGen/arm64_vMaxMin.c b/test/CodeGen/arm64_vMaxMin.c
deleted file mode 100644
index a1dd2ad..0000000
--- a/test/CodeGen/arm64_vMaxMin.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck -check-prefix=CHECK-CODEGEN %s
-// REQUIRES: aarch64-registered-target
-// Test ARM64 SIMD max/min intrinsics
-
-#include <arm_neon.h>
-
-// Test a represntative sample of 8 and 16, signed and unsigned, 64 and 128 bit reduction
-int8_t test_vmaxv_s8(int8x8_t a1) {
-  // CHECK-LABEL: define i8 @test_vmaxv_s8(
-  return vmaxv_s8(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(
-}
-
-uint16_t test_vminvq_u16(uint16x8_t a1) {
-  // CHECK-LABEL: define i16 @test_vminvq_u16(
-  return vminvq_u16(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.uminv.i32.v8i16(
-}
-
-// Test a represntative sample of 8 and 16, signed and unsigned, 64 and 128 bit pairwise
-uint8x8_t test_vmin_u8(uint8x8_t a1, uint8x8_t a2) {
-  // CHECK-LABEL: define <8 x i8> @test_vmin_u8(
-  return vmin_u8(a1, a2);
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.umin.v8i8(
-}
-
-uint8x16_t test_vminq_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK-LABEL: define <16 x i8> @test_vminq_u8(
-  return vminq_u8(a1, a2);
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.umin.v16i8(
-}
-
-int16x8_t test_vmaxq_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(
-  return vmaxq_s16(a1, a2);
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.smax.v8i16(
-}
-
-// Test the more complicated cases of [suf]32 and f64
-float64x2_t test_vmaxq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK-LABEL: define <2 x double> @test_vmaxq_f64(
-  return vmaxq_f64(a1, a2);
-  // CHECK: call <2 x double> @llvm.aarch64.neon.fmax.v2f64(
-}
-
-float32x4_t test_vmaxq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK-LABEL: define <4 x float> @test_vmaxq_f32(
-  return vmaxq_f32(a1, a2);
-  // CHECK: call <4 x float> @llvm.aarch64.neon.fmax.v4f32(
-}
-
-float64x2_t test_vminq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK-LABEL: define <2 x double> @test_vminq_f64(
-  return vminq_f64(a1, a2);
-  // CHECK: call <2 x double> @llvm.aarch64.neon.fmin.v2f64(
-}
-
-float32x2_t test_vmax_f32(float32x2_t a1, float32x2_t a2) {
-  // CHECK-LABEL: define <2 x float> @test_vmax_f32(
-  return vmax_f32(a1, a2);
-  // CHECK: call <2 x float> @llvm.aarch64.neon.fmax.v2f32(
-}
-
-int32x2_t test_vmax_s32(int32x2_t a1, int32x2_t a2) {
-  // CHECK-LABEL: define <2 x i32> @test_vmax_s32(
-  return vmax_s32(a1, a2);
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.smax.v2i32(
-}
-
-uint32x2_t test_vmin_u32(uint32x2_t a1, uint32x2_t a2) {
-  // CHECK-LABEL: define <2 x i32> @test_vmin_u32(
-  return vmin_u32(a1, a2);
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.umin.v2i32(
-}
-
-float32_t test_vmaxnmv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vmaxnmv_f32(
-  return vmaxnmv_f32(a1);
-  // CHECK: llvm.aarch64.neon.fmaxnmv.f32.v2f32
-  // CHECK-NEXT: ret
-}
-
-// this doesn't translate into a valid instruction, regardless of what the
-// ARM doc says.
-#if 0
-float64_t test_vmaxnmvq_f64(float64x2_t a1) {
-  // CHECK@ test_vmaxnmvq_f64
-  return vmaxnmvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-#endif
-
-float32_t test_vmaxnmvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vmaxnmvq_f32(
-  return vmaxnmvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vmaxv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vmaxv_f32(
-  return vmaxv_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fmaxv.f32.v2f32(
-  // FIXME check that the 2nd and 3rd arguments are the same V register below
-  // CHECK-CODEGEN: fmaxp.2s
-  // CHECK-NEXT: ret
-}
-
-int32_t test_vmaxv_s32(int32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vmaxv_s32(
-  return vmaxv_s32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(
-  // FIXME check that the 2nd and 3rd arguments are the same V register below
-  // CHECK-CODEGEN: smaxp.2s
-  // CHECK-NEXT: ret
-}
-
-uint32_t test_vmaxv_u32(uint32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vmaxv_u32(
-  return vmaxv_u32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(
-  // FIXME check that the 2nd and 3rd arguments are the same V register below
-  // CHECK-CODEGEN: umaxp.2s
-  // CHECK-NEXT: ret
-}
-
-// FIXME punt on this for now; don't forget to fix CHECKs
-#if 0
-float64_t test_vmaxvq_f64(float64x2_t a1) {
-  // CHECK@ test_vmaxvq_f64
-  return vmaxvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.fmaxv.i64.v2f64
-  // CHECK-NEXT@ ret
-}
-#endif
-
-float32_t test_vmaxvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vmaxvq_f32(
-  return vmaxvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fmaxv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vminnmv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vminnmv_f32(
-  return vminnmv_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminnmv.f32.v2f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vminvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vminvq_f32(
-  return vminvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-// this doesn't translate into a valid instruction, regardless of what the ARM
-// doc says.
-#if 0
-float64_t test_vminnmvq_f64(float64x2_t a1) {
-  // CHECK@ test_vminnmvq_f64
-  return vminnmvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-#endif
-
-float32_t test_vminnmvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vminnmvq_f32(
-  return vminnmvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminnmv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vminv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vminv_f32(
-  return vminv_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminv.f32.v2f32(
-  // CHECK-NEXT: ret
-}
-
-int32_t test_vminv_s32(int32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vminv_s32(
-  return vminv_s32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.sminv.i32.v2i32(
-  // CHECK-CODEGEN: sminp.2s
-  // CHECK-NEXT: ret
-}
-
-uint32_t test_vminv_u32(uint32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vminv_u32(
-  return vminv_u32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.uminv.i32.v2i32(
-}
-
-// FIXME punt on this for now; don't forget to fix CHECKs
-#if 0
-float64_t test_vminvq_f64(float64x2_t a1) {
-  // CHECK@ test_vminvq_f64
-  return vminvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-#endif
diff --git a/test/CodeGen/arm64_vadd.c b/test/CodeGen/arm64_vadd.c
deleted file mode 100644
index 7b2913f..0000000
--- a/test/CodeGen/arm64_vadd.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD add intrinsics
-
-#include <arm_neon.h>
-int64_t test_vaddlv_s32(int32x2_t a1) {
-  // CHECK: test_vaddlv_s32
-  return vaddlv_s32(a1);
-  // CHECK: llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT: ret
-}
-
-uint64_t test_vaddlv_u32(uint32x2_t a1) {
-  // CHECK: test_vaddlv_u32
-  return vaddlv_u32(a1);
-  // CHECK: llvm.aarch64.neon.uaddlv.i64.v2i32
-  // CHECK-NEXT: ret
-}
-
-int8_t test_vaddv_s8(int8x8_t a1) {
-  // CHECK: test_vaddv_s8
-  return vaddv_s8(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v8i8
-  // don't check for return here (there's a trunc?)
-}
-
-int16_t test_vaddv_s16(int16x4_t a1) {
-  // CHECK: test_vaddv_s16
-  return vaddv_s16(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v4i16
-  // don't check for return here (there's a trunc?)
-}
-
-int32_t test_vaddv_s32(int32x2_t a1) {
-  // CHECK: test_vaddv_s32
-  return vaddv_s32(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v2i32
-  // CHECK-NEXT: ret
-}
-
-uint8_t test_vaddv_u8(int8x8_t a1) {
-  // CHECK: test_vaddv_u8
-  return vaddv_u8(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v8i8
-  // don't check for return here (there's a trunc?)
-}
-
-uint16_t test_vaddv_u16(int16x4_t a1) {
-  // CHECK: test_vaddv_u16
-  return vaddv_u16(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v4i16
-  // don't check for return here (there's a trunc?)
-}
-
-uint32_t test_vaddv_u32(int32x2_t a1) {
-  // CHECK: test_vaddv_u32
-  return vaddv_u32(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v2i32
-  // CHECK-NEXT: ret
-}
-
-int8_t test_vaddvq_s8(int8x16_t a1) {
-  // CHECK: test_vaddvq_s8
-  return vaddvq_s8(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v16i8
-  // don't check for return here (there's a trunc?)
-}
-
-int16_t test_vaddvq_s16(int16x8_t a1) {
-  // CHECK: test_vaddvq_s16
-  return vaddvq_s16(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v8i16
-  // don't check for return here (there's a trunc?)
-}
-
-int32_t test_vaddvq_s32(int32x4_t a1) {
-  // CHECK: test_vaddvq_s32
-  return vaddvq_s32(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v4i32
-  // CHECK-NEXT: ret
-}
-
-uint8_t test_vaddvq_u8(int8x16_t a1) {
-  // CHECK: test_vaddvq_u8
-  return vaddvq_u8(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v16i8
-  // don't check for return here (there's a trunc?)
-}
-
-uint16_t test_vaddvq_u16(int16x8_t a1) {
-  // CHECK: test_vaddvq_u16
-  return vaddvq_u16(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v8i16
-  // don't check for return here (there's a trunc?)
-}
-
-uint32_t test_vaddvq_u32(int32x4_t a1) {
-  // CHECK: test_vaddvq_u32
-  return vaddvq_u32(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v4i32
-  // CHECK-NEXT: ret
-}
-
diff --git a/test/CodeGen/arm64_vca.c b/test/CodeGen/arm64_vca.c
deleted file mode 100644
index 00cc283..0000000
--- a/test/CodeGen/arm64_vca.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 vector compare absolute intrinsics
-
-#include <arm_neon.h>
-
-uint32x2_t test_vcale_f32(float32x2_t a1, float32x2_t a2) {
-  // CHECK: test_vcale_f32
-  return vcale_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v2i32.v2f32
-  // no check for ret here, as there is a bitcast
-}
-
-uint32x4_t test_vcaleq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK: test_vcaleq_f32
-  return vcaleq_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v4i32.v4f32{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint32x2_t test_vcalt_f32(float32x2_t a1, float32x2_t a2) {
-  // CHECK: test_vcalt_f32
-  return vcalt_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v2i32.v2f32{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint32x4_t test_vcaltq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK: test_vcaltq_f32
-  return vcaltq_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v4i32.v4f32{{.*a2,.*a1}}
-}
-
-uint64x2_t test_vcagtq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcagtq_f64
-  return vcagtq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v2i64.v2f64{{.*a1,.*a2}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint64x2_t test_vcaltq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcaltq_f64
-  return vcaltq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v2i64.v2f64{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint64x2_t test_vcageq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcageq_f64
-  return vcageq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v2i64.v2f64{{.*a1,.*a2}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint64x2_t test_vcaleq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcaleq_f64
-  return vcaleq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v2i64.v2f64{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
diff --git a/test/CodeGen/arm64_vcopy.c b/test/CodeGen/arm64_vcopy.c
index 990d4f6..4c01430 100644
--- a/test/CodeGen/arm64_vcopy.c
+++ b/test/CodeGen/arm64_vcopy.c
@@ -1,69 +1,121 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
 
 // Test ARM64 SIMD copy vector element to vector element: vcopyq_lane*
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %a1, <16 x i8> %a2) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 int8x16_t test_vcopyq_laneq_s8(int8x16_t a1, int8x16_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s8
   return vcopyq_laneq_s8(a1, (int64_t) 3, a2, (int64_t) 13);
-  // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_u8(<16 x i8> %a1, <16 x i8> %a2) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 uint8x16_t test_vcopyq_laneq_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u8
   return vcopyq_laneq_u8(a1, (int64_t) 3, a2, (int64_t) 13);
-  // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_s16(<8 x i16> %a1, <8 x i16> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 int16x8_t test_vcopyq_laneq_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s16
   return vcopyq_laneq_s16(a1, (int64_t) 3, a2, (int64_t) 7);
-  // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
 
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_u16(<8 x i16> %a1, <8 x i16> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 uint16x8_t test_vcopyq_laneq_u16(uint16x8_t a1, uint16x8_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u16
   return vcopyq_laneq_u16(a1, (int64_t) 3, a2, (int64_t) 7);
-  // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
 
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_s32(<4 x i32> %a1, <4 x i32> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 int32x4_t test_vcopyq_laneq_s32(int32x4_t a1, int32x4_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s32
   return vcopyq_laneq_s32(a1, (int64_t) 3, a2, (int64_t) 3);
-  // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_u32(<4 x i32> %a1, <4 x i32> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 uint32x4_t test_vcopyq_laneq_u32(uint32x4_t a1, uint32x4_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u32
   return vcopyq_laneq_u32(a1, (int64_t) 3, a2, (int64_t) 3);
-  // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_s64(<2 x i64> %a1, <2 x i64> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 int64x2_t test_vcopyq_laneq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s64
   return vcopyq_laneq_s64(a1, (int64_t) 0, a2, (int64_t) 1);
-  // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_u64(<2 x i64> %a1, <2 x i64> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 uint64x2_t test_vcopyq_laneq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u64
   return vcopyq_laneq_u64(a1, (int64_t) 0, a2, (int64_t) 1);
-  // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcopyq_laneq_f32(<4 x float> %a1, <4 x float> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP3]], float [[VGETQ_LANE]], i32 0
+// CHECK:   ret <4 x float> [[VSET_LANE]]
 float32x4_t test_vcopyq_laneq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_f32
   return vcopyq_laneq_f32(a1, 0, a2, 3);
-  // CHECK: shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcopyq_laneq_f64(<2 x double> %a1, <2 x double> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x double> [[TMP3]], double [[VGETQ_LANE]], i32 0
+// CHECK:   ret <2 x double> [[VSET_LANE]]
 float64x2_t test_vcopyq_laneq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_f64
   return vcopyq_laneq_f64(a1, 0, a2, 1);
-  // CHECK: shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 3, i32 1>
 }
 
diff --git a/test/CodeGen/arm64_vcreate.c b/test/CodeGen/arm64_vcreate.c
index b974752..ddfa147 100644
--- a/test/CodeGen/arm64_vcreate.c
+++ b/test/CodeGen/arm64_vcreate.c
@@ -1,7 +1,6 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
 // Test ARM64 SIMD vcreate intrinsics
 
-/*#include <arm_neon.h>*/
 #include <arm_neon.h>
 
 float32x2_t test_vcreate_f32(uint64_t a1) {
@@ -10,14 +9,3 @@
   // CHECK: bitcast {{.*}} to <2 x float>
   // CHECK-NEXT: ret
 }
-
-// FIXME enable when scalar_to_vector in backend is fixed.  Also, change
-// CHECK@ to CHECK<colon> and CHECK-NEXT@ to CHECK-NEXT<colon>
-/*
-float64x1_t test_vcreate_f64(uint64_t a1) {
-  // CHECK@ test_vcreate_f64
-  return vcreate_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-*/
diff --git a/test/CodeGen/arm64_vcvtfp.c b/test/CodeGen/arm64_vcvtfp.c
deleted file mode 100644
index e3dca81..0000000
--- a/test/CodeGen/arm64_vcvtfp.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-
-#include <arm_neon.h>
-
-float64x2_t test_vcvt_f64_f32(float32x2_t x) {
-  // CHECK-LABEL: test_vcvt_f64_f32
-  return vcvt_f64_f32(x);
-  // CHECK: fpext <2 x float> {{%.*}} to <2 x double>
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vcvt_high_f64_f32(float32x4_t x) {
-  // CHECK-LABEL: test_vcvt_high_f64_f32
-  return vcvt_high_f64_f32(x);
-  // CHECK: [[HIGH:%.*]] = shufflevector <4 x float> {{%.*}}, <4 x float> undef, <2 x i32> <i32 2, i32 3>
-  // CHECK-NEXT: fpext <2 x float> [[HIGH]] to <2 x double>
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vcvt_f32_f64(float64x2_t v) {
-  // CHECK: test_vcvt_f32_f64
-  return vcvt_f32_f64(v);
-  // CHECK: fptrunc <2 x double> {{%.*}} to <2 x float>
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vcvt_high_f32_f64(float32x2_t x, float64x2_t v) {
-  // CHECK: test_vcvt_high_f32_f64
-  return vcvt_high_f32_f64(x, v);
-  // CHECK: [[TRUNC:%.*]] = fptrunc <2 x double> {{.*}} to <2 x float>
-  // CHECK-NEXT: shufflevector <2 x float> {{.*}}, <2 x float> [[TRUNC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vcvtx_f32_f64(float64x2_t v) {
-  // CHECK: test_vcvtx_f32_f64
-  return vcvtx_f32_f64(v);
-  // CHECK: llvm.aarch64.neon.fcvtxn.v2f32.v2f64
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vcvtx_high_f32_f64(float32x2_t x, float64x2_t v) {
-  // CHECK: test_vcvtx_high_f32_f64
-  return vcvtx_high_f32_f64(x, v);
-  // CHECK: llvm.aarch64.neon.fcvtxn.v2f32.v2f64
-  // CHECK: shufflevector
-  // CHECK: ret
-}
diff --git a/test/CodeGen/arm64_vdupq_n_f64.c b/test/CodeGen/arm64_vdupq_n_f64.c
index ffba55c..58cc7f0 100644
--- a/test/CodeGen/arm64_vdupq_n_f64.c
+++ b/test/CodeGen/arm64_vdupq_n_f64.c
@@ -1,88 +1,78 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | \
-// RUN:   FileCheck -check-prefix=CHECK-IR %s
-// REQUIRES: aarch64-registered-target
-
-/// Test vdupq_n_f64 and vmovq_nf64 ARM64 intrinsics
-// <rdar://problem/11778405> ARM64: vdupq_n_f64 and vdupq_lane_f64 intrinsics
-// missing
-
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -fallow-half-arguments-and-returns -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
 // vdupq_n_f64 -> dup.2d v0, v0[0]
 //
-float64x2_t test_vdupq_n_f64(float64_t w)
-{
+// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64(double %w) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
+// CHECK:   ret <2 x double> [[VECINIT1_I]]
+float64x2_t test_vdupq_n_f64(float64_t w) {
     return vdupq_n_f64(w);
-  // CHECK-LABEL: test_vdupq_n_f64:
-  // CHECK: dup.2d v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
 // might as well test this while we're here
 // vdupq_n_f32 -> dup.4s v0, v0[0]
-float32x4_t test_vdupq_n_f32(float32_t w)
-{
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %w) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %w, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
+float32x4_t test_vdupq_n_f32(float32_t w) {
     return vdupq_n_f32(w);
-  // CHECK-LABEL: test_vdupq_n_f32:
-  // CHECK: dup.4s v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
 // vdupq_lane_f64 -> dup.2d v0, v0[0]
 // this was in <rdar://problem/11778405>, but had already been implemented,
 // test anyway
-float64x2_t test_vdupq_lane_f64(float64x1_t V)
-{
+// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x double> [[SHUFFLE]]
+float64x2_t test_vdupq_lane_f64(float64x1_t V) {
     return vdupq_lane_f64(V, 0);
-  // CHECK-LABEL: test_vdupq_lane_f64:
-  // CHECK: dup.2d v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
 // vmovq_n_f64 -> dup Vd.2d,X0
 // this wasn't in <rdar://problem/11778405>, but it was between the vdups
-float64x2_t test_vmovq_n_f64(float64_t w)
-{
+// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64(double %w) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
+// CHECK:   ret <2 x double> [[VECINIT1_I]]
+float64x2_t test_vmovq_n_f64(float64_t w) {
   return vmovq_n_f64(w);
-  // CHECK-LABEL: test_vmovq_n_f64:
-  // CHECK: dup.2d v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
-float16x4_t test_vmov_n_f16(float16_t *a1)
-{
-  // CHECK-IR-LABEL: test_vmov_n_f16
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a1) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a1, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT3]]
+float16x4_t test_vmov_n_f16(float16_t *a1) {
   return vmov_n_f16(*a1);
-  // CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
 }
 
-// Disable until scalar problem in backend is fixed. Change CHECK-IR@ to
-// CHECK-IR<colon>
 /*
-float64x1_t test_vmov_n_f64(float64_t a1)
-{
-  // CHECK-IR@ test_vmov_n_f64
+float64x1_t test_vmov_n_f64(float64_t a1) {
   return vmov_n_f64(a1);
-  // CHECK-IR@ insertelement {{.*}} i32 0{{ *$}}
 }
 */
 
-float16x8_t test_vmovq_n_f16(float16_t *a1)
-{
-  // CHECK-IR-LABEL: test_vmovq_n_f16
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a1) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a1, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT7]]
+float16x8_t test_vmovq_n_f16(float16_t *a1) {
   return vmovq_n_f16(*a1);
-  // CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 4{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 5{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 6{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 7{{ *$}}
 }
 
diff --git a/test/CodeGen/arm64_vecCmpBr.c b/test/CodeGen/arm64_vecCmpBr.c
deleted file mode 100644
index 3ae7433..0000000
--- a/test/CodeGen/arm64_vecCmpBr.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -S -ffreestanding %s -o - -target-cpu cyclone | FileCheck %s
-// REQUIRES: aarch64-registered-target
-// test code generation for <rdar://problem/11487757>
-#include <arm_neon.h>
-
-unsigned bar();
-
-// Branch if any lane of V0 is zero; 64 bit => !min
-unsigned anyZero64(uint16x4_t a) {
-// CHECK: anyZero64:
-// CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vminv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if any lane of V0 is zero; 128 bit => !min
-unsigned anyZero128(uint16x8_t a) {
-// CHECK: anyZero128:
-// CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vminvq_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if any lane of V0 is non-zero; 64 bit => max
-unsigned anyNonZero64(uint16x4_t a) {
-// CHECK: anyNonZero64:
-// CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vmaxv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if any lane of V0 is non-zero; 128 bit => max
-unsigned anyNonZero128(uint16x8_t a) {
-// CHECK: anyNonZero128:
-// CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vmaxvq_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are zero; 64 bit => !max
-unsigned allZero64(uint16x4_t a) {
-// CHECK: allZero64:
-// CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vmaxv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are zero; 128 bit => !max
-unsigned allZero128(uint16x8_t a) {
-// CHECK: allZero128:
-// CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vmaxvq_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are non-zero; 64 bit => min
-unsigned allNonZero64(uint16x4_t a) {
-// CHECK: allNonZero64:
-// CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vminv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are non-zero; 128 bit => min
-unsigned allNonZero128(uint16x8_t a) {
-// CHECK: allNonZero128:
-// CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vminvq_u8(a))
-    return bar();
-  return 0;
-}
-
diff --git a/test/CodeGen/arm64_vext.c b/test/CodeGen/arm64_vext.c
deleted file mode 100644
index 6c3fe73..0000000
--- a/test/CodeGen/arm64_vext.c
+++ /dev/null
@@ -1,239 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-
-// Test ARM64 extract intrinsics
-// can use as back end test by adding a run line with
-// -check-prefix=CHECK-CODEGEN on the FileCheck
-
-#include <arm_neon.h>
-
-void test_vext_s8()
-{
-  // CHECK: test_vext_s8
-  int8x8_t xS8x8;
-  xS8x8 = vext_s8(xS8x8, xS8x8, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s8:
-  // CHECK-CODEGEN: {{ext.8.*#1}}
-}
-
-void test_vext_u8()
-{
-  // CHECK: test_vext_u8
-  uint8x8_t xU8x8;
-  xU8x8 = vext_u8(xU8x8, xU8x8, 2);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u8:
-  // CHECK-CODEGEN: {{ext.8.*#2}}
-}
-
-void test_vext_p8()
-{
-  // CHECK: test_vext_p8
-  poly8x8_t xP8x8;
-  xP8x8 = vext_p8(xP8x8, xP8x8, 3);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_p8:
-  // CHECK-CODEGEN: {{ext.8.*#3}}
-}
-
-void test_vext_s16()
-{
-  // CHECK: test_vext_s16
-  int16x4_t xS16x4;
-  xS16x4 = vext_s16(xS16x4, xS16x4, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s16:
-  // CHECK-CODEGEN: {{ext.8.*#2}}
-}
-
-void test_vext_u16()
-{
-  // CHECK: test_vext_u16
-  uint16x4_t xU16x4;
-  xU16x4 = vext_u16(xU16x4, xU16x4, 2);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u16:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_p16()
-{
-  // CHECK: test_vext_p16
-  poly16x4_t xP16x4;
-  xP16x4 = vext_p16(xP16x4, xP16x4, 3);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_p16:
-  // CHECK-CODEGEN: {{ext.8.*#6}}
-}
-
-void test_vext_s32()
-{
-  // CHECK: test_vext_s32
-  int32x2_t xS32x2;
-  xS32x2 = vext_s32(xS32x2, xS32x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s32:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_u32()
-{
-  // CHECK: test_vext_u32
-  uint32x2_t xU32x2;
-  xU32x2 = vext_u32(xU32x2, xU32x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u32:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_f32()
-{
-  // CHECK: test_vext_f32
-  float32x2_t xF32x2;
-  xF32x2 = vext_f32(xF32x2, xF32x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_f32:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_s64()
-{
-  // CHECK: test_vext_s64
-  int64x1_t xS64x1;
-  // FIXME don't use 1 as index or check for now, clang has a bug?
-  xS64x1 = vext_s64(xS64x1, xS64x1, /*1*/0);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s64:
-  // CHECK_FIXME: {{ext.8.*#0}}
-}
-
-void test_vext_u64()
-{
-  // CHECK: test_vext_u64
-  uint64x1_t xU64x1;
-  // FIXME don't use 1 as index or check for now, clang has a bug?
-  xU64x1 = vext_u64(xU64x1, xU64x1, /*1*/0);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u64:
-  // CHECK_FIXME: {{ext.8.*#0}}
-}
-
-void test_vextq_s8()
-{
-  // CHECK: test_vextq_s8
-  int8x16_t xS8x16;
-  xS8x16 = vextq_s8(xS8x16, xS8x16, 4);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s8:
-  // CHECK-CODEGEN: {{ext.16.*#4}}
-}
-
-void test_vextq_u8()
-{
-  // CHECK: test_vextq_u8
-  uint8x16_t xU8x16;
-  xU8x16 = vextq_u8(xU8x16, xU8x16, 5);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u8:
-  // CHECK-CODEGEN: {{ext.16.*#5}}
-}
-
-void test_vextq_p8()
-{
-  // CHECK: test_vextq_p8
-  poly8x16_t xP8x16;
-  xP8x16 = vextq_p8(xP8x16, xP8x16, 6);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_p8:
-  // CHECK-CODEGEN: {{ext.16.*#6}}
-}
-
-void test_vextq_s16()
-{
-  // CHECK: test_vextq_s16
-  int16x8_t xS16x8;
-  xS16x8 = vextq_s16(xS16x8, xS16x8, 7);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s16:
-  // CHECK-CODEGEN: {{ext.16.*#14}}
-}
-
-void test_vextq_u16()
-{
-  // CHECK: test_vextq_u16
-  uint16x8_t xU16x8;
-  xU16x8 = vextq_u16(xU16x8, xU16x8, 4);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u16:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_p16()
-{
-  // CHECK: test_vextq_p16
-  poly16x8_t xP16x8;
-  xP16x8 = vextq_p16(xP16x8, xP16x8, 5);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_p16:
-  // CHECK-CODEGEN: {{ext.16.*#10}}
-}
-
-void test_vextq_s32()
-{
-  // CHECK: test_vextq_s32
-  int32x4_t xS32x4;
-  xS32x4 = vextq_s32(xS32x4, xS32x4, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s32:
-  // CHECK-CODEGEN: {{ext.16.*#4}}
-}
-
-void test_vextq_u32()
-{
-  // CHECK: test_vextq_u32
-  uint32x4_t xU32x4;
-  xU32x4 = vextq_u32(xU32x4, xU32x4, 2);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u32:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_f32()
-{
-  // CHECK: test_vextq_f32
-  float32x4_t xF32x4;
-  xF32x4 = vextq_f32(xF32x4, xF32x4, 3);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_f32:
-  // CHECK-CODEGEN: {{ext.16.*#12}}
-}
-
-void test_vextq_s64()
-{
-  // CHECK: test_vextq_s64
-  int64x2_t xS64x2;
-  xS64x2 = vextq_s64(xS64x2, xS64x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s64:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_u64()
-{
-  // CHECK: test_vextq_u64
-  uint64x2_t xU64x2;
-  xU64x2 = vextq_u64(xU64x2, xU64x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u64:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_f64()
-{
-  // CHECK: test_vextq_f64
-  float64x2_t xF64x2;
-  xF64x2 = vextq_f64(xF64x2, xF64x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u64:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
diff --git a/test/CodeGen/arm64_vfma.c b/test/CodeGen/arm64_vfma.c
deleted file mode 100644
index 12f3111..0000000
--- a/test/CodeGen/arm64_vfma.c
+++ /dev/null
@@ -1,136 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD fused multiply add intrinsics
-
-#include <arm_neon.h>
-
-float32x2_t test_vfma_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfma_f32
-  return vfma_f32(a1, a2, a3);
-  // CHECK: llvm.fma.v2f32({{.*a2, .*a3, .*a1}})
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmaq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
-  // CHECK: test_vfmaq_f32
-  return vfmaq_f32(a1, a2, a3);
-  // CHECK: llvm.fma.v4f32({{.*a2, .*a3, .*a1}})
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
-  // CHECK: test_vfmaq_f64
-  return vfmaq_f64(a1, a2, a3);
-  // CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfma_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfma_lane_f32
-  return vfma_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: llvm.fma.v2f32(<2 x float> %a2, <2 x float> {{.*}}, <2 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmaq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
-  // CHECK: test_vfmaq_lane_f32
-  return vfmaq_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: llvm.fma.v4f32(<4 x float> %a2, <4 x float> {{.*}}, <4 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmaq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
-  // CHECK: test_vfmaq_lane_f64
-  return vfmaq_lane_f64(a1, a2, a3, 0);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: llvm.fma.v2f64(<2 x double> %a2, <2 x double> {{.*}}, <2 x double> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfma_n_f32(float32x2_t a1, float32x2_t a2, float32_t a3) {
-  // CHECK: test_vfma_n_f32
-  return vfma_n_f32(a1, a2, a3);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 0 (usually two insertelements)
-  // CHECK: llvm.fma.v2f32
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmaq_n_f32(float32x4_t a1, float32x4_t a2, float32_t a3) {
-  // CHECK: test_vfmaq_n_f32
-  return vfmaq_n_f32(a1, a2, a3);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 0 (usually four insertelements)
-  // CHECK: llvm.fma.v4f32
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmaq_n_f64(float64x2_t a1, float64x2_t a2, float64_t a3) {
-  // CHECK: test_vfmaq_n_f64
-  return vfmaq_n_f64(a1, a2, a3);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 0 (usually two insertelements)
-  // CHECK: llvm.fma.v2f64
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfms_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfms_f32
-  return vfms_f32(a1, a2, a3);
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
-  // CHECK: llvm.fma.v2f32(<2 x float> [[NEG]], <2 x float> %a3, <2 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmsq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
-  // CHECK: test_vfmsq_f32
-  return vfmsq_f32(a1, a2, a3);
-  // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
-  // CHECK: llvm.fma.v4f32(<4 x float> [[NEG]], <4 x float> %a3, <4 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmsq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
-  // CHECK: test_vfmsq_f64
-  return vfmsq_f64(a1, a2, a3);
-  // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
-  // CHECK: llvm.fma.v2f64(<2 x double> [[NEG]], <2 x double> %a3, <2 x double> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfms_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfms_lane_f32
-  return vfms_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
-  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> %a3
-  // CHECK: llvm.fma.v2f32(<2 x float> [[NEG]], <2 x float> [[LANE]], <2 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmsq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
-  // CHECK: test_vfmsq_lane_f32
-  return vfmsq_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
-  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> %a3
-  // CHECK: llvm.fma.v4f32(<4 x float> [[NEG]], <4 x float> [[LANE]], <4 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmsq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
-  // CHECK: test_vfmsq_lane_f64
-  return vfmsq_lane_f64(a1, a2, a3, 0);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
-  // CHECK: [[LANE:%.*]] = shufflevector <1 x double> %a3
-  // CHECK: llvm.fma.v2f64(<2 x double> [[NEG]], <2 x double> [[LANE]], <2 x double> %a1)
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vneg.c b/test/CodeGen/arm64_vneg.c
deleted file mode 100644
index d520ebd..0000000
--- a/test/CodeGen/arm64_vneg.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD negate and saturating negate intrinsics
-
-#include <arm_neon.h>
-
-int64x2_t test_vnegq_s64(int64x2_t a1) {
-  // CHECK: test_vnegq_s64
-  return vnegq_s64(a1);
-  // CHECK: sub <2 x i64> zeroinitializer, %a1
-  // CHECK-NEXT: ret
-}
-
-int64x2_t test_vqnegq_s64(int64x2_t a1) {
-  // CHECK: test_vqnegq_s64
-  return vqnegq_s64(a1);
-  // CHECK: llvm.aarch64.neon.sqneg.v2i64
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vqmov.c b/test/CodeGen/arm64_vqmov.c
deleted file mode 100644
index 6480e66..0000000
--- a/test/CodeGen/arm64_vqmov.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
-// REQUIRES: aarch64-registered-target
-/// Test vqmov[u]n_high_<su>{16,32,64) ARM64 intrinsics
-
-#include <arm_neon.h>
-
-// vqmovn_high_s16 -> UQXTN2 Vd.16b,Vn.8h
-int8x16_t test_vqmovn_high_s16(int8x8_t Vdlow, int16x8_t Vn)
-{
-    return vqmovn_high_s16(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_s16:
-  // CHECK: sqxtn2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovun_high_s16 -> UQXTN2 Vd.16b,Vn.8h
-uint8x16_t test_vqmovun_high_s16(uint8x8_t Vdlow, uint16x8_t Vn)
-{
-    return vqmovun_high_s16(Vdlow, Vn);
-  // CHECK: test_vqmovun_high_s16:
-  // CHECK: sqxtun2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_s32 -> SQXTN2 Vd.8h,Vn.4s
-int16x8_t test_vqmovn_high_s32(int16x4_t Vdlow, int32x4_t Vn)
-{
-    return vqmovn_high_s32(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_s32:
-  // CHECK: sqxtn2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_u32 -> UQXTN2 Vd.8h,Vn.4s
-uint16x8_t test_vqmovn_high_u32(uint16x4_t Vdlow, uint32x4_t Vn)
-{
-    return vqmovn_high_u32(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_u32:
-  // CHECK: uqxtn2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_s64 -> SQXTN2 Vd.4s,Vn.2d
-int32x4_t test_vqmovn_high_s64(int32x2_t Vdlow, int64x2_t Vn)
-{
-    return vqmovn_high_s64(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_s64:
-  // CHECK: sqxtn2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_u64 -> UQXTN2 Vd.4s,Vn.2d
-uint32x4_t test_vqmovn_high_u64(uint32x2_t Vdlow, uint64x2_t Vn)
-{
-    return vqmovn_high_u64(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_u64:
-  // CHECK: uqxtn2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_u16 -> UQXTN2 Vd.16b,Vn.8h
-uint8x16_t test_vqmovn_high_u16(uint8x8_t Vdlow, uint16x8_t Vn)
-{
-    return vqmovn_high_u16(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_u16:
-  // CHECK: uqxtn2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovun_high_s32 -> SQXTUN2 Vd.8h,Vn.4s
-uint16x8_t test_vqmovun_high_s32(uint16x4_t Vdlow, uint32x4_t Vn)
-{
-    return vqmovun_high_s32(Vdlow, Vn);
-  // CHECK: test_vqmovun_high_s32:
-  // CHECK: sqxtun2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovun_high_s64 -> SQXTUN2  Vd.4s,Vn.2d
-uint32x4_t test_vqmovun_high_s64(uint32x2_t Vdlow, uint64x2_t Vn)
-{
-    return vqmovun_high_s64(Vdlow, Vn);
-  // CHECK: test_vqmovun_high_s64:
-  // CHECK: sqxtun2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
diff --git a/test/CodeGen/arm64_vrecps.c b/test/CodeGen/arm64_vrecps.c
deleted file mode 100644
index a3af13c..0000000
--- a/test/CodeGen/arm64_vrecps.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
-// REQUIRES: aarch64-registered-target
-/// Test vrecpss_f32, vrecpsd_f64 ARM64 intrinsics
-
-
-#include <arm_neon.h>
-
-// vrecpss_f32 -> FRECPS Sd,Sa,Sb
-//
-float32_t test_vrecpss_f32(float32_t Vdlow, float32_t Vn)
-{
-    return vrecpss_f32(Vdlow, Vn);
-  // CHECK: test_vrecpss_f32:
-  // CHECK: frecps  s0, s0, s1
-  // CHECK-NEXT: ret
-}
-
-// vrecpsd_f64 -> FRECPS Dd,Da,Db
-//
-float64_t test_vrecpsd_f64(float64_t Vdlow, float64_t Vn)
-{
-    return vrecpsd_f64(Vdlow, Vn);
-  // CHECK: test_vrecpsd_f64:
-  // CHECK: frecps d0, d0, d1
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vshift.c b/test/CodeGen/arm64_vshift.c
deleted file mode 100644
index af02899..0000000
--- a/test/CodeGen/arm64_vshift.c
+++ /dev/null
@@ -1,357 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -ffreestanding -emit-llvm -o - -O1 %s | FileCheck %s
-#include <arm_neon.h>
-
-int8x8_t test_vqshl_n_s8(int8x8_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshl_n_s8(in, 1);
-}
-
-int16x4_t test_vqshl_n_s16(int16x4_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
-  return vqshl_n_s16(in, 1);
-}
-
-int32x2_t test_vqshl_n_s32(int32x2_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
-  return vqshl_n_s32(in, 1);
-}
-
-int64x1_t test_vqshl_n_s64(int64x1_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
-  return vqshl_n_s64(in, 1);
-}
-
-
-int8x16_t test_vqshlq_n_s8(int8x16_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshlq_n_s8(in, 1);
-}
-
-int16x8_t test_vqshlq_n_s16(int16x8_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  return vqshlq_n_s16(in, 1);
-}
-
-int32x4_t test_vqshlq_n_s32(int32x4_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-  return vqshlq_n_s32(in, 1);
-}
-
-int64x2_t test_vqshlq_n_s64(int64x2_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
-  return vqshlq_n_s64(in, 1);
-}
-
-uint8x8_t test_vqshl_n_u8(uint8x8_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshl_n_u8(in, 1);
-}
-
-uint16x4_t test_vqshl_n_u16(uint16x4_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
-  return vqshl_n_u16(in, 1);
-}
-
-uint32x2_t test_vqshl_n_u32(uint32x2_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
-  return vqshl_n_u32(in, 1);
-}
-
-uint64x1_t test_vqshl_n_u64(uint64x1_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
-  return vqshl_n_u64(in, 1);
-}
-
-uint8x16_t test_vqshlq_n_u8(uint8x16_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshlq_n_u8(in, 1);
-}
-
-uint16x8_t test_vqshlq_n_u16(uint16x8_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  return vqshlq_n_u16(in, 1);
-}
-
-uint32x4_t test_vqshlq_n_u32(uint32x4_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-  return vqshlq_n_u32(in, 1);
-}
-
-uint64x2_t test_vqshlq_n_u64(uint64x2_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
-  return vqshlq_n_u64(in, 1);
-}
-
-int8x8_t test_vrshr_n_s8(int8x8_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshr_n_s8(in, 1);
-}
-
-int16x4_t test_vrshr_n_s16(int16x4_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshr_n_s16(in, 1);
-}
-
-int32x2_t test_vrshr_n_s32(int32x2_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  return vrshr_n_s32(in, 1);
-}
-
-int64x1_t test_vrshr_n_s64(int64x1_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  return vrshr_n_s64(in, 1);
-}
-
-
-int8x16_t test_vrshrq_n_s8(int8x16_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshrq_n_s8(in, 1);
-}
-
-int16x8_t test_vrshrq_n_s16(int16x8_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshrq_n_s16(in, 1);
-}
-
-int32x4_t test_vrshrq_n_s32(int32x4_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  return vrshrq_n_s32(in, 1);
-}
-
-int64x2_t test_vrshrq_n_s64(int64x2_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>
-  return vrshrq_n_s64(in, 1);
-}
-
-uint8x8_t test_vrshr_n_u8(uint8x8_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshr_n_u8(in, 1);
-}
-
-uint16x4_t test_vrshr_n_u16(uint16x4_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshr_n_u16(in, 1);
-}
-
-uint32x2_t test_vrshr_n_u32(uint32x2_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  return vrshr_n_u32(in, 1);
-}
-
-uint64x1_t test_vrshr_n_u64(uint64x1_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  return vrshr_n_u64(in, 1);
-}
-
-uint8x16_t test_vrshrq_n_u8(uint8x16_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshrq_n_u8(in, 1);
-}
-
-uint16x8_t test_vrshrq_n_u16(uint16x8_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshrq_n_u16(in, 1);
-}
-
-uint32x4_t test_vrshrq_n_u32(uint32x4_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  return vrshrq_n_u32(in, 1);
-}
-
-uint64x2_t test_vrshrq_n_u64(uint64x2_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>
-  return vrshrq_n_u64(in, 1);
-}
-
-int8x8_t test_vqshlu_n_s8(int8x8_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshlu_n_s8(in, 1);
-}
-
-int16x4_t test_vqshlu_n_s16(int16x4_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
-  return vqshlu_n_s16(in, 1);
-}
-
-int32x2_t test_vqshlu_n_s32(int32x2_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
-  return vqshlu_n_s32(in, 1);
-}
-
-int64x1_t test_vqshlu_n_s64(int64x1_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
-  return vqshlu_n_s64(in, 1);
-}
-
-
-int8x16_t test_vqshluq_n_s8(int8x16_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshluq_n_s8(in, 1);
-}
-
-int16x8_t test_vqshluq_n_s16(int16x8_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  return vqshluq_n_s16(in, 1);
-}
-
-int32x4_t test_vqshluq_n_s32(int32x4_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-  return vqshluq_n_s32(in, 1);
-}
-
-int64x2_t test_vqshluq_n_s64(int64x2_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
-  return vqshluq_n_s64(in, 1);
-}
-
-int8x8_t test_vrsra_n_s8(int8x8_t acc, int8x8_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <8 x i8> [[TMP]], %acc
-  return vrsra_n_s8(acc, in, 1);
-}
-
-int16x4_t test_vrsra_n_s16(int16x4_t acc, int16x4_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <4 x i16> [[TMP]], %acc
-  return vrsra_n_s16(acc, in, 1);
-}
-
-int32x2_t test_vrsra_n_s32(int32x2_t acc, int32x2_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  // CHECK: add <2 x i32> [[TMP]], %acc
-  return vrsra_n_s32(acc, in, 1);
-}
-
-int64x1_t test_vrsra_n_s64(int64x1_t acc, int64x1_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  // CHECK: add <1 x i64> [[TMP]], %acc
-  return vrsra_n_s64(acc, in, 1);
-}
-
-int8x16_t test_vrsraq_n_s8(int8x16_t acc, int8x16_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <16 x i8> [[TMP]], %acc
-  return vrsraq_n_s8(acc, in, 1);
-}
-
-int16x8_t test_vrsraq_n_s16(int16x8_t acc, int16x8_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <8 x i16> [[TMP]], %acc
-  return vrsraq_n_s16(acc, in, 1);
-}
-
-int32x4_t test_vrsraq_n_s32(int32x4_t acc, int32x4_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  // CHECK: add <4 x i32> [[TMP]], %acc
-  return vrsraq_n_s32(acc, in, 1);
-}
-
-int64x2_t test_vrsraq_n_s64(int64x2_t acc, int64x2_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>)
-  // CHECK: add <2 x i64> [[TMP]], %acc
-  return vrsraq_n_s64(acc, in, 1);
-}
-
-uint8x8_t test_vrsra_n_u8(uint8x8_t acc, uint8x8_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <8 x i8> [[TMP]], %acc
-  return vrsra_n_u8(acc, in, 1);
-}
-
-uint16x4_t test_vrsra_n_u16(uint16x4_t acc, uint16x4_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <4 x i16> [[TMP]], %acc
-  return vrsra_n_u16(acc, in, 1);
-}
-
-uint32x2_t test_vrsra_n_u32(uint32x2_t acc, uint32x2_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  // CHECK: add <2 x i32> [[TMP]], %acc
-  return vrsra_n_u32(acc, in, 1);
-}
-
-uint64x1_t test_vrsra_n_u64(uint64x1_t acc, uint64x1_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  // CHECK: add <1 x i64> [[TMP]], %acc
-  return vrsra_n_u64(acc, in, 1);
-}
-
-uint8x16_t test_vrsraq_n_u8(uint8x16_t acc, uint8x16_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <16 x i8> [[TMP]], %acc
-  return vrsraq_n_u8(acc, in, 1);
-}
-
-uint16x8_t test_vrsraq_n_u16(uint16x8_t acc, uint16x8_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <8 x i16> [[TMP]], %acc
-  return vrsraq_n_u16(acc, in, 1);
-}
-
-uint32x4_t test_vrsraq_n_u32(uint32x4_t acc, uint32x4_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  // CHECK: add <4 x i32> [[TMP]], %acc
-  return vrsraq_n_u32(acc, in, 1);
-}
-
-uint64x2_t test_vrsraq_n_u64(uint64x2_t acc, uint64x2_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>)
-  // CHECK: add <2 x i64> [[TMP]], %acc
-  return vrsraq_n_u64(acc, in, 1);
-}
diff --git a/test/CodeGen/arm64_vsli.c b/test/CodeGen/arm64_vsli.c
deleted file mode 100644
index b2a30ab..0000000
--- a/test/CodeGen/arm64_vsli.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | \
-// RUN:   FileCheck -check-prefix=CHECK_CODEGEN %s
-// REQUIRES: aarch64-registered-target
-// Test
-
-#include <arm_neon.h>
-
-int8x8_t test_vsli_n_s8(int8x8_t a1, int8x8_t a2) {
-  // CHECK: test_vsli_n_s8
-  return vsli_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i8
-  // CHECK_CODEGEN: sli.8b  v0, v1, #3
-}
-
-int16x4_t test_vsli_n_s16(int16x4_t a1, int16x4_t a2) {
-  // CHECK: test_vsli_n_s16
-  return vsli_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v4i16
-  // CHECK_CODEGEN: sli.4h  v0, v1, #3
-}
-
-int32x2_t test_vsli_n_s32(int32x2_t a1, int32x2_t a2) {
-  // CHECK: test_vsli_n_s32
-  return vsli_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i32
-  // CHECK_CODEGEN: sli.2s  v0, v1, #1
-}
-
-int64x1_t test_vsli_n_s64(int64x1_t a1, int64x1_t a2) {
-  // CHECK: test_vsli_n_s64
-  return vsli_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v1i64
-  // CHECK_CODEGEN: sli     d0, d1, #1
-}
-
-uint8x8_t test_vsli_n_u8(uint8x8_t a1, uint8x8_t a2) {
-  // CHECK: test_vsli_n_u8
-  return vsli_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i8
-  // CHECK_CODEGEN: sli.8b  v0, v1, #3
-}
-
-uint16x4_t test_vsli_n_u16(uint16x4_t a1, uint16x4_t a2) {
-  // CHECK: test_vsli_n_u16
-  return vsli_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v4i16
-  // CHECK_CODEGEN: sli.4h  v0, v1, #3
-}
-
-uint32x2_t test_vsli_n_u32(uint32x2_t a1, uint32x2_t a2) {
-  // CHECK: test_vsli_n_u32
-  return vsli_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i32
-  // CHECK_CODEGEN: sli.2s  v0, v1, #1
-}
-
-uint64x1_t test_vsli_n_u64(uint64x1_t a1, uint64x1_t a2) {
-  // CHECK: test_vsli_n_u64
-  return vsli_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v1i64
-  // CHECK_CODEGEN: sli     d0, d1, #1
-}
-
-poly8x8_t test_vsli_n_p8(poly8x8_t a1, poly8x8_t a2) {
-  // CHECK: test_vsli_n_p8
-  return vsli_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v8i8
-  // CHECK_CODEGEN: sli.8b  v0, v1, #1
-}
-
-poly16x4_t test_vsli_n_p16(poly16x4_t a1, poly16x4_t a2) {
-  // CHECK: test_vsli_n_p16
-  return vsli_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v4i16
-  // CHECK_CODEGEN: sli.4h  v0, v1, #1
-}
-
-int8x16_t test_vsliq_n_s8(int8x16_t a1, int8x16_t a2) {
-  // CHECK: test_vsliq_n_s8
-  return vsliq_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v16i8
-  // CHECK_CODEGEN: sli.16b v0, v1, #3
-}
-
-int16x8_t test_vsliq_n_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK: test_vsliq_n_s16
-  return vsliq_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i16
-  // CHECK_CODEGEN: sli.8h  v0, v1, #3
-}
-
-int32x4_t test_vsliq_n_s32(int32x4_t a1, int32x4_t a2) {
-  // CHECK: test_vsliq_n_s32
-  return vsliq_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v4i32
-  // CHECK_CODEGEN: sli.4s  v0, v1, #1
-}
-
-int64x2_t test_vsliq_n_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vsliq_n_s64
-  return vsliq_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i64
-  // CHECK_CODEGEN: sli.2d  v0, v1, #1
-}
-
-uint8x16_t test_vsliq_n_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK: test_vsliq_n_u8
-  return vsliq_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v16i8
-  // CHECK_CODEGEN: sli.16b v0, v1, #3
-}
-
-uint16x8_t test_vsliq_n_u16(uint16x8_t a1, uint16x8_t a2) {
-  // CHECK: test_vsliq_n_u16
-  return vsliq_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i16
-  // CHECK_CODEGEN: sli.8h  v0, v1, #3
-}
-
-uint32x4_t test_vsliq_n_u32(uint32x4_t a1, uint32x4_t a2) {
-  // CHECK: test_vsliq_n_u32
-  return vsliq_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v4i32
-  // CHECK_CODEGEN: sli.4s  v0, v1, #1
-}
-
-uint64x2_t test_vsliq_n_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vsliq_n_u64
-  return vsliq_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i64
-  // CHECK_CODEGEN: sli.2d  v0, v1, #1
-}
-
-poly8x16_t test_vsliq_n_p8(poly8x16_t a1, poly8x16_t a2) {
-  // CHECK: test_vsliq_n_p8
-  return vsliq_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v16i8
-  // CHECK_CODEGEN: sli.16b v0, v1, #1
-}
-
-poly16x8_t test_vsliq_n_p16(poly16x8_t a1, poly16x8_t a2) {
-  // CHECK: test_vsliq_n_p16
-  return vsliq_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v8i16
-  // CHECK_CODEGEN: sli.8h  v0, v1, #1
-}
-
diff --git a/test/CodeGen/arm64_vsri.c b/test/CodeGen/arm64_vsri.c
deleted file mode 100644
index 579431d..0000000
--- a/test/CodeGen/arm64_vsri.c
+++ /dev/null
@@ -1,149 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | \
-// RUN:   FileCheck -check-prefix=CHECK_CODEGEN %s
-// REQUIRES: aarch64-registered-target
-
-// Test ARM64 SIMD vector shift right and insert: vsri[q]_n_*
-
-#include <arm_neon.h>
-
-int8x8_t test_vsri_n_s8(int8x8_t a1, int8x8_t a2) {
-  // CHECK: test_vsri_n_s8
-  return vsri_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i8
-  // CHECK_CODEGEN: sri.8b  v0, v1, #3
-}
-
-int16x4_t test_vsri_n_s16(int16x4_t a1, int16x4_t a2) {
-  // CHECK: test_vsri_n_s16
-  return vsri_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v4i16
-  // CHECK_CODEGEN: sri.4h  v0, v1, #3
-}
-
-int32x2_t test_vsri_n_s32(int32x2_t a1, int32x2_t a2) {
-  // CHECK: test_vsri_n_s32
-  return vsri_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i32
-  // CHECK_CODEGEN: sri.2s  v0, v1, #1
-}
-
-int64x1_t test_vsri_n_s64(int64x1_t a1, int64x1_t a2) {
-  // CHECK: test_vsri_n_s64
-  return vsri_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v1i64
-  // CHECK_CODEGEN: sri     d0, d1, #1
-}
-
-uint8x8_t test_vsri_n_u8(uint8x8_t a1, uint8x8_t a2) {
-  // CHECK: test_vsri_n_u8
-  return vsri_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i8
-  // CHECK_CODEGEN: sri.8b  v0, v1, #3
-}
-
-uint16x4_t test_vsri_n_u16(uint16x4_t a1, uint16x4_t a2) {
-  // CHECK: test_vsri_n_u16
-  return vsri_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v4i16
-  // CHECK_CODEGEN: sri.4h  v0, v1, #3
-}
-
-uint32x2_t test_vsri_n_u32(uint32x2_t a1, uint32x2_t a2) {
-  // CHECK: test_vsri_n_u32
-  return vsri_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i32
-  // CHECK_CODEGEN: sri.2s  v0, v1, #1
-}
-
-uint64x1_t test_vsri_n_u64(uint64x1_t a1, uint64x1_t a2) {
-  // CHECK: test_vsri_n_u64
-  return vsri_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v1i64
-  // CHECK_CODEGEN: sri     d0, d1, #1
-}
-
-poly8x8_t test_vsri_n_p8(poly8x8_t a1, poly8x8_t a2) {
-  // CHECK: test_vsri_n_p8
-  return vsri_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v8i8
-  // CHECK_CODEGEN: sri.8b  v0, v1, #1
-}
-
-poly16x4_t test_vsri_n_p16(poly16x4_t a1, poly16x4_t a2) {
-  // CHECK: test_vsri_n_p16
-  return vsri_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v4i16
-  // CHECK_CODEGEN: sri.4h  v0, v1, #1
-}
-
-int8x16_t test_vsriq_n_s8(int8x16_t a1, int8x16_t a2) {
-  // CHECK: test_vsriq_n_s8
-  return vsriq_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v16i8
-  // CHECK_CODEGEN: sri.16b v0, v1, #3
-}
-
-int16x8_t test_vsriq_n_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK: test_vsriq_n_s16
-  return vsriq_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i16
-  // CHECK_CODEGEN: sri.8h  v0, v1, #3
-}
-
-int32x4_t test_vsriq_n_s32(int32x4_t a1, int32x4_t a2) {
-  // CHECK: test_vsriq_n_s32
-  return vsriq_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v4i32
-  // CHECK_CODEGEN: sri.4s  v0, v1, #1
-}
-
-int64x2_t test_vsriq_n_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vsriq_n_s64
-  return vsriq_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i64
-  // CHECK_CODEGEN: sri.2d  v0, v1, #1
-}
-
-uint8x16_t test_vsriq_n_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK: test_vsriq_n_u8
-  return vsriq_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v16i8
-  // CHECK_CODEGEN: sri.16b v0, v1, #3
-}
-
-uint16x8_t test_vsriq_n_u16(uint16x8_t a1, uint16x8_t a2) {
-  // CHECK: test_vsriq_n_u16
-  return vsriq_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i16
-  // CHECK_CODEGEN: sri.8h  v0, v1, #3
-}
-
-uint32x4_t test_vsriq_n_u32(uint32x4_t a1, uint32x4_t a2) {
-  // CHECK: test_vsriq_n_u32
-  return vsriq_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v4i32
-  // CHECK_CODEGEN: sri.4s  v0, v1, #1
-}
-
-uint64x2_t test_vsriq_n_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vsriq_n_u64
-  return vsriq_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i64
-  // CHECK_CODEGEN: sri.2d  v0, v1, #1
-}
-
-poly8x16_t test_vsriq_n_p8(poly8x16_t a1, poly8x16_t a2) {
-  // CHECK: test_vsriq_n_p8
-  return vsriq_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v16i8
-  // CHECK_CODEGEN: sri.16b v0, v1, #1
-}
-
-poly16x8_t test_vsriq_n_p16(poly16x8_t a1, poly16x8_t a2) {
-  // CHECK: test_vsriq_n_p16
-  return vsriq_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v8i16
-  // CHECK_CODEGEN: sri.8h  v0, v1, #1
-}
-
diff --git a/test/CodeGen/arm64_vtst.c b/test/CodeGen/arm64_vtst.c
deleted file mode 100644
index 9f3ed84..0000000
--- a/test/CodeGen/arm64_vtst.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD comparison test intrinsics
-
-#include <arm_neon.h>
-
-uint64x2_t test_vtstq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK-LABEL: test_vtstq_s64
-  return vtstq_s64(a1, a2);
-  // CHECK: [[COMMONBITS:%[A-Za-z0-9.]+]] = and <2 x i64> {{%a1, %a2|%a2, %a1}}
-  // CHECK: [[MASK:%[A-Za-z0-9.]+]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
-  // CHECK: [[RES:%[A-Za-z0-9.]+]] = sext <2 x i1> [[MASK]] to <2 x i64>
-  // CHECK: ret <2 x i64> [[RES]]
-}
-
-uint64x2_t test_vtstq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK-LABEL: test_vtstq_u64
-  return vtstq_u64(a1, a2);
-  // CHECK: [[COMMONBITS:%[A-Za-z0-9.]+]] = and <2 x i64> {{%a1, %a2|%a2, %a1}}
-  // CHECK: [[MASK:%[A-Za-z0-9.]+]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
-  // CHECK: [[RES:%[A-Za-z0-9.]+]] = sext <2 x i1> [[MASK]] to <2 x i64>
-  // CHECK: ret <2 x i64> [[RES]]
-}
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index 3a87211..ad8587b 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -1,11788 +1,21619 @@
 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
-// RUN:  -target-cpu swift -ffreestanding -Os -S -o - %s\
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
-// RUN: %clang_cc1 -triple armv8-linux-gnu \
-// RUN:  -target-cpu cortex-a57 -mfloat-abi soft -ffreestanding -Os -S -o - %s\
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-A57
+// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
+// RUN:  | opt -S -mem2reg | FileCheck %s
 
-// REQUIRES: long_tests
+// REQUIRES: long-tests
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vaba_s8
-// CHECK: vaba.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaba_s8(
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vaba_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_s16
-// CHECK: vaba.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaba_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vaba_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_s32
-// CHECK: vaba.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaba_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vaba_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u8
-// CHECK: vaba.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaba_u8(
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vaba_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u16
-// CHECK: vaba.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaba_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vaba_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u32
-// CHECK: vaba.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaba_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vaba_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s8
-// CHECK: vaba.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabaq_s8(
+// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vabaq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s16
-// CHECK: vaba.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabaq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vabaq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s32
-// CHECK: vaba.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabaq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vabaq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u8
-// CHECK: vaba.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabaq_u8(
+// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vabaq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u16
-// CHECK: vaba.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabaq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vabaq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u32
-// CHECK: vaba.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabaq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vabaq_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vabal_s8
-// CHECK: vabal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabal_s8(
+// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vabal_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_s16
-// CHECK: vabal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vabal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_s32
-// CHECK: vabal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vabal_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u8
-// CHECK: vabal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabal_u8(
+// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vabal_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u16
-// CHECK: vabal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabal_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vabal_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u32
-// CHECK: vabal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabal_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vabal_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vabd_s8
-// CHECK: vabd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_s8(
+// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VABD_V_I]]
 int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
   return vabd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabd_s16
-// CHECK: vabd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VABD_V2_I]]
 int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
   return vabd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_s32
-// CHECK: vabd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VABD_V2_I]]
 int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
   return vabd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u8
-// CHECK: vabd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_u8(
+// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VABD_V_I]]
 uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
   return vabd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u16
-// CHECK: vabd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VABD_V2_I]]
 uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
   return vabd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u32
-// CHECK: vabd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VABD_V2_I]]
 uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
   return vabd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vabd_f32
-// CHECK: vabd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabd_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VABD_V2_I]]
 float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
   return vabd_f32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s8
-// CHECK: vabd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_s8(
+// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
 int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
   return vabdq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s16
-// CHECK: vabd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VABDQ_V2_I]]
 int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
   return vabdq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s32
-// CHECK: vabd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VABDQ_V2_I]]
 int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
   return vabdq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u8
-// CHECK: vabd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_u8(
+// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
 uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
   return vabdq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u16
-// CHECK: vabd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VABDQ_V2_I]]
 uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
   return vabdq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u32
-// CHECK: vabd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VABDQ_V2_I]]
 uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
   return vabdq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_f32
-// CHECK: vabd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabdq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VABDQ_V2_I]]
 float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
   return vabdq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vabdl_s8
-// CHECK: vabdl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabdl_s8(
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
   return vabdl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_s16
-// CHECK: vabdl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabdl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
   return vabdl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_s32
-// CHECK: vabdl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabdl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
   return vabdl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u8
-// CHECK: vabdl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabdl_u8(
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
   return vabdl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u16
-// CHECK: vabdl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabdl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
   return vabdl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u32
-// CHECK: vabdl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabdl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
   return vabdl_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vabs_s8
-// CHECK: vabs.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabs_s8(
+// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
   return vabs_s8(a);
 }
 
-// CHECK-LABEL: test_vabs_s16
-// CHECK: vabs.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabs_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4
+// CHECK:   ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
   return vabs_s16(a);
 }
 
-// CHECK-LABEL: test_vabs_s32
-// CHECK: vabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabs_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4
+// CHECK:   ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
   return vabs_s32(a);
 }
 
-// CHECK-LABEL: test_vabs_f32
-// CHECK: vabs.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vabs_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4
+// CHECK:   ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
   return vabs_f32(a);
 }
 
-// CHECK-LABEL: test_vabsq_s8
-// CHECK: vabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabsq_s8(
+// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
   return vabsq_s8(a);
 }
 
-// CHECK-LABEL: test_vabsq_s16
-// CHECK: vabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabsq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4
+// CHECK:   ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
   return vabsq_s16(a);
 }
 
-// CHECK-LABEL: test_vabsq_s32
-// CHECK: vabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabsq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4
+// CHECK:   ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
   return vabsq_s32(a);
 }
 
-// CHECK-LABEL: test_vabsq_f32
-// CHECK: vabs.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vabsq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4
+// CHECK:   ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
   return vabsq_f32(a);
 }
 
-
-// CHECK-LABEL: test_vadd_s8
-// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_s8(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
   return vadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s16
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_s16(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
   return vadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s32
-// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_s32(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
   return vadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s64
-// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_s64(
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
   return vadd_s64(a, b);
 }
 
-// CHECK-LABEL: test_vadd_f32
-// CHECK: vadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_f32(
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
   return vadd_f32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u8
-// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_u8(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
   return vadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u16
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_u16(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
   return vadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u32
-// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_u32(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
   return vadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u64
-// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vadd_u64(
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
   return vadd_u64(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s8
-// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_s8(
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
   return vaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s16
-// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_s16(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
   return vaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s32
-// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_s32(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
   return vaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s64
-// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_s64(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
   return vaddq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_f32
-// CHECK: vadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_f32(
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
   return vaddq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u8
-// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_u8(
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u16
-// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_u16(
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u32
-// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_u32(
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vaddq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u64
-// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddq_u64(
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vaddq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vaddhn_s16
-// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
   return vaddhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_s32
-// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
   return vaddhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_s64
-// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
   return vaddhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u16
-// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
   return vaddhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u32
-// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
   return vaddhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u64
-// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vaddhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
   return vaddhn_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vaddl_s8
-// CHECK: vaddl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddl_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
   return vaddl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_s16
-// CHECK: vaddl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
   return vaddl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_s32
-// CHECK: vaddl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
   return vaddl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u8
-// CHECK: vaddl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddl_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
   return vaddl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u16
-// CHECK: vaddl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
   return vaddl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u32
-// CHECK: vaddl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
   return vaddl_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vaddw_s8
-// CHECK: vaddw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddw_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
   return vaddw_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_s16
-// CHECK: vaddw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddw_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
   return vaddw_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_s32
-// CHECK: vaddw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddw_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
   return vaddw_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u8
-// CHECK: vaddw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddw_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
   return vaddw_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u16
-// CHECK: vaddw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddw_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
   return vaddw_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u32
-// CHECK: vaddw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vaddw_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
   return vaddw_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vand_s8
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_s8(
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
   return vand_s8(a, b);
 }
 
-// CHECK-LABEL: test_vand_s16
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_s16(
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
   return vand_s16(a, b);
 }
 
-// CHECK-LABEL: test_vand_s32
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_s32(
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
   return vand_s32(a, b);
 }
 
-// CHECK-LABEL: test_vand_s64
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_s64(
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
   return vand_s64(a, b);
 }
 
-// CHECK-LABEL: test_vand_u8
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_u8(
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
   return vand_u8(a, b);
 }
 
-// CHECK-LABEL: test_vand_u16
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_u16(
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
   return vand_u16(a, b);
 }
 
-// CHECK-LABEL: test_vand_u32
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_u32(
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
   return vand_u32(a, b);
 }
 
-// CHECK-LABEL: test_vand_u64
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vand_u64(
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
   return vand_u64(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s8
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_s8(
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
   return vandq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s16
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_s16(
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
   return vandq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s32
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_s32(
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
   return vandq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s64
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_s64(
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
   return vandq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u8
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_u8(
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
   return vandq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u16
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_u16(
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
   return vandq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u32
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_u32(
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
   return vandq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u64
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vandq_u64(
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
   return vandq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vbic_s8
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
   return vbic_s8(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s16
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
   return vbic_s16(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s32
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
   return vbic_s32(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s64
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_s64(
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
   return vbic_s64(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u8
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
   return vbic_u8(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u16
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
   return vbic_u16(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u32
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
   return vbic_u32(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u64
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbic_u64(
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
   return vbic_u64(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s8
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
   return vbicq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s16
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
   return vbicq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s32
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
   return vbicq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s64
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_s64(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
   return vbicq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u8
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
   return vbicq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u16
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
   return vbicq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u32
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
   return vbicq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u64
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbicq_u64(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
   return vbicq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vbsl_s8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_s8(
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VBSL_V_I]]
 int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
   return vbsl_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP3]]
 int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
   return vbsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
   return vbsl_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s64
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
   return vbsl_s64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_u8(
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VBSL_V_I]]
 uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vbsl_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP3]]
 uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vbsl_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP3]]
 uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vbsl_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u64
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
   return vbsl_u64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_f32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
   return vbsl_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_p8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_p8(
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VBSL_V_I]]
 poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
   return vbsl_p8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_p16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vbsl_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP3]]
 poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
   return vbsl_p16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_s8(
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
 int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
   return vbslq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP3]]
 int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
   return vbslq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
   return vbslq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s64
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
   return vbslq_s64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_u8(
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
 uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vbslq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP3]]
 uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vbslq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP3]]
 uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vbslq_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u64
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP3]]
 uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
   return vbslq_u64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_f32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
   return vbslq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_p8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_p8(
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
 poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
   return vbslq_p8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_p16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vbslq_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP3]]
 poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
   return vbslq_p16(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vcage_f32
-// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcage_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
 uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
   return vcage_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcageq_f32
-// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcageq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
 uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
   return vcageq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vcagt_f32
-// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcagt_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
 uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
   return vcagt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcagtq_f32
-// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcagtq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
 uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
   return vcagtq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vcale_f32
-// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcale_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a) #4
+// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
 uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
   return vcale_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcaleq_f32
-// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcaleq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a) #4
+// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
 uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
   return vcaleq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vcalt_f32
-// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcalt_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a) #4
+// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
 uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
   return vcalt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcaltq_f32
-// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcaltq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a) #4
+// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
 uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
   return vcaltq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vceq_s8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
   return vceq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vceq_s16
-// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
   return vceq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_s32
-// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
   return vceq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_f32
-// CHECK: vceq.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
   return vceq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
   return vceq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u16
-// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
   return vceq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u32
-// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
   return vceq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_p8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vceq_p8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
   return vceq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
   return vceqq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s16
-// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
   return vceqq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s32
-// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
   return vceqq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_f32
-// CHECK: vceq.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
   return vceqq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
   return vceqq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u16
-// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
   return vceqq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u32
-// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
   return vceqq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_p8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vceqq_p8(
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
   return vceqq_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vcge_s8
-// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
   return vcge_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcge_s16
-// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
   return vcge_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_s32
-// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
   return vcge_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcge_f32
-// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
   return vcge_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u8
-// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
   return vcge_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u16
-// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
   return vcge_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u32
-// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcge_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
   return vcge_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s8
-// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
   return vcgeq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s16
-// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
   return vcgeq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s32
-// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
   return vcgeq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_f32
-// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
   return vcgeq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u8
-// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
   return vcgeq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u16
-// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
   return vcgeq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u32
-// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgeq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
   return vcgeq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vcgt_s8
-// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
   return vcgt_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_s16
-// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
   return vcgt_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_s32
-// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
   return vcgt_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_f32
-// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
   return vcgt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u8
-// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
   return vcgt_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u16
-// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
   return vcgt_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u32
-// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcgt_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
   return vcgt_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s8
-// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
   return vcgtq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s16
-// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
   return vcgtq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s32
-// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
   return vcgtq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_f32
-// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
   return vcgtq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u8
-// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
   return vcgtq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u16
-// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
   return vcgtq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u32
-// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcgtq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
   return vcgtq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vcle_s8
-// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
   return vcle_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcle_s16
-// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
   return vcle_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_s32
-// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
   return vcle_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcle_f32
-// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
   return vcle_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u8
-// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
   return vcle_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u16
-// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
   return vcle_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u32
-// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcle_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
   return vcle_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s8
-// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
   return vcleq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s16
-// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
   return vcleq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s32
-// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
   return vcleq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_f32
-// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
   return vcleq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u8
-// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
   return vcleq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u16
-// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
   return vcleq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u32
-// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcleq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
   return vcleq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vcls_s8
-// CHECK: vcls.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcls_s8(
+// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
   return vcls_s8(a);
 }
 
-// CHECK-LABEL: test_vcls_s16
-// CHECK: vcls.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcls_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLS_V1_I]]
 int16x4_t test_vcls_s16(int16x4_t a) {
   return vcls_s16(a);
 }
 
-// CHECK-LABEL: test_vcls_s32
-// CHECK: vcls.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcls_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLS_V1_I]]
 int32x2_t test_vcls_s32(int32x2_t a) {
   return vcls_s32(a);
 }
 
-// CHECK-LABEL: test_vclsq_s8
-// CHECK: vcls.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclsq_s8(
+// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
   return vclsq_s8(a);
 }
 
-// CHECK-LABEL: test_vclsq_s16
-// CHECK: vcls.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclsq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VCLSQ_V1_I]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
   return vclsq_s16(a);
 }
 
-// CHECK-LABEL: test_vclsq_s32
-// CHECK: vcls.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclsq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VCLSQ_V1_I]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
   return vclsq_s32(a);
 }
 
-
-// CHECK-LABEL: test_vclt_s8
-// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
   return vclt_s8(a, b);
 }
 
-// CHECK-LABEL: test_vclt_s16
-// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
   return vclt_s16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_s32
-// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
   return vclt_s32(a, b);
 }
 
-// CHECK-LABEL: test_vclt_f32
-// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
   return vclt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u8
-// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
   return vclt_u8(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u16
-// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
   return vclt_u16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u32
-// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclt_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
   return vclt_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s8
-// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_s8(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
   return vcltq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s16
-// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_s16(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
   return vcltq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s32
-// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_s32(
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
   return vcltq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_f32
-// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_f32(
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
   return vcltq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u8
-// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_u8(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
   return vcltq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u16
-// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_u16(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
   return vcltq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u32
-// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcltq_u32(
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
   return vcltq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vclz_s8
-// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclz_s8(
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
   return vclz_s8(a);
 }
 
-// CHECK-LABEL: test_vclz_s16
-// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclz_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLZ_V1_I]]
 int16x4_t test_vclz_s16(int16x4_t a) {
   return vclz_s16(a);
 }
 
-// CHECK-LABEL: test_vclz_s32
-// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclz_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLZ_V1_I]]
 int32x2_t test_vclz_s32(int32x2_t a) {
   return vclz_s32(a);
 }
 
-// CHECK-LABEL: test_vclz_u8
-// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclz_u8(
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
   return vclz_u8(a);
 }
 
-// CHECK-LABEL: test_vclz_u16
-// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclz_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLZ_V1_I]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
   return vclz_u16(a);
 }
 
-// CHECK-LABEL: test_vclz_u32
-// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vclz_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLZ_V1_I]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
   return vclz_u32(a);
 }
 
-// CHECK-LABEL: test_vclzq_s8
-// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclzq_s8(
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
   return vclzq_s8(a);
 }
 
-// CHECK-LABEL: test_vclzq_s16
-// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclzq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VCLZQ_V1_I]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
   return vclzq_s16(a);
 }
 
-// CHECK-LABEL: test_vclzq_s32
-// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclzq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VCLZQ_V1_I]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
   return vclzq_s32(a);
 }
 
-// CHECK-LABEL: test_vclzq_u8
-// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclzq_u8(
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
   return vclzq_u8(a);
 }
 
-// CHECK-LABEL: test_vclzq_u16
-// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclzq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VCLZQ_V1_I]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
   return vclzq_u16(a);
 }
 
-// CHECK-LABEL: test_vclzq_u32
-// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vclzq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VCLZQ_V1_I]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
   return vclzq_u32(a);
 }
 
-
-// CHECK-LABEL: test_vcnt_u8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcnt_u8(
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
   return vcnt_u8(a);
 }
 
-// CHECK-LABEL: test_vcnt_s8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcnt_s8(
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
   return vcnt_s8(a);
 }
 
-// CHECK-LABEL: test_vcnt_p8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcnt_p8(
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
   return vcnt_p8(a);
 }
 
-// CHECK-LABEL: test_vcntq_u8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcntq_u8(
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
   return vcntq_u8(a);
 }
 
-// CHECK-LABEL: test_vcntq_s8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcntq_s8(
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
   return vcntq_s8(a);
 }
 
-// CHECK-LABEL: test_vcntq_p8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcntq_p8(
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
   return vcntq_p8(a);
 }
 
-
-// CHECK-LABEL: test_vcombine_s8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
   return vcombine_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
   return vcombine_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
   return vcombine_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s64
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
   return vcombine_s64(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_f16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_f16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I]]
 float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
   return vcombine_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_f32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
   return vcombine_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
   return vcombine_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
   return vcombine_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
   return vcombine_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u64
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
   return vcombine_u64(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_p8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
   return vcombine_p8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_p16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: @test_vcombine_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
   return vcombine_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vcreate_s8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vcreate_s8(uint64_t a) {
   return vclz_s8(vcreate_s8(a));
 }
 
-// CHECK-LABEL: test_vcreate_s16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLZ_V1_I]]
 int16x4_t test_vcreate_s16(uint64_t a) {
   return vclz_s16(vcreate_s16(a));
 }
 
-// CHECK-LABEL: test_vcreate_s32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLZ_V1_I]]
 int32x2_t test_vcreate_s32(uint64_t a) {
   return vclz_s32(vcreate_s32(a));
 }
 
-// CHECK-LABEL: test_vcreate_f16
+// CHECK-LABEL: @test_vcreate_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vcreate_f16(uint64_t a) {
   return vcreate_f16(a);
 }
 
-// CHECK-LABEL: test_vcreate_f32
+// CHECK-LABEL: @test_vcreate_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vcreate_f32(uint64_t a) {
   return vcreate_f32(a);
 }
 
-// CHECK-LABEL: test_vcreate_u8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vcreate_u8(uint64_t a) {
   return vclz_s8(vcreate_u8(a));
 }
 
-// CHECK-LABEL: test_vcreate_u16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VCLZ_V1_I]]
 uint16x4_t test_vcreate_u16(uint64_t a) {
   return vclz_s16(vcreate_u16(a));
 }
 
-// CHECK-LABEL: test_vcreate_u32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VCLZ_V1_I]]
 uint32x2_t test_vcreate_u32(uint64_t a) {
   return vclz_s32(vcreate_u32(a));
 }
 
-
-// We have two ways of lowering that.  Either with one 'vmov d, r, r' or
-// with two 'vmov d[],r'.  LLVM does the latter. We may want to be less
-// strict about the matching pattern if it starts causing problem.
-// CHECK-LABEL: test_vcreate_u64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: @test_vcreate_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vcreate_u64(uint64_t a) {
   uint64x1_t tmp = vcreate_u64(a);
   return vadd_u64(tmp, tmp);
-
 }
 
-// CHECK-LABEL: test_vcreate_p8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vcnt.8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: @test_vcreate_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcreate_p8(uint64_t a) {
   return vcnt_p8(vcreate_p8(a));
 }
 
-// CHECK-LABEL: test_vcreate_p16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
+// CHECK-LABEL: @test_vcreate_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) #4
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP4]]
 poly16x4_t test_vcreate_p16(uint64_t a) {
   poly16x4_t tmp = vcreate_p16(a);
   return vbsl_p16(tmp, tmp, tmp);
 }
 
-// CHECK-LABEL: test_vcreate_s64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: @test_vcreate_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vcreate_s64(uint64_t a) {
   int64x1_t tmp = vcreate_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-
-// CHECK-LABEL: test_vcvt_f16_f32
-// CHECK: vcvt.f16.f32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4
+// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vcvt_f16_f32(float32x4_t a) {
   return vcvt_f16_f32(a);
 }
 
-
-// CHECK-LABEL: test_vcvt_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
   return vcvt_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vcvt_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
   return vcvt_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
   return vcvtq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
   return vcvtq_f32_u32(a);
 }
 
-
-// CHECK-LABEL: test_vcvt_f32_f16
-// CHECK: vcvt.f32.f16
+// CHECK-LABEL: @test_vcvt_f32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
+// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VCVT_F32_F161_I]]
 float32x4_t test_vcvt_f32_f16(float16x4_t a) {
   return vcvt_f32_f16(a);
 }
 
-
-// CHECK-LABEL: test_vcvt_n_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_n_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
   return vcvt_n_f32_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvt_n_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_n_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
   return vcvt_n_f32_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_n_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
   return vcvtq_n_f32_s32(a, 3);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_n_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
   return vcvtq_n_f32_u32(a, 3);
 }
 
-
-// CHECK-LABEL: test_vcvt_n_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_n_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
   return vcvt_n_s32_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_n_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
   return vcvtq_n_s32_f32(a, 3);
 }
 
-
-// CHECK-LABEL: test_vcvt_n_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_n_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
   return vcvt_n_u32_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_n_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
   return vcvtq_n_u32_f32(a, 3);
 }
 
-
-// CHECK-LABEL: test_vcvt_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCVT_I]]
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
   return vcvt_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCVT_I]]
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
   return vcvtq_s32_f32(a);
 }
 
-
-// CHECK-LABEL: test_vcvt_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vcvt_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCVT_I]]
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
   return vcvt_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vcvtq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCVT_I]]
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
   return vcvtq_u32_f32(a);
 }
 
-
-// CHECK-LABEL: test_vdup_lane_u8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_u8(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
   return vdup_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_u16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
   return vdup_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_u32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE]]
 uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
   return vdup_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_s8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_s8(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 int8x8_t test_vdup_lane_s8(int8x8_t a) {
   return vdup_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_s16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 int16x4_t test_vdup_lane_s16(int16x4_t a) {
   return vdup_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_s32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE]]
 int32x2_t test_vdup_lane_s32(int32x2_t a) {
   return vdup_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_p8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_p8(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
   return vdup_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_p16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_p16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
   return vdup_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_f32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdup_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE]]
 float32x2_t test_vdup_lane_f32(float32x2_t a) {
   return vdup_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_u8(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
   return vdupq_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
   return vdupq_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[SHUFFLE]]
 uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
   return vdupq_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_s8(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 int8x16_t test_vdupq_lane_s8(int8x8_t a) {
   return vdupq_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 int16x8_t test_vdupq_lane_s16(int16x4_t a) {
   return vdupq_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[SHUFFLE]]
 int32x4_t test_vdupq_lane_s32(int32x2_t a) {
   return vdupq_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_p8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_p8(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
   return vdupq_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_p16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_p16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
   return vdupq_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_f32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vdupq_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x float> [[SHUFFLE]]
 float32x4_t test_vdupq_lane_f32(float32x2_t a) {
   return vdupq_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_s64
+// CHECK-LABEL: @test_vdup_lane_s64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 int64x1_t test_vdup_lane_s64(int64x1_t a) {
   return vdup_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vdup_lane_u64
+// CHECK-LABEL: @test_vdup_lane_u64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
   return vdup_lane_u64(a, 0);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: @test_vdupq_lane_s64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 int64x2_t test_vdupq_lane_s64(int64x1_t a) {
   return vdupq_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: @test_vdupq_lane_u64(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
   return vdupq_lane_u64(a, 0);
 }
 
-
-// CHECK-LABEL: test_vdup_n_u8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdup_n_u8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 uint8x8_t test_vdup_n_u8(uint8_t a) {
   return vdup_n_u8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_u16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdup_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 uint16x4_t test_vdup_n_u16(uint16_t a) {
   return vdup_n_u16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_u32
-// CHECK: mov 
+// CHECK-LABEL: @test_vdup_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 uint32x2_t test_vdup_n_u32(uint32_t a) {
   return vdup_n_u32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdup_n_s8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 int8x8_t test_vdup_n_s8(int8_t a) {
   return vdup_n_s8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdup_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 int16x4_t test_vdup_n_s16(int16_t a) {
   return vdup_n_s16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s32
-// CHECK: mov 
+// CHECK-LABEL: @test_vdup_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 int32x2_t test_vdup_n_s32(int32_t a) {
   return vdup_n_s32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_p8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdup_n_p8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 poly8x8_t test_vdup_n_p8(poly8_t a) {
   return vdup_n_p8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_p16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdup_n_p16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 poly16x4_t test_vdup_n_p16(poly16_t a) {
   return vdup_n_p16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: @test_vdup_n_f16(
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT3]]
 float16x4_t test_vdup_n_f16(float16_t *a) {
   return vdup_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vdup_n_f32
-// CHECK: mov 
+// CHECK-LABEL: @test_vdup_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VECINIT1_I]]
 float32x2_t test_vdup_n_f32(float32_t a) {
   return vdup_n_f32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_u8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 uint8x16_t test_vdupq_n_u8(uint8_t a) {
   return vdupq_n_u8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 uint16x8_t test_vdupq_n_u16(uint16_t a) {
   return vdupq_n_u16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 uint32x4_t test_vdupq_n_u32(uint32_t a) {
   return vdupq_n_u32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_s8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 int8x16_t test_vdupq_n_s8(int8_t a) {
   return vdupq_n_s8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 int16x8_t test_vdupq_n_s16(int16_t a) {
   return vdupq_n_s16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 int32x4_t test_vdupq_n_s32(int32_t a) {
   return vdupq_n_s32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_p8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_p8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 poly8x16_t test_vdupq_n_p8(poly8_t a) {
   return vdupq_n_p8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_p16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_p16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 poly16x8_t test_vdupq_n_p16(poly16_t a) {
   return vdupq_n_p16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: @test_vdupq_n_f16(
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT7]]
 float16x8_t test_vdupq_n_f16(float16_t *a) {
   return vdupq_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vdupq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
 float32x4_t test_vdupq_n_f32(float32_t a) {
   return vdupq_n_f32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s64
-// CHECK: vmov
+// CHECK-LABEL: @test_vdup_n_s64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vdup_n_s64(int64_t a) {
   int64x1_t tmp = vdup_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vdup_n_u64
-// CHECK: vmov
+// CHECK-LABEL: @test_vdup_n_u64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vdup_n_u64(uint64_t a) {
   int64x1_t tmp = vdup_n_u64(a);
   return vadd_s64(tmp, tmp);
-
 }
 
-// CHECK-LABEL: test_vdupq_n_s64
-// CHECK: vmov
+// CHECK-LABEL: @test_vdupq_n_s64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vdupq_n_s64(int64_t a) {
   int64x2_t tmp = vdupq_n_s64(a);
   return vaddq_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vdupq_n_u64
-// CHECK: vmov
+// CHECK-LABEL: @test_vdupq_n_u64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vdupq_n_u64(uint64_t a) {
   int64x2_t tmp = vdupq_n_u64(a);
   return vaddq_u64(tmp, tmp);
 }
 
-
-// CHECK-LABEL: test_veor_s8
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_s8(
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
   return veor_s8(a, b);
 }
 
-// CHECK-LABEL: test_veor_s16
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_s16(
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
   return veor_s16(a, b);
 }
 
-// CHECK-LABEL: test_veor_s32
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_s32(
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
   return veor_s32(a, b);
 }
 
-// CHECK-LABEL: test_veor_s64
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_s64(
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
   return veor_s64(a, b);
 }
 
-// CHECK-LABEL: test_veor_u8
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_u8(
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
   return veor_u8(a, b);
 }
 
-// CHECK-LABEL: test_veor_u16
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_u16(
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
   return veor_u16(a, b);
 }
 
-// CHECK-LABEL: test_veor_u32
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_u32(
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
   return veor_u32(a, b);
 }
 
-// CHECK-LABEL: test_veor_u64
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_veor_u64(
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
   return veor_u64(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s8
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_s8(
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
   return veorq_s8(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s16
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_s16(
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
   return veorq_s16(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s32
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_s32(
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
   return veorq_s32(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s64
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_s64(
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
   return veorq_s64(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u8
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_u8(
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
   return veorq_u8(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u16
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_u16(
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
   return veorq_u16(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u32
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_u32(
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
   return veorq_u32(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u64
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_veorq_u64(
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
   return veorq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vext_s8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_s8(
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
   return vext_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_u8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_u8(
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
   return vext_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_p8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_p8(
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
   return vext_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_s16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
   return vext_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_u16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
   return vext_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_p16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
   return vext_p16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_s32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
   return vext_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vext_u32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
   return vext_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vext_s64
+// CHECK-LABEL: @test_vext_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
   return vext_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vext_u64
+// CHECK-LABEL: @test_vext_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
   return vext_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vext_f32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vext_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x float> [[VEXT]]
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
   return vext_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_s8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_s8(
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
   return vextq_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_u8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_u8(
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
   return vextq_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_p8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_p8(
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
   return vextq_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_s16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
   return vextq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_u16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
   return vextq_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_p16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
   return vextq_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_s32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i32> [[VEXT]]
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
   return vextq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vextq_u32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i32> [[VEXT]]
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
   return vextq_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vextq_s64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: @test_vextq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
   return vextq_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_u64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: @test_vextq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
   return vextq_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_f32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vextq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x float> [[VEXT]]
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
   return vextq_f32(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vfma_f32
-// CHECK: vfma.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vfma_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a) #4
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vfma_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmaq_f32
-// CHECK: vfma.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vfmaq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a) #4
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vfmaq_f32(a, b, c);
 }
 
-// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK-LABEL: @test_vfms_f32(
 // CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
-// CHECK:   ret <2 x float> [[TMP6]]
+// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a) #4
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vfms_f32(a, b, c);
 }
 
-// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK-LABEL: @test_vfmsq_f32(
 // CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
-// CHECK:   ret <4 x float> [[TMP6]]
+// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a) #4
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vfmsq_f32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vget_high_s8
+// CHECK-LABEL: @test_vget_high_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_high_s8(int8x16_t a) {
   return vget_high_s8(a);
 }
 
-// CHECK-LABEL: test_vget_high_s16
+// CHECK-LABEL: @test_vget_high_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_high_s16(int16x8_t a) {
   return vget_high_s16(a);
 }
 
-// CHECK-LABEL: test_vget_high_s32
+// CHECK-LABEL: @test_vget_high_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_high_s32(int32x4_t a) {
   return vget_high_s32(a);
 }
 
-// CHECK-LABEL: test_vget_high_s64
+// CHECK-LABEL: @test_vget_high_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_high_s64(int64x2_t a) {
   return vget_high_s64(a);
 }
 
-// CHECK-LABEL: test_vget_high_f16
+// CHECK-LABEL: @test_vget_high_f16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_high_f16(float16x8_t a) {
   return vget_high_f16(a);
 }
 
-// CHECK-LABEL: test_vget_high_f32
+// CHECK-LABEL: @test_vget_high_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_high_f32(float32x4_t a) {
   return vget_high_f32(a);
 }
 
-// CHECK-LABEL: test_vget_high_u8
+// CHECK-LABEL: @test_vget_high_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_high_u8(uint8x16_t a) {
   return vget_high_u8(a);
 }
 
-// CHECK-LABEL: test_vget_high_u16
+// CHECK-LABEL: @test_vget_high_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_high_u16(uint16x8_t a) {
   return vget_high_u16(a);
 }
 
-// CHECK-LABEL: test_vget_high_u32
+// CHECK-LABEL: @test_vget_high_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_high_u32(uint32x4_t a) {
   return vget_high_u32(a);
 }
 
-// CHECK-LABEL: test_vget_high_u64
+// CHECK-LABEL: @test_vget_high_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_high_u64(uint64x2_t a) {
   return vget_high_u64(a);
 }
 
-// CHECK-LABEL: test_vget_high_p8
+// CHECK-LABEL: @test_vget_high_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_high_p8(poly8x16_t a) {
   return vget_high_p8(a);
 }
 
-// CHECK-LABEL: test_vget_high_p16
+// CHECK-LABEL: @test_vget_high_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_high_p16(poly16x8_t a) {
   return vget_high_p16(a);
 }
 
-
-// CHECK-LABEL: test_vget_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_u8(
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vget_lane_u8(uint8x8_t a) {
   return vget_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vget_lane_u16(uint16x4_t a) {
   return vget_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_u32
-// CHECK: mov 
+// CHECK-LABEL: @test_vget_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vget_lane_u32(uint32x2_t a) {
   return vget_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_s8(
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vget_lane_s8(int8x8_t a) {
   return vget_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vget_lane_s16(int16x4_t a) {
   return vget_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_s32
-// CHECK: mov 
+// CHECK-LABEL: @test_vget_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vget_lane_s32(int32x2_t a) {
   return vget_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_p8(
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vget_lane_p8(poly8x8_t a) {
   return vget_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vget_lane_p16(poly16x4_t a) {
   return vget_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vget_lane_f32(float32x2_t a) {
   return vget_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vget_lane_f16(
+// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
+// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vget_lane_f16(float16x4_t a) {
   return vget_lane_f16(a, 1);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_u8(
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
   return vgetq_lane_u8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
   return vgetq_lane_u16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
   return vgetq_lane_u32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_s8(
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vgetq_lane_s8(int8x16_t a) {
   return vgetq_lane_s8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vgetq_lane_s16(int16x8_t a) {
   return vgetq_lane_s16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vgetq_lane_s32(int32x4_t a) {
   return vgetq_lane_s32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_p8(
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
   return vgetq_lane_p8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
   return vgetq_lane_p16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vgetq_lane_f32(float32x4_t a) {
   return vgetq_lane_f32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_f16(
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_244]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vgetq_lane_f16(float16x8_t a) {
   return vgetq_lane_f16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_s64
-// The optimizer is able to remove all moves now.
+// CHECK-LABEL: @test_vget_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vget_lane_s64(int64x1_t a) {
   return vget_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vget_lane_u64
-// The optimizer is able to remove all moves now.
+// CHECK-LABEL: @test_vget_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vget_lane_u64(uint64x1_t a) {
   return vget_lane_u64(a, 0);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s64
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vgetq_lane_s64(int64x2_t a) {
   return vgetq_lane_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u64
-// CHECK: vmov 
+// CHECK-LABEL: @test_vgetq_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
   return vgetq_lane_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vget_low_s8
+// CHECK-LABEL: @test_vget_low_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_low_s8(int8x16_t a) {
   return vget_low_s8(a);
 }
 
-// CHECK-LABEL: test_vget_low_s16
+// CHECK-LABEL: @test_vget_low_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_low_s16(int16x8_t a) {
   return vget_low_s16(a);
 }
 
-// CHECK-LABEL: test_vget_low_s32
+// CHECK-LABEL: @test_vget_low_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_low_s32(int32x4_t a) {
   return vget_low_s32(a);
 }
 
-// CHECK-LABEL: test_vget_low_s64
+// CHECK-LABEL: @test_vget_low_s64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_low_s64(int64x2_t a) {
   return vget_low_s64(a);
 }
 
-// CHECK-LABEL: test_vget_low_f16
+// CHECK-LABEL: @test_vget_low_f16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_low_f16(float16x8_t a) {
   return vget_low_f16(a);
 }
 
-// CHECK-LABEL: test_vget_low_f32
+// CHECK-LABEL: @test_vget_low_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_low_f32(float32x4_t a) {
   return vget_low_f32(a);
 }
 
-// CHECK-LABEL: test_vget_low_u8
+// CHECK-LABEL: @test_vget_low_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_low_u8(uint8x16_t a) {
   return vget_low_u8(a);
 }
 
-// CHECK-LABEL: test_vget_low_u16
+// CHECK-LABEL: @test_vget_low_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_low_u16(uint16x8_t a) {
   return vget_low_u16(a);
 }
 
-// CHECK-LABEL: test_vget_low_u32
+// CHECK-LABEL: @test_vget_low_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_low_u32(uint32x4_t a) {
   return vget_low_u32(a);
 }
 
-// CHECK-LABEL: test_vget_low_u64
+// CHECK-LABEL: @test_vget_low_u64(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_low_u64(uint64x2_t a) {
   return vget_low_u64(a);
 }
 
-// CHECK-LABEL: test_vget_low_p8
+// CHECK-LABEL: @test_vget_low_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_low_p8(poly8x16_t a) {
   return vget_low_p8(a);
 }
 
-// CHECK-LABEL: test_vget_low_p16
+// CHECK-LABEL: @test_vget_low_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_low_p16(poly16x8_t a) {
   return vget_low_p16(a);
 }
 
-
-// CHECK-LABEL: test_vhadd_s8
-// CHECK: vhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhadd_s8(
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
   return vhadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_s16
-// CHECK: vhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHADD_V2_I]]
 int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
   return vhadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_s32
-// CHECK: vhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHADD_V2_I]]
 int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
   return vhadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u8
-// CHECK: vhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhadd_u8(
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vhadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u16
-// CHECK: vhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHADD_V2_I]]
 uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vhadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u32
-// CHECK: vhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHADD_V2_I]]
 uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vhadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s8
-// CHECK: vhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhaddq_s8(
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
   return vhaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s16
-// CHECK: vhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHADDQ_V2_I]]
 int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
   return vhaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s32
-// CHECK: vhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHADDQ_V2_I]]
 int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
   return vhaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u8
-// CHECK: vhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhaddq_u8(
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vhaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u16
-// CHECK: vhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHADDQ_V2_I]]
 uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vhaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u32
-// CHECK: vhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHADDQ_V2_I]]
 uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vhaddq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vhsub_s8
-// CHECK: vhsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhsub_s8(
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
   return vhsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_s16
-// CHECK: vhsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhsub_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHSUB_V2_I]]
 int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
   return vhsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_s32
-// CHECK: vhsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhsub_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHSUB_V2_I]]
 int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
   return vhsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u8
-// CHECK: vhsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhsub_u8(
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
   return vhsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u16
-// CHECK: vhsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhsub_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VHSUB_V2_I]]
 uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
   return vhsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u32
-// CHECK: vhsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vhsub_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VHSUB_V2_I]]
 uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
   return vhsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s8
-// CHECK: vhsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhsubq_s8(
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
   return vhsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s16
-// CHECK: vhsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhsubq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHSUBQ_V2_I]]
 int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
   return vhsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s32
-// CHECK: vhsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhsubq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHSUBQ_V2_I]]
 int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
   return vhsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u8
-// CHECK: vhsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhsubq_u8(
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vhsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u16
-// CHECK: vhsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhsubq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VHSUBQ_V2_I]]
 uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vhsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u32
-// CHECK: vhsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vhsubq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VHSUBQ_V2_I]]
 uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vhsubq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vld1q_u8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_u8(
+// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <16 x i8> [[VLD1]]
 uint8x16_t test_vld1q_u8(uint8_t const * a) {
   return vld1q_u8(a);
 }
 
-// CHECK-LABEL: test_vld1q_u16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x i16> [[VLD1]]
 uint16x8_t test_vld1q_u16(uint16_t const * a) {
   return vld1q_u16(a);
 }
 
-// CHECK-LABEL: test_vld1q_u32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <4 x i32> [[VLD1]]
 uint32x4_t test_vld1q_u32(uint32_t const * a) {
   return vld1q_u32(a);
 }
 
-// CHECK-LABEL: test_vld1q_u64
-// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vld1q_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i64> [[VLD1]]
 uint64x2_t test_vld1q_u64(uint64_t const * a) {
   return vld1q_u64(a);
 }
 
-// CHECK-LABEL: test_vld1q_s8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_s8(
+// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <16 x i8> [[VLD1]]
 int8x16_t test_vld1q_s8(int8_t const * a) {
   return vld1q_s8(a);
 }
 
-// CHECK-LABEL: test_vld1q_s16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x i16> [[VLD1]]
 int16x8_t test_vld1q_s16(int16_t const * a) {
   return vld1q_s16(a);
 }
 
-// CHECK-LABEL: test_vld1q_s32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <4 x i32> [[VLD1]]
 int32x4_t test_vld1q_s32(int32_t const * a) {
   return vld1q_s32(a);
 }
 
-// CHECK-LABEL: test_vld1q_s64
-// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vld1q_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i64> [[VLD1]]
 int64x2_t test_vld1q_s64(int64_t const * a) {
   return vld1q_s64(a);
 }
 
-// CHECK-LABEL: test_vld1q_f16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP1]]
 float16x8_t test_vld1q_f16(float16_t const * a) {
   return vld1q_f16(a);
 }
 
-// CHECK-LABEL: test_vld1q_f32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <4 x float> [[VLD1]]
 float32x4_t test_vld1q_f32(float32_t const * a) {
   return vld1q_f32(a);
 }
 
-// CHECK-LABEL: test_vld1q_p8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_p8(
+// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <16 x i8> [[VLD1]]
 poly8x16_t test_vld1q_p8(poly8_t const * a) {
   return vld1q_p8(a);
 }
 
-// CHECK-LABEL: test_vld1q_p16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x i16> [[VLD1]]
 poly16x8_t test_vld1q_p16(poly16_t const * a) {
   return vld1q_p16(a);
 }
 
-// CHECK-LABEL: test_vld1_u8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_u8(
+// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <8 x i8> [[VLD1]]
 uint8x8_t test_vld1_u8(uint8_t const * a) {
   return vld1_u8(a);
 }
 
-// CHECK-LABEL: test_vld1_u16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x i16> [[VLD1]]
 uint16x4_t test_vld1_u16(uint16_t const * a) {
   return vld1_u16(a);
 }
 
-// CHECK-LABEL: test_vld1_u32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i32> [[VLD1]]
 uint32x2_t test_vld1_u32(uint32_t const * a) {
   return vld1_u32(a);
 }
 
-// CHECK-LABEL: test_vld1_u64
-// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vld1_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <1 x i64> [[VLD1]]
 uint64x1_t test_vld1_u64(uint64_t const * a) {
   return vld1_u64(a);
 }
 
-// CHECK-LABEL: test_vld1_s8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_s8(
+// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <8 x i8> [[VLD1]]
 int8x8_t test_vld1_s8(int8_t const * a) {
   return vld1_s8(a);
 }
 
-// CHECK-LABEL: test_vld1_s16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x i16> [[VLD1]]
 int16x4_t test_vld1_s16(int16_t const * a) {
   return vld1_s16(a);
 }
 
-// CHECK-LABEL: test_vld1_s32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i32> [[VLD1]]
 int32x2_t test_vld1_s32(int32_t const * a) {
   return vld1_s32(a);
 }
 
-// CHECK-LABEL: test_vld1_s64
-// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vld1_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <1 x i64> [[VLD1]]
 int64x1_t test_vld1_s64(int64_t const * a) {
   return vld1_s64(a);
 }
 
-// CHECK-LABEL: test_vld1_f16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vld1_f16(float16_t const * a) {
   return vld1_f16(a);
 }
 
-// CHECK-LABEL: test_vld1_f32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x float> [[VLD1]]
 float32x2_t test_vld1_f32(float32_t const * a) {
   return vld1_f32(a);
 }
 
-// CHECK-LABEL: test_vld1_p8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_p8(
+// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <8 x i8> [[VLD1]]
 poly8x8_t test_vld1_p8(poly8_t const * a) {
   return vld1_p8(a);
 }
 
-// CHECK-LABEL: test_vld1_p16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x i16> [[VLD1]]
 poly16x4_t test_vld1_p16(poly16_t const * a) {
   return vld1_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld1q_dup_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_dup_u8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
   return vld1q_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_dup_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
   return vld1q_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1q_dup_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
   return vld1q_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1q_dup_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
   return vld1q_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_dup_s8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t const * a) {
   return vld1q_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_dup_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t const * a) {
   return vld1q_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1q_dup_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t const * a) {
   return vld1q_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1q_dup_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t const * a) {
   return vld1q_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_dup_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t const * a) {
   return vld1q_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1q_dup_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t const * a) {
   return vld1q_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_dup_p8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
   return vld1q_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_dup_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
   return vld1q_dup_p16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_dup_u8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
   return vld1_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_dup_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
   return vld1_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1_dup_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
   return vld1_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1_dup_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
   return vld1_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_dup_s8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t const * a) {
   return vld1_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_dup_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t const * a) {
   return vld1_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1_dup_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t const * a) {
   return vld1_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1_dup_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t const * a) {
   return vld1_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_dup_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t const * a) {
   return vld1_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1_dup_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t const * a) {
   return vld1_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_dup_p8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
   return vld1_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_dup_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
   return vld1_dup_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld1q_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_lane_u8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
   return vld1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
   return vld1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1q_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
   return vld1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1q_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
 uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
   return vld1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_lane_s8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
   return vld1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
   return vld1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1q_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
   return vld1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1q_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
 int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
   return vld1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1q_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_lane_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1q_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK:   ret <4 x float> [[VLD1_LANE]]
 float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
   return vld1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1q_lane_p8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
   return vld1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1q_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
   return vld1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_lane_u8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
   return vld1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
   return vld1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
   return vld1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
   return vld1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vld1_lane_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_lane_s8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
   return vld1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
   return vld1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
   return vld1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: @test_vld1_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
   return vld1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vld1_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_lane_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
   return vld1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vld1_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK:   ret <2 x float> [[VLD1_LANE]]
 float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
   return vld1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld1_lane_p8(
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
   return vld1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vld1_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
   return vld1_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vld2q_u8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
   return vld2q_u8(a);
 }
 
-// CHECK-LABEL: test_vld2q_u16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
 uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
   return vld2q_u16(a);
 }
 
-// CHECK-LABEL: test_vld2q_u32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
 uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
   return vld2q_u32(a);
 }
 
-// CHECK-LABEL: test_vld2q_s8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 int8x16x2_t test_vld2q_s8(int8_t const * a) {
   return vld2q_s8(a);
 }
 
-// CHECK-LABEL: test_vld2q_s16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
 int16x8x2_t test_vld2q_s16(int16_t const * a) {
   return vld2q_s16(a);
 }
 
-// CHECK-LABEL: test_vld2q_s32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
 int32x4x2_t test_vld2q_s32(int32_t const * a) {
   return vld2q_s32(a);
 }
 
-// CHECK-LABEL: test_vld2q_f16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
 float16x8x2_t test_vld2q_f16(float16_t const * a) {
   return vld2q_f16(a);
 }
 
-// CHECK-LABEL: test_vld2q_f32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
 float32x4x2_t test_vld2q_f32(float32_t const * a) {
   return vld2q_f32(a);
 }
 
-// CHECK-LABEL: test_vld2q_p8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
   return vld2q_p8(a);
 }
 
-// CHECK-LABEL: test_vld2q_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
 poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
   return vld2q_p16(a);
 }
 
-// CHECK-LABEL: test_vld2_u8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
 uint8x8x2_t test_vld2_u8(uint8_t const * a) {
   return vld2_u8(a);
 }
 
-// CHECK-LABEL: test_vld2_u16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
 uint16x4x2_t test_vld2_u16(uint16_t const * a) {
   return vld2_u16(a);
 }
 
-// CHECK-LABEL: test_vld2_u32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
 uint32x2x2_t test_vld2_u32(uint32_t const * a) {
   return vld2_u32(a);
 }
 
-// CHECK-LABEL: test_vld2_u64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld2_u64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
 uint64x1x2_t test_vld2_u64(uint64_t const * a) {
   return vld2_u64(a);
 }
 
-// CHECK-LABEL: test_vld2_s8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
 int8x8x2_t test_vld2_s8(int8_t const * a) {
   return vld2_s8(a);
 }
 
-// CHECK-LABEL: test_vld2_s16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
 int16x4x2_t test_vld2_s16(int16_t const * a) {
   return vld2_s16(a);
 }
 
-// CHECK-LABEL: test_vld2_s32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
 int32x2x2_t test_vld2_s32(int32_t const * a) {
   return vld2_s32(a);
 }
 
-// CHECK-LABEL: test_vld2_s64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld2_s64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
 int64x1x2_t test_vld2_s64(int64_t const * a) {
   return vld2_s64(a);
 }
 
-// CHECK-LABEL: test_vld2_f16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
 float16x4x2_t test_vld2_f16(float16_t const * a) {
   return vld2_f16(a);
 }
 
-// CHECK-LABEL: test_vld2_f32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <2 x float>, <2 x float>
 float32x2x2_t test_vld2_f32(float32_t const * a) {
   return vld2_f32(a);
 }
 
-// CHECK-LABEL: test_vld2_p8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
 poly8x8x2_t test_vld2_p8(poly8_t const * a) {
   return vld2_p8(a);
 }
 
-// CHECK-LABEL: test_vld2_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
 poly16x4x2_t test_vld2_p16(poly16_t const * a) {
   return vld2_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld2_dup_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
 uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
   return vld2_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
 uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
   return vld2_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>
 uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
   return vld2_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld2_dup_u64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>
 uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
   return vld2_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
 int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
   return vld2_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
 int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
   return vld2_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>
 int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
   return vld2_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld2_dup_s64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>
 int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
   return vld2_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
 float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
   return vld2_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>
 float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
   return vld2_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
 poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
   return vld2_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_dup_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
 poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
   return vld2_dup_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld2q_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
 uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
   return vld2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
 uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
   return vld2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
 int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
   return vld2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
 int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
   return vld2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
 float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
   return vld2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>
 float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
   return vld2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2q_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
 poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
   return vld2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
 uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
   return vld2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
 uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
   return vld2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
 uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
   return vld2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
 int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
   return vld2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
 int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
   return vld2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
 int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
   return vld2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
 float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
   return vld2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float>
 float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
   return vld2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
 poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
   return vld2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld2_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
 poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
   return vld2_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vld3q_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
   return vld3q_u8(a);
 }
 
-// CHECK-LABEL: test_vld3q_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
   return vld3q_u16(a);
 }
 
-// CHECK-LABEL: test_vld3q_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
 uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
   return vld3q_u32(a);
 }
 
-// CHECK-LABEL: test_vld3q_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 int8x16x3_t test_vld3q_s8(int8_t const * a) {
   return vld3q_s8(a);
 }
 
-// CHECK-LABEL: test_vld3q_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 int16x8x3_t test_vld3q_s16(int16_t const * a) {
   return vld3q_s16(a);
 }
 
-// CHECK-LABEL: test_vld3q_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
 int32x4x3_t test_vld3q_s32(int32_t const * a) {
   return vld3q_s32(a);
 }
 
-// CHECK-LABEL: test_vld3q_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x3_t test_vld3q_f16(float16_t const * a) {
   return vld3q_f16(a);
 }
 
-// CHECK-LABEL: test_vld3q_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
 float32x4x3_t test_vld3q_f32(float32_t const * a) {
   return vld3q_f32(a);
 }
 
-// CHECK-LABEL: test_vld3q_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
   return vld3q_p8(a);
 }
 
-// CHECK-LABEL: test_vld3q_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld3q_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
   return vld3q_p16(a);
 }
 
-// CHECK-LABEL: test_vld3_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 uint8x8x3_t test_vld3_u8(uint8_t const * a) {
   return vld3_u8(a);
 }
 
-// CHECK-LABEL: test_vld3_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 uint16x4x3_t test_vld3_u16(uint16_t const * a) {
   return vld3_u16(a);
 }
 
-// CHECK-LABEL: test_vld3_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
 uint32x2x3_t test_vld3_u32(uint32_t const * a) {
   return vld3_u32(a);
 }
 
-// CHECK-LABEL: test_vld3_u64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld3_u64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
 uint64x1x3_t test_vld3_u64(uint64_t const * a) {
   return vld3_u64(a);
 }
 
-// CHECK-LABEL: test_vld3_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 int8x8x3_t test_vld3_s8(int8_t const * a) {
   return vld3_s8(a);
 }
 
-// CHECK-LABEL: test_vld3_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 int16x4x3_t test_vld3_s16(int16_t const * a) {
   return vld3_s16(a);
 }
 
-// CHECK-LABEL: test_vld3_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
 int32x2x3_t test_vld3_s32(int32_t const * a) {
   return vld3_s32(a);
 }
 
-// CHECK-LABEL: test_vld3_s64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld3_s64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
 int64x1x3_t test_vld3_s64(int64_t const * a) {
   return vld3_s64(a);
 }
 
-// CHECK-LABEL: test_vld3_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x3_t test_vld3_f16(float16_t const * a) {
   return vld3_f16(a);
 }
 
-// CHECK-LABEL: test_vld3_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
 float32x2x3_t test_vld3_f32(float32_t const * a) {
   return vld3_f32(a);
 }
 
-// CHECK-LABEL: test_vld3_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 poly8x8x3_t test_vld3_p8(poly8_t const * a) {
   return vld3_p8(a);
 }
 
-// CHECK-LABEL: test_vld3_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 poly16x4x3_t test_vld3_p16(poly16_t const * a) {
   return vld3_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld3_dup_u8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
   return vld3_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
   return vld3_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
 uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
   return vld3_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld3_dup_u64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
 uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
   return vld3_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
   return vld3_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
   return vld3_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
 int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
   return vld3_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld3_dup_s64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
 int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
   return vld3_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
   return vld3_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
 float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
   return vld3_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
   return vld3_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_dup_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
   return vld3_dup_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld3q_lane_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
   return vld3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
 uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
   return vld3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
   return vld3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
 int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
   return vld3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
   return vld3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
 float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
   return vld3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld3q_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
   return vld3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_u8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
   return vld3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
   return vld3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
 uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
   return vld3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
   return vld3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
   return vld3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
 int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
   return vld3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
   return vld3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
 float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
   return vld3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
 poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
   return vld3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld3_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
   return vld3_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vld4q_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
   return vld4q_u8(a);
 }
 
-// CHECK-LABEL: test_vld4q_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
   return vld4q_u16(a);
 }
 
-// CHECK-LABEL: test_vld4q_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
 uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
   return vld4q_u32(a);
 }
 
-// CHECK-LABEL: test_vld4q_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 int8x16x4_t test_vld4q_s8(int8_t const * a) {
   return vld4q_s8(a);
 }
 
-// CHECK-LABEL: test_vld4q_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 int16x8x4_t test_vld4q_s16(int16_t const * a) {
   return vld4q_s16(a);
 }
 
-// CHECK-LABEL: test_vld4q_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
 int32x4x4_t test_vld4q_s32(int32_t const * a) {
   return vld4q_s32(a);
 }
 
-// CHECK-LABEL: test_vld4q_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x4_t test_vld4q_f16(float16_t const * a) {
   return vld4q_f16(a);
 }
 
-// CHECK-LABEL: test_vld4q_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
 float32x4x4_t test_vld4q_f32(float32_t const * a) {
   return vld4q_f32(a);
 }
 
-// CHECK-LABEL: test_vld4q_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
   return vld4q_p8(a);
 }
 
-// CHECK-LABEL: test_vld4q_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vld4q_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
   return vld4q_p16(a);
 }
 
-// CHECK-LABEL: test_vld4_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 uint8x8x4_t test_vld4_u8(uint8_t const * a) {
   return vld4_u8(a);
 }
 
-// CHECK-LABEL: test_vld4_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 uint16x4x4_t test_vld4_u16(uint16_t const * a) {
   return vld4_u16(a);
 }
 
-// CHECK-LABEL: test_vld4_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
 uint32x2x4_t test_vld4_u32(uint32_t const * a) {
   return vld4_u32(a);
 }
 
-// CHECK-LABEL: test_vld4_u64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld4_u64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
 uint64x1x4_t test_vld4_u64(uint64_t const * a) {
   return vld4_u64(a);
 }
 
-// CHECK-LABEL: test_vld4_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 int8x8x4_t test_vld4_s8(int8_t const * a) {
   return vld4_s8(a);
 }
 
-// CHECK-LABEL: test_vld4_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 int16x4x4_t test_vld4_s16(int16_t const * a) {
   return vld4_s16(a);
 }
 
-// CHECK-LABEL: test_vld4_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
 int32x2x4_t test_vld4_s32(int32_t const * a) {
   return vld4_s32(a);
 }
 
-// CHECK-LABEL: test_vld4_s64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld4_s64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
 int64x1x4_t test_vld4_s64(int64_t const * a) {
   return vld4_s64(a);
 }
 
-// CHECK-LABEL: test_vld4_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x4_t test_vld4_f16(float16_t const * a) {
   return vld4_f16(a);
 }
 
-// CHECK-LABEL: test_vld4_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
 float32x2x4_t test_vld4_f32(float32_t const * a) {
   return vld4_f32(a);
 }
 
-// CHECK-LABEL: test_vld4_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 poly8x8x4_t test_vld4_p8(poly8_t const * a) {
   return vld4_p8(a);
 }
 
-// CHECK-LABEL: test_vld4_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 poly16x4x4_t test_vld4_p16(poly16_t const * a) {
   return vld4_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld4_dup_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_u8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
   return vld4_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_u16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
   return vld4_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_u32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
 uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
   return vld4_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld4_dup_u64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
 uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
   return vld4_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_s8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
   return vld4_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_s16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
   return vld4_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_s32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
 int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
   return vld4_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: @test_vld4_dup_s64(
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
 int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
   return vld4_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_f16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
   return vld4_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_f32(
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
 float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
   return vld4_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_p8(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
   return vld4_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_dup_p16(
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
   return vld4_dup_p16(a);
 }
 
-
-// CHECK-LABEL: test_vld4q_lane_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
   return vld4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
 uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
   return vld4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
   return vld4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
 int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
   return vld4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
   return vld4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
 float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
   return vld4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vld4q_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
   return vld4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
   return vld4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
   return vld4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
 uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
   return vld4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
   return vld4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
   return vld4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
 int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
   return vld4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
 float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
   return vld4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
 poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
   return vld4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vld4_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
   return vld4_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vmax_s8
-// CHECK: vmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_s8(
+// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_V_I]]
 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
   return vmax_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmax_s16
-// CHECK: vmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VMAX_V2_I]]
 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
   return vmax_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_s32
-// CHECK: vmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VMAX_V2_I]]
 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
   return vmax_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u8
-// CHECK: vmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_u8(
+// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_V_I]]
 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
   return vmax_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u16
-// CHECK: vmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VMAX_V2_I]]
 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
   return vmax_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u32
-// CHECK: vmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VMAX_V2_I]]
 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
   return vmax_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmax_f32
-// CHECK: vmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmax_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VMAX_V2_I]]
 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
   return vmax_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s8
-// CHECK: vmax.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_s8(
+// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
   return vmaxq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s16
-// CHECK: vmax.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VMAXQ_V2_I]]
 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
   return vmaxq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s32
-// CHECK: vmax.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VMAXQ_V2_I]]
 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
   return vmaxq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u8
-// CHECK: vmax.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_u8(
+// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
   return vmaxq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u16
-// CHECK: vmax.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VMAXQ_V2_I]]
 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
   return vmaxq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u32
-// CHECK: vmax.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VMAXQ_V2_I]]
 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
   return vmaxq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_f32
-// CHECK: vmax.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmaxq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VMAXQ_V2_I]]
 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
   return vmaxq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vmin_s8
-// CHECK: vmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_s8(
+// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_V_I]]
 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
   return vmin_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmin_s16
-// CHECK: vmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VMIN_V2_I]]
 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
   return vmin_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_s32
-// CHECK: vmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VMIN_V2_I]]
 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
   return vmin_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u8
-// CHECK: vmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_u8(
+// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_V_I]]
 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
   return vmin_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u16
-// CHECK: vmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VMIN_V2_I]]
 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
   return vmin_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u32
-// CHECK: vmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VMIN_V2_I]]
 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
   return vmin_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmin_f32
-// CHECK: vmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmin_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VMIN_V2_I]]
 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
   return vmin_f32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s8
-// CHECK: vmin.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_s8(
+// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
   return vminq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s16
-// CHECK: vmin.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VMINQ_V2_I]]
 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
   return vminq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s32
-// CHECK: vmin.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VMINQ_V2_I]]
 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
   return vminq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u8
-// CHECK: vmin.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_u8(
+// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
   return vminq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u16
-// CHECK: vmin.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VMINQ_V2_I]]
 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
   return vminq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u32
-// CHECK: vmin.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VMINQ_V2_I]]
 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
   return vminq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_f32
-// CHECK: vmin.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vminq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VMINQ_V2_I]]
 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
   return vminq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vmla_s8
-// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vmla_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u8
-// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmla_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s8
-// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vmlaq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vmlaq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vmlaq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vmlaq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u8
-// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vmlaq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vmlaq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vmlaq_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmlal_s8
-// CHECK: vmlal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_s8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vmlal_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u8
-// CHECK: vmlal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_u8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmlal_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmlal_lane_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlal_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlal_lane_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlal_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlal_lane_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlal_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlal_lane_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlal_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_lane_u32(a, b, c, 1);
 }
 
-
-// CHECK-LABEL: test_vmlal_n_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vmlal_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vmlal_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   return vmlal_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlal_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   return vmlal_n_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmla_lane_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmla_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmla_lane_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmla_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmla_lane_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmla_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmla_lane_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmla_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmla_lane_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmla_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlaq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlaq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlaq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlaq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlaq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlaq_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlaq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlaq_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlaq_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlaq_lane_f32(a, b, c, 1);
 }
 
-
-// CHECK-LABEL: test_vmla_n_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   return vmla_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmla_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmla_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmla_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmla_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmla_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlaq_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlaq_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlaq_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlaq_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, 
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlaq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlaq_n_f32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmls_s8
-// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vmls_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u8
-// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmls_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s8
-// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vmlsq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vmlsq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vmlsq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vmlsq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u8
-// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vmlsq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vmlsq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vmlsq_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmlsl_s8
-// CHECK: vmlsl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_s8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vmlsl_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u8
-// CHECK: vmlsl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_u8(
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmlsl_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmlsl_lane_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsl_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsl_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsl_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsl_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_lane_u32(a, b, c, 1);
 }
 
-
-// CHECK-LABEL: test_vmlsl_n_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vmlsl_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vmlsl_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   return vmlsl_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsl_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   return vmlsl_n_u32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmls_lane_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmls_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmls_lane_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmls_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmls_lane_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmls_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmls_lane_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmls_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmls_lane_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmls_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlsq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlsq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlsq_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlsq_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmlsq_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlsq_lane_f32(a, b, c, 1);
 }
 
-
-// CHECK-LABEL: test_vmls_n_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   return vmls_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmls_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmls_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmls_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmls_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmls_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlsq_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlsq_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlsq_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlsq_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmlsq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlsq_n_f32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vmovl_s8
-// CHECK: vmovl.s8 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmovl_s8(
+// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
   return vmovl_s8(a);
 }
 
-// CHECK-LABEL: test_vmovl_s16
-// CHECK: vmovl.s16 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmovl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
   return vmovl_s16(a);
 }
 
-// CHECK-LABEL: test_vmovl_s32
-// CHECK: vmovl.s32 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmovl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
   return vmovl_s32(a);
 }
 
-// CHECK-LABEL: test_vmovl_u8
-// CHECK: vmovl.u8 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmovl_u8(
+// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
   return vmovl_u8(a);
 }
 
-// CHECK-LABEL: test_vmovl_u16
-// CHECK: vmovl.u16 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmovl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
   return vmovl_u16(a);
 }
 
-// CHECK-LABEL: test_vmovl_u32
-// CHECK: vmovl.u32 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmovl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
   return vmovl_u32(a);
 }
 
-
-// CHECK-LABEL: test_vmovn_s16
-// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmovn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
   return vmovn_s16(a);
 }
 
-// CHECK-LABEL: test_vmovn_s32
-// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmovn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
   return vmovn_s32(a);
 }
 
-// CHECK-LABEL: test_vmovn_s64
-// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmovn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
   return vmovn_s64(a);
 }
 
-// CHECK-LABEL: test_vmovn_u16
-// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmovn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
   return vmovn_u16(a);
 }
 
-// CHECK-LABEL: test_vmovn_u32
-// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmovn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
   return vmovn_u32(a);
 }
 
-// CHECK-LABEL: test_vmovn_u64
-// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmovn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
   return vmovn_u64(a);
 }
 
-
-// CHECK-LABEL: test_vmov_n_u8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_u8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 uint8x8_t test_vmov_n_u8(uint8_t a) {
   return vmov_n_u8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_u16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 uint16x4_t test_vmov_n_u16(uint16_t a) {
   return vmov_n_u16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_u32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 uint32x2_t test_vmov_n_u32(uint32_t a) {
   return vmov_n_u32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_s8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 int8x8_t test_vmov_n_s8(int8_t a) {
   return vmov_n_s8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 int16x4_t test_vmov_n_s16(int16_t a) {
   return vmov_n_s16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 int32x2_t test_vmov_n_s32(int32_t a) {
   return vmov_n_s32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_p8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_p8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 poly8x8_t test_vmov_n_p8(poly8_t a) {
   return vmov_n_p8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_p16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_p16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 poly16x4_t test_vmov_n_p16(poly16_t a) {
   return vmov_n_p16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: @test_vmov_n_f16(
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT3]]
 float16x4_t test_vmov_n_f16(float16_t *a) {
   return vmov_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vmov_n_f32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmov_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VECINIT1_I]]
 float32x2_t test_vmov_n_f32(float32_t a) {
   return vmov_n_f32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_u8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 uint8x16_t test_vmovq_n_u8(uint8_t a) {
   return vmovq_n_u8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 uint16x8_t test_vmovq_n_u16(uint16_t a) {
   return vmovq_n_u16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 uint32x4_t test_vmovq_n_u32(uint32_t a) {
   return vmovq_n_u32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_s8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 int8x16_t test_vmovq_n_s8(int8_t a) {
   return vmovq_n_s8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 int16x8_t test_vmovq_n_s16(int16_t a) {
   return vmovq_n_s16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 int32x4_t test_vmovq_n_s32(int32_t a) {
   return vmovq_n_s32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_p8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_p8(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 poly8x16_t test_vmovq_n_p8(poly8_t a) {
   return vmovq_n_p8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_p16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_p16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 poly16x8_t test_vmovq_n_p16(poly16_t a) {
   return vmovq_n_p16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: @test_vmovq_n_f16(
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT7]]
 float16x8_t test_vmovq_n_f16(float16_t *a) {
   return vmovq_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
 float32x4_t test_vmovq_n_f32(float32_t a) {
   return vmovq_n_f32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: @test_vmov_n_s64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vmov_n_s64(int64_t a) {
   int64x1_t tmp = vmov_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vmov_n_u64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: @test_vmov_n_u64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vmov_n_u64(uint64_t a) {
   uint64x1_t tmp = vmov_n_u64(a);
   return vadd_u64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vmovq_n_s64
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_s64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 int64x2_t test_vmovq_n_s64(int64_t a) {
   return vmovq_n_s64(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u64
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: @test_vmovq_n_u64(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 uint64x2_t test_vmovq_n_u64(uint64_t a) {
   return vmovq_n_u64(a);
 }
 
-
-// CHECK-LABEL: test_vmul_s8
-// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[MUL_I]]
 int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
   return vmul_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmul_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
   return vmul_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
   return vmul_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
   return vmul_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u8
-// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[MUL_I]]
 uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
   return vmul_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s8
-// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_s8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[MUL_I]]
 int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
   return vmulq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_s16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
   return vmulq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_s32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
   return vmulq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_f32(
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
   return vmulq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u8
-// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_u8(
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[MUL_I]]
 uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
   return vmulq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_u16(
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
   return vmulq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_u32(
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
   return vmulq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vmull_s8
-// CHECK: vmull.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_s8(
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
   return vmull_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmull_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
   return vmull_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
   return vmull_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u8
-// CHECK: vmull.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_u8(
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
   return vmull_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_p8
-// CHECK: vmull.p8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_p8(
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
   return vmull_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vmull_lane_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmull_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vmull_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmull_lane_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmull_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vmull_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmull_lane_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmull_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmull_lane_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmull_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_lane_u32(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vmull_n_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
   return vmull_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
   return vmull_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
   return vmull_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmull_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
   return vmull_n_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vmul_p8
-// CHECK: vmul.p8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_p8(
+// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMUL_V_I]]
 poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
   return vmul_p8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_p8
-// CHECK: vmul.p8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_p8(
+// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
 poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
   return vmulq_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vmul_lane_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmul_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
   return vmul_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_lane_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmul_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
   return vmul_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmul_lane_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmul_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
   return vmul_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmul_lane_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmul_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_lane_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmul_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmulq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
   return vmulq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmulq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
   return vmulq_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmulq_lane_f32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
   return vmulq_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmulq_lane_u16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
   return vmulq_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vmulq_lane_u32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
   return vmulq_lane_u32(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vmul_n_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
   return vmul_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
   return vmul_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
   return vmul_n_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
   return vmul_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmul_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
   return vmul_n_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_n_s16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
   return vmulq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_n_s32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
   return vmulq_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
+// CHECK-LABEL: @test_vmulq_n_f32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
   return vmulq_n_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_n_u16(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
   return vmulq_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmulq_n_u32(
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
   return vmulq_n_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vmvn_s8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
   return vmvn_s8(a);
 }
 
-// CHECK-LABEL: test_vmvn_s16
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
   return vmvn_s16(a);
 }
 
-// CHECK-LABEL: test_vmvn_s32
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
   return vmvn_s32(a);
 }
 
-// CHECK-LABEL: test_vmvn_u8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
   return vmvn_u8(a);
 }
 
-// CHECK-LABEL: test_vmvn_u16
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
   return vmvn_u16(a);
 }
 
-// CHECK-LABEL: test_vmvn_u32
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
   return vmvn_u32(a);
 }
 
-// CHECK-LABEL: test_vmvn_p8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vmvn_p8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
   return vmvn_p8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
   return vmvnq_s8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s16
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
   return vmvnq_s16(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s32
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
   return vmvnq_s32(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
   return vmvnq_u8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u16
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
   return vmvnq_u16(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u32
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
   return vmvnq_u32(a);
 }
 
-// CHECK-LABEL: test_vmvnq_p8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vmvnq_p8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
   return vmvnq_p8(a);
 }
 
-
-// CHECK-LABEL: test_vneg_s8
-// CHECK: vneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vneg_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
   return vneg_s8(a);
 }
 
-// CHECK-LABEL: test_vneg_s16
-// CHECK: vneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vneg_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
   return vneg_s16(a);
 }
 
-// CHECK-LABEL: test_vneg_s32
-// CHECK: vneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vneg_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
   return vneg_s32(a);
 }
 
-// CHECK-LABEL: test_vneg_f32
-// CHECK: vneg.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vneg_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
   return vneg_f32(a);
 }
 
-// CHECK-LABEL: test_vnegq_s8
-// CHECK: vneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vnegq_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
   return vnegq_s8(a);
 }
 
-// CHECK-LABEL: test_vnegq_s16
-// CHECK: vneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vnegq_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
   return vnegq_s16(a);
 }
 
-// CHECK-LABEL: test_vnegq_s32
-// CHECK: vneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vnegq_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
   return vnegq_s32(a);
 }
 
-// CHECK-LABEL: test_vnegq_f32
-// CHECK: vneg.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vnegq_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
   return vnegq_f32(a);
 }
 
-
-// CHECK-LABEL: test_vorn_s8
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
   return vorn_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s16
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
   return vorn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s32
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
   return vorn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s64
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_s64(
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
   return vorn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u8
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
   return vorn_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u16
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
   return vorn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u32
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
   return vorn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u64
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorn_u64(
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
   return vorn_u64(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s8
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_s8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
   return vornq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s16
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_s16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
   return vornq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s32
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_s32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
   return vornq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s64
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_s64(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
   return vornq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u8
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_u8(
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
   return vornq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u16
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_u16(
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
   return vornq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u32
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_u32(
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
   return vornq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u64
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vornq_u64(
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
   return vornq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vorr_s8
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_s8(
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
   return vorr_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s16
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_s16(
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
   return vorr_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s32
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_s32(
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
   return vorr_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s64
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_s64(
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
   return vorr_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u8
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_u8(
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
   return vorr_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u16
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_u16(
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
   return vorr_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u32
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_u32(
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
   return vorr_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u64
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vorr_u64(
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
   return vorr_u64(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s8
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_s8(
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
   return vorrq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s16
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_s16(
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
   return vorrq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s32
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_s32(
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
   return vorrq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s64
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_s64(
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
   return vorrq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u8
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_u8(
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
   return vorrq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u16
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_u16(
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
   return vorrq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u32
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_u32(
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
   return vorrq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u64
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vorrq_u64(
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
   return vorrq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vpadal_s8
-// CHECK: vpadal.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadal_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
+// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
   return vpadal_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_s16
-// CHECK: vpadal.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
+// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
   return vpadal_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_s32
-// CHECK: vpadal.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
+// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
   return vpadal_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u8
-// CHECK: vpadal.u8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadal_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
+// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
   return vpadal_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u16
-// CHECK: vpadal.u16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadal_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
+// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
   return vpadal_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u32
-// CHECK: vpadal.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadal_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
+// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
   return vpadal_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s8
-// CHECK: vpadal.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpadalq_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
   return vpadalq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s16
-// CHECK: vpadal.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpadalq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
+// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
   return vpadalq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s32
-// CHECK: vpadal.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpadalq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
+// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
   return vpadalq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u8
-// CHECK: vpadal.u8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpadalq_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
   return vpadalq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u16
-// CHECK: vpadal.u16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpadalq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
+// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
   return vpadalq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u32
-// CHECK: vpadal.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpadalq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
+// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
   return vpadalq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vpadd_s8
-// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_s8(
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
   return vpadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_s16
-// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPADD_V2_I]]
 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
   return vpadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_s32
-// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPADD_V2_I]]
 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
   return vpadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u8
-// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_u8(
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
   return vpadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u16
-// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPADD_V2_I]]
 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
   return vpadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u32
-// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPADD_V2_I]]
 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
   return vpadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_f32
-// CHECK: vpadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpadd_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VPADD_V2_I]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
   return vpadd_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vpaddl_s8
-// CHECK: vpaddl.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddl_s8(
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 int16x4_t test_vpaddl_s8(int8x8_t a) {
   return vpaddl_s8(a);
 }
 
-// CHECK-LABEL: test_vpaddl_s16
-// CHECK: vpaddl.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 int32x2_t test_vpaddl_s16(int16x4_t a) {
   return vpaddl_s16(a);
 }
 
-// CHECK-LABEL: test_vpaddl_s32
-// CHECK: vpaddl.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 int64x1_t test_vpaddl_s32(int32x2_t a) {
   return vpaddl_s32(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u8
-// CHECK: vpaddl.u8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddl_u8(
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 uint16x4_t test_vpaddl_u8(uint8x8_t a) {
   return vpaddl_u8(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u16
-// CHECK: vpaddl.u16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 uint32x2_t test_vpaddl_u16(uint16x4_t a) {
   return vpaddl_u16(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u32
-// CHECK: vpaddl.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 uint64x1_t test_vpaddl_u32(uint32x2_t a) {
   return vpaddl_u32(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s8
-// CHECK: vpaddl.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddlq_s8(
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
   return vpaddlq_s8(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s16
-// CHECK: vpaddl.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
   return vpaddlq_s16(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s32
-// CHECK: vpaddl.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
   return vpaddlq_s32(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u8
-// CHECK: vpaddl.u8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddlq_u8(
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
   return vpaddlq_u8(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u16
-// CHECK: vpaddl.u16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
   return vpaddlq_u16(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u32
-// CHECK: vpaddl.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vpaddlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
   return vpaddlq_u32(a);
 }
 
-
-// CHECK-LABEL: test_vpmax_s8
-// CHECK: vpmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_s8(
+// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
   return vpmax_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_s16
-// CHECK: vpmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPMAX_V2_I]]
 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
   return vpmax_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_s32
-// CHECK: vpmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPMAX_V2_I]]
 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
   return vpmax_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u8
-// CHECK: vpmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_u8(
+// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
   return vpmax_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u16
-// CHECK: vpmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPMAX_V2_I]]
 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
   return vpmax_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u32
-// CHECK: vpmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPMAX_V2_I]]
 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
   return vpmax_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_f32
-// CHECK: vpmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmax_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VPMAX_V2_I]]
 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
   return vpmax_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vpmin_s8
-// CHECK: vpmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_s8(
+// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
   return vpmin_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_s16
-// CHECK: vpmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPMIN_V2_I]]
 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
   return vpmin_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_s32
-// CHECK: vpmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPMIN_V2_I]]
 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
   return vpmin_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u8
-// CHECK: vpmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_u8(
+// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
   return vpmin_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u16
-// CHECK: vpmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VPMIN_V2_I]]
 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
   return vpmin_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u32
-// CHECK: vpmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VPMIN_V2_I]]
 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
   return vpmin_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_f32
-// CHECK: vpmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vpmin_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VPMIN_V2_I]]
 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
   return vpmin_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqabs_s8
-// CHECK: vqabs.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqabs_s8(
+// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
   return vqabs_s8(a);
 }
 
-// CHECK-LABEL: test_vqabs_s16
-// CHECK: vqabs.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqabs_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQABS_V1_I]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
   return vqabs_s16(a);
 }
 
-// CHECK-LABEL: test_vqabs_s32
-// CHECK: vqabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqabs_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQABS_V1_I]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
   return vqabs_s32(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s8
-// CHECK: vqabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqabsq_s8(
+// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
   return vqabsq_s8(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s16
-// CHECK: vqabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqabsq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQABSQ_V1_I]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
   return vqabsq_s16(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s32
-// CHECK: vqabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqabsq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQABSQ_V1_I]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
   return vqabsq_s32(a);
 }
 
-
-// CHECK-LABEL: test_vqadd_s8
-// CHECK: vqadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_s8(
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s16
-// CHECK: vqadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQADD_V2_I]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s32
-// CHECK: vqadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQADD_V2_I]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s64
-// CHECK: vqadd.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQADD_V2_I]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u8
-// CHECK: vqadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_u8(
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u16
-// CHECK: vqadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQADD_V2_I]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u32
-// CHECK: vqadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQADD_V2_I]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u64
-// CHECK: vqadd.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqadd_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQADD_V2_I]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s8
-// CHECK: vqadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_s8(
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s16
-// CHECK: vqadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQADDQ_V2_I]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s32
-// CHECK: vqadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQADDQ_V2_I]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s64
-// CHECK: vqadd.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQADDQ_V2_I]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u8
-// CHECK: vqadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_u8(
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u16
-// CHECK: vqadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQADDQ_V2_I]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u32
-// CHECK: vqadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQADDQ_V2_I]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u64
-// CHECK: vqadd.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqaddq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQADDQ_V2_I]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vqdmlal_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlal_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlal_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlal_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_s32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vqdmlal_lane_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmlal_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqdmlal_lane_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmlal_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_lane_s32(a, b, c, 1);
 }
 
-
-// CHECK-LABEL: test_vqdmlal_n_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlal_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlal_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlal_n_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlal_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlal_n_s32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vqdmlsl_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlsl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) #4
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlsl_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlsl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) #4
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_s32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vqdmlsl_lane_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmlsl_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqdmlsl_lane_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmlsl_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
 }
 
-
-// CHECK-LABEL: test_vqdmlsl_n_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlsl_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlsl_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlsl_n_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmlsl_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlsl_n_s32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vqdmulh_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulh_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulh_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulh_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulhq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqdmulhq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulhq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqdmulhq_s32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqdmulh_lane_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmulh_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmulh_lane_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmulh_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vqdmulhq_lane_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmulhq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqdmulhq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmulhq_lane_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmulhq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqdmulhq_lane_s32(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vqdmulh_n_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulh_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQDMULH_V5_I]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulh_n_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulh_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQDMULH_V3_I]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_n_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulhq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #4
+// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQDMULHQ_V9_I]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_n_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmulhq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #4
+// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULHQ_V5_I]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqdmull_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmull_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmull_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmull_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_s32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqdmull_lane_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmull_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmull_lane_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqdmull_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_lane_s32(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vqdmull_n_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmull_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQDMULL_V5_I]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   return vqdmull_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmull_n_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqdmull_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQDMULL_V3_I]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   return vqdmull_n_s32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqmovn_s16
-// CHECK: vqmovn.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
   return vqmovn_s16(a);
 }
 
-// CHECK-LABEL: test_vqmovn_s32
-// CHECK: vqmovn.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQMOVN_V1_I]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
   return vqmovn_s32(a);
 }
 
-// CHECK-LABEL: test_vqmovn_s64
-// CHECK: vqmovn.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQMOVN_V1_I]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
   return vqmovn_s64(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u16
-// CHECK: vqmovn.u16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
   return vqmovn_u16(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u32
-// CHECK: vqmovn.u32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQMOVN_V1_I]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
   return vqmovn_u32(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u64
-// CHECK: vqmovn.u64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQMOVN_V1_I]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
   return vqmovn_u64(a);
 }
 
-
-// CHECK-LABEL: test_vqmovun_s16
-// CHECK: vqmovun.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovun_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4
+// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
 uint8x8_t test_vqmovun_s16(int16x8_t a) {
   return vqmovun_s16(a);
 }
 
-// CHECK-LABEL: test_vqmovun_s32
-// CHECK: vqmovun.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovun_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQMOVUN_V1_I]]
 uint16x4_t test_vqmovun_s32(int32x4_t a) {
   return vqmovun_s32(a);
 }
 
-// CHECK-LABEL: test_vqmovun_s64
-// CHECK: vqmovun.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqmovun_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQMOVUN_V1_I]]
 uint32x2_t test_vqmovun_s64(int64x2_t a) {
   return vqmovun_s64(a);
 }
 
-
-// CHECK-LABEL: test_vqneg_s8
-// CHECK: vqneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqneg_s8(
+// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
   return vqneg_s8(a);
 }
 
-// CHECK-LABEL: test_vqneg_s16
-// CHECK: vqneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqneg_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQNEG_V1_I]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
   return vqneg_s16(a);
 }
 
-// CHECK-LABEL: test_vqneg_s32
-// CHECK: vqneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqneg_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQNEG_V1_I]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
   return vqneg_s32(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s8
-// CHECK: vqneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqnegq_s8(
+// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
   return vqnegq_s8(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s16
-// CHECK: vqneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqnegq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQNEGQ_V1_I]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
   return vqnegq_s16(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s32
-// CHECK: vqneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqnegq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQNEGQ_V1_I]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
   return vqnegq_s32(a);
 }
 
-
-// CHECK-LABEL: test_vqrdmulh_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulh_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulh_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulh_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulhq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqrdmulhq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulhq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqrdmulhq_s32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqrdmulh_lane_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqrdmulh_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqrdmulh_lane_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqrdmulh_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_lane_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqrdmulhq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_lane_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqrdmulhq_lane_s32(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vqrdmulh_n_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulh_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #4
+// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRDMULH_V5_I]]
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqrdmulh_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulh_n_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulh_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #4
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRDMULH_V3_I]]
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqrdmulh_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_n_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulhq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #4
+// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRDMULHQ_V9_I]]
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqrdmulhq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_n_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrdmulhq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #4
+// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRDMULHQ_V5_I]]
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqrdmulhq_n_s32(a, b);
 }
 
-
-// CHECK-LABEL: test_vqrshl_s8
-// CHECK: vqrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_s8(
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
   return vqrshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s16
-// CHECK: vqrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRSHL_V2_I]]
 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
   return vqrshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s32
-// CHECK: vqrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRSHL_V2_I]]
 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
   return vqrshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s64
-// CHECK: vqrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQRSHL_V2_I]]
 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
   return vqrshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u8
-// CHECK: vqrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_u8(
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
   return vqrshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u16
-// CHECK: vqrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQRSHL_V2_I]]
 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
   return vqrshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u32
-// CHECK: vqrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQRSHL_V2_I]]
 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
   return vqrshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u64
-// CHECK: vqrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQRSHL_V2_I]]
 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
   return vqrshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s8
-// CHECK: vqrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_s8(
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
   return vqrshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s16
-// CHECK: vqrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRSHLQ_V2_I]]
 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
   return vqrshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s32
-// CHECK: vqrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRSHLQ_V2_I]]
 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
   return vqrshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s64
-// CHECK: vqrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQRSHLQ_V2_I]]
 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
   return vqrshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u8
-// CHECK: vqrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_u8(
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqrshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u16
-// CHECK: vqrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQRSHLQ_V2_I]]
 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqrshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u32
-// CHECK: vqrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQRSHLQ_V2_I]]
 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqrshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u64
-// CHECK: vqrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQRSHLQ_V2_I]]
 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqrshlq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vqrshrn_n_s16
-// CHECK: vqrshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
   return vqrshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_s32
-// CHECK: vqrshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
   return vqrshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_s64
-// CHECK: vqrshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
   return vqrshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u16
-// CHECK: vqrshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
   return vqrshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u32
-// CHECK: vqrshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
   return vqrshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u64
-// CHECK: vqrshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
   return vqrshrn_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vqrshrun_n_s16
-// CHECK: vqrshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrun_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
   return vqrshrun_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrun_n_s32
-// CHECK: vqrshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrun_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
   return vqrshrun_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrun_n_s64
-// CHECK: vqrshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqrshrun_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
   return vqrshrun_n_s64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vqshl_s8
-// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_s8(
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
   return vqshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s16
-// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSHL_V2_I]]
 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
   return vqshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s32
-// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSHL_V2_I]]
 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
   return vqshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s64
-// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSHL_V2_I]]
 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
   return vqshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u8
-// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_u8(
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
   return vqshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u16
-// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSHL_V2_I]]
 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
   return vqshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u32
-// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSHL_V2_I]]
 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
   return vqshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u64
-// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSHL_V2_I]]
 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
   return vqshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s8
-// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_s8(
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
   return vqshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s16
-// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSHLQ_V2_I]]
 int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
   return vqshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s32
-// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSHLQ_V2_I]]
 int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
   return vqshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s64
-// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSHLQ_V2_I]]
 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
   return vqshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u8
-// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_u8(
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u16
-// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSHLQ_V2_I]]
 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u32
-// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSHLQ_V2_I]]
 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u64
-// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSHLQ_V2_I]]
 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqshlq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vqshlu_n_s8
-// CHECK: vqshlu.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlu_n_s8(
+// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHLU_N]]
 uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
   return vqshlu_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s16
-// CHECK: vqshlu.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlu_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
 uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
   return vqshlu_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s32
-// CHECK: vqshlu.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlu_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
 uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
   return vqshlu_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: vqshlu.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlu_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHLU_N1]]
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
   return vqshlu_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s8
-// CHECK: vqshlu.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshluq_n_s8(
+// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHLU_N]]
 uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
   return vqshluq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s16
-// CHECK: vqshlu.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshluq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
 uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
   return vqshluq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s32
-// CHECK: vqshlu.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshluq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
 uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
   return vqshluq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s64
-// CHECK: vqshlu.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshluq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
 uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
   return vqshluq_n_s64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vqshl_n_s8
-// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_s8(
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 int8x8_t test_vqshl_n_s8(int8x8_t a) {
   return vqshl_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s16
-// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 int16x4_t test_vqshl_n_s16(int16x4_t a) {
   return vqshl_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s32
-// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 int32x2_t test_vqshl_n_s32(int32x2_t a) {
   return vqshl_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 int64x1_t test_vqshl_n_s64(int64x1_t a) {
   return vqshl_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u8
-// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_u8(
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
   return vqshl_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u16
-// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
   return vqshl_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u32
-// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
   return vqshl_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshl_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
   return vqshl_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s8
-// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_s8(
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
   return vqshlq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s16
-// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
   return vqshlq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s32
-// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
   return vqshlq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s64
-// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
   return vqshlq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u8
-// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_u8(
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
   return vqshlq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u16
-// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
   return vqshlq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u32
-// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
   return vqshlq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u64
-// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshlq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
   return vqshlq_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vqshrn_n_s16
-// CHECK: vqshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
   return vqshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_s32
-// CHECK: vqshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
   return vqshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_s64
-// CHECK: vqshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
   return vqshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u16
-// CHECK: vqshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
   return vqshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u32
-// CHECK: vqshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
   return vqshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u64
-// CHECK: vqshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
   return vqshrn_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vqshrun_n_s16
-// CHECK: vqshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrun_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
 uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
   return vqshrun_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrun_n_s32
-// CHECK: vqshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrun_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
 uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
   return vqshrun_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrun_n_s64
-// CHECK: vqshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vqshrun_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
 uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
   return vqshrun_n_s64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vqsub_s8
-// CHECK: vqsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_s8(
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
   return vqsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s16
-// CHECK: vqsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSUB_V2_I]]
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
   return vqsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s32
-// CHECK: vqsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSUB_V2_I]]
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
   return vqsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s64
-// CHECK: vqsub.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSUB_V2_I]]
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
   return vqsub_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u8
-// CHECK: vqsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_u8(
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
   return vqsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u16
-// CHECK: vqsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VQSUB_V2_I]]
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
   return vqsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u32
-// CHECK: vqsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VQSUB_V2_I]]
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
   return vqsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u64
-// CHECK: vqsub.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vqsub_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VQSUB_V2_I]]
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
   return vqsub_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s8
-// CHECK: vqsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_s8(
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
   return vqsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s16
-// CHECK: vqsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSUBQ_V2_I]]
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
   return vqsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s32
-// CHECK: vqsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSUBQ_V2_I]]
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
   return vqsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s64
-// CHECK: vqsub.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSUBQ_V2_I]]
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
   return vqsubq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u8
-// CHECK: vqsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_u8(
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vqsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u16
-// CHECK: vqsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VQSUBQ_V2_I]]
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vqsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u32
-// CHECK: vqsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VQSUBQ_V2_I]]
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vqsubq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u64
-// CHECK: vqsub.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vqsubq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VQSUBQ_V2_I]]
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
   return vqsubq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vraddhn_s16
-// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vraddhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
   return vraddhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_s32
-// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vraddhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRADDHN_V2_I]]
 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
   return vraddhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_s64
-// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vraddhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRADDHN_V2_I]]
 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
   return vraddhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u16
-// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vraddhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
   return vraddhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u32
-// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vraddhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRADDHN_V2_I]]
 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
   return vraddhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u64
-// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vraddhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRADDHN_V2_I]]
 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
   return vraddhn_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vrecpe_f32
-// CHECK: vrecpe.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrecpe_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4
+// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
 float32x2_t test_vrecpe_f32(float32x2_t a) {
   return vrecpe_f32(a);
 }
 
-// CHECK-LABEL: test_vrecpe_u32
-// CHECK: vrecpe.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrecpe_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4
+// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
   return vrecpe_u32(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_f32
-// CHECK: vrecpe.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrecpeq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4
+// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
   return vrecpeq_f32(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_u32
-// CHECK: vrecpe.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrecpeq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4
+// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
   return vrecpeq_u32(a);
 }
 
-
-// CHECK-LABEL: test_vrecps_f32
-// CHECK: vrecps.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrecps_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VRECPS_V2_I]]
 float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
   return vrecps_f32(a, b);
 }
 
-// CHECK-LABEL: test_vrecpsq_f32
-// CHECK: vrecps.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrecpsq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VRECPSQ_V2_I]]
 float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
   return vrecpsq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vreinterpret_s8_s16
+// CHECK-LABEL: @test_vreinterpret_s8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   return vreinterpret_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s32
+// CHECK-LABEL: @test_vreinterpret_s8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   return vreinterpret_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s64
+// CHECK-LABEL: @test_vreinterpret_s8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   return vreinterpret_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u8
+// CHECK-LABEL: @test_vreinterpret_s8_u8(
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   return vreinterpret_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u16
+// CHECK-LABEL: @test_vreinterpret_s8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   return vreinterpret_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u32
+// CHECK-LABEL: @test_vreinterpret_s8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   return vreinterpret_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u64
+// CHECK-LABEL: @test_vreinterpret_s8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   return vreinterpret_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f16
+// CHECK-LABEL: @test_vreinterpret_s8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   return vreinterpret_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f32
+// CHECK-LABEL: @test_vreinterpret_s8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   return vreinterpret_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p8
+// CHECK-LABEL: @test_vreinterpret_s8_p8(
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   return vreinterpret_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p16
+// CHECK-LABEL: @test_vreinterpret_s8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   return vreinterpret_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s8
+// CHECK-LABEL: @test_vreinterpret_s16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
   return vreinterpret_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s32
+// CHECK-LABEL: @test_vreinterpret_s16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
   return vreinterpret_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s64
+// CHECK-LABEL: @test_vreinterpret_s16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
   return vreinterpret_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u8
+// CHECK-LABEL: @test_vreinterpret_s16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
   return vreinterpret_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u16
+// CHECK-LABEL: @test_vreinterpret_s16_u16(
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
   return vreinterpret_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u32
+// CHECK-LABEL: @test_vreinterpret_s16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
   return vreinterpret_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u64
+// CHECK-LABEL: @test_vreinterpret_s16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
   return vreinterpret_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f16
+// CHECK-LABEL: @test_vreinterpret_s16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
   return vreinterpret_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f32
+// CHECK-LABEL: @test_vreinterpret_s16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
   return vreinterpret_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p8
+// CHECK-LABEL: @test_vreinterpret_s16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
   return vreinterpret_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p16
+// CHECK-LABEL: @test_vreinterpret_s16_p16(
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
   return vreinterpret_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s8
+// CHECK-LABEL: @test_vreinterpret_s32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
   return vreinterpret_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s16
+// CHECK-LABEL: @test_vreinterpret_s32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
   return vreinterpret_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s64
+// CHECK-LABEL: @test_vreinterpret_s32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
   return vreinterpret_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u8
+// CHECK-LABEL: @test_vreinterpret_s32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
   return vreinterpret_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u16
+// CHECK-LABEL: @test_vreinterpret_s32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
   return vreinterpret_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u32
+// CHECK-LABEL: @test_vreinterpret_s32_u32(
+// CHECK:   ret <2 x i32> %a
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
   return vreinterpret_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u64
+// CHECK-LABEL: @test_vreinterpret_s32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
   return vreinterpret_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f16
+// CHECK-LABEL: @test_vreinterpret_s32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
   return vreinterpret_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f32
+// CHECK-LABEL: @test_vreinterpret_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
   return vreinterpret_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p8
+// CHECK-LABEL: @test_vreinterpret_s32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
   return vreinterpret_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p16
+// CHECK-LABEL: @test_vreinterpret_s32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
   return vreinterpret_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s8
+// CHECK-LABEL: @test_vreinterpret_s64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
   return vreinterpret_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s16
+// CHECK-LABEL: @test_vreinterpret_s64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
   return vreinterpret_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s32
+// CHECK-LABEL: @test_vreinterpret_s64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
   return vreinterpret_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u8
+// CHECK-LABEL: @test_vreinterpret_s64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
   return vreinterpret_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u16
+// CHECK-LABEL: @test_vreinterpret_s64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
   return vreinterpret_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u32
+// CHECK-LABEL: @test_vreinterpret_s64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
   return vreinterpret_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u64
+// CHECK-LABEL: @test_vreinterpret_s64_u64(
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
   return vreinterpret_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f16
+// CHECK-LABEL: @test_vreinterpret_s64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
   return vreinterpret_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f32
+// CHECK-LABEL: @test_vreinterpret_s64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
   return vreinterpret_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p8
+// CHECK-LABEL: @test_vreinterpret_s64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
   return vreinterpret_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p16
+// CHECK-LABEL: @test_vreinterpret_s64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
   return vreinterpret_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s8
+// CHECK-LABEL: @test_vreinterpret_u8_s8(
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
   return vreinterpret_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s16
+// CHECK-LABEL: @test_vreinterpret_u8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
   return vreinterpret_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s32
+// CHECK-LABEL: @test_vreinterpret_u8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
   return vreinterpret_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s64
+// CHECK-LABEL: @test_vreinterpret_u8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
   return vreinterpret_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u16
+// CHECK-LABEL: @test_vreinterpret_u8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
   return vreinterpret_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u32
+// CHECK-LABEL: @test_vreinterpret_u8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
   return vreinterpret_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u64
+// CHECK-LABEL: @test_vreinterpret_u8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
   return vreinterpret_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f16
+// CHECK-LABEL: @test_vreinterpret_u8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
   return vreinterpret_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f32
+// CHECK-LABEL: @test_vreinterpret_u8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
   return vreinterpret_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p8
+// CHECK-LABEL: @test_vreinterpret_u8_p8(
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
   return vreinterpret_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p16
+// CHECK-LABEL: @test_vreinterpret_u8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
   return vreinterpret_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s8
+// CHECK-LABEL: @test_vreinterpret_u16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
   return vreinterpret_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s16
+// CHECK-LABEL: @test_vreinterpret_u16_s16(
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
   return vreinterpret_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s32
+// CHECK-LABEL: @test_vreinterpret_u16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
   return vreinterpret_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s64
+// CHECK-LABEL: @test_vreinterpret_u16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
   return vreinterpret_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u8
+// CHECK-LABEL: @test_vreinterpret_u16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
   return vreinterpret_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u32
+// CHECK-LABEL: @test_vreinterpret_u16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
   return vreinterpret_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u64
+// CHECK-LABEL: @test_vreinterpret_u16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
   return vreinterpret_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f16
+// CHECK-LABEL: @test_vreinterpret_u16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
   return vreinterpret_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f32
+// CHECK-LABEL: @test_vreinterpret_u16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
   return vreinterpret_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p8
+// CHECK-LABEL: @test_vreinterpret_u16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
   return vreinterpret_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p16
+// CHECK-LABEL: @test_vreinterpret_u16_p16(
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
   return vreinterpret_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s8
+// CHECK-LABEL: @test_vreinterpret_u32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
   return vreinterpret_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s16
+// CHECK-LABEL: @test_vreinterpret_u32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
   return vreinterpret_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s32
+// CHECK-LABEL: @test_vreinterpret_u32_s32(
+// CHECK:   ret <2 x i32> %a
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
   return vreinterpret_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s64
+// CHECK-LABEL: @test_vreinterpret_u32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
   return vreinterpret_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u8
+// CHECK-LABEL: @test_vreinterpret_u32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
   return vreinterpret_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u16
+// CHECK-LABEL: @test_vreinterpret_u32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
   return vreinterpret_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u64
+// CHECK-LABEL: @test_vreinterpret_u32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
   return vreinterpret_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f16
+// CHECK-LABEL: @test_vreinterpret_u32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
   return vreinterpret_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f32
+// CHECK-LABEL: @test_vreinterpret_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
   return vreinterpret_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p8
+// CHECK-LABEL: @test_vreinterpret_u32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
   return vreinterpret_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p16
+// CHECK-LABEL: @test_vreinterpret_u32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
   return vreinterpret_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s8
+// CHECK-LABEL: @test_vreinterpret_u64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
   return vreinterpret_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s16
+// CHECK-LABEL: @test_vreinterpret_u64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
   return vreinterpret_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s32
+// CHECK-LABEL: @test_vreinterpret_u64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
   return vreinterpret_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s64
+// CHECK-LABEL: @test_vreinterpret_u64_s64(
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
   return vreinterpret_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u8
+// CHECK-LABEL: @test_vreinterpret_u64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
   return vreinterpret_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u16
+// CHECK-LABEL: @test_vreinterpret_u64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
   return vreinterpret_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u32
+// CHECK-LABEL: @test_vreinterpret_u64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
   return vreinterpret_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f16
+// CHECK-LABEL: @test_vreinterpret_u64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
   return vreinterpret_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f32
+// CHECK-LABEL: @test_vreinterpret_u64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
   return vreinterpret_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p8
+// CHECK-LABEL: @test_vreinterpret_u64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
   return vreinterpret_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p16
+// CHECK-LABEL: @test_vreinterpret_u64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
   return vreinterpret_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s8
+// CHECK-LABEL: @test_vreinterpret_f16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
   return vreinterpret_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s16
+// CHECK-LABEL: @test_vreinterpret_f16_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
   return vreinterpret_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s32
+// CHECK-LABEL: @test_vreinterpret_f16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
   return vreinterpret_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s64
+// CHECK-LABEL: @test_vreinterpret_f16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
   return vreinterpret_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u8
+// CHECK-LABEL: @test_vreinterpret_f16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
   return vreinterpret_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u16
+// CHECK-LABEL: @test_vreinterpret_f16_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
   return vreinterpret_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u32
+// CHECK-LABEL: @test_vreinterpret_f16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
   return vreinterpret_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u64
+// CHECK-LABEL: @test_vreinterpret_f16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
   return vreinterpret_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f32
+// CHECK-LABEL: @test_vreinterpret_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
   return vreinterpret_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p8
+// CHECK-LABEL: @test_vreinterpret_f16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
   return vreinterpret_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p16
+// CHECK-LABEL: @test_vreinterpret_f16_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
   return vreinterpret_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s8
+// CHECK-LABEL: @test_vreinterpret_f32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
   return vreinterpret_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s16
+// CHECK-LABEL: @test_vreinterpret_f32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
   return vreinterpret_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s32
+// CHECK-LABEL: @test_vreinterpret_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
   return vreinterpret_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s64
+// CHECK-LABEL: @test_vreinterpret_f32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
   return vreinterpret_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u8
+// CHECK-LABEL: @test_vreinterpret_f32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
   return vreinterpret_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u16
+// CHECK-LABEL: @test_vreinterpret_f32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
   return vreinterpret_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u32
+// CHECK-LABEL: @test_vreinterpret_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
   return vreinterpret_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u64
+// CHECK-LABEL: @test_vreinterpret_f32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
   return vreinterpret_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f16
+// CHECK-LABEL: @test_vreinterpret_f32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
   return vreinterpret_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p8
+// CHECK-LABEL: @test_vreinterpret_f32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
   return vreinterpret_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p16
+// CHECK-LABEL: @test_vreinterpret_f32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
   return vreinterpret_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s8
+// CHECK-LABEL: @test_vreinterpret_p8_s8(
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
   return vreinterpret_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s16
+// CHECK-LABEL: @test_vreinterpret_p8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
   return vreinterpret_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s32
+// CHECK-LABEL: @test_vreinterpret_p8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
   return vreinterpret_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s64
+// CHECK-LABEL: @test_vreinterpret_p8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
   return vreinterpret_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u8
+// CHECK-LABEL: @test_vreinterpret_p8_u8(
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
   return vreinterpret_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u16
+// CHECK-LABEL: @test_vreinterpret_p8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
   return vreinterpret_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u32
+// CHECK-LABEL: @test_vreinterpret_p8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
   return vreinterpret_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u64
+// CHECK-LABEL: @test_vreinterpret_p8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
   return vreinterpret_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f16
+// CHECK-LABEL: @test_vreinterpret_p8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
   return vreinterpret_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f32
+// CHECK-LABEL: @test_vreinterpret_p8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
   return vreinterpret_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p16
+// CHECK-LABEL: @test_vreinterpret_p8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
   return vreinterpret_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s8
+// CHECK-LABEL: @test_vreinterpret_p16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
   return vreinterpret_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s16
+// CHECK-LABEL: @test_vreinterpret_p16_s16(
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
   return vreinterpret_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s32
+// CHECK-LABEL: @test_vreinterpret_p16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
   return vreinterpret_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s64
+// CHECK-LABEL: @test_vreinterpret_p16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
   return vreinterpret_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u8
+// CHECK-LABEL: @test_vreinterpret_p16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
   return vreinterpret_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u16
+// CHECK-LABEL: @test_vreinterpret_p16_u16(
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
   return vreinterpret_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u32
+// CHECK-LABEL: @test_vreinterpret_p16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
   return vreinterpret_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u64
+// CHECK-LABEL: @test_vreinterpret_p16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
   return vreinterpret_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f16
+// CHECK-LABEL: @test_vreinterpret_p16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
   return vreinterpret_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f32
+// CHECK-LABEL: @test_vreinterpret_p16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
   return vreinterpret_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p8
+// CHECK-LABEL: @test_vreinterpret_p16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
   return vreinterpret_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s16
+// CHECK-LABEL: @test_vreinterpretq_s8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
   return vreinterpretq_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s32
+// CHECK-LABEL: @test_vreinterpretq_s8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
   return vreinterpretq_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s64
+// CHECK-LABEL: @test_vreinterpretq_s8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
   return vreinterpretq_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u8
+// CHECK-LABEL: @test_vreinterpretq_s8_u8(
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
   return vreinterpretq_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u16
+// CHECK-LABEL: @test_vreinterpretq_s8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
   return vreinterpretq_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u32
+// CHECK-LABEL: @test_vreinterpretq_s8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
   return vreinterpretq_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u64
+// CHECK-LABEL: @test_vreinterpretq_s8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
   return vreinterpretq_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f16
+// CHECK-LABEL: @test_vreinterpretq_s8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
   return vreinterpretq_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f32
+// CHECK-LABEL: @test_vreinterpretq_s8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
   return vreinterpretq_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p8
+// CHECK-LABEL: @test_vreinterpretq_s8_p8(
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
   return vreinterpretq_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p16
+// CHECK-LABEL: @test_vreinterpretq_s8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
   return vreinterpretq_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s8
+// CHECK-LABEL: @test_vreinterpretq_s16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
   return vreinterpretq_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s32
+// CHECK-LABEL: @test_vreinterpretq_s16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
   return vreinterpretq_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s64
+// CHECK-LABEL: @test_vreinterpretq_s16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
   return vreinterpretq_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u8
+// CHECK-LABEL: @test_vreinterpretq_s16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
   return vreinterpretq_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u16
+// CHECK-LABEL: @test_vreinterpretq_s16_u16(
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
   return vreinterpretq_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u32
+// CHECK-LABEL: @test_vreinterpretq_s16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
   return vreinterpretq_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u64
+// CHECK-LABEL: @test_vreinterpretq_s16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
   return vreinterpretq_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f16
+// CHECK-LABEL: @test_vreinterpretq_s16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
   return vreinterpretq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f32
+// CHECK-LABEL: @test_vreinterpretq_s16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
   return vreinterpretq_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p8
+// CHECK-LABEL: @test_vreinterpretq_s16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
   return vreinterpretq_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p16
+// CHECK-LABEL: @test_vreinterpretq_s16_p16(
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
   return vreinterpretq_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s8
+// CHECK-LABEL: @test_vreinterpretq_s32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
   return vreinterpretq_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s16
+// CHECK-LABEL: @test_vreinterpretq_s32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
   return vreinterpretq_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s64
+// CHECK-LABEL: @test_vreinterpretq_s32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
   return vreinterpretq_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u8
+// CHECK-LABEL: @test_vreinterpretq_s32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
   return vreinterpretq_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u16
+// CHECK-LABEL: @test_vreinterpretq_s32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
   return vreinterpretq_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u32
+// CHECK-LABEL: @test_vreinterpretq_s32_u32(
+// CHECK:   ret <4 x i32> %a
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
   return vreinterpretq_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u64
+// CHECK-LABEL: @test_vreinterpretq_s32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
   return vreinterpretq_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f16
+// CHECK-LABEL: @test_vreinterpretq_s32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
   return vreinterpretq_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f32
+// CHECK-LABEL: @test_vreinterpretq_s32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
   return vreinterpretq_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p8
+// CHECK-LABEL: @test_vreinterpretq_s32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
   return vreinterpretq_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p16
+// CHECK-LABEL: @test_vreinterpretq_s32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
   return vreinterpretq_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s8
+// CHECK-LABEL: @test_vreinterpretq_s64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
   return vreinterpretq_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s16
+// CHECK-LABEL: @test_vreinterpretq_s64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
   return vreinterpretq_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s32
+// CHECK-LABEL: @test_vreinterpretq_s64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
   return vreinterpretq_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u8
+// CHECK-LABEL: @test_vreinterpretq_s64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
   return vreinterpretq_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u16
+// CHECK-LABEL: @test_vreinterpretq_s64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
   return vreinterpretq_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u32
+// CHECK-LABEL: @test_vreinterpretq_s64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
   return vreinterpretq_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u64
+// CHECK-LABEL: @test_vreinterpretq_s64_u64(
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
   return vreinterpretq_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f16
+// CHECK-LABEL: @test_vreinterpretq_s64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
   return vreinterpretq_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f32
+// CHECK-LABEL: @test_vreinterpretq_s64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
   return vreinterpretq_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p8
+// CHECK-LABEL: @test_vreinterpretq_s64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
   return vreinterpretq_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p16
+// CHECK-LABEL: @test_vreinterpretq_s64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
   return vreinterpretq_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s8
+// CHECK-LABEL: @test_vreinterpretq_u8_s8(
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   return vreinterpretq_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s16
+// CHECK-LABEL: @test_vreinterpretq_u8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   return vreinterpretq_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s32
+// CHECK-LABEL: @test_vreinterpretq_u8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   return vreinterpretq_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s64
+// CHECK-LABEL: @test_vreinterpretq_u8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   return vreinterpretq_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u16
+// CHECK-LABEL: @test_vreinterpretq_u8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   return vreinterpretq_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u32
+// CHECK-LABEL: @test_vreinterpretq_u8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   return vreinterpretq_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u64
+// CHECK-LABEL: @test_vreinterpretq_u8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   return vreinterpretq_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f16
+// CHECK-LABEL: @test_vreinterpretq_u8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   return vreinterpretq_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f32
+// CHECK-LABEL: @test_vreinterpretq_u8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   return vreinterpretq_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p8
+// CHECK-LABEL: @test_vreinterpretq_u8_p8(
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   return vreinterpretq_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p16
+// CHECK-LABEL: @test_vreinterpretq_u8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   return vreinterpretq_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s8
+// CHECK-LABEL: @test_vreinterpretq_u16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   return vreinterpretq_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s16
+// CHECK-LABEL: @test_vreinterpretq_u16_s16(
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   return vreinterpretq_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s32
+// CHECK-LABEL: @test_vreinterpretq_u16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   return vreinterpretq_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s64
+// CHECK-LABEL: @test_vreinterpretq_u16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   return vreinterpretq_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u8
+// CHECK-LABEL: @test_vreinterpretq_u16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   return vreinterpretq_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u32
+// CHECK-LABEL: @test_vreinterpretq_u16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   return vreinterpretq_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u64
+// CHECK-LABEL: @test_vreinterpretq_u16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   return vreinterpretq_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f16
+// CHECK-LABEL: @test_vreinterpretq_u16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   return vreinterpretq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f32
+// CHECK-LABEL: @test_vreinterpretq_u16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   return vreinterpretq_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p8
+// CHECK-LABEL: @test_vreinterpretq_u16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   return vreinterpretq_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p16
+// CHECK-LABEL: @test_vreinterpretq_u16_p16(
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   return vreinterpretq_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s8
+// CHECK-LABEL: @test_vreinterpretq_u32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   return vreinterpretq_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s16
+// CHECK-LABEL: @test_vreinterpretq_u32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   return vreinterpretq_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s32
+// CHECK-LABEL: @test_vreinterpretq_u32_s32(
+// CHECK:   ret <4 x i32> %a
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   return vreinterpretq_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s64
+// CHECK-LABEL: @test_vreinterpretq_u32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   return vreinterpretq_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u8
+// CHECK-LABEL: @test_vreinterpretq_u32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   return vreinterpretq_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u16
+// CHECK-LABEL: @test_vreinterpretq_u32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   return vreinterpretq_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u64
+// CHECK-LABEL: @test_vreinterpretq_u32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   return vreinterpretq_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f16
+// CHECK-LABEL: @test_vreinterpretq_u32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   return vreinterpretq_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f32
+// CHECK-LABEL: @test_vreinterpretq_u32_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   return vreinterpretq_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p8
+// CHECK-LABEL: @test_vreinterpretq_u32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   return vreinterpretq_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p16
+// CHECK-LABEL: @test_vreinterpretq_u32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   return vreinterpretq_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s8
+// CHECK-LABEL: @test_vreinterpretq_u64_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   return vreinterpretq_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s16
+// CHECK-LABEL: @test_vreinterpretq_u64_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   return vreinterpretq_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s32
+// CHECK-LABEL: @test_vreinterpretq_u64_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   return vreinterpretq_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s64
+// CHECK-LABEL: @test_vreinterpretq_u64_s64(
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   return vreinterpretq_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u8
+// CHECK-LABEL: @test_vreinterpretq_u64_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   return vreinterpretq_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u16
+// CHECK-LABEL: @test_vreinterpretq_u64_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   return vreinterpretq_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u32
+// CHECK-LABEL: @test_vreinterpretq_u64_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   return vreinterpretq_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f16
+// CHECK-LABEL: @test_vreinterpretq_u64_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   return vreinterpretq_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f32
+// CHECK-LABEL: @test_vreinterpretq_u64_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   return vreinterpretq_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p8
+// CHECK-LABEL: @test_vreinterpretq_u64_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   return vreinterpretq_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p16
+// CHECK-LABEL: @test_vreinterpretq_u64_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   return vreinterpretq_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s8
+// CHECK-LABEL: @test_vreinterpretq_f16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   return vreinterpretq_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s16
+// CHECK-LABEL: @test_vreinterpretq_f16_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   return vreinterpretq_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s32
+// CHECK-LABEL: @test_vreinterpretq_f16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   return vreinterpretq_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s64
+// CHECK-LABEL: @test_vreinterpretq_f16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   return vreinterpretq_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u8
+// CHECK-LABEL: @test_vreinterpretq_f16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   return vreinterpretq_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u16
+// CHECK-LABEL: @test_vreinterpretq_f16_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   return vreinterpretq_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u32
+// CHECK-LABEL: @test_vreinterpretq_f16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   return vreinterpretq_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u64
+// CHECK-LABEL: @test_vreinterpretq_f16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   return vreinterpretq_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f32
+// CHECK-LABEL: @test_vreinterpretq_f16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   return vreinterpretq_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p8
+// CHECK-LABEL: @test_vreinterpretq_f16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   return vreinterpretq_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p16
+// CHECK-LABEL: @test_vreinterpretq_f16_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   return vreinterpretq_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s8
+// CHECK-LABEL: @test_vreinterpretq_f32_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   return vreinterpretq_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s16
+// CHECK-LABEL: @test_vreinterpretq_f32_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   return vreinterpretq_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s32
+// CHECK-LABEL: @test_vreinterpretq_f32_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   return vreinterpretq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s64
+// CHECK-LABEL: @test_vreinterpretq_f32_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   return vreinterpretq_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u8
+// CHECK-LABEL: @test_vreinterpretq_f32_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   return vreinterpretq_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u16
+// CHECK-LABEL: @test_vreinterpretq_f32_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   return vreinterpretq_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u32
+// CHECK-LABEL: @test_vreinterpretq_f32_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   return vreinterpretq_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u64
+// CHECK-LABEL: @test_vreinterpretq_f32_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   return vreinterpretq_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f16
+// CHECK-LABEL: @test_vreinterpretq_f32_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   return vreinterpretq_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p8
+// CHECK-LABEL: @test_vreinterpretq_f32_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   return vreinterpretq_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p16
+// CHECK-LABEL: @test_vreinterpretq_f32_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   return vreinterpretq_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s8
+// CHECK-LABEL: @test_vreinterpretq_p8_s8(
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   return vreinterpretq_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s16
+// CHECK-LABEL: @test_vreinterpretq_p8_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   return vreinterpretq_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s32
+// CHECK-LABEL: @test_vreinterpretq_p8_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   return vreinterpretq_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s64
+// CHECK-LABEL: @test_vreinterpretq_p8_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   return vreinterpretq_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u8
+// CHECK-LABEL: @test_vreinterpretq_p8_u8(
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   return vreinterpretq_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u16
+// CHECK-LABEL: @test_vreinterpretq_p8_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   return vreinterpretq_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u32
+// CHECK-LABEL: @test_vreinterpretq_p8_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   return vreinterpretq_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u64
+// CHECK-LABEL: @test_vreinterpretq_p8_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   return vreinterpretq_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f16
+// CHECK-LABEL: @test_vreinterpretq_p8_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   return vreinterpretq_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f32
+// CHECK-LABEL: @test_vreinterpretq_p8_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   return vreinterpretq_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p16
+// CHECK-LABEL: @test_vreinterpretq_p8_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   return vreinterpretq_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s8
+// CHECK-LABEL: @test_vreinterpretq_p16_s8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   return vreinterpretq_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s16
+// CHECK-LABEL: @test_vreinterpretq_p16_s16(
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   return vreinterpretq_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s32
+// CHECK-LABEL: @test_vreinterpretq_p16_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   return vreinterpretq_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s64
+// CHECK-LABEL: @test_vreinterpretq_p16_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   return vreinterpretq_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u8
+// CHECK-LABEL: @test_vreinterpretq_p16_u8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   return vreinterpretq_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u16
+// CHECK-LABEL: @test_vreinterpretq_p16_u16(
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   return vreinterpretq_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u32
+// CHECK-LABEL: @test_vreinterpretq_p16_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   return vreinterpretq_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u64
+// CHECK-LABEL: @test_vreinterpretq_p16_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   return vreinterpretq_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f16
+// CHECK-LABEL: @test_vreinterpretq_p16_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   return vreinterpretq_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f32
+// CHECK-LABEL: @test_vreinterpretq_p16_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   return vreinterpretq_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p8
+// CHECK-LABEL: @test_vreinterpretq_p16_p8(
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   return vreinterpretq_p16_p8(a);
 }
 
-
-// CHECK-LABEL: test_vrev16_s8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev16_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev16_s8(int8x8_t a) {
   return vrev16_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16_u8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev16_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev16_u8(uint8x8_t a) {
   return vrev16_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16_p8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev16_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev16_p8(poly8x8_t a) {
   return vrev16_p8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_s8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev16q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev16q_s8(int8x16_t a) {
   return vrev16q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_u8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev16q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev16q_u8(uint8x16_t a) {
   return vrev16q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_p8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev16q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev16q_p8(poly8x16_t a) {
   return vrev16q_p8(a);
 }
 
-
-// CHECK-LABEL: test_vrev32_s8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev32_s8(int8x8_t a) {
   return vrev32_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev32_s16(int16x4_t a) {
   return vrev32_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32_u8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev32_u8(uint8x8_t a) {
   return vrev32_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32_u16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev32_u16(uint16x4_t a) {
   return vrev32_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32_p8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev32_p8(poly8x8_t a) {
   return vrev32_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_p16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev32_p16(poly16x4_t a) {
   return vrev32_p16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev32q_s8(int8x16_t a) {
   return vrev32q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev32q_s16(int16x8_t a) {
   return vrev32q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev32q_u8(uint8x16_t a) {
   return vrev32q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev32q_u16(uint16x8_t a) {
   return vrev32q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev32q_p8(poly8x16_t a) {
   return vrev32q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev32q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev32q_p16(poly16x8_t a) {
   return vrev32q_p16(a);
 }
 
-
-// CHECK-LABEL: test_vrev64_s8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev64_s8(int8x8_t a) {
   return vrev64_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64_s16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev64_s16(int16x4_t a) {
   return vrev64_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vrev64_s32(int32x2_t a) {
   return vrev64_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64_u8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev64_u8(uint8x8_t a) {
   return vrev64_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64_u16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev64_u16(uint16x4_t a) {
   return vrev64_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64_u32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vrev64_u32(uint32x2_t a) {
   return vrev64_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64_p8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev64_p8(poly8x8_t a) {
   return vrev64_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64_p16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev64_p16(poly16x4_t a) {
   return vrev64_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_f32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vrev64_f32(float32x2_t a) {
   return vrev64_f32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_s8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev64q_s8(int8x16_t a) {
   return vrev64q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_s16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev64q_s16(int16x8_t a) {
   return vrev64q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_s32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrev64q_s32(int32x4_t a) {
   return vrev64q_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_u8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev64q_u8(uint8x16_t a) {
   return vrev64q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_u16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev64q_u16(uint16x8_t a) {
   return vrev64q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_u32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrev64q_u32(uint32x4_t a) {
   return vrev64q_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_p8(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev64q_p8(poly8x16_t a) {
   return vrev64q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_p16(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev64q_p16(poly16x8_t a) {
   return vrev64q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_f32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrev64q_f32(
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vrev64q_f32(float32x4_t a) {
   return vrev64q_f32(a);
 }
 
-
-// CHECK-LABEL: test_vrhadd_s8
-// CHECK: vrhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrhadd_s8(
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
   return vrhadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_s16
-// CHECK: vrhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrhadd_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRHADD_V2_I]]
 int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
   return vrhadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_s32
-// CHECK: vrhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrhadd_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRHADD_V2_I]]
 int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
   return vrhadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u8
-// CHECK: vrhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrhadd_u8(
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vrhadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u16
-// CHECK: vrhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrhadd_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRHADD_V2_I]]
 uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vrhadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u32
-// CHECK: vrhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrhadd_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRHADD_V2_I]]
 uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vrhadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s8
-// CHECK: vrhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrhaddq_s8(
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
   return vrhaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s16
-// CHECK: vrhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrhaddq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRHADDQ_V2_I]]
 int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
   return vrhaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s32
-// CHECK: vrhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrhaddq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRHADDQ_V2_I]]
 int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
   return vrhaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u8
-// CHECK: vrhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrhaddq_u8(
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vrhaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u16
-// CHECK: vrhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrhaddq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRHADDQ_V2_I]]
 uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vrhaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u32
-// CHECK: vrhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrhaddq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRHADDQ_V2_I]]
 uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vrhaddq_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vrshl_s8
-// CHECK: vrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_s8(
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
   return vrshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s16
-// CHECK: vrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSHL_V2_I]]
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
   return vrshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s32
-// CHECK: vrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSHL_V2_I]]
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
   return vrshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s64
-// CHECK: vrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VRSHL_V2_I]]
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
   return vrshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u8
-// CHECK: vrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_u8(
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
   return vrshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u16
-// CHECK: vrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSHL_V2_I]]
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
   return vrshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u32
-// CHECK: vrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSHL_V2_I]]
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
   return vrshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u64
-// CHECK: vrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VRSHL_V2_I]]
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
   return vrshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s8
-// CHECK: vrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_s8(
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
   return vrshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s16
-// CHECK: vrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRSHLQ_V2_I]]
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
   return vrshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s32
-// CHECK: vrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRSHLQ_V2_I]]
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
   return vrshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s64
-// CHECK: vrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VRSHLQ_V2_I]]
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
   return vrshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u8
-// CHECK: vrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_u8(
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vrshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u16
-// CHECK: vrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VRSHLQ_V2_I]]
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vrshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u32
-// CHECK: vrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VRSHLQ_V2_I]]
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vrshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u64
-// CHECK: vrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VRSHLQ_V2_I]]
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vrshlq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vrshrn_n_s16
-// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
   return vrshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_s32
-// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
   return vrshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_s64
-// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
   return vrshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_u16
-// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
   return vrshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_u32
-// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
   return vrshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_u64
-// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
   return vrshrn_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vrshr_n_s8
-// CHECK: vrshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_s8(
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 int8x8_t test_vrshr_n_s8(int8x8_t a) {
   return vrshr_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_s16
-// CHECK: vrshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 int16x4_t test_vrshr_n_s16(int16x4_t a) {
   return vrshr_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_s32
-// CHECK: vrshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 int32x2_t test_vrshr_n_s32(int32x2_t a) {
   return vrshr_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_s64
-// CHECK: vrshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 int64x1_t test_vrshr_n_s64(int64x1_t a) {
   return vrshr_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u8
-// CHECK: vrshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_u8(
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
   return vrshr_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u16
-// CHECK: vrshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
   return vrshr_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u32
-// CHECK: vrshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
   return vrshr_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u64
-// CHECK: vrshr.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshr_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
   return vrshr_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s8
-// CHECK: vrshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_s8(
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
   return vrshrq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s16
-// CHECK: vrshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
   return vrshrq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s32
-// CHECK: vrshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
   return vrshrq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s64
-// CHECK: vrshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
   return vrshrq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u8
-// CHECK: vrshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_u8(
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
   return vrshrq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u16
-// CHECK: vrshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
   return vrshrq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u32
-// CHECK: vrshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
   return vrshrq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u64
-// CHECK: vrshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrshrq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
   return vrshrq_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vrsqrte_f32
-// CHECK: vrsqrte.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrsqrte_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4
+// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
   return vrsqrte_f32(a);
 }
 
-// CHECK-LABEL: test_vrsqrte_u32
-// CHECK: vrsqrte.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrsqrte_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a) #4
+// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
 uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
   return vrsqrte_u32(a);
 }
 
-// CHECK-LABEL: test_vrsqrteq_f32
-// CHECK: vrsqrte.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsqrteq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4
+// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
   return vrsqrteq_f32(a);
 }
 
-// CHECK-LABEL: test_vrsqrteq_u32
-// CHECK: vrsqrte.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsqrteq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a) #4
+// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
 uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
   return vrsqrteq_u32(a);
 }
 
-
-// CHECK-LABEL: test_vrsqrts_f32
-// CHECK: vrsqrts.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vrsqrts_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b) #4
+// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x float> [[VRSQRTS_V2_I]]
 float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
   return vrsqrts_f32(a, b);
 }
 
-// CHECK-LABEL: test_vrsqrtsq_f32
-// CHECK: vrsqrts.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsqrtsq_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b) #4
+// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x float> [[VRSQRTSQ_V2_I]]
 float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
   return vrsqrtsq_f32(a, b);
 }
 
-
-// CHECK-LABEL: test_vrsra_n_s8
-// CHECK: vrsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_s8(
+// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
+// CHECK:   ret <8 x i8> [[VRSRA_N]]
 int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
   return vrsra_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_s16
-// CHECK: vrsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i16> [[VRSRA_N]]
 int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
   return vrsra_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_s32
-// CHECK: vrsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i32> [[VRSRA_N]]
 int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
   return vrsra_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_s64
-// CHECK: vrsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <1 x i64> [[VRSRA_N]]
 int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
   return vrsra_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u8
-// CHECK: vrsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_u8(
+// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
+// CHECK:   ret <8 x i8> [[VRSRA_N]]
 uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
   return vrsra_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u16
-// CHECK: vrsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i16> [[VRSRA_N]]
 uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
   return vrsra_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u32
-// CHECK: vrsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i32> [[VRSRA_N]]
 uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
   return vrsra_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u64
-// CHECK: vrsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsra_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <1 x i64> [[VRSRA_N]]
 uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vrsra_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s8
-// CHECK: vrsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_s8(
+// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
+// CHECK:   ret <16 x i8> [[VRSRA_N]]
 int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
   return vrsraq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s16
-// CHECK: vrsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i16> [[VRSRA_N]]
 int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
   return vrsraq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s32
-// CHECK: vrsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i32> [[VRSRA_N]]
 int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
   return vrsraq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s64
-// CHECK: vrsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i64> [[VRSRA_N]]
 int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
   return vrsraq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u8
-// CHECK: vrsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_u8(
+// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
+// CHECK:   ret <16 x i8> [[VRSRA_N]]
 uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vrsraq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u16
-// CHECK: vrsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i16> [[VRSRA_N]]
 uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vrsraq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u32
-// CHECK: vrsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i32> [[VRSRA_N]]
 uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vrsraq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u64
-// CHECK: vrsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vrsraq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i64> [[VRSRA_N]]
 uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vrsraq_n_u64(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vrsubhn_s16
-// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsubhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
   return vrsubhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_s32
-// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsubhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSUBHN_V2_I]]
 int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
   return vrsubhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_s64
-// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsubhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSUBHN_V2_I]]
 int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
   return vrsubhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_u16
-// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsubhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
   return vrsubhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_u32
-// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsubhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VRSUBHN_V2_I]]
 uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
   return vrsubhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_u64
-// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vrsubhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VRSUBHN_V2_I]]
 uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
   return vrsubhn_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vset_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vset_lane_u8(
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
   return vset_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vset_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vset_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
   return vset_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_u32
-// CHECK: mov 
+// CHECK-LABEL: @test_vset_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
   return vset_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vset_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vset_lane_s8(
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
   return vset_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vset_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vset_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
   return vset_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_s32
-// CHECK: mov 
+// CHECK-LABEL: @test_vset_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
   return vset_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vset_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vset_lane_p8(
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
   return vset_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vset_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vset_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
   return vset_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_f32
-// CHECK: mov 
+// CHECK-LABEL: @test_vset_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VSET_LANE]]
 float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
   return vset_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vset_lane_f16
-// CHECK: mov 
+// CHECK-LABEL: @test_vset_lane_f16(
+// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
+// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1
+// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
+// CHECK:   ret <4 x half> [[TMP8]]
 float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
   return vset_lane_f16(*a, b, 1);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_u8(
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
   return vsetq_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
   return vsetq_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
   return vsetq_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_s8(
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
   return vsetq_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
   return vsetq_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
   return vsetq_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vsetq_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_p8(
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
   return vsetq_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vsetq_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
   return vsetq_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vsetq_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VSET_LANE]]
 float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
   return vsetq_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vsetq_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_f16(
+// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
+// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
+// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
+// CHECK:   ret <8 x half> [[TMP8]]
 float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
   return vsetq_lane_f16(*a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_s64
-// The optimizer is able to get rid of all moves now.
+// CHECK-LABEL: @test_vset_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
   return vset_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vset_lane_u64
-// The optimizer is able to get rid of all moves now.
+// CHECK-LABEL: @test_vset_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
   return vset_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s64
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
   return vsetq_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u64
-// CHECK: vmov 
+// CHECK-LABEL: @test_vsetq_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
   return vsetq_lane_u64(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vshl_s8
-// CHECK: vshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_s8(
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
   return vshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vshl_s16
-// CHECK: vshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VSHL_V2_I]]
 int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
   return vshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vshl_s32
-// CHECK: vshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VSHL_V2_I]]
 int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
   return vshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vshl_s64
-// CHECK: vshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VSHL_V2_I]]
 int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
   return vshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u8
-// CHECK: vshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_u8(
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
   return vshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u16
-// CHECK: vshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <4 x i16> [[VSHL_V2_I]]
 uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
   return vshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u32
-// CHECK: vshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <2 x i32> [[VSHL_V2_I]]
 uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
   return vshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u64
-// CHECK: vshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   ret <1 x i64> [[VSHL_V2_I]]
 uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
   return vshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s8
-// CHECK: vshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_s8(
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
   return vshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s16
-// CHECK: vshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VSHLQ_V2_I]]
 int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
   return vshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s32
-// CHECK: vshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VSHLQ_V2_I]]
 int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
   return vshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s64
-// CHECK: vshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VSHLQ_V2_I]]
 int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
   return vshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u8
-// CHECK: vshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_u8(
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
   return vshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u16
-// CHECK: vshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <8 x i16> [[VSHLQ_V2_I]]
 uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
   return vshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u32
-// CHECK: vshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <4 x i32> [[VSHLQ_V2_I]]
 uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
   return vshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u64
-// CHECK: vshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   ret <2 x i64> [[VSHLQ_V2_I]]
 uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
   return vshlq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vshll_n_s8
-// CHECK: vshll.s8 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshll_n_s8(
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
   return vshll_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_s16
-// CHECK: vshll.s16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshll_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
   return vshll_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_s32
-// CHECK: vshll.s32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshll_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
   return vshll_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_u8
-// CHECK: vshll.u8 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshll_n_u8(
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
   return vshll_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_u16
-// CHECK: vshll.u16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshll_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
   return vshll_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_u32
-// CHECK: vshll.u32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshll_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
   return vshll_n_u32(a, 1);
 }
 
-
-// CHECK-LABEL: test_vshl_n_s8
-// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_s8(
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 int8x8_t test_vshl_n_s8(int8x8_t a) {
   return vshl_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_s16
-// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 int16x4_t test_vshl_n_s16(int16x4_t a) {
   return vshl_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_s32
-// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 int32x2_t test_vshl_n_s32(int32x2_t a) {
   return vshl_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_s64
-// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 int64x1_t test_vshl_n_s64(int64x1_t a) {
   return vshl_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u8
-// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_u8(
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 uint8x8_t test_vshl_n_u8(uint8x8_t a) {
   return vshl_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u16
-// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 uint16x4_t test_vshl_n_u16(uint16x4_t a) {
   return vshl_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u32
-// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 uint32x2_t test_vshl_n_u32(uint32x2_t a) {
   return vshl_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u64
-// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshl_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
   return vshl_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s8
-// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_s8(
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 int8x16_t test_vshlq_n_s8(int8x16_t a) {
   return vshlq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s16
-// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 int16x8_t test_vshlq_n_s16(int16x8_t a) {
   return vshlq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s32
-// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 int32x4_t test_vshlq_n_s32(int32x4_t a) {
   return vshlq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s64
-// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 int64x2_t test_vshlq_n_s64(int64x2_t a) {
   return vshlq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u8
-// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_u8(
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
   return vshlq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u16
-// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
   return vshlq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u32
-// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
   return vshlq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u64
-// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshlq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
   return vshlq_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vshrn_n_s16
-// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrn_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 int8x8_t test_vshrn_n_s16(int16x8_t a) {
   return vshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_s32
-// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrn_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 int16x4_t test_vshrn_n_s32(int32x4_t a) {
   return vshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_s64
-// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrn_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 int32x2_t test_vshrn_n_s64(int64x2_t a) {
   return vshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_u16
-// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrn_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
   return vshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_u32
-// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrn_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
   return vshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_u64
-// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrn_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
   return vshrn_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vshr_n_s8
-// CHECK: vshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_s8(
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 int8x8_t test_vshr_n_s8(int8x8_t a) {
   return vshr_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_s16
-// CHECK: vshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 int16x4_t test_vshr_n_s16(int16x4_t a) {
   return vshr_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_s32
-// CHECK: vshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 int32x2_t test_vshr_n_s32(int32x2_t a) {
   return vshr_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_s64
-// CHECK: vshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 int64x1_t test_vshr_n_s64(int64x1_t a) {
   return vshr_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u8
-// CHECK: vshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_u8(
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 uint8x8_t test_vshr_n_u8(uint8x8_t a) {
   return vshr_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u16
-// CHECK: vshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 uint16x4_t test_vshr_n_u16(uint16x4_t a) {
   return vshr_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u32
-// CHECK: vshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 uint32x2_t test_vshr_n_u32(uint32x2_t a) {
   return vshr_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u64
-// CHECK: vshr.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshr_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
   return vshr_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s8
-// CHECK: vshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_s8(
+// CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 int8x16_t test_vshrq_n_s8(int8x16_t a) {
   return vshrq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s16
-// CHECK: vshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 int16x8_t test_vshrq_n_s16(int16x8_t a) {
   return vshrq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s32
-// CHECK: vshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 int32x4_t test_vshrq_n_s32(int32x4_t a) {
   return vshrq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s64
-// CHECK: vshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 int64x2_t test_vshrq_n_s64(int64x2_t a) {
   return vshrq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u8
-// CHECK: vshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_u8(
+// CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
   return vshrq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u16
-// CHECK: vshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
   return vshrq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u32
-// CHECK: vshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
   return vshrq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u64
-// CHECK: vshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vshrq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
   return vshrq_n_u64(a, 1);
 }
 
-
-// CHECK-LABEL: test_vsli_n_s8
-// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_s8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
   return vsli_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_s16
-// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
   return vsli_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_s32
-// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
   return vsli_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_s64
-// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
   return vsli_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u8
-// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_u8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsli_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u16
-// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsli_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u32
-// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsli_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u64
-// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsli_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_p8
-// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_p8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
   return vsli_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_p16
-// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsli_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
   return vsli_n_p16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s8
-// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_s8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
   return vsliq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s16
-// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
   return vsliq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s32
-// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
   return vsliq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s64
-// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
   return vsliq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u8
-// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_u8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsliq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u16
-// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsliq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u32
-// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsliq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u64
-// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsliq_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_p8
-// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_p8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
   return vsliq_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_p16
-// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsliq_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
   return vsliq_n_p16(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vsra_n_s8
-// CHECK: vsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_s8(
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
   return vsra_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_s16
-// CHECK: vsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
   return vsra_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_s32
-// CHECK: vsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
   return vsra_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_s64
-// CHECK: vsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
   return vsra_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u8
-// CHECK: vsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_u8(
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsra_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u16
-// CHECK: vsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsra_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u32
-// CHECK: vsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsra_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u64
-// CHECK: vsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsra_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsra_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s8
-// CHECK: vsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_s8(
+// CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
   return vsraq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s16
-// CHECK: vsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
   return vsraq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s32
-// CHECK: vsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
   return vsraq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s64
-// CHECK: vsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
   return vsraq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u8
-// CHECK: vsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_u8(
+// CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsraq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u16
-// CHECK: vsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsraq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u32
-// CHECK: vsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsraq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u64
-// CHECK: vsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsraq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsraq_n_u64(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vsri_n_s8
-// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_s8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
   return vsri_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_s16
-// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
   return vsri_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_s32
-// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
   return vsri_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_s64
-// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
   return vsri_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u8
-// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_u8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsri_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u16
-// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsri_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u32
-// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsri_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u64
-// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsri_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_p8
-// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_p8(
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
   return vsri_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_p16
-// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsri_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
   return vsri_n_p16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s8
-// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_s8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
   return vsriq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s16
-// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
   return vsriq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s32
-// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
   return vsriq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s64
-// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
   return vsriq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u8
-// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_u8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsriq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u16
-// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsriq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u32
-// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsriq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u64
-// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsriq_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_p8
-// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_p8(
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
   return vsriq_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_p16
-// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: @test_vsriq_n_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
   return vsriq_n_p16(a, b, 1);
 }
 
-
-// CHECK-LABEL: test_vst1q_u8
-// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_u8(
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
   vst1q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_u16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
   vst1q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_u32
-// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
   vst1q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_u64
-// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vst1q_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
   vst1q_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s8
-// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_s8(
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1q_s8(int8_t * a, int8x16_t b) {
   vst1q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_s16(int16_t * a, int16x8_t b) {
   vst1q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s32
-// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_s32(int32_t * a, int32x4_t b) {
   vst1q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s64
-// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vst1q_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_s64(int64_t * a, int64x2_t b) {
   vst1q_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_f16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_f16(float16_t * a, float16x8_t b) {
   vst1q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_f32
-// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* [[TMP0]], <4 x float> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_f32(float32_t * a, float32x4_t b) {
   vst1q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_p8
-// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_p8(
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
   vst1q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_p16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
   vst1q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u8
-// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_u8(
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1_u8(uint8_t * a, uint8x8_t b) {
   vst1_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_u16(uint16_t * a, uint16x4_t b) {
   vst1_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u32
-// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_u32(uint32_t * a, uint32x2_t b) {
   vst1_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u64
-// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vst1_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_u64(uint64_t * a, uint64x1_t b) {
   vst1_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s8
-// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_s8(
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1_s8(int8_t * a, int8x8_t b) {
   vst1_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_s16(int16_t * a, int16x4_t b) {
   vst1_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s32
-// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_s32(int32_t * a, int32x2_t b) {
   vst1_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s64
-// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: @test_vst1_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_s64(int64_t * a, int64x1_t b) {
   vst1_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst1_f16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_f16(float16_t * a, float16x4_t b) {
   vst1_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_f32
-// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* [[TMP0]], <2 x float> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_f32(float32_t * a, float32x2_t b) {
   vst1_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst1_p8
-// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_p8(
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1_p8(poly8_t * a, poly8x8_t b) {
   vst1_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst1_p16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_p16(poly16_t * a, poly16x4_t b) {
   vst1_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vst1q_lane_u8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_lane_u8(
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
   vst1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vst1q_lane_u16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1q_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
   vst1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1q_lane_u32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vst1q_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
   vst1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1q_lane_u64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: @test_vst1q_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
+// CHECK:   ret void
 void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
   vst1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_lane_s8(
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
   vst1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1q_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
   vst1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vst1q_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
   vst1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: @test_vst1q_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
+// CHECK:   ret void
 void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
   vst1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1q_lane_f16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1q_lane_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
   vst1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1q_lane_f32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vst1q_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
   vst1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1q_lane_p8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1q_lane_p8(
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
   vst1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vst1q_lane_p16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1q_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
   vst1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_u8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_lane_u8(
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
   vst1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_u16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1_lane_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
   vst1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1_lane_u32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vst1_lane_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
   vst1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1_lane_u64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: @test_vst1_lane_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
   vst1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vst1_lane_s8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_lane_s8(
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
   vst1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_s16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1_lane_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
   vst1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1_lane_s32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vst1_lane_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
   vst1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1_lane_s64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: @test_vst1_lane_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
   vst1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vst1_lane_f16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1_lane_f16(
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
   vst1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1_lane_f32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: @test_vst1_lane_f32(
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
   vst1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1_lane_p8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst1_lane_p8(
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
   vst1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_p16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: @test_vst1_lane_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
   vst1_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vst2q_u8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
   vst2q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_u16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
   vst2q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_u32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
   vst2q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_s8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
   vst2q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_s16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
   vst2q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_s32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
   vst2q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_f16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
   vst2q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_f32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
   vst2q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_p8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
   vst2q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_p16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
   vst2q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
   vst2_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
   vst2_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
   vst2_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u64
-// CHECK: vst1.64
+// CHECK-LABEL: @test_vst2_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
   vst2_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2_s8(int8_t * a, int8x8x2_t b) {
   vst2_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_s16(int16_t * a, int16x4x2_t b) {
   vst2_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_s32(int32_t * a, int32x2x2_t b) {
   vst2_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s64
-// CHECK: vst1.64
+// CHECK-LABEL: @test_vst2_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_s64(int64_t * a, int64x1x2_t b) {
   vst2_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst2_f16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_f16(float16_t * a, float16x4x2_t b) {
   vst2_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_f32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_f32(float32_t * a, float32x2x2_t b) {
   vst2_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst2_p8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
   vst2_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst2_p16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vst2q_lane_u16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
   vst2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2q_lane_u32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
   vst2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2q_lane_s16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
   vst2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2q_lane_s32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
   vst2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2q_lane_f16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2q_lane_f32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
   vst2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2q_lane_p16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2q_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
   vst2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_u8
-// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
   vst2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_u16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
   vst2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2_lane_u32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
   vst2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst2_lane_s8
-// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
   vst2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_s16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
   vst2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2_lane_s32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
   vst2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst2_lane_f16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
   vst2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2_lane_f32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
   vst2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst2_lane_p8
-// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
   vst2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_p16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst2_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vst3q_u8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
   vst3q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_u16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_u32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
   vst3q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_s8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
   vst3q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_s16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
   vst3q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_s32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
   vst3q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_f16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
   vst3q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_f32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
   vst3q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_p8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
   vst3q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_p16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst3q_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
   vst3q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
   vst3_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
   vst3_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
   vst3_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u64
-// CHECK: vst1.64
+// CHECK-LABEL: @test_vst3_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
   vst3_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3_s8(int8_t * a, int8x8x3_t b) {
   vst3_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_s16(int16_t * a, int16x4x3_t b) {
   vst3_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_s32(int32_t * a, int32x2x3_t b) {
   vst3_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s64
-// CHECK: vst1.64
+// CHECK-LABEL: @test_vst3_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_s64(int64_t * a, int64x1x3_t b) {
   vst3_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst3_f16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_f16(float16_t * a, float16x4x3_t b) {
   vst3_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_f32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_f32(float32_t * a, float32x2x3_t b) {
   vst3_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst3_p8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
   vst3_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst3_p16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
   vst3_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vst3q_lane_u16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3q_lane_u32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
   vst3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3q_lane_s16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
   vst3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3q_lane_s32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
   vst3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3q_lane_f16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
   vst3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3q_lane_f32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
   vst3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3q_lane_p16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst3q_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
   vst3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_u8
-// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
   vst3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_u16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
   vst3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3_lane_u32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
   vst3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst3_lane_s8
-// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
   vst3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_s16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
   vst3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3_lane_s32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
   vst3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst3_lane_f16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
   vst3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3_lane_f32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
   vst3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst3_lane_p8
-// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
   vst3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_p16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst3_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
   vst3_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vst4q_u8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
   vst4q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_u16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
   vst4q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_u32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
   vst4q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_s8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
   vst4q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_s16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
   vst4q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_s32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
   vst4q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_f16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
   vst4q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_f32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
   vst4q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_p8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
   vst4q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_p16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: @test_vst4q_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
   vst4q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
   vst4_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
   vst4_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
   vst4_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u64
-// CHECK: vst1.64
+// CHECK-LABEL: @test_vst4_u64(
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
   vst4_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4_s8(int8_t * a, int8x8x4_t b) {
   vst4_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_s16(int16_t * a, int16x4x4_t b) {
   vst4_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_s32(int32_t * a, int32x2x4_t b) {
   vst4_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s64
-// CHECK: vst1.64
+// CHECK-LABEL: @test_vst4_s64(
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_s64(int64_t * a, int64x1x4_t b) {
   vst4_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst4_f16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_f16(float16_t * a, float16x4x4_t b) {
   vst4_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_f32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_f32(float32_t * a, float32x2x4_t b) {
   vst4_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst4_p8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
   vst4_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst4_p16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
   vst4_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vst4q_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
   vst4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4q_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
   vst4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4q_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
   vst4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4q_lane_s32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
   vst4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4q_lane_f16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
   vst4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4q_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
   vst4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4q_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: @test_vst4q_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
   vst4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_u8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_u8(
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
   vst4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_u16(
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
   vst4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_u32(
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
   vst4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst4_lane_s8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_s8(
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
   vst4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_s16(
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
   vst4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4_lane_s32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_s32(
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
   vst4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst4_lane_f16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_f16(
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_f32(
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
   vst4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst4_lane_p8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_p8(
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
   vst4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: @test_vst4_lane_p16(
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
   vst4_lane_p16(a, b, 3);
 }
 
-
-// CHECK-LABEL: test_vsub_s8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
   return vsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsub_s16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
   return vsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsub_s32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
   return vsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsub_s64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_s64(
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[SUB_I]]
 int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
   return vsub_s64(a, b);
 }
 
-// CHECK-LABEL: test_vsub_f32
-// CHECK: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
   return vsub_f32(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_u8(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
   return vsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_u16(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
   return vsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_u32(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
   return vsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsub_u64(
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[SUB_I]]
 uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
   return vsub_u64(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_s8(
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
   return vsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_s16(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
   return vsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_s32(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
   return vsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_s64(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
   return vsubq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_f32
-// CHECK: vsub.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_f32(
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
   return vsubq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_u8(
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_u16(
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_u32(
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vsubq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubq_u64(
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
   return vsubq_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vsubhn_s16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubhn_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
   return vsubhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_s32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubhn_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
   return vsubhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_s64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubhn_s64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
   return vsubhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_u16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubhn_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
   return vsubhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_u32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubhn_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
   return vsubhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_u64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vsubhn_u64(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
   return vsubhn_u64(a, b);
 }
 
-
-// CHECK-LABEL: test_vsubl_s8
-// CHECK: vsubl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubl_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
   return vsubl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_s16
-// CHECK: vsubl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubl_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
   return vsubl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_s32
-// CHECK: vsubl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubl_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
   return vsubl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_u8
-// CHECK: vsubl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubl_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
   return vsubl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_u16
-// CHECK: vsubl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubl_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
   return vsubl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_u32
-// CHECK: vsubl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubl_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
   return vsubl_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vsubw_s8
-// CHECK: vsubw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubw_s8(
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
   return vsubw_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_s16
-// CHECK: vsubw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubw_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
   return vsubw_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_s32
-// CHECK: vsubw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubw_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
   return vsubw_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_u8
-// CHECK: vsubw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubw_u8(
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
   return vsubw_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_u16
-// CHECK: vsubw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubw_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
   return vsubw_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_u32
-// CHECK: vsubw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vsubw_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
   return vsubw_u32(a, b);
 }
 
-
-// CHECK-LABEL: test_vtbl1_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl1_u8(
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
   return vtbl1_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl1_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl1_s8(
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
   return vtbl1_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl1_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl1_p8(
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
   return vtbl1_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vtbl2_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl2_u8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
   return vtbl2_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl2_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl2_s8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
   return vtbl2_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl2_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl2_p8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
   return vtbl2_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vtbl3_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl3_u8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
   return vtbl3_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl3_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl3_s8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
   return vtbl3_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl3_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl3_p8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
   return vtbl3_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vtbl4_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl4_u8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
   return vtbl4_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl4_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl4_s8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
   return vtbl4_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl4_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbl4_p8(
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
   return vtbl4_p8(a, b);
 }
 
-
-// CHECK-LABEL: test_vtbx1_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx1_u8(
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vtbx1_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx1_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx1_s8(
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vtbx1_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx1_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx1_p8(
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
   return vtbx1_p8(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vtbx2_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx2_u8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
   return vtbx2_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx2_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx2_s8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
   return vtbx2_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx2_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx2_p8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
   return vtbx2_p8(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vtbx3_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx3_u8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
   return vtbx3_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx3_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx3_s8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
   return vtbx3_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx3_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx3_p8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
   return vtbx3_p8(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vtbx4_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx4_u8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
   return vtbx4_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx4_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx4_s8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
   return vtbx4_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx4_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtbx4_p8(
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
   return vtbx4_p8(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vtrn_s8
-// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_s8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !3
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !3
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
   return vtrn_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_s16
-// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_s16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !6
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !6
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
   return vtrn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_s32
-// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_s32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !9
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !9
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
   return vtrn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_u8
-// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_u8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !12
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !12
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
   return vtrn_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_u16
-// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_u16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !15
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !15
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
   return vtrn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_u32
-// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_u32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !18
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !18
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
   return vtrn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_f32
-// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_f32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !noalias !21
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !noalias !21
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
   return vtrn_f32(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_p8
-// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_p8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !24
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !24
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
   return vtrn_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_p16
-// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtrn_p16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !27
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !27
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
   return vtrn_p16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_s8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_s8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !30
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
   return vtrnq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_s16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_s16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !33
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
   return vtrnq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_s32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_s32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !36
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
   return vtrnq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_u8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_u8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !39
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_u16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_u16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !42
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_u32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_u32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !45
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_f32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_f32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !noalias !48
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_p8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_p8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !51
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_p16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtrnq_p16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !54
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vtst_s8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_s8(
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
   return vtst_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtst_s16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i16> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
   return vtst_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtst_s32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <2 x i32> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
   return vtst_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtst_u8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_u8(
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
   return vtst_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtst_u16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i16> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
   return vtst_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtst_u32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <2 x i32> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
   return vtst_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtst_p8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_p8(
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
   return vtst_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtst_p16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vtst_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i16> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
   return vtst_p16(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_s8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_s8(
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
   return vtstq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_s16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_s16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i16> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
   return vtstq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_s32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_s32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i32> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
   return vtstq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_u8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_u8(
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
   return vtstq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_u16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_u16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i16> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
   return vtstq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_u32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_u32(
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <4 x i32> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
   return vtstq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_p8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_p8(
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
   return vtstq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_p16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vtstq_p16(
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i16> %a, %b
+// CHECK:   [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
   return vtstq_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vuzp_s8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_s8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !57
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !57
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
   return vuzp_s8(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_s16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_s16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !60
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !60
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
   return vuzp_s16(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_s32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_s32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !63
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !63
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
   return vuzp_s32(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_u8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_u8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !66
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !66
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp_u8(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_u16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_u16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !69
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !69
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp_u16(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_u32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_u32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !72
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !72
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp_u32(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_f32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_f32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !noalias !75
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !noalias !75
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
   return vuzp_f32(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_p8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_p8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !78
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !78
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp_p8(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_p16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vuzp_p16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !81
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !81
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp_p16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_s8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_s8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !84
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_s16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_s16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !87
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_s32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_s32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !90
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_u8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_u8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !93
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_u16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_u16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !96
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
   return vuzpq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_u32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_u32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !99
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
   return vuzpq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_f32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_f32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !noalias !102
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
   return vuzpq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_p8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_p8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !105
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
   return vuzpq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_p16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vuzpq_p16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !108
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
   return vuzpq_p16(a, b);
 }
 
-
-// CHECK-LABEL: test_vzip_s8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_s8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !111
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !111
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
   return vzip_s8(a, b);
 }
 
-// CHECK-LABEL: test_vzip_s16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_s16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !114
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !114
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
   return vzip_s16(a, b);
 }
 
-// CHECK-LABEL: test_vzip_s32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_s32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !117
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !117
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
   return vzip_s32(a, b);
 }
 
-// CHECK-LABEL: test_vzip_u8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_u8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !120
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !120
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
   return vzip_u8(a, b);
 }
 
-// CHECK-LABEL: test_vzip_u16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_u16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !123
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !123
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
   return vzip_u16(a, b);
 }
 
-// CHECK-LABEL: test_vzip_u32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_u32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !126
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !126
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
   return vzip_u32(a, b);
 }
 
-// CHECK-LABEL: test_vzip_f32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_f32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !noalias !129
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !noalias !129
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
   return vzip_f32(a, b);
 }
 
-// CHECK-LABEL: test_vzip_p8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_p8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !132
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !132
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
   return vzip_p8(a, b);
 }
 
-// CHECK-LABEL: test_vzip_p16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: @test_vzip_p16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !135
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !135
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
   return vzip_p16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_s8
-// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_s8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !138
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   return vzipq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_s16
-// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_s16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !141
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   return vzipq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_s32
-// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_s32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !144
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   return vzipq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_u8
-// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_u8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !147
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   return vzipq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_u16
-// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_u16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !150
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
   return vzipq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_u32
-// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_u32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !153
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
   return vzipq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_f32
-// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_f32(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !noalias !156
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
   return vzipq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_p8
-// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_p8(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !159
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
   return vzipq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_p16
-// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: @test_vzipq_p16(
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !162
+// CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
   return vzipq_p16(a, b);
 }
-
-
diff --git a/test/CodeGen/asm-errors.c b/test/CodeGen/asm-errors.c
index ed7b02b..a959896 100644
--- a/test/CodeGen/asm-errors.c
+++ b/test/CodeGen/asm-errors.c
@@ -1,10 +1,8 @@
 // REQUIRES: x86-registered-target
 
-// RUN: true
-// UN: not %clang_cc1 -triple i386-apple-darwin10 -emit-obj %s -o /dev/null > %t 2>&1
-// UN: FileCheck %s < %t
-// RUN: %clang_cc1 -triple i386-apple-darwin10 -emit-llvm-bc %s -o %t.bc
-// RUN: not %clang_cc1 -triple i386-apple-darwin10 -emit-obj %t.bc -o /dev/null 2>&1 | \
+// RUN: not %clang_cc1 -triple i386-apple-darwin10 -emit-obj %s -o /dev/null > %t 2>&1
+// RUN: FileCheck %s < %t
+// RUN: not %clang -target i386-apple-darwin10 -fembed-bitcode -c %s -o /dev/null 2>&1 | \
 // RUN:   FileCheck --check-prefix=CRASH-REPORT %s
 // CRASH-REPORT: <inline asm>:
 // CRASH-REPORT: error: invalid instruction mnemonic 'abc'
diff --git a/test/CodeGen/atomics-inlining.c b/test/CodeGen/atomics-inlining.c
index 23a79a2..4974f22 100644
--- a/test/CodeGen/atomics-inlining.c
+++ b/test/CodeGen/atomics-inlining.c
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple powerpc64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=PPC64
 // RUN: %clang_cc1 -triple mipsel-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS32
 // RUN: %clang_cc1 -triple mips64el-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64
+// RUN: %clang_cc1 -triple sparc-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARC
 
 unsigned char c1, c2;
 unsigned short s1, s2;
@@ -90,4 +91,16 @@
 // MIPS64: store atomic i64 {{.*}}, i64* @ll1 seq_cst
 // MIPS64: call void @__atomic_load(i64 zeroext 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0)
 // MIPS64: call void @__atomic_store(i64 zeroext 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0)
+
+// SPARC-LABEL: define void @test1
+// SPARC: = load atomic i8, i8* @c1 seq_cst
+// SPARC: store atomic i8 {{.*}}, i8* @c1 seq_cst
+// SPARC: = load atomic i16, i16* @s1 seq_cst
+// SPARC: store atomic i16 {{.*}}, i16* @s1 seq_cst
+// SPARC: = load atomic i32, i32* @i1 seq_cst
+// SPARC: store atomic i32 {{.*}}, i32* @i1 seq_cst
+// SPARC: = load atomic i64, i64* @ll1 seq_cst
+// SPARC: store atomic i64 {{.*}}, i64* @ll1 seq_cst
+// SPARC: call void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0)
+// SPARC: call void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0)
 }
diff --git a/test/CodeGen/attr-func-def.c b/test/CodeGen/attr-func-def.c
index ceafa12..a295488 100644
--- a/test/CodeGen/attr-func-def.c
+++ b/test/CodeGen/attr-func-def.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.10.0 -emit-llvm -Oz -o - %s | FileCheck %s
 
-// CHECK: define i32 @foo2(i32 %a) [[ATTRS2:#[0-9]+]] {
-// CHECK: define i32 @foo1(i32 %a) [[ATTRS1:#[0-9]+]] {
+// CHECK: define i32 @foo2(i32 %a) local_unnamed_addr [[ATTRS2:#[0-9]+]] {
+// CHECK: define i32 @foo1(i32 %a) local_unnamed_addr [[ATTRS1:#[0-9]+]] {
 
 int foo1(int);
 
diff --git a/test/CodeGen/attr-mode-enums.c b/test/CodeGen/attr-mode-enums.c
new file mode 100644
index 0000000..4675f6c
--- /dev/null
+++ b/test/CodeGen/attr-mode-enums.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+// This test checks that the 'mode' attribute is handled correctly with enums, i.e. that
+//   1. "typedef enum { A } __attribute__((mode(HI))) T;" is accepted,
+//   2. "enum X __attribute__((mode(QI))) var;" forms a complete integer type.
+
+int main() {
+  // CHECK: [[X1:%.+]] = alloca i8
+  enum { A1, B1 } __attribute__((mode(QI))) x1 = A1;
+
+  // CHECK: [[X2:%.+]] = alloca i16
+  enum { A2, B2 } x2 __attribute__((mode(HI))) = B2;
+
+  // CHECK: [[X3:%.+]] = alloca i32
+  typedef enum { A3, B3 } __attribute__((mode(SI))) T3;
+  T3 x3 = A3;
+
+  // CHECK: [[X4:%.+]] = alloca i64
+  typedef enum { A4, B4 } T4 __attribute__((mode(DI)));
+  T4 x4 = B4;
+
+  // CHECK: [[X5:%.+]] = alloca i8
+  typedef enum __attribute__((mode(QI))) { A5, B5 } T5;
+  T5 x5 = A5;
+
+  // CHECK: [[X6:%.+]] = alloca i8
+  typedef enum X __attribute__((mode(QI))) T6;
+  T6 x6;
+
+  // CHECK: [[X7:%.+]] = alloca i128
+  enum { A7, B7 } __attribute__((mode(TI))) x7 = A7;
+
+  // CHECK: [[X8:%.+]] = alloca i8
+  enum __attribute__((mode(QI))) { A8, B8 } x8 = B8;
+
+  // CHECK: store i8 0, i8* [[X1]]
+  // CHECK: store i16 1, i16* [[X2]]
+  // CHECK: store i32 0, i32* [[X3]]
+  // CHECK: store i64 1, i64* [[X4]]
+  // CHECK: store i8 0, i8* [[X5]]
+  // CHECK: store i128 0, i128* [[X7]]
+  // CHECK: store i8 1, i8* [[X8]]
+
+  return x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8;
+}
diff --git a/test/CodeGen/attr-target-x86-mmx.c b/test/CodeGen/attr-target-x86-mmx.c
index 6720c6b..412e8e9 100644
--- a/test/CodeGen/attr-target-x86-mmx.c
+++ b/test/CodeGen/attr-target-x86-mmx.c
@@ -19,4 +19,4 @@
   _mm_srai_pi32(a, c);
 }
 
-// CHECK: "target-features"="+mmx,+sse"
+// CHECK: "target-features"="+mmx,+sse,+x87"
diff --git a/test/CodeGen/attr-target-x86.c b/test/CodeGen/attr-target-x86.c
index 58e33d1..7557ec7 100644
--- a/test/CodeGen/attr-target-x86.c
+++ b/test/CodeGen/attr-target-x86.c
@@ -18,6 +18,8 @@
 
 int __attribute__((target("no-mmx"))) qq(int a) { return 40; }
 
+int __attribute__((target("arch=lakemont"))) lake(int a) { return 4; }
+
 // Check that we emit the additional subtarget and cpu features for foo and not for baz or bar.
 // CHECK: baz{{.*}} #0
 // CHECK: foo{{.*}} #1
@@ -31,9 +33,11 @@
 // CHECK: qux{{.*}} #1
 // CHECK: qax{{.*}} #4
 // CHECK: qq{{.*}} #5
-// CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2"
-// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt"
-// CHECK: #2 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,-aes,-avx,-avx2,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-f16c,-fma,-fma4,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-xop,-xsave,-xsaveopt"
-// CHECK: #3 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3"
-// CHECK: #4 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-aes"
-// CHECK: #5 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+sse,+sse2,-3dnow,-3dnowa,-mmx"
+// CHECK: lake{{.*}} #6
+// CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
+// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
+// CHECK: #2 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+x87,-aes,-avx,-avx2,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vl,-f16c,-fma,-fma4,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-xop,-xsave,-xsaveopt"
+// CHECK: #3 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87"
+// CHECK: #4 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes"
+// CHECK: #5 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+sse,+sse2,+x87,-3dnow,-3dnowa,-mmx"
+// CHECK: #6 = {{.*}}"target-cpu"="lakemont" "target-features"="+mmx,+sse,+sse2"
diff --git a/test/CodeGen/attr-target-x87-softfp.c b/test/CodeGen/attr-target-x87-softfp.c
new file mode 100644
index 0000000..16b7cfe
--- /dev/null
+++ b/test/CodeGen/attr-target-x87-softfp.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu x86-64 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=HARD
+// RUN: %clang_cc1 -msoft-float -triple x86_64-linux-gnu -target-cpu x86-64 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT
+
+int __attribute__((target("x87"))) foo(int a) { return 4; }
+int __attribute__((target("no-x87"))) bar(int a) { return 4; }
+
+// CHECK: foo{{.*}} #0
+// CHECK: bar{{.*}} #1
+
+// CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
+// HARD: "use-soft-float"="false"
+// SOFT: "use-soft-float"="true"
+
+// CHECK: #1 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,-x87"
+// HARD: "use-soft-float"="false"
+// SOFT: "use-soft-float"="true"
diff --git a/test/CodeGen/attr-used.c b/test/CodeGen/attr-used.c
index bc92b94..de38b51 100644
--- a/test/CodeGen/attr-used.c
+++ b/test/CodeGen/attr-used.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm -o %t %s
+// RUN: grep '@llvm.used = .*@a0' %t
 // RUN: grep '@llvm.used = .*@g0' %t
 // RUN: grep '@llvm.used = .*@f0' %t
 // RUN: grep '@llvm.used = .*@f1.l0' %t
@@ -12,3 +13,6 @@
 void f1() { 
   static int l0 __attribute__((used)) = 5225; 
 }
+
+__attribute__((used)) int a0;
+void pr27535() { (void)a0; }
diff --git a/test/CodeGen/attr-x86-interrupt.c b/test/CodeGen/attr-x86-interrupt.c
index dcc8ab6..0aca1f5 100644
--- a/test/CodeGen/attr-x86-interrupt.c
+++ b/test/CodeGen/attr-x86-interrupt.c
@@ -15,12 +15,20 @@
 // X86_64_LINUX: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i64)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
 // X86_64_LINUX: define x86_intrcc void @foo7(i32* %{{.+}}, i64 %{{.+}})
 // X86_64_LINUX: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_64_LINUX: "disable-tail-calls"="true"
+// X86_64_LINUX-NOT: "disable-tail-calls"="false"
 // X86_LINUX: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i32)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
 // X86_LINUX: define x86_intrcc void @foo7(i32* %{{.+}}, i32 %{{.+}})
 // X86_LINUX: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_LINUX: "disable-tail-calls"="true"
+// X86_LINUX-NOT: "disable-tail-calls"="false"
 // X86_64_WIN: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i64)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
 // X86_64_WIN: define x86_intrcc void @foo7(i32* %{{.+}}, i64 %{{.+}})
 // X86_64_WIN: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_64_WIN: "disable-tail-calls"="true"
+// X86_64_WIN-NOT: "disable-tail-calls"="false"
 // X86_WIN: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i32)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
 // X86_WIN: define x86_intrcc void @foo7(i32* %{{.+}}, i32 %{{.+}})
 // X86_WIN: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_WIN: "disable-tail-calls"="true"
+// X86_WIN-NOT: "disable-tail-calls"="false"
diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c
index ee0f58f..8be03c3 100644
--- a/test/CodeGen/avx-builtins.c
+++ b/test/CodeGen/avx-builtins.c
@@ -1,154 +1,1325 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
-#include <immintrin.h>
+#include <x86intrin.h>
 
-//
-// Test LLVM IR codegen of shuffle instructions
-//
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
 
-__m256 test__mm256_loadu_ps(void* p) {
-  // CHECK: load <8 x float>, <8 x float>* %{{.*}}, align 1
-  return _mm256_loadu_ps(p);
+__m256d test_mm256_add_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_add_pd
+  // CHECK: fadd <4 x double>
+  return _mm256_add_pd(A, B);
 }
 
-__m256d test__mm256_loadu_pd(void* p) {
-  // CHECK: load <4 x double>, <4 x double>* %{{.*}}, align 1
-  return _mm256_loadu_pd(p);
+__m256 test_mm256_add_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_add_ps
+  // CHECK: fadd <8 x float>
+  return _mm256_add_ps(A, B);
 }
 
-__m256i test__mm256_loadu_si256(void* p) {
-  // CHECK: load <4 x i64>, <4 x i64>* %{{.+}}, align 1
-  return _mm256_loadu_si256(p);
+__m256d test_mm256_addsub_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_addsub_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_addsub_pd(A, B);
 }
 
-__m128i test_mm_cmpestrm(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestrm128
-  return _mm_cmpestrm(A, LA, B, LB, 7);
+__m256 test_mm256_addsub_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_addsub_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_addsub_ps(A, B);
 }
 
-int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestri128
-  return _mm_cmpestri(A, LA, B, LB, 7);
+__m256d test_mm256_and_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_and_pd
+  // CHECK: and <4 x i64>
+  return _mm256_and_pd(A, B);
 }
 
-int test_mm_cmpestra(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestria128
-  return _mm_cmpestra(A, LA, B, LB, 7);
+__m256 test_mm256_and_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_and_ps
+  // CHECK: and <8 x i32>
+  return _mm256_and_ps(A, B);
 }
 
-int test_mm_cmpestrc(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestric128
-  return _mm_cmpestrc(A, LA, B, LB, 7);
+__m256d test_mm256_andnot_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_andnot_pd
+  // CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <4 x i64>
+  return _mm256_andnot_pd(A, B);
 }
 
-int test_mm_cmpestro(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestrio128
-  return _mm_cmpestro(A, LA, B, LB, 7);
+__m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_andnot_ps
+  // CHECK: xor <8 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <8 x i32>
+  return _mm256_andnot_ps(A, B);
 }
 
-int test_mm_cmpestrs(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestris128
-  return _mm_cmpestrs(A, LA, B, LB, 7);
-}
-
-int test_mm_cmpestrz(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestriz128
-  return _mm_cmpestrz(A, LA, B, LB, 7);
-}
-
-__m128i test_mm_cmpistrm(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistrm128
-  return _mm_cmpistrm(A, B, 7);
-}
-
-int test_mm_cmpistri(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistri128
-  return _mm_cmpistri(A, B, 7);
-}
-
-int test_mm_cmpistra(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistria128
-  return _mm_cmpistra(A, B, 7);
-}
-
-int test_mm_cmpistrc(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistric128
-  return _mm_cmpistrc(A, B, 7);
-}
-
-int test_mm_cmpistro(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistrio128
-  return _mm_cmpistro(A, B, 7);
-}
-
-int test_mm_cmpistrs(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistris128
-  return _mm_cmpistrs(A, B, 7);
-}
-
-int test_mm_cmpistrz(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistriz128
-  return _mm_cmpistrz(A, B, 7);
-}
-
-int test_extract_epi32(__m256i __a) {
-  // CHECK-LABEL: @test_extract_epi32
-  // CHECK: [[SHIFT1:%[^ ]+]] = and i32 %{{.*}}, 7
-  // CHECK: extractelement <8 x i32> %{{.*}}, i32 [[SHIFT1]]
-  return _mm256_extract_epi32(__a, 8);
-}
-
-int test_extract_epi16(__m256i __a) {
-  // CHECK-LABEL: @test_extract_epi16
-  // CHECK: [[SHIFT2:%[^ ]+]] = and i32 %{{.*}}, 15
-  // CHECK: extractelement <16 x i16> %{{.*}}, i32 [[SHIFT2]]
-  return _mm256_extract_epi16(__a, 16);
-}
-
-int test_extract_epi8(__m256i __a) {
-  // CHECK-LABEL: @test_extract_epi8
-  // CHECK: [[SHIFT3:%[^ ]+]] = and i32 %{{.*}}, 31
-  // CHECK: extractelement <32 x i8> %{{.*}}, i32 [[SHIFT3]]
-  return _mm256_extract_epi8(__a, 32);
-}
-
-__m256d test_256_blend_pd(__m256d __a, __m256d __b) {
-  // CHECK-LABEL: @test_256_blend_pd
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_blend_pd
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  return _mm256_blend_pd(__a, __b, 0x35);
+  return _mm256_blend_pd(A, B, 0x35);
 }
 
-__m256 test_256_blend_ps(__m256 __a, __m256 __b) {
-  // CHECK-LABEL: @test_256_blend_ps
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_blend_ps
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
-  return _mm256_blend_ps(__a, __b, 0x35);
+  return _mm256_blend_ps(A, B, 0x35);
 }
 
-__m256i test_256_insert_epi8(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi8
-  // CHECK: insertelement <32 x i8> {{.*}}, i8 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi8(__a, 42, 3);
+__m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) {
+  // CHECK-LABEL: test_mm256_blendv_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_blendv_pd(V1, V2, V3);
 }
 
-__m256i test_256_insert_epi16(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi16
-  // CHECK: insertelement <16 x i16> {{.*}}, i16 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi16(__a, 42, 3);
+__m256 test_mm256_blendv_ps(__m256 V1, __m256 V2, __m256 V3) {
+  // CHECK-LABEL: test_mm256_blendv_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_blendv_ps(V1, V2, V3);
 }
 
-__m256i test_256_insert_epi32(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi32
-  // CHECK: insertelement <8 x i32> {{.*}}, i32 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi32(__a, 42, 3);
+__m256d test_mm256_broadcast_pd(__m128d* A) {
+  // CHECK-LABEL: test_mm256_broadcast_pd
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  return _mm256_broadcast_pd(A);
 }
 
-__m256i test_256_insert_epi64(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi64
-  // CHECK: insertelement <4 x i64> {{.*}}, i64 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi64(__a, 42, 3);
+__m256 test_mm256_broadcast_ps(__m128* A) {
+  // CHECK-LABEL: test_mm256_broadcast_ps
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  return _mm256_broadcast_ps(A);
+}
+
+__m256d test_mm256_broadcast_sd(double* A) {
+  // CHECK-LABEL: test_mm256_broadcast_sd
+  // CHECK: load double, double* %{{.*}}
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_broadcast_sd(A);
+}
+
+__m128 test_mm_broadcast_ss(float* A) {
+  // CHECK-LABEL: test_mm_broadcast_ss
+  // CHECK: load float, float* %{{.*}}
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
+  return _mm_broadcast_ss(A);
+}
+
+__m256 test_mm256_broadcast_ss(float* A) {
+  // CHECK-LABEL: test_mm256_broadcast_ss
+  // CHECK: load float, float* %{{.*}}
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_broadcast_ss(A);
+}
+
+__m256 test_mm256_castpd_ps(__m256d A) {
+  // CHECK-LABEL: test_mm256_castpd_ps
+  // CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+  return _mm256_castpd_ps(A);
+}
+
+__m256i test_mm256_castpd_si256(__m256d A) {
+  // CHECK-LABEL: test_mm256_castpd_si256
+  // CHECK: bitcast <4 x double> %{{.*}} to <4 x i64>
+  return _mm256_castpd_si256(A);
+}
+
+__m256d test_mm256_castpd128_pd256(__m128d A) {
+  // CHECK-LABEL: test_mm256_castpd128_pd256
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  return _mm256_castpd128_pd256(A);
+}
+
+__m128d test_mm256_castpd256_pd128(__m256d A) {
+  // CHECK-LABEL: test_mm256_castpd256_pd128
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  return _mm256_castpd256_pd128(A);
+}
+
+__m256d test_mm256_castps_pd(__m256 A) {
+  // CHECK-LABEL: test_mm256_castps_pd
+  // CHECK: bitcast <8 x float> %{{.*}} to <4 x double>
+  return _mm256_castps_pd(A);
+}
+
+__m256i test_mm256_castps_si256(__m256 A) {
+  // CHECK-LABEL: test_mm256_castps_si256
+  // CHECK: bitcast <8 x float> %{{.*}} to <4 x i64>
+  return _mm256_castps_si256(A);
+}
+
+__m256 test_mm256_castps128_ps256(__m128 A) {
+  // CHECK-LABEL: test_mm256_castps128_ps256
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm256_castps128_ps256(A);
+}
+
+__m128 test_mm256_castps256_ps128(__m256 A) {
+  // CHECK-LABEL: test_mm256_castps256_ps128
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm256_castps256_ps128(A);
+}
+
+__m256i test_mm256_castsi128_si256(__m128i A) {
+  // CHECK-LABEL: test_mm256_castsi128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  return _mm256_castsi128_si256(A);
+}
+
+__m256d test_mm256_castsi256_pd(__m256i A) {
+  // CHECK-LABEL: test_mm256_castsi256_pd
+  // CHECK: bitcast <4 x i64> %{{.*}} to <4 x double>
+  return _mm256_castsi256_pd(A);
+}
+
+__m256 test_mm256_castsi256_ps(__m256i A) {
+  // CHECK-LABEL: test_mm256_castsi256_ps
+  // CHECK: bitcast <4 x i64> %{{.*}} to <8 x float>
+  return _mm256_castsi256_ps(A);
+}
+
+__m128i test_mm256_castsi256_si128(__m256i A) {
+  // CHECK-LABEL: test_mm256_castsi256_si128
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  return _mm256_castsi256_si128(A);
+}
+
+__m256d test_mm256_ceil_pd(__m256d x) {
+  // CHECK-LABEL: test_mm256_ceil_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+  return _mm256_ceil_pd(x);
+}
+
+__m256 test_mm_ceil_ps(__m256 x) {
+  // CHECK-LABEL: test_mm_ceil_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
+  return _mm256_ceil_ps(x);
+}
+
+__m128d test_mm_cmp_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_cmp_pd
+  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 13)
+  return _mm_cmp_pd(A, B, _CMP_GE_OS);
+}
+
+__m256d test_mm256_cmp_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_cmp_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 13)
+  return _mm256_cmp_pd(A, B, _CMP_GE_OS);
+}
+
+__m128 test_mm_cmp_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_cmp_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 13)
+  return _mm_cmp_ps(A, B, _CMP_GE_OS);
+}
+
+__m256 test_mm256_cmp_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_cmp_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 13)
+  return _mm256_cmp_ps(A, B, _CMP_GE_OS);
+}
+
+__m128d test_mm_cmp_sd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_cmp_sd
+  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 13)
+  return _mm_cmp_sd(A, B, _CMP_GE_OS);
+}
+
+__m128 test_mm_cmp_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_cmp_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 13)
+  return _mm_cmp_ss(A, B, _CMP_GE_OS);
+}
+
+__m256d test_mm256_cvtepi32_pd(__m128i A) {
+  // CHECK-LABEL: test_mm256_cvtepi32_pd
+  // CHECK: sitofp <4 x i32> %{{.*}} to <4 x double>
+  return _mm256_cvtepi32_pd(A);
+}
+
+__m256 test_mm256_cvtepi32_ps(__m256i A) {
+  // CHECK-LABEL: test_mm256_cvtepi32_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %{{.*}})
+  return _mm256_cvtepi32_ps(A);
+}
+
+__m128i test_mm256_cvtpd_epi32(__m256d A) {
+  // CHECK-LABEL: test_mm256_cvtpd_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %{{.*}})
+  return _mm256_cvtpd_epi32(A);
+}
+
+__m128 test_mm256_cvtpd_ps(__m256d A) { // Codegen test: _mm256_cvtpd_ps is expected to emit a call to llvm.x86.avx.cvt.pd2.ps.256.
+  // CHECK-LABEL: test_mm256_cvtpd_ps
+  // CHECK: call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %{{.*}})
+  return _mm256_cvtpd_ps(A);
+}
+
+__m256i test_mm256_cvtps_epi32(__m256 A) { // Codegen test: _mm256_cvtps_epi32 is expected to emit a call to llvm.x86.avx.cvt.ps2dq.256.
+  // CHECK-LABEL: test_mm256_cvtps_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %{{.*}})
+  return _mm256_cvtps_epi32(A);
+}
+
+__m256d test_mm256_cvtps_pd(__m128 A) { // Codegen test: _mm256_cvtps_pd is expected to emit a plain fpext from <4 x float> to <4 x double>.
+  // CHECK-LABEL: test_mm256_cvtps_pd
+  // CHECK: fpext <4 x float> %{{.*}} to <4 x double>
+  return _mm256_cvtps_pd(A);
+}
+
+__m128i test_mm256_cvttpd_epi32(__m256d A) { // Codegen test: _mm256_cvttpd_epi32 is expected to emit a call to llvm.x86.avx.cvtt.pd2dq.256.
+  // CHECK-LABEL: test_mm256_cvttpd_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %{{.*}})
+  return _mm256_cvttpd_epi32(A);
+}
+
+__m256i test_mm256_cvttps_epi32(__m256 A) { // Codegen test: _mm256_cvttps_epi32 is expected to emit a call to llvm.x86.avx.cvtt.ps2dq.256.
+  // CHECK-LABEL: test_mm256_cvttps_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %{{.*}})
+  return _mm256_cvttps_epi32(A);
+}
+
+__m256d test_mm256_div_pd(__m256d A, __m256d B) { // Codegen test: _mm256_div_pd is expected to emit a generic fdiv on <4 x double>.
+  // CHECK-LABEL: test_mm256_div_pd
+  // CHECK: fdiv <4 x double>
+  return _mm256_div_pd(A, B);
+}
+
+__m256 test_mm256_div_ps(__m256 A, __m256 B) { // Codegen test: _mm256_div_ps is expected to emit a generic fdiv on <8 x float>.
+  // CHECK-LABEL: test_mm256_div_ps
+  // CHECK: fdiv <8 x float>
+  return _mm256_div_ps(A, B);
+}
+
+__m256 test_mm256_dp_ps(__m256 A, __m256 B) { // Codegen test: _mm256_dp_ps is expected to emit a call to llvm.x86.avx.dp.ps.256.
+  // CHECK-LABEL: test_mm256_dp_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 7)
+  return _mm256_dp_ps(A, B, 7); // the mask 7 is expected to reach the intrinsic as immediate i8 7
+}
+
+int test_mm256_extract_epi8(__m256i A) { // Codegen test: _mm256_extract_epi8 is expected to mask the index with 31, extract from <32 x i8>, then zext to i32.
+  // CHECK-LABEL: test_mm256_extract_epi8
+  // CHECK: and i32 %{{.*}}, 31
+  // CHECK: extractelement <32 x i8> %{{.*}}, i32 %{{.*}}
+  // CHECK: zext i8 %{{.*}} to i32
+  return _mm256_extract_epi8(A, 32); // NOTE(review): 32 is outside 0..31; presumably deliberate to exercise the index masking - confirm
+}
+
+int test_mm256_extract_epi16(__m256i A) { // Codegen test: _mm256_extract_epi16 is expected to mask the index with 15, extract from <16 x i16>, then zext to i32.
+  // CHECK-LABEL: test_mm256_extract_epi16
+  // CHECK: and i32 %{{.*}}, 15
+  // CHECK: extractelement <16 x i16> %{{.*}}, i32 %{{.*}}
+  // CHECK: zext i16 %{{.*}} to i32
+  return _mm256_extract_epi16(A, 16); // NOTE(review): 16 is outside 0..15; presumably deliberate to exercise the index masking - confirm
+}
+
+int test_mm256_extract_epi32(__m256i A) { // Codegen test: _mm256_extract_epi32 is expected to mask the index with 7 and extract from <8 x i32>.
+  // CHECK-LABEL: test_mm256_extract_epi32
+  // CHECK: and i32 %{{.*}}, 7
+  // CHECK: extractelement <8 x i32> %{{.*}}, i32 %{{.*}}
+  return _mm256_extract_epi32(A, 8); // NOTE(review): 8 is outside 0..7; presumably deliberate to exercise the index masking - confirm
+}
+
+long long test_mm256_extract_epi64(__m256i A) { // Codegen test: _mm256_extract_epi64 is expected to mask the index with 3 and extract from <4 x i64>.
+  // CHECK-LABEL: test_mm256_extract_epi64
+  // CHECK: and i32 %{{.*}}, 3
+  // CHECK: extractelement <4 x i64> %{{.*}}, i32 %{{.*}}
+  return _mm256_extract_epi64(A, 5); // NOTE(review): 5 is outside 0..3; presumably deliberate to exercise the index masking - confirm
+}
+
+__m128d test_mm256_extractf128_pd(__m256d A) { // Codegen test: _mm256_extractf128_pd is expected to emit a shufflevector rather than a target intrinsic call.
+  // CHECK-LABEL: test_mm256_extractf128_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  return _mm256_extractf128_pd(A, 1); // with selector 1 the expected shuffle mask picks elements 2 and 3
+}
+
+__m128 test_mm256_extractf128_ps(__m256 A) { // Codegen test: _mm256_extractf128_ps is expected to emit a shufflevector rather than a target intrinsic call.
+  // CHECK-LABEL: test_mm256_extractf128_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  return _mm256_extractf128_ps(A, 1); // with selector 1 the expected shuffle mask picks elements 4 through 7
+}
+
+__m128i test_mm256_extractf128_si256(__m256i A) { // Codegen test: _mm256_extractf128_si256 is expected to emit a shufflevector on <4 x i64>.
+  // CHECK-LABEL: test_mm256_extractf128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  return _mm256_extractf128_si256(A, 1); // with selector 1 the expected shuffle mask picks elements 2 and 3
+}
+
+__m256d test_mm256_floor_pd(__m256d x) { // Codegen test: _mm256_floor_pd is expected to emit llvm.x86.avx.round.pd.256 with rounding immediate i32 1.
+  // CHECK-LABEL: test_mm256_floor_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
+  return _mm256_floor_pd(x);
+}
+
+__m256 test_mm_floor_ps(__m256 x) { // Codegen test: _mm256_floor_ps is expected to emit llvm.x86.avx.round.ps.256 with rounding immediate i32 1.
+  // CHECK-LABEL: test_mm_floor_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
+  return _mm256_floor_ps(x); // NOTE(review): the function name says mm but the 256-bit intrinsic is exercised; looks like test_mm256_floor_ps was intended - confirm (the label directive would need to change with it)
+}
+
+__m256d test_mm256_hadd_pd(__m256d A, __m256d B) { // Codegen test: _mm256_hadd_pd is expected to emit a call to llvm.x86.avx.hadd.pd.256.
+  // CHECK-LABEL: test_mm256_hadd_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_hadd_pd(A, B);
+}
+
+__m256 test_mm256_hadd_ps(__m256 A, __m256 B) { // Codegen test: _mm256_hadd_ps is expected to emit a call to llvm.x86.avx.hadd.ps.256.
+  // CHECK-LABEL: test_mm256_hadd_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_hadd_ps(A, B);
+}
+
+__m256d test_mm256_hsub_pd(__m256d A, __m256d B) { // Codegen test: _mm256_hsub_pd is expected to emit a call to llvm.x86.avx.hsub.pd.256.
+  // CHECK-LABEL: test_mm256_hsub_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_hsub_pd(A, B);
+}
+
+__m256 test_mm256_hsub_ps(__m256 A, __m256 B) { // Codegen test: _mm256_hsub_ps is expected to emit a call to llvm.x86.avx.hsub.ps.256.
+  // CHECK-LABEL: test_mm256_hsub_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_hsub_ps(A, B);
+}
+
+__m256i test_mm256_insert_epi8(__m256i x, char b) { // Codegen test: _mm256_insert_epi8 is expected to mask the index with 31 and emit an insertelement on <32 x i8>.
+  // CHECK-LABEL: test_mm256_insert_epi8
+  // CHECK: and i32 %{{.*}}, 31
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi8(x, b, 17); // inserts b at index 17
+}
+
+__m256i test_mm256_insert_epi16(__m256i x, int b) { // Codegen test: _mm256_insert_epi16 is expected to mask the index with 15 and emit an insertelement on <16 x i16>.
+  // CHECK-LABEL: test_mm256_insert_epi16
+  // CHECK: and i32 %{{.*}}, 15
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi16(x, b, 4); // b is declared int here while the directive expects an i16 operand
+}
+
+__m256i test_mm256_insert_epi32(__m256i x, int b) { // Codegen test: _mm256_insert_epi32 is expected to mask the index with 7 and emit an insertelement on <8 x i32>.
+  // CHECK-LABEL: test_mm256_insert_epi32
+  // CHECK: and i32 %{{.*}}, 7
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi32(x, b, 5); // inserts b at index 5
+}
+
+__m256i test_mm256_insert_epi64(__m256i x, long long b) { // Codegen test: _mm256_insert_epi64 is expected to mask the index with 3 and emit an insertelement on <4 x i64>.
+  // CHECK-LABEL: test_mm256_insert_epi64
+  // CHECK: and i32 %{{.*}}, 3
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi64(x, b, 2); // inserts b at index 2
+}
+
+__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) { // Codegen test: _mm256_insertf128_pd is expected to emit two shufflevectors (widen the 128-bit operand, then blend).
+  // CHECK-LABEL: test_mm256_insertf128_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_insertf128_pd(A, B, 0); // selector 0: the expected blend mask is <4, 5, 2, 3>
+}
+
+__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) { // Codegen test: _mm256_insertf128_ps is expected to emit two shufflevectors (widen the 128-bit operand, then blend).
+  // CHECK-LABEL: test_mm256_insertf128_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_insertf128_ps(A, B, 1); // selector 1: the expected blend mask is <0..3, 8..11>
+}
+
+__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) { // Codegen test: _mm256_insertf128_si256 is expected to emit two shufflevectors on i64 vectors (widen, then blend).
+  // CHECK-LABEL: test_mm256_insertf128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_insertf128_si256(A, B, 0); // selector 0: the expected blend mask is <4, 5, 2, 3>
+}
+
+__m256i test_mm256_lddqu_si256(__m256i* A) { // Codegen test: _mm256_lddqu_si256 is expected to emit a call to llvm.x86.avx.ldu.dq.256 taking an i8* pointer.
+  // CHECK-LABEL: test_mm256_lddqu_si256
+  // CHECK: call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %{{.*}})
+  return _mm256_lddqu_si256(A);
+}
+
+__m256d test_mm256_load_pd(double* A) { // Codegen test: _mm256_load_pd is expected to emit a <4 x double> load with align 32.
+  // CHECK-LABEL: test_mm256_load_pd
+  // CHECK: load <4 x double>, <4 x double>* %{{.*}}, align 32
+  return _mm256_load_pd(A);
+}
+
+__m256 test_mm256_load_ps(float* A) { // Codegen test: _mm256_load_ps is expected to emit an <8 x float> load with align 32.
+  // CHECK-LABEL: test_mm256_load_ps
+  // CHECK: load <8 x float>, <8 x float>* %{{.*}}, align 32
+  return _mm256_load_ps(A);
+}
+
+__m256i test_mm256_load_si256(__m256i* A) { // Codegen test: _mm256_load_si256 is expected to emit a <4 x i64> load with align 32.
+  // CHECK-LABEL: test_mm256_load_si256
+  // CHECK: load <4 x i64>, <4 x i64>* %{{.*}}, align 32
+  return _mm256_load_si256(A);
+}
+
+__m256d test_mm256_loadu_pd(double* A) { // Codegen test: _mm256_loadu_pd is expected to emit a <4 x double> load with align 1 (end-of-line anchored so longer alignments do not match).
+  // CHECK-LABEL: test_mm256_loadu_pd
+  // CHECK: load <4 x double>, <4 x double>* %{{.*}}, align 1{{$}}
+  return _mm256_loadu_pd(A);
+}
+
+__m256 test_mm256_loadu_ps(float* A) { // Codegen test: _mm256_loadu_ps is expected to emit an <8 x float> load with align 1 (end-of-line anchored).
+  // CHECK-LABEL: test_mm256_loadu_ps
+  // CHECK: load <8 x float>, <8 x float>* %{{.*}}, align 1{{$}}
+  return _mm256_loadu_ps(A);
+}
+
+__m256i test_mm256_loadu_si256(__m256i* A) { // Codegen test: _mm256_loadu_si256 is expected to emit a <4 x i64> load with align 1 (end-of-line anchored).
+  // CHECK-LABEL: test_mm256_loadu_si256
+  // CHECK: load <4 x i64>, <4 x i64>* %{{.+}}, align 1{{$}}
+  return _mm256_loadu_si256(A);
+}
+
+__m256 test_mm256_loadu2_m128(float* A, float* B) { // Codegen test: _mm256_loadu2_m128 is expected to emit two align-1 <4 x float> loads, each widened to 8 lanes, then one combining shuffle.
+  // CHECK-LABEL: test_mm256_loadu2_m128
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_loadu2_m128(A, B);
+}
+
+__m256d test_mm256_loadu2_m128d(double* A, double* B) { // Codegen test: _mm256_loadu2_m128d is expected to emit two align-1 <2 x double> loads, each widened to 4 lanes, then one combining shuffle.
+  // CHECK-LABEL: test_mm256_loadu2_m128d
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm256_loadu2_m128d(A, B);
+}
+
+__m256i test_mm256_loadu2_m128i(__m128i* A, __m128i* B) { // Codegen test: _mm256_loadu2_m128i is expected to emit two align-1 <2 x i64> loads, each widened to 4 lanes, then one combining shuffle.
+  // CHECK-LABEL: test_mm256_loadu2_m128i
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm256_loadu2_m128i(A, B);
+}
+
+__m128d test_mm_maskload_pd(double* A, __m128i B) { // Codegen test: _mm_maskload_pd is expected to emit a call to llvm.x86.avx.maskload.pd with an i8* pointer and <2 x i64> mask.
+  // CHECK-LABEL: test_mm_maskload_pd
+  // CHECK: call <2 x double> @llvm.x86.avx.maskload.pd(i8* %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskload_pd(A, B);
+}
+
+__m256d test_mm256_maskload_pd(double* A, __m256i B) { // Codegen test: _mm256_maskload_pd is expected to emit a call to llvm.x86.avx.maskload.pd.256 with an i8* pointer and <4 x i64> mask.
+  // CHECK-LABEL: test_mm256_maskload_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskload_pd(A, B);
+}
+
+__m128 test_mm_maskload_ps(float* A, __m128i B) { // Codegen test: _mm_maskload_ps is expected to emit a call to llvm.x86.avx.maskload.ps with an i8* pointer and <4 x i32> mask.
+  // CHECK-LABEL: test_mm_maskload_ps
+  // CHECK: call <4 x float> @llvm.x86.avx.maskload.ps(i8* %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskload_ps(A, B);
+}
+
+__m256 test_mm256_maskload_ps(float* A, __m256i B) { // Codegen test: _mm256_maskload_ps is expected to emit a call to llvm.x86.avx.maskload.ps.256; returns __m256 to match the <8 x float> result.
+  // CHECK-LABEL: test_mm256_maskload_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskload_ps(A, B);
+}
+
+void test_mm_maskstore_pd(double* A, __m128i B, __m128d C) { // Codegen test: _mm_maskstore_pd is expected to emit a void call to llvm.x86.avx.maskstore.pd (pointer, mask, value).
+  // CHECK-LABEL: test_mm_maskstore_pd
+  // CHECK: call void @llvm.x86.avx.maskstore.pd(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}})
+  _mm_maskstore_pd(A, B, C);
+}
+
+void test_mm256_maskstore_pd(double* A, __m256i B, __m256d C) { // Codegen test: _mm256_maskstore_pd is expected to emit a void call to llvm.x86.avx.maskstore.pd.256 (pointer, mask, value).
+  // CHECK-LABEL: test_mm256_maskstore_pd
+  // CHECK: call void @llvm.x86.avx.maskstore.pd.256(i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}})
+  _mm256_maskstore_pd(A, B, C);
+}
+
+void test_mm_maskstore_ps(float* A, __m128i B, __m128 C) { // Codegen test: _mm_maskstore_ps is expected to emit a void call to llvm.x86.avx.maskstore.ps (pointer, mask, value).
+  // CHECK-LABEL: test_mm_maskstore_ps
+  // CHECK: call void @llvm.x86.avx.maskstore.ps(i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}})
+  _mm_maskstore_ps(A, B, C);
+}
+
+void test_mm256_maskstore_ps(float* A, __m256i B, __m256 C) { // Codegen test: _mm256_maskstore_ps is expected to emit a void call to llvm.x86.avx.maskstore.ps.256 (pointer, mask, value).
+  // CHECK-LABEL: test_mm256_maskstore_ps
+  // CHECK: call void @llvm.x86.avx.maskstore.ps.256(i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}})
+  _mm256_maskstore_ps(A, B, C);
+}
+
+__m256d test_mm256_max_pd(__m256d A, __m256d B) { // Codegen test: _mm256_max_pd is expected to emit a call to llvm.x86.avx.max.pd.256.
+  // CHECK-LABEL: test_mm256_max_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_max_pd(A, B);
+}
+
+__m256 test_mm256_max_ps(__m256 A, __m256 B) { // Codegen test: _mm256_max_ps is expected to emit a call to llvm.x86.avx.max.ps.256.
+  // CHECK-LABEL: test_mm256_max_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_max_ps(A, B);
+}
+
+__m256d test_mm256_min_pd(__m256d A, __m256d B) { // Codegen test: _mm256_min_pd is expected to emit a call to llvm.x86.avx.min.pd.256.
+  // CHECK-LABEL: test_mm256_min_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_min_pd(A, B);
+}
+
+__m256 test_mm256_min_ps(__m256 A, __m256 B) { // Codegen test: _mm256_min_ps is expected to emit a call to llvm.x86.avx.min.ps.256.
+  // CHECK-LABEL: test_mm256_min_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_min_ps(A, B);
+}
+
+__m256d test_mm256_movedup_pd(__m256d A) { // Codegen test: _mm256_movedup_pd is expected to emit a shufflevector with mask <0, 0, 2, 2>.
+  // CHECK-LABEL: test_mm256_movedup_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  return _mm256_movedup_pd(A);
+}
+
+__m256 test_mm256_movehdup_ps(__m256 A) { // Codegen test: _mm256_movehdup_ps is expected to emit a shufflevector duplicating the odd-indexed lanes.
+  // CHECK-LABEL: test_mm256_movehdup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  return _mm256_movehdup_ps(A);
+}
+
+__m256 test_mm256_moveldup_ps(__m256 A) { // Codegen test: _mm256_moveldup_ps is expected to emit a shufflevector duplicating the even-indexed lanes.
+  // CHECK-LABEL: test_mm256_moveldup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  return _mm256_moveldup_ps(A);
+}
+
+int test_mm256_movemask_pd(__m256d A) { // Codegen test: _mm256_movemask_pd is expected to emit a call to llvm.x86.avx.movmsk.pd.256 returning i32.
+  // CHECK-LABEL: test_mm256_movemask_pd
+  // CHECK: call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %{{.*}})
+  return _mm256_movemask_pd(A);
+}
+
+int test_mm256_movemask_ps(__m256 A) { // Codegen test: _mm256_movemask_ps is expected to emit a call to llvm.x86.avx.movmsk.ps.256 returning i32.
+  // CHECK-LABEL: test_mm256_movemask_ps
+  // CHECK: call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %{{.*}})
+  return _mm256_movemask_ps(A);
+}
+
+__m256d test_mm256_mul_pd(__m256d A, __m256d B) { // Codegen test: _mm256_mul_pd is expected to emit a generic fmul on <4 x double>.
+  // CHECK-LABEL: test_mm256_mul_pd
+  // CHECK: fmul <4 x double>
+  return _mm256_mul_pd(A, B);
+}
+
+__m256 test_mm256_mul_ps(__m256 A, __m256 B) { // Codegen test: _mm256_mul_ps is expected to emit a generic fmul on <8 x float>.
+  // CHECK-LABEL: test_mm256_mul_ps
+  // CHECK: fmul <8 x float>
+  return _mm256_mul_ps(A, B);
+}
+
+__m256d test_mm256_or_pd(__m256d A, __m256d B) { // Codegen test: _mm256_or_pd is expected to emit an integer `or` on <4 x i64>.
+  // CHECK-LABEL: test_mm256_or_pd
+  // CHECK: or <4 x i64>
+  return _mm256_or_pd(A, B);
+}
+
+__m256 test_mm256_or_ps(__m256 A, __m256 B) { // Codegen test: _mm256_or_ps is expected to emit an integer `or` on <8 x i32>.
+  // CHECK-LABEL: test_mm256_or_ps
+  // CHECK: or <8 x i32>
+  return _mm256_or_ps(A, B);
+}
+
+__m128d test_mm_permute_pd(__m128d A) { // Codegen test: _mm_permute_pd with a constant control is expected to emit a shufflevector.
+  // CHECK-LABEL: test_mm_permute_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  return _mm_permute_pd(A, 1); // control 1 is expected to yield shuffle mask <1, 0>
+}
+
+__m256d test_mm256_permute_pd(__m256d A) { // Codegen test: _mm256_permute_pd with a constant control is expected to emit a shufflevector.
+  // CHECK-LABEL: test_mm256_permute_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  return _mm256_permute_pd(A, 5); // control 5 is expected to yield shuffle mask <1, 0, 3, 2>
+}
+
+__m128 test_mm_permute_ps(__m128 A) { // Codegen test: _mm_permute_ps with a constant control is expected to emit a shufflevector.
+  // CHECK-LABEL: test_mm_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  return _mm_permute_ps(A, 0x1b); // control 0x1b is expected to yield shuffle mask <3, 2, 1, 0>
+}
+
+// Test case for PR12401: a second _mm_permute_ps check using control byte 0xe6.
+__m128 test2_mm_permute_ps(__m128 a) {
+  // CHECK-LABEL: test2_mm_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+  return _mm_permute_ps(a, 0xe6); // control 0xe6 is expected to yield shuffle mask <2, 1, 2, 3>
+}
+
+__m256 test_mm256_permute_ps(__m256 A) { // Codegen test: _mm256_permute_ps with a constant control is expected to emit a shufflevector.
+  // CHECK-LABEL: test_mm256_permute_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  return _mm256_permute_ps(A, 0x1b); // control 0x1b is expected to yield <3, 2, 1, 0, 7, 6, 5, 4>
+}
+
+__m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) { // Codegen test: _mm256_permute2f128_pd is expected to emit a call to llvm.x86.avx.vperm2f128.pd.256.
+  // CHECK-LABEL: test_mm256_permute2f128_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49)
+  return _mm256_permute2f128_pd(A, B, 0x31); // 0x31 is expected to appear as immediate i8 49
+}
+
+__m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) { // Codegen test: _mm256_permute2f128_ps is expected to emit a call to llvm.x86.avx.vperm2f128.ps.256.
+  // CHECK-LABEL: test_mm256_permute2f128_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19)
+  return _mm256_permute2f128_ps(A, B, 0x13); // 0x13 is expected to appear as immediate i8 19
+}
+
+__m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) { // Codegen test: _mm256_permute2f128_si256 is expected to emit a call to llvm.x86.avx.vperm2f128.si.256 on <8 x i32>.
+  // CHECK-LABEL: test_mm256_permute2f128_si256
+  // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32)
+  return _mm256_permute2f128_si256(A, B, 0x20); // 0x20 is expected to appear as immediate i8 32
+}
+
+__m128d test_mm_permutevar_pd(__m128d A, __m128i B) { // Codegen test: _mm_permutevar_pd is expected to emit a call to llvm.x86.avx.vpermilvar.pd with a <2 x i64> control.
+  // CHECK-LABEL: test_mm_permutevar_pd
+  // CHECK: call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_permutevar_pd(A, B);
+}
+
+__m256d test_mm256_permutevar_pd(__m256d A, __m256i B) { // Codegen test: _mm256_permutevar_pd is expected to emit a call to llvm.x86.avx.vpermilvar.pd.256 with a <4 x i64> control.
+  // CHECK-LABEL: test_mm256_permutevar_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_permutevar_pd(A, B);
+}
+
+__m128 test_mm_permutevar_ps(__m128 A, __m128i B) { // Codegen test: _mm_permutevar_ps is expected to emit a call to llvm.x86.avx.vpermilvar.ps with a <4 x i32> control.
+  // CHECK-LABEL: test_mm_permutevar_ps
+  // CHECK: call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_permutevar_ps(A, B);
+}
+
+__m256 test_mm256_permutevar_ps(__m256 A, __m256i B) { // Codegen test: _mm256_permutevar_ps is expected to emit a call to llvm.x86.avx.vpermilvar.ps.256 with an <8 x i32> control.
+  // CHECK-LABEL: test_mm256_permutevar_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_permutevar_ps(A, B);
+}
+
+__m256 test_mm256_rcp_ps(__m256 A) { // Codegen test: _mm256_rcp_ps is expected to emit a call to llvm.x86.avx.rcp.ps.256.
+  // CHECK-LABEL: test_mm256_rcp_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %{{.*}})
+  return _mm256_rcp_ps(A);
+}
+
+__m256d test_mm256_round_pd(__m256d x) { // Codegen test: _mm256_round_pd is expected to emit a call to llvm.x86.avx.round.pd.256.
+  // CHECK-LABEL: test_mm256_round_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4)
+  return _mm256_round_pd(x, 4); // rounding control 4 is expected to appear as immediate i32 4
+}
+
+__m256 test_mm256_round_ps(__m256 x) { // Codegen test: _mm256_round_ps is expected to emit a call to llvm.x86.avx.round.ps.256.
+  // CHECK-LABEL: test_mm256_round_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4)
+  return _mm256_round_ps(x, 4); // rounding control 4 is expected to appear as immediate i32 4
+}
+
+__m256 test_mm256_rsqrt_ps(__m256 A) { // Codegen test: _mm256_rsqrt_ps is expected to emit a call to llvm.x86.avx.rsqrt.ps.256.
+  // CHECK-LABEL: test_mm256_rsqrt_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %{{.*}})
+  return _mm256_rsqrt_ps(A);
+}
+
+__m256i test_mm256_set_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
+                            char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
+                            char A16, char A17, char A18, char A19, char A20, char A21, char A22, char A23,
+                            char A24, char A25, char A26, char A27, char A28, char A29, char A30, char A31) { // Codegen test: _mm256_set_epi8 is expected to emit 32 insertelement ops on <32 x i8>, at indices 0 through 31 in order.
+  // CHECK-LABEL: test_mm256_set_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
+  return _mm256_set_epi8(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31);
+}
+
+__m256i test_mm256_set_epi16(short A0, short A1, short A2, short A3, short A4, short A5, short A6, short A7,
+                             short A8, short A9, short A10, short A11, short A12, short A13, short A14, short A15) { // Codegen test: _mm256_set_epi16 is expected to emit 16 insertelement ops on <16 x i16>, at indices 0 through 15 in order.
+  // CHECK-LABEL: test_mm256_set_epi16
+  // CHECK: insertelement <16 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
+  return _mm256_set_epi16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15);
+}
+
+__m256i test_mm256_set_epi32(int A0, int A1, int A2, int A3, int A4, int A5, int A6, int A7) { // Codegen test: _mm256_set_epi32 is expected to emit 8 insertelement ops on <8 x i32>, at indices 0 through 7 in order.
+  // CHECK-LABEL: test_mm256_set_epi32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_set_epi32(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256i test_mm256_set_epi64x(long long A0, long long A1, long long A2, long long A3) { // Codegen test: _mm256_set_epi64x is expected to emit 4 insertelement ops on <4 x i64>, at indices 0 through 3 in order.
+  // CHECK-LABEL: test_mm256_set_epi64x
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_set_epi64x(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_set_m128(__m128 A, __m128 B) { // Codegen test: _mm256_set_m128 is expected to emit one shufflevector concatenating two <4 x float> values into 8 lanes.
+  // CHECK-LABEL: test_mm256_set_m128
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_set_m128(A, B);
+}
+
+__m256d test_mm256_set_m128d(__m128d A, __m128d B) { // Codegen test: _mm256_set_m128d is expected to emit one concatenating shufflevector.
+  // CHECK-LABEL: test_mm256_set_m128d
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_set_m128d(A, B); // NOTE(review): the directive matches a <4 x float> shuffle although the operands are __m128d; presumably the header routes through the float variant - confirm
+}
+
+__m256i test_mm256_set_m128i(__m128i A, __m128i B) { // Codegen test: _mm256_set_m128i is expected to emit one concatenating shufflevector.
+  // CHECK-LABEL: test_mm256_set_m128i
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_set_m128i(A, B); // NOTE(review): the directive matches a <4 x float> shuffle although the operands are __m128i; presumably the header routes through the float variant - confirm
+}
+
+__m256d test_mm256_set_pd(double A0, double A1, double A2, double A3) { // Codegen test: _mm256_set_pd is expected to emit 4 insertelement ops on <4 x double>, at indices 0 through 3 in order.
+  // CHECK-LABEL: test_mm256_set_pd
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_set_pd(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_set_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) { // Codegen test: _mm256_set_ps is expected to emit 8 insertelement ops on <8 x float>, at indices 0 through 7 in order.
+  // CHECK-LABEL: test_mm256_set_ps
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_set_ps(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256i test_mm256_set1_epi8(char A) { // Codegen test: broadcasting one char via _mm256_set1_epi8 is expected to emit 32 insertelement ops on <32 x i8>, at indices 0 through 31.
+  // CHECK-LABEL: test_mm256_set1_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
+  return _mm256_set1_epi8(A);
+}
+
+__m256i test_mm256_set1_epi16(short A) { // Codegen test: broadcasting one short via _mm256_set1_epi16 is expected to emit 16 insertelement ops on <16 x i16>, at indices 0 through 15.
+  // CHECK-LABEL: test_mm256_set1_epi16
+  // CHECK: insertelement <16 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
+  return _mm256_set1_epi16(A);
+}
+
+__m256i test_mm256_set1_epi32(int A) { // Codegen test: broadcasting one int via _mm256_set1_epi32 is expected to emit 8 insertelement ops on <8 x i32>, at indices 0 through 7.
+  // CHECK-LABEL: test_mm256_set1_epi32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_set1_epi32(A);
+}
+
+__m256i test_mm256_set1_epi64x(long long A) { // Codegen test: broadcasting one long long via _mm256_set1_epi64x is expected to emit 4 insertelement ops on <4 x i64>, at indices 0 through 3.
+  // CHECK-LABEL: test_mm256_set1_epi64x
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_set1_epi64x(A);
+}
+
+__m256d test_mm256_set1_pd(double A) { // Codegen test: broadcasting one double via _mm256_set1_pd is expected to emit 4 insertelement ops on <4 x double>, at indices 0 through 3.
+  // CHECK-LABEL: test_mm256_set1_pd
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_set1_pd(A);
+}
+
+__m256 test_mm256_set1_ps(float A) { // Codegen test: broadcasting one float via _mm256_set1_ps is expected to emit 8 insertelement ops on <8 x float>, at indices 0 through 7.
+  // CHECK-LABEL: test_mm256_set1_ps
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_set1_ps(A);
+}
+
+__m256i test_mm256_setr_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
+                             char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
+                             char A16, char A17, char A18, char A19, char A20, char A21, char A22, char A23,
+                             char A24, char A25, char A26, char A27, char A28, char A29, char A30, char A31) { // Codegen test: _mm256_setr_epi8 is expected to emit 32 insertelement ops on <32 x i8>, at indices 0 through 31 in order.
+  // CHECK-LABEL: test_mm256_setr_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
+  return _mm256_setr_epi8(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31);
+}
+
+__m256i test_mm256_setr_epi16(short A0, short A1, short A2, short A3, short A4, short A5, short A6, short A7,
+                              short A8, short A9, short A10, short A11, short A12, short A13, short A14, short A15) {
+  // CHECK-LABEL: test_mm256_setr_epi16
+  // CHECK: insertelement <16 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
+  return _mm256_setr_epi16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15);
+}
+
+__m256i test_mm256_setr_epi32(int A0, int A1, int A2, int A3, int A4, int A5, int A6, int A7) {
+  // CHECK-LABEL: test_mm256_setr_epi32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_setr_epi32(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256i test_mm256_setr_epi64x(long long A0, long long A1, long long A2, long long A3) {
+  // CHECK-LABEL: test_mm256_setr_epi64x
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_setr_epi64x(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_setr_m128(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_setr_m128
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_setr_m128(A, B);
+}
+
+__m256d test_mm256_setr_m128d(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm256_setr_m128d
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_setr_m128d(A, B);
+}
+
+__m256i test_mm256_setr_m128i(__m128i A, __m128i B) {
+  // CHECK-LABEL: test_mm256_setr_m128i
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_setr_m128i(A, B);
+}
+
+__m256d test_mm256_setr_pd(double A0, double A1, double A2, double A3) {
+  // CHECK-LABEL: test_mm256_setr_pd
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_setr_pd(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_setr_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
+  // CHECK-LABEL: test_mm256_setr_ps
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_setr_ps(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256d test_mm256_setzero_pd() {
+  // CHECK-LABEL: test_mm256_setzero_pd
+  // CHECK: store <4 x double> zeroinitializer
+  return _mm256_setzero_pd();
+}
+
+__m256 test_mm256_setzero_ps() {
+  // CHECK-LABEL: test_mm256_setzero_ps
+  // CHECK: store <8 x float> zeroinitializer
+  return _mm256_setzero_ps();
+}
+
+__m256i test_mm256_setzero_si256() {
+  // CHECK-LABEL: test_mm256_setzero_si256
+  // CHECK: store <4 x i64> zeroinitializer
+  return _mm256_setzero_si256();
+}
+
+__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_shuffle_pd(A, B, 0);
+}
+
+__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+  return _mm256_shuffle_ps(A, B, 0);
+}
+
+__m256d test_mm256_sqrt_pd(__m256d A) {
+  // CHECK-LABEL: test_mm256_sqrt_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %{{.*}})
+  return _mm256_sqrt_pd(A);
+}
+
+__m256 test_mm256_sqrt_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_sqrt_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %{{.*}})
+  return _mm256_sqrt_ps(A);
+}
+
+void test_mm256_store_pd(double* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_store_pd
+  // CHECK: store <4 x double> %{{.*}}, <4 x double>* %{{.*}}, align 32
+  _mm256_store_pd(A, B);
+}
+
+void test_mm256_store_ps(float* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_store_ps
+  // CHECK: store <8 x float> %{{.*}}, <8 x float>* %{{.*}}, align 32
+  _mm256_store_ps(A, B);
+}
+
+void test_mm256_store_si256(__m256i* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_store_si256
+  // CHECK: store <4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, align 32
+  _mm256_store_si256(A, B);
+}
+
+void test_mm256_storeu_pd(double* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_storeu_pd
+  // CHECK:   store <4 x double> %{{.*}}, <4 x double>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm256_storeu_pd(A, B);
+}
+
+void test_mm256_storeu_ps(float* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_storeu_ps
+  // CHECK: store <8 x float> %{{.*}}, <8 x float>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm256_storeu_ps(A, B);
+}
+
+void test_mm256_storeu_si256(__m256i* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_storeu_si256
+  // CHECK: store <4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm256_storeu_si256(A, B);
+}
+
+void test_mm256_storeu2_m128(float* A, float* B, __m256 C) {
+  // CHECK-LABEL: test_mm256_storeu2_m128
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+  _mm256_storeu2_m128(A, B, C);
+}
+
+void test_mm256_storeu2_m128d(double* A, double* B, __m256d C) {
+  // CHECK-LABEL: test_mm256_storeu2_m128d
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
+  _mm256_storeu2_m128d(A, B, C);
+}
+
+void test_mm256_storeu2_m128i(__m128i* A, __m128i* B, __m256i C) {
+  // CHECK-LABEL: test_mm256_storeu2_m128i
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
+  _mm256_storeu2_m128i(A, B, C);
+}
+
+void test_mm256_stream_pd(double* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_stream_pd
+  // CHECK: store <4 x double> %{{.*}}, <4 x double>* %{{.*}}, align 32, !nontemporal
+  _mm256_stream_pd(A, B);
+}
+
+void test_mm256_stream_ps(float* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_stream_ps
+  // CHECK: store <8 x float> %{{.*}}, <8 x float>* %{{.*}}, align 32, !nontemporal
+  _mm256_stream_ps(A, B);
+}
+
+void test_mm256_stream_si256(__m256i* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_stream_si256
+  // CHECK: store <4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, align 32, !nontemporal
+  _mm256_stream_si256(A, B);
+}
+
+__m256d test_mm256_sub_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_sub_pd
+  // CHECK: fsub <4 x double>
+  return _mm256_sub_pd(A, B);
+}
+
+__m256 test_mm256_sub_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_sub_ps
+  // CHECK: fsub <8 x float>
+  return _mm256_sub_ps(A, B);
+}
+
+int test_mm_testc_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_testc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_testc_pd(A, B);
+}
+
+int test_mm256_testc_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_testc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_testc_pd(A, B);
+}
+
+int test_mm_testc_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_testc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_testc_ps(A, B);
+}
+
+int test_mm256_testc_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_testc_ps(A, B);
+}
+
+int test_mm256_testc_si256(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testc_si256
+  // CHECK: call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_testc_si256(A, B);
+}
+
+int test_mm_testnzc_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_testnzc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_testnzc_pd(A, B);
+}
+
+int test_mm256_testnzc_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_testnzc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_testnzc_pd(A, B);
+}
+
+int test_mm_testnzc_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_testnzc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_testnzc_ps(A, B);
+}
+
+int test_mm256_testnzc_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testnzc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_testnzc_ps(A, B);
+}
+
+int test_mm256_testnzc_si256(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testnzc_si256
+  // CHECK: call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_testnzc_si256(A, B);
+}
+
+int test_mm_testz_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_testz_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_testz_pd(A, B);
+}
+
+int test_mm256_testz_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_testz_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_testz_pd(A, B);
+}
+
+int test_mm_testz_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_testz_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_testz_ps(A, B);
+}
+
+int test_mm256_testz_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testz_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_testz_ps(A, B);
+}
+
+int test_mm256_testz_si256(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testz_si256
+  // CHECK: call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_testz_si256(A, B);
 }
 
 __m256 test_mm256_undefined_ps() {
@@ -168,3 +1339,72 @@
   // CHECK: ret <4 x i64> undef
   return _mm256_undefined_si256();
 }
+
+__m256d test_mm256_unpackhi_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_unpackhi_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  return _mm256_unpackhi_pd(A, B);
+}
+
+__m256 test_mm256_unpackhi_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_unpackhi_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  return _mm256_unpackhi_ps(A, B);
+}
+
+__m256d test_mm256_unpacklo_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_unpacklo_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_unpacklo_pd(A, B);
+}
+
+__m256 test_mm256_unpacklo_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_unpacklo_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  return _mm256_unpacklo_ps(A, B);
+}
+
+__m256d test_mm256_xor_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_xor_pd
+  // CHECK: xor <4 x i64>
+  return _mm256_xor_pd(A, B);
+}
+
+__m256 test_mm256_xor_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_xor_ps
+  // CHECK: xor <8 x i32>
+  return _mm256_xor_ps(A, B);
+}
+
+void test_mm256_zeroall() {
+  // CHECK-LABEL: test_mm256_zeroall
+  // CHECK: call void @llvm.x86.avx.vzeroall()
+  return _mm256_zeroall();
+}
+
+void test_mm256_zeroupper() {
+  // CHECK-LABEL: test_mm256_zeroupper
+  // CHECK: call void @llvm.x86.avx.vzeroupper()
+  return _mm256_zeroupper();
+}
+
+double test_mm256_cvtsd_f64(__m256d __a)
+{
+ // CHECK-LABEL: @test_mm256_cvtsd_f64
+ // CHECK: extractelement <4 x double> %{{.*}}, i32 0
+ return _mm256_cvtsd_f64(__a);
+}
+
+int test_mm256_cvtsi256_si32(__m256i __a)
+{
+ // CHECK-LABEL: @test_mm256_cvtsi256_si32
+ // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
+ return _mm256_cvtsi256_si32(__a);
+}
+
+float test_mm256_cvtss_f32(__m256 __a)
+{
+ // CHECK-LABEL: @test_mm256_cvtss_f32
+ // CHECK: extractelement <8 x float> %{{.*}}, i32 0
+ return _mm256_cvtss_f32(__a);
+}
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c
index 89981bb..4985337 100644
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -1,182 +1,116 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
-#include <immintrin.h>
+#include <x86intrin.h>
 
-__m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
-  // CHECK: @llvm.x86.avx2.mpsadbw({{.*}}, {{.*}}, i8 3)
-  return _mm256_mpsadbw_epu8(x, y, 3);
-}
-
-__m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
-  // CHECK: @llvm.x86.avx2.psad.bw
-  return _mm256_sad_epu8(x, y);
-}
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
 
 __m256i test_mm256_abs_epi8(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pabs.b
+  // CHECK-LABEL: test_mm256_abs_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %{{.*}})
   return _mm256_abs_epi8(a);
 }
 
 __m256i test_mm256_abs_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pabs.w
+  // CHECK-LABEL: test_mm256_abs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %{{.*}})
   return _mm256_abs_epi16(a);
 }
 
 __m256i test_mm256_abs_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pabs.d
+  // CHECK-LABEL: test_mm256_abs_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %{{.*}})
   return _mm256_abs_epi32(a);
 }
 
-__m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packsswb
-  return _mm256_packs_epi16(a, b);
-}
-
-__m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packssdw
-  return _mm256_packs_epi32(a, b);
-}
-
-__m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packuswb
-  return _mm256_packus_epi16(a, b);
-}
-
-__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packusdw
-  return _mm256_packus_epi32(a, b);
-}
-
 __m256i test_mm256_add_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi8
   // CHECK: add <32 x i8>
   return _mm256_add_epi8(a, b);
 }
 
 __m256i test_mm256_add_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi16
   // CHECK: add <16 x i16>
   return _mm256_add_epi16(a, b);
 }
 
 __m256i test_mm256_add_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi32
   // CHECK: add <8 x i32>
   return _mm256_add_epi32(a, b);
 }
 
 __m256i test_mm256_add_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi64
   // CHECK: add <4 x i64>
   return _mm256_add_epi64(a, b);
 }
 
 __m256i test_mm256_adds_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.padds.b
+  // CHECK-LABEL: test_mm256_adds_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_adds_epi8(a, b);
 }
 
 __m256i test_mm256_adds_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.padds.w
+  // CHECK-LABEL: test_mm256_adds_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_adds_epi16(a, b);
 }
 
 __m256i test_mm256_adds_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.paddus.b
+  // CHECK-LABEL: test_mm256_adds_epu8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_adds_epu8(a, b);
 }
 
 __m256i test_mm256_adds_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.paddus.w
+  // CHECK-LABEL: test_mm256_adds_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_adds_epu16(a, b);
 }
 
 __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_alignr_epi8
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
   return _mm256_alignr_epi8(a, b, 2);
 }
 
 __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test2_mm256_alignr_epi8
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
   return _mm256_alignr_epi8(a, b, 17);
 }
 
-__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
-  // CHECK: sub <32 x i8>
-  return _mm256_sub_epi8(a, b);
-}
-
-__m256i test_mm256_sub_epi16(__m256i a, __m256i b) {
-  // CHECK: sub <16 x i16>
-  return _mm256_sub_epi16(a, b);
-}
-
-__m256i test_mm256_sub_epi32(__m256i a, __m256i b) {
-  // CHECK: sub <8 x i32>
-  return _mm256_sub_epi32(a, b);
-}
-
-__m256i test_mm256_sub_epi64(__m256i a, __m256i b) {
-  // CHECK: sub <4 x i64>
-  return _mm256_sub_epi64(a, b);
-}
-
-__m256i test_mm256_subs_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubs.b
-  return _mm256_subs_epi8(a, b);
-}
-
-__m256i test_mm256_subs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubs.w
-  return _mm256_subs_epi16(a, b);
-}
-
-__m256i test_mm256_subs_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubus.b
-  return _mm256_subs_epu8(a, b);
-}
-
-__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubus.w
-  return _mm256_subs_epu16(a, b);
-}
-
 __m256i test_mm256_and_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_and_si256
   // CHECK: and <4 x i64>
   return _mm256_and_si256(a, b);
 }
 
 __m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_andnot_si256
   // CHECK: xor <4 x i64>
   // CHECK: and <4 x i64>
   return _mm256_andnot_si256(a, b);
 }
 
-__m256i test_mm256_or_si256(__m256i a, __m256i b) {
-  // CHECK: or <4 x i64>
-  return _mm256_or_si256(a, b);
-}
-
-__m256i test_mm256_xor_si256(__m256i a, __m256i b) {
-  // CHECK: xor <4 x i64>
-  return _mm256_xor_si256(a, b);
-}
-
 __m256i test_mm256_avg_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pavg.b
+  // CHECK-LABEL: test_mm256_avg_epu8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_avg_epu8(a, b);
 }
 
 __m256i test_mm256_avg_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pavg.w
+  // CHECK-LABEL: test_mm256_avg_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_avg_epu16(a, b);
 }
 
-__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
-  // CHECK: @llvm.x86.avx2.pblendvb
-  return _mm256_blendv_epi8(a, b, m);
-}
-
 // FIXME: We should also lower the __builtin_ia32_pblendw128 (and similar)
 // functions to this IR. In the future we could delete the corresponding
 // intrinsic in LLVM if it's not being used anymore.
@@ -187,458 +121,6 @@
   return _mm256_blend_epi16(a, b, 2);
 }
 
-__m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
-  // CHECK: icmp eq <32 x i8>
-  return _mm256_cmpeq_epi8(a, b);
-}
-
-__m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) {
-  // CHECK: icmp eq <16 x i16>
-  return _mm256_cmpeq_epi16(a, b);
-}
-
-__m256i test_mm256_cmpeq_epi32(__m256i a, __m256i b) {
-  // CHECK: icmp eq <8 x i32>
-  return _mm256_cmpeq_epi32(a, b);
-}
-
-__m256i test_mm256_cmpeq_epi64(__m256i a, __m256i b) {
-  // CHECK: icmp eq <4 x i64>
-  return _mm256_cmpeq_epi64(a, b);
-}
-
-__m256i test_mm256_cmpgt_epi8(__m256i a, __m256i b) {
-  // CHECK: icmp sgt <32 x i8>
-  return _mm256_cmpgt_epi8(a, b);
-}
-
-__m256i test_mm256_cmpgt_epi16(__m256i a, __m256i b) {
-  // CHECK: icmp sgt <16 x i16>
-  return _mm256_cmpgt_epi16(a, b);
-}
-
-__m256i test_mm256_cmpgt_epi32(__m256i a, __m256i b) {
-  // CHECK: icmp sgt <8 x i32>
-  return _mm256_cmpgt_epi32(a, b);
-}
-
-__m256i test_mm256_cmpgt_epi64(__m256i a, __m256i b) {
-  // CHECK: icmp sgt <4 x i64>
-  return _mm256_cmpgt_epi64(a, b);
-}
-
-__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phadd.w
-  return _mm256_hadd_epi16(a, b);
-}
-
-__m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phadd.d
-  return _mm256_hadd_epi32(a, b);
-}
-
-__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phadd.sw
-  return _mm256_hadds_epi16(a, b);
-}
-
-__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phsub.w
-  return _mm256_hsub_epi16(a, b);
-}
-
-__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phsub.d
-  return _mm256_hsub_epi32(a, b);
-}
-
-__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phsub.sw
-  return _mm256_hsubs_epi16(a, b);
-}
-
-__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmadd.ub.sw
-  return _mm256_maddubs_epi16(a, b);
-}
-
-__m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmadd.wd
-  return _mm256_madd_epi16(a, b);
-}
-
-__m256i test_mm256_max_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxs.b
-  return _mm256_max_epi8(a, b);
-}
-
-__m256i test_mm256_max_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxs.w
-  return _mm256_max_epi16(a, b);
-}
-
-__m256i test_mm256_max_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxs.d
-  return _mm256_max_epi32(a, b);
-}
-
-__m256i test_mm256_max_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxu.b
-  return _mm256_max_epu8(a, b);
-}
-
-__m256i test_mm256_max_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxu.w
-  return _mm256_max_epu16(a, b);
-}
-
-__m256i test_mm256_max_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxu.d
-  return _mm256_max_epu32(a, b);
-}
-
-__m256i test_mm256_min_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmins.b
-  return _mm256_min_epi8(a, b);
-}
-
-__m256i test_mm256_min_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmins.w
-  return _mm256_min_epi16(a, b);
-}
-
-__m256i test_mm256_min_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmins.d
-  return _mm256_min_epi32(a, b);
-}
-
-__m256i test_mm256_min_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pminu.b
-  return _mm256_min_epu8(a, b);
-}
-
-__m256i test_mm256_min_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pminu.w
-  return _mm256_min_epu16(a, b);
-}
-
-__m256i test_mm256_min_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pminu.d
-  return _mm256_min_epu32(a, b);
-}
-
-int test_mm256_movemask_epi8(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pmovmskb
-  return _mm256_movemask_epi8(a);
-}
-
-__m256i test_mm256_cvtepi8_epi16(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxbw
-  return _mm256_cvtepi8_epi16(a);
-}
-
-__m256i test_mm256_cvtepi8_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxbd
-  return _mm256_cvtepi8_epi32(a);
-}
-
-__m256i test_mm256_cvtepi8_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxbq
-  return _mm256_cvtepi8_epi64(a);
-}
-
-__m256i test_mm256_cvtepi16_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxwd
-  return _mm256_cvtepi16_epi32(a);
-}
-
-__m256i test_mm256_cvtepi16_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxwq
-  return _mm256_cvtepi16_epi64(a);
-}
-
-__m256i test_mm256_cvtepi32_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxdq
-  return _mm256_cvtepi32_epi64(a);
-}
-
-__m256i test_mm256_cvtepu8_epi16(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxbw
-  return _mm256_cvtepu8_epi16(a);
-}
-
-__m256i test_mm256_cvtepu8_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxbd
-  return _mm256_cvtepu8_epi32(a);
-}
-
-__m256i test_mm256_cvtepu8_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxbq
-  return _mm256_cvtepu8_epi64(a);
-}
-
-__m256i test_mm256_cvtepu16_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxwd
-  return _mm256_cvtepu16_epi32(a);
-}
-
-__m256i test_mm256_cvtepu16_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxwq
-  return _mm256_cvtepu16_epi64(a);
-}
-
-__m256i test_mm256_cvtepu32_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxdq
-  return _mm256_cvtepu32_epi64(a);
-}
-
-__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmul.dq
-  return _mm256_mul_epi32(a, b);
-}
-
-__m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmul.hr.sw
-  return _mm256_mulhrs_epi16(a, b);
-}
-
-__m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmulhu.w
-  return _mm256_mulhi_epu16(a, b);
-}
-
-__m256i test_mm256_mulhi_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmulh.w
-  return _mm256_mulhi_epi16(a, b);
-}
-
-__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
-  // CHECK: mul <16 x i16>
-  return _mm256_mullo_epi16(a, b);
-}
-
-__m256i test_mm256_mullo_epi32(__m256i a, __m256i b) {
-  // CHECK: mul <8 x i32>
-  return _mm256_mullo_epi32(a, b);
-}
-
-__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmulu.dq
-  return _mm256_mul_epu32(a, b);
-}
-
-__m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pshuf.b
-  return _mm256_shuffle_epi8(a, b);
-}
-
-__m256i test_mm256_shuffle_epi32(__m256i a) {
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
-  return _mm256_shuffle_epi32(a, 15);
-}
-
-__m256i test_mm256_shufflehi_epi16(__m256i a) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
-  return _mm256_shufflehi_epi16(a, 107);
-}
-
-__m256i test_mm256_shufflelo_epi16(__m256i a) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
-  return _mm256_shufflelo_epi16(a, 83);
-}
-
-__m256i test_mm256_sign_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psign.b
-  return _mm256_sign_epi8(a, b);
-}
-
-__m256i test_mm256_sign_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psign.w
-  return _mm256_sign_epi16(a, b);
-}
-
-__m256i test_mm256_sign_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psign.d
-  return _mm256_sign_epi32(a, b);
-}
-
-__m256i test_mm256_slli_si256(__m256i a) {
-  // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
-  return _mm256_slli_si256(a, 3);
-}
-
-__m256i test_mm256_bslli_epi128(__m256i a) {
-  // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
-  return _mm256_bslli_epi128(a, 3);
-}
-
-__m256i test_mm256_slli_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pslli.w
-  return _mm256_slli_epi16(a, 3);
-}
-
-__m256i test_mm256_sll_epi16(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psll.w
-  return _mm256_sll_epi16(a, b);
-}
-
-__m256i test_mm256_slli_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pslli.d
-  return _mm256_slli_epi32(a, 3);
-}
-
-__m256i test_mm256_sll_epi32(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psll.d
-  return _mm256_sll_epi32(a, b);
-}
-
-__m256i test_mm256_slli_epi64(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pslli.q
-  return _mm256_slli_epi64(a, 3);
-}
-
-__m256i test_mm256_sll_epi64(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psll.q
-  return _mm256_sll_epi64(a, b);
-}
-
-__m256i test_mm256_srai_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrai.w
-  return _mm256_srai_epi16(a, 3);
-}
-
-__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psra.w
-  return _mm256_sra_epi16(a, b);
-}
-
-__m256i test_mm256_srai_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrai.d
-  return _mm256_srai_epi32(a, 3);
-}
-
-__m256i test_mm256_sra_epi32(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psra.d
-  return _mm256_sra_epi32(a, b);
-}
-
-__m256i test_mm256_srli_si256(__m256i a) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
-  return _mm256_srli_si256(a, 3);
-}
-
-__m256i test_mm256_bsrli_epi128(__m256i a) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
-  return _mm256_bsrli_epi128(a, 3);
-}
-
-__m256i test_mm256_srli_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrli.w
-  return _mm256_srli_epi16(a, 3);
-}
-
-__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrl.w
-  return _mm256_srl_epi16(a, b);
-}
-
-__m256i test_mm256_srli_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrli.d
-  return _mm256_srli_epi32(a, 3);
-}
-
-__m256i test_mm256_srl_epi32(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrl.d
-  return _mm256_srl_epi32(a, b);
-}
-
-__m256i test_mm256_srli_epi64(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrli.q
-  return _mm256_srli_epi64(a, 3);
-}
-
-__m256i test_mm256_srl_epi64(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrl.q
-  return _mm256_srl_epi64(a, b);
-}
-
-__m256i test_mm256_unpackhi_epi8(__m256i a, __m256i b) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  return _mm256_unpackhi_epi8(a, b);
-}
-
-__m256i test_mm256_unpackhi_epi16(__m256i a, __m256i b) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  return _mm256_unpackhi_epi16(a, b);
-}
-
-__m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  return _mm256_unpackhi_epi32(a, b);
-}
-
-__m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) {
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  return _mm256_unpackhi_epi64(a, b);
-}
-
-__m256i test_mm256_unpacklo_epi8(__m256i a, __m256i b) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
-  return _mm256_unpacklo_epi8(a, b);
-}
-
-__m256i test_mm256_unpacklo_epi16(__m256i a, __m256i b) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-  return _mm256_unpacklo_epi16(a, b);
-}
-
-__m256i test_mm256_unpacklo_epi32(__m256i a, __m256i b) {
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  return _mm256_unpacklo_epi32(a, b);
-}
-
-__m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) {
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  return _mm256_unpacklo_epi64(a, b);
-}
-
-__m256i test_mm256_stream_load_si256(__m256i const *a) {
-  // CHECK: @llvm.x86.avx2.movntdqa
-  return _mm256_stream_load_si256(a);
-}
-
-__m128 test_mm_broadcastss_ps(__m128 a) {
-  // CHECK-LABEL: test_mm_broadcastss_ps
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm_broadcastss_ps(a);
-}
-
-__m128d test_mm_broadcastsd_pd(__m128d a) {
-  // CHECK-LABEL: test_mm_broadcastsd_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
-  return _mm_broadcastsd_pd(a);
-}
-
-__m256 test_mm256_broadcastss_ps(__m128 a) {
-  // CHECK-LABEL: test_mm256_broadcastss_ps
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps.256
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
-  return _mm256_broadcastss_ps(a);
-}
-
-__m256d test_mm256_broadcastsd_pd(__m128d a) {
-  // CHECK-LABEL: test_mm256_broadcastsd_pd
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.sd.pd.256
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm256_broadcastsd_pd(a);
-}
-
-__m256i test_mm256_broadcastsi128_si256(__m128i a) {
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-  return _mm256_broadcastsi128_si256(a);
-}
-
 __m128i test_mm_blend_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_blend_epi32
   // CHECK-NOT: @llvm.x86.avx2.pblendd.128
@@ -653,32 +135,10 @@
   return _mm256_blend_epi32(a, b, 0x35);
 }
 
-__m256i test_mm256_broadcastb_epi8(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastb_epi8
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.256
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
-  return _mm256_broadcastb_epi8(a);
-}
-
-__m256i test_mm256_broadcastw_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastw_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.256
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
-  return _mm256_broadcastw_epi16(a);
-}
-
-__m256i test_mm256_broadcastd_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastd_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
-  return _mm256_broadcastd_epi32(a);
-}
-
-__m256i test_mm256_broadcastq_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastq_epi64
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm256_broadcastq_epi64(a);
+__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
+  // CHECK-LABEL: test_mm256_blendv_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_blendv_epi8(a, b, m);
 }
 
 __m128i test_mm_broadcastb_epi8(__m128i a) {
@@ -688,11 +148,11 @@
   return _mm_broadcastb_epi8(a);
 }
 
-__m128i test_mm_broadcastw_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastw_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.128
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
-  return _mm_broadcastw_epi16(a);
+__m256i test_mm256_broadcastb_epi8(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastb_epi8
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.256
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
+  return _mm256_broadcastb_epi8(a);
 }
 
 __m128i test_mm_broadcastd_epi32(__m128i a) {
@@ -702,6 +162,13 @@
   return _mm_broadcastd_epi32(a);
 }
 
+__m256i test_mm256_broadcastd_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastd_epi32
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  return _mm256_broadcastd_epi32(a);
+}
+
 __m128i test_mm_broadcastq_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_broadcastq_epi64
   // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.128
@@ -709,331 +176,1051 @@
   return _mm_broadcastq_epi64(a);
 }
 
-__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.permd
-  return _mm256_permutevar8x32_epi32(a, b);
+__m256i test_mm256_broadcastq_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastq_epi64
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm256_broadcastq_epi64(a);
 }
 
-__m256d test_mm256_permute4x64_pd(__m256d a) {
-  // CHECK: shufflevector{{.*}}<i32 1, i32 2, i32 1, i32 0>
-  return _mm256_permute4x64_pd(a, 25);
+__m128d test_mm_broadcastsd_pd(__m128d a) {
+  // CHECK-LABEL: test_mm_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  return _mm_broadcastsd_pd(a);
 }
 
-__m256 test_mm256_permutevar8x32_ps(__m256 a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.permps
-  return _mm256_permutevar8x32_ps(a, b);
+__m256d test_mm256_broadcastsd_pd(__m128d a) {
+  // CHECK-LABEL: test_mm256_broadcastsd_pd
+  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.sd.pd.256
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm256_broadcastsd_pd(a);
 }
 
-__m256i test_mm256_permute4x64_epi64(__m256i a) {
-  // CHECK: shufflevector{{.*}}<i32 3, i32 0, i32 2, i32 0>
-  return _mm256_permute4x64_epi64(a, 35);
+__m256i test_mm256_broadcastsi128_si256(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastsi128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  return _mm256_broadcastsi128_si256(a);
 }
 
-__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.vperm2i128
-  return _mm256_permute2x128_si256(a, b, 0x31);
+__m128 test_mm_broadcastss_ps(__m128 a) {
+  // CHECK-LABEL: test_mm_broadcastss_ps
+  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm_broadcastss_ps(a);
 }
 
-__m128i test_mm256_extracti128_si256_0(__m256i a) {
-  // CHECK-LABEL: @test_mm256_extracti128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
+__m256 test_mm256_broadcastss_ps(__m128 a) {
+  // CHECK-LABEL: test_mm256_broadcastss_ps
+  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps.256
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
+  return _mm256_broadcastss_ps(a);
+}
+
+__m128i test_mm_broadcastw_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm_broadcastw_epi16
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
+  return _mm_broadcastw_epi16(a);
+}
+
+__m256i test_mm256_broadcastw_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastw_epi16
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.256
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
+  return _mm256_broadcastw_epi16(a);
+}
+
+__m256i test_mm256_bslli_epi128(__m256i a) {
+  // CHECK-LABEL: test_mm256_bslli_epi128
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+  return _mm256_bslli_epi128(a, 3);
+}
+
+__m256i test_mm256_bsrli_epi128(__m256i a) {
+  // CHECK-LABEL: test_mm256_bsrli_epi128
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+  return _mm256_bsrli_epi128(a, 3);
+}
+
+__m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi8
+  // CHECK: icmp eq <32 x i8>
+  return _mm256_cmpeq_epi8(a, b);
+}
+
+__m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi16
+  // CHECK: icmp eq <16 x i16>
+  return _mm256_cmpeq_epi16(a, b);
+}
+
+__m256i test_mm256_cmpeq_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi32
+  // CHECK: icmp eq <8 x i32>
+  return _mm256_cmpeq_epi32(a, b);
+}
+
+__m256i test_mm256_cmpeq_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi64
+  // CHECK: icmp eq <4 x i64>
+  return _mm256_cmpeq_epi64(a, b);
+}
+
+__m256i test_mm256_cmpgt_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi8
+  // CHECK: icmp sgt <32 x i8>
+  return _mm256_cmpgt_epi8(a, b);
+}
+
+__m256i test_mm256_cmpgt_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi16
+  // CHECK: icmp sgt <16 x i16>
+  return _mm256_cmpgt_epi16(a, b);
+}
+
+__m256i test_mm256_cmpgt_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi32
+  // CHECK: icmp sgt <8 x i32>
+  return _mm256_cmpgt_epi32(a, b);
+}
+
+__m256i test_mm256_cmpgt_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi64
+  // CHECK: icmp sgt <4 x i64>
+  return _mm256_cmpgt_epi64(a, b);
+}
+
+__m256i test_mm256_cvtepi8_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi8_epi16
+  // CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
+  return _mm256_cvtepi8_epi16(a);
+}
+
+__m256i test_mm256_cvtepi8_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi8_epi32
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
+  return _mm256_cvtepi8_epi32(a);
+}
+
+__m256i test_mm256_cvtepi8_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi8_epi64
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
+  return _mm256_cvtepi8_epi64(a);
+}
+
+__m256i test_mm256_cvtepi16_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi16_epi32
+  // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
+  return _mm256_cvtepi16_epi32(a);
+}
+
+__m256i test_mm256_cvtepi16_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi16_epi64
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
+  return _mm256_cvtepi16_epi64(a);
+}
+
+__m256i test_mm256_cvtepi32_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi32_epi64
+  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  return _mm256_cvtepi32_epi64(a);
+}
+
+__m256i test_mm256_cvtepu8_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu8_epi16
+  // CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
+  return _mm256_cvtepu8_epi16(a);
+}
+
+__m256i test_mm256_cvtepu8_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu8_epi32
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
+  return _mm256_cvtepu8_epi32(a);
+}
+
+__m256i test_mm256_cvtepu8_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu8_epi64
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
+  return _mm256_cvtepu8_epi64(a);
+}
+
+__m256i test_mm256_cvtepu16_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu16_epi32
+  // CHECK: zext <8 x i16> %{{.*}} to <8 x i32>
+  return _mm256_cvtepu16_epi32(a);
+}
+
+__m256i test_mm256_cvtepu16_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu16_epi64
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
+  return _mm256_cvtepu16_epi64(a);
+}
+
+__m256i test_mm256_cvtepu32_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu32_epi64
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  return _mm256_cvtepu32_epi64(a);
+}
+
+__m128i test0_mm256_extracti128_si256_0(__m256i a) {
+  // CHECK-LABEL: test0_mm256_extracti128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   return _mm256_extracti128_si256(a, 0);
 }
 
-__m128i test_mm256_extracti128_si256_1(__m256i a) {
-  // CHECK-LABEL: @test_mm256_extracti128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
+__m128i test1_mm256_extracti128_si256_1(__m256i a) {
+  // CHECK-LABEL: test1_mm256_extracti128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   return _mm256_extracti128_si256(a, 1);
 }
 
 // Immediate should be truncated to one bit.
-__m128i test_mm256_extracti128_si256_2(__m256i a) {
-  // CHECK-LABEL: @test_mm256_extracti128_si256_2
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
+__m128i test2_mm256_extracti128_si256(__m256i a) {
+  // CHECK-LABEL: test2_mm256_extracti128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   return _mm256_extracti128_si256(a, 2);
 }
 
-__m256i test_mm256_inserti128_si256_0(__m256i a, __m128i b) {
-  // CHECK-LABEL: @test_mm256_inserti128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
+__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hadd_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hadd_epi16(a, b);
+}
+
+__m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hadd_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_hadd_epi32(a, b);
+}
+
+__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hadds_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hadds_epi16(a, b);
+}
+
+__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hsub_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hsub_epi16(a, b);
+}
+
+__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hsub_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_hsub_epi32(a, b);
+}
+
+__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hsubs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hsubs_epi16(a, b);
+}
+
+__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_i32gather_epi32(b, c, 2);
+}
+
+__m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_epi32(a, b, c, d, 2);
+}
+
+__m256i test_mm256_i32gather_epi32(int const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i32gather_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %{{.*}}, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 2)
+  return _mm256_i32gather_epi32(b, c, 2);
+}
+
+__m256i test_mm256_mask_i32gather_epi32(__m256i a, int const *b, __m256i c, __m256i d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %{{.*}}, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_epi32(a, b, c, d, 2);
+}
+
+__m128i test_mm_i32gather_epi64(long long const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_i32gather_epi64(b, c, 2);
+}
+
+__m128i test_mm_mask_i32gather_epi64(__m128i a, long long const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_epi64(a, b, c, d, 2);
+}
+
+__m256i test_mm256_i32gather_epi64(long long const *b, __m128i c) {
+  // CHECK-LABEL: test_mm256_i32gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_i32gather_epi64(b, c, 2);
+}
+
+__m256i test_mm256_mask_i32gather_epi64(__m256i a, long long const *b, __m128i c, __m256i d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_epi64(a, b, c, d, 2);
+}
+
+__m128d test_mm_i32gather_pd(double const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_i32gather_pd(b, c, 2);
+}
+
+__m128d test_mm_mask_i32gather_pd(__m128d a, double const *b, __m128i c, __m128d d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_pd
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_pd(a, b, c, d, 2);
+}
+
+__m256d test_mm256_i32gather_pd(double const *b, __m128i c) {
+  // CHECK-LABEL: test_mm256_i32gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_i32gather_pd(b, c, 2);
+}
+
+__m256d test_mm256_mask_i32gather_pd(__m256d a, double const *b, __m128i c, __m256d d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_pd
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_pd(a, b, c, d, 2);
+}
+
+__m128 test_mm_i32gather_ps(float const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_i32gather_ps(b, c, 2);
+}
+
+__m128 test_mm_mask_i32gather_ps(__m128 a, float const *b, __m128i c, __m128 d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_ps
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_ps(a, b, c, d, 2);
+}
+
+__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i32gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <8 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[SEXT]] to <8 x float>
+  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
+  return _mm256_i32gather_ps(b, c, 2);
+}
+
+__m256 test_mm256_mask_i32gather_ps(__m256 a, float const *b, __m256i c, __m256 d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_ps
+  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %{{.*}}, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_ps(a, b, c, d, 2);
+}
+
+__m128i test_mm_i64gather_epi32(int const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_i64gather_epi32(b, c, 2);
+}
+
+__m128i test_mm_mask_i64gather_epi32(__m128i a, int const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_epi32(a, b, c, d, 2);
+}
+
+__m128i test_mm256_i64gather_epi32(int const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm256_i64gather_epi32(b, c, 2);
+}
+
+__m128i test_mm256_mask_i64gather_epi32(__m128i a, int const *b, __m256i c, __m128i d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_epi32(a, b, c, d, 2);
+}
+
+__m128i test_mm_i64gather_epi64(long long const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_i64gather_epi64(b, c, 2);
+}
+
+__m128i test_mm_mask_i64gather_epi64(__m128i a, long long const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_epi64(a, b, c, d, 2);
+}
+
+__m256i test_mm256_i64gather_epi64(long long const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_i64gather_epi64(b, c, 2);
+}
+
+__m256i test_mm256_mask_i64gather_epi64(__m256i a, long long const *b, __m256i c, __m256i d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_epi64(a, b, c, d, 2);
+}
+
+__m128d test_mm_i64gather_pd(double const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_i64gather_pd(b, c, 2);
+}
+
+__m128d test_mm_mask_i64gather_pd(__m128d a, double const *b, __m128i c, __m128d d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_pd
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_pd(a, b, c, d, 2);
+}
+
+__m256d test_mm256_i64gather_pd(double const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_i64gather_pd(b, c, 2);
+}
+
+__m256d test_mm256_mask_i64gather_pd(__m256d a, double const *b, __m256i c, __m256d d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_pd
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_pd(a, b, c, d, 2);
+}
+
+__m128 test_mm_i64gather_ps(float const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_i64gather_ps(b, c, 2);
+}
+
+__m128 test_mm_mask_i64gather_ps(__m128 a, float const *b, __m128i c, __m128 d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_ps
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_ps(a, b, c, d, 2);
+}
+
+__m128 test_mm256_i64gather_ps(float const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm256_i64gather_ps(b, c, 2);
+}
+
+__m128 test_mm256_mask_i64gather_ps(__m128 a, float const *b, __m256i c, __m128 d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_ps
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_ps(a, b, c, d, 2);
+}
+
+__m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CHECK-LABEL: test0_mm256_inserti128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_inserti128_si256(a, b, 0);
 }
 
-__m256i test_mm256_inserti128_si256_1(__m256i a, __m128i b) {
-  // CHECK-LABEL: @test_mm256_inserti128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
+__m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CHECK-LABEL: test1_mm256_inserti128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm256_inserti128_si256(a, b, 1);
 }
 
 // Immediate should be truncated to one bit.
-__m256i test_mm256_inserti128_si256_2(__m256i a, __m128i b) {
-  // CHECK-LABEL: @test_mm256_inserti128_si256_2
-  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
+__m256i test2_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CHECK-LABEL: test2_mm256_inserti128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_inserti128_si256(a, b, 2);
 }
 
-__m256i test_mm256_maskload_epi32(int const *a, __m256i m) {
-  // CHECK: @llvm.x86.avx2.maskload.d.256
-  return _mm256_maskload_epi32(a, m);
+__m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_madd_epi16
+  // CHECK: call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_madd_epi16(a, b);
 }
 
-__m256i test_mm256_maskload_epi64(long long const *a, __m256i m) {
-  // CHECK: @llvm.x86.avx2.maskload.q.256
-  return _mm256_maskload_epi64(a, m);
+__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_maddubs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_maddubs_epi16(a, b);
 }
 
 __m128i test_mm_maskload_epi32(int const *a, __m128i m) {
-  // CHECK: @llvm.x86.avx2.maskload.d
+  // CHECK-LABEL: test_mm_maskload_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maskload_epi32(a, m);
 }
 
+__m256i test_mm256_maskload_epi32(int const *a, __m256i m) {
+  // CHECK-LABEL: test_mm256_maskload_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskload_epi32(a, m);
+}
+
 __m128i test_mm_maskload_epi64(long long const *a, __m128i m) {
-  // CHECK: @llvm.x86.avx2.maskload.q
+  // CHECK-LABEL: test_mm_maskload_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maskload_epi64(a, m);
 }
 
-void test_mm256_maskstore_epi32(int *a, __m256i m, __m256i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.d.256
-  _mm256_maskstore_epi32(a, m, b);
-}
-
-void test_mm256_maskstore_epi64(long long *a, __m256i m, __m256i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.q.256
-  _mm256_maskstore_epi64(a, m, b);
+__m256i test_mm256_maskload_epi64(long long const *a, __m256i m) {
+  // CHECK-LABEL: test_mm256_maskload_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskload_epi64(a, m);
 }
 
 void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.d
+  // CHECK-LABEL: test_mm_maskstore_epi32
+  // CHECK: call void @llvm.x86.avx2.maskstore.d(i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   _mm_maskstore_epi32(a, m, b);
 }
 
+void test_mm256_maskstore_epi32(int *a, __m256i m, __m256i b) {
+  // CHECK-LABEL: test_mm256_maskstore_epi32
+  // CHECK: call void @llvm.x86.avx2.maskstore.d.256(i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  _mm256_maskstore_epi32(a, m, b);
+}
+
 void test_mm_maskstore_epi64(long long *a, __m128i m, __m128i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.q
+  // CHECK-LABEL: test_mm_maskstore_epi64
+  // CHECK: call void @llvm.x86.avx2.maskstore.q(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   _mm_maskstore_epi64(a, m, b);
 }
 
-__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psllv.d.256
-  return _mm256_sllv_epi32(a, b);
+void test_mm256_maskstore_epi64(long long *a, __m256i m, __m256i b) {
+  // CHECK-LABEL: test_mm256_maskstore_epi64
+  // CHECK: call void @llvm.x86.avx2.maskstore.q.256(i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  _mm256_maskstore_epi64(a, m, b);
+}
+
+__m256i test_mm256_max_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epi8
+  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_max_epi8(a, b);
+}
+
+__m256i test_mm256_max_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epi16
+  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_max_epi16(a, b);
+}
+
+__m256i test_mm256_max_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epi32
+  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_max_epi32(a, b);
+}
+
+__m256i test_mm256_max_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epu8
+  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_max_epu8(a, b);
+}
+
+__m256i test_mm256_max_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_max_epu16(a, b);
+}
+
+__m256i test_mm256_max_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epu32
+  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_max_epu32(a, b);
+}
+
+__m256i test_mm256_min_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epi8
+  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_min_epi8(a, b);
+}
+
+__m256i test_mm256_min_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epi16
+  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_min_epi16(a, b);
+}
+
+__m256i test_mm256_min_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epi32
+  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_min_epi32(a, b);
+}
+
+__m256i test_mm256_min_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epu8
+  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_min_epu8(a, b);
+}
+
+__m256i test_mm256_min_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_min_epu16(a, b);
+}
+
+__m256i test_mm256_min_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epu32
+  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_min_epu32(a, b);
+}
+
+int test_mm256_movemask_epi8(__m256i a) {
+  // CHECK-LABEL: test_mm256_movemask_epi8
+  // CHECK: call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}})
+  return _mm256_movemask_epi8(a);
+}
+
+__m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
+  // CHECK-LABEL: test_mm256_mpsadbw_epu8
+  // CHECK: call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, i8 3)
+  return _mm256_mpsadbw_epu8(x, y, 3);
+}
+
+__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mul_epi32
+  // CHECK: call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mul_epi32(a, b);
+}
+
+__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mul_epu32
+  // CHECK: call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mul_epu32(a, b);
+}
+
+__m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mulhi_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mulhi_epu16(a, b);
+}
+
+__m256i test_mm256_mulhi_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mulhi_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mulhi_epi16(a, b);
+}
+
+__m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mulhrs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mulhrs_epi16(a, b);
+}
+
+__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mullo_epi16
+  // CHECK: mul <16 x i16>
+  return _mm256_mullo_epi16(a, b);
+}
+
+__m256i test_mm256_mullo_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mullo_epi32
+  // CHECK: mul <8 x i32>
+  return _mm256_mullo_epi32(a, b);
+}
+
+__m256i test_mm256_or_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_or_si256
+  // CHECK: or <4 x i64>
+  return _mm256_or_si256(a, b);
+}
+
+__m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epi16
+  // CHECK: call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_packs_epi16(a, b);
+}
+
+__m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epi32
+  // CHECK: call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_packs_epi32(a, b);
+}
+
+__m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epu16
+  // CHECK:  call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_packus_epi16(a, b);
+}
+
+__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epu32
+  // CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_packus_epi32(a, b);
+}
+
+__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_permute2x128_si256
+  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
+  return _mm256_permute2x128_si256(a, b, 0x31);
+}
+
+__m256i test_mm256_permute4x64_epi64(__m256i a) {
+  // CHECK-LABEL: test_mm256_permute4x64_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
+  return _mm256_permute4x64_epi64(a, 35);
+}
+
+__m256d test_mm256_permute4x64_pd(__m256d a) {
+  // CHECK-LABEL: test_mm256_permute4x64_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+  return _mm256_permute4x64_pd(a, 25);
+}
+
+__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_permutevar8x32_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_permutevar8x32_epi32(a, b);
+}
+
+__m256 test_mm256_permutevar8x32_ps(__m256 a, __m256i b) {
+  // CHECK-LABEL: test_mm256_permutevar8x32_ps
+  // CHECK: call <8 x float> @llvm.x86.avx2.permps(<8 x float> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_permutevar8x32_ps(a, b);
+}
+
+__m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
+  // CHECK-LABEL: test_mm256_sad_epu8
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_sad_epu8(x, y);
+}
+
+__m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_shuffle_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_shuffle_epi8(a, b);
+}
+
+__m256i test_mm256_shuffle_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_shuffle_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
+  return _mm256_shuffle_epi32(a, 15);
+}
+
+__m256i test_mm256_shufflehi_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_shufflehi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
+  return _mm256_shufflehi_epi16(a, 107);
+}
+
+__m256i test_mm256_shufflelo_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_shufflelo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
+  return _mm256_shufflelo_epi16(a, 83);
+}
+
+__m256i test_mm256_sign_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sign_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_sign_epi8(a, b);
+}
+
+__m256i test_mm256_sign_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sign_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_sign_epi16(a, b);
+}
+
+__m256i test_mm256_sign_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sign_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_sign_epi32(a, b);
+}
+
+__m256i test_mm256_slli_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
+  return _mm256_slli_epi16(a, 3);
+}
+
+__m256i test_mm256_slli_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+  return _mm256_slli_epi32(a, 3);
+}
+
+__m256i test_mm256_slli_epi64(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
+  return _mm256_slli_epi64(a, 3);
+}
+
+__m256i test_mm256_slli_si256(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_si256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+  return _mm256_slli_si256(a, 3);
 }
 
 __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psllv.d
+  // CHECK-LABEL: test_mm_sllv_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sllv_epi32(a, b);
 }
 
-__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psllv.q.256
-  return _mm256_sllv_epi64(a, b);
+__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sllv_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_sllv_epi32(a, b);
 }
 
 __m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psllv.q
+  // CHECK-LABEL: test_mm_sllv_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sllv_epi64(a, b);
 }
 
-__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psrav.d.256
-  return _mm256_srav_epi32(a, b);
+__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sllv_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_sllv_epi64(a, b);
+}
+
+__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_sra_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm256_sra_epi16(a, b);
+}
+
+__m256i test_mm256_sra_epi32(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_sra_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm256_sra_epi32(a, b);
+}
+
+__m256i test_mm256_srai_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_srai_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %{{.*}}, i32 %{{.*}})
+  return _mm256_srai_epi16(a, 3);
+}
+
+__m256i test_mm256_srai_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_srai_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+  return _mm256_srai_epi32(a, 3);
 }
 
 __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrav.d
+  // CHECK-LABEL: test_mm_srav_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srav_epi32(a, b);
 }
 
-__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.d.256
-  return _mm256_srlv_epi32(a, b);
+__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_srav_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_srav_epi32(a, b);
+}
+
+__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_srl_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm256_srl_epi16(a, b);
+}
+
+__m256i test_mm256_srl_epi32(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_srl_epi32
+  // CHECK:call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm256_srl_epi32(a, b);
+}
+
+__m256i test_mm256_srl_epi64(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_srl_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm256_srl_epi64(a, b);
+}
+
+__m256i test_mm256_srli_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
+  return _mm256_srli_epi16(a, 3);
+}
+
+__m256i test_mm256_srli_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+  return _mm256_srli_epi32(a, 3);
+}
+
+__m256i test_mm256_srli_epi64(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
+  return _mm256_srli_epi64(a, 3);
+}
+
+__m256i test_mm256_srli_si256(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_si256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+  return _mm256_srli_si256(a, 3);
 }
 
 __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.d
+  // CHECK-LABEL: test_mm_srlv_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srlv_epi32(a, b);
 }
 
-__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.q.256
-  return _mm256_srlv_epi64(a, b);
+__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_srlv_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_srlv_epi32(a, b);
 }
 
 __m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.q
+  // CHECK-LABEL: test_mm_srlv_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_srlv_epi64(a, b);
 }
 
-__m128d test_mm_mask_i32gather_pd(__m128d a, double const *b, __m128i c,
-                                  __m128d d) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd
-  return _mm_mask_i32gather_pd(a, b, c, d, 2);
+__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_srlv_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_srlv_epi64(a, b);
 }
 
-__m256d test_mm256_mask_i32gather_pd(__m256d a, double const *b, __m128i c,
-                                      __m256d d) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd.256
-  return _mm256_mask_i32gather_pd(a, b, c, d, 2);
+__m256i test_mm256_stream_load_si256(__m256i const *a) {
+  // CHECK-LABEL: test_mm256_stream_load_si256
+  // CHECK: call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %{{.*}})
+  return _mm256_stream_load_si256(a);
 }
 
-__m128d test_mm_mask_i64gather_pd(__m128d a, double const *b, __m128i c,
-                                  __m128d d) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd
-  return _mm_mask_i64gather_pd(a, b, c, d, 2);
+__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi8
+  // CHECK: sub <32 x i8>
+  return _mm256_sub_epi8(a, b);
 }
 
-__m256d test_mm256_mask_i64gather_pd(__m256d a, double const *b, __m256i c,
-                                      __m256d d) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd.256
-  return _mm256_mask_i64gather_pd(a, b, c, d, 2);
+__m256i test_mm256_sub_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi16
+  // CHECK: sub <16 x i16>
+  return _mm256_sub_epi16(a, b);
 }
 
-__m128 test_mm_mask_i32gather_ps(__m128 a, float const *b, __m128i c,
-                                 __m128 d) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps
-  return _mm_mask_i32gather_ps(a, b, c, d, 2);
+__m256i test_mm256_sub_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi32
+  // CHECK: sub <8 x i32>
+  return _mm256_sub_epi32(a, b);
 }
 
-__m256 test_mm256_mask_i32gather_ps(__m256 a, float const *b, __m256i c,
-                                     __m256 d) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps.256
-  return _mm256_mask_i32gather_ps(a, b, c, d, 2);
+__m256i test_mm256_sub_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi64
+  // CHECK: sub <4 x i64>
+  return _mm256_sub_epi64(a, b);
 }
 
-__m128 test_mm_mask_i64gather_ps(__m128 a, float const *b, __m128i c,
-                                 __m128 d) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps
-  return _mm_mask_i64gather_ps(a, b, c, d, 2);
+__m256i test_mm256_subs_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_subs_epi8(a, b);
 }
 
-__m128 test_mm256_mask_i64gather_ps(__m128 a, float const *b, __m256i c,
-                                    __m128 d) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps.256
-  return _mm256_mask_i64gather_ps(a, b, c, d, 2);
+__m256i test_mm256_subs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_subs_epi16(a, b);
 }
 
-__m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.d
-  return _mm_mask_i32gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_subs_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epu8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_subs_epu8(a, b);
 }
 
-__m256i test_mm256_mask_i32gather_epi32(__m256i a, int const *b, __m256i c,
-                                        __m256i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.d.256
-  return _mm256_mask_i32gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_subs_epu16(a, b);
 }
 
-__m128i test_mm_mask_i64gather_epi32(__m128i a, int const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.d
-  return _mm_mask_i64gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_unpackhi_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  return _mm256_unpackhi_epi8(a, b);
 }
 
-__m128i test_mm256_mask_i64gather_epi32(__m128i a, int const *b, __m256i c,
-                                        __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.d.256
-  return _mm256_mask_i64gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_unpackhi_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  return _mm256_unpackhi_epi16(a, b);
 }
 
-__m128i test_mm_mask_i32gather_epi64(__m128i a, long long const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.q
-  return _mm_mask_i32gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  return _mm256_unpackhi_epi32(a, b);
 }
 
-__m256i test_mm256_mask_i32gather_epi64(__m256i a, long long const *b, __m128i c,
-                                        __m256i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.q.256
-  return _mm256_mask_i32gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  return _mm256_unpackhi_epi64(a, b);
 }
 
-__m128i test_mm_mask_i64gather_epi64(__m128i a, long long const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.q
-  return _mm_mask_i64gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_unpacklo_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  return _mm256_unpacklo_epi8(a, b);
 }
 
-__m256i test_mm256_mask_i64gather_epi64(__m256i a, long long const *b, __m256i c,
-                                        __m256i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.q.256
-  return _mm256_mask_i64gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_unpacklo_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  return _mm256_unpacklo_epi16(a, b);
 }
 
-__m128d test_mm_i32gather_pd(double const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd
-  return _mm_i32gather_pd(b, c, 2);
+__m256i test_mm256_unpacklo_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  return _mm256_unpacklo_epi32(a, b);
 }
 
-__m256d test_mm256_i32gather_pd(double const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd.256
-  return _mm256_i32gather_pd(b, c, 2);
+__m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_unpacklo_epi64(a, b);
 }
 
-__m128d test_mm_i64gather_pd(double const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd
-  return _mm_i64gather_pd(b, c, 2);
-}
-
-__m256d test_mm256_i64gather_pd(double const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd.256
-  return _mm256_i64gather_pd(b, c, 2);
-}
-
-__m128 test_mm_i32gather_ps(float const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps
-  return _mm_i32gather_ps(b, c, 2);
-}
-
-__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps.256
-  return _mm256_i32gather_ps(b, c, 2);
-}
-
-__m128 test_mm_i64gather_ps(float const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps
-  return _mm_i64gather_ps(b, c, 2);
-}
-
-__m128 test_mm256_i64gather_ps(float const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps.256
-  return _mm256_i64gather_ps(b, c, 2);
-}
-
-__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.d
-  return _mm_i32gather_epi32(b, c, 2);
-}
-
-__m256i test_mm256_i32gather_epi32(int const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.d.256
-  return _mm256_i32gather_epi32(b, c, 2);
-}
-
-__m128i test_mm_i64gather_epi32(int const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.d
-  return _mm_i64gather_epi32(b, c, 2);
-}
-
-__m128i test_mm256_i64gather_epi32(int const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.d.256
-  return _mm256_i64gather_epi32(b, c, 2);
-}
-
-__m128i test_mm_i32gather_epi64(long long const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.q
-  return _mm_i32gather_epi64(b, c, 2);
-}
-
-__m256i test_mm256_i32gather_epi64(long long const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.q.256
-  return _mm256_i32gather_epi64(b, c, 2);
-}
-
-__m128i test_mm_i64gather_epi64(long long const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.q
-  return _mm_i64gather_epi64(b, c, 2);
-}
-
-__m256i test_mm256_i64gather_epi64(long long const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.q.256
-  return _mm256_i64gather_epi64(b, c, 2);
+__m256i test_mm256_xor_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_xor_si256
+  // CHECK: xor <4 x i64>
+  return _mm256_xor_si256(a, b);
 }
diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c
index 7addd98..7cb2a70 100644
--- a/test/CodeGen/avx512bw-builtins.c
+++ b/test/CodeGen/avx512bw-builtins.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -8,338 +8,366 @@
 
 __mmask64 test_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.512
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.512
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.512
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.512
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.512
+  // CHECK: icmp sgt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.512
+  // CHECK: icmp sgt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.512
+  // CHECK: icmp sgt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.512
+  // CHECK: icmp sgt <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 0, i64 -1)
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpeq_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 0, i64 {{.*}})
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 0, i32 -1)
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpeq_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 0, i32 {{.*}})
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 6, i64 -1)
+  // CHECK: icmp ugt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpgt_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 6, i64 {{.*}})
+  // CHECK: icmp ugt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 6, i32 -1)
+  // CHECK: icmp ugt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpgt_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 6, i32 {{.*}})
+  // CHECK: icmp ugt <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 -1)
+  // CHECK: icmp sge <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpge_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 {{.*}})
+  // CHECK: icmp sge <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 -1)
+  // CHECK: icmp uge <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpge_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 {{.*}})
+  // CHECK: icmp uge <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 -1)
+  // CHECK: icmp sge <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpge_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 {{.*}})
+  // CHECK: icmp sge <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 -1)
+  // CHECK: icmp uge <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpge_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 {{.*}})
+  // CHECK: icmp uge <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 -1)
+  // CHECK: icmp sle <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmple_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 {{.*}})
+  // CHECK: icmp sle <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 -1)
+  // CHECK: icmp ule <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmple_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 {{.*}})
+  // CHECK: icmp ule <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 -1)
+  // CHECK: icmp sle <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmple_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 {{.*}})
+  // CHECK: icmp sle <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 -1)
+  // CHECK: icmp ule <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmple_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 {{.*}})
+  // CHECK: icmp ule <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 -1)
+  // CHECK: icmp slt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmplt_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 {{.*}})
+  // CHECK: icmp slt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 -1)
+  // CHECK: icmp ult <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmplt_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 {{.*}})
+  // CHECK: icmp ult <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 -1)
+  // CHECK: icmp slt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmplt_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 {{.*}})
+  // CHECK: icmp slt <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 -1)
+  // CHECK: icmp ult <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmplt_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 {{.*}})
+  // CHECK: icmp ult <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 -1)
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpneq_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 {{.*}})
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 -1)
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpneq_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 {{.*}})
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 -1)
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpneq_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 {{.*}})
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 -1)
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpneq_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 {{.*}})
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmp_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 -1)
-  return (__mmask64)_mm512_cmp_epi8_mask(__a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_cmp_epi8_mask(__a, __b, 0);
 }
 
 __mmask64 test_mm512_mask_cmp_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 {{.*}})
-  return (__mmask64)_mm512_mask_cmp_epi8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask64 test_mm512_cmp_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 -1)
-  return (__mmask64)_mm512_cmp_epu8_mask(__a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_cmp_epu8_mask(__a, __b, 0);
 }
 
 __mmask64 test_mm512_mask_cmp_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 {{.*}})
-  return (__mmask64)_mm512_mask_cmp_epu8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm512_cmp_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 -1)
-  return (__mmask32)_mm512_cmp_epi16_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_cmp_epi16_mask(__a, __b, 0);
 }
 
 __mmask32 test_mm512_mask_cmp_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask32)_mm512_mask_cmp_epi16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm512_cmp_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 -1)
-  return (__mmask32)_mm512_cmp_epu16_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_cmp_epu16_mask(__a, __b, 0);
 }
 
 __mmask32 test_mm512_mask_cmp_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask32)_mm512_mask_cmp_epu16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 __m512i test_mm512_add_epi8 (__m512i __A, __m512i __B) {
@@ -434,12 +462,12 @@
 
 __m512i test_mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) {
   // CHECK-LABEL: @test_mm512_mask_blend_epi8
-  // CHECK: @llvm.x86.avx512.mask.blend.b.512
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_blend_epi8(__U,__A,__W); 
 }
 __m512i test_mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) {
   // CHECK-LABEL: @test_mm512_mask_blend_epi16
-  // CHECK: @llvm.x86.avx512.mask.blend.w.512
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_blend_epi16(__U,__A,__W); 
 }
 __m512i test_mm512_abs_epi8(__m512i __A) {
@@ -971,73 +999,617 @@
 
 __m512i test_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   return _mm512_unpackhi_epi8(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   return _mm512_unpackhi_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   return _mm512_unpacklo_epi8(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
   return _mm512_unpacklo_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
+__m512i test_mm512_cvtepi8_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512
+  return _mm512_cvtepi8_epi16(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512
+  return _mm512_mask_cvtepi8_epi16(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512
+  return _mm512_maskz_cvtepi8_epi16(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu8_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512
+  return _mm512_cvtepu8_epi16(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512
+  return _mm512_mask_cvtepu8_epi16(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512
+  return _mm512_maskz_cvtepu8_epi16(__U, __A); 
+}
+
+__m512i test_mm512_shufflehi_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_shufflehi_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+  return _mm512_shufflehi_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_shufflehi_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_shufflehi_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_shufflehi_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_shufflehi_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_shufflelo_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_shufflelo_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  return _mm512_shufflelo_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_shufflelo_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_shufflelo_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_shufflelo_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_shufflelo_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_sllv_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm512_sllv_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm512_mask_sllv_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm512_maskz_sllv_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_sll_epi16(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.512
+  return _mm512_sll_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.512
+  return _mm512_mask_sll_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.512
+  return _mm512_maskz_sll_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_slli_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.512
+  return _mm512_slli_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.512
+  return _mm512_mask_slli_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.512
+  return _mm512_maskz_slli_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_bslli_epi128(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_bslli_epi128
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+  return _mm512_bslli_epi128(__A, 5);
+}
+
+__m512i test_mm512_srlv_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_srlv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm512_srlv_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srlv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm512_mask_srlv_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srlv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm512_maskz_srlv_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_srav_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_srav_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm512_srav_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srav_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm512_mask_srav_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srav_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm512_maskz_srav_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_sra_epi16(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sra_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.w.512
+  return _mm512_sra_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sra_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.w.512
+  return _mm512_mask_sra_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sra_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.w.512
+  return _mm512_maskz_sra_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_srai_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srai_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.wi.512
+  return _mm512_srai_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srai_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.wi.512
+  return _mm512_mask_srai_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srai_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.wi.512
+  return _mm512_maskz_srai_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_srl_epi16(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_srl_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.w.512
+  return _mm512_srl_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srl_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.w.512
+  return _mm512_mask_srl_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srl_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.w.512
+  return _mm512_maskz_srl_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_srli_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.wi.512
+  return _mm512_srli_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.wi.512
+  return _mm512_mask_srli_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.wi.512
+  return _mm512_maskz_srli_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_bsrli_epi128(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_bsrli_epi128
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+  return _mm512_bsrli_epi128(__A, 5);
+}
+__m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi16
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_mov_epi16(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi16
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_mov_epi16(__U, __A); 
+}
+
+__m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi8
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_mask_mov_epi8(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi8
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_maskz_mov_epi8(__U, __A); 
+}
+
+__m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
+  // CHECK-LABEL: @test_mm512_mask_set1_epi8
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.b.gpr.512
+  return _mm512_mask_set1_epi8(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
+  // CHECK-LABEL: @test_mm512_maskz_set1_epi8
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.b.gpr.512
+  return _mm512_maskz_set1_epi8(__M, __A); 
+}
+
+__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+  // CHECK-LABEL: @test_mm512_kunpackd
+  // CHECK: @llvm.x86.avx512.kunpck.dq
+  return _mm512_kunpackd(__A, __B); 
+}
+
+__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+  // CHECK-LABEL: @test_mm512_kunpackw
+  // CHECK: @llvm.x86.avx512.kunpck.wd
+  return _mm512_kunpackw(__A, __B); 
+}
+
+__m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi16
+  // CHECK: @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_mask_loadu_epi16(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_loadu_epi16
+  // CHECK: @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_maskz_loadu_epi16(__U, __P); 
+}
+
+__m512i test_mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi8
+  // CHECK: @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_mask_loadu_epi8(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_loadu_epi8
+  // CHECK: @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_maskz_loadu_epi8(__U, __P); 
+}
+void test_mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi16
+  // CHECK: @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %{{.*}}, <32 x i16>* %{{.*}}, i32 1, <32 x i1> %{{.*}})
+  _mm512_mask_storeu_epi16(__P, __U, __A); // void function: call only, no 'return <expr>'
+}
+__mmask64 test_mm512_test_epi8_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.512
+  return _mm512_test_epi8_mask(__A, __B); 
+}
+
+void test_mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi8
+  // CHECK: @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %{{.*}}, <64 x i8>* %{{.*}}, i32 1, <64 x i1> %{{.*}})
+  _mm512_mask_storeu_epi8(__P, __U, __A); // void function: call only, no 'return <expr>'
+}
+__mmask64 test_mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.512
+  return _mm512_mask_test_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm512_test_epi16_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.
+  return _mm512_test_epi16_mask(__A, __B); 
+}
+
+__mmask32 test_mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.
+  return _mm512_mask_test_epi16_mask(__U, __A, __B); 
+}
+
+__mmask64 test_mm512_testn_epi8_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.
+  return _mm512_testn_epi8_mask(__A, __B); 
+}
+
+__mmask64 test_mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.
+  return _mm512_mask_testn_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm512_testn_epi16_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.
+  return _mm512_testn_epi16_mask(__A, __B); 
+}
+
+__mmask32 test_mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.
+  return _mm512_mask_testn_epi16_mask(__U, __A, __B); 
+}
+
+__mmask64 test_mm512_movepi8_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi8_mask
+  // CHECK: @llvm.x86.avx512.cvtb2mask.512
+  return _mm512_movepi8_mask(__A); 
+}
+
+__m512i test_mm512_movm_epi8(__mmask64 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi8
+  // CHECK: @llvm.x86.avx512.cvtmask2b.512
+  return _mm512_movm_epi8(__A); 
+}
+
+__m512i test_mm512_movm_epi16(__mmask32 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi16
+  // CHECK: @llvm.x86.avx512.cvtmask2w.512
+  return _mm512_movm_epi16(__A); 
+}
+
+__m512i test_mm512_broadcastb_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <64 x i32> zeroinitializer
+  return _mm512_broadcastb_epi8(__A);
+}
+
+__m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <64 x i32> zeroinitializer
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_mask_broadcastb_epi8(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <64 x i32> zeroinitializer
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_maskz_broadcastb_epi8(__M, __A);
+}
+
+__m512i test_mm512_broadcastw_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <32 x i32> zeroinitializer
+  return _mm512_broadcastw_epi16(__A);
+}
+
+__m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_broadcastw_epi16(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_broadcastw_epi16(__M, __A);
+}
+
+__m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
+  // CHECK-LABEL: @test_mm512_mask_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.512
+  return _mm512_mask_set1_epi16(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
+  // CHECK-LABEL: @test_mm512_maskz_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.512
+  return _mm512_maskz_set1_epi16(__M, __A); 
+}
+__m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+  return _mm512_permutexvar_epi16(__A, __B);
+}
+
+__m512i test_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+  return _mm512_maskz_permutexvar_epi16(__M, __A, __B);
+}
+
+__m512i test_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+  return _mm512_mask_permutexvar_epi16(__W, __M, __A, __B); 
+}
+__m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
+    // CHECK-LABEL: @test_mm512_alignr_epi8
+    // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
+    return _mm512_alignr_epi8(__A, __B, 2); 
+}
+
+__m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m512i __B){
+    // CHECK-LABEL: @test_mm512_mask_alignr_epi8
+    // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
+    // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+    return _mm512_mask_alignr_epi8(__W, __U, __A, __B, 2); 
+}
+
+__m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
+    // CHECK-LABEL: @test_mm512_maskz_alignr_epi8
+    // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
+    // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+   return _mm512_maskz_alignr_epi8(__U, __A, __B, 2); 
+}
+
+
+
+__m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mm_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+  return _mm512_dbsad_epu8(__A, __B, 170); 
+}
+
+__m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mm_mask_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+}
+
+__m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mm_maskz_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); 
+}
+
+__m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_sad_epu8
+  // CHECK: @llvm.x86.avx512.psad.bw.512
+  return _mm512_sad_epu8(__A, __B); 
+}
+
+__mmask32 test_mm512_movepi16_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi16_mask
+  // CHECK: @llvm.x86.avx512.cvtw2mask.512
+  return _mm512_movepi16_mask(__A); 
+}
+
+void test_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.512
+  _mm512_mask_cvtepi16_storeu_epi8(__P, __M, __A); // go through the header intrinsic, not the raw builtin
+}
+
+void test_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.512
+  _mm512_mask_cvtsepi16_storeu_epi8(__P, __M, __A); // go through the header intrinsic, not the raw builtin
+}
+
+void test_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.512
+  _mm512_mask_cvtusepi16_storeu_epi8(__P, __M, __A); // go through the header intrinsic, not the raw builtin
+}
diff --git a/test/CodeGen/avx512cdintrin.c b/test/CodeGen/avx512cdintrin.c
index 625a3d2..b5860b7 100644
--- a/test/CodeGen/avx512cdintrin.c
+++ b/test/CodeGen/avx512cdintrin.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512cd -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512cd -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -37,31 +37,47 @@
 }
 __m512i test_mm512_lzcnt_epi32(__m512i __A) {
   // CHECK-LABEL: @test_mm512_lzcnt_epi32
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.d.512
+  // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 false)
   return _mm512_lzcnt_epi32(__A); 
 }
 __m512i test_mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_lzcnt_epi32
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.d.512
+  // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 false)
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_lzcnt_epi32(__W,__U,__A); 
 }
 __m512i test_mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_lzcnt_epi32
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.d.512
+  // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 false)
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_lzcnt_epi32(__U,__A); 
 }
 __m512i test_mm512_lzcnt_epi64(__m512i __A) {
   // CHECK-LABEL: @test_mm512_lzcnt_epi64
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.q.512
+  // CHECK: call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %{{.*}}, i1 false)
   return _mm512_lzcnt_epi64(__A); 
 }
 __m512i test_mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_lzcnt_epi64
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.q.512
+  // CHECK: call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_lzcnt_epi64(__W,__U,__A); 
 }
 __m512i test_mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_lzcnt_epi64
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.q.512
+  // CHECK: call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_lzcnt_epi64(__U,__A); 
 }
+
+__m512i test_mm512_broadcastmb_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm512_broadcastmb_epi64
+  // CHECK: @llvm.x86.avx512.broadcastmb.512
+  return _mm512_broadcastmb_epi64(__A); 
+}
+
+__m512i test_mm512_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm512_broadcastmw_epi32
+  // CHECK: @llvm.x86.avx512.broadcastmw.512
+  return _mm512_broadcastmw_epi32(__A); 
+}
diff --git a/test/CodeGen/avx512dq-builtins.c b/test/CodeGen/avx512dq-builtins.c
index fc09a28..91bfbaf 100644
--- a/test/CodeGen/avx512dq-builtins.c
+++ b/test/CodeGen/avx512dq-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -133,7 +133,8 @@
 
 __m512d test_mm512_andnot_pd (__m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_andnot_pd
-  // CHECK: @llvm.x86.avx512.mask.andn.pd.512
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64>
   return (__m512d) _mm512_andnot_pd(__A, __B);
 }
 
@@ -151,7 +152,8 @@
 
 __m512 test_mm512_andnot_ps (__m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_andnot_ps
-  // CHECK: @llvm.x86.avx512.mask.andn.ps.512
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32>
   return (__m512) _mm512_andnot_ps(__A, __B);
 }
 
@@ -635,6 +637,78 @@
   return _mm512_maskz_range_round_pd(__U, __A, __B, 4, 8); 
 }
 
+__m128d test_mm512_range_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_range_round_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_range_round_sd(__A, __B, 4, 8); 
+}
+
+__m128d test_mm512_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_mask_range_round_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_mask_range_round_sd(__W, __U, __A, __B, 4, 8); 
+}
+
+__m128d test_mm512_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_range_round_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_maskz_range_round_sd(__U, __A, __B, 4, 8); 
+}
+
+__m128 test_mm512_range_round_ss(__m128 __A, __m128 __B) { // _ss operates on floats: __m128, not __m128d
+  // CHECK-LABEL: @test_mm512_range_round_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_range_round_ss(__A, __B, 4, 8); 
+}
+
+__m128 test_mm512_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // result is __m128, matching the operands
+  // CHECK-LABEL: @test_mm512_mask_range_round_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_mask_range_round_ss(__W, __U, __A, __B, 4, 8); 
+}
+
+__m128 test_mm512_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_range_round_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_maskz_range_round_ss(__U, __A, __B, 4, 8); 
+}
+
+__m128d test_mm_range_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_range_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_range_sd(__A, __B, 4); 
+}
+
+__m128d test_mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_range_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_mask_range_sd(__W, __U, __A, __B, 4); 
+}
+
+__m128d test_mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_range_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_maskz_range_sd(__U, __A, __B, 4); 
+}
+
+__m128 test_mm_range_ss(__m128 __A, __m128 __B) { // _ss operates on floats: __m128, not __m128d
+  // CHECK-LABEL: @test_mm_range_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_range_ss(__A, __B, 4); 
+}
+
+__m128 test_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // result is __m128, matching the operands
+  // CHECK-LABEL: @test_mm_mask_range_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_mask_range_ss(__W, __U, __A, __B, 4); 
+}
+
+__m128 test_mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_range_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_maskz_range_ss(__U, __A, __B, 4); 
+}
+
 __m512 test_mm512_range_ps(__m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.512
@@ -743,3 +817,397 @@
   return _mm512_maskz_reduce_round_ps(__U, __A, 4, 8); 
 }
 
+__m128 test_mm_reduce_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_reduce_ss(__A, __B, 4);
+}
+
+__m128 test_mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_mask_reduce_ss(__W, __U, __A, __B, 4);
+}
+
+__m128 test_mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_maskz_reduce_ss(__U, __A, __B, 4);
+}
+
+__m128 test_mm_reduce_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_reduce_round_ss(__A, __B, 4, 8);
+}
+
+__m128 test_mm_mask_reduce_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_mask_reduce_round_ss(__W, __U, __A, __B, 4, 8);
+}
+
+__m128 test_mm_maskz_reduce_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_maskz_reduce_round_ss(__U, __A, __B, 4, 8);
+}
+
+__m128d test_mm_reduce_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_reduce_sd(__A, __B, 4);
+}
+
+__m128d test_mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_mask_reduce_sd(__W, __U, __A, __B, 4);
+}
+
+__m128d test_mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_maskz_reduce_sd(__U, __A, __B, 4);
+}
+
+__m128d test_mm_reduce_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_reduce_round_sd(__A, __B, 4, 8);
+}
+
+__m128d test_mm_mask_reduce_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_mask_reduce_round_sd(__W, __U, __A, __B, 4, 8);
+}
+
+__m128d test_mm_maskz_reduce_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_maskz_reduce_round_sd(__U, __A, __B, 4, 8);
+}
+
+__mmask16 test_mm512_movepi32_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi32_mask
+  // CHECK: @llvm.x86.avx512.cvtd2mask.512
+  return _mm512_movepi32_mask(__A); 
+}
+
+__m512i test_mm512_movm_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi32
+  // CHECK: @llvm.x86.avx512.cvtmask2d.512
+  return _mm512_movm_epi32(__A); 
+}
+
+__m512i test_mm512_movm_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi64
+  // CHECK: @llvm.x86.avx512.cvtmask2q.512
+  return _mm512_movm_epi64(__A); 
+}
+
+__mmask8 test_mm512_movepi64_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi64_mask
+  // CHECK: @llvm.x86.avx512.cvtq2mask.512
+  return _mm512_movepi64_mask(__A); 
+}
+
+__m512 test_mm512_broadcast_f32x2(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm512_broadcast_f32x2(__A); 
+}
+
+__m512 test_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm512_mask_broadcast_f32x2(__O, __M, __A); 
+}
+
+__m512 test_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm512_maskz_broadcast_f32x2(__M, __A); 
+}
+
+__m512 test_mm512_broadcast_f32x8(__m256 __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x8
+  return _mm512_broadcast_f32x8(__A); 
+}
+
+__m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x8
+  return _mm512_mask_broadcast_f32x8(__O, __M, __A); 
+}
+
+__m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x8
+  return _mm512_maskz_broadcast_f32x8(__M, __A); 
+}
+
+__m512d test_mm512_broadcast_f64x2(__m128d __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm512_broadcast_f64x2(__A); 
+}
+
+__m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm512_mask_broadcast_f64x2(__O, __M, __A); 
+}
+
+__m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm512_maskz_broadcast_f64x2(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i32x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm512_broadcast_i32x2(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm512_mask_broadcast_i32x2(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm512_maskz_broadcast_i32x2(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i32x8(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x8
+  return _mm512_broadcast_i32x8(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x8
+  return _mm512_mask_broadcast_i32x8(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x8
+  return _mm512_maskz_broadcast_i32x8(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i64x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm512_broadcast_i64x2(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm512_mask_broadcast_i64x2(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm512_maskz_broadcast_i64x2(__M, __A); 
+}
+__m256 test_mm512_extractf32x8_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_extractf32x8_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x8
+  return _mm512_extractf32x8_ps(__A, 1); 
+}
+
+__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_extractf32x8_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x8
+  return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); 
+}
+
+__m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extractf32x8_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x8
+  return _mm512_maskz_extractf32x8_ps(__U, __A, 1); 
+}
+
+__m128d test_mm512_extractf64x2_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm512_extractf64x2_pd(__A, 3); 
+}
+
+__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); 
+}
+
+__m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm512_maskz_extractf64x2_pd(__U, __A, 3); 
+}
+
+__m256i test_mm512_extracti32x8_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_extracti32x8_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x8
+  return _mm512_extracti32x8_epi32(__A, 1); 
+}
+
+__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti32x8_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x8
+  return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1); 
+}
+
+__m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti32x8_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x8
+  return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); 
+}
+
+__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm512_extracti64x2_epi64(__A, 3); 
+}
+
+__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); 
+}
+
+__m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); 
+}
+
+__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm512_insertf32x8
+  // CHECK: @llvm.x86.avx512.mask.insertf32x8
+  return _mm512_insertf32x8(__A, __B, 1); 
+}
+
+__m512 test_mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf32x8
+  // CHECK: @llvm.x86.avx512.mask.insertf32x8
+  return _mm512_mask_insertf32x8(__W, __U, __A, __B, 1); 
+}
+
+__m512 test_mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf32x8
+  // CHECK: @llvm.x86.avx512.mask.insertf32x8
+  return _mm512_maskz_insertf32x8(__U, __A, __B, 1); 
+}
+
+__m512d test_mm512_insertf64x2(__m512d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm512_insertf64x2(__A, __B, 3); 
+}
+
+__m512d test_mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm512_mask_insertf64x2(__W, __U, __A, __B, 3); 
+}
+
+__m512d test_mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm512_maskz_insertf64x2(__U, __A, __B, 3); 
+}
+
+__m512i test_mm512_inserti32x8(__m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_inserti32x8
+  // CHECK: @llvm.x86.avx512.mask.inserti32x8
+  return _mm512_inserti32x8(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti32x8
+  // CHECK: @llvm.x86.avx512.mask.inserti32x8
+  return _mm512_mask_inserti32x8(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti32x8
+  // CHECK: @llvm.x86.avx512.mask.inserti32x8
+  return _mm512_maskz_inserti32x8(__U, __A, __B, 1); 
+}
+
+__m512i test_mm512_inserti64x2(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm512_inserti64x2(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm512_mask_inserti64x2(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm512_maskz_inserti64x2(__U, __A, __B, 1); 
+}
+__mmask8 test_mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.512
+  return _mm512_mask_fpclass_pd_mask(__U, __A, 4); 
+}
+
+__mmask8 test_mm512_fpclass_pd_mask(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.512
+  return _mm512_fpclass_pd_mask(__A, 4); 
+}
+
+__mmask16 test_mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.512
+  return _mm512_mask_fpclass_ps_mask(__U, __A, 4); 
+}
+
+__mmask16 test_mm512_fpclass_ps_mask(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.512
+  return _mm512_fpclass_ps_mask(__A, 4); 
+}
+
+__mmask8 test_mm_fpclass_sd_mask(__m128d __A)  { // _sd classifies a double: operand is __m128d
+  // CHECK-LABEL: @test_mm_fpclass_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.sd
+ return _mm_fpclass_sd_mask (__A, 2);
+}
+
+__mmask8 test_mm_mask_fpclass_sd_mask(__mmask8 __U, __m128d __A)  { // _sd classifies a double: operand is __m128d
+ // CHECK-LABEL: @test_mm_mask_fpclass_sd_mask
+ // CHECK: @llvm.x86.avx512.mask.fpclass.sd
+ return _mm_mask_fpclass_sd_mask (__U,  __A, 2);
+}
+
+__mmask8 test_mm_fpclass_ss_mask(__m128 __A)  { 
+ // CHECK-LABEL: @test_mm_fpclass_ss_mask
+ // CHECK: @llvm.x86.avx512.mask.fpclass.ss
+ return _mm_fpclass_ss_mask ( __A, 2);
+}
+
+__mmask8 test_mm_mask_fpclass_ss_mask(__mmask8 __U, __m128 __A)  {
+ // CHECK-LABEL: @test_mm_mask_fpclass_ss_mask
+ // CHECK: @llvm.x86.avx512.mask.fpclass.ss
+ return _mm_mask_fpclass_ss_mask (__U, __A, 2);
+}
+
diff --git a/test/CodeGen/avx512er-builtins.c b/test/CodeGen/avx512er-builtins.c
index 7c6b050..1532935 100644
--- a/test/CodeGen/avx512er-builtins.c
+++ b/test/CodeGen/avx512er-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
diff --git a/test/CodeGen/avx512f-builtins.c b/test/CodeGen/avx512f-builtins.c
index c1f4c0e..a51a485 100644
--- a/test/CodeGen/avx512f-builtins.c
+++ b/test/CodeGen/avx512f-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -12,6 +12,41 @@
   return _mm512_sqrt_pd(a);
 }
 
+__m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_pd 
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_mask_sqrt_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_pd 
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_maskz_sqrt_pd (__U,__A);
+}
+
+__m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_round_pd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_sqrt_round_pd(__mmask8 __U,__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_pd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_sqrt_round_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_sqrt_round_pd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_sqrt_round_pd(__A,_MM_FROUND_CUR_DIRECTION);
+}
+
 __m512 test_mm512_sqrt_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_sqrt_ps
@@ -19,6 +54,41 @@
   return _mm512_sqrt_ps(a);
 }
 
+__m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_mask_sqrt_ps( __W, __U, __A);
+}
+
+__m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_maskz_sqrt_ps(__U ,__A);
+}
+
+__m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_round_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_sqrt_round_ps(__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_sqrt_round_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_sqrt_round_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_sqrt_round_ps(__A,_MM_FROUND_CUR_DIRECTION);
+}
+
 __m512d test_mm512_rsqrt14_pd(__m512d a)
 {
   // CHECK-LABEL: @test_mm512_rsqrt14_pd
@@ -26,6 +96,20 @@
   return _mm512_rsqrt14_pd(a);
 }
 
+__m512d test_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rsqrt14_pd 
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
+  return _mm512_mask_rsqrt14_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rsqrt14_pd 
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
+  return _mm512_maskz_rsqrt14_pd (__U,__A);
+}
+
 __m512 test_mm512_rsqrt14_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_rsqrt14_ps
@@ -33,6 +117,20 @@
   return _mm512_rsqrt14_ps(a);
 }
 
+__m512 test_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rsqrt14_ps 
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
+  return _mm512_mask_rsqrt14_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rsqrt14_ps 
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
+  return _mm512_maskz_rsqrt14_ps (__U,__A);
+}
+
 __m512 test_mm512_add_ps(__m512 a, __m512 b)
 {
   // CHECK-LABEL: @test_mm512_add_ps
@@ -61,27 +159,67 @@
   return _mm512_mul_pd(a, b);
 }
 
+void test_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_storeu_si512
+  // CHECK: store <16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm512_storeu_si512 ( __P,__A);
+}
+
 void test_mm512_storeu_ps(void *p, __m512 a)
 {
   // CHECK-LABEL: @test_mm512_storeu_ps
-  // CHECK: @llvm.x86.avx512.mask.storeu.ps.512
+  // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm512_storeu_ps(p, a);
 }
 
 void test_mm512_storeu_pd(void *p, __m512d a)
 {
   // CHECK-LABEL: @test_mm512_storeu_pd
-  // CHECK: @llvm.x86.avx512.mask.storeu.pd.512
+  // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm512_storeu_pd(p, a);
 }
 
 void test_mm512_mask_store_ps(void *p, __m512 a, __mmask16 m)
 {
   // CHECK-LABEL: @test_mm512_mask_store_ps
-  // CHECK: @llvm.x86.avx512.mask.store.ps.512
+  // CHECK: @llvm.masked.store.v16f32.p0v16f32(<16 x float> %{{.*}}, <16 x float>* %{{.*}}, i32 64, <16 x i1> %{{.*}})
   _mm512_mask_store_ps(p, m, a);
 }
 
+void test_mm512_store_si512 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_store_si512 
+  // CHECK: load <8 x i64>, <8 x i64>* %__A.addr.i, align 64
+  // CHECK: [[SI512_3:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: bitcast i8* [[SI512_3]] to <8 x i64>*
+  // CHECK: store <8 x i64>  
+  _mm512_store_si512 ( __P,__A);
+}
+
+void test_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_store_epi32 
+  // CHECK: load <8 x i64>, <8 x i64>* %__A.addr.i, align 64
+  // CHECK: [[Si32_3:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: bitcast i8* [[Si32_3]] to <8 x i64>*
+  // CHECK: store <8 x i64>  
+  _mm512_store_epi32 ( __P,__A);
+}
+
+void test_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_store_epi64 
+  // CHECK: load <8 x i64>, <8 x i64>* %__A.addr.i, align 64
+  // CHECK: [[SI64_3:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: bitcast i8* [[SI64_3]] to <8 x i64>*
+  // CHECK: store <8 x i64>  
+  _mm512_store_epi64 ( __P,__A);
+}
+
 void test_mm512_store_ps(void *p, __m512 a)
 {
   // CHECK-LABEL: @test_mm512_store_ps
@@ -89,13 +227,6 @@
   _mm512_store_ps(p, a);
 }
 
-void test_mm512_mask_store_pd(void *p, __m512d a, __mmask8 m)
-{
-  // CHECK-LABEL: @test_mm512_mask_store_pd
-  // CHECK: @llvm.x86.avx512.mask.store.pd.512
-  _mm512_mask_store_pd(p, m, a);
-}
-
 void test_mm512_store_pd(void *p, __m512d a)
 {
   // CHECK-LABEL: @test_mm512_store_pd
@@ -103,6 +234,46 @@
   _mm512_store_pd(p, a);
 }
 
+void test_mm512_mask_store_pd(void *p, __m512d a, __mmask8 m)
+{
+  // CHECK-LABEL: @test_mm512_mask_store_pd
+  // CHECK: @llvm.masked.store.v8f64.p0v8f64(<8 x double> %{{.*}}, <8 x double>* %{{.*}}, i32 64, <8 x i1> %{{.*}})
+  _mm512_mask_store_pd(p, m, a);
+}
+
+void test_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi32
+  // CHECK: @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, i32 1, <16 x i1> %{{.*}})
+  _mm512_mask_storeu_epi32(__P, __U, __A); // store intrinsic yields void; a void function must not return an expression
+}
+
+void test_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi64
+  // CHECK: @llvm.masked.store.v8i64.p0v8i64(<8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  _mm512_mask_storeu_epi64(__P, __U, __A); // store intrinsic yields void; a void function must not return an expression
+}
+
+__m512i test_mm512_loadu_si512 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_loadu_si512 
+  // CHECK: load <16 x i32>, <16 x i32>* %{{.*}}, align 1{{$}}
+  return _mm512_loadu_si512 ( __P);
+}
+
+__m512i test_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi32 
+  // CHECK: @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_loadu_epi32 (__W,__U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi64 
+  // CHECK: @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_loadu_epi64 (__W,__U, __P);
+}
+
 __m512 test_mm512_loadu_ps(void *p)
 {
   // CHECK-LABEL: @test_mm512_loadu_ps
@@ -110,6 +281,13 @@
   return _mm512_loadu_ps(p);
 }
 
+__m512 test_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_loadu_ps 
+  // CHECK: @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_loadu_ps (__W,__U, __P);
+}
+
 __m512d test_mm512_loadu_pd(void *p)
 {
   // CHECK-LABEL: @test_mm512_loadu_pd
@@ -117,34 +295,82 @@
   return _mm512_loadu_pd(p);
 }
 
-__m512 test_mm512_maskz_load_ps(void *p, __mmask16 m)
+__m512d test_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_load_ps
-  // CHECK: @llvm.x86.avx512.mask.load.ps.512
-  return _mm512_maskz_load_ps(m, p);
+  // CHECK-LABEL: @test_mm512_mask_loadu_pd 
+  // CHECK: @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_loadu_pd (__W,__U, __P);
+}
+
+__m512i test_mm512_load_si512 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_load_si512 
+  // CHECK: [[LI512_1:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: [[LI512_2:%.+]] = bitcast i8* [[LI512_1]] to <8 x i64>*
+  // CHECK: load <8 x i64>, <8 x i64>* [[LI512_2]], align 64
+  return _mm512_load_si512 ( __P);
+}
+
+__m512i test_mm512_load_epi32 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_load_epi32 
+  // CHECK: [[LI32_1:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: [[LI32_2:%.+]] = bitcast i8* [[LI32_1]] to <8 x i64>*
+  // CHECK: load <8 x i64>, <8 x i64>* [[LI32_2]], align 64
+  return _mm512_load_epi32 ( __P);
+}
+
+__m512i test_mm512_load_epi64 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_load_epi64 
+  // CHECK: [[LI64_1:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: [[LI64_2:%.+]] = bitcast i8* [[LI64_1]] to <8 x i64>*
+  // CHECK: load <8 x i64>, <8 x i64>* [[LI64_2]], align 64
+  return _mm512_load_epi64 ( __P);
 }
 
 __m512 test_mm512_load_ps(void *p)
 {
   // CHECK-LABEL: @test_mm512_load_ps
-  // CHECK: @llvm.x86.avx512.mask.load.ps.512
+  // CHECK: load <16 x float>, <16 x float>* %{{.*}}, align 64
   return _mm512_load_ps(p);
 }
 
-__m512d test_mm512_maskz_load_pd(void *p, __mmask8 m)
+__m512 test_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_load_pd
-  // CHECK: @llvm.x86.avx512.mask.load.pd.512
-  return _mm512_maskz_load_pd(m, p);
+  // CHECK-LABEL: @test_mm512_mask_load_ps 
+  // CHECK: @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_load_ps (__W,__U, __P);
+}
+
+__m512 test_mm512_maskz_load_ps(__mmask16 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_maskz_load_ps
+  // CHECK: @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_maskz_load_ps(__U, __P);
 }
 
 __m512d test_mm512_load_pd(void *p)
 {
   // CHECK-LABEL: @test_mm512_load_pd
-  // CHECK: @llvm.x86.avx512.mask.load.pd.512
+  // CHECK: load <8 x double>, <8 x double>* %{{.*}}, align 64
   return _mm512_load_pd(p);
 }
 
+__m512d test_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_load_pd 
+  // CHECK: @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_load_pd (__W,__U, __P);
+}
+
+__m512d test_mm512_maskz_load_pd(__mmask8 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_maskz_load_pd
+  // CHECK: @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_maskz_load_pd(__U, __P);
+}
+
 __m512d test_mm512_set1_pd(double d)
 {
   // CHECK-LABEL: @test_mm512_set1_pd
@@ -159,13 +385,6 @@
   return _mm512_set1_pd(d);
 }
 
-__m512d test_mm512_castpd256_pd512(__m256d a)
-{
-  // CHECK-LABEL: @test_mm512_castpd256_pd512
-  // CHECK: shufflevector <4 x double> {{.*}} <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  return _mm512_castpd256_pd512(a);
-}
-
 __mmask16 test_mm512_knot(__mmask16 a)
 {
   // CHECK-LABEL: @test_mm512_knot
@@ -180,6 +399,20 @@
   return _mm512_alignr_epi32(a, b, 2);
 }
 
+__m512i test_mm512_mask_alignr_epi32(__m512i w, __mmask16 u, __m512i a, __m512i b)
+{
+  // CHECK-LABEL: @test_mm512_mask_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  return _mm512_mask_alignr_epi32(w, u, a, b, 2);
+}
+
+__m512i test_mm512_maskz_alignr_epi32( __mmask16 u, __m512i a, __m512i b)
+{
+  // CHECK-LABEL: @test_mm512_maskz_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  return _mm512_maskz_alignr_epi32(u, a, b, 2);
+}
+
 __m512i test_mm512_alignr_epi64(__m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_alignr_epi64
@@ -187,18 +420,18 @@
   return _mm512_alignr_epi64(a, b, 2);
 }
 
-__m512d test_mm512_broadcastsd_pd(__m128d a)
+__m512i test_mm512_mask_alignr_epi64(__m512i w, __mmask8 u, __m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_broadcastsd_pd
-  // CHECK: insertelement <8 x double> {{.*}}, i32 0
-  // CHECK: insertelement <8 x double> {{.*}}, i32 1
-  // CHECK: insertelement <8 x double> {{.*}}, i32 2
-  // CHECK: insertelement <8 x double> {{.*}}, i32 3
-  // CHECK: insertelement <8 x double> {{.*}}, i32 4
-  // CHECK: insertelement <8 x double> {{.*}}, i32 5
-  // CHECK: insertelement <8 x double> {{.*}}, i32 6
-  // CHECK: insertelement <8 x double> {{.*}}, i32 7
-  return _mm512_broadcastsd_pd(a);
+  // CHECK-LABEL: @test_mm512_mask_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  return _mm512_mask_alignr_epi64(w, u, a, b, 2);
+}
+
+__m512i test_mm512_maskz_alignr_epi64( __mmask8 u, __m512i a, __m512i b)
+{
+  // CHECK-LABEL: @test_mm512_maskz_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  return _mm512_maskz_alignr_epi64(u, a, b, 2);
 }
 
 __m512d test_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -685,49 +918,53 @@
 
 __mmask16 test_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.d.512
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpeq_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.d.512
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpeq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.q.512
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpeq_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.q.512
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpeq_epi64_mask(__a, __b);
 }
 
 __mmask16 test_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.d.512
+  // CHECK: icmp sgt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpgt_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.d.512
+  // CHECK: icmp sgt <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpgt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.q.512
+  // CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpgt_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.q.512
+  // CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpgt_epi64_mask(__a, __b);
 }
 
@@ -762,13 +999,13 @@
 __mmask16 test_mm512_cmp_round_ps_mask(__m512 a, __m512 b) {
   // CHECK-LABEL: @test_mm512_cmp_round_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
-  return _mm512_cmp_round_ps_mask(a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_cmp_round_ps_mask(a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask16 test_mm512_mask_cmp_round_ps_mask(__mmask16 m, __m512 a, __m512 b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_round_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
-  return _mm512_mask_cmp_round_ps_mask(m, a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_mask_cmp_round_ps_mask(m, a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask16 test_mm512_cmp_ps_mask(__m512 a, __m512 b) {
@@ -786,13 +1023,13 @@
 __mmask8 test_mm512_cmp_round_pd_mask(__m512d a, __m512d b) {
   // CHECK-LABEL: @test_mm512_cmp_round_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
-  return _mm512_cmp_round_pd_mask(a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_cmp_round_pd_mask(a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask8 test_mm512_mask_cmp_round_pd_mask(__mmask8 m, __m512d a, __m512d b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_round_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
-  return _mm512_mask_cmp_round_pd_mask(m, a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_mask_cmp_round_pd_mask(m, a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask8 test_mm512_cmp_pd_mask(__m512d a, __m512d b) {
@@ -814,6 +1051,18 @@
   return _mm512_extractf64x4_pd(a, 1);
 }
 
+__m256d test_mm512_mask_extractf64x4_pd(__m256d  __W,__mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_mask_extractf64x4_pd
+ //CHECK:@llvm.x86.avx512.mask.vextractf64x4.512
+ return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
+}
+
+__m256d test_mm512_maskz_extractf64x4_pd(__mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd
+ //CHECK:@llvm.x86.avx512.mask.vextractf64x4.512
+ return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
+}
+
 __m128 test_mm512_extractf32x4_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_extractf32x4_ps
@@ -821,369 +1070,429 @@
   return _mm512_extractf32x4_ps(a, 1);
 }
 
+__m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_mask_extractf32x4_ps
+ //CHECK: @llvm.x86.avx512.mask.vextractf32x4.512
+ return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
+}
+
+__m128 test_mm512_maskz_extractf32x4_ps( __mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps
+ //CHECK: @llvm.x86.avx512.mask.vextractf32x4.512
+ return _mm512_maskz_extractf32x4_ps(  __U, __A, 1);
+}
+
 __mmask16 test_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 0, i16 -1)
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpeq_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 0, i16 {{.*}})
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpeq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 0, i8 -1)
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpeq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 0, i8 {{.*}})
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpeq_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 -1)
+  // CHECK: icmp sge <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpge_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 {{.*}})
+  // CHECK: icmp sge <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpge_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 -1)
+  // CHECK: icmp uge <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpge_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 {{.*}})
+  // CHECK: icmp uge <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpge_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 6, i16 -1)
+  // CHECK: icmp ugt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpgt_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 6, i16 {{.*}})
+  // CHECK: icmp ugt <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpgt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 -1)
+  // CHECK: icmp sle <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmple_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 {{.*}})
+  // CHECK: icmp sle <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmple_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 -1)
+  // CHECK: icmp ule <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmple_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 {{.*}})
+  // CHECK: icmp ule <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmple_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 -1)
+  // CHECK: icmp slt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmplt_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 {{.*}})
+  // CHECK: icmp slt <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmplt_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 -1)
+  // CHECK: icmp ult <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmplt_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 {{.*}})
+  // CHECK: icmp ult <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmplt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 -1)
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpneq_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 {{.*}})
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpneq_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 -1)
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpneq_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 {{.*}})
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpneq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
-__mmask16 test_mm512_cmp_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 -1)
-  return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, 3);
+__mmask16 test_mm512_cmp_eq_epi32_mask(__m512i __a, __m512i __b) {
+  // CHECK-LABEL: @test_mm512_cmp_eq_epi32_mask
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask16 test_mm512_mask_cmp_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 {{.*}})
-  return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, 3);
+__mmask16 test_mm512_mask_cmp_eq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  // CHECK-LABEL: @test_mm512_mask_cmp_eq_epi32_mask
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask8 test_mm512_cmp_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 -1)
-  return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, 3);
+__mmask8 test_mm512_cmp_eq_epi64_mask(__m512i __a, __m512i __b) {
+  // CHECK-LABEL: @test_mm512_cmp_eq_epi64_mask
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask8 test_mm512_mask_cmp_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 {{.*}})
-  return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, 3);
+__mmask8 test_mm512_mask_cmp_eq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  // CHECK-LABEL: @test_mm512_mask_cmp_eq_epi64_mask
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
 __mmask16 test_mm512_cmp_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 -1)
-  return (__mmask16)_mm512_cmp_epu32_mask(__a, __b, 3);
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_cmp_epu32_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
 __mmask16 test_mm512_mask_cmp_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 {{.*}})
-  return (__mmask16)_mm512_mask_cmp_epu32_mask(__u, __a, __b, 3);
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_mask_cmp_epu32_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
 __mmask8 test_mm512_cmp_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 -1)
-  return (__mmask8)_mm512_cmp_epu64_mask(__a, __b, 3);
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_cmp_epu64_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
 __mmask8 test_mm512_mask_cmp_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 {{.*}})
-  return (__mmask8)_mm512_mask_cmp_epu64_mask(__u, __a, __b, 3);
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_mask_cmp_epu64_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
 __m512i test_mm512_mask_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_and_epi32
-  // CHECK: @llvm.x86.avx512.mask.pand.d.512
+  // CHECK: and <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_and_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_and_epi32
-  // CHECK: @llvm.x86.avx512.mask.pand.d.512
+  // CHECK: and <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_and_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_and_epi64
-  // CHECK: @llvm.x86.avx512.mask.pand.q.512
+  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_and_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_and_epi64
-  // CHECK: @llvm.x86.avx512.mask.pand.q.512
+  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_and_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_or_epi32
-  // CHECK: @llvm.x86.avx512.mask.por.d.512
+  // CHECK: or <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_or_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_or_epi32
-  // CHECK: @llvm.x86.avx512.mask.por.d.512
+  // CHECK: or <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_or_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_or_epi64
-  // CHECK: @llvm.x86.avx512.mask.por.q.512
+  // CHECK: %[[OR_RES:.*]] = or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_or_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_or_epi64
-  // CHECK: @llvm.x86.avx512.mask.por.q.512
+  // CHECK: %[[OR_RES:.*]] = or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_or_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_xor_epi32
-  // CHECK: @llvm.x86.avx512.mask.pxor.d.512
+  // CHECK: xor <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_xor_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_xor_epi32
-  // CHECK: @llvm.x86.avx512.mask.pxor.d.512
+  // CHECK: xor <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_xor_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_xor_epi64
-  // CHECK: @llvm.x86.avx512.mask.pxor.q.512
+  // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_xor_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_xor_epi64
-  // CHECK: @llvm.x86.avx512.mask.pxor.q.512
+  // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_xor_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_and_epi32
-  // CHECK: and <8 x i64>
+  // CHECK: and <16 x i32>
   return _mm512_and_epi32(__a, __b);
 }
 
@@ -1195,7 +1504,7 @@
 
 __m512i test_mm512_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_or_epi32
-  // CHECK: or <8 x i64>
+  // CHECK: or <16 x i32>
   return _mm512_or_epi32(__a, __b);
 }
 
@@ -1207,7 +1516,7 @@
 
 __m512i test_mm512_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_xor_epi32
-  // CHECK: xor <8 x i64>
+  // CHECK: xor <16 x i32>
   return _mm512_xor_epi32(__a, __b);
 }
 
@@ -1218,40 +1527,61 @@
 }
 
 __m512i test_mm512_maskz_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B){
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
+  // CHECK-LABEL: @test_mm512_maskz_andnot_epi32
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_andnot_epi32(__k,__A,__B);
 }
 
 __m512i test_mm512_mask_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                       __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
+  // CHECK-LABEL: @test_mm512_mask_andnot_epi32
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_andnot_epi32(__src,__k,__A,__B);
 }
 
+__m512i test_mm512_andnot_si512(__m512i __A, __m512i __B)
+{
+  //CHECK-LABEL: @test_mm512_andnot_si512
+  //CHECK: load {{.*}}%__A.addr.i, align 64
+  //CHECK: %neg.i = xor{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: load {{.*}}%__B.addr.i, align 64
+  //CHECK: and <8 x i64> %neg.i,{{.*}}
+
+  return _mm512_andnot_si512(__A, __B);
+}
+
 __m512i test_mm512_andnot_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
+  // CHECK-LABEL: @test_mm512_andnot_epi32
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   return _mm512_andnot_epi32(__A,__B);
 }
 
 __m512i test_mm512_maskz_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
+  // CHECK-LABEL: @test_mm512_maskz_andnot_epi64
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_andnot_epi64(__k,__A,__B);
 }
 
 __m512i test_mm512_mask_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B, 
                                       __m512i __src) {
   //CHECK-LABEL: @test_mm512_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_andnot_epi64(__src,__k,__A,__B);
 }
 
 __m512i test_mm512_andnot_epi64(__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_andnot_epi64(__A,__B);
 }
 
@@ -1690,10 +2020,15 @@
   // CHECK: @llvm.x86.avx512.mask.div.pd.512
   return _mm512_maskz_div_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); 
 }
-__m512d test_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_pd
+__m512d test_mm512_div_pd(__m512d __a, __m512d __b) {
+  // CHECK-LABEL: @test_mm512_div_pd
+  // CHECK: fdiv <8 x double>
+  return _mm512_div_pd(__a,__b); 
+}
+__m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d __b) {
+  // CHECK-LABEL: @test_mm512_mask_div_pd
   // CHECK: @llvm.x86.avx512.mask.div.pd.512
-  return _mm512_mask_div_pd(__W,__U,__A,__B); 
+  return _mm512_mask_div_pd(__w,__u,__a,__b); 
 }
 __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_div_pd
@@ -1715,6 +2050,11 @@
   // CHECK: @llvm.x86.avx512.mask.div.ps.512
   return _mm512_maskz_div_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); 
 }
+__m512 test_mm512_div_ps(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_div_ps
+  // CHECK: fdiv <16 x float>
+  return _mm512_div_ps(__A,__B); 
+}
 __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_div_ps
   // CHECK: @llvm.x86.avx512.mask.div.ps.512
@@ -1899,3 +2239,5377 @@
   // CHECK: ret <8 x i64> undef
   return _mm512_undefined_epi32();
 }
+
+__m512i test_mm512_cvtepi8_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512
+  return _mm512_cvtepi8_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512
+  return _mm512_mask_cvtepi8_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512
+  return _mm512_maskz_cvtepi8_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepi8_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512
+  return _mm512_cvtepi8_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512
+  return _mm512_mask_cvtepi8_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512
+  return _mm512_maskz_cvtepi8_epi64(__U, __A); 
+}
+
+__m512i test_mm512_cvtepi32_epi64(__m256i __X) {
+  // CHECK-LABEL: @test_mm512_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512
+  return _mm512_cvtepi32_epi64(__X); 
+}
+
+__m512i test_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512
+  return _mm512_mask_cvtepi32_epi64(__W, __U, __X); 
+}
+
+__m512i test_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512
+  return _mm512_maskz_cvtepi32_epi64(__U, __X); 
+}
+
+__m512i test_mm512_cvtepi16_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512
+  return _mm512_cvtepi16_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512
+  return _mm512_mask_cvtepi16_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512
+  return _mm512_maskz_cvtepi16_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepi16_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512
+  return _mm512_cvtepi16_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512
+  return _mm512_mask_cvtepi16_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512
+  return _mm512_maskz_cvtepi16_epi64(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu8_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512
+  return _mm512_cvtepu8_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512
+  return _mm512_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512
+  return _mm512_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu8_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512
+  return _mm512_cvtepu8_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512
+  return _mm512_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512
+  return _mm512_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu32_epi64(__m256i __X) {
+  // CHECK-LABEL: @test_mm512_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512
+  return _mm512_cvtepu32_epi64(__X); 
+}
+
+__m512i test_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512
+  return _mm512_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m512i test_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512
+  return _mm512_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m512i test_mm512_cvtepu16_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512
+  return _mm512_cvtepu16_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512
+  return _mm512_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512
+  return _mm512_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu16_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512
+  return _mm512_cvtepu16_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512
+  return _mm512_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512
+  return _mm512_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+
+__m512i test_mm512_rol_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.512
+  return _mm512_rol_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.512
+  return _mm512_mask_rol_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.512
+  return _mm512_maskz_rol_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_rol_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.512
+  return _mm512_rol_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.512
+  return _mm512_mask_rol_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.512
+  return _mm512_maskz_rol_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_rolv_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+  return _mm512_rolv_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+  return _mm512_mask_rolv_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+  return _mm512_maskz_rolv_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_rolv_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+  return _mm512_rolv_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+  return _mm512_mask_rolv_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+  return _mm512_maskz_rolv_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.512
+  return _mm512_ror_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.512
+  return _mm512_mask_ror_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.512
+  return _mm512_maskz_ror_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_ror_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.512
+  return _mm512_ror_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.512
+  return _mm512_mask_ror_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.512
+  return _mm512_maskz_ror_epi64(__U, __A, 5); 
+}
+
+
+__m512i test_mm512_rorv_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+  return _mm512_rorv_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+  return _mm512_mask_rorv_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+  return _mm512_maskz_rorv_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_rorv_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+  return _mm512_rorv_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+  return _mm512_mask_rorv_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+  return _mm512_maskz_rorv_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_slli_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_slli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.di.512
+  return _mm512_slli_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_slli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.di.512
+  return _mm512_mask_slli_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_slli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.di.512
+  return _mm512_maskz_slli_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_slli_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_slli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.qi.512
+  return _mm512_slli_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_slli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.qi.512
+  return _mm512_mask_slli_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_slli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.qi.512
+  return _mm512_maskz_slli_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_srli_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.512
+  return _mm512_srli_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.512
+  return _mm512_mask_srli_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.512
+  return _mm512_maskz_srli_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_srli_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.512
+  return _mm512_srli_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.512
+  return _mm512_mask_srli_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.512
+  return _mm512_maskz_srli_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_load_epi32
+  // CHECK: @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_load_epi32(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_load_epi32(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_load_epi32
+  // CHECK: @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_maskz_load_epi32(__U, __P); 
+}
+
+__m512i test_mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_mov_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_mov_epi32(__U, __A); 
+}
+
+__m512i test_mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_mov_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_mov_epi64(__U, __A); 
+}
+
+__m512i test_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_load_epi64
+  // CHECK: @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_load_epi64(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_load_epi64
+  // CHECK: @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_maskz_load_epi64(__U, __P); 
+}
+
+void test_mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_store_epi32
+  // CHECK: @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, i32 64, <16 x i1> %{{.*}})
+  _mm512_mask_store_epi32(__P, __U, __A); // enclosing function is void, so invoke the store as a plain statement
+}
+
+void test_mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_store_epi64
+  // CHECK: @llvm.masked.store.v8i64.p0v8i64(<8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, i32 64, <8 x i1> %{{.*}})
+  _mm512_mask_store_epi64(__P, __U, __A); // enclosing function is void, so invoke the store as a plain statement
+}
+
+__m512d test_mm512_movedup_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_movedup_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  return _mm512_movedup_pd(__A);
+}
+
+__m512d test_mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_movedup_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_movedup_pd(__W, __U, __A);
+}
+
+__m512d test_mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_movedup_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_movedup_pd(__U, __A);
+}
+
+int test_mm_comi_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_comi_round_sd
+  // CHECK: @llvm.x86.avx512.vcomi.sd
+  return _mm_comi_round_sd(__A, __B, 5, 3); 
+}
+
+int test_mm_comi_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_comi_round_ss
+  // CHECK: @llvm.x86.avx512.vcomi.ss
+  return _mm_comi_round_ss(__A, __B, 5, 3); 
+}
+
+__m512d test_mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_round_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_fixupimm_round_pd(__A, __B, __C, 5, 8); 
+}
+
+__m512d test_mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_round_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_mask_fixupimm_round_pd(__A, __U, __B, __C, 5, 8); 
+}
+
+__m512d test_mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_fixupimm_pd(__A, __B, __C, 5); 
+}
+
+__m512d test_mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+}
+
+__m512d test_mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_round_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.512
+  return _mm512_maskz_fixupimm_round_pd(__U, __A, __B, __C, 5, 8); 
+}
+
+__m512d test_mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.512
+  return _mm512_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+}
+
+__m512 test_mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_round_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_fixupimm_round_ps(__A, __B, __C, 5, 8); 
+}
+
+__m512 test_mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_round_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_mask_fixupimm_round_ps(__A, __U, __B, __C, 5, 8); 
+}
+
+__m512 test_mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_fixupimm_ps(__A, __B, __C, 5); 
+}
+
+__m512 test_mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+}
+
+__m512 test_mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_round_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.512
+  return _mm512_maskz_fixupimm_round_ps(__U, __A, __B, __C, 5, 8); 
+}
+
+__m512 test_mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.512
+  return _mm512_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+}
+
+__m128d test_mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_round_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_round_sd(__A, __B, __C, 5, 8); 
+}
+
+__m128d test_mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_round_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_round_sd(__A, __U, __B, __C, 5, 8); 
+}
+
+__m128d test_mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_sd(__A, __B, __C, 5); 
+}
+
+__m128d test_mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_sd(__A, __U, __B, __C, 5); 
+}
+
+__m128d test_mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_round_sd(__U, __A, __B, __C, 5, 8); 
+}
+
+__m128d test_mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_sd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_sd(__U, __A, __B, __C, 5); 
+}
+
+__m128 test_mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_round_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_round_ss(__A, __B, __C, 5, 8); 
+}
+
+__m128 test_mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_round_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_round_ss(__A, __U, __B, __C, 5, 8); 
+}
+
+__m128 test_mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_ss(__A, __B, __C, 5); 
+}
+
+__m128 test_mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_ss(__A, __U, __B, __C, 5); 
+}
+
+__m128 test_mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_round_ss(__U, __A, __B, __C, 5, 8); 
+}
+
+__m128 test_mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_ss
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_ss(__U, __A, __B, __C, 5); 
+}
+
+__m128d test_mm_getexp_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getexp_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_getexp_round_sd(__A, __B, 8); 
+}
+
+__m128d test_mm_getexp_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getexp_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_getexp_sd(__A, __B); 
+}
+
+__m128 test_mm_getexp_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getexp_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_getexp_round_ss(__A, __B, 8); 
+}
+
+__m128 test_mm_getexp_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getexp_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_getexp_ss(__A, __B); 
+}
+
+__m128d test_mm_getmant_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getmant_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_getmant_round_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); 
+}
+
+__m128d test_mm_getmant_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getmant_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_getmant_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); 
+}
+
+__m128 test_mm_getmant_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getmant_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_getmant_round_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); 
+}
+
+__m128 test_mm_getmant_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getmant_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_getmant_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); 
+}
+
+__mmask16 test_mm512_kmov(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm512_kmov
+  // CHECK: load i16, i16* %__A.addr.i, align 2
+  return _mm512_kmov(__A); 
+}
+
+__m512d test_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_unpackhi_pd(__W, __U, __A, __B); 
+}
+#if __x86_64__
+long long test_mm_cvt_roundsd_si64(__m128d __A) { // signed 64-bit result, like the _i64 variant of this test
+  // CHECK-LABEL: @test_mm_cvt_roundsd_si64
+  // CHECK: @llvm.x86.avx512.vcvtsd2si64
+  return _mm_cvt_roundsd_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+__m512i test_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi32
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.d.512
+  return _mm512_mask2_permutex2var_epi32(__A, __I, __U, __B); 
+}
+__m512i test_mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpackhi_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  return _mm512_unpackhi_epi32(__A, __B); 
+}
+
+__m512d test_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_unpackhi_pd(__U, __A, __B); 
+}
+#if __x86_64__
+long long test_mm_cvt_roundsd_i64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_i64
+  // CHECK: @llvm.x86.avx512.vcvtsd2si64
+  return _mm_cvt_roundsd_i64(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+#endif
+__m512d test_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.pd.512
+  return _mm512_mask2_permutex2var_pd(__A, __I, __U, __B); 
+}
+__m512i test_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_unpackhi_epi32(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_unpackhi_ps(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_unpackhi_ps(__U, __A, __B); 
+}
+
+__m512d test_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_unpacklo_pd(__W, __U, __A, __B); 
+}
+
+__m512d test_mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_unpacklo_pd(__U, __A, __B); 
+}
+
+__m512 test_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_unpacklo_ps(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_unpacklo_ps(__U, __A, __B); 
+}
+int test_mm_cvt_roundsd_si32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_si32
+  // CHECK: @llvm.x86.avx512.vcvtsd2si32
+  return _mm_cvt_roundsd_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvt_roundsd_i32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_i32
+  // CHECK: @llvm.x86.avx512.vcvtsd2si32
+  return _mm_cvt_roundsd_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvt_roundsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_u32
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi32
+  return _mm_cvt_roundsd_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvtsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtsd_u32
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi32
+  return _mm_cvtsd_u32(__A); 
+}
+
+#ifdef __x86_64__
+unsigned long long test_mm_cvt_roundsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_u64
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi64
+  return _mm_cvt_roundsd_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvtsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtsd_u64
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi64
+  return _mm_cvtsd_u64(__A); 
+}
+#endif
+
+int test_mm_cvt_roundss_si32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_si32
+  // CHECK: @llvm.x86.avx512.vcvtss2si32
+  return _mm_cvt_roundss_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvt_roundss_i32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_i32
+  // CHECK: @llvm.x86.avx512.vcvtss2si32
+  return _mm_cvt_roundss_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+long long test_mm_cvt_roundss_si64(__m128 __A) { // 64-bit return type so the converted value is not truncated
+  // CHECK-LABEL: @test_mm_cvt_roundss_si64
+  // CHECK: @llvm.x86.avx512.vcvtss2si64
+  return _mm_cvt_roundss_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvt_roundss_i64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_i64
+  // CHECK: @llvm.x86.avx512.vcvtss2si64
+  return _mm_cvt_roundss_i64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+unsigned test_mm_cvt_roundss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_u32
+  // CHECK: @llvm.x86.avx512.vcvtss2usi32
+  return _mm_cvt_roundss_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvtss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtss_u32
+  // CHECK: @llvm.x86.avx512.vcvtss2usi32
+  return _mm_cvtss_u32(__A); 
+}
+
+#ifdef __x86_64__
+unsigned long long test_mm_cvt_roundss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_u64
+  // CHECK: @llvm.x86.avx512.vcvtss2usi64
+  return _mm_cvt_roundss_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvtss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtss_u64
+  // CHECK: @llvm.x86.avx512.vcvtss2usi64
+  return _mm_cvtss_u64(__A); 
+}
+#endif
+
+int test_mm_cvtt_roundsd_i32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_i32
+  // CHECK: @llvm.x86.avx512.cvttsd2si
+  return _mm_cvtt_roundsd_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvtt_roundsd_si32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_si32
+  // CHECK: @llvm.x86.avx512.cvttsd2si
+  return _mm_cvtt_roundsd_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvttsd_i32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_i32
+  // CHECK: @llvm.x86.avx512.cvttsd2si
+  return _mm_cvttsd_i32(__A); 
+}
+
+#ifdef __x86_64__
+long long test_mm_cvtt_roundsd_si64(__m128d __A) { // signed 64-bit result, like the _i64 variant of this test
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_si64
+  // CHECK: @llvm.x86.avx512.cvttsd2si64
+  return _mm_cvtt_roundsd_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvtt_roundsd_i64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_i64
+  // CHECK: @llvm.x86.avx512.cvttsd2si64
+  return _mm_cvtt_roundsd_i64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvttsd_i64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_i64
+  // CHECK: @llvm.x86.avx512.cvttsd2si64
+  return _mm_cvttsd_i64(__A); 
+}
+#endif
+
+unsigned test_mm_cvtt_roundsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_u32
+  // CHECK: @llvm.x86.avx512.cvttsd2usi
+  return _mm_cvtt_roundsd_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvttsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_u32
+  // CHECK: @llvm.x86.avx512.cvttsd2usi
+  return _mm_cvttsd_u32(__A); 
+}
+
+#ifdef __x86_64__
+unsigned long long test_mm_cvtt_roundsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_u64
+  // CHECK: @llvm.x86.avx512.cvttsd2usi64
+  return _mm_cvtt_roundsd_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvttsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_u64
+  // CHECK: @llvm.x86.avx512.cvttsd2usi64
+  return _mm_cvttsd_u64(__A); 
+}
+#endif
+
+int test_mm_cvtt_roundss_i32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_i32
+  // CHECK: @llvm.x86.avx512.cvttss2si
+  return _mm_cvtt_roundss_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvtt_roundss_si32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_si32
+  // CHECK: @llvm.x86.avx512.cvttss2si
+  return _mm_cvtt_roundss_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvttss_i32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_i32
+  // CHECK: @llvm.x86.avx512.cvttss2si
+  return _mm_cvttss_i32(__A); 
+}
+
+#ifdef __x86_64__
+long long test_mm_cvtt_roundss_i64(__m128 __A) { // integer return type: the intrinsic yields a 64-bit integer, not a float
+  // CHECK-LABEL: @test_mm_cvtt_roundss_i64
+  // CHECK: @llvm.x86.avx512.cvttss2si64
+  return _mm_cvtt_roundss_i64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvtt_roundss_si64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_si64
+  // CHECK: @llvm.x86.avx512.cvttss2si64
+  return _mm_cvtt_roundss_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvttss_i64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_i64
+  // CHECK: @llvm.x86.avx512.cvttss2si64
+  return _mm_cvttss_i64(__A); 
+}
+#endif
+
+unsigned test_mm_cvtt_roundss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_u32
+  // CHECK: @llvm.x86.avx512.cvttss2usi
+  return _mm_cvtt_roundss_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvttss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_u32
+  // CHECK: @llvm.x86.avx512.cvttss2usi
+  return _mm_cvttss_u32(__A); 
+}
+
+#ifdef __x86_64__
+unsigned long long test_mm_cvtt_roundss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_u64
+  // CHECK: @llvm.x86.avx512.cvttss2usi64
+  return _mm_cvtt_roundss_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvttss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_u64
+  // CHECK: @llvm.x86.avx512.cvttss2usi64
+  return _mm_cvttss_u64(__A); 
+}
+#endif
+
+__m512i test_mm512_cvtt_roundps_epu32(__m512 __A) 
+{
+    // CHECK-LABEL: @test_mm512_cvtt_roundps_epu32
+    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+    return _mm512_cvtt_roundps_epu32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A)
+{
+    // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epu32
+    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+    return _mm512_mask_cvtt_roundps_epu32(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_maskz_cvtt_roundps_epu32( __mmask16 __U, __m512 __A)
+{
+    // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epu32
+    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+
+    return _mm512_maskz_cvtt_roundps_epu32(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_cvt_roundps_ph(__m512  __A)
+{
+    // CHECK-LABEL: @test_mm512_cvt_roundps_ph
+    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+    return _mm512_cvt_roundps_ph(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_mask_cvt_roundps_ph(__m256i __W , __mmask16 __U, __m512  __A)
+{
+    // CHECK-LABEL: @test_mm512_mask_cvt_roundps_ph
+    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+    return _mm512_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_maskz_cvt_roundps_ph(__mmask16 __U, __m512  __A)
+{
+    // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_ph
+    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+    return _mm512_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_cvt_roundph_ps(__m256i __A)
+{
+    // CHECK-LABEL: @test_mm512_cvt_roundph_ps
+    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+    return _mm512_cvt_roundph_ps(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, __m256i __A)
+{
+    // CHECK-LABEL: @test_mm512_mask_cvt_roundph_ps
+    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+    return _mm512_mask_cvt_roundph_ps(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A)
+{
+    // CHECK-LABEL: @test_mm512_maskz_cvt_roundph_ps
+    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+    return _mm512_maskz_cvt_roundph_ps(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundepi32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_mask_cvt_roundepi32_ps(__W,__U,__A,4);
+}
+
+__m512 test_mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_maskz_cvt_roundepi32_ps(__U,__A,4);
+}
+
+__m512 test_mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U,__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundepu32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_mask_cvt_roundepu32_ps(__W,__U,__A,4);
+}
+
+__m512 test_mm512_maskz_cvt_roundepu32_ps(__mmask16 __U,__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_maskz_cvt_roundepu32_ps(__U,__A,4);
+}
+
+__m256 test_mm512_mask_cvt_roundpd_ps(__m256 W, __mmask8 U,__m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_mask_cvt_roundpd_ps(W,U,A,4);
+}
+
+__m256 test_mm512_maskz_cvt_roundpd_ps(__mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_maskz_cvt_roundpd_ps(U,A,4);
+}
+
+__m256i test_mm512_cvtt_roundpd_epi32(__m512d A)
+{
+  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_cvtt_roundpd_epi32(A,4);
+}
+
+__m256i test_mm512_mask_cvtt_roundpd_epi32(__m256i W, __mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_mask_cvtt_roundpd_epi32(W,U,A,4);
+}
+
+__m256i test_mm512_maskz_cvtt_roundpd_epi32(__mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_maskz_cvtt_roundpd_epi32(U,A,4);
+}
+
+__m512i test_mm512_mask_cvtt_roundps_epi32(__m512i W,__mmask16 U, __m512 A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_mask_cvtt_roundps_epi32(W,U,A,4);
+}
+
+__m512i test_mm512_maskz_cvtt_roundps_epi32(__mmask16 U, __m512 A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_maskz_cvtt_roundps_epi32(U,A,4);
+}
+
+__m512i test_mm512_mask_cvt_roundps_epi32(__m512i __W,__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_mask_cvt_roundps_epi32(__W,__U,__A,4);
+}
+
+__m512i test_mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_maskz_cvt_roundps_epi32(__U,__A,4);
+}
+
+__m256i test_mm512_mask_cvt_roundpd_epi32(__m256i W,__mmask8 U,__m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_mask_cvt_roundpd_epi32(W,U,A,4);
+}
+
+__m256i test_mm512_maskz_cvt_roundpd_epi32(__mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_maskz_cvt_roundpd_epi32(U,A,4);
+}
+
+__m512i test_mm512_mask_cvt_roundps_epu32(__m512i __W,__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_mask_cvt_roundps_epu32(__W,__U,__A,4);
+}
+
+__m512i test_mm512_maskz_cvt_roundps_epu32(__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_maskz_cvt_roundps_epu32(__U,__A, 4);
+}
+
+__m256i test_mm512_mask_cvt_roundpd_epu32(__m256i W, __mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_mask_cvt_roundpd_epu32(W,U,A,4);
+}
+
+__m256i test_mm512_maskz_cvt_roundpd_epu32(__mmask8 U, __m512d A) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_maskz_cvt_roundpd_epu32(U, A, 4);
+}
+
+__m512 test_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.ps.512
+  return _mm512_mask2_permutex2var_ps(__A, __I, __U, __B); 
+}
+
+__m512i test_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi64
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.q.512
+  return _mm512_mask2_permutex2var_epi64(__A, __I, __U, __B); 
+}
+
+__m512d test_mm512_permute_pd(__m512d __X) {
+  // CHECK-LABEL: @test_mm512_permute_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  return _mm512_permute_pd(__X, 2);
+}
+
+__m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) {
+  // CHECK-LABEL: @test_mm512_mask_permute_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_permute_pd(__W, __U, __X, 2);
+}
+
+__m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) {
+  // CHECK-LABEL: @test_mm512_maskz_permute_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_permute_pd(__U, __X, 2);
+}
+
+__m512 test_mm512_permute_ps(__m512 __X) {
+  // CHECK-LABEL: @test_mm512_permute_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  return _mm512_permute_ps(__X, 2);
+}
+
+__m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) {
+  // CHECK-LABEL: @test_mm512_mask_permute_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_permute_ps(__W, __U, __X, 2);
+}
+
+__m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) {
+  // CHECK-LABEL: @test_mm512_maskz_permute_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_permute_ps(__U, __X, 2);
+}
+
+__m512d test_mm512_permutevar_pd(__m512d __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512
+  return _mm512_permutevar_pd(__A, __C); 
+}
+
+__m512d test_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512
+  return _mm512_mask_permutevar_pd(__W, __U, __A, __C); 
+}
+
+__m512d test_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512
+  return _mm512_maskz_permutevar_pd(__U, __A, __C); 
+}
+
+__m512 test_mm512_permutevar_ps(__m512 __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512
+  return _mm512_permutevar_ps(__A, __C); 
+}
+
+__m512 test_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512
+  return _mm512_mask_permutevar_ps(__W, __U, __A, __C); 
+}
+
+__m512 test_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512
+  return _mm512_maskz_permutevar_ps(__U, __A, __C); 
+}
+
+__m512i test_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi32
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.d.512
+  return _mm512_maskz_permutex2var_epi32(__U, __A, __I, __B); 
+}
+
+__m512i test_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, __m512i __I, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi32 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.512
+  return _mm512_mask_permutex2var_epi32 (__A,__U,__I,__B);
+}
+
+__m512d test_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_permutex2var_pd 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.512
+  return _mm512_permutex2var_pd (__A, __I,__B);
+}
+
+__m512d test_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_pd 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.512
+  return _mm512_mask_permutex2var_pd (__A,__U,__I,__B);
+}
+
+__m512d test_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_pd
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.pd.512
+  return _mm512_maskz_permutex2var_pd(__U, __A, __I, __B); 
+}
+
+__m512 test_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_permutex2var_ps 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.512
+  return _mm512_permutex2var_ps (__A, __I, __B);
+}
+
+__m512 test_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_ps 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.512
+  return _mm512_mask_permutex2var_ps (__A,__U,__I,__B);
+}
+
+__m512 test_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_ps
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.ps.512
+  return _mm512_maskz_permutex2var_ps(__U, __A, __I, __B); 
+}
+
+__m512i test_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, __m512i __B){
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi64
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.512
+  return _mm512_mask_permutex2var_epi64(__A, __U, __I, __B);
+}
+
+__m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi64
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.512
+  return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
+}
+__mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.512
+  return _mm512_testn_epi32_mask(__A, __B); 
+}
+
+__mmask16 test_mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.512
+  return _mm512_mask_testn_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm512_testn_epi64_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.512
+  return _mm512_testn_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.512
+  return _mm512_mask_testn_epi64_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_test_epi32_mask 
+  // CHECK: @llvm.x86.avx512.ptestm.d.512
+  return _mm512_mask_test_epi32_mask (__U,__A,__B);
+}
+
+__mmask8 test_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_test_epi64_mask 
+  // CHECK: @llvm.x86.avx512.ptestm.q.512
+  return _mm512_mask_test_epi64_mask (__U,__A,__B);
+}
+
+__m512i test_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_unpackhi_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpackhi_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  return _mm512_unpackhi_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_unpackhi_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_unpackhi_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpacklo_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  return _mm512_unpacklo_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_unpacklo_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_unpacklo_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_unpacklo_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpacklo_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_unpacklo_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_unpacklo_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_unpacklo_epi64(__U, __A, __B); 
+}
+
+__m128d test_mm_roundscale_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_roundscale_round_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_roundscale_round_sd(__A, __B, 3, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_roundscale_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_roundscale_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_roundscale_sd(__A, __B, 3); 
+}
+
+__m128d test_mm_mask_roundscale_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_roundscale_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_mask_roundscale_sd(__W,__U,__A,__B,3); // masked form: __W and __U are passed ahead of the operands
+}
+__m128d test_mm_mask_roundscale_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_roundscale_round_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_mask_roundscale_round_sd(__W,__U,__A,__B,3,_MM_FROUND_CUR_DIRECTION); // explicit rounding argument
+}
+__m128d test_mm_maskz_roundscale_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_roundscale_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_maskz_roundscale_sd(__U,__A,__B,3); // maskz form: only the mask __U precedes the operands
+}
+__m128d test_mm_maskz_roundscale_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_roundscale_round_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_maskz_roundscale_round_sd(__U,__A,__B,3,_MM_FROUND_CUR_DIRECTION); // explicit rounding argument
+}
+__m128 test_mm_roundscale_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_roundscale_round_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_roundscale_round_ss(__A, __B, 3, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_roundscale_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_roundscale_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_roundscale_ss(__A, __B, 3);
+}
+
+__m128 test_mm_mask_roundscale_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_roundscale_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_mask_roundscale_ss(__W, __U, __A, __B, 3);
+}
+
+__m128 test_mm_maskz_roundscale_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_roundscale_round_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_maskz_roundscale_round_ss(__U, __A, __B, 3, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_roundscale_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_roundscale_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_maskz_roundscale_ss(__U, __A, __B, 3);
+}
+
+__m512d test_mm512_scalef_round_pd(__m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_scalef_round_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_scalef_round_pd(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_round_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_mask_scalef_round_pd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_round_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_maskz_scalef_round_pd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_scalef_pd(__m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_scalef_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_scalef_pd(__A, __B);
+}
+
+__m512d test_mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_mask_scalef_pd(__W, __U, __A, __B);
+}
+
+__m512d test_mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_maskz_scalef_pd(__U, __A, __B);
+}
+
+__m512 test_mm512_scalef_round_ps(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_scalef_round_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_scalef_round_ps(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_round_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_mask_scalef_round_ps(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_round_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_maskz_scalef_round_ps(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_scalef_ps(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_scalef_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_scalef_ps(__A, __B);
+}
+
+__m512 test_mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_mask_scalef_ps(__W, __U, __A, __B);
+}
+
+__m512 test_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_maskz_scalef_ps(__U, __A, __B);
+}
+
+__m128d test_mm_scalef_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_scalef_round_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_scalef_round_sd(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_scalef_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_scalef_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_scalef_sd(__A, __B);
+}
+
+__m128d test_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_scalef_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_mask_scalef_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_scalef_round_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_mask_scalef_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_scalef_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_maskz_scalef_sd(__U, __A, __B);
+}
+
+__m128d test_mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_scalef_round_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_maskz_scalef_round_sd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_scalef_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_scalef_round_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_scalef_round_ss(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_scalef_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_scalef_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_scalef_ss(__A, __B);
+}
+
+__m128 test_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_scalef_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_mask_scalef_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_scalef_round_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_mask_scalef_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_scalef_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_maskz_scalef_ss(__U, __A, __B);
+}
+
+__m128 test_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_scalef_round_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_maskz_scalef_round_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_srai_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.512
+  return _mm512_srai_epi32(__A, 5);
+}
+
+__m512i test_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.512
+  return _mm512_mask_srai_epi32(__W, __U, __A, 5);
+}
+
+__m512i test_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.512
+  return _mm512_maskz_srai_epi32(__U, __A, 5);
+}
+
+__m512i test_mm512_srai_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.512
+  return _mm512_srai_epi64(__A, 5);
+}
+
+__m512i test_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.512
+  return _mm512_mask_srai_epi64(__W, __U, __A, 5);
+}
+
+__m512i test_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.512
+  return _mm512_maskz_srai_epi64(__U, __A, 5);
+}
+
+__m512i test_mm512_sll_epi32(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sll_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.d
+  return _mm512_sll_epi32(__A, __B);
+}
+
+__m512i test_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sll_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.d
+  return _mm512_mask_sll_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sll_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.d
+  return _mm512_maskz_sll_epi32(__U, __A, __B);
+}
+
+__m512i test_mm512_sll_epi64(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sll_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.q
+  return _mm512_sll_epi64(__A, __B);
+}
+
+__m512i test_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sll_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.q
+  return _mm512_mask_sll_epi64(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sll_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.q
+  return _mm512_maskz_sll_epi64(__U, __A, __B);
+}
+
+__m512i test_mm512_sllv_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv.d
+  return _mm512_sllv_epi32(__X, __Y);
+}
+
+__m512i test_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv.d
+  return _mm512_mask_sllv_epi32(__W, __U, __X, __Y);
+}
+
+__m512i test_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv.d
+  return _mm512_maskz_sllv_epi32(__U, __X, __Y);
+}
+
+__m512i test_mm512_sllv_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv.q
+  return _mm512_sllv_epi64(__X, __Y);
+}
+
+__m512i test_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv.q
+  return _mm512_mask_sllv_epi64(__W, __U, __X, __Y);
+}
+
+__m512i test_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv.q
+  return _mm512_maskz_sllv_epi64(__U, __X, __Y);
+}
+
+__m512i test_mm512_sra_epi32(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d
+  return _mm512_sra_epi32(__A, __B);
+}
+
+__m512i test_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d
+  return _mm512_mask_sra_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d
+  return _mm512_maskz_sra_epi32(__U, __A, __B);
+}
+
+__m512i test_mm512_sra_epi64(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q
+  return _mm512_sra_epi64(__A, __B);
+}
+
+__m512i test_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q
+  return _mm512_mask_sra_epi64(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q
+  return _mm512_maskz_sra_epi64(__U, __A, __B);
+}
+
+__m512i test_mm512_srav_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav.d
+  return _mm512_srav_epi32(__X, __Y);
+}
+
+__m512i test_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav.d
+  return _mm512_mask_srav_epi32(__W, __U, __X, __Y);
+}
+
+__m512i test_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav.d
+  return _mm512_maskz_srav_epi32(__U, __X, __Y);
+}
+
+__m512i test_mm512_srav_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q
+  return _mm512_srav_epi64(__X, __Y);
+}
+
+__m512i test_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q
+  return _mm512_mask_srav_epi64(__W, __U, __X, __Y);
+}
+
+__m512i test_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q
+  return _mm512_maskz_srav_epi64(__U, __X, __Y);
+}
+
+__m512i test_mm512_srl_epi32(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d
+  return _mm512_srl_epi32(__A, __B);
+}
+
+__m512i test_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d
+  return _mm512_mask_srl_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d
+  return _mm512_maskz_srl_epi32(__U, __A, __B);
+}
+
+__m512i test_mm512_srl_epi64(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q
+  return _mm512_srl_epi64(__A, __B);
+}
+
+__m512i test_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q
+  return _mm512_mask_srl_epi64(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q
+  return _mm512_maskz_srl_epi64(__U, __A, __B);
+}
+
+__m512i test_mm512_srlv_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv.d
+  return _mm512_srlv_epi32(__X, __Y);
+}
+
+__m512i test_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv.d
+  return _mm512_mask_srlv_epi32(__W, __U, __X, __Y);
+}
+
+__m512i test_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv.d
+  return _mm512_maskz_srlv_epi32(__U, __X, __Y);
+}
+
+__m512i test_mm512_srlv_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv.q
+  return _mm512_srlv_epi64(__X, __Y);
+}
+
+__m512i test_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv.q
+  return _mm512_mask_srlv_epi64(__W, __U, __X, __Y);
+}
+
+__m512i test_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv.q
+  return _mm512_maskz_srlv_epi64(__U, __X, __Y);
+}
+
+__m512i test_mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.512
+  return _mm512_ternarylogic_epi32(__A, __B, __C, 4);
+}
+
+__m512i test_mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.512
+  return _mm512_mask_ternarylogic_epi32(__A, __U, __B, __C, 4);
+}
+
+__m512i test_mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.d.512
+  return _mm512_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4);
+}
+
+__m512i test_mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.512
+  return _mm512_ternarylogic_epi64(__A, __B, __C, 4);
+}
+
+__m512i test_mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.512
+  return _mm512_mask_ternarylogic_epi64(__A, __U, __B, __C, 4);
+}
+
+__m512i test_mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.q.512
+  return _mm512_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4);
+}
+
+__m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm512_shuffle_f32x4(__A, __B, 4);
+}
+
+__m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4);
+}
+
+__m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4);
+}
+
+__m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm512_shuffle_f64x2(__A, __B, 4);
+}
+
+__m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4);
+}
+
+__m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4);
+}
+
+__m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm512_shuffle_i32x4(__A, __B, 4);
+}
+
+__m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4);
+}
+
+__m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4);
+}
+
+__m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm512_shuffle_i64x2(__A, __B, 4);
+}
+
+__m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4);
+}
+
+__m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4);
+}
+
+__m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) {
+  // CHECK-LABEL: @test_mm512_shuffle_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_shuffle_pd(__M, __V, 4);
+}
+
+__m512d test_mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_shuffle_pd(__W, __U, __M, __V, 4);
+}
+
+__m512d test_mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_shuffle_pd(__U, __M, __V, 4);
+}
+
+__m512 test_mm512_shuffle_ps(__m512 __M, __m512 __V) {
+  // CHECK-LABEL: @test_mm512_shuffle_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  return _mm512_shuffle_ps(__M, __V, 4);
+}
+
+__m512 test_mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M, __m512 __V) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_shuffle_ps(__W, __U, __M, __V, 4);
+}
+
+__m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_shuffle_ps(__U, __M, __V, 4);
+}
+
+__m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_sqrt_round_sd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+  return _mm_sqrt_round_sd(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_sqrt_sd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+  return _mm_mask_sqrt_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_mask_sqrt_round_sd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+  return _mm_mask_sqrt_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_sqrt_sd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+  return _mm_maskz_sqrt_sd(__U, __A, __B);
+}
+
+__m128d test_mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: @test_mm_maskz_sqrt_round_sd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+  return _mm_maskz_sqrt_round_sd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_sqrt_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_sqrt_round_ss
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+  return _mm_sqrt_round_ss(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_sqrt_ss
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+  return _mm_mask_sqrt_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_mask_sqrt_round_ss
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+  return _mm_mask_sqrt_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_sqrt_ss
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+  return _mm_maskz_sqrt_ss(__U, __A, __B);
+}
+
+__m128 test_mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: @test_mm_maskz_sqrt_round_ss
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+  return _mm_maskz_sqrt_round_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_broadcast_f32x4(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm512_broadcast_f32x4(__A);
+}
+
+__m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm512_mask_broadcast_f32x4(__O, __M, __A);
+}
+
+__m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm512_maskz_broadcast_f32x4(__M, __A);
+}
+
+__m512d test_mm512_broadcast_f64x4(__m256d __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x4
+  return _mm512_broadcast_f64x4(__A);
+}
+
+__m512d test_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x4
+  return _mm512_mask_broadcast_f64x4(__O, __M, __A);
+}
+
+__m512d test_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x4
+  return _mm512_maskz_broadcast_f64x4(__M, __A);
+}
+
+__m512i test_mm512_broadcast_i32x4(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm512_broadcast_i32x4(__A);
+}
+
+__m512i test_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm512_mask_broadcast_i32x4(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm512_maskz_broadcast_i32x4(__M, __A);
+}
+
+__m512i test_mm512_broadcast_i64x4(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x4
+  return _mm512_broadcast_i64x4(__A);
+}
+
+__m512i test_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x4
+  return _mm512_mask_broadcast_i64x4(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x4
+  return _mm512_maskz_broadcast_i64x4(__M, __A);
+}
+
+__m512d test_mm512_broadcastsd_pd(__m128d __A) { // expects the broadcast as a zero-index shufflevector to 8 doubles
+  // CHECK-LABEL: @test_mm512_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <8 x i32> zeroinitializer
+  return _mm512_broadcastsd_pd(__A);
+}
+
+__m512d test_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) { // expects the splat, then an <8 x i1> select (operands not pinned)
+  // CHECK-LABEL: @test_mm512_mask_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_broadcastsd_pd(__O, __M, __A);
+}
+
+__m512d test_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { // same two patterns as the mask variant above
+  // CHECK-LABEL: @test_mm512_maskz_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_broadcastsd_pd(__M, __A);
+}
+
+__m512 test_mm512_broadcastss_ps(__m128 __A) { // expects the broadcast as a zero-index shufflevector to 16 floats
+  // CHECK-LABEL: @test_mm512_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <16 x i32> zeroinitializer
+  return _mm512_broadcastss_ps(__A);
+}
+
+__m512 test_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) { // expects the splat, then a <16 x i1> select (operands not pinned)
+  // CHECK-LABEL: @test_mm512_mask_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_broadcastss_ps(__O, __M, __A);
+}
+
+__m512 test_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { // same two patterns as the mask variant above
+  // CHECK-LABEL: @test_mm512_maskz_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_broadcastss_ps(__M, __A);
+}
+
+__m512i test_mm512_broadcastd_epi32(__m128i __A) { // expects the broadcast as a zero-index shufflevector to 16 x i32
+  // CHECK-LABEL: @test_mm512_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <16 x i32> zeroinitializer
+  return _mm512_broadcastd_epi32(__A);
+}
+
+__m512i test_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) { // expects the splat, then a <16 x i1> select (operands not pinned)
+  // CHECK-LABEL: @test_mm512_mask_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_broadcastd_epi32(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { // same two patterns as the mask variant above
+  // CHECK-LABEL: @test_mm512_maskz_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_broadcastd_epi32(__M, __A);
+}
+
+__m512i test_mm512_broadcastq_epi64(__m128i __A) { // expects the broadcast as a zero-index shufflevector to 8 x i64
+  // CHECK-LABEL: @test_mm512_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <8 x i32> zeroinitializer
+  return _mm512_broadcastq_epi64(__A);
+}
+
+__m512i test_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) { // expects the splat, then an <8 x i1> select (operands not pinned)
+  // CHECK-LABEL: @test_mm512_mask_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_broadcastq_epi64(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { // same two patterns as the mask variant above
+  // CHECK-LABEL: @test_mm512_maskz_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_broadcastq_epi64(__M, __A);
+}
+
+__m128i test_mm512_cvtsepi32_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
+  return _mm512_cvtsepi32_epi8(__A);
+}
+
+__m128i test_mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
+  return _mm512_mask_cvtsepi32_epi8(__O, __M, __A);
+}
+
+__m128i test_mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
+  return _mm512_maskz_cvtsepi32_epi8(__M, __A);
+}
+
+void test_mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.512
+  _mm512_mask_cvtsepi32_storeu_epi8(__P, __M, __A);
+}
+
+__m256i test_mm512_cvtsepi32_epi16(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovs.dw.512.
+  // CHECK-LABEL: @test_mm512_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
+  return _mm512_cvtsepi32_epi16(__A); 
+}
+
+__m256i test_mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
+  return _mm512_mask_cvtsepi32_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
+  return _mm512_maskz_cvtsepi32_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovs.dw.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.512
+  _mm512_mask_cvtsepi32_storeu_epi16(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtsepi64_epi8(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovs.qb.512.
+  // CHECK-LABEL: @test_mm512_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
+  return _mm512_cvtsepi64_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
+  return _mm512_mask_cvtsepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
+  return _mm512_maskz_cvtsepi64_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovs.qb.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.512
+  _mm512_mask_cvtsepi64_storeu_epi8(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m256i test_mm512_cvtsepi64_epi32(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovs.qd.512.
+  // CHECK-LABEL: @test_mm512_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
+  return _mm512_cvtsepi64_epi32(__A); 
+}
+
+__m256i test_mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
+  return _mm512_mask_cvtsepi64_epi32(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
+  return _mm512_maskz_cvtsepi64_epi32(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovs.qd.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.512
+  _mm512_mask_cvtsepi64_storeu_epi32(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtsepi64_epi16(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovs.qw.512.
+  // CHECK-LABEL: @test_mm512_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
+  return _mm512_cvtsepi64_epi16(__A); 
+}
+
+__m128i test_mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
+  return _mm512_mask_cvtsepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
+  return _mm512_maskz_cvtsepi64_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovs.qw.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.512
+  _mm512_mask_cvtsepi64_storeu_epi16(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtusepi32_epi8(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovus.db.512.
+  // CHECK-LABEL: @test_mm512_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
+  return _mm512_cvtusepi32_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
+  return _mm512_mask_cvtusepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
+  return _mm512_maskz_cvtusepi32_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovus.db.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.512
+  _mm512_mask_cvtusepi32_storeu_epi8(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m256i test_mm512_cvtusepi32_epi16(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovus.dw.512.
+  // CHECK-LABEL: @test_mm512_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
+  return _mm512_cvtusepi32_epi16(__A); 
+}
+
+__m256i test_mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
+  return _mm512_mask_cvtusepi32_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
+  return _mm512_maskz_cvtusepi32_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovus.dw.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.512
+  _mm512_mask_cvtusepi32_storeu_epi16(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtusepi64_epi8(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovus.qb.512.
+  // CHECK-LABEL: @test_mm512_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
+  return _mm512_cvtusepi64_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
+  return _mm512_mask_cvtusepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
+  return _mm512_maskz_cvtusepi64_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovus.qb.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.512
+  _mm512_mask_cvtusepi64_storeu_epi8(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m256i test_mm512_cvtusepi64_epi32(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovus.qd.512.
+  // CHECK-LABEL: @test_mm512_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
+  return _mm512_cvtusepi64_epi32(__A); 
+}
+
+__m256i test_mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
+  return _mm512_mask_cvtusepi64_epi32(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
+  return _mm512_maskz_cvtusepi64_epi32(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovus.qd.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.512
+  _mm512_mask_cvtusepi64_storeu_epi32(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtusepi64_epi16(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmovus.qw.512.
+  // CHECK-LABEL: @test_mm512_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
+  return _mm512_cvtusepi64_epi16(__A); 
+}
+
+__m128i test_mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
+  return _mm512_mask_cvtusepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
+  return _mm512_maskz_cvtusepi64_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmovus.qw.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.512
+  _mm512_mask_cvtusepi64_storeu_epi16(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtepi32_epi8(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmov.db.512.
+  // CHECK-LABEL: @test_mm512_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+  return _mm512_cvtepi32_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+  return _mm512_mask_cvtepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+  return _mm512_maskz_cvtepi32_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmov.db.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.512
+  _mm512_mask_cvtepi32_storeu_epi8(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m256i test_mm512_cvtepi32_epi16(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmov.dw.512.
+  // CHECK-LABEL: @test_mm512_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+  return _mm512_cvtepi32_epi16(__A); 
+}
+
+__m256i test_mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+  return _mm512_mask_cvtepi32_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+  return _mm512_maskz_cvtepi32_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmov.dw.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.512
+  _mm512_mask_cvtepi32_storeu_epi16(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtepi64_epi8(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmov.qb.512.
+  // CHECK-LABEL: @test_mm512_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
+  return _mm512_cvtepi64_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
+  return _mm512_mask_cvtepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
+  return _mm512_maskz_cvtepi64_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmov.qb.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.512
+  _mm512_mask_cvtepi64_storeu_epi8(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m256i test_mm512_cvtepi64_epi32(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmov.qd.512.
+  // CHECK-LABEL: @test_mm512_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+  return _mm512_cvtepi64_epi32(__A); 
+}
+
+__m256i test_mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+  return _mm512_mask_cvtepi64_epi32(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+  return _mm512_maskz_cvtepi64_epi32(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmov.qd.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.512
+  _mm512_mask_cvtepi64_storeu_epi32(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_cvtepi64_epi16(__m512i __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.pmov.qw.512.
+  // CHECK-LABEL: @test_mm512_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+  return _mm512_cvtepi64_epi16(__A); 
+}
+
+__m128i test_mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+  return _mm512_mask_cvtepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+  return _mm512_maskz_cvtepi64_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { // IR must reference the .mem variant, @llvm.x86.avx512.mask.pmov.qw.mem.512.
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.512
+  _mm512_mask_cvtepi64_storeu_epi16(__P, __M, __A); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+__m128i test_mm512_extracti32x4_epi32(__m512i __A) { // IR for the plain, mask and maskz forms below (all passing 3 as the last argument) must each reference @llvm.x86.avx512.mask.vextracti32x4.
+  // CHECK-LABEL: @test_mm512_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm512_extracti32x4_epi32(__A, 3); 
+}
+
+__m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); 
+}
+
+__m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); 
+}
+
+__m256i test_mm512_extracti64x4_epi64(__m512i __A) { // IR for the plain, mask and maskz forms below (all passing 1 as the last argument) must each reference @llvm.x86.avx512.mask.vextracti64x4.
+  // CHECK-LABEL: @test_mm512_extracti64x4_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x4
+  return _mm512_extracti64x4_epi64(__A, 1); 
+}
+
+__m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti64x4_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x4
+  return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); 
+}
+
+__m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti64x4_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x4
+  return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); 
+}
+
+__m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) { // IR for the plain, mask and maskz forms below (all passing 1 as the last argument) must each reference @llvm.x86.avx512.mask.insertf64x4.
+  // CHECK-LABEL: @test_mm512_insertf64x4
+  // CHECK: @llvm.x86.avx512.mask.insertf64x4
+  return _mm512_insertf64x4(__A, __B, 1);
+}
+
+__m512d test_mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf64x4
+  // CHECK: @llvm.x86.avx512.mask.insertf64x4
+  return _mm512_mask_insertf64x4(__W, __U, __A, __B, 1); 
+}
+
+__m512d test_mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf64x4
+  // CHECK: @llvm.x86.avx512.mask.insertf64x4
+  return _mm512_maskz_insertf64x4(__U, __A, __B, 1); 
+}
+
+__m512i test_mm512_inserti64x4(__m512i __A, __m256i __B) { // IR for the plain, mask and maskz forms below (all passing 1 as the last argument) must each reference @llvm.x86.avx512.mask.inserti64x4.
+  // CHECK-LABEL: @test_mm512_inserti64x4
+  // CHECK: @llvm.x86.avx512.mask.inserti64x4
+  return _mm512_inserti64x4(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti64x4
+  // CHECK: @llvm.x86.avx512.mask.inserti64x4
+  return _mm512_mask_inserti64x4(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti64x4
+  // CHECK: @llvm.x86.avx512.mask.inserti64x4
+  return _mm512_maskz_inserti64x4(__U, __A, __B, 1); 
+}
+
+__m512 test_mm512_insertf32x4(__m512 __A, __m128 __B) { // IR for the plain, mask and maskz forms below (all passing 1 as the last argument) must each reference @llvm.x86.avx512.mask.insertf32x4.
+  // CHECK-LABEL: @test_mm512_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm512_insertf32x4(__A, __B, 1);
+}
+
+__m512 test_mm512_mask_insertf32x4(__m512 __W, __mmask16 __U, __m512 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm512_mask_insertf32x4(__W, __U, __A, __B, 1); 
+}
+
+__m512 test_mm512_maskz_insertf32x4(__mmask16 __U, __m512 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm512_maskz_insertf32x4(__U, __A, __B, 1); 
+}
+
+__m512i test_mm512_inserti32x4(__m512i __A, __m128i __B) { // IR for the plain, mask and maskz forms below (all passing 1 as the last argument) must each reference @llvm.x86.avx512.mask.inserti32x4.
+  // CHECK-LABEL: @test_mm512_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm512_inserti32x4(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti32x4(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm512_mask_inserti32x4(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti32x4(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm512_maskz_inserti32x4(__U, __A, __B, 1); 
+}
+
+__m512d test_mm512_getmant_round_pd(__m512d __A) { // IR for the plain, mask and maskz forms below (all called with _MM_FROUND_CUR_DIRECTION) must each reference @llvm.x86.avx512.mask.getmant.pd.512.
+  // CHECK-LABEL: @test_mm512_getmant_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_getmant_round_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_mask_getmant_round_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_maskz_getmant_round_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_getmant_pd(__m512d __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.getmant.pd.512.
+  // CHECK-LABEL: @test_mm512_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512d test_mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512d test_mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512 test_mm512_getmant_round_ps(__m512 __A) { // IR for the plain, mask and maskz forms below (all called with _MM_FROUND_CUR_DIRECTION) must each reference @llvm.x86.avx512.mask.getmant.ps.512.
+  // CHECK-LABEL: @test_mm512_getmant_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_getmant_round_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_mask_getmant_round_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_maskz_getmant_round_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_getmant_ps(__m512 __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.getmant.ps.512.
+  // CHECK-LABEL: @test_mm512_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512 test_mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512 test_mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512d test_mm512_getexp_round_pd(__m512d __A) { // IR for the plain, mask and maskz forms below (all called with _MM_FROUND_CUR_DIRECTION) must each reference @llvm.x86.avx512.mask.getexp.pd.512.
+  // CHECK-LABEL: @test_mm512_getexp_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_getexp_round_pd(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_mask_getexp_round_pd(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_maskz_getexp_round_pd(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_getexp_pd(__m512d __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.getexp.pd.512.
+  // CHECK-LABEL: @test_mm512_getexp_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_getexp_pd(__A); 
+}
+
+__m512d test_mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_mask_getexp_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_maskz_getexp_pd(__U, __A); 
+}
+
+__m512 test_mm512_getexp_round_ps(__m512 __A) { // IR for the plain, mask and maskz forms below (all called with _MM_FROUND_CUR_DIRECTION) must each reference @llvm.x86.avx512.mask.getexp.ps.512.
+  // CHECK-LABEL: @test_mm512_getexp_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_getexp_round_ps(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_mask_getexp_round_ps(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_maskz_getexp_round_ps(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_getexp_ps(__m512 __A) { // IR for the plain, mask and maskz forms below must each reference @llvm.x86.avx512.mask.getexp.ps.512.
+  // CHECK-LABEL: @test_mm512_getexp_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_getexp_ps(__A); 
+}
+
+__m512 test_mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_mask_getexp_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_maskz_getexp_ps(__U, __A); 
+}
+
+__m256 test_mm512_i64gather_ps(__m512i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.qps.512.
+  // CHECK-LABEL: @test_mm512_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather.qps.512
+  return _mm512_i64gather_ps(__index, __addr, 2); 
+}
+
+__m256 test_mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather.qps.512
+  return _mm512_mask_i64gather_ps(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256i test_mm512_i64gather_epi32(__m512i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.qpi.512.
+  // CHECK-LABEL: @test_mm512_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.qpi.512
+  return _mm512_i64gather_epi32(__index, __addr, 2); 
+}
+
+__m256i test_mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.qpi.512
+  return _mm512_mask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512d test_mm512_i64gather_pd(__m512i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.qpd.512.
+  // CHECK-LABEL: @test_mm512_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather.qpd.512
+  return _mm512_i64gather_pd(__index, __addr, 2); 
+}
+
+__m512d test_mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather.qpd.512
+  return _mm512_mask_i64gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512i test_mm512_i64gather_epi64(__m512i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.qpq.512.
+  // CHECK-LABEL: @test_mm512_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.qpq.512
+  return _mm512_i64gather_epi64(__index, __addr, 2); 
+}
+
+__m512i test_mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.qpq.512
+  return _mm512_mask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512 test_mm512_i32gather_ps(__m512i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.dps.512.
+  // CHECK-LABEL: @test_mm512_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather.dps.512
+  return _mm512_i32gather_ps(__index, __addr, 2); 
+}
+
+__m512 test_mm512_mask_i32gather_ps(__m512 v1_old, __mmask16 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather.dps.512
+  return _mm512_mask_i32gather_ps(v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512i test_mm512_i32gather_epi32(__m512i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.dpi.512.
+  // CHECK-LABEL: @test_mm512_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.dpi.512
+  return _mm512_i32gather_epi32(__index, __addr, 2); 
+}
+
+__m512i test_mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.dpi.512
+  return _mm512_mask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512d test_mm512_i32gather_pd(__m256i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.dpd.512.
+  // CHECK-LABEL: @test_mm512_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather.dpd.512
+  return _mm512_i32gather_pd(__index, __addr, 2); 
+}
+
+__m512d test_mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather.dpd.512
+  return _mm512_mask_i32gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512i test_mm512_i32gather_epi64(__m256i __index, void const *__addr) { // IR for the plain and mask forms below (both passing 2 as the last argument) must each reference @llvm.x86.avx512.gather.dpq.512.
+  // CHECK-LABEL: @test_mm512_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.dpq.512
+  return _mm512_i32gather_epi64(__index, __addr, 2); 
+}
+
+__m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.dpq.512
+  return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+void test_mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.qps.512.
+  // CHECK-LABEL: @test_mm512_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.qps.512
+  _mm512_i64scatter_ps(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m512i __index, __m256 __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.qps.512
+  _mm512_mask_i64scatter_ps(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i64scatter_epi32(void *__addr, __m512i __index, __m256i __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.qpi.512.
+  // CHECK-LABEL: @test_mm512_i64scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.qpi.512
+  _mm512_i64scatter_epi32(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __index, __m256i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.qpi.512
+  _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.qpd.512.
+  // CHECK-LABEL: @test_mm512_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.qpd.512
+  _mm512_i64scatter_pd(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m512i __index, __m512d __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.qpd.512
+  _mm512_mask_i64scatter_pd(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i64scatter_epi64(void *__addr, __m512i __index, __m512i __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.qpq.512.
+  // CHECK-LABEL: @test_mm512_i64scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.qpq.512
+  _mm512_i64scatter_epi64(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m512i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.qpq.512
+  _mm512_mask_i64scatter_epi64(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.dps.512.
+  // CHECK-LABEL: @test_mm512_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.dps.512
+  _mm512_i32scatter_ps(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask, __m512i __index, __m512 __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.dps.512
+  _mm512_mask_i32scatter_ps(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i32scatter_epi32(void *__addr, __m512i __index, __m512i __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.dpi.512.
+  // CHECK-LABEL: @test_mm512_i32scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.dpi.512
+  _mm512_i32scatter_epi32(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask, __m512i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.dpi.512
+  _mm512_mask_i32scatter_epi32(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.dpd.512.
+  // CHECK-LABEL: @test_mm512_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.dpd.512
+  _mm512_i32scatter_pd(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m256i __index, __m512d __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.dpd.512
+  _mm512_mask_i32scatter_pd(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+void test_mm512_i32scatter_epi64(void *__addr, __m256i __index, __m512i __v1) { // IR for the plain and mask forms below must each reference @llvm.x86.avx512.scatter.dpq.512.
+  // CHECK-LABEL: @test_mm512_i32scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.dpq.512
+  _mm512_i32scatter_epi64(__addr, __index, __v1, 2); // enclosing function is void: call as a statement (ISO C disallows 'return <expr>;' here)
+}
+
+void test_mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.dpq.512
+  _mm512_mask_i32scatter_epi64(__addr, __mask, __index, __v1, 2); // plain call for the same reason
+}
+
+__m128d test_mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // IR for the mask and maskz forms below must each reference @llvm.x86.avx512.rsqrt14.sd.
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_sd
+  // CHECK: @llvm.x86.avx512.rsqrt14.sd
+  return _mm_mask_rsqrt14_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_sd
+  // CHECK: @llvm.x86.avx512.rsqrt14.sd
+  return _mm_maskz_rsqrt14_sd(__U, __A, __B);
+}
+
+__m128 test_mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // IR for the mask and maskz forms below must each reference @llvm.x86.avx512.rsqrt14.ss.
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_ss
+  // CHECK: @llvm.x86.avx512.rsqrt14.ss
+  return _mm_mask_rsqrt14_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_ss
+  // CHECK: @llvm.x86.avx512.rsqrt14.ss
+  return _mm_maskz_rsqrt14_ss(__U, __A, __B);
+}
+
+__m512d test_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{ // IR for the mask and maskz forms below must each reference @llvm.x86.avx512.rcp14.pd.512.
+  // CHECK-LABEL: @test_mm512_mask_rcp14_pd 
+  // CHECK: @llvm.x86.avx512.rcp14.pd.512
+  return _mm512_mask_rcp14_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rcp14_pd 
+  // CHECK: @llvm.x86.avx512.rcp14.pd.512
+  return _mm512_maskz_rcp14_pd (__U,__A);
+}
+
+__m512 test_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rcp14_ps 
+  // CHECK: @llvm.x86.avx512.rcp14.ps.512
+  return _mm512_mask_rcp14_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rcp14_ps 
+  // CHECK: @llvm.x86.avx512.rcp14.ps.512
+  return _mm512_maskz_rcp14_ps (__U,__A);
+}
+
+__m128d test_mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // Scalar rcp14 group: each wrapper must emit the rcp14.sd/.ss intrinsic named in its directive.
+  // CHECK-LABEL: @test_mm_mask_rcp14_sd
+  // CHECK: @llvm.x86.avx512.rcp14.sd
+  return _mm_mask_rcp14_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B){ // maskz form (no __W); same intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_rcp14_sd
+  // CHECK: @llvm.x86.avx512.rcp14.sd
+  return _mm_maskz_rcp14_sd(__U, __A, __B);
+}
+
+__m128 test_mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // Single-precision counterpart; expects rcp14.ss.
+  // CHECK-LABEL: @test_mm_mask_rcp14_ss
+  // CHECK: @llvm.x86.avx512.rcp14.ss
+  return _mm_mask_rcp14_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B){ // maskz single-precision form; expects rcp14.ss.
+  // CHECK-LABEL: @test_mm_maskz_rcp14_ss
+  // CHECK: @llvm.x86.avx512.rcp14.ss
+  return _mm_maskz_rcp14_ss(__U, __A, __B);
+}
+
+__m128d test_mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // Scalar getexp group: mask, maskz and their _round_ variants must all emit mask.getexp.sd/.ss.
+  // CHECK-LABEL: @test_mm_mask_getexp_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_mask_getexp_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_getexp_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_mask_getexp_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B){ // maskz form is expected to reuse the mask.getexp.sd intrinsic.
+  // CHECK-LABEL: @test_mm_maskz_getexp_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_maskz_getexp_sd(__U, __A, __B);
+}
+
+__m128d test_mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, __m128d __B){ // maskz _round_ variant; same intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_getexp_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_maskz_getexp_round_sd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // Single-precision counterpart; expects mask.getexp.ss.
+  // CHECK-LABEL: @test_mm_mask_getexp_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_mask_getexp_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_getexp_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_mask_getexp_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B){ // maskz form is expected to reuse the mask.getexp.ss intrinsic.
+  // CHECK-LABEL: @test_mm_maskz_getexp_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_maskz_getexp_ss(__U, __A, __B);
+}
+
+__m128 test_mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B){ // maskz _round_ variant; same intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_getexp_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_maskz_getexp_round_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // Scalar getmant group: mask, maskz and their _round_ variants must all emit mask.getmant.sd/.ss.
+  // CHECK-LABEL: @test_mm_mask_getmant_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_mask_getmant_sd(__W, __U, __A, __B, 1, 2); // 1, 2: presumably the interval and sign-control immediates - TODO confirm
+}
+
+__m128d test_mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_getmant_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_mask_getmant_round_sd(__W, __U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B){ // maskz form is expected to reuse the mask.getmant.sd intrinsic.
+  // CHECK-LABEL: @test_mm_maskz_getmant_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_maskz_getmant_sd(__U, __A, __B, 1, 2);
+}
+
+__m128d test_mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, __m128d __B){ // maskz _round_ variant; same intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_getmant_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_maskz_getmant_round_sd(__U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // Single-precision counterpart; expects mask.getmant.ss.
+  // CHECK-LABEL: @test_mm_mask_getmant_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_mask_getmant_ss(__W, __U, __A, __B, 1, 2);
+}
+
+__m128 test_mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_getmant_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_mask_getmant_round_ss(__W, __U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B){ // maskz form is expected to reuse the mask.getmant.ss intrinsic.
+  // CHECK-LABEL: @test_mm_maskz_getmant_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_maskz_getmant_ss(__U, __A, __B, 1, 2);
+}
+
+__m128 test_mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B){ // maskz _round_ variant; same intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_getmant_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_maskz_getmant_round_ss(__U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // Scalar fmadd_ss group: mask/maskz/mask3 wrappers must emit mask./maskz./mask3.vfmadd.ss respectively.
+  // CHECK-LABEL: @test_mm_mask_fmadd_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmadd_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz form takes the mask first and three vector operands.
+  // CHECK-LABEL: @test_mm_maskz_fmadd_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmadd_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fmadd_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 form passes the mask as the last argument.
+  // CHECK-LABEL: @test_mm_mask3_fmadd_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmadd_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // NOTE(review): fmsub_ss group matches only the vfmadd.ss intrinsic name; presumably the header negates an operand first - that negation is not verified here.
+  // CHECK-LABEL: @test_mm_mask_fmsub_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmsub_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz form; expects maskz.vfmadd.ss.
+  // CHECK-LABEL: @test_mm_maskz_fmsub_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmsub_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fmsub_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 form (mask last); expects mask3.vfmadd.ss.
+  // CHECK-LABEL: @test_mm_mask3_fmsub_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmsub_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // NOTE(review): fnmadd_ss group matches only the vfmadd.ss intrinsic name; presumably the header negates an operand first - that negation is not verified here.
+  // CHECK-LABEL: @test_mm_mask_fnmadd_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmadd_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fnmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz form; expects maskz.vfmadd.ss.
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmadd_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fnmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 form (mask last); expects mask3.vfmadd.ss.
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmadd_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // NOTE(review): fnmsub_ss group matches only the vfmadd.ss intrinsic name; presumably the header negates operands first - those negations are not verified here.
+  // CHECK-LABEL: @test_mm_mask_fnmsub_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmsub_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fnmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz form; expects maskz.vfmadd.ss.
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmsub_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 form (mask last); expects mask3.vfmadd.ss.
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmsub_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // Scalar fmadd_sd group: mask/maskz/mask3 wrappers must emit mask./maskz./mask3.vfmadd.sd respectively.
+  // CHECK-LABEL: @test_mm_mask_fmadd_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmadd_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz form takes the mask first and three vector operands.
+  // CHECK-LABEL: @test_mm_maskz_fmadd_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmadd_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fmadd_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 form passes the mask as the last argument.
+  // CHECK-LABEL: @test_mm_mask3_fmadd_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmadd_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // NOTE(review): fmsub_sd group matches only the vfmadd.sd intrinsic name; presumably the header negates an operand first - that negation is not verified here.
+  // CHECK-LABEL: @test_mm_mask_fmsub_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmsub_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz form; expects maskz.vfmadd.sd.
+  // CHECK-LABEL: @test_mm_maskz_fmsub_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmsub_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fmsub_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 form (mask last); expects mask3.vfmadd.sd.
+  // CHECK-LABEL: @test_mm_mask3_fmsub_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmsub_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // NOTE(review): fnmadd_sd group matches only the vfmadd.sd intrinsic name; presumably the header negates an operand first - that negation is not verified here.
+  // CHECK-LABEL: @test_mm_mask_fnmadd_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmadd_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fnmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz form; expects maskz.vfmadd.sd.
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmadd_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fnmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 form (mask last); expects mask3.vfmadd.sd.
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmadd_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // NOTE(review): fnmsub_sd group matches only the vfmadd.sd intrinsic name; presumably the header negates operands first - those negations are not verified here.
+  // CHECK-LABEL: @test_mm_mask_fnmsub_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmsub_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // _round_ variant, exercised with _MM_FROUND_CUR_DIRECTION only.
+  // CHECK-LABEL: @test_mm_mask_fnmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz form; expects maskz.vfmadd.sd.
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmsub_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // maskz _round_ variant; same maskz intrinsic expected.
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 form (mask last); expects mask3.vfmadd.sd.
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmsub_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // mask3 _round_ variant; same mask3 intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_permutex_pd(__m512d __X) { // permutex_pd group: immediate 0 must lower to a generic shufflevector (indices 0,0,0,0,4,4,4,4), not a target intrinsic.
+  // CHECK-LABEL: @test_mm512_permutex_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  return _mm512_permutex_pd(__X, 0);
+}
+
+__m512d test_mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X) { // Masked form: the same shuffle followed by an <8 x i1> select.
+  // CHECK-LABEL: @test_mm512_mask_permutex_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_permutex_pd(__W, __U, __X, 0);
+}
+
+__m512d test_mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X) { // maskz form: the same shuffle + select pair is expected.
+  // CHECK-LABEL: @test_mm512_maskz_permutex_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_permutex_pd(__U, __X, 0);
+}
+
+__m512i test_mm512_permutex_epi64(__m512i __X) { // permutex_epi64 group: immediate 0 must lower to a generic <8 x i64> shufflevector (indices 0,0,0,0,4,4,4,4).
+  // CHECK-LABEL: @test_mm512_permutex_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  return _mm512_permutex_epi64(__X, 0);
+}
+
+__m512i test_mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, __m512i __X) { // Masked form: the same shuffle followed by an <8 x i1> select.
+  // CHECK-LABEL: @test_mm512_mask_permutex_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_permutex_epi64(__W, __M, __X, 0);
+}
+
+__m512i test_mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X) { // maskz form: the same shuffle + select pair is expected.
+  // CHECK-LABEL: @test_mm512_maskz_permutex_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_permutex_epi64(__M, __X, 0);
+}
+
+__m512d test_mm512_permutexvar_pd(__m512i __X, __m512d __Y) { // permutexvar_pd group: unmasked, mask and maskz forms must all emit mask.permvar.df.512.
+  // CHECK-LABEL: @test_mm512_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+  return _mm512_permutexvar_pd(__X, __Y); 
+}
+
+__m512d test_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) { // Masked form with __W and __U operands.
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+  return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y); 
+}
+
+__m512d test_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) { // maskz form (no __W); same intrinsic expected.
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+  return _mm512_maskz_permutexvar_pd(__U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) { // permutexvar_epi64 group: maskz, unmasked and mask forms must all emit mask.permvar.di.512.
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+  return _mm512_maskz_permutexvar_epi64(__M, __X, __Y); 
+}
+
+__m512i test_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { // Unmasked form; same intrinsic expected.
+  // CHECK-LABEL: @test_mm512_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+  return _mm512_permutexvar_epi64(__X, __Y); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { // Masked form with __W and __M operands.
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+  return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y); 
+}
+
+__m512 test_mm512_permutexvar_ps(__m512i __X, __m512 __Y) { // permutexvar_ps group: unmasked, mask and maskz forms must all emit mask.permvar.sf.512.
+  // CHECK-LABEL: @test_mm512_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+  return _mm512_permutexvar_ps(__X, __Y); 
+}
+
+__m512 test_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { // Masked form; uses a 16-bit mask type.
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+  return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y); 
+}
+
+__m512 test_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) { // maskz form (no __W); same intrinsic expected.
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+  return _mm512_maskz_permutexvar_ps(__U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) { // permutexvar_epi32 group: maskz, unmasked and mask forms must all emit mask.permvar.si.512.
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+  return _mm512_maskz_permutexvar_epi32(__M, __X, __Y); 
+}
+
+__m512i test_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { // Unmasked form; same intrinsic expected.
+  // CHECK-LABEL: @test_mm512_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+  return _mm512_permutexvar_epi32(__X, __Y); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) { // Masked form with __W and __M operands.
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+  return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y); 
+}
+
+__mmask16 test_mm512_kand(__mmask16 __A, __mmask16 __B) { // Mask-register group: each 16-bit mask operation must emit the matching k*.w intrinsic.
+  // CHECK-LABEL: @test_mm512_kand
+  // CHECK: @llvm.x86.avx512.kand.w
+  return _mm512_kand(__A, __B); 
+}
+
+__mmask16 test_mm512_kandn(__mmask16 __A, __mmask16 __B) { // Expects kandn.w.
+  // CHECK-LABEL: @test_mm512_kandn
+  // CHECK: @llvm.x86.avx512.kandn.w
+  return _mm512_kandn(__A, __B); 
+}
+
+__mmask16 test_mm512_kor(__mmask16 __A, __mmask16 __B) { // Expects kor.w.
+  // CHECK-LABEL: @test_mm512_kor
+  // CHECK: @llvm.x86.avx512.kor.w
+  return _mm512_kor(__A, __B); 
+}
+
+int test_mm512_kortestc(__mmask16 __A, __mmask16 __B) { // Returns int rather than a mask; expects kortestc.w.
+  // CHECK-LABEL: @test_mm512_kortestc
+  // CHECK: @llvm.x86.avx512.kortestc.w
+  return _mm512_kortestc(__A, __B); 
+}
+
+int test_mm512_kortestz(__mmask16 __A, __mmask16 __B) { // Returns int rather than a mask; expects kortestz.w.
+  // CHECK-LABEL: @test_mm512_kortestz
+  // CHECK: @llvm.x86.avx512.kortestz.w
+  return _mm512_kortestz(__A, __B); 
+}
+
+__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { // Note the intrinsic spelling differs from the wrapper name: kunpck.bw.
+  // CHECK-LABEL: @test_mm512_kunpackb
+  // CHECK: @llvm.x86.avx512.kunpck.bw
+  return _mm512_kunpackb(__A, __B); 
+}
+
+__mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) { // Expects kxnor.w.
+  // CHECK-LABEL: @test_mm512_kxnor
+  // CHECK: @llvm.x86.avx512.kxnor.w
+  return _mm512_kxnor(__A, __B); 
+}
+
+__mmask16 test_mm512_kxor(__mmask16 __A, __mmask16 __B) { // Expects kxor.w.
+  // CHECK-LABEL: @test_mm512_kxor
+  // CHECK: @llvm.x86.avx512.kxor.w
+  return _mm512_kxor(__A, __B); 
+}
+
+void test_mm512_stream_si512(__m512i * __P, __m512i __A) { // Streaming store must become a plain 64-byte-aligned store tagged !nontemporal, not an intrinsic call.
+  // CHECK-LABEL: @test_mm512_stream_si512
+  // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
+  _mm512_stream_si512(__P, __A); 
+}
+
+__m512i test_mm512_stream_load_si512(void *__P) { // Streaming load, by contrast, is expected to call the movntdqa intrinsic.
+  // CHECK-LABEL: @test_mm512_stream_load_si512
+  // CHECK: @llvm.x86.avx512.movntdqa
+  return _mm512_stream_load_si512(__P); 
+}
+
+void test_mm512_stream_pd(double *__P, __m512d __A) { // Streaming store of 8 doubles: expects a 64-byte-aligned !nontemporal store; called as a statement because the test returns void.
+  // CHECK-LABEL: @test_mm512_stream_pd
+  // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 64, !nontemporal
+  _mm512_stream_pd(__P, __A); 
+}
+
+void test_mm512_stream_ps(float *__P, __m512 __A) { // Streaming store of 16 floats: expects a 64-byte-aligned store tagged !nontemporal.
+  // CHECK-LABEL: @test_mm512_stream_ps
+  // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 64, !nontemporal
+  _mm512_stream_ps(__P, __A); 
+}
+
+__m512d test_mm512_mask_compress_pd(__m512d __W, __mmask8 __U, __m512d __A) { // Compress group: mask and maskz wrappers must emit mask.compress.{pd,q,ps,d}.512 for their element type.
+  // CHECK-LABEL: @test_mm512_mask_compress_pd
+  // CHECK: @llvm.x86.avx512.mask.compress.pd.512
+  return _mm512_mask_compress_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) { // maskz form is expected to reuse mask.compress.pd.512.
+  // CHECK-LABEL: @test_mm512_maskz_compress_pd
+  // CHECK: @llvm.x86.avx512.mask.compress.pd.512
+  return _mm512_maskz_compress_pd(__U, __A); 
+}
+
+__m512i test_mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // 64-bit integer form; expects mask.compress.q.512.
+  // CHECK-LABEL: @test_mm512_mask_compress_epi64
+  // CHECK: @llvm.x86.avx512.mask.compress.q.512
+  return _mm512_mask_compress_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) { // maskz form is expected to reuse mask.compress.q.512.
+  // CHECK-LABEL: @test_mm512_maskz_compress_epi64
+  // CHECK: @llvm.x86.avx512.mask.compress.q.512
+  return _mm512_maskz_compress_epi64(__U, __A); 
+}
+
+__m512 test_mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) { // Single-precision form (16-bit mask type); expects mask.compress.ps.512.
+  // CHECK-LABEL: @test_mm512_mask_compress_ps
+  // CHECK: @llvm.x86.avx512.mask.compress.ps.512
+  return _mm512_mask_compress_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) { // maskz form is expected to reuse mask.compress.ps.512.
+  // CHECK-LABEL: @test_mm512_maskz_compress_ps
+  // CHECK: @llvm.x86.avx512.mask.compress.ps.512
+  return _mm512_maskz_compress_ps(__U, __A); 
+}
+
+__m512i test_mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // 32-bit integer form (16-bit mask type); expects mask.compress.d.512.
+  // CHECK-LABEL: @test_mm512_mask_compress_epi32
+  // CHECK: @llvm.x86.avx512.mask.compress.d.512
+  return _mm512_mask_compress_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) { // maskz form is expected to reuse mask.compress.d.512.
+  // CHECK-LABEL: @test_mm512_maskz_compress_epi32
+  // CHECK: @llvm.x86.avx512.mask.compress.d.512
+  return _mm512_maskz_compress_epi32(__U, __A); 
+}
+
+__mmask8 test_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y) { // Scalar compare-to-mask group: directives name the full .ss/.sd intrinsic so a wrong-element-type lowering cannot slip through.
+  // CHECK-LABEL: @test_mm_cmp_round_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.ss
+  return _mm_cmp_round_ss_mask(__X, __Y, 5, _MM_FROUND_CUR_DIRECTION); // 5: comparison predicate immediate (presumably _CMP_NLT_US - confirm)
+}
+
+__mmask8 test_mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) { // Masked _round_ variant; takes the incoming mask __M first.
+  // CHECK-LABEL: @test_mm_mask_cmp_round_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.ss
+  return _mm_mask_cmp_round_ss_mask(__M, __X, __Y, 5, _MM_FROUND_CUR_DIRECTION);
+}
+
+__mmask8 test_mm_cmp_ss_mask(__m128 __X, __m128 __Y) { // Non-round form; same .ss intrinsic expected.
+  // CHECK-LABEL: @test_mm_cmp_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.ss
+  return _mm_cmp_ss_mask(__X, __Y, 5);
+}
+
+__mmask8 test_mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) { // Masked non-round form; same .ss intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask_cmp_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.ss
+  return _mm_mask_cmp_ss_mask(__M, __X, __Y, 5);
+}
+
+__mmask8 test_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y) { // Double-precision counterparts below must emit the .sd intrinsic.
+  // CHECK-LABEL: @test_mm_cmp_round_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.sd
+  return _mm_cmp_round_sd_mask(__X, __Y, 5, _MM_FROUND_CUR_DIRECTION);
+}
+
+__mmask8 test_mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) { // Masked _round_ variant; takes the incoming mask __M first.
+  // CHECK-LABEL: @test_mm_mask_cmp_round_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.sd
+  return _mm_mask_cmp_round_sd_mask(__M, __X, __Y, 5, _MM_FROUND_CUR_DIRECTION);
+}
+
+__mmask8 test_mm_cmp_sd_mask(__m128d __X, __m128d __Y) { // Non-round form; same .sd intrinsic expected.
+  // CHECK-LABEL: @test_mm_cmp_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.sd
+  return _mm_cmp_sd_mask(__X, __Y, 5);
+}
+
+__mmask8 test_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) { // Masked non-round form; same .sd intrinsic expected.
+  // CHECK-LABEL: @test_mm_mask_cmp_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp.sd
+  return _mm_mask_cmp_sd_mask(__M, __X, __Y, 5);
+}
+
+__m512 test_mm512_movehdup_ps(__m512 __A) { // movehdup group: must lower to a generic shufflevector duplicating each odd-indexed element (1,1,3,3,...,15,15).
+  // CHECK-LABEL: @test_mm512_movehdup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  return _mm512_movehdup_ps(__A);
+}
+
+__m512 test_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) { // Masked form: the same shuffle followed by a <16 x i1> select.
+  // CHECK-LABEL: @test_mm512_mask_movehdup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_movehdup_ps(__W, __U, __A);
+}
+
+__m512 test_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) { // maskz form: the same shuffle + select pair is expected.
+  // CHECK-LABEL: @test_mm512_maskz_movehdup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_movehdup_ps(__U, __A);
+}
+
+__m512 test_mm512_moveldup_ps(__m512 __A) { // moveldup group: must lower to a generic shufflevector duplicating each even-indexed element (0,0,2,2,...,14,14).
+  // CHECK-LABEL: @test_mm512_moveldup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  return _mm512_moveldup_ps(__A);
+}
+
+__m512 test_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) { // Masked form: the same shuffle followed by a <16 x i1> select.
+  // CHECK-LABEL: @test_mm512_mask_moveldup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_moveldup_ps(__W, __U, __A);
+}
+
+__m512 test_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) { // maskz form: the same shuffle + select pair is expected.
+  // CHECK-LABEL: @test_mm512_maskz_moveldup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_moveldup_ps(__U, __A);
+}
+
+__m512i test_mm512_shuffle_epi32(__m512i __A) { // shuffle_epi32 group: immediate 1 must lower to a generic shufflevector repeating the pattern 1,0,0,0 in every 4-element group.
+  // CHECK-LABEL: @test_mm512_shuffle_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  return _mm512_shuffle_epi32(__A, 1); 
+}
+
+__m512i test_mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // Masked form: the same shuffle followed by a <16 x i1> select.
+  // CHECK-LABEL: @test_mm512_mask_shuffle_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_shuffle_epi32(__W, __U, __A, 1); 
+}
+
+__m512i test_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A) { // maskz form: the same shuffle + select pair is expected.
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_shuffle_epi32(__U, __A, 1); 
+}
+
+__m512d test_mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) { // Register expand group: mask and maskz wrappers must emit mask.expand.pd.512 / mask.expand.q.512.
+  // CHECK-LABEL: @test_mm512_mask_expand_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.pd.512
+  return _mm512_mask_expand_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) { // maskz form is expected to reuse mask.expand.pd.512.
+  // CHECK-LABEL: @test_mm512_maskz_expand_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.pd.512
+  return _mm512_maskz_expand_pd(__U, __A); 
+}
+
+__m512i test_mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // 64-bit integer form; expects mask.expand.q.512.
+  // CHECK-LABEL: @test_mm512_mask_expand_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.q.512
+  return _mm512_mask_expand_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) { // maskz form is expected to reuse mask.expand.q.512.
+  // CHECK-LABEL: @test_mm512_maskz_expand_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.q.512
+  return _mm512_maskz_expand_epi64(__U, __A); 
+}
+__m512i test_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_expandloadu_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.load.q.512
+  return _mm512_mask_expandloadu_epi64(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_expandloadu_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.load.q.512
+  return _mm512_maskz_expandloadu_epi64(__U, __P); 
+}
+
+__m512d test_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_expandloadu_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.load.pd.512
+  return _mm512_mask_expandloadu_pd(__W, __U, __P); 
+}
+
+__m512d test_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_expandloadu_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.load.pd.512
+  return _mm512_maskz_expandloadu_pd(__U, __P); 
+}
+
+__m512i test_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_expandloadu_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.load.d.512
+  return _mm512_mask_expandloadu_epi32(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_expandloadu_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.load.d.512
+  return _mm512_maskz_expandloadu_epi32(__U, __P); 
+}
+
+__m512 test_mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_expand_ps
+  // CHECK: @llvm.x86.avx512.mask.expand.ps.512
+  return _mm512_mask_expand_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_expand_ps
+  // CHECK: @llvm.x86.avx512.mask.expand.ps.512
+  return _mm512_maskz_expand_ps(__U, __A); 
+}
+
+__m512i test_mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_expand_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.d.512
+  return _mm512_mask_expand_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_expand_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.d.512
+  return _mm512_maskz_expand_epi32(__U, __A); 
+}
+__m512d test_mm512_cvt_roundps_pd(__m256 __A) {
+  // CHECK-LABEL: @test_mm512_cvt_roundps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_cvt_roundps_pd(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_mask_cvt_roundps_pd(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_maskz_cvt_roundps_pd(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_cvtps_pd(__m256 __A) {
+  // CHECK-LABEL: @test_mm512_cvtps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_cvtps_pd(__A); 
+}
+
+__m512d test_mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_mask_cvtps_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_maskz_cvtps_pd(__U, __A); 
+}
+__m512d test_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_pd
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_mov_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_pd
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_mov_pd(__U, __A); 
+}
+
+__m512 test_mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_ps
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_mov_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_ps
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_mov_ps(__U, __A); 
+}
+
+void test_mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_pd
+  // CHECK: @llvm.x86.avx512.mask.compress.store.pd.512
+  _mm512_mask_compressstoreu_pd(__P, __U, __A); // plain call: a void function must not 'return' an expression
+}
+
+void test_mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_epi64
+  // CHECK: @llvm.x86.avx512.mask.compress.store.q.512
+  _mm512_mask_compressstoreu_epi64(__P, __U, __A); // plain call: a void function must not 'return' an expression
+}
+
+void test_mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_ps
+  // CHECK: @llvm.x86.avx512.mask.compress.store.ps.512
+  _mm512_mask_compressstoreu_ps(__P, __U, __A); // plain call: a void function must not 'return' an expression
+}
+
+void test_mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_epi32
+  // CHECK: @llvm.x86.avx512.mask.compress.store.d.512
+  _mm512_mask_compressstoreu_epi32(__P, __U, __A); // plain call: a void function must not 'return' an expression
+}
+
+__m256i test_mm512_cvtt_roundpd_epu32(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_cvtt_roundpd_epu32(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m256i test_mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_mask_cvtt_roundpd_epu32(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m256i test_mm512_maskz_cvtt_roundpd_epu32(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_maskz_cvtt_roundpd_epu32(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m256i test_mm512_cvttpd_epu32(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_cvttpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_cvttpd_epu32(__A); 
+}
+
+__m256i test_mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvttpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_mask_cvttpd_epu32(__W, __U, __A); 
+}
+
+__m256i test_mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_maskz_cvttpd_epu32(__U, __A); 
+}
+
+__m512 test_mm512_castpd_ps (__m512d __A) // expects the cast to lower to a single IR bitcast
+{
+  // CHECK-LABEL: @test_mm512_castpd_ps
+  // CHECK: bitcast <8 x double> %{{.*}} to <16 x float>
+  return _mm512_castpd_ps (__A);
+}
+
+__m512d test_mm512_castps_pd (__m512 __A) // expects the cast to lower to a single IR bitcast
+{
+  // CHECK-LABEL: @test_mm512_castps_pd
+  // CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
+  return _mm512_castps_pd (__A);
+}
+
+__m512i test_mm512_castpd_si512 (__m512d __A) // expects the cast to lower to a single IR bitcast
+{
+  // CHECK-LABEL: @test_mm512_castpd_si512
+  // CHECK: bitcast <8 x double> %{{.*}} to <8 x i64>
+  return _mm512_castpd_si512 (__A);
+}
+
+__m512 test_mm512_castps128_ps512(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_castps128_ps512
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castps128_ps512(__A); 
+}
+
+__m512d test_mm512_castpd128_pd512(__m128d __A) {
+  // CHECK-LABEL: @test_mm512_castpd128_pd512
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castpd128_pd512(__A); 
+}
+
+__m512i test_mm512_set1_epi8(char d) // epi8 broadcast yields an integer vector, so the test returns __m512i
+{
+  // CHECK-LABEL: @test_mm512_set1_epi8
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 0
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 1
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 2
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 3
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 4
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 5
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 6
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 7
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 63
+  return _mm512_set1_epi8(d);
+}
+
+__m512i test_mm512_set1_epi16(short d) // epi16 broadcast yields an integer vector, so the test returns __m512i
+{
+  // CHECK-LABEL: @test_mm512_set1_epi16
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 0
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 1
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 2
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 3
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 4
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 5
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 6
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 7
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 31
+  return _mm512_set1_epi16(d);
+}
+
+__m512i test_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_epi32 
+  // CHECK: insertelement <16 x i32> {{.*}}, i32 15
+  return _mm512_set4_epi32 (__A,__B,__C,__D);
+}
+
+__m512i test_mm512_set4_epi64 (long long __A, long long __B, long long __C, long long __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_epi64 
+  // CHECK: insertelement <8 x i64> {{.*}}, i32 7
+  return _mm512_set4_epi64 (__A,__B,__C,__D);
+}
+
+__m512d test_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_pd 
+  // CHECK: insertelement <8 x double> {{.*}}, i32 7
+  return _mm512_set4_pd (__A,__B,__C,__D);
+}
+
+__m512 test_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_ps 
+  // CHECK: insertelement <16 x float> {{.*}}, i32 15
+  return _mm512_set4_ps (__A,__B,__C,__D);
+}
+
+__m512i test_mm512_setr4_epi32(int e0, int e1, int e2, int e3)
+{
+  // CHECK-LABEL: @test_mm512_setr4_epi32
+  // CHECK: insertelement <16 x i32> {{.*}}, i32 15
+  return _mm512_setr4_epi32(e0, e1, e2, e3);
+}
+
+__m512i test_mm512_setr4_epi64(long long e0, long long e1, long long e2, long long e3)
+{
+  // CHECK-LABEL: @test_mm512_setr4_epi64
+  // CHECK: insertelement <8 x i64> {{.*}}, i32 7
+  return _mm512_setr4_epi64(e0, e1, e2, e3);
+}
+
+__m512d test_mm512_setr4_pd(double e0, double e1, double e2, double e3) // result is <8 x double>, i.e. __m512d
+{
+  // CHECK-LABEL: @test_mm512_setr4_pd
+  // CHECK: insertelement <8 x double> {{.*}}, i32 7
+  return _mm512_setr4_pd(e0,e1,e2,e3);
+}
+
+__m512 test_mm512_setr4_ps(float e0, float e1, float e2, float e3) // result is <16 x float>, i.e. __m512
+{
+  // CHECK-LABEL: @test_mm512_setr4_ps
+  // CHECK: insertelement <16 x float> {{.*}}, i32 15
+  return _mm512_setr4_ps(e0,e1,e2,e3);
+}
+
+__m512d test_mm512_castpd256_pd512(__m256d a)
+{
+  // CHECK-LABEL: @test_mm512_castpd256_pd512
+  // CHECK: shufflevector <4 x double> {{.*}} <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castpd256_pd512(a);
+}
+
+__m256d test_mm512_castpd512_pd256 (__m512d __A) // expects a shuffle selecting the low four doubles
+{
+  // CHECK-LABEL: @test_mm512_castpd512_pd256
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm512_castpd512_pd256 (__A);
+}
+
+__m256 test_mm512_castps512_ps256 (__m512 __A) // expects a shuffle selecting the low eight floats
+{
+  // CHECK-LABEL: @test_mm512_castps512_ps256
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm512_castps512_ps256 (__A);
+}
+
+__m512i test_mm512_castps_si512 (__m512 __A) // expects the cast to lower to a single IR bitcast
+{
+  // CHECK-LABEL: @test_mm512_castps_si512
+  // CHECK: bitcast <16 x float> %{{.*}} to <8 x i64>
+  return _mm512_castps_si512 (__A);
+}
+__m512i test_mm512_castsi128_si512(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_castsi128_si512
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castsi128_si512(__A); 
+}
+
+__m512i test_mm512_castsi256_si512(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_castsi256_si512
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castsi256_si512(__A); 
+}
+
+__m512 test_mm512_castsi512_ps (__m512i __A) // expects the cast to lower to a single IR bitcast
+{
+  // CHECK-LABEL: @test_mm512_castsi512_ps
+  // CHECK: bitcast <8 x i64> %{{.*}} to <16 x float>
+  return _mm512_castsi512_ps (__A);
+}
+
+__m512d test_mm512_castsi512_pd (__m512i __A) // expects the cast to lower to a single IR bitcast
+{
+  // CHECK-LABEL: @test_mm512_castsi512_pd
+  // CHECK: bitcast <8 x i64> %{{.*}} to <8 x double>
+  return _mm512_castsi512_pd (__A);
+}
+
+__m128i test_mm512_castsi512_si128 (__m512i __A) // expects a shuffle selecting the low two i64 lanes
+{
+  // CHECK-LABEL: @test_mm512_castsi512_si128
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  return _mm512_castsi512_si128 (__A);
+}
+
+__m256i test_mm512_castsi512_si256 (__m512i __A) // expects a shuffle selecting the low four i64 lanes
+{
+  // CHECK-LABEL: @test_mm512_castsi512_si256
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm512_castsi512_si256 (__A);
+}
+
+__m128 test_mm_cvt_roundsd_ss(__m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_cvt_roundsd_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_mask_cvt_roundsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_cvt_roundsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_mask_cvt_roundsd_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_maskz_cvt_roundsd_ss(__mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvt_roundsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_maskz_cvt_roundsd_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+#ifdef __x86_64__
+__m128d test_mm_cvt_roundi64_sd(__m128d __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundi64_sd
+  // CHECK: @llvm.x86.avx512.cvtsi2sd64
+  return _mm_cvt_roundi64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvt_roundsi64_sd(__m128d __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsi64_sd
+  // CHECK: @llvm.x86.avx512.cvtsi2sd64
+  return _mm_cvt_roundsi64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+#endif
+
+__m128 test_mm_cvt_roundsi32_ss(__m128 __A, int __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsi32_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss32
+  return _mm_cvt_roundsi32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvt_roundi32_ss(__m128 __A, int __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundi32_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss32
+  return _mm_cvt_roundi32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+#ifdef __x86_64__
+__m128 test_mm_cvt_roundsi64_ss(__m128 __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsi64_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss64
+  return _mm_cvt_roundsi64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvt_roundi64_ss(__m128 __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundi64_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss64
+  return _mm_cvt_roundi64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+#endif
+
+__m128d test_mm_cvt_roundss_sd(__m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_cvt_roundss_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_mask_cvt_roundss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_cvt_roundss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_mask_cvt_roundss_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_maskz_cvt_roundss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvt_roundss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_maskz_cvt_roundss_sd( __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvtu32_sd(__m128d __A, unsigned __B) {
+  // CHECK-LABEL: @test_mm_cvtu32_sd
+  // CHECK: @llvm.x86.avx512.cvtusi2sd
+  return _mm_cvtu32_sd(__A, __B); 
+}
+
+#ifdef __x86_64__
+__m128d test_mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundu64_sd
+  // CHECK: @llvm.x86.avx512.cvtusi642sd
+  return _mm_cvt_roundu64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvtu64_sd
+  // CHECK: @llvm.x86.avx512.cvtusi642sd
+  return _mm_cvtu64_sd(__A, __B); 
+}
+#endif
+
+__m128 test_mm_cvt_roundu32_ss(__m128 __A, unsigned __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundu32_ss
+  // CHECK: @llvm.x86.avx512.cvtusi2ss
+  return _mm_cvt_roundu32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvtu32_ss(__m128 __A, unsigned __B) {
+  // CHECK-LABEL: @test_mm_cvtu32_ss
+  // CHECK: @llvm.x86.avx512.cvtusi2ss
+  return _mm_cvtu32_ss(__A, __B); 
+}
+
+#ifdef __x86_64__
+__m128 test_mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundu64_ss
+  // CHECK: @llvm.x86.avx512.cvtusi642ss
+  return _mm_cvt_roundu64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvtu64_ss
+  // CHECK: @llvm.x86.avx512.cvtusi642ss
+  return _mm_cvtu64_ss(__A, __B); 
+}
+#endif
+
+__m512i test_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvttps_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+  return _mm512_mask_cvttps_epu32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvttps_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+  return _mm512_maskz_cvttps_epu32 (__U,__A);
+}
+
+__m512 test_mm512_cvtepu32_ps (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtepu32_ps 
+  // CHECK:  @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_cvtepu32_ps (__A);
+}
+
+__m512 test_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepu32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_mask_cvtepu32_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_maskz_cvtepu32_ps (__U,__A);
+}
+
+__m512d test_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.512
+  return _mm512_mask_cvtepi32_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.512
+  return _mm512_maskz_cvtepi32_pd (__U,__A);
+}
+
+__m512 test_mm512_cvtepi32_ps (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtepi32_ps 
+  // CHECK:  @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_cvtepi32_ps (__A);
+}
+
+__m512 test_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_mask_cvtepi32_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_maskz_cvtepi32_ps (__U,__A);
+}
+
+__m512d test_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepu32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.512
+  return _mm512_mask_cvtepu32_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.512
+  return _mm512_maskz_cvtepu32_pd (__U,__A);
+}
+
+__m256 test_mm512_cvtpd_ps (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtpd_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_cvtpd_ps (__A);
+}
+
+__m256 test_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtpd_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_mask_cvtpd_ps (__W,__U,__A);
+}
+
+__m256 test_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtpd_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_maskz_cvtpd_ps (__U,__A);
+}
+
+__m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtph_ps 
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+  return _mm512_mask_cvtph_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtph_ps 
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+  return _mm512_maskz_cvtph_ps (__U,__A);
+}
+
+__m256i test_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvttpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_mask_cvttpd_epi32 (__W,__U,__A);
+}
+
+__m256i test_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_maskz_cvttpd_epi32 (__U,__A);
+}
+
+__m512i test_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvttps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_mask_cvttps_epi32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvttps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_maskz_cvttps_epi32 (__U,__A);
+}
+
+__m512i test_mm512_cvtps_epi32 (__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_cvtps_epi32 (__A);
+}
+
+__m512i test_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_mask_cvtps_epi32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_maskz_cvtps_epi32 (__U,__A);
+}
+
+__m256i test_mm512_cvtpd_epi32 (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_cvtpd_epi32 (__A);
+}
+
+__m256i test_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_mask_cvtpd_epi32 (__W,__U,__A);
+}
+
+__m256i test_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_maskz_cvtpd_epi32 (__U,__A);
+}
+
+__m256i test_mm512_cvtpd_epu32 (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtpd_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_cvtpd_epu32 (__A);
+}
+
+__m256i test_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtpd_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_mask_cvtpd_epu32 (__W,__U,__A);
+}
+
+__m256i test_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_maskz_cvtpd_epu32 (__U,__A);
+}
+
+__m256i test_mm512_mask_cvtps_ph(__m256i src, __mmask16 k, __m512 a) 
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+  return _mm512_mask_cvtps_ph(src, k, a,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_maskz_cvtps_ph (__mmask16 k, __m512 a) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+  return _mm512_maskz_cvtps_ph( k, a,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_cvtps_epu32 ( __m512 __A) 
+{
+  // CHECK-LABEL: @test_mm512_cvtps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_cvtps_epu32(__A);
+}
+
+__m512i test_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_mask_cvtps_epu32( __W, __U, __A);
+}
+__m512i test_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_maskz_cvtps_epu32( __U, __A);
+}
+
+__m512d test_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_pd 
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_mask_max_pd (__W,__U,__A,__B);
+}
+
+__m512d test_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_pd 
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_maskz_max_pd (__U,__A,__B);
+}
+
+__m512 test_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_ps 
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_mask_max_ps (__W,__U,__A,__B);
+}
+
+__m512d test_mm512_mask_max_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_round_pd
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_mask_max_round_pd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_max_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_round_pd
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_maskz_max_round_pd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_max_round_pd(__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_max_round_pd
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_max_round_pd(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_ps 
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_maskz_max_ps (__U,__A,__B);
+}
+
+__m512 test_mm512_mask_max_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_round_ps
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_mask_max_round_ps(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_max_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_round_ps
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_maskz_max_round_ps(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_max_round_ps(__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_max_round_ps
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_max_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_pd 
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_mask_min_pd (__W,__U,__A,__B);
+}
+
+__m512d test_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_pd 
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_maskz_min_pd (__U,__A,__B);
+}
+
+__m512d test_mm512_mask_min_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_round_pd
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_mask_min_round_pd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_min_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_round_pd
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_maskz_min_round_pd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_min_round_pd( __m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_min_round_pd
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_min_round_pd(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_ps 
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_mask_min_ps (__W,__U,__A,__B);
+}
+
+__m512 test_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_ps 
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_maskz_min_ps (__U,__A,__B);
+}
+
+__m512 test_mm512_mask_min_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_round_ps
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_mask_min_round_ps(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_min_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_round_ps
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_maskz_min_round_ps(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_min_round_ps(__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_min_round_ps
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_floor_ps 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_floor_ps (__W,__U,__A);
+}
+
+__m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_floor_pd 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_floor_pd (__W,__U,__A);
+}
+
+__m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_ceil_ps 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_ceil_ps (__W,__U,__A);
+}
+
+__m512d test_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_ceil_pd 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_ceil_pd (__W,__U,__A);
+}
+
+__m512 test_mm512_mask_roundscale_ps(__m512 __W, __mmask16 __U, __m512 __A) 
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_roundscale_ps(__W,__U,__A, 1);
+}
+
+__m512 test_mm512_maskz_roundscale_ps(__mmask16 __U, __m512 __A) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_maskz_roundscale_ps(__U,__A, 1);
+}
+
+__m512 test_mm512_mask_roundscale_round_ps(__m512 __A,__mmask16 __U,__m512 __C)
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_round_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_roundscale_round_ps(__A,__U,__C,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_roundscale_round_ps(__m512 __A,__mmask16 __U) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_maskz_roundscale_round_ps(__U,__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_roundscale_round_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_roundscale_round_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_roundscale_round_ps(__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_mask_roundscale_pd(__m512d __W, __mmask8 __U, __m512d __A) 
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_roundscale_pd(__W,__U,__A, 1);
+}
+
+__m512d test_mm512_maskz_roundscale_pd(__mmask8 __U, __m512d __A) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_maskz_roundscale_pd(__U,__A, 1);
+}
+
+__m512d test_mm512_mask_roundscale_round_pd(__m512d __A,__mmask8 __U,__m512d __C)
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_round_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_roundscale_round_pd(__A,__U,__C,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_roundscale_round_pd(__m512d __A,__mmask8 __U)
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_maskz_roundscale_round_pd(__U,__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_roundscale_round_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_roundscale_round_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_roundscale_round_pd(__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.d.512
+  return _mm512_mask_max_epi32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.d.512
+  return _mm512_maskz_max_epi32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.q.512
+  return _mm512_mask_max_epi64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.q.512
+  return _mm512_maskz_max_epi64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.q.512
+  return _mm512_mask_max_epu64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.q.512
+  return _mm512_maskz_max_epu64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.d.512
+  return _mm512_mask_max_epu32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.d.512
+  return _mm512_maskz_max_epu32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmins.d.512
+  return _mm512_mask_min_epi32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmins.d.512
+  return _mm512_maskz_min_epi32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pminu.d.512
+  return _mm512_mask_min_epu32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pminu.d.512
+  return _mm512_maskz_min_epu32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmins.q.512
+  return _mm512_mask_min_epi64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmins.q.512
+  return _mm512_maskz_min_epi64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pminu.q.512
+  return _mm512_mask_min_epu64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pminu.q.512
+  return _mm512_maskz_min_epu64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+    //CHECK-LABEL: @test_mm512_mask_set1_epi32
+    //CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.512
+  return _mm512_mask_set1_epi32 ( __O, __M, __A);
+}
+
+__m512i test_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+               int __E, int __F, int __G, int __H,
+               int __I, int __J, int __K, int __L,
+               int __M, int __N, int __O, int __P)
+{
+ //CHECK-LABEL: @test_mm512_set_epi32
+ //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+ return _mm512_set_epi32( __A, __B, __C, __D,__E, __F, __G, __H,
+              __I, __J, __K, __L,__M, __N, __O, __P);
+}
+
+__m512i test_mm512_setr_epi32 (int __A, int __B, int __C, int __D,
+               int __E, int __F, int __G, int __H,
+               int __I, int __J, int __K, int __L,
+               int __M, int __N, int __O, int __P)
+{
+    //CHECK-LABEL: @test_mm512_setr_epi32
+    //CHECK: load{{.*}}%__P.addr, align 4
+    //CHECK: load{{.*}}%__O.addr, align 4
+    //CHECK: load{{.*}}%__N.addr, align 4
+    //CHECK: load{{.*}}%__M.addr, align 4
+    //CHECK: load{{.*}}%__L.addr, align 4
+    //CHECK: load{{.*}}%__K.addr, align 4
+    //CHECK: load{{.*}}%__J.addr, align 4
+    //CHECK: load{{.*}}%__I.addr, align 4
+    //CHECK: load{{.*}}%__H.addr, align 4
+    //CHECK: load{{.*}}%__G.addr, align 4
+    //CHECK: load{{.*}}%__F.addr, align 4
+    //CHECK: load{{.*}}%__E.addr, align 4
+    //CHECK: load{{.*}}%__D.addr, align 4
+    //CHECK: load{{.*}}%__C.addr, align 4
+    //CHECK: load{{.*}}%__B.addr, align 4
+    //CHECK: load{{.*}}%__A.addr, align 4
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+ return _mm512_setr_epi32( __A, __B, __C, __D,__E, __F, __G, __H,
+              __I, __J, __K, __L,__M, __N, __O, __P);
+}
+
+#ifdef __x86_64__
+__m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+    //CHECK-LABEL: @test_mm512_mask_set1_epi64
+    //CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.512
+  return _mm512_mask_set1_epi64 (__O, __M, __A);
+}
+#endif
+
+__m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C,
+                              long long __D, long long __E, long long __F,
+                              long long __G, long long __H)
+{
+    //CHECK-LABEL: @test_mm512_set_epi64
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_set_epi64(__A, __B, __C, __D, __E, __F, __G, __H );
+}
+
+__m512i test_mm512_setr_epi64 (long long __A, long long __B, long long __C,
+                              long long __D, long long __E, long long __F,
+                              long long __G, long long __H)
+{
+    //CHECK-LABEL: @test_mm512_setr_epi64
+    //CHECK: load{{.*}}%__H.addr, align 8
+    //CHECK: load{{.*}}%__G.addr, align 8
+    //CHECK: load{{.*}}%__F.addr, align 8
+    //CHECK: load{{.*}}%__E.addr, align 8
+    //CHECK: load{{.*}}%__D.addr, align 8
+    //CHECK: load{{.*}}%__C.addr, align 8
+    //CHECK: load{{.*}}%__B.addr, align 8
+    //CHECK: load{{.*}}%__A.addr, align 8
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_setr_epi64(__A, __B, __C, __D, __E, __F, __G, __H );
+}
+
+__m512d test_mm512_set_pd (double __A, double __B, double __C, double __D,
+                           double __E, double __F, double __G, double __H)
+{
+    //CHECK-LABEL: @test_mm512_set_pd
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_set_pd( __A, __B, __C, __D, __E, __F, __G, __H);
+}
+
+__m512d test_mm512_setr_pd (double __A, double __B, double __C, double __D,
+                           double __E, double __F, double __G, double __H)
+{
+    //CHECK-LABEL: @test_mm512_setr_pd
+    //CHECK: load{{.*}}%__H.addr, align 8
+    //CHECK: load{{.*}}%__G.addr, align 8
+    //CHECK: load{{.*}}%__F.addr, align 8
+    //CHECK: load{{.*}}%__E.addr, align 8
+    //CHECK: load{{.*}}%__D.addr, align 8
+    //CHECK: load{{.*}}%__C.addr, align 8
+    //CHECK: load{{.*}}%__B.addr, align 8
+    //CHECK: load{{.*}}%__A.addr, align 8
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_setr_pd( __A, __B, __C, __D, __E, __F, __G, __H);
+}
+
+__m512 test_mm512_set_ps (float __A, float __B, float __C, float __D,
+                          float __E, float __F, float __G, float __H,
+                          float __I, float __J, float __K, float __L,
+                          float __M, float __N, float __O, float __P)
+{
+    //CHECK-LABEL: @test_mm512_set_ps
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+    return _mm512_set_ps( __A, __B, __C, __D, __E, __F, __G, __H,
+                          __I, __J, __K, __L, __M, __N, __O, __P);
+}
+
+__m512i test_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_abs_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pabs.q.512
+  return _mm512_mask_abs_epi64 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_abs_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pabs.q.512
+  return _mm512_maskz_abs_epi64 (__U,__A);
+}
+
+__m512i test_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_abs_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pabs.d.512
+  return _mm512_mask_abs_epi32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_abs_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pabs.d.512
+  return _mm512_maskz_abs_epi32 (__U,__A);
+}
+
+__m512 test_mm512_setr_ps (float __A, float __B, float __C, float __D,
+                          float __E, float __F, float __G, float __H,
+                          float __I, float __J, float __K, float __L,
+                          float __M, float __N, float __O, float __P)
+{
+    //CHECK-LABEL: @test_mm512_setr_ps
+    //CHECK: load{{.*}}%__P.addr, align 4
+    //CHECK: load{{.*}}%__O.addr, align 4
+    //CHECK: load{{.*}}%__N.addr, align 4
+    //CHECK: load{{.*}}%__M.addr, align 4
+    //CHECK: load{{.*}}%__L.addr, align 4
+    //CHECK: load{{.*}}%__K.addr, align 4
+    //CHECK: load{{.*}}%__J.addr, align 4
+    //CHECK: load{{.*}}%__I.addr, align 4
+    //CHECK: load{{.*}}%__H.addr, align 4
+    //CHECK: load{{.*}}%__G.addr, align 4
+    //CHECK: load{{.*}}%__F.addr, align 4
+    //CHECK: load{{.*}}%__E.addr, align 4
+    //CHECK: load{{.*}}%__D.addr, align 4
+    //CHECK: load{{.*}}%__C.addr, align 4
+    //CHECK: load{{.*}}%__B.addr, align 4
+    //CHECK: load{{.*}}%__A.addr, align 4
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+    return _mm512_setr_ps( __A, __B, __C, __D, __E, __F, __G, __H,
+                          __I, __J, __K, __L, __M, __N, __O, __P);
+}
+
+int test_mm_cvtss_i32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_i32
+  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
+  return _mm_cvtss_i32(A);
+}
+
+#ifdef __x86_64__
+long long test_mm_cvtss_i64(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_i64
+  // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
+  return _mm_cvtss_i64(A);
+}
+#endif
+
+__m128d test_mm_cvti32_sd(__m128d A, int B) {
+  // CHECK-LABEL: test_mm_cvti32_sd
+  // CHECK: sitofp i32 %{{.*}} to double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  return _mm_cvti32_sd(A, B);
+}
+
+#ifdef __x86_64__
+__m128d test_mm_cvti64_sd(__m128d A, long long B) {
+  // CHECK-LABEL: test_mm_cvti64_sd
+  // CHECK: sitofp i64 %{{.*}} to double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  return _mm_cvti64_sd(A, B);
+}
+#endif
+
+__m128 test_mm_cvti32_ss(__m128 A, int B) {
+  // CHECK-LABEL: test_mm_cvti32_ss
+  // CHECK: sitofp i32 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvti32_ss(A, B);
+}
+
+#ifdef __x86_64__
+__m128 test_mm_cvti64_ss(__m128 A, long long B) {
+  // CHECK-LABEL: test_mm_cvti64_ss
+  // CHECK: sitofp i64 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvti64_ss(A, B);
+}
+#endif
+
+int test_mm_cvtsd_i32(__m128d A) {
+  // CHECK-LABEL: test_mm_cvtsd_i32
+  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
+  return _mm_cvtsd_i32(A);
+}
+
+#ifdef __x86_64__
+long long test_mm_cvtsd_i64(__m128d A) {
+  // CHECK-LABEL: test_mm_cvtsd_i64
+  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
+  return _mm_cvtsd_i64(A);
+}
+#endif
+
+__m128d test_mm_mask_cvtss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_cvtss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_mask_cvtss_sd(__W, __U, __A, __B); 
+}
+
+__m128d test_mm_maskz_cvtss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvtss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_maskz_cvtss_sd( __U, __A, __B); 
+}
+
+__m128 test_mm_mask_cvtsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_cvtsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_mask_cvtsd_ss(__W, __U, __A, __B); 
+}
+
+__m128 test_mm_maskz_cvtsd_ss(__mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_maskz_cvtsd_ss(__U, __A, __B); 
+}
+
+
+__m512i test_mm512_setzero_epi32(void)
+{ // Takes no arguments; the (void) list makes this a real prototype in C.
+  // CHECK-LABEL: @test_mm512_setzero_epi32
+  // CHECK: zeroinitializer
+  return _mm512_setzero_epi32();
+}
+
+__m512 test_mm512_setzero(void)
+{ // Returns __m512 to match the intrinsic's float-vector result instead of relying on a lax vector conversion.
+  // CHECK-LABEL: @test_mm512_setzero
+  // CHECK: zeroinitializer
+  return _mm512_setzero();
+}
+
+__m512i test_mm512_setzero_si512(void)
+{ // Takes no arguments; the (void) list makes this a real prototype in C.
+  // CHECK-LABEL: @test_mm512_setzero_si512
+  // CHECK: zeroinitializer
+  return _mm512_setzero_si512();
+}
+
+__m512 test_mm512_setzero_ps(void)
+{ // Returns __m512, like the other _ps tests here, rather than converting the float vector to __m512i.
+  // CHECK-LABEL: @test_mm512_setzero_ps
+  // CHECK: zeroinitializer
+  return _mm512_setzero_ps();
+}
+
+__m512d test_mm512_setzero_pd(void)
+{ // Takes no arguments; the (void) list makes this a real prototype in C.
+  // CHECK-LABEL: @test_mm512_setzero_pd
+  // CHECK: zeroinitializer
+  return _mm512_setzero_pd();
+}
+
+__m512d test_mm512_abs_pd(__m512d a){
+  // CHECK-LABEL: @test_mm512_abs_pd
+  // CHECK: and <8 x i64> 
+  return _mm512_abs_pd(a);
+}
+
+__m512d test_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A){
+  // CHECK-LABEL: @test_mm512_mask_abs_pd 
+  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
+  return _mm512_mask_abs_pd (__W,__U,__A);
+}
+
+__m512 test_mm512_abs_ps(__m512 a){
+  // CHECK-LABEL: @test_mm512_abs_ps
+  // CHECK: and <16 x i32> 
+  return _mm512_abs_ps(a);
+}
+
+__m512 test_mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A){
+  // CHECK-LABEL: @test_mm512_mask_abs_ps
+  // CHECK: and <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_abs_ps( __W, __U, __A);
+}
+
diff --git a/test/CodeGen/avx512ifma-builtins.c b/test/CodeGen/avx512ifma-builtins.c
new file mode 100644
index 0000000..d3114dd
--- /dev/null
+++ b/test/CodeGen/avx512ifma-builtins.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m512i test_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.512
+  return _mm512_madd52hi_epu64(__X, __Y, __Z); 
+}
+
+__m512i test_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.512
+  return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_maskz_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.512
+  return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z); 
+}
+
+__m512i test_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+  return _mm512_madd52lo_epu64(__X, __Y, __Z); 
+}
+
+__m512i test_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+  return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_maskz_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+  return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z); // NOTE(review): pattern above matches mask.*, while the maskz madd52hi test in this file matches maskz.* - confirm which intrinsic is intended.
+}
diff --git a/test/CodeGen/avx512ifmavl-builtins.c b/test/CodeGen/avx512ifmavl-builtins.c
new file mode 100644
index 0000000..c59af0e
--- /dev/null
+++ b/test/CodeGen/avx512ifmavl-builtins.c
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.128
+  return _mm_madd52hi_epu64(__X, __Y, __Z); 
+}
+
+__m128i test_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.128
+  return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y); 
+}
+
+__m128i test_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_maskz_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.128
+  return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z); 
+}
+
+__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.256
+  return _mm256_madd52hi_epu64(__X, __Y, __Z); 
+}
+
+__m256i test_mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.256
+  return _mm256_mask_madd52hi_epu64(__W, __M, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_maskz_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.256
+  return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z); 
+}
+
+__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.128
+  return _mm_madd52lo_epu64(__X, __Y, __Z); 
+}
+
+__m128i test_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.128
+  return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y); 
+}
+
+__m128i test_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_maskz_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52l.uq.128
+  return _mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z); 
+}
+
+__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+  return _mm256_madd52lo_epu64(__X, __Y, __Z); 
+}
+
+__m256i test_mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+  return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_maskz_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+  return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z); // NOTE(review): pattern above matches mask.*, while the 128-bit maskz madd52lo test in this file matches maskz.* - confirm which intrinsic is intended.
+}
diff --git a/test/CodeGen/avx512pf-builtins.c b/test/CodeGen/avx512pf-builtins.c
new file mode 100644
index 0000000..4e00552
--- /dev/null
+++ b/test/CodeGen/avx512pf-builtins.c
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+void test_mm512_mask_prefetch_i32gather_pd(__m256i index, __mmask8 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.dpd
+  _mm512_mask_prefetch_i32gather_pd(index, mask, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_prefetch_i32gather_pd(__m256i index, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.dpd
+  _mm512_prefetch_i32gather_pd(index, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.dps
+  _mm512_mask_prefetch_i32gather_ps(index, mask, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_prefetch_i32gather_ps(__m512i index,  void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.dps
+  _mm512_prefetch_i32gather_ps(index, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_mask_prefetch_i64gather_pd(__m512i index, __mmask8 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.qpd
+  _mm512_mask_prefetch_i64gather_pd(index, mask, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_prefetch_i64gather_pd(__m512i index, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.qpd
+  _mm512_prefetch_i64gather_pd(index, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_mask_prefetch_i64gather_ps(__m512i index, __mmask8 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.qps
+  _mm512_mask_prefetch_i64gather_ps(index, mask, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_prefetch_i64gather_ps(__m512i index, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.qps
+  _mm512_prefetch_i64gather_ps(index, addr, 2, 1); // void intrinsic: plain call, no return; scale and hint are passed as literals
+}
+
+void test_mm512_prefetch_i32scatter_pd(void *addr, __m256i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
+  _mm512_prefetch_i32scatter_pd(addr, index, 1, 2); // void intrinsic: plain call, nothing to return
+}
+
+void test_mm512_mask_prefetch_i32scatter_pd(void *addr, __mmask8 mask, __m256i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
+  _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, 1, 2); // void intrinsic: plain call, nothing to return
+}
+
+void test_mm512_prefetch_i32scatter_ps(void *addr, __m512i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.dps.512
+  _mm512_prefetch_i32scatter_ps(addr, index, 1, 2); // void intrinsic: plain call, nothing to return
+}
+
+void test_mm512_mask_prefetch_i32scatter_ps(void *addr, __mmask16 mask, __m512i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.dps.512
+  _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, 1, 2); // void intrinsic: plain call, nothing to return
+}
+
+void test_mm512_prefetch_i64scatter_pd(void *addr, __m512i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
+  _mm512_prefetch_i64scatter_pd(addr, index, 1, 2); // void intrinsic: plain call, nothing to return
+}
+
+void test_mm512_mask_prefetch_i64scatter_pd(void *addr, __mmask8 mask, __m512i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
+  _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, 1, 2); // void intrinsic: plain call; mask is __mmask8 like the i64gather tests in this file
+}
+
+void test_mm512_prefetch_i64scatter_ps(void *addr, __m512i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.qps.512
+  _mm512_prefetch_i64scatter_ps(addr, index, 1, 2); // void intrinsic: plain call, nothing to return
+}
+
+void test_mm512_mask_prefetch_i64scatter_ps(void *addr, __mmask8 mask, __m512i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.qps.512
+  _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, 1, 2); // void intrinsic: plain call; mask is __mmask8 like the i64gather tests in this file
+}
diff --git a/test/CodeGen/avx512vbmi-builtins.c b/test/CodeGen/avx512vbmi-builtins.c
new file mode 100644
index 0000000..74f8660
--- /dev/null
+++ b/test/CodeGen/avx512vbmi-builtins.c
@@ -0,0 +1,66 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.512
+  return _mm512_mask2_permutex2var_epi8(__A, __I, __U, __B); 
+}
+
+__m512i test_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.512
+  return _mm512_permutex2var_epi8(__A, __I, __B); 
+}
+
+__m512i test_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.512
+  return _mm512_mask_permutex2var_epi8(__A, __U, __I, __B); 
+}
+
+__m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.512
+  return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); 
+}
+
+__m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+  return _mm512_permutexvar_epi8(__A, __B); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+  return _mm512_maskz_permutexvar_epi8(__M, __A, __B); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+  return _mm512_mask_permutexvar_epi8(__W, __M, __A, __B); 
+}
+
+__m512i test_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.512
+  return _mm512_mask_multishift_epi64_epi8(__W, __M, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.512
+  return _mm512_maskz_multishift_epi64_epi8(__M, __X, __Y); 
+}
+
+__m512i test_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.512
+  return _mm512_multishift_epi64_epi8(__X, __Y); 
+}
diff --git a/test/CodeGen/avx512vbmivl-builtin.c b/test/CodeGen/avx512vbmivl-builtin.c
new file mode 100644
index 0000000..bee66e3
--- /dev/null
+++ b/test/CodeGen/avx512vbmivl-builtin.c
@@ -0,0 +1,127 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+  return _mm_permutexvar_epi8(__A, __B); 
+}
+
+__m128i test_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+  return _mm_maskz_permutexvar_epi8(__M, __A, __B); 
+}
+
+__m128i test_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+  return _mm_mask_permutexvar_epi8(__W, __M, __A, __B); 
+}
+
+__m256i test_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+  return _mm256_permutexvar_epi8(__A, __B); 
+}
+
+__m256i test_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+  return _mm256_maskz_permutexvar_epi8(__M, __A, __B); 
+}
+
+__m256i test_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+  return _mm256_mask_permutexvar_epi8(__W, __M, __A, __B); 
+}
+
+__m128i test_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask2_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.128
+  return _mm_mask2_permutex2var_epi8(__A, __I, __U, __B); 
+}
+
+__m256i test_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask2_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.256
+  return _mm256_mask2_permutex2var_epi8(__A, __I, __U, __B); 
+}
+
+__m128i test_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
+  // CHECK-LABEL: @test_mm_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.128
+  return _mm_permutex2var_epi8(__A, __I, __B); 
+}
+
+__m128i test_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.128
+  return _mm_mask_permutex2var_epi8(__A, __U, __I, __B); 
+}
+
+__m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.128
+  return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); 
+}
+
+__m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.256
+  return _mm256_permutex2var_epi8(__A, __I, __B); 
+}
+
+__m256i test_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.256
+  return _mm256_mask_permutex2var_epi8(__A, __U, __I, __B); 
+}
+
+__m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.256
+  return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); 
+}
+
+__m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.128
+  return _mm_mask_multishift_epi64_epi8(__W, __M, __X, __Y); 
+}
+
+__m128i test_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.128
+  return _mm_maskz_multishift_epi64_epi8(__M, __X, __Y); 
+}
+
+__m128i test_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.128
+  return _mm_multishift_epi64_epi8(__X, __Y); 
+}
+
+__m256i test_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.256
+  return _mm256_mask_multishift_epi64_epi8(__W, __M, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.256
+  return _mm256_maskz_multishift_epi64_epi8(__M, __X, __Y); 
+}
+
+__m256i test_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.256
+  return _mm256_multishift_epi64_epi8(__X, __Y); 
+}
+
diff --git a/test/CodeGen/avx512vl-builtins.c b/test/CodeGen/avx512vl-builtins.c
index 445513c..d9031fe 100644
--- a/test/CodeGen/avx512vl-builtins.c
+++ b/test/CodeGen/avx512vl-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -7,590 +7,602 @@
 
 __mmask8 test_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 0, i8 -1)
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return (__mmask8)_mm_cmpeq_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 0, i8 {{.*}})
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpeq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 0, i8 -1)
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   return (__mmask8)_mm_cmpeq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 0, i8 {{.*}})
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpeq_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpgt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpgt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpgt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpgt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
-__mmask8 test_mm_cmp_epi32_mask(__m128i __a, __m128i __b) {
-  // CHECK-LABEL: @test_mm_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epi32_mask(__a, __b, 7);
+__mmask8 test_mm_cmp_eq_epi32_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmp_eq_epi32_mask
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask8 test_mm_mask_cmp_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  // CHECK-LABEL: @test_mm_mask_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epi32_mask(__u, __a, __b, 7);
+__mmask8 test_mm_mask_cmp_lt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmp_lt_epi32_mask
+  // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_LT);
 }
 
-__mmask8 test_mm_cmp_epi64_mask(__m128i __a, __m128i __b) {
-  // CHECK-LABEL: @test_mm_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epi64_mask(__a, __b, 7);
+__mmask8 test_mm_cmp_lt_epi64_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmp_lt_epi64_mask
+  // CHECK: icmp slt <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi64_mask(__a, __b, _MM_CMPINT_LT);
 }
 
-__mmask8 test_mm_mask_cmp_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  // CHECK-LABEL: @test_mm_mask_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epi64_mask(__u, __a, __b, 7);
+__mmask8 test_mm_mask_cmp_eq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmp_eq_epi64_mask
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask8 test_mm256_cmp_epi32_mask(__m256i __a, __m256i __b) {
-  // CHECK-LABEL: @test_mm256_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, 7);
+__mmask8 test_mm256_cmp_eq_epi32_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmp_eq_epi32_mask
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask8 test_mm256_mask_cmp_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  // CHECK-LABEL: @test_mm256_mask_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epi32_mask(__u, __a, __b, 7);
+__mmask8 test_mm256_mask_cmp_le_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmp_le_epi32_mask
+  // CHECK: icmp sle <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_LE);
 }
 
-__mmask8 test_mm256_cmp_epi64_mask(__m256i __a, __m256i __b) {
-  // CHECK-LABEL: @test_mm256_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, 7);
+__mmask8 test_mm256_cmp_eq_epi64_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmp_eq_epi64_mask
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
-__mmask8 test_mm256_mask_cmp_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  // CHECK-LABEL: @test_mm256_mask_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epi64_mask(__u, __a, __b, 7);
+__mmask8 test_mm256_mask_cmp_eq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmp_eq_epi64_mask
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
 __mmask8 test_mm_cmp_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epu32_mask(__a, __b, 7);
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu32_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm_mask_cmp_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epu32_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epu64_mask(__a, __b, 7);
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm_mask_cmp_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epu64_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu64_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm256_cmp_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epu32_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epu32_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm256_mask_cmp_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epu32_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epu32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm256_cmp_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epu64_mask(__a, __b, 7);
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epu64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm256_mask_cmp_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epu64_mask(__u, __a, __b, 7);
-}
-
-__m512i test_mm512_maskz_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
-  return _mm512_maskz_andnot_epi32(__k,__A,__B);
-}
-
-__m512i test_mm512_mask_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
-  return _mm512_mask_andnot_epi32(__src,__k,__A,__B);
-}
-
-__m512i test_mm512_andnot_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
-  return _mm512_andnot_epi32(__A,__B);
-}
-
-__m512i test_mm512_maskz_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
-  return _mm512_maskz_andnot_epi64(__k,__A,__B);
-}
-
-__m512i test_mm512_mask_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
-  return _mm512_mask_andnot_epi64(__src,__k,__A,__B);
-}
-
-__m512i test_mm512_andnot_epi64(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
-  return _mm512_andnot_epi64(__A,__B);
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epu64_mask(__u, __a, __b, 0);
 }
 
 __m256i test_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
@@ -780,204 +792,232 @@
 __m256i test_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_and_epi32(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_and_epi32(__U, __A, __B);
 }
 
 __m128i test_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_and_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_and_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
         __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_andnot_epi32(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_andnot_epi32(__U, __A, __B);
 }
 
 __m128i test_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
            __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_andnot_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_andnot_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_or_epi32(__W, __U, __A, __B);
 }
 
  __m256i test_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_or_epi32(__U, __A, __B);
 }
 
  __m128i test_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.128
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_or_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.128
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_or_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_xor_epi32(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_xor_epi32(__U, __A, __B);
 }
 
 __m128i test_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_xor_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_xor_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_and_epi64(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_and_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_and_epi64(__W,__U, __A, __B);
 }
 
 __m128i test_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_and_epi64(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
         __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_andnot_epi64(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_andnot_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
            __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_andnot_epi64(__W,__U, __A, __B);
 }
 
 __m128i test_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_andnot_epi64(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_or_epi64(__W,__U, __A, __B);
 }
 
 __m256i test_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_or_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.128
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_or_epi64(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
-//CHECK-LABEL: @test_mm_maskz_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.128
+  //CHECK-LABEL: @test_mm_maskz_or_epi64
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_or_epi64( __U, __A, __B);
 }
 
 __m256i test_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_xor_epi64(__W,__U, __A, __B);
 }
 
 __m256i test_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_xor_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_xor_epi64(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_xor_epi64( __U, __A, __B);
 }
 
@@ -993,16 +1033,16 @@
   return _mm256_mask_cmp_ps_mask(m, __A, __B, 0);
 }
 
-__mmask8 test_mm128_cmp_ps_mask(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm128_cmp_ps_mask
+__mmask8 test_mm_cmp_ps_mask(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_cmp_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.128
-  return (__mmask8)_mm128_cmp_ps_mask(__A, __B, 0);
+  return (__mmask8)_mm_cmp_ps_mask(__A, __B, 0);
 }
 
-__mmask8 test_mm128_mask_cmp_ps_mask(__mmask8 m, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm128_mask_cmp_ps_mask
+__mmask8 test_mm_mask_cmp_ps_mask(__mmask8 m, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_cmp_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.128
-  return _mm128_mask_cmp_ps_mask(m, __A, __B, 0);
+  return _mm_mask_cmp_ps_mask(m, __A, __B, 0);
 }
 
 __mmask8 test_mm256_cmp_pd_mask(__m256d __A, __m256d __B) {
@@ -1017,21 +1057,18 @@
   return _mm256_mask_cmp_pd_mask(m, __A, __B, 0);
 }
 
-__mmask8 test_mm128_cmp_pd_mask(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm128_cmp_pd_mask
+__mmask8 test_mm_cmp_pd_mask(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_cmp_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.128
-  return (__mmask8)_mm128_cmp_pd_mask(__A, __B, 0);
+  return (__mmask8)_mm_cmp_pd_mask(__A, __B, 0);
 }
 
-__mmask8 test_mm128_mask_cmp_pd_mask(__mmask8 m, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm128_mask_cmp_pd_mask
+__mmask8 test_mm_mask_cmp_pd_mask(__mmask8 m, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_cmp_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.128
-  return _mm128_mask_cmp_pd_mask(m, __A, __B, 0);
+  return _mm_mask_cmp_pd_mask(m, __A, __B, 0);
 }
 
-
-//igorb
-
 __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmadd_pd
   // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.128
@@ -1506,42 +1543,42 @@
 }
 __m128i test_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi32
-  // CHECK: @llvm.x86.avx512.mask.blend.d.128
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_blend_epi32(__U,__A,__W); 
 }
 __m256i test_mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi32
-  // CHECK: @llvm.x86.avx512.mask.blend.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_blend_epi32(__U,__A,__W); 
 }
 __m128d test_mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) {
   // CHECK-LABEL: @test_mm_mask_blend_pd
-  // CHECK: @llvm.x86.avx512.mask.blend.pd.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_blend_pd(__U,__A,__W); 
 }
 __m256d test_mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_pd
-  // CHECK: @llvm.x86.avx512.mask.blend.pd.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_blend_pd(__U,__A,__W); 
 }
 __m128 test_mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) {
   // CHECK-LABEL: @test_mm_mask_blend_ps
-  // CHECK: @llvm.x86.avx512.mask.blend.ps.128
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_blend_ps(__U,__A,__W); 
 }
 __m256 test_mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_ps
-  // CHECK: @llvm.x86.avx512.mask.blend.ps.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_blend_ps(__U,__A,__W); 
 }
 __m128i test_mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi64
-  // CHECK: @llvm.x86.avx512.mask.blend.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_blend_epi64(__U,__A,__W); 
 }
 __m256i test_mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi64
-  // CHECK: @llvm.x86.avx512.mask.blend.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_blend_epi64(__U,__A,__W); 
 }
 __m128d test_mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) {
@@ -3154,3 +3191,3736 @@
   // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.256
   return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); 
 }
+
+__m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.128
+  return _mm_mask_cvtepi8_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.128
+  return _mm_maskz_cvtepi8_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.256
+  return _mm256_mask_cvtepi8_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.256
+  return _mm256_maskz_cvtepi8_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.128
+  return _mm_mask_cvtepi8_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.128
+  return _mm_maskz_cvtepi8_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.256
+  return _mm256_mask_cvtepi8_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.256
+  return _mm256_maskz_cvtepi8_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.128
+  return _mm_mask_cvtepi32_epi64(__W, __U, __X); 
+}
+
+__m128i test_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.128
+  return _mm_maskz_cvtepi32_epi64(__U, __X); 
+}
+
+__m256i test_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.256
+  return _mm256_mask_cvtepi32_epi64(__W, __U, __X); 
+}
+
+__m256i test_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.256
+  return _mm256_maskz_cvtepi32_epi64(__U, __X); 
+}
+
+__m128i test_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.128
+  return _mm_mask_cvtepi16_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.128
+  return _mm_maskz_cvtepi16_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.256
+  return _mm256_mask_cvtepi16_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.256
+  return _mm256_maskz_cvtepi16_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.128
+  return _mm_mask_cvtepi16_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.128
+  return _mm_maskz_cvtepi16_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.256
+  return _mm256_mask_cvtepi16_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.256
+  return _mm256_maskz_cvtepi16_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_maskz_cvtepu8_epi32(__U, __A);
+}
+
+__m256i test_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m128i test_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m256i test_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m256i test_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m128i test_mm_rol_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.128
+  return _mm_rol_epi32(__A, 5); 
+}
+
+__m128i test_mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.128
+  return _mm_mask_rol_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_rol_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.128
+  return _mm_maskz_rol_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_rol_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.256
+  return _mm256_rol_epi32(__A, 5); 
+}
+
+__m256i test_mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.256
+  return _mm256_mask_rol_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.256
+  return _mm256_maskz_rol_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_rol_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.128
+  return _mm_rol_epi64(__A, 5); 
+}
+
+__m128i test_mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.128
+  return _mm_mask_rol_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_rol_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.128
+  return _mm_maskz_rol_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_rol_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.256
+  return _mm256_rol_epi64(__A, 5); 
+}
+
+__m256i test_mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.256
+  return _mm256_mask_rol_epi64(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.256
+  return _mm256_maskz_rol_epi64(__U, __A, 5); 
+}
+
+__m128i test_mm_rolv_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+  return _mm_rolv_epi32(__A, __B); 
+}
+
+__m128i test_mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+  return _mm_mask_rolv_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+  return _mm_maskz_rolv_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_rolv_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+  return _mm256_rolv_epi32(__A, __B); 
+}
+
+__m256i test_mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+  return _mm256_mask_rolv_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+  return _mm256_maskz_rolv_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_rolv_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+  return _mm_rolv_epi64(__A, __B); 
+}
+
+__m128i test_mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+  return _mm_mask_rolv_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+  return _mm_maskz_rolv_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_rolv_epi64(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+  return _mm256_rolv_epi64(__A, __B); 
+}
+
+__m256i test_mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+  return _mm256_mask_rolv_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+  return _mm256_maskz_rolv_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_ror_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.128
+  return _mm_ror_epi32(__A, 5); 
+}
+
+__m128i test_mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.128
+  return _mm_mask_ror_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_ror_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.128
+  return _mm_maskz_ror_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_ror_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.256
+  return _mm256_ror_epi32(__A, 5); 
+}
+
+__m256i test_mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.256
+  return _mm256_mask_ror_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.256
+  return _mm256_maskz_ror_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_ror_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.128
+  return _mm_ror_epi64(__A, 5); 
+}
+
+__m128i test_mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.128
+  return _mm_mask_ror_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_ror_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.128
+  return _mm_maskz_ror_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_ror_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.256
+  return _mm256_ror_epi64(__A, 5); 
+}
+
+__m256i test_mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.256
+  return _mm256_mask_ror_epi64(__W, __U, __A,5); 
+}
+
+__m256i test_mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.256
+  return _mm256_maskz_ror_epi64(__U, __A, 5); 
+}
+
+
+__m128i test_mm_rorv_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+  return _mm_rorv_epi32(__A, __B); 
+}
+
+__m128i test_mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+  return _mm_mask_rorv_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+  return _mm_maskz_rorv_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_rorv_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+  return _mm256_rorv_epi32(__A, __B); 
+}
+
+__m256i test_mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+  return _mm256_mask_rorv_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+  return _mm256_maskz_rorv_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_rorv_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+  return _mm_rorv_epi64(__A, __B); 
+}
+
+__m128i test_mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+  return _mm_mask_rorv_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+  return _mm_maskz_rorv_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_rorv_epi64(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+  return _mm256_rorv_epi64(__A, __B); 
+}
+
+__m256i test_mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+  return _mm256_mask_rorv_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+  return _mm256_maskz_rorv_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_mask_sllv_epi64(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_maskz_sllv_epi64(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_mask_sllv_epi64(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_maskz_sllv_epi64(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_mask_sllv_epi32(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_maskz_sllv_epi32(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_mask_sllv_epi32(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_maskz_sllv_epi32(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_mask_srlv_epi64(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_maskz_srlv_epi64(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_mask_srlv_epi64(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_maskz_srlv_epi64(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_mask_srlv_epi32(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_maskz_srlv_epi32(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_mask_srlv_epi32(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_maskz_srlv_epi32(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.128
+  return _mm_mask_srl_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.128
+  return _mm_maskz_srl_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.256
+  return _mm256_mask_srl_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.256
+  return _mm256_maskz_srl_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.128
+  return _mm_mask_srli_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.128
+  return _mm_maskz_srli_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.256
+  return _mm256_mask_srli_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.256
+  return _mm256_maskz_srli_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.128
+  return _mm_mask_srl_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.128
+  return _mm_maskz_srl_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.256
+  return _mm256_mask_srl_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.256
+  return _mm256_maskz_srl_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.128
+  return _mm_mask_srli_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.128
+  return _mm_maskz_srli_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.256
+  return _mm256_mask_srli_epi64(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.256
+  return _mm256_maskz_srli_epi64(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm_mask_srav_epi32(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm_maskz_srav_epi32(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm256_mask_srav_epi32(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm256_maskz_srav_epi32(__U, __X, __Y); 
+}
+
+__m128i test_mm_srav_epi64(__m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  return _mm_srav_epi64(__X, __Y); 
+}
+
+__m128i test_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  return _mm_mask_srav_epi64(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  return _mm_maskz_srav_epi64(__U, __X, __Y); 
+}
+
+__m256i test_mm256_srav_epi64(__m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  return _mm256_srav_epi64(__X, __Y); 
+}
+
+__m256i test_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  return _mm256_mask_srav_epi64(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  return _mm256_maskz_srav_epi64(__U, __X, __Y); 
+}
+
+void test_mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) { // expects the generic masked-store intrinsic (v4i32, align 16)
+  // CHECK-LABEL: @test_mm_mask_store_epi32
+  // CHECK: @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, i32 16, <4 x i1> %{{.*}})
+  return _mm_mask_store_epi32(__P, __U, __A); // pointer operand is matched with .* so SSA names of any length pass
+}
+
+void test_mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) { // expects the generic masked-store intrinsic (v8i32, align 32)
+  // CHECK-LABEL: @test_mm256_mask_store_epi32
+  // CHECK: @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %{{.*}}, <8 x i32>* %{{.*}}, i32 32, <8 x i1> %{{.*}})
+  return _mm256_mask_store_epi32(__P, __U, __A); // pointer operand is matched with .* so SSA names of any length pass
+}
+
+__m128i test_mm_mask_mov_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi32
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_mov_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi32
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_mov_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi32
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_mov_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi32
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_mov_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_mov_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi64
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_mov_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi64
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_mov_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi64
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_mov_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi64
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_mov_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_mask_load_epi32(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_load_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskz_load_epi32(__U, __P); 
+}
+
+__m256i test_mm256_mask_load_epi32(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mask_load_epi32(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_load_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskz_load_epi32(__U, __P); 
+}
+
+__m128i test_mm_mask_load_epi64(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_mask_load_epi64(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskz_load_epi64(__U, __P); 
+}
+
+__m256i test_mm256_mask_load_epi64(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_mask_load_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskz_load_epi64(__U, __P); 
+}
+
+void test_mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_store_epi64
+  // CHECK: @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, i32 16, <2 x i1> %{{.*}})
+  _mm_mask_store_epi64(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_epi64
+  // CHECK: @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, i32 32, <4 x i1> %{{.*}})
+  _mm256_mask_store_epi64(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+__m128d test_mm_mask_movedup_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_movedup_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_movedup_pd(__W, __U, __A); 
+}
+
+__m128d test_mm_maskz_movedup_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_movedup_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_movedup_pd(__U, __A); 
+}
+
+__m256d test_mm256_mask_movedup_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_movedup_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_movedup_pd(__W, __U, __A); 
+}
+
+__m256d test_mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_movedup_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_movedup_pd(__U, __A); 
+}
+
+__m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) {
+  // CHECK-LABEL: @test_mm_mask_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.128
+  return _mm_mask_set1_epi32(__O, __M, 5); 
+}
+
+__m128i test_mm_maskz_set1_epi32(__mmask8 __M) {
+  // CHECK-LABEL: @test_mm_maskz_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.128
+  return _mm_maskz_set1_epi32(__M, 5); 
+}
+
+__m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.256
+  return _mm256_mask_set1_epi32(__O, __M, 5); 
+}
+
+__m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
+  // CHECK-LABEL: @test_mm256_maskz_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.256
+  return _mm256_maskz_set1_epi32(__M, 5); 
+}
+
+#ifdef __x86_64__
+__m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm_mask_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.128
+  return _mm_mask_set1_epi64(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm_maskz_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.128
+  return _mm_maskz_set1_epi64(__M, __A); 
+}
+
+__m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.256
+  return _mm256_mask_set1_epi64(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm256_maskz_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.256
+  return _mm256_maskz_set1_epi64(__M, __A); 
+}
+#endif
+
+__m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128
+  return _mm_fixupimm_pd(__A, __B, __C, 5); 
+}
+
+__m128d test_mm_mask_fixupimm_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128
+  return _mm_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+}
+
+__m128d test_mm_maskz_fixupimm_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.128
+  return _mm_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+}
+
+__m256d test_mm256_fixupimm_pd(__m256d __A, __m256d __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.256
+  return _mm256_fixupimm_pd(__A, __B, __C, 5); 
+}
+
+__m256d test_mm256_mask_fixupimm_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.256
+  return _mm256_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+}
+
+__m256d test_mm256_maskz_fixupimm_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.256
+  return _mm256_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+}
+
+__m128 test_mm_fixupimm_ps(__m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.128
+  return _mm_fixupimm_ps(__A, __B, __C, 5); 
+}
+
+__m128 test_mm_mask_fixupimm_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.128
+  return _mm_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+}
+
+__m128 test_mm_maskz_fixupimm_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.128
+  return _mm_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+}
+
+__m256 test_mm256_fixupimm_ps(__m256 __A, __m256 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.256
+  return _mm256_fixupimm_ps(__A, __B, __C, 5); 
+}
+
+__m256 test_mm256_mask_fixupimm_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.256
+  return _mm256_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+}
+
+__m256 test_mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.256
+  return _mm256_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+}
+
+__m128d test_mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_load_pd(__W, __U, __P); 
+}
+
+__m128d test_mm_maskz_load_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_load_pd(__U, __P); 
+}
+
+__m256d test_mm256_mask_load_pd(__m256d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_mask_load_pd(__W, __U, __P); 
+}
+
+__m256d test_mm256_maskz_load_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_maskz_load_pd(__U, __P); 
+}
+
+__m128 test_mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_load_ps(__W, __U, __P); 
+}
+
+__m128 test_mm_maskz_load_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_load_ps(__U, __P); 
+}
+
+__m256 test_mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_mask_load_ps(__W, __U, __P); 
+}
+
+__m256 test_mm256_maskz_load_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_maskz_load_ps(__U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_mask_loadu_epi64(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskz_loadu_epi64(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_mask_loadu_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskz_loadu_epi64(__U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_mask_loadu_epi32(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskz_loadu_epi32(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mask_loadu_epi32(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskz_loadu_epi32(__U, __P); 
+}
+
+__m128d test_mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_loadu_pd(__W, __U, __P); 
+}
+
+__m128d test_mm_maskz_loadu_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_loadu_pd(__U, __P); 
+}
+
+__m256d test_mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_mask_loadu_pd(__W, __U, __P); 
+}
+
+__m256d test_mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_maskz_loadu_pd(__U, __P); 
+}
+
+__m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_loadu_ps(__W, __U, __P); 
+}
+
+__m128 test_mm_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_loadu_ps(__U, __P); 
+}
+
+__m256 test_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_mask_loadu_ps(__W, __U, __P); 
+}
+
+__m256 test_mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_maskz_loadu_ps(__U, __P); 
+}
+
+void test_mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_store_pd
+  // CHECK: @llvm.masked.store.v2f64.p0v2f64(<2 x double> %{{.*}}, <2 x double>* %{{.*}}, i32 16, <2 x i1> %{{.*}})
+  _mm_mask_store_pd(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_pd
+  // CHECK: @llvm.masked.store.v4f64.p0v4f64(<4 x double> %{{.*}}, <4 x double>* %{{.*}}, i32 32, <4 x i1> %{{.*}})
+  _mm256_mask_store_pd(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_store_ps
+  // CHECK: @llvm.masked.store.v4f32.p0v4f32(<4 x float> %{{.*}}, <4 x float>* %{{.*}}, i32 16, <4 x i1> %{{.*}})
+  _mm_mask_store_ps(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_ps
+  // CHECK: @llvm.masked.store.v8f32.p0v8f32(<8 x float> %{{.*}}, <8 x float>* %{{.*}}, i32 32, <8 x i1> %{{.*}})
+  _mm256_mask_store_ps(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi64
+  // CHECK: @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, i32 1, <2 x i1> %{{.*}})
+  _mm_mask_storeu_epi64(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi64
+  // CHECK: @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  _mm256_mask_storeu_epi64(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi32
+  // CHECK: @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  _mm_mask_storeu_epi32(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_storeu_epi32(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi32
+  // CHECK: @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %{{.*}}, <8 x i32>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  _mm256_mask_storeu_epi32(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_pd
+  // CHECK: @llvm.masked.store.v2f64.p0v2f64(<2 x double> %{{.*}}, <2 x double>* %{{.*}}, i32 1, <2 x i1> %{{.*}})
+  _mm_mask_storeu_pd(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_pd
+  // CHECK: @llvm.masked.store.v4f64.p0v4f64(<4 x double> %{{.*}}, <4 x double>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  _mm256_mask_storeu_pd(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_ps
+  // CHECK: @llvm.masked.store.v4f32.p0v4f32(<4 x float> %{{.*}}, <4 x float>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  _mm_mask_storeu_ps(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+void test_mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_ps
+  // CHECK: @llvm.masked.store.v8f32.p0v8f32(<8 x float> %{{.*}}, <8 x float>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  _mm256_mask_storeu_ps(__P, __U, __A); // void test: invoke the store as a statement, nothing to return
+}
+
+__m128d test_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_unpackhi_pd(__W, __U, __A, __B); 
+}
+
+__m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_unpackhi_pd(__U, __A, __B); 
+}
+
+__m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_unpackhi_pd(__W, __U, __A, __B); // expected IR: the data shuffle itself, then the mask-driven select
+}
+
+__m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_unpackhi_pd(__U, __A, __B); // expected IR: the data shuffle itself, then the mask-driven select
+}
+
+__m128 test_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_unpackhi_ps(__W, __U, __A, __B); // expected IR: data shuffle, then the mask-driven select
+}
+
+__m128 test_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_unpackhi_ps(__U, __A, __B); // expected IR: data shuffle, then the mask-driven select
+}
+
+__m256 test_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_unpackhi_ps(__W, __U, __A, __B); 
+}
+
+__m256 test_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_unpackhi_ps(__U, __A, __B); 
+}
+
+__m128d test_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_unpacklo_pd(__W, __U, __A, __B); 
+}
+
+__m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_unpacklo_pd(__U, __A, __B); 
+}
+
+__m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_unpacklo_pd(__W, __U, __A, __B); // expected IR: data shuffle, then the mask-driven select
+}
+
+__m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_unpacklo_pd(__U, __A, __B); // expected IR: data shuffle, then the mask-driven select
+}
+
+__m128 test_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_unpacklo_ps(__W, __U, __A, __B); // expected IR: data shuffle, then the mask-driven select
+}
+
+__m128 test_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_unpacklo_ps(__U, __A, __B); // expected IR: data shuffle, then the mask-driven select
+}
+
+__m256 test_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_unpacklo_ps(__W, __U, __A, __B); 
+}
+
+__m256 test_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_unpacklo_ps(__U, __A, __B); 
+}
+
+__m128d test_mm_rcp14_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.128
+  return _mm_rcp14_pd(__A); 
+}
+
+__m128d test_mm_mask_rcp14_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.128
+  return _mm_mask_rcp14_pd(__W, __U, __A); 
+}
+
+__m128d test_mm_maskz_rcp14_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.128
+  return _mm_maskz_rcp14_pd(__U, __A); 
+}
+
+__m256d test_mm256_rcp14_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.256
+  return _mm256_rcp14_pd(__A); 
+}
+
+__m256d test_mm256_mask_rcp14_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.256
+  return _mm256_mask_rcp14_pd(__W, __U, __A); 
+}
+
+__m256d test_mm256_maskz_rcp14_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.256
+  return _mm256_maskz_rcp14_pd(__U, __A); 
+}
+
+__m128 test_mm_rcp14_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.128
+  return _mm_rcp14_ps(__A); 
+}
+
+__m128 test_mm_mask_rcp14_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.128
+  return _mm_mask_rcp14_ps(__W, __U, __A); 
+}
+
+__m128 test_mm_maskz_rcp14_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.128
+  return _mm_maskz_rcp14_ps(__U, __A); 
+}
+
+__m256 test_mm256_rcp14_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.256
+  return _mm256_rcp14_ps(__A); 
+}
+
+__m256 test_mm256_mask_rcp14_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.256
+  return _mm256_mask_rcp14_ps(__W, __U, __A); 
+}
+
+__m256 test_mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.256
+  return _mm256_maskz_rcp14_ps(__U, __A); 
+}
+
+__m128d test_mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X) {
+  // CHECK-LABEL: @test_mm_mask_permute_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_permute_pd(__W, __U, __X, 1); 
+}
+
+__m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) {
+  // CHECK-LABEL: @test_mm_maskz_permute_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_permute_pd(__U, __X, 1); 
+}
+
+__m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_mask_permute_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_permute_pd(__W, __U, __X, 5); 
+}
+
+__m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permute_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_permute_pd(__U, __X, 5); 
+}
+
+__m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) {
+  // CHECK-LABEL: @test_mm_mask_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_permute_ps(__W, __U, __X, 0x1b); 
+}
+
+__m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) {
+  // CHECK-LABEL: @test_mm_maskz_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_permute_ps(__U, __X, 0x1b); 
+}
+
+__m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) {
+  // CHECK-LABEL: @test_mm256_mask_permute_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_permute_ps(__W, __U, __X, 0x1b); 
+}
+
+__m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permute_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_permute_ps(__U, __X, 0x1b); 
+}
+
+__m128d test_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd
+  return _mm_mask_permutevar_pd(__W, __U, __A, __C); 
+}
+
+__m128d test_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd
+  return _mm_maskz_permutevar_pd(__U, __A, __C); 
+}
+
+__m256d test_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.256
+  return _mm256_mask_permutevar_pd(__W, __U, __A, __C); 
+}
+
+__m256d test_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.256
+  return _mm256_maskz_permutevar_pd(__U, __A, __C); 
+}
+
+__m128 test_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps
+  return _mm_mask_permutevar_ps(__W, __U, __A, __C); 
+}
+
+__m128 test_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps
+  return _mm_maskz_permutevar_ps(__U, __A, __C); 
+}
+
+__m256 test_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.256
+  return _mm256_mask_permutevar_ps(__W, __U, __A, __C); 
+}
+
+__m256 test_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.256
+  return _mm256_maskz_permutevar_ps(__U, __A, __C); 
+}
+// test_epi32_mask / test_epi64_mask tests: the plain and the __U-taking wrappers both expect the llvm.x86.avx512.ptestm.{d,q}.{128,256} intrinsic named in each pattern.
+__mmask8 test_mm_test_epi32_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.128
+  return _mm_test_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_test_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.128
+  return _mm_mask_test_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_test_epi32_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.256
+  return _mm256_test_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.256
+  return _mm256_mask_test_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_test_epi64_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.128
+  return _mm_test_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.128
+  return _mm_mask_test_epi64_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_test_epi64_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.256
+  return _mm256_test_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.256
+  return _mm256_mask_test_epi64_mask(__U, __A, __B); 
+}
+// testn_epi32_mask / testn_epi64_mask tests: the plain and the __U-taking wrappers both expect the llvm.x86.avx512.ptestnm.{d,q}.{128,256} intrinsic named in each pattern.
+__mmask8 test_mm_testn_epi32_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.128
+  return _mm_testn_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.128
+  return _mm_mask_testn_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_testn_epi32_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.256
+  return _mm256_testn_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.256
+  return _mm256_mask_testn_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_testn_epi64_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.128
+  return _mm_testn_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.128
+  return _mm_mask_testn_epi64_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_testn_epi64_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.256
+  return _mm256_testn_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.256
+  return _mm256_mask_testn_epi64_mask(__U, __A, __B); 
+}
+__m128i test_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // unpackhi tests (through test_mm256_maskz_unpackhi_epi64): expected IR is a shufflevector with the listed indices followed by a select
+  // CHECK-LABEL: @test_mm_mask_unpackhi_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_unpackhi_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_unpackhi_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_unpackhi_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_unpackhi_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_unpackhi_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_unpackhi_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_unpackhi_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_unpackhi_epi64(__U, __A, __B); 
+}
+// unpacklo tests: expected IR for every wrapper is a shufflevector with the listed indices followed by a select.
+__m128i test_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_unpacklo_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_unpacklo_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_unpacklo_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_unpacklo_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_unpacklo_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_unpacklo_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_unpacklo_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_unpacklo_epi64(__U, __A, __B); 
+}
+// sra/srai epi32 tests: sra wrappers pass the __m128i operand __B and expect mask.psra.d.*; srai wrappers pass the literal 5 and expect mask.psra.di.*.
+__m128i test_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.128
+  return _mm_mask_sra_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.128
+  return _mm_maskz_sra_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.256
+  return _mm256_mask_sra_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.256
+  return _mm256_maskz_sra_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.128
+  return _mm_mask_srai_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.128
+  return _mm_maskz_srai_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.256
+  return _mm256_mask_srai_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.256
+  return _mm256_maskz_srai_epi32(__U, __A, 5); 
+}
+// sra/srai epi64 tests: plain, mask and maskz wrappers all expect the mask.psra.q.* (operand __B) or mask.psra.qi.* (literal 5) intrinsic named in each pattern.
+__m128i test_mm_sra_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  return _mm_sra_epi64(__A, __B); 
+}
+
+__m128i test_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  return _mm_mask_sra_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  return _mm_maskz_sra_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_sra_epi64(__m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  return _mm256_sra_epi64(__A, __B); 
+}
+
+__m256i test_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  return _mm256_mask_sra_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  return _mm256_maskz_sra_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_srai_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  return _mm_srai_epi64(__A, 5); 
+}
+
+__m128i test_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  return _mm_mask_srai_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  return _mm_maskz_srai_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_srai_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  return _mm256_srai_epi64(__A, 5); 
+}
+
+__m256i test_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  return _mm256_mask_srai_epi64(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  return _mm256_maskz_srai_epi64(__U, __A, 5); 
+}
+// ternarylogic tests (immediate 4): plain and mask wrappers expect mask.pternlog.*, while maskz wrappers expect the separate maskz.pternlog.* intrinsic.
+__m128i test_mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.128
+  return _mm_ternarylogic_epi32(__A, __B, __C, 4); 
+}
+
+__m128i test_mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.128
+  return _mm_mask_ternarylogic_epi32(__A, __U, __B, __C, 4); 
+}
+
+__m128i test_mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.d.128
+  return _mm_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4); 
+}
+
+__m256i test_mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.256
+  return _mm256_ternarylogic_epi32(__A, __B, __C, 4); 
+}
+
+__m256i test_mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.256
+  return _mm256_mask_ternarylogic_epi32(__A, __U, __B, __C, 4); 
+}
+
+__m256i test_mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.d.256
+  return _mm256_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4); 
+}
+
+__m128i test_mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.128
+  return _mm_ternarylogic_epi64(__A, __B, __C, 4); 
+}
+
+__m128i test_mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.128
+  return _mm_mask_ternarylogic_epi64(__A, __U, __B, __C, 4); 
+}
+
+__m128i test_mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.q.128
+  return _mm_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4); 
+}
+
+__m256i test_mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.256
+  return _mm256_ternarylogic_epi64(__A, __B, __C, 4); 
+}
+
+__m256i test_mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.256
+  return _mm256_mask_ternarylogic_epi64(__A, __U, __B, __C, 4); 
+}
+
+__m256i test_mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.q.256
+  return _mm256_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4); 
+}
+__m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) { // shuffle_{f32x4,f64x2,i32x4,i64x2} tests (through test_mm256_maskz_shuffle_i64x2): all use immediate 3 and expect the mask.shuf.* intrinsic named in each pattern
+  // CHECK-LABEL: @test_mm256_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm256_shuffle_f32x4(__A, __B, 3); 
+}
+
+__m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); 
+}
+
+__m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); 
+}
+
+__m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm256_shuffle_f64x2(__A, __B, 3); 
+}
+
+__m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); 
+}
+
+__m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); 
+}
+
+__m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm256_shuffle_i32x4(__A, __B, 3); 
+}
+
+__m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); 
+}
+
+__m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); 
+}
+
+__m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm256_shuffle_i64x2(__A, __B, 3); 
+}
+
+__m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); 
+}
+
+__m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3); 
+}
+// shuffle_pd (immediate 3) and shuffle_ps (immediate 4) tests: expected IR is a shufflevector with the listed indices followed by a select.
+__m128d test_mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_shuffle_pd(__W, __U, __A, __B, 3); 
+}
+
+__m128d test_mm_maskz_shuffle_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_shuffle_pd(__U, __A, __B, 3); 
+}
+
+__m256d test_mm256_mask_shuffle_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_shuffle_pd(__W, __U, __A, __B, 3); 
+}
+
+__m256d test_mm256_maskz_shuffle_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_shuffle_pd(__U, __A, __B, 3); 
+}
+
+__m128 test_mm_mask_shuffle_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_shuffle_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_shuffle_ps(__W, __U, __A, __B, 4); 
+}
+
+__m128 test_mm_maskz_shuffle_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_shuffle_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_shuffle_ps(__U, __A, __B, 4); 
+}
+
+__m256 test_mm256_mask_shuffle_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_shuffle_ps(__W, __U, __A, __B, 4); 
+}
+
+__m256 test_mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_shuffle_ps(__U, __A, __B, 4); 
+}
+// rsqrt14 tests: plain, mask and maskz wrappers all expect the llvm.x86.avx512.rsqrt14.{pd,ps}.{128,256} intrinsic named in each pattern.
+__m128d test_mm_rsqrt14_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
+  return _mm_rsqrt14_pd(__A); 
+}
+
+__m128d test_mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
+  return _mm_mask_rsqrt14_pd(__W, __U, __A); 
+}
+
+__m128d test_mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
+  return _mm_maskz_rsqrt14_pd(__U, __A); 
+}
+
+__m256d test_mm256_rsqrt14_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.256
+  return _mm256_rsqrt14_pd(__A); 
+}
+
+__m256d test_mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.256
+  return _mm256_mask_rsqrt14_pd(__W, __U, __A); 
+}
+
+__m256d test_mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.256
+  return _mm256_maskz_rsqrt14_pd(__U, __A); 
+}
+
+__m128 test_mm_rsqrt14_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.128
+  return _mm_rsqrt14_ps(__A); 
+}
+
+__m128 test_mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.128
+  return _mm_mask_rsqrt14_ps(__W, __U, __A); 
+}
+
+__m128 test_mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.128
+  return _mm_maskz_rsqrt14_ps(__U, __A); 
+}
+
+__m256 test_mm256_rsqrt14_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.256
+  return _mm256_rsqrt14_ps(__A); 
+}
+
+__m256 test_mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.256
+  return _mm256_mask_rsqrt14_ps(__W, __U, __A); 
+}
+
+__m256 test_mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.256
+  return _mm256_maskz_rsqrt14_ps(__U, __A); 
+}
+// broadcast_f32x4 / broadcast_i32x4 tests: plain, mask and maskz wrappers all expect the mask.broadcastf32x4 or mask.broadcasti32x4 intrinsic named in each pattern.
+__m256 test_mm256_broadcast_f32x4(__m128 __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm256_broadcast_f32x4(__A); 
+}
+
+__m256 test_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm256_mask_broadcast_f32x4(__O, __M, __A); 
+}
+
+__m256 test_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm256_maskz_broadcast_f32x4(__M, __A); 
+}
+
+__m256i test_mm256_broadcast_i32x4(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm256_broadcast_i32x4(__A); 
+}
+
+__m256i test_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm256_mask_broadcast_i32x4(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm256_maskz_broadcast_i32x4(__M, __A); 
+}
+// broadcastsd / broadcastss tests: expected IR is a shufflevector with a zeroinitializer index vector followed by a select.
+__m256d test_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_broadcastsd_pd(__O, __M, __A);
+}
+
+__m256d test_mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_broadcastsd_pd(__M, __A);
+}
+
+__m128 test_mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_broadcastss_ps(__O, __M, __A);
+}
+
+__m128 test_mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_broadcastss_ps(__M, __A);
+}
+
+__m256 test_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_broadcastss_ps(__O, __M, __A);
+}
+
+__m256 test_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_broadcastss_ps(__M, __A);
+}
+// broadcastd_epi32 / broadcastq_epi64 tests: expected IR is a shufflevector with a zeroinitializer index vector followed by a select.
+__m128i test_mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_broadcastd_epi32(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_broadcastd_epi32(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_broadcastd_epi32(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_broadcastd_epi32(__M, __A);
+}
+
+__m128i test_mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_broadcastq_epi64(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_broadcastq_epi64(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_broadcastq_epi64(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_broadcastq_epi64(__M, __A);
+}
+// 128-bit cvtsepi32_epi8 tests: plain, mask and maskz wrappers all expect llvm.x86.avx512.mask.pmovs.db.128.
+__m128i test_mm_cvtsepi32_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.128
+  return _mm_cvtsepi32_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.128
+  return _mm_mask_cvtsepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.128
+  return _mm_maskz_cvtsepi32_epi8(__M, __A); 
+}
+// This wrapper is declared void, so the intrinsic is invoked as a plain statement; ISO C forbids `return <expr>;` in a void function.
+void test_mm_mask_cvtsepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.128
+  _mm_mask_cvtsepi32_storeu_epi8(__P, __M, __A);
+}
+
+__m128i test_mm256_cvtsepi32_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.256
+  return _mm256_cvtsepi32_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.256
+  return _mm256_mask_cvtsepi32_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.db.256.
+}
+
+__m128i test_mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.256
+  return _mm256_maskz_cvtsepi32_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.db.256.
+}
+
+void test_mm256_mask_cvtsepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.256
+  _mm256_mask_cvtsepi32_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtsepi32_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.128
+  return _mm_cvtsepi32_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.dw.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.128
+  return _mm_mask_cvtsepi32_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.dw.128.
+}
+
+__m128i test_mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.128
+  return _mm_maskz_cvtsepi32_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.dw.128.
+}
+
+void test_mm_mask_cvtsepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.128
+  _mm_mask_cvtsepi32_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtsepi32_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.256
+  return _mm256_cvtsepi32_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.dw.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.256
+  return _mm256_mask_cvtsepi32_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.dw.256.
+}
+
+__m128i test_mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.256
+  return _mm256_maskz_cvtsepi32_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.dw.256.
+}
+
+void test_mm256_mask_cvtsepi32_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.256
+  _mm256_mask_cvtsepi32_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtsepi64_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.128
+  return _mm_cvtsepi64_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.qb.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.128
+  return _mm_mask_cvtsepi64_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.qb.128.
+}
+
+__m128i test_mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.128
+  return _mm_maskz_cvtsepi64_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.qb.128.
+}
+
+void test_mm_mask_cvtsepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.128
+  _mm_mask_cvtsepi64_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtsepi64_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.256
+  return _mm256_cvtsepi64_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.qb.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.256
+  return _mm256_mask_cvtsepi64_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.qb.256.
+}
+
+__m128i test_mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.256
+  return _mm256_maskz_cvtsepi64_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.qb.256.
+}
+
+void test_mm256_mask_cvtsepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.256
+  _mm256_mask_cvtsepi64_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtsepi64_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.128
+  return _mm_cvtsepi64_epi32(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.qd.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.128
+  return _mm_mask_cvtsepi64_epi32(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.qd.128.
+}
+
+__m128i test_mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.128
+  return _mm_maskz_cvtsepi64_epi32(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.qd.128.
+}
+
+void test_mm_mask_cvtsepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.128
+  _mm_mask_cvtsepi64_storeu_epi32(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtsepi64_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.256
+  return _mm256_cvtsepi64_epi32(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.qd.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.256
+  return _mm256_mask_cvtsepi64_epi32(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.qd.256.
+}
+
+__m128i test_mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.256
+  return _mm256_maskz_cvtsepi64_epi32(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.qd.256.
+}
+
+void test_mm256_mask_cvtsepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.256
+  _mm256_mask_cvtsepi64_storeu_epi32(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtsepi64_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.128
+  return _mm_cvtsepi64_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.qw.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.128
+  return _mm_mask_cvtsepi64_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.qw.128.
+}
+
+__m128i test_mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.128
+  return _mm_maskz_cvtsepi64_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.qw.128.
+}
+
+void test_mm_mask_cvtsepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.128
+  _mm_mask_cvtsepi64_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtsepi64_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.256
+  return _mm256_cvtsepi64_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovs.qw.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.256
+  return _mm256_mask_cvtsepi64_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovs.qw.256.
+}
+
+__m128i test_mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.256
+  return _mm256_maskz_cvtsepi64_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovs.qw.256.
+}
+
+void test_mm256_mask_cvtsepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.256
+  _mm256_mask_cvtsepi64_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtusepi32_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.128
+  return _mm_cvtusepi32_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.db.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.128
+  return _mm_mask_cvtusepi32_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.db.128.
+}
+
+__m128i test_mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.128
+  return _mm_maskz_cvtusepi32_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.db.128.
+}
+
+void test_mm_mask_cvtusepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.128
+  _mm_mask_cvtusepi32_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtusepi32_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.256
+  return _mm256_cvtusepi32_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.db.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.256
+  return _mm256_mask_cvtusepi32_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.db.256.
+}
+
+__m128i test_mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.256
+  return _mm256_maskz_cvtusepi32_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.db.256.
+}
+
+void test_mm256_mask_cvtusepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.256
+  _mm256_mask_cvtusepi32_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtusepi32_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.128
+  return _mm_cvtusepi32_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.dw.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.128
+  return _mm_mask_cvtusepi32_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.dw.128.
+}
+
+__m128i test_mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.128
+  return _mm_maskz_cvtusepi32_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.dw.128.
+}
+
+void test_mm_mask_cvtusepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.128
+  _mm_mask_cvtusepi32_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtusepi32_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.256
+  return _mm256_cvtusepi32_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.dw.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.256
+  return _mm256_mask_cvtusepi32_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.dw.256.
+}
+
+__m128i test_mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.256
+  return _mm256_maskz_cvtusepi32_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.dw.256.
+}
+
+void test_mm256_mask_cvtusepi32_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.256
+  _mm256_mask_cvtusepi32_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtusepi64_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.128
+  return _mm_cvtusepi64_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.qb.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.128
+  return _mm_mask_cvtusepi64_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.qb.128.
+}
+
+__m128i test_mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.128
+  return _mm_maskz_cvtusepi64_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.qb.128.
+}
+
+void test_mm_mask_cvtusepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.128
+  _mm_mask_cvtusepi64_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtusepi64_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.256
+  return _mm256_cvtusepi64_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.qb.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.256
+  return _mm256_mask_cvtusepi64_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.qb.256.
+}
+
+__m128i test_mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.256
+  return _mm256_maskz_cvtusepi64_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.qb.256.
+}
+
+void test_mm256_mask_cvtusepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.256
+  _mm256_mask_cvtusepi64_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtusepi64_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.128
+  return _mm_cvtusepi64_epi32(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.qd.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.128
+  return _mm_mask_cvtusepi64_epi32(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.qd.128.
+}
+
+__m128i test_mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.128
+  return _mm_maskz_cvtusepi64_epi32(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.qd.128.
+}
+
+void test_mm_mask_cvtusepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.128
+  _mm_mask_cvtusepi64_storeu_epi32(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtusepi64_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.256
+  return _mm256_cvtusepi64_epi32(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.qd.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.256
+  return _mm256_mask_cvtusepi64_epi32(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.qd.256.
+}
+
+__m128i test_mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.256
+  return _mm256_maskz_cvtusepi64_epi32(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.qd.256.
+}
+
+void test_mm256_mask_cvtusepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.256
+  _mm256_mask_cvtusepi64_storeu_epi32(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtusepi64_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.128
+  return _mm_cvtusepi64_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.qw.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.128
+  return _mm_mask_cvtusepi64_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.qw.128.
+}
+
+__m128i test_mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.128
+  return _mm_maskz_cvtusepi64_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.qw.128.
+}
+
+void test_mm_mask_cvtusepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.128
+  _mm_mask_cvtusepi64_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtusepi64_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.256
+  return _mm256_cvtusepi64_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmovus.qw.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.256
+  return _mm256_mask_cvtusepi64_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmovus.qw.256.
+}
+
+__m128i test_mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.256
+  return _mm256_maskz_cvtusepi64_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmovus.qw.256.
+}
+
+void test_mm256_mask_cvtusepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.256
+  _mm256_mask_cvtusepi64_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtepi32_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  return _mm_cvtepi32_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmov.db.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  return _mm_mask_cvtepi32_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.db.128.
+}
+
+__m128i test_mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  return _mm_maskz_cvtepi32_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.db.128.
+}
+
+void test_mm_mask_cvtepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.128
+  _mm_mask_cvtepi32_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtepi32_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  return _mm256_cvtepi32_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmov.db.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  return _mm256_mask_cvtepi32_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.db.256.
+}
+
+__m128i test_mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  return _mm256_maskz_cvtepi32_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.db.256.
+}
+
+void test_mm256_mask_cvtepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.256
+  _mm256_mask_cvtepi32_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtepi32_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  return _mm_cvtepi32_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmov.dw.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  return _mm_mask_cvtepi32_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.dw.128.
+}
+
+__m128i test_mm_maskz_cvtepi32_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  return _mm_maskz_cvtepi32_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.dw.128.
+}
+
+void test_mm_mask_cvtepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.128
+  _mm_mask_cvtepi32_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtepi32_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+  return _mm256_cvtepi32_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmov.dw.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+  return _mm256_mask_cvtepi32_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.dw.256.
+}
+
+__m128i test_mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+  return _mm256_maskz_cvtepi32_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.dw.256.
+}
+
+void test_mm256_mask_cvtepi32_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.256
+  _mm256_mask_cvtepi32_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtepi64_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  return _mm_cvtepi64_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmov.qb.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  return _mm_mask_cvtepi64_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.qb.128.
+}
+
+__m128i test_mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  return _mm_maskz_cvtepi64_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.qb.128.
+}
+
+void test_mm_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.128
+  _mm_mask_cvtepi64_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtepi64_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  return _mm256_cvtepi64_epi8(__A); // Unmasked wrapper; expected IR still names the mask.pmov.qb.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  return _mm256_mask_cvtepi64_epi8(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.qb.256.
+}
+
+__m128i test_mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  return _mm256_maskz_cvtepi64_epi8(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.qb.256.
+}
+
+void test_mm256_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.256
+  _mm256_mask_cvtepi64_storeu_epi8(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtepi64_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  return _mm_cvtepi64_epi32(__A); // Unmasked wrapper; expected IR still names the mask.pmov.qd.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  return _mm_mask_cvtepi64_epi32(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.qd.128.
+}
+
+__m128i test_mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  return _mm_maskz_cvtepi64_epi32(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.qd.128.
+}
+
+void test_mm_mask_cvtepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.128
+  _mm_mask_cvtepi64_storeu_epi32(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtepi64_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+  return _mm256_cvtepi64_epi32(__A); // Unmasked wrapper; expected IR still names the mask.pmov.qd.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+  return _mm256_mask_cvtepi64_epi32(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.qd.256.
+}
+
+__m128i test_mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+  return _mm256_maskz_cvtepi64_epi32(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.qd.256.
+}
+
+void test_mm256_mask_cvtepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.256
+  _mm256_mask_cvtepi64_storeu_epi32(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm_cvtepi64_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  return _mm_cvtepi64_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmov.qw.128 intrinsic.
+}
+
+__m128i test_mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  return _mm_mask_cvtepi64_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.qw.128.
+}
+
+__m128i test_mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  return _mm_maskz_cvtepi64_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.qw.128.
+}
+
+void test_mm_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.128
+  _mm_mask_cvtepi64_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128i test_mm256_cvtepi64_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  return _mm256_cvtepi64_epi16(__A); // Unmasked wrapper; expected IR still names the mask.pmov.qw.256 intrinsic.
+}
+
+__m128i test_mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  return _mm256_mask_cvtepi64_epi16(__O, __M, __A); // Mask wrapper given __O, __M, __A; expected IR names mask.pmov.qw.256.
+}
+
+__m128i test_mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  return _mm256_maskz_cvtepi64_epi16(__M, __A); // Maskz wrapper given only __M, __A; expected IR names mask.pmov.qw.256.
+}
+
+void test_mm256_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.256
+  _mm256_mask_cvtepi64_storeu_epi16(__P, __M, __A); // Plain call statement: ISO C forbids 'return <expr>;' in a void function.
+}
+
+__m128 test_mm256_extractf32x4_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_extractf32x4_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  return _mm256_extractf32x4_ps(__A, 1); // Unmasked wrapper called with constant 1; expected IR still names mask.vextractf32x4.
+}
+
+__m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_extractf32x4_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1); // Mask wrapper given __W, __U, __A and constant 1; expected IR names mask.vextractf32x4.
+}
+
+__m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extractf32x4_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  return _mm256_maskz_extractf32x4_ps(__U, __A, 1); // Maskz wrapper given __U, __A and constant 1; expected IR names mask.vextractf32x4.
+}
+
+__m128i test_mm256_extracti32x4_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm256_extracti32x4_epi32(__A, 1); // Unmasked wrapper called with constant 1; expected IR still names mask.vextracti32x4.
+}
+
+__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1); // Mask wrapper given __W, __U, __A and constant 1; expected IR names mask.vextracti32x4.
+}
+
+__m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm256_maskz_extracti32x4_epi32(__U, __A, 1); // Maskz wrapper given __U, __A and constant 1; expected IR names mask.vextracti32x4.
+}
+
+__m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm256_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm256_insertf32x4(__A, __B, 1); // Unmasked wrapper called with constant 1; expected IR still names mask.insertf32x4.
+}
+
+__m256 test_mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm256_mask_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm256_mask_insertf32x4(__W, __U, __A, __B, 1); // Mask wrapper given __W, __U, __A, __B and constant 1; expected IR names mask.insertf32x4.
+}
+
+__m256 test_mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm256_maskz_insertf32x4(__U, __A, __B, 1); // Maskz wrapper given __U, __A, __B and constant 1; expected IR names mask.insertf32x4.
+}
+
+__m256i test_mm256_inserti32x4(__m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm256_inserti32x4(__A, __B, 1); // Unmasked wrapper called with constant 1; expected IR still names mask.inserti32x4.
+}
+
+__m256i test_mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm256_mask_inserti32x4(__W, __U, __A, __B, 1); // Mask wrapper given __W, __U, __A, __B and constant 1; expected IR names mask.inserti32x4.
+}
+
+__m256i test_mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm256_maskz_inserti32x4(__U, __A, __B, 1); // Maskz wrapper given __U, __A, __B and constant 1; expected IR names mask.inserti32x4.
+}
+
+__m128d test_mm_getmant_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.128
+  return _mm_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Unmasked wrapper with the p5_2 / nan selectors; expected IR still names mask.getmant.pd.128.
+}
+
+__m128d test_mm_mask_getmant_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.128
+  return _mm_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Mask wrapper given __W, __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.pd.128.
+}
+
+__m128d test_mm_maskz_getmant_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.128
+  return _mm_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Maskz wrapper given __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.pd.128.
+}
+
+__m256d test_mm256_getmant_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.256
+  return _mm256_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Unmasked wrapper with the p5_2 / nan selectors; expected IR still names mask.getmant.pd.256.
+}
+
+__m256d test_mm256_mask_getmant_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.256
+  return _mm256_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Mask wrapper given __W, __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.pd.256.
+}
+
+__m256d test_mm256_maskz_getmant_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.256
+  return _mm256_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Maskz wrapper given __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.pd.256.
+}
+
+__m128 test_mm_getmant_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.128
+  return _mm_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Unmasked wrapper with the p5_2 / nan selectors; expected IR still names mask.getmant.ps.128.
+}
+
+__m128 test_mm_mask_getmant_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.128
+  return _mm_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Mask wrapper given __W, __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.ps.128.
+}
+
+__m128 test_mm_maskz_getmant_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.128
+  return _mm_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Maskz wrapper given __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.ps.128.
+}
+
+__m256 test_mm256_getmant_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.256
+  return _mm256_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Unmasked wrapper with the p5_2 / nan selectors; expected IR still names mask.getmant.ps.256.
+}
+
+__m256 test_mm256_mask_getmant_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.256
+  return _mm256_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Mask wrapper given __W, __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.ps.256.
+}
+
+__m256 test_mm256_maskz_getmant_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.256
+  return _mm256_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); // Maskz wrapper given __U, __A plus the p5_2 / nan selectors; expected IR names mask.getmant.ps.256.
+}
+
+__m128d test_mm_mmask_i64gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather3div2.df
+  return _mm_mmask_i64gather_pd(__v1_old, __mask, __index, __addr, 2); // Expected IR names gather3div2.df; trailing constant 2 is presumably the scale - TODO confirm.
+}
+
+__m128i test_mm_mmask_i64gather_epi64(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3div2.di
+  return _mm_mmask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2); // Expected IR names gather3div2.di; trailing constant 2 is presumably the scale - TODO confirm.
+}
+
+__m256d test_mm256_mmask_i64gather_pd(__m256d __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather3div4.df
+  return _mm256_mmask_i64gather_pd(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m256i test_mm256_mmask_i64gather_epi64(__m256i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3div4.di
+  return _mm256_mmask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128 test_mm_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather3div4.sf
+  return _mm_mmask_i64gather_ps(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128i test_mm_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3div4.si
+  return _mm_mmask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128 test_mm256_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather3div8.sf
+  return _mm256_mmask_i64gather_ps(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128i test_mm256_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3div8.si
+  return _mm256_mmask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128d test_mm_mask_i32gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather3siv2.df
+  return _mm_mmask_i32gather_pd(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128i test_mm_mask_i32gather_epi64(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3siv2.di
+  return _mm_mmask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m256d test_mm256_mask_i32gather_pd(__m256d __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather3siv4.df
+  return _mm256_mmask_i32gather_pd(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m256i test_mm256_mask_i32gather_epi64(__m256i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3siv4.di
+  return _mm256_mmask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128 test_mm_mask_i32gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather3siv4.sf
+  return _mm_mmask_i32gather_ps(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m128i test_mm_mask_i32gather_epi32(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3siv4.si
+  return _mm_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m256 test_mm256_mask_i32gather_ps(__m256 __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather3siv8.sf
+  return _mm256_mmask_i32gather_ps(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3siv8.si
+  return _mm256_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2);
+}
+
+__m256d test_mm256_permutex_pd(__m256d __X) {
+  // CHECK-LABEL: @test_mm256_permutex_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  return _mm256_permutex_pd(__X, 3);
+}
+
+__m256d test_mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_mask_permutex_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_permutex_pd(__W, __U, __X, 1);
+}
+
+__m256d test_mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permutex_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_permutex_pd(__U, __X, 1);
+}
+
+__m256i test_mm256_permutex_epi64(__m256i __X) {
+  // CHECK-LABEL: @test_mm256_permutex_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  return _mm256_permutex_epi64(__X, 3);
+}
+
+__m256i test_mm256_mask_permutex_epi64(__m256i __W, __mmask8 __M, __m256i __X) {
+  // CHECK-LABEL: @test_mm256_mask_permutex_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_permutex_epi64(__W, __M, __X, 3);
+}
+
+__m256i test_mm256_maskz_permutex_epi64(__mmask8 __M, __m256i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permutex_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_permutex_epi64(__M, __X, 3);
+}
+
+__m256d test_mm256_permutexvar_pd(__m256i __X, __m256d __Y) {
+  // CHECK-LABEL: @test_mm256_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+  return _mm256_permutexvar_pd(__X, __Y);
+}
+
+__m256d test_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, __m256d __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+  return _mm256_mask_permutexvar_pd(__W, __U, __X, __Y);
+}
+
+__m256d test_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+  return _mm256_maskz_permutexvar_pd(__U, __X, __Y);
+}
+
+__m256i test_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.256
+  return _mm256_maskz_permutexvar_epi64(__M, __X, __Y);
+}
+
+__m256i test_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.256
+  return _mm256_mask_permutexvar_epi64(__W, __M, __X, __Y);
+}
+
+__m256 test_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+  return _mm256_mask_permutexvar_ps(__W, __U, __X, __Y);
+}
+
+__m256 test_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+  return _mm256_maskz_permutexvar_ps(__U, __X, __Y);
+}
+
+__m256 test_mm256_permutexvar_ps(__m256i __X, __m256 __Y) {
+  // CHECK-LABEL: @test_mm256_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+  return _mm256_permutexvar_ps( __X, __Y);
+}
+
+__m256i test_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+  return _mm256_maskz_permutexvar_epi32(__M, __X, __Y);
+}
+
+__m256i test_mm256_permutexvar_epi32(__m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+  return _mm256_permutexvar_epi32(__X, __Y);
+}
+
+__m256i test_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+  return _mm256_mask_permutexvar_epi32(__W, __M, __X, __Y);
+}
+
+__m128i test_mm_alignr_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  return _mm_alignr_epi32(__A, __B, 1);
+}
+
+__m128i test_mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  return _mm_mask_alignr_epi32(__W, __U, __A, __B, 1);
+}
+
+__m128i test_mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  return _mm_maskz_alignr_epi32(__U, __A, __B, 1);
+}
+
+__m256i test_mm256_alignr_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  return _mm256_alignr_epi32(__A, __B, 1);
+}
+
+__m256i test_mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  return _mm256_mask_alignr_epi32(__W, __U, __A, __B, 1);
+}
+
+__m256i test_mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  return _mm256_maskz_alignr_epi32(__U, __A, __B, 1);
+}
+
+__m128i test_mm_alignr_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  return _mm_alignr_epi64(__A, __B, 1);
+}
+
+__m128i test_mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  return _mm_mask_alignr_epi64(__W, __U, __A, __B, 1);
+}
+
+__m128i test_mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  return _mm_maskz_alignr_epi64(__U, __A, __B, 1);
+}
+
+__m256i test_mm256_alignr_epi64(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  return _mm256_alignr_epi64(__A, __B, 1);
+}
+
+__m256i test_mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  return _mm256_mask_alignr_epi64(__W, __U, __A, __B, 1);
+}
+
+__m256i test_mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  return _mm256_maskz_alignr_epi64(__U, __A, __B, 1);
+}
+
+__m128 test_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_movehdup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_movehdup_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_movehdup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_movehdup_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_movehdup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_movehdup_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_movehdup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_movehdup_ps(__U, __A);
+}
+
+__m128 test_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_moveldup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_moveldup_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_moveldup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_moveldup_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_moveldup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_moveldup_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_moveldup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_moveldup_ps(__U, __A);
+}
+
+__m128i test_mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_shuffle_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_shuffle_epi32(__W, __U, __A, 1);
+}
+
+__m128i test_mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_shuffle_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_shuffle_epi32(__U, __A, 2);
+}
+
+__m256i test_mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_shuffle_epi32(__W, __U, __A, 2);
+}
+
+__m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_shuffle_epi32(__U, __A, 2);
+}
+
+__m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_pd
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_mov_pd(__W, __U, __A);
+}
+
+__m128d test_mm_maskz_mov_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_pd
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_mov_pd(__U, __A);
+}
+
+__m256d test_mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_pd
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_mov_pd(__W, __U, __A);
+}
+
+__m256d test_mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_pd
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_mov_pd(__U, __A);
+}
+
+__m128 test_mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_ps
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_mov_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_mov_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_ps
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_mov_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_ps
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_mov_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_ps
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_mov_ps(__U, __A);
+}
+
+__m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128
+  return _mm_mask_cvtph_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128
+  return _mm_maskz_cvtph_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256
+  return _mm256_mask_cvtph_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256
+  return _mm256_maskz_cvtph_ps(__U, __A);
+}
+
+__m128i test_mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_mask_cvtps_ph(__W, __U, __A);
+}
+
+__m128i test_mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_maskz_cvtps_ph(__U, __A);
+}
+
+__m128i test_mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_mask_cvtps_ph(__W, __U, __A);
+}
+
+__m128i test_mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_maskz_cvtps_ph(__U, __A);
+}
+
+__m128i test_mm_mask_cvt_roundps_ph(__m128i __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128i test_mm_maskz_cvt_roundps_ph(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128i test_mm256_mask_cvt_roundps_ph(__m128i __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128i test_mm256_maskz_cvt_roundps_ph(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__mmask8 test_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_epi32_mask
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpeq_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpeq_epi32_mask
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpeq_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpeq_epi64_mask
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpeq_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_epi64_mask
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpeq_epi64_mask(__a, __b);
+}
+
+__mmask8 test_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_epi32_mask
+  // CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpgt_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpgt_epi32_mask
+  // CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpgt_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpgt_epi64_mask
+  // CHECK: icmp sgt <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpgt_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_epi64_mask
+  // CHECK: icmp sgt <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpgt_epi64_mask(__a, __b);
+}
+
+__mmask8 test_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpeq_epi32_mask
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpeq_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpeq_epi32_mask
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpeq_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpeq_epi64_mask
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpeq_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpeq_epi64_mask
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpeq_epi64_mask(__a, __b);
+}
+
+__mmask8 test_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpgt_epi32_mask
+  // CHECK: icmp sgt <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpgt_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpgt_epi32_mask
+  // CHECK: icmp sgt <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpgt_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpgt_epi64_mask
+  // CHECK: icmp sgt <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpgt_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpgt_epi64_mask
+  // CHECK: icmp sgt <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpgt_epi64_mask(__a, __b);
+}
diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c
index 11155f6..fd3f6f5 100644
--- a/test/CodeGen/avx512vlbw-builtins.c
+++ b/test/CodeGen/avx512vlbw-builtins.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -8,674 +8,730 @@
 
 __mmask32 test_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.256
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask32 test_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.256
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.128
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask16 test_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.128
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.256
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask16 test_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.256
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.128
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.128
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.256
+  // CHECK: icmp sgt <32 x i8> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask32 test_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.256
+  // CHECK: icmp sgt <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.128
+  // CHECK: icmp sgt <16 x i8> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask16 test_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.128
+  // CHECK: icmp sgt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.256
+  // CHECK: icmp sgt <16 x i16> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask16 test_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.256
+  // CHECK: icmp sgt <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.128
+  // CHECK: icmp sgt <8 x i16> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.128
+  // CHECK: icmp sgt <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 0, i16 -1)
-  return (__mmask64)_mm_cmpeq_epu8_mask(__a, __b);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpeq_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpeq_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 0, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpeq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 0, i8 -1)
-  return (__mmask32)_mm_cmpeq_epu16_mask(__a, __b);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpeq_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpeq_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 0, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpeq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 0, i32 -1)
-  return (__mmask64)_mm256_cmpeq_epu8_mask(__a, __b);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpeq_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpeq_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 0, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpeq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 0, i16 -1)
-  return (__mmask32)_mm256_cmpeq_epu16_mask(__a, __b);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpeq_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpeq_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 0, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpeq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 6, i16 -1)
-  return (__mmask64)_mm_cmpgt_epu8_mask(__a, __b);
+  // CHECK: icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpgt_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpgt_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 6, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpgt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 6, i8 -1)
-  return (__mmask32)_mm_cmpgt_epu16_mask(__a, __b);
+  // CHECK: icmp ugt <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpgt_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpgt_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 6, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpgt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ugt <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 6, i32 -1)
-  return (__mmask64)_mm256_cmpgt_epu8_mask(__a, __b);
+  // CHECK: icmp ugt <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpgt_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpgt_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 6, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpgt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ugt <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 6, i16 -1)
-  return (__mmask32)_mm256_cmpgt_epu16_mask(__a, __b);
+  // CHECK: icmp ugt <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpgt_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpgt_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 6, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpgt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ugt <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 -1)
-  return (__mmask64)_mm_cmpge_epi8_mask(__a, __b);
+  // CHECK: icmp sge <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpge_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpge_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpge_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sge <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 -1)
-  return (__mmask64)_mm_cmpge_epu8_mask(__a, __b);
+  // CHECK: icmp uge <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpge_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpge_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpge_epu8_mask(__u, __a, __b);
+  // CHECK: icmp uge <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 -1)
-  return (__mmask32)_mm_cmpge_epi16_mask(__a, __b);
+  // CHECK: icmp sge <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpge_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpge_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpge_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sge <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 -1)
-  return (__mmask32)_mm_cmpge_epu16_mask(__a, __b);
+  // CHECK: icmp uge <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpge_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpge_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpge_epu16_mask(__u, __a, __b);
+  // CHECK: icmp uge <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 -1)
-  return (__mmask64)_mm256_cmpge_epi8_mask(__a, __b);
+  // CHECK: icmp sge <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpge_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpge_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpge_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sge <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 -1)
-  return (__mmask64)_mm256_cmpge_epu8_mask(__a, __b);
+  // CHECK: icmp uge <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpge_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpge_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpge_epu8_mask(__u, __a, __b);
+  // CHECK: icmp uge <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 -1)
-  return (__mmask32)_mm256_cmpge_epi16_mask(__a, __b);
+  // CHECK: icmp sge <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpge_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpge_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpge_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sge <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 -1)
-  return (__mmask32)_mm256_cmpge_epu16_mask(__a, __b);
+  // CHECK: icmp uge <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpge_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpge_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpge_epu16_mask(__u, __a, __b);
+  // CHECK: icmp uge <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 -1)
-  return (__mmask64)_mm_cmple_epi8_mask(__a, __b);
+  // CHECK: icmp sle <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmple_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmple_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmple_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sle <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 -1)
-  return (__mmask64)_mm_cmple_epu8_mask(__a, __b);
+  // CHECK: icmp ule <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmple_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmple_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmple_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ule <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 -1)
-  return (__mmask32)_mm_cmple_epi16_mask(__a, __b);
+  // CHECK: icmp sle <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmple_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmple_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmple_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sle <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 -1)
-  return (__mmask32)_mm_cmple_epu16_mask(__a, __b);
+  // CHECK: icmp ule <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmple_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmple_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmple_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ule <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 -1)
-  return (__mmask64)_mm256_cmple_epi8_mask(__a, __b);
+  // CHECK: icmp sle <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmple_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmple_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmple_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sle <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 -1)
-  return (__mmask64)_mm256_cmple_epu8_mask(__a, __b);
+  // CHECK: icmp ule <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmple_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmple_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmple_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ule <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 -1)
-  return (__mmask32)_mm256_cmple_epi16_mask(__a, __b);
+  // CHECK: icmp sle <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmple_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmple_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmple_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sle <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 -1)
-  return (__mmask32)_mm256_cmple_epu16_mask(__a, __b);
+  // CHECK: icmp ule <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmple_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmple_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmple_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ule <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 -1)
-  return (__mmask64)_mm_cmplt_epi8_mask(__a, __b);
+  // CHECK: icmp slt <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmplt_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmplt_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmplt_epi8_mask(__u, __a, __b);
+  // CHECK: icmp slt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 -1)
-  return (__mmask64)_mm_cmplt_epu8_mask(__a, __b);
+  // CHECK: icmp ult <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmplt_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmplt_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmplt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ult <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 -1)
-  return (__mmask32)_mm_cmplt_epi16_mask(__a, __b);
+  // CHECK: icmp slt <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmplt_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmplt_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmplt_epi16_mask(__u, __a, __b);
+  // CHECK: icmp slt <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 -1)
-  return (__mmask32)_mm_cmplt_epu16_mask(__a, __b);
+  // CHECK: icmp ult <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmplt_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmplt_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmplt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ult <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 -1)
-  return (__mmask64)_mm256_cmplt_epi8_mask(__a, __b);
+  // CHECK: icmp slt <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmplt_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmplt_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmplt_epi8_mask(__u, __a, __b);
+  // CHECK: icmp slt <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 -1)
-  return (__mmask64)_mm256_cmplt_epu8_mask(__a, __b);
+  // CHECK: icmp ult <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmplt_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmplt_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmplt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ult <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 -1)
-  return (__mmask32)_mm256_cmplt_epi16_mask(__a, __b);
+  // CHECK: icmp slt <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmplt_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmplt_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmplt_epi16_mask(__u, __a, __b);
+  // CHECK: icmp slt <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 -1)
-  return (__mmask32)_mm256_cmplt_epu16_mask(__a, __b);
+  // CHECK: icmp ult <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmplt_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmplt_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmplt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ult <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 -1)
-  return (__mmask64)_mm_cmpneq_epi8_mask(__a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpneq_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpneq_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpneq_epi8_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 -1)
-  return (__mmask64)_mm_cmpneq_epu8_mask(__a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpneq_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpneq_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpneq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 -1)
-  return (__mmask32)_mm_cmpneq_epi16_mask(__a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpneq_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpneq_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpneq_epi16_mask(__u, __a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 -1)
-  return (__mmask32)_mm_cmpneq_epu16_mask(__a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpneq_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpneq_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpneq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 -1)
-  return (__mmask64)_mm256_cmpneq_epi8_mask(__a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpneq_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpneq_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpneq_epi8_mask(__u, __a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 -1)
-  return (__mmask64)_mm256_cmpneq_epu8_mask(__a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpneq_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpneq_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpneq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 -1)
-  return (__mmask32)_mm256_cmpneq_epi16_mask(__a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpneq_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpneq_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpneq_epi16_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 -1)
-  return (__mmask32)_mm256_cmpneq_epu16_mask(__a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpneq_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpneq_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpneq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 -1)
-  return (__mmask64)_mm_cmp_epi8_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm_mask_cmp_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmp_epi8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm_cmp_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 -1)
-  return (__mmask64)_mm_cmp_epu8_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmp_epu8_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm_mask_cmp_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmp_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmp_epu8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 -1)
-  return (__mmask32)_mm_cmp_epi16_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi16_mask(__a, __b, 0);
 }
 
-__mmask8 test_mm_mask_cmp_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmp_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmp_epi16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 -1)
-  return (__mmask32)_mm_cmp_epu16_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu16_mask(__a, __b, 0);
 }
 
-__mmask8 test_mm_mask_cmp_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmp_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmp_epu16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm256_cmp_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 -1)
-  return (__mmask64)_mm256_cmp_epi8_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmp_epi8_mask(__a, __b, 0);
 }
 
-__mmask32 test_mm256_mask_cmp_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmp_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmp_epi8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm256_cmp_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 -1)
-  return (__mmask64)_mm256_cmp_epu8_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmp_epu8_mask(__a, __b, 0);
 }
 
-__mmask32 test_mm256_mask_cmp_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmp_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmp_epu8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm256_cmp_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 -1)
-  return (__mmask32)_mm256_cmp_epi16_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmp_epi16_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm256_mask_cmp_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmp_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmp_epi16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm256_cmp_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 -1)
-  return (__mmask32)_mm256_cmp_epu16_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmp_epu16_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm256_mask_cmp_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmp_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmp_epu16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 
@@ -800,24 +856,24 @@
 
 __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi8
-  // CHECK: @llvm.x86.avx512.mask.blend.b.128
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_blend_epi8(__U,__A,__W); 
 }
 __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi8
-  // CHECK: @llvm.x86.avx512.mask.blend.b.256
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_blend_epi8(__U,__A,__W); 
 }
 
 __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi16
-  // CHECK: @llvm.x86.avx512.mask.blend.w.128
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_blend_epi16(__U,__A,__W); 
 }
 
 __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi16
-  // CHECK: @llvm.x86.avx512.mask.blend.w.256
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_blend_epi16(__U,__A,__W); 
 }
 
@@ -1611,97 +1667,902 @@
 
 __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
+__m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.128
+  return _mm_mask_cvtepi8_epi16(__W, __U, __A); // 8 x i16 result lanes, hence an 8-bit write mask (matches the maskz variant)
+}
+
+__m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.128
+  return _mm_maskz_cvtepi8_epi16(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.256
+  return _mm256_mask_cvtepi8_epi16(__W, __U, __A); // 16 x i16 result lanes, hence a 16-bit write mask (matches the maskz variant)
+}
+
+__m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.256
+  return _mm256_maskz_cvtepi8_epi16(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m128i test_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m256i test_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m256i test_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.128
+  return _mm_mask_cvtepu8_epi16(__W, __U, __A); // 8 x i16 result lanes, hence an 8-bit write mask (matches the maskz variant)
+}
+
+__m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.128
+  return _mm_maskz_cvtepu8_epi16(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.256
+  return _mm256_mask_cvtepu8_epi16(__W, __U, __A); // 16 x i16 result lanes, hence a 16-bit write mask (matches the maskz variant)
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.256
+  return _mm256_maskz_cvtepu8_epi16(__U, __A); 
+}
+
+__m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_sllv_epi16(__A, __B); 
+}
+
+__m256i test_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_mask_sllv_epi16(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_maskz_sllv_epi16(__U, __A, __B); 
+}
+
+__m128i test_mm_sllv_epi16(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_sllv_epi16(__A, __B); 
+}
+
+__m128i test_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_mask_sllv_epi16(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_maskz_sllv_epi16(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.128
+  return _mm_mask_sll_epi16(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.128
+  return _mm_maskz_sll_epi16(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.256
+  return _mm256_mask_sll_epi16(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.256
+  return _mm256_maskz_sll_epi16(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.128
+  return _mm_mask_slli_epi16(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_slli_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.128
+  return _mm_maskz_slli_epi16(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.256
+  return _mm256_mask_slli_epi16(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.256
+  return _mm256_maskz_slli_epi16(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi16
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_mov_epi16(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi16
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_mov_epi16(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi16
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_mov_epi16(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi16
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_mov_epi16(__U, __A); 
+}
+
+__m128i test_mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi8
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_mov_epi8(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi8
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_mov_epi8(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi8
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_mask_mov_epi8(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi8
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_maskz_mov_epi8(__U, __A); 
+}
+
+__m128i test_mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi16
+  // CHECK: @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm_mask_loadu_epi16(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi16
+  // CHECK: @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm_maskz_loadu_epi16(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi16
+  // CHECK: @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mask_loadu_epi16(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi16
+  // CHECK: @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_maskz_loadu_epi16(__U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi8
+  // CHECK: @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+  return _mm_mask_loadu_epi8(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi8
+  // CHECK: @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+  return _mm_maskz_loadu_epi8(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi8
+  // CHECK: @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_mask_loadu_epi8(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi8
+  // CHECK: @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_maskz_loadu_epi8(__U, __P); 
+}
+
+void test_mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi16
+  // CHECK: @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %{{.*}}, <8 x i16>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  _mm_mask_storeu_epi16(__P, __U, __A); // plain call: a void function must not `return` an expression
+}
+
+void test_mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi16
+  // CHECK: @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %{{.*}}, <16 x i16>* %{{.*}}, i32 1, <16 x i1> %{{.*}})
+  _mm256_mask_storeu_epi16(__P, __U, __A); // plain call: a void function must not `return` an expression
+}
+
+void test_mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi8
+  // CHECK: @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %{{.*}})
+  _mm_mask_storeu_epi8(__P, __U, __A); // plain call: a void function must not `return` an expression
+}
+
+void test_mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi8
+  // CHECK: @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %{{.*}}, <32 x i8>* %{{.*}}, i32 1, <32 x i1> %{{.*}})
+  _mm256_mask_storeu_epi8(__P, __U, __A); // plain call: a void function must not `return` an expression
+}
+__mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.128
+  return _mm_test_epi8_mask(__A, __B); 
+}
+
+__mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.128
+  return _mm_mask_test_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.256
+  return _mm256_test_epi8_mask(__A, __B); 
+}
+
+__mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.256
+  return _mm256_mask_test_epi8_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_test_epi16_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.128
+  return _mm_test_epi16_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.128
+  return _mm_mask_test_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm256_test_epi16_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.256
+  return _mm256_test_epi16_mask(__A, __B); 
+}
+
+__mmask16 test_mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.256
+  return _mm256_mask_test_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm_testn_epi8_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.128
+  return _mm_testn_epi8_mask(__A, __B); 
+}
+
+__mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.128
+  return _mm_mask_testn_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm256_testn_epi8_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.256
+  return _mm256_testn_epi8_mask(__A, __B); 
+}
+
+__mmask32 test_mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.256
+  return _mm256_mask_testn_epi8_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_testn_epi16_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.128
+  return _mm_testn_epi16_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.128
+  return _mm_mask_testn_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm256_testn_epi16_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.256
+  return _mm256_testn_epi16_mask(__A, __B); 
+}
+
+__mmask16 test_mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.256
+  return _mm256_mask_testn_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm_movepi8_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi8_mask
+  // CHECK: @llvm.x86.avx512.cvtb2mask.128
+  return _mm_movepi8_mask(__A); 
+}
+
+__mmask32 test_mm256_movepi8_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi8_mask
+  // CHECK: @llvm.x86.avx512.cvtb2mask.256
+  return _mm256_movepi8_mask(__A); 
+}
+
+__m128i test_mm_movm_epi8(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi8
+  // CHECK: @llvm.x86.avx512.cvtmask2b.128
+  return _mm_movm_epi8(__A); 
+}
+
+__m256i test_mm256_movm_epi8(__mmask32 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi8
+  // CHECK: @llvm.x86.avx512.cvtmask2b.256
+  return _mm256_movm_epi8(__A); 
+}
+
+__m128i test_mm_movm_epi16(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi16
+  // CHECK: @llvm.x86.avx512.cvtmask2w.128
+  return _mm_movm_epi16(__A); 
+}
+
+__m256i test_mm256_movm_epi16(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi16
+  // CHECK: @llvm.x86.avx512.cvtmask2w.256
+  return _mm256_movm_epi16(__A); 
+}
+
+__m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_broadcastb_epi8(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_broadcastb_epi8(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_mask_broadcastb_epi8(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_maskz_broadcastb_epi8(__M, __A);
+}
+
+__m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_broadcastw_epi16(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_broadcastw_epi16(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_broadcastw_epi16(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_broadcastw_epi16(__M, __A);
+}
+
+__m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.256
+  return _mm256_mask_set1_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
+  // CHECK-LABEL: @test_mm256_maskz_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.256
+  return _mm256_maskz_set1_epi16(__M, __A); 
+}
+
+__m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
+  // CHECK-LABEL: @test_mm_mask_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.128
+  return _mm_mask_set1_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
+  // CHECK-LABEL: @test_mm_maskz_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.128
+  return _mm_maskz_set1_epi16(__M, __A); 
+}
+__m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+  return _mm_permutexvar_epi16(__A, __B); 
+}
+
+__m128i test_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+  return _mm_maskz_permutexvar_epi16(__M, __A, __B); 
+}
+
+__m128i test_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+  return _mm_mask_permutexvar_epi16(__W, __M, __A, __B); 
+}
+
+__m256i test_mm256_permutexvar_epi16(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+  return _mm256_permutexvar_epi16(__A, __B); 
+}
+
+__m256i test_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+  return _mm256_maskz_permutexvar_epi16(__M, __A, __B); 
+}
+
+__m256i test_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+  return _mm256_mask_permutexvar_epi16(__W, __M, __A, __B); 
+}
+__m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_alignr_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_alignr_epi8(__W, __U, __A, __B, 2); 
+}
+
+__m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_alignr_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_alignr_epi8(__U, __A, __B, 2); 
+}
+
+__m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_alignr_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_mask_alignr_epi8(__W, __U, __A, __B, 2); 
+}
+
+__m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_alignr_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_maskz_alignr_epi8(__U, __A, __B, 2); 
+}
+
+__m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+  return _mm_dbsad_epu8(__A, __B, 170); 
+}
+
+__m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+}
+
+__m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); 
+}
+
+__m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+  return _mm256_dbsad_epu8(__A, __B, 170); 
+}
+
+__m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+}
+
+__m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); 
+}
+__mmask8 test_mm_movepi16_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi16_mask
+  // CHECK: @llvm.x86.avx512.cvtw2mask.128
+  return _mm_movepi16_mask(__A); 
+}
+
+__mmask16 test_mm256_movepi16_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi16_mask
+  // CHECK: @llvm.x86.avx512.cvtw2mask.256
+  return _mm256_movepi16_mask(__A); 
+}
+
+__m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_shufflehi_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_shufflehi_epi16(__W, __U, __A, 5); // 8 lanes are selected, so the write-mask is 8 bits wide
+}
+
+__m128i test_mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_shufflehi_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_shufflehi_epi16(__U, __A, 5); // 8 lanes are selected, so the zero-mask is 8 bits wide
+}
+
+__m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_shufflelo_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_shufflelo_epi16(__W, __U, __A, 5); // 8 lanes are selected, so the write-mask is 8 bits wide
+}
+
+__m128i test_mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_shufflelo_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_shufflelo_epi16(__U, __A, 5); // 8 lanes are selected, so the zero-mask is 8 bits wide
+}
+
+__m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_shufflehi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_shufflehi_epi16(__W, __U, __A, 5); // 16 lanes are selected, so the write-mask is 16 bits wide
+}
+
+__m256i test_mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_shufflehi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_shufflehi_epi16(__U, __A, 5); // 16 lanes are selected, so the zero-mask is 16 bits wide
+}
+
+__m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_shufflelo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_shufflelo_epi16(__W, __U, __A, 5); // 16 lanes are selected, so the write-mask is 16 bits wide
+}
+
+__m256i test_mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_shufflelo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_shufflelo_epi16(__U, __A, 5); // 16 lanes are selected, so the zero-mask is 16 bits wide
+}
+
+void test_mm_mask_cvtepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  // CHECK-LABEL: @test_mm_mask_cvtepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.128
+  _mm_mask_cvtepi16_storeu_epi8(__P, __M, __A); // masked store through __P (presumably i16 -> i8 narrowing, per the intrinsic name)
+}
+
+void test_mm_mask_cvtsepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  // CHECK-LABEL: @test_mm_mask_cvtsepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.128
+  _mm_mask_cvtsepi16_storeu_epi8(__P, __M, __A); // masked store through __P (presumably signed-saturating i16 -> i8, per the intrinsic name)
+}
+
+void test_mm_mask_cvtusepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A)
+{
+  // CHECK-LABEL: @test_mm_mask_cvtusepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.128
+  _mm_mask_cvtusepi16_storeu_epi8(__P, __M, __A); // masked store through __P (presumably unsigned-saturating i16 -> i8, per the intrinsic name)
+}
+
+void test_mm256_mask_cvtusepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.256
+  _mm256_mask_cvtusepi16_storeu_epi8(__P, __M, __A); // masked store through __P (presumably unsigned-saturating i16 -> i8, per the intrinsic name)
+}
+
+void test_mm256_mask_cvtepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm256_mask_cvtepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.256
+  _mm256_mask_cvtepi16_storeu_epi8(__P, __M, __A); // masked store through __P (presumably i16 -> i8 narrowing, per the intrinsic name)
+}
+
+void test_mm256_mask_cvtsepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi16_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256
+  _mm256_mask_cvtsepi16_storeu_epi8(__P, __M, __A); // masked store through __P (presumably signed-saturating i16 -> i8, per the intrinsic name)
+}
diff --git a/test/CodeGen/avx512vlcd-builtins.c b/test/CodeGen/avx512vlcd-builtins.c
new file mode 100644
index 0000000..9945d7b
--- /dev/null
+++ b/test/CodeGen/avx512vlcd-builtins.c
@@ -0,0 +1,182 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vl -target-feature +avx512cd -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m128i test_mm_broadcastmb_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_broadcastmb_epi64
+  // CHECK: @llvm.x86.avx512.broadcastmb.128
+  return _mm_broadcastmb_epi64(__A); 
+}
+
+__m256i test_mm256_broadcastmb_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm256_broadcastmb_epi64
+  // CHECK: @llvm.x86.avx512.broadcastmb.256
+  return _mm256_broadcastmb_epi64(__A); 
+}
+
+__m128i test_mm_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm_broadcastmw_epi32
+  // CHECK: @llvm.x86.avx512.broadcastmw.128
+  return _mm_broadcastmw_epi32(__A); 
+}
+
+__m256i test_mm256_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm256_broadcastmw_epi32
+  // CHECK: @llvm.x86.avx512.broadcastmw.256
+  return _mm256_broadcastmw_epi32(__A); 
+}
+
+__m128i test_mm_conflict_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.128
+  return _mm_conflict_epi64(__A); 
+}
+
+__m128i test_mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.128
+  return _mm_mask_conflict_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.128
+  return _mm_maskz_conflict_epi64(__U, __A); 
+}
+
+__m256i test_mm256_conflict_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.256
+  return _mm256_conflict_epi64(__A); 
+}
+
+__m256i test_mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.256
+  return _mm256_mask_conflict_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.256
+  return _mm256_maskz_conflict_epi64(__U, __A); 
+}
+
+__m128i test_mm_conflict_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.128
+  return _mm_conflict_epi32(__A); 
+}
+
+__m128i test_mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.128
+  return _mm_mask_conflict_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.128
+  return _mm_maskz_conflict_epi32(__U, __A); 
+}
+
+__m256i test_mm256_conflict_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.256
+  return _mm256_conflict_epi32(__A); 
+}
+
+__m256i test_mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.256
+  return _mm256_mask_conflict_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.256
+  return _mm256_maskz_conflict_epi32(__U, __A); 
+}
+
+__m128i test_mm_lzcnt_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_lzcnt_epi32
+  // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+  return _mm_lzcnt_epi32(__A); 
+}
+
+__m128i test_mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_lzcnt_epi32
+  // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_lzcnt_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_lzcnt_epi32
+  // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_lzcnt_epi32(__U, __A); 
+}
+
+__m256i test_mm256_lzcnt_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_lzcnt_epi32
+  // CHECK: call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %{{.*}}, i1 false)
+  return _mm256_lzcnt_epi32(__A); 
+}
+
+__m256i test_mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_lzcnt_epi32
+  // CHECK: call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_lzcnt_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_lzcnt_epi32
+  // CHECK: call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_lzcnt_epi32(__U, __A); 
+}
+
+__m128i test_mm_lzcnt_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_lzcnt_epi64
+  // CHECK: call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %{{.*}}, i1 false)
+  return _mm_lzcnt_epi64(__A); 
+}
+
+__m128i test_mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_lzcnt_epi64
+  // CHECK: call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %{{.*}}, i1 false)
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_lzcnt_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_lzcnt_epi64
+  // CHECK: call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %{{.*}}, i1 false)
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_lzcnt_epi64(__U, __A); 
+}
+
+__m256i test_mm256_lzcnt_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_lzcnt_epi64
+  // CHECK: call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %{{.*}}, i1 false)
+  return _mm256_lzcnt_epi64(__A); 
+}
+
+__m256i test_mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_lzcnt_epi64
+  // CHECK: call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_lzcnt_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_lzcnt_epi64
+  // CHECK: call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_lzcnt_epi64(__U, __A); 
+}
diff --git a/test/CodeGen/avx512vldq-builtins.c b/test/CodeGen/avx512vldq-builtins.c
index 69bdc7a..68d793c 100644
--- a/test/CodeGen/avx512vldq-builtins.c
+++ b/test/CodeGen/avx512vldq-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -808,3 +808,262 @@
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.256
   return _mm256_maskz_reduce_ps(__U, __A, 4); 
 }
+
+__mmask8 test_mm_movepi32_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi32_mask
+  // CHECK: @llvm.x86.avx512.cvtd2mask.128
+  return _mm_movepi32_mask(__A); 
+}
+
+__mmask8 test_mm256_movepi32_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi32_mask
+  // CHECK: @llvm.x86.avx512.cvtd2mask.256
+  return _mm256_movepi32_mask(__A); 
+}
+
+__m128i test_mm_movm_epi32(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi32
+  // CHECK: @llvm.x86.avx512.cvtmask2d.128
+  return _mm_movm_epi32(__A); 
+}
+
+__m256i test_mm256_movm_epi32(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi32
+  // CHECK: @llvm.x86.avx512.cvtmask2d.256
+  return _mm256_movm_epi32(__A); 
+}
+
+__m128i test_mm_movm_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi64
+  // CHECK: @llvm.x86.avx512.cvtmask2q.128
+  return _mm_movm_epi64(__A); 
+}
+
+__m256i test_mm256_movm_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi64
+  // CHECK: @llvm.x86.avx512.cvtmask2q.256
+  return _mm256_movm_epi64(__A); 
+}
+
+__mmask8 test_mm_movepi64_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi64_mask
+  // CHECK: @llvm.x86.avx512.cvtq2mask.128
+  return _mm_movepi64_mask(__A); 
+}
+
+__mmask8 test_mm256_movepi64_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi64_mask
+  // CHECK: @llvm.x86.avx512.cvtq2mask.256
+  return _mm256_movepi64_mask(__A); 
+}
+
+
+__m256 test_mm256_broadcast_f32x2(__m128 __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm256_broadcast_f32x2(__A); 
+}
+
+__m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm256_mask_broadcast_f32x2(__O, __M, __A); 
+}
+
+__m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm256_maskz_broadcast_f32x2(__M, __A); 
+}
+
+__m256d test_mm256_broadcast_f64x2(__m128d __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm256_broadcast_f64x2(__A); 
+}
+
+__m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm256_mask_broadcast_f64x2(__O, __M, __A); 
+}
+
+__m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm256_maskz_broadcast_f64x2(__M, __A); 
+}
+
+__m128i test_mm_broadcast_i32x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm_broadcast_i32x2(__A); 
+}
+
+__m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm_mask_broadcast_i32x2(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm_maskz_broadcast_i32x2(__M, __A); 
+}
+
+__m256i test_mm256_broadcast_i32x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm256_broadcast_i32x2(__A); 
+}
+
+__m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm256_mask_broadcast_i32x2(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm256_maskz_broadcast_i32x2(__M, __A); 
+}
+
+__m256i test_mm256_broadcast_i64x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm256_broadcast_i64x2(__A); 
+}
+
+__m256i test_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm256_mask_broadcast_i64x2(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm256_maskz_broadcast_i64x2(__M, __A); 
+}
+
+__m128d test_mm256_extractf64x2_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm256_extractf64x2_pd(__A, 1); 
+}
+
+__m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); 
+}
+
+__m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm256_maskz_extractf64x2_pd(__U, __A, 1); 
+}
+
+__m128i test_mm256_extracti64x2_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm256_extracti64x2_epi64(__A, 1); 
+}
+
+__m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); 
+}
+
+__m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); 
+}
+
+__m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm256_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm256_insertf64x2(__A, __B, 1); 
+}
+
+__m256d test_mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm256_mask_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm256_mask_insertf64x2(__W, __U, __A, __B, 1); 
+}
+
+__m256d test_mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm256_maskz_insertf64x2(__U, __A, __B, 1); 
+}
+
+__m256i test_mm256_inserti64x2(__m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm256_inserti64x2(__A, __B, 1); 
+}
+
+__m256i test_mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm256_mask_inserti64x2(__W, __U, __A, __B, 1); 
+}
+
+__m256i test_mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm256_maskz_inserti64x2(__U, __A, __B, 1); 
+}
+
+__mmask8 test_mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.128
+  return _mm_mask_fpclass_pd_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm_fpclass_pd_mask(__m128d __A) {
+  // CHECK-LABEL: @test_mm_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.128
+  return _mm_fpclass_pd_mask(__A, 2); 
+}
+
+__mmask8 test_mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.256
+  return _mm256_mask_fpclass_pd_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm256_fpclass_pd_mask(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.256
+  return _mm256_fpclass_pd_mask(__A, 2); 
+}
+
+__mmask8 test_mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.128
+  return _mm_mask_fpclass_ps_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm_fpclass_ps_mask(__m128 __A) {
+  // CHECK-LABEL: @test_mm_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.128
+  return _mm_fpclass_ps_mask(__A, 2); 
+}
+
+__mmask8 test_mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.256
+  return _mm256_mask_fpclass_ps_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm256_fpclass_ps_mask(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.256
+  return _mm256_fpclass_ps_mask(__A, 2); 
+}
diff --git a/test/CodeGen/backend-unsupported-error.ll b/test/CodeGen/backend-unsupported-error.ll
new file mode 100644
index 0000000..1a15bfc
--- /dev/null
+++ b/test/CodeGen/backend-unsupported-error.ll
@@ -0,0 +1,44 @@
+; RUN: not %clang_cc1 -triple r600-unknown-unknown -S -o - %s 2>&1 | FileCheck %s
+; REQUIRES: amdgpu-registered-target
+
+; This is to check that backend errors for unsupported features are formatted correctly
+
+; CHECK: error: test.c:2:20: in function bar i32 (): unsupported call to function foo.2
+
+target triple = "r600-unknown-unknown"
+
+; Function Attrs: nounwind uwtable
+define i32 @bar() #0 !dbg !4 {
+entry:
+  %call = call i32 @foo(), !dbg !12
+  ret i32 %call, !dbg !13
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @foo() #0 !dbg !8 {
+entry:
+  %call = call i32 @bar(), !dbg !14
+  ret i32 %call, !dbg !15
+}
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0"}
+!12 = !DILocation(line: 2, column: 20, scope: !4)
+!13 = !DILocation(line: 2, column: 13, scope: !4)
+!14 = !DILocation(line: 3, column: 20, scope: !8)
+!15 = !DILocation(line: 3, column: 13, scope: !8)
diff --git a/test/CodeGen/bitscan-builtins.c b/test/CodeGen/bitscan-builtins.c
new file mode 100644
index 0000000..ae817e8
--- /dev/null
+++ b/test/CodeGen/bitscan-builtins.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+#include <immintrin.h>
+
+int test_bit_scan_forward(int a) {
+  return _bit_scan_forward(a);
+// CHECK: @test_bit_scan_forward
+// CHECK: %[[call:.*]] = call i32 @llvm.cttz.i32(
+// CHECK: ret i32 %[[call]]
+}
+
+int test_bit_scan_reverse(int a) {
+  return _bit_scan_reverse(a);
+// CHECK:  %[[call:.*]] = call i32 @llvm.ctlz.i32(
+// CHECK:  %[[sub:.*]] = sub nsw i32 31, %[[call]]
+// CHECK: ret i32 %[[sub]]
+}
diff --git a/test/CodeGen/blocks-opencl.cl b/test/CodeGen/blocks-opencl.cl
index d356298..61c479b 100644
--- a/test/CodeGen/blocks-opencl.cl
+++ b/test/CodeGen/blocks-opencl.cl
@@ -2,15 +2,16 @@
 // This used to crash due to trying to generate a bitcase from a cstring
 // in the constant address space to i8* in AS0.
 
-void dummy(float (^op)(float))
-{
+void dummy(float (^const op)(float)) {
 }
 
 // CHECK: i8 addrspace(3)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(3)* @.str, i32 0, i32 0)
 
 kernel void test_block()
 {
-  float (^X)(float) = ^(float x) { return x + 42.0f; };
+  float (^const X)(float) = ^(float x) {
+    return x + 42.0f;
+  };
   dummy(X);
 }
 
diff --git a/test/CodeGen/blocks-windows.c b/test/CodeGen/blocks-windows.c
new file mode 100644
index 0000000..ced00ef
--- /dev/null
+++ b/test/CodeGen/blocks-windows.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+void *_Block_copy(void *);
+
+#if defined(BLOCKS_IN_BLOCKS_DECL)
+extern __declspec(dllexport) long _NSConcreteStackBlock[];
+#endif
+
+#if defined(BLOCKS_IN_BLOCKS_DEFN)
+__declspec(dllexport) long _NSConcreteStackBlock[5];
+#endif
+
+#if defined(BLOCKS_NOT_IN_BLOCKS_EXTERN)
+extern long _NSConcreteStackBlock[];
+#endif
+
+#if defined(BLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT)
+extern __declspec(dllimport) long _NSConcreteStackBlock[];
+#endif
+
+#if defined(BLOCKS_NOT_IN_BLOCKS_DLLIMPORT)
+__declspec(dllimport) long _NSConcreteStackBlock[];
+#endif
+
+int (*g(void))(void) {
+  __block int i;
+  return _Block_copy(^{ ++i; return i; });
+}
+
+// CHECK-BLOCKS-IN-BLOCKS-DECL: @_NSConcreteStackBlock = external dllexport global i8*
+// CHECK-BLOCKS-IN-BLOCKS-DEFN: @_NSConcreteStackBlock = common dllexport global [5 x i32]
+// CHECK-BLOCKS-NOT-IN-BLOCKS: @_NSConcreteStackBlock = external dllimport global i8*
+// CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN: @_NSConcreteStackBlock = external dllimport global i8*
+// CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT: @_NSConcreteStackBlock = external dllimport global i8*
+// CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT: @_NSConcreteStackBlock = external dllimport global i8*
+
diff --git a/test/CodeGen/bmi-builtins.c b/test/CodeGen/bmi-builtins.c
index 92332e3..1202d99 100644
--- a/test/CodeGen/bmi-builtins.c
+++ b/test/CodeGen/bmi-builtins.c
@@ -1,164 +1,223 @@
-// RUN: %clang_cc1 %s -O3 -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
-// The double underscore intrinsics are for compatibility with 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
+
+// The double underscore intrinsics are for compatibility with
 // AMD's BMI interface. The single underscore intrinsics
 // are for compatibility with Intel's BMI interface.
 // Apart from the underscores, the interfaces are identical
-// except in one case: although the 'bextr' register-form 
-// instruction is identical in hardware, the AMD and Intel 
-// intrinsics are different! 
+// except in one case: although the 'bextr' register-form
+// instruction is identical in hardware, the AMD and Intel
+// intrinsics are different!
 
 unsigned short test__tzcnt_u16(unsigned short __X) {
-  // CHECK: @llvm.cttz.i16
+  // CHECK-LABEL: test__tzcnt_u16
+  // CHECK: zext i16 %{{.*}} to i32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i16 @llvm.cttz.i16(i16 %{{.*}}, i1 true)
   return __tzcnt_u16(__X);
 }
 
 unsigned int test__andn_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: [[DEST:%.*]] = xor i32 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i32 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test__andn_u32
+  // CHECK: xor i32 %{{.*}}, -1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return __andn_u32(__X, __Y);
 }
 
 unsigned int test__bextr_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: @llvm.x86.bmi.bextr.32
+  // CHECK-LABEL: test__bextr_u32
+  // CHECK: i32 @llvm.x86.bmi.bextr.32(i32 %{{.*}}, i32 %{{.*}})
   return __bextr_u32(__X, __Y);
 }
 
 unsigned int test__blsi_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = sub i32 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i32 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test__blsi_u32
+  // CHECK: sub i32 0, %{{.*}}
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return __blsi_u32(__X);
 }
 
 unsigned int test__blsmsk_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsmsk_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: xor i32 %{{.*}}, %{{.*}}
   return __blsmsk_u32(__X);
 }
 
 unsigned int test__blsr_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsr_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return __blsr_u32(__X);
 }
 
 unsigned int test__tzcnt_u32(unsigned int __X) {
-  // CHECK: @llvm.cttz.i32
+  // CHECK-LABEL: test__tzcnt_u32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i32 @llvm.cttz.i32(i32 %{{.*}}, i1 true)
   return __tzcnt_u32(__X);
 }
 
+int test_mm_tzcnt_32(unsigned int __X) {
+  // CHECK-LABEL: test_mm_tzcnt_32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i32 @llvm.cttz.i32(i32 %{{.*}}, i1 true)
+  return _mm_tzcnt_32(__X);
+}
+
 unsigned long long test__andn_u64(unsigned long __X, unsigned long __Y) {
-  // CHECK: [[DEST:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i64 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test__andn_u64
+  // CHECK: xor i64 %{{.*}}, -1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return __andn_u64(__X, __Y);
 }
 
 unsigned long long test__bextr_u64(unsigned long __X, unsigned long __Y) {
-  // CHECK: @llvm.x86.bmi.bextr.64
+  // CHECK-LABEL: test__bextr_u64
+  // CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
   return __bextr_u64(__X, __Y);
 }
 
 unsigned long long test__blsi_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = sub i64 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i64 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test__blsi_u64
+  // CHECK: sub i64 0, %{{.*}}
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return __blsi_u64(__X);
 }
 
 unsigned long long test__blsmsk_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsmsk_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: xor i64 %{{.*}}, %{{.*}}
   return __blsmsk_u64(__X);
 }
 
 unsigned long long test__blsr_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsr_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return __blsr_u64(__X);
 }
 
 unsigned long long test__tzcnt_u64(unsigned long long __X) {
-  // CHECK: @llvm.cttz.i64
+  // CHECK-LABEL: test__tzcnt_u64
+  // CHECK: icmp ne i64 %{{.*}}, 0
+  // CHECK: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 true)
   return __tzcnt_u64(__X);
 }
 
+long long test_mm_tzcnt_64(unsigned long long __X) {
+  // CHECK-LABEL: test_mm_tzcnt_64
+  // CHECK: icmp ne i64 %{{.*}}, 0
+  // CHECK: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 true)
+  return _mm_tzcnt_64(__X);
+}
+
 // Intel intrinsics
 
 unsigned short test_tzcnt_u16(unsigned short __X) {
-  // CHECK: @llvm.cttz.i16
+  // CHECK-LABEL: test_tzcnt_u16
+  // CHECK: zext i16 %{{.*}} to i32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i16 @llvm.cttz.i16(i16 %{{.*}}, i1 true)
   return _tzcnt_u16(__X);
 }
 
 unsigned int test_andn_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: [[DEST:%.*]] = xor i32 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i32 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test_andn_u32
+  // CHECK: xor i32 %{{.*}}, -1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return _andn_u32(__X, __Y);
 }
 
-unsigned int test_bextr_u32(unsigned int __X, unsigned int __Y, 
+unsigned int test_bextr_u32(unsigned int __X, unsigned int __Y,
                             unsigned int __Z) {
-  // CHECK: @llvm.x86.bmi.bextr.32
+  // CHECK-LABEL: test_bextr_u32
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: i32 @llvm.x86.bmi.bextr.32(i32 %{{.*}}, i32 %{{.*}})
   return _bextr_u32(__X, __Y, __Z);
 }
 
 unsigned int test_blsi_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = sub i32 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i32 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test_blsi_u32
+  // CHECK: sub i32 0, %{{.*}}
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return _blsi_u32(__X);
 }
 
 unsigned int test_blsmsk_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsmsk_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: xor i32 %{{.*}}, %{{.*}}
   return _blsmsk_u32(__X);
 }
 
 unsigned int test_blsr_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsr_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return _blsr_u32(__X);
 }
 
 unsigned int test_tzcnt_u32(unsigned int __X) {
-  // CHECK: @llvm.cttz.i32
+  // CHECK-LABEL: test_tzcnt_u32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i32 @llvm.cttz.i32(i32 %{{.*}}, i1 true)
   return _tzcnt_u32(__X);
 }
 
 unsigned long long test_andn_u64(unsigned long __X, unsigned long __Y) {
-  // CHECK: [[DEST:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i64 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test_andn_u64
+  // CHECK: xor i64 %{{.*}}, -1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return _andn_u64(__X, __Y);
 }
 
-unsigned long long test_bextr_u64(unsigned long __X, unsigned int __Y, 
+unsigned long long test_bextr_u64(unsigned long __X, unsigned int __Y,
                                   unsigned int __Z) {
-  // CHECK: @llvm.x86.bmi.bextr.64
+  // CHECK-LABEL: test_bextr_u64
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: zext i32 %{{.*}} to i64
+  // CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
   return _bextr_u64(__X, __Y, __Z);
 }
 
 unsigned long long test_blsi_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = sub i64 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i64 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test_blsi_u64
+  // CHECK: sub i64 0, %{{.*}}
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return _blsi_u64(__X);
 }
 
 unsigned long long test_blsmsk_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsmsk_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: xor i64 %{{.*}}, %{{.*}}
   return _blsmsk_u64(__X);
 }
 
 unsigned long long test_blsr_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsr_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return _blsr_u64(__X);
 }
 
 unsigned long long test_tzcnt_u64(unsigned long long __X) {
-  // CHECK: @llvm.cttz.i64
+  // CHECK-LABEL: test_tzcnt_u64
+  // CHECK: icmp ne i64 %{{.*}}, 0
+  // CHECK: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 true)
   return _tzcnt_u64(__X);
 }
diff --git a/test/CodeGen/builtin-clflushopt.c b/test/CodeGen/builtin-clflushopt.c
new file mode 100644
index 0000000..e98c2aa
--- /dev/null
+++ b/test/CodeGen/builtin-clflushopt.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +clflushopt  -emit-llvm -o - -Wall -Werror | FileCheck %s
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+void test_mm_clflushopt(char * __m) {
+  //CHECK-LABEL: @test_mm_clflushopt
+  //CHECK: @llvm.x86.clflushopt
+  _mm_clflushopt(__m);
+}
diff --git a/test/CodeGen/builtin-expect.c b/test/CodeGen/builtin-expect.c
index 884110c..560625e 100644
--- a/test/CodeGen/builtin-expect.c
+++ b/test/CodeGen/builtin-expect.c
@@ -1,45 +1,69 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O0 | FileCheck %s --check-prefix=CHECK_O0
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O1 -disable-llvm-optzns | FileCheck %s --check-prefix=ALL --check-prefix=O1
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O0 | FileCheck %s --check-prefix=ALL --check-prefix=O0
 
-int x;
-int y(void);
-void foo();
-void FUNC() {
-// CHECK-LABEL: define void @FUNC()
-// CHECK: [[call:%.*]] = call i32 @y
-// CHECK_O0: [[call:%.*]] = call i32 @y
-// CHECK_O0-NOT: call i64 @llvm.expect
-  if (__builtin_expect (x, y()))
-    foo ();
-}
+// In all tests, make sure that no expect is generated if optimizations are off.
+// If optimizations are on, generate the correct expect and preserve other necessary operations.
 
-// rdar://9330105
-void isigprocmask(void);
-long bar();
+int expect_taken(int x) {
+// ALL-LABEL: define i32 @expect_taken
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 1)
+// O0-NOT:    @llvm.expect
 
-int main() {
-    (void) __builtin_expect((isigprocmask(), 0), bar());
-}
-
-// CHECK-LABEL: define i32 @main()
-// CHECK: call void @isigprocmask()
-// CHECK: [[C:%.*]] = call i64 (...) @bar()
-// CHECK_O0: call void @isigprocmask()
-// CHECK_O0: [[C:%.*]] = call i64 (...) @bar()
-// CHECK_O0-NOT: call i64 @llvm.expect
-
-
-// CHECK-LABEL: define i32 @test1
-int test1(int x) {
-// CHECK_O0-NOT: call i64 @llvm.expect
   if (__builtin_expect (x, 1))
     return 0;
   return x;
 }
 
-// CHECK: define i32 @test2
-int test2(int x) {
-// CHECK_O0-NOT: call i64 @llvm.expect
+
+int expect_not_taken(int x) {
+// ALL-LABEL: define i32 @expect_not_taken
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 0)
+// O0-NOT:    @llvm.expect
+
+  if (__builtin_expect (x, 0))
+    return 0;
+  return x;
+}
+
+
+int x;
+int y(void);
+void foo();
+
+void expect_value_side_effects() {
+// ALL-LABEL: define void @expect_value_side_effects()
+// ALL:       [[CALL:%.*]] = call i32 @y
+// O1:        [[SEXT:%.*]] = sext i32 [[CALL]] to i64
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 [[SEXT]])
+// O0-NOT:    @llvm.expect
+
+  if (__builtin_expect (x, y()))
+    foo ();
+}
+
+
+// Make sure that isigprocmask() is called before bar()?
+// There's no compare, so there's nothing to expect?
+// rdar://9330105
+void isigprocmask(void);
+long bar();
+
+int main() {
+// ALL-LABEL: define i32 @main()
+// ALL:       call void @isigprocmask()
+// ALL:       [[CALL:%.*]] = call i64 (...) @bar()
+// O1:        call i64 @llvm.expect.i64(i64 0, i64 [[CALL]])
+// O0-NOT:    @llvm.expect
+
+  (void) __builtin_expect((isigprocmask(), 0), bar());
+}
+
+
+int switch_cond(int x) {
+// ALL-LABEL: define i32 @switch_cond
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 5)
+// O0-NOT:    @llvm.expect
+
   switch(__builtin_expect(x, 5)) {
   default:
     return 0;
@@ -53,3 +77,4 @@
 
   return 0;
 }
+
diff --git a/test/CodeGen/builtins-arm-exclusive.c b/test/CodeGen/builtins-arm-exclusive.c
index 2b10238..b0bc2b8 100644
--- a/test/CodeGen/builtins-arm-exclusive.c
+++ b/test/CodeGen/builtins-arm-exclusive.c
@@ -1,32 +1,6 @@
-// REQUIRES: arm-registered-target
-// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -O3 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -O3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM64
+// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix=CHECK-ARM64
 
-// Make sure the canonical use works before going into smaller details:
-int atomic_inc(int *addr) {
-  int Failure, OldVal;
-  do {
-    OldVal = __builtin_arm_ldrex(addr);
-    Failure = __builtin_arm_strex(OldVal + 1, addr);
-  } while (Failure);
-
-  return OldVal;
-}
-
-// CHECK-LABEL: @atomic_inc
-// CHECK:   [[OLDVAL:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
-// CHECK:   [[INC:%.*]] = add nsw i32 [[OLDVAL]], 1
-// CHECK:   [[FAILURE:%.*]] = tail call i32 @llvm.arm.strex.p0i32(i32 [[INC]], i32* %addr)
-// CHECK:   [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
-// CHECK:   br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
-
-// CHECK-ARM64-LABEL: @atomic_inc
-// CHECK-ARM64:   [[OLDVAL:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr)
-// CHECK-ARM64:   [[INC:%.*]] = add i64 [[OLDVAL]], 1
-// CHECK-ARM64:   [[TRUNC:%.*]] = and i64 [[INC]], 4294967295
-// CHECK-ARM64:   [[FAILURE:%.*]] = tail call i32 @llvm.aarch64.stxr.p0i32(i64 [[TRUNC]], i32* %addr)
-// CHECK-ARM64:   [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
-// CHECK-ARM64:   br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
 
 struct Simple {
   char a, b;
@@ -37,36 +11,33 @@
 // CHECK-ARM64-LABEL: @test_ldrex
   int sum = 0;
   sum += __builtin_arm_ldrex(addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
-// CHECK: and i32 [[INTRES]], 255
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
+// CHECK: trunc i32 [[INTRES]] to i8
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
-// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
+// CHECK-ARM64: trunc i64 [[INTRES]] to i8
 
   sum += __builtin_arm_ldrex((short *)addr);
 // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
-// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
-// CHECK: ashr exact i32 [[TMPSEXT]], 16
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
+// CHECK: trunc i32 [[INTRES]] to i16
 
 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]])
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
-// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]])
+// CHECK-ARM64: trunc i64 [[INTRES]] to i16
 
   sum += __builtin_arm_ldrex((int *)addr);
 // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
-// CHECK:  call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
 
 // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]])
 // CHECK-ARM64: trunc i64 [[INTRES]] to i32
 
   sum += __builtin_arm_ldrex((long long *)addr);
-// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
@@ -79,16 +50,18 @@
 
   sum += __builtin_arm_ldrex(addrfloat);
 // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
 // CHECK: bitcast i32 [[INTRES]] to float
 
 // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]])
 // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
 
   sum += __builtin_arm_ldrex((double *)addr);
-// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]])
 // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0
 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64
@@ -97,21 +70,31 @@
 // CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
 // CHECK: bitcast i64 [[INTRES]] to double
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: bitcast i64 [[INTRES]] to double
 
   sum += *__builtin_arm_ldrex((int **)addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to i32*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
 
   sum += __builtin_arm_ldrex((struct Simple **)addr)->a;
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
   return sum;
 }
@@ -121,36 +104,33 @@
 // CHECK-ARM64-LABEL: @test_ldaex
   int sum = 0;
   sum += __builtin_arm_ldaex(addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
-// CHECK: and i32 [[INTRES]], 255
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
+// CHECK: trunc i32 [[INTRES]] to i8
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
-// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
+// CHECK-ARM64: trunc i64 [[INTRES]] to i8
 
   sum += __builtin_arm_ldaex((short *)addr);
 // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]])
-// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
-// CHECK: ashr exact i32 [[TMPSEXT]], 16
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]])
+// CHECK: trunc i32 [[INTRES]] to i16
 
 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]])
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
-// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]])
+// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i16
 
   sum += __builtin_arm_ldaex((int *)addr);
 // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
 // CHECK:  call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
 
 // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]])
 // CHECK-ARM64: trunc i64 [[INTRES]] to i32
 
   sum += __builtin_arm_ldaex((long long *)addr);
-// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
@@ -163,16 +143,18 @@
 
   sum += __builtin_arm_ldaex(addrfloat);
 // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]])
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]])
 // CHECK: bitcast i32 [[INTRES]] to float
 
 // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]])
 // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
 
   sum += __builtin_arm_ldaex((double *)addr);
-// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]])
 // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0
 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64
@@ -181,21 +163,31 @@
 // CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
 // CHECK: bitcast i64 [[INTRES]] to double
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: bitcast i64 [[INTRES]] to double
 
   sum += *__builtin_arm_ldaex((int **)addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to i32*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
 
   sum += __builtin_arm_ldaex((struct Simple **)addr)->a;
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
   return sum;
 }
@@ -225,27 +217,51 @@
 // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 42, i32* [[ADDR32]])
 
   res |= __builtin_arm_strex(42, (long long *)addr);
-// CHECK: call i32 @llvm.arm.strexd(i32 42, i32 0, i8* %addr)
+// CHECK: store i64 42, i64* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 42, i64* [[ADDR64]])
 
   res |= __builtin_arm_strex(2.71828f, (float *)addr);
-// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[TMP5]])
 
-// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[ADDR32]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[TMP5]])
 
   res |= __builtin_arm_strex(3.14159, (double *)addr);
-// CHECK: call i32 @llvm.arm.strexd(i32 -266631570, i32 1074340345, i8* %addr)
+// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
 
-// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[TMP5]])
 
   res |= __builtin_arm_strex(&var, (struct Simple **)addr);
-// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
-// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32
+// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[TMP5]])
 
-// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
-// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64
+// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]])
 
   return res;
 }
@@ -275,27 +291,51 @@
 // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 42, i32* [[ADDR32]])
 
   res |= __builtin_arm_stlex(42, (long long *)addr);
-// CHECK: call i32 @llvm.arm.stlexd(i32 42, i32 0, i8* %addr)
+// CHECK: store i64 42, i64* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 42, i64* [[ADDR64]])
 
   res |= __builtin_arm_stlex(2.71828f, (float *)addr);
-// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[TMP5]])
 
-// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[ADDR32]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[TMP5]])
 
   res |= __builtin_arm_stlex(3.14159, (double *)addr);
-// CHECK: call i32 @llvm.arm.stlexd(i32 -266631570, i32 1074340345, i8* %addr)
+// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
 
-// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[TMP5]])
 
   res |= __builtin_arm_stlex(&var, (struct Simple **)addr);
-// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
-// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32
+// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[TMP5]])
 
-// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
-// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64
+// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]])
 
   return res;
 }
@@ -317,7 +357,7 @@
 
   return __builtin_arm_ldrex(addr);
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]])
+// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]])
 // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
@@ -331,11 +371,13 @@
 // CHECK-ARM64-LABEL: @test_strex_128
 
   return __builtin_arm_strex(val, addr);
-// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
-// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
-// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
+// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16
+// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }*
+// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]]
+// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0
+// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
+// CHECK-ARM64: call i32 @llvm.aarch64.stxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]])
 }
 
 __int128 test_ldaex_128(__int128 *addr) {
@@ -343,7 +385,7 @@
 
   return __builtin_arm_ldaex(addr);
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]])
+// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]])
 // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
@@ -357,11 +399,13 @@
 // CHECK-ARM64-LABEL: @test_stlex_128
 
   return __builtin_arm_stlex(val, addr);
-// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
-// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
-// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
+// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16
+// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }*
+// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]]
+// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0
+// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stlxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
+// CHECK-ARM64: [[RES:%.*]] = call i32 @llvm.aarch64.stlxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]])
 }
 
 #endif
diff --git a/test/CodeGen/builtins-arm.c b/test/CodeGen/builtins-arm.c
index 4cec84c..a385bd2 100644
--- a/test/CodeGen/builtins-arm.c
+++ b/test/CodeGen/builtins-arm.c
@@ -1,5 +1,6 @@
-// REQUIRES: arm-registered-target
-// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <stdint.h>
 
 void *f0()
 {
@@ -85,16 +86,86 @@
 // CHECK: call {{.*}} @llvm.prefetch(i8* %{{.*}}, i32 1, i32 3, i32 0)
 }
 
+void ldc(const void *i) {
+  // CHECK: define void @ldc(i8* %i)
+  // CHECK: call void @llvm.arm.ldc(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldc(1, 2, i);
+}
+
+void ldcl(const void *i) {
+  // CHECK: define void @ldcl(i8* %i)
+  // CHECK: call void @llvm.arm.ldcl(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldcl(1, 2, i);
+}
+
+void ldc2(const void *i) {
+  // CHECK: define void @ldc2(i8* %i)
+  // CHECK: call void @llvm.arm.ldc2(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldc2(1, 2, i);
+}
+
+void ldc2l(const void *i) {
+  // CHECK: define void @ldc2l(i8* %i)
+  // CHECK: call void @llvm.arm.ldc2l(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldc2l(1, 2, i);
+}
+
+void stc(void *i) {
+  // CHECK: define void @stc(i8* %i)
+  // CHECK: call void @llvm.arm.stc(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stc(1, 2, i);
+}
+
+void stcl(void *i) {
+  // CHECK: define void @stcl(i8* %i)
+  // CHECK: call void @llvm.arm.stcl(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stcl(1, 2, i);
+}
+
+void stc2(void *i) {
+  // CHECK: define void @stc2(i8* %i)
+  // CHECK: call void @llvm.arm.stc2(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stc2(1, 2, i);
+}
+
+void stc2l(void *i) {
+  // CHECK: define void @stc2l(i8* %i)
+  // CHECK: call void @llvm.arm.stc2l(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stc2l(1, 2, i);
+}
+
+void cdp() {
+  // CHECK: define void @cdp()
+  // CHECK: call void @llvm.arm.cdp(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  // CHECK-NEXT: ret void
+  __builtin_arm_cdp(1, 2, 3, 4, 5, 6);
+}
+
+void cdp2() {
+  // CHECK: define void @cdp2()
+  // CHECK: call void @llvm.arm.cdp2(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  // CHECK-NEXT: ret void
+  __builtin_arm_cdp2(1, 2, 3, 4, 5, 6);
+}
+
 unsigned mrc() {
   // CHECK: define i32 @mrc()
-  // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3)
+  // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3)
   // CHECK-NEXT: ret i32 [[R]]
   return __builtin_arm_mrc(15, 0, 13, 0, 3);
 }
 
 unsigned mrc2() {
   // CHECK: define i32 @mrc2()
-  // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3)
+  // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3)
   // CHECK-NEXT: ret i32 [[R]]
   return __builtin_arm_mrc2(15, 0, 13, 0, 3);
 }
@@ -111,53 +182,65 @@
   __builtin_arm_mcr2(15, 0, a, 13, 0, 3);
 }
 
-void mcrr(unsigned a, unsigned b) {
-  // CHECK: define void @mcrr(i32 [[A:%.*]], i32 [[B:%.*]])
-  // CHECK: call void @llvm.arm.mcrr(i32 15, i32 0, i32 [[A]], i32 [[B]], i32 0)
-  __builtin_arm_mcrr(15, 0, a, b, 0);
+void mcrr(uint64_t a) {
+  // CHECK: define void @mcrr(i64 %{{.*}})
+  // CHECK: call void @llvm.arm.mcrr(i32 15, i32 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0)
+  __builtin_arm_mcrr(15, 0, a, 0);
 }
 
-void mcrr2(unsigned a, unsigned b) {
-  // CHECK: define void @mcrr2(i32 [[A:%.*]], i32 [[B:%.*]])
-  // CHECK: call void @llvm.arm.mcrr2(i32 15, i32 0, i32 [[A]], i32 [[B]], i32 0)
-  __builtin_arm_mcrr2(15, 0, a, b, 0);
+void mcrr2(uint64_t a) {
+  // CHECK: define void @mcrr2(i64 %{{.*}})
+  // CHECK: call void @llvm.arm.mcrr2(i32 15, i32 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0)
+  __builtin_arm_mcrr2(15, 0, a, 0);
+}
+
+uint64_t mrrc() {
+  // CHECK: define i64 @mrrc()
+  // CHECK: call { i32, i32 } @llvm.arm.mrrc(i32 15, i32 0, i32 0)
+  return __builtin_arm_mrrc(15, 0, 0);
+}
+
+uint64_t mrrc2() {
+  // CHECK: define i64 @mrrc2()
+  // CHECK: call { i32, i32 } @llvm.arm.mrrc2(i32 15, i32 0, i32 0)
+  return __builtin_arm_mrrc2(15, 0, 0);
 }
 
 unsigned rsr() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !7)
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M0:.*]])
   // CHECK-NEXT: ret i32 [[V0]]
   return __builtin_arm_rsr("cp1:2:c3:c4:5");
 }
 
 unsigned long long rsr64() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata !8)
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M1:.*]])
   // CHECK-NEXT: ret i64 [[V0]]
   return __builtin_arm_rsr64("cp1:2:c3");
 }
 
 void *rsrp() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !9)
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M2:.*]])
   // CHECK-NEXT: [[V1:[%A-Za-z0-9.]+]] = inttoptr i32 [[V0]] to i8*
   // CHECK-NEXT: ret i8* [[V1]]
   return __builtin_arm_rsrp("sysreg");
 }
 
 void wsr(unsigned v) {
-  // CHECK: call void @llvm.write_register.i32(metadata !7, i32 %v)
+  // CHECK: call void @llvm.write_register.i32(metadata ![[M0]], i32 %v)
   __builtin_arm_wsr("cp1:2:c3:c4:5", v);
 }
 
 void wsr64(unsigned long long v) {
-  // CHECK: call void @llvm.write_register.i64(metadata !8, i64 %v)
+  // CHECK: call void @llvm.write_register.i64(metadata ![[M1]], i64 %v)
   __builtin_arm_wsr64("cp1:2:c3", v);
 }
 
 void wsrp(void *v) {
   // CHECK: [[V0:[%A-Za-z0-9.]+]] = ptrtoint i8* %v to i32
-  // CHECK-NEXT: call void @llvm.write_register.i32(metadata !9, i32 [[V0]])
+  // CHECK-NEXT: call void @llvm.write_register.i32(metadata ![[M2]], i32 [[V0]])
   __builtin_arm_wsrp("sysreg", v);
 }
 
-// CHECK: !7 = !{!"cp1:2:c3:c4:5"}
-// CHECK: !8 = !{!"cp1:2:c3"}
-// CHECK: !9 = !{!"sysreg"}
+// CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"}
+// CHECK: ![[M1]] = !{!"cp1:2:c3"}
+// CHECK: ![[M2]] = !{!"sysreg"}
diff --git a/test/CodeGen/builtins-arm64.c b/test/CodeGen/builtins-arm64.c
index 16e22d7..20eb2ab 100644
--- a/test/CodeGen/builtins-arm64.c
+++ b/test/CodeGen/builtins-arm64.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-unknown-linux -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 void f0(void *a, void *b) {
 	__clear_cache(a,b);
@@ -7,7 +7,7 @@
 
 void *tp (void) {
   return __builtin_thread_pointer ();
-// CHECK: call {{.*}} @llvm.aarch64.thread.pointer()
+// CHECK: call {{.*}} @llvm.thread.pointer()
 }
 
 // CHECK: call {{.*}} @llvm.aarch64.rbit.i32(i32 %a)
@@ -50,7 +50,7 @@
 }
 
 unsigned rsr() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
   // CHECK-NEXT: trunc i64 [[V0]] to i32
   return __builtin_arm_rsr("1:2:3:4:5");
 }
@@ -61,7 +61,7 @@
 }
 
 void *rsrp() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
   // CHECK-NEXT: inttoptr i64 [[V0]] to i8*
   return __builtin_arm_rsrp("1:2:3:4:5");
 }
diff --git a/test/CodeGen/builtins-hexagon.c b/test/CodeGen/builtins-hexagon.c
new file mode 100644
index 0000000..e2eda2a
--- /dev/null
+++ b/test/CodeGen/builtins-hexagon.c
@@ -0,0 +1,2965 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -emit-llvm %s -o - | FileCheck %s
+
+void foo() {
+  int v16 __attribute__((__vector_size__(64)));
+  int v32 __attribute__((__vector_size__(128)));
+  int v64 __attribute__((__vector_size__(256)));
+
+  // The circ/brev intrinsics do not have _HEXAGON_ in the name.
+  __builtin_brev_ldb(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldb
+  __builtin_brev_ldd(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldd
+  __builtin_brev_ldh(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldh
+  __builtin_brev_ldub(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldub
+  __builtin_brev_lduh(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.lduh
+  __builtin_brev_ldw(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldw
+  __builtin_brev_stb(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.stb
+  __builtin_brev_std(0, 0LL, 0);
+  // CHECK: @llvm.hexagon.brev.std
+  __builtin_brev_sth(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.sth
+  __builtin_brev_sthhi(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.sthhi
+  __builtin_brev_stw(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.stw
+  __builtin_circ_ldb(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldb
+  __builtin_circ_ldd(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldd
+  __builtin_circ_ldh(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldh
+  __builtin_circ_ldub(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldub
+  __builtin_circ_lduh(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.lduh
+  __builtin_circ_ldw(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldw
+  __builtin_circ_stb(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.stb
+  __builtin_circ_std(0, 0LL, 0, 0);
+  // CHECK: llvm.hexagon.circ.std
+  __builtin_circ_sth(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.sth
+  __builtin_circ_sthhi(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.sthhi
+  __builtin_circ_stw(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.stw
+
+  __builtin_HEXAGON_A2_abs(0);
+  // CHECK: @llvm.hexagon.A2.abs
+  __builtin_HEXAGON_A2_absp(0);
+  // CHECK: @llvm.hexagon.A2.absp
+  __builtin_HEXAGON_A2_abssat(0);
+  // CHECK: @llvm.hexagon.A2.abssat
+  __builtin_HEXAGON_A2_add(0, 0);
+  // CHECK: @llvm.hexagon.A2.add
+  __builtin_HEXAGON_A2_addh_h16_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.hh
+  __builtin_HEXAGON_A2_addh_h16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.hl
+  __builtin_HEXAGON_A2_addh_h16_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.lh
+  __builtin_HEXAGON_A2_addh_h16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.ll
+  __builtin_HEXAGON_A2_addh_h16_sat_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.hh
+  __builtin_HEXAGON_A2_addh_h16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.hl
+  __builtin_HEXAGON_A2_addh_h16_sat_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.lh
+  __builtin_HEXAGON_A2_addh_h16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.ll
+  __builtin_HEXAGON_A2_addh_l16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.hl
+  __builtin_HEXAGON_A2_addh_l16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.ll
+  __builtin_HEXAGON_A2_addh_l16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.sat.hl
+  __builtin_HEXAGON_A2_addh_l16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.sat.ll
+  __builtin_HEXAGON_A2_addi(0, 0);
+  // CHECK: @llvm.hexagon.A2.addi
+  __builtin_HEXAGON_A2_addp(0, 0);
+  // CHECK: @llvm.hexagon.A2.addp
+  __builtin_HEXAGON_A2_addpsat(0, 0);
+  // CHECK: @llvm.hexagon.A2.addpsat
+  __builtin_HEXAGON_A2_addsat(0, 0);
+  // CHECK: @llvm.hexagon.A2.addsat
+  __builtin_HEXAGON_A2_addsp(0, 0);
+  // CHECK: @llvm.hexagon.A2.addsp
+  __builtin_HEXAGON_A2_and(0, 0);
+  // CHECK: @llvm.hexagon.A2.and
+  __builtin_HEXAGON_A2_andir(0, 0);
+  // CHECK: @llvm.hexagon.A2.andir
+  __builtin_HEXAGON_A2_andp(0, 0);
+  // CHECK: @llvm.hexagon.A2.andp
+  __builtin_HEXAGON_A2_aslh(0);
+  // CHECK: @llvm.hexagon.A2.aslh
+  __builtin_HEXAGON_A2_asrh(0);
+  // CHECK: @llvm.hexagon.A2.asrh
+  __builtin_HEXAGON_A2_combine_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.hh
+  __builtin_HEXAGON_A2_combine_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.hl
+  __builtin_HEXAGON_A2_combineii(0, 0);
+  // CHECK: @llvm.hexagon.A2.combineii
+  __builtin_HEXAGON_A2_combine_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.lh
+  __builtin_HEXAGON_A2_combine_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.ll
+  __builtin_HEXAGON_A2_combinew(0, 0);
+  // CHECK: @llvm.hexagon.A2.combinew
+  __builtin_HEXAGON_A2_max(0, 0);
+  // CHECK: @llvm.hexagon.A2.max
+  __builtin_HEXAGON_A2_maxp(0, 0);
+  // CHECK: @llvm.hexagon.A2.maxp
+  __builtin_HEXAGON_A2_maxu(0, 0);
+  // CHECK: @llvm.hexagon.A2.maxu
+  __builtin_HEXAGON_A2_maxup(0, 0);
+  // CHECK: @llvm.hexagon.A2.maxup
+  __builtin_HEXAGON_A2_min(0, 0);
+  // CHECK: @llvm.hexagon.A2.min
+  __builtin_HEXAGON_A2_minp(0, 0);
+  // CHECK: @llvm.hexagon.A2.minp
+  __builtin_HEXAGON_A2_minu(0, 0);
+  // CHECK: @llvm.hexagon.A2.minu
+  __builtin_HEXAGON_A2_minup(0, 0);
+  // CHECK: @llvm.hexagon.A2.minup
+  __builtin_HEXAGON_A2_neg(0);
+  // CHECK: @llvm.hexagon.A2.neg
+  __builtin_HEXAGON_A2_negp(0);
+  // CHECK: @llvm.hexagon.A2.negp
+  __builtin_HEXAGON_A2_negsat(0);
+  // CHECK: @llvm.hexagon.A2.negsat
+  __builtin_HEXAGON_A2_not(0);
+  // CHECK: @llvm.hexagon.A2.not
+  __builtin_HEXAGON_A2_notp(0);
+  // CHECK: @llvm.hexagon.A2.notp
+  __builtin_HEXAGON_A2_or(0, 0);
+  // CHECK: @llvm.hexagon.A2.or
+  __builtin_HEXAGON_A2_orir(0, 0);
+  // CHECK: @llvm.hexagon.A2.orir
+  __builtin_HEXAGON_A2_orp(0, 0);
+  // CHECK: @llvm.hexagon.A2.orp
+  __builtin_HEXAGON_A2_roundsat(0);
+  // CHECK: @llvm.hexagon.A2.roundsat
+  __builtin_HEXAGON_A2_sat(0);
+  // CHECK: @llvm.hexagon.A2.sat
+  __builtin_HEXAGON_A2_satb(0);
+  // CHECK: @llvm.hexagon.A2.satb
+  __builtin_HEXAGON_A2_sath(0);
+  // CHECK: @llvm.hexagon.A2.sath
+  __builtin_HEXAGON_A2_satub(0);
+  // CHECK: @llvm.hexagon.A2.satub
+  __builtin_HEXAGON_A2_satuh(0);
+  // CHECK: @llvm.hexagon.A2.satuh
+  __builtin_HEXAGON_A2_sub(0, 0);
+  // CHECK: @llvm.hexagon.A2.sub
+  __builtin_HEXAGON_A2_subh_h16_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.hh
+  __builtin_HEXAGON_A2_subh_h16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.hl
+  __builtin_HEXAGON_A2_subh_h16_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.lh
+  __builtin_HEXAGON_A2_subh_h16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.ll
+  __builtin_HEXAGON_A2_subh_h16_sat_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.hh
+  __builtin_HEXAGON_A2_subh_h16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.hl
+  __builtin_HEXAGON_A2_subh_h16_sat_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.lh
+  __builtin_HEXAGON_A2_subh_h16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.ll
+  __builtin_HEXAGON_A2_subh_l16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.hl
+  __builtin_HEXAGON_A2_subh_l16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.ll
+  __builtin_HEXAGON_A2_subh_l16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.sat.hl
+  __builtin_HEXAGON_A2_subh_l16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.sat.ll
+  __builtin_HEXAGON_A2_subp(0, 0);
+  // CHECK: @llvm.hexagon.A2.subp
+  __builtin_HEXAGON_A2_subri(0, 0);
+  // CHECK: @llvm.hexagon.A2.subri
+  __builtin_HEXAGON_A2_subsat(0, 0);
+  // CHECK: @llvm.hexagon.A2.subsat
+  __builtin_HEXAGON_A2_svaddh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svaddh
+  __builtin_HEXAGON_A2_svaddhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svaddhs
+  __builtin_HEXAGON_A2_svadduhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svadduhs
+  __builtin_HEXAGON_A2_svavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svavgh
+  __builtin_HEXAGON_A2_svavghs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svavghs
+  __builtin_HEXAGON_A2_svnavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svnavgh
+  __builtin_HEXAGON_A2_svsubh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svsubh
+  __builtin_HEXAGON_A2_svsubhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svsubhs
+  __builtin_HEXAGON_A2_svsubuhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svsubuhs
+  __builtin_HEXAGON_A2_swiz(0);
+  // CHECK: @llvm.hexagon.A2.swiz
+  __builtin_HEXAGON_A2_sxtb(0);
+  // CHECK: @llvm.hexagon.A2.sxtb
+  __builtin_HEXAGON_A2_sxth(0);
+  // CHECK: @llvm.hexagon.A2.sxth
+  __builtin_HEXAGON_A2_sxtw(0);
+  // CHECK: @llvm.hexagon.A2.sxtw
+  __builtin_HEXAGON_A2_tfr(0);
+  // CHECK: @llvm.hexagon.A2.tfr
+  __builtin_HEXAGON_A2_tfrih(0, 0);
+  // CHECK: @llvm.hexagon.A2.tfrih
+  __builtin_HEXAGON_A2_tfril(0, 0);
+  // CHECK: @llvm.hexagon.A2.tfril
+  __builtin_HEXAGON_A2_tfrp(0);
+  // CHECK: @llvm.hexagon.A2.tfrp
+  __builtin_HEXAGON_A2_tfrpi(0);
+  // CHECK: @llvm.hexagon.A2.tfrpi
+  __builtin_HEXAGON_A2_tfrsi(0);
+  // CHECK: @llvm.hexagon.A2.tfrsi
+  __builtin_HEXAGON_A2_vabsh(0);
+  // CHECK: @llvm.hexagon.A2.vabsh
+  __builtin_HEXAGON_A2_vabshsat(0);
+  // CHECK: @llvm.hexagon.A2.vabshsat
+  __builtin_HEXAGON_A2_vabsw(0);
+  // CHECK: @llvm.hexagon.A2.vabsw
+  __builtin_HEXAGON_A2_vabswsat(0);
+  // CHECK: @llvm.hexagon.A2.vabswsat
+  __builtin_HEXAGON_A2_vaddb_map(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddb.map
+  __builtin_HEXAGON_A2_vaddh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddh
+  __builtin_HEXAGON_A2_vaddhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddhs
+  __builtin_HEXAGON_A2_vaddub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddub
+  __builtin_HEXAGON_A2_vaddubs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddubs
+  __builtin_HEXAGON_A2_vadduhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vadduhs
+  __builtin_HEXAGON_A2_vaddw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddw
+  __builtin_HEXAGON_A2_vaddws(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddws
+  __builtin_HEXAGON_A2_vavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgh
+  __builtin_HEXAGON_A2_vavghcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavghcr
+  __builtin_HEXAGON_A2_vavghr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavghr
+  __builtin_HEXAGON_A2_vavgub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgub
+  __builtin_HEXAGON_A2_vavgubr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgubr
+  __builtin_HEXAGON_A2_vavguh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguh
+  __builtin_HEXAGON_A2_vavguhr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguhr
+  __builtin_HEXAGON_A2_vavguw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguw
+  __builtin_HEXAGON_A2_vavguwr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguwr
+  __builtin_HEXAGON_A2_vavgw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgw
+  __builtin_HEXAGON_A2_vavgwcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgwcr
+  __builtin_HEXAGON_A2_vavgwr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgwr
+  __builtin_HEXAGON_A2_vcmpbeq(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpbeq
+  __builtin_HEXAGON_A2_vcmpbgtu(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpbgtu
+  __builtin_HEXAGON_A2_vcmpheq(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpheq
+  __builtin_HEXAGON_A2_vcmphgt(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmphgt
+  __builtin_HEXAGON_A2_vcmphgtu(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmphgtu
+  __builtin_HEXAGON_A2_vcmpweq(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpweq
+  __builtin_HEXAGON_A2_vcmpwgt(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpwgt
+  __builtin_HEXAGON_A2_vcmpwgtu(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpwgtu
+  __builtin_HEXAGON_A2_vconj(0);
+  // CHECK: @llvm.hexagon.A2.vconj
+  __builtin_HEXAGON_A2_vmaxb(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxb
+  __builtin_HEXAGON_A2_vmaxh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxh
+  __builtin_HEXAGON_A2_vmaxub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxub
+  __builtin_HEXAGON_A2_vmaxuh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxuh
+  __builtin_HEXAGON_A2_vmaxuw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxuw
+  __builtin_HEXAGON_A2_vmaxw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxw
+  __builtin_HEXAGON_A2_vminb(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminb
+  __builtin_HEXAGON_A2_vminh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminh
+  __builtin_HEXAGON_A2_vminub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminub
+  __builtin_HEXAGON_A2_vminuh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminuh
+  __builtin_HEXAGON_A2_vminuw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminuw
+  __builtin_HEXAGON_A2_vminw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminw
+  __builtin_HEXAGON_A2_vnavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgh
+  __builtin_HEXAGON_A2_vnavghcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavghcr
+  __builtin_HEXAGON_A2_vnavghr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavghr
+  __builtin_HEXAGON_A2_vnavgw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgw
+  __builtin_HEXAGON_A2_vnavgwcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgwcr
+  __builtin_HEXAGON_A2_vnavgwr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgwr
+  __builtin_HEXAGON_A2_vraddub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vraddub
+  __builtin_HEXAGON_A2_vraddub_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.A2.vraddub.acc
+  __builtin_HEXAGON_A2_vrsadub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vrsadub
+  __builtin_HEXAGON_A2_vrsadub_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.A2.vrsadub.acc
+  __builtin_HEXAGON_A2_vsubb_map(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubb.map
+  __builtin_HEXAGON_A2_vsubh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubh
+  __builtin_HEXAGON_A2_vsubhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubhs
+  __builtin_HEXAGON_A2_vsubub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubub
+  __builtin_HEXAGON_A2_vsububs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsububs
+  __builtin_HEXAGON_A2_vsubuhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubuhs
+  __builtin_HEXAGON_A2_vsubw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubw
+  __builtin_HEXAGON_A2_vsubws(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubws
+  __builtin_HEXAGON_A2_xor(0, 0);
+  // CHECK: @llvm.hexagon.A2.xor
+  __builtin_HEXAGON_A2_xorp(0, 0);
+  // CHECK: @llvm.hexagon.A2.xorp
+  __builtin_HEXAGON_A2_zxtb(0);
+  // CHECK: @llvm.hexagon.A2.zxtb
+  __builtin_HEXAGON_A2_zxth(0);
+  // CHECK: @llvm.hexagon.A2.zxth
+  __builtin_HEXAGON_A4_andn(0, 0);
+  // CHECK: @llvm.hexagon.A4.andn
+  __builtin_HEXAGON_A4_andnp(0, 0);
+  // CHECK: @llvm.hexagon.A4.andnp
+  __builtin_HEXAGON_A4_bitsplit(0, 0);
+  // CHECK: @llvm.hexagon.A4.bitsplit
+  __builtin_HEXAGON_A4_bitspliti(0, 0);
+  // CHECK: @llvm.hexagon.A4.bitspliti
+  __builtin_HEXAGON_A4_boundscheck(0, 0);
+  // CHECK: @llvm.hexagon.A4.boundscheck
+  __builtin_HEXAGON_A4_cmpbeq(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbeq
+  __builtin_HEXAGON_A4_cmpbeqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbeqi
+  __builtin_HEXAGON_A4_cmpbgt(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgt
+  __builtin_HEXAGON_A4_cmpbgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgti
+  __builtin_HEXAGON_A4_cmpbgtu(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgtu
+  __builtin_HEXAGON_A4_cmpbgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgtui
+  __builtin_HEXAGON_A4_cmpheq(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpheq
+  __builtin_HEXAGON_A4_cmpheqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpheqi
+  __builtin_HEXAGON_A4_cmphgt(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgt
+  __builtin_HEXAGON_A4_cmphgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgti
+  __builtin_HEXAGON_A4_cmphgtu(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgtu
+  __builtin_HEXAGON_A4_cmphgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgtui
+  __builtin_HEXAGON_A4_combineir(0, 0);
+  // CHECK: @llvm.hexagon.A4.combineir
+  __builtin_HEXAGON_A4_combineri(0, 0);
+  // CHECK: @llvm.hexagon.A4.combineri
+  __builtin_HEXAGON_A4_cround_ri(0, 0);
+  // CHECK: @llvm.hexagon.A4.cround.ri
+  __builtin_HEXAGON_A4_cround_rr(0, 0);
+  // CHECK: @llvm.hexagon.A4.cround.rr
+  __builtin_HEXAGON_A4_modwrapu(0, 0);
+  // CHECK: @llvm.hexagon.A4.modwrapu
+  __builtin_HEXAGON_A4_orn(0, 0);
+  // CHECK: @llvm.hexagon.A4.orn
+  __builtin_HEXAGON_A4_ornp(0, 0);
+  // CHECK: @llvm.hexagon.A4.ornp
+  __builtin_HEXAGON_A4_rcmpeq(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpeq
+  __builtin_HEXAGON_A4_rcmpeqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpeqi
+  __builtin_HEXAGON_A4_rcmpneq(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpneq
+  __builtin_HEXAGON_A4_rcmpneqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpneqi
+  __builtin_HEXAGON_A4_round_ri(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.ri
+  __builtin_HEXAGON_A4_round_ri_sat(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.ri.sat
+  __builtin_HEXAGON_A4_round_rr(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.rr
+  __builtin_HEXAGON_A4_round_rr_sat(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.rr.sat
+  __builtin_HEXAGON_A4_tlbmatch(0, 0);
+  // CHECK: @llvm.hexagon.A4.tlbmatch
+  __builtin_HEXAGON_A4_vcmpbeq_any(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbeq.any
+  __builtin_HEXAGON_A4_vcmpbeqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbeqi
+  __builtin_HEXAGON_A4_vcmpbgt(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbgt
+  __builtin_HEXAGON_A4_vcmpbgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbgti
+  __builtin_HEXAGON_A4_vcmpbgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbgtui
+  __builtin_HEXAGON_A4_vcmpheqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpheqi
+  __builtin_HEXAGON_A4_vcmphgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmphgti
+  __builtin_HEXAGON_A4_vcmphgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmphgtui
+  __builtin_HEXAGON_A4_vcmpweqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpweqi
+  __builtin_HEXAGON_A4_vcmpwgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpwgti
+  __builtin_HEXAGON_A4_vcmpwgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpwgtui
+  __builtin_HEXAGON_A4_vrmaxh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxh
+  __builtin_HEXAGON_A4_vrmaxuh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxuh
+  __builtin_HEXAGON_A4_vrmaxuw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxuw
+  __builtin_HEXAGON_A4_vrmaxw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxw
+  __builtin_HEXAGON_A4_vrminh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminh
+  __builtin_HEXAGON_A4_vrminuh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminuh
+  __builtin_HEXAGON_A4_vrminuw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminuw
+  __builtin_HEXAGON_A4_vrminw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminw
+  __builtin_HEXAGON_A5_vaddhubs(0, 0);
+  // CHECK: @llvm.hexagon.A5.vaddhubs
+  __builtin_HEXAGON_C2_all8(0);
+  // CHECK: @llvm.hexagon.C2.all8
+  __builtin_HEXAGON_C2_and(0, 0);
+  // CHECK: @llvm.hexagon.C2.and
+  __builtin_HEXAGON_C2_andn(0, 0);
+  // CHECK: @llvm.hexagon.C2.andn
+  __builtin_HEXAGON_C2_any8(0);
+  // CHECK: @llvm.hexagon.C2.any8
+  __builtin_HEXAGON_C2_bitsclr(0, 0);
+  // CHECK: @llvm.hexagon.C2.bitsclr
+  __builtin_HEXAGON_C2_bitsclri(0, 0);
+  // CHECK: @llvm.hexagon.C2.bitsclri
+  __builtin_HEXAGON_C2_bitsset(0, 0);
+  // CHECK: @llvm.hexagon.C2.bitsset
+  __builtin_HEXAGON_C2_cmpeq(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpeq
+  __builtin_HEXAGON_C2_cmpeqi(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpeqi
+  __builtin_HEXAGON_C2_cmpeqp(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpeqp
+  __builtin_HEXAGON_C2_cmpgei(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgei
+  __builtin_HEXAGON_C2_cmpgeui(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgeui
+  __builtin_HEXAGON_C2_cmpgt(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgt
+  __builtin_HEXAGON_C2_cmpgti(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgti
+  __builtin_HEXAGON_C2_cmpgtp(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtp
+  __builtin_HEXAGON_C2_cmpgtu(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtu
+  __builtin_HEXAGON_C2_cmpgtui(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtui
+  __builtin_HEXAGON_C2_cmpgtup(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtup
+  __builtin_HEXAGON_C2_cmplt(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmplt
+  __builtin_HEXAGON_C2_cmpltu(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpltu
+  __builtin_HEXAGON_C2_mask(0);
+  // CHECK: @llvm.hexagon.C2.mask
+  __builtin_HEXAGON_C2_mux(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.mux
+  __builtin_HEXAGON_C2_muxii(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.muxii
+  __builtin_HEXAGON_C2_muxir(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.muxir
+  __builtin_HEXAGON_C2_muxri(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.muxri
+  __builtin_HEXAGON_C2_not(0);
+  // CHECK: @llvm.hexagon.C2.not
+  __builtin_HEXAGON_C2_or (0, 0);
+  // CHECK: @llvm.hexagon.C2.or 
+  __builtin_HEXAGON_C2_orn(0, 0);
+  // CHECK: @llvm.hexagon.C2.orn
+  __builtin_HEXAGON_C2_pxfer_map(0);
+  // CHECK: @llvm.hexagon.C2.pxfer.map
+  __builtin_HEXAGON_C2_tfrpr(0);
+  // CHECK: @llvm.hexagon.C2.tfrpr
+  __builtin_HEXAGON_C2_tfrrp(0);
+  // CHECK: @llvm.hexagon.C2.tfrrp
+  __builtin_HEXAGON_C2_vitpack(0, 0);
+  // CHECK: @llvm.hexagon.C2.vitpack
+  __builtin_HEXAGON_C2_vmux(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.vmux
+  __builtin_HEXAGON_C2_xor(0, 0);
+  // CHECK: @llvm.hexagon.C2.xor
+  __builtin_HEXAGON_C4_and_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.and
+  __builtin_HEXAGON_C4_and_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.andn
+  __builtin_HEXAGON_C4_and_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.or
+  __builtin_HEXAGON_C4_and_orn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.orn
+  __builtin_HEXAGON_C4_cmplte(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmplte
+  __builtin_HEXAGON_C4_cmpltei(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmpltei
+  __builtin_HEXAGON_C4_cmplteu(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmplteu
+  __builtin_HEXAGON_C4_cmplteui(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmplteui
+  __builtin_HEXAGON_C4_cmpneq(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmpneq
+  __builtin_HEXAGON_C4_cmpneqi(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmpneqi
+  __builtin_HEXAGON_C4_fastcorner9(0, 0);
+  // CHECK: @llvm.hexagon.C4.fastcorner9
+  __builtin_HEXAGON_C4_fastcorner9_not(0, 0);
+  // CHECK: @llvm.hexagon.C4.fastcorner9.not
+  __builtin_HEXAGON_C4_nbitsclr(0, 0);
+  // CHECK: @llvm.hexagon.C4.nbitsclr
+  __builtin_HEXAGON_C4_nbitsclri(0, 0);
+  // CHECK: @llvm.hexagon.C4.nbitsclri
+  __builtin_HEXAGON_C4_nbitsset(0, 0);
+  // CHECK: @llvm.hexagon.C4.nbitsset
+  __builtin_HEXAGON_C4_or_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.and
+  __builtin_HEXAGON_C4_or_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.andn
+  __builtin_HEXAGON_C4_or_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.or
+  __builtin_HEXAGON_C4_or_orn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.orn
+  __builtin_HEXAGON_F2_conv_d2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.d2df
+  __builtin_HEXAGON_F2_conv_d2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.d2sf
+  __builtin_HEXAGON_F2_conv_df2d(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2d
+  __builtin_HEXAGON_F2_conv_df2d_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2d.chop
+  __builtin_HEXAGON_F2_conv_df2sf(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2sf
+  __builtin_HEXAGON_F2_conv_df2ud(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2ud
+  __builtin_HEXAGON_F2_conv_df2ud_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2ud.chop
+  __builtin_HEXAGON_F2_conv_df2uw(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2uw
+  __builtin_HEXAGON_F2_conv_df2uw_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2uw.chop
+  __builtin_HEXAGON_F2_conv_df2w(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2w
+  __builtin_HEXAGON_F2_conv_df2w_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2w.chop
+  __builtin_HEXAGON_F2_conv_sf2d(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2d
+  __builtin_HEXAGON_F2_conv_sf2d_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2d.chop
+  __builtin_HEXAGON_F2_conv_sf2df(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2df
+  __builtin_HEXAGON_F2_conv_sf2ud(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2ud
+  __builtin_HEXAGON_F2_conv_sf2ud_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2ud.chop
+  __builtin_HEXAGON_F2_conv_sf2uw(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2uw
+  __builtin_HEXAGON_F2_conv_sf2uw_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2uw.chop
+  __builtin_HEXAGON_F2_conv_sf2w(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2w
+  __builtin_HEXAGON_F2_conv_sf2w_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2w.chop
+  __builtin_HEXAGON_F2_conv_ud2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.ud2df
+  __builtin_HEXAGON_F2_conv_ud2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.ud2sf
+  __builtin_HEXAGON_F2_conv_uw2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.uw2df
+  __builtin_HEXAGON_F2_conv_uw2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.uw2sf
+  __builtin_HEXAGON_F2_conv_w2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.w2df
+  __builtin_HEXAGON_F2_conv_w2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.w2sf
+  __builtin_HEXAGON_F2_dfclass(0.0, 0);
+  // CHECK: @llvm.hexagon.F2.dfclass
+  __builtin_HEXAGON_F2_dfcmpeq(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpeq
+  __builtin_HEXAGON_F2_dfcmpge(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpge
+  __builtin_HEXAGON_F2_dfcmpgt(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpgt
+  __builtin_HEXAGON_F2_dfcmpuo(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpuo
+  __builtin_HEXAGON_F2_dfimm_n(0);
+  // CHECK: @llvm.hexagon.F2.dfimm.n
+  __builtin_HEXAGON_F2_dfimm_p(0);
+  // CHECK: @llvm.hexagon.F2.dfimm.p
+  __builtin_HEXAGON_F2_sfadd(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfadd
+  __builtin_HEXAGON_F2_sfclass(0.0f, 0);
+  // CHECK: @llvm.hexagon.F2.sfclass
+  __builtin_HEXAGON_F2_sfcmpeq(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpeq
+  __builtin_HEXAGON_F2_sfcmpge(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpge
+  __builtin_HEXAGON_F2_sfcmpgt(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpgt
+  __builtin_HEXAGON_F2_sfcmpuo(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpuo
+  __builtin_HEXAGON_F2_sffixupd(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffixupd
+  __builtin_HEXAGON_F2_sffixupn(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffixupn
+  __builtin_HEXAGON_F2_sffixupr(0.0f);
+  // CHECK: @llvm.hexagon.F2.sffixupr
+  __builtin_HEXAGON_F2_sffma(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffma
+  __builtin_HEXAGON_F2_sffma_lib(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffma.lib
+  __builtin_HEXAGON_F2_sffma_sc(0.0f, 0.0f, 0.0f, 0);
+  // CHECK: @llvm.hexagon.F2.sffma.sc
+  __builtin_HEXAGON_F2_sffms(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffms
+  __builtin_HEXAGON_F2_sffms_lib(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffms.lib
+  __builtin_HEXAGON_F2_sfimm_n(0);
+  // CHECK: @llvm.hexagon.F2.sfimm.n
+  __builtin_HEXAGON_F2_sfimm_p(0);
+  // CHECK: @llvm.hexagon.F2.sfimm.p
+  __builtin_HEXAGON_F2_sfmax(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfmax
+  __builtin_HEXAGON_F2_sfmin(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfmin
+  __builtin_HEXAGON_F2_sfmpy(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfmpy
+  __builtin_HEXAGON_F2_sfsub(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfsub
+  __builtin_HEXAGON_M2_acci(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.acci
+  __builtin_HEXAGON_M2_accii(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.accii
+  __builtin_HEXAGON_M2_cmaci_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmaci.s0
+  __builtin_HEXAGON_M2_cmacr_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacr.s0
+  __builtin_HEXAGON_M2_cmacsc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacsc.s0
+  __builtin_HEXAGON_M2_cmacsc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacsc.s1
+  __builtin_HEXAGON_M2_cmacs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacs.s0
+  __builtin_HEXAGON_M2_cmacs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacs.s1
+  __builtin_HEXAGON_M2_cmpyi_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyi.s0
+  __builtin_HEXAGON_M2_cmpyr_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyr.s0
+  __builtin_HEXAGON_M2_cmpyrsc_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrsc.s0
+  __builtin_HEXAGON_M2_cmpyrsc_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrsc.s1
+  __builtin_HEXAGON_M2_cmpyrs_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrs.s0
+  __builtin_HEXAGON_M2_cmpyrs_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrs.s1
+  __builtin_HEXAGON_M2_cmpysc_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpysc.s0
+  __builtin_HEXAGON_M2_cmpysc_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpysc.s1
+  __builtin_HEXAGON_M2_cmpys_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpys.s0
+  __builtin_HEXAGON_M2_cmpys_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpys.s1
+  __builtin_HEXAGON_M2_cnacsc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacsc.s0
+  __builtin_HEXAGON_M2_cnacsc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacsc.s1
+  __builtin_HEXAGON_M2_cnacs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacs.s0
+  __builtin_HEXAGON_M2_cnacs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacs.s1
+  __builtin_HEXAGON_M2_dpmpyss_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.acc.s0
+  __builtin_HEXAGON_M2_dpmpyss_nac_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.nac.s0
+  __builtin_HEXAGON_M2_dpmpyss_rnd_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.rnd.s0
+  __builtin_HEXAGON_M2_dpmpyss_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.s0
+  __builtin_HEXAGON_M2_dpmpyuu_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyuu.acc.s0
+  __builtin_HEXAGON_M2_dpmpyuu_nac_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyuu.nac.s0
+  __builtin_HEXAGON_M2_dpmpyuu_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyuu.s0
+  __builtin_HEXAGON_M2_hmmpyh_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyh.rs1
+  __builtin_HEXAGON_M2_hmmpyh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyh.s1
+  __builtin_HEXAGON_M2_hmmpyl_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyl.rs1
+  __builtin_HEXAGON_M2_hmmpyl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyl.s1
+  __builtin_HEXAGON_M2_maci(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.maci
+  __builtin_HEXAGON_M2_macsin(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.macsin
+  __builtin_HEXAGON_M2_macsip(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.macsip
+  __builtin_HEXAGON_M2_mmachs_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.rs0
+  __builtin_HEXAGON_M2_mmachs_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.rs1
+  __builtin_HEXAGON_M2_mmachs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.s0
+  __builtin_HEXAGON_M2_mmachs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.s1
+  __builtin_HEXAGON_M2_mmacls_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.rs0
+  __builtin_HEXAGON_M2_mmacls_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.rs1
+  __builtin_HEXAGON_M2_mmacls_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.s0
+  __builtin_HEXAGON_M2_mmacls_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.s1
+  __builtin_HEXAGON_M2_mmacuhs_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.rs0
+  __builtin_HEXAGON_M2_mmacuhs_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.rs1
+  __builtin_HEXAGON_M2_mmacuhs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.s0
+  __builtin_HEXAGON_M2_mmacuhs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.s1
+  __builtin_HEXAGON_M2_mmaculs_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.rs0
+  __builtin_HEXAGON_M2_mmaculs_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.rs1
+  __builtin_HEXAGON_M2_mmaculs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.s0
+  __builtin_HEXAGON_M2_mmaculs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.s1
+  __builtin_HEXAGON_M2_mmpyh_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.rs0
+  __builtin_HEXAGON_M2_mmpyh_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.rs1
+  __builtin_HEXAGON_M2_mmpyh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.s0
+  __builtin_HEXAGON_M2_mmpyh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.s1
+  __builtin_HEXAGON_M2_mmpyl_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.rs0
+  __builtin_HEXAGON_M2_mmpyl_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.rs1
+  __builtin_HEXAGON_M2_mmpyl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.s0
+  __builtin_HEXAGON_M2_mmpyl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.s1
+  __builtin_HEXAGON_M2_mmpyuh_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.rs0
+  __builtin_HEXAGON_M2_mmpyuh_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.rs1
+  __builtin_HEXAGON_M2_mmpyuh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.s0
+  __builtin_HEXAGON_M2_mmpyuh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.s1
+  __builtin_HEXAGON_M2_mmpyul_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.rs0
+  __builtin_HEXAGON_M2_mmpyul_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.rs1
+  __builtin_HEXAGON_M2_mmpyul_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.s0
+  __builtin_HEXAGON_M2_mmpyul_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.s1
+  __builtin_HEXAGON_M2_mpy_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hh.s0
+  __builtin_HEXAGON_M2_mpy_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hh.s1
+  __builtin_HEXAGON_M2_mpy_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hl.s0
+  __builtin_HEXAGON_M2_mpy_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hl.s1
+  __builtin_HEXAGON_M2_mpy_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.lh.s0
+  __builtin_HEXAGON_M2_mpy_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.lh.s1
+  __builtin_HEXAGON_M2_mpy_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.ll.s0
+  __builtin_HEXAGON_M2_mpy_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.ll.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hh.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hh.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hl.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hl.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.lh.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.lh.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.ll.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.ll.s1
+  __builtin_HEXAGON_M2_mpyd_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hh.s0
+  __builtin_HEXAGON_M2_mpyd_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hh.s1
+  __builtin_HEXAGON_M2_mpyd_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hl.s0
+  __builtin_HEXAGON_M2_mpyd_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hl.s1
+  __builtin_HEXAGON_M2_mpyd_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.lh.s0
+  __builtin_HEXAGON_M2_mpyd_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.lh.s1
+  __builtin_HEXAGON_M2_mpyd_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.ll.s0
+  __builtin_HEXAGON_M2_mpyd_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.ll.s1
+  __builtin_HEXAGON_M2_mpyd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hh.s0
+  __builtin_HEXAGON_M2_mpyd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hh.s1
+  __builtin_HEXAGON_M2_mpyd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hl.s0
+  __builtin_HEXAGON_M2_mpyd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hl.s1
+  __builtin_HEXAGON_M2_mpyd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.lh.s0
+  __builtin_HEXAGON_M2_mpyd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.lh.s1
+  __builtin_HEXAGON_M2_mpyd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.ll.s0
+  __builtin_HEXAGON_M2_mpyd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.ll.s1
+  __builtin_HEXAGON_M2_mpyd_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hh.s0
+  __builtin_HEXAGON_M2_mpyd_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hh.s1
+  __builtin_HEXAGON_M2_mpyd_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hl.s0
+  __builtin_HEXAGON_M2_mpyd_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hl.s1
+  __builtin_HEXAGON_M2_mpyd_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.lh.s0
+  __builtin_HEXAGON_M2_mpyd_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.lh.s1
+  __builtin_HEXAGON_M2_mpyd_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.ll.s0
+  __builtin_HEXAGON_M2_mpyd_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.ll.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hh.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hh.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hl.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hl.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.lh.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.lh.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.ll.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.ll.s1
+  __builtin_HEXAGON_M2_mpy_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hh.s0
+  __builtin_HEXAGON_M2_mpy_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hh.s1
+  __builtin_HEXAGON_M2_mpy_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hl.s0
+  __builtin_HEXAGON_M2_mpy_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hl.s1
+  __builtin_HEXAGON_M2_mpyi(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyi
+  __builtin_HEXAGON_M2_mpy_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.lh.s0
+  __builtin_HEXAGON_M2_mpy_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.lh.s1
+  __builtin_HEXAGON_M2_mpy_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.ll.s0
+  __builtin_HEXAGON_M2_mpy_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.ll.s1
+  __builtin_HEXAGON_M2_mpy_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hh.s0
+  __builtin_HEXAGON_M2_mpy_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hh.s1
+  __builtin_HEXAGON_M2_mpy_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hl.s0
+  __builtin_HEXAGON_M2_mpy_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hl.s1
+  __builtin_HEXAGON_M2_mpy_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.lh.s0
+  __builtin_HEXAGON_M2_mpy_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.lh.s1
+  __builtin_HEXAGON_M2_mpy_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.ll.s0
+  __builtin_HEXAGON_M2_mpy_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.ll.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hh.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hh.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hl.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hl.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.lh.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.lh.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.ll.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.ll.s1
+  __builtin_HEXAGON_M2_mpy_rnd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hh.s0
+  __builtin_HEXAGON_M2_mpy_rnd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hh.s1
+  __builtin_HEXAGON_M2_mpy_rnd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hl.s0
+  __builtin_HEXAGON_M2_mpy_rnd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hl.s1
+  __builtin_HEXAGON_M2_mpy_rnd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.lh.s0
+  __builtin_HEXAGON_M2_mpy_rnd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.lh.s1
+  __builtin_HEXAGON_M2_mpy_rnd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.ll.s0
+  __builtin_HEXAGON_M2_mpy_rnd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.ll.s1
+  __builtin_HEXAGON_M2_mpy_sat_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hh.s0
+  __builtin_HEXAGON_M2_mpy_sat_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hh.s1
+  __builtin_HEXAGON_M2_mpy_sat_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hl.s0
+  __builtin_HEXAGON_M2_mpy_sat_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hl.s1
+  __builtin_HEXAGON_M2_mpy_sat_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.lh.s0
+  __builtin_HEXAGON_M2_mpy_sat_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.lh.s1
+  __builtin_HEXAGON_M2_mpy_sat_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.ll.s0
+  __builtin_HEXAGON_M2_mpy_sat_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.ll.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hh.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hh.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hl.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hl.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.lh.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.lh.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.ll.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.ll.s1
+  __builtin_HEXAGON_M2_mpysmi(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpysmi
+  __builtin_HEXAGON_M2_mpysu_up(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpysu.up
+  __builtin_HEXAGON_M2_mpyu_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hh.s0
+  __builtin_HEXAGON_M2_mpyu_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hh.s1
+  __builtin_HEXAGON_M2_mpyu_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hl.s0
+  __builtin_HEXAGON_M2_mpyu_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hl.s1
+  __builtin_HEXAGON_M2_mpyu_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.lh.s0
+  __builtin_HEXAGON_M2_mpyu_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.lh.s1
+  __builtin_HEXAGON_M2_mpyu_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.ll.s0
+  __builtin_HEXAGON_M2_mpyu_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.ll.s1
+  __builtin_HEXAGON_M2_mpyud_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hh.s0
+  __builtin_HEXAGON_M2_mpyud_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hh.s1
+  __builtin_HEXAGON_M2_mpyud_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hl.s0
+  __builtin_HEXAGON_M2_mpyud_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hl.s1
+  __builtin_HEXAGON_M2_mpyud_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.lh.s0
+  __builtin_HEXAGON_M2_mpyud_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.lh.s1
+  __builtin_HEXAGON_M2_mpyud_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.ll.s0
+  __builtin_HEXAGON_M2_mpyud_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.ll.s1
+  __builtin_HEXAGON_M2_mpyud_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hh.s0
+  __builtin_HEXAGON_M2_mpyud_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hh.s1
+  __builtin_HEXAGON_M2_mpyud_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hl.s0
+  __builtin_HEXAGON_M2_mpyud_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hl.s1
+  __builtin_HEXAGON_M2_mpyud_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.lh.s0
+  __builtin_HEXAGON_M2_mpyud_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.lh.s1
+  __builtin_HEXAGON_M2_mpyud_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.ll.s0
+  __builtin_HEXAGON_M2_mpyud_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.ll.s1
+  __builtin_HEXAGON_M2_mpyud_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hh.s0
+  __builtin_HEXAGON_M2_mpyud_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hh.s1
+  __builtin_HEXAGON_M2_mpyud_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hl.s0
+  __builtin_HEXAGON_M2_mpyud_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hl.s1
+  __builtin_HEXAGON_M2_mpyud_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.lh.s0
+  __builtin_HEXAGON_M2_mpyud_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.lh.s1
+  __builtin_HEXAGON_M2_mpyud_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.ll.s0
+  __builtin_HEXAGON_M2_mpyud_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.ll.s1
+  __builtin_HEXAGON_M2_mpyu_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hh.s0
+  __builtin_HEXAGON_M2_mpyu_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hh.s1
+  __builtin_HEXAGON_M2_mpyu_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hl.s0
+  __builtin_HEXAGON_M2_mpyu_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hl.s1
+  __builtin_HEXAGON_M2_mpyui(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyui
+  __builtin_HEXAGON_M2_mpyu_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.lh.s0
+  __builtin_HEXAGON_M2_mpyu_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.lh.s1
+  __builtin_HEXAGON_M2_mpyu_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.ll.s0
+  __builtin_HEXAGON_M2_mpyu_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.ll.s1
+  __builtin_HEXAGON_M2_mpyu_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hh.s0
+  __builtin_HEXAGON_M2_mpyu_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hh.s1
+  __builtin_HEXAGON_M2_mpyu_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hl.s0
+  __builtin_HEXAGON_M2_mpyu_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hl.s1
+  __builtin_HEXAGON_M2_mpyu_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.lh.s0
+  __builtin_HEXAGON_M2_mpyu_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.lh.s1
+  __builtin_HEXAGON_M2_mpyu_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.ll.s0
+  __builtin_HEXAGON_M2_mpyu_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.ll.s1
+  __builtin_HEXAGON_M2_mpy_up(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.up
+  __builtin_HEXAGON_M2_mpy_up_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.up.s1
+  __builtin_HEXAGON_M2_mpy_up_s1_sat(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.up.s1.sat
+  __builtin_HEXAGON_M2_mpyu_up(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.up
+  __builtin_HEXAGON_M2_nacci(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.nacci
+  __builtin_HEXAGON_M2_naccii(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.naccii
+  __builtin_HEXAGON_M2_subacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.subacc
+  __builtin_HEXAGON_M2_vabsdiffh(0, 0);
+  // CHECK: @llvm.hexagon.M2.vabsdiffh
+  __builtin_HEXAGON_M2_vabsdiffw(0, 0);
+  // CHECK: @llvm.hexagon.M2.vabsdiffw
+  __builtin_HEXAGON_M2_vcmac_s0_sat_i(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmac.s0.sat.i
+  __builtin_HEXAGON_M2_vcmac_s0_sat_r(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmac.s0.sat.r
+  __builtin_HEXAGON_M2_vcmpy_s0_sat_i(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s0.sat.i
+  __builtin_HEXAGON_M2_vcmpy_s0_sat_r(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s0.sat.r
+  __builtin_HEXAGON_M2_vcmpy_s1_sat_i(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s1.sat.i
+  __builtin_HEXAGON_M2_vcmpy_s1_sat_r(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s1.sat.r
+  __builtin_HEXAGON_M2_vdmacs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmacs.s0
+  __builtin_HEXAGON_M2_vdmacs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmacs.s1
+  __builtin_HEXAGON_M2_vdmpyrs_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpyrs.s0
+  __builtin_HEXAGON_M2_vdmpyrs_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpyrs.s1
+  __builtin_HEXAGON_M2_vdmpys_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpys.s0
+  __builtin_HEXAGON_M2_vdmpys_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpys.s1
+  __builtin_HEXAGON_M2_vmac2(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2
+  __builtin_HEXAGON_M2_vmac2es(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2es
+  __builtin_HEXAGON_M2_vmac2es_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2es.s0
+  __builtin_HEXAGON_M2_vmac2es_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2es.s1
+  __builtin_HEXAGON_M2_vmac2s_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2s.s0
+  __builtin_HEXAGON_M2_vmac2s_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2s.s1
+  __builtin_HEXAGON_M2_vmac2su_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2su.s0
+  __builtin_HEXAGON_M2_vmac2su_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2su.s1
+  __builtin_HEXAGON_M2_vmpy2es_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2es.s0
+  __builtin_HEXAGON_M2_vmpy2es_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2es.s1
+  __builtin_HEXAGON_M2_vmpy2s_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s0
+  __builtin_HEXAGON_M2_vmpy2s_s0pack(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s0pack
+  __builtin_HEXAGON_M2_vmpy2s_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s1
+  __builtin_HEXAGON_M2_vmpy2s_s1pack(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s1pack
+  __builtin_HEXAGON_M2_vmpy2su_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2su.s0
+  __builtin_HEXAGON_M2_vmpy2su_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2su.s1
+  __builtin_HEXAGON_M2_vraddh(0, 0);
+  // CHECK: @llvm.hexagon.M2.vraddh
+  __builtin_HEXAGON_M2_vradduh(0, 0);
+  // CHECK: @llvm.hexagon.M2.vradduh
+  __builtin_HEXAGON_M2_vrcmaci_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmaci.s0
+  __builtin_HEXAGON_M2_vrcmaci_s0c(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmaci.s0c
+  __builtin_HEXAGON_M2_vrcmacr_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmacr.s0
+  __builtin_HEXAGON_M2_vrcmacr_s0c(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmacr.s0c
+  __builtin_HEXAGON_M2_vrcmpyi_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyi.s0
+  __builtin_HEXAGON_M2_vrcmpyi_s0c(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyi.s0c
+  __builtin_HEXAGON_M2_vrcmpyr_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyr.s0
+  __builtin_HEXAGON_M2_vrcmpyr_s0c(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyr.s0c
+  __builtin_HEXAGON_M2_vrcmpys_acc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpys.acc.s1
+  __builtin_HEXAGON_M2_vrcmpys_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpys.s1
+  __builtin_HEXAGON_M2_vrcmpys_s1rp(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpys.s1rp
+  __builtin_HEXAGON_M2_vrmac_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrmac.s0
+  __builtin_HEXAGON_M2_vrmpy_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrmpy.s0
+  __builtin_HEXAGON_M2_xor_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.xor.xacc
+  __builtin_HEXAGON_M4_and_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.and
+  __builtin_HEXAGON_M4_and_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.andn
+  __builtin_HEXAGON_M4_and_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.or
+  __builtin_HEXAGON_M4_and_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.xor
+  __builtin_HEXAGON_M4_cmpyi_wh(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyi.wh
+  __builtin_HEXAGON_M4_cmpyi_whc(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyi.whc
+  __builtin_HEXAGON_M4_cmpyr_wh(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyr.wh
+  __builtin_HEXAGON_M4_cmpyr_whc(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyr.whc
+  __builtin_HEXAGON_M4_mac_up_s1_sat(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mac.up.s1.sat
+  __builtin_HEXAGON_M4_mpyri_addi(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyri.addi
+  __builtin_HEXAGON_M4_mpyri_addr(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyri.addr
+  __builtin_HEXAGON_M4_mpyri_addr_u2(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyri.addr.u2
+  __builtin_HEXAGON_M4_mpyrr_addi(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyrr.addi
+  __builtin_HEXAGON_M4_mpyrr_addr(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyrr.addr
+  __builtin_HEXAGON_M4_nac_up_s1_sat(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.nac.up.s1.sat
+  __builtin_HEXAGON_M4_or_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.and
+  __builtin_HEXAGON_M4_or_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.andn
+  __builtin_HEXAGON_M4_or_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.or
+  __builtin_HEXAGON_M4_or_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.xor
+  __builtin_HEXAGON_M4_pmpyw(0, 0);
+  // CHECK: @llvm.hexagon.M4.pmpyw
+  __builtin_HEXAGON_M4_pmpyw_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.pmpyw.acc
+  __builtin_HEXAGON_M4_vpmpyh(0, 0);
+  // CHECK: @llvm.hexagon.M4.vpmpyh
+  __builtin_HEXAGON_M4_vpmpyh_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vpmpyh.acc
+  __builtin_HEXAGON_M4_vrmpyeh_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.acc.s0
+  __builtin_HEXAGON_M4_vrmpyeh_acc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.acc.s1
+  __builtin_HEXAGON_M4_vrmpyeh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.s0
+  __builtin_HEXAGON_M4_vrmpyeh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.s1
+  __builtin_HEXAGON_M4_vrmpyoh_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.acc.s0
+  __builtin_HEXAGON_M4_vrmpyoh_acc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.acc.s1
+  __builtin_HEXAGON_M4_vrmpyoh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.s0
+  __builtin_HEXAGON_M4_vrmpyoh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.s1
+  __builtin_HEXAGON_M4_xor_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.and
+  __builtin_HEXAGON_M4_xor_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.andn
+  __builtin_HEXAGON_M4_xor_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.or
+  __builtin_HEXAGON_M4_xor_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.xacc
+  __builtin_HEXAGON_M5_vdmacbsu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vdmacbsu
+  __builtin_HEXAGON_M5_vdmpybsu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vdmpybsu
+  __builtin_HEXAGON_M5_vmacbsu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vmacbsu
+  __builtin_HEXAGON_M5_vmacbuu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vmacbuu
+  __builtin_HEXAGON_M5_vmpybsu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vmpybsu
+  __builtin_HEXAGON_M5_vmpybuu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vmpybuu
+  __builtin_HEXAGON_M5_vrmacbsu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmacbsu
+  __builtin_HEXAGON_M5_vrmacbuu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmacbuu
+  __builtin_HEXAGON_M5_vrmpybsu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmpybsu
+  __builtin_HEXAGON_M5_vrmpybuu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmpybuu
+  __builtin_HEXAGON_S2_addasl_rrri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.addasl.rrri
+  __builtin_HEXAGON_S2_asl_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p
+  __builtin_HEXAGON_S2_asl_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.acc
+  __builtin_HEXAGON_S2_asl_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.and
+  __builtin_HEXAGON_S2_asl_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.nac
+  __builtin_HEXAGON_S2_asl_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.or
+  __builtin_HEXAGON_S2_asl_i_p_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.xacc
+  __builtin_HEXAGON_S2_asl_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r
+  __builtin_HEXAGON_S2_asl_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.acc
+  __builtin_HEXAGON_S2_asl_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.and
+  __builtin_HEXAGON_S2_asl_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.nac
+  __builtin_HEXAGON_S2_asl_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.or
+  __builtin_HEXAGON_S2_asl_i_r_sat(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.sat
+  __builtin_HEXAGON_S2_asl_i_r_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.xacc
+  __builtin_HEXAGON_S2_asl_i_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.vh
+  __builtin_HEXAGON_S2_asl_i_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.vw
+  __builtin_HEXAGON_S2_asl_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p
+  __builtin_HEXAGON_S2_asl_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.acc
+  __builtin_HEXAGON_S2_asl_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.and
+  __builtin_HEXAGON_S2_asl_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.nac
+  __builtin_HEXAGON_S2_asl_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.or
+  __builtin_HEXAGON_S2_asl_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.xor
+  __builtin_HEXAGON_S2_asl_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r
+  __builtin_HEXAGON_S2_asl_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.acc
+  __builtin_HEXAGON_S2_asl_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.and
+  __builtin_HEXAGON_S2_asl_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.nac
+  __builtin_HEXAGON_S2_asl_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.or
+  __builtin_HEXAGON_S2_asl_r_r_sat(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.sat
+  __builtin_HEXAGON_S2_asl_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.vh
+  __builtin_HEXAGON_S2_asl_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.vw
+  __builtin_HEXAGON_S2_asr_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p
+  __builtin_HEXAGON_S2_asr_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.acc
+  __builtin_HEXAGON_S2_asr_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.and
+  __builtin_HEXAGON_S2_asr_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.nac
+  __builtin_HEXAGON_S2_asr_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.or
+  __builtin_HEXAGON_S2_asr_i_p_rnd(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.rnd
+  __builtin_HEXAGON_S2_asr_i_p_rnd_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.rnd.goodsyntax
+  __builtin_HEXAGON_S2_asr_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r
+  __builtin_HEXAGON_S2_asr_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.acc
+  __builtin_HEXAGON_S2_asr_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.and
+  __builtin_HEXAGON_S2_asr_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.nac
+  __builtin_HEXAGON_S2_asr_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.or
+  __builtin_HEXAGON_S2_asr_i_r_rnd(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.rnd
+  __builtin_HEXAGON_S2_asr_i_r_rnd_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.rnd.goodsyntax
+  __builtin_HEXAGON_S2_asr_i_svw_trun(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.svw.trun
+  __builtin_HEXAGON_S2_asr_i_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.vh
+  __builtin_HEXAGON_S2_asr_i_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.vw
+  __builtin_HEXAGON_S2_asr_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p
+  __builtin_HEXAGON_S2_asr_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.acc
+  __builtin_HEXAGON_S2_asr_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.and
+  __builtin_HEXAGON_S2_asr_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.nac
+  __builtin_HEXAGON_S2_asr_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.or
+  __builtin_HEXAGON_S2_asr_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.xor
+  __builtin_HEXAGON_S2_asr_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r
+  __builtin_HEXAGON_S2_asr_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.acc
+  __builtin_HEXAGON_S2_asr_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.and
+  __builtin_HEXAGON_S2_asr_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.nac
+  __builtin_HEXAGON_S2_asr_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.or
+  __builtin_HEXAGON_S2_asr_r_r_sat(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.sat
+  __builtin_HEXAGON_S2_asr_r_svw_trun(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.svw.trun
+  __builtin_HEXAGON_S2_asr_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.vh
+  __builtin_HEXAGON_S2_asr_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.vw
+  __builtin_HEXAGON_S2_brev(0);
+  // CHECK: @llvm.hexagon.S2.brev
+  __builtin_HEXAGON_S2_brevp(0);
+  // CHECK: @llvm.hexagon.S2.brevp
+  __builtin_HEXAGON_S2_cabacencbin(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.cabacencbin
+  __builtin_HEXAGON_S2_cl0(0);
+  // CHECK: @llvm.hexagon.S2.cl0
+  __builtin_HEXAGON_S2_cl0p(0);
+  // CHECK: @llvm.hexagon.S2.cl0p
+  __builtin_HEXAGON_S2_cl1(0);
+  // CHECK: @llvm.hexagon.S2.cl1
+  __builtin_HEXAGON_S2_cl1p(0);
+  // CHECK: @llvm.hexagon.S2.cl1p
+  __builtin_HEXAGON_S2_clb(0);
+  // CHECK: @llvm.hexagon.S2.clb
+  __builtin_HEXAGON_S2_clbnorm(0);
+  // CHECK: @llvm.hexagon.S2.clbnorm
+  __builtin_HEXAGON_S2_clbp(0);
+  // CHECK: @llvm.hexagon.S2.clbp
+  __builtin_HEXAGON_S2_clrbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.clrbit.i
+  __builtin_HEXAGON_S2_clrbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.clrbit.r
+  __builtin_HEXAGON_S2_ct0(0);
+  // CHECK: @llvm.hexagon.S2.ct0
+  __builtin_HEXAGON_S2_ct0p(0);
+  // CHECK: @llvm.hexagon.S2.ct0p
+  __builtin_HEXAGON_S2_ct1(0);
+  // CHECK: @llvm.hexagon.S2.ct1
+  __builtin_HEXAGON_S2_ct1p(0);
+  // CHECK: @llvm.hexagon.S2.ct1p
+  __builtin_HEXAGON_S2_deinterleave(0);
+  // CHECK: @llvm.hexagon.S2.deinterleave
+  __builtin_HEXAGON_S2_extractu(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.extractu
+  __builtin_HEXAGON_S2_extractup(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.extractup
+  __builtin_HEXAGON_S2_extractup_rp(0, 0);
+  // CHECK: @llvm.hexagon.S2.extractup.rp
+  __builtin_HEXAGON_S2_extractu_rp(0, 0);
+  // CHECK: @llvm.hexagon.S2.extractu.rp
+  __builtin_HEXAGON_S2_insert(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insert
+  __builtin_HEXAGON_S2_insertp(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insertp
+  __builtin_HEXAGON_S2_insertp_rp(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insertp.rp
+  __builtin_HEXAGON_S2_insert_rp(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insert.rp
+  __builtin_HEXAGON_S2_interleave(0);
+  // CHECK: @llvm.hexagon.S2.interleave
+  __builtin_HEXAGON_S2_lfsp(0, 0);
+  // CHECK: @llvm.hexagon.S2.lfsp
+  __builtin_HEXAGON_S2_lsl_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p
+  __builtin_HEXAGON_S2_lsl_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.acc
+  __builtin_HEXAGON_S2_lsl_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.and
+  __builtin_HEXAGON_S2_lsl_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.nac
+  __builtin_HEXAGON_S2_lsl_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.or
+  __builtin_HEXAGON_S2_lsl_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.xor
+  __builtin_HEXAGON_S2_lsl_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r
+  __builtin_HEXAGON_S2_lsl_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.acc
+  __builtin_HEXAGON_S2_lsl_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.and
+  __builtin_HEXAGON_S2_lsl_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.nac
+  __builtin_HEXAGON_S2_lsl_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.or
+  __builtin_HEXAGON_S2_lsl_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.vh
+  __builtin_HEXAGON_S2_lsl_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.vw
+  __builtin_HEXAGON_S2_lsr_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p
+  __builtin_HEXAGON_S2_lsr_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.acc
+  __builtin_HEXAGON_S2_lsr_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.and
+  __builtin_HEXAGON_S2_lsr_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.nac
+  __builtin_HEXAGON_S2_lsr_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.or
+  __builtin_HEXAGON_S2_lsr_i_p_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.xacc
+  __builtin_HEXAGON_S2_lsr_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r
+  __builtin_HEXAGON_S2_lsr_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.acc
+  __builtin_HEXAGON_S2_lsr_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.and
+  __builtin_HEXAGON_S2_lsr_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.nac
+  __builtin_HEXAGON_S2_lsr_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.or
+  __builtin_HEXAGON_S2_lsr_i_r_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.xacc
+  __builtin_HEXAGON_S2_lsr_i_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.vh
+  __builtin_HEXAGON_S2_lsr_i_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.vw
+  __builtin_HEXAGON_S2_lsr_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p
+  __builtin_HEXAGON_S2_lsr_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.acc
+  __builtin_HEXAGON_S2_lsr_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.and
+  __builtin_HEXAGON_S2_lsr_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.nac
+  __builtin_HEXAGON_S2_lsr_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.or
+  __builtin_HEXAGON_S2_lsr_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.xor
+  __builtin_HEXAGON_S2_lsr_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r
+  __builtin_HEXAGON_S2_lsr_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.acc
+  __builtin_HEXAGON_S2_lsr_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.and
+  __builtin_HEXAGON_S2_lsr_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.nac
+  __builtin_HEXAGON_S2_lsr_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.or
+  __builtin_HEXAGON_S2_lsr_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.vh
+  __builtin_HEXAGON_S2_lsr_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.vw
+  __builtin_HEXAGON_S2_packhl(0, 0);
+  // CHECK: @llvm.hexagon.S2.packhl
+  __builtin_HEXAGON_S2_parityp(0, 0);
+  // CHECK: @llvm.hexagon.S2.parityp
+  __builtin_HEXAGON_S2_setbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.setbit.i
+  __builtin_HEXAGON_S2_setbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.setbit.r
+  __builtin_HEXAGON_S2_shuffeb(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffeb
+  __builtin_HEXAGON_S2_shuffeh(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffeh
+  __builtin_HEXAGON_S2_shuffob(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffob
+  __builtin_HEXAGON_S2_shuffoh(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffoh
+  __builtin_HEXAGON_S2_svsathb(0);
+  // CHECK: @llvm.hexagon.S2.svsathb
+  __builtin_HEXAGON_S2_svsathub(0);
+  // CHECK: @llvm.hexagon.S2.svsathub
+  __builtin_HEXAGON_S2_tableidxb_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxb.goodsyntax
+  __builtin_HEXAGON_S2_tableidxd_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxd.goodsyntax
+  __builtin_HEXAGON_S2_tableidxh_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxh.goodsyntax
+  __builtin_HEXAGON_S2_tableidxw_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxw.goodsyntax
+  __builtin_HEXAGON_S2_togglebit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.togglebit.i
+  __builtin_HEXAGON_S2_togglebit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.togglebit.r
+  __builtin_HEXAGON_S2_tstbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.tstbit.i
+  __builtin_HEXAGON_S2_tstbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.tstbit.r
+  __builtin_HEXAGON_S2_valignib(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.valignib
+  __builtin_HEXAGON_S2_valignrb(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.valignrb
+  __builtin_HEXAGON_S2_vcnegh(0, 0);
+  // CHECK: @llvm.hexagon.S2.vcnegh
+  __builtin_HEXAGON_S2_vcrotate(0, 0);
+  // CHECK: @llvm.hexagon.S2.vcrotate
+  __builtin_HEXAGON_S2_vrcnegh(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.vrcnegh
+  __builtin_HEXAGON_S2_vrndpackwh(0);
+  // CHECK: @llvm.hexagon.S2.vrndpackwh
+  __builtin_HEXAGON_S2_vrndpackwhs(0);
+  // CHECK: @llvm.hexagon.S2.vrndpackwhs
+  __builtin_HEXAGON_S2_vsathb(0);
+  // CHECK: @llvm.hexagon.S2.vsathb
+  __builtin_HEXAGON_S2_vsathb_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsathb.nopack
+  __builtin_HEXAGON_S2_vsathub(0);
+  // CHECK: @llvm.hexagon.S2.vsathub
+  __builtin_HEXAGON_S2_vsathub_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsathub.nopack
+  __builtin_HEXAGON_S2_vsatwh(0);
+  // CHECK: @llvm.hexagon.S2.vsatwh
+  __builtin_HEXAGON_S2_vsatwh_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsatwh.nopack
+  __builtin_HEXAGON_S2_vsatwuh(0);
+  // CHECK: @llvm.hexagon.S2.vsatwuh
+  __builtin_HEXAGON_S2_vsatwuh_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsatwuh.nopack
+  __builtin_HEXAGON_S2_vsplatrb(0);
+  // CHECK: @llvm.hexagon.S2.vsplatrb
+  __builtin_HEXAGON_S2_vsplatrh(0);
+  // CHECK: @llvm.hexagon.S2.vsplatrh
+  __builtin_HEXAGON_S2_vspliceib(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.vspliceib
+  __builtin_HEXAGON_S2_vsplicerb(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.vsplicerb
+  __builtin_HEXAGON_S2_vsxtbh(0);
+  // CHECK: @llvm.hexagon.S2.vsxtbh
+  __builtin_HEXAGON_S2_vsxthw(0);
+  // CHECK: @llvm.hexagon.S2.vsxthw
+  __builtin_HEXAGON_S2_vtrunehb(0);
+  // CHECK: @llvm.hexagon.S2.vtrunehb
+  __builtin_HEXAGON_S2_vtrunewh(0, 0);
+  // CHECK: @llvm.hexagon.S2.vtrunewh
+  __builtin_HEXAGON_S2_vtrunohb(0);
+  // CHECK: @llvm.hexagon.S2.vtrunohb
+  __builtin_HEXAGON_S2_vtrunowh(0, 0);
+  // CHECK: @llvm.hexagon.S2.vtrunowh
+  __builtin_HEXAGON_S2_vzxtbh(0);
+  // CHECK: @llvm.hexagon.S2.vzxtbh
+  __builtin_HEXAGON_S2_vzxthw(0);
+  // CHECK: @llvm.hexagon.S2.vzxthw
+  __builtin_HEXAGON_S4_addaddi(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.addaddi
+  __builtin_HEXAGON_S4_addi_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.addi.asl.ri
+  __builtin_HEXAGON_S4_addi_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.addi.lsr.ri
+  __builtin_HEXAGON_S4_andi_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.andi.asl.ri
+  __builtin_HEXAGON_S4_andi_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.andi.lsr.ri
+  __builtin_HEXAGON_S4_clbaddi(0, 0);
+  // CHECK: @llvm.hexagon.S4.clbaddi
+  __builtin_HEXAGON_S4_clbpaddi(0, 0);
+  // CHECK: @llvm.hexagon.S4.clbpaddi
+  __builtin_HEXAGON_S4_clbpnorm(0);
+  // CHECK: @llvm.hexagon.S4.clbpnorm
+  __builtin_HEXAGON_S4_extract(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.extract
+  __builtin_HEXAGON_S4_extractp(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.extractp
+  __builtin_HEXAGON_S4_extractp_rp(0, 0);
+  // CHECK: @llvm.hexagon.S4.extractp.rp
+  __builtin_HEXAGON_S4_extract_rp(0, 0);
+  // CHECK: @llvm.hexagon.S4.extract.rp
+  __builtin_HEXAGON_S4_lsli(0, 0);
+  // CHECK: @llvm.hexagon.S4.lsli
+  __builtin_HEXAGON_S4_ntstbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S4.ntstbit.i
+  __builtin_HEXAGON_S4_ntstbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S4.ntstbit.r
+  __builtin_HEXAGON_S4_or_andi(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.or.andi
+  __builtin_HEXAGON_S4_or_andix(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.or.andix
+  __builtin_HEXAGON_S4_ori_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.ori.asl.ri
+  __builtin_HEXAGON_S4_ori_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.ori.lsr.ri
+  __builtin_HEXAGON_S4_or_ori(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.or.ori
+  __builtin_HEXAGON_S4_parity(0, 0);
+  // CHECK: @llvm.hexagon.S4.parity
+  __builtin_HEXAGON_S4_subaddi(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.subaddi
+  __builtin_HEXAGON_S4_subi_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.subi.asl.ri
+  __builtin_HEXAGON_S4_subi_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.subi.lsr.ri
+  __builtin_HEXAGON_S4_vrcrotate(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.vrcrotate
+  __builtin_HEXAGON_S4_vrcrotate_acc(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.vrcrotate.acc
+  __builtin_HEXAGON_S4_vxaddsubh(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxaddsubh
+  __builtin_HEXAGON_S4_vxaddsubhr(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxaddsubhr
+  __builtin_HEXAGON_S4_vxaddsubw(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxaddsubw
+  __builtin_HEXAGON_S4_vxsubaddh(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxsubaddh
+  __builtin_HEXAGON_S4_vxsubaddhr(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxsubaddhr
+  __builtin_HEXAGON_S4_vxsubaddw(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxsubaddw
+  __builtin_HEXAGON_S5_asrhub_rnd_sat_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S5.asrhub.rnd.sat.goodsyntax
+  __builtin_HEXAGON_S5_asrhub_sat(0, 0);
+  // CHECK: @llvm.hexagon.S5.asrhub.sat
+  __builtin_HEXAGON_S5_popcountp(0);
+  // CHECK: @llvm.hexagon.S5.popcountp
+  __builtin_HEXAGON_S5_vasrhrnd_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S5.vasrhrnd.goodsyntax
+  __builtin_HEXAGON_S6_rol_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p
+  __builtin_HEXAGON_S6_rol_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.acc
+  __builtin_HEXAGON_S6_rol_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.and
+  __builtin_HEXAGON_S6_rol_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.nac
+  __builtin_HEXAGON_S6_rol_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.or
+  __builtin_HEXAGON_S6_rol_i_p_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.xacc
+  __builtin_HEXAGON_S6_rol_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r
+  __builtin_HEXAGON_S6_rol_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.acc
+  __builtin_HEXAGON_S6_rol_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.and
+  __builtin_HEXAGON_S6_rol_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.nac
+  __builtin_HEXAGON_S6_rol_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.or
+  __builtin_HEXAGON_S6_rol_i_r_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.xacc
+  __builtin_HEXAGON_V6_extractw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.extractw.128B
+  __builtin_HEXAGON_V6_extractw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.extractw
+  __builtin_HEXAGON_V6_hi_128B(v64);
+  // CHECK: @llvm.hexagon.V6.hi.128B
+  __builtin_HEXAGON_V6_hi(v32);
+  // CHECK: @llvm.hexagon.V6.hi
+  __builtin_HEXAGON_V6_lo_128B(v64);
+  // CHECK: @llvm.hexagon.V6.lo.128B
+  __builtin_HEXAGON_V6_lo(v32);
+  // CHECK: @llvm.hexagon.V6.lo
+  __builtin_HEXAGON_V6_lvsplatw(0);
+  // CHECK: @llvm.hexagon.V6.lvsplatw
+  __builtin_HEXAGON_V6_lvsplatw_128B(0);
+  // CHECK: @llvm.hexagon.V6.lvsplatw.128B
+  __builtin_HEXAGON_V6_pred_and_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.and.128B
+  __builtin_HEXAGON_V6_pred_and_n_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.and.n.128B
+  __builtin_HEXAGON_V6_pred_and_n(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.and.n
+  __builtin_HEXAGON_V6_pred_and(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.and
+  __builtin_HEXAGON_V6_pred_not_128B(v32);
+  // CHECK: @llvm.hexagon.V6.pred.not.128B
+  __builtin_HEXAGON_V6_pred_not(v16);
+  // CHECK: @llvm.hexagon.V6.pred.not
+  __builtin_HEXAGON_V6_pred_or_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.or.128B
+  __builtin_HEXAGON_V6_pred_or_n_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.or.n.128B
+  __builtin_HEXAGON_V6_pred_or_n(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.or.n
+  __builtin_HEXAGON_V6_pred_or(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.or
+  __builtin_HEXAGON_V6_pred_scalar2(0);
+  // CHECK: @llvm.hexagon.V6.pred.scalar2
+  __builtin_HEXAGON_V6_pred_scalar2_128B(0);
+  // CHECK: @llvm.hexagon.V6.pred.scalar2.128B
+  __builtin_HEXAGON_V6_pred_xor_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.xor.128B
+  __builtin_HEXAGON_V6_pred_xor(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.xor
+  __builtin_HEXAGON_V6_vabsdiffh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffh.128B
+  __builtin_HEXAGON_V6_vabsdiffh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffh
+  __builtin_HEXAGON_V6_vabsdiffub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffub.128B
+  __builtin_HEXAGON_V6_vabsdiffub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffub
+  __builtin_HEXAGON_V6_vabsdiffuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffuh.128B
+  __builtin_HEXAGON_V6_vabsdiffuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffuh
+  __builtin_HEXAGON_V6_vabsdiffw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffw.128B
+  __builtin_HEXAGON_V6_vabsdiffw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffw
+  __builtin_HEXAGON_V6_vabsh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsh.128B
+  __builtin_HEXAGON_V6_vabsh_sat_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsh.sat.128B
+  __builtin_HEXAGON_V6_vabsh_sat(v16);
+  // CHECK: @llvm.hexagon.V6.vabsh.sat
+  __builtin_HEXAGON_V6_vabsh(v16);
+  // CHECK: @llvm.hexagon.V6.vabsh
+  __builtin_HEXAGON_V6_vabsw_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsw.128B
+  __builtin_HEXAGON_V6_vabsw_sat_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsw.sat.128B
+  __builtin_HEXAGON_V6_vabsw_sat(v16);
+  // CHECK: @llvm.hexagon.V6.vabsw.sat
+  __builtin_HEXAGON_V6_vabsw(v16);
+  // CHECK: @llvm.hexagon.V6.vabsw
+  __builtin_HEXAGON_V6_vaddb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddb.128B
+  __builtin_HEXAGON_V6_vaddb_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddb.dv.128B
+  __builtin_HEXAGON_V6_vaddb_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddb.dv
+  __builtin_HEXAGON_V6_vaddbnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddbnq.128B
+  __builtin_HEXAGON_V6_vaddbnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddbnq
+  __builtin_HEXAGON_V6_vaddbq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddbq.128B
+  __builtin_HEXAGON_V6_vaddbq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddbq
+  __builtin_HEXAGON_V6_vaddb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddb
+  __builtin_HEXAGON_V6_vaddh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddh.128B
+  __builtin_HEXAGON_V6_vaddh_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddh.dv.128B
+  __builtin_HEXAGON_V6_vaddh_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddh.dv
+  __builtin_HEXAGON_V6_vaddhnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhnq.128B
+  __builtin_HEXAGON_V6_vaddhnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhnq
+  __builtin_HEXAGON_V6_vaddhq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhq.128B
+  __builtin_HEXAGON_V6_vaddhq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhq
+  __builtin_HEXAGON_V6_vaddhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhsat.128B
+  __builtin_HEXAGON_V6_vaddhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddhsat.dv.128B
+  __builtin_HEXAGON_V6_vaddhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhsat.dv
+  __builtin_HEXAGON_V6_vaddhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhsat
+  __builtin_HEXAGON_V6_vaddh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddh
+  __builtin_HEXAGON_V6_vaddhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhw.128B
+  __builtin_HEXAGON_V6_vaddhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhw
+  __builtin_HEXAGON_V6_vaddubh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddubh.128B
+  __builtin_HEXAGON_V6_vaddubh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddubh
+  __builtin_HEXAGON_V6_vaddubsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddubsat.128B
+  __builtin_HEXAGON_V6_vaddubsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddubsat.dv.128B
+  __builtin_HEXAGON_V6_vaddubsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddubsat.dv
+  __builtin_HEXAGON_V6_vaddubsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddubsat
+  __builtin_HEXAGON_V6_vadduhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vadduhsat.128B
+  __builtin_HEXAGON_V6_vadduhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vadduhsat.dv.128B
+  __builtin_HEXAGON_V6_vadduhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vadduhsat.dv
+  __builtin_HEXAGON_V6_vadduhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vadduhsat
+  __builtin_HEXAGON_V6_vadduhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vadduhw.128B
+  __builtin_HEXAGON_V6_vadduhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vadduhw
+  __builtin_HEXAGON_V6_vaddw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddw.128B
+  __builtin_HEXAGON_V6_vaddw_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddw.dv.128B
+  __builtin_HEXAGON_V6_vaddw_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddw.dv
+  __builtin_HEXAGON_V6_vaddwnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwnq.128B
+  __builtin_HEXAGON_V6_vaddwnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddwnq
+  __builtin_HEXAGON_V6_vaddwq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwq.128B
+  __builtin_HEXAGON_V6_vaddwq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddwq
+  __builtin_HEXAGON_V6_vaddwsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwsat.128B
+  __builtin_HEXAGON_V6_vaddwsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddwsat.dv.128B
+  __builtin_HEXAGON_V6_vaddwsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwsat.dv
+  __builtin_HEXAGON_V6_vaddwsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddwsat
+  __builtin_HEXAGON_V6_vaddw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddw
+  __builtin_HEXAGON_V6_valignb_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.valignb.128B
+  __builtin_HEXAGON_V6_valignbi_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.valignbi.128B
+  __builtin_HEXAGON_V6_valignbi(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.valignbi
+  __builtin_HEXAGON_V6_valignb(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.valignb
+  __builtin_HEXAGON_V6_vand_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vand.128B
+  __builtin_HEXAGON_V6_vandqrt_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt.128B
+  __builtin_HEXAGON_V6_vandqrt_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt.acc.128B
+  __builtin_HEXAGON_V6_vandqrt_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt.acc
+  __builtin_HEXAGON_V6_vandqrt(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt
+  __builtin_HEXAGON_V6_vand(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vand
+  __builtin_HEXAGON_V6_vandvrt_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt.128B
+  __builtin_HEXAGON_V6_vandvrt_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt.acc.128B
+  __builtin_HEXAGON_V6_vandvrt_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt.acc
+  __builtin_HEXAGON_V6_vandvrt(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt
+  __builtin_HEXAGON_V6_vaslh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vaslh.128B
+  __builtin_HEXAGON_V6_vaslhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaslhv.128B
+  __builtin_HEXAGON_V6_vaslh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vaslh
+  __builtin_HEXAGON_V6_vaslhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaslhv
+  __builtin_HEXAGON_V6_vaslw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw.128B
+  __builtin_HEXAGON_V6_vaslw_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw.acc.128B
+  __builtin_HEXAGON_V6_vaslw_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw.acc
+  __builtin_HEXAGON_V6_vaslwv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaslwv.128B
+  __builtin_HEXAGON_V6_vaslw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw
+  __builtin_HEXAGON_V6_vaslwv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaslwv
+  __builtin_HEXAGON_V6_vasrh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrh.128B
+  __builtin_HEXAGON_V6_vasrhbrndsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhbrndsat.128B
+  __builtin_HEXAGON_V6_vasrhbrndsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhbrndsat
+  __builtin_HEXAGON_V6_vasrhubrndsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubrndsat.128B
+  __builtin_HEXAGON_V6_vasrhubrndsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubrndsat
+  __builtin_HEXAGON_V6_vasrhubsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubsat.128B
+  __builtin_HEXAGON_V6_vasrhubsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubsat
+  __builtin_HEXAGON_V6_vasrhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vasrhv.128B
+  __builtin_HEXAGON_V6_vasrh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrh
+  __builtin_HEXAGON_V6_vasrhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vasrhv
+  __builtin_HEXAGON_V6_vasrw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw.128B
+  __builtin_HEXAGON_V6_vasrw_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw.acc.128B
+  __builtin_HEXAGON_V6_vasrw_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw.acc
+  __builtin_HEXAGON_V6_vasrwh_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwh.128B
+  __builtin_HEXAGON_V6_vasrwhrndsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhrndsat.128B
+  __builtin_HEXAGON_V6_vasrwhrndsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhrndsat
+  __builtin_HEXAGON_V6_vasrwhsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhsat.128B
+  __builtin_HEXAGON_V6_vasrwhsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhsat
+  __builtin_HEXAGON_V6_vasrwh(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwh
+  __builtin_HEXAGON_V6_vasrwuhsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwuhsat.128B
+  __builtin_HEXAGON_V6_vasrwuhsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwuhsat
+  __builtin_HEXAGON_V6_vasrwv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vasrwv.128B
+  __builtin_HEXAGON_V6_vasrw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw
+  __builtin_HEXAGON_V6_vasrwv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vasrwv
+  __builtin_HEXAGON_V6_vassign_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vassign.128B
+  __builtin_HEXAGON_V6_vassignp_128B(v64);
+  // CHECK: @llvm.hexagon.V6.vassignp.128B
+  __builtin_HEXAGON_V6_vassignp(v32);
+  // CHECK: @llvm.hexagon.V6.vassignp
+  __builtin_HEXAGON_V6_vassign(v16);
+  // CHECK: @llvm.hexagon.V6.vassign
+  __builtin_HEXAGON_V6_vavgh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgh.128B
+  __builtin_HEXAGON_V6_vavghrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavghrnd.128B
+  __builtin_HEXAGON_V6_vavghrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavghrnd
+  __builtin_HEXAGON_V6_vavgh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgh
+  __builtin_HEXAGON_V6_vavgub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgub.128B
+  __builtin_HEXAGON_V6_vavgubrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgubrnd.128B
+  __builtin_HEXAGON_V6_vavgubrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgubrnd
+  __builtin_HEXAGON_V6_vavgub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgub
+  __builtin_HEXAGON_V6_vavguh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavguh.128B
+  __builtin_HEXAGON_V6_vavguhrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavguhrnd.128B
+  __builtin_HEXAGON_V6_vavguhrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavguhrnd
+  __builtin_HEXAGON_V6_vavguh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavguh
+  __builtin_HEXAGON_V6_vavgw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgw.128B
+  __builtin_HEXAGON_V6_vavgwrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgwrnd.128B
+  __builtin_HEXAGON_V6_vavgwrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgwrnd
+  __builtin_HEXAGON_V6_vavgw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgw
+  __builtin_HEXAGON_V6_vcl0h_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vcl0h.128B
+  __builtin_HEXAGON_V6_vcl0h(v16);
+  // CHECK: @llvm.hexagon.V6.vcl0h
+  __builtin_HEXAGON_V6_vcl0w_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vcl0w.128B
+  __builtin_HEXAGON_V6_vcl0w(v16);
+  // CHECK: @llvm.hexagon.V6.vcl0w
+  __builtin_HEXAGON_V6_vcombine_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vcombine.128B
+  __builtin_HEXAGON_V6_vcombine(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vcombine
+  __builtin_HEXAGON_V6_vd0_128B();
+  // CHECK: @llvm.hexagon.V6.vd0.128B
+  __builtin_HEXAGON_V6_vd0();
+  // CHECK: @llvm.hexagon.V6.vd0
+  __builtin_HEXAGON_V6_vdealb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vdealb.128B
+  __builtin_HEXAGON_V6_vdealb4w_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdealb4w.128B
+  __builtin_HEXAGON_V6_vdealb4w(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdealb4w
+  __builtin_HEXAGON_V6_vdealb(v16);
+  // CHECK: @llvm.hexagon.V6.vdealb
+  __builtin_HEXAGON_V6_vdealh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vdealh.128B
+  __builtin_HEXAGON_V6_vdealh(v16);
+  // CHECK: @llvm.hexagon.V6.vdealh
+  __builtin_HEXAGON_V6_vdealvdd_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdealvdd.128B
+  __builtin_HEXAGON_V6_vdealvdd(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdealvdd
+  __builtin_HEXAGON_V6_vdelta_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdelta.128B
+  __builtin_HEXAGON_V6_vdelta(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdelta
+  __builtin_HEXAGON_V6_vdmpybus_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.128B
+  __builtin_HEXAGON_V6_vdmpybus_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.acc.128B
+  __builtin_HEXAGON_V6_vdmpybus_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.acc
+  __builtin_HEXAGON_V6_vdmpybus_dv_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv.128B
+  __builtin_HEXAGON_V6_vdmpybus_dv_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc.128B
+  __builtin_HEXAGON_V6_vdmpybus_dv_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc
+  __builtin_HEXAGON_V6_vdmpybus_dv(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv
+  __builtin_HEXAGON_V6_vdmpybus(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus
+  __builtin_HEXAGON_V6_vdmpyhb_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.128B
+  __builtin_HEXAGON_V6_vdmpyhb_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhb_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.acc
+  __builtin_HEXAGON_V6_vdmpyhb_dv_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.128B
+  __builtin_HEXAGON_V6_vdmpyhb_dv_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhb_dv_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc
+  __builtin_HEXAGON_V6_vdmpyhb_dv(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv
+  __builtin_HEXAGON_V6_vdmpyhb(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb
+  __builtin_HEXAGON_V6_vdmpyhisat_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat.128B
+  __builtin_HEXAGON_V6_vdmpyhisat_acc_128B(v32, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhisat_acc(v16, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc
+  __builtin_HEXAGON_V6_vdmpyhisat(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat
+  __builtin_HEXAGON_V6_vdmpyhsat_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat.128B
+  __builtin_HEXAGON_V6_vdmpyhsat_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhsat_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc
+  __builtin_HEXAGON_V6_vdmpyhsat(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat
+  __builtin_HEXAGON_V6_vdmpyhsuisat_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.128B
+  __builtin_HEXAGON_V6_vdmpyhsuisat_acc_128B(v32, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhsuisat_acc(v16, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc
+  __builtin_HEXAGON_V6_vdmpyhsuisat(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat
+  __builtin_HEXAGON_V6_vdmpyhsusat_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat.128B
+  __builtin_HEXAGON_V6_vdmpyhsusat_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhsusat_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc
+  __builtin_HEXAGON_V6_vdmpyhsusat(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat
+  __builtin_HEXAGON_V6_vdmpyhvsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat.128B
+  __builtin_HEXAGON_V6_vdmpyhvsat_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhvsat_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc
+  __builtin_HEXAGON_V6_vdmpyhvsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat
+  __builtin_HEXAGON_V6_vdsaduh_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh.128B
+  __builtin_HEXAGON_V6_vdsaduh_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh.acc.128B
+  __builtin_HEXAGON_V6_vdsaduh_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh.acc
+  __builtin_HEXAGON_V6_vdsaduh(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh
+  __builtin_HEXAGON_V6_veqb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.128B
+  __builtin_HEXAGON_V6_veqb_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.and.128B
+  __builtin_HEXAGON_V6_veqb_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb.and
+  __builtin_HEXAGON_V6_veqb_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.or.128B
+  __builtin_HEXAGON_V6_veqb_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb.or
+  __builtin_HEXAGON_V6_veqb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb
+  __builtin_HEXAGON_V6_veqb_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.xor.128B
+  __builtin_HEXAGON_V6_veqb_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb.xor
+  __builtin_HEXAGON_V6_veqh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.128B
+  __builtin_HEXAGON_V6_veqh_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.and.128B
+  __builtin_HEXAGON_V6_veqh_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh.and
+  __builtin_HEXAGON_V6_veqh_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.or.128B
+  __builtin_HEXAGON_V6_veqh_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh.or
+  __builtin_HEXAGON_V6_veqh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh
+  __builtin_HEXAGON_V6_veqh_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.xor.128B
+  __builtin_HEXAGON_V6_veqh_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh.xor
+  __builtin_HEXAGON_V6_veqw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.128B
+  __builtin_HEXAGON_V6_veqw_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.and.128B
+  __builtin_HEXAGON_V6_veqw_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw.and
+  __builtin_HEXAGON_V6_veqw_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.or.128B
+  __builtin_HEXAGON_V6_veqw_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw.or
+  __builtin_HEXAGON_V6_veqw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw
+  __builtin_HEXAGON_V6_veqw_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.xor.128B
+  __builtin_HEXAGON_V6_veqw_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw.xor
+  __builtin_HEXAGON_V6_vgtb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.128B
+  __builtin_HEXAGON_V6_vgtb_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.and.128B
+  __builtin_HEXAGON_V6_vgtb_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb.and
+  __builtin_HEXAGON_V6_vgtb_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.or.128B
+  __builtin_HEXAGON_V6_vgtb_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb.or
+  __builtin_HEXAGON_V6_vgtb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb
+  __builtin_HEXAGON_V6_vgtb_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.xor.128B
+  __builtin_HEXAGON_V6_vgtb_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb.xor
+  __builtin_HEXAGON_V6_vgth_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.128B
+  __builtin_HEXAGON_V6_vgth_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.and.128B
+  __builtin_HEXAGON_V6_vgth_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth.and
+  __builtin_HEXAGON_V6_vgth_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.or.128B
+  __builtin_HEXAGON_V6_vgth_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth.or
+  __builtin_HEXAGON_V6_vgth(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth
+  __builtin_HEXAGON_V6_vgth_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.xor.128B
+  __builtin_HEXAGON_V6_vgth_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth.xor
+  __builtin_HEXAGON_V6_vgtub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.128B
+  __builtin_HEXAGON_V6_vgtub_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.and.128B
+  __builtin_HEXAGON_V6_vgtub_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub.and
+  __builtin_HEXAGON_V6_vgtub_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.or.128B
+  __builtin_HEXAGON_V6_vgtub_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub.or
+  __builtin_HEXAGON_V6_vgtub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub
+  __builtin_HEXAGON_V6_vgtub_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.xor.128B
+  __builtin_HEXAGON_V6_vgtub_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub.xor
+  __builtin_HEXAGON_V6_vgtuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.128B
+  __builtin_HEXAGON_V6_vgtuh_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.and.128B
+  __builtin_HEXAGON_V6_vgtuh_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh.and
+  __builtin_HEXAGON_V6_vgtuh_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.or.128B
+  __builtin_HEXAGON_V6_vgtuh_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh.or
+  __builtin_HEXAGON_V6_vgtuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh
+  __builtin_HEXAGON_V6_vgtuh_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.xor.128B
+  __builtin_HEXAGON_V6_vgtuh_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh.xor
+  __builtin_HEXAGON_V6_vgtuw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.128B
+  __builtin_HEXAGON_V6_vgtuw_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.and.128B
+  __builtin_HEXAGON_V6_vgtuw_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw.and
+  __builtin_HEXAGON_V6_vgtuw_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.or.128B
+  __builtin_HEXAGON_V6_vgtuw_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw.or
+  __builtin_HEXAGON_V6_vgtuw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw
+  __builtin_HEXAGON_V6_vgtuw_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.xor.128B
+  __builtin_HEXAGON_V6_vgtuw_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw.xor
+  __builtin_HEXAGON_V6_vgtw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.128B
+  __builtin_HEXAGON_V6_vgtw_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.and.128B
+  __builtin_HEXAGON_V6_vgtw_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw.and
+  __builtin_HEXAGON_V6_vgtw_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.or.128B
+  __builtin_HEXAGON_V6_vgtw_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw.or
+  __builtin_HEXAGON_V6_vgtw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw
+  __builtin_HEXAGON_V6_vgtw_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.xor.128B
+  __builtin_HEXAGON_V6_vgtw_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw.xor
+  __builtin_HEXAGON_V6_vinsertwr_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vinsertwr.128B
+  __builtin_HEXAGON_V6_vinsertwr(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vinsertwr
+  __builtin_HEXAGON_V6_vlalignb_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignb.128B
+  __builtin_HEXAGON_V6_vlalignbi_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignbi.128B
+  __builtin_HEXAGON_V6_vlalignbi(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignbi
+  __builtin_HEXAGON_V6_vlalignb(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignb
+  __builtin_HEXAGON_V6_vlsrh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrh.128B
+  __builtin_HEXAGON_V6_vlsrhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vlsrhv.128B
+  __builtin_HEXAGON_V6_vlsrh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrh
+  __builtin_HEXAGON_V6_vlsrhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vlsrhv
+  __builtin_HEXAGON_V6_vlsrw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrw.128B
+  __builtin_HEXAGON_V6_vlsrwv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vlsrwv.128B
+  __builtin_HEXAGON_V6_vlsrw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrw
+  __builtin_HEXAGON_V6_vlsrwv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vlsrwv
+  __builtin_HEXAGON_V6_vlutb_128B(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.128B
+  __builtin_HEXAGON_V6_vlutb_acc_128B(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.acc.128B
+  __builtin_HEXAGON_V6_vlutb_acc(v16, v16, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.acc
+  __builtin_HEXAGON_V6_vlutb_dv_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv.128B
+  __builtin_HEXAGON_V6_vlutb_dv_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv.acc.128B
+  __builtin_HEXAGON_V6_vlutb_dv_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv.acc
+  __builtin_HEXAGON_V6_vlutb_dv(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv
+  __builtin_HEXAGON_V6_vlutb(v16, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb
+  __builtin_HEXAGON_V6_vlutvvb_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb.128B
+  __builtin_HEXAGON_V6_vlutvvb_oracc_128B(v32, v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb.oracc.128B
+  __builtin_HEXAGON_V6_vlutvvb_oracc(v16, v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb.oracc
+  __builtin_HEXAGON_V6_vlutvvb(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb
+  __builtin_HEXAGON_V6_vlutvwh_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh.128B
+  __builtin_HEXAGON_V6_vlutvwh_oracc_128B(v64, v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh.oracc.128B
+  __builtin_HEXAGON_V6_vlutvwh_oracc(v32, v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh.oracc
+  __builtin_HEXAGON_V6_vlutvwh(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh
+  __builtin_HEXAGON_V6_vmaxh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxh.128B
+  __builtin_HEXAGON_V6_vmaxh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxh
+  __builtin_HEXAGON_V6_vmaxub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxub.128B
+  __builtin_HEXAGON_V6_vmaxub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxub
+  __builtin_HEXAGON_V6_vmaxuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxuh.128B
+  __builtin_HEXAGON_V6_vmaxuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxuh
+  __builtin_HEXAGON_V6_vmaxw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxw.128B
+  __builtin_HEXAGON_V6_vmaxw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxw
+  __builtin_HEXAGON_V6_vminh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminh.128B
+  __builtin_HEXAGON_V6_vminh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminh
+  __builtin_HEXAGON_V6_vminub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminub.128B
+  __builtin_HEXAGON_V6_vminub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminub
+  __builtin_HEXAGON_V6_vminuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminuh.128B
+  __builtin_HEXAGON_V6_vminuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminuh
+  __builtin_HEXAGON_V6_vminw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminw.128B
+  __builtin_HEXAGON_V6_vminw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminw
+  __builtin_HEXAGON_V6_vmpabus_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus.128B
+  __builtin_HEXAGON_V6_vmpabus_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus.acc.128B
+  __builtin_HEXAGON_V6_vmpabus_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus.acc
+  __builtin_HEXAGON_V6_vmpabusv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vmpabusv.128B
+  __builtin_HEXAGON_V6_vmpabus(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus
+  __builtin_HEXAGON_V6_vmpabusv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpabusv
+  __builtin_HEXAGON_V6_vmpabuuv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vmpabuuv.128B
+  __builtin_HEXAGON_V6_vmpabuuv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpabuuv
+  __builtin_HEXAGON_V6_vmpahb_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb.128B
+  __builtin_HEXAGON_V6_vmpahb_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb.acc.128B
+  __builtin_HEXAGON_V6_vmpahb_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb.acc
+  __builtin_HEXAGON_V6_vmpahb(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb
+  __builtin_HEXAGON_V6_vmpybus_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus.128B
+  __builtin_HEXAGON_V6_vmpybus_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus.acc.128B
+  __builtin_HEXAGON_V6_vmpybus_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus.acc
+  __builtin_HEXAGON_V6_vmpybusv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybusv.128B
+  __builtin_HEXAGON_V6_vmpybus(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus
+  __builtin_HEXAGON_V6_vmpybusv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybusv.acc.128B
+  __builtin_HEXAGON_V6_vmpybusv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybusv.acc
+  __builtin_HEXAGON_V6_vmpybusv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybusv
+  __builtin_HEXAGON_V6_vmpybv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybv.128B
+  __builtin_HEXAGON_V6_vmpybv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybv.acc.128B
+  __builtin_HEXAGON_V6_vmpybv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybv.acc
+  __builtin_HEXAGON_V6_vmpybv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybv
+  __builtin_HEXAGON_V6_vmpyewuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyewuh.128B
+  __builtin_HEXAGON_V6_vmpyewuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyewuh
+  __builtin_HEXAGON_V6_vmpyh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyh.128B
+  __builtin_HEXAGON_V6_vmpyhsat_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsat.acc.128B
+  __builtin_HEXAGON_V6_vmpyhsat_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsat.acc
+  __builtin_HEXAGON_V6_vmpyhsrs_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsrs.128B
+  __builtin_HEXAGON_V6_vmpyhsrs(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsrs
+  __builtin_HEXAGON_V6_vmpyhss_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhss.128B
+  __builtin_HEXAGON_V6_vmpyhss(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhss
+  __builtin_HEXAGON_V6_vmpyhus_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhus.128B
+  __builtin_HEXAGON_V6_vmpyhus_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhus.acc.128B
+  __builtin_HEXAGON_V6_vmpyhus_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhus.acc
+  __builtin_HEXAGON_V6_vmpyhus(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhus
+  __builtin_HEXAGON_V6_vmpyhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhv.128B
+  __builtin_HEXAGON_V6_vmpyh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyh
+  __builtin_HEXAGON_V6_vmpyhv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhv.acc.128B
+  __builtin_HEXAGON_V6_vmpyhv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhv.acc
+  __builtin_HEXAGON_V6_vmpyhvsrs_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhvsrs.128B
+  __builtin_HEXAGON_V6_vmpyhvsrs(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhvsrs
+  __builtin_HEXAGON_V6_vmpyhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhv
+  __builtin_HEXAGON_V6_vmpyieoh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyieoh.128B
+  __builtin_HEXAGON_V6_vmpyieoh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyieoh
+  __builtin_HEXAGON_V6_vmpyiewh_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiewh.acc.128B
+  __builtin_HEXAGON_V6_vmpyiewh_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiewh.acc
+  __builtin_HEXAGON_V6_vmpyiewuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh.128B
+  __builtin_HEXAGON_V6_vmpyiewuh_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc.128B
+  __builtin_HEXAGON_V6_vmpyiewuh_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc
+  __builtin_HEXAGON_V6_vmpyiewuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh
+  __builtin_HEXAGON_V6_vmpyih_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyih.128B
+  __builtin_HEXAGON_V6_vmpyih_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyih.acc.128B
+  __builtin_HEXAGON_V6_vmpyih_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyih.acc
+  __builtin_HEXAGON_V6_vmpyihb_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb.128B
+  __builtin_HEXAGON_V6_vmpyihb_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb.acc.128B
+  __builtin_HEXAGON_V6_vmpyihb_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb.acc
+  __builtin_HEXAGON_V6_vmpyihb(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb
+  __builtin_HEXAGON_V6_vmpyih(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyih
+  __builtin_HEXAGON_V6_vmpyiowh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiowh.128B
+  __builtin_HEXAGON_V6_vmpyiowh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiowh
+  __builtin_HEXAGON_V6_vmpyiwb_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb.128B
+  __builtin_HEXAGON_V6_vmpyiwb_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb.acc.128B
+  __builtin_HEXAGON_V6_vmpyiwb_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb.acc
+  __builtin_HEXAGON_V6_vmpyiwb(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb
+  __builtin_HEXAGON_V6_vmpyiwh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh.128B
+  __builtin_HEXAGON_V6_vmpyiwh_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh.acc.128B
+  __builtin_HEXAGON_V6_vmpyiwh_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh.acc
+  __builtin_HEXAGON_V6_vmpyiwh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh
+  __builtin_HEXAGON_V6_vmpyowh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.128B
+  __builtin_HEXAGON_V6_vmpyowh_rnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.128B
+  __builtin_HEXAGON_V6_vmpyowh_rnd_sacc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc.128B
+  __builtin_HEXAGON_V6_vmpyowh_rnd_sacc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc
+  __builtin_HEXAGON_V6_vmpyowh_rnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd
+  __builtin_HEXAGON_V6_vmpyowh_sacc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.sacc.128B
+  __builtin_HEXAGON_V6_vmpyowh_sacc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.sacc
+  __builtin_HEXAGON_V6_vmpyowh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh
+  __builtin_HEXAGON_V6_vmpyub_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub.128B
+  __builtin_HEXAGON_V6_vmpyub_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub.acc.128B
+  __builtin_HEXAGON_V6_vmpyub_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub.acc
+  __builtin_HEXAGON_V6_vmpyubv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyubv.128B
+  __builtin_HEXAGON_V6_vmpyub(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub
+  __builtin_HEXAGON_V6_vmpyubv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyubv.acc.128B
+  __builtin_HEXAGON_V6_vmpyubv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyubv.acc
+  __builtin_HEXAGON_V6_vmpyubv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyubv
+  __builtin_HEXAGON_V6_vmpyuh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh.128B
+  __builtin_HEXAGON_V6_vmpyuh_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh.acc.128B
+  __builtin_HEXAGON_V6_vmpyuh_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh.acc
+  __builtin_HEXAGON_V6_vmpyuhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv.128B
+  __builtin_HEXAGON_V6_vmpyuh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh
+  __builtin_HEXAGON_V6_vmpyuhv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv.acc.128B
+  __builtin_HEXAGON_V6_vmpyuhv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv.acc
+  __builtin_HEXAGON_V6_vmpyuhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv
+  __builtin_HEXAGON_V6_vmux_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmux.128B
+  __builtin_HEXAGON_V6_vmux(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmux
+  __builtin_HEXAGON_V6_vnavgh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vnavgh.128B
+  __builtin_HEXAGON_V6_vnavgh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vnavgh
+  __builtin_HEXAGON_V6_vnavgub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vnavgub.128B
+  __builtin_HEXAGON_V6_vnavgub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vnavgub
+  __builtin_HEXAGON_V6_vnavgw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vnavgw.128B
+  __builtin_HEXAGON_V6_vnavgw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vnavgw
+  __builtin_HEXAGON_V6_vnormamth_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vnormamth.128B
+  __builtin_HEXAGON_V6_vnormamth(v16);
+  // CHECK: @llvm.hexagon.V6.vnormamth
+  __builtin_HEXAGON_V6_vnormamtw_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vnormamtw.128B
+  __builtin_HEXAGON_V6_vnormamtw(v16);
+  // CHECK: @llvm.hexagon.V6.vnormamtw
+  __builtin_HEXAGON_V6_vnot_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vnot.128B
+  __builtin_HEXAGON_V6_vnot(v16);
+  // CHECK: @llvm.hexagon.V6.vnot
+  __builtin_HEXAGON_V6_vor_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vor.128B
+  __builtin_HEXAGON_V6_vor(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vor
+  __builtin_HEXAGON_V6_vpackeb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackeb.128B
+  __builtin_HEXAGON_V6_vpackeb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackeb
+  __builtin_HEXAGON_V6_vpackeh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackeh.128B
+  __builtin_HEXAGON_V6_vpackeh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackeh
+  __builtin_HEXAGON_V6_vpackhb_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackhb.sat.128B
+  __builtin_HEXAGON_V6_vpackhb_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackhb.sat
+  __builtin_HEXAGON_V6_vpackhub_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackhub.sat.128B
+  __builtin_HEXAGON_V6_vpackhub_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackhub.sat
+  __builtin_HEXAGON_V6_vpackob_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackob.128B
+  __builtin_HEXAGON_V6_vpackob(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackob
+  __builtin_HEXAGON_V6_vpackoh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackoh.128B
+  __builtin_HEXAGON_V6_vpackoh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackoh
+  __builtin_HEXAGON_V6_vpackwh_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackwh.sat.128B
+  __builtin_HEXAGON_V6_vpackwh_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackwh.sat
+  __builtin_HEXAGON_V6_vpackwuh_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackwuh.sat.128B
+  __builtin_HEXAGON_V6_vpackwuh_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackwuh.sat
+  __builtin_HEXAGON_V6_vpopcounth_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vpopcounth.128B
+  __builtin_HEXAGON_V6_vpopcounth(v16);
+  // CHECK: @llvm.hexagon.V6.vpopcounth
+  __builtin_HEXAGON_V6_vrdelta_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrdelta.128B
+  __builtin_HEXAGON_V6_vrdelta(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrdelta
+  __builtin_HEXAGON_V6_vrmpybus_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus.128B
+  __builtin_HEXAGON_V6_vrmpybus_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus.acc.128B
+  __builtin_HEXAGON_V6_vrmpybus_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus.acc
+  __builtin_HEXAGON_V6_vrmpybusi_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi.128B
+  __builtin_HEXAGON_V6_vrmpybusi_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi.acc.128B
+  __builtin_HEXAGON_V6_vrmpybusi_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi.acc
+  __builtin_HEXAGON_V6_vrmpybusi(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi
+  __builtin_HEXAGON_V6_vrmpybusv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv.128B
+  __builtin_HEXAGON_V6_vrmpybus(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus
+  __builtin_HEXAGON_V6_vrmpybusv_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv.acc.128B
+  __builtin_HEXAGON_V6_vrmpybusv_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv.acc
+  __builtin_HEXAGON_V6_vrmpybusv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv
+  __builtin_HEXAGON_V6_vrmpybv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybv.128B
+  __builtin_HEXAGON_V6_vrmpybv_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybv.acc.128B
+  __builtin_HEXAGON_V6_vrmpybv_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybv.acc
+  __builtin_HEXAGON_V6_vrmpybv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybv
+  __builtin_HEXAGON_V6_vrmpyub_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub.128B
+  __builtin_HEXAGON_V6_vrmpyub_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub.acc.128B
+  __builtin_HEXAGON_V6_vrmpyub_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub.acc
+  __builtin_HEXAGON_V6_vrmpyubi_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi.128B
+  __builtin_HEXAGON_V6_vrmpyubi_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi.acc.128B
+  __builtin_HEXAGON_V6_vrmpyubi_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi.acc
+  __builtin_HEXAGON_V6_vrmpyubi(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi
+  __builtin_HEXAGON_V6_vrmpyubv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv.128B
+  __builtin_HEXAGON_V6_vrmpyub(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub
+  __builtin_HEXAGON_V6_vrmpyubv_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv.acc.128B
+  __builtin_HEXAGON_V6_vrmpyubv_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv.acc
+  __builtin_HEXAGON_V6_vrmpyubv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv
+  __builtin_HEXAGON_V6_vror_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vror.128B
+  __builtin_HEXAGON_V6_vror(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vror
+  __builtin_HEXAGON_V6_vroundhb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundhb.128B
+  __builtin_HEXAGON_V6_vroundhb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundhb
+  __builtin_HEXAGON_V6_vroundhub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundhub.128B
+  __builtin_HEXAGON_V6_vroundhub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundhub
+  __builtin_HEXAGON_V6_vroundwh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundwh.128B
+  __builtin_HEXAGON_V6_vroundwh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundwh
+  __builtin_HEXAGON_V6_vroundwuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundwuh.128B
+  __builtin_HEXAGON_V6_vroundwuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundwuh
+  __builtin_HEXAGON_V6_vrsadubi_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi.128B
+  __builtin_HEXAGON_V6_vrsadubi_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi.acc.128B
+  __builtin_HEXAGON_V6_vrsadubi_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi.acc
+  __builtin_HEXAGON_V6_vrsadubi(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi
+  __builtin_HEXAGON_V6_vsathub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsathub.128B
+  __builtin_HEXAGON_V6_vsathub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsathub
+  __builtin_HEXAGON_V6_vsatwh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsatwh.128B
+  __builtin_HEXAGON_V6_vsatwh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsatwh
+  __builtin_HEXAGON_V6_vsb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vsb.128B
+  __builtin_HEXAGON_V6_vsb(v16);
+  // CHECK: @llvm.hexagon.V6.vsb
+  __builtin_HEXAGON_V6_vsh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vsh.128B
+  __builtin_HEXAGON_V6_vshufeh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufeh.128B
+  __builtin_HEXAGON_V6_vshufeh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufeh
+  __builtin_HEXAGON_V6_vshuffb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vshuffb.128B
+  __builtin_HEXAGON_V6_vshuffb(v16);
+  // CHECK: @llvm.hexagon.V6.vshuffb
+  __builtin_HEXAGON_V6_vshuffeb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshuffeb.128B
+  __builtin_HEXAGON_V6_vshuffeb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshuffeb
+  __builtin_HEXAGON_V6_vshuffh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vshuffh.128B
+  __builtin_HEXAGON_V6_vshuffh(v16);
+  // CHECK: @llvm.hexagon.V6.vshuffh
+  __builtin_HEXAGON_V6_vshuffob_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshuffob.128B
+  __builtin_HEXAGON_V6_vshuffob(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshuffob
+  __builtin_HEXAGON_V6_vshuffvdd_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vshuffvdd.128B
+  __builtin_HEXAGON_V6_vshuffvdd(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vshuffvdd
+  __builtin_HEXAGON_V6_vshufoeb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufoeb.128B
+  __builtin_HEXAGON_V6_vshufoeb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufoeb
+  __builtin_HEXAGON_V6_vshufoeh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufoeh.128B
+  __builtin_HEXAGON_V6_vshufoeh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufoeh
+  __builtin_HEXAGON_V6_vshufoh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufoh.128B
+  __builtin_HEXAGON_V6_vshufoh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufoh
+  __builtin_HEXAGON_V6_vsh(v16);
+  // CHECK: @llvm.hexagon.V6.vsh
+  __builtin_HEXAGON_V6_vsubb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubb.128B
+  __builtin_HEXAGON_V6_vsubb_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubb.dv.128B
+  __builtin_HEXAGON_V6_vsubb_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubb.dv
+  __builtin_HEXAGON_V6_vsubbnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubbnq.128B
+  __builtin_HEXAGON_V6_vsubbnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubbnq
+  __builtin_HEXAGON_V6_vsubbq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubbq.128B
+  __builtin_HEXAGON_V6_vsubbq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubbq
+  __builtin_HEXAGON_V6_vsubb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubb
+  __builtin_HEXAGON_V6_vsubh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubh.128B
+  __builtin_HEXAGON_V6_vsubh_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubh.dv.128B
+  __builtin_HEXAGON_V6_vsubh_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubh.dv
+  __builtin_HEXAGON_V6_vsubhnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhnq.128B
+  __builtin_HEXAGON_V6_vsubhnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhnq
+  __builtin_HEXAGON_V6_vsubhq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhq.128B
+  __builtin_HEXAGON_V6_vsubhq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhq
+  __builtin_HEXAGON_V6_vsubhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhsat.128B
+  __builtin_HEXAGON_V6_vsubhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubhsat.dv.128B
+  __builtin_HEXAGON_V6_vsubhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhsat.dv
+  __builtin_HEXAGON_V6_vsubhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhsat
+  __builtin_HEXAGON_V6_vsubh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubh
+  __builtin_HEXAGON_V6_vsubhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhw.128B
+  __builtin_HEXAGON_V6_vsubhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhw
+  __builtin_HEXAGON_V6_vsububh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsububh.128B
+  __builtin_HEXAGON_V6_vsububh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsububh
+  __builtin_HEXAGON_V6_vsububsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsububsat.128B
+  __builtin_HEXAGON_V6_vsububsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsububsat.dv.128B
+  __builtin_HEXAGON_V6_vsububsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsububsat.dv
+  __builtin_HEXAGON_V6_vsububsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsububsat
+  __builtin_HEXAGON_V6_vsubuhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat.128B
+  __builtin_HEXAGON_V6_vsubuhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat.dv.128B
+  __builtin_HEXAGON_V6_vsubuhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat.dv
+  __builtin_HEXAGON_V6_vsubuhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat
+  __builtin_HEXAGON_V6_vsubuhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubuhw.128B
+  __builtin_HEXAGON_V6_vsubuhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubuhw
+  __builtin_HEXAGON_V6_vsubw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubw.128B
+  __builtin_HEXAGON_V6_vsubw_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubw.dv.128B
+  __builtin_HEXAGON_V6_vsubw_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubw.dv
+  __builtin_HEXAGON_V6_vsubwnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwnq.128B
+  __builtin_HEXAGON_V6_vsubwnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubwnq
+  __builtin_HEXAGON_V6_vsubwq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwq.128B
+  __builtin_HEXAGON_V6_vsubwq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubwq
+  __builtin_HEXAGON_V6_vsubwsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwsat.128B
+  __builtin_HEXAGON_V6_vsubwsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubwsat.dv.128B
+  __builtin_HEXAGON_V6_vsubwsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwsat.dv
+  __builtin_HEXAGON_V6_vsubwsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubwsat
+  __builtin_HEXAGON_V6_vsubw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubw
+  __builtin_HEXAGON_V6_vswap_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vswap.128B
+  __builtin_HEXAGON_V6_vswap(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vswap
+  __builtin_HEXAGON_V6_vtmpyb_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb.128B
+  __builtin_HEXAGON_V6_vtmpyb_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb.acc.128B
+  __builtin_HEXAGON_V6_vtmpyb_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb.acc
+  __builtin_HEXAGON_V6_vtmpybus_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus.128B
+  __builtin_HEXAGON_V6_vtmpybus_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus.acc.128B
+  __builtin_HEXAGON_V6_vtmpybus_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus.acc
+  __builtin_HEXAGON_V6_vtmpybus(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus
+  __builtin_HEXAGON_V6_vtmpyb(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb
+  __builtin_HEXAGON_V6_vtmpyhb_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb.128B
+  __builtin_HEXAGON_V6_vtmpyhb_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb.acc.128B
+  __builtin_HEXAGON_V6_vtmpyhb_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb.acc
+  __builtin_HEXAGON_V6_vtmpyhb(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb
+  __builtin_HEXAGON_V6_vunpackb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackb.128B
+  __builtin_HEXAGON_V6_vunpackb(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackb
+  __builtin_HEXAGON_V6_vunpackh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackh.128B
+  __builtin_HEXAGON_V6_vunpackh(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackh
+  __builtin_HEXAGON_V6_vunpackob_128B(v64, v32);
+  // CHECK: @llvm.hexagon.V6.vunpackob.128B
+  __builtin_HEXAGON_V6_vunpackob(v32, v16);
+  // CHECK: @llvm.hexagon.V6.vunpackob
+  __builtin_HEXAGON_V6_vunpackoh_128B(v64, v32);
+  // CHECK: @llvm.hexagon.V6.vunpackoh.128B
+  __builtin_HEXAGON_V6_vunpackoh(v32, v16);
+  // CHECK: @llvm.hexagon.V6.vunpackoh
+  __builtin_HEXAGON_V6_vunpackub_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackub.128B
+  __builtin_HEXAGON_V6_vunpackub(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackub
+  __builtin_HEXAGON_V6_vunpackuh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackuh.128B
+  __builtin_HEXAGON_V6_vunpackuh(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackuh
+  __builtin_HEXAGON_V6_vxor_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vxor.128B
+  __builtin_HEXAGON_V6_vxor(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vxor
+  __builtin_HEXAGON_V6_vzb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vzb.128B
+  __builtin_HEXAGON_V6_vzb(v16);
+  // CHECK: @llvm.hexagon.V6.vzb
+  __builtin_HEXAGON_V6_vzh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vzh.128B
+  __builtin_HEXAGON_V6_vzh(v16);
+  // CHECK: @llvm.hexagon.V6.vzh
+}
diff --git a/test/CodeGen/builtins-nvptx.c b/test/CodeGen/builtins-nvptx.c
index 745e74f..cd21361 100644
--- a/test/CodeGen/builtins-nvptx.c
+++ b/test/CodeGen/builtins-nvptx.c
@@ -1,6 +1,8 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
+// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
+// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -9,15 +11,15 @@
 
 __device__ int read_tid() {
 
-// CHECK: call i32 @llvm.ptx.read.tid.x()
-// CHECK: call i32 @llvm.ptx.read.tid.y()
-// CHECK: call i32 @llvm.ptx.read.tid.z()
-// CHECK: call i32 @llvm.ptx.read.tid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()
 
-  int x = __builtin_ptx_read_tid_x();
-  int y = __builtin_ptx_read_tid_y();
-  int z = __builtin_ptx_read_tid_z();
-  int w = __builtin_ptx_read_tid_w();
+  int x = __nvvm_read_ptx_sreg_tid_x();
+  int y = __nvvm_read_ptx_sreg_tid_y();
+  int z = __nvvm_read_ptx_sreg_tid_z();
+  int w = __nvvm_read_ptx_sreg_tid_w();
 
   return x + y + z + w;
 
@@ -25,15 +27,15 @@
 
 __device__ int read_ntid() {
 
-// CHECK: call i32 @llvm.ptx.read.ntid.x()
-// CHECK: call i32 @llvm.ptx.read.ntid.y()
-// CHECK: call i32 @llvm.ptx.read.ntid.z()
-// CHECK: call i32 @llvm.ptx.read.ntid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
 
-  int x = __builtin_ptx_read_ntid_x();
-  int y = __builtin_ptx_read_ntid_y();
-  int z = __builtin_ptx_read_ntid_z();
-  int w = __builtin_ptx_read_ntid_w();
+  int x = __nvvm_read_ptx_sreg_ntid_x();
+  int y = __nvvm_read_ptx_sreg_ntid_y();
+  int z = __nvvm_read_ptx_sreg_ntid_z();
+  int w = __nvvm_read_ptx_sreg_ntid_w();
 
   return x + y + z + w;
 
@@ -41,15 +43,15 @@
 
 __device__ int read_ctaid() {
 
-// CHECK: call i32 @llvm.ptx.read.ctaid.x()
-// CHECK: call i32 @llvm.ptx.read.ctaid.y()
-// CHECK: call i32 @llvm.ptx.read.ctaid.z()
-// CHECK: call i32 @llvm.ptx.read.ctaid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
 
-  int x = __builtin_ptx_read_ctaid_x();
-  int y = __builtin_ptx_read_ctaid_y();
-  int z = __builtin_ptx_read_ctaid_z();
-  int w = __builtin_ptx_read_ctaid_w();
+  int x = __nvvm_read_ptx_sreg_ctaid_x();
+  int y = __nvvm_read_ptx_sreg_ctaid_y();
+  int z = __nvvm_read_ptx_sreg_ctaid_z();
+  int w = __nvvm_read_ptx_sreg_ctaid_w();
 
   return x + y + z + w;
 
@@ -57,15 +59,15 @@
 
 __device__ int read_nctaid() {
 
-// CHECK: call i32 @llvm.ptx.read.nctaid.x()
-// CHECK: call i32 @llvm.ptx.read.nctaid.y()
-// CHECK: call i32 @llvm.ptx.read.nctaid.z()
-// CHECK: call i32 @llvm.ptx.read.nctaid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
 
-  int x = __builtin_ptx_read_nctaid_x();
-  int y = __builtin_ptx_read_nctaid_y();
-  int z = __builtin_ptx_read_nctaid_z();
-  int w = __builtin_ptx_read_nctaid_w();
+  int x = __nvvm_read_ptx_sreg_nctaid_x();
+  int y = __nvvm_read_ptx_sreg_nctaid_y();
+  int z = __nvvm_read_ptx_sreg_nctaid_z();
+  int w = __nvvm_read_ptx_sreg_nctaid_w();
 
   return x + y + z + w;
 
@@ -73,19 +75,19 @@
 
 __device__ int read_ids() {
 
-// CHECK: call i32 @llvm.ptx.read.laneid()
-// CHECK: call i32 @llvm.ptx.read.warpid()
-// CHECK: call i32 @llvm.ptx.read.nwarpid()
-// CHECK: call i32 @llvm.ptx.read.smid()
-// CHECK: call i32 @llvm.ptx.read.nsmid()
-// CHECK: call i32 @llvm.ptx.read.gridid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
 
-  int a = __builtin_ptx_read_laneid();
-  int b = __builtin_ptx_read_warpid();
-  int c = __builtin_ptx_read_nwarpid();
-  int d = __builtin_ptx_read_smid();
-  int e = __builtin_ptx_read_nsmid();
-  int f = __builtin_ptx_read_gridid();
+  int a = __nvvm_read_ptx_sreg_laneid();
+  int b = __nvvm_read_ptx_sreg_warpid();
+  int c = __nvvm_read_ptx_sreg_nwarpid();
+  int d = __nvvm_read_ptx_sreg_smid();
+  int e = __nvvm_read_ptx_sreg_nsmid();
+  int f = __nvvm_read_ptx_sreg_gridid();
 
   return a + b + c + d + e + f;
 
@@ -93,17 +95,17 @@
 
 __device__ int read_lanemasks() {
 
-// CHECK: call i32 @llvm.ptx.read.lanemask.eq()
-// CHECK: call i32 @llvm.ptx.read.lanemask.le()
-// CHECK: call i32 @llvm.ptx.read.lanemask.lt()
-// CHECK: call i32 @llvm.ptx.read.lanemask.ge()
-// CHECK: call i32 @llvm.ptx.read.lanemask.gt()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()
 
-  int a = __builtin_ptx_read_lanemask_eq();
-  int b = __builtin_ptx_read_lanemask_le();
-  int c = __builtin_ptx_read_lanemask_lt();
-  int d = __builtin_ptx_read_lanemask_ge();
-  int e = __builtin_ptx_read_lanemask_gt();
+  int a = __nvvm_read_ptx_sreg_lanemask_eq();
+  int b = __nvvm_read_ptx_sreg_lanemask_le();
+  int c = __nvvm_read_ptx_sreg_lanemask_lt();
+  int d = __nvvm_read_ptx_sreg_lanemask_ge();
+  int e = __nvvm_read_ptx_sreg_lanemask_gt();
 
   return a + b + c + d + e;
 
@@ -111,26 +113,26 @@
 
 __device__ long long read_clocks() {
 
-// CHECK: call i32 @llvm.ptx.read.clock()
-// CHECK: call i64 @llvm.ptx.read.clock64()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
+// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
 
-  int a = __builtin_ptx_read_clock();
-  long long b = __builtin_ptx_read_clock64();
+  int a = __nvvm_read_ptx_sreg_clock();
+  long long b = __nvvm_read_ptx_sreg_clock64();
 
   return a + b;
 }
 
 __device__ int read_pms() {
 
-// CHECK: call i32 @llvm.ptx.read.pm0()
-// CHECK: call i32 @llvm.ptx.read.pm1()
-// CHECK: call i32 @llvm.ptx.read.pm2()
-// CHECK: call i32 @llvm.ptx.read.pm3()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()
 
-  int a = __builtin_ptx_read_pm0();
-  int b = __builtin_ptx_read_pm1();
-  int c = __builtin_ptx_read_pm2();
-  int d = __builtin_ptx_read_pm3();
+  int a = __nvvm_read_ptx_sreg_pm0();
+  int b = __nvvm_read_ptx_sreg_pm1();
+  int c = __nvvm_read_ptx_sreg_pm2();
+  int d = __nvvm_read_ptx_sreg_pm3();
 
   return a + b + c + d;
 
@@ -138,9 +140,9 @@
 
 __device__ void sync() {
 
-// CHECK: call void @llvm.ptx.bar.sync(i32 0)
+// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
 
-  __builtin_ptx_bar_sync(0);
+  __nvvm_bar_sync(0);
 
 }
 
@@ -177,7 +179,7 @@
 // CHECK: call void @llvm.nvvm.membar.sys()
   __nvvm_membar_sys();
 // CHECK: call void @llvm.nvvm.barrier0()
-  __nvvm_bar0();
+  __syncthreads();
 }
 
 __device__ int di;
@@ -189,7 +191,7 @@
 
 // Check for atomic intrinsics
 // CHECK-LABEL: nvvm_atom
-__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
+__device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l,
                           long long *llp, long long ll) {
   // CHECK: atomicrmw add
   __nvvm_atom_add_gen_i(ip, i);
@@ -272,5 +274,111 @@
   // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
   __nvvm_atom_add_gen_f(fp, f);
 
+  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
+  __nvvm_atom_inc_gen_ui(uip, ui);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
+  __nvvm_atom_dec_gen_ui(uip, ui);
+
   // CHECK: ret
 }
+
+// CHECK-LABEL: nvvm_ldg
+__device__ void nvvm_ldg(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
+  __nvvm_ldg_c((const char *)p);
+  __nvvm_ldg_uc((const unsigned char *)p);
+
+  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
+  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
+  __nvvm_ldg_s((const short *)p);
+  __nvvm_ldg_us((const unsigned short *)p);
+
+  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  __nvvm_ldg_i((const int *)p);
+  __nvvm_ldg_ui((const unsigned int *)p);
+
+  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
+  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
+  __nvvm_ldg_l((const long *)p);
+  __nvvm_ldg_ul((const unsigned long *)p);
+
+  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
+  __nvvm_ldg_f((const float *)p);
+  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
+  __nvvm_ldg_d((const double *)p);
+
+  // In practice, the pointers we pass to __ldg will be aligned as appropriate
+  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
+  // the LLVM vector types.  However, each LLVM vector type has an alignment
+  // less than or equal to its corresponding CUDA type, so we're OK.
+  //
+  // PTX Interoperability section 2.2: "For a vector with an even number of
+  // elements, its alignment is set to number of elements times the alignment of
+  // its member: n*alignof(t)."
+
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
+  typedef char char2 __attribute__((ext_vector_type(2)));
+  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_c2((const char2 *)p);
+  __nvvm_ldg_uc2((const uchar2 *)p);
+
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
+  typedef char char4 __attribute__((ext_vector_type(4)));
+  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_c4((const char4 *)p);
+  __nvvm_ldg_uc4((const uchar4 *)p);
+
+  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
+  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
+  typedef short short2 __attribute__((ext_vector_type(2)));
+  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_s2((const short2 *)p);
+  __nvvm_ldg_us2((const ushort2 *)p);
+
+  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
+  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
+  typedef short short4 __attribute__((ext_vector_type(4)));
+  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_s4((const short4 *)p);
+  __nvvm_ldg_us4((const ushort4 *)p);
+
+  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
+  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
+  typedef int int2 __attribute__((ext_vector_type(2)));
+  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_i2((const int2 *)p);
+  __nvvm_ldg_ui2((const uint2 *)p);
+
+  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
+  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
+  typedef int int4 __attribute__((ext_vector_type(4)));
+  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_i4((const int4 *)p);
+  __nvvm_ldg_ui4((const uint4 *)p);
+
+  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
+  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
+  typedef long long longlong2 __attribute__((ext_vector_type(2)));
+  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_ll2((const longlong2 *)p);
+  __nvvm_ldg_ull2((const ulonglong2 *)p);
+
+  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
+  typedef float float2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_f2((const float2 *)p);
+
+  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
+  typedef float float4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_f4((const float4 *)p);
+
+  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
+  typedef double double2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_d2((const double2 *)p);
+}
diff --git a/test/CodeGen/builtins-ppc-altivec.c b/test/CodeGen/builtins-ppc-altivec.c
index 9539d6c..1edf99f 100644
--- a/test/CodeGen/builtins-ppc-altivec.c
+++ b/test/CodeGen/builtins-ppc-altivec.c
@@ -1,7 +1,16 @@
 // REQUIRES: powerpc-registered-target
-// RUN: %clang_cc1 -faltivec -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -faltivec -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -faltivec -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-LE
+// RUN: %clang_cc1 -faltivec -triple powerpc-unknown-unknown -emit-llvm %s \
+// RUN:            -o - | FileCheck %s
+// RUN: %clang_cc1 -faltivec -triple powerpc64-unknown-unknown -emit-llvm %s \
+// RUN:            -o - | FileCheck %s
+// RUN: %clang_cc1 -faltivec -triple powerpc64le-unknown-unknown -emit-llvm %s \
+// RUN:            -o - | FileCheck %s -check-prefix=CHECK-LE
+// RUN: not %clang_cc1 -triple powerpc64le-unknown-unknown -emit-llvm %s \
+// RUN:            -ferror-limit 0 -DNO_ALTIVEC -o - 2>&1 \
+// RUN:            | FileCheck %s -check-prefix=CHECK-NOALTIVEC
+#ifndef NO_ALTIVEC
+#include <altivec.h>
+#endif
 
 vector bool char vbc = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
 vector signed char vsc = { 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16 };
@@ -27,6 +36,8 @@
 vector unsigned int res_vui;
 vector float res_vf;
 
+// CHECK-NOALTIVEC: error: unknown type name 'vector'
+
 signed char param_sc;
 unsigned char param_uc;
 short param_s;
@@ -66,8 +77,16 @@
 // CHECK-LE: @llvm.ppc.altivec.vmaxsw
 
   vf = vec_abs(vf);
-// CHECK: and <4 x i32>
-// CHECK-LE: and <4 x i32>
+// CHECK: bitcast <4 x float> %{{.*}} to <4 x i32>
+// CHECK: and <4 x i32> {{.*}}, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+// CHECK: bitcast <4 x i32> %{{.*}} to <4 x float>
+// CHECK: store <4 x float> %{{.*}}, <4 x float>* @vf
+// CHECK-LE: bitcast <4 x float> %{{.*}} to <4 x i32>
+// CHECK-LE: and <4 x i32> {{.*}}, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+// CHECK-LE: bitcast <4 x i32> %{{.*}} to <4 x float>
+// CHECK-LE: store <4 x float> %{{.*}}, <4 x float>* @vf
+// CHECK-NOALTIVEC: error: use of undeclared identifier 'vf'
+// CHECK-NOALTIVEC: vf = vec_abs(vf)
 
   /* vec_abs */
   vsc = vec_abss(vsc);
diff --git a/test/CodeGen/builtins-ppc-p8vector.c b/test/CodeGen/builtins-ppc-p8vector.c
index 29503f0..096e3e1 100644
--- a/test/CodeGen/builtins-ppc-p8vector.c
+++ b/test/CodeGen/builtins-ppc-p8vector.c
@@ -6,6 +6,7 @@
 // generate the correct errors for functions that are only overloaded with VSX
 // (vec_cmpge, vec_cmple). Without this option, there is only one overload so
 // it is selected.
+#include <altivec.h>
 
 void dummy() { }
 signed int si;
@@ -73,10 +74,10 @@
 // CHECK-PPC: error: call to 'vec_abs' is ambiguous
 
   res_vd = vec_abs(vda);
-// CHECK: store <2 x i64> <i64 9223372036854775807, i64 9223372036854775807>, <2 x i64>*
-// CHECK: and <2 x i64>
-// CHECK-LE: store <2 x i64> <i64 9223372036854775807, i64 9223372036854775807>, <2 x i64>*
-// CHECK-LE: and <2 x i64>
+// CHECK: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
+// CHECK: store <2 x double> %{{.*}}, <2 x double>* @res_vd
+// CHECK-LE: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
+// CHECK-LE: store <2 x double> %{{.*}}, <2 x double>* @res_vd
 // CHECK-PPC: error: call to 'vec_abs' is ambiguous
 
   /* vec_add */
diff --git a/test/CodeGen/builtins-ppc-quadword.c b/test/CodeGen/builtins-ppc-quadword.c
index e17b679..f381642 100644
--- a/test/CodeGen/builtins-ppc-quadword.c
+++ b/test/CodeGen/builtins-ppc-quadword.c
@@ -8,6 +8,7 @@
 
 // RUN: not %clang_cc1 -faltivec -triple powerpc-unknown-unknown \
 // RUN: -emit-llvm %s -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PPC
+#include <altivec.h>
 
 // CHECK-PPC: error: __int128 is not supported on this target
 vector signed __int128 vlll = { -1 };
diff --git a/test/CodeGen/builtins-ppc-vsx.c b/test/CodeGen/builtins-ppc-vsx.c
index 15f98b5..e58afdd 100644
--- a/test/CodeGen/builtins-ppc-vsx.c
+++ b/test/CodeGen/builtins-ppc-vsx.c
@@ -1,27 +1,63 @@
 // REQUIRES: powerpc-registered-target
 // RUN: %clang_cc1 -faltivec -target-feature +vsx -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -faltivec -target-feature +vsx -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-LE
+#include <altivec.h>
 
+vector bool char vbc = { 0, 1, 0, 1, 0, 1, 0, 1,
+                         0, 1, 0, 1, 0, 1, 0, 1 };
+vector signed char vsc = { -8,  9, -10, 11, -12, 13, -14, 15,
+                           -0,  1,  -2,  3,  -4,  5,  -6,  7};
 vector unsigned char vuc = { 8,  9, 10, 11, 12, 13, 14, 15,
                              0,  1,  2,  3,  4,  5,  6,  7};
 vector float vf = { -1.5, 2.5, -3.5, 4.5 };
 vector double vd = { 3.5, -7.5 };
+vector bool short vbs = { 0, 1, 0, 1, 0, 1, 0, 1 };
+vector signed short vss = { -1, 2, -3, 4, -5, 6, -7, 8 };
+vector unsigned short vus = { 0, 1, 2, 3, 4, 5, 6, 7 };
+vector bool int vbi = { 0, 1, 0, 1 };
 vector signed int vsi = { -1, 2, -3, 4 };
 vector unsigned int vui = { 0, 1, 2, 3 };
 vector bool long long vbll = { 1, 0 };
 vector signed long long vsll = { 255LL, -937LL };
 vector unsigned long long vull = { 1447LL, 2894LL };
 double d = 23.4;
+float af[4] = {23.4f, 56.7f, 89.0f, 12.3f};
+double ad[2] = {23.4, 56.7};
+signed char asc[16] = { -8,  9, -10, 11, -12, 13, -14, 15,
+                        -0,  1,  -2,  3,  -4,  5,  -6,  7};
+unsigned char auc[16] = { 8,  9, 10, 11, 12, 13, 14, 15,
+                          0,  1,  2,  3,  4,  5,  6,  7};
+signed short ass[8] = { -1, 2, -3, 4, -5, 6, -7, 8 };
+unsigned short aus[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+signed int asi[4] = { -1, 2, -3, 4 };
+unsigned int aui[4] = { 0, 1, 2, 3 };
+signed long asl[2] = { -1L, 2L };
+unsigned long aul[2] = { 1L, 2L };
 
 vector float res_vf;
 vector double res_vd;
+vector bool char res_vbc;
+vector signed char res_vsc;
+vector unsigned char res_vuc;
+vector bool short res_vbs;
+vector signed short res_vss;
+vector unsigned short res_vus;
+vector bool int res_vbi;
 vector signed int res_vsi;
 vector unsigned int res_vui;
-vector bool int res_vbi;
 vector bool long long res_vbll;
 vector signed long long res_vsll;
 vector unsigned long long res_vull;
+
 double res_d;
+float res_af[4];
+double res_ad[2];
+signed char res_asc[16];
+unsigned char res_auc[16];
+signed short res_ass[8];
+unsigned short res_aus[8];
+signed int res_asi[4];
+unsigned int res_aui[4];
 
 void dummy() { }
 
@@ -29,6 +65,14 @@
 // CHECK-LABEL: define void @test1
 // CHECK-LE-LABEL: define void @test1
 
+  res_vf = vec_abs(vf);
+// CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
+// CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
+
+  dummy();
+// CHECK: call void @dummy()
+// CHECK-LE: call void @dummy()
+
   res_vd = vec_add(vd, vd);
 // CHECK: fadd <2 x double>
 // CHECK-LE: fadd <2 x double>
@@ -292,18 +336,34 @@
 
   /* vec_vsx_ld */
 
+  res_vbi = vec_vsx_ld(0, &vbi);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vsi = vec_vsx_ld(0, &vsi);
 // CHECK: @llvm.ppc.vsx.lxvw4x
 // CHECK-LE: @llvm.ppc.vsx.lxvw4x
 
+  res_vsi = vec_vsx_ld(0, asi);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vui = vec_vsx_ld(0, &vui);
 // CHECK: @llvm.ppc.vsx.lxvw4x
 // CHECK-LE: @llvm.ppc.vsx.lxvw4x
 
+  res_vui = vec_vsx_ld(0, aui);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vf = vec_vsx_ld (0, &vf);
 // CHECK: @llvm.ppc.vsx.lxvw4x
 // CHECK-LE: @llvm.ppc.vsx.lxvw4x
 
+  res_vf = vec_vsx_ld (0, af);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vsll = vec_vsx_ld(0, &vsll);
 // CHECK: @llvm.ppc.vsx.lxvd2x
 // CHECK-LE: @llvm.ppc.vsx.lxvd2x
@@ -316,20 +376,88 @@
 // CHECK: @llvm.ppc.vsx.lxvd2x
 // CHECK-LE: @llvm.ppc.vsx.lxvd2x
 
+  res_vd = vec_vsx_ld(0, ad);
+// CHECK: @llvm.ppc.vsx.lxvd2x
+// CHECK-LE: @llvm.ppc.vsx.lxvd2x
+
+  res_vbs = vec_vsx_ld(0, &vbs);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vss = vec_vsx_ld(0, &vss);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vss = vec_vsx_ld(0, ass);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vus = vec_vsx_ld(0, &vus);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vus = vec_vsx_ld(0, aus);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vbc = vec_vsx_ld(0, &vbc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vsc = vec_vsx_ld(0, &vsc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vuc = vec_vsx_ld(0, &vuc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vsc = vec_vsx_ld(0, asc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vuc = vec_vsx_ld(0, auc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   /* vec_vsx_st */
 
+  vec_vsx_st(vbi, 0, &res_vbi);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbi, 0, res_aui);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbi, 0, res_asi);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vsi, 0, &res_vsi);
 // CHECK: @llvm.ppc.vsx.stxvw4x
 // CHECK-LE: @llvm.ppc.vsx.stxvw4x
 
+  vec_vsx_st(vsi, 0, res_asi);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vui, 0, &res_vui);
 // CHECK: @llvm.ppc.vsx.stxvw4x
 // CHECK-LE: @llvm.ppc.vsx.stxvw4x
 
+  vec_vsx_st(vui, 0, res_aui);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vf, 0, &res_vf);
 // CHECK: @llvm.ppc.vsx.stxvw4x
 // CHECK-LE: @llvm.ppc.vsx.stxvw4x
 
+  vec_vsx_st(vf, 0, res_af);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vsll, 0, &res_vsll);
 // CHECK: @llvm.ppc.vsx.stxvd2x
 // CHECK-LE: @llvm.ppc.vsx.stxvd2x
@@ -342,6 +470,66 @@
 // CHECK: @llvm.ppc.vsx.stxvd2x
 // CHECK-LE: @llvm.ppc.vsx.stxvd2x
 
+  vec_vsx_st(vd, 0, res_ad);
+// CHECK: @llvm.ppc.vsx.stxvd2x
+// CHECK-LE: @llvm.ppc.vsx.stxvd2x
+
+  vec_vsx_st(vbs, 0, &res_vbs);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbs, 0, res_aus);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbs, 0, res_ass);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vss, 0, &res_vss);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vss, 0, res_ass);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vus, 0, &res_vus);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vus, 0, res_aus);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vsc, 0, &res_vsc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vsc, 0, res_asc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vuc, 0, &res_vuc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vuc, 0, res_auc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbc, 0, &res_vbc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbc, 0, res_asc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbc, 0, res_auc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   /* vec_and */
   res_vsll = vec_and(vsll, vsll);
 // CHECK: and <2 x i64>
diff --git a/test/CodeGen/builtins-sparc.c b/test/CodeGen/builtins-sparc.c
new file mode 100644
index 0000000..92cc767
--- /dev/null
+++ b/test/CodeGen/builtins-sparc.c
@@ -0,0 +1,10 @@
+// REQUIRES: sparc-registered-target
+// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple sparc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+void test_eh_return_data_regno(void)
+{
+  volatile int res;  /* volatile: the patterns below match "store volatile", so each result must reach IR as a store */
+  res = __builtin_eh_return_data_regno(0);  // CHECK: store volatile i32 24
+  res = __builtin_eh_return_data_regno(1);  // CHECK: store volatile i32 25
+}
diff --git a/test/CodeGen/builtins-systemz-error2.c b/test/CodeGen/builtins-systemz-error2.c
new file mode 100644
index 0000000..cf8ee6f
--- /dev/null
+++ b/test/CodeGen/builtins-systemz-error2.c
@@ -0,0 +1,11 @@
+// REQUIRES: systemz-registered-target
+// RUN: %clang_cc1 -triple s390x-ibm-linux -S -emit-llvm %s -verify -o -
+
+typedef __attribute__((vector_size(16))) char v16i8;
+
+v16i8 f0(v16i8 a, v16i8 b) { /* the RUN line passes no -target-feature, so both builtin calls below are meant to be rejected */
+  __builtin_tbegin ((void *)0);         // expected-error {{'__builtin_tbegin' needs target feature transactional-execution}}
+  v16i8 tmp = __builtin_s390_vaq(a, b); // expected-error {{'__builtin_s390_vaq' needs target feature vector}}
+  return tmp;
+}
+
diff --git a/test/CodeGen/builtins-wasm.c b/test/CodeGen/builtins-wasm.c
index 15f2e9d..135e329 100644
--- a/test/CodeGen/builtins-wasm.c
+++ b/test/CodeGen/builtins-wasm.c
@@ -4,9 +4,9 @@
 // RUN:   | FileCheck %s -check-prefix=WEBASSEMBLY64
 
 __SIZE_TYPE__ f1(void) {
-  return __builtin_wasm_memory_size();
-// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.memory.size.i32()
-// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.memory.size.i64()
+  return __builtin_wasm_current_memory();
+// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.current.memory.i32()
+// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.current.memory.i64()
 }
 
 void f2(long delta) {
diff --git a/test/CodeGen/builtins-x86-disabled.c b/test/CodeGen/builtins-x86-disabled.c
new file mode 100644
index 0000000..a024336
--- /dev/null
+++ b/test/CodeGen/builtins-x86-disabled.c
@@ -0,0 +1,22 @@
+// REQUIRES: x86-registered-target
+// RUN: not %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s
+
+void call_x86_64_builtins(void)
+{ /* built for an i386 triple under `not` (see the RUN line): compilation is expected to fail with the x86-64-only builtin error */
+  unsigned long long a = __builtin_ia32_crc32di(0, 0);
+  unsigned long long b;
+  unsigned int c = __builtin_ia32_rdseed64_step (&b);
+  unsigned long long d = __builtin_ia32_bextr_u64 (0, 0);
+  unsigned long long e = __builtin_ia32_pdep_di(0, 0);
+  unsigned long long f = __builtin_ia32_pext_di(0, 0);
+  unsigned long long g = __builtin_ia32_bzhi_di(0, 0);
+  unsigned long long h;
+  unsigned long long i = __builtin_ia32_addcarryx_u64(0, 0, 0, &h);
+  unsigned long long j;
+  unsigned long long k = __builtin_ia32_addcarry_u64(0, 0, 0, &j);
+  unsigned long long l;
+  unsigned long long m = __builtin_ia32_subborrow_u64(0, 0, 0, &l);
+}
+
+// CHECK: error: this builtin is only available on x86-64 targets
+// CHECK: __builtin_ia32_crc32di
diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c
index 83b11a0..8fa24e6 100644
--- a/test/CodeGen/builtins-x86.c
+++ b/test/CodeGen/builtins-x86.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -emit-llvm -o %t %s
-// RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -fsyntax-only -o %t %s
+// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -emit-llvm -o %t %s
+// RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -fsyntax-only -o %t %s
 
 #ifdef USE_ALL
 #define USE_3DNOW
@@ -281,24 +281,27 @@
   (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi);
   (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi);
 
+  (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui);
+  (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui);
+
   tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
   tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
   tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
+  tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);
 
   tmp_i = __builtin_ia32_rdtsc();
   tmp_i = __builtin_ia32_rdtscp(&tmp_Ui);
   tmp_LLi = __builtin_ia32_rdpmc(tmp_i);
 #ifdef USE_64
   tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
+  tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
 #endif
   tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
   (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
-  (void) __builtin_ia32_storeups(tmp_fp, tmp_V4f);
   (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f);
   (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
   tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntps(tmp_fp, tmp_V4f);
   (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
 
@@ -310,19 +313,15 @@
   tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
   tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
   (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
-  (void) __builtin_ia32_storeupd(tmp_dp, tmp_V2d);
   tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
   tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);
   (void) __builtin_ia32_movnti(tmp_ip, tmp_i);
 #ifdef USE_64
   (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
 #endif
-  (void) __builtin_ia32_movntpd(tmp_dp, tmp_V2d);
-  (void) __builtin_ia32_movntdq(tmp_V2LLip, tmp_V2LLi);
   tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
-  tmp_V2d = __builtin_ia32_cvtdq2pd(tmp_V4i);
   tmp_V4f = __builtin_ia32_cvtdq2ps(tmp_V4i);
   tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
   tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
@@ -331,16 +330,17 @@
   tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
   tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
   tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
+  tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
+  tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
 #ifdef USE_64
   tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d);
+  tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d);
 #endif
   tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f);
-  tmp_V2d = __builtin_ia32_cvtps2pd(tmp_V4f);
   tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f);
   (void) __builtin_ia32_clflush(tmp_vCp);
   (void) __builtin_ia32_lfence();
   (void) __builtin_ia32_mfence();
-  (void) __builtin_ia32_storedqu(tmp_cp, tmp_V16c);
   tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
   tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
   tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
@@ -386,14 +386,7 @@
   tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
   tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
-  tmp_V4i = __builtin_ia32_pmovzxbd128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovzxbq128(tmp_V16c);
-  tmp_V8s = __builtin_ia32_pmovzxbw128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovzxdq128(tmp_V4i);
-  tmp_V4i = __builtin_ia32_pmovzxwd128(tmp_V8s);
-  tmp_V2LLi = __builtin_ia32_pmovzxwq128(tmp_V8s);
   tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i);
-  tmp_V4i = __builtin_ia32_pmulld128(tmp_V4i, tmp_V4i);
   tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16);
   tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
   tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
@@ -420,11 +413,9 @@
   tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
   tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
   tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
-  tmp_V4d = __builtin_ia32_cvtdq2pd256(tmp_V4i);
   tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i);
   tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);
   tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f);
-  tmp_V4d = __builtin_ia32_cvtps2pd256(tmp_V4f);
   tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d);
   tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d);
   tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f);
@@ -458,13 +449,7 @@
   __builtin_ia32_vzeroupper();
   tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
   tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
-  __builtin_ia32_storeupd256(tmp_dp, tmp_V4d);
-  __builtin_ia32_storeups256(tmp_fp, tmp_V8f);
-  __builtin_ia32_storedqu256(tmp_cp, tmp_V32c);
   tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
-  __builtin_ia32_movntdq256(tmp_V4LLip, tmp_V4LLi);
-  __builtin_ia32_movntpd256(tmp_dp, tmp_V4d);
-  __builtin_ia32_movntps256(tmp_fp, tmp_V8f);
   tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
   tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
   tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
diff --git a/test/CodeGen/builtins.c b/test/CodeGen/builtins.c
index 62f9a74..2ce7113 100644
--- a/test/CodeGen/builtins.c
+++ b/test/CodeGen/builtins.c
@@ -116,6 +116,16 @@
   P(bswap16, (N));
   P(bswap32, (N));
   P(bswap64, (N));
+
+  // CHECK: @llvm.bitreverse.i8
+  // CHECK: @llvm.bitreverse.i16
+  // CHECK: @llvm.bitreverse.i32
+  // CHECK: @llvm.bitreverse.i64
+  P(bitreverse8, (N));
+  P(bitreverse16, (N));
+  P(bitreverse32, (N));
+  P(bitreverse64, (N));
+
   // FIXME
   // V(clear_cache, (&N, &N+1));
   V(trap, ());
@@ -207,10 +217,8 @@
   // CHECK:  select i1 %[[ISINF]], i32 %[[SIGN]], i32 0
 
   res = __builtin_isfinite(F);
-  // CHECK: fcmp oeq float 
   // CHECK: call float @llvm.fabs.f32(float
-  // CHECK: fcmp une float {{.*}}, 0x7FF0000000000000
-  // CHECK: and i1 
+  // CHECK: fcmp one float {{.*}}, 0x7FF0000000000000
 
   res = __builtin_isnormal(F);
   // CHECK: fcmp oeq float
@@ -242,6 +250,105 @@
   // CHECK: call float @llvm.fabs.f32(float
   // CHECK: call double @llvm.fabs.f64(double
   // CHECK: call x86_fp80 @llvm.fabs.f80(x86_fp80
+
+  resf = __builtin_canonicalizef(F);
+  resd = __builtin_canonicalize(D);
+  resld = __builtin_canonicalizel(LD);
+  // CHECK: call float @llvm.canonicalize.f32(float
+  // CHECK: call double @llvm.canonicalize.f64(double
+  // CHECK: call x86_fp80 @llvm.canonicalize.f80(x86_fp80
+
+  resf = __builtin_fminf(F, F);
+  // CHECK: call float @llvm.minnum.f32
+
+  resd = __builtin_fmin(D, D);
+  // CHECK: call double @llvm.minnum.f64
+
+  resld = __builtin_fminl(LD, LD);
+  // CHECK: call x86_fp80 @llvm.minnum.f80
+
+  resf = __builtin_fmaxf(F, F);
+  // CHECK: call float @llvm.maxnum.f32
+
+  resd = __builtin_fmax(D, D);
+  // CHECK: call double @llvm.maxnum.f64
+
+  resld = __builtin_fmaxl(LD, LD);
+  // CHECK: call x86_fp80 @llvm.maxnum.f80
+
+  resf = __builtin_fabsf(F);
+  // CHECK: call float @llvm.fabs.f32
+
+  resd = __builtin_fabs(D);
+  // CHECK: call double @llvm.fabs.f64
+
+  resld = __builtin_fabsl(LD);
+  // CHECK: call x86_fp80 @llvm.fabs.f80
+
+  resf = __builtin_copysignf(F, F);
+  // CHECK: call float @llvm.copysign.f32
+
+  resd = __builtin_copysign(D, D);
+  // CHECK: call double @llvm.copysign.f64
+
+  resld = __builtin_copysignl(LD, LD);
+  // CHECK: call x86_fp80 @llvm.copysign.f80
+
+
+  resf = __builtin_ceilf(F);
+  // CHECK: call float @llvm.ceil.f32
+
+  resd = __builtin_ceil(D);
+  // CHECK: call double @llvm.ceil.f64
+
+  resld = __builtin_ceill(LD);
+  // CHECK: call x86_fp80 @llvm.ceil.f80
+
+  resf = __builtin_floorf(F);
+  // CHECK: call float @llvm.floor.f32
+
+  resd = __builtin_floor(D);
+  // CHECK: call double @llvm.floor.f64
+
+  resld = __builtin_floorl(LD);
+  // CHECK: call x86_fp80 @llvm.floor.f80
+
+  resf = __builtin_truncf(F);
+  // CHECK: call float @llvm.trunc.f32
+
+  resd = __builtin_trunc(D);
+  // CHECK: call double @llvm.trunc.f64
+
+  resld = __builtin_truncl(LD);
+  // CHECK: call x86_fp80 @llvm.trunc.f80
+
+  resf = __builtin_rintf(F);
+  // CHECK: call float @llvm.rint.f32
+
+  resd = __builtin_rint(D);
+  // CHECK: call double @llvm.rint.f64
+
+  resld = __builtin_rintl(LD);
+  // CHECK: call x86_fp80 @llvm.rint.f80
+
+  resf = __builtin_nearbyintf(F);
+  // CHECK: call float @llvm.nearbyint.f32
+
+  resd = __builtin_nearbyint(D);
+  // CHECK: call double @llvm.nearbyint.f64
+
+  resld = __builtin_nearbyintl(LD);
+  // CHECK: call x86_fp80 @llvm.nearbyint.f80
+
+  resf = __builtin_roundf(F);
+  // CHECK: call float @llvm.round.f32
+
+  resd = __builtin_round(D);
+  // CHECK: call double @llvm.round.f64
+
+  resld = __builtin_roundl(LD);
+  // CHECK: call x86_fp80 @llvm.round.f80
+
 }
 
 // __builtin_longjmp isn't supported on all platforms, so only test it on X86.
@@ -318,29 +425,4 @@
   __builtin_os_log_format(buf, "%d %{public}s %{private}.16P", i, data, data);
 }
 
-// Check that the %% which does not consume any argument is correctly handled
-void test_builtin_os_log_percent(void *buf, const char *data) {
-  volatile int len;
-  // CHECK: store i8* [[BUF]], i8** [[BUF_ADDR:%.*]], align 8
-  // CHECK: store i8* [[DATA]], i8** [[DATA_ADDR:%.*]], align 8
-  // CHECK: store volatile i32 12
-  len = __builtin_os_log_format_buffer_size("%s %%", data);
-
-  // CHECK: [[BUF2:%.*]] = load i8*, i8** [[BUF_ADDR]]
-  // CHECK: [[SUMMARY:%.*]] = getelementptr i8, i8* [[BUF2]], i64 0
-  // CHECK: store i8 2, i8* [[SUMMARY]]
-  // CHECK: [[NUM_ARGS:%.*]] = getelementptr i8, i8* [[BUF2]], i64 1
-  // CHECK: store i8 1, i8* [[NUM_ARGS]]
-  //
-  // CHECK: [[ARG1_DESC:%.*]] = getelementptr i8, i8* [[BUF2]], i64 2
-  // CHECK: store i8 32, i8* [[ARG1_DESC]]
-  // CHECK: [[ARG1_SIZE:%.*]] = getelementptr i8, i8* [[BUF2]], i64 3
-  // CHECK: store i8 8, i8* [[ARG1_SIZE]]
-  // CHECK: [[ARG1:%.*]] = getelementptr i8, i8* [[BUF2]], i64 4
-  // CHECK: [[ARG1_PTR:%.*]] = bitcast i8* [[ARG1]] to i8**
-  // CHECK: [[DATA2:%.*]] = load i8*, i8** [[DATA_ADDR]]
-  // CHECK: store i8* [[DATA2]], i8** [[ARG1_PTR]]
-  __builtin_os_log_format(buf, "%s %%", data);
-}
-
 #endif
diff --git a/test/CodeGen/cfi-check-fail.c b/test/CodeGen/cfi-check-fail.c
new file mode 100644
index 0000000..b850193
--- /dev/null
+++ b/test/CodeGen/cfi-check-fail.c
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O0 -fsanitize-cfi-cross-dso \
+// RUN:     -fsanitize=cfi-icall,cfi-nvcall,cfi-vcall,cfi-unrelated-cast,cfi-derived-cast \
+// RUN:     -fsanitize-trap=cfi-icall,cfi-nvcall -fsanitize-recover=cfi-vcall,cfi-unrelated-cast \
+// RUN:     -emit-llvm -o - %s | FileCheck %s
+
+void caller(void (*f)()) {
+  f();
+}
+
+// CHECK: define weak_odr hidden void @__cfi_check_fail(i8*, i8*)
+// CHECK: store i8* %0, i8** %[[ALLOCA0:.*]], align 8
+// CHECK: store i8* %1, i8** %[[ALLOCA1:.*]], align 8
+// CHECK: %[[DATA:.*]] = load i8*, i8** %[[ALLOCA0]], align 8
+// CHECK: %[[ADDR:.*]] = load i8*, i8** %[[ALLOCA1]], align 8
+// CHECK: %[[ICMP_NOT_NULL:.*]] = icmp ne i8* %[[DATA]], null
+// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]],
+
+// CHECK: [[TRAP]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT0]]:
+// CHECK:   %[[A:.*]] = bitcast i8* %[[DATA]] to { i8, { i8*, i32, i32 }, i8* }*
+// CHECK:   %[[KINDPTR:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 0
+// CHECK:   %[[KIND:.*]] = load i8, i8* %[[KINDPTR]], align 4
+// CHECK:   %[[VTVALID0:.*]] = call i1 @llvm.type.test(i8* %[[ADDR]], metadata !"all-vtables")
+// CHECK:   %[[VTVALID:.*]] = zext i1 %[[VTVALID0]] to i64
+// CHECK:   %[[NOT_0:.*]] = icmp ne i8 %[[KIND]], 0
+// CHECK:   br i1 %[[NOT_0]], label %[[CONT1:.*]], label %[[HANDLE0:.*]], !prof
+
+// CHECK: [[HANDLE0]]:
+// CHECK:   %[[DATA0:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR0:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail(i64 %[[DATA0]], i64 %[[ADDR0]], i64 %[[VTVALID]])
+// CHECK:   br label %[[CONT1]]
+
+// CHECK: [[CONT1]]:
+// CHECK:   %[[NOT_1:.*]] = icmp ne i8 %[[KIND]], 1
+// CHECK:   br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !nosanitize
+
+// CHECK: [[HANDLE1]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT2]]:
+// CHECK:   %[[NOT_2:.*]] = icmp ne i8 %[[KIND]], 2
+// CHECK:   br i1 %[[NOT_2]], label %[[CONT3:.*]], label %[[HANDLE2:.*]], !prof
+
+// CHECK: [[HANDLE2]]:
+// CHECK:   %[[DATA2:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR2:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail_abort(i64 %[[DATA2]], i64 %[[ADDR2]], i64 %[[VTVALID]])
+// CHECK:   unreachable
+
+// CHECK: [[CONT3]]:
+// CHECK:   %[[NOT_3:.*]] = icmp ne i8 %[[KIND]], 3
+// CHECK:   br i1 %[[NOT_3]], label %[[CONT4:.*]], label %[[HANDLE3:.*]], !prof
+
+// CHECK: [[HANDLE3]]:
+// CHECK:   %[[DATA3:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR3:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail(i64 %[[DATA3]], i64 %[[ADDR3]], i64 %[[VTVALID]])
+// CHECK:   br label %[[CONT4]]
+
+// CHECK: [[CONT4]]:
+// CHECK:   %[[NOT_4:.*]] = icmp ne i8 %[[KIND]], 4
+// CHECK:   br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !nosanitize
+
+// CHECK: [[HANDLE4]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT5]]:
+// CHECK:   ret void
diff --git a/test/CodeGen/cfi-check-fail2.c b/test/CodeGen/cfi-check-fail2.c
new file mode 100644
index 0000000..5340871
--- /dev/null
+++ b/test/CodeGen/cfi-check-fail2.c
@@ -0,0 +1,70 @@
+// __cfi_check_fail codegen when not all CFI checkers are enabled.
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O0 -fsanitize-cfi-cross-dso \
+// RUN:     -fsanitize=cfi-vcall \
+// RUN:     -emit-llvm -o - %s | FileCheck %s
+
+void caller(void (*f)()) {
+  f();
+}
+
+// CHECK: define weak_odr hidden void @__cfi_check_fail(i8*, i8*)
+// CHECK: store i8* %0, i8** %[[ALLOCA0:.*]], align 8
+// CHECK: store i8* %1, i8** %[[ALLOCA1:.*]], align 8
+// CHECK: %[[DATA:.*]] = load i8*, i8** %[[ALLOCA0]], align 8
+// CHECK: %[[ADDR:.*]] = load i8*, i8** %[[ALLOCA1]], align 8
+// CHECK: %[[ICMP_NOT_NULL:.*]] = icmp ne i8* %[[DATA]], null
+// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]],
+
+// CHECK: [[TRAP]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT0]]:
+// CHECK:   %[[A:.*]] = bitcast i8* %[[DATA]] to { i8, { i8*, i32, i32 }, i8* }*
+// CHECK:   %[[KINDPTR:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 0
+// CHECK:   %[[KIND:.*]] = load i8, i8* %[[KINDPTR]], align 4
+// CHECK:   %[[VTVALID0:.*]] = call i1 @llvm.type.test(i8* %[[ADDR]], metadata !"all-vtables")
+// CHECK:   %[[VTVALID:.*]] = zext i1 %[[VTVALID0]] to i64
+// CHECK:   %[[NOT_0:.*]] = icmp ne i8 %[[KIND]], 0
+// CHECK:   br i1 %[[NOT_0]], label %[[CONT1:.*]], label %[[HANDLE0:.*]], !prof
+
+// CHECK: [[HANDLE0]]:
+// CHECK:   %[[DATA0:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR0:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail_abort(i64 %[[DATA0]], i64 %[[ADDR0]], i64 %[[VTVALID]])
+// CHECK:   unreachable
+
+// CHECK: [[CONT1]]:
+// CHECK:   %[[NOT_1:.*]] = icmp ne i8 %[[KIND]], 1
+// CHECK:   br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !nosanitize
+
+// CHECK: [[HANDLE1]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT2]]:
+// CHECK:   %[[NOT_2:.*]] = icmp ne i8 %[[KIND]], 2
+// CHECK:   br i1 %[[NOT_2]], label %[[CONT3:.*]], label %[[HANDLE2:.*]], !nosanitize
+
+// CHECK: [[HANDLE2]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT3]]:
+// CHECK:   %[[NOT_3:.*]] = icmp ne i8 %[[KIND]], 3
+// CHECK:   br i1 %[[NOT_3]], label %[[CONT4:.*]], label %[[HANDLE3:.*]], !nosanitize
+
+// CHECK: [[HANDLE3]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT4]]:
+// CHECK:   %[[NOT_4:.*]] = icmp ne i8 %[[KIND]], 4
+// CHECK:   br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !nosanitize
+
+// CHECK: [[HANDLE4]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT5]]:
+// CHECK:   ret void
diff --git a/test/CodeGen/cfi-icall-cross-dso.c b/test/CodeGen/cfi-icall-cross-dso.c
index 9337b18..636a9e4 100644
--- a/test/CodeGen/cfi-icall-cross-dso.c
+++ b/test/CodeGen/cfi-icall-cross-dso.c
@@ -1,11 +1,55 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 -fsanitize=cfi-icall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -O1 -fsanitize=cfi-icall  -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK --check-prefix=CHECK-DIAG \
+// RUN:       --check-prefix=ITANIUM --check-prefix=ITANIUM-DIAG \
+// RUN:       %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso -fsanitize-trap=cfi-icall \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK \
+// RUN:       --check-prefix=ITANIUM --check-prefix=ITANIUM-TRAP \
+// RUN:       %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK --check-prefix=CHECK-DIAG \
+// RUN:       --check-prefix=MS --check-prefix=MS-DIAG \
+// RUN:       %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso -fsanitize-trap=cfi-icall \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK \
+// RUN:       --check-prefix=MS --check-prefix=MS-TRAP \
+// RUN:       %s
+
+// CHECK-DIAG: @[[SRC:.*]] = private unnamed_addr constant {{.*}}cfi-icall-cross-dso.c\00
+// CHECK-DIAG: @[[TYPE:.*]] = private unnamed_addr constant { i16, i16, [{{.*}} x i8] } { i16 -1, i16 0, [{{.*}} x i8] c"'void ()'\00"
+// CHECK-DIAG: @[[DATA:.*]] = private unnamed_addr global {{.*}}@[[SRC]]{{.*}}@[[TYPE]]
+
+
+// ITANIUM: call i1 @llvm.type.test(i8* %{{.*}}, metadata !"_ZTSFvE"), !nosanitize
+// ITANIUM-DIAG: call void @__cfi_slowpath_diag(i64 6588678392271548388, i8* %{{.*}}, {{.*}}@[[DATA]]{{.*}}) {{.*}}, !nosanitize
+// ITANIUM-TRAP: call void @__cfi_slowpath(i64 6588678392271548388, i8* %{{.*}}) {{.*}}, !nosanitize
+
+// MS: call i1 @llvm.type.test(i8* %{{.*}}, metadata !"?6AX@Z"), !nosanitize
+// MS-DIAG: call void @__cfi_slowpath_diag(i64 4195979634929632483, i8* %{{.*}}, {{.*}}@[[DATA]]{{.*}}) {{.*}}, !nosanitize
+// MS-TRAP: call void @__cfi_slowpath(i64 4195979634929632483, i8* %{{.*}}) {{.*}}, !nosanitize
 
 void caller(void (*f)()) {
   f();
 }
 
+// Check that we emit both string and hash based type entries for static void g(),
+// and don't emit them for the declaration of h().
+
+// CHECK: define internal void @g({{.*}} !type [[TVOID:![0-9]+]] !type [[TVOID_ID:![0-9]+]]
 static void g(void) {}
+
+// CHECK: declare void @h({{[^!]*$}}
 void h(void);
 
 typedef void (*Fn)(void);
@@ -16,34 +60,22 @@
   return &h;
 }
 
+// CHECK: define void @bar({{.*}} !type [[TNOPROTO:![0-9]+]] !type [[TNOPROTO_ID:![0-9]+]]
+// ITANIUM: define available_externally void @foo({{[^!]*$}}
+// MS: define linkonce_odr void @foo({{.*}} !type [[TNOPROTO]] !type [[TNOPROTO_ID]]
 inline void foo() {}
 void bar() { foo(); }
 
-// ITANIUM: call i1 @llvm.bitset.test(i8* %{{.*}}, metadata !"_ZTSFvE"), !nosanitize
-// ITANIUM: call void @__cfi_slowpath(i64 6588678392271548388, i8* %{{.*}}) {{.*}}, !nosanitize
-
-// MS: call i1 @llvm.bitset.test(i8* %{{.*}}, metadata !"?6AX@Z"), !nosanitize
-// MS: call void @__cfi_slowpath(i64 4195979634929632483, i8* %{{.*}}) {{.*}}, !nosanitize
-
-// ITANIUM: define available_externally void @foo()
-// MS: define linkonce_odr void @foo()
-
-// Check that we emit both string and hash based bit set entries for static void g(),
-// and don't emit them for the declaration of h().
-
-// CHECK-NOT: !{!"{{.*}}", void ()* @h, i64 0}
-// CHECK: !{!"{{.*}}", void ()* @g, i64 0}
-// CHECK-NOT: !{!"{{.*}}", void ()* @h, i64 0}
-// CHECK: !{i64 {{.*}}, void ()* @g, i64 0}
-// CHECK-NOT: !{!"{{.*}}", void ()* @h, i64 0}
-
-// ITANIUM-NOT: !{!{{.*}}, void ()* @foo,
-// ITANIUM: !{!"_ZTSFvE", void ()* @bar, i64 0}
-// ITANIUM-NOT: !{!{{.*}}, void ()* @foo,
-// ITANIUM: !{i64 6588678392271548388, void ()* @bar, i64 0}
-// ITANIUM-NOT: !{!{{.*}}, void ()* @foo,
-
-// MS: !{!"?6AX@Z", void ()* @foo, i64 0}
-// MS: !{i64 4195979634929632483, void ()* @foo, i64 0}
-
 // CHECK: !{i32 4, !"Cross-DSO CFI", i32 1}
+
+// Check that the type entries are correct.
+
+// ITANIUM: [[TVOID]] = !{i64 0, !"_ZTSFvvE"}
+// ITANIUM: [[TVOID_ID]] = !{i64 0, i64 9080559750644022485}
+// ITANIUM: [[TNOPROTO]] = !{i64 0, !"_ZTSFvE"}
+// ITANIUM: [[TNOPROTO_ID]] = !{i64 0, i64 6588678392271548388}
+
+// MS: [[TVOID]] = !{i64 0, !"?6AXXZ"}
+// MS: [[TVOID_ID]] = !{i64 0, i64 5113650790573562461}
+// MS: [[TNOPROTO]] = !{i64 0, !"?6AX@Z"}
+// MS: [[TNOPROTO_ID]] = !{i64 0, i64 4195979634929632483}
diff --git a/test/CodeGen/cfi-icall.c b/test/CodeGen/cfi-icall.c
index d6cebef..ed34f4f 100644
--- a/test/CodeGen/cfi-icall.c
+++ b/test/CodeGen/cfi-icall.c
@@ -1,20 +1,24 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=ITANIUM %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=MS %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
 
 // Tests that we assign appropriate identifiers to unprototyped functions.
 
+// CHECK: define void @f({{.*}} !type [[TVOID:![0-9]+]]
 void f() {
 }
 
 void xf();
 
+// CHECK: define void @g({{.*}} !type [[TINT:![0-9]+]]
 void g(int b) {
   void (*fp)() = b ? f : xf;
-  // ITANIUM: call i1 @llvm.bitset.test(i8* {{.*}}, metadata !"_ZTSFvE")
+  // ITANIUM: call i1 @llvm.type.test(i8* {{.*}}, metadata !"_ZTSFvE")
   fp();
 }
 
-// ITANIUM-DAG: !{!"_ZTSFvE", void ()* @f, i64 0}
-// ITANIUM-DAG: !{!"_ZTSFvE", void (...)* @xf, i64 0}
-// MS-DAG: !{!"?6AX@Z", void ()* @f, i64 0}
-// MS-DAG: !{!"?6AX@Z", void (...)* @xf, i64 0}
+// CHECK: declare !type [[TVOID:![0-9]+]] void @xf({{.*}}
+
+// ITANIUM-DAG: [[TVOID]] = !{i64 0, !"_ZTSFvE"}
+// ITANIUM-DAG: [[TINT]] = !{i64 0, !"_ZTSFviE"}
+// MS-DAG: [[TVOID]] = !{i64 0, !"?6AX@Z"}
+// MS-DAG: [[TINT]] = !{i64 0, !"?6AXH@Z"}
diff --git a/test/CodeGen/cfstring-windows.c b/test/CodeGen/cfstring-windows.c
new file mode 100644
index 0000000..e54c860
--- /dev/null
+++ b/test/CodeGen/cfstring-windows.c
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DDLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
+
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DDLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
+
+#if defined(CF_BUILDING_CF)
+#if defined(DECL)
+extern __declspec(dllexport) long __CFConstantStringClassReference[];
+#elif defined(DEFN)
+__declspec(dllexport) long __CFConstantStringClassReference[32];
+#endif
+#else
+#if defined(EXTERN)
+extern long __CFConstantStringClassReference[];
+#elif defined(EXTERN_DLLIMPORT)
+extern __declspec(dllimport) long __CFConstantStringClassReference[];
+#elif defined(DLLIMPORT)
+__declspec(dllimport) long __CFConstantStringClassReference[];
+#endif
+#endif
+
+typedef struct __CFString *CFStringRef;
+const CFStringRef string = (CFStringRef)__builtin___CFStringMakeConstantString("string");
+
+// CHECK-CF-IN-CF-DECL: @__CFConstantStringClassReference = external dllexport global [0 x i32]
+// CHECK-CF-IN-CF-DEFN: @__CFConstantStringClassReference = common dllexport global [32 x i32]
+// CHECK-CF: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+// CHECK-CF-EXTERN: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+// CHECK-CF-EXTERN-DLLIMPORT: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+// CHECK-CF-DLLIMPORT: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+
diff --git a/test/CodeGen/cfstring.c b/test/CodeGen/cfstring.c
index 97d39b6..f0862b9 100644
--- a/test/CodeGen/cfstring.c
+++ b/test/CodeGen/cfstring.c
@@ -1,9 +1,11 @@
-// RUN: %clang_cc1 -emit-llvm %s -o %t
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm %s -o %t
 
 // <rdar://problem/10657500>: Check that the backing store of CFStrings are
 // constant with the -fwritable-strings flag.
 //
-// RUN: %clang_cc1 -fwritable-strings -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-macho -fwritable-strings -emit-llvm %s -o - | FileCheck %s
 //
 // CHECK: @.str = private unnamed_addr constant [14 x i8] c"Hello, World!\00", section "__TEXT,__cstring,cstring_literals", align 1
 // CHECK: @.str.1 = private unnamed_addr constant [7 x i8] c"yo joe\00", section "__TEXT,__cstring,cstring_literals", align 1
diff --git a/test/CodeGen/cleanup-destslot-simple.c b/test/CodeGen/cleanup-destslot-simple.c
index a1c5640..9b9f74e 100644
--- a/test/CodeGen/cleanup-destslot-simple.c
+++ b/test/CodeGen/cleanup-destslot-simple.c
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=LIFETIME
 
 // We shouldn't have markers at -O0 or with msan.
-// RUN: %clang_cc1 -O0 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - -fsanitize=memory | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -O0 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - -fsanitize=memory | FileCheck %s
 
 // There is no exception to handle here, lifetime.end is not a destructor,
 // so there is no need have cleanup dest slot related code
diff --git a/test/CodeGen/const-init.c b/test/CodeGen/const-init.c
index 9434f1d..3fd231b 100644
--- a/test/CodeGen/const-init.c
+++ b/test/CodeGen/const-init.c
@@ -84,7 +84,7 @@
    { (long) &g12_tmp }
 };
 
-// CHECK: @g14 = global i8* inttoptr (i64 100 to i8*)
+// CHECK: @g14 = global i8* inttoptr (i32 100 to i8*)
 void *g14 = (void*) 100;
 
 // CHECK: @g15 = global i32 -1
diff --git a/test/CodeGen/debug-info-imported-entity.cpp b/test/CodeGen/debug-info-imported-entity.cpp
new file mode 100644
index 0000000..105cc3d
--- /dev/null
+++ b/test/CodeGen/debug-info-imported-entity.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple x86_64-unk-unk -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s
+
+namespace std { class A; }
+using std::A; using ::A;
+
+
+// CHECK: [[CompileUnit:![0-9]+]] = distinct !DICompileUnit({{.+}} imports: [[Imports:![0-9]+]])
+// CHECK: [[Imports]] = !{[[ImportedEntity:![0-9]+]]}
+// CHECK: [[ImportedEntity]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[CompileUnit]], entity: [[STDA:![0-9]+]], line: 4)
+// CHECK: [[STDA]] = !DICompositeType(tag: DW_TAG_class_type, name: "A",
+
diff --git a/test/CodeGen/debug-info-packed-struct.c b/test/CodeGen/debug-info-packed-struct.c
index 189bbe4..8c1a0d4 100644
--- a/test/CodeGen/debug-info-packed-struct.c
+++ b/test/CodeGen/debug-info-packed-struct.c
@@ -21,7 +21,7 @@
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l0_ofs8",
 // CHECK-SAME:     {{.*}}size: 64, align: 64, offset: 64)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l0_ofs16",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 128)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 128, flags: DIFlagBitField, extraData: i64 128)
 
 
 // ---------------------------------------------------------------------
@@ -40,7 +40,7 @@
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l1_ofs1",
 // CHECK-SAME:     {{.*}}size: 64, align: 8, offset: 8)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l1_ofs9",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72, flags: DIFlagBitField, extraData: i64 72)
 
 
 // ---------------------------------------------------------------------
@@ -61,7 +61,7 @@
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l2_ofs1",
 // CHECK-SAME:     {{.*}}size: 64, align: 8, offset: 8)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l2_ofs9",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72, flags: DIFlagBitField, extraData: i64 72)
 
 
 
@@ -83,7 +83,7 @@
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l3_ofs4",
 // CHECK-SAME:     {{.*}}size: 64, align: 32, offset: 32)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l3_ofs12",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 96)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 96, flags: DIFlagBitField, extraData: i64 96)
 
 struct layout0 l0;
 struct layout1 l1;
diff --git a/test/CodeGen/debug-info-renderscript-tag.rs b/test/CodeGen/debug-info-renderscript-tag.rs
new file mode 100644
index 0000000..ded650d
--- /dev/null
+++ b/test/CodeGen/debug-info-renderscript-tag.rs
@@ -0,0 +1,3 @@
+// RUN: %clang -emit-llvm -S -g %s -o - | FileCheck %s
+
+// CHECK: !DICompileUnit(language: DW_LANG_GOOGLE_RenderScript{{.*}})
diff --git a/test/CodeGen/dependent-lib.c b/test/CodeGen/dependent-lib.c
index b3abc2f..9cf49c8 100644
--- a/test/CodeGen/dependent-lib.c
+++ b/test/CodeGen/dependent-lib.c
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple thumbv7-windows -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple i686-pc-win32 -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple x86_64-pc-win32 -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple i686-pc-linux -emit-llvm -o - | FileCheck -check-prefix LINUX %s
diff --git a/test/CodeGen/dllimport.c b/test/CodeGen/dllimport.c
index 0dfecea..f70048e 100644
--- a/test/CodeGen/dllimport.c
+++ b/test/CodeGen/dllimport.c
@@ -45,7 +45,8 @@
 USEVAR(GlobalRedecl3)
 
 // Make sure this works even if the decl has been used before it's defined (PR20792).
-// CHECK: @GlobalRedecl4 = common global i32
+// MS: @GlobalRedecl4 = common dllexport global i32
+// GNU: @GlobalRedecl4 = common global i32
 __declspec(dllimport) extern int GlobalRedecl4;
 USEVAR(GlobalRedecl4)
                       int GlobalRedecl4; // dllimport ignored
@@ -111,13 +112,15 @@
                       void redecl2(void);
 USE(redecl2)
 
-// CHECK-DAG: define void @redecl3()
+// MS: define dllexport void @redecl3()
+// GNU: define void @redecl3()
 __declspec(dllimport) void redecl3(void);
                       void redecl3(void) {} // dllimport ignored
 USE(redecl3)
 
 // Make sure this works even if the decl is used before it's defined (PR20792).
-// CHECK-DAG: define void @redecl4()
+// MS: define dllexport void @redecl4()
+// GNU: define void @redecl4()
 __declspec(dllimport) void redecl4(void);
 USE(redecl4)
                       void redecl4(void) {} // dllimport ignored
diff --git a/test/CodeGen/enable_if.c b/test/CodeGen/enable_if.c
index f863d80..5e9f904 100644
--- a/test/CodeGen/enable_if.c
+++ b/test/CodeGen/enable_if.c
@@ -80,3 +80,16 @@
   // CHECK: store void (i32)* @_Z3quxUa9enable_ifIXLi1EEXL_Z9TRUEFACTSEEEi
   p = &qux;
 }
+
+// There was a bug where, when enable_if was present, overload resolution
+// wouldn't pay attention to lower-priority attributes.
+// (N.B. `foo` with pass_object_size should always be preferred)
+// CHECK-LABEL: define void @test5
+void test5() {
+  int foo(char *i) __attribute__((enable_if(1, ""), overloadable));
+  int foo(char *i __attribute__((pass_object_size(0))))
+      __attribute__((enable_if(1, ""), overloadable));
+
+  // CHECK: call i32 @_Z3fooUa9enable_ifIXLi1EEEPcU17pass_object_size0
+  foo((void*)0);
+}
diff --git a/test/CodeGen/exceptions-seh-finally.c b/test/CodeGen/exceptions-seh-finally.c
index f0ed223..0f2123b 100644
--- a/test/CodeGen/exceptions-seh-finally.c
+++ b/test/CodeGen/exceptions-seh-finally.c
@@ -29,6 +29,7 @@
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK: define internal void @"\01?fin$0@0@basic_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs:#[0-9]+]]
 // CHECK: call void @cleanup()
 
 // Mostly check that we don't double emit 'r' which would crash.
@@ -62,6 +63,7 @@
 // CHECK: ret void
 
 // CHECK: define internal void @"\01?fin$0@0@label_in_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: br label %[[l:[^ ]*]]
 //
 // CHECK: [[l]]
@@ -95,6 +97,7 @@
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK: define internal void @"\01?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} %[[abnormal:abnormal_termination]], i8* %frame_pointer)
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: %[[abnormal_zext:[^ ]*]] = zext i8 %[[abnormal]] to i32
 // CHECK: store i32 %[[abnormal_zext]], i32* @crashed
 // CHECK-NEXT: ret void
@@ -112,6 +115,7 @@
 // CHECK: ret void
 
 // CHECK: define internal void @"\01?fin$0@0@noreturn_noop_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: call void @abort()
 // CHECK: unreachable
 
@@ -137,6 +141,7 @@
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK: define internal void @"\01?fin$0@0@noreturn_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: call void @abort()
 // CHECK: unreachable
 
@@ -151,6 +156,7 @@
 // CHECK-NEXT: ret i32 42
 
 // CHECK: define internal void @"\01?fin$0@0@finally_with_return@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK-NOT: br i1
 // CHECK-NOT: br label
 // CHECK: ret void
@@ -181,9 +187,11 @@
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: ret void
 
 // CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: unreachable
 
 // FIXME: Our behavior seems suspiciously different.
@@ -226,7 +234,41 @@
 // CHECK-NEXT: cleanupret from %[[outerpad]] unwind to caller
 
 // CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: ret void
 
 // CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: unreachable
+
+void finally_within_finally() {
+  __try {
+    might_crash();
+  } __finally {
+    __try {
+      might_crash();
+    } __finally {
+    }
+  }
+}
+
+// CHECK-LABEL: define void @finally_within_finally(
+// CHECK: invoke void @might_crash(
+
+// CHECK: call void @"\01?fin$0@0@finally_within_finally@@"(
+// CHECK: call void @"\01?fin$0@0@finally_within_finally@@"({{.*}}) [ "funclet"(
+
+// CHECK-LABEL: define internal void @"\01?fin$0@0@finally_within_finally@@"({{[^)]*}})
+// CHECK-SAME: [[finally_attrs]]
+// CHECK: invoke void @might_crash(
+
+// CHECK: call void @"\01?fin$1@0@finally_within_finally@@"(
+// CHECK: call void @"\01?fin$1@0@finally_within_finally@@"({{.*}}) [ "funclet"(
+
+// CHECK-LABEL: define internal void @"\01?fin$1@0@finally_within_finally@@"({{[^)]*}})
+// CHECK-SAME: [[finally_attrs]]
+
+// Look for the absence of noinline. Enum attributes come first, so check that
+// a string attribute is the first to verify that no enum attributes are
+// present.
+// CHECK: attributes [[finally_attrs]] = { "{{.*}}" }
diff --git a/test/CodeGen/exceptions-seh-leave.c b/test/CodeGen/exceptions-seh-leave.c
index a0b1956..087fadb 100644
--- a/test/CodeGen/exceptions-seh-leave.c
+++ b/test/CodeGen/exceptions-seh-leave.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -fnew-ms-eh -emit-llvm -o - | opt -instnamer -S | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -emit-llvm -o - | opt -instnamer -S | FileCheck %s
 
 void g(void);
 
diff --git a/test/CodeGen/exceptions-seh.c b/test/CodeGen/exceptions-seh.c
index b027bd8..a0a1dbc 100644
--- a/test/CodeGen/exceptions-seh.c
+++ b/test/CodeGen/exceptions-seh.c
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=CHECK --check-prefix=X86
-// RUN: %clang_cc1 %s -triple i686-pc-windows-gnu -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple i686-pc-windows-gnu -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=X86-GNU
-// RUN: %clang_cc1 %s -triple x86_64-pc-windows-gnu -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple x86_64-pc-windows-gnu -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=X64-GNU
 
 void try_body(int numerator, int denominator, int *myres) {
diff --git a/test/CodeGen/f16c-builtins.c b/test/CodeGen/f16c-builtins.c
index f9cfa0d..15c3bde 100644
--- a/test/CodeGen/f16c-builtins.c
+++ b/test/CodeGen/f16c-builtins.c
@@ -1,30 +1,56 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +f16c -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +f16c -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+float test_cvtsh_ss(unsigned short a) {
+  // CHECK-LABEL: test_cvtsh_ss
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
+  // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}})
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  return _cvtsh_ss(a);
+}
+
+unsigned short test_cvtss_sh(float a) {
+  // CHECK-LABEL: test_cvtss_sh
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
+  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
+  // CHECK: extractelement <8 x i16> %{{.*}}, i32 0
+  return _cvtss_sh(a, 0);
+}
+
 __m128 test_mm_cvtph_ps(__m128i a) {
   // CHECK-LABEL: test_mm_cvtph_ps
-  // CHECK: @llvm.x86.vcvtph2ps.128
+  // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}})
   return _mm_cvtph_ps(a);
 }
 
 __m256 test_mm256_cvtph_ps(__m128i a) {
   // CHECK-LABEL: test_mm256_cvtph_ps
-  // CHECK: @llvm.x86.vcvtph2ps.256
+  // CHECK: call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %{{.*}})
   return _mm256_cvtph_ps(a);
 }
 
 __m128i test_mm_cvtps_ph(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_ph
-  // CHECK: @llvm.x86.vcvtps2ph.128
+  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
   return _mm_cvtps_ph(a, 0);
 }
 
 __m128i test_mm256_cvtps_ph(__m256 a) {
   // CHECK-LABEL: test_mm256_cvtps_ph
-  // CHECK: @llvm.x86.vcvtps2ph.256
+  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %{{.*}}, i32 0)
   return _mm256_cvtps_ph(a, 0);
 }
diff --git a/test/CodeGen/fixup-depth-overflow.c b/test/CodeGen/fixup-depth-overflow.c
new file mode 100644
index 0000000..be8f542
--- /dev/null
+++ b/test/CodeGen/fixup-depth-overflow.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -O1 -disable-llvm-optzns -emit-llvm -o - %s | FileCheck %s
+
+#define M if (x) goto L1;
+#define M10 M M M M M M M M M M
+#define M100 M10 M10 M10 M10 M10 M10 M10 M10 M10 M10
+#define M1000 M100 M100 M100 M100 M100 M100 M100 M100 M100 M100
+
+void f(int x) {
+  int h;
+
+  // Many gotos to not-yet-emitted labels would cause EHScope's FixupDepth
+  // to overflow (PR23490).
+  M1000 M1000 M1000
+
+  if (x == 5) {
+    // This will cause us to emit a clean-up of the stack variable. If the
+    // FixupDepths are broken, fixups will erroneously get threaded through it.
+    int i;
+  }
+
+L1:
+  return;
+}
+
+// CHECK-LABEL: define void @f
+// CHECK-NOT: cleanup
diff --git a/test/CodeGen/fma4-builtins.c b/test/CodeGen/fma4-builtins.c
index 69cbcd8..3edd18d 100644
--- a/test/CodeGen/fma4-builtins.c
+++ b/test/CodeGen/fma4-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +fma4 -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +fma4 -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
diff --git a/test/CodeGen/fp128_complex.c b/test/CodeGen/fp128_complex.c
index 8775999..48659d2 100644
--- a/test/CodeGen/fp128_complex.c
+++ b/test/CodeGen/fp128_complex.c
@@ -1,9 +1,9 @@
-// RUN: %clang -target aarch64-linux-gnuabi %s -O3 -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -target aarch64-linux-gnuabi %s -S -emit-llvm -o - | FileCheck %s
 
 _Complex long double a, b, c, d;
 void test_fp128_compound_assign(void) {
-  // CHECK: tail call { fp128, fp128 } @__multc3
+  // CHECK: call { fp128, fp128 } @__multc3
   a *= b;
-  // CHECK: tail call { fp128, fp128 } @__divtc3
+  // CHECK: call { fp128, fp128 } @__divtc3
   c /= d;
 }
diff --git a/test/CodeGen/fp16-ops.c b/test/CodeGen/fp16-ops.c
index 7cd08a0..c96727f 100644
--- a/test/CodeGen/fp16-ops.c
+++ b/test/CodeGen/fp16-ops.c
@@ -7,6 +7,8 @@
 // RUN:   | FileCheck %s --check-prefix=NATIVE-HALF
 // RUN: %clang_cc1 -emit-llvm -o - -triple aarch64-none-linux-gnueabi -fnative-half-type %s \
 // RUN:   | FileCheck %s --check-prefix=NATIVE-HALF
+// RUN: %clang_cc1 -emit-llvm -o - -x renderscript %s \
+// RUN:   | FileCheck %s --check-prefix=NATIVE-HALF
 typedef unsigned cond_t;
 
 volatile cond_t test;
diff --git a/test/CodeGen/function-target-features.c b/test/CodeGen/function-target-features.c
index 351c7f1..6b32d3d 100644
--- a/test/CodeGen/function-target-features.c
+++ b/test/CodeGen/function-target-features.c
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 -target-feature +avx | FileCheck %s -check-prefix=CORE-CPU-AND-FEATURES
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu x86-64 | FileCheck %s -check-prefix=X86-64-CPU
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7-avx -target-feature -avx | FileCheck %s -check-prefix=AVX-MINUS-FEATURE
-// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=NO-SOFT-FLOAT
+// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=SOFT-FLOAT
 // RUN: %clang_cc1 -triple arm-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=SOFT-FLOAT
 // RUN: %clang_cc1 -triple mips-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=SOFT-FLOAT
 
diff --git a/test/CodeGen/hexagon-inline-asm.c b/test/CodeGen/hexagon-inline-asm.c
new file mode 100644
index 0000000..cda3d0d
--- /dev/null
+++ b/test/CodeGen/hexagon-inline-asm.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-feature +hvx -emit-llvm -o - %s | FileCheck %s
+
+typedef int v64 __attribute__((__vector_size__(64)))
+    __attribute__((aligned(64)));
+
+int g;
+
+void foo(v64 v0, v64 v1, v64 *p) {
+  int r;
+  v64 q0;
+  asm ("%0 = vgtw(%1.w,%2.w)" : "=q"(q0) : "v"(v0), "v"(v1));
+// CHECK: call <16 x i32> asm "$0 = vgtw($1.w,$2.w)", "=q,v,v"(<16 x i32>{{.*}}, <16 x i32>{{.*}})
+  *p = q0;
+
+  asm ("%0 = memw(##%1)" : "=r"(r) : "s"(&g));
+// CHECK: call i32 asm "$0 = memw(##$1)", "=r,s"(i32* @g)
+}
diff --git a/test/CodeGen/iamcu-abi.c b/test/CodeGen/iamcu-abi.c
new file mode 100644
index 0000000..897d475
--- /dev/null
+++ b/test/CodeGen/iamcu-abi.c
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple i386-pc-elfiamcu -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: target datalayout = "e-m:e-p:32:32-i64:32-f64:32-f128:32-n8:16:32-a:0:32-S32"
+// CHECK: target triple = "i386-pc-elfiamcu"
+
+
+void food(double *d);
+void fooll(long long *ll);
+void fooull(unsigned long long *ull);
+void foold(long double *ld);
+
+// CHECK-LABEL: define void @testdouble()
+// CHECK: alloca double, align 4
+void testdouble() {
+  double d = 2.0;
+  food(&d);
+}
+
+// CHECK-LABEL: define void @testlonglong()
+// CHECK: alloca i64, align 4
+void testlonglong() {
+  long long ll = 2;
+  fooll(&ll);
+}
+
+// CHECK-LABEL: define void @testunsignedlonglong()
+// CHECK: alloca i64, align 4
+void testunsignedlonglong() {
+  unsigned long long ull = 2;
+  fooull(&ull);	
+}
+
+// CHECK-LABEL: define void @testlongdouble()
+// CHECK: alloca double, align 4
+void testlongdouble() {
+  long double ld = 2.0;
+  foold(&ld);
+}
diff --git a/test/CodeGen/ifunc.c b/test/CodeGen/ifunc.c
new file mode 100644
index 0000000..a88bb18
--- /dev/null
+++ b/test/CodeGen/ifunc.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -O2 -emit-llvm -o - %s | FileCheck %s
+
+int foo(int) __attribute__ ((ifunc("foo_ifunc")));
+
+static int f1(int i) {
+  return i + 1;
+}
+
+static int f2(int i) {
+  return i + 2;
+}
+
+typedef int (*foo_t)(int);
+
+int global;
+
+static foo_t foo_ifunc() {
+  return global ? f1 : f2;
+}
+
+int bar() {
+  return foo(1);
+}
+
+extern void goo(void);
+
+void bar2(void) {
+  goo();
+}
+
+extern void goo(void) __attribute__ ((ifunc("goo_ifunc")));
+
+void* goo_ifunc(void) {
+  return 0;
+}
+// CHECK: @foo = ifunc i32 (i32), bitcast (i32 (i32)* ()* @foo_ifunc to i32 (i32)*)
+// CHECK: @goo = ifunc void (), bitcast (i8* ()* @goo_ifunc to void ()*)
+
+// CHECK: call i32 @foo(i32
+// CHECK: call void @goo()
diff --git a/test/CodeGen/inline-asm-immediate-ubsan.c b/test/CodeGen/inline-asm-immediate-ubsan.c
index 77d5e4f..2b14e92 100644
--- a/test/CodeGen/inline-asm-immediate-ubsan.c
+++ b/test/CodeGen/inline-asm-immediate-ubsan.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
 // RUN:     -fsanitize=signed-integer-overflow \
-// RUN:   | FileCheck %s --check-prefix=CHECK
+// RUN:   | FileCheck %s
 
 // Verify we emit constants for "immediate" inline assembly arguments.
 // Emitting a scalar expression can make the immediate be generated as
diff --git a/test/CodeGen/inline-asm-mixed-style.c b/test/CodeGen/inline-asm-mixed-style.c
new file mode 100644
index 0000000..6b830d9
--- /dev/null
+++ b/test/CodeGen/inline-asm-mixed-style.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple i386-unknown-unknown -fasm-blocks -fsyntax-only -verify %s -DCHECK_ASM_GOTO
+// RUN: %clang_cc1 -triple i386-unknown-unknown -fasm-blocks -O0 -emit-llvm -S %s -o - | FileCheck %s
+// REQUIRES: x86-registered-target
+
+void f() {
+  __asm mov eax, ebx
+  __asm mov ebx, ecx
+  __asm__("movl %ecx, %edx");
+  // CHECK: movl    %ebx, %eax
+  // CHECK: movl    %ecx, %ebx
+  // CHECK: movl    %ecx, %edx
+
+  __asm mov eax, ebx
+  __asm volatile ("movl %ecx, %edx");
+  // CHECK: movl    %ebx, %eax
+  // CHECK: movl    %ecx, %edx
+
+  __asm mov eax, ebx
+  __asm const ("movl %ecx, %edx"); // expected-warning {{ignored const qualifier on asm}} 
+  // CHECK: movl    %ebx, %eax
+  // CHECK: movl    %ecx, %edx
+
+#ifdef CHECK_ASM_GOTO
+  __asm volatile goto ("movl %ecx, %edx"); // expected-error {{'asm goto' constructs are not supported yet}}
+
+  __asm mov eax, ebx
+  __asm goto ("movl %ecx, %edx"); // expected-error {{'asm goto' constructs are not supported yet}}
+#endif
+}
diff --git a/test/CodeGen/inline-optim.c b/test/CodeGen/inline-optim.c
new file mode 100644
index 0000000..f8b355a
--- /dev/null
+++ b/test/CodeGen/inline-optim.c
@@ -0,0 +1,31 @@
+// Make sure -finline-functions family flags are behaving correctly.
+
+// RUN: %clang_cc1 -triple i686-pc-win32 -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -O3 -fno-inline-functions -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -finline-hint-functions -emit-llvm %s -o - | FileCheck -check-prefix=HINT %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -finline-functions -emit-llvm %s -o - | FileCheck -check-prefix=INLINE %s
+
+inline int inline_hint(int a, int b) { return(a+b); }
+
+int inline_no_hint(int a, int b) { return (a/b); }
+
+inline __attribute__ ((__always_inline__)) int inline_always(int a, int b) { return(a*b); }
+
+volatile int *pa = (int*) 0x1000;
+void foo() {
+// NOINLINE-LABEL: @foo
+// HINT-LABEL: @foo
+// INLINE-LABEL: @foo
+// NOINLINE: call i32 @inline_hint
+// HINT-NOT: call i32 @inline_hint
+// INLINE-NOT: call i32 @inline_hint
+    pa[0] = inline_hint(pa[1],pa[2]);
+// NOINLINE-NOT: call i32 @inline_always
+// HINT-NOT: call i32 @inline_always
+// INLINE-NOT: call i32 @inline_always
+    pa[3] = inline_always(pa[4],pa[5]);
+// NOINLINE: call i32 @inline_no_hint
+// HINT: call i32 @inline_no_hint
+// INLINE-NOT: call i32 @inline_no_hint
+    pa[6] = inline_no_hint(pa[7], pa[8]);
+}
diff --git a/test/CodeGen/lanai-arguments.c b/test/CodeGen/lanai-arguments.c
new file mode 100644
index 0000000..9ce4ed9
--- /dev/null
+++ b/test/CodeGen/lanai-arguments.c
@@ -0,0 +1,75 @@
+// RUN: %clang_cc1 -triple lanai-unknown-unknown %s -emit-llvm -o - \
+// RUN:   | FileCheck %s
+
+// Basic argument/attribute tests for Lanai.
+
+// CHECK: define void @f0(i32 inreg %i, i32 inreg %j, i64 inreg %k)
+void f0(int i, long j, long long k) {}
+
+typedef struct {
+  int aa;
+  int bb;
+} s1;
+// CHECK: define void @f1(i32 inreg %i.coerce0, i32 inreg %i.coerce1)
+void f1(s1 i) {}
+
+typedef struct {
+  int cc;
+} s2;
+// CHECK: define void @f2(%struct.s2* noalias sret %agg.result)
+s2 f2() {
+  s2 foo;
+  return foo;
+}
+
+typedef struct {
+  int cc;
+  int dd;
+} s3;
+// CHECK: define void @f3(%struct.s3* noalias sret %agg.result)
+s3 f3() {
+  s3 foo;
+  return foo;
+}
+
+// CHECK: define void @f4(i64 inreg %i)
+void f4(long long i) {}
+
+// CHECK: define void @f5(i8 inreg %a, i16 inreg %b)
+void f5(char a, short b) {}
+
+// CHECK: define void @f6(i8 inreg %a, i16 inreg %b)
+void f6(unsigned char a, unsigned short b) {}
+
+enum my_enum {
+  ENUM1,
+  ENUM2,
+  ENUM3,
+};
+// Enums should be treated as the underlying i32.
+// CHECK: define void @f7(i32 inreg %a)
+void f7(enum my_enum a) {}
+
+enum my_big_enum {
+  ENUM4 = 0xFFFFFFFFFFFFFFFF,
+};
+// Big enums should be treated as the underlying i64.
+// CHECK: define void @f8(i64 inreg %a)
+void f8(enum my_big_enum a) {}
+
+union simple_union {
+  int a;
+  char b;
+};
+// Unions should be passed inreg.
+// CHECK: define void @f9(i32 inreg %s.coerce)
+void f9(union simple_union s) {}
+
+typedef struct {
+  int b4 : 4;
+  int b3 : 3;
+  int b8 : 8;
+} bitfield1;
+// Bitfields should be passed inreg.
+// CHECK: define void @f10(i32 inreg %bf1.coerce)
+void f10(bitfield1 bf1) {}
diff --git a/test/CodeGen/lanai-regparm.c b/test/CodeGen/lanai-regparm.c
new file mode 100644
index 0000000..c315f43
--- /dev/null
+++ b/test/CodeGen/lanai-regparm.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple lanai-unknown-unknown -mregparm 4 %s -emit-llvm -o - | FileCheck %s
+
+void f1(int a, int b, int c, int d,
+        int e, int f, int g, int h);
+
+void f2(int a, int b) __attribute((regparm(0)));
+
+void f0() {
+// CHECK: call void @f1(i32 inreg 1, i32 inreg 2, i32 inreg 3, i32 inreg 4,
+// CHECK: i32 5, i32 6, i32 7, i32 8)
+  f1(1, 2, 3, 4, 5, 6, 7, 8);
+// CHECK: call void @f2(i32 1, i32 2)
+  f2(1, 2);
+}
+
+// CHECK: declare void @f1(i32 inreg, i32 inreg, i32 inreg, i32 inreg,
+// CHECK: i32, i32, i32, i32)
+// CHECK: declare void @f2(i32, i32)
diff --git a/test/CodeGen/le32-vaarg.c b/test/CodeGen/le32-vaarg.c
index 51bbb02..c02af27 100644
--- a/test/CodeGen/le32-vaarg.c
+++ b/test/CodeGen/le32-vaarg.c
@@ -6,7 +6,9 @@
 }
 // CHECK: define i32 @get_int
 // CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, i32{{$}}
-// CHECK: ret i32 [[RESULT]]
+// CHECK: store i32 [[RESULT]], i32* [[LOC:%[a-z_0-9]+]]
+// CHECK: [[RESULT2:%[a-z_0-9]+]] = load i32, i32* [[LOC]]
+// CHECK: ret i32 [[RESULT2]]
 
 struct Foo {
   int x;
@@ -19,7 +21,9 @@
 }
 // CHECK: define void @get_struct
 // CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo{{$}}
-// CHECK: store %struct.Foo [[RESULT]], %struct.Foo* @dest
+// CHECK: store %struct.Foo [[RESULT]], %struct.Foo* [[LOC:%[a-z_0-9]+]]
+// CHECK: [[LOC2:%[a-z_0-9]+]] = bitcast {{.*}} [[LOC]] to i8*
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[LOC2]]
 
 void skip_struct(va_list *args) {
   va_arg(*args, struct Foo);
diff --git a/test/CodeGen/lifetime-asan.c b/test/CodeGen/lifetime-asan.c
new file mode 100644
index 0000000..5f0c66d
--- /dev/null
+++ b/test/CodeGen/lifetime-asan.c
@@ -0,0 +1,21 @@
+// RUN: %clang -target x86_64-linux-gnu -S -emit-llvm -o - -O0 %s | FileCheck %s -check-prefix=CHECK-O0
+// RUN: %clang -target x86_64-linux-gnu -S -emit-llvm -o - -O0 \
+// RUN:     -fsanitize=address -fsanitize-address-use-after-scope %s | \
+// RUN:     FileCheck %s -check-prefix=CHECK-ASAN-USE-AFTER-SCOPE
+
+extern int bar(char *A, int n);
+
+// CHECK-O0-NOT: @llvm.lifetime.start
+int foo(int n) {
+  if (n) {
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.start(i64 10, i8* {{.*}})
+    char A[10];
+    return bar(A, 1);
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.end(i64 10, i8* {{.*}})
+  } else {
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.start(i64 20, i8* {{.*}})
+    char A[20];
+    return bar(A, 2);
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.end(i64 20, i8* {{.*}})
+  }
+}
diff --git a/test/CodeGen/malign-double.cpp b/test/CodeGen/malign-double.cpp
new file mode 100644
index 0000000..0cda4dc
--- /dev/null
+++ b/test/CodeGen/malign-double.cpp
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -malign-double -triple i386-unknown-linux -emit-llvm %s -o - \
+// RUN:   | FileCheck --check-prefix=CHECK-ON --check-prefix=CHECK %s
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -emit-llvm %s -o - \
+// RUN:   | FileCheck --check-prefix=CHECK-OFF --check-prefix=CHECK %s
+
+/* Structs S1, S2, S3, S4, and union S5 are taken from Intel, "IA-64
+   Software Conventions and Runtime Architecture Guide", version of
+   August 1999, Section 4.2, Figures 4-1 through 4-5.
+   A Union containing a double was also thrown in for good measure. */
+
+struct S1 {
+  char c;
+};
+
+unsigned S1_align = __alignof(struct S1);
+unsigned S1_size = sizeof(struct S1);
+// CHECK: @S1_align = global i32 1, align 4
+// CHECK: @S1_size = global i32 1, align 4
+
+unsigned S1_c_offset = (unsigned) &((struct S1*) 0)->c;
+// CHECK: @S1_c_offset = global i32 0, align 4
+
+struct S2{
+  char c;
+  char d;
+  short s;
+  int n;
+};
+
+unsigned S2_align = __alignof(struct S2);
+unsigned S2_size = sizeof(struct S2);
+// CHECK: @S2_align = global i32 4, align 4
+// CHECK: @S2_size = global i32 8, align 4
+
+unsigned S2_c_offset = (unsigned) &((struct S2*) 0)->c;
+unsigned S2_d_offset = (unsigned) &((struct S2*) 0)->d;
+unsigned S2_s_offset = (unsigned) &((struct S2*) 0)->s;
+unsigned S2_n_offset = (unsigned) &((struct S2*) 0)->n;
+// CHECK: @S2_c_offset = global i32 0, align 4
+// CHECK: @S2_d_offset = global i32 1, align 4
+// CHECK: @S2_s_offset = global i32 2, align 4
+// CHECK: @S2_n_offset = global i32 4, align 4
+
+struct S3 {
+  char c;
+  short s;
+};
+
+unsigned S3_align = __alignof(struct S3);
+unsigned S3_size = sizeof(struct S3);
+// CHECK: @S3_align = global i32 2, align 4
+// CHECK: @S3_size = global i32 4, align 4
+
+unsigned S3_c_offset = (unsigned) &((struct S3*) 0)->c;
+unsigned S3_s_offset = (unsigned) &((struct S3*) 0)->s;
+// CHECK: @S3_c_offset = global i32 0, align 4
+// CHECK: @S3_s_offset = global i32 2, align 4
+
+struct S4 {
+  char c;
+  double d;
+  short s;
+};
+
+unsigned S4_align = __alignof(struct S4);
+unsigned S4_size = sizeof(struct S4);
+// CHECK-ON: @S4_align = global i32 8, align 4
+// CHECK-ON: @S4_size = global i32 24, align 4
+// CHECK-OFF: @S4_align = global i32 4, align 4
+// CHECK-OFF: @S4_size = global i32 16, align 4
+
+unsigned S4_c_offset = (unsigned) &((struct S4*) 0)->c;
+unsigned S4_d_offset = (unsigned) &((struct S4*) 0)->d;
+unsigned S4_s_offset = (unsigned) &((struct S4*) 0)->s;
+// CHECK: @S4_c_offset = global i32 0, align 4
+// CHECK-ON: @S4_d_offset = global i32 8, align 4
+// CHECK-ON: @S4_s_offset = global i32 16, align 4
+// CHECK-OFF: @S4_d_offset = global i32 4, align 4
+// CHECK-OFF: @S4_s_offset = global i32 12, align 4
+
+union S5 {
+  char c;
+  short s;
+  int j;
+};
+
+unsigned S5_align = __alignof(union S5);
+unsigned S5_size = sizeof(union S5);
+// CHECK: @S5_align = global i32 4, align 4
+// CHECK: @S5_size = global i32 4, align 4
+
+unsigned S5_c_offset = (unsigned) &((union S5*) 0)->c;
+unsigned S5_s_offset = (unsigned) &((union S5*) 0)->s;
+unsigned S5_j_offset = (unsigned) &((union S5*) 0)->j;
+// CHECK: @S5_c_offset = global i32 0, align 4
+// CHECK: @S5_s_offset = global i32 0, align 4
+// CHECK: @S5_j_offset = global i32 0, align 4
+
+union S6 {
+  char c;
+  double d;
+};
+
+unsigned S6_align = __alignof(union S6);
+unsigned S6_size = sizeof(union S6);
+// CHECK-ON: @S6_align = global i32 8, align 4
+// CHECK-ON: @S6_size = global i32 8, align 4
+// CHECK-OFF: @S6_align = global i32 4, align 4
+// CHECK-OFF: @S6_size = global i32 8, align 4
+
+unsigned S6_c_offset = (unsigned) &((union S6*) 0)->c;
+unsigned S6_d_offset = (unsigned) &((union S6*) 0)->d;
+// CHECK: @S6_c_offset = global i32 0, align 4
+// CHECK: @S6_d_offset = global i32 0, align 4
diff --git a/test/CodeGen/mbackchain-2.c b/test/CodeGen/mbackchain-2.c
new file mode 100644
index 0000000..e76afaf
--- /dev/null
+++ b/test/CodeGen/mbackchain-2.c
@@ -0,0 +1,7 @@
+// RUN: %clang -mbackchain --target=s390x-linux -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define void @foo() [[NUW:#[0-9]+]]
+void foo(void) {
+}
+
+// CHECK: attributes [[NUW]] = { {{.*}} "backchain" {{.*}} }
diff --git a/test/CodeGen/mbackchain-3.c b/test/CodeGen/mbackchain-3.c
new file mode 100644
index 0000000..b115861
--- /dev/null
+++ b/test/CodeGen/mbackchain-3.c
@@ -0,0 +1,7 @@
+// RUN: %clang -mno-backchain --target=s390x-linux -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define void @foo() [[NUW:#[0-9]+]]
+void foo(void) {
+}
+
+// CHECK-NOT: "backchain"
diff --git a/test/CodeGen/mbackchain.c b/test/CodeGen/mbackchain.c
new file mode 100644
index 0000000..e7cfc3a
--- /dev/null
+++ b/test/CodeGen/mbackchain.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -mbackchain -triple s390x-linux -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define void @foo() [[NUW:#[0-9]+]]
+void foo(void) {
+}
+
+// CHECK: attributes [[NUW]] = { {{.*}} "backchain" {{.*}} }
diff --git a/test/CodeGen/mcu-struct-return.c b/test/CodeGen/mcu-struct-return.c
new file mode 100644
index 0000000..353c963
--- /dev/null
+++ b/test/CodeGen/mcu-struct-return.c
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -triple i386-pc-elfiamcu -emit-llvm %s -o - | FileCheck %s
+
+// Structure that is more than 8 bytes.
+struct Big {
+  double a[10];
+};
+
+// Empty union with zero size must be returned as void.
+union U1 {
+} u1;
+
+// Too large union (80 bytes) must be returned via memory.
+union U2 {
+  struct Big b;
+} u2;
+
+// Must be returned in register.
+union U3 {
+  int x;
+} u3;
+
+// Empty struct with zero size, must be returned as void.
+struct S1 {
+} s1;
+
+// Must be returned in register.
+struct S2 {
+  int x;
+} s2;
+
+// CHECK: [[UNION1_TYPE:%.+]] = type {}
+// CHECK: [[UNION2_TYPE:%.+]] = type { [[STRUCT_TYPE:%.+]] }
+// CHECK: [[STRUCT_TYPE]] = type { [10 x double] }
+// CHECK: [[UNION3_TYPE:%.+]] = type { i32 }
+// CHECK: [[STRUCT1_TYPE:%.+]] = type {}
+// CHECK: [[STRUCT2_TYPE:%.+]] = type { i32 }
+
+union U1 foo1() { return u1; }
+union U2 foo2() { return u2; }
+union U3 foo3() { return u3; }
+struct S1 bar1() { return s1; }
+struct S2 bar2() { return s2; }
+struct S1 bar3(union U1 u) { return s1; }
+// CHECK: define void @foo1()
+// CHECK: define void @foo2([[UNION2_TYPE]]* noalias sret %{{.+}})
+// CHECK: define i32 @foo3()
+// CHECK: define void @bar1()
+// CHECK: define i32 @bar2()
+// CHECK: define void @bar3()
+
+void run() {
+  union U1 x1 = foo1();
+  union U2 x2 = foo2();
+  union U3 x3 = foo3();
+  struct S1 y1 = bar1();
+  struct S2 y2 = bar2();
+  struct S1 y3 = bar3(x1);
+
+  // CHECK: [[X1:%.+]] = alloca [[UNION1_TYPE]]
+  // CHECK: [[X2:%.+]] = alloca [[UNION2_TYPE]]
+  // CHECK: [[X3:%.+]] = alloca [[UNION3_TYPE]]
+  // CHECK: [[Y1:%.+]] = alloca [[STRUCT1_TYPE]]
+  // CHECK: [[Y2:%.+]] = alloca [[STRUCT2_TYPE]]
+  // CHECK: call void @foo1()
+  // CHECK: call void @foo2([[UNION2_TYPE]]* sret [[X2]])
+  // CHECK: {{.+}} = call i32 @foo3()
+  // CHECK: call void @bar1()
+  // CHECK: {{.+}} = call i32 @bar2()
+  // CHECK: call void @bar3()
+}
diff --git a/test/CodeGen/mips-inline-asm.c b/test/CodeGen/mips-inline-asm.c
index 2cfa41c..fa38663 100644
--- a/test/CodeGen/mips-inline-asm.c
+++ b/test/CodeGen/mips-inline-asm.c
@@ -17,3 +17,15 @@
   asm("lw $1, %0" :: "R"(data));
   // CHECK: call void asm sideeffect "lw $$1, $0", "*R,~{$1}"(i32* @data)
 }
+
+int additionalClobberedRegisters () {
+  int temp0;
+  asm volatile(
+                "mfhi %[temp0], $ac1 \n\t"
+                  : [temp0]"=&r"(temp0)
+                  :
+                  : "memory", "t0", "t1", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+  );
+  return 0;
+  // CHECK: call i32 asm sideeffect "mfhi $0, $$ac1 \0A\09", "=&r,~{memory},~{$8},~{$9},~{$ac1hi},~{$ac1lo},~{$ac2hi},~{$ac2lo},~{$ac3hi},~{$ac3lo},~{$1}"
+}
diff --git a/test/CodeGen/mips-interrupt-attr.c b/test/CodeGen/mips-interrupt-attr.c
index df70b12..0ef5dab 100644
--- a/test/CodeGen/mips-interrupt-attr.c
+++ b/test/CodeGen/mips-interrupt-attr.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -emit-llvm -o - %s | FileCheck %s
 
 void __attribute__ ((interrupt("vector=sw0")))
 isr_sw0 (void)
diff --git a/test/CodeGen/mips-vector-arg.c b/test/CodeGen/mips-vector-arg.c
index 5e55285..1b9d7ab 100644
--- a/test/CodeGen/mips-vector-arg.c
+++ b/test/CodeGen/mips-vector-arg.c
@@ -8,18 +8,18 @@
 typedef float  v4sf __attribute__ ((__vector_size__ (16)));
 typedef int v4i32 __attribute__ ((__vector_size__ (16)));
 
-// O32: define void @test_v4sf(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) [[NUW:#[0-9]+]]
+// O32: define void @test_v4sf(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) local_unnamed_addr [[NUW:#[0-9]+]]
 // O32: declare i32 @test_v4sf_2(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 signext, i32, i32 inreg, i32 inreg, i32 inreg, i32 inreg)
-// N64: define void @test_v4sf(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) [[NUW:#[0-9]+]]
+// N64: define void @test_v4sf(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) local_unnamed_addr [[NUW:#[0-9]+]]
 // N64: declare i32 @test_v4sf_2(i64 inreg, i64 inreg, i32 signext, i64, i64 inreg, i64 inreg)
 extern test_v4sf_2(v4sf, int, v4sf);
 void test_v4sf(v4sf a1, int a2, v4sf a3) {
   test_v4sf_2(a3, a2, a1);
 }
 
-// O32: define void @test_v4i32(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) [[NUW]]
+// O32: define void @test_v4i32(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) local_unnamed_addr [[NUW]]
 // O32: declare i32 @test_v4i32_2(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 signext, i32, i32 inreg, i32 inreg, i32 inreg, i32 inreg)
-// N64: define void @test_v4i32(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) [[NUW]]
+// N64: define void @test_v4i32(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) local_unnamed_addr [[NUW]]
 // N64: declare i32 @test_v4i32_2(i64 inreg, i64 inreg, i32 signext, i64, i64 inreg, i64 inreg)
 extern test_v4i32_2(v4i32, int, v4i32);
 void test_v4i32(v4i32 a1, int a2, v4i32 a3) {
diff --git a/test/CodeGen/mmx-builtins.c b/test/CodeGen/mmx-builtins.c
index 44d1ea4..af4a1cd 100644
--- a/test/CodeGen/mmx-builtins.c
+++ b/test/CodeGen/mmx-builtins.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -44,8 +44,8 @@
 
 __m64 test_mm_add_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q
-  return __builtin_ia32_paddq(a, b);
+  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  return _mm_add_si64(a, b);
 }
 
 __m64 test_mm_adds_pi8(__m64 a, __m64 b) {
@@ -217,6 +217,12 @@
   return _mm_cvttps_pi32(a);
 }
 
+int test_mm_extract_pi16(__m64 a) {
+  // CHECK-LABEL: test_mm_extract_pi16
+  // CHECK: call i32 @llvm.x86.mmx.pextr.w
+  return _mm_extract_pi16(a, 2);
+}
+
 __m64 test_m_from_int(int a) {
   // CHECK-LABEL: test_m_from_int
   // CHECK: insertelement <2 x i32>
@@ -265,6 +271,12 @@
   return _mm_hsubs_pi16(a, b);
 }
 
+__m64 test_mm_insert_pi16(__m64 a, int d) {
+  // CHECK-LABEL: test_mm_insert_pi16
+  // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+  return _mm_insert_pi16(a, d, 2);
+}
+
 __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_madd_pi16
   // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd
@@ -315,7 +327,7 @@
 
 __m64 test_mm_mul_su32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mul_su32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq
+  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
   return _mm_mul_su32(a, b);
 }
 
@@ -525,8 +537,8 @@
 
 __m64 test_mm_sub_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q
-  return __builtin_ia32_psubq(a, b);
+  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  return _mm_sub_si64(a, b);
 }
 
 __m64 test_mm_subs_pi8(__m64 a, __m64 b) {
diff --git a/test/CodeGen/ms-inline-asm-avx512.c b/test/CodeGen/ms-inline-asm-avx512.c
new file mode 100644
index 0000000..c1b783a
--- /dev/null
+++ b/test/CodeGen/ms-inline-asm-avx512.c
@@ -0,0 +1,21 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 %s -triple x86_64-pc-windows-msvc -target-cpu knl -fasm-blocks -emit-llvm -o - | FileCheck %s
+
+void t1() {
+// CHECK: @t1
+// CHECK: call void asm sideeffect inteldialect "vaddpd zmm8, zmm27, zmm6", "~{zmm8},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: ret void
+  __asm {
+	  vaddpd zmm8, zmm27, zmm6
+  }
+}
+
+
+void t2() {
+// CHECK: @t2
+// CHECK: call void asm sideeffect inteldialect "vaddpd zmm8 {k1}, zmm27, zmm6", "~{zmm8},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: ret void
+  __asm {
+	  vaddpd zmm8 {k1}, zmm27, zmm6
+  }
+}
diff --git a/test/CodeGen/ms-inline-asm-errors.cpp b/test/CodeGen/ms-inline-asm-errors.cpp
new file mode 100644
index 0000000..6484743
--- /dev/null
+++ b/test/CodeGen/ms-inline-asm-errors.cpp
@@ -0,0 +1,15 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -x c++ %s -triple i386-apple-darwin10 -std=c++11 -fasm-blocks -verify
+
+class A {
+public:
+  void foo(int a)   {}
+  void foo(float a) {}
+};
+
+
+void t_fail() {
+	__asm {
+		mov ecx, [eax]A.foo // expected-error {{Unable to lookup field reference!}}
+	}
+}
diff --git a/test/CodeGen/ms-inline-asm.c b/test/CodeGen/ms-inline-asm.c
index e4d9756..ad9e4f3 100644
--- a/test/CodeGen/ms-inline-asm.c
+++ b/test/CodeGen/ms-inline-asm.c
@@ -86,7 +86,7 @@
     __asm { pop ebx }
   }
 // CHECK: t9
-// CHECK: call void asm sideeffect inteldialect "push ebx\0A\09mov ebx, $$0x07\0A\09pop ebx", "~{ebx},~{esp},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call void asm sideeffect inteldialect "push ebx\0A\09mov ebx, $$0x07\0A\09pop ebx\0A\09", "~{ebx},~{esp},~{dirflag},~{fpsr},~{flags}"()
 }
 
 unsigned t10(void) {
diff --git a/test/CodeGen/ms-intrinsics.c b/test/CodeGen/ms-intrinsics.c
index 9103622..ceaa847 100644
--- a/test/CodeGen/ms-intrinsics.c
+++ b/test/CodeGen/ms-intrinsics.c
@@ -8,17 +8,17 @@
 // RUN:         -triple x86_64--windows -Oz -emit-llvm %s -o - \
 // RUN:         | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X64
 
-// Intrin.h needs size_t, but -ffreestanding prevents us from getting it from
+// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
 // stddef.h.  Work around it with this typedef.
 typedef __SIZE_TYPE__ size_t;
 
-#include <Intrin.h>
+#include <intrin.h>
 
 void *test_InterlockedExchangePointer(void * volatile *Target, void *Value) {
   return _InterlockedExchangePointer(Target, Value);
 }
 
-// CHECK: define{{.*}}i8* @test_InterlockedExchangePointer(i8** %Target, i8* %Value){{.*}}{
+// CHECK: define{{.*}}i8* @test_InterlockedExchangePointer(i8** {{[a-z_ ]*}}%Target, i8* {{[a-z_ ]*}}%Value){{.*}}{
 // CHECK:   %[[TARGET:[0-9]+]] = bitcast i8** %Target to [[iPTR:i[0-9]+]]*
 // CHECK:   %[[VALUE:[0-9]+]] = ptrtoint i8* %Value to [[iPTR]]
 // CHECK:   %[[EXCHANGE:[0-9]+]] = atomicrmw xchg [[iPTR]]* %[[TARGET]], [[iPTR]] %[[VALUE]] seq_cst
@@ -31,7 +31,7 @@
   return _InterlockedCompareExchangePointer(Destination, Exchange, Comparand);
 }
 
-// CHECK: define{{.*}}i8* @test_InterlockedCompareExchangePointer(i8** %Destination, i8* %Exchange, i8* %Comparand){{.*}}{
+// CHECK: define{{.*}}i8* @test_InterlockedCompareExchangePointer(i8** {{[a-z_ ]*}}%Destination, i8* {{[a-z_ ]*}}%Exchange, i8* {{[a-z_ ]*}}%Comparand){{.*}}{
 // CHECK:   %[[DEST:[0-9]+]] = bitcast i8** %Destination to [[iPTR]]*
 // CHECK:   %[[EXCHANGE:[0-9]+]] = ptrtoint i8* %Exchange to [[iPTR]]
 // CHECK:   %[[COMPARAND:[0-9]+]] = ptrtoint i8* %Comparand to [[iPTR]]
@@ -45,7 +45,7 @@
   return _InterlockedExchange(Target, Value);
 }
 
-// CHECK: define{{.*}}i32 @test_InterlockedExchange(i32* %Target, i32 %Value){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchange(i32* {{[a-z_ ]*}}%Target, i32 %Value){{.*}}{
 // CHECK:   %[[EXCHANGE:[0-9]+]] = atomicrmw xchg i32* %Target, i32 %Value seq_cst
 // CHECK:   ret i32 %[[EXCHANGE:[0-9]+]]
 // CHECK: }
diff --git a/test/CodeGen/ms-mm-align.c b/test/CodeGen/ms-mm-align.c
index ae8e980..7130c74 100644
--- a/test/CodeGen/ms-mm-align.c
+++ b/test/CodeGen/ms-mm-align.c
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -target-feature +sse \
 // RUN:         -triple i686--windows -emit-llvm %s -o - \
 // RUN:         | FileCheck %s -check-prefix CHECK
 
-// Intrin.h needs size_t, but -ffreestanding prevents us from getting it from
+// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
 // stddef.h.  Work around it with this typedef.
 typedef __SIZE_TYPE__ size_t;
-#include <Intrin.h>
+#include <intrin.h>
 
 void capture_ptr(int* i);
 void test_mm_align16(int p) {
diff --git a/test/CodeGen/ms-volatile.c b/test/CodeGen/ms-volatile.c
index 87393e7..a3ef35a 100644
--- a/test/CodeGen/ms-volatile.c
+++ b/test/CodeGen/ms-volatile.c
@@ -7,6 +7,13 @@
 };
 typedef _Complex float __declspec(align(8)) baz;
 
+#pragma pack(push)
+#pragma pack(1)
+struct qux {
+   volatile int f;
+};
+#pragma pack(pop)
+
 void test1(struct foo *p, struct foo *q) {
   *p = *q;
   // CHECK-LABEL: @test1
@@ -52,11 +59,29 @@
 void test8(volatile double *p, volatile double *q) {
   *p = *q;
   // CHECK-LABEL: @test8
-  // CHECK: load atomic volatile {{.*}} acquire
-  // CHECK: store atomic volatile {{.*}}, {{.*}} release
+  // CHECK: load volatile {{.*}}
+  // CHECK: store volatile {{.*}}, {{.*}}
 }
 void test9(volatile baz *p, baz *q) {
   *p = *q;
   // CHECK-LABEL: @test9
+  // CHECK: store volatile {{.*}}, {{.*}}
+  // CHECK: store volatile {{.*}}, {{.*}}
+}
+void test10(volatile long long *p, volatile long long *q) {
+  *p = *q;
+  // CHECK-LABEL: @test10
+  // CHECK: load volatile {{.*}}
+  // CHECK: store volatile {{.*}}, {{.*}}
+}
+void test11(volatile float *p, volatile float *q) {
+  *p = *q;
+  // CHECK-LABEL: @test11
+  // CHECK: load atomic volatile {{.*}} acquire
   // CHECK: store atomic volatile {{.*}}, {{.*}} release
 }
+int test12(struct qux *p) {
+  return p->f;
+  // CHECK-LABEL: @test12
+  // CHECK: load volatile {{.*}}
+}
diff --git a/test/CodeGen/neon-immediate-ubsan.c b/test/CodeGen/neon-immediate-ubsan.c
index 3fe4b00..c3e1ce2 100644
--- a/test/CodeGen/neon-immediate-ubsan.c
+++ b/test/CodeGen/neon-immediate-ubsan.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -O1 -o - %s \
+// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -o - %s \
 // RUN:     -target-feature +neon -target-cpu cortex-a8 \
 // RUN:     -fsanitize=signed-integer-overflow \
 // RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=ARMV7
 
-// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -O1 -o - %s \
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -o - %s \
 // RUN:     -target-feature +neon -target-cpu cortex-a53 \
 // RUN:     -fsanitize=signed-integer-overflow \
 // RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=AARCH64
diff --git a/test/CodeGen/nousejumptable.c b/test/CodeGen/nousejumptable.c
new file mode 100644
index 0000000..91ad581
--- /dev/null
+++ b/test/CodeGen/nousejumptable.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -S -fno-jump-tables %s -emit-llvm -o - | FileCheck %s
+
+// CHECK-LABEL: main
+// CHECK: attributes #0 = {{.*}}"no-jump-tables"="true"{{.*}}
+
+int main() {
+  return 0;
+}
diff --git a/test/CodeGen/nvptx-cpus.c b/test/CodeGen/nvptx-cpus.c
index 015f529..76c55c0 100644
--- a/test/CodeGen/nvptx-cpus.c
+++ b/test/CodeGen/nvptx-cpus.c
@@ -3,6 +3,9 @@
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_30 -O3 -S -o %t %s -emit-llvm
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_35 -O3 -S -o %t %s -emit-llvm
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_37 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_52 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -O3 -S -o %t %s -emit-llvm
 
 // Make sure clang accepts all supported architectures.
 
diff --git a/test/CodeGen/overloadable.c b/test/CodeGen/overloadable.c
index 4946c6d..634820c 100644
--- a/test/CodeGen/overloadable.c
+++ b/test/CodeGen/overloadable.c
@@ -29,3 +29,33 @@
   cdv = f(cdv);
   vv = f(vv);
 }
+
+// Ensuring that we pick the correct function for taking the address of an
+// overload when conversions are involved.
+
+void addrof_many(int *a) __attribute__((overloadable, enable_if(0, "")));
+void addrof_many(void *a) __attribute__((overloadable));
+void addrof_many(char *a) __attribute__((overloadable));
+
+void addrof_single(int *a) __attribute__((overloadable, enable_if(0, "")));
+void addrof_single(char *a) __attribute__((overloadable, enable_if(0, "")));
+void addrof_single(char *a) __attribute__((overloadable));
+
+// CHECK-LABEL: define void @foo
+void foo() {
+  // CHECK: store void (i8*)* @_Z11addrof_manyPc
+  void (*p1)(char *) = &addrof_many;
+  // CHECK: store void (i8*)* @_Z11addrof_manyPv
+  void (*p2)(void *) = &addrof_many;
+  // CHECK: void (i8*)* @_Z11addrof_manyPc
+  void *vp1 = (void (*)(char *)) & addrof_many;
+  // CHECK: void (i8*)* @_Z11addrof_manyPv
+  void *vp2 = (void (*)(void *)) & addrof_many;
+
+  // CHECK: store void (i8*)* @_Z13addrof_singlePc
+  void (*p3)(char *) = &addrof_single;
+  // CHECK: @_Z13addrof_singlePc
+  void (*p4)(int *) = &addrof_single;
+  // CHECK: @_Z13addrof_singlePc
+  void *vp3 = &addrof_single;
+}
diff --git a/test/CodeGen/packed-arrays.c b/test/CodeGen/packed-arrays.c
index bb742c6..a90766f 100644
--- a/test/CodeGen/packed-arrays.c
+++ b/test/CodeGen/packed-arrays.c
@@ -23,32 +23,32 @@
   unsigned int z;
 };
 
-// CHECK: @align0 = global i32 1
+// CHECK: @align0 = local_unnamed_addr global i32 1
 int align0 = __alignof(struct s0);
-// CHECK: @align1 = global i32 4
+// CHECK: @align1 = local_unnamed_addr global i32 4
 int align1 = __alignof(struct s1);
-// CHECK: @align2 = global i32 1
+// CHECK: @align2 = local_unnamed_addr global i32 1
 int align2 = __alignof(struct s2);
-// CHECK: @align3 = global i32 1
+// CHECK: @align3 = local_unnamed_addr global i32 1
 int align3 = __alignof(struct s3);
 
-// CHECK: @align0_x = global i32 1
+// CHECK: @align0_x = local_unnamed_addr global i32 1
 int align0_x = __alignof(((struct s0*) 0)->x);
 //
-// CHECK: @align1_x = global i32 1
+// CHECK: @align1_x = local_unnamed_addr global i32 1
 int align1_x = __alignof(((struct s1*) 0)->x);
-// CHECK: @align2_x = global i32 1
+// CHECK: @align2_x = local_unnamed_addr global i32 1
 int align2_x = __alignof(((struct s2*) 0)->x);
-// CHECK: @align3_x = global i32 1
+// CHECK: @align3_x = local_unnamed_addr global i32 1
 int align3_x = __alignof(((struct s3*) 0)->x);
 
-// CHECK: @align0_x0 = global i32 4
+// CHECK: @align0_x0 = local_unnamed_addr global i32 4
 int align0_x0 = __alignof(((struct s0*) 0)->x[0]);
-// CHECK: @align1_x0 = global i32 4
+// CHECK: @align1_x0 = local_unnamed_addr global i32 4
 int align1_x0 = __alignof(((struct s1*) 0)->x[0]);
-// CHECK: @align2_x0 = global i32 4
+// CHECK: @align2_x0 = local_unnamed_addr global i32 4
 int align2_x0 = __alignof(((struct s2*) 0)->x[0]);
-// CHECK: @align3_x0 = global i32 4
+// CHECK: @align3_x0 = local_unnamed_addr global i32 4
 int align3_x0 = __alignof(((struct s3*) 0)->x[0]);
 
 // CHECK-LABEL: define i32 @f0_a
diff --git a/test/CodeGen/pass-object-size.c b/test/CodeGen/pass-object-size.c
index 1ad3f85..6e2bc20 100644
--- a/test/CodeGen/pass-object-size.c
+++ b/test/CodeGen/pass-object-size.c
@@ -351,3 +351,18 @@
   ObjectSize0(++p);
   ObjectSize0(p++);
 }
+
+// There was a bug where variadic functions with pass_object_size would cause
+// problems in the form of failed assertions.
+void my_sprintf(char *const c __attribute__((pass_object_size(0))), ...) {}
+
+// CHECK-LABEL: define void @test14
+void test14(char *c) {
+  // CHECK: @llvm.objectsize
+  // CHECK: call void (i8*, i64, ...) @my_sprintf
+  my_sprintf(c);
+
+  // CHECK: @llvm.objectsize
+  // CHECK: call void (i8*, i64, ...) @my_sprintf
+  my_sprintf(c, 1, 2, 3);
+}
diff --git a/test/CodeGen/pgo-instrumentation.c b/test/CodeGen/pgo-instrumentation.c
new file mode 100644
index 0000000..1dac36f
--- /dev/null
+++ b/test/CodeGen/pgo-instrumentation.c
@@ -0,0 +1,20 @@
+// Test if PGO instrumentation and use pass are invoked.
+//
+// Ensure Pass PGOInstrumentationGenPass is invoked.
+// RUN: %clang_cc1 -O2 -fprofile-instrument=llvm %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOGENPASS-INVOKED-INSTR-GEN
+// CHECK-PGOGENPASS-INVOKED-INSTR-GEN: PGOInstrumentationGenPass
+//
+// Ensure Pass PGOInstrumentationGenPass is not invoked.
+// RUN: %clang_cc1 -O2 -fprofile-instrument=clang %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOGENPASS-INVOKED-INSTR-GEN-CLANG
+// CHECK-PGOGENPASS-INVOKED-INSTR-GEN-CLANG-NOT: PGOInstrumentationGenPass
+
+// Ensure Pass PGOInstrumentationUsePass is invoked.
+// RUN: llvm-profdata merge -o %t.profdata %S/Inputs/pgotestir.profraw
+// RUN: %clang_cc1 -O2 -fprofile-instrument-use-path=%t.profdata %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOUSEPASS-INVOKED-INSTR-USE
+// CHECK-PGOUSEPASS-INVOKED-INSTR-USE: PGOInstrumentationUsePass
+//
+// Ensure Pass PGOInstrumentationUsePass is not invoked.
+// RUN: llvm-profdata merge -o %t.profdata %S/Inputs/pgotestclang.profraw
+// RUN: %clang_cc1 -O2 -fprofile-instrument-use-path=%t.profdata %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOUSEPASS-INVOKED-USE-CLANG
+// CHECK-PGOUSEPASS-INVOKED-USE-CLANG-NOT: PGOInstrumentationUsePass
+
diff --git a/test/CodeGen/pgo-sample-preparation.c b/test/CodeGen/pgo-sample-preparation.c
new file mode 100644
index 0000000..c0a3cb4
--- /dev/null
+++ b/test/CodeGen/pgo-sample-preparation.c
@@ -0,0 +1,16 @@
+// Test if PGO sample use preparation passes are executed correctly.
+//
+// Ensure that instcombine is executed after simplifycfg and sroa so that
+// "a < 255" will not be converted to a * 256 < 255 * 256.
+// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - 2>&1 | FileCheck %s
+
+void bar(int);
+void foo(int x, int y, int z) {
+  int m;
+  for (m = 0; m < x ; m++) {
+    int a = (((y >> 8) & 0xff) * z) / 256;
+    bar(a < 255 ? a : 255);
+  }
+}
+
+// CHECK-NOT: icmp slt i32 %mul, 65280
diff --git a/test/CodeGen/pgo-sample.c b/test/CodeGen/pgo-sample.c
new file mode 100644
index 0000000..c955edf
--- /dev/null
+++ b/test/CodeGen/pgo-sample.c
@@ -0,0 +1,9 @@
+// Test if PGO sample use passes are invoked.
+//
+// Ensure the sample profile pass and its preparation passes are invoked.
+// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s
+// CHECK: Simplify the CFG
+// CHECK: SROA
+// CHECK: Combine redundant instructions
+// CHECK: Remove unused exception handling info
+// CHECK: Sample profile pass
diff --git a/test/CodeGen/pku.c b/test/CodeGen/pku.c
index 30565a8..d165763 100644
--- a/test/CodeGen/pku.c
+++ b/test/CodeGen/pku.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +pku -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +pku -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
diff --git a/test/CodeGen/ppc-varargs-struct.c b/test/CodeGen/ppc-varargs-struct.c
index 1ad57c2..d7936a1 100644
--- a/test/CodeGen/ppc-varargs-struct.c
+++ b/test/CodeGen/ppc-varargs-struct.c
@@ -37,6 +37,7 @@
 // CHECK-PPC-NEXT:  br label %[[CONT:[a-z0-9]+]]
 //
 // CHECK-PPC:[[USING_OVERFLOW]]
+// CHECK-PPC-NEXT:  store i8 8, i8* [[GPRPTR]], align 4
 // CHECK-PPC-NEXT:  [[OVERFLOW_AREA_P:%[0-9]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* [[ARRAYDECAY]], i32 0, i32 3
 // CHECK-PPC-NEXT:  [[OVERFLOW_AREA:%.+]] = load i8*, i8** [[OVERFLOW_AREA_P]], align 4
 // CHECK-PPC-NEXT:  %{{[0-9]+}} =  ptrtoint i8* %argp.cur to i32
@@ -76,6 +77,7 @@
 // CHECK-PPC-NEXT:  br label %[[CONT:[a-z0-9]+]]
 //
 // CHECK-PPC:[[USING_OVERFLOW]]
+// CHECK-PPC-NEXT:  store i8 8, i8* [[GPRPTR]], align 4
 // CHECK-PPC-NEXT:  [[OVERFLOW_AREA_P:%[0-9]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* [[ARRAYDECAY]], i32 0, i32 3
 // CHECK-PPC-NEXT:  [[OVERFLOW_AREA:%.+]] = load i8*, i8** [[OVERFLOW_AREA_P]], align 4
 // CHECK-PPC-NEXT:  [[MEMADDR:%.+]] = bitcast i8* [[OVERFLOW_AREA]] to i32*
diff --git a/test/CodeGen/pr18235.c b/test/CodeGen/pr18235.c
index d3f12ee..49241c8 100644
--- a/test/CodeGen/pr18235.c
+++ b/test/CodeGen/pr18235.c
@@ -1,3 +1,3 @@
 // RUN: not %clang_cc1 -triple le32-unknown-nacl %s -S -o - 2>&1 | FileCheck %s
 
-// CHECK: error: unable to create target: 'No available targets are compatible with this triple, see -version for the available targets.'
+// CHECK: error: unable to create target: 'No available targets are compatible with this triple.
diff --git a/test/CodeGen/pr25786.c b/test/CodeGen/pr25786.c
new file mode 100644
index 0000000..612da7e
--- /dev/null
+++ b/test/CodeGen/pr25786.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-OK
+
+void (__attribute__((regparm(3), stdcall)) *pf) ();
+void (__attribute__((regparm(2), stdcall)) foo)(int a) {
+}
+// CHECK: @pf = common global void (...)* null
+// CHECK: define void @foo(i32 %a)
+
+// CHECK-OK: @pf = common global void (...)* null
+// CHECK-OK: define x86_stdcallcc void @foo(i32 inreg %a)
diff --git a/test/CodeGen/pr27892.c b/test/CodeGen/pr27892.c
new file mode 100644
index 0000000..694ce9e
--- /dev/null
+++ b/test/CodeGen/pr27892.c
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fms-extensions %s -emit-llvm -o - | FileCheck %s
+
+long test1(long *p) {
+  return _InterlockedIncrement(p);
+}
+// CHECK-DAG: define i64 @test1(
+// CHECK:   %[[p_addr:.*]] = alloca i64*, align 8
+// CHECK:   store i64* %p, i64** %[[p_addr]], align 8
+// CHECK:   %[[p_load:.*]] = load i64*, i64** %[[p_addr]], align 8
+// CHECK:   %[[atomic_add:.*]] = atomicrmw volatile add i64* %[[p_load]], i64 1 seq_cst
+// CHECK:   %[[res:.*]] = add i64 %[[atomic_add]], 1
+// CHECK:   ret i64 %[[res]]
+
+long test2(long *p) {
+  return _InterlockedDecrement(p);
+}
+// CHECK-DAG: define i64 @test2(
+// CHECK:   %[[p_addr:.*]] = alloca i64*, align 8
+// CHECK:   store i64* %p, i64** %[[p_addr]], align 8
+// CHECK:   %[[p_load:.*]] = load i64*, i64** %[[p_addr]], align 8
+// CHECK:   %[[atomic_sub:.*]] = atomicrmw volatile sub i64* %[[p_load]], i64 1 seq_cst
+// CHECK:   %[[res:.*]] = sub i64 %[[atomic_sub]], 1
+// CHECK:   ret i64 %[[res]]
diff --git a/test/CodeGen/pragma-comment.c b/test/CodeGen/pragma-comment.c
index 6da2068..71a7dfc 100644
--- a/test/CodeGen/pragma-comment.c
+++ b/test/CodeGen/pragma-comment.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple thumbv7-windows -fms-extensions -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple thumbv7-linux-gnueabihf -fms-extensions -emit-llvm -o - | FileCheck -check-prefix LINUX %s
 // RUN: %clang_cc1 %s -triple i686-pc-linux -fms-extensions -emit-llvm -o - | FileCheck -check-prefix LINUX %s
 // RUN: %clang_cc1 %s -triple x86_64-scei-ps4 -fms-extensions -emit-llvm -o - | FileCheck -check-prefix PS4 %s
 
diff --git a/test/CodeGen/pragma-detect_mismatch.c b/test/CodeGen/pragma-detect_mismatch.c
index c5f3af3..08259fc 100644
--- a/test/CodeGen/pragma-detect_mismatch.c
+++ b/test/CodeGen/pragma-detect_mismatch.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple thumbv7-windows -fms-extensions -emit-llvm -o - | FileCheck %s
 
 #pragma detect_mismatch("test", "1")
 
diff --git a/test/CodeGen/rd-builtins.c b/test/CodeGen/rd-builtins.c
new file mode 100644
index 0000000..5cad903
--- /dev/null
+++ b/test/CodeGen/rd-builtins.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <x86intrin.h>
+
+unsigned long long test_rdpmc(int a) {
+  return _rdpmc(a);
+// CHECK: @test_rdpmc
+// CHECK: call i64 @llvm.x86.rdpmc
+}
+
+int test_rdtsc() {
+  return _rdtsc();
+// CHECK: @test_rdtsc
+// CHECK: call i64 @llvm.x86.rdtsc
+}
diff --git a/test/CodeGen/relax.c b/test/CodeGen/relax.c
new file mode 100644
index 0000000..07b7589
--- /dev/null
+++ b/test/CodeGen/relax.c
@@ -0,0 +1,10 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-obj --mrelax-relocations %s -mrelocation-model pic -o %t
+// RUN: llvm-readobj -r %t | FileCheck  %s
+
+// CHECK: R_X86_64_REX_GOTPCRELX foo
+
+extern int foo;
+int *f(void) {
+  return &foo;
+}
diff --git a/test/CodeGen/renderscript.c b/test/CodeGen/renderscript.c
new file mode 100644
index 0000000..5482d36
--- /dev/null
+++ b/test/CodeGen/renderscript.c
@@ -0,0 +1,140 @@
+// RUN: %clang_cc1 %s -triple=renderscript32-none-linux-gnueabi -emit-llvm -o - -Werror | FileCheck %s -check-prefix=CHECK-RS32
+// RUN: %clang_cc1 %s -triple=renderscript64-none-linux-android -emit-llvm -o - -Werror | FileCheck %s -check-prefix=CHECK-RS64
+// RUN: %clang_cc1 %s -triple=armv7-none-linux-gnueabi -emit-llvm -o - -Werror | FileCheck %s -check-prefix=CHECK-ARM
+
+// Ensure that the bitcode has the correct triple
+// CHECK-RS32: target triple = "armv7-none-linux-gnueabi"
+// CHECK-RS64: target triple = "aarch64-none-linux-android"
+// CHECK-ARM: target triple = "armv7-none-linux-gnueabi"
+
+// Ensure that long data type has 8-byte size and alignment in RenderScript
+#ifdef __RENDERSCRIPT__
+#define LONG_WIDTH_AND_ALIGN 8
+#else
+#define LONG_WIDTH_AND_ALIGN 4
+#endif
+
+_Static_assert(sizeof(long) == LONG_WIDTH_AND_ALIGN, "sizeof long is wrong");
+_Static_assert(_Alignof(long) == LONG_WIDTH_AND_ALIGN, "sizeof long is wrong");
+
+// CHECK-RS32: i64 @test_long(i64 %v)
+// CHECK-RS64: i64 @test_long(i64 %v)
+// CHECK-ARM: i32 @test_long(i32 %v)
+long test_long(long v) {
+  return v + 1;
+}
+
+// =============================================================================
+// Test coercion of aggregate argument or return value into integer arrays
+// =============================================================================
+
+// =============================================================================
+// aggregate parameter <= 4 bytes: coerced to [a x iNN] for both 32-bit and
+// 64-bit RenderScript
+// =============================================================================
+
+typedef struct {char c1, c2, c3; } sChar3;
+typedef struct {short s; char c;} sShortChar;
+
+// CHECK-RS32: void @argChar3([3 x i8] %s.coerce)
+// CHECK-RS64: void @argChar3([3 x i8] %s.coerce)
+void argChar3(sChar3 s) {}
+
+// CHECK-RS32: void @argShortChar([2 x i16] %s.coerce)
+// CHECK-RS64: void @argShortChar([2 x i16] %s.coerce)
+void argShortChar(sShortChar s) {}
+
+// =============================================================================
+// aggregate return value <= 4 bytes: coerced to [a x iNN] for both 32-bit and
+// 64-bit RenderScript
+// =============================================================================
+
+// CHECK-RS32: [3 x i8] @retChar3()
+// CHECK-RS64: [3 x i8] @retChar3()
+sChar3 retChar3() { sChar3 r; return r; }
+
+// CHECK-RS32: [2 x i16] @retShortChar()
+// CHECK-RS64: [2 x i16] @retShortChar()
+sShortChar retShortChar() { sShortChar r; return r; }
+
+// =============================================================================
+// aggregate parameter <= 16 bytes: coerced to [a x iNN] for both 32-bit and
+// 64-bit RenderScript
+// =============================================================================
+
+typedef struct {short s1; char c; short s2; } sShortCharShort;
+typedef struct {int i; short s; char c; } sIntShortChar;
+typedef struct {long l; int i; } sLongInt;
+
+// CHECK-RS32: void @argShortCharShort([3 x i16] %s.coerce)
+// CHECK-RS64: void @argShortCharShort([3 x i16] %s.coerce)
+void argShortCharShort(sShortCharShort s) {}
+
+// CHECK-RS32: void @argIntShortChar([2 x i32] %s.coerce)
+// CHECK-RS64: void @argIntShortChar([2 x i32] %s.coerce)
+void argIntShortChar(sIntShortChar s) {}
+
+// CHECK-RS32: void @argLongInt([2 x i64] %s.coerce)
+// CHECK-RS64: void @argLongInt([2 x i64] %s.coerce)
+void argLongInt(sLongInt s) {}
+
+// =============================================================================
+// aggregate return value <= 16 bytes: returned on stack for 32-bit RenderScript
+// and coerced to [a x iNN] for 64-bit RenderScript
+// =============================================================================
+
+// CHECK-RS32: void @retShortCharShort(%struct.sShortCharShort* noalias sret %agg.result)
+// CHECK-RS64: [3 x i16] @retShortCharShort()
+sShortCharShort retShortCharShort() { sShortCharShort r; return r; }
+
+// CHECK-RS32: void @retIntShortChar(%struct.sIntShortChar* noalias sret %agg.result)
+// CHECK-RS64: [2 x i32] @retIntShortChar()
+sIntShortChar retIntShortChar() { sIntShortChar r; return r; }
+
+// CHECK-RS32: void @retLongInt(%struct.sLongInt* noalias sret %agg.result)
+// CHECK-RS64: [2 x i64] @retLongInt()
+sLongInt retLongInt() { sLongInt r; return r; }
+
+// =============================================================================
+// aggregate parameter <= 64 bytes: coerced to [a x iNN] for 32-bit RenderScript
+// and passed on the stack for 64-bit RenderScript
+// =============================================================================
+
+typedef struct {int i1, i2, i3, i4, i5; } sInt5;
+typedef struct {long l1, l2; char c; } sLong2Char;
+
+// CHECK-RS32: void @argInt5([5 x i32] %s.coerce)
+// CHECK-RS64: void @argInt5(%struct.sInt5* %s)
+void argInt5(sInt5 s) {}
+
+// CHECK-RS32: void @argLong2Char([3 x i64] %s.coerce)
+// CHECK-RS64: void @argLong2Char(%struct.sLong2Char* %s)
+void argLong2Char(sLong2Char s) {}
+
+// =============================================================================
+// aggregate return value <= 64 bytes: returned on stack for both 32-bit and
+// 64-bit RenderScript
+// =============================================================================
+
+// CHECK-RS32: void @retInt5(%struct.sInt5* noalias sret %agg.result)
+// CHECK-RS64: void @retInt5(%struct.sInt5* noalias sret %agg.result)
+sInt5 retInt5() { sInt5 r; return r;}
+
+// CHECK-RS32: void @retLong2Char(%struct.sLong2Char* noalias sret %agg.result)
+// CHECK-RS64: void @retLong2Char(%struct.sLong2Char* noalias sret %agg.result)
+sLong2Char retLong2Char() { sLong2Char r; return r;}
+
+// =============================================================================
+// aggregate parameters and return values > 64 bytes: passed and returned on the
+// stack for both 32-bit and 64-bit RenderScript
+// =============================================================================
+
+typedef struct {long l1, l2, l3, l4, l5, l6, l7, l8, l9; } sLong9;
+
+// CHECK-RS32: void @argLong9(%struct.sLong9* byval align 8 %s)
+// CHECK-RS64: void @argLong9(%struct.sLong9* %s)
+void argLong9(sLong9 s) {}
+
+// CHECK-RS32: void @retLong9(%struct.sLong9* noalias sret %agg.result)
+// CHECK-RS64: void @retLong9(%struct.sLong9* noalias sret %agg.result)
+sLong9 retLong9() { sLong9 r; return r; }
diff --git a/test/CodeGen/sparc-vaarg.c b/test/CodeGen/sparc-vaarg.c
new file mode 100644
index 0000000..3e4dd7c
--- /dev/null
+++ b/test/CodeGen/sparc-vaarg.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple sparc -emit-llvm -o - %s | FileCheck %s
+#include <stdarg.h>
+
+// CHECK-LABEL: define i32 @get_int
+// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, i32{{$}}
+// CHECK: store i32 [[RESULT]], i32* [[LOC:%[a-z_0-9]+]]
+// CHECK: [[RESULT2:%[a-z_0-9]+]] = load i32, i32* [[LOC]]
+// CHECK: ret i32 [[RESULT2]]
+int get_int(va_list *args) {
+  return va_arg(*args, int);
+}
+
+struct Foo {
+  int x;
+};
+
+struct Foo dest;
+
+// CHECK-LABEL: define void @get_struct
+// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo*{{$}}
+// CHECK: [[RESULT2:%[a-z_0-9]+]] = bitcast {{.*}} [[RESULT]] to i8*
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[RESULT2]]
+void get_struct(va_list *args) {
+ dest = va_arg(*args, struct Foo);
+}
+
+enum E { Foo_one = 1 };
+
+enum E enum_dest;
+
+// CHECK-LABEL: define void @get_enum
+// CHECK: va_arg i8** {{.*}}, i32
+void get_enum(va_list *args) {
+  enum_dest = va_arg(*args, enum E);
+}
diff --git a/test/CodeGen/sparcv8-abi.c b/test/CodeGen/sparcv8-abi.c
new file mode 100644
index 0000000..cd8832f
--- /dev/null
+++ b/test/CodeGen/sparcv8-abi.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+// Each function below has an empty body; only its emitted IR signature is
+// checked. Both complex arguments are expected to be passed as byval pointers.
+
+// float complex: byval pointers aligned to 4, return type { float, float }.
+// CHECK-LABEL: define { float, float } @p({ float, float }* byval align 4 %a, { float, float }* byval align 4 %b) #0 {
+float __complex__
+p (float __complex__  a, float __complex__  b)
+{
+}
+
+// double complex: byval pointers aligned to 8, return type { double, double }.
+// CHECK-LABEL: define { double, double } @q({ double, double }* byval align 8 %a, { double, double }* byval align 8 %b) #0 {
+double __complex__
+q (double __complex__  a, double __complex__  b)
+{
+}
+
+// long long complex: byval pointers aligned to 8, return type { i64, i64 }.
+// CHECK-LABEL: define { i64, i64 } @r({ i64, i64 }* byval align 8 %a, { i64, i64 }* byval align 8 %b) #0 {
+long long __complex__
+r (long long __complex__  a, long long __complex__  b)
+{
+}
diff --git a/test/CodeGen/sse-builtins.c b/test/CodeGen/sse-builtins.c
index 0f964e8..f952598 100644
--- a/test/CodeGen/sse-builtins.c
+++ b/test/CodeGen/sse-builtins.c
@@ -1,169 +1,47 @@
-// RUN: %clang_cc1 -ffreestanding -triple x86_64-apple-macosx10.8.0 -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
 
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
 
-__m128 test_rsqrt_ss(__m128 x) {
-  // CHECK: define {{.*}} @test_rsqrt_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> {{.*}}, i32 1
-  // CHECK: extractelement <4 x float> {{.*}}, i32 2
-  // CHECK: extractelement <4 x float> {{.*}}, i32 3
-  return _mm_rsqrt_ss(x);
+#include <x86intrin.h>
+
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+
+__m128 test_mm_add_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_add_ps
+  // CHECK: fadd <4 x float>
+  return _mm_add_ps(A, B);
 }
 
-__m128 test_rcp_ss(__m128 x) {
-  // CHECK: define {{.*}} @test_rcp_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> {{.*}}, i32 1
-  // CHECK: extractelement <4 x float> {{.*}}, i32 2
-  // CHECK: extractelement <4 x float> {{.*}}, i32 3
-  return _mm_rcp_ss(x);
+__m128 test_mm_add_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_add_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fadd float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_add_ss(A, B);
 }
 
-__m128 test_sqrt_ss(__m128 x) {
-  // CHECK: define {{.*}} @test_sqrt_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> {{.*}}, i32 1
-  // CHECK: extractelement <4 x float> {{.*}}, i32 2
-  // CHECK: extractelement <4 x float> {{.*}}, i32 3
-  return _mm_sqrt_ss(x);
+__m128 test_mm_and_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_and_ps
+  // CHECK: and <4 x i32>
+  return _mm_and_ps(A, B);
 }
 
-__m128 test_loadl_pi(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_loadl_pi
-  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  return _mm_loadl_pi(x,y);
+__m128 test_mm_andnot_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_andnot_ps
+  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <4 x i32>
+  return _mm_andnot_ps(A, B);
 }
 
-__m128 test_loadh_pi(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_loadh_pi
-  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  return _mm_loadh_pi(x,y);
-}
-
-__m128 test_load_ss(void* y) {
-  // CHECK: define {{.*}} @test_load_ss
-  // CHECK: load float, float* {{.*}}, align 1{{$}}
-  return _mm_load_ss(y);
-}
-
-__m128 test_load1_ps(void* y) {
-  // CHECK: define {{.*}} @test_load1_ps
-  // CHECK: load float, float* {{.*}}, align 1{{$}}
-  return _mm_load1_ps(y);
-}
-
-void test_store_ss(__m128 x, void* y) {
-  // CHECK-LABEL: define void @test_store_ss
-  // CHECK: store {{.*}} float* {{.*}}, align 1{{$}}
-  _mm_store_ss(y, x);
-}
-
-__m128d test_load1_pd(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_load1_pd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_load1_pd(y);
-}
-
-__m128d test_loadr_pd(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_loadr_pd
-  // CHECK: load <2 x double>, <2 x double>* {{.*}}, align 16{{$}}
-  return _mm_loadr_pd(y);
-}
-
-__m128d test_load_sd(void* y) {
-  // CHECK: define {{.*}} @test_load_sd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_load_sd(y);
-}
-
-__m128d test_loadh_pd(__m128d x, void* y) {
-  // CHECK: define {{.*}} @test_loadh_pd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_loadh_pd(x, y);
-}
-
-__m128d test_loadl_pd(__m128d x, void* y) {
-  // CHECK: define {{.*}} @test_loadl_pd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_loadl_pd(x, y);
-}
-
-void test_store_sd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_store_sd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_store_sd(y, x);
-}
-
-void test_store1_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_store1_pd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_store1_pd(y, x);
-}
-
-void test_storer_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_storer_pd
-  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}}
-  _mm_storer_pd(y, x);
-}
-
-void test_storeh_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_storeh_pd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_storeh_pd(y, x);
-}
-
-void test_storel_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_storel_pd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_storel_pd(y, x);
-}
-
-__m128i test_loadl_epi64(void* y) {
-  // CHECK: define {{.*}} @test_loadl_epi64
-  // CHECK: load i64, i64* {{.*}}, align 1{{$}}
-  return _mm_loadl_epi64(y);
-}
-
-void test_storel_epi64(__m128i x, void* y) {
-  // CHECK-LABEL: define void @test_storel_epi64
-  // CHECK: store {{.*}} i64* {{.*}}, align 1{{$}}
-  _mm_storel_epi64(y, x);
-}
-
-void test_stream_si32(int x, void *y) {
-  // CHECK-LABEL: define void @test_stream_si32
-  // CHECK: store {{.*}} i32* {{.*}}, align 1, !nontemporal
-  _mm_stream_si32(y, x);
-}
-
-void test_stream_si64(long long x, void *y) {
-  // CHECK-LABEL: define void @test_stream_si64
-  // CHECK: store {{.*}} i64* {{.*}}, align 1, !nontemporal
-  _mm_stream_si64(y, x);
-}
-
-void test_stream_si128(__m128i x, void *y) {
-  // CHECK-LABEL: define void @test_stream_si128
-  // CHECK: store {{.*}} <2 x i64>* {{.*}}, align 16, !nontemporal
-  _mm_stream_si128(y, x);
-}
-
-void test_extract_epi16(__m128i __a) {
-  // CHECK-LABEL: define void @test_extract_epi16
-  // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
-  // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
-  _mm_extract_epi16(__a, 8);
+__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpeq_ps(__a, __b);
 }
 
 __m128 test_mm_cmpeq_ss(__m128 __a, __m128 __b) {
@@ -172,10 +50,45 @@
   return _mm_cmpeq_ss(__a, __b);
 }
 
-__m128 test_mm_cmplt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmplt_ss
+__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpge_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpge_ps(__a, __b);
+}
+
+__m128 test_mm_cmpge_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpge_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpge_ss(__a, __b);
+}
+
+__m128 test_mm_cmpgt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpgt_ps(__a, __b);
+}
+
+__m128 test_mm_cmpgt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_ss
   // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmplt_ss(__a, __b);
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpgt_ss(__a, __b);
+}
+
+__m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmple_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmple_ps(__a, __b);
 }
 
 __m128 test_mm_cmple_ss(__m128 __a, __m128 __b) {
@@ -184,10 +97,28 @@
   return _mm_cmple_ss(__a, __b);
 }
 
-__m128 test_mm_cmpunord_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpunord_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 3)
-  return _mm_cmpunord_ss(__a, __b);
+__m128 test_mm_cmplt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmplt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmplt_ps(__a, __b);
+}
+
+__m128 test_mm_cmplt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmplt_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
+  return _mm_cmplt_ss(__a, __b);
+}
+
+__m128 test_mm_cmpneq_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpneq_ps
+  // CHECK:         [[CMP:%.*]] = fcmp une <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpneq_ps(__a, __b);
 }
 
 __m128 test_mm_cmpneq_ss(__m128 __a, __m128 __b) {
@@ -196,10 +127,45 @@
   return _mm_cmpneq_ss(__a, __b);
 }
 
-__m128 test_mm_cmpnlt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_ss
+__m128 test_mm_cmpnge_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnge_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpnge_ps(__a, __b);
+}
+
+__m128 test_mm_cmpnge_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnge_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpnge_ss(__a, __b);
+}
+
+__m128 test_mm_cmpngt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpngt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpngt_ps(__a, __b);
+}
+
+__m128 test_mm_cmpngt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpngt_ss
   // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpnlt_ss(__a, __b);
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpngt_ss(__a, __b);
+}
+
+__m128 test_mm_cmpnle_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnle_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpnle_ps(__a, __b);
 }
 
 __m128 test_mm_cmpnle_ss(__m128 __a, __m128 __b) {
@@ -208,274 +174,632 @@
   return _mm_cmpnle_ss(__a, __b);
 }
 
+__m128 test_mm_cmpnlt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnlt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpnlt_ps(__a, __b);
+}
+
+__m128 test_mm_cmpnlt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnlt_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
+  return _mm_cmpnlt_ss(__a, __b);
+}
+
+__m128 test_mm_cmpord_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpord_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ord <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpord_ps(__a, __b);
+}
+
 __m128 test_mm_cmpord_ss(__m128 __a, __m128 __b) {
   // CHECK-LABEL: @test_mm_cmpord_ss
   // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
   return _mm_cmpord_ss(__a, __b);
 }
 
-__m128 test_mm_cmpgt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmpgt_ss(__a, __b);
-}
-
-__m128 test_mm_cmpge_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpge_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmpge_ss(__a, __b);
-}
-
-__m128 test_mm_cmpngt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpngt_ss(__a, __b);
-}
-
-__m128 test_mm_cmpnge_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnge_ss(__a, __b);
-}
-
-__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
-  return _mm_cmpeq_ps(__a, __b);
-}
-
-__m128 test_mm_cmplt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmplt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmplt_ps(__a, __b);
-}
-
-__m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmple_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmple_ps(__a, __b);
-}
-
 __m128 test_mm_cmpunord_ps(__m128 __a, __m128 __b) {
   // CHECK-LABEL: @test_mm_cmpunord_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 3)
+  // CHECK:         [[CMP:%.*]] = fcmp uno <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpunord_ps(__a, __b);
 }
 
-__m128 test_mm_cmpneq_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
-  return _mm_cmpneq_ps(__a, __b);
+__m128 test_mm_cmpunord_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpunord_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 3)
+  return _mm_cmpunord_ss(__a, __b);
 }
 
-__m128 test_mm_cmpnlt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpnlt_ps(__a, __b);
+int test_mm_comieq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comieq_ss
+  // CHECK: call i32 @llvm.x86.sse.comieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comieq_ss(A, B);
 }
 
-__m128 test_mm_cmpnle_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnle_ps(__a, __b);
+int test_mm_comige_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comige_ss
+  // CHECK: call i32 @llvm.x86.sse.comige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comige_ss(A, B);
 }
 
-__m128 test_mm_cmpord_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpord_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
-  return _mm_cmpord_ps(__a, __b);
+int test_mm_comigt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comigt_ss
+  // CHECK: call i32 @llvm.x86.sse.comigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comigt_ss(A, B);
 }
 
-__m128 test_mm_cmpgt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmpgt_ps(__a, __b);
+int test_mm_comile_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comile_ss
+  // CHECK: call i32 @llvm.x86.sse.comile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comile_ss(A, B);
 }
 
-__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpge_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmpge_ps(__a, __b);
+int test_mm_comilt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comilt_ss
+  // CHECK: call i32 @llvm.x86.sse.comilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comilt_ss(A, B);
 }
 
-__m128 test_mm_cmpngt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpngt_ps(__a, __b);
+int test_mm_comineq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comineq_ss
+  // CHECK: call i32 @llvm.x86.sse.comineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comineq_ss(A, B);
 }
 
-__m128 test_mm_cmpnge_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnge_ps(__a, __b);
+int test_mm_cvt_ss2si(__m128 A) {
+  // CHECK-LABEL: test_mm_cvt_ss2si
+  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
+  return _mm_cvt_ss2si(A);
 }
 
-__m128d test_mm_cmpeq_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
-  return _mm_cmpeq_sd(__a, __b);
+__m128 test_mm_cvtsi32_ss(__m128 A, int B) {
+  // CHECK-LABEL: test_mm_cvtsi32_ss
+  // CHECK: sitofp i32 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvtsi32_ss(A, B);
 }
 
-__m128d test_mm_cmplt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmplt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmplt_sd(__a, __b);
+__m128 test_mm_cvtsi64_ss(__m128 A, long long B) {
+  // CHECK-LABEL: test_mm_cvtsi64_ss
+  // CHECK: sitofp i64 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvtsi64_ss(A, B);
 }
 
-__m128d test_mm_cmple_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmple_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmple_sd(__a, __b);
+float test_mm_cvtss_f32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_f32
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  return _mm_cvtss_f32(A);
 }
 
-__m128d test_mm_cmpunord_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpunord_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
-  return _mm_cmpunord_sd(__a, __b);
+int test_mm_cvtss_si32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_si32
+  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
+  return _mm_cvtss_si32(A);
 }
 
-__m128d test_mm_cmpneq_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
-  return _mm_cmpneq_sd(__a, __b);
+long long test_mm_cvtss_si64(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_si64
+  // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
+  return _mm_cvtss_si64(A);
 }
 
-__m128d test_mm_cmpnlt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpnlt_sd(__a, __b);
+int test_mm_cvtt_ss2si(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtt_ss2si
+  // CHECK: call i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}})
+  return _mm_cvtt_ss2si(A);
 }
 
-__m128d test_mm_cmpnle_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnle_sd(__a, __b);
+int test_mm_cvttss_si32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvttss_si32
+  // CHECK: call i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}})
+  return _mm_cvttss_si32(A);
 }
 
-__m128d test_mm_cmpord_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpord_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
-  return _mm_cmpord_sd(__a, __b);
+long long test_mm_cvttss_si64(__m128 A) {
+  // CHECK-LABEL: test_mm_cvttss_si64
+  // CHECK: call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %{{.*}})
+  return _mm_cvttss_si64(A);
 }
 
-__m128d test_mm_cmpgt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmpgt_sd(__a, __b);
+__m128 test_mm_div_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_div_ps
+  // CHECK: fdiv <4 x float>
+  return _mm_div_ps(A, B);
 }
 
-__m128d test_mm_cmpge_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpge_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmpge_sd(__a, __b);
+__m128 test_mm_div_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_div_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fdiv float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_div_ss(A, B);
 }
 
-__m128d test_mm_cmpngt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpngt_sd(__a, __b);
+unsigned int test_MM_GET_EXCEPTION_MASK() {
+  // CHECK-LABEL: test_MM_GET_EXCEPTION_MASK
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 8064
+  return _MM_GET_EXCEPTION_MASK();
 }
 
-__m128d test_mm_cmpnge_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnge_sd(__a, __b);
+unsigned int test_MM_GET_EXCEPTION_STATE() {
+  // CHECK-LABEL: test_MM_GET_EXCEPTION_STATE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 63
+  return _MM_GET_EXCEPTION_STATE();
 }
 
-__m128d test_mm_cmpeq_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
-  return _mm_cmpeq_pd(__a, __b);
+unsigned int test_MM_GET_FLUSH_ZERO_MODE() {
+  // CHECK-LABEL: test_MM_GET_FLUSH_ZERO_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 32768
+  return _MM_GET_FLUSH_ZERO_MODE();
 }
 
-__m128d test_mm_cmplt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmplt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmplt_pd(__a, __b);
+unsigned int test_MM_GET_ROUNDING_MODE() {
+  // CHECK-LABEL: test_MM_GET_ROUNDING_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 24576
+  return _MM_GET_ROUNDING_MODE();
 }
 
-__m128d test_mm_cmple_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmple_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmple_pd(__a, __b);
+unsigned int test_mm_getcsr() {
+  // CHECK-LABEL: test_mm_getcsr
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: load i32
+  return _mm_getcsr();
 }
 
-__m128d test_mm_cmpunord_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpunord_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
-  return _mm_cmpunord_pd(__a, __b);
+__m128 test_mm_load_ps(float* y) {
+  // CHECK-LABEL: test_mm_load_ps
+  // CHECK: load <4 x float>, <4 x float>* {{.*}}, align 16
+  return _mm_load_ps(y);
 }
 
-__m128d test_mm_cmpneq_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
-  return _mm_cmpneq_pd(__a, __b);
+__m128 test_mm_load_ps1(float* y) {
+  // CHECK-LABEL: test_mm_load_ps1
+  // CHECK: load float, float* %{{.*}}, align 4
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
+  return _mm_load_ps1(y);
 }
 
-__m128d test_mm_cmpnlt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpnlt_pd(__a, __b);
+__m128 test_mm_load_ss(float* y) {
+  // CHECK-LABEL: test_mm_load_ss
+  // CHECK: load float, float* {{.*}}, align 1{{$}}
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
+  return _mm_load_ss(y);
 }
 
-__m128d test_mm_cmpnle_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnle_pd(__a, __b);
+__m128 test_mm_load1_ps(float* y) {
+  // CHECK-LABEL: test_mm_load1_ps
+  // CHECK: load float, float* %{{.*}}, align 4
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
+  return _mm_load1_ps(y);
 }
 
-__m128d test_mm_cmpord_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpord_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
-  return _mm_cmpord_pd(__a, __b);
+__m128 test_mm_loadh_pi(__m128 x, __m64* y) {
+  // CHECK-LABEL: test_mm_loadh_pi
+  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm_loadh_pi(x,y);
 }
 
-__m128d test_mm_cmpgt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmpgt_pd(__a, __b);
+__m128 test_mm_loadl_pi(__m128 x, __m64* y) {
+  // CHECK-LABEL: test_mm_loadl_pi
+  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm_loadl_pi(x,y);
 }
 
-__m128d test_mm_cmpge_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpge_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmpge_pd(__a, __b);
+__m128 test_mm_loadr_ps(float* A) {
+  // CHECK-LABEL: test_mm_loadr_ps
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 16
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  return _mm_loadr_ps(A);
 }
 
-__m128d test_mm_cmpngt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpngt_pd(__a, __b);
+__m128 test_mm_loadu_ps(float* A) {
+  // CHECK-LABEL: test_mm_loadu_ps
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  return _mm_loadu_ps(A);
 }
 
-__m128d test_mm_cmpnge_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnge_pd(__a, __b);
+__m128 test_mm_max_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_max_ps
+  // CHECK: @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_max_ps(A, B);
 }
 
-__m128 test_mm_slli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_slli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
-  return _mm_slli_si128(a, 5);
+__m128 test_mm_max_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_max_ss
+  // CHECK: @llvm.x86.sse.max.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_max_ss(A, B);
 }
 
-__m128 test_mm_bslli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_bslli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
-  return _mm_bslli_si128(a, 5);
+__m128 test_mm_min_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_min_ps
+  // CHECK: @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_min_ps(A, B);
 }
 
-__m128 test_mm_srli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_srli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
-  return _mm_srli_si128(a, 5);
+__m128 test_mm_min_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_min_ss
+  // CHECK: @llvm.x86.sse.min.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_min_ss(A, B);
 }
 
-__m128 test_mm_bsrli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_bsrli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
-  return _mm_bsrli_si128(a, 5);
+__m128 test_mm_move_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_move_ss
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_move_ss(A, B);
+}
+
+__m128 test_mm_movehl_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_movehl_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  return _mm_movehl_ps(A, B);
+}
+
+__m128 test_mm_movelh_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_movelh_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm_movelh_ps(A, B);
+}
+
+int test_mm_movemask_ps(__m128 A) {
+  // CHECK-LABEL: test_mm_movemask_ps
+  // CHECK: call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %{{.*}})
+  return _mm_movemask_ps(A);
+}
+
+__m128 test_mm_mul_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_mul_ps
+  // CHECK: fmul <4 x float>
+  return _mm_mul_ps(A, B);
+}
+
+__m128 test_mm_mul_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_mul_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fmul float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_mul_ss(A, B);
+}
+
+__m128 test_mm_or_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_or_ps
+  // CHECK: or <4 x i32>
+  return _mm_or_ps(A, B);
+}
+
+void test_mm_prefetch(char const* p) {
+  // CHECK-LABEL: test_mm_prefetch
+  // CHECK: call void @llvm.prefetch(i8* {{.*}}, i32 0, i32 0, i32 1)
+  _mm_prefetch(p, 0);
+}
+
+__m128 test_mm_rcp_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_rcp_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> {{.*}})
+  return _mm_rcp_ps(x);
+}
+
+__m128 test_mm_rcp_ss(__m128 x) {
+  // CHECK-LABEL: test_mm_rcp_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> {{.*}})
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: extractelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: extractelement <4 x float> {{.*}}, i32 3
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_rcp_ss(x);
+}
+
+__m128 test_mm_rsqrt_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_rsqrt_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> {{.*}})
+  return _mm_rsqrt_ps(x);
+}
+
+__m128 test_mm_rsqrt_ss(__m128 x) {
+  // CHECK-LABEL: test_mm_rsqrt_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> {{.*}})
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: extractelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: extractelement <4 x float> {{.*}}, i32 3
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_rsqrt_ss(x);
+}
+
+void test_MM_SET_EXCEPTION_MASK(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_EXCEPTION_MASK
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -8065
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_EXCEPTION_MASK(A);
+}
+
+void test_MM_SET_EXCEPTION_STATE(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_EXCEPTION_STATE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -64
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_EXCEPTION_STATE(A);
+}
+
+void test_MM_SET_FLUSH_ZERO_MODE(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_FLUSH_ZERO_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -32769
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_FLUSH_ZERO_MODE(A);
+}
+
+__m128 test_mm_set_ps(float A, float B, float C, float D) {
+  // CHECK-LABEL: test_mm_set_ps
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_set_ps(A, B, C, D);
+}
+
+__m128 test_mm_set_ps1(float A) {
+  // CHECK-LABEL: test_mm_set_ps1
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_set_ps1(A);
+}
+
+void test_MM_SET_ROUNDING_MODE(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_ROUNDING_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -24577
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_ROUNDING_MODE(A);
+}
+
+__m128 test_mm_set_ss(float A) {
+  // CHECK-LABEL: test_mm_set_ss
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 3
+  return _mm_set_ss(A);
+}
+
+__m128 test_mm_set1_ps(float A) {
+  // CHECK-LABEL: test_mm_set1_ps
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_set1_ps(A);
+}
+
+void test_mm_setcsr(unsigned int A) {
+  // CHECK-LABEL: test_mm_setcsr
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _mm_setcsr(A);
+}
+
+__m128 test_mm_setr_ps(float A, float B, float C, float D) {
+  // CHECK-LABEL: test_mm_setr_ps
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_setr_ps(A, B, C, D);
+}
+
+__m128 test_mm_setzero_ps() {
+  // CHECK-LABEL: test_mm_setzero_ps
+  // CHECK: store <4 x float> zeroinitializer
+  return _mm_setzero_ps();
+}
+
+void test_mm_sfence() {
+  // CHECK-LABEL: test_mm_sfence
+  // CHECK: call void @llvm.x86.sse.sfence()
+  _mm_sfence();
+}
+
+__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_shuffle_ps
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
+  return _mm_shuffle_ps(A, B, 0);
+}
+
+__m128 test_mm_sqrt_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_sqrt_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> {{.*}})
+  return _mm_sqrt_ps(x);
+}
+
+__m128 test_sqrt_ss(__m128 x) { // expects the sqrt.ss intrinsic call, then a lane-by-lane rebuild of the result
+  // CHECK-LABEL: test_sqrt_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ss
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: extractelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: extractelement <4 x float> {{.*}}, i32 3
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_sqrt_ss(x);
+}
+
+void test_mm_store_ps(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store_ps
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* {{.*}}, align 16
+  _mm_store_ps(x, y);
+}
+
+void test_mm_store_ps1(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store_ps1
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 16
+  _mm_store_ps1(x, y);
+}
+
+void test_mm_store_ss(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store_ss
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: store float %{{.*}}, float* {{.*}}, align 1{{$}}
+  _mm_store_ss(x, y);
+}
+
+void test_mm_store1_ps(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store1_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 16
+  _mm_store1_ps(x, y);
+}
+
+void test_mm_storeh_pi(__m64* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storeh_pi
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
+  // CHECK: extractelement <2 x i64> %{{.*}}, i64 1
+  // CHECK: store i64 %{{.*}}, i64* {{.*}}
+  _mm_storeh_pi(x, y);
+}
+
+void test_mm_storel_pi(__m64* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storel_pi
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
+  // CHECK: extractelement <2 x i64> %{{.*}}, i64 0
+  // CHECK: store i64 %{{.*}}, i64* {{.*}}
+  _mm_storel_pi(x, y);
+}
+
+void test_mm_storer_ps(float* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storer_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* {{.*}}, align 16
+  _mm_storer_ps(x, y);
+}
+
+void test_mm_storeu_ps(float* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storeu_ps
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm_storeu_ps(x, y);
+}
+
+void test_mm_stream_ps(float*A, __m128 B) { // B is a float vector, matching the <4 x float> store expected below
+  // CHECK-LABEL: test_mm_stream_ps
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 16, !nontemporal
+  _mm_stream_ps(A, B);
+}
+
+__m128 test_mm_sub_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_sub_ps
+  // CHECK: fsub <4 x float>
+  return _mm_sub_ps(A, B);
+}
+
+__m128 test_mm_sub_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_sub_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fsub float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_sub_ss(A, B);
+}
+
+void test_MM_TRANSPOSE4_PS(__m128 *A, __m128 *B, __m128 *C, __m128 *D) {
+  // CHECK-LABEL: test_MM_TRANSPOSE4_PS
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  _MM_TRANSPOSE4_PS(*A, *B, *C, *D);
+}
+
+int test_mm_ucomieq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomieq_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomieq_ss(A, B);
+}
+
+int test_mm_ucomige_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomige_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomige_ss(A, B);
+}
+
+int test_mm_ucomigt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomigt_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomigt_ss(A, B);
+}
+
+int test_mm_ucomile_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomile_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomile_ss(A, B);
+}
+
+int test_mm_ucomilt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomilt_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomilt_ss(A, B);
+}
+
+int test_mm_ucomineq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomineq_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomineq_ss(A, B);
 }
 
 __m128 test_mm_undefined_ps() {
@@ -484,38 +808,20 @@
   return _mm_undefined_ps();
 }
 
-__m128d test_mm_undefined_pd() {
-  // CHECK-LABEL: @test_mm_undefined_pd
-  // CHECK: ret <2 x double> undef
-  return _mm_undefined_pd();
+__m128 test_mm_unpackhi_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_unpackhi_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  return _mm_unpackhi_ps(A, B);
 }
 
-__m128i test_mm_undefined_si128() {
-  // CHECK-LABEL: @test_mm_undefined_si128
-  // CHECK: ret <2 x i64> undef
-  return _mm_undefined_si128();
+__m128 test_mm_unpacklo_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_unpacklo_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  return _mm_unpacklo_ps(A, B);
 }
 
-__m64 test_mm_add_si64(__m64 __a, __m64 __b) {
-  // CHECK-LABEL: @test_mm_add_si64
-  // CHECK @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
-  return _mm_add_si64(__a, __b);
-}
-
-__m64 test_mm_sub_si64(__m64 __a, __m64 __b) {
-  // CHECK-LABEL: @test_mm_sub_si64
-  // CHECK @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
-  return _mm_sub_si64(__a, __b);
-}
-
-__m64 test_mm_mul_su32(__m64 __a, __m64 __b) {
-  // CHECK-LABEL: @test_mm_mul_su32
-  // CHECK @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
-  return _mm_mul_su32(__a, __b);
-}
-
-void test_mm_pause() {
-  // CHECK-LABEL: @test_mm_pause
-  // CHECK @llvm.x86.sse2.pause()
-  return _mm_pause();
+__m128 test_mm_xor_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_xor_ps
+  // CHECK: xor <4 x i32>
+  return _mm_xor_ps(A, B);
 }
diff --git a/test/CodeGen/sse2-builtins.c b/test/CodeGen/sse2-builtins.c
index 4ceb93a..ffecb28 100644
--- a/test/CodeGen/sse2-builtins.c
+++ b/test/CodeGen/sse2-builtins.c
@@ -1,11 +1,13 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+
 __m128i test_mm_add_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_add_epi8
   // CHECK: add <16 x i8>
@@ -38,31 +40,34 @@
 
 __m128d test_mm_add_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_add_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fadd double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_add_sd(A, B);
 }
 
 __m128i test_mm_adds_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.padds.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_adds_epi8(A, B);
 }
 
 __m128i test_mm_adds_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.padds.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_adds_epi16(A, B);
 }
 
 __m128i test_mm_adds_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.paddus.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_adds_epu8(A, B);
 }
 
 __m128i test_mm_adds_epu16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.paddus.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_adds_epu16(A, B);
 }
 
@@ -78,15 +83,29 @@
   return _mm_and_si128(A, B);
 }
 
+__m128d test_mm_andnot_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_andnot_pd
+  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <4 x i32>
+  return _mm_andnot_pd(A, B);
+}
+
+__m128i test_mm_andnot_si128(__m128i A, __m128i B) {
+  // CHECK-LABEL: test_mm_andnot_si128
+  // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  // CHECK: and <2 x i64>
+  return _mm_andnot_si128(A, B);
+}
+
 __m128i test_mm_avg_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_avg_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_avg_epu8(A, B);
 }
 
 __m128i test_mm_avg_epu16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_avg_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_avg_epu16(A, B);
 }
 
@@ -102,6 +121,42 @@
   return _mm_bsrli_si128(A, 5);
 }
 
+__m128 test_mm_castpd_ps(__m128d A) {
+  // CHECK-LABEL: test_mm_castpd_ps
+  // CHECK: bitcast <2 x double> %{{.*}} to <4 x float>
+  return _mm_castpd_ps(A);
+}
+
+__m128i test_mm_castpd_si128(__m128d A) {
+  // CHECK-LABEL: test_mm_castpd_si128
+  // CHECK: bitcast <2 x double> %{{.*}} to <2 x i64>
+  return _mm_castpd_si128(A);
+}
+
+__m128d test_mm_castps_pd(__m128 A) {
+  // CHECK-LABEL: test_mm_castps_pd
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x double>
+  return _mm_castps_pd(A);
+}
+
+__m128i test_mm_castps_si128(__m128 A) {
+  // CHECK-LABEL: test_mm_castps_si128
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
+  return _mm_castps_si128(A);
+}
+
+__m128d test_mm_castsi128_pd(__m128i A) {
+  // CHECK-LABEL: test_mm_castsi128_pd
+  // CHECK: bitcast <2 x i64> %{{.*}} to <2 x double>
+  return _mm_castsi128_pd(A);
+}
+
+__m128 test_mm_castsi128_ps(__m128i A) {
+  // CHECK-LABEL: test_mm_castsi128_ps
+  // CHECK: bitcast <2 x i64> %{{.*}} to <4 x float>
+  return _mm_castsi128_ps(A);
+}
+
 void test_mm_clflush(void* A) {
   // CHECK-LABEL: test_mm_clflush
   // CHECK: call void @llvm.x86.sse2.clflush(i8* %{{.*}})
@@ -128,7 +183,10 @@
 
 __m128d test_mm_cmpeq_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpeq_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpeq_pd(A, B);
 }
 
@@ -140,13 +198,20 @@
 
 __m128d test_mm_cmpge_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpge_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpge_pd(A, B);
 }
 
 __m128d test_mm_cmpge_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpge_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpge_sd(A, B);
 }
 
@@ -170,19 +235,29 @@
 
 __m128d test_mm_cmpgt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpgt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
+  // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpgt_pd(A, B);
 }
 
 __m128d test_mm_cmpgt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpgt_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpgt_sd(A, B);
 }
 
 __m128d test_mm_cmple_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmple_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmple_pd(A, B);
 }
 
@@ -212,7 +287,10 @@
 
 __m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmplt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
+  // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmplt_pd(A, B);
 }
 
@@ -224,7 +302,10 @@
 
 __m128d test_mm_cmpneq_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpneq_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
+  // CHECK:         [[CMP:%.*]] = fcmp une <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpneq_pd(A, B);
 }
 
@@ -236,31 +317,48 @@
 
 __m128d test_mm_cmpnge_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnge_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpnge_pd(A, B);
 }
 
 __m128d test_mm_cmpnge_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnge_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpnge_sd(A, B);
 }
 
 __m128d test_mm_cmpngt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpngt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
+  // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpngt_pd(A, B);
 }
 
 __m128d test_mm_cmpngt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpngt_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpngt_sd(A, B);
 }
 
 __m128d test_mm_cmpnle_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnle_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpnle_pd(A, B);
 }
 
@@ -272,7 +370,10 @@
 
 __m128d test_mm_cmpnlt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnlt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
+  // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpnlt_pd(A, B);
 }
 
@@ -284,7 +385,10 @@
 
 __m128d test_mm_cmpord_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpord_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
+  // CHECK:         [[CMP:%.*]] = fcmp ord <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpord_pd(A, B);
 }
 
@@ -296,7 +400,10 @@
 
 __m128d test_mm_cmpunord_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpunord_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
+  // CHECK:         [[CMP:%.*]] = fcmp uno <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpunord_pd(A, B);
 }
 
@@ -308,73 +415,75 @@
 
 int test_mm_comieq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comieq_sd
-  // CHECK: call i32 @llvm.x86.sse2.comieq.sd
+  // CHECK: call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comieq_sd(A, B);
 }
 
 int test_mm_comige_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comige_sd
-  // CHECK: call i32 @llvm.x86.sse2.comige.sd
+  // CHECK: call i32 @llvm.x86.sse2.comige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comige_sd(A, B);
 }
 
 int test_mm_comigt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comigt_sd
-  // CHECK: call i32 @llvm.x86.sse2.comigt.sd
+  // CHECK: call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comigt_sd(A, B);
 }
 
 int test_mm_comile_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comile_sd
-  // CHECK: call i32 @llvm.x86.sse2.comile.sd
+  // CHECK: call i32 @llvm.x86.sse2.comile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comile_sd(A, B);
 }
 
 int test_mm_comilt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comilt_sd
-  // CHECK: call i32 @llvm.x86.sse2.comilt.sd
+  // CHECK: call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comilt_sd(A, B);
 }
 
 int test_mm_comineq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comineq_sd
-  // CHECK: call i32 @llvm.x86.sse2.comineq.sd
+  // CHECK: call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comineq_sd(A, B);
 }
 
 __m128d test_mm_cvtepi32_pd(__m128i A) {
   // CHECK-LABEL: test_mm_cvtepi32_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cvtdq2pd
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: sitofp <2 x i32> %{{.*}} to <2 x double>
   return _mm_cvtepi32_pd(A);
 }
 
 __m128 test_mm_cvtepi32_ps(__m128i A) {
   // CHECK-LABEL: test_mm_cvtepi32_ps
-  // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps
+  // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %{{.*}})
   return _mm_cvtepi32_ps(A);
 }
 
 __m128i test_mm_cvtpd_epi32(__m128d A) {
   // CHECK-LABEL: test_mm_cvtpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %{{.*}})
   return _mm_cvtpd_epi32(A);
 }
 
 __m128 test_mm_cvtpd_ps(__m128d A) {
   // CHECK-LABEL: test_mm_cvtpd_ps
-  // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps
+  // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %{{.*}})
   return _mm_cvtpd_ps(A);
 }
 
 __m128i test_mm_cvtps_epi32(__m128 A) {
   // CHECK-LABEL: test_mm_cvtps_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %{{.*}})
   return _mm_cvtps_epi32(A);
 }
 
 __m128d test_mm_cvtps_pd(__m128 A) {
   // CHECK-LABEL: test_mm_cvtps_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cvtps2pd
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
   return _mm_cvtps_pd(A);
 }
 
@@ -386,19 +495,19 @@
 
 int test_mm_cvtsd_si32(__m128d A) {
   // CHECK-LABEL: test_mm_cvtsd_si32
-  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si
+  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
   return _mm_cvtsd_si32(A);
 }
 
 long long test_mm_cvtsd_si64(__m128d A) {
   // CHECK-LABEL: test_mm_cvtsd_si64
-  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64
+  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
   return _mm_cvtsd_si64(A);
 }
 
 __m128 test_mm_cvtsd_ss(__m128 A, __m128d B) {
   // CHECK-LABEL: test_mm_cvtsd_ss
-  // CHECK: fptrunc double %{{.*}} to float
+  // CHECK: call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %{{.*}}, <2 x double> %{{.*}})
   return _mm_cvtsd_ss(A, B);
 }
 
@@ -424,6 +533,9 @@
 __m128i test_mm_cvtsi32_si128(int A) {
   // CHECK-LABEL: test_mm_cvtsi32_si128
   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3
   return _mm_cvtsi32_si128(A);
 }
 
@@ -437,6 +549,7 @@
 __m128i test_mm_cvtsi64_si128(long long A) {
   // CHECK-LABEL: test_mm_cvtsi64_si128
   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
   return _mm_cvtsi64_si128(A);
 }
 
@@ -450,27 +563,25 @@
 
 __m128i test_mm_cvttpd_epi32(__m128d A) {
   // CHECK-LABEL: test_mm_cvttpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %{{.*}})
   return _mm_cvttpd_epi32(A);
 }
 
 __m128i test_mm_cvttps_epi32(__m128 A) {
   // CHECK-LABEL: test_mm_cvttps_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %{{.*}})
   return _mm_cvttps_epi32(A);
 }
 
 int test_mm_cvttsd_si32(__m128d A) {
   // CHECK-LABEL: test_mm_cvttsd_si32
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fptosi double %{{.*}} to i32
+  // CHECK: call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %{{.*}})
   return _mm_cvttsd_si32(A);
 }
 
 long long test_mm_cvttsd_si64(__m128d A) {
   // CHECK-LABEL: test_mm_cvttsd_si64
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fptosi double %{{.*}} to i64
+  // CHECK: call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %{{.*}})
   return _mm_cvttsd_si64(A);
 }
 
@@ -482,7 +593,10 @@
 
 __m128d test_mm_div_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_div_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fdiv double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_div_sd(A, B);
 }
 
@@ -491,10 +605,11 @@
   // CHECK-LABEL: test_mm_extract_epi16
   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
   // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
-  return _mm_extract_epi16(A, 8);
+  // CHECK: zext i16 %{{.*}} to i32
+  return _mm_extract_epi16(A, 9);
 }
 
-__m128i test_mm_insert_epi16(__m128i A, short B) {
+__m128i test_mm_insert_epi16(__m128i A, int B) {
   // CHECK-LABEL: test_mm_insert_epi16
   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
   // CHECK: insertelement <8 x i16> %{{.*}}, i32 [[x]]
@@ -513,9 +628,17 @@
   return _mm_load_pd(A);
 }
 
+__m128d test_mm_load_pd1(double const* A) {
+  // CHECK-LABEL: test_mm_load_pd1
+  // CHECK: load double, double* %{{.*}}, align 8
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_load_pd1(A);
+}
+
 __m128d test_mm_load_sd(double const* A) {
   // CHECK-LABEL: test_mm_load_sd
-  // CHECK: load double, double* %{{.*}}, align 1
+  // CHECK: load double, double* %{{.*}}, align 1{{$}}
   return _mm_load_sd(A);
 }
 
@@ -536,9 +659,27 @@
 __m128d test_mm_loadh_pd(__m128d x, void* y) {
   // CHECK-LABEL: test_mm_loadh_pd
   // CHECK: load double, double* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_loadh_pd(x, y);
 }
 
+__m128i test_mm_loadl_epi64(__m128i* y) {
+  // CHECK-LABEL: test_mm_loadl_epi64
+  // CHECK: load i64, i64* {{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x i64> undef, i64 {{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> {{.*}}, i64 0, i32 1
+  return _mm_loadl_epi64(y);
+}
+
+__m128d test_mm_loadl_pd(__m128d x, void* y) {
+  // CHECK-LABEL: test_mm_loadl_pd
+  // CHECK: load double, double* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_loadl_pd(x, y);
+}
+
 __m128d test_mm_loadr_pd(double const* A) {
   // CHECK-LABEL: test_mm_loadr_pd
   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 16
@@ -548,16 +689,24 @@
 
 __m128d test_mm_loadu_pd(double const* A) {
   // CHECK-LABEL: test_mm_loadu_pd
-  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
   return _mm_loadu_pd(A);
 }
 
 __m128i test_mm_loadu_si128(__m128i const* A) {
   // CHECK-LABEL: test_mm_loadu_si128
-  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
   return _mm_loadu_si128(A);
 }
 
+__m128i test_mm_loadu_si64(void const* A) {
+  // CHECK-LABEL: test_mm_loadu_si64
+  // CHECK: load i64, i64* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
+  return _mm_loadu_si64(A);
+}
+
 __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_madd_epi16
   // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
@@ -572,13 +721,15 @@
 
 __m128i test_mm_max_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_max_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
   return _mm_max_epi16(A, B);
 }
 
 __m128i test_mm_max_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_max_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_max_epu8(A, B);
 }
 
@@ -602,13 +753,15 @@
 
 __m128i test_mm_min_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_min_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
   return _mm_min_epi16(A, B);
 }
 
 __m128i test_mm_min_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_min_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_min_epu8(A, B);
 }
 
@@ -624,6 +777,21 @@
   return _mm_min_sd(A, B);
 }
 
+__m128i test_mm_move_epi64(__m128i A) {
+  // CHECK-LABEL: test_mm_move_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  return _mm_move_epi64(A);
+}
+
+__m128d test_mm_move_sd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_move_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_move_sd(A, B);
+}
+
 int test_mm_movemask_epi8(__m128i A) {
   // CHECK-LABEL: test_mm_movemask_epi8
   // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
@@ -650,7 +818,10 @@
 
 __m128d test_mm_mul_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_mul_sd
-  // CHECK: fmul double %{{.*}}, %{{.*}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fmul double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_mul_sd(A, B);
 }
 
@@ -714,6 +885,206 @@
   return _mm_sad_epu8(A, B);
 }
 
+__m128i test_mm_set_epi8(char A, char B, char C, char D,
+                         char E, char F, char G, char H,
+                         char I, char J, char K, char L,
+                         char M, char N, char O, char P) {
+  // CHECK-LABEL: test_mm_set_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  return _mm_set_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
+}
+
+__m128i test_mm_set_epi16(short A, short B, short C, short D,
+                          short E, short F, short G, short H) {
+  // CHECK-LABEL: test_mm_set_epi16
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  return _mm_set_epi16(A, B, C, D, E, F, G, H);
+}
+
+__m128i test_mm_set_epi32(int A, int B, int C, int D) {
+  // CHECK-LABEL: test_mm_set_epi32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_set_epi32(A, B, C, D);
+}
+
+__m128i test_mm_set_epi64(__m64 A, __m64 B) {
+  // CHECK-LABEL: test_mm_set_epi64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set_epi64(A, B);
+}
+
+__m128i test_mm_set_epi64x(long long A, long long B) {
+  // CHECK-LABEL: test_mm_set_epi64x
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set_epi64x(A, B);
+}
+
+__m128d test_mm_set_pd(double A, double B) {
+  // CHECK-LABEL: test_mm_set_pd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_set_pd(A, B);
+}
+
+__m128d test_mm_set_sd(double A) {
+  // CHECK-LABEL: test_mm_set_sd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
+  return _mm_set_sd(A);
+}
+
+__m128i test_mm_set1_epi8(char A) {
+  // CHECK-LABEL: test_mm_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  return _mm_set1_epi8(A);
+}
+
+__m128i test_mm_set1_epi16(short A) {
+  // CHECK-LABEL: test_mm_set1_epi16
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  return _mm_set1_epi16(A);
+}
+
+__m128i test_mm_set1_epi32(int A) {
+  // CHECK-LABEL: test_mm_set1_epi32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_set1_epi32(A);
+}
+
+__m128i test_mm_set1_epi64(__m64 A) {
+  // CHECK-LABEL: test_mm_set1_epi64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set1_epi64(A);
+}
+
+__m128i test_mm_set1_epi64x(long long A) {
+  // CHECK-LABEL: test_mm_set1_epi64x
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set1_epi64x(A);
+}
+
+__m128d test_mm_set1_pd(double A) {
+  // CHECK-LABEL: test_mm_set1_pd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_set1_pd(A);
+}
+
+__m128i test_mm_setr_epi8(char A, char B, char C, char D,
+                          char E, char F, char G, char H,
+                          char I, char J, char K, char L,
+                          char M, char N, char O, char P) {
+  // CHECK-LABEL: test_mm_setr_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  return _mm_setr_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
+}
+
+__m128i test_mm_setr_epi16(short A, short B, short C, short D,
+                           short E, short F, short G, short H) {
+  // CHECK-LABEL: test_mm_setr_epi16
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  return _mm_setr_epi16(A, B, C, D, E, F, G, H);
+}
+
+__m128i test_mm_setr_epi32(int A, int B, int C, int D) {
+  // CHECK-LABEL: test_mm_setr_epi32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_setr_epi32(A, B, C, D);
+}
+
+__m128i test_mm_setr_epi64(__m64 A, __m64 B) {
+  // CHECK-LABEL: test_mm_setr_epi64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_setr_epi64(A, B);
+}
+
+__m128d test_mm_setr_pd(double A, double B) {
+  // CHECK-LABEL: test_mm_setr_pd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_setr_pd(A, B);
+}
+
 __m128d test_mm_setzero_pd() {
   // CHECK-LABEL: test_mm_setzero_pd
   // CHECK: store <2 x double> zeroinitializer
@@ -752,37 +1123,37 @@
 
 __m128i test_mm_sll_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sll_epi16(A, B);
 }
 
 __m128i test_mm_sll_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sll_epi32(A, B);
 }
 
 __m128i test_mm_sll_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sll_epi64(A, B);
 }
 
 __m128i test_mm_slli_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi16(A, 1);
 }
 
 __m128i test_mm_slli_epi32(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi32(A, 1);
 }
 
 __m128i test_mm_slli_epi64(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi64(A, 1);
 }
 
@@ -792,6 +1163,12 @@
   return _mm_slli_si128(A, 5);
 }
 
+__m128i test_mm_slli_si128_2(__m128i A) {
+  // CHECK-LABEL: test_mm_slli_si128_2
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm_slli_si128(A, 17);
+}
+
 __m128d test_mm_sqrt_pd(__m128d A) {
   // CHECK-LABEL: test_mm_sqrt_pd
   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
@@ -801,66 +1178,70 @@
 __m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_sqrt_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %{{.*}})
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_sqrt_sd(A, B);
 }
 
 __m128i test_mm_sra_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sra_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sra_epi16(A, B);
 }
 
 __m128i test_mm_sra_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sra_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sra_epi32(A, B);
 }
 
 __m128i test_mm_srai_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_srai_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi16(A, 1);
 }
 
 __m128i test_mm_srai_epi32(__m128i A) {
   // CHECK-LABEL: test_mm_srai_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi32(A, 1);
 }
 
 __m128i test_mm_srl_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_srl_epi16(A, B);
 }
 
 __m128i test_mm_srl_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srl_epi32(A, B);
 }
 
 __m128i test_mm_srl_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_srl_epi64(A, B);
 }
 
 __m128i test_mm_srli_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi16(A, 1);
 }
 
 __m128i test_mm_srli_epi32(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi32(A, 1);
 }
 
 __m128i test_mm_srli_epi64(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi64(A, 1);
 }
 
@@ -870,14 +1251,28 @@
   return _mm_srli_si128(A, 5);
 }
 
+__m128i test_mm_srli_si128_2(__m128i A) {
+  // CHECK-LABEL: test_mm_srli_si128_2
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  return _mm_srli_si128(A, 17);
+}
+
 void test_mm_store_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_store_pd
   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
   _mm_store_pd(A, B);
 }
 
+void test_mm_store_pd1(double* x, __m128d y) {
+  // CHECK-LABEL: test_mm_store_pd1
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* {{.*}}, align 16
+  _mm_store_pd1(x, y);
+}
+
 void test_mm_store_sd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_store_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   _mm_store_sd(A, B);
 }
@@ -888,27 +1283,52 @@
   _mm_store_si128(A, B);
 }
 
+void test_mm_store1_pd(double* x, __m128d y) {
+  // CHECK-LABEL: test_mm_store1_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
+  _mm_store1_pd(x, y);
+}
+
 void test_mm_storeh_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_storeh_pd
-  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   _mm_storeh_pd(A, B);
 }
 
+void test_mm_storel_epi64(__m128i x, void* y) {
+  // CHECK-LABEL: test_mm_storel_epi64
+  // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
+  // CHECK: store {{.*}} i64* {{.*}}, align 1{{$}}
+  _mm_storel_epi64(y, x);
+}
+
 void test_mm_storel_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_storel_pd
-  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   _mm_storel_pd(A, B);
 }
 
+void test_mm_storer_pd(__m128d A, double* B) {
+  // CHECK-LABEL: test_mm_storer_pd
+  // CHECK: shufflevector <2 x double> {{.*}}, <2 x double> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}}
+  _mm_storer_pd(B, A);
+}
+
 void test_mm_storeu_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_storeu_pd
-  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1
+  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm_storeu_pd(A, B);
 }
 
 void test_mm_storeu_si128(__m128i* A, __m128i B) {
   // CHECK-LABEL: test_mm_storeu_si128
-  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1
+  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm_storeu_si128(A, B);
 }
 
@@ -968,70 +1388,85 @@
 
 __m128d test_mm_sub_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_sub_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fsub double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_sub_sd(A, B);
 }
 
 __m128i test_mm_subs_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.psubs.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_subs_epi8(A, B);
 }
 
 __m128i test_mm_subs_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psubs.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_subs_epi16(A, B);
 }
 
 __m128i test_mm_subs_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.psubus.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_subs_epu8(A, B);
 }
 
 __m128i test_mm_subs_epu16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psubus.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_subs_epu16(A, B);
 }
 
 int test_mm_ucomieq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomieq_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomieq_sd(A, B);
 }
 
 int test_mm_ucomige_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomige_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomige.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomige_sd(A, B);
 }
 
 int test_mm_ucomigt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomigt_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomigt_sd(A, B);
 }
 
 int test_mm_ucomile_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomile_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomile.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomile_sd(A, B);
 }
 
 int test_mm_ucomilt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomilt_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomilt_sd(A, B);
 }
 
 int test_mm_ucomineq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomineq_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomineq_sd(A, B);
 }
 
+__m128d test_mm_undefined_pd() {
+  // CHECK-LABEL: @test_mm_undefined_pd
+  // CHECK: ret <2 x double> undef
+  return _mm_undefined_pd();
+}
+
+__m128i test_mm_undefined_si128() {
+  // CHECK-LABEL: @test_mm_undefined_si128
+  // CHECK: ret <2 x i64> undef
+  return _mm_undefined_si128();
+}
+
 __m128i test_mm_unpackhi_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpackhi_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
diff --git a/test/CodeGen/sse3-builtins.c b/test/CodeGen/sse3-builtins.c
index 71a34e9..ee6ff8d 100644
--- a/test/CodeGen/sse3-builtins.c
+++ b/test/CodeGen/sse3-builtins.c
@@ -1,55 +1,59 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse3 -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
+
 __m128d test_mm_addsub_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_addsub_pd
-  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd
+  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_addsub_pd(A, B);
 }
 
 __m128 test_mm_addsub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_addsub_ps
-  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps
+  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_addsub_ps(A, B);
 }
 
 __m128d test_mm_hadd_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_hadd_pd
-  // CHECK: call <2 x double> @llvm.x86.sse3.hadd.pd
+  // CHECK: call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_hadd_pd(A, B);
 }
 
 __m128 test_mm_hadd_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_hadd_ps
-  // CHECK: call <4 x float> @llvm.x86.sse3.hadd.ps
+  // CHECK: call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_hadd_ps(A, B);
 }
 
 __m128d test_mm_hsub_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_hsub_pd
-  // CHECK: call <2 x double> @llvm.x86.sse3.hsub.pd
+  // CHECK: call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_hsub_pd(A, B);
 }
 
 __m128 test_mm_hsub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_hsub_ps
-  // CHECK: call <4 x float> @llvm.x86.sse3.hsub.ps
+  // CHECK: call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_hsub_ps(A, B);
 }
 
 __m128i test_mm_lddqu_si128(__m128i const* P) {
   // CHECK-LABEL: test_mm_lddqu_si128
-  // CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq
+  // CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %{{.*}})
   return _mm_lddqu_si128(P);
 }
 
 __m128d test_mm_loaddup_pd(double const* P) {
   // CHECK-LABEL: test_mm_loaddup_pd
   // CHECK: load double*
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_loaddup_pd(P);
 }
 
diff --git a/test/CodeGen/sse41-builtins.c b/test/CodeGen/sse41-builtins.c
index 9cd5c45..09d0a4b 100644
--- a/test/CodeGen/sse41-builtins.c
+++ b/test/CodeGen/sse41-builtins.c
@@ -1,11 +1,13 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+
 __m128i test_mm_blend_epi16(__m128i V1, __m128i V2) {
   // CHECK-LABEL: test_mm_blend_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
@@ -26,140 +28,154 @@
 
 __m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
   // CHECK-LABEL: test_mm_blendv_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb
+  // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_blendv_epi8(V1, V2, V3);
 }
 
 __m128d test_mm_blendv_pd(__m128d V1, __m128d V2, __m128d V3) {
   // CHECK-LABEL: test_mm_blendv_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.blendvpd
+  // CHECK: call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_blendv_pd(V1, V2, V3);
 }
 
 __m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
   // CHECK-LABEL: test_mm_blendv_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.blendvps
+  // CHECK: call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_blendv_ps(V1, V2, V3);
 }
 
 __m128d test_mm_ceil_pd(__m128d x) {
   // CHECK-LABEL: test_mm_ceil_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
   return _mm_ceil_pd(x);
 }
 
 __m128 test_mm_ceil_ps(__m128 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
   return _mm_ceil_ps(x);
 }
 
 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_ceil_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
   return _mm_ceil_sd(x, y);
 }
 
 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_ceil_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
   return _mm_ceil_ss(x, y);
 }
 
 __m128i test_mm_cmpeq_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpeq_epi64
   // CHECK: icmp eq <2 x i64>
+  // CHECK: sext <2 x i1> %{{.*}} to <2 x i64>
   return _mm_cmpeq_epi64(A, B);
 }
 
 __m128i test_mm_cvtepi8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi16
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
   return _mm_cvtepi8_epi16(a);
 }
 
 __m128i test_mm_cvtepi8_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi32
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: sext <4 x i8> {{.*}} to <4 x i32>
   return _mm_cvtepi8_epi32(a);
 }
 
 __m128i test_mm_cvtepi8_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi64
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: sext <2 x i8> {{.*}} to <2 x i64>
   return _mm_cvtepi8_epi64(a);
 }
 
 __m128i test_mm_cvtepi16_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi32
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
   return _mm_cvtepi16_epi32(a);
 }
 
 __m128i test_mm_cvtepi16_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi64
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: sext <2 x i16> {{.*}} to <2 x i64>
   return _mm_cvtepi16_epi64(a);
 }
 
 __m128i test_mm_cvtepi32_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi32_epi64
+  // CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: sext <2 x i32> {{.*}} to <2 x i64>
   return _mm_cvtepi32_epi64(a);
 }
 
 __m128i test_mm_cvtepu8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu8_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> {{.*}})
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: zext <8 x i8> {{.*}} to <8 x i16>
   return _mm_cvtepu8_epi16(a);
 }
 
 __m128i test_mm_cvtepu8_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu8_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> {{.*}})
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i8> {{.*}} to <4 x i32>
   return _mm_cvtepu8_epi32(a);
 }
 
 __m128i test_mm_cvtepu8_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu8_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> {{.*}})
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: zext <2 x i8> {{.*}} to <2 x i64>
   return _mm_cvtepu8_epi64(a);
 }
 
 __m128i test_mm_cvtepu16_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu16_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> {{.*}})
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i16> {{.*}} to <4 x i32>
   return _mm_cvtepu16_epi32(a);
 }
 
 __m128i test_mm_cvtepu16_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu16_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> {{.*}})
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: zext <2 x i16> {{.*}} to <2 x i64>
   return _mm_cvtepu16_epi64(a);
 }
 
 __m128i test_mm_cvtepu32_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu32_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> {{.*}})
+  // CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: zext <2 x i32> {{.*}} to <2 x i64>
   return _mm_cvtepu32_epi64(a);
 }
 
 __m128d test_mm_dp_pd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_dp_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.dppd
-  return _mm_dp_pd(x, y, 2);
+  // CHECK: call <2 x double> @llvm.x86.sse41.dppd(<2 x double> {{.*}}, <2 x double> {{.*}}, i8 7)
+  return _mm_dp_pd(x, y, 7);
 }
 
 __m128 test_mm_dp_ps(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_dp_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.dpps
-  return _mm_dp_ps(x, y, 2);
+  // CHECK: call <4 x float> @llvm.x86.sse41.dpps(<4 x float> {{.*}}, <4 x float> {{.*}}, i8 7)
+  return _mm_dp_ps(x, y, 7);
 }
 
 int test_mm_extract_epi8(__m128i x) {
   // CHECK-LABEL: test_mm_extract_epi8
-  // CHECK: extractelement <16 x i8> %{{.*}}, i32 0
-  return _mm_extract_epi8(x, 16);
+  // CHECK: extractelement <16 x i8> %{{.*}}, i32 1
+  // CHECK: zext i8 %{{.*}} to i32
+  return _mm_extract_epi8(x, 1);
 }
 
 int test_mm_extract_epi32(__m128i x) {
@@ -174,32 +190,33 @@
   return _mm_extract_epi64(x, 1);
 }
 
-//TODO
-//int test_mm_extract_ps(__m128i x) {
-//  return _mm_extract_ps(_mm_add_ps(x,x), 1);
-//}
+int test_mm_extract_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_extract_ps
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 1
+  return _mm_extract_ps(x, 1);
+}
 
 __m128d test_mm_floor_pd(__m128d x) {
   // CHECK-LABEL: test_mm_floor_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
   return _mm_floor_pd(x);
 }
 
 __m128 test_mm_floor_ps(__m128 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
   return _mm_floor_ps(x);
 }
 
 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_floor_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
   return _mm_floor_sd(x, y);
 }
 
 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_floor_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
   return _mm_floor_ss(x, y);
 }
 
@@ -223,73 +240,81 @@
 
 __m128 test_mm_insert_ps(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_insert_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.insertps
-  return _mm_insert_ps(x, y, 5);
+  // CHECK: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
+  return _mm_insert_ps(x, y, 4);
 }
 
 __m128i test_mm_max_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse41.pmaxsb
+  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_max_epi8(x, y);
 }
 
-__m128i test_mm_max_epu16(__m128i x, __m128i y) {
-  // CHECK-LABEL: test_mm_max_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pmaxuw
-  return _mm_max_epu16(x, y);
-}
-
 __m128i test_mm_max_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmaxsd
+  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_max_epi32(x, y);
 }
 
+__m128i test_mm_max_epu16(__m128i x, __m128i y) {
+  // CHECK-LABEL: test_mm_max_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  return _mm_max_epu16(x, y);
+}
+
 __m128i test_mm_max_epu32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epu32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmaxud
+  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_max_epu32(x, y);
 }
 
 __m128i test_mm_min_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse41.pminsb
+  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_min_epi8(x, y);
 }
 
-__m128i test_mm_min_epu16(__m128i x, __m128i y) {
-  // CHECK-LABEL: test_mm_min_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pminuw
-  return _mm_min_epu16(x, y);
-}
-
 __m128i test_mm_min_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pminsd
+  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_min_epi32(x, y);
 }
 
+__m128i test_mm_min_epu16(__m128i x, __m128i y) {
+  // CHECK-LABEL: test_mm_min_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  return _mm_min_epu16(x, y);
+}
+
 __m128i test_mm_min_epu32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epu32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pminud
+  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_min_epu32(x, y);
 }
 
 __m128i test_mm_minpos_epu16(__m128i x) {
   // CHECK-LABEL: test_mm_minpos_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.phminposuw
+  // CHECK: call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %{{.*}})
   return _mm_minpos_epu16(x);
 }
 
 __m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mpsadbw_epu8
-  // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw
+  // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 1)
   return _mm_mpsadbw_epu8(x, y, 1);
 }
 
 __m128i test_mm_mul_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mul_epi32
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq
+  // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_mul_epi32(x, y);
 }
 
@@ -301,72 +326,72 @@
 
 __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_packus_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw
+  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_packus_epi32(x, y);
 }
 
 __m128d test_mm_round_pd(__m128d x) {
   // CHECK-LABEL: test_mm_round_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd
-  return _mm_round_pd(x, 2);
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4)
+  return _mm_round_pd(x, 4);
 }
 
 __m128 test_mm_round_ps(__m128 x) {
   // CHECK-LABEL: test_mm_round_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
-  return _mm_round_ps(x, 2);
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
+  return _mm_round_ps(x, 4);
 }
 
 __m128d test_mm_round_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_round_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd
-  return _mm_round_sd(x, y, 2);
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4)
+  return _mm_round_sd(x, y, 4);
 }
 
 __m128 test_mm_round_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_round_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss
-  return _mm_round_ss(x, y, 2);
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4)
+  return _mm_round_ss(x, y, 4);
 }
 
 __m128i test_mm_stream_load_si128(__m128i const *a) {
   // CHECK-LABEL: test_mm_stream_load_si128
-  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa
+  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %{{.*}})
   return _mm_stream_load_si128(a);
 }
 
 int test_mm_test_all_ones(__m128i x) {
   // CHECK-LABEL: test_mm_test_all_ones
-  // CHECK: call i32 @llvm.x86.sse41.ptestc
+  // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_test_all_ones(x);
 }
 
 int test_mm_test_all_zeros(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_test_all_zeros
-  // CHECK: call i32 @llvm.x86.sse41.ptestz
+  // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_test_all_zeros(x, y);
 }
 
 int test_mm_test_mix_ones_zeros(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_test_mix_ones_zeros
-  // CHECK: call i32 @llvm.x86.sse41.ptestnzc
+  // CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_test_mix_ones_zeros(x, y);
 }
 
 int test_mm_testc_si128(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_testc_si128
-  // CHECK: call i32 @llvm.x86.sse41.ptestc
+  // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testc_si128(x, y);
 }
 
 int test_mm_testnzc_si128(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_testnzc_si128
-  // CHECK: call i32 @llvm.x86.sse41.ptestnzc
+  // CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testnzc_si128(x, y);
 }
 
 int test_mm_testz_si128(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_testz_si128
-  // CHECK: call i32 @llvm.x86.sse41.ptestz
+  // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testz_si128(x, y);
 }
diff --git a/test/CodeGen/sse42-builtins.c b/test/CodeGen/sse42-builtins.c
index e3215dd..7a76293 100644
--- a/test/CodeGen/sse42-builtins.c
+++ b/test/CodeGen/sse42-builtins.c
@@ -1,27 +1,53 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -emit-llvm -o - -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
-__m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi8
-  // CHECK: icmp sgt <16 x i8>
-  return _mm_cmpgt_epi8(A, B);
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
+
+int test_mm_cmpestra(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestra
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestra(A, LA, B, LB, 7);
 }
 
-__m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi16
-  // CHECK: icmp sgt <8 x i16>
-  return _mm_cmpgt_epi16(A, B);
+int test_mm_cmpestrc(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestrc
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestrc(A, LA, B, LB, 7);
 }
 
-__m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi32
-  // CHECK: icmp sgt <4 x i32>
-  return _mm_cmpgt_epi32(A, B);
+int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestri
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestri(A, LA, B, LB, 7);
+}
+
+__m128i test_mm_cmpestrm(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestrm
+  // CHECK: call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestrm(A, LA, B, LB, 7);
+}
+
+int test_mm_cmpestro(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestro
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestro(A, LA, B, LB, 7);
+}
+
+int test_mm_cmpestrs(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestrs
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestrs(A, LA, B, LB, 7);
+}
+
+int test_mm_cmpestrz(__m128i A, int LA, __m128i B, int LB) {
+  // CHECK-LABEL: test_mm_cmpestrz
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
+  return _mm_cmpestrz(A, LA, B, LB, 7);
 }
 
 __m128i test_mm_cmpgt_epi64(__m128i A, __m128i B) {
@@ -30,110 +56,68 @@
   return _mm_cmpgt_epi64(A, B);
 }
 
-int test_mm_cmpestra(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestra
-  // CHECK: @llvm.x86.sse42.pcmpestria128
-  return _mm_cmpestra(A, LA, B, LB, 7);
-}
-
-int test_mm_cmpestrc(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestrc
-  // CHECK: @llvm.x86.sse42.pcmpestric128
-  return _mm_cmpestrc(A, LA, B, LB, 7);
-}
-
-int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestri
-  // CHECK: @llvm.x86.sse42.pcmpestri128
-  return _mm_cmpestri(A, LA, B, LB, 7);
-}
-
-__m128i test_mm_cmpestrm(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestrm
-  // CHECK: @llvm.x86.sse42.pcmpestrm128
-  return _mm_cmpestrm(A, LA, B, LB, 7);
-}
-
-int test_mm_cmpestro(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestro
-  // CHECK: @llvm.x86.sse42.pcmpestrio128
-  return _mm_cmpestro(A, LA, B, LB, 7);
-}
-
-int test_mm_cmpestrs(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestrs
-  // CHECK: @llvm.x86.sse42.pcmpestris128
-  return _mm_cmpestrs(A, LA, B, LB, 7);
-}
-
-int test_mm_cmpestrz(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK-LABEL: test_mm_cmpestrz
-  // CHECK: @llvm.x86.sse42.pcmpestriz128
-  return _mm_cmpestrz(A, LA, B, LB, 7);
-}
-
 int test_mm_cmpistra(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistra
-  // CHECK: @llvm.x86.sse42.pcmpistria128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistra(A, B, 7);
 }
 
 int test_mm_cmpistrc(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrc
-  // CHECK: @llvm.x86.sse42.pcmpistric128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrc(A, B, 7);
 }
 
 int test_mm_cmpistri(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistri
-  // CHECK: @llvm.x86.sse42.pcmpistri128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistri(A, B, 7);
 }
 
 __m128i test_mm_cmpistrm(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrm
-  // CHECK: @llvm.x86.sse42.pcmpistrm128
+  // CHECK: call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrm(A, B, 7);
 }
 
 int test_mm_cmpistro(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistro
-  // CHECK: @llvm.x86.sse42.pcmpistrio128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistro(A, B, 7);
 }
 
 int test_mm_cmpistrs(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrs
-  // CHECK: @llvm.x86.sse42.pcmpistris128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrs(A, B, 7);
 }
 
 int test_mm_cmpistrz(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrz
-  // CHECK: @llvm.x86.sse42.pcmpistriz128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrz(A, B, 7);
 }
 
 unsigned int test_mm_crc32_u8(unsigned int CRC, unsigned char V) {
   // CHECK-LABEL: test_mm_crc32_u8
-  // CHECK: call i32 @llvm.x86.sse42.crc32.32.8
+  // CHECK: call i32 @llvm.x86.sse42.crc32.32.8(i32 %{{.*}}, i8 %{{.*}})
   return _mm_crc32_u8(CRC, V);
 }
 
 unsigned int test_mm_crc32_u16(unsigned int CRC, unsigned short V) {
   // CHECK-LABEL: test_mm_crc32_u16
-  // CHECK: call i32 @llvm.x86.sse42.crc32.32.16
+  // CHECK: call i32 @llvm.x86.sse42.crc32.32.16(i32 %{{.*}}, i16 %{{.*}})
   return _mm_crc32_u16(CRC, V);
 }
 
 unsigned int test_mm_crc32_u32(unsigned int CRC, unsigned int V) {
   // CHECK-LABEL: test_mm_crc32_u32
-  // CHECK: call i32 @llvm.x86.sse42.crc32.32.32
+  // CHECK: call i32 @llvm.x86.sse42.crc32.32.32(i32 %{{.*}}, i32 %{{.*}})
   return _mm_crc32_u32(CRC, V);
 }
 
-unsigned int test_mm_crc32_u64(unsigned long long CRC, unsigned long long V) {
+unsigned long long test_mm_crc32_u64(unsigned long long CRC, unsigned long long V) {
   // CHECK-LABEL: test_mm_crc32_u64
-  // CHECK: call i64 @llvm.x86.sse42.crc32.64.64
+  // CHECK: call i64 @llvm.x86.sse42.crc32.64.64(i64 %{{.*}}, i64 %{{.*}})
   return _mm_crc32_u64(CRC, V);
 }
diff --git a/test/CodeGen/sse4a-builtins.c b/test/CodeGen/sse4a-builtins.c
index 9a408b8..9367227 100644
--- a/test/CodeGen/sse4a-builtins.c
+++ b/test/CodeGen/sse4a-builtins.c
@@ -1,10 +1,12 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4a -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse4a -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
+
 __m128i test_mm_extracti_si64(__m128i x) {
   // CHECK-LABEL: test_mm_extracti_si64
   // CHECK: call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %{{[^,]+}}, i8 3, i8 2)
@@ -31,12 +33,14 @@
 
 void test_mm_stream_sd(double *p, __m128d a) {
   // CHECK-LABEL: test_mm_stream_sd
-  // CHECK: call void @llvm.x86.sse4a.movnt.sd(i8* %{{[^,]+}}, <2 x double> %{{[^,]+}})
-  _mm_stream_sd(p, a);
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1, !nontemporal
+   _mm_stream_sd(p, a);
 }
 
 void test_mm_stream_ss(float *p, __m128 a) {
   // CHECK-LABEL: test_mm_stream_ss
-  // CHECK: call void @llvm.x86.sse4a.movnt.ss(i8* %{{[^,]+}}, <4 x float> %{{[^,]+}})
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: store float %{{.*}}, float* %{{.*}}, align 1, !nontemporal
   _mm_stream_ss(p, a);
 }
diff --git a/test/CodeGen/ssse3-builtins.c b/test/CodeGen/ssse3-builtins.c
index d4b27a1..e6c2053 100644
--- a/test/CodeGen/ssse3-builtins.c
+++ b/test/CodeGen/ssse3-builtins.c
@@ -1,25 +1,27 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
+
 __m128i test_mm_abs_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_abs_epi8
-  // CHECK: call <16 x i8> @llvm.x86.ssse3.pabs.b.128
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %{{.*}})
   return _mm_abs_epi8(a);
 }
 
 __m128i test_mm_abs_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_abs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.pabs.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %{{.*}})
   return _mm_abs_epi16(a);
 }
 
 __m128i test_mm_abs_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_abs_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.pabs.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %{{.*}})
   return _mm_abs_epi32(a);
 }
 
@@ -37,72 +39,72 @@
 
 __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadd_epi16(a, b);
 }
 
 __m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hadd_epi32(a, b);
 }
 
 __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadds_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadds_epi16(a, b);
 }
 
 __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsub_epi16(a, b);
 }
 
 __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsubs_epi16(a, b);
 }
 
 __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_maddubs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_maddubs_epi16(a, b);
 }
 
 __m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_mulhrs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_mulhrs_epi16(a, b);
 }
 
 __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shuffle_epi8
-  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_shuffle_epi8(a, b);
 }
 
 __m128i test_mm_sign_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi8
-  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_sign_epi8(a, b);
 }
 
 __m128i test_mm_sign_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sign_epi16(a, b);
 }
 
 __m128i test_mm_sign_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sign_epi32(a, b);
 }
diff --git a/test/CodeGen/stack-protector.c b/test/CodeGen/stack-protector.c
index ecfbc90..7a45a2f 100644
--- a/test/CodeGen/stack-protector.c
+++ b/test/CodeGen/stack-protector.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 0 | FileCheck -check-prefix=NOSSP %s
-// NOSSP: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 1 | FileCheck -check-prefix=WITHSSP %s
-// WITHSSP: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 2 | FileCheck -check-prefix=SSPSTRONG %s
-// SSPSTRONG: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 3 | FileCheck -check-prefix=SSPREQ %s
-// SSPREQ: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack | FileCheck -check-prefix=SAFESTACK %s
-// SAFESTACK: define {{.*}}void @test1(i8* %msg) #0 {
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 0 | FileCheck -check-prefix=DEF -check-prefix=NOSSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 1 | FileCheck -check-prefix=DEF -check-prefix=SSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 2 | FileCheck -check-prefix=DEF -check-prefix=SSPSTRONG %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 3 | FileCheck -check-prefix=DEF -check-prefix=SSPREQ %s
+
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-NOSSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 0 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-NOSSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 1 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-SSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 2 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-SSPSTRONG %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 3 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-SSPREQ %s
 
 typedef __SIZE_TYPE__ size_t;
 
@@ -15,18 +15,21 @@
 size_t strlen(const char *s);
 char *strcpy(char *s1, const char *s2);
 
+// DEF: define {{.*}}void @test1(i8* %msg) #[[A:.*]] {
 void test1(const char *msg) {
   char a[strlen(msg) + 1];
   strcpy(a, msg);
   printf("%s\n", a);
 }
 
-// NOSSP: attributes #{{.*}} = { nounwind{{.*}} }
+// NOSSP-NOT: attributes #[[A]] = {{.*}} ssp
+// SSP: attributes #[[A]] = {{.*}} ssp{{ }}
+// SSPSTRONG: attributes #[[A]] = {{.*}} sspstrong
+// SSPREQ: attributes #[[A]] = {{.*}} sspreq
 
-// WITHSSP: attributes #{{.*}} = { nounwind ssp{{.*}} }
+// SAFESTACK-NOSSP: attributes #[[A]] = {{.*}} safestack
+// SAFESTACK-NOSSP-NOT: ssp
 
-// SSPSTRONG: attributes #{{.*}} = { nounwind sspstrong{{.*}} }
-
-// SSPREQ: attributes #{{.*}} = { nounwind sspreq{{.*}} }
-
-// SAFESTACK: attributes #{{.*}} = { nounwind safestack{{.*}} }
+// SAFESTACK-SSP: attributes #[[A]] = {{.*}} safestack ssp{{ }}
+// SAFESTACK-SSPSTRONG: attributes #[[A]] = {{.*}} safestack sspstrong
+// SAFESTACK-SSPREQ: attributes #[[A]] = {{.*}} safestack sspreq
diff --git a/test/CodeGen/struct-union-BE.c b/test/CodeGen/struct-union-BE.c
new file mode 100644
index 0000000..69ab1e8
--- /dev/null
+++ b/test/CodeGen/struct-union-BE.c
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -triple mips-linux-gnu  -S -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS
+// RUN: %clang_cc1 -triple mips64-linux-gnu  -S -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64
+// RUN: %clang_cc1 -triple armebv7-linux-gnueabihf -S -emit-llvm %s -o - | FileCheck %s -check-prefix=ARM
+
+#include <stdarg.h>
+
+extern void abort() __attribute__((noreturn));
+
+struct tiny {
+  char c;
+};
+
+union data {
+  char c;
+};
+
+void fstr(int n, ...) {
+  struct tiny x;
+  va_list ap;
+  va_start (ap,n);
+  x = va_arg (ap, struct tiny);
+  if (x.c !=  10)
+    abort();
+  va_end (ap);
+// MIPS-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+// MIPS64-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i64 7
+// ARM-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+}
+
+void funi(int n, ...) {
+  union data x;
+  va_list ap;
+  va_start (ap,n);
+  x = va_arg (ap, union data);
+  if (x.c !=  10)
+    abort();
+  va_end (ap);
+// MIPS-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+// MIPS64-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i64 7
+// ARM-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+}
+
+void foo() {
+  struct tiny x[3];
+  union data y;
+  x[0].c = 10;
+  fstr(1, x[0]);
+  funi(1, y);
+}
diff --git a/test/CodeGen/target-builtin-error-2.c b/test/CodeGen/target-builtin-error-2.c
index 949f2cc..2e2691a 100644
--- a/test/CodeGen/target-builtin-error-2.c
+++ b/test/CodeGen/target-builtin-error-2.c
@@ -5,9 +5,9 @@
 
 // Since we do code generation on a function level this needs to error out since
 // the subtarget feature won't be available.
-__m256d wombat(__m128i a) {
+__m128 wombat(__m128i a) {
   if (__builtin_cpu_supports("avx"))
-    return __builtin_ia32_cvtdq2pd256((__v4si)a); // expected-error {{'__builtin_ia32_cvtdq2pd256' needs target feature avx}}
+    return __builtin_ia32_vpermilvarps((__v4sf) {0.0f, 1.0f, 2.0f, 3.0f}, (__v4si)a); // expected-error {{'__builtin_ia32_vpermilvarps' needs target feature avx}}
   else
-    return (__m256d){0, 0, 0, 0};
+    return (__m128){0, 0};
 }
diff --git a/test/CodeGen/target-builtin-noerror.c b/test/CodeGen/target-builtin-noerror.c
index 7d86b96..2a7d69f 100644
--- a/test/CodeGen/target-builtin-noerror.c
+++ b/test/CodeGen/target-builtin-noerror.c
@@ -42,3 +42,34 @@
 __m128 __attribute__((target("fma,fma4"))) fma_3(__m128 a, __m128 b, __m128 c) {
   return __builtin_ia32_vfmaddps(a, b, c);
 }
+
+void verifyfeaturestrings() {
+  (void)__builtin_cpu_supports("cmov");
+  (void)__builtin_cpu_supports("mmx");
+  (void)__builtin_cpu_supports("popcnt");
+  (void)__builtin_cpu_supports("sse");
+  (void)__builtin_cpu_supports("sse2");
+  (void)__builtin_cpu_supports("sse3");
+  (void)__builtin_cpu_supports("ssse3");
+  (void)__builtin_cpu_supports("sse4.1");
+  (void)__builtin_cpu_supports("sse4.2");
+  (void)__builtin_cpu_supports("avx");
+  (void)__builtin_cpu_supports("avx2");
+  (void)__builtin_cpu_supports("sse4a");
+  (void)__builtin_cpu_supports("fma4");
+  (void)__builtin_cpu_supports("xop");
+  (void)__builtin_cpu_supports("fma");
+  (void)__builtin_cpu_supports("avx512f");
+  (void)__builtin_cpu_supports("bmi");
+  (void)__builtin_cpu_supports("bmi2");
+  (void)__builtin_cpu_supports("aes");
+  (void)__builtin_cpu_supports("pclmul");
+  (void)__builtin_cpu_supports("avx512vl");
+  (void)__builtin_cpu_supports("avx512bw");
+  (void)__builtin_cpu_supports("avx512dq");
+  (void)__builtin_cpu_supports("avx512cd");
+  (void)__builtin_cpu_supports("avx512er");
+  (void)__builtin_cpu_supports("avx512pf");
+  (void)__builtin_cpu_supports("avx512vbmi");
+  (void)__builtin_cpu_supports("avx512ifma");
+}
diff --git a/test/CodeGen/target-data.c b/test/CodeGen/target-data.c
index 2ed7f09..c12ad2c 100644
--- a/test/CodeGen/target-data.c
+++ b/test/CodeGen/target-data.c
@@ -40,19 +40,19 @@
 
 // RUN: %clang_cc1 -triple mips64el-linux-gnu -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=MIPS-64EL
-// MIPS-64EL: target datalayout = "e-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128"
+// MIPS-64EL: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-n32:64-S128"
 
 // RUN: %clang_cc1 -triple mips64el-linux-gnu -o - -emit-llvm -target-abi n32 \
 // RUN: %s | FileCheck %s -check-prefix=MIPS-64EL-N32
-// MIPS-64EL-N32: target datalayout = "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128"
+// MIPS-64EL-N32: target datalayout = "e-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128"
 
 // RUN: %clang_cc1 -triple mips64-linux-gnu -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=MIPS-64EB
-// MIPS-64EB: target datalayout = "E-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128"
+// MIPS-64EB: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-n32:64-S128"
 
 // RUN: %clang_cc1 -triple mips64-linux-gnu -o - -emit-llvm %s -target-abi n32 \
 // RUN: | FileCheck %s -check-prefix=MIPS-64EB-N32
-// MIPS-64EB-N32: target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128"
+// MIPS-64EB-N32: target datalayout = "E-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128"
 
 // RUN: %clang_cc1 -triple powerpc64-lv2 -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=PS3
@@ -86,6 +86,10 @@
 // RUN: FileCheck %s -check-prefix=WEBASSEMBLY64
 // WEBASSEMBLY64: target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
 
+// RUN: %clang_cc1 -triple lanai-unknown-unknown -o - -emit-llvm %s | \
+// RUN: FileCheck %s -check-prefix=LANAI
+// LANAI: target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+
 // RUN: %clang_cc1 -triple powerpc-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=PPC
 // PPC: target datalayout = "E-m:e-p:32:32-i64:64-n32"
@@ -128,16 +132,16 @@
 
 // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600SI: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
 // Test default -target-cpu
 // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600SIDefault: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
-// AARCH64: target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+// AARCH64: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 // RUN: %clang_cc1 -triple thumb-unknown-gnueabi -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=THUMB
@@ -157,7 +161,7 @@
 
 // RUN: %clang_cc1 -triple hexagon-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=HEXAGON
-// HEXAGON: target datalayout = "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32"
+// HEXAGON: target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
 
 // RUN: %clang_cc1 -triple s390x-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=SYSTEMZ
diff --git a/test/CodeGen/target-features-error-2.c b/test/CodeGen/target-features-error-2.c
index c23d152..683d9ab 100644
--- a/test/CodeGen/target-features-error-2.c
+++ b/test/CodeGen/target-features-error-2.c
@@ -1,7 +1,38 @@
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o -
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_SSE42
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_1
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_2
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_3
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_4
+
 #define __MM_MALLOC_H
 #include <x86intrin.h>
 
+#if NEED_SSE42
 int baz(__m256i a) {
   return _mm256_extract_epi32(a, 3); // expected-error {{always_inline function '_mm256_extract_epi32' requires target feature 'sse4.2', but would be inlined into function 'baz' that is compiled without support for 'sse4.2'}}
 }
+#endif
+
+#if NEED_AVX_1
+__m128 need_avx(__m128 a, __m128 b) {
+  return _mm_cmp_ps(a, b, 0); // expected-error {{'__builtin_ia32_cmpps' needs target feature avx}}
+}
+#endif
+
+#if NEED_AVX_2
+__m128 need_avx(__m128 a, __m128 b) {
+  return _mm_cmp_ss(a, b, 0); // expected-error {{'__builtin_ia32_cmpss' needs target feature avx}}
+}
+#endif
+
+#if NEED_AVX_3
+__m128d need_avx(__m128d a, __m128d b) {
+  return _mm_cmp_pd(a, b, 0); // expected-error {{'__builtin_ia32_cmppd' needs target feature avx}}
+}
+#endif
+
+#if NEED_AVX_4
+__m128d need_avx(__m128d a, __m128d b) {
+  return _mm_cmp_sd(a, b, 0); // expected-error {{'__builtin_ia32_cmpsd' needs target feature avx}}
+}
+#endif
diff --git a/test/CodeGen/tbaa.cpp b/test/CodeGen/tbaa.cpp
index f98c46f..432c41e 100644
--- a/test/CodeGen/tbaa.cpp
+++ b/test/CodeGen/tbaa.cpp
@@ -1,6 +1,10 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
 // Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
 
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
diff --git a/test/CodeGen/tbm-builtins.c b/test/CodeGen/tbm-builtins.c
index 29e147a..c8a9382 100644
--- a/test/CodeGen/tbm-builtins.c
+++ b/test/CodeGen/tbm-builtins.c
@@ -8,46 +8,56 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
+
 unsigned int test__bextri_u32(unsigned int a) {
-  // CHECK: call i32 @llvm.x86.tbm.bextri.u32
+  // CHECK-LABEL: test__bextri_u32
+  // CHECK: call i32 @llvm.x86.tbm.bextri.u32(i32 %{{.*}}, i32 1)
   return __bextri_u32(a, 1);
 }
 
 unsigned long long test__bextri_u64(unsigned long long a) {
-  // CHECK: call i64 @llvm.x86.tbm.bextri.u64
+  // CHECK-LABEL: test__bextri_u64
+  // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 2)
   return __bextri_u64(a, 2);
 }
 
 unsigned long long test__bextri_u64_bigint(unsigned long long a) {
-  // CHECK: call i64 @llvm.x86.tbm.bextri.u64
+  // CHECK-LABEL: test__bextri_u64_bigint
+  // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 549755813887)
   return __bextri_u64(a, 0x7fffffffffLL);
 }
 
 unsigned int test__blcfill_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcfill_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: %{{.*}} = and i32 [[TMP]], [[SRC]]
   return __blcfill_u32(a);
 }
 
 unsigned long long test__blcfill_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcfill_u64
   // CHECK: [[TMPT:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: %{{.*}} = and i64 [[TMP]], [[SRC]]
   return __blcfill_u64(a);
 }
 
 unsigned int test__blci_u32(unsigned int a) {
+  // CHECK-LABEL: test__blci_u32
   // CHECK: [[TMP:%.*]] = sub i32 -2, [[SRC:%.*]]
   // CHECK-NEXT: %{{.*}} = or i32 [[TMP]], [[SRC]]
   return __blci_u32(a);
 }
 
 unsigned long long test__blci_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blci_u64
   // CHECK: [[TMP:%.*]] = sub i64 -2, [[SRC:%.*]]
   // CHECK-NEXT: %{{.*}} = or i64 [[TMP]], [[SRC]]
   return __blci_u64(a);
 }
 
 unsigned int test__blcic_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcic_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC]], 1
   // CHECK-NEXT: {{.*}} = and i32 [[TMP2]], [[TMP1]]
@@ -55,6 +65,7 @@
 }
 
 unsigned long long test__blcic_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcic_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC]], 1
   // CHECK-NEXT: {{.*}} = and i64 [[TMP2]], [[TMP1]]
@@ -62,42 +73,49 @@
 }
 
 unsigned int test__blcmsk_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcmsk_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = xor i32 [[TMP]], [[SRC]]
   return __blcmsk_u32(a);
 }
 
 unsigned long long test__blcmsk_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcmsk_u64
   // CHECK: [[TMP:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = xor i64 [[TMP]], [[SRC]]
   return __blcmsk_u64(a);
 }
 
 unsigned int test__blcs_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcs_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP]], [[SRC]]
   return __blcs_u32(a);
 }
 
 unsigned long long test__blcs_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcs_u64
   // CHECK: [[TMP:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP]], [[SRC]]
   return __blcs_u64(a);
 }
 
 unsigned int test__blsfill_u32(unsigned int a) {
+  // CHECK-LABEL: test__blsfill_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP]], [[SRC]]
   return __blsfill_u32(a);
 }
 
 unsigned long long test__blsfill_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blsfill_u64
   // CHECK: [[TMP:%.*]] = add i64 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP]], [[SRC]]
   return __blsfill_u64(a);
 }
 
 unsigned int test__blsic_u32(unsigned int a) {
+  // CHECK-LABEL: test__blsic_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP2]], [[TMP1]]
@@ -105,6 +123,7 @@
 }
 
 unsigned long long test__blsic_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blsic_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP2]], [[TMP1]]
@@ -112,6 +131,7 @@
 }
 
 unsigned int test__t1mskc_u32(unsigned int a) {
+  // CHECK-LABEL: test__t1mskc_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP2]], [[TMP1]]
@@ -119,6 +139,7 @@
 }
 
 unsigned long long test__t1mskc_u64(unsigned long long a) {
+  // CHECK-LABEL: test__t1mskc_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP2]], [[TMP1]]
@@ -126,6 +147,7 @@
 }
 
 unsigned int test__tzmsk_u32(unsigned int a) {
+  // CHECK-LABEL: test__tzmsk_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = and i32 [[TMP2]], [[TMP1]]
@@ -133,6 +155,7 @@
 }
 
 unsigned long long test__tzmsk_u64(unsigned long long a) {
+  // CHECK-LABEL: test__tzmsk_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = and i64 [[TMP2]], [[TMP1]]
diff --git a/test/CodeGen/temporary-lifetime-exceptions.cpp b/test/CodeGen/temporary-lifetime-exceptions.cpp
new file mode 100644
index 0000000..17e2168
--- /dev/null
+++ b/test/CodeGen/temporary-lifetime-exceptions.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 %s -fexceptions -fcxx-exceptions -std=c++11 -O1 -triple x86_64 -emit-llvm -o - | FileCheck %s
+
+// lifetime.end should be invoked even if the destructor doesn't run due to an
+// exception thrown from a previous ctor call.
+
+struct A { A(); ~A(); };
+A Baz(const A&);
+
+void Test1() {
+  // CHECK-LABEL: @_Z5Test1v(
+  // CHECK: getelementptr
+  // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* [[TMP:[^ ]+]])
+  // CHECK-NEXT: getelementptr
+  // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* [[TMP1:[^ ]+]])
+
+  // Normal exit
+  // CHECK: call void @llvm.lifetime.end(i64 1, i8* [[TMP1]])
+  // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* [[TMP]])
+
+  // Exception exit
+  // CHECK: call void @llvm.lifetime.end(i64 1, i8* [[TMP1]])
+  // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* [[TMP]])
+  Baz(Baz(A()));
+}
diff --git a/test/CodeGen/temporary-lifetime.cpp b/test/CodeGen/temporary-lifetime.cpp
new file mode 100644
index 0000000..f6dd3e0
--- /dev/null
+++ b/test/CodeGen/temporary-lifetime.cpp
@@ -0,0 +1,168 @@
+// RUN: %clang_cc1 %s -std=c++11 -O1 -DWITH_DTOR -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-DTOR %s
+// RUN: %clang_cc1 %s -std=c++11 -O1 -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-NO-DTOR %s
+
+struct A {
+  A();
+#ifdef WITH_DTOR
+  ~A();
+#endif
+  char a[1024];
+  operator bool() const;
+};
+
+template <typename T>
+void Foo(T &&);
+
+template <typename T>
+void Bar(T &&);
+
+template <typename T>
+T Baz();
+
+void Test1() {
+  // CHECK-DTOR-LABEL: Test1
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: }
+
+  // CHECK-NO-DTOR-LABEL: Test1
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-NO-DTOR: }
+  {
+    const A &a = A{};
+    Foo(a);
+  }
+  {
+    const A &a = A{};
+    Foo(a);
+  }
+}
+
+void Test2() {
+  // CHECK-DTOR-LABEL: Test2
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR1:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR1:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR2:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR2:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR2]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR2]])
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR1]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR1]])
+  // CHECK-DTOR: }
+
+  // CHECK-NO-DTOR-LABEL: Test2
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR1:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR1:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR2:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR2:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR2]])
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR1]])
+  // CHECK-NO-DTOR: }
+  const A &a = A{};
+  Foo(a);
+  const A &b = A{};
+  Foo(b);
+}
+
+void Test3() {
+  // CHECK-DTOR-LABEL: Test3
+  // CHECK-DTOR: call void @llvm.lifetime.start
+  // CHECK-DTOR: call void @llvm.lifetime.start
+
+  // if.then:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+
+  // cleanup:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+
+  // cleanup:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+  // CHECK-DTOR: }
+  const A &a = A{};
+  if (const A &b = A(a)) {
+    Foo(b);
+    return;
+  }
+  Bar(a);
+}
+
+void Test4() {
+  // CHECK-DTOR-LABEL: Test4
+  // CHECK-DTOR: call void @llvm.lifetime.start
+
+  // for.cond.cleanup:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+
+  // for.body:
+  // CHECK-DTOR: }
+  for (const A &a = A{}; a;) {
+    Foo(a);
+  }
+}
+
+int Test5() {
+  // CHECK-DTOR-LABEL: Test5
+  // CHECK-DTOR: call void @llvm.lifetime.start
+  // CHECK-DTOR: call i32 @_Z3BazIiET_v()
+  // CHECK-DTOR: store
+  // CHECK-DTOR: call void @_Z3FooIRKiEvOT_
+  // CHECK-DTOR: load
+  // CHECK-DTOR: call void @llvm.lifetime.end
+  // CHECK-DTOR: }
+  const int &a = Baz<int>();
+  Foo(a);
+  return a;
+}
+
+void Test6() {
+  // CHECK-DTOR-LABEL: Test6
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call i32 @_Z3BazIiET_v()
+  // CHECK-DTOR: store
+  // CHECK-DTOR: call void @_Z3FooIiEvOT_
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* %[[ADDR]])
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call i32 @_Z3BazIiET_v()
+  // CHECK-DTOR: store
+  // CHECK-DTOR: call void @_Z3FooIiEvOT_
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* %[[ADDR]])
+  // CHECK-DTOR: }
+  Foo(Baz<int>());
+  Foo(Baz<int>());
+}
+
+void Test7() {
+  // CHECK-DTOR-LABEL: Test7
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_Z3BazI1AET_v({{.*}} %[[SLOT:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooI1AEvOT_({{.*}} %[[SLOT]])
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[SLOT]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_Z3BazI1AET_v({{.*}} %[[SLOT:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooI1AEvOT_({{.*}} %[[SLOT]])
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[SLOT]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: }
+  Foo(Baz<A>());
+  Foo(Baz<A>());
+}
diff --git a/test/CodeGen/thinlto_backend.c b/test/CodeGen/thinlto_backend.c
deleted file mode 100644
index a2737fb..0000000
--- a/test/CodeGen/thinlto_backend.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang -O2 %s -flto=thin -c -o %t.o
-// RUN: llvm-lto -thinlto -o %t %t.o
-
-// Ensure clang -cc1 give expected error for incorrect input type
-// RUN: not %clang_cc1 -O2 -o %t1.o %s -c -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-WARNING
-// CHECK-WARNING: error: invalid argument '-fthinlto-index={{.*}}' only allowed with '-x ir'
-
-// Ensure we get expected error for missing index file
-// RUN: %clang -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=bad.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
-// CHECK-ERROR: Error loading index file 'bad.thinlto.bc'
-
-// Ensure Function Importing pass added
-// RUN: %clang -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=%t.thinlto.bc -mllvm -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=CHECK-PASS
-// CHECK-PASS: Function Importing
diff --git a/test/CodeGen/thinlto_backend.ll b/test/CodeGen/thinlto_backend.ll
new file mode 100644
index 0000000..0fb2643
--- /dev/null
+++ b/test/CodeGen/thinlto_backend.ll
@@ -0,0 +1,29 @@
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary -o %t1.o %s
+; RUN: opt -module-summary -o %t2.o %S/Inputs/thinlto_backend.ll
+; RUN: llvm-lto -thinlto -o %t %t1.o %t2.o
+
+; Ensure clang -cc1 gives the expected error for an incorrect input type
+; RUN: not %clang_cc1 -O2 -o %t1.o -x c %s -c -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-WARNING
+; CHECK-WARNING: error: invalid argument '-fthinlto-index={{.*}}' only allowed with '-x ir'
+
+; Ensure we get the expected error for a missing index file
+; RUN: %clang -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=bad.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
+; CHECK-ERROR: Error loading index file 'bad.thinlto.bc'
+
+; Ensure f2 was imported
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc
+; RUN: llvm-nm %t3.o | FileCheck --check-prefix=CHECK-OBJ %s
+; CHECK-OBJ: T f1
+; CHECK-OBJ-NOT: U f2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f2()
+
+define void @f1() {
+  call void @f2()
+  ret void
+}
diff --git a/test/CodeGen/ubsan-strip-path-components.cpp b/test/CodeGen/ubsan-strip-path-components.cpp
new file mode 100644
index 0000000..7a95324
--- /dev/null
+++ b/test/CodeGen/ubsan-strip-path-components.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - | FileCheck %s -check-prefix=REGULAR -check-prefix=CHECK
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=0 | FileCheck %s -check-prefix=REGULAR -check-prefix=CHECK
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=2 | FileCheck %s -check-prefix=REMOVE-FIRST-TWO -check-prefix=CHECK
+
+// Try to strip too much:
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=-99999 | FileCheck %s -check-prefix=REGULAR
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=99999 | FileCheck %s -check-prefix=LAST-ONLY
+
+// Check stripping from the file name
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=-2 | FileCheck %s -check-prefix=LAST-TWO
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=-1 | FileCheck %s -check-prefix=LAST-ONLY
+
+// REGULAR: @[[SRC:[0-9.a-zA-Z_]+]] =      private unnamed_addr constant [{{.*}} x i8] c"{{.*test(.|\\5C)CodeGen(.|\\5C)ubsan-strip-path-components\.cpp}}\00", align 1
+
+// First path component: "/" or "$drive_letter:", then a name, or '\5C' on Windows
+// REMOVE-FIRST-TWO: @[[STR:[0-9.a-zA-Z_]+]] = private unnamed_addr constant [{{.*}} x i8] c"{{(.:|/)([^\\/]*(/|\\5C))}}[[REST:.*ubsan-strip-path-components\.cpp]]\00", align 1
+// REMOVE-FIRST-TWO: @[[SRC:[0-9.a-zA-Z_]+]] = private unnamed_addr constant [{{.*}} x i8] c"[[REST]]\00", align 1
+
+// LAST-TWO: @[[SRC:[0-9.a-zA-Z_]+]] =     private unnamed_addr constant [{{.*}} x i8] c"CodeGen{{/|\\5C}}ubsan-strip-path-components.cpp\00", align 1
+// LAST-ONLY: @[[SRC:[0-9.a-zA-Z_]+]] =    private unnamed_addr constant [{{.*}} x i8] c"ubsan-strip-path-components.cpp\00", align 1
+
+// CHECK: @[[STATIC_DATA:[0-9.a-zA-Z_]+]] = private unnamed_addr global { { [{{.*}} x i8]*, i32, i32 } } { { [{{.*}} x i8]*, i32, i32 } { [{{.*}} x i8]* @[[SRC]], i32 [[@LINE+6]], i32 3 } }
+void g(const char *);
+void f() {
+  // CHECK-LABEL: @_Z1fv(
+  g(__FILE__);
+  // CHECK: call void @__ubsan_handle_builtin_unreachable(i8* bitcast ({ { [{{.*}} x i8]*, i32, i32 } }* @[[STATIC_DATA]] to i8*)) {{.*}}, !nosanitize
+  __builtin_unreachable();
+}
diff --git a/test/CodeGen/vector.c b/test/CodeGen/vector.c
index 8e820f2..14f5079 100644
--- a/test/CodeGen/vector.c
+++ b/test/CodeGen/vector.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-apple-darwin9 -O1 -target-cpu pentium4 -target-feature +sse4.1 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i386-apple-darwin9 -O1 -target-cpu core2 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
 typedef short __v4hi __attribute__ ((__vector_size__ (8)));
 
 void test1() {
@@ -62,3 +62,23 @@
   extern __typeof(_mm_extract_epi16(_mm_setzero_si128(), 3)) check_result_int;
   extern __typeof(_mm_extract_epi32(_mm_setzero_si128(), 3)) check_result_int;
 }
+
+// Test some logic around our lax vector comparison rules with integers.
+
+typedef int vec_int1 __attribute__((vector_size(4)));
+vec_int1 lax_vector_compare1(int x, vec_int1 y) {
+  y = x == y;
+  return y;
+}
+
+// CHECK: define i32 @lax_vector_compare1(i32 {{.*}}, i32 {{.*}})
+// CHECK: icmp eq <1 x i32>
+
+typedef int vec_int2 __attribute__((vector_size(8)));
+vec_int2 lax_vector_compare2(long long x, vec_int2 y) {
+  y = x == y;
+  return y;
+}
+
+// CHECK: define void @lax_vector_compare2(<2 x i32>* {{.*sret.*}}, i64 {{.*}}, i64 {{.*}})
+// CHECK: icmp eq <2 x i32>
diff --git a/test/CodeGen/vectorcall.c b/test/CodeGen/vectorcall.c
index 9ee35b1..b38d5e5 100644
--- a/test/CodeGen/vectorcall.c
+++ b/test/CodeGen/vectorcall.c
@@ -9,9 +9,9 @@
 // CHECK: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
 // X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
 
-struct Small { int a; };
+struct Small { int x; };
 void __vectorcall v3(int a, struct Small b, int c) {}
-// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, %struct.Small* byval align 4 %b, i32 inreg %c)
+// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, i32 %b.0, i32 inreg %c)
 // X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
 
 struct Large { int a[5]; };
diff --git a/test/CodeGen/wasm-varargs.c b/test/CodeGen/wasm-varargs.c
new file mode 100644
index 0000000..b8e488e
--- /dev/null
+++ b/test/CodeGen/wasm-varargs.c
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -o - -emit-llvm %s | FileCheck %s
+
+#include <stdarg.h>
+
+int test_i32(char *fmt, ...) {
+  va_list va;
+
+  va_start(va, fmt);
+  int v = va_arg(va, int);
+  va_end(va);
+
+  return v;
+}
+
+// CHECK-LABEL: define i32 @test_i32(i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[V:%[^,=]+]] = alloca i32, align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%[^,=]+]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK:   [[R4:%[^,=]+]] = load i32, i32* [[R3]], align 4
+// CHECK:   store i32 [[R4]], i32* [[V]], align 4
+// CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[R5:%[^,=]+]] = load i32, i32* [[V]], align 4
+// CHECK:   ret i32 [[R5]]
+// CHECK: }
+
+long long test_i64(char *fmt, ...) {
+  va_list va;
+
+  va_start(va, fmt);
+  long long v = va_arg(va, long long);
+  va_end(va);
+
+  return v;
+}
+
+// CHECK-LABEL: define i64 @test_i64(i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[V:%[^,=]+]] = alloca i64, align 8
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%[^,=]+]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[R0:%[^,=]+]] = ptrtoint i8* [[ARGP_CUR]] to i32
+// CHECK:   [[R1:%[^,=]+]] = add i32 [[R0]], 7
+// CHECK:   [[R2:%[^,=]+]] = and i32 [[R1]], -8
+// CHECK:   [[ARGP_CUR_ALIGNED:%[^,=]+]] = inttoptr i32 [[R2]] to i8*
+// CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i32 8
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to i64*
+// CHECK:   [[R4:%[^,=]+]] = load i64, i64* [[R3]], align 8
+// CHECK:   store i64 [[R4]], i64* [[V]], align 8
+// CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[R5:%[^,=]+]] = load i64, i64* [[V]], align 8
+// CHECK:   ret i64 [[R5]]
+// CHECK: }
+
+struct S {
+    int x;
+    int y;
+    int z;
+};
+
+struct S test_struct(char *fmt, ...) {
+  va_list va;
+
+  va_start(va, fmt);
+  struct S v = va_arg(va, struct S);
+  va_end(va);
+
+  return v;
+}
+
+// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret %agg.result, i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[V:%[^,=]+]] = alloca [[STRUCT_S]], align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%[^,=]+]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 12
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to [[STRUCT_S]]*
+// CHECK:   [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
+// CHECK:   [[R5:%[^,=]+]] = bitcast [[STRUCT_S]]* [[R3]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R4]], i8* [[R5]], i32 12, i32 4, i1 false)
+// CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[R6:%[^,=]+]] = bitcast [[STRUCT_S]]* %agg.result to i8*
+// CHECK:   [[R7:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R6]], i8* [[R7]], i32 12, i32 4, i1 false)
+// CHECK:   ret void
+// CHECK: }
diff --git a/test/CodeGen/windows-on-arm-itanium-thread-local.c b/test/CodeGen/windows-on-arm-itanium-thread-local.c
new file mode 100644
index 0000000..7f12c36
--- /dev/null
+++ b/test/CodeGen/windows-on-arm-itanium-thread-local.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple thumbv7--windows-itanium -fdeclspec -fms-compatibility -fms-compatibility-version=19.0 -S -emit-llvm -o - %s | FileCheck %s
+
+__declspec(thread) static void *c;
+void f(void *p) {
+  c = p;
+}
+
+// CHECK-LABEL: @f(i8* %p)
+// CHECK-NOT: call i8** @_ZTWL1c()
+// CHECK: call arm_aapcs_vfpcc i8** @_ZTWL1c()
+
diff --git a/test/CodeGen/windows-on-arm-tls-support.c b/test/CodeGen/windows-on-arm-tls-support.c
new file mode 100644
index 0000000..dfb8b27
--- /dev/null
+++ b/test/CodeGen/windows-on-arm-tls-support.c
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -triple thumbv7--windows -fms-extensions -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+__declspec(thread) int i;
+
diff --git a/test/CodeGen/windows-struct-abi.c b/test/CodeGen/windows-struct-abi.c
index 4b4a6f1..1631f61 100644
--- a/test/CodeGen/windows-struct-abi.c
+++ b/test/CodeGen/windows-struct-abi.c
@@ -10,7 +10,7 @@
 
 void receive_f1(struct f1 a0) { }
 
-// CHECK: define void @receive_f1(%struct.f1* byval align 4 %a0)
+// CHECK: define void @receive_f1(float %a0.0)
 
 struct f2 {
   float f;
@@ -23,7 +23,7 @@
 
 void receive_f2(struct f2 a0) { }
 
-// CHECK: define void @receive_f2(%struct.f2* byval align 4 %a0)
+// CHECK: define void @receive_f2(float %a0.0, float %a0.1)
 
 struct f4 {
   float f;
@@ -38,5 +38,5 @@
 
 void receive_f4(struct f4 a0) { }
 
-// CHECK: define void @receive_f4(%struct.f4* byval align 4 %a0)
+// CHECK: define void @receive_f4(float %a0.0, float %a0.1, float %a0.2, float %a0.3)
 
diff --git a/test/CodeGen/wrapv-lshr-sanitize.c b/test/CodeGen/wrapv-lshr-sanitize.c
new file mode 100644
index 0000000..c09dab7
--- /dev/null
+++ b/test/CodeGen/wrapv-lshr-sanitize.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -fsanitize=shift-base -emit-llvm %s -o - -triple x86_64-linux-gnu -fwrapv | FileCheck %s
+
+// CHECK-LABEL: @lsh_overflow
+int lsh_overflow(int a, int b) {
+  // CHECK-NOT: br
+  // CHECK-NOT: call void @__ubsan_
+  // CHECK-NOT: call void @llvm.trap
+  
+  // CHECK:      %[[RET:.*]] = shl i32
+  // CHECK-NEXT: ret i32 %[[RET]]
+  return a << b;
+}
diff --git a/test/CodeGen/x86_32-arguments-win32.c b/test/CodeGen/x86_32-arguments-win32.c
index f8b0995..7b27fc7 100644
--- a/test/CodeGen/x86_32-arguments-win32.c
+++ b/test/CodeGen/x86_32-arguments-win32.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: define i64 @f1_1()
-// CHECK-LABEL: define void @f1_2(%struct.s1* byval align 4 %a0)
+// CHECK-LABEL: define void @f1_2(i32 %a0.0, i32 %a0.1)
 struct s1 {
   int a;
   int b;
@@ -31,7 +31,7 @@
 struct s4 f4_1(void) { while (1) {} }
 
 // CHECK-LABEL: define i64 @f5_1()
-// CHECK-LABEL: define void @f5_2(%struct.s5* byval align 4)
+// CHECK-LABEL: define void @f5_2(double %a0.0)
 struct s5 {
   double a;
 };
@@ -39,7 +39,7 @@
 void f5_2(struct s5 a0) {}
 
 // CHECK-LABEL: define i32 @f6_1()
-// CHECK-LABEL: define void @f6_2(%struct.s6* byval align 4 %a0)
+// CHECK-LABEL: define void @f6_2(float %a0.0)
 struct s6 {
   float a;
 };
diff --git a/test/CodeGen/x86_32-xsave.c b/test/CodeGen/x86_32-xsave.c
index da5d38a..b475d63 100644
--- a/test/CodeGen/x86_32-xsave.c
+++ b/test/CodeGen/x86_32-xsave.c
@@ -1,18 +1,18 @@
-// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE
-// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE
+// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE
+// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=i686-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE
 
-// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT
-// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT
+// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT
+// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT
 
-// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC
-// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC
+// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC
+// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC
 
-// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES
-// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES
+// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES
+// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=i686-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES
 
 void test() {
-  unsigned long long tmp_ULLi;
-  void*              tmp_vp;
+  unsigned long long tmp_ULLi = 0;
+  void*              tmp_vp = 0;
 
 #ifdef TEST_XSAVE
 // XSAVE: [[tmp_vp_1:%[0-9a-zA-z]+]] = load i8*, i8** %tmp_vp, align 4
diff --git a/test/CodeGen/x86_64-arguments.c b/test/CodeGen/x86_64-arguments.c
index e3b853d..9f375d7 100644
--- a/test/CodeGen/x86_64-arguments.c
+++ b/test/CodeGen/x86_64-arguments.c
@@ -261,12 +261,12 @@
 typedef unsigned long long v1i64 __attribute__((__vector_size__(8)));
 
 // rdar://8359248
-// CHECK-LABEL: define i64 @f34(i64 %arg.coerce)
+// CHECK-LABEL: define double @f34(double %arg.coerce)
 v1i64 f34(v1i64 arg) { return arg; }
 
 
 // rdar://8358475
-// CHECK-LABEL: define i64 @f35(i64 %arg.coerce)
+// CHECK-LABEL: define double @f35(double %arg.coerce)
 typedef unsigned long v1i64_2 __attribute__((__vector_size__(8)));
 v1i64_2 f35(v1i64_2 arg) { return arg+arg; }
 
@@ -470,13 +470,14 @@
 s512 x55;
 __m512 x56;
 
-// Even on AVX512, aggregates of size larger than four eightbytes have class
-// MEMORY (AVX512 draft 0.3 3.2.3p2 Rule 1).
+// On AVX512, aggregates which contain a __m512 type are classified as SSE/SSEUP
+// as per https://github.com/hjl-tools/x86-psABI/commit/30f9c9 3.2.3p2 Rule 1
 //
-// CHECK: declare void @f55(%struct.s512* byval align 64)
+// AVX512: declare void @f55(<16 x float>)
+// NO-AVX512: declare void @f55(%struct.s512* byval align 64)
 void f55(s512 x);
 
-// However, __m512 has type SSE/SSEUP on AVX512.
+// __m512 has type SSE/SSEUP on AVX512.
 //
 // AVX512: declare void @f56(<16 x float>)
 // NO-AVX512: declare void @f56(<16 x float>* byval align 64)
@@ -535,3 +536,12 @@
   f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0i);
   f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0i);
 }
+
+struct t65 {
+  __m256 m;
+  int : 0;
+};
+// SSE-LABEL: @f65(%struct.t65* byval align 32 %{{[^,)]+}})
+// AVX: @f65(<8 x float> %{{[^,)]+}})
+void f65(struct t65 a0) {
+}
diff --git a/test/CodeGen/x86_64-longdouble.c b/test/CodeGen/x86_64-longdouble.c
index 8baf4d1..8aeddb4 100644
--- a/test/CodeGen/x86_64-longdouble.c
+++ b/test/CodeGen/x86_64-longdouble.c
@@ -11,12 +11,12 @@
 // Android uses fp128 for long double but other x86_64 targets use x86_fp80.
 
 long double dataLD = 1.0L;
-// ANDROID: @dataLD = global fp128 0xL00000000000000003FFF000000000000, align 16
-// GNU: @dataLD = global x86_fp80 0xK3FFF8000000000000000, align 16
+// ANDROID: @dataLD = local_unnamed_addr global fp128 0xL00000000000000003FFF000000000000, align 16
+// GNU: @dataLD = local_unnamed_addr global x86_fp80 0xK3FFF8000000000000000, align 16
 
 long double _Complex dataLDC = {1.0L, 1.0L};
-// ANDROID: @dataLDC = global { fp128, fp128 } { fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000003FFF000000000000 }, align 16
-// GNU: @dataLDC = global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 16
+// ANDROID: @dataLDC = local_unnamed_addr global { fp128, fp128 } { fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000003FFF000000000000 }, align 16
+// GNU: @dataLDC = local_unnamed_addr global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 16
 
 long double TestLD(long double x) {
   return x * x;
diff --git a/test/CodeGen/x86_64-xsave.c b/test/CodeGen/x86_64-xsave.c
index ecdb725..496e982 100644
--- a/test/CodeGen/x86_64-xsave.c
+++ b/test/CodeGen/x86_64-xsave.c
@@ -1,18 +1,18 @@
-// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE
-// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVE
+// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE
+// RUN: %clang_cc1 %s -DTEST_XSAVE -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVE
 
-// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT
-// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEOPT
+// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT
+// RUN: %clang_cc1 %s -DTEST_XSAVEOPT -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaveopt -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEOPT
 
-// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC
-// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVEC
+// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC
+// RUN: %clang_cc1 %s -DTEST_XSAVEC -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsavec -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVEC
 
-// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES
-// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s --check-prefix=XSAVES
+// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES
+// RUN: %clang_cc1 %s -DTEST_XSAVES -O0 -triple=x86_64-unknown-unknown -target-feature +xsave -target-feature +xsaves -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=XSAVES
 
 void test() {
-  unsigned long long tmp_ULLi;
-  void*              tmp_vp;
+  unsigned long long tmp_ULLi = 0;
+  void*              tmp_vp = 0;
 
 #ifdef TEST_XSAVE
 // XSAVE: [[tmp_vp_1:%[0-9a-zA-z]+]] = load i8*, i8** %tmp_vp, align 8
diff --git a/test/CodeGen/xop-builtins.c b/test/CodeGen/xop-builtins.c
index 5f0f20d..09ceb20 100644
--- a/test/CodeGen/xop-builtins.c
+++ b/test/CodeGen/xop-builtins.c
@@ -1,390 +1,393 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+
 __m128i test_mm_maccs_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccs_epi16
-  // CHECK: @llvm.x86.xop.vpmacssww
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_maccs_epi16(a, b, c);
 }
 
 __m128i test_mm_macc_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macc_epi16
-  // CHECK: @llvm.x86.xop.vpmacsww
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_macc_epi16(a, b, c);
 }
 
 __m128i test_mm_maccsd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccsd_epi16
-  // CHECK: @llvm.x86.xop.vpmacsswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maccsd_epi16(a, b, c);
 }
 
 __m128i test_mm_maccd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccd_epi16
-  // CHECK: @llvm.x86.xop.vpmacswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maccd_epi16(a, b, c);
 }
 
 __m128i test_mm_maccs_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccs_epi32
-  // CHECK: @llvm.x86.xop.vpmacssdd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maccs_epi32(a, b, c);
 }
 
 __m128i test_mm_macc_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macc_epi32
-  // CHECK: @llvm.x86.xop.vpmacsdd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_macc_epi32(a, b, c);
 }
 
 __m128i test_mm_maccslo_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccslo_epi32
-  // CHECK: @llvm.x86.xop.vpmacssdql
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maccslo_epi32(a, b, c);
 }
 
 __m128i test_mm_macclo_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macclo_epi32
-  // CHECK: @llvm.x86.xop.vpmacsdql
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_macclo_epi32(a, b, c);
 }
 
 __m128i test_mm_maccshi_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccshi_epi32
-  // CHECK: @llvm.x86.xop.vpmacssdqh
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maccshi_epi32(a, b, c);
 }
 
 __m128i test_mm_macchi_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macchi_epi32
-  // CHECK: @llvm.x86.xop.vpmacsdqh
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_macchi_epi32(a, b, c);
 }
 
 __m128i test_mm_maddsd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maddsd_epi16
-  // CHECK: @llvm.x86.xop.vpmadcsswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maddsd_epi16(a, b, c);
 }
 
 __m128i test_mm_maddd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maddd_epi16
-  // CHECK: @llvm.x86.xop.vpmadcswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maddd_epi16(a, b, c);
 }
 
 __m128i test_mm_haddw_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_haddw_epi8
-  // CHECK: @llvm.x86.xop.vphaddbw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %{{.*}})
   return _mm_haddw_epi8(a);
 }
 
 __m128i test_mm_haddd_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epi8
-  // CHECK: @llvm.x86.xop.vphaddbd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %{{.*}})
   return _mm_haddd_epi8(a);
 }
 
 __m128i test_mm_haddq_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epi8
-  // CHECK: @llvm.x86.xop.vphaddbq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %{{.*}})
   return _mm_haddq_epi8(a);
 }
 
 __m128i test_mm_haddd_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epi16
-  // CHECK: @llvm.x86.xop.vphaddwd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %{{.*}})
   return _mm_haddd_epi16(a);
 }
 
 __m128i test_mm_haddq_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epi16
-  // CHECK: @llvm.x86.xop.vphaddwq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %{{.*}})
   return _mm_haddq_epi16(a);
 }
 
 __m128i test_mm_haddq_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epi32
-  // CHECK: @llvm.x86.xop.vphadddq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %{{.*}})
   return _mm_haddq_epi32(a);
 }
 
 __m128i test_mm_haddw_epu8(__m128i a) {
   // CHECK-LABEL: test_mm_haddw_epu8
-  // CHECK: @llvm.x86.xop.vphaddubw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %{{.*}})
   return _mm_haddw_epu8(a);
 }
 
 __m128i test_mm_haddd_epu8(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epu8
-  // CHECK: @llvm.x86.xop.vphaddubd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %{{.*}})
   return _mm_haddd_epu8(a);
 }
 
 __m128i test_mm_haddq_epu8(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epu8
-  // CHECK: @llvm.x86.xop.vphaddubq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %{{.*}})
   return _mm_haddq_epu8(a);
 }
 
 __m128i test_mm_haddd_epu16(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epu16
-  // CHECK: @llvm.x86.xop.vphadduwd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %{{.*}})
   return _mm_haddd_epu16(a);
 }
 
 __m128i test_mm_haddq_epu16(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epu16
-  // CHECK: @llvm.x86.xop.vphadduwq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %{{.*}})
   return _mm_haddq_epu16(a);
 }
 
 __m128i test_mm_haddq_epu32(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epu32
-  // CHECK: @llvm.x86.xop.vphaddudq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %{{.*}})
   return _mm_haddq_epu32(a);
 }
 
 __m128i test_mm_hsubw_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_hsubw_epi8
-  // CHECK: @llvm.x86.xop.vphsubbw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %{{.*}})
   return _mm_hsubw_epi8(a);
 }
 
 __m128i test_mm_hsubd_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_hsubd_epi16
-  // CHECK: @llvm.x86.xop.vphsubwd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %{{.*}})
   return _mm_hsubd_epi16(a);
 }
 
 __m128i test_mm_hsubq_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_hsubq_epi32
-  // CHECK: @llvm.x86.xop.vphsubdq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %{{.*}})
   return _mm_hsubq_epi32(a);
 }
 
 __m128i test_mm_cmov_si128(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_cmov_si128
-  // CHECK: @llvm.x86.xop.vpcmov
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_cmov_si128(a, b, c);
 }
 
 __m256i test_mm256_cmov_si256(__m256i a, __m256i b, __m256i c) {
   // CHECK-LABEL: test_mm256_cmov_si256
-  // CHECK: @llvm.x86.xop.vpcmov.256
+  // CHECK: call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_cmov_si256(a, b, c);
 }
 
 __m128i test_mm_perm_epi8(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_perm_epi8
-  // CHECK: @llvm.x86.xop.vpperm
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_perm_epi8(a, b, c);
 }
 
 __m128i test_mm_rot_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi8
-  // CHECK: @llvm.x86.xop.vprotb
+  // CHECK: call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_rot_epi8(a, b);
 }
 
 __m128i test_mm_rot_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi16
-  // CHECK: @llvm.x86.xop.vprotw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_rot_epi16(a, b);
 }
 
 __m128i test_mm_rot_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi32
-  // CHECK: @llvm.x86.xop.vprotd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_rot_epi32(a, b);
 }
 
 __m128i test_mm_rot_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi64
-  // CHECK: @llvm.x86.xop.vprotq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_rot_epi64(a, b);
 }
 
 __m128i test_mm_roti_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi8
-  // CHECK: @llvm.x86.xop.vprotbi
+  // CHECK: call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %{{.*}}, i8 1)
   return _mm_roti_epi8(a, 1);
 }
 
 __m128i test_mm_roti_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi16
-  // CHECK: @llvm.x86.xop.vprotwi
+  // CHECK: call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %{{.*}}, i8 50)
   return _mm_roti_epi16(a, 50);
 }
 
 __m128i test_mm_roti_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi32
-  // CHECK: @llvm.x86.xop.vprotdi
+  // CHECK: call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %{{.*}}, i8 -30)
   return _mm_roti_epi32(a, -30);
 }
 
 __m128i test_mm_roti_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi64
-  // CHECK: @llvm.x86.xop.vprotqi
+  // CHECK: call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %{{.*}}, i8 100)
   return _mm_roti_epi64(a, 100);
 }
 
 __m128i test_mm_shl_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi8
-  // CHECK: @llvm.x86.xop.vpshlb
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_shl_epi8(a, b);
 }
 
 __m128i test_mm_shl_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi16
-  // CHECK: @llvm.x86.xop.vpshlw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_shl_epi16(a, b);
 }
 
 __m128i test_mm_shl_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi32
-  // CHECK: @llvm.x86.xop.vpshld
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_shl_epi32(a, b);
 }
 
 __m128i test_mm_shl_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi64
-  // CHECK: @llvm.x86.xop.vpshlq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_shl_epi64(a, b);
 }
 
 __m128i test_mm_sha_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi8
-  // CHECK: @llvm.x86.xop.vpshab
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_sha_epi8(a, b);
 }
 
 __m128i test_mm_sha_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi16
-  // CHECK: @llvm.x86.xop.vpshaw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sha_epi16(a, b);
 }
 
 __m128i test_mm_sha_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi32
-  // CHECK: @llvm.x86.xop.vpshad
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sha_epi32(a, b);
 }
 
 __m128i test_mm_sha_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi64
-  // CHECK: @llvm.x86.xop.vpshaq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sha_epi64(a, b);
 }
 
 __m128i test_mm_com_epu8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu8
-  // CHECK: @llvm.x86.xop.vpcomub
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 0)
   return _mm_com_epu8(a, b, 0);
 }
 
 __m128i test_mm_com_epu16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu16
-  // CHECK: @llvm.x86.xop.vpcomuw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, i8 0)
   return _mm_com_epu16(a, b, 0);
 }
 
 __m128i test_mm_com_epu32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu32
-  // CHECK: @llvm.x86.xop.vpcomud
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 0)
   return _mm_com_epu32(a, b, 0);
 }
 
 __m128i test_mm_com_epu64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu64
-  // CHECK: @llvm.x86.xop.vpcomuq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 0)
   return _mm_com_epu64(a, b, 0);
 }
 
 __m128i test_mm_com_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi8
-  // CHECK: @llvm.x86.xop.vpcomb
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 0)
   return _mm_com_epi8(a, b, 0);
 }
 
 __m128i test_mm_com_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi16
-  // CHECK: @llvm.x86.xop.vpcomw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, i8 0)
   return _mm_com_epi16(a, b, 0);
 }
 
 __m128i test_mm_com_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi32
-  // CHECK: @llvm.x86.xop.vpcomd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 0)
   return _mm_com_epi32(a, b, 0);
 }
 
 __m128i test_mm_com_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi64
-  // CHECK: @llvm.x86.xop.vpcomq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 0)
   return _mm_com_epi64(a, b, 0);
 }
 
 __m128d test_mm_permute2_pd(__m128d a, __m128d b, __m128i c) {
   // CHECK-LABEL: test_mm_permute2_pd
-  // CHECK: @llvm.x86.xop.vpermil2pd
+  // CHECK: call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i64> %{{.*}}, i8 0)
   return _mm_permute2_pd(a, b, c, 0);
 }
 
 __m256d test_mm256_permute2_pd(__m256d a, __m256d b, __m256i c) {
   // CHECK-LABEL: test_mm256_permute2_pd
-  // CHECK: @llvm.x86.xop.vpermil2pd.256
+  // CHECK: call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i64> %{{.*}}, i8 0)
   return _mm256_permute2_pd(a, b, c, 0);
 }
 
 __m128 test_mm_permute2_ps(__m128 a, __m128 b, __m128i c) {
   // CHECK-LABEL: test_mm_permute2_ps
-  // CHECK: @llvm.x86.xop.vpermil2ps
+  // CHECK: call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> %{{.*}}, i8 0)
   return _mm_permute2_ps(a, b, c, 0);
 }
 
 __m256 test_mm256_permute2_ps(__m256 a, __m256 b, __m256i c) {
   // CHECK-LABEL: test_mm256_permute2_ps
-  // CHECK: @llvm.x86.xop.vpermil2ps.256
+  // CHECK: call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> %{{.*}}, i8 0)
   return _mm256_permute2_ps(a, b, c, 0);
 }
 
 __m128 test_mm_frcz_ss(__m128 a) {
   // CHECK-LABEL: test_mm_frcz_ss
-  // CHECK: @llvm.x86.xop.vfrcz.ss
+  // CHECK: call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %{{.*}})
   return _mm_frcz_ss(a);
 }
 
 __m128d test_mm_frcz_sd(__m128d a) {
   // CHECK-LABEL: test_mm_frcz_sd
-  // CHECK: @llvm.x86.xop.vfrcz.sd
+  // CHECK: call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %{{.*}})
   return _mm_frcz_sd(a);
 }
 
 __m128 test_mm_frcz_ps(__m128 a) {
   // CHECK-LABEL: test_mm_frcz_ps
-  // CHECK: @llvm.x86.xop.vfrcz.ps
+  // CHECK: call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %{{.*}})
   return _mm_frcz_ps(a);
 }
 
 __m128d test_mm_frcz_pd(__m128d a) {
   // CHECK-LABEL: test_mm_frcz_pd
-  // CHECK: @llvm.x86.xop.vfrcz.pd
+  // CHECK: call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %{{.*}})
   return _mm_frcz_pd(a);
 }
 
 __m256 test_mm256_frcz_ps(__m256 a) {
   // CHECK-LABEL: test_mm256_frcz_ps
-  // CHECK: @llvm.x86.xop.vfrcz.ps.256
+  // CHECK: call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %{{.*}})
   return _mm256_frcz_ps(a);
 }
 
 __m256d test_mm256_frcz_pd(__m256d a) {
   // CHECK-LABEL: test_mm256_frcz_pd
-  // CHECK: @llvm.x86.xop.vfrcz.pd.256
+  // CHECK: call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %{{.*}})
   return _mm256_frcz_pd(a);
 }
diff --git a/test/CodeGen/xray-attributes-supported.cpp b/test/CodeGen/xray-attributes-supported.cpp
new file mode 100644
index 0000000..d70b3aa
--- /dev/null
+++ b/test/CodeGen/xray-attributes-supported.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 %s -fxray-instrument -std=c++11 -x c++ -emit-llvm -o - -triple x86_64-unknown-linux-gnu | FileCheck %s
+
+// Make sure that the LLVM attributes for XRay-annotated functions do show up.
+[[clang::xray_always_instrument]] void foo() {
+// CHECK: define void @_Z3foov() #0
+};
+
+[[clang::xray_never_instrument]] void bar() {
+// CHECK: define void @_Z3barv() #1
+};
+
+// CHECK: #0 = {{.*}}"function-instrument"="xray-always"
+// CHECK: #1 = {{.*}}"function-instrument"="xray-never"
diff --git a/test/CodeGenCUDA/Inputs/cuda-initializers.h b/test/CodeGenCUDA/Inputs/cuda-initializers.h
new file mode 100644
index 0000000..186b160
--- /dev/null
+++ b/test/CodeGenCUDA/Inputs/cuda-initializers.h
@@ -0,0 +1,145 @@
+// CUDA struct types with interesting initialization properties.
+// Keep in sync with ../SemaCUDA/Inputs/cuda-initializers.h.
+
+// Base classes with different initializer variants.
+
+// trivial constructor -- allowed
+struct T {
+  int t;
+};
+
+// empty constructor
+struct EC {
+  int ec;
+  __device__ EC() {}     // -- allowed
+  __device__ EC(int) {}  // -- not allowed
+};
+
+// empty destructor
+struct ED {
+  __device__ ~ED() {}     // -- allowed
+};
+
+struct ECD {
+  __device__ ECD() {}     // -- allowed
+  __device__ ~ECD() {}    // -- allowed
+};
+
+// empty templated constructor -- allowed with no arguments
+struct ETC {
+  template <typename... T> __device__ ETC(T...) {}
+};
+
+// undefined constructor -- not allowed
+struct UC {
+  int uc;
+  __device__ UC();
+};
+
+// undefined destructor -- not allowed
+struct UD {
+  int ud;
+  __device__ ~UD();
+};
+
+// empty constructor w/ initializer list -- not allowed
+struct ECI {
+  int eci;
+  __device__ ECI() : eci(1) {}
+};
+
+// non-empty constructor -- not allowed
+struct NEC {
+  int nec;
+  __device__ NEC() { nec = 1; }
+};
+
+// non-empty destructor -- not allowed
+struct NED {
+  int ned;
+  __device__ ~NED() { ned = 1; }
+};
+
+// no-constructor,  virtual method -- not allowed
+struct NCV {
+  int ncv;
+  __device__ virtual void vm() {}
+};
+
+// virtual destructor -- not allowed.
+struct VD {
+  __device__ virtual ~VD() {}
+};
+
+// dynamic in-class field initializer -- not allowed
+__device__ int f();
+struct NCF {
+  int ncf = f();
+};
+
+// static in-class field initializer.  NVCC does not allow it, but
+// clang generates static initializer for this, so we'll accept it.
+// We still can't use it on __shared__ vars as they don't allow *any*
+// initializers.
+struct NCFS {
+  int ncfs = 3;
+};
+
+// undefined templated constructor -- not allowed
+struct UTC {
+  template <typename... T> __device__ UTC(T...);
+};
+
+// non-empty templated constructor -- not allowed
+struct NETC {
+  int netc;
+  template <typename... T> __device__ NETC(T...) { netc = 1; }
+};
+
+// Regular base class -- allowed
+struct T_B_T : T {};
+
+// Encapsulated object of allowed class -- allowed
+struct T_F_T {
+  T t;
+};
+
+// array of allowed objects -- allowed
+struct T_FA_T {
+  T t[2];
+};
+
+
+// Calling empty base class initializer is OK
+struct EC_I_EC : EC {
+  __device__ EC_I_EC() : EC() {}
+};
+
+// .. though passing arguments is not allowed.
+struct EC_I_EC1 : EC {
+  __device__ EC_I_EC1() : EC(1) {}
+};
+
+// Virtual base class -- not allowed
+struct T_V_T : virtual T {};
+
+// Inherited from or encapsulated class with non-empty constructor --
+// not allowed
+struct T_B_NEC : NEC {};
+struct T_F_NEC {
+  NEC nec;
+};
+struct T_FA_NEC {
+  NEC nec[2];
+};
+
+
+// Inherited from or encapsulated class with non-empty destructor --
+// not allowed
+struct T_B_NED : NED {};
+struct T_F_NED {
+  NED ned;
+};
+struct T_FA_NED {
+  NED ned[2];
+};
diff --git a/test/CodeGenCUDA/Inputs/cuda.h b/test/CodeGenCUDA/Inputs/cuda.h
index a9a4595..9b9f43a 100644
--- a/test/CodeGenCUDA/Inputs/cuda.h
+++ b/test/CodeGenCUDA/Inputs/cuda.h
@@ -18,3 +18,5 @@
 
 int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                       cudaStream_t stream = 0);
+
+extern "C" __device__ int printf(const char*, ...);
diff --git a/test/CodeGenCUDA/address-spaces.cu b/test/CodeGenCUDA/address-spaces.cu
index 31cba95..449529b 100644
--- a/test/CodeGenCUDA/address-spaces.cu
+++ b/test/CodeGenCUDA/address-spaces.cu
@@ -25,8 +25,6 @@
 // CHECK: @_ZZ5func3vE1a = internal addrspace(3) global float 0.000000e+00
 // CHECK: @_ZZ5func4vE1a = internal addrspace(3) global float 0.000000e+00
 // CHECK: @b = addrspace(3) global float undef
-// CHECK: @c = addrspace(3) global %struct.c undef
-// CHECK  @d = addrspace(3) global %struct.d undef
 
 __device__ void foo() {
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*)
@@ -38,14 +36,6 @@
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @k to i32*)
   k++;
 
-  static int li;
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @_ZZ3foovE2li to i32*)
-  li++;
-
-  __constant__ int lj;
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(4)* @_ZZ3foovE2lj to i32*)
-  lj++;
-
   __shared__ int lk;
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32*)
   lk++;
@@ -102,32 +92,3 @@
 }
 // CHECK: define float* @_Z5func5v()
 // CHECK: ret float* addrspacecast (float addrspace(3)* @b to float*)
-
-struct StructWithCtor {
-  __device__ StructWithCtor(): data(1) {}
-  __device__ StructWithCtor(const StructWithCtor &second): data(second.data) {}
-  __device__ int getData() { return data; }
-  int data;
-};
-
-__device__ int construct_shared_struct() {
-// CHECK-LABEL: define i32 @_Z23construct_shared_structv()
-  __shared__ StructWithCtor s;
-// CHECK: call void @_ZN14StructWithCtorC1Ev(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1s to %struct.StructWithCtor*))
-  __shared__ StructWithCtor t(s);
-// CHECK: call void @_ZN14StructWithCtorC1ERKS_(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*), %struct.StructWithCtor* dereferenceable(4) addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1s to %struct.StructWithCtor*))
-  return t.getData();
-// CHECK: call i32 @_ZN14StructWithCtor7getDataEv(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*))
-}
-
-// Make sure we allow __shared__ structures with default or empty constructors.
-struct c {
-  int i;
-};
-__shared__ struct c c;
-
-struct d {
-  int i;
-  d() {}
-};
-__shared__ struct d d;
diff --git a/test/CodeGenCUDA/alias.cu b/test/CodeGenCUDA/alias.cu
new file mode 100644
index 0000000..6efff6b
--- /dev/null
+++ b/test/CodeGenCUDA/alias.cu
@@ -0,0 +1,17 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// Check that we don't generate an alias from "foo" to the mangled name for
+// ns::foo() -- nvptx doesn't support aliases.
+
+namespace ns {
+extern "C" {
+// CHECK-NOT: @foo = internal alias
+__device__ __attribute__((used)) static int foo() { return 0; }
+}
+}
diff --git a/test/CodeGenCUDA/convergent.cu b/test/CodeGenCUDA/convergent.cu
new file mode 100644
index 0000000..6827c57
--- /dev/null
+++ b/test/CodeGenCUDA/convergent.cu
@@ -0,0 +1,45 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix DEVICE %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s | \
+// RUN:  FileCheck -check-prefix HOST %s
+
+#include "Inputs/cuda.h"
+
+// DEVICE: Function Attrs:
+// DEVICE-SAME: convergent
+// DEVICE-NEXT: define void @_Z3foov
+__device__ void foo() {}
+
+// HOST: Function Attrs:
+// HOST-NOT: convergent
+// HOST-NEXT: define void @_Z3barv
+// DEVICE: Function Attrs:
+// DEVICE-SAME: convergent
+// DEVICE-NEXT: define void @_Z3barv
+__host__ __device__ void baz();
+__host__ __device__ void bar() {
+  // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
+  baz();
+  // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
+  int x;
+  asm ("trap;" : "=l"(x));
+  // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
+  asm volatile ("trap;");
+}
+
+// DEVICE: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
+// DEVICE: attributes [[BAZ_ATTR]] = {
+// DEVICE-SAME: convergent
+// DEVICE-SAME: }
+// DEVICE: attributes [[CALL_ATTR]] = { convergent }
+// DEVICE: attributes [[ASM_ATTR]] = { convergent
+
+// HOST: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
+// HOST: attributes [[BAZ_ATTR]] = {
+// HOST-NOT: convergent
+// HOST-SAME: }
diff --git a/test/CodeGenCUDA/cuda-builtin-vars.cu b/test/CodeGenCUDA/cuda-builtin-vars.cu
index 834e16d..c2159f5 100644
--- a/test/CodeGenCUDA/cuda-builtin-vars.cu
+++ b/test/CodeGenCUDA/cuda-builtin-vars.cu
@@ -6,21 +6,21 @@
 __attribute__((global))
 void kernel(int *out) {
   int i = 0;
-  out[i++] = threadIdx.x; // CHECK: call i32 @llvm.ptx.read.tid.x()
-  out[i++] = threadIdx.y; // CHECK: call i32 @llvm.ptx.read.tid.y()
-  out[i++] = threadIdx.z; // CHECK: call i32 @llvm.ptx.read.tid.z()
+  out[i++] = threadIdx.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  out[i++] = threadIdx.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  out[i++] = threadIdx.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 
-  out[i++] = blockIdx.x; // CHECK: call i32 @llvm.ptx.read.ctaid.x()
-  out[i++] = blockIdx.y; // CHECK: call i32 @llvm.ptx.read.ctaid.y()
-  out[i++] = blockIdx.z; // CHECK: call i32 @llvm.ptx.read.ctaid.z()
+  out[i++] = blockIdx.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  out[i++] = blockIdx.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  out[i++] = blockIdx.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
 
-  out[i++] = blockDim.x; // CHECK: call i32 @llvm.ptx.read.ntid.x()
-  out[i++] = blockDim.y; // CHECK: call i32 @llvm.ptx.read.ntid.y()
-  out[i++] = blockDim.z; // CHECK: call i32 @llvm.ptx.read.ntid.z()
+  out[i++] = blockDim.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  out[i++] = blockDim.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  out[i++] = blockDim.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 
-  out[i++] = gridDim.x; // CHECK: call i32 @llvm.ptx.read.nctaid.x()
-  out[i++] = gridDim.y; // CHECK: call i32 @llvm.ptx.read.nctaid.y()
-  out[i++] = gridDim.z; // CHECK: call i32 @llvm.ptx.read.nctaid.z()
+  out[i++] = gridDim.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  out[i++] = gridDim.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  out[i++] = gridDim.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 
   out[i++] = warpSize; // CHECK: store i32 32,
 
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu
index 7f5e159..3376803 100644
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -1,15 +1,56 @@
-// RUN: %clang_cc1 -emit-llvm %s -fcuda-include-gpubinary %s -o - | FileCheck %s
+// RUN: echo "GPU binary would be here" > %t
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
+// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
 
+#ifndef NOGLOBALS
+// CHECK-DAG: @device_var = internal global i32
+__device__ int device_var;
+
+// CHECK-DAG: @constant_var = internal global i32
+__constant__ int constant_var;
+
+// CHECK-DAG: @shared_var = internal global i32
+__shared__ int shared_var;
+
+// Make sure host globals don't get internalized...
+// CHECK-DAG: @host_var = global i32
+int host_var;
+// ... and that extern vars remain external.
+// CHECK-DAG: @ext_host_var = external global i32
+extern int ext_host_var;
+
+// Shadows for external device-side variables are *definitions* of
+// those variables.
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __device__ int ext_device_var;
+// CHECK-DAG: @ext_constant_var = internal global i32
+extern __constant__ int ext_constant_var;
+
+void use_pointers() {
+  int *p;
+  p = &device_var;
+  p = &constant_var;
+  p = &shared_var;
+  p = &host_var;
+  p = &ext_device_var;
+  p = &ext_constant_var;
+  p = &ext_host_var;
+}
+
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
 // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// CHECK: private unnamed_addr constant{{.*}}\00"
+// CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
+// CHECK-SAME: section ".nv_fatbin", align 8
 // * constant struct that wraps GPU binary
 // CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
-// CHECK:       { i32 1180844977, i32 1, {{.*}}, i8* null }
+// CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
+// CHECK-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
 // CHECK: @__cuda_gpubin_handle = internal global i8** null
 // * Make sure our constructor/destructor was added to global ctor/dtor list.
@@ -31,10 +72,16 @@
 // CHECK: call{{.*}}cudaConfigureCall
 // CHECK: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+#endif
 
-// Test that we've built a function to register kernels
-// CHECK: define internal void @__cuda_register_kernels
+// Test that we've built a function to register kernels and global vars.
+// CHECK: define internal void @__cuda_register_globals
 // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
+// CHECK: ret void
 
 // Test that we've built contructor..
 // CHECK: define internal void @__cuda_module_ctor
@@ -42,11 +89,26 @@
 // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
 //   .. stores return value in __cuda_gpubin_handle
 // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
-//   .. and then calls __cuda_register_kernels
-// CHECK-NEXT: call void @__cuda_register_kernels
+//   .. and then calls __cuda_register_globals
+// CHECK-NEXT: call void @__cuda_register_globals
 
 // Test that we've created destructor.
 // CHECK: define internal void @__cuda_module_dtor
 // CHECK: load{{.*}}__cuda_gpubin_handle
 // CHECK-NEXT: call void @__cudaUnregisterFatBinary
 
+// There should be no __cuda_register_globals if we have no
+// device-side globals, but we still need to register GPU binary.
+// Skip GPU binary string first.
+// NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
+// NOGLOBALS-NOT: define internal void @__cuda_register_globals
+// NOGLOBALS: define internal void @__cuda_module_ctor
+// NOGLOBALS: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
+// NOGLOBALS-NOT: call void @__cuda_register_globals
+// NOGLOBALS: define internal void @__cuda_module_dtor
+// NOGLOBALS: call void @__cudaUnregisterFatBinary
+
+// There should be no constructors/destructors if we have no GPU binary.
+// NOGPUBIN-NOT: define internal void @__cuda_register_globals
+// NOGPUBIN-NOT: define internal void @__cuda_module_ctor
+// NOGPUBIN-NOT: define internal void @__cuda_module_dtor
diff --git a/test/CodeGenCUDA/device-var-init.cu b/test/CodeGenCUDA/device-var-init.cu
new file mode 100644
index 0000000..6f2d929
--- /dev/null
+++ b/test/CodeGenCUDA/device-var-init.cu
@@ -0,0 +1,198 @@
+// REQUIRES: nvptx-registered-target
+
+// Make sure we don't allow dynamic initialization for device
+// variables, but accept empty constructors allowed by CUDA.
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 \
+// RUN:     -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck %s
+
+#ifdef __clang__
+#include "Inputs/cuda.h"
+#endif
+
+// Use the types we share with Sema tests.
+#include "Inputs/cuda-initializers.h"
+
+__device__ int d_v;
+// CHECK: @d_v = addrspace(1) externally_initialized global i32 0,
+__shared__ int s_v;
+// CHECK: @s_v = addrspace(3) global i32 undef,
+__constant__ int c_v;
+// CHECK: addrspace(4) externally_initialized global i32 0,
+
+__device__ int d_v_i = 1;
+// CHECK: @d_v_i = addrspace(1) externally_initialized global i32 1,
+
+// trivial constructor -- allowed
+__device__ T d_t;
+// CHECK: @d_t = addrspace(1) externally_initialized global %struct.T zeroinitializer
+__shared__ T s_t;
+// CHECK: @s_t = addrspace(3) global %struct.T undef,
+__constant__ T c_t;
+// CHECK: @c_t = addrspace(4) externally_initialized global %struct.T zeroinitializer,
+
+__device__ T d_t_i = {2};
+// CHECK: @d_t_i = addrspace(1) externally_initialized global %struct.T { i32 2 },
+__constant__ T c_t_i = {2};
+// CHECK: @c_t_i = addrspace(4) externally_initialized global %struct.T { i32 2 },
+
+// empty constructor
+__device__ EC d_ec;
+// CHECK: @d_ec = addrspace(1) externally_initialized global %struct.EC zeroinitializer,
+__shared__ EC s_ec;
+// CHECK: @s_ec = addrspace(3) global %struct.EC undef,
+__constant__ EC c_ec;
+// CHECK: @c_ec = addrspace(4) externally_initialized global %struct.EC zeroinitializer,
+
+// empty destructor
+__device__ ED d_ed;
+// CHECK: @d_ed = addrspace(1) externally_initialized global %struct.ED zeroinitializer,
+__shared__ ED s_ed;
+// CHECK: @s_ed = addrspace(3) global %struct.ED undef,
+__constant__ ED c_ed;
+// CHECK: @c_ed = addrspace(4) externally_initialized global %struct.ED zeroinitializer,
+
+__device__ ECD d_ecd;
+// CHECK: @d_ecd = addrspace(1) externally_initialized global %struct.ECD zeroinitializer,
+__shared__ ECD s_ecd;
+// CHECK: @s_ecd = addrspace(3) global %struct.ECD undef,
+__constant__ ECD c_ecd;
+// CHECK: @c_ecd = addrspace(4) externally_initialized global %struct.ECD zeroinitializer,
+
+// empty templated constructor -- allowed with no arguments
+__device__ ETC d_etc;
+// CHECK: @d_etc = addrspace(1) externally_initialized global %struct.ETC zeroinitializer,
+__shared__ ETC s_etc;
+// CHECK: @s_etc = addrspace(3) global %struct.ETC undef,
+__constant__ ETC c_etc;
+// CHECK: @c_etc = addrspace(4) externally_initialized global %struct.ETC zeroinitializer,
+
+__device__ NCFS d_ncfs;
+// CHECK: @d_ncfs = addrspace(1) externally_initialized global %struct.NCFS { i32 3 }
+__constant__ NCFS c_ncfs;
+// CHECK: @c_ncfs = addrspace(4) externally_initialized global %struct.NCFS { i32 3 }
+
+// Regular base class -- allowed
+__device__ T_B_T d_t_b_t;
+// CHECK: @d_t_b_t = addrspace(1) externally_initialized global %struct.T_B_T zeroinitializer,
+__shared__ T_B_T s_t_b_t;
+// CHECK: @s_t_b_t = addrspace(3) global %struct.T_B_T undef,
+__constant__ T_B_T c_t_b_t;
+// CHECK: @c_t_b_t = addrspace(4) externally_initialized global %struct.T_B_T zeroinitializer,
+
+// Encapsulated object of allowed class -- allowed
+__device__ T_F_T d_t_f_t;
+// CHECK: @d_t_f_t = addrspace(1) externally_initialized global %struct.T_F_T zeroinitializer,
+__shared__ T_F_T s_t_f_t;
+// CHECK: @s_t_f_t = addrspace(3) global %struct.T_F_T undef,
+__constant__ T_F_T c_t_f_t;
+// CHECK: @c_t_f_t = addrspace(4) externally_initialized global %struct.T_F_T zeroinitializer,
+
+// array of allowed objects -- allowed
+__device__ T_FA_T d_t_fa_t;
+// CHECK: @d_t_fa_t = addrspace(1) externally_initialized global %struct.T_FA_T zeroinitializer,
+__shared__ T_FA_T s_t_fa_t;
+// CHECK: @s_t_fa_t = addrspace(3) global %struct.T_FA_T undef,
+__constant__ T_FA_T c_t_fa_t;
+// CHECK: @c_t_fa_t = addrspace(4) externally_initialized global %struct.T_FA_T zeroinitializer,
+
+
+// Calling empty base class initializer is OK
+__device__ EC_I_EC d_ec_i_ec;
+// CHECK: @d_ec_i_ec = addrspace(1) externally_initialized global %struct.EC_I_EC zeroinitializer,
+__shared__ EC_I_EC s_ec_i_ec;
+// CHECK: @s_ec_i_ec = addrspace(3) global %struct.EC_I_EC undef,
+__constant__ EC_I_EC c_ec_i_ec;
+// CHECK: @c_ec_i_ec = addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer,
+
+// We should not emit global initializers for device-side variables.
+// CHECK-NOT: @__cxx_global_var_init
+
+// Make sure that initialization restrictions do not apply to local
+// variables.
+__device__ void df() {
+  T t;
+  // CHECK-NOT: call
+  EC ec;
+  // CHECK:   call void @_ZN2ECC1Ev(%struct.EC* %ec)
+  ED ed;
+  // CHECK-NOT: call
+  ECD ecd;
+  // CHECK:   call void @_ZN3ECDC1Ev(%struct.ECD* %ecd)
+  ETC etc;
+  // CHECK:   call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %etc)
+  UC uc;
+  // undefined constructor -- not allowed
+  // CHECK:   call void @_ZN2UCC1Ev(%struct.UC* %uc)
+  UD ud;
+  // undefined destructor -- not allowed
+  // CHECK-NOT: call
+  ECI eci;
+  // empty constructor w/ initializer list -- not allowed
+  // CHECK:   call void @_ZN3ECIC1Ev(%struct.ECI* %eci)
+  NEC nec;
+  // non-empty constructor -- not allowed
+  // CHECK:   call void @_ZN3NECC1Ev(%struct.NEC* %nec)
+  // non-empty destructor -- not allowed
+  NED ned;
+  // no-constructor,  virtual method -- not allowed
+  // CHECK:   call void @_ZN3NCVC1Ev(%struct.NCV* %ncv)
+  NCV ncv;
+  // CHECK-NOT: call
+  VD vd;
+  // CHECK:   call void @_ZN2VDC1Ev(%struct.VD* %vd)
+  NCF ncf;
+  // CHECK:   call void @_ZN3NCFC1Ev(%struct.NCF* %ncf)
+  NCFS ncfs;
+  // CHECK:   call void @_ZN4NCFSC1Ev(%struct.NCFS* %ncfs)
+  UTC utc;
+  // CHECK:   call void @_ZN3UTCC1IJEEEDpT_(%struct.UTC* %utc)
+  NETC netc;
+  // CHECK:   call void @_ZN4NETCC1IJEEEDpT_(%struct.NETC* %netc)
+  T_B_T t_b_t;
+  // CHECK-NOT: call
+  T_F_T t_f_t;
+  // CHECK-NOT: call
+  T_FA_T t_fa_t;
+  // CHECK-NOT: call
+  EC_I_EC ec_i_ec;
+  // CHECK:   call void @_ZN7EC_I_ECC1Ev(%struct.EC_I_EC* %ec_i_ec)
+  EC_I_EC1 ec_i_ec1;
+  // CHECK:   call void @_ZN8EC_I_EC1C1Ev(%struct.EC_I_EC1* %ec_i_ec1)
+  T_V_T t_v_t;
+  // CHECK:   call void @_ZN5T_V_TC1Ev(%struct.T_V_T* %t_v_t)
+  T_B_NEC t_b_nec;
+  // CHECK:   call void @_ZN7T_B_NECC1Ev(%struct.T_B_NEC* %t_b_nec)
+  T_F_NEC t_f_nec;
+  // CHECK:   call void @_ZN7T_F_NECC1Ev(%struct.T_F_NEC* %t_f_nec)
+  T_FA_NEC t_fa_nec;
+  // CHECK:   call void @_ZN8T_FA_NECC1Ev(%struct.T_FA_NEC* %t_fa_nec)
+  T_B_NED t_b_ned;
+  // CHECK-NOT: call
+  T_F_NED t_f_ned;
+  // CHECK-NOT: call
+  T_FA_NED t_fa_ned;
+  // CHECK-NOT: call
+  static __shared__ EC s_ec;
+  // CHECK-NOT: call void @_ZN2ECC1Ev(%struct.EC* addrspacecast (%struct.EC addrspace(3)* @_ZZ2dfvE4s_ec to %struct.EC*))
+  static __shared__ ETC s_etc;
+  // CHECK-NOT: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* addrspacecast (%struct.ETC addrspace(3)* @_ZZ2dfvE5s_etc to %struct.ETC*))
+
+  // anchor point separating constructors and destructors
+  df(); // CHECK: call void @_Z2dfv()
+
+  // Verify that we only call non-empty destructors
+  // CHECK-NEXT: call void @_ZN8T_FA_NEDD1Ev(%struct.T_FA_NED* %t_fa_ned) #6
+  // CHECK-NEXT: call void @_ZN7T_F_NEDD1Ev(%struct.T_F_NED* %t_f_ned) #6
+  // CHECK-NEXT: call void @_ZN7T_B_NEDD1Ev(%struct.T_B_NED* %t_b_ned) #6
+  // CHECK-NEXT: call void @_ZN2VDD1Ev(%struct.VD* %vd)
+  // CHECK-NEXT: call void @_ZN3NEDD1Ev(%struct.NED* %ned)
+  // CHECK-NEXT: call void @_ZN2UDD1Ev(%struct.UD* %ud)
+  // CHECK-NEXT: call void @_ZN3ECDD1Ev(%struct.ECD* %ecd)
+  // CHECK-NEXT: call void @_ZN2EDD1Ev(%struct.ED* %ed)
+
+  // CHECK-NEXT: ret void
+}
+
+// We should not emit global init function.
+// CHECK-NOT: @_GLOBAL__sub_I
diff --git a/test/CodeGenCUDA/filter-decl.cu b/test/CodeGenCUDA/filter-decl.cu
index 023ae61..bc744a0 100644
--- a/test/CodeGenCUDA/filter-decl.cu
+++ b/test/CodeGenCUDA/filter-decl.cu
@@ -9,15 +9,15 @@
 // CHECK-DEVICE-NOT: module asm "file scope asm is host only"
 __asm__("file scope asm is host only");
 
-// CHECK-HOST-NOT: constantdata = externally_initialized global
+// CHECK-HOST: constantdata = internal global
 // CHECK-DEVICE: constantdata = externally_initialized global
 __constant__ char constantdata[256];
 
-// CHECK-HOST-NOT: devicedata = externally_initialized global
+// CHECK-HOST: devicedata = internal global
 // CHECK-DEVICE: devicedata = externally_initialized global
 __device__ char devicedata[256];
 
-// CHECK-HOST-NOT: shareddata = global
+// CHECK-HOST: shareddata = internal global
 // CHECK-DEVICE: shareddata = global
 __shared__ char shareddata[256];
 
diff --git a/test/CodeGenCUDA/flush-denormals.cu b/test/CodeGenCUDA/flush-denormals.cu
new file mode 100644
index 0000000..e528d7b
--- /dev/null
+++ b/test/CodeGenCUDA/flush-denormals.cu
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fcuda-is-device \
+// RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix NOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \
+// RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
+
+#include "Inputs/cuda.h"
+
+// Checks that device function calls get emitted with the "nvptx-f32ftz"
+// attribute set to "true" when we compile CUDA device code with
+// -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
+// or absence of -fcuda-flush-denormals-to-zero in a module flag.
+
+// CHECK-LABEL: define void @foo() #0
+extern "C" __device__ void foo() {}
+
+// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
+// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+
+// FTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]}
+// FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+
+// NOFTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]}
+// NOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
diff --git a/test/CodeGenCUDA/fp-contract.cu b/test/CodeGenCUDA/fp-contract.cu
new file mode 100644
index 0000000..070ebae
--- /dev/null
+++ b/test/CodeGenCUDA/fp-contract.cu
@@ -0,0 +1,32 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// By default we should fuse multiply/add into fma instruction.
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+
+// Explicit -ffp-contract=fast
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix ENABLED %s
+
+// Explicit -ffp-contract=on -- fusing by front-end (disabled).
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+
+// Explicit -ffp-contract=off should disable instruction fusing.
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+
+
+#include "Inputs/cuda.h"
+
+__host__ __device__ float func(float a, float b, float c) { return a + b * c; }
+// ENABLED:       fma.rn.f32
+// ENABLED-NEXT:  st.param.f32
+
+// DISABLED:      mul.rn.f32
+// DISABLED-NEXT: add.rn.f32
+// DISABLED-NEXT: st.param.f32
diff --git a/test/CodeGenCUDA/function-overload.cu b/test/CodeGenCUDA/function-overload.cu
index a12ef82..380304a 100644
--- a/test/CodeGenCUDA/function-overload.cu
+++ b/test/CodeGenCUDA/function-overload.cu
@@ -1,168 +1,18 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// Make sure we handle target overloads correctly.
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
-// RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
+// Make sure we handle target overloads correctly.  Most of this is checked in
+// sema, but special functions like constructors and destructors are here.
+//
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \
-// RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
 
-// Check target overloads handling with disabled call target checks.
-// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads -o - %s \
-// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST \
-// RUN:    -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-HOST-NC %s
-// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -emit-llvm \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads \
-// RUN:    -fcuda-is-device -o - %s \
-// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \
-// RUN:    -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-DEVICE-NC %s
-
 #include "Inputs/cuda.h"
 
-typedef int (*fp_t)(void);
-typedef void (*gp_t)(void);
-
-// CHECK-HOST: @hp = global i32 ()* @_Z1hv
-// CHECK-HOST: @chp = global i32 ()* @ch
-// CHECK-HOST: @dhp = global i32 ()* @_Z2dhv
-// CHECK-HOST: @cdhp = global i32 ()* @cdh
-// CHECK-HOST: @gp = global void ()* @_Z1gv
-
-// CHECK-BOTH-LABEL: define i32 @_Z2dhv()
-__device__ int dh(void) { return 1; }
-// CHECK-DEVICE: ret i32 1
-__host__ int dh(void) { return 2; }
-// CHECK-HOST:   ret i32 2
-
-// CHECK-BOTH-LABEL: define i32 @_Z2hdv()
-__host__ __device__ int hd(void) { return 3; }
-// CHECK-BOTH:   ret i32 3
-
-// CHECK-DEVICE-LABEL: define i32 @_Z1dv()
-__device__ int d(void) { return 8; }
-// CHECK-DEVICE:   ret i32 8
-
-// CHECK-HOST-LABEL: define i32 @_Z1hv()
-__host__ int h(void) { return 9; }
-// CHECK-HOST:   ret i32 9
-
-// CHECK-BOTH-LABEL: define void @_Z1gv()
-__global__ void g(void) {}
-// CHECK-BOTH:   ret void
-
-// mangled names of extern "C" __host__ __device__ functions clash
-// with those of their __host__/__device__ counterparts, so
-// overloading of extern "C" functions can only happen for __host__
-// and __device__ functions -- we never codegen them in the same
-// compilation and therefore mangled name conflict is not a problem.
-
-// CHECK-BOTH-LABEL: define i32 @cdh()
-extern "C" __device__ int cdh(void) {return 10;}
-// CHECK-DEVICE:   ret i32 10
-extern "C" __host__ int cdh(void) {return 11;}
-// CHECK-HOST:     ret i32 11
-
-// CHECK-DEVICE-LABEL: define i32 @cd()
-extern "C" __device__ int cd(void) {return 12;}
-// CHECK-DEVICE:   ret i32 12
-
-// CHECK-HOST-LABEL: define i32 @ch()
-extern "C" __host__ int ch(void) {return 13;}
-// CHECK-HOST:     ret i32 13
-
-// CHECK-BOTH-LABEL: define i32 @chd()
-extern "C" __host__ __device__ int chd(void) {return 14;}
-// CHECK-BOTH:     ret i32 14
-
-// CHECK-HOST-LABEL: define void @_Z5hostfv()
-__host__ void hostf(void) {
-#if defined (NOCHECKS)
-  fp_t dp = d;   // CHECK-HOST-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-HOST-NC: store {{.*}} @cd, {{.*}} %cdp,
-#endif
-  fp_t hp = h; // CHECK-HOST: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-HOST: store {{.*}} @ch, {{.*}} %chp,
-  fp_t dhp = dh; // CHECK-HOST: store {{.*}} @_Z2dhv, {{.*}} %dhp,
-  fp_t cdhp = cdh; // CHECK-HOST: store {{.*}} @cdh, {{.*}} %cdhp,
-  fp_t hdp = hd; // CHECK-HOST: store {{.*}} @_Z2hdv, {{.*}} %hdp,
-  fp_t chdp = chd; // CHECK-HOST: store {{.*}} @chd, {{.*}} %chdp,
-  gp_t gp = g; // CHECK-HOST: store {{.*}} @_Z1gv, {{.*}} %gp,
-
-#if defined (NOCHECKS)
-  d();     // CHECK-HOST-NC: call i32 @_Z1dv()
-  cd();    // CHECK-HOST-NC: call i32 @cd()
-#endif
-  h();     // CHECK-HOST: call i32 @_Z1hv()
-  ch();    // CHECK-HOST: call i32 @ch()
-  dh();    // CHECK-HOST: call i32 @_Z2dhv()
-  cdh();   // CHECK-HOST: call i32 @cdh()
-  g<<<0,0>>>();  // CHECK-HOST: call void @_Z1gv()
-}
-
-// CHECK-DEVICE-LABEL: define void @_Z7devicefv()
-__device__ void devicef(void) {
-  fp_t dp = d;   // CHECK-DEVICE: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-DEVICE: store {{.*}} @cd, {{.*}} %cdp,
-#if defined (NOCHECKS)
-  fp_t hp = h; // CHECK-DEVICE-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-DEVICE-NC: store {{.*}} @ch, {{.*}} %chp,
-#endif
-  fp_t dhp = dh; // CHECK-DEVICE: store {{.*}} @_Z2dhv, {{.*}} %dhp,
-  fp_t cdhp = cdh; // CHECK-DEVICE: store {{.*}} @cdh, {{.*}} %cdhp,
-  fp_t hdp = hd; // CHECK-DEVICE: store {{.*}} @_Z2hdv, {{.*}} %hdp,
-  fp_t chdp = chd; // CHECK-DEVICE: store {{.*}} @chd, {{.*}} %chdp,
-
-  d();     // CHECK-DEVICE: call i32 @_Z1dv()
-  cd();    // CHECK-DEVICE: call i32 @cd()
-#if defined (NOCHECKS)
-  h();     // CHECK-DEVICE-NC: call i32 @_Z1hv()
-  ch();    // CHECK-DEVICE-NC: call i32 @ch()
-#endif
-  dh();    // CHECK-DEVICE: call i32 @_Z2dhv()
-  cdh();   // CHECK-DEVICE: call i32 @cdh()
-}
-
-// CHECK-BOTH-LABEL: define void @_Z11hostdevicefv()
-__host__ __device__ void hostdevicef(void) {
-#if defined (NOCHECKS)
-  fp_t dp = d;   // CHECK-BOTH-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-BOTH-NC: store {{.*}} @cd, {{.*}} %cdp,
-  fp_t hp = h; // CHECK-BOTH-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-BOTH-NC: store {{.*}} @ch, {{.*}} %chp,
-#endif
-  fp_t dhp = dh; // CHECK-BOTH: store {{.*}} @_Z2dhv, {{.*}} %dhp,
-  fp_t cdhp = cdh; // CHECK-BOTH: store {{.*}} @cdh, {{.*}} %cdhp,
-  fp_t hdp = hd; // CHECK-BOTH: store {{.*}} @_Z2hdv, {{.*}} %hdp,
-  fp_t chdp = chd; // CHECK-BOTH: store {{.*}} @chd, {{.*}} %chdp,
-#if defined (NOCHECKS) && !defined(__CUDA_ARCH__)
-  gp_t gp = g; // CHECK-HOST-NC: store {{.*}} @_Z1gv, {{.*}} %gp,
-#endif
-
-#if defined (NOCHECKS)
-  d();     // CHECK-BOTH-NC: call i32 @_Z1dv()
-  cd();    // CHECK-BOTH-NC: call i32 @cd()
-  h();     // CHECK-BOTH-NC: call i32 @_Z1hv()
-  ch();    // CHECK-BOTH-NC: call i32 @ch()
-#endif
-  dh();    // CHECK-BOTH: call i32 @_Z2dhv()
-  cdh();   // CHECK-BOTH: call i32 @cdh()
-#if defined (NOCHECKS) && !defined(__CUDA_ARCH__)
-  g<<<0,0>>>();  // CHECK-HOST-NC: call void @_Z1gv()
-#endif
-}
-
-// Test for address of overloaded function resolution in the global context.
-fp_t hp = h;
-fp_t chp = ch;
-fp_t dhp = dh;
-fp_t cdhp = cdh;
-gp_t gp = g;
-
-int x;
 // Check constructors/destructors for D/H functions
+int x;
 struct s_cd_dh {
   __host__ s_cd_dh() { x = 11; }
   __device__ s_cd_dh() { x = 12; }
@@ -211,4 +61,3 @@
 // CHECK-HOST:   store i32 21,
 // CHECK-DEVICE: store i32 22,
 // CHECK-BOTH: ret void
-
diff --git a/test/CodeGenCUDA/host-device-calls-host.cu b/test/CodeGenCUDA/host-device-calls-host.cu
deleted file mode 100644
index 8140f61..0000000
--- a/test/CodeGenCUDA/host-device-calls-host.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-allow-host-calls-from-host-device -fcuda-is-device -Wno-cuda-compat -emit-llvm -o - | FileCheck %s
-
-#include "Inputs/cuda.h"
-
-extern "C"
-void host_function() {}
-
-// CHECK-LABEL: define void @hd_function_a
-extern "C"
-__host__ __device__ void hd_function_a() {
-  // CHECK: call void @host_function
-  host_function();
-}
-
-// CHECK: declare void @host_function
-
-// CHECK-LABEL: define void @hd_function_b
-extern "C"
-__host__ __device__ void hd_function_b(bool b) { if (b) host_function(); }
-
-// CHECK-LABEL: define void @device_function_b
-extern "C"
-__device__ void device_function_b() { hd_function_b(false); }
-
-// CHECK-LABEL: define void @global_function
-extern "C"
-__global__ void global_function() {
-  // CHECK: call void @device_function_b
-  device_function_b();
-}
-
-// CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
diff --git a/test/CodeGenCUDA/kernel-args-alignment.cu b/test/CodeGenCUDA/kernel-args-alignment.cu
new file mode 100644
index 0000000..4bd5eb1
--- /dev/null
+++ b/test/CodeGenCUDA/kernel-args-alignment.cu
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
+// RUN:  FileCheck -check-prefix HOST -check-prefix CHECK %s
+
+// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
+
+#include "Inputs/cuda.h"
+
+struct U {
+  short x;
+} __attribute__((packed));
+
+struct S {
+  int *ptr;
+  char a;
+  U u;
+};
+
+// Clang should generate a packed LLVM struct for S (denoted by the <>s),
+// otherwise this test isn't interesting.
+// CHECK: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
+
+static_assert(alignof(S) == 8, "Unexpected alignment.");
+
+// HOST-LABEL: @_Z6kernelc1SPi
+// Marshalled kernel args should be:
+//   1. offset 0, width 1
+//   2. offset 8 (because alignof(S) == 8), width 16
+//   3. offset 24, width 8
+// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
+// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+
+// DEVICE-LABEL: @_Z6kernelc1SPi
+// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
+__global__ void kernel(char a, S s, int *b) {}
diff --git a/test/CodeGenCUDA/launch-bounds.cu b/test/CodeGenCUDA/launch-bounds.cu
index ecbd0ad..6c369c6 100644
--- a/test/CodeGenCUDA/launch-bounds.cu
+++ b/test/CodeGenCUDA/launch-bounds.cu
@@ -79,3 +79,8 @@
 }
 // CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"maxntidx",
 // CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"minctasm",
+
+const char constchar = 12;
+__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
+// CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
+// CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
diff --git a/test/CodeGenCUDA/link-device-bitcode.cu b/test/CodeGenCUDA/link-device-bitcode.cu
index de3d39c..869fcb1 100644
--- a/test/CodeGenCUDA/link-device-bitcode.cu
+++ b/test/CodeGenCUDA/link-device-bitcode.cu
@@ -4,10 +4,10 @@
 // REQUIRES: nvptx-registered-target
 //
 // Prepare bitcode file to link with
-// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
-// RUN:    %S/Inputs/device-code.ll
-// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t-2.bc \
-// RUN:    %S/Inputs/device-code-2.ll
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc \
+// RUN:    -disable-llvm-passes -o %t.bc %S/Inputs/device-code.ll
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc \
+// RUN:    -disable-llvm-passes -o %t-2.bc %S/Inputs/device-code-2.ll
 //
 // Make sure function in device-code gets linked in and internalized.
 // RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
diff --git a/test/CodeGenCUDA/printf-aggregate.cu b/test/CodeGenCUDA/printf-aggregate.cu
new file mode 100644
index 0000000..2e703b8
--- /dev/null
+++ b/test/CodeGenCUDA/printf-aggregate.cu
@@ -0,0 +1,17 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: not %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
+// RUN:   -o - %s 2>&1 | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// Check that we don't crash when asked to printf a non-scalar arg.
+struct Struct {
+  int x;
+  int y;
+};
+__device__ void PrintfNonScalar() {
+  // CHECK: cannot compile this non-scalar arg to printf
+  printf("%d", Struct());
+}
diff --git a/test/CodeGenCUDA/printf.cu b/test/CodeGenCUDA/printf.cu
new file mode 100644
index 0000000..dc3f4ea
--- /dev/null
+++ b/test/CodeGenCUDA/printf.cu
@@ -0,0 +1,43 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
+// RUN:   -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+extern "C" __device__ int vprintf(const char*, const char*);
+
+// Check a simple call to printf end-to-end.
+// CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double }
+__device__ int CheckSimple() {
+  // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]]
+  // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
+  const char* fmt = "%d %lld %f";
+  // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0
+  // CHECK: store i32 1, i32* [[PTR0]], align 4
+  // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1
+  // CHECK: store i64 2, i64* [[PTR1]], align 8
+  // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2
+  // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8
+  // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8*
+  // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]])
+  // CHECK: ret i32 [[RET]]
+  return printf(fmt, 1, 2ll, 3.0);
+}
+
+__device__ void CheckNoArgs() {
+  // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
+  printf("hello, world!");
+}
+
+// Check that printf's alloca happens in the entry block, not inside the if
+// statement.
+__device__ bool foo();
+__device__ void CheckAllocaIsInEntryBlock() {
+  // CHECK: alloca %printf_args
+  // CHECK: call {{.*}} @_Z3foov()
+  if (foo()) {
+    printf("%d", 42);
+  }
+}
diff --git a/test/CodeGenCUDA/ptx-kernels.cu b/test/CodeGenCUDA/ptx-kernels.cu
index 6280e60..1d330bd 100644
--- a/test/CodeGenCUDA/ptx-kernels.cu
+++ b/test/CodeGenCUDA/ptx-kernels.cu
@@ -19,8 +19,17 @@
 
 // Make sure host-instantiated kernels are preserved on device side.
 template <typename T> __global__ void templated_kernel(T param) {}
-// CHECK-LABEL: define weak_odr void @_Z16templated_kernelIiEvT_
-void host_function() { templated_kernel<<<0,0>>>(0); }
+// CHECK-DAG: define void @_Z16templated_kernelIiEvT_(
+
+namespace {
+__global__ void anonymous_ns_kernel() {}
+// CHECK-DAG: define void @_ZN12_GLOBAL__N_119anonymous_ns_kernelEv(
+}
+
+void host_function() {
+  templated_kernel<<<0, 0>>>(0);
+  anonymous_ns_kernel<<<0,0>>>();
+}
 
 // CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
 // CHECK: !{{[0-9]+}} = !{void (i32)* @_Z16templated_kernelIiEvT_, !"kernel", i32 1}
diff --git a/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp b/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
index 3828388..dd5fa3e 100644
--- a/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
+++ b/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
@@ -12,10 +12,11 @@
 
 // CHECK: declare i32 @_Z1cv() [[NUW_RN:#[0-9]+]]
 // CHECK: declare i32 @_Z1pv() [[NUW_RO:#[0-9]+]]
-// CHECK: declare i32 @_Z1tv() [[TF]]
+// CHECK: declare i32 @_Z1tv() [[TF2:#[0-9]+]]
 
 // CHECK: attributes [[TF]] = { {{.*}} }
 // CHECK: attributes [[NUW_RN]] = { nounwind readnone{{.*}} }
 // CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[TF2]] = { {{.*}} }
 // CHECK: attributes [[NUW_RN_CALL]] = { nounwind readnone }
 // CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
diff --git a/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp b/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp
index 17fa456..34a1cfa 100644
--- a/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp
+++ b/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp
@@ -1,5 +1,6 @@
 
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "PR16214",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "PR16214",{{.*}} line: [[@LINE+2]],{{.*}}
+// CHECK-NOT: DIFlagFwdDecl
 struct PR16214 {
   int i;
 };
@@ -10,7 +11,8 @@
 bar b;
 
 namespace PR14467 {
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+2]],{{.*}}
+// CHECK-NOT: DIFlagFwdDecl
 struct foo {
 };
 
@@ -21,7 +23,7 @@
 }
 
 namespace test1 {
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} flags: DIFlagFwdDecl
 struct foo {
 };
 
@@ -35,7 +37,8 @@
 // FIXME: if we were a bit fancier, we could realize that the 'foo' type is only
 // required because of the 'bar' type which is not required at all (or might
 // only be required to be declared)
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+2]],{{.*}}
+// CHECK-NOT: DIFlagFwdDecl
 struct foo {
 };
 
diff --git a/test/CodeGenCXX/PR26569.cpp b/test/CodeGenCXX/PR26569.cpp
new file mode 100644
index 0000000..3e2d2ff
--- /dev/null
+++ b/test/CodeGenCXX/PR26569.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-llvm -O1 -disable-llvm-optzns %s -o - | FileCheck %s
+
+class A {
+  virtual void m_fn1();
+};
+template <typename>
+class B : virtual A {};
+
+extern template class __declspec(dllimport) B<int>;
+class __declspec(dllexport) C : B<int> {};
+
+// CHECK-DAG: @[[VTABLE_C:.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4C@@6B@" to i8*), i8* bitcast (void (%class.A*)* @"\01?m_fn1@A@@EAEXXZ" to i8*)]
+// CHECK-DAG: @[[VTABLE_B:.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4?$B@H@@6B@" to i8*), i8* bitcast (void (%class.A*)* @"\01?m_fn1@A@@EAEXXZ" to i8*)], comdat($"\01??_S?$B@H@@6B@")
+// CHECK-DAG: @[[VTABLE_A:.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4A@@6B@" to i8*), i8* bitcast (void (%class.A*)* @"\01?m_fn1@A@@EAEXXZ" to i8*)], comdat($"\01??_7A@@6B@")
+
+// CHECK-DAG: @"\01??_7C@@6B@" = dllexport unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[VTABLE_C]], i32 0, i32 1)
+// CHECK-DAG: @"\01??_S?$B@H@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[VTABLE_B]], i32 0, i32 1)
+// CHECK-DAG: @"\01??_7A@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[VTABLE_A]], i32 0, i32 1)
+
+// CHECK-DAG: @"\01??_8?$B@H@@7B@" = available_externally dllimport unnamed_addr constant [2 x i32] [i32 0, i32 4]
diff --git a/test/CodeGenCXX/PR28220.cpp b/test/CodeGenCXX/PR28220.cpp
new file mode 100644
index 0000000..6262c87
--- /dev/null
+++ b/test/CodeGenCXX/PR28220.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+
+template <typename>
+struct __declspec(dllimport) S {
+  S();
+};
+
+template <typename T>
+struct __declspec(dllimport) U {
+  static S<T> u;
+};
+
+template <typename T>
+S<T> U<T>::u;
+
+template S<int> U<int>::u;
+// CHECK-NOT: define internal void @"\01??__Eu@?$U@H@@2U?$S@H@@A@YAXXZ"(
+
+S<int> &i = U<int>::u;
diff --git a/test/CodeGenCXX/align-avx-complete-objects.cpp b/test/CodeGenCXX/align-avx-complete-objects.cpp
index 6ab17f5..ad4a914 100644
--- a/test/CodeGenCXX/align-avx-complete-objects.cpp
+++ b/test/CodeGenCXX/align-avx-complete-objects.cpp
@@ -13,7 +13,7 @@
 }
 
 // CHECK: [[R:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:  [[CALL:%.*]] = call noalias i8* @_Znwm(i64 32)
+// CHECK-NEXT:  [[CALL:%.*]] = call i8* @_Znwm(i64 32)
 // CHECK-NEXT:  [[ZERO:%.*]] = bitcast i8* [[CALL]] to <8 x float>*
 // CHECK-NEXT:  store <8 x float>* [[ZERO]], <8 x float>** [[P:%.*]], align 8
 // CHECK-NEXT:  [[ONE:%.*]] = load <8 x float>*, <8 x float>** [[P]], align 8
@@ -42,7 +42,7 @@
 }
 
 // CHECK: [[R:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:  [[CALL:%.*]] = call noalias i8* @_Znwm(i64 32)
+// CHECK-NEXT:  [[CALL:%.*]] = call i8* @_Znwm(i64 32)
 // CHECK-NEXT:  [[ZERO:%.*]] = bitcast i8* [[CALL]] to <8 x float>*
 // CHECK-NEXT:  store <8 x float>* [[ZERO]], <8 x float>** [[P:%.*]], align 8
 // CHECK-NEXT:  [[ONE:%.*]] = load <8 x float>*, <8 x float>** [[P]], align 8
diff --git a/test/CodeGenCXX/alignment.cpp b/test/CodeGenCXX/alignment.cpp
index 2a1fe71..4c44bad 100644
--- a/test/CodeGenCXX/alignment.cpp
+++ b/test/CodeGenCXX/alignment.cpp
@@ -32,7 +32,7 @@
     // CHECK: [[T2:%.*]] = or i8 [[T1]], [[T0]]
     // CHECK: store i8 [[T2]], i8* [[FIELD_P]], align 4
     b.onebit = int_source();
-    
+
     // CHECK: [[B_P:%.*]] = load [[B]]*, [[B]]**
     // CHECK: [[FIELD_P:%.*]] = bitcast [[B]]* [[B_P]] to i8*
     // CHECK: [[VALUE:%.*]] = load i8, i8* [[FIELD_P]], align 4
@@ -60,7 +60,7 @@
     // CHECK: [[T2:%.*]] = or i8 [[T1]], [[T0]]
     // CHECK: store i8 [[T2]], i8* [[FIELD_P]], align 2
     c.onebit = int_source();
-    
+
     // CHECK: [[C_P:%.*]] = load [[C]]*, [[C]]**
     // CHECK: [[T0:%.*]] = bitcast [[C]]* [[C_P]] to i8*
     // CHECK: [[T1:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 8
diff --git a/test/CodeGenCXX/arm.cpp b/test/CodeGenCXX/arm.cpp
index 11ae6b2..d0b896d 100644
--- a/test/CodeGenCXX/arm.cpp
+++ b/test/CodeGenCXX/arm.cpp
@@ -109,7 +109,7 @@
 
   void a() {
     // CHECK-LABEL: define void @_ZN5test31aEv()
-    // CHECK: call noalias i8* @_Znam(i32 48)
+    // CHECK: call i8* @_Znam(i32 48)
     // CHECK: store i32 4
     // CHECK: store i32 10
     A *x = new A[10];
@@ -122,7 +122,7 @@
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[OR:%.*]] = or i1
     // CHECK: [[SZ:%.*]] = select i1 [[OR]]
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[N]]
     A *x = new A[n];
@@ -130,7 +130,7 @@
 
   void c() {
     // CHECK-LABEL: define void @_ZN5test31cEv()
-    // CHECK: call  noalias i8* @_Znam(i32 808)
+    // CHECK: call  i8* @_Znam(i32 808)
     // CHECK: store i32 4
     // CHECK: store i32 200
     A (*x)[20] = new A[10][20];
@@ -143,7 +143,7 @@
     // CHECK: [[NE:%.*]] = mul i32 [[N]], 20
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[SZ:%.*]] = select
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[NE]]
     A (*x)[20] = new A[n][20];
@@ -182,7 +182,7 @@
 
   void a() {
     // CHECK-LABEL: define void @_ZN5test41aEv()
-    // CHECK: call noalias i8* @_Znam(i32 48)
+    // CHECK: call i8* @_Znam(i32 48)
     // CHECK: store i32 4
     // CHECK: store i32 10
     A *x = new A[10];
@@ -194,7 +194,7 @@
     // CHECK: @llvm.umul.with.overflow.i32(i32 [[N]], i32 4)
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[SZ:%.*]] = select
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[N]]
     A *x = new A[n];
@@ -202,7 +202,7 @@
 
   void c() {
     // CHECK-LABEL: define void @_ZN5test41cEv()
-    // CHECK: call  noalias i8* @_Znam(i32 808)
+    // CHECK: call  i8* @_Znam(i32 808)
     // CHECK: store i32 4
     // CHECK: store i32 200
     A (*x)[20] = new A[10][20];
@@ -215,7 +215,7 @@
     // CHECK: [[NE:%.*]] = mul i32 [[N]], 20
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[SZ:%.*]] = select
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[NE]]
     A (*x)[20] = new A[n][20];
@@ -383,7 +383,7 @@
 // CHECK-NEXT: [[OVERFLOW:%.*]] = or i1 [[O0]], [[O1]]
 // CHECK-NEXT: [[T3:%.*]] = extractvalue { i32, i1 } [[T2]], 0
 // CHECK-NEXT: [[T4:%.*]] = select i1 [[OVERFLOW]], i32 -1, i32 [[T3]]
-// CHECK-NEXT: [[ALLOC:%.*]] = call noalias i8* @_Znam(i32 [[T4]])
+// CHECK-NEXT: [[ALLOC:%.*]] = call i8* @_Znam(i32 [[T4]])
 // CHECK-NEXT: [[T0:%.*]] = bitcast i8* [[ALLOC]] to i32*
 // CHECK-NEXT: store i32 16, i32* [[T0]]
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[T0]], i32 1
diff --git a/test/CodeGenCXX/atomicinit.cpp b/test/CodeGenCXX/atomicinit.cpp
index 5e5174b..4c30ec3 100644
--- a/test/CodeGenCXX/atomicinit.cpp
+++ b/test/CodeGenCXX/atomicinit.cpp
@@ -1,13 +1,13 @@
 // RUN: %clang_cc1 %s -emit-llvm -O1 -o - -triple=i686-apple-darwin9 -std=c++11 | FileCheck %s
 
-// CHECK-DAG: @PR22043 = global i32 0, align 4
+// CHECK-DAG: @PR22043 = local_unnamed_addr global i32 0, align 4
 typedef _Atomic(int) AtomicInt;
 AtomicInt PR22043 = AtomicInt();
 
-// CHECK-DAG: @_ZN7PR180978constant1aE = global { i16, i8 } { i16 1, i8 6 }, align 4
-// CHECK-DAG: @_ZN7PR180978constant1bE = global { i16, i8 } { i16 2, i8 6 }, align 4
-// CHECK-DAG: @_ZN7PR180978constant1cE = global { i16, i8 } { i16 3, i8 6 }, align 4
-// CHECK-DAG: @_ZN7PR180978constant1yE = global { { i16, i8 }, i32 } { { i16, i8 } { i16 4, i8 6 }, i32 5 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1aE = local_unnamed_addr global { i16, i8 } { i16 1, i8 6 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1bE = local_unnamed_addr global { i16, i8 } { i16 2, i8 6 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1cE = local_unnamed_addr global { i16, i8 } { i16 3, i8 6 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1yE = local_unnamed_addr global { { i16, i8 }, i32 } { { i16, i8 } { i16 4, i8 6 }, i32 5 }, align 4
 
 struct A {
   _Atomic(int) i;
@@ -51,8 +51,8 @@
 };
 
 // CHECK-LABEL: define void @_ZN16AtomicBoolMemberC2Eb
-// CHECK: {{zext i1.*to i8}}
-// CHECK-NEXT: store i8
+// CHECK: zext i1 {{.*}} to i8
+// CHECK: store i8
 // CHECK-NEXT: ret void
 AtomicBoolMember::AtomicBoolMember(bool b) : ab(b) { }
 
diff --git a/test/CodeGenCXX/attr-mode-vector-types-tmpl.cpp b/test/CodeGenCXX/attr-mode-vector-types-tmpl.cpp
new file mode 100644
index 0000000..6373cf0
--- /dev/null
+++ b/test/CodeGenCXX/attr-mode-vector-types-tmpl.cpp
@@ -0,0 +1,108 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s
+
+template <class T>
+void CheckIntScalarTypes() {
+  // T will be substituted with 'int' and 'enum' types.
+
+  typedef T __attribute__((mode(QI))) T1;
+  typedef T __attribute__((mode(HI))) T2;
+  typedef T __attribute__((mode(SI))) T3;
+  typedef T __attribute__((mode(DI))) T4;
+
+  T1 a1;
+  T2 a2;
+  T3 a3;
+  T4 a4;
+}
+
+template <class T>
+void CheckIntVectorTypes() {
+  // T will be substituted with 'int'.
+
+  typedef int __attribute__((mode(QI))) __attribute__((vector_size(8)))  VT_11;
+  typedef T   __attribute__((mode(V8QI)))                                VT_12;
+  typedef int __attribute__((mode(SI))) __attribute__((vector_size(16))) VT_21;
+  typedef T   __attribute__((mode(V4SI)))                                VT_22;
+  typedef int __attribute__((mode(DI))) __attribute__((vector_size(64))) VT_31;
+  typedef T   __attribute__((mode(V8DI)))                                VT_32;
+
+  VT_11 v11;
+  VT_12 v12;
+
+  VT_21 v21;
+  VT_22 v22;
+
+  VT_31 v31;
+  VT_32 v32;
+}
+
+template <class T>
+void CheckFloatVectorTypes() {
+  // T will be substituted with 'float'.
+
+  typedef float __attribute__((mode(SF))) __attribute__((vector_size(128))) VT_41;
+  typedef T     __attribute__((mode(V32SF)))                                VT_42;
+  typedef float __attribute__((mode(DF))) __attribute__((vector_size(256))) VT_51;
+  typedef T     __attribute__((mode(V32DF)))                                VT_52;
+
+  VT_41 v41;
+  VT_42 v42;
+
+  VT_51 v51;
+  VT_52 v52;
+}
+
+template <class T>
+void CheckInstantiationWithModedType() {
+  T x1;
+}
+
+typedef enum { A1, B1 }                       EnumTy;
+typedef int __attribute__((mode(DI)))         Int64Ty1;
+typedef enum __attribute__((mode(DI))) { A2 } Int64Ty2;
+typedef int __attribute__((mode(V8HI)))       IntVecTy1;
+
+void test() {
+
+  // CHECK: define {{.*}} void @_Z19CheckIntScalarTypesIiEvv()
+  // CHECK: %{{.+}} = alloca i8
+  // CHECK: %{{.+}} = alloca i16
+  // CHECK: %{{.+}} = alloca i32
+  // CHECK: %{{.+}} = alloca i64
+  CheckIntScalarTypes<int>();
+
+  // CHECK: define {{.*}} void @_Z19CheckIntScalarTypesI6EnumTyEvv()
+  // CHECK: %{{.+}} = alloca i8
+  // CHECK: %{{.+}} = alloca i16
+  // CHECK: %{{.+}} = alloca i32
+  // CHECK: %{{.+}} = alloca i64
+  CheckIntScalarTypes<EnumTy>();
+
+  // CHECK: define {{.*}} void @_Z19CheckIntVectorTypesIiEvv()
+  // CHECK: %{{.+}} = alloca <8 x i8>
+  // CHECK: %{{.+}} = alloca <8 x i8>
+  // CHECK: %{{.+}} = alloca <4 x i32>
+  // CHECK: %{{.+}} = alloca <4 x i32>
+  // CHECK: %{{.+}} = alloca <8 x i64>
+  // CHECK: %{{.+}} = alloca <8 x i64>
+  CheckIntVectorTypes<int>();
+
+  // CHECK: define {{.*}} void @_Z21CheckFloatVectorTypesIfEvv()
+  // CHECK: %{{.+}} = alloca <32 x float>
+  // CHECK: %{{.+}} = alloca <32 x float>
+  // CHECK: %{{.+}} = alloca <32 x double>
+  // CHECK: %{{.+}} = alloca <32 x double>
+  CheckFloatVectorTypes<float>();
+
+  // CHECK: define {{.*}} void @_Z31CheckInstantiationWithModedTypeIlEvv()
+  // CHECK: [[X1:%.+]] = alloca i64
+  CheckInstantiationWithModedType<Int64Ty1>();
+
+  // CHECK: define {{.*}} void @_Z31CheckInstantiationWithModedTypeI8Int64Ty2Evv()
+  // CHECK: [[X1]] = alloca i64
+  CheckInstantiationWithModedType<Int64Ty2>();
+
+  // CHECK: define {{.*}} void @_Z31CheckInstantiationWithModedTypeIDv8_sEvv()
+  // CHECK: [[X1]] = alloca <8 x i16>
+  CheckInstantiationWithModedType<IntVecTy1>();
+}
diff --git a/test/CodeGenCXX/c-linkage.cpp b/test/CodeGenCXX/c-linkage.cpp
index a70a22e..0f4c327 100644
--- a/test/CodeGenCXX/c-linkage.cpp
+++ b/test/CodeGenCXX/c-linkage.cpp
@@ -15,10 +15,10 @@
 extern "C" {
   static void test2_f() {
   }
-  // CHECK-LABEL: define internal {{.*}}void @_Z7test2_fv
+  // CHECK-LABEL: define internal {{.*}}void @_ZL7test2_fv
   static void test2_f(int x) {
   }
-  // CHECK-LABEL: define internal {{.*}}void @_Z7test2_fi
+  // CHECK-LABEL: define internal {{.*}}void @_ZL7test2_fi
   void test2_use() {
     test2_f();
     test2_f(42);
diff --git a/test/CodeGenCXX/cfi-blacklist.cpp b/test/CodeGenCXX/cfi-blacklist.cpp
index 32ed05b..af8a106 100644
--- a/test/CodeGenCXX/cfi-blacklist.cpp
+++ b/test/CodeGenCXX/cfi-blacklist.cpp
@@ -1,9 +1,8 @@
-// RUN: echo "type:attr:uuid" > %t.txt
-// RUN: %clang_cc1 -fms-extensions -fsanitize=cfi-vcall -fsanitize-blacklist=%t.txt -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOUUID %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fvisibility hidden -fms-extensions -fsanitize=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOBL %s
 // RUN: echo "type:std::*" > %t.txt
-// RUN: %clang_cc1 -fms-extensions -fsanitize=cfi-vcall -fsanitize-blacklist=%t.txt -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTD %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fvisibility hidden -fms-extensions -fsanitize=cfi-vcall -fsanitize-blacklist=%t.txt -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTD %s
 
-struct __declspec(uuid("00000000-0000-0000-0000-000000000000")) S1 {
+struct S1 {
   virtual void f();
 };
 
@@ -16,15 +15,15 @@
 }
 
 // CHECK: define{{.*}}s1f
-// NOSTD: llvm.bitset.test
-// NOUUID-NOT: llvm.bitset.test
+// NOBL: llvm.type.test
+// NOSTD: llvm.type.test
 void s1f(S1 *s1) {
   s1->f();
 }
 
 // CHECK: define{{.*}}s2f
-// NOSTD-NOT: llvm.bitset.test
-// NOUUID: llvm.bitset.test
+// NOBL: llvm.type.test
+// NOSTD-NOT: llvm.type.test
 void s2f(std::S2 *s2) {
   s2->f();
 }
diff --git a/test/CodeGenCXX/cfi-cast.cpp b/test/CodeGenCXX/cfi-cast.cpp
index 845b955..54641b5 100644
--- a/test/CodeGenCXX/cfi-cast.cpp
+++ b/test/CodeGenCXX/cfi-cast.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-derived-cast -fsanitize-trap=cfi-derived-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-DCAST %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-unrelated-cast -fsanitize-trap=cfi-unrelated-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-unrelated-cast,cfi-cast-strict -fsanitize-trap=cfi-unrelated-cast,cfi-cast-strict -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST-STRICT %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -std=c++11 -fsanitize=cfi-derived-cast -fsanitize-trap=cfi-derived-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-DCAST %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -std=c++11 -fsanitize=cfi-unrelated-cast -fsanitize-trap=cfi-unrelated-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -std=c++11 -fsanitize=cfi-unrelated-cast,cfi-cast-strict -fsanitize-trap=cfi-unrelated-cast,cfi-cast-strict -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST-STRICT %s
 
 // In this test the main thing we are searching for is something like
 // 'metadata !"1B"' where "1B" is the mangled name of the class we are
@@ -17,9 +17,9 @@
 
 struct C : A {};
 
-// CHECK-DCAST-LABEL: define void @_Z3abpP1A
+// CHECK-DCAST-LABEL: define hidden void @_Z3abpP1A
 void abp(A *a) {
-  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-DCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-DCAST: [[TRAPBB]]
@@ -28,12 +28,12 @@
 
   // CHECK-DCAST: [[CONTBB]]
   // CHECK-DCAST: ret
-  static_cast<B*>(a);
+  (void)static_cast<B*>(a);
 }
 
-// CHECK-DCAST-LABEL: define void @_Z3abrR1A
+// CHECK-DCAST-LABEL: define hidden void @_Z3abrR1A
 void abr(A &a) {
-  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-DCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-DCAST: [[TRAPBB]]
@@ -42,12 +42,12 @@
 
   // CHECK-DCAST: [[CONTBB]]
   // CHECK-DCAST: ret
-  static_cast<B&>(a);
+  (void)static_cast<B&>(a);
 }
 
-// CHECK-DCAST-LABEL: define void @_Z4abrrO1A
+// CHECK-DCAST-LABEL: define hidden void @_Z4abrrO1A
 void abrr(A &&a) {
-  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-DCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-DCAST: [[TRAPBB]]
@@ -56,12 +56,12 @@
 
   // CHECK-DCAST: [[CONTBB]]
   // CHECK-DCAST: ret
-  static_cast<B&&>(a);
+  (void)static_cast<B&&>(a);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3vbpPv
+// CHECK-UCAST-LABEL: define hidden void @_Z3vbpPv
 void vbp(void *p) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-UCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-UCAST: [[TRAPBB]]
@@ -70,12 +70,12 @@
 
   // CHECK-UCAST: [[CONTBB]]
   // CHECK-UCAST: ret
-  static_cast<B*>(p);
+  (void)static_cast<B*>(p);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3vbrRc
+// CHECK-UCAST-LABEL: define hidden void @_Z3vbrRc
 void vbr(char &r) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-UCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-UCAST: [[TRAPBB]]
@@ -84,12 +84,12 @@
 
   // CHECK-UCAST: [[CONTBB]]
   // CHECK-UCAST: ret
-  reinterpret_cast<B&>(r);
+  (void)reinterpret_cast<B&>(r);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z4vbrrOc
+// CHECK-UCAST-LABEL: define hidden void @_Z4vbrrOc
 void vbrr(char &&r) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-UCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-UCAST: [[TRAPBB]]
@@ -98,37 +98,37 @@
 
   // CHECK-UCAST: [[CONTBB]]
   // CHECK-UCAST: ret
-  reinterpret_cast<B&&>(r);
+  (void)reinterpret_cast<B&&>(r);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3vcpPv
-// CHECK-UCAST-STRICT-LABEL: define void @_Z3vcpPv
+// CHECK-UCAST-LABEL: define hidden void @_Z3vcpPv
+// CHECK-UCAST-STRICT-LABEL: define hidden void @_Z3vcpPv
 void vcp(void *p) {
-  // CHECK-UCAST: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-UCAST-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
-  static_cast<C*>(p);
+  // CHECK-UCAST: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-UCAST-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  (void)static_cast<C*>(p);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3bcpP1B
-// CHECK-UCAST-STRICT-LABEL: define void @_Z3bcpP1B
+// CHECK-UCAST-LABEL: define hidden void @_Z3bcpP1B
+// CHECK-UCAST-STRICT-LABEL: define hidden void @_Z3bcpP1B
 void bcp(B *p) {
-  // CHECK-UCAST: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-UCAST-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
-  (C *)p;
+  // CHECK-UCAST: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-UCAST-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  (void)(C *)p;
 }
 
-// CHECK-UCAST-LABEL: define void @_Z8bcp_callP1B
-// CHECK-UCAST-STRICT-LABEL: define void @_Z8bcp_callP1B
+// CHECK-UCAST-LABEL: define hidden void @_Z8bcp_callP1B
+// CHECK-UCAST-STRICT-LABEL: define hidden void @_Z8bcp_callP1B
 void bcp_call(B *p) {
-  // CHECK-UCAST: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-UCAST-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  // CHECK-UCAST: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-UCAST-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
   ((C *)p)->f();
 }
 
-// CHECK-UCAST-LABEL: define i32 @_Z6a_callP1A
-// CHECK-UCAST-STRICT-LABEL: define i32 @_Z6a_callP1A
+// CHECK-UCAST-LABEL: define hidden i32 @_Z6a_callP1A
+// CHECK-UCAST-STRICT-LABEL: define hidden i32 @_Z6a_callP1A
 int a_call(A *a) {
-  // CHECK-UCAST-NOT: @llvm.bitset.test
-  // CHECK-UCAST-STRICT-NOT: @llvm.bitset.test
+  // CHECK-UCAST-NOT: @llvm.type.test
+  // CHECK-UCAST-STRICT-NOT: @llvm.type.test
   return a->i();
 }
diff --git a/test/CodeGenCXX/cfi-cross-dso.cpp b/test/CodeGenCXX/cfi-cross-dso.cpp
index fbe6fc8..d67927d 100644
--- a/test/CodeGenCXX/cfi-cross-dso.cpp
+++ b/test/CodeGenCXX/cfi-cross-dso.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall  -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall  -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
 
 struct A {
   A();
@@ -30,12 +30,12 @@
 
 // CHECK:   %[[VT:.*]] = load void (%struct.A*)**, void (%struct.A*)***
 // CHECK:   %[[VT2:.*]] = bitcast {{.*}}%[[VT]] to i8*, !nosanitize
-// ITANIUM:   %[[TEST:.*]] = call i1 @llvm.bitset.test(i8* %[[VT2]], metadata !"_ZTS1A"), !nosanitize
-// MS:   %[[TEST:.*]] = call i1 @llvm.bitset.test(i8* %[[VT2]], metadata !"?AUA@@"), !nosanitize
+// ITANIUM:   %[[TEST:.*]] = call i1 @llvm.type.test(i8* %[[VT2]], metadata !"_ZTS1A"), !nosanitize
+// MS:   %[[TEST:.*]] = call i1 @llvm.type.test(i8* %[[VT2]], metadata !"?AUA@@"), !nosanitize
 // CHECK:   br i1 %[[TEST]], label %[[CONT:.*]], label %[[SLOW:.*]], {{.*}} !nosanitize
 // CHECK: [[SLOW]]
-// ITANIUM:   call void @__cfi_slowpath(i64 7004155349499253778, i8* %[[VT2]]) {{.*}} !nosanitize
-// MS:   call void @__cfi_slowpath(i64 -8005289897957287421, i8* %[[VT2]]) {{.*}} !nosanitize
+// ITANIUM:   call void @__cfi_slowpath_diag(i64 7004155349499253778, i8* %[[VT2]], {{.*}}) {{.*}} !nosanitize
+// MS:   call void @__cfi_slowpath_diag(i64 -8005289897957287421, i8* %[[VT2]], {{.*}}) {{.*}} !nosanitize
 // CHECK:   br label %[[CONT]], !nosanitize
 // CHECK: [[CONT]]
 // CHECK:   call void %{{.*}}(%struct.A* %{{.*}})
diff --git a/test/CodeGenCXX/cfi-icall.cpp b/test/CodeGenCXX/cfi-icall.cpp
index eceb92a..c3c6ed3 100644
--- a/test/CodeGenCXX/cfi-icall.cpp
+++ b/test/CodeGenCXX/cfi-icall.cpp
@@ -15,9 +15,12 @@
 
 void g() {
   void (*fp)(S *) = f;
-  // CHECK: call i1 @llvm.bitset.test(i8* {{.*}}, metadata ![[VOIDS:[0-9]+]])
+  // CHECK: call i1 @llvm.type.test(i8* {{.*}}, metadata [[VOIDS:![0-9]+]])
   fp(0);
 }
 
-// ITANIUM: !{![[VOIDS]], void (%"struct.(anonymous namespace)::S"*)* @_ZN12_GLOBAL__N_11fEPNS_1SE, i64 0}
-// MS: !{![[VOIDS]], void (%"struct.(anonymous namespace)::S"*)* @"\01?f@?A@@YAXPEAUS@?A@@@Z", i64 0}
+// ITANIUM: define internal void @_ZN12_GLOBAL__N_11fEPNS_1SE({{.*}} !type [[TS:![0-9]+]]
+// MS: define internal void @"\01?f@?A@@YAXPEAUS@?A@@@Z"({{.*}} !type [[TS:![0-9]+]]
+
+// CHECK: [[VOIDS]] = distinct !{}
+// CHECK: [[TS]] = !{i64 0, [[VOIDS]]}
diff --git a/test/CodeGenCXX/cfi-ms-rtti.cpp b/test/CodeGenCXX/cfi-ms-rtti.cpp
index b6e9175..fbebad4 100644
--- a/test/CodeGenCXX/cfi-ms-rtti.cpp
+++ b/test/CodeGenCXX/cfi-ms-rtti.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall | FileCheck --check-prefix=RTTI %s
-// RUN: %clang_cc1 -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall -fno-rtti-data | FileCheck --check-prefix=NO-RTTI %s
+// RUN: %clang_cc1 -flto -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall | FileCheck --check-prefix=RTTI %s
+// RUN: %clang_cc1 -flto -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall -fno-rtti-data | FileCheck --check-prefix=NO-RTTI %s
 
 struct A {
   A();
@@ -8,5 +8,5 @@
 
 A::A() {}
 
-// RTTI: !{!"?AUA@@", [2 x i8*]* {{.*}}, i64 8}
-// NO-RTTI: !{!"?AUA@@", [1 x i8*]* {{.*}}, i64 0}
+// RTTI: !{i64 8, !"?AUA@@"}
+// NO-RTTI: !{i64 0, !"?AUA@@"}
diff --git a/test/CodeGenCXX/cfi-nvcall.cpp b/test/CodeGenCXX/cfi-nvcall.cpp
index be4d844..e968f05 100644
--- a/test/CodeGenCXX/cfi-nvcall.cpp
+++ b/test/CodeGenCXX/cfi-nvcall.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-nvcall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-nvcall,cfi-cast-strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-STRICT %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-nvcall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-nvcall,cfi-cast-strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-STRICT %s
 
 struct A {
   virtual void f();
@@ -17,8 +17,8 @@
 // CHECK-LABEL: @bg
 // CHECK-STRICT-LABEL: @bg
 extern "C" void bg(B *b) {
-  // CHECK: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
-  // CHECK-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   b->g();
 }
 
@@ -29,7 +29,7 @@
   // In this case C's layout is the same as its base class, so we allow
   // c to be of type A in non-strict mode.
 
-  // CHECK: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  // CHECK: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
   c->g();
 }
diff --git a/test/CodeGenCXX/cfi-stats.cpp b/test/CodeGenCXX/cfi-stats.cpp
index 49c0677..6d0dd5b 100644
--- a/test/CodeGenCXX/cfi-stats.cpp
+++ b/test/CodeGenCXX/cfi-stats.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall,cfi-nvcall,cfi-derived-cast,cfi-unrelated-cast,cfi-icall -fsanitize-stats -emit-llvm -o - %s | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall,cfi-nvcall,cfi-derived-cast,cfi-unrelated-cast,cfi-icall -fsanitize-stats -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall,cfi-nvcall,cfi-derived-cast,cfi-unrelated-cast,cfi-icall -fsanitize-trap=cfi-vcall -fwhole-program-vtables -fsanitize-stats -emit-llvm -o - %s | FileCheck %s
 
 // CHECK: [[STATS:@[^ ]*]] = internal global { i8*, i32, [5 x [2 x i8*]] } { i8* null, i32 5, [5 x [2 x i8*]]
 // CHECK: {{\[\[}}2 x i8*] zeroinitializer,
diff --git a/test/CodeGenCXX/cfi-vcall.cpp b/test/CodeGenCXX/cfi-vcall.cpp
deleted file mode 100644
index daa0531..0000000
--- a/test/CodeGenCXX/cfi-vcall.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM --check-prefix=NDIAG %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM --check-prefix=DIAG --check-prefix=DIAG-ABORT %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-recover=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM --check-prefix=DIAG --check-prefix=DIAG-RECOVER %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS --check-prefix=NDIAG %s
-
-// MS: @[[VTA:[0-9]*]] {{.*}} comdat($"\01??_7A@@6B@")
-// MS: @[[VTB:[0-9]*]] {{.*}} comdat($"\01??_7B@@6B0@@")
-// MS: @[[VTAinB:[0-9]*]] {{.*}} comdat($"\01??_7B@@6BA@@@")
-// MS: @[[VTAinC:[0-9]*]] {{.*}} comdat($"\01??_7C@@6B@")
-// MS: @[[VTBinD:[0-9]*]] {{.*}} comdat($"\01??_7D@?A@@6BB@@@")
-// MS: @[[VTAinBinD:[0-9]*]] {{.*}} comdat($"\01??_7D@?A@@6BA@@@")
-// MS: @[[VTFA:[0-9]*]] {{.*}} comdat($"\01??_7FA@?1??foo@@YAXXZ@6B@")
-
-struct A {
-  A();
-  virtual void f();
-};
-
-struct B : virtual A {
-  B();
-  virtual void g();
-  virtual void h();
-};
-
-struct C : virtual A {
-  C();
-};
-
-namespace {
-
-struct D : B, C {
-  D();
-  virtual void f();
-  virtual void h();
-};
-
-}
-
-A::A() {}
-B::B() {}
-C::C() {}
-D::D() {}
-
-void A::f() {
-}
-
-void B::g() {
-}
-
-void D::f() {
-}
-
-void D::h() {
-}
-
-// DIAG: @[[SRC:.*]] = private unnamed_addr constant [{{.*}} x i8] c"{{.*}}cfi-vcall.cpp\00", align 1
-// DIAG: @[[TYPE:.*]] = private unnamed_addr constant { i16, i16, [4 x i8] } { i16 -1, i16 0, [4 x i8] c"'A'\00" }
-// DIAG: @[[BADTYPESTATIC:.*]] = private unnamed_addr global { { [{{.*}} x i8]*, i32, i32 }, { i16, i16, [4 x i8] }*, i8 } { { [{{.*}} x i8]*, i32, i32 } { [{{.*}} x i8]* @[[SRC]], i32 [[@LINE+21]], i32 3 }, { i16, i16, [4 x i8] }* @[[TYPE]], i8 0 }
-
-// ITANIUM: define void @_Z2afP1A
-// MS: define void @"\01?af@@YAXPEAUA@@@Z"
-void af(A *a) {
-  // ITANIUM: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* [[VT:%[^ ]*]], metadata !"_ZTS1A")
-  // MS: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* [[VT:%[^ ]*]], metadata !"?AUA@@")
-  // CHECK-NEXT: br i1 [[P]], label %[[CONTBB:[^ ,]*]], label %[[TRAPBB:[^ ,]*]]
-  // CHECK-NEXT: {{^$}}
-
-  // CHECK: [[TRAPBB]]
-  // NDIAG-NEXT: call void @llvm.trap()
-  // NDIAG-NEXT: unreachable
-  // DIAG-NEXT: [[VTINT:%[^ ]*]] = ptrtoint i8* [[VT]] to i64
-  // DIAG-ABORT-NEXT: call void @__ubsan_handle_cfi_bad_type_abort(i8* bitcast ({{.*}} @[[BADTYPESTATIC]] to i8*), i64 [[VTINT]])
-  // DIAG-ABORT-NEXT: unreachable
-  // DIAG-RECOVER-NEXT: call void @__ubsan_handle_cfi_bad_type(i8* bitcast ({{.*}} @[[BADTYPESTATIC]] to i8*), i64 [[VTINT]])
-  // DIAG-RECOVER-NEXT: br label %[[CONTBB]]
-
-  // CHECK: [[CONTBB]]
-  // CHECK: call void %
-  a->f();
-}
-
-// ITANIUM: define internal void @_Z3df1PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?df1@@YAXPEAUD@?A@@@Z"
-void df1(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"?AUA@@")
-  d->f();
-}
-
-// ITANIUM: define internal void @_Z3dg1PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?dg1@@YAXPEAUD@?A@@@Z"
-void dg1(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"?AUB@@")
-  d->g();
-}
-
-// ITANIUM: define internal void @_Z3dh1PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?dh1@@YAXPEAUD@?A@@@Z"
-void dh1(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata ![[DTYPE]])
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
-  d->h();
-}
-
-// ITANIUM: define internal void @_Z3df2PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?df2@@YAXPEAUD@?A@@@Z"
-__attribute__((no_sanitize("cfi")))
-void df2(D *d) {
-  // CHECK-NOT: call i1 @llvm.bitset.test
-  d->f();
-}
-
-// ITANIUM: define internal void @_Z3df3PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?df3@@YAXPEAUD@?A@@@Z"
-__attribute__((no_sanitize("address"))) __attribute__((no_sanitize("cfi-vcall")))
-void df3(D *d) {
-  // CHECK-NOT: call i1 @llvm.bitset.test
-  d->f();
-}
-
-D d;
-
-void foo() {
-  df1(&d);
-  dg1(&d);
-  dh1(&d);
-  df2(&d);
-  df3(&d);
-
-  struct FA : A {
-    void f() {}
-  } fa;
-  af(&fa);
-}
-
-namespace test2 {
-
-struct A {
-  virtual void m_fn1();
-};
-struct B {
-  virtual void m_fn2();
-};
-struct C : B, A {};
-struct D : C {
-  void m_fn1();
-};
-
-// ITANIUM: define void @_ZN5test21fEPNS_1DE
-// MS: define void @"\01?f@test2@@YAXPEAUD@1@@Z"
-void f(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTSN5test21DE")
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"?AUA@test2@@")
-  d->m_fn1();
-}
-
-}
-
-// Check for the expected number of elements (9 or 15 respectively).
-// MS: !llvm.bitsets = !{[[X:[^,]*(,[^,]*){8}]]}
-// ITANIUM: !llvm.bitsets = !{[[X:[^,]*(,[^,]*){14}]]}
-
-// ITANIUM-DAG: !{!"_ZTS1A", [3 x i8*]* @_ZTV1A, i64 16}
-// ITANIUM-DAG: !{!"_ZTS1A", [7 x i8*]* @_ZTCN12_GLOBAL__N_11DE0_1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1B", [7 x i8*]* @_ZTCN12_GLOBAL__N_11DE0_1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [9 x i8*]* @_ZTCN12_GLOBAL__N_11DE8_1C, i64 64}
-// ITANIUM-DAG: !{!"_ZTS1C", [9 x i8*]* @_ZTCN12_GLOBAL__N_11DE8_1C, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1B", [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1C", [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 88}
-// ITANIUM-DAG: !{![[DTYPE]], [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [7 x i8*]* @_ZTV1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1B", [7 x i8*]* @_ZTV1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [5 x i8*]* @_ZTV1C, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1C", [5 x i8*]* @_ZTV1C, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [3 x i8*]* @_ZTVZ3foovE2FA, i64 16}
-// ITANIUM-DAG: !{!{{[0-9]+}}, [3 x i8*]* @_ZTVZ3foovE2FA, i64 16}
-
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTA]], i64 8}
-// MS-DAG: !{!"?AUB@@", [3 x i8*]* @[[VTB]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTAinB]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTAinC]], i64 8}
-// MS-DAG: !{!"?AUB@@", [3 x i8*]* @[[VTBinD]], i64 8}
-// MS-DAG: !{![[DTYPE]], [3 x i8*]* @[[VTBinD]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTAinBinD]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTFA]], i64 8}
-// MS-DAG: !{!{{[0-9]+}}, [2 x i8*]* @[[VTFA]], i64 8}
diff --git a/test/CodeGenCXX/const-init-cxx11.cpp b/test/CodeGenCXX/const-init-cxx11.cpp
index 99be265..0c2193f 100644
--- a/test/CodeGenCXX/const-init-cxx11.cpp
+++ b/test/CodeGenCXX/const-init-cxx11.cpp
@@ -343,13 +343,13 @@
     constexpr E() : B(3), c{'b','y','e'} {}
     char c[3];
   };
-  // CHECK: @_ZN14VirtualMembers1eE = global { i8**, double, i32, i8**, double, [5 x i8], i16, i8**, double, [5 x i8], [3 x i8] } { i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i64 0, i64 2), double 1.000000e+00, i32 64, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i64 0, i64 5), double 2.000000e+00, [5 x i8] c"hello", i16 5, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i64 0, i64 9), double 3.000000e+00, [5 x i8] c"world", [3 x i8] c"bye" }
+  // CHECK: @_ZN14VirtualMembers1eE = global { i8**, double, i32, i8**, double, [5 x i8], i16, i8**, double, [5 x i8], [3 x i8] } { i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i32 0, i32 2), double 1.000000e+00, i32 64, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i32 0, i32 5), double 2.000000e+00, [5 x i8] c"hello", i16 5, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i32 0, i32 9), double 3.000000e+00, [5 x i8] c"world", [3 x i8] c"bye" }
   E e;
 
   struct nsMemoryImpl {
     virtual void f();
   };
-  // CHECK: @_ZN14VirtualMembersL13sGlobalMemoryE = internal global { i8** } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers12nsMemoryImplE, i64 0, i64 2) }
+  // CHECK: @_ZN14VirtualMembersL13sGlobalMemoryE = internal global { i8** } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers12nsMemoryImplE, i32 0, i32 2) }
   __attribute__((used))
   static nsMemoryImpl sGlobalMemory;
 
@@ -360,7 +360,7 @@
 
     T t;
   };
-  // CHECK: @_ZN14VirtualMembers1tE = global { i8**, i32 } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers13TemplateClassIiEE, i64 0, i64 2), i32 42 }
+  // CHECK: @_ZN14VirtualMembers1tE = global { i8**, i32 } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers13TemplateClassIiEE, i32 0, i32 2), i32 42 }
   TemplateClass<int> t;
 }
 
diff --git a/test/CodeGenCXX/const-init.cpp b/test/CodeGenCXX/const-init.cpp
index deb923a..f5c9dae 100644
--- a/test/CodeGenCXX/const-init.cpp
+++ b/test/CodeGenCXX/const-init.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -std=c++98 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -std=c++11 -o - %s | FileCheck %s
 
 // CHECK: @a = global i32 10
 int a = 10;
@@ -27,8 +29,13 @@
 
 namespace test2 {
   struct A {
+#if __cplusplus <= 199711L
     static const double d = 1.0;
     static const float f = d / 2;
+#else
+    static constexpr double d = 1.0;
+    static constexpr float f = d / 2;
+#endif
     static int g();
   } a;
 
diff --git a/test/CodeGenCXX/constructor-init.cpp b/test/CodeGenCXX/constructor-init.cpp
index d7ae220..c78534a 100644
--- a/test/CodeGenCXX/constructor-init.cpp
+++ b/test/CodeGenCXX/constructor-init.cpp
@@ -95,14 +95,14 @@
 
   // CHECK-LABEL: define void @_ZN10InitVTable1BC2Ev(%"struct.InitVTable::B"* %this) unnamed_addr
   // CHECK:      [[T0:%.*]] = bitcast [[B:%.*]]* [[THIS:%.*]] to i32 (...)***
-  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
   // CHECK:      [[VTBL:%.*]] = load i32 ([[B]]*)**, i32 ([[B]]*)*** {{%.*}}
   // CHECK-NEXT: [[FNP:%.*]] = getelementptr inbounds i32 ([[B]]*)*, i32 ([[B]]*)** [[VTBL]], i64 0
   // CHECK-NEXT: [[FN:%.*]] = load i32 ([[B]]*)*, i32 ([[B]]*)** [[FNP]]
   // CHECK-NEXT: [[ARG:%.*]] = call i32 [[FN]]([[B]]* [[THIS]])
   // CHECK-NEXT: call void @_ZN10InitVTable1AC2Ei({{.*}}* {{%.*}}, i32 [[ARG]])
   // CHECK-NEXT: [[T0:%.*]] = bitcast [[B]]* [[THIS]] to i32 (...)***
-  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
   // CHECK-NEXT: ret void
   B::B() : A(foo()) {}
 
@@ -110,7 +110,7 @@
   // CHECK:      [[ARG:%.*]] = add nsw i32 {{%.*}}, 5
   // CHECK-NEXT: call void @_ZN10InitVTable1AC2Ei({{.*}}* {{%.*}}, i32 [[ARG]])
   // CHECK-NEXT: [[T0:%.*]] = bitcast [[B]]* {{%.*}} to i32 (...)***
-  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
   // CHECK-NEXT: ret void
   B::B(int x) : A(x + 5) {}
 }
diff --git a/test/CodeGenCXX/copy-constructor-elim.cpp b/test/CodeGenCXX/copy-constructor-elim.cpp
index d9b28ce..4abe456 100644
--- a/test/CodeGenCXX/copy-constructor-elim.cpp
+++ b/test/CodeGenCXX/copy-constructor-elim.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple %ms_abi_triple -emit-llvm -o - %s | FileCheck %s -check-prefix MS
 // CHECK-NOT: _ZN1CC1ERK1C
 // CHECK-NOT: _ZN1SC1ERK1S
diff --git a/test/CodeGenCXX/copy-constructor-synthesis-2.cpp b/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
index 02feed3..9790ca8 100644
--- a/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
+++ b/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
@@ -24,4 +24,4 @@
 A x(A& y) { return y; }
 
 // CHECK: define linkonce_odr {{.*}} @_ZN1AC1ERKS_(%struct.A* {{.*}}%this, %struct.A* dereferenceable({{[0-9]+}})) unnamed_addr
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i32 0, i32 2) to i32 (...)**)
diff --git a/test/CodeGenCXX/copy-constructor-synthesis.cpp b/test/CodeGenCXX/copy-constructor-synthesis.cpp
index 2f0aa3b..4928c61 100644
--- a/test/CodeGenCXX/copy-constructor-synthesis.cpp
+++ b/test/CodeGenCXX/copy-constructor-synthesis.cpp
@@ -166,7 +166,7 @@
 // CHECK-LABEL:    define linkonce_odr void @_ZN12rdar138169401AC2ERKS0_(
 // CHECK:      [[THIS:%.*]] = load [[A]]*, [[A]]**
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[A]]* [[THIS]] to i32 (...)***
-// CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN12rdar138169401AE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+// CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN12rdar138169401AE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[A]], [[A]]* [[THIS]], i32 0, i32 1
 // CHECK-NEXT: [[OTHER:%.*]] = load [[A]]*, [[A]]**
 // CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], [[A]]* [[OTHER]], i32 0, i32 1
diff --git a/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp b/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
index 311edaa..7bab114 100644
--- a/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
+++ b/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
@@ -249,7 +249,7 @@
 void fn10(int i) {
   // CHECK-LABEL: define void @_Z4fn10i
   // CHECK: alloca [3 x i32]
-  // CHECK: call noalias i8* @_Znw{{[jm]}}
+  // CHECK: call i8* @_Znw{{[jm]}}
   // CHECK: store i32 %
   // CHECK: store i32 2
   // CHECK: store i32 3
diff --git a/test/CodeGenCXX/cxx11-exception-spec.cpp b/test/CodeGenCXX/cxx11-exception-spec.cpp
index a3dff79..6a3a394 100644
--- a/test/CodeGenCXX/cxx11-exception-spec.cpp
+++ b/test/CodeGenCXX/cxx11-exception-spec.cpp
@@ -70,37 +70,37 @@
 
 // CHECK: define {{.*}} @_Z1iv
 void i() {
-  // CHECK: declare {{.*}} @_Z1gIiEvv() [[NUW]]
+  // CHECK: declare {{.*}} @_Z1gIiEvv() [[NUW2:#[0-9]+]]
   g<int>();
   // CHECK: declare {{.*}} @_Z1gIA2_iEvv()
   // CHECK-NOT: [[NUW]]
   g<int[2]>();
 
-  // CHECK: declare {{.*}} @_ZN1SIiE1gEv() [[NUW]]
+  // CHECK: declare {{.*}} @_ZN1SIiE1gEv() [[NUW2]]
   S<int>::g();
   // CHECK: declare {{.*}} @_ZN1SIA2_iE1gEv()
   // CHECK-NOT: [[NUW]]
   S<int[2]>::g();
 
-  // CHECK: declare {{.*}} @_Z1gIfEvv() [[NUW]]
+  // CHECK: declare {{.*}} @_Z1gIfEvv() [[NUW2]]
   void (*g1)() = &g<float>;
   // CHECK: declare {{.*}} @_Z1gIdEvv()
   // CHECK-NOT: [[NUW]]
   void (*g2)() = &g<double>;
 
-  // CHECK: declare {{.*}} @_ZN1SIfE1gEv() [[NUW]]
+  // CHECK: declare {{.*}} @_ZN1SIfE1gEv() [[NUW2]]
   void (*g3)() = &S<float>::g;
   // CHECK: declare {{.*}} @_ZN1SIdE1gEv()
   // CHECK-NOT: [[NUW]]
   void (*g4)() = &S<double>::g;
 
-  // CHECK: declare {{.*}} @_Z1gIA4_cEvv() [[NUW]]
+  // CHECK: declare {{.*}} @_Z1gIA4_cEvv() [[NUW2]]
   (void)&g<char[4]>;
   // CHECK: declare {{.*}} @_Z1gIcEvv()
   // CHECK-NOT: [[NUW]]
   (void)&g<char>;
 
-  // CHECK: declare {{.*}} @_ZN1SIA4_cE1gEv() [[NUW]]
+  // CHECK: declare {{.*}} @_ZN1SIA4_cE1gEv() [[NUW2]]
   (void)&S<char[4]>::g;
   // CHECK: declare {{.*}} @_ZN1SIcE1gEv()
   // CHECK-NOT: [[NUW]]
@@ -116,12 +116,15 @@
   // CHECK: declare {{.*}} @_ZN6NestedIiE1fILb1EcEEvv(
   // CHECK-NOT: [[NUW]]
   Nested<int>().f<true, char>();
-  // CHECK: declare {{.*}} @_ZN6NestedIlE1fILb0ElEEvv({{.*}}) [[NUW]]
+  // CHECK: declare {{.*}} @_ZN6NestedIlE1fILb0ElEEvv({{.*}}) [[NUW2]]
   Nested<long>().f<false, long>();
 }
 
 // CHECK: attributes [[NONE]] = { {{.*}} }
 // CHECK: attributes [[NUW]] = { nounwind{{.*}} }
+// CHECK: attributes [[NUW2]] = { nounwind{{.*}} }
+
+
 
 namespace PR19190 {
 template <class T> struct DWFIterator { virtual void get() throw(int) = 0; };
diff --git a/test/CodeGenCXX/cxx11-initializer-array-new.cpp b/test/CodeGenCXX/cxx11-initializer-array-new.cpp
index c662190..59f9603 100644
--- a/test/CodeGenCXX/cxx11-initializer-array-new.cpp
+++ b/test/CodeGenCXX/cxx11-initializer-array-new.cpp
@@ -7,7 +7,7 @@
 void *p = new S[2][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
 
 // CHECK-LABEL: define
-// CHECK: %[[ALLOC:.*]] = call noalias i8* @_Znam(i64 32)
+// CHECK: %[[ALLOC:.*]] = call i8* @_Znam(i64 32)
 // CHECK: %[[COOKIE:.*]] = bitcast i8* %[[ALLOC]] to i64*
 // CHECK: store i64 6, i64* %[[COOKIE]]
 // CHECK: %[[START_AS_i8:.*]] = getelementptr inbounds i8, i8* %[[ALLOC]], i64 8
@@ -50,7 +50,7 @@
 // CHECK: call {{.*}} @llvm.umul.with.overflow.i64(i64 %[[N:.*]], i64 12)
 // CHECK: %[[ELTS:.*]] = mul i64 %[[N]], 3
 // CHECK: call {{.*}} @llvm.uadd.with.overflow.i64(i64 %{{.*}}, i64 8)
-// CHECK: %[[ALLOC:.*]] = call noalias i8* @_Znam(i64 %{{.*}})
+// CHECK: %[[ALLOC:.*]] = call i8* @_Znam(i64 %{{.*}})
 //
 // CHECK: %[[COOKIE:.*]] = bitcast i8* %[[ALLOC]] to i64*
 // CHECK: store i64 %[[ELTS]], i64* %[[COOKIE]]
@@ -113,7 +113,7 @@
 // No cookie.
 // CHECK-NOT: @llvm.uadd.with.overflow
 //
-// CHECK: %[[ALLOC:.*]] = call noalias i8* @_Znam(i64 %{{.*}})
+// CHECK: %[[ALLOC:.*]] = call i8* @_Znam(i64 %{{.*}})
 //
 // CHECK: %[[START_AS_T:.*]] = bitcast i8* %[[ALLOC]] to %[[T:.*]]*
 //
diff --git a/test/CodeGenCXX/cxx1z-constexpr-if.cpp b/test/CodeGenCXX/cxx1z-constexpr-if.cpp
new file mode 100644
index 0000000..80a397f
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-constexpr-if.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -std=c++1z %s -emit-llvm -o - | FileCheck %s --implicit-check-not=should_not_be_used
+
+void should_be_used_1();
+void should_be_used_2();
+void should_not_be_used();
+void f() {
+  if constexpr (false)
+    should_not_be_used();
+  else
+    should_be_used_1();
+
+  if constexpr (true || ({ label: false; }))
+    should_be_used_2();
+  else {
+    goto foo;
+foo: should_not_be_used();
+  }
+}
+
+// CHECK: should_be_used_1
+// CHECK: should_be_used_2
diff --git a/test/CodeGenCXX/cxx1z-decomposition.cpp b/test/CodeGenCXX/cxx1z-decomposition.cpp
new file mode 100644
index 0000000..b921200
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-decomposition.cpp
@@ -0,0 +1,118 @@
+// RUN: %clang_cc1 -std=c++1z -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+namespace std {
+  using size_t = decltype(sizeof(0));
+  template<typename> struct tuple_size;
+  template<size_t, typename> struct tuple_element;
+}
+
+struct Y { int n; };
+struct X { X(); X(Y); X(const X&); ~X(); };
+
+struct A { int a : 13; bool b; };
+
+struct B {};
+template<> struct std::tuple_size<B> { enum { value = 2 }; };
+template<> struct std::tuple_element<0,B> { using type = X; };
+template<> struct std::tuple_element<1,B> { using type = const int&; };
+template<int N> auto get(B) {
+  if constexpr (N == 0)
+    return Y();
+  else
+    return 0.0;
+}
+
+using C = int[2];
+
+typedef int D __attribute__((ext_vector_type(2)));
+
+using E = _Complex int;
+
+template<typename T> T &make();
+
+// CHECK: @_ZDC2a12a2E = global {{.*}} zeroinitializer, align 4
+auto [a1, a2] = make<A>();
+// CHECK: @_ZDC2b12b2E = global {{.*}} zeroinitializer, align 1
+// CHECK: @b1 = global {{.*}}* null, align 8
+// CHECK: @_ZGR2b1_ = internal global {{.*}} zeroinitializer, align 1
+// CHECK: @b2 = global i32* null, align 8
+// CHECK: @_ZGR2b2_ = internal global i32 0, align 4
+auto [b1, b2] = make<B>();
+// CHECK: @_ZDC2c12c2E = global [2 x i32]* null, align 8
+auto &[c1, c2] = make<C>();
+// CHECK: @_ZDC2d12d2E = global <2 x i32> zeroinitializer, align 8
+auto [d1, d2] = make<D>();
+// CHECK: @_ZDC2e12e2E = global { i32, i32 } zeroinitializer, align 4
+auto [e1, e2] = make<E>();
+
+// CHECK: call {{.*}}* @_Z4makeI1AERT_v()
+// CHECK: call {{.*}}memcpy{{.*}}@_ZDC2a12a2E
+
+// CHECK: @_Z4makeI1BERT_v()
+//   CHECK: call i32 @_Z3getILi0EEDa1B()
+//   CHECK: call void @_ZN1XC1E1Y({{.*}}* @_ZGR2b1_, i32
+//   CHECK: call i32 @__cxa_atexit({{.*}}@_ZN1XD1Ev{{.*}}@_ZGR2b1_
+//   CHECK: store {{.*}}* @_ZGR2b1_,
+//
+//   CHECK: call double @_Z3getILi1EEDa1B()
+//   CHECK: fptosi double %{{.*}} to i32
+//   CHECK: store i32 %{{.*}}, i32* @_ZGR2b2_
+//   CHECK: store i32* @_ZGR2b2_, i32** @b2
+
+// CHECK: call {{.*}}* @_Z4makeIA2_iERT_v()
+// CHECK: store {{.*}}, [2 x i32]** @_ZDC2c12c2E
+
+// CHECK: call {{.*}}* @_Z4makeIDv2_iERT_v()
+// CHECK: store {{.*}}, <2 x i32>* @_ZDC2d12d2E, align 8
+
+// CHECK: call {{.*}}* @_Z4makeICiERT_v()
+// CHECK: store i32 %{{.*}}, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @_ZDC2e12e2E, i32 0, i32 0)
+// CHECK: store i32 %{{.*}}, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @_ZDC2e12e2E, i32 0, i32 1)
+
+// CHECK: define i32 @_Z12test_globalsv()
+int test_globals() {
+  return a2 + b2 + c2 + d2 + e2;
+  // CHECK: load i8, i8* getelementptr inbounds (%struct.A, %struct.A* @_ZDC2a12a2E, i32 0, i32 1)
+  //
+  // CHECK: %[[b2:.*]] = load i32*, i32** @b2
+  // CHECK: load i32, i32* %[[b2]]
+  //
+  // CHECK: %[[c1c2:.*]] = load [2 x i32]*, [2 x i32]** @_ZDC2c12c2E
+  // CHECK: %[[c2:.*]] = getelementptr inbounds [2 x i32], [2 x i32]* %[[c1c2]], i64 0, i64 1
+  // CHECK: load i32, i32* %[[c2]]
+  //
+  // CHECK: %[[d1d2:.*]] = load <2 x i32>, <2 x i32>* @_ZDC2d12d2E
+  // CHECK: extractelement <2 x i32> %[[d1d2]], i32 1
+  //
+  // CHECK: load i32, i32* getelementptr inbounds ({ i32, i32 }, { i32, i32 }* @_ZDC2e12e2E, i32 0, i32 1)
+}
+
+// CHECK: define i32 @_Z11test_localsv()
+int test_locals() {
+  auto [b1, b2] = make<B>();
+
+  // CHECK: @_Z4makeI1BERT_v()
+  //   CHECK: call i32 @_Z3getILi0EEDa1B()
+  //   CHECK: call void @_ZN1XC1E1Y({{.*}}* %[[b1:.*]], i32
+  //
+  //   CHECK: call double @_Z3getILi1EEDa1B()
+  //   CHECK: %[[cvt:.*]] = fptosi double %{{.*}} to i32
+  //   CHECK: store i32 %[[cvt]], i32* %[[b2:.*]],
+  //   CHECK: store i32* %[[b2]], i32** %[[b2ref:.*]],
+
+  return b2;
+  // CHECK: %[[b2:.*]] = load i32*, i32** %[[b2ref]]
+  // CHECK: load i32, i32* %[[b2]]
+
+  // CHECK: call {{.*}}@_ZN1XD1Ev({{.*}}%[[b1]])
+}
+
+// CHECK: define void @_Z13test_bitfieldR1A(
+void test_bitfield(A &a) {
+  auto &[a1, a2] = a;
+  a1 = 5;
+  // CHECK: load i16, i16* %[[BITFIELD:.*]],
+  // CHECK: and i16 %{{.*}}, -8192
+  // CHECK: or i16 %{{.*}}, 5
+  // CHECK: store i16 %{{.*}}, i16* %[[BITFIELD]],
+}
diff --git a/test/CodeGenCXX/cxx1z-init-statement.cpp b/test/CodeGenCXX/cxx1z-init-statement.cpp
new file mode 100644
index 0000000..5c05212
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-init-statement.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -std=c++1z -triple x86_64-apple-macosx10.7.0 -emit-llvm -o - %s -w | FileCheck %s
+
+typedef int T;
+void f() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[B:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[C:.*]] = icmp slt i32 %[[B]], 8
+  if (int a = 5; a < 8)
+    ;
+}
+
+void f1() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[C:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[B]], align 4
+  // CHECK-NEXT: store i32 7, i32* %[[C]], align 4
+  if (int a, b = 5; int c = 7)
+    ;
+}
+
+int f2() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = call i32 @_Z2f2v()
+  // CHECK-NEXT: store i32 7, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[C:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[D:.*]] = icmp ne i32 %[[C]], 0
+  if (T{f2()}; int c = 7)
+    ;
+  return 2;
+}
+
+void g() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[B:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: switch i32 %[[B]], label %[[C:.*]] [
+  switch (int a = 5; a) {
+    case 0:
+      break;
+  }
+}
+
+void g1() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[C:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[B]], align 4
+  // CHECK-NEXT: store i32 7, i32* %[[C]], align 4
+  // CHECK-NEXT: %[[D:.*]] = load i32, i32* %[[C]], align 4
+  // CHECK-NEXT: switch i32 %[[D]], label %[[E:.*]] [
+  switch (int a, b = 5; int c = 7) {
+    case 0:
+      break;
+  }
+}
+
+int g2() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = call i32 @_Z2f2v()
+  // CHECK-NEXT: store i32 7, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[C:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: switch i32 %[[C]], label %[[E:.*]] [
+  switch (T{f2()}; int c = 7) {
+    case 0:
+      break;
+  }
+  return 2;
+}
diff --git a/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp b/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
new file mode 100644
index 0000000..9110e49
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
@@ -0,0 +1,114 @@
+// RUN: %clang_cc1 -std=c++1z %s -triple x86_64-linux-gnu -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s
+
+namespace Constant {
+  struct A {
+    int n;
+    char k;
+    ~A();
+  };
+
+  struct B {
+    char k2;
+  };
+
+  struct C : B {};
+
+  struct D : A, C {};
+
+  C c1 = {};
+  C c2 = {1};
+  // CHECK: @_ZN8Constant2c1E = global { i8 } zeroinitializer, align 1
+  // CHECK: @_ZN8Constant2c2E = global { i8 } { i8 1 }, align 1
+
+  // Test packing bases into tail padding.
+  D d1 = {};
+  D d2 = {1, 2, 3};
+  D d3 = {1};
+  // CHECK: @_ZN8Constant2d1E = global { i32, i8, i8 } zeroinitializer, align 4
+  // CHECK: @_ZN8Constant2d2E = global { i32, i8, i8 } { i32 1, i8 2, i8 3 }, align 4
+  // CHECK: @_ZN8Constant2d3E = global { i32, i8, i8 } { i32 1, i8 0, i8 0 }, align 4
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN8Constant1DD1Ev {{.*}} @_ZN8Constant2d1E
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN8Constant1DD1Ev {{.*}} @_ZN8Constant2d2E
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN8Constant1DD1Ev {{.*}} @_ZN8Constant2d3E
+}
+
+namespace Dynamic {
+  struct A {
+    A();
+    A(int);
+    A(const char*, unsigned);
+    ~A();
+    void *p;
+  };
+
+  struct B {
+    ~B();
+    int n = 5;
+  };
+
+  struct C {
+    C(bool = true);
+  };
+
+  int f(), g(), h(), i();
+  struct D : A, B, C {
+    int n = f();
+  };
+
+  D d1 = {};
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call void @_ZN7Dynamic1AC2Ev({{.*}} @_ZN7Dynamic2d1E
+  // CHECK: store i32 5, {{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d1E{{.*}}, i64 8
+  // CHECK: invoke void @_ZN7Dynamic1CC2Eb({{.*}} @_ZN7Dynamic2d1E{{.*}}, i1 zeroext true)
+  // CHECK:   unwind label %[[UNWIND:.*]]
+  // CHECK: invoke i32 @_ZN7Dynamic1fEv()
+  // CHECK:   unwind label %[[UNWIND:.*]]
+  // CHECK: store i32 {{.*}}, i32* getelementptr {{.*}} @_ZN7Dynamic2d1E, i32 0, i32 2
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN7Dynamic1DD1Ev {{.*}} @_ZN7Dynamic2d1E
+  // CHECK: ret
+  //
+  //   UNWIND:
+  // CHECK: call void @_ZN7Dynamic1BD1Ev({{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d1E{{.*}}, i64 8
+  // CHECK: call void @_ZN7Dynamic1AD1Ev({{.*}} @_ZN7Dynamic2d1E
+
+  D d2 = {1, 2, false};
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call void @_ZN7Dynamic1AC1Ei({{.*}} @_ZN7Dynamic2d2E{{.*}}, i32 1)
+  // CHECK: store i32 2, {{.*}}i8* getelementptr inbounds {{.*}}@_ZN7Dynamic2d2E{{.*}}, i64 8
+  // CHECK: invoke void @_ZN7Dynamic1CC1Eb({{.*}} @_ZN7Dynamic2d2E{{.*}}, i1 zeroext false)
+  // CHECK: invoke i32 @_ZN7Dynamic1fEv()
+  // CHECK: store i32 {{.*}}, i32* getelementptr {{.*}} @_ZN7Dynamic2d2E, i32 0, i32 2
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN7Dynamic1DD1Ev {{.*}} @_ZN7Dynamic2d2E
+  // CHECK: ret void
+
+  D d3 = {g(), h(), {}, i()};
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: %[[G_CALL:.*]] = call i32 @_ZN7Dynamic1gEv()
+  // CHECK: call void @_ZN7Dynamic1AC1Ei({{.*}} @_ZN7Dynamic2d3E{{.*}}, i32 %[[G_CALL]])
+  // CHECK: %[[H_CALL:.*]] = invoke i32 @_ZN7Dynamic1hEv()
+  // CHECK:   unwind label %[[DESTROY_A_LPAD:.*]]
+  // CHECK: store i32 %[[H_CALL]], {{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d3E{{.*}}, i64 8
+  // CHECK: invoke void @_ZN7Dynamic1CC2Eb({{.*}} @_ZN7Dynamic2d3E{{.*}}, i1 zeroext true)
+  // CHECK:   unwind label %[[DESTROY_AB_LPAD:.*]]
+  // CHECK: %[[I_CALL:.*]] = invoke i32 @_ZN7Dynamic1iEv()
+  // CHECK:   unwind label %[[DESTROY_AB_LPAD:.*]]
+  // CHECK: store i32 %[[I_CALL]], i32* getelementptr {{.*}} @_ZN7Dynamic2d3E, i32 0, i32 2
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN7Dynamic1DD1Ev {{.*}} @_ZN7Dynamic2d3E to i8*
+  // CHECK: ret
+  //
+  //   DESTROY_A_LPAD:
+  // CHECK: br label %[[A_CLEANUP:.*]]
+  //
+  //   DESTROY_AB_LPAD:
+  // CHECK: call void @_ZN7Dynamic1BD1Ev({{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d3E{{.*}}, i64 8
+  // CHECK: br label %[[A_CLEANUP:.*]]
+  //
+  //   A_CLEANUP:
+  // CHECK: call void @_ZN7Dynamic1AD1Ev({{.*}} @_ZN7Dynamic2d3E
+}
diff --git a/test/CodeGenCXX/cxx1z-inline-variables.cpp b/test/CodeGenCXX/cxx1z-inline-variables.cpp
new file mode 100644
index 0000000..1837093
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-inline-variables.cpp
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -std=c++1z %s -emit-llvm -o - -triple x86_64-linux-gnu | FileCheck %s
+
+struct Q {
+  // CHECK: @_ZN1Q1kE = linkonce_odr constant i32 5, comdat
+  static constexpr int k = 5;
+};
+const int &r = Q::k;
+
+int f();
+
+// const does not imply internal linkage.
+// CHECK: @external_inline = linkonce_odr constant i32 5, comdat
+inline const int external_inline = 5;
+const int &use1 = external_inline;
+
+// static still does, though.
+// CHECK: @_ZL15internal_inline = internal constant i32 5
+static inline const int internal_inline = 5;
+const int &use2 = internal_inline;
+
+int a = f();
+// CHECK: @b = linkonce_odr global i32 0, comdat
+// CHECK: @_ZGV1b = linkonce_odr global i64 0, comdat($b)
+inline int b = f();
+int c = f();
+
+// For compatibility with C++11 and C++14, an out-of-line declaration of a
+// static constexpr data member promotes the variable to weak_odr.
+struct compat {
+  static constexpr int a = 1;
+  static constexpr int b = 2;
+  static constexpr int c = 3;
+  static inline constexpr int d = 4;
+};
+const int &compat_use_before_redecl = compat::b;
+const int compat::a;
+const int compat::b;
+const int compat::c;
+const int compat::d;
+const int &compat_use_after_redecl1 = compat::c;
+const int &compat_use_after_redecl2 = compat::d;
+// CHECK: @_ZN6compat1bE = weak_odr constant i32 2
+// CHECK: @_ZN6compat1aE = weak_odr constant i32 1
+// CHECK: @_ZN6compat1cE = weak_odr constant i32 3
+// CHECK: @_ZN6compat1dE = linkonce_odr constant i32 4
+
+template<typename T> struct X {
+  static int a;
+  static inline int b;
+  static int c;
+};
+// CHECK: @_ZN1XIiE1aE = linkonce_odr global i32 10
+// CHECK: @_ZN1XIiE1bE = global i32 20
+// CHECK-NOT: @_ZN1XIiE1cE
+template<> inline int X<int>::a = 10;
+int &use3 = X<int>::a;
+template<> int X<int>::b = 20;
+template<> inline int X<int>::c = 30;
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK: call i32 @_Z1fv
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK-NOT: comdat
+// CHECK-SAME: {{$}}
+// CHECK: load atomic {{.*}} acquire
+// CHECK: br
+// CHECK: __cxa_guard_acquire(i64* @_ZGV1b)
+// CHECK: br
+// CHECK: call i32 @_Z1fv
+// CHECK: __cxa_guard_release(i64* @_ZGV1b)
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK: call i32 @_Z1fv
+
+template<typename T> inline int d = f();
+int e = d<int>;
+
+// CHECK-LABEL: define {{.*}}global_var_init{{.*}}comdat
+// CHECK: _ZGV1dIiE
+// CHECK-NOT: __cxa_guard_acquire(i64* @_ZGV1b)
+// CHECK: call i32 @_Z1fv
+// CHECK-NOT: __cxa_guard_release(i64* @_ZGV1b)
diff --git a/test/CodeGenCXX/cxx1z-lambda-star-this.cpp b/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
new file mode 100644
index 0000000..a7e4aad
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -std=c++1y -triple i686-pc-windows-msvc -emit-llvm %s -o - | FileCheck %s
+//CHECK: %[[A_LAMBDA:.*]] = type { %struct.A }
+//CHECK: %[[B_LAMBDA:.*]] = type { %struct.B* }
+struct A {
+  double a = 111;
+  auto foo() { return [*this] { return a; }; }
+};
+
+namespace ns1 {
+int X = A{}.foo()();
+} //end ns1
+
+//CHECK: @"\01?foo@A@@QAE?A?<auto>@@XZ"(%struct.A* %this, %class.anon* noalias sret %[[A_LAMBDA_RETVAL:.*]])
+// get the first field of the closure object, which is of type 'struct.A'
+//CHECK: %[[I0:.+]] = getelementptr inbounds %[[A_LAMBDA]], %[[A_LAMBDA]]* %[[A_LAMBDA_RETVAL]], i32 0, i32 0
+//CHECK: %[[I1:.+]] = bitcast %struct.A* %[[I0]] to i8*
+//CHECK: %[[I2:.+]] = bitcast %struct.A* %this1 to i8*
+// copy the contents ...
+//CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[I1]], i8* %[[I2]], i32 8, i32 8, i1 false)
+
+struct B {
+  double b = 222;
+  auto bar() { return [this] { return b; }; };
+};
+
+namespace ns2 {
+int X = B{}.bar()();
+}
+//CHECK: @"\01?bar@B@@QAE?A?<auto>@@XZ"(%struct.B* %this, %class.anon.0* noalias sret %agg.result)
+//CHECK: %[[I20:.+]] = getelementptr inbounds %class.anon.0, %class.anon.0* %agg.result, i32 0, i32 0
+//CHECK: store %struct.B* %this1, %struct.B** %[[I20]], align 4
diff --git a/test/CodeGenCXX/debug-info-calling-conventions.cpp b/test/CodeGenCXX/debug-info-calling-conventions.cpp
new file mode 100644
index 0000000..51d801e
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-calling-conventions.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 %s -triple=i686-pc-windows-msvc -debug-info-kind=limited -emit-llvm -o - | FileCheck %s
+
+struct A {
+  void thiscallcc();
+};
+void A::thiscallcc() {}
+
+// CHECK: !DISubprogram(name: "thiscallcc", {{.*}} type: ![[thiscallty:[^,]*]], {{.*}})
+// CHECK: ![[thiscallty]] = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: ![[thisargs:[^,)]*]])
+// CHECK: ![[thisargs]] = !{null, ![[thisptrty:[^,}]*]]}
+// CHECK: ![[thisptrty]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{.*}}, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+
+void cdeclcc() {}
+void __fastcall fastcallcc() {}
+void __stdcall stdcallcc() {}
+void __vectorcall vectorcallcc() {}
+
+// CHECK: !DISubprogram(name: "cdeclcc", {{.*}} type: ![[cdeclty:[^,]*]], {{.*}})
+// CHECK: ![[cdeclty]] = !DISubroutineType(types: ![[noargs:[^,)]*]])
+// CHECK: ![[noargs]] = !{null}
+// CHECK: !DISubprogram(name: "fastcallcc", {{.*}} type: ![[fastcallty:[^,]*]], {{.*}})
+// CHECK: ![[fastcallty]] = !DISubroutineType(cc: DW_CC_BORLAND_msfastcall, types: ![[noargs]])
+// CHECK: !DISubprogram(name: "stdcallcc", {{.*}} type: ![[stdcallty:[^,]*]], {{.*}})
+// CHECK: ![[stdcallty]] = !DISubroutineType(cc: DW_CC_BORLAND_stdcall, types: ![[noargs]])
+// CHECK: !DISubprogram(name: "vectorcallcc", {{.*}} type: ![[vectorcallty:[^,]*]], {{.*}})
+// CHECK: ![[vectorcallty]] = !DISubroutineType(cc: DW_CC_LLVM_vectorcall, types: ![[noargs]])
diff --git a/test/CodeGenCXX/debug-info-class-limited-plugin.test b/test/CodeGenCXX/debug-info-class-limited-plugin.test
index 61d258d..533c2f6 100644
--- a/test/CodeGenCXX/debug-info-class-limited-plugin.test
+++ b/test/CodeGenCXX/debug-info-class-limited-plugin.test
@@ -1,2 +1,2 @@
-RUN: %clang_cc1 -emit-llvm -fno-standalone-debug -g -o - -load %llvmshlibdir/PrintFunctionNames%pluginext -add-plugin print-function-names %S/Inputs/debug-info-class-limited.cpp 2>&1 | FileCheck %S/Inputs/debug-info-class-limited.cpp
+RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -o - -load %llvmshlibdir/PrintFunctionNames%pluginext -add-plugin print-function-names %S/Inputs/debug-info-class-limited.cpp 2>&1 | FileCheck %S/Inputs/debug-info-class-limited.cpp
 REQUIRES: plugins, examples
diff --git a/test/CodeGenCXX/debug-info-class-limited.test b/test/CodeGenCXX/debug-info-class-limited.test
index 0b10728..c2e3328 100644
--- a/test/CodeGenCXX/debug-info-class-limited.test
+++ b/test/CodeGenCXX/debug-info-class-limited.test
@@ -1 +1 @@
-RUN: %clang_cc1 -emit-llvm -fno-standalone-debug -g %S/Inputs/debug-info-class-limited.cpp -o - | FileCheck %S/Inputs/debug-info-class-limited.cpp
+RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited %S/Inputs/debug-info-class-limited.cpp -o - | FileCheck %S/Inputs/debug-info-class-limited.cpp
diff --git a/test/CodeGenCXX/debug-info-codeview-display-name.cpp b/test/CodeGenCXX/debug-info-codeview-display-name.cpp
index 1d0300c..b1b5a1e 100644
--- a/test/CodeGenCXX/debug-info-codeview-display-name.cpp
+++ b/test/CodeGenCXX/debug-info-codeview-display-name.cpp
@@ -1,14 +1,22 @@
-// RUN: %clang_cc1 -fblocks -debug-info-kind=limited -gcodeview -emit-llvm %s -o - -triple=x86_64-pc-win32 -std=c++98 | \
-// RUN:  grep 'DISubprogram' | sed -e 's/.*name: "\([^"]*\)".*/"\1"/' | FileCheck %s
+// RUN: %clang_cc1 -fblocks -debug-info-kind=limited -gcodeview -emit-llvm %s \
+// RUN:       -o - -triple=x86_64-pc-win32 -std=c++98 | \
+// RUN:    grep 'DISubprogram' | sed -e 's/.*name: "\([^"]*\)".*/"\1"/' | \
+// RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=UNQUAL
+// RUN: %clang_cc1 -fblocks -debug-info-kind=line-tables-only -gcodeview -emit-llvm %s \
+// RUN:       -o - -triple=x86_64-pc-win32 -std=c++98 | \
+// RUN:    grep 'DISubprogram' | sed -e 's/.*name: "\([^"]*\)".*/"\1"/' | \
+// RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=QUAL
 
 void freefunc() { }
 // CHECK-DAG: "freefunc"
 
 namespace N {
   int b() { return 0; }
-// CHECK-DAG: "N::b"
+// UNQUAL-DAG: "b"
+// QUAL-DAG: "N::b"
   namespace { void func() { } }
-// CHECK-DAG: "N::`anonymous namespace'::func
+// UNQUAL-DAG: "func"
+// QUAL-DAG: "N::`anonymous namespace'::func"
 }
 
 void _c(void) {
@@ -19,19 +27,24 @@
 struct foo {
   int operator+(int);
   foo(){}
-// CHECK-DAG: "foo::foo"
+// UNQUAL-DAG: "foo"
+// QUAL-DAG: "foo::foo"
 
   ~foo(){}
-// CHECK-DAG: "foo::~foo"
+// UNQUAL-DAG: "~foo"
+// QUAL-DAG: "foo::~foo"
 
   foo(int i){}
-// CHECK-DAG: "foo::foo"
+// UNQUAL-DAG: "foo"
+// QUAL-DAG: "foo::foo"
 
   foo(char *q){}
-// CHECK-DAG: "foo::foo"
+// UNQUAL-DAG: "foo"
+// QUAL-DAG: "foo::foo"
 
   static foo* static_method() { return 0; }
-// CHECK-DAG: "foo::static_method"
+// UNQUAL-DAG: "static_method"
+// QUAL-DAG: "foo::static_method"
 
 };
 
@@ -40,7 +53,8 @@
   foo::static_method();
 }
 
-// CHECK-DAG: "foo::operator+"
+// UNQUAL-DAG: "operator+"
+// QUAL-DAG: "foo::operator+"
 int foo::operator+(int a) { return a; }
 
 // PR17371
@@ -60,14 +74,20 @@
 void OverloadedNewDelete::operator delete[](void *) { }
 int OverloadedNewDelete::operator+(int x) { return x; };
 
-// CHECK-DAG: "OverloadedNewDelete::operator new"
-// CHECK-DAG: "OverloadedNewDelete::operator new[]"
-// CHECK-DAG: "OverloadedNewDelete::operator delete"
-// CHECK-DAG: "OverloadedNewDelete::operator delete[]"
-// CHECK-DAG: "OverloadedNewDelete::operator+"
+// UNQUAL-DAG: "operator new"
+// UNQUAL-DAG: "operator new[]"
+// UNQUAL-DAG: "operator delete"
+// UNQUAL-DAG: "operator delete[]"
+// UNQUAL-DAG: "operator+"
+// QUAL-DAG: "OverloadedNewDelete::operator new"
+// QUAL-DAG: "OverloadedNewDelete::operator new[]"
+// QUAL-DAG: "OverloadedNewDelete::operator delete"
+// QUAL-DAG: "OverloadedNewDelete::operator delete[]"
+// QUAL-DAG: "OverloadedNewDelete::operator+"
 
-template <void (*)(void)>
+
+template <typename T, void (*)(void)>
 void fn_tmpl() {}
 
-template void fn_tmpl<freefunc>();
-// CHECK-DAG: "fn_tmpl"
+template void fn_tmpl<int, freefunc>();
+// CHECK-DAG: "fn_tmpl<int,&freefunc>"
diff --git a/test/CodeGenCXX/debug-info-codeview-injected-class.cpp b/test/CodeGenCXX/debug-info-codeview-injected-class.cpp
new file mode 100644
index 0000000..b421b2b
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-codeview-injected-class.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 %s -std=c++11 -triple=i686-pc-windows-msvc -debug-info-kind=limited -gcodeview -emit-llvm -o - | FileCheck %s
+
+// The injected class names in this test were accidentally making it into our
+// nested class record debug info. Make sure they don't appear there.
+
+// PR28790
+
+struct A {
+  const char *m_fn1();
+  template <typename> class B;
+  template <typename> class C;
+  template <typename FunctionIdT> class C<B<FunctionIdT>>;
+};
+const char *A::m_fn1() { return nullptr; }
+
+// CHECK: ![[A:[^ ]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A",
+// CHECK-SAME: elements: ![[elements:[0-9]+]]
+
+// CHECK: ![[elements]] = !{![[m_fn1:[0-9]+]]}
+
+// CHECK: ![[m_fn1]] = !DISubprogram(name: "m_fn1",
diff --git a/test/CodeGenCXX/debug-info-cxx1y.cpp b/test/CodeGenCXX/debug-info-cxx1y.cpp
index faf29d3..36b3e09 100644
--- a/test/CodeGenCXX/debug-info-cxx1y.cpp
+++ b/test/CodeGenCXX/debug-info-cxx1y.cpp
@@ -1,16 +1,25 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only -std=c++14 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
 
+// CHECK: imports: [[IMPS:![0-9]*]]
 // CHECK: [[EMPTY:![0-9]*]] = !{}
+
 // CHECK: [[FOO:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo",
 // CHECK-SAME:             elements: [[EMPTY]]
-// FIXME: The context of this definition should be the CU/file scope, not the class.
-// CHECK: !DISubprogram(name: "func", {{.*}} scope: [[FOO]]
+
+// CHECK: [[IMPS]] = !{[[IMP:![0-9]*]]}
+// CHECK: [[IMP]] = !DIImportedEntity(
+// CHECK-SAME: entity: [[F3:![0-9]*]]
+// CHECK: [[F3]] = distinct !DISubprogram(name: "f3"
 // CHECK-SAME:          type: [[SUBROUTINE_TYPE:![0-9]*]]
-// CHECK-SAME:          isDefinition: true
-// CHECK-SAME:          declaration: [[FUNC_DECL:![0-9]*]]
 // CHECK: [[SUBROUTINE_TYPE]] = !DISubroutineType(types: [[TYPE_LIST:![0-9]*]])
 // CHECK: [[TYPE_LIST]] = !{[[INT:![0-9]*]]}
 // CHECK: [[INT]] = !DIBasicType(name: "int"
+
+// FIXME: The context of this definition should be the CU/file scope, not the class.
+// CHECK: !DISubprogram(name: "func", {{.*}} scope: [[FOO]]
+// CHECK-SAME:          type: [[SUBROUTINE_TYPE]]
+// CHECK-SAME:          isDefinition: true
+// CHECK-SAME:          declaration: [[FUNC_DECL:![0-9]*]]
 // CHECK: [[FUNC_DECL]] = !DISubprogram(name: "func",
 // CHECK-SAME:                          scope: [[FOO]]
 // CHECK-SAME:                          type: [[SUBROUTINE_TYPE]]
@@ -25,3 +34,12 @@
 auto foo::func() {
   return 1;
 }
+
+namespace ns {
+auto f2();
+auto f3() {
+  return 0;
+}
+}
+using ns::f2;
+using ns::f3;
diff --git a/test/CodeGenCXX/debug-info-dllimport-base-class.cpp b/test/CodeGenCXX/debug-info-dllimport-base-class.cpp
new file mode 100644
index 0000000..8b440e1
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-dllimport-base-class.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple i386-pc-windows -emit-llvm -gcodeview -debug-info-kind=limited -fms-compatibility %s -x c++ -o - | FileCheck %s
+
+// Ensure we emit debug info for the full definition of base classes that will
+// be imported from a DLL.  Otherwise, the debugger wouldn't be able to show the
+// members.
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ImportedBase",
+// CHECK-NOT:              DIFlagFwdDecl
+// CHECK-SAME:             ){{$}}
+
+struct __declspec(dllimport) ImportedBase {
+  ImportedBase();
+  virtual void Foo();
+};
+
+struct DerivedFromImported : public ImportedBase {};
+
+int main() {
+  DerivedFromImported d;
+}
diff --git a/test/CodeGenCXX/debug-info-dup-fwd-decl.cpp b/test/CodeGenCXX/debug-info-dup-fwd-decl.cpp
index f7a2cfe..3b23ebf 100644
--- a/test/CodeGenCXX/debug-info-dup-fwd-decl.cpp
+++ b/test/CodeGenCXX/debug-info-dup-fwd-decl.cpp
@@ -19,6 +19,6 @@
 
 Test t;
 
-// CHECK: !DIDerivedType(tag: DW_TAG_pointer_type
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "data"
+// CHECK: !DIDerivedType(tag: DW_TAG_pointer_type
 // CHECK-NOT: !DICompositeType(tag: DW_TAG_structure_type, name: "data"
diff --git a/test/CodeGenCXX/debug-info-line-if.cpp b/test/CodeGenCXX/debug-info-line-if.cpp
index 2980635..b3f9c32 100644
--- a/test/CodeGenCXX/debug-info-line-if.cpp
+++ b/test/CodeGenCXX/debug-info-line-if.cpp
@@ -15,7 +15,7 @@
 
   // CHECK: br label
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG1:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG1:![0-9]*]], !llvm.loop [[L1:![0-9]*]]
 
 #line 200
   while (a)
@@ -25,7 +25,7 @@
       ++a; // CHECK: add nsw{{.*}}, 1
 
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG2:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG2:![0-9]*]], !llvm.loop [[L2:![0-9]*]]
 
 #line 300
   for (; a; )
@@ -35,7 +35,7 @@
       ++a; // CHECK: add nsw{{.*}}, 1
 
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG3:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG3:![0-9]*]], !llvm.loop [[L3:![0-9]*]]
 
 #line 400
   int x[] = {1, 2};
@@ -46,10 +46,22 @@
       ++a; // CHECK: add nsw{{.*}}, 1
 
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG4:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG4:![0-9]*]], !llvm.loop [[L4:![0-9]*]]
 
-  // CHECK: [[DBG1]] = !DILocation(line: 100, scope: !{{.*}})
-  // CHECK: [[DBG2]] = !DILocation(line: 200, scope: !{{.*}})
-  // CHECK: [[DBG3]] = !DILocation(line: 300, scope: !{{.*}})
-  // CHECK: [[DBG4]] = !DILocation(line: 401, scope: !{{.*}})
+  // CHECK-DAG: [[DBG1]] = !DILocation(line: 100, scope: !{{.*}})
+  // CHECK-DAG: [[DBG2]] = !DILocation(line: 200, scope: !{{.*}})
+  // CHECK-DAG: [[DBG3]] = !DILocation(line: 300, scope: !{{.*}})
+  // CHECK-DAG: [[DBG4]] = !DILocation(line: 401, scope: !{{.*}})
+
+  // CHECK-DAG: [[L1]] = distinct !{[[L1]], [[LDBG1:![0-9]*]]}
+  // CHECK-DAG: [[LDBG1]] = !DILocation(line: 100, scope: !{{.*}})
+
+  // CHECK-DAG: [[L2]] = distinct !{[[L2]], [[LDBG2:![0-9]*]]}
+  // CHECK-DAG: [[LDBG2]] = !DILocation(line: 200, scope: !{{.*}})
+
+  // CHECK-DAG: [[L3]] = distinct !{[[L3]], [[LDBG3:![0-9]*]]}
+  // CHECK-DAG: [[LDBG3]] = !DILocation(line: 300, scope: !{{.*}})
+
+  // CHECK-DAG: [[L4]] = distinct !{[[L4]], [[LDBG4:![0-9]*]]}
+  // CHECK-DAG: [[LDBG4]] = !DILocation(line: 401, scope: !{{.*}})
 }
diff --git a/test/CodeGenCXX/debug-info-line.cpp b/test/CodeGenCXX/debug-info-line.cpp
index 9fb6ba8..1165304 100644
--- a/test/CodeGenCXX/debug-info-line.cpp
+++ b/test/CodeGenCXX/debug-info-line.cpp
@@ -1,8 +1,6 @@
-// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - -triple %itanium_abi_triple | FileCheck %s
 // RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - -triple i686-linux-gnu | FileCheck %s
 
-// XFAIL: win32
-
 int &src();
 int *sink();
 extern "C" __complex float complex_src();
diff --git a/test/CodeGenCXX/debug-info-member-call.cpp b/test/CodeGenCXX/debug-info-member-call.cpp
new file mode 100644
index 0000000..3b5adb8
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-member-call.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=standalone -dwarf-column-info %s -o - | FileCheck %s
+void ext();
+
+struct Bar {
+  void bar() { ext(); }
+};
+
+struct Foo {
+  Bar *b;
+
+  Bar *foo() { return b; }
+};
+
+void test(Foo *f) {
+  f->foo()->bar();
+}
+
+// CHECK-LABEL: @_Z4testP3Foo
+// CHECK: call {{.*}} @_ZN3Foo3fooEv{{.*}}, !dbg ![[CALL1LOC:.*]]
+// CHECK: call void @_ZN3Bar3barEv{{.*}}, !dbg ![[CALL2LOC:.*]]
+
+// CHECK: ![[CALL1LOC]] = !DILocation(line: [[LINE:[0-9]+]], column: 6,
+// CHECK: ![[CALL2LOC]] = !DILocation(line: [[LINE]], column: 13,
+
diff --git a/test/CodeGenCXX/debug-info-method.cpp b/test/CodeGenCXX/debug-info-method.cpp
index b55e2c4..73d8b92 100644
--- a/test/CodeGenCXX/debug-info-method.cpp
+++ b/test/CodeGenCXX/debug-info-method.cpp
@@ -6,7 +6,7 @@
 // CHECK-SAME:                                  DIFlagArtificial
 // CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type
 // CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: ![[MEMFUNTYPE:[0-9]+]]
-// CHECK: ![[MEMFUNTYPE]] = !DISubroutineType(types: ![[MEMFUNARGS:[0-9]+]])
+// CHECK: ![[MEMFUNTYPE]] = !DISubroutineType({{(cc: DW_CC_BORLAND_thiscall, )?}}types: ![[MEMFUNARGS:[0-9]+]])
 // CHECK: ![[MEMFUNARGS]] = {{.*}}, ![[THISTYPE]],
 // CHECK: !DILocalVariable(name: "this", arg: 1
 // CHECK: !DILocalVariable(arg: 2
diff --git a/test/CodeGenCXX/debug-info-ms-abi.cpp b/test/CodeGenCXX/debug-info-ms-abi.cpp
new file mode 100644
index 0000000..b1ce128
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-abi.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 %s -triple=i686-pc-windows-msvc -debug-info-kind=limited -gcodeview -emit-llvm -o - | FileCheck %s
+
+// Tests that certain miscellaneous features work in the MS ABI.
+
+struct Foo {
+  virtual void f();
+  virtual void g();
+  virtual void h();
+  struct Nested {};
+};
+Foo f;
+Foo::Nested n;
+
+// CHECK: ![[Foo:[^ ]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo",
+// CHECK-SAME: elements: ![[elements:[0-9]+]]
+// CHECK-SAME: identifier: ".?AUFoo@@"
+
+// CHECK: ![[elements]] = !{![[vptr:[0-9]+]], ![[Nested:[0-9]+]], ![[f:[0-9]+]], ![[g:[0-9]+]], ![[h:[0-9]+]]}
+
+// CHECK: ![[Nested]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Nested",
+// CHECK-SAME: identifier: ".?AUNested@Foo@@"
+
+// CHECK: ![[f]] = !DISubprogram(name: "f",
+// CHECK-SAME: containingType: ![[Foo]], virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0,
+// CHECK-SAME: flags: DIFlagPrototyped | DIFlagIntroducedVirtual,
+
+// CHECK: ![[g]] = !DISubprogram(name: "g",
+// CHECK-SAME: containingType: ![[Foo]], virtuality: DW_VIRTUALITY_virtual, virtualIndex: 1,
+// CHECK-SAME: flags: DIFlagPrototyped | DIFlagIntroducedVirtual,
+
+// CHECK: ![[h]] = !DISubprogram(name: "h",
+// CHECK-SAME: containingType: ![[Foo]], virtuality: DW_VIRTUALITY_virtual, virtualIndex: 2,
+// CHECK-SAME: flags: DIFlagPrototyped | DIFlagIntroducedVirtual,
diff --git a/test/CodeGenCXX/debug-info-ms-anonymous-tag.cpp b/test/CodeGenCXX/debug-info-ms-anonymous-tag.cpp
new file mode 100644
index 0000000..cef1eb8
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-anonymous-tag.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-pc-win32 -debug-info-kind=limited -gcodeview %s -emit-llvm -o - | FileCheck %s
+
+typedef struct {
+} test1;
+
+test1 gv1;
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "test1"
+
+struct {
+} test2;
+void *use_test2 = &test2;
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "<unnamed-type-test2>"
+
+typedef struct {
+} *test3;
+test3 gv3;
+void *use_test3 = &gv3;
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "<unnamed-type-test3>"
diff --git a/test/CodeGenCXX/debug-info-ms-bitfields.cpp b/test/CodeGenCXX/debug-info-ms-bitfields.cpp
new file mode 100644
index 0000000..07d4c0c
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-bitfields.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple x86_64-pc-win32 -debug-info-kind=limited -gcodeview %s -emit-llvm -o - | FileCheck %s
+
+#pragma pack(1)
+struct S {
+  char : 8;
+  short   : 8;
+  short x : 8;
+} s;
+
+// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "x", {{.*}}, size: 8, align: 16, offset: 16, flags: DIFlagBitField, extraData: i64 8)
diff --git a/test/CodeGenCXX/debug-info-ms-ptr-to-member.cpp b/test/CodeGenCXX/debug-info-ms-ptr-to-member.cpp
new file mode 100644
index 0000000..4b9f2a1
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-ptr-to-member.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -triple x86_64-windows -debug-info-kind=limited -gcodeview %s -emit-llvm -o - | FileCheck %s
+
+// Test member pointer inheritance models.
+
+struct A { int a; };
+struct B { int b; };
+struct C : A, B { int c; };
+struct D : virtual C { int d; };
+struct E;
+int A::*pmd_a;
+int C::*pmd_b;
+int D::*pmd_c;
+int E::*pmd_d;
+void (A::*pmf_a)();
+void (C::*pmf_b)();
+void (D::*pmf_c)();
+void (E::*pmf_d)();
+
+// Test incomplete MPTs, which don't have inheritance models.
+
+struct Incomplete;
+int Incomplete::**ppmd;
+void (Incomplete::**ppmf)();
+
+// CHECK: distinct !DIGlobalVariable(name: "pmd_a", {{.*}} type: ![[pmd_a:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_a]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 32, flags: DIFlagSingleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmd_b", {{.*}} type: ![[pmd_b:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_b]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 32, flags: DIFlagMultipleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmd_c", {{.*}} type: ![[pmd_c:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_c]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 64, flags: DIFlagVirtualInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmd_d", {{.*}} type: ![[pmd_d:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_d]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 96,
+// CHECK-NOT: flags:
+// CHECK-SAME: ){{$}}
+
+// CHECK: distinct !DIGlobalVariable(name: "pmf_a", {{.*}} type: ![[pmf_a:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_a]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 64, flags: DIFlagSingleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmf_b", {{.*}} type: ![[pmf_b:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_b]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 128, flags: DIFlagMultipleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmf_c", {{.*}} type: ![[pmf_c:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_c]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 128, flags: DIFlagVirtualInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmf_d", {{.*}} type: ![[pmf_d:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_d]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 192,
+// CHECK-NOT: flags:
+// CHECK-SAME: ){{$}}
+
+// CHECK: distinct !DIGlobalVariable(name: "ppmd", {{.*}} type: ![[ppmd:[^, ]*]], {{.*}})
+// CHECK: ![[ppmd]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[ppmd2:[^ ]*]], size: 64, align: 64)
+// CHECK: ![[ppmd2]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{[0-9]*}}, extraData: !{{[0-9]*}}){{$}}
+// CHECK: distinct !DIGlobalVariable(name: "ppmf", {{.*}} type: ![[ppmf:[^, ]*]], {{.*}})
+// CHECK: ![[ppmf]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[ppmf2:[^ ]*]], size: 64, align: 64)
+// CHECK: ![[ppmf2]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{[0-9]*}}, extraData: !{{[0-9]*}}){{$}}
diff --git a/test/CodeGenCXX/debug-info-namespace.cpp b/test/CodeGenCXX/debug-info-namespace.cpp
index cd301fd..060a5ce 100644
--- a/test/CodeGenCXX/debug-info-namespace.cpp
+++ b/test/CodeGenCXX/debug-info-namespace.cpp
@@ -95,7 +95,7 @@
 // CHECK: [[M11]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "X", scope: [[FUNC]], entity: [[CTXT]]
 // CHECK: [[M12]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "Y", scope: [[FUNC]], entity: [[M11]]
 // CHECK: [[M13]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[VAR_DECL:![0-9]+]]
-// CHECK: [[VAR_DECL]] = !DIGlobalVariable(name: "var_decl", linkageName: "_ZN1A1B8var_declE", scope: [[NS]],{{.*}} line: 8,
+// CHECK: [[VAR_DECL]] = !DIGlobalVariable(name: "var_decl", linkageName: "{{[^"]*var_decl[^"]*}}", scope: [[NS]],{{.*}} line: 8,
 // CHECK: [[M14]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FUNC_DECL:![0-9]+]]
 // CHECK: [[FUNC_DECL]] = !DISubprogram(name: "func_decl",
 // CHECK-SAME:                          scope: [[NS]], file: [[FOOCPP]], line: 9
@@ -111,5 +111,3 @@
 // CHECK-NOLIMIT: !DICompositeType(tag: DW_TAG_structure_type, name: "bar",{{.*}} line: 6,
 // CHECK-NOLIMIT-NOT:              DIFlagFwdDecl
 // CHECK-NOLIMIT-SAME:             ){{$}}
-
-// REQUIRES: dw2
diff --git a/test/CodeGenCXX/debug-info-nodebug.cpp b/test/CodeGenCXX/debug-info-nodebug.cpp
new file mode 100644
index 0000000..9f140ef
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-nodebug.cpp
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -DSETNODEBUG=0 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=YESINFO
+// RUN: %clang_cc1 -DSETNODEBUG=1 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=NOINFO
+
+#if SETNODEBUG
+#define NODEBUG __attribute__((nodebug))
+#else
+#define NODEBUG
+#endif
+
+// Const global variable. Use it so it gets emitted.
+NODEBUG static const int const_global_int_def = 1;
+void func1(int);
+void func2() { func1(const_global_int_def); }
+// YESINFO-DAG: !DIGlobalVariable(name: "const_global_int_def"
+// NOINFO-NOT:  !DIGlobalVariable(name: "const_global_int_def"
+
+// Global variable with a more involved type.
+// If the variable has no debug info, the type should not appear either.
+struct S1 {
+  int a;
+  int b;
+};
+NODEBUG S1 global_struct = { 2, 3 };
+// YESINFO-DAG: !DICompositeType({{.*}} name: "S1"
+// NOINFO-NOT:  !DICompositeType({{.*}} name: "S1"
+// YESINFO-DAG: !DIGlobalVariable(name: "global_struct"
+// NOINFO-NOT:  !DIGlobalVariable(name: "global_struct"
+
+// Static data members. Const member needs a use.
+// Also the class as a whole needs a use, so that we produce debug info for
+// the entire class (iterating over the members, demonstrably skipping those
+// with 'nodebug').
+struct S2 {
+  NODEBUG static int static_member;
+  NODEBUG static const int static_const_member = 4;
+};
+int S2::static_member = 5;
+void func3() {
+  S2 junk;
+  func1(S2::static_const_member);
+}
+// YESINFO-DAG: !DIGlobalVariable(name: "static_member"
+// NOINFO-NOT:  !DIGlobalVariable(name: "static_member"
+// YESINFO-DAG: !DIDerivedType({{.*}} name: "static_const_member"
+// NOINFO-NOT:  !DIDerivedType({{.*}} name: "static_const_member"
+
+// Function-local static and auto variables.
+void func4() {
+  NODEBUG static int static_local = 6;
+  NODEBUG        int normal_local = 7;
+}
+// YESINFO-DAG: !DIGlobalVariable(name: "static_local"
+// NOINFO-NOT:  !DIGlobalVariable(name: "static_local"
+// YESINFO-DAG: !DILocalVariable(name: "normal_local"
+// NOINFO-NOT:  !DILocalVariable(name: "normal_local"
diff --git a/test/CodeGenCXX/debug-info-static-member.cpp b/test/CodeGenCXX/debug-info-static-member.cpp
index f85cf8f..ed8ae01 100644
--- a/test/CodeGenCXX/debug-info-static-member.cpp
+++ b/test/CodeGenCXX/debug-info-static-member.cpp
@@ -1,4 +1,6 @@
 // RUN: %clangxx -target x86_64-unknown-unknown -g %s -emit-llvm -S -o - | FileCheck %s
+// RUN: %clangxx -target x86_64-unknown-unknown -g -std=c++98 %s -emit-llvm -S -o - | FileCheck %s
+// RUN: %clangxx -target x86_64-unknown-unknown -g -std=c++11 %s -emit-llvm -S -o - | FileCheck %s
 // PR14471
 
 enum X {
@@ -10,7 +12,11 @@
   const static bool const_a = true;
 protected:
   static int b;
+#if __cplusplus >= 201103L
+  constexpr static float const_b = 3.14;
+#else
   const static float const_b = 3.14;
+#endif
 public:
   static int c;
   const static int const_c = 18;
diff --git a/test/CodeGenCXX/default_calling_conv.cpp b/test/CodeGenCXX/default_calling_conv.cpp
new file mode 100644
index 0000000..95c214a
--- /dev/null
+++ b/test/CodeGenCXX/default_calling_conv.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -fdefault-calling-conv=cdecl -emit-llvm -o - %s | FileCheck %s --check-prefix=CDECL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i786-unknown-linux-gnu -target-feature +sse4.2 -fdefault-calling-conv=fastcall -emit-llvm -o - %s | FileCheck %s --check-prefix=FASTCALL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i486-unknown-linux-gnu -fdefault-calling-conv=stdcall -emit-llvm -o - %s | FileCheck %s --check-prefix=STDCALL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i486-unknown-linux-gnu -mrtd -emit-llvm -o - %s | FileCheck %s --check-prefix=STDCALL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i986-unknown-linux-gnu -fdefault-calling-conv=vectorcall -emit-llvm -o - %s | FileCheck %s --check-prefix=VECTORCALL --check-prefix=ALL
+
+// CDECL: define void @_Z5test1v
+// FASTCALL: define x86_fastcallcc void @_Z5test1v
+// STDCALL: define x86_stdcallcc void @_Z5test1v
+// VECTORCALL: define x86_vectorcallcc void @_Z5test1v
+void test1() {}
+
+// ALL: define void @_Z5test2v
+void __attribute__((cdecl)) test2() {}
+
+// ALL: define x86_fastcallcc void @_Z5test3v
+void __attribute__((fastcall)) test3() {}
+
+// ALL: define x86_stdcallcc void @_Z5test4v
+void __attribute__((stdcall)) test4() {}
+
+// ALL: define x86_vectorcallcc void @_Z5test5v
+void __attribute__((vectorcall)) test5() {}
+
+// ALL: define linkonce_odr void @_ZN1A11test_memberEv
+class A {
+public:
+  void test_member() {}
+};
+
+void test() {
+  A a;
+  a.test_member();
+}
diff --git a/test/CodeGenCXX/delete-two-arg.cpp b/test/CodeGenCXX/delete-two-arg.cpp
index 85275b3..68a6fa6 100644
--- a/test/CodeGenCXX/delete-two-arg.cpp
+++ b/test/CodeGenCXX/delete-two-arg.cpp
@@ -27,7 +27,7 @@
 
   // CHECK: define [[A:%.*]]* @_ZN5test24testEv()
   A *test() {
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znaj(i32 44)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znaj(i32 44)
     // CHECK-NEXT: [[T0:%.*]] = bitcast i8* [[NEW]] to i32*
     // CHECK-NEXT: store i32 10, i32* [[T0]]
     // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i8, i8* [[NEW]], i32 4
@@ -63,7 +63,7 @@
 
   // CHECK-LABEL: define void @_ZN5test34testEv()
   void test() {
-    // CHECK:      call noalias i8* @_Znaj(i32 24)
+    // CHECK:      call i8* @_Znaj(i32 24)
     // CHECK-NEXT: bitcast
     // CHECK-NEXT: store i32 5
     (void) new B[5];
diff --git a/test/CodeGenCXX/dllexport-members.cpp b/test/CodeGenCXX/dllexport-members.cpp
index 76f692d..1c56251 100644
--- a/test/CodeGenCXX/dllexport-members.cpp
+++ b/test/CodeGenCXX/dllexport-members.cpp
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -triple i686-windows-msvc -fms-compatibility   -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M32 %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M64 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M32 %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M64 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M32VS2015 %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M64VS2015 %s
 // RUN: %clang_cc1 -triple i686-windows-gnu                       -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s
 // RUN: %clang_cc1 -triple x86_64-windows-gnu                     -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s
 
@@ -427,6 +429,32 @@
 ExportDefaultedDefs& ExportDefaultedDefs::operator=(ExportDefaultedDefs&&) = default;
 
 
+// Export defaulted member function definitions declared inside class.
+struct ExportDefaultedInclassDefs {
+  __declspec(dllexport) ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+
+  __declspec(dllexport) ~ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2013-DAG: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2015-NOT: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+
+  __declspec(dllexport) ExportDefaultedInclassDefs(const ExportDefaultedInclassDefs&) = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+
+  __declspec(dllexport) ExportDefaultedInclassDefs& operator=(const ExportDefaultedInclassDefs&) = default;
+  // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QAEAAU0@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64-DAG: define weak_odr dllexport                dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QEAAAEAU0@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+};
+
+
 // Export allocation functions.
 struct ExportAlloc {
   __declspec(dllexport) void* operator new(__SIZE_TYPE__);
diff --git a/test/CodeGenCXX/dllexport-ms-friend.cpp b/test/CodeGenCXX/dllexport-ms-friend.cpp
new file mode 100644
index 0000000..7bcf590
--- /dev/null
+++ b/test/CodeGenCXX/dllexport-ms-friend.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple %ms_abi_triple -fms-extensions -emit-llvm -O0 -o - %s | FileCheck %s
+
+// Friend functions defined in classes are emitted.
+// CHECK: define weak_odr dllexport void @"\01?friend1@@YAXXZ"()
+struct FuncFriend1 {
+  friend __declspec(dllexport) void friend1() {}
+};
+
+// But function templates and functions defined in class templates are not
+// emitted.
+// CHECK-NOT: friend2
+// CHECK-NOT: friend3
+// CHECK-NOT: friend4
+struct FuncFriend2 {
+  template<typename> friend __declspec(dllexport) void friend2() {}
+};
+template<typename> struct FuncFriend3 {
+  friend __declspec(dllexport) void friend3() {}
+  struct Inner {
+    friend __declspec(dllexport) void friend4() {}
+  };
+};
diff --git a/test/CodeGenCXX/dllexport-pr26549.cpp b/test/CodeGenCXX/dllexport-pr26549.cpp
new file mode 100644
index 0000000..ceb2e06
--- /dev/null
+++ b/test/CodeGenCXX/dllexport-pr26549.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -fms-extensions -triple x86_64-windows-msvc -emit-llvm -o - | FileCheck %s
+
+template <typename> struct MessageT { };
+extern template struct MessageT<int>;
+
+// CHECK: define weak_odr dllexport {{.*}} %struct.MessageT* @"\01??4?$MessageT@H@@QEAAAEAU0@AEBU0@@Z"(
+template struct __declspec(dllexport) MessageT<int>;
+// Previously we crashed when this dllexport was the last thing in the file.
+// DO NOT ADD MORE TESTS AFTER THIS LINE!
diff --git a/test/CodeGenCXX/dllexport.cpp b/test/CodeGenCXX/dllexport.cpp
index 1412ad8..7cef7c2 100644
--- a/test/CodeGenCXX/dllexport.cpp
+++ b/test/CodeGenCXX/dllexport.cpp
@@ -1,5 +1,9 @@
-// RUN: %clang_cc1 -triple i686-windows-msvc   -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-optzns -o - %s -w | FileCheck --check-prefix=MSC --check-prefix=M32 %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck --check-prefix=MSC --check-prefix=M64 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-optzns -o - %s -w -fms-compatibility-version=19.00 | FileCheck --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2015 -check-prefix=M32MSVC2015 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-optzns -o - %s -w -fms-compatibility-version=18.00 | FileCheck --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2013 -check-prefix=M32MSVC2013 %s
+
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=19.00 | FileCheck --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2015 -check-prefix=M64MSVC2015 %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=18.00 | FileCheck --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2013 -check-prefix=M64MSVC2013 %s
+
 // RUN: %clang_cc1 -triple i686-windows-gnu    -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G32 %s
 // RUN: %clang_cc1 -triple x86_64-windows-gnu  -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G64 %s
 
@@ -486,7 +490,7 @@
 
 struct CtorWithClosure {
   __declspec(dllexport) CtorWithClosure(...) {}
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FCtorWithClosure@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FCtorWithClosure@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 // M32-DAG:   %[[this_addr:.*]] = alloca %struct.CtorWithClosure*, align 4
 // M32-DAG:   store %struct.CtorWithClosure* %this, %struct.CtorWithClosure** %[[this_addr]], align 4
 // M32-DAG:   %[[this:.*]] = load %struct.CtorWithClosure*, %struct.CtorWithClosure** %[[this_addr]]
@@ -503,7 +507,7 @@
 struct __declspec(dllexport) ClassWithClosure {
   DELETE_IMPLICIT_MEMBERS(ClassWithClosure);
   ClassWithClosure(...) {}
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FClassWithClosure@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FClassWithClosure@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 // M32-DAG:   %[[this_addr:.*]] = alloca %struct.ClassWithClosure*, align 4
 // M32-DAG:   store %struct.ClassWithClosure* %this, %struct.ClassWithClosure** %[[this_addr]], align 4
 // M32-DAG:   %[[this:.*]] = load %struct.ClassWithClosure*, %struct.ClassWithClosure** %[[this_addr]]
@@ -520,17 +524,19 @@
   };
 };
 
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedOuter@@QAEXXZ"({{.*}}) comdat
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedInner@NestedOuter@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedOuter@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedInner@NestedOuter@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 
 template <typename T>
 struct SomeTemplate {
   SomeTemplate(T o = T()) : o(o) {}
   T o;
 };
+// MSVC2015-DAG: define weak_odr dllexport {{.+}} @"\01??4?$SomeTemplate@H@@Q{{.+}}@$$Q{{.+}}@@Z"
+// MSVC2013-DAG: define weak_odr dllexport {{.+}} @"\01??4?$SomeTemplate@H@@Q{{.+}}0@A{{.+}}0@@Z"
 struct __declspec(dllexport) InheritFromTemplate : SomeTemplate<int> {};
 
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_F?$SomeTemplate@H@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_F?$SomeTemplate@H@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 
 namespace PR23801 {
 template <typename>
@@ -547,7 +553,7 @@
 
 }
 //
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FB@PR23801@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FB@PR23801@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 
 struct __declspec(dllexport) T {
   // Copy assignment operator:
@@ -555,7 +561,7 @@
 
   // Explicitly defaulted copy constructur:
   T(const T&) = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.T* @"\01??0T@@QAE@ABU0@@Z"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.T* @"\01??0T@@QAE@ABU0@@Z"
 
   void a() {}
   // M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?a@T@@QAEXXZ"
@@ -570,6 +576,30 @@
 USEVAR(T::b)
 int T::c;
 
+// Export template class with static member variable
+// MSC-DAG: @"\01?StaticClassVarExpTmplClass@?$TmplClass@H@@2HA" = weak_odr dllexport global i32 0, comdat, align 4
+// GNU-DAG: @_ZN9TmplClassIiE26StaticClassVarExpTmplClassE = weak_odr dllexport global i32 0, comdat, align 4
+template<typename T>
+struct __declspec(dllexport) TmplClass
+{
+  static T StaticClassVarExpTmplClass;
+};
+
+template<typename T>
+T TmplClass<T>::StaticClassVarExpTmplClass;
+
+// Export a definition of a template function.
+// MSC-DAG: define weak_odr dllexport i32 @"\01??$TypeFunTmpl@H@@YAHH@Z"
+// GNU-DAG: define weak_odr dllexport i32 @_Z11TypeFunTmplIiET_S0_
+template<typename T>
+T __declspec(dllexport) TypeFunTmpl(T t) { return t + t; }
+
+// Instantiate the exported template class and the exported template function.
+int useExportedTmplStaticAndFun()
+{
+  return TmplClass<int>::StaticClassVarExpTmplClass + TypeFunTmpl<int>(10);
+}
+
 template <typename T> struct __declspec(dllexport) U { void foo() {} };
 struct __declspec(dllexport) V : public U<int> { };
 // U<int>'s assignment operator is emitted.
@@ -592,7 +622,8 @@
 
 struct __declspec(dllexport) Y {
   // Move assignment operator:
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.Y* @"\01??4Y@@QAEAAU0@$$QAU0@@Z"
+  // MSVC2015-DAG: define weak_odr dllexport {{.+}} @"\01??4Y@@Q{{.+}}@$$Q{{.+}}@@Z"
+  // MSVC2013-DAG: define weak_odr dllexport {{.+}} @"\01??4Y@@Q{{.+}}0@A{{.+}}0@@Z"
 
   int x;
 };
@@ -616,9 +647,34 @@
 
 struct __declspec(dllexport) DefaultedCtorsDtors {
   DefaultedCtorsDtors() = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.DefaultedCtorsDtors* @"\01??0DefaultedCtorsDtors@@QAE@XZ"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.DefaultedCtorsDtors* @"\01??0DefaultedCtorsDtors@@QAE@XZ"
   ~DefaultedCtorsDtors() = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1DefaultedCtorsDtors@@QAE@XZ"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1DefaultedCtorsDtors@@QAE@XZ"
+};
+
+// Export defaulted member function definitions declared inside class.
+struct __declspec(dllexport) ExportDefaultedInclassDefs {
+  ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+
+  ~ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2013-DAG: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2015-NOT: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+
+  ExportDefaultedInclassDefs(const ExportDefaultedInclassDefs&) = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+
+  ExportDefaultedInclassDefs& operator=(const ExportDefaultedInclassDefs&) = default;
+  // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QAEAAU0@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64-DAG: define weak_odr dllexport                dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QEAAAEAU0@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
 };
 
 namespace ReferencedInlineMethodInNestedClass {
@@ -690,7 +746,7 @@
   // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExplicitInstConstexprMembers* @"\01??0?$ExplicitInstConstexprMembers@X@@QAE@XZ"
 
   ExplicitInstConstexprMembers(const ExplicitInstConstexprMembers&) = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExplicitInstConstexprMembers* @"\01??0?$ExplicitInstConstexprMembers@X@@QAE@ABU0@@Z"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExplicitInstConstexprMembers* @"\01??0?$ExplicitInstConstexprMembers@X@@QAE@ABU0@@Z"
 
   constexpr int f() const { return 42; }
   // M32-DAG: define weak_odr dllexport x86_thiscallcc i32 @"\01?f@?$ExplicitInstConstexprMembers@X@@QBEHXZ"
@@ -777,6 +833,22 @@
 // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable(1) %"struct.InClassInits::Baz"* @"\01??4Baz@InClassInits@@QAEAAU01@ABU01@@Z"
 }
 
+// We had an issue where instantiating A would force emission of B's delayed
+// exported methods.
+namespace pr26490 {
+template <typename T> struct A { };
+struct __declspec(dllexport) B {
+  B(int = 0) {}
+  A<int> m_fn1() {}
+};
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FB@pr26490@@QAEXXZ"
+}
+
+// dllexport trumps dllimport on an explicit instantiation.
+template <typename T> struct ExplicitInstantiationTwoAttributes { void f() {} };
+template struct __declspec(dllexport) __declspec(dllimport) ExplicitInstantiationTwoAttributes<int>;
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?f@?$ExplicitInstantiationTwoAttributes@H@@QAEXXZ"
+
 
 //===----------------------------------------------------------------------===//
 // Classes with template base classes
@@ -891,10 +963,26 @@
 // M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?func@?$ExplicitInstantiationDeclTemplateBase@H@@QAEXXZ"
 // G32-DAG: define weak_odr x86_thiscallcc void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv
 
-template <typename T> struct ExplicitInstantiationDeclTemplateBase2 { void func() {} };
-extern template struct ExplicitInstantiationDeclTemplateBase2<int>;
-struct __declspec(dllexport) DerivedFromExplicitInstantiationDeclTemplateBase2 : public ExplicitInstantiationDeclTemplateBase2<int> {};
-template struct __declspec(dllimport) ExplicitInstantiationDeclTemplateBase2<int>;
-USEMEMFUNC(ExplicitInstantiationDeclTemplateBase2<int>, func)
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?func@?$ExplicitInstantiationDeclTemplateBase2@H@@QAEXXZ"
-// G32-DAG: define weak_odr x86_thiscallcc void @_ZN38ExplicitInstantiationDeclTemplateBase2IiE4funcEv
+// PR26076
+struct LayerSelectionBound;
+template <typename> struct Selection {};
+typedef Selection<LayerSelectionBound> LayerSelection;
+struct LayerImpl;
+struct __declspec(dllexport) LayerTreeImpl {
+  struct __declspec(dllexport) ElementLayers {
+    LayerImpl *main = nullptr;
+  };
+  LayerSelection foo;
+};
+// M32-DAG: define weak_odr dllexport x86_thiscallcc %"struct.LayerTreeImpl::ElementLayers"* @"\01??0ElementLayers@LayerTreeImpl@@QAE@XZ"
+// M64-DAG: define weak_odr dllexport %"struct.LayerTreeImpl::ElementLayers"* @"\01??0ElementLayers@LayerTreeImpl@@QEAA@XZ"
+
+class __declspec(dllexport) ACE_Shared_Object {
+public:
+  virtual ~ACE_Shared_Object();
+};
+class __declspec(dllexport) ACE_Service_Object : public ACE_Shared_Object {};
+// Implicit move constructor declaration.
+// MSVC2015-DAG: define weak_odr dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q
+// The declarations should not be exported.
+// MSVC2013-NOT: define weak_odr dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q
diff --git a/test/CodeGenCXX/dllimport-members.cpp b/test/CodeGenCXX/dllimport-members.cpp
index e88b7e9..1fed1bf 100644
--- a/test/CodeGenCXX/dllimport-members.cpp
+++ b/test/CodeGenCXX/dllimport-members.cpp
@@ -63,8 +63,8 @@
 struct ImportMembers {
   struct Nested;
 
-  // M32-DAG: define              x86_thiscallcc void @"\01?normalDef@ImportMembers@@QAEXXZ"(%struct.ImportMembers* %this)
-  // M64-DAG: define                             void @"\01?normalDef@ImportMembers@@QEAAXXZ"(%struct.ImportMembers* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?normalDef@ImportMembers@@QAEXXZ"(%struct.ImportMembers* %this)
+  // M64-DAG: define  dllexport                  void @"\01?normalDef@ImportMembers@@QEAAXXZ"(%struct.ImportMembers* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalDecl@ImportMembers@@QAEXXZ"(%struct.ImportMembers*)
   // M64-DAG: declare dllimport                  void @"\01?normalDecl@ImportMembers@@QEAAXXZ"(%struct.ImportMembers*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalInclass@ImportMembers@@QAEXXZ"(%struct.ImportMembers*)
@@ -95,8 +95,8 @@
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport)         inline void normalInlineDecl();
 
-  // M32-DAG: define              x86_thiscallcc void @"\01?virtualDef@ImportMembers@@UAEXXZ"(%struct.ImportMembers* %this)
-  // M64-DAG: define                             void @"\01?virtualDef@ImportMembers@@UEAAXXZ"(%struct.ImportMembers* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?virtualDef@ImportMembers@@UAEXXZ"(%struct.ImportMembers* %this)
+  // M64-DAG: define  dllexport                  void @"\01?virtualDef@ImportMembers@@UEAAXXZ"(%struct.ImportMembers* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualDecl@ImportMembers@@UAEXXZ"(%struct.ImportMembers*)
   // M64-DAG: declare dllimport                  void @"\01?virtualDecl@ImportMembers@@UEAAXXZ"(%struct.ImportMembers*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualInclass@ImportMembers@@UAEXXZ"(%struct.ImportMembers*)
@@ -127,7 +127,7 @@
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) virtual inline void virtualInlineDecl();
 
-  // MSC-DAG: define                           void @"\01?staticDef@ImportMembers@@SAXXZ"()
+  // MSC-DAG: define  dllexport                void @"\01?staticDef@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticDecl@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInclass@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInlineDef@ImportMembers@@SAXXZ"()
@@ -235,8 +235,8 @@
 
 // Import individual members of a nested class.
 struct ImportMembers::Nested {
-  // M32-DAG: define              x86_thiscallcc void @"\01?normalDef@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"* %this)
-  // M64-DAG: define                             void @"\01?normalDef@Nested@ImportMembers@@QEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?normalDef@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M64-DAG: define  dllexport                  void @"\01?normalDef@Nested@ImportMembers@@QEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalDecl@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"*)
   // M64-DAG: declare dllimport                  void @"\01?normalDecl@Nested@ImportMembers@@QEAAXXZ"(%"struct.ImportMembers::Nested"*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalInclass@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"*)
@@ -267,8 +267,8 @@
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport)         inline void normalInlineDecl();
 
-  // M32-DAG: define              x86_thiscallcc void @"\01?virtualDef@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"* %this)
-  // M64-DAG: define                             void @"\01?virtualDef@Nested@ImportMembers@@UEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?virtualDef@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M64-DAG: define  dllexport                  void @"\01?virtualDef@Nested@ImportMembers@@UEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualDecl@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"*)
   // M64-DAG: declare dllimport                  void @"\01?virtualDecl@Nested@ImportMembers@@UEAAXXZ"(%"struct.ImportMembers::Nested"*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualInclass@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"*)
@@ -300,7 +300,7 @@
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) virtual inline void virtualInlineDecl();
 
-  // MSC-DAG: define                           void @"\01?staticDef@Nested@ImportMembers@@SAXXZ"()
+  // MSC-DAG: define  dllexport                void @"\01?staticDef@Nested@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticDecl@Nested@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInclass@Nested@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInlineDef@Nested@ImportMembers@@SAXXZ"()
@@ -595,16 +595,16 @@
 // G64-DAG: define linkonce_odr                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @_ZN19ImportDefaultedDefsaSERKS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 inline ImportDefaultedDefs& ImportDefaultedDefs::operator=(const ImportDefaultedDefs&) = default;
 
-// M32-DAG: define x86_thiscallcc %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QAE@$$QAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
-// M64-DAG: define                %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QEAA@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M32-DAG: define dllexport x86_thiscallcc %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QAE@$$QAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M64-DAG: define dllexport                %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QEAA@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G32-DAG: define x86_thiscallcc void @_ZN19ImportDefaultedDefsC1EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G64-DAG: define                void @_ZN19ImportDefaultedDefsC1EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G32-DAG: define x86_thiscallcc void @_ZN19ImportDefaultedDefsC2EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G64-DAG: define                void @_ZN19ImportDefaultedDefsC2EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 ImportDefaultedDefs::ImportDefaultedDefs(ImportDefaultedDefs&&) = default; // dllimport ignored
 
-// M32-DAG: define x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QAEAAU0@$$QAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
-// M64-DAG: define                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QEAAAEAU0@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M32-DAG: define dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QAEAAU0@$$QAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M64-DAG: define dllexport                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QEAAAEAU0@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G32-DAG: define x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @_ZN19ImportDefaultedDefsaSEOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G64-DAG: define                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @_ZN19ImportDefaultedDefsaSEOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 ImportDefaultedDefs& ImportDefaultedDefs::operator=(ImportDefaultedDefs&&) = default; // dllimport ignored
diff --git a/test/CodeGenCXX/dllimport-rtti.cpp b/test/CodeGenCXX/dllimport-rtti.cpp
index 071ce27..4baee50 100644
--- a/test/CodeGenCXX/dllimport-rtti.cpp
+++ b/test/CodeGenCXX/dllimport-rtti.cpp
@@ -4,7 +4,8 @@
 struct __declspec(dllimport) S {
   virtual void f() {}
 } s;
-// MSVC-DAG: @"\01??_7S@@6B@" = available_externally dllimport
+// MSVC: [[VF_S:.*]] = private unnamed_addr constant [2 x i8*]
+// MSVC-DAG: @"\01??_SS@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* [[VF_S]], i32 0, i32 1)
 // MSVC-DAG: @"\01??_R0?AUS@@@8" = linkonce_odr
 // MSVC-DAG: @"\01??_R1A@?0A@EA@S@@8" = linkonce_odr
 // MSVC-DAG: @"\01??_R2S@@8" = linkonce_odr
diff --git a/test/CodeGenCXX/dllimport.cpp b/test/CodeGenCXX/dllimport.cpp
index b9c850b..aff240f 100644
--- a/test/CodeGenCXX/dllimport.cpp
+++ b/test/CodeGenCXX/dllimport.cpp
@@ -27,6 +27,7 @@
 #define USEVAR(var) USEVARTYPE(int, var)
 #define USE(func) void UNIQ(use)() { func(); }
 #define USEMEMFUNC(class, func) void (class::*UNIQ(use)())() { return &class::func; }
+#define USESTATICMEMFUNC(class, func) void (*UNIQ(use)())() { return &class::func; }
 #define USECLASS(class) void UNIQ(USE)() { class x; }
 #define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(class&) { return &class::operator=; }
 #define USEMOVEASSIGN(class) class& (class::*UNIQ(use)())(class&&) { return &class::operator=; }
@@ -263,7 +264,7 @@
                       void redecl2();
 USE(redecl2)
 
-// MSC-DAG: define void @"\01?redecl3@@YAXXZ"()
+// MSC-DAG: define dllexport void @"\01?redecl3@@YAXXZ"()
 // GNU-DAG: define void @_Z7redecl3v()
 __declspec(dllimport) void redecl3();
                       void redecl3() {} // dllimport ignored
@@ -275,7 +276,7 @@
 // GNU-DAG: declare dllimport void @_Z7friend1v()
 // MSC-DAG: declare           void @"\01?friend2@@YAXXZ"()
 // GNU-DAG: declare           void @_Z7friend2v()
-// MSC-DAG: define            void @"\01?friend3@@YAXXZ"()
+// MSC-DAG: define  dllexport void @"\01?friend3@@YAXXZ"()
 // GNU-DAG: define            void @_Z7friend3v()
 // MSC-DAG: declare           void @"\01?friend4@@YAXXZ"()
 // GNU-DAG: declare           void @_Z7friend4v()
@@ -590,6 +591,10 @@
   void a() {}
   // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"\01?a@T@@QAEXXZ"
 
+  static void StaticMethod();
+  // MSC-DAG: declare dllimport void @"\01?StaticMethod@T@@SAXXZ"()
+  // GNU-DAG: declare dllimport void @_ZN1T12StaticMethodEv()
+
   static int b;
   // MO1-DAG: @"\01?b@T@@2HA" = external dllimport global i32
 
@@ -602,6 +607,7 @@
   // M19-DAG: define available_externally dllimport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.T* @"\01??4T@@QAEAAU0@$$QAU0@@Z"
 };
 USEMEMFUNC(T, a)
+USESTATICMEMFUNC(T, StaticMethod)
 USEVAR(T::b)
 USECOPYASSIGN(T)
 USEMOVEASSIGN(T)
@@ -614,7 +620,7 @@
 struct __declspec(dllimport) W { virtual void foo() {} };
 USECLASS(W)
 // vftable:
-// MO1-DAG: @"\01??_7W@@6B@" = available_externally dllimport unnamed_addr constant [1 x i8*] [i8* bitcast (void (%struct.W*)* @"\01?foo@W@@UAEXXZ" to i8*)]
+// MO1-DAG: @"\01??_SW@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void (%struct.W*)* @"\01?foo@W@@UAEXXZ" to i8*)]
 // GO1-DAG: @_ZTV1W = available_externally dllimport unnamed_addr constant [3 x i8*] [i8* null, i8* null, i8* bitcast (void (%struct.W*)* @_ZN1W3fooEv to i8*)]
 
 struct __declspec(dllimport) KeyFuncClass {
@@ -650,7 +656,7 @@
 
 namespace Vtordisp {
   // Don't dllimport the vtordisp.
-  // MO1-DAG: define linkonce_odr x86_thiscallcc void @"\01?f@?$C@D@Vtordisp@@$4PPPPPPPM@A@AEXXZ"
+  // MO1-DAG: define linkonce_odr x86_thiscallcc void @"\01?f@?$C@H@Vtordisp@@$4PPPPPPPM@A@AEXXZ"
 
   class Base {
     virtual void f() {}
@@ -661,7 +667,7 @@
     C() {}
     virtual void f() {}
   };
-  template class C<char>;
+  USECLASS(C<int>);
 }
 
 namespace ClassTemplateStaticDef {
@@ -670,7 +676,7 @@
     static int x;
   };
   template <typename T> int S<T>::x;
-  // MSC-DAG: @"\01?x@?$S@H@ClassTemplateStaticDef@@2HA" = available_externally dllimport global i32 0
+  // MSC-DAG: @"\01?x@?$S@H@ClassTemplateStaticDef@@2HA" = external dllimport global i32
   int f() { return S<int>::x; }
 
   // Partial class template specialization static field:
@@ -679,7 +685,7 @@
     static int x;
   };
   template <typename A> int T<A*>::x;
-  // GNU-DAG: @_ZN22ClassTemplateStaticDef1TIPvE1xE = available_externally dllimport global i32 0
+  // GNU-DAG: @_ZN22ClassTemplateStaticDef1TIPvE1xE = external dllimport global i32
   int g() { return T<void*>::x; }
 }
 
@@ -692,26 +698,31 @@
   template <typename T> struct A { static NonPOD x; };
   template <typename T> NonPOD A<T>::x;
   template struct __declspec(dllimport) A<int>;
-  // MSC-DAG: @"\01?x@?$A@H@PR19933@@2UNonPOD@2@A" = available_externally dllimport global %"struct.PR19933::NonPOD" zeroinitializer
+  USEVARTYPE(NonPOD, A<int>::x);
+  // MSC-DAG: @"\01?x@?$A@H@PR19933@@2UNonPOD@2@A" = external dllimport global %"struct.PR19933::NonPOD"
 
   int f();
   template <typename T> struct B { static int x; };
   template <typename T> int B<T>::x = f();
   template struct __declspec(dllimport) B<int>;
-  // MSC-DAG: @"\01?x@?$B@H@PR19933@@2HA" = available_externally dllimport global i32 0
+  USEVAR(B<int>::x);
+  // MSC-DAG: @"\01?x@?$B@H@PR19933@@2HA" = external dllimport global i32
 
   constexpr int g() { return 42; }
   template <typename T> struct C { static int x; };
   template <typename T> int C<T>::x = g();
   template struct __declspec(dllimport) C<int>;
-  // MSC-DAG: @"\01?x@?$C@H@PR19933@@2HA" = available_externally dllimport global i32 42
+  USEVAR(C<int>::x);
+  // MSC-DAG: @"\01?x@?$C@H@PR19933@@2HA" = external dllimport global i32
 
   template <int I> struct D { static int x, y; };
   template <int I> int D<I>::x = I + 1;
   template <int I> int D<I>::y = I + f();
   template struct __declspec(dllimport) D<42>;
-  // MSC-DAG: @"\01?x@?$D@$0CK@@PR19933@@2HA" = available_externally dllimport global i32 43
-  // MSC-DAG: @"\01?y@?$D@$0CK@@PR19933@@2HA" = available_externally dllimport global i32 0
+  USEVAR(D<42>::x);
+  USEVAR(D<42>::y);
+  // MSC-DAG: @"\01?x@?$D@$0CK@@PR19933@@2HA" = external dllimport global i32
+  // MSC-DAG: @"\01?y@?$D@$0CK@@PR19933@@2HA" = external dllimport global i32
 }
 
 namespace PR21355 {
@@ -737,6 +748,17 @@
   inline void S::outOfClassInlineMethod() {}
 }
 
+namespace PR27319 {
+  // Make sure we don't assert due to not having checked for operator delete on
+  // the destructor.
+  template <typename> struct A {
+    virtual ~A() = default;
+  };
+  extern template struct __declspec(dllimport) A<int>;
+  void f() { new A<int>(); }
+  // MO1-DAG: @"\01??_S?$A@H@PR27319@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*]
+}
+
 // MS ignores DLL attributes on partial specializations.
 template <typename T> struct PartiallySpecializedClassTemplate {};
 template <typename T> struct __declspec(dllimport) PartiallySpecializedClassTemplate<T*> { void f(); };
@@ -788,6 +810,36 @@
 USEMEMFUNC(PR23770BaseTemplate<int>, f);
 // M32-DAG: declare dllimport x86_thiscallcc void @"\01?f@?$PR23770BaseTemplate@H@@QAEXXZ"
 
+namespace PR27810 {
+  template <class T>
+  struct basic_ostream {
+    struct sentry {
+      sentry() { }
+      void foo() { }
+    };
+  };
+  template class __declspec(dllimport) basic_ostream<char>;
+  // The explicit instantiation definition acts as an explicit instantiation
+  // *declaration*, dllimport is not inherited by the inner class, and no
+  // functions are emitted unless they are used.
+
+  USEMEMFUNC(basic_ostream<char>::sentry, foo);
+  // M32-DAG: define linkonce_odr x86_thiscallcc void @"\01?foo@sentry@?$basic_ostream@D@PR27810@@QAEXXZ"
+  // M32-NOT: ??0sentry@?$basic_ostream@D@PR27810@@QAE@XZ
+}
+
+namespace PR27811 {
+  template <class T> struct codecvt {
+    virtual ~codecvt() { }
+  };
+  template class __declspec(dllimport) codecvt<char>;
+
+  // dllimport means this explicit instantiation definition gets treated as a
+  // declaration. Thus, the vtable should not be marked used, and in fact
+  // nothing for this class should be emitted at all since it's not used.
+  // M32-NOT: codecvt
+}
+
 //===----------------------------------------------------------------------===//
 // Classes with template base classes
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGenCXX/exceptions-cxx-ehsc.cpp b/test/CodeGenCXX/exceptions-cxx-ehsc.cpp
new file mode 100644
index 0000000..c660d14
--- /dev/null
+++ b/test/CodeGenCXX/exceptions-cxx-ehsc.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -fexceptions -fcxx-exceptions -fexternc-nounwind | FileCheck %s
+
+namespace test1 {
+struct Cleanup { ~Cleanup(); };
+extern "C" void never_throws();
+void may_throw();
+
+void caller() {
+  Cleanup x;
+  never_throws();
+  may_throw();
+}
+}
+// CHECK-LABEL: define void @"\01?caller@test1@@YAXXZ"(
+// CHECK: call void @never_throws(
+// CHECK: invoke void @"\01?may_throw@test1@@YAXXZ"(
+
+namespace test2 {
+struct Cleanup { ~Cleanup(); };
+extern "C" void throws_int() throw(int);
+void may_throw();
+
+void caller() {
+  Cleanup x;
+  throws_int();
+  may_throw();
+}
+}
+// CHECK-LABEL: define void @"\01?caller@test2@@YAXXZ"(
+// CHECK: invoke void @throws_int(
+// CHECK: invoke void @"\01?may_throw@test2@@YAXXZ"(
diff --git a/test/CodeGenCXX/exceptions-cxx-new.cpp b/test/CodeGenCXX/exceptions-cxx-new.cpp
index 3767f33..3329aea 100644
--- a/test/CodeGenCXX/exceptions-cxx-new.cpp
+++ b/test/CodeGenCXX/exceptions-cxx-new.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -fexceptions -fcxx-exceptions -fnew-ms-eh -emit-llvm -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -fexceptions -fcxx-exceptions -emit-llvm -o - -std=c++11 | FileCheck %s
 
 int f(int);
 
@@ -72,6 +72,6 @@
 // CHECK:   ret void
 
 // CHECK: [[TERMINATE]]
-// CHECK:   cleanuppad within none []
-// CHECK-NEXT:   call void @"\01?terminate@@YAXXZ"()
+// CHECK:   %[[CLEANUPPAD:.*]] = cleanuppad within none []
+// CHECK-NEXT:   call void @"\01?terminate@@YAXXZ"() {{.*}} [ "funclet"(token %[[CLEANUPPAD]]) ]
 
diff --git a/test/CodeGenCXX/exceptions.cpp b/test/CodeGenCXX/exceptions.cpp
index ff76b11..86616d1 100644
--- a/test/CodeGenCXX/exceptions.cpp
+++ b/test/CodeGenCXX/exceptions.cpp
@@ -30,7 +30,7 @@
 
   A *a() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11aEv()
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11AC1Ei([[A]]* [[CAST]], i32 5)
     // CHECK:      ret [[A]]* [[CAST]]
@@ -40,7 +40,7 @@
 
   A *b() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11bEv()
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: [[FOO:%.*]] = invoke i32 @_ZN5test13fooEv()
     // CHECK:      invoke void @_ZN5test11AC1Ei([[A]]* [[CAST]], i32 [[FOO]])
@@ -56,7 +56,7 @@
   A *c() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11cEv()
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK-NEXT: [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK-NEXT: [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11BC1Ev([[B:%.*]]* [[T0:%.*]])
@@ -82,7 +82,7 @@
   A *d() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11dEv()
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK-NEXT: [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK-NEXT: [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11BC1Ev([[B:%.*]]* [[T0:%.*]])
@@ -100,7 +100,7 @@
   A *e() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11eEv()
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK-NEXT: [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK-NEXT: [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11BC1Ev([[B:%.*]]* [[T0:%.*]])
@@ -131,7 +131,7 @@
     // CHECK:    define [[A:%.*]]* @_ZN5test11iEv()
     // CHECK:      [[X:%.*]] = alloca [[A]]*, align 8
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test15makeBEv([[B:%.*]]* sret [[T0:%.*]])
@@ -422,7 +422,7 @@
     return new A[10];
   }
   // CHECK: define {{%.*}}* @_ZN5test94testEv
-  // CHECK: [[TEST9_NEW:%.*]] = call noalias i8* @_Znam
+  // CHECK: [[TEST9_NEW:%.*]] = call i8* @_Znam
   // CHECK: call void @_ZdaPv(i8* [[TEST9_NEW]])
 }
 
diff --git a/test/CodeGenCXX/explicit-instantiation.cpp b/test/CodeGenCXX/explicit-instantiation.cpp
index 6076444..7e00d78 100644
--- a/test/CodeGenCXX/explicit-instantiation.cpp
+++ b/test/CodeGenCXX/explicit-instantiation.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -emit-llvm -triple i686-pc-linux-gnu -std=c++1y -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO-OPT
 // RUN: %clang_cc1 -emit-llvm -triple i686-pc-linux-gnu -std=c++1y -O3 -disable-llvm-optzns -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT
+// RUN: %clang_cc1 -emit-llvm -triple i686-pc-win32 -std=c++1y -o - %s | FileCheck %s --check-prefix=CHECK-MS
 
 // This check logically is attached to 'template int S<int>::i;' below.
 // CHECK: @_ZN1SIiE1iE = weak_odr global i32
@@ -103,6 +104,28 @@
 template struct S<int>;
 }
 
+namespace NestedClasses {
+  // Check how explicit instantiation of an outer class affects the inner class.
+  template <typename T> struct Outer {
+    struct Inner {
+      void f() {}
+    };
+  };
+
+  // Explicit instantiation definition of Outer causes explicit instantiation
+  // definition of Inner.
+  template struct Outer<int>;
+  // CHECK: define weak_odr void @_ZN13NestedClasses5OuterIiE5Inner1fEv
+  // CHECK-MS: define weak_odr x86_thiscallcc void @"\01?f@Inner@?$Outer@H@NestedClasses@@QAEXXZ"
+
+  // Explicit instantiation declaration of Outer causes explicit instantiation
+  // declaration of Inner, but not in MSVC mode.
+  extern template struct Outer<char>;
+  auto use = &Outer<char>::Inner::f;
+  // CHECK: {{declare|define available_externally}} void @_ZN13NestedClasses5OuterIcE5Inner1fEv
+  // CHECK-MS: define linkonce_odr x86_thiscallcc void @"\01?f@Inner@?$Outer@D@NestedClasses@@QAEXXZ"
+}
+
 // Check that we emit definitions from explicit instantiations even when they
 // occur prior to the definition itself.
 template <typename T> struct S {
diff --git a/test/CodeGenCXX/extern-c.cpp b/test/CodeGenCXX/extern-c.cpp
index 5b59a38..1046915 100644
--- a/test/CodeGenCXX/extern-c.cpp
+++ b/test/CodeGenCXX/extern-c.cpp
@@ -16,8 +16,23 @@
 // CHECK-NOT: should_not_appear
 extern "C++" int should_not_appear;
 
+// CHECK: @_ZN3foo10extern_cxxE = global
+extern "C++" int extern_cxx = 0;
+
 }
 
+// CHECK-NOT: @global_a = global
+extern "C" int global_a;
+
+// CHECK: @global_b = global
+extern "C" int global_b = 0;
+
+// CHECK-NOT: should_not_appear
+extern "C++" int should_not_appear;
+
+// CHECK: @extern_cxx = global
+extern "C++" int extern_cxx = 0;
+
 namespace test1 {
   namespace {
     struct X {};
@@ -59,10 +74,10 @@
 
   // CHECK-NOT: @unused
   // CHECK-NOT: @duplicate_internal
-  // CHECK: @internal_var = internal alias i32, i32* @_Z12internal_var
+  // CHECK: @internal_var = internal alias i32, i32* @_ZL12internal_var
   // CHECK-NOT: @unused
   // CHECK-NOT: @duplicate_internal
-  // CHECK: @internal_fn = internal alias i32 (), i32 ()* @_Z11internal_fnv
+  // CHECK: @internal_fn = internal alias i32 (), i32 ()* @_ZL11internal_fnv
   // CHECK-NOT: @unused
   // CHECK-NOT: @duplicate_internal
 }
diff --git a/test/CodeGenCXX/float128-declarations.cpp b/test/CodeGenCXX/float128-declarations.cpp
new file mode 100644
index 0000000..e1604a6
--- /dev/null
+++ b/test/CodeGenCXX/float128-declarations.cpp
@@ -0,0 +1,138 @@
+// RUN: %clang_cc1 -emit-llvm -triple powerpc64-unknown-unknown \
+// RUN:   -target-feature +float128 -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple powerpc64le-unknown-unknown \
+// RUN:   -target-feature +float128 -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-unknown-linux-gnu -std=c++11 \
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86
+// RUN: %clang_cc1 -emit-llvm -triple x86_64-unknown-linux-gnu -std=c++11 \
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86
+// RUN: %clang_cc1 -emit-llvm -triple systemz-unknown-linux-gnu -std=c++11 \
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-SYSZ
+//
+/*  Various contexts where type __float128 can appear. The different check
+    prefixes are due to different mangling on X86 and different calling
+    convention on SystemZ. */
+
+/*  Namespace */
+namespace {
+  __float128 f1n;
+  __float128 f2n = 33.q;
+  __float128 arr1n[10];
+  __float128 arr2n[] = { 1.2q, 3.0q, 3.e11q };
+  const volatile __float128 func1n(const __float128 &arg) {
+    return arg + f2n + arr1n[4] - arr2n[1];
+  }
+}
+
+/* File */
+__float128 f1f;
+__float128 f2f = 32.4q;
+static __float128 f3f = f2f;
+__float128 arr1f[10];
+__float128 arr2f[] = { -1.2q, -3.0q, -3.e11q };
+__float128 func1f(__float128 arg);
+
+/* Class */
+class C1 {
+  __float128 f1c;
+  static const __float128 f2c;
+  volatile __float128 f3c;
+public:
+  C1(__float128 arg) : f1c(arg), f3c(arg) { }
+  __float128 func1c(__float128 arg ) {
+    return f1c + arg;
+  }
+  static __float128 func2c(__float128 arg) {
+    return arg * C1::f2c;
+  }
+};
+
+/*  Template */
+template <class C> C func1t(C arg) { return arg * 2.q; }
+template <class C> struct S1 {
+  C mem1;
+};
+template <> struct S1<__float128> {
+  __float128 mem2;
+};
+
+/* Local */
+int main(void) {
+  __float128 f1l = 123e220q;
+  __float128 f2l = -0.q;
+  __float128 f3l = 1.189731495357231765085759326628007e4932q;
+  C1 c1(f1l);
+  S1<__float128> s1 = { 132.q };
+  __float128 f4l = func1n(f1l) + func1f(f2l) + c1.func1c(f3l) + c1.func2c(f1l) +
+    func1t(f1l) + s1.mem2 - f1n + f2n;
+#if (__cplusplus >= 201103L)
+  auto f5l = -1.q, *f6l = &f2l, f7l = func1t(f3l);
+#endif
+  __float128 f8l = f4l++;
+  __float128 arr1l[] = { -1.q, -0.q, -11.q };
+}
+// CHECK-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL00000000000000000000000000000000
+// CHECK-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL00000000000000004004080000000000
+// CHECK-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL33333333333333333FFF333333333333, fp128 0xL00000000000000004000800000000000, fp128 0xL00000000000000004025176592E00000]
+// CHECK-DAG: define internal fp128 @_ZN12_GLOBAL__N_16func1nERKU10__float128(fp128*
+// CHECK-DAG: @f1f = global fp128 0xL00000000000000000000000000000000
+// CHECK-DAG: @f2f = global fp128 0xL33333333333333334004033333333333
+// CHECK-DAG: @arr1f = global [10 x fp128]
+// CHECK-DAG: @arr2f = global [3 x fp128] [fp128 0xL3333333333333333BFFF333333333333, fp128 0xL0000000000000000C000800000000000, fp128 0xL0000000000000000C025176592E00000]
+// CHECK-DAG: declare fp128 @_Z6func1fU10__float128(fp128)
+// CHECK-DAG: define linkonce_odr void @_ZN2C1C2EU10__float128(%class.C1* %this, fp128 %arg)
+// CHECK-DAG: define linkonce_odr fp128 @_ZN2C16func2cEU10__float128(fp128 %arg)
+// CHECK-DAG: define linkonce_odr fp128 @_Z6func1tIU10__float128ET_S0_(fp128 %arg)
+// CHECK-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { fp128 0xL00000000000000004006080000000000 }
+// CHECK-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-DAG: store fp128 0xL00000000000000008000000000000000, fp128* %f2l, align 16
+// CHECK-DAG: store fp128 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF, fp128* %f3l, align 16
+// CHECK-DAG: store fp128 0xL0000000000000000BFFF000000000000, fp128* %f5l, align 16
+// CHECK-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL00000000000000003FFF000000000000
+// CHECK-DAG: store fp128 [[INC]], fp128* %f4l
+
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL00000000000000000000000000000000
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL00000000000000004004080000000000
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL33333333333333333FFF333333333333, fp128 0xL00000000000000004000800000000000, fp128 0xL00000000000000004025176592E00000]
+// CHECK-X86-DAG: define internal fp128 @_ZN12_GLOBAL__N_16func1nERKg(fp128*
+// CHECK-X86-DAG: @f1f = global fp128 0xL00000000000000000000000000000000
+// CHECK-X86-DAG: @f2f = global fp128 0xL33333333333333334004033333333333
+// CHECK-X86-DAG: @arr1f = global [10 x fp128]
+// CHECK-X86-DAG: @arr2f = global [3 x fp128] [fp128 0xL3333333333333333BFFF333333333333, fp128 0xL0000000000000000C000800000000000, fp128 0xL0000000000000000C025176592E00000]
+// CHECK-X86-DAG: declare fp128 @_Z6func1fg(fp128)
+// CHECK-X86-DAG: define linkonce_odr void @_ZN2C1C2Eg(%class.C1* %this, fp128 %arg)
+// CHECK-X86-DAG: define linkonce_odr fp128 @_ZN2C16func2cEg(fp128 %arg)
+// CHECK-X86-DAG: define linkonce_odr fp128 @_Z6func1tIgET_S0_(fp128 %arg)
+// CHECK-X86-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { fp128 0xL00000000000000004006080000000000 }
+// CHECK-X86-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-X86-DAG: store fp128 0xL00000000000000008000000000000000, fp128* %f2l, align 16
+// CHECK-X86-DAG: store fp128 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF, fp128* %f3l, align 16
+// CHECK-X86-DAG: store fp128 0xL0000000000000000BFFF000000000000, fp128* %f5l, align 16
+// CHECK-X86-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-X86-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL00000000000000003FFF000000000000
+// CHECK-X86-DAG: store fp128 [[INC]], fp128* %f4l
+
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL00000000000000000000000000000000
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL00000000000000004004080000000000
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL33333333333333333FFF333333333333, fp128 0xL00000000000000004000800000000000, fp128 0xL00000000000000004025176592E00000]
+// CHECK-SYSZ-DAG: define internal void @_ZN12_GLOBAL__N_16func1nERKU10__float128(fp128*
+// CHECK-SYSZ-DAG: @f1f = global fp128 0xL00000000000000000000000000000000
+// CHECK-SYSZ-DAG: @f2f = global fp128 0xL33333333333333334004033333333333
+// CHECK-SYSZ-DAG: @arr1f = global [10 x fp128]
+// CHECK-SYSZ-DAG: @arr2f = global [3 x fp128] [fp128 0xL3333333333333333BFFF333333333333, fp128 0xL0000000000000000C000800000000000, fp128 0xL0000000000000000C025176592E00000]
+// CHECK-SYSZ-DAG: declare void @_Z6func1fU10__float128(fp128*
+// CHECK-SYSZ-DAG: define linkonce_odr void @_ZN2C1C2EU10__float128(%class.C1* %this, fp128*
+// CHECK-SYSZ-DAG: define linkonce_odr void @_ZN2C16func2cEU10__float128(fp128*
+// CHECK-SYSZ-DAG: define linkonce_odr void @_Z6func1tIU10__float128ET_S0_(fp128*
+// CHECK-SYSZ-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { fp128 0xL00000000000000004006080000000000 }
+// CHECK-SYSZ-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-SYSZ-DAG: store fp128 0xL00000000000000008000000000000000, fp128* %f2l, align 16
+// CHECK-SYSZ-DAG: store fp128 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF, fp128* %f3l, align 16
+// CHECK-SYSZ-DAG: store fp128 0xL0000000000000000BFFF000000000000, fp128* %f5l, align 16
+// CHECK-SYSZ-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-SYSZ-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL00000000000000003FFF000000000000
+// CHECK-SYSZ-DAG: store fp128 [[INC]], fp128* %f4l
diff --git a/test/CodeGenCXX/goto.cpp b/test/CodeGenCXX/goto.cpp
index c1a0eec..27bd7af 100644
--- a/test/CodeGenCXX/goto.cpp
+++ b/test/CodeGenCXX/goto.cpp
@@ -18,7 +18,7 @@
     // CHECK-NEXT: [[CLEANUPACTIVE:%.*]] = alloca i1
     // CHECK:      call void @_ZN5test01AC1Ev([[A]]* [[Y]])
     // CHECK-NEXT: invoke void @_ZN5test01AC1Ev([[A]]* [[Z]])
-    // CHECK:      [[NEW:%.*]] = invoke noalias i8* @_Znwm(i64 1)
+    // CHECK:      [[NEW:%.*]] = invoke i8* @_Znwm(i64 1)
     // CHECK:      store i1 true, i1* [[CLEANUPACTIVE]]
     // CHECK:      [[NEWCAST:%.*]] = bitcast i8* [[NEW]] to [[V]]*
     // CHECK-NEXT: invoke void @_ZN5test01AC1Ev([[A]]* [[TMP]])
diff --git a/test/CodeGenCXX/inheriting-constructor.cpp b/test/CodeGenCXX/inheriting-constructor.cpp
index 42080a2..a3adf70 100644
--- a/test/CodeGenCXX/inheriting-constructor.cpp
+++ b/test/CodeGenCXX/inheriting-constructor.cpp
@@ -1,4 +1,8 @@
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-darwin10 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple i386-linux -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=ITANIUM
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-darwin -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=ITANIUM
+// RUN: %clang_cc1 -std=c++11 -triple arm64-ehabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=ITANIUM
+// RUN: %clang_cc1 -std=c++11 -triple i386-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=MSABI --check-prefix=WIN32
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=MSABI --check-prefix=WIN64
 
 // PR12219
 struct A { A(int); virtual ~A(); };
@@ -11,18 +15,396 @@
 struct D : C { using C::C; };
 D d(123);
 
-// CHECK-LABEL: define void @_ZN1BD2Ev
-// CHECK-LABEL: define void @_ZN1BD1Ev
-// CHECK-LABEL: define void @_ZN1BD0Ev
+// ITANIUM-LABEL: define void @_ZN1BD2Ev
+// ITANIUM-LABEL: define void @_ZN1BD1Ev
+// ITANIUM-LABEL: define void @_ZN1BD0Ev
+// WIN32-LABEL: define {{.*}}void @"\01??1B@@UAE@XZ"
+// WIN64-LABEL: define {{.*}}void @"\01??1B@@UEAA@XZ"
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1BC1Ei(
-// CHECK: call void @_ZN1BC2Ei(
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1BCI11AEi(
+// ITANIUM: call void @_ZN1BCI21AEi(
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1DC1IiEET_(
-// CHECK: call void @_ZN1DC2IiEET_(
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1DCI11CIiEET_(
+// ITANIUM: call void @_ZN1DCI21CIiEET_(
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ei(
-// CHECK: call void @_ZN1AC2Ei(
+// WIN32-LABEL: define internal {{.*}} @"\01??0B@@QAE@H@Z"(
+// WIN32: call {{.*}} @"\01??0A@@QAE@H@Z"(
+// WIN64-LABEL: define internal {{.*}} @"\01??0B@@QEAA@H@Z"(
+// WIN64: call {{.*}} @"\01??0A@@QEAA@H@Z"(
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1DC2IiEET_(
-// CHECK: call void @_ZN1CC2IiEET_(
+// WIN32-LABEL: define internal {{.*}} @"\01??0D@@QAE@H@Z"(
+// WIN32: call {{.*}} @"\01??$?0H@C@@QAE@H@Z"
+// WIN64-LABEL: define internal {{.*}} @"\01??0D@@QEAA@H@Z"(
+// WIN64: call {{.*}} @"\01??$?0H@C@@QEAA@H@Z"
+
+struct Q { Q(int); Q(const Q&); ~Q(); };
+struct Z { Z(); Z(int); ~Z(); int n; };
+
+namespace noninline_nonvirt {
+  struct A { A(int, Q&&, void *__attribute__((pass_object_size(0)))); int n; };
+  struct B : Z, A { Z z; using A::A; };
+  B b(1, 2, &b);
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}} %[[TMP:.*]], i32 2)
+  // ITANIUM: call void @_ZN17noninline_nonvirt1BCI1NS_1AEEiO1QPvU17pass_object_size0({{.*}} @_ZN17noninline_nonvirt1bE, i32 1, {{.*}} %[[TMP]], i8* {{.*}} @_ZN17noninline_nonvirt1bE{{.*}}, i{{32|64}} 12)
+  // ITANIUM: call void @_ZN1QD1Ev({{.*}} %[[TMP]])
+  // ITANIUM: call i32 @__cxa_atexit(
+
+  // Complete object ctor for B delegates to base object ctor.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1BCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN17noninline_nonvirt1BCI2NS_1AEEiO1QPvU17pass_object_size0({{.*}}, i32 {{.*}}, %{{.*}}* {{.*}}, i8* {{.*}}, i{{32|64}} {{.*}})
+
+  // In MSABI, we don't have ctor variants. B ctor forwards to A ctor.
+  // MSABI-LABEL: define internal {{.*}} @"\01??0B@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+  // MSABI: call {{.*}} @"\01??0A@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+
+  struct C : B { using B::B; };
+  C c(1, 2, &c);
+  // Complete object ctor for C delegates.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1CCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN17noninline_nonvirt1CCI2NS_1AEEiO1QPvU17pass_object_size0({{.*}}, i32 {{.*}}, %{{.*}}* {{.*}}, i8* {{.*}}, i{{32|64}} {{.*}})
+
+  // MSABI-LABEL: define internal {{.*}} @"\01??0C@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: call {{.*}} @"\01??0B@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+}
+
+namespace noninline_virt {
+  struct A { A(int, Q&&, void *__attribute__((pass_object_size(0)))); int n; };
+  struct B : Z, virtual A { Z z; using A::A; };
+  B b(1, 2, &b);
+  // Complete object ctor forwards to A ctor then constructs Zs.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN14noninline_virt1BCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN14noninline_virt1AC2EiO1QPvU17pass_object_size0({{.*}} %{{.*}}, i32 %{{.*}}, %{{.*}}* {{.*}}, i8* {{.*}}, i{{32|64}} %{{.*}}
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: store {{.*}} @_ZTVN14noninline_virt1BE
+  // ITANIUM: call void @_ZN1ZC1Ev(
+
+  // MSABI-LABEL: define internal {{.*}} @"\01??0B@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}}, i32 %{{.*}})
+  // MSABI: %[[COMPLETE:.*]] = icmp ne
+  // MSABI: br i1 %[[COMPLETE]],
+  // MSABI: call {{.*}} @"\01??0A@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: br
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+
+  struct C : B { using B::B; };
+  C c(1, 2, &c);
+  // Complete object ctor forwards to A ctor, then calls B's base inheriting
+  // constructor, which takes no arguments other than the this pointer and VTT.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN14noninline_virt1CCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN14noninline_virt1AC2EiO1QPvU17pass_object_size0({{.*}} %{{.*}}, i32 %{{.*}}, %{{.*}}* {{.*}}, i8* %{{.*}}, i{{32|64}} %{{.*}})
+  // ITANIUM: call void @_ZN14noninline_virt1BCI2NS_1AEEiO1QPvU17pass_object_size0(%{{.*}}* %{{.*}}, i8** getelementptr inbounds ([2 x i8*], [2 x i8*]* @_ZTTN14noninline_virt1CE, i64 0, i64 1))
+  // ITANIUM: store {{.*}} @_ZTVN14noninline_virt1CE
+
+  // C constructor forwards to B constructor and A constructor. We pass the args
+  // to both. FIXME: Can we pass undef here instead, for the base object
+  // constructor call?
+  // MSABI-LABEL: define internal {{.*}} @"\01??0C@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}}, i32 %{{.*}})
+  // MSABI: %[[COMPLETE:.*]] = icmp ne
+  // MSABI: br i1 %[[COMPLETE]],
+  // MSABI: call {{.*}} @"\01??0A@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: br
+  // MSABI: call {{.*}} @"\01??0B@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}}, i32 0)
+}
+
+// For MSABI only, check that inalloca arguments result in inlining.
+namespace inalloca_nonvirt {
+  struct A { A(Q, int, Q, Q&&); int n; };
+  struct B : Z, A { Z z; using A::A; };
+  B b(1, 2, 3, 4);
+  // No inlining implied for Itanium.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN16inalloca_nonvirt1BCI1NS_1AEE1QiS1_OS1_(
+  // ITANIUM: call void @_ZN16inalloca_nonvirt1BCI2NS_1AEE1QiS1_OS1_(
+
+  // MSABI-LABEL: define internal void @"\01??__Eb@inalloca_nonvirt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_nonvirt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4);
+  // MSABI-LABEL: define internal void @"\01??__Ec@inalloca_nonvirt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_nonvirt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+}
+
+namespace inalloca_virt {
+  struct A { A(Q, int, Q, Q&&); int n; };
+  struct B : Z, virtual A { Z z; using A::A; };
+  B b(1, 2, 3, 4);
+
+  // MSABI-LABEL: define internal void @"\01??__Eb@inalloca_virt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // FIXME: It's dumb to round-trip this through memory and generate a branch.
+  // WIN32: store i32 1, i32* %[[IS_MOST_DERIVED_ADDR:.*]]
+  // WIN32: %[[IS_MOST_DERIVED:.*]] = load i32, i32* %[[IS_MOST_DERIVED_ADDR]]
+  // WIN32: %[[IS_MOST_DERIVED_i1:.*]] = icmp ne i32 %[[IS_MOST_DERIVED]], 0
+  // WIN32: br i1 %[[IS_MOST_DERIVED_i1]]
+  //
+  // WIN32: store {{.*}} @"\01??_8B@inalloca_virt@@7B@"
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_virt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: br
+  //
+  // Note that if we jumped directly to here we would fail to stackrestore and
+  // destroy the parameters, but that's not actually possible.
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: br i1
+  // WIN64: call {{.*}} @"\01??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: br
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4);
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN13inalloca_virt1CD1Ev(
+
+  // MSABI-LABEL: define internal void @"\01??__Ec@inalloca_virt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN32: store i32 1, i32* %[[IS_MOST_DERIVED_ADDR:.*]]
+  // WIN32: %[[IS_MOST_DERIVED:.*]] = load i32, i32* %[[IS_MOST_DERIVED_ADDR]]
+  // WIN32: %[[IS_MOST_DERIVED_i1:.*]] = icmp ne i32 %[[IS_MOST_DERIVED]], 0
+  // WIN32: br i1 %[[IS_MOST_DERIVED_i1]]
+  //
+  // WIN32: store {{.*}} @"\01??_8C@inalloca_virt@@7B@"
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_virt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: br
+  //
+  // WIN32: store i32 0, i32* %[[IS_MOST_DERIVED_ADDR:.*]]
+  // WIN32: %[[IS_MOST_DERIVED:.*]] = load i32, i32* %[[IS_MOST_DERIVED_ADDR]]
+  // WIN32: %[[IS_MOST_DERIVED_i1:.*]] = icmp ne i32 %[[IS_MOST_DERIVED]], 0
+  // WIN32: br i1 %[[IS_MOST_DERIVED_i1]]
+  //
+  // Note: this block is unreachable.
+  // WIN32: store {{.*}} @"\01??_8B@inalloca_virt@@7B@"
+  // WIN32: br
+  //
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: br i1
+  // WIN64: store {{.*}} @"\01??_8C@inalloca_virt@@7B@"
+  // WIN64: call {{.*}} @"\01??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: br
+  // WIN64: br i1
+  // (Unreachable block)
+  // WIN64: store {{.*}} @"\01??_8B@inalloca_virt@@7B@"
+  // WIN64: br
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+}
+
+namespace inline_nonvirt {
+  struct A { A(Q, int, Q, Q&&, ...); int n; };
+  struct B : Z, A { Z z; using A::A; };
+  B b(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor.
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[Z_BASE:.*]] = bitcast %{{.*}}* %[[THIS:.*]] to
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} 4
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN14inline_nonvirt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: %[[Z_MEMBER:.*]] = getelementptr {{.*}} %[[THIS]], i32 0, i32 2
+  // ITANIUM: call void @_ZN1ZC1Ev({{.*}} %[[Z_MEMBER]])
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor.
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[Z_BASE:.*]] = bitcast %{{.*}}* %[[THIS:.*]] to
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} 4
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN14inline_nonvirt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: %[[Z_MEMBER:.*]] = getelementptr {{.*}} %{{.*}}, i32 0, i32 2
+  // ITANIUM: call void @_ZN1ZC1Ev({{.*}} %[[Z_MEMBER]])
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+}
+
+namespace inline_virt {
+  struct A { A(Q, int, Q, Q&&, ...); int n; };
+  struct B : Z, virtual A { Z z; using A::A; };
+  B b(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor.
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS:.*]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} {{12|16}}
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN11inline_virt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: call void @_ZN1ZC1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor, except that we can just call the
+  // B base inheriting constructor to construct that portion (it doesn't need
+  // the forwarded arguments).
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS:.*]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} {{12|16}}
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN11inline_virt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: call void @_ZN11inline_virt1BCI2NS_1AEE1QiS1_OS1_z({{[^,]*}}, i8** getelementptr inbounds ([2 x i8*], [2 x i8*]* @_ZTTN11inline_virt1CE, i64 0, i64 1))
+  // ITANIUM: store {{.*}} @_ZTVN11inline_virt1CE
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+
+  // B base object inheriting constructor does not get passed arguments.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN11inline_virt1BCI2NS_1AEE1QiS1_OS1_z(
+  // ITANIUM-NOT: call
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM-NOT: call
+  // VTT -> vtable
+  // ITANIUM: store
+  // ITANIUM-NOT: call
+  // ITANIUM: call void @_ZN1ZC1Ev(
+  // ITANIUM-NOT: call
+  // ITANIUM: }
+}
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1BCI21AEi(
+// ITANIUM: call void @_ZN1AC2Ei(
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1DCI21CIiEET_(
+// ITANIUM: call void @_ZN1CC2IiEET_(
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1BCI2NS_1AEEiO1QPvU17pass_object_size0(
+// ITANIUM: call void @_ZN1ZC2Ev(
+// ITANIUM: call void @_ZN17noninline_nonvirt1AC2EiO1QPvU17pass_object_size0(
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1CCI2NS_1AEEiO1QPvU17pass_object_size0(
+// ITANIUM: call void @_ZN17noninline_nonvirt1BCI2NS_1AEEiO1QPvU17pass_object_size0(
diff --git a/test/CodeGenCXX/init-invariant.cpp b/test/CodeGenCXX/init-invariant.cpp
index 7f34825..815287c 100644
--- a/test/CodeGenCXX/init-invariant.cpp
+++ b/test/CodeGenCXX/init-invariant.cpp
@@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -triple i686-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-O0
 // RUN: %clang_cc1 -triple i686-linux-gnu -emit-llvm %s -O1 -o - | FileCheck %s
 
-// Check that we add an llvm.invariant.start to mark when a global becomes
+// Check that we add an llvm.invariant.start.p0i8 to mark when a global becomes
 // read-only. If globalopt can fold the initializer, it will then mark the
 // variable as constant.
 
 // Do not produce markers at -O0.
-// CHECK-O0-NOT: llvm.invariant.start
+// CHECK-O0-NOT: llvm.invariant.start.p0i8
 
 struct A {
   A();
@@ -42,19 +42,19 @@
 }
 
 // CHECK: call void @_ZN1AC1Ev({{.*}}* nonnull @a)
-// CHECK: call {{.*}}@llvm.invariant.start(i64 4, i8* bitcast ({{.*}} @a to i8*))
+// CHECK: call {{.*}}@llvm.invariant.start.p0i8(i64 4, i8* bitcast ({{.*}} @a to i8*))
 
 // CHECK: call void @_ZN1BC1Ev({{.*}}* nonnull @b)
-// CHECK-NOT: call {{.*}}@llvm.invariant.start(i64 4, i8* bitcast ({{.*}} @b to i8*))
+// CHECK-NOT: call {{.*}}@llvm.invariant.start.p0i8(i64 4, i8* bitcast ({{.*}} @b to i8*))
 
 // CHECK: call void @_ZN1CC1Ev({{.*}}* nonnull @c)
-// CHECK-NOT: call {{.*}}@llvm.invariant.start(i64 4, i8* bitcast ({{.*}} @c to i8*))
+// CHECK-NOT: call {{.*}}@llvm.invariant.start.p0i8(i64 4, i8* bitcast ({{.*}} @c to i8*))
 
 // CHECK: call i32 @_Z1fv(
 // CHECK: store {{.*}}, i32* @d
-// CHECK: call {{.*}}@llvm.invariant.start(i64 4, i8* bitcast ({{.*}} @d to i8*))
+// CHECK: call {{.*}}@llvm.invariant.start.p0i8(i64 4, i8* bitcast ({{.*}} @d to i8*))
 
 // CHECK-LABEL: define void @_Z1ev(
 // CHECK: call void @_ZN1AC1Ev(%struct.A* nonnull @_ZZ1evE1a)
-// CHECK: call {{.*}}@llvm.invariant.start(i64 4, i8* nonnull bitcast ({{.*}} @_ZZ1evE1a to i8*))
+// CHECK: call {{.*}}@llvm.invariant.start.p0i8(i64 4, i8* {{.*}}bitcast ({{.*}} @_ZZ1evE1a to i8*))
 // CHECK-NOT: llvm.invariant.end
diff --git a/test/CodeGenCXX/inline-hint.cpp b/test/CodeGenCXX/inline-hint.cpp
new file mode 100644
index 0000000..9c14032
--- /dev/null
+++ b/test/CodeGenCXX/inline-hint.cpp
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 %s -std=c++11 -triple=x86_64-linux -finline-functions -emit-llvm -disable-llvm-optzns -o - | FileCheck %s --check-prefix=CHECK --check-prefix=SUITABLE
+// RUN: %clang_cc1 %s -std=c++11 -triple=x86_64-linux -finline-hint-functions -emit-llvm -disable-llvm-optzns -o - | FileCheck %s --check-prefix=CHECK --check-prefix=HINTED
+// RUN: %clang_cc1 %s -std=c++11 -triple=x86_64-linux -fno-inline -emit-llvm -disable-llvm-optzns -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NOINLINE
+
+// Force non-trivial implicit constructors/destructors/operators for B by having explicit ones for A
+struct A {
+  A() {}
+  A(const A&) {}
+  A& operator=(const A&) { return *this; }
+  ~A() {}
+};
+
+struct B {
+  A member;
+  int implicitFunction(int a) { return a + a; }
+  inline int explicitFunction(int a);
+  int noHintFunction(int a);
+  __attribute__((optnone)) int optNoneFunction(int a) { return a + a; }
+  template<int N> int implicitTplFunction(int a) { return N + a; }
+  template<int N> inline int explicitTplFunction(int a) { return N + a; }
+  template<int N> int noHintTplFunction(int a);
+  template<int N> int explicitRedeclTplFunction(int a);
+};
+
+int B::explicitFunction(int a) { return a + a; }
+// CHECK: @_ZN1B14noHintFunctionEi({{.*}}) [[NOHINT_ATTR:#[0-9]+]]
+int B::noHintFunction(int a) { return a + a; }
+
+// CHECK: @_ZN1B19implicitTplFunctionILi0EEEii({{.*}}) [[NOHINT_ATTR]]
+template<> int B::implicitTplFunction<0>(int a) { return a + a; }
+// CHECK: @_ZN1B19explicitTplFunctionILi0EEEii({{.*}}) [[NOHINT_ATTR]]
+template<> int B::explicitTplFunction<0>(int a) { return a + a; }
+// CHECK: @_ZN1B17noHintTplFunctionILi0EEEii({{.*}}) [[NOHINT_ATTR]]
+template<> int B::noHintTplFunction<0>(int a) { return a + a; }
+template<> inline int B::implicitTplFunction<1>(int a) { return a; }
+template<> inline int B::explicitTplFunction<1>(int a) { return a; }
+template<> inline int B::noHintTplFunction<1>(int a) { return a; }
+template<int N> int B::noHintTplFunction(int a) { return N + a; }
+template<int N> inline int B::explicitRedeclTplFunction(int a) { return N + a; }
+
+constexpr int constexprFunction(int a) { return a + a; }
+
+void foo()
+{
+// CHECK: @_ZN1BC1Ev({{.*}}) unnamed_addr [[IMPLICIT_CONSTR_ATTR:#[0-9]+]]
+  B b1;
+// CHECK: @_ZN1BC1ERKS_({{.*}}) unnamed_addr [[IMPLICIT_CONSTR_ATTR]]
+  B b2(b1);
+// CHECK: @_ZN1BaSERKS_({{.*}}) [[IMPLICIT_CONSTR_ATTR]]
+  b2 = b1;
+// CHECK: @_ZN1B16implicitFunctionEi({{.*}}) [[IMPLICIT_ATTR:#[0-9]+]]
+  b1.implicitFunction(1);
+// CHECK: @_ZN1B16explicitFunctionEi({{.*}}) [[EXPLICIT_ATTR:#[0-9]+]]
+  b1.explicitFunction(2);
+  b1.noHintFunction(3);
+// CHECK: @_ZN1B15optNoneFunctionEi({{.*}}) [[OPTNONE_ATTR:#[0-9]+]]
+  b1.optNoneFunction(4);
+// CHECK: @_Z17constexprFunctioni({{.*}}) [[IMPLICIT_ATTR]]
+  constexprFunction(5);
+  b1.implicitTplFunction<0>(6);
+// CHECK: @_ZN1B19implicitTplFunctionILi1EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.implicitTplFunction<1>(7);
+// CHECK: @_ZN1B19implicitTplFunctionILi2EEEii({{.*}}) [[IMPLICIT_ATTR]]
+  b1.implicitTplFunction<2>(8);
+  b1.explicitTplFunction<0>(9);
+// CHECK: @_ZN1B19explicitTplFunctionILi1EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.explicitTplFunction<1>(10);
+// CHECK: @_ZN1B19explicitTplFunctionILi2EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.explicitTplFunction<2>(11);
+  b1.noHintTplFunction<0>(12);
+// CHECK: @_ZN1B17noHintTplFunctionILi1EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.noHintTplFunction<1>(13);
+// CHECK: @_ZN1B17noHintTplFunctionILi2EEEii({{.*}}) [[NOHINT_ATTR]]
+  b1.noHintTplFunction<2>(14);
+// CHECK: @_ZN1B25explicitRedeclTplFunctionILi2EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.explicitRedeclTplFunction<2>(15);
+// CHECK: @_ZN1BD2Ev({{.*}}) unnamed_addr [[IMPLICIT_CONSTR_ATTR]]
+}
+
+// SUITABLE-NOT: attributes [[NOHINT_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-DAG: attributes [[NOHINT_ATTR]] = { noinline{{.*}} }
+// NOINLINE-DAG: attributes [[NOHINT_ATTR]] = { noinline{{.*}} }
+
+// SUITABLE-NOT: attributes [[IMPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-NOT: attributes [[IMPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+// NOINLINE-DAG: attributes [[IMPLICIT_ATTR]] = { noinline{{.*}} }
+
+// SUITABLE-NOT: attributes [[IMPLICIT_CONSTR_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-NOT: attributes [[IMPLICIT_CONSTR_ATTR]] = { {{.*}}noinline{{.*}} }
+// NOINLINE-DAG: attributes [[IMPLICIT_CONSTR_ATTR]] = { noinline{{.*}} }
+
+// SUITABLE-NOT: attributes [[EXPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-NOT: attributes [[EXPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+// NOINLINE-DAG: attributes [[EXPLICIT_ATTR]] = { noinline{{.*}} }
+
+// CHECK-DAG: attributes [[OPTNONE_ATTR]] = { noinline{{.*}} }
diff --git a/test/CodeGenCXX/lambda-expressions.cpp b/test/CodeGenCXX/lambda-expressions.cpp
index f59d360..2006f0f 100644
--- a/test/CodeGenCXX/lambda-expressions.cpp
+++ b/test/CodeGenCXX/lambda-expressions.cpp
@@ -116,6 +116,56 @@
   return [&] { return &y; }();
 }
 
+namespace pr28595 {
+  struct Temp {
+    Temp();
+    ~Temp() noexcept(false);
+  };
+  struct A {
+    A();
+    A(const A &a, const Temp &temp = Temp());
+    ~A();
+  };
+
+  // CHECK-LABEL: define void @_ZN7pr285954testEv()
+  void test() {
+    // CHECK: [[ARRAY:%.*]] = alloca [3 x [5 x [[A:%.*]]]], align 1
+    // CHECK: [[DESTIDX:%.*]] = alloca i64, align 8
+    // CHECK: [[I0:%.*]] = alloca i64, align 8
+    // CHECK: [[I1:%.*]] = alloca i64, align 8
+    A array[3][5];
+
+    // CHECK: [[DESTBASE:%.*]] = bitcast [3 x [5 x [[A]]]]* {{.*}} to [[A]]*
+    // CHECK: store i64 0, i64* [[DESTIDX]], align 8
+    // CHECK: store i64 0, i64* [[I0]], align 8
+    // CHECK: br label
+    // CHECK: icmp ult
+    // CHECK: store i64 0, i64* [[I1]], align 8
+    // CHECK: br label
+    // CHECK: icmp ult
+    // CHECK: [[T0:%.*]] = load i64, i64* [[DESTIDX]], align 8
+    // CHECK: [[DEST:%.*]] = getelementptr inbounds [[A]], [[A]]* [[DESTBASE]], i64 [[T0]]
+    // CHECK: invoke void @_ZN7pr285954TempC1Ev
+    // CHECK: invoke void @_ZN7pr285951AC1ERKS0_RKNS_4TempE
+    // CHECK: invoke void @_ZN7pr285954TempD1Ev
+    // CHECK: landingpad
+    // CHECK: landingpad
+    // CHECK: br label [[CLEANUP:%.*]]{{$}}
+    // CHECK: landingpad
+    // CHECK: invoke void @_ZN7pr285954TempD1Ev
+    // CHECK: br label [[CLEANUP]]
+    // CHECK: icmp eq [[A]]* [[DESTBASE]], [[DEST]]
+    // CHECK: [[T0:%.*]] = phi [[A]]*
+    // CHECK: [[T1:%.*]] = getelementptr inbounds [[A]], [[A]]* [[T0]], i64 -1
+    // CHECK: call void @_ZN7pr285951AD1Ev([[A]]* [[T1]])
+    // CHECK: icmp eq [[A]]* [[T1]], [[DESTBASE]]
+    (void) [array]{};
+
+    //   Skip over the initialization loop.
+    // CHECK: [[BEGIN:%.*]] = getelementptr inbounds [3 x [5 x [[A]]]], [3 x [5 x [[A]]]]* [[ARRAY]], i32 0, i32 0, i32 0
+  }
+}
+
 // CHECK-LABEL: define internal void @"_ZZ1e1ES_bEN3$_5D2Ev"
 
 // CHECK-LABEL: define internal i32 @"_ZZ1fvEN3$_68__invokeEii"
@@ -126,9 +176,9 @@
 // CHECK-NEXT: call i32 @"_ZZ1fvENK3$_6clEii"
 // CHECK-NEXT: ret i32
 
-// CHECK-LABEL: define internal void @"_ZZ1hvEN4$_108__invokeEv"(%struct.A* noalias sret %agg.result) {{.*}} {
+// CHECK-LABEL: define internal void @"_ZZ1hvEN4$_118__invokeEv"(%struct.A* noalias sret %agg.result) {{.*}} {
 // CHECK-NOT: =
-// CHECK: call void @"_ZZ1hvENK4$_10clEv"(%struct.A* sret %agg.result,
+// CHECK: call void @"_ZZ1hvENK4$_11clEv"(%struct.A* sret %agg.result,
 // CHECK-NEXT: ret void
 struct A { ~A(); };
 void h() {
diff --git a/test/CodeGenCXX/lto-visibility-inference.cpp b/test/CodeGenCXX/lto-visibility-inference.cpp
new file mode 100644
index 0000000..8e57ef5
--- /dev/null
+++ b/test/CodeGenCXX/lto-visibility-inference.cpp
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -std=c++11 -fms-extensions -fvisibility hidden -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -std=c++11 -fms-extensions -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=MS --check-prefix=MS-STD %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -std=c++11 -fms-extensions -fwhole-program-vtables -flto-visibility-public-std -emit-llvm -o - %s | FileCheck --check-prefix=MS --check-prefix=MS-NOSTD %s
+
+struct C1 {
+  virtual void f();
+};
+
+struct __attribute__((visibility("default"))) C2 {
+  virtual void f();
+};
+
+struct __declspec(dllexport) C3 {
+  virtual void f();
+};
+
+struct __declspec(dllimport) C4 {
+  virtual void f();
+};
+
+struct [[clang::lto_visibility_public]] C5 {
+  virtual void f();
+};
+
+struct __declspec(uuid("00000000-0000-0000-0000-000000000000")) C6 {
+  virtual void f();
+};
+
+namespace std {
+
+struct C7 {
+  virtual void f();
+  struct C8 {
+    virtual void f();
+  };
+};
+
+}
+
+extern "C++" {
+
+namespace stdext {
+
+struct C9 {
+  virtual void f();
+};
+
+}
+
+}
+
+namespace other {
+
+struct C10 {
+  virtual void f();
+};
+
+}
+
+namespace {
+
+struct C11 {
+  virtual void f();
+};
+
+}
+
+void f(C1 *c1, C2 *c2, C3 *c3, C4 *c4, C5 *c5, C6 *c6, std::C7 *c7,
+       std::C7::C8 *c8, stdext::C9 *c9, other::C10 *c10) {
+  // ITANIUM: type.test{{.*}}!"_ZTS2C1"
+  // MS: type.test{{.*}}!"?AUC1@@"
+  c1->f();
+  // ITANIUM-NOT: type.test{{.*}}!"_ZTS2C2"
+  // MS: type.test{{.*}}!"?AUC2@@"
+  c2->f();
+  // ITANIUM: type.test{{.*}}!"_ZTS2C3"
+  // MS-NOT: type.test{{.*}}!"?AUC3@@"
+  c3->f();
+  // ITANIUM: type.test{{.*}}!"_ZTS2C4"
+  // MS-NOT: type.test{{.*}}!"?AUC4@@"
+  c4->f();
+  // ITANIUM-NOT: type.test{{.*}}!"_ZTS2C5"
+  // MS-NOT: type.test{{.*}}!"?AUC5@@"
+  c5->f();
+  // ITANIUM-NOT: type.test{{.*}}!"_ZTS2C6"
+  // MS-NOT: type.test{{.*}}!"?AUC6@@"
+  c6->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSSt2C7"
+  // MS-STD: type.test{{.*}}!"?AUC7@std@@"
+  // MS-NOSTD-NOT: type.test{{.*}}!"?AUC7@std@@"
+  c7->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSNSt2C72C8E"
+  // MS-STD: type.test{{.*}}!"?AUC8@C7@std@@"
+  // MS-NOSTD-NOT: type.test{{.*}}!"?AUC8@C7@std@@"
+  c8->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSN6stdext2C9E"
+  // MS-STD: type.test{{.*}}!"?AUC9@stdext@@"
+  // MS-NOSTD-NOT: type.test{{.*}}!"?AUC9@stdext@@"
+  c9->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSN5other3C10E"
+  // MS: type.test{{.*}}!"?AUC10@other@@"
+  c10->f();
+  // ITANIUM: type.test{{.*}}!{{[0-9]}}
+  // MS: type.test{{.*}}!{{[0-9]}}
+  C11 *c11;
+  c11->f();
+}
diff --git a/test/CodeGenCXX/mangle-abi-tag.cpp b/test/CodeGenCXX/mangle-abi-tag.cpp
new file mode 100644
index 0000000..385a16f
--- /dev/null
+++ b/test/CodeGenCXX/mangle-abi-tag.cpp
@@ -0,0 +1,205 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple %itanium_abi_triple -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple i686-linux-gnu -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-linux-gnu -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple powerpc64le-unknown-linux-gnu -std=c++11 -o - | FileCheck %s
+
+struct __attribute__((abi_tag("A", "B"))) A { };
+
+struct B: A { };
+
+template<class T>
+
+struct C {
+};
+
+struct D { A* p; };
+
+template<class T>
+struct __attribute__((abi_tag("C", "D"))) E {
+};
+
+struct __attribute__((abi_tag("A", "B"))) F { };
+
+A a1;
+// CHECK-DAG: @_Z2a1B1AB1B =
+
+__attribute__((abi_tag("C", "D")))
+A a2;
+// CHECK-DAG: @_Z2a2B1AB1BB1CB1D =
+
+B a3;
+// CHECK-DAG: @a3 =
+
+C<A> a4;
+// CHECK-DAG: @_Z2a4B1AB1B =
+
+D a5;
+// CHECK-DAG: @a5 =
+
+E<int> a6;
+// CHECK-DAG: @_Z2a6B1CB1D =
+
+E<A> a7;
+// CHECK-DAG: @_Z2a7B1AB1BB1CB1D =
+
+template<>
+struct E<float> {
+  static float a8;
+};
+float E<float>::a8;
+// CHECK-DAG: @_ZN1EB1CB1DIfE2a8E =
+
+template<>
+struct E<F> {
+  static bool a9;
+};
+bool E<F>::a9;
+// CHECK-DAG: @_ZN1EB1CB1DI1FB1AB1BE2a9E =
+
+struct __attribute__((abi_tag("A", "B"))) A10 {
+  virtual ~A10() {}
+} a10;
+// vtable
+// CHECK-DAG: @_ZTV3A10B1AB1B =
+// typeinfo
+// CHECK-DAG: @_ZTI3A10B1AB1B =
+
+struct __attribute__((abi_tag("A"))) B11 {
+  static A10 b;
+};
+A10 B11::b;
+// B11[abi:A]::b[abi:B]
+// CHECK-DAG: @_ZN3B11B1A1bB1BE =
+
+__attribute__ ((abi_tag("C", "D")))
+void* f1() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f1B1CB1Dv(
+
+__attribute__ ((abi_tag("C", "D")))
+A* f2() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f2B1AB1BB1CB1Dv(
+
+B* f3() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f3v(
+
+C<A>* f4() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f4B1AB1Bv(
+
+D* f5() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f5v(
+
+E<char>* f6() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f6B1CB1Dv(
+
+E<A>* f7() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f7B1AB1BB1CB1Dv(
+
+void f8(E<A>*) {
+}
+// CHECK-DAG: define {{.*}} @_Z2f8P1EB1CB1DI1AB1AB1BE(
+
+inline namespace Names1 __attribute__((__abi_tag__)) {
+    class C1 {};
+}
+C1 f9() { return C1(); }
+// CHECK-DAG: @_Z2f9B6Names1v(
+
+inline namespace Names2 __attribute__((__abi_tag__("Tag1", "Tag2"))) {
+    class C2 {};
+}
+C2 f10() { return C2(); }
+// CHECK-DAG: @_Z3f10B4Tag1B4Tag2v(
+
+void __attribute__((abi_tag("A"))) f11(A) {}
+// f11[abi:A](A[abi:A][abi:B])
+// CHECK-DAG: define {{.*}} @_Z3f11B1A1AB1AB1B(
+
+A f12(A) { return A(); }
+// f12(A[abi:A][abi:B])
+// CHECK-DAG: define {{.*}} @_Z3f121AB1AB1B(
+
+inline void f13() {
+  struct L {
+    static E<int>* foo() {
+      static A10 a;
+      return 0;
+    }
+  };
+  L::foo();
+}
+void f13_test() {
+  f13();
+}
+// f13()::L::foo[abi:C][abi:D]()
+// CHECK-DAG: define linkonce_odr %struct.E* @_ZZ3f13vEN1L3fooB1CB1DEv(
+
+// f13()::L::foo[abi:C][abi:D]()::a[abi:A][abi:B]
+// CHECK-DAG: @_ZZZ3f13vEN1L3fooB1CB1DEvE1aB1AB1B =
+
+// guard variable for f13()::L::foo[abi:C][abi:D]()::a[abi:A][abi:B]
+// CHECK-DAG: @_ZGVZZ3f13vEN1L3fooB1CB1DEvE1aB1AB1B =
+
+struct __attribute__((abi_tag("TAG"))) A14 {
+  A14 f14();
+};
+A14 A14::f14() {
+  return A14();
+}
+// A14[abi:TAG]::f14()
+// CHECK-DAG: define {{.+}} @_ZN3A14B3TAG3f14Ev(
+
+template<class T>
+T f15() {
+  return T();
+}
+void f15_test() {
+  f15<A14>();
+}
+// A14[abi:TAG] f15<A14[abi:TAG]>()
+// CHECK-DAG: define linkonce_odr {{.+}} @_Z3f15I3A14B3TAGET_v(
+
+template<class T>
+A14 f16() {
+  return A14();
+}
+void f16_test() {
+  f16<int>();
+}
+// A14[abi:TAG] f16<int>()
+// CHECK-DAG: define linkonce_odr {{.+}} @_Z3f16IiE3A14B3TAGv(
+
+template<class T>
+struct __attribute__((abi_tag("TAG"))) A17 {
+  A17 operator+(const A17& a) {
+    return a;
+  }
+};
+void f17_test() {
+  A17<int> a, b;
+  a + b;
+}
+// A17[abi:TAG]<int>::operator+(A17[abi:TAG]<int> const&)
+// CHECK-DAG: define linkonce_odr {{.+}} @_ZN3A17B3TAGIiEplERKS0_(
+
+struct A18 {
+  operator A() { return A(); }
+};
+void f18_test() {
+  A a = A18();
+}
+// A18::operator A[abi:A][abi:B]() but GCC adds the same tags twice!
+// CHECK-DAG: define linkonce_odr {{.+}} @_ZN3A18cv1AB1AB1BEv(
diff --git a/test/CodeGenCXX/mangle-ms-cxx11.cpp b/test/CodeGenCXX/mangle-ms-cxx11.cpp
index 9938444..8e2577b 100644
--- a/test/CodeGenCXX/mangle-ms-cxx11.cpp
+++ b/test/CodeGenCXX/mangle-ms-cxx11.cpp
@@ -293,3 +293,28 @@
 }
 // CHECK-DAG: @"\01??R<lambda_0>@?0??PR26105@@YAHXZ@QBE@H@Z"
 // CHECK-DAG: @"\01??R<lambda_1>@?0???R<lambda_0>@?0??PR26105@@YAHXZ@QBE@H@Z@QBE@H@Z"
+
+int __unaligned * unaligned_foo1() { return 0; }
+int __unaligned * __unaligned * unaligned_foo2() { return 0; }
+__unaligned int unaligned_foo3() { return 0; }
+void unaligned_foo4(int __unaligned *p1) {}
+void unaligned_foo5(int __unaligned * __restrict p1) {}
+template <typename T> T unaligned_foo6(T t) { return t; }
+void unaligned_foo7() { unaligned_foo6<int *>(0); unaligned_foo6<int __unaligned *>(0); }
+
+// CHECK-DAG: @"\01?unaligned_foo1@@YAPFAHXZ"
+// CHECK-DAG: @"\01?unaligned_foo2@@YAPFAPFAHXZ"
+// CHECK-DAG: @"\01?unaligned_foo3@@YAHXZ"
+// CHECK-DAG: @"\01?unaligned_foo4@@YAXPFAH@Z"
+// CHECK-DAG: @"\01?unaligned_foo5@@YAXPIFAH@Z"
+// CHECK-DAG: @"\01??$unaligned_foo6@PAH@@YAPAHPAH@Z"
+// CHECK-DAG: @"\01??$unaligned_foo6@PFAH@@YAPFAHPFAH@Z"
+
+// __unaligned qualifier for function types
+struct unaligned_foo8_S {
+    void unaligned_foo8() volatile __unaligned;
+};
+void unaligned_foo8_S::unaligned_foo8() volatile __unaligned {}
+
+// CHECK-DAG: @"\01?unaligned_foo8@unaligned_foo8_S@@QFCEXXZ"
+
diff --git a/test/CodeGenCXX/mangle-ms-cxx14.cpp b/test/CodeGenCXX/mangle-ms-cxx14.cpp
index 084eb7d..798a390 100644
--- a/test/CodeGenCXX/mangle-ms-cxx14.cpp
+++ b/test/CodeGenCXX/mangle-ms-cxx14.cpp
@@ -55,3 +55,8 @@
 
 Foo<&x<int>, &x<int>> Zoo;
 // CHECK-DAG: "\01?Zoo@@3U?$Foo@$1??$x@H@@3HA$1?1@3HA@@A"
+
+template <typename T> T unaligned_x;
+extern auto test_unaligned() { return unaligned_x<int __unaligned *>; }
+// CHECK-DAG: "\01??$unaligned_x@PFAH@@3PFAHA"
+
diff --git a/test/CodeGenCXX/mangle-ms-md5.cpp b/test/CodeGenCXX/mangle-ms-md5.cpp
new file mode 100644
index 0000000..aef2683
--- /dev/null
+++ b/test/CodeGenCXX/mangle-ms-md5.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -emit-llvm -o - -triple i686-pc-win32 %s | FileCheck %s
+int xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx;
+// CHECK-DAG: @"\01??@bf7ea7b95f260b0b24e7f1e8fc8370ab@" = global i32 0, align 4
+
+struct yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy {
+  yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy();
+  virtual void f();
+};
+yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy::yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy() {}
+// CHECK-DAG: @"\01??@a6a285da2eea70dba6b578022be61d81@??_R4@" = linkonce_odr constant %rtti.CompleteObjectLocator
+// CHECK-DAG: @"\01??@a6a285da2eea70dba6b578022be61d81@" = unnamed_addr alias
diff --git a/test/CodeGenCXX/mangle-ms.cpp b/test/CodeGenCXX/mangle-ms.cpp
index c82fca4..ee0f50e 100644
--- a/test/CodeGenCXX/mangle-ms.cpp
+++ b/test/CodeGenCXX/mangle-ms.cpp
@@ -4,6 +4,11 @@
 int a;
 // CHECK-DAG: @"\01?a@@3HA"
 
+extern "C++" {
+static int __attribute__((used)) ignore_transparent_context;
+// CHECK-DAG: @ignore_transparent_context
+}
+
 namespace N {
   int b;
 // CHECK-DAG: @"\01?b@N@@3HA"
diff --git a/test/CodeGenCXX/mangle-template.cpp b/test/CodeGenCXX/mangle-template.cpp
index 7fa300a..2313469 100644
--- a/test/CodeGenCXX/mangle-template.cpp
+++ b/test/CodeGenCXX/mangle-template.cpp
@@ -201,3 +201,14 @@
 
   int call(bool b) { return inl<void>(b); }
 }
+
+namespace std {
+template <class _Tp, _Tp...> struct integer_sequence {};
+}
+
+namespace test15 {
+template <int N>
+__make_integer_seq<std::integer_sequence, int, N> make() {}
+template __make_integer_seq<std::integer_sequence, int, 5> make<5>();
+// CHECK: define weak_odr {{.*}} @_ZN6test154makeILi5EEE18__make_integer_seqISt16integer_sequenceiXT_EEv(
+}
diff --git a/test/CodeGenCXX/mangle.cpp b/test/CodeGenCXX/mangle.cpp
index 5012c3b..5d75710 100644
--- a/test/CodeGenCXX/mangle.cpp
+++ b/test/CodeGenCXX/mangle.cpp
@@ -1101,3 +1101,13 @@
   // CHECK-LABEL: @_ZN6test541cC2EPNS0_Ut0_E
 };
 }
+
+namespace test55 {
+enum E { R };
+
+template <typename T>
+void fn(T, __underlying_type(T)) {}
+
+template void fn<E>(E, __underlying_type(E));
+// CHECK-LABEL: @_ZN6test552fnINS_1EEEEvT_U3eutS2_
+}
diff --git a/test/CodeGenCXX/microsoft-abi-array-cookies.cpp b/test/CodeGenCXX/microsoft-abi-array-cookies.cpp
index 75c0621..9ef1879 100644
--- a/test/CodeGenCXX/microsoft-abi-array-cookies.cpp
+++ b/test/CodeGenCXX/microsoft-abi-array-cookies.cpp
@@ -7,7 +7,7 @@
 void check_array_no_cookies() {
 // CHECK: define void @"\01?check_array_no_cookies@@YAXXZ"() [[NUW:#[0-9]+]]
 
-// CHECK: call noalias i8* @"\01??_U@YAPAXI@Z"(i32 42)
+// CHECK: call i8* @"\01??_U@YAPAXI@Z"(i32 42)
   ClassWithoutDtor *array = new ClassWithoutDtor[42];
 
 // CHECK: call void @"\01??_V@YAXPAX@Z"(
@@ -24,7 +24,7 @@
 // CHECK: define {{.*}} @"\01?check_array_cookies_simple@@YAXXZ"()
 
   ClassWithDtor *array = new ClassWithDtor[42];
-// CHECK: [[ALLOCATED:%.*]] = call noalias i8* @"\01??_U@YAPAXI@Z"(i32 46)
+// CHECK: [[ALLOCATED:%.*]] = call i8* @"\01??_U@YAPAXI@Z"(i32 46)
 // 46 = 42 + size of cookie (4)
 // CHECK: [[COOKIE:%.*]] = bitcast i8* [[ALLOCATED]] to i32*
 // CHECK: store i32 42, i32* [[COOKIE]]
@@ -46,7 +46,7 @@
 void check_array_cookies_aligned() {
 // CHECK: define {{.*}} @"\01?check_array_cookies_aligned@@YAXXZ"()
   ClassWithAlignment *array = new ClassWithAlignment[42];
-// CHECK: [[ALLOCATED:%.*]] = call noalias i8* @"\01??_U@YAPAXI@Z"(i32 344)
+// CHECK: [[ALLOCATED:%.*]] = call i8* @"\01??_U@YAPAXI@Z"(i32 344)
 //   344 = 42*8 + size of cookie (8, due to alignment)
 // CHECK: [[COOKIE:%.*]] = bitcast i8* [[ALLOCATED]] to i32*
 // CHECK: store i32 42, i32* [[COOKIE]]
diff --git a/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp b/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp
index da58c46..6da7a50 100644
--- a/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp
+++ b/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp
@@ -2,10 +2,10 @@
 
 // PR15768
 
-// A trivial 12 byte struct is returned indirectly.
+// A trivial 20 byte struct is returned indirectly and taken as byval.
 struct S {
   S();
-  int a, b, c;
+  int a, b, c, d, e;
 };
 
 struct C {
diff --git a/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp b/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
index e9eba6e..f03cd6c 100644
--- a/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
+++ b/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
@@ -60,7 +60,7 @@
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUA@@@8" to i8*), i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUA@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
@@ -78,7 +78,7 @@
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUB@@@8" to i8*), i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUB@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
diff --git a/test/CodeGenCXX/microsoft-abi-eh-catch.cpp b/test/CodeGenCXX/microsoft-abi-eh-catch.cpp
index 69ec347..ac1321e 100644
--- a/test/CodeGenCXX/microsoft-abi-eh-catch.cpp
+++ b/test/CodeGenCXX/microsoft-abi-eh-catch.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc \
-// RUN:     -mconstructor-aliases -fexceptions -fcxx-exceptions -fnew-ms-eh \
+// RUN:     -mconstructor-aliases -fexceptions -fcxx-exceptions \
 // RUN:     -O1 -disable-llvm-optzns \
 // RUN:     | FileCheck -check-prefix WIN64 %s
 
diff --git a/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp b/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp
index 298e70e..004dc45 100644
--- a/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp
+++ b/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 %s
-// RUN: %clang_cc1 -std=c++11 -emit-llvm -O3 -disable-llvm-optzns %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 -check-prefix WIN32-LIFETIME %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 -check-prefix WIN32-O0 %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -O3 -disable-llvm-optzns %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 -check-prefix WIN32-O3 -check-prefix WIN32-LIFETIME %s
 
 struct A {
   A();
@@ -95,40 +95,78 @@
   return (cond ? TakesTwo((TakeRef(A()), A()), (TakeRef(A()), A())) : CouldThrow());
 }
 
-// WIN32-LABEL: define i32 @"\01?HasConditionalDeactivatedCleanups@@YAH_N@Z"{{.*}} {
-// WIN32:   alloca i1
-// WIN32:   %[[arg1_cond:.*]] = alloca i1
+// WIN32-O0-LABEL: define i32 @"\01?HasConditionalDeactivatedCleanups@@YAH_N@Z"{{.*}} {
+// WIN32-O0:   alloca i1
+// WIN32-O0:   %[[arg1_cond:.*]] = alloca i1
 //        Start all four cleanups as deactivated.
-// WIN32:   store i1 false
-// WIN32:   store i1 false
-// WIN32:   store i1 false
-// WIN32:   store i1 false
-// WIN32:   br i1
+// WIN32-O0:   store i1 false
+// WIN32-O0:   store i1 false
+// WIN32-O0:   store i1 false
+// WIN32-O0:   store i1 false
+// WIN32-O0:   br i1
 //        True condition.
-// WIN32:   call x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true
-// WIN32:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
-// WIN32:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true, i1* %[[arg1_cond]]
-// WIN32:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true
-// WIN32:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
-// WIN32:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true
-// WIN32:   store i1 false, i1* %[[arg1_cond]]
-// WIN32:   invoke i32 @"\01?TakesTwo@@YAHUA@@0@Z"
+// WIN32-O0:   call x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true
+// WIN32-O0:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O0:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true, i1* %[[arg1_cond]]
+// WIN32-O0:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true
+// WIN32-O0:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O0:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true
+// WIN32-O0:   store i1 false, i1* %[[arg1_cond]]
+// WIN32-O0:   invoke i32 @"\01?TakesTwo@@YAHUA@@0@Z"
 //        False condition.
-// WIN32:   invoke i32 @"\01?CouldThrow@@YAHXZ"()
+// WIN32-O0:   invoke i32 @"\01?CouldThrow@@YAHXZ"()
 //        Two normal cleanups for TakeRef args.
-// WIN32:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
-// WIN32-NOT:   invoke x86_thiscallcc void @"\01??1A@@QAE@XZ"
-// WIN32:   ret i32
+// WIN32-O0:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O0-NOT:   invoke x86_thiscallcc void @"\01??1A@@QAE@XZ"
+// WIN32-O0:   ret i32
 //
 //        Somewhere in the landing pad soup, we conditionally destroy arg1.
-// WIN32:   %[[isactive:.*]] = load i1, i1* %[[arg1_cond]]
-// WIN32:   br i1 %[[isactive]]
-// WIN32:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
-// WIN32: }
+// WIN32-O0:   %[[isactive:.*]] = load i1, i1* %[[arg1_cond]]
+// WIN32-O0:   br i1 %[[isactive]]
+// WIN32-O0:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O0: }
+
+// WIN32-O3-LABEL: define i32 @"\01?HasConditionalDeactivatedCleanups@@YAH_N@Z"{{.*}} {
+// WIN32-O3:   alloca i1
+// WIN32-O3:   alloca i1
+// WIN32-O3:   %[[arg1_cond:.*]] = alloca i1
+//        Start all four cleanups as deactivated.
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   br i1
+//        True condition.
+// WIN32-O3:   call x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true
+// WIN32-O3:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O3:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true, i1* %[[arg1_cond]]
+// WIN32-O3:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true
+// WIN32-O3:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O3:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true
+// WIN32-O3:   store i1 false, i1* %[[arg1_cond]]
+// WIN32-O3:   invoke i32 @"\01?TakesTwo@@YAHUA@@0@Z"
+//        False condition.
+// WIN32-O3:   invoke i32 @"\01?CouldThrow@@YAHXZ"()
+//        Two normal cleanups for TakeRef args.
+// WIN32-O3:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O3-NOT:   invoke x86_thiscallcc void @"\01??1A@@QAE@XZ"
+// WIN32-O3:   ret i32
+//
+//        Somewhere in the landing pad soup, we conditionally destroy arg1.
+// WIN32-O3:   %[[isactive:.*]] = load i1, i1* %[[arg1_cond]]
+// WIN32-O3:   br i1 %[[isactive]]
+// WIN32-O3:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O3: }
 
 namespace crash_on_partial_destroy {
 struct A {
diff --git a/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp b/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp
index 0b8d270..7836dcf 100644
--- a/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp
+++ b/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=18.00 | FileCheck -check-prefix=MSVC2013 %s
-// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=19.00 | FileCheck -check-prefix=MSVC2015 %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=18.00 | FileCheck -check-prefix=MSVC2013 -check-prefix=CHECK %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=19.00 | FileCheck -check-prefix=MSVC2015 -check-prefix=CHECK %s
 
 void may_throw();
 void never_throws() noexcept(true) {
@@ -9,7 +9,8 @@
 // CHECK-LABEL: define void @"\01?never_throws@@YAXXZ"()
 // CHECK-SAME:          personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 // CHECK:      invoke void @"\01?may_throw@@YAXXZ"()
-// CHECK:      cleanuppad within none []
+// CHECK:      %[[cp:.*]] = cleanuppad within none []
 // MSVC2013:      call void @"\01?terminate@@YAXXZ"()
 // MSVC2015:      call void @__std_terminate()
+// CHECK-SAME:  [ "funclet"(token %[[cp]]) ]
 // CHECK-NEXT: unreachable
diff --git a/test/CodeGenCXX/microsoft-abi-extern-template.cpp b/test/CodeGenCXX/microsoft-abi-extern-template.cpp
new file mode 100644
index 0000000..de46d5b
--- /dev/null
+++ b/test/CodeGenCXX/microsoft-abi-extern-template.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fno-rtti-data -O1 -disable-llvm-optzns %s -emit-llvm -o - -triple x86_64-windows-msvc | FileCheck %s
+
+// Even though Foo<int> has an extern template declaration, we have to emit our
+// own copy of the vftable when emitting the available externally constructor.
+
+// CHECK: @"\01??_7?$Foo@H@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [
+// CHECK-SAME:   i8* bitcast (i8* (%struct.Foo*, i32)* @"\01??_G?$Foo@H@@UEAAPEAXI@Z" to i8*)
+// CHECK-SAME: ], comdat
+
+// CHECK-LABEL: define %struct.Foo* @"\01?f@@YAPEAU?$Foo@H@@XZ"()
+// CHECK: call %struct.Foo* @"\01??0?$Foo@H@@QEAA@XZ"(%struct.Foo* %{{.*}})
+
+// CHECK: define available_externally %struct.Foo* @"\01??0?$Foo@H@@QEAA@XZ"(%struct.Foo* returned %this)
+// CHECK:   store {{.*}} @"\01??_7?$Foo@H@@6B@"
+
+// CHECK: define linkonce_odr i8* @"\01??_G?$Foo@H@@UEAAPEAXI@Z"(%struct.Foo* %this, i32 %should_call_delete)
+
+struct Base {
+  virtual ~Base();
+};
+template <typename T> struct Foo : Base {
+  Foo() {}
+};
+extern template class Foo<int>;
+Foo<int> *f() { return new Foo<int>(); }
diff --git a/test/CodeGenCXX/microsoft-abi-member-pointers.cpp b/test/CodeGenCXX/microsoft-abi-member-pointers.cpp
index fd22c00..a3985ba 100644
--- a/test/CodeGenCXX/microsoft-abi-member-pointers.cpp
+++ b/test/CodeGenCXX/microsoft-abi-member-pointers.cpp
@@ -3,16 +3,29 @@
 // RUN: %clang_cc1 -std=c++11 -Wno-uninitialized -fno-rtti -emit-llvm %s -o - -triple=i386-pc-win32 -DINCOMPLETE_VIRTUAL -fms-extensions -verify
 // RUN: %clang_cc1 -std=c++11 -Wno-uninitialized -fno-rtti -emit-llvm %s -o - -triple=i386-pc-win32 -DINCOMPLETE_VIRTUAL -DMEMFUN -fms-extensions -verify
 
+struct PR26313_Y;
+typedef void (PR26313_Y::*PR26313_FUNC)();
+struct PR26313_X {
+  PR26313_FUNC *ptr;
+  PR26313_X();
+};
+PR26313_X::PR26313_X() {}
+void PR26313_f(PR26313_FUNC *p) { delete p; }
+
+struct PR26313_Z;
+int PR26313_Z::**a = nullptr;
+int PR26313_Z::*b = *a;
+// CHECK-DAG: @"\01?a@@3PAPQPR26313_Z@@HA" = global %0* null, align 4
+// CHECK-DAG: @"\01?b@@3PQPR26313_Z@@HQ1@" = global { i32, i32, i32 } { i32 0, i32 0, i32 -1 }, align 4
+
 namespace PR20947 {
 struct A;
 int A::**a = nullptr;
-// CHECK: %[[opaque0:.*]] = type opaque
-// CHECK: %[[opaque1:.*]] = type opaque
-// CHECK: @"\01?a@PR20947@@3PAPQA@1@HA" = global %[[opaque0]]* null, align 4
+// CHECK-DAG: @"\01?a@PR20947@@3PAPQA@1@HA" = global %{{.*}}* null, align 4
 
 struct B;
 int B::*&b = b;
-// CHECK: @"\01?b@PR20947@@3AAPQB@1@HA" = global %[[opaque1]]* null, align 4
+// CHECK-DAG: @"\01?b@PR20947@@3AAPQB@1@HA" = global %{{.*}}* null, align 4
 }
 
 namespace PR20017 {
diff --git a/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp b/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
index 4c2d850..f7dc524 100644
--- a/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
+++ b/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
@@ -22,6 +22,16 @@
   int x;
 };
 
+struct Multibyte {
+  char a, b, c, d;
+};
+
+struct Packed {
+  short a;
+  int b;
+  short c;
+};
+
 struct SmallWithDtor {
   SmallWithDtor();
   ~SmallWithDtor();
@@ -102,19 +112,30 @@
 
 void small_arg(Small s) {}
 // LINUX-LABEL: define void @_Z9small_arg5Small(i32 %s.0)
-// WIN32: define void @"\01?small_arg@@YAXUSmall@@@Z"(%struct.Small* byval align 4 %s)
+// WIN32: define void @"\01?small_arg@@YAXUSmall@@@Z"(i32 %s.0)
 // WIN64: define void @"\01?small_arg@@YAXUSmall@@@Z"(i32 %s.coerce)
 
 void medium_arg(Medium s) {}
 // LINUX-LABEL: define void @_Z10medium_arg6Medium(i32 %s.0, i32 %s.1)
-// WIN32: define void @"\01?medium_arg@@YAXUMedium@@@Z"(%struct.Medium* byval align 4 %s)
+// WIN32: define void @"\01?medium_arg@@YAXUMedium@@@Z"(i32 %s.0, i32 %s.1)
 // WIN64: define void @"\01?medium_arg@@YAXUMedium@@@Z"(i64 %s.coerce)
 
 void small_arg_with_ctor(SmallWithCtor s) {}
 // LINUX-LABEL: define void @_Z19small_arg_with_ctor13SmallWithCtor(%struct.SmallWithCtor* byval align 4 %s)
-// WIN32: define void @"\01?small_arg_with_ctor@@YAXUSmallWithCtor@@@Z"(%struct.SmallWithCtor* byval align 4 %s)
+// WIN32: define void @"\01?small_arg_with_ctor@@YAXUSmallWithCtor@@@Z"(i32 %s.0)
 // WIN64: define void @"\01?small_arg_with_ctor@@YAXUSmallWithCtor@@@Z"(i32 %s.coerce)
 
+// FIXME: We could coerce to a series of i32s here if we wanted to.
+void multibyte_arg(Multibyte s) {}
+// LINUX-LABEL: define void @_Z13multibyte_arg9Multibyte(%struct.Multibyte* byval align 4 %s)
+// WIN32: define void @"\01?multibyte_arg@@YAXUMultibyte@@@Z"(%struct.Multibyte* byval align 4 %s)
+// WIN64: define void @"\01?multibyte_arg@@YAXUMultibyte@@@Z"(i32 %s.coerce)
+
+void packed_arg(Packed s) {}
+// LINUX-LABEL: define void @_Z10packed_arg6Packed(%struct.Packed* byval align 4 %s)
+// WIN32: define void @"\01?packed_arg@@YAXUPacked@@@Z"(%struct.Packed* byval align 4 %s)
+// WIN64: define void @"\01?packed_arg@@YAXUPacked@@@Z"(%struct.Packed* %s)
+
 // Test that dtors are invoked in the callee.
 void small_arg_with_dtor(SmallWithDtor s) {}
 // WIN32: define void @"\01?small_arg_with_dtor@@YAXUSmallWithDtor@@@Z"(<{ %struct.SmallWithDtor }>* inalloca) {{.*}} {
@@ -196,6 +217,28 @@
 // WIN32: define void @"\01?big_arg@@YAXUBig@@@Z"(%struct.Big* byval align 4 %s)
 // WIN64: define void @"\01?big_arg@@YAXUBig@@@Z"(%struct.Big* %s)
 
+// PR27607: We used to attempt to load an i32 value out of the reference instead
+// of just loading the pointer from the struct during argument expansion.
+struct RefField {
+  RefField(int &x);
+  int &x;
+};
+void takes_ref_field(RefField s) {}
+// LINUX-LABEL: define void @_Z15takes_ref_field8RefField(%struct.RefField* byval align 4 %s)
+// WIN32: define void @"\01?takes_ref_field@@YAXURefField@@@Z"(i32* %s.0)
+// WIN64: define void @"\01?takes_ref_field@@YAXURefField@@@Z"(i64 %s.coerce)
+
+void pass_ref_field() {
+  int x;
+  takes_ref_field(RefField(x));
+}
+// LINUX-LABEL: define void @_Z14pass_ref_fieldv()
+// LINUX: call void @_Z15takes_ref_field8RefField(%struct.RefField* byval align 4 %{{.*}})
+// WIN32-LABEL: define void @"\01?pass_ref_field@@YAXXZ"()
+// WIN32: call void @"\01?takes_ref_field@@YAXURefField@@@Z"(i32* %{{.*}})
+// WIN64-LABEL: define void @"\01?pass_ref_field@@YAXXZ"()
+// WIN64: call void @"\01?takes_ref_field@@YAXURefField@@@Z"(i64 %{{.*}})
+
 class Class {
  public:
   Small thiscall_method_small() { return Small(); }
@@ -230,12 +273,12 @@
 
   void thiscall_method_arg(Small s) {}
   // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE5Small(%class.Class* %this, i32 %s.0)
-  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmall@@@Z"(%class.Class* %this, %struct.Small* byval align 4 %s)
+  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmall@@@Z"(%class.Class* %this, i32 %s.0)
   // WIN64: define linkonce_odr void @"\01?thiscall_method_arg@Class@@QEAAXUSmall@@@Z"(%class.Class* %this, i32 %s.coerce)
 
   void thiscall_method_arg(SmallWithCtor s) {}
   // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE13SmallWithCtor(%class.Class* %this, %struct.SmallWithCtor* byval align 4 %s)
-  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmallWithCtor@@@Z"(%class.Class* %this, %struct.SmallWithCtor* byval align 4 %s)
+  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmallWithCtor@@@Z"(%class.Class* %this, i32 %s.0)
   // WIN64: define linkonce_odr void @"\01?thiscall_method_arg@Class@@QEAAXUSmallWithCtor@@@Z"(%class.Class* %this, i32 %s.coerce)
 
   void thiscall_method_arg(Big s) {}
diff --git a/test/CodeGenCXX/microsoft-abi-structors.cpp b/test/CodeGenCXX/microsoft-abi-structors.cpp
index 0722f75..a576f0c 100644
--- a/test/CodeGenCXX/microsoft-abi-structors.cpp
+++ b/test/CodeGenCXX/microsoft-abi-structors.cpp
@@ -7,7 +7,7 @@
 // RUN: FileCheck --check-prefix DTORS3 %s < %t
 // RUN: FileCheck --check-prefix DTORS4 %s < %t
 //
-// RUN: %clang_cc1 -emit-llvm %s -o - -mconstructor-aliases -triple=x86_64-pc-win32 -fno-rtti | FileCheck --check-prefix DTORS-X64 %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -mconstructor-aliases -triple=x86_64-pc-win32 -fno-rtti -std=c++11 | FileCheck --check-prefix DTORS-X64 %s
 
 namespace basic {
 
@@ -177,7 +177,7 @@
 //      Do an adjustment from B* to C*.
 // DTORS2:   getelementptr i8, i8* %{{.*}}, i32 -4
 // DTORS2:   bitcast i8* %{{.*}} to %"struct.dtor_in_second_nvbase::C"*
-// DTORS2:   %[[CALL:.*]] = call x86_thiscallcc i8* @"\01??_GC@dtor_in_second_nvbase@@UAEPAXI@Z"
+// DTORS2:   %[[CALL:.*]] = tail call x86_thiscallcc i8* @"\01??_GC@dtor_in_second_nvbase@@UAEPAXI@Z"
 // DTORS2:   ret i8* %[[CALL]]
 
 }
@@ -443,6 +443,20 @@
 // CHECK: store {{.*}} @"\01??_7MoveOnly@implicit_copy_vtable@@6B@"
 }
 
+namespace delegating_ctor {
+struct Y {};
+struct X : virtual Y {
+  X(int);
+  X();
+};
+X::X(int) : X() {}
+}
+// CHECK: define x86_thiscallcc %"struct.delegating_ctor::X"* @"\01??0X@delegating_ctor@@QAE@H@Z"(
+// CHECK:  %[[is_most_derived_addr:.*]] = alloca i32, align 4
+// CHECK:  store i32 %is_most_derived, i32* %[[is_most_derived_addr]]
+// CHECK:  %[[is_most_derived:.*]] = load i32, i32* %[[is_most_derived_addr]]
+// CHECK:  call x86_thiscallcc {{.*}}* @"\01??0X@delegating_ctor@@QAE@XZ"({{.*}} i32 %[[is_most_derived]])
+
 // Dtor thunks for classes in anonymous namespaces should be internal, not
 // linkonce_odr.
 namespace {
@@ -471,4 +485,3 @@
 extern void testG() {
   G g;
 }
-
diff --git a/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp b/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
index 29b434e..0202586 100644
--- a/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
+++ b/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
@@ -9,12 +9,14 @@
 // CHECK-DAG: @"\01?s@?1??f@@YAAAUS@@XZ@4U2@A" = linkonce_odr thread_local global %struct.S zeroinitializer
 // CHECK-DAG: @"\01??__J?1??f@@YAAAUS@@XZ@51" = linkonce_odr thread_local global i32 0
 // CHECK-DAG: @"\01?s@?1??g@@YAAAUS@@XZ@4U2@A" = linkonce_odr global %struct.S zeroinitializer
-// CHECK-DAG: @"\01?$TSS0@?1??g@@YAAAUS@@XZ" = linkonce_odr global i32 0
+// CHECK-DAG: @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" = linkonce_odr global i32 0
 // CHECK-DAG: @_Init_thread_epoch = external thread_local global i32, align 4
 // CHECK-DAG: @"\01?j@?1??h@@YAAAUS@@_N@Z@4U2@A" = linkonce_odr thread_local global %struct.S zeroinitializer
 // CHECK-DAG: @"\01??__J?1??h@@YAAAUS@@_N@Z@51" = linkonce_odr thread_local global i32 0
 // CHECK-DAG: @"\01?i@?1??h@@YAAAUS@@_N@Z@4U2@A" = linkonce_odr global %struct.S zeroinitializer
-// CHECK-DAG: @"\01?$TSS0@?1??h@@YAAAUS@@_N@Z" = linkonce_odr global i32 0
+// CHECK-DAG: @"\01?$TSS0@?1??h@@YAAAUS@@_N@Z@4HA" = linkonce_odr global i32 0
+// CHECK-DAG: @"\01?i@?1??g1@@YAHXZ@4HA" = internal global i32 0, align 4
+// CHECK-DAG: @"\01?$TSS0@?1??g1@@YAHXZ@4HA" = internal global i32 0, align 4
 
 // CHECK-LABEL: define {{.*}} @"\01?f@@YAAAUS@@XZ"()
 // CHECK-SAME:  personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
@@ -51,14 +53,14 @@
 // CHECK-LABEL: define {{.*}} @"\01?g@@YAAAUS@@XZ"()
 extern inline S &g() {
   static S s;
-// CHECK:  %[[guard:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ" unordered, align 4
+// CHECK:  %[[guard:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" unordered, align 4
 // CHECK-NEXT:  %[[epoch:.*]] = load i32, i32* @_Init_thread_epoch
 // CHECK-NEXT:  %[[cmp:.*]] = icmp sgt i32 %[[guard]], %[[epoch]]
 // CHECK-NEXT:  br i1 %[[cmp]], label %[[init_attempt:.*]], label %[[init_end:.*]]
 //
 // CHECK:     [[init_attempt]]:
-// CHECK-NEXT:  call void @_Init_thread_header(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ")
-// CHECK-NEXT:  %[[guard2:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ" unordered, align 4
+// CHECK-NEXT:  call void @_Init_thread_header(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
+// CHECK-NEXT:  %[[guard2:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" unordered, align 4
 // CHECK-NEXT:  %[[cmp2:.*]] = icmp eq i32 %[[guard2]], -1
 // CHECK-NEXT:  br i1 %[[cmp2]], label %[[init:.*]], label %[[init_end:.*]]
 //
@@ -68,7 +70,7 @@
 //
 // CHECK:     [[invoke_cont]]:
 // CHECK-NEXT:  call i32 @atexit(void ()* @"\01??__Fs@?1??g@@YAAAUS@@XZ@YAXXZ")
-// CHECK-NEXT:  call void @_Init_thread_footer(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ")
+// CHECK-NEXT:  call void @_Init_thread_footer(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
 // CHECK-NEXT:  br label %init.end
 //
 // CHECK:     [[init_end]]:
@@ -76,7 +78,7 @@
 //
 // CHECK:     [[lpad]]:
 // CHECK-NEXT: cleanuppad within none []
-// CHECK:       call void @_Init_thread_abort(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ")
+// CHECK:       call void @_Init_thread_abort(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
 // CHECK-NEXT:  cleanupret {{.*}} unwind to caller
   return s;
 }
@@ -86,3 +88,10 @@
   static S i;
   return b ? j : i;
 }
+
+// CHECK-LABEL: define i32 @"\01?g1@@YAHXZ"()
+int f1();
+int g1() {
+  static int i = f1();
+  return i;
+}
diff --git a/test/CodeGenCXX/microsoft-abi-throw.cpp b/test/CodeGenCXX/microsoft-abi-throw.cpp
index 080f1a0..7c2e2a8 100644
--- a/test/CodeGenCXX/microsoft-abi-throw.cpp
+++ b/test/CodeGenCXX/microsoft-abi-throw.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm -o - -triple=i386-pc-win32 -std=c++11 %s -fcxx-exceptions -fms-extensions | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -o - -triple=i386-pc-win32 -std=c++11 %s -fcxx-exceptions -fms-extensions -DSTD | FileCheck %s
 
 // CHECK-DAG: @"\01??_R0?AUY@@@8" = linkonce_odr global %rtti.TypeDescriptor7 { i8** @"\01??_7type_info@@6B@", i8* null, [8 x i8] c".?AUY@@\00" }, comdat
 // CHECK-DAG: @"_CT??_R0?AUY@@@8??0Y@@QAE@ABU0@@Z8" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 4, i8* bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUY@@@8" to i8*), i32 0, i32 -1, i32 0, i32 8, i8* bitcast (%struct.Y* (%struct.Y*, %struct.Y*, i32)* @"\01??0Y@@QAE@ABU0@@Z" to i8*) }, section ".xdata", comdat
@@ -19,6 +20,8 @@
 // CHECK-DAG: @"_CT??_R0P6AXXZ@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i8* bitcast (%rtti.TypeDescriptor7* @"\01??_R0P6AXXZ@8" to i8*), i32 0, i32 -1, i32 0, i32 4, i8* null }, section ".xdata", comdat
 // CHECK-DAG: @_CTA1P6AXXZ = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x %eh.CatchableType*] [%eh.CatchableType* @"_CT??_R0P6AXXZ@84"] }, section ".xdata", comdat
 // CHECK-DAG: @_TI1P6AXXZ = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i8* null, i8* null, i8* bitcast (%eh.CatchableTypeArray.1* @_CTA1P6AXXZ to i8*) }, section ".xdata", comdat
+// CHECK-DAG: @_TIU2PAPFAH = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 4, i8* null, i8* null, i8* bitcast (%eh.CatchableTypeArray.2* @_CTA2PAPFAH to i8*) }, section ".xdata", comdat
+// CHECK-DAG: @_CTA2PAPFAH = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.2 { i32 2, [2 x %eh.CatchableType*] [%eh.CatchableType* @"_CT??_R0PAPFAH@84", %eh.CatchableType* @"_CT??_R0PAX@84"] }, section ".xdata", comdat
 
 
 struct N { ~N(); };
@@ -43,6 +46,12 @@
   throw y;
 }
 
+void h(__unaligned int * __unaligned *y) {
+  // CHECK-LABEL: @"\01?h@@YAXPFAPFAH@Z"
+  // CHECK: call void @_CxxThrowException(i8* %{{.*}}, %eh.ThrowInfo* @_TIU2PAPFAH)
+  throw y;
+}
+
 struct Default {
   Default(Default &, int = 42);
 };
@@ -97,19 +106,25 @@
   throw nullptr;
 }
 
+#ifdef STD
 namespace std {
 template <typename T>
 void *__GetExceptionInfo(T);
 }
+#else
+template <typename T>
+void *__GetExceptionInfo(T);
+#endif
+using namespace std;
 
 void *GetExceptionInfo_test0() {
 // CHECK-LABEL: @"\01?GetExceptionInfo_test0@@YAPAXXZ"
 // CHECK:  ret i8* bitcast (%eh.ThrowInfo* @_TI1H to i8*)
-  return std::__GetExceptionInfo(0);
+  return __GetExceptionInfo(0);
 }
 
 void *GetExceptionInfo_test1() {
 // CHECK-LABEL: @"\01?GetExceptionInfo_test1@@YAPAXXZ"
 // CHECK:  ret i8* bitcast (%eh.ThrowInfo* @_TI1P6AXXZ to i8*)
-  return std::__GetExceptionInfo<void (*)()>(&h);
+  return __GetExceptionInfo<void (*)()>(&h);
 }
diff --git a/test/CodeGenCXX/microsoft-abi-try-throw.cpp b/test/CodeGenCXX/microsoft-abi-try-throw.cpp
index 6b1d2bf..bf1834e 100644
--- a/test/CodeGenCXX/microsoft-abi-try-throw.cpp
+++ b/test/CodeGenCXX/microsoft-abi-try-throw.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fcxx-exceptions -fexceptions -fno-rtti -DTRY -fnew-ms-eh   | FileCheck %s -check-prefix=TRY
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fcxx-exceptions -fexceptions -fno-rtti -DTRY   | FileCheck %s -check-prefix=TRY
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fcxx-exceptions -fexceptions -fno-rtti -DTHROW | FileCheck %s -check-prefix=THROW
 
 // THROW-DAG: @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
diff --git a/test/CodeGenCXX/microsoft-abi-typeid.cpp b/test/CodeGenCXX/microsoft-abi-typeid.cpp
index 60c31ab..d73f848 100644
--- a/test/CodeGenCXX/microsoft-abi-typeid.cpp
+++ b/test/CodeGenCXX/microsoft-abi-typeid.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -O1 -o - -triple=i386-pc-win32 %s | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -O1 -o - -triple=i386-pc-win32 %s -fexceptions -fcxx-exceptions | FileCheck %s
 
 struct type_info;
 namespace std { using ::type_info; }
@@ -49,3 +49,22 @@
 // CHECK:        [[RT:%.*]] = tail call i8* @__RTtypeid(i8* bitcast (%struct.V* @"\01?v@@3UV@@A" to i8*))
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
 // CHECK-NEXT:   ret %struct.type_info* [[RET]]
+
+namespace PR26329 {
+struct Polymorphic {
+  virtual ~Polymorphic();
+};
+
+void f(const Polymorphic &poly) {
+  try {
+    throw;
+  } catch (...) {
+    Polymorphic cleanup;
+    typeid(poly);
+  }
+}
+// CHECK-LABEL: define void @"\01?f@PR26329@@YAXABUPolymorphic@1@@Z"(
+// CHECK: %[[cs:.*]] = catchswitch within none [label %{{.*}}] unwind to caller
+// CHECK: %[[cp:.*]] = catchpad within %[[cs]] [i8* null, i32 64, i8* null]
+// CHECK: invoke i8* @__RTtypeid(i8* {{.*}}) [ "funclet"(token %[[cp]]) ]
+}
diff --git a/test/CodeGenCXX/microsoft-abi-vbtables.cpp b/test/CodeGenCXX/microsoft-abi-vbtables.cpp
index 9cce6f8..df06894 100644
--- a/test/CodeGenCXX/microsoft-abi-vbtables.cpp
+++ b/test/CodeGenCXX/microsoft-abi-vbtables.cpp
@@ -537,5 +537,5 @@
 
 extern template class B<int>;
 template B<int>::B();
-// CHECK-DAG: @"\01??_8?$B@H@Test30@@7B@" = external unnamed_addr constant [2 x i32]{{$}}
+// CHECK-DAG: @"\01??_8?$B@H@Test30@@7B@" = linkonce_odr unnamed_addr constant [2 x i32] [i32 0, i32 4], comdat
 }
diff --git a/test/CodeGenCXX/microsoft-abi-vftables.cpp b/test/CodeGenCXX/microsoft-abi-vftables.cpp
index 340675b..0c9b58b 100644
--- a/test/CodeGenCXX/microsoft-abi-vftables.cpp
+++ b/test/CodeGenCXX/microsoft-abi-vftables.cpp
@@ -17,9 +17,10 @@
   virtual ~U();
 } u;
 
-// RTTI-DAG: @"\01??_7U@@6B@" = available_externally dllimport unnamed_addr constant [1 x i8*] [i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
+// RTTI-DAG: [[VTABLE_U:@.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast ({{.*}} @"\01??_R4U@@6B@" to i8*), i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
+// RTTI-DAG: @"\01??_SU@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* [[VTABLE_U]], i32 0, i32 1)
 
-// NO-RTTI-DAG: @"\01??_7U@@6B@" = available_externally dllimport unnamed_addr constant [1 x i8*] [i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
+// NO-RTTI-DAG: @"\01??_SU@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
 
 struct __declspec(dllexport) V {
   virtual ~V();
@@ -32,7 +33,7 @@
 
 namespace {
 struct W {
-  virtual ~W();
+  virtual ~W() {}
 } w;
 }
 // RTTI-DAG: [[VTABLE_W:@.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast ({{.*}} @"\01??_R4W@?A@@6B@" to i8*), i8* bitcast ({{.*}} @"\01??_GW@?A@@UAEPAXI@Z" to i8*)]
@@ -48,5 +49,7 @@
 
 extern template class Y<int>;
 template Y<int>::Y();
-// RTTI-DAG: @"\01??_7?$Y@H@@6B@" = external unnamed_addr constant [1 x i8*]
-// NO-RTTI-DAG: @"\01??_7?$Y@H@@6B@" = external unnamed_addr constant [1 x i8*]
+// RTTI-DAG: [[VTABLE_Y:@.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4?$Y@H@@6B@" to i8*), i8* bitcast (i8* (%struct.Y*, i32)* @"\01??_G?$Y@H@@UAEPAXI@Z" to i8*)], comdat($"\01??_7?$Y@H@@6B@")
+// RTTI-DAG: @"\01??_7?$Y@H@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* [[VTABLE_Y]], i32 0, i32 1)
+
+// NO-RTTI-DAG: @"\01??_7?$Y@H@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (i8* (%struct.Y*, i32)* @"\01??_G?$Y@H@@UAEPAXI@Z" to i8*)], comdat
diff --git a/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp b/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
index 8897a38..480ae8c 100644
--- a/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
+++ b/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
@@ -481,3 +481,43 @@
 // CHECK:   %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[B_i8]], i32 4
 // CHECK:   call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
 }
+
+namespace pr27621 {
+// Devirtualization through a static_cast used to make us compute the 'this'
+// adjustment for B::g instead of C::g. When we directly call C::g, 'this' is a
+// B*, and the prologue of C::g will adjust it to a C*.
+struct A { virtual void f(); };
+struct B { virtual void g(); };
+struct C final : A, B {
+  virtual void h();
+  void g() override;
+};
+void callit(C *p) {
+  static_cast<B*>(p)->g();
+}
+// CHECK-LABEL: define void @"\01?callit@pr27621@@YAXPAUC@1@@Z"(%"struct.pr27621::C"* %{{.*}})
+// CHECK: %[[B_i8:.*]] = getelementptr i8, i8* %{{.*}}, i32 4
+// CHECK: call x86_thiscallcc void @"\01?g@C@pr27621@@UAEXXZ"(i8* %[[B_i8]])
+}
+
+namespace test6 {
+class A {};
+class B : virtual A {};
+class C : virtual B {
+  virtual void m_fn1();
+  float field;
+};
+class D : C {
+  D();
+};
+D::D() : C() {}
+// CHECK-LABEL: define x86_thiscallcc %"class.test6::D"* @"\01??0D@test6@@AAE@XZ"(
+// CHECK:   %[[THIS:.*]] = load %"class.test6::D"*, %"class.test6::D"**
+// CHECK:   br i1 %{{.*}}, label %[[INIT_VBASES:.*]], label %[[SKIP_VBASES:.*]]
+
+// CHECK: %[[SKIP_VBASES]]
+// CHECK:   %[[C:.*]] = bitcast %"class.test6::D"* %[[THIS]] to %"class.test6::C"*
+// CHECK:   %[[C_i8:.*]] = bitcast %"class.test6::C"* %[[C]] to i8*
+// CHECK:   %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[C_i8]], i32 8
+// CHECK:   call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
+}
diff --git a/test/CodeGenCXX/microsoft-interface.cpp b/test/CodeGenCXX/microsoft-interface.cpp
index a2dfb69..8f4670a 100644
--- a/test/CodeGenCXX/microsoft-interface.cpp
+++ b/test/CodeGenCXX/microsoft-interface.cpp
@@ -31,10 +31,10 @@
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @_ZN1SC2Ev(%struct.S* %this)
 // CHECK:   call x86_thiscallcc void @_ZN1IC2Ev(%__interface.I* %{{[.0-9A-Z_a-z]+}})
-// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1S, i64 0, i64 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
+// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1S, i32 0, i32 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @_ZN1IC2Ev(%__interface.I* %this)
-// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1I, i64 0, i64 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
+// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1I, i32 0, i32 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc i32 @_ZN1I4testEv(%__interface.I* %this)
 // CHECK:   ret i32 1
diff --git a/test/CodeGenCXX/microsoft-templ-uuidof.cpp b/test/CodeGenCXX/microsoft-templ-uuidof.cpp
index 0ee3908..74d6069 100644
--- a/test/CodeGenCXX/microsoft-templ-uuidof.cpp
+++ b/test/CodeGenCXX/microsoft-templ-uuidof.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-win32 -fms-extensions | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-win32 -fms-extensions | FileCheck %s
 
 struct _GUID;
 
diff --git a/test/CodeGenCXX/microsoft-uuidof.cpp b/test/CodeGenCXX/microsoft-uuidof.cpp
index 2ac5f1b..62e4b88 100644
--- a/test/CodeGenCXX/microsoft-uuidof.cpp
+++ b/test/CodeGenCXX/microsoft-uuidof.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-GUID
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-linux -fms-extensions | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-64
 // RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -DWRONG_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-WRONG-GUID
 
 #ifdef DEFINE_GUID
@@ -36,6 +37,7 @@
 // First global use of __uuidof(S1) forces the creation of the global.
 // CHECK: @_GUID_12345678_1234_1234_1234_1234567890ab = linkonce_odr constant { i32, i16, i16, [8 x i8] } { i32 305419896, i16 4660, i16 4660, [8 x i8] c"\124\124Vx\90\AB" }, comdat
 // CHECK: @gr = constant %struct._GUID* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to %struct._GUID*), align 4
+// CHECK-64: @gr = constant %struct._GUID* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to %struct._GUID*), align 8
 const GUID& gr = __uuidof(S1);
 
 // CHECK: @gp = global %struct._GUID* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to %struct._GUID*), align 4
diff --git a/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp b/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp
index 1ff0182..5b245a4 100644
--- a/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp
+++ b/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp
@@ -10,13 +10,13 @@
   return rv;
 }
 // O32-LABEL: define i32* @_Z10alloc_longv()
-// O32: call noalias i8* @_Znwj(i32 signext 4)
+// O32: call i8* @_Znwj(i32 signext 4)
 
 // N32-LABEL: define i32* @_Z10alloc_longv()
-// N32: call noalias i8* @_Znwj(i32 signext 4)
+// N32: call i8* @_Znwj(i32 signext 4)
 
 // N64-LABEL: define i64* @_Z10alloc_longv()
-// N64: call noalias i8* @_Znwm(i64 zeroext 8)
+// N64: call i8* @_Znwm(i64 zeroext 8)
 
 long *alloc_long_array() {
   long *rv = new long[2];
@@ -24,13 +24,13 @@
 }
 
 // O32-LABEL: define i32* @_Z16alloc_long_arrayv()
-// O32: call noalias i8* @_Znaj(i32 signext 8)
+// O32: call i8* @_Znaj(i32 signext 8)
 
 // N32-LABEL: define i32* @_Z16alloc_long_arrayv()
-// N32: call noalias i8* @_Znaj(i32 signext 8)
+// N32: call i8* @_Znaj(i32 signext 8)
 
 // N64-LABEL: define i64* @_Z16alloc_long_arrayv()
-// N64: call noalias i8* @_Znam(i64 zeroext 16)
+// N64: call i8* @_Znam(i64 zeroext 16)
 
 #include <stddef.h>
 
diff --git a/test/CodeGenCXX/multi-dim-operator-new.cpp b/test/CodeGenCXX/multi-dim-operator-new.cpp
index 7a235e8..0dfcffb 100644
--- a/test/CodeGenCXX/multi-dim-operator-new.cpp
+++ b/test/CodeGenCXX/multi-dim-operator-new.cpp
@@ -43,7 +43,7 @@
  return 0;
 }
 
-// CHECK: call noalias i8* @_Znam
-// CHECK: call noalias i8* @_Znam
-// CHECK: call noalias i8* @_Znam
+// CHECK: call i8* @_Znam
+// CHECK: call i8* @_Znam
+// CHECK: call i8* @_Znam
 
diff --git a/test/CodeGenCXX/naked.cpp b/test/CodeGenCXX/naked.cpp
new file mode 100644
index 0000000..34f22b3
--- /dev/null
+++ b/test/CodeGenCXX/naked.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -emit-llvm %s -o - | FileCheck %s
+
+class TestNaked {
+public:
+  void NakedFunction();
+};
+
+__attribute__((naked)) void TestNaked::NakedFunction() {
+  // CHECK-LABEL: define {{(x86_thiscallcc )?}}void @
+  // CHECK: call void asm sideeffect
+  asm("");
+}
diff --git a/test/CodeGenCXX/new-alias.cpp b/test/CodeGenCXX/new-alias.cpp
index 4afd942..b21638a 100644
--- a/test/CodeGenCXX/new-alias.cpp
+++ b/test/CodeGenCXX/new-alias.cpp
@@ -9,5 +9,5 @@
 void *operator new(size_t) __attribute__((alias("something")));
 
 // PR16715: don't assert here.
-// CHECK: call noalias i8* @_Znwm(i64 4){{$}}
+// CHECK: call i8* @_Znwm(i64 4){{$}}
 int *pr16715 = new int;
diff --git a/test/CodeGenCXX/new-array-init.cpp b/test/CodeGenCXX/new-array-init.cpp
index 6b76f47..602f93c 100644
--- a/test/CodeGenCXX/new-array-init.cpp
+++ b/test/CodeGenCXX/new-array-init.cpp
@@ -14,7 +14,7 @@
 // CHECK-LABEL: define void @_Z15const_underflowv
 void const_underflow() {
   // CHECK-NOT: icmp ult i{{32|64}} %{{[^ ]+}}, 3
-  // CHECK: call noalias i8* @_Zna{{.}}(i{{32|64}} -1)
+  // CHECK: call i8* @_Zna{{.}}(i{{32|64}} -1)
   new int[2] { 1, 2, 3 };
 }
 
@@ -37,7 +37,7 @@
   struct S;
   new (int S::*[3][4][5]) ();
 
-  // CHECK: call noalias i8* @_Zna{{.}}(i{{32 240|64 480}})
+  // CHECK: call i8* @_Zna{{.}}(i{{32 240|64 480}})
   // CHECK: getelementptr inbounds i{{32|64}}, i{{32|64}}* {{.*}}, i{{32|64}} 60
 
   // CHECK: phi
diff --git a/test/CodeGenCXX/new-overflow.cpp b/test/CodeGenCXX/new-overflow.cpp
index 9057e04..0c4c3c8 100644
--- a/test/CodeGenCXX/new-overflow.cpp
+++ b/test/CodeGenCXX/new-overflow.cpp
@@ -17,7 +17,7 @@
   // CHECK-NEXT: [[T1:%.*]] = extractvalue { i32, i1 } [[T0]], 1
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T3]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T3]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(short s) {
     return new elt[s];
@@ -40,7 +40,7 @@
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = mul i32 [[N]], 100
   // CHECK-NEXT: [[T4:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T4]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T4]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T3]]
   elt *test(short s) {
     return new elt[s];
@@ -68,7 +68,7 @@
   // CHECK-NEXT: [[T6:%.*]] = or i1 [[T1]], [[T5]]
   // CHECK-NEXT: [[T7:%.*]] = extractvalue { i32, i1 } [[T4]], 0
   // CHECK-NEXT: [[T8:%.*]] = select i1 [[T6]], i32 -1, i32 [[T7]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T8]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T8]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T3]]
   elt *test(short s) {
     return new elt[s];
@@ -87,7 +87,7 @@
   // CHECK:      [[N:%.*]] = sext i16 {{%.*}} to i32
   // CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[N]], 0
   // CHECK-NEXT: [[T1:%.*]] = select i1 [[T0]], i32 -1, i32 [[N]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T1]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T1]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(short s) {
     return new elt[s];
@@ -106,7 +106,7 @@
   // CHECK:      [[N:%.*]] = load i32, i32*
   // CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[N]], 0
   // CHECK-NEXT: [[T1:%.*]] = select i1 [[T0]], i32 -1, i32 [[N]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T1]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T1]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(int s) {
     return new elt[s];
@@ -128,7 +128,7 @@
   // CHECK-NEXT: [[T1:%.*]] = extractvalue { i32, i1 } [[T0]], 1
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T3]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T3]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(unsigned short s) {
     return new elt[s];
@@ -151,7 +151,7 @@
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = mul i32 [[N]], 100
   // CHECK-NEXT: [[T4:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T4]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T4]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T3]]
   elt *test(unsigned short s) {
     return new elt[s];
@@ -176,7 +176,7 @@
   // CHECK-NEXT: [[T4:%.*]] = or i1 [[T0]], [[T3]]
   // CHECK-NEXT: [[T5:%.*]] = extractvalue { i32, i1 } [[T2]], 0
   // CHECK-NEXT: [[T6:%.*]] = select i1 [[T4]], i32 -1, i32 [[T5]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T6]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T6]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T1]]
   elt *test(long long s) {
     return new elt[s];
@@ -201,7 +201,7 @@
   // CHECK-NEXT: [[T4:%.*]] = or i1 [[T0]], [[T3]]
   // CHECK-NEXT: [[T5:%.*]] = extractvalue { i32, i1 } [[T2]], 0
   // CHECK-NEXT: [[T6:%.*]] = select i1 [[T4]], i32 -1, i32 [[T5]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T6]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T6]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T1]]
   elt *test(unsigned long long s) {
     return new elt[s];
diff --git a/test/CodeGenCXX/new.cpp b/test/CodeGenCXX/new.cpp
index 6d6f701..ae2ec15 100644
--- a/test/CodeGenCXX/new.cpp
+++ b/test/CodeGenCXX/new.cpp
@@ -127,15 +127,15 @@
 struct Bmemptr { int Bmemptr::* memptr; int a; };
 
 void t11(int n) {
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call void @llvm.memset.p0i8.i64(
   B* b = new B();
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: {{call void.*llvm.memset.p0i8.i64.*i8 0, i64 %}}
   B *b2 = new B[n]();
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
   // CHECK: br
   Bmemptr *b_memptr = new Bmemptr[n]();
@@ -148,11 +148,11 @@
 // We don't need to initialize an empty class.
 // CHECK-LABEL: define void @_Z3t12v
 void t12() {
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK-NOT: br
   (void)new Empty[10];
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK-NOT: br
   (void)new Empty[10]();
 
@@ -162,11 +162,11 @@
 // Zero-initialization
 // CHECK-LABEL: define void @_Z3t13i
 void t13(int n) {
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: store i32 0, i32*
   (void)new int();
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: {{call void.*llvm.memset.p0i8.i64.*i8 0, i64 %}}
   (void)new int[n]();
 
@@ -186,7 +186,7 @@
   // CHECK: call void @_ZN5AllocD1Ev(
   // CHECK: call void @_ZN5AllocdaEPv(i8*
   delete[] new Alloc[10][20];
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call void @_ZdlPv(i8*
   delete new bool;
   // CHECK: ret void
@@ -274,7 +274,7 @@
   // CHECK-LABEL: define weak_odr void @_ZN7PR101971fIiEEvv()
   template<typename T>
   void f() {
-    // CHECK: [[CALL:%.*]] = call noalias i8* @_Znwm
+    // CHECK: [[CALL:%.*]] = call i8* @_Znwm
     // CHECK-NEXT: [[CASTED:%.*]] = bitcast i8* [[CALL]] to 
     new T;
     // CHECK-NEXT: ret void
@@ -296,7 +296,7 @@
   struct X { X(); X(const X&); };
   X* a(X* x) { return new X(X()); }
   // CHECK: define {{.*}} @_ZN7PR117571aEPNS_1XE
-  // CHECK: [[CALL:%.*]] = call noalias i8* @_Znwm
+  // CHECK: [[CALL:%.*]] = call i8* @_Znwm
   // CHECK-NEXT: [[CASTED:%.*]] = bitcast i8* [[CALL]] to
   // CHECK-NEXT: call void @_ZN7PR117571XC1Ev({{.*}}* [[CASTED]])
   // CHECK-NEXT: ret {{.*}} [[CASTED]]
@@ -306,7 +306,7 @@
   struct A { A() {} };
   struct B : public A { int x; };
   // CHECK-LABEL: define i8* @_ZN7PR133801fEv
-  // CHECK: call noalias i8* @_Znam(
+  // CHECK: call i8* @_Znam(
   // CHECK: call void @llvm.memset.p0i8
   // CHECK-NEXT: call void @_ZN7PR133801BC1Ev
   void* f() { return new B[2](); }
@@ -320,12 +320,12 @@
 
   // CHECK-LABEL: define void @_ZN5N36641fEv
   void f() {
-    // CHECK: call noalias i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW:#[^ ]*]]
+    // CHECK: call i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW:#[^ ]*]]
     int *p = new int; // expected-note {{allocated with 'new' here}}
     // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_BUILTIN_DELETE:#[^ ]*]]
     delete p;
 
-    // CHECK: call noalias i8* @_Znam(i64 12) [[ATTR_BUILTIN_NEW]]
+    // CHECK: call i8* @_Znam(i64 12) [[ATTR_BUILTIN_NEW]]
     int *q = new int[3];
     // CHECK: call void @_ZdaPv({{.*}}) [[ATTR_BUILTIN_DELETE]]
     delete[] p; // expected-warning {{'delete[]' applied to a pointer that was allocated with 'new'; did you mean 'delete'?}}
@@ -337,19 +337,18 @@
     (void) new (mpt) int;
   }
 
-  // FIXME: Can we mark this noalias?
-  // CHECK: declare i8* @_ZnamRKSt9nothrow_t(i64, {{.*}}) [[ATTR_NOBUILTIN_NOUNWIND]]
+  // CHECK: declare noalias i8* @_ZnamRKSt9nothrow_t(i64, {{.*}}) [[ATTR_NOBUILTIN_NOUNWIND]]
 
   // CHECK-LABEL: define void @_ZN5N36641gEv
   void g() {
     // It's OK for there to be attributes here, so long as we don't have a
     // 'builtin' attribute.
-    // CHECK: call noalias i8* @_Znwm(i64 4){{$}}
+    // CHECK: call i8* @_Znwm(i64 4){{$}}
     int *p = (int*)operator new(4);
     // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_NOUNWIND:#[^ ]*]]
     operator delete(p);
 
-    // CHECK: call noalias i8* @_Znam(i64 12){{$}}
+    // CHECK: call i8* @_Znam(i64 12){{$}}
     int *q = (int*)operator new[](12);
     // CHECK: call void @_ZdaPv({{.*}}) [[ATTR_NOUNWIND]]
     operator delete [](p);
@@ -362,7 +361,7 @@
 namespace builtins {
   // CHECK-LABEL: define void @_ZN8builtins1fEv
   void f() {
-    // CHECK: call noalias i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW]]
+    // CHECK: call i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW]]
     // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_BUILTIN_DELETE]]
     __builtin_operator_delete(__builtin_operator_new(4));
   }
diff --git a/test/CodeGenCXX/operator-new.cpp b/test/CodeGenCXX/operator-new.cpp
index db56cda..dc1c36d 100644
--- a/test/CodeGenCXX/operator-new.cpp
+++ b/test/CodeGenCXX/operator-new.cpp
@@ -16,7 +16,6 @@
   new teste();
 }
 
-
 // rdar://5739832 - operator new should check for overflow in multiply.
 void *f2(long N) {
   return new int[N];
@@ -25,5 +24,8 @@
 // SANE-NEXT: [[OVER:%.*]] = extractvalue {{.*}} [[UWO]], 1
 // SANE-NEXT: [[SUM:%.*]] = extractvalue {{.*}} [[UWO]], 0
 // SANE-NEXT: [[RESULT:%.*]] = select i1 [[OVER]], i32 -1, i32 [[SUM]]
-// SANE-NEXT: call noalias i8* @_Znaj(i32 [[RESULT]])
+// SANE-NEXT: call i8* @_Znaj(i32 [[RESULT]])
 }
+
+// SANE: declare noalias i8* @_Znaj(
+// SANENOT: declare i8* @_Znaj(
diff --git a/test/CodeGenCXX/optnone-and-attributes.cpp b/test/CodeGenCXX/optnone-and-attributes.cpp
index 56173b5..870d5e9 100644
--- a/test/CodeGenCXX/optnone-and-attributes.cpp
+++ b/test/CodeGenCXX/optnone-and-attributes.cpp
@@ -79,4 +79,4 @@
 // CHECK: attributes [[NORETURN]] = { noinline noreturn {{.*}} optnone
 
 // CHECK: attributes [[DLLIMPORT]] =
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: optnone
diff --git a/test/CodeGenCXX/optnone-class-members.cpp b/test/CodeGenCXX/optnone-class-members.cpp
index 147b821..751f3dd 100644
--- a/test/CodeGenCXX/optnone-class-members.cpp
+++ b/test/CodeGenCXX/optnone-class-members.cpp
@@ -159,6 +159,6 @@
 
 
 // CHECK: attributes [[NORMAL]] =
-// CHECK-SAME-NOT: noinline
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: noinline
+// CHECK-NOT: optnone
 // CHECK: attributes [[OPTNONE]] = {{.*}} noinline {{.*}} optnone
diff --git a/test/CodeGenCXX/optnone-def-decl.cpp b/test/CodeGenCXX/optnone-def-decl.cpp
index cb3a677..0240189 100644
--- a/test/CodeGenCXX/optnone-def-decl.cpp
+++ b/test/CodeGenCXX/optnone-def-decl.cpp
@@ -91,5 +91,5 @@
 
 // CHECK: attributes [[OPTNONE]] = { noinline nounwind optnone {{.*}} }
 // CHECK: attributes [[NORMAL]] =
-// CHECK-SAME-NOT: noinline
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: noinline
+// CHECK-NOT: optnone
diff --git a/test/CodeGenCXX/optnone-templates.cpp b/test/CodeGenCXX/optnone-templates.cpp
index 45a72b3..9f97d83 100644
--- a/test/CodeGenCXX/optnone-templates.cpp
+++ b/test/CodeGenCXX/optnone-templates.cpp
@@ -100,5 +100,5 @@
 
 
 // CHECK: attributes [[NORMAL]] =
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: optnone
 // CHECK: attributes [[OPTNONE]] = {{.*}} optnone
diff --git a/test/CodeGenCXX/pass-object-size.cpp b/test/CodeGenCXX/pass-object-size.cpp
index 2c7f974..7fd8b59 100644
--- a/test/CodeGenCXX/pass-object-size.cpp
+++ b/test/CodeGenCXX/pass-object-size.cpp
@@ -43,3 +43,40 @@
   (&OvlFoo)(nullptr);
 }
 }
+
+namespace delegate {
+  struct A {
+    A(void *const p __attribute__((pass_object_size(0))));
+  };
+  A::A(void *const p __attribute__((pass_object_size(0)))) {}
+  // Ensure that we forward the size through a delegating constructor call.
+  // CHECK: define void @_ZN8delegate1AC1EPvU17pass_object_size0({{[^,]*}}, i8*{{[^,]*}}, i64{{[^,]*}})
+  // CHECK: call void @_ZN8delegate1AC2EPvU17pass_object_size0({{[^,]*}}, i8*{{[^,]*}}, i64{{[^,]*}})
+}
+
+namespace variadic {
+// We had an issue where variadic member/operator calls with pass_object_size
+// would cause crashes.
+
+struct AsCtor {
+  AsCtor(const char *const c __attribute__((pass_object_size(0))), double a,
+         ...) {}
+};
+
+struct AsMember {
+  void bar(const char *const c __attribute__((pass_object_size(0))), double a,
+           ...) {}
+  void operator()(const char *const c __attribute__((pass_object_size(0))),
+                  double a, ...) {}
+};
+
+// CHECK-LABEL: define void @_ZN8variadic4testEv()
+void test() {
+  // CHECK-RE: call{{[^@]+}}@_ZN8variadic6AsCtorC1EPKcU17pass_object_size0dz
+  AsCtor("a", 1.0);
+  // CHECK-RE: call{{[^@]+}}@_ZN8variadic8AsMember3barEPKcU17pass_object_size0dz
+  AsMember{}.bar("a", 1.0);
+  // CHECK-RE: call{{[^@]+}}@_ZN8variadic8AsMemberclEPKcU17pass_object_size0dz
+  AsMember{}("a", 1.0);
+}
+}
diff --git a/test/CodeGenCXX/pr27030.cpp b/test/CodeGenCXX/pr27030.cpp
new file mode 100644
index 0000000..5c24051
--- /dev/null
+++ b/test/CodeGenCXX/pr27030.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm -triple=i386-pc-win32 %s -o - | FileCheck %s
+struct A {};
+struct B : A {};
+extern "C" {
+extern int B::*a;
+void test1() { (int A::*)(a); }
+}
+// CHECK-LABEL: define void @test1(
+// CHECK: %[[load:.*]]       = load i32, i32* @a
+// CHECK: %[[memptr_cmp:.*]] = icmp ne i32 %[[load]], -1
+// CHECK: br i1 %[[memptr_cmp]]
+
+// CHECK: %[[adj:.*]] = sub nsw i32 %[[load]], 0
+// CHECK: %[[nv_adj:.*]] = select i1 true, i32 %[[adj]], i32 0
+
+// CHECK: %[[memptr_converted:.*]] = phi i32 [ -1, {{.*}} ], [ %[[nv_adj]], {{.*}} ]
diff --git a/test/CodeGenCXX/pr28360.cpp b/test/CodeGenCXX/pr28360.cpp
new file mode 100644
index 0000000..5d7e1ae
--- /dev/null
+++ b/test/CodeGenCXX/pr28360.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple i686-pc-win32 | FileCheck %s
+struct A {
+  void Foo();
+  void Foo(int);
+};
+
+using MpTy = void (A::*)();
+
+void Bar(const MpTy &);
+
+void Baz() { Bar(&A::Foo); }
+
+// CHECK-LABEL: define void @"\01?Baz@@YAXXZ"(
+// CHECK:  %[[ref_tmp:.*]] = alloca i8*, align 4
+// CHECK: store i8* bitcast (void (%struct.A*)* @"\01?Foo@A@@QAEXXZ" to i8*), i8** %[[ref_tmp]], align 4
+// CHECK: call void @"\01?Bar@@YAXABQ8A@@AEXXZ@Z"(i8** dereferenceable(4) %[[ref_tmp]])
diff --git a/test/CodeGenCXX/pragma-loop.cpp b/test/CodeGenCXX/pragma-loop.cpp
index b85e0b4..e337913 100644
--- a/test/CodeGenCXX/pragma-loop.cpp
+++ b/test/CodeGenCXX/pragma-loop.cpp
@@ -9,6 +9,7 @@
 #pragma clang loop interleave_count(4)
 #pragma clang loop vectorize_width(4)
 #pragma clang loop unroll(full)
+#pragma clang loop distribute(enable)
   while (i < Length) {
     // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_1:.*]]
     List[i] = i * 2;
@@ -20,7 +21,7 @@
 void do_test(int *List, int Length) {
   int i = 0;
 
-#pragma clang loop vectorize_width(8) interleave_count(4) unroll(disable)
+#pragma clang loop vectorize_width(8) interleave_count(4) unroll(disable) distribute(disable)
   do {
     // CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_2:.*]]
     List[i] = i * 2;
@@ -55,7 +56,7 @@
 
 // Verify disable pragma clang loop directive generates correct metadata
 void disable_test(int *List, int Length) {
-#pragma clang loop vectorize(disable) unroll(disable)
+#pragma clang loop vectorize(disable) unroll(disable) distribute(disable)
   for (int i = 0; i < Length; i++) {
     // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_5:.*]]
     List[i] = i * 2;
@@ -157,20 +158,22 @@
   for_template_constant_expression_test<double, 2, 4, 8>(List, Length);
 }
 
-// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], ![[WIDTH_4:.*]], ![[INTERLEAVE_4:.*]], ![[INTENABLE_1:.*]], ![[UNROLL_FULL:.*]]}
+// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], ![[WIDTH_4:.*]], ![[INTERLEAVE_4:.*]], ![[INTENABLE_1:.*]], ![[UNROLL_FULL:.*]], ![[DISTRIBUTE_ENABLE:.*]]}
 // CHECK: ![[WIDTH_4]] = !{!"llvm.loop.vectorize.width", i32 4}
 // CHECK: ![[INTERLEAVE_4]] = !{!"llvm.loop.interleave.count", i32 4}
 // CHECK: ![[INTENABLE_1]] = !{!"llvm.loop.vectorize.enable", i1 true}
 // CHECK: ![[UNROLL_FULL]] = !{!"llvm.loop.unroll.full"}
-// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[WIDTH_8:.*]], ![[INTERLEAVE_4:.*]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[DISTRIBUTE_ENABLE]] = !{!"llvm.loop.distribute.enable", i1 true}
+// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[WIDTH_8:.*]], ![[INTERLEAVE_4:.*]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]]}
 // CHECK: ![[WIDTH_8]] = !{!"llvm.loop.vectorize.width", i32 8}
 // CHECK: ![[UNROLL_DISABLE]] = !{!"llvm.loop.unroll.disable"}
+// CHECK: ![[DISTRIBUTE_DISABLE]] = !{!"llvm.loop.distribute.enable", i1 false}
 // CHECK: ![[LOOP_3]] = distinct !{![[LOOP_3]], ![[INTERLEAVE_4:.*]], ![[UNROLL_8:.*]], ![[INTENABLE_1:.*]]}
 // CHECK: ![[UNROLL_8]] = !{!"llvm.loop.unroll.count", i32 8}
 // CHECK: ![[LOOP_4]] = distinct !{![[LOOP_4]], ![[WIDTH_2:.*]], ![[INTERLEAVE_2:.*]]}
 // CHECK: ![[WIDTH_2]] = !{!"llvm.loop.vectorize.width", i32 2}
 // CHECK: ![[INTERLEAVE_2]] = !{!"llvm.loop.interleave.count", i32 2}
-// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[WIDTH_1:.*]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[WIDTH_1:.*]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]]}
 // CHECK: ![[WIDTH_1]] = !{!"llvm.loop.vectorize.width", i32 1}
 // CHECK: ![[LOOP_6]] = distinct !{![[LOOP_6]], ![[WIDTH_2:.*]], ![[INTERLEAVE_2:.*]], ![[UNROLL_8:.*]]}
 // CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[WIDTH_5:.*]]}
diff --git a/test/CodeGenCXX/rtti-fundamental.cpp b/test/CodeGenCXX/rtti-fundamental.cpp
index e70c3aa..a0ad80d 100644
--- a/test/CodeGenCXX/rtti-fundamental.cpp
+++ b/test/CodeGenCXX/rtti-fundamental.cpp
@@ -89,6 +89,16 @@
 // CHECK: @_ZTIPy = constant
 // CHECK: @_ZTIPKy = constant
 
+// __int128
+// CHECK: @_ZTIn = constant
+// CHECK: @_ZTIPn = constant
+// CHECK: @_ZTIPKn = constant
+
+// unsigned __int128
+// CHECK: @_ZTIo = constant
+// CHECK: @_ZTIPo = constant
+// CHECK: @_ZTIPKo = constant
+
 // half
 // CHECK: @_ZTIDh = constant
 // CHECK: @_ZTIPDh = constant
diff --git a/test/CodeGenCXX/sections.cpp b/test/CodeGenCXX/sections.cpp
index bec2e2d..c33871a 100644
--- a/test/CodeGenCXX/sections.cpp
+++ b/test/CodeGenCXX/sections.cpp
@@ -31,6 +31,31 @@
 #pragma bss_seg(pop)
 int TEST2;
 
+
+// Check "save-restore" of pragma stacks.
+struct Outer {
+  void f() {
+    #pragma bss_seg(push, ".bss3")
+    #pragma code_seg(push, ".my_code1")
+    #pragma const_seg(push, ".my_const1")
+    #pragma data_seg(push, ".data3")
+    struct Inner {
+      void g() {
+        #pragma bss_seg(push, ".bss4")
+        #pragma code_seg(push, ".my_code2")
+        #pragma const_seg(push, ".my_const2")
+        #pragma data_seg(push, ".data4")
+      }
+    };
+  }
+};
+
+void h2(void) {} // should be in ".my_code"
+int TEST3; // should be in ".bss1"
+int d2 = 1; // should be in ".data"
+extern const int b2; // should be in ".my_const"
+const int b2 = 1;
+
 #pragma section("read_flag_section", read)
 // Even though they are not declared const, these become constant since they are
 // in a read-only section.
@@ -63,6 +88,9 @@
 //CHECK: @i = global i32 0
 //CHECK: @TEST1 = global i32 0
 //CHECK: @TEST2 = global i32 0, section ".bss1"
+//CHECK: @TEST3 = global i32 0, section ".bss1"
+//CHECK: @d2 = global i32 1, section ".data"
+//CHECK: @b2 = constant i32 1, section ".my_const"
 //CHECK: @unreferenced = constant i32 0, section "read_flag_section"
 //CHECK: @referenced = constant i32 42, section "read_flag_section"
 //CHECK: @implicitly_read_write = global i32 42, section "no_section_attributes"
@@ -70,3 +98,4 @@
 //CHECK: @short_var = global i16 42, section "short_section"
 //CHECK: define void @g()
 //CHECK: define void @h() {{.*}} section ".my_code"
+//CHECK: define void @h2() {{.*}} section ".my_code"
diff --git a/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp b/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
index f6f9098..4734c02 100644
--- a/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
+++ b/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
@@ -27,7 +27,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test21AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test21AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test21AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A() {
   f();
 }
@@ -50,7 +50,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test31AD2Ev
-// CHECK-NOT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test31AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK-NOT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test31AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A() {
   
 }
@@ -76,7 +76,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test41AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test41AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test41AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -100,7 +100,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test51AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test51AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test51AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -128,7 +128,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test61AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test61AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test61AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -154,7 +154,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test71AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test71AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test71AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -180,7 +180,7 @@
 };
 
 // CHECK-LABEL: define void @_ZN5Test81AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test81AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test81AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
diff --git a/test/CodeGenCXX/static-destructor.cpp b/test/CodeGenCXX/static-destructor.cpp
new file mode 100644
index 0000000..0ea84f8
--- /dev/null
+++ b/test/CodeGenCXX/static-destructor.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 %s -triple=x86_64-pc-linux -emit-llvm -o - | FileCheck --check-prefix=X86 %s
+// RUN: %clang_cc1 %s -triple=wasm32 -emit-llvm -o - | FileCheck --check-prefix=WASM %s
+// RUN: %clang_cc1 %s -triple=armv7-apple-darwin9 -emit-llvm -o - | FileCheck --check-prefix=ARM %s
+
+// Test that destructors are not passed directly to __cxa_atexit when their
+// signatures do not match the type of __cxa_atexit's first argument.
+// E.g. ARM and WebAssembly have destructors that return this instead of void.
+
+
+class Foo {
+ public:
+  ~Foo() {
+  }
+};
+
+Foo global;
+
+// X86 destructors have void return, and are registered directly with __cxa_atexit.
+// X86: define internal void @__cxx_global_var_init()
+// X86:   call i32 @__cxa_atexit(void (i8*)* bitcast (void (%class.Foo*)* @_ZN3FooD1Ev to void (i8*)*), i8* getelementptr inbounds (%class.Foo, %class.Foo* @global, i32 0, i32 0), i8* @__dso_handle)
+
+// ARM destructors return this, but can be registered directly with __cxa_atexit
+// because the calling conventions tolerate the mismatch.
+// ARM: define internal void @__cxx_global_var_init()
+// ARM:   call i32 @__cxa_atexit(void (i8*)* bitcast (%class.Foo* (%class.Foo*)* @_ZN3FooD1Ev to void (i8*)*), i8* getelementptr inbounds (%class.Foo, %class.Foo* @global, i32 0, i32 0), i8* @__dso_handle)
+
+// Wasm destructors return this, and use a wrapper function, which is registered
+// with __cxa_atexit.
+// WASM: define internal void @__cxx_global_var_init()
+// WASM: call i32 @__cxa_atexit(void (i8*)* @__cxx_global_array_dtor, i8* null, i8* @__dso_handle)
+
+// WASM: define internal void @__cxx_global_array_dtor(i8*)
+// WASM: %call = call %class.Foo* @_ZN3FooD1Ev(%class.Foo* @global)
diff --git a/test/CodeGenCXX/static-init.cpp b/test/CodeGenCXX/static-init.cpp
index 541f641..bb97494 100644
--- a/test/CodeGenCXX/static-init.cpp
+++ b/test/CodeGenCXX/static-init.cpp
@@ -26,7 +26,7 @@
 }
 
 void g() {
-  // CHECK: call noalias i8* @_Znwm(i64 1)
+  // CHECK: call i8* @_Znwm(i64 1)
   // CHECK: call void @_ZN1AC1Ev(
   static A& a = *new A;
 }
diff --git a/test/CodeGenCXX/strict-vtable-pointers.cpp b/test/CodeGenCXX/strict-vtable-pointers.cpp
index ee39191..33f6399 100644
--- a/test/CodeGenCXX/strict-vtable-pointers.cpp
+++ b/test/CodeGenCXX/strict-vtable-pointers.cpp
@@ -166,13 +166,13 @@
 
 
 // CHECK-CTORS: %[[THIS10:.*]] = bitcast %struct.DynamicDerivedMultiple* %[[THIS0]] to i32 (...)***
-// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i64 0, i64 2) {{.*}} %[[THIS10]]
+// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i32 0, i32 2) {{.*}} %[[THIS10]]
 // CHECK-CTORS: %[[THIS11:.*]] = bitcast %struct.DynamicDerivedMultiple* %[[THIS0]] to i8*
 // CHECK-CTORS: %[[THIS_ADD:.*]] = getelementptr inbounds i8, i8* %[[THIS11]], i64 16
 // CHECK-CTORS: %[[THIS12:.*]]  = bitcast i8* %[[THIS_ADD]] to i32 (...)***
 
 
-// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i64 0, i64 6) {{.*}} %[[THIS12]]
+// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i32 0, i32 6) {{.*}} %[[THIS12]]
 // CHECK-CTORS-LABEL: }
 
 struct DynamicFromStatic;
diff --git a/test/CodeGenCXX/switch-case-folding-2.cpp b/test/CodeGenCXX/switch-case-folding-2.cpp
index b0bbf32..558ca3c 100644
--- a/test/CodeGenCXX/switch-case-folding-2.cpp
+++ b/test/CodeGenCXX/switch-case-folding-2.cpp
@@ -18,4 +18,13 @@
  return test(5);
 }
 
+void other_test() {
+  switch(0) {
+  case 0:
+    do {
+    default:;
+    } while(0);
+  }
+}
+
 // CHECK: call i32 (i8*, ...) @_Z6printfPKcz
diff --git a/test/CodeGenCXX/type-metadata.cpp b/test/CodeGenCXX/type-metadata.cpp
new file mode 100644
index 0000000..076b1fd
--- /dev/null
+++ b/test/CodeGenCXX/type-metadata.cpp
@@ -0,0 +1,248 @@
+// Tests for the cfi-vcall feature:
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM --check-prefix=NDIAG %s
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM --check-prefix=ITANIUM-DIAG --check-prefix=DIAG --check-prefix=DIAG-ABORT %s
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -fsanitize-recover=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM --check-prefix=ITANIUM-DIAG --check-prefix=DIAG --check-prefix=DIAG-RECOVER %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=MS --check-prefix=TT-MS --check-prefix=NDIAG %s
+
+// Tests for the whole-program-vtables feature:
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=VTABLE-OPT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=VTABLE-OPT --check-prefix=MS --check-prefix=TT-MS %s
+
+// Tests for cfi + whole-program-vtables:
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-VT --check-prefix=ITANIUM --check-prefix=TC-ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-VT --check-prefix=MS --check-prefix=TC-MS %s
+
+// ITANIUM: @_ZTV1A = {{[^!]*}}, !type [[A16:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL16:![0-9]+]]
+
+// ITANIUM: @_ZTV1B = {{[^!]*}}, !type [[A32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32:![0-9]+]]
+// ITANIUM-SAME: !type [[B32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTV1C = {{[^!]*}}, !type [[A32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[C32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// DIAG: @[[SRC:.*]] = private unnamed_addr constant [{{.*}} x i8] c"{{.*}}type-metadata.cpp\00", align 1
+// DIAG: @[[TYPE:.*]] = private unnamed_addr constant { i16, i16, [4 x i8] } { i16 -1, i16 0, [4 x i8] c"'A'\00" }
+// DIAG: @[[BADTYPESTATIC:.*]] = private unnamed_addr global { i8, { [{{.*}} x i8]*, i32, i32 }, { i16, i16, [4 x i8] }* } { i8 0, { [{{.*}} x i8]*, i32, i32 } { [{{.*}} x i8]* @[[SRC]], i32 123, i32 3 }, { i16, i16, [4 x i8] }* @[[TYPE]] }
+
+// ITANIUM: @_ZTVN12_GLOBAL__N_11DE = {{[^!]*}}, !type [[A32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[B32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[C88:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL88:![0-9]+]]
+// ITANIUM-SAME: !type [[D32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTCN12_GLOBAL__N_11DE0_1B = {{[^!]*}}, !type [[A32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[B32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTCN12_GLOBAL__N_11DE8_1C = {{[^!]*}}, !type [[A64:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL64:![0-9]+]]
+// ITANIUM-SAME: !type [[C32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTVZ3foovE2FA = {{[^!]*}}, !type [[A16]]
+// ITANIUM-DIAG-SAME: !type [[ALL16]]
+// ITANIUM-SAME: !type [[FA16:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL16]]
+
+// MS: comdat($"\01??_7A@@6B@"), !type [[A8:![0-9]+]]
+// MS: comdat($"\01??_7B@@6B0@@"), !type [[B8:![0-9]+]]
+// MS: comdat($"\01??_7B@@6BA@@@"), !type [[A8]]
+// MS: comdat($"\01??_7C@@6B@"), !type [[A8]]
+// MS: comdat($"\01??_7D@?A@@6BB@@@"), !type [[B8]], !type [[D8:![0-9]+]]
+// MS: comdat($"\01??_7D@?A@@6BA@@@"), !type [[A8]]
+// MS: comdat($"\01??_7FA@?1??foo@@YAXXZ@6B@"), !type [[A8]], !type [[FA8:![0-9]+]]
+
+struct A {
+  A();
+  virtual void f();
+};
+
+struct B : virtual A {
+  B();
+  virtual void g();
+  virtual void h();
+};
+
+struct C : virtual A {
+  C();
+};
+
+namespace {
+
+struct D : B, C {
+  D();
+  virtual void f();
+  virtual void h();
+};
+
+}
+
+A::A() {}
+B::B() {}
+C::C() {}
+D::D() {}
+
+void A::f() {
+}
+
+void B::g() {
+}
+
+void D::f() {
+}
+
+void D::h() {
+}
+
+// ITANIUM: define hidden void @_Z2afP1A
+// MS: define void @"\01?af@@YAXPEAUA@@@Z"
+void af(A *a) {
+  // TT-ITANIUM: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* [[VT:%[^ ]*]], metadata !"_ZTS1A")
+  // TT-MS: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* [[VT:%[^ ]*]], metadata !"?AUA@@")
+  // TC-ITANIUM: [[PAIR:%[^ ]*]] = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"_ZTS1A")
+  // TC-MS: [[PAIR:%[^ ]*]] = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUA@@")
+  // CFI-VT: [[P:%[^ ]*]] = extractvalue { i8*, i1 } [[PAIR]], 1
+  // DIAG-NEXT: [[VTVALID0:%[^ ]*]] = call i1 @llvm.type.test(i8* [[VT]], metadata !"all-vtables")
+  // VTABLE-OPT: call void @llvm.assume(i1 [[P]])
+  // CFI-NEXT: br i1 [[P]], label %[[CONTBB:[^ ,]*]], label %[[TRAPBB:[^ ,]*]]
+  // CFI-NEXT: {{^$}}
+
+  // CFI: [[TRAPBB]]
+  // NDIAG-NEXT: call void @llvm.trap()
+  // NDIAG-NEXT: unreachable
+  // DIAG-NEXT: [[VTINT:%[^ ]*]] = ptrtoint i8* [[VT]] to i64
+  // DIAG-NEXT: [[VTVALID:%[^ ]*]] = zext i1 [[VTVALID0]] to i64
+  // DIAG-ABORT-NEXT: call void @__ubsan_handle_cfi_check_fail_abort(i8* getelementptr inbounds ({{.*}} @[[BADTYPESTATIC]], i32 0, i32 0), i64 [[VTINT]], i64 [[VTVALID]])
+  // DIAG-ABORT-NEXT: unreachable
+  // DIAG-RECOVER-NEXT: call void @__ubsan_handle_cfi_check_fail(i8* getelementptr inbounds ({{.*}} @[[BADTYPESTATIC]], i32 0, i32 0), i64 [[VTINT]], i64 [[VTVALID]])
+  // DIAG-RECOVER-NEXT: br label %[[CONTBB]]
+
+  // CFI: [[CONTBB]]
+  // CFI-NVT: [[PTR:%[^ ]*]] = load
+  // CFI-VT: [[PTRI8:%[^ ]*]] = extractvalue { i8*, i1 } [[PAIR]], 0
+  // CFI-VT: [[PTR:%[^ ]*]] = bitcast i8* [[PTRI8]] to
+  // CFI: call void [[PTR]]
+#line 123
+  a->f();
+}
+
+// ITANIUM: define internal void @_Z3df1PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?df1@@YAXPEAUD@?A@@@Z"
+void df1(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"?AUA@@")
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata ![[DTYPE:[0-9]+]])
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUA@@")
+  d->f();
+}
+
+// ITANIUM: define internal void @_Z3dg1PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?dg1@@YAXPEAUD@?A@@@Z"
+void dg1(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"?AUB@@")
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 8, metadata !"_ZTS1B")
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUB@@")
+  d->g();
+}
+
+// ITANIUM: define internal void @_Z3dh1PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?dh1@@YAXPEAUD@?A@@@Z"
+void dh1(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata ![[DTYPE]])
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 16, metadata ![[DTYPE]])
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 8, metadata ![[DTYPE:[0-9]+]])
+  d->h();
+}
+
+// ITANIUM: define internal void @_Z3df2PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?df2@@YAXPEAUD@?A@@@Z"
+__attribute__((no_sanitize("cfi")))
+void df2(D *d) {
+  // CFI-NVT-NOT: call i1 @llvm.type.test
+  // CFI-VT: [[P:%[^ ]*]] = call i1 @llvm.type.test
+  // CFI-VT: call void @llvm.assume(i1 [[P]])
+  d->f();
+}
+
+// ITANIUM: define internal void @_Z3df3PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?df3@@YAXPEAUD@?A@@@Z"
+__attribute__((no_sanitize("address"))) __attribute__((no_sanitize("cfi-vcall")))
+void df3(D *d) {
+  // CFI-NVT-NOT: call i1 @llvm.type.test
+  // CFI-VT: [[P:%[^ ]*]] = call i1 @llvm.type.test
+  // CFI-VT: call void @llvm.assume(i1 [[P]])
+  d->f();
+}
+
+D d;
+
+void foo() {
+  df1(&d);
+  dg1(&d);
+  dh1(&d);
+  df2(&d);
+  df3(&d);
+
+  struct FA : A {
+    void f() {}
+  } fa;
+  af(&fa);
+}
+
+namespace test2 {
+
+struct A {
+  virtual void m_fn1();
+};
+struct B {
+  virtual void m_fn2();
+};
+struct C : B, A {};
+struct D : C {
+  void m_fn1();
+};
+
+// ITANIUM: define hidden void @_ZN5test21fEPNS_1DE
+// MS: define void @"\01?f@test2@@YAXPEAUD@1@@Z"
+void f(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTSN5test21DE")
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"?AUA@test2@@")
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 8, metadata !"_ZTSN5test21DE")
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUA@test2@@")
+  d->m_fn1();
+}
+
+}
+
+// ITANIUM: [[A16]] = !{i64 16, !"_ZTS1A"}
+// ITANIUM-DIAG: [[ALL16]] = !{i64 16, !"all-vtables"}
+// ITANIUM: [[A32]] = !{i64 32, !"_ZTS1A"}
+// ITANIUM-DIAG: [[ALL32]] = !{i64 32, !"all-vtables"}
+// ITANIUM: [[B32]] = !{i64 32, !"_ZTS1B"}
+// ITANIUM: [[C32]] = !{i64 32, !"_ZTS1C"}
+// ITANIUM: [[C88]] = !{i64 88, !"_ZTS1C"}
+// ITANIUM-DIAG: [[ALL88]] = !{i64 88, !"all-vtables"}
+// ITANIUM: [[D32]] = !{i64 32, [[D_ID:![0-9]+]]}
+// ITANIUM: [[D_ID]] = distinct !{}
+// ITANIUM: [[A64]] = !{i64 64, !"_ZTS1A"}
+// ITANIUM-DIAG: [[ALL64]] = !{i64 64, !"all-vtables"}
+// ITANIUM: [[FA16]] = !{i64 16, [[FA_ID:![0-9]+]]}
+// ITANIUM: [[FA_ID]] = distinct !{}
+
+// MS: [[A8]] = !{i64 8, !"?AUA@@"}
+// MS: [[B8]] = !{i64 8, !"?AUB@@"}
+// MS: [[D8]] = !{i64 8, [[D_ID:![0-9]+]]}
+// MS: [[D_ID]] = distinct !{}
+// MS: [[FA8]] = !{i64 8, [[FA_ID:![0-9]+]]}
+// MS: [[FA_ID]] = distinct !{}
diff --git a/test/CodeGenCXX/virtual-base-ctor.cpp b/test/CodeGenCXX/virtual-base-ctor.cpp
index 20a88cd..8c8c310 100644
--- a/test/CodeGenCXX/virtual-base-ctor.cpp
+++ b/test/CodeGenCXX/virtual-base-ctor.cpp
@@ -8,4 +8,4 @@
 struct B : virtual A { void* x; };    
 B x;
 
-// CHECK: @y = global i8 2
+// CHECK: @y = local_unnamed_addr global i8 2
diff --git a/test/CodeGenCXX/virtual-function-attrs.cpp b/test/CodeGenCXX/virtual-function-attrs.cpp
new file mode 100644
index 0000000..3a9a1a2
--- /dev/null
+++ b/test/CodeGenCXX/virtual-function-attrs.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 %s -triple %itanium_abi_triple -std=c++11 -emit-llvm -o - | FileCheck %s
+
+class A {
+  virtual void f();
+  virtual void g();
+  virtual ~A();
+};
+
+void A::f() {}
+
+// CHECK: define [[CC:(x86_thiscallcc )?]]void @_ZN1A1fEv({{.*}}) unnamed_addr
+// CHECK: declare [[CC]]void @_ZN1A1gEv({{.*}}) unnamed_addr
+// CHECK: declare {{.*}} @_ZN1AD1Ev({{.*}}) unnamed_addr
+// CHECK: declare [[CC]]void @_ZN1AD0Ev({{.*}}) unnamed_addr
diff --git a/test/CodeGenCXX/vla-consruct.cpp b/test/CodeGenCXX/vla-consruct.cpp
new file mode 100644
index 0000000..fd8314a
--- /dev/null
+++ b/test/CodeGenCXX/vla-consruct.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -O0 %s -emit-llvm -o - | FileCheck %s
+
+extern "C" int printf(const char *, ...);
+
+static int N;
+struct S {
+  S()
+  __attribute__((nothrow)) { printf("%d: S()\n", ++N); }
+  ~S() __attribute__((nothrow)) { printf("%d: ~S()\n", N--); }
+  int n[17];
+};
+// CHECK: [[struct_S:%.+]] = type { [17 x i32] }
+void print(int n, int a, int b, int c, int d) {
+  printf("n=%d\n,sizeof(S)=%d\nsizeof(array_t[0][0])=%d\nsizeof(array_t[0])=%d\nsizeof(array_t)=%d\n",
+         n, a, b, c, d);
+  if (n == 2)
+    throw(n);
+}
+
+void test(int n) {
+  // CHECK: define void {{.*test.*}}(i32 [[n:%.+]]) #
+  // CHECK: [[n_addr:%.+]] = alloca
+  // CHECK-NEXT: [[saved_stack:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_S:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_array_t_0_0:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_array_t_0:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_array_t:%.+]] = alloca
+  // CHECK-NEXT: [[exn_slot:%.+]] = alloca i8*
+  // CHECK-NEXT: [[ehselector_slot:%.+]] = alloca i32
+  // CHECK-NEXT: store i32 [[n]], i32* [[n_addr]]
+  // CHECK-NEXT: [[t0:%.+]] = load i32, i32* [[n_addr]]
+  // CHECK-NEXT: [[t1:%.+]] = zext i32 [[t0]] to i64
+  // CHECK-NEXT: [[t2:%.+]] = load i32, i32* [[n_addr]]
+  // CHECK-NEXT: [[add:%.+]] = add nsw i32 [[t2]], 1
+  // CHECK-NEXT: [[t3:%.+]] = zext i32 [[add]] to i64
+  // CHECK-NEXT: [[t4:%.+]] = call i8* @llvm.stacksave()
+  // CHECK-NEXT: store i8* [[t4]], i8** [[saved_stack]]
+  // CHECK-NEXT: [[t5:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  // CHECK-NEXT: [[vla:%.+]] = alloca [[struct_S]], i64 [[t5]]
+  // CHECK-NEXT: [[t6:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  // CHECK-NEXT: [[isempty:%.+]] = icmp eq i64 [[t6]], 0
+  // CHECK-NEXT: br i1 [[isempty]], label %[[arrayctor_cont:.+]], label %[[new_ctorloop:.+]]
+
+  S array_t[n][n + 1];
+
+  // CHECK: [[new_ctorloop]]
+  // CHECK-NEXT: [[arrayctor_end:%.+]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[vla]], i64 [[t6]]
+  // CHECK-NEXT: br label %[[arrayctor_loop:.+]]
+
+  // CHECK: [[arrayctor_loop]]
+  // CHECK-NEXT: [[arrayctor_cur:%.+]] = phi [[struct_S]]* [ [[vla]], %[[new_ctorloop]] ], [ [[arrayctor_next:%.+]], %[[arrayctor_loop]] ]
+  // CHECK-NEXT: call void [[ctor:@.+]]([[struct_S]]* [[arrayctor_cur]])
+  // CHECK-NEXT: [[arrayctor_next]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[arrayctor_cur]], i64 1
+  // CHECK-NEXT: [[arrayctor_done:%.+]] = icmp eq [[struct_S]]* [[arrayctor_next]], [[arrayctor_end]]
+  // CHECK-NEXT: br i1 [[arrayctor_done]], label %[[arrayctor_cont]], label %[[arrayctor_loop]]
+
+  int sizeof_S = sizeof(S);
+  int sizeof_array_t_0_0 = sizeof(array_t[0][0]);
+  int sizeof_array_t_0 = sizeof(array_t[0]);
+  int sizeof_array_t = sizeof(array_t);
+  print(n, sizeof_S, sizeof_array_t_0_0, sizeof_array_t_0, sizeof_array_t);
+
+  //  CHECK: [[arrayctor_cont]]
+  //  CHECK-NEXT: store i32 68, i32* [[sizeof_S]]
+  //  CHECK-NEXT: store i32 68, i32* [[sizeof_array_t_0_0]]
+  //  CHECK: [[t8:%.+]] = mul nuw i64 68, [[t3]]
+  //  CHECK-NEXT: [[conv:%.+]] = trunc i64 [[t8]] to i32
+  //  CHECK-NEXT: store i32 [[conv]], i32* [[sizeof_array_t_0]]
+  //  CHECK-NEXT: [[t9:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  //  CHECK-NEXT: [[t10:%.+]] = mul nuw i64 68, [[t9]]
+  //  CHECK-NEXT: [[conv1:%.+]] = trunc i64 [[t10]] to i32
+  //  CHECK-NEXT: store i32 [[conv1]], i32* [[sizeof_array_t]]
+  //  CHECK-NEXT: [[t11:%.+]] = load i32, i32* [[n_addr:%.+]]
+  //  CHECK-NEXT: [[t12:%.+]] = load i32, i32* [[sizeof_S]]
+  //  CHECK-NEXT: [[t13:%.+]] = load i32, i32* [[sizeof_array_t_0_0]]
+  //  CHECK-NEXT: [[t14:%.+]] = load i32, i32* [[sizeof_array_t_0]]
+  //  CHECK-NEXT: [[t15:%.+]] = load i32, i32* [[sizeof_array_t]]
+  //  CHECK-NEXT: invoke void @{{.*print.*}}(i32 [[t11]], i32 [[t12]], i32 [[t13]], i32 [[t14]], i32 [[t15]])
+  //  CHECK-NEXT: to label %[[invoke_cont:.+]] unwind label %[[lpad:.+]]
+
+  //  CHECK: [[invoke_cont]]
+  //  CHECK-NEXT: [[t16:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  //  CHECK-NEXT: [[t17:%.+]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[vla]], i64 [[t16]]
+  //  CHECK-NEXT: [[arraydestroy_isempty:%.+]] = icmp eq [[struct_S]]* [[vla]], [[t17]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_isempty]], label %[[arraydestroy_done2:.+]], label %[[arraydestroy_body:.+]]
+
+  //  CHECK: [[arraydestroy_body]]
+  //  CHECK-NEXT: [[arraydestroy_elementPast:%.+]] = phi [[struct_S]]* [ [[t17]], %[[invoke_cont]] ], [ [[arraydestroy_element:%.+]], %[[arraydestroy_body]] ]
+  //  CHECK-NEXT: [[arraydestroy_element]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[arraydestroy_elementPast]]
+  //  CHECK-NEXT: call void @[[dtor:.+]]([[struct_S]]* [[arraydestroy_element]])
+  //  CHECK-NEXT: [[arraydestroy_done:%.+]] = icmp eq [[struct_S]]* [[arraydestroy_element]], [[vla]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_done]], label %[[arraydestroy_done2]], label %[[arraydestroy_body]]
+
+  //  CHECK: [[arraydestroy_done2]]
+  //  CHECK-NEXT: [[t17:%.+]] = load i8*, i8** [[saved_stack]]
+  //  CHECK-NEXT: call void @llvm.stackrestore(i8* [[t17]])
+  //  CHECK: ret void
+
+  //  CHECK: [[lpad]]
+  //  CHECK-NEXT: [[t19:%.+]] = landingpad { i8*, i32 }
+  //  CHECK: [[t20:%.+]] = extractvalue { i8*, i32 } [[t19]], 0
+  //  CHECK-NEXT: store i8* [[t20]], i8** [[exn_slot]]
+  //  CHECK-NEXT: [[t21:%.+]] = extractvalue { i8*, i32 } [[t19]], 1
+  //  CHECK-NEXT: store i32 [[t21]], i32* [[ehselector_slot]]
+  //  CHECK-NEXT: [[t22:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  //  CHECK-NEXT: [[t23:%.+]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[vla]], i64 [[t22]]
+  //  CHECK-NEXT: [[arraydestroy_isempty3:%.+]] = icmp eq [[struct_S]]* [[vla]], [[t23]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_isempty3]], label %[[arraydestroy_done8:.+]], label %[[arraydestroy_body4:.+]]
+
+  //  CHECK: [[arraydestroy_body4]]
+  //  CHECK: [[arraydestroy_elementPast5:%.+]] = phi [[struct_S]]* [ [[t23]], %[[lpad]] ], [ [[arraydestroy_element6:.+]], %[[arraydestroy_body4]] ]
+  //  CHECK-NEXT: [[arraydestroy_element6]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[arraydestroy_elementPast5]], i64 -1
+  //  CHECK-NEXT: call void @[[dtor]]([[struct_S]]* [[arraydestroy_element6]])
+  //  CHECK-NEXT: [[arraydestroy_done7:%.+]] = icmp eq [[struct_S]]* [[arraydestroy_element6]], [[vla]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_done7]], label %[[arraydestroy_done8]], label %[[arraydestroy_body4]]
+
+  //  CHECK: [[arraydestroy_done8]]
+  //  CHECK-NEXT: br label %[[eh_resume:.+]]
+
+  //  CHECK: [[eh_resume]]
+  //  CHECK-NEXT: [[exn:%.+]] = load i8*, i8** [[exn_slot]]
+  //  CHECK-NEXT: [[sel:%.+]] = load i32, i32* [[ehselector_slot]]
+  //  CHECK-NEXT: [[lpad_val:%.+]] = insertvalue { i8*, i32 } undef, i8* [[exn]], 0
+  //  CHECK-NEXT: [[lpad_val9:%.+]] = insertvalue { i8*, i32 } [[lpad_val]], i32 [[sel]], 1
+  //  CHECK-NEXT: resume { i8*, i32 } [[lpad_val9]]
+}
+
+int main() {
+  try {
+    test(2);
+  } catch (int e) {
+    printf("expeption %d\n", e);
+  }
+  try {
+    test(3);
+  } catch (int e) {
+    printf("expeption %d", e);
+  }
+}
diff --git a/test/CodeGenCXX/vtable-assume-load.cpp b/test/CodeGenCXX/vtable-assume-load.cpp
index 30cfc00..819b09d 100644
--- a/test/CodeGenCXX/vtable-assume-load.cpp
+++ b/test/CodeGenCXX/vtable-assume-load.cpp
@@ -27,7 +27,7 @@
 // CHECK1-LABEL: define void @_ZN5test14fooAEv()
 // CHECK1: call void @_ZN5test11AC1Ev(%"struct.test1::A"*
 // CHECK1: %[[VTABLE:.*]] = load i8**, i8*** %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11AE, i64 0, i64 2)
+// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11AE, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: }
 
@@ -39,7 +39,7 @@
 // CHECK1-LABEL: define void @_ZN5test14fooBEv()
 // CHECK1: call void @_ZN5test11BC1Ev(%"struct.test1::B"* %{{.*}})
 // CHECK1: %[[VTABLE:.*]] = load i8**, i8*** %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11BE, i64 0, i64 2)
+// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11BE, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: }
 
@@ -73,14 +73,14 @@
 // CHECK2-LABEL: define void @_ZN5test24testEv()
 // CHECK2: call void @_ZN5test21CC1Ev(%"struct.test2::C"*
 // CHECK2: %[[VTABLE:.*]] = load i8**, i8*** {{.*}}
-// CHECK2: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i64 0, i64 2)
+// CHECK2: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i32 0, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP]])
 
 // CHECK2: %[[V2:.*]] = bitcast %"struct.test2::C"* %{{.*}} to i8*
 // CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, i8* %[[V2]], i64 8
 // CHECK2: %[[V3:.*]] = bitcast i8* %[[ADD_PTR]] to i8***
 // CHECK2: %[[VTABLE2:.*]] = load i8**, i8*** %[[V3]]
-// CHECK2: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i64 0, i64 5)
+// CHECK2: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i32 0, i32 5)
 // CHECK2: call void @llvm.assume(i1 %[[CMP2]])
 
 // CHECK2: call void @_ZN5test21gEPNS_1AE(
@@ -111,7 +111,7 @@
 
 // CHECK3-LABEL: define void @_ZN5test34testEv()
 // CHECK3: call void @_ZN5test31CC1Ev(%"struct.test3::C"*
-// CHECK3: %[[CMP:.*]] = icmp eq i8** %{{.*}}, getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5test31CE, i64 0, i64 3)
+// CHECK3: %[[CMP:.*]] = icmp eq i8** %{{.*}}, getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5test31CE, i32 0, i32 3)
 // CHECK3: call void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
 void test() {
@@ -140,11 +140,11 @@
 // CHECK4-LABEL: define void @_ZN5test44testEv()
 // CHECK4: call void @_ZN5test41CC1Ev(%"struct.test4::C"*
 // CHECK4: %[[VTABLE:.*]] = load i8**, i8*** %{{.*}}
-// CHECK4: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i64 0, i64 4)
+// CHECK4: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP]]
 
 // CHECK4: %[[VTABLE2:.*]] = load i8**, i8*** %{{.*}}
-// CHECK4: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i64 0, i64 4)
+// CHECK4: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: }
 
diff --git a/test/CodeGenCXX/vtable-linkage.cpp b/test/CodeGenCXX/vtable-linkage.cpp
index ff398ff..0b556d1 100644
--- a/test/CodeGenCXX/vtable-linkage.cpp
+++ b/test/CodeGenCXX/vtable-linkage.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 %s -triple=x86_64-pc-linux -emit-llvm -o %t
 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin10 -disable-llvm-optzns -O3 -emit-llvm -o %t.opt
-// RUN: FileCheck --check-prefix=CHECK %s < %t
+// RUN: FileCheck %s < %t
 // RUN: FileCheck --check-prefix=CHECK-OPT %s < %t.opt
 
 namespace {
diff --git a/test/CodeGenCXX/vtable-pointer-initialization.cpp b/test/CodeGenCXX/vtable-pointer-initialization.cpp
index 2854291..130a55c 100644
--- a/test/CodeGenCXX/vtable-pointer-initialization.cpp
+++ b/test/CodeGenCXX/vtable-pointer-initialization.cpp
@@ -21,13 +21,13 @@
 
 // CHECK-LABEL: define void @_ZN1AC2Ev(%struct.A* %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldC1Ev(
 // CHECK: ret void
 A::A() { }
 
 // CHECK-LABEL: define void @_ZN1AD2Ev(%struct.A* %this) unnamed_addr
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
@@ -49,12 +49,12 @@
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(%struct.B* %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldC1Ev
 // CHECK: ret void
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(%struct.B* %this) unnamed_addr
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
diff --git a/test/CodeGenCXX/vtt-layout.cpp b/test/CodeGenCXX/vtt-layout.cpp
index 2f441ff..7ff93dd 100644
--- a/test/CodeGenCXX/vtt-layout.cpp
+++ b/test/CodeGenCXX/vtt-layout.cpp
@@ -78,9 +78,11 @@
   }
 }
 
-// CHECK: @_ZTTN5Test11BE = unnamed_addr constant [1 x i8*] [i8* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5Test11BE, i64 0, i64 3) to i8*)]
+// CHECK: @_ZTTN5Test11BE = unnamed_addr constant [1 x i8*] [i8* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5Test11BE, i32 0, i32 3) to i8*)]
 // CHECK: @_ZTVN5Test51AE = unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTIN5Test51AE to i8*), i8* bitcast (void ()* @__cxa_pure_virtual to i8*), i8* bitcast (void (%"struct.Test5::A"*)* @_ZN5Test51A6anchorEv to i8*)]
 // CHECK: @_ZTVN5Test61AE = unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTIN5Test61AE to i8*), i8* bitcast (void ()* @__cxa_deleted_virtual to i8*), i8* bitcast (void (%"struct.Test6::A"*)* @_ZN5Test61A6anchorEv to i8*)]
-// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr constant [2 x i8*] [i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i64 0, i64 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i64 0, i64 4) to i8*)] 
-// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr constant [13 x i8*] [i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 5) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i64 0, i64 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 1, i64 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i64 0, i64 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i64 0, i64 6) to i8*)] 
-// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr constant [19 x i8*] [i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i64 0, i64 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i64 0, i64 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 12) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 18) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 17) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 20) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 1, i64 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i64 0, i64 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i64 0, i64 4) to i8*), i8* 
bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i64 0, i64 10) to i8*)] 
+// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr constant [2 x i8*] [i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i32 0, i32 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i32 0, i32 4) to i8*)] 
+// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr constant [13 x i8*] [i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 5) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 1, i32 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 6) to i8*)] 
+// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr constant [19 x i8*] [i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 12) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 18) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 17) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 20) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 1, i32 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 4) to i8*), i8* 
bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 10) to i8*)] 
+// CHECK: declare void @__cxa_pure_virtual() unnamed_addr
+// CHECK: declare void @__cxa_deleted_virtual() unnamed_addr
diff --git a/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp b/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp
new file mode 100644
index 0000000..235d8a0
--- /dev/null
+++ b/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -triple thumbv7--windows-msvc -S -emit-llvm -o - -x c++ %s | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv7--windows-itanium -fno-use-cxa-atexit -S -emit-llvm -o - -x c++ %s | FileCheck %s
+
+class C {
+public:
+  ~C();
+};
+
+static C sc;
+void f(const C &ci) { sc = ci; }
+
+// CHECK: atexit
+
diff --git a/test/CodeGenObjC/2009-08-05-utf16.m b/test/CodeGenObjC/2009-08-05-utf16.m
index 18ac1db..92394d9 100644
--- a/test/CodeGenObjC/2009-08-05-utf16.m
+++ b/test/CodeGenObjC/2009-08-05-utf16.m
@@ -1,4 +1,6 @@
-// RUN: %clang_cc1 -emit-llvm -w -x objective-c %s -o - | FileCheck %s
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm -w -x objective-c %s -o - | FileCheck %s
 // rdar://7095855 rdar://7115749
 
 // CHECK: private unnamed_addr constant [6 x i16] [i16 105, i16 80, i16 111, i16 100, i16 8482, i16 0], section "__TEXT,__ustring", align 2
diff --git a/test/CodeGenObjC/2010-02-01-utf16-with-null.m b/test/CodeGenObjC/2010-02-01-utf16-with-null.m
index 7c103f2..097a3ea 100644
--- a/test/CodeGenObjC/2010-02-01-utf16-with-null.m
+++ b/test/CodeGenObjC/2010-02-01-utf16-with-null.m
@@ -2,6 +2,6 @@
 // rdar://7589850
 
 // CHECK: @.str = private unnamed_addr constant [9 x i16] [i16 103, i16 111, i16 111, i16 100, i16 0, i16 98, i16 121, i16 101, i16 0], section "__TEXT,__ustring", align 2
-// CHECK: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([9 x i16]* @.str to i8*), i32 8 }, section "__DATA,__cfstring"
+// CHECK: @_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([9 x i16]* @.str to i8*), i32 8 }, section "__DATA,__cfstring"
 // CHECK: @P = global i8* bitcast (%struct.__NSConstantString_tag* @_unnamed_cfstring_ to i8*), align 4
 void *P = @"good\0bye";
diff --git a/test/CodeGenObjC/constant-strings.m b/test/CodeGenObjC/constant-strings.m
index 0a65496..a1daa92 100644
--- a/test/CodeGenObjC/constant-strings.m
+++ b/test/CodeGenObjC/constant-strings.m
@@ -1,15 +1,17 @@
-// RUN: %clang_cc1 -emit-llvm -o %t %s
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm -o %t %s
 // RUN: FileCheck --check-prefix=CHECK-NEXT < %t %s
 
 // Check that we set alignment 1 on the string.
 //
 // CHECK-NEXT: @.str = {{.*}}constant [13 x i8] c"Hello World!\00", section "__TEXT,__cstring,cstring_literals", align 1
 
-// RUN: %clang_cc1 -fobjc-runtime=gcc -emit-llvm -o %t %s
+// RUN: %clang_cc1 -triple x86_64-macho -fobjc-runtime=gcc -emit-llvm -o %t %s
 // RUN: FileCheck --check-prefix=CHECK-GNU < %t %s
 // CHECK-GNU: NXConstantString
 
-// RUN: %clang_cc1 -fobjc-runtime=gcc -fconstant-string-class NSConstantString -emit-llvm -o %t %s
+// RUN: %clang_cc1 -triple x86_64-macho -fobjc-runtime=gcc -fconstant-string-class NSConstantString -emit-llvm -o %t %s
 // RUN: FileCheck --check-prefix=CHECK-GNU-WITH-CLASS < %t %s
 // CHECK-GNU-WITH-CLASS: NSConstantString
 id a = @"Hello World!";
diff --git a/test/CodeGenObjC/debug-info-nodebug.m b/test/CodeGenObjC/debug-info-nodebug.m
new file mode 100644
index 0000000..42d630b
--- /dev/null
+++ b/test/CodeGenObjC/debug-info-nodebug.m
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple arm-apple-ios -emit-llvm -debug-info-kind=limited -fblocks  %s -o - | FileCheck %s
+// Objective-C code cargo-culted from debug-info-lifetime-crash.m.
+@protocol NSObject
+- (id)copy;
+@end
+@class W;
+@interface View1
+@end
+@implementation Controller {
+    void (^Block)(void);
+}
+- (void)View:(View1 *)View foo:(W *)W
+{
+  // The reference from inside the block implicitly creates another
+  // local variable for the referenced member. That is what gets
+  // suppressed by the attribute.  It still gets debug info as a
+  // member, though.
+  // CHECK-NOT: !DILocalVariable(name: "weakSelf"
+  // CHECK:     !DIDerivedType({{.*}} name: "weakSelf"
+  // CHECK-NOT: !DILocalVariable(name: "weakSelf"
+  __attribute__((nodebug)) __typeof(self) weakSelf = self;
+  Block = [^{
+    __typeof(self) strongSelf = weakSelf;
+    } copy];
+}
+@end
diff --git a/test/CodeGenObjC/dllstorage.m b/test/CodeGenObjC/dllstorage.m
new file mode 100644
index 0000000..4bdbd50
--- /dev/null
+++ b/test/CodeGenObjC/dllstorage.m
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=macosx -fdeclspec -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -check-prefix CHECK-IR %s
+// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=objfw -fdeclspec -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FW %s
+
+// CHECK-IR-DAG: @_objc_empty_cache = external dllimport global %struct._objc_cache
+
+__declspec(dllimport)
+@interface I
++ (instancetype) new;
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_I" = external dllimport global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_I" = external dllimport global %struct._class_t
+
+__declspec(dllexport)
+@interface J : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_J" = dllexport global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_J" = dllexport global %struct._class_t
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_J = dllexport global
+// CHECK-FW-DAG: @_OBJC_CLASS_J = dllexport global
+
+@implementation J {
+  id _ivar;
+}
+@end
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_J._ivar" = global i32
+
+@interface K : J
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_K" = global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_K" = global %struct._class_t
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_K = global
+// CHECK-FW-DAG: @_OBJC_CLASS_K = global
+
+@implementation K {
+  id _ivar;
+}
+@end
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_K._ivar" = global i32
+
+__declspec(dllexport)
+@interface L : K
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_L" = dllexport global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_L" = dllexport global %struct._class_t
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_L = dllexport global
+// CHECK-FW-DAG: @_OBJC_CLASS_L = dllexport global
+
+@implementation L {
+  id _none;
+
+  @public
+  id _public;
+
+  @protected
+  id _protected;
+
+  @package
+  id _package;
+
+  @private
+  id _private;
+}
+@end
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._none" = global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._public" = dllexport global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._protected" = dllexport global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._package" = global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._private" = global i32
+
+__declspec(dllimport)
+@interface M : I {
+  @public
+  id _ivar;
+}
+@end
+
+// CHECK-FW-DAG: @_OBJC_CLASS_M = external dllimport global i32
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32
+
+__declspec(dllexport)
+__attribute__((__objc_exception__))
+@interface N : I
+@end
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_N = dllexport global
+// CHECK-FW-DAG: @_OBJC_CLASS_N = dllexport global
+
+@implementation N : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_N" = dllexport global %struct._objc_typeinfo
+
+__declspec(dllimport)
+__attribute__((__objc_exception__))
+@interface O : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_O" = external dllimport global %struct._objc_typeinfo
+
+__attribute__((__objc_exception__))
+@interface P : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_P" = external global %struct._objc_typeinfo
+
+int g() {
+  @autoreleasepool {
+    M *mi = [M new];
+    @try {
+      mi->_ivar = (void *)0;
+      @throw(@"CFConstantString");
+    } @catch (id) {
+      return 1;
+    } @catch (I *) {
+      return 2;
+    } @catch (J *) {
+      return 3;
+    } @catch (K *) {
+      return 4;
+    } @catch (L *) {
+      return 5;
+    } @catch (M *) {
+      return 6;
+    } @catch (N *) {
+      return 7;
+    } @catch (O *) {
+      return 8;
+    } @catch (P *) {
+      return 9;
+    }
+  }
+  return 0;
+}
+
+// CHECK-IR-DAG: @OBJC_EHTYPE_id = external dllimport global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_I" = weak global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_K" = weak global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_L" = weak global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_M" = weak global %struct._objc_typeinfo
+
diff --git a/test/CodeGenObjC/exceptions-asm-attribute.m b/test/CodeGenObjC/exceptions-asm-attribute.m
index efca3ce..5719198 100644
--- a/test/CodeGenObjC/exceptions-asm-attribute.m
+++ b/test/CodeGenObjC/exceptions-asm-attribute.m
@@ -24,7 +24,7 @@
 
 // CHECK-X86_64-HIDDEN: @"OBJC_CLASS_$_MySecretNamespace.A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64-HIDDEN: @"OBJC_METACLASS_$_MySecretNamespace.A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
-// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak hidden global {{.*}}
+// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak hidden global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH2" = external global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH3" = hidden global {{.*}}, section "__DATA,__objc_const", align 8
 // CHECK-X86_64-HIDDEN: define internal void @"\01-[A im0]"
diff --git a/test/CodeGenObjC/hidden-visibility.m b/test/CodeGenObjC/hidden-visibility.m
index 9f5071d..cb23ca1 100644
--- a/test/CodeGenObjC/hidden-visibility.m
+++ b/test/CodeGenObjC/hidden-visibility.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fvisibility hidden -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-apple-macosx -fvisibility hidden -emit-llvm -o - %s | FileCheck %s
 // CHECK: @"OBJC_IVAR_$_I.P" = hidden
 // CHECK: @"OBJC_CLASS_$_I" = hidden
 // CHECK: @"OBJC_METACLASS_$_I" = hidden
diff --git a/test/CodeGenObjC/messages-2.m b/test/CodeGenObjC/messages-2.m
index 4f98fc7..be66f71 100644
--- a/test/CodeGenObjC/messages-2.m
+++ b/test/CodeGenObjC/messages-2.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NF
 
 // Most of this test is apparently just verifying that we don't crash.
diff --git a/test/CodeGenObjC/metadata_symbols.m b/test/CodeGenObjC/metadata_symbols.m
index 3e8f11f..4c365a1 100644
--- a/test/CodeGenObjC/metadata_symbols.m
+++ b/test/CodeGenObjC/metadata_symbols.m
@@ -23,7 +23,7 @@
 
 // CHECK-X86_64-HIDDEN: @"OBJC_CLASS_$_A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64-HIDDEN: @"OBJC_METACLASS_$_A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
-// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH1" = weak hidden global {{.*}}
+// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH1" = weak hidden global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH2" = external global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH3" = hidden global {{.*}}, section "__DATA,__objc_const", align 8
 // CHECK-X86_64-HIDDEN: define internal void @"\01-[A im0]"
diff --git a/test/CodeGenObjC/tentative-cfconstantstring.m b/test/CodeGenObjC/tentative-cfconstantstring.m
index 9ff1a0a..bb76b03 100644
--- a/test/CodeGenObjC/tentative-cfconstantstring.m
+++ b/test/CodeGenObjC/tentative-cfconstantstring.m
@@ -32,7 +32,7 @@
 @end
 
 // CHECK: @__CFConstantStringClassReference = common global [24 x i32] zeroinitializer, align 16
-// CHECK: @_unnamed_cfstring_{{.*}} = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([24 x i32], [24 x i32]* @__CFConstantStringClassReference, i32 0, i32 0)
+// CHECK: @_unnamed_cfstring_{{.*}} = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([24 x i32], [24 x i32]* @__CFConstantStringClassReference, i32 0, i32 0)
 
 // CHECK-LABEL: define internal void @_inlineFunction()
 // CHECK:  [[ZERO:%.*]] = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_
diff --git a/test/CodeGenObjCXX/arc-attrs.mm b/test/CodeGenObjCXX/arc-attrs.mm
index d571677..0f0610f 100644
--- a/test/CodeGenObjCXX/arc-attrs.mm
+++ b/test/CodeGenObjCXX/arc-attrs.mm
@@ -12,7 +12,7 @@
   id x = makeObject1();
 
   // CHECK-NEXT: [[OBJ2:%.*]] = call i8* @_Z11makeObject2v()
-  // CHECK-NEXT: call void @_Z13releaseObjectU11ns_consumedP11objc_object(i8* [[OBJ2]])
+  // CHECK-NEXT: call void @_Z13releaseObjectP11objc_object(i8* [[OBJ2]])
   releaseObject(makeObject2());
 
   // CHECK-NEXT: call void @objc_storeStrong(i8** [[X]], i8* null)
@@ -31,16 +31,16 @@
 // CHECK-LABEL: define void @_Z12templateTestv
 void templateTest() {
   // CHECK: [[X:%.*]] = alloca i8*, align 8
-  // CHECK-NEXT: [[OBJ1:%.*]] = call i8* @_Z12makeObjectT1IU8__strongP11objc_objectEU19ns_returns_retainedT_v()
+  // CHECK-NEXT: [[OBJ1:%.*]] = call i8* @_Z12makeObjectT1IU8__strongP11objc_objectET_v()
   // CHECK-NEXT: store i8* [[OBJ1]], i8** [[X]], align 8
   id x = makeObjectT1<id>();
 
-  // CHECK-NEXT: [[OBJ2:%.*]] = call i8* @_Z12makeObjectT2IU8__strongP11objc_objectEU19ns_returns_retainedT_v()
-  // CHECK-NEXT: call void @_Z13releaseObjectU11ns_consumedP11objc_object(i8* [[OBJ2]])
+  // CHECK-NEXT: [[OBJ2:%.*]] = call i8* @_Z12makeObjectT2IU8__strongP11objc_objectET_v()
+  // CHECK-NEXT: call void @_Z13releaseObjectP11objc_object(i8* [[OBJ2]])
   releaseObject(makeObjectT2<id>());
 
   // CHECK-NEXT: [[OBJ3:%.*]] = call i8* @_Z11makeObject1v()
-  // CHECK-NEXT: call void @_Z14releaseObjectTIU8__strongP11objc_objectEvU11ns_consumedT_(i8* [[OBJ3]])
+  // CHECK-NEXT: call void @_Z14releaseObjectTIU8__strongP11objc_objectEvT_(i8* [[OBJ3]])
   releaseObjectT(makeObject1());
 
   // CHECK-NEXT: call void @objc_storeStrong(i8** [[X]], i8* null)
diff --git a/test/CodeGenObjCXX/arc-cxx11-init-list.mm b/test/CodeGenObjCXX/arc-cxx11-init-list.mm
index 594ced2..c3723c6 100644
--- a/test/CodeGenObjCXX/arc-cxx11-init-list.mm
+++ b/test/CodeGenObjCXX/arc-cxx11-init-list.mm
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -triple armv7-ios5.0 -std=c++11 -fobjc-arc -Os -emit-llvm -o - %s | FileCheck %s
 
 // CHECK: @[[STR0:.*]] = private unnamed_addr constant [5 x i8] c"str0\00", section "__TEXT,__cstring,cstring_literals"
-// CHECK: @[[UNNAMED_CFSTRING0:.*]] = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR0]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring"
+// CHECK: @[[UNNAMED_CFSTRING0:.*]] = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR0]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring"
 // CHECK: @[[STR1:.*]] = private unnamed_addr constant [5 x i8] c"str1\00", section "__TEXT,__cstring,cstring_literals"
-// CHECK: @[[UNNAMED_CFSTRING1:.*]] = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR1]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring"
+// CHECK: @[[UNNAMED_CFSTRING1:.*]] = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR1]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring"
 // CHECK: @[[REFTMP:.*]] = private constant [2 x i8*] [i8* bitcast (%struct.__NSConstantString_tag* @[[UNNAMED_CFSTRING0]] to i8*), i8* bitcast (%struct.__NSConstantString_tag* @[[UNNAMED_CFSTRING1]] to i8*)]
 
 typedef __SIZE_TYPE__ size_t;
@@ -37,7 +37,6 @@
 // CHECK-NEXT: [[CAST:%.*]] = bitcast [{{[0-9]+}} x %0*]* %{{.*}} to i8**
 // CHECK-NEXT: store i8* [[INSTANCE]], i8** [[CAST]],
 // CHECK: call void @objc_release(i8* {{.*}})
-// CHECK-NEXT: icmp eq
 
 std::initializer_list<id> foo1() {
   return {@"str0", @"str1"};
diff --git a/test/CodeGenObjCXX/arc-mangle.mm b/test/CodeGenObjCXX/arc-mangle.mm
index 84acbdb..82e3755 100644
--- a/test/CodeGenObjCXX/arc-mangle.mm
+++ b/test/CodeGenObjCXX/arc-mangle.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fobjc-arc -fobjc-runtime-has-weak -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -fobjc-runtime-has-weak -triple %itanium_abi_triple -emit-llvm -fblocks -o - %s | FileCheck %s
 
 // CHECK-LABEL: define {{.*}}void @_Z1fPU8__strongP11objc_object(i8**)
 void f(__strong id *) {}
@@ -18,10 +18,14 @@
 void f(const __unsafe_unretained id *) {}
 // CHECK-LABEL: define {{.*}}void @_Z1fPFU19ns_returns_retainedP11objc_objectvE
 void f(__attribute__((ns_returns_retained)) id (*fn)()) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fP11objc_object
+void f(__attribute__((ns_consumed)) id) {}
 // CHECK-LABEL: define {{.*}}void @_Z1fPFP11objc_objectU11ns_consumedS0_S0_E
 void f(id (*fn)(__attribute__((ns_consumed)) id, id)) {}
 // CHECK-LABEL: define {{.*}}void @_Z1fPFP11objc_objectS0_U11ns_consumedS0_E
 void f(__strong id (*fn)(id, __attribute__((ns_consumed)) id)) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fU13block_pointerFvU11ns_consumedP11objc_objectE
+void f(void (^)(__attribute__((ns_consumed)) id)) {}
 
 template<unsigned N> struct unsigned_c { };
 
diff --git a/test/CodeGenObjCXX/arc-new-delete.mm b/test/CodeGenObjCXX/arc-new-delete.mm
index f853ea4..141f401 100644
--- a/test/CodeGenObjCXX/arc-new-delete.mm
+++ b/test/CodeGenObjCXX/arc-new-delete.mm
@@ -12,32 +12,32 @@
   // OPT-NEXT: [[T0:%.*]] = call i8* @objc_retain(i8* [[INVALUE:%.*]])
   // OPT-NEXT: store i8* [[T0]], i8** [[INVALUEADDR]]
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // CHECK-NEXT: store i8* null, i8**
   new strong_id;
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // UNOPT-NEXT: store i8* null, i8**
   // OPT-NEXT: call i8* @objc_initWeak(i8** {{.*}}, i8* null)
   new weak_id;
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // CHECK-NEXT: store i8* null, i8**
   new __strong id;
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // UNOPT-NEXT: store i8* null, i8**
   // OPT-NEXT: call i8* @objc_initWeak(i8** {{.*}}, i8* null)
   new __weak id;
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call i8* @objc_retain
   // CHECK: store i8*
   new __strong id(invalue);
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call i8* @objc_initWeak
   new __weak id(invalue);
 
@@ -48,12 +48,12 @@
 
 // CHECK-LABEL: define void @_Z14test_array_new
 void test_array_new() {
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: store i64 17, i64*
   // CHECK: call void @llvm.memset.p0i8.i64
   new strong_id[17];
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: store i64 17, i64*
   // CHECK: call void @llvm.memset.p0i8.i64
   new weak_id[17];
diff --git a/test/CodeGenObjCXX/copy.mm b/test/CodeGenObjCXX/copy.mm
index 9e41bf0..7783137 100644
--- a/test/CodeGenObjCXX/copy.mm
+++ b/test/CodeGenObjCXX/copy.mm
@@ -11,7 +11,7 @@
   // CHECK:      alloca
   // CHECK-NEXT: getelementptr
   // CHECK-NEXT: store
-  // CHECK-NEXT: call noalias i8* @_Znwm(
+  // CHECK-NEXT: call i8* @_Znwm(
   // CHECK-NEXT: bitcast
   // CHECK-NEXT: bitcast
   // CHECK-NEXT: bitcast
diff --git a/test/CodeGenObjCXX/exceptions-legacy.mm b/test/CodeGenObjCXX/exceptions-legacy.mm
index dac259d..0650509 100644
--- a/test/CodeGenObjCXX/exceptions-legacy.mm
+++ b/test/CodeGenObjCXX/exceptions-legacy.mm
@@ -16,7 +16,7 @@
 //   Enter the @synchronized block.
 // CHECK:      call i32 @objc_sync_enter(i8* [[OBJ:%.*]])
 // CHECK:      call void @objc_exception_try_enter([[BUF_T:%.*]]* nonnull [[BUF:%.*]])
-// CHECK-NEXT: [[T0:%.*]] = getelementptr [[BUF_T]], [[BUF_T]]* [[BUF]], i32 0, i32 0, i32 0
+// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[BUF_T]], [[BUF_T]]* [[BUF]], i32 0, i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = call i32 @_setjmp(i32* [[T0]])
 // CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[T1]], 0
 // CHECK-NEXT: br i1 [[T2]],
@@ -55,7 +55,7 @@
 // CHECK-LABEL:    define void @_Z5test1P11objc_objectPb(
 //   Enter the @try block.
 // CHECK:      call void @objc_exception_try_enter([[BUF_T]]* nonnull [[BUF:%.*]])
-// CHECK-NEXT: [[T0:%.*]] = getelementptr [[BUF_T]], [[BUF_T]]* [[BUF]], i32 0, i32 0, i32 0
+// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[BUF_T]], [[BUF_T]]* [[BUF]], i32 0, i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = call i32 @_setjmp(i32* [[T0]])
 // CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[T1]], 0
 // CHECK-NEXT: br i1 [[T2]],
diff --git a/test/CodeGenObjCXX/mangle.mm b/test/CodeGenObjCXX/mangle.mm
index bcb920b..2854cff 100644
--- a/test/CodeGenObjCXX/mangle.mm
+++ b/test/CodeGenObjCXX/mangle.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin10 -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin10 -std=c++11 -emit-llvm -fblocks -o - | FileCheck %s
 
 // CHECK: @"_ZZ11+[A shared]E1a" = internal global
 // CHECK: @"_ZZ11-[A(Foo) f]E1a" = internal global
@@ -113,3 +113,10 @@
 
 // CHECK-LABEL: define void @_Z19parameterized_test3P13Parameterized
 void parameterized_test3(Parameterized *p) {}
+
+// CHECK-LABEL: define {{.*}}void @_Z1fP11objc_object
+void f(__attribute__((ns_consumed)) id) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fPFP11objc_objectS0_S0_E
+void f(id (*fn)(__attribute__((ns_consumed)) id, id)) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fU13block_pointerFvP11objc_objectE
+void f(void (^)(__attribute__((ns_consumed)) id)) {}
diff --git a/test/CodeGenObjCXX/personality-abuse.mm b/test/CodeGenObjCXX/personality-abuse.mm
index f5170bf..2a3620d 100644
--- a/test/CodeGenObjCXX/personality-abuse.mm
+++ b/test/CodeGenObjCXX/personality-abuse.mm
@@ -16,4 +16,4 @@
   }
 }
 
-// CHECK: define void @_Z3foov() #1 personality i8* bitcast (i32 ()* @__objc_personality_v0 to i8*)
+// CHECK: define void @_Z3foov() {{#[0-9]+}} personality i8* bitcast (i32 ()* @__objc_personality_v0 to i8*)
diff --git a/test/CodeGenOpenCL/address-spaces-conversions.cl b/test/CodeGenOpenCL/address-spaces-conversions.cl
index bc80f47..c947db4 100644
--- a/test/CodeGenOpenCL/address-spaces-conversions.cl
+++ b/test/CodeGenOpenCL/address-spaces-conversions.cl
@@ -1,22 +1,89 @@
 // RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -O0 -ffake-address-space-map -cl-std=CL2.0 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -O0 -cl-std=CL2.0 -emit-llvm -o - | FileCheck --check-prefix=CHECK-NOFAKE %s
+// When -ffake-address-space-map is not used, all address spaces are mapped to 0 for x86_64.
 
 // test that we generate address space casts everywhere we need conversions of
 // pointers to different address spaces
 
+// CHECK: define void @test
 void test(global int *arg_glob, generic int *arg_gen) {
   int var_priv;
   arg_gen = arg_glob; // implicit cast global -> generic
   // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(4)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   arg_gen = &var_priv; // implicit cast with obtaining adr, private -> generic
   // CHECK: %{{[0-9]+}} = addrspacecast i32* %var_priv to i32 addrspace(4)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   arg_glob = (global int *)arg_gen; // explicit cast
   // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(4)* %{{[0-9]+}} to i32 addrspace(1)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   global int *var_glob =
       (global int *)arg_glob; // explicit cast in the same address space
   // CHECK-NOT: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(1)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   var_priv = arg_gen - arg_glob; // arithmetic operation
   // CHECK: %{{.*}} = ptrtoint i32 addrspace(4)* %{{.*}} to i64
   // CHECK: %{{.*}} = ptrtoint i32 addrspace(1)* %{{.*}} to i64
+  // CHECK-NOFAKE: %{{.*}} = ptrtoint i32* %{{.*}} to i64
+  // CHECK-NOFAKE: %{{.*}} = ptrtoint i32* %{{.*}} to i64
+
   var_priv = arg_gen > arg_glob; // comparison
   // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(4)*
+
+  generic void *var_gen_v = arg_glob;
+  // CHECK: addrspacecast
+  // CHECK-NOT: bitcast
+  // CHECK-NOFAKE: bitcast
+  // CHECK-NOFAKE-NOT: addrspacecast
+}
+
+// Test ternary operator.
+// CHECK: define void @test_ternary
+void test_ternary(void) {
+  global int *var_glob;
+  generic int *var_gen;
+  generic int *var_gen2;
+  generic float *var_gen_f;
+  generic void *var_gen_v;
+
+  var_gen = var_gen ? var_gen : var_gen2; // operands of the same addr spaces and the same type
+  // CHECK: icmp
+  // CHECK-NOT: addrspacecast
+  // CHECK-NOT: bitcast
+  // CHECK: phi
+  // CHECK: store i32 addrspace(4)* %{{.+}}, i32 addrspace(4)** %{{.+}}
+
+  var_gen = var_gen ? var_gen : var_glob; // operands of overlapping addr spaces and the same type
+  // CHECK: icmp
+  // CHECK-NOT: bitcast
+  // CHECK: %{{.+}} = addrspacecast i32 addrspace(1)* %{{.+}} to i32 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
+
+  typedef int int_t;
+  global int_t *var_glob_typedef;
+  var_gen = var_gen ? var_gen : var_glob_typedef; // operands of overlapping addr spaces and equivalent types
+  // CHECK: icmp
+  // CHECK-NOT: bitcast
+  // CHECK: %{{.+}} = addrspacecast i32 addrspace(1)* %{{.+}} to i32 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
+
+  var_gen_v = var_gen ? var_gen : var_gen_f; // operands of the same addr space and different types
+  // CHECK: icmp
+  // CHECK: %{{.+}} = bitcast i32 addrspace(4)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: %{{.+}} = bitcast float addrspace(4)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
+
+  var_gen_v = var_gen ? var_glob : var_gen_f; // operands of overlapping addr spaces and different types
+  // CHECK: icmp
+  // CHECK: %{{.+}} = addrspacecast i32 addrspace(1)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: %{{.+}} = bitcast float addrspace(4)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
 }
diff --git a/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl b/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
new file mode 100644
index 0000000..3a98e90
--- /dev/null
+++ b/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
@@ -0,0 +1,15 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @use_flat_scratch_name
+kernel void use_flat_scratch_name()
+{
+// CHECK: tail call void asm sideeffect "s_mov_b64 flat_scratch, 0", "~{flat_scratch}"()
+  __asm__ volatile("s_mov_b64 flat_scratch, 0" : : : "flat_scratch");
+
+// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_lo, 0", "~{flat_scratch_lo}"()
+  __asm__ volatile("s_mov_b32 flat_scratch_lo, 0" : : : "flat_scratch_lo");
+
+// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_hi, 0", "~{flat_scratch_hi}"()
+  __asm__ volatile("s_mov_b32 flat_scratch_hi, 0" : : : "flat_scratch_hi");
+}
diff --git a/test/CodeGenOpenCL/amdgpu-call-kernel.cl b/test/CodeGenOpenCL/amdgpu-call-kernel.cl
new file mode 100755
index 0000000..0057939
--- /dev/null
+++ b/test/CodeGenOpenCL/amdgpu-call-kernel.cl
@@ -0,0 +1,14 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// CHECK: define amdgpu_kernel void @test_call_kernel(i32 addrspace(1)* nocapture %out)
+// CHECK: store i32 4, i32 addrspace(1)* %out, align 4
+
+kernel void test_kernel(global int *out)
+{
+  out[0] = 4;
+}
+
+__kernel void test_call_kernel(__global int *out)
+{
+  test_kernel(out);
+}
diff --git a/test/CodeGenOpenCL/amdgpu-calling-conv.cl b/test/CodeGenOpenCL/amdgpu-calling-conv.cl
new file mode 100644
index 0000000..7da9d7f
--- /dev/null
+++ b/test/CodeGenOpenCL/amdgpu-calling-conv.cl
@@ -0,0 +1,12 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define amdgpu_kernel void @calling_conv_amdgpu_kernel()
+kernel void calling_conv_amdgpu_kernel()
+{
+}
+
+// CHECK: define void @calling_conv_none()
+void calling_conv_none()
+{
+}
diff --git a/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl b/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
index 35bdcea..589d00d 100644
--- a/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
+++ b/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
@@ -5,23 +5,23 @@
 
 __attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
 kernel void test_num_vgpr64() {
-// CHECK: define void @test_num_vgpr64() [[ATTR_VGPR64:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_vgpr64() [[ATTR_VGPR64:#[0-9]+]]
 }
 
 __attribute__((amdgpu_num_sgpr(32))) // expected-no-diagnostics
 kernel void test_num_sgpr32() {
-// CHECK: define void @test_num_sgpr32() [[ATTR_SGPR32:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_sgpr32() [[ATTR_SGPR32:#[0-9]+]]
 }
 
 __attribute__((amdgpu_num_vgpr(64), amdgpu_num_sgpr(32))) // expected-no-diagnostics
 kernel void test_num_vgpr64_sgpr32() {
-// CHECK: define void @test_num_vgpr64_sgpr32() [[ATTR_VGPR64_SGPR32:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_vgpr64_sgpr32() [[ATTR_VGPR64_SGPR32:#[0-9]+]]
 
 }
 
 __attribute__((amdgpu_num_sgpr(20), amdgpu_num_vgpr(40))) // expected-no-diagnostics
 kernel void test_num_sgpr20_vgpr40() {
-// CHECK: define void @test_num_sgpr20_vgpr40() [[ATTR_SGPR20_VGPR40:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_sgpr20_vgpr40() [[ATTR_SGPR20_VGPR40:#[0-9]+]]
 }
 
 __attribute__((amdgpu_num_vgpr(0))) // expected-no-diagnostics
@@ -40,8 +40,8 @@
 // X86-NOT: "amdgpu_num_vgpr"
 // X86-NOT: "amdgpu_num_sgpr"
 
-// CHECK-DAG-NOT: "amdgpu_num_vgpr"="0"
-// CHECK-DAG-NOT: "amdgpu_num_sgpr"="0"
+// CHECK-NOT: "amdgpu_num_vgpr"="0"
+// CHECK-NOT: "amdgpu_num_sgpr"="0"
 // CHECK-DAG: attributes [[ATTR_VGPR64]] = { nounwind "amdgpu_num_vgpr"="64"
 // CHECK-DAG: attributes [[ATTR_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32"
 // CHECK-DAG: attributes [[ATTR_VGPR64_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32" "amdgpu_num_vgpr"="64"
diff --git a/test/CodeGenOpenCL/as_type.cl b/test/CodeGenOpenCL/as_type.cl
new file mode 100644
index 0000000..7fc3b02
--- /dev/null
+++ b/test/CodeGenOpenCL/as_type.cl
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple spir-unknown-unknown -o - | FileCheck %s
+
+typedef __attribute__(( ext_vector_type(3) )) char char3;
+typedef __attribute__(( ext_vector_type(4) )) char char4;
+typedef __attribute__(( ext_vector_type(16) )) char char16;
+typedef __attribute__(( ext_vector_type(3) )) int int3;
+
+//CHECK: define spir_func <3 x i8> @f1(<4 x i8> %[[x:.*]])
+//CHECK: %[[astype:.*]] = shufflevector <4 x i8> %[[x]], <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
+//CHECK: ret <3 x i8> %[[astype]]
+char3 f1(char4 x) {
+  return  __builtin_astype(x, char3);
+}
+
+//CHECK: define spir_func <4 x i8> @f2(<3 x i8> %[[x:.*]])
+//CHECK: %[[astype:.*]] = shufflevector <3 x i8> %[[x]], <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+//CHECK: ret <4 x i8> %[[astype]]
+char4 f2(char3 x) {
+  return __builtin_astype(x, char4);
+}
+
+//CHECK: define spir_func <3 x i8> @f3(i32 %[[x:.*]])
+//CHECK: %[[cast:.*]] = bitcast i32 %[[x]] to <4 x i8>
+//CHECK: %[[astype:.*]] = shufflevector <4 x i8> %[[cast]], <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
+//CHECK: ret <3 x i8> %[[astype]]
+char3 f3(int x) {
+  return __builtin_astype(x, char3);
+}
+
+//CHECK: define spir_func <4 x i8> @f4(i32 %[[x:.*]])
+//CHECK: %[[astype:.*]] = bitcast i32 %[[x]] to <4 x i8>
+//CHECK-NOT: shufflevector
+//CHECK: ret <4 x i8> %[[astype]]
+char4 f4(int x) {
+  return __builtin_astype(x, char4);
+}
+
+//CHECK: define spir_func i32 @f5(<3 x i8> %[[x:.*]])
+//CHECK: %[[shuffle:.*]] = shufflevector <3 x i8> %[[x]], <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+//CHECK: %[[astype:.*]] = bitcast <4 x i8> %[[shuffle]] to i32
+//CHECK: ret i32 %[[astype]]
+int f5(char3 x) {
+  return __builtin_astype(x, int);
+}
+
+//CHECK: define spir_func i32 @f6(<4 x i8> %[[x:.*]])
+//CHECK: %[[astype:.*]] = bitcast <4 x i8> %[[x]] to i32
+//CHECK-NOT: shufflevector
+//CHECK: ret i32 %[[astype]]
+int f6(char4 x) {
+  return __builtin_astype(x, int);
+}
+
+//CHECK: define spir_func <3 x i8> @f7(<3 x i8> %[[x:.*]])
+//CHECK-NOT: bitcast
+//CHECK-NOT: shufflevector
+//CHECK: ret <3 x i8> %[[x]]
+char3 f7(char3 x) {
+  return __builtin_astype(x, char3);
+}
+
+//CHECK: define spir_func <3 x i32> @f8(<16 x i8> %[[x:.*]])
+//CHECK: %[[cast:.*]] = bitcast <16 x i8> %[[x]] to <4 x i32>
+//CHECK: %[[astype:.*]] = shufflevector <4 x i32> %[[cast]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+//CHECK: ret <3 x i32> %[[astype]]
+int3 f8(char16 x) {
+  return __builtin_astype(x, int3);
+}
diff --git a/test/CodeGenOpenCL/builtins-amdgcn-error.cl b/test/CodeGenOpenCL/builtins-amdgcn-error.cl
new file mode 100644
index 0000000..5c67666
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-amdgcn-error.cl
@@ -0,0 +1,50 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -target-cpu tahiti -verify -S -o - %s
+
+// FIXME: We only get one error if the functions are in the other order in the
+// file.
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+typedef unsigned long ulong;
+typedef unsigned int uint;
+
+ulong test_s_memrealtime()
+{
+  return __builtin_amdgcn_s_memrealtime(); // expected-error {{'__builtin_amdgcn_s_memrealtime' needs target feature s-memrealtime}}
+}
+
+void test_s_sleep(int x)
+{
+  __builtin_amdgcn_s_sleep(x); // expected-error {{argument to '__builtin_amdgcn_s_sleep' must be a constant integer}}
+}
+
+void test_sicmp_i32(global ulong* out, int a, int b, uint c)
+{
+  *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}}
+}
+
+void test_uicmp_i32(global ulong* out, uint a, uint b, uint c)
+{
+  *out = __builtin_amdgcn_uicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_uicmp' must be a constant integer}}
+}
+
+void test_sicmp_i64(global ulong* out, long a, long b, uint c)
+{
+  *out = __builtin_amdgcn_sicmpl(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmpl' must be a constant integer}}
+}
+
+void test_uicmp_i64(global ulong* out, ulong a, ulong b, uint c)
+{
+  *out = __builtin_amdgcn_uicmpl(a, b, c); // expected-error {{argument to '__builtin_amdgcn_uicmpl' must be a constant integer}}
+}
+
+void test_fcmp_f32(global ulong* out, float a, float b, uint c)
+{
+  *out = __builtin_amdgcn_fcmpf(a, b, c); // expected-error {{argument to '__builtin_amdgcn_fcmpf' must be a constant integer}}
+}
+
+void test_fcmp_f64(global ulong* out, double a, double b, uint c)
+{
+  *out = __builtin_amdgcn_fcmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_fcmp' must be a constant integer}}
+}
+
diff --git a/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
new file mode 100644
index 0000000..cda87a8
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -0,0 +1,12 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -S -emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned long ulong;
+
+
+// CHECK-LABEL: @test_s_memrealtime
+// CHECK: call i64 @llvm.amdgcn.s.memrealtime()
+void test_s_memrealtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memrealtime();
+}
diff --git a/test/CodeGenOpenCL/builtins-amdgcn.cl b/test/CodeGenOpenCL/builtins-amdgcn.cl
new file mode 100644
index 0000000..2347bc8
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -0,0 +1,359 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+typedef unsigned long ulong;
+typedef unsigned int uint;
+
+// CHECK-LABEL: @test_div_scale_f64
+// CHECK: call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
+// CHECK-DAG: [[FLAG:%.+]] = extractvalue { double, i1 } %{{.+}}, 1
+// CHECK-DAG: [[VAL:%.+]] = extractvalue { double, i1 } %{{.+}}, 0
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
+// CHECK: store i32 [[FLAGEXT]]
+void test_div_scale_f64(global double* out, global int* flagout, double a, double b)
+{
+  bool flag;
+  *out = __builtin_amdgcn_div_scale(a, b, true, &flag);
+  *flagout = flag;
+}
+
+// CHECK-LABEL: @test_div_scale_f32
+// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
+// CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
+// CHECK: store i32 [[FLAGEXT]]
+void test_div_scale_f32(global float* out, global int* flagout, float a, float b)
+{
+  bool flag;
+  *out = __builtin_amdgcn_div_scalef(a, b, true, &flag);
+  *flagout = flag;
+}
+
+// CHECK-LABEL: @test_div_fmas_f32
+// CHECK: call float @llvm.amdgcn.div.fmas.f32
+void test_div_fmas_f32(global float* out, float a, float b, float c, int d)
+{
+  *out = __builtin_amdgcn_div_fmasf(a, b, c, d);
+}
+
+// CHECK-LABEL: @test_div_fmas_f64
+// CHECK: call double @llvm.amdgcn.div.fmas.f64
+void test_div_fmas_f64(global double* out, double a, double b, double c, int d)
+{
+  *out = __builtin_amdgcn_div_fmas(a, b, c, d);
+}
+
+// CHECK-LABEL: @test_div_fixup_f32
+// CHECK: call float @llvm.amdgcn.div.fixup.f32
+void test_div_fixup_f32(global float* out, float a, float b, float c)
+{
+  *out = __builtin_amdgcn_div_fixupf(a, b, c);
+}
+
+// CHECK-LABEL: @test_div_fixup_f64
+// CHECK: call double @llvm.amdgcn.div.fixup.f64
+void test_div_fixup_f64(global double* out, double a, double b, double c)
+{
+  *out = __builtin_amdgcn_div_fixup(a, b, c);
+}
+
+// CHECK-LABEL: @test_trig_preop_f32
+// CHECK: call float @llvm.amdgcn.trig.preop.f32
+void test_trig_preop_f32(global float* out, float a, int b)
+{
+  *out = __builtin_amdgcn_trig_preopf(a, b);
+}
+
+// CHECK-LABEL: @test_trig_preop_f64
+// CHECK: call double @llvm.amdgcn.trig.preop.f64
+void test_trig_preop_f64(global double* out, double a, int b)
+{
+  *out = __builtin_amdgcn_trig_preop(a, b);
+}
+
+// CHECK-LABEL: @test_rcp_f32
+// CHECK: call float @llvm.amdgcn.rcp.f32
+void test_rcp_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_rcpf(a);
+}
+
+// CHECK-LABEL: @test_rcp_f64
+// CHECK: call double @llvm.amdgcn.rcp.f64
+void test_rcp_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_rcp(a);
+}
+
+// CHECK-LABEL: @test_rsq_f32
+// CHECK: call float @llvm.amdgcn.rsq.f32
+void test_rsq_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_rsqf(a);
+}
+
+// CHECK-LABEL: @test_rsq_f64
+// CHECK: call double @llvm.amdgcn.rsq.f64
+void test_rsq_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_rsq(a);
+}
+
+// CHECK-LABEL: @test_rsq_clamp_f32
+// CHECK: call float @llvm.amdgcn.rsq.clamp.f32
+void test_rsq_clamp_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_rsq_clampf(a);
+}
+
+// CHECK-LABEL: @test_rsq_clamp_f64
+// CHECK: call double @llvm.amdgcn.rsq.clamp.f64
+void test_rsq_clamp_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_rsq_clamp(a);
+}
+
+// CHECK-LABEL: @test_sin_f32
+// CHECK: call float @llvm.amdgcn.sin.f32
+void test_sin_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_sinf(a);
+}
+
+// CHECK-LABEL: @test_cos_f32
+// CHECK: call float @llvm.amdgcn.cos.f32
+void test_cos_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_cosf(a);
+}
+
+// CHECK-LABEL: @test_log_clamp_f32
+// CHECK: call float @llvm.amdgcn.log.clamp.f32
+void test_log_clamp_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_log_clampf(a);
+}
+
+// CHECK-LABEL: @test_ldexp_f32
+// CHECK: call float @llvm.amdgcn.ldexp.f32
+void test_ldexp_f32(global float* out, float a, int b)
+{
+  *out = __builtin_amdgcn_ldexpf(a, b);
+}
+
+// CHECK-LABEL: @test_ldexp_f64
+// CHECK: call double @llvm.amdgcn.ldexp.f64
+void test_ldexp_f64(global double* out, double a, int b)
+{
+  *out = __builtin_amdgcn_ldexp(a, b);
+}
+
+// CHECK-LABEL: @test_frexp_mant_f32
+// CHECK: call float @llvm.amdgcn.frexp.mant.f32
+void test_frexp_mant_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_frexp_mantf(a);
+}
+
+// CHECK-LABEL: @test_frexp_mant_f64
+// CHECK: call double @llvm.amdgcn.frexp.mant.f64
+void test_frexp_mant_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_frexp_mant(a);
+}
+
+// CHECK-LABEL: @test_frexp_exp_f32
+// CHECK: call i32 @llvm.amdgcn.frexp.exp.f32
+void test_frexp_exp_f32(global int* out, float a)
+{
+  *out = __builtin_amdgcn_frexp_expf(a);
+}
+
+// CHECK-LABEL: @test_frexp_exp_f64
+// CHECK: call i32 @llvm.amdgcn.frexp.exp.f64
+void test_frexp_exp_f64(global int* out, double a)
+{
+  *out = __builtin_amdgcn_frexp_exp(a);
+}
+
+// CHECK-LABEL: @test_fract_f32
+// CHECK: call float @llvm.amdgcn.fract.f32
+void test_fract_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_fractf(a);
+}
+
+// CHECK-LABEL: @test_fract_f64
+// CHECK: call double @llvm.amdgcn.fract.f64
+void test_fract_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_fract(a);
+}
+
+// CHECK-LABEL: @test_lerp
+// CHECK: call i32 @llvm.amdgcn.lerp
+void test_lerp(global int* out, int a, int b, int c)
+{
+  *out = __builtin_amdgcn_lerp(a, b, c);
+}
+
+// CHECK-LABEL: @test_sicmp_i32
+// CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+void test_sicmp_i32(global ulong* out, int a, int b)
+{
+  *out = __builtin_amdgcn_sicmp(a, b, 32);
+}
+
+// CHECK-LABEL: @test_uicmp_i32
+// CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+void test_uicmp_i32(global ulong* out, uint a, uint b)
+{
+  *out = __builtin_amdgcn_uicmp(a, b, 32);
+}
+
+// CHECK-LABEL: @test_sicmp_i64
+// CHECK: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 38)
+void test_sicmp_i64(global ulong* out, long a, long b)
+{
+  *out = __builtin_amdgcn_sicmpl(a, b, 39-1);
+}
+
+// CHECK-LABEL: @test_uicmp_i64
+// CHECK: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 35)
+void test_uicmp_i64(global ulong* out, ulong a, ulong b)
+{
+  *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
+}
+
+// CHECK-LABEL: @test_fcmp_f32
+// CHECK: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 5)
+void test_fcmp_f32(global ulong* out, float a, float b)
+{
+  *out = __builtin_amdgcn_fcmpf(a, b, 5);
+}
+
+// CHECK-LABEL: @test_fcmp_f64
+// CHECK: call i64 @llvm.amdgcn.fcmp.f64(double %a, double %b, i32 6)
+void test_fcmp_f64(global ulong* out, double a, double b)
+{
+  *out = __builtin_amdgcn_fcmp(a, b, 3+3);
+}
+
+// CHECK-LABEL: @test_class_f32
+// CHECK: call i1 @llvm.amdgcn.class.f32
+void test_class_f32(global float* out, float a, int b)
+{
+  *out = __builtin_amdgcn_classf(a, b);
+}
+
+// CHECK-LABEL: @test_class_f64
+// CHECK: call i1 @llvm.amdgcn.class.f64
+void test_class_f64(global double* out, double a, int b)
+{
+  *out = __builtin_amdgcn_class(a, b);
+}
+
+// CHECK-LABEL: @test_s_barrier
+// CHECK: call void @llvm.amdgcn.s.barrier(
+void test_s_barrier()
+{
+  __builtin_amdgcn_s_barrier();
+}
+
+// CHECK-LABEL: @test_s_memtime
+// CHECK: call i64 @llvm.amdgcn.s.memtime()
+void test_s_memtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memtime();
+}
+
+// CHECK-LABEL: @test_s_sleep
+// CHECK: call void @llvm.amdgcn.s.sleep(i32 1)
+// CHECK: call void @llvm.amdgcn.s.sleep(i32 15)
+void test_s_sleep()
+{
+  __builtin_amdgcn_s_sleep(1);
+  __builtin_amdgcn_s_sleep(15);
+}
+
+// CHECK-LABEL: @test_cubeid(
+// CHECK: call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
+void test_cubeid(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubeid(a, b, c);
+}
+
+// CHECK-LABEL: @test_cubesc(
+// CHECK: call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
+void test_cubesc(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubesc(a, b, c);
+}
+
+// CHECK-LABEL: @test_cubetc(
+// CHECK: call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
+void test_cubetc(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubetc(a, b, c);
+}
+
+// CHECK-LABEL: @test_cubema(
+// CHECK: call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
+void test_cubema(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubema(a, b, c);
+}
+
+// CHECK-LABEL: @test_read_exec(
+// CHECK: call i64 @llvm.read_register.i64(metadata ![[EXEC:[0-9]+]]) #[[READ_EXEC_ATTRS:[0-9]+]]
+void test_read_exec(global ulong* out) {
+  *out = __builtin_amdgcn_read_exec();
+}
+
+// CHECK: declare i64 @llvm.read_register.i64(metadata) #[[NOUNWIND_READONLY:[0-9]+]]
+
+// CHECK-LABEL: @test_kernarg_segment_ptr
+// CHECK: call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+void test_kernarg_segment_ptr(__attribute__((address_space(2))) unsigned char ** out)
+{
+  *out = __builtin_amdgcn_kernarg_segment_ptr();
+}
+
+// CHECK-LABEL: @test_implicitarg_ptr
+// CHECK: call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+void test_implicitarg_ptr(__attribute__((address_space(2))) unsigned char ** out)
+{
+  *out = __builtin_amdgcn_implicitarg_ptr();
+}
+
+// CHECK-LABEL: @test_get_group_id(
+// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.x()
+// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.y()
+// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.z()
+void test_get_group_id(int d, global int *out)
+{
+	switch (d) {
+	case 0: *out = __builtin_amdgcn_workgroup_id_x(); break;
+	case 1: *out = __builtin_amdgcn_workgroup_id_y(); break;
+	case 2: *out = __builtin_amdgcn_workgroup_id_z(); break;
+	default: *out = 0;
+	}
+}
+
+// CHECK-LABEL: @test_get_local_id(
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[WI_RANGE:![0-9]*]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.y(), !range [[WI_RANGE]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.z(), !range [[WI_RANGE]]
+void test_get_local_id(int d, global int *out)
+{
+	switch (d) {
+	case 0: *out = __builtin_amdgcn_workitem_id_x(); break;
+	case 1: *out = __builtin_amdgcn_workitem_id_y(); break;
+	case 2: *out = __builtin_amdgcn_workitem_id_z(); break;
+	default: *out = 0;
+	}
+}
+
+// CHECK-DAG: [[WI_RANGE]] = !{i32 0, i32 1024}
+// CHECK-DAG: attributes #[[NOUNWIND_READONLY]] = { nounwind readonly }
+// CHECK-DAG: attributes #[[READ_EXEC_ATTRS]] = { convergent }
+// CHECK-DAG: ![[EXEC]] = !{!"exec"}
diff --git a/test/CodeGenOpenCL/builtins-generic-amdgcn.cl b/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
new file mode 100644
index 0000000..5a4756b
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
@@ -0,0 +1,16 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_builtin_clz(
+// CHECK: tail call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+void test_builtin_clz(global int* out, int a)
+{
+  *out = __builtin_clz(a);
+}
+
+// CHECK-LABEL: @test_builtin_clzl(
+// CHECK: tail call i64 @llvm.ctlz.i64(i64 %a, i1 true)
+void test_builtin_clzl(global long* out, long a)
+{
+  *out = __builtin_clzl(a);
+}
diff --git a/test/CodeGenOpenCL/builtins-r600.cl b/test/CodeGenOpenCL/builtins-r600.cl
index 3e416b0..027a54a 100644
--- a/test/CodeGenOpenCL/builtins-r600.cl
+++ b/test/CodeGenOpenCL/builtins-r600.cl
@@ -1,143 +1,55 @@
-// REQUIRES: r600-registered-target
-// RUN: %clang_cc1 -triple r600-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple r600-unknown-unknown -target-cpu cypress -S -emit-llvm -o - %s | FileCheck %s
 
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// CHECK-LABEL: @test_div_scale_f64
-// CHECK: call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true)
-// CHECK-DAG: [[FLAG:%.+]] = extractvalue { double, i1 } %{{.+}}, 1
-// CHECK-DAG: [[VAL:%.+]] = extractvalue { double, i1 } %{{.+}}, 0
-// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
-// CHECK: store i32 [[FLAGEXT]]
-void test_div_scale_f64(global double* out, global int* flagout, double a, double b)
+// CHECK-LABEL: @test_recipsqrt_ieee_f32
+// CHECK: call float @llvm.r600.recipsqrt.ieee.f32
+void test_recipsqrt_ieee_f32(global float* out, float a)
 {
-  bool flag;
-  *out = __builtin_amdgpu_div_scale(a, b, true, &flag);
-  *flagout = flag;
+  *out = __builtin_r600_recipsqrt_ieeef(a);
 }
 
-// CHECK-LABEL: @test_div_scale_f32
-// CHECK: call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true)
-// CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
-// CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
-// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
-// CHECK: store i32 [[FLAGEXT]]
-void test_div_scale_f32(global float* out, global int* flagout, float a, float b)
+#if cl_khr_fp64
+// XCHECK-LABEL: @test_recipsqrt_ieee_f64
+// XCHECK: call double @llvm.r600.recipsqrt.ieee.f64
+void test_recipsqrt_ieee_f64(global double* out, double a)
 {
-  bool flag;
-  *out = __builtin_amdgpu_div_scalef(a, b, true, &flag);
-  *flagout = flag;
+  *out = __builtin_r600_recipsqrt_ieee(a);
+}
+#endif
+
+// CHECK-LABEL: @test_implicitarg_ptr
+// CHECK: call i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
+void test_implicitarg_ptr(__attribute__((address_space(7))) unsigned char ** out)
+{
+  *out = __builtin_r600_implicitarg_ptr();
 }
 
-// CHECK-LABEL: @test_div_fmas_f32
-// CHECK: call float @llvm.AMDGPU.div.fmas.f32
-void test_div_fmas_f32(global float* out, float a, float b, float c, int d)
+// CHECK-LABEL: @test_get_group_id(
+// CHECK: tail call i32 @llvm.r600.read.tgid.x()
+// CHECK: tail call i32 @llvm.r600.read.tgid.y()
+// CHECK: tail call i32 @llvm.r600.read.tgid.z()
+void test_get_group_id(int d, global int *out)
 {
-  *out = __builtin_amdgpu_div_fmasf(a, b, c, d);
+	switch (d) {
+	case 0: *out = __builtin_r600_read_tgid_x(); break;
+	case 1: *out = __builtin_r600_read_tgid_y(); break;
+	case 2: *out = __builtin_r600_read_tgid_z(); break;
+	default: *out = 0;
+	}
 }
 
-// CHECK-LABEL: @test_div_fmas_f64
-// CHECK: call double @llvm.AMDGPU.div.fmas.f64
-void test_div_fmas_f64(global double* out, double a, double b, double c, int d)
+// CHECK-LABEL: @test_get_local_id(
+// CHECK: tail call i32 @llvm.r600.read.tidig.x(), !range [[WI_RANGE:![0-9]*]]
+// CHECK: tail call i32 @llvm.r600.read.tidig.y(), !range [[WI_RANGE]]
+// CHECK: tail call i32 @llvm.r600.read.tidig.z(), !range [[WI_RANGE]]
+void test_get_local_id(int d, global int *out)
 {
-  *out = __builtin_amdgpu_div_fmas(a, b, c, d);
+	switch (d) {
+	case 0: *out = __builtin_r600_read_tidig_x(); break;
+	case 1: *out = __builtin_r600_read_tidig_y(); break;
+	case 2: *out = __builtin_r600_read_tidig_z(); break;
+	default: *out = 0;
+	}
 }
 
-// CHECK-LABEL: @test_div_fixup_f32
-// CHECK: call float @llvm.AMDGPU.div.fixup.f32
-void test_div_fixup_f32(global float* out, float a, float b, float c)
-{
-  *out = __builtin_amdgpu_div_fixupf(a, b, c);
-}
-
-// CHECK-LABEL: @test_div_fixup_f64
-// CHECK: call double @llvm.AMDGPU.div.fixup.f64
-void test_div_fixup_f64(global double* out, double a, double b, double c)
-{
-  *out = __builtin_amdgpu_div_fixup(a, b, c);
-}
-
-// CHECK-LABEL: @test_trig_preop_f32
-// CHECK: call float @llvm.AMDGPU.trig.preop.f32
-void test_trig_preop_f32(global float* out, float a, int b)
-{
-  *out = __builtin_amdgpu_trig_preopf(a, b);
-}
-
-// CHECK-LABEL: @test_trig_preop_f64
-// CHECK: call double @llvm.AMDGPU.trig.preop.f64
-void test_trig_preop_f64(global double* out, double a, int b)
-{
-  *out = __builtin_amdgpu_trig_preop(a, b);
-}
-
-// CHECK-LABEL: @test_rcp_f32
-// CHECK: call float @llvm.AMDGPU.rcp.f32
-void test_rcp_f32(global float* out, float a)
-{
-  *out = __builtin_amdgpu_rcpf(a);
-}
-
-// CHECK-LABEL: @test_rcp_f64
-// CHECK: call double @llvm.AMDGPU.rcp.f64
-void test_rcp_f64(global double* out, double a)
-{
-  *out = __builtin_amdgpu_rcp(a);
-}
-
-// CHECK-LABEL: @test_rsq_f32
-// CHECK: call float @llvm.AMDGPU.rsq.f32
-void test_rsq_f32(global float* out, float a)
-{
-  *out = __builtin_amdgpu_rsqf(a);
-}
-
-// CHECK-LABEL: @test_rsq_f64
-// CHECK: call double @llvm.AMDGPU.rsq.f64
-void test_rsq_f64(global double* out, double a)
-{
-  *out = __builtin_amdgpu_rsq(a);
-}
-
-// CHECK-LABEL: @test_rsq_clamped_f32
-// CHECK: call float @llvm.AMDGPU.rsq.clamped.f32
-void test_rsq_clamped_f32(global float* out, float a)
-{
-  *out = __builtin_amdgpu_rsq_clampedf(a);
-}
-
-// CHECK-LABEL: @test_rsq_clamped_f64
-// CHECK: call double @llvm.AMDGPU.rsq.clamped.f64
-void test_rsq_clamped_f64(global double* out, double a)
-{
-  *out = __builtin_amdgpu_rsq_clamped(a);
-}
-
-// CHECK-LABEL: @test_ldexp_f32
-// CHECK: call float @llvm.AMDGPU.ldexp.f32
-void test_ldexp_f32(global float* out, float a, int b)
-{
-  *out = __builtin_amdgpu_ldexpf(a, b);
-}
-
-// CHECK-LABEL: @test_ldexp_f64
-// CHECK: call double @llvm.AMDGPU.ldexp.f64
-void test_ldexp_f64(global double* out, double a, int b)
-{
-  *out = __builtin_amdgpu_ldexp(a, b);
-}
-
-// CHECK-LABEL: @test_class_f32
-// CHECK: call i1 @llvm.AMDGPU.class.f32
-void test_class_f32(global float* out, float a, int b)
-{
-  *out = __builtin_amdgpu_classf(a, b);
-}
-
-// CHECK-LABEL: @test_class_f64
-// CHECK: call i1 @llvm.AMDGPU.class.f64
-void test_class_f64(global double* out, double a, int b)
-{
-  *out = __builtin_amdgpu_class(a, b);
-}
+// CHECK-DAG: [[WI_RANGE]] = !{i32 0, i32 1024}
diff --git a/test/CodeGenOpenCL/cast_image.cl b/test/CodeGenOpenCL/cast_image.cl
new file mode 100644
index 0000000..479404a
--- /dev/null
+++ b/test/CodeGenOpenCL/cast_image.cl
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa %s | FileCheck --check-prefix=AMDGCN %s
+// RUN: %clang_cc1 -emit-llvm -o - -triple spir-unknown-unknown %s | FileCheck --check-prefix=SPIR %s
+
+#ifdef __AMDGCN__
+
+constant int* convert(image2d_t img) {
+  // AMDGCN: bitcast %opencl.image2d_ro_t addrspace(2)* %img to i32 addrspace(2)*
+  return __builtin_astype(img, constant int*);
+}
+
+#else
+
+global int* convert(image2d_t img) {
+  // SPIR: bitcast %opencl.image2d_ro_t addrspace(1)* %img to i32 addrspace(1)*
+  return __builtin_astype(img, global int*);
+}
+
+#endif
diff --git a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
new file mode 100644
index 0000000..783ce02
--- /dev/null
+++ b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -0,0 +1,110 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - | FileCheck %s
+
+typedef void (^bl_t)(local void *);
+
+const bl_t block_G = (bl_t) ^ (local void *a) {};
+
+kernel void device_side_enqueue(global int *a, global int *b, int i) {
+  // CHECK: %default_queue = alloca %opencl.queue_t*
+  queue_t default_queue;
+  // CHECK: %flags = alloca i32
+  unsigned flags = 0;
+  // CHECK: %ndrange = alloca %opencl.ndrange_t*
+  ndrange_t ndrange;
+  // CHECK: %clk_event = alloca %opencl.clk_event_t*
+  clk_event_t clk_event;
+  // CHECK: %event_wait_list = alloca %opencl.clk_event_t*
+  clk_event_t event_wait_list;
+  // CHECK: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*]
+  clk_event_t event_wait_list2[] = {clk_event};
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(3)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__enqueue_kernel_basic(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i8* [[BL_I8]])
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(void) {
+                   a[i] = b[i];
+                 });
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(3)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i32 2, %opencl.clk_event_t** %event_wait_list, %opencl.clk_event_t** %clk_event, i8* [[BL_I8]])
+  enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
+                 ^(void) {
+                   a[i] = b[i];
+                 });
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i8*, i32, ...) @__enqueue_kernel_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(3)* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 256)
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *p) {
+                   return;
+                 },
+                 256);
+  char c;
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[SIZE:%[0-9]+]] = zext i8 {{%[0-9]+}} to i32
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i8*, i32, ...) @__enqueue_kernel_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(3)* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 [[SIZE]])
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *p) {
+                   return;
+                 },
+                 c);
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, i8*, i32, ...) @__enqueue_kernel_events_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i32 2, %opencl.clk_event_t** [[AD]], %opencl.clk_event_t** %clk_event, i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(3)* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 256)
+  enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
+                 ^(local void *p) {
+                   return;
+                 },
+                 256);
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+  // CHECK: [[SIZE:%[0-9]+]] = zext i8 {{%[0-9]+}} to i32
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, i8*, i32, ...) @__enqueue_kernel_events_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i32 2, %opencl.clk_event_t** [[AD]], %opencl.clk_event_t** %clk_event, i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(3)* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 [[SIZE]])
+  enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
+                 ^(local void *p) {
+                   return;
+                 },
+                 c);
+
+  void (^const block_A)(void) = ^{
+    return;
+  };
+  void (^const block_B)(local void *) = ^(local void *a) {
+    return;
+  };
+
+  // CHECK: [[BL:%[0-9]+]] = load void ()*, void ()** %block_A
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_work_group_size_impl(i8* [[BL_I8]])
+  unsigned size = get_kernel_work_group_size(block_A);
+  // CHECK: [[BL:%[0-9]+]] = load void (i8 addrspace(2)*)*, void (i8 addrspace(2)*)** %block_B
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void (i8 addrspace(2)*)* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_work_group_size_impl(i8* [[BL_I8]])
+  size = get_kernel_work_group_size(block_B);
+  // CHECK: [[BL:%[0-9]+]] = load void ()*, void ()** %block_A
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8* [[BL_I8]])
+  size = get_kernel_preferred_work_group_size_multiple(block_A);
+  // CHECK: [[BL:%[0-9]+]] = load void (i8 addrspace(2)*)*, void (i8 addrspace(2)*)* addrspace(1)* @block_G
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void (i8 addrspace(2)*)* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8* [[BL_I8]])
+  size = get_kernel_preferred_work_group_size_multiple(block_G);
+}
diff --git a/test/CodeGenOpenCL/constant-addr-space-globals.cl b/test/CodeGenOpenCL/constant-addr-space-globals.cl
index 92fb979..f81a514 100644
--- a/test/CodeGenOpenCL/constant-addr-space-globals.cl
+++ b/test/CodeGenOpenCL/constant-addr-space-globals.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -ffake-address-space-map -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -cl-opt-disable -ffake-address-space-map -emit-llvm -o - | FileCheck %s
 
 // CHECK: @array = addrspace({{[0-9]+}}) constant
 __constant float array[2] = {0.0f, 1.0f};
diff --git a/test/CodeGenOpenCL/denorms-are-zero.cl b/test/CodeGenOpenCL/denorms-are-zero.cl
index 488004f..4262730 100644
--- a/test/CodeGenOpenCL/denorms-are-zero.cl
+++ b/test/CodeGenOpenCL/denorms-are-zero.cl
@@ -1,5 +1,13 @@
 // RUN: %clang_cc1 -S -cl-denorms-are-zero -o - %s 2>&1
+// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck %s --check-prefix=CHECK-DENORM
 
-// This test just checks that the -cl-denorms-are-zero argument is accepted
+// For non-amdgcn targets, this test just checks that the -cl-denorms-are-zero argument is accepted
 // by clang.  This option is currently a no-op, which is allowed by the
 // OpenCL specification.
+
+// CHECK-DENORM-LABEL: define void @f()
+// CHECK-DENORM: attributes #{{[0-9]*}} = {{{[^}]*}} "target-features"="{{[^"]*}}+fp32-denormals,+fp64-denormals{{[^"]*}}"
+// CHECK-LABEL: define void @f()
+// CHECK-NOT: attributes #{{[0-9]*}} = {{{[^}]*}} "target-features"="{{[^"]*}}+fp32-denormals,+fp64-denormals{{[^"]*}}"
+void f() {}
diff --git a/test/CodeGenOpenCL/event_t.cl b/test/CodeGenOpenCL/event_t.cl
index a84d8bb..aad441f 100644
--- a/test/CodeGenOpenCL/event_t.cl
+++ b/test/CodeGenOpenCL/event_t.cl
@@ -9,4 +9,6 @@
 // CHECK: call {{.*}}void @foo(%opencl.event_t* %
   foo(0);
 // CHECK: call {{.*}}void @foo(%opencl.event_t* null)
+  foo((event_t)0);
+// CHECK: call {{.*}}void @foo(%opencl.event_t* null)
 }
diff --git a/test/CodeGenOpenCL/fpmath.cl b/test/CodeGenOpenCL/fpmath.cl
index ef4da84..780c95f 100644
--- a/test/CodeGenOpenCL/fpmath.cl
+++ b/test/CodeGenOpenCL/fpmath.cl
@@ -1,16 +1,23 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck --check-prefix=CHECK --check-prefix=NODIVOPT %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -cl-fp32-correctly-rounded-divide-sqrt | FileCheck --check-prefix=CHECK --check-prefix=DIVOPT %s
 
 typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 float spscalardiv(float a, float b) {
   // CHECK: @spscalardiv
-  // CHECK: fdiv{{.*}}, !fpmath ![[MD:[0-9]+]]
+  // CHECK: #[[ATTR:[0-9]+]]
+  // CHECK: fdiv{{.*}},
+  // NODIVOPT: !fpmath ![[MD:[0-9]+]]
+  // DIVOPT-NOT: !fpmath ![[MD:[0-9]+]]
   return a / b;
 }
 
 float4 spvectordiv(float4 a, float4 b) {
   // CHECK: @spvectordiv
-  // CHECK: fdiv{{.*}}, !fpmath ![[MD]]
+  // CHECK: #[[ATTR]]
+  // CHECK: fdiv{{.*}},
+  // NODIVOPT: !fpmath ![[MD]]
+  // DIVOPT-NOT: !fpmath ![[MD]]
   return a / b;
 }
 
@@ -18,8 +25,13 @@
 
 double dpscalardiv(double a, double b) {
   // CHECK: @dpscalardiv
+  // CHECK: #[[ATTR]]
   // CHECK-NOT: !fpmath
   return a / b;
 }
 
-// CHECK: ![[MD]] = !{float 2.500000e+00}
+// CHECK: attributes #[[ATTR]] = {
+// NODIVOPT: "correctly-rounded-divide-sqrt-fp-math"="false"
+// DIVOPT: "correctly-rounded-divide-sqrt-fp-math"="true"
+// CHECK: }
+// NODIVOPT: ![[MD]] = !{float 2.500000e+00}
diff --git a/test/CodeGenOpenCL/half.cl b/test/CodeGenOpenCL/half.cl
index bd5ae7f..9acabf0 100644
--- a/test/CodeGenOpenCL/half.cl
+++ b/test/CodeGenOpenCL/half.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/test/CodeGenOpenCL/images.cl b/test/CodeGenOpenCL/images.cl
new file mode 100644
index 0000000..eb054ec
--- /dev/null
+++ b/test/CodeGenOpenCL/images.cl
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -O0 -emit-llvm -o - | FileCheck %s
+
+__attribute__((overloadable)) void read_image(read_only image1d_t img_ro);
+__attribute__((overloadable)) void read_image(write_only image1d_t img_wo);
+
+kernel void test_read_image(read_only image1d_t img_ro, write_only image1d_t img_wo) {
+  // CHECK: call void @_Z10read_image14ocl_image1d_ro(%opencl.image1d_ro_t* %{{[0-9]+}})
+  read_image(img_ro);
+  // CHECK: call void @_Z10read_image14ocl_image1d_wo(%opencl.image1d_wo_t* %{{[0-9]+}})
+  read_image(img_wo);
+}
diff --git a/test/CodeGenOpenCL/kernel-arg-info.cl b/test/CodeGenOpenCL/kernel-arg-info.cl
index 4bc191e..5a5c8f9 100644
--- a/test/CodeGenOpenCL/kernel-arg-info.cl
+++ b/test/CodeGenOpenCL/kernel-arg-info.cl
@@ -1,55 +1,88 @@
-// RUN: %clang_cc1 %s -cl-kernel-arg-info -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s -check-prefix ARGINFO
-// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s -check-prefix NO-ARGINFO
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -cl-kernel-arg-info | FileCheck %s -check-prefix ARGINFO
 
 kernel void foo(__global int * restrict X, const int Y, 
                 volatile int anotherArg, __constant float * restrict Z) {
   *X = Y + anotherArg;
 }
-
-// CHECK:  !{!"kernel_arg_addr_space", i32 1, i32 0, i32 0, i32 2}
-// CHECK:  !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
-// CHECK:  !{!"kernel_arg_type", !"int*", !"int", !"int", !"float*"}
-// CHECK:  !{!"kernel_arg_base_type", !"int*", !"int", !"int", !"float*"}
-// CHECK:  !{!"kernel_arg_type_qual", !"restrict", !"const", !"volatile", !"restrict const"}
-// ARGINFO: !{!"kernel_arg_name", !"X", !"Y", !"anotherArg", !"Z"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"X", !"Y", !"anotherArg", !"Z"}
+// CHECK: define spir_kernel void @foo{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD11:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD12:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD13:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD13]]
+// CHECK: !kernel_arg_type_qual ![[MD14:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD15:[0-9]+]]
 
 kernel void foo2(read_only image1d_t img1, image2d_t img2, write_only image2d_array_t img3) {
 }
-// CHECK:  !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
-// CHECK:  !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"write_only"}
-// CHECK:  !{!"kernel_arg_type", !"image1d_t", !"image2d_t", !"image2d_array_t"}
-// CHECK:  !{!"kernel_arg_base_type", !"image1d_t", !"image2d_t", !"image2d_array_t"}
-// CHECK:  !{!"kernel_arg_type_qual", !"", !"", !""}
-// ARGINFO: !{!"kernel_arg_name", !"img1", !"img2", !"img3"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"img1", !"img2", !"img3"}
+// CHECK: define spir_kernel void @foo2{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD21:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD22:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD23:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD23]]
+// CHECK: !kernel_arg_type_qual ![[MD24:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD25:[0-9]+]]
 
 kernel void foo3(__global half * X) {
 }
-// CHECK:  !{!"kernel_arg_addr_space", i32 1}
-// CHECK:  !{!"kernel_arg_access_qual", !"none"}
-// CHECK:  !{!"kernel_arg_type", !"half*"}
-// CHECK:  !{!"kernel_arg_base_type", !"half*"}
-// CHECK:  !{!"kernel_arg_type_qual", !""}
-// ARGINFO: !{!"kernel_arg_name", !"X"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"X"}
+// CHECK: define spir_kernel void @foo3{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD31:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD32:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD33:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD33]]
+// CHECK: !kernel_arg_type_qual ![[MD34:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD35:[0-9]+]]
 
 typedef unsigned int myunsignedint;
 kernel void foo4(__global unsigned int * X, __global myunsignedint * Y) {
 }
-// CHECK:  !{!"kernel_arg_addr_space", i32 1, i32 1}
-// CHECK:  !{!"kernel_arg_access_qual", !"none", !"none"}
-// CHECK:  !{!"kernel_arg_type", !"uint*", !"myunsignedint*"}
-// CHECK:  !{!"kernel_arg_base_type", !"uint*", !"uint*"}
-// CHECK:  !{!"kernel_arg_type_qual", !"", !""}
-// ARGINFO: !{!"kernel_arg_name", !"X", !"Y"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"X", !"Y"}
+// CHECK: define spir_kernel void @foo4{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD42:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD43:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD44:[0-9]+]]
+// CHECK: !kernel_arg_type_qual ![[MD45:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD46:[0-9]+]]
 
 typedef image1d_t myImage;
-kernel void foo5(read_only myImage img1, write_only image1d_t img2) {
+kernel void foo5(myImage img1, write_only image1d_t img2) {
 }
-// CHECK:  !{!"kernel_arg_access_qual", !"read_only", !"write_only"}
-// CHECK:  !{!"kernel_arg_type", !"myImage", !"image1d_t"}
-// CHECK:  !{!"kernel_arg_base_type", !"image1d_t", !"image1d_t"}
-// ARGINFO: !{!"kernel_arg_name", !"img1", !"img2"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"img1", !"img2"}
+// CHECK: define spir_kernel void @foo5{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD51:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD52:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD53:[0-9]+]]
+// CHECK: !kernel_arg_type_qual ![[MD45]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD54:[0-9]+]]
+
+// CHECK: ![[MD11]] = !{i32 1, i32 0, i32 0, i32 2}
+// CHECK: ![[MD12]] = !{!"none", !"none", !"none", !"none"}
+// CHECK: ![[MD13]] = !{!"int*", !"int", !"int", !"float*"}
+// CHECK: ![[MD14]] = !{!"restrict", !"const", !"volatile", !"restrict const"}
+// ARGINFO: ![[MD15]] = !{!"X", !"Y", !"anotherArg", !"Z"}
+// CHECK: ![[MD21]] = !{i32 1, i32 1, i32 1}
+// CHECK: ![[MD22]] = !{!"read_only", !"read_only", !"write_only"}
+// CHECK: ![[MD23]] = !{!"__read_only image1d_t", !"__read_only image2d_t", !"__write_only image2d_array_t"}
+// CHECK: ![[MD24]] = !{!"", !"", !""}
+// ARGINFO: ![[MD25]] = !{!"img1", !"img2", !"img3"}
+// CHECK: ![[MD31]] = !{i32 1}
+// CHECK: ![[MD32]] = !{!"none"}
+// CHECK: ![[MD33]] = !{!"half*"}
+// CHECK: ![[MD34]] = !{!""}
+// ARGINFO: ![[MD35]] = !{!"X"}
+// CHECK: ![[MD41]] = !{i32 1, i32 1}
+// CHECK: ![[MD42]] = !{!"none", !"none"}
+// CHECK: ![[MD43]] = !{!"uint*", !"myunsignedint*"}
+// CHECK: ![[MD44]] = !{!"uint*", !"uint*"}
+// CHECK: ![[MD45]] = !{!"", !""}
+// ARGINFO: ![[MD46]] = !{!"X", !"Y"}
+// CHECK: ![[MD51]] = !{!"read_only", !"write_only"}
+// CHECK: ![[MD52]] = !{!"myImage", !"__write_only image1d_t"}
+// CHECK: ![[MD53]] = !{!"__read_only image1d_t", !"__write_only image1d_t"}
+// ARGINFO: ![[MD54]] = !{!"img1", !"img2"}
+
diff --git a/test/CodeGenOpenCL/kernel-attributes.cl b/test/CodeGenOpenCL/kernel-attributes.cl
index 8f22d61..4a116dd 100644
--- a/test/CodeGenOpenCL/kernel-attributes.cl
+++ b/test/CodeGenOpenCL/kernel-attributes.cl
@@ -3,14 +3,12 @@
 typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
 
 kernel  __attribute__((vec_type_hint(int))) __attribute__((reqd_work_group_size(1,2,4))) void kernel1(int a) {}
+// CHECK: define void @kernel1(i32 {{[^%]*}}%a) {{[^{]+}} !vec_type_hint ![[MD1:[0-9]+]] !reqd_work_group_size ![[MD2:[0-9]+]]
 
 kernel __attribute__((vec_type_hint(uint4))) __attribute__((work_group_size_hint(8,16,32))) void kernel2(int a) {}
+// CHECK: define void @kernel2(i32 {{[^%]*}}%a) {{[^{]+}} !vec_type_hint ![[MD3:[0-9]+]] !work_group_size_hint ![[MD4:[0-9]+]]
 
-// CHECK: opencl.kernels = !{[[MDNODE0:![0-9]+]], [[MDNODE3:![0-9]+]]}
-
-// CHECK: [[MDNODE0]] = !{void (i32)* @kernel1, {{.*}} [[MDNODE1:![0-9]+]], [[MDNODE2:![0-9]+]]}
-// CHECK: [[MDNODE1]] = !{!"vec_type_hint", i32 undef, i32 1}
-// CHECK: [[MDNODE2]] = !{!"reqd_work_group_size", i32 1, i32 2, i32 4}
-// CHECK: [[MDNODE3]] = !{void (i32)* @kernel2, {{.*}} [[MDNODE4:![0-9]+]], [[MDNODE5:![0-9]+]]}
-// CHECK: [[MDNODE4]] = !{!"vec_type_hint", <4 x i32> undef, i32 0}
-// CHECK: [[MDNODE5]] = !{!"work_group_size_hint", i32 8, i32 16, i32 32}
+// CHECK: [[MD1]] = !{i32 undef, i32 1}
+// CHECK: [[MD2]] = !{i32 1, i32 2, i32 4}
+// CHECK: [[MD3]] = !{<4 x i32> undef, i32 0}
+// CHECK: [[MD4]] = !{i32 8, i32 16, i32 32}
diff --git a/test/CodeGenOpenCL/kernel-metadata.cl b/test/CodeGenOpenCL/kernel-metadata.cl
index ef3758f..4165f1f 100644
--- a/test/CodeGenOpenCL/kernel-metadata.cl
+++ b/test/CodeGenOpenCL/kernel-metadata.cl
@@ -6,10 +6,5 @@
 __kernel void kernel_function() {
 }
 
-// CHECK: !opencl.kernels = !{!0}
-// CHECK: !0 = !{void ()* @kernel_function, !1, !2, !3, !4, !5}
-// CHECK: !1 = !{!"kernel_arg_addr_space"}
-// CHECK: !2 = !{!"kernel_arg_access_qual"}
-// CHECK: !3 = !{!"kernel_arg_type"}
-// CHECK: !4 = !{!"kernel_arg_base_type"}
-// CHECK: !5 = !{!"kernel_arg_type_qual"}
+// CHECK: define void @kernel_function() {{[^{]+}} !kernel_arg_addr_space ![[MD:[0-9]+]] !kernel_arg_access_qual ![[MD]] !kernel_arg_type ![[MD]] !kernel_arg_base_type ![[MD]] !kernel_arg_type_qual ![[MD]] {
+// CHECK: ![[MD]] = !{}
diff --git a/test/CodeGenOpenCL/no-signed-zeros.cl b/test/CodeGenOpenCL/no-signed-zeros.cl
new file mode 100644
index 0000000..14f6411
--- /dev/null
+++ b/test/CodeGenOpenCL/no-signed-zeros.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
+// RUN: %clang_cc1 %s -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NO-SIGNED-ZEROS
+
+float signedzeros(float a) {
+  return a;
+}
+
+// CHECK: attributes
+// NORMAL: "no-signed-zeros-fp-math"="false"
+// NO-SIGNED-ZEROS: "no-signed-zeros-fp-math"="true"
diff --git a/test/CodeGenOpenCL/opencl_types.cl b/test/CodeGenOpenCL/opencl_types.cl
index 5f4ebb8..73c57b7 100644
--- a/test/CodeGenOpenCL/opencl_types.cl
+++ b/test/CodeGenOpenCL/opencl_types.cl
@@ -1,40 +1,54 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 | FileCheck %s
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-SPIR
+// RUN: %clang_cc1 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-AMDGCN
 
-constant sampler_t glb_smp = 7;
-// CHECK: constant i32 7
+#define CLK_ADDRESS_CLAMP_TO_EDGE       2
+#define CLK_NORMALIZED_COORDS_TRUE      1
+#define CLK_FILTER_NEAREST              0x10
+#define CLK_FILTER_LINEAR               0x20
+
+constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE|CLK_NORMALIZED_COORDS_TRUE|CLK_FILTER_NEAREST;
+// CHECK-SPIR-NOT: constant i32
 
 void fnc1(image1d_t img) {}
-// CHECK: @fnc1(%opencl.image1d_t*
+// CHECK-SPIR: @fnc1(%opencl.image1d_ro_t addrspace(1)*
+// CHECK-AMDGCN: @fnc1(%opencl.image1d_ro_t addrspace(2)*
 
 void fnc1arr(image1d_array_t img) {}
-// CHECK: @fnc1arr(%opencl.image1d_array_t*
+// CHECK-SPIR: @fnc1arr(%opencl.image1d_array_ro_t addrspace(1)*
+// CHECK-AMDGCN: @fnc1arr(%opencl.image1d_array_ro_t addrspace(2)*
 
 void fnc1buff(image1d_buffer_t img) {}
-// CHECK: @fnc1buff(%opencl.image1d_buffer_t*
+// CHECK-SPIR: @fnc1buff(%opencl.image1d_buffer_ro_t addrspace(1)*
+// CHECK-AMDGCN: @fnc1buff(%opencl.image1d_buffer_ro_t addrspace(2)*
 
 void fnc2(image2d_t img) {}
-// CHECK: @fnc2(%opencl.image2d_t*
+// CHECK-SPIR: @fnc2(%opencl.image2d_ro_t addrspace(1)*
+// CHECK-AMDGCN: @fnc2(%opencl.image2d_ro_t addrspace(2)*
 
 void fnc2arr(image2d_array_t img) {}
-// CHECK: @fnc2arr(%opencl.image2d_array_t*
+// CHECK-SPIR: @fnc2arr(%opencl.image2d_array_ro_t addrspace(1)*
+// CHECK-AMDGCN: @fnc2arr(%opencl.image2d_array_ro_t addrspace(2)*
 
 void fnc3(image3d_t img) {}
-// CHECK: @fnc3(%opencl.image3d_t*
+// CHECK-SPIR: @fnc3(%opencl.image3d_ro_t addrspace(1)*
+// CHECK-AMDGCN: @fnc3(%opencl.image3d_ro_t addrspace(2)*
 
 void fnc4smp(sampler_t s) {}
-// CHECK-LABEL: define {{.*}}void @fnc4smp(i32
+// CHECK-SPIR-LABEL: define {{.*}}void @fnc4smp(%opencl.sampler_t addrspace(2)*
+// CHECK-AMDGCN-LABEL: define {{.*}}void @fnc4smp(%opencl.sampler_t addrspace(2)*
 
 kernel void foo(image1d_t img) {
-	sampler_t smp = 5;
-// CHECK: alloca i32
-	event_t evt;
-// CHECK: alloca %opencl.event_t*
-// CHECK: store i32 5,
+  sampler_t smp = CLK_ADDRESS_CLAMP_TO_EDGE|CLK_NORMALIZED_COORDS_TRUE|CLK_FILTER_LINEAR;
+  // CHECK-SPIR: alloca %opencl.sampler_t addrspace(2)*
+  event_t evt;
+  // CHECK-SPIR: alloca %opencl.event_t*
+  // CHECK-SPIR: store %opencl.sampler_t addrspace(2)*
   fnc4smp(smp);
-// CHECK: call {{.*}}void @fnc4smp(i32
+  // CHECK-SPIR: call {{.*}}void @fnc4smp(%opencl.sampler_t addrspace(2)*
   fnc4smp(glb_smp);
-// CHECK: call {{.*}}void @fnc4smp(i32
+  // CHECK-SPIR: call {{.*}}void @fnc4smp(%opencl.sampler_t addrspace(2)*
 }
 
-void __attribute__((overloadable)) bad1(image1d_t *b, image2d_t *c, image2d_t *d) {}
-// CHECK-LABEL: @{{_Z4bad1P11ocl_image1dP11ocl_image2dS2_|"\\01\?bad1@@\$\$J0YAXPE?APAUocl_image1d@@PE?APAUocl_image2d@@1@Z"}}
+void __attribute__((overloadable)) bad1(image1d_t b, image2d_t c, image2d_t d) {}
+// CHECK-SPIR-LABEL: @{{_Z4bad114ocl_image1d_ro14ocl_image2d_roS0_|"\\01\?bad1@@\$\$J0YAXPAUocl_image1d_ro@@PAUocl_image2d_ro@@1@Z"}}
+// CHECK-AMDGCN-LABEL: @{{_Z4bad114ocl_image1d_ro14ocl_image2d_roS0_|"\\01\?bad1@@\$\$J0YAXPAUocl_image1d_ro@@PAUocl_image2d_ro@@1@Z"}}(%opencl.image1d_ro_t addrspace(2)*{{.*}}%opencl.image2d_ro_t addrspace(2)*{{.*}}%opencl.image2d_ro_t addrspace(2)*{{.*}})
diff --git a/test/CodeGenOpenCL/pipe_builtin.cl b/test/CodeGenOpenCL/pipe_builtin.cl
new file mode 100644
index 0000000..db6893e
--- /dev/null
+++ b/test/CodeGenOpenCL/pipe_builtin.cl
@@ -0,0 +1,61 @@
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+// CHECK: %opencl.pipe_t = type opaque
+// CHECK: %opencl.reserve_id_t = type opaque
+
+void test1(read_only pipe int p, global int *ptr) {
+  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+  read_pipe(p, ptr);
+  // CHECK: call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = reserve_read_pipe(p, 2);
+  // CHECK: call i32 @__read_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}})
+  read_pipe(p, rid, 2, ptr);
+  // CHECK: call void @__commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  commit_read_pipe(p, rid);
+}
+
+void test2(write_only pipe int p, global int *ptr) {
+  // CHECK: call i32 @__write_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+  write_pipe(p, ptr);
+  // CHECK: call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = reserve_write_pipe(p, 2);
+  // CHECK: call i32 @__write_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}})
+  write_pipe(p, rid, 2, ptr);
+  // CHECK: call void @__commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  commit_write_pipe(p, rid);
+}
+
+void test3(read_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = work_group_reserve_read_pipe(p, 2);
+  // CHECK: call void @__work_group_commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  work_group_commit_read_pipe(p, rid);
+}
+
+void test4(write_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = work_group_reserve_write_pipe(p, 2);
+  // CHECK: call void @__work_group_commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  work_group_commit_write_pipe(p, rid);
+}
+
+void test5(read_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = sub_group_reserve_read_pipe(p, 2);
+  // CHECK: call void @__sub_group_commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  sub_group_commit_read_pipe(p, rid);
+}
+
+void test6(write_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = sub_group_reserve_write_pipe(p, 2);
+  // CHECK: call void @__sub_group_commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  sub_group_commit_write_pipe(p, rid);
+}
+
+void test7(write_only pipe int p, global int *ptr) {
+  // CHECK: call i32 @__get_pipe_num_packets(%opencl.pipe_t* %{{.*}})
+  *ptr = get_pipe_num_packets(p);
+  // CHECK: call i32 @__get_pipe_max_packets(%opencl.pipe_t* %{{.*}})
+  *ptr = get_pipe_max_packets(p);
+}
diff --git a/test/CodeGenOpenCL/pipe_types.cl b/test/CodeGenOpenCL/pipe_types.cl
index 547071c..b9c411b 100644
--- a/test/CodeGenOpenCL/pipe_types.cl
+++ b/test/CodeGenOpenCL/pipe_types.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
 
 // CHECK: %opencl.pipe_t = type opaque
 typedef unsigned char __attribute__((ext_vector_type(3))) uchar3;
@@ -25,3 +25,23 @@
 void test5(read_only pipe int4 p) {
 // CHECK: define void @test5(%opencl.pipe_t* %p)
 }
+
+typedef read_only pipe int MyPipe;
+kernel void test6(MyPipe p) {
+// CHECK: define void @test6(%opencl.pipe_t* %p)
+}
+
+struct Person {
+  const char *Name;
+  bool isFemale;
+  int ID;
+};
+
+void test_reserved_read_pipe(global struct Person *SDst,
+                             read_only pipe struct Person SPipe) {
+// CHECK: define void @test_reserved_read_pipe
+  read_pipe (SPipe, SDst);
+  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+  read_pipe (SPipe, SDst);
+  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+}
diff --git a/test/CodeGenOpenCL/sampler.cl b/test/CodeGenOpenCL/sampler.cl
new file mode 100644
index 0000000..6fc8c2c
--- /dev/null
+++ b/test/CodeGenOpenCL/sampler.cl
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple spir-unknown-unknown -o - -O0 | FileCheck %s
+//
+// This test covers 5 cases of sampler initialization:
+//   1. function argument passing
+//      1a. argument is a file-scope variable
+//      1b. argument is a function-scope variable
+//      1c. argument is one of caller function's parameters
+//   2. variable initialization
+//      2a. initializing a file-scope variable
+//      2b. initializing a function-scope variable
+
+#define CLK_ADDRESS_CLAMP_TO_EDGE       2
+#define CLK_NORMALIZED_COORDS_TRUE      1
+#define CLK_FILTER_NEAREST              0x10
+#define CLK_FILTER_LINEAR               0x20
+
+// CHECK: %opencl.sampler_t = type opaque
+
+// Case 2a
+constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
+// CHECK-NOT: glb_smp
+
+void fnc4smp(sampler_t s) {}
+// CHECK: define spir_func void @fnc4smp(%opencl.sampler_t addrspace(2)* %
+
+kernel void foo(sampler_t smp_par) {
+  // CHECK-LABEL: define spir_kernel void @foo(%opencl.sampler_t addrspace(2)* %smp_par)
+  // CHECK: [[smp_par_ptr:%[A-Za-z0-9_\.]+]] = alloca %opencl.sampler_t addrspace(2)*
+
+  // Case 2b
+  sampler_t smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_NEAREST;
+  // CHECK: [[smp_ptr:%[A-Za-z0-9_\.]+]] = alloca %opencl.sampler_t addrspace(2)*
+  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 19)
+  // CHECK: store %opencl.sampler_t addrspace(2)* [[SAMP]], %opencl.sampler_t addrspace(2)** [[smp_ptr]]
+
+  // Case 1b
+  fnc4smp(smp);
+  // CHECK-NOT: call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 19)
+  // CHECK: [[SAMP:%[0-9]+]] = load %opencl.sampler_t addrspace(2)*, %opencl.sampler_t addrspace(2)** [[smp_ptr]]
+  // CHECK: call spir_func void @fnc4smp(%opencl.sampler_t addrspace(2)* [[SAMP]])
+
+  // Case 1b
+  fnc4smp(smp);
+  // CHECK-NOT: call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 19)
+  // CHECK: [[SAMP:%[0-9]+]] = load %opencl.sampler_t addrspace(2)*, %opencl.sampler_t addrspace(2)** [[smp_ptr]]
+  // CHECK: call spir_func void @fnc4smp(%opencl.sampler_t addrspace(2)* [[SAMP]])
+
+  // Case 1a
+  fnc4smp(glb_smp);
+  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
+  // CHECK: call spir_func void @fnc4smp(%opencl.sampler_t addrspace(2)* [[SAMP]])
+
+  // Case 1c
+  fnc4smp(smp_par);
+  // CHECK: [[SAMP:%[0-9]+]] = load %opencl.sampler_t addrspace(2)*, %opencl.sampler_t addrspace(2)** [[smp_par_ptr]]
+  // CHECK: call spir_func void @fnc4smp(%opencl.sampler_t addrspace(2)* [[SAMP]])
+}
diff --git a/test/CodeGenOpenCL/shifts.cl b/test/CodeGenOpenCL/shifts.cl
index ab64051..14cd7af 100644
--- a/test/CodeGenOpenCL/shifts.cl
+++ b/test/CodeGenOpenCL/shifts.cl
@@ -5,7 +5,7 @@
 // bits before evaluating. Test this both for variables and constants
 // evaluated in the front-end.
 
-// OPT: @gtest1 = constant i64 2147483648
+// OPT: @gtest1 = local_unnamed_addr constant i64 2147483648
 __constant const unsigned long gtest1 = 1UL << 31;
 
 // NOOPT: @negativeShift32
diff --git a/test/CodeGenOpenCL/spir_version.cl b/test/CodeGenOpenCL/spir_version.cl
new file mode 100644
index 0000000..54c851a
--- /dev/null
+++ b/test/CodeGenOpenCL/spir_version.cl
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-SPIR-CL10
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - -cl-std=CL1.2 | FileCheck %s --check-prefix=CHECK-SPIR-CL12
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - -cl-std=CL2.0 | FileCheck %s --check-prefix=CHECK-SPIR-CL20
+// RUN: %clang_cc1 %s -triple "spir64-unknown-unknown" -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-SPIR-CL10
+// RUN: %clang_cc1 %s -triple "spir64-unknown-unknown" -emit-llvm -o - -cl-std=CL1.2 | FileCheck %s --check-prefix=CHECK-SPIR-CL12
+// RUN: %clang_cc1 %s -triple "spir64-unknown-unknown" -emit-llvm -o - -cl-std=CL2.0 | FileCheck %s --check-prefix=CHECK-SPIR-CL20
+
+// RUN: %clang_cc1 %s -triple "amdgcn--amdhsa" -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-AMDGCN-CL10
+// RUN: %clang_cc1 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -cl-std=CL1.2 | FileCheck %s --check-prefix=CHECK-AMDGCN-CL12
+// RUN: %clang_cc1 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -cl-std=CL2.0 | FileCheck %s --check-prefix=CHECK-AMDGCN-CL20
+
+kernel void foo() {}
+
+// CHECK-SPIR-CL10: !opencl.spir.version = !{[[SPIR:![0-9]+]]}
+// CHECK-SPIR-CL10: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CHECK-SPIR-CL10: [[SPIR]] = !{i32 2, i32 0}
+// CHECK-SPIR-CL10: [[OCL]] = !{i32 1, i32 0}
+// CHECK-SPIR-CL12: !opencl.spir.version = !{[[SPIR:![0-9]+]]}
+// CHECK-SPIR-CL12: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CHECK-SPIR-CL12: [[SPIR]] = !{i32 2, i32 0}
+// CHECK-SPIR-CL12: [[OCL]] = !{i32 1, i32 2}
+// CHECK-SPIR-CL20: !opencl.spir.version = !{[[SPIR:![0-9]+]]}
+// CHECK-SPIR-CL20: !opencl.ocl.version = !{[[SPIR:![0-9]+]]}
+// CHECK-SPIR-CL20: [[SPIR]] = !{i32 2, i32 0}
+
+// CHECK-AMDGCN-CL10: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CHECK-AMDGCN-CL10: [[OCL]] = !{i32 1, i32 0}
+// CHECK-AMDGCN-CL12: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CHECK-AMDGCN-CL12: [[OCL]] = !{i32 1, i32 2}
+// CHECK-AMDGCN-CL20: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CHECK-AMDGCN-CL20: [[OCL]] = !{i32 2, i32 0}
diff --git a/test/CodeGenOpenCL/str_literals.cl b/test/CodeGenOpenCL/str_literals.cl
index 092b637..1c0acd1 100644
--- a/test/CodeGenOpenCL/str_literals.cl
+++ b/test/CodeGenOpenCL/str_literals.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -ffake-address-space-map | FileCheck %s
+// RUN: %clang_cc1 %s -cl-opt-disable -emit-llvm -o - -ffake-address-space-map | FileCheck %s
 
 __constant char * __constant x = "hello world";
 __constant char * __constant y = "hello world";
diff --git a/test/CodeGenOpenCL/to_addr_builtin.cl b/test/CodeGenOpenCL/to_addr_builtin.cl
new file mode 100644
index 0000000..72c09da
--- /dev/null
+++ b/test/CodeGenOpenCL/to_addr_builtin.cl
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+// CHECK: %[[A:.*]] = type { float, float, float }
+typedef struct {
+  float x,y,z;
+} A;
+typedef private A *PA;
+typedef global A *GA;
+
+void test(void) {
+  global int *glob;
+  local int *loc;
+  private int *priv;
+  generic int *gen;
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(1)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @__to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(glob);
+  
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(3)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @__to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(loc);
+ 
+  //CHECK: %[[ARG:.*]] = addrspacecast i32* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @__to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(priv);
+ 
+  //CHECK: %[[ARG:.*]] = bitcast i32 addrspace(4)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @__to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(gen);
+  
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(1)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @__to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(glob);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(3)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @__to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(loc);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @__to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(priv);
+
+  //CHECK: %[[ARG:.*]] = bitcast i32 addrspace(4)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @__to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(gen);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(1)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @__to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(glob);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(3)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @__to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(loc);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @__to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(priv);
+
+  //CHECK: %[[ARG:.*]] = bitcast i32 addrspace(4)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @__to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(gen);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast %[[A]]* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @__to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to %[[A]] addrspace(1)*
+  PA pA;
+  GA gA = to_global(pA);
+
+  //CHECK-NOT: addrspacecast
+  //CHECK-NOT: bitcast
+  //CHECK: call i8 addrspace(1)* @__to_global(i8 addrspace(4)* %{{.*}})
+  //CHECK-NOT: addrspacecast
+  //CHECK-NOT: bitcast
+  generic void *gen_v;
+  global void *glob_v = to_global(gen_v);
+}
diff --git a/test/CodeGenOpenCL/unroll-hint.cl b/test/CodeGenOpenCL/unroll-hint.cl
new file mode 100644
index 0000000..a86762e
--- /dev/null
+++ b/test/CodeGenOpenCL/unroll-hint.cl
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+/*** for loops: opencl_unroll_hint with count 8, count 1 (disable), and no count (full) ***/
+void for_count()
+{
+// CHECK-LABEL: for_count
+    __attribute__((opencl_unroll_hint(8)))
+    for( int i = 0; i < 1000; ++i);
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_COUNT:.*]]
+}
+
+void for_disable()
+{
+// CHECK-LABEL: for_disable
+    __attribute__((opencl_unroll_hint(1)))
+    for( int i = 0; i < 1000; ++i);
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_DISABLE:.*]]
+}
+
+void for_full()
+{
+// CHECK-LABEL: for_full
+    __attribute__((opencl_unroll_hint))
+    for( int i = 0; i < 1000; ++i);
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_FULL:.*]]
+}
+
+/*** while loops: opencl_unroll_hint with count 8, count 1 (disable), and no count (full) ***/
+void while_count()
+{
+// CHECK-LABEL: while_count
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(8)))
+    while(i-->0);
+// CHECK: br label %{{.*}}, !llvm.loop ![[WHILE_COUNT:.*]]
+}
+
+void while_disable()
+{
+// CHECK-LABEL: while_disable
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(1)))
+    while(i-->0);
+// CHECK: br label %{{.*}}, !llvm.loop ![[WHILE_DISABLE:.*]]
+}
+
+void while_full()
+{
+// CHECK-LABEL: while_full
+    int i = 1000;
+    __attribute__((opencl_unroll_hint))
+    while(i-->0);
+// CHECK: br label %{{.*}}, !llvm.loop ![[WHILE_FULL:.*]]
+}
+
+/*** do-while loops: opencl_unroll_hint with count 8, count 1 (disable), and no count (full) ***/
+void do_count()
+{
+// CHECK-LABEL: do_count
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(8)))
+    do {} while(i--> 0);
+// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !llvm.loop ![[DO_COUNT:.*]]
+}
+
+void do_disable()
+{
+// CHECK-LABEL: do_disable
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(1)))
+    do {} while(i--> 0);
+// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !llvm.loop ![[DO_DISABLE:.*]]
+}
+
+void do_full()
+{
+// CHECK-LABEL: do_full
+    int i = 1000;
+    __attribute__((opencl_unroll_hint))
+    do {} while(i--> 0);
+// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !llvm.loop ![[DO_FULL:.*]]
+}
+
+
+// CHECK: ![[FOR_COUNT]]     =  distinct !{![[FOR_COUNT]],  ![[COUNT:.*]]}
+// CHECK: ![[COUNT]]         =  !{!"llvm.loop.unroll.count", i32 8}
+// CHECK: ![[FOR_DISABLE]]   =  distinct !{![[FOR_DISABLE]],  ![[DISABLE:.*]]}
+// CHECK: ![[DISABLE]]       =  !{!"llvm.loop.unroll.disable"}
+// CHECK: ![[FOR_FULL]]      =  distinct !{![[FOR_FULL]],  ![[FULL:.*]]}
+// CHECK: ![[FULL]]          =  !{!"llvm.loop.unroll.full"}
+// CHECK: ![[WHILE_COUNT]]   =  distinct !{![[WHILE_COUNT]],    ![[COUNT]]}
+// CHECK: ![[WHILE_DISABLE]] =  distinct !{![[WHILE_DISABLE]],  ![[DISABLE]]}
+// CHECK: ![[WHILE_FULL]]    =  distinct !{![[WHILE_FULL]],     ![[FULL]]}
+// CHECK: ![[DO_COUNT]]      =  distinct !{![[DO_COUNT]],       ![[COUNT]]}
+// CHECK: ![[DO_DISABLE]]    =  distinct !{![[DO_DISABLE]],     ![[DISABLE]]}
+// CHECK: ![[DO_FULL]]       =  distinct !{![[DO_FULL]],        ![[FULL]]}
diff --git a/test/CodeGenOpenCL/vla.cl b/test/CodeGenOpenCL/vla.cl
new file mode 100644
index 0000000..cbf9844
--- /dev/null
+++ b/test/CodeGenOpenCL/vla.cl
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -emit-llvm -triple "spir-unknown-unknown" -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+constant int sz0 = 5;
+// CHECK: @sz0 = addrspace(2) constant i32 5
+const global int sz1 = 16;
+// CHECK: @sz1 = addrspace(1) constant i32 16
+const constant int sz2 = 8;
+// CHECK: @sz2 = addrspace(2) constant i32 8
+// CHECK: @testvla.vla2 = internal addrspace(3) global [8 x i16] undef
+
+kernel void testvla()
+{
+  int vla0[sz0];
+// CHECK: %vla0 = alloca [5 x i32]
+  char vla1[sz1];
+// CHECK: %vla1 = alloca [16 x i8]
+  local short vla2[sz2];
+}
diff --git a/test/CoverageMapping/Inputs/ends_a_scope_only b/test/CoverageMapping/Inputs/ends_a_scope_only
new file mode 100644
index 0000000..5c34318
--- /dev/null
+++ b/test/CoverageMapping/Inputs/ends_a_scope_only
@@ -0,0 +1 @@
+}
diff --git a/test/CoverageMapping/Inputs/starts_a_scope_only b/test/CoverageMapping/Inputs/starts_a_scope_only
new file mode 100644
index 0000000..98232c6
--- /dev/null
+++ b/test/CoverageMapping/Inputs/starts_a_scope_only
@@ -0,0 +1 @@
+{
diff --git a/test/CoverageMapping/abspath.cpp b/test/CoverageMapping/abspath.cpp
new file mode 100644
index 0000000..667172e
--- /dev/null
+++ b/test/CoverageMapping/abspath.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -main-file-name abspath.cpp %S/Inputs/../abspath.cpp -o - | FileCheck -check-prefix=RMDOTS %s
+
+// RMDOTS: @__llvm_coverage_mapping = {{.*}}"\01
+// RMDOTS-NOT: Inputs
+// RMDOTS: "
+
+// RUN: cd %T && mkdir -p test && cd test
+// RUN: echo "void f1() {}" > f1.c
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -main-file-name abspath.cpp ../test/f1.c -o - | FileCheck -check-prefix=RELPATH %s
+
+// RELPATH: @__llvm_coverage_mapping = {{.*}}"\01
+// RELPATH: {{[/\\]}}{{.*}}{{[/\\][^/\\]*}}test{{[/\\][^/\\]*}}f1.c
+// RELPATH: "
+
+void f1() {}
diff --git a/test/CoverageMapping/block-storage-starts-region.m b/test/CoverageMapping/block-storage-starts-region.m
index 7997c8d..7e25438 100644
--- a/test/CoverageMapping/block-storage-starts-region.m
+++ b/test/CoverageMapping/block-storage-starts-region.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
 
 @interface Foo
 @end
diff --git a/test/CoverageMapping/break.c b/test/CoverageMapping/break.c
index 99439c8..ee41271 100644
--- a/test/CoverageMapping/break.c
+++ b/test/CoverageMapping/break.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %s | FileCheck %s
 
 int main() {         // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0
   int cnt = 0;       // CHECK-NEXT: File 0, [[@LINE+1]]:9 -> [[@LINE+1]]:18 = #0
diff --git a/test/CoverageMapping/builtinmacro.c b/test/CoverageMapping/builtinmacro.c
index 80b2672..63f5584 100644
--- a/test/CoverageMapping/builtinmacro.c
+++ b/test/CoverageMapping/builtinmacro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %s | FileCheck %s
 
 // Test the coverage mapping generation for built-in macroes.
 
diff --git a/test/CoverageMapping/casts.c b/test/CoverageMapping/casts.c
index 95289f6..d295f31 100644
--- a/test/CoverageMapping/casts.c
+++ b/test/CoverageMapping/casts.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name casts.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name casts.c %s | FileCheck %s
 
 int main() {                                                   // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+4]]:2 = #0
                                                                // CHECK-NEXT: File 0, [[@LINE+1]]:41 -> [[@LINE+1]]:54 = #1
diff --git a/test/CoverageMapping/classtemplate.cpp b/test/CoverageMapping/classtemplate.cpp
index 2e0b507..0ccdcb2 100644
--- a/test/CoverageMapping/classtemplate.cpp
+++ b/test/CoverageMapping/classtemplate.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %s > %tmapping
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %s > %tmapping
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-CONSTRUCTOR
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-GETTER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-SETTER
diff --git a/test/CoverageMapping/comment-in-macro.c b/test/CoverageMapping/comment-in-macro.c
index ecc883f..06e8adb 100644
--- a/test/CoverageMapping/comment-in-macro.c
+++ b/test/CoverageMapping/comment-in-macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
 
 #define x1 "" // ...
 #define x2 return 0
diff --git a/test/CoverageMapping/continue.c b/test/CoverageMapping/continue.c
index c86651e..7ea03fb 100644
--- a/test/CoverageMapping/continue.c
+++ b/test/CoverageMapping/continue.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %s | FileCheck %s
 
 int main() {                    // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+21]]:2 = #0
   int j = 0;                    // CHECK-NEXT: File 0, [[@LINE+2]]:18 -> [[@LINE+2]]:24 = (#0 + #1)
diff --git a/test/CoverageMapping/control-flow-macro.c b/test/CoverageMapping/control-flow-macro.c
index 149cb55..8508e53 100644
--- a/test/CoverageMapping/control-flow-macro.c
+++ b/test/CoverageMapping/control-flow-macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
 
 #define ifc if
 
diff --git a/test/CoverageMapping/decl.c b/test/CoverageMapping/decl.c
index 96ee303..e477028 100644
--- a/test/CoverageMapping/decl.c
+++ b/test/CoverageMapping/decl.c
@@ -1,6 +1,6 @@
 // Ensure that declarations without definitions don't have maps emitted for them
 
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s > %t
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s > %t
 // FileCheck -input-file %t %s
 // RUN: FileCheck -check-prefix BAR -input-file %t %s
 
diff --git a/test/CoverageMapping/header.cpp b/test/CoverageMapping/header.cpp
index e495d5a..5e0b311 100644
--- a/test/CoverageMapping/header.cpp
+++ b/test/CoverageMapping/header.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name header.cpp %s > %tmapping
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name header.cpp %s > %tmapping
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-FUNC
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-STATIC-FUNC
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-STATIC-FUNC2
diff --git a/test/CoverageMapping/if.c b/test/CoverageMapping/if.c
index 73b2308..69544f6 100644
--- a/test/CoverageMapping/if.c
+++ b/test/CoverageMapping/if.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name if.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name if.c %s | FileCheck %s
 
 int main() {                    // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0
   int i = 0;
diff --git a/test/CoverageMapping/implicit-def-in-macro.m b/test/CoverageMapping/implicit-def-in-macro.m
index 7e563ac..902fc8b 100644
--- a/test/CoverageMapping/implicit-def-in-macro.m
+++ b/test/CoverageMapping/implicit-def-in-macro.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
 
 @interface Foo
 @end
diff --git a/test/CoverageMapping/include-macros.c b/test/CoverageMapping/include-macros.c
index 78b2747..113721c 100644
--- a/test/CoverageMapping/include-macros.c
+++ b/test/CoverageMapping/include-macros.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name include-macros.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name include-macros.c %s | FileCheck %s
 
 #include "Inputs/macros.h"
 
diff --git a/test/CoverageMapping/includehell.cpp b/test/CoverageMapping/includehell.cpp
index 5a9ff78..9ad3683 100644
--- a/test/CoverageMapping/includehell.cpp
+++ b/test/CoverageMapping/includehell.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name includehell.cpp %s > %tmapping
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name includehell.cpp %s > %tmapping
 
 int main() {
   int x = 0;
diff --git a/test/CoverageMapping/ir.c b/test/CoverageMapping/ir.c
index a9c1439..469b299 100644
--- a/test/CoverageMapping/ir.c
+++ b/test/CoverageMapping/ir.c
@@ -1,5 +1,5 @@
 // Check the data structures emitted by coverage mapping
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name ir.c %s -o - -emit-llvm -fprofile-instr-generate -fcoverage-mapping | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name ir.c %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck %s
 
 
 void foo(void) { }
@@ -9,4 +9,4 @@
   return 0;
 }
 
-// CHECK: @__llvm_coverage_mapping = internal constant { { i32, i32, i32, i32 }, [2 x <{{.*}}>], [{{[0-9]+}} x i8] } { { i32, i32, i32, i32 } { i32 2, i32 {{[0-9]+}}, i32 {{[0-9]+}}, i32 {{[0-9]+}} }, [2 x <{{.*}}>] [<{{.*}}> <{{.*}}>, <{{.*}}> <{{.*}}>]
+// CHECK: @__llvm_coverage_mapping = internal constant { { i32, i32, i32, i32 }, [2 x <{ i64, i32, i64 }>], [{{[0-9]+}} x i8] } { { i32, i32, i32, i32 } { i32 2, i32 {{[0-9]+}}, i32 {{[0-9]+}}, i32 {{[0-9]+}} }, [2 x <{ i64, i32, i64 }>] [<{{.*}}> <{{.*}}>, <{{.*}}> <{{.*}}>]
diff --git a/test/CoverageMapping/label.cpp b/test/CoverageMapping/label.cpp
index 52618f7..1c5111a6 100644
--- a/test/CoverageMapping/label.cpp
+++ b/test/CoverageMapping/label.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp %s | FileCheck %s
 
                              // CHECK: func
 void func() {                // CHECK-NEXT: File 0, [[@LINE]]:13 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/lambda.cpp b/test/CoverageMapping/lambda.cpp
index fb018e6..4f23c15 100644
--- a/test/CoverageMapping/lambda.cpp
+++ b/test/CoverageMapping/lambda.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c++ -std=c++11 -triple %itanium_abi_triple -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s -main-file-name lambda.cpp | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s -main-file-name lambda.cpp | FileCheck %s
 
 // CHECK-LABEL: _Z3fooi:
 void foo(int i) { // CHECK: File 0, [[@LINE]]:17 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/logical.cpp b/test/CoverageMapping/logical.cpp
index ece3102..198cc60 100644
--- a/test/CoverageMapping/logical.cpp
+++ b/test/CoverageMapping/logical.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s
 
 int main() {                        // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+10]]:2 = #0
   bool bt = true;
diff --git a/test/CoverageMapping/loopmacro.c b/test/CoverageMapping/loopmacro.c
index bbd0c45..cffeca0 100644
--- a/test/CoverageMapping/loopmacro.c
+++ b/test/CoverageMapping/loopmacro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loopmacro.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loopmacro.c %s | FileCheck %s
 
 // CHECK: main
 // CHECK-NEXT: File 0, {{[0-9]+}}:12 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/loops.cpp b/test/CoverageMapping/loops.cpp
index 84a9892..cb7d777 100644
--- a/test/CoverageMapping/loops.cpp
+++ b/test/CoverageMapping/loops.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
 
                                     // CHECK: rangedFor
 void rangedFor() {                  // CHECK-NEXT: File 0, [[@LINE]]:18 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/macro-expansion.c b/test/CoverageMapping/macro-expansion.c
index e87f444..3fca975 100644
--- a/test/CoverageMapping/macro-expansion.c
+++ b/test/CoverageMapping/macro-expansion.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
 
 // CHECK: func
 // CHECK:      File 1, [[@LINE+5]]:12 -> [[@LINE+5]]:38 = #0
diff --git a/test/CoverageMapping/macro-expressions.cpp b/test/CoverageMapping/macro-expressions.cpp
index e910829..3852fc6 100644
--- a/test/CoverageMapping/macro-expressions.cpp
+++ b/test/CoverageMapping/macro-expressions.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp %s | FileCheck %s
 
 #define EXPR(x) (x)
 #define NEXPR(x) (!x)
diff --git a/test/CoverageMapping/macroception.c b/test/CoverageMapping/macroception.c
index bde38ff..7848741 100644
--- a/test/CoverageMapping/macroception.c
+++ b/test/CoverageMapping/macroception.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroception.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroception.c %s | FileCheck %s
 
 #define M2 {
 #define M1 M2
diff --git a/test/CoverageMapping/macroparams.c b/test/CoverageMapping/macroparams.c
index d2c8e55..efffc77 100644
--- a/test/CoverageMapping/macroparams.c
+++ b/test/CoverageMapping/macroparams.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams.c %s | FileCheck %s
 
 // CHECK: main
 // CHECK-NEXT: File 0, {{[0-9]+}}:12 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/macroparams2.c b/test/CoverageMapping/macroparams2.c
index fc156de..4e04581 100644
--- a/test/CoverageMapping/macroparams2.c
+++ b/test/CoverageMapping/macroparams2.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %s | FileCheck %s
 
 #define MACRO(REFS, CALLS)  (4 * (CALLS) < (REFS))
 
diff --git a/test/CoverageMapping/macros.c b/test/CoverageMapping/macros.c
index 02ecceb..f633961 100644
--- a/test/CoverageMapping/macros.c
+++ b/test/CoverageMapping/macros.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macros.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macros.c %s | FileCheck %s
 
 #define MACRO return; bar()
 #define MACRO_2 bar()
diff --git a/test/CoverageMapping/macroscopes.cpp b/test/CoverageMapping/macroscopes.cpp
index 712e209..f5fd55c 100644
--- a/test/CoverageMapping/macroscopes.cpp
+++ b/test/CoverageMapping/macroscopes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %s | FileCheck %s
 
 #define starts_a_scope for (int i = 0; i < 2; ++i) {
 
diff --git a/test/CoverageMapping/md.cpp b/test/CoverageMapping/md.cpp
index fff0df3..20c696c 100644
--- a/test/CoverageMapping/md.cpp
+++ b/test/CoverageMapping/md.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++11 %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++11 %s | FileCheck %s
 
 #define BREAK break
 
diff --git a/test/CoverageMapping/moremacros.c b/test/CoverageMapping/moremacros.c
index d4a8f87..5666227 100644
--- a/test/CoverageMapping/moremacros.c
+++ b/test/CoverageMapping/moremacros.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
 
 #define LBRAC {
 #define RBRAC }
diff --git a/test/CoverageMapping/nestedclass.cpp b/test/CoverageMapping/nestedclass.cpp
index be4e0ba..6cbddeb 100644
--- a/test/CoverageMapping/nestedclass.cpp
+++ b/test/CoverageMapping/nestedclass.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name nestedclass.cpp %s > %tmapping
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name nestedclass.cpp %s > %tmapping
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-OUTER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-INNER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-INNERMOST
diff --git a/test/CoverageMapping/objc.m b/test/CoverageMapping/objc.m
index 8456dc3..89da5da 100644
--- a/test/CoverageMapping/objc.m
+++ b/test/CoverageMapping/objc.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 %s | FileCheck %s
 
 @interface A
 - (void)bork:(int)msg;
diff --git a/test/CoverageMapping/preprocessor.c b/test/CoverageMapping/preprocessor.c
index cdd448c..bd82b39 100644
--- a/test/CoverageMapping/preprocessor.c
+++ b/test/CoverageMapping/preprocessor.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %s | FileCheck %s
 
                  // CHECK: func
 void func() {    // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+5]]:2 = #0
diff --git a/test/CoverageMapping/return.c b/test/CoverageMapping/return.c
index ab63c2c..1b190b0 100644
--- a/test/CoverageMapping/return.c
+++ b/test/CoverageMapping/return.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %s | FileCheck %s
 
                                 // CHECK: func
 void func() {                   // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+3]]:2 = #0
diff --git a/test/CoverageMapping/switch.c b/test/CoverageMapping/switch.c
index 72b0852..6aa2b31 100644
--- a/test/CoverageMapping/switch.c
+++ b/test/CoverageMapping/switch.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switch.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switch.c %s | FileCheck %s
                     // CHECK: foo
 void foo(int i) {   // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+8]]:2 = #0
   switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+5]]:4 = #1
diff --git a/test/CoverageMapping/switchmacro.c b/test/CoverageMapping/switchmacro.c
index 96b381d..f83d26f 100644
--- a/test/CoverageMapping/switchmacro.c
+++ b/test/CoverageMapping/switchmacro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %s | FileCheck %s
 
 #define FOO(x) (void)x
 
diff --git a/test/CoverageMapping/system_macro.c b/test/CoverageMapping/system_macro.c
deleted file mode 100644
index b0ce360..0000000
--- a/test/CoverageMapping/system_macro.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name system_macro.c -o - %s | FileCheck %s
-
-#ifdef IS_SYSHEADER
-
-#pragma clang system_header
-#define Func(x) if (x) {}
-#define SomeType int
-
-#else
-
-#define IS_SYSHEADER
-#include __FILE__
-
-// CHECK-LABEL: doSomething:
-void doSomething(int x) { // CHECK: File 0, [[@LINE]]:25 -> {{[0-9:]+}} = #0
-  Func(x); // CHECK: Expansion,File 0, [[@LINE]]:3 -> [[@LINE]]:7
-  return;
-  // CHECK: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:11
-  SomeType *f; // CHECK: File 0, [[@LINE]]:11 -> {{[0-9:]+}} = 0
-}
-
-int main() {}
-
-#endif
diff --git a/test/CoverageMapping/system_macro.cpp b/test/CoverageMapping/system_macro.cpp
new file mode 100644
index 0000000..ce0da17
--- /dev/null
+++ b/test/CoverageMapping/system_macro.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name system_macro.cpp -o - %s | FileCheck %s
+
+#ifdef IS_SYSHEADER
+
+#pragma clang system_header
+#define Func(x) if (x) {}
+#define SomeType int
+
+#else
+
+#define IS_SYSHEADER
+#include __FILE__
+
+// CHECK-LABEL: doSomething
+void doSomething(int x) { // CHECK: File 0, [[@LINE]]:25 -> {{[0-9:]+}} = #0
+  Func(x);
+  return;
+  SomeType *f; // CHECK: File 0, [[@LINE]]:11 -> {{[0-9:]+}} = 0
+}
+
+// CHECK-LABEL: main
+int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+2]]:2 = #0
+  Func([] { return true; }());
+}
+
+#endif
diff --git a/test/CoverageMapping/templates.cpp b/test/CoverageMapping/templates.cpp
index fcb92e1..bdba1d4 100644
--- a/test/CoverageMapping/templates.cpp
+++ b/test/CoverageMapping/templates.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
 
 template<typename T>
 void unused(T x) {
diff --git a/test/CoverageMapping/test.c b/test/CoverageMapping/test.c
index a274ce4..5affbaa 100644
--- a/test/CoverageMapping/test.c
+++ b/test/CoverageMapping/test.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %s | FileCheck %s
 
 void bar();
 static void static_func();
diff --git a/test/CoverageMapping/trycatch.cpp b/test/CoverageMapping/trycatch.cpp
index 2d0f629..01d8fb9 100644
--- a/test/CoverageMapping/trycatch.cpp
+++ b/test/CoverageMapping/trycatch.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %s | FileCheck %s
 
 class Error {
 };
@@ -23,7 +23,7 @@
                                       // CHECK-NEXT: main
 int main() {                          // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+13]]:2 = #0
   int j = 1;
-  try {
+  try {                               // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE+2]]:4 = #0
     func(j);
   } catch(const Error &e) {           // CHECK-NEXT: File 0, [[@LINE]]:27 -> [[@LINE+2]]:4 = #2
     j = 1;
diff --git a/test/CoverageMapping/trymacro.cpp b/test/CoverageMapping/trymacro.cpp
index 949186d..32f4438 100644
--- a/test/CoverageMapping/trymacro.cpp
+++ b/test/CoverageMapping/trymacro.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trymacro.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trymacro.cpp %s | FileCheck %s
 
 // CHECK: Z3fn1v:
 void fn1() try { return; } // CHECK: [[@LINE]]:12 -> [[@LINE+1]]:14 = #1
@@ -17,8 +17,27 @@
 void fn3() TRY { return; } // CHECK: [[@LINE]]:15 -> [[@LINE+1]]:14 = #1
 CATCH(...) {}              // CHECK: [[@LINE]]:12 -> [[@LINE]]:14 = #2
 
+// CHECK: Z3fn4v:
+#define TRY2 try { // CHECK-DAG: File 1, [[@LINE]]:18 -> [[@LINE]]:19 = #1
+void fn4() TRY2 // CHECK-DAG: Expansion,File 0, [[@LINE]]:12 -> [[@LINE]]:16 = #1 (Expanded file = 1)
+  for (;;)
+    return;
+}
+catch (...) {}
+
+// CHECK: Z3fn5v:
+#define TRY3 try { return; } catch (...) // CHECK-DAG: File 2, [[@LINE]]:18 -> [[@LINE]]:29 = #1
+#define TRY4 try { TRY3 { return; } } catch (...) // CHECK-DAG: Expansion,File 1, [[@LINE]]:20 -> [[@LINE]]:24 = #1 (Expanded file = 2)
+void fn5() {
+  for (;;) {
+    TRY4 { return; } // CHECK-DAG: Expansion,File 0, [[@LINE]]:5 -> [[@LINE]]:9 = #1 (Expanded file = 1)
+  }                  // CHECK-DAG: File 0, [[@LINE-1]]:10 -> [[@LINE-1]]:21 = #5
+}
+
 int main() {
   fn1();
   fn2();
   fn3();
+  fn4();
+  fn5();
 }
diff --git a/test/CoverageMapping/unreachable-macro.c b/test/CoverageMapping/unreachable-macro.c
index 4b33a23..b9d4f36 100644
--- a/test/CoverageMapping/unreachable-macro.c
+++ b/test/CoverageMapping/unreachable-macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
 
 #define WHILE while (0) {}
 
diff --git a/test/CoverageMapping/unused_function.cpp b/test/CoverageMapping/unused_function.cpp
new file mode 100644
index 0000000..6a46b1d
--- /dev/null
+++ b/test/CoverageMapping/unused_function.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+
+#define START_SCOPE {
+#define END_SCOPE }
+
+// CHECK: {{_Z2f0v|\?f0@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:20 = 0
+inline void f0() {}
+
+// CHECK: {{_Z2f1v|\?f1@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:31 = 0
+inline void f1() START_SCOPE }
+
+// CHECK: {{_Z2f2v|\?f2@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:29 = 0
+inline void f2() { END_SCOPE
+
+// CHECK: {{_Z2f3v|\?f3@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:39 = 0
+inline void f3() START_SCOPE END_SCOPE
+
+// CHECK: {{_Z2f4v|\?f4@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+2]]:10 -> [[@LINE+3]]:2 = 0
+inline void f4()
+#include "Inputs/starts_a_scope_only"
+}
+
+// CHECK: {{_Z2f5v|\?f5@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+2]]:36 = 0
+inline void f5() {
+#include "Inputs/ends_a_scope_only"
+
+// CHECK: {{_Z2f6v|\?f6@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+2]]:10 -> [[@LINE+3]]:36 = 0
+inline void f6()
+#include "Inputs/starts_a_scope_only"
+#include "Inputs/ends_a_scope_only"
diff --git a/test/CoverageMapping/unused_names.c b/test/CoverageMapping/unused_names.c
index c0c10ea..a03d18b 100644
--- a/test/CoverageMapping/unused_names.c
+++ b/test/CoverageMapping/unused_names.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -emit-llvm -main-file-name unused_names.c -o - %s > %t
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -main-file-name unused_names.c -o - %s > %t
 // RUN: FileCheck -input-file %t %s
 // RUN: FileCheck -check-prefix=SYSHEADER -input-file %t %s
 
@@ -7,6 +7,7 @@
 // CHECK-DAG: @__profn_bar = {{.*}} [3 x i8] c"bar"
 // CHECK-DAG: @__profn_baz = {{.*}} [3 x i8] c"baz"
 // CHECK-DAG: @__profn_unused_names.c_qux = {{.*}} [18 x i8] c"unused_names.c:qux"
+// CHECK-DAG: @__llvm_prf_nm = private constant {{.*}}, section "{{.*}}__llvm_prf_names"
 
 // SYSHEADER-NOT: @__profn_foo =
 
diff --git a/test/CoverageMapping/while.c b/test/CoverageMapping/while.c
index a85957f..7f09e4b 100644
--- a/test/CoverageMapping/while.c
+++ b/test/CoverageMapping/while.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
 
                                     // CHECK: main
 int main() {                        // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+8]]:2 = #0
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/version.txt b/test/Driver/Inputs/CUDA_80/usr/local/cuda/version.txt
new file mode 100644
index 0000000..ee238af
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/version.txt
@@ -0,0 +1 @@
+CUDA Version 8.0.42
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/armv7-a/thumb/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/armv7-a/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/thumb/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/aarch64-linux-android/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/aarch64-linux-android/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/thumb/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/backward/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/backward/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/i686-linux-android/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/i686-linux-android/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r2/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r2/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r6/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r6/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r1/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r1/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r1/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r1/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r2/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r2/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r2/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r2/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r6/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r6/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r6/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/32/mips-r6/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mips64el-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtbegin.o
similarity index 100%
rename from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
rename to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtend.o
similarity index 100%
rename from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
rename to test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/libr2/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/libr2/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/libr6/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mips64el-linux-android/libr6/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.bfd
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.gold
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/libr2/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/libr2/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/libr6/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/libr6/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_dynamic.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_dynamic.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_so.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_so.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_static.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_static.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_android.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_android.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_so.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_so.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtbegin_dynamic.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtbegin_dynamic.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtbegin_so.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtbegin_so.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtbegin_static.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtbegin_static.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtend_android.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtend_android.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtend_so.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr2/crtend_so.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtbegin_dynamic.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtbegin_dynamic.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtbegin_so.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtbegin_so.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtbegin_static.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtbegin_static.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtend_android.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtend_android.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtend_so.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/libr6/crtend_so.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbeginS.o b/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbeginS.o
deleted file mode 100644
index e69de29..0000000
--- a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbeginS.o
+++ /dev/null
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbeginT.o b/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbeginT.o
deleted file mode 100644
index e69de29..0000000
--- a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbeginT.o
+++ /dev/null
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtendS.o b/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtendS.o
deleted file mode 100644
index e69de29..0000000
--- a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtendS.o
+++ /dev/null
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gcc_version_parsing5/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gcc_version_parsing5/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/4.9.2/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/4.9.2/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/5/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/5/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/x86_64-pc-linux-gnu/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/x86_64-pc-linux-gnu/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/header0.h
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/header0.h
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/header1.h
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/header1.h
diff --git a/test/Driver/Inputs/header2.h b/test/Driver/Inputs/header2.h
new file mode 100644
index 0000000..243468d
--- /dev/null
+++ b/test/Driver/Inputs/header2.h
@@ -0,0 +1 @@
+#include "header1.h"
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/header3.h
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/header3.h
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/header4.h
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/header4.h
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtbegin.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtend.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/bin/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/bin/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib32/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib64/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/include/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/.keep
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crt1.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crti.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtend.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crtn.o
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/pchfile.cpp
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/pchfile.cpp
diff --git a/test/Driver/Inputs/pchfile.h b/test/Driver/Inputs/pchfile.h
new file mode 100644
index 0000000..1aafaee
--- /dev/null
+++ b/test/Driver/Inputs/pchfile.h
@@ -0,0 +1,3 @@
+#if defined(ERR_HEADER)
+#error nope1
+#endif
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Driver/Inputs/resource_dir/vtables_blacklist.txt
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Driver/Inputs/resource_dir/vtables_blacklist.txt
diff --git a/test/Driver/aarch64-cpus.c b/test/Driver/aarch64-cpus.c
index 7b0fac4..c451be9 100644
--- a/test/Driver/aarch64-cpus.c
+++ b/test/Driver/aarch64-cpus.c
@@ -74,6 +74,20 @@
 // RUN: %clang -target arm64 -mlittle-endian -mtune=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CA72 %s
 // ARM64-CA72: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "cortex-a72"
 
+// RUN: %clang -target aarch64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64 -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a73"
+
+// RUN: %clang -target arm64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// RUN: %clang -target arm64 -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// ARM64-CORTEX-A73: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "cortex-a73"
+
 // RUN: %clang -target aarch64 -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
 // RUN: %clang -target aarch64 -mlittle-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
 // RUN: %clang -target aarch64_be -mlittle-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
@@ -82,12 +96,52 @@
 // RUN: %clang -target aarch64_be -mlittle-endian -mtune=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
 // M1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "exynos-m1"
 
+// RUN: %clang -target aarch64 -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2 %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2 %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2 %s
+// RUN: %clang -target aarch64 -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2 %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2 %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2 %s
+// M2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "exynos-m2"
+
 // RUN: %clang -target arm64 -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M1 %s
 // RUN: %clang -target arm64 -mlittle-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M1 %s
 // RUN: %clang -target arm64 -mtune=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M1 %s
 // RUN: %clang -target arm64 -mlittle-endian -mtune=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M1 %s
 // ARM64-M1: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "exynos-m1"
 
+// RUN: %clang -target arm64 -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M2 %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M2 %s
+// RUN: %clang -target arm64 -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M2 %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M2 %s
+// ARM64-M2: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "exynos-m2"
+
+// RUN: %clang -target aarch64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// RUN: %clang -target aarch64 -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "kryo"
+
+// RUN: %clang -target arm64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// RUN: %clang -target arm64 -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// ARM64-KRYO: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "kryo"
+
+// RUN: %clang -target aarch64 -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64 -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// VULCAN: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "vulcan"
+
+// RUN: %clang -target arm64 -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// RUN: %clang -target arm64 -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// ARM64-VULCAN: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "vulcan"
+
 // RUN: %clang -target aarch64_be -### -c %s 2>&1 | FileCheck -check-prefix=GENERIC-BE %s
 // RUN: %clang -target aarch64 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=GENERIC-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=GENERIC-BE %s
@@ -125,6 +179,14 @@
 // RUN: %clang -target aarch64_be -mbig-endian -mtune=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CA72-BE %s
 // CA72-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "cortex-a72"
 
+// RUN: %clang -target aarch64_be -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64_be -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// CORTEX-A73-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "cortex-a73"
+
 // RUN: %clang -target aarch64_be -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
 // RUN: %clang -target aarch64 -mbig-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
@@ -133,10 +195,29 @@
 // RUN: %clang -target aarch64_be -mbig-endian -mtune=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
 // M1-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "exynos-m1"
 
+// RUN: %clang -target aarch64_be -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2-BE %s
+// RUN: %clang -target aarch64_be -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mtune=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=M2-BE %s
+// M2-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "exynos-m2"
+
+// RUN: %clang -target aarch64_be -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64_be -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// VULCAN-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "vulcan"
+
 // RUN: %clang -target aarch64 -mcpu=cortex-a57 -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=cortex-a57  -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a72 -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=cortex-a72  -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
+// RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=cortex-a73     -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
+// RUN: %clang -target aarch64 -mcpu=vulcan -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
+// RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=vulcan  -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // MCPU-MTUNE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a53"
 
 // RUN: %clang -target aarch64 -march=armv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV81A %s
diff --git a/test/Driver/aarch64-ras.c b/test/Driver/aarch64-ras.c
new file mode 100644
index 0000000..fe038ea
--- /dev/null
+++ b/test/Driver/aarch64-ras.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target aarch64-none-none-eabi -march=armv8a+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// RUN: %clang -target aarch64-none-none-eabi -mcpu=generic+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// CHECK-RAS: "-target-feature" "+ras"
+
+// RUN: %clang -target aarch64-none-none-eabi -march=armv8a+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// RUN: %clang -target aarch64-none-none-eabi -mcpu=generic+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// CHECK-NORAS: "-target-feature" "-ras"
diff --git a/test/Driver/amdgpu-features.c b/test/Driver/amdgpu-features.c
new file mode 100644
index 0000000..235b88f
--- /dev/null
+++ b/test/Driver/amdgpu-features.c
@@ -0,0 +1,7 @@
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=kaveri -mamdgpu-debugger-abi=0.0 %s -o - 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-MAMDGPU-DEBUGGER-ABI-0-0 %s
+// CHECK-MAMDGPU-DEBUGGER-ABI-0-0: the clang compiler does not support '-mamdgpu-debugger-abi=0.0'
+
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=kaveri -mamdgpu-debugger-abi=1.0 %s -o - 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-MAMDGPU-DEBUGGER-ABI-1-0 %s
+// CHECK-MAMDGPU-DEBUGGER-ABI-1-0: "-target-feature" "+amdgpu-debugger-insert-nops" "-target-feature" "+amdgpu-debugger-reserve-regs" "-target-feature" "+amdgpu-debugger-emit-prologue"
diff --git a/test/Driver/amdgpu-toolchain.c b/test/Driver/amdgpu-toolchain.c
index c84a154..52a7197 100644
--- a/test/Driver/amdgpu-toolchain.c
+++ b/test/Driver/amdgpu-toolchain.c
@@ -1,3 +1,6 @@
 // RUN: %clang -### -target amdgcn--amdhsa -x assembler -mcpu=kaveri %s 2>&1 | FileCheck -check-prefix=AS_LINK %s
 // AS_LINK: clang{{.*}} "-cc1as"
-// AS_LINK: ld.lld{{.*}}
+// AS_LINK: ld.lld{{.*}} "-shared"
+
+// RUN: %clang -### -g -target amdgcn--amdhsa -mcpu=kaveri %s 2>&1 | FileCheck -check-prefix=DWARF_VER %s
+// DWARF_VER: "-dwarf-version=2"
diff --git a/test/Driver/android-ndk-standalone.cpp b/test/Driver/android-ndk-standalone.cpp
new file mode 100644
index 0000000..7fe6d3c
--- /dev/null
+++ b/test/Driver/android-ndk-standalone.cpp
@@ -0,0 +1,315 @@
+// Test header and library paths when Clang is used with Android standalone
+// toolchain.
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  %s
+// CHECK: {{.*}}clang{{.*}}" "-cc1"
+// CHECK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/arm-linux-androideabi/lib"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target armv7a-none-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+// CHECK-ARMV7: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-ARMV7: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK-ARMV7: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-ARMV7: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-ARMV7: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-ARMV7: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7: "-L{{.*}}/sysroot/usr/lib"
+//
+// Other flags that can trigger armv7 mode.
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7 \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7a \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7-a \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+//
+// ARM thumb mode.
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -mthumb \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-THUMB %s
+// CHECK-THUMB: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-THUMB: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK-THUMB: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-THUMB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-THUMB: "-L{{.*}}/sysroot/usr/lib"
+//
+// ARM V7 thumb mode.
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7-a -mthumb \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7THUMB %s
+// CHECK-ARMV7THUMB: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-ARMV7THUMB: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK-ARMV7THUMB: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-ARMV7THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-ARMV7THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-ARMV7THUMB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7THUMB: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target armv7a-none-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -mthumb \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7THUMB %s
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target aarch64-linux-android -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-AARCH64 %s
+// CHECK-AARCH64: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-AARCH64: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-AARCH64: "-internal-isystem" "{{.*}}/include/c++/4.9/aarch64-linux-android"
+// CHECK-AARCH64: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-AARCH64: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-AARCH64: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-AARCH64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9"
+// CHECK-AARCH64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9/../../../../aarch64-linux-android/lib"
+// CHECK-AARCH64: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm64-linux-android -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-ARM64 %s
+// CHECK-ARM64: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-ARM64: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-ARM64: "-internal-isystem" "{{.*}}/include/c++/4.9/aarch64-linux-android"
+// CHECK-ARM64: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-ARM64: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-ARM64: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-ARM64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9"
+// CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9/../../../../aarch64-linux-android/lib"
+// CHECK-ARM64: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mipsel-linux-android \
+// RUN:     -mips32 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPS %s
+// CHECK-MIPS: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android"
+// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPS: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9"
+// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib"
+// CHECK-MIPS: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mipsel-linux-android \
+// RUN:     -march=mips32 -mips32r2 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPSR2 %s
+// CHECK-MIPSR2: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android/mips-r2"
+// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPSR2: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPSR2: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPSR2: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPSR2: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/mips-r2"
+// CHECK-MIPSR2: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib/../libr2"
+// CHECK-MIPSR2: "-L{{.*}}/sysroot/usr/lib/../libr2"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mipsel-linux-android \
+// RUN:     -mips32r6 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPSR6 %s
+// CHECK-MIPSR6: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android/mips-r6"
+// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPSR6: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPSR6: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPSR6: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPSR6: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/mips-r6"
+// CHECK-MIPSR6: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib/../libr6"
+// CHECK-MIPSR6: "-L{{.*}}/sysroot/usr/lib/../libr6"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mips64el-linux-android \
+// RUN:     -march=mips32 -mips32r2 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPS64-R2 %s
+// CHECK-MIPS64-R2: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPS64-R2: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-MIPS64-R2: "-internal-isystem" "{{.*}}/include/mips64el-linux-android/c++/4.9/mips-r2"
+// CHECK-MIPS64-R2: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPS64-R2: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPS64-R2: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPS64-R2: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPS64-R2: "-L{{.*}}/lib/gcc/mips64el-linux-android/4.9/32/mips-r2"
+// CHECK-MIPS64-R2: "-L{{.*}}/lib/gcc/mips64el-linux-android/4.9/../../../../mips64el-linux-android/lib/../libr2"
+// CHECK-MIPS64-R2: "-L{{.*}}/sysroot/usr/lib/../libr2"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mips64el-linux-android \
+// RUN:     -march=mips32 -mips32r6 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPS64-R6 %s
+// CHECK-MIPS64-R6: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPS64-R6: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-MIPS64-R6: "-internal-isystem" "{{.*}}/include/mips64el-linux-android/c++/4.9/mips-r6"
+// CHECK-MIPS64-R6: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPS64-R6: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPS64-R6: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPS64-R6: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPS64-R6: "-L{{.*}}/lib/gcc/mips64el-linux-android/4.9/32/mips-r6"
+// CHECK-MIPS64-R6: "-L{{.*}}/lib/gcc/mips64el-linux-android/4.9/../../../../mips64el-linux-android/lib/../libr6"
+// CHECK-MIPS64-R6: "-L{{.*}}/sysroot/usr/lib/../libr6"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target i686-linux-android \
+// RUN:     -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-I686 %s
+// CHECK-I686: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-I686: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-I686: "-internal-isystem" "{{.*}}/include/c++/4.9/i686-linux-android"
+// CHECK-I686: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-I686: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-I686: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-I686: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-I686: "-L{{.*}}/lib/gcc/i686-linux-android/4.9"
+// CHECK-I686: "-L{{.*}}/lib/gcc/i686-linux-android/4.9/../../../../i686-linux-android/lib"
+// CHECK-I686: "-L{{.*}}/sysroot/usr/lib"
diff --git a/test/Driver/android-standalone.cpp b/test/Driver/android-standalone.cpp
index d563deb..0f8cf0b 100644
--- a/test/Driver/android-standalone.cpp
+++ b/test/Driver/android-standalone.cpp
@@ -2,7 +2,7 @@
 // toolchain.
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target arm-linux-androideabi \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck  %s
@@ -17,7 +17,7 @@
 // CHECK: "-L{{.*}}/sysroot/usr/lib"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target aarch64-linux-android \
+// RUN:     -target aarch64-linux-android -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64 %s
@@ -32,7 +32,7 @@
 // CHECK-AARCH64: "-L{{.*}}/sysroot/usr/lib"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target arm64-linux-android \
+// RUN:     -target arm64-linux-android -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ARM64 %s
@@ -48,7 +48,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -mips32 \
+// RUN:     -mips32 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS %s
@@ -64,7 +64,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -march=mips32 -mips32r2 \
+// RUN:     -march=mips32 -mips32r2 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPSR2 %s
@@ -80,7 +80,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -mips32 -march=mips32r2 \
+// RUN:     -mips32 -march=mips32r2 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPSR2-A %s
@@ -93,19 +93,3 @@
 // CHECK-MIPSR2-A: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/mips-r2"
 // CHECK-MIPSR2-A: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/../../../../mipsel-linux-android/lib"
 // CHECK-MIPSR2-A: "-L{{.*}}/sysroot/usr/lib"
-//
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target mipsel-linux-android \
-// RUN:     -mips32r6 \
-// RUN:     -B%S/Inputs/basic_android_tree \
-// RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
-// RUN:   | FileCheck --check-prefix=CHECK-MIPSR6 %s
-// CHECK-MIPSR6: {{.*}}clang{{.*}}" "-cc1"
-// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3"
-// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3/mipsel-linux-android"
-// CHECK-MIPSR6: "-internal-externc-isystem" "{{.*}}/sysroot/include"
-// CHECK-MIPSR6: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
-// CHECK-MIPSR6: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-MIPSR6: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/mips-r6"
-// CHECK-MIPSR6: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/../../../../mipsel-linux-android/lib"
-// CHECK-MIPSR6: "-L{{.*}}/sysroot/usr/lib"
diff --git a/test/Driver/arc.c b/test/Driver/arc.c
index 97d00ba..0025297 100644
--- a/test/Driver/arc.c
+++ b/test/Driver/arc.c
@@ -3,7 +3,7 @@
 // RUN: not %clang -x objective-c++ -target i386-apple-darwin10 -m32 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck %s
 // RUN: not %clang -x c -target i386-apple-darwin10 -m32 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix NOTOBJC %s
 // RUN: not %clang -x c++ -target i386-apple-darwin10 -m32 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix NOTOBJC %s
-// RUN: not %clang -x objective-c -target x86_64-apple-darwin11 -mmacosx-version-min=10.5 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix UNSUPPORTED %s
+// RUN: not %clang -x objective-c -target x86_64-apple-darwin11 -mmacosx-version-min=10.5 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix NOTSUPPORTED %s
 
 // Just to test clang is working.
 # foo
@@ -14,4 +14,4 @@
 // NOTOBJC-NOT: error: -fobjc-arc is not supported on platforms using the legacy runtime
 // NOTOBJC: invalid preprocessing directive
 
-// UNSUPPORTED: error: -fobjc-arc is not supported on versions of OS X prior to 10.6
+// NOTSUPPORTED: error: -fobjc-arc is not supported on versions of OS X prior to 10.6
diff --git a/test/Driver/arm-abi.c b/test/Driver/arm-abi.c
index 812a849..897c108 100644
--- a/test/Driver/arm-abi.c
+++ b/test/Driver/arm-abi.c
@@ -28,13 +28,17 @@
 // RUN: %clang -target arm--netbsd-eabihf %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS %s
 
-// Otherwise, ABI is celected based on environment
+// Otherwise, ABI is selected based on environment
 // RUN: %clang -target arm---android %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
 // RUN: %clang -target arm---gnueabi %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
 // RUN: %clang -target arm---gnueabihf %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
+// RUN: %clang -target arm---musleabi %s -### -o %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
+// RUN: %clang -target arm---musleabihf %s -### -o %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
 // RUN: %clang -target arm---eabi %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS %s
 // RUN: %clang -target arm---eabihf %s -### -o %t.o 2>&1 \
diff --git a/test/Driver/arm-alignment.c b/test/Driver/arm-alignment.c
index 3e21652..e0b4946 100644
--- a/test/Driver/arm-alignment.c
+++ b/test/Driver/arm-alignment.c
@@ -83,11 +83,13 @@
 // CHECK-ALIGNED-ARM: "-target-feature" "+strict-align"
 // CHECK-ALIGNED-AARCH64: "-target-feature" "+strict-align"
 
-// Make sure that v6M cores always trigger the unsupported aligned accesses error
-// for all supported architecture triples.
+// Make sure that v6M cores and v8M Baseline always trigger the unsupported
+// aligned accesses error for all supported architecture triples.
 // RUN: not %clang -c -target thumbv6m-none-gnueabi -mcpu=cortex-m0 -munaligned-access %s 2>&1 | \
 // RUN:   FileCheck --check-prefix CHECK-UNALIGN-NOT-SUPPORTED %s
 // RUN: not %clang -c -target thumb-none-gnueabi -mcpu=cortex-m0 -munaligned-access %s 2>&1 | \
 // RUN:   FileCheck --check-prefix CHECK-UNALIGN-NOT-SUPPORTED %s
+// RUN: not %clang -c -target thumbv8m.base-none-gnueabi -munaligned-access %s 2>&1 | \
+// RUN:   FileCheck --check-prefix CHECK-UNALIGN-NOT-SUPPORTED %s
 
-// CHECK-UNALIGN-NOT-SUPPORTED: error: the v6m sub-architecture does not support unaligned accesses
+// CHECK-UNALIGN-NOT-SUPPORTED: error: the {{.*}} sub-architecture does not support unaligned accesses
diff --git a/test/Driver/arm-cortex-cpus.c b/test/Driver/arm-cortex-cpus.c
index 6a4d2d6..5bf8939 100644
--- a/test/Driver/arm-cortex-cpus.c
+++ b/test/Driver/arm-cortex-cpus.c
@@ -204,7 +204,7 @@
 // RUN: %clang -mcpu=generic -target armv8.1a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A %s
 // RUN: %clang -mcpu=generic -target arm -march=armv8.1a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A %s
 // RUN: %clang -mcpu=generic -target arm -mlittle-endian -march=armv8.1-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A %s
-// CHECK-V81A: "-cc1"{{.*}} "-triple" "armv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-V81A: "-cc1"{{.*}} "-triple" "armv8.1a-{{.*}}" "-target-cpu" "generic"
 
 // RUN: %clang -target armebv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
 // RUN: %clang -target armeb -march=armebv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
@@ -212,7 +212,7 @@
 // RUN: %clang -target armv8.1a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
 // RUN: %clang -target arm -march=armebv8.1a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
 // RUN: %clang -target arm -march=armebv8.1-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
-// CHECK-BE-V81A: "-cc1"{{.*}} "-triple" "armebv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-BE-V81A: "-cc1"{{.*}} "-triple" "armebv8.1a-{{.*}}" "-target-cpu" "generic"
 
 // RUN: %clang -target armv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
 // RUN: %clang -target arm -march=armv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
@@ -220,7 +220,7 @@
 // RUN: %clang -target armv8.1a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
 // RUN: %clang -target arm -march=armv8.1a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
 // RUN: %clang -target arm -march=armv8.1-a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
-// CHECK-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8.1a-{{.*}}" "-target-cpu" "generic"
 
 // RUN: %clang -target armebv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
 // RUN: %clang -target armeb -march=armebv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
@@ -228,7 +228,68 @@
 // RUN: %clang -target armv8.1a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
 // RUN: %clang -target arm -march=armebv8.1a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
 // RUN: %clang -target arm -march=armebv8.1-a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
-// CHECK-BE-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-BE-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8.1a-{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target armv8.2a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -mlittle-endian -march=armv8.2-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// CHECK-V82A: "-cc1"{{.*}} "-triple" "armv8.2{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armebv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target armv8.2a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target armeb -march=armebv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target armeb -march=armebv8.2-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target arm -march=armebv8.2a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target arm -march=armebv8.2-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// CHECK-BE-V82A: "-cc1"{{.*}} "-triple" "armebv8.2{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2-a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target armv8.2a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2-a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// CHECK-V82A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8.2a-{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armebv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target armeb -march=armebv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target armeb -march=armebv8.2-a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target armv8.2a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target arm -march=armebv8.2a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target arm -march=armebv8.2-a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// CHECK-BE-V82A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8.2a-{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armv8a -march=armv8.2-a+fp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-V82A-FP16 %s
+// CHECK-V82A-FP16: "-cc1"{{.*}} "-triple" "armv8.2{{.*}}" "-target-cpu" "generic" {{.*}}"-target-feature" "+fullfp16"
+
+// Once we have CPUs with optional v8.2-A FP16, we will need a way to turn it
+// on and off. Cortex-A53 is a placeholder for now.
+// RUN: %clang -target armv8a -mcpu=cortex-a53+fp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-CORTEX-A53-FP16 %s
+// RUN: %clang -target armv8a -mcpu=cortex-a53+nofp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-CORTEX-A53-NOFP16 %s
+// CHECK-CORTEX-A53-FP16: "-cc1" {{.*}}"-target-cpu" "cortex-a53" {{.*}}"-target-feature" "+fullfp16"
+// CHECK-CORTEX-A53-NOFP16: "-cc1" {{.*}}"-target-cpu" "cortex-a53" {{.*}}"-target-feature" "-fullfp16"
+
+// RUN: %clang -target armv8m.base %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_BASELINE
+// RUN: %clang -target arm -march=armv8-m.base %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_BASELINE
+// RUN: %clang -target arm -march=armv8m.base %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_BASELINE
+// RUN: %clang -target armv8m.base -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_BASELINE
+// RUN: %clang -target arm -march=armv8-m.base -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_BASELINE
+// RUN: %clang -target arm -march=armv8m.base -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_BASELINE
+// V8M_BASELINE: "-cc1"{{.*}} "-triple" "thumbv8m.base-{{.*}} "-target-cpu" "generic"
+// EBV8M_BASELINE: "-cc1"{{.*}} "-triple" "thumbebv8m.base-{{.*}} "-target-cpu" "generic"
+
+// RUN: %clang -target armv8m.main %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_MAINLINE
+// RUN: %clang -target arm -march=armv8-m.main %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_MAINLINE
+// RUN: %clang -target arm -march=armv8m.main %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_MAINLINE
+// RUN: %clang -target armv8m.main -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_MAINLINE
+// RUN: %clang -target arm -march=armv8-m.main -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_MAINLINE
+// RUN: %clang -target arm -march=armv8m.main -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_MAINLINE
+// V8M_MAINLINE: "-cc1"{{.*}} "-triple" "thumbv8m.main-{{.*}} "-target-cpu" "generic"
+// EBV8M_MAINLINE: "-cc1"{{.*}} "-triple" "thumbebv8m.main-{{.*}} "-target-cpu" "generic"
 
 // ================== Check that a bogus architecture gives an error
 // RUN: %clang -target arm -march=armbogusv6 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BOGUS %s
@@ -358,90 +419,131 @@
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // CHECK-CPUV7R: "-cc1"{{.*}} "-triple" "armv7r-{{.*}}
 
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4f -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
+// RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // CHECK-BE-CPUV7R: "-cc1"{{.*}} "-triple" "armebv7r-{{.*}}
 
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // CHECK-CPUV7R-THUMB: "-cc1"{{.*}} "-triple" "thumbv7r-{{.*}}
 
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4f -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r5 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r7 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
+// RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r8 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // CHECK-BE-CPUV7R-THUMB: "-cc1"{{.*}} "-triple" "thumbebv7r-{{.*}}
 
+// RUN: %clang -target arm -mcpu=cortex-a32 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=exynos-m2 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // CHECK-CPUV8A: "-cc1"{{.*}} "-triple" "armv8-{{.*}}
 
+// RUN: %clang -target armeb -mcpu=cortex-a32 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a35 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target armeb -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target armeb -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target arm -mcpu=exynos-m2 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // CHECK-BE-CPUV8A: "-cc1"{{.*}} "-triple" "armebv8-{{.*}}
 
+// RUN: %clang -target arm -mcpu=cortex-a32 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=exynos-m2 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=exynos-m2 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // CHECK-CPUV8A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8-{{.*}}
 
+// RUN: %clang -target armeb -mcpu=cortex-a32 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a35 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a53 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a57 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a72 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target armeb -mcpu=cortex-a73 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=exynos-m1 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target armeb -mcpu=exynos-m2 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=exynos-m2 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // CHECK-BE-CPUV8A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8-{{.*}}
 
+// RUN: %clang -target armv8a-arm-none-eabi -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-A73 %s
+// RUN: %clang -target armv8a-arm-none-eabi -mcpu=cortex-a73 -mfpu=crypto-neon-fp-armv8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-A73-MFPU %s
+// RUN: %clang -target armv8a-arm-none-eabi -mcpu=cortex-a73 -mfloat-abi=soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-A73-SOFT %s
+// CHECK-CORTEX-A73: "-cc1"{{.*}} "-triple" "armv8-{{.*}} "-target-cpu" "cortex-a73"
+// CHECK-CORTEX-A73-MFPU: "-cc1"{{.*}} "-target-feature" "+fp-armv8"
+// CHECK-CORTEX-A73-MFPU: "-target-feature" "+crypto"
+// CHECK-CORTEX-A73-SOFT: "-target-feature" "+soft-float"
+// CHECK-CORTEX-A73-SOFT: "-target-feature" "+soft-float-abi"
+
 // ================== Check whether -mcpu accepts mixed-case values.
 // RUN: %clang -target arm-linux-gnueabi -mcpu=Cortex-a5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CASE-INSENSITIVE-CPUV7A %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-A7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CASE-INSENSITIVE-CPUV7A %s
diff --git a/test/Driver/arm-features.c b/test/Driver/arm-features.c
index eb197da..74cedf3 100644
--- a/test/Driver/arm-features.c
+++ b/test/Driver/arm-features.c
@@ -4,6 +4,9 @@
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic+crypto -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO %s
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8a+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO %s
 // CHECK-CRYPTO: "-cc1"{{.*}} "-triple" "armv8-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+crypto"
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+dsp -march=armv8m.main -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-DSP %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8m.main+dsp -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-DSP %s
+// CHECK-DSP: "-cc1"{{.*}} "-triple" "thumbv8m.main-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+dsp"
 
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic+nocrc -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRC %s
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8a+nocrc -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRC %s
@@ -11,3 +14,6 @@
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic+nocrypto -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO %s
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8a+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO %s
 // CHECK-NOCRYPTO: "-cc1"{{.*}} "-triple" "armv8-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "-crypto"
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+nodsp -march=armv8m.main -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NODSP %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8m.main+nodsp -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NODSP %s
+// CHECK-NODSP: "-cc1"{{.*}} "-triple" "thumbv8m.main-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "-dsp"
diff --git a/test/Driver/arm-implicit-it.s b/test/Driver/arm-implicit-it.s
new file mode 100644
index 0000000..48e4bdb
--- /dev/null
+++ b/test/Driver/arm-implicit-it.s
@@ -0,0 +1,24 @@
+// RUN: %clang -target armv7--none-eabi -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-DEFAULT
+
+// RUN: %clang -target armv7--none-eabi -mimplicit-it=arm -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-ARM
+
+// RUN: %clang -target armv7--none-eabi -mimplicit-it=thumb -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-THUMB
+
+// RUN: %clang -target armv7--none-eabi -mimplicit-it=never -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-NEVER
+
+// RUN: %clang -target armv7--none-eabi -mimplicit-it=always -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-ALWAYS
+
+// RUN: %clang -target armv7--none-eabi -mimplicit-it=thisisnotavalidoption -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-INVALID
+
+// CHECK-DEFAULT-NOT: "-arm-implicit-it
+// CHECK-ARM: "-arm-implicit-it=arm"
+// CHECK-THUMB: "-arm-implicit-it=thumb"
+// CHECK-NEVER: "-arm-implicit-it=never"
+// CHECK-ALWAYS: "-arm-implicit-it=always"
+// CHECK-INVALID: error: unsupported argument 'thisisnotavalidoption' to option 'mimplicit-it='
diff --git a/test/Driver/arm-mfpu.c b/test/Driver/arm-mfpu.c
index 93fb0a8..2e1c00d 100644
--- a/test/Driver/arm-mfpu.c
+++ b/test/Driver/arm-mfpu.c
@@ -207,6 +207,8 @@
 
 // RUN: %clang -target arm-linux-gnueabihf %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-HF %s
+// RUN: %clang -target arm-linux-musleabihf %s -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-HF %s
 // CHECK-HF: "-target-cpu" "arm1176jzf-s"
 
 // RUN: %clang -target armv7-apple-darwin -x assembler %s -### -c 2>&1 \
diff --git a/test/Driver/arm-ras.c b/test/Driver/arm-ras.c
new file mode 100644
index 0000000..6d2168c
--- /dev/null
+++ b/test/Driver/arm-ras.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target arm-none-none-eabi -march=armv8a+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// CHECK-RAS: "-target-feature" "+ras"
+
+// RUN: %clang -target arm-none-none-eabi -march=armv8a+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// CHECK-NORAS: "-target-feature" "-ras"
diff --git a/test/Driver/at_file.c b/test/Driver/at_file.c
index 0541ece..56cc5c6 100644
--- a/test/Driver/at_file.c
+++ b/test/Driver/at_file.c
@@ -14,9 +14,9 @@
 // CHECK-NEXT: foo9'bar9'zed9
 // CHECK-NEXT: foo10"bar10"zed10
 // CHECK: bar
-// CHECK: zed12
+// CHECK: zed1
 // CHECK: one\two
-// CHECK: c:\foo\bar.c
+// CHECK: c:foobar.c
 
 foo1
 foo2
diff --git a/test/Driver/at_file.c.args b/test/Driver/at_file.c.args
index 8739000..ccedd82 100644
--- a/test/Driver/at_file.c.args
+++ b/test/Driver/at_file.c.args
@@ -8,6 +8,7 @@
 -Dfoo9=foo9\'bar9\'zed9
 -Dfoo10=foo10\"bar10\"zed10
 -D foo11
--Dfoo12=zed12\
+-Dfoo12=zed1\
+2
 -Dfoo13='one\\two'
 -Dfoo14='c:\foo\bar.c'
diff --git a/test/Driver/at_file_missing.c b/test/Driver/at_file_missing.c
index 0189a8b..23645a5 100644
--- a/test/Driver/at_file_missing.c
+++ b/test/Driver/at_file_missing.c
@@ -1,7 +1,7 @@
 // Make sure that arguments that begin with @ are left as is in the argument
 // stream, and also that @file arguments continue to be processed.
 
-// RUN: echo "%s -D FOO" > %t.args
-// RUN: %clang -rpath @executable_path/../lib @%t.args -### 2>&1 | FileCheck %s
+// RUN: echo "-D FOO" > %t.args
+// RUN: %clang -rpath @executable_path/../lib @%t.args %s -### 2>&1 | FileCheck %s
 // CHECK: "-D" "FOO"
 // CHECK: "-rpath" "@executable_path/../lib"
diff --git a/test/Driver/at_file_win.c b/test/Driver/at_file_win.c
new file mode 100644
index 0000000..9a8ede5
--- /dev/null
+++ b/test/Driver/at_file_win.c
@@ -0,0 +1,34 @@
+// RUN: %clang --rsp-quoting=windows -E %s @%s.args -o %t.log
+// RUN: FileCheck --input-file=%t.log %s
+
+// CHECK: bar1
+// CHECK-NEXT: bar2 zed2
+// CHECK-NEXT: bar3 zed3
+// CHECK-NEXT: bar4 zed4
+// CHECK-NEXT: bar5 zed5
+// CHECK-NEXT: 'bar6 zed6'
+// CHECK-NEXT: 'bar7 zed7'
+// CHECK-NEXT: foo8bar8zed8
+// CHECK-NEXT: foo9\'bar9\'zed9
+// CHECK-NEXT: foo10"bar10"zed10
+// CHECK: bar
+// CHECK: zed12
+// CHECK: one\two
+// CHECK: c:\foo\bar.c
+
+foo1
+foo2
+foo3
+foo4
+foo5
+foo6
+foo7
+foo8
+foo9
+foo10
+#ifdef foo11
+bar
+#endif
+foo12
+foo13
+foo14
diff --git a/test/Driver/at_file_win.c.args b/test/Driver/at_file_win.c.args
new file mode 100644
index 0000000..df109e4
--- /dev/null
+++ b/test/Driver/at_file_win.c.args
@@ -0,0 +1,13 @@
+-Dfoo1=bar1 -Dfoo2="bar2 zed2"
+-Dfoo3="bar3 zed3"
+"-Dfoo4=bar4 zed4"
+"-Dfoo5=bar5 zed5"
+-Dfoo6="'bar6 zed6'"
+-Dfoo7='"bar7 zed7"'
+-Dfoo8=foo8"bar8"zed8
+-Dfoo9=foo9\'bar9\'zed9
+-Dfoo10=foo10\"bar10\"zed10
+-D foo11
+-Dfoo12=zed12
+-Dfoo13=one\two
+-Dfoo14=c:\foo\bar.c
diff --git a/test/Driver/bitrig.c b/test/Driver/bitrig.c
index 934cb02..a20a95a 100644
--- a/test/Driver/bitrig.c
+++ b/test/Driver/bitrig.c
@@ -3,7 +3,7 @@
 // CHECK-LD-C: clang{{.*}}" "-cc1" "-triple" "amd64-pc-bitrig"
 // CHECK-LD-C: ld{{.*}}" {{.*}} "-lc" "-lclang_rt.amd64"
 
-// RUN: %clangxx -no-canonical-prefixes -target amd64-pc-bitrig %s -### 2>&1 \
+// RUN: %clangxx -stdlib=platform -no-canonical-prefixes -target amd64-pc-bitrig %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-CXX-STDLIB %s
 // CHECK-LD-CXX-STDLIB: clang{{.*}}" "-cc1" "-triple" "amd64-pc-bitrig"
 // CHECK-LD-CXX-STDLIB: ld{{.*}}" {{.*}} "-lc++" "-lc++abi" "-lpthread" "-lm" "-lc" "-lclang_rt.amd64"
diff --git a/test/Driver/cl-eh.cpp b/test/Driver/cl-eh.cpp
index 1745616..c54544b 100644
--- a/test/Driver/cl-eh.cpp
+++ b/test/Driver/cl-eh.cpp
@@ -21,6 +21,11 @@
 // EHs_EHa: "-fcxx-exceptions"
 // EHs_EHa: "-fexceptions"
 
+// RUN: %clang_cl /c /EHa /EHc -### -- %s 2>&1 | FileCheck -check-prefix=EHa_EHc %s
+// EHa_EHc: "-fcxx-exceptions"
+// EHa_EHc: "-fexceptions"
+// EHa_EHc-NOT: "-fexternc-nounwind"
+
 // RUN: %clang_cl /c /EHinvalid -### -- %s 2>&1 | FileCheck -check-prefix=EHinvalid %s
 // EHinvalid: error: invalid value 'invalid' in '/EH'
 // EHinvalid-NOT: error:
diff --git a/test/Driver/cl-fallback.c b/test/Driver/cl-fallback.c
index e5ebde5..e73f7c0 100644
--- a/test/Driver/cl-fallback.c
+++ b/test/Driver/cl-fallback.c
@@ -1,8 +1,9 @@
 // Note: %s must be preceded by --, otherwise it may be interpreted as a
 // command-line option, e.g. on Mac where %s is commonly under /Users.
 
-// RUN: %clang_cl --target=i686-pc-win32 /fallback /Dfoo=bar /Ubaz /Ifoo /O0 /Ox /GR /GR- /Gy /Gy- \
+// RUN: %clang_cl --target=i686-pc-win32 /fallback /Dfoo=bar /Ubaz /Ifoo /O0 /Ox /GR /GR- /GS /GS- /Gy /Gy- \
 // RUN:   /Gw /Gw- /LD /LDd /EHs /EHs- /Zl /MD /MDd /MTd /MT /FImyheader.h /Zi \
+// RUN:   -garbage -moregarbage \
 // RUN:   -### -- %s 2>&1 \
 // RUN:   | FileCheck %s
 // CHECK: "-fdiagnostics-format" "msvc-fallback"
@@ -21,6 +22,7 @@
 // CHECK: "/Oy"
 // CHECK: "/GF"
 // CHECK: "/GR-"
+// CHECK: "/GS-"
 // CHECK: "/Gy-"
 // CHECK: "/Gw-"
 // CHECK: "/Z7"
@@ -31,6 +33,8 @@
 // CHECK: "/EHs-"
 // CHECK: "/Zl"
 // CHECK: "/MT"
+// CHECK: "-garbage"
+// CHECK: "-moregarbage"
 // CHECK: "/Tc" "{{.*cl-fallback.c}}"
 // CHECK: "/Fo{{.*cl-fallback.*.obj}}"
 
@@ -38,6 +42,10 @@
 // GR: cl.exe
 // GR: "/GR-"
 
+// RUN: %clang_cl /fallback /GS- -### -- %s 2>&1 | FileCheck -check-prefix=GS %s
+// GS: cl.exe
+// GS: "/GS-"
+
 // RUN: %clang_cl /fallback /Od -### -- %s 2>&1 | FileCheck -check-prefix=O0 %s
 // O0: cl.exe
 // O0: "/Od"
diff --git a/test/Driver/cl-link.c b/test/Driver/cl-link.c
index 9813c51..026c433 100644
--- a/test/Driver/cl-link.c
+++ b/test/Driver/cl-link.c
@@ -3,6 +3,7 @@
 // under /Users.
 
 // RUN: %clang_cl /Tc%s -### /link foo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
+// RUN: %clang_cl /Tc%s -### /linkfoo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
 // LINK: link.exe
 // LINK: "foo"
 // LINK: "bar"
@@ -42,3 +43,11 @@
 // RUN: %clang_cl /Zi /Tc%s -### 2>&1 | FileCheck --check-prefix=DEBUG %s
 // DEBUG: link.exe
 // DEBUG: "-debug"
+
+// PR27234
+// RUN: %clang_cl /Tc%s nonexistent.obj -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
+// RUN: %clang_cl /Tc%s nonexistent.lib -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
+// NONEXISTENT-NOT: no such file
+// NONEXISTENT: link.exe
+// NONEXISTENT: "/libpath:somepath"
+// NONEXISTENT: nonexistent
diff --git a/test/Driver/cl-options.c b/test/Driver/cl-options.c
index c5985a9..4d9416b 100644
--- a/test/Driver/cl-options.c
+++ b/test/Driver/cl-options.c
@@ -59,6 +59,16 @@
 // RUN: %clang_cl /GR- -### -- %s 2>&1 | FileCheck -check-prefix=GR_ %s
 // GR_: -fno-rtti
 
+// Security Buffer Check is on by default.
+// RUN: %clang_cl -### -- %s 2>&1 | FileCheck -check-prefix=GS-default %s
+// GS-default: "-stack-protector" "2"
+
+// RUN: %clang_cl /GS -### -- %s 2>&1 | FileCheck -check-prefix=GS %s
+// GS: "-stack-protector" "2"
+
+// RUN: %clang_cl /GS- -### -- %s 2>&1 | FileCheck -check-prefix=GS_ %s
+// GS_-NOT: -stack-protector
+
 // RUN: %clang_cl /Gy -### -- %s 2>&1 | FileCheck -check-prefix=Gy %s
 // Gy: -ffunction-sections
 
@@ -82,6 +92,12 @@
 // RUN: %clang_cl /I myincludedir -### -- %s 2>&1 | FileCheck -check-prefix=SLASH_I %s
 // SLASH_I: "-I" "myincludedir"
 
+// RUN: %clang_cl /imsvcmyincludedir -### -- %s 2>&1 | FileCheck -check-prefix=SLASH_imsvc %s
+// RUN: %clang_cl /imsvc myincludedir -### -- %s 2>&1 | FileCheck -check-prefix=SLASH_imsvc %s
+// Clang's resource header directory should be first:
+// SLASH_imsvc: "-internal-isystem" "{{[^"]*}}lib{{(64)?/|\\\\}}clang{{[^"]*}}include"
+// SLASH_imsvc: "-internal-isystem" "myincludedir"
+
 // RUN: %clang_cl /J -### -- %s 2>&1 | FileCheck -check-prefix=J %s
 // J: -fno-signed-char
 
@@ -91,6 +107,16 @@
 // RUN: %clang_cl /Ob0 -### -- %s 2>&1 | FileCheck -check-prefix=Ob0 %s
 // Ob0: -fno-inline
 
+// RUN: %clang_cl /Ob2 -### -- %s 2>&1 | FileCheck -check-prefix=Ob2 %s
+// RUN: %clang_cl /Odb2 -### -- %s 2>&1 | FileCheck -check-prefix=Ob2 %s
+// RUN: %clang_cl /O2 /Ob2 -### -- %s 2>&1 | FileCheck -check-prefix=Ob2 %s
+// Ob2-NOT: warning: argument unused during compilation: '/O2'
+// Ob2: -finline-functions
+
+// RUN: %clang_cl /Ob1 -### -- %s 2>&1 | FileCheck -check-prefix=Ob1 %s
+// RUN: %clang_cl /Odb1 -### -- %s 2>&1 | FileCheck -check-prefix=Ob1 %s
+// Ob1: -finline-hint-functions
+
 // RUN: %clang_cl /Od -### -- %s 2>&1 | FileCheck -check-prefix=Od %s
 // Od: -O0
 
@@ -123,9 +149,13 @@
 // PR24003: -momit-leaf-frame-pointer
 // PR24003: -Os
 
-// RUN: %clang_cl /Zs /Oy -- %s 2>&1
+// RUN: %clang_cl --target=i686-pc-win32 -Werror /Oy- /O2 -### -- %s 2>&1 | FileCheck -check-prefix=Oy_2 %s
+// Oy_2: -momit-leaf-frame-pointer
+// Oy_2: -O2
 
-// RUN: %clang_cl --target=i686-pc-win32 /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s
+// RUN: %clang_cl /Zs -Werror /Oy -- %s 2>&1
+
+// RUN: %clang_cl --target=i686-pc-win32 -Werror /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s
 // Oy_: -mdisable-fp-elim
 
 // RUN: %clang_cl /Qvec -### -- %s 2>&1 | FileCheck -check-prefix=Qvec %s
@@ -207,6 +237,15 @@
 // RUN: %clang_cl /FI asdf.h -### -- %s 2>&1 | FileCheck -check-prefix=FI_ %s
 // FI_: "-include" "asdf.h"
 
+// RUN: %clang_cl /TP /c -### -- %s 2>&1 | FileCheck -check-prefix=NO-GX %s
+// NO-GX-NOT: "-fcxx-exceptions" "-fexceptions"
+
+// RUN: %clang_cl /TP /c /GX -### -- %s 2>&1 | FileCheck -check-prefix=GX %s
+// GX: "-fcxx-exceptions" "-fexceptions"
+
+// RUN: %clang_cl /TP /c /GX /GX- -### -- %s 2>&1 | FileCheck -check-prefix=GX_ %s
+// GX_-NOT: "-fcxx-exceptions" "-fexceptions"
+
 // We forward any unrecognized -W diagnostic options to cc1.
 // RUN: %clang_cl -Wunused-pragmas -### -- %s 2>&1 | FileCheck -check-prefix=WJoined %s
 // WJoined: "-cc1"
@@ -234,8 +273,10 @@
 // RUN:    /bigobj \
 // RUN:    /cgthreads4 \
 // RUN:    /cgthreads8 \
+// RUN:    /d2FastFail \
 // RUN:    /d2Zi+ \
 // RUN:    /errorReport:foo \
+// RUN:    /FC \
 // RUN:    /Fdfoo \
 // RUN:    /FS \
 // RUN:    /Gd \
@@ -243,8 +284,6 @@
 // RUN:    /GS- \
 // RUN:    /kernel- \
 // RUN:    /nologo \
-// RUN:    /Ob1 \
-// RUN:    /Ob2 \
 // RUN:    /openmp- \
 // RUN:    /RTC1 \
 // RUN:    /sdl \
@@ -366,7 +405,7 @@
 // RTTI-NOT: "-fno-rtti"
 
 // thread safe statics are off for versions < 19.
-// RUN: %clang_cl /c -### -- %s 2>&1 | FileCheck -check-prefix=NoThreadSafeStatics %s
+// RUN: %clang_cl /c -### -fms-compatibility-version=18 -- %s 2>&1 | FileCheck -check-prefix=NoThreadSafeStatics %s
 // RUN: %clang_cl /Zc:threadSafeInit /Zc:threadSafeInit- /c -### -- %s 2>&1 | FileCheck -check-prefix=NoThreadSafeStatics %s
 // NoThreadSafeStatics: "-fno-threadsafe-statics"
 
@@ -375,11 +414,19 @@
 
 // RUN: %clang_cl /Zi /c -### -- %s 2>&1 | FileCheck -check-prefix=Zi %s
 // Zi: "-gcodeview"
-// Zi: "-debug-info-kind=line-tables-only"
+// Zi: "-debug-info-kind=limited"
 
 // RUN: %clang_cl /Z7 /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7 %s
 // Z7: "-gcodeview"
-// Z7: "-debug-info-kind=line-tables-only"
+// Z7: "-debug-info-kind=limited"
+
+// RUN: %clang_cl /Zd /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7GMLT %s
+// Z7GMLT: "-gcodeview"
+// Z7GMLT: "-debug-info-kind=line-tables-only"
+
+// RUN: %clang_cl -gline-tables-only /c -### -- %s 2>&1 | FileCheck -check-prefix=ZGMLT %s
+// ZGMLT: "-gcodeview"
+// ZGMLT: "-debug-info-kind=line-tables-only"
 
 // RUN: %clang_cl /c -### -- %s 2>&1 | FileCheck -check-prefix=BreproDefault %s
 // BreproDefault: "-mincremental-linker-compatible"
@@ -409,6 +456,12 @@
 // RUN: %clang_cl -fmsc-version=1900 -TP -### -- %s 2>&1 | FileCheck -check-prefix=CXX14 %s
 // CXX14: -std=c++14
 
+// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++14 -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX14 %s
+// STDCXX14: -std=c++14
+
+// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s
+// STDCXXLATEST: -std=c++1z
+
 // RUN: env CL="/Gy" %clang_cl -### -- %s 2>&1 | FileCheck -check-prefix=ENV-CL %s
 // ENV-CL: "-ffunction-sections"
 
@@ -434,6 +487,7 @@
 // RUN:     -fms-extensions \
 // RUN:     -fno-ms-extensions \
 // RUN:     -mllvm -disable-llvm-optzns \
+// RUN:     -resource-dir \
 // RUN:     -Wunused-variable \
 // RUN:     -fmacro-backtrace-limit=0 \
 // RUN:     -Werror /Zs -- %s 2>&1
diff --git a/test/Driver/cl-pch-errorhandling.cpp b/test/Driver/cl-pch-errorhandling.cpp
new file mode 100644
index 0000000..33071be
--- /dev/null
+++ b/test/Driver/cl-pch-errorhandling.cpp
@@ -0,0 +1,15 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// /Yc but pch generation fails => main file not compiled
+// This is a separate file since executing this failure path requires
+// code generation, which makes this test require an x86 backend.
+// REQUIRES: x86-registered-target
+
+// RUN: not %clang_cl -Werror --target=x86_64 /Yc%S/Inputs/pchfile.h /FI%S/Inputs/pchfile.h /Fp%t.pch /c -DERR_HEADER -- %s 2>&1 \
+// RUN:   | FileCheck %s
+
+// CHECK: nope1
+// CHECK-NOT: nope2
+
+#error nope2
diff --git a/test/Driver/cl-pch-search.cpp b/test/Driver/cl-pch-search.cpp
new file mode 100644
index 0000000..ca62668
--- /dev/null
+++ b/test/Driver/cl-pch-search.cpp
@@ -0,0 +1,6 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// REQUIRES: x86-registered-target
+// Check that pchfile.h next to pchfile.cpp is found correctly.
+// RUN: %clang_cl -Werror --target=x86_64 /Ycpchfile.h /FIpchfile.h /c /Fo%t.obj /Fp%t.pch -- %S/Inputs/pchfile.cpp
diff --git a/test/Driver/cl-pch-showincludes.cpp b/test/Driver/cl-pch-showincludes.cpp
new file mode 100644
index 0000000..7e0e109
--- /dev/null
+++ b/test/Driver/cl-pch-showincludes.cpp
@@ -0,0 +1,50 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// Tests interaction of /Yc / /Yu with /showIncludes
+// REQUIRES: x86-registered-target
+
+#include "header3.h"
+
+// When building the pch, header1.h (included by header2.h), header2.h (the pch
+// input itself) and header3.h (included directly, above) should be printed.
+// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Ycheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YC %s
+// CHECK-YC: Note: including file: {{[^ ]*header2.h}}
+// CHECK-YC: Note: including file:  {{[^ ]*header1.h}}
+// CHECK-YC: Note: including file: {{[^ ]*header3.h}}
+
+// When using the pch, only the direct include is printed.
+// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Yuheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YU %s
+// CHECK-YU-NOT: Note: including file: {{.*pch}}
+// CHECK-YU-NOT: Note: including file: {{.*header1.h}}
+// CHECK-YU-NOT: Note: including file: {{.*header2.h}}
+// CHECK-YU: Note: including file: {{[^ ]*header3.h}}
+
+// When not using pch at all, all the /FI files are printed.
+// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /FIheader2.h /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-FI %s
+// CHECK-FI: Note: including file: {{[^ ]*header2.h}}
+// CHECK-FI: Note: including file:  {{[^ ]*header1.h}}
+// CHECK-FI: Note: including file: {{[^ ]*header3.h}}
+
+// Also check that /FI arguments before the /Yc / /Yu flags are printed right.
+
+// /FI flags before the /Yc arg should be printed, /FI flags after it shouldn't.
+// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Ycheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YCFI %s
+// CHECK-YCFI: Note: including file: {{[^ ]*header0.h}}
+// CHECK-YCFI: Note: including file: {{[^ ]*header2.h}}
+// CHECK-YCFI: Note: including file:  {{[^ ]*header1.h}}
+// CHECK-YCFI: Note: including file: {{[^ ]*header4.h}}
+// CHECK-YCFI: Note: including file: {{[^ ]*header3.h}}
+
+// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Yuheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YUFI %s
+// CHECK-YUFI-NOT: Note: including file: {{.*pch}}
+// CHECK-YUFI-NOT: Note: including file: {{.*header0.h}}
+// CHECK-YUFI-NOT: Note: including file: {{.*header2.h}}
+// CHECK-YUFI-NOT: Note: including file: {{.*header1.h}}
+// CHECK-YUFI: Note: including file: {{[^ ]*header4.h}}
+// CHECK-YUFI: Note: including file: {{[^ ]*header3.h}}
diff --git a/test/Driver/cl-pch.c b/test/Driver/cl-pch.c
new file mode 100644
index 0000000..3372c18
--- /dev/null
+++ b/test/Driver/cl-pch.c
@@ -0,0 +1,45 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// The main test for clang-cl pch handling is cl-pch.cpp.  This file only checks
+// a few things for .c inputs.
+
+// /Yc with a .c file should build a c pch file.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC %s
+// CHECK-YC: cc1
+// CHECK-YC: -emit-pch
+// CHECK-YC: -o
+// CHECK-YC: pchfile.pch
+// CHECK-YC: -x
+// CHECK-YC: "c"
+
+// But not if /TP changes the input language to C++.
+// RUN: %clang_cl /TP -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTP %s
+// CHECK-YCTP: cc1
+// CHECK-YCTP: -emit-pch
+// CHECK-YCTP: -o
+// CHECK-YCTP: pchfile.pch
+// CHECK-YCTP: -x
+// CHECK-YCTP: "c++"
+
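+// Except if a later /TC changes it back. NOTE(review): the RUN line below passes neither /TP nor /TC, so this case is not actually exercised.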
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTPTC %s
+// CHECK-YCTPTC: cc1
+// CHECK-YCTPTC: -emit-pch
+// CHECK-YCTPTC: -o
+// CHECK-YCTPTC: pchfile.pch
+// CHECK-YCTPTC: -x
+// CHECK-YCTPTC: "c"
+
+// Also check lower-case /Tp flag.
+// RUN: %clang_cl -Werror /Tp%s /Ycpchfile.h /FIpchfile.h /c -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTp %s
+// CHECK-YCTp: cc1
+// CHECK-YCTp: -emit-pch
+// CHECK-YCTp: -o
+// CHECK-YCTp: pchfile.pch
+// CHECK-YCTp: -x
+// CHECK-YCTp: "c++"
diff --git a/test/Driver/cl-pch.cpp b/test/Driver/cl-pch.cpp
new file mode 100644
index 0000000..8d701da
--- /dev/null
+++ b/test/Driver/cl-pch.cpp
@@ -0,0 +1,324 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// /Yc
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC %s
+// 1. Build .pch file.
+// CHECK-YC: cc1
+// CHECK-YC: -emit-pch
+// CHECK-YC: -o
+// CHECK-YC: pchfile.pch
+// CHECK-YC: -x
+// CHECK-YC: "c++"
+// 2. Use .pch file.
+// CHECK-YC: cc1
+// CHECK-YC: -emit-obj
+// CHECK-YC: -include-pch
+// CHECK-YC: pchfile.pch
+
+// /Yc /Fo
+// /Fo overrides the .obj output filename, but not the .pch filename
+// RUN: %clang_cl -Werror /Fomyobj.obj /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCO %s
+// 1. Build .pch file.
+// CHECK-YCO: cc1
+// CHECK-YCO: -emit-pch
+// CHECK-YCO: -o
+// CHECK-YCO: pchfile.pch
+// 2. Use .pch file.
+// CHECK-YCO: cc1
+// CHECK-YCO: -emit-obj
+// CHECK-YCO: -include-pch
+// CHECK-YCO: pchfile.pch
+// CHECK-YCO: -o
+// CHECK-YCO: myobj.obj
+
+// /Yc /Y-
+// /Y- disables pch generation
+// RUN: %clang_cl -Werror /Y- /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-Y_ %s
+// CHECK-YC-Y_-NOT: -emit-pch
+// CHECK-YC-Y_-NOT: -include-pch
+
+// /Yu
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU %s
+// Use .pch file, but don't build it.
+// CHECK-YU-NOT: -emit-pch
+// CHECK-YU: cc1
+// CHECK-YU: -emit-obj
+// CHECK-YU: -include-pch
+// CHECK-YU: pchfile.pch
+
+// /Yu /Y-
+// RUN: %clang_cl -Werror /Y- /Yupchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-Y_ %s
+// CHECK-YU-Y_-NOT: -emit-pch
+// CHECK-YU-Y_-NOT: -include-pch
+
+// /Yc /Yu -- /Yc overrides /Yu if they both refer to the same file
+// RUN: %clang_cl -Werror /Ycpchfile.h /Yupchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-YU %s
+// 1. Build .pch file.
+// CHECK-YC-YU: cc1
+// CHECK-YC-YU: -emit-pch
+// CHECK-YC-YU: -o
+// CHECK-YC-YU: pchfile.pch
+// 2. Use .pch file.
+// CHECK-YC-YU: cc1
+// CHECK-YC-YU: -emit-obj
+// CHECK-YC-YU: -include-pch
+// CHECK-YC-YU: pchfile.pch
+
+// If /Yc /Yu refer to different files, semantics are pretty wonky.  Since this
+// doesn't seem like something that's important in practice, just punt for now.
+// RUN: %clang_cl -Werror /Ycfoo1.h /Yufoo2.h /FIfoo1.h /FIfoo2.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-YU-MISMATCH %s
+// CHECK-YC-YU-MISMATCH: error: support for '/Yc' and '/Yu' with different filenames not implemented yet; flags ignored
+
+// Similarly, punt on /Yc with more than one input file.
+// RUN: %clang_cl -Werror /Ycfoo1.h /FIfoo1.h /c -### -- %s %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-MULTIINPUT %s
+// CHECK-YC-MULTIINPUT: error: support for '/Yc' with more than one source file not implemented yet; flag ignored
+
+// /Yc /Yu /Y-
+// RUN: %clang_cl -Werror /Ycpchfile.h /Yupchfile.h /FIpchfile.h /Y- /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-YU-Y_ %s
+// CHECK-YC-YU-Y_-NOT: -emit-pch
+// CHECK-YC-YU-Y_-NOT: -include-pch
+
+// Test computation of pch filename in various cases.
+
+// /Yu /Fpout.pch => out.pch is filename
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpout.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP1 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP1: -include-pch
+// CHECK-YUFP1: out.pch
+
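+// /Yu /Fpout => out.pch is filename (.pch gets added if no extension present). NOTE(review): the RUN line below passes /Fpout.pch, so the extension-less case is not actually exercised.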
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpout.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP2 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP2: -include-pch
+// CHECK-YUFP2: out.pch
+
+// /Yu /Fpout.bmp => out.bmp is filename (.pch not added when extension present)
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpout.bmp /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP3 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP3: -include-pch
+// CHECK-YUFP3: out.bmp
+
+// /Yusub/pchfile.h => sub/pchfile.pch
+// RUN: %clang_cl -Werror /Yusub/pchfile.h /FIsub/pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP4 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP4: -include-pch
+// CHECK-YUFP4: sub/pchfile.pch
+
+// /Yupchfile.h /Isub => pchfile.pch
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Isub /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP5 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP5: -include-pch
+// CHECK-YUFP5: pchfile.pch
+
+// FIXME: /Fpdir: use dir/VCx0.pch when dir is directory, where x is major MSVS
+// version in use.
+
+// Spot-check one use of /Fp with /Yc too; otherwise trust that the /Yu test
+// cases above also apply to /Yc.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /Fpsub/file.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCFP %s
+// 1. Build .pch file.
+// CHECK-YCFP: cc1
+// CHECK-YCFP: -emit-pch
+// CHECK-YCFP: -o
+// CHECK-YCFP: sub/file.pch
+// 2. Use .pch file.
+// CHECK-YCFP: cc1
+// CHECK-YCFP: -emit-obj
+// CHECK-YCFP: -include-pch
+// CHECK-YCFP: sub/file.pch
+
+// /Ycfoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h
+// => foo1 and foo2 go into pch, foo3 into main compilation
+// /Yc
+// RUN: %clang_cl -Werror /Ycfoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCFIFIFI %s
+// 1. Build .pch file: Includes foo1.h (but NOT foo3.h) and compiles foo2.h
+// CHECK-YCFIFIFI: cc1
+// CHECK-YCFIFIFI: -emit-pch
+// CHECK-YCFIFIFI: -include
+// CHECK-YCFIFIFI: foo1.h
+// CHECK-YCFIFIFI-NOT: foo2.h
+// CHECK-YCFIFIFI-NOT: foo3.h
+// CHECK-YCFIFIFI: -o
+// CHECK-YCFIFIFI: foo2.pch
+// CHECK-YCFIFIFI: -x
+// CHECK-YCFIFIFI: "c++"
+// CHECK-YCFIFIFI: foo2.h
+// 2. Use .pch file: Includes foo2.pch and foo3.h
+// CHECK-YCFIFIFI: cc1
+// CHECK-YCFIFIFI: -emit-obj
+// CHECK-YCFIFIFI-NOT: foo1.h
+// CHECK-YCFIFIFI-NOT: foo2.h
+// CHECK-YCFIFIFI: -include-pch
+// CHECK-YCFIFIFI: foo2.pch
+// CHECK-YCFIFIFI: -include
+// CHECK-YCFIFIFI: foo3.h
+
+// /Yufoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h
+// => foo1 foo2 filtered out, foo3 into main compilation
+// RUN: %clang_cl -Werror /Yufoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFIFIFI %s
+// Use .pch file, but don't build it.
+// CHECK-YUFIFIFI-NOT: -emit-pch
+// CHECK-YUFIFIFI: cc1
+// CHECK-YUFIFIFI: -emit-obj
+// CHECK-YUFIFIFI-NOT: foo1.h
+// CHECK-YUFIFIFI-NOT: foo2.h
+// CHECK-YUFIFIFI: -include-pch
+// CHECK-YUFIFIFI: foo2.pch
+// CHECK-YUFIFIFI: -include
+// CHECK-YUFIFIFI: foo3.h
+
+// FIXME: Implement support for /Ycfoo.h / /Yufoo.h without /FIfoo.h
+// RUN: %clang_cl -Werror /Ycfoo.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-NOFI %s
+// CHECK-YC-NOFI: error: support for '/Yc' without a corresponding /FI flag not implemented yet; flag ignored
+// RUN: %clang_cl -Werror /Yufoo.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-NOFI %s
+// CHECK-YU-NOFI: error: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+
+// /Yc and /FI relative to /I paths...
+// The rules are:
+// Yu/Yc and FI parameter must match exactly, else it's not found
+// Must match literally exactly: /FI./foo.h /Ycfoo.h does _not_ work.
+// However, the path can be relative to /I paths.
+// FIXME: Update the error messages below once /FI is no longer required, but
+// these test cases all should stay failures as they fail with cl.exe.
+
+// Check that ./ isn't canonicalized away.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FI./pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-I1 %s
+// CHECK-YC-I1: support for '/Yc' without a corresponding /FI flag not implemented yet; flag ignored
+
+// Check that ./ isn't canonicalized away.
+// RUN: %clang_cl -Werror /Yc./pchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-I2 %s
+// CHECK-YC-I2: support for '/Yc' without a corresponding /FI flag not implemented yet; flag ignored
+
+// With an actual /I argument.
+// RUN: %clang_cl -Werror /Ifoo /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-I3 %s
+// 1. This writes pchfile.pch into the root dir, even if this will pick up
+//    foo/pchfile.h
+// CHECK-YC-I3: cc1
+// CHECK-YC-I3: -emit-pch
+// CHECK-YC-I3: -o
+// CHECK-YC-I3: pchfile.pch
+// 2. Use .pch file.
+// CHECK-YC-I3: cc1
+// CHECK-YC-I3: -emit-obj
+// CHECK-YC-I3: -include-pch
+// CHECK-YC-I3: pchfile.pch
+
+// Check that ./ isn't canonicalized away for /Yu either.
+// RUN: %clang_cl -Werror /Yupchfile.h /FI./pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-I1 %s
+// CHECK-YU-I1: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+
+// But /FIfoo/bar.h /Ycfoo\bar.h does work, as does /FIfOo.h /Ycfoo.H
+// FIXME: This part isn't implemented yet. The following two tests should not
+// show an error but do regular /Yu handling.
+// RUN: %clang_cl -Werror /YupchFILE.h /FI./pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-CASE %s
+// CHECK-YU-CASE: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+// RUN: %clang_cl -Werror /Yu./pchfile.h /FI.\pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-SLASH %s
+// CHECK-YU-SLASH: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+
+// cl.exe warns on multiple /Yc, /Yu, /Fp arguments, but clang-cl silently just
+// uses the last one.  This is true for e.g. /Fo too, so not warning on this
+// is self-consistent with clang-cl's flag handling.
+
+// Interaction with /fallback
+
+// /Yc /fallback => /Yc not passed on (but /FI is)
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /Fpfoo.pch /fallback /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-FALLBACK %s
+// Note that in /fallback builds, if creation of the pch fails the main compile
+// does still run so that /fallback can have an effect (this part is not tested)
+// CHECK-YC-FALLBACK: cc1
+// CHECK-YC-FALLBACK: -emit-obj
+// CHECK-YC-FALLBACK: -include-pch
+// CHECK-YC-FALLBACK: foo.pch
+// CHECK-YC-FALLBACK: ||
+// CHECK-YC-FALLBACK: cl.exe
+// CHECK-YC-FALLBACK-NOT: -include-pch
+// CHECK-YC-FALLBACK-NOT: /Ycpchfile.h
+// CHECK-YC-FALLBACK: /FIpchfile.h
+// CHECK-YC-FALLBACK-NOT: /Fpfoo.pch
+
+// /Yu /fallback => /Yu not passed on (but /FI is)
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpfoo.pch /fallback /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-FALLBACK %s
+// CHECK-YU-FALLBACK-NOT: -emit-pch
+// CHECK-YU-FALLBACK: cc1
+// CHECK-YU-FALLBACK: -emit-obj
+// CHECK-YU-FALLBACK: -include-pch
+// CHECK-YU-FALLBACK: foo.pch
+// CHECK-YU-FALLBACK: ||
+// CHECK-YU-FALLBACK: cl.exe
+// CHECK-YU-FALLBACK-NOT: -include-pch
+// CHECK-YU-FALLBACK-NOT: /Yupchfile.h
+// CHECK-YU-FALLBACK: /FIpchfile.h
+// CHECK-YU-FALLBACK-NOT: /Fpfoo.pch
+
+// /FI without /Yu => pch file not used, even if it exists (different from
+// -include, which picks up .gch files if they exist).
+// RUN: touch %t.pch
+// RUN: %clang_cl -Werror /FI%t.pch /Fp%t.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-FI %s
+// CHECK-FI-NOT: -include-pch
+// CHECK-FI: -include
+
+// Test interaction of /Yc with language mode flags.
+
+// If /TC changes the input language to C, a c pch file should be produced.
+// RUN: %clang_cl /TC -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTC %s
+// CHECK-YCTC: cc1
+// CHECK-YCTC: -emit-pch
+// CHECK-YCTC: -o
+// CHECK-YCTC: pchfile.pch
+// CHECK-YCTC: -x
+// CHECK-YCTC: "c"
+
+// Also check lower-case /Tc variant.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### /Tc%s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTc %s
+// CHECK-YCTc: cc1
+// CHECK-YCTc: -emit-pch
+// CHECK-YCTc: -o
+// CHECK-YCTc: pchfile.pch
+// CHECK-YCTc: -x
+// CHECK-YCTc: "c"
+
+// Don't crash when a non-source file is passed.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %S/Inputs/file.prof 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NoSource %s
+// CHECK-NoSource: file.prof:{{.*}}input unused
+
+// ...but if an explicit flag turns the file into a source file, handle it:
+// RUN: %clang_cl /TP -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %S/Inputs/file.prof 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NoSourceTP %s
+// CHECK-NoSourceTP: cc1
+// CHECK-NoSourceTP: -emit-pch
+// CHECK-NoSourceTP: -o
+// CHECK-NoSourceTP: pchfile.pch
+// CHECK-NoSourceTP: -x
+// CHECK-NoSourceTP: "c++"
diff --git a/test/Driver/cl-runtime-flags.c b/test/Driver/cl-runtime-flags.c
index a54aa1a..3fa036d 100644
--- a/test/Driver/cl-runtime-flags.c
+++ b/test/Driver/cl-runtime-flags.c
@@ -13,6 +13,7 @@
 // CHECK-MT-NOT: "-D_DEBUG"
 // CHECK-MT: "-D_MT"
 // CHECK-MT-NOT: "-D_DLL"
+// CHECK-MT: "-flto-visibility-public-std"
 // CHECK-MT: "--dependent-lib=libcmt"
 // CHECK-MT: "--dependent-lib=oldnames"
 
@@ -21,6 +22,7 @@
 // CHECK-MTd: "-D_DEBUG"
 // CHECK-MTd: "-D_MT"
 // CHECK-MTd-NOT: "-D_DLL"
+// CHECK-MTd: "-flto-visibility-public-std"
 // CHECK-MTd: "--dependent-lib=libcmtd"
 // CHECK-MTd: "--dependent-lib=oldnames"
 
diff --git a/test/Driver/clang-translation.c b/test/Driver/clang-translation.c
index 422aa13..cc3cd1b 100644
--- a/test/Driver/clang-translation.c
+++ b/test/Driver/clang-translation.c
@@ -245,9 +245,20 @@
 // RUN: FileCheck -check-prefix=MIPSEL-ANDROID %s
 // MIPSEL-ANDROID: clang
 // MIPSEL-ANDROID: "-cc1"
-// MIPSEL-ANDROID: "-target-cpu" "mips32r2"
+// MIPSEL-ANDROID: "-target-cpu" "mips32"
+// MIPSEL-ANDROID: "-target-feature" "+fpxx"
+// MIPSEL-ANDROID: "-target-feature" "+nooddspreg"
 // MIPSEL-ANDROID: "-mfloat-abi" "hard"
 
+// RUN: %clang -target mipsel-linux-android -### -S %s -mcpu=mips32r6 2>&1 | \
+// RUN: FileCheck -check-prefix=MIPSEL-ANDROID-R6 %s
+// MIPSEL-ANDROID-R6: clang
+// MIPSEL-ANDROID-R6: "-cc1"
+// MIPSEL-ANDROID-R6: "-target-cpu" "mips32r6"
+// MIPSEL-ANDROID-R6: "-target-feature" "+fp64"
+// MIPSEL-ANDROID-R6: "-target-feature" "+nooddspreg"
+// MIPSEL-ANDROID-R6: "-mfloat-abi" "hard"
+
 // RUN: %clang -target mips64-linux-gnu -### -S %s 2>&1 | \
 // RUN: FileCheck -check-prefix=MIPS64 %s
 // MIPS64: clang
diff --git a/test/Driver/clang_f_opts.c b/test/Driver/clang_f_opts.c
index 25a1930..2b49069 100644
--- a/test/Driver/clang_f_opts.c
+++ b/test/Driver/clang_f_opts.c
@@ -66,7 +66,7 @@
 // CHECK-PROFILE-ARCS: "-femit-coverage-data"
 // CHECK-NO-PROFILE-ARCS-NOT: "-femit-coverage-data"
 
-// RUN: %clang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE %s
+// RUN: %clang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-LLVM %s
 // RUN: %clang -### -S -fprofile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE %s
 // RUN: %clang -### -S -fprofile-generate=/some/dir %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-DIR %s
 // RUN: %clang -### -S -fprofile-instr-generate=/tmp/somefile.profraw %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-FILE %s
@@ -87,9 +87,9 @@
 // RUN: %clang -### -S -fprofile-generate=dir -fprofile-instr-use %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GEN-USE %s
 // RUN: %clang -### -S -fprofile-generate=dir -fprofile-instr-use=file %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GEN-USE %s
 // RUN: %clang -### -S -fprofile-instr-generate=file -fno-profile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
-// RUN: %clang -### -S -fprofile-instr-generate=file -fno-profile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
+// RUN: %clang -### -S -fprofile-instr-generate -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GENERATE %s
+// RUN: %clang -### -S -fprofile-instr-generate -fprofile-generate=file %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GENERATE %s
 // RUN: %clang -### -S -fprofile-generate=dir -fno-profile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
-// RUN: %clang -### -S -fprofile-generate=dir -fno-profile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
 // RUN: %clang -### -S -fprofile-instr-use=file -fno-profile-instr-use %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-USE %s
 // RUN: %clang -### -S -fprofile-instr-use=file -fno-profile-use %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-USE %s
 // RUN: %clang -### -S -fprofile-use=file -fno-profile-use %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-USE %s
@@ -97,11 +97,13 @@
 // RUN: %clang -### -S -fcoverage-mapping %s 2>&1 | FileCheck -check-prefix=CHECK-COVERAGE-AND-GEN %s
 // RUN: %clang -### -S -fcoverage-mapping -fno-coverage-mapping %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-COVERAGE %s
 // RUN: %clang -### -S -fprofile-instr-generate -fcoverage-mapping -fno-coverage-mapping %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-COVERAGE %s
-// CHECK-PROFILE-GENERATE: "-fprofile-instr-generate"
-// CHECK-PROFILE-GENERATE-DIR: "-fprofile-instr-generate=/some/dir{{/|\\\\}}default.profraw"
-// CHECK-PROFILE-GENERATE-FILE: "-fprofile-instr-generate=/tmp/somefile.profraw"
+// CHECK-PROFILE-GENERATE: "-fprofile-instrument=clang"
+// CHECK-PROFILE-GENERATE-LLVM: "-fprofile-instrument=llvm"
+// CHECK-PROFILE-GENERATE-DIR: "-fprofile-instrument-path=/some/dir{{/|\\\\}}{{.*}}"
+// CHECK-PROFILE-GENERATE-FILE: "-fprofile-instrument-path=/tmp/somefile.profraw"
 // CHECK-NO-MIX-GEN-USE: '{{[a-z=-]*}}' not allowed with '{{[a-z=-]*}}'
-// CHECK-DISABLE-GEN-NOT: "-fprofile-instr-generate"
+// CHECK-NO-MIX-GENERATE: '{{[a-z=-]*}}' not allowed with '{{[a-z=-]*}}'
+// CHECK-DISABLE-GEN-NOT: "-fprofile-instrument=clang"
 // CHECK-DISABLE-USE-NOT: "-fprofile-instr-use"
 // CHECK-COVERAGE-AND-GEN: '-fcoverage-mapping' only allowed with '-fprofile-instr-generate'
 // CHECK-DISABLE-COVERAGE-NOT: "-fcoverage-mapping"
@@ -111,9 +113,9 @@
 // RUN: mkdir -p %t.d/some/dir
 // RUN: %clang -### -S -fprofile-use=%t.d/some/dir %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-DIR %s
 // RUN: %clang -### -S -fprofile-instr-use=/tmp/somefile.prof %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-FILE %s
-// CHECK-PROFILE-USE: "-fprofile-instr-use=default.profdata"
-// CHECK-PROFILE-USE-DIR: "-fprofile-instr-use={{.*}}.d/some/dir{{/|\\\\}}default.profdata"
-// CHECK-PROFILE-USE-FILE: "-fprofile-instr-use=/tmp/somefile.prof"
+// CHECK-PROFILE-USE: "-fprofile-instrument-use-path=default.profdata"
+// CHECK-PROFILE-USE-DIR: "-fprofile-instrument-use-path={{.*}}.d/some/dir{{/|\\\\}}default.profdata"
+// CHECK-PROFILE-USE-FILE: "-fprofile-instrument-use-path=/tmp/somefile.prof"
 
 // RUN: %clang -### -S -fvectorize %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s
 // RUN: %clang -### -S -fno-vectorize -fvectorize %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s
@@ -285,7 +287,6 @@
 // RUN: -fexpensive-optimizations                                             \
 // RUN: -fno-expensive-optimizations                                          \
 // RUN: -fno-defer-pop                                                        \
-// RUN: -finline-functions                                                    \
 // RUN: -fkeep-inline-functions                                               \
 // RUN: -fno-keep-inline-functions                                            \
 // RUN: -freorder-blocks                                                      \
@@ -353,7 +354,6 @@
 // CHECK-WARNING-DAG: optimization flag '-fexpensive-optimizations' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fno-expensive-optimizations' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fno-defer-pop' is not supported
-// CHECK-WARNING-DAG: optimization flag '-finline-functions' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fkeep-inline-functions' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fno-keep-inline-functions' is not supported
 // CHECK-WARNING-DAG: optimization flag '-freorder-blocks' is not supported
diff --git a/test/Driver/cloudabi.c b/test/Driver/cloudabi.c
index 99a2bc2..053092a 100644
--- a/test/Driver/cloudabi.c
+++ b/test/Driver/cloudabi.c
@@ -1,3 +1,14 @@
-// RUN: %clang %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s
-// CHECK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
-// CHECK: "-Bstatic" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
+// RUN: %clang %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s -check-prefix=SAFESTACK
+// SAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections" {{.*}} "-fsanitize=safe-stack"
+// SAFESTACK: "-Bstatic" "--no-dynamic-linker" "-pie" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
+
+// RUN: %clang %s -### -target x86_64-unknown-cloudabi -fno-sanitize=safe-stack 2>&1 | FileCheck %s -check-prefix=NOSAFESTACK
+// NOSAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
+// NOSAFESTACK-NOT: "-fsanitize=safe-stack"
+// NOSAFESTACK: "-Bstatic" "--no-dynamic-linker" "-pie" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
+
+// PIE shouldn't be enabled on i686. Just on architectures that provide
+// PC-relative addressing.
+// RUN: %clang %s -### -target i686-unknown-cloudabi 2>&1 | FileCheck %s -check-prefix=NOPIE
+// NOPIE: "-cc1" "-triple" "i686-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections" {{.*}} "-fsanitize=safe-stack"
+// NOPIE: "-Bstatic" "--no-dynamic-linker" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
diff --git a/test/Driver/cloudabi.cpp b/test/Driver/cloudabi.cpp
index c3b68ae..f9e9ba5 100644
--- a/test/Driver/cloudabi.cpp
+++ b/test/Driver/cloudabi.cpp
@@ -1,3 +1,14 @@
-// RUN: %clangxx %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s
-// CHECK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
-// CHECK: "-Bstatic" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
+// RUN: %clangxx %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s -check-prefix=SAFESTACK
+// SAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections" {{.*}} "-fsanitize=safe-stack"
+// SAFESTACK: "-Bstatic" "--no-dynamic-linker" "-pie" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
+
+// RUN: %clangxx %s -### -target x86_64-unknown-cloudabi -fno-sanitize=safe-stack 2>&1 | FileCheck %s -check-prefix=NOSAFESTACK
+// NOSAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
+// NOSAFESTACK-NOT: "-fsanitize=safe-stack"
+// NOSAFESTACK: "-Bstatic" "--no-dynamic-linker" "-pie" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
+
+// PIE shouldn't be enabled on i686. Just on architectures that provide
+// PC-relative addressing.
+// RUN: %clangxx %s -### -target i686-unknown-cloudabi 2>&1 | FileCheck %s -check-prefix=NOPIE
+// NOPIE: "-cc1" "-triple" "i686-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections" {{.*}} "-fsanitize=safe-stack"
+// NOPIE: "-Bstatic" "--no-dynamic-linker" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
diff --git a/test/Driver/cuda-bad-arch.cu b/test/Driver/cuda-bad-arch.cu
index f92bdce..cbc2d11 100644
--- a/test/Driver/cuda-bad-arch.cu
+++ b/test/Driver/cuda-bad-arch.cu
@@ -19,4 +19,9 @@
 // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix OK %s
 
+// We don't allow using NVPTX for host compilation.
+// RUN: %clang -### --cuda-host-only -target nvptx-nvidia-cuda -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix HOST_NVPTX %s
+
 // OK-NOT: error: Unsupported CUDA gpu architecture
+// HOST_NVPTX: error: unsupported use of NVPTX for host compilation.
diff --git a/test/Driver/cuda-constructor-alias.cu b/test/Driver/cuda-constructor-alias.cu
new file mode 100644
index 0000000..e0fd329
--- /dev/null
+++ b/test/Driver/cuda-constructor-alias.cu
@@ -0,0 +1,13 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Check that we don't pass -mconstructor-aliases to CUDA device-side
+// compilation, but we do pass it to host-side compilation.
+
+// RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 | FileCheck %s
+// CHECK: "-cc1"
+// CHECK-NOT: "-fcuda-is-device" {{.*}}"-mconstructor-aliases"
+// CHECK-NOT: "-mconstructor-aliases" {{.*}}"-fcuda-is-device"
+// CHECK: "-cc1"
+// CHECK-SAME: "-mconstructor-aliases"
diff --git a/test/Driver/cuda-detect.cu b/test/Driver/cuda-detect.cu
index d8fba06..22d3606 100644
--- a/test/Driver/cuda-detect.cu
+++ b/test/Driver/cuda-detect.cu
@@ -10,15 +10,41 @@
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s
 
-// Make sure we map libdevice bitcode files to proper GPUs.
+// Make sure we map libdevice bitcode files to proper GPUs. These
+// tests use Inputs/CUDA_80 which has a full set of libdevice files.
+// However, libdevice mapping only matches CUDA-7.x at the moment.
+// sm_2x, sm_32 -> compute_20
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \
-// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE21
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE20
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_32 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE20
+// sm_30, sm_5x and sm_6x map to compute_30
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE30
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE30
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE30
+// sm_35 and sm_37 -> compute_35
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
-// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
 // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE35
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_37 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE35
+
 // Verify that -nocudainc prevents adding include path to CUDA headers.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
@@ -29,12 +55,13 @@
 // RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC
 
-// Verify that no options related to bitcode linking are passes if
-// there's no bitcode file.
+// Verify that we get an error if there's no libdevice library to link with.
+// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_30  for this purpose.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
-// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOLIBDEVICE
-// .. or if we explicitly passed -nocudalib
+// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE
+
+// Verify that  -nocudalib prevents linking libdevice bitcode in.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOLIBDEVICE
@@ -45,20 +72,37 @@
 // RUN:   | FileCheck %s -check-prefix COMMON \
 // RUN:     -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE
 
+// Verify that C++ include paths are passed for both host and device frontends.
+// RUN: %clang -### -no-canonical-prefixes -target x86_64-linux-gnu %s \
+// RUN: --stdlib=libstdc++ --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree2 \
+// RUN: --gcc-toolchain="" 2>&1 \
+// RUN: | FileCheck %s --check-prefix CHECK-CXXINCLUDE
+
 // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda
 // NOCUDA-NOT: Found CUDA installation:
 
+// MISSINGLIBDEVICE: error: cannot find libdevice for sm_30.
+
 // COMMON: "-triple" "nvptx-nvidia-cuda"
 // COMMON-SAME: "-fcuda-is-device"
 // LIBDEVICE-SAME: "-mlink-cuda-bitcode"
 // NOLIBDEVICE-NOT: "-mlink-cuda-bitcode"
-// LIBDEVICE21-SAME: libdevice.compute_20.10.bc
+// LIBDEVICE20-SAME: libdevice.compute_20.10.bc
+// LIBDEVICE30-SAME: libdevice.compute_30.10.bc
 // LIBDEVICE35-SAME: libdevice.compute_35.10.bc
 // NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc
 // LIBDEVICE-SAME: "-target-feature" "+ptx42"
 // NOLIBDEVICE-NOT: "-target-feature" "+ptx42"
-// CUDAINC-SAME: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include"
+// CUDAINC-SAME: "-internal-isystem" "{{.*}}/Inputs/CUDA{{[_0-9]+}}/usr/local/cuda/include"
 // NOCUDAINC-NOT: "-internal-isystem" "{{.*}}/cuda/include"
 // CUDAINC-SAME: "-include" "__clang_cuda_runtime_wrapper.h"
 // NOCUDAINC-NOT: "-include" "__clang_cuda_runtime_wrapper.h"
+// -internal-externc-isystem flags must come *after* the cuda include flags,
+// because we must search the cuda include directory first.
+// CUDAINC-SAME: "-internal-externc-isystem"
 // COMMON-SAME: "-x" "cuda"
+// CHECK-CXXINCLUDE: clang{{.*}} "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CHECK-CXXINCLUDE-SAME: {{.*}}"-internal-isystem" "{{.+}}/include/c++/4.8"
+// CHECK-CXXINCLUDE: clang{{.*}} "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-CXXINCLUDE-SAME: {{.*}}"-internal-isystem" "{{.+}}/include/c++/4.8"
+// CHECK-CXXINCLUDE: ld{{.*}}"
diff --git a/test/Driver/cuda-external-tools.cu b/test/Driver/cuda-external-tools.cu
index 801b491..280c609 100644
--- a/test/Driver/cuda-external-tools.cu
+++ b/test/Driver/cuda-external-tools.cu
@@ -4,14 +4,40 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// Regular compile with -O2.
+// Regular compiles with -O{0,1,2,3,4,fast}.  -O4 and -Ofast map to ptxas O3.
+// RUN: %clang -### -target x86_64-linux-gnu -O0 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O1 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT1 %s
 // RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O3 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O4 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s
+// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s
+
+// With debugging enabled, ptxas should be run with no ptxas optimizations.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix DBG %s
+
+// --no-cuda-noopt-device-debug overrides --cuda-noopt-device-debug.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-debug \
+// RUN:   --no-cuda-noopt-debug -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
 
 // Regular compile without -O.  This should result in us passing -O0 to ptxas.
 // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
 
+// Regular compiles with -Os and -Oz.  For lack of a better option, we map
+// these to ptxas -O2.
+// RUN: %clang -### -target x86_64-linux-gnu -Os -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+// RUN: %clang -### -target x86_64-linux-gnu -Oz -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
 // Regular compile targeting sm_35.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
@@ -42,7 +68,14 @@
 // ARCH64: "-m64"
 // ARCH32: "-m32"
 // OPT0: "-O0"
+// OPT0-NOT: "-g"
+// OPT1: "-O1"
+// OPT1-NOT: "-g"
 // OPT2: "-O2"
+// OPT2-NOT: "-g"
+// OPT3: "-O3"
+// OPT3-NOT: "-g"
+// DBG: "-g" "--dont-merge-basicblocks" "--return-at-end"
 // SM20: "--gpu-name" "sm_20"
 // SM35: "--gpu-name" "sm_35"
 // SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
diff --git a/test/Driver/cuda-march.cu b/test/Driver/cuda-march.cu
new file mode 100644
index 0000000..123b661
--- /dev/null
+++ b/test/Driver/cuda-march.cu
@@ -0,0 +1,22 @@
+// Checks that cuda compilation does the right thing when passed -march.
+// (Specifically, we want to pass it to host compilation, but not to device
+// compilation or ptxas!)
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c \
+// RUN: -march=haswell %s 2>&1 | FileCheck %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c \
+// RUN: -march=haswell --cuda-gpu-arch=sm_20 %s 2>&1 | FileCheck %s
+
+// CHECK: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-SAME: "-triple" "nvptx
+// CHECK-SAME: "-target-cpu" "sm_20"
+
+// CHECK: ptxas
+// CHECK-SAME: "--gpu-name" "sm_20"
+
+// CHECK: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-SAME: "-target-cpu" "haswell"
diff --git a/test/Driver/cuda-not-found.cu b/test/Driver/cuda-not-found.cu
new file mode 100644
index 0000000..b63623a
--- /dev/null
+++ b/test/Driver/cuda-not-found.cu
@@ -0,0 +1,12 @@
+// REQUIRES: clang-driver
+
+// Check that we raise an error if we're trying to compile CUDA code but can't
+// find a CUDA install, unless -nocudainc was passed.
+
+// RUN: %clang -### --sysroot=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix ERR
+// RUN: %clang -### --cuda-path=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix ERR
+// ERR: cannot find CUDA installation
+
+// RUN: %clang -### -nocudainc --sysroot=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix OK
+// RUN: %clang -### -nocudainc --cuda-path=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix OK
+// OK-NOT: cannot find CUDA installation
diff --git a/test/Driver/cuda-options.cu b/test/Driver/cuda-options.cu
index 9030090..5d65076 100644
--- a/test/Driver/cuda-options.cu
+++ b/test/Driver/cuda-options.cu
@@ -22,23 +22,46 @@
 // RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
 // RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
 
-// Same test as above, but with preceeding --cuda-device-only to make sure only
-// the last option has an effect.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only --cuda-host-only %s 2>&1 \
-// RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
-// RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
-
 // Verify that --cuda-device-only disables host-side compilation and linking.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
 
-// Same test as above, but with preceeding --cuda-host-only to make sure only
-// the last option has an effect.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only --cuda-device-only %s 2>&1 \
+// Check that the last of --cuda-compile-host-device, --cuda-host-only, and
+// --cuda-device-only wins.
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only \
+// RUN:    --cuda-host-only %s 2>&1 \
+// RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
+// RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device \
+// RUN:    --cuda-host-only %s 2>&1 \
+// RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
+// RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only \
+// RUN:    --cuda-device-only %s 2>&1 \
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
 
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device \
+// RUN:    --cuda-device-only %s 2>&1 \
+// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
+// RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only \
+// RUN:   --cuda-compile-host-device %s 2>&1 \
+// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
+// RUN:    -check-prefix HOST -check-prefix INCLUDES-DEVICE \
+// RUN:    -check-prefix LINK %s
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only \
+// RUN:   --cuda-compile-host-device %s 2>&1 \
+// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
+// RUN:    -check-prefix HOST -check-prefix INCLUDES-DEVICE \
+// RUN:    -check-prefix LINK %s
+
 // Verify that --cuda-gpu-arch option passes the correct GPU archtecture to
 // device compilation.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
@@ -105,7 +128,7 @@
 
 // Match no device-side compilation.
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
-// NODEVICE-SAME-NOT: "-fcuda-is-device"
+// NODEVICE-NOT: "-fcuda-is-device"
 
 // INCLUDES-DEVICE:fatbinary
 // INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
@@ -115,13 +138,13 @@
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
-// HOST-SAVE-SAME-NOT: "-fcuda-is-device"
+// HOST-SAVE-NOT: "-fcuda-is-device"
 // HOST-SAVE-SAME: "-x" "cuda"
 
 // Match host-side compilation.
 // HOST: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
-// HOST-SAME-NOT: "-fcuda-is-device"
+// HOST-NOT: "-fcuda-is-device"
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
@@ -135,7 +158,7 @@
 
 // Match no host compilation.
 // NOHOST-NOT: "-cc1" "-triple"
-// NOHOST-SAME-NOT: "-x" "cuda"
+// NOHOST-NOT: "-x" "cuda"
 
 // Match linker.
 // LINK: "{{.*}}{{ld|link}}{{(.exe)?}}"
diff --git a/test/Driver/cuda-phases.cu b/test/Driver/cuda-phases.cu
new file mode 100644
index 0000000..ac6f473
--- /dev/null
+++ b/test/Driver/cuda-phases.cu
@@ -0,0 +1,206 @@
+// Tests the phases generated for a CUDA offloading target for different
+// combinations of:
+// - Number of gpu architectures;
+// - Host/device-only compilation;
+// - User-requested final phase - binary or assembly.
+
+// REQUIRES: clang-driver
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+//
+// Test single gpu architecture with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN %s
+// BIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN: 2: compiler, {1}, ir, (host-cuda)
+// BIN: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda)
+// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir
+// BIN: 12: backend, {11}, assembler, (host-cuda)
+// BIN: 13: assembler, {12}, object, (host-cuda)
+// BIN: 14: linker, {13}, image, (host-cuda)
+
+//
+// Test single gpu architecture up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM %s
+// ASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM: 5: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda)
+// ASM: 7: compiler, {6}, ir, (host-cuda)
+// ASM: 8: backend, {7}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN2 %s
+// BIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN2: 2: compiler, {1}, ir, (host-cuda)
+// BIN2: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN2: 10: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35)
+// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35)
+// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35)
+// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35)
+// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object
+// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler
+// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
+// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
+// BIN2: 19: backend, {18}, assembler, (host-cuda)
+// BIN2: 20: assembler, {19}, object, (host-cuda)
+// BIN2: 21: linker, {20}, image, (host-cuda)
+
+//
+// Test two gpu architectures up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM2 %s
+// ASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
+// ASM2: 10: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda)
+// ASM2: 12: compiler, {11}, ir, (host-cuda)
+// ASM2: 13: backend, {12}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN %s
+// HBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN: 2: compiler, {1}, ir, (host-cuda)
+// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN: 4: backend, {3}, assembler, (host-cuda)
+// HBIN: 5: assembler, {4}, object, (host-cuda)
+// HBIN: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test single gpu architecture up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM %s
+// HASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM: 2: compiler, {1}, ir, (host-cuda)
+// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN2 %s
+// HBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN2: 2: compiler, {1}, ir, (host-cuda)
+// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN2: 4: backend, {3}, assembler, (host-cuda)
+// HBIN2: 5: assembler, {4}, object, (host-cuda)
+// HBIN2: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test two gpu architectures up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM2 %s
+// HASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM2: 2: compiler, {1}, ir, (host-cuda)
+// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM2: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN %s
+// DBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+
+//
+// Test single gpu architecture up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM %s
+// DASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+
+//
+// Test two gpu architectures with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN2 %s
+// DBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+// DBIN2: 6: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35)
+// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35)
+// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35)
+// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35)
+// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object
+
+//
+// Test two gpu architectures up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM2 %s
+// DASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// DASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
diff --git a/test/Driver/cuda-simple.cu b/test/Driver/cuda-simple.cu
index 3dc0bab..fbc5aa1 100644
--- a/test/Driver/cuda-simple.cu
+++ b/test/Driver/cuda-simple.cu
@@ -1,6 +1,6 @@
 // Verify that we can parse a simple CUDA file with or without -save-temps
 // http://llvm.org/PR22936
-// RUN: %clang -nocudainc -Werror -fsyntax-only -c %s
+// RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s
 //
 // Verify that we pass -x cuda-cpp-output to compiler after 
 // preprocessing a CUDA file
diff --git a/test/Driver/cuda-unused-arg-warning.cu b/test/Driver/cuda-unused-arg-warning.cu
index e8daad6..cbbb893 100644
--- a/test/Driver/cuda-unused-arg-warning.cu
+++ b/test/Driver/cuda-unused-arg-warning.cu
@@ -4,11 +4,16 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// --cuda-host-only should never trigger unused arg warning.
+// --cuda-host-only and --cuda-compile-host-device should never trigger an
+// unused arg warning.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \
 // RUN:    FileCheck %s
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -x c -c %s 2>&1 | \
 // RUN:    FileCheck %s
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device -c %s 2>&1 | \
+// RUN:    FileCheck %s
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device -x c -c %s 2>&1 | \
+// RUN:    FileCheck %s
 
 // --cuda-device-only should warn during non-CUDA compilation.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \
@@ -19,5 +24,6 @@
 // RUN:    FileCheck -check-prefix NO-UNUSED-WARNING %s
 
 // CHECK-NOT: warning: argument unused during compilation: '--cuda-host-only'
+// CHECK-NOT: warning: argument unused during compilation: '--cuda-compile-host-device'
 // UNUSED-WARNING: warning: argument unused during compilation: '--cuda-device-only'
 // NO-UNUSED-WARNING-NOT: warning: argument unused during compilation: '--cuda-device-only'
diff --git a/test/Driver/cuda-version-check.cu b/test/Driver/cuda-version-check.cu
new file mode 100644
index 0000000..cb2ac79
--- /dev/null
+++ b/test/Driver/cuda-version-check.cu
@@ -0,0 +1,60 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+// RUN: %clang -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+
+// The installation at Inputs/CUDA is CUDA 7.0, which doesn't support sm_60.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60
+
+// This should only complain about sm_60, not sm_35.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_35 \
+// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60 --check-prefix=OK_SM35
+
+// We should get two errors here, one for sm_60 and one for sm_61.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_61 \
+// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60 --check-prefix=ERR_SM61
+
+// We should still get an error if we pass -nocudainc, because this compilation
+// would invoke ptxas, and we do a version check on that, too.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 -nocudainc --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60
+
+// With -nocudainc and -E, we don't touch the CUDA install, so we
+// shouldn't get an error.
+// RUN: %clang -v -### -E --cuda-device-only --cuda-gpu-arch=sm_60 -nocudainc \
+// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+
+// --no-cuda-version-check should suppress all of these errors.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 \
+// RUN:    --no-cuda-version-check %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+
+// We need to make sure the version check is done only for the device toolchain;
+// therefore we should not get an error in host-only mode. We use -S here
+// to avoid the error being produced by the assembler tool, which does
+// the same check.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60
+
+// OK-NOT: error: GPU arch
+
+// OK_SM35-NOT: error: GPU arch sm_35
+
+// We should only get one error per architecture.
+// ERR_SM60: error: GPU arch sm_60 {{.*}}
+// ERR_SM60-NOT: error: GPU arch sm_60
+
+// ERR_SM61: error: GPU arch sm_61 {{.*}}
+// ERR_SM61-NOT: error: GPU arch sm_61
diff --git a/test/Driver/darwin-iphone-defaults.m b/test/Driver/darwin-iphone-defaults.m
index 3e2a912..63bbbe0 100644
--- a/test/Driver/darwin-iphone-defaults.m
+++ b/test/Driver/darwin-iphone-defaults.m
@@ -1,4 +1,4 @@
-// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -arch armv7 -flto -S -o - %s | FileCheck %s
+// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -arch armv7 -stdlib=platform -flto -S -o - %s | FileCheck %s
 
 // CHECK: @f0() [[F0:#[0-9]+]]
 // CHECK: @__f0_block_invoke
diff --git a/test/Driver/darwin-ld.c b/test/Driver/darwin-ld.c
index 66c5206..a7681fa 100644
--- a/test/Driver/darwin-ld.c
+++ b/test/Driver/darwin-ld.c
@@ -152,6 +152,15 @@
 // RUN: FileCheck -check-prefix=LINK_NO_IOS_ARM64_CRT1 %s < %t.log
 // LINK_NO_IOS_ARM64_CRT1-NOT: crt
 
+// RUN: %clang -target x86_64-apple-ios6.0 -miphoneos-version-min=6.0 -fprofile-instr-generate -### %t.o 2> %t.log
+// RUN: FileCheck -check-prefix=LINK_IOSSIM_PROFILE %s < %t.log
+// LINK_IOSSIM_PROFILE: {{ld(.exe)?"}}
+// LINK_IOSSIM_PROFILE: libclang_rt.profile_iossim.a
+
+// FIXME: Currently the builtin library is only added to the command line if it
+// exists, so we can't check for it here.
+// FIXME_LINK_IOSSIM_PROFILE: libclang_rt.ios.a
+
 // RUN: %clang -target arm64-apple-tvos8.3 -mtvos-version-min=8.3 -### %t.o 2> %t.log
 // RUN: FileCheck -check-prefix=LINK_TVOS_ARM64 %s < %t.log
 // LINK_TVOS_ARM64: {{ld(.exe)?"}}
diff --git a/test/Driver/darwin-objc-gc.m b/test/Driver/darwin-objc-gc.m
index 06e3aea..aac6dc1 100644
--- a/test/Driver/darwin-objc-gc.m
+++ b/test/Driver/darwin-objc-gc.m
@@ -1,6 +1,6 @@
 // Check that we warn, but accept, -fobjc-gc for iPhone OS.
 
-// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -fobjc-gc -flto -S -o %t %s 2> %t.err
+// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -stdlib=platform -fobjc-gc -flto -S -o %t %s 2> %t.err
 // RUN: FileCheck --check-prefix=IPHONE_OBJC_GC_LL %s < %t 
 // RUN: FileCheck --check-prefix=IPHONE_OBJC_GC_STDERR %s < %t.err
 
diff --git a/test/Driver/darwin-sanitizer-ld.c b/test/Driver/darwin-sanitizer-ld.c
index fb318eb..53c7fce 100644
--- a/test/Driver/darwin-sanitizer-ld.c
+++ b/test/Driver/darwin-sanitizer-ld.c
@@ -1,26 +1,17 @@
 // Test sanitizer link flags on Darwin.
 
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
-// RUN:   -fsanitize=address %s -o %t.o 2>&1 \
+// RUN:   -stdlib=platform -fsanitize=address %s -o %t.o 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN %s
 
 // CHECK-ASAN: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN: stdc++
+// CHECK-ASAN-NOT: "-lstdc++"
+// CHECK-ASAN-NOT: "-lc++"
 // CHECK-ASAN: libclang_rt.asan_osx_dynamic.dylib"
 // CHECK-ASAN: "-rpath" "@executable_path"
 // CHECK-ASAN: "-rpath" "{{.*}}lib{{.*}}darwin"
 
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
-// RUN:   -fsanitize=address -mios-simulator-version-min=7.0 %s -o %t.o 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-IOSSIM %s
-
-// CHECK-ASAN-IOSSIM: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN-IOSSIM: lc++
-// CHECK-ASAN-IOSSIM: libclang_rt.asan_iossim_dynamic.dylib"
-// CHECK-ASAN-IOSSIM: "-rpath" "@executable_path"
-// CHECK-ASAN-IOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
-
-// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
 // RUN:   -fPIC -shared -fsanitize=address %s -o %t.so 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-DYN-ASAN %s
 
@@ -31,11 +22,12 @@
 // CHECK-DYN-ASAN: "-rpath" "{{.*}}lib{{.*}}darwin"
 
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
-// RUN:   -fsanitize=undefined %s -o %t.o 2>&1 \
+// RUN:   -stdlib=platform -fsanitize=undefined %s -o %t.o 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-UBSAN %s
 
 // CHECK-UBSAN: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN: stdc++
+// CHECK-UBSAN-NOT: "-lstdc++"
+// CHECK-UBSAN-NOT: "-lc++"
 // CHECK-UBSAN: libclang_rt.ubsan_osx_dynamic.dylib"
 // CHECK-UBSAN: "-rpath" "@executable_path"
 // CHECK-UBSAN: "-rpath" "{{.*}}lib{{.*}}darwin"
@@ -65,3 +57,71 @@
 
 // CHECK-DYN-BOUNDS: "{{.*}}ld{{(.exe)?}}"
 // CHECK-DYN-BOUNDS-NOT: ubsan_osx
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
+// RUN:   -stdlib=platform -fsanitize=address -mios-simulator-version-min=7.0 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-IOSSIM %s
+
+// CHECK-ASAN-IOSSIM: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-IOSSIM-NOT: "-lstdc++"
+// CHECK-ASAN-IOSSIM-NOT: "-lc++"
+// CHECK-ASAN-IOSSIM: libclang_rt.asan_iossim_dynamic.dylib"
+// CHECK-ASAN-IOSSIM: "-rpath" "@executable_path"
+// CHECK-ASAN-IOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
+// RUN:   -stdlib=platform -fsanitize=address \
+// RUN:   -mtvos-simulator-version-min=8.3.0 %s -o %t.o 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-ASAN-TVOSSIM %s
+
+// CHECK-ASAN-TVOSSIM: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-TVOSSIM-NOT: "-lstdc++"
+// CHECK-ASAN-TVOSSIM-NOT: "-lc++"
+// CHECK-ASAN-TVOSSIM: libclang_rt.asan_tvossim_dynamic.dylib"
+// CHECK-ASAN-TVOSSIM: "-rpath" "@executable_path"
+// CHECK-ASAN-TVOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
+// RUN:   -stdlib=platform -fsanitize=address \
+// RUN:   -mwatchos-simulator-version-min=2.0.0 %s -o %t.o 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-ASAN-WATCHOSSIM %s
+
+// CHECK-ASAN-WATCHOSSIM: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-WATCHOSSIM-NOT: "-lstdc++"
+// CHECK-ASAN-WATCHOSSIM-NOT: "-lc++"
+// CHECK-ASAN-WATCHOSSIM: libclang_rt.asan_watchossim_dynamic.dylib"
+// CHECK-ASAN-WATCHOSSIM: "-rpath" "@executable_path"
+// CHECK-ASAN-WATCHOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target armv7-apple-ios  \
+// RUN:   -stdlib=platform -fsanitize=address -miphoneos-version-min=7 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-IOS %s
+
+// CHECK-ASAN-IOS: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-IOS-NOT: "-lstdc++"
+// CHECK-ASAN-IOS-NOT: "-lc++"
+// CHECK-ASAN-IOS: libclang_rt.asan_ios_dynamic.dylib"
+// CHECK-ASAN-IOS: "-rpath" "@executable_path"
+// CHECK-ASAN-IOS: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target arm64-apple-tvos \
+// RUN:   -stdlib=platform -fsanitize=address -mtvos-version-min=8.3 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-TVOS %s
+
+// CHECK-ASAN-TVOS: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-TVOS-NOT: "-lstdc++"
+// CHECK-ASAN-TVOS-NOT: "-lc++"
+// CHECK-ASAN-TVOS: libclang_rt.asan_tvos_dynamic.dylib"
+// CHECK-ASAN-TVOS: "-rpath" "@executable_path"
+// CHECK-ASAN-TVOS: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target armv7k-apple-watchos \
+// RUN:   -stdlib=platform -fsanitize=address -mwatchos-version-min=2.0 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-WATCHOS %s
+
+// CHECK-ASAN-WATCHOS: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-WATCHOS-NOT: "-lstdc++"
+// CHECK-ASAN-WATCHOS-NOT: "-lc++"
+// CHECK-ASAN-WATCHOS: libclang_rt.asan_watchos_dynamic.dylib"
+// CHECK-ASAN-WATCHOS: "-rpath" "@executable_path"
+// CHECK-ASAN-WATCHOS: "-rpath" "{{.*}}lib{{.*}}darwin"
diff --git a/test/Driver/darwin-stdlib.cpp b/test/Driver/darwin-stdlib.cpp
new file mode 100644
index 0000000..c9be607
--- /dev/null
+++ b/test/Driver/darwin-stdlib.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang -target x86_64-apple-darwin -arch arm64 -miphoneos-version-min=7.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+// RUN: %clang -target x86_64-apple-darwin -mmacosx-version-min=10.8 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBSTDCXX
+// RUN: %clang -target x86_64-apple-darwin -mmacosx-version-min=10.9 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+// RUN: %clang -target x86_64-apple-darwin -arch armv7s -miphoneos-version-min=6.1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBSTDCXX
+// RUN: %clang -target x86_64-apple-darwin -arch armv7s -miphoneos-version-min=7.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+// RUN: %clang -target x86_64-apple-darwin -arch armv7k %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+
+// The purpose of this test is to check that the libc++ headers are found
+// properly. At the moment this is done by passing -stdlib=libc++ down to the
+// cc1 invocation. If and when we change to finding them in the driver this test
+// should reflect that.
+
+// CHECK-LIBCXX: -stdlib=libc++
+
+// CHECK-LIBSTDCXX-NOT: -stdlib=libc++
+// CHECK-LIBSTDCXX-NOT: -stdlib=libstdc++
diff --git a/test/Driver/dyld-prefix.c b/test/Driver/dyld-prefix.c
index 2c2bc4f..5a79874 100644
--- a/test/Driver/dyld-prefix.c
+++ b/test/Driver/dyld-prefix.c
@@ -1,10 +1,10 @@
 // RUN: touch %t.o
 
 // RUN: %clang -target i386-unknown-linux --dyld-prefix /foo -### %t.o 2>&1 | FileCheck --check-prefix=CHECK-32 %s
-// CHECK-32: "-dynamic-linker" "/foo/lib/ld-linux.so.2"
+// CHECK-32: "-dynamic-linker" "/foo{{(/usr/i386-unknown-linux)?}}/lib/ld-linux.so.2"
 
 // RUN: %clang -target x86_64-unknown-linux --dyld-prefix /foo -### %t.o 2>&1 | FileCheck --check-prefix=CHECK-64 %s
-// CHECK-64: "-dynamic-linker" "/foo/lib64/ld-linux-x86-64.so.2"
+// CHECK-64: "-dynamic-linker" "/foo{{(/usr/x86_64-unknown-linux)?}}/lib{{(64)?}}/ld-linux-x86-64.so.2"
 
 // RUN: %clang -target x86_64-unknown-linux-gnux32 --dyld-prefix /foo -### %t.o 2>&1 | FileCheck --check-prefix=CHECK-X32 %s
-// CHECK-X32: "-dynamic-linker" "/foo/libx32/ld-linux-x32.so.2"
+// CHECK-X32: "-dynamic-linker" "/foo{{(/x86_64-unknown-linux-gnux32)?}}/lib{{(x32)?}}/ld-linux-x32.so.2"
diff --git a/test/Driver/dynamic-linker.c b/test/Driver/dynamic-linker.c
new file mode 100644
index 0000000..c7579f4
--- /dev/null
+++ b/test/Driver/dynamic-linker.c
@@ -0,0 +1,32 @@
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target i386-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target i386-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+
+
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target i386-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target i386-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+
+// CHECK-RDYNAMIC: "-export-dynamic"
+// CHECK-SHARED: "-shared"
+// CHECK-STATIC: "-{{B?}}static"
+// CHECK-DYNAMIC-LINKER: "-dynamic-linker"
+// CHECK-SHARED-NOT: "-dynamic-linker"
+// CHECK-STATIC-NOT: "-dynamic-linker"
+
diff --git a/test/Driver/embed-bitcode.c b/test/Driver/embed-bitcode.c
new file mode 100644
index 0000000..da60da3
--- /dev/null
+++ b/test/Driver/embed-bitcode.c
@@ -0,0 +1,43 @@
+// RUN: %clang -ccc-print-bindings -c %s -fembed-bitcode 2>&1 | FileCheck %s
+// CHECK: clang
+// CHECK: clang
+
+// RUN: %clang %s -c -fembed-bitcode -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-CC
+// CHECK-CC: -cc1
+// CHECK-CC: -emit-llvm-bc
+// CHECK-CC: -cc1
+// CHECK-CC: -emit-obj
+// CHECK-CC: -fembed-bitcode=all
+
+// RUN: %clang %s -c -fembed-bitcode=bitcode -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-BITCODE
+// CHECK-BITCODE: -cc1
+// CHECK-BITCODE: -emit-llvm-bc
+// CHECK-BITCODE: -cc1
+// CHECK-BITCODE: -emit-obj
+// CHECK-BITCODE: -fembed-bitcode=bitcode
+//
+// RUN: %clang %s -c -save-temps -fembed-bitcode -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-SAVE-TEMP
+// CHECK-SAVE-TEMP: -cc1
+// CHECK-SAVE-TEMP: -E
+// CHECK-SAVE-TEMP: -cc1
+// CHECK-SAVE-TEMP: -emit-llvm-bc
+// CHECK-SAVE-TEMP: -cc1
+// CHECK-SAVE-TEMP: -S
+// CHECK-SAVE-TEMP: -fembed-bitcode=all
+// CHECK-SAVE-TEMP: -cc1as
+
+// RUN: %clang -c %s -flto -fembed-bitcode 2>&1 -### | FileCheck %s -check-prefix=CHECK-LTO
+// RUN: %clang -c %s -flto=full -fembed-bitcode 2>&1 -### | FileCheck %s -check-prefix=CHECK-LTO
+// RUN: %clang -c %s -flto=thin -fembed-bitcode 2>&1 -### | FileCheck %s -check-prefix=CHECK-LTO
+// CHECK-LTO: -cc1
+// CHECK-LTO: -emit-llvm-bc
+// CHECK-LTO-NOT: warning: argument unused during compilation: '-fembed-bitcode'
+// CHECK-LTO-NOT: -cc1
+// CHECK-LTO-NOT: -fembed-bitcode=all
+
+// RUN: %clang -c %s -fembed-bitcode-marker -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-MARKER
+// CHECK-MARKER: -cc1
+// CHECK-MARKER: -emit-obj
+// CHECK-MARKER: -fembed-bitcode=marker
+// CHECK-MARKER-NOT: -cc1
+
diff --git a/test/Driver/esan.c b/test/Driver/esan.c
new file mode 100644
index 0000000..7951041
--- /dev/null
+++ b/test/Driver/esan.c
@@ -0,0 +1,12 @@
+// RUN: %clang     -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O1 -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O2 -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O3 -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang     -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O1 -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O2 -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O3 -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// Verify that -fsanitize=efficiency-* invokes esan instrumentation.
+
+int foo(int *a) { return *a; }
+// CHECK: __esan_init
diff --git a/test/Driver/frame-pointer-elim.c b/test/Driver/frame-pointer-elim.c
index 5355cbe..139c0cc 100644
--- a/test/Driver/frame-pointer-elim.c
+++ b/test/Driver/frame-pointer-elim.c
@@ -8,6 +8,15 @@
 // RUN:   FileCheck --check-prefix=LINUX %s
 // LINUX-NOT: "-momit-leaf-frame-pointer"
 
+// CloudABI follows the same rules as Linux.
+// RUN: %clang -### -target x86_64-unknown-cloudabi -S -O1 %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=CLOUDABI-OPT %s
+// CLOUDABI-OPT: "-momit-leaf-frame-pointer"
+
+// RUN: %clang -### -target x86_64-unknown-cloudabi -S %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=CLOUDABI %s
+// CLOUDABI-NOT: "-momit-leaf-frame-pointer"
+
 // Darwin disables omitting the leaf frame pointer even under optimization
 // unless the command lines are given.
 // RUN: %clang -### -target i386-apple-darwin -S %s 2>&1 | \
diff --git a/test/Driver/freebsd-mips-as.c b/test/Driver/freebsd-mips-as.c
index 7555888..af02c38 100644
--- a/test/Driver/freebsd-mips-as.c
+++ b/test/Driver/freebsd-mips-as.c
@@ -45,11 +45,6 @@
 // RUN:   | FileCheck -check-prefix=MIPS64-DEF-EL-AS %s
 // MIPS64-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips64r2" "-mabi" "64" "-EL"
 //
-// RUN: %clang -target mips-unknown-freebsd -mabi=eabi -### \
-// RUN:   -no-integrated-as -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-EABI %s
-// MIPS-EABI: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "eabi" "-EB"
-//
 // RUN: %clang -target mips64-unknown-freebsd -mabi=n32 -### \
 // RUN:   -no-integrated-as -c %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-N32 %s
diff --git a/test/Driver/freebsd.c b/test/Driver/freebsd.c
index 45e9204..f008b76 100644
--- a/test/Driver/freebsd.c
+++ b/test/Driver/freebsd.c
@@ -82,6 +82,7 @@
 // RUN: %clang -no-canonical-prefixes -target x86_64-pc-freebsd8 -static %s \
 // RUN:   --sysroot=%S/Inputs/multiarch_freebsd64_tree -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-STATIC %s
+// CHECK-STATIC: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
 // CHECK-STATIC: crt1.o
 // CHECK-STATIC: crtbeginT.o
 
diff --git a/test/Driver/freebsd.cpp b/test/Driver/freebsd.cpp
index dea3267..baf52f7 100644
--- a/test/Driver/freebsd.cpp
+++ b/test/Driver/freebsd.cpp
@@ -1,6 +1,13 @@
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd10.0 2>&1 \
+// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-TEN %s
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd9.2 2>&1 \
+// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd9.2 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NINE %s
-// CHECK-TEN: -lc++
-// CHECK-NINE: -lstdc++
+// CHECK-TEN: "-lc++" "-lm"
+// CHECK-NINE: "-lstdc++" "-lm"
+
+// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-PG-TEN %s
+// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd9.2 -stdlib=platform 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-PG-NINE %s
+// CHECK-PG-TEN: "-lc++_p" "-lm_p"
+// CHECK-PG-NINE: "-lstdc++_p" "-lm_p"
diff --git a/test/Driver/fsanitize-coverage.c b/test/Driver/fsanitize-coverage.c
index fdaa9fa..16c5dfe 100644
--- a/test/Driver/fsanitize-coverage.c
+++ b/test/Driver/fsanitize-coverage.c
@@ -2,41 +2,51 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
 // CHECK-SANITIZE-COVERAGE-0-NOT: fsanitize-coverage-type
+// CHECK-SANITIZE-COVERAGE-0: -fsanitize=address
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// CHECK-SANITIZE-COVERAGE-FUNC: fsanitize-coverage-type=1
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-BB
+// CHECK-SANITIZE-COVERAGE-BB: fsanitize-coverage-type=2
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-EDGE
+// CHECK-SANITIZE-COVERAGE-EDGE: fsanitize-coverage-type=3
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC_INDIR
+// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-type=3
+// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-indirect-calls
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// CHECK-SANITIZE-COVERAGE-1: fsanitize-coverage-type=1
-
+// CHECK-SANITIZE-COVERAGE-1: warning: argument '-fsanitize-coverage=1' is deprecated, use '-fsanitize-coverage=func' instead
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-2
-// CHECK-SANITIZE-COVERAGE-2: fsanitize-coverage-type=2
-
+// CHECK-SANITIZE-COVERAGE-2: warning: argument '-fsanitize-coverage=2' is deprecated, use '-fsanitize-coverage=bb' instead
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-3
-// CHECK-SANITIZE-COVERAGE-3: fsanitize-coverage-type=3
-
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-4
-// CHECK-SANITIZE-COVERAGE-4: fsanitize-coverage-type=3
-// CHECK-SANITIZE-COVERAGE-4: fsanitize-coverage-indirect-calls
-
+// CHECK-SANITIZE-COVERAGE-3: warning: argument '-fsanitize-coverage=3' is deprecated, use '-fsanitize-coverage=edge' instead
+//
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-5
 // CHECK-SANITIZE-COVERAGE-5: error: unsupported argument '5' to option 'fsanitize-coverage='
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread   -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-UNUSED
-// RUN: %clang -target x86_64-linux-gnu                     -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-UNUSED
-// CHECK-SANITIZE-COVERAGE-UNUSED: argument unused during compilation: '-fsanitize-coverage=1'
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread   -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-UNUSED
+// RUN: %clang -target x86_64-linux-gnu                     -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// CHECK-SANITIZE-COVERAGE-UNUSED: argument unused during compilation: '-fsanitize-coverage=func'
+// CHECK-SANITIZE-COVERAGE-UNUSED-NOT: -fsanitize-coverage-type=1
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 -fno-sanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-SAN-DISABLED
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fno-sanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-SAN-DISABLED
 // CHECK-SANITIZE-COVERAGE-SAN-DISABLED-NOT: argument unused
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-bb,trace-cmp,8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-bb,trace-pc,trace-cmp,8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-type=3
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-indirect-calls
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-bb
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-cmp
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-8bit-counters
+// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-pc
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,edge,indirect-calls,trace-bb,trace-cmp -fno-sanitize-coverage=edge,indirect-calls,trace-bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MASK
 // CHECK-MASK: -fsanitize-coverage-type=1
@@ -52,19 +62,27 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-TYPE
 // CHECK-MISSING-TYPE: error: invalid argument '-fsanitize-coverage=8bit-counters' only allowed with '-fsanitize-coverage=(func|bb|edge)'
 
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-type=3
+// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-trace-pc
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_FUNC
+// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-type=1
+// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-trace-pc
+
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
 // CHECK-NO-TYPE-NECESSARY-NOT: error:
 // CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-indirect-calls
 // CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-trace-cmp
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
 // CHECK-EXTEND-LEGACY: -fsanitize-coverage-type=1
 // CHECK-EXTEND-LEGACY: -fsanitize-coverage-trace-cmp
 
-// RUN: %clang_cl --target=i386-pc-win32 -fsanitize=address -fsanitize-coverage=1 -c -### -- %s 2>&1 | FileCheck %s -check-prefix=CLANG-CL-COVERAGE
+// RUN: %clang_cl --target=i386-pc-win32 -fsanitize=address -fsanitize-coverage=func -c -### -- %s 2>&1 | FileCheck %s -check-prefix=CLANG-CL-COVERAGE
 // CLANG-CL-COVERAGE-NOT: error:
 // CLANG-CL-COVERAGE-NOT: warning:
 // CLANG-CL-COVERAGE-NOT: argument unused
 // CLANG-CL-COVERAGE-NOT: unknown argument
-// CLANG-CL-COVERAGE: -fsanitize=address
 // CLANG-CL-COVERAGE: -fsanitize-coverage-type=1
+// CLANG-CL-COVERAGE: -fsanitize=address
diff --git a/test/Driver/fsanitize.c b/test/Driver/fsanitize.c
index 2236931..b0cef81 100644
--- a/test/Driver/fsanitize.c
+++ b/test/Driver/fsanitize.c
@@ -20,10 +20,15 @@
 // RUN: %clang -target i386-pc-win32 -fsanitize=undefined -x c++ %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-WIN --check-prefix=CHECK-UNDEFINED-WIN32 --check-prefix=CHECK-UNDEFINED-WIN-CXX
 // RUN: %clang -target x86_64-pc-win32 -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-WIN --check-prefix=CHECK-UNDEFINED-WIN64
 // RUN: %clang -target x86_64-pc-win32 -fsanitize=undefined -x c++ %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-WIN --check-prefix=CHECK-UNDEFINED-WIN64 --check-prefix=CHECK-UNDEFINED-WIN-CXX
-// CHECK-UNDEFINED-WIN: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|float-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|object-size|float-cast-overflow|array-bounds|enum|bool|returns-nonnull-attribute|nonnull-attribute),?){17}"}}
-// CHECK-UNDEFINED-WIN32-SAME: "--dependent-lib={{[^"]*}}ubsan_standalone-i386.lib"
-// CHECK-UNDEFINED-WIN64-SAME: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib"
-// CHECK-UNDEFINED-WIN-CXX-SAME: "--dependent-lib={{[^"]*}}ubsan_standalone_cxx{{[^"]*}}.lib"
+// CHECK-UNDEFINED-WIN32: "--dependent-lib={{[^"]*}}ubsan_standalone-i386.lib"
+// CHECK-UNDEFINED-WIN64: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib"
+// CHECK-UNDEFINED-WIN-CXX: "--dependent-lib={{[^"]*}}ubsan_standalone_cxx{{[^"]*}}.lib"
+// CHECK-UNDEFINED-WIN-SAME: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|float-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|object-size|float-cast-overflow|array-bounds|enum|bool|returns-nonnull-attribute|nonnull-attribute),?){17}"}}
+
+// RUN: %clang -target i386-pc-win32 -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COVERAGE-WIN32
+// CHECK-COVERAGE-WIN32: "--dependent-lib={{[^"]*}}ubsan_standalone-i386.lib"
+// RUN: %clang -target x86_64-pc-win32 -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COVERAGE-WIN64
+// CHECK-COVERAGE-WIN64: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib"
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=integer %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INTEGER
 // CHECK-INTEGER: "-fsanitize={{((signed-integer-overflow|unsigned-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent),?){5}"}}
@@ -83,6 +88,35 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-address,leak -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANKA-SANL
 // CHECK-SANKA-SANL: '-fsanitize=kernel-address' not allowed with '-fsanitize=leak'
 
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANA
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANA
+// CHECK-SANE-SANA: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=address'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,leak -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANL
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,leak -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANL
+// CHECK-SANE-SANL: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=leak'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,thread -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANT
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,thread -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANT
+// CHECK-SANE-SANT: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=thread'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,memory -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANM
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,memory -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANM
+// CHECK-SANE-SANM: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=memory'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,kernel-address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANKA
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,kernel-address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANKA
+// CHECK-SANE-SANKA: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=kernel-address'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-address-use-after-scope %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-USE-AFTER-SCOPE
+// CHECK-ONLY-USE-AFTER-SCOPE: '-fsanitize-address-use-after-scope' only allowed with '-fsanitize=address'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-address-use-after-scope %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-SCOPE
+// CHECK-USE-AFTER-SCOPE: -cc1{{.*}}-fsanitize-address-use-after-scope
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-NO-USE-AFTER-SCOPE
+// CHECK-ASAN-NO-USE-AFTER-SCOPE-NOT: -cc1{{.*}}-fsanitize-address-use-after-scope
+
 // RUN: %clang -target x86_64-linux-gnu -fsanitize-memory-track-origins -pie %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-TRACK-ORIGINS
 // CHECK-ONLY-TRACK-ORIGINS: warning: argument unused during compilation: '-fsanitize-memory-track-origins'
 
@@ -154,7 +188,7 @@
 // CHECK-NO-PIE: "-mrelocation-model" "static"
 // CHECK-NO-PIE-NOT: "-pie"
 
-// CHECK-PIE: "-mrelocation-model" "pic" "-pic-level" "2" "-pie-level" "2"
+// CHECK-PIE: "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie"
 // CHECK-PIE: "-pie"
 
 // RUN: %clang -target arm-linux-androideabi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ANDROID-NO-ASAN
@@ -181,8 +215,8 @@
 // CHECK-DIAG-RECOVER: unsupported argument 'unreachable' to option 'fsanitize-recover='
 
 // RUN: %clang -target x86_64-linux-gnu %s -fsanitize=undefined -fsanitize-recover -fno-sanitize-recover -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEPRECATED-RECOVER
-// CHECK-DEPRECATED-RECOVER: argument '-fsanitize-recover' is deprecated, use '-fsanitize-recover=undefined,integer' instead
-// CHECK-DEPRECATED-RECOVER: argument '-fno-sanitize-recover' is deprecated, use '-fno-sanitize-recover=undefined,integer' instead
+// CHECK-DEPRECATED-RECOVER: argument '-fsanitize-recover' is deprecated, use '-fsanitize-recover=undefined,integer' or '-fsanitize-recover=all' instead
+// CHECK-DEPRECATED-RECOVER: argument '-fno-sanitize-recover' is deprecated, use '-fno-sanitize-recover=undefined,integer' or '-fno-sanitize-recover=all' instead
 // CHECK-DEPRECATED-RECOVER-NOT: is deprecated
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=leak %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANL
@@ -251,26 +285,60 @@
 // CHECK-VPTR-DARWIN-NEW: -fsanitize=alignment,vptr
 
 // RUN: %clang -target armv7-apple-ios7 -miphoneos-version-min=7.0 -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-IOS
-// CHECK-ASAN-IOS: unsupported option '-fsanitize=address' for target 'arm-apple-ios7'
+// CHECK-ASAN-IOS: -fsanitize=address
 
 // RUN: %clang -target i386-pc-openbsd -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-OPENBSD
 // CHECK-ASAN-OPENBSD: unsupported option '-fsanitize=address' for target 'i386-pc-openbsd'
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
-// RUN: %clang -target x86_64-apple-darwin10 -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi-derived-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-DCAST
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi-unrelated-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-UCAST
-// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-nvcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NVCALL
-// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-vcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-VCALL
+// RUN: %clang -target i686-linux-gnu -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-X86
+// RUN: %clang -target i686-linux-gnu -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-X86
+// CHECK-ESAN-X86: error: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i686--linux-gnu'
+
+// RUN: %clang -target x86_64-apple-darwin10 -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-DARWIN
+// RUN: %clang -target x86_64-apple-darwin10 -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-DARWIN
+// CHECK-ESAN-DARWIN: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'x86_64-apple-darwin10'
+
+// RUN: %clang -target i386-apple-darwin -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-DARWIN
+// RUN: %clang -target i386-apple-darwin -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-DARWIN
+// CHECK-ESAN-I386-DARWIN: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i386-apple-darwin'
+
+// RUN: %clang -target arm-apple-ios -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-ARM-IOS
+// RUN: %clang -target arm-apple-ios -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-ARM-IOS
+// CHECK-ESAN-ARM-IOS: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'arm-apple-ios'
+
+// RUN: %clang -target i386-apple-iossimulator -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-IOSSIMULATOR
+// RUN: %clang -target i386-apple-iossimulator -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-IOSSIMULATOR
+// CHECK-ESAN-I386-IOSSIMULATOR: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i386-apple-iossimulator'
+
+// RUN: %clang -target i386-apple-tvossimulator -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-TVOSSIMULATOR
+// RUN: %clang -target i386-apple-tvossimulator -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-TVOSSIMULATOR
+// CHECK-ESAN-I386-TVOSSIMULATOR: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i386-apple-tvossimulator'
+
+
+
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
+// RUN: %clang -target x86_64-apple-darwin10 -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -fsanitize=cfi-derived-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-DCAST
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -fsanitize=cfi-unrelated-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-UCAST
+// RUN: %clang -target x86_64-linux-gnu -flto -fvisibility=hidden -fsanitize=cfi-nvcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NVCALL
+// RUN: %clang -target x86_64-linux-gnu -flto -fvisibility=hidden -fsanitize=cfi-vcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-VCALL
 // CHECK-CFI: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast,cfi-icall,cfi-unrelated-cast,cfi-nvcall,cfi-vcall
 // CHECK-CFI-DCAST: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast
 // CHECK-CFI-UCAST: -emit-llvm-bc{{.*}}-fsanitize=cfi-unrelated-cast
 // CHECK-CFI-NVCALL: -emit-llvm-bc{{.*}}-fsanitize=cfi-nvcall
 // CHECK-CFI-VCALL: -emit-llvm-bc{{.*}}-fsanitize=cfi-vcall
 
-// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-derived-cast -fno-lto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOLTO
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -flto -fsanitize=cfi-derived-cast -fno-lto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOLTO
 // CHECK-CFI-NOLTO: '-fsanitize=cfi-derived-cast' only allowed with '-flto'
 
+// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-derived-cast -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOVIS
+// CHECK-CFI-NOVIS: '-fsanitize=cfi-derived-cast' only allowed with '-fvisibility='
+
+// RUN: %clang -target x86_64-pc-win32 -flto -fsanitize=cfi-derived-cast -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOVIS-NOERROR
+// RUN: echo > %t.o
+// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-derived-cast %t.o -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOVIS-NOERROR
+// CHECK-CFI-NOVIS-NOERROR-NOT: only allowed with
+
 // RUN: %clang -target mips-unknown-linux -fsanitize=cfi-icall %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-ICALL-MIPS
 // CHECK-CFI-ICALL-MIPS: unsupported option '-fsanitize=cfi-icall' for target 'mips-unknown-linux'
 
@@ -317,21 +385,30 @@
 // RUN: %clang -fno-sanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NOSP
 // NOSP-NOT: "-fsanitize=safe-stack"
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address,safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP-ASAN
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address,safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP-ASAN
 // RUN: %clang -target x86_64-linux-gnu -fstack-protector -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -fstack-protector-all -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// RUN: %clang -target arm-linux-androideabi -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// RUN: %clang -target aarch64-linux-android -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// SP-NOT: stack-protector
+// RUN: %clang -target arm-linux-androideabi -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP
+// RUN: %clang -target aarch64-linux-android -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP
+// NO-SP-NOT: stack-protector
+// NO-SP: "-fsanitize=safe-stack"
 // SP: "-fsanitize=safe-stack"
-// SP-ASAN-NOT: stack-protector
-// SP-ASAN: "-fsanitize=address,safe-stack"
+// SP: -stack-protector
+// NO-SP-NOT: stack-protector
+
+// NO-SP-ASAN-NOT: stack-protector
+// NO-SP-ASAN: "-fsanitize=address,safe-stack"
+// NO-SP-ASAN-NOT: stack-protector
 
 // RUN: %clang -target powerpc64-unknown-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-SANM
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-SANM
 // CHECK-SANM: "-fsanitize=memory"
 
+// RUN: %clang -target aarch64-unknown-cloudabi -fsanitize=safe-stack %s -### 2>&1 | FileCheck %s -check-prefix=SAFESTACK-CLOUDABI
+// RUN: %clang -target x86_64-unknown-cloudabi -fsanitize=safe-stack %s -### 2>&1 | FileCheck %s -check-prefix=SAFESTACK-CLOUDABI
+// SAFESTACK-CLOUDABI: "-fsanitize=safe-stack"
+
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=function -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FSAN-UBSAN-PS4
 // CHECK-FSAN-UBSAN-PS4: unsupported option '-fsanitize=function' for target 'x86_64-scei-ps4'
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FSAN-PS4
@@ -344,6 +421,9 @@
 // CHECK-MSAN-PS4: unsupported option '-fsanitize=memory' for target 'x86_64-scei-ps4'
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-PS4
 // CHECK-TSAN-PS4: unsupported option '-fsanitize=thread' for target 'x86_64-scei-ps4'
+// RUN: %clang -target x86_64-scei-ps4 -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-PS4
+// RUN: %clang -target x86_64-scei-ps4 -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-PS4
+// CHECK-ESAN-PS4: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'x86_64-scei-ps4'
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-PS4
 // Make sure there are no *.{o,bc} or -l passed before the ASan library.
 // CHECK-ASAN-PS4-NOT: {{(\.(o|bc)"? |-l).*-lSceDbgAddressSanitizer_stub_weak}}
diff --git a/test/Driver/fsjlj-exceptions.c b/test/Driver/fsjlj-exceptions.c
new file mode 100644
index 0000000..f44d5b3
--- /dev/null
+++ b/test/Driver/fsjlj-exceptions.c
@@ -0,0 +1,8 @@
+// RUN: %clang -target armv7-apple-ios -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-IOS %s
+// RUN: %clang -target i686-windows-gnu -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-DEFAULT %s
+// RUN: %clang -target i686-windows-gnu -fexceptions -fsjlj-exceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-SJLJ %s
+
+// CHECK-IOS: -fsjlj-exceptions
+// CHECK-MINGW-DEFAULT-NOT: -fsjlj-exceptions
+// CHECK-MINGW-SJLJ: -fsjlj-exceptions
+
diff --git a/test/Driver/fubsan-strip-path-components.cpp b/test/Driver/fubsan-strip-path-components.cpp
new file mode 100644
index 0000000..1300241
--- /dev/null
+++ b/test/Driver/fubsan-strip-path-components.cpp
@@ -0,0 +1,2 @@
+// RUN: %clang %s -### -o %t.o -fsanitize-undefined-strip-path-components=42 2>&1 | FileCheck %s
+// CHECK: "-fsanitize-undefined-strip-path-components=42"
diff --git a/test/Driver/gcc-toolchain.cpp b/test/Driver/gcc-toolchain.cpp
index aa0e078..ca96757 100644
--- a/test/Driver/gcc-toolchain.cpp
+++ b/test/Driver/gcc-toolchain.cpp
@@ -1,13 +1,13 @@
 // Test that gcc-toolchain option is working correctly
 //
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-// RUN:     --target=i386-unknown-linux \
+// RUN:     --target=i386-unknown-linux -stdlib=libstdc++ \
 // RUN:     --gcc-toolchain=%S/Inputs/ubuntu_11.04_multiarch_tree/usr \
 // RUN:   | FileCheck %s
 //
 // Additionally check that the legacy spelling of the flag works.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-// RUN:     --target=i386-unknown-linux \
+// RUN:     --target=i386-unknown-linux -stdlib=libstdc++ \
 // RUN:     -gcc-toolchain %S/Inputs/ubuntu_11.04_multiarch_tree/usr \
 // RUN:   | FileCheck %s
 //
diff --git a/test/Driver/hexagon-toolchain-elf.c b/test/Driver/hexagon-toolchain-elf.c
index e3a54dd..827c191 100644
--- a/test/Driver/hexagon-toolchain-elf.c
+++ b/test/Driver/hexagon-toolchain-elf.c
@@ -41,7 +41,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK012 %s
 // CHECK012: "-cc1"
-// CHECK012-DAG-NOT: "-internal-isystem"
+// CHECK012-NOT: "-internal-isystem"
 // CHECK012-DAG: "-internal-externc-isystem" "{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/include"
 
 // RUN: %clangxx -### -target hexagon-unknown-elf -fno-integrated-as    \
@@ -51,8 +51,8 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK013 %s
 // CHECK013: "-cc1"
-// CHECK013-DAG-NOT: "-internal-isystem"
-// CHECK013-DAG-NOT: "-internal-externc-isystem"
+// CHECK013-NOT: "-internal-isystem"
+// CHECK013-NOT: "-internal-externc-isystem"
 
 // -----------------------------------------------------------------------------
 // Test -mcpu=<cpuname> -mv<number>
@@ -63,7 +63,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK020 %s
 // CHECK020: "-cc1" {{.*}} "-target-cpu" "hexagonv4"
-// CHECK020: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v4/crt0
+// CHECK020: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v4/crt0
 
 // RUN: %clang -### -target hexagon-unknown-elf \
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
@@ -71,7 +71,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK021 %s
 // CHECK021: "-cc1" {{.*}} "-target-cpu" "hexagonv5"
-// CHECK021: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v5/crt0
+// CHECK021: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v5/crt0
 
 // RUN: %clang -### -target hexagon-unknown-elf \
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
@@ -79,7 +79,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK022 %s
 // CHECK022: "-cc1" {{.*}} "-target-cpu" "hexagonv55"
-// CHECK022: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v55/crt0
+// CHECK022: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v55/crt0
 
 // RUN: %clang -### -target hexagon-unknown-elf \
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
@@ -87,7 +87,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK023 %s
 // CHECK023: "-cc1" {{.*}} "-target-cpu" "hexagonv60"
-// CHECK023: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v60/crt0
+// CHECK023: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v60/crt0
 
 // -----------------------------------------------------------------------------
 // Test Linker related args
diff --git a/test/Driver/lanai-toolchain.c b/test/Driver/lanai-toolchain.c
new file mode 100644
index 0000000..5523666
--- /dev/null
+++ b/test/Driver/lanai-toolchain.c
@@ -0,0 +1,3 @@
+// Check that the driver accepts the Lanai triple and reports it as the target.
+// RUN: %clang -target lanai-unknown-unknown -v 2>&1 | FileCheck %s
+// CHECK: Target: lanai-unknown-unknown
diff --git a/test/Driver/lanai-unknown-unknown.cpp b/test/Driver/lanai-unknown-unknown.cpp
new file mode 100644
index 0000000..5ce0adf
--- /dev/null
+++ b/test/Driver/lanai-unknown-unknown.cpp
@@ -0,0 +1,86 @@
+// RUN: %clang -target lanai-unknown-unknown -### %s -emit-llvm-only -c 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=ECHO
+// RUN: %clang -target lanai-unknown-unknown %s -emit-llvm -S -o - \
+// RUN:   | FileCheck %s
+
+// ECHO: {{.*}} "-cc1" {{.*}}lanai-unknown-unknown.cpp
+
+typedef __builtin_va_list va_list;
+typedef __SIZE_TYPE__ size_t;
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+extern "C" {
+
+// CHECK: @align_c = global i32 1
+int align_c = __alignof(char);
+
+// CHECK: @align_s = global i32 2
+int align_s = __alignof(short);
+
+// CHECK: @align_i = global i32 4
+int align_i = __alignof(int);
+
+// CHECK: @align_l = global i32 4
+int align_l = __alignof(long);
+
+// CHECK: @align_ll = global i32 8
+int align_ll = __alignof(long long);
+
+// CHECK: @align_p = global i32 4
+int align_p = __alignof(void*);
+
+// CHECK: @align_vl = global i32 4
+int align_vl = __alignof(va_list);
+
+// Check types
+
+// CHECK: signext i8 @check_char()
+char check_char() { return 0; }
+
+// CHECK: signext i16 @check_short()
+short check_short() { return 0; }
+
+// CHECK: i32 @check_int()
+int check_int() { return 0; }
+
+// CHECK: i32 @check_long()
+long check_long() { return 0; }
+
+// CHECK: i64 @check_longlong()
+long long check_longlong() { return 0; }
+
+// CHECK: zeroext i8 @check_uchar()
+unsigned char check_uchar() { return 0; }
+
+// CHECK: zeroext i16 @check_ushort()
+unsigned short check_ushort() { return 0; }
+
+// CHECK: i32 @check_uint()
+unsigned int check_uint() { return 0; }
+
+// CHECK: i32 @check_ulong()
+unsigned long check_ulong() { return 0; }
+
+// CHECK: i64 @check_ulonglong()
+unsigned long long check_ulonglong() { return 0; }
+
+// CHECK: i32 @check_size_t()
+size_t check_size_t() { return 0; }
+
+}
+
+template<int> void Switch();
+template<> void Switch<4>();
+template<> void Switch<8>();
+template<> void Switch<16>();
+
+void check_pointer_size() {
+  // CHECK: SwitchILi4
+  Switch<sizeof(void*)>();
+
+  // CHECK: SwitchILi8
+  Switch<sizeof(long long)>();
+
+  // CHECK: SwitchILi4
+  Switch<sizeof(va_list)>();
+}
diff --git a/test/Driver/linux-header-search.cpp b/test/Driver/linux-header-search.cpp
index bd1da49..5f6ac50 100644
--- a/test/Driver/linux-header-search.cpp
+++ b/test/Driver/linux-header-search.cpp
@@ -64,7 +64,7 @@
 //
 // Test a very broken version of multiarch that shipped in Ubuntu 11.04.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target i386-unknown-linux \
+// RUN:     -target i386-unknown-linux -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s
@@ -80,7 +80,7 @@
 // CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-13-04 %s
@@ -97,7 +97,7 @@
 // CHECK-UBUNTU-13-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnux32 \
+// RUN:     -target x86_64-unknown-linux-gnux32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04 %s
@@ -114,7 +114,7 @@
 // CHECK-UBUNTU-14-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 ///
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target arm-linux-gnueabihf \
+// RUN:     -target arm-linux-gnueabihf -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-13-04-CROSS %s
@@ -131,7 +131,7 @@
 //
 // Test Ubuntu/Debian's new version of multiarch, with -m32.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu -m32 \
+// RUN:     -target x86_64-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-13-04-M32 %s
@@ -145,7 +145,7 @@
 // Test Ubuntu/Debian's Ubuntu 14.04 config variant, with -m32
 // and an empty 4.9 directory.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu -m32 \
+// RUN:     -target x86_64-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04-M32 %s
@@ -160,7 +160,7 @@
 // installed rather than relying on multilib. Also happens to look like an
 // actual i686 Ubuntu system.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu -m32 \
+// RUN:     -target x86_64-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree2 \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04-I686 %s
@@ -173,7 +173,7 @@
 //
 // Test Ubuntu/Debian's Ubuntu 14.04 for powerpc64le
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target powerpc64le-unknown-linux-gnu -m32 \
+// RUN:     -target powerpc64le-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04-PPC64LE %s
@@ -189,7 +189,7 @@
 //
 // Thoroughly exercise the Debian multiarch environment.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target i686-linux-gnu \
+// RUN:     -target i686-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-X86 %s
@@ -205,7 +205,7 @@
 // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-linux-gnu \
+// RUN:     -target x86_64-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-X86-64 %s
@@ -221,7 +221,7 @@
 // CHECK-DEBIAN-X86-64: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-DEBIAN-X86-64: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target powerpc-linux-gnu \
+// RUN:     -target powerpc-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-PPC %s
@@ -237,7 +237,7 @@
 // CHECK-DEBIAN-PPC: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-DEBIAN-PPC: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target powerpc64-linux-gnu \
+// RUN:     -target powerpc64-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-PPC64 %s
@@ -256,7 +256,7 @@
 // Test Gentoo's weirdness both before and after they changed it in their GCC
 // 4.6.4 release.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_4.6.2_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-GENTOO-4-6-2 %s
@@ -271,7 +271,7 @@
 // CHECK-GENTOO-4-6-2: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-GENTOO-4-6-2: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_4.6.4_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-GENTOO-4-6-4 %s
@@ -285,10 +285,25 @@
 // CHECK-GENTOO-4-6-4: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include"
 // CHECK-GENTOO-4-6-4: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-GENTOO-4-6-4: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
+// RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_4.9.3_tree \
+// RUN:     --gcc-toolchain="" \
+// RUN:   | FileCheck --check-prefix=CHECK-GENTOO-4-9-3 %s
+// CHECK-GENTOO-4-9-3: "{{.*}}clang{{.*}}" "-cc1"
+// CHECK-GENTOO-4-9-3: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-GENTOO-4-9-3: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/x86_64-pc-linux-gnu"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/backward"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include"
+// CHECK-GENTOO-4-9-3: "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-GENTOO-4-9-3: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 //
 // Check header search on Debian 6 / MIPS64
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target mips64-unknown-linux-gnuabi64 \
+// RUN:     -target mips64-unknown-linux-gnuabi64 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64-GNUABI %s
@@ -306,7 +321,7 @@
 //
 // Check header search on Debian 6 / MIPS64
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target mips64el-unknown-linux-gnuabi64 \
+// RUN:     -target mips64el-unknown-linux-gnuabi64 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-GNUABI %s
@@ -324,7 +339,7 @@
 
 // Check header search on Debian 8 / Sparc
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target sparc-unknown-linux-gnu \
+// RUN:     -target sparc-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_8_sparc_multilib_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-SPARC32 %s
@@ -342,7 +357,7 @@
 
 // Check header search on Debian 8 / Sparc, with the oldstyle multilib packages
 // RUN: %clang -no-canonical-prefixes -m64 %s -### -fsyntax-only 2>&1 \
-// RUN:     -target sparc-unknown-linux-gnu \
+// RUN:     -target sparc-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_8_sparc_multilib_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-SPARC32-LIB64 %s
@@ -363,7 +378,7 @@
 
 // Check header search on Debian 8 / Sparc64
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target sparc64-unknown-linux-gnu \
+// RUN:     -target sparc64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_8_sparc64_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-SPARC64 %s
diff --git a/test/Driver/linux-ld.c b/test/Driver/linux-ld.c
index c15e24d..60b30dd 100644
--- a/test/Driver/linux-ld.c
+++ b/test/Driver/linux-ld.c
@@ -16,7 +16,7 @@
 // CHECK-LD-32: "-L[[SYSROOT]]/usr/lib"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-unknown-linux \
+// RUN:     --target=x86_64-unknown-linux -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-64 %s
@@ -36,7 +36,7 @@
 // CHECK-LD-64: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-unknown-linux-gnux32 \
+// RUN:     --target=x86_64-unknown-linux-gnux32 -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-X32 %s
@@ -86,7 +86,7 @@
 // CHECK-LD-RT-ANDROID: libclang_rt.builtins-arm-android.a"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-unknown-linux \
+// RUN:     --target=x86_64-unknown-linux -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     --rtlib=libgcc \
@@ -107,7 +107,7 @@
 // CHECK-LD-GCC: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-unknown-linux \
+// RUN:     --target=x86_64-unknown-linux -rtlib=platform \
 // RUN:     -static-libgcc \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
@@ -128,7 +128,7 @@
 // CHECK-LD-64-STATIC-LIBGCC: "-lgcc" "-lgcc_eh"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-unknown-linux \
+// RUN:     --target=x86_64-unknown-linux -rtlib=platform \
 // RUN:     -static \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
@@ -149,7 +149,7 @@
 //
 // Check that flags can be combined. The -static dominates.
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-unknown-linux \
+// RUN:     --target=x86_64-unknown-linux -rtlib=platform \
 // RUN:     -static-libgcc -static \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
@@ -388,6 +388,15 @@
 // CHECK-GCC-VERSION4: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-GCC-VERSION4: "{{.*}}/Inputs/gcc_version_parsing4/bin/../lib/gcc/i386-unknown-linux/4.7.99{{/|\\\\}}crtbegin.o"
 // CHECK-GCC-VERSION4: "-L{{.*}}/Inputs/gcc_version_parsing4/bin/../lib/gcc/i386-unknown-linux/4.7.99"
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=i386-unknown-linux -m32 \
+// RUN:     -ccc-install-dir %S/Inputs/gcc_version_parsing5/bin \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-GCC-VERSION5 %s
+// CHECK-GCC-VERSION5: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-GCC-VERSION5: "{{.*}}/Inputs/gcc_version_parsing5/bin/../lib/gcc/i386-unknown-linux/5{{/|\\\\}}crtbegin.o"
+// CHECK-GCC-VERSION5: "-L{{.*}}/Inputs/gcc_version_parsing5/bin/../lib/gcc/i386-unknown-linux/5"
 //
 // Test a simulated installation of libc++ on Linux, both through sysroot and
 // the installation path of Clang.
@@ -474,7 +483,7 @@
 // RUN:     --sysroot=%S/Inputs/x86-64_ubuntu_13.10 \
 // RUN:   | FileCheck --check-prefix=CHECK-X86-64-UBUNTU-13-10-ARM-HF %s
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-UBUNTU-13-10-ARM-HF: "-dynamic-linker" "/lib/ld-linux-armhf.so.3"
+// CHECK-X86-64-UBUNTU-13-10-ARM-HF: "-dynamic-linker" "{{(/usr/arm--linux-gnueabihf)?}}/lib/ld-linux-armhf.so.3"
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabihf/4.8/../../../../arm-linux-gnueabihf/lib/../lib{{/|\\\\}}crt1.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabihf/4.8/../../../../arm-linux-gnueabihf/lib/../lib{{/|\\\\}}crti.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabihf/4.8{{/|\\\\}}crtbegin.o"
@@ -493,7 +502,7 @@
 // RUN:     --sysroot=%S/Inputs/x86-64_ubuntu_13.10 \
 // RUN:   | FileCheck --check-prefix=CHECK-X86-64-UBUNTU-13-10-ARM %s
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-UBUNTU-13-10-ARM: "-dynamic-linker" "/lib/ld-linux.so.3"
+// CHECK-X86-64-UBUNTU-13-10-ARM: "-dynamic-linker" "{{(/usr/arm--linux-gnueabi)?}}/lib/ld-linux.so.3"
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabi/4.7/../../../../arm-linux-gnueabi/lib/../lib{{/|\\\\}}crt1.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabi/4.7/../../../../arm-linux-gnueabi/lib/../lib{{/|\\\\}}crti.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabi/4.7{{/|\\\\}}crtbegin.o"
@@ -636,7 +645,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64 %s
 // CHECK-PPC64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64: "-m" "elf64ppc"
-// CHECK-PPC64: "-dynamic-linker" "{{.*}}/lib64/ld64.so.1"
+// CHECK-PPC64: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.1"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64-linux-gnu -mabi=elfv1 \
@@ -646,35 +655,35 @@
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64-ELFv1 %s
 // CHECK-PPC64-ELFv1: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64-ELFv1: "-m" "elf64ppc"
-// CHECK-PPC64-ELFv1: "-dynamic-linker" "{{.*}}/lib64/ld64.so.1"
+// CHECK-PPC64-ELFv1: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.1"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64-linux-gnu -mabi=elfv2 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64-ELFv2 %s
 // CHECK-PPC64-ELFv2: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64-ELFv2: "-m" "elf64ppc"
-// CHECK-PPC64-ELFv2: "-dynamic-linker" "{{.*}}/lib64/ld64.so.2"
+// CHECK-PPC64-ELFv2: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64le-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LE %s
 // CHECK-PPC64LE: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64LE: "-m" "elf64lppc"
-// CHECK-PPC64LE: "-dynamic-linker" "{{.*}}/lib64/ld64.so.2"
+// CHECK-PPC64LE: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64le-linux-gnu -mabi=elfv1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LE-ELFv1 %s
 // CHECK-PPC64LE-ELFv1: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64LE-ELFv1: "-m" "elf64lppc"
-// CHECK-PPC64LE-ELFv1: "-dynamic-linker" "{{.*}}/lib64/ld64.so.1"
+// CHECK-PPC64LE-ELFv1: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.1"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64le-linux-gnu -mabi=elfv2 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LE-ELFv2 %s
 // CHECK-PPC64LE-ELFv2: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64LE-ELFv2: "-m" "elf64lppc"
-// CHECK-PPC64LE-ELFv2: "-dynamic-linker" "{{.*}}/lib64/ld64.so.2"
+// CHECK-PPC64LE-ELFv2: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.2"
 //
 // Check that we do not pass --hash-style=gnu and --hash-style=both to linker
 // and provide correct path to the dynamic linker and emulation mode when build
@@ -714,7 +723,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64 %s
 // CHECK-MIPS64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64: "-m" "elf64btsmip"
-// CHECK-MIPS64: "-dynamic-linker" "{{.*}}/lib64/ld.so.1"
+// CHECK-MIPS64: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld.so.1"
 // CHECK-MIPS64-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
@@ -722,21 +731,21 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL %s
 // CHECK-MIPS64EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL: "-m" "elf64ltsmip"
-// CHECK-MIPS64EL: "-dynamic-linker" "{{.*}}/lib64/ld.so.1"
+// CHECK-MIPS64EL: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld.so.1"
 // CHECK-MIPS64EL-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-gnu -mnan=2008 \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-NAN2008 %s
 // CHECK-MIPS64EL-NAN2008: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL-NAN2008: "-m" "elf64ltsmip"
-// CHECK-MIPS64EL-NAN2008: "-dynamic-linker" "{{.*}}/lib64/ld-linux-mipsn8.so.1"
+// CHECK-MIPS64EL-NAN2008: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld-linux-mipsn8.so.1"
 // CHECK-MIPS64EL-NAN2008-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-gnu -mcpu=mips64r6 \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64R6EL %s
 // CHECK-MIPS64R6EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64R6EL: "-m" "elf64ltsmip"
-// CHECK-MIPS64R6EL: "-dynamic-linker" "{{.*}}/lib64/ld-linux-mipsn8.so.1"
+// CHECK-MIPS64R6EL: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld-linux-mipsn8.so.1"
 // CHECK-MIPS64R6EL-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
@@ -744,7 +753,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64-N32 %s
 // CHECK-MIPS64-N32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64-N32: "-m" "elf32btsmipn32"
-// CHECK-MIPS64-N32: "-dynamic-linker" "{{.*}}/lib32/ld.so.1"
+// CHECK-MIPS64-N32: "-dynamic-linker" "{{.*}}/lib{{(32)?}}/ld.so.1"
 // CHECK-MIPS64-N32-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
@@ -752,36 +761,44 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-N32 %s
 // CHECK-MIPS64EL-N32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL-N32: "-m" "elf32ltsmipn32"
-// CHECK-MIPS64EL-N32: "-dynamic-linker" "{{.*}}/lib32/ld.so.1"
+// CHECK-MIPS64EL-N32: "-dynamic-linker" "{{.*}}/lib{{(32)?}}/ld.so.1"
 // CHECK-MIPS64EL-N32-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-gnu -mabi=n32 \
 // RUN:   -mnan=2008 | FileCheck --check-prefix=CHECK-MIPS64EL-N32-NAN2008 %s
 // CHECK-MIPS64EL-N32-NAN2008: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL-N32-NAN2008: "-m" "elf32ltsmipn32"
-// CHECK-MIPS64EL-N32-NAN2008: "-dynamic-linker" "{{.*}}/lib32/ld-linux-mipsn8.so.1"
+// CHECK-MIPS64EL-N32-NAN2008: "-dynamic-linker" "{{.*}}/lib{{(32)?}}/ld-linux-mipsn8.so.1"
 // CHECK-MIPS64EL-N32-NAN2008-NOT: "--hash-style={{gnu|both}}"
 //
+// RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-redhat-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-REDHAT %s
+// CHECK-MIPS64EL-REDHAT: "{{.*}}ld{{(.exe)?}}"
+// CHECK-MIPS64EL-REDHAT: "-m" "elf64ltsmip"
+// CHECK-MIPS64EL-REDHAT: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld.so.1"
+// CHECK-MIPS64EL-REDHAT-NOT: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld-musl-mipsel.so.1"
+// CHECK-MIPS64EL-REDHAT-NOT: "--hash-style={{gnu|both}}"
+//
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparc-unknown-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARCV8 %s
 // CHECK-SPARCV8: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SPARCV8: "-m" "elf32_sparc"
-// CHECK-SPARCV8: "-dynamic-linker" "/lib/ld-linux.so.2"
+// CHECK-SPARCV8: "-dynamic-linker" "{{(/usr/sparc-unknown-linux-gnu)?}}/lib/ld-linux.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparcel-unknown-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARCV8EL %s
 // CHECK-SPARCV8EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SPARCV8EL: "-m" "elf32_sparc"
-// CHECK-SPARCV8EL: "-dynamic-linker" "/lib/ld-linux.so.2"
+// CHECK-SPARCV8EL: "-dynamic-linker" "{{(/usr/sparcel-unknown-linux-gnu)?}}/lib/ld-linux.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparcv9-unknown-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARCV9 %s
 // CHECK-SPARCV9: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SPARCV9: "-m" "elf64_sparc"
-// CHECK-SPARCV9: "-dynamic-linker" "/lib64/ld-linux.so.2"
+// CHECK-SPARCV9: "-dynamic-linker" "{{(/usr/sparcv9-unknown-linux-gnu)?}}/lib{{(64)?}}/ld-linux.so.2"
 //
 // Thoroughly exercise the Debian multiarch environment.
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
@@ -987,42 +1004,42 @@
 //
 // Test linker invocation on Android.
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-androideabi \
+// RUN:     --target=arm-linux-androideabi -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-android \
+// RUN:     --target=arm-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=aarch64-linux-android \
+// RUN:     --target=aarch64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm64-linux-android \
+// RUN:     --target=arm64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-android \
+// RUN:     --target=mipsel-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-android \
+// RUN:     --target=mips64el-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=i686-linux-android \
+// RUN:     --target=i686-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-linux-android \
+// RUN:     --target=x86_64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID %s
@@ -1035,48 +1052,48 @@
 // CHECK-ANDROID-NOT: "gcc_s"
 // CHECK-ANDROID: "{{.*}}{{/|\\\\}}crtend_android.o"
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-androideabi \
+// RUN:     --target=arm-linux-androideabi -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-android \
+// RUN:     --target=arm-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=aarch64-linux-android \
+// RUN:     --target=aarch64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm64-linux-android \
+// RUN:     --target=arm64-linux-android -rtlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-android \
+// RUN:     --target=mipsel-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-android \
+// RUN:     --target=mips64el-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=i686-linux-android \
+// RUN:     --target=i686-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-SO %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-linux-android \
+// RUN:     --target=x86_64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared \
@@ -1091,47 +1108,47 @@
 // CHECK-ANDROID-SO-NOT: "gcc_s"
 // CHECK-ANDROID-SO: "{{.*}}{{/|\\\\}}crtend_so.o"
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-androideabi \
+// RUN:     --target=arm-linux-androideabi -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-android \
+// RUN:     --target=arm-linux-android -rtlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=aarch64-linux-android \
+// RUN:     --target=aarch64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm64-linux-android \
+// RUN:     --target=arm64-linux-android -rtlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-android \
+// RUN:     --target=mipsel-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-android \
+// RUN:     --target=mips64el-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=i686-linux-android \
+// RUN:     --target=i686-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-linux-android \
+// RUN:     --target=x86_64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static \
@@ -1145,49 +1162,49 @@
 // CHECK-ANDROID-STATIC-NOT: "gcc_s"
 // CHECK-ANDROID-STATIC: "{{.*}}{{/|\\\\}}crtend_android.o"
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-androideabi \
+// RUN:     --target=arm-linux-androideabi -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot  \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm-linux-android \
+// RUN:     --target=arm-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=aarch64-linux-android \
+// RUN:     --target=aarch64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot  \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=arm64-linux-android \
+// RUN:     --target=arm64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot  \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-android \
+// RUN:     --target=mipsel-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-android \
+// RUN:     --target=mips64el-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=i686-linux-android \
+// RUN:     --target=i686-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -pie \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-PIE %s
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=x86_64-linux-android \
+// RUN:     --target=x86_64-linux-android -rtlib=platform \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -pie \
@@ -1553,7 +1570,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMEB %s
 // CHECK-ARMEB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-ARMEB-NOT: "--be8"
-// CHECK-ARMEB: "-m" "armebelf_linux_eabi"
+// CHECK-ARMEB: "-m" "armelfb_linux_eabi"
 
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=armebv7-unknown-linux \
@@ -1562,4 +1579,86 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
 // CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-ARMV7EB: "--be8"
-// CHECK-ARMV7EB: "-m" "armebelf_linux_eabi"
+// CHECK-ARMV7EB: "-m" "armelfb_linux_eabi"
+
+// Check dynamic-linker for musl-libc
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=i386-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-X86 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=x86_64-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-X86_64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPS %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mipsel-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPSEL %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips64-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPS64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips64el-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPS64EL %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=powerpc-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-PPC %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=powerpc64-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-PPC64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumb-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARM %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumb-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumbv7-pc-linux-musleabi -mhard-float \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumbeb-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEB %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumbeb-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumbv7eb-pc-linux-musleabi -mhard-float \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=arm-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARM %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=arm-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=armv7-pc-linux-musleabi -mhard-float \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=armeb-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEB %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=armeb-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=armv7eb-pc-linux-musleabi -mhard-float \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=aarch64-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-AARCH64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=aarch64_be-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-AARCH64_BE %s
+// CHECK-MUSL-X86:        "-dynamic-linker" "/lib/ld-musl-i386.so.1"
+// CHECK-MUSL-X86_64:     "-dynamic-linker" "/lib/ld-musl-x86_64.so.1"
+// CHECK-MUSL-MIPS:       "-dynamic-linker" "/lib/ld-musl-mips.so.1"
+// CHECK-MUSL-MIPSEL:     "-dynamic-linker" "/lib/ld-musl-mipsel.so.1"
+// CHECK-MUSL-MIPS64:     "-dynamic-linker" "/lib/ld-musl-mips64.so.1"
+// CHECK-MUSL-MIPS64EL:   "-dynamic-linker" "/lib/ld-musl-mips64el.so.1"
+// CHECK-MUSL-PPC:        "-dynamic-linker" "/lib/ld-musl-powerpc.so.1"
+// CHECK-MUSL-PPC64:      "-dynamic-linker" "/lib/ld-musl-powerpc64.so.1"
+// CHECK-MUSL-ARM:        "-dynamic-linker" "/lib/ld-musl-arm.so.1"
+// CHECK-MUSL-ARMHF:      "-dynamic-linker" "/lib/ld-musl-armhf.so.1"
+// CHECK-MUSL-ARMEB:      "-dynamic-linker" "/lib/ld-musl-armeb.so.1"
+// CHECK-MUSL-ARMEBHF:    "-dynamic-linker" "/lib/ld-musl-armebhf.so.1"
+// CHECK-MUSL-AARCH64:    "-dynamic-linker" "/lib/ld-musl-aarch64.so.1"
+// CHECK-MUSL-AARCH64_BE: "-dynamic-linker" "/lib/ld-musl-aarch64_be.so.1"
diff --git a/test/Driver/lit.local.cfg b/test/Driver/lit.local.cfg
index 6c2373b..ff831e7 100644
--- a/test/Driver/lit.local.cfg
+++ b/test/Driver/lit.local.cfg
@@ -1,5 +1,5 @@
 config.suffixes = ['.c', '.cpp', '.h', '.m', '.mm', '.S', '.s', '.f90', '.f95',
-                   '.cu']
+                   '.cu', '.rs', '.cl']
 config.substitutions = list(config.substitutions)
 config.substitutions.insert(0,
     ('%clang_cc1',
diff --git a/test/Driver/lto.c b/test/Driver/lto.c
index 3f66274..d2f68f5 100644
--- a/test/Driver/lto.c
+++ b/test/Driver/lto.c
@@ -49,3 +49,12 @@
 // RUN: FileCheck -check-prefix=CHECK-LINK-NOLTO-ACTION < %t %s
 //
 // CHECK-LINK-NOLTO-ACTION-NOT: "-plugin" "{{.*}}/LLVMgold.so"
+
+// -flto passes along an explicit debugger tuning argument.
+// RUN: %clang -target x86_64-unknown-linux -### %s -flto -glldb 2> %t
+// RUN: FileCheck -check-prefix=CHECK-TUNING-LLDB < %t %s
+// RUN: %clang -target x86_64-unknown-linux -### %s -flto -g 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-TUNING < %t %s
+//
+// CHECK-TUNING-LLDB:   "-plugin-opt=-debugger-tune=lldb"
+// CHECK-NO-TUNING-NOT: "-plugin-opt=-debugger-tune
diff --git a/test/Driver/masm.s b/test/Driver/masm.s
new file mode 100644
index 0000000..b77e836
--- /dev/null
+++ b/test/Driver/masm.s
@@ -0,0 +1,11 @@
+// RUN: %clang -target i386-unknown-linux -masm=intel -c %s -### 2>&1 | FileCheck --check-prefix=CHECK-INTEL %s
+// RUN: %clang -target i386-unknown-linux -masm=att -c %s -### 2>&1 | FileCheck --check-prefix=CHECK-ATT %s
+// RUN: %clang -target i386-unknown-linux -c -masm=somerequired %s -### 2>&1 | FileCheck --check-prefix=CHECK-SOMEREQUIRED %s
+// RUN: %clang -target arm-unknown-eabi -c -masm=intel %s -### 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+
+// CHECK-INTEL: -x86-asm-syntax=intel
+// CHECK-ATT: -x86-asm-syntax=att
+// CHECK-SOMEREQUIRED: error: unsupported argument 'somerequired' to option 'masm='
+// CHECK-ARM: warning: argument unused during compilation: '-masm=intel'
+.text
+mov    al, 0
diff --git a/test/Driver/miamcu-opt.c b/test/Driver/miamcu-opt.c
new file mode 100644
index 0000000..577bd37
--- /dev/null
+++ b/test/Driver/miamcu-opt.c
@@ -0,0 +1,36 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+//
+// RUN: %clang -miamcu -rtlib=platform -no-canonical-prefixes %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -miamcu -rtlib=platform -no-canonical-prefixes -m32 %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -miamcu -rtlib=platform -no-canonical-prefixes -target x86_64-unknown-linux-gnu %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -mno-iamcu -miamcu -rtlib=platform -no-canonical-prefixes %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -miamcu -rtlib=platform -no-canonical-prefixes -m64 %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=M64
+// RUN: %clang -miamcu -rtlib=platform -no-canonical-prefixes -dynamic %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=DYNAMIC
+// RUN: %clang -miamcu -rtlib=platform -no-canonical-prefixes  -target armv8-eabi %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=NOT-X86
+// RUN: %clang -miamcu -mno-iamcu -no-canonical-prefixes -target x86_64-unknown-linux-gnu %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=MNOIAMCU
+
+// M64: error: invalid argument '-miamcu' not allowed with '-m64'
+
+// DYNAMIC: error: invalid argument '-dynamic' not allowed with '-static'
+
+// NOT-X86: error: unsupported option '-miamcu' for target 'armv8---eabi'
+
+// MNOIAMCU-NOT: "-triple" "i586-intel-elfiamcu"
+
+// CHECK: "{{.*}}clang{{.*}}" "-cc1"
+// CHECK: "-triple" "i586-intel-elfiamcu"
+// CHECK: "-static-define"
+// CHECK: "-mfloat-abi" "soft"
+// CHECK: "-mstack-alignment=4"
+
+// CHECK: "{{.*}}ld{{(.exe)?}}"
+// CHECK: "-m" "elf_iamcu"
+// CHECK: "-static"
+// CHECK-NOT: crt1
+// CHECK-NOT: crti
+// CHECK-NOT: crtbegin
+// CHECK: crt0
+// CHECK: "--start-group" "-lgcc" "-lc" "-lgloss" "--end-group" "--as-needed" "-lsoftfp" "--no-as-needed"
+// CHECK-NOT: crtend
+// CHECK-NOT: crtn
diff --git a/test/Driver/miamcu-opt.cpp b/test/Driver/miamcu-opt.cpp
new file mode 100644
index 0000000..b63c455
--- /dev/null
+++ b/test/Driver/miamcu-opt.cpp
@@ -0,0 +1,3 @@
+// RUN: %clang -miamcu -rtlib=platform %s -### -o %t.o 2>&1 | FileCheck %s
+
+// CHECK: error: the clang compiler does not support 'C++ for IAMCU'
diff --git a/test/Driver/mingw-libgcc.c b/test/Driver/mingw-libgcc.c
index 75a5696..1d45c91 100644
--- a/test/Driver/mingw-libgcc.c
+++ b/test/Driver/mingw-libgcc.c
@@ -2,24 +2,24 @@
 // Verified with gcc version 5.1.0 (i686-posix-dwarf-rev0, Built by MinGW-W64 project).
 
 // gcc, static
-// RUN: %clang -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static-libgcc -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static -shared -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static-libgcc -shared -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static-libgcc -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static -shared -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static-libgcc -shared -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
 
 // gcc, dynamic
-// RUN: %clang -shared -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_DYNAMIC %s
+// RUN: %clang -shared -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_DYNAMIC %s
 
 // g++, static
-// RUN: %clang -static --driver-mode=g++ -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static-libgcc --driver-mode=g++ -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static -shared --driver-mode=g++ -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
-// RUN: %clang -static-libgcc -shared --driver-mode=g++ -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static --driver-mode=g++ -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static-libgcc --driver-mode=g++ -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static -shared --driver-mode=g++ -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
+// RUN: %clang -static-libgcc -shared --driver-mode=g++ -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_STATIC %s
 
 // g++, dynamic
-// RUN: %clang --driver-mode=g++ -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_DYNAMIC %s
-// RUN: %clang -shared --driver-mode=g++ -v -target i686-pc-windows-gnu -### %s 2>&1 | FileCheck -check-prefix=CHECK_DYNAMIC %s
+// RUN: %clang --driver-mode=g++ -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_DYNAMIC %s
+// RUN: %clang -shared --driver-mode=g++ -v -target i686-pc-windows-gnu -rtlib=platform -### %s 2>&1 | FileCheck -check-prefix=CHECK_DYNAMIC %s
 
 // CHECK_STATIC: "-lgcc" "-lgcc_eh"
 // CHECK_DYNAMIC: "-lgcc_s" "-lgcc"
diff --git a/test/Driver/mingw.cpp b/test/Driver/mingw.cpp
index 8dc5b96..c939c7a 100644
--- a/test/Driver/mingw.cpp
+++ b/test/Driver/mingw.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang -target i686-windows-gnu -c -### --sysroot=%S/Inputs/mingw_clang_tree/mingw32 %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_CLANG_TREE %s
+// RUN: %clang -target i686-windows-gnu -rtlib=platform -c -### --sysroot=%S/Inputs/mingw_clang_tree/mingw32 %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_CLANG_TREE %s
 // CHECK_MINGW_CLANG_TREE: "{{.*}}/Inputs/mingw_clang_tree/mingw32{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include"
 // CHECK_MINGW_CLANG_TREE: "{{.*}}/Inputs/mingw_clang_tree/mingw32{{/|\\\\}}include"
 
 
-// RUN: %clang -target i686-pc-windows-gnu -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_mingw_org_tree/mingw %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_ORG_TREE %s
+// RUN: %clang -target i686-pc-windows-gnu -rtlib=platform -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_mingw_org_tree/mingw %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_ORG_TREE %s
 // CHECK_MINGW_ORG_TREE: "{{.*}}/Inputs/mingw_mingw_org_tree/mingw{{/|\\\\}}lib{{/|\\\\}}gcc{{/|\\\\}}mingw32{{/|\\\\}}4.8.1{{/|\\\\}}include{{/|\\\\}}c++"
 // CHECK_MINGW_ORG_TREE: "{{.*}}/Inputs/mingw_mingw_org_tree/mingw{{/|\\\\}}lib{{/|\\\\}}gcc{{/|\\\\}}mingw32{{/|\\\\}}4.8.1{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}mingw32"
 // CHECK_MINGW_ORG_TREE: "{{.*}}{{/|\\\\}}Inputs/mingw_mingw_org_tree/mingw{{/|\\\\}}lib{{/|\\\\}}gcc{{/|\\\\}}mingw32{{/|\\\\}}4.8.1{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}backward"
@@ -13,7 +13,7 @@
 // CHECK_MINGW_ORG_TREE: {{.*}}/Inputs/mingw_mingw_org_tree/mingw{{/|\\\\}}include
 
 
-// RUN: %clang -target i686-pc-windows-gnu -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_mingw_builds_tree/mingw32 %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_BUILDS_TREE %s
+// RUN: %clang -target i686-pc-windows-gnu -rtlib=platform -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_mingw_builds_tree/mingw32 %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_BUILDS_TREE %s
 // CHECK_MINGW_BUILDS_TREE: "{{.*}}/Inputs/mingw_mingw_builds_tree/mingw32{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include{{/|\\\\}}c++"
 // CHECK_MINGW_BUILDS_TREE: "{{.*}}/Inputs/mingw_mingw_builds_tree/mingw32{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}i686-w64-mingw32"
 // CHECK_MINGW_BUILDS_TREE: "{{.*}}/Inputs/mingw_mingw_builds_tree/mingw32{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}backward"
@@ -22,7 +22,7 @@
 // CHECK_MINGW_BUILDS_TREE: "{{.*}}/Inputs/mingw_mingw_builds_tree/mingw32{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include"
 
 
-// RUN: %clang -target i686-pc-windows-gnu -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_msys2_tree/msys64/mingw32 %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_MSYS_TREE %s
+// RUN: %clang -target i686-pc-windows-gnu -rtlib=platform -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_msys2_tree/msys64/mingw32 %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_MSYS_TREE %s
 // CHECK_MINGW_MSYS_TREE: "{{.*}}/Inputs/mingw_msys2_tree/msys64{{/|\\\\}}mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}4.9.2"
 // CHECK_MINGW_MSYS_TREE: "{{.*}}/Inputs/mingw_msys2_tree/msys64/mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}4.9.2{{/|\\\\}}i686-w64-mingw32"
 // CHECK_MINGW_MSYS_TREE: "{{.*}}/Inputs/mingw_msys2_tree/msys64/mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}4.9.2{{/|\\\\}}backward"
@@ -32,7 +32,7 @@
 // CHECK_MINGW_MSYS_TREE: "{{.*}}/Inputs/mingw_msys2_tree/msys64/mingw32{{/|\\\\}}include"
 
 
-// RUN: %clang -target x86_64-pc-windows-gnu -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_opensuse_tree/usr %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_OPENSUSE_TREE %s
+// RUN: %clang -target x86_64-pc-windows-gnu -rtlib=platform -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_opensuse_tree/usr %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_OPENSUSE_TREE %s
 // CHECK_MINGW_OPENSUSE_TREE: "{{.*}}/Inputs/mingw_opensuse_tree/usr{{/|\\\\}}lib64{{/|\\\\}}gcc{{/|\\\\}}x86_64-w64-mingw32{{/|\\\\}}5.1.0{{/|\\\\}}include{{/|\\\\}}c++"
 // CHECK_MINGW_OPENSUSE_TREE: "{{.*}}/Inputs/mingw_opensuse_tree/usr{{/|\\\\}}lib64{{/|\\\\}}gcc{{/|\\\\}}x86_64-w64-mingw32{{/|\\\\}}5.1.0{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}x86_64-w64-mingw32"
 // CHECK_MINGW_OPENSUSE_TREE: "{{.*}}/Inputs/mingw_opensuse_tree/usr{{/|\\\\}}lib64{{/|\\\\}}gcc{{/|\\\\}}x86_64-w64-mingw32{{/|\\\\}}5.1.0{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}backward"
@@ -41,7 +41,7 @@
 // CHECK_MINGW_OPENSUSE_TREE: "{{.*}}/Inputs/mingw_opensuse_tree/usr{{/|\\\\}}lib64{{/|\\\\}}gcc{{/|\\\\}}x86_64-w64-mingw32{{/|\\\\}}5.1.0{{/|\\\\}}include-fixed"
 
 
-// RUN: %clang -target i686-pc-windows-gnu -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_arch_tree/usr %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_ARCH_TREE %s
+// RUN: %clang -target i686-pc-windows-gnu -rtlib=platform -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_arch_tree/usr %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_ARCH_TREE %s
 // CHECK_MINGW_ARCH_TREE: "{{.*}}/Inputs/mingw_arch_tree/usr{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}5.1.0"
 // CHECK_MINGW_ARCH_TREE: "{{.*}}/Inputs/mingw_arch_tree/usr{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}5.1.0{{/|\\\\}}i686-w64-mingw32"
 // CHECK_MINGW_ARCH_TREE: "{{.*}}/Inputs/mingw_arch_tree/usr{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}5.1.0{{/|\\\\}}backward"
@@ -50,7 +50,7 @@
 // CHECK_MINGW_ARCH_TREE: "{{.*}}/Inputs/mingw_arch_tree/usr{{/|\\\\}}i686-w64-mingw32{{/|\\\\}}include"
 
 
-// RUN: %clang -target x86_64-pc-windows-gnu -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_ubuntu_tree/usr %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_UBUNTU_TREE %s
+// RUN: %clang -target x86_64-pc-windows-gnu -rtlib=platform -stdlib=libstdc++ -c -### --sysroot=%S/Inputs/mingw_ubuntu_tree/usr %s 2>&1 | FileCheck -check-prefix=CHECK_MINGW_UBUNTU_TREE %s
 // CHECK_MINGW_UBUNTU_TREE: "{{.*}}/Inputs/mingw_ubuntu_tree/usr{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}4.8"
 // CHECK_MINGW_UBUNTU_TREE: "{{.*}}/Inputs/mingw_ubuntu_tree/usr{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}4.8{{/|\\\\}}x86_64-w64-mingw32"
 // CHECK_MINGW_UBUNTU_TREE: "{{.*}}/Inputs/mingw_ubuntu_tree/usr{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}4.8{{/|\\\\}}backward"
diff --git a/test/Driver/mips-abi.c b/test/Driver/mips-abi.c
index cede685..8e3f7c0 100644
--- a/test/Driver/mips-abi.c
+++ b/test/Driver/mips-abi.c
@@ -1,14 +1,38 @@
 // Check passing Mips ABI options to the backend.
 //
 // RUN: %clang -target mips-linux-gnu -### -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-DEF %s
-// MIPS-DEF: "-target-cpu" "mips32r2"
-// MIPS-DEF: "-target-abi" "o32"
+// RUN:   | FileCheck -check-prefix=MIPS32R2-O32 %s
+// RUN: %clang -target mips64-linux-gnu -mips32r2 -mabi=32 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS32R2-O32 %s
+// MIPS32R2-O32: "-target-cpu" "mips32r2"
+// MIPS32R2-O32: "-target-abi" "o32"
+//
+// FIXME: This is a valid combination of options but we reject it at the moment
+//        because the backend can't handle it.
+// RUN: not %clang -target mips-linux-gnu -c %s \
+// RUN:        -march=mips64r2 -mabi=32 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-O32 %s
+// MIPS64R2-O32: error: ABI 'o32' is not supported on CPU 'mips64r2'
 //
 // RUN: %clang -target mips64-linux-gnu -### -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS64-DEF %s
-// MIPS64-DEF: "-target-cpu" "mips64r2"
-// MIPS64-DEF: "-target-abi" "n64"
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// RUN: %clang -target mips-img-linux-gnu -mips64r2 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// RUN: %clang -target mips-mti-linux-gnu -mips64r2 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// RUN: %clang -target mips-linux-gnu -mips64r2 -mabi=64 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// MIPS64R2-N64: "-target-cpu" "mips64r2"
+// MIPS64R2-N64: "-target-abi" "n64"
+//
+// RUN: %clang -target mips64-linux-gnu -### -mips64r3 -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R3-N64 %s
+// RUN: %clang -target mips-img-linux-gnu -mips64r3 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R3-N64 %s
+// RUN: %clang -target mips-mti-linux-gnu -mips64r3 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R3-N64 %s
+// MIPS64R3-N64: "-target-cpu" "mips64r3"
+// MIPS64R3-N64: "-target-abi" "n64"
 //
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:        -mabi=32 2>&1 \
@@ -45,12 +69,6 @@
 // RUN:   | FileCheck -check-prefix=MIPS-ABI-O64 %s
 // MIPS-ABI-O64: error: unknown target ABI 'o64'
 //
-// RUN: %clang -target mips-linux-gnu -### -c %s \
-// RUN:        -mabi=eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-ABI-EABI %s
-// MIPS-ABI-EABI: "-target-cpu" "mips32r2"
-// MIPS-ABI-EABI: "-target-abi" "eabi"
-//
 // RUN: not %clang -target mips-linux-gnu -c %s \
 // RUN:        -mabi=unknown 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-ABI-UNKNOWN %s
@@ -104,6 +122,11 @@
 // MIPS-ARCH-P5600: "-target-cpu" "p5600"
 // MIPS-ARCH-P5600: "-target-abi" "o32"
 //
+// RUN: not %clang -target mips-linux-gnu -c %s \
+// RUN:        -march=p5600 -mabi=64 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS-ARCH-P5600-N64 %s
+// MIPS-ARCH-P5600-N64: error: ABI 'n64' is not supported on CPU 'p5600'
+//
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:        -march=mips64 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-ARCH-3264 %s
@@ -131,7 +154,7 @@
 // RUN: not %clang -target mips64-linux-gnu -c %s \
 // RUN:        -march=mips32 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-ARCH-6432 %s
-// MIPS-ARCH-6432: error: unknown target CPU 'mips32'
+// MIPS-ARCH-6432: error: ABI 'n64' is not supported on CPU 'mips32'
 //
 // RUN: not %clang -target mips-linux-gnu -c %s \
 // RUN:        -march=unknown 2>&1 \
diff --git a/test/Driver/mips-as.c b/test/Driver/mips-as.c
index 63fc64c..4d95620 100644
--- a/test/Driver/mips-as.c
+++ b/test/Driver/mips-as.c
@@ -30,11 +30,6 @@
 // RUN:   | FileCheck -check-prefix=MIPS64R2-DEF-EL-AS %s
 // MIPS64R2-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips64r2" "-mabi" "64"  "-mno-shared" "-KPIC" "-EL"
 //
-// RUN: %clang -target mips-linux-gnu -mabi=eabi -### \
-// RUN:   -no-integrated-as -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-EABI %s
-// MIPS-EABI: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "eabi" "-mno-shared" "-call_nonpic" "-EB"
-//
 // RUN: %clang -target mips64-linux-gnu -mabi=n32 -### \
 // RUN:   -no-integrated-as -c %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-N32 %s
diff --git a/test/Driver/mips-cs.cpp b/test/Driver/mips-cs.cpp
index 62a90f0..bca2ab9 100644
--- a/test/Driver/mips-cs.cpp
+++ b/test/Driver/mips-cs.cpp
@@ -3,7 +3,7 @@
 // = Big-endian, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32 %s
 // CHECK-BE-HF-32: "-internal-isystem"
 // CHECK-BE-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -31,7 +31,7 @@
 // = Big-endian, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-HF-32 %s
 // CHECK-BE-UC-HF-32: "-internal-isystem"
 // CHECK-BE-UC-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -60,7 +60,7 @@
 // = Big-endian, hard float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -mips16 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-16 %s
 // CHECK-BE-HF-16: "-internal-isystem"
 // CHECK-BE-HF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -89,7 +89,7 @@
 // = Big-endian, hard float, mmicromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -mmicromips \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-MICRO %s
 // CHECK-BE-HF-MICRO: "-internal-isystem"
 // CHECK-BE-HF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -118,7 +118,7 @@
 // = Big-endian, hard float, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-NAN %s
 // CHECK-BE-HF-NAN: "-internal-isystem"
 // CHECK-BE-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -147,7 +147,7 @@
 // = Big-endian, hard float, uclibc, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -muclibc -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-HF-NAN %s
 // CHECK-BE-UC-HF-NAN: "-internal-isystem"
 // CHECK-BE-UC-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -176,7 +176,7 @@
 // = Big-endian, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-32 %s
 // CHECK-BE-SF-32: "-internal-isystem"
 // CHECK-BE-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -205,7 +205,7 @@
 // = Big-endian, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -muclibc -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-SF-32 %s
 // CHECK-BE-UC-SF-32: "-internal-isystem"
 // CHECK-BE-UC-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -234,7 +234,7 @@
 // = Big-endian, soft float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -msoft-float -mips16 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-16 %s
 // CHECK-BE-SF-16: "-internal-isystem"
 // CHECK-BE-SF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -263,7 +263,7 @@
 // = Big-endian, soft float, micromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -msoft-float -mmicromips \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-MICRO %s
 // CHECK-BE-SF-MICRO: "-internal-isystem"
 // CHECK-BE-SF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -292,7 +292,7 @@
 // = Big-endian, hard float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-linux-gnu \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64 %s
 // CHECK-BE-HF-64: "-internal-isystem"
 // CHECK-BE-HF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -321,7 +321,7 @@
 // = Big-endian, soft float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-linux-gnu -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64 %s
 // CHECK-BE-SF-64: "-internal-isystem"
 // CHECK-BE-SF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -350,7 +350,7 @@
 // = Little-endian, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-32 %s
 // CHECK-EL-HF-32: "-internal-isystem"
 // CHECK-EL-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -379,7 +379,7 @@
 // = Little-endian, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mhard-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-HF-32 %s
 // CHECK-EL-UC-HF-32: "-internal-isystem"
 // CHECK-EL-UC-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -408,7 +408,7 @@
 // = Little-endian, hard float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mips16 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-16 %s
 // CHECK-EL-HF-16: "-internal-isystem"
 // CHECK-EL-HF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -437,7 +437,7 @@
 // = Little-endian, hard float, micromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mmicromips \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-MICRO %s
 // CHECK-EL-HF-MICRO: "-internal-isystem"
 // CHECK-EL-HF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -466,7 +466,7 @@
 // = Little-endian, hard float, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-NAN %s
 // CHECK-EL-HF-NAN: "-internal-isystem"
 // CHECK-EL-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -495,7 +495,7 @@
 // = Little-endian, hard float, uclibc, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -muclibc -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-HF-NAN %s
 // CHECK-EL-UC-HF-NAN: "-internal-isystem"
 // CHECK-EL-UC-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -524,7 +524,7 @@
 // = Little-endian, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mfloat-abi=soft \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-32 %s
 // CHECK-EL-SF-32: "-internal-isystem"
 // CHECK-EL-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -553,7 +553,7 @@
 // = Little-endian, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mfloat-abi=soft -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-SF-32 %s
 // CHECK-EL-UC-SF-32: "-internal-isystem"
 // CHECK-EL-UC-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -582,7 +582,7 @@
 // = Little-endian, soft float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-16 %s
 // CHECK-EL-SF-16: "-internal-isystem"
 // CHECK-EL-SF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -611,7 +611,7 @@
 // = Little-endian, soft float, micromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mmicromips -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-MICRO %s
 // CHECK-EL-SF-MICRO: "-internal-isystem"
 // CHECK-EL-SF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -640,7 +640,7 @@
 // = Little-endian, hard float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64el-linux-gnu \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64 %s
 // CHECK-EL-HF-64: "-internal-isystem"
 // CHECK-EL-HF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -669,7 +669,7 @@
 // = Little-endian, soft float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64el-linux-gnu -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64 %s
 // CHECK-EL-SF-64: "-internal-isystem"
 // CHECK-EL-SF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
diff --git a/test/Driver/mips-features.c b/test/Driver/mips-features.c
index 461d778..69fc20e 100644
--- a/test/Driver/mips-features.c
+++ b/test/Driver/mips-features.c
@@ -116,6 +116,24 @@
 // RUN:   | FileCheck --check-prefix=CHECK-NANLEGACY %s
 // CHECK-NANLEGACY: "-target-feature" "-nan2008"
 //
+// -mcompact-branches=never
+// RUN: %clang -target mips-linux-gnu -march=mips32r6 -### -c %s \
+// RUN:     -mcompact-branches=never 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CBNEVER %s
+// CHECK-CBNEVER: "-mllvm" "-mips-compact-branches=never"
+//
+// -mcompact-branches=optimal
+// RUN: %clang -target mips-linux-gnu -march=mips32r6 -### -c %s \
+// RUN:     -mcompact-branches=optimal 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CBOPTIMAL %s
+// CHECK-CBOPTIMAL: "-mllvm" "-mips-compact-branches=optimal"
+//
+// -mcompact-branches=always
+// RUN: %clang -target mips-linux-gnu -march=mips32r6 -### -c %s \
+// RUN:     -mcompact-branches=always 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CBALWAYS %s
+// CHECK-CBALWAYS: "-mllvm" "-mips-compact-branches=always"
+//
 // -mxgot
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:     -mno-xgot -mxgot 2>&1 \
diff --git a/test/Driver/mips-fsf.cpp b/test/Driver/mips-fsf.cpp
index e39b24e..68ee490 100644
--- a/test/Driver/mips-fsf.cpp
+++ b/test/Driver/mips-fsf.cpp
@@ -2,8 +2,8 @@
 //
 // = Big-endian, mips32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32 %s
 // CHECK-BE-HF-32: "-internal-isystem"
 // CHECK-BE-HF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -29,8 +29,8 @@
 //
 // = Big-endian, mips32, hard float, fp64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-32 %s
 // CHECK-BE-HF64-32: "-internal-isystem"
 // CHECK-BE-HF64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -56,8 +56,8 @@
 //
 // = Big-endian, mips32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-32 %s
 // CHECK-BE-SF-32: "-internal-isystem"
 // CHECK-BE-SF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -83,8 +83,8 @@
 //
 // = Big-endian, mips16 / mips32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-16 %s
 // CHECK-BE-HF-16: "-internal-isystem"
 // CHECK-BE-HF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -110,8 +110,8 @@
 //
 // = Big-endian, mips16 / mips32, hard float, fp64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-16 %s
 // CHECK-BE-HF64-16: "-internal-isystem"
 // CHECK-BE-HF64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -137,8 +137,8 @@
 //
 // = Big-endian, mips16 / mips32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-16 %s
 // CHECK-BE-SF-16: "-internal-isystem"
 // CHECK-BE-SF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -164,8 +164,8 @@
 //
 // = Big-endian, mips32 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-16 %s
 // CHECK-BE-NAN-16: "-internal-isystem"
 // CHECK-BE-NAN-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -191,8 +191,8 @@
 //
 // = Big-endian, mips32 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-16 %s
 // CHECK-BE-NAN64-16: "-internal-isystem"
 // CHECK-BE-NAN64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -218,8 +218,8 @@
 //
 // = Big-endian, mips32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-32 %s
 // CHECK-BE-NAN-32: "-internal-isystem"
 // CHECK-BE-NAN-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -245,8 +245,8 @@
 //
 // = Big-endian, mips32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-32 %s
 // CHECK-BE-NAN64-32: "-internal-isystem"
 // CHECK-BE-NAN64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -272,8 +272,8 @@
 //
 // = Big-endian, mips32r2, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R2 %s
 // CHECK-BE-HF-32R2: "-internal-isystem"
 // CHECK-BE-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -299,8 +299,8 @@
 //
 // = Big-endian, mips32r2, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mhard-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mhard-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-HF-32R2 %s
 // CHECK-BE-UC-HF-32R2: "-internal-isystem"
 // CHECK-BE-UC-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -326,8 +326,8 @@
 //
 // = Big-endian, mips32r2, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-32R2 %s
 // CHECK-BE-HF64-32R2: "-internal-isystem"
 // CHECK-BE-HF64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -353,8 +353,8 @@
 //
 // = Big-endian, mips32r2, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-32R2 %s
 // CHECK-BE-SF-32R2: "-internal-isystem"
 // CHECK-BE-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -380,8 +380,8 @@
 //
 // = Big-endian, mips32r2, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -msoft-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -msoft-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-SF-32R2 %s
 // CHECK-BE-UC-SF-32R2: "-internal-isystem"
 // CHECK-BE-UC-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -407,8 +407,8 @@
 //
 // = Big-endian, mips32r2 / mips16, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-16R2 %s
 // CHECK-BE-HF-16R2: "-internal-isystem"
 // CHECK-BE-HF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -434,8 +434,8 @@
 //
 // = Big-endian, mips32r2 / mips16, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-16R2 %s
 // CHECK-BE-HF64-16R2: "-internal-isystem"
 // CHECK-BE-HF64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -461,8 +461,8 @@
 //
 // = Big-endian, mips32r2 / mips16, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-16R2 %s
 // CHECK-BE-SF-16R2: "-internal-isystem"
 // CHECK-BE-SF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -488,8 +488,8 @@
 //
 // = Big-endian, mips32r2 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-16R2 %s
 // CHECK-BE-NAN-16R2: "-internal-isystem"
 // CHECK-BE-NAN-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -515,8 +515,8 @@
 //
 // = Big-endian, mips32r2 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-16R2 %s
 // CHECK-BE-NAN64-16R2: "-internal-isystem"
 // CHECK-BE-NAN64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -542,8 +542,8 @@
 //
 // = Big-endian, mips32r2, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-32R2 %s
 // CHECK-BE-NAN-32R2: "-internal-isystem"
 // CHECK-BE-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -569,8 +569,8 @@
 //
 // = Big-endian, mips32r2, nan2008, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mnan=2008 -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-NAN-32R2 %s
 // CHECK-BE-UC-NAN-32R2: "-internal-isystem"
 // CHECK-BE-UC-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -596,8 +596,8 @@
 //
 // = Big-endian, mips32r2, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-32R2 %s
 // CHECK-BE-NAN64-32R2: "-internal-isystem"
 // CHECK-BE-NAN64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -623,8 +623,8 @@
 //
 // = Big-endian, default (mips32r2), fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-32R2-DEF %s
 // CHECK-BE-NAN64-32R2-DEF: "-internal-isystem"
 // CHECK-BE-NAN64-32R2-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -650,8 +650,8 @@
 //
 // = Big-endian, micromips, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-MM %s
 // CHECK-BE-HF-MM: "-internal-isystem"
 // CHECK-BE-HF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -677,8 +677,8 @@
 //
 // = Big-endian, micromips, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-MM %s
 // CHECK-BE-HF64-MM: "-internal-isystem"
 // CHECK-BE-HF64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -704,8 +704,8 @@
 //
 // = Big-endian, micromips, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-MM %s
 // CHECK-BE-SF-MM: "-internal-isystem"
 // CHECK-BE-SF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -731,8 +731,8 @@
 //
 // = Big-endian, micromips, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-MM %s
 // CHECK-BE-NAN-MM: "-internal-isystem"
 // CHECK-BE-NAN-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -758,8 +758,8 @@
 //
 // = Big-endian, micromips, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-MM %s
 // CHECK-BE-NAN64-MM: "-internal-isystem"
 // CHECK-BE-NAN64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -785,8 +785,8 @@
 //
 // = Big-endian, mips64, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64-N32 %s
 // CHECK-BE-HF-64-N32: "-internal-isystem"
 // CHECK-BE-HF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -812,8 +812,8 @@
 //
 // = Big-endian, mips64, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64-N32 %s
 // CHECK-BE-HF64-64-N32: "-internal-isystem"
 // CHECK-BE-HF64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -839,8 +839,8 @@
 //
 // = Big-endian, mips64, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64-N32 %s
 // CHECK-BE-SF-64-N32: "-internal-isystem"
 // CHECK-BE-SF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -866,8 +866,8 @@
 //
 // = Big-endian, mips64, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64-N32 %s
 // CHECK-BE-NAN-64-N32: "-internal-isystem"
 // CHECK-BE-NAN-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -893,8 +893,8 @@
 //
 // = Big-endian, mips64, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64-N32 %s
 // CHECK-BE-NAN64-64-N32: "-internal-isystem"
 // CHECK-BE-NAN64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -920,8 +920,8 @@
 //
 // = Big-endian, mips64, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64-64 %s
 // CHECK-BE-HF-64-64: "-internal-isystem"
 // CHECK-BE-HF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -947,8 +947,8 @@
 //
 // = Big-endian, mips64, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64-64 %s
 // CHECK-BE-HF64-64-64: "-internal-isystem"
 // CHECK-BE-HF64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -974,8 +974,8 @@
 //
 // = Big-endian, mips64, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64-64 %s
 // CHECK-BE-SF-64-64: "-internal-isystem"
 // CHECK-BE-SF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1001,8 +1001,8 @@
 //
 // = Big-endian, mips64, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64-64 %s
 // CHECK-BE-NAN-64-64: "-internal-isystem"
 // CHECK-BE-NAN-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1028,8 +1028,8 @@
 //
 // = Big-endian, mips64, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64-64 %s
 // CHECK-BE-NAN64-64-64: "-internal-isystem"
 // CHECK-BE-NAN64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1055,8 +1055,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R2-N32 %s
 // CHECK-BE-HF-64R2-N32: "-internal-isystem"
 // CHECK-BE-HF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1082,8 +1082,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64R2-N32 %s
 // CHECK-BE-HF64-64R2-N32: "-internal-isystem"
 // CHECK-BE-HF64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1109,8 +1109,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64R2-N32 %s
 // CHECK-BE-SF-64R2-N32: "-internal-isystem"
 // CHECK-BE-SF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1136,8 +1136,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64R2-N32 %s
 // CHECK-BE-NAN-64R2-N32: "-internal-isystem"
 // CHECK-BE-NAN-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1163,8 +1163,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-N32 %s
 // CHECK-BE-NAN64-64R2-N32: "-internal-isystem"
 // CHECK-BE-NAN64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1190,8 +1190,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R2-64 %s
 // CHECK-BE-HF-64R2-64: "-internal-isystem"
 // CHECK-BE-HF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1217,8 +1217,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64R2-64 %s
 // CHECK-BE-HF64-64R2-64: "-internal-isystem"
 // CHECK-BE-HF64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1244,8 +1244,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64R2-64 %s
 // CHECK-BE-SF-64R2-64: "-internal-isystem"
 // CHECK-BE-SF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1271,8 +1271,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64R2-64 %s
 // CHECK-BE-NAN-64R2-64: "-internal-isystem"
 // CHECK-BE-NAN-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1298,8 +1298,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-64 %s
 // CHECK-BE-NAN64-64R2-64: "-internal-isystem"
 // CHECK-BE-NAN64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1325,8 +1325,8 @@
 //
 // = Big-endian, default (mips64r2), ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-64-DEF %s
 // CHECK-BE-NAN64-64R2-64-DEF: "-internal-isystem"
 // CHECK-BE-NAN64-64R2-64-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1352,8 +1352,8 @@
 //
 // = Little-endian, mips32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-32 %s
 // CHECK-EL-HF-32: "-internal-isystem"
 // CHECK-EL-HF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1379,8 +1379,8 @@
 //
 // = Little-endian, mips32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-32 %s
 // CHECK-EL-HF64-32: "-internal-isystem"
 // CHECK-EL-HF64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1406,8 +1406,8 @@
 //
 // = Little-endian, mips32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-32 %s
 // CHECK-EL-SF-32: "-internal-isystem"
 // CHECK-EL-SF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1433,8 +1433,8 @@
 //
 // = Little-endian, mips32 / mips16, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-16 %s
 // CHECK-EL-HF-16: "-internal-isystem"
 // CHECK-EL-HF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1460,8 +1460,8 @@
 //
 // = Little-endian, mips32 / mips16, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-16 %s
 // CHECK-EL-HF64-16: "-internal-isystem"
 // CHECK-EL-HF64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1487,8 +1487,8 @@
 //
 // = Little-endian, mips32 / mips16, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-16 %s
 // CHECK-EL-SF-16: "-internal-isystem"
 // CHECK-EL-SF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1514,8 +1514,8 @@
 //
 // = Little-endian, mips32 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-16 %s
 // CHECK-EL-NAN-16: "-internal-isystem"
 // CHECK-EL-NAN-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1541,8 +1541,8 @@
 //
 // = Little-endian, mips32 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-16 %s
 // CHECK-EL-NAN64-16: "-internal-isystem"
 // CHECK-EL-NAN64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1568,8 +1568,8 @@
 //
 // = Little-endian, mips32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-32 %s
 // CHECK-EL-NAN-32: "-internal-isystem"
 // CHECK-EL-NAN-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1595,8 +1595,8 @@
 //
 // = Little-endian, mips32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-32 %s
 // CHECK-EL-NAN64-32: "-internal-isystem"
 // CHECK-EL-NAN64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1622,8 +1622,8 @@
 //
 // = Little-endian, mips32r2, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-32R2 %s
 // CHECK-EL-HF-32R2: "-internal-isystem"
 // CHECK-EL-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1649,8 +1649,8 @@
 //
 // = Little-endian, mips32r2, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mhard-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-HF-32R2 %s
 // CHECK-EL-UC-HF-32R2: "-internal-isystem"
 // CHECK-EL-UC-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1676,8 +1676,8 @@
 //
 // = Little-endian, mips32r2, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-32R2 %s
 // CHECK-EL-HF64-32R2: "-internal-isystem"
 // CHECK-EL-HF64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1703,8 +1703,8 @@
 //
 // = Little-endian, mips32r2, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-32R2 %s
 // CHECK-EL-SF-32R2: "-internal-isystem"
 // CHECK-EL-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1730,8 +1730,8 @@
 //
 // = Little-endian, mips32r2, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -msoft-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-SF-32R2 %s
 // CHECK-EL-UC-SF-32R2: "-internal-isystem"
 // CHECK-EL-UC-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1757,8 +1757,8 @@
 //
 // = Little-endian, mips32r2 / mips16, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-16R2 %s
 // CHECK-EL-HF-16R2: "-internal-isystem"
 // CHECK-EL-HF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1784,8 +1784,8 @@
 //
 // = Little-endian, mips32r2 / mips16, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-16R2 %s
 // CHECK-EL-HF64-16R2: "-internal-isystem"
 // CHECK-EL-HF64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1811,8 +1811,8 @@
 //
 // = Little-endian, mips32r2 / mips16, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-16R2 %s
 // CHECK-EL-SF-16R2: "-internal-isystem"
 // CHECK-EL-SF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1838,8 +1838,8 @@
 //
 // = Little-endian, mips32r2 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-16R2 %s
 // CHECK-EL-NAN-16R2: "-internal-isystem"
 // CHECK-EL-NAN-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1865,8 +1865,8 @@
 //
 // = Little-endian, mips32r2 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-16R2 %s
 // CHECK-EL-NAN64-16R2: "-internal-isystem"
 // CHECK-EL-NAN64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1892,8 +1892,8 @@
 //
 // = Little-endian, mips32r2, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-32R2 %s
 // CHECK-EL-NAN-32R2: "-internal-isystem"
 // CHECK-EL-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1919,8 +1919,8 @@
 //
 // = Little-endian, mips32r2, nan2008, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mnan=2008 -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-NAN-32R2 %s
 // CHECK-EL-UC-NAN-32R2: "-internal-isystem"
 // CHECK-EL-UC-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1946,8 +1946,8 @@
 //
 // = Little-endian, mips32r2, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-32R2 %s
 // CHECK-EL-NAN64-32R2: "-internal-isystem"
 // CHECK-EL-NAN64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1973,8 +1973,8 @@
 //
 // = Little-endian, default (mips32r2), fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-32R2-DEF %s
 // CHECK-EL-NAN64-32R2-DEF: "-internal-isystem"
 // CHECK-EL-NAN64-32R2-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2000,8 +2000,8 @@
 //
 // = Little-endian, micromips, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-MM %s
 // CHECK-EL-HF-MM: "-internal-isystem"
 // CHECK-EL-HF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2027,8 +2027,8 @@
 //
 // = Little-endian, micromips, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-MM %s
 // CHECK-EL-HF64-MM: "-internal-isystem"
 // CHECK-EL-HF64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2054,8 +2054,8 @@
 //
 // = Little-endian, micromips, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-MM %s
 // CHECK-EL-SF-MM: "-internal-isystem"
 // CHECK-EL-SF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2081,8 +2081,8 @@
 //
 // = Little-endian, micromips, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-MM %s
 // CHECK-EL-NAN-MM: "-internal-isystem"
 // CHECK-EL-NAN-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2108,8 +2108,8 @@
 //
 // = Little-endian, micromips, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-MM %s
 // CHECK-EL-NAN64-MM: "-internal-isystem"
 // CHECK-EL-NAN64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2135,8 +2135,8 @@
 //
 // = Little-endian, mips64, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64-N32 %s
 // CHECK-EL-HF-64-N32: "-internal-isystem"
 // CHECK-EL-HF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2162,8 +2162,8 @@
 //
 // = Little-endian, mips64, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64-N32 %s
 // CHECK-EL-HF64-64-N32: "-internal-isystem"
 // CHECK-EL-HF64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2189,8 +2189,8 @@
 //
 // = Little-endian, mips64, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64-N32 %s
 // CHECK-EL-SF-64-N32: "-internal-isystem"
 // CHECK-EL-SF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2216,8 +2216,8 @@
 //
 // = Little-endian, mips64, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64-N32 %s
 // CHECK-EL-NAN-64-N32: "-internal-isystem"
 // CHECK-EL-NAN-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2243,8 +2243,8 @@
 //
 // = Little-endian, mips64, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64-N32 %s
 // CHECK-EL-NAN64-64-N32: "-internal-isystem"
 // CHECK-EL-NAN64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2270,8 +2270,8 @@
 //
 // = Little-endian, mips64, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64-64 %s
 // CHECK-EL-HF-64-64: "-internal-isystem"
 // CHECK-EL-HF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2297,8 +2297,8 @@
 //
 // = Little-endian, mips64, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64-64 %s
 // CHECK-EL-HF64-64-64: "-internal-isystem"
 // CHECK-EL-HF64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2324,8 +2324,8 @@
 //
 // = Little-endian, mips64, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64-64 %s
 // CHECK-EL-SF-64-64: "-internal-isystem"
 // CHECK-EL-SF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2351,8 +2351,8 @@
 //
 // = Little-endian, mips64, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64-64 %s
 // CHECK-EL-NAN-64-64: "-internal-isystem"
 // CHECK-EL-NAN-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2378,8 +2378,8 @@
 //
 // = Little-endian, mips64, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64-64 %s
 // CHECK-EL-NAN64-64-64: "-internal-isystem"
 // CHECK-EL-NAN64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2405,8 +2405,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64R2-N32 %s
 // CHECK-EL-HF-64R2-N32: "-internal-isystem"
 // CHECK-EL-HF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2432,8 +2432,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64R2-N32 %s
 // CHECK-EL-HF64-64R2-N32: "-internal-isystem"
 // CHECK-EL-HF64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2459,8 +2459,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64R2-N32 %s
 // CHECK-EL-SF-64R2-N32: "-internal-isystem"
 // CHECK-EL-SF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2486,8 +2486,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64R2-N32 %s
 // CHECK-EL-NAN-64R2-N32: "-internal-isystem"
 // CHECK-EL-NAN-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2513,8 +2513,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-N32 %s
 // CHECK-EL-NAN64-64R2-N32: "-internal-isystem"
 // CHECK-EL-NAN64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2540,8 +2540,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64R2-64 %s
 // CHECK-EL-HF-64R2-64: "-internal-isystem"
 // CHECK-EL-HF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2567,8 +2567,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64R2-64 %s
 // CHECK-EL-HF64-64R2-64: "-internal-isystem"
 // CHECK-EL-HF64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2594,8 +2594,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64R2-64 %s
 // CHECK-EL-SF-64R2-64: "-internal-isystem"
 // CHECK-EL-SF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2621,8 +2621,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64R2-64 %s
 // CHECK-EL-NAN-64R2-64: "-internal-isystem"
 // CHECK-EL-NAN-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2648,8 +2648,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-64 %s
 // CHECK-EL-NAN64-64R2-64: "-internal-isystem"
 // CHECK-EL-NAN64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2675,8 +2675,8 @@
 //
 // = Little-endian, default (mips64r2), ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-64-DEF %s
 // CHECK-EL-NAN64-64R2-64-DEF: "-internal-isystem"
 // CHECK-EL-NAN64-64R2-64-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2704,8 +2704,8 @@
 //
 // = Big-endian, mips32r3, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r3 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r3 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R3 %s
 // CHECK-BE-HF-32R3: "-internal-isystem"
 // CHECK-BE-HF-32R3: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2731,8 +2731,8 @@
 //
 // = Big-endian, mips32r5, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r5 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r5 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R5 %s
 // CHECK-BE-HF-32R5: "-internal-isystem"
 // CHECK-BE-HF-32R5: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2758,8 +2758,8 @@
 //
 // = Big-endian, mips64r3, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r3 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r3 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R3-64 %s
 // CHECK-BE-HF-64R3-64: "-internal-isystem"
 // CHECK-BE-HF-64R3-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2785,8 +2785,8 @@
 //
 // = Big-endian, mips64r5, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r5 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r5 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R5-64 %s
 // CHECK-BE-HF-64R5-64: "-internal-isystem"
 // CHECK-BE-HF-64R5-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
diff --git a/test/Driver/mips-img-v2.cpp b/test/Driver/mips-img-v2.cpp
new file mode 100644
index 0000000..34cf3d726
--- /dev/null
+++ b/test/Driver/mips-img-v2.cpp
@@ -0,0 +1,337 @@
+// Check frontend and linker invocations on the IMG v2 MIPS toolchain.
+
+// -EB -mips32r6 -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-O32 %s
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-O32: "-internal-externc-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/lib/../usr/include"
+// EB-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-hard"
+// EB-HARD-O32: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-O32: "[[TC]]/mips-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-O32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/mips-r6-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/lib/../lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib"
+// EB-HARD-O32: "[[TC]]/mips-r6-hard/lib{{/|\\\\}}crtend.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mips64r6 -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips64r6 -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N32 %s
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N32: "-internal-externc-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/lib32/../usr/include"
+// EB-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-hard"
+// EB-HARD-N32: "-dynamic-linker" "/lib32/ld-linux-mipsn8.so.1"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EB-HARD-N32: "[[TC]]/mips-r6-hard/lib32{{/|\\\\}}crtbegin.o"
+// EB-HARD-N32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/mips-r6-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/lib/../lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32"
+// EB-HARD-N32: "[[TC]]/mips-r6-hard/lib32{{/|\\\\}}crtend.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EB -mips64r6 -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips64r6 -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N64 %s
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N64: "-internal-externc-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/lib64/../usr/include"
+// EB-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-hard"
+// EB-HARD-N64: "-dynamic-linker" "/lib64/ld-linux-mipsn8.so.1"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EB-HARD-N64: "[[TC]]/mips-r6-hard/lib64{{/|\\\\}}crtbegin.o"
+// EB-HARD-N64: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/mips-r6-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r6-hard/lib/../lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64"
+// EB-HARD-N64: "[[TC]]/mips-r6-hard/lib64{{/|\\\\}}crtend.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-O32 %s
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-O32: "-internal-externc-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../usr/include"
+// EL-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-hard"
+// EL-HARD-O32: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-O32: "[[TC]]/mipsel-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-O32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/mipsel-r6-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib"
+// EL-HARD-O32: "[[TC]]/mipsel-r6-hard/lib{{/|\\\\}}crtend.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips64r6 -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips64r6 -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N32 %s
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N32: "-internal-externc-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/lib32/../usr/include"
+// EL-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-hard"
+// EL-HARD-N32: "-dynamic-linker" "/lib32/ld-linux-mipsn8.so.1"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EL-HARD-N32: "[[TC]]/mipsel-r6-hard/lib32{{/|\\\\}}crtbegin.o"
+// EL-HARD-N32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/mipsel-r6-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32"
+// EL-HARD-N32: "[[TC]]/mipsel-r6-hard/lib32{{/|\\\\}}crtend.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EL -mips64r6 -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips64r6 -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N64 %s
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N64: "-internal-externc-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/lib64/../usr/include"
+// EL-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-hard"
+// EL-HARD-N64: "-dynamic-linker" "/lib64/ld-linux-mipsn8.so.1"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EL-HARD-N64: "[[TC]]/mipsel-r6-hard/lib64{{/|\\\\}}crtbegin.o"
+// EL-HARD-N64: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/mipsel-r6-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64"
+// EL-HARD-N64: "[[TC]]/mipsel-r6-hard/lib64{{/|\\\\}}crtend.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EB -mips32r6 -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -msoft-float \
+// RUN:   | FileCheck --check-prefix=EB-SOFT %s
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-SOFT: "-internal-externc-isystem"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/lib/../usr/include"
+// EB-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EB-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-soft"
+// EB-SOFT: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-SOFT: "[[TC]]/mips-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EB-SOFT: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-soft/lib"
+// EB-SOFT: "-L[[TC]]/mips-r6-soft/lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r6-soft/lib/../lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib"
+// EB-SOFT: "[[TC]]/mips-r6-soft/lib{{/|\\\\}}crtend.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -msoft-float \
+// RUN:   | FileCheck --check-prefix=EL-SOFT %s
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT: "-internal-externc-isystem"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/lib/../usr/include"
+// EL-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-soft"
+// EL-SOFT: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT: "[[TC]]/mipsel-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-soft/lib"
+// EL-SOFT: "-L[[TC]]/mipsel-r6-soft/lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r6-soft/lib/../lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib"
+// EL-SOFT: "[[TC]]/mipsel-r6-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mips32r6 -mhard-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -mhard-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EB-HARD-MICRO %s
+// EB-HARD-MICRO: "-internal-isystem"
+// EB-HARD-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-MICRO: "-internal-isystem"
+// EB-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib"
+// EB-HARD-MICRO: "-internal-isystem"
+// EB-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-MICRO: "-internal-externc-isystem"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/lib/../usr/include"
+// EB-HARD-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromips-r6-hard"
+// EB-HARD-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-MICRO: "[[TC]]/micromips-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromips-r6-hard/lib"
+// EB-HARD-MICRO: "-L[[TC]]/micromips-r6-hard/lib"
+// EB-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-hard/lib/../lib"
+// EB-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib"
+// EB-HARD-MICRO: "[[TC]]/micromips-r6-hard/lib{{/|\\\\}}crtend.o"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mips32r6 -msoft-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -msoft-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EB-SOFT-MICRO %s
+// EB-SOFT-MICRO: "-internal-isystem"
+// EB-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-SOFT-MICRO: "-internal-isystem"
+// EB-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib"
+// EB-SOFT-MICRO: "-internal-isystem"
+// EB-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-SOFT-MICRO: "-internal-externc-isystem"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/lib/../usr/include"
+// EB-SOFT-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EB-SOFT-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromips-r6-soft"
+// EB-SOFT-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-SOFT-MICRO: "[[TC]]/micromips-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EB-SOFT-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromips-r6-soft/lib"
+// EB-SOFT-MICRO: "-L[[TC]]/micromips-r6-soft/lib"
+// EB-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-soft/lib/../lib"
+// EB-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib"
+// EB-SOFT-MICRO: "[[TC]]/micromips-r6-soft/lib{{/|\\\\}}crtend.o"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -mhard-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -mhard-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EL-HARD-MICRO %s
+// EL-HARD-MICRO: "-internal-isystem"
+// EL-HARD-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-MICRO: "-internal-isystem"
+// EL-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib"
+// EL-HARD-MICRO: "-internal-isystem"
+// EL-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-MICRO: "-internal-externc-isystem"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/lib/../usr/include"
+// EL-HARD-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r6-hard"
+// EL-HARD-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-MICRO: "[[TC]]/micromipsel-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromipsel-r6-hard/lib"
+// EL-HARD-MICRO: "-L[[TC]]/micromipsel-r6-hard/lib"
+// EL-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-hard/lib/../lib"
+// EL-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib"
+// EL-HARD-MICRO: "[[TC]]/micromipsel-r6-hard/lib{{/|\\\\}}crtend.o"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -msoft-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -msoft-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EL-SOFT-MICRO %s
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT-MICRO: "-internal-externc-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/lib/../usr/include"
+// EL-SOFT-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r6-soft"
+// EL-SOFT-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromipsel-r6-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/micromipsel-r6-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-soft/lib/../lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r6-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
diff --git a/test/Driver/mips-img.cpp b/test/Driver/mips-img.cpp
index 389e0f7..9d8cfba 100644
--- a/test/Driver/mips-img.cpp
+++ b/test/Driver/mips-img.cpp
@@ -3,7 +3,7 @@
 // = Big-endian, mips32r6
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-img-linux-gnu -mips32r6 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-32R6 %s
 // CHECK-BE-32R6: "-internal-isystem"
 // CHECK-BE-32R6: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -30,7 +30,7 @@
 // = Little-endian, mips32r6
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-img-linux-gnu -mips32r6 -EL \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-32R6 %s
 // CHECK-LE-32R6: "-internal-isystem"
 // CHECK-LE-32R6: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -57,7 +57,7 @@
 // = Big-endian, mips64r6, N32
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -mabi=n32 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-64R6-N32 %s
 // CHECK-BE-64R6-N32: "-internal-isystem"
 // CHECK-BE-64R6-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -84,7 +84,7 @@
 // = Little-endian, mips64r6, N32
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=n32 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-64R6-N32 %s
 // CHECK-LE-64R6-N32: "-internal-isystem"
 // CHECK-LE-64R6-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -111,7 +111,7 @@
 // = Big-endian, mips64r6, N64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -mabi=64 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-64R6-N64 %s
 // CHECK-BE-64R6-N64: "-internal-isystem"
 // CHECK-BE-64R6-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -138,7 +138,7 @@
 // = Little-endian, mips64r6, N64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=64 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-64R6-N64 %s
 // CHECK-LE-64R6-N64: "-internal-isystem"
 // CHECK-LE-64R6-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
diff --git a/test/Driver/mips-mti-linux.c b/test/Driver/mips-mti-linux.c
index e3560e2..4835d79 100644
--- a/test/Driver/mips-mti-linux.c
+++ b/test/Driver/mips-mti-linux.c
@@ -8,7 +8,7 @@
 
 // = Big-endian, mips32r2, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-mti-linux -mips32r2 -mhard-float \
+// RUN:     --target=mips-mti-linux -mips32r2 -mhard-float -rtlib=platform \
 // RUN:     --sysroot=%S/Inputs/mips_mti_linux/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R2 %s
 //
@@ -26,7 +26,7 @@
 
 // = Little-endian, mips32r2, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-mti-linux -mips32r2 -EL -mhard-float \
+// RUN:     --target=mips-mti-linux -mips32r2 -EL -mhard-float -rtlib=platform \
 // RUN:     --sysroot=%S/Inputs/mips_mti_linux/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-HF-32R2 %s
 //
diff --git a/test/Driver/mips-mti.cpp b/test/Driver/mips-mti.cpp
new file mode 100644
index 0000000..147239c
--- /dev/null
+++ b/test/Driver/mips-mti.cpp
@@ -0,0 +1,449 @@
+// Check frontend and linker invocations on the MTI MIPS toolchain.
+
+// -EB -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-O32 %s
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard/lib"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-O32: "-internal-externc-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/lib/../usr/include"
+// EB-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard"
+// EB-HARD-O32: "-dynamic-linker" "/lib/ld.so.1"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-O32: "[[TC]]/mips-r2-hard/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-O32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/mips-r2-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/lib/../lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib"
+// EB-HARD-O32: "[[TC]]/mips-r2-hard/lib{{/|\\\\}}crtend.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N32 %s
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard/lib32"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N32: "-internal-externc-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/lib32/../usr/include"
+// EB-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard"
+// EB-HARD-N32: "-dynamic-linker" "/lib32/ld.so.1"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EB-HARD-N32: "[[TC]]/mips-r2-hard/lib32{{/|\\\\}}crtbegin.o"
+// EB-HARD-N32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/mips-r2-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/lib/../lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32"
+// EB-HARD-N32: "[[TC]]/mips-r2-hard/lib32{{/|\\\\}}crtend.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N64 %s
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard/lib64"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N64: "-internal-externc-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/lib64/../usr/include"
+// EB-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard"
+// EB-HARD-N64: "-dynamic-linker" "/lib64/ld.so.1"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EB-HARD-N64: "[[TC]]/mips-r2-hard/lib64{{/|\\\\}}crtbegin.o"
+// EB-HARD-N64: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/mips-r2-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r2-hard/lib/../lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64"
+// EB-HARD-N64: "[[TC]]/mips-r2-hard/lib64{{/|\\\\}}crtend.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-O32 %s
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard/lib"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-O32: "-internal-externc-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../usr/include"
+// EL-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard"
+// EL-HARD-O32: "-dynamic-linker" "/lib/ld.so.1"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-O32: "[[TC]]/mipsel-r2-hard/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-O32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/mipsel-r2-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib"
+// EL-HARD-O32: "[[TC]]/mipsel-r2-hard/lib{{/|\\\\}}crtend.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N32 %s
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard/lib32"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N32: "-internal-externc-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/lib32/../usr/include"
+// EL-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard"
+// EL-HARD-N32: "-dynamic-linker" "/lib32/ld.so.1"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EL-HARD-N32: "[[TC]]/mipsel-r2-hard/lib32{{/|\\\\}}crtbegin.o"
+// EL-HARD-N32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/mipsel-r2-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32"
+// EL-HARD-N32: "[[TC]]/mipsel-r2-hard/lib32{{/|\\\\}}crtend.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N64 %s
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard/lib64"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N64: "-internal-externc-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/lib64/../usr/include"
+// EL-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard"
+// EL-HARD-N64: "-dynamic-linker" "/lib64/ld.so.1"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EL-HARD-N64: "[[TC]]/mipsel-r2-hard/lib64{{/|\\\\}}crtbegin.o"
+// EL-HARD-N64: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/mipsel-r2-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64"
+// EL-HARD-N64: "[[TC]]/mipsel-r2-hard/lib64{{/|\\\\}}crtend.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EB -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -msoft-float \
+// RUN:   | FileCheck --check-prefix=EB-SOFT %s
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-soft/lib"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-SOFT: "-internal-externc-isystem"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/lib/../usr/include"
+// EB-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EB-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-soft"
+// EB-SOFT: "-dynamic-linker" "/lib/ld.so.1"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-SOFT: "[[TC]]/mips-r2-soft/lib{{/|\\\\}}crtbegin.o"
+// EB-SOFT: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-soft/lib"
+// EB-SOFT: "-L[[TC]]/mips-r2-soft/lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r2-soft/lib/../lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib"
+// EB-SOFT: "[[TC]]/mips-r2-soft/lib{{/|\\\\}}crtend.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -msoft-float \
+// RUN:   | FileCheck --check-prefix=EL-SOFT %s
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-soft/lib"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT: "-internal-externc-isystem"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/lib/../usr/include"
+// EL-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-soft"
+// EL-SOFT: "-dynamic-linker" "/lib/ld.so.1"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT: "[[TC]]/mipsel-r2-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-soft/lib"
+// EL-SOFT: "-L[[TC]]/mipsel-r2-soft/lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r2-soft/lib/../lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib"
+// EL-SOFT: "[[TC]]/mipsel-r2-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -muclibc
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -muclibc \
+// RUN:   | FileCheck --check-prefix=EB-HARD-UCLIBC %s
+// EB-HARD-UCLIBC: "-internal-isystem"
+// EB-HARD-UCLIBC: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-UCLIBC: "-internal-isystem"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard-uclibc/lib"
+// EB-HARD-UCLIBC: "-internal-isystem"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-UCLIBC: "-internal-externc-isystem"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/lib/../usr/include"
+// EB-HARD-UCLIBC: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-UCLIBC: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard-uclibc"
+// EB-HARD-UCLIBC: "-dynamic-linker" "/lib/ld-uClibc.so.0"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-UCLIBC: "[[TC]]/mips-r2-hard-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-UCLIBC: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib"
+// EB-HARD-UCLIBC: "-L[[TC]]/mips-r2-hard-uclibc/lib"
+// EB-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/lib/../lib"
+// EB-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib"
+// EB-HARD-UCLIBC: "[[TC]]/mips-r2-hard-uclibc/lib{{/|\\\\}}crtend.o"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -muclibc
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -muclibc \
+// RUN:   | FileCheck --check-prefix=EL-HARD-UCLIBC %s
+// EL-HARD-UCLIBC: "-internal-isystem"
+// EL-HARD-UCLIBC: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-UCLIBC: "-internal-isystem"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard-uclibc/lib"
+// EL-HARD-UCLIBC: "-internal-isystem"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-UCLIBC: "-internal-externc-isystem"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/lib/../usr/include"
+// EL-HARD-UCLIBC: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-UCLIBC: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc"
+// EL-HARD-UCLIBC: "-dynamic-linker" "/lib/ld-uClibc.so.0"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-UCLIBC: "[[TC]]/mipsel-r2-hard-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-UCLIBC: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib"
+// EL-HARD-UCLIBC: "-L[[TC]]/mipsel-r2-hard-uclibc/lib"
+// EL-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/lib/../lib"
+// EL-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib"
+// EL-HARD-UCLIBC: "[[TC]]/mipsel-r2-hard-uclibc/lib{{/|\\\\}}crtend.o"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-NAN2008 %s
+// EB-HARD-NAN2008: "-internal-isystem"
+// EB-HARD-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-NAN2008: "-internal-isystem"
+// EB-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard-nan2008/lib"
+// EB-HARD-NAN2008: "-internal-isystem"
+// EB-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-NAN2008: "-internal-externc-isystem"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/lib/../usr/include"
+// EB-HARD-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard-nan2008"
+// EB-HARD-NAN2008: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-NAN2008: "[[TC]]/mips-r2-hard-nan2008/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib"
+// EB-HARD-NAN2008: "-L[[TC]]/mips-r2-hard-nan2008/lib"
+// EB-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/lib/../lib"
+// EB-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib"
+// EB-HARD-NAN2008: "[[TC]]/mips-r2-hard-nan2008/lib{{/|\\\\}}crtend.o"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-NAN2008 %s
+// EL-HARD-NAN2008: "-internal-isystem"
+// EL-HARD-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-NAN2008: "-internal-isystem"
+// EL-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard-nan2008/lib"
+// EL-HARD-NAN2008: "-internal-isystem"
+// EL-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-NAN2008: "-internal-externc-isystem"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/lib/../usr/include"
+// EL-HARD-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008"
+// EL-HARD-NAN2008: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib"
+// EL-HARD-NAN2008: "-L[[TC]]/mipsel-r2-hard-nan2008/lib"
+// EL-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/lib/../lib"
+// EL-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib"
+// EL-HARD-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008/lib{{/|\\\\}}crtend.o"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -muclibc -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -muclibc -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-UCLIBC-NAN2008 %s
+// EB-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard-nan2008-uclibc/lib"
+// EB-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-UCLIBC-NAN2008: "-internal-externc-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/lib/../usr/include"
+// EB-HARD-UCLIBC-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-UCLIBC-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc"
+// EB-HARD-UCLIBC-NAN2008: "-dynamic-linker" "/lib/ld-uClibc-mipsn8.so.0"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/mips-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/mips-r2-hard-nan2008-uclibc/lib"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/lib/../lib"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/mips-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtend.o"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -muclibc -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -muclibc -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-UCLIBC-NAN2008 %s
+// EL-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard-nan2008-uclibc/lib"
+// EL-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-UCLIBC-NAN2008: "-internal-externc-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/lib/../usr/include"
+// EL-HARD-UCLIBC-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-UCLIBC-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc"
+// EL-HARD-UCLIBC-NAN2008: "-dynamic-linker" "/lib/ld-uClibc-mipsn8.so.0"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/mipsel-r2-hard-nan2008-uclibc/lib"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/lib/../lib"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtend.o"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -msoft-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -msoft-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EL-SOFT-MICRO %s
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/micromipsel-r2-soft/lib"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT-MICRO: "-internal-externc-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/lib/../usr/include"
+// EL-SOFT-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r2-soft"
+// EL-SOFT-MICRO: "-dynamic-linker" "/lib/ld.so.1"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r2-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/micromipsel-r2-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r2-soft/lib/../lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r2-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mmicromips -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mmicromips -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-MICRO-NAN2008 %s
+// EL-HARD-MICRO-NAN2008: "-internal-isystem"
+// EL-HARD-MICRO-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-MICRO-NAN2008: "-internal-isystem"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/micromipsel-r2-hard-nan2008/lib"
+// EL-HARD-MICRO-NAN2008: "-internal-isystem"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-MICRO-NAN2008: "-internal-externc-isystem"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/lib/../usr/include"
+// EL-HARD-MICRO-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-MICRO-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008"
+// EL-HARD-MICRO-NAN2008: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/micromipsel-r2-hard-nan2008/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-MICRO-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib"
+// EL-HARD-MICRO-NAN2008: "-L[[TC]]/micromipsel-r2-hard-nan2008/lib"
+// EL-HARD-MICRO-NAN2008: "-L[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/lib/../lib"
+// EL-HARD-MICRO-NAN2008: "-L[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/micromipsel-r2-hard-nan2008/lib{{/|\\\\}}crtend.o"
+// EL-HARD-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crtn.o"
diff --git a/test/Driver/modules.m b/test/Driver/modules.m
index ec82cc1..fd66478 100644
--- a/test/Driver/modules.m
+++ b/test/Driver/modules.m
@@ -39,13 +39,6 @@
 // RUN: %clang -fmodules-disable-diagnostic-validation -### %s 2>&1 | FileCheck -check-prefix=MODULES_DISABLE_DIAGNOSTIC_VALIDATION %s
 // MODULES_DISABLE_DIAGNOSTIC_VALIDATION: -fmodules-disable-diagnostic-validation
 
-// RUN: %clang -fmodules -### %s 2>&1 | FileCheck -check-prefix=MODULES_PREBUILT_PATH_DEFAULT %s
-// MODULES_PREBUILT_PATH_DEFAULT-NOT: -fprebuilt-module-path
-
-// RUN: %clang -fmodules -fprebuilt-module-path=foo -fprebuilt-module-path=bar -### %s 2>&1 | FileCheck -check-prefix=MODULES_PREBUILT_PATH %s
-// MODULES_PREBUILT_PATH: "-fprebuilt-module-path=foo"
-// MODULES_PREBUILT_PATH: "-fprebuilt-module-path=bar"
-
 // RUN: %clang -fmodules -fmodule-map-file=foo.map -fmodule-map-file=bar.map -### %s 2>&1 | FileCheck -check-prefix=CHECK-MODULE-MAP-FILES %s
 // CHECK-MODULE-MAP-FILES: "-fmodules"
 // CHECK-MODULE-MAP-FILES: "-fmodule-map-file=foo.map"
diff --git a/test/Driver/msc-version.c b/test/Driver/msc-version.c
index 18fe731..9246335 100644
--- a/test/Driver/msc-version.c
+++ b/test/Driver/msc-version.c
@@ -1,15 +1,4 @@
 //
-// Verify defaults
-//
-
-// RUN: %clang -target i686-windows -fms-compatibility -dM -E - </dev/null -o - | FileCheck %s -check-prefix CHECK-NO-MSC-VERSION
-
-// CHECK-NO-MSC-VERSION: _MSC_BUILD 1
-// CHECK-NO-MSC-VERSION: _MSC_FULL_VER 180000000
-// CHECK-NO-MSC-VERSION: _MSC_VER 1800
-
-
-//
 // Verify -fms-compatibility-version parsing
 //
 
diff --git a/test/Driver/msvc-compiler-rt.c b/test/Driver/msvc-compiler-rt.c
new file mode 100644
index 0000000..abbca50
--- /dev/null
+++ b/test/Driver/msvc-compiler-rt.c
@@ -0,0 +1,5 @@
+// RUN: %clang -target x86_64-pc-windows-msvc --rtlib=compiler-rt -### %s 2>&1 | FileCheck %s -check-prefix MSVC-COMPILER-RT
+// RUN: not %clang %s -target x86_64-pc-windows-msvc --rtlib=libgcc 2>&1 | FileCheck %s -check-prefix CHECK-ERROR
+
+// MSVC-COMPILER-RT: "{{.*}}clang_rt.builtins{{.*}}"
+// CHECK-ERROR: unsupported runtime library 'libgcc' for platform 'MSVC'
diff --git a/test/Driver/msvc-link.c b/test/Driver/msvc-link.c
index b44e382..8fe5733 100644
--- a/test/Driver/msvc-link.c
+++ b/test/Driver/msvc-link.c
@@ -10,3 +10,9 @@
 // DLL: "-defaultlib:libcmt"
 // DLL: "-nologo"
 // DLL: "-dll"
+
+// RUN: %clang -target i686-pc-windows-msvc -L/var/empty -L/usr/lib -### %s 2>&1 | FileCheck --check-prefix LIBPATH %s
+// LIBPATH: "-libpath:/var/empty"
+// LIBPATH: "-libpath:/usr/lib"
+// LIBPATH: "-nologo"
+
diff --git a/test/Driver/msvc-triple.c b/test/Driver/msvc-triple.c
index f181b31..cb0c338 100644
--- a/test/Driver/msvc-triple.c
+++ b/test/Driver/msvc-triple.c
@@ -1,9 +1,7 @@
-// RUN: %clang -target i686-pc-windows-msvc   -S -emit-llvm %s -o - | FileCheck %s --check-prefix=DEFAULT
 // RUN: %clang -target i686-pc-windows-msvc19 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=TARGET-19
 // RUN: %clang -target i686-pc-windows-msvc   -S -emit-llvm %s -o - -fms-compatibility-version=19 | FileCheck %s --check-prefix=OVERRIDE-19
 // RUN: %clang -target i686-pc-windows-msvc-elf -S -emit-llvm %s -o - | FileCheck %s --check-prefix=ELF-DEFAULT
 
-// DEFAULT:     target triple = "i686-pc-windows-msvc18.0.0"
 // TARGET-19:   target triple = "i686-pc-windows-msvc19.0.0"
 // OVERRIDE-19: target triple = "i686-pc-windows-msvc19.0.0"
-// ELF-DEFAULT: target triple = "i686-pc-windows-msvc18.0.0-elf"
+// ELF-DEFAULT: target triple = "i686-pc-windows-msvc{{.*}}-elf"
diff --git a/test/Driver/myriad-toolchain.c b/test/Driver/myriad-toolchain.c
index 6c94cff..3e580ac 100644
--- a/test/Driver/myriad-toolchain.c
+++ b/test/Driver/myriad-toolchain.c
@@ -36,10 +36,10 @@
 // As such, we test only for a trailing quote in its rendering.
 // The same goes for "moviAsm".
 
-// RUN: %clang -target shave-myriad -c -### %s -isystem somewhere -Icommon -Wa,-yippee 2>&1 \
+// RUN: %clang -target shave-myriad -mcpu=myriad2.2 -c -### %s -isystem somewhere -Icommon -Wa,-yippee 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=MOVICOMPILE
-// MOVICOMPILE: moviCompile{{(.exe)?}}" "-S" "-fno-exceptions" "-mcpu=myriad2" "-DMYRIAD2" "-isystem" "somewhere" "-I" "common"
-// MOVICOMPILE: moviAsm{{(.exe)?}}" "-no6thSlotCompression" "-cv:myriad2" "-noSPrefixing" "-a"
+// MOVICOMPILE: moviCompile{{(.exe)?}}" "-S" "-fno-exceptions" "-DMYRIAD2" "-mcpu=myriad2.2" "-isystem" "somewhere" "-I" "common"
+// MOVICOMPILE: moviAsm{{(.exe)?}}" "-no6thSlotCompression" "-cv:myriad2.2" "-noSPrefixing" "-a"
 // MOVICOMPILE: "-yippee" "-i:somewhere" "-i:common" "-elf"
 
 // RUN: %clang -target shave-myriad -c -### %s -DEFINE_ME -UNDEFINE_ME 2>&1 \
@@ -58,15 +58,15 @@
 
 // RUN: %clang -target shave-myriad -c %s -o foo.o -### -MD -MF dep.d 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=MDMF
-// MDMF: "-S" "-fno-exceptions" "-mcpu=myriad2" "-DMYRIAD2" "-MD" "-MF" "dep.d" "-MT" "foo.o"
+// MDMF: "-S" "-fno-exceptions" "-DMYRIAD2" "-MD" "-MF" "dep.d" "-MT" "foo.o"
 
-// RUN: %clang -target shave-myriad -std=gnu++11 -S %s -o foo.o -### 2>&1 \
+// RUN: %clang -target shave-myriad -std=gnu++11 -mcpu=anothercpu -S %s -o foo.o -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=STDEQ
-// STDEQ: "-S" "-fno-exceptions" "-mcpu=myriad2" "-DMYRIAD2" "-std=gnu++11"
+// STDEQ: "-S" "-fno-exceptions" "-DMYRIAD2" "-std=gnu++11" "-mcpu=anothercpu"
 
 // RUN: %clang -target shave-myriad -E -Ifoo %s -o foo.i -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=PREPROCESS
-// PREPROCESS: "-E" "-mcpu=myriad2" "-DMYRIAD2" "-I" "foo"
+// PREPROCESS: "-E" "-DMYRIAD2" "-I" "foo"
 
 // RUN: %clang -target sparc-myriad -### --driver-mode=g++ %s 2>&1 | FileCheck %s --check-prefix=STDLIBCXX
 // STDLIBCXX: "-lstdc++" "-lc" "-lgcc"
@@ -77,3 +77,7 @@
 
 // RUN: %clang -### -c -g %s -target sparc-myriad 2>&1 | FileCheck -check-prefix=G_SPARC %s
 // G_SPARC: "-debug-info-kind=limited" "-dwarf-version=2"
+
+// RUN: %clang -### -c %s -target sparc-myriad-elf -fuse-init-array 2>&1 \
+// RUN: | FileCheck -check-prefix=USE-INIT-ARRAY %s
+// USE-INIT-ARRAY-NOT: argument unused
diff --git a/test/Driver/netbsd.c b/test/Driver/netbsd.c
index ffaab36..1a87d8e 100644
--- a/test/Driver/netbsd.c
+++ b/test/Driver/netbsd.c
@@ -1,5 +1,15 @@
 // RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=STATIC %s
+// RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
+// RUN: -pie --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=PIE %s
+// RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
+// RUN: -shared --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SHARED %s
+
+// RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
+// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64 %s
 // RUN: %clang -no-canonical-prefixes -target x86_64--netbsd7.0.0 \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
@@ -86,12 +96,18 @@
 // RUN: %clang -no-canonical-prefixes -target arm--netbsd6.0.0-eabi -static \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-ARM-6 %s
-// RUN: %clang -no-canonical-prefixes -target sparc--netbsd -static \
+// RUN: %clang -no-canonical-prefixes -target sparc--netbsd7.0.0 -static \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
-// RUN: | FileCheck -check-prefix=S-SPARC %s
-// RUN: %clang -no-canonical-prefixes -target sparc64--netbsd -static \
+// RUN: | FileCheck -check-prefix=S-SPARC-7 %s
+// RUN: %clang -no-canonical-prefixes -target sparc--netbsd6.0.0 -static \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
-// RUN: | FileCheck -check-prefix=S-SPARC64 %s
+// RUN: | FileCheck -check-prefix=S-SPARC-6 %s
+// RUN: %clang -no-canonical-prefixes -target sparc64--netbsd7.0.0 -static \
+// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=S-SPARC64-7 %s
+// RUN: %clang -no-canonical-prefixes -target sparc64--netbsd6.0.0 -static \
+// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=S-SPARC64-6 %s
 // RUN: %clang -no-canonical-prefixes -target powerpc--netbsd -static \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC %s
@@ -99,6 +115,32 @@
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC64 %s
 
+// STATIC: ld{{.*}}" "--eh-frame-hdr"
+// STATIC-NOT: "-pie"
+// STATIC-NOT: "-Bshareable"
+// STATIC: "-dynamic-linker" "/libexec/ld.elf_so"
+// STATIC-NOT: "-pie"
+// STATIC-NOT: "-Bshareable"
+// STATIC: "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// STATIC: "{{.*}}/usr/lib{{/|\\\\}}crti.o" "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o"
+// STATIC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// SHARED: ld{{.*}}" "--eh-frame-hdr"
+// SHARED-NOT: "-pie"
+// SHARED-NOT: "-dynamic-linker"
+// SHARED-NOT: "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// SHARED: "{{.*}}/usr/lib{{/|\\\\}}crti.o" "{{.*}}/usr/lib{{/|\\\\}}crtbeginS.o"
+// SHARED: "{{.*}}/usr/lib{{/|\\\\}}crtendS.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// PIE: ld{{.*}}" "--eh-frame-hdr"
+// PIE-NOT: "-Bshareable"
+// PIE: "-pie" "-dynamic-linker" "/libexec/ld.elf_so"
+// PIE-NOT: "-Bshareable"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crtbeginS.o"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crtendS.o"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
 // X86_64: clang{{.*}}" "-cc1" "-triple" "x86_64--netbsd"
 // X86_64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
 // X86_64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
@@ -302,22 +344,37 @@
 // S-ARM-6: "-lgcc_eh" "-lc" "-lgcc"
 // S-ARM-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
-// S-SPARC: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd"
-// S-SPARC: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
-// S-SPARC: "-m" "elf32_sparc"
-// S-SPARC: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
-// S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
-// S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc"
-// S-SPARC: "-lgcc_eh" "-lc" "-lgcc"
-// S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+// S-SPARC-6: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd6.0.0"
+// S-SPARC-6: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC-6: "-m" "elf32_sparc"
+// S-SPARC-6: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// S-SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
+// S-SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc"
+// S-SPARC-6: "-lgcc_eh" "-lc" "-lgcc"
+// S-SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
-// S-SPARC64: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd"
-// S-SPARC64: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
-// S-SPARC64: "-m" "elf64_sparc"
-// S-SPARC64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
-// S-SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc"
-// S-SPARC64: "-lgcc_eh" "-lc" "-lgcc"
-// S-SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+// S-SPARC-7: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd7.0.0"
+// S-SPARC-7: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC-7: "-m" "elf32_sparc"
+// S-SPARC-7: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// S-SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
+// S-SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc"
+// S-SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// S-SPARC64-6: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd6.0.0"
+// S-SPARC64-6: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC64-6: "-m" "elf64_sparc"
+// S-SPARC64-6: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// S-SPARC64-6: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc"
+// S-SPARC64-6: "-lgcc_eh" "-lc" "-lgcc"
+// S-SPARC64-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// S-SPARC64-7: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd7.0.0"
+// S-SPARC64-7: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC64-7: "-m" "elf64_sparc"
+// S-SPARC64-7: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// S-SPARC64-7: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc"
+// S-SPARC64-7: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
 // S-POWERPC: clang{{.*}}" "-cc1" "-triple" "powerpc--netbsd"
 // S-POWERPC: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
diff --git a/test/Driver/netbsd.cpp b/test/Driver/netbsd.cpp
index e386a21..104d03e 100644
--- a/test/Driver/netbsd.cpp
+++ b/test/Driver/netbsd.cpp
@@ -1,69 +1,93 @@
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd7.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd6.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd6.0.0-eabi \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=ARM %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd7.0.0-eabi \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=ARM-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=AARCH64 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd7.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=AARCH64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd6.0.0 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-6 %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd7.0.0 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC64 %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd6.0.0 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC64-6 %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd7.0.0 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=POWERPC %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=POWERPC64 %s
 
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-X86_64 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd7.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-X86_64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd6.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-X86_64-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd6.0.0-eabi -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-ARM %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd7.0.0-eabi -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-ARM-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-AARCH64 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd7.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-AARCH64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd6.0.0 -static \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=S-SPARC-6 %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd7.0.0 -static \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=S-SPARC-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC64 %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd6.0.0 -static \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=S-SPARC64-6 %s
+// RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd7.0.0 -static \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=S-SPARC64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC64 %s
 
 // X86_64: clang{{.*}}" "-cc1" "-triple" "x86_64--netbsd"
@@ -116,17 +140,47 @@
 // SPARC: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
 // SPARC: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
 // SPARC: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
-// SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
-// SPARC: "-lm" "-lc" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
+// SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// SPARC: "-lm" "-lc"
 // SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
+// SPARC-7: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd7.0.0"
+// SPARC-7: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
+// SPARC-7: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
+// SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// SPARC-7: "-lm" "-lc"
+// SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// SPARC-6: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd6.0.0"
+// SPARC-6: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
+// SPARC-6: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
+// SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
+// SPARC-6: "-lm" "-lc" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
+// SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
 // SPARC64: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd"
 // SPARC64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
 // SPARC64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
-// SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
-// SPARC64: "-lm" "-lc" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
+// SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// SPARC64: "-lm" "-lc"
 // SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
+// SPARC64-7: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd7.0.0"
+// SPARC64-7: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
+// SPARC64-7: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// SPARC64-7: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// SPARC64-7: "-lm" "-lc"
+// SPARC64-7: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// SPARC64-6: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd6.0.0"
+// SPARC64-6: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
+// SPARC64-6: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// SPARC64-6: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
+// SPARC64-6: "-lm" "-lc" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
+// SPARC64-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
 // POWERPC: clang{{.*}}" "-cc1" "-triple" "powerpc--netbsd"
 // POWERPC: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
 // POWERPC: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
@@ -191,17 +245,47 @@
 // S-SPARC: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
 // S-SPARC: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
 // S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
-// S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
-// S-SPARC: "-lm" "-lc" "-lgcc_eh" "-lc" "-lgcc"
+// S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// S-SPARC: "-lm" "-lc"
 // S-SPARC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
+// S-SPARC-7: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd7.0.0"
+// S-SPARC-7: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC-7: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// S-SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
+// S-SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// S-SPARC-7: "-lm" "-lc"
+// S-SPARC-7: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// S-SPARC-6: clang{{.*}}" "-cc1" "-triple" "sparc--netbsd6.0.0"
+// S-SPARC-6: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC-6: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// S-SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}sparc{{/|\\\\}}crti.o"
+// S-SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
+// S-SPARC-6: "-lm" "-lc" "-lgcc_eh" "-lc" "-lgcc"
+// S-SPARC-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
 // S-SPARC64: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd"
 // S-SPARC64: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
 // S-SPARC64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
-// S-SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
-// S-SPARC64: "-lm" "-lc" "-lgcc_eh" "-lc" "-lgcc"
+// S-SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// S-SPARC64: "-lm" "-lc"
 // S-SPARC64: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
 
+// S-SPARC64-7: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd7.0.0"
+// S-SPARC64-7: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC64-7: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// S-SPARC64-7: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lc++"
+// S-SPARC64-7: "-lm" "-lc"
+// S-SPARC64-7: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// S-SPARC64-6: clang{{.*}}" "-cc1" "-triple" "sparc64--netbsd6.0.0"
+// S-SPARC64-6: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
+// S-SPARC64-6: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// S-SPARC64-6: "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o" "{{.*}}.o" "-lstdc++"
+// S-SPARC64-6: "-lm" "-lc" "-lgcc_eh" "-lc" "-lgcc"
+// S-SPARC64-6: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
 // S-POWERPC: clang{{.*}}" "-cc1" "-triple" "powerpc--netbsd"
 // S-POWERPC: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
 // S-POWERPC: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
diff --git a/test/Driver/noinline.c b/test/Driver/noinline.c
index e665b2f..70f950c 100644
--- a/test/Driver/noinline.c
+++ b/test/Driver/noinline.c
@@ -3,7 +3,7 @@
 
 // RUN: %clang -target x86_64-apple-darwin10 \
 // RUN:   -fno-inline -fno-inline-functions -### -fsyntax-only %s 2> %t
-// RUN: FileCheck --check-prefix=CHECK < %t %s
+// RUN: FileCheck < %t %s
 
 // CHECK: clang
 // CHECK: "-fno-inline"
diff --git a/test/Driver/nostdlib.c b/test/Driver/nostdlib.c
index 47c6f8b..6e7bc0e 100644
--- a/test/Driver/nostdlib.c
+++ b/test/Driver/nostdlib.c
@@ -22,6 +22,10 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir -lclang_rt.builtins-i686 \
 // RUN:   | FileCheck --check-prefix=CHECK-LINUX-NOSTDLIB %s
 //
+// RUN: %clang -target x86_64-pc-windows-msvc -nostdlib --rtlib=compiler-rt -### %s 2>&1 | FileCheck %s -check-prefix CHECK-MSVC-NOSTDLIB
+// RUN: %clang -target x86_64-pc-windows-msvc --rtlib=compiler-rt -nostdlib -### %s 2>&1 | FileCheck %s -check-prefix CHECK-MSVC-NOSTDLIB
+//
 // CHECK-LINUX-NOSTDLIB: warning: argument unused during compilation: '--rtlib=compiler-rt'
 // CHECK-LINUX-NOSTDLIB: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-LINUX-NOSTDLIB-NOT: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.builtins-i686.a"
+// CHECK-MSVC-NOSTDLIB: warning: argument unused during compilation: '--rtlib=compiler-rt'
diff --git a/test/Driver/nozlibcompress.c b/test/Driver/nozlibcompress.c
index 4eac066..9986c85 100644
--- a/test/Driver/nozlibcompress.c
+++ b/test/Driver/nozlibcompress.c
@@ -1,5 +1,5 @@
 // RUN: %clang -c %s -Wa,--compress-debug-sections 2>&1 | FileCheck %s
-// RUN: %clang -c %s -Wa,--compress-debug-sections -Wa,--nocompress-debug-sections 2>&1 | FileCheck --check-prefix=NOWARN %s
+// RUN: %clang -c %s -Wa,--compress-debug-sections -Wa,--nocompress-debug-sections 2>&1 | FileCheck --allow-empty --check-prefix=NOWARN %s
 // REQUIRES: nozlib
 
 // CHECK: warning: cannot compress debug sections (zlib not installed)
diff --git a/test/Driver/objc-weak.m b/test/Driver/objc-weak.m
index ff60759..68ae26e 100644
--- a/test/Driver/objc-weak.m
+++ b/test/Driver/objc-weak.m
@@ -10,9 +10,9 @@
 // ARC-NO-WEAK: -fobjc-arc
 // ARC-NO-WEAK: -fno-objc-weak
 
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-arc -fobjc-weak 2>&1 | FileCheck %s --check-prefix ARC-WEAK-UNSUPPORTED
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak -fobjc-arc  2>&1 | FileCheck %s --check-prefix ARC-WEAK-UNSUPPORTED
-// ARC-WEAK-UNSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-arc -fobjc-weak 2>&1 | FileCheck %s --check-prefix ARC-WEAK-NOTSUPPORTED
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak -fobjc-arc  2>&1 | FileCheck %s --check-prefix ARC-WEAK-NOTSUPPORTED
+// ARC-WEAK-NOTSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
 
 // RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.7 -S -### %s -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK
 // RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.7 -S -### %s -fno-objc-weak -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK
@@ -22,6 +22,6 @@
 // RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.7 -S -### %s -fobjc-weak -fno-objc-weak 2>&1 | FileCheck %s --check-prefix MRC-NO-WEAK
 // MRC-NO-WEAK: -fno-objc-weak
 
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-UNSUPPORTED
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-UNSUPPORTED
-// MRC-WEAK-UNSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-NOTSUPPORTED
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-NOTSUPPORTED
+// MRC-WEAK-NOTSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
diff --git a/test/Driver/offloading-interoperability.c b/test/Driver/offloading-interoperability.c
new file mode 100644
index 0000000..ab51322
--- /dev/null
+++ b/test/Driver/offloading-interoperability.c
@@ -0,0 +1,17 @@
+// REQUIRES: clang-driver
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+//
+// Verify that CUDA device commands do not get OpenMP flags.
+//
+// RUN: %clang -no-canonical-prefixes -### -x cuda -target powerpc64le-linux-gnu -std=c++11 --cuda-gpu-arch=sm_35 -fopenmp=libomp %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix NO-OPENMP-FLAGS-FOR-CUDA-DEVICE
+//
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE:      clang{{.*}}" "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE-NOT:  -fopenmp
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE-NEXT: ptxas" "-m64"
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE-NEXT: fatbinary" "--cuda" "-64"
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE-NEXT: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux-gnu"
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE:      -fopenmp
+// NO-OPENMP-FLAGS-FOR-CUDA-DEVICE-NEXT: {{ld(.exe)?"}} {{.*}}"-m" "elf64lppc"
diff --git a/test/Driver/opencl.cl b/test/Driver/opencl.cl
new file mode 100644
index 0000000..d68d424
--- /dev/null
+++ b/test/Driver/opencl.cl
@@ -0,0 +1,37 @@
+// RUN: %clang -S -### -cl-std=CL %s 2>&1 | FileCheck --check-prefix=CHECK-CL %s
+// RUN: %clang -S -### -cl-std=CL1.1 %s 2>&1 | FileCheck --check-prefix=CHECK-CL11 %s
+// RUN: %clang -S -### -cl-std=CL1.2 %s 2>&1 | FileCheck --check-prefix=CHECK-CL12 %s
+// RUN: %clang -S -### -cl-std=CL2.0 %s 2>&1 | FileCheck --check-prefix=CHECK-CL20 %s
+// RUN: %clang -S -### -cl-opt-disable %s 2>&1 | FileCheck --check-prefix=CHECK-OPT-DISABLE %s
+// RUN: %clang -S -### -cl-strict-aliasing %s 2>&1 | FileCheck --check-prefix=CHECK-STRICT-ALIASING %s
+// RUN: %clang -S -### -cl-single-precision-constant %s 2>&1 | FileCheck --check-prefix=CHECK-SINGLE-PRECISION-CONST %s
+// RUN: %clang -S -### -cl-finite-math-only %s 2>&1 | FileCheck --check-prefix=CHECK-FINITE-MATH-ONLY %s
+// RUN: %clang -S -### -cl-kernel-arg-info %s 2>&1 | FileCheck --check-prefix=CHECK-KERNEL-ARG-INFO %s
+// RUN: %clang -S -### -cl-unsafe-math-optimizations %s 2>&1 | FileCheck --check-prefix=CHECK-UNSAFE-MATH-OPT %s
+// RUN: %clang -S -### -cl-fast-relaxed-math %s 2>&1 | FileCheck --check-prefix=CHECK-FAST-RELAXED-MATH %s
+// RUN: %clang -S -### -cl-mad-enable %s 2>&1 | FileCheck --check-prefix=CHECK-MAD-ENABLE %s
+// RUN: %clang -S -### -cl-no-signed-zeros %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SIGNED-ZEROS %s
+// RUN: %clang -S -### -cl-denorms-are-zero %s 2>&1 | FileCheck --check-prefix=CHECK-DENORMS-ARE-ZERO %s
+// RUN: %clang -S -### -cl-fp32-correctly-rounded-divide-sqrt %s 2>&1 | FileCheck --check-prefix=CHECK-ROUND-DIV %s
+// RUN: not %clang -cl-std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
+// RUN: not %clang -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
+
+// CHECK-CL: "-cc1" {{.*}} "-cl-std=CL"
+// CHECK-CL11: "-cc1" {{.*}} "-cl-std=CL1.1"
+// CHECK-CL12: "-cc1" {{.*}} "-cl-std=CL1.2"
+// CHECK-CL20: "-cc1" {{.*}} "-cl-std=CL2.0"
+// CHECK-OPT-DISABLE: "-cc1" {{.*}} "-cl-opt-disable"
+// CHECK-STRICT-ALIASING: "-cc1" {{.*}} "-cl-strict-aliasing"
+// CHECK-SINGLE-PRECISION-CONST: "-cc1" {{.*}} "-cl-single-precision-constant"
+// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-cl-finite-math-only"
+// CHECK-KERNEL-ARG-INFO: "-cc1" {{.*}} "-cl-kernel-arg-info"
+// CHECK-UNSAFE-MATH-OPT: "-cc1" {{.*}} "-cl-unsafe-math-optimizations"
+// CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
+// CHECK-MAD-ENABLE: "-cc1" {{.*}} "-cl-mad-enable"
+// CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros"
+// CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero"
+// CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt"
+// CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
+// CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
+
+kernel void func(void);
diff --git a/test/Driver/output-file-cleanup.c b/test/Driver/output-file-cleanup.c
index 065df8f..314af4d 100644
--- a/test/Driver/output-file-cleanup.c
+++ b/test/Driver/output-file-cleanup.c
@@ -1,3 +1,5 @@
+// RUN: rm -f "%t.d" "%t1.s" "%t2.s" "%t3.s" "%t4.s" "%t5.s"
+//
 // RUN: touch %t.s
 // RUN: not %clang -S -DCRASH -o %t.s -MMD -MF %t.d %s
 // RUN: test ! -f %t.s
@@ -36,6 +38,9 @@
 // RUN: test -f %t1.s
 // RUN: test ! -f %t2.s
 
+// When given multiple .c files to compile, clang compiles them in order until
+// it hits an error, at which point it stops.
+//
 // RUN: touch %t1.c
 // RUN: echo "invalid C code" > %t2.c
 // RUN: touch %t3.c
@@ -44,6 +49,6 @@
 // RUN: cd %T && not %clang -S %t1.c %t2.c %t3.c %t4.c %t5.c
 // RUN: test -f %t1.s
 // RUN: test ! -f %t2.s
-// RUN: test -f %t3.s
+// RUN: test ! -f %t3.s
 // RUN: test ! -f %t4.s
-// RUN: test -f %t5.s
+// RUN: test ! -f %t5.s
diff --git a/test/Driver/pic.c b/test/Driver/pic.c
index aeb2ee3..9f9d09c 100644
--- a/test/Driver/pic.c
+++ b/test/Driver/pic.c
@@ -3,24 +3,26 @@
 //
 // CHECK-NO-PIC: "-mrelocation-model" "static"
 // CHECK-NO-PIC-NOT: "-pic-level"
-// CHECK-NO-PIC-NOT: "-pie-level"
+// CHECK-NO-PIC-NOT: "-pic-is-pie"
 //
 // CHECK-PIC1: "-mrelocation-model" "pic"
 // CHECK-PIC1: "-pic-level" "1"
+// CHECK-PIC1-NOT: "-pic-is-pie"
 //
 // CHECK-PIC2: "-mrelocation-model" "pic"
 // CHECK-PIC2: "-pic-level" "2"
+// CHECK-PIC2-NOT: "-pic-is-pie"
 //
 // CHECK-STATIC: "-static"
 // CHECK-NO-STATIC-NOT: "-static"
 //
 // CHECK-PIE1: "-mrelocation-model" "pic"
 // CHECK-PIE1: "-pic-level" "1"
-// CHECK-PIE1: "-pie-level" "1"
+// CHECK-PIE1: "-pic-is-pie"
 //
 // CHECK-PIE2: "-mrelocation-model" "pic"
 // CHECK-PIE2: "-pic-level" "2"
-// CHECK-PIE2: "-pie-level" "2"
+// CHECK-PIE2: "-pic-is-pie"
 //
 // CHECK-PIE-LD: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PIE-LD: "-pie"
@@ -31,11 +33,11 @@
 //
 // CHECK-DYNAMIC-NO-PIC-32: "-mrelocation-model" "dynamic-no-pic"
 // CHECK-DYNAMIC-NO-PIC-32-NOT: "-pic-level"
-// CHECK-DYNAMIC-NO-PIC-32-NOT: "-pie-level"
+// CHECK-DYNAMIC-NO-PIC-32-NOT: "-pic-is-pie"
 //
 // CHECK-DYNAMIC-NO-PIC-64: "-mrelocation-model" "dynamic-no-pic"
 // CHECK-DYNAMIC-NO-PIC-64: "-pic-level" "2"
-// CHECK-DYNAMIC-NO-PIC-64-NOT: "-pie-level"
+// CHECK-DYNAMIC-NO-PIC-64-NOT: "-pic-is-pie"
 //
 // CHECK-NON-DARWIN-DYNAMIC-NO-PIC: error: unsupported option '-mdynamic-no-pic' for target 'i386-unknown-unknown'
 //
@@ -151,10 +153,9 @@
 // RUN:   | FileCheck %s --check-prefix=CHECK-NO-PIE
 //
 // Darwin is a beautiful and unique snowflake when it comes to these flags.
-// When targeting a 32-bit darwin system, the -fno-* flag variants work and
-// disable PIC, but any other flag enables PIC (*not* PIE) even if the flag
-// specifies PIE. On 64-bit targets, there is simply nothing you can do, there
-// is no PIE, there is only PIC when it comes to compilation.
+// When targeting a 32-bit darwin system, only level 2 is supported. On 64-bit
+// targets, there is simply nothing you can do, there is no PIE, there is only
+// PIC when it comes to compilation.
 // RUN: %clang -c %s -target i386-apple-darwin -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target i386-apple-darwin -fpic -### 2>&1 \
@@ -162,9 +163,9 @@
 // RUN: %clang -c %s -target i386-apple-darwin -fPIC -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target i386-apple-darwin -fpie -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
+// RUN:   | FileCheck %s --check-prefix=CHECK-PIE2
 // RUN: %clang -c %s -target i386-apple-darwin -fPIE -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
+// RUN:   | FileCheck %s --check-prefix=CHECK-PIE2
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIC -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-NO-PIC
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIE -### 2>&1 \
@@ -172,7 +173,7 @@
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIC -fpic -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIC -fPIE -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
+// RUN:   | FileCheck %s --check-prefix=CHECK-PIE2
 // RUN: %clang -c %s -target x86_64-apple-darwin -fno-PIC -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target x86_64-apple-darwin -fno-PIE -### 2>&1 \
diff --git a/test/Driver/preserve-as-comments.c b/test/Driver/preserve-as-comments.c
new file mode 100644
index 0000000..7e623e4
--- /dev/null
+++ b/test/Driver/preserve-as-comments.c
@@ -0,0 +1,2 @@
+// RUN: %clang -S -fno-preserve-as-comments %s -### 2>&1 | FileCheck %s
+// CHECK: "-fno-preserve-as-comments"
diff --git a/test/Driver/ps4-header-search.c b/test/Driver/ps4-header-search.c
index 15e093f..3afef69 100644
--- a/test/Driver/ps4-header-search.c
+++ b/test/Driver/ps4-header-search.c
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
 
-// RUN: env SCE_PS4_SDK_DIR=%S/Inputs/scei-ps4_tree %clang -target x86_64-scei-ps4 -E -v %s 2>&1 | FileCheck %s --check-prefix=ENVPS4
+// RUN: env SCE_ORBIS_SDK_DIR=%S/Inputs/scei-ps4_tree %clang -target x86_64-scei-ps4 -E -v %s 2>&1 | FileCheck %s --check-prefix=ENVPS4
 // ENVPS4: Inputs/scei-ps4_tree/target/include{{$}}
 // ENVPS4: Inputs/scei-ps4_tree/target/include_common{{$}}
 
diff --git a/test/Driver/ps4-linker-non-win.c b/test/Driver/ps4-linker-non-win.c
index 1fce6d6..e2f8386 100644
--- a/test/Driver/ps4-linker-non-win.c
+++ b/test/Driver/ps4-linker-non-win.c
@@ -2,9 +2,9 @@
 // REQUIRES: x86-registered-target
 
 // RUN: mkdir -p %T/Output
-// RUN: rm -f %T/Output/ps4-ld
-// RUN: touch %T/Output/ps4-ld
-// RUN: chmod +x %T/Output/ps4-ld
+// RUN: rm -f %T/Output/orbis-ld
+// RUN: touch %T/Output/orbis-ld
+// RUN: chmod +x %T/Output/orbis-ld
 
 // RUN: env "PATH=%T/Output:%PATH%" %clang -### -target x86_64-scei-ps4  %s -fuse-ld=gold 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-LINKER %s
@@ -18,4 +18,4 @@
 // RUN: env "PATH=%T/Output:%PATH%" %clang -### -target x86_64-scei-ps4  %s -shared \
 // RUN:     -fuse-ld=ps4 2>&1 | FileCheck --check-prefix=CHECK-PS4-LINKER %s
 
-// CHECK-PS4-LINKER: Output/ps4-ld
+// CHECK-PS4-LINKER: /orbis-ld
diff --git a/test/Driver/ps4-linker-win.c b/test/Driver/ps4-linker-win.c
index e42fc96..6fbd84b 100644
--- a/test/Driver/ps4-linker-win.c
+++ b/test/Driver/ps4-linker-win.c
@@ -7,21 +7,20 @@
 
 // REQUIRES: system-windows, x86-registered-target
 
-// RUN: touch %T/ps4-ld.exe
-// RUN: touch %T/ps4-ld.gold.exe
+// RUN: touch %T/orbis-ld.exe
+// RUN: touch %T/orbis-ld.gold.exe
 
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -fuse-ld=gold -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -fuse-ld=gold -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-GOLD %s
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -shared -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -shared -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-GOLD %s
 
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-LINKER %s
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -fuse-ld=ps4 -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -fuse-ld=ps4 -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-LINKER %s
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -shared \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -shared \
 // RUN:     -fuse-ld=ps4 -### 2>&1 | FileCheck --check-prefix=CHECK-PS4-LINKER %s
 
-// FIXME: "Output\\" is hardcoded part of %T.
-// CHECK-PS4-GOLD: Output\\ps4-ld.gold.exe"
-// CHECK-PS4-LINKER: Output\\ps4-ld.exe"
+// CHECK-PS4-GOLD: \\orbis-ld.gold
+// CHECK-PS4-LINKER: \\orbis-ld
diff --git a/test/Driver/ps4-pic.c b/test/Driver/ps4-pic.c
index 0cf9ad5..c023dcf 100644
--- a/test/Driver/ps4-pic.c
+++ b/test/Driver/ps4-pic.c
@@ -6,7 +6,7 @@
 //
 // CHECK-NO-PIC: "-mrelocation-model" "static"
 // CHECK-NO-PIC-NOT: "-pic-level"
-// CHECK-NO-PIC-NOT: "-pie-level"
+// CHECK-NO-PIC-NOT: "-pic-is-pie"
 //
 // CHECK-DYNAMIC-NO-PIC2: unsupported option '-mdynamic-no-pic'
 // CHECK-DYNAMIC-NO-PIC2: "-mrelocation-model" "dynamic-no-pic"
@@ -15,7 +15,7 @@
 // CHECK-PIC2: "-pic-level" "2"
 //
 // CHECK-PIE2: "-mrelocation-model" "pic"
-// CHECK-PIE2: "-pie-level" "2"
+// CHECK-PIE2: "-pic-is-pie"
 //
 // CHECK-NOPIC-IGNORED: using '-fPIC'
 // CHECK-NOPIC-IGNORED: "-mrelocation-model" "pic"
diff --git a/test/Driver/ps4-sdk-root.c b/test/Driver/ps4-sdk-root.c
index f40a963..ee22d6c 100644
--- a/test/Driver/ps4-sdk-root.c
+++ b/test/Driver/ps4-sdk-root.c
@@ -1,45 +1,45 @@
 // REQUIRES: x86-registered-target
 
-// Check that ps4-clang doesn't report a warning message when locating
-// system header files (either by looking at the value of SCE_PS4_SDK_DIR
+// Check that PS4 clang doesn't report a warning message when locating
+// system header files (either by looking at the value of SCE_ORBIS_SDK_DIR
 // or relative to the location of the compiler driver), if "-nostdinc",
 // "--sysroot" or "-isysroot" option is specified on the command line.
-// Otherwise, check that ps4-clang reports a warning.
+// Otherwise, check that PS4 clang reports a warning.
 
-// Check that clang doesn't report a warning message when locating
-// system libraries (either by looking at the value of SCE_PS4_SDK_DIR
+// Check that PS4 clang doesn't report a warning message when locating
+// system libraries (either by looking at the value of SCE_ORBIS_SDK_DIR
 // or relative to the location of the compiler driver), if "-c", "-S", "-E",
 // "--sysroot", "-nostdlib" or "-nodefaultlibs" option is specified on
 // the command line.
-// Otherwise, check that ps4-clang reports a warning.
+// Otherwise, check that PS4 clang reports a warning.
 
-// setting up SCE_PS4_SDK_DIR to existing location, which is not a PS4 SDK.
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
+// Setting up SCE_ORBIS_SDK_DIR to an existing location, which is not a PS4 SDK.
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
 
 // NO-WARN-NOT: {{warning:|error:}}
 // WARN-SYS-HEADERS: warning: unable to find PS4 system headers directory
diff --git a/test/Driver/r600-mcpu.cl b/test/Driver/r600-mcpu.cl
index 4fbec0c..325e571 100644
--- a/test/Driver/r600-mcpu.cl
+++ b/test/Driver/r600-mcpu.cl
@@ -38,6 +38,8 @@
 // RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=tonga %s -o - 2>&1 | FileCheck --check-prefix=TONGA-CHECK %s
 // RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=iceland %s -o - 2>&1 | FileCheck --check-prefix=ICELAND-CHECK %s
 // RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=carrizo %s -o - 2>&1 | FileCheck --check-prefix=CARRIZO-CHECK %s
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=fiji %s -o - 2>&1 | FileCheck --check-prefix=FIJI-CHECK %s
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=stoney %s -o - 2>&1 | FileCheck --check-prefix=STONEY-CHECK %s
 
 // R600-CHECK:  "-target-cpu" "r600"
 // RS880-CHECK: "-target-cpu" "rs880"
@@ -66,3 +68,5 @@
 // TONGA-CHECK: "-target-cpu" "tonga"
 // ICELAND-CHECK: "-target-cpu" "iceland"
 // CARRIZO-CHECK: "-target-cpu" "carrizo"
+// FIJI-CHECK: "-target-cpu" "fiji"
+// STONEY-CHECK: "-target-cpu" "stoney"
diff --git a/test/Driver/relax.c b/test/Driver/relax.c
new file mode 100644
index 0000000..170d275
--- /dev/null
+++ b/test/Driver/relax.c
@@ -0,0 +1,4 @@
+// RUN: %clang -### -c -integrated-as -Wa,--mrelax-relocations=yes %s 2>&1 | FileCheck  %s
+
+// CHECK: "-cc1"
+// CHECK: "--mrelax-relocations"
diff --git a/test/Driver/relax.s b/test/Driver/relax.s
new file mode 100644
index 0000000..d2941e2
--- /dev/null
+++ b/test/Driver/relax.s
@@ -0,0 +1,12 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang -### -c -integrated-as -Wa,--mrelax-relocations=yes %s 2>&1 | FileCheck  %s
+
+// CHECK: "-cc1as"
+// CHECK: "--mrelax-relocations"
+
+// RUN: %clang -cc1as -triple x86_64-pc-linux --mrelax-relocations %s -o %t  -filetype obj
+// RUN: llvm-readobj -r %t | FileCheck --check-prefix=REL %s
+
+// REL: R_X86_64_REX_GOTPCRELX foo
+
+        movq	foo@GOTPCREL(%rip), %rax
diff --git a/test/Driver/renderscript.rs b/test/Driver/renderscript.rs
new file mode 100644
index 0000000..84f5dc4
--- /dev/null
+++ b/test/Driver/renderscript.rs
@@ -0,0 +1,3 @@
+// RUN: %clang -### 2>&1 %s | FileCheck %s
+
+// CHECK: "-x" "renderscript"
diff --git a/test/Driver/response-file-extra-whitespace.c b/test/Driver/response-file-extra-whitespace.c
new file mode 100644
index 0000000..93b32bb
--- /dev/null
+++ b/test/Driver/response-file-extra-whitespace.c
@@ -0,0 +1,12 @@
+// Check that clang is able to process response files with extra whitespace.
+// We generate a DOS-style file with \r\n for line endings, and then split
+// some joined arguments (like "-x c") across lines to ensure that regular
+// clang (not clang-cl) can process it correctly.
+//
+// RUN: echo -en "-x\r\nc\r\n-DTEST\r\n" > %t.0.txt
+// RUN: %clang -E @%t.0.txt %s -v 2>&1 | FileCheck %s -check-prefix=SHORT
+// SHORT: extern int it_works;
+
+#ifdef TEST
+extern int it_works;
+#endif
diff --git a/test/Driver/response-file.c b/test/Driver/response-file.c
index 208a941..bd33630 100644
--- a/test/Driver/response-file.c
+++ b/test/Driver/response-file.c
@@ -4,7 +4,7 @@
 // Since this is a short response file, clang must not use a response file
 // to pass its parameters to other tools. This is only necessary for a large
 // number of parameters.
-// RUN: echo "-DTEST" >> %t.0.txt
+// RUN: echo "-DTEST" > %t.0.txt
 // RUN: %clang -E @%t.0.txt %s -v 2>&1 | FileCheck %s -check-prefix=SHORT
 // SHORT-NOT: Arguments passed via response file
 // SHORT: extern int it_works;
diff --git a/test/Driver/ropi-rwpi.c b/test/Driver/ropi-rwpi.c
new file mode 100644
index 0000000..61fb8be
--- /dev/null
+++ b/test/Driver/ropi-rwpi.c
@@ -0,0 +1,38 @@
+// RUN: %clang -target arm-none-eabi               -### -c %s 2>&1 | FileCheck --check-prefix=STATIC %s
+// RUN: %clang -target arm-none-eabi -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s
+// RUN: %clang -target arm-none-eabi        -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=RWPI %s
+// RUN: %clang -target arm-none-eabi -fropi -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-RWPI %s
+
+// RUN: %clang -target armeb-none-eabi   -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s
+// RUN: %clang -target thumb-none-eabi   -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s
+// RUN: %clang -target thumbeb-none-eabi -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s
+
+// RUN: %clang -target x86_64-linux-gnu -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-NON-ARM %s
+// RUN: %clang -target x86_64-linux-gnu        -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=RWPI-NON-ARM %s
+// RUN: %clang -target x86_64-linux-gnu -fropi -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-NON-ARM --check-prefix=RWPI-NON-ARM %s
+
+// RUN: %clang -target arm-none-eabi -fpic    -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=PIC %s
+// RUN: %clang -target arm-none-eabi -fpie           -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=PIC %s
+// RUN: %clang -target arm-none-eabi -fPIC    -fropi -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=PIC %s
+// RUN: %clang -target arm-none-eabi -fno-pic -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s
+
+// RUN: %clang -target arm-none-eabi -x c++ -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=CXX %s
+// RUN: %clang -target arm-none-eabi -x c++        -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=RWPI %s
+// RUN: %clang -target arm-none-eabi -x c++ -fropi -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=CXX %s
+// RUN: %clang -target arm-none-eabi -x c++ -fallow-unsupported -fropi        -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s
+
+
+// STATIC: "-mrelocation-model" "static"
+
+// ROPI: "-mrelocation-model" "ropi"
+
+// RWPI: "-mrelocation-model" "rwpi"
+
+// ROPI-RWPI: "-mrelocation-model" "ropi-rwpi"
+
+// ROPI-NON-ARM: error: unsupported option '-fropi' for target 'x86_64--linux-gnu'
+// RWPI-NON-ARM: error: unsupported option '-frwpi' for target 'x86_64--linux-gnu'
+
+// PIC: error: embedded and GOT-based position independence are incompatible
+
+// CXX: error: ROPI is not compatible with c++
diff --git a/test/Driver/sanitize_unwind_tables.c b/test/Driver/sanitize_unwind_tables.c
index 8b78899..b78843e 100644
--- a/test/Driver/sanitize_unwind_tables.c
+++ b/test/Driver/sanitize_unwind_tables.c
@@ -7,5 +7,7 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=memory %s -### 2>&1 |  FileCheck %s
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=thread %s -### 2>&1 |  FileCheck %s
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow %s -### 2>&1 |  FileCheck %s
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag %s -### 2>&1 |  FileCheck %s
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set %s -### 2>&1 |  FileCheck %s
 
 // CHECK: -munwind-tables
diff --git a/test/Driver/sanitizer-ld.c b/test/Driver/sanitizer-ld.c
index 4aa0fa4..9f6fae3 100644
--- a/test/Driver/sanitizer-ld.c
+++ b/test/Driver/sanitizer-ld.c
@@ -76,7 +76,7 @@
 // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl"
 
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target i386-unknown-linux -fsanitize=address \
+// RUN:     -target i386-unknown-linux -stdlib=platform -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX %s
@@ -93,8 +93,8 @@
 // CHECK-ASAN-LINUX-CXX: "-ldl"
 
 // RUN: %clang -no-canonical-prefixes %s -### -o /dev/null -fsanitize=address \
-// RUN:     -target i386-unknown-linux --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:     -lstdc++ -static 2>&1 \
+// RUN:     -target i386-unknown-linux -stdlib=platform \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC %s
 //
 // CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
@@ -152,7 +152,8 @@
 // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread"
 
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target x86_64-unknown-linux -lstdc++ -fsanitize=thread \
+// RUN:     -target x86_64-unknown-linux -stdlib=platform -lstdc++ \
+// RUN:     -fsanitize=thread \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-TSAN-LINUX-CXX %s
@@ -170,7 +171,8 @@
 // CHECK-TSAN-LINUX-CXX: "-ldl"
 
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target x86_64-unknown-linux -lstdc++ -fsanitize=memory \
+// RUN:     -target x86_64-unknown-linux -stdlib=platform -lstdc++ \
+// RUN:     -fsanitize=memory \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-MSAN-LINUX-CXX %s
@@ -209,7 +211,7 @@
 // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++"
 
 // RUN: %clangxx -fsanitize=undefined %s -### -o %t.o 2>&1 \
-// RUN:     -target i386-unknown-linux \
+// RUN:     -target i386-unknown-linux -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-CXX %s
@@ -234,7 +236,7 @@
 // CHECK-ASAN-UBSAN-LINUX: "-lpthread"
 
 // RUN: %clangxx -fsanitize=address,undefined %s -### -o %t.o 2>&1 \
-// RUN:     -target i386-unknown-linux \
+// RUN:     -target i386-unknown-linux -stdlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX %s
 // CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
@@ -291,9 +293,57 @@
 // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan-x86_64
 // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan
 
+// RUN: %clang -fsanitize=address -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-ASAN-COV-LINUX %s
+// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.asan-x86_64.a" "-no-whole-archive"
+// CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-ASAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-ASAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize=memory -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-MSAN-COV-LINUX %s
+// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-MSAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "-no-whole-archive"
+// CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-MSAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-MSAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize=dataflow -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-DFSAN-COV-LINUX %s
+// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-DFSAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.dfsan-x86_64.a" "-no-whole-archive"
+// CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-DFSAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize=undefined -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-COV-LINUX %s
+// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-UBSAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "-no-whole-archive"
+// CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-UBSAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-COV-LINUX %s
+// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "-no-whole-archive"
+// CHECK-COV-LINUX-NOT: "-lstdc++"
+// CHECK-COV-LINUX: "-lpthread"
+
 // CFI by itself does not link runtime libraries.
 // RUN: %clang -fsanitize=cfi %s -### -o %t.o 2>&1 \
-// RUN:     -target x86_64-unknown-linux \
+// RUN:     -target x86_64-unknown-linux -rtlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-CFI-LINUX %s
 // CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}"
@@ -327,7 +377,7 @@
 
 // RUN: %clangxx -fsanitize=address %s -### -o %t.o 2>&1 \
 // RUN:     -mmacosx-version-min=10.6 \
-// RUN:     -target x86_64-apple-darwin13.4.0 \
+// RUN:     -target x86_64-apple-darwin13.4.0 -stdlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-DARWIN106-CXX %s
 // CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}"
@@ -422,3 +472,13 @@
 // RUN:   | FileCheck --check-prefix=CHECK-AUBSAN-PS4 %s
 // CHECK-AUBSAN-PS4: "{{.*}}ld{{(.gold)?(.exe)?}}"
 // CHECK-AUBSAN-PS4: -lSceDbgAddressSanitizer_stub_weak
+
+// RUN: %clang -fsanitize=efficiency-cache-frag %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-ESAN-LINUX %s
+// RUN: %clang -fsanitize=efficiency-working-set %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-ESAN-LINUX %s
+//
+// CHECK-ESAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+// CHECK-ESAN-LINUX: libclang_rt.esan-x86_64.a
diff --git a/test/Driver/sparc-as.c b/test/Driver/sparc-as.c
index 5b93995..80122cf 100644
--- a/test/Driver/sparc-as.c
+++ b/test/Driver/sparc-as.c
@@ -76,6 +76,38 @@
 // RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC-V8PLUSD %s
 
+// RUN: %clang -mcpu=leon2 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=at697e -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=at697f -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=leon3 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=ut699 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=gr712rc -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=leon4 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=gr740 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
 // SPARC: as{{.*}}" "-32" "-Av8" "-o"
 // SPARC-V8: as{{.*}}" "-32" "-Av8" "-o"
 // SPARC-SPARCLITE: as{{.*}}" "-32" "-Asparclite" "-o"
diff --git a/test/Driver/sparc-float.c b/test/Driver/sparc-float.c
index 6fa47f0..c205f5d 100644
--- a/test/Driver/sparc-float.c
+++ b/test/Driver/sparc-float.c
@@ -18,7 +18,25 @@
 // RUN: %clang -c %s -### -o %t.o 2>&1 \
 // RUN:     -target sparc-linux-gnu -msoft-float \
 // RUN:   | FileCheck --check-prefix=CHECK-SOFT %s
-// CHECK-SOFT: error: unsupported option '-msoft-float'
+// CHECK-SOFT: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=soft
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc-linux-gnu -mfloat-abi=soft \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABISOFT %s
+// CHECK-FLOATABISOFT: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=hard
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc-linux-gnu -mfloat-abi=hard \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABIHARD %s
+// CHECK-FLOATABIHARD-NOT: "-target-feature" "+soft-float"
+//
+// check invalid -mfloat-abi
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc-linux-gnu -mfloat-abi=x \
+// RUN:   | FileCheck --check-prefix=CHECK-ERRMSG %s
+// CHECK-ERRMSG: error: invalid float ABI '-mfloat-abi=x'
 //
 // Default sparc64
 // RUN: %clang -c %s -### -o %t.o 2>&1 \
@@ -37,4 +55,22 @@
 // RUN: %clang -c %s -### -o %t.o 2>&1 \
 // RUN:     -target sparc64-linux-gnu -msoft-float \
 // RUN:   | FileCheck --check-prefix=CHECK-SOFT-SPARC64 %s
-// CHECK-SOFT-SPARC64: error: unsupported option '-msoft-float'
+// CHECK-SOFT-SPARC64: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=soft
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc64-linux-gnu -mfloat-abi=soft \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABISOFT64 %s
+// CHECK-FLOATABISOFT64: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=hard
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc64-linux-gnu -mfloat-abi=hard \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABIHARD64 %s
+// CHECK-FLOATABIHARD64-NOT: "-target-feature" "+soft-float"
+//
+// check invalid -mfloat-abi
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc64-linux-gnu -mfloat-abi=x \
+// RUN:   | FileCheck --check-prefix=CHECK-ERRMSG64 %s
+// CHECK-ERRMSG64: error: invalid float ABI '-mfloat-abi=x'
diff --git a/test/Driver/split-debug.h b/test/Driver/split-debug.h
new file mode 100644
index 0000000..bb05f30
--- /dev/null
+++ b/test/Driver/split-debug.h
@@ -0,0 +1,15 @@
+// Check that we aren't splitting debug output for modules builds that don't produce object files.
+//
+// RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -c -fmodules -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-ACTIONS < %t %s
+//
+// RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -c -fmodules -emit-module -fmodules-embed-all-files -fno-implicit-modules -fno-implicit-module-maps -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-ACTIONS < %t %s
+//
+// FIXME: This should fail using clang, except that the type of the output for
+// an object output with modules is given as clang::driver::types::TY_PCH
+// rather than TY_Object.
+// RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -c -fmodules -fmodule-format=obj -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-ACTIONS < %t %s
+//
+// CHECK-NO-ACTIONS-NOT: objcopy
diff --git a/test/Driver/split-stack-ld.c b/test/Driver/split-stack-ld.c
new file mode 100644
index 0000000..3441d54
--- /dev/null
+++ b/test/Driver/split-stack-ld.c
@@ -0,0 +1,17 @@
+// Test split stack ld flags.
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target i386-unknown-linux -fsplit-stack \
+// RUN:     -resource-dir=%S/Inputs/resource_dir \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LINUX-I386 %s
+//
+// CHECK-LINUX-I386: "--wrap=pthread_create"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux -fsplit-stack \
+// RUN:     -resource-dir=%S/Inputs/resource_dir \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LINUX-X86-64 %s
+//
+// CHECK-LINUX-X86-64: "--wrap=pthread_create"
diff --git a/test/Driver/unknown-arg.c b/test/Driver/unknown-arg.c
index f834a0e..755d29f 100644
--- a/test/Driver/unknown-arg.c
+++ b/test/Driver/unknown-arg.c
@@ -1,13 +1,35 @@
-// RUN: not %clang %s -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option 2>&1 | \
+// RUN: %clang %s -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -### 2>&1 | \
 // RUN: FileCheck %s
+// RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -### -c -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CL
+// RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -c -Werror=unknown-argument -### -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CL-ERROR
+// RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -c -Wno-unknown-argument -### -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=SILENT
 
-// CHECK: unknown argument: '-cake-is-lie'
-// CHECK: unknown argument: '-%0'
-// CHECK: unknown argument: '-%d'
-// CHECK: unknown argument: '-HHHH'
-// CHECK: unknown argument: '-munknown-to-clang-option'
-// CHECK: unknown argument: '-print-stats'
-// CHECK: unknown argument: '-funknown-to-clang-option'
+// CHECK: error: unknown argument: '-cake-is-lie'
+// CHECK: error: unknown argument: '-%0'
+// CHECK: error: unknown argument: '-%d'
+// CHECK: error: unknown argument: '-HHHH'
+// CHECK: error: unknown argument: '-munknown-to-clang-option'
+// CHECK: error: unknown argument: '-print-stats'
+// CHECK: error: unknown argument: '-funknown-to-clang-option'
+// CL: warning: unknown argument ignored in clang-cl: '-cake-is-lie'
+// CL: warning: unknown argument ignored in clang-cl: '-%0'
+// CL: warning: unknown argument ignored in clang-cl: '-%d'
+// CL: warning: unknown argument ignored in clang-cl: '-HHHH'
+// CL: warning: unknown argument ignored in clang-cl: '-munknown-to-clang-option'
+// CL: warning: unknown argument ignored in clang-cl: '-print-stats'
+// CL: warning: unknown argument ignored in clang-cl: '-funknown-to-clang-option'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-cake-is-lie'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-%0'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-%d'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-HHHH'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-munknown-to-clang-option'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-print-stats'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-funknown-to-clang-option'
+// SILENT-NOT: error:
+// SILENT-NOT: warning:
 
 
 // RUN: %clang -S %s -o %t.s  -Wunknown-to-clang-option 2>&1 | FileCheck --check-prefix=IGNORED %s
diff --git a/test/Driver/wasm-toolchain.c b/test/Driver/wasm-toolchain.c
index b9685b1..d0b0293 100644
--- a/test/Driver/wasm-toolchain.c
+++ b/test/Driver/wasm-toolchain.c
@@ -25,20 +25,20 @@
 
 // A basic C link command-line.
 
-// RUN: %clang -### -no-canonical-prefixes -target wasm32-unknown-unknown %s 2>&1 | FileCheck -check-prefix=LINK %s
+// RUN: %clang -### -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s 2>&1 | FileCheck -check-prefix=LINK %s
 // LINK: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// LINK: lld{{.*}}" "-flavor" "ld" "[[temp]]" "-o" "a.out"
+// LINK: lld{{.*}}" "-flavor" "ld" "-L/foo/lib32" "crt1.o" "crti.o" "[[temp]]" "-lc" "-lcompiler_rt" "crtn.o" "-o" "a.out"
 
 // A basic C link command-line with optimization. WebAssembly is somewhat
 // special in enabling --gc-sections by default.
 
-// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown %s 2>&1 | FileCheck -check-prefix=LINK_OPT %s
+// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s 2>&1 | FileCheck -check-prefix=LINK_OPT %s
 // LINK_OPT: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// LINK_OPT: lld{{.*}}" "-flavor" "ld" "--gc-sections" "[[temp]]" "-o" "a.out"
+// LINK_OPT: lld{{.*}}" "-flavor" "ld" "--gc-sections" "-L/foo/lib32" "crt1.o" "crti.o" "[[temp]]" "-lc" "-lcompiler_rt" "crtn.o" "-o" "a.out"
 
 // Ditto, but ensure that a user --no-gc-sections comes after the
 // default --gc-sections.
 
-// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown -Wl,--no-gc-sections %s 2>&1 | FileCheck -check-prefix=NO_GC_SECTIONS %s
+// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo -Wl,--no-gc-sections %s 2>&1 | FileCheck -check-prefix=NO_GC_SECTIONS %s
 // NO_GC_SECTIONS: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// NO_GC_SECTIONS: lld{{.*}}" "-flavor" "ld" "--gc-sections" "--no-gc-sections" "[[temp]]" "-o" "a.out"
+// NO_GC_SECTIONS: lld{{.*}}" "-flavor" "ld" "--gc-sections" "-L/foo/lib32" "crt1.o" "crti.o" "--no-gc-sections" "[[temp]]" "-lc" "-lcompiler_rt" "crtn.o" "-o" "a.out"
diff --git a/test/Driver/whole-program-vtables.c b/test/Driver/whole-program-vtables.c
new file mode 100644
index 0000000..4ca985e
--- /dev/null
+++ b/test/Driver/whole-program-vtables.c
@@ -0,0 +1,2 @@
+// RUN: %clang -target x86_64-unknown-linux -fwhole-program-vtables -### %s 2>&1 | FileCheck --check-prefix=NO-LTO %s
+// NO-LTO: invalid argument '-fwhole-program-vtables' only allowed with '-flto'
diff --git a/test/Driver/windows-cross.c b/test/Driver/windows-cross.c
index d355fbc..84ef2df 100644
--- a/test/Driver/windows-cross.c
+++ b/test/Driver/windows-cross.c
@@ -1,9 +1,9 @@
-// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -o /dev/null %s 2>&1 \
+// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -stdlib=libstdc++ -rtlib=platform -o /dev/null %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix CHECK-BASIC
 
 // CHECK-BASIC: armv7-windows-itanium-ld" "--sysroot={{.*}}/Inputs/Windows/ARM/8.1" "-m" "thumb2pe" "-Bdynamic" "--entry" "mainCRTStartup" "--allow-multiple-definition" "-o" "{{[^"]*}}" "{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/crtbegin.obj" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/gcc" "{{.*}}.o" "-lmsvcrt" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 
-// RUN: %clang -### -target armv7-windows-itanium --sysroot %s/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -rtlib=compiler-rt -o /dev/null %s 2>&1 \
+// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -rtlib=compiler-rt -stdlib=libstdc++ -o /dev/null %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix CHECK-RTLIB
 
 // CHECK-RTLIB: armv7-windows-itanium-ld" "--sysroot={{.*}}/Inputs/Windows/ARM/8.1" "-m" "thumb2pe" "-Bdynamic" "--entry" "mainCRTStartup" "--allow-multiple-definition" "-o" "{{[^"]*}}" "{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/crtbegin.obj" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/gcc" "{{.*}}.o" "-lmsvcrt" "{{.*[\\/]}}clang_rt.builtins-arm.lib"
@@ -33,7 +33,7 @@
 
 // CHECK-STANDALONE: armv7-windows-itanium-ld" "--sysroot={{.*}}/Inputs/Windows/ARM/8.1" "-m" "thumb2pe" "-shared" "-Bdynamic" "--enable-auto-image-base" "--entry" "_DllMainCRTStartup" "--allow-multiple-definition" "-o" "shared.dll" "--out-implib" "shared.lib" "{{.*}}.o"
 
-// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %/Inputs/Windows/ARM/8.1/usr/bin -shared -o shared.dll -x c++ %s 2>&1 \
+// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -stdlib=libstdc++ -shared -o shared.dll -x c++ %s 2>&1 \
 // RUN:    | FileCheck %s --check-prefix CHECK-LIBSTDCXX
 
 // CHECK-LIBSTDCXX:  "-internal-isystem" "{{.*}}/usr/include/c++" "-internal-isystem" "{{.*}}/usr/include/c++/armv7--windows-itanium" "-internal-isystem" "{{.*}}/usr/include/c++/backwards"
@@ -67,3 +67,11 @@
 // CHECK-SANITIZE-TSAN: error: unsupported argument 'tsan' to option 'fsanitize='
 // CHECK-SANITIZE-TSAN-NOT: "-fsanitize={{.*}}"
 
+// RUN: %clang -### -target armv7-windows-itanium -isystem-after "Windows Kits/10/Include/10.0.10586.0/ucrt" -isystem-after "Windows Kits/10/Include/10.0.10586.0/um" -isystem-after "Windows Kits/10/Include/10.0.10586.0/shared" -c %s -o /dev/null 2>&1 \
+// RUN:     | FileCheck %s --check-prefix CHECK-ISYSTEM-AFTER
+// CHECK-ISYSTEM-AFTER: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "Windows Kits{{[/\\]}}10{{[/\\]}}Include{{[/\\]}}10.0.10586.0{{[/\\]}}ucrt"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "Windows Kits{{[/\\]}}10{{[/\\]}}Include{{[/\\]}}10.0.10586.0{{[/\\]}}um"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "Windows Kits{{[/\\]}}10{{[/\\]}}Include{{[/\\]}}10.0.10586.0{{[/\\]}}shared"
+
diff --git a/test/Driver/windows-thumb.s b/test/Driver/windows-thumb.s
new file mode 100644
index 0000000..afe5b2d
--- /dev/null
+++ b/test/Driver/windows-thumb.s
@@ -0,0 +1,2 @@
+; RUN: %clang -target armv7-windows -c -### %s 2>&1 | FileCheck %s
+; CHECK: "-triple" "thumbv7-
diff --git a/test/Driver/x86-target-features.c b/test/Driver/x86-target-features.c
new file mode 100644
index 0000000..ce35b2c
--- /dev/null
+++ b/test/Driver/x86-target-features.c
@@ -0,0 +1,51 @@
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mx87 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=X87 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-x87 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-X87 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -m80387 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=X87 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-80387 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-X87 %s
+// X87: "-target-feature" "+x87"
+// NO-X87: "-target-feature" "-x87"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mmmx -m3dnow -m3dnowa %s -### -o %t.o 2>&1 | FileCheck -check-prefix=MMX %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-mmx -mno-3dnow -mno-3dnowa %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-MMX %s
+// MMX: "-target-feature" "+mmx" "-target-feature" "+3dnow" "-target-feature" "+3dnowa"
+// NO-MMX: "-target-feature" "-mmx" "-target-feature" "-3dnow" "-target-feature" "-3dnowa"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msse -msse2 -msse3 -mssse3 -msse4a -msse4.1 -msse4.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SSE %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sse -mno-sse2 -mno-sse3 -mno-ssse3 -mno-sse4a -mno-sse4.1 -mno-sse4.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SSE %s
+// SSE: "-target-feature" "+sse" "-target-feature" "+sse2" "-target-feature" "+sse3" "-target-feature" "+ssse3" "-target-feature" "+sse4a" "-target-feature" "+sse4.1" "-target-feature" "+sse4.2"
+// NO-SSE: "-target-feature" "-sse" "-target-feature" "-sse2" "-target-feature" "-sse3" "-target-feature" "-ssse3" "-target-feature" "-sse4a" "-target-feature" "-sse4.1" "-target-feature" "-sse4.2"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msse4 -maes %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SSE4-AES %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sse4 -mno-aes %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SSE4-AES %s
+// SSE4-AES: "-target-feature" "+sse4.2" "-target-feature" "+aes"
+// NO-SSE4-AES: "-target-feature" "-sse4.1" "-target-feature" "-aes"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512er -mavx512pf -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512ifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512ifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX %s
+// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512er" "-target-feature" "+avx512pf" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512ifma"
+// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512er" "-target-feature" "-avx512pf" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512ifma"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mpclmul -mrdrnd -mfsgsbase -mbmi -mbmi2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BMI %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-pclmul -mno-rdrnd -mno-fsgsbase -mno-bmi -mno-bmi2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-BMI %s
+// BMI: "-target-feature" "+pclmul" "-target-feature" "+rdrnd" "-target-feature" "+fsgsbase" "-target-feature" "+bmi" "-target-feature" "+bmi2"
+// NO-BMI: "-target-feature" "-pclmul" "-target-feature" "-rdrnd" "-target-feature" "-fsgsbase" "-target-feature" "-bmi" "-target-feature" "-bmi2"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mlzcnt -mpopcnt -mtbm -mfma -mfma4 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=FMA %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-lzcnt -mno-popcnt -mno-tbm -mno-fma -mno-fma4 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-FMA %s
+// FMA: "-target-feature" "+lzcnt" "-target-feature" "+popcnt" "-target-feature" "+tbm" "-target-feature" "+fma" "-target-feature" "+fma4"
+// NO-FMA: "-target-feature" "-lzcnt" "-target-feature" "-popcnt" "-target-feature" "-tbm" "-target-feature" "-fma" "-target-feature" "-fma4"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mxop -mf16c -mrtm -mprfchw -mrdseed %s -### -o %t.o 2>&1 | FileCheck -check-prefix=XOP %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-xop -mno-f16c -mno-rtm -mno-prfchw -mno-rdseed %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-XOP %s
+// XOP: "-target-feature" "+xop" "-target-feature" "+f16c" "-target-feature" "+rtm" "-target-feature" "+prfchw" "-target-feature" "+rdseed"
+// NO-XOP: "-target-feature" "-xop" "-target-feature" "-f16c" "-target-feature" "-rtm" "-target-feature" "-prfchw" "-target-feature" "-rdseed"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msha -mpku -madx -mcx16 -mfxsr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SHA %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sha -mno-pku -mno-adx -mno-cx16 -mno-fxsr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SHA %s
+// SHA: "-target-feature" "+sha" "-target-feature" "+pku" "-target-feature" "+adx" "-target-feature" "+cx16" "-target-feature" "+fxsr"
+// NO-SHA: "-target-feature" "-sha" "-target-feature" "-pku" "-target-feature" "-adx" "-target-feature" "-cx16" "-target-feature" "-fxsr"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mxsave -mxsaveopt -mxsavec -mxsaves %s -### -o %t.o 2>&1 | FileCheck -check-prefix=XSAVE %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-xsave -mno-xsaveopt -mno-xsavec -mno-xsaves %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-XSAVE %s
+// XSAVE: "-target-feature" "+xsave" "-target-feature" "+xsaveopt" "-target-feature" "+xsavec" "-target-feature" "+xsaves"
+// NO-XSAVE: "-target-feature" "-xsave" "-target-feature" "-xsaveopt" "-target-feature" "-xsavec" "-target-feature" "-xsaves"
diff --git a/test/FixIt/fixit-errors.c b/test/FixIt/fixit-errors.c
index d727adb..1ac9d1c 100644
--- a/test/FixIt/fixit-errors.c
+++ b/test/FixIt/fixit-errors.c
@@ -22,6 +22,8 @@
   (void)get_origin->x; // expected-error {{base of member reference is a function; perhaps you meant to call it with no arguments?}}
 }
 
+// These errors require C11.
+#if __STDC_VERSION__ > 199901L
 void noreturn_1() _Noreturn; // expected-error {{must precede function declarator}}
 void noreturn_1() {
   return; // expected-warning {{should not return}}
@@ -29,3 +31,4 @@
 void noreturn_2() _Noreturn { // expected-error {{must precede function declarator}}
   return; // expected-warning {{should not return}}
 }
+#endif
diff --git a/test/FixIt/fixit-objc.m b/test/FixIt/fixit-objc.m
index f41f75f..3e9ff60 100644
--- a/test/FixIt/fixit-objc.m
+++ b/test/FixIt/fixit-objc.m
@@ -67,3 +67,11 @@
   sentinel(1, 2, 3); // expected-warning{{missing sentinel in function call}}
   [a sentinel:1, 2, 3]; // expected-warning{{missing sentinel in method dispatch}}
 }
+
+@interface A
+@property (class) int c;
+@end
+
+int test(A *a) {
+  return a.c; // expected-error {{property 'c' is a class property; did you mean to access it with class 'A'}}
+}
diff --git a/test/FixIt/fixit-vexing-parse.cpp b/test/FixIt/fixit-vexing-parse.cpp
index 0232f5d..71d3eff 100644
--- a/test/FixIt/fixit-vexing-parse.cpp
+++ b/test/FixIt/fixit-vexing-parse.cpp
@@ -60,7 +60,7 @@
     VO m(int (*p)[4]);
 
     // Don't emit warning and fixit because direct initializer is not permitted here.
-    if (int n(int())){} // expected-error {{function type is not allowed here}} expected-error {{condition must have an initializer}}
+    if (int n(int())){} // expected-error {{function type is not allowed here}}
 
     // CHECK: fix-it:"{{.*}}":{66:8-66:10}:" = {}"
     U u(); // expected-warning {{function declaration}} expected-note {{replace parentheses with an initializer}}
diff --git a/test/FixIt/typo.m b/test/FixIt/typo.m
index 143d026..53afe72 100644
--- a/test/FixIt/typo.m
+++ b/test/FixIt/typo.m
@@ -113,8 +113,6 @@
   
 @end
 
-double *isupper(int);
-
 @interface Sub2 : Super
 - (int)method2;
 @end
diff --git a/test/Format/remove-duplicate-includes.cpp b/test/Format/remove-duplicate-includes.cpp
new file mode 100644
index 0000000..dedb1f4
--- /dev/null
+++ b/test/Format/remove-duplicate-includes.cpp
@@ -0,0 +1,14 @@
+// RUN: grep -Ev "// *[A-Z-]+:" %s \
+// RUN:   | clang-format -style="{BasedOnStyle: LLVM, SortIncludes: true}" -lines=1:5 \
+// RUN:   | FileCheck -strict-whitespace %s
+// CHECK: {{^#include\ <a>$}}
+#include  <a>
+// CHECK: {{^#include\ <b>$}}
+#include  <b>
+#include  <a>
+#include  <b>
+#include  <b>
+{
+// CHECK: {{^\ \ int x\ \ ;$}}
+  int x  ;
+}
diff --git a/test/Frontend/aarch64-target-cpu.c b/test/Frontend/aarch64-target-cpu.c
new file mode 100644
index 0000000..9054647
--- /dev/null
+++ b/test/Frontend/aarch64-target-cpu.c
@@ -0,0 +1,14 @@
+// Ensure we support the various CPU names.
+//
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu cortex-a35 -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu cortex-a53 -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu cortex-a57 -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu cortex-a72 -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu cortex-a73 -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu cyclone -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu exynos-m1 -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu generic -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu kryo -verify %s
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -target-cpu vulcan -verify %s
+//
+// expected-no-diagnostics
diff --git a/test/Frontend/backend-option.c b/test/Frontend/backend-option.c
new file mode 100644
index 0000000..e177574
--- /dev/null
+++ b/test/Frontend/backend-option.c
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 %s -emit-llvm -backend-option -time-passes -o - 2>&1 | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -backend-option -time-passes -o - -triple spir-unknown-unknown 2>&1 | FileCheck %s
+// CHECK: Pass execution timing report
+
diff --git a/test/Frontend/dependency-gen.c b/test/Frontend/dependency-gen.c
index 054aa79..e4b0fee 100644
--- a/test/Frontend/dependency-gen.c
+++ b/test/Frontend/dependency-gen.c
@@ -21,7 +21,7 @@
 // RUN: %clang -MD -MF - %s -fsyntax-only -I ./ | FileCheck -check-prefix=CHECK-SIX %s
 // CHECK-SIX: {{ }}x.h
 // RUN: echo "fun:foo" > %t.blacklist
-// RUN: %clang -MD -MF - %s -fsyntax-only -fsanitize=cfi-vcall -flto -fsanitize-blacklist=%t.blacklist -I ./ | FileCheck -check-prefix=CHECK-SEVEN %s
+// RUN: %clang -MD -MF - %s -fsyntax-only -fsanitize=cfi-vcall -flto -fvisibility=hidden -fsanitize-blacklist=%t.blacklist -I ./ | FileCheck -check-prefix=CHECK-SEVEN %s
 // CHECK-SEVEN: .blacklist
 // CHECK-SEVEN: {{ }}x.h
 #ifndef INCLUDE_FLAG_TEST
diff --git a/test/Frontend/embed-bitcode.ll b/test/Frontend/embed-bitcode.ll
new file mode 100644
index 0000000..bd2afb4
--- /dev/null
+++ b/test/Frontend/embed-bitcode.ll
@@ -0,0 +1,59 @@
+; REQUIRES: arm-registered-target
+; REQUIRES: aarch64-registered-target
+; check .ll input
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir %s -o - \
+; RUN:    | FileCheck %s
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=marker -x ir %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-MARKER
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnueabi -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-ELF
+
+; check .bc input
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm-bc \
+; RUN:    -x ir %s -o %t.bc
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir %t.bc -o - \
+; RUN:    | FileCheck %s
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=bitcode -x ir %t.bc -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-ONLY-BITCODE
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=marker -x ir %t.bc -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-MARKER
+
+; run through -fembed-bitcode twice and make sure it doesn't crash
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm-bc \
+; RUN:    -fembed-bitcode=all -x ir %s -o - \
+; RUN: | %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir - -o /dev/null
+
+; check the magic number of bitcode at the beginning of the string
+; CHECK: @llvm.embedded.module = private constant
+; CHECK: c"\DE\C0\17\0B
+; CHECK: section "__LLVM,__bitcode"
+; CHECK: @llvm.cmdline = private constant
+; CHECK: section "__LLVM,__cmdline"
+
+; CHECK-ELF: @llvm.embedded.module
+; CHECK-ELF: section ".llvmbc"
+; CHECK-ELF: @llvm.cmdline
+; CHECK-ELF: section ".llvmcmd"
+
+; CHECK-ONLY-BITCODE: @llvm.embedded.module = private constant
+; CHECK-ONLY-BITCODE: c"\DE\C0\17\0B
+; CHECK-ONLY-BITCODE: section "__LLVM,__bitcode"
+; CHECK-ONLY-BITCODE-NOT: @llvm.cmdline = private constant
+; CHECK-ONLY-BITCODE-NOT: section "__LLVM,__cmdline"
+
+; CHECK-MARKER: @llvm.embedded.module
+; CHECK-MARKER: constant [0 x i8] zeroinitializer
+; CHECK-MARKER: section "__LLVM,__bitcode"
+; CHECK-MARKER: @llvm.cmdline
+; CHECK-MARKER: section "__LLVM,__cmdline"
+
+define i32 @f0() {
+  ret i32 0
+}
diff --git a/test/Frontend/gnu-mcount.c b/test/Frontend/gnu-mcount.c
new file mode 100644
index 0000000..c279b89
--- /dev/null
+++ b/test/Frontend/gnu-mcount.c
@@ -0,0 +1,78 @@
+// RUN: %clang -target armv7-unknown-none-eabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
+// RUN: %clang -target armv7-unknown-none-eabi -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
+// RUN: %clang -target aarch64-unknown-none-eabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI
+// RUN: %clang -target aarch64-unknown-none-eabi -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-MEABI-GNU
+// RUN: %clang -target armv7-unknown-linux-gnueabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
+// RUN: %clang -target armv7-unknown-linux-gnueabi -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
+// RUN: %clang -target aarch64-unknown-linux-gnueabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI
+// RUN: %clang -target aarch64-unknown-linux-gnueabi -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-MEABI-GNU
+// RUN: %clang -target armv7-unknown-linux-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
+// RUN: %clang -target armv7-unknown-linux-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
+// RUN: %clang -target aarch64-unknown-linux-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI
+// RUN: %clang -target aarch64-unknown-linux-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-MEABI-GNU
+// RUN: %clang -target armv7-unknown-freebsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-FREEBSD
+// RUN: %clang -target armv7-unknown-freebsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-FREEBSD
+// RUN: %clang -target aarch64-unknown-freebsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
+// RUN: %clang -target aarch64-unknown-freebsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
+// RUN: %clang -target armv7-unknown-openbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-OPENBSD
+// RUN: %clang -target armv7-unknown-openbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-OPENBSD
+// RUN: %clang -target aarch64-unknown-openbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-OPENBSD
+// RUN: %clang -target aarch64-unknown-openbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-OPENBSD
+// RUN: %clang -target armv7-unknown-netbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-NETBSD
+// RUN: %clang -target armv7-unknown-netbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-NETBSD
+// RUN: %clang -target aarch64-unknown-netbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-NETBSD
+// RUN: %clang -target aarch64-unknown-netbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-NETBSD
+// RUN: %clang -target armv7-apple-ios -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target armv7-apple-ios -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target arm64-apple-ios -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target arm64-apple-ios -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target armv7-unknown-bitrig-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-BITRIG
+// RUN: %clang -target armv7-unknown-bitrig-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-BITRIG
+// RUN: %clang -target aarch64-unknown-bitrig-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-BITRIG
+// RUN: %clang -target aarch64-unknown-bitrig-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-BITRIG
+// RUN: %clang -target armv7-unknown-rtems-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-RTEMS
+// RUN: %clang -target armv7-unknown-rtems-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-RTEMS
+// RUN: %clang -target aarch64-unknown-rtems-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-RTEMS
+// RUN: %clang -target aarch64-unknown-rtems-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-RTEMS
+// RUN: %clang -target armv7-unknown-cloudabi-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-CLOUDABI
+// RUN: %clang -target armv7-unknown-cloudabi-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-CLOUDABI
+// RUN: %clang -target aarch64-unknown-cloudabi-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-CLOUDABI
+// RUN: %clang -target aarch64-unknown-cloudabi-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-CLOUDABI
+
+int f() {
+  return 0;
+}
+
+// CHECK-LABEL: f
+// CHECK-ARM-IOS-NOT: call void @_mcount()
+// CHECK-ARM-IOS-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI: call void @"\01mcount"()
+// CHECK-ARM-EABI-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI: call void @mcount()
+// CHECK-ARM64-EABI-MEABI-GNU: call void @"\01_mcount"()
+// CHECK-ARM64-EABI-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-FREEBSD: call void @__mcount()
+// CHECK-ARM-EABI-FREEBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-FREEBSD: call void @.mcount()
+// CHECK-ARM64-EABI-FREEBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-NETBSD: call void @_mcount()
+// CHECK-ARM-EABI-NETBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-OPENBSD: call void @__mcount()
+// CHECK-ARM-EABI-OPENBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-OPENBSD: call void @mcount()
+// CHECK-ARM64-EABI-OPENBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-MEABI-GNU-NOT: call void @mcount()
+// CHECK-ARM-EABI-MEABI-GNU: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-BITRIG: call void @__mcount()
+// CHECK-ARM-EABI-BITRIG-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-BITRIG: call void @mcount()
+// CHECK-ARM64-EABI-BITRIG-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-RTEMS: call void @mcount()
+// CHECK-ARM-EABI-RTEMS-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-RTEMS: call void @mcount()
+// CHECK-ARM64-EABI-RTEMS-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-CLOUDABI: call void @mcount()
+// CHECK-ARM-EABI-CLOUDABI-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-CLOUDABI: call void @mcount()
+// CHECK-ARM64-EABI-CLOUDABI-NOT: call void @"\01__gnu_mcount_nc"()
+
diff --git a/test/Frontend/int128.cpp b/test/Frontend/int128.cpp
new file mode 100644
index 0000000..145144a
--- /dev/null
+++ b/test/Frontend/int128.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -x c   -std=gnu99   -dM -E -triple x86_64-pc-linux %s | FileCheck -check-prefix=NO %s
+// RUN: %clang_cc1 -x c++ -std=c++11   -dM -E -triple x86_64-pc-linux %s | FileCheck -check-prefix=NO %s
+// RUN: %clang_cc1 -x c++ -std=gnu++11 -dM -E -triple i686-pc-linux   %s | FileCheck -check-prefix=NO %s
+// RUN: %clang_cc1 -x c++ -std=gnu++11 -dM -E -triple x86_64-pc-linux %s | FileCheck -check-prefix=YES %s
+// RUN: %clang_cc1 -x c++ -std=gnu++1y -dM -E -triple x86_64-pc-linux %s | FileCheck -check-prefix=YES %s
+// PR23156
+
+// NO-NOT: __GLIBCXX_TYPE_INT_N_0
+// NO-NOT: __GLIBCXX_BITSIZE_INT_N_0
+// YES-DAG: __GLIBCXX_TYPE_INT_N_0
+// YES-DAG: __GLIBCXX_BITSIZE_INT_N_0
diff --git a/test/Frontend/lit.local.cfg b/test/Frontend/lit.local.cfg
index c11fb6d..7a05c5d 100644
--- a/test/Frontend/lit.local.cfg
+++ b/test/Frontend/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.c', '.cpp', '.m', '.mm', '.ll']
+config.suffixes = ['.c', '.cpp', '.m', '.mm', '.ll', '.cl']
diff --git a/test/Frontend/opencl.cl b/test/Frontend/opencl.cl
new file mode 100644
index 0000000..95b5f14
--- /dev/null
+++ b/test/Frontend/opencl.cl
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.1
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.2
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL2.0
+// RUN: %clang_cc1 %s -verify -fsyntax-only -fblocks -DBLOCKS
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.1 -fblocks -DBLOCKS
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.2 -fblocks -DBLOCKS
+// RUN: %clang_cc1 %s -triple amdgcn--amdhsa -x c -std=c99 -verify -fsyntax-only
+// RUN: %clang_cc1 -cl-std=CL1.1 -cl-strict-aliasing -fblocks %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID-OPENCL-VERSION11 %s
+// RUN: %clang_cc1 -cl-std=CL1.2 -cl-strict-aliasing -fblocks %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID-OPENCL-VERSION12 %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -cl-strict-aliasing %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID-OPENCL-VERSION20 %s
+
+void f(void (^g)(void)) {
+#ifdef __OPENCL_C_VERSION__
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0 && !defined(BLOCKS)
+  // expected-error@-3{{blocks support disabled - compile with -fblocks or for OpenCL 2.0 or above}}
+#else
+  // expected-no-diagnostics
+#endif
+#else
+  // expected-error@-8{{blocks support disabled - compile with -fblocks or pick a deployment target that supports them}}
+#endif
+}
+
+// CHECK-INVALID-OPENCL-VERSION11: warning: OpenCL version 1.1 does not support the option '-cl-strict-aliasing'
+// CHECK-INVALID-OPENCL-VERSION12: warning: OpenCL version 1.2 does not support the option '-cl-strict-aliasing'
+// CHECK-INVALID-OPENCL-VERSION20: warning: OpenCL version 2.0 does not support the option '-cl-strict-aliasing'
diff --git a/test/Frontend/optimization-remark-analysis.c b/test/Frontend/optimization-remark-analysis.c
index 5b4d9ae..b396327 100644
--- a/test/Frontend/optimization-remark-analysis.c
+++ b/test/Frontend/optimization-remark-analysis.c
@@ -1,8 +1,8 @@
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s
 
-// RPASS: {{.*}}:21:1: remark: loop not vectorized: loop contains a switch statement
-// CHECK-NOT: {{.*}}:21:1: remark: loop not vectorized: loop contains a switch statement
+// RPASS: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
+// CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
 
 double foo(int N, int *Array) {
   double v = 0.0;
diff --git a/test/Frontend/optimization-remark-options.c b/test/Frontend/optimization-remark-options.c
index 74fbeaf..a2d717a 100644
--- a/test/Frontend/optimization-remark-options.c
+++ b/test/Frontend/optimization-remark-options.c
@@ -11,7 +11,7 @@
   return v;
 }
 
-// CHECK: {{.*}}:18:13: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
+// CHECK: {{.*}}:17:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
 
 void foo2(int *dw, int *uw, int *A, int *B, int *C, int *D, int N) {
   for (int i = 0; i < N; i++) {
diff --git a/test/Frontend/plugin-annotate-functions.c b/test/Frontend/plugin-annotate-functions.c
new file mode 100644
index 0000000..b8baf7c
--- /dev/null
+++ b/test/Frontend/plugin-annotate-functions.c
@@ -0,0 +1,25 @@
+// RUN: %clang -fplugin=%llvmshlibdir/AnnotateFunctions%pluginext -emit-llvm -DPRAGMA_ON -S %s -o - | FileCheck %s --check-prefix=PRAGMA
+// RUN: %clang -fplugin=%llvmshlibdir/AnnotateFunctions%pluginext -emit-llvm -S %s -o - | FileCheck %s --check-prefix=NOPRAGMA
+// RUN: not %clang -fplugin=%llvmshlibdir/AnnotateFunctions%pluginext -emit-llvm -DBAD_PRAGMA -S %s -o - 2>&1 | FileCheck %s --check-prefix=BADPRAGMA
+// REQUIRES: plugins, examples
+
+#ifdef PRAGMA_ON
+#pragma enable_annotate
+#endif
+
+// BADPRAGMA: warning: extra tokens at end of #pragma directive
+#ifdef BAD_PRAGMA
+#pragma enable_annotate something
+#endif
+
+// PRAGMA: [[STR_VAR:@.+]] = private unnamed_addr constant [19 x i8] c"example_annotation\00"
+// PRAGMA: @llvm.global.annotations = {{.*}}@fn1{{.*}}[[STR_VAR]]{{.*}}@fn2{{.*}}[[STR_VAR]]
+// NOPRAGMA-NOT: [[STR_VAR:@.+]] = private unnamed_addr constant [19 x i8] c"example_annotation\00"
+// NOPRAGMA-NOT: @llvm.global.annotations = {{.*}}@fn1{{.*}}[[STR_VAR]]{{.*}}@fn2{{.*}}[[STR_VAR]]
+void fn1() { }
+void fn2() { }
+
+// BADPRAGMA: error: #pragma enable_annotate not allowed after declarations
+#ifdef BAD_PRAGMA
+#pragma enable_annotate
+#endif
diff --git a/test/Frontend/print-header-includes.c b/test/Frontend/print-header-includes.c
index 3f2b069..045c02b 100644
--- a/test/Frontend/print-header-includes.c
+++ b/test/Frontend/print-header-includes.c
@@ -1,24 +1,24 @@
-// RUN: cd %S
-// RUN: %clang_cc1 -include Inputs/test3.h -E -H -o %t.out %s 2> %t.stderr
+// RUN: %clang_cc1 -I%S -include Inputs/test3.h -E -H -o /dev/null %s 2> %t.stderr
 // RUN: FileCheck < %t.stderr %s
 
 // CHECK-NOT: test3.h
 // CHECK: . {{.*test.h}}
 // CHECK: .. {{.*test2.h}}
 
-// RUN: %clang_cc1 -include Inputs/test3.h -E --show-includes -o %t.out %s > %t.stdout
-// RUN: FileCheck --check-prefix=MS < %t.stdout %s
-// MS-NOT: test3.h
-// MS: Note: including file: {{.*test.h}}
-// MS: Note: including file:  {{.*test2.h}}
+// RUN: %clang_cc1 -I%S -include Inputs/test3.h -E --show-includes -o /dev/null %s | \
+// RUN:     FileCheck --strict-whitespace --check-prefix=MS %s
+// MS-NOT: <command line>
+// MS: Note: including file: {{[^ ]*test3.h}}
+// MS: Note: including file: {{[^ ]*test.h}}
+// MS: Note: including file:  {{[^ ]*test2.h}}
 // MS-NOT: Note
 
 // RUN: echo "fun:foo" > %t.blacklist
-// RUN: %clang_cc1 -fsanitize=address -fdepfile-entry=%t.blacklist -E --show-includes -o %t.out %s > %t.stdout
-// RUN: FileCheck --check-prefix=MS-BLACKLIST < %t.stdout %s
-// MS-BLACKLIST: Note: including file: {{.*\.blacklist}}
-// MS-BLACKLIST: Note: including file: {{.*test.h}}
-// MS-BLACKLIST: Note: including file:  {{.*test2.h}}
+// RUN: %clang_cc1 -I%S -fsanitize=address -fdepfile-entry=%t.blacklist -E --show-includes -o /dev/null %s | \
+// RUN:     FileCheck --strict-whitespace --check-prefix=MS-BLACKLIST %s
+// MS-BLACKLIST: Note: including file: {{[^ ]*\.blacklist}}
+// MS-BLACKLIST: Note: including file: {{[^ ]*test.h}}
+// MS-BLACKLIST: Note: including file:  {{[^ ]*test2.h}}
 // MS-BLACKLIST-NOT: Note
 
 #include "Inputs/test.h"
diff --git a/test/Frontend/std.cl b/test/Frontend/std.cl
deleted file mode 100644
index b811b64..0000000
--- a/test/Frontend/std.cl
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL1.1
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL1.2
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL2.0
-// RUN: not %clang_cc1 %s -fsyntax-only -cl-std=invalid -DINVALID 2>&1 | FileCheck %s
-
-#ifdef INVALID 
-// CHECK: invalid value 'invalid' in '-cl-std=invalid'
-#endif
diff --git a/test/Frontend/stdlang.c b/test/Frontend/stdlang.c
index 71997f1..9c3c307 100644
--- a/test/Frontend/stdlang.c
+++ b/test/Frontend/stdlang.c
@@ -1,6 +1,17 @@
 // RUN: %clang_cc1 -x cuda -std=c++11 -DCUDA %s
-// RUN: %clang_cc1 -x cl -std=c99 -DOPENCL %s
-// expected-no-diagnostics
+// RUN: %clang_cc1 -x cl -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl1.1 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl1.2 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl2.0 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -DOPENCL %s
+// RUN: not %clang_cc1 -x cl -std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
+// RUN: not %clang_cc1 -x cl -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
+// CHECK-C99: error: invalid argument '-std=c99' not allowed with 'OpenCL'
+// CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
 
 #if defined(CUDA)
   __attribute__((device)) void f_device();
diff --git a/test/Headers/float.c b/test/Headers/float.c
new file mode 100644
index 0000000..46e9cc3
--- /dev/null
+++ b/test/Headers/float.c
@@ -0,0 +1,219 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c89 -ffreestanding %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c99 -ffreestanding %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -ffreestanding %s
+// expected-no-diagnostics
+
+/* Basic floating point conformance checks against:
+    - N1570 draft of C11 Std.
+    - N1256 draft of C99 Std.
+    - http://port70.net/~nsz/c/c89/c89-draft.html draft of C89/C90 Std.
+*/
+/*
+    C11,    5.2.4.2.2p11,   pp. 30
+    C99,    5.2.4.2.2p9,    pp. 25
+    C89,    2.2.4.2 
+*/
+#include <float.h>
+
+#ifndef FLT_RADIX
+    #error "Mandatory macro FLT_RADIX is missing."
+#elif   FLT_RADIX < 2
+    #error "Mandatory macro FLT_RADIX is invalid."
+#endif
+
+
+#ifndef FLT_MANT_DIG
+    #error "Mandatory macro FLT_MANT_DIG is missing."
+#elif   FLT_MANT_DIG < 2
+    #error "Mandatory macro FLT_MANT_DIG is invalid."
+#endif
+#ifndef DBL_MANT_DIG
+    #error "Mandatory macro DBL_MANT_DIG is missing."
+#elif   DBL_MANT_DIG < 2
+    #error "Mandatory macro DBL_MANT_DIG is invalid."
+#endif
+#ifndef LDBL_MANT_DIG
+    #error "Mandatory macro LDBL_MANT_DIG is missing."
+#elif   LDBL_MANT_DIG < 2
+    #error "Mandatory macro LDBL_MANT_DIG is invalid."
+#endif
+#if ((FLT_MANT_DIG > DBL_MANT_DIG) || (DBL_MANT_DIG > LDBL_MANT_DIG))
+    #error "Mandatory macros {FLT,DBL,LDBL}_MANT_DIG are invalid."
+#endif
+
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+    #ifndef FLT_DECIMAL_DIG
+        #error "Mandatory macro FLT_DECIMAL_DIG is missing."
+    #elif   FLT_DECIMAL_DIG < 6
+        #error "Mandatory macro FLT_DECIMAL_DIG is invalid."
+    #endif
+    #ifndef DBL_DECIMAL_DIG
+        #error "Mandatory macro DBL_DECIMAL_DIG is missing."
+    #elif   DBL_DECIMAL_DIG < 10
+        #error "Mandatory macro DBL_DECIMAL_DIG is invalid."
+    #endif
+    #ifndef LDBL_DECIMAL_DIG
+        #error "Mandatory macro LDBL_DECIMAL_DIG is missing."
+    #elif   LDBL_DECIMAL_DIG < 10
+        #error "Mandatory macro LDBL_DECIMAL_DIG is invalid."
+    #endif
+    #if ((FLT_DECIMAL_DIG > DBL_DECIMAL_DIG) || (DBL_DECIMAL_DIG > LDBL_DECIMAL_DIG))
+        #error "Mandatory macros {FLT,DBL,LDBL}_DECIMAL_DIG are invalid."
+    #endif
+#else
+    #ifdef FLT_DECIMAL_DIG
+        #error "Macro FLT_DECIMAL_DIG should not be defined."
+    #endif
+    #ifdef DBL_DECIMAL_DIG
+        #error "Macro DBL_DECIMAL_DIG should not be defined."
+    #endif
+    #ifdef LDBL_DECIMAL_DIG
+        #error "Macro LDBL_DECIMAL_DIG should not be defined."
+    #endif
+#endif
+
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+    #ifndef DECIMAL_DIG
+        #error "Mandatory macro DECIMAL_DIG is missing."
+    #elif   DECIMAL_DIG < 10
+        #error "Mandatory macro DECIMAL_DIG is invalid."
+    #endif
+#else
+    #ifdef DECIMAL_DIG
+        #error "Macro DECIMAL_DIG should not be defined."
+    #endif
+#endif
+
+
+#ifndef FLT_DIG
+    #error "Mandatory macro FLT_DIG is missing."
+#elif   FLT_DIG < 6
+    #error "Mandatory macro FLT_DIG is invalid."
+#endif
+#ifndef DBL_DIG
+    #error "Mandatory macro DBL_DIG is missing."
+#elif   DBL_DIG < 10
+    #error "Mandatory macro DBL_DIG is invalid."
+#endif
+#ifndef LDBL_DIG
+    #error "Mandatory macro LDBL_DIG is missing."
+#elif   LDBL_DIG < 10
+    #error "Mandatory macro LDBL_DIG is invalid."
+#endif
+#if ((FLT_DIG > DBL_DIG) || (DBL_DIG > LDBL_DIG))
+    #error "Mandatory macros {FLT,DBL,LDBL}_DIG are invalid."
+#endif
+
+
+#ifndef FLT_MIN_EXP
+    #error "Mandatory macro FLT_MIN_EXP is missing."
+#elif   FLT_MIN_EXP > -1
+    #error "Mandatory macro FLT_MIN_EXP is invalid."
+#endif
+#ifndef DBL_MIN_EXP
+    #error "Mandatory macro DBL_MIN_EXP is missing."
+#elif   DBL_MIN_EXP > -1
+    #error "Mandatory macro DBL_MIN_EXP is invalid."
+#endif
+#ifndef LDBL_MIN_EXP
+    #error "Mandatory macro LDBL_MIN_EXP is missing."
+#elif   LDBL_MIN_EXP > -1
+    #error "Mandatory macro LDBL_MIN_EXP is invalid."
+#endif
+
+
+#ifndef FLT_MIN_10_EXP
+    #error "Mandatory macro FLT_MIN_10_EXP is missing."
+#elif   FLT_MIN_10_EXP > -37
+    #error "Mandatory macro FLT_MIN_10_EXP is invalid."
+#endif
+#ifndef DBL_MIN_10_EXP
+    #error "Mandatory macro DBL_MIN_10_EXP is missing."
+#elif   DBL_MIN_10_EXP > -37
+    #error "Mandatory macro DBL_MIN_10_EXP is invalid."
+#endif
+#ifndef LDBL_MIN_10_EXP
+    #error "Mandatory macro LDBL_MIN_10_EXP is missing."
+#elif   LDBL_MIN_10_EXP > -37
+    #error "Mandatory macro LDBL_MIN_10_EXP is invalid."
+#endif
+
+
+#ifndef FLT_MAX_EXP
+    #error "Mandatory macro FLT_MAX_EXP is missing."
+#elif   FLT_MAX_EXP < 1
+    #error "Mandatory macro FLT_MAX_EXP is invalid."
+#endif
+#ifndef DBL_MAX_EXP
+    #error "Mandatory macro DBL_MAX_EXP is missing."
+#elif   DBL_MAX_EXP < 1
+    #error "Mandatory macro DBL_MAX_EXP is invalid."
+#endif
+#ifndef LDBL_MAX_EXP
+    #error "Mandatory macro LDBL_MAX_EXP is missing."
+#elif   LDBL_MAX_EXP < 1
+    #error "Mandatory macro LDBL_MAX_EXP is invalid."
+#endif
+#if ((FLT_MAX_EXP > DBL_MAX_EXP) || (DBL_MAX_EXP > LDBL_MAX_EXP))
+    #error "Mandatory macros {FLT,DBL,LDBL}_MAX_EXP are invalid."
+#endif
+
+
+#ifndef FLT_MAX_10_EXP
+    #error "Mandatory macro FLT_MAX_10_EXP is missing."
+#elif   FLT_MAX_10_EXP < 37
+    #error "Mandatory macro FLT_MAX_10_EXP is invalid."
+#endif
+#ifndef DBL_MAX_10_EXP
+    #error "Mandatory macro DBL_MAX_10_EXP is missing."
+#elif   DBL_MAX_10_EXP < 37
+    #error "Mandatory macro DBL_MAX_10_EXP is invalid."
+#endif
+#ifndef LDBL_MAX_10_EXP
+    #error "Mandatory macro LDBL_MAX_10_EXP is missing."
+#elif   LDBL_MAX_10_EXP < 37
+    #error "Mandatory macro LDBL_MAX_10_EXP is invalid."
+#endif
+#if ((FLT_MAX_10_EXP > DBL_MAX_10_EXP) || (DBL_MAX_10_EXP > LDBL_MAX_10_EXP))
+    #error "Mandatory macros {FLT,DBL,LDBL}_MAX_10_EXP are invalid."
+#endif
+
+
+/* Internal consistency checks */
+_Static_assert(FLT_RADIX == __FLT_RADIX__, "");
+
+_Static_assert(FLT_MANT_DIG == __FLT_MANT_DIG__, "");
+_Static_assert(DBL_MANT_DIG == __DBL_MANT_DIG__, "");
+_Static_assert(LDBL_MANT_DIG == __LDBL_MANT_DIG__, "");
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+_Static_assert(FLT_DECIMAL_DIG == __FLT_DECIMAL_DIG__, "");
+_Static_assert(DBL_DECIMAL_DIG == __DBL_DECIMAL_DIG__, "");
+_Static_assert(LDBL_DECIMAL_DIG == __LDBL_DECIMAL_DIG__, "");
+#endif
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+_Static_assert(DECIMAL_DIG == __DECIMAL_DIG__, "");
+#endif
+
+_Static_assert(FLT_DIG == __FLT_DIG__, "");
+_Static_assert(DBL_DIG == __DBL_DIG__, "");
+_Static_assert(LDBL_DIG == __LDBL_DIG__, "");
+
+_Static_assert(FLT_MIN_EXP == __FLT_MIN_EXP__, "");
+_Static_assert(DBL_MIN_EXP == __DBL_MIN_EXP__, "");
+_Static_assert(LDBL_MIN_EXP == __LDBL_MIN_EXP__, "");
+
+_Static_assert(FLT_MIN_10_EXP == __FLT_MIN_10_EXP__, "");
+_Static_assert(DBL_MIN_10_EXP == __DBL_MIN_10_EXP__, "");
+_Static_assert(LDBL_MIN_10_EXP == __LDBL_MIN_10_EXP__, "");
+
+_Static_assert(FLT_MAX_EXP == __FLT_MAX_EXP__, "");
+_Static_assert(DBL_MAX_EXP == __DBL_MAX_EXP__, "");
+_Static_assert(LDBL_MAX_EXP == __LDBL_MAX_EXP__, "");
+
+_Static_assert(FLT_MAX_10_EXP == __FLT_MAX_10_EXP__, "");
+_Static_assert(DBL_MAX_10_EXP == __DBL_MAX_10_EXP__, "");
+_Static_assert(LDBL_MAX_10_EXP == __LDBL_MAX_10_EXP__, "");
diff --git a/test/Headers/ms-intrin.cpp b/test/Headers/ms-intrin.cpp
index 9356d21..25c5531 100644
--- a/test/Headers/ms-intrin.cpp
+++ b/test/Headers/ms-intrin.cpp
@@ -20,11 +20,11 @@
 
 // REQUIRES: x86-registered-target
 
-// Intrin.h needs size_t, but -ffreestanding prevents us from getting it from
+// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
 // stddef.h.  Work around it with this typedef.
 typedef __SIZE_TYPE__ size_t;
 
-#include <Intrin.h>
+#include <intrin.h>
 
 // Use some C++ to make sure we closed the extern "C" brackets.
 template <typename T>
@@ -60,4 +60,8 @@
   __readcr3();
   __writecr3(0);
 #endif
+
+#ifdef _M_ARM
+  __dmb(_ARM_BARRIER_ISHST);
+#endif
 }
diff --git a/test/Headers/opencl-c-header.cl b/test/Headers/opencl-c-header.cl
new file mode 100644
index 0000000..3723935
--- /dev/null
+++ b/test/Headers/opencl-c-header.cl
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -finclude-default-header -emit-llvm -o - %s | FileCheck %s
+// CHECK: _Z16convert_char_rtec
+// CHECK-NOT: _Z3ctzc
+// CHECK20: _Z3ctzc
+// CHECK20-NOT: _Z16convert_char_rtec
+// CHECK-MOD: Reading modules
+
+// Test including the default header as a module.
+// The module should be compiled only once and loaded from cache afterwards.
+// Change the directory mode to read only to make sure no new modules are created.
+// Check time report to make sure module is used.
+
+// ===
+// Clear current directory.
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+
+// ===
+// Compile for OpenCL 1.0 for the first time. A module should be generated.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: chmod u-w %t/opencl_c.pcm
+
+// ===
+// Compile for OpenCL 1.0 for the second time. The module should not be re-created.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: chmod u+w %t/opencl_c.pcm
+// RUN: mv %t/opencl_c.pcm %t/1_0.pcm
+
+// ===
+// Compile for OpenCL 2.0 for the first time. The module should change.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: not diff %t/1_0.pcm %t/opencl_c.pcm
+// RUN: chmod u-w %t/opencl_c.pcm
+
+// ===
+// Compile for OpenCL 2.0 for the second time. The module should not change.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+
+// Check cached module works for different OpenCL versions.
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple amdgcn--amdhsa -emit-llvm -o - -cl-std=CL2.0  -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: chmod u-w %t 
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple amdgcn--amdhsa -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: chmod u+w %t
+
+char f(char x) {
+#if __OPENCL_C_VERSION__ != CL_VERSION_2_0
+  return convert_char_rte(x);
+#ifdef NO_HEADER
+  //expected-warning@-2{{implicit declaration of function 'convert_char_rte' is invalid in C99}}
+#endif //NO_HEADER
+
+#else //__OPENCL_C_VERSION__
+  return ctz(x);
+#endif //__OPENCL_C_VERSION__
+}
diff --git a/test/Headers/x86intrin-2.c b/test/Headers/x86intrin-2.c
index f98fdbd..9be8545 100644
--- a/test/Headers/x86intrin-2.c
+++ b/test/Headers/x86intrin-2.c
@@ -72,7 +72,7 @@
   return _mm_cmpeq_epi32_mask(a, b);
 }
 
-__v64qi __attribute__((__target__("avx512bw"))) mm512_setzero_qi_wrap(void) {
+__m512i __attribute__((__target__("avx512bw"))) mm512_setzero_qi_wrap(void) {
   return _mm512_setzero_qi();
 }
 
diff --git a/test/Headers/xmmintrin.c b/test/Headers/xmmintrin.c
index 39743c9..c617504 100644
--- a/test/Headers/xmmintrin.c
+++ b/test/Headers/xmmintrin.c
@@ -7,6 +7,9 @@
 // REQUIRES: x86-registered-target
 #include <xmmintrin.h>
 
+// CHECK: @c = common global i8 0, align 16
+_MM_ALIGN16 char c;
+
 // Make sure the last step of _mm_cvtps_pi16 converts <4 x i32> to <4 x i16> by
 // checking that clang emits PACKSSDW instead of PACKSSWB.
 
@@ -23,3 +26,7 @@
   return _mm_add_sd(__a, __b);
 }
 
+#if __STDC_HOSTED__
+// Make sure stdlib.h symbols are accessible.
+void *p = NULL;
+#endif
diff --git a/test/Index/annotate-tokens.c b/test/Index/annotate-tokens.c
index 2f95ca6..c72e4f7 100644
--- a/test/Index/annotate-tokens.c
+++ b/test/Index/annotate-tokens.c
@@ -80,10 +80,10 @@
 // CHECK: Punctuation: "(" [5:3 - 5:4] CStyleCastExpr=
 // CHECK: Keyword: "void" [5:4 - 5:8] CStyleCastExpr=
 // CHECK: Punctuation: ")" [5:8 - 5:9] CStyleCastExpr=
-// CHECK: Keyword: "sizeof" [5:9 - 5:15] UnexposedExpr=
-// CHECK: Punctuation: "(" [5:15 - 5:16] UnexposedExpr=
+// CHECK: Keyword: "sizeof" [5:9 - 5:15] UnaryExpr=
+// CHECK: Punctuation: "(" [5:15 - 5:16] UnaryExpr=
 // CHECK: Identifier: "T" [5:16 - 5:17] TypeRef=T:1:13
-// CHECK: Punctuation: ")" [5:17 - 5:18] UnexposedExpr=
+// CHECK: Punctuation: ")" [5:17 - 5:18] UnaryExpr=
 // CHECK: Punctuation: ";" [5:18 - 5:19] CompoundStmt=
 // CHECK: Keyword: "struct" [7:3 - 7:9] VarDecl=x:7:12 (Definition)
 // CHECK: Identifier: "X" [7:10 - 7:11] TypeRef=struct X:2:8
diff --git a/test/Index/availability.cpp b/test/Index/availability.cpp
index d6f9038..d8cd3bf 100644
--- a/test/Index/availability.cpp
+++ b/test/Index/availability.cpp
@@ -10,4 +10,4 @@
 // CHECK: FunctionDecl=foo:1:6 (unavailable) [type=void ()] [typekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [isPOD=0]
 // CHECK: StructDecl=Foo:3:8 (Definition) [type=Foo] [typekind=Record] [isPOD=1]
 // CHECK: CXXMethod=foo:4:7 (unavailable) [type=int (){{.*}}] [typekind=FunctionProto] [resulttype=int] [resulttypekind=Int] [isPOD=0]
-// CHECK: CXXConstructor=Foo:5:3 (unavailable) [type=void (){{.*}}] [typekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [isPOD=0]
+// CHECK: CXXConstructor=Foo:5:3 (unavailable) (default constructor) [type=void (){{.*}}] [typekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [isPOD=0]
diff --git a/test/Index/complete-method-decls.m b/test/Index/complete-method-decls.m
index 4113408..8a17142 100644
--- a/test/Index/complete-method-decls.m
+++ b/test/Index/complete-method-decls.m
@@ -195,7 +195,7 @@
 // CHECK-CCG: NotImplemented:{TypedText void} (50)
 // CHECK-CCG: NotImplemented:{TypedText volatile} (50)
 // RUN: c-index-test -code-completion-at=%s:60:24 %s | FileCheck -check-prefix=CHECK-CCF %s
-// RUN: c-index-test -code-completion-at=%s:60:26 %s | FileCheck -check-prefix=CHECK-CCH %s
+// RUN: c-index-test -code-completion-at=%s:60:27 %s | FileCheck -check-prefix=CHECK-CCH %s
 // CHECK-CCH: ObjCInterfaceDecl:{TypedText A} (50)
 // CHECK-CCH: ObjCInterfaceDecl:{TypedText B} (50)
 // CHECK-CCH: NotImplemented:{TypedText bycopy} (40)
diff --git a/test/Index/complete-objc-message-id.m b/test/Index/complete-objc-message-id.m
index 415e0ff..044c828 100644
--- a/test/Index/complete-objc-message-id.m
+++ b/test/Index/complete-objc-message-id.m
@@ -68,7 +68,7 @@
 // CHECK-SELECTOR-PREF: ObjCClassMethodDecl:{ResultType id}{TypedText new} (35)
 // CHECK-SELECTOR-PREF: ObjCClassMethodDecl:{ResultType Class}{TypedText superclass} (35)
 
-// RUN: c-index-test -code-completion-at=%s:46:7 %s | FileCheck -check-prefix=CHECK-INSTANCE-QUAL-ID %s
-// RUN: c-index-test -code-completion-at=%s:47:7 %s | FileCheck -check-prefix=CHECK-INSTANCE-QUAL-ID %s
+// RUN: c-index-test -code-completion-at=%s:46:8 %s | FileCheck -check-prefix=CHECK-INSTANCE-QUAL-ID %s
+// RUN: c-index-test -code-completion-at=%s:47:8 %s | FileCheck -check-prefix=CHECK-INSTANCE-QUAL-ID %s
 // CHECK-INSTANCE-QUAL-ID: ObjCInstanceMethodDecl:{ResultType int}{TypedText P1_method1} (37)
 // CHECK-INSTANCE-QUAL-ID: ObjCInstanceMethodDecl:{ResultType int}{TypedText P2_method1} (35)
diff --git a/test/Index/complete-objc-message.m b/test/Index/complete-objc-message.m
index 193f1f8..e3fce6b 100644
--- a/test/Index/complete-objc-message.m
+++ b/test/Index/complete-objc-message.m
@@ -218,13 +218,13 @@
 // CHECK-CC2-NEXT: Container Kind: ObjCInterfaceDecl
 // CHECK-CC2-NEXT: Container is complete
 // CHECK-CC2-NEXT: Container USR: c:objc(cs)Foo
-// RUN: c-index-test -code-completion-at=%s:61:16 %s | FileCheck -check-prefix=CHECK-CC3 %s
+// RUN: c-index-test -code-completion-at=%s:61:17 %s | FileCheck -check-prefix=CHECK-CC3 %s
 // CHECK-CC3: ObjCClassMethodDecl:{ResultType int}{TypedText MyClassMethod:}{Placeholder (id)}
 // CHECK-CC3: ObjCClassMethodDecl:{ResultType int}{TypedText MyPrivateMethod}
-// RUN: c-index-test -code-completion-at=%s:65:16 %s | FileCheck -check-prefix=CHECK-CC4 %s
+// RUN: c-index-test -code-completion-at=%s:65:17 %s | FileCheck -check-prefix=CHECK-CC4 %s
 // CHECK-CC4: ObjCInstanceMethodDecl:{ResultType int}{TypedText MyInstMethod:}{Placeholder (id)}{HorizontalSpace  }{TypedText second:}{Placeholder (id)}
 // CHECK-CC4: ObjCInstanceMethodDecl:{ResultType int}{TypedText MyPrivateInstMethod}
-// RUN: c-index-test -code-completion-at=%s:74:9 %s | FileCheck -check-prefix=CHECK-CC5 %s
+// RUN: c-index-test -code-completion-at=%s:74:10 %s | FileCheck -check-prefix=CHECK-CC5 %s
 // CHECK-CC5: ObjCInstanceMethodDecl:{ResultType int}{TypedText MyInstMethod:}{Placeholder (id)}{HorizontalSpace  }{TypedText second:}{Placeholder (id)}
 // CHECK-CC5: ObjCInstanceMethodDecl:{ResultType int}{TypedText MySubInstMethod}
 // RUN: c-index-test -code-completion-at=%s:82:8 %s | FileCheck -check-prefix=CHECK-CC6 %s
@@ -311,7 +311,7 @@
 // CHECK-CCI: ObjCInstanceMethodDecl:{ResultType void}{TypedText method1} (37)
 // CHECK-CCI: ObjCInstanceMethodDecl:{ResultType void}{TypedText method2} (35)
 
-// RUN: c-index-test -code-completion-at=%s:150:5 %s | FileCheck -check-prefix=CHECK-REDUNDANT %s
+// RUN: c-index-test -code-completion-at=%s:150:6 %s | FileCheck -check-prefix=CHECK-REDUNDANT %s
 // CHECK-REDUNDANT: ObjCInstanceMethodDecl:{ResultType void}{TypedText method2} (35)
 // CHECK-REDUNDANT-NOT: ObjCInstanceMethodDecl:{ResultType void}{TypedText method2}
 // CHECK-REDUNDANT: ObjCInstanceMethodDecl:{ResultType void}{TypedText method3} (35)
diff --git a/test/Index/complete-recovery.m b/test/Index/complete-recovery.m
index ec5bf8a..bd920eb 100644
--- a/test/Index/complete-recovery.m
+++ b/test/Index/complete-recovery.m
@@ -26,7 +26,7 @@
 // Test case for fix committed in r145441.
 // RUN: env CINDEXTEST_CODE_COMPLETE_PATTERNS=1 c-index-test -code-completion-at=%s:9:20 %s -fms-compatibility | FileCheck -check-prefix=CHECK-CC1 %s
 
-// RUN: env CINDEXTEST_CODE_COMPLETE_PATTERNS=1 c-index-test -code-completion-at=%s:10:24 %s | FileCheck -check-prefix=CHECK-CC2 %s
+// RUN: env CINDEXTEST_CODE_COMPLETE_PATTERNS=1 c-index-test -code-completion-at=%s:10:25 %s | FileCheck -check-prefix=CHECK-CC2 %s
 // CHECK-CC2: NotImplemented:{ResultType char[]}{TypedText @encode}{LeftParen (}{Placeholder type-name}{RightParen )}
 // CHECK-CC2: NotImplemented:{TypedText _Bool}
 // CHECK-CC2: VarDecl:{ResultType A *}{TypedText a}
diff --git a/test/Index/complete-super.m b/test/Index/complete-super.m
index be7edfd..952ffbc 100644
--- a/test/Index/complete-super.m
+++ b/test/Index/complete-super.m
@@ -60,7 +60,7 @@
 // RUN: c-index-test -code-completion-at=%s:20:16 %s | FileCheck -check-prefix=CHECK-ADD-TO %s
 // CHECK-ADD-TO: ObjCInstanceMethodDecl:{ResultType void}{Informative add:}{TypedText to:}{Placeholder b} (20)
 
-// RUN: c-index-test -code-completion-at=%s:24:28 %s | FileCheck -check-prefix=CHECK-SELECTOR-FIRST %s
+// RUN: c-index-test -code-completion-at=%s:24:29 %s | FileCheck -check-prefix=CHECK-SELECTOR-FIRST %s
 // CHECK-SELECTOR-FIRST: ObjCClassMethodDecl:{ResultType void}{Informative select:}{TypedText first:}{Placeholder a}{HorizontalSpace  }{Text second:}{Placeholder b} (20)
 
 // Check "super" completion at the third identifier
@@ -69,7 +69,7 @@
 
 // Check "super" completion with missing '['.
 // RUN: c-index-test -code-completion-at=%s:25:10 %s | FileCheck -check-prefix=CHECK-SELECTOR-SELECTOR %s
-// RUN: c-index-test -code-completion-at=%s:25:28 %s | FileCheck -check-prefix=CHECK-SELECTOR-FIRST %s
+// RUN: c-index-test -code-completion-at=%s:25:29 %s | FileCheck -check-prefix=CHECK-SELECTOR-FIRST %s
 // RUN: c-index-test -code-completion-at=%s:25:37 %s | FileCheck -check-prefix=CHECK-SELECTOR-SECOND %s
 
 // Check "super" completion for a method declared in a category.
diff --git a/test/Index/file-refs.cpp b/test/Index/file-refs.cpp
index a96d27c..c5a728b 100644
--- a/test/Index/file-refs.cpp
+++ b/test/Index/file-refs.cpp
@@ -59,7 +59,7 @@
 // RUN:  -file-refs-at=%s:2:9 \
 // CHECK-NEXT: ClassDecl=C:2:9 (Definition)
 // CHECK-NEXT: ClassDecl=C:2:9 (Definition) =[2:9 - 2:10]
-// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) =[4:5 - 4:6]
+// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) (default constructor) =[4:5 - 4:6]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[9:10 - 9:11]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[10:3 - 10:4]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[15:7 - 15:8]
@@ -69,7 +69,7 @@
 // RUN:  -file-refs-at=%s:16:18 \
 // CHECK-NEXT: CallExpr=C:4:5
 // CHECK-NEXT: ClassDecl=C:2:9 (Definition) =[2:9 - 2:10]
-// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) =[4:5 - 4:6]
+// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) (default constructor) =[4:5 - 4:6]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[9:10 - 9:11]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[10:3 - 10:4]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[15:7 - 15:8]
@@ -91,7 +91,7 @@
 // CHECK-NEXT: CallExpr=S:35:3
 // CHECK-NEXT: StructDecl=S:34:8 (Definition) =[34:8 - 34:9]
 // CHECK-NEXT: CXXConstructor=S:35:3 =[35:3 - 35:4]
-// CHECK-NEXT: CXXConstructor=S:36:3 =[36:3 - 36:4]
+// CHECK-NEXT: CXXConstructor=S:36:3 (default constructor) =[36:3 - 36:4]
 // CHECK-NEXT: TypeRef=struct Test2::S:34:8 =[39:9 - 39:10]
 // CHECK-NEXT: TypeRef=struct Test2::S:34:8 =[43:14 - 43:15]
 
diff --git a/test/Index/get-cursor.cpp b/test/Index/get-cursor.cpp
index 60aab5f..a2c4693 100644
--- a/test/Index/get-cursor.cpp
+++ b/test/Index/get-cursor.cpp
@@ -208,7 +208,7 @@
 // CHECK-TEMPLSPEC: 66:23 ClassDecl=TC:66:23 (Definition) [Specialization of TC:59:7] Extent=[66:1 - 66:31] Spelling=TC ([66:23 - 66:25])
 
 // RUN: c-index-test -cursor-at=%s:69:3 -cursor-at=%s:70:11 -cursor-at=%s:73:6 -cursor-at=%s:74:6 -cursor-at=%s:77:8 -cursor-at=%s:78:8 -cursor-at=%s:79:8 -cursor-at=%s:80:8 -cursor-at=%s:81:8 -cursor-at=%s:82:8 -cursor-at=%s:85:6 -cursor-at=%s:86:6 -cursor-at=%s:87:6 -cursor-at=%s:88:6 -cursor-at=%s:91:5 -cursor-at=%s:92:5 -cursor-at=%s:93:5 -cursor-at=%s:94:5 -cursor-at=%s:95:5 -cursor-at=%s:96:5 -cursor-at=%s:97:5 -cursor-at=%s:98:5 -cursor-at=%s:100:5 -cursor-at=%s:101:5 -cursor-at=%s:104:6 -cursor-at=%s:105:6 -cursor-at=%s:106:6 -cursor-at=%s:107:6 -cursor-at=%s:108:6 -cursor-at=%s:109:6 -cursor-at=%s:110:6 -cursor-at=%s:111:6 -cursor-at=%s:113:6 -cursor-at=%s:114:6 -cursor-at=%s:117:8 -cursor-at=%s:118:8 -cursor-at=%s:120:8 -cursor-at=%s:121:8 -cursor-at=%s:122:8 -cursor-at=%s:123:8 -cursor-at=%s:124:8 -cursor-at=%s:125:8 -cursor-at=%s:128:6 -cursor-at=%s:129:6 -cursor-at=%s:130:6 -cursor-at=%s:132:3 -std=c++11 %s | FileCheck -check-prefix=CHECK-SPELLING %s
-// CHECK-SPELLING: 69:3 CXXConstructor=A:69:3 Extent=[69:3 - 69:6] Spelling=A ([69:3 - 69:4])
+// CHECK-SPELLING: 69:3 CXXConstructor=A:69:3 (default constructor) Extent=[69:3 - 69:6] Spelling=A ([69:3 - 69:4])
 // CHECK-SPELLING: 70:11 CXXDestructor=~A:70:11 (virtual) Extent=[70:3 - 70:15] Spelling=~A ([70:11 - 70:13])
 // CHECK-SPELLING: 73:6 CXXMethod=operator=:73:6 Extent=[73:3 - 73:25] Spelling=operator= ([73:6 - 73:15])
 // CHECK-SPELLING: 74:6 CXXMethod=operator=:74:6 Extent=[74:3 - 74:29] Spelling=operator= ([74:6 - 74:15])
diff --git a/test/Index/index-file.cpp b/test/Index/index-file.cpp
index f1ae68a..f2dbabb 100644
--- a/test/Index/index-file.cpp
+++ b/test/Index/index-file.cpp
@@ -27,7 +27,18 @@
 class B {
   mutable int x_;
   int y_;
+
+  B() = default;
+  B(int);
+  explicit B(double);
+  B(const B&);
+  B(B&&);
 };
+
+class C {
+  explicit C(const C&);
+};
+
 // RUN: c-index-test -index-file %s > %t
 // RUN: FileCheck %s -input-file=%t
 
@@ -37,3 +48,9 @@
 // CHECK: [indexDeclaration]: kind: c++-instance-method | name: meth | {{.*}} | loc: 23:26
 // CHECK: [indexDeclaration]: kind: field | name: x_ | USR: c:@S@B@FI@x_ | lang: C++ | cursor: FieldDecl=x_:28:15 (Definition) (mutable) | loc: 28:15 | semantic-container: [B:27:7] | lexical-container: [B:27:7] | isRedecl: 0 | isDef: 1 | isContainer: 0 | isImplicit: 0
 // CHECK: [indexDeclaration]: kind: field | name: y_ | USR: c:@S@B@FI@y_ | lang: C++ | cursor: FieldDecl=y_:29:7 (Definition) | loc: 29:7 | semantic-container: [B:27:7] | lexical-container: [B:27:7] | isRedecl: 0 | isDef: 1 | isContainer: 0 | isImplicit: 0
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (default constructor) (defaulted) | loc: 31:3
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (converting constructor) | loc: 32:3
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} | loc: 33:12
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (copy constructor) (converting constructor) | loc: 34:3
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (move constructor) (converting constructor) | loc: 35:3
+// CHECK: [indexDeclaration]: kind: constructor | name: C | {{.*}} (copy constructor) | loc: 39:12
diff --git a/test/Index/index-many-call-ops.cpp b/test/Index/index-many-call-ops.cpp
index b46029c..7644697 100644
--- a/test/Index/index-many-call-ops.cpp
+++ b/test/Index/index-many-call-ops.cpp
@@ -4,8 +4,8 @@
 // Check that we don't get stack overflow trying to index a huge number of
 // call operators.
 
-// ASan and UBSan increase stack usage.
-// REQUIRES: not_asan, not_ubsan
+// UBSan increases stack usage.
+// REQUIRES: not_ubsan
 
 struct S {
   S &operator()();
diff --git a/test/Index/index-many-logical-ops.c b/test/Index/index-many-logical-ops.c
index fd994a2..7940a21 100644
--- a/test/Index/index-many-logical-ops.c
+++ b/test/Index/index-many-logical-ops.c
@@ -4,8 +4,8 @@
 // Check that we don't get stack overflow trying to index a huge number of
 // logical operators.
 
-// ASan and UBSan increase stack usage.
-// REQUIRES: not_asan, not_ubsan
+// UBSan increases stack usage.
+// REQUIRES: not_ubsan
 
 // CHECK: [indexDeclaration]: kind: function | name: foo
 int foo(int x) {
diff --git a/test/Index/keep-going.cpp b/test/Index/keep-going.cpp
index 9bf2394..a25d1c4 100644
--- a/test/Index/keep-going.cpp
+++ b/test/Index/keep-going.cpp
@@ -9,7 +9,7 @@
 
 class C : public A<float> { };
 
-// RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_KEEP_GOING=1 c-index-test -test-print-type %s 2> %t.stderr.txt  | FileCheck %s
+// RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_KEEP_GOING=1 c-index-test -test-print-type %s -std=c++03 2> %t.stderr.txt  | FileCheck %s
 // RUN: FileCheck -check-prefix CHECK-DIAG %s < %t.stderr.txt
 
 // CHECK: inclusion directive=missing1.h ((null)) [type=] [typekind=Invalid] [isPOD=0]
diff --git a/test/Index/load-classes.cpp b/test/Index/load-classes.cpp
index 3b66be5..f527db5 100644
--- a/test/Index/load-classes.cpp
+++ b/test/Index/load-classes.cpp
@@ -25,10 +25,10 @@
 
 // RUN: c-index-test -test-load-source all %s | FileCheck %s
 // CHECK: load-classes.cpp:3:8: StructDecl=X:3:8 (Definition) Extent=[3:1 - 21:2]
-// CHECK: load-classes.cpp:4:3: CXXConstructor=X:4:3 Extent=[4:3 - 4:15] [access=public]
+// CHECK: load-classes.cpp:4:3: CXXConstructor=X:4:3 (converting constructor) Extent=[4:3 - 4:15] [access=public]
 // FIXME: missing TypeRef in the constructor name
 // CHECK: load-classes.cpp:4:9: ParmDecl=value:4:9 (Definition) Extent=[4:5 - 4:14]
-// CHECK: load-classes.cpp:5:3: CXXConstructor=X:5:3 Extent=[5:3 - 5:16] [access=public]
+// CHECK: load-classes.cpp:5:3: CXXConstructor=X:5:3 (copy constructor) (converting constructor) Extent=[5:3 - 5:16] [access=public]
 // FIXME: missing TypeRef in the constructor name
 // CHECK: load-classes.cpp:5:14: ParmDecl=x:5:14 (Definition) Extent=[5:5 - 5:15]
 // CHECK: load-classes.cpp:5:11: TypeRef=struct X:3:8 Extent=[5:11 - 5:12]
@@ -46,7 +46,7 @@
 // CHECK: load-classes.cpp:16:21: TemplateTypeParameter=T:16:21 (Definition) Extent=[16:12 - 16:22] [access=public]
 // CHECK: load-classes.cpp:19:16: CXXMethod=virtualMemberFunction:19:16 (virtual) Extent=[19:3 - 19:39] [access=private]
 // CHECK: load-classes.cpp:20:16: CXXMethod=pureVirtualMemberFunction:20:16 (virtual) (pure) Extent=[20:3 - 20:47] [access=private]
-// CHECK: load-classes.cpp:23:4: CXXConstructor=X:23:4 (Definition) Extent=[23:1 - 24:2] [access=public]
+// CHECK: load-classes.cpp:23:4: CXXConstructor=X:23:4 (Definition) (converting constructor) Extent=[23:1 - 24:2] [access=public]
 // CHECK: load-classes.cpp:23:1: TypeRef=struct X:3:8 Extent=[23:1 - 23:2]
 // CHECK: load-classes.cpp:23:10: ParmDecl=value:23:10 (Definition) Extent=[23:6 - 23:15]
 // CHECK: load-classes.cpp:23:17: CompoundStmt= Extent=[23:17 - 24:2]
diff --git a/test/Index/print-type.c b/test/Index/print-type.c
index 35aab71..ebe4297 100644
--- a/test/Index/print-type.c
+++ b/test/Index/print-type.c
@@ -12,6 +12,9 @@
 
 int f2(int incompletearray[]);
 
+enum Enum{i}; enum Enum elaboratedEnumType();
+struct Struct{}; struct Struct elaboratedStructType();
+
 // RUN: c-index-test -test-print-type %s | FileCheck %s
 // CHECK: FunctionDecl=f:3:6 (Definition) [type=int *(int *, char *, FooType, int *, void (*)(int))] [typekind=FunctionProto] [canonicaltype=int *(int *, char *, int, int *, void (*)(int))] [canonicaltypekind=FunctionProto] [resulttype=int *] [resulttypekind=Pointer] [args= [int *] [Pointer] [char *] [Pointer] [FooType] [Typedef] [int [5]] [ConstantArray] [void (*)(int)] [Pointer]] [isPOD=0]
 // CHECK: ParmDecl=p:3:13 (Definition) [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int]
@@ -45,3 +48,8 @@
 // CHECK: VarDecl=x:10:38 [type=__attribute__((__vector_size__(4 * sizeof(int)))) int] [typekind=Vector] [isPOD=1]
 // CHECK: TypedefDecl=int4_t:11:46 (Definition) [type=int4_t] [typekind=Typedef] [canonicaltype=__attribute__((__vector_size__(4 * sizeof(int)))) int] [canonicaltypekind=Vector] [isPOD=1]
 // CHECK: ParmDecl=incompletearray:13:12 (Definition) [type=int []] [typekind=IncompleteArray] [isPOD=1]
+// CHECK: FunctionDecl=elaboratedEnumType:15:25 [type=enum Enum ()] [typekind=FunctionNoProto] [canonicaltype=enum Enum ()] [canonicaltypekind=FunctionNoProto] [resulttype=enum Enum] [resulttypekind=Elaborated] [isPOD=0]
+// CHECK: TypeRef=enum Enum:15:6 [type=enum Enum] [typekind=Enum] [isPOD=1]
+// CHECK: StructDecl=Struct:16:8 (Definition) [type=struct Struct] [typekind=Record] [isPOD=1]
+// CHECK: FunctionDecl=elaboratedStructType:16:32 [type=struct Struct ()] [typekind=FunctionNoProto] [canonicaltype=struct Struct ()] [canonicaltypekind=FunctionNoProto] [resulttype=struct Struct] [resulttypekind=Elaborated] [isPOD=0]
+// CHECK: TypeRef=struct Struct:16:8 [type=struct Struct] [typekind=Record] [isPOD=1]
diff --git a/test/Index/print-type.cpp b/test/Index/print-type.cpp
index 61135e3..44fc11c 100644
--- a/test/Index/print-type.cpp
+++ b/test/Index/print-type.cpp
@@ -48,7 +48,7 @@
 };
 int Blob::*member_pointer;
 
-
+namespace NS { struct Type{}; } NS::Type elaboratedNamespaceType(const NS::Type t);
 
 auto autoI = 0;
 auto autoTbar = tbar<int>(0);
@@ -68,8 +68,8 @@
 // CHECK: TemplateTemplateParameter=W:8:60 (Definition) [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: Namespace=inner:14:11 (Definition) [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: StructDecl=Bar:16:8 (Definition) [type=outer::inner::Bar] [typekind=Record] [isPOD=0] [nbFields=3]
-// CHECK: CXXConstructor=Bar:17:3 (Definition) [type=void (outer::Foo<bool> *){{.*}}] [typekind=FunctionProto] [canonicaltype=void (outer::Foo<bool> *){{.*}}] [canonicaltypekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [args= [outer::Foo<bool> *] [Pointer]] [isPOD=0]
-// CHECK: ParmDecl=foo:17:25 (Definition) [type=outer::Foo<bool> *] [typekind=Pointer] [canonicaltype=outer::Foo<bool> *] [canonicaltypekind=Pointer] [isPOD=1] [pointeetype=outer::Foo<bool>] [pointeekind=Unexposed]
+// CHECK: CXXConstructor=Bar:17:3 (Definition) (converting constructor) [type=void (outer::Foo<bool> *){{.*}}] [typekind=FunctionProto] [canonicaltype=void (outer::Foo<bool> *){{.*}}] [canonicaltypekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [args= [outer::Foo<bool> *] [Pointer]] [isPOD=0]
+// CHECK: ParmDecl=foo:17:25 (Definition) [type=outer::Foo<bool> *] [typekind=Pointer] [canonicaltype=outer::Foo<bool> *] [canonicaltypekind=Pointer] [isPOD=1] [pointeetype=outer::Foo<bool>] [pointeekind=Elaborated]
 // CHECK: NamespaceRef=outer:1:11 [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: TemplateRef=Foo:4:8 [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: CompoundStmt= [type=] [typekind=Invalid] [isPOD=0]
@@ -127,6 +127,10 @@
 // CHECK: StructDecl=Blob:45:8 (Definition) [type=Blob] [typekind=Record] [isPOD=1] [nbFields=2]
 // CHECK: FieldDecl=i:46:7 (Definition) [type=int] [typekind=Int] [isPOD=1]
 // CHECK: VarDecl=member_pointer:49:12 (Definition) [type=int Blob::*] [typekind=MemberPointer] [isPOD=1]
+// CHECK: FunctionDecl=elaboratedNamespaceType:51:42 [type=NS::Type (const NS::Type)] [typekind=FunctionProto] [canonicaltype=NS::Type (NS::Type)] [canonicaltypekind=FunctionProto] [resulttype=NS::Type] [resulttypekind=Elaborated] [args= [const NS::Type] [Elaborated]] [isPOD=0]
+// CHECK: NamespaceRef=NS:51:11 [type=] [typekind=Invalid] [isPOD=0]
+// CHECK: TypeRef=struct NS::Type:51:23 [type=NS::Type] [typekind=Record] [isPOD=1]
+// CHECK: ParmDecl=t:51:81 (Definition) [type=const NS::Type] [typekind=Elaborated] const [canonicaltype=const NS::Type] [canonicaltypekind=Record] [isPOD=1]
 // CHECK: VarDecl=autoI:53:6 (Definition) [type=int] [typekind=Auto] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1]
 // CHECK: IntegerLiteral= [type=int] [typekind=Int] [isPOD=1]
 // CHECK: VarDecl=autoTbar:54:6 (Definition) [type=int] [typekind=Auto] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1]
diff --git a/test/Index/properties-class-extensions.m b/test/Index/properties-class-extensions.m
index 0fa0ecb..7af6553 100644
--- a/test/Index/properties-class-extensions.m
+++ b/test/Index/properties-class-extensions.m
@@ -70,7 +70,7 @@
 // CHECK-NOT: properties-class-extensions.m:16:25: ObjCInstanceMethodDecl=bar:16:25 Extent=[16:25 - 16:28]
 // CHECK: properties-class-extensions.m:19:26: ObjCInstanceMethodDecl=setBar::19:26 Extent=[19:26 - 19:29]
 // CHECK: properties-class-extensions.m:19:26: ParmDecl=bar:19:26 (Definition) Extent=[19:26 - 19:29]
-// CHECK: properties-class-extensions.m:24:8: ObjCInterfaceDecl=Rdar8467189_Bar:24:8 Extent=[24:1 - 24:23]
+// CHECK-NOT: properties-class-extensions.m:24:8: ObjCInterfaceDecl=Rdar8467189_Bar:24:8
 // CHECK: properties-class-extensions.m:24:8: ObjCClassRef=Rdar8467189_Bar:24:8 Extent=[24:8 - 24:23]
 // CHECK: properties-class-extensions.m:25:11: ObjCProtocolDecl=Rdar8467189_FooProtocol:25:11 (Definition) Extent=[25:1 - 27:5]
 // CHECK: properties-class-extensions.m:26:39: ObjCPropertyDecl=Rdar8467189_Bar:26:39 [readonly,] Extent=[26:1 - 26:54]
diff --git a/test/Index/recursive-cxx-member-calls.cpp b/test/Index/recursive-cxx-member-calls.cpp
index 34a5652..36d617f 100644
--- a/test/Index/recursive-cxx-member-calls.cpp
+++ b/test/Index/recursive-cxx-member-calls.cpp
@@ -1653,7 +1653,7 @@
 // CHECK: 45:58: DeclRefExpr=a:45:28 Extent=[45:58 - 45:59]
 // CHECK: 45:62: DeclRefExpr=b:45:38 Extent=[45:62 - 45:63]
 // CHECK: 46:1: CXXAccessSpecifier=:46:1 (Definition) Extent=[46:1 - 46:8]
-// CHECK: 47:3: CXXConstructor=StringRef:47:3 (Definition) Extent=[47:3 - 47:37]
+// CHECK: 47:3: CXXConstructor=StringRef:47:3 (Definition) (default constructor) Extent=[47:3 - 47:37]
 // CHECK: 47:16: MemberRef=Data:43:15 Extent=[47:16 - 47:20]
 // CHECK: 47:21: UnexposedExpr= Extent=[47:21 - 47:22]
 // CHECK: 47:21: IntegerLiteral= Extent=[47:21 - 47:22]
@@ -1661,7 +1661,7 @@
 // CHECK: 47:32: UnexposedExpr= Extent=[47:32 - 47:33]
 // CHECK: 47:32: IntegerLiteral= Extent=[47:32 - 47:33]
 // CHECK: 47:35: CompoundStmt= Extent=[47:35 - 47:37]
-// CHECK: 48:3: CXXConstructor=StringRef:48:3 (Definition) Extent=[48:3 - 48:71]
+// CHECK: 48:3: CXXConstructor=StringRef:48:3 (Definition) (converting constructor) Extent=[48:3 - 48:71]
 // CHECK: 48:25: ParmDecl=Str:48:25 (Definition) Extent=[48:13 - 48:28]
 // CHECK: 48:32: MemberRef=Data:43:15 Extent=[48:32 - 48:36]
 // CHECK: 48:37: DeclRefExpr=Str:48:25 Extent=[48:37 - 48:40]
@@ -1768,7 +1768,7 @@
 // CHECK: 65:11: Namespace=clang:65:11 (Definition) Extent=[65:1 - 81:2]
 // CHECK: 66:7: ClassDecl=IdentifierInfo:66:7 (Definition) Extent=[66:1 - 80:2]
 // CHECK: 67:1: CXXAccessSpecifier=:67:1 (Definition) Extent=[67:1 - 67:8]
-// CHECK: 67:8: CXXConstructor=IdentifierInfo:67:8 Extent=[67:8 - 67:24]
+// CHECK: 67:8: CXXConstructor=IdentifierInfo:67:8 (default constructor) Extent=[67:8 - 67:24]
 // CHECK: 68:15: CXXMethod=getNameStart:68:15 (Definition) (const) Extent=[68:3 - 71:4] [access=public]
 // CHECK: 68:36: CompoundStmt= Extent=[68:36 - 71:4]
 // CHECK: 69:5: DeclStmt= Extent=[69:5 - 69:65]
diff --git a/test/Index/remap-load.c b/test/Index/remap-load.c
index f433fa7..f886cea 100644
--- a/test/Index/remap-load.c
+++ b/test/Index/remap-load.c
@@ -1,4 +1,4 @@
-// RUN: c-index-test -test-load-source all -remap-file="%s,%S/Inputs/remap-load-to.c" %s | FileCheck -check-prefix=CHECK %s
+// RUN: c-index-test -test-load-source all -remap-file="%s,%S/Inputs/remap-load-to.c" %s | FileCheck %s
 
 // CHECK: remap-load.c:1:5: FunctionDecl=foo:1:5 (Definition) Extent=[1:1 - 3:2]
 // CHECK: remap-load.c:1:13: ParmDecl=parm1:1:13 (Definition) Extent=[1:9 - 1:18]
diff --git a/test/Index/retain-comments-from-system-headers.c b/test/Index/retain-comments-from-system-headers.c
index 490699d..ac4f4fa 100644
--- a/test/Index/retain-comments-from-system-headers.c
+++ b/test/Index/retain-comments-from-system-headers.c
@@ -13,7 +13,7 @@
 // RUN: c-index-test -test-load-source all %s -fretain-comments-from-system-headers -I %S/Inputs | FileCheck %s -check-prefix=CHECK-RETAIN
 
 // Modules:
-// RUN: c-index-test -test-load-source all %s -I %S/Inputs -fmodules -fmodules-cache-path=%t/cache -fmodule-map-file=%S/Inputs/retain-comments-from-system-headers-module.map | FileCheck %s -check-prefix=CHECK
+// RUN: c-index-test -test-load-source all %s -I %S/Inputs -fmodules -fmodules-cache-path=%t/cache -fmodule-map-file=%S/Inputs/retain-comments-from-system-headers-module.map | FileCheck %s
 // RUN: c-index-test -test-load-source all %s -fretain-comments-from-system-headers -I %S/Inputs -fmodules -fmodules-cache-path=%t/cache -fmodule-map-file=%S/Inputs/retain-comments-from-system-headers-module.map | FileCheck %s -check-prefix=CHECK-RETAIN
 
 // CHECK: retain-comments-from-system-headers.h:7:5: FunctionDecl=system_function:7:5 Extent=[7:1 - 7:27]
diff --git a/test/Index/skip-parsed-bodies/compile_commands.json b/test/Index/skip-parsed-bodies/compile_commands.json
index 30ede0d..62303cb 100644
--- a/test/Index/skip-parsed-bodies/compile_commands.json
+++ b/test/Index/skip-parsed-bodies/compile_commands.json
@@ -1,22 +1,21 @@
 [
 {
   "directory": ".",
-  "command": "/usr/bin/clang++ -fsyntax-only t1.cpp",
+  "command": "/usr/bin/clang++ -fsyntax-only -fno-ms-compatibility -fno-delayed-template-parsing t1.cpp",
   "file": "t1.cpp"
 },
 {
   "directory": ".",
-  "command": "/usr/bin/clang++ -fsyntax-only t2.cpp -DBLAH",
+  "command": "/usr/bin/clang++ -fsyntax-only -fno-ms-compatibility -fno-delayed-template-parsing t2.cpp -DBLAH",
   "file": "t2.cpp"
 },
 {
   "directory": ".",
-  "command": "/usr/bin/clang++ -fsyntax-only t3.cpp -DBLAH",
+  "command": "/usr/bin/clang++ -fsyntax-only -fno-ms-compatibility -fno-delayed-template-parsing t3.cpp -DBLAH",
   "file": "t2.cpp"
 }
 ]
 
-// XFAIL: mingw32,win32,windows-gnu
 // RUN: c-index-test -index-compile-db %s | FileCheck %s
 
 // CHECK:      [startedTranslationUnit]
diff --git a/test/Layout/ms-x86-declspec-empty_bases.cpp b/test/Layout/ms-x86-declspec-empty_bases.cpp
new file mode 100644
index 0000000..cc13a98
--- /dev/null
+++ b/test/Layout/ms-x86-declspec-empty_bases.cpp
@@ -0,0 +1,266 @@
+// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN:            | FileCheck %s
+// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN:            | FileCheck %s
+
+namespace test1 {
+
+struct A {
+  int a;
+};
+struct B {
+  int b;
+};
+struct C {};
+struct __declspec(align(16)) D {};
+struct __declspec(empty_bases) X : A, D, B, C {
+};
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::D (empty)
+// CHECK-NEXT:            | [sizeof=16, align=16,
+// CHECK-NEXT:            |  nvsize=0, nvalign=16]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::B
+// CHECK-NEXT:          0 |   int b
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::C (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::X
+// CHECK-NEXT:          0 |   struct test1::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          0 |   struct test1::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test1::C (base) (empty)
+// CHECK-NEXT:          4 |   struct test1::B (base)
+// CHECK-NEXT:          4 |     int b
+// CHECK-NEXT:            | [sizeof=16, align=16,
+// CHECK-NEXT:            |  nvsize=16, nvalign=16]
+
+int _ = sizeof(X);
+}
+
+namespace test2 {
+struct A {
+  int a;
+};
+struct __declspec(empty_bases) B {};
+struct C : A {
+  B b;
+};
+
+struct D {};
+struct E {
+  int e;
+};
+struct F : D, E {};
+
+struct G : C, F {};
+
+int _ = sizeof(G);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::B (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::C
+// CHECK-NEXT:          0 |   struct test2::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          4 |   struct test2::B b (empty)
+// CHECK-NEXT:            | [sizeof=8, align=4,
+// CHECK-NEXT:            |  nvsize=8, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::D (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::E
+// CHECK-NEXT:          0 |   int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::F
+// CHECK-NEXT:          0 |   struct test2::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test2::E (base)
+// CHECK-NEXT:          0 |     int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::G
+// CHECK-NEXT:          0 |   struct test2::C (base)
+// CHECK-NEXT:          0 |     struct test2::A (base)
+// CHECK-NEXT:          0 |       int a
+// CHECK-NEXT:          4 |     struct test2::B b (empty)
+// CHECK-NEXT:          8 |   struct test2::F (base)
+// CHECK-NEXT:          8 |     struct test2::D (base) (empty)
+// CHECK-NEXT:          8 |     struct test2::E (base)
+// CHECK-NEXT:          8 |       int e
+// CHECK-NEXT:            | [sizeof=12, align=4,
+// CHECK-NEXT:            |  nvsize=12, nvalign=4]
+}
+
+namespace test3 {
+struct A {
+  int a;
+};
+struct B {};
+struct C : A {
+  B b;
+};
+
+struct D {};
+struct E {
+  int e;
+};
+struct F : D, E {};
+
+struct __declspec(empty_bases) G : C, F {};
+
+int _ = sizeof(G);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::B (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::C
+// CHECK-NEXT:          0 |   struct test3::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          4 |   struct test3::B b (empty)
+// CHECK-NEXT:            | [sizeof=8, align=4,
+// CHECK-NEXT:            |  nvsize=8, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::D (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::E
+// CHECK-NEXT:          0 |   int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::F
+// CHECK-NEXT:          0 |   struct test3::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test3::E (base)
+// CHECK-NEXT:          0 |     int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::G
+// CHECK-NEXT:          0 |   struct test3::C (base)
+// CHECK-NEXT:          0 |     struct test3::A (base)
+// CHECK-NEXT:          0 |       int a
+// CHECK-NEXT:          4 |     struct test3::B b (empty)
+// CHECK-NEXT:          8 |   struct test3::F (base)
+// CHECK-NEXT:          8 |     struct test3::D (base) (empty)
+// CHECK-NEXT:          8 |     struct test3::E (base)
+// CHECK-NEXT:          8 |       int e
+// CHECK-NEXT:            | [sizeof=12, align=4,
+// CHECK-NEXT:            |  nvsize=12, nvalign=4]
+}
+
+namespace test4 {
+struct A {
+  int a;
+};
+struct B {};
+struct C : A {
+  B b;
+};
+
+struct __declspec(empty_bases) D {};
+struct E {
+  int e;
+};
+struct F : D, E {};
+
+struct G : C, F {};
+
+int _ = sizeof(G);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::B (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::C
+// CHECK-NEXT:          0 |   struct test4::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          4 |   struct test4::B b (empty)
+// CHECK-NEXT:            | [sizeof=8, align=4,
+// CHECK-NEXT:            |  nvsize=8, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::D (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::E
+// CHECK-NEXT:          0 |   int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::F
+// CHECK-NEXT:          0 |   struct test4::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test4::E (base)
+// CHECK-NEXT:          0 |     int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::G
+// CHECK-NEXT:          0 |   struct test4::C (base)
+// CHECK-NEXT:          0 |     struct test4::A (base)
+// CHECK-NEXT:          0 |       int a
+// CHECK-NEXT:          4 |     struct test4::B b (empty)
+// CHECK-NEXT:          8 |   struct test4::F (base)
+// CHECK-NEXT:          8 |     struct test4::D (base) (empty)
+// CHECK-NEXT:          8 |     struct test4::E (base)
+// CHECK-NEXT:          8 |       int e
+// CHECK-NEXT:            | [sizeof=12, align=4,
+// CHECK-NEXT:            |  nvsize=12, nvalign=4]
+}
diff --git a/test/Lexer/Inputs/case-insensitive-include.h b/test/Lexer/Inputs/case-insensitive-include.h
new file mode 100644
index 0000000..954090f
--- /dev/null
+++ b/test/Lexer/Inputs/case-insensitive-include.h
@@ -0,0 +1,8 @@
+#ifndef CASE_INSENSITIVE_INCLUDE_H
+#define CASE_INSENSITIVE_INCLUDE_H
+
+struct S {
+  int x;
+};
+
+#endif
diff --git a/test/Lexer/case-insensitive-include-ms.c b/test/Lexer/case-insensitive-include-ms.c
new file mode 100644
index 0000000..86bd8bb
--- /dev/null
+++ b/test/Lexer/case-insensitive-include-ms.c
@@ -0,0 +1,18 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: mkdir -p %T/apath
+// RUN: cp %S/Inputs/case-insensitive-include.h %T
+// RUN: cd %T
+// RUN: %clang_cc1 -fsyntax-only -fms-compatibility %s -include %s -I %T -verify
+// RUN: %clang_cc1 -fsyntax-only -fms-compatibility -fdiagnostics-parseable-fixits %s -include %s -I %T 2>&1 | FileCheck %s
+
+#include "..\Output\.\case-insensitive-include.h"
+#include "..\Output\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\""
+#include "..\output\.\case-insensitive-include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\""
+
+#include "apath\..\.\case-insensitive-include.h"
+#include "apath\..\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:49}:"\"apath\\..\\.\\case-insensitive-include.h\""
+#include "APath\..\.\case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-(
diff --git a/test/Lexer/case-insensitive-include.c b/test/Lexer/case-insensitive-include.c
new file mode 100644
index 0000000..13e5b59
--- /dev/null
+++ b/test/Lexer/case-insensitive-include.c
@@ -0,0 +1,35 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: mkdir -p %T/apath
+// RUN: mkdir -p %T/asystempath
+// RUN: cp %S/Inputs/case-insensitive-include.h %T
+// RUN: cp %S/Inputs/case-insensitive-include.h %T/asystempath/case-insensitive-include2.h
+// RUN: cd %T
+// RUN: %clang_cc1 -fsyntax-only %s -include %s -I %T -isystem %T/asystempath -verify
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s -include %s -I %T -isystem %T/asystempath 2>&1 | FileCheck %s
+
+// Known standard header, so warn:
+#include <StdDef.h> // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:20}:"<stddef.h>"
+
+#include "case-insensitive-include.h"
+#include "Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:38}:"\"case-insensitive-include.h\""
+
+#include "../Output/./case-insensitive-include.h"
+#include "../Output/./Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"../Output/./case-insensitive-include.h\""
+#include "../output/./case-insensitive-include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"../Output/./case-insensitive-include.h\""
+
+#include "apath/.././case-insensitive-include.h"
+#include "apath/.././Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:49}:"\"apath/.././case-insensitive-include.h\""
+#include "APath/.././case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-(
+
+#include "../Output/./apath/.././case-insensitive-include.h"
+#include "../Output/./APath/.././case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-(
+#include "../output/./apath/.././case-insensitive-include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:61}:"\"../Output/./apath/.././case-insensitive-include.h\""
+
+#include "CASE-INSENSITIVE-INCLUDE2.H" // Found in an -isystem directory. No warning.
diff --git a/test/Lexer/case-insensitive-system-include.c b/test/Lexer/case-insensitive-system-include.c
new file mode 100644
index 0000000..9d5289c
--- /dev/null
+++ b/test/Lexer/case-insensitive-system-include.c
@@ -0,0 +1,10 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: mkdir -p %T/asystempath
+// RUN: cp %S/Inputs/case-insensitive-include.h %T/asystempath/
+// RUN: cd %T
+// RUN: %clang_cc1 -fsyntax-only %s -include %s -isystem %T/asystempath -verify -Wnonportable-system-include-path
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s -include %s -isystem %T/asystempath -Wnonportable-system-include-path 2>&1 | FileCheck %s
+
+#include "CASE-INSENSITIVE-INCLUDE.H" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:38}:"\"case-insensitive-include.h\""
diff --git a/test/Lexer/cxx-features.cpp b/test/Lexer/cxx-features.cpp
index 6c4a092..e047ec3 100644
--- a/test/Lexer/cxx-features.cpp
+++ b/test/Lexer/cxx-features.cpp
@@ -1,132 +1,137 @@
 // RUN: %clang_cc1 -std=c++98 -verify %s
 // RUN: %clang_cc1 -std=c++11 -verify %s
 // RUN: %clang_cc1 -std=c++1y -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++1y -fsized-deallocation -fconcepts-ts -DCONCEPTS_TS=1 -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsized-deallocation -verify %s
+// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -verify %s
+// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -fconcepts-ts -DCONCEPTS_TS=1 -verify %s
 // RUN: %clang_cc1 -fcoroutines -DCOROUTINES -verify %s
 
 // expected-no-diagnostics
 
+// FIXME: `defined` produced by macro expansion inside an #if condition has undefined behavior.
 #if __cplusplus < 201103L
-#define check(macro, cxx98, cxx11, cxx1y) cxx98 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx98
-#elif __cplusplus < 201304L
-#define check(macro, cxx98, cxx11, cxx1y) cxx11 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx11
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx98 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx98
+#elif __cplusplus < 201402L
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx11 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx11
+#elif __cplusplus < 201406L
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx14 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx14
 #else
-#define check(macro, cxx98, cxx11, cxx1y) cxx1y == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx1y
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx1z == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx1z
 #endif
 
-#if check(binary_literals, 0, 0, 201304)
+#if check(binary_literals, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_binary_literals"
 #endif
 
-#if check(digit_separators, 0, 0, 201309)
+#if check(digit_separators, 0, 0, 201309, 201309)
 #error "wrong value for __cpp_digit_separators"
 #endif
 
-#if check(init_captures, 0, 0, 201304)
+#if check(init_captures, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_init_captures"
 #endif
 
-#if check(generic_lambdas, 0, 0, 201304)
+#if check(generic_lambdas, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_generic_lambdas"
 #endif
 
-#if check(sized_deallocation, 0, 0, 201309)
+#if check(sized_deallocation, 0, 0, 201309, 201309)
 #error "wrong value for __cpp_sized_deallocation"
 #endif
 
-#if check(constexpr, 0, 200704, 201304)
+#if check(constexpr, 0, 200704, 201304, 201304)
 #error "wrong value for __cpp_constexpr"
 #endif
 
-#if check(decltype_auto, 0, 0, 201304)
+#if check(decltype_auto, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_decltype_auto"
 #endif
 
-#if check(return_type_deduction, 0, 0, 201304)
+#if check(return_type_deduction, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_return_type_deduction"
 #endif
 
-#if check(runtime_arrays, 0, 0, 0)
+#if check(runtime_arrays, 0, 0, 0, 0)
 #error "wrong value for __cpp_runtime_arrays"
 #endif
 
-#if check(aggregate_nsdmi, 0, 0, 201304)
+#if check(aggregate_nsdmi, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_aggregate_nsdmi"
 #endif
 
-#if check(variable_templates, 0, 0, 201304)
+#if check(variable_templates, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_variable_templates"
 #endif
 
-#if check(unicode_characters, 0, 200704, 200704)
+#if check(unicode_characters, 0, 200704, 200704, 200704)
 #error "wrong value for __cpp_unicode_characters"
 #endif
 
-#if check(raw_strings, 0, 200710, 200710)
+#if check(raw_strings, 0, 200710, 200710, 200710)
 #error "wrong value for __cpp_raw_strings"
 #endif
 
-#if check(unicode_literals, 0, 200710, 200710)
+#if check(unicode_literals, 0, 200710, 200710, 200710)
 #error "wrong value for __cpp_unicode_literals"
 #endif
 
-#if check(user_defined_literals, 0, 200809, 200809)
+#if check(user_defined_literals, 0, 200809, 200809, 200809)
 #error "wrong value for __cpp_user_defined_literals"
 #endif
 
-#if check(lambdas, 0, 200907, 200907)
+#if check(lambdas, 0, 200907, 200907, 200907)
 #error "wrong value for __cpp_lambdas"
 #endif
 
-#if check(range_based_for, 0, 200907, 200907)
+#if check(range_based_for, 0, 200907, 200907, 200907)
 #error "wrong value for __cpp_range_based_for"
 #endif
 
-#if check(static_assert, 0, 200410, 200410)
+#if check(static_assert, 0, 200410, 200410, 200410)
 #error "wrong value for __cpp_static_assert"
 #endif
 
-#if check(decltype, 0, 200707, 200707)
+#if check(decltype, 0, 200707, 200707, 200707)
 #error "wrong value for __cpp_decltype"
 #endif
 
-#if check(attributes, 0, 200809, 200809)
+#if check(attributes, 0, 200809, 200809, 200809)
 #error "wrong value for __cpp_attributes"
 #endif
 
-#if check(rvalue_references, 0, 200610, 200610)
+#if check(rvalue_references, 0, 200610, 200610, 200610)
 #error "wrong value for __cpp_rvalue_references"
 #endif
 
-#if check(variadic_templates, 0, 200704, 200704)
+#if check(variadic_templates, 0, 200704, 200704, 200704)
 #error "wrong value for __cpp_variadic_templates"
 #endif
 
-#if check(initializer_lists, 0, 200806, 200806)
+#if check(initializer_lists, 0, 200806, 200806, 200806)
 #error "wrong value for __cpp_initializer_lists"
 #endif
 
-#if check(delegating_constructors, 0, 200604, 200604)
+#if check(delegating_constructors, 0, 200604, 200604, 200604)
 #error "wrong value for __cpp_delegating_constructors"
 #endif
 
-#if check(nsdmi, 0, 200809, 200809)
+#if check(nsdmi, 0, 200809, 200809, 200809)
 #error "wrong value for __cpp_nsdmi"
 #endif
 
-#if check(inheriting_constructors, 0, 200802, 200802)
+#if check(inheriting_constructors, 0, 200802, 200802, 200802)
 #error "wrong value for __cpp_inheriting_constructors"
 #endif
 
-#if check(ref_qualifiers, 0, 200710, 200710)
+#if check(ref_qualifiers, 0, 200710, 200710, 200710)
 #error "wrong value for __cpp_ref_qualifiers"
 #endif
 
-#if check(alias_templates, 0, 200704, 200704)
+#if check(alias_templates, 0, 200704, 200704, 200704)
 #error "wrong value for __cpp_alias_templates"
 #endif
 
-#if check(experimental_concepts, 0, 0, CONCEPTS_TS)
+#if check(experimental_concepts, 0, 0, CONCEPTS_TS, CONCEPTS_TS)
 #error "wrong value for __cpp_experimental_concepts"
 #endif
 
diff --git a/test/Lexer/cxx1y_digit_separators.cpp b/test/Lexer/cxx1y_digit_separators.cpp
index c4c6aee..5536634 100644
--- a/test/Lexer/cxx1y_digit_separators.cpp
+++ b/test/Lexer/cxx1y_digit_separators.cpp
@@ -48,6 +48,9 @@
   float r = 0.'0e1; // expected-error {{digit separator cannot appear at start of digit sequence}}
   float s = 0.0'e1; // expected-error {{digit separator cannot appear at end of digit sequence}}
   float t = 0.0e'1; // expected-error {{digit separator cannot appear at start of digit sequence}}
+  float u = 0x.'p1f; // expected-error {{hexadecimal floating literal requires a significand}}
+  float v = 0e'f; // expected-error {{exponent has no digits}}
+  float w = 0x0p'f; // expected-error {{exponent has no digits}}
 }
 
 #line 123'456
diff --git a/test/Lexer/half-literal.cpp b/test/Lexer/half-literal.cpp
new file mode 100644
index 0000000..32af3fa
--- /dev/null
+++ b/test/Lexer/half-literal.cpp
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic %s
+float a = 1.0h; // expected-error{{invalid suffix 'h' on floating constant}}
+float b = 1.0H; // expected-error{{invalid suffix 'H' on floating constant}}
diff --git a/test/Lexer/has_feature_efficiency_sanitizer.cpp b/test/Lexer/has_feature_efficiency_sanitizer.cpp
new file mode 100644
index 0000000..ef9e273
--- /dev/null
+++ b/test/Lexer/has_feature_efficiency_sanitizer.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -E -fsanitize=efficiency-cache-frag %s -o - | FileCheck --check-prefix=CHECK-ESAN %s
+// RUN: %clang_cc1 -E -fsanitize=efficiency-working-set %s -o - | FileCheck --check-prefix=CHECK-ESAN %s
+// RUN: %clang_cc1 -E  %s -o - | FileCheck --check-prefix=CHECK-NO-ESAN %s
+
+#if __has_feature(efficiency_sanitizer)
+int EfficiencySanitizerEnabled();
+#else
+int EfficiencySanitizerDisabled();
+#endif
+
+// CHECK-ESAN: EfficiencySanitizerEnabled
+// CHECK-NO-ESAN: EfficiencySanitizerDisabled
diff --git a/test/Lexer/hexfloat.cpp b/test/Lexer/hexfloat.cpp
index 6985c7f..163db72 100644
--- a/test/Lexer/hexfloat.cpp
+++ b/test/Lexer/hexfloat.cpp
@@ -1,15 +1,31 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -pedantic %s
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -pedantic %s
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -pedantic %s
-float f = 0x1p+1; // expected-warning{{hexadecimal floating constants are a C99 feature}}
-double e = 0x.p0; //expected-error{{hexadecimal floating constants require a significand}}
-double d = 0x.2p2; // expected-warning{{hexadecimal floating constants are a C99 feature}}
-float g = 0x1.2p2; // expected-warning{{hexadecimal floating constants are a C99 feature}}
-double h = 0x1.p2; // expected-warning{{hexadecimal floating constants are a C99 feature}}
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify -pedantic %s
+// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify -pedantic %s
+double e = 0x.p0; // expected-error-re {{hexadecimal floating {{constant|literal}} requires a significand}}
+
+float f = 0x1p+1;
+double d = 0x.2p2;
+float g = 0x1.2p2;
+double h = 0x1.p2;
+#if __cplusplus <= 201402L
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+#endif
 
 // PR12717: In order to minimally diverge from the C++ standard, we do not lex
 // 'p[+-]' as part of a pp-number unless the token starts 0x and doesn't contain
 // an underscore.
-double i = 0p+3; // expected-error{{invalid suffix 'p' on integer constant}}
+double i = 0p+3; // expected-error {{invalid suffix 'p' on integer constant}}
 #define PREFIX(x) foo ## x
 double foo0p = 1, j = PREFIX(0p+3); // ok
-double k = 0x42_amp+3; // expected-error-re{{{{invalid suffix '_amp' on integer constant|no matching literal operator for call to 'operator""_amp'}}}}
+double k = 0x42_amp+3;
+#if __cplusplus > 201402L
+// expected-error@-2 {{no matching literal operator for call to 'operator""_amp+3'}}
+#elif __cplusplus >= 201103L
+// expected-error@-4 {{no matching literal operator for call to 'operator""_amp'}}
+#else
+// expected-error@-6 {{invalid suffix '_amp' on integer constant}}
+#endif
diff --git a/test/Lexer/modules-ts.cpp b/test/Lexer/modules-ts.cpp
new file mode 100644
index 0000000..06be17c
--- /dev/null
+++ b/test/Lexer/modules-ts.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -fsyntax-only %s
+// RUN: %clang_cc1 -fmodules-ts -DMODULES -fsyntax-only %s
+
+#ifdef MODULES
+#define MODULES_KEYWORD(NAME) _Static_assert(!__is_identifier(NAME), #NAME)
+#else
+#define MODULES_KEYWORD(NAME) _Static_assert(__is_identifier(NAME), #NAME)
+#endif
+
+MODULES_KEYWORD(import);
+MODULES_KEYWORD(module);
diff --git a/test/Lexer/opencl-half-literal.cl b/test/Lexer/opencl-half-literal.cl
new file mode 100644
index 0000000..42ca514
--- /dev/null
+++ b/test/Lexer/opencl-half-literal.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple spir-unknown-unknown
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+constant half a = 1.0h; 
+constant half aa = 1.0H;
+constant half b = 1.0hh; // expected-error{{invalid suffix 'hh' on floating constant}}
+constant half c = 1.0fh; // expected-error{{invalid suffix 'fh' on floating constant}}
+constant half d = 1.0lh; // expected-error{{invalid suffix 'lh' on floating constant}}
+constant half e = 1.0hf; // expected-error{{invalid suffix 'hf' on floating constant}}
diff --git a/test/Makefile b/test/Makefile
deleted file mode 100644
index 5cb8a8b..0000000
--- a/test/Makefile
+++ /dev/null
@@ -1,77 +0,0 @@
-CLANG_LEVEL := ..
-include $(CLANG_LEVEL)/Makefile
-
-# Test in all immediate subdirectories if unset.
-ifdef TESTSUITE
-TESTDIRS := $(TESTSUITE:%=$(PROJ_SRC_DIR)/%)
-else
-TESTDIRS ?= $(PROJ_SRC_DIR)
-endif
-
-# 'lit' wants objdir paths, so it will pick up the lit.site.cfg.
-TESTDIRS := $(TESTDIRS:$(PROJ_SRC_DIR)%=$(PROJ_OBJ_DIR)%)
-
-# Allow EXTRA_TESTDIRS to provide additional test directories.
-TESTDIRS += $(EXTRA_TESTDIRS)
-
-ifndef TESTARGS
-ifdef VERBOSE
-TESTARGS = -v
-else
-TESTARGS = -s -v
-endif
-endif
-
-# Make sure any extra test suites can find the main site config.
-LIT_ARGS := --param clang_site_config=$(PROJ_OBJ_DIR)/lit.site.cfg
-
-ifdef VG
-  LIT_ARGS += "--vg"
-endif
-
-all:: lit.site.cfg Unit/lit.site.cfg
-	@ echo '--- Running clang tests for $(TARGET_TRIPLE) ---'
-	@ $(PYTHON) $(LLVM_SRC_ROOT)/utils/lit/lit.py \
-	  $(LIT_ARGS) $(TESTARGS) $(TESTDIRS)
-
-FORCE:
-
-lit.site.cfg: FORCE
-	@echo "Making Clang 'lit.site.cfg' file..."
-	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > lit.tmp
-	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp
-	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@LLVM_LIBS_DIR@=$(LibDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@CLANG_SOURCE_DIR@=$(PROJ_SRC_DIR)/..=g >> lit.tmp
-	@$(ECHOPATH) s=@CLANG_BINARY_DIR@=$(PROJ_OBJ_DIR)/..=g >> lit.tmp
-	@$(ECHOPATH) s=@CLANG_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g >> lit.tmp
-	@$(ECHOPATH) s=@LLVM_HOST_TRIPLE@=$(HOST_TRIPLE)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_CLANG_ARCMT@=$(ENABLE_CLANG_ARCMT)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_CLANG_STATIC_ANALYZER@=$(ENABLE_CLANG_STATIC_ANALYZER)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_CLANG_EXAMPLES@=$(ENABLE_CLANG_EXAMPLES)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
-	@sed -f lit.tmp $(PROJ_SRC_DIR)/lit.site.cfg.in > $@
-	@-rm -f lit.tmp
-
-Unit/lit.site.cfg: FORCE
-	@echo "Making Clang 'Unit/lit.site.cfg' file..."
-	@$(MKDIR) $(dir $@)
-	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > unit.tmp
-	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> unit.tmp
-	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> unit.tmp
-	@$(ECHOPATH) s=@LLVM_LIBS_DIR@=$(LibDir)=g >> unit.tmp
-	@$(ECHOPATH) s=@CLANG_SOURCE_DIR@=$(PROJ_SRC_DIR)/..=g >> unit.tmp
-	@$(ECHOPATH) s=@CLANG_BINARY_DIR@=$(PROJ_OBJ_DIR)/..=g >> unit.tmp
-	@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g >> unit.tmp
-	@$(ECHOPATH) s=@LLVM_BUILD_MODE@=$(BuildMode)=g >> unit.tmp
-	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> unit.tmp
-	@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> unit.tmp
-	@$(ECHOPATH) s=@SHLIBPATH_VAR@=$(SHLIBPATH_VAR)=g >> unit.tmp
-	@sed -f unit.tmp $(PROJ_SRC_DIR)/Unit/lit.site.cfg.in > $@
-	@-rm -f unit.tmp
-
-clean::
-	@ find . -name Output | xargs rm -fr
-
-.PHONY: all report clean
diff --git a/test/Misc/amdgcn.languageOptsOpenCL.cl b/test/Misc/amdgcn.languageOptsOpenCL.cl
new file mode 100644
index 0000000..3befefd
--- /dev/null
+++ b/test/Misc/amdgcn.languageOptsOpenCL.cl
@@ -0,0 +1,223 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifndef cl_khr_fp16
+#error "Missing cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+
+#ifndef cl_khr_int64_base_atomics
+#error "Missing cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+#ifndef cl_khr_int64_extended_atomics
+#error "Missing cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+
+#ifdef cl_khr_gl_sharing
+#error "Incorrect cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_sharing' - ignoring}}
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifdef cl_khr_select_fprounding_mode
+#error "Incorrect cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_select_fprounding_mode' - ignoring}}
+
+
+// Core feature in CL 1.2
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+
+// Core feature in CL 2.0
+#ifndef cl_khr_3d_image_writes
+#error "Missing cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+#if (__OPENCL_C_VERSION__ >= 200) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_3d_image_writes' is core feature or supported optional core feature - ignoring}}
+#endif
+
+
+
+#ifdef cl_khr_gl_event
+#error "Incorrect cl_khr_gl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+
+#ifdef cl_khr_d3d10_sharing
+#error "Incorrect cl_khr_d3d10_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+
+#ifdef cl_khr_context_abort
+#error "Incorrect cl_khr_context_abort define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+
+#ifdef cl_khr_d3d11_sharing
+#error "Incorrect cl_khr_d3d11_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+
+#ifdef cl_khr_dx9_media_sharing
+#error "Incorrect cl_khr_dx9_media_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+
+#ifdef cl_khr_image2d_from_buffer
+#error "Incorrect cl_khr_image2d_from_buffer define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+
+#ifdef cl_khr_initialize_memory
+#error "Incorrect cl_khr_initialize_memory define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+
+#ifdef cl_khr_gl_depth_images
+#error "Incorrect cl_khr_gl_depth_images define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+
+#ifdef cl_khr_gl_msaa_sharing
+#error "Incorrect cl_khr_gl_msaa_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+
+#ifdef cl_khr_spir
+#error "Incorrect cl_khr_spir define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+
+#ifdef cl_khr_egl_event
+#error "Incorrect cl_khr_egl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+
+#ifdef cl_khr_egl_image
+#error "Incorrect cl_khr_egl_image define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_mipmap_image
+#error "Missing cl_khr_mipmap_image define"
+#endif
+#else
+#ifdef cl_khr_mipmap_image
+#error "Incorrect cl_khr_mipmap_image define"
+#endif
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_mipmap_image' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable
+
+#ifdef cl_khr_srgb_image_writes
+#error "Incorrect cl_khr_srgb_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+
+#ifdef cl_khr_subgroups
+#error "Incorrect cl_khr_subgroups define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+
+#ifdef cl_khr_terminate_context
+#error "Incorrect cl_khr_terminate_context define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
+
+#ifndef cl_amd_media_ops
+#error "Missing cl_amd_media_ops define"
+#endif
+#pragma OPENCL EXTENSION cl_amd_media_ops: enable
+
+#ifndef cl_amd_media_ops2
+#error "Missing cl_amd_media_ops2 define"
+#endif
+#pragma OPENCL EXTENSION cl_amd_media_ops2: enable
+
diff --git a/test/Misc/ast-dump-color.cpp b/test/Misc/ast-dump-color.cpp
index e93274e..852d689 100644
--- a/test/Misc/ast-dump-color.cpp
+++ b/test/Misc/ast-dump-color.cpp
@@ -34,7 +34,7 @@
 //CHECK: {{^}}[[Blue]]|-[[RESET]][[GREEN]]TypedefDecl[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]<invalid sloc>[[RESET]]> [[Yellow]]<invalid sloc>[[RESET]] implicit[[CYAN]] __uint128_t[[RESET]] [[Green]]'unsigned __int128'[[RESET]]{{$}}
 //CHECK: {{^}}[[Blue]]|-[[RESET]][[GREEN]]TypedefDecl[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]<invalid sloc>[[RESET]]> [[Yellow]]<invalid sloc>[[RESET]] implicit[[CYAN]] __builtin_va_list[[RESET]] [[Green]]'struct __va_list_tag [1]'[[RESET]]{{$}}
 //CHECK: {{^}}[[Blue]]|-[[RESET]][[GREEN]]VarDecl[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]{{.*}}ast-dump-color.cpp:6:1[[RESET]], [[Yellow]]col:5[[RESET]]> [[Yellow]]col:5[[RESET]][[CYAN]] Test[[RESET]] [[Green]]'int'[[RESET]]
-//CHECK: {{^}}[[Blue]]| |-[[RESET]][[BLUE:.\[0;1;34m]]UnusedAttr[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]col:25[[RESET]]>{{$}}
+//CHECK: {{^}}[[Blue]]| |-[[RESET]][[BLUE:.\[0;1;34m]]UnusedAttr[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]col:25[[RESET]]> unused{{$}}
 //CHECK: {{^}}[[Blue]]| `-[[RESET]][[Blue]]FullComment[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]line:4:4[[RESET]], [[Yellow]]line:5:8[[RESET]]>{{$}}
 //CHECK: {{^}}[[Blue]]|   `-[[RESET]][[Blue]]ParagraphComment[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]line:4:4[[RESET]], [[Yellow]]line:5:8[[RESET]]>{{$}}
 //CHECK: {{^}}[[Blue]]|     |-[[RESET]][[Blue]]TextComment[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]line:4:4[[RESET]]> Text=" "{{$}}
diff --git a/test/Misc/ast-dump-decl.mm b/test/Misc/ast-dump-decl.mm
index 06ab515..be245f7 100644
--- a/test/Misc/ast-dump-decl.mm
+++ b/test/Misc/ast-dump-decl.mm
@@ -21,3 +21,13 @@
 // CHECK-NEXT:     CXXConstructExpr
 // CHECK-NEXT:   ObjCIvarDecl{{.*}} X
 // CHECK-NEXT:   ObjCMethodDecl{{.*}} foo
+
+// @() boxing expressions.
+template <typename T>
+struct BoxingTest {
+  static id box(T value) {
+    return @(value);
+  }
+};
+
+// CHECK: ObjCBoxedExpr{{.*}} '<dependent type>'{{$}}
diff --git a/test/Misc/ast-dump-invalid.cpp b/test/Misc/ast-dump-invalid.cpp
index 7b02ba1..aa6cd52 100644
--- a/test/Misc/ast-dump-invalid.cpp
+++ b/test/Misc/ast-dump-invalid.cpp
@@ -34,6 +34,7 @@
 // CHECK-NEXT:   `-CompoundStmt
 // CHECK-NEXT:     `-IfStmt {{.*}} <line:25:3, line:28:12>
 // CHECK-NEXT:       |-<<<NULL>>>
+// CHECK-NEXT:       |-<<<NULL>>>
 // CHECK-NEXT:       |-OpaqueValueExpr {{.*}} <<invalid sloc>> '_Bool'
 // CHECK-NEXT:       |-ReturnStmt {{.*}} <line:26:5, col:12>
 // CHECK-NEXT:       | `-IntegerLiteral {{.*}} <col:12> 'int' 4
@@ -41,3 +42,23 @@
 // CHECK-NEXT:         `-ImplicitCastExpr {{.*}} <col:12> 'int' <LValueToRValue>
 // CHECK-NEXT:           `-DeclRefExpr {{.*}} <col:12> 'int' lvalue ParmVar {{.*}} 'i' 'int'
 
+namespace TestInvalidFunctionDecl {
+struct Str {
+   double foo1(double, invalid_type);
+};
+double Str::foo1(double, invalid_type)
+{ return 45; }
+}
+// CHECK: NamespaceDecl {{.*}} <{{.*}}> {{.*}} TestInvalidFunctionDecl
+// CHECK-NEXT: |-CXXRecordDecl {{.*}} <line:46:1, line:48:1> line:46:8 struct Str definition
+// CHECK-NEXT: | |-CXXRecordDecl {{.*}} <col:1, col:8> col:8 implicit struct Str
+// CHECK-NEXT: | `-CXXMethodDecl {{.*}} <line:47:4, col:36> col:11 invalid foo1 'double (double, int)'
+// CHECK-NEXT: |   |-ParmVarDecl {{.*}} <col:16> col:22 'double'
+// CHECK-NEXT: |   `-ParmVarDecl {{.*}} <col:24, <invalid sloc>> col:36 invalid 'int'
+// CHECK-NEXT: `-CXXMethodDecl {{.*}} parent {{.*}} <line:49:1, line:50:14> line:49:13 invalid foo1 'double (double, int)'
+// CHECK-NEXT:   |-ParmVarDecl {{.*}} <col:18> col:24 'double'
+// CHECK-NEXT:   |-ParmVarDecl {{.*}} <col:26, <invalid sloc>> col:38 invalid 'int'
+// CHECK-NEXT:   `-CompoundStmt {{.*}} <line:50:1, col:14>
+// CHECK-NEXT:     `-ReturnStmt {{.*}} <col:3, col:10>
+// CHECK-NEXT:       `-ImplicitCastExpr {{.*}} <col:10> 'double' <IntegralToFloating>
+// CHECK-NEXT:         `-IntegerLiteral {{.*}} <col:10> 'int' 45
diff --git a/test/Misc/ast-dump-pipe.cl b/test/Misc/ast-dump-pipe.cl
new file mode 100644
index 0000000..1690e5c
--- /dev/null
+++ b/test/Misc/ast-dump-pipe.cl
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -triple spir64 -cl-std=CL2.0 -ast-dump -ast-dump-filter pipetype %s | FileCheck -strict-whitespace %s
+typedef pipe int pipetype;
+// CHECK:      PipeType {{.*}} 'pipe int'
+// CHECK-NEXT:   BuiltinType {{.*}} 'int'
diff --git a/test/Misc/ast-print-char-literal.cpp b/test/Misc/ast-print-char-literal.cpp
index bb5daa2..614b3ca 100644
--- a/test/Misc/ast-print-char-literal.cpp
+++ b/test/Misc/ast-print-char-literal.cpp
@@ -13,6 +13,8 @@
   h<u8'2'>();
 }
 
+char j = '\xFF';
+
 // CHECK: char c = u8'1';
 // CHECK-NEXT: char d = '1';
 // CHECK-NEXT: char e = U'1';
@@ -22,3 +24,4 @@
 // CHECK: template <char c = u8'1'>
 
 // CHECK: h<u8'2'>();
+// CHECK: char j = '\xff';
diff --git a/test/Misc/ast-print-pragmas.cpp b/test/Misc/ast-print-pragmas.cpp
index c4fe1e2..5840c1a 100644
--- a/test/Misc/ast-print-pragmas.cpp
+++ b/test/Misc/ast-print-pragmas.cpp
@@ -19,7 +19,9 @@
 
 // CHECK: #pragma clang loop interleave(disable)
 // CHECK-NEXT: #pragma clang loop vectorize(enable)
+// CHECK-NEXT: #pragma clang loop distribute(disable)
 
+#pragma clang loop distribute(disable)
 #pragma clang loop vectorize(enable)
 #pragma clang loop interleave(disable)
 // CHECK-NEXT: while (i - 1 < Length)
@@ -30,7 +32,9 @@
 
 // CHECK: #pragma clang loop interleave(enable)
 // CHECK-NEXT: #pragma clang loop vectorize(disable)
+// CHECK-NEXT: #pragma clang loop distribute(enable)
 
+#pragma clang loop distribute(enable)
 #pragma clang loop vectorize(disable)
 #pragma clang loop interleave(enable)
 // CHECK-NEXT: while (i - 2 < Length)
diff --git a/test/Misc/backend-optimization-failure-nodbg.cpp b/test/Misc/backend-optimization-failure-nodbg.cpp
index 3c32646..1e84718 100644
--- a/test/Misc/backend-optimization-failure-nodbg.cpp
+++ b/test/Misc/backend-optimization-failure-nodbg.cpp
@@ -4,7 +4,7 @@
 // Test verifies optimization failures generated by the backend are handled
 // correctly by clang. LLVM tests verify all of the failure conditions.
 
-void test_switch(int *A, int *B, int Length) {
+void test_switch(int *A, int *B, int Length) { /* expected-warning {{loop not vectorized: failed explicitly specified loop vectorization}} */
 #pragma clang loop vectorize(enable) unroll(disable)
   for (int i = 0; i < Length; i++) {
     switch (A[i]) {
@@ -18,4 +18,4 @@
       B[i] = 3;
     }
   }
-/* expected-warning {{loop not vectorized: failed explicitly specified loop vectorization}} */ }
+}
diff --git a/test/Misc/backend-optimization-failure.cpp b/test/Misc/backend-optimization-failure.cpp
index c0f3bf4..bb50e96 100644
--- a/test/Misc/backend-optimization-failure.cpp
+++ b/test/Misc/backend-optimization-failure.cpp
@@ -7,7 +7,7 @@
 void test_switch(int *A, int *B, int Length) {
 #pragma clang loop vectorize(enable) unroll(disable)
   for (int i = 0; i < Length; i++) {
-/* expected-warning {{loop not vectorized: failed explicitly specified loop vectorization}} */ switch (A[i]) {
+/* expected-warning@-1 {{loop not vectorized: failed explicitly specified loop vectorization}} */ switch (A[i]) {
     case 0:
       B[i] = 1;
       break;
diff --git a/test/Misc/backend-resource-limit-diagnostics.cl b/test/Misc/backend-resource-limit-diagnostics.cl
new file mode 100644
index 0000000..6e7619b
--- /dev/null
+++ b/test/Misc/backend-resource-limit-diagnostics.cl
@@ -0,0 +1,9 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: not %clang_cc1 -emit-codegen-only -triple=amdgcn-- %s 2>&1 | FileCheck %s
+
+// CHECK: error: local memory limit exceeded (480000) in use_huge_lds
+kernel void use_huge_lds()
+{
+    volatile local int huge[120000];
+    huge[0] = 2;
+}
diff --git a/test/Misc/diag-format.c b/test/Misc/diag-format.c
index 8e30cf7..d34d25a 100644
--- a/test/Misc/diag-format.c
+++ b/test/Misc/diag-format.c
@@ -4,27 +4,27 @@
 //
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1300  %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fms-compatibility-version=13.00  %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
-// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc  %s 2>&1 | FileCheck %s -check-prefix=MSVC
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1300 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fms-compatibility-version=13.00 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
-// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1300 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1800 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2013
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1900 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2015
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fms-compatibility-version=13.00 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1800 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2013
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1900 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2015
 //
 // RUN: %clang -fsyntax-only -fdiagnostics-format=vi    %s 2>&1 | FileCheck %s -check-prefix=VI
 //
-// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fno-show-column %s 2>&1 | FileCheck %s -check-prefix=MSVC_ORIG
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fno-show-column -fmsc-version=1900 %s 2>&1 | FileCheck %s -check-prefix=MSVC2015_ORIG
 //
 // RUN: %clang -fsyntax-only -fno-show-column %s 2>&1 | FileCheck %s -check-prefix=NO_COLUMN
 //
 // RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fmsc-version=1300 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010-FALLBACK
 // RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fms-compatibility-version=13.00 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010-FALLBACK
-// RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback %s 2>&1 | FileCheck %s -check-prefix=MSVC-FALLBACK
-
-
-
-
+// RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fmsc-version=1800 %s 2>&1 | FileCheck %s -check-prefix=MSVC2013-FALLBACK
+// RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fmsc-version=1900 %s 2>&1 | FileCheck %s -check-prefix=MSVC2015-FALLBACK
 
 
 
@@ -36,10 +36,13 @@
 #endif bad // extension!
 // DEFAULT: {{.*}}:36:8: warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // MSVC2010: {{.*}}(36,7) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
-// MSVC: {{.*}}(36,8) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC2013: {{.*}}(36,8) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC: {{.*}}(36,8){{ ?}}: warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC2015: {{.*}}(36,8): warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // VI: {{.*}} +36:8: warning: extra tokens at end of #endif directive [-Wextra-tokens]
-// MSVC_ORIG: {{.*}}(36) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC2015_ORIG: {{.*}}(36): warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // NO_COLUMN: {{.*}}:36: warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // MSVC2010-FALLBACK: {{.*}}(36,7) : error(clang): extra tokens at end of #endif directive
-// MSVC-FALLBACK: {{.*}}(36,8) : error(clang): extra tokens at end of #endif directive
+// MSVC2013-FALLBACK: {{.*}}(36,8) : error(clang): extra tokens at end of #endif directive
+// MSVC2015-FALLBACK: {{.*}}(36,8): error(clang): extra tokens at end of #endif directive
 int x;
diff --git a/test/Misc/diag-null-bytes-in-line.cpp b/test/Misc/diag-null-bytes-in-line.cpp
new file mode 100644
index 0000000..1eba91f
--- /dev/null
+++ b/test/Misc/diag-null-bytes-in-line.cpp
Binary files differ
diff --git a/test/Misc/diag-template-diffing-color.cpp b/test/Misc/diag-template-diffing-color.cpp
index bf20315..2010344 100644
--- a/test/Misc/diag-template-diffing-color.cpp
+++ b/test/Misc/diag-template-diffing-color.cpp
@@ -34,42 +34,38 @@
 void test16() {
   set16(vector<const vector<int> >());
 }
-// CHECK: {{.*}}candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' to 'vector<vector<[...]>>' for 1st argument
+// CHECK: {{.*}}candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<...>>' to 'vector<vector<...>>' for 1st argument
 // TREE: {{.*}}candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]](no qualifiers){{ ?}}[[RESET]]]{{ ?}}vector<
-// TREE:       [...]>>
+// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]](no qualifiers){{ ?}}[[RESET]]]{{ ?}}vector<...>>
 
 void set17(vector<const vector<int> >) {}
 void test17() {
   set17(vector<vector<int> >());
 }
-// CHECK: candidate function not viable: no known conversion from 'vector<vector<[...]>>' to 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' for 1st argument
+// CHECK: candidate function not viable: no known conversion from 'vector<vector<...>>' to 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<...>>' for 1st argument
 // TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     {{\[}}[[CYAN]](no qualifiers){{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]const[[RESET]]] vector<
-// TREE:       [...]>>
+// TREE:     {{\[}}[[CYAN]](no qualifiers){{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]const[[RESET]]] vector<...>>
 
 void set18(vector<volatile vector<int> >) {}
 void test18() {
   set18(vector<const vector<int> >());
 }
-// CHECK: candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' to 'vector<[[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' for 1st argument
+// CHECK: candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<...>>' to 'vector<[[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<...>>' for 1st argument
 // TREE: no matching function for call to 'set18'
 // TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]volatile[[RESET]]] vector<
-// TREE:       [...]>>
+// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]volatile[[RESET]]] vector<...>>
 
 void set19(vector<const volatile vector<int> >) {}
 void test19() {
   set19(vector<const vector<int> >());
 }
-// CHECK: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<const [[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' for 1st argument
+// CHECK: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<const [[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<...>>' for 1st argument
 // TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     [const != const [[CYAN]]volatile[[RESET]]] vector<
-// TREE:       [...]>>
+// TREE:     [const != const [[CYAN]]volatile[[RESET]]] vector<...>>
 
 namespace default_args {
   template <int x, int y = 1+1, int z = 2>
diff --git a/test/Misc/diag-template-diffing-cxx98.cpp b/test/Misc/diag-template-diffing-cxx98.cpp
index 9fa4612..7b1a08c 100644
--- a/test/Misc/diag-template-diffing-cxx98.cpp
+++ b/test/Misc/diag-template-diffing-cxx98.cpp
@@ -45,5 +45,5 @@
     foo(bar, V);
   }
 
-  // CHECK: candidate template ignored: deduced conflicting types for parameter 'T' ('const vector<[...]>' vs. 'volatile vector<[...]>')
+  // CHECK: candidate template ignored: deduced conflicting types for parameter 'T' ('const vector<...>' vs. 'volatile vector<...>')
 }
diff --git a/test/Misc/diag-template-diffing.cpp b/test/Misc/diag-template-diffing.cpp
index a714fd5..a4f29cc 100644
--- a/test/Misc/diag-template-diffing.cpp
+++ b/test/Misc/diag-template-diffing.cpp
@@ -479,14 +479,13 @@
   set17(vector<const vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set17'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set17'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<int>>' to 'vector<vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set17'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [const != (no qualifiers)] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [const != (no qualifiers)] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set17'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -498,14 +497,13 @@
   set18(vector<vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set18'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<vector<[...]>>' to 'vector<const vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<vector<...>>' to 'vector<const vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set18'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<vector<int>>' to 'vector<const vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set18'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [(no qualifiers) != const] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [(no qualifiers) != const] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set18'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -517,14 +515,13 @@
   set19(vector<const vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set19'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<volatile vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<volatile vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set19'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<int>>' to 'vector<volatile vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set19'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [const != volatile] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [const != volatile] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set19'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -536,14 +533,13 @@
   set20(vector<const vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set20'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<const volatile vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<const volatile vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set20'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<int>>' to 'vector<const volatile vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set20'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [const != const volatile] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [const != const volatile] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set20'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -557,14 +553,13 @@
 int f21(vector<const U21<int>>);
 int k21 = f21(vector<U21<int>>());
 // CHECK-ELIDE-NOTREE: no matching function for call to 'f21'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U21<[...]>>' to 'vector<const U21<[...]>>' for 1st argument 
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U21<...>>' to 'vector<const U21<...>>' for 1st argument 
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'f21'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U21<int>>' to 'vector<const U21<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'f21'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:    vector<
-// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U21<
-// CHECK-ELIDE-TREE:        [...]>>
+// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U21<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'f21'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:    vector<
@@ -577,14 +572,13 @@
 int f22(vector<volatile const U22<int>>);
 int k22 = f22(vector<volatile U22<int>>());
 // CHECK-ELIDE-NOTREE: no matching function for call to 'f22'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U22<[...]>>' to 'vector<const U22<[...]>>' for 1st argument 
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U22<...>>' to 'vector<const U22<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'f22'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U22<int>>' to 'vector<const U22<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'f22'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:    vector<
-// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U22<
-// CHECK-ELIDE-TREE:        [...]>>
+// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U22<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'f22'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:    vector<
@@ -1258,7 +1252,7 @@
 void foo(const T &t) {
   T &t2 = t;
 }
-// CHECK-ELIDE-NOTREE: binding value of type 'const condition<[...]>' to reference to type 'condition<[...]>' drops 'const' qualifier
+// CHECK-ELIDE-NOTREE: binding value of type 'const condition<...>' to reference to type 'condition<...>' drops 'const' qualifier
 }
 
 namespace BoolArgumentBitExtended {
@@ -1390,7 +1384,7 @@
 template <typename SizeType = int, SizeType = 0> struct A {};
 template <typename R = A<>> R bar();
 A<> &foo() { return bar(); }
-// CHECK-ELIDE-NOTREE: error: non-const lvalue reference to type 'A<[2 * ...]>' cannot bind to a temporary of type 'A<[2 * ...]>'
+// CHECK-ELIDE-NOTREE: error: non-const lvalue reference to type 'A<...>' cannot bind to a temporary of type 'A<...>'
 // CHECK-NOELIDE-NOTREE: error: non-const lvalue reference to type 'A<int, 0>' cannot bind to a temporary of type 'A<int, 0>'
 }
 
@@ -1423,8 +1417,8 @@
 // CHECK-ELIDE-NOTREE: error: no viable conversion from 'A<1>' to 'A<(default) 0>'
 // CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<int>' to 'B<(default) ZeroArgs::A<0>>'
 // CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<(default) ZeroArgs::A<0>>' to 'B<int>'
-// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<const A<[...]>>' to 'B<A<[...]>>'
-// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<A<[...]>>' to 'B<const A<[...]>>'
+// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<const A<...>>' to 'B<A<...>>'
+// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<A<...>>' to 'B<const A<...>>'
 }
 
 namespace TypeAlias {
@@ -1461,7 +1455,37 @@
 }
 // CHECK-ELIDE-NOTREE: error: no matching function for call to 'D'
 // CHECK-ELIDE-NOTREE: note: candidate function [with x = TypeAlias::X::X1] not viable: no known conversion from 'VectorType<X::X2>' to 'const VectorType<(TypeAlias::X)0>' for 1st argument
+}
 
+namespace TypeAlias2 {
+template <typename T>
+class A {};
+
+template <typename T>
+using A_reg = A<T>;
+void take_reg(A_reg<int>);
+
+template <typename T>
+using A_ptr = A<T> *;
+void take_ptr(A_ptr<int>);
+
+template <typename T>
+using A_ref = const A<T> &;
+void take_ref(A_ref<int>);
+
+void run(A_reg<float> reg, A_ptr<float> ptr, A_ref<float> ref) {
+  take_reg(reg);
+// CHECK-ELIDE-NOTREE: error: no matching function for call to 'take_reg'
+// CHECK-ELIDE-NOTREE: note:     candidate function not viable: no known conversion from 'A_reg<float>' to 'A_reg<int>' for 1st argument
+
+  take_ptr(ptr);
+// CHECK-ELIDE-NOTREE: error: no matching function for call to 'take_ptr'
+// CHECK-ELIDE-NOTREE: note:     candidate function not viable: no known conversion from 'A_ptr<float>' to 'A_ptr<int>' for 1st argument
+
+  take_ref(ref);
+// CHECK-ELIDE-NOTREE: error: no matching function for call to 'take_ref'
+// CHECK-ELIDE-NOTREE: note: candidate function not viable: no known conversion from 'const A<float>' to 'const A<int>' for 1st argument
+}
 }
 
 // CHECK-ELIDE-NOTREE: {{[0-9]*}} errors generated.
diff --git a/test/Misc/languageOptsOpenCL.cl b/test/Misc/languageOptsOpenCL.cl
index 82a8f36..9651f01 100644
--- a/test/Misc/languageOptsOpenCL.cl
+++ b/test/Misc/languageOptsOpenCL.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x cl %s -verify
+// RUN: %clang_cc1 -x cl %s -verify -triple spir-unknown-unknown
 // expected-no-diagnostics
 
 // Test the forced language options for OpenCL are set correctly.
diff --git a/test/Misc/nvptx.languageOptsOpenCL.cl b/test/Misc/nvptx.languageOptsOpenCL.cl
new file mode 100644
index 0000000..4c7e153
--- /dev/null
+++ b/test/Misc/nvptx.languageOptsOpenCL.cl
@@ -0,0 +1,211 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifdef cl_khr_fp16
+#error "Incorrect cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_fp16' - ignoring}}
+
+#ifdef cl_khr_int64_base_atomics
+#error "Incorrect cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_base_atomics' - ignoring}}
+
+#ifdef cl_khr_int64_extended_atomics
+#error "Incorrect cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_extended_atomics' - ignoring}}
+
+#ifndef cl_khr_gl_sharing
+#error "Missing cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#if (__OPENCL_C_VERSION__ < 110)
+// Deprecated above 1.0
+#ifdef cl_khr_select_fprounding_mode
+#error "Incorrect cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_select_fprounding_mode' - ignoring}}
+#endif
+
+
+// Core feature in CL 1.2
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+
+// Core feature in CL 2.0, but not supported on nvptx
+#ifdef cl_khr_3d_image_writes
+#error "Incorrect cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_3d_image_writes' - ignoring}}
+
+
+
+#ifdef cl_khr_gl_event
+#error "Incorrect cl_khr_gl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+
+#ifdef cl_khr_d3d10_sharing
+#error "Incorrect cl_khr_d3d10_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+
+#ifdef cl_khr_context_abort
+#error "Incorrect cl_khr_context_abort define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+
+#ifdef cl_khr_d3d11_sharing
+#error "Incorrect cl_khr_d3d11_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+
+#ifdef cl_khr_dx9_media_sharing
+#error "Incorrect cl_khr_dx9_media_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+
+#ifdef cl_khr_image2d_from_buffer
+#error "Incorrect cl_khr_image2d_from_buffer define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+
+#ifdef cl_khr_initialize_memory
+#error "Incorrect cl_khr_initialize_memory define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+
+#ifdef cl_khr_gl_depth_images
+#error "Incorrect cl_khr_gl_depth_images define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+
+#ifdef cl_khr_gl_msaa_sharing
+#error "Incorrect cl_khr_gl_msaa_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+
+#ifdef cl_khr_spir
+#error "Incorrect cl_khr_spir define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+
+#ifdef cl_khr_egl_event
+#error "Incorrect cl_khr_egl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+
+#ifdef cl_khr_egl_image
+#error "Incorrect cl_khr_egl_image define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+
+#ifdef cl_khr_srgb_image_writes
+#error "Incorrect cl_khr_srgb_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+
+#ifdef cl_khr_subgroups
+#error "Incorrect cl_khr_subgroups define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+
+#ifdef cl_khr_terminate_context
+#error "Incorrect cl_khr_terminate_context define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
diff --git a/test/Misc/r600.languageOptsOpenCL.cl b/test/Misc/r600.languageOptsOpenCL.cl
new file mode 100644
index 0000000..58444cf
--- /dev/null
+++ b/test/Misc/r600.languageOptsOpenCL.cl
@@ -0,0 +1,225 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifdef cl_khr_fp16
+#error "Incorrect cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_fp16' - ignoring}}
+
+#ifdef cl_khr_int64_base_atomics
+#error "Incorrect cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_base_atomics' - ignoring}}
+
+#ifdef cl_khr_int64_extended_atomics
+#error "Incorrect cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_extended_atomics' - ignoring}}
+
+#ifdef cl_khr_gl_sharing
+#error "Incorrect cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_sharing' - ignoring}}
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+// Deprecated above 1.0
+#ifdef cl_khr_select_fprounding_mode
+#error "Incorrect cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_select_fprounding_mode' - ignoring}}
+
+
+// Core feature in CL 1.2
+#ifdef __HAS_FP64__
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+#else
+#ifdef cl_khr_fp64
+#error "Incorrect cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_fp64' - ignoring}}
+#endif // __HAS_FP64__
+
+// Core feature in CL 2.0
+#ifdef cl_khr_3d_image_writes
+#error "Incorrect cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_3d_image_writes' - ignoring}}
+
+
+#ifdef cl_khr_gl_event
+#error "Incorrect cl_khr_gl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+
+#ifdef cl_khr_d3d10_sharing
+#error "Incorrect cl_khr_d3d10_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+
+#ifdef cl_khr_context_abort
+#error "Incorrect cl_khr_context_abort define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+
+#ifdef cl_khr_d3d11_sharing
+#error "Incorrect cl_khr_d3d11_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+
+#ifdef cl_khr_dx9_media_sharing
+#error "Incorrect cl_khr_dx9_media_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+
+#ifdef cl_khr_image2d_from_buffer
+#error "Incorrect cl_khr_image2d_from_buffer define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+
+#ifdef cl_khr_initialize_memory
+#error "Incorrect cl_khr_initialize_memory define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+
+#ifdef cl_khr_gl_depth_images
+#error "Incorrect cl_khr_gl_depth_images define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+
+#ifdef cl_khr_gl_msaa_sharing
+#error "Incorrect cl_khr_gl_msaa_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+
+#ifdef cl_khr_spir
+#error "Incorrect cl_khr_spir define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+
+#ifdef cl_khr_egl_event
+#error "Incorrect cl_khr_egl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+
+#ifdef cl_khr_egl_image
+#error "Incorrect cl_khr_egl_image define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+
+#ifdef cl_khr_srgb_image_writes
+#error "Incorrect cl_khr_srgb_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+
+#ifdef cl_khr_subgroups
+#error "Incorrect cl_khr_subgroups define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+
+#ifdef cl_khr_terminate_context
+#error "Incorrect cl_khr_terminate_context define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
diff --git a/test/Misc/serialized-diags-driver.c b/test/Misc/serialized-diags-driver.c
index ad07d66..617ac8c 100644
--- a/test/Misc/serialized-diags-driver.c
+++ b/test/Misc/serialized-diags-driver.c
@@ -5,10 +5,10 @@
 // doesn't litter the user's system with preprocessed output.
 
 // RUN: rm -f %t
-// RUN: %clang -Wx-unknown-warning -Wall -fsyntax-only --serialize-diagnostics %t.diag %s
+// RUN: %clang -Wx-typoed-warning -Wall -fsyntax-only --serialize-diagnostics %t.diag %s
 // RUN: c-index-test -read-diagnostics %t.diag 2>&1 | FileCheck %s
 
-// CHECK: warning: unknown warning option '-Wx-unknown-warning' [-Wunknown-warning-option] []
+// CHECK: warning: unknown warning option '-Wx-typoed-warning' [-Wunknown-warning-option] []
 
 // CHECK: warning: variable 'voodoo' is uninitialized when used here [-Wuninitialized]
 // CHECK: note: initialize the variable 'voodoo' to silence this warning []
diff --git a/test/Misc/target-parser.c b/test/Misc/target-parser.c
new file mode 100644
index 0000000..fb1c830
--- /dev/null
+++ b/test/Misc/target-parser.c
@@ -0,0 +1,2 @@
+// RUN: not %clang_cc1 -triple armv7--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s
+// CHECK: error: unknown target CPU 'not-a-cpu'
diff --git a/test/Modules/DebugInfoSubmoduleImport.c b/test/Modules/DebugInfoSubmoduleImport.c
index 9fb5d9c..1b31aad 100644
--- a/test/Modules/DebugInfoSubmoduleImport.c
+++ b/test/Modules/DebugInfoSubmoduleImport.c
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fmodule-format=obj -debug-info-kind=limited -dwarf-ext-refs \
 // RUN:     -fimplicit-module-maps -x c -fmodules-cache-path=%t -I %S/Inputs \
-// RUN:     %s -emit-llvm -o - | FileCheck %s
+// RUN:     %s -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s
 #include "DebugSubmoduleA.h"
 #include "DebugSubmoduleB.h"
 
diff --git a/test/Modules/DebugInfoTransitiveImport.m b/test/Modules/DebugInfoTransitiveImport.m
index 206be2e..034a909 100644
--- a/test/Modules/DebugInfoTransitiveImport.m
+++ b/test/Modules/DebugInfoTransitiveImport.m
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fmodule-format=obj -debug-info-kind=limited -dwarf-ext-refs \
 // RUN:     -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs \
-// RUN:     %s -mllvm -debug-only=pchcontainer 2>&1 | FileCheck %s
+// RUN:     %s -mllvm -debug-only=pchcontainer -debugger-tuning=lldb 2>&1 | FileCheck %s
 // REQUIRES: asserts
 
 @import diamond_left;
@@ -20,3 +20,9 @@
 // Skeleton for top:
 // CHECK: !DICompileUnit({{.*}}splitDebugFilename: {{.*}}diamond_top{{.*}}dwoId:
 
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodule-format=obj -debug-info-kind=limited -dwarf-ext-refs \
+// RUN:     -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs \
+// RUN:     %s -mllvm -debug-only=pchcontainer 2>&1 | FileCheck %s --check-prefix=NOIMPORT
+
+// NOIMPORT-NOT: !DIImportedEntity
diff --git a/test/Modules/ExtDebugInfo.cpp b/test/Modules/ExtDebugInfo.cpp
index 89a3c39..ada9d61 100644
--- a/test/Modules/ExtDebugInfo.cpp
+++ b/test/Modules/ExtDebugInfo.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t
-// Test that only forward declarations are emitted for types dfined in modules.
+// Test that only forward declarations are emitted for types defined in modules.
 
 // Modules:
 // RUN: %clang_cc1 -x objective-c++ -std=c++11 -debug-info-kind=standalone \
diff --git a/test/Modules/Inputs/DebugNestedA.h b/test/Modules/Inputs/DebugNestedA.h
new file mode 100644
index 0000000..58dc2a7
--- /dev/null
+++ b/test/Modules/Inputs/DebugNestedA.h
@@ -0,0 +1,8 @@
+/* -*- C++ -*- */
+template <typename T> class Base {};
+template <typename T> struct A : public Base<A<T>> {
+  void f();
+};
+
+class F {};
+typedef A<F> AF;
diff --git a/test/Modules/Inputs/DebugNestedB.h b/test/Modules/Inputs/DebugNestedB.h
new file mode 100644
index 0000000..7f75d094
--- /dev/null
+++ b/test/Modules/Inputs/DebugNestedB.h
@@ -0,0 +1,7 @@
+/* -*- C++ -*- */
+#include "DebugNestedA.h"
+class C {
+  void run(AF &af) {
+    af.f();
+  }
+};
diff --git a/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/Headers/NeedsGNUInlineAsm.h b/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/Headers/NeedsGNUInlineAsm.h
new file mode 100644
index 0000000..7978a76
--- /dev/null
+++ b/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/Headers/NeedsGNUInlineAsm.h
@@ -0,0 +1 @@
+// NeedsGNUInlineAsm.h
diff --git a/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/Headers/asm.h b/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/Headers/asm.h
new file mode 100644
index 0000000..da52f82
--- /dev/null
+++ b/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/Headers/asm.h
@@ -0,0 +1 @@
+__asm("foo");
diff --git a/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/module.map b/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/module.map
new file mode 100644
index 0000000..a953610
--- /dev/null
+++ b/test/Modules/Inputs/GNUAsm/NeedsGNUInlineAsm.framework/module.map
@@ -0,0 +1,8 @@
+framework module NeedsGNUInlineAsm {
+  header "NeedsGNUInlineAsm.h"
+
+  explicit module Asm {
+    requires gnuinlineasm
+    header "asm.h"
+  }
+}
diff --git a/test/Modules/Inputs/PR21547/FirstHeader.h b/test/Modules/Inputs/PR21547/FirstHeader.h
new file mode 100644
index 0000000..a01c607
--- /dev/null
+++ b/test/Modules/Inputs/PR21547/FirstHeader.h
@@ -0,0 +1,13 @@
+template<class Element> struct TMatrixT;
+typedef TMatrixT<double> TMatrixD;
+
+void f(const TMatrixD &m);
+
+template<class Element> struct TMatrixT {
+  template <class Element2> TMatrixT(const TMatrixT<Element2> &);
+  ~TMatrixT() {}
+  void Determinant () { f(*this); }
+};
+
+template struct TMatrixT<float>;
+template struct TMatrixT<double>;
diff --git a/test/Modules/Inputs/PR21547/module.modulemap b/test/Modules/Inputs/PR21547/module.modulemap
new file mode 100644
index 0000000..8ca0643
--- /dev/null
+++ b/test/Modules/Inputs/PR21547/module.modulemap
@@ -0,0 +1,4 @@
+module M {
+  header "FirstHeader.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR26014/A.h b/test/Modules/Inputs/PR26014/A.h
new file mode 100644
index 0000000..49de5ba
--- /dev/null
+++ b/test/Modules/Inputs/PR26014/A.h
@@ -0,0 +1,13 @@
+#ifndef _LIBCPP_TYPE_TRAITS
+#define _LIBCPP_TYPE_TRAITS
+
+
+template <class _Tp>
+struct underlying_type
+{
+    typedef __underlying_type(_Tp) type;
+};
+
+#endif  // _LIBCPP_TYPE_TRAITS
+
+#include "B.h"
diff --git a/test/Modules/Inputs/PR26014/B.h b/test/Modules/Inputs/PR26014/B.h
new file mode 100644
index 0000000..58d1f8f
--- /dev/null
+++ b/test/Modules/Inputs/PR26014/B.h
@@ -0,0 +1,10 @@
+#ifndef _LIBCPP_TYPE_TRAITS
+#define _LIBCPP_TYPE_TRAITS
+
+template <class _Tp>
+struct underlying_type
+{
+    typedef __underlying_type(_Tp) type;
+};
+
+#endif  // _LIBCPP_TYPE_TRAITS
diff --git a/test/Modules/Inputs/PR26014/module.modulemap b/test/Modules/Inputs/PR26014/module.modulemap
new file mode 100644
index 0000000..4937418
--- /dev/null
+++ b/test/Modules/Inputs/PR26014/module.modulemap
@@ -0,0 +1,9 @@
+module A {
+  header "A.h"
+  export *
+}
+
+module B {
+  header "B.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR26179/A.h b/test/Modules/Inputs/PR26179/A.h
new file mode 100644
index 0000000..c264f4c
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/A.h
@@ -0,0 +1,2 @@
+#include "basic_string.h"
+#include "B.h"
diff --git a/test/Modules/Inputs/PR26179/B.h b/test/Modules/Inputs/PR26179/B.h
new file mode 100644
index 0000000..46a109e
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/B.h
@@ -0,0 +1 @@
+#include "basic_string.h"
diff --git a/test/Modules/Inputs/PR26179/basic_string.h b/test/Modules/Inputs/PR26179/basic_string.h
new file mode 100644
index 0000000..653ce07
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/basic_string.h
@@ -0,0 +1,12 @@
+#ifndef _GLIBCXX_STRING
+#define _GLIBCXX_STRING 1
+
+template<typename T>
+struct basic_string {
+  static T _S_empty_rep_storage[];
+};
+
+template<typename T>
+T basic_string<T>::_S_empty_rep_storage[sizeof(T)];
+
+#endif
diff --git a/test/Modules/Inputs/PR26179/module.modulemap b/test/Modules/Inputs/PR26179/module.modulemap
new file mode 100644
index 0000000..4937418
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/module.modulemap
@@ -0,0 +1,9 @@
+module A {
+  header "A.h"
+  export *
+}
+
+module B {
+  header "B.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR27041/Rtypes.h b/test/Modules/Inputs/PR27041/Rtypes.h
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/Rtypes.h
@@ -0,0 +1 @@
+
diff --git a/test/Modules/Inputs/PR27041/TGenericClassInfo.h b/test/Modules/Inputs/PR27041/TGenericClassInfo.h
new file mode 100644
index 0000000..b43b2c9
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/TGenericClassInfo.h
@@ -0,0 +1,3 @@
+namespace std {}
+namespace std { enum float_round_style { denorm_present }; }
+#include "TSchemaHelper.h"
diff --git a/test/Modules/Inputs/PR27041/TSchemaHelper.h b/test/Modules/Inputs/PR27041/TSchemaHelper.h
new file mode 100644
index 0000000..31f726f
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/TSchemaHelper.h
@@ -0,0 +1 @@
+namespace std { enum float_round_style { denorm_present }; }
diff --git a/test/Modules/Inputs/PR27041/module.modulemap b/test/Modules/Inputs/PR27041/module.modulemap
new file mode 100644
index 0000000..f0147cd
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/module.modulemap
@@ -0,0 +1,2 @@
+module "Rtypes.h" { header "Rtypes.h" header "TGenericClassInfo.h" }
+module "TSchemaHelper.h" { header "TSchemaHelper.h" }
diff --git a/test/Modules/Inputs/PR27186/Rtypes.h b/test/Modules/Inputs/PR27186/Rtypes.h
new file mode 100644
index 0000000..ecbe10d
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/Rtypes.h
@@ -0,0 +1,2 @@
+#include <stddef.h>
+typedef struct timespec timespec_t;
diff --git a/test/Modules/Inputs/PR27186/module.modulemap b/test/Modules/Inputs/PR27186/module.modulemap
new file mode 100644
index 0000000..58ce19d
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/module.modulemap
@@ -0,0 +1,5 @@
+module "Rtypes.h" { header "Rtypes.h" }
+module a [extern_c] {
+  header "stddef.h"
+  header "time.h"
+}
diff --git a/test/Modules/Inputs/PR27186/stddef.h b/test/Modules/Inputs/PR27186/stddef.h
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/stddef.h
@@ -0,0 +1 @@
+
diff --git a/test/Modules/Inputs/PR27186/time.h b/test/Modules/Inputs/PR27186/time.h
new file mode 100644
index 0000000..9ac2ace
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/time.h
@@ -0,0 +1 @@
+struct timespec;
diff --git a/test/Modules/Inputs/PR27401/a.h b/test/Modules/Inputs/PR27401/a.h
new file mode 100644
index 0000000..63d6b70
--- /dev/null
+++ b/test/Modules/Inputs/PR27401/a.h
@@ -0,0 +1,17 @@
+#ifndef _LIBCPP_ALGORITHM
+#define _LIBCPP_ALGORITHM
+template <class _Tp, _Tp>
+struct integral_constant {
+  static const _Tp value = _Tp();
+};
+
+template <class _Tp>
+struct is_nothrow_default_constructible
+	: integral_constant<bool, __is_constructible(_Tp)> {};
+
+template <class _Tp>
+struct is_nothrow_move_constructible
+    : integral_constant<bool, __is_constructible(_Tp, _Tp)> {};
+
+class allocator {};
+#endif
diff --git a/test/Modules/Inputs/PR27401/b.h b/test/Modules/Inputs/PR27401/b.h
new file mode 100644
index 0000000..2b4e7f1
--- /dev/null
+++ b/test/Modules/Inputs/PR27401/b.h
@@ -0,0 +1,21 @@
+#include "a.h"
+#ifndef _LIBCPP_VECTOR
+template <class, class _Allocator>
+class __vector_base {
+protected:
+  _Allocator __alloc() const;
+  __vector_base(_Allocator);
+};
+
+template <class _Tp, class _Allocator = allocator>
+class vector : __vector_base<_Tp, _Allocator> {
+public:
+  vector() noexcept(is_nothrow_default_constructible<_Allocator>::value);
+  vector(const vector &);
+  vector(vector &&)
+      noexcept(is_nothrow_move_constructible<_Allocator>::value);
+};
+
+#endif
+void GetUniquePtrType() { vector<char> v; }
+
diff --git a/test/Modules/Inputs/PR27401/module.modulemap b/test/Modules/Inputs/PR27401/module.modulemap
new file mode 100644
index 0000000..a0efada
--- /dev/null
+++ b/test/Modules/Inputs/PR27401/module.modulemap
@@ -0,0 +1 @@
+module "b" { header "b.h" export * }
diff --git a/test/Modules/Inputs/PR27513/a.h b/test/Modules/Inputs/PR27513/a.h
new file mode 100644
index 0000000..7eecbf4
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/a.h
@@ -0,0 +1,5 @@
+#include "b.h"
+
+inline void f() { basic_string<char> s; }
+
+#include "c.h"
diff --git a/test/Modules/Inputs/PR27513/b.h b/test/Modules/Inputs/PR27513/b.h
new file mode 100644
index 0000000..b514c1e
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b.h
@@ -0,0 +1,3 @@
+#include "mystring.h"
+#include "b1.h"
+#include "b2.h"
diff --git a/test/Modules/Inputs/PR27513/b1.h b/test/Modules/Inputs/PR27513/b1.h
new file mode 100644
index 0000000..a12b29f
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b1.h
@@ -0,0 +1 @@
+#include "b11.h"
diff --git a/test/Modules/Inputs/PR27513/b11.h b/test/Modules/Inputs/PR27513/b11.h
new file mode 100644
index 0000000..e7bfaec
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b11.h
@@ -0,0 +1,2 @@
+#include "mystring.h"
+#include "b111.h"
diff --git a/test/Modules/Inputs/PR27513/b111.h b/test/Modules/Inputs/PR27513/b111.h
new file mode 100644
index 0000000..b7a63b5
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b111.h
@@ -0,0 +1,3 @@
+#include "mystring.h"
+#include "b1111.h"
+#include "b1112.h"
diff --git a/test/Modules/Inputs/PR27513/b1111.h b/test/Modules/Inputs/PR27513/b1111.h
new file mode 100644
index 0000000..3f9cf44
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b1111.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/b1112.h b/test/Modules/Inputs/PR27513/b1112.h
new file mode 100644
index 0000000..3f9cf44
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b1112.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/b2.h b/test/Modules/Inputs/PR27513/b2.h
new file mode 100644
index 0000000..3f9cf44
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b2.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/c.h b/test/Modules/Inputs/PR27513/c.h
new file mode 100644
index 0000000..3f9cf44
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/c.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/module.modulemap b/test/Modules/Inputs/PR27513/module.modulemap
new file mode 100644
index 0000000..ee2a9ce
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/module.modulemap
@@ -0,0 +1,7 @@
+module "c.h" {header "c.h" export *}
+module "b2.h" { header "b2.h" export *}
+module "b.h" {header "b.h" export *}
+module "b111.h" { header "b111.h" export *}
+module "b11.h" { header "b11.h" export *}
+module "b1111.h" { header "b1111.h" export *}
+module "b1112.h" { header "b1112.h" export *}
diff --git a/test/Modules/Inputs/PR27513/mystring.h b/test/Modules/Inputs/PR27513/mystring.h
new file mode 100644
index 0000000..95680ed
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/mystring.h
@@ -0,0 +1,8 @@
+#ifndef _GLIBCXX_STRING
+#define _GLIBCXX_STRING
+template<typename> struct basic_string {
+  struct _Alloc_hider {} _M_dataplus;
+  ~basic_string() { _Alloc_hider h; } 
+};
+extern template class basic_string<char>;
+#endif
diff --git a/test/Modules/Inputs/PR27699/Subdir/a.h b/test/Modules/Inputs/PR27699/Subdir/a.h
new file mode 100644
index 0000000..6c36a1a
--- /dev/null
+++ b/test/Modules/Inputs/PR27699/Subdir/a.h
@@ -0,0 +1 @@
+#include "streambuf"
diff --git a/test/Modules/Inputs/PR27699/Subdir/b.h b/test/Modules/Inputs/PR27699/Subdir/b.h
new file mode 100644
index 0000000..6c36a1a
--- /dev/null
+++ b/test/Modules/Inputs/PR27699/Subdir/b.h
@@ -0,0 +1 @@
+#include "streambuf"
diff --git a/test/Modules/Inputs/PR27699/module.modulemap b/test/Modules/Inputs/PR27699/module.modulemap
new file mode 100644
index 0000000..1f58ca0
--- /dev/null
+++ b/test/Modules/Inputs/PR27699/module.modulemap
@@ -0,0 +1 @@
+module a {   umbrella "Subdir" module * {export *} }
diff --git a/test/Modules/Inputs/PR27699/streambuf b/test/Modules/Inputs/PR27699/streambuf
new file mode 100644
index 0000000..30ea73d
--- /dev/null
+++ b/test/Modules/Inputs/PR27699/streambuf
@@ -0,0 +1,7 @@
+ #ifndef STREAMBUF
+ #define STREAMBUF
+ template <typename> struct basic_streambuf {
+  basic_streambuf(const basic_streambuf &);
+  };
+template <typename T> basic_streambuf<T>::basic_streambuf(const basic_streambuf &) = default;
+#endif
diff --git a/test/Modules/Inputs/PR27739/DataInputHandler.h b/test/Modules/Inputs/PR27739/DataInputHandler.h
new file mode 100644
index 0000000..1ef02ec
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/DataInputHandler.h
@@ -0,0 +1,19 @@
+template < typename > struct vector {};
+
+#include <map>
+#include "Types.h"
+
+struct TString {
+   TString (char *);
+};
+
+struct TreeInfo {};
+
+class DataInputHandler {
+   void AddTree ();
+   void SignalTreeInfo () {
+      fInputTrees[(char*)""];
+   }
+   map <TString, vector <TreeInfo> >fInputTrees;
+   map <string, bool> fExplicitTrainTest;
+};
diff --git a/test/Modules/Inputs/PR27739/Types.h b/test/Modules/Inputs/PR27739/Types.h
new file mode 100644
index 0000000..6d458a8
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/Types.h
@@ -0,0 +1 @@
+#include <map>
diff --git a/test/Modules/Inputs/PR27739/map b/test/Modules/Inputs/PR27739/map
new file mode 100644
index 0000000..612685c
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/map
@@ -0,0 +1,20 @@
+#ifndef _GLIBCXX_MAP
+#define _GLIBCXX_MAP
+struct basic_string {
+  basic_string(char *);
+} typedef string;
+
+template <typename> class D;
+template <typename _Elements> struct D {
+  _Elements _M_;
+  D(D &) = default;
+};
+
+template <typename _Elements> D<_Elements &&> forward_as_tuple(_Elements);
+
+template <typename _Key, typename _Tp> struct map {
+  _Tp operator[](_Key p1) {
+    auto b = &forward_as_tuple(p1);
+  }
+};
+#endif
diff --git a/test/Modules/Inputs/PR27739/module.modulemap b/test/Modules/Inputs/PR27739/module.modulemap
new file mode 100644
index 0000000..d611e80
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/module.modulemap
@@ -0,0 +1,2 @@
+module "DataInputHandler.h" { header "DataInputHandler.h" export * }
+module "Types.h" { header "Types.h" export *}
diff --git a/test/Modules/Inputs/PR27754/RConversionRuleParser.h b/test/Modules/Inputs/PR27754/RConversionRuleParser.h
new file mode 100644
index 0000000..057dd14
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/RConversionRuleParser.h
@@ -0,0 +1,4 @@
+#include "algobase.h"
+typedef integral_constant<bool, true> true_type;
+class _Rb_tree { _Rb_tree() { true_type(); } };
+#include "TSchemaType.h"
diff --git a/test/Modules/Inputs/PR27754/TMetaUtils.h b/test/Modules/Inputs/PR27754/TMetaUtils.h
new file mode 100644
index 0000000..835b7c6
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/TMetaUtils.h
@@ -0,0 +1,2 @@
+#include "RConversionRuleParser.h"
+void fn1() { true_type(); }
diff --git a/test/Modules/Inputs/PR27754/TSchemaType.h b/test/Modules/Inputs/PR27754/TSchemaType.h
new file mode 100644
index 0000000..2c47793
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/TSchemaType.h
@@ -0,0 +1,2 @@
+#include "algobase.h"
+struct A : integral_constant<bool, true> {};
diff --git a/test/Modules/Inputs/PR27754/algobase.h b/test/Modules/Inputs/PR27754/algobase.h
new file mode 100644
index 0000000..f5e47d8
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/algobase.h
@@ -0,0 +1,4 @@
+#ifndef _STL_ALGOBASE_H
+#define _STL_ALGOBASE_H
+template<typename _Tp, _Tp> struct integral_constant {};
+#endif
diff --git a/test/Modules/Inputs/PR27754/module.modulemap b/test/Modules/Inputs/PR27754/module.modulemap
new file mode 100644
index 0000000..90dcdbb
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/module.modulemap
@@ -0,0 +1,3 @@
+module "RConversionRuleParser.h" { header "RConversionRuleParser.h" }
+module "TMetaUtils.h" { header "TMetaUtils.h" }
+module "TSchemaType.h" { header "TSchemaType.h" }
diff --git a/test/Modules/Inputs/PR27890/a.h b/test/Modules/Inputs/PR27890/a.h
new file mode 100644
index 0000000..9c6e562
--- /dev/null
+++ b/test/Modules/Inputs/PR27890/a.h
@@ -0,0 +1,9 @@
+template <class DataType> DataType values(DataType) { __builtin_va_list ValueArgs; return DataType(); }
+
+template <class DataType>
+class opt {
+public:
+  template <class Mods>
+  opt(Mods) {}
+};
+
diff --git a/test/Modules/Inputs/PR27890/module.modulemap b/test/Modules/Inputs/PR27890/module.modulemap
new file mode 100644
index 0000000..85074e8
--- /dev/null
+++ b/test/Modules/Inputs/PR27890/module.modulemap
@@ -0,0 +1 @@
+module A { header "a.h" export * }
diff --git a/test/Modules/Inputs/PR28332/TextualInclude.h b/test/Modules/Inputs/PR28332/TextualInclude.h
new file mode 100644
index 0000000..e4d2580
--- /dev/null
+++ b/test/Modules/Inputs/PR28332/TextualInclude.h
@@ -0,0 +1,7 @@
+#ifndef LLVM_ADT_SMALLVECTORIMPL_H
+#define LLVM_ADT_SMALLVECTORIMPL_H
+class SmallVectorImpl {
+public:
+  ~SmallVectorImpl();
+};
+#endif
\ No newline at end of file
diff --git a/test/Modules/Inputs/PR28332/a.h b/test/Modules/Inputs/PR28332/a.h
new file mode 100644
index 0000000..1dc96c8
--- /dev/null
+++ b/test/Modules/Inputs/PR28332/a.h
@@ -0,0 +1,8 @@
+#include "b.h"
+
+class A {
+  SmallVector<char, 8> LegalIntWidths;
+  A() {}
+};
+
+#include "c.h"
diff --git a/test/Modules/Inputs/PR28332/b.h b/test/Modules/Inputs/PR28332/b.h
new file mode 100644
index 0000000..e1e07e8
--- /dev/null
+++ b/test/Modules/Inputs/PR28332/b.h
@@ -0,0 +1,3 @@
+#include "TextualInclude.h"
+template <typename, int> class SmallVector : SmallVectorImpl {};
+
diff --git a/test/Modules/Inputs/PR28332/c.h b/test/Modules/Inputs/PR28332/c.h
new file mode 100644
index 0000000..e18bdac
--- /dev/null
+++ b/test/Modules/Inputs/PR28332/c.h
@@ -0,0 +1,2 @@
+#include "TextualInclude.h"
+
diff --git a/test/Modules/Inputs/PR28332/module.modulemap b/test/Modules/Inputs/PR28332/module.modulemap
new file mode 100644
index 0000000..8c3f4ec
--- /dev/null
+++ b/test/Modules/Inputs/PR28332/module.modulemap
@@ -0,0 +1,3 @@
+module "c.h" { header "c.h" export * }
+module "b.h" { header "b.h" export * }
+module "a.h" { header "a.h" }
diff --git a/test/Modules/Inputs/cxx-decls-imported.h b/test/Modules/Inputs/cxx-decls-imported.h
index a4910fe..0a17215 100644
--- a/test/Modules/Inputs/cxx-decls-imported.h
+++ b/test/Modules/Inputs/cxx-decls-imported.h
@@ -50,3 +50,8 @@
 
 struct InhCtorA { InhCtorA(int); };
 struct InhCtorB : InhCtorA { using InhCtorA::InhCtorA; };
+
+struct ClassWithVBases : HasFriends, virtual HasNontrivialDefaultConstructor {
+  int n;
+};
+struct ClassWithVBases;
diff --git a/test/Modules/Inputs/cxx-templates-common.h b/test/Modules/Inputs/cxx-templates-common.h
index a9ca624..8e730c8 100644
--- a/test/Modules/Inputs/cxx-templates-common.h
+++ b/test/Modules/Inputs/cxx-templates-common.h
@@ -53,4 +53,21 @@
   typedef int X;
 };
 
+namespace hidden_specializations {
+  template<typename T> void fn() {}
+
+  template<typename T> struct cls {
+    static void nested_fn() {}
+    struct nested_cls {};
+    static int nested_var;
+    enum class nested_enum {};
+
+    template<typename U> static void nested_fn_t() {}
+    template<typename U> struct nested_cls_t {};
+    template<typename U> static int nested_var_t;
+  };
+
+  template<typename T> int var;
+}
+
 #include "cxx-templates-textual.h"
diff --git a/test/Modules/Inputs/cxx-templates-unimported.h b/test/Modules/Inputs/cxx-templates-unimported.h
new file mode 100644
index 0000000..c2b6b91
--- /dev/null
+++ b/test/Modules/Inputs/cxx-templates-unimported.h
@@ -0,0 +1,43 @@
+#include "cxx-templates-common.h"
+
+namespace hidden_specializations {
+  // explicit specializations
+  template<> void fn<int>() {}
+  template<> struct cls<int> {
+    void nested_fn();
+    struct nested_cls;
+    static int nested_var;
+    enum nested_enum : int;
+  };
+  template<> int var<int>;
+
+  // partial specializations
+  template<typename T> struct cls<T*> {
+    void nested_fn();
+    struct nested_cls;
+    static int nested_var;
+    enum nested_enum : int;
+  };
+  template<typename T> int var<T*>;
+
+  // member specializations
+  template<> void cls<void>::nested_fn() {}
+  template<> struct cls<void>::nested_cls {};
+  template<> int cls<void>::nested_var;
+  template<> enum class cls<void>::nested_enum { e };
+  template<> template<typename U> void cls<void>::nested_fn_t() {}
+  template<> template<typename U> struct cls<void>::nested_cls_t {};
+  template<> template<typename U> int cls<void>::nested_var_t;
+
+  // specializations instantiated here are ok if their pattern is visible
+  inline void use_stuff() {
+    fn<char>();
+    cls<char>();
+    (void)var<char>;
+    cls<char*>();
+    (void)var<char*>;
+    cls<void>::nested_fn_t<char>();
+    cls<void>::nested_cls_t<char>();
+    (void)cls<void>::nested_var_t<char>;
+  }
+}
diff --git a/test/Modules/Inputs/explicit-build/a.h b/test/Modules/Inputs/explicit-build/a.h
index 5e3602f..a52f735 100644
--- a/test/Modules/Inputs/explicit-build/a.h
+++ b/test/Modules/Inputs/explicit-build/a.h
@@ -1,4 +1,4 @@
-#if !__building_module(a)
+#if !__building_module(a) && !BUILDING_A_PCH
 #error "should only get here when building module a"
 #endif
 
diff --git a/test/Modules/Inputs/getSourceDescriptor-crash/h1.h b/test/Modules/Inputs/getSourceDescriptor-crash/h1.h
new file mode 100644
index 0000000..6f70f09
--- /dev/null
+++ b/test/Modules/Inputs/getSourceDescriptor-crash/h1.h
@@ -0,0 +1 @@
+#pragma once
diff --git a/test/Modules/Inputs/getSourceDescriptor-crash/module.modulemap b/test/Modules/Inputs/getSourceDescriptor-crash/module.modulemap
new file mode 100644
index 0000000..2006ed5
--- /dev/null
+++ b/test/Modules/Inputs/getSourceDescriptor-crash/module.modulemap
@@ -0,0 +1,3 @@
+module foo {
+       header "h1.h"
+}
diff --git a/test/Modules/Inputs/merge-decl-context/a.h b/test/Modules/Inputs/merge-decl-context/a.h
index 89cc712..7be90b1 100644
--- a/test/Modules/Inputs/merge-decl-context/a.h
+++ b/test/Modules/Inputs/merge-decl-context/a.h
@@ -21,4 +21,8 @@
   return fff<A<int>>(&i);
 }
 
+struct Aggregate {
+  int member;
+};
+
 #endif
diff --git a/test/Modules/Inputs/module.map b/test/Modules/Inputs/module.map
index bf50867..2beb942 100644
--- a/test/Modules/Inputs/module.map
+++ b/test/Modules/Inputs/module.map
@@ -215,6 +215,8 @@
 
 module cxx_templates_common {
   header "cxx-templates-common.h"
+
+  explicit module unimported { header "cxx-templates-unimported.h" }
 }
 
 module cxx_templates_a {
@@ -420,3 +422,13 @@
 module DiagOutOfDate {
   header "DiagOutOfDate.h"
 }
+
+module DebugNestedA {
+  header "DebugNestedA.h"
+  export *
+}
+
+module DebugNestedB {
+  header "DebugNestedB.h"
+  export *
+}
diff --git a/test/Modules/Inputs/non-module.h b/test/Modules/Inputs/non-module.h
new file mode 100644
index 0000000..c295900
--- /dev/null
+++ b/test/Modules/Inputs/non-module.h
@@ -0,0 +1,4 @@
+#ifndef NON_MODULE_H
+#define NON_MODULE_H
+
+#endif
diff --git a/test/Modules/Inputs/prebuilt-module/a.h b/test/Modules/Inputs/prebuilt-module/a.h
deleted file mode 100644
index f86587a..0000000
--- a/test/Modules/Inputs/prebuilt-module/a.h
+++ /dev/null
@@ -1 +0,0 @@
-const int a = 1;
diff --git a/test/Modules/Inputs/prebuilt-module/module.modulemap b/test/Modules/Inputs/prebuilt-module/module.modulemap
deleted file mode 100644
index 54459bd..0000000
--- a/test/Modules/Inputs/prebuilt-module/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module prebuilt { header "a.h" }
diff --git a/test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o b/test/Modules/Inputs/suggest-include/empty.h
similarity index 100%
copy from test/Driver/Inputs/basic_android_tree/lib/gcc/mipsel-linux-android/4.4.3/mips-r6/crtbegin.o
copy to test/Modules/Inputs/suggest-include/empty.h
diff --git a/test/Modules/Inputs/suggest-include/module.modulemap b/test/Modules/Inputs/suggest-include/module.modulemap
new file mode 100644
index 0000000..46afd7b
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/module.modulemap
@@ -0,0 +1,22 @@
+module X {
+  module Empty { header "empty.h" }
+
+  exclude header "textual1.h"
+  textual header "textual2.h"
+  textual header "textual3.h"
+
+  module A { header "usetextual1.h" }
+  module B { header "usetextual2.h" }
+  module C { header "usetextual3.h" }
+  module D { header "usetextual4.h" }
+  module E { header "usetextual5.h" }
+
+  module P { private header "private1.h" }
+  module Q { private header "private2.h" }
+  module R { private header "private3.h" }
+  module S { header "useprivate1.h" export * }
+  module T { header "useprivate3.h" }
+}
+
+module Other { textual header "textual4.h" }
+
diff --git a/test/Modules/Inputs/suggest-include/private1.h b/test/Modules/Inputs/suggest-include/private1.h
new file mode 100644
index 0000000..afc7ac7
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/private1.h
@@ -0,0 +1 @@
+extern int private1;
diff --git a/test/Modules/Inputs/suggest-include/private2.h b/test/Modules/Inputs/suggest-include/private2.h
new file mode 100644
index 0000000..24a1893
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/private2.h
@@ -0,0 +1 @@
+extern int private2;
diff --git a/test/Modules/Inputs/suggest-include/private3.h b/test/Modules/Inputs/suggest-include/private3.h
new file mode 100644
index 0000000..26852af
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/private3.h
@@ -0,0 +1 @@
+extern int private3;
diff --git a/test/Modules/Inputs/suggest-include/textual1.h b/test/Modules/Inputs/suggest-include/textual1.h
new file mode 100644
index 0000000..5b18bfb
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual1.h
@@ -0,0 +1 @@
+#define FOO(X) X
diff --git a/test/Modules/Inputs/suggest-include/textual2.h b/test/Modules/Inputs/suggest-include/textual2.h
new file mode 100644
index 0000000..0c06d4e
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual2.h
@@ -0,0 +1 @@
+EXPAND_MACRO
diff --git a/test/Modules/Inputs/suggest-include/textual3.h b/test/Modules/Inputs/suggest-include/textual3.h
new file mode 100644
index 0000000..1e52521
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual3.h
@@ -0,0 +1 @@
+extern int textual3;
diff --git a/test/Modules/Inputs/suggest-include/textual4.h b/test/Modules/Inputs/suggest-include/textual4.h
new file mode 100644
index 0000000..091e0c0
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual4.h
@@ -0,0 +1 @@
+extern int textual4;
diff --git a/test/Modules/Inputs/suggest-include/textual5.h b/test/Modules/Inputs/suggest-include/textual5.h
new file mode 100644
index 0000000..d808617
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual5.h
@@ -0,0 +1 @@
+extern int textual5;
diff --git a/test/Modules/Inputs/suggest-include/useprivate1.h b/test/Modules/Inputs/suggest-include/useprivate1.h
new file mode 100644
index 0000000..817b900
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/useprivate1.h
@@ -0,0 +1 @@
+#include "private1.h"
diff --git a/test/Modules/Inputs/suggest-include/useprivate3.h b/test/Modules/Inputs/suggest-include/useprivate3.h
new file mode 100644
index 0000000..5d5d221
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/useprivate3.h
@@ -0,0 +1 @@
+#include "private3.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual1.h b/test/Modules/Inputs/suggest-include/usetextual1.h
new file mode 100644
index 0000000..34ab1c7
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual1.h
@@ -0,0 +1,2 @@
+#include "textual1.h"
+FOO(extern int usetextual1;)
diff --git a/test/Modules/Inputs/suggest-include/usetextual2.h b/test/Modules/Inputs/suggest-include/usetextual2.h
new file mode 100644
index 0000000..95b2445
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual2.h
@@ -0,0 +1,2 @@
+#define EXPAND_MACRO extern int usetextual2;
+#include "textual2.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual3.h b/test/Modules/Inputs/suggest-include/usetextual3.h
new file mode 100644
index 0000000..15a75cc
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual3.h
@@ -0,0 +1 @@
+#include "textual3.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual4.h b/test/Modules/Inputs/suggest-include/usetextual4.h
new file mode 100644
index 0000000..395bb6f
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual4.h
@@ -0,0 +1 @@
+#include "textual4.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual5.h b/test/Modules/Inputs/suggest-include/usetextual5.h
new file mode 100644
index 0000000..a7335d3
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual5.h
@@ -0,0 +1 @@
+#include "textual5.h"
diff --git a/test/Modules/Inputs/unused-global-init/init.h b/test/Modules/Inputs/unused-global-init/init.h
new file mode 100644
index 0000000..29a932a
--- /dev/null
+++ b/test/Modules/Inputs/unused-global-init/init.h
@@ -0,0 +1 @@
+struct Init { Init(); ~Init(); } init;
diff --git a/test/Modules/Inputs/unused-global-init/module.modulemap b/test/Modules/Inputs/unused-global-init/module.modulemap
new file mode 100644
index 0000000..c40f0ef
--- /dev/null
+++ b/test/Modules/Inputs/unused-global-init/module.modulemap
@@ -0,0 +1,3 @@
+module used { header "used.h" }
+module unused { header "unused.h" }
+module init { module a { header "init.h" } module b { header "other.h" } }
diff --git a/test/Modules/Inputs/unused-global-init/other.h b/test/Modules/Inputs/unused-global-init/other.h
new file mode 100644
index 0000000..c6be1ad
--- /dev/null
+++ b/test/Modules/Inputs/unused-global-init/other.h
@@ -0,0 +1 @@
+// other.h
diff --git a/test/Modules/Inputs/unused-global-init/unused.h b/test/Modules/Inputs/unused-global-init/unused.h
new file mode 100644
index 0000000..06c2a44
--- /dev/null
+++ b/test/Modules/Inputs/unused-global-init/unused.h
@@ -0,0 +1 @@
+// unused.h
diff --git a/test/Modules/Inputs/unused-global-init/used.h b/test/Modules/Inputs/unused-global-init/used.h
new file mode 100644
index 0000000..689a13f
--- /dev/null
+++ b/test/Modules/Inputs/unused-global-init/used.h
@@ -0,0 +1,2 @@
+// used.h
+#include "init.h"
diff --git a/test/Modules/ModuleDebugInfo.cpp b/test/Modules/ModuleDebugInfo.cpp
index e982986..1bc0613 100644
--- a/test/Modules/ModuleDebugInfo.cpp
+++ b/test/Modules/ModuleDebugInfo.cpp
@@ -20,25 +20,28 @@
 
 // CHECK: distinct !DICompileUnit(language: DW_LANG_{{.*}}C_plus_plus,
 // CHECK-SAME:                    isOptimized: false,
-// CHECK-SAME-NOT:                splitDebugFilename:
-// CHECK:                         dwoId:
+// CHECK-NOT:                     splitDebugFilename:
+// CHECK-SAME:                    dwoId:
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "Enum"
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX4EnumE")
 // CHECK: !DINamespace(name: "DebugCXX"
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
 // CHECK-SAME:             identifier: "_ZTS11TypedefEnum")
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 // CHECK: !DIEnumerator(name: "e5", value: 5)
 
 // CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "B",
@@ -89,18 +92,20 @@
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$FwdVirtual"
 
 // CHECK: !DICompositeType(tag: DW_TAG_union_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
 // CHECK-SAME:             identifier: "_ZTS12TypedefUnion")
 
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
 // CHECK-SAME:             identifier: "_ZTS13TypedefStruct")
 
 // CHECK: !DICompositeType(tag: DW_TAG_union_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type,
 // CHECK-SAME:             name: "InAnonymousNamespace",
@@ -120,7 +125,7 @@
 // CHECK-SAME:                         flags: DIFlagFwdDecl,
 // CHECK-SAME:                         identifier: "_ZTS9Template1IPvE")
 
-// Explicit instatiation.
+// Explicit instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "Template1<int>",
 // CHECK-SAME:             templateParams:
 // CHECK-SAME:             identifier: "_ZTS9Template1IiE")
diff --git a/test/Modules/ModuleDebugInfo.m b/test/Modules/ModuleDebugInfo.m
index f41c4fc..ce35c7c 100644
--- a/test/Modules/ModuleDebugInfo.m
+++ b/test/Modules/ModuleDebugInfo.m
@@ -31,7 +31,7 @@
 // CHECK: ![[MODULE]] = !DIModule(scope: null, name: "DebugObjC
 
 // CHECK: ![[TD_ENUM:.*]] = !DICompositeType(tag: DW_TAG_enumeration_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
 // CHECK-SAME:             elements:
 
 // CHECK: !DISubprogram(name: "+[ObjCClass classMethod]",
@@ -55,7 +55,7 @@
 // CHECK-SAME:             elements:
 
 // CHECK: ![[TD_UNION:.*]] = distinct !DICompositeType(tag: DW_TAG_union_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
 // CHECK-SAME:             elements:
 
 // CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefUnion",
@@ -65,16 +65,18 @@
 // CHECK-SAME:           baseType: ![[TD_ENUM:.*]])
 
 // CHECK: ![[TD_STRUCT:.*]] = distinct !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
 // CHECK-SAME:             elements:
 // CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefStruct",
 // CHECK-SAME:           baseType: ![[TD_STRUCT]])
 
 // CHECK: !DICompositeType(tag: DW_TAG_union_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK-SAME-NOT:         name:
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
 
 // CHECK-NEG-NOT: !DICompositeType(tag: DW_TAG_structure_type, name: "PureForwardDecl"
 
diff --git a/test/Modules/ModuleModuleDebugInfo.cpp b/test/Modules/ModuleModuleDebugInfo.cpp
new file mode 100644
index 0000000..abc4bfd
--- /dev/null
+++ b/test/Modules/ModuleModuleDebugInfo.cpp
@@ -0,0 +1,18 @@
+// RUN: rm -rf %t
+
+// RUN: %clang_cc1 -x objective-c++ -std=c++11 -debug-info-kind=standalone \
+// RUN:     -dwarf-ext-refs -fmodules                                   \
+// RUN:     -fmodule-format=obj -fimplicit-module-maps -DMODULES \
+// RUN:     -triple %itanium_abi_triple \
+// RUN:     -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o - \
+// RUN:   | FileCheck %s
+
+#include "DebugNestedB.h"
+AF af; // This type is not anchored in the module.
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "AF",
+// CHECK-SAME:           baseType: ![[AF:.*]])
+
+// CHECK: ![[AF]] = {{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "A<F>",
+// CHECK-SAME:                             elements:
+
diff --git a/test/Modules/crash-vfs-run-reproducer.m b/test/Modules/crash-vfs-run-reproducer.m
index d0eaa93..e9ecb47 100644
--- a/test/Modules/crash-vfs-run-reproducer.m
+++ b/test/Modules/crash-vfs-run-reproducer.m
@@ -36,6 +36,7 @@
 // CHECKYAML: 'case-sensitive':
 // CHECKYAML-NEXT: 'use-external-names': 'false',
 // CHECKYAML-NEXT: 'overlay-relative': 'true',
+// CHECKYAML-NEXT: 'ignore-non-existent-contents': 'false'
 // CHECKYAML: 'type': 'directory'
 // CHECKYAML: 'name': "/[[PATH:.*]]/Inputs/crash-recovery/usr/include",
 // CHECKYAML-NEXT: 'contents': [
diff --git a/test/Modules/cxx-templates.cpp b/test/Modules/cxx-templates.cpp
index ef4e4e4..12dfdd0 100644
--- a/test/Modules/cxx-templates.cpp
+++ b/test/Modules/cxx-templates.cpp
@@ -1,9 +1,9 @@
 // RUN: rm -rf %t
-// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++11 -ast-dump-lookups | FileCheck %s --check-prefix=CHECK-GLOBAL
-// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++11 -ast-dump-lookups -ast-dump-filter N | FileCheck %s --check-prefix=CHECK-NAMESPACE-N
-// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++11 -ast-dump -ast-dump-filter SomeTemplate | FileCheck %s --check-prefix=CHECK-DUMP
-// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++11
-// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++11 -DEARLY_IMPORT
+// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++14 -ast-dump-lookups 2>/dev/null | FileCheck %s --check-prefix=CHECK-GLOBAL
+// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++14 -ast-dump-lookups -ast-dump-filter N 2>/dev/null | FileCheck %s --check-prefix=CHECK-NAMESPACE-N
+// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++14 -ast-dump -ast-dump-filter SomeTemplate 2>/dev/null | FileCheck %s --check-prefix=CHECK-DUMP
+// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++14
+// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++14 -DEARLY_IMPORT
 
 #ifdef EARLY_IMPORT
 #include "cxx-templates-textual.h"
@@ -105,7 +105,8 @@
 
   TemplateInstantiationVisibility<char[1]> tiv1;
   TemplateInstantiationVisibility<char[2]> tiv2;
-  TemplateInstantiationVisibility<char[3]> tiv3; // expected-error 2{{must be imported from module 'cxx_templates_b_impl'}}
+  TemplateInstantiationVisibility<char[3]> tiv3; // expected-error 5{{must be imported from module 'cxx_templates_b_impl'}}
+  // expected-note@cxx-templates-b-impl.h:10 3{{explicit specialization declared here}}
   // expected-note@cxx-templates-b-impl.h:10 2{{previous definition is here}}
   TemplateInstantiationVisibility<char[4]> tiv4;
 
@@ -172,6 +173,63 @@
   return wfi != wfi;
 }
 
+namespace hidden_specializations {
+  // expected-note@cxx-templates-unimported.h:* 1+{{here}}
+  void test() {
+    // For functions, uses that would trigger instantiations of definitions are
+    // not allowed.
+    fn<void>(); // ok
+    fn<char>(); // ok
+    fn<int>(); // expected-error 1+{{explicit specialization of 'fn<int>' must be imported}}
+    cls<void>::nested_fn(); // expected-error 1+{{explicit specialization of 'nested_fn' must be imported}}
+    cls<void>::nested_fn_t<int>(); // expected-error 1+{{explicit specialization of 'nested_fn_t' must be imported}}
+    cls<void>::nested_fn_t<char>(); // expected-error 1+{{explicit specialization of 'nested_fn_t' must be imported}}
+
+    // For classes, uses that would trigger instantiations of definitions are
+    // not allowed.
+    cls<void> *k0; // ok
+    cls<char> *k1; // ok
+    cls<int> *k2; // ok
+    cls<int*> *k3; // ok
+    cls<void>::nested_cls *nk1; // ok
+    cls<void>::nested_cls_t<int> *nk2; // ok
+    cls<void>::nested_cls_t<char> *nk3; // ok
+    cls<int> uk1; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+    cls<int*> uk3; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}} expected-error 1+{{definition of}}
+    cls<char*> uk4; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}} expected-error 1+{{definition of}}
+    cls<void>::nested_cls unk1; // expected-error 1+{{explicit specialization of 'nested_cls' must be imported}} expected-error 1+{{definition of}}
+    cls<void>::nested_cls_t<int> unk2; // expected-error 1+{{explicit specialization of 'nested_cls_t' must be imported}} expected-error 1+{{definition of}}
+    cls<void>::nested_cls_t<char> unk3; // expected-error 1+{{explicit specialization of 'nested_cls_t' must be imported}}
+
+    // For enums, uses that would trigger instantiations of definitions are not
+    // allowed.
+    cls<void>::nested_enum e; // ok
+    (void)cls<void>::nested_enum::e; // expected-error 1+{{definition of 'nested_enum' must be imported}} expected-error 1+{{declaration of 'e'}}
+
+    // For variable template specializations, no uses are allowed because
+    // specializations can change the type.
+    (void)sizeof(var<void>); // ok
+    (void)sizeof(var<char>); // ok
+    (void)sizeof(var<int>); // expected-error 1+{{explicit specialization of 'var<int>' must be imported}}
+    (void)sizeof(var<int*>); // expected-error 1+{{partial specialization of 'var<type-parameter-0-0 *>' must be imported}}
+    (void)sizeof(var<char*>); // expected-error 1+{{partial specialization of 'var<type-parameter-0-0 *>' must be imported}}
+    (void)sizeof(cls<void>::nested_var); // ok
+    (void)cls<void>::nested_var; // expected-error 1+{{explicit specialization of 'nested_var' must be imported}}
+    (void)sizeof(cls<void>::nested_var_t<int>); // expected-error 1+{{explicit specialization of 'nested_var_t' must be imported}}
+    (void)sizeof(cls<void>::nested_var_t<char>); // expected-error 1+{{explicit specialization of 'nested_var_t' must be imported}}
+  }
+
+  void cls<int>::nested_fn() {} // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+  struct cls<int>::nested_cls {}; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+  int cls<int>::nested_var; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+  enum cls<int>::nested_enum : int {}; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+
+  template<typename T> void cls<T*>::nested_fn() {} // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+  template<typename T> struct cls<T*>::nested_cls {}; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+  template<typename T> int cls<T*>::nested_var; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+  template<typename T> enum cls<T*>::nested_enum : int {}; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+}
+
 namespace Std {
   void g(); // expected-error {{functions that differ only in their return type cannot be overloaded}}
   // expected-note@cxx-templates-common.h:21 {{previous}}
diff --git a/test/Modules/debug-info-moduleimport.m b/test/Modules/debug-info-moduleimport.m
index bb0ea31..bf60690 100644
--- a/test/Modules/debug-info-moduleimport.m
+++ b/test/Modules/debug-info-moduleimport.m
@@ -1,10 +1,16 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - | FileCheck %s --check-prefix=NOIMPORT
+
+// NOIMPORT-NOT: !DIImportedEntity
+// NOIMPORT-NOT: !DIModule
+
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s
 
 // CHECK: ![[CU:.*]] = distinct !DICompileUnit
 @import DebugObjC;
 // CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: ![[CU]],
-// CHECK-SAME:              entity: ![[MODULE:.*]], line: 5)
+// CHECK-SAME:              entity: ![[MODULE:.*]], line: [[@LINE-2]])
 // CHECK: ![[MODULE]] = !DIModule(scope: null, name: "DebugObjC",
 // CHECK-SAME:  configMacros: "\22-DGREETING=Hello World\22 \22-UNDEBUG\22",
 // CHECK-SAME:  includePath: "{{.*}}test{{.*}}Modules{{.*}}Inputs",
diff --git a/test/Modules/embed-files-compressed.cpp b/test/Modules/embed-files-compressed.cpp
new file mode 100644
index 0000000..cf33a66
--- /dev/null
+++ b/test/Modules/embed-files-compressed.cpp
@@ -0,0 +1,23 @@
+// REQUIRES: zlib
+// REQUIRES: shell
+//
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: echo '//////////////////////////////////////////////////////////////////////' > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: echo 'module a { header "a.h" }' > %t/modulemap
+//
+// RUN: %clang_cc1 -fmodules -I%t -fmodules-cache-path=%t -fmodule-name=a -emit-module %t/modulemap -fmodules-embed-all-files -o %t/a.pcm
+//
+// The above embeds ~4.5MB of highly-predictable /s and \ns into the pcm file.
+// Check that the resulting file is under 40KB (a 5-digit size, 10000-39999 bytes):
+//
+// RUN: wc -c %t/a.pcm | FileCheck --check-prefix=CHECK-SIZE %s
+// CHECK-SIZE: {{(^|[^0-9])[123][0-9][0-9][0-9][0-9]($|[^0-9])}}
diff --git a/test/Modules/embed-files.cpp b/test/Modules/embed-files.cpp
index a1db218..8e5a16e 100644
--- a/test/Modules/embed-files.cpp
+++ b/test/Modules/embed-files.cpp
@@ -1,11 +1,21 @@
 // RUN: rm -rf %t
 // RUN: mkdir %t
-// RUN: echo 'module a { header "a.h" } module b { header "b.h" }' > %t/modulemap
+// RUN: echo 'module a { header "a.h" header "x.h" } module b { header "b.h" }' > %t/modulemap
 // RUN: echo 'extern int t;' > %t/t.h
 // RUN: echo '#include "t.h"' > %t/a.h
 // RUN: echo '#include "t.h"' > %t/b.h
+// RUN: echo '#include "t.h"' > %t/x.h
 
 // RUN: %clang_cc1 -fmodules -I%t -fmodules-cache-path=%t -fmodule-map-file=%t/modulemap -fmodules-embed-all-files %s -verify
+//
+// RUN: %clang_cc1 -fmodules -I%t -fmodules-embed-all-files %t/modulemap -fmodule-name=a -x c++ -emit-module -o %t/a.pcm
+// RUN: %clang_cc1 -fmodules -I%t -fmodules-embed-all-files %t/modulemap -fmodule-name=b -x c++ -emit-module -o %t/b.pcm
+// FIXME: This test is flaky on Windows because attempting to delete a file
+// after writing it just doesn't seem to work well, at least not in the lit
+// shell.
+// REQUIRES: shell
+// RUN: rm %t/x.h
+// RUN: %clang_cc1 -fmodules -I%t -fmodule-map-file=%t/modulemap -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm %s -verify
 #include "a.h"
 char t; // expected-error {{different type}}
 // expected-note@t.h:1 {{here}}
@@ -13,3 +23,4 @@
 #include "b.h"
 char t; // expected-error {{different type}}
 // expected-note@t.h:1 {{here}}
+
diff --git a/test/Modules/explicit-build-flags.cpp b/test/Modules/explicit-build-flags.cpp
index 6ced215..6130043 100644
--- a/test/Modules/explicit-build-flags.cpp
+++ b/test/Modules/explicit-build-flags.cpp
@@ -7,8 +7,7 @@
 // Can use the module.
 // RUN: %clang_cc1 -fmodules -DFOO=1 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
 
-// Can use the module if an input file is newer. (This happens on
-// remote file systems.)
+// Can use the module if an input file is newer. (This happens on remote file systems.)
 // RUN: sleep 1
 // RUN: touch %t/tmp.h
 // RUN: %clang_cc1 -fmodules -DFOO=1 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
@@ -23,6 +22,22 @@
 // Can use the module if -I flags change.
 // RUN: %clang_cc1 -fmodules -DBAR=2 -I. -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
 
+// Can use the module if -fPIC/-fPIE flags change.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -pic-level 2 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+// RUN: %clang_cc1 -fmodules -DBAR=2 -pic-level 1 -pic-is-pie -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+
+// Can use the module if -static flag changes.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -static-define -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+
+// Can use the module if -fsanitize= flags change.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -fsanitize=address -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+//
+// RUN: %clang_cc1 -fmodules -DFOO=1 -fsanitize=address -x c++ -fmodule-name=tmp %t/map -emit-module -o %t/tmp-san.pcm
+// RUN: %clang_cc1 -fmodules -DBAR=2 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp-san.pcm -verify -I%t %s
+
+// -fno-assume-sane-operator-new is implied by the driver -fsanitize=address flag.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -fno-assume-sane-operator-new -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+
 // Can use the module if -O flags change.
 // RUN: %clang_cc1 -fmodules -DBAR=2 -Os -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
 //
diff --git a/test/Modules/explicit-build-missing-files.cpp b/test/Modules/explicit-build-missing-files.cpp
index 1ee65d9..e36b505 100644
--- a/test/Modules/explicit-build-missing-files.cpp
+++ b/test/Modules/explicit-build-missing-files.cpp
@@ -3,7 +3,7 @@
 // RUN: echo 'extern int a; template<typename T> int a2 = T::error;' > %t/a.h
 // RUN: echo 'extern int b;' > %t/b.h
 // RUN: echo 'extern int c = 0;' > %t/c.h
-// RUN: echo 'module a { header "a.h" header "b.h" header "c.h" }' > %t/modulemap
+// RUN: echo 'module a { module aa { header "a.h" header "b.h" header "c.h" } }' > %t/modulemap
 // RUN: echo 'module other {}' > %t/other.modulemap
 
 // We lazily check that the files referenced by an explicitly-specified .pcm
@@ -18,7 +18,7 @@
 // RUN:            -fmodules-embed-all-files
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s
 // RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -DERRORS 2>&1 | FileCheck %s
-// RUN: rm %t/modulemap
+// RUN: mv %t/modulemap %t/modulemap.moved
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s
 // RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -DERRORS 2>&1 | FileCheck %s
 // RUN: rm %t/other.modulemap
@@ -32,6 +32,9 @@
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/b.pcm %s
 // RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -DERRORS 2>&1 | FileCheck %s --check-prefix=MISSING-B
+// RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm -fmodule-map-file=%t/modulemap.moved %s
+// RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm -fmodule-map-file=%t/modulemap.moved -std=c++1z %s
+// RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm -fmodule-map-file=%t/modulemap.moved -std=c++1z -Wno-module-file-config-mismatch %s -Db=a
 // RUN: rm %t/a.h
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -verify
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/b.pcm %s -verify
diff --git a/test/Modules/explicit-build.cpp b/test/Modules/explicit-build.cpp
index 2a5b70d..a6f6a62 100644
--- a/test/Modules/explicit-build.cpp
+++ b/test/Modules/explicit-build.cpp
@@ -143,7 +143,7 @@
 // -------------------------------
 // Try to import a PCH with -fmodule-file=
 // RUN: %clang_cc1 -x c++ -std=c++11 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Rmodule-build -fno-modules-error-recovery \
-// RUN:            -fmodule-name=a -emit-pch %S/Inputs/explicit-build/a.h -o %t/a.pch \
+// RUN:            -fmodule-name=a -emit-pch %S/Inputs/explicit-build/a.h -o %t/a.pch -DBUILDING_A_PCH \
 // RUN:            2>&1 | FileCheck --check-prefix=CHECK-NO-IMPLICIT-BUILD %s --allow-empty
 //
 // RUN: not %clang_cc1 -x c++ -std=c++11 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Rmodule-build -fno-modules-error-recovery \
diff --git a/test/Modules/getSourceDescriptor-crash.cpp b/test/Modules/getSourceDescriptor-crash.cpp
new file mode 100644
index 0000000..84e527a
--- /dev/null
+++ b/test/Modules/getSourceDescriptor-crash.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -I %S/Inputs/getSourceDescriptor-crash -S -emit-llvm -debug-info-kind=limited -debugger-tuning=lldb -fimplicit-module-maps %s -o - | FileCheck %s
+
+#include "h1.h"
+#include "h1.h"
+
+// CHECK: DIImportedEntity
+// CHECK-SAME: entity: ![[ENTITY:[0-9]+]]
+// CHECK: ![[ENTITY]] = !DIModule
+// CHECK-SAME: name: "foo"
diff --git a/test/Modules/implementation-of-module.m b/test/Modules/implementation-of-module.m
index 37e2cfb..712f12c 100644
--- a/test/Modules/implementation-of-module.m
+++ b/test/Modules/implementation-of-module.m
@@ -1,7 +1,3 @@
-// RUN: not %clang_cc1 -fmodule-implementation-of Foo -fmodule-name=Bar %s 2>&1 \
-// RUN:     | FileCheck -check-prefix=CHECK-IMPL-OF-ERR %s
-// CHECK-IMPL-OF-ERR: conflicting module names specified: '-fmodule-name=Bar' and '-fmodule-implementation-of Foo'
-
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -w -Werror=auto-import %s -I %S/Inputs \
 // RUN:     -fmodule-implementation-of category_right -fsyntax-only
diff --git a/test/Modules/import-self.m b/test/Modules/import-self.m
index aa74371..e598015 100644
--- a/test/Modules/import-self.m
+++ b/test/Modules/import-self.m
@@ -6,6 +6,6 @@
 // RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \
 // RUN:                -I %S/Inputs/submodules -fmodule-name=import_self %s \
 // RUN:     2>&1 |  FileCheck -check-prefix=CHECK-fmodule-name %s
-// CHECK-fmodule-name: import of module 'import_self.b' appears within same top-level module 'import_self'
+// CHECK-fmodule-name: @import of module 'import_self.b' in implementation of 'import_self'
 
 @import import_self.b;
diff --git a/test/Modules/include-own-headers.m b/test/Modules/include-own-headers.m
new file mode 100644
index 0000000..a5a8531
--- /dev/null
+++ b/test/Modules/include-own-headers.m
@@ -0,0 +1,4 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodule-name=Module -fimplicit-module-maps -fmodules-cache-path=%t -Werror=non-modular-include-in-framework-module -F%S/Inputs -I%S -fsyntax-only %s
+#include "Module/Module.h"
+#include "Inputs/non-module.h"
diff --git a/test/Modules/merge-decl-context.cpp b/test/Modules/merge-decl-context.cpp
index 55219ed..5dbf3d1 100644
--- a/test/Modules/merge-decl-context.cpp
+++ b/test/Modules/merge-decl-context.cpp
@@ -18,7 +18,13 @@
 // RUN:     -fmodule-map-file=%S/Inputs/merge-decl-context/merge-decl-context.modulemap -I%S/Inputs \
 // RUN:     -emit-llvm -o %t/test.o %s
 
+// RUN: %clang_cc1 -x c++ -std=c++11 -fmodules -fmodules-cache-path=%t \
+// RUN:     -fmodule-map-file=%S/Inputs/merge-decl-context/merge-decl-context.modulemap -I%S/Inputs \
+// RUN:     -emit-llvm -o %t/test.o -DNO_TEXTUAL_INCLUSION %s
+
+#ifndef NO_TEXTUAL_INCLUSION
 #include "Inputs/merge-decl-context/a.h"
+#endif
 #include "Inputs/merge-decl-context/b.h"
 #include "Inputs/merge-decl-context/c.h"
 #include "Inputs/merge-decl-context/d.h"
@@ -26,3 +32,5 @@
 void t() {
   ff(42);
 }
+
+static_assert(Aggregate{.member = 1}.member == 1, "");
diff --git a/test/Modules/minimal-identifier-tables.cpp b/test/Modules/minimal-identifier-tables.cpp
new file mode 100644
index 0000000..0674746
--- /dev/null
+++ b/test/Modules/minimal-identifier-tables.cpp
@@ -0,0 +1,10 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: echo 'extern int some_long_variable_name;' > %t/x.h
+// RUN: echo 'extern int some_long_variable_name;' > %t/y.h
+// RUN: echo 'module X { header "x.h" } module Y { header "y.h" }' > %t/map
+// RUN: %clang_cc1 -fmodules -x c++ -fmodule-name=X %t/map -emit-module -o %t/x.pcm
+// RUN: %clang_cc1 -fmodules -x c++ -fmodule-name=Y %t/map -fmodule-file=%t/x.pcm -emit-module -o %t/y.pcm
+// RUN: cat %t/y.pcm | FileCheck %s
+//
+// CHECK-NOT: some_long_variable_name
diff --git a/test/Modules/module_file_info.m b/test/Modules/module_file_info.m
index 8693d2b..fa841b7 100644
--- a/test/Modules/module_file_info.m
+++ b/test/Modules/module_file_info.m
@@ -1,10 +1,17 @@
 
 @import DependsOnModule;
 
-// RUN: rm -rf %t
-// RUN: %clang_cc1 -w -Wunused -fmodules -fimplicit-module-maps -fdisable-module-hash -fmodules-cache-path=%t -F %S/Inputs -DBLARG -DWIBBLE=WOBBLE -fmodule-feature myfeature %s
+// RUN: rm -rf %t %t-obj
+// RUN: %clang_cc1 -w -Wunused -fmodules -fmodule-format=raw -fimplicit-module-maps -fdisable-module-hash -fmodules-cache-path=%t -F %S/Inputs -DBLARG -DWIBBLE=WOBBLE -fmodule-feature myfeature %s
 // RUN: %clang_cc1 -module-file-info %t/DependsOnModule.pcm | FileCheck %s
+// RUN: %clang_cc1 -module-file-info %t/DependsOnModule.pcm | FileCheck %s --check-prefix=RAW
 
+// RUN: %clang_cc1 -w -Wunused -fmodules -fmodule-format=obj -fimplicit-module-maps -fdisable-module-hash -fmodules-cache-path=%t-obj -F %S/Inputs -DBLARG -DWIBBLE=WOBBLE -fmodule-feature myfeature %s
+// RUN: %clang_cc1 -module-file-info %t-obj/DependsOnModule.pcm | FileCheck %s
+// RUN: %clang_cc1 -module-file-info %t-obj/DependsOnModule.pcm | FileCheck %s --check-prefix=OBJ
+
+// RAW:   Module format: raw
+// OBJ:   Module format: obj
 // CHECK: Generated by this Clang:
 
 // CHECK: Module name: DependsOnModule
diff --git a/test/Modules/no-stale-modtime.m b/test/Modules/no-stale-modtime.m
index b90daf1..c7ff21c 100644
--- a/test/Modules/no-stale-modtime.m
+++ b/test/Modules/no-stale-modtime.m
@@ -7,9 +7,13 @@
 // RUN: echo '@import l; @import r;' > %t/b.h
 // RUN: echo '@import t; // fromt l' > %t/l.h
 // RUN: echo '@import t; // fromt r' > %t/r.h
-// RUN: echo '// top' > %t/t.h
-// RUN: echo 'module b { header "b.h" } module l { header "l.h" }' > %t/module.map
-// RUN: echo 'module r { header "r.h" } module t { header "t.h" }' >> %t/module.map
+
+// RUN: echo '// top' > %t/t.h-1
+// RUN: cat %t/t.h-1 > %t/t.h
+
+// RUN: echo 'module b { header "b.h" } module l { header "l.h" }' > %t/module.map-1
+// RUN: echo 'module r { header "r.h" } module t { header "t.h" }' > %t/module.map-2
+// RUN: cat %t/module.map-1 %t/module.map-2 > %t/module.map
 
 // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash \
 // RUN:     -I %t -fsyntax-only %s -Rmodule-build 2>&1 \
@@ -18,7 +22,9 @@
 // RUN:     -I %t -fsyntax-only %s -Rmodule-build -verify
 
 // Add an identifier to ensure everything depending on t is out of date
-// RUN: echo 'extern int a;' >> %t/t.h
+// RUN: echo 'extern int a;' > %t/t.h-2
+// RUN: cat %t/t.h-1 %t/t.h-2 > %t/t.h
+
 // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash \
 // RUN:     -I %t -fsyntax-only %s -Rmodule-build 2>&1 \
 // RUN: | FileCheck -check-prefix=REBUILD-ALL %s
diff --git a/test/Modules/odr.cpp b/test/Modules/odr.cpp
index 3014396..9cdbb4f 100644
--- a/test/Modules/odr.cpp
+++ b/test/Modules/odr.cpp
@@ -15,9 +15,9 @@
 int x = f() + g();
 
 // expected-note@a.h:5 {{definition has no member 'e2'}}
-// expected-note@b.h:3 {{declaration of 'f' does not match}}
-// expected-note@b.h:1 {{definition has no member 'n'}}
+// expected-note@a.h:3 {{declaration of 'f' does not match}}
+// expected-note@a.h:1 {{definition has no member 'm'}}
 
 // expected-error@b.h:5 {{'E::e2' from module 'b' is not present in definition of 'E' in module 'a'}}
-// expected-error@a.h:3 {{'Y::f' from module 'a' is not present in definition of 'Y' in module 'b'}}
-// expected-error@a.h:2 {{'Y::n' from module 'a' is not present in definition of 'Y' in module 'b'}}
+// expected-error@b.h:3 {{'Y::f' from module 'b' is not present in definition of 'Y' in module 'a'}}
+// expected-error@b.h:2 {{'Y::m' from module 'b' is not present in definition of 'Y' in module 'a'}}
diff --git a/test/Modules/parse-attributes.modulemap b/test/Modules/parse-attributes.modulemap
new file mode 100644
index 0000000..0d18325
--- /dev/null
+++ b/test/Modules/parse-attributes.modulemap
@@ -0,0 +1,12 @@
+// RUN: rm -rf %t.modules
+// RUN: not %clang_cc1 -fmodules -fmodules-cache-path=%t.modules \
+// RUN:   -fmodule-map-file=%s -I%S -include "Inputs/empty.h" \
+// RUN:   -fsyntax-only -x c++ /dev/null 2>&1 | FileCheck %s
+
+// CHECK: error: expected ']' to close attribute
+// CHECK-NOT: error: expected '{' to start module 'A'
+
+module A [system {
+  header "Inputs/empty.h"
+  private header "Inputs/empty.h"
+}
diff --git a/test/Modules/pr21547.cpp b/test/Modules/pr21547.cpp
new file mode 100644
index 0000000..c6275b4
--- /dev/null
+++ b/test/Modules/pr21547.cpp
@@ -0,0 +1,8 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR21547 -verify %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/PR21547 -verify %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/PR21547 -emit-llvm-only %s
+
+#include "Inputs/PR21547/FirstHeader.h"
+
+//expected-no-diagnostics
diff --git a/test/Modules/pr26014.cpp b/test/Modules/pr26014.cpp
new file mode 100644
index 0000000..f9ebd4e
--- /dev/null
+++ b/test/Modules/pr26014.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR26014 -verify %s
+// RUN: %clang_cc1 -fmodules -fmodule-map-file=%S/Inputs/PR26014/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR26014 -verify %s
+
+#include "A.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr26179.cpp b/test/Modules/pr26179.cpp
new file mode 100644
index 0000000..f25f1ce
--- /dev/null
+++ b/test/Modules/pr26179.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR26179 -verify %s
+// RUN: %clang_cc1 -fmodules -fmodule-map-file=%S/Inputs/PR26179/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR26179 -verify %s
+
+#include "A.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27041.cpp b/test/Modules/pr27041.cpp
new file mode 100644
index 0000000..9d06468
--- /dev/null
+++ b/test/Modules/pr27041.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27041 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27041/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27041 -verify %s
+
+#include "Rtypes.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27186.cpp b/test/Modules/pr27186.cpp
new file mode 100644
index 0000000..02a8fe5
--- /dev/null
+++ b/test/Modules/pr27186.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27186  -I%S/Inputs/PR27186/subdir/ -verify %s
+// RUN: %clang_cc1  -nostdsysteminc -std=c++11 -fmodules  -fmodule-map-file=%S/Inputs/PR27186/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27186/ -verify %s
+
+#include "Rtypes.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27401.cpp b/test/Modules/pr27401.cpp
new file mode 100644
index 0000000..7d5479c
--- /dev/null
+++ b/test/Modules/pr27401.cpp
@@ -0,0 +1,38 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27401 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27401/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27401 -verify %s
+
+#include "a.h"
+#define _LIBCPP_VECTOR
+template <class, class _Allocator>
+class __vector_base {
+protected:
+  _Allocator __alloc() const;
+  __vector_base(_Allocator);
+};
+
+template <class _Tp, class _Allocator = allocator>
+class vector : __vector_base<_Tp, _Allocator> {
+public:
+  vector() noexcept(is_nothrow_default_constructible<_Allocator>::value);
+  vector(const vector &);
+  vector(vector &&)
+      noexcept(is_nothrow_move_constructible<_Allocator>::value);
+};
+
+template <class _Tp, class _Allocator>
+vector<_Tp, _Allocator>::vector(const vector &__x) : __vector_base<_Tp, _Allocator>(__x.__alloc()) {}
+
+  struct CommentOptions {
+    vector<char>  ParseAllComments;
+    CommentOptions() {}
+  };
+  struct PrintingPolicy {
+    PrintingPolicy(CommentOptions LO) : LangOpts(LO) {}
+    CommentOptions LangOpts;
+  };
+
+#include "b.h"
+CommentOptions fn1() { return fn1(); }
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27513.cpp b/test/Modules/pr27513.cpp
new file mode 100644
index 0000000..28fbe5b
--- /dev/null
+++ b/test/Modules/pr27513.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27513 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27513/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27513 -verify %s
+
+#include "Inputs/PR27513/a.h"
+
+//expected-no-diagnostics
diff --git a/test/Modules/pr27699.cpp b/test/Modules/pr27699.cpp
new file mode 100644
index 0000000..0a17fcb
--- /dev/null
+++ b/test/Modules/pr27699.cpp
@@ -0,0 +1,9 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27699 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27699/module.modulemap -fmodules-cache-path=%t -fmodules-local-submodule-visibility -I%S/Inputs/PR27699 -verify %s
+
+#include "Subdir/a.h"
+#include "Subdir/b.h"
+
+// expected-no-diagnostics
+
diff --git a/test/Modules/pr27739.cpp b/test/Modules/pr27739.cpp
new file mode 100644
index 0000000..b27dc1b
--- /dev/null
+++ b/test/Modules/pr27739.cpp
@@ -0,0 +1,12 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -internal-externc-isystem %S/Inputs/PR27739 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27739/module.modulemap -fmodules-cache-path=%t -internal-externc-isystem %S/Inputs/PR27739/ -verify %s
+
+#include "DataInputHandler.h"
+
+void DataInputHandler::AddTree() {
+   fInputTrees[(char*)""];
+   fExplicitTrainTest[(char*)""];
+}
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27754.cpp b/test/Modules/pr27754.cpp
new file mode 100644
index 0000000..0482595
--- /dev/null
+++ b/test/Modules/pr27754.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27754 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27754/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27754/ -verify %s
+
+#include "TMetaUtils.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27890.cpp b/test/Modules/pr27890.cpp
new file mode 100644
index 0000000..8bb9a9f
--- /dev/null
+++ b/test/Modules/pr27890.cpp
@@ -0,0 +1,9 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27890 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27890/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27890 -verify %s
+
+#include "a.h"
+enum ActionType {};
+opt<ActionType> a(values(""));
+
+// expected-no-diagnostics
\ No newline at end of file
diff --git a/test/Modules/pr28332.cpp b/test/Modules/pr28332.cpp
new file mode 100644
index 0000000..596dd24
--- /dev/null
+++ b/test/Modules/pr28332.cpp
@@ -0,0 +1,8 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR28332 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR28332/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR28332 -verify %s
+
+#include "a.h"
+
+// expected-no-diagnostics
+
diff --git a/test/Modules/prebuilt-module.m b/test/Modules/prebuilt-module.m
deleted file mode 100644
index d5012e0..0000000
--- a/test/Modules/prebuilt-module.m
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: rm -rf %t
-//
-// RUN: %clang_cc1 -fmodules -x objective-c -I %S/Inputs/prebuilt-module -emit-module %S/Inputs/prebuilt-module/module.modulemap -fmodule-name=prebuilt -o %t/prebuilt.pcm
-// RUN: %clang_cc1 -fmodules -fprebuilt-module-path=%t/ -fdisable-module-hash %s -verify
-
-// expected-no-diagnostics
-@import prebuilt;
-int test() {
-  return a;
-}
diff --git a/test/Modules/preprocess.cpp b/test/Modules/preprocess.cpp
new file mode 100644
index 0000000..0615331
--- /dev/null
+++ b/test/Modules/preprocess.cpp
@@ -0,0 +1,24 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x c++ -E %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=CXX --check-prefix=CXX-DASHE
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x objective-c -E %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=OBJC
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x c++ -E -frewrite-includes %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=CXX
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x objective-c -E -frewrite-includes %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=OBJC
+#include "dummy.h"
+#include "dummy.h"
+foo bar baz
+
+// The weird {{ }} here is to prevent the -frewrite-includes test from matching its own CHECK lines.
+
+// CXX: #include{{ }}"dummy.h"
+// CXX-DASHE-SAME: /* clang -E: implicit import for module dummy */
+// CXX: #include{{ }}"dummy.h"
+// CXX-DASHE-SAME: /* clang -E: implicit import for module dummy */
+// CXX: foo bar baz
+
+// OBJC: @import{{ }}dummy; /* clang 
+// OBJC: @import{{ }}dummy; /* clang 
+// OBJC: foo bar baz
diff --git a/test/Modules/requires-gnuinlineasm.m b/test/Modules/requires-gnuinlineasm.m
new file mode 100644
index 0000000..80b1b18
--- /dev/null
+++ b/test/Modules/requires-gnuinlineasm.m
@@ -0,0 +1,16 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules \
+// RUN:     -fimplicit-module-maps -F %S/Inputs/GNUAsm %s \
+// RUN:     -fno-gnu-inline-asm -DNO_ASM_INLINE -verify
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules \
+// RUN:     -fimplicit-module-maps -F %S/Inputs/GNUAsm %s \
+// RUN:     -DASM_INLINE -verify
+
+#ifdef NO_ASM_INLINE
+@import NeedsGNUInlineAsm.Asm; // expected-error{{module 'NeedsGNUInlineAsm.Asm' requires feature 'gnuinlineasm'}}
+#endif
+
+#ifdef ASM_INLINE
+@import NeedsGNUInlineAsm.Asm; // expected-no-diagnostics
+#endif
diff --git a/test/Modules/submodules-merge-defs.cpp b/test/Modules/submodules-merge-defs.cpp
index 23d1f5c..4ab822a 100644
--- a/test/Modules/submodules-merge-defs.cpp
+++ b/test/Modules/submodules-merge-defs.cpp
@@ -58,6 +58,11 @@
 decltype(G::h) pre_gh = G::h; // expected-error +{{must be imported}}
 // expected-note@defs.h:51 +{{here}}
 
+int pre_h = H(); // expected-error +{{must be imported}}
+// expected-note@defs.h:56 +{{here}}
+using pre_i = I<>; // expected-error +{{must be imported}}
+// expected-note@defs.h:57 +{{here}}
+
 J<> pre_j; // expected-error {{declaration of 'J' must be imported}}
 #ifdef IMPORT_USE_2
 // expected-error-re@-2 {{default argument of 'J' must be imported from one of {{.*}}stuff.use{{.*}}stuff.use-2}}
@@ -99,6 +104,8 @@
 int post_fg = F<char>().g<int>();
 G::A post_ga = G::a;
 decltype(G::h) post_gh = G::h;
+int post_h = H();
+using post_i = I<>;
 J<> post_j;
 template<typename T, int N, template<typename> class K> struct J;
 J<> post_j2;
diff --git a/test/Modules/suggest-include.cpp b/test/Modules/suggest-include.cpp
new file mode 100644
index 0000000..e10c3f3
--- /dev/null
+++ b/test/Modules/suggest-include.cpp
@@ -0,0 +1,33 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fimplicit-module-maps -I%S/Inputs/suggest-include %s -verify
+
+#include "empty.h" // import the module file
+
+// expected-note@usetextual1.h:2 {{previous}}
+// expected-note@textual2.h:1 {{previous}}
+// expected-note@textual3.h:1 {{previous}}
+// expected-note@textual4.h:1 {{previous}}
+// expected-note@textual5.h:1 {{previous}}
+// expected-note@private1.h:1 {{previous}}
+// expected-note@private2.h:1 {{previous}}
+// expected-note@private3.h:1 {{previous}}
+
+void f() {
+  (void)::usetextual1; // expected-error {{missing '#include "usetextual1.h"'}}
+  (void)::usetextual2; // expected-error {{missing '#include "usetextual2.h"'}}
+  (void)::textual3; // expected-error-re {{{{^}}missing '#include "usetextual3.h"'}}
+  // Don't suggest a #include that includes the entity via a path that leaves
+  // the module. In that case we can't be sure that we've picked the right header.
+  (void)::textual4; // expected-error-re {{{{^}}declaration of 'textual4'}}
+  (void)::textual5; // expected-error-re {{{{^}}declaration of 'textual5'}}
+
+  // Don't suggest #including a private header.
+  // FIXME: We could suggest including "useprivate1.h" here, as it's the only
+  // public way to get at this declaration.
+  (void)::private1; // expected-error-re {{{{^}}declaration of 'private1'}}
+  // FIXME: Should we be suggesting an import at all here? Should declarations
+  // in private headers be visible when the surrounding module is imported?
+  (void)::private2; // expected-error-re {{{{^}}declaration of 'private2'}}
+  // Even if we suggest an include for private1, we should not do so here.
+  (void)::private3; // expected-error-re {{{{^}}declaration of 'private3'}}
+}
diff --git a/test/Modules/tag-injection.c b/test/Modules/tag-injection.c
new file mode 100644
index 0000000..5bb1547
--- /dev/null
+++ b/test/Modules/tag-injection.c
@@ -0,0 +1,18 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: echo 'struct a;' > %t/a.h
+// RUN: echo 'struct b {}; void foo(struct b*);' > %t/b.h
+// RUN: echo 'module X { module a { header "a.h" } module b { header "b.h" } }' > %t/x.modulemap
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fmodule-map-file=%t/x.modulemap %s -I%t -verify
+
+#include "a.h"
+
+void f(struct a *p);
+
+// FIXME: We should warn that 'b' will not be visible outside of this function,
+// but we merge this 'b' with X.b's 'b' because we don't yet implement C's
+// "compatible types" rule.
+void g(struct b *p);
+
+struct b b; // expected-error {{definition of 'b' must be imported from module 'X.b' before it is required}}
+// expected-note@b.h:1 {{here}}
diff --git a/test/Modules/tag-injection.cpp b/test/Modules/tag-injection.cpp
new file mode 100644
index 0000000..e55598b
--- /dev/null
+++ b/test/Modules/tag-injection.cpp
@@ -0,0 +1,25 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: echo 'struct tm;' > %t/a.h
+// RUN: echo 'struct X {}; void foo(struct tm*);' > %t/b.h
+// RUN: echo 'module X { module a { header "a.h" } module b { header "b.h" } }' > %t/x.modulemap
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -x c++ -fmodule-map-file=%t/x.modulemap %s -I%t -verify -std=c++11
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -x c++ -fmodule-map-file=%t/x.modulemap %s -I%t -verify -fmodules-local-submodule-visibility -std=c++11
+
+#include "a.h"
+
+using ::tm;
+
+struct A {
+  // This use of 'struct X' makes the declaration (but not definition) of X visible.
+  virtual void f(struct X *p);
+};
+
+namespace N {
+  struct B : A {
+    void f(struct X *q) override;
+  };
+}
+
+X x; // expected-error {{definition of 'X' must be imported from module 'X.b' before it is required}}
+// expected-note@b.h:1 {{here}}
diff --git a/test/Modules/templates.mm b/test/Modules/templates.mm
index 190084a..cd80e24 100644
--- a/test/Modules/templates.mm
+++ b/test/Modules/templates.mm
@@ -12,10 +12,10 @@
 
 @import templates_right;
 
-// CHECK-DAG: @list_left = global %class.List { %"struct.List<int>::node"* null, i32 8 }, align 8
-// CHECK-DAG: @list_right = global %class.List { %"struct.List<int>::node"* null, i32 12 }, align 8
-// CHECK-DAG: @_ZZ15testMixedStructvE1l = {{.*}} constant %class.List { %{{.*}}* null, i32 1 }, align 8
-// CHECK-DAG: @_ZZ15testMixedStructvE1r = {{.*}} constant %class.List { %{{.*}}* null, i32 2 }, align 8
+// CHECK-DAG: @list_left = global %[[LIST:.*]] { %[[LISTNODE:.*]]* null, i32 8 }, align 8
+// CHECK-DAG: @list_right = global %[[LIST]] { %[[LISTNODE]]* null, i32 12 }, align 8
+// CHECK-DAG: @_ZZ15testMixedStructvE1l = {{.*}} constant %[[LIST]] { %{{.*}}* null, i32 1 }, align 8
+// CHECK-DAG: @_ZZ15testMixedStructvE1r = {{.*}} constant %[[LIST]] { %{{.*}}* null, i32 2 }, align 8
 // CHECK-DAG: @_ZN29WithUndefinedStaticDataMemberIA_iE9undefinedE = external global
 
 void testTemplateClasses() {
diff --git a/test/Modules/unused-global-init.cpp b/test/Modules/unused-global-init.cpp
new file mode 100644
index 0000000..3bd7a7f
--- /dev/null
+++ b/test/Modules/unused-global-init.cpp
@@ -0,0 +1,37 @@
+// RUN: rm -rf %t
+//
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-module %S/Inputs/unused-global-init/module.modulemap -fmodule-name=init -o %t/init.pcm
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-module %S/Inputs/unused-global-init/module.modulemap -fmodule-name=unused -o %t/unused.pcm -fmodule-file=%t/init.pcm
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-module %S/Inputs/unused-global-init/module.modulemap -fmodule-name=used -o %t/used.pcm -fmodule-file=%t/init.pcm
+//
+// No module file: init.h performs init.
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -DINIT | FileCheck --check-prefix=CHECK-INIT %s
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -DUSED | FileCheck --check-prefix=CHECK-INIT %s
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -DOTHER -DUNUSED | FileCheck --check-prefix=CHECK-NO-INIT %s
+//
+// With module files: if there is a transitive import of any part of the
+// module, we run its global initializers (even if the imported piece is not
+// visible here).
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -fmodule-file=%t/used.pcm -fmodule-file=%t/unused.pcm -DINIT | FileCheck --check-prefix=CHECK-INIT %s
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -fmodule-file=%t/used.pcm -fmodule-file=%t/unused.pcm -DOTHER | FileCheck --check-prefix=CHECK-NO-INIT %s
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -fmodule-file=%t/used.pcm -fmodule-file=%t/unused.pcm -DUSED | FileCheck --check-prefix=CHECK-INIT %s
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -x c++ -I %S/Inputs/unused-global-init -triple %itanium_abi_triple -emit-llvm -o - %s -fmodule-file=%t/used.pcm -fmodule-file=%t/unused.pcm -DUNUSED | FileCheck --check-prefix=CHECK-NO-INIT %s
+
+#ifdef INIT
+#include "init.h"
+#endif
+
+#ifdef OTHER
+#include "other.h"
+#endif
+
+#ifdef USED
+#include "used.h"
+#endif
+
+#ifdef UNUSED
+#include "unused.h"
+#endif
+
+// CHECK-INIT: call {{.*}}@_ZN4InitC
+// CHECK-NO-INIT-NOT: call {{.*}}@_ZN4InitC
diff --git a/test/OpenMP/atomic_messages.c b/test/OpenMP/atomic_messages.c
index 8182465..7b3178b 100644
--- a/test/OpenMP/atomic_messages.c
+++ b/test/OpenMP/atomic_messages.c
@@ -313,6 +313,8 @@
 #pragma omp atomic capture
   {c = a; a++;}
 #pragma omp atomic capture
+  {c = a; (a)++;}
+#pragma omp atomic capture
   {++a;c = a;}
 #pragma omp atomic capture
   {c = a;a--;}
@@ -321,6 +323,8 @@
 #pragma omp atomic capture
   {c = a; a += b;}
 #pragma omp atomic capture
+  {c = a; (a) += b;}
+#pragma omp atomic capture
   {a %= b; c = a;}
 #pragma omp atomic capture
   {c = a; a *= b;}
diff --git a/test/OpenMP/atomic_messages.cpp b/test/OpenMP/atomic_messages.cpp
index 7f78ad4..efb368d 100644
--- a/test/OpenMP/atomic_messages.cpp
+++ b/test/OpenMP/atomic_messages.cpp
@@ -453,6 +453,8 @@
 #pragma omp atomic capture
   {c = a; a++;}
 #pragma omp atomic capture
+  {c = a; (a)++;}
+#pragma omp atomic capture
   {++a;c = a;}
 #pragma omp atomic capture
   {c = a;a--;}
@@ -461,6 +463,8 @@
 #pragma omp atomic capture
   {c = a; a += b;}
 #pragma omp atomic capture
+  {c = a; (a) += b;}
+#pragma omp atomic capture
   {a %= b; c = a;}
 #pragma omp atomic capture
   {c = a; a *= b;}
diff --git a/test/OpenMP/cancel_codegen.cpp b/test/OpenMP/cancel_codegen.cpp
index e2dd367..8234193 100644
--- a/test/OpenMP/cancel_codegen.cpp
+++ b/test/OpenMP/cancel_codegen.cpp
@@ -19,9 +19,10 @@
 {
 #pragma omp cancel sections
 }
-// CHECK: call i32 @__kmpc_single(
-// CHECK-NOT: @__kmpc_cancel
-// CHECK: call void @__kmpc_end_single(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call i32 @__kmpc_cancel(
+// CHECK: call i32 @__kmpc_cancel_barrier(%ident_t*
+// CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_barrier(%ident_t*
 #pragma omp sections
 {
@@ -125,9 +126,10 @@
 // CHECK: ret i32 0
 
 // CHECK: define internal void @{{[^(]+}}(i32* {{[^,]+}}, i32* {{[^,]+}})
-// CHECK: call i32 @__kmpc_single(
-// CHECK-NOT: @__kmpc_cancel
-// CHECK: call void @__kmpc_end_single(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call i32 @__kmpc_cancel(
+// CHECK: call i32 @__kmpc_cancel_barrier(%ident_t*
+// CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
 // CHECK: define internal void @{{[^(]+}}(i32* {{[^,]+}}, i32* {{[^,]+}})
diff --git a/test/OpenMP/cancellation_point_codegen.cpp b/test/OpenMP/cancellation_point_codegen.cpp
index 795f69e..91e6c69 100644
--- a/test/OpenMP/cancellation_point_codegen.cpp
+++ b/test/OpenMP/cancellation_point_codegen.cpp
@@ -22,9 +22,16 @@
 #pragma omp cancel sections
   }
 }
-// CHECK: call i32 @__kmpc_single(
-// CHECK-NOT: @__kmpc_cancellationpoint
-// CHECK: call void @__kmpc_end_single(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_cancellationpoint(%ident_t* {{[^,]+}}, i32 [[GTID]], i32 3)
+// CHECK: [[CMP:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK: br i1 [[CMP]], label %[[EXIT:[^,].+]], label %[[CONTINUE:.+]]
+// CHECK: [[EXIT]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%ident_t*
+// CHECK: br label
+// CHECK: [[CONTINUE]]
+// CHECK: br label
+// CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_barrier(%ident_t*
 #pragma omp sections
 {
@@ -126,9 +133,16 @@
 // CHECK: ret i32 0
 
 // CHECK: define internal void @{{[^(]+}}(i32* {{[^,]+}}, i32* {{[^,]+}})
-// CHECK: call i32 @__kmpc_single(
-// CHECK-NOT: @__kmpc_cancellationpoint
-// CHECK: call void @__kmpc_end_single(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_cancellationpoint(%ident_t* {{[^,]+}}, i32 [[GTID:%.+]], i32 3)
+// CHECK: [[CMP:%.+]] = icmp ne i32 [[RES]], 0
+// CHECK: br i1 [[CMP]], label %[[EXIT:[^,].+]], label %[[CONTINUE:.+]]
+// CHECK: [[EXIT]]
+// CHECK: call i32 @__kmpc_cancel_barrier(%ident_t*
+// CHECK: br label
+// CHECK: [[CONTINUE]]
+// CHECK: br label
+// CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
 // CHECK: define internal void @{{[^(]+}}(i32* {{[^,]+}}, i32* {{[^,]+}})
diff --git a/test/OpenMP/critical_codegen.cpp b/test/OpenMP/critical_codegen.cpp
index e44e220..be749a6 100644
--- a/test/OpenMP/critical_codegen.cpp
+++ b/test/OpenMP/critical_codegen.cpp
@@ -39,7 +39,11 @@
 #pragma omp critical(the_name1) hint(23)
   foo();
 // CHECK:       call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
+// CHECK:       br label
 // CHECK-NOT:   call {{.*}}void @__kmpc_end_critical(
+// CHECK:       br label
+// CHECK-NOT:   call {{.*}}void @__kmpc_end_critical(
+// CHECK:       br label
   if (a)
 #pragma omp critical(the_name)
     while (1)
@@ -60,6 +64,8 @@
   // CHECK: [[S_REF:%.+]] = load %struct.S*, %struct.S** [[S_ADDR]],
   // CHECK: [[S_A_REF:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[S_REF]], i32 0, i32 0
   ++s.a;
+  // CHECK: [[S_REF:%.+]] = load %struct.S*, %struct.S** [[S_ADDR]],
+  // CHECK: store %struct.S* [[S_REF]], %struct.S** [[S_ADDR:%.+]],
   // CHECK: call void @__kmpc_critical(
 #pragma omp critical
   // CHECK: [[S_REF:%.+]] = load %struct.S*, %struct.S** [[S_ADDR]],
diff --git a/test/OpenMP/declare_reduction_ast_print.c b/test/OpenMP/declare_reduction_ast_print.c
new file mode 100644
index 0000000..7b97a7c
--- /dev/null
+++ b/test/OpenMP/declare_reduction_ast_print.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: #pragma omp declare reduction (+ : int : omp_out *= omp_in)
+// CHECK-NEXT: #pragma omp declare reduction (+ : char : omp_out *= omp_in)
+
+#pragma omp declare reduction(fun : float : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+// CHECK: #pragma omp declare reduction (fun : float : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+
+// CHECK: struct SSS {
+struct SSS {
+  int field;
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+  // CHECK: #pragma omp declare reduction (+ : int : omp_out *= omp_in)
+  // CHECK-NEXT: #pragma omp declare reduction (+ : char : omp_out *= omp_in)
+};
+// CHECK: };
+
+void init(struct SSS *priv, struct SSS orig);
+
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+// CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+
+// CHECK: int main() {
+int main() {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  // CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  // CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  }
+  return 0;
+}
+// CHECK: }
+
+#endif
diff --git a/test/OpenMP/declare_reduction_ast_print.cpp b/test/OpenMP/declare_reduction_ast_print.cpp
new file mode 100644
index 0000000..26b2ff9
--- /dev/null
+++ b/test/OpenMP/declare_reduction_ast_print.cpp
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: #pragma omp declare reduction (+ : int : omp_out *= omp_in)
+// CHECK-NEXT: #pragma omp declare reduction (+ : char : omp_out *= omp_in)
+
+// CHECK: #pragma omp declare reduction (fun : int : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+
+template <class T>
+class SSS {
+public:
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+  // CHECK: #pragma omp declare reduction (fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+};
+
+SSS<int> d;
+
+void init(SSS<int> &lhs, SSS<int> rhs);
+
+#pragma omp declare reduction(fun : SSS < int > : omp_out = omp_in) initializer(init(omp_priv, omp_orig))
+// CHECK: #pragma omp declare reduction (fun : SSS<int> : omp_out = omp_in) initializer(init(omp_priv, omp_orig))
+
+// CHECK: template <typename T = int> int foo(int a) {
+// CHECK: #pragma omp declare reduction (fun : int : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: {
+// CHECK: #pragma omp declare reduction (fun : int : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: }
+// CHECK: return a;
+// CHECK: }
+
+// CHECK: template <typename T> T foo(T a) {
+// CHECK: #pragma omp declare reduction (fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: {
+// CHECK: #pragma omp declare reduction (fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: }
+// CHECK: return a;
+// CHECK: }
+template <typename T>
+T foo(T a) {
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+  {
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+  }
+  return a;
+}
+
+int main() {
+  int i = 0;
+  SSS<int> sss;
+  // TODO: Add support for scoped reduction identifiers
+  //  #pragma omp parallel reduction(SSS<int>::fun : i)
+  // TODO-CHECK: #pragma omp parallel reduction(SSS<int>::fun: i)
+  {
+    i += 1;
+  }
+  // #pragma omp parallel reduction(::fun:sss)
+  // TODO-CHECK: #pragma omp parallel reduction(::fun: sss)
+  {
+  }
+  return foo(15);
+}
+
+#endif
diff --git a/test/OpenMP/declare_reduction_codegen.c b/test/OpenMP/declare_reduction_codegen.c
new file mode 100644
index 0000000..ecb9726
--- /dev/null
+++ b/test/OpenMP/declare_reduction_codegen.c
@@ -0,0 +1,158 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c -emit-llvm %s -triple %itanium_abi_triple -o - -femit-all-decls -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c -triple %itanium_abi_triple -emit-pch -o %t %s -femit-all-decls -disable-llvm-optzns
+// RUN: %clang_cc1 -fopenmp -x c -triple %itanium_abi_triple -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls -disable-llvm-optzns | FileCheck --check-prefix=CHECK-LOAD %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[SSS_INT:.+]] = type { i32 }
+// CHECK-LOAD: [[SSS_INT:.+]] = type { i32 }
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: store i32 [[MUL]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK: sext i8
+// CHECK: sext i8
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-LOAD-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+#pragma omp declare reduction(fun : float : omp_out += omp_in) initializer(omp_priv = 15 + omp_orig)
+// CHECK: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK: [[ADD:%.+]] = fadd float
+// CHECK-NEXT: store float [[ADD]], float*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK: [[ADD:%.+]] = fadd float 1.5
+// CHECK-NEXT: store float [[ADD]], float*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = fadd float
+// CHECK-LOAD-NEXT: store float [[ADD]], float*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = fadd float 1.5
+// CHECK-LOAD-NEXT: store float [[ADD]], float*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+struct SSS {
+  int field;
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+  // CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+  // CHECK: [[MUL:%.+]] = mul nsw i32
+  // CHECK-NEXT: store i32 [[MUL]], i32*
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+
+  // CHECK: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+  // CHECK: sext i8
+  // CHECK: sext i8
+  // CHECK: [[MUL:%.+]] = mul nsw i32
+  // CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+  // CHECK-NEXT: store i8 [[TRUNC]], i8*
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+};
+
+void init(struct SSS *priv, struct SSS orig);
+
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call void @llvm.memcpy
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call void @init(
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call void @llvm.memcpy
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call void @init(
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK-LABEL: @main
+// CHECK-LOAD-LABEL: @main
+int main() {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK: call void @llvm.memcpy
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK: call void @init(
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK-LOAD: call void @llvm.memcpy
+  // CHECK-LOAD-NEXT: ret void
+  // CHECK-LOAD-NEXT: }
+  // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK-LOAD: call void @init(
+  // CHECK-LOAD-NEXT: ret void
+  // CHECK-LOAD-NEXT: }
+  {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+    // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK: call void @llvm.memcpy
+    // CHECK-NEXT: ret void
+    // CHECK-NEXT: }
+    // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK: call void @init(
+    // CHECK-NEXT: ret void
+    // CHECK-NEXT: }
+    // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK-LOAD: call void @llvm.memcpy
+    // CHECK-LOAD-NEXT: ret void
+    // CHECK-LOAD-NEXT: }
+    // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK-LOAD: call void @init(
+    // CHECK-LOAD-NEXT: ret void
+    // CHECK-LOAD-NEXT: }
+  }
+  return 0;
+}
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-LOAD-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+#endif
diff --git a/test/OpenMP/declare_reduction_codegen.cpp b/test/OpenMP/declare_reduction_codegen.cpp
new file mode 100644
index 0000000..a18e73f
--- /dev/null
+++ b/test/OpenMP/declare_reduction_codegen.cpp
@@ -0,0 +1,182 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -o - -femit-all-decls -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s -femit-all-decls -disable-llvm-optzns
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls -disable-llvm-optzns | FileCheck --check-prefix=CHECK-LOAD %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[SSS_INT:.+]] = type { i32 }
+// CHECK-LOAD: [[SSS_INT:.+]] = type { i32 }
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: store i32 [[MUL]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK: sext i8
+// CHECK: sext i8
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-LOAD-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+template <class T>
+struct SSS {
+  T a;
+  SSS() : a() {}
+#pragma omp declare reduction(fun : T : omp_out ^= omp_in) initializer(omp_priv = 24 + omp_orig)
+};
+
+SSS<int> d;
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[XOR:%.+]] = xor i32
+// CHECK-NEXT: store i32 [[XOR]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[ADD:%.+]] = add nsw i32 24,
+// CHECK-NEXT: store i32 [[ADD]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK: define {{.*}}void [[INIT:@[^(]+]]([[SSS_INT]]*
+// CHECK-LOAD: define {{.*}}void [[INIT:@[^(]+]]([[SSS_INT]]*
+void init(SSS<int> &lhs, SSS<int> &rhs) {}
+
+#pragma omp declare reduction(fun : SSS < int > : omp_out = omp_in) initializer(init(omp_priv, omp_orig))
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call void @llvm.memcpy
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call {{.*}}void [[INIT]](
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call void @llvm.memcpy
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call {{.*}}void [[INIT]](
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+template <typename T>
+T foo(T a) {
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = 15 * omp_orig)
+  {
+#pragma omp declare reduction(fun : T : omp_out /= omp_in) initializer(omp_priv = 11 - omp_orig)
+  }
+  return a;
+}
+
+// CHECK-LABEL: @main
+int main() {
+  int i = 0;
+  SSS<int> sss;
+#pragma omp parallel reduction(SSS < int > ::fun : i)
+  {
+    i += 1;
+  }
+#pragma omp parallel reduction(::fun : sss)
+  {
+  }
+#pragma omp declare reduction(fun : SSS < int > : init(omp_out, omp_in))
+#pragma omp parallel reduction(fun : sss)
+  {
+  }
+  // CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call({{[^@]*}} @{{[^@]*}}[[REGION:@[^ ]+]]
+  // CHECK-LABEL: foo
+  return foo(15);
+}
+
+// CHECK: define internal {{.*}}void [[REGION]](
+// CHECK: [[SSS_PRIV:%.+]] = alloca %struct.SSS,
+// CHECK: invoke {{.*}} @_ZN3SSSIiEC1Ev(%struct.SSS* [[SSS_PRIV]])
+// CHECK-NOT: {{call |invoke }}
+// CHECK: call {{.*}}i32 @__kmpc_reduce_nowait(
+
+// CHECK-LABEL: i32 @{{.+}}foo{{[^(].+}}(i32
+// CHECK-LOAD-LABEL: i32 @{{.+}}foo{{[^(].+}}(i32
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[XOR:%.+]] = xor i32
+// CHECK-LOAD-NEXT: store i32 [[XOR]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = add nsw i32 24,
+// CHECK-LOAD-NEXT: store i32 [[ADD]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[ADD:%.+]] = add nsw i32
+// CHECK-NEXT: store i32 [[ADD]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = add nsw i32
+// CHECK-LOAD-NEXT: store i32 [[ADD]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[MUL:%.+]] = mul nsw i32 15,
+// CHECK-NEXT: store i32 [[MUL]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32 15,
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[DIV:%.+]] = sdiv i32
+// CHECK-NEXT: store i32 [[DIV]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[DIV:%.+]] = sdiv i32
+// CHECK-LOAD-NEXT: store i32 [[DIV]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[SUB:%.+]] = sub nsw i32 11,
+// CHECK-NEXT: store i32 [[SUB]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[SUB:%.+]] = sub nsw i32 11,
+// CHECK-LOAD-NEXT: store i32 [[SUB]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+#endif
diff --git a/test/OpenMP/declare_reduction_messages.c b/test/OpenMP/declare_reduction_messages.c
new file mode 100644
index 0000000..fb9eacc
--- /dev/null
+++ b/test/OpenMP/declare_reduction_messages.c
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+int temp; // expected-note 6 {{'temp' declared here}}
+
+#pragma omp declare reduction                                              // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction {                                            // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction(                                             // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(#                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(/                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(+                                            // expected-error {{expected ':'}}
+#pragma omp declare reduction(for                                          // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(if:                                          // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}} expected-error {{expected a type}}
+#pragma omp declare reduction(oper:                                        // expected-error {{expected a type}}
+#pragma omp declare reduction(oper;                                        // expected-error {{expected ':'}} expected-error {{expected a type}}
+#pragma omp declare reduction(fun : int                                    // expected-error {{expected ':'}} expected-error {{expected expression}}
+#pragma omp declare reduction(+ : const int:                               // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(- : volatile int:                            // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(* : int;                                     // expected-error {{expected ','}} expected-error {{expected a type}}
+#pragma omp declare reduction(& : double char:                             // expected-error {{cannot combine with previous 'double' declaration specifier}} expected-error {{expected expression}}
+#pragma omp declare reduction(^ : double, char, :                          // expected-error {{expected a type}} expected-error {{expected expression}}
+#pragma omp declare reduction(&& : int, S:                                 // expected-error {{unknown type name 'S'}} expected-error {{expected expression}}
+#pragma omp declare reduction(|| : int, double : temp += omp_in)           // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(| : char, float : omp_out += temp)           // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long : omp_out += omp_in) {            // expected-error {{expected 'initializer'}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun : unsigned : omp_out += temp))           // expected-error {{expected 'initializer'}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long(void) : omp_out += omp_in)        // expected-error {{reduction type cannot be a function type}}
+#pragma omp declare reduction(fun : long[3] : omp_out += omp_in)           // expected-error {{reduction type cannot be an array type}}
+#pragma omp declare reduction(fun23 : long, int, long : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'long'}} expected-note {{previous definition is here}}
+
+#pragma omp declare reduction(fun222 : long : omp_out += omp_in)
+#pragma omp declare reduction(fun1 : long : omp_out += omp_in) initializer                 // expected-error {{expected '(' after 'initializer'}}
+#pragma omp declare reduction(fun2 : long : omp_out += omp_in) initializer {               // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[                // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun4 : long : omp_out += omp_in) initializer()               // expected-error {{expected expression}}
+#pragma omp declare reduction(fun5 : long : omp_out += omp_in) initializer(temp)           // expected-error {{only 'omp_priv' or 'omp_orig' variables are allowed in initializer expression}}
+#pragma omp declare reduction(fun6 : long : omp_out += omp_in) initializer(omp_orig        // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun7 : long : omp_out += omp_in) initializer(omp_priv 12)    // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv = 23)  // expected-note {{previous definition is here}}
+#pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv = 23)) // expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{redefinition of user-defined reduction for type 'long'}}
+#pragma omp declare reduction(fun9 : long : omp_out += omp_in) initializer(omp_priv = )    // expected-error {{expected expression}}
+
+int fun(int arg) {
+#pragma omp declare reduction(red : int : omp_out++)
+  {
+#pragma omp declare reduction(red : int : omp_out++) // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red : int : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'int'}}
+    {
+#pragma omp declare reduction(red : int : omp_out++)
+    }
+  }
+  return arg;
+}
diff --git a/test/OpenMP/declare_reduction_messages.cpp b/test/OpenMP/declare_reduction_messages.cpp
new file mode 100644
index 0000000..a1373b1
--- /dev/null
+++ b/test/OpenMP/declare_reduction_messages.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+int temp; // expected-note 7 {{'temp' declared here}}
+
+#pragma omp declare reduction                                              // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction {                                            // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction(                                             // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(#                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(/                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(+                                            // expected-error {{expected ':'}}
+#pragma omp declare reduction(operator                                     // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(operator:                                    // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}} expected-error {{expected a type}}
+#pragma omp declare reduction(oper:                                        // expected-error {{expected a type}}
+#pragma omp declare reduction(oper;                                        // expected-error {{expected ':'}} expected-error {{expected a type}}
+#pragma omp declare reduction(fun : int                                    // expected-error {{expected ':'}} expected-error {{expected expression}}
+#pragma omp declare reduction(+ : const int:                               // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(- : volatile int:                            // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(* : int;                                     // expected-error {{expected ','}} expected-error {{expected a type}}
+#pragma omp declare reduction(& : double char:                             // expected-error {{cannot combine with previous 'double' declaration specifier}} expected-error {{expected expression}}
+#pragma omp declare reduction(^ : double, char, :                          // expected-error {{expected a type}} expected-error {{expected expression}}
+#pragma omp declare reduction(&& : int, S:                                 // expected-error {{unknown type name 'S'}} expected-error {{expected expression}}
+#pragma omp declare reduction(|| : int, double : temp += omp_in)           // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(| : char, float : omp_out += ::temp)         // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long : omp_out += omp_in) {            // expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{expected 'initializer'}}
+#pragma omp declare reduction(fun : unsigned : omp_out += ::temp))         // expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{expected 'initializer'}} expected-error {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long & : omp_out += omp_in)            // expected-error {{reduction type cannot be a reference type}}
+#pragma omp declare reduction(fun : long(void) : omp_out += omp_in)        // expected-error {{reduction type cannot be a function type}}
+#pragma omp declare reduction(fun : long[3] : omp_out += omp_in)           // expected-error {{reduction type cannot be an array type}}
+#pragma omp declare reduction(fun23 : long, int, long : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'long'}} expected-note {{previous definition is here}}
+
+template <class T>
+class Class1 {
+ T a;
+public:
+  Class1() : a() {}
+#pragma omp declare reduction(fun : T : temp)               // expected-error {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun1 : T : omp_out++)         // expected-note {{previous definition is here}} expected-error {{reduction type cannot be a reference type}}
+#pragma omp declare reduction(fun1 : T : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'T'}}
+#pragma omp declare reduction(fun2 : T, T : omp_out++)      // expected-error {{reduction type cannot be a reference type}} expected-error {{redefinition of user-defined reduction for type 'T'}} expected-note {{previous definition is here}}
+#pragma omp declare reduction(foo : T : omp_out += this->a) // expected-error {{invalid use of 'this' outside of a non-static member function}}
+};
+
+Class1<char &> e; // expected-note {{in instantiation of template class 'Class1<char &>' requested here}}
+
+template <class T>
+class Class2 : public Class1<T> {
+#pragma omp declare reduction(fun : T : omp_out += omp_in)
+};
+
+#pragma omp declare reduction(fun222 : long : omp_out += omp_in)                                        // expected-note {{previous definition is here}}
+#pragma omp declare reduction(fun222 : long : omp_out += omp_in)                                        // expected-error {{redefinition of user-defined reduction for type 'long'}}
+#pragma omp declare reduction(fun1 : long : omp_out += omp_in) initializer                              // expected-error {{expected '(' after 'initializer'}}
+#pragma omp declare reduction(fun2 : long : omp_out += omp_in) initializer {                            // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[                             // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun4 : long : omp_out += omp_in) initializer()                            // expected-error {{expected expression}}
+#pragma omp declare reduction(fun5 : long : omp_out += omp_in) initializer(temp)                        // expected-error {{only 'omp_priv' or 'omp_orig' variables are allowed in initializer expression}}
+#pragma omp declare reduction(fun6 : long : omp_out += omp_in) initializer(omp_orig                     // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun7 : long : omp_out += omp_in) initializer(omp_priv Class1 < int > ())  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun77 : long : omp_out += omp_in) initializer(omp_priv Class2 < int > ()) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv 23)                 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun88 : long : omp_out += omp_in) initializer(omp_priv 23))               // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun9 : long : omp_out += omp_priv) initializer(omp_in = 23)               // expected-error {{use of undeclared identifier 'omp_priv'; did you mean 'omp_in'?}} expected-note {{'omp_in' declared here}}
+#pragma omp declare reduction(fun10 : long : omp_out += omp_in) initializer(omp_priv = 23)
+
+template <typename T>
+T fun(T arg) {
+#pragma omp declare reduction(red : T : omp_out++)
+  {
+#pragma omp declare reduction(red : T : omp_out++) // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red : T : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'T'}}
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = 23)
+  }
+  return arg;
+}
+
+template <typename T>
+T foo(T arg) {
+  T i;
+  {
+#pragma omp declare reduction(red : T : omp_out++)
+#pragma omp declare reduction(red1 : T : omp_out++)   // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red1 : int : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'int'}}
+  #pragma omp parallel reduction (red : i)
+  {
+  }
+  #pragma omp parallel reduction (red1 : i)
+  {
+  }
+  #pragma omp parallel reduction (red2 : i) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  {
+  }
+  }
+  {
+#pragma omp declare reduction(red1 : int : omp_out++) // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red : T : omp_out++)
+#pragma omp declare reduction(red1 : T : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'int'}}
+  #pragma omp parallel reduction (red : i)
+  {
+  }
+  #pragma omp parallel reduction (red1 : i)
+  {
+  }
+  #pragma omp parallel reduction (red2 : i) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  {
+  }
+  }
+  return arg;
+}
+
+#pragma omp declare reduction(foo : int : ({int a = omp_in; a = a * 2; omp_out += a; }))
+int main() {
+  Class1<int> c1;
+  int i;
+  #pragma omp parallel reduction (::fun : c1)
+  {
+  }
+  #pragma omp parallel reduction (::Class1<int>::fun : c1)
+  {
+  }
+  #pragma omp parallel reduction (::Class2<int>::fun : i) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  {
+  }
+  return fun(15) + foo(15); // expected-note {{in instantiation of function template specialization 'foo<int>' requested here}}
+}
diff --git a/test/OpenMP/declare_simd_ast_print.c b/test/OpenMP/declare_simd_ast_print.c
new file mode 100644
index 0000000..04fd73f
--- /dev/null
+++ b/test/OpenMP/declare_simd_ast_print.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare simd aligned(b : 64)
+#pragma omp declare simd simdlen(32) aligned(d, b)
+#pragma omp declare simd inbranch, uniform(d) linear(val(s1, s2) : 32)
+#pragma omp declare simd notinbranch simdlen(2), uniform(s1, s2) linear(d: s1)
+void add_1(float *d, int s1, float *s2, double b[]) __attribute__((cold));
+
+// CHECK: #pragma omp declare simd notinbranch simdlen(2) uniform(s1, s2) linear(val(d): s1)
+// CHECK-NEXT: #pragma omp declare simd inbranch uniform(d) linear(val(s1): 32) linear(val(s2): 32)
+// CHECK-NEXT: #pragma omp declare simd simdlen(32) aligned(d) aligned(b)
+// CHECK-NEXT: #pragma omp declare simd aligned(b: 64)
+// CHECK-NEXT: void add_1(float *d, int s1, float *s2, double b[]) __attribute__((cold))
+
+#endif
diff --git a/test/OpenMP/declare_simd_ast_print.cpp b/test/OpenMP/declare_simd_ast_print.cpp
new file mode 100644
index 0000000..5a32e61
--- /dev/null
+++ b/test/OpenMP/declare_simd_ast_print.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare simd linear(d: 8)
+#pragma omp declare simd inbranch simdlen(32)
+#pragma omp declare simd notinbranch
+void add_1(float *d) __attribute__((cold));
+
+// CHECK: #pragma omp declare simd notinbranch
+// CHECK-NEXT: #pragma omp declare simd inbranch simdlen(32)
+// CHECK-NEXT: #pragma omp declare simd linear(val(d): 8)
+// CHECK-NEXT: void add_1(float *d) __attribute__((cold));
+//
+
+#pragma omp declare simd aligned(hp, hp2)
+template <class C> void h(C *hp, C *hp2, C *hq, C *lin) {
+}
+
+// CHECK: #pragma omp declare simd aligned(hp) aligned(hp2)
+// CHECK-NEXT: template <class C = int> void h(int *hp, int *hp2, int *hq, int *lin) {
+// CHECK-NEXT: h((float *)hp, (float *)hp2, (float *)hq, (float *)lin);
+// CHECK-NEXT: }
+
+// CHECK: #pragma omp declare simd  aligned(hp) aligned(hp2)
+// CHECK-NEXT: template <class C = float> void h(float *hp, float *hp2, float *hq, float *lin) {
+// CHECK-NEXT: }
+
+// CHECK: #pragma omp declare simd aligned(hp) aligned(hp2)
+// CHECK: template <class C> void h(C *hp, C *hp2, C *hq, C *lin) {
+// CHECK-NEXT: }
+//
+
+// Explicit specialization with <C=int>.
+// Pragmas need to be the same; otherwise the standard says the behavior is undefined.
+#pragma omp declare simd aligned(hp, hp2)
+template <>
+void h(int *hp, int *hp2, int *hq, int *lin)
+{
+  // Implicit specialization with <C=float>.
+  // This is a special case where the directive is stored by Sema and is
+  // generated together with the (pending) function instantiation.
+  h((float*) hp, (float*) hp2, (float*) hq, (float*) lin);
+}
+
+class VV {
+  // CHECK: #pragma omp declare simd uniform(this, a) linear(val(b): a)
+  // CHECK-NEXT: int add(int a, int b) __attribute__((cold))    {
+  // CHECK-NEXT: return a + b;
+  // CHECK-NEXT: }
+  #pragma omp declare simd uniform(this, a) linear(val(b): a)
+  int add(int a, int b) __attribute__((cold)) { return a + b; }
+
+  // CHECK: #pragma omp declare simd aligned(b: 4) aligned(a) linear(ref(b): 4) linear(val(this)) linear(val(a))
+  // CHECK-NEXT: float taddpf(float *a, float *&b)     {
+  // CHECK-NEXT: return *a + *b;
+  // CHECK-NEXT: }
+  #pragma omp declare simd aligned (b: 4) aligned(a) linear(ref(b): 4) linear(this, a)
+  float taddpf(float *a, float *&b) { return *a + *b; }
+
+// CHECK: #pragma omp declare simd aligned(b: 8)
+// CHECK-NEXT: #pragma omp declare simd linear(uval(c): 8)
+// CHECK-NEXT: int tadd(int (&b)[], int &c) {
+// CHECK-NEXT: return this->x[b[0]] + b[0];
+// CHECK-NEXT: }
+  #pragma omp declare simd linear(uval(c): 8)
+  #pragma omp declare simd aligned(b : 8)
+  int tadd(int (&b)[], int &c) { return x[b[0]] + b[0]; }
+
+private:
+  int x[10];
+};
+
+// CHECK: template <int X = 16, typename T = float> class TVV {
+// CHECK: #pragma omp declare simd
+// CHECK-NEXT: int tadd(int a, int b);
+// CHECK: #pragma omp declare simd aligned(a: 16 * 2) aligned(b) linear(ref(b): 16)
+// CHECK-NEXT: float taddpf(float *a, float *&b) {
+// CHECK-NEXT: return *a + *b;
+// CHECK-NEXT: }
+// CHECK: #pragma omp declare simd
+// CHECK-NEXT: #pragma omp declare simd
+// CHECK-NEXT: int tadd(int b) {
+// CHECK-NEXT: return this->x[b] + b;
+// CHECK-NEXT: }
+// CHECK: }
+template <int X, typename T>
+class TVV {
+public:
+// CHECK: template <int X, typename T> class TVV {
+  #pragma omp declare simd simdlen(X)
+  int tadd(int a, int b) { return a + b; }
+
+// CHECK: #pragma omp declare simd simdlen(X)
+// CHECK-NEXT: int tadd(int a, int b) {
+// CHECK-NEXT: return a + b;
+// CHECK-NEXT: }
+
+  #pragma omp declare simd aligned(a : X * 2) aligned(b) linear(ref(b): X)
+  float taddpf(float *a, T *&b) { return *a + *b; }
+
+// CHECK: #pragma omp declare simd aligned(a: X * 2) aligned(b)
+// CHECK-NEXT: float taddpf(float *a, T *&b) {
+// CHECK-NEXT: return *a + *b;
+// CHECK-NEXT: }
+
+  #pragma omp declare simd
+  #pragma omp declare simd uniform(this, b)
+  int tadd(int b) { return x[b] + b; }
+
+// CHECK: #pragma omp declare simd uniform(this, b)
+// CHECK-NEXT: #pragma omp declare simd
+// CHECK-NEXT: int tadd(int b) {
+// CHECK-NEXT: return this->x[b] + b;
+// CHECK-NEXT: }
+
+private:
+  int x[X];
+};
+// CHECK: };
+
+// CHECK: #pragma omp declare simd simdlen(64) aligned(b: 64 * 2) linear(uval(c): 64)
+// CHECK: template <int N = 64> void foo(int (&b)[64], float *&c)
+// CHECK: #pragma omp declare simd simdlen(N) aligned(b: N * 2) linear(uval(c): N)
+// CHECK: template <int N> void foo(int (&b)[N], float *&c)
+#pragma omp declare simd simdlen(N) aligned(b : N * 2) linear(uval(c): N)
+template <int N>
+void foo(int (&b)[N], float *&c);
+
+// CHECK: TVV<16, float> t16;
+TVV<16, float> t16;
+
+void f() {
+  float a = 1.0f, b = 2.0f;
+  float *p = &b;
+  float r = t16.taddpf(&a, p);
+  int res = t16.tadd(b);
+  int c[64];
+  foo(c, p);
+}
+
+#endif
diff --git a/test/OpenMP/declare_simd_codegen.cpp b/test/OpenMP/declare_simd_codegen.cpp
new file mode 100644
index 0000000..4ed7fb2
--- /dev/null
+++ b/test/OpenMP/declare_simd_codegen.cpp
@@ -0,0 +1,288 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare simd linear(d : 8)
+#pragma omp declare simd inbranch simdlen(32)
+#pragma omp declare simd notinbranch
+void add_1(float *d) {}
+
+#pragma omp declare simd aligned(hp, hp2)
+template <class C>
+void h(C *hp, C *hp2, C *hq, C *lin) {
+}
+
+// Explicit specialization with <C=int>.
+// Pragmas need to be the same, otherwise the standard says that's undefined behavior.
+#pragma omp declare simd aligned(hp, hp2)
+template <>
+void h(int *hp, int *hp2, int *hq, int *lin) {
+  // Implicit specialization with <C=float>.
+  // This is a special case where the directive is stored by Sema and is
+  // generated together with the (pending) function instantiation.
+  h((float *)hp, (float *)hp2, (float *)hq, (float *)lin);
+}
+
+class VV {
+public:
+#pragma omp declare simd uniform(this, a) linear(val(b) : a)
+  int add(int a, int b) __attribute__((cold)) { return a + b; }
+
+#pragma omp declare simd aligned(b : 4) aligned(a) linear(ref(b) : 4) linear(this, a)
+  float taddpf(float *a, float *&b) { return *a + *b; }
+
+#pragma omp declare simd linear(uval(c) : 8)
+#pragma omp declare simd aligned(b : 8)
+  int tadd(int (&b)[], int &c) { return x[b[0]] + b[0]; }
+
+private:
+  int x[10];
+} vv;
+
+template <int X, typename T>
+class TVV {
+public:
+#pragma omp declare simd simdlen(X)
+  int tadd(int a, int b) { return a + b; }
+
+#pragma omp declare simd aligned(a : X * 2) aligned(b) linear(ref(b) : X)
+  float taddpf(float *a, T *&b) { return *a + *b; }
+
+#pragma omp declare simd
+#pragma omp declare simd uniform(this, b)
+  int tadd(int b) { return x[b] + b; }
+
+private:
+  int x[X];
+};
+
+#pragma omp declare simd simdlen(N) aligned(b : N * 2) linear(uval(c) : N)
+template <int N>
+void foo(int (&b)[N], float *&c) {}
+
+TVV<16, float> t16;
+
+void f(int (&g)[]) {
+  float a = 1.0f, b = 2.0f;
+  float *p = &b;
+  float r = t16.taddpf(&a, p);
+  int res = t16.tadd(b);
+  int c[64];
+  vv.add(res, res);
+  vv.taddpf(p, p);
+  vv.tadd(g, res);
+  foo(c, p);
+}
+
+#pragma omp declare simd
+#pragma omp declare simd notinbranch aligned(a : 32)
+int bar(VV v, float *a) { return 0; }
+#pragma omp declare simd
+#pragma omp declare simd notinbranch aligned(a)
+float baz(VV v, int a[]) { return 0; }
+#pragma omp declare simd
+#pragma omp declare simd notinbranch aligned(a)
+double bay(VV v, double *&a) { return 0; }
+#pragma omp declare simd
+#pragma omp declare simd inbranch linear(a : b) uniform(v, b)
+void bax(VV v, double *a, int b) {}
+#pragma omp declare simd uniform(q) aligned(q : 16) linear(k : 1)
+float foo(float *q, float x, int k) { return 0; }
+#pragma omp declare simd notinbranch
+double foo(double x) { return 0; }
+
+// CHECK-DAG: define {{.+}}@_Z5add_1Pf(
+// CHECK-DAG: define {{.+}}@_Z1hIiEvPT_S1_S1_S1_(
+// CHECK-DAG: define {{.+}}@_Z1hIfEvPT_S1_S1_S1_(
+// CHECK-DAG: define {{.+}}@_ZN2VV3addEii(
+// CHECK-DAG: define {{.+}}@_ZN2VV6taddpfEPfRS0_(
+// CHECK-DAG: define {{.+}}@_ZN2VV4taddERA_iRi(
+// CHECK-DAG: define {{.+}}@_Z1fRA_i(
+// CHECK-DAG: define {{.+}}@_ZN3TVVILi16EfE6taddpfEPfRS1_(
+// CHECK-DAG: define {{.+}}@_ZN3TVVILi16EfE4taddEi(
+// CHECK-DAG: define {{.+}}@_Z3fooILi64EEvRAT__iRPf(
+// CHECK-DAG: define {{.+}}@_Z3bar2VVPf(
+// CHECK-DAG: define {{.+}}@_Z3baz2VVPi(
+// CHECK-DAG: define {{.+}}@_Z3bay2VVRPd(
+// CHECK-DAG: define {{.+}}@_Z3bax2VVPdi(
+// CHECK-DAG: define {{.+}}@_Z3fooPffi(
+// CHECK-DAG: define {{.+}}@_Z3food(
+
+// CHECK-DAG: "_ZGVbM4l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVbN4l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcM8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcN8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdM8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdN8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeM16l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeN16l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVbM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVbN2v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcN4v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdN4v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeN8v__Z5add_1Pf"
+
+// CHECK-DAG: "_ZGVbM2va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVbN2va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcM4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcN4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdM4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdN4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeM8va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeN8va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+
+// CHECK-DAG: "_ZGVbM2va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVbN2va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcM4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcN4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdM4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdN4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeM8va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeN8va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+
+// CHECK-DAG: "_ZGVbM4uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVbN4uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVcM8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVcN8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVdM8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVdN8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVeM16uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVeN16uus1__ZN2VV3addEii"
+
+// CHECK-DAG: "_ZGVbM4lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVbN4lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVcM8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVcN8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVdM8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVdN8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVeM16lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVeN16lla16l4a4__ZN2VV6taddpfEPfRS0_"
+
+// CHECK-DAG: "_ZGVbM4vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVbN4vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcM8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcN8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdM8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdN8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeM16vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeN16vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVbM4vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVbN4vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcM8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcN8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdM8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdN8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeM16vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeN16vva8v__ZN2VV4taddERA_iRi"
+
+// CHECK-DAG: "_ZGVbM4vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVbN4vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVcM8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVcN8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVdM8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVdN8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVeM16vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVeN16vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+
+// CHECK-DAG: "_ZGVbM4uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVbN4uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcM8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcN8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdM8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdN8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeM16uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeN16uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVbM4vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVbN4vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcM8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcN8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdM8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdN8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeM16vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeN16vv__ZN3TVVILi16EfE4taddEi"
+
+// CHECK-DAG: "_ZGVbM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVbN64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVcM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVcN64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVdM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVdN64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVeM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVeN64va128l64__Z3fooILi64EEvRAT__iRPf"
+
+// CHECK-DAG: "_ZGVbM4vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVbN4vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVcM8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVcN8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVdM8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVdN8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVeM16vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVeN16vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVbN4vva32__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVcN8vva32__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVdN8vva32__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVeN16vva32__Z3bar2VVPf"
+
+// CHECK-DAG: "_ZGVbM4vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVbN4vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVcM8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVcN8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVdM8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVdN8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVeM16vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVeN16vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVbN4vva16__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVcN8vva16__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVdN8vva16__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVeN16vva16__Z3baz2VVPi"
+
+// CHECK-DAG: "_ZGVbM2vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVbN2vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVcM4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVcN4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVdM4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVdN4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVeM8vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVeN8vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVbN2vva16__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVcN4vva16__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVdN4vva16__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVeN8vva16__Z3bay2VVRPd"
+
+// CHECK-DAG: "_ZGVbM4us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVcM8us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVdM8us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVeM16us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVbM4vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVbN4vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVcM8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVcN8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVdM8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVdN8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVeM16vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVeN16vvv__Z3bax2VVPdi"
+
+// CHECK-DAG: "_ZGVbM4ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVbN4ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVcM8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVcN8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVdM8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVdN8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVeM16ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVeN16ua16vl1__Z3fooPffi"
+
+// CHECK-DAG: "_ZGVbN2v__Z3food"
+// CHECK-DAG: "_ZGVcN4v__Z3food"
+// CHECK-DAG: "_ZGVdN4v__Z3food"
+// CHECK-DAG: "_ZGVeN8v__Z3food"
+
+// CHECK-NOT: "_ZGV{{.+}}__Z1fRA_i
+
+#endif
diff --git a/test/OpenMP/declare_simd_messages.cpp b/test/OpenMP/declare_simd_messages.cpp
new file mode 100644
index 0000000..15971eb
--- /dev/null
+++ b/test/OpenMP/declare_simd_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -triple=x86_64-pc-win32 -verify -fopenmp -x c++ -std=c++11 -fms-extensions %s
+
+// expected-error@+1 {{expected an OpenMP directive}}
+#pragma omp declare
+
+// expected-error@+2 {{'#pragma omp declare simd' can only be applied to functions}}
+#pragma omp declare simd
+int a;
+// expected-error@+2 {{'#pragma omp declare simd' can only be applied to functions}}
+#pragma omp declare simd
+#pragma omp threadprivate(a)
+int var;
+#pragma omp threadprivate(var)
+
+// expected-error@+2 {{expected an OpenMP directive}} expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+#pragma omp declare
+
+// expected-error@+3 {{function declaration is expected after 'declare simd' directive}}
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+#pragma omp declare simd
+#pragma options align=packed
+int main();
+
+// expected-error@+3 {{function declaration is expected after 'declare simd' directive}}
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+#pragma omp declare simd
+#pragma init_seg(compiler)
+int main();
+
+// expected-error@+1 {{single declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+// expected-note@+1 {{declared here}}
+int b, c;
+
+// expected-error@+1 {{'C' does not refer to a value}}
+#pragma omp declare simd simdlen(C)
+// expected-note@+1 {{declared here}}
+template <class C>
+void h(C *hp, C *hp2, C *hq, C *lin) {
+  b = 0;
+}
+
+#pragma omp declare simd
+template <>
+void h(int *hp, int *hp2, int *hq, int *lin) {
+  h((float *)hp, (float *)hp2, (float *)hq, (float *)lin);
+}
+
+#pragma omp declare simd inbranch inbranch
+#pragma omp declare simd notinbranch notinbranch
+#pragma omp declare simd inbranch inbranch notinbranch // expected-error {{unexpected 'notinbranch' clause, 'inbranch' is specified already}}
+#pragma omp declare simd notinbranch notinbranch inbranch // expected-error {{unexpected 'inbranch' clause, 'notinbranch' is specified already}}
+// expected-note@+2 {{read of non-const variable 'b' is not allowed in a constant expression}}
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp declare simd simdlen(b)
+// expected-error@+1 {{directive '#pragma omp declare simd' cannot contain more than one 'simdlen' clause}}
+#pragma omp declare simd simdlen(32) simdlen(c)
+// expected-error@+1 {{expected '(' after 'simdlen'}}
+#pragma omp declare simd simdlen
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd simdlen(
+// expected-error@+2 {{expected '(' after 'simdlen'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd simdlen(), simdlen
+// expected-error@+1 2 {{expected expression}}
+#pragma omp declare simd simdlen(), simdlen()
+// expected-warning@+3 {{extra tokens at the end of '#pragma omp declare simd' are ignored}}
+// expected-error@+2 {{expected '(' after 'simdlen'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd simdlen() simdlen)
+void foo();
+
+// expected-error@+3 4 {{expected reference to one of the parameters of function 'foo'}}
+// expected-error@+2 {{invalid use of 'this' outside of a non-static member function}}
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp declare simd simdlen(N) uniform(this, var) aligned(var)
+template<int N>
+void foo() {}
+
+void test() {
+  // expected-note@+1 {{in instantiation of function template specialization 'foo<-3>' requested here}}
+  foo<-3>();
+}
+
+// expected-error@+1 {{expected '(' after 'uniform'}}
+#pragma omp declare simd uniform
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd uniform(
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd uniform()
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd uniform(this
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd uniform(this,a
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd uniform(,a)
+// expected-error@+1 {{expected '(' after 'aligned'}}
+#pragma omp declare simd aligned
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned()
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(a:
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(a:)
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp declare simd' are ignored}}
+// expected-error@+1 {{expected '(' after 'aligned'}}
+#pragma omp declare simd aligned :)
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd aligned(this
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd aligned(this,b
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(, b)
+// expected-note@+4 {{defined as aligned}}
+// expected-error@+3 {{a parameter cannot appear in more than one aligned clause}}
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ',' or ')' in 'aligned' clause}}
+#pragma omp declare simd aligned(b) aligned(b ; 64)
+// expected-note@+2 {{defined as aligned}}
+// expected-error@+1 {{a parameter cannot appear in more than one aligned clause}}
+#pragma omp declare simd aligned(b) aligned(b: 64)
+// expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+#pragma omp declare simd aligned(b: -1)
+// expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+#pragma omp declare simd aligned(b: 3)
+// expected-error@+1 {{expected '(' after 'linear'}}
+#pragma omp declare simd linear
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear()
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(a:
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(a:)
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp declare simd' are ignored}}
+// expected-error@+1 {{expected '(' after 'linear'}}
+#pragma omp declare simd linear :)
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd linear(this
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd linear(this,b
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(, b)
+// expected-note@+4 {{defined as linear}}
+// expected-error@+3 {{linear variable cannot be linear}}
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ',' or ')' in 'linear' clause}}
+#pragma omp declare simd linear(b) linear(b ; 64)
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be linear}}
+#pragma omp declare simd linear(b) linear(b: 64)
+#pragma omp declare simd linear(b: -1)
+#pragma omp declare simd linear(b: 3)
+// expected-error@+1 {{expected a reference to a parameter specified in a 'uniform' clause}}
+#pragma omp declare simd linear(b: a)
+// expected-note@+2 {{defined as uniform}}
+// expected-error@+1 {{linear variable cannot be uniform}}
+#pragma omp declare simd uniform(a), linear(a: 4)
+// expected-note@+2 {{defined as uniform}}
+// expected-error@+1 {{linear variable cannot be uniform}}
+#pragma omp declare simd linear(a: 4) uniform(a)
+// expected-error@+1 {{variable of non-reference type 'int *' can be used only with 'val' modifier, but used with 'uval'}}
+#pragma omp declare simd linear(uval(b))
+// expected-error@+1 {{variable of non-reference type 'int *' can be used only with 'val' modifier, but used with 'ref'}}
+#pragma omp declare simd linear(ref(b))
+// expected-error@+1 {{expected one of 'ref', val' or 'uval' modifiers}}
+#pragma omp declare simd linear(uref(b))
+void bar(int a, int *b);
+
+template <class T>
+struct St {
+// expected-error@+2 {{function declaration is expected after 'declare simd' directive}}
+#pragma init_seg(compiler)
+#pragma omp declare simd
+#pragma init_seg(compiler)
+// expected-note@+7 {{defined as uniform}}
+// expected-error@+6 {{expected a reference to a parameter specified in a 'uniform' clause}}
+// expected-error@+5 {{linear variable cannot be uniform}}
+// expected-note@+4 {{defined as aligned}}
+// expected-error@+3 {{argument to 'aligned' clause must be a strictly positive integer value}}
+// expected-error@+2 {{'this' cannot appear in more than one aligned clause}}
+// expected-error@+1 {{use of undeclared identifier 't'}}
+#pragma omp declare simd uniform(this, t) aligned(this: 4) aligned(this: -4) linear(this: hp)
+  void h(T *hp) {
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp declare simd'}}
+#pragma omp declare simd
+    *hp = *t;
+  }
+
+private:
+  T t;
+};
+
+namespace N {
+  // expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+  #pragma omp declare simd
+}
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
diff --git a/test/OpenMP/declare_target_ast_print.cpp b/test/OpenMP/declare_target_ast_print.cpp
new file mode 100644
index 0000000..78a9cf6
--- /dev/null
+++ b/test/OpenMP/declare_target_ast_print.cpp
@@ -0,0 +1,138 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare target
+// CHECK: #pragma omp declare target
+void foo() {}
+// CHECK-NEXT: void foo()
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+
+extern "C" {
+#pragma omp declare target
+// CHECK: #pragma omp declare target
+void foo_c() {}
+// CHECK-NEXT: void foo_c()
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+}
+
+extern "C++" {
+#pragma omp declare target
+// CHECK: #pragma omp declare target
+void foo_cpp() {}
+// CHECK-NEXT: void foo_cpp()
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+}
+
+#pragma omp declare target
+template <class T>
+struct C {
+// CHECK: template <class T = int> struct C
+  T t;
+// CHECK-NEXT: int t;
+  static T ts;
+// CHECK-NEXT: #pragma omp declare target
+// CHECK-NEXT: static int ts;
+// CHECK: #pragma omp end declare target
+
+  C(T t) : t(t) {
+  }
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: C(int t) : t(t) {
+// CHECK-NEXT: }
+// CHECK: #pragma omp end declare target
+
+  T foo() {
+    return t;
+  }
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: int foo() {
+// CHECK-NEXT: return this->t;
+// CHECK-NEXT: }
+// CHECK: #pragma omp end declare target
+};
+
+// CHECK: template <class T> struct C {
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: static T ts;
+// CHECK-NEXT: #pragma omp end declare target
+
+template<class T>
+T C<T>::ts = 1;
+// CHECK: #pragma omp declare target
+// CHECK: T ts = 1;
+// CHECK: #pragma omp end declare target
+
+// CHECK: #pragma omp declare target
+// CHECK: int test1()
+int test1() {
+  C<int> c(1);
+  return c.foo() + c.ts;
+}
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+
+int a1;
+void f1() {
+}
+#pragma omp declare target (a1, f1)
+// CHECK: #pragma omp declare target
+// CHECK: int a1;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: void f1()
+// CHECK: #pragma omp end declare target
+
+int b1, b2, b3;
+void f2() {
+}
+#pragma omp declare target to(b1) to(b2), to(b3, f2)
+// CHECK: #pragma omp declare target
+// CHECK: int b1;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: int b2;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: int b3;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: void f2()
+// CHECK: #pragma omp end declare target
+
+int c1, c2, c3;
+void f3() {
+}
+#pragma omp declare target link(c1) link(c2), link(c3, f3)
+// CHECK: #pragma omp declare target link
+// CHECK: int c1;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target link
+// CHECK: int c2;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target link
+// CHECK: int c3;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target link
+// CHECK: void f3()
+// CHECK: #pragma omp end declare target
+
+int main (int argc, char **argv) {
+  foo();
+  foo_c();
+  foo_cpp();
+  test1();
+  return (0);
+}
+
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: int ts = 1;
+// CHECK-NEXT: #pragma omp end declare target
+#endif
diff --git a/test/OpenMP/declare_target_messages.cpp b/test/OpenMP/declare_target_messages.cpp
new file mode 100644
index 0000000..b858d53
--- /dev/null
+++ b/test/OpenMP/declare_target_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -fnoopenmp-use-tls -ferror-limit 100 -o - %s
+
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+
+int a, b; // expected-warning {{declaration is not declared in any declare target region}}
+__thread int t; // expected-note {{defined as threadprivate or thread local}}
+
+#pragma omp declare target . // expected-error {{expected '(' after 'declare target'}}
+
+#pragma omp declare target
+void f();
+#pragma omp end declare target shared(a) // expected-warning {{extra tokens at the end of '#pragma omp end declare target' are ignored}}
+
+#pragma omp declare target map(a) // expected-error {{unexpected 'map' clause, only 'to' or 'link' clauses expected}}
+
+void c(); // expected-warning {{declaration is not declared in any declare target region}}
+
+extern int b;
+
+struct NonT {
+  int a;
+};
+
+typedef int sint;
+
+#pragma omp declare target // expected-note {{to match this '#pragma omp declare target'}}
+#pragma omp threadprivate(a) // expected-note {{defined as threadprivate or thread local}}
+extern int b;
+int g;
+
+struct T { // expected-note {{mappable type cannot be polymorphic}}
+  int a;
+  virtual int method();
+};
+
+class VC { // expected-note {{mappable type cannot be polymorphic}}
+  T member;
+  NonT member1;
+  public:
+    virtual int method() { T a; return 0; } // expected-error {{type 'T' is not mappable to target}}
+};
+
+struct C {
+  NonT a;
+  sint b;
+  int method();
+  int method1();
+};
+
+int C::method1() {
+  return 0;
+}
+
+void foo() {
+  a = 0; // expected-error {{threadprivate variables cannot be used in target constructs}}
+  b = 0; // expected-note {{used here}}
+  t = 1; // expected-error {{threadprivate variables cannot be used in target constructs}}
+  C object;
+  VC object1; // expected-error {{type 'VC' is not mappable to target}}
+  g = object.method();
+  g += object.method1();
+  g += object1.method();
+  f();
+  c(); // expected-note {{used here}}
+}
+#pragma omp declare target // expected-error {{expected '#pragma omp end declare target'}}
+void foo1() {}
+#pragma omp end declare target
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+
+int C::method() {
+  return 0;
+}
+
+struct S {
+#pragma omp declare target // expected-error {{directive must be at file or namespace scope}}
+  int v;
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+};
+
+int main (int argc, char **argv) {
+#pragma omp declare target // expected-error {{unexpected OpenMP directive '#pragma omp declare target'}}
+  int v;
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+  foo();
+  return (0);
+}
+
+namespace {
+#pragma omp declare target // expected-note {{to match this '#pragma omp declare target'}}
+  int x;
+} //  expected-error {{expected '#pragma omp end declare target'}}
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+
+#pragma omp declare target link(S) // expected-error {{'S' used in declare target directive is not a variable or a function name}}
+
+#pragma omp declare target (x, x) // expected-error {{'x' appears multiple times in clauses on the same declare target directive}}
+#pragma omp declare target to(x) to(x) // expected-error {{'x' appears multiple times in clauses on the same declare target directive}}
+#pragma omp declare target link(x) // expected-error {{'x' must not appear in both clauses 'to' and 'link'}}
+
+#pragma omp declare target // expected-error {{expected '#pragma omp end declare target'}} expected-note {{to match this '#pragma omp declare target'}}
diff --git a/test/OpenMP/distribute_ast_print.cpp b/test/OpenMP/distribute_ast_print.cpp
index c3a175a..5748fc7 100644
--- a/test/OpenMP/distribute_ast_print.cpp
+++ b/test/OpenMP/distribute_ast_print.cpp
@@ -8,6 +8,75 @@
 
 void foo() {}
 
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a) private(S7<S>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a)
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, g;
diff --git a/test/OpenMP/distribute_codegen.cpp b/test/OpenMP/distribute_codegen.cpp
new file mode 100644
index 0000000..37f00f0
--- /dev/null
+++ b/test/OpenMP/distribute_codegen.cpp
@@ -0,0 +1,263 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64  --check-prefix HCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64  --check-prefix HCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32  --check-prefix HCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void without_schedule_clause(float *a, float *b, float *c, float *d) { // 'distribute' with no dist_schedule clause.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute
+  for (int i = 33; i < 32000000; i += 7) { // Signed counter, counts up from 33 in steps of 7.
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 4571423, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 92, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp sgt i32 [[UBV0]], 4571423
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 4571423, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp sle i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add nsw i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_not_chunked(float *a, float *b, float *c, float *d) { // dist_schedule(static) with no chunk size.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute dist_schedule(static)
+  for (int i = 32000000; i > 33; i += -7) { // Signed counter, counts down from 32000000 in steps of 7.
+        a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 4571423, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 92, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp sgt i32 [[UBV0]], 4571423
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 4571423, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp sle i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add nsw i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_chunked(float *a, float *b, float *c, float *d) { // dist_schedule(static, 5): constant chunk size.
+  #pragma omp target
+  #pragma omp teams
+#pragma omp distribute dist_schedule(static, 5)
+  for (unsigned i = 131071; i <= 2147483647; i += 127) { // Unsigned counter with an inclusive upper bound.
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 16908288, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 91, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 5)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp ugt i32 [[UBV0]], 16908288
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 16908288, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp ule i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+// CHECK-LABEL: test_precond
+void test_precond() { // Loop whose start value comes from a local variable instead of a constant.
+  char a = 0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute
+  for(char i = a; i < 10; ++i); // Empty body; only the loop bounds matter here.
+}
+
+// a is passed as a parameter to the outlined functions
+// CHECK:  define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], i8* dereferenceable({{[0-9]+}}) [[APARM:%.+]])
+// CHECK:  store i8* [[APARM]], i8** [[APTRADDR:%.+]]
+// ..many loads of %0..
+// CHECK:  [[A2:%.+]] = load i8*, i8** [[APTRADDR]]
+// CHECK:  [[AVAL0:%.+]] = load i8, i8* [[A2]]
+// CHECK:  store i8 [[AVAL0]], i8* [[CAP_EXPR:%.+]],
+// CHECK:  [[AVAL1:%.+]] = load i8, i8* [[CAP_EXPR]]
+// CHECK:  load i8, i8* [[CAP_EXPR]]
+// CHECK:  [[AVAL2:%.+]] = load i8, i8* [[CAP_EXPR]]
+// CHECK:  [[ACONV:%.+]] = sext i8 [[AVAL2]] to i32
+// CHECK:  [[ACMP:%.+]] = icmp slt i32 [[ACONV]], 10
+// CHECK:  br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]]
+// CHECK:  [[PRECOND_THEN]]
+// CHECK:  call void @__kmpc_for_static_init_4
+// CHECK:  call void @__kmpc_for_static_fini
+// CHECK:  [[PRECOND_END]]
+
+// no templates for now, as these require special handling in target regions and/or declare target
+
+// HCHECK-LABEL: fint
+// HCHECK: call {{.*}}i32 {{.+}}ftemplate
+// HCHECK: ret i32
+
+// HCHECK: load i16, i16*
+// HCHECK: store i16 %
+// HCHECK: call i32 @__tgt_target_teams(
+// HCHECK: call void @__kmpc_for_static_init_4(
+template <typename T>
+T ftemplate() { // Template case: the chunk size is taken from a local 'short' variable.
+  short aa = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute dist_schedule(static, aa)
+  for (int i = 0; i < 100; i++) { // Empty body; only the directive and its clause are exercised.
+  }
+  return T();
+}
+
+int fint(void) { return ftemplate<int>(); } // Instantiates ftemplate<int> so code is emitted for it.
+
+#endif
diff --git a/test/OpenMP/distribute_parallel_for_ast_print.cpp b/test/OpenMP/distribute_parallel_for_ast_print.cpp
new file mode 100644
index 0000000..993cc2a
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_ast_print.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S { // Base fixture: int member 'a' plus a nested typedef 'type' for int.
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T { // Fixture: inherits from T and also owns a T member named 'a'.
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) { // Clauses name 'a' unqualified, via this->, and as base-qualified T::a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // Loop reads and writes s.a.a only; the clauses still list this object's 'a'.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a)
+
+class S8 : public S7<S> { // Non-template fixture derived from the S7<S> instantiation.
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){ // Clauses name the inherited 'a' unqualified, via this->, and as S7<S>::a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) { // Loop reads and writes s.a.a only; the clauses still list this object's 'a'.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc) { // Template variant: clause arguments depend on T and N.
+  T b = argc, c, d, e, f, h;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule(static, a) schedule(dynamic) default(none) copyin(g) firstprivate(a)
+  // CHECK: #pragma omp distribute parallel for dist_schedule(static, a) schedule(dynamic) default(none) copyin(g) firstprivate(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h) dist_schedule(static,N)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+            a++;
+  // CHECK: #pragma omp distribute parallel for private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h) dist_schedule(static, N)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: a++;
+  return T();
+}
+
+int main(int argc, char **argv) { // Non-template variant; also instantiates tmain<int, 5> and tmain<char, 1>.
+  int b = argc, c, d, e, f, h;
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  // CHECK: #pragma omp distribute parallel for schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) if (argc) num_threads(a) default(shared) shared(e) reduction(+ : h) dist_schedule(static, b)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      a++;
+  // CHECK: #pragma omp distribute parallel for private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) if(argc) num_threads(a) default(shared) shared(e) reduction(+: h) dist_schedule(static, b)
+ // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: a++;
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
+}
+
+#endif
diff --git a/test/OpenMP/distribute_parallel_for_collapse_messages.cpp b/test/OpenMP/distribute_parallel_for_collapse_messages.cpp
new file mode 100644
index 0000000..41976a6
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_collapse_messages.cpp
@@ -0,0 +1,154 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty helper; called below as a non-loop statement following a directive.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr; used below as a non-constant collapse() argument.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target // tmain: template-context cases for malformed and invalid collapse() arguments.
+#pragma omp teams
+#pragma omp distribute parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp distribute parallel for collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute parallel for', but found only 1}}
+  // expected-error@+8 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+7 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for'}}
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template collapse() cases; also instantiates tmain twice.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for', but found only 1}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+8 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+6{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_copyin_messages.cpp b/test/OpenMP/distribute_parallel_for_copyin_messages.cpp
new file mode 100644
index 0000000..7d70341
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_copyin_messages.cpp
@@ -0,0 +1,190 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty helper; serves as the loop body in every case below.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+class S2 { // Public default constructor and public non-const operator=; type of global 'k'.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2 &operator=(S2 &s2) { return *this; }
+};
+class S3 { // Same shape as S2 but with a non-mutable member; type of global 'h'.
+  int a;
+
+public:
+  S3() : a(0) {}
+  S3 &operator=(S3 &s3) { return *this; }
+};
+class S4 { // Copy assignment is private and only declared; copyin(l) below is diagnosed for it.
+  int a;
+  S4();
+  S4 &operator=(const S4 &s4); // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Copy assignment is private but defined; copyin(m) below is diagnosed for it.
+  int a;
+  S5() : a(0) {}
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+};
+template <class T>
+class ST { // Declares static member 's'; ST<int>::s is named in a copyin clause below.
+public:
+  static T s;
+};
+
+S2 k;
+S3 h;
+S4 l(3);
+S5 m(4);
+#pragma omp threadprivate(h, k, l, m) // All four globals above are made threadprivate.
+
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x; // B::x names the same variable as A::x.
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) { // Template-context cases for malformed and invalid copyin() clauses.
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(l) // expected-error 2 {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(m) // expected-error 2 {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+}
+
+int main(int argc, char **argv) { // Non-template copyin() cases; also instantiates tmain<int, char, 3>.
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_default_messages.cpp b/test/OpenMP/distribute_parallel_for_default_messages.cpp
new file mode 100644
index 0000000..3437bd5
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_default_messages.cpp
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, int N>
+T tmain(T argc) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none)
+  for (i = 0; i < argc; ++i)  // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_dist_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_dist_schedule_messages.cpp
new file mode 100644
index 0000000..0f5820e
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_dist_schedule_messages.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
new file mode 100644
index 0000000..3e288c3
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
@@ -0,0 +1,359 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Publicly default- and copy-constructible; firstprivate(ba), firstprivate(S2::S2s) and firstprivate(S2::S2sc) are the accepted cases.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // Copy assignment is private but copy construction is public; firstprivate(ca) and firstprivate(m) are accepted.
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Default and copy constructors are private; firstprivate(e) on an S4 object is diagnosed.
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Only the copy constructor is private; firstprivate(g) on an S5 object is diagnosed.
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Only the default constructor is private; 'lastprivate(n) firstprivate(n)' on an S6 object is accepted.
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_if_messages.cpp b/test/OpenMP/distribute_parallel_for_if_messages.cpp
new file mode 100644
index 0000000..c864340
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_if_messages.cpp
@@ -0,0 +1,179 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
new file mode 100644
index 0000000..745007f
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty no-op helper; bar() and main() in this file call it from their loop bodies.
+}
+
+bool foobool(int argc) { // Helper that maps an int onto bool.
+  return argc; // Implicit int-to-bool conversion: nonzero becomes true, zero becomes false.
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Declared (not defined) with S1, which is only forward-declared above, so its type is incomplete here.
+class S2 { // Public default constructor, copy constructor from a non-const reference, and two copy-assignment overloads.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // Takes a non-const reference, so it cannot bind a const S2 source.
+  const S2 &operator =(const S2&) const; // Const-qualified overload, callable on const S2 objects; declared only.
+  S2 &operator =(const S2&); // Non-const overload; declared only.
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b; // Const-qualified global S2 object.
+const S2 ba[5]; // Const-qualified global array of five S2 objects.
+class S3 { // Class whose copy-assignment operator sits in the default-private section.
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {} // Copy constructor taking a non-const reference.
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 { // Default and copy constructors are private; only S4(int) is public.
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4); // Private copy constructor; declared only.
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Private default constructor; public copy and int constructors.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Private default constructor; public copy and int constructors. main() names an S6 object in both lastprivate and firstprivate.
+  int a;
+  S6() : a(0) {} // Private because it precedes the public: label.
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h; // Global S3 object; the pragma on the next line makes it threadprivate.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C> // main() instantiates this template as foomain<S4, S5>.
+int foomain(int argc, char **argv) { // Template counterpart of main(): applies lastprivate clause forms to locals of the dependent type I.
+  I e(4); // Local of the dependent type I, built from an int.
+  I g(5); // Second local of type I; e and g are named together in one clause below.
+  int i;
+  int &j = i; // j is a reference alias of i.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  int v = 0; // Accumulator written inside the braced teams region below.
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target
+#pragma omp teams private(i)
+#pragma omp distribute parallel for lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+void bar(S4 a[2]) { // Parameter declared as an array, which C++ adjusts to the pointer type S4 *.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo(); // No diagnostic directive is attached to this region.
+}
+
+namespace A {
+double x; // Namespace-scope variable; the pragma on the next line makes it threadprivate.
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // Makes A::x nameable as B::x, the spelling main() uses in its lastprivate clause.
+}
+
+int main(int argc, char **argv) { // Applies many lastprivate clause forms to '#pragma omp distribute parallel for' nested in target/teams; the RUN line uses -verify.
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4); // S4 has private default and copy constructors.
+  S5 g(5); // S5 has a private default constructor.
+  S3 m; // S3 has a private copy-assignment operator.
+  S6 n(2);
+  int i;
+  int &j = i; // j is a reference alias of i.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa; // Plain automatic local, used in the clause cases that follow.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(i)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si; // Local with static storage duration.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_num_threads_messages.cpp b/test/OpenMP/distribute_parallel_for_num_threads_messages.cpp
new file mode 100644
index 0000000..7939514
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_num_threads_messages.cpp
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // Empty no-op helper; tmain() and main() in this file call it from their loop bodies.
+}
+
+bool foobool(int argc) { // Helper returning bool; used as a num_threads argument in tmain() and main().
+  return argc; // Implicit int-to-bool conversion: nonzero becomes true, zero becomes false.
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // main() instantiates this as tmain<int, char, -1> and tmain<int, char, 3>.
+  T i; // Loop counter of the dependent type T.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc; // Hands the first argument back unchanged.
+}
+
+int main(int argc, char **argv) { // Non-template num_threads clause cases; also instantiates tmain twice.
+  int i; // Loop counter shared by every loop below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_private_messages.cpp b/test/OpenMP/distribute_parallel_for_private_messages.cpp
new file mode 100644
index 0000000..465357a
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_private_messages.cpp
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty no-op helper taking no arguments and returning nothing.
+}
+
+bool foobool(int argc) { // Helper that maps an int onto bool.
+  return argc; // Implicit int-to-bool conversion: nonzero becomes true, zero becomes false.
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Declared (not defined) with S1, which is only forward-declared above, so its type is incomplete here.
+class S2 { // Class with a mutable member and a public default constructor.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // Const-qualified global S2 object.
+const S2 ba[5]; // Const-qualified global array of five S2 objects.
+class S3 { // Class with a public default constructor; also the type of the threadprivate global h.
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5]; // Const-qualified global array of five S3 objects.
+class S4 { // Private default constructor; the public S4(int) constructor contains an OpenMP region.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) { // The clause below names member a twice: as a and as this->a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 { // Private default constructor; public S5(int) and a user-defined copy assignment with an OpenMP region.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) { // Takes a non-const reference; the clause list below also names s.a, a member of another object.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T> // main() uses the specialization S6<float>.
+class S6 { // Template analogue of S4/S5, but with every member public.
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) { // The clause below names member a twice: as a and as this->a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) { // The clause list below also names s.a, a member of another object.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T> // main() uses the specialization S7<S6<float> >.
+class S7 : public T { // Derives from its template argument and also holds a T member named a.
+  T a;
+  S7() : a(0) {} // Private default constructor.
+
+public:
+  S7(T v) : a(v) { // The clause below names the own member (a, this->a) and the base-class member T::a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // The clause list below also names s.a and s.T::a, members of another object.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h; // Global S3 object; the pragma on the next line makes it threadprivate.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C> // main() calls foomain(argc, argv); the note on that call names the specialization foomain<int, char>.
+int foomain(I argc, C **argv) { // Template counterpart of main(): applies private clause forms to locals of the dependent type I.
+  I e(4); // Local of the dependent type I, built from an int.
+  I g(5); // Second local of type I; e and g are named together in one clause below.
+  int i;
+  int &j = i; // j is a reference alias of i.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0; // Accumulator written by the loop below.
+    int i; // Block-scope i that hides the function-scope i declared above.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x; // Namespace-scope variable; the pragma on the next line makes it threadprivate.
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // Makes A::x nameable as B::x, the spelling main() uses in its private clause.
+}
+
+int main(int argc, char **argv) { // Applies private clause forms to '#pragma omp distribute parallel for' nested in target/teams; the RUN line uses -verify.
+  S4 e(4); // S4 has a private default constructor.
+  S5 g(5); // S5 has a private default constructor.
+  S6<float> s6(0.0) , s6_0(1.0); // Assigned to each other at the end of main to instantiate S6<float>::operator=.
+  S7<S6<float> > s7(0.0) , s7_0(1.0); // Likewise for S7<S6<float> >::operator=.
+  int i;
+  int &j = i; // j is a reference alias of i.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i; // Block-scope i that hides the function-scope i declared above.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m; // Local with static storage duration.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_proc_bind_messages.cpp b/test/OpenMP/distribute_parallel_for_proc_bind_messages.cpp
new file mode 100644
index 0000000..9898f9d
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_proc_bind_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo(); // Declaration only; this file never defines foo, it is just called from the loop bodies.
+
+template <class T, typename S, int N> // main() instantiates this as tmain<int, char, 3>.
+T tmain(T argc, S **argv) { // Template counterpart of main(): the same proc_bind clause cases with a loop counter of type T.
+  T i; // Loop counter of the dependent type T.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T(); // Value-initialized T.
+}
+
+int main(int argc, char **argv) { // Non-template proc_bind clause cases; ends by instantiating tmain<int, char, 3>.
+  int i; // Loop counter shared by every loop below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+  return tmain<int, char, 3>(argc, argv); // Instantiates the template variant with T = int, S = char, N = 3.
+}
diff --git a/test/OpenMP/distribute_parallel_for_reduction_messages.cpp b/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
new file mode 100644
index 0000000..f23a25e
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
@@ -0,0 +1,441 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() { // Empty callee; it is the loop body of most directives tested in this file.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a; // S1 is only forward-declared in this file, so 'a' has an incomplete type.
+class S2 { // operator+ is declared before 'public:' and is therefore private.
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // copy constructor takes a non-const reference
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 { // Has a public member operator+; a free operator+ for two S3 follows the class.
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; } // free overload; returns int, not S3
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 { // Default ctor, copy ctor and operator+ are private; only S4(int) is public.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; } // free operator&= for S4; returns its first argument
+class S5 { // Default ctor, copy ctor and operator+ are private; only S5(int) is public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg); // declared only; no definition appears in this file
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; } // implicit conversion to int
+} o; // global S6 object named by the reduction(+ : o) cases
+
+S3 h, k; // only h is listed in the threadprivate pragma on the next line; k is not
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) { // Template variant of the checks; main() instantiates it as tmain<int> and tmain<float>.
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()}; // non-const array; the reference 'q' below binds to one of its elements
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x; // A::x is listed in the threadprivate pragma on the next line
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // makes A::x nameable as B::x, which is how main() refers to it
+}
+
+int main(int argc, char **argv) { // Non-template variant of the checks; its return statement instantiates tmain for int and float.
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0}; // non-const array; the reference 'q' below binds to one of its elements
+  S4 e(4); // e and g are built through the public (int) constructors of S4 and S5
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m; // function-local static used by the final reduction case
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_schedule_messages.cpp
new file mode 100644
index 0000000..6363cd7
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_schedule_messages.cpp
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty callee; main() calls it as the non-loop statement after its last directive.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; called inside several schedule chunk-size expressions.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // ST starts and N bounds each loop; main() instantiates this as <int, char, -1, -2> and <int, char, 1, 0>.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template schedule-clause checks, followed by the two tmain instantiations.
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // a plain call, not a for loop, follows the directive here
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_shared_messages.cpp b/test/OpenMP/distribute_parallel_for_shared_messages.cpp
new file mode 100644
index 0000000..d5725e7
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_shared_messages.cpp
@@ -0,0 +1,396 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 { // Helper class with a mutable member and a copy constructor taking a non-const reference.
+  mutable int a;
+public:
+  S2():a(0) { } // Default constructor zero-initializes a.
+  S2(S2 &s2):a(s2.a) { } // Copies a from a non-const S2.
+};
+const S2 b;
+const S2 ba[5];
+class S3 { // Helper class like S2, but its member is not mutable.
+  int a;
+public:
+  S3():a(0) { } // Default constructor zero-initializes a.
+  S3(S3 &s3):a(s3.a) { } // Copies a from a non-const S3.
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Helper class whose default and copy constructors are private and only declared here.
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { } // The only public constructor.
+};
+class S5 { // Helper class whose default and copy constructors are private but defined inline.
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { } // The only public constructor.
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+namespace A { // Declares x and marks it threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note 2 {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x via a using-declaration.
+using A::x;
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) { // Template variant of the 'shared' clause checks; main() instantiates it as tmain<int, char, 1000>.
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i; // Reference to i; named by the shared(j) directives below.
+  int acc = 0;
+  int n = 1000;
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc) // OK: a plain variable name is accepted without diagnostics.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (a, b, c, d, f) // OK: globals and the const local d are accepted without diagnostics.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(e, g) // OK: S4 and S5 objects are accepted despite their private constructors.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i) // OK: private(i) on its own directive; the following directives share i and j.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // OK: firstprivate(i) on its own directive; the following directives share i and j.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+return T();
+}
+
+
+int main(int argc, char **argv) { // Non-template copy of the 'shared' clause checks; ends by instantiating tmain.
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i; // Reference to i; named by the shared(j) directives below.
+  int acc = 0;
+  int n = argc; // Loop bound comes from argc here, whereas tmain uses the constant 1000.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc) // OK: a plain variable name is accepted without diagnostics.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (a, b, c, d, f) // OK: globals and the const local d are accepted without diagnostics.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(e, g) // OK: S4 and S5 objects are accepted despite their private constructors.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i) // OK: private(i) on its own directive; the following directives share i and j.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // OK: firstprivate(i) on its own directive; the following directives share i and j.
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+return tmain<int, char, 1000>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 1000>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_aligned_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_aligned_messages.cpp
new file mode 100644
index 0000000..9c9f3dd
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_aligned_messages.cpp
@@ -0,0 +1,306 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B { // Supplies a static array (B::ib) and a static constexpr function (B::bfoo) for the colon-parsing checks.
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; } // Constant-expression value 8.
+};
+namespace X { // Wraps a B object so the checks can name it as X::x.
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; } // Namespace-scope counterpart of B::bfoo(); returns the constant 4.
+
+int **z;
+const int C1 = 1;
+const int C2 = 2;
+void test_aligned_colons(int *&rp)
+{
+  int *B = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(z:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(X::x : ::z) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B,rp,::z: X::x) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp distribute parallel for simd aligned(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B::ib,B:C1+C2) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) { // main() calls this as test_template<-4>(darr, 4).
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(arr:L) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(num:4) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (i = 0; i < num; ++i);
+
+  return T(); // The accumulated sum is not returned; a value-initialized T is.
+}
+
+template<int LEN> int test_warn() { // main() instantiates this with LEN = 4 and LEN = 0.
+  int *ind2 = 0;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(ind2:LEN) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 { // Helper class with a mutable member and a public default constructor.
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 { // Helper class with a public default constructor; the global h below has this type.
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 { // Helper class whose default constructor is private and only declared here.
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { } // The only public constructor.
+};
+class S5 { // Helper class whose default constructor is private but defined inline.
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { } // The only public constructor.
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) { // main() instantiates this as foomain<int*, char>, so I is 'int *' there.
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(e, g) // OK: e and g have type I, and no diagnostic is listed for this line.
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(h) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(i) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int *v = 0; // Block-local pointer; a separate 'int v' is declared later in the function.
+    I i; // Shadows the outer 'int i' inside this parallel block.
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd aligned(v:16)
+      for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(f:j) // expected-note {{initializer of 'j' is not a constant expression}} expected-error {{expression is not an integral constant expression}}
+
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) { // Instantiates test_template and test_warn, runs non-template checks, then instantiates foomain.
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (a, b) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(h)  // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int *pargc = &argc; // Pointer passed as foomain's first argument, making I = int *.
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp b/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
new file mode 100644
index 0000000..56809dc
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
@@ -0,0 +1,152 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S { // Simple aggregate-like type used as the base/member type for S7<S>.
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type; // Named by S7 through 'typename T::type'.
+};
+
+template <typename T>
+class S7 : public T { // Derives from T and also holds a T member named a; the directives below name that member in several ways.
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // Takes a non-const reference; the loop increments s.a.a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a)
+
+class S8 : public S7<S> { // Non-template class derived from S7<S>; repeats the private-clause directives with S7<S>::a.
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) { // Takes a non-const reference; the loop increments s.a.a.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc) { // main() instantiates this as tmain<int, 5> and tmain<char, 1>.
+  T b = argc, c, d, e, f, h;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule(static, a) schedule(dynamic) default(none) copyin(g) firstprivate(a)
+  // CHECK: #pragma omp distribute parallel for simd dist_schedule(static, a) schedule(dynamic) default(none) copyin(g)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h) dist_schedule(static,N)
+  for (int i = 0; i < 2; ++i) // Ten nested loops follow, with a++ as the innermost statement.
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+	    a++;
+  // CHECK: #pragma omp distribute parallel for simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h) dist_schedule(static, N)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: a++;
+  return T();
+}
+
+int main(int argc, char **argv) { // Non-template directives whose printed form is matched by the comment patterns below.
+  int b = argc, c, d, e, f, h;
+  int x[200]; // Array named by the aligned(x:8) clause further down.
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  // CHECK: #pragma omp distribute parallel for simd schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) if (argc) num_threads(a) default(shared) shared(e) reduction(+ : h) dist_schedule(static, b)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      a++;
+  // CHECK: #pragma omp distribute parallel for simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) if(argc) num_threads(a) default(shared) shared(e) reduction(+: h) dist_schedule(static, b)
+  // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: a++;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x:8) linear(h:2) safelen(8) simdlen(8)
+  for (int i = 0; i < 100; i++)
+    for (int j = 0; j < 200; j++)
+      a += h + x[j];
+  // CHECK: #pragma omp distribute parallel for simd aligned(x: 8) linear(h: 2) safelen(8) simdlen(8)
+  // CHECK-NEXT: for (int i = 0; i < 100; i++)
+  // CHECK-NEXT: for (int j = 0; j < 200; j++)
+  // CHECK-NEXT: a += h + x[j];
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // Instantiates both tmain specializations.
+}
+
+#endif
diff --git a/test/OpenMP/distribute_parallel_for_simd_collapse_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_collapse_messages.cpp
new file mode 100644
index 0000000..8690b4a
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_collapse_messages.cpp
@@ -0,0 +1,154 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty function; called where a non-loop statement must follow the directive.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Non-constexpr helper: true for any non-zero argument.
+  return argc != 0;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target // Every check in this function nests the directive under 'target' and 'teams'.
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp distribute parallel for simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+  // expected-error@+8 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+7 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (1) // OK: collapse(1) followed by a single loop has no diagnostic listed.
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+8 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+6{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_copyin_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_copyin_messages.cpp
new file mode 100644
index 0000000..2d1a3dc
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_copyin_messages.cpp
@@ -0,0 +1,190 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2 &operator=(S2 &s2) { return *this; }
+};
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+  S3 &operator=(S3 &s3) { return *this; }
+};
+class S4 {
+  int a;
+  S4();
+  S4 &operator=(const S4 &s4); // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+};
+template <class T>
+class ST {
+public:
+  static T s;
+};
+
+S2 k;
+S3 h;
+S4 l(3);
+S5 m(4);
+#pragma omp threadprivate(h, k, l, m)
+
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(l) // expected-error 2 {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(m) // expected-error 2 {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_default_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_default_messages.cpp
new file mode 100644
index 0000000..5c32306
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_default_messages.cpp
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, int N>
+T tmain(T argc) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none)
+  for (i = 0; i < argc; ++i)  // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_dist_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_dist_schedule_messages.cpp
new file mode 100644
index 0000000..2e3ee2b
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_dist_schedule_messages.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_firstprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000..07d30e4
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_firstprivate_messages.cpp
@@ -0,0 +1,359 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_if_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_if_messages.cpp
new file mode 100644
index 0000000..01236b5
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_if_messages.cpp
@@ -0,0 +1,179 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // Empty callee; serves as the loop body in the directive cases of this file.
+}
+
+bool foobool(int argc) { // Turns an int into a bool; called inside an 'if' clause condition in this file.
+  return argc; // implicit int -> bool conversion
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000..109fde0
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty callee; serves as the loop body in the directive cases of this file.
+}
+
+bool foobool(int argc) { // Helper that turns an int into a bool.
+  return argc; // implicit int -> bool conversion
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Type with public constructors and both const and non-const copy assignment.
+  mutable int a; // mutable, so it may be modified through the const operator= declared below
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // copy constructor takes a non-const reference
+  const S2 &operator =(const S2&) const; // assignment usable on const S2 objects (declared only)
+  S2 &operator =(const S2&);
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc; // defined after the class
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 { // Constructible and copyable, but its copy assignment operator is private.
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {} // copy constructor takes a non-const reference
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 { // Only S4(int) is public; the default and copy constructors are private.
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4); // private copy constructor (declared only)
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Private default constructor; the copy and int constructors are public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Same member layout as S5, but carries no diagnostic annotations.
+  int a;
+  S6() : a(0) {} // private default constructor
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  int v = 0;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target
+#pragma omp teams private(i)
+#pragma omp distribute parallel for simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+void bar(S4 a[2]) { // Names an array-declared parameter in lastprivate; no diagnostic is annotated here.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A { // Holds a threadprivate global.
+double x; // marked threadprivate by the pragma on the next line
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x.
+using A::x; // using-declaration: B::x designates the same variable as A::x
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(i) // expected-note {{defined as lastprivate}}
+  for (i = 0; i < argc; ++i) // expected-error{{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be lastprivate, predetermined as linear}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_linear_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_linear_messages.cpp
new file mode 100644
index 0000000..632ef06
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_linear_messages.cpp
@@ -0,0 +1,338 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X { // Provides X::x for the qualified-name cases in test_linear_colons.
+  int x;
+};
+
+struct B { // Type whose name is reused for a local variable in test_linear_colons.
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; } // static member; distinct from the global bfoo()
+};
+
+int bfoo() { return 4; } // global function; distinct from the static member B::bfoo()
+
+int z;
+const int C1 = 1;
+const int C2 = 2;
+void test_linear_colons() // Exercises ':' versus '::' inside the linear clause.
+{
+  int B = 0; // local variable that hides the struct name B inside this function
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B:bfoo()) // no diagnostic annotated; presumably list item B with step bfoo()
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B:ib) // expected-error {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(z:B:ib) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B:B::bfoo()) // no diagnostic annotated; step is the qualified call B::bfoo()
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(X::x : ::z) // no diagnostic annotated; both names are qualified
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B,::z, X::x) // no diagnostic annotated; comma-separated list without a step
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(::z) // no diagnostic annotated
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B::ib,B:C1+C2) // no diagnostic annotated; step is the constant expression C1+C2
+  for (int i = 0; i < 10; ++i) ;
+}
+
+template<int L, class T, class N> T test_template(T* arr, N num) { // Uses a T-typed variable as a linear list item with template-parameter step L.
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L; // expected-note {{'ind2' defined here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(ind2:L) // expected-error {{argument of a linear clause should be of integral or pointer type}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T(); // value-initialized T; 'sum' is not returned
+}
+
+template<int LEN> int test_warn() { // Linear step comes from template argument LEN.
+  int ind2 = 0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute parallel for simd linear(ind2:LEN) // expected-warning {{zero linear step (ind2 should probably be const)}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Default-constructible type with a mutable int member.
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 { // Default-constructible type with a plain int member.
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 { // Only S4(int) is public.
+  int a;
+  S4(); // private default constructor (declared only)
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // Only S5(int) is public.
+  int a;
+  S5():a(0) {} // private default constructor
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template<class I, class C> int foomain(I argc, C **argv) { // Repeats the linear-clause cases inside a template.
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i; // reference to i, used as a linear list item and as a step below
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc : 5)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (a, b:B::ib) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(e, g)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int v = 0;
+    int i; // block-local i hides the function-scope i
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd linear(v:i)
+    for (int k = 0; k < argc; ++k) { i = k; v += i; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(v:j)
+  for (int k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+namespace A { // Holds a threadprivate global.
+double x; // marked threadprivate by the pragma on the next line
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C { // Makes A::x nameable as C::x.
+using A::x; // using-declaration: C::x designates the same variable as A::x
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (a, b) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(e, g) // expected-error {{argument of a linear clause should be of integral or pointer type, not 'S4'}} expected-error {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd linear(i)
+      for (int k = 0; k < argc; ++k) ++k;
+
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd linear(i : 4)
+      for (int k = 0; k < argc; ++k) { ++k; i += 4; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp
new file mode 100644
index 0000000..6c322e6
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp
@@ -0,0 +1,816 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S { // helper type with a private default constructor; instantiated in test_loop_firstprivate_lastprivate
+  int a;
+  S() : a(0) {} // private: declared before the 'public:' label
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp distribute parallel for simd
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp distribute parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp distribute parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// OK: redundant parentheses around the loop variable in the init clause are accepted.
+#pragma omp distribute parallel for simd
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Accepted without a diagnostic, but undefined behavior: in general the
+// compiler cannot check that the increment is really loop-invariant.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+#pragma omp distribute parallel for simd firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as private}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be private, predetermined as linear}}
+#pragma omp distribute parallel for simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as lastprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be lastprivate, predetermined as linear}}
+#pragma omp distribute parallel for simd lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be threadprivate or thread local, predetermined as linear}}
+#pragma omp distribute parallel for simd
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp distribute parallel for simd
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+#pragma omp distribute parallel for simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Minimal iterator stand-ins: iterators are allowed in OpenMP for-loops.
+namespace std { // hand-written stubs; this test file includes no standard headers
+struct random_access_iterator_tag {}; // empty tag type
+template <class Iter>
+struct iterator_traits { // forwards to the iterator's own nested typedefs
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; } // NOTE(review): computes first - last, whereas the standard std::distance yields last - first; no explicit call appears in this file -- confirm the stub is intentionally left this way
+}
+class Iter0 { // iterator-like type without difference_type / iterator_category typedefs (contrast GoodIter)
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; } // prefix form only; returns a copy
+  Iter0 operator--() { return *this; } // prefix form only; returns a copy
+  bool operator<(Iter0 a) { return true; } // the only relational operator this class declares
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; } // Iter0 difference stub: returns int 0 regardless of operands
+class Iter1 { // iterator-like type; no operator- with Iter1 parameters is declared in this file
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {} // every parameter is defaulted, so this also acts as the default constructor
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter { // iterator-like type that supplies the nested typedefs read by std::iterator_traits
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; } // permits assignment from an Iter0 value
+  GoodIter &operator+=(int x) { return *this; }
+  explicit GoodIter(void *) {} // explicit: usable only in direct-initialization / casts
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; } // note: no operator> is declared
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; } // iterator difference stub: always 0
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; } // unary minus: returns its argument
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); } // iterator - offset; result is a default-constructed GoodIter
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); } // iterator + offset; result is a default-constructed GoodIter
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); } // offset - iterator; result is a default-constructed GoodIter
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); } // offset + iterator; result is a default-constructed GoodIter
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp distribute parallel for simd
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp distribute parallel for simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// The initializer is a default-constructor call (no arguments).
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp distribute parallel for simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// The initializer calls a constructor whose parameters all have default arguments.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC { // IT: type of the loop bounds and loop variable; ST: compile-time step used by the loops below
+public:
+  int dotest_lt(IT begin, IT end) { // NOTE(review): declared to return int, but the body contains no return statement
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+    for (IT I = begin; I < end; I = I + ST) { // step is the template argument ST
+      ++I;
+    }
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+    for (IT I = begin; I <= end; I += ST) { // same step, compound-assignment form
+      ++I;
+    }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+    for (IT I = begin; I < end; ++I) { // increment does not involve ST; no diagnostic is attached
+      ++I;
+    }
+  }
+
+  static IT step() { // builds an IT from the step ST
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0> // ST defaults to 0 when the caller does not supply it
+int dotest_gt(IT begin, IT end) { // NOTE(review): declared to return int, but the body contains no return statement
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I >= end; I = I + ST) { // '>=' condition combined with a step of ST
+    ++I;
+  }
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I >= end; I += ST) { // same step, compound-assignment form
+    ++I;
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I >= end; ++I) { // increment does not involve ST
+    ++I;
+  }
+
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) { // step comes from TC<int, ST>::step(); this directive is not wrapped in target/teams
+    ++I;
+  }
+}
+
+void test_with_template() { // instantiates TC::dotest_lt and dotest_gt with concrete types and steps
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end); // ST = 100: no instantiation note is attached to this call
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() { // 'break' is diagnosed only when it would leave the loop bound to the directive
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK: exits only the nested j-loop
+    }
+    switch (i) { // the 'break' statements below terminate switch cases; no diagnostic is attached
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK: leaves the inner j-loop, not the i-loop under the directive
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() { // exception-handling constructs inside the simd region (file is compiled with -fexceptions -fcxx-exceptions)
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try { // expected-error {{'try' statement cannot be used in OpenMP simd region}}
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      }
+      throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK: this statement follows the loop, outside the OpenMP region
+
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+    struct S { // local class declared inside the loop body
+      void g() { throw 0; } // throw inside the local class's member function; no diagnostic is attached
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() { // same variable listed in both lastprivate and firstprivate; no diagnostic is attached
+  S s(4); // uses the public S(int); S's default constructor is private
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ; // empty loop body
+}
+
+void test_ordered() { // the 'ordered' clause is rejected on this combined directive
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd ordered // expected-error {{unexpected OpenMP clause 'ordered' in directive '#pragma omp distribute parallel for simd'}}
+  for (int i = 0; i < 16; ++i)
+    ; // empty loop body
+}
+
+void test_nowait() { // 'nowait' written twice: each occurrence is rejected, and the duplication is reported as well
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for simd'}}
+#pragma omp distribute parallel for simd nowait nowait // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'nowait' clause}}
+  for (int i = 0; i < 16; ++i)
+    ; // empty loop body
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_misc_messages.c b/test/OpenMP/distribute_parallel_for_simd_misc_messages.c
new file mode 100644
index 0000000..01c079e
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_misc_messages.c
@@ -0,0 +1,971 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute parallel for simd'}}
+#pragma omp distribute parallel for simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute parallel for simd'}}
+#pragma omp distribute parallel for simd foo
+
+void test_no_clause() { // Bare directive: accepted before a for loop, rejected before a non-loop statement.
+  int i;
+#pragma omp distribute parallel for simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+#pragma omp distribute parallel for simd
+  ++i;
+}
+
+void test_branch_protected_scope() { // Labels outside the region are not visible inside it (and vice versa), and 'return' may not leave the region.
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() { // Unknown tokens after the directive name only produce an "extra tokens" warning.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() { // Stray ';' or ',' tokens around clauses are reported as ignored extra tokens (warning only).
+  int i, x;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd;
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd linear(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo(); // Declared without a definition here; called below as a non-constant clause argument and as a loop body.
+void test_safelen() { // safelen clause: malformed parentheses/arguments, non-constant and non-positive values must be diagnosed.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() { // simdlen clause: malformed parentheses/arguments, non-constant and non-positive values must be diagnosed.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() { // simdlen greater than safelen is rejected regardless of clause order.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute parallel for simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute parallel for simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_collapse() { // collapse clause: malformed arguments, too few nested loops, non-constant/non-positive values, and a construct nested inside the collapsed loops.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(2)
+  for (i = 0; i < 16; ++i)
+    for (int j = 0; j < 16; ++j)
+// expected-error@+1 {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp distribute parallel for simd reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_linear() { // linear clause: malformed lists, undeclared names, step syntax, zero-step warning, and conflicts with private/lastprivate/linear.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd linear(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute parallel for simd linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute parallel for simd linear(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute parallel for simd linear(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be linear}}
+#pragma omp distribute parallel for simd linear(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as private}}
+// expected-error@+1 {{private variable cannot be linear}}
+#pragma omp distribute parallel for simd private(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be private}}
+#pragma omp distribute parallel for simd linear(x) private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{zero linear step (x and other variables in clause should probably be const)}}
+#pragma omp distribute parallel for simd linear(x, y : 0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be lastprivate}}
+#pragma omp distribute parallel for simd linear(x) lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as lastprivate}}
+// expected-error@+1 {{lastprivate variable cannot be linear}}
+#pragma omp distribute parallel for simd lastprivate(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_aligned() { // aligned clause: malformed lists, undeclared names, non-pointer/array arguments, and variables repeated across aligned clauses. NOTE(review): the 'aligned(,' case is the only one here without a preceding target/teams pair - confirm that is intended.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd aligned(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute parallel for simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute parallel for simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute parallel for simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int *x, y, z[25]; // expected-note 4 {{'y' defined here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(z)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute parallel for simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute parallel for simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as aligned}}
+// expected-error@+1 {{a variable cannot appear in more than one aligned clause}}
+#pragma omp distribute parallel for simd aligned(x) aligned(z, x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{defined as aligned}}
+// expected-error@+2 {{a variable cannot appear in more than one aligned clause}}
+// expected-error@+1 2 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute parallel for simd aligned(x, y, z) aligned(y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+
+void test_private() { // private clause: malformed or non-variable list items are diagnosed; lists of one to three declared ints are accepted.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() { // lastprivate clause: malformed or non-variable list items are diagnosed; lists of one to three declared ints are accepted.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() { // firstprivate clause: malformed or non-variable list items are diagnosed; combining it with lastprivate on the same variables is accepted.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() { // Loop counters of floating-point type (float, double) are rejected.
+  float a[100], b[100], c[100];
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_num_threads_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_num_threads_messages.cpp
new file mode 100644
index 0000000..0d6376d
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_num_threads_messages.cpp
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // empty callee used as the body of every loop in this test
+}
+
+bool foobool(int argc) { // int-to-bool helper; its result is passed to num_threads() below
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // num_threads clause checks with template-dependent arguments
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc)
+  for (i = 0; i < argc; ++i) foo(); // well-formed clause: no diagnostic on this case
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo(); // N is supplied by the instantiations requested in main()
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // non-template num_threads checks; also instantiates tmain
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo(); // the clause above requests tmain with N = -1
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
new file mode 100644
index 0000000..9df3688
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // empty helper; no loop in this file calls it
+}
+
+bool foobool(int argc) { // returns argc converted to bool; not referenced elsewhere in this file
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // class with a mutable member and a public default constructor
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 { // class with a public default constructor; type of the threadprivate global 'h'
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 { // default constructor is private; the int constructor privatizes member 'a'
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k) // both 'a' and 'this->a' are accepted without a diagnostic
+      ++this->a;
+  }
+};
+class S5 { // private default constructor; operator= checks that 's.a' is rejected in private()
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) { // takes a non-const reference; the loop below modifies s.a
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 { // class template repeating the S4/S5 member checks; all members are public
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) { // instantiated from main() through 's6 = s6_0'
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T { // derives from its template argument so that T::a can be named in private()
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // instantiated from main() through 's7 = s7_0'
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) { // private clause checks inside a function template
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i; // reference to i, used in private(j) below
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(e, g)
+  for (int k = 0; k < argc; ++k) // e and g have the template type I here, so no diagnostic
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // shadows the function-level i inside the parallel region
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A { // holds a threadprivate variable that main() names as B::x
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // makes A::x reachable as B::x through a using-declaration
+using A::x;
+}
+
+int main(int argc, char **argv) { // non-template private clause checks; also instantiates S6, S7 and foomain
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i; // reference to i, used in private(j) below
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i; // shadows the function-level i inside the parallel region
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m; // function-local static, accepted in private(m) below
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_proc_bind_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_proc_bind_messages.cpp
new file mode 100644
index 0000000..6b64cc3
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_proc_bind_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) { // proc_bind clause checks inside a function template
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master)
+  for (i = 0; i < argc; ++i) // well-formed clause: no diagnostic on this case
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(spread)
+  for (i = 0; i < argc; ++i) // differing proc_bind kinds on nested directives are accepted
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) { // non-template proc_bind checks; also instantiates tmain
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master)
+  for (i = 0; i < argc; ++i) // well-formed clause: no diagnostic on this case
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(spread)
+  for (i = 0; i < argc; ++i) // differing proc_bind kinds on nested directives are accepted
+    foo();
+  return tmain<int, char, 3>(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp
new file mode 100644
index 0000000..7b7e9ea
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp
@@ -0,0 +1,441 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() { // empty callee used as the loop body in the reduction cases below
+}
+
+bool foobool(int argc) { // returns argc converted to bool
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // operator+ is private, so '+' reductions over S2 objects are diagnosed
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // copy constructor taking a non-const reference
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 { // has a public member operator+; a free operator+ on S3 is also declared after the class
+  int a;
+
+public:
+  int b; // public data member, named as 'h.b' in a reduction clause below
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; } // non-const member; returns a copy of its argument
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; } // free operator+ on S3; ignores both operands and returns int 5
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 { // default ctor, copy ctor and operator+ are private; only S4(int) is public
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; } // free operator&= on S4; returns its left operand unchanged
+class S5 { // default ctor, copy ctor and operator+ are private; only S5(int) is public
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg); // declared only; no definition appears in this class
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; } // conversion to int; S6 declares no assignment operator of its own
+} o; // global instance named in reduction(+ : o) below
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) { // reduction clause checks with the dependent type T
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i) // well-formed clause: no diagnostic on this case
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i) // no diagnostic: fl is private in the enclosing parallel region
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i) // no diagnostic: fl is a reduction item of the enclosing parallel region
+    foo();
+
+  return T();
+}
+
+namespace A { // holds a threadprivate variable that namespace B re-exports
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // makes A::x reachable as B::x through a using-declaration
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_safelen_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_safelen_messages.cpp
new file mode 100644
index 0000000..a5fd1ae
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_safelen_messages.cpp
@@ -0,0 +1,177 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // empty helper; main calls it as a statement that is not a for loop
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // not constexpr, so the tests pass calls to it as non-constant safelen arguments
+  return argc; // int converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (argc  // expected-note {{to match this '('}} expected-error 2 {{expression is not an integral constant expression}} expected-note 2 {{read of non-const variable 'argc' is not allowed in a constant expression}} expected-error {{expected ')'}}
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (ST // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++)
+     argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'safelen' clause}} expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (4)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+  
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}} expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // expected-error {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_schedule_messages.cpp
new file mode 100644
index 0000000..b3003dd
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_schedule_messages.cpp
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // empty helper; main calls it as a statement that is not a for loop
+}
+
+bool foobool(int argc) { // helper called inside schedule chunk-size expressions below
+  return argc; // int converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
new file mode 100644
index 0000000..134b852
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
@@ -0,0 +1,396 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 { // instantiated below as the const objects b and ba
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { } // copy constructor takes a non-const reference
+};
+const S2 b;
+const S2 ba[5];
+class S3 { // instantiated below as the const objects c and ca and the threadprivate h
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { } // copy constructor takes a non-const reference
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // outside code can only construct it from an int
+  int a;
+  S4(); // private; no definition visible in this file section
+  S4(const S4 &s4); // private copy constructor, also only declared
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // same shape as S4, but the private constructors have bodies
+  int a;
+  S5():a(0) {} // private default constructor
+  S5(const S5 &s5):a(s5.a) { } // private copy constructor
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+namespace A { // holds the threadprivate variable that the tests reach through B::x
+double x;
+#pragma omp threadprivate(x) // expected-note 2 {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // lets A::x be named as B::x (used in a shared clause below)
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int acc = 0;
+  int n = 1000;
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (a, b, c, d, f)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(e, g)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+return T();
+}
+
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int acc = 0;
+  int n = argc;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (a, b, c, d, f)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(e, g)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+return tmain<int, char, 1000>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 1000>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_simdlen_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_simdlen_messages.cpp
new file mode 100644
index 0000000..2d813ec
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_simdlen_messages.cpp
@@ -0,0 +1,181 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // expected-note 2 {{declared here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+5 {{expected ')'}} expected-note@+5 {{to match this '('}}
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+// expected-note@+3 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (argc 
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+3 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'simdlen' clause}} expected-error 2 {{argument to 'simdlen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (4)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (N) // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // expected-error {{expression is not an integral constant expression}} expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'simdlen' clause}} expected-error 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen(simdlen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  foo(); // expected-error {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_private_messages.cpp b/test/OpenMP/distribute_private_messages.cpp
index 94ba465..518b64d 100644
--- a/test/OpenMP/distribute_private_messages.cpp
+++ b/test/OpenMP/distribute_private_messages.cpp
@@ -98,6 +98,7 @@
   #pragma omp target
   #pragma omp teams firstprivate(i)
   #pragma omp parallel private(i)
+  {}
   #pragma omp target
   #pragma omp teams reduction(+:i)
   #pragma omp distribute private(i)
@@ -113,20 +114,20 @@
   #pragma omp teams
   #pragma omp distribute firstprivate(i)
   for (int k = 0; k < 10; ++k) {
-    #pragma omp target
-    #pragma omp teams firstprivate(i)
-    #pragma omp distribute private(i)
-    for (int x = 0; x < 10; ++x) foo();
   }
   #pragma omp target
+  #pragma omp teams firstprivate(i)
+  #pragma omp distribute private(i)
+  for (int x = 0; x < 10; ++x) foo();
+  #pragma omp target
   #pragma omp teams reduction(+:i)
   #pragma omp distribute
   for (int k = 0; k < 10; ++k) {
-    #pragma omp target
-    #pragma omp teams reduction(+:i)
-    #pragma omp distribute private(i)
-    for (int x = 0; x < 10; ++x) foo();
   }
+  #pragma omp target
+  #pragma omp teams reduction(+:i)
+  #pragma omp distribute private(i)
+  for (int x = 0; x < 10; ++x) foo();
 
   return 0;
 }
diff --git a/test/OpenMP/distribute_simd_aligned_messages.cpp b/test/OpenMP/distribute_simd_aligned_messages.cpp
new file mode 100644
index 0000000..59e5be2
--- /dev/null
+++ b/test/OpenMP/distribute_simd_aligned_messages.cpp
@@ -0,0 +1,306 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B {
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; }
+};
+namespace X {
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; }
+
+int **z;
+const int C1 = 1;
+const int C2 = 2;
+void test_aligned_colons(int *&rp)
+{
+  int *B = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(z:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(X::x : ::z) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B,rp,::z: X::x) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp distribute simd aligned(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B::ib,B:C1+C2) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(arr:L) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(num:4) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (i = 0; i < num; ++i);
+
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int *ind2 = 0;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(ind2:LEN) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(e, g)
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(h) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(i) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int *v = 0;
+    I i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd aligned(v:16)
+      for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(f:j) // expected-note {{initializer of 'j' is not a constant expression}} expected-error {{expression is not an integral constant expression}}
+
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (a, b) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(h)  // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_simd_ast_print.cpp b/test/OpenMP/distribute_simd_ast_print.cpp
new file mode 100644
index 0000000..0435815
--- /dev/null
+++ b/test/OpenMP/distribute_simd_ast_print.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, h;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule(static, a) firstprivate(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK: #pragma omp distribute simd dist_schedule(static, a) firstprivate(a)
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) reduction(+ : h) dist_schedule(static,N)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int k = 0; k < 10; ++k)
+        for (int m = 0; m < 10; ++m)
+          for (int n = 0; n < 10; ++n)
+            a++;
+// CHECK: #pragma omp distribute simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) reduction(+: h) dist_schedule(static, N)
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: for (int j = 0; j < 2; ++j)
+// CHECK-NEXT: for (int k = 0; k < 10; ++k)
+// CHECK-NEXT: for (int m = 0; m < 10; ++m)
+// CHECK-NEXT: for (int n = 0; n < 10; ++n)
+// CHECK-NEXT: a++;
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int x[200];
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule(static, a) private(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK: #pragma omp distribute simd dist_schedule(static, a) private(a)
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) reduction(+ : h) dist_schedule(static, b)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+            a++;
+// CHECK: #pragma omp distribute simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) reduction(+: h) dist_schedule(static, b)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: a++;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x:8) linear(h:2) safelen(8) simdlen(8)
+  for (int i = 0; i < 100; i++)
+    for (int j = 0; j < 200; j++)
+      a += h + x[j];
+// CHECK: #pragma omp distribute simd aligned(x: 8) linear(h: 2) safelen(8) simdlen(8)
+// CHECK-NEXT: for (int i = 0; i < 100; i++)
+// CHECK-NEXT: for (int j = 0; j < 200; j++)
+// CHECK-NEXT: a += h + x[j];
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
+}
+
+#endif
diff --git a/test/OpenMP/distribute_simd_collapse_messages.cpp b/test/OpenMP/distribute_simd_collapse_messages.cpp
new file mode 100644
index 0000000..182a09a
--- /dev/null
+++ b/test/OpenMP/distribute_simd_collapse_messages.cpp
@@ -0,0 +1,154 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty function; called below as a statement that is not a for loop.
+}
+
+#if __cplusplus >= 201103L // the note directive below is only active when compiling as C++11 or later
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr; calls to it appear inside collapse(...) arguments below.
+  return argc; // implicit int-to-bool conversion
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target // Each case: target + teams + 'distribute simd' with one collapse variant, followed by a loop over argv[0] (or a call to foo() in the last case).
+#pragma omp teams
+#pragma omp distribute simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp distribute simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute simd', but found only 1}}
+  // expected-error@+8 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+7 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (1) // well-formed case: no diagnostic is annotated here
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target // Non-template counterparts of the tmain cases, using literal bounds 4 and 12 instead of ST and N.
+#pragma omp teams
+#pragma omp distribute simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+8 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+6{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+4 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute simd' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_simd_dist_schedule_messages.cpp b/test/OpenMP/distribute_simd_dist_schedule_messages.cpp
new file mode 100644
index 0000000..6a8482d
--- /dev/null
+++ b/test/OpenMP/distribute_simd_dist_schedule_messages.cpp
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty function; used as the body of every test loop in this file.
+}
+
+bool foobool(int argc) { // Not called by any test case in this file.
+  return argc; // implicit int-to-bool conversion
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target // Non-template dist_schedule cases; every loop body is a call to foo().
+#pragma omp teams
+#pragma omp distribute simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_firstprivate_messages.cpp b/test/OpenMP/distribute_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000..b9267a3
--- /dev/null
+++ b/test/OpenMP/distribute_simd_firstprivate_messages.cpp
@@ -0,0 +1,359 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty function; used as a loop body in the test cases below.
+}
+
+bool foobool(int argc) { // Not called by any test case in this file.
+  return argc; // implicit int-to-bool conversion
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Fixture type with public default and copy constructors.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s; // declared here; no out-of-class definition appears in this file
+  static const float S2sc; // defined immediately after the class
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // Fixture type: public default and copy constructors, private copy assignment.
+  int a;
+  S3 &operator=(const S3 &s3); // private (default class access) and declared without a body
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Fixture type: only the int constructor is public.
+  int a;
+  S4(); // private (default class access) and declared without a body
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Fixture type: private copy constructor, public default and int constructors.
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Fixture type: private default constructor, public copy and int constructors.
+  int a;
+  S6() : a(0) {} // private (default class access)
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4); // I is S4 in the instantiation requested from main()
+  C g(5); // C is S5 in that instantiation
+  int i;
+  int &j = i; // reference to i; named in firstprivate(j) below
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // shadows the function-level i inside this parallel region
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  return 0;
+}
+
+namespace A { // Holds the threadprivate global x.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x; main() uses that spelling in a firstprivate clause.
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4); // S4's copy constructor is private
+  S5 g(5); // S5's copy constructor is private
+  S3 m; // S3's copy constructor is public
+  S6 n(2); // S6's copy constructor is public; its default constructor is private
+  int i;
+  int &j = i; // reference to i; named in firstprivate(j) below
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // shadows the function-level i inside this parallel region
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_lastprivate_messages.cpp b/test/OpenMP/distribute_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000..0f96cb4
--- /dev/null
+++ b/test/OpenMP/distribute_simd_lastprivate_messages.cpp
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty function; used as a loop body in the test cases below.
+}
+
+bool foobool(int argc) { // No call to it appears in the part of this file visible in this chunk.
+  return argc; // implicit int-to-bool conversion
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Fixture type: public default constructor, copy constructor taking a non-const reference.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator =(const S2&) const; // const-qualified assignment, declared without a body
+  S2 &operator =(const S2&); // non-const assignment, declared without a body
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc; // defined immediately after the class
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 { // Fixture type: private copy assignment, copy constructor taking a non-const reference.
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 { // Fixture type: only the int constructor is public.
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4); // also private (default class access); declared without a body
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Fixture type: private default constructor, public copy and int constructors.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Fixture type: private default constructor, public copy and int constructors.
+  int a;
+  S6() : a(0) {} // private (default class access)
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4); // NOTE(review): I is presumably S4 (the diagnostics below name S4); the instantiation is outside this chunk
+  I g(5); // same type as e; template parameter C is not used in this function
+  int i;
+  int &j = i; // reference to i; named in lastprivate(j) below
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  int v = 0;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target
+#pragma omp teams private(i)
+#pragma omp distribute simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+void bar(S4 a[2]) { // Case with an array-typed parameter named in lastprivate; no diagnostic is annotated for it.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A { // Holds the threadprivate global x.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template 'distribute simd lastprivate' clause checks.
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i; // Loop counter reused by every loop in this function.
+  int &j = i; // Reference bound to 'i'; named in lastprivate(j) further down.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa; // Plain local int named in several clauses below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(i) // expected-note {{defined as lastprivate}}
+  for (i = 0; i < argc; ++i) // expected-error{{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be lastprivate, predetermined as linear}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(j) // OK: 'j' is the reference to 'i' declared above.
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si; // Function-local static, written inside the loop below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_linear_messages.cpp b/test/OpenMP/distribute_simd_linear_messages.cpp
new file mode 100644
index 0000000..c60e0a2
--- /dev/null
+++ b/test/OpenMP/distribute_simd_linear_messages.cpp
@@ -0,0 +1,338 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X { // Supplies X::x for the qualified-name cases in test_linear_colons().
+  int x;
+};
+
+struct B { // Shares its name with the local 'int B' in test_linear_colons().
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; } // Static member; separate from the free function bfoo().
+};
+
+int bfoo() { return 4; } // Free function; B::bfoo() is a separate static member returning 8.
+
+int z; // Global referenced as 'z' and '::z' in the linear clauses below.
+const int C1 = 1; // C1 and C2 are combined as 'C1+C2' in one linear clause below.
+const int C2 = 2;
+void test_linear_colons() // Checks how ':' and '::' are handled inside linear(...).
+{
+  int B = 0; // Local variable sharing its name with struct B.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B:bfoo()) // OK
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B:ib) // expected-error {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(z:B:ib) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B:B::bfoo()) // OK
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(X::x : ::z) // OK
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B,::z, X::x) // OK
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(::z) // OK
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B::ib,B:C1+C2) // OK
+  for (int i = 0; i < 10; ++i) ;
+}
+
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L; // expected-note {{'ind2' defined here}}
+  // main() instantiates this with T = double, so 'ind2' has a non-integral type.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(ind2:L) // expected-error {{argument of a linear clause should be of integral or pointer type}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T(); // 'sum' is accumulated above but a value-initialized T is returned.
+}
+
+template<int LEN> int test_warn() { // main() instantiates this as test_warn<0>().
+  int ind2 = 0; // Linear variable whose step is the template argument LEN.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd linear(ind2:LEN) // expected-warning {{zero linear step (ind2 should probably be const)}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Object of the incomplete type S1.
+class S2 { // Public default constructor; the data member is mutable.
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5]; // Const array of S2.
+class S3 { // Public default constructor; non-mutable data member.
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5]; // Const array of S3.
+class S4 { // Default constructor is private and only declared; S4(int) is public.
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // Default constructor is private but defined; S5(int) is public.
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // Global S3 object; marked threadprivate by the pragma on the next line.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template<class I, class C> int foomain(I argc, C **argv) { // Template variant of the linear-clause checks; main() calls foomain<int,char>.
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i; // Reference bound to 'i'; used both as a linear variable and as a step below.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc : 5) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (a, b:B::ib) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(e, g) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(i) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int v = 0;
+    int i; // Shadows the outer 'i'; used as the linear step for 'v'.
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd linear(v:i) // OK
+    for (int k = 0; k < argc; ++k) { i = k; v += i; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(j) // OK: uses the directive under test, matching the same case in main().
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(v:j) // OK
+  for (int k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(i) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+namespace A { // Declares a threadprivate double that main() names as C::x.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C { // Re-exports A::x through a using-declaration.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template linear-clause checks; also instantiates the templates above.
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i; // Reference bound to 'i'; named in linear(j) below.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (a, b) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(e, g) // expected-error {{argument of a linear clause should be of integral or pointer type, not 'S4'}} expected-error {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int i; // Shadows the outer 'i' inside the parallel region.
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd linear(i) // OK
+      for (int k = 0; k < argc; ++k) ++k;
+
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd linear(i : 4) // OK
+      for (int k = 0; k < argc; ++k) { ++k; i += 4; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(j) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(i) // OK
+  for (int k = 0; k < argc; ++k) ++k;
+
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_simd_loop_messages.cpp b/test/OpenMP/distribute_simd_loop_messages.cpp
new file mode 100644
index 0000000..b690055
--- /dev/null
+++ b/test/OpenMP/distribute_simd_loop_messages.cpp
@@ -0,0 +1,782 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+static int sii; // Made threadprivate below; used as a loop counter in test_iteration_spaces().
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii; // File-scope static with no threadprivate pragma; also used as a loop counter.
+
+int test_iteration_spaces() { // Canonical-loop-form checks for '#pragma omp distribute simd'.
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk; // Counters declared outside the loops under test.
+  float fii;
+  double dii;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i+=1) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (char i = 0; i < 10; i+='\1') {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+  #pragma omp distribute simd
+  for (long long i = 0; i < 10; i+=1.5) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (long long i = 0; i < 'z'; i+=1u) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{variable must be of integer or random access iterator type}}
+  #pragma omp distribute simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{variable must be of integer or random access iterator type}}
+  #pragma omp distribute simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+3 {{expression result unused}}
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (ii + 1;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (c[ii] = 0;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  // Ok: redundant parentheses around the loop variable are accepted.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (((ii)) = 0;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; ; i++)
+    c[i] = a[i];
+
+  // Ok: decrementing loop with a '>' condition.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+  // Ok: pre-increment with a '<' condition.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+  // Ok: the loop variable may be declared outside the loop.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ++ ++ ii)
+    c[ii] = a[ii];
+
+  // Ok but undefined behavior (in general, it cannot be checked that the
+  // increment is really loop-invariant).
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+  // Ok - step was converted to integer type.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+3 {{relational comparison result unused}}
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; jj > kk + 2)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+3 {{expression result unused}}
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; (ii) < 10; ii-=25)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; (ii < 10); ii-=0)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii > 10; (ii+=0))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; (ii) = (1-1)+(ii))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for ((ii = 0); ii > 10; (ii-=0))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; (ii < 10); (ii-=0))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+2  {{defined as private}}
+  // expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be private, predetermined as linear}}
+  #pragma omp distribute simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+3 {{unexpected OpenMP clause 'shared' in directive '#pragma omp distribute simd'}}
+  // expected-note@+2  {{defined as shared}}
+  // expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be shared, predetermined as linear}}
+  #pragma omp distribute simd shared(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd lastprivate(ii) linear(jj) collapse(2) // expected-note {{defined as linear}}
+  for (ii = 0; ii < 10; ii++)
+  for (jj = 0; jj < 10; jj++) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be linear, predetermined as lastprivate}}
+    c[ii] = a[jj];
+
+
+  #pragma omp parallel
+  {
+    #pragma omp target
+    #pragma omp teams
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be threadprivate or thread local, predetermined as linear}}
+    #pragma omp distribute simd
+    for (sii = 0; sii < 10; sii+=1)
+      c[sii] = a[sii];
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd
+    for (globalii = 0; globalii < 10; globalii+=1)
+      c[globalii] = a[globalii];
+  }
+
+  #pragma omp parallel
+  {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{statement after '#pragma omp distribute simd' must be a for loop}}
+  #pragma omp distribute simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int (*lb)[4] = nullptr; // Pointer-to-array bound for the pointer-typed loop below.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int (*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int a{0}; a<10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in openmp for-loops.
+namespace std { // Minimal stand-ins for the standard iterator utilities used by this test.
+struct random_access_iterator_tag { };
+template <class Iter> struct iterator_traits { // Forwards both typedefs from Iter itself.
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return last - first; } // Distance from first to last, matching std::distance.
+}
+class Iter0 { // Iterator-like class with no difference_type or iterator_category typedefs.
+  public:
+    Iter0() { }
+    Iter0(const Iter0 &) { }
+    Iter0 operator ++() { return *this; }
+    Iter0 operator --() { return *this; }
+    Iter0 operator + (int delta) { return *this; } // 'delta' is ignored.
+    bool operator <(Iter0 a) { return true; }
+};
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator -(Iter0 a, Iter0 b) { return 0; }
+class Iter1 { // Iterator-like class; no binary operator- taking Iter1 is declared in this file section.
+  public:
+    Iter1(float f=0.0f, double d=0.0) { } // Also serves as the default constructor.
+    Iter1(const Iter1 &) { }
+    Iter1 operator ++() { return *this; }
+    Iter1 operator --() { return *this; }
+    bool operator <(Iter1 a) { return true; }
+    bool operator >=(Iter1 a) { return false; }
+};
+class GoodIter { // Iterator-like class that declares the random-access iterator typedefs.
+  public:
+    GoodIter() { }
+    GoodIter(const GoodIter &) { }
+    GoodIter(int fst, int snd) { }
+    GoodIter &operator =(const GoodIter &that) { return *this; }
+    GoodIter &operator =(const Iter0 &that) { return *this; } // Allows assignment from an Iter0.
+    GoodIter &operator +=(int x) { return *this; }
+    explicit GoodIter(void *) { } // Explicit: only direct initialization from a pointer.
+    GoodIter operator ++() { return *this; }
+    GoodIter operator --() { return *this; }
+    bool operator !() { return true; }
+    bool operator <(GoodIter a) { return true; }
+    bool operator <=(GoodIter a) { return true; }
+    bool operator >=(GoodIter a) { return false; }
+    typedef int difference_type; // Read by the iterator_traits stub above.
+    typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator -(GoodIter a, GoodIter b) { return 0; } // Difference of two GoodIter values.
+// expected-note@+1 2 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator -(GoodIter a) { return a; } // Unary minus.
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator -(GoodIter a, int v) { return GoodIter(); } // Iterator minus integer offset.
+GoodIter operator +(GoodIter a, int v) { return GoodIter(); } // Iterator plus integer offset.
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator -(int v, GoodIter a) { return GoodIter(); } // Integer minus iterator.
+GoodIter operator +(int v, GoodIter a) { return GoodIter(); } // Integer plus iterator.
+
+int test_with_random_access_iterator() { // Runs 'distribute simd' loops over GoodIter, Iter0 and Iter1; the annotation comments above each loop list the diagnostics it must produce.
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(1,2); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (begin = GoodIter(1,2); begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+
+  #pragma omp target
+  #pragma omp teams
+  // The loop variable is declared with no initializer, so it is default-constructed.
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+
+  Iter1 begin1, end1;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+  // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+  #pragma omp distribute simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+
+  // The loop variable is default-constructed through Iter1's constructor whose parameters are all defaulted.
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+4 {{invalid operands to binary expression ('Iter1' and 'float')}}
+  // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+
+  return 0;
+}
+
+template <typename IT, int ST> class TC { // IT: loop variable type; ST: compile-time step used by the loops below.
+  public:
+    int dotest_lt(IT begin, IT end) { // Loops with '<' / '<=' conditions stepping by ST. NOTE(review): declared int but has no return statement; the first and last loops have no enclosing target/teams - confirm both are intentional.
+      // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+      // expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+      #pragma omp distribute simd
+      for (IT I = begin; I < end; I = I + ST) {
+        ++I;
+      }
+      #pragma omp target
+      #pragma omp teams
+      // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+      // expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+      #pragma omp distribute simd
+      for (IT I = begin; I <= end; I += ST) {
+        ++I;
+      }
+      #pragma omp distribute simd
+      for (IT I = begin; I < end; ++I) {
+        ++I;
+      }
+    }
+
+    static IT step() { // Returns the template step ST converted to IT.
+      return IT(ST);
+    }
+};
+template <typename IT, int ST=0> int dotest_gt(IT begin, IT end) { // Three loops with '>=' conditions, then one '<' loop stepping by TC<int,ST>::step(); ST defaults to 0.
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (IT I = begin; I < end; I+=TC<int,ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() { // Instantiates TC::dotest_lt and dotest_gt with several step values and iterator types.
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end); // ST = 100: no instantiation note is attached to this call.
+  t2.dotest_lt(begin, end); // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end); // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() { // 'break' directly in the associated loop body is diagnosed; 'break' inside a nested loop or switch is not.
+  const int N = 100;
+  float a[N], b[N], c[N];
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch(i) {
+      case 1:
+        b[i]++;
+        break;
+      default:
+        break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() { // 'try', 'throw' and 'return' inside the simd region are diagnosed; the 'throw' after the region is not.
+  const int N = 100;
+  float a[N], b[N], c[N];
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try { // expected-error {{'try' statement cannot be used in OpenMP simd region}}
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      }
+      throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+    catch (float f) {
+      if (f > 0.1)
+        throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch(i) {
+      case 1:
+        b[i]++;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+    struct S { // Local type declared inside the loop body.
+      void g() { throw 0; } // No diagnostic annotation is attached to this 'throw'.
+    };
+  }
+}
+
diff --git a/test/OpenMP/distribute_simd_misc_messages.c b/test/OpenMP/distribute_simd_misc_messages.c
new file mode 100644
index 0000000..5fc2cb6
--- /dev/null
+++ b/test/OpenMP/distribute_simd_misc_messages.c
@@ -0,0 +1,1108 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute simd'}}
+#pragma omp distribute simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute simd'}}
+#pragma omp distribute simd foo
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute simd'}}
+#pragma omp distribute simd safelen(4)
+
+void test_no_clause() { // Directive without clauses: once followed by a for loop, once by a non-loop statement.
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{statement after '#pragma omp distribute simd' must be a for loop}}
+#pragma omp distribute simd
+  ++i;
+}
+
+void test_branch_protected_scope() { // goto/return across the region boundary are diagnosed; a goto that stays inside the region is not.
+  int i = 0;
+L1: // Label declared outside the OpenMP region.
+  ++i;
+
+  int x[24];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2: // Label declared inside the OpenMP region.
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() { // Unknown identifiers after the directive name produce an "extra tokens" warning.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() { // Stray ';' and ',' tokens on the directive line produce an "extra tokens" warning.
+  int i, x;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd;
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+void test_safelen() { // 'safelen' clause: malformed parentheses/argument lists, non-constant arguments and non-positive values.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// xxpected-error@+1 {{expected expression}}
+#pragma omp distribute simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() { // 'simdlen' clause: malformed parentheses/argument lists, non-constant arguments and non-positive values.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() { // simdlen(6) with safelen(5) is rejected in either clause order.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_collapse() { // 'collapse' clause: malformed arguments, loop-nest depth mismatches, non-positive values, and interaction with reduction.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// xxpected-error@+1 {{expected expression}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-note@+3 {{defined as reduction}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(2) reduction(+ : i)
+  for (i = 0; i < 16; ++i)
+    // expected-note@+1 {{variable with automatic storage duration is predetermined as private; perhaps you forget to enclose 'omp for' directive into a parallel or another task region?}}
+    for (int j = 0; j < 16; ++j)
+// expected-error@+2 2 {{reduction variable must be shared}}
+// expected-error@+1 {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp for reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+
+#pragma omp target
+#pragma omp teams
+  for (i = 0; i < 16; ++i)
+    for (int j = 0; j < 16; ++j)
+#pragma omp distribute simd reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_linear() { // 'linear' clause: malformed lists, undeclared names, step expressions, and conflicts with private/lastprivate/linear.
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd linear(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute simd linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute simd linear(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute simd linear(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y; // Declared only here, so the earlier uses of x and y in this function are undeclared.
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be linear}}
+#pragma omp distribute simd linear(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as private}}
+// expected-error@+1 {{private variable cannot be linear}}
+#pragma omp distribute simd private(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be private}}
+#pragma omp distribute simd linear(x) private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{zero linear step (x and other variables in clause should probably be const)}}
+#pragma omp distribute simd linear(x, y : 0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be lastprivate}}
+#pragma omp distribute simd linear(x) lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as lastprivate}}
+// expected-error@+1 {{lastprivate variable cannot be linear}}
+#pragma omp distribute simd lastprivate(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_aligned() { // Parse and semantic checks for the 'aligned' clause on 'distribute simd'.
+  int i; // Loop counter used by every loop in this function.
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd aligned(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int *x, y, z[25]; // expected-note 4 {{'y' defined here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x) // OK: x is a pointer.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(z) // OK: z is an array.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x : 1) // OK: literal alignment value.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x : 2 * 2) // OK: alignment written as a constant expression.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as aligned}}
+// expected-error@+1 {{a variable cannot appear in more than one aligned clause}}
+#pragma omp distribute simd aligned(x) aligned(z, x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{defined as aligned}}
+// expected-error@+2 {{a variable cannot appear in more than one aligned clause}}
+// expected-error@+1 2 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute simd aligned(x, y, z) aligned(y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_private() { // Parse checks for the 'private' clause, followed by well-formed uses.
+  int i; // Loop counter used by every loop in this function.
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // Variables for the well-formed clauses below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(x) // OK: one variable.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(x, y) // OK: two variables.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(x, y, z) // OK: three variables, all used in the loop body.
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_firstprivate() { // Single parse check: an unterminated 'firstprivate(' clause.
+  int i; // Loop counter.
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_lastprivate() { // Parse checks for the 'lastprivate' clause, followed by well-formed uses.
+  int i; // Loop counter used by every loop in this function.
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // Variables for the well-formed clauses below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(x) // OK: one variable.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(x, y) // OK: two variables.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(x, y, z) // OK: three variables.
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_reduction() { // Parse and semantic checks for the 'reduction' clause on 'distribute simd'.
+  int i, x, y; // i is the loop counter; x and y appear as reduction list items.
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected identifier}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected identifier}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected identifier}}
+#pragma omp distribute simd reduction( : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected identifier}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected expression}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(+
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// Placeholder line: keeps the '@+3' offset above valid; no "missing ':'" warning is listed for this case.
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+:
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+ :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+ :, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+ : x, + : y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected identifier}}
+#pragma omp distribute simd reduction(% : x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : x) // OK: this and the nine clauses that follow list no diagnostics.
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(* : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(& : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(| : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(|| : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(min : x)
+  for (i = 0; i < 16; ++i)
+    ;
+  struct X { // Local aggregate used to test a member access as a reduction list item.
+    int x;
+  };
+  struct X X; // Object deliberately named like its struct tag.
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd reduction(+ : X.x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd reduction(+ : x + x)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() { // Loop control variables of floating-point type are diagnosed.
+  float a[100], b[100], c[100]; // Arrays touched by the loop bodies.
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute simd
+  for (float fi = 0; fi < 10.0; fi++) { // 'float' loop variable: target of the error above.
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute simd
+  for (double fi = 0; fi < 10.0; fi++) { // 'double' loop variable: target of the error above.
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
+void linear_modifiers(int argc) { // Modifiers in 'linear': plain and 'val' pass; 'uval', 'ref', 'foo' are diagnosed.
+  int f; // The variable named in every linear clause below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(f)
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(val(f))
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(uval(f)) // expected-error {{expected 'val' modifier}}
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(ref(f)) // expected-error {{expected 'val' modifier}}
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(foo(f)) // expected-error {{expected 'val' modifier}}
+  for (int k = 0; k < argc; ++k) ++k;
+}
+
diff --git a/test/OpenMP/distribute_simd_private_messages.cpp b/test/OpenMP/distribute_simd_private_messages.cpp
new file mode 100644
index 0000000..c777c99
--- /dev/null
+++ b/test/OpenMP/distribute_simd_private_messages.cpp
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Intentionally empty helper.
+}
+
+bool foobool(int argc) {
+  return argc; // Implicit int-to-bool conversion: true when argc is non-zero.
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Object of the incomplete type S1; used to trigger the incomplete-type error.
+class S2 { // Class with a public default constructor and a mutable member.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // Const instance; named in 'private(a, b)' below.
+const S2 ba[5]; // Const array of S2.
+class S3 { // Class with a public default constructor; the type of the threadprivate 'h' below.
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5]; // Const array of S3.
+class S4 { // The default constructor is private; only S4(int) is public.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) { // Privatizes member 'a', named both directly and through 'this'.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 { // The default constructor is private; only S5(int) and operator= are public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) { // 's.a' belongs to another object, so the private clause rejects it.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 { // Template counterpart of the S4/S5 member-privatization checks.
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) { // Privatizes member 'a', named both directly and through 'this'.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) { // 's.a' belongs to another object, so the private clause rejects it.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T { // Derives from T and also holds a T member, both reachable under the name 'a'.
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) { // Privatizes the own member and the base-class member 'T::a'.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // Both 's.a' and 's.T::a' belong to another object and are rejected.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h; // Made threadprivate by the directive on the next line.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) { // Private-clause checks in a template; instantiated from main() as foomain<int, char>.
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i; // Reference to i; named in 'private(j)' near the end.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc) // OK: a function parameter.
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(e, g) // OK here: e and g have the template type I.
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Shadows the outer i inside this parallel region.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(j) // j refers to i, which the enclosing regions list as shared, then private.
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x; // Made threadprivate by the directive on the next line.
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // Makes the threadprivate A::x nameable as B::x (used in main()).
+}
+
+int main(int argc, char **argv) { // Non-template private-clause checks; also instantiates S6, S7 and foomain.
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i; // Reference to i; named in 'private(j)' near the end.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc) // OK: a function parameter.
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i; // Shadows the outer i inside this parallel region.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(j) // j refers to i, which the enclosing regions list as shared, then private.
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m; // Static local; privatized in the loop below.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/distribute_simd_reduction_messages.cpp b/test/OpenMP/distribute_simd_reduction_messages.cpp
new file mode 100644
index 0000000..e03b852
--- /dev/null
+++ b/test/OpenMP/distribute_simd_reduction_messages.cpp
@@ -0,0 +1,441 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() { // Empty helper called from the loop bodies in this test.
+}
+
+bool foobool(int argc) {
+  return argc; // Implicit int-to-bool conversion: true when argc is non-zero.
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a; // Object of the incomplete type S1; used to trigger the incomplete-type error.
+class S2 { // operator+ is private here, which the reduction(+ : ...) checks rely on.
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // Copy constructor taking a non-const reference.
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 { // Public constructors and a member operator+ that returns its argument by value.
+  int a;
+
+public:
+  int b; // Public member; named as 'h.b' in a reduction clause in tmain().
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; } // Free operator+ on S3 operands that yields int, not S3.
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 { // Default/copy constructors and operator+ are private; only S4(int) is public.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4); // Private copy constructor (declaration only).
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; } // Returns its left operand unchanged.
+class S5 { // Default/copy constructors and operator+ are private; only S5(int) is public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg); // Private operator+ (declaration only).
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; } // Implicit conversion to int.
+} o; // Global instance; named in 'reduction(+ : o)' in tmain().
+
+S3 h, k; // Only h is made threadprivate (next line); k stays an ordinary global.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) { // Reduction-clause checks inside a function template.
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : argc) // OK: no diagnostics listed.
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl) // OK: fl is private in the enclosing parallel region.
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl) // OK: fl is a reduction item of the enclosing parallel region.
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A { // Declares x and marks it threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Re-exports A::x so that it can be named as B::x.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template 'reduction' checks; also instantiates tmain<int> and tmain<float>.
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : argc) // OK
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl) // OK
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl) // OK
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : m) // OK: 'm' is a static local, no diagnostics here
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_safelen_messages.cpp b/test/OpenMP/distribute_simd_safelen_messages.cpp
new file mode 100644
index 0000000..4ae35fb
--- /dev/null
+++ b/test/OpenMP/distribute_simd_safelen_messages.cpp
@@ -0,0 +1,177 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty helper; main() places a call to it after a directive that requires a for loop.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr, so calls to it are rejected inside the clause constant expressions below.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argc  // expected-note {{to match this '('}} expected-error 2 {{expression is not an integral constant expression}} expected-note 2 {{read of non-const variable 'argc' is not allowed in a constant expression}} expected-error {{expected ')'}}
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (ST // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = ST; i < N; i++)
+     argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ((ST > 0) ? 1 + ST : 2) // OK: yields 1 + ST or 2, positive in both instantiations made by main()
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'safelen' clause}} expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (4) // OK
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template 'safelen' checks on 'distribute simd'; also instantiates tmain twice.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+  
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'safelen' clause}} expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // expected-error {{statement after '#pragma omp distribute simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_simd_simdlen_messages.cpp b/test/OpenMP/distribute_simd_simdlen_messages.cpp
new file mode 100644
index 0000000..4ae35fb
--- /dev/null
+++ b/test/OpenMP/distribute_simd_simdlen_messages.cpp
@@ -0,0 +1,177 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty helper; main() places a call to it after a directive that requires a for loop.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr, so calls to it are rejected inside the clause constant expressions below.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (argc  // expected-note {{to match this '('}} expected-error 2 {{expression is not an integral constant expression}} expected-note 2 {{read of non-const variable 'argc' is not allowed in a constant expression}} expected-error {{expected ')'}}
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (ST // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = ST; i < N; i++)
+     argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen ((ST > 0) ? 1 + ST : 2) // OK: yields 1 + ST or 2, positive in both instantiations made by main()
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // expected-error 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'simdlen' clause}} expected-error 2 {{argument to 'simdlen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (4) // OK
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (N) // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template 'simdlen' checks on 'distribute simd'; also instantiates tmain twice.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+  
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // expected-error 2 {{argument to 'simdlen' clause must be a strictly positive integer value}} expected-error 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'simdlen' clause}} expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen(simdlen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // expected-error {{statement after '#pragma omp distribute simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/driver.c b/test/OpenMP/driver.c
index f84541b..74aaea5 100644
--- a/test/OpenMP/driver.c
+++ b/test/OpenMP/driver.c
@@ -8,3 +8,22 @@
 // CHECK-NO-TLS: -cc1
 // CHECK-NO-TLS-SAME: -fnoopenmp-use-tls
 //
+// RUN: %clang %s -c -E -dM -fopenmp=libomp | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=1 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=0 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=100 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=31 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// CHECK-DEFAULT-VERSION: #define _OPENMP 201107
+
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=40 | FileCheck --check-prefix=CHECK-40-VERSION %s
+// CHECK-40-VERSION: #define _OPENMP 201307
+
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=45 | FileCheck --check-prefix=CHECK-45-VERSION %s
+// CHECK-45-VERSION: #define _OPENMP 201511
+
+// RUN: %clang %s -c -E -dM -fopenmp-version=1 | FileCheck --check-prefix=CHECK-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp-version=31 | FileCheck --check-prefix=CHECK-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp-version=40 | FileCheck --check-prefix=CHECK-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp-version=45 | FileCheck --check-prefix=CHECK-VERSION %s
+// CHECK-VERSION-NOT: #define _OPENMP
+
diff --git a/test/OpenMP/dump.cpp b/test/OpenMP/dump.cpp
new file mode 100644
index 0000000..378b53c
--- /dev/null
+++ b/test/OpenMP/dump.cpp
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-dump %s | FileCheck %s
+// expected-no-diagnostics
+
+int ga, gb; // Globals listed in the threadprivate directive on the next line.
+#pragma omp threadprivate(ga, gb)
+
+// CHECK:      |-OMPThreadPrivateDecl {{.+}} <col:9> col:9
+// CHECK-NEXT: | |-DeclRefExpr {{.+}} <col:27> 'int' lvalue Var {{.+}} 'ga' 'int'
+// CHECK-NEXT: | `-DeclRefExpr {{.+}} <col:31> 'int' lvalue Var {{.+}} 'gb' 'int'
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in) // '+' reduction for int and char; the combiner multiplies.
+
+#pragma omp declare reduction(fun : float : omp_out += omp_in) initializer(omp_priv = omp_orig + 15) // 'fun' reduction for float with an explicit initializer.
+
+// CHECK:      |-OMPDeclareReductionDecl {{.+}} <line:11:35> col:35 operator+ 'int' combiner
+// CHECK-NEXT: | |-CompoundAssignOperator {{.+}} <col:47, col:58> 'int' lvalue '*=' ComputeLHSTy='int' ComputeResultTy='int'
+// CHECK-NEXT: | | |-DeclRefExpr {{.+}} <col:47> 'int' lvalue Var {{.+}} 'omp_out' 'int'
+// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} <col:58> 'int' <LValueToRValue>
+// CHECK-NEXT: | |   `-DeclRefExpr {{.+}} <col:58> 'int' lvalue Var {{.+}} 'omp_in' 'int'
+// CHECK-NEXT: | |-VarDecl {{.+}} <col:35> col:35 implicit used omp_in 'int'
+// CHECK-NEXT: | `-VarDecl {{.+}} <col:35> col:35 implicit used omp_out 'int'
+// CHECK-NEXT: |-OMPDeclareReductionDecl {{.+}} <col:40> col:40 operator+ 'char' combiner
+// CHECK-NEXT: | |-CompoundAssignOperator {{.+}} <col:47, col:58> 'char' lvalue '*=' ComputeLHSTy='int' ComputeResultTy='int'
+// CHECK-NEXT: | | |-DeclRefExpr {{.+}} <col:47> 'char' lvalue Var {{.+}} 'omp_out' 'char'
+// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} <col:58> 'int' <IntegralCast>
+// CHECK-NEXT: | |   `-ImplicitCastExpr {{.+}} <col:58> 'char' <LValueToRValue>
+// CHECK-NEXT: | |     `-DeclRefExpr {{.+}} <col:58> 'char' lvalue Var {{.+}} 'omp_in' 'char'
+// CHECK-NEXT: | |-VarDecl {{.+}} <col:40> col:40 implicit used omp_in 'char'
+// CHECK-NEXT: | `-VarDecl {{.+}} <col:40> col:40 implicit used omp_out 'char'
+// CHECK-NEXT: |-OMPDeclareReductionDecl {{.+}} <line:13:37> col:37 fun 'float' combiner initializer
+// CHECK-NEXT: | |-CompoundAssignOperator {{.+}} <col:45, col:56> 'float' lvalue '+=' ComputeLHSTy='float' ComputeResultTy='float'
+// CHECK-NEXT: | | |-DeclRefExpr {{.+}} <col:45> 'float' lvalue Var {{.+}} 'omp_out' 'float'
+// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} <col:56> 'float' <LValueToRValue>
+// CHECK-NEXT: | |   `-DeclRefExpr {{.+}} <col:56> 'float' lvalue Var {{.+}} 'omp_in' 'float'
+
+struct S { // The constructor hosts a 'parallel for' whose clauses name the members a and b.
+  int a, b;
+  S() {
+#pragma omp parallel for default(none) private(a) shared(b) schedule(static, a)
+    for (int i = 0; i < 0; ++i)
+      ++a;
+  }
+};
+
+// CHECK:      |     `-OMPParallelForDirective {{.+}} <line:39:9, col:80>
+// CHECK-NEXT: |       |-OMPDefaultClause {{.+}} <col:26, col:40>
+// CHECK-NEXT: |       |-OMPPrivateClause {{.+}} <col:40, col:51>
+// CHECK-NEXT: |       | `-DeclRefExpr {{.+}} <col:48> 'int' lvalue OMPCapturedExpr {{.+}} 'a' 'int &'
+// CHECK-NEXT: |       |-OMPSharedClause {{.+}} <col:51, col:61>
+// CHECK-NEXT: |       | `-MemberExpr {{.+}} <col:58> 'int' lvalue ->b
+// CHECK-NEXT: |       |   `-CXXThisExpr {{.+}} <col:58> 'struct S *' this
+// CHECK-NEXT: |       |-OMPScheduleClause {{.+}} <col:61, col:79>
+// CHECK-NEXT: |       | `-ImplicitCastExpr {{.+}} <col:78> 'int' <LValueToRValue>
+// CHECK-NEXT: |       |   `-DeclRefExpr {{.+}} <col:78> 'int' lvalue OMPCapturedExpr {{.+}} '.capture_expr.' 'int'
+// CHECK-NEXT: |       |-CapturedStmt {{.+}} <line:40:5, <invalid sloc>>
+// CHECK-NEXT: |       | |-CapturedDecl {{.+}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       | | |-ForStmt {{.+}} <col:5, <invalid sloc>>
+// CHECK:      |       | | | `-UnaryOperator {{.+}} <line:41:7, <invalid sloc>> 'int' lvalue prefix '++'
+// CHECK-NEXT: |       | | |   `-DeclRefExpr {{.+}} <<invalid sloc>> 'int' lvalue OMPCapturedExpr {{.+}} 'a' 'int &'
+
+#pragma omp declare simd
+#pragma omp declare simd inbranch
+void foo(); // Declaration that both 'declare simd' directives above apply to.
+
+// CHECK:      `-FunctionDecl {{.+}} <line:63:1, col:10> col:6 foo 'void (void)'
+// CHECK-NEXT:   |-OMPDeclareSimdDeclAttr {{.+}} <line:62:9, col:34> Implicit BS_Inbranch
+// CHECK:        `-OMPDeclareSimdDeclAttr {{.+}} <line:61:9, col:25> Implicit BS_Undefined
+
diff --git a/test/OpenMP/for_ast_print.cpp b/test/OpenMP/for_ast_print.cpp
index 8fd82e7..182b395 100644
--- a/test/OpenMP/for_ast_print.cpp
+++ b/test/OpenMP/for_ast_print.cpp
@@ -8,6 +8,94 @@
 
 void foo() {}
 
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  T &b;
+  typename T::type c:12;
+  typename T::type &d;
+  S7() : a(0), b(a), c(0), d(a.a) {}
+
+public:
+  S7(typename T::type v) : a(v), b(a), c(v), d(a.a) {
+#pragma omp for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a) lastprivate(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for linear(val(c))
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for linear(uval(this->b))
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a) lastprivate(this->S::a)
+// CHECK: #pragma omp for linear(val(this->c))
+// CHECK: #pragma omp for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a) lastprivate(T::a)
+// CHECK: #pragma omp for linear(val(this->c))
+// CHECK: #pragma omp for private(this->a) private(this->a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a)
+// CHECK: #pragma omp for linear(uval(this->b))
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a) lastprivate(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for linear(ref(S7<S>::d))
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for linear(this->c)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a) lastprivate(this->S7<S>::a)
+// CHECK: #pragma omp for linear(ref(this->S7<S>::d))
+// CHECK: #pragma omp for private(this->a) private(this->a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a)
+// CHECK: #pragma omp for linear(this->c)
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, g;
@@ -68,6 +156,18 @@
   // CHECK-NEXT: for (int i = 0; i < 10; ++i)
   // CHECK-NEXT: for (int j = 0; j < 10; ++j)
   // CHECK-NEXT: foo();
+  char buf[9] = "01234567";
+  char *p, *q;
+#pragma omp parallel
+#pragma omp for
+  for (p = buf; p < &buf[8]; p++)
+    for (q = &buf[0]; q <= buf + 7; q++)
+      foo();
+  // CHECK: #pragma omp parallel
+  // CHECK-NEXT: #pragma omp for
+  // CHECK-NEXT: for (p = buf; p < &buf[8]; p++)
+  // CHECK-NEXT: for (q = &buf[0]; q <= buf + 7; q++)
+  // CHECK-NEXT: foo();
   return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
 }
 
diff --git a/test/OpenMP/for_codegen.cpp b/test/OpenMP/for_codegen.cpp
index 98761f5..1d24403 100644
--- a/test/OpenMP/for_codegen.cpp
+++ b/test/OpenMP/for_codegen.cpp
@@ -98,8 +98,8 @@
 // CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void static_chunked(float *a, float *b, float *c, float *d) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
-  #pragma omp for schedule(static, 5)
-// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
+  #pragma omp for schedule(monotonic: static, 5)
+// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 536870945, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
 // UB = min(UB, GlobalUB)
 // CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
 // CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
@@ -158,8 +158,8 @@
 // CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void dynamic1(float *a, float *b, float *c, float *d) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
-  #pragma omp for schedule(dynamic)
-// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
+  #pragma omp for schedule(nonmonotonic: dynamic)
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 1073741859, i64 0, i64 16908287, i64 1, i64 1)
 //
 // CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
 // CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
@@ -327,12 +327,13 @@
 // CHECK-LABEL: test_precond
 void test_precond() {
   // CHECK: [[A_ADDR:%.+]] = alloca i8,
+  // CHECK: [[CAP:%.+]] = alloca i8,
   // CHECK: [[I_ADDR:%.+]] = alloca i8,
   char a = 0;
   // CHECK: store i8 0,
   // CHECK: store i32
   // CHECK: store i8
-  // CHECK: [[A:%.+]] = load i8, i8* [[A_ADDR]],
+  // CHECK: [[A:%.+]] = load i8, i8* [[CAP]],
   // CHECK: [[CONV:%.+]] = sext i8 [[A]] to i32
   // CHECK: [[CMP:%.+]] = icmp slt i32 [[CONV]], 10
   // CHECK: br i1 [[CMP]], label %[[PRECOND_THEN:[^,]+]], label %[[PRECOND_END:[^,]+]]
@@ -491,4 +492,25 @@
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
 
+
+// CHECK-LABEL: fint
+// CHECK: call {{.*}}i32 {{.*}}ftemplate
+// CHECK: ret i32
+
+// CHECK: load i16, i16*
+// CHECK: store i16 %
+// CHECK: call void {{.+}}@__kmpc_fork_call(
+// CHECK: call void @__kmpc_for_static_init_4(
+template <typename T>
+T ftemplate() {
+  short aa = 0;
+
+#pragma omp parallel for schedule(static, aa)
+  for (int i = 0; i < 100; i++) {
+  }
+  return T();
+}
+
+int fint(void) { return ftemplate<int>(); }
+
 #endif // HEADER
diff --git a/test/OpenMP/for_collapse_messages.cpp b/test/OpenMP/for_collapse_messages.cpp
index d40c305..a6fdf00 100644
--- a/test/OpenMP/for_collapse_messages.cpp
+++ b/test/OpenMP/for_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp for', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp for' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for', but found only 1}}
   #pragma omp for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for', but found only 1}}
-  #pragma omp for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp for collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for' must be a for loop}}
diff --git a/test/OpenMP/for_firstprivate_codegen.cpp b/test/OpenMP/for_firstprivate_codegen.cpp
index 01a9355..3c6f372 100644
--- a/test/OpenMP/for_firstprivate_codegen.cpp
+++ b/test/OpenMP/for_firstprivate_codegen.cpp
@@ -95,7 +95,7 @@
     // LAMBDA: [[SIVAR2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR2_PRIVATE_ADDR_REF]]
     // LAMBDA: store i{{[0-9]+}} [[SIVAR2_VAL]], i{{[0-9]+}}* [[SIVAR2_PRIVATE_ADDR]]
 
-    // LAMBDA: call void @__kmpc_barrier(
+    // LAMBDA-NOT: call void @__kmpc_barrier(
     g = 1;
     g1 = 1;
     sivar = 2;
@@ -158,7 +158,7 @@
     // BLOCKS: [[SIVAR2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF_ADDRR]]
     // BLOCKS: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[SIVAR2_PRIVATE_ADDR]]
 
-    // BLOCKS: call void @__kmpc_barrier(
+    // BLOCKS-NOT: call void @__kmpc_barrier(
     g = 1;
     g1 =1;
     sivar = 2;
@@ -246,7 +246,7 @@
 // CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIV]]
 
 // Synchronization for initialization.
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
@@ -262,31 +262,38 @@
 
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TVAR:%.+]] = alloca i32,
+// CHECK: [[TVAR_CAST:%.+]] = alloca i64,
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: [[TVAR_VAL:%.+]] = load i32, i32* [[TVAR]],
+// CHECK: [[TVAR_CONV:%.+]] = bitcast i64* [[TVAR_CAST]] to i32*
+// CHECK: store i32 [[TVAR_VAL]], i32* [[TVAR_CONV]],
+// CHECK: [[PVT_CASTVAL:%[^,]+]] = load i64, i64* [[TVAR_CAST]],
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void  (i32*, i32*, ...)*), i64 [[PVT_CASTVAL]],
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
-// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 {{.*}}%{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
 // Skip temp vars for loop
-// CHECK: alloca i{{[0-9]+}},
-// CHECK: alloca i{{[0-9]+}},
-// CHECK: alloca i{{[0-9]+}},
-// CHECK: alloca i{{[0-9]+}},
-// CHECK: alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: %{{.+}} = bitcast i64* [[T_VAR_PRIV]] to i32*
 
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
+// CHECK: [[VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 
 // firstprivate t_var(t_var)
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
-// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK-NOT: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
 
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
@@ -310,10 +317,8 @@
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// Synchronization for initialization.
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// No synchronization for initialization.
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
diff --git a/test/OpenMP/for_firstprivate_messages.cpp b/test/OpenMP/for_firstprivate_messages.cpp
index 1933de2..60be4f5 100644
--- a/test/OpenMP/for_firstprivate_messages.cpp
+++ b/test/OpenMP/for_firstprivate_messages.cpp
@@ -143,7 +143,7 @@
     foo();
 #pragma omp parallel reduction(+ : i) // expected-note {{defined as reduction}}
 #pragma omp for firstprivate(i)       // expected-error {{firstprivate variable must be shared}}
-  for (i = 0; i < argc; ++i)
+  for (int k = 0; k < argc; ++k)
     foo();
   return 0;
 }
diff --git a/test/OpenMP/for_lastprivate_codegen.cpp b/test/OpenMP/for_lastprivate_codegen.cpp
index 36abd8d..2b1d6c3 100644
--- a/test/OpenMP/for_lastprivate_codegen.cpp
+++ b/test/OpenMP/for_lastprivate_codegen.cpp
@@ -8,6 +8,115 @@
 #ifndef HEADER
 #define HEADER
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel
+#pragma omp for lastprivate(a, b, c)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for lastprivate(a, b, c)
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+        --this->b;
+        (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for lastprivate(a, b, c)
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#else
+      ++this->a, --b, c /= 1;
+#endif
+#pragma omp for
+    for (a = 0; a < 2; ++a)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for lastprivate(b)
+        for (b = 0; b < 2; ++b)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+        --this->b;
+        (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for
+        for (c = 0; c < 2; ++c)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#else
+      ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template <typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        [&]() {
+          ++this->a;
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ^{
+          ++a;
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#else
+      ++(this)->a;
+#endif
+#pragma omp for
+    for (a = 0; a < 2; ++a)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a;
+#pragma omp parallel
+#pragma omp for
+        for (a = 0; a < 2; ++(this)->a)
+          ++(this)->a;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+#pragma omp parallel
+#pragma omp for
+        for (this->a = 0; a < 2; ++a)
+          ++(this)->a;
+      }();
+#else
+      ++(this)->a;
+#endif
+  }
+};
+
 template <class T>
 struct S {
   T f;
@@ -23,6 +132,9 @@
 float f;
 char cnt;
 
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
 // CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
@@ -32,6 +144,7 @@
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T();
   T vec[] __attribute__((aligned(128))) = {1, 2};
   S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
@@ -54,17 +167,75 @@
 
 int main() {
   static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* %{{.+}})
 #pragma omp parallel
 #pragma omp for lastprivate(g, g1, sivar)
   for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: call void {{.+}} [[SS_LAMBDA:@[^ ]+]]
+    // LAMBDA: call void @__kmpc_for_static_fini(%
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: store i8 %{{.+}}, i8* [[B_REF]],
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) [[SIVAR:%.+]])
     // LAMBDA: alloca i{{[0-9]+}},
     // LAMBDA: alloca i{{[0-9]+}},
@@ -128,6 +299,7 @@
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -191,6 +363,60 @@
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: call void
+// BLOCKS: call void @__kmpc_for_static_fini(%
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+// BLOCKS: call{{.*}} void
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: store i8 %{{.+}}, i8* [[B_REF]],
+// BLOCKS: br label
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: br label
+// BLOCKS: ret void
 #else
   S<float> test;
   int t_var = 0;
@@ -396,20 +622,9 @@
 // CHECK: br i1 [[IS_LAST_ITER:%.+]], label %[[LAST_THEN:.+]], label %[[LAST_DONE:.+]]
 // CHECK: [[LAST_THEN]]
 
-// Calculate last iter count
-// CHECK: store i32 1, i32* [[OMP_IV]]
-// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 [[IV1_1]], 1
-// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[OMP_IV]]
-// Actual copying.
-
-// original cnt=private_cnt;
 // Calculate private cnt value.
-// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK: [[MUL:%.+]] = mul nsw i32 [[IV1_1]], 1
-// CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]]
-// CHECK: [[CONV:%.+]] = trunc i32 [[ADD]] to i8
-// CHECK: store i8 [[CONV]], i8* [[CNT_PRIV]]
+// CHECK: store i8 2, i8* [[CNT_PRIV]]
+// original cnt=private_cnt;
 // CHECK: [[CNT_VAL:%.+]] = load i8, i8* [[CNT_PRIV]],
 // CHECK: store i8 [[CNT_VAL]], i8* [[CNT]],
 
@@ -425,7 +640,52 @@
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
-//
+
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void @__kmpc_for_static_fini(%
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: br i1
+// CHECK: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// CHECK: store i8 %{{.+}}, i8* [[B_REF]],
+// CHECK: br label
+// CHECK: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
diff --git a/test/OpenMP/for_linear_codegen.cpp b/test/OpenMP/for_linear_codegen.cpp
index db97883..0ad45f5 100644
--- a/test/OpenMP/for_linear_codegen.cpp
+++ b/test/OpenMP/for_linear_codegen.cpp
@@ -23,6 +23,74 @@
 float f;
 char cnt;
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel
+#pragma omp for linear(a, b, c)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for linear(a, b) linear(ref(c))
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+        --this->b;
+        (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for linear(a, b) linear(uval(c))
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#else
+      ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template <typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel
+#pragma omp for linear(a)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        [&]() {
+          ++this->a;
+#pragma omp parallel
+#pragma omp for linear(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ^{
+          ++a;
+#pragma omp parallel
+#pragma omp for linear(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#else
+      ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
 // CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
@@ -31,6 +99,7 @@
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T *pvar = &test.f;
   T &lvar = test.f;
 #pragma omp parallel
@@ -42,16 +111,75 @@
 }
 
 int main() {
+  static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 0, {{.+}}* [[OMP_REGION:@.+]] to {{.+}})
 #pragma omp parallel
 #pragma omp for linear(g, g1:5)
   for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: store i8 %{{.+}}, i8* [[B_REF]],
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
     // LAMBDA: alloca i{{[0-9]+}},
     // LAMBDA: [[G_START_ADDR:%.+]] = alloca i{{[0-9]+}},
@@ -96,6 +224,7 @@
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -146,6 +275,60 @@
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+// BLOCKS: call{{.*}} void
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: store i8 %{{.+}}, i8* [[B_REF]],
+// BLOCKS: br label
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: br label
+// BLOCKS: ret void
 #else
   S<float> test;
   float *pvar = &test.f;
@@ -216,7 +399,51 @@
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 2, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32**, i32*)* [[TMAIN_MICROTASK:@.+]] to void
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
-//
+
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: br i1
+// CHECK: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// CHECK: store i8 %{{.+}}, i8* [[B_REF]],
+// CHECK: br label
+// CHECK: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32** dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}})
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: [[PVAR_START:%.+]] = alloca i32*,
diff --git a/test/OpenMP/for_linear_messages.cpp b/test/OpenMP/for_linear_messages.cpp
index 39fb21e..ab89349 100644
--- a/test/OpenMP/for_linear_messages.cpp
+++ b/test/OpenMP/for_linear_messages.cpp
@@ -212,7 +212,7 @@
   #pragma omp for linear(i) ordered(1) // expected-error {{'linear' clause cannot be specified along with 'ordered' clause with a parameter}}
   for (int k = 0; k < argc; ++k) ++k;
 
-  foomain<int,char>(argc,argv);
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/for_loop_messages.cpp b/test/OpenMP/for_loop_messages.cpp
index 895baf5..bb58a77 100644
--- a/test/OpenMP/for_loop_messages.cpp
+++ b/test/OpenMP/for_loop_messages.cpp
@@ -426,12 +426,25 @@
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+class GoodIter1 {
+public:
+  GoodIter1() {}
+  GoodIter1(const GoodIter1 &) {}
+  GoodIter1 &operator++(int) { return *this; }
+  GoodIter1 &operator=(const GoodIter1 &that) { return *this; }
+  GoodIter1 &operator+=(int x) { return *this; }
+  friend long operator-(const GoodIter1 &, const GoodIter1 &);
+  GoodIter1 &operator-(int) { return *this; }
+  bool operator<(GoodIter1 a) { return true; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -482,7 +495,7 @@
 #pragma omp for
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel
 #pragma omp for
@@ -572,6 +585,10 @@
 #pragma omp for
   for (Iter1 I; I < end1; ++I) {
   }
+  GoodIter1 I1, E1;
+#pragma omp for
+  for (GoodIter1 I = I1; I < E1; I++)
+    ;
   return 0;
 }
 
diff --git a/test/OpenMP/for_ordered_clause.cpp b/test/OpenMP/for_ordered_clause.cpp
index 8af509a..3335f40 100644
--- a/test/OpenMP/for_ordered_clause.cpp
+++ b/test/OpenMP/for_ordered_clause.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -36,16 +41,23 @@
 #pragma omp for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp for', but found only 1}}
-// expected-error@+3 2 {{directive '#pragma omp for' cannot contain more than one 'ordered' clause}}
-// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+// expected-error@+6 2 {{directive '#pragma omp for' cannot contain more than one 'ordered' clause}}
+// expected-error@+5 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 #pragma omp for ordered(foobool(argc)), ordered(true), ordered(-5)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
 #pragma omp for ordered(S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
@@ -84,10 +96,17 @@
 #pragma omp for ordered(2 + 2))              // expected-warning {{extra tokens at the end of '#pragma omp for' are ignored}}  expected-note {{as specified in 'ordered' clause}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];    // expected-error {{expected 4 for loops after '#pragma omp for', but found only 1}}
-#pragma omp for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+// expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp for ordered(foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 // expected-error@+2 2 {{directive '#pragma omp for' cannot contain more than one 'ordered' clause}}
 // expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
 #pragma omp for ordered(foobool(argc)), ordered(true), ordered(-5)
@@ -96,7 +115,11 @@
 #pragma omp for ordered(S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
diff --git a/test/OpenMP/for_private_messages.cpp b/test/OpenMP/for_private_messages.cpp
index 3015f81..4045c5b 100644
--- a/test/OpenMP/for_private_messages.cpp
+++ b/test/OpenMP/for_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -126,6 +174,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp for private // expected-error {{expected '(' after 'private'}}
@@ -190,6 +240,8 @@
   for(int k = 0; k < argc; ++k)
     si = k + 1;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/for_reduction_codegen.cpp b/test/OpenMP/for_reduction_codegen.cpp
index 423ab3c..6997d81 100644
--- a/test/OpenMP/for_reduction_codegen.cpp
+++ b/test/OpenMP/for_reduction_codegen.cpp
@@ -52,6 +52,8 @@
   return T();
 }
 
+extern S<float> **foo();
+
 int main() {
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global double
@@ -182,6 +184,9 @@
   S<float> s_arr[] = {1, 2};
   S<float> &var = test;
   S<float> var1, arrs[10][4];
+  S<float> **var2 = foo();
+  S<float> vvar2[2];
+  S<float> (&var3)[2] = s_arr;
 #pragma omp parallel
 #pragma omp for reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
   for (int i = 0; i < 2; ++i) {
@@ -192,6 +197,26 @@
 #pragma omp parallel for reduction(+:arr[1][:vec[1]]) reduction(&:arrs[1:vec[1]][1:2])
   for (int i = 0; i < 10; ++i)
     ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(+:arr) reduction(&:arrs)
+  for (int i = 0; i < 10; ++i)
+    ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(& : var2[0 : 5][1 : 6])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : vvar2[0 : 5])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : var3[1 : 2])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : var3)
+  for (int i = 0; i < 10; ++i)
+    ;
   return tmain<int>();
 #endif
 }
@@ -201,6 +226,11 @@
 // CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, float*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*, [2 x i32]*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK:@.+]] to void
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [2 x i32]*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK2:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[S_FLOAT_TY]]***)* [[MAIN_MICROTASK3:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK4:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK5:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK6:@.+]] to void
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
@@ -666,6 +696,316 @@
 
 // CHECK: ret void
 
+// CHECK: define internal void [[MAIN_MICROTASK2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* nonnull %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(160) %{{.+}})
+
+// CHECK: [[ARRS_PRIV:%.+]] = alloca [10 x [4 x [[S_FLOAT_TY]]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [3 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[ARR_SIZE:%.+]] = mul nuw i64 %{{.+}}, 4
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_PRIV]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: store i32 0, i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// Check initialization of private copy.
+// CHECK: [[LHS_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* %{{.+}} to [[S_FLOAT_TY]]*
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfEC1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+// CHECK: [[ARRS_PRIV_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]] to [[S_FLOAT_TY]]*
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[ARR_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[ARR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_PRIV_REF]],
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARR_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARRS_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[ARRS_PRIV_BEGIN]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [3 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 2, i64 24, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// CHECK: [[CASE1]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0:%.+]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = add nsw i32 %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// CHECK: [[CASE2]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: atomicrmw add i32* %{{.+}}, i32 %{{.+}} monotonic
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+
+// Check destruction of private copy.
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: br
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfED1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[BEGIN]]
+// CHECK: br i1 [[DONE]],
+// CHECK: call void @llvm.stackrestore(i8*
+// CHECK: call void @__kmpc_barrier(
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// arr_rhs = (int*)rhs[0];
+// CHECK: [[ARR_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_RHS_VOID:%.+]] = load i8*, i8** [[ARR_RHS_REF]],
+// CHECK: [[ARR_RHS:%.+]] = bitcast i8* [[ARR_RHS_VOID]] to i32*
+// arr_lhs = (int*)lhs[0];
+// CHECK: [[ARR_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_LHS_VOID:%.+]] = load i8*, i8** [[ARR_LHS_REF]],
+// CHECK: [[ARR_LHS:%.+]] = bitcast i8* [[ARR_LHS_VOID]] to i32*
+
+// arr_size = (size_t)lhs[1];
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[ARR_SIZE_VOID:%.+]] = load i8*, i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARR_SIZE:%.+]] = ptrtoint i8* [[ARR_SIZE_VOID]] to i64
+
+// arrs_rhs = (S<float>*)rhs[2];
+// CHECK: [[ARRS_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[ARRS_RHS_VOID:%.+]] = load i8*, i8** [[ARRS_RHS_REF]],
+// CHECK: [[ARRS_RHS:%.+]] = bitcast i8* [[ARRS_RHS_VOID]] to [[S_FLOAT_TY]]*
+// arrs_lhs = (S<float>*)lhs[2];
+// CHECK: [[ARRS_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[ARRS_LHS_VOID:%.+]] = load i8*, i8** [[ARRS_LHS_REF]],
+// CHECK: [[ARRS_LHS:%.+]] = bitcast i8* [[ARRS_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// arr_lhs[:] += arr_rhs[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_LHS]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = add nsw i32 %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs_lhs = arrs_lhs.operator &(arrs_rhs);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK3]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[S_FLOAT_TY]]*** dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR2_ORIG_ADDR:%.+]] = alloca [[S_FLOAT_TY]]***,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VAR2_ORIG:%.+]] = load [[S_FLOAT_TY]]***, [[S_FLOAT_TY]]**** [[VAR2_ORIG_ADDR]],
+
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 0
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 1
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 4
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 6
+// CHECK: [[LD:%.+]] = load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: [[ORIG_START:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[LD]],
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: store [[S_FLOAT_TY]]** [[REF:.+]], [[S_FLOAT_TY]]*** %
+// CHECK: store [[S_FLOAT_TY]]* [[PSEUDO_VAR2_PRIV]], [[S_FLOAT_TY]]** [[REF]]
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK4]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}})
+
+// CHECK: [[VVAR2_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VVAR2_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VVAR2_ORIG_ADDR]],
+
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 0
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 4
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VVAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VVAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VVAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VVAR2_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VVAR2_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK5]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 1
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 2
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR3_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR3_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VAR3_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VAR3_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK6]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], i32 0, i32 0
+// CHECK: getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 2
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
diff --git a/test/OpenMP/for_reduction_codegen_UDR.cpp b/test/OpenMP/for_reduction_codegen_UDR.cpp
new file mode 100644
index 0000000..a30df36
--- /dev/null
+++ b/test/OpenMP/for_reduction_codegen_UDR.cpp
@@ -0,0 +1,984 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+volatile double g, g_orig;
+volatile double &g1 = g_orig;
+
+struct BaseS { // First base class of S.
+  int x;
+};
+struct BaseS1 { // Second base class of S; the file-scope '&' and '+' UDRs are declared on this type.
+  float y;
+};
+
+template <class T>
+struct S : public BaseS, public BaseS1 { // Derives from both bases and adds one member of type T.
+  T f;
+  S(T a) : f(a + g) {} // f = a + g, where g is the volatile global declared above.
+  S() : f(g) {}        // f = g.
+  ~S() {}              // User-provided destructor with an empty body.
+};
+void red(BaseS1&, const BaseS1&);       // Combiner called by the operator& UDR below.
+void red_plus(BaseS1&, const BaseS1&);  // Combiner called by the + UDR below.
+void init(BaseS1&, const BaseS1&);      // Initializer called by the operator& UDR below.
+void init1(BaseS1&, const BaseS1&);     // Initializer called by the && UDR below (S<float>, S<int>).
+void init2(BaseS1&, const BaseS1&);     // Initializer called by the operator&& UDR on S<T> inside tmain.
+void init_plus(BaseS1&, const BaseS1&); // Initializer called by the + UDR below.
+#pragma omp declare reduction(operator& : BaseS1 : red(omp_out, omp_in)) initializer(init(omp_priv, omp_orig))
+#pragma omp declare reduction(+ : BaseS1 : red_plus(omp_out, omp_in)) initializer(init_plus(omp_priv, omp_orig))
+#pragma omp declare reduction(&& : S<float>, S<int> : omp_out.f *= omp_in.f) initializer(init1(omp_priv, omp_orig))
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, i{{[0-9]+}} }
+// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
+
+#pragma omp declare reduction(operator&& : int : omp_out = 111 & omp_in)
+template <typename T>
+T tmain() {
+  T t; // Declared but not referenced again in this function.
+  S<T> test;
+  T t_var = T(), t_var1; // t_var is value-initialized; t_var1 has no initializer.
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2}; // Each element is built through S(T a).
+  S<T> &var = test; // var is a reference to test.
+  S<T> var1;
+#pragma omp declare reduction(operator& : T : omp_out = 15 + omp_in)
+#pragma omp declare reduction(operator+ : T : omp_out = 1513 + omp_in) initializer(omp_priv = 321)
+#pragma omp declare reduction(min : T : omp_out = 47 - omp_in) initializer(omp_priv = 432 / omp_orig)
+#pragma omp declare reduction(operator&& : S<T> : omp_out.f = 17 * omp_in.f) initializer(init2(omp_priv, omp_orig))
+#pragma omp declare reduction(operator&& : T : omp_out = 17 * omp_in)
+#pragma omp parallel
+#pragma omp for reduction(+ : t_var) reduction(& : var) reduction(&& : var1) reduction(min : t_var1) nowait
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;  // t_var is a reduction item of this loop.
+    s_arr[i] = var;  // var is a reduction item of this loop.
+  }
+#pragma omp parallel
+#pragma omp for reduction(&& : t_var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;  // t_var is the only reduction item of this loop.
+    s_arr[i] = var;  // var is not a reduction item of this loop.
+  }
+  return T();
+}
+
+extern S<float> **foo(); // Declared only; supplies the initial value of var2 in main.
+
+#pragma omp declare reduction(operator- : float, double : omp_out = 333 + omp_in)
+#pragma omp declare reduction(min : float, double : omp_out = 555 + omp_in)
+int main() {
+#pragma omp declare reduction(operator+ : float, double : omp_out = 222 - omp_in) initializer(omp_priv = -1)
+  S<float> test;
+  float t_var = 0, t_var1; // t_var1 has no initializer.
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2}; // Each element is built through S(T a).
+  S<float> &var = test; // var is a reference to test.
+  S<float> var1, arrs[10][4];
+  S<float> **var2 = foo(); // Pointer-to-pointer; reduced through array sections below.
+  S<float> vvar2[2];
+  S<float>(&var3)[2] = s_arr; // var3 is a reference to the two-element array s_arr.
+#pragma omp declare reduction(operator+ : int : omp_out = 555 * omp_in) initializer(omp_priv = 888)
+#pragma omp parallel
+#pragma omp for reduction(+ : t_var) reduction(& : var) reduction(&& : var1) reduction(min : t_var1)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;  // t_var is a reduction item of this loop.
+    s_arr[i] = var;  // var is a reduction item of this loop.
+  }
+  int arr[10][vec[1]]; // Variable-length array: the inner extent is vec[1].
+#pragma omp parallel for reduction(+ : arr[1][ : vec[1]]) reduction(& : arrs[1 : vec[1]][1 : 2])
+  for (int i = 0; i < 10; ++i)
+    ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(+ : arr) reduction(& : arrs)
+  for (int i = 0; i < 10; ++i)
+    ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(& : var2[0 : 5][1 : 6])
+  for (int i = 0; i < 10; ++i)
+    ; // Empty loop body.
+#pragma omp parallel
+#pragma omp for reduction(& : vvar2[0 : 5])
+  for (int i = 0; i < 10; ++i)
+    ; // Empty loop body.
+#pragma omp parallel
+#pragma omp for reduction(& : var3[1 : 2])
+  for (int i = 0; i < 10; ++i)
+    ; // Empty loop body.
+#pragma omp parallel
+#pragma omp for reduction(& : var3)
+  for (int i = 0; i < 10; ++i)
+    ; // Empty loop body.
+  return tmain<int>(); // Instantiates and runs tmain for int.
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, float*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*, [2 x i32]*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [2 x i32]*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK2:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[S_FLOAT_TY]]***)* [[MAIN_MICROTASK3:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK4:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK5:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK6:@.+]] to void
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, float* dereferenceable(4) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(12) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(12) %{{.+}}, float* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %vec, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca float,
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca float,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_REF:%.+]] = load float*, float** %
+// CHECK: [[VAR1_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: [[T_VAR1_REF:%.+]] = load float*, float** %
+
+// For + reduction operation initial value of private variable is -1.
+// CHECK: store float -1.0{{.+}}, float*
+
+// For & reduction operation initial value of private variable is defined by call of 'init()' function.
+// CHECK: call {{.*}}void @_Z4initR6BaseS1RKS_(
+
+// For && reduction operation initial value of private variable is defined by call of 'init1()' function.
+// CHECK: call {{.*}}void @_Z5init1R6BaseS1RKS_(
+
+// For min reduction operation initial value of private variable is loaded from a global (the UDR has no initializer clause).
+// CHECK: [[INIT:%.+]] = load float, float* @
+// CHECK: store float [[INIT]], float* [[T_VAR1_PRIV]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 3
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: fsub float 2.220000e+02, %
+
+// var = var.operator &(var_reduction);
+// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: fmul float
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: fadd float 5.550000e+02, %
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: call void @__kmpc_critical(
+// CHECK: fsub float 2.220000e+02, %
+// CHECK: call void @__kmpc_end_critical(
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_(
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: fmul float
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: fadd float 5.550000e+02, %
+// CHECK: call void @__kmpc_end_critical(
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_rhs = (float*)rhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to float*
+// t_var_lhs = (float*)lhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to float*
+
+// var_rhs = (S<float>*)rhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var_lhs = (S<float>*)lhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// var1_rhs = (S<float>*)rhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var1_lhs = (S<float>*)lhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// t_var1_rhs = (float*)rhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to float*
+// t_var1_lhs = (float*)lhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to float*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: fsub float 2.220000e+02, %
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_(
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: fmul float
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: fadd float 5.550000e+02, %
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* nonnull %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(480) %{{.+}})
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[IDX1:%.+]] = mul nsw i64 1, %{{.+}}
+// CHECK: [[LB1:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
+// CHECK: [[LB1_0:%.+]] = getelementptr inbounds i32, i32* [[LB1]], i64 0
+// CHECK: [[IDX1:%.+]] = mul nsw i64 1, %{{.+}}
+// CHECK: [[UB1:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
+// CHECK: [[UB1_UP:%.+]] = getelementptr inbounds i32, i32* [[UB1]], i64 %
+// CHECK: [[UB_CAST:%.+]] = ptrtoint i32* [[UB1_UP]] to i64
+// CHECK: [[LB_CAST:%.+]] = ptrtoint i32* [[LB1_0]] to i64
+// CHECK: [[DIFF:%.+]] = sub i64 [[UB_CAST]], [[LB_CAST]]
+// CHECK: [[SIZE_1:%.+]] = sdiv exact i64 [[DIFF]], ptrtoint (i32* getelementptr (i32, i32* null, i32 1) to i64)
+// CHECK: [[ARR_SIZE:%.+]] = add nuw i64 [[SIZE_1]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_PRIV]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: store i32 888, i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: [[ARRS_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[ARRS_SIZE:%.+]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_PRIV]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z4initR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[ARR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[ARR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_PRIV_REF]],
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARR_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARRS_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[ARRS_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_PRIV_REF]],
+// CHECK: [[ARRS_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 3
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARRS_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_SIZE_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 2, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// CHECK: [[CASE1]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// CHECK: [[CASE2]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_critical(
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+
+// Check destruction of private copy.
+// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_PRIV]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfED1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[ARRS_PRIV]]
+// CHECK: br i1 [[DONE]],
+// CHECK: call void @llvm.stackrestore(i8*
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// arr_rhs = (int*)rhs[0];
+// CHECK: [[ARR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_RHS_VOID:%.+]] = load i8*, i8** [[ARR_RHS_REF]],
+// CHECK: [[ARR_RHS:%.+]] = bitcast i8* [[ARR_RHS_VOID]] to i32*
+// arr_lhs = (int*)lhs[0];
+// CHECK: [[ARR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_LHS_VOID:%.+]] = load i8*, i8** [[ARR_LHS_REF]],
+// CHECK: [[ARR_LHS:%.+]] = bitcast i8* [[ARR_LHS_VOID]] to i32*
+
+// arr_size = (size_t)lhs[1];
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[ARR_SIZE_VOID:%.+]] = load i8*, i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARR_SIZE:%.+]] = ptrtoint i8* [[ARR_SIZE_VOID]] to i64
+
+// arrs_rhs = (S<float>*)rhs[2];
+// CHECK: [[ARRS_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[ARRS_RHS_VOID:%.+]] = load i8*, i8** [[ARRS_RHS_REF]],
+// CHECK: [[ARRS_RHS:%.+]] = bitcast i8* [[ARRS_RHS_VOID]] to [[S_FLOAT_TY]]*
+// arrs_lhs = (S<float>*)lhs[2];
+// CHECK: [[ARRS_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[ARRS_LHS_VOID:%.+]] = load i8*, i8** [[ARRS_LHS_REF]],
+// CHECK: [[ARRS_LHS:%.+]] = bitcast i8* [[ARRS_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// arrs_size = (size_t)lhs[3];
+// CHECK: [[ARRS_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 3
+// CHECK: [[ARRS_SIZE_VOID:%.+]] = load i8*, i8** [[ARRS_SIZE_REF]],
+// CHECK: [[ARRS_SIZE:%.+]] = ptrtoint i8* [[ARRS_SIZE_VOID]] to i64
+
+// arr_lhs[:] += arr_rhs[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_LHS]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs_lhs = arrs_lhs.operator &(arrs_rhs);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* nonnull %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(480) %{{.+}})
+
+// CHECK: [[ARRS_PRIV:%.+]] = alloca [10 x [4 x [[S_FLOAT_TY]]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [3 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[ARR_SIZE:%.+]] = mul nuw i64 %{{.+}}, 4
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_PRIV]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: store i32 888, i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// Check initialization of private copy.
+// CHECK: [[LHS_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* %{{.+}} to [[S_FLOAT_TY]]*
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z4initR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+// CHECK: [[ARRS_PRIV_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]] to [[S_FLOAT_TY]]*
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[ARR_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[ARR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_PRIV_REF]],
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARR_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARRS_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[ARRS_PRIV_BEGIN]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [3 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 2, i64 24, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// CHECK: [[CASE1]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0:%.+]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%[^ ]+]] = mul nsw i32 555, %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// CHECK: [[CASE2]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_critical(
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+
+// Check destruction of private copy.
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: br
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfED1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[BEGIN]]
+// CHECK: br i1 [[DONE]],
+// CHECK: call void @llvm.stackrestore(i8*
+// CHECK: call void @__kmpc_barrier(
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// arr_rhs = (int*)rhs[0];
+// CHECK: [[ARR_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_RHS_VOID:%.+]] = load i8*, i8** [[ARR_RHS_REF]],
+// CHECK: [[ARR_RHS:%.+]] = bitcast i8* [[ARR_RHS_VOID]] to i32*
+// arr_lhs = (int*)lhs[0];
+// CHECK: [[ARR_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_LHS_VOID:%.+]] = load i8*, i8** [[ARR_LHS_REF]],
+// CHECK: [[ARR_LHS:%.+]] = bitcast i8* [[ARR_LHS_VOID]] to i32*
+
+// arr_size = (size_t)lhs[1];
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[ARR_SIZE_VOID:%.+]] = load i8*, i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARR_SIZE:%.+]] = ptrtoint i8* [[ARR_SIZE_VOID]] to i64
+
+// arrs_rhs = (S<float>*)rhs[2];
+// CHECK: [[ARRS_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[ARRS_RHS_VOID:%.+]] = load i8*, i8** [[ARRS_RHS_REF]],
+// CHECK: [[ARRS_RHS:%.+]] = bitcast i8* [[ARRS_RHS_VOID]] to [[S_FLOAT_TY]]*
+// arrs_lhs = (S<float>*)lhs[2];
+// CHECK: [[ARRS_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[ARRS_LHS_VOID:%.+]] = load i8*, i8** [[ARRS_LHS_REF]],
+// CHECK: [[ARRS_LHS:%.+]] = bitcast i8* [[ARRS_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// arr_lhs[:] += arr_rhs[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_LHS]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs_lhs = arrs_lhs.operator &(arrs_rhs);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK3]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[S_FLOAT_TY]]*** dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR2_ORIG_ADDR:%.+]] = alloca [[S_FLOAT_TY]]***,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VAR2_ORIG:%.+]] = load [[S_FLOAT_TY]]***, [[S_FLOAT_TY]]**** [[VAR2_ORIG_ADDR]],
+
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 0
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 1
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 4
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 6
+// CHECK: [[LD:%.+]] = load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: [[ORIG_START:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[LD]],
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: store [[S_FLOAT_TY]]** [[REF:.+]], [[S_FLOAT_TY]]*** %
+// CHECK: store [[S_FLOAT_TY]]* [[PSEUDO_VAR2_PRIV]], [[S_FLOAT_TY]]** [[REF]]
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK4]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+
+// CHECK: [[VVAR2_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VVAR2_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VVAR2_ORIG_ADDR]],
+
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 0
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 4
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VVAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VVAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VVAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VVAR2_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VVAR2_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK5]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 1
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 2
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR3_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR3_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VAR3_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VAR3_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK6]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], i32 0, i32 0
+// CHECK: getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 2
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [[S_INT_TY]]*, [[S_INT_TY]]*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[S_INT_TY]]* dereferenceable(12) %{{.+}}, [[S_INT_TY]]* dereferenceable(12) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(24) %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[VAR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
+// CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store i32 321, i32* %
+
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call void @_Z4initR6BaseS1RKS_(
+
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call void @_Z5init2R6BaseS1RKS_(
+
+// For min reduction operation initial value of private variable is largest representable value.
+// CHECK: sdiv i32 432, %
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 3
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: add nsw i32 1513, %
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: mul nsw i32 17, %
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: sub nsw i32 47, %
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: call void @__kmpc_critical(
+// CHECK: add nsw i32 1513, %
+// CHECK: call void @__kmpc_end_critical(
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: mul nsw i32 17, %
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: sub nsw i32 47, %
+// CHECK: call void @__kmpc_end_critical(
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_rhs = (i{{[0-9]+}}*)rhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to i{{[0-9]+}}*
+// t_var_lhs = (i{{[0-9]+}}*)lhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to i{{[0-9]+}}*
+
+// var_rhs = (S<i{{[0-9]+}}>*)rhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_INT_TY]]*
+// var_lhs = (S<i{{[0-9]+}}>*)lhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_INT_TY]]*
+
+// var1_rhs = (S<i{{[0-9]+}}>*)rhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_INT_TY]]*
+// var1_lhs = (S<i{{[0-9]+}}>*)lhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_INT_TY]]*
+
+// t_var1_rhs = (i{{[0-9]+}}*)rhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to i{{[0-9]+}}*
+// t_var1_lhs = (i{{[0-9]+}}*)lhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to i{{[0-9]+}}*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: add nsw i32 1513, %
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: mul nsw i32 17, %
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: sub nsw i32 47, %
+// CHECK: ret void
+
+#endif
+
diff --git a/test/OpenMP/for_reduction_messages.cpp b/test/OpenMP/for_reduction_messages.cpp
index 317f88c..45a4681 100644
--- a/test/OpenMP/for_reduction_messages.cpp
+++ b/test/OpenMP/for_reduction_messages.cpp
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -33,7 +33,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -55,9 +55,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable: no known conversion from 'int' to 'S6' for 1st argument}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable: no known conversion from 'int' to 'S6' for 1st argument}}
 #endif
   int a;
 
@@ -123,7 +123,7 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp for reduction(foo : argc) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -135,11 +135,11 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -147,15 +147,15 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -175,7 +175,7 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -187,7 +187,7 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -304,15 +304,15 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_simd_aligned_messages.cpp b/test/OpenMP/for_simd_aligned_messages.cpp
index 1007b3c..cef83c3 100644
--- a/test/OpenMP/for_simd_aligned_messages.cpp
+++ b/test/OpenMP/for_simd_aligned_messages.cpp
@@ -196,6 +196,7 @@
   #pragma omp for simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/for_simd_ast_print.cpp b/test/OpenMP/for_simd_ast_print.cpp
index d4b13ba..54f0d46 100644
--- a/test/OpenMP/for_simd_ast_print.cpp
+++ b/test/OpenMP/for_simd_ast_print.cpp
@@ -6,6 +6,57 @@
 #ifndef HEADER
 #define HEADER
 
+struct S1 {
+  S1(): a(0) {}
+  S1(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for simd private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp for simd private(this->a) private(this->a)
+
+class S8 : public S7<S1> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){
+#pragma omp for simd private(a) private(this->a) private(S7<S1>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for simd private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp for simd private(this->a) private(this->a)
+
 void foo() {}
 int g_ind = 1;
 template<class T, class N> T reduct(T* arr, N num) {
diff --git a/test/OpenMP/for_simd_codegen.cpp b/test/OpenMP/for_simd_codegen.cpp
index e1aa892..fb282d0 100644
--- a/test/OpenMP/for_simd_codegen.cpp
+++ b/test/OpenMP/for_simd_codegen.cpp
@@ -54,13 +54,13 @@
 
   long long k = get_val();
 
-  #pragma omp for simd linear(k : 3) schedule(dynamic)
+  #pragma omp for simd linear(k : 3) schedule(simd, nonmonotonic: dynamic)
 // CHECK: [[K0:%.+]] = call {{.*}}i64 @{{.*}}get_val
 // CHECK-NEXT: store i64 [[K0]], i64* [[K_VAR:%[^,]+]]
 // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_VAR]]
 // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 
-// CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{.+}}, i32 %{{.+}}, i32 35, i32 0, i32 8, i32 1, i32 1)
+// CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{.+}}, i32 %{{.+}}, i32 1073741859, i32 0, i32 8, i32 1, i32 1)
 // CHECK: [[NEXT:%.+]] = call i32 @__kmpc_dispatch_next_4(%ident_t* {{.+}}, i32 %{{.+}}, i32* %{{.+}}, i32* [[LB:%.+]], i32* [[UB:%.+]], i32* %{{.+}})
 // CHECK: [[COND:%.+]] = icmp ne i32 [[NEXT]], 0
 // CHECK: br i1 [[COND]], label %[[CONT:.+]], label %[[END:.+]]
@@ -362,7 +362,7 @@
 
 template <typename T, unsigned N>
 int templ1(T a, T *z) {
-  #pragma omp for simd collapse(N)
+  #pragma omp for simd collapse(N) schedule(simd: static, N)
   for (int i = 0; i < N * 2; i++) {
     for (long long j = 0; j < (N + N + N + N); j += 2) {
       z[i + j] = a + tfoo<T, N>(i + j);
@@ -373,7 +373,7 @@
 
 // Instatiation templ1<float,2>
 // CHECK-LABEL: define {{.*i32}} @{{.*}}templ1{{.*}}(float {{.+}}, float* {{.+}})
-// CHECK: call void @__kmpc_for_static_init_8(%ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 34, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 1)
+// CHECK: call void @__kmpc_for_static_init_8(%ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 45, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 2)
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 15
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
@@ -389,6 +389,7 @@
 // CHECK: store i64 [[LB_VAL]], i64* [[T1_OMP_IV:%[^,]+]],
 
 // ...
+// CHECK: icmp sle i64
 // CHECK: [[IV:%.+]] = load i64, i64* [[T1_OMP_IV]]
 // CHECK-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]]
 // CHECK-NEXT: [[CMP1:%.+]] = icmp sle i64 [[IV]], [[UB_VAL]]
@@ -581,9 +582,11 @@
   }
 // i,j,l are updated; k is not updated.
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
-// CHECK-NEXT: store i32 3, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i32 5, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i16 9, i16* [[I:%[^,]+]]
+// CHECK: br i1
+// CHECK: store i32 3, i32*
+// CHECK-NEXT: store i32 5,
+// CHECK-NEXT: store i32 7,
+// CHECK-NEXT: store i16 9, i16*
 // CHECK: call void @__kmpc_barrier(%ident_t* {{.+}}, i32 %{{.+}})
 // CHECK: ret void
 }
diff --git a/test/OpenMP/for_simd_collapse_messages.cpp b/test/OpenMP/for_simd_collapse_messages.cpp
index 5c9d058..2efd494 100644
--- a/test/OpenMP/for_simd_collapse_messages.cpp
+++ b/test/OpenMP/for_simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp for simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp for simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for simd', but found only 1}}
   #pragma omp for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for simd', but found only 1}}
-  #pragma omp for simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for simd' must be a for loop}}
diff --git a/test/OpenMP/for_simd_firstprivate_messages.cpp b/test/OpenMP/for_simd_firstprivate_messages.cpp
index cb74ee0..4e96866 100644
--- a/test/OpenMP/for_simd_firstprivate_messages.cpp
+++ b/test/OpenMP/for_simd_firstprivate_messages.cpp
@@ -147,7 +147,7 @@
     foo();
 #pragma omp parallel reduction(+ : i) // expected-note {{defined as reduction}}
 #pragma omp for simd firstprivate(i)       // expected-error {{firstprivate variable must be shared}}
-  for (i = 0; i < argc; ++i)
+  for (int k = 0; k < argc; ++k)
     foo();
   return 0;
 }
diff --git a/test/OpenMP/for_simd_linear_messages.cpp b/test/OpenMP/for_simd_linear_messages.cpp
index 44370a1..3f93125 100644
--- a/test/OpenMP/for_simd_linear_messages.cpp
+++ b/test/OpenMP/for_simd_linear_messages.cpp
@@ -208,7 +208,7 @@
   #pragma omp for simd linear(i)
   for (int k = 0; k < argc; ++k) ++k;
 
-  foomain<int,char>(argc,argv);
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/for_simd_loop_messages.cpp b/test/OpenMP/for_simd_loop_messages.cpp
index afd7b0b..e9729a8 100644
--- a/test/OpenMP/for_simd_loop_messages.cpp
+++ b/test/OpenMP/for_simd_loop_messages.cpp
@@ -408,12 +408,12 @@
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -465,7 +465,7 @@
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
 #pragma omp parallel
-// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp for simd
   for (begin = begin0; begin < end; ++begin)
diff --git a/test/OpenMP/for_simd_private_messages.cpp b/test/OpenMP/for_simd_private_messages.cpp
index 15e235c..ca4c3a3 100644
--- a/test/OpenMP/for_simd_private_messages.cpp
+++ b/test/OpenMP/for_simd_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -119,6 +167,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp for simd private // expected-error {{expected '(' after 'private'}}
@@ -180,6 +230,8 @@
   for (int k = 0; k < argc; ++k)
     m = k + 2;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/for_simd_reduction_messages.cpp b/test/OpenMP/for_simd_reduction_messages.cpp
index 000960f..2935cec 100644
--- a/test/OpenMP/for_simd_reduction_messages.cpp
+++ b/test/OpenMP/for_simd_reduction_messages.cpp
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -122,7 +122,7 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -134,11 +134,11 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -146,15 +146,15 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -174,7 +174,7 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -186,7 +186,7 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -295,15 +295,15 @@
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_simd_safelen_messages.cpp b/test/OpenMP/for_simd_safelen_messages.cpp
index d70e901..fa1b444 100644
--- a/test/OpenMP/for_simd_safelen_messages.cpp
+++ b/test/OpenMP/for_simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp for simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd safelen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp for simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for simd' must be a for loop}}
diff --git a/test/OpenMP/for_simd_simdlen_messages.cpp b/test/OpenMP/for_simd_simdlen_messages.cpp
index c72e546..8fe1979 100644
--- a/test/OpenMP/for_simd_simdlen_messages.cpp
+++ b/test/OpenMP/for_simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp for simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd simdlen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp for simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for simd' must be a for loop}}
diff --git a/test/OpenMP/linking.c b/test/OpenMP/linking.c
index 81706d4..7b30592 100644
--- a/test/OpenMP/linking.c
+++ b/test/OpenMP/linking.c
@@ -4,42 +4,42 @@
 // FIXME: Replace DEFAULT_OPENMP_LIB below with the value chosen at configure time.
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp -target i386-unknown-linux \
+// RUN:     -fopenmp -target i386-unknown-linux -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-32 %s
 // CHECK-LD-32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-LD-32: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp -target x86_64-unknown-linux \
+// RUN:     -fopenmp -target x86_64-unknown-linux -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-64 %s
 // CHECK-LD-64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-LD-64: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp=libgomp -target i386-unknown-linux \
+// RUN:     -fopenmp=libgomp -target i386-unknown-linux -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-GOMP-LD-32 %s
 // CHECK-GOMP-LD-32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-GOMP-LD-32: "-lgomp" "-lrt" "-lgcc"
 // CHECK-GOMP-LD-32: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp=libgomp -target x86_64-unknown-linux \
+// RUN:     -fopenmp=libgomp -target x86_64-unknown-linux -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-GOMP-LD-64 %s
 // CHECK-GOMP-LD-64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-GOMP-LD-64: "-lgomp" "-lrt" "-lgcc"
 // CHECK-GOMP-LD-64: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp -target i386-unknown-linux \
+// RUN:     -fopenmp -target i386-unknown-linux -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-IOMP5-LD-32 %s
 // CHECK-IOMP5-LD-32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-IOMP5-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
 // CHECK-IOMP5-LD-32: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp -target x86_64-unknown-linux \
+// RUN:     -fopenmp -target x86_64-unknown-linux -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-IOMP5-LD-64 %s
 // CHECK-IOMP5-LD-64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-IOMP5-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc"
@@ -57,6 +57,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -fopenmp -fopenmp=libgomp -target i386-unknown-linux \
+// RUN:     -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-OVERRIDE-32 %s
 // CHECK-LD-OVERRIDE-32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD-OVERRIDE-32: "-lgomp" "-lrt" "-lgcc"
@@ -64,13 +65,14 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -fopenmp -fopenmp=libgomp -target x86_64-unknown-linux \
+// RUN:     -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-OVERRIDE-64 %s
 // CHECK-LD-OVERRIDE-64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD-OVERRIDE-64: "-lgomp" "-lrt" "-lgcc"
 // CHECK-LD-OVERRIDE-64: "-lpthread" "-lc"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp=libomp -target x86_64-msvc-win32 \
+// RUN:     -fopenmp=libomp -target x86_64-msvc-win32 -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-MSVC-LINK-64 %s
 // CHECK-MSVC-LINK-64: link.exe
 // CHECK-MSVC-LINK-64-SAME: -nodefaultlib:vcomp.lib
@@ -79,7 +81,7 @@
 // CHECK-MSVC-LINK-64-SAME: -defaultlib:libomp.lib
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -fopenmp=libiomp5 -target x86_64-msvc-win32 \
+// RUN:     -fopenmp=libiomp5 -target x86_64-msvc-win32 -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-MSVC-ILINK-64 %s
 // CHECK-MSVC-ILINK-64: link.exe
 // CHECK-MSVC-ILINK-64-SAME: -nodefaultlib:vcomp.lib
diff --git a/test/OpenMP/loops_explicit_clauses_codegen.cpp b/test/OpenMP/loops_explicit_clauses_codegen.cpp
new file mode 100644
index 0000000..dc21fd1
--- /dev/null
+++ b/test/OpenMP/loops_explicit_clauses_codegen.cpp
@@ -0,0 +1,162 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+
+#ifndef HEADER
+#define HEADER
+
+#define N 10
+int foo();
+int bar();
+int k;
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: @k
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-NOT: @k
+#pragma omp for private(k)
+  for (k = 0; k < argc; k++)
+    ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_8(
+// CHECK-NOT: @k
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: store i32 %{{.+}}, i32* @k
+#pragma omp for lastprivate(k) collapse(2)
+  for (int i = 0; i < 2; ++i)
+    for (k = 0; k < argc; k++)
+      ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+#pragma omp simd linear(k : 2)
+  for (k = 0; k < argc; k++)
+    bar();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+  foo();
+#pragma omp simd lastprivate(k) collapse(2)
+  for (int i = 0; i < 2; ++i)
+    for (k = 0; k < argc; k++)
+     bar() ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+#pragma omp simd
+  for (k = 0; k < argc; k++)
+    bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+#pragma omp simd collapse(2)
+  for (int i = 0; i < 2; ++i)
+    for (k = 0; k < argc; k++)
+      bar();
+// CHECK: @{{.+}}foo
+  foo();
+  return 0;
+}
+
+struct S {
+  int k;
+  S(int argc) {
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp for private(k)
+    for (k = 0; k < argc; k++)
+      ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_8(
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: call void @__kmpc_for_static_fini(
+#pragma omp for lastprivate(k) collapse(2)
+    for (int i = 0; i < 2; ++i)
+      for (k = 0; k < argc; k++)
+        ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd linear(k : 2)
+    for (k = 0; k < argc; k++)
+      bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i64 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd lastprivate(k) collapse(2)
+    for (int i = 0; i < 2; ++i)
+      for (k = 0; k < argc; k++)
+        bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd
+    for (k = 0; k < argc; k++)
+      bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i64 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd collapse(2)
+    for (int i = 0; i < 2; ++i)
+      for (k = 0; k < argc; k++)
+        bar();
+// CHECK: @{{.+}}foo
+  foo();
+  }
+} s(N);
+
+#endif // HEADER
diff --git a/test/OpenMP/nesting_of_regions.cpp b/test/OpenMP/nesting_of_regions.cpp
index b2b87db..8f6fb35 100644
--- a/test/OpenMP/nesting_of_regions.cpp
+++ b/test/OpenMP/nesting_of_regions.cpp
@@ -97,6 +97,27 @@
   }
 #pragma omp parallel
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel
+  {
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+  }
+#pragma omp parallel
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -112,7 +133,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
-
+#pragma omp parallel
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+  
 // SIMD DIRECTIVE
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
@@ -122,7 +177,7 @@
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -226,6 +281,16 @@
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp ordered simd // OK
+    bar();
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered threads // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -236,6 +301,27 @@
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -251,6 +337,40 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for// expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR DIRECTIVE
 #pragma omp for
@@ -398,6 +518,27 @@
   }
 #pragma omp for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -413,6 +554,40 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // OK
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR SIMD DIRECTIVE
 #pragma omp for simd
@@ -423,7 +598,7 @@
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -527,6 +702,16 @@
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp ordered simd // OK
+    bar();
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered threads // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -537,6 +722,27 @@
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -552,6 +758,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // SECTIONS DIRECTIVE
 #pragma omp sections
@@ -706,6 +947,25 @@
   }
 #pragma omp sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -721,6 +981,40 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SECTION DIRECTIVE
 #pragma omp section // expected-error {{orphaned 'omp section' directives are prohibited, it must be closely nested to a sections region}}
@@ -911,6 +1205,37 @@
 #pragma omp sections
   {
 #pragma omp section
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target parallel for
+      for (int i = 0; i < 10; ++i)
+        ;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target enter data map(to: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target exit data map(from: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -928,6 +1253,49 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+      bar();
+#pragma omp target update to(a)
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SINGLE DIRECTIVE
 #pragma omp single
@@ -1065,6 +1433,27 @@
   }
 #pragma omp single
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp single
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1080,6 +1469,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp single
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // MASTER DIRECTIVE
 #pragma omp master
@@ -1217,6 +1641,27 @@
   }
 #pragma omp master
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp master
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1232,6 +1677,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp master
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // CRITICAL DIRECTIVE
 #pragma omp critical
@@ -1383,6 +1863,27 @@
   }
 #pragma omp critical
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp critical
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1398,6 +1899,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp critical
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // PARALLEL FOR DIRECTIVE
 #pragma omp parallel for
@@ -1550,6 +2086,27 @@
   }
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1565,6 +2122,40 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // OK
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL FOR SIMD DIRECTIVE
 #pragma omp parallel for simd
@@ -1575,7 +2166,7 @@
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd// expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -1717,6 +2308,27 @@
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -1732,6 +2344,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL SECTIONS DIRECTIVE
 #pragma omp parallel sections
@@ -1875,6 +2522,25 @@
   }
 #pragma omp parallel sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp parallel sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1890,6 +2556,40 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp parallel sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TASK DIRECTIVE
 #pragma omp task
@@ -1979,6 +2679,25 @@
   }
 #pragma omp task
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp task
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp task
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp task
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp task
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1994,6 +2713,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp task
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // ORDERED DIRECTIVE
 #pragma omp ordered
@@ -2141,6 +2895,37 @@
   }
 #pragma omp ordered
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp ordered
+  {
+#pragma omp target parallel for ordered
+    for (int j = 0; j < 10; ++j) {
+#pragma omp ordered // OK
+      {
+        bar();
+      }
+    }
+  }
+#pragma omp ordered
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp ordered
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp ordered
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -2156,6 +2941,42 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp ordered
+  {
+    bar();
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp ordered
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // ATOMIC DIRECTIVE
 #pragma omp atomic
@@ -2322,6 +3143,35 @@
   // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
   // expected-note@+1 {{expected an expression statement}}
   {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
@@ -2341,6 +3191,53 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    bar();
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TARGET DIRECTIVE
 #pragma omp target
@@ -2432,11 +3329,20 @@
   }
 #pragma omp target
   {
-#pragma omp target
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
   {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+  {
 #pragma omp teams
     ++a;
   }
@@ -2446,6 +3352,12 @@
 #pragma omp teams  // expected-note {{nested teams construct here}}
     ++a;
   }
+#pragma omp target // expected-error {{target construct with nested teams region contains statements outside of the teams construct}}
+  {
+    while (0)      // expected-note {{statement outside teams construct here}}
+#pragma omp teams  // expected-note {{nested teams construct here}}
+    ++a;
+  }
 #pragma omp target
   {
 #pragma omp taskloop
@@ -2458,8 +3370,442 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp target
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp teams distribute // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL DIRECTIVE
+#pragma omp target parallel
+#pragma omp parallel
+  bar();
+#pragma omp target parallel
+#pragma omp for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel region}}
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp single
+  bar();
+
+#pragma omp target parallel
+#pragma omp master
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp critical
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp task
+  {
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp barrier
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+  {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+    ++a;
+#pragma omp teams  // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target parallel' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL FOR DIRECTIVE
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single // OK
+      {
+        bar();
+      }
+#pragma omp for // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp for simd // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp sections // OK
+      {
+        bar();
+      }
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel for ordered
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // OK
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
 
 // TEAMS DIRECTIVE
+#pragma omp teams // expected-error {{orphaned 'omp teams' directives are prohibited; perhaps you forget to enclose the directive into a target region?}}
+  bar();
 #pragma omp target
 #pragma omp teams
 #pragma omp parallel
@@ -2575,6 +3921,29 @@
 #pragma omp target
 #pragma omp teams
   {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target enter data' directive into a parallel region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target exit data' directive into a parallel region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -2598,6 +3967,55 @@
 #pragma omp distribute
   for (int j = 0; j < 10; ++j)
     ;        
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target update' directive into a parallel region?}}
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target simd // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target simd' directive into a parallel region?}}
+  for (int i = 0; i < 10; ++i)
+    ;
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+  for (int i = 0; i < 10; ++i)
+    ;
+  }
 
 // TASKLOOP DIRECTIVE
 #pragma omp taskloop
@@ -2740,6 +4158,27 @@
   }
 #pragma omp taskloop
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -2749,6 +4188,48 @@
   for (int i = 0; i < 10; ++i)
     ++a;
   }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // OK
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+
 // DISTRIBUTE DIRECTIVE
 #pragma omp target
 #pragma omp teams
@@ -2927,7 +4408,36 @@
 #pragma omp teams
 #pragma omp distribute
   for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
 #pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
@@ -2937,6 +4447,956 @@
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+
+// DISTRIBUTE PARALLEL FOR DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a distribute parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// DISTRIBUTE PARALLEL FOR SIMD DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd 
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel  // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+
+// TARGET SIMD DIRECTIVE
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd 
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel  // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+
+// TEAMS DISTRIBUTE DIRECTIVE
+#pragma omp teams distribute // expected-error {{orphaned 'omp teams distribute' directives are prohibited; perhaps you forget to enclose the directive into a target region?}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a teams distribute region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // OK
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // OK
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel  // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
 }
 
 void foo() {
@@ -3033,6 +5493,25 @@
   }
 #pragma omp parallel
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3048,6 +5527,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp parallel
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SIMD DIRECTIVE
 #pragma omp simd
@@ -3058,7 +5572,7 @@
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -3165,6 +5679,27 @@
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -3180,6 +5715,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    a++;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR DIRECTIVE
 #pragma omp for
@@ -3317,6 +5887,27 @@
   }
 #pragma omp for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3332,6 +5923,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // OK
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR SIMD DIRECTIVE
 #pragma omp for simd
@@ -3342,7 +5968,7 @@
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -3449,6 +6075,27 @@
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -3464,6 +6111,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // SECTIONS DIRECTIVE
 #pragma omp sections
@@ -3593,6 +6275,25 @@
   }
 #pragma omp sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3608,6 +6309,40 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SECTION DIRECTIVE
 #pragma omp section // expected-error {{orphaned 'omp section' directives are prohibited, it must be closely nested to a sections region}}
@@ -3803,6 +6538,39 @@
   {
 #pragma omp section
     {
+#pragma omp target parallel
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target parallel for
+      for (int i = 0; i < 10; ++i)
+        ;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target enter data map(to: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target exit data map(from: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
       ++a;
     }
@@ -3823,6 +6591,49 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target update to(a)
+      a++;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SINGLE DIRECTIVE
 #pragma omp single
@@ -3950,6 +6761,27 @@
   }
 #pragma omp single
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp single
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3965,6 +6797,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp single
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // MASTER DIRECTIVE
 #pragma omp master
@@ -4102,6 +6969,27 @@
   }
 #pragma omp master
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp master
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4117,6 +7005,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp master
+  {
+#pragma omp target update to(a)
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // CRITICAL DIRECTIVE
 #pragma omp critical
@@ -4273,6 +7196,27 @@
   }
 #pragma omp critical
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp critical
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4288,6 +7232,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp critical
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // PARALLEL FOR DIRECTIVE
 #pragma omp parallel for
@@ -4440,6 +7419,27 @@
   }
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4455,6 +7455,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // OK
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL FOR SIMD DIRECTIVE
 #pragma omp parallel for simd
@@ -4465,7 +7500,7 @@
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd// expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -4607,6 +7642,27 @@
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -4622,6 +7678,41 @@
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    a++;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL SECTIONS DIRECTIVE
 #pragma omp parallel sections
@@ -4761,6 +7852,25 @@
   }
 #pragma omp parallel sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp parallel sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4776,6 +7886,40 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp parallel sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TASK DIRECTIVE
 #pragma omp task
@@ -4864,6 +8008,25 @@
   }
 #pragma omp task
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp task
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp task
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp task
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp task
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4879,6 +8042,41 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp task
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // ATOMIC DIRECTIVE
 #pragma omp atomic
@@ -5045,6 +8243,35 @@
   // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
   // expected-note@+1 {{expected an expression statement}}
   {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
@@ -5064,6 +8291,52 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target update // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TARGET DIRECTIVE
 #pragma omp target
@@ -5155,11 +8428,28 @@
   }
 #pragma omp target
   {
-#pragma omp target
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
   {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
 #pragma omp teams
     ++a;
   }
@@ -5181,6 +8471,434 @@
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    a++;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp teams distribute // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL DIRECTIVE
+#pragma omp target parallel
+#pragma omp parallel
+  bar();
+#pragma omp target parallel
+#pragma omp for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel region}}
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp single
+  bar();
+
+#pragma omp target parallel
+#pragma omp master
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp critical
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp task
+  {
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp barrier
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+  {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+    ++a;
+#pragma omp teams  // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target parallel' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL FOR DIRECTIVE
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single // OK
+      {
+        bar();
+      }
+#pragma omp for // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp for simd // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp sections // OK
+      {
+        bar();
+      }
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel for ordered
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // OK
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    a++;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // TEAMS DIRECTIVE
 #pragma omp target
@@ -5298,6 +9016,27 @@
 #pragma omp target
 #pragma omp teams
   {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target enter data' directive into a parallel region?}}
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target exit data' directive into a parallel region?}}
+  }
+#pragma omp target
+#pragma omp teams
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -5321,6 +9060,61 @@
 #pragma omp distribute
   for (int j = 0; j < 10; ++j)
     ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target update' directive into a parallel region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target simd // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target simd' directive into a parallel region?}}
+  for (int i = 0; i < 10; ++i)
+    ;
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+  for (int i = 0; i < 10; ++i)
+    ;
+  }
 
 // TASKLOOP DIRECTIVE
 #pragma omp taskloop
@@ -5463,6 +9257,27 @@
   }
 #pragma omp taskloop
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -5472,6 +9287,47 @@
   for (int i = 0; i < 10; ++i)
     ++a;
   }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // DISTRIBUTE DIRECTIVE
 #pragma omp target
@@ -5651,16 +9507,1258 @@
 #pragma omp teams
 #pragma omp distribute
   for (int i = 0; i < 10; ++i) {
-#pragma omp target
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
 #pragma omp teams
 #pragma omp distribute
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
   return foo<int>();
-}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
 
+  // DISTRIBUTE PARALLEL FOR DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a distribute parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+  return foo<int>();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+
+// DISTRIBUTE PARALLEL FOR SIMD DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+  return foo<int>();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// DISTRIBUTE SIMD DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+  return foo<int>();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET SIMD DIRECTIVE
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TEAMS DISTRIBUTE DIRECTIVE
+#pragma omp teams distribute // expected-error {{orphaned 'omp teams distribute' directives are prohibited; perhaps you forget to enclose the directive into a target region?}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a teams distribute region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // OK
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // OK
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // OK
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // OK
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // OK
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel  // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target simd // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams distribute // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp teams distribute' directive into a target region?}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+  return foo<int>();
+}
diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp
new file mode 100644
index 0000000..c4df636
--- /dev/null
+++ b/test/OpenMP/nvptx_target_codegen.cpp
@@ -0,0 +1,581 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[OMP_NT:@.+]] = common addrspace(3) global i32 0
+// CHECK-DAG: [[OMP_WID:@.+]] = common addrspace(3) global i64 0
+
+template<typename tx, typename ty>
+struct TT{ // Two-member aggregate; foo() below captures a TT<long long, char> in a target region.
+  tx X;
+  ty Y;
+};
+
+int foo(int n) { // Four target regions (plain, if(0), if(1), if(n>20)), each preceded by its expected device IR.
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n]; // variable-length array, length n
+  double c[5][10];
+  double cn[5][n]; // variable-length inner dimension
+  TT<long long, char> d;
+  // NOTE: the l<N> suffix in each expected kernel name equals this file's line number of the matching '#pragma omp target'.
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l86}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l86]]()
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T1]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  #pragma omp target // empty region, nothing captured
+  {
+  }
+
+  // CHECK-NOT: define {{.*}}void [[T2:@__omp_offloading_.+foo.+]]_worker()
+  #pragma omp target if(0) // constant-false condition: the negative match above expects no worker function here
+  {
+  }
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l157}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l157]](i[[SZ:32|64]] [[ARG1:%.+]])
+  // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
+  // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
+  // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T3]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK: load i16, i16* [[AA_CADDR]],
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  #pragma omp target if(1) // captures only aa, expected as a single i32/i64 kernel argument
+  {
+    aa += 1;
+  }
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l260}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+foo.+l260]](i[[SZ]]
+  // Create local storage for each capture.
+  // CHECK:    [[LOCAL_A:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_B:%.+]] = alloca [10 x float]*
+  // CHECK:    [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_BN:%.+]] = alloca float*
+  // CHECK:    [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]*
+  // CHECK:    [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_VLA3:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_CN:%.+]] = alloca double*
+  // CHECK:    [[LOCAL_D:%.+]] = alloca [[TT:%.+]]*
+  // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG: store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]]
+  // CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]]
+  // CHECK-DAG: store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]]
+  // CHECK-DAG: store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]]
+  // CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]]
+  // CHECK-DAG: store i[[SZ]] [[ARG_VLA3:%.+]], i[[SZ]]* [[LOCAL_VLA3]]
+  // CHECK-DAG: store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]]
+  // CHECK-DAG: store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]]
+  // Form the values used in the region: a cast of a's slot (64-bit only) and reloads of the pointers and VLA sizes.
+  // CHECK-64-DAG: [[REF_A:%.+]] = bitcast i64* [[LOCAL_A]] to i32*
+  // CHECK-DAG:    [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]],
+  // CHECK-DAG:    [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
+  // CHECK-DAG:    [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]],
+  // CHECK-DAG:    [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]],
+  // CHECK-DAG:    [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
+  // CHECK-DAG:    [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA3]],
+  // CHECK-DAG:    [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]],
+  // CHECK-DAG:    [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]],
+  //
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T4]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  //
+  // Use captures.
+  // CHECK-64-DAG:  load i32, i32* [[REF_A]]
+  // CHECK-32-DAG:  load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:  getelementptr inbounds [10 x float], [10 x float]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  // CHECK-DAG:  getelementptr inbounds float, float* [[REF_BN]], i[[SZ]] 3
+  // CHECK-DAG:  getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK-DAG:  getelementptr inbounds double, double* [[REF_CN]], i[[SZ]] %{{.+}}
+  // CHECK-DAG:     getelementptr inbounds [[TT]], [[TT]]* [[REF_D]], i32 0, i32 0
+  //
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  #pragma omp target if(n>20) // captures a, b, bn, c, cn and d
+  {
+    a += 1;
+    b[2] += 1.0;
+    bn[3] += 1.0;
+    c[1][2] += 1.0;
+    cn[1][3] += 1.0;
+    d.X += 1;
+    d.Y += 1;
+  }
+
+  return a;
+}
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+  // The region below uses a, aa and b; bar() instantiates this template with tx = int.
+  #pragma omp target if(n>40)
+  {
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  short aa = 0;
+  char aaa = 0;
+  int b[10];
+  // The region below uses a, aa, aaa and b; its kernel is matched after bar() under a name containing "static" and "l297".
+  #pragma omp target if(n>50)
+  {
+    a += 1;
+    aa += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+struct S1 {
+  double a;
+  // Member function whose target region uses 'this', the local b and the variable-length array c.
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n]; // variable-length inner dimension
+
+    #pragma omp target if(n>60)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a; // 'a' here names the member this->a
+    }
+
+    return c[1][1] + (int)b;
+  }
+};
+
+int bar(int n) {
+  // Call every function above that contains a target region, in declaration-independent fixed order.
+  int sum = foo(n);
+
+  S1 obj;
+  sum += obj.r1(n);
+
+  sum += fstatic(n);
+
+  sum += ftemplate<int>(n);
+
+  // Hand back the accumulated results of the four calls.
+  return sum;
+}
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+l297}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+static.+l297]](i[[SZ]]
+  // Create local storage for each capture.
+  // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_AAA:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_B:%.+]] = alloca [10 x i32]*
+  // CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AAA:%.+]], i[[SZ]]* [[LOCAL_AAA]]
+  // CHECK-DAG:  store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+  // Form the values used in the region: cast or reload each capture from its local slot.
+  // CHECK-64-DAG:   [[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+  // CHECK-DAG:      [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+  // CHECK-DAG:      [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8*
+  // CHECK-DAG:      [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+  //
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T5]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  //
+  // CHECK-64-DAG: load i32, i32* [[REF_A]]
+  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:    load i16, i16* [[REF_AA]]
+  // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  //
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l315}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+S1.+l315]](
+  // Create local storage for each capture.
+  // CHECK:       [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
+  // CHECK:       [[LOCAL_B:%.+]] = alloca i[[SZ]]
+  // CHECK:       [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]
+  // CHECK:       [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]
+  // CHECK:       [[LOCAL_C:%.+]] = alloca i16*
+  // CHECK-DAG:   store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]]
+  // CHECK-DAG:   store i[[SZ]] [[ARG_B:%.+]], i[[SZ]]* [[LOCAL_B]]
+  // CHECK-DAG:   store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]]
+  // CHECK-DAG:   store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]]
+  // CHECK-DAG:   store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]]
+  // Form the values used in the region: cast or reload each capture from its local slot.
+  // CHECK-DAG:   [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]],
+  // CHECK-64-DAG:[[REF_B:%.+]] = bitcast i[[SZ]]* [[LOCAL_B]] to i32*
+  // CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
+  // CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
+  // CHECK-DAG:   [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]],
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T6]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // Use captures.
+  // CHECK-DAG:   getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0
+  // CHECK-64-DAG:load i32, i32* [[REF_B]]
+  // CHECK-32-DAG:load i32, i32* [[LOCAL_B]]
+  // CHECK-DAG:   getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}}
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l280}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T7:@__omp_offloading_.+template.+l280]](i[[SZ]]
+  // Create local storage for each capture.
+  // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_B:%.+]] = alloca [10 x i32]*
+  // CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+  // CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+  // Form the values used in the region: cast or reload each capture from its local slot.
+  // CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+  // CHECK-DAG:   [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+  // CHECK-DAG:   [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+  //
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T7]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  //
+  // CHECK-64-DAG: load i32, i32* [[REF_A]]
+  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:    load i16, i16* [[REF_AA]]
+  // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  //
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+#endif
diff --git a/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
new file mode 100644
index 0000000..5dcff8e
--- /dev/null
+++ b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -0,0 +1,223 @@
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx, typename ty> // Two-member aggregate; foo() below instantiates it as TT<long long, char>.
+struct TT{
+  tx X; // first member, of type tx
+  ty Y; // second member, of type ty
+};
+
+// TCHECK:  [[TT:%.+]] = type { i64, i8 }
+// TCHECK:  [[S1:%.+]] = type { double }
+
+int foo(int n, double *ptr) { // Holds three target regions with firstprivate clauses; returns a.
+  int a = 0; // firstprivate in the first region
+  short aa = 0; // firstprivate in the second region
+  float b[10]; // firstprivate in the second region
+  double c[5][10]; // firstprivate in the second region
+  TT<long long, char> d; // firstprivate in the second region
+  
+  #pragma omp target firstprivate(a)
+  {
+  }
+  
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]])
+  // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT:  alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+  // TCHECK:  ret void  
+
+#pragma omp target firstprivate(aa,b,c,d)
+  {
+    aa += 1;
+    b[2] = 1.0;
+    c[1][2] = 1.0;
+    d.X = 1;
+    d.Y = 1;    
+  }
+  
+  // Make sure that firstprivate variables are generated in all cases and that we use those instances for operations inside the
+  // target region.
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A2_IN:%.+]], [10 x float]* {{.+}} [[B_IN:%.+]], [5 x [10 x double]]* {{.+}} [[C_IN:%.+]], [[TT]]* {{.+}} [[D_IN:%.+]])
+  // TCHECK:  [[A2_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[B_ADDR:%.+]] = alloca [10 x float]*,
+  // TCHECK:  [[C_ADDR:%.+]] = alloca [5 x [10 x double]]*,
+  // TCHECK:  [[D_ADDR:%.+]] = alloca [[TT]]*,
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  [[B_PRIV:%.+]] = alloca [10 x float],
+  // TCHECK:  [[C_PRIV:%.+]] = alloca [5 x [10 x double]],
+  // TCHECK:  [[D_PRIV:%.+]] = alloca [[TT]],
+  // TCHECK:  store i{{[0-9]+}} [[A2_IN]], i{{[0-9]+}}* [[A2_ADDR]],
+  // TCHECK:  store [10 x float]* [[B_IN]], [10 x float]** [[B_ADDR]],
+  // TCHECK:  store [5 x [10 x double]]* [[C_IN]], [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  store [[TT]]* [[D_IN]], [[TT]]** [[D_ADDR]],
+  // TCHECK:  [[CONV_A2ADDR:%.+]] = bitcast i{{[0-9]+}}* [[A2_ADDR]] to i{{[0-9]+}}*
+  // TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x float]*, [10 x float]** [[B_ADDR]],
+  // TCHECK:  [[C_ADDR_REF:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  [[D_ADDR_REF:%.+]] = load [[TT]]*, [[TT]]** [[D_ADDR]],
+
+  // firstprivate(aa): a_priv = a_in
+
+  //  firstprivate(b): memcpy(b_priv,b_in)
+  // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
+  // TCHECK:  [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+
+  // firstprivate(c)
+  // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
+  // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  
+  // firstprivate(d)
+  // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
+  // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+
+  // TCHECK: load i16, i16* [[CONV_A2ADDR]],
+
+  
+  #pragma omp target firstprivate(ptr)
+  {
+    ptr[0]++;
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]])
+  // TCHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // TCHECK-NOT: alloca double*,
+  // TCHECK:  store double* [[PTR_IN]], double** [[PTR_ADDR]],
+  // TCHECK:  [[PTR_IN_REF:%.+]] = load double*, double** [[PTR_ADDR]],
+  // TCHECK-NOT:  store double* [[PTR_IN_REF]], double** {{%.+}},
+  // The destination of the negative directive above is a regex: no earlier directive names a private copy of ptr.
+  return a;
+}
+
+
+template<typename tx>
+tx ftemplate(int n) { // Templated variant; bar() calls it as ftemplate<int>. The parameter n is not used in the body.
+  tx a = 0; // scalar listed in the firstprivate clause
+  tx b[10]; // array listed in the firstprivate clause
+
+#pragma omp target firstprivate(a,b)
+  {
+    a += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) { // Internal-linkage variant called from bar(); the parameter n is not used in the body.
+  int a = 0; // int scalar listed in the firstprivate clause
+  char aaa = 0; // char scalar listed in the firstprivate clause
+  int b[10]; // array listed in the firstprivate clause
+
+#pragma omp target firstprivate(a,aaa,b)
+  {
+    a += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[A3_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A3_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT:  alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store i{{[0-9]+}} [[A3_IN]], i{{[0-9]+}}* [[A3_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[A3_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A3_ADDR]] to i8*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a): a_priv = a_in
+
+// firstprivate(aaa)
+
+// TCHECK-NOT:  store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}*
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK:  ret void
+
+struct S1 {
+  double a; // written through this-> inside the target region of r1()
+
+  int r1(int n){ // Member-function variant: the target region uses this and the firstprivate local b.
+    int b = n+1;
+
+#pragma omp target firstprivate(b)
+    {
+      this->a = (double)b + 1.5;
+    }
+
+    return (int)b;
+  }
+
+  // TCHECK: define void @__omp_offloading_{{.+}}([[S1]]* [[TH:%.+]], i{{[0-9]+}} [[B_IN:%.+]])
+  // TCHECK:  [[TH_ADDR:%.+]] = alloca [[S1]]*,
+  // TCHECK:  [[B_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+
+  // TCHECK:  store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[B_ADDR]],
+  // TCHECK:  [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]],
+  // TCHECK-64:  [[B_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[B_ADDR]] to i{{[0-9]+}}*
+
+  // firstprivate(b)
+  // TCHECK-NOT:  store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}*
+
+  // TCHECK: ret void
+};
+
+
+
+int bar(int n, double *ptr){ // Driver: sums the results of foo, S1::r1, fstatic and ftemplate<int>.
+  int a = 0;
+  a += foo(n, ptr);
+  S1 S;
+  a += S.r1(n);
+  a += fstatic(n); // presumably this call is what forces the static function to be emitted
+  a += ftemplate<int>(n); // instantiates the template with tx = int
+
+  return a;
+}
+
+// template
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT: alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a)
+// TCHECK-NOT:  store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}*
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK: ret void
+
+#endif
diff --git a/test/OpenMP/nvptx_teams_codegen.cpp b/test/OpenMP/nvptx_teams_codegen.cpp
new file mode 100644
index 0000000..b26d47c
--- /dev/null
+++ b/test/OpenMP/nvptx_teams_codegen.cpp
@@ -0,0 +1,132 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#ifdef CK1
+
+template <typename T>
+int tmain(T argc) { // main() below calls this with argv, so T is deduced as char **.
+#pragma omp target
+#pragma omp teams
+  argc = 0; // this single statement is the body of the teams region
+  return 0;
+}
+
+
+int main (int argc, char **argv) { // Non-template case: the teams region assigns to the int argc.
+#pragma omp target
+#pragma omp teams
+  {
+  argc = 0;
+  }
+  return tmain(argv); // also instantiates the template version with a char ** argument
+}
+
+// only nvptx side: do not outline teams region and do not call fork_teams
+// CK1:  define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
+// CK1:  {{.+}} = alloca i{{[0-9]+}}*,
+// CK1:  {{.+}} = alloca i{{[0-9]+}}*,
+// CK1:  [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
+// CK1:  [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
+// CK1:  store {{.+}} 0, {{.+}},
+// CK1:  store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK1-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
+// CK1-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
+// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK1:  ret void
+// CK1-NEXT: }
+
+// target region in template
+// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]])
+// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
+// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
+// CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
+// CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]],
+// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]],
+// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
+// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK1:  ret void
+// CK1-NEXT: }
+
+
+#endif // CK1
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// expected-no-diagnostics
+#ifdef CK2
+
+template <typename T>
+int tmain(T argc) { // Same shape as main() below, but argc has the deduced type T (char ** at the call site).
+  int a = 10; // value passed to num_teams
+  int b = 5; // value passed to thread_limit
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) { // Teams region with num_teams and thread_limit clauses on an int argc.
+  int a = 20; // value passed to num_teams
+  int b = 5; // value passed to thread_limit
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return tmain(argv); // also instantiates the template version with a char ** argument
+}
+
+// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
+// CK2: {{.}} = alloca i{{[0-9]+}}*,
+// CK2: {{.}} = alloca i{{[0-9]+}}*,
+// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
+// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2-NOT:  {{%.+}} = call i32 @__kmpc_global_thread_num(
+// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
+// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
+// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK2-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
+// CK2-NOT:  {{.+}} = call i32 @__kmpc_push_num_teams(
+// CK2-NOT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK2: ret
+
+// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}}** [[ARGC:%.+]])
+// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***,
+// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}**,
+// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
+// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
+// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
+// CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
+// CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
+// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]],
+// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
+// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
+// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK2:  ret void
+
+#endif // CK2
+#endif
diff --git a/test/OpenMP/ordered_doacross_codegen.cpp b/test/OpenMP/ordered_doacross_codegen.cpp
new file mode 100644
index 0000000..d1fe99d
--- /dev/null
+++ b/test/OpenMP/ordered_doacross_codegen.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// REQUIRES: x86-registered-target
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[KMP_DIM:%.+]] = type { i64, i64, i64 }
+extern int n;
+int a[10], b[10], c[10], d[10];
+void foo();
+
+// CHECK-LABEL: @main()
+int main() { // One ordered(1) worksharing loop containing a depend(source) and a depend(sink) directive.
+  int i; // loop counter of the ordered loop
+// CHECK: [[DIMS:%.+]] = alloca [[KMP_DIM]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK: icmp
+// CHECK-NEXT: br i1 %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
+// CHECK: store i64 %{{.+}}, i64* %
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
+// CHECK: store i64 1, i64* %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK: call void @__kmpc_for_static_init_4(
+#pragma omp for ordered(1)
+  for (i = 0; i < n; ++i) { // n is the extern int declared at file scope
+    a[i] = b[i] + 1;
+    foo(); // call that precedes the depend(source) directive
+// CHECK: invoke void [[FOO:.+]](
+// CHECK: load i32, i32* [[CNT:%.+]],
+// CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(source)
+    c[i] = c[i] + 1;
+    foo(); // call that precedes the depend(sink) directive
+// CHECK: invoke void [[FOO]]
+// CHECK: load i32, i32* [[CNT]],
+// CHECK-NEXT: sub nsw i32 %{{.+}}, 2
+// CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(sink : i - 2)
+    d[i] = a[i - 2]; // reads the element at the iteration named by the sink clause (i - 2)
+  }
+  // CHECK: landingpad
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: br label %
+
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: ret i32 0
+  return 0;
+}
+
+// CHECK: define {{.+}}TestStruct
+template <typename T>
+struct TestStruct { // Instantiated below as TestStruct<int>; its constructor holds a doubly nested ordered(2) loop.
+  static const int M = 10; // upper bound of the outer j loop and second array extent
+  static const int N = 20; // first array extent
+  T i; // data member used as the inner loop counter in the constructor
+  T a[N][M];
+  T b[N][M];
+  T foo(T, T); // declared only in the visible code
+  T bar(T, T, T); // declared only in the visible code
+  void baz(T, T); // declared only in the visible code
+  TestStruct() {
+// CHECK: [[CNT:%.+]] = alloca i64,
+// CHECK: [[DIMS:%.+]] = alloca [[KMP_DIM]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK: icmp
+// CHECK-NEXT: br i1 %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
+// CHECK: store i64 %{{.+}}, i64* %
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
+// CHECK: store i64 1, i64* %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK: call void @__kmpc_for_static_init_8(
+#pragma omp for ordered(2)
+    for (T j = 0; j < M; j++)
+      for (i = 0; i < n; i += 2) { // inner loop steps by 2; n is the extern int declared at file scope
+        a[i][j] = foo(i, j);
+// CHECK: invoke {{.+TestStruct.+foo}}
+// CHECK: load i64, i64* [[CNT]],
+// CHECK-NEXT: sub nsw i64 %{{.+}}, 1
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NEXT: load i64, i64* [[CNT]],
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: mul nsw i32 1, %
+// CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-NEXT: sub nsw i64 %
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(sink : j, i - 2) depend(sink : j - 1, i)
+        b[i][j] = bar(a[i][j], b[i - 1][j], b[i][j - 1]);
+// CHECK: invoke {{.+TestStruct.+bar}}
+// CHECK: load i64, i64* [[CNT]],
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(source)
+        baz(a[i][j], b[i][j]);
+      }
+  }
+  // CHECK: landingpad
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: br label %
+
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: ret
+};
+
+TestStruct<int> s;
+#endif // HEADER
diff --git a/test/OpenMP/parallel_ast_print.cpp b/test/OpenMP/parallel_ast_print.cpp
index 1e46fba..8a15339 100644
--- a/test/OpenMP/parallel_ast_print.cpp
+++ b/test/OpenMP/parallel_ast_print.cpp
@@ -8,6 +8,113 @@
 
 void foo() {}
 
+struct S1 { // Base/value type for the S7 and S8 tests below.
+  S1(): a(0) {}
+  S1(int v) : a(v) {}
+  int a; // read as a.a by the loops in S7 and S8
+  typedef int type; // S7's public constructor takes a typename T::type
+  S1& operator +(const S1&); // the four operators are declared only; they match the
+  S1& operator *(const S1&); // reduction operators (+, *, &&, ^) used on S1 objects
+  S1& operator &&(const S1&); // in the S7 and S8 reduction clauses
+  S1& operator ^(const S1&);
+};
+
+template <typename T>
+class S7 : public T { // Derives from its own template argument, so T::a names the base-class member.
+protected:
+  T a; // member of type T; distinct from the a inherited from T
+  T b[100]; // array used in the reduction clauses
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) { // Constructor: each clause kind is spelled as a, this->a and T::a.
+#pragma omp parallel private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a) firstprivate(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel shared(a) shared(this->a) shared(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel reduction(+ : a) reduction(*: b[:])
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // Assignment: same clause kinds, spelled as a and this->a only; loops run on s.a.a.
+#pragma omp parallel private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel shared(a) shared(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel reduction(&& : this->a) reduction(^: b[s.a.a])
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(this->S1::a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(this->S1::a)
+// CHECK: #pragma omp parallel reduction(+: this->a) reduction(*: this->b[:])
+// CHECK: #pragma omp parallel private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(T::a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(T::a)
+// CHECK: #pragma omp parallel reduction(+: this->a) reduction(*: this->b[:])
+// CHECK: #pragma omp parallel private(this->a) private(this->a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a)
+// CHECK: #pragma omp parallel reduction(&&: this->a) reduction(^: this->b[s.a.a])
+
+class S8 : public S7<S1> { // Non-template class derived from S7<S1>; clauses name members via S7 < S1 > ::.
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){ // Constructor: each clause kind is spelled as a, this->a and S7 < S1 > ::a.
+#pragma omp parallel private(a) private(this->a) private(S7 < S1 > ::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a) firstprivate(S7 < S1 > ::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel shared(a) shared(this->a) shared(S7 < S1 > ::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel reduction(^ : S7 < S1 > ::a) reduction(+ : S7 < S1 > ::b[ : S7 < S1 > ::a.a])
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) { // Assignment: same clause kinds, spelled as a and this->a only; loops run on s.a.a.
+#pragma omp parallel private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel shared(a) shared(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel reduction(* : this->a) reduction(&&:this->b[a.a:])
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(this->S7<S1>::a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(this->S7<S1>::a)
+// CHECK: #pragma omp parallel reduction(^: this->S7<S1>::a) reduction(+: this->S7<S1>::b[:this->S7<S1>::a.a])
+// CHECK: #pragma omp parallel private(this->a) private(this->a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a)
+// CHECK: #pragma omp parallel reduction(*: this->a) reduction(&&: this->b[this->a.a:])
+
 template <class T>
 struct S {
   operator T() {return T();}
@@ -120,4 +227,7 @@
   }
 }
 
+template<typename T> // Out-of-class definition of the member TS of the class template S.
+T S<T>::TS = 0;
+
 #endif
diff --git a/test/OpenMP/parallel_copyin_codegen.cpp b/test/OpenMP/parallel_copyin_codegen.cpp
index ff76cfe..49e7b3f 100644
--- a/test/OpenMP/parallel_copyin_codegen.cpp
+++ b/test/OpenMP/parallel_copyin_codegen.cpp
@@ -87,10 +87,6 @@
   // TLS-LAMBDA:     [[G_CPY_VAL:%.+]] = call{{( cxx_fast_tlscc)?}} i{{[0-9]+}}* [[G_CTOR:@.+]]()
   // TLS-LAMBDA:     call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G_CPY_VAL]])
 
-  // TLS-LAMBDA:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
-  // TLS-LAMBDA:     ret i{{[0-9]+}}* [[G]]
-  // TLS-LAMBDA:     }
-
 #pragma omp parallel copyin(g)
   {
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
@@ -122,6 +118,11 @@
     g = 1;
     // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}*
     // TLS-LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}*
+
+    // TLS-LAMBDA:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
+    // TLS-LAMBDA:     ret i{{[0-9]+}}* [[G]]
+    // TLS-LAMBDA:     }
+
     [&]() {
       // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
       // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
@@ -149,9 +150,6 @@
   // TLS-BLOCKS:     [[G_CPY_VAL:%.+]] = call{{( cxx_fast_tlscc)?}} i{{[0-9]+}}* [[G_CTOR:@.+]]()
   // TLS-BLOCKS:     call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G_CPY_VAL]])
 
-  // TLS-BLOCKS:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
-  // TLS-BLOCKS:     ret i{{[0-9]+}}* [[G]]
-  // TLS-BLOCKS:     }
 #pragma omp parallel copyin(g)
   {
     // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
@@ -189,6 +187,10 @@
     // TLS-BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_CAPTURE_DST]]
     // TLS-BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // TLS-BLOCKS: call {{.*}}void {{%.+}}(i8
+
+    // TLS-BLOCKS:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
+    // TLS-BLOCKS:     ret i{{[0-9]+}}* [[G]]
+    // TLS-BLOCKS:     }
     ^{
       // BLOCKS: define {{.+}} void {{@.+}}(i8*
       // TLS-BLOCKS: define {{.+}} void {{@.+}}(i8*
diff --git a/test/OpenMP/parallel_firstprivate_codegen.cpp b/test/OpenMP/parallel_firstprivate_codegen.cpp
index d0da8ce..d20f3f5 100644
--- a/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -1,8 +1,15 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA -check-prefix=LAMBDA-32 %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS -check-prefix=BLOCKS-32 %s
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA -check-prefix=LAMBDA-64 %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS -check-prefix=BLOCKS-64 %s
+
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
 // expected-no-diagnostics
 #ifndef ARRAY
@@ -18,6 +25,60 @@
 
 volatile int g __attribute__((aligned(128))) = 1212;
 
+struct SS { // Members of four kinds (int, bit-field, reference, array) named in firstprivate clauses.
+  int a; // plain int member
+  int b : 4; // 4-bit bit-field member
+  int &c; // reference member, bound to the constructor argument d
+  int e[4]; // array member; not in the constructor's initializer list
+  SS(int &d) : a(0), b(0), c(d) { // body is one parallel region whose statement depends on the build macro
+#pragma omp parallel firstprivate(a, b, c, e)
+#ifdef LAMBDA
+    [&]() { // lambda variant: members are accessed as this->a, b and (this)->c
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel firstprivate(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#elif defined(BLOCKS)
+    ^{ // block variant: same operations, with a nested parallel region
+      ++a;
+      --this->b;
+      (this)->c /= 1;
+#pragma omp parallel firstprivate(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1, e[2] = 1111; // default variant: also writes the array member
+#endif
+  }
+};
+
+template<typename T>
+struct SST { // Template counterpart of the member-firstprivate test with a single member.
+  T a; // value-initialized via T() in the constructor
+  SST() : a(T()) { // body is one parallel region whose statement depends on the build macro
+#pragma omp parallel firstprivate(a)
+#ifdef LAMBDA
+    [&]() { // lambda variant: two nested lambdas around the member access
+      [&]() {
+        ++this->a;
+#pragma omp parallel firstprivate(a)
+        ++(this)->a;
+      }();
+    }();
+#elif defined(BLOCKS)
+    ^{ // block variant: two nested blocks around the member access
+      ^{
+        ++a;
+#pragma omp parallel firstprivate(a)
+        ++(this)->a;
+      }();
+    }();
+#else
+    ++(this)->a; // default variant: a plain increment of the member
+#endif
+  }
+};
+
 template <class T>
 struct S {
   T f;
@@ -28,14 +89,17 @@
   ~S() {}
 };
 
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T();
   T vec[] __attribute__((aligned(128))) = {1, 2};
   S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
@@ -52,33 +116,79 @@
 
 int main() {
   static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 2, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G]], {{.+}})
 #pragma omp parallel firstprivate(g, sivar)
   {
-    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}})
-    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz:i64|i32]], {{i64|i32}}, {{i64|i32}}, [4 x i{{[0-9]+}}]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}})
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV]] to i32*
+    // LAMBDA-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+    // LAMBDA-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+    // LAMBDA-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV]] to i32*
+    // LAMBDA-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV]] to i32*
+    // LAMBDA-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+    // LAMBDA-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
+    // LAMBDA-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+    // LAMBDA-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+    // LAMBDA-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: ret void
+
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[iz]] {{.*}}%{{.+}})
     // LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
     // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR:%.+]]
-    // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_REF_ADDR:%.+]]
+    // LAMBDA-64: [[SIVAR_PRIVATE_CONV:%.+]] = bitcast i64* [[SIVAR_PRIVATE_ADDR]] to i32*
     // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G_REF]], align 128
     // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]], align 128
-    // LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
-    // LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
-    // LAMBDA: call {{.*}}void @__kmpc_barrier(
+    // LAMBDA-NOT: call {{.*}}void @__kmpc_barrier(
     g = 1;
     sivar = 2;
     // LAMBDA: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
-    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
+    // LAMBDA-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]],
+    // LAMBDA-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
     // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
     // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
     // LAMBDA: [[SIVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
-    // LAMBDA: store i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA-64: store i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA-32: store i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
     // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
     [&]() {
       // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
@@ -98,33 +208,32 @@
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call {{.*}}void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
   // BLOCKS: call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 2, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G]], {{.+}})
 #pragma omp parallel firstprivate(g, sivar)
   {
-    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}})
-    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[iz:i64|i32]] {{.*}}%{{.+}})
     // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
     // BLOCKS: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR:%.+]]
-    // BLOCKS: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_REF_ADDR:%.+]]
+    // BLOCKS-64: [[SIVAR_PRIVATE_CONV:%.+]] = bitcast i64* [[SIVAR_PRIVATE_ADDR]] to i32*
     // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G_REF]], align 128
     // BLOCKS: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]], align 128
-    // BLOCK: [[SIVAR_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
-    // BLOCK: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_REF_ADDR]]
-    // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]],
-    // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
-    // BLOCKS: call {{.*}}void @__kmpc_barrier(
+    // BLOCKS-NOT: call {{.*}}void @__kmpc_barrier(
     g = 1;
     sivar = 2;
     // BLOCKS: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
-    // BLOCKS: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
+    // BLOCKS-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]],
+    // BLOCKS-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
-    // BLOCKS: i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+    // BLOCKS-64: i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]]
+    // BLOCKS-32: i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
     // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
     // BLOCKS: call {{.*}}void {{%.+}}(i8
     ^{
@@ -142,6 +251,48 @@
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz]], [[iz]], [[iz]], [4 x i{{[0-9]+}}]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+// BLOCKS: call{{.*}} void
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}})
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV]] to i32*
+// BLOCKS-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// BLOCKS-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+// BLOCKS-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV]] to i32*
+// BLOCKS-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV]] to i32*
+// BLOCKS-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+// BLOCKS-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
+// BLOCKS-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+// BLOCKS-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+// BLOCKS-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: ret void
 #else
   S<float> test;
   int t_var = 0;
@@ -162,27 +313,40 @@
 
 // CHECK: define {{.*}}i{{[0-9]+}} @main()
 // CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[T_VAR:%.+]] = alloca i32,
+// CHECK: [[T_VARCAST:%.+]] = alloca [[iz:i64|i32]],
+// CHECK: [[SIVARCAST:%.+]] = alloca [[iz]],
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i32*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}}*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
+// CHECK-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST]] to i32*
+// CHECK-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST]],
+// CHECK: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST]],
+// CHECK: [[SIVARVAL:%.+]] = load i32, i32* @{{.+}},
+// CHECK-64: [[SIVARCONV:%.+]] = bitcast i64* [[SIVARCAST]] to i32*
+// CHECK-64: store i32 [[SIVARVAL]], i32* [[SIVARCONV]],
+// CHECK-32: store i32 [[SIVARVAL]], i32* [[SIVARCAST]],
+// CHECK: [[SIVARPVT:%.+]] = load [[iz]], [[iz]]* [[SIVARCAST]],
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, [[iz]], [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[MAIN_MICROTASK:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]],{{.*}}[[iz]] [[SIVARPVT]]
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
 //
-// CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) [[SIVAR:%.+]])
+// CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [[iz]] {{.*}}%{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, [[iz]] {{.*}}[[SIVAR:%.+]])
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-64: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_PRIV]] to i32*
 // CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** %
 // CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
-// CHECK: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %{{.+}},
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
-// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
@@ -200,9 +364,8 @@
 // CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// CHECK: [[SIVAR_REF_ADDR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]],
-// CHECK: store i{{[0-9]+}} [[SIVAR_REF_ADDR]], i{{[0-9]+}}* [[SIVAR7_PRIV]],
-// CHECK: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
+// CHECK-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_CONV]],
+// CHECK-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
 
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
@@ -214,6 +377,55 @@
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz]], [[iz]], [[iz]], [4 x i32]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[E_PRIV:%.+]] = alloca [4 x i{{[0-9]+}}],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[A_PRIV]]
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[B_PRIV]]
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[C_PRIV]]
+// CHECK-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV:%.+]] to i32*
+// CHECK-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// CHECK-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+// CHECK-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV:%.+]] to i32*
+// CHECK-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV:%.+]] to i32*
+// CHECK-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+// CHECK-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
+// CHECK: bitcast [4 x i{{[0-9]+}}]* [[E_PRIV]] to i8*
+// CHECK: bitcast [4 x i{{[0-9]+}}]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy
+// CHECK: store [4 x i{{[0-9]+}}]* [[E_PRIV]], [4 x i{{[0-9]+}}]** [[REFE:%.+]],
+// CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+// CHECK-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+// CHECK-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[E_PRIV:%.+]] = load [4 x i{{[0-9]+}}]*, [4 x i{{[0-9]+}}]** [[REFE]],
+// CHECK-NEXT: [[E_PRIV_2:%.+]] = getelementptr inbounds [4 x i{{[0-9]+}}], [4 x i{{[0-9]+}}]* [[E_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK-NEXT: store i32 1111, i32* [[E_PRIV_2]],
+// CHECK-NEXT: ret void
+
 // CHECK: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
@@ -244,9 +456,7 @@
 // CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call {{.*}}void @__kmpc_barrier(
 // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
@@ -270,13 +480,13 @@
 void array_func(float a[3], St s[2], int n, long double vla1[n]) {
   double vla2[n][n] __attribute__((aligned(128)));
 // ARRAY: @__kmpc_fork_call(
-// ARRAY-DAG: [[PRIV_A:%.+]] = alloca float**,
-// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St**,
-// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80**,
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
+// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_A:%.+]] = alloca float*,
 // ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
-// ARRAY-DAG: store float** %{{.+}}, float*** [[PRIV_A]],
-// ARRAY-DAG: store %struct.St** %{{.+}}, %struct.St*** [[PRIV_S]],
-// ARRAY-DAG: store x86_fp80** %{{.+}}, x86_fp80*** [[PRIV_VLA1]],
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY-DAG: store float* %{{.+}}, float** [[PRIV_A]],
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
@@ -288,11 +498,11 @@
 
 // ARRAY-LABEL: St_func
 // ARRAY: @__kmpc_fork_call(
-// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St**,
-// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80**,
+// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
 // ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
-// ARRAY-DAG: store %struct.St** %{{.+}}, %struct.St*** [[PRIV_S]],
-// ARRAY-DAG: store x86_fp80** %{{.+}}, x86_fp80*** [[PRIV_VLA1]],
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
diff --git a/test/OpenMP/parallel_for_ast_print.cpp b/test/OpenMP/parallel_for_ast_print.cpp
index c4be521..2476ee8 100644
--- a/test/OpenMP/parallel_for_ast_print.cpp
+++ b/test/OpenMP/parallel_for_ast_print.cpp
@@ -8,6 +8,57 @@
 
 void foo() {}
 
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp parallel for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp parallel for private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp parallel for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp parallel for private(this->a) private(this->a)
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, h;
diff --git a/test/OpenMP/parallel_for_collapse_messages.cpp b/test/OpenMP/parallel_for_collapse_messages.cpp
index 6e5f71f..4461df8 100644
--- a/test/OpenMP/parallel_for_collapse_messages.cpp
+++ b/test/OpenMP/parallel_for_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp parallel for', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp parallel for' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for', but found only 1}}
   #pragma omp parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for', but found only 1}}
-  #pragma omp parallel for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp parallel for collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for' must be a for loop}}
diff --git a/test/OpenMP/parallel_for_linear_messages.cpp b/test/OpenMP/parallel_for_linear_messages.cpp
index 7272aad..e5f5b61 100644
--- a/test/OpenMP/parallel_for_linear_messages.cpp
+++ b/test/OpenMP/parallel_for_linear_messages.cpp
@@ -263,7 +263,7 @@
   for (int k = 0; k < argc; ++k)
     ++k;
 
-  foomain<int, char>(argc, argv);
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/parallel_for_loop_messages.cpp b/test/OpenMP/parallel_for_loop_messages.cpp
index 2bb32bd..7e136e7 100644
--- a/test/OpenMP/parallel_for_loop_messages.cpp
+++ b/test/OpenMP/parallel_for_loop_messages.cpp
@@ -354,12 +354,12 @@
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -402,7 +402,7 @@
 #pragma omp parallel for
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel for
   for (begin = begin0; begin < end; ++begin)
diff --git a/test/OpenMP/parallel_for_ordered_messages.cpp b/test/OpenMP/parallel_for_ordered_messages.cpp
index 3729eb9..055fe1b 100644
--- a/test/OpenMP/parallel_for_ordered_messages.cpp
+++ b/test/OpenMP/parallel_for_ordered_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -36,16 +41,23 @@
 #pragma omp parallel for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp parallel for', but found only 1}}
-// expected-error@+3 2 {{directive '#pragma omp parallel for' cannot contain more than one 'ordered' clause}}
-// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+// expected-error@+6 2 {{directive '#pragma omp parallel for' cannot contain more than one 'ordered' clause}}
+// expected-error@+5 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 #pragma omp parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
 #pragma omp parallel for ordered(S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
@@ -76,10 +88,17 @@
 #pragma omp parallel for ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp parallel for' are ignored}}  expected-note {{as specified in 'ordered' clause}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];            // expected-error {{expected 4 for loops after '#pragma omp parallel for', but found only 1}}
-#pragma omp parallel for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+// expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp parallel for ordered(foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 // expected-error@+2 2 {{directive '#pragma omp parallel for' cannot contain more than one 'ordered' clause}}
 // expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
 #pragma omp parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
@@ -88,7 +107,11 @@
 #pragma omp parallel for ordered(S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
diff --git a/test/OpenMP/parallel_for_private_messages.cpp b/test/OpenMP/parallel_for_private_messages.cpp
index efc827b..cc1b79f 100644
--- a/test/OpenMP/parallel_for_private_messages.cpp
+++ b/test/OpenMP/parallel_for_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) { // Public constructor; its loop names the member both as 'a' and as 'this->a' in 'private' clauses.
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a; // Loop body touches the member through 'this', the same spelling used in the second clause.
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) { // Adds 'private(s.a)', a member of another object; the pragma line below carries the error annotation for it.
+#pragma omp parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a; // Only the argument's member is modified; *this is returned unchanged.
+    return *this;
+  }
+};
+
+template <typename T> // Class-template fixture for 'private' clauses that name data members.
+class S6 {
+public:
+  T a; // Member named in the clauses below, spelled both 'a' and 'this->a'.
+
+  S6() : a(0) {}
+  S6(T v) : a(v) { // Both clauses in this constructor name this object's own member.
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) { // Adds 's.a', a member of a different object; that pragma line carries the error annotation.
+#pragma omp parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this; // Nothing is copied from 's'; the operator exists only to host the pragma above.
+  }
+};
+
+template <typename T> // Fixture deriving from its own template argument, so 'T::a' names a base-class member.
+class S7 : public T {
+  T a; // Separate member of type T; 'a.a' below reaches the 'a' inside it.
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) { // Clauses name the own member twice plus the inherited 'T::a'.
+#pragma omp parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // 's.a' and 's.T::a' belong to another object; the pragma's annotation counts two errors.
+#pragma omp parallel for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this; // Nothing is copied from 's'; the operator exists only to host the pragma above.
+  }
 };
 
 S3 h;
@@ -119,6 +167,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp parallel for private // expected-error {{expected '(' after 'private'}}
@@ -180,6 +230,8 @@
   for (int k = 0; k < argc; ++k)
     m = k + 2;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/parallel_for_reduction_messages.cpp b/test/OpenMP/parallel_for_reduction_messages.cpp
index 22251b4..4d5a143 100644
--- a/test/OpenMP/parallel_for_reduction_messages.cpp
+++ b/test/OpenMP/parallel_for_reduction_messages.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -111,7 +111,7 @@
 #pragma omp parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(&& : argc)
@@ -120,22 +120,22 @@
 #pragma omp parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -150,7 +150,7 @@
 #pragma omp parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -160,7 +160,7 @@
 #pragma omp parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -251,13 +251,13 @@
 #pragma omp parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
diff --git a/test/OpenMP/parallel_for_simd_aligned_messages.cpp b/test/OpenMP/parallel_for_simd_aligned_messages.cpp
index 8bffd21..2ccdf06 100644
--- a/test/OpenMP/parallel_for_simd_aligned_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_aligned_messages.cpp
@@ -196,6 +196,7 @@
   #pragma omp parallel for simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/parallel_for_simd_ast_print.cpp b/test/OpenMP/parallel_for_simd_ast_print.cpp
index 1b9415d..cdd1b73 100644
--- a/test/OpenMP/parallel_for_simd_ast_print.cpp
+++ b/test/OpenMP/parallel_for_simd_ast_print.cpp
@@ -7,6 +7,58 @@
 #define HEADER
 
 void foo() {}
+
+struct S1 { // Plain aggregate used as the template argument of S7 further down this file.
+  S1() : a(0) {}
+  S1(int v) : a(v) {}
+  int a; // Reached as 'T::a' / 'a.a' from S7 and as 'S7<S1>::a' from S8.
+  typedef int type; // Supplies 'typename T::type', the parameter type of S7's public constructor.
+};
+
+template <typename T> // AST-print fixture: member names inside 'private' clauses of a class template.
+class S7 : public T {
+protected:
+  T a; // Protected so that the derived class S8 can name it in its own clauses.
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) { // The pattern lines after this class spell both 'a' and 'this->a' as 'this->a'.
+#pragma omp parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // Same clauses minus 'T::a'; the loop bound and body use the argument's member.
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a)
+
+class S8 : public S7<S1> { // Non-template counterpart: names the member inherited from the S7<S1> base.
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){ // Third clause qualifies the member with the base class; the pattern line below prints it as 'this->S7<S1>::a'.
+#pragma omp parallel for simd private(a) private(this->a) private(S7<S1>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) { // Only the unqualified and 'this->' spellings appear here.
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a)
+
 int g_ind = 1;
 template<class T, class N> T reduct(T* arr, N num) {
   N i;
diff --git a/test/OpenMP/parallel_for_simd_codegen.cpp b/test/OpenMP/parallel_for_simd_codegen.cpp
index 9a6ad2f..3498916 100644
--- a/test/OpenMP/parallel_for_simd_codegen.cpp
+++ b/test/OpenMP/parallel_for_simd_codegen.cpp
@@ -30,12 +30,12 @@
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[UB_VAL]], 5
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i32 [ 5, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i32 [[UP]], i32* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i32, i32* [[LB]],
@@ -46,7 +46,7 @@
 // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP]], label %[[SIMPLE_LOOP1_BODY:.+]], label %[[SIMPLE_LOOP1_END:[^,]+]]
   for (int i = 3; i < 32; i += 5) {
-// CHECK: [[SIMPLE_LOOP1_BODY]]
+// CHECK: [[SIMPLE_LOOP1_BODY]]:
 // Start of body: calculate i from IV:
 // CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
 // CHECK: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 5
@@ -61,7 +61,7 @@
 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
 // br label %{{.+}}, !llvm.loop !{{.+}}
   }
-// CHECK: [[SIMPLE_LOOP1_END]]
+// CHECK: [[SIMPLE_LOOP1_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 
   long long k = get_val();
@@ -74,7 +74,7 @@
 // CHECK: [[NEXT:%.+]] = call i32 @__kmpc_dispatch_next_4(%ident_t* {{.+}}, i32 %{{.+}}, i32* %{{.+}}, i32* [[LB:%.+]], i32* [[UB:%.+]], i32* %{{.+}})
 // CHECK: [[COND:%.+]] = icmp ne i32 [[NEXT]], 0
 // CHECK: br i1 [[COND]], label %[[CONT:.+]], label %[[END:.+]]
-// CHECK: [[CONT]]
+// CHECK: [[CONT]]:
 // CHECK: [[LB_VAL:%.+]] = load i32, i32* [[LB]],
 // CHECK: store i32 [[LB_VAL]], i32* [[OMP_IV2:%[^,]+]],
 
@@ -83,7 +83,7 @@
 // CHECK-NEXT: [[CMP2:%.+]] = icmp sle i32 [[IV2]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP2]], label %[[SIMPLE_LOOP2_BODY:.+]], label %[[SIMPLE_LOOP2_END:[^,]+]]
   for (int i = 10; i > 1; i--) {
-// CHECK: [[SIMPLE_LOOP2_BODY]]
+// CHECK: [[SIMPLE_LOOP2_BODY]]:
 // Start of body: calculate i from IV:
 // CHECK: [[IV2_0:%.+]] = load i32, i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
 // FIXME: It is interesting, why the following "mul 1" was not constant folded?
@@ -105,7 +105,7 @@
 // CHECK-NEXT: store i32 [[ADD2_2]], i32* [[OMP_IV2]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP2_ID]]
 // br label {{.+}}, !llvm.loop ![[SIMPLE_LOOP2_ID]]
   }
-// CHECK: [[SIMPLE_LOOP2_END]]
+// CHECK: [[SIMPLE_LOOP2_END]]:
 //
 // Update linear vars after loop, as the loop was operating on a private version.
 // CHECK: [[LIN0_2:%.+]] = load i64, i64* [[LIN0]]
@@ -130,12 +130,12 @@
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp ugt i64 [[UB_VAL]], 3
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i64 [ 3, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i64 [[UP]], i64* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
@@ -146,7 +146,7 @@
 // CHECK-NEXT: [[CMP3:%.+]] = icmp ule i64 [[IV3]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP3]], label %[[SIMPLE_LOOP3_BODY:.+]], label %[[SIMPLE_LOOP3_END:[^,]+]]
   for (unsigned long long it = 2000; it >= 600; it-=400) {
-// CHECK: [[SIMPLE_LOOP3_BODY]]
+// CHECK: [[SIMPLE_LOOP3_BODY]]:
 // Start of body: calculate it from IV:
 // CHECK: [[IV3_0:%.+]] = load i64, i64* [[OMP_IV3]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul i64 [[IV3_0]], 400
@@ -172,7 +172,7 @@
 // CHECK-NEXT: [[ADD3_2:%.+]] = add i64 [[IV3_2]], 1
 // CHECK-NEXT: store i64 [[ADD3_2]], i64* [[OMP_IV3]]
   }
-// CHECK: [[SIMPLE_LOOP3_END]]
+// CHECK: [[SIMPLE_LOOP3_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 //
 // Linear start and step are used to calculate final value of the linear variables.
@@ -187,12 +187,12 @@
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[UB_VAL]], 3
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i32 [ 3, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i32 [[UP]], i32* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i32, i32* [[LB]],
@@ -203,7 +203,7 @@
 // CHECK-NEXT: [[CMP4:%.+]] = icmp sle i32 [[IV4]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP4]], label %[[SIMPLE_LOOP4_BODY:.+]], label %[[SIMPLE_LOOP4_END:[^,]+]]
   for (short it = 6; it <= 20; it-=-4) {
-// CHECK: [[SIMPLE_LOOP4_BODY]]
+// CHECK: [[SIMPLE_LOOP4_BODY]]:
 // Start of body: calculate it from IV:
 // CHECK: [[IV4_0:%.+]] = load i32, i32* [[OMP_IV4]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul nsw i32 [[IV4_0]], 4
@@ -215,7 +215,7 @@
 // CHECK-NEXT: [[ADD4_2:%.+]] = add nsw i32 [[IV4_2]], 1
 // CHECK-NEXT: store i32 [[ADD4_2]], i32* [[OMP_IV4]]
   }
-// CHECK: [[SIMPLE_LOOP4_END]]
+// CHECK: [[SIMPLE_LOOP4_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 
   #pragma omp parallel for simd
@@ -223,12 +223,12 @@
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[UB_VAL]], 25
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i32 [ 25, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i32 [[UP]], i32* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i32, i32* [[LB]],
@@ -239,7 +239,7 @@
 // CHECK-NEXT: [[CMP5:%.+]] = icmp sle i32 [[IV5]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP5]], label %[[SIMPLE_LOOP5_BODY:.+]], label %[[SIMPLE_LOOP5_END:[^,]+]]
   for (unsigned char it = 'z'; it >= 'a'; it+=-1) {
-// CHECK: [[SIMPLE_LOOP5_BODY]]
+// CHECK: [[SIMPLE_LOOP5_BODY]]:
 // Start of body: calculate it from IV:
 // CHECK: [[IV5_0:%.+]] = load i32, i32* [[OMP_IV5]]
 // CHECK-NEXT: [[IV5_1:%.+]] = mul nsw i32 [[IV5_0]], 1
@@ -251,7 +251,7 @@
 // CHECK-NEXT: [[ADD5_2:%.+]] = add nsw i32 [[IV5_2]], 1
 // CHECK-NEXT: store i32 [[ADD5_2]], i32* [[OMP_IV5]]
   }
-// CHECK: [[SIMPLE_LOOP5_END]]
+// CHECK: [[SIMPLE_LOOP5_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 
 // CHECK-NOT: mul i32 %{{.+}}, 10
@@ -267,25 +267,25 @@
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 6
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i64 [ 6, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i64 [[UP]], i64* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
 // CHECK: store i64 [[LB_VAL]], i64* [[OMP_IV7:%[^,]+]],
 
 // CHECK: br label %[[SIMD_LOOP7_COND:[^,]+]]
-// CHECK: [[SIMD_LOOP7_COND]]
+// CHECK: [[SIMD_LOOP7_COND]]:
 // CHECK-NEXT: [[IV7:%.+]] = load i64, i64* [[OMP_IV7]]
 // CHECK-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]]
 // CHECK-NEXT: [[CMP7:%.+]] = icmp sle i64 [[IV7]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP7]], label %[[SIMPLE_LOOP7_BODY:.+]], label %[[SIMPLE_LOOP7_END:[^,]+]]
   for (long long i = -10; i < 10; i += 3) {
-// CHECK: [[SIMPLE_LOOP7_BODY]]
+// CHECK: [[SIMPLE_LOOP7_BODY]]:
 // Start of body: calculate i from IV:
 // CHECK: [[IV7_0:%.+]] = load i64, i64* [[OMP_IV7]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul nsw i64 [[IV7_0]], 3
@@ -299,7 +299,7 @@
 // CHECK-NEXT: [[ADD7_2:%.+]] = add nsw i64 [[IV7_2]], 1
 // CHECK-NEXT: store i64 [[ADD7_2]], i64* [[OMP_IV7]]
   }
-// CHECK: [[SIMPLE_LOOP7_END]]
+// CHECK: [[SIMPLE_LOOP7_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 // CHECK: load i32, i32*
 // CHECK: icmp ne i32 %{{.+}}, 0
@@ -317,25 +317,25 @@
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 6
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i64 [ 6, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i64 [[UP]], i64* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
 // CHECK: store i64 [[LB_VAL]], i64* [[OMP_IV8:%[^,]+]],
 
 // CHECK: br label %[[SIMD_LOOP8_COND:[^,]+]]
-// CHECK: [[SIMD_LOOP8_COND]]
+// CHECK: [[SIMD_LOOP8_COND]]:
 // CHECK-NEXT: [[IV8:%.+]] = load i64, i64* [[OMP_IV8]]
 // CHECK-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]]
 // CHECK-NEXT: [[CMP8:%.+]] = icmp sle i64 [[IV8]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP8]], label %[[SIMPLE_LOOP8_BODY:.+]], label %[[SIMPLE_LOOP8_END:[^,]+]]
   for (long long i = -10; i < 10; i += 3) {
-// CHECK: [[SIMPLE_LOOP8_BODY]]
+// CHECK: [[SIMPLE_LOOP8_BODY]]:
 // Start of body: calculate i from IV:
 // CHECK: [[IV8_0:%.+]] = load i64, i64* [[OMP_IV8]]
 // CHECK-NEXT: [[LC_IT_1:%.+]] = mul nsw i64 [[IV8_0]], 3
@@ -348,7 +348,7 @@
 // CHECK-NEXT: [[ADD8_2:%.+]] = add nsw i64 [[IV8_2]], 1
 // CHECK-NEXT: store i64 [[ADD8_2]], i64* [[OMP_IV8]]
   }
-// CHECK: [[SIMPLE_LOOP8_END]]
+// CHECK: [[SIMPLE_LOOP8_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 // CHECK: call i32 @__kmpc_reduce_nowait(
 // CHECK: [[R_PRIV_VAL:%.+]] = load i32, i32* [[R_PRIV]],
@@ -426,13 +426,13 @@
 // CHECK-DAG: [[OMP_LAST_IT_VAL:%.+]] = load i32, i32* [[OMP_LAST_IT]],
 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[UB_VAL]], [[OMP_LAST_IT_VAL]]
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: [[OMP_LAST_IT_VAL:%.+]] = load i32, i32* [[OMP_LAST_IT]],
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i32 [ [[OMP_LAST_IT_VAL]], %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i32 [[UP]], i32* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i32, i32* [[LB]],
@@ -443,7 +443,7 @@
 // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP]], label %[[IT_BODY:[^,]+]], label %[[IT_END:[^,]+]]
   for (IterDouble i = ia; i < ib; ++i) {
-// CHECK: [[IT_BODY]]
+// CHECK: [[IT_BODY]]:
 // Start of body: calculate i from index:
 // CHECK: [[IV1:%.+]] = load i32, i32* [[IT_OMP_IV]]
 // Call of operator+ (i, IV).
@@ -461,7 +461,7 @@
 // CHECK-NEXT: store i32 [[ADD2]], i32* [[IT_OMP_IV]]
 // br label %{{.*}}, !llvm.loop ![[ITER_LOOP_ID]]
   }
-// CHECK: [[IT_END]]
+// CHECK: [[IT_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 // CHECK: ret void
 }
@@ -477,12 +477,12 @@
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp ugt i32 [[UB_VAL]], 119
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i32, i32* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i32 [ 119, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i32 [[UP]], i32* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i32, i32* [[LB]],
@@ -499,7 +499,7 @@
       for (int k = 3; k <= 6; k++) // 4 iterations
         for (l = 4; l < 9; ++l) // 5 iterations
         {
-// CHECK: [[COLL1_BODY]]
+// CHECK: [[COLL1_BODY]]:
 // Start of body: calculate i from index:
 // CHECK: [[IV1:%.+]] = load i32, i32* [[OMP_IV]]
 // Calculation of the loop counters values.
@@ -534,7 +534,7 @@
 // CHECK-NEXT: [[ADD2:%.+]] = add i32 [[IV2]], 1
 // CHECK-NEXT: store i32 [[ADD2]], i32* [[OMP_IV]]
 // br label %{{[^,]+}}, !llvm.loop ![[COLL1_LOOP_ID]]
-// CHECK: [[COLL1_END]]
+// CHECK: [[COLL1_END]]:
   }
 // i,j,l are updated; k is not updated.
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
@@ -563,13 +563,13 @@
 // CHECK-DAG: [[OMP_LAST_IT_VAL:%.+]] = load i64, i64* [[OMP_LAST_IT]],
 // CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], [[OMP_LAST_IT_VAL]]
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: [[OMP_LAST_IT_VAL:%.+]] = load i64, i64* [[OMP_LAST_IT]],
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i64 [ [[OMP_LAST_IT_VAL]], %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i64 [[UP]], i64* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
@@ -584,7 +584,7 @@
   for (i = 1; i < 3; i++) // 2 iterations
     for (j = 0; j < foo(); j++) // foo() iterations
   {
-// CHECK: [[WIDE1_BODY]]
+// CHECK: [[WIDE1_BODY]]:
 // Start of body: calculate i from index:
 // CHECK: [[IV1:%.+]] = load i64, i64* [[OMP_IV]]
 // Calculation of the loop counters values...
@@ -608,7 +608,7 @@
 // CHECK-NEXT: store i64 [[ADD2]], i64* [[OMP_IV]]
 //
 // br label %{{[^,]+}}, !llvm.loop ![[WIDE1_LOOP_ID]]
-// CHECK: [[WIDE1_END]]
+// CHECK: [[WIDE1_END]]:
   }
 // i,j are updated.
 // CHECK: store i32 3, i32* [[I:%[^,]+]]
@@ -624,12 +624,12 @@
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 15
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
-// CHECK: [[TRUE]]
+// CHECK: [[TRUE]]:
 // CHECK: br label %[[SWITCH:[^,]+]]
-// CHECK: [[FALSE]]
+// CHECK: [[FALSE]]:
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: br label %[[SWITCH]]
-// CHECK: [[SWITCH]]
+// CHECK: [[SWITCH]]:
 // CHECK: [[UP:%.+]] = phi i64 [ 15, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ]
 // CHECK: store i64 [[UP]], i64* [[UB]],
 // CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
@@ -640,7 +640,7 @@
 // CHECK-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]]
 // CHECK-NEXT: [[CMP1:%.+]] = icmp sle i64 [[IV]], [[UB_VAL]]
 // CHECK-NEXT: br i1 [[CMP1]], label %[[T1_BODY:.+]], label %[[T1_END:[^,]+]]
-// CHECK: [[T1_BODY]]
+// CHECK: [[T1_BODY]]:
 // Loop counters i and j updates:
 // CHECK: [[IV1:%.+]] = load i64, i64* [[T1_OMP_IV]]
 // CHECK-NEXT: [[I_1:%.+]] = sdiv i64 [[IV1]], 4
@@ -658,7 +658,7 @@
 // CHECK-NEXT: [[INC:%.+]] = add nsw i64 [[IV3]], 1
 // CHECK-NEXT: store i64 [[INC]], i64*
 // CHECK-NEXT: br label {{%.+}}
-// CHECK: [[T1_END]]
+// CHECK: [[T1_END]]:
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
 // CHECK: ret void
 //
diff --git a/test/OpenMP/parallel_for_simd_collapse_messages.cpp b/test/OpenMP/parallel_for_simd_collapse_messages.cpp
index 4f04cca..c7effbd 100644
--- a/test/OpenMP/parallel_for_simd_collapse_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp parallel for simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for simd', but found only 1}}
   #pragma omp parallel for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for simd', but found only 1}}
-  #pragma omp parallel for simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for simd' must be a for loop}}
diff --git a/test/OpenMP/parallel_for_simd_linear_messages.cpp b/test/OpenMP/parallel_for_simd_linear_messages.cpp
index 858f53f..fc1895a 100644
--- a/test/OpenMP/parallel_for_simd_linear_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_linear_messages.cpp
@@ -208,7 +208,7 @@
   #pragma omp parallel for simd linear(i)
   for (int k = 0; k < argc; ++k) ++k;
 
-  foomain<int,char>(argc,argv);
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/parallel_for_simd_loop_messages.cpp b/test/OpenMP/parallel_for_simd_loop_messages.cpp
index e5fd8c0..403e951 100644
--- a/test/OpenMP/parallel_for_simd_loop_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_loop_messages.cpp
@@ -355,12 +355,12 @@
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -403,7 +403,7 @@
 #pragma omp parallel for simd
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel for simd
   for (begin = begin0; begin < end; ++begin)
diff --git a/test/OpenMP/parallel_for_simd_private_messages.cpp b/test/OpenMP/parallel_for_simd_private_messages.cpp
index a031d40..a33b35d 100644
--- a/test/OpenMP/parallel_for_simd_private_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -119,6 +167,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp parallel for simd private // expected-error {{expected '(' after 'private'}}
@@ -180,6 +230,8 @@
   for (int k = 0; k < argc; ++k)
     m = k + 3;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/parallel_for_simd_reduction_messages.cpp b/test/OpenMP/parallel_for_simd_reduction_messages.cpp
index e2e9e1b..afb0b36 100644
--- a/test/OpenMP/parallel_for_simd_reduction_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_reduction_messages.cpp
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -111,7 +111,7 @@
 #pragma omp parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(&& : argc)
@@ -120,22 +120,22 @@
 #pragma omp parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -150,7 +150,7 @@
 #pragma omp parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -160,7 +160,7 @@
 #pragma omp parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -251,13 +251,13 @@
 #pragma omp parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
diff --git a/test/OpenMP/parallel_for_simd_safelen_messages.cpp b/test/OpenMP/parallel_for_simd_safelen_messages.cpp
index 45f2fa2..3e643c6 100644
--- a/test/OpenMP/parallel_for_simd_safelen_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd safelen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for simd' must be a for loop}}
diff --git a/test/OpenMP/parallel_for_simd_simdlen_messages.cpp b/test/OpenMP/parallel_for_simd_simdlen_messages.cpp
index dd1cf0f..fa9e0d6 100644
--- a/test/OpenMP/parallel_for_simd_simdlen_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd simdlen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp parallel for simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for simd' must be a for loop}}
diff --git a/test/OpenMP/parallel_messages.cpp b/test/OpenMP/parallel_messages.cpp
index 8aee841..4db55a0 100644
--- a/test/OpenMP/parallel_messages.cpp
+++ b/test/OpenMP/parallel_messages.cpp
@@ -5,7 +5,12 @@
 
 #pragma omp parallel // expected-error {{unexpected OpenMP directive '#pragma omp parallel'}}
 
+struct S;
+S& bar();
 int main(int argc, char **argv) {
+  S &s = bar();
+  #pragma omp parallel
+  (void)&s;
   #pragma omp parallel { // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}}
   foo();
   #pragma omp parallel ( // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}}
diff --git a/test/OpenMP/parallel_num_threads_codegen.cpp b/test/OpenMP/parallel_num_threads_codegen.cpp
index d744e5e..c5f11bd 100644
--- a/test/OpenMP/parallel_num_threads_codegen.cpp
+++ b/test/OpenMP/parallel_num_threads_codegen.cpp
@@ -72,11 +72,11 @@
 // CHECK:       [[GTID:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEF_LOC_2]])
 // CHECK:       call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 1)
 // CHECK:       call {{.*}}void {{.*}} @__kmpc_fork_call(
-// CHECK:       call {{.*}} [[S_TY_CONSTR]]([[S_TY]]* [[S_TEMP:%.+]], [[INTPTR_T_TY]] [[INTPTR_T_TY_ATTR]]23)
+// CHECK:       {{(invoke|call)}} {{.*}} [[S_TY_CONSTR]]([[S_TY]]* [[S_TEMP:%.+]], [[INTPTR_T_TY]] [[INTPTR_T_TY_ATTR]]23)
 // CHECK:       [[S_CHAR_OP:%.+]] = invoke{{.*}} i8 [[S_TY_CHAR_OP]]([[S_TY]]* [[S_TEMP]])
 // CHECK:       [[RES:%.+]] = sext {{.*}}i8 [[S_CHAR_OP]] to i32
 // CHECK:       call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 [[RES]])
-// CHECK:       call {{.*}} [[S_TY_DESTR]]([[S_TY]]* [[S_TEMP]])
+// CHECK:       {{(invoke|call)}} {{.*}} [[S_TY_DESTR]]([[S_TY]]* [[S_TEMP]])
 // CHECK:       call {{.*}}void {{.*}} @__kmpc_fork_call(
 // CHECK:       ret [[INT_TY]] 0
 // CHECK:       }
diff --git a/test/OpenMP/parallel_private_codegen.cpp b/test/OpenMP/parallel_private_codegen.cpp
index 1d195be..1498d45 100644
--- a/test/OpenMP/parallel_private_codegen.cpp
+++ b/test/OpenMP/parallel_private_codegen.cpp
@@ -18,11 +18,69 @@
 
 volatile int g __attribute__((aligned(128))) = 1212;
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel private(a, b, c)
+#ifdef LAMBDA
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel private(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ++a;
+      --this->b;
+      (this)->c /= 1;
+#pragma omp parallel private(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel private(a)
+#ifdef LAMBDA
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel private(a)
+        ++(this)->a;
+      }();
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ^{
+        ++a;
+#pragma omp parallel private(a)
+        ++(this)->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[SST_TY:%.+]] = type { i{{[0-9]+}} }
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T();
   T vec[] __attribute__((aligned(128))) = {1, 2};
   S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
@@ -37,16 +95,49 @@
 
 int main() {
   static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA-NOT: = getelementptr inbounds %{{.+}},
   // LAMBDA: call{{.*}} void {{.+}} @__kmpc_fork_call({{.+}}, i32 0, {{.+}}* [[OMP_REGION:@.+]] to {{.+}})
 #pragma omp parallel private(g, sivar)
   {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: store i8
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
     // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
@@ -80,6 +171,7 @@
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call{{.*}} void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -116,6 +208,35 @@
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: store i8
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+// BLOCKS: call{{.*}} void
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: ret void
 #else
   S<float> test;
   int t_var = 0;
@@ -166,6 +287,31 @@
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: store i8
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
@@ -184,5 +330,20 @@
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
+
+// CHECK: define {{.+}} @{{.+}}([[SST_TY]]* %
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SST_TY]]*)* [[SST_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SST_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SST_TY]]* %{{.+}})
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REF:%.+]],
+// CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REF]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: ret void
+
 #endif
 
diff --git a/test/OpenMP/parallel_reduction_codegen.cpp b/test/OpenMP/parallel_reduction_codegen.cpp
index b9744b6..703750c 100644
--- a/test/OpenMP/parallel_reduction_codegen.cpp
+++ b/test/OpenMP/parallel_reduction_codegen.cpp
@@ -20,6 +20,62 @@
   ~S() {}
 };
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel reduction(+: a, b, c)
+#ifdef LAMBDA
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel reduction(&: a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ++a;
+      --this->b;
+      (this)->c /= 1;
+#pragma omp parallel reduction(-: a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel reduction(*: a)
+#ifdef LAMBDA
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel reduction(&& :a)
+        ++(this)->a;
+      }();
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ^{
+        ++a;
+#pragma omp parallel reduction(|: a)
+        ++(this)->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
@@ -29,6 +85,7 @@
 T tmain() {
   T t;
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T(), t_var1 __attribute__((aligned(128)));
   T vec[] = {1, 2};
   S<T> s_arr[]  = {1, 2};
@@ -41,16 +98,62 @@
   return T();
 }
 
+int sivar;
 int main() {
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G]])
 #pragma omp parallel reduction(+:g)
   {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, i32*, i32*, i32*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: store i8 %{{.+}}, i8* [[B_REF]],
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]*
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}} -1, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}} -1, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA: store i{{[0-9]+}} -1, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: call i32 @__kmpc_reduce_nowait(
+    // LAMBDA: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}})
     // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
 
@@ -100,6 +203,7 @@
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -152,12 +256,54 @@
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, i32*, i32*, i32*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: store i8 %{{.+}}, i8* [[B_REF]],
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+// BLOCKS: call{{.*}} void
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: call i32 @__kmpc_reduce_nowait(
+// BLOCKS: ret void
 #else
   S<float> test;
   float t_var = 0, t_var1;
   int vec[] = {1, 2};
   S<float> s_arr[] = {1, 2};
   S<float> var(3), var1;
+  float _Complex cf;
 #pragma omp parallel reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
   {
     vec[0] = t_var;
@@ -169,6 +315,8 @@
       vec[0] = t_var;
       s_arr[0] = var;
     }
+#pragma omp parallel reduction(+ : cf)
+    ;
   return tmain<int>();
 #endif
 }
@@ -178,6 +326,7 @@
 // CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, float*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*)* [[MAIN_MICROTASK:@.+]] to void
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, float*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, { float, float }*)* [[MAIN_MICROTASK2:@.+]] to void
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
@@ -468,6 +617,43 @@
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// CHECK: store i8 %{{.+}}, i8* [[B_REF]],
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]*
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: call i32 @__kmpc_reduce_nowait(
+// CHECK: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
diff --git a/test/OpenMP/parallel_reduction_messages.cpp b/test/OpenMP/parallel_reduction_messages.cpp
index b29f7c9..af1f5ed 100644
--- a/test/OpenMP/parallel_reduction_messages.cpp
+++ b/test/OpenMP/parallel_reduction_messages.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -101,23 +101,23 @@
   foo();
 #pragma omp parallel reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   foo();
-#pragma omp parallel reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   foo();
 #pragma omp parallel reduction(&& : argc)
   foo();
 #pragma omp parallel reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   foo();
-#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   foo();
-#pragma omp parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
-#pragma omp parallel reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
@@ -127,14 +127,14 @@
   foo();
 #pragma omp parallel reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   foo();
-#pragma omp parallel reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   foo();
 #pragma omp parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
 #pragma omp parallel private(k)
 #pragma omp parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
-#pragma omp parallel reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   foo();
 #pragma omp parallel reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
   foo();
@@ -208,11 +208,11 @@
   foo();
 #pragma omp parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
-#pragma omp parallel reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
diff --git a/test/OpenMP/parallel_sections_ast_print.cpp b/test/OpenMP/parallel_sections_ast_print.cpp
index 9f5c1fa..a66b75e 100644
--- a/test/OpenMP/parallel_sections_ast_print.cpp
+++ b/test/OpenMP/parallel_sections_ast_print.cpp
@@ -141,4 +141,7 @@
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
+template<typename T>
+T S<T>::TS = 0;
+
 #endif
diff --git a/test/OpenMP/parallel_sections_codegen.cpp b/test/OpenMP/parallel_sections_codegen.cpp
index b8c1e39..bc7e198 100644
--- a/test/OpenMP/parallel_sections_codegen.cpp
+++ b/test/OpenMP/parallel_sections_codegen.cpp
@@ -78,15 +78,10 @@
 // CHECK-LABEL: tmain
 // CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK-NOT:   __kmpc_global_thread_num
-// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single(
-// CHECK-NEXT:  [[BOOLRES:%.+]] = icmp ne i32 [[RES]], 0
-// CHECK-NEXT:  br i1 [[BOOLRES]], label %[[THEN:.+]], label %[[END:.+]]
-// CHECK:       [[THEN]]
-// CHECK-NEXT:  invoke void @{{.*}}foo{{.*}}()
+// CHECK:       call void @__kmpc_for_static_init_4(
+// CHECK:       invoke void @{{.*}}foo{{.*}}()
 // CHECK-NEXT:  unwind label %[[TERM_LPAD:.+]]
-// CHECK:       call void @__kmpc_end_single(
-// CHECK-NEXT:  br label %[[END]]
-// CHECK:       [[END]]
+// CHECK:       call void @__kmpc_for_static_fini(
 // CHECK-NEXT:  ret
 // CHECK:       [[TERM_LPAD]]
 // CHECK:       call void @__clang_call_terminate(i8*
diff --git a/test/OpenMP/parallel_sections_private_messages.cpp b/test/OpenMP/parallel_sections_private_messages.cpp
index ac9280e..40b0138 100644
--- a/test/OpenMP/parallel_sections_private_messages.cpp
+++ b/test/OpenMP/parallel_sections_private_messages.cpp
@@ -29,7 +29,13 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp parallel sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
 };
 class S5 {
   int a;
@@ -37,6 +43,60 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp parallel sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp parallel sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp parallel sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp parallel sections private(a) private(this->a) private(T::a)
+    {
+      for (int k = 0; k < a.a; ++k)
+        ++this->a.a;
+    }
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel sections private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a.a; ++k)
+        ++s.a.a;
+    }
+    return *this;
+  }
 };
 
 S3 h;
@@ -134,6 +194,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp parallel sections private // expected-error {{expected '(' after 'private'}}
@@ -212,6 +274,8 @@
     foo();
   }
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/parallel_sections_reduction_messages.cpp b/test/OpenMP/parallel_sections_reduction_messages.cpp
index eff1849..52d4cb9 100644
--- a/test/OpenMP/parallel_sections_reduction_messages.cpp
+++ b/test/OpenMP/parallel_sections_reduction_messages.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -121,7 +121,7 @@
   {
     foo();
   }
-#pragma omp parallel sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   {
     foo();
   }
@@ -133,11 +133,11 @@
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -145,15 +145,15 @@
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -173,7 +173,7 @@
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel sections reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   {
     foo();
   }
@@ -186,7 +186,7 @@
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel sections reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   {
     foo();
   }
@@ -298,15 +298,15 @@
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
diff --git a/test/OpenMP/predefined_macro.c b/test/OpenMP/predefined_macro.c
index 9a961bc..e18c3d2 100644
--- a/test/OpenMP/predefined_macro.c
+++ b/test/OpenMP/predefined_macro.c
@@ -5,7 +5,7 @@
 // -fopenmp option is specified
 #ifndef _OPENMP
 #error "No _OPENMP macro is defined with -fopenmp option"
-#elsif _OPENMP != 201307
+#elif _OPENMP != 201107
 #error "_OPENMP has incorrect value"
 #endif //_OPENMP
 #else
diff --git a/test/OpenMP/sections_codegen.cpp b/test/OpenMP/sections_codegen.cpp
index 44fdefe..291f059 100644
--- a/test/OpenMP/sections_codegen.cpp
+++ b/test/OpenMP/sections_codegen.cpp
@@ -6,7 +6,6 @@
 #ifndef HEADER
 #define HEADER
 // CHECK: [[IMPLICIT_BARRIER_SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
-// CHECK: [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 // CHECK-LABEL: foo
 void foo() {};
 // CHECK-LABEL: bar
@@ -86,17 +85,12 @@
 // CHECK-LABEL: tmain
 // CHECK:       call void {{.*}} @__kmpc_fork_call(
 // CHECK-NOT:   __kmpc_global_thread_num
-// CHECK:       [[RES:%.+]] = call i32 @__kmpc_single(
-// CHECK-NEXT:  [[BOOLRES:%.+]] = icmp ne i32 [[RES]], 0
-// CHECK-NEXT:  br i1 [[BOOLRES]], label %[[THEN:.+]], label %[[END:.+]]
-// CHECK:       [[THEN]]
-// CHECK-NEXT:  invoke void @{{.*}}foo{{.*}}()
+// CHECK:       call void @__kmpc_for_static_init_4(
+// CHECK:       invoke void @{{.*}}foo{{.*}}()
 // CHECK-NEXT:  unwind label %[[TERM_LPAD:.+]]
-// CHECK:       call void @__kmpc_end_single(
-// CHECK-NEXT:  br label %[[END]]
-// CHECK:       [[END]]
-// CHECK-NEXT:  call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_SINGLE_LOC]],
-// CHECK:  ret
+// CHECK:       call void @__kmpc_for_static_fini(
+// CHECK:       call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_SECTIONS_LOC]],
+// CHECK:       ret
 // CHECK:       [[TERM_LPAD]]
 // CHECK:       call void @__clang_call_terminate(i8*
 // CHECK-NEXT:  unreachable
diff --git a/test/OpenMP/sections_firstprivate_codegen.cpp b/test/OpenMP/sections_firstprivate_codegen.cpp
index f673597..51d0c7b 100644
--- a/test/OpenMP/sections_firstprivate_codegen.cpp
+++ b/test/OpenMP/sections_firstprivate_codegen.cpp
@@ -59,7 +59,6 @@
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S<float> var(3);
 // CHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
@@ -94,7 +93,7 @@
     // LAMBDA: [[SIVAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR1_REF]]
     // LAMBDA: store i{{[0-9]+}} [[SIVAR1_VAL]], i{{[0-9]+}}* [[SIVAR1_PRIVATE_ADDR]]
 
-    // LAMBDA: call void @__kmpc_barrier(
+    // LAMBDA-NOT: call void @__kmpc_barrier(
     {
       g = 1;
       sivar = 10;
@@ -154,7 +153,7 @@
 
     // BLOCKS: [[SIVAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR1_REF]],
     // BLOCKS: store i{{[0-9]+}} [[SIVAR1_VAL]], i{{[0-9]+}}* [[SIVAR1_PRIVATE_ADDR]],
-    // BLOCKS: call void @__kmpc_barrier(
+    // BLOCKS-NOT: call void @__kmpc_barrier(
     {
       g = 1;
       sivar = 10;
@@ -202,17 +201,22 @@
 
 // CHECK: define {{.*}}i{{[0-9]+}} @main()
 // CHECK: alloca i{{[0-9]+}},
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
 // CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
 
-// CHECK: call i32 @__kmpc_single(
 // firstprivate t_var(t_var)
 // CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
@@ -235,15 +239,16 @@
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
 // firstprivate isvar
-// CHEC: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR]],
-// CHEC: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIV]],
+// CHECK: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIV]],
+
+// CHECK-NOT: call void @__kmpc_barrier(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
 
 // ~(firstprivate var), ~(firstprivate s_arr)
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
-// CHECK: call void @__kmpc_end_single(
-
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 
@@ -252,7 +257,11 @@
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* %{{.+}},
+// CHECK: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST:%.+]] to i32*
+// CHECK: store i32  [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK: [[T_VARPVT:%.+]] = load i64, i64* [[T_VARCAST]],
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void {{.*}}i64 [[T_VARPVT]],
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
@@ -263,19 +272,20 @@
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
 // CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 
 // firstprivate t_var(t_var)
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}*
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
 
 // firstprivate vec(vec)
@@ -299,10 +309,8 @@
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// Synchronization for initialization.
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// No synchronization for initialization.
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
diff --git a/test/OpenMP/sections_lastprivate_codegen.cpp b/test/OpenMP/sections_lastprivate_codegen.cpp
index a1ff007..6ee9f63 100644
--- a/test/OpenMP/sections_lastprivate_codegen.cpp
+++ b/test/OpenMP/sections_lastprivate_codegen.cpp
@@ -23,7 +23,6 @@
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK [[CAP_MAIN_TY:%.+]] = type { i{{[0-9]+}}*, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}}* }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
-// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 // CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
 // CHECK-DAG: [[X:@.+]] = global double 0.0
 template <typename T>
@@ -234,27 +233,29 @@
 // CHECK: ret
 
 // CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}},
-// CHECK-NOT: alloca i{{[0-9]+}},
-// CHECK-NOT: alloca [2 x i{{[0-9]+}}],
-// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
-// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca [2 x i{{[0-9]+}}],
+// CHECK: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: alloca [[S_FLOAT_TY]],
+// CHECK: alloca i{{[0-9]+}},
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
 
 // CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_REF]]
 // CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call i32 @__kmpc_single(
 
-// CHECK-DAG: getelementptr inbounds [2 x i32], [2 x i32]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK-DAG: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-
+// CHECK: call void @__kmpc_for_static_init_4(
 // <Skip loop body>
+// CHECK: call void @__kmpc_for_static_fini(
 
-// CHECK-NOT: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
-// CHECK-NOT: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 
-// CHECK: call void @__kmpc_end_single(
-
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[SINGLE_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call void @__kmpc_barrier(
 // CHECK: ret void
 
 //
diff --git a/test/OpenMP/sections_private_codegen.cpp b/test/OpenMP/sections_private_codegen.cpp
index cd22188..b812655 100644
--- a/test/OpenMP/sections_private_codegen.cpp
+++ b/test/OpenMP/sections_private_codegen.cpp
@@ -157,6 +157,11 @@
 // CHECK: ret
 //
 // CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
@@ -165,7 +170,6 @@
 // CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK-NOT: alloca [[S_FLOAT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
-// CHECK: call i32 @__kmpc_single(
 // CHECK-NOT: [[T_VAR_PRIV]]
 // CHECK-NOT: [[VEC_PRIV]]
 // CHECK-NOT: [[SIVAR_PRIV]]
@@ -175,9 +179,13 @@
 // CHECK-NOT: [[T_VAR_PRIV]]
 // CHECK-NOT: [[VEC_PRIV]]
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
 // CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
-// CHECK: call void @__kmpc_end_single(
+// CHECK: call void @__kmpc_barrier(
 // CHECK: ret void
 
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
diff --git a/test/OpenMP/sections_private_messages.cpp b/test/OpenMP/sections_private_messages.cpp
index f13bbdb..27bb313 100644
--- a/test/OpenMP/sections_private_messages.cpp
+++ b/test/OpenMP/sections_private_messages.cpp
@@ -29,7 +29,13 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
 };
 class S5 {
   int a;
@@ -37,6 +43,60 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp sections private(a) private(this->a) private(T::a)
+    {
+      for (int k = 0; k < a.a; ++k)
+        ++this->a.a;
+    }
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp sections private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a.a; ++k)
+        ++s.a.a;
+    }
+    return *this;
+  }
 };
 
 S3 h;
@@ -134,6 +194,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp sections private // expected-error {{expected '(' after 'private'}}
@@ -212,6 +274,8 @@
     foo();
   }
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/sections_reduction_codegen.cpp b/test/OpenMP/sections_reduction_codegen.cpp
index f67977c..b52d2ee 100644
--- a/test/OpenMP/sections_reduction_codegen.cpp
+++ b/test/OpenMP/sections_reduction_codegen.cpp
@@ -23,7 +23,6 @@
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
-// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 // CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
 // CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
 
@@ -195,23 +194,23 @@
 // CHECK: ret
 //
 // CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}},
-// CHECK-NOT: alloca float,
-// CHECK-NOT: alloca [[S_FLOAT_TY]],
-// CHECK-NOT: alloca [[S_FLOAT_TY]],
-// CHECK-NOT: alloca float,
+// CHECK: alloca float,
+// CHECK: alloca [[S_FLOAT_TY]],
+// CHECK: alloca [[S_FLOAT_TY]],
+// CHECK: alloca float,
 
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
 // CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call i32 @__kmpc_single(
 
 // CHECK-NOT: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-NOT: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 
-// CHECK: call void @__kmpc_end_single(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
 
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[SINGLE_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK: call void @__kmpc_barrier(
 
 // CHECK: ret void
 
diff --git a/test/OpenMP/sections_reduction_messages.cpp b/test/OpenMP/sections_reduction_messages.cpp
index 79473d4..134bf61 100644
--- a/test/OpenMP/sections_reduction_messages.cpp
+++ b/test/OpenMP/sections_reduction_messages.cpp
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -132,7 +132,7 @@
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   {
     foo();
   }
@@ -147,12 +147,12 @@
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -162,17 +162,17 @@
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -197,7 +197,7 @@
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp sections reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   {
     foo();
   }
@@ -212,7 +212,7 @@
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp sections reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   {
     foo();
   }
@@ -342,17 +342,17 @@
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
diff --git a/test/OpenMP/simd_aligned_messages.cpp b/test/OpenMP/simd_aligned_messages.cpp
index 6be7529..9515a0b 100644
--- a/test/OpenMP/simd_aligned_messages.cpp
+++ b/test/OpenMP/simd_aligned_messages.cpp
@@ -196,6 +196,7 @@
   #pragma omp simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/simd_ast_print.cpp b/test/OpenMP/simd_ast_print.cpp
index cabbe33..99c00c6 100644
--- a/test/OpenMP/simd_ast_print.cpp
+++ b/test/OpenMP/simd_ast_print.cpp
@@ -6,6 +6,58 @@
 #ifndef HEADER
 #define HEADER
 
+struct SS {
+  SS(): a(0) {}
+  SS(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T *a;
+  T b[2];
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type &v) : a((T*)&v) {
+#pragma omp simd aligned(a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp simd aligned(this->b : 8)
+    for (int k = 0; k < s.a->a; ++k)
+      ++s.a->a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp simd aligned(this->a)
+// CHECK: #pragma omp simd aligned(this->a)
+// CHECK: #pragma omp simd aligned(this->b: 8)
+
+class S8 : public S7<SS> {
+  S8() {}
+
+public:
+  S8(int v) : S7<SS>(v){
+#pragma omp simd aligned(S7<SS>::a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp simd aligned(this->b: 4)
+    for (int k = 0; k < s.a->a; ++k)
+      ++s.a->a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp simd aligned(this->S7<SS>::a)
+// CHECK: #pragma omp simd aligned(this->b: 4)
+
 void foo() {}
 int g_ind = 1;
 template<class T, class N> T reduct(T* arr, N num) {
diff --git a/test/OpenMP/simd_codegen.cpp b/test/OpenMP/simd_codegen.cpp
index 6202833..29828b3 100644
--- a/test/OpenMP/simd_codegen.cpp
+++ b/test/OpenMP/simd_codegen.cpp
@@ -4,8 +4,10 @@
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 // REQUIRES: x86-registered-target
 // expected-no-diagnostics
-#ifndef HEADER
-#define HEADER
+ #ifndef HEADER
+ #define HEADER
+
+// CHECK: [[SS_TY:%.+]] = type { i32 }
 
 long long get_val() { return 0; }
 double *g_ptr;
@@ -207,6 +209,7 @@
 // CHECK-NEXT: store i64 [[ADD7_2]], i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
   }
 // CHECK: [[SIMPLE_LOOP7_END]]
+// CHECK-NEXT: store i64 11, i64*
 // CHECK-NEXT: [[A_PRIV_VAL:%.+]] = load i32, i32* [[A_PRIV]],
 // CHECK-NEXT: store i32 [[A_PRIV_VAL]], i32* [[A]],
   int R;
@@ -321,7 +324,6 @@
 // CHECK-LABEL: define {{.*void}} @{{.*}}iter_simple{{.*}}
 void iter_simple(IterDouble ia, IterDouble ib, IterDouble ic) {
 //
-// CHECK: store i32 0, i32* [[IT_OMP_IV:%[^,]+]]
 // Calculate number of iterations before the loop body.
 // CHECK: [[DIFF1:%.+]] = invoke {{.*}}i32 @{{.*}}IterDouble{{.*}}
 // CHECK: [[DIFF2:%.+]] = sub nsw i32 [[DIFF1]], 1
@@ -329,6 +331,7 @@
 // CHECK-NEXT: [[DIFF4:%.+]] = sdiv i32 [[DIFF3]], 1
 // CHECK-NEXT: [[DIFF5:%.+]] = sub nsw i32 [[DIFF4]], 1
 // CHECK-NEXT: store i32 [[DIFF5]], i32* [[OMP_LAST_IT:%[^,]+]]{{.+}}
+// CHECK: store i32 0, i32* [[IT_OMP_IV:%[^,]+]]
   #pragma omp simd
 
 // CHECK: [[IV:%.+]] = load i32, i32* [[IT_OMP_IV]]{{.+}} !llvm.mem.parallel_loop_access ![[ITER_LOOP_ID:[0-9]+]]
@@ -416,9 +419,10 @@
 // CHECK: [[COLL1_END]]
   }
 // i,j,l are updated; k is not updated.
-// CHECK: store i32 3, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i32 5, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i16 9, i16* [[I:%[^,]+]]
+// CHECK: store i32 3, i32*
+// CHECK-NEXT: store i32 5, i32*
+// CHECK-NEXT: store i32 7, i32*
+// CHECK-NEXT: store i16 9, i16*
 // CHECK: ret void
 }
 
@@ -490,8 +494,10 @@
 
   #pragma omp simd linear(k : 3)
 // CHECK: store i64* [[VAL_ADDR]], i64** [[K_ADDR]],
+// CHECK: [[VAL_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: store i64* [[VAL_REF]], i64** [[K_ADDR_REF:%.+]],
 // CHECK: store i32 0, i32* [[OMP_IV:%[^,]+]]
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_REF]]
 // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 
@@ -524,7 +530,7 @@
 // CHECK: [[SIMPLE_LOOP_END]]
 //
 // Update linear vars after loop, as the loop was operating on a private version.
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: store i64* [[K_REF]], i64** [[K_PRIV_REF:%.+]],
 // CHECK: [[LIN0_2:%.+]] = load i64, i64* [[LIN0]]
 // CHECK-NEXT: [[LIN_ADD2:%.+]] = add nsw i64 [[LIN0_2]], 27
@@ -533,8 +539,10 @@
 //
 
   #pragma omp simd linear(val(k) : 3)
+// CHECK: [[VAL_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: store i64* [[VAL_REF]], i64** [[K_ADDR_REF:%.+]],
 // CHECK: store i32 0, i32* [[OMP_IV:%[^,]+]]
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_REF]]
 // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 
@@ -567,7 +575,7 @@
 // CHECK: [[SIMPLE_LOOP_END]]
 //
 // Update linear vars after loop, as the loop was operating on a private version.
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: store i64* [[K_REF]], i64** [[K_PRIV_REF:%.+]],
 // CHECK: [[LIN0_2:%.+]] = load i64, i64* [[LIN0]]
 // CHECK-NEXT: [[LIN_ADD2:%.+]] = add nsw i64 [[LIN0_2]], 27
@@ -632,5 +640,68 @@
     a[i] += bar();
 }
 // TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]],
+
+// CHECK-LABEL: S8
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 15
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 7
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 15
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 3
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+struct SS {
+  SS(): a(0) {}
+  SS(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T *a;
+  T b[2];
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type &v) : a((T*)&v) {
+#pragma omp simd aligned(a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+#pragma omp simd aligned(this->b : 8)
+    for (int k = 0; k < a->a; ++k)
+      ++a->a;
+  }
+};
+
+class S8 : private IterDouble, public S7<SS> {
+  S8() {}
+
+public:
+  S8(int v) : S7<SS>(v){
+#pragma omp parallel private(a)
+#pragma omp simd aligned(S7<SS>::a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+#pragma omp parallel shared(b)
+#pragma omp simd aligned(this->b: 4)
+    for (int k = 0; k < a->a; ++k)
+      ++a->a;
+  }
+};
+S8 s8(0);
+
 #endif // HEADER
 
diff --git a/test/OpenMP/simd_collapse_messages.cpp b/test/OpenMP/simd_collapse_messages.cpp
index e34f0a1..5b88024 100644
--- a/test/OpenMP/simd_collapse_messages.cpp
+++ b/test/OpenMP/simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp simd', but found only 1}}
   #pragma omp simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp simd' are ignored}} expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp simd', but found only 1}}
-  #pragma omp simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
diff --git a/test/OpenMP/simd_lastprivate_messages.cpp b/test/OpenMP/simd_lastprivate_messages.cpp
index 7cc5ba8..16223db 100644
--- a/test/OpenMP/simd_lastprivate_messages.cpp
+++ b/test/OpenMP/simd_lastprivate_messages.cpp
@@ -217,5 +217,5 @@
 #pragma omp simd lastprivate(t) // OK
   for (i = 0; i < argc; ++i)
     foo();
-  return 0;
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
diff --git a/test/OpenMP/simd_private_messages.cpp b/test/OpenMP/simd_private_messages.cpp
index 3442d18..1850101 100644
--- a/test/OpenMP/simd_private_messages.cpp
+++ b/test/OpenMP/simd_private_messages.cpp
@@ -26,13 +26,61 @@
   int a;
   S4(); // expected-note {{implicitly declared private here}}
 public:
-  S4(int v):a(v) { }
+  S4(int v) : a(v) {
+#pragma omp simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
   S5():a(0) {} // expected-note {{implicitly declared private here}}
 public:
   S5(int v):a(v) { }
+  S5 &operator=(S5 &s) {
+#pragma omp simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -96,6 +144,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
   #pragma omp simd private // expected-error {{expected '(' after 'private'}}
@@ -137,6 +187,8 @@
   #pragma omp simd private(i)
   for (int k = 0; k < argc; ++k) ++k;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/simd_reduction_messages.cpp b/test/OpenMP/simd_reduction_messages.cpp
index e082921c..c47d53e 100644
--- a/test/OpenMP/simd_reduction_messages.cpp
+++ b/test/OpenMP/simd_reduction_messages.cpp
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -111,7 +111,7 @@
 #pragma omp simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(&& : argc)
@@ -120,22 +120,22 @@
 #pragma omp simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -150,7 +150,7 @@
 #pragma omp simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -163,7 +163,7 @@
 #pragma omp simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -254,13 +254,13 @@
 #pragma omp simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
diff --git a/test/OpenMP/simd_safelen_messages.cpp b/test/OpenMP/simd_safelen_messages.cpp
index aa31b7d..56cb868 100644
--- a/test/OpenMP/simd_safelen_messages.cpp
+++ b/test/OpenMP/simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd safelen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp simd' must be a for loop}}
diff --git a/test/OpenMP/simd_simdlen_messages.cpp b/test/OpenMP/simd_simdlen_messages.cpp
index 91656f8..426d187 100644
--- a/test/OpenMP/simd_simdlen_messages.cpp
+++ b/test/OpenMP/simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd simdlen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp simd' must be a for loop}}
diff --git a/test/OpenMP/single_ast_print.cpp b/test/OpenMP/single_ast_print.cpp
index 8eb3517..d30b7fe 100644
--- a/test/OpenMP/single_ast_print.cpp
+++ b/test/OpenMP/single_ast_print.cpp
@@ -8,15 +8,43 @@
 
 void foo() {}
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel firstprivate(a, b, c)
+#pragma omp single copyprivate(a, this->b, (this)->c)
+// CHECK: #pragma omp parallel firstprivate(this->a,this->b,this->c)
+// CHECK-NEXT: #pragma omp single copyprivate(this->a,this->b,this->c)
+    ++this->a, --b, (this)->c /= 1;
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+// CHECK: #pragma omp parallel firstprivate(this->a)
+// CHECK-NEXT: #pragma omp single copyprivate(this->a)
+// CHECK: #pragma omp parallel firstprivate(this->a)
+// CHECK-NEXT: #pragma omp single copyprivate(this->a)
+#pragma omp parallel firstprivate(a)
+#pragma omp single copyprivate(this->a)
+    ++this->a;
+  }
+};
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, g;
   static T a;
+  SST<T> sst;
 // CHECK: static T a;
 #pragma omp parallel private(g)
 #pragma omp single private(argc, b), firstprivate(c, d), nowait
   foo();
-  // CHECK-NEXT: #pragma omp parallel private(g)
+  // CHECK: #pragma omp parallel private(g)
   // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(c,d) nowait
   // CHECK-NEXT: foo();
 #pragma omp parallel private(g)
@@ -31,11 +59,12 @@
 int main(int argc, char **argv) {
   int b = argc, c, d, e, f, g;
   static int a;
+  SS ss(a);
 // CHECK: static int a;
 #pragma omp parallel private(g)
 #pragma omp single private(argc, b), firstprivate(argv, c), nowait
   foo();
-  // CHECK-NEXT: #pragma omp parallel private(g)
+  // CHECK: #pragma omp parallel private(g)
   // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(argv,c) nowait
   // CHECK-NEXT: foo();
 #pragma omp parallel private(g)
diff --git a/test/OpenMP/single_codegen.cpp b/test/OpenMP/single_codegen.cpp
index 61a93a5..a2140c2 100644
--- a/test/OpenMP/single_codegen.cpp
+++ b/test/OpenMP/single_codegen.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fnoopenmp-use-tls -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -std=c++11 -fopenmp -fnoopenmp-use-tls -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 // RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
 // expected-no-diagnostics
 // REQUIRES: x86-registered-target
@@ -19,7 +19,9 @@
 };
 
 // CHECK-DAG:   [[TEST_CLASS_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK:       [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG:   [[SST_TY:%.+]] = type { double }
+// CHECK-DAG:   [[SS_TY:%.+]] = type { i32, i8, i32* }
+// CHECK-DAG:   [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK:       [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 
 // CHECK:       define void [[FOO:@.+]]()
@@ -30,6 +32,39 @@
 
 void foo() {}
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel firstprivate(a, b, c)
+#pragma omp single copyprivate(a, this->b, (this)->c)
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel firstprivate(a, b, c)
+#pragma omp single copyprivate(a, this->b, (this)->c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel firstprivate(a)
+#pragma omp single copyprivate(this->a)
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel firstprivate(a)
+#pragma omp single copyprivate((this)->a)
+        ++(this)->a;
+      }();
+    }();
+  }
+};
+
 // CHECK-LABEL: @main
 // TERM_DEBUG-LABEL: @main
 int main() {
@@ -39,6 +74,8 @@
   char a;
   char a2[2];
   TestClass &c = tc;
+  SST<double> sst;
+  SS ss(c.a);
 
 // CHECK:       [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
 // CHECK-DAG:   [[DID_IT:%.+]] = alloca i32,
@@ -74,8 +111,8 @@
 // CHECK-NEXT:  invoke void [[FOO]]()
 // CHECK:       to label {{%?}}[[CONT:.+]] unwind
 // CHECK:       [[CONT]]
-// CHECK:       store i32 1, i32* [[DID_IT]]
 // CHECK:       call void @__kmpc_end_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK:       store i32 1, i32* [[DID_IT]]
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
 // CHECK:       [[EXIT]]
 // CHECK:       [[A_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -186,3 +223,210 @@
 // ARRAY: store i32* %{{.+}}, i32** %{{.+}},
 // ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
 #endif
+
+// CHECK-LABEL:@_ZN2SSC2ERi(
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[SS_TY]]*, i64, i64, i64)* [[SS_MICROTASK:@.+]] to void
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[SS_MICROTASK]](i32* {{[^,]+}}, i32* {{[^,]+}}, [[SS_TY]]* {{.+}}, i64 {{.+}}, i64 {{.+}}, i64 {{.+}})
+// Private a
+// CHECK: alloca i64,
+// Private b
+// CHECK: alloca i64,
+// Private c
+// CHECK: alloca i64,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: [[DID_IT:%.+]] = alloca i32,
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: store i32 0, i32* [[DID_IT]],
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: icmp ne i32 [[RES]], 0
+// CHECK-NEXT: br i1
+
+// CHECK: getelementptr inbounds [[CAP_TY:%.+]], [[CAP_TY]]* [[CAP:%.+]], i32 0, i32 0
+// CHECK: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 1
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: store i32* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-NEXT: store i32* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 3
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: store i32* %
+// CHECK-LABEL: invoke void @_ZZN2SSC1ERiENKUlvE_clEv(
+// CHECK-SAME: [[CAP_TY]]* [[CAP]])
+
+// CHECK: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK: store i32 1, i32* [[DID_IT]],
+// CHECK: br label
+
+// CHECK: call void @__kmpc_end_single(%{{.+}}* @{{.+}}, i32 %{{.+}})
+// CHECK: br label
+
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST:%.+]], i64 0, i64 0
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 1
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 2
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK-NEXT: bitcast [3 x i8*]* [[LIST]] to i8*
+// CHECK-NEXT: load i32, i32* [[DID_IT]],
+// CHECK-NEXT: call void @__kmpc_copyprivate([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}}, i64 24, i8* %{{.+}}, void (i8*, i8*)* [[COPY_FUNC:@[^,]+]], i32 %{{.+}})
+// CHECK-NEXT: ret void
+
+// CHECK-LABEL: @_ZZN2SSC1ERiENKUlvE_clEv(
+// CHECK: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP:%.+]], i32 0, i32 1
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, -1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 3
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: sdiv i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 1
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: bitcast i64* %
+// CHECK-NEXT: store i32 %{{.+}}, i32* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: bitcast i64* %
+// CHECK-NEXT: store i32 %{{.+}}, i32* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 3
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: bitcast i64* %
+// CHECK-NEXT: store i32 %{{.+}}, i32* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[SS_TY]]*, i64, i64, i64)* [[SS_MICROTASK1:@.+]] to void
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[COPY_FUNC]](i8*, i8*)
+// CHECK: ret void
+
+// CHECK: define internal void [[SS_MICROTASK1]](i32* {{[^,]+}}, i32* {{[^,]+}}, [[SS_TY]]* {{.+}}, i64 {{.+}}, i64 {{.+}}, i64 {{.+}})
+// Private a
+// CHECK: alloca i64,
+// Private b
+// CHECK: alloca i64,
+// Private c
+// CHECK: alloca i64,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: [[DID_IT:%.+]] = alloca i32,
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: icmp ne i32 [[RES]], 0
+// CHECK-NEXT: br i1
+
+// CHECK-NOT: getelementptr inbounds
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NOT: getelementptr inbounds
+// CHECK: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, -1
+// CHECK-NEXT: store i32 %
+// CHECK-NOT: getelementptr inbounds
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: sdiv i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: store i32 1, i32* [[DID_IT]],
+// CHECK-NEXT: br label
+
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST:%.+]], i64 0, i64 0
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 1
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 2
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK-NEXT: bitcast [3 x i8*]* [[LIST]] to i8*
+// CHECK-NEXT: load i32, i32* [[DID_IT]],
+// CHECK-NEXT: call void @__kmpc_copyprivate([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}}, i64 24, i8* %{{.+}}, void (i8*, i8*)* [[COPY_FUNC:@[^,]+]], i32 %{{.+}})
+// CHECK-NEXT:  ret void
+
+// CHECK: define internal void [[COPY_FUNC]](i8*, i8*)
+// CHECK: ret void
+
+// CHECK-LABEL: @_ZN3SSTIdEC2Ev
+// CHECK: getelementptr inbounds [[SST_TY]], [[SST_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK-NEXT: store double 0.000000e+00, double* %
+// CHECK-NEXT: getelementptr inbounds [[SST_TY]], [[SST_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK-NEXT: store double* %{{.+}}, double** %
+// CHECK-NEXT: load double*, double** %
+// CHECK-NEXT: load double, double* %
+// CHECK-NEXT: bitcast i64* %{{.+}} to double*
+// CHECK-NEXT: store double %{{.+}}, double* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[SST_TY]]*, i64)* [[SST_MICROTASK:@.+]] to void
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[SST_MICROTASK]](i32* {{[^,]+}}, i32* {{[^,]+}}, [[SST_TY]]* {{.+}}, i64 {{.+}})
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: icmp ne i32 [[RES]], 0
+// CHECK-NEXT: br i1
+
+// CHECK: getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 1
+// CHECK-NEXT: load double*, double** %
+// CHECK-NEXT: store double* %
+// CHECK-LABEL: invoke void @_ZZN3SSTIdEC1EvENKUlvE_clEv(
+
+// CHECK: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: store i32 1, i32* [[DID_IT]],
+// CHECK-NEXT: br label
+
+// CHECK: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: br label
+
+// CHECK: getelementptr inbounds [1 x i8*], [1 x i8*]* [[LIST:%.+]], i64 0, i64 0
+// CHECK: load double*, double** %
+// CHECK-NEXT: bitcast double* %
+// CHECK-NEXT: store i8* %
+// CHECK-NEXT: bitcast [1 x i8*]* [[LIST]] to i8*
+// CHECK-NEXT: load i32, i32* [[DID_IT]],
+// CHECK-NEXT: call void @__kmpc_copyprivate([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}}, i64 8, i8* %{{.+}}, void (i8*, i8*)* [[COPY_FUNC:@[^,]+]], i32 %{{.+}})
+// CHECK-NEXT:  ret void
+
+// CHECK-LABEL: @_ZZN3SSTIdEC1EvENKUlvE_clEv(
+// CHECK: getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 1
+// CHECK-NEXT: getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 1
+// CHECK-NEXT: load double*, double** %
+// CHECK-NEXT: store double* %
+// CHECK-LABEL: call void @_ZZZN3SSTIdEC1EvENKUlvE_clEvENKUlvE_clEv(
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[COPY_FUNC]](i8*, i8*)
+// CHECK: ret void
+
+// CHECK-LABEL: @_ZZZN3SSTIdEC1EvENKUlvE_clEvENKUlvE_clEv(
diff --git a/test/OpenMP/single_firstprivate_codegen.cpp b/test/OpenMP/single_firstprivate_codegen.cpp
index cc72add..537ae76 100644
--- a/test/OpenMP/single_firstprivate_codegen.cpp
+++ b/test/OpenMP/single_firstprivate_codegen.cpp
@@ -57,7 +57,6 @@
 S<float> s_arr[] = {1, 2};
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S<float> var(3);
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
@@ -215,7 +214,7 @@
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 // CHECK: call void @__kmpc_end_single(
 
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 
@@ -224,18 +223,24 @@
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR:%.+]],
+// CHECK: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST:%.+]] to i32*
+// CHECK: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK: [[T_VARPVT:%.+]] = load i64, i64* [[T_VARCAST]],
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void {{.*}}i64 [[T_VARPVT:%.+]],
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
-// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 {{.*}}%{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[T_VAR_ARG:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_ARG]] to i32*
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
 // CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
@@ -245,7 +250,7 @@
 // CHECK: call i32 @__kmpc_single(
 
 // firstprivate t_var(t_var)
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}*
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
 
 // firstprivate vec(vec)
diff --git a/test/OpenMP/single_private_messages.cpp b/test/OpenMP/single_private_messages.cpp
index a24cf47..0ed0e6c 100644
--- a/test/OpenMP/single_private_messages.cpp
+++ b/test/OpenMP/single_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp single private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp single private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp single private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp single private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp single private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp single private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -102,6 +150,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp single private // expected-error {{expected '(' after 'private'}}
@@ -146,6 +196,8 @@
 #pragma omp single private(m) // OK
   foo();
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/target_ast_print.cpp b/test/OpenMP/target_ast_print.cpp
index acf032a..e093e29 100644
--- a/test/OpenMP/target_ast_print.cpp
+++ b/test/OpenMP/target_ast_print.cpp
@@ -25,6 +25,12 @@
   foo();
 #pragma omp target map(always,alloc: i)
   foo();
+#pragma omp target nowait
+  foo();
+#pragma omp target depend(in : argc, argv[i:argc], a[:])
+  foo();
+#pragma omp target defaultmap(tofrom: scalar)
+  foo();
   return 0;
 }
 
@@ -44,6 +50,12 @@
 // CHECK-NEXT: foo()
 // CHECK-NEXT: #pragma omp target map(always,alloc: i)
 // CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
 // CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
 // CHECK-NEXT: char i, j, a[20]
 // CHECK-NEXT: #pragma omp target
@@ -60,6 +72,12 @@
 // CHECK-NEXT: foo()
 // CHECK-NEXT: #pragma omp target map(always,alloc: i)
 // CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
 // CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
 // CHECK-NEXT: T i, j, a[20]
 // CHECK-NEXT: #pragma omp target
@@ -76,6 +94,12 @@
 // CHECK-NEXT: foo()
 // CHECK-NEXT: #pragma omp target map(always,alloc: i)
 // CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
 
 // CHECK-LABEL: int main(int argc, char **argv) {
 int main (int argc, char **argv) {
@@ -115,6 +139,21 @@
   foo();
 // CHECK-NEXT: foo();
 
+#pragma omp target nowait
+// CHECK-NEXT: #pragma omp target nowait
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target depend(in : argc, argv[i:argc], a[:])
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+  foo();
+// CHECK-NEXT: foo();
+
   return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
 }
 
diff --git a/test/OpenMP/target_codegen.cpp b/test/OpenMP/target_codegen.cpp
index c2e08d6..f263ebd 100644
--- a/test/OpenMP/target_codegen.cpp
+++ b/test/OpenMP/target_codegen.cpp
@@ -1,20 +1,20 @@
 // Test host codegen.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
 
 // expected-no-diagnostics
 #ifndef HEADER
@@ -33,15 +33,15 @@
 // sizes.
 
 // CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 2]
-// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2]
-// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 128, i32 128]
-// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 128, i32 3, i32 128, i32 3, i32 3, i32 128, i32 128, i32 3, i32 3]
+// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 288, i32 288]
+// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 288, i32 35, i32 288, i32 35, i32 35, i32 288, i32 288, i32 35, i32 35]
 // CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 40]
-// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 128, i32 128, i32 3]
+// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 288, i32 288, i32 35]
 // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40]
-// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 128, i32 128, i32 128, i32 3]
-// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 3, i32 128, i32 128, i32 128, i32 3]
+// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 288, i32 288, i32 288, i32 35]
+// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 35, i32 288, i32 288, i32 288, i32 35]
 // CHECK-DAG: @{{.*}} = private constant i8 0
 // CHECK-DAG: @{{.*}} = private constant i8 0
 // CHECK-DAG: @{{.*}} = private constant i8 0
diff --git a/test/OpenMP/target_codegen_global_capture.cpp b/test/OpenMP/target_codegen_global_capture.cpp
index 211a3cc..b08bf10 100644
--- a/test/OpenMP/target_codegen_global_capture.cpp
+++ b/test/OpenMP/target_codegen_global_capture.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -21,6 +21,11 @@
 // CHECK-DAG: [[BB:@.+]] = internal global float 1.000000e+01
 // CHECK-DAG: [[BC:@.+]] = internal global float 1.100000e+01
 // CHECK-DAG: [[BD:@.+]] = internal global float 1.200000e+01
+// CHECK-DAG: [[TBA:@.+]] = {{.*}}global float 1.700000e+01
+// CHECK-DAG: [[TBB:@.+]] = {{.*}}global float 1.800000e+01
+// CHECK-DAG: [[TBC:@.+]] = {{.*}}global float 1.900000e+01
+// CHECK-DAG: [[TBD:@.+]] = {{.*}}global float 2.000000e+01
+
 double Ga = 1.0;
 double Gb = 2.0;
 double Gc = 3.0;
@@ -42,14 +47,14 @@
   static float Sd = 8.0;
 
   // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LB]],
-  // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* @Gb,
-  // CHECK-DAG:    [[VALFB:%.+]] = load float, float* @_ZZ3foossssE2Sb,
-  // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* @Gc,
+  // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* [[GB]],
+  // CHECK-DAG:    [[VALFB:%.+]] = load float, float* [[FB]],
+  // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* [[GC]],
   // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LC]],
-  // CHECK-DAG:    [[VALFC:%.+]] = load float, float* @_ZZ3foossssE2Sc,
+  // CHECK-DAG:    [[VALFC:%.+]] = load float, float* [[FC]],
   // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LD]],
-  // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* @Gd,
-  // CHECK-DAG:    [[VALFD:%.+]] = load float, float* @_ZZ3foossssE2Sd,
+  // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* [[GD]],
+  // CHECK-DAG:    [[VALFD:%.+]] = load float, float* [[FD]],
 
   // 3 local vars being captured.
 
@@ -178,14 +183,14 @@
   #pragma omp parallel
   {
     // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LLB]],
-    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* @Gb,
-    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* @_ZZ3barssssE2Sb,
-    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* @Gc,
+    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* [[GB]],
+    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* [[BB]],
+    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* [[GC]],
     // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LLC]],
-    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* @_ZZ3barssssE2Sc,
+    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* [[BC]],
     // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LLD]],
-    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* @Gd,
-    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* @_ZZ3barssssE2Sd,
+    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* [[GD]],
+    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* [[BD]],
 
     // 3 local vars being captured.
 
@@ -284,4 +289,150 @@
   return a + b + c + d + (int)Sa + (int)Sb + (int)Sc + (int)Sd;
 }
 
+///
+/// Tests with template functions.
+///
+
+// CHECK: define {{.*}} @{{.*}}tbar2{{.*}}(
+
+// CHECK: define {{.*}} @{{.*}}tbar{{.*}}(
+// CHECK-SAME: i16 {{[^,]*}}[[A:%[^,]+]],
+// CHECK-SAME: i16 {{[^,]*}}[[B:%[^,]+]],
+// CHECK-SAME: i16 {{[^,]*}}[[C:%[^,]+]],
+// CHECK-SAME: i16 {{[^,]*}}[[D:%[^,]+]])
+// CHECK: [[LA:%.+]] = alloca i16
+// CHECK: [[LB:%.+]] = alloca i16
+// CHECK: [[LC:%.+]] = alloca i16
+// CHECK: [[LD:%.+]] = alloca i16
+template<typename T> // Template variant of the capture test fixtures; tbar2() below calls it with short arguments.
+int tbar(T a, T b, T c, T d){
+  static float Sa = 17.0; // The 17.0-20.0 initializers of these function-local statics are what the TBA..TBD global patterns near the top of the test match.
+  static float Sb = 18.0;
+  static float Sc = 19.0;
+  static float Sd = 20.0;
+
+  // CHECK: call void {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}}), i16* %{{.+}}, i16* %{{.+}}, i16* %{{.+}}, i16* %{{.+}})
+  // CHECK: define internal void [[PARF]](i32* noalias %{{.*}}, i32* noalias %{{.*}}, i16* dereferenceable(2) [[A:%.+]], i16* dereferenceable(2) [[B:%.+]], i16* dereferenceable(2) [[C:%.+]], i16* dereferenceable(2) [[D:%.+]])
+  // Capture a, b, c, d
+  // CHECK: [[ALLOCLA:%.+]] = alloca i16
+  // CHECK: [[ALLOCLB:%.+]] = alloca i16
+  // CHECK: [[ALLOCLC:%.+]] = alloca i16
+  // CHECK: [[ALLOCLD:%.+]] = alloca i16
+  // CHECK: [[LLA:%.+]] = load i16*, i16** [[ALLOCLA]],
+  // CHECK: [[LLB:%.+]] = load i16*, i16** [[ALLOCLB]],
+  // CHECK: [[LLC:%.+]] = load i16*, i16** [[ALLOCLC]],
+  // CHECK: [[LLD:%.+]] = load i16*, i16** [[ALLOCLD]],
+  #pragma omp parallel
+  { // Outer parallel region; the target construct under test is nested inside it.
+    // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LLB]],
+    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* [[GB]],
+    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* [[TBB]],
+    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* [[GC]],
+    // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LLC]],
+    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* [[TBC]],
+    // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LLD]],
+    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* [[GD]],
+    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* [[TBD]],
+
+    // 3 local vars being captured.
+
+    // CHECK-DAG: store i16 [[VALLB]], i16* [[CONVLB:%.+]],
+    // CHECK-DAG: [[CONVLB]] = bitcast i[[sz:64|32]]* [[CADDRLB:%.+]] to i16*
+    // CHECK-DAG: [[CVALLB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLB]],
+    // CHECK-DAG: [[CPTRLB:%.+]] = inttoptr i[[sz]] [[CVALLB]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLB]], i8** [[GEPLB:%.+]],
+    // CHECK-DAG: [[GEPLB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store i16 [[VALLC]], i16* [[CONVLC:%.+]],
+    // CHECK-DAG: [[CONVLC]] = bitcast i[[sz]]* [[CADDRLC:%.+]] to i16*
+    // CHECK-DAG: [[CVALLC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLC]],
+    // CHECK-DAG: [[CPTRLC:%.+]] = inttoptr i[[sz]] [[CVALLC]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLC]], i8** [[GEPLC:%.+]],
+    // CHECK-DAG: [[GEPLC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store i16 [[VALLD]], i16* [[CONVLD:%.+]],
+    // CHECK-DAG: [[CONVLD]] = bitcast i[[sz]]* [[CADDRLD:%.+]] to i16*
+    // CHECK-DAG: [[CVALLD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLD]],
+    // CHECK-DAG: [[CPTRLD:%.+]] = inttoptr i[[sz]] [[CVALLD]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLD]], i8** [[GEPLD:%.+]],
+    // CHECK-DAG: [[GEPLD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // 3 static vars being captured.
+
+    // CHECK-DAG: store float [[VALFB]], float* [[CONVFB:%.+]],
+    // CHECK-DAG: [[CONVFB]] = bitcast i[[sz]]* [[CADDRFB:%.+]] to float*
+    // CHECK-DAG: [[CVALFB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFB]],
+    // CHECK-DAG: [[CPTRFB:%.+]] = inttoptr i[[sz]] [[CVALFB]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFB]], i8** [[GEPFB:%.+]],
+    // CHECK-DAG: [[GEPFB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store float [[VALFC]], float* [[CONVFC:%.+]],
+    // CHECK-DAG: [[CONVFC]] = bitcast i[[sz]]* [[CADDRFC:%.+]] to float*
+    // CHECK-DAG: [[CVALFC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFC]],
+    // CHECK-DAG: [[CPTRFC:%.+]] = inttoptr i[[sz]] [[CVALFC]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFC]], i8** [[GEPFC:%.+]],
+    // CHECK-DAG: [[GEPFC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store float [[VALFD]], float* [[CONVFD:%.+]],
+    // CHECK-DAG: [[CONVFD]] = bitcast i[[sz]]* [[CADDRFD:%.+]] to float*
+    // CHECK-DAG: [[CVALFD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFD]],
+    // CHECK-DAG: [[CPTRFD:%.+]] = inttoptr i[[sz]] [[CVALFD]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFD]], i8** [[GEPFD:%.+]],
+    // CHECK-DAG: [[GEPFD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // 3 static global vars being captured.
+
+    // CHECK-64-DAG: store double [[VALGB]], double* [[CONVGB:%.+]],
+    // CHECK-64-DAG: [[CONVGB]] = bitcast i[[sz]]* [[CADDRGB:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGB]],
+    // CHECK-64-DAG: [[CPTRGB:%.+]] = inttoptr i[[sz]] [[CVALGB]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGB]], i8** [[GEPGB:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gb to i8*), i8** [[GEPGB:%.+]],
+    // CHECK-DAG: [[GEPGB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-64-DAG: store double [[VALGC]], double* [[CONVGC:%.+]],
+    // CHECK-64-DAG: [[CONVGC]] = bitcast i[[sz]]* [[CADDRGC:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGC]],
+    // CHECK-64-DAG: [[CPTRGC:%.+]] = inttoptr i[[sz]] [[CVALGC]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGC]], i8** [[GEPGC:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gc to i8*), i8** [[GEPGC:%.+]],
+    // CHECK-DAG: [[GEPGC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-64-DAG: store double [[VALGD]], double* [[CONVGD:%.+]],
+    // CHECK-64-DAG: [[CONVGD]] = bitcast i[[sz]]* [[CADDRGD:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGD]],
+    // CHECK-64-DAG: [[CPTRGD:%.+]] = inttoptr i[[sz]] [[CVALGD]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGD]], i8** [[GEPGD:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gd to i8*), i8** [[GEPGD:%.+]],
+    // CHECK-DAG: [[GEPGD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK: call i32 @__tgt_target
+    // CHECK: call void [[OFFLOADF:@.+]](
+    // Capture b, Gb, Sb, Gc, c, Sc, d, Gd, Sd
+    #pragma omp target if(Ga>0.0 && a>0 && Sa>0.0)
+    { // Target region body; within this construct a, Ga and Sa appear only in the if clause above.
+      b += 1;
+      Gb += 1.0;
+      Sb += 1.0;
+
+      // CHECK: define internal void [[OFFLOADF]]({{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}})
+      // CHECK: call void {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}})
+
+      // CHECK: define internal void [[PARF]](i32* noalias %{{.*}}, i32* noalias %{{.*}}, {{.+}}* dereferenceable({{.+}}) %{{.+}}, {{.+}}* dereferenceable({{.+}}) %{{.+}}, {{.+}}* dereferenceable({{.+}}) %{{.+}})
+      // Capture d, Gd, Sd
+      #pragma omp parallel if(Gc>0.0 && c>0 && Sc>0.0)
+      { // Nested parallel body; c, Gc and Sc are used only by the if clause above, while d, Gd and Sd are modified here.
+        d += 1;
+        Gd += 1.0;
+        Sd += 1.0;
+      }
+    }
+  }
+  return a + b + c + d + (int)Sa + (int)Sb + (int)Sc + (int)Sd; // Sum of the four parameters and the four statics, each static cast to int.
+}
+
+int tbar2(short a, short b, short c, short d){ // Non-template wrapper: calling tbar with short arguments instantiates it with T = short.
+  return tbar(a, b, c, d);
+}
+
 #endif
diff --git a/test/OpenMP/target_codegen_registration.cpp b/test/OpenMP/target_codegen_registration.cpp
index 7d515bb..a440faf 100644
--- a/test/OpenMP/target_codegen_registration.cpp
+++ b/test/OpenMP/target_codegen_registration.cpp
@@ -1,20 +1,20 @@
 // Test host codegen.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
 
 // Check that no target code is emmitted if no omptests flag was provided.
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET
@@ -61,45 +61,45 @@
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // TCHECK-NOT: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 
 // CHECK-NTARGET-NOT: private constant i8 0
 // CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i
 
-// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:\.omp_offloading\.[0-9a-f]+\.[0-9a-f]+\._Z.+\.l[0-9]+\.c[0-9]+]]\00"
+// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
 // CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
 // CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
@@ -124,7 +124,7 @@
 // CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00"
 // CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
 
-// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:\.omp_offloading\.[0-9a-f]+\.[0-9a-f]+\._Z.+\.l[0-9]+\.c[0-9]+]]\00"
+// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
 // TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
 // TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
@@ -407,31 +407,31 @@
 
 // Check metadata is properly generated:
 // CHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 11, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 13, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}}
 
 // TCHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 11, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 13, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}}
 
 #endif
diff --git a/test/OpenMP/target_codegen_registration_naming.cpp b/test/OpenMP/target_codegen_registration_naming.cpp
index ab7a469..ce133ee 100644
--- a/test/OpenMP/target_codegen_registration_naming.cpp
+++ b/test/OpenMP/target_codegen_registration_naming.cpp
@@ -1,20 +1,20 @@
 // Test host codegen.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
 
 // expected-no-diagnostics
 #ifndef HEADER
@@ -24,7 +24,7 @@
 
 // CHECK: define {{.*}}i32 @[[NNAME:.+]](i32 {{.*}}%{{.+}})
 int nested(int a){
-  // CHECK: call void @.omp_offloading.[[FILEID:[0-9a-f]+\.[0-9a-f]+]].[[NNAME]].l[[T1L:[0-9]+]].c[[T1C:[0-9]+]](
+  // CHECK: call void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME]]_l[[T1L:[0-9]+]](
   #pragma omp target
     ++a;
 
@@ -42,25 +42,25 @@
   return a;
 }
 
-// CHECK: define {{.*}}void @.omp_offloading.[[FILEID]].[[NNAME]].l[[T1L]].c[[T1C]](
-// TCHECK: define {{.*}}void @.omp_offloading.[[FILEID:[0-9a-f]+\.[0-9a-f]+]].[[NNAME:.+]].l[[T1L:[0-9]+]].c[[T1C:[0-9]+]](
+// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T1L]](
+// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME:.+]]_l[[T1L:[0-9]+]](
 
 // CHECK: define {{.*}}void @"[[LNAME]]"(
 // CHECK: call void {{.*}}@__kmpc_fork_call{{.+}}[[PNAME:@.+]] to
 
 // CHECK: define {{.*}}void [[PNAME]](
-// CHECK: call void @.omp_offloading.[[FILEID]].[[NNAME]].l[[T2L:[0-9]+]].c[[T2C:[0-9]+]](
+// CHECK: call void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L:[0-9]+]](
 
-// CHECK: define {{.*}}void @.omp_offloading.[[FILEID]].[[NNAME]].l[[T2L]].c[[T2C]](
-// TCHECK: define {{.*}}void @.omp_offloading.[[FILEID]].[[NNAME:.+]].l[[T2L:[0-9]+]].c[[T2C:[0-9]+]](
+// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L]](
+// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME:.+]]_l[[T2L:[0-9]+]](
 
 
 // Check metadata is properly generated:
 // CHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 [[T1C]], i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 [[T2C]], i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}}
 
 // TCHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 [[T1C]], i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 [[T2C]], i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}}
 #endif
diff --git a/test/OpenMP/target_data_ast_print.cpp b/test/OpenMP/target_data_ast_print.cpp
index cdff857..ed7a965 100644
--- a/test/OpenMP/target_data_ast_print.cpp
+++ b/test/OpenMP/target_data_ast_print.cpp
@@ -12,13 +12,13 @@
 T tmain(T argc, T *argv) {
   T i, j, b, c, d, e, x[20];
 
-#pragma omp target data
+#pragma omp target data map(to: c)
   i = argc;
 
-#pragma omp target data if (target data: j > 0)
+#pragma omp target data map(to: c) if (target data: j > 0)
   foo();
 
-#pragma omp target data if (b)
+#pragma omp target data map(to: c) if (b)
   foo();
 
 #pragma omp target data map(c)
@@ -48,11 +48,11 @@
 
 // CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
 // CHECK-NEXT: int i, j, b, c, d, e, x[20];
-// CHECK-NEXT: #pragma omp target data
+// CHECK-NEXT: #pragma omp target data map(to: c)
 // CHECK-NEXT: i = argc;
-// CHECK-NEXT: #pragma omp target data if(target data: j > 0)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0)
 // CHECK-NEXT: foo();
-// CHECK-NEXT: #pragma omp target data if(b)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(b)
 // CHECK-NEXT: foo();
 // CHECK-NEXT: #pragma omp target data map(tofrom: c)
 // CHECK-NEXT: foo();
@@ -70,11 +70,11 @@
 // CHECK-NEXT: foo();
 // CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
 // CHECK-NEXT: char i, j, b, c, d, e, x[20];
-// CHECK-NEXT: #pragma omp target data
+// CHECK-NEXT: #pragma omp target data map(to: c)
 // CHECK-NEXT: i = argc;
-// CHECK-NEXT: #pragma omp target data if(target data: j > 0)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0)
 // CHECK-NEXT: foo();
-// CHECK-NEXT: #pragma omp target data if(b)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(b)
 // CHECK-NEXT: foo();
 // CHECK-NEXT: #pragma omp target data map(tofrom: c)
 // CHECK-NEXT: foo();
@@ -92,11 +92,11 @@
 // CHECK-NEXT: foo();
 // CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
 // CHECK-NEXT: T i, j, b, c, d, e, x[20];
-// CHECK-NEXT: #pragma omp target data
+// CHECK-NEXT: #pragma omp target data map(to: c)
 // CHECK-NEXT: i = argc;
-// CHECK-NEXT: #pragma omp target data if(target data: j > 0)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0)
 // CHECK-NEXT: foo();
-// CHECK-NEXT: #pragma omp target data if(b)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(b)
 // CHECK-NEXT: foo();
 // CHECK-NEXT: #pragma omp target data map(tofrom: c)
 // CHECK-NEXT: foo();
@@ -118,17 +118,17 @@
   static int a;
 // CHECK: static int a;
 
-#pragma omp target data
-// CHECK:      #pragma omp target data
+#pragma omp target data map(to: c)
+// CHECK:      #pragma omp target data map(to: c)
   a=2;
 // CHECK-NEXT: a = 2;
-#pragma omp target data if (target data: b)
-// CHECK: #pragma omp target data if(target data: b)
+#pragma omp target data map(to: c) if (target data: b)
+// CHECK: #pragma omp target data map(to: c) if(target data: b)
   foo();
 // CHECK-NEXT: foo();
 
-#pragma omp target data if (b > g)
-// CHECK: #pragma omp target data if(b > g)
+#pragma omp target data map(to: c) if (b > g)
+// CHECK: #pragma omp target data map(to: c) if(b > g)
   foo();
 // CHECK-NEXT: foo();
 
diff --git a/test/OpenMP/target_data_codegen.cpp b/test/OpenMP/target_data_codegen.cpp
new file mode 100644
index 0000000..a149ba9
--- /dev/null
+++ b/test/OpenMP/target_data_codegen.cpp
@@ -0,0 +1,248 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 37]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 33, i32 17]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+
+  // CK1-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  #pragma omp target data if(1+3-5) device(arg) map(from: gc)
+  {++arg;}
+
+  // Region 01
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target data map(la) if(1+3-4)
+  {++arg;}
+
+  // Region 02
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_end(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  // CK1: br label %[[IFEND:[^,]+]]
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  #pragma omp target data map(to: arg) if(arg) device(4)
+  {++arg;}
+
+  // Region 03
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S]]
+  #pragma omp target data map(always, to: lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  #pragma omp target data map(to: gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00
+    #pragma omp target data map(always, to: b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 37, i32 21]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+// CK2: br label %[[IFEND:[^,]+]]
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) {
+  // CK3-NOT: tgt_target_data_begin
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3-NOT: tgt_target_data_end
+  // CK3: ret
+  #pragma omp target data map(to: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_data_device_messages.cpp b/test/OpenMP/target_data_device_messages.cpp
index 9e8e31a..9ed7a54 100644
--- a/test/OpenMP/target_data_device_messages.cpp
+++ b/test/OpenMP/target_data_device_messages.cpp
@@ -10,18 +10,19 @@
 struct S1; // expected-note {{declared here}}
 
 int main(int argc, char **argv) {
-  #pragma omp target data device // expected-error {{expected '(' after 'device'}}
-  #pragma omp target data device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data device () // expected-error {{expected expression}}
-  #pragma omp target data device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
-#pragma omp target data device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
-  #pragma omp target data device (argc + argc)
-  #pragma omp target data device (argc), device (argc+1) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'device' clause}}
-  #pragma omp target data device (S1) // expected-error {{'S1' does not refer to a value}}
-  #pragma omp target data device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
-  #pragma omp target device (-10u)
-  #pragma omp target device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  int a;
+  #pragma omp target data map(to: a) device // expected-error {{expected '(' after 'device'}}
+  #pragma omp target data map(to: a) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) device () // expected-error {{expected expression}}
+  #pragma omp target data map(to: a) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
+#pragma omp target data map(to: a) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target data map(to: a) device (argc + argc)
+  #pragma omp target data map(to: a) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'device' clause}}
+  #pragma omp target data map(to: a) device (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target data map(to: a) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  #pragma omp target data map(to: a) device (-10u)
+  #pragma omp target data map(to: a) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
   foo();
 
   return 0;
diff --git a/test/OpenMP/target_data_if_messages.cpp b/test/OpenMP/target_data_if_messages.cpp
index 77edefa..ec6fe26 100644
--- a/test/OpenMP/target_data_if_messages.cpp
+++ b/test/OpenMP/target_data_if_messages.cpp
@@ -10,22 +10,23 @@
 struct S1; // expected-note {{declared here}}
 
 int main(int argc, char **argv) {
-  #pragma omp target data if // expected-error {{expected '(' after 'if'}}
-  #pragma omp target data if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if () // expected-error {{expected expression}}
-  #pragma omp target data if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
-  #pragma omp target data if (argc > 0 ? argv[1] : argv[2])
-  #pragma omp target data if (argc + argc)
-  #pragma omp target data if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause}}
-  #pragma omp target data if (S1) // expected-error {{'S1' does not refer to a value}}
-  #pragma omp target data if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if(target data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if(target data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if(target data : argc)
-  #pragma omp target data if(target data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target data'}}
-  #pragma omp target data if(target data : argc) if (target data:argc) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause with 'target data' name modifier}}
-  #pragma omp target data if(target data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  int a;
+  #pragma omp target data map(to: a) if // expected-error {{expected '(' after 'if'}}
+  #pragma omp target data map(to: a) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if () // expected-error {{expected expression}}
+  #pragma omp target data map(to: a) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
+  #pragma omp target data map(to: a) if (argc > 0 ? argv[1] : argv[2])
+  #pragma omp target data map(to: a) if (argc + argc)
+  #pragma omp target data map(to: a) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause}}
+  #pragma omp target data map(to: a) if (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target data map(to: a) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if(target data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if(target data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if(target data : argc)
+  #pragma omp target data map(to: a) if(target data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target data'}}
+  #pragma omp target data map(to: a) if(target data : argc) if (target data:argc) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause with 'target data' name modifier}}
+  #pragma omp target data map(to: a) if(target data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
   foo();
 
   return 0;
diff --git a/test/OpenMP/target_data_messages.c b/test/OpenMP/target_data_messages.c
index cd60d85..153b437 100644
--- a/test/OpenMP/target_data_messages.c
+++ b/test/OpenMP/target_data_messages.c
@@ -3,19 +3,22 @@
 void foo() { }
 
 int main(int argc, char **argv) {
+  int a;
+  #pragma omp target data // expected-error {{expected at least one map clause for '#pragma omp target data'}}
+  {}
   L1:
     foo();
-  #pragma omp target data
+  #pragma omp target data map(a)
   {
     foo();
     goto L1; // expected-error {{use of undeclared label 'L1'}}
   }
   goto L2; // expected-error {{use of undeclared label 'L2'}}
-  #pragma omp target data
+  #pragma omp target data map(a)
   L2:
   foo();
 
-  #pragma omp target data(i) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
+  #pragma omp target data map(a)(i) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
   {
     foo();
   }
diff --git a/test/OpenMP/target_data_use_device_ptr_ast_print.cpp b/test/OpenMP/target_data_use_device_ptr_ast_print.cpp
new file mode 100644
index 0000000..4e3253b
--- /dev/null
+++ b/test/OpenMP/target_data_use_device_ptr_ast_print.cpp
@@ -0,0 +1,154 @@
+// RxUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct ST {
+  int *a;
+};
+struct SA {
+  int i, j;
+  int *k = &j;
+  int *&z = k;
+  void func(int arg) {
+#pragma omp target data map(tofrom: i) use_device_ptr(k)
+    {}
+#pragma omp target data map(tofrom: i) use_device_ptr(z)
+    {}
+  return;
+ }
+};
+// CHECK: struct SA
+// CHECK: void func(
+// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->k)
+// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->z)
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef int from;
+
+template <typename T>
+T tmain(T argc) {
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+#pragma omp target data map(tofrom: i) use_device_ptr(k)
+  {}
+#pragma omp target data map(tofrom: i) use_device_ptr(z)
+  {}
+  return 0;
+}
+
+// CHECK: template <typename T = int> int tmain(int argc) {
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+
+// CHECK: template <typename T = int *> int *tmain(int *argc) {
+// CHECK-NEXT: int *i;
+// CHECK-NEXT: int *&j = i;
+// CHECK-NEXT: int **k = &j;
+// CHECK-NEXT: int **&z = k;
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+
+// CHECK-LABEL: int main(int argc, char **argv) {
+int main(int argc, char **argv) {
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+#pragma omp target data map(tofrom: i) use_device_ptr(k)
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target data map(tofrom: i) use_device_ptr(z)
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+  {}
+  return tmain<int>(argc) + (*tmain<int*>(&argc));
+}
+
+#endif
diff --git a/test/OpenMP/target_data_use_device_ptr_codegen.cpp b/test/OpenMP/target_data_use_device_ptr_codegen.cpp
new file mode 100644
index 0000000..c4b389a
--- /dev/null
+++ b/test/OpenMP/target_data_use_device_ptr_codegen.cpp
@@ -0,0 +1,464 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+double *g;
+
+// CK1: @g = global double*
+// CK1: [[MTYPE00:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE01:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE03:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE04:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE05:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE06:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE07:@.+]] = {{.*}}constant [1 x i32] [i32 99]
+// CK1: [[MTYPE08:@.+]] = {{.*}}constant [2 x i32] [{{i32 35, i32 99|i32 99, i32 35}}]
+// CK1: [[MTYPE09:@.+]] = {{.*}}constant [2 x i32] [i32 99, i32 99]
+// CK1: [[MTYPE10:@.+]] = {{.*}}constant [2 x i32] [i32 99, i32 99]
+// CK1: [[MTYPE11:@.+]] = {{.*}}constant [2 x i32] [i32 96, i32 35]
+// CK1: [[MTYPE12:@.+]] = {{.*}}constant [2 x i32] [i32 96, i32 35]
+
+// CK1-LABEL: @_Z3foo
+template<typename T>
+void foo(float *&lr, T *&tr) {
+  float *l;
+  T *t;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast double* [[T:%.+]] to i8*
+  // CK1-DAG: [[T]] = load double*, double** [[DECL:@g]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE00]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to double**
+  // CK1:     [[VAL:%.+]] = load double*, double** [[CBP]],
+  // CK1-NOT: store double* [[VAL]], double** [[DECL]],
+  // CK1:     store double* [[VAL]], double** [[PVT:%.+]],
+  // CK1:     [[TT:%.+]] = load double*, double** [[PVT]],
+  // CK1:     getelementptr inbounds double, double* [[TT]], i32 1
+  #pragma omp target data map(g[:10]) use_device_ptr(g)
+  {
+    ++g;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE00]]
+  // CK1:     [[TTT:%.+]] = load double*, double** [[DECL]],
+  // CK1:     getelementptr inbounds double, double* [[TTT]], i32 1
+  ++g;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast float* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load float*, float** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE01]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to float**
+  // CK1:     [[VAL:%.+]] = load float*, float** [[CBP]],
+  // CK1-NOT: store float* [[VAL]], float** [[DECL]],
+  // CK1:     store float* [[VAL]], float** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load float*, float** [[PVT]],
+  // CK1:     getelementptr inbounds float, float* [[TT1]], i32 1
+  #pragma omp target data map(l[:10]) use_device_ptr(l)
+  {
+    ++l;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE01]]
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  ++l;
+
+  // CK1-NOT: call void @__tgt_target
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  #pragma omp target data map(l[:10]) use_device_ptr(l) if(0)
+  {
+    ++l;
+  }
+  // CK1-NOT: call void @__tgt_target
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  ++l;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast float* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load float*, float** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE03]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to float**
+  // CK1:     [[VAL:%.+]] = load float*, float** [[CBP]],
+  // CK1-NOT: store float* [[VAL]], float** [[DECL]],
+  // CK1:     store float* [[VAL]], float** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load float*, float** [[PVT]],
+  // CK1:     getelementptr inbounds float, float* [[TT1]], i32 1
+  #pragma omp target data map(l[:10]) use_device_ptr(l) if(1)
+  {
+    ++l;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE03]]
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  ++l;
+
+  // CK1:     [[CMP:%.+]] = icmp ne float* %{{.+}}, null
+  // CK1:     br i1 [[CMP]], label %[[BTHEN:.+]], label %[[BELSE:.+]]
+
+  // CK1:     [[BTHEN]]:
+  // CK1-DAG: [[RVAL:%.+]] = bitcast float* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load float*, float** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE04]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to float**
+  // CK1:     [[VAL:%.+]] = load float*, float** [[CBP]],
+  // CK1-NOT: store float* [[VAL]], float** [[DECL]],
+  // CK1:     store float* [[VAL]], float** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load float*, float** [[PVT]],
+  // CK1:     getelementptr inbounds float, float* [[TT1]], i32 1
+  // CK1:     br label %[[BEND:.+]]
+
+  // CK1:     [[BELSE]]:
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  // CK1:     br label %[[BEND]]
+  #pragma omp target data map(l[:10]) use_device_ptr(l) if(lr != 0)
+  {
+    ++l;
+  }
+  // CK1:     [[BEND]]:
+  // CK1:     [[CMP:%.+]] = icmp ne float* %{{.+}}, null
+  // CK1:     br i1 [[CMP]], label %[[BTHEN:.+]], label %[[BELSE:.+]]
+
+  // CK1:     [[BTHEN]]:
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE04]]
+  // CK1:     br label %[[BEND:.+]]
+
+  // CK1:     [[BELSE]]:
+  // CK1:     br label %[[BEND]]
+
+  // CK1:     [[BEND]]:
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  ++l;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast float* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load float*, float** [[T2:%.+]],
+  // CK1-DAG: [[T2]] = load float**, float*** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE05]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to float**
+  // CK1:     [[VAL:%.+]] = load float*, float** [[CBP]],
+  // CK1:     store float* [[VAL]], float** [[PVTV:%.+]],
+  // CK1-NOT: store float** [[PVTV]], float*** [[DECL]],
+  // CK1:     store float** [[PVTV]], float*** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load float**, float*** [[PVT]],
+  // CK1:     [[TT2:%.+]] = load float*, float** [[TT1]],
+  // CK1:     getelementptr inbounds float, float* [[TT2]], i32 1
+  #pragma omp target data map(lr[:10]) use_device_ptr(lr)
+  {
+    ++lr;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE05]]
+  // CK1:     [[TTT:%.+]] = load float**, float*** [[DECL]],
+  // CK1:     [[TTTT:%.+]] = load float*, float** [[TTT]],
+  // CK1:     getelementptr inbounds float, float* [[TTTT]], i32 1
+  ++lr;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast i32* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load i32*, i32** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE06]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to i32**
+  // CK1:     [[VAL:%.+]] = load i32*, i32** [[CBP]],
+  // CK1-NOT: store i32* [[VAL]], i32** [[DECL]],
+  // CK1:     store i32* [[VAL]], i32** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load i32*, i32** [[PVT]],
+  // CK1:     getelementptr inbounds i32, i32* [[TT1]], i32 1
+  #pragma omp target data map(t[:10]) use_device_ptr(t)
+  {
+    ++t;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE06]]
+  // CK1:     [[TTT:%.+]] = load i32*, i32** [[DECL]],
+  // CK1:     getelementptr inbounds i32, i32* [[TTT]], i32 1
+  ++t;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast i32* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load i32*, i32** [[T2:%.+]],
+  // CK1-DAG: [[T2]] = load i32**, i32*** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE07]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to i32**
+  // CK1:     [[VAL:%.+]] = load i32*, i32** [[CBP]],
+  // CK1:     store i32* [[VAL]], i32** [[PVTV:%.+]],
+  // CK1-NOT: store i32** [[PVTV]], i32*** [[DECL]],
+  // CK1:     store i32** [[PVTV]], i32*** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load i32**, i32*** [[PVT]],
+  // CK1:     [[TT2:%.+]] = load i32*, i32** [[TT1]],
+  // CK1:     getelementptr inbounds i32, i32* [[TT2]], i32 1
+  #pragma omp target data map(tr[:10]) use_device_ptr(tr)
+  {
+    ++tr;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE07]]
+  // CK1:     [[TTT:%.+]] = load i32**, i32*** [[DECL]],
+  // CK1:     [[TTTT:%.+]] = load i32*, i32** [[TTT]],
+  // CK1:     getelementptr inbounds i32, i32* [[TTTT]], i32 1
+  ++tr;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast float* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load float*, float** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE08]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to float**
+  // CK1:     [[VAL:%.+]] = load float*, float** [[CBP]],
+  // CK1-NOT: store float* [[VAL]], float** [[DECL]],
+  // CK1:     store float* [[VAL]], float** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load float*, float** [[PVT]],
+  // CK1:     getelementptr inbounds float, float* [[TT1]], i32 1
+  #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l)
+  {
+    ++l; ++t;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE08]]
+  // CK1:     [[TTT:%.+]] = load float*, float** [[DECL]],
+  // CK1:     getelementptr inbounds float, float* [[TTT]], i32 1
+  ++l; ++t;
+
+
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE09]]
+  // CK1:     [[_CBP:%.+]] = bitcast i8** {{%.+}} to float**
+  // CK1:     [[_VAL:%.+]] = load float*, float** [[_CBP]],
+  // CK1:     store float* [[_VAL]], float** [[_PVT:%.+]],
+  // CK1:     [[CBP:%.+]] = bitcast i8** {{%.+}} to i32**
+  // CK1:     [[VAL:%.+]] = load i32*, i32** [[CBP]],
+  // CK1:     store i32* [[VAL]], i32** [[PVT:%.+]],
+  // CK1:     [[_TT1:%.+]] = load float*, float** [[_PVT]],
+  // CK1:     getelementptr inbounds float, float* [[_TT1]], i32 1
+  // CK1:     [[TT1:%.+]] = load i32*, i32** [[PVT]],
+  // CK1:     getelementptr inbounds i32, i32* [[TT1]], i32 1
+  #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) use_device_ptr(t)
+  {
+    ++l; ++t;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE09]]
+  // CK1:     [[_TTT:%.+]] = load float*, float** {{%.+}},
+  // CK1:     getelementptr inbounds float, float* [[_TTT]], i32 1
+  // CK1:     [[TTT:%.+]] = load i32*, i32** {{%.+}},
+  // CK1:     getelementptr inbounds i32, i32* [[TTT]], i32 1
+  ++l; ++t;
+
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE10]]
+  // CK1:     [[_CBP:%.+]] = bitcast i8** {{%.+}} to float**
+  // CK1:     [[_VAL:%.+]] = load float*, float** [[_CBP]],
+  // CK1:     store float* [[_VAL]], float** [[_PVT:%.+]],
+  // CK1:     [[CBP:%.+]] = bitcast i8** {{%.+}} to i32**
+  // CK1:     [[VAL:%.+]] = load i32*, i32** [[CBP]],
+  // CK1:     store i32* [[VAL]], i32** [[PVT:%.+]],
+  // CK1:     [[_TT1:%.+]] = load float*, float** [[_PVT]],
+  // CK1:     getelementptr inbounds float, float* [[_TT1]], i32 1
+  // CK1:     [[TT1:%.+]] = load i32*, i32** [[PVT]],
+  // CK1:     getelementptr inbounds i32, i32* [[TT1]], i32 1
+  #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l,t)
+  {
+    ++l; ++t;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE10]]
+  // CK1:     [[_TTT:%.+]] = load float*, float** {{%.+}},
+  // CK1:     getelementptr inbounds float, float* [[_TTT]], i32 1
+  // CK1:     [[TTT:%.+]] = load i32*, i32** {{%.+}},
+  // CK1:     getelementptr inbounds i32, i32* [[TTT]], i32 1
+  ++l; ++t;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast i32* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load i32*, i32** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE11]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to i32**
+  // CK1:     [[VAL:%.+]] = load i32*, i32** [[CBP]],
+  // CK1-NOT: store i32* [[VAL]], i32** [[DECL]],
+  // CK1:     store i32* [[VAL]], i32** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load i32*, i32** [[PVT]],
+  // CK1:     getelementptr inbounds i32, i32* [[TT1]], i32 1
+  #pragma omp target data map(l[:10]) use_device_ptr(t)
+  {
+    ++l; ++t;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE11]]
+  // CK1:     [[TTT:%.+]] = load i32*, i32** [[DECL]],
+  // CK1:     getelementptr inbounds i32, i32* [[TTT]], i32 1
+  ++l; ++t;
+
+  // CK1-DAG: [[RVAL:%.+]] = bitcast i32* [[T1:%.+]] to i8*
+  // CK1-DAG: [[T1]] = load i32*, i32** [[T2:%.+]],
+  // CK1-DAG: [[T2]] = load i32**, i32*** [[DECL:%.+]],
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     store i8* [[RVAL]], i8** [[BP]],
+  // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE12]]
+  // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to i32**
+  // CK1:     [[VAL:%.+]] = load i32*, i32** [[CBP]],
+  // CK1:     store i32* [[VAL]], i32** [[PVTV:%.+]],
+  // CK1-NOT: store i32** [[PVTV]], i32*** [[DECL]],
+  // CK1:     store i32** [[PVTV]], i32*** [[PVT:%.+]],
+  // CK1:     [[TT1:%.+]] = load i32**, i32*** [[PVT]],
+  // CK1:     [[TT2:%.+]] = load i32*, i32** [[TT1]],
+  // CK1:     getelementptr inbounds i32, i32* [[TT2]], i32 1
+  #pragma omp target data map(l[:10]) use_device_ptr(tr)
+  {
+    ++l; ++tr;
+  }
+  // CK1:     call void @__tgt_target_data_end{{.+}}[[MTYPE12]]
+  // CK1:     [[TTT:%.+]] = load i32**, i32*** [[DECL]],
+  // CK1:     [[TTTT:%.+]] = load i32*, i32** [[TTT]],
+  // CK1:     getelementptr inbounds i32, i32* [[TTTT]], i32 1
+  ++l; ++tr;
+
+}
+
+void bar(float *&a, int *&b) { // non-template caller of foo
+  foo<int>(a,b); // instantiates foo with T = int, so its 't' and 'tr' are int pointers
+}
+
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { double*, double** }
+// CK2: [[MTYPE00:@.+]] = {{.*}}constant [2 x i32] [i32 35, i32 83]
+// CK2: [[MTYPE01:@.+]] = {{.*}}constant [3 x i32] [i32 32, i32 19, i32 83]
+// CK2: [[MTYPE02:@.+]] = {{.*}}constant [2 x i32] [i32 96, i32 35]
+// CK2: [[MTYPE03:@.+]] = {{.*}}constant [4 x i32] [i32 96, i32 32, i32 19, i32 83]
+
+template <typename T>
+struct ST {
+  T *a;
+  double *&b;
+  ST(double *&b) : a(0), b(b) {}
+
+  // CK2-LABEL: @{{.*}}foo{{.*}}
+  void foo(double *&arg) {
+    int *la = 0;
+
+    // CK2:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1
+    // CK2:     store i8* [[RVAL:%.+]], i8** [[BP]],
+    // CK2:     call void @__tgt_target_data_begin{{.+}}[[MTYPE00]]
+    // CK2:     [[CBP:%.+]] = bitcast i8** [[BP]] to double**
+    // CK2:     [[VAL:%.+]] = load double*, double** [[CBP]],
+    // CK2:     store double* [[VAL]], double** [[PVT:%.+]],
+    // CK2:     store double** [[PVT]], double*** [[PVT2:%.+]],
+    // CK2:     [[TT1:%.+]] = load double**, double*** [[PVT2]],
+    // CK2:     [[TT2:%.+]] = load double*, double** [[TT1]],
+    // CK2:     getelementptr inbounds double, double* [[TT2]], i32 1
+    #pragma omp target data map(a[:10]) use_device_ptr(a)
+    {
+      a++;
+    }
+    // CK2:     call void @__tgt_target_data_end{{.+}}[[MTYPE00]]
+    // CK2:     [[DECL:%.+]] = getelementptr inbounds [[ST]], [[ST]]* %this1, i32 0, i32 0
+    // CK2:     [[TTT:%.+]] = load double*, double** [[DECL]],
+    // CK2:     getelementptr inbounds double, double* [[TTT]], i32 1
+    a++;
+
+    // CK2:     [[BP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* %{{.+}}, i32 0, i32 2
+    // CK2:     store i8* [[RVAL:%.+]], i8** [[BP]],
+    // CK2:     call void @__tgt_target_data_begin{{.+}}[[MTYPE01]]
+    // CK2:     [[CBP:%.+]] = bitcast i8** [[BP]] to double**
+    // CK2:     [[VAL:%.+]] = load double*, double** [[CBP]],
+    // CK2:     store double* [[VAL]], double** [[PVT:%.+]],
+    // CK2:     store double** [[PVT]], double*** [[PVT2:%.+]],
+    // CK2:     [[TT1:%.+]] = load double**, double*** [[PVT2]],
+    // CK2:     [[TT2:%.+]] = load double*, double** [[TT1]],
+    // CK2:     getelementptr inbounds double, double* [[TT2]], i32 1
+    #pragma omp target data map(b[:10]) use_device_ptr(b)
+    {
+      b++;
+    }
+    // CK2:     call void @__tgt_target_data_end{{.+}}[[MTYPE01]]
+    // CK2:     [[DECL:%.+]] = getelementptr inbounds [[ST]], [[ST]]* %{{.+}}, i32 0, i32 1
+    // CK2:     [[TTT:%.+]] = load double**, double*** [[DECL]],
+    // CK2:     [[TTTT:%.+]] = load double*, double** [[TTT]],
+    // CK2:     getelementptr inbounds double, double* [[TTTT]], i32 1
+    b++;
+
+    // CK2:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0
+    // CK2:     store i8* [[RVAL:%.+]], i8** [[BP]],
+    // CK2:     call void @__tgt_target_data_begin{{.+}}[[MTYPE02]]
+    // CK2:     [[CBP:%.+]] = bitcast i8** [[BP]] to double**
+    // CK2:     [[VAL:%.+]] = load double*, double** [[CBP]],
+    // CK2:     store double* [[VAL]], double** [[PVT:%.+]],
+    // CK2:     store double** [[PVT]], double*** [[PVT2:%.+]],
+    // CK2:     [[TT1:%.+]] = load double**, double*** [[PVT2]],
+    // CK2:     [[TT2:%.+]] = load double*, double** [[TT1]],
+    // CK2:     getelementptr inbounds double, double* [[TT2]], i32 1
+    #pragma omp target data map(la[:10]) use_device_ptr(a)
+    {
+      a++;
+      la++;
+    }
+    // CK2:     call void @__tgt_target_data_end{{.+}}[[MTYPE02]]
+    // CK2:     [[DECL:%.+]] = getelementptr inbounds [[ST]], [[ST]]* %this1, i32 0, i32 0
+    // CK2:     [[TTT:%.+]] = load double*, double** [[DECL]],
+    // CK2:     getelementptr inbounds double, double* [[TTT]], i32 1
+    a++;
+    la++;
+
+    // CK2:     [[BP:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* %{{.+}}, i32 0, i32 0
+    // CK2:     store i8* [[RVAL:%.+]], i8** [[BP]],
+    // CK2:     [[_BP:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* %{{.+}}, i32 0, i32 3
+    // CK2:     store i8* [[_RVAL:%.+]], i8** [[_BP]],
+    // CK2:     call void @__tgt_target_data_begin{{.+}}[[MTYPE03]]
+    // CK2:     [[CBP:%.+]] = bitcast i8** [[BP]] to double**
+    // CK2:     [[VAL:%.+]] = load double*, double** [[CBP]],
+    // CK2:     store double* [[VAL]], double** [[PVT:%.+]],
+    // CK2:     store double** [[PVT]], double*** [[PVT2:%.+]],
+    // CK2:     [[_CBP:%.+]] = bitcast i8** [[_BP]] to double**
+    // CK2:     [[_VAL:%.+]] = load double*, double** [[_CBP]],
+    // CK2:     store double* [[_VAL]], double** [[_PVT:%.+]],
+    // CK2:     store double** [[_PVT]], double*** [[_PVT2:%.+]],
+    // CK2:     [[TT1:%.+]] = load double**, double*** [[PVT2]],
+    // CK2:     [[TT2:%.+]] = load double*, double** [[TT1]],
+    // CK2:     getelementptr inbounds double, double* [[TT2]], i32 1
+    // CK2:     [[_TT1:%.+]] = load double**, double*** [[_PVT2]],
+    // CK2:     [[_TT2:%.+]] = load double*, double** [[_TT1]],
+    // CK2:     getelementptr inbounds double, double* [[_TT2]], i32 1
+    #pragma omp target data map(b[:10]) use_device_ptr(a, b)
+    {
+      a++;
+      b++;
+    }
+    // CK2:     call void @__tgt_target_data_end{{.+}}[[MTYPE03]]
+    // CK2:     [[DECL:%.+]] = getelementptr inbounds [[ST]], [[ST]]* %this1, i32 0, i32 0
+    // CK2:     [[TTT:%.+]] = load double*, double** [[DECL]],
+    // CK2:     getelementptr inbounds double, double* [[TTT]], i32 1
+    // CK2:     [[_DECL:%.+]] = getelementptr inbounds [[ST]], [[ST]]* %this1, i32 0, i32 1
+    // CK2:     [[_TTT:%.+]] = load double**, double*** [[_DECL]],
+    // CK2:     [[_TTTT:%.+]] = load double*, double** [[_TTT]],
+    // CK2:     getelementptr inbounds double, double* [[_TTTT]], i32 1
+    a++;
+    b++;
+  }
+};
+
+void bar(double *arg){ // driver: instantiates ST<double> and calls its foo()
+  ST<double> A(arg); // ST's constructor takes double *&, so A.b is bound to 'arg' by reference
+  A.foo(arg); // member function that contains the target data regions
+  ++arg;
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_data_use_device_ptr_messages.cpp b/test/OpenMP/target_data_use_device_ptr_messages.cpp
new file mode 100644
index 0000000..1d8002c
--- /dev/null
+++ b/test/OpenMP/target_data_use_device_ptr_messages.cpp
@@ -0,0 +1,206 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fopenmp -ferror-limit 200 %s
+struct ST { // struct with a single pointer member; used as the type of SA::e and SA::g
+  int *a;
+};
+struct SA { // members of assorted types used as use_device_ptr operands inside func()
+  const int d = 5; // const scalar, not a pointer
+  const int da[5] = { 0 }; // const array, not a pointer
+  ST e; // struct object
+  ST g[10]; // array of structs
+  int i; // plain int; also the map() operand of every directive below
+  int &j = i; // reference to int
+  int *k = &j; // pointer
+  int *&z = k; // reference to pointer
+  int aa[10]; // array of int
+  void func(int arg) { // exercises clause parsing and operand-type diagnostics on the members and on 'arg'
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+    {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+    {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+    {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+    {}
+#pragma omp target data map(i) use_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(k) // OK: 'k' is a pointer
+    {}
+#pragma omp target data map(i) use_device_ptr(z) // OK: 'z' is a reference to a pointer
+    {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+  return;
+ }
+};
+struct SB { // plain struct; used as a member type of SC below
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() { // returns the address of the first element of Arr
+    return &Arr[0];
+  }
+};
+
+struct SC { // struct mixing bit-fields, arrays, nested SB objects, pointers and a reference member
+  unsigned A : 2; // 2-bit bit-field
+  unsigned B : 3; // 3-bit bit-field
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S; // nested struct object
+  SB ArrS[100]; // array of nested structs
+  SB *PtrS; // pointer to struct
+  SB *&RPtrS; // reference to pointer; bound in the constructor below
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {} // binds the reference member to the caller's pointer
+};
+
+union SD { // union of an unsigned and a float; not referenced by any directive in this file
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 { // class with a mutable member, a non-const copy constructor and static data members
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { } // copy constructor taking a non-const reference
+  static float S2s; // declared only; no definition appears in this file
+  static const float S2sc; // defined right after the class
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // class with public default and non-const copy constructors; type of 'c', 'ca' and threadprivate 'h'
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { } // copy constructor taking a non-const reference
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // class whose default and copy constructors are private and only declared
+  int a;
+  S4(); // private; no definition appears in this file
+  S4(const S4 &s4); // private; no definition appears in this file
+public:
+  S4(int v):a(v) { } // the only publicly accessible constructor; used as 'S4 e(4)' below
+};
+class S5 { // class whose default and copy constructors are private but defined inline
+  int a;
+  S5():a(0) {} // private: declared before the 'public:' label
+  S5(const S5 &s5):a(s5.a) { } // private copy constructor
+public:
+  S5(int v):a(v) { } // the only publicly accessible constructor; used as 'S5 g(5)' below
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef int from;
+
+template <typename T, int I> // 'I' is not referenced in the body; main() instantiates tmain<int, 3>
+T tmain(T argc) {
+  const T d = 5; // const scalar, not a pointer
+  const T da[5] = { 0 }; // const array, not a pointer
+  S4 e(4); // class object
+  S5 g(5); // class object
+  T i; // also the map() operand of every directive below
+  T &j = i; // reference to T
+  T *k = &j; // pointer
+  T *&z = k; // reference to pointer
+  T aa[10]; // array of T
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+  {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k) // OK: 'k' is a pointer
+  {}
+#pragma omp target data map(i) use_device_ptr(z) // OK: 'z' is a reference to a pointer
+  {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+  return 0;
+}
+
+int main(int argc, char **argv) { // repeats the tmain directive sequence on non-template locals
+  const int d = 5; // const scalar, not a pointer
+  const int da[5] = { 0 }; // const array, not a pointer
+  S4 e(4); // class object
+  S5 g(5); // class object
+  int i; // also the map() operand of every directive below
+  int &j = i; // reference to int
+  int *k = &j; // pointer
+  int *&z = k; // reference to pointer
+  int aa[10]; // array of int
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+  {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k) // OK: 'k' is a pointer
+  {}
+#pragma omp target data map(i) use_device_ptr(z) // OK: 'z' is a reference to a pointer
+  {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_defaultmap_messages.cpp b/test/OpenMP/target_defaultmap_messages.cpp
new file mode 100644
index 0000000..59348d4
--- /dev/null
+++ b/test/OpenMP/target_defaultmap_messages.cpp
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo();
+  #pragma omp target defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo();
+  #pragma omp target defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_depend_messages.cpp b/test/OpenMP/target_depend_messages.cpp
new file mode 100644
index 0000000..48bd941
--- /dev/null
+++ b/test/OpenMP/target_depend_messages.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+
+  #pragma omp target depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+  #pragma omp target depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend (in : argv[0])
+  foo();
+  #pragma omp target depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target depend (in : argv[-1:0])
+  foo();
+  #pragma omp target depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target depend(in : arr[0])
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_device_messages.cpp b/test/OpenMP/target_device_messages.cpp
index fb0f2de..3befcd6 100644
--- a/test/OpenMP/target_device_messages.cpp
+++ b/test/OpenMP/target_device_messages.cpp
@@ -11,16 +11,27 @@
 
 int main(int argc, char **argv) {
   #pragma omp target device // expected-error {{expected '(' after 'device'}}
+  foo();
   #pragma omp target device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target device () // expected-error {{expected expression}}
+  foo();
   #pragma omp target device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
-#pragma omp target device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
   #pragma omp target device (argc + argc)
+  foo();
   #pragma omp target device (argc), device (argc+1) // expected-error {{directive '#pragma omp target' cannot contain more than one 'device' clause}}
+  foo();
   #pragma omp target device (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
   #pragma omp target device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  foo();
   #pragma omp target device (-10u)
+  foo();
   #pragma omp target device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
   foo();
 
diff --git a/test/OpenMP/target_enter_data_ast_print.cpp b/test/OpenMP/target_enter_data_ast_print.cpp
new file mode 100644
index 0000000..10ec925
--- /dev/null
+++ b/test/OpenMP/target_enter_data_ast_print.cpp
@@ -0,0 +1,228 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+template <typename T, int C>
+T tmain(T argc, T *argv) {
+  T i, j, b, c, d, e, x[20];
+
+  i = argc;
+#pragma omp target enter data map(to: i)
+
+#pragma omp target enter data map(to: i) if (target enter data: j > 0)
+
+#pragma omp target enter data map(to: i) if (b)
+
+#pragma omp target enter data map(to: c)
+
+#pragma omp target enter data map(to: c) if(b>e)
+
+#pragma omp target enter data map(alloc: x[0:10], c)
+
+#pragma omp target enter data map(to: c) map(alloc: d)
+
+#pragma omp target enter data map(always,alloc: e)
+
+#pragma omp target enter data nowait map(to: i)
+
+#pragma omp target enter data nowait map(to: i) if (target enter data: j > 0)
+
+#pragma omp target enter data map(to: i) if (b) nowait
+
+#pragma omp target enter data map(to: c) nowait
+
+#pragma omp target enter data map(to: c) nowait if(b>e)
+
+#pragma omp target enter data nowait map(alloc: x[0:10], c)
+
+#pragma omp target enter data nowait map(to: c) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e)
+
+#pragma omp target enter data nowait depend(in : argc, argv[i:argc], x[:]) map(to: i)
+
+#pragma omp target enter data nowait map(to: i) if (target enter data: j > 0) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target enter data depend(in : argc, argv[i:argc], x[:]) map(to: i) if (b) nowait
+
+#pragma omp target enter data map(to: c) depend(in : argc, argv[i:argc], x[:]) nowait
+
+#pragma omp target enter data map(to: c) nowait if(b>e) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target enter data nowait map(alloc: x[0:10], c) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target enter data nowait depend(in : argc, argv[i:argc], x[:]) map(to: c) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e) depend(in : argc, argv[i:argc], x[:])
+
+  return 0;
+}
+
+// CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
+// CHECK-NEXT: int i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) if(b > e)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+// CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
+// CHECK-NEXT: char i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) if(b > e)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+// CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) if(b > e)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+
+int main (int argc, char **argv) {
+  int b = argc, i, c, d, e, f, g, x[20];
+  static int a;
+// CHECK: static int a;
+
+#pragma omp target enter data map(to: a)
+// CHECK:      #pragma omp target enter data map(to: a)
+  a=2;
+// CHECK-NEXT: a = 2;
+#pragma omp target enter data map(to: a) if (target enter data: b)
+// CHECK: #pragma omp target enter data map(to: a) if(target enter data: b)
+
+#pragma omp target enter data map(to: a) if (b > g)
+// CHECK: #pragma omp target enter data map(to: a) if(b > g)
+
+#pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+
+#pragma omp target enter data map(alloc: c) if(b>g)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: c) if(b > g)
+
+#pragma omp target enter data map(to: x[0:10], c)
+// CHECK-NEXT: #pragma omp target enter data map(to: x[0:10],c)
+
+#pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+
+#pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+
+#pragma omp target enter data nowait map(to: a)
+// CHECK:      #pragma omp target enter data nowait map(to: a)
+
+#pragma omp target enter data nowait map(to: a) if (target enter data: b)
+// CHECK: #pragma omp target enter data nowait map(to: a) if(target enter data: b)
+
+#pragma omp target enter data map(to: a) if (b > g) nowait
+// CHECK: #pragma omp target enter data map(to: a) if(b > g) nowait
+
+#pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+
+#pragma omp target enter data map(alloc: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: c) nowait if(b > g)
+
+#pragma omp target enter data nowait map(to: x[0:10], c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: x[0:10],c)
+
+#pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+
+#pragma omp target enter data depend(in : argc, argv[i:argc], x[:]) nowait map(to: a)
+// CHECK:      #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) nowait map(to: a)
+
+#pragma omp target enter data nowait depend(in : argc, argv[i:argc], x[:]) map(to: a) if (target enter data: b)
+// CHECK: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: a) if(target enter data: b)
+
+#pragma omp target enter data map(to: a) depend(in : argc, argv[i:argc], x[:]) if (b > g) nowait
+// CHECK: #pragma omp target enter data map(to: a) depend(in : argc,argv[i:argc],x[:]) if(b > g) nowait
+
+#pragma omp target enter data map(to: c) nowait depend(in : argc, argv[i:argc], x[:])
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target enter data depend(in : argc, argv[i:argc], x[:]) map(alloc: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(alloc: c) nowait if(b > g)
+
+#pragma omp target enter data nowait map(to: x[0:10], c) depend(in : argc, argv[i:argc], x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target enter data nowait map(to: c) depend(in : argc, argv[i:argc], x[:]) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) depend(in : argc,argv[i:argc],x[:]) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e) depend(in : argc, argv[i:argc], x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
+}
+
+#endif
diff --git a/test/OpenMP/target_enter_data_codegen.cpp b/test/OpenMP/target_enter_data_codegen.cpp
new file mode 100644
index 0000000..152cd46
--- /dev/null
+++ b/test/OpenMP/target_enter_data_codegen.cpp
@@ -0,0 +1,249 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 32]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 37]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 33, i32 17]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data if(1+3-5) device(arg) map(alloc: gc)
+  {++arg;}
+
+  // Region 01
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target enter data map(to: la) if(1+3-4)
+  {++arg;}
+
+  // Region 02
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data map(to: arg) if(arg) device(4)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 03
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data map(always, to: lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data map(to: gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T> // Instantiated as ST<int> by bar() further down.
+struct ST { // Test fixture: issues an enter-data directive from inside a member function.
+  T a; // Read by the if clause of the directive in foo().
+  double *b; // Base pointer of the mapped array section b[1:3].
+
+  T foo(T arg) { // 'arg' doubles as the device number and as the value incremented and returned.
+    // Region 00
+    #pragma omp target enter data map(always, to: b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 37, i32 21]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){ // Instantiates ST<int> and calls its foo(), which contains the directive under test.
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+// CK2-NOT: __tgt_target_data_end
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) { // This section's RUN lines pass no -fopenmp-targets, so no offload runtime calls may be emitted.
+  // CK3-NOT: tgt_target_data_begin
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3-NOT: tgt_target_data_end
+  // CK3: ret
+  #pragma omp target enter data map(to: arg) if(arg) device(4)
+  {++arg;} // Presumably the source of the 'add nsw i32 ..., 1' the checks above anchor on.
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+#ifdef CK4
+
+// CK4-LABEL: device_side_scan
+void device_side_scan(int arg) { // Checked twice: the host compile must call the runtime, the -fopenmp-is-device compile must not.
+  // CK4: tgt_target_data_begin
+  // CK4: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK4: ret
+  // TCK4-NOT: tgt_target_data_begin
+  #pragma omp target enter data map(to: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_enter_data_depend_messages.cpp b/test/OpenMP/target_enter_data_depend_messages.cpp
new file mode 100644
index 0000000..4aa1223
--- /dev/null
+++ b/test/OpenMP/target_enter_data_depend_messages.cpp
@@ -0,0 +1,166 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() { // Empty callee; invoked after each directive under test in tmain() and main().
+}
+
+bool foobool(int argc) { // Its call result is used below as an invalid (non-variable) depend item.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+class vector { // Class with a user-defined operator[]; vec[...] is rejected as a depend item below.
+  public:
+    int operator[](int index) { return 0; }
+};
+
+template <class T, class S, class R> // Instantiated from main() as tmain<int, char, char>.
+int tmain(T argc, S **argv, R *env[]) { // Template counterpart of the depend-clause cases in main().
+  vector vec; // Class type with an overloaded operator[].
+  typedef float V __attribute__((vector_size(16)));
+  V a; // Vector-extension value; a[0] and a[0:1] are both rejected below.
+  char *arr; // Plain pointer; arr[0] is accepted as a depend item.
+
+  int i; // Map operand shared by every directive below.
+  #pragma omp target enter data map(to: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0])
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : tmain) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[-1:0])
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : arr[0])
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv, char *env[]) { // Non-template run of the depend-clause cases, then instantiates tmain().
+  vector vec; // Class type with an overloaded operator[].
+  typedef float V __attribute__((vector_size(16)));
+  V a; // Vector-extension value; a[0] and a[0:1] are both rejected below.
+  char *arr; // Plain pointer; arr[0] is accepted as a depend item.
+
+  int i; // Map operand shared by every directive below.
+  #pragma omp target enter data map(to: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0])
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[-1:0])
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : arr[0])
+  foo();
+
+  return tmain(argc, argv, env); // expected-note {{in instantiation of function template specialization 'tmain<int, char, char>' requested here}}
+}
diff --git a/test/OpenMP/target_enter_data_device_messages.cpp b/test/OpenMP/target_enter_data_device_messages.cpp
new file mode 100644
index 0000000..d954eca
--- /dev/null
+++ b/test/OpenMP/target_enter_data_device_messages.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty callee; called once in main() after the run of directives.
+}
+
+bool foobool(int argc) { // NOTE(review): not referenced by any directive in this file.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Exercises syntax and type diagnostics for the 'device' clause.
+  int i; // Map operand shared by every directive below.
+  #pragma omp target enter data map(to: i) device // expected-error {{expected '(' after 'device'}}
+  #pragma omp target enter data map(to: i) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) device () // expected-error {{expected expression}}
+  #pragma omp target enter data map(to: i) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+#pragma omp target enter data map(to: i) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target enter data map(to: i) device (argc + argc)
+  #pragma omp target enter data map(to: i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'device' clause}}
+  #pragma omp target enter data map(to: i) device (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target enter data map(to: i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  #pragma omp target enter data map(to: i) device (-10u)
+  #pragma omp target enter data map(to: i) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  foo(); // Statement following the final directive above.
+
+  return 0;
+}
diff --git a/test/OpenMP/target_enter_data_if_messages.cpp b/test/OpenMP/target_enter_data_if_messages.cpp
new file mode 100644
index 0000000..0d18af1
--- /dev/null
+++ b/test/OpenMP/target_enter_data_if_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty callee; called once in main() after the run of directives.
+}
+
+bool foobool(int argc) { // Supplies a bool-valued call expression for an 'if' clause in main().
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Exercises diagnostics for the 'if' clause and its directive-name modifier.
+  int i; // Map operand shared by every directive below.
+  #pragma omp target enter data map(to: i) if // expected-error {{expected '(' after 'if'}}
+  #pragma omp target enter data map(to: i) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if () // expected-error {{expected expression}}
+  #pragma omp target enter data map(to: i) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) if (argc > 0 ? argv[1] : argv[2])
+  #pragma omp target enter data map(to: i) if (argc + argc)
+  #pragma omp target enter data map(to: i) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'if' clause}}
+  #pragma omp target enter data map(to: i) if (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target enter data map(to: i) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target data : true) // expected-error {{directive name modifier 'target data' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(to: i) if(target enter data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target enter data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc)
+  #pragma omp target enter data map(to: i) if(target enter data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc) if (target enter data:argc) // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'if' clause with 'target enter data' name modifier}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  foo(); // Statement following the final directive above.
+
+  return 0;
+}
diff --git a/test/OpenMP/target_enter_data_map_messages.c b/test/OpenMP/target_enter_data_map_messages.c
new file mode 100644
index 0000000..6f5aad1
--- /dev/null
+++ b/test/OpenMP/target_enter_data_map_messages.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - -x c++ %s
+
+int main(int argc, char **argv) { // Checks which map types 'target enter data' accepts; per the RUN lines this file is compiled as both C and C++.
+
+  int r; // Sole map operand for every directive below.
+  #pragma omp target enter data // expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+
+  #pragma omp target enter data map(r) // expected-error {{map type must be specified for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target enter data'}}
+
+  #pragma omp target enter data map(always, to: r)
+  #pragma omp target enter data map(always, alloc: r)
+  #pragma omp target enter data map(always, from: r) // expected-error {{map type 'from' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(release: r) // expected-error {{map type 'release' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(delete: r) // expected-error {{map type 'delete' is not allowed for '#pragma omp target enter data'}}
+
+  return 0;
+}
diff --git a/test/OpenMP/target_enter_data_nowait_messages.cpp b/test/OpenMP/target_enter_data_nowait_messages.cpp
new file mode 100644
index 0000000..e682e8c
--- /dev/null
+++ b/test/OpenMP/target_enter_data_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+int main(int argc, char **argv) { // Checks 'nowait' placement, its lack of an argument list, and the one-per-directive limit.
+  int i; // Map operand shared by every directive below.
+
+  #pragma omp nowait target enter data map(to: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target nowait enter data map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  #pragma omp target enter nowait data map(to: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target enter data nowait() map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(to: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) nowait device (-10u)
+  #pragma omp target enter data map(to: i) nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) nowait nowait // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'nowait' clause}}
+  #pragma omp target enter data nowait map(to: i) nowait // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'nowait' clause}}
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_ast_print.cpp b/test/OpenMP/target_exit_data_ast_print.cpp
new file mode 100644
index 0000000..e2c6d7f
--- /dev/null
+++ b/test/OpenMP/target_exit_data_ast_print.cpp
@@ -0,0 +1,244 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+template <typename T, int C>
+T tmain(T argc, T *argv) {
+  T i, j, b, c, d, e, x[20];
+
+  i = argc;
+#pragma omp target exit data map(from: i)
+
+#pragma omp target exit data map(from: i) if (target exit data: j > 0)
+
+#pragma omp target exit data map(from: i) if (b)
+
+#pragma omp target exit data map(from: c)
+
+#pragma omp target exit data map(from: c) if(b>e)
+
+#pragma omp target exit data map(release: x[0:10], c)
+
+#pragma omp target exit data map(delete: x[0:10])
+
+#pragma omp target exit data map(always, delete: x[0:10])
+
+#pragma omp target exit data map(from: c) map(release: d)
+
+#pragma omp target exit data map(always,release: e)
+
+#pragma omp target exit data nowait map(from: i)
+
+#pragma omp target exit data nowait map(from: i) if (target exit data: j > 0)
+
+#pragma omp target exit data map(from: i) if (b) nowait
+
+#pragma omp target exit data map(from: c) nowait
+
+#pragma omp target exit data map(from: c) nowait if(b>e)
+
+#pragma omp target exit data nowait map(release: x[0:10], c)
+
+#pragma omp target exit data nowait map(from: c) map(release: d)
+
+#pragma omp target exit data nowait map(always,release: e)
+
+#pragma omp target exit data depend(in : argc, argv[i:argc], x[:]) nowait map(from: i)
+
+#pragma omp target exit data nowait depend(in : argc, argv[i:argc], x[:]) map(from: i) if (target exit data: j > 0)
+
+#pragma omp target exit data map(from: i) depend(in : argc, argv[i:argc], x[:]) if (b) nowait
+
+#pragma omp target exit data map(from: c) depend(in : argc, argv[i:argc], x[:]) nowait
+
+#pragma omp target exit data map(from: c) depend(in : argc, argv[i:argc], x[:]) nowait if(b>e)
+
+#pragma omp target exit data nowait map(release: x[0:10], c) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target exit data nowait map(from: c) depend(in : argc, argv[i:argc], x[:]) map(release: d)
+
+#pragma omp target exit data depend(in : argc, argv[i:argc], x[:]) nowait map(always,release: e)
+
+  return 0;
+}
+
+// CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
+// CHECK-NEXT: int i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) if(b > e)
+// CHECK-NEXT: #pragma omp target exit data map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) depend(in : argc,argv[i:argc],x[:]) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(always,release: e)
+// CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
+// CHECK-NEXT: char i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) if(b > e)
+// CHECK-NEXT: #pragma omp target exit data map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) depend(in : argc,argv[i:argc],x[:]) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(always,release: e)
+// CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) if(b > e)
+// CHECK-NEXT: #pragma omp target exit data map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) depend(in : argc,argv[i:argc],x[:]) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(always,release: e)
+
+int main (int argc, char **argv) {
+  int b = argc, i, c, d, e, f, g, x[20];
+  static int a;
+// CHECK: static int a;
+
+#pragma omp target exit data map(from: a)
+// CHECK:      #pragma omp target exit data map(from: a)
+  a=2;
+// CHECK-NEXT: a = 2;
+#pragma omp target exit data map(from: a) if (target exit data: b)
+// CHECK: #pragma omp target exit data map(from: a) if(target exit data: b)
+
+#pragma omp target exit data map(from: a) if (b > g)
+// CHECK: #pragma omp target exit data map(from: a) if(b > g)
+
+#pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+
+#pragma omp target exit data map(release: c) if(b>g)
+// CHECK-NEXT: #pragma omp target exit data map(release: c) if(b > g)
+
+#pragma omp target exit data map(from: x[0:10], c)
+// CHECK-NEXT: #pragma omp target exit data map(from: x[0:10],c)
+
+#pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+
+#pragma omp target exit data map(always, delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+
+#pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+
+#pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+
+#pragma omp target exit data nowait map(from: a)
+// CHECK:      #pragma omp target exit data nowait map(from: a)
+
+#pragma omp target exit data nowait map(from: a) if (target exit data: b)
+// CHECK: #pragma omp target exit data nowait map(from: a) if(target exit data: b)
+
+#pragma omp target exit data map(from: a) if (b > g) nowait
+// CHECK: #pragma omp target exit data map(from: a) if(b > g) nowait
+
+#pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+
+#pragma omp target exit data map(release: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target exit data map(release: c) nowait if(b > g)
+
+#pragma omp target exit data nowait map(from: x[0:10], c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: x[0:10],c)
+
+#pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+
+#pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+
+#pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: a)
+// CHECK:      #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: a)
+
+#pragma omp target exit data nowait map(from: a) depend(in : argc,argv[i:argc],x[:]) if (target exit data: b)
+// CHECK: #pragma omp target exit data nowait map(from: a) depend(in : argc,argv[i:argc],x[:]) if(target exit data: b)
+
+#pragma omp target exit data map(from: a) if (b > g) nowait depend(in : argc,argv[i:argc],x[:])
+// CHECK: #pragma omp target exit data map(from: a) if(b > g) nowait depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+
+#pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) map(release: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) map(release: c) nowait if(b > g)
+
+#pragma omp target exit data nowait map(from: x[0:10], c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+
+#pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(always,release: e)
+
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
+}
+
+#endif
diff --git a/test/OpenMP/target_exit_data_codegen.cpp b/test/OpenMP/target_exit_data_codegen.cpp
new file mode 100644
index 0000000..d3a3859
--- /dev/null
+++ b/test/OpenMP/target_exit_data_codegen.cpp
@@ -0,0 +1,221 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 32]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 38]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 32, i32 16]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00: constant non-zero 'if' clause (1+3-5), run-time device id, map(from:) of the global array gc.
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data if(1+3-5) device(arg) map(from: gc)
+  {++arg;}
+
+  // Region 01: the 'if' clause is the constant 1+3-4 (zero); only the increment that follows is checked.
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(release: la) if(arg)
+  {++arg;}
+
+  // Region 02: run-time 'if(arg)' guards the call via then/else/end blocks; the device id is the constant 4.
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_end(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(release: arg) if(arg) device(4)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 03: map(always, from:) of the variable-length array lb; its byte size is computed at run time.
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(always, from: lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04: map(release:) of the pointer-member section gb.b[:3]; two map entries are checked.
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(release: gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00: map(always, release:) of the member section b[1:3], guarded by if(a>123), on device(arg).
+    #pragma omp target exit data map(always, release: b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 36, i32 20]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2-NOT: __tgt_target_data_begin
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+// OpenMP is enabled on every run above but no -fopenmp-targets is passed, so no offload runtime call may appear.
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) {
+  // CK3-NOT: tgt_target_data_begin
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3-NOT: tgt_target_data_end
+  // CK3: ret
+  #pragma omp target exit data map(from: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_exit_data_depend_messages.cpp b/test/OpenMP/target_exit_data_depend_messages.cpp
new file mode 100644
index 0000000..cdefbee
--- /dev/null
+++ b/test/OpenMP/target_exit_data_depend_messages.cpp
@@ -0,0 +1,166 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+class vector { // Class type with an overloaded operator[]; subscripts on it are rejected inside depend() below.
+  public:
+    int operator[](int index) { return 0; }
+};
+
+template <class T, class S, class R>
+int tmain(T argc, S **argv, R *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  char *arr;
+
+  int i;
+  #pragma omp target exit data map(from: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0])
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : tmain) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[-1:0])
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : arr[0])
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv, char *env[]) { // Non-template driver: repeats the depend-clause cases on concrete types, then instantiates tmain.
+  vector vec; // `vector` is declared earlier in this file (not visible here); vec[1] and vec[1:2] are rejected below.
+  typedef float V __attribute__((vector_size(16)));
+  V a; // GCC vector-typed object; both a[0] and a[0:1] are rejected as depend items below.
+  char *arr; // Plain pointer; arr[0] is accepted as a depend item at the end of the sequence.
+
+  int i; // Mapped with map(from: i) on every directive, so only the depend clause varies.
+  #pragma omp target exit data map(from: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0])
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[-1:0])
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : arr[0])
+  foo();
+
+  return tmain(argc, argv, env); // expected-note {{in instantiation of function template specialization 'tmain<int, char, char>' requested here}}
+}
diff --git a/test/OpenMP/target_exit_data_device_messages.cpp b/test/OpenMP/target_exit_data_device_messages.cpp
new file mode 100644
index 0000000..d9ce0dd
--- /dev/null
+++ b/test/OpenMP/target_exit_data_device_messages.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty helper; main calls it once after the directive sequence.
+}
+
+bool foobool(int argc) { // Helper returning its int argument as a bool.
+  return argc; // Implicit int-to-bool conversion: any non-zero value yields true.
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Exercises parsing and semantic diagnostics for the device clause of 'target exit data'.
+  int i; // Mapped with map(from: i) on every directive, so only the device clause varies.
+  #pragma omp target exit data map(from: i) device // expected-error {{expected '(' after 'device'}}
+  #pragma omp target exit data map(from: i) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) device () // expected-error {{expected expression}}
+  #pragma omp target exit data map(from: i) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+#pragma omp target exit data map(from: i) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target exit data map(from: i) device (argc + argc)
+  #pragma omp target exit data map(from: i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'device' clause}}
+  #pragma omp target exit data map(from: i) device (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target exit data map(from: i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  #pragma omp target exit data map(from: i) device (-10u)
+  #pragma omp target exit data map(from: i) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  foo(); // Single ordinary statement after the directive sequence.
+
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_if_messages.cpp b/test/OpenMP/target_exit_data_if_messages.cpp
new file mode 100644
index 0000000..cc674e6
--- /dev/null
+++ b/test/OpenMP/target_exit_data_if_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty helper; main calls it once after the directive sequence.
+}
+
+bool foobool(int argc) { // Helper used as an if-clause condition in main.
+  return argc; // Implicit int-to-bool conversion: any non-zero value yields true.
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Exercises the if clause of 'target exit data', including directive-name modifiers.
+  int i; // Mapped with map(from: i) on every directive, so only the if clause varies.
+  #pragma omp target exit data map(from: i) if // expected-error {{expected '(' after 'if'}}
+  #pragma omp target exit data map(from: i) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if () // expected-error {{expected expression}}
+  #pragma omp target exit data map(from: i) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) if (argc > 0 ? argv[1] : argv[2])
+  #pragma omp target exit data map(from: i) if (argc + argc)
+  #pragma omp target exit data map(from: i) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'if' clause}}
+  #pragma omp target exit data map(from: i) if (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target exit data map(from: i) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target data : true) // expected-error {{directive name modifier 'target data' is not allowed for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(from: i) if(target exit data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target exit data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc)
+  #pragma omp target exit data map(from: i) if(target exit data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc) if (target exit data:argc) // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'if' clause with 'target exit data' name modifier}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  foo(); // Single ordinary statement after the directive sequence.
+
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_map_messages.c b/test/OpenMP/target_exit_data_map_messages.c
new file mode 100644
index 0000000..a9953fb
--- /dev/null
+++ b/test/OpenMP/target_exit_data_map_messages.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - -x c++ %s
+
+int main(int argc, char **argv) { // Exercises which map types 'target exit data' accepts; the RUN lines compile this as both C and C++.
+
+  int r; // The only mapped variable; each directive below changes just the map type or modifier.
+  #pragma omp target exit data // expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+
+  #pragma omp target exit data map(r) // expected-error {{map type must be specified for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target exit data'}}
+
+  #pragma omp target exit data map(always, from: r)
+  #pragma omp target exit data map(delete: r)
+  #pragma omp target exit data map(release: r)
+  #pragma omp target exit data map(always, alloc: r) // expected-error {{map type 'alloc' is not allowed for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(to: r) // expected-error {{map type 'to' is not allowed for '#pragma omp target exit data'}}
+
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_nowait_messages.cpp b/test/OpenMP/target_exit_data_nowait_messages.cpp
new file mode 100644
index 0000000..cd743d8
--- /dev/null
+++ b/test/OpenMP/target_exit_data_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+int main(int argc, char **argv) { // Exercises placement, argument handling and multiplicity of the nowait clause on 'target exit data'.
+  int i; // Mapped with map(from: i) on each directive below.
+
+  #pragma omp nowait target exit data map(from: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target nowait exit data map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  #pragma omp target exit nowait data map(from: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target exit data nowait() map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(from: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) nowait device (-10u)
+  #pragma omp target exit data map(from: i) nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) nowait nowait // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'nowait' clause}}
+  #pragma omp target exit data nowait map(from: i) nowait // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'nowait' clause}}
+  return 0;
+}
diff --git a/test/OpenMP/target_firstprivate_codegen.cpp b/test/OpenMP/target_firstprivate_codegen.cpp
new file mode 100644
index 0000000..ca459e0
--- /dev/null
+++ b/test/OpenMP/target_firstprivate_codegen.cpp
@@ -0,0 +1,580 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx, typename ty>
+struct TT{ // Two-member aggregate; foo() below declares a TT<long long, char> and lists it in firstprivate.
+  tx X; // First member, of type tx.
+  ty Y; // Second member, of type ty.
+};
+
+// CHECK:  [[TT:%.+]] = type { i64, i8 }
+// CHECK:  [[S1:%.+]] = type { double }
+
+// TCHECK:  [[TT:%.+]] = type { i64, i8 }
+// TCHECK:  [[S1:%.+]] = type { double }
+
+// CHECK-DAG:  [[SIZET:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 4]
+// CHECK:  [[MAPT:@.+]] = private unnamed_addr constant [1 x i32] [i32 288]
+// CHECK-DAG:  [[MAPT2:@.+]] = private unnamed_addr constant [9 x i32] [i32 288, i32 161, i32 288, i32 161, i32 161, i32 288, i32 288, i32 161, i32 161]
+// CHECK-DAG:  [[SIZET3:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] zeroinitializer
+// CHECK-DAG:  [[MAPT3:@.+]] = private unnamed_addr constant [1 x i32] [i32 32]
+// CHECK-DAG:  [[MAPT4:@.+]] = private unnamed_addr constant [5 x i32] [i32 35, i32 288, i32 288, i32 288, i32 161]
+// CHECK-DAG:  [[SIZET5:@.+]] = private unnamed_addr constant [3 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 1, i[[SZ]] 40]
+// CHECK-DAG:  [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 288, i32 288, i32 161]
+// CHECK-DAG:  [[SIZET6:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 40]
+// CHECK-DAG:  [[MAPT6:@.+]] = private unnamed_addr constant [2 x i32] [i32 288, i32 161]
+
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n, double *ptr) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  
+  #pragma omp target firstprivate(a)
+  {
+  }
+
+  // a is passed by value to tgt_target
+  // CHECK:  [[N_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // CHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[A2:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[B:%.+]] = alloca [10 x float],
+  // CHECK:  [[SSTACK:%.+]] = alloca i8*,
+  // CHECK:  [[C:%.+]] = alloca [5 x [10 x double]],
+  // CHECK:  [[D:%.+]] = alloca [[TT]],
+  // CHECK:  [[ACAST:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  {{.+}} = alloca i{{[0-9]+}},
+  // CHECK:  [[BASE_PTR_ARR:%.+]] = alloca [1 x i8*],
+  // CHECK:  [[PTR_ARR:%.+]] = alloca [1 x i8*],
+  // CHECK:  [[A2CAST:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[BASE_PTR_ARR2:%.+]] = alloca [9 x i8*],
+  // CHECK:  [[PTR_ARR2:%.+]] = alloca [9 x i8*],
+  // CHECK:  [[SIZET2:%.+]] = alloca [9 x i{{[0-9]+}}],
+  // CHECK:  [[BASE_PTR_ARR3:%.+]] = alloca [1 x i8*],
+  // CHECK:  [[PTR_ARR3:%.+]] = alloca [1 x i8*],  
+  // CHECK:  [[N_ADDR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]],
+  // CHECK-64:  [[N_EXT:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL]] to i{{[0-9]+}}
+  // CHECK:  [[SSAVE_RET:%.+]] = call i8* @llvm.stacksave()
+  // CHECK:  store i8* [[SSAVE_RET]], i8** [[SSTACK]],
+  // CHECK-64:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_EXT]],
+  // CHECK-32:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_ADDR_VAL]],  
+  // CHECK:  [[N_ADDR_VAL2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]],
+  // CHECK-64:  [[N_EXT2:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL2]] to i{{[0-9]+}}
+  // CHECK-64:  [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]]
+  // CHECK-32:  [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]]
+  // CHECK:  [[CN_VLA:%.+]] = alloca double, i{{[0-9]+}} [[CN_SIZE]],
+  // CHECK:  [[AVAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A]],
+  // CHECK-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ACAST]] to i{{[0-9]+}}*
+  // CHECK-64:  store i{{[0-9]+}} [[AVAL]], i{{[0-9]+}}* [[CONV]],
+  // CHECK-32:  store i{{[0-9]+}} [[AVAL]], i{{[0-9]+}}* [[ACAST]],
+  // CHECK:  [[ACAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ACAST]],
+  // CHECK:  [[ACAST_TOPTR:%.+]] = inttoptr i{{[0-9]+}} [[ACAST_VAL]] to i8*
+  // CHECK:  [[BASE_PTR_GEP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[ACAST_TOPTR]], i8** [[BASE_PTR_GEP]],
+  // CHECK:  [[ACAST_TOPTR2:%.+]] = inttoptr i{{[0-9]+}} [[ACAST_VAL]] to i8*
+  // CHECK:  [[PTR_GEP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[ACAST_TOPTR2]], i8** [[PTR_GEP]],
+  // CHECK:  [[BASE_PTR_GEP_ARG:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  {{.+}} = call i32 @__tgt_target(i32 -1, {{.+}}, i32 1, i8** [[BASE_PTR_GEP_ARG]], i8** [[PTR_GEP_ARG]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT]], i32 0, i32 0))
+  
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]])
+  // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+  // TCHECK-NOT: store i{{[0-9]+}} %
+  // TCHECK:  ret void  
+
+#pragma omp target firstprivate(aa,b,bn,c,cn,d)
+  {
+    aa += 1;
+    b[2] = 1.0;
+    bn[3] = 1.0;
+    c[1][2] = 1.0;
+    cn[1][3] = 1.0;
+    d.X = 1;
+    d.Y = 1;    
+  }
+
+  // CHECK:  [[A2VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2]],
+  // CHECK:  [[A2CASTCONV:%.+]] = bitcast i{{[0-9]+}}* [[A2CAST]] to i{{[0-9]+}}*
+  // CHECK:  store i{{[0-9]+}} [[A2VAL]], i{{[0-9]+}}* [[A2CASTCONV]],
+  // CHECK:  [[A2CAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2CAST]],
+  // CHECK-64:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_EXT]], 4
+  // CHECK-32:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_ADDR_VAL]], 4  
+  // CHECK-64:  [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]]
+  // CHECK-32:  [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]]
+  // CHECK:  [[CN_SIZE_2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SIZE_1]], 8
+
+  // firstprivate(aa) --> base_ptr = aa, ptr = aa, size = 2 (short)
+  // CHECK:  [[A2CAST_TO_INT:%.+]] = inttoptr i{{[0-9]+}} [[A2CAST_VAL]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A2CAST_TO_INT]], i8** [[BASE_PTR_GEP2_0]],
+  // CHECK:  [[A2CAST_TO_INT_2:%.+]] = inttoptr i{{[0-9]+}} [[A2CAST_VAL]] to i8*
+  // CHECK:  [[PTR_GEP2_0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A2CAST_TO_INT_2]], i8** [[PTR_GEP2_0]],
+  // CHECK:  [[SIZE_GEPA2:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIZE_GEPA2]],
+
+  // firstprivate(b): base_ptr = &b[0], ptr = &b[0], size = 40 (sizeof(float)*10)
+  // CHECK:  [[BCAST:%.+]] = bitcast [10 x float]* [[B]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[BCAST]], i8** [[BASE_PTR_GEP2_1]],
+  // CHECK:  [[BCAST2:%.+]] = bitcast [10 x float]* [[B]] to i8*
+  // CHECK:  [[PTR_GEP2_1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[BCAST2]], i8** [[PTR_GEP2_1]],
+  // CHECK:  [[SIZE_GEPB:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i{{[0-9]+}} 40, i{{[0-9]+}}* [[SIZE_GEPB]],
+
+  // firstprivate(bn), 2 entries, n and bn: (1) base_ptr = n, ptr = n, size = 8 ; (2) base_ptr = &bn[0], ptr = &bn[0], size = n*sizeof(float)
+  // CHECK-64:  [[N_EXT3_1:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT]] to i8*
+  // CHECK-32:  [[N_EXT3_1:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[N_EXT3_1]], i8** [[BASE_PTR_GEP2_2]],
+  // CHECK-64:  [[N_EXT3_2:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT]] to i8*
+  // CHECK-32:  [[N_EXT3_2:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL]] to i8*
+  // CHECK:  [[PTR_GEP2_2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[N_EXT3_2]], i8** [[PTR_GEP2_2]],
+  // CHECK:  [[SIZE_GEPBN_1:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i{{[0-9]+}} {{[0-9]}}, i{{[0-9]+}}* [[SIZE_GEPBN_1]],
+  // CHECK:  [[VLABN_BCAST:%.+]] = bitcast float* [[BN_VLA]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i8* [[VLABN_BCAST]], i8** [[BASE_PTR_GEP2_3]],
+  // CHECK: [[SIZE_GEPBN_3:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i{{[0-9]+}} [[BN_SIZE]], i{{[0-9]+}}* [[SIZE_GEPBN_3]]
+  
+  // firstprivate(c): base_ptr = &c[0], ptr = &c[0], size = 400 (5*10*sizeof(double))
+  // CHECK:  [[C_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[C_BCAST]], i8** [[BASE_PTR_GEP2_4]],
+  // CHECK:  [[C_BCAST2:%.+]] = bitcast [5 x [10 x double]]* [[C]] to i8*
+  // CHECK:  [[PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[C_BCAST2]], i8** [[PTR_GEP2_4]],
+  // CHECK:  [[SIZE_GEPC_4:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i{{[0-9]+}} 400, i{{[0-9]+}}* [[SIZE_GEPC_4]],
+  
+  // firstprivate(cn), 3 entries, 5, n, cn: (1) base_ptr = 5, ptr = 5, size = 8; (2) base_ptr = n, ptr = n, size = 8; (3) base_ptr = &cn[0], ptr = &cn[0], size = 5*n*sizeof(double)
+  // CHECK:  [[BASE_PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 5 to i8*), i8** [[BASE_PTR_GEP2_5]],
+  // CHECK:  [[PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 5 to i8*), i8** [[PTR_GEP2_5]],
+  // CHECK:  [[SIZE_GEPCN_5:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
+  // CHECK:  store i{{[0-9]+}} {{[0-9]}}, i{{[0-9]+}}* [[SIZE_GEPCN_5]],
+  // CHECK-64:  [[CN_SZ_2_1:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT2]] to i8*
+  // CHECK-32:  [[CN_SZ_2_1:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL2]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6
+  // CHECK:  store i8* [[CN_SZ_2_1]], i8** [[BASE_PTR_GEP2_6]],
+  // CHECK-64:  [[CN_SZ_2_2:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT2]] to i8*
+  // CHECK-32:  [[CN_SZ_2_2:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL2]] to i8*
+  // CHECK:  [[PTR_GEP2_6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6
+  // CHECK:  store i8* [[CN_SZ_2_2]], i8** [[PTR_GEP2_6]],
+  // CHECK:  [[SIZE_GEPCN_6:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6
+  // CHECK:  store i{{[0-9]+}} {{[0-9]}}, i{{[0-9]+}}* [[SIZE_GEPCN_6]],
+  // CHECK:  [[VLA_CN_BCAST:%.+]] = bitcast double* [[CN_VLA]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
+  // CHECK:  store i8* [[VLA_CN_BCAST]], i8** [[BASE_PTR_GEP2_7]],
+  // CHECK:  [[VLA_CN_BCAST2:%.+]] = bitcast double* [[CN_VLA]] to i8*
+  // CHECK:  [[PTR_GEP2_7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
+  // CHECK:  store i8* [[VLA_CN_BCAST2]], i8** [[PTR_GEP2_7]],
+  // CHECK:  [[SIZE_GEPCN_7:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
+  // CHECK:  store i{{[0-9]+}} [[CN_SIZE_2]], i{{[0-9]+}}* [[SIZE_GEPCN_7]],
+  
+  // firstprivate(d): base_ptr = &d, ptr = &d, size = 16 
+  // CHECK:  [[D_REF:%.+]] = bitcast [[TT]]* [[D]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
+  // CHECK:  store i8* [[D_REF]], i8** [[BASE_PTR_GEP2_8]],
+  // CHECK:  [[D_REF2:%.+]] = bitcast [[TT]]* [[D]] to i8*
+  // CHECK:  [[PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
+  // CHECK:  store i8* [[D_REF2]], i8** [[PTR_GEP2_8]],
+  // CHECK:  [[SIZE_GEPCN_8:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
+  // CHECK:  store i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}}* [[SIZE_GEPCN_8]],
+  
+  
+  // CHECK:  [[BASE_PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[SIZES_ARG2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[SIZET2]],  i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK: {{.+}} = call i32 @__tgt_target(i32 -1, {{.+}}, i32 9, i8** [[BASE_PTR_GEP_ARG2]], i8** [[PTR_GEP_ARG2]], i[[SZ]]* [[SIZES_ARG2]], i32* getelementptr inbounds ([9 x i32], [9 x i32]* [[MAPT2]], i32 0, i32 0))
+  
+  // make sure that firstprivate variables are generated in all cases and that we use those instances for operations inside the
+  // target region
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A2_IN:%.+]], [10 x float]* {{.+}} [[B_IN:%.+]], i{{[0-9]+}} [[BN_SZ:%.+]], float* {{.+}} [[BN_IN:%.+]], [5 x [10 x double]]* {{.+}} [[C_IN:%.+]], i{{[0-9]+}} [[CN_SZ1:%.+]], i{{[0-9]+}} [[CN_SZ2:%.+]], double* {{.+}} [[CN_IN:%.+]], [[TT]]* {{.+}} [[D_IN:%.+]])
+  // TCHECK:  [[A2_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[B_ADDR:%.+]] = alloca [10 x float]*,
+  // TCHECK:  [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[BN_ADDR:%.+]] = alloca float*,
+  // TCHECK:  [[C_ADDR:%.+]] = alloca [5 x [10 x double]]*,
+  // TCHECK:  [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR4:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[CN_ADDR:%.+]] = alloca double*,
+  // TCHECK:  [[D_ADDR:%.+]] = alloca [[TT]]*,
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  [[B_PRIV:%.+]] = alloca [10 x float],
+  // TCHECK:  [[SSTACK:%.+]] = alloca i8*,
+  // TCHECK:  [[C_PRIV:%.+]] = alloca [5 x [10 x double]],
+  // TCHECK:  [[D_PRIV:%.+]] = alloca [[TT]],
+  // TCHECK:  store i{{[0-9]+}} [[A2_IN]], i{{[0-9]+}}* [[A2_ADDR]],
+  // TCHECK:  store [10 x float]* [[B_IN]], [10 x float]** [[B_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[BN_SZ]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  store float* [[BN_IN]], float** [[BN_ADDR]],
+  // TCHECK:  store [5 x [10 x double]]* [[C_IN]], [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[CN_SZ1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  store i{{[0-9]+}} [[CN_SZ2]], i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  store double* [[CN_IN]], double** [[CN_ADDR]],
+  // TCHECK:  store [[TT]]* [[D_IN]], [[TT]]** [[D_ADDR]],
+  // TCHECK:  [[CONV_A2ADDR:%.+]] = bitcast i{{[0-9]+}}* [[A2_ADDR]] to i{{[0-9]+}}*
+  // TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x float]*, [10 x float]** [[B_ADDR]],
+  // TCHECK:  [[BN_SZ_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  [[BN_ADDR_REF:%.+]] = load float*, float** [[BN_ADDR]],
+  // TCHECK:  [[C_ADDR_REF:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  [[CN_SZ1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  [[CN_SZ2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  [[CN_ADDR_REF:%.+]] = load double*, double** [[CN_ADDR]],
+  // TCHECK:  [[D_ADDR_REF:%.+]] = load [[TT]]*, [[TT]]** [[D_ADDR]],
+
+  // firstprivate(aa): a_priv = a_in
+  // TCHECK-NOT:  store i{{[0-9]+}} %
+
+  //  firstprivate(b): memcpy(b_priv,b_in)
+  // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
+  // TCHECK:  [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+
+  // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK:  store i8* [[RET_STACK]], i8** [[SSTACK]],
+
+  // firstprivate(bn)
+  // TCHECK:  [[BN_PRIV:%.+]] = alloca float, i{{[0-9]+}} [[BN_SZ_VAL]],
+  // TCHECK:  [[BN_COPY_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[BN_SZ_VAL]], 4
+  // TCHECK:  [[BN_PRIV__BCAST:%.+]] = bitcast float* [[BN_PRIV]] to i8*
+  // TCHECK:  [[BN_REF_IN_BCAST:%.+]] = bitcast float* [[BN_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[BN_PRIV__BCAST]], i8* [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}})
+
+  // firstprivate(c)
+  // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
+  // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  
+  // firstprivate(cn)
+  // TCHECK:  [[CN_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
+  // TCHECK:  [[CN_PRIV:%.+]] = alloca double, i{{[0-9]+}} [[CN_SZ]],
+  // TCHECK:  [[CN_SZ2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
+  // TCHECK:  [[CN_SZ2_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ2]], 8
+  // TCHECK:  [[CN_PRIV_BCAST:%.+]] = bitcast double* [[CN_PRIV]] to i8*
+  // TCHECK:  [[CN_IN_BCAST:%.+]] = bitcast double* [[CN_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[CN_PRIV_BCAST]], i8* [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
+  
+  // firstprivate(d)
+  // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
+  // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+
+  
+  #pragma omp target firstprivate(ptr)
+  {
+    ptr[0]++;
+  }
+  // CHECK:  [[PTR_ADDR_REF:%.+]] = load double*, double** [[PTR_ADDR]],
+  // CHECK:  [[PTR_ADDR_BCAST:%.+]] = bitcast double* [[PTR_ADDR_REF]] to i8*
+
+  // CHECK:  [[BASE_PTR_GEP3_0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[PTR_ADDR_BCAST]], i8** [[BASE_PTR_GEP3_0]],
+  // CHECK:  [[PTR_ADDR_BCAST2:%.+]] = bitcast double* [[PTR_ADDR_REF]] to i8*
+  // CHECK:  [[PTR_GEP3_0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[PTR_ADDR_BCAST2]], i8** [[PTR_GEP3_0]],
+
+  // CHECK:  [[BASE_PTR_GEP_ARG3:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG3:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK: {{.+}} = call i32 @__tgt_target(i32 -1, {{.+}}, i32 1, i8** [[BASE_PTR_GEP_ARG3]], i8** [[PTR_GEP_ARG3]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT3]], i32 0, i32 0))
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]])
+  // TCHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // TCHECK-NOT: alloca double*,
+  // TCHECK:  store double* [[PTR_IN]], double** [[PTR_ADDR]],
+  // TCHECK-NOT: store double* %
+
+  return a;
+}
+
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  tx b[10];
+
+#pragma omp target firstprivate(a,b)
+  {
+    a += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  char aaa = 0;
+  int b[10];
+
+#pragma omp target firstprivate(a,aaa,b)
+  {
+    a += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[A3_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A3_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT: alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store i{{[0-9]+}} [[A3_IN]], i{{[0-9]+}}* [[A3_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[A3_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A3_ADDR]] to i8*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a): a_priv = a_in
+
+// firstprivate(aaa)
+// TCHECK-NOT:  store i{{[0-9]+}} %
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK:  ret void
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n];
+
+#pragma omp target firstprivate(b,c)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a;
+    }
+
+    return c[1][1] + (int)b;
+  }
+
+  // on the host side, we first generate r1, then the static function and the template above
+  // CHECK:  define{{.+}} i32 {{.+}}([[S1]]* {{.+}}, i{{[0-9]+}} {{.+}})
+  // CHECK:  [[BASE_PTRS4:%.+]] = alloca [5 x i8*],
+  // CHECK:  [[PTRS4:%.+]] = alloca [5 x i8*],
+  // CHECK:  [[SIZET4:%.+]] = alloca [5 x i{{[0-9]+}}],
+
+  // map(this): the 'this' pointer is implicitly captured (not a firstprivate matter)
+  // CHECK:  {{.+}} = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store {{.+}}, {{.+}},
+  // CHECK:  {{.+}} = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store {{.+}}, {{.+}},
+  // CHECK:  {{.+}} getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store {{.+}}, {{.+}}
+
+  // firstprivate(b): base_ptr = b, ptr = b, size = 4 (pass by-value)
+  // CHECK:  [[B_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[B_CAST:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP4_1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[B_CAST_PTR]], i8** [[BASE_PTRS_GEP4_1]],
+  // CHECK:  [[B_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[B_CAST:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP4_1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[B_CAST_PTR2]], i8** [[PTRS_GEP4_1]],
+  // CHECK:  [[SIZES_GEP4_1:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIZES_GEP4_1]],
+
+  // firstprivate(c), 3 entries: 2, n, c
+  // CHECK:  [[BASE_PTRS_GEP4_2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 2 to i8*), i8** [[BASE_PTRS_GEP4_2]],
+  // CHECK:  [[PTRS_GEP4_2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 2 to i8*), i8** [[PTRS_GEP4_2]],
+  // CHECK:  [[SIZES_GEP4_2:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK-64:  store i{{[0-9]+}} 8, i{{[0-9]+}}* [[SIZES_GEP4_2]],
+  // CHECK-32:  store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIZES_GEP4_2]],
+  // CHECK:  [[N_PTR:%.+]] = inttoptr i{{[0-9]+}} [[N:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP4_3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i8* [[N_PTR]], i8** [[BASE_PTRS_GEP4_3]],
+  // CHECK:  [[N_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[N:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP4_3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i8* [[N_PTR2]], i8** [[PTRS_GEP4_3]],
+  // CHECK:  [[SIZES_GEP4_3:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK-64:  store i{{[0-9]+}} 8, i{{[0-9]+}}* [[SIZES_GEP4_3]],
+  // CHECK-32:  store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIZES_GEP4_3]],
+  // CHECK:  [[B_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[B:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP4_4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[B_BCAST]], i8** [[BASE_PTRS_GEP4_4]],
+  // CHECK:  [[B_BCAST2:%.+]] = bitcast i{{[0-9]+}}* [[B:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP4_4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[B_BCAST2]], i8** [[PTRS_GEP4_4]],
+  // CHECK:  [[SIZES_GEP4_4:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i{{[0-9]+}} [[B_SIZE:%.+]], i{{[0-9]+}}* [[SIZES_GEP4_4]],
+
+  // only check that we use the map types stored in the global variable
+  // CHECK:  call i32 @__tgt_target(i32 -1, {{.+}}, i32 5, i8** {{.+}}, i8** {{.+}}, i{{[0-9]+}}* {{.+}}, i32* getelementptr inbounds ([5 x i32], [5 x i32]* [[MAPT4]], i32 0, i32 0))
+  
+  // TCHECK: define void @__omp_offloading_{{.+}}([[S1]]* [[TH:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[VLA:%.+]], i{{[0-9]+}} [[VLA1:%.+]], i{{[0-9]+}}{{.+}} [[C_IN:%.+]])
+  // TCHECK:  [[TH_ADDR:%.+]] = alloca [[S1]]*,
+  // TCHECK:  [[B_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[C_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  [[SSTACK:%.+]] = alloca i8*,
+
+  // TCHECK:  store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[B_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  store i{{[0-9]+}}* [[C_IN]], i{{[0-9]+}}** [[C_ADDR]],
+  // TCHECK:  [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]],
+  // TCHECK-64:  [[B_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[B_ADDR]] to i{{[0-9]+}}*
+  // TCHECK:  [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  [[C_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_ADDR]],
+
+  // firstprivate(b)
+  // TCHECK-NOT:  store i{{[0-9]+}} %
+ 
+  // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK:  store i8* [[RET_STACK:%.+]], i8** [[SSTACK]],
+
+  // firstprivate(c)
+  // TCHECK:  [[C_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]]
+  // TCHECK:  [[C_PRIV:%.+]] = alloca i{{[0-9]+}}, i{{[0-9]+}} [[C_SZ]],
+  // TCHECK:  [[C_SZ2:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]]
+  // TCHECK:  [[C_SZ_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[C_SZ2]],  2
+  // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_PRIV]] to i8*
+  // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+
+  // finish
+  // TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
+  // TCHECK: call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
+  // TCHECK: ret void
+
+
+  // static host function
+  // CHECK:  define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}})
+  // CHECK:  [[BASE_PTRS5:%.+]] = alloca [3 x i8*],
+  // CHECK:  [[PTRS5:%.+]] = alloca [3 x i8*],
+
+  // firstprivate(a): by value
+  // CHECK:  [[A_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[A_CAST:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP5_0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A_CAST_PTR]], i8** [[BASE_PTRS_GEP5_0]],
+  // CHECK:  [[A_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[A_CAST:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP5_0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A_CAST_PTR2]], i8** [[PTRS_GEP5_0]],
+
+  // firstprivate(aaa): by value
+  // CHECK:  [[A3_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[A3_CAST:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP5_1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[A3_CAST_PTR]], i8** [[BASE_PTRS_GEP5_1]],
+  // CHECK:  [[A3_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[A3_CAST:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP5_1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[A3_CAST_PTR2]], i8** [[PTRS_GEP5_1]],
+
+  // firstprivate(b): base_ptr = &b[0], ptr = &b[0]
+  // CHECK:  [[B_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP5_2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[B_BCAST]], i8** [[BASE_PTRS_GEP5_2]],
+  // CHECK:  [[B_BCAST2:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP5_2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[B_BCAST2]], i8** [[PTRS_GEP5_2]],
+
+  // only check that the right sizes and map types are used
+  // CHECK:  call i32 @__tgt_target(i32 -1, {{.+}}, i32 3, i8** {{.+}}, i8** {{.+}}, i[[SZ]]* getelementptr inbounds ([3 x i[[SZ]]], [3 x i[[SZ]]]* [[SIZET5]], i32 0, i32 0), i32* getelementptr inbounds ([3 x i32], [3 x i32]* [[MAPT5]], i32 0, i32 0))
+};
+
+
+
+int bar(int n, double *ptr){
+  int a = 0;
+  a += foo(n, ptr);
+  S1 S;
+  a += S.r1(n);
+  a += fstatic(n);
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+// template host and device
+
+// CHECK:  define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}})
+// CHECK:  [[BASE_PTRS6:%.+]] = alloca [2 x i8*],
+// CHECK:  [[PTRS6:%.+]] = alloca [2 x i8*],
+
+// firstprivate(a): by value
+// CHECK:  [[AT_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[AT_CAST:%.+]] to i8*
+// CHECK:  [[BASE_PTRS_GEP6_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK:  store i8* [[AT_CAST_PTR]], i8** [[BASE_PTRS_GEP6_0]],
+// CHECK:  [[AT_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[AT_CAST:%.+]] to i8*
+// CHECK:  [[PTRS_GEP6_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK:  store i8* [[AT_CAST_PTR2]], i8** [[PTRS_GEP6_0]],
+
+// firstprivate(b): pointer
+// CHECK:  [[B_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+// CHECK:  [[BASE_PTRS_GEP6_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK:  store i8* [[B_BCAST]], i8** [[BASE_PTRS_GEP6_1]],
+// CHECK:  [[B_BCAST2:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+// CHECK:  [[PTRS_GEP6_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK:  store i8* [[B_BCAST2]], i8** [[PTRS_GEP6_1]],
+
+// CHECK:  call i32 @__tgt_target(i32 -1, {{.+}}, i32 2, i8** {{.+}}, i8** {{.+}}, i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET6]], i32 0, i32 0), i32* getelementptr inbounds ([2 x i32], [2 x i32]* [[MAPT6]], i32 0, i32 0))
+
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT: alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a)
+// TCHECK-NOT:  store i{{[0-9]+}} %
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_firstprivate_messages.cpp b/test/OpenMP/target_firstprivate_messages.cpp
new file mode 100644
index 0000000..6dbad67
--- /dev/null
+++ b/test/OpenMP/target_firstprivate_messages.cpp
@@ -0,0 +1,198 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) firstprivate(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+{}
+#pragma omp target firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate() // expected-error {{expected expression}}
+{}
+#pragma omp target firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(argc)
+{}
+#pragma omp target firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+{}
+#pragma omp target firstprivate(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(e, g)
+{}
+#pragma omp target firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel firstprivate(i)
+#pragma omp target firstprivate(j)
+{}
+#pragma omp target firstprivate(i)
+  {}
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp target firstprivate(a)
+  {}
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+{}
+#pragma omp target firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate() // expected-error {{expected expression}}
+{}
+#pragma omp target firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(argc)
+{}
+#pragma omp target firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+{}
+#pragma omp target firstprivate(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(e, g)
+{}
+#pragma omp target firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+{}
+#pragma omp target firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel firstprivate(i)
+#pragma omp target firstprivate(j)
+{}
+#pragma omp target firstprivate(i)
+  {}
+  static int si;
+#pragma omp target firstprivate(si) // OK
+  {}
+#pragma omp target map(i) firstprivate(i) // expected-error {{firstprivate variable cannot be in a map clause in '#pragma omp target' directive}}
+  {}
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_if_messages.cpp b/test/OpenMP/target_if_messages.cpp
index 4ee7302..189256e 100644
--- a/test/OpenMP/target_if_messages.cpp
+++ b/test/OpenMP/target_if_messages.cpp
@@ -12,21 +12,37 @@
 template <class T, class S> // expected-note {{declared here}}
 int tmain(T argc, S **argv) {
   #pragma omp target if // expected-error {{expected '(' after 'if'}}
+  foo();
   #pragma omp target if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if () // expected-error {{expected expression}}
+  foo();
   #pragma omp target if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
   #pragma omp target if (argc > 0 ? argv[1] : argv[2])
+  foo();
   #pragma omp target if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause}}
+  foo();
   #pragma omp target if (S) // expected-error {{'S' does not refer to a value}}
+  foo();
   #pragma omp target if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(argc)
+  foo();
   #pragma omp target if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc)
+  foo();
   #pragma omp target if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target'}}
+  foo();
   #pragma omp target if(target : argc) if (target:argc) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
   #pragma omp target if(target : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
   foo();
 
@@ -35,22 +51,39 @@
 
 int main(int argc, char **argv) {
   #pragma omp target if // expected-error {{expected '(' after 'if'}}
+  foo();
   #pragma omp target if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if () // expected-error {{expected expression}}
+  foo();
   #pragma omp target if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
   #pragma omp target if (argc > 0 ? argv[1] : argv[2])
+  foo();
   #pragma omp target if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause}}
+  foo();
   #pragma omp target if (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
   #pragma omp target if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc)
+  foo();
   #pragma omp target if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target'}}
+  foo();
   #pragma omp target if(target : argc) if (target:argc) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
   #pragma omp target if(target : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
   foo();
 
diff --git a/test/OpenMP/target_is_device_ptr_ast_print.cpp b/test/OpenMP/target_is_device_ptr_ast_print.cpp
new file mode 100644
index 0000000..f519235
--- /dev/null
+++ b/test/OpenMP/target_is_device_ptr_ast_print.cpp
@@ -0,0 +1,294 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct ST {
+  int *a;
+};
+typedef int arr[10];
+typedef ST STarr[10];
+struct SA {
+  const int da[5] = { 0 };
+  ST g[10];
+  STarr &rg = g;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  arr &raa = aa;
+  void func(int arg) {
+#pragma omp target is_device_ptr(k)
+    {}
+#pragma omp target is_device_ptr(z)
+    {}
+#pragma omp target is_device_ptr(aa) // OK
+    {}
+#pragma omp target is_device_ptr(raa) // OK
+    {}
+#pragma omp target is_device_ptr(g) // OK
+    {}
+#pragma omp target is_device_ptr(rg) // OK
+    {}
+#pragma omp target is_device_ptr(da) // OK
+    {}
+  return;
+ }
+};
+// CHECK: struct SA
+// CHECK-NEXT: const int da[5] = {0};
+// CHECK-NEXT: ST g[10];
+// CHECK-NEXT: STarr &rg = this->g;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = this->i;
+// CHECK-NEXT: int *k = &this->j;
+// CHECK-NEXT: int *&z = this->k;
+// CHECK-NEXT: int aa[10];
+// CHECK-NEXT: arr &raa = this->aa;
+// CHECK-NEXT: func(
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->z)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->aa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->raa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->g)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->rg)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->da)
+
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef struct {
+  int a;
+} S6;
+
+template <typename T>
+T tmain(T argc) {
+  const T da[5] = { 0 };
+  S6 h[10];
+  auto &rh = h;
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+  auto &raa = aa;
+#pragma omp target is_device_ptr(k)
+  {}
+#pragma omp target is_device_ptr(z)
+  {}
+#pragma omp target is_device_ptr(aa)
+  {}
+#pragma omp target is_device_ptr(raa)
+  {}
+#pragma omp target is_device_ptr(h)
+  {}
+#pragma omp target is_device_ptr(rh)
+  {}
+#pragma omp target is_device_ptr(da)
+  {}
+  return 0;
+}
+
+// CHECK: template <typename T = int> int tmain(int argc) {
+// CHECK-NEXT: const int da[5] = {0};
+// CHECK-NEXT: S6 h[10];
+// CHECK-NEXT: auto &rh = h;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+// CHECK-NEXT: int aa[10];
+// CHECK-NEXT: auto &raa = aa;
+// CHECK-NEXT: #pragma omp target is_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(z)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(aa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(raa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(h)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(rh)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(da)
+
+// CHECK: template <typename T = int *> int *tmain(int *argc) {
+// CHECK-NEXT: int *const da[5] = {0};
+// CHECK-NEXT: S6 h[10];
+// CHECK-NEXT: auto &rh = h;
+// CHECK-NEXT: int *i;
+// CHECK-NEXT: int *&j = i;
+// CHECK-NEXT: int **k = &j;
+// CHECK-NEXT: int **&z = k;
+// CHECK-NEXT: int *aa[10];
+// CHECK-NEXT: auto &raa = aa;
+// CHECK-NEXT: #pragma omp target is_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(z)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(aa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(raa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(h)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(rh)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(da)
+
+// CHECK-LABEL: int main(int argc, char **argv) {
+int main(int argc, char **argv) {
+  const int da[5] = { 0 };
+  S6 h[10];
+  auto &rh = h;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  auto &raa = aa;
+// CHECK-NEXT: const int da[5] = {0};
+// CHECK-NEXT: S6 h[10];
+// CHECK-NEXT: auto &rh = h;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+// CHECK-NEXT: int aa[10];
+// CHECK-NEXT: auto &raa = aa;
+#pragma omp target is_device_ptr(k)
+// CHECK-NEXT: #pragma omp target is_device_ptr(k)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(z)
+// CHECK-NEXT: #pragma omp target is_device_ptr(z)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(aa)
+// CHECK-NEXT: #pragma omp target is_device_ptr(aa)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(raa)
+// CHECK-NEXT: #pragma omp target is_device_ptr(raa)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(h)
+// CHECK-NEXT: #pragma omp target is_device_ptr(h)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(rh)
+// CHECK-NEXT: #pragma omp target is_device_ptr(rh)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(da)
+// CHECK-NEXT: #pragma omp target is_device_ptr(da)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+  return tmain<int>(argc) + *tmain<int *>(&argc);
+}
+
+
+#endif
diff --git a/test/OpenMP/target_is_device_ptr_codegen.cpp b/test/OpenMP/target_is_device_ptr_codegen.cpp
new file mode 100644
index 0000000..6c80729
--- /dev/null
+++ b/test/OpenMP/target_is_device_ptr_codegen.cpp
@@ -0,0 +1,293 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+double *g;
+
+// CK1: @g = global double*
+// CK1: [[SIZES00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}]
+// CK1: [[TYPES00:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+
+// CK1: [[SIZES01:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}]
+// CK1: [[TYPES01:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+
+// CK1: [[SIZES02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}]
+// CK1: [[TYPES02:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+
+// CK1: [[SIZES03:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}]
+// CK1: [[TYPES03:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+
+// CK1: [[SIZES04:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}]
+// CK1: [[TYPES04:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+
+// CK1: [[SIZES05:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}]
+// CK1: [[TYPES05:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+
+// CK1: [[SIZES06:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] {{8|4}}]
+// CK1: [[TYPES06:@.+]] = {{.+}}constant [2 x i32] [i32 288, i32 288]
+
+// CK1-LABEL: @_Z3foo
+template<typename T>
+void foo(float *&lr, T *&tr) {
+  float *l;
+  T *t;
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES00]]{{.+}}, {{.+}}[[TYPES00]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast double* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast double* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load double*, double** [[ADDR:@g]],
+
+  // CK1: call void [[KERNEL:@.+]](double* [[VAL]])
+  #pragma omp target is_device_ptr(g)
+  {
+    ++g;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES01]]{{.+}}, {{.+}}[[TYPES01]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast float* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast float* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load float*, float** [[ADDR:%.+]],
+
+  // CK1: call void [[KERNEL:@.+]](float* [[VAL]])
+  #pragma omp target is_device_ptr(l)
+  {
+    ++l;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES02]]{{.+}}, {{.+}}[[TYPES02]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast i32* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast i32* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load i32*, i32** [[ADDR:%.+]],
+
+  // CK1: call void [[KERNEL:@.+]](i32* [[VAL]])
+  #pragma omp target is_device_ptr(t)
+  {
+    ++t;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES03]]{{.+}}, {{.+}}[[TYPES03]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast float* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast float* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load float*, float** [[ADDR:%.+]],
+  // CK1-DAG: [[ADDR]] = load float**, float*** [[ADDR2:%.+]],
+
+  // CK1: call void [[KERNEL:@.+]](float* [[VAL]])
+  #pragma omp target is_device_ptr(lr)
+  {
+    ++lr;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES04]]{{.+}}, {{.+}}[[TYPES04]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast i32* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast i32* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load i32*, i32** [[ADDR:%.+]],
+  // CK1-DAG: [[ADDR]] = load i32**, i32*** [[ADDR2:%.+]],
+
+  // CK1: call void [[KERNEL:@.+]](i32* [[VAL]])
+  #pragma omp target is_device_ptr(tr)
+  {
+    ++tr;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES05]]{{.+}}, {{.+}}[[TYPES05]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast i32* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast i32* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load i32*, i32** [[ADDR:%.+]],
+  // CK1-DAG: [[ADDR]] = load i32**, i32*** [[ADDR2:%.+]],
+
+  // CK1: call void [[KERNEL:@.+]](i32* [[VAL]])
+  #pragma omp target is_device_ptr(tr,lr)
+  {
+    ++tr;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 2, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES06]]{{.+}}, {{.+}}[[TYPES06]]{{.+}})
+  // CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK1-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK1-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK1-DAG: [[VALBP]] = bitcast i32* [[VAL:%.+]] to i8*
+  // CK1-DAG: [[VALP]] = bitcast i32* [[VAL]] to i8*
+  // CK1-DAG: [[VAL]] = load i32*, i32** [[ADDR:%.+]],
+  // CK1-DAG: [[ADDR]] = load i32**, i32*** [[ADDR2:%.+]],
+
+  // CK1-DAG: [[_BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 1
+  // CK1-DAG: [[_P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 1
+  // CK1-DAG: store i8* [[_VALBP:%.+]], i8** [[_BP1]],
+  // CK1-DAG: store i8* [[_VALP:%.+]], i8** [[_P1]],
+  // CK1-DAG: [[_VALBP]] = bitcast float* [[_VAL:%.+]] to i8*
+  // CK1-DAG: [[_VALP]] = bitcast float* [[_VAL]] to i8*
+  // CK1-DAG: [[_VAL]] = load float*, float** [[_ADDR:%.+]],
+  // CK1-DAG: [[_ADDR]] = load float**, float*** [[_ADDR2:%.+]],
+
+  // CK1: call void [[KERNEL:@.+]](i32* [[VAL]], float* [[_VAL]])
+  #pragma omp target is_device_ptr(tr,lr)
+  {
+    ++tr,++lr;
+  }
+}
+
+void bar(float *&a, int *&b) {
+  foo<int>(a,b);
+}
+
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { double*, double** }
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK2: [[SIZE01:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] {{8|4}}]
+// CK2: [[MTYPE01:@.+]] = {{.+}}constant [2 x i32] [i32 32, i32 17]
+
+// CK2: [[SIZE02:@.+]] = {{.+}}constant [3 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] {{8|4}}, i[[sz]] {{8|4}}]
+// CK2: [[MTYPE02:@.+]] = {{.+}}constant [3 x i32] [i32 33, i32 0, i32 17]
+
+template <typename T>
+struct ST {
+  T *a;
+  double *&b;
+  ST(double *&b) : a(0), b(b) {}
+
+  // CK2-LABEL: @{{.*}}foo{{.*}}
+  void foo(double *&arg) {
+    int *la = 0;
+
+    // CK2-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%.+]] to i8*
+    // CK2-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+    #pragma omp target is_device_ptr(a)
+    {
+      a++;
+    }
+
+    // CK2-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK2-DAG: [[CPVAL0]] = bitcast double*** [[SEC0:%.+]] to i8*
+    // CK2-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+    // CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK2-DAG: [[CBPVAL1]] = bitcast double*** [[SEC0]] to i8*
+    // CK2-DAG: [[CPVAL1]] = bitcast double** [[SEC1:%.+]] to i8*
+    // CK2-DAG: [[SEC1]] = load double**, double*** [[SEC0]]
+    #pragma omp target is_device_ptr(b)
+    {
+      b++;
+    }
+
+    // CK2-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE02]]{{.+}})
+    // CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK2-DAG: [[CPVAL0]] = bitcast double*** [[SEC0:%.+]] to i8*
+    // CK2-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+    // CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+    // CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+    // CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK2-DAG: [[CBPVAL1]] = bitcast double*** [[SEC0]] to i8*
+    // CK2-DAG: [[CPVAL1]] = bitcast double** [[SEC1:%.+]] to i8*
+    // CK2-DAG: [[SEC1]] = load double**, double*** [[SEC0]]
+
+    // CK2-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK2-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK2-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+    // CK2-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+    // CK2-DAG: [[CBPVAL2]] = bitcast [[ST]]* [[VAR2:%.+]] to i8*
+    // CK2-DAG: [[CPVAL2]] = bitcast double** [[SEC2:%.+]] to i8*
+    // CK2-DAG: [[SEC2]] = getelementptr {{.*}}[[ST]]* [[VAR2]], i{{.+}} 0, i{{.+}} 0
+    #pragma omp target is_device_ptr(a, b)
+    {
+      a++;
+      b++;
+    }
+  }
+};
+
+void bar(double *arg){
+  ST<double> A(arg);
+  A.foo(arg);
+  ++arg;
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_is_device_ptr_messages.cpp b/test/OpenMP/target_is_device_ptr_messages.cpp
new file mode 100644
index 0000000..8cd1426
--- /dev/null
+++ b/test/OpenMP/target_is_device_ptr_messages.cpp
@@ -0,0 +1,268 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fopenmp -ferror-limit 200 %s
+struct ST {
+  int *a;
+};
+typedef int arr[10];
+typedef ST STarr[10];
+struct SA {
+  const int d = 5;
+  const int da[5] = { 0 };
+  ST e;
+  ST g[10];
+  STarr &rg = g;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  arr &raa = aa;
+  void func(int arg) {
+#pragma omp target is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+    {}
+#pragma omp target is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+    {}
+#pragma omp target is_device_ptr() // expected-error {{expected expression}}
+    {}
+#pragma omp target is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+    {}
+#pragma omp target is_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(k) // OK
+    {}
+#pragma omp target is_device_ptr(z) // OK
+    {}
+#pragma omp target is_device_ptr(aa) // OK
+    {}
+#pragma omp target is_device_ptr(raa) // OK
+    {}    
+#pragma omp target is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(g) // OK
+    {}
+#pragma omp target is_device_ptr(rg) // OK
+    {}
+#pragma omp target is_device_ptr(k,i,j) // expected-error2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(da) // OK
+    {}
+  return;
+ }
+};
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef struct {
+  int a;
+} S6;
+
+template <typename T, int I>
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 h[10];
+  auto &rh = h;
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+  auto &raa = aa;
+  S6 *ps;
+#pragma omp target is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+  {}
+#pragma omp target is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target is_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(k) // OK
+  {}
+#pragma omp target is_device_ptr(z) // OK
+  {}
+#pragma omp target is_device_ptr(aa) // OK
+  {}
+#pragma omp target is_device_ptr(raa) // OK
+  {}
+#pragma omp target is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(g) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(h) // OK
+  {}
+#pragma omp target is_device_ptr(rh) // OK
+  {}
+#pragma omp target is_device_ptr(k,i,j) // expected-error2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(da) // OK
+  {}
+#pragma omp target map(ps) is_device_ptr(ps) // expected-error{{variable already marked as mapped in current construct}} expected-note{{used here}}
+  {}
+#pragma omp target is_device_ptr(ps) map(ps) // expected-error{{variable already marked as mapped in current construct}} expected-note{{used here}}
+  {}
+#pragma omp target map(ps->a) is_device_ptr(ps) // expected-error{{variable already marked as mapped in current construct}} expected-note{{used here}}
+  {}
+#pragma omp target is_device_ptr(ps) map(ps->a) // expected-error{{pointer cannot be mapped along with a section derived from itself}} expected-note{{used here}}
+  {}
+#pragma omp target is_device_ptr(ps) firstprivate(ps) // expected-error{{firstprivate variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}}
+  {}
+#pragma omp target firstprivate(ps) is_device_ptr(ps) // expected-error{{firstprivate variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}} expected-note{{defined as firstprivate}}
+  {}
+#pragma omp target is_device_ptr(ps) private(ps) // expected-error{{private variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}}
+  {}
+#pragma omp target private(ps) is_device_ptr(ps) // expected-error{{private variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}} expected-note{{defined as private}}
+  {}
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 h[10];
+  auto &rh = h;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  auto &raa = aa;
+  S6 *ps;
+#pragma omp target is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+  {}
+#pragma omp target is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target is_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(k) // OK
+  {}
+#pragma omp target is_device_ptr(z) // OK
+  {}
+#pragma omp target is_device_ptr(aa) // OK
+  {}
+#pragma omp target is_device_ptr(raa) // OK
+  {}
+#pragma omp target is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(g) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(h) // OK
+  {}
+#pragma omp target is_device_ptr(rh) // OK
+  {}
+#pragma omp target is_device_ptr(k,i,j) // expected-error2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(da) // OK
+  {}
+#pragma omp target map(ps) is_device_ptr(ps) // expected-error{{variable already marked as mapped in current construct}} expected-note{{used here}}
+  {}
+#pragma omp target is_device_ptr(ps) map(ps) // expected-error{{variable already marked as mapped in current construct}} expected-note{{used here}}
+  {}
+#pragma omp target map(ps->a) is_device_ptr(ps) // expected-error{{variable already marked as mapped in current construct}} expected-note{{used here}}
+  {}
+#pragma omp target is_device_ptr(ps) map(ps->a) // expected-error{{pointer cannot be mapped along with a section derived from itself}} expected-note{{used here}}
+  {}
+#pragma omp target is_device_ptr(ps) firstprivate(ps) // expected-error{{firstprivate variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}}
+  {}
+#pragma omp target firstprivate(ps) is_device_ptr(ps) // expected-error{{firstprivate variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}} expected-note{{defined as firstprivate}}
+  {}
+#pragma omp target is_device_ptr(ps) private(ps) // expected-error{{private variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}}
+  {}
+#pragma omp target private(ps) is_device_ptr(ps) // expected-error{{private variable cannot be in a is_device_ptr clause in '#pragma omp target' directive}} expected-note{{defined as private}}
+  {}
+  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_map_codegen.cpp b/test/OpenMP/target_map_codegen.cpp
index 942cc4c..000ce71 100644
--- a/test/OpenMP/target_map_codegen.cpp
+++ b/test/OpenMP/target_map_codegen.cpp
@@ -7,17 +7,17 @@
 ///
 
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
-// RUN: %clang_cc1 -DCK1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
-// RUN: %clang_cc1 -DCK1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
 #ifdef CK1
 
 // CK1-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK1-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK1-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK1-LABEL: implicit_maps_integer
 void implicit_maps_integer (int a){
@@ -52,20 +52,23 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK2 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
-// RUN: %clang_cc1 -DCK2 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
-// RUN: %clang_cc1 -DCK2 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
-// RUN: %clang_cc1 -DCK2 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
 #ifdef CK2
 
-// CK2-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK2-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// CK2: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK2: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+// CK2: [[SIZES2:@.+]] = {{.+}}constant [1 x i[[sz]]] zeroinitializer
+// Map types: OMP_MAP_IS_FIRST = 32
+// CK2: [[TYPES2:@.+]] = {{.+}}constant [1 x i32] [i32 32]
 
-// CK2-LABEL: implicit_maps_integer_reference
-void implicit_maps_integer_reference (int a){
+// CK2-LABEL: implicit_maps_reference
+void implicit_maps_reference (int a, int *b){
   int &i = a;
   // CK2-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES]]{{.+}}, {{.+}}[[TYPES]]{{.+}})
   // CK2-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
@@ -85,6 +88,25 @@
   {
    ++i;
   }
+
+  int *&p = b;
+  // CK2-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES2]]{{.+}}, {{.+}}[[TYPES2]]{{.+}})
+  // CK2-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK2-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK2-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK2-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK2-DAG: [[VALBP]] = bitcast i32* [[VAL:%.+]] to i8*
+  // CK2-DAG: [[VALP]] = bitcast i32* [[VAL]] to i8*
+  // CK2-DAG: [[VAL]] = load i32*, i32** [[ADDR:%.+]],
+  // CK2-DAG: [[ADDR]] = load i32**, i32*** [[ADDR2:%.+]],
+
+  // CK2: call void [[KERNEL2:@.+]](i32* [[VAL]])
+  #pragma omp target
+  {
+   ++p;
+  }
 }
 
 // CK2: define internal void [[KERNEL]](i[[sz]] [[ARG:%.+]])
@@ -99,19 +121,27 @@
 // CK2-32: [[RVAL:%.+]] = load i32*, i32** [[REF]],
 // CK2-32: {{.+}} = load i32, i32* [[RVAL]],
 
+// CK2: define internal void [[KERNEL2]](i32* [[ARG:%.+]])
+// CK2: [[ADDR:%.+]] = alloca i32*,
+// CK2: [[REF:%.+]] = alloca i32**,
+// CK2: store i32* [[ARG]], i32** [[ADDR]],
+// CK2: store i32** [[ADDR]], i32*** [[REF]],
+// CK2: [[T:%.+]] = load i32**, i32*** [[REF]],
+// CK2: [[TT:%.+]] = load i32*, i32** [[T]],
+// CK2: getelementptr inbounds i32, i32* [[TT]], i32 1
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK3 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
-// RUN: %clang_cc1 -DCK3 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
-// RUN: %clang_cc1 -DCK3 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
-// RUN: %clang_cc1 -DCK3 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
 #ifdef CK3
 
 // CK3-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK3-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK3-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK3-LABEL: implicit_maps_parameter
 void implicit_maps_parameter (int a){
@@ -145,17 +175,17 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK4 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
-// RUN: %clang_cc1 -DCK4 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
-// RUN: %clang_cc1 -DCK4 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
-// RUN: %clang_cc1 -DCK4 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
 #ifdef CK4
 
 // CK4-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK4-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK4-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK4-LABEL: implicit_maps_nested_integer
 void implicit_maps_nested_integer (int a){
@@ -201,17 +231,17 @@
 // CK4: define internal void [[KERNELP2]](i32* {{[^,]+}}, i32* {{[^,]+}}, i32* {{[^,]+}})
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK5 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
-// RUN: %clang_cc1 -DCK5 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-64
-// RUN: %clang_cc1 -DCK5 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
-// RUN: %clang_cc1 -DCK5 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
+// RUN: %clang_cc1 -DCK5 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
 #ifdef CK5
 
 // CK5-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK5-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK5-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK5-LABEL: implicit_maps_nested_integer_and_enum
 void implicit_maps_nested_integer_and_enum (int a){
@@ -252,17 +282,17 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK6 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK6 --check-prefix CK6-64
-// RUN: %clang_cc1 -DCK6 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-64
-// RUN: %clang_cc1 -DCK6 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
-// RUN: %clang_cc1 -DCK6 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
+// RUN: %clang_cc1 -DCK6 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK6 --check-prefix CK6-64
+// RUN: %clang_cc1 -DCK6 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-64
+// RUN: %clang_cc1 -DCK6 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
+// RUN: %clang_cc1 -DCK6 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
 #ifdef CK6
 // CK6-DAG: [[GBL:@Gi]] = global i32 0
 // CK6-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK6-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK6-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK6-LABEL: implicit_maps_host_global
 int Gi;
@@ -298,22 +328,22 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK7 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK7 --check-prefix CK7-64
-// RUN: %clang_cc1 -DCK7 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-64
-// RUN: %clang_cc1 -DCK7 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
-// RUN: %clang_cc1 -DCK7 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
+// RUN: %clang_cc1 -DCK7 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK7 --check-prefix CK7-64
+// RUN: %clang_cc1 -DCK7 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-64
+// RUN: %clang_cc1 -DCK7 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
+// RUN: %clang_cc1 -DCK7 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
 #ifdef CK7
 
 // For a 32-bit targets, the value doesn't fit the size of the pointer,
 // therefore it is passed by reference with a map 'to' specification.
 
 // CK7-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 8]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK7-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
-// Map types: OMP_MAP_TO = 1
-// CK7-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 1]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK7-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+// Map types: OMP_MAP_TO | OMP_MAP_IS_FIRST = 33
+// CK7-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 33]
 
 // CK7-LABEL: implicit_maps_double
 void implicit_maps_double (int a){
@@ -360,17 +390,17 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK8 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK8
-// RUN: %clang_cc1 -DCK8 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
-// RUN: %clang_cc1 -DCK8 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK8
-// RUN: %clang_cc1 -DCK8 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
 #ifdef CK8
 
 // CK8-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK8-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK8-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK8-LABEL: implicit_maps_float
 void implicit_maps_float (int a){
@@ -404,17 +434,17 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK9 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK9
-// RUN: %clang_cc1 -DCK9 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
-// RUN: %clang_cc1 -DCK9 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK9
-// RUN: %clang_cc1 -DCK9 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
 #ifdef CK9
 
 // CK9-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 16]
-// Map types: OMP_MAP_TO + OMP_MAP_FROM = 2 + 1
-// CK9-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 3]
+// Map types: OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// CK9-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 35]
 
 // CK9-LABEL: implicit_maps_array
 void implicit_maps_array (int a){
@@ -445,17 +475,17 @@
 // CK9: {{.+}} = getelementptr inbounds [2 x double], [2 x double]* [[REF]], i[[sz]] 0, i[[sz]] 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK10 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK10
-// RUN: %clang_cc1 -DCK10 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
-// RUN: %clang_cc1 -DCK10 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK10
-// RUN: %clang_cc1 -DCK10 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
 #ifdef CK10
 
-// CK10-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}]
-// Map types: OMP_MAP_BYCOPY | OMP_MAP_PTR = 128 + 32
-// CK10-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 160]
+// CK10-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] zeroinitializer
+// Map types: OMP_MAP_IS_FIRST = 32
+// CK10-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 32]
 
 // CK10-LABEL: implicit_maps_pointer
 void implicit_maps_pointer (){
@@ -487,17 +517,17 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK11 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK11
-// RUN: %clang_cc1 -DCK11 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
-// RUN: %clang_cc1 -DCK11 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK11
-// RUN: %clang_cc1 -DCK11 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
 #ifdef CK11
 
 // CK11-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 16]
-// Map types: OMP_MAP_TO = 1
-// CK11-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 1]
+// Map types: OMP_MAP_TO + OMP_MAP_IS_FIRST = 33
+// CK11-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 33]
 
 // CK11-LABEL: implicit_maps_double_complex
 void implicit_maps_double_complex (int a){
@@ -527,22 +557,22 @@
 // CK11: {{.+}} = getelementptr inbounds { double, double }, { double, double }* [[REF]], i32 0, i32 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK12 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK12 --check-prefix CK12-64
-// RUN: %clang_cc1 -DCK12 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-64
-// RUN: %clang_cc1 -DCK12 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
-// RUN: %clang_cc1 -DCK12 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
+// RUN: %clang_cc1 -DCK12 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK12 --check-prefix CK12-64
+// RUN: %clang_cc1 -DCK12 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-64
+// RUN: %clang_cc1 -DCK12 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
+// RUN: %clang_cc1 -DCK12 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
 #ifdef CK12
 
 // For a 32-bit targets, the value doesn't fit the size of the pointer,
 // therefore it is passed by reference with a map 'to' specification.
 
 // CK12-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 8]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK12-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
-// Map types: OMP_MAP_TO = 1
-// CK12-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 1]
+// Map types: OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK12-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+// Map types: OMP_MAP_TO + OMP_MAP_IS_FIRST = 33
+// CK12-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 33]
 
 // CK12-LABEL: implicit_maps_float_complex
 void implicit_maps_float_complex (int a){
@@ -588,20 +618,20 @@
 // CK12-32: {{.+}} = getelementptr inbounds { float, float }, { float, float }* [[REF]], i32 0, i32 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK13 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK13
-// RUN: %clang_cc1 -DCK13 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
-// RUN: %clang_cc1 -DCK13 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK13
-// RUN: %clang_cc1 -DCK13 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
 #ifdef CK13
 
 // We don't have a constant map size for VLAs.
 // Map types:
-//  - OMP_MAP_BYCOPY = 128 (vla size)
-//  - OMP_MAP_BYCOPY = 128 (vla size)
-//  - OMP_MAP_TO + OMP_MAP_FROM = 2 + 1
-// CK13-DAG: [[TYPES:@.+]] = {{.+}}constant [3 x i32] [i32 128, i32 128, i32 3]
+//  - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288 (vla size)
+//  - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288 (vla size)
+//  - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// CK13-DAG: [[TYPES:@.+]] = {{.+}}constant [3 x i32] [i32 288, i32 288, i32 35]
 
 // CK13-LABEL: implicit_maps_variable_length_array
 void implicit_maps_variable_length_array (int a){
@@ -658,20 +688,20 @@
 // CK13: {{.+}} = getelementptr inbounds double, double* [[REF]], i[[sz]] %{{.+}}
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK14 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK14 --check-prefix CK14-64
-// RUN: %clang_cc1 -DCK14 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-64
-// RUN: %clang_cc1 -DCK14 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
-// RUN: %clang_cc1 -DCK14 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
+// RUN: %clang_cc1 -DCK14 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK14 --check-prefix CK14-64
+// RUN: %clang_cc1 -DCK14 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-64
+// RUN: %clang_cc1 -DCK14 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
+// RUN: %clang_cc1 -DCK14 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
 #ifdef CK14
 
 // CK14-DAG: [[ST:%.+]] = type { i32, double }
 // CK14-DAG: [[SIZES:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{16|12}}, i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_TO | OMP_MAP_FROM = 1 + 2
-// - OMP_MAP_BYCOPY = 128
-// CK14-DAG: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 3, i32 128]
+// - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK14-DAG: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 35, i32 288]
 
 class SSS {
 public:
@@ -732,26 +762,26 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK15 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK15 --check-prefix CK15-64
-// RUN: %clang_cc1 -DCK15 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-64
-// RUN: %clang_cc1 -DCK15 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
-// RUN: %clang_cc1 -DCK15 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
+// RUN: %clang_cc1 -DCK15 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK15 --check-prefix CK15-64
+// RUN: %clang_cc1 -DCK15 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-64
+// RUN: %clang_cc1 -DCK15 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
+// RUN: %clang_cc1 -DCK15 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
 #ifdef CK15
 
 // CK15: [[ST:%.+]] = type { i32, double, i32* }
 // CK15: [[SIZES:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{24|16}}, i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_TO | OMP_MAP_FROM = 1 + 2
-// - OMP_MAP_BYCOPY = 128
-// CK15: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 3, i32 128]
+// - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK15: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 35, i32 288]
 
 // CK15: [[SIZES2:@.+]] = {{.+}}constant [2 x i[[sz]]] [i{{64|32}} {{24|16}}, i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_TO | OMP_MAP_FROM = 1 + 2
-// - OMP_MAP_BYCOPY = 128
-// CK15: [[TYPES2:@.+]] = {{.+}}constant [2 x i32] [i32 3, i32 128]
+// - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK15: [[TYPES2:@.+]] = {{.+}}constant [2 x i32] [i32 35, i32 288]
 
 template<int x>
 class SSST {
@@ -860,18 +890,18 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK16 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK16 --check-prefix CK16-64
-// RUN: %clang_cc1 -DCK16 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-64
-// RUN: %clang_cc1 -DCK16 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
-// RUN: %clang_cc1 -DCK16 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
+// RUN: %clang_cc1 -DCK16 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK16 --check-prefix CK16-64
+// RUN: %clang_cc1 -DCK16 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-64
+// RUN: %clang_cc1 -DCK16 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
+// RUN: %clang_cc1 -DCK16 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
 #ifdef CK16
 
 // CK16-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_BYCOPY = 128
-// CK16-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK16-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 template<int y>
 int foo(int d) {
@@ -913,18 +943,18 @@
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK17 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK17
-// RUN: %clang_cc1 -DCK17 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
-// RUN: %clang_cc1 -DCK17 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK17
-// RUN: %clang_cc1 -DCK17 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
 #ifdef CK17
 
 // CK17-DAG: [[ST:%.+]] = type { i32, double }
 // CK17-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{16|12}}]
-// Map types: OMP_MAP_TO + OMP_MAP_FROM = 2 + 1
-// CK17-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 3]
+// Map types: OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// CK17-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 35]
 
 class SSS {
 public:
@@ -961,18 +991,18 @@
 // CK17: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK18 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-64
-// RUN: %clang_cc1 -DCK18 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-64
-// RUN: %clang_cc1 -DCK18 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
-// RUN: %clang_cc1 -DCK18 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
+// RUN: %clang_cc1 -DCK18 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-64
+// RUN: %clang_cc1 -DCK18 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-64
+// RUN: %clang_cc1 -DCK18 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
+// RUN: %clang_cc1 -DCK18 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
 #ifdef CK18
 
 // CK18-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_BYCOPY = 128
-// CK18-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK18-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 template<typename T>
 int foo(T d) {
@@ -1012,4 +1042,3715 @@
 // CK18-32: {{.+}} = load i32, i32* [[ADDR]],
 
 #endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK19 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK19 --check-prefix CK19-64
+// RUN: %clang_cc1 -DCK19 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK19 --check-prefix CK19-64
+// RUN: %clang_cc1 -DCK19 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK19 --check-prefix CK19-32
+// RUN: %clang_cc1 -DCK19 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK19 --check-prefix CK19-32
+#ifdef CK19
+
+// CK19: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK19: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK19: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK19: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK19: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[MTYPE06:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[MTYPE07:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE08:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE08:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE09:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK19: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK19: [[SIZE10:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE10:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE11:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE11:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE12:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE12:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[MTYPE13:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[SIZE15:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE15:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK19: [[MTYPE16:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 33]
+
+// CK19: [[SIZE17:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 240]
+// CK19: [[MTYPE17:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 34]
+
+// CK19: [[SIZE18:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 240]
+// CK19: [[MTYPE18:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 35]
+
+// CK19: [[MTYPE19:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 32]
+
+// CK19: [[SIZE20:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK19: [[MTYPE20:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 33]
+
+// CK19: [[MTYPE21:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 35]
+
+// CK19: [[SIZE22:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK19: [[MTYPE22:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 35]
+
+// CK19: [[SIZE23:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE23:@.+]] = private {{.*}}constant [1 x i32] [i32 39]
+
+// CK19: [[SIZE24:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 480]
+// CK19: [[MTYPE24:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE25:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK19: [[MTYPE25:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE26:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 24]
+// CK19: [[MTYPE26:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE27:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE27:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE28:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 16]
+// CK19: [[MTYPE28:@.+]] = private {{.*}}constant [3 x i32] [i32 35, i32 19, i32 19]
+
+// CK19: [[SIZE29:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK19: [[MTYPE29:@.+]] = private {{.*}}constant [3 x i32] [i32 35, i32 19, i32 19]
+
+// CK19: [[MTYPE30:@.+]] = private {{.*}}constant [4 x i32] [i32 288, i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE31:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 40]
+// CK19: [[MTYPE31:@.+]] = private {{.*}}constant [4 x i32] [i32 288, i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE32:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 13728]
+// CK19: [[MTYPE32:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE33:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 13728]
+// CK19: [[MTYPE33:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE34:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 13728]
+// CK19: [[MTYPE34:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[MTYPE35:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE36:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 208]
+// CK19: [[MTYPE36:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[MTYPE37:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[MTYPE38:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[MTYPE39:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[MTYPE40:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE41:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 208]
+// CK19: [[MTYPE41:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE42:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 104]
+// CK19: [[MTYPE42:@.+]] = private {{.*}}constant [3 x i32] [i32 35, i32 19, i32 19]
+
+// CK19: [[MTYPE43:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19-LABEL: explicit_maps_single
+void explicit_maps_single (int ii){
+  // Map of a scalar.
+  int a = ii;
+
+  // Region 00
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(alloc:a)
+  {
+    ++a;
+  }
+
+  // Map of an array.
+  int arra[100];
+
+  // Region 01
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [100 x i32]* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL01:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(to:arra)
+  {
+    arra[50]++;
+  }
+
+  // Region 02
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 20
+
+  // CK19: call void [[CALL02:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(from:arra[20:60])
+  {
+    arra[50]++;
+  }
+
+  // Region 03
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL03:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(tofrom:arra[:60])
+  {
+    arra[50]++;
+  }
+
+  // Region 04
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL04:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(alloc:arra[:])
+  {
+    arra[50]++;
+  }
+
+  // Region 05
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 15
+
+  // CK19: call void [[CALL05:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(to:arra[15])
+  {
+    arra[15]++;
+  }
+
+  // Region 06
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE06]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} %{{.*}}
+
+  // CK19: call void [[CALL06:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(tofrom:arra[ii:ii+23])
+  {
+    arra[50]++;
+  }
+
+  // Region 07
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE07]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL07:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(alloc:arra[:ii])
+  {
+    arra[50]++;
+  }
+
+  // Region 08
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE08]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE08]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} %{{.*}}
+
+  // CK19: call void [[CALL08:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(tofrom:arra[ii])
+  {
+    arra[15]++;
+  }
+
+  // Map of a pointer.
+  int *pa;
+
+  // Region 09
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE09]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32** [[VAR0]] to i8*
+
+  // CK19: call void [[CALL09:@.+]](i32** {{[^,]+}})
+  #pragma omp target map(from:pa)
+  {
+    pa[50]++;
+  }
+
+  // Region 10
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE10]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE10]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 20
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL10:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(tofrom:pa[20:60])
+  {
+    pa[50]++;
+  }
+
+  // Region 11
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE11]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE11]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL11:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(alloc:pa[:60])
+  {
+    pa[50]++;
+  }
+
+  // Region 12
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE12]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE12]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 15
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL12:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(to:pa[15])
+  {
+    pa[15]++;
+  }
+
+  // Region 13
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE13]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} %{{.*}}
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL13:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(alloc:pa[ii-23:ii])
+  {
+    pa[50]++;
+  }
+
+  // Region 14
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE14]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL14:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(to:pa[:ii])
+  {
+    pa[50]++;
+  }
+
+  // Region 15
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE15]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE15]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} %{{.*}}
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL15:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(from:pa[ii+12])
+  {
+    pa[15]++;
+  }
+
+  // Map of a variable-size array.
+  int va[ii];
+
+  // Region 16
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE16]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} {{8|4}}, i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i{{.+}} [[CSVAL1:%[^,]+]], i{{.+}}* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[VAR1]] to i8*
+  // CK19-DAG: [[CSVAL1]] = mul nuw i{{.+}} %{{.*}}, 4
+
+  // CK19: call void [[CALL16:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(to:va)
+  {
+   va[50]++;
+  }
+
+  // Region 17
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE17]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE17]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 20
+
+  // CK19: call void [[CALL17:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(from:va[20:60])
+  {
+   va[50]++;
+  }
+
+  // Region 18
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE18]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE18]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 0
+
+  // CK19: call void [[CALL18:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(tofrom:va[:60])
+  {
+   va[50]++;
+  }
+
+  // Region 19
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE19]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} {{8|4}}, i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i{{.+}} [[CSVAL1:%[^,]+]], i{{.+}}* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[CSVAL1]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 0
+
+  // CK19: call void [[CALL19:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(alloc:va[:])
+  {
+   va[50]++;
+  }
+
+  // Region 20
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE20]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE20]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 15
+
+  // CK19: call void [[CALL20:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(to:va[15])
+  {
+   va[15]++;
+  }
+
+  // Region 21
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE21]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} {{8|4}}, i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i{{.+}} [[CSVAL1:%[^,]+]], i{{.+}}* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[CSVAL1]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} %{{.+}}
+
+  // CK19: call void [[CALL21:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(tofrom:va[ii:ii+23])
+  {
+   va[50]++;
+  }
+
+  // Region 22
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE22]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE22]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} %{{.+}}
+
+  // CK19: call void [[CALL22:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(tofrom:va[ii])
+  {
+   va[15]++;
+  }
+
+  // Map with the 'always' map-type modifier.
+  // Region 23
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE23]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE23]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL23:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(always, tofrom: a)
+  {
+   a++;
+  }
+
+  // Multidimensional arrays.
+  int marr[4][5][6];
+  int ***mptr;
+
+  // Region 24
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE24]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE24]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL24:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr)
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 25
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE25]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE25]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[6 x i32]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC00]] = getelementptr {{.*}}[5 x [6 x i32]]* [[SEC000:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC000]] = getelementptr {{.*}}[4 x [5 x [6 x i32]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+  // CK19: call void [[CALL25:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr[1][2][2:4])
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 26
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE26]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE26]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[6 x i32]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.*}}[5 x [6 x i32]]* [[SEC000:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC000]] = getelementptr {{.*}}[4 x [5 x [6 x i32]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+  // CK19: call void [[CALL26:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr[1][2][:])
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 27
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE27]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE27]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[6 x i32]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[SEC00]] = getelementptr {{.*}}[5 x [6 x i32]]* [[SEC000:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC000]] = getelementptr {{.*}}[4 x [5 x [6 x i32]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+  // CK19: call void [[CALL27:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr[1][2][3])
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 28
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE28]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE28]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32*** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32*** [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[VAR0]] = load i32***, i32**** [[PTR:%[^,]+]],
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32*** [[SEC00:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC00]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32*** [[SEC0]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32** [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32** [[SEC11:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC11]] = load i32**, i32*** [[SEC111:%[^,]+]],
+  // CK19-DAG: [[SEC111]] = getelementptr {{.*}}i32*** [[SEC1111:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC1111]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast i32** [[SEC1]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast i32* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.*}}i32* [[SEC22:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC22]] = load i32*, i32** [[SEC222:%[^,]+]],
+  // CK19-DAG: [[SEC222]] = getelementptr {{.*}}i32** [[SEC2222:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC2222]] = load i32**, i32*** [[SEC22222:%[^,]+]],
+  // CK19-DAG: [[SEC22222]] = getelementptr {{.*}}i32*** [[SEC222222:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC222222]] = load i32***, i32**** [[PTR]],
+
+  // CK19: call void [[CALL28:@.+]](i32*** {{[^,]+}})
+  #pragma omp target map(tofrom: mptr[1][2][2:4])
+  {
+    mptr[1][2][3]++;
+  }
+
+  // Region 29
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE29]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE29]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32*** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32*** [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[VAR0]] = load i32***, i32**** [[PTR:%[^,]+]],
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32*** [[SEC00:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC00]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32*** [[SEC0]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32** [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32** [[SEC11:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC11]] = load i32**, i32*** [[SEC111:%[^,]+]],
+  // CK19-DAG: [[SEC111]] = getelementptr {{.*}}i32*** [[SEC1111:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC1111]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast i32** [[SEC1]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast i32* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.*}}i32* [[SEC22:[^,]+]], i{{.+}} 3
+  // CK19-DAG: [[SEC22]] = load i32*, i32** [[SEC222:%[^,]+]],
+  // CK19-DAG: [[SEC222]] = getelementptr {{.*}}i32** [[SEC2222:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC2222]] = load i32**, i32*** [[SEC22222:%[^,]+]],
+  // CK19-DAG: [[SEC22222]] = getelementptr {{.*}}i32*** [[SEC222222:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC222222]] = load i32***, i32**** [[PTR]],
+
+  // CK19: call void [[CALL29:@.+]](i32*** {{[^,]+}})
+  #pragma omp target map(tofrom: mptr[1][2][3])
+  {
+    mptr[1][2][3]++;
+  }
+
+  // Multidimensional VLA.
+  double mva[23][ii][ii+5];
+
+  // Region 30
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE30]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  // CK19-64-DAG: [[VAR1]] = zext i32 %{{[^,]+}} to i64
+  // CK19-64-DAG: [[VAR11]] = zext i32 %{{[^,]+}} to i64
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = inttoptr i[[Z]] [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = inttoptr i[[Z]] [[VAR22:%.+]] to i8*
+  // CK19-64-DAG: [[VAR2]] = zext i32 %{{[^,]+}} to i64
+  // CK19-64-DAG: [[VAR22]] = zext i32 %{{[^,]+}} to i64
+  //
+  // CK19-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[S3:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+  // CK19-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+  // CK19-DAG: store i[[Z]] [[CSVAL3:%[^,]+]], i[[Z]]* [[S3]]
+  // CK19-DAG: [[CBPVAL3]] = bitcast double* [[VAR3:%.+]] to i8*
+  // CK19-DAG: [[CPVAL3]] = bitcast double* [[VAR3]] to i8*
+  // CK19-DAG: [[CSVAL3]] = mul nuw i[[Z]] %{{[^,]+}}, {{8|4}}
+
+  // CK19: call void [[CALL30:@.+]](i[[Z]] 23, i[[Z]] %{{[^,]+}}, i[[Z]] %{{[^,]+}}, double* %{{[^,]+}})
+  #pragma omp target map(tofrom: mva)
+  {
+    mva[1][2][3]++;
+  }
+
+  // Region 31
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE31]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE31]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[P0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = inttoptr i[[Z]] [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = inttoptr i[[Z]] [[VAR22:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+  // CK19-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+  // CK19-DAG: [[CBPVAL3]] = bitcast double* [[VAR3:%.+]] to i8*
+  // CK19-DAG: [[CPVAL3]] = bitcast double* [[SEC3:%.+]] to i8*
+  // CK19-DAG: [[SEC3]] = getelementptr {{.*}}double* [[SEC33:%.+]], i[[Z]] 0
+  // CK19-DAG: [[SEC33]] = getelementptr {{.*}}double* [[SEC333:%.+]], i[[Z]] [[IDX3:%.+]]
+  // CK19-DAG: [[IDX3]] = mul nsw i[[Z]] %{{[^,]+}}, %{{[^,]+}}
+  // CK19-DAG: [[SEC333]] = getelementptr {{.*}}double* [[VAR3]], i[[Z]] [[IDX33:%.+]]
+  // CK19-DAG: [[IDX33]] = mul nsw i[[Z]] 1, %{{[^,]+}}
+
+  // CK19: call void [[CALL31:@.+]](i[[Z]] 23, i[[Z]] %{{[^,]+}}, i[[Z]] %{{[^,]+}}, double* %{{[^,]+}})
+  #pragma omp target map(tofrom: mva[1][ii-2][:5])
+  {
+    mva[1][2][3]++;
+  }
+
+  // Multidimensional array sections.
+  double marras[11][12][13];
+  double mvlaas[11][ii][13];
+  double ***mptras;
+
+  // Region 32
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE32]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE32]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL32:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras)
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 33
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE33]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE33]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [12 x [13 x double]]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 0
+
+  // CK19: call void [[CALL33:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[:])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 34
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE34]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE34]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [12 x [13 x double]]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 0
+
+  // CK19: call void [[CALL34:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[:][:][:])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 35
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE35]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i[[Z]] [[CSVAL0:%[^,]+]], i[[Z]]* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [13 x double]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[12 x [13 x double]]* [[SEC00:%[^,]+]], i[[Z]] 0, i[[Z]] 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 1
+  // CK19-DAG: [[CSVAL0]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL35:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[1][:ii][:])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 36
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE36]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE36]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [13 x double]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[13 x double]* [[SEC00:%[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.+}}[12 x [13 x double]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[SEC000]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL36:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[:1][:2][:13])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 37
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE37]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[VAR2]] to i8*
+  // CK19-DAG: [[CSVAL2]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL37:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas)
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 38
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE38]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC22:%[^,]+]]
+  // CK19-DAG: [[SEC22]] = mul nsw i[[Z]] 0, %{{[^,]+}}
+  // CK19-DAG: [[CSVAL2]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL38:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[:])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 39
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE39]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC22:%[^,]+]]
+  // CK19-DAG: [[SEC22]] = mul nsw i[[Z]] 0, %{{[^,]+}}
+  // CK19-DAG: [[CSVAL2]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL39:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[:][:][:])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 40
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE40]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[SEC22:%[^,]+]], i[[Z]] 0
+  // CK19-DAG: [[SEC22]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC222:%[^,]+]]
+  // CK19-DAG: [[SEC222]] = mul nsw i[[Z]] 1, %{{[^,]+}}
+
+  // CK19: call void [[CALL40:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[1][:ii][:])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 41
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE41]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE41]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[SEC22:%[^,]+]], i[[Z]] 0
+  // CK19-DAG: [[SEC22]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC222:%[^,]+]]
+  // CK19-DAG: [[SEC222]] = mul nsw i[[Z]] 0, %{{[^,]+}}
+
+  // CK19: call void [[CALL41:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[:1][:2][:13])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 42
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE42]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE42]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast double*** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast double*** [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[VAR0]] = load double***, double**** [[PTR:%[^,]+]],
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}double*** [[SEC00:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC00]] = load double***, double**** [[PTR]],
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast double*** [[SEC0]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast double** [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}double** [[SEC11:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC11]] = load double**, double*** [[SEC111:%[^,]+]],
+  // CK19-DAG: [[SEC111]] = getelementptr {{.*}}double*** [[SEC1111:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC1111]] = load double***, double**** [[PTR]],
+
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast double** [[SEC1]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast double* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.*}}double* [[SEC22:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC22]] = load double*, double** [[SEC222:%[^,]+]],
+  // CK19-DAG: [[SEC222]] = getelementptr {{.*}}double** [[SEC2222:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC2222]] = load double**, double*** [[SEC22222:%[^,]+]],
+  // CK19-DAG: [[SEC22222]] = getelementptr {{.*}}double*** [[SEC222222:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC222222]] = load double***, double**** [[PTR]],
+
+  // CK19: call void [[CALL42:@.+]](double*** {{[^,]+}})
+  #pragma omp target map(mptras[:1][2][:13])
+  {
+    mptras[1][2][3]++;
+  }
+
+  // Region 43 - the memory for this map is not contiguous, so the whole last dimension is mapped.
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE43]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i[[Z]] [[CSVAL0:%[^,]+]], i[[Z]]* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [13 x double]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[12 x [13 x double]]* [[SEC00:%[^,]+]], i[[Z]] 0, i[[Z]] 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 1
+  // CK19-DAG: [[CSVAL0]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL43:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[1][:ii][1:])
+  {
+    marras[1][2][3]++;
+  }
+
+}
+
+// CK19: define {{.+}}[[CALL00]]
+// CK19: define {{.+}}[[CALL01]]
+// CK19: define {{.+}}[[CALL02]]
+// CK19: define {{.+}}[[CALL03]]
+// CK19: define {{.+}}[[CALL04]]
+// CK19: define {{.+}}[[CALL05]]
+// CK19: define {{.+}}[[CALL06]]
+// CK19: define {{.+}}[[CALL07]]
+// CK19: define {{.+}}[[CALL08]]
+// CK19: define {{.+}}[[CALL09]]
+// CK19: define {{.+}}[[CALL10]]
+// CK19: define {{.+}}[[CALL11]]
+// CK19: define {{.+}}[[CALL12]]
+// CK19: define {{.+}}[[CALL13]]
+// CK19: define {{.+}}[[CALL14]]
+// CK19: define {{.+}}[[CALL15]]
+// CK19: define {{.+}}[[CALL16]]
+// CK19: define {{.+}}[[CALL17]]
+// CK19: define {{.+}}[[CALL18]]
+// CK19: define {{.+}}[[CALL19]]
+// CK19: define {{.+}}[[CALL20]]
+// CK19: define {{.+}}[[CALL21]]
+// CK19: define {{.+}}[[CALL22]]
+// CK19: define {{.+}}[[CALL23]]
+// CK19: define {{.+}}[[CALL24]]
+// CK19: define {{.+}}[[CALL25]]
+// CK19: define {{.+}}[[CALL26]]
+// CK19: define {{.+}}[[CALL27]]
+// CK19: define {{.+}}[[CALL28]]
+// CK19: define {{.+}}[[CALL29]]
+// CK19: define {{.+}}[[CALL30]]
+// CK19: define {{.+}}[[CALL31]]
+// CK19: define {{.+}}[[CALL32]]
+// CK19: define {{.+}}[[CALL33]]
+// CK19: define {{.+}}[[CALL34]]
+// CK19: define {{.+}}[[CALL35]]
+// CK19: define {{.+}}[[CALL36]]
+// CK19: define {{.+}}[[CALL37]]
+// CK19: define {{.+}}[[CALL38]]
+// CK19: define {{.+}}[[CALL39]]
+// CK19: define {{.+}}[[CALL40]]
+// CK19: define {{.+}}[[CALL41]]
+// CK19: define {{.+}}[[CALL42]]
+// CK19: define {{.+}}[[CALL43]]
+
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK20 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK20 --check-prefix CK20-64
+// RUN: %clang_cc1 -DCK20 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK20 --check-prefix CK20-64
+// RUN: %clang_cc1 -DCK20 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK20 --check-prefix CK20-32
+// RUN: %clang_cc1 -DCK20 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK20 --check-prefix CK20-32
+#ifdef CK20
+
+// CK20: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK20: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK20: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK20: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK20: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK20: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK20: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 12]
+// CK20: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK20-LABEL: explicit_maps_references_and_function_args
+void explicit_maps_references_and_function_args (int a, float b, int (&c)[10], float *d){
+  // Exercises explicit map clauses on reference locals (aa, cc) and directly on function parameters (b, d).
+  int &aa = a;
+  float &bb = b; // NOTE(review): bb is not named by any map clause below; Region 02 maps b itself.
+  int (&cc)[10] = c;
+  float *&dd = d; // NOTE(review): dd is not named by any map clause below; Region 03 maps d itself.
+
+  // Region 00 - 'to' map of the int reference aa: base and pointer are both loaded from the same reference slot.
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast i32* [[RVAR00:%.+]] to i8*
+  // CK20-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK20-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK20: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(to:aa)
+  {
+    aa += 1;
+  }
+
+  // Region 01 - 'to' map of the section cc[:5] (SIZE01 is 20 bytes): base is the referenced array, pointer is its element 0.
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast [10 x i32]* [[RVAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK20-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[RVAR00:%.+]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[RVAR0]] = load [10 x i32]*, [10 x i32]** [[VAR0:%[^,]+]]
+  // CK20-DAG: [[RVAR00]] = load [10 x i32]*, [10 x i32]** [[VAR0]]
+
+  // CK20: call void [[CALL01:@.+]]([10 x i32]* {{[^,]+}})
+  #pragma omp target map(to:cc[:5])
+  {
+    cc[3] += 1;
+  }
+
+  // Region 02 - 'from' map of the by-value parameter b: base and pointer are the same float*.
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+
+  // CK20: call void [[CALL02:@.+]](float* {{[^,]+}})
+  #pragma omp target map(from:b)
+  {
+    b += 1.0f;
+  }
+
+  // Region 03 - 'from' map of the section d[2:3] (SIZE03 is 12 bytes): base is the loaded d, pointer is d advanced by 2.
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast float* [[RVAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast float* [[SEC0:%.+]] to i8*
+  // CK20-DAG: [[RVAR0]] = load float*, float** [[VAR0:%[^,]+]]
+  // CK20-DAG: [[SEC0]] = getelementptr {{.*}}float* [[RVAR00:%.+]], i{{.+}} 2
+  // CK20-DAG: [[RVAR00]] = load float*, float** [[VAR0]]
+
+  // CK20: call void [[CALL03:@.+]](float* {{[^,]+}})
+  #pragma omp target map(from:d[2:3])
+  {
+    d[2] += 1.0f;
+  }
+}
+
+// CK20: define {{.+}}[[CALL00]]
+// CK20: define {{.+}}[[CALL01]]
+// CK20: define {{.+}}[[CALL02]]
+// CK20: define {{.+}}[[CALL03]]
+
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK21 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK21 --check-prefix CK21-64
+// RUN: %clang_cc1 -DCK21 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK21 --check-prefix CK21-64
+// RUN: %clang_cc1 -DCK21 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK21 --check-prefix CK21-32
+// RUN: %clang_cc1 -DCK21 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK21 --check-prefix CK21-32
+#ifdef CK21
+// CK21: [[ST:%.+]] = type { i32, i32, float* }
+
+// CK21: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK21: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK21: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 492]
+// CK21: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK21: [[SIZE02:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 500]
+// CK21: [[MTYPE02:@.+]] = private {{.*}}constant [2 x i32] [i32 34, i32 18]
+
+// CK21: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 492]
+// CK21: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK21: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK21: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK21: [[SIZE05:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] 4, i[[Z]] 4]
+// CK21: [[MTYPE05:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 3]
+
+// CK21-LABEL: explicit_maps_template_args_and_members
+
+template <int X, typename T>
+struct CC {
+  T A; // Mapped alone in Region 00 and together with A2 in Region 05.
+  int A2; // Mapped only in Region 05.
+  float *B; // Member pointer whose section B[X:X+2] is mapped in Region 02.
+
+  int foo(T arg) { // Holds one target region per kind of mapped entity: member, local pointer, member pointer section, local array, parameter.
+    float la[X]; // Array sized by the non-type template argument; mapped whole in Region 03.
+    T *lb; // Left uninitialised: the RUN lines only emit IR/PCH, nothing is executed.
+
+    // Region 00 - map of member A: base is the enclosing object, pointer is its field 0; both checks are tied to the same object pointer.
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+    // CK21: call void [[CALL00:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(A)
+    {
+      A += 1;
+    }
+    
+    // Region 01 - map of the section lb[:X] (SIZE01 is 492 bytes): base is the loaded lb, pointer is its element 0.
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+    // CK21-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+    // CK21: call void [[CALL01:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(lb[:X])
+    {
+      lb[4] += 1;
+    }
+    
+    // Region 02 - 'from' map of B[X:X+2]: two entries, the member pointer B (field 2 of the object) and the section starting at element 123.
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE02]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast float** [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+    
+    // CK21-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK21-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK21-DAG: [[CBPVAL1]] = bitcast float** [[SEC0]] to i8*
+    // CK21-DAG: [[CPVAL1]] = bitcast float* [[SEC1:%.+]] to i8*
+    // CK21-DAG: [[SEC1]] = getelementptr {{.*}}float* [[RVAR1:%[^,]+]], i{{.+}} 123
+    // CK21-DAG: [[RVAR1]] = load float*, float** [[SEC1_:%[^,]+]]
+    // CK21-DAG: [[SEC1_]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+    // CK21: call void [[CALL02:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(from:B[X:X+2])
+    {
+      B[2] += 1.0f;
+    }
+    
+    // Region 03 - 'from' map of the whole local array la: base and pointer are the same [123 x float]*.
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [123 x float]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast [123 x float]* [[VAR0]] to i8*
+
+    // CK21: call void [[CALL03:@.+]]([123 x float]* {{[^,]+}})
+    #pragma omp target map(from:la)
+    {
+      la[3] += 1.0f;
+    }
+    
+    // Region 04 - 'from' map of the parameter arg: base and pointer are the same i32*.
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+    // CK21: call void [[CALL04:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(from:arg)
+    {
+      arg +=1;
+    }
+    
+    // Make sure the extra flag is passed to the second map.
+    // Region 05
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE05]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+    // CK21-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK21-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK21-DAG: [[CBPVAL1]] = bitcast [[ST]]* [[VAR1:%.+]] to i8*
+    // CK21-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+    // CK21-DAG: [[SEC1]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+    // CK21: call void [[CALL05:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(A, A2)
+    {
+      A += 1;
+      A2 += 1;
+    }
+    return A;
+  }
+};
+
+int explicit_maps_template_args_and_members(int a){ // Instantiates CC<123,int> and calls foo on it.
+  CC<123,int> c; // X = 123, T = int: hence the [123 x float] type and the 492-byte (123 * 4) sizes in the checks above.
+  return c.foo(a);
+}
+
+// CK21: define {{.+}}[[CALL00]]
+// CK21: define {{.+}}[[CALL01]]
+// CK21: define {{.+}}[[CALL02]]
+// CK21: define {{.+}}[[CALL03]]
+// CK21: define {{.+}}[[CALL04]]
+// CK21: define {{.+}}[[CALL05]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK22 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK22 --check-prefix CK22-64
+// RUN: %clang_cc1 -DCK22 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK22 --check-prefix CK22-64
+// RUN: %clang_cc1 -DCK22 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK22 --check-prefix CK22-32
+// RUN: %clang_cc1 -DCK22 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK22 --check-prefix CK22-32
+#ifdef CK22
+
+// CK22-DAG: [[ST:%.+]] = type { float }
+// CK22-DAG: [[STT:%.+]] = type { i32 }
+
+// CK22: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK22: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK22: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK22: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK22: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK22: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK22: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE06:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK22: [[MTYPE06:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE07:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK22: [[MTYPE07:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE08:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK22: [[MTYPE08:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE09:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK22: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE10:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK22: [[MTYPE10:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE11:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK22: [[MTYPE11:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE12:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK22: [[MTYPE12:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE13:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK22: [[MTYPE13:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE14:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK22: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+int a; // Global scalar, mapped whole in Region 00 of explicit_maps_globals.
+int c[100]; // Global array, mapped whole in Region 01 and as the section c[1:4] in Region 03.
+int *d; // Global pointer, mapped as a pointer in Region 02 and as the section d[2:5] in Region 04.
+
+struct ST { // Plain struct; the checks above match it as the IR type { float }.
+  float fa;
+};
+
+ST sa ; // Mapped whole in Region 05.
+ST sc[100]; // Mapped whole in Region 06 and as the section sc[1:4] in Region 08.
+ST *sd; // Mapped as a pointer in Region 07 and as the section sd[2:5] in Region 09.
+
+template<typename T>
+struct STT { // Templated counterpart of ST; STT<int> is matched above as the IR type { i32 }.
+  T fa;
+};
+
+STT<int> sta ; // Mapped whole in Region 10.
+STT<int> stc[100]; // Mapped whole in Region 11 and as the section stc[1:4] in Region 13.
+STT<int> *std; // Mapped as a pointer in Region 12; a global variable named std, not the standard namespace.
+
+// CK22-LABEL: explicit_maps_globals
+int explicit_maps_globals(void){
+  // Region 00
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast (i32* @a to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast (i32* @a to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(a)
+  { a+=1; }
+  
+  // Region 01
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x i32]* @c to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([100 x i32]* @c to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL01:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(c)
+  { c[3]+=1; }
+  
+  // Region 02
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast (i32** @d to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast (i32** @d to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL02:@.+]](i32** {{[^,]+}})
+  #pragma omp target map(d)
+  { d[3]+=1; }
+    
+  // Region 03
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x i32]* @c to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast (i32* getelementptr inbounds ([100 x i32], [100 x i32]* @c, i{{.+}} 0, i{{.+}} 1) to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL03:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(c[1:4])
+  { c[3]+=1; }
+  
+  // Region 04
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK22-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK22-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK22-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK22-DAG: [[RVAR0]] = load i32*, i32** @d
+  // CK22-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 2
+  // CK22-DAG: [[RVAR00]] = load i32*, i32** @d
+
+  // CK22: call void [[CALL04:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(d[2:5])
+  { d[3]+=1; }
+  
+  // Region 05
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[ST]]* @sa to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[ST]]* @sa to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL05:@.+]]([[ST]]* {{[^,]+}})
+  #pragma omp target map(sa)
+  { sa.fa+=1; }
+  
+  // Region 06
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE06]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE06]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[ST]]]* @sc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([100 x [[ST]]]* @sc to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL06:@.+]]([100 x [[ST]]]* {{[^,]+}})
+  #pragma omp target map(sc)
+  { sc[3].fa+=1; }
+  
+  // Region 07
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE07]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE07]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[ST]]** @sd to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[ST]]** @sd to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL07:@.+]]([[ST]]** {{[^,]+}})
+  #pragma omp target map(sd)
+  { sd[3].fa+=1; }
+  
+  // Region 08
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE08]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE08]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[ST]]]* @sc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[ST]]* getelementptr inbounds ([100 x [[ST]]], [100 x [[ST]]]* @sc, i{{.+}} 0, i{{.+}} 1) to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL08:@.+]]([100 x [[ST]]]* {{[^,]+}})
+  #pragma omp target map(sc[1:4])
+  { sc[3].fa+=1; }
+  
+  // Region 09
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE09]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK22-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK22-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[RVAR0:%.+]] to i8*
+  // CK22-DAG: [[CPVAL0]] = bitcast [[ST]]* [[SEC0:%.+]] to i8*
+  // CK22-DAG: [[RVAR0]] = load [[ST]]*, [[ST]]** @sd
+  // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[RVAR00:%.+]], i{{.+}} 2
+  // CK22-DAG: [[RVAR00]] = load [[ST]]*, [[ST]]** @sd
+
+  // CK22: call void [[CALL09:@.+]]([[ST]]* {{[^,]+}})
+  #pragma omp target map(sd[2:5])
+  { sd[3].fa+=1; }
+  
+  // Region 10
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE10]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE10]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[STT]]* @sta to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[STT]]* @sta to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL10:@.+]]([[STT]]* {{[^,]+}})
+  #pragma omp target map(sta)
+  { sta.fa+=1; }
+  
+  // Region 11
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE11]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE11]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[STT]]]* @stc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([100 x [[STT]]]* @stc to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL11:@.+]]([100 x [[STT]]]* {{[^,]+}})
+  #pragma omp target map(stc)
+  { stc[3].fa+=1; }
+  
+  // Region 12
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE12]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE12]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[STT]]** @std to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[STT]]** @std to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL12:@.+]]([[STT]]** {{[^,]+}})
+  #pragma omp target map(std)
+  { std[3].fa+=1; }
+  
+  // Region 13
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE13]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE13]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[STT]]]* @stc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[STT]]* getelementptr inbounds ([100 x [[STT]]], [100 x [[STT]]]* @stc, i{{.+}} 0, i{{.+}} 1) to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL13:@.+]]([100 x [[STT]]]* {{[^,]+}})
+  #pragma omp target map(stc[1:4])
+  { stc[3].fa+=1; }
+  
+  // Region 14
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE14]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE14]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK22-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK22-DAG: [[CBPVAL0]] = bitcast [[STT]]* [[RVAR0:%.+]] to i8*
+  // CK22-DAG: [[CPVAL0]] = bitcast [[STT]]* [[SEC0:%.+]] to i8*
+  // CK22-DAG: [[RVAR0]] = load [[STT]]*, [[STT]]** @std
+  // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[STT]]* [[RVAR00:%.+]], i{{.+}} 2
+  // CK22-DAG: [[RVAR00]] = load [[STT]]*, [[STT]]** @std
+
+  // CK22: call void [[CALL14:@.+]]([[STT]]* {{[^,]+}})
+  #pragma omp target map(std[2:5])
+  { std[3].fa+=1; }
+
+  return 0;
+}
+// CK22: define {{.+}}[[CALL00]]
+// CK22: define {{.+}}[[CALL01]]
+// CK22: define {{.+}}[[CALL02]]
+// CK22: define {{.+}}[[CALL03]]
+// CK22: define {{.+}}[[CALL04]]
+// CK22: define {{.+}}[[CALL05]]
+// CK22: define {{.+}}[[CALL06]]
+// CK22: define {{.+}}[[CALL07]]
+// CK22: define {{.+}}[[CALL08]]
+// CK22: define {{.+}}[[CALL09]]
+// CK22: define {{.+}}[[CALL10]]
+// CK22: define {{.+}}[[CALL11]]
+// CK22: define {{.+}}[[CALL12]]
+// CK22: define {{.+}}[[CALL13]]
+// CK22: define {{.+}}[[CALL14]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -std=c++11 -DCK23 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-64
+// RUN: %clang_cc1 -std=c++11 -DCK23 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -std=c++11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK23 --check-prefix CK23-64
+// RUN: %clang_cc1 -std=c++11 -DCK23 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK23 --check-prefix CK23-32
+// RUN: %clang_cc1 -std=c++11 -DCK23 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -std=c++11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK23 --check-prefix CK23-32
+#ifdef CK23
+// One SIZEnn/MTYPEnn pair per target region below; both arrays are handed to __tgt_target. Z is captured once (64 or 32) and reused so every size array must share one element width.
+// CK23: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z]] 4]
+// CK23: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK23: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK23: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK23: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK23: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK23: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+// CK23-LABEL: explicit_maps_inside_captured
+int explicit_maps_inside_captured(int a){
+  float b;
+  float c[100];
+  float *d;
+  // The lambda below captures a, b, c and d by reference; each target region maps one of them, so every mapped address is first loaded through the lambda's [[SA]] argument.
+  // CK23: call void @{{.*}}explicit_maps_inside_captured{{.*}}([[SA:%.+]]* {{.*}}) 
+  // CK23: define {{.*}}explicit_maps_inside_captured{{.*}}
+  [&](void){
+    // Region 00: map(a), the whole int; base pointer and begin pointer are the same captured i32*.
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast i32* [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load i32*, i32** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load i32*, i32** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL00:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(a)
+      { a+=1; }
+    // Region 01: map(b), the whole float; same shape as Region 00 with float* instead of i32*.
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float* [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load float*, float** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load float*, float** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL01:@.+]](float* {{[^,]+}})
+    #pragma omp target map(b)
+      { b+=1; }
+    // Region 02: map(c), the whole float[100] array (its size entry is 400).
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast [100 x float]* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast [100 x float]* [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load [100 x float]*, [100 x float]** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load [100 x float]*, [100 x float]** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL02:@.+]]([100 x float]* {{[^,]+}})
+    #pragma omp target map(c)
+      { c[3]+=1; }
+      
+    // Region 03: map(d), the pointer variable itself rather than what it points to (size entry is 8 or 4).
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast float** [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float** [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load float**, float*** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load float**, float*** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL03:@.+]](float** {{[^,]+}})
+    #pragma omp target map(d)
+      { d[3]+=1; }
+    // Region 04: map(c[2:4]), an array section; the base pointer is the array, the begin pointer is element 2.
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast [100 x float]* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float* [[SEC0:%.+]] to i8*
+    // CK23-DAG: [[SEC0]] = getelementptr {{.*}}[100 x float]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+    // CK23-DAG: [[VAR0]] = load [100 x float]*, [100 x float]** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load [100 x float]*, [100 x float]** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+    
+    // CK23: call void [[CALL04:@.+]]([100 x float]* {{[^,]+}})
+    #pragma omp target map(c[2:4])
+      { c[3]+=1; }
+      
+    // Region 05: map(d[2:4]), a section through the pointer; the base is the loaded float* value, the begin pointer is that value offset by 2.
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast float* [[RVAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float* [[SEC0:%.+]] to i8*
+    // CK23-DAG: [[RVAR0]] = load float*, float** [[VAR0:%[^,]+]]
+    // CK23-DAG: [[SEC0]] = getelementptr {{.*}}float* [[RVAR00:%.+]], i{{.+}} 2
+    // CK23-DAG: [[RVAR00]] = load float*, float** [[VAR00:%[^,]+]]
+    // CK23-DAG: [[VAR0]] = load float**, float*** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load float**, float*** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+        
+    // CK23: call void [[CALL05:@.+]](float* {{[^,]+}})
+    #pragma omp target map(d[2:4])
+      { d[3]+=1; }
+  }();
+  return b;
+}
+
+// CK23: define {{.+}}[[CALL00]]
+// CK23: define {{.+}}[[CALL01]]
+// CK23: define {{.+}}[[CALL02]]
+// CK23: define {{.+}}[[CALL03]]
+// CK23: define {{.+}}[[CALL04]]
+// CK23: define {{.+}}[[CALL05]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK24 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK24 --check-prefix CK24-64
+// RUN: %clang_cc1 -DCK24 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK24 --check-prefix CK24-64
+// RUN: %clang_cc1 -DCK24 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK24 --check-prefix CK24-32
+// RUN: %clang_cc1 -DCK24 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK24 --check-prefix CK24-32
+#ifdef CK24
+// IR types of the SC, SB and SA structs declared just below. Each type name is captured once and then reused, so a field emitted with a different type fails the match.
+// CK24-DAG: [[SC:%.+]] = type { i32, [[SB:%.+]], [[SB]]*, [10 x i32] }
+// CK24-DAG: [[SB]] = type { i32, [[SA:%.+]], [10 x [[SA]]], [10 x [[SA]]*], [[SA]]* }
+// CK24-DAG: [[SA]] = type { i32, [[SA]]*, [10 x i32] }
+
+struct SA {
+  int a;      // field 0
+  SA *p;      // field 1: pointer back to the same struct type
+  int b[10];  // field 2
+};
+struct SB {
+  int a;       // field 0
+  SA s;        // field 1: one SA held by value
+  SA sa[10];   // field 2: ten SA objects held by value
+  SA *sp[10];  // field 3: ten pointers to SA
+  SA *p;       // field 4: a single pointer to SA
+};
+struct SC {
+  int a;      // field 0
+  SB s;       // field 1: one SB held by value
+  SB *p;      // field 2: pointer to SB
+  int b[10];  // field 3
+};
+// SIZE01/MTYPE01 are the tables for Region 01, map(s.a). Z is captured once here (64 or 32) and reused by every later size-array check.
+// CK24: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z]] 4]
+// CK24: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{56|48}}]
+// CK24: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK24: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE05:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{3560|2880}}]
+// CK24: [[MTYPE05:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE06:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE06:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE07:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE07:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE08:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE08:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE09:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE09:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE10:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 8]
+// CK24: [[MTYPE10:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE11:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}]
+// CK24: [[MTYPE11:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE12:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE12:@.+]] = private {{.*}}constant [4 x i32] [i32 35, i32 19, i32 19, i32 19]
+
+// CK24: [[SIZE13:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE13:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE14:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{56|48}}]
+// CK24: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE15:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE15:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE16:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK24: [[MTYPE16:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE17:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{3560|2880}}]
+// CK24: [[MTYPE17:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE18:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE18:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE19:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE19:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE20:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE20:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE21:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE21:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE22:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK24: [[MTYPE22:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE23:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}]
+// CK24: [[MTYPE23:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE24:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE24:@.+]] = private {{.*}}constant [4 x i32] [i32 35, i32 19, i32 19, i32 19]
+
+// CK24-LABEL: explicit_maps_struct_fields
+int explicit_maps_struct_fields(int a){
+  SC s;
+  SC *p;
+
+// Region 01
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+// CK24: call void [[CALL01:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.a)
+  { s.a++; }
+  
+// Region 02
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL02:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.s)
+  { s.a++; }
+  
+// Region 03
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL03:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.s.a)
+  { s.a++; }
+
+// Region 04
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 3
+
+// CK24: call void [[CALL04:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.b[:5])
+  { s.a++; }
+  
+// Region 05
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE05]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SB]]* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24: call void [[CALL05:@.+]]([[SC]]* {{[^,]+}})  
+#pragma omp target map(s.p[:5])
+  { s.a++; }
+
+// Region 06
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE06]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE06]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[10 x [[SA]]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL06:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.sa[3].a)
+  { s.a++; }
+  
+// Region 07
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE07]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE07]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC1111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL07:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.s.sp[3]->a)
+  { s.a++; }
+
+// Region 08
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE08]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE08]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24: call void [[CALL08:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.p->a)
+  { s.a++; }
+  
+// Region 09
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE09]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SB]]* [[SEC1111:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL09:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.s.p->a)
+  { s.a++; }
+
+// Region 10
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE10]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE10]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SA]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL10:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.s.b[:2])
+  { s.a++; }
+  
+// Region 11
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE11]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE11]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[10 x i32]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = getelementptr {{.*}}[[SA]]* [[SEC111:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC111]] = load [[SA]]*, [[SA]]** [[SEC1111:%[^,]+]],
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL11:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.s.p->b[:2])
+  { s.a++; }
+
+// Region 12
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE12]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE12]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SA]]** [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+// CK24-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+// CK24-DAG: [[CBPVAL2]] = bitcast [[SA]]** [[SEC1]] to i8*
+// CK24-DAG: [[CPVAL2]] = bitcast [[SA]]** [[SEC2:%.+]] to i8*
+// CK24-DAG: [[SEC2]] = getelementptr {{.*}}[[SA]]* [[SEC22:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC22]] = load [[SA]]*, [[SA]]** [[SEC222:%[^,]+]],
+// CK24-DAG: [[SEC222]] = getelementptr {{.*}}[[SB]]* [[SEC2222:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC2222]] = load [[SB]]*, [[SB]]** [[SEC22222:%[^,]+]],
+// CK24-DAG: [[SEC22222]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+// CK24-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+// CK24-DAG: [[CBPVAL3]] = bitcast [[SA]]** [[SEC2]] to i8*
+// CK24-DAG: [[CPVAL3]] = bitcast i32* [[SEC3:%.+]] to i8*
+// CK24-DAG: [[SEC3]] = getelementptr {{.*}}[[SA]]* [[SEC33:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC33]] = load [[SA]]*, [[SA]]** [[SEC333:%[^,]+]],
+// CK24-DAG: [[SEC333]] = getelementptr {{.*}}[[SA]]* [[SEC3333:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC3333]] = load [[SA]]*, [[SA]]** [[SEC33333:%[^,]+]],
+// CK24-DAG: [[SEC33333]] = getelementptr {{.*}}[[SB]]* [[SEC333333:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC333333]] = load [[SB]]*, [[SB]]** [[SEC3333333:%[^,]+]],
+// CK24-DAG: [[SEC3333333]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24: call void [[CALL12:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.p->p->p->a)
+  { s.a++; }
+
+//
+// Same thing but starting from a pointer.
+//
+// Region 13
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE13]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE13]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 0
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL13:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->a)
+  { p->a++; }
+  
+// Region 14
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE14]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE14]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL14:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.s)
+  { p->a++; }
+  
+// Region 15
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE15]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE15]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL15:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.s.a)
+  { p->a++; }
+
+// Region 16
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE16]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE16]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 3
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL16:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->b[:5])
+  { p->a++; }
+  
+// Region 17
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE17]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE17]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SB]]* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL17:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->p[:5])
+  { p->a++; }
+
+// Region 18
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE18]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE18]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[10 x [[SA]]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL18:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.sa[3].a)
+  { p->a++; }
+  
+// Region 19
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE19]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE19]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC1111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL19:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->s.sp[3]->a)
+  { p->a++; }
+
+// Region 20
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE20]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE20]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL20:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->p->a)
+  { p->a++; }
+  
+// Region 21
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE21]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE21]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SB]]* [[SEC1111:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL21:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->s.p->a)
+  { p->a++; }
+
+// Region 22
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE22]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE22]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SA]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL22:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.s.b[:2])
+  { p->a++; }
+  
+// Region 23
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE23]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE23]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[10 x i32]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = getelementptr {{.*}}[[SA]]* [[SEC111:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC111]] = load [[SA]]*, [[SA]]** [[SEC1111:%[^,]+]],
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL23:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->s.p->b[:2])
+  { p->a++; }
+
+// Region 24
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE24]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE24]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SA]]** [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+// CK24-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+// CK24-DAG: [[CBPVAL2]] = bitcast [[SA]]** [[SEC1]] to i8*
+// CK24-DAG: [[CPVAL2]] = bitcast [[SA]]** [[SEC2:%.+]] to i8*
+// CK24-DAG: [[SEC2]] = getelementptr {{.*}}[[SA]]* [[SEC22:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC22]] = load [[SA]]*, [[SA]]** [[SEC222:%[^,]+]],
+// CK24-DAG: [[SEC222]] = getelementptr {{.*}}[[SB]]* [[SEC2222:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC2222]] = load [[SB]]*, [[SB]]** [[SEC22222:%[^,]+]],
+// CK24-DAG: [[SEC22222]] = getelementptr {{.*}}[[SC]]* [[VAR0000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+// CK24-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+// CK24-DAG: [[CBPVAL3]] = bitcast [[SA]]** [[SEC2]] to i8*
+// CK24-DAG: [[CPVAL3]] = bitcast i32* [[SEC3:%.+]] to i8*
+// CK24-DAG: [[SEC3]] = getelementptr {{.*}}[[SA]]* [[SEC33:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC33]] = load [[SA]]*, [[SA]]** [[SEC333:%[^,]+]],
+// CK24-DAG: [[SEC333]] = getelementptr {{.*}}[[SA]]* [[SEC3333:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC3333]] = load [[SA]]*, [[SA]]** [[SEC33333:%[^,]+]],
+// CK24-DAG: [[SEC33333]] = getelementptr {{.*}}[[SB]]* [[SEC333333:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC333333]] = load [[SB]]*, [[SB]]** [[SEC3333333:%[^,]+]],
+// CK24-DAG: [[SEC3333333]] = getelementptr {{.*}}[[SC]]* [[VAR00000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR0000]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL24:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->p->p->p->a)
+  { p->a++; }
+
+  return s.a;
+}
+
+// CK24: define {{.+}}[[CALL01]]
+// CK24: define {{.+}}[[CALL02]]
+// CK24: define {{.+}}[[CALL03]]
+// CK24: define {{.+}}[[CALL04]]
+// CK24: define {{.+}}[[CALL05]]
+// CK24: define {{.+}}[[CALL06]]
+// CK24: define {{.+}}[[CALL07]]
+// CK24: define {{.+}}[[CALL08]]
+// CK24: define {{.+}}[[CALL09]]
+// CK24: define {{.+}}[[CALL10]]
+// CK24: define {{.+}}[[CALL11]]
+// CK24: define {{.+}}[[CALL12]]
+// CK24: define {{.+}}[[CALL13]]
+// CK24: define {{.+}}[[CALL14]]
+// CK24: define {{.+}}[[CALL15]]
+// CK24: define {{.+}}[[CALL16]]
+// CK24: define {{.+}}[[CALL17]]
+// CK24: define {{.+}}[[CALL18]]
+// CK24: define {{.+}}[[CALL19]]
+// CK24: define {{.+}}[[CALL20]]
+// CK24: define {{.+}}[[CALL21]]
+// CK24: define {{.+}}[[CALL22]]
+// CK24: define {{.+}}[[CALL23]]
+// CK24: define {{.+}}[[CALL24]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK25 -std=c++11 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK25 --check-prefix CK25-64
+// RUN: %clang_cc1 -DCK25 -std=c++11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK25 --check-prefix CK25-64
+// RUN: %clang_cc1 -DCK25 -std=c++11 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK25 --check-prefix CK25-32
+// RUN: %clang_cc1 -DCK25 -std=c++11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK25 --check-prefix CK25-32
+#ifdef CK25
+// CK25: [[ST:%.+]] = type { i32, float }
+// CK25: [[CA00:%.+]] = type { [[ST]]* }
+// CK25: [[CA01:%.+]] = type { i32* }
+
+// CK25: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK25: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK25: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK25: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK25-LABEL: explicit_maps_with_inner_lambda
+
+template <int X, typename T>
+struct CC { // Test fixture; the template parameter X is not referenced inside this struct.
+  T A;
+  float B;
+
+  int foo(T arg) { // Emits two target regions (member A, then parameter arg) and returns A+arg.
+    // Region 00: maps member A with map(to:); one map entry is expected, based on the object, pointing at field 0.
+    // CK25-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK25-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK25-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK25-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK25-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK25-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK25-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK25-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0:%.+]], i{{.+}} 0, i{{.+}} 0
+
+    // CK25: call void [[CALL00:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(to:A)
+    {
+      [&]() { // By-reference lambda, invoked immediately inside the target region.
+        A += 1;
+      }();
+    }
+
+    // Region 01: maps the parameter arg with map(to:); base and pointer are expected to be bitcasts of the same i32*.
+    // CK25-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK25-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK25-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK25-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK25-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK25-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+    // CK25-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+    // CK25: call void [[CALL01:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(to:arg)
+    {
+      [&]() { // Same pattern as Region 00, but the lambda captures the mapped parameter.
+        arg += 1;
+      }();
+    }
+
+    return A+arg;
+  }
+};
+
+int explicit_maps_with_inner_lambda(int a){  // entry point: instantiates CC<123,int> and forwards a to foo()
+  CC<123,int> c;  // no initializer, so c.A and c.B are not set before foo() runs
+  return c.foo(a);
+}
+
+// CK25: define {{.+}}[[CALL00]]([[ST]]* [[VAL:%.+]])
+// CK25: store [[ST]]* [[VAL]], [[ST]]** [[VALADDR:%[^,]+]],
+// CK25: [[VAL1:%.+]] = load [[ST]]*, [[ST]]** [[VALADDR]],
+// CK25: [[VALADDR1:%.+]] = getelementptr inbounds [[CA00]], [[CA00]]* [[CA:%[^,]+]], i32 0, i32 0
+// CK25: store [[ST]]* [[VAL1]], [[ST]]** [[VALADDR1]],
+// CK25: call void {{.*}}[[LAMBDA:@.+]]{{.*}}([[CA00]]* [[CA]])
+
+// CK25: define {{.+}}[[LAMBDA]]
+
+// CK25: define {{.+}}[[CALL01]](i32* {{.*}}[[VAL:%.+]])
+// CK25: store i32* [[VAL]], i32** [[VALADDR:%[^,]+]],
+// CK25: [[VAL1:%.+]] = load i32*, i32** [[VALADDR]],
+// CK25: [[VALADDR1:%.+]] = getelementptr inbounds [[CA01]], [[CA01]]* [[CA:%[^,]+]], i32 0, i32 0
+// CK25: store i32* [[VAL1]], i32** [[VALADDR1]],
+// CK25: call void {{.*}}[[LAMBDA]]{{.*}}([[CA01]]* [[CA]])
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK26 -std=c++11 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64
+// RUN: %clang_cc1 -DCK26 -std=c++11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK26 --check-prefix CK26-64
+// RUN: %clang_cc1 -DCK26 -std=c++11 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK26 --check-prefix CK26-32
+// RUN: %clang_cc1 -DCK26 -std=c++11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK26 --check-prefix CK26-32
+#ifdef CK26
+// CK26: [[ST:%.+]] = type { i32, float*, i32, float* }
+
+// CK26: [[SIZE00:@.+]] = private {{.*}}constant [2 x i[[Z:64|32]]] [i[[Z:64|32]] {{32|16}}, i[[Z:64|32]] 4]
+// CK26: [[MTYPE00:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26: [[SIZE01:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{32|16}}, i[[Z]] 4]
+// CK26: [[MTYPE01:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26: [[SIZE02:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{32|16}}, i[[Z]] 4]
+// CK26: [[MTYPE02:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26: [[SIZE03:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{32|16}}, i[[Z]] 4]
+// CK26: [[MTYPE03:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26-LABEL: explicit_maps_with_private_class_members
+
+struct CC {  // Fixture: the constructor maps each member from inside a parallel region that privatizes it.
+  int fA;  // firstprivate on the parallel directive in the constructor
+  float &fB;  // firstprivate on the parallel directive in the constructor
+  int pA;  // private on the parallel directive in the constructor
+  float &pB;  // private on the parallel directive in the constructor
+
+  CC(float &B) : fB(B), pB(B) {  // both reference members alias B; fA and pA get no initializer
+
+    // CK26: call {{.*}}@__kmpc_fork_call{{.*}} [[OUTCALL:@.+]] to void (i32*, i32*, ...)*
+    // NOTE(review): the next text has no check prefix, so FileCheck does not match it - confirm whether a check was intended: define {{.*}}void [[OUTCALL]]
+    #pragma omp parallel firstprivate(fA,fB) private(pA,pB)
+    {
+      // Region 00: map(fA); fA is firstprivate on the enclosing parallel directive.
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load i32*, i32** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load i32*, i32** [[PVT]],
+
+      // CK26: call void [[CALL00:@.+]]([[ST]]* {{[^,]+}}, i32* {{[^,]+}})
+      #pragma omp target map(fA)
+      {
+        ++fA;
+      }
+
+      // Region 01: map(fB); fB is firstprivate on the enclosing parallel directive.
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE01]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast float* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast float* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load float*, float** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load float*, float** [[PVT]],
+
+      // CK26: call void [[CALL01:@.+]]([[ST]]* {{[^,]+}}, float* {{[^,]+}})
+      #pragma omp target map(fB)
+      {
+        fB += 1.0;
+      }
+
+      // Region 02: map(pA); pA is private on the enclosing parallel directive.
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE02]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load i32*, i32** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load i32*, i32** [[PVT]],
+
+      // CK26: call void [[CALL02:@.+]]([[ST]]* {{[^,]+}}, i32* {{[^,]+}})
+      #pragma omp target map(pA)
+      {
+        ++pA;
+      }
+
+      // Region 03: map(pB); pB is private on the enclosing parallel directive.
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE03]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast float* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast float* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load float*, float** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load float*, float** [[PVT]],
+
+      // CK26: call void [[CALL03:@.+]]([[ST]]* {{[^,]+}}, float* {{[^,]+}})
+      #pragma omp target map(pB)
+      {
+        pB += 1.0;
+      }
+    }
+  }
+
+  int foo() {  // returns the sum of the two int members
+    return fA + pA;
+  }
+};
+
+// Make sure the private instance is used in all target regions.
+// CK26: define {{.+}}[[CALL00]]({{.*}}i32*{{.*}}[[PVTARG:%.+]])
+// CK26: store i32* [[PVTARG]], i32** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load i32, i32* [[ADDR]],
+// CK26: add nsw i32 [[VAL]], 1
+
+// CK26: define {{.+}}[[CALL01]]({{.*}}float*{{.*}}[[PVTARG:%.+]])
+// CK26: store float* [[PVTARG]], float** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load float, float* [[ADDR]],
+// CK26: [[EXT:%.+]] = fpext float [[VAL]] to double
+// CK26: fadd double [[EXT]], 1.000000e+00
+
+// CK26: define {{.+}}[[CALL02]]({{.*}}i32*{{.*}}[[PVTARG:%.+]])
+// CK26: store i32* [[PVTARG]], i32** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load i32, i32* [[ADDR]],
+// CK26: add nsw i32 [[VAL]], 1
+
+// CK26: define {{.+}}[[CALL03]]({{.*}}float*{{.*}}[[PVTARG:%.+]])
+// CK26: store float* [[PVTARG]], float** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load float, float* [[ADDR]],
+// CK26: [[EXT:%.+]] = fpext float [[VAL]] to double
+// CK26: fadd double [[EXT]], 1.000000e+00
+
+int explicit_maps_with_private_class_members(){  // entry point: constructs a CC and returns c.foo()
+  float B;  // never initialized; CC's constructor binds its reference members to it
+  CC c(B);
+  return c.foo();
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK27 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK27 --check-prefix CK27-64
+// RUN: %clang_cc1 -DCK27 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK27 --check-prefix CK27-64
+// RUN: %clang_cc1 -DCK27 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK27 --check-prefix CK27-32
+// RUN: %clang_cc1 -DCK27 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK27 --check-prefix CK27-32
+#ifdef CK27
+
+// CK27: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] zeroinitializer
+// CK27: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK27: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK27: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK27: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK27: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK27: [[SIZE07:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK27: [[MTYPE07:@.+]] = private {{.*}}constant [1 x i32] [i32 288]
+
+// CK27: [[SIZE09:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 40]
+// CK27: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i32] [i32 161]
+
+// CK27-LABEL: zero_size_section_and_private_maps
+void zero_size_section_and_private_maps (int ii){  // ii is only used as the section lower bound in Region 03
+
+  // Map of a pointer.
+  int *pa;  // never assigned; the regions below only index through it
+
+  // Region 00: no map clause; pa is referenced only inside the region body.
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK27: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target
+  {
+    pa[50]++;
+  }
+
+  // Region 01: zero-length array section pa[:0] (lower bound omitted).
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK27-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK27-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK27-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK27: call void [[CALL01:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(pa[:0])
+  {
+    pa[50]++;
+  }
+
+  // Region 02: zero-length array section pa[0:0] (explicit constant lower bound).
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK27-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK27-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK27-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK27: call void [[CALL02:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(pa[0:0])
+  {
+    pa[50]++;
+  }
+
+  // Region 03: zero-length array section pa[ii:0] (lower bound taken from the parameter ii).
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK27-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK27-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} %{{.+}}
+  // CK27-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK27: call void [[CALL03:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(pa[ii:0])
+  {
+    pa[50]++;
+  }
+
+  int *pvtPtr;  // pointer used by Regions 04 and 05
+  int pvtScl;  // scalar used by Regions 06 and 07
+  int pvtArr[10];  // array used by Regions 08 and 09
+
+  // Region 04: private(pvtPtr); the check expects a __tgt_target call with 0 arguments and null arrays.
+  // CK27: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 0, i8** null, i8** null, i{{64|32}}* null, i32* null)
+  // CK27: call void [[CALL04:@.+]]()
+  #pragma omp target private(pvtPtr)
+  {
+    pvtPtr[5]++;
+  }
+
+  // Region 05: firstprivate(pvtPtr); one argument is expected.
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK27: call void [[CALL05:@.+]](i32* {{[^,]+}})
+  #pragma omp target firstprivate(pvtPtr)
+  {
+    pvtPtr[5]++;
+  }
+
+  // Region 06: private(pvtScl); again 0 arguments and null arrays are expected.
+  // CK27: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 0, i8** null, i8** null, i{{64|32}}* null, i32* null)
+  // CK27: call void [[CALL06:@.+]]()
+  #pragma omp target private(pvtScl)
+  {
+    pvtScl++;
+  }
+
+  // Region 07: firstprivate(pvtScl); the checks expect the scalar to travel as an integer converted with inttoptr.
+  // CK27-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZE07]]{{.+}}, {{.+}}[[MTYPE07]]{{.+}})
+  // CK27-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK27-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK27-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK27-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK27-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK27-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK27-DAG: [[VALBP]] = inttoptr i[[Z]] [[VAL:%.+]] to i8*
+  // CK27-DAG: [[VALP]] = inttoptr i[[Z]] [[VAL:%.+]] to i8*
+  // CK27-DAG: [[VAL]] = load i[[Z]], i[[Z]]* [[ADDR:%.+]],
+  // CK27-64-DAG: [[CADDR:%.+]] = bitcast i[[Z]]* [[ADDR]] to i32*
+  // CK27-64-DAG: store i32 {{.+}}, i32* [[CADDR]],
+
+  // CK27: call void [[CALL07:@.+]](i[[Z]] [[VAL]])
+  #pragma omp target firstprivate(pvtScl)
+  {
+    pvtScl++;
+  }
+
+  // Region 08: private(pvtArr); again 0 arguments and null arrays are expected.
+  // CK27: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 0, i8** null, i8** null, i{{64|32}}* null, i32* null)
+  // CK27: call void [[CALL08:@.+]]()
+  #pragma omp target private(pvtArr)
+  {
+    pvtArr[5]++;
+  }
+
+  // Region 09: firstprivate(pvtArr); the checks expect the array to be passed as [10 x i32]*.
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE09]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast [10 x i32]* [[VAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast [10 x i32]* [[VAR0]] to i8*
+
+  // CK27: call void [[CALL09:@.+]]([10 x i32]* {{[^,]+}})
+  #pragma omp target firstprivate(pvtArr)
+  {
+    pvtArr[5]++;
+  }
+}
+
+// CK27: define {{.+}}[[CALL00]]
+// CK27: define {{.+}}[[CALL01]]
+// CK27: define {{.+}}[[CALL02]]
+// CK27: define {{.+}}[[CALL03]]
+// CK27: define {{.+}}[[CALL04]]
+// CK27: define {{.+}}[[CALL05]]
+// CK27: define {{.+}}[[CALL06]]
+// CK27: define {{.+}}[[CALL07]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK28 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK28 --check-prefix CK28-64
+// RUN: %clang_cc1 -DCK28 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK28 --check-prefix CK28-64
+// RUN: %clang_cc1 -DCK28 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK28 --check-prefix CK28-32
+// RUN: %clang_cc1 -DCK28 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK28 --check-prefix CK28-32
+#ifdef CK28
+
+// CK28: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] {{8|4}}]
+// CK28: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK28: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK28: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK28-LABEL: explicit_maps_pointer_references
+void explicit_maps_pointer_references (int *p){  // maps a reference-to-pointer, first whole and then as an array section
+  int *&a = p;  // a aliases the pointer parameter p
+
+  // Region 00: map(a) - the reference itself is named in the map clause.
+  // CK28-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK28-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK28-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK28-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK28-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK28-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK28-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK28-DAG: [[CBPVAL0]] = bitcast i32** [[VAR0:%.+]] to i8*
+  // CK28-DAG: [[CPVAL0]] = bitcast i32** [[VAR1:%.+]] to i8*
+  // CK28-DAG: [[VAR0]] = load i32**, i32*** [[VAR00:%.+]],
+  // CK28-DAG: [[VAR1]] = load i32**, i32*** [[VAR11:%.+]],
+
+  // CK28: call void [[CALL00:@.+]](i32** {{[^,]+}})
+  #pragma omp target map(a)
+  {
+    ++a;
+  }
+
+  // Region 01: map(a[2:100]) - a 100-element section starting at index 2, taken through the reference.
+  // CK28-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK28-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK28-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK28-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK28-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK28-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK28-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK28-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK28-DAG: [[CPVAL0]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK28-DAG: [[VAR0]] = load i32*, i32** [[VAR00:%.+]],
+  // CK28-DAG: [[VAR00]] = load i32**, i32*** [[VAR000:%.+]],
+  // CK28-DAG: [[VAR1]] = getelementptr inbounds i32, i32* [[VAR11:%.+]], i{{64|32}} 2
+  // CK28-DAG: [[VAR11]] = load i32*, i32** [[VAR111:%.+]],
+  // CK28-DAG: [[VAR111]] = load i32**, i32*** [[VAR1111:%.+]],
+
+  // CK28: call void [[CALL01:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(a[2:100])
+  {
+    ++a;
+  }
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK29 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK29 --check-prefix CK29-64
+// RUN: %clang_cc1 -DCK29 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK29 --check-prefix CK29-64
+// RUN: %clang_cc1 -DCK29 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK29 --check-prefix CK29-32
+// RUN: %clang_cc1 -DCK29 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK29 --check-prefix CK29-32
+#ifdef CK29
+
+// CK29: [[SSA:%.+]] = type { double*, double** }
+// CK29: [[SSB:%.+]]  = type { [[SSA]]*, [[SSA]]** }
+
+// CK29: [[SIZE00:@.+]] = private {{.*}}constant [4 x i[[Z:64|32]]] [i[[Z:64|32]] {{8|4}}, i[[Z:64|32]] {{8|4}}, i[[Z:64|32]] {{8|4}}, i[[Z:64|32]] 80]
+// CK29: [[MTYPE00:@.+]] = private {{.*}}constant [4 x i32] [i32 35, i32 16, i32 19, i32 19]
+
+// CK29: [[SIZE01:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 80]
+// CK29: [[MTYPE01:@.+]] = private {{.*}}constant [4 x i32] [i32 32, i32 19, i32 19, i32 19]
+
+// CK29: [[SIZE02:@.+]] = private {{.*}}constant [5 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 80]
+// CK29: [[MTYPE02:@.+]] = private {{.*}}constant [5 x i32] [i32 32, i32 19, i32 16, i32 19, i32 19]
+
+struct SSA{  // holds a pointer member and a reference-to-pointer member
+  double *p;  // not initialized by the constructor below
+  double *&pr;  // reference to a double* that lives outside this object
+  SSA(double *&pr) : pr(pr) {}  // binds the member pr to the argument of the same name
+};
+
+struct SSB{
+  SSA *p;
+  SSA *&pr;
+  SSB(SSA *&pr) : pr(pr) {}
+
+  // CK29-LABEL: define {{.+}}foo
+  void foo() {
+
+    // Region 00
+    // CK29-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE00]]{{.+}})
+
+    // CK29-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK29-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK29-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK29-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK29-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK29-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK29-DAG: [[CBPVAL0]] = bitcast [[SSB]]* [[VAR0:%.+]] to i8*
+    // CK29-DAG: [[CPVAL0]] = bitcast [[SSA]]** [[VAR00:%.+]] to i8*
+    // CK29-DAG: [[VAR0]] = load [[SSB]]*, [[SSB]]** %
+    // CK29-DAG: [[VAR00]] = getelementptr inbounds [[SSB]], [[SSB]]* [[VAR0]], i32 0, i32 0
+
+    // CK29-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK29-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK29-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK29-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK29-DAG: [[CBPVAL1]] = bitcast [[SSA]]** [[VAR00]] to i8*
+    // CK29-DAG: [[CPVAL1]] = bitcast double*** [[VAR1:%.+]] to i8*
+    // CK29-DAG: [[VAR1]] = getelementptr inbounds [[SSA]], [[SSA]]* %{{.+}}, i32 0, i32 1
+
+    // CK29-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+    // CK29-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+    // CK29-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+    // CK29-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+    // CK29-DAG: [[CBPVAL2]] = bitcast double*** [[VAR1]] to i8*
+    // CK29-DAG: [[CPVAL2]] = bitcast double** [[VAR2:%.+]] to i8*
+    // CK29-DAG: [[VAR2]] = load double**, double*** [[VAR22:%.+]],
+    // CK29-DAG: [[VAR22]] = getelementptr inbounds [[SSA]], [[SSA]]* %{{.+}}, i32 0, i32 1
+
+    // CK29-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+    // CK29-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+    // CK29-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+    // CK29-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+    // CK29-DAG: [[CBPVAL3]] = bitcast double** [[VAR2]] to i8*
+    // CK29-DAG: [[CPVAL3]] = bitcast double* [[VAR3:%.+]] to i8*
+    // CK29-DAG: [[VAR3]] = getelementptr inbounds double, double* [[VAR33:%.+]], i{{.+}} 0
+    // CK29-DAG: [[VAR33]] = load double*, double** %{{.+}},
+
+    // CK29: call void [[CALL00:@.+]]([[SSB]]* {{[^,]+}})
+    #pragma omp target map(p->pr[:10])
+    {
+      p->pr++;
+    }
+
+    // Region 01
+    // CK29-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE01]]{{.+}})
+
+    // CK29-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK29-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK29-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK29-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK29-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK29-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK29-DAG: [[CBPVAL0]] = bitcast [[SSB]]* [[VAR0:%.+]] to i8*
+    // CK29-DAG: [[CPVAL0]] = bitcast [[SSA]]*** [[VAR00:%.+]] to i8*
+    // CK29-DAG: [[VAR00]] = getelementptr inbounds [[SSB]], [[SSB]]* [[VAR0]], i32 0, i32 1
+
+    // CK29-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK29-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK29-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK29-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK29-DAG: [[CBPVAL1]] = bitcast [[SSA]]*** [[VAR00]] to i8*
+    // CK29-DAG: [[CPVAL1]] = bitcast [[SSA]]** [[VAR1:%.+]] to i8*
+    // CK29-DAG: [[VAR1]] = load [[SSA]]**, [[SSA]]*** [[VAR00]],
+
+    // CK29-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+    // CK29-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+    // CK29-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+    // CK29-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+    // CK29-DAG: [[CBPVAL2]] = bitcast [[SSA]]** [[VAR1]] to i8*
+    // CK29-DAG: [[CPVAL2]] = bitcast double** [[VAR2:%.+]] to i8*
+    // CK29-DAG: [[VAR2]] = getelementptr inbounds [[SSA]], [[SSA]]* %{{.+}}, i32 0, i32 0
+
+    // CK29-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+    // CK29-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+    // CK29-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+    // CK29-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+    // CK29-DAG: [[CBPVAL3]] = bitcast double** [[VAR2]] to i8*
+    // CK29-DAG: [[CPVAL3]] = bitcast double* [[VAR3:%.+]] to i8*
+    // CK29-DAG: [[VAR3]] = getelementptr inbounds double, double* [[VAR33:%.+]], i{{.+}} 0
+    // CK29-DAG: [[VAR33]] = load double*, double** %{{.+}},
+
+    // CK29: call void [[CALL00:@.+]]([[SSB]]* {{[^,]+}})
+    #pragma omp target map(pr->p[:10])
+    {
+      pr->p++;
+    }
+
+    // Region 02
+    // CK29-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 5, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[5 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[5 x i{{.+}}]* [[MTYPE02]]{{.+}})
+
+    // CK29-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK29-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK29-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK29-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK29-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK29-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK29-DAG: [[CBPVAL0]] = bitcast [[SSB]]* [[VAR0:%.+]] to i8*
+    // CK29-DAG: [[CPVAL0]] = bitcast [[SSA]]*** [[VAR00:%.+]] to i8*
+    // CK29-DAG: [[VAR00]] = getelementptr inbounds [[SSB]], [[SSB]]* [[VAR0]], i32 0, i32 1
+
+    // CK29-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK29-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK29-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK29-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK29-DAG: [[CBPVAL1]] = bitcast [[SSA]]*** [[VAR00]] to i8*
+    // CK29-DAG: [[CPVAL1]] = bitcast [[SSA]]** [[VAR1:%.+]] to i8*
+    // CK29-DAG: [[VAR1]] = load [[SSA]]**, [[SSA]]*** [[VAR00]],
+
+    // CK29-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+    // CK29-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+    // CK29-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+    // CK29-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+    // CK29-DAG: [[CBPVAL2]] = bitcast [[SSA]]** [[VAR1]] to i8*
+    // CK29-DAG: [[CPVAL2]] = bitcast double*** [[VAR2:%.+]] to i8*
+    // CK29-DAG: [[VAR2]] = getelementptr inbounds [[SSA]], [[SSA]]* %{{.+}}, i32 0, i32 1
+
+    // CK29-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+    // CK29-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+    // CK29-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+    // CK29-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+    // CK29-DAG: [[CBPVAL3]] = bitcast double*** [[VAR2]] to i8*
+    // CK29-DAG: [[CPVAL3]] = bitcast double** [[VAR3:%.+]] to i8*
+    // CK29-DAG: [[VAR3]] = load double**, double*** [[VAR2]],
+
+    // CK29-DAG: [[BP4:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 4
+    // CK29-DAG: [[P4:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 4
+    // CK29-DAG: store i8* [[CBPVAL4:%[^,]+]], i8** [[BP4]]
+    // CK29-DAG: store i8* [[CPVAL4:%[^,]+]], i8** [[P4]]
+    // CK29-DAG: [[CBPVAL4]] = bitcast double** [[VAR3]] to i8*
+    // CK29-DAG: [[CPVAL4]] = bitcast double* [[VAR4:%.+]] to i8*
+    // CK29-DAG: [[VAR4]] = getelementptr inbounds double, double* [[VAR44:%.+]], i{{.+}} 0
+    // CK29-DAG: [[VAR44]] = load double*, double**
+
+    // CK29: call void [[CALL00:@.+]]([[SSB]]* {{[^,]+}})
+    #pragma omp target map(pr->pr[:10])
+    {
+      pr->pr++;
+    }
+  }
+};
+
+void explicit_maps_member_pointer_references(SSA *sap) { // Driver: builds one SSA and one SSB, then calls SSB::foo().
+  double *d; // never assigned before being handed to SSA's constructor
+  SSA sa(d); // constructed but not referenced again in this function
+  SSB sb(sap);
+  sb.foo(); // NOTE(review): presumably the method holding the target regions above - confirm
+}
+#endif
 #endif
diff --git a/test/OpenMP/target_map_messages.cpp b/test/OpenMP/target_map_messages.cpp
index d61e766..a3b2168 100644
--- a/test/OpenMP/target_map_messages.cpp
+++ b/test/OpenMP/target_map_messages.cpp
@@ -1,5 +1,308 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 200 %s
+// RUN: %clang_cc1 -DCCODE -verify -fopenmp -ferror-limit 200 -x c %s
+#ifdef CCODE
+void foo(int arg) { // C-mode variant (RUN line with -DCCODE -x c): contiguity checks on sections of a 3-D array.
+  const int n = 0; // used as a section lower bound in the last pragma, which carries no diagnostic
 
+  double marr[10][10][10];
+  // Each pragma below maps a section of marr; the empty block is the target region body.
+  #pragma omp target map(marr[2][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:][0:][:])
+  {}
+  #pragma omp target map(marr[:][1:][:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:][n:][:])
+  {}
+}
+#else
+template <typename T, int I>
+struct SA { // Template fixture: func() applies map clauses to members of several kinds.
+  static int ss; // made threadprivate by the pragma on the next line
+  #pragma omp threadprivate(ss) // expected-note {{defined as threadprivate or thread local}}
+  float a; // scalar member
+  int b[12]; // fixed-size array member
+  float *c; // pointer member
+  T d; // member of the template type
+  float e[I]; // array sized by the non-type template parameter
+  T *f; // pointer to the template type
+  void func(int arg) { // maps the parameter and the members above, named without an explicit 'this->'
+    #pragma omp target map(arg,a,d)
+    {}
+    #pragma omp target map(arg[2:2],a,d) // expected-error {{subscripted value is not an array or pointer}}
+    {}
+    #pragma omp target map(arg,a*2) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+    {}
+    #pragma omp target map(arg,(c+1)[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+    {}
+    #pragma omp target map(arg,a[:2],d) // expected-error {{subscripted value is not an array or pointer}}
+    {}
+    #pragma omp target map(arg,a,d[:2]) // expected-error {{subscripted value is not an array or pointer}}
+    {}
+    // Threadprivate static member.
+    #pragma omp target map(to:ss) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+    {}
+    // Array members b and e, whole and sectioned.
+    #pragma omp target map(to:b,e)
+    {}
+    #pragma omp target map(to:b,e) map(to:b) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+    {}
+    #pragma omp target map(to:b[:2],e)
+    {}
+    #pragma omp target map(to:b,e[:])
+    {}
+    #pragma omp target map(b[-1:]) // expected-error {{array section must be a subset of the original array}}
+    {}
+    #pragma omp target map(b[:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+    {}
+    // Pointer members c and f with the 'always' modifier.
+    #pragma omp target map(always, tofrom: c,f)
+    {}
+    #pragma omp target map(always, tofrom: c[1:2],f)
+    {}
+    #pragma omp target map(always, tofrom: c,f[1:2])
+    {}
+    #pragma omp target map(always, tofrom: c[:],f)   // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+    {}
+    #pragma omp target map(always, tofrom: c,f[:])   // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+    {}
+    return;
+  }
+};
+
+struct SB { // Plain aggregate: two unsigned fields, a float array, a float pointer and one accessor.
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() { // returns the address of the first element of Arr
+    return &Arr[0];
+  }
+};
+
+struct SC { // Aggregate mixing bit-fields, plain fields, arrays, nested SB objects, pointers and a reference-to-pointer.
+  unsigned A : 2; // bit-field
+  unsigned B : 3; // bit-field
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S; // nested object
+  SB ArrS[100]; // array of nested objects
+  SB *PtrS;
+  SB *&RPtrS; // reference member; bound by the constructor below
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {} // binds RPtrS to the caller's pointer; no other member is initialized here
+};
+
+union SD { // Union fixture with an unsigned and a float member.
+  unsigned A;
+  float B;
+};
+
+void SAclient(int arg) { // Instantiates SA<int,123>::func, then applies map clauses to local arrays, VLAs, pointers and struct members.
+  SA<int,123> s;
+  s.func(arg); // expected-note {{in instantiation of member function}}
+  double marr[10][10][10];
+  double marr2[5][10][1];
+  double mvla[5][arg][10];
+  double ***mptr;
+  const int n = 0;
+  const int m = 1;
+  double mvla2[5][arg][m+n+10]; // innermost extent is m+n+10 = 11
+
+  SB *p;
+
+  SD u;
+  SC r(p),t(p);
+  #pragma omp target map(r)
+  {}
+  #pragma omp target map(marr[2][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[2][3][0:2])
+  {}
+  #pragma omp target map(marr[:][:][:])
+  {}
+  #pragma omp target map(marr[:2][:][:])
+  {}
+  #pragma omp target map(marr[arg:][:][:])
+  {}
+  #pragma omp target map(marr[arg:])
+  {}
+  #pragma omp target map(marr[arg:][:arg][:]) // correct if arg is the size of dimension 2
+  {}
+  #pragma omp target map(marr[:arg][:])
+  {}
+  #pragma omp target map(marr[:arg][n:])
+  {}
+  #pragma omp target map(marr[:][:arg][n:]) // correct if arg is the size of dimension 2
+  {}
+  #pragma omp target map(marr[:][:m][n:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[n:m][:arg][n:])
+  {}
+  #pragma omp target map(marr[:2][:1][:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][1:][:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][:][:1]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][:][1:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:1][:2][:])
+  {}
+  #pragma omp target map(marr[:1][0][:])
+  {}
+  #pragma omp target map(marr[:arg][:2][:]) // correct if arg is 1
+  {}
+  #pragma omp target map(marr[:1][3:1][:2])
+  {}
+  #pragma omp target map(marr[:1][3:arg][:2]) // correct if arg is 1
+  {}
+  #pragma omp target map(marr[:1][3:2][:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][:10][:])
+  {}
+  #pragma omp target map(marr[:2][:][:5+5])
+  {}
+  #pragma omp target map(marr[:2][2+2-4:][0:5+5])
+  {}
+  // Single-element last subscript: marr's innermost extent is 10, marr2's is 1.
+  #pragma omp target map(marr[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr2[:1][:2][0])
+  {}
+  // Variable-length arrays mvla and mvla2 (their second extent is 'arg').
+  #pragma omp target map(mvla[:1][:][0]) // correct if the size of dimension 2 is 1.
+  {}
+  #pragma omp target map(mvla[:2][:arg][:]) // correct if arg is the size of dimension 2.
+  {}
+  #pragma omp target map(mvla[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}}
+   {}
+  #pragma omp target map(mvla[1][2:arg][:])
+  {}
+  #pragma omp target map(mvla[:1][:][:])
+  {}
+  #pragma omp target map(mvla2[:1][:2][:11])
+  {}
+  #pragma omp target map(mvla2[:1][:2][:10]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  // Sections rooted at the triple pointer mptr.
+  #pragma omp target map(mptr[:2][2+2-4:1][0:5+5]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(mptr[:1][:2-1][2:4-3])
+  {}
+  #pragma omp target map(mptr[:1][:arg][2:4-3]) // correct if arg is 1.
+  {}
+  #pragma omp target map(mptr[:1][:2-1][0:2])
+  {}
+  #pragma omp target map(mptr[:1][:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(mptr[:1][:][0:2]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  {}
+  #pragma omp target map(mptr[:2][:1][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  // Member-access expressions rooted at r, t, p and u.
+  #pragma omp target map(r.ArrS[0].B)
+  {}
+  #pragma omp target map(r.ArrS[:1].B) // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[:arg].B) // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[0].Arr[1:23])
+  {}
+  #pragma omp target map(r.ArrS[0].Arr[1:arg])
+  {}
+  #pragma omp target map(r.ArrS[0].Arr[arg:23])
+  {}
+  #pragma omp target map(r.ArrS[0].Error) // expected-error {{no member named 'Error' in 'SB'}}
+  {}
+  #pragma omp target map(r.ArrS[0].A, r.ArrS[1].A) // expected-error {{multiple array elements associated with the same variable are not allowed in map clauses of the same construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.ArrS[0].A, t.ArrS[1].A)
+  {}
+  #pragma omp target map(r.PtrS[0], r.PtrS->B) // expected-error {{same pointer derreferenced in multiple different ways in map clause expressions}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.RPtrS[0], r.RPtrS->B) // expected-error {{same pointer derreferenced in multiple different ways in map clause expressions}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.S.Arr[:12])
+  {}
+  #pragma omp target map(r.S.foo()[:12]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  {}
+  #pragma omp target map(r.C, r.D)
+  {}
+  #pragma omp target map(r.C, r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.C) map(r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.C, r.S)  // this would be an error only caught at runtime - Sema would have to make sure there is no way for the missing data between fields to be mapped somewhere else.
+  {}
+  #pragma omp target map(r, r.S)  // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.C, t.C)
+  {}
+  #pragma omp target map(r.A)   // expected-error {{bit fields cannot be used to specify storage in a 'map' clause}}
+  {}
+  #pragma omp target map(r.Arr)
+  {}
+  #pragma omp target map(r.Arr[3:5])
+  {}
+  #pragma omp target map(r.Ptr[3:5])
+  {}
+  #pragma omp target map(r.ArrS[3:5].A)   // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[3:5].Arr[6:7])   // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[3].Arr[6:7])
+  {}
+  #pragma omp target map(r.S.Arr[4:5])
+  {}
+  #pragma omp target map(r.S.Ptr[4:5])
+  {}
+  #pragma omp target map(r.S.Ptr[:])  // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  {}
+  #pragma omp target map((p+1)->A)  // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  {}
+  #pragma omp target map(u.B)  // expected-error {{mapped storage cannot be derived from a union}}
+  {}
+  // Map clauses on constructs nested inside enclosing 'target data' regions.
+  #pragma omp target data map(to: r.C) //expected-note {{used here}}
+  {
+    #pragma omp target map(r.D)  // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+    {}
+  }
+
+  #pragma omp target data map(to: t.Ptr) //expected-note {{used here}}
+  {
+    #pragma omp target map(t.Ptr[:23])  // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+    {}
+  }
+
+  #pragma omp target data map(to: t.C, t.D)
+  {
+  #pragma omp target data map(to: t.C)
+  {
+    #pragma omp target map(t.D)
+    {}
+  }
+  }
+  #pragma omp target data map(marr[:][:][:])
+  {
+    #pragma omp target data map(marr)
+    {}
+  }
+
+  #pragma omp target data map(to: t)
+  {
+  #pragma omp target data map(to: t.C)
+  {
+    #pragma omp target map(t.D)
+    {}
+  }
+  }
+}
 void foo() {
 }
 
@@ -62,25 +365,31 @@
   T y;
   T to, tofrom, always;
   const T (&l)[5] = da;
-
-
 #pragma omp target map // expected-error {{expected '(' after 'map'}}
+  {}
 #pragma omp target map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
 #pragma omp target map() // expected-error {{expected expression}}
+  {}
 #pragma omp target map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
 #pragma omp target map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  {}
 #pragma omp target map(to:) // expected-error {{expected expression}}
+  {}
 #pragma omp target map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  {}
 #pragma omp target map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  {}
 #pragma omp target map(x)
   foo();
 #pragma omp target map(tofrom: t[:I])
   foo();
-#pragma omp target map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
   foo();
 #pragma omp target map(T) // expected-error {{'T' does not refer to a value}}
   foo();
-#pragma omp target map(I) // expected-error 2 {{expected variable name, array element or array section}}
+#pragma omp target map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
   foo();
 #pragma omp target map(S2::S2s)
   foo();
@@ -96,42 +405,41 @@
   foo();
 #pragma omp target map(to, x)
   foo();
-#pragma omp target map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
-#pragma omp target map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected variable name, array element or array section}} 
-#pragma omp target map(argc)
-#pragma omp target map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(ca)
-#pragma omp target map(da)
-#pragma omp target map(S2::S2s)
-#pragma omp target map(S2::S2sc)
-#pragma omp target map(e, g)
-#pragma omp target map(h) // expected-error {{threadprivate variables are not allowed in map clause}}
-#pragma omp target map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
-#pragma omp target map(k), map(k[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+#pragma omp target data map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+#pragma omp target data map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target data map(argc)
+#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(ca)
+#pragma omp target data map(da)
+#pragma omp target data map(S2::S2s)
+#pragma omp target data map(S2::S2sc)
+#pragma omp target data map(e, g)
+#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+#pragma omp target data map(k) map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+#pragma omp target map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
   foo();
-#pragma omp target map(da)
+#pragma omp target data map(da)
 #pragma omp target map(da[:4])
   foo();
-#pragma omp target map(k, j, l) // expected-note 4 {{used here}}
-#pragma omp target map(k[:4]) // expected-error 2 {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target data map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
   foo();
-#pragma omp target map(k[:4], j, l[:5]) // expected-note 4 {{used here}}
-#pragma omp target map(k) // expected-error 2 {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l) // expected-error 2 {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+#pragma omp target data map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l)
   foo();
 
-#pragma omp target map(always, tofrom: x)
-#pragma omp target map(always: x) // expected-error {{missing map type}}
-#pragma omp target map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
-#pragma omp target map(always, tofrom: always, tofrom, x)
+#pragma omp target data map(always, tofrom: x)
+#pragma omp target data map(always: x) // expected-error {{missing map type}}
+#pragma omp target data map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target data map(always, tofrom: always, tofrom, x)
 #pragma omp target map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
   foo();
-
   return 0;
 }
 
@@ -147,14 +455,14 @@
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target map // expected-error {{expected '(' after 'map'}}
-#pragma omp target map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
-#pragma omp target map() // expected-error {{expected expression}}
-#pragma omp target map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
-#pragma omp target map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
-#pragma omp target map(to:) // expected-error {{expected expression}}
-#pragma omp target map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-#pragma omp target map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one map clause for '#pragma omp target data'}}
+#pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+#pragma omp target data map() // expected-error {{expected expression}}
+#pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+#pragma omp target data map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+#pragma omp target data map(to:) // expected-error {{expected expression}}
+#pragma omp target data map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target data map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
 #pragma omp target map(x)
   foo();
 #pragma omp target map(to: x)
@@ -165,43 +473,46 @@
   foo();
 #pragma omp target map(to, x)
   foo();
-#pragma omp target map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
-#pragma omp target map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
-#pragma omp target map(argc)
-#pragma omp target map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(argv[1])
-#pragma omp target map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(ca)
-#pragma omp target map(da)
-#pragma omp target map(S2::S2s)
-#pragma omp target map(S2::S2sc)
-#pragma omp target map(e, g)
-#pragma omp target map(h) // expected-error {{threadprivate variables are not allowed in map clause}}
-#pragma omp target map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
-#pragma omp target map(k), map(k[:5]) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+#pragma omp target data map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+#pragma omp target data map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target data map(argc)
+#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(argv[1])
+#pragma omp target data map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(ca)
+#pragma omp target data map(da)
+#pragma omp target data map(S2::S2s)
+#pragma omp target data map(S2::S2sc)
+#pragma omp target data map(e, g)
+#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+#pragma omp target data map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+#pragma omp target map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
   foo();
-#pragma omp target map(da)
+#pragma omp target data map(da)
 #pragma omp target map(da[:4])
   foo();
-#pragma omp target map(k, j, l) // expected-note 2 {{used here}}
-#pragma omp target map(k[:4]) // expected-error {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l[:5]) // expected-error {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target data map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l) map(l[:5]) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
   foo();
-#pragma omp target map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
-#pragma omp target map(k) // expected-error {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l) // expected-error {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note {{used here}}
+#pragma omp target data map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l)
   foo();
 
-#pragma omp target map(always, tofrom: x)
-#pragma omp target map(always: x) // expected-error {{missing map type}}
-#pragma omp target map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
-#pragma omp target map(always, tofrom: always, tofrom, x)
+#pragma omp target data map(always, tofrom: x)
+#pragma omp target data map(always: x) // expected-error {{missing map type}}
+#pragma omp target data map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target data map(always, tofrom: always, tofrom, x)
 #pragma omp target map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
   foo();
-
+#pragma omp target private(j) map(j) // expected-error {{private variable cannot be in a map clause in '#pragma omp target' directive}}  expected-note {{defined as private}}
+  {}
+#pragma omp target firstprivate(j) map(j)  // expected-error {{firstprivate variable cannot be in a map clause in '#pragma omp target' directive}} expected-note {{defined as firstprivate}}
+  {}
   return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
 }
-
+#endif
diff --git a/test/OpenMP/target_messages.cpp b/test/OpenMP/target_messages.cpp
index 86a9183..6f79f44 100644
--- a/test/OpenMP/target_messages.cpp
+++ b/test/OpenMP/target_messages.cpp
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
-// RUN: not %clang_cc1 -fopenmp -std=c++11 -omptargets=aaa-bbb-ccc-ddd -o - %s 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=aaa-bbb-ccc-ddd -o - %s 2>&1 | FileCheck %s
 // CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd'
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx64-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
+// CHECK-UNSUPPORTED-HOST-TARGET: error: The target '{{nvptx64-nvidia-cuda|nvptx-nvidia-cuda}}' is not a supported OpenMP host target.
 
 void foo() {
 }
@@ -21,6 +24,7 @@
   #pragma omp target } // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
   foo();
   #pragma omp target
+  foo();
   // expected-warning@+1 {{extra tokens at the end of '#pragma omp target' are ignored}}
   #pragma omp target unknown()
   foo();
diff --git a/test/OpenMP/target_nowait_messages.cpp b/test/OpenMP/target_nowait_messages.cpp
new file mode 100644
index 0000000..7531c81
--- /dev/null
+++ b/test/OpenMP/target_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty callee; main() uses a call to it as the statement under each directive.
+}
+
+int main(int argc, char **argv) { // Exercises 'nowait' on '#pragma omp target'; text written after the bare keyword is annotated as ignored extra tokens.
+  #pragma omp target nowait( // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+  #pragma omp target nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+  #pragma omp target nowait device (-10u)
+  foo(); // Bare 'nowait' followed by a device clause: the directive above carries no diagnostic annotation.
+  #pragma omp target nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_ast_print.cpp b/test/OpenMP/target_parallel_ast_print.cpp
new file mode 100644
index 0000000..1c0fca5
--- /dev/null
+++ b/test/OpenMP/target_parallel_ast_print.cpp
@@ -0,0 +1,233 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {} // Empty callee; a call to it is the statement under most directives in this file.
+
+template <class T> // Helper template: a threadprivate static member plus a conversion to T.
+struct S {
+  operator T() {return T();} // Always yields a value-initialized T; tmain() passes an S<T> object to num_threads().
+  static T TS; // Static member named by the threadprivate directive on the next line.
+  #pragma omp threadprivate(TS)
+};
+
+// CHECK:      template <class T = int> struct S {
+// CHECK:        static int TS;
+// CHECK-NEXT:   #pragma omp threadprivate(S<int>::TS)
+// CHECK-NEXT: }
+// CHECK:      template <class T = char> struct S {
+// CHECK:        static char TS;
+// CHECK-NEXT:   #pragma omp threadprivate(S<char>::TS)
+// CHECK-NEXT: }
+// CHECK:      template <class T> struct S {
+// CHECK:        static T TS;
+// CHECK-NEXT:   #pragma omp threadprivate(S::TS)
+// CHECK:      };
+
+template <typename T, int C> // main() instantiates this as <int, 5> and <char, 1>.
+T tmain(T argc, T *argv) { // Applies '#pragma omp target parallel' with assorted clauses inside a template.
+  T b = argc, c, d, e, f, g; // Operands of the private/shared/reduction clauses below.
+  static T h; // Assigned under the first, clause-free directive.
+  S<T> s; // Passed to num_threads() below.
+  T arr[C][10], arr1[C]; // Array-section and array-element reduction operands.
+  T i, j, a[20]; // Operands of the map and depend clauses below.
+#pragma omp target parallel
+  h=2;
+#pragma omp target parallel default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(C) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:C][0:10])
+  foo();
+#pragma omp target parallel if (C) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(&& : g)
+  foo();
+#pragma omp target parallel if (target:argc > 0)
+  foo();
+#pragma omp target parallel if (parallel:argc > 0)
+  foo();
+#pragma omp target parallel if (C)
+  foo();
+#pragma omp target parallel map(i)
+  foo();
+#pragma omp target parallel map(a[0:10], i)
+  foo();
+#pragma omp target parallel map(to: i) map(from: j)
+  foo();
+#pragma omp target parallel map(always,alloc: i)
+  foo();
+#pragma omp target parallel nowait
+  foo();
+#pragma omp target parallel depend(in : argc, argv[i:argc], a[:])
+  foo();
+#pragma omp target parallel defaultmap(tofrom: scalar)
+  foo();
+  return 0;
+}
+
+// CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
+// CHECK-NEXT: int b = argc, c, d, e, f, g;
+// CHECK-NEXT: static int h;
+// CHECK-NEXT: S<int> s;
+// CHECK-NEXT: int arr[5][10], arr1[5];
+// CHECK-NEXT: int i, j, a[20]
+// CHECK-NEXT: #pragma omp target parallel
+// CHECK-NEXT: h = 2;
+// CHECK-NEXT: #pragma omp target parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: g)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(5)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
+// CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
+// CHECK-NEXT: char b = argc, c, d, e, f, g;
+// CHECK-NEXT: static char h;
+// CHECK-NEXT: S<char> s;
+// CHECK-NEXT: char arr[1][10], arr1[1];
+// CHECK-NEXT: char i, j, a[20]
+// CHECK-NEXT: #pragma omp target parallel
+// CHECK-NEXT: h = 2;
+// CHECK-NEXT: #pragma omp target parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(1) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:1][0:10])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(1) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(&&: g)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(1)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
+// CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T b = argc, c, d, e, f, g;
+// CHECK-NEXT: static T h;
+// CHECK-NEXT: S<T> s;
+// CHECK-NEXT: T arr[C][10], arr1[C];
+// CHECK-NEXT: T i, j, a[20]
+// CHECK-NEXT: #pragma omp target parallel
+// CHECK-NEXT: h = 2;
+// CHECK-NEXT: #pragma omp target parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(C) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:C][0:10])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(C) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(&&: g)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(C)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
+
+// CHECK-LABEL: int main(int argc, char **argv) {
+int main (int argc, char **argv) { // Non-template cases; each directive and statement is followed by the pattern matched against its printed form.
+  int i, j, a[20]; // Operands of the map and depend clauses below.
+// CHECK-NEXT: int i, j, a[20]
+#pragma omp target parallel
+// CHECK-NEXT: #pragma omp target parallel
+  foo();
+// CHECK-NEXT: foo();
+#pragma omp target parallel if (argc > 0)
+// CHECK-NEXT: #pragma omp target parallel if(argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel if (target: argc > 0)
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel if (parallel: argc > 0)
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(i) if(argc>0)
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i) if(argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(i)
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(a[0:10], i)
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel nowait
+// CHECK-NEXT: #pragma omp target parallel nowait
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel depend(in : argc, argv[i:argc], a[:])
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+  foo();
+// CHECK-NEXT: foo();
+
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]); // Triggers the <int, 5> and <char, 1> instantiations of tmain.
+}
+
+extern template int S<int>::TS;
+extern template char S<char>::TS;
+
+#endif
diff --git a/test/OpenMP/target_parallel_default_messages.cpp b/test/OpenMP/target_parallel_default_messages.cpp
new file mode 100644
index 0000000..40f31b8
--- /dev/null
+++ b/test/OpenMP/target_parallel_default_messages.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) { // Malformed and duplicated 'default' clauses on 'target parallel', then default(none)/default(shared) with and without a variable use.
+  #pragma omp target parallel default // expected-error {{expected '(' after 'default'}}
+  foo();
+  #pragma omp target parallel default ( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel default () // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  foo();
+  #pragma omp target parallel default (none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel default (shared), default(shared) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'default' clause}}
+  foo();
+  #pragma omp target parallel default (x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  foo();
+
+  #pragma omp target parallel default(none)
+  ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  #pragma omp target parallel default(none)
+  foo(); // The call names no variable; no diagnostic is annotated under default(none) here.
+  #pragma omp target parallel default(shared)
+  ++argc; // No annotation: the same use of argc carries no diagnostic under default(shared).
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_defaultmap_messages.cpp b/test/OpenMP/target_parallel_defaultmap_messages.cpp
new file mode 100644
index 0000000..49e7c30
--- /dev/null
+++ b/test/OpenMP/target_parallel_defaultmap_messages.cpp
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty callee; a call to it is the statement under each directive in this file.
+}
+
+template <class T, typename S, int N, int ST> // N and ST are not referenced in the body.
+T tmain(T argc, S **argv) { // Malformed 'defaultmap' clauses inside a template; main() instantiates this as <int, char, 1, 0>.
+  #pragma omp target parallel defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo(); // The directive above is annotated with a warning only, no error.
+  #pragma omp target parallel defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Malformed 'defaultmap' clauses on 'target parallel' in a non-template function.
+  #pragma omp target parallel defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo(); // The directive above is annotated with a warning only, no error.
+  #pragma omp target parallel defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return tmain<int, char, 1, 0>(argc, argv); // Instantiates the template variant of the same cases.
+}
+
diff --git a/test/OpenMP/target_parallel_depend_messages.cpp b/test/OpenMP/target_parallel_depend_messages.cpp
new file mode 100644
index 0000000..fde940b
--- /dev/null
+++ b/test/OpenMP/target_parallel_depend_messages.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() { // Empty callee; a call to it is the statement under each directive in main().
+}
+
+bool foobool(int argc) { // Its call result is used in main() as a depend item that is not a variable, element or section.
+  return argc; // int converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector { // Class type with an overloaded subscript, used for the vec[1] and vec[1:2] depend items in main().
+  public:
+    int operator[](int index) { return 0; } // Returns 0 by value and ignores index.
+};
+
+int main(int argc, char **argv, char *env[]) { // Malformed and unsupported 'depend' clauses on 'target parallel', mixed with a few forms that carry no annotation.
+  vector vec; // Object of the class type with an overloaded operator[].
+  typedef float V __attribute__((vector_size(16))); // Vector-extension type; a[0] and a[0:1] below are annotated as errors.
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+
+  #pragma omp target parallel depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target parallel depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target parallel depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target parallel depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target parallel depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target parallel depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend (in : argv[0])
+  foo(); // Plain pointer element: the directive above carries no annotation.
+  #pragma omp target parallel depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target parallel depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target parallel depend (in : argv[-1:0])
+  foo(); // Negative lower bound with zero length: the directive above carries no annotation.
+  #pragma omp target parallel depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target parallel depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target parallel depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target parallel depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target parallel depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target parallel depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target parallel depend(in : argv[ : argc][1 : argc - 1])
+  foo(); // Two-dimensional section with explicit lengths: the directive above carries no annotation.
+  #pragma omp target parallel depend(in : arr[0])
+  foo(); // 'arr' comes from the erroneous declaration at the top; the directive above carries no further annotation.
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_device_messages.cpp b/test/OpenMP/target_parallel_device_messages.cpp
new file mode 100644
index 0000000..6c8d4c2
--- /dev/null
+++ b/test/OpenMP/target_parallel_device_messages.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty callee; a call to it is the statement under each directive in main().
+}
+
+bool foobool(int argc) { // Not called anywhere in this test file.
+  return argc; // int converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Malformed, duplicated and ill-typed 'device' clauses on 'target parallel', plus two forms that carry no annotation.
+  #pragma omp target parallel device // expected-error {{expected '(' after 'device'}}
+  foo();
+  #pragma omp target parallel device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel device () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel device (argc + argc)
+  foo(); // Integral expression: the directive above carries no annotation.
+  #pragma omp target parallel device (argc), device (argc+1) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'device' clause}}
+  foo();
+  #pragma omp target parallel device (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  foo();
+  #pragma omp target parallel device (-10u)
+  foo(); // Unsigned operand: the directive above carries no annotation, unlike device (-2).
+  #pragma omp target parallel device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_firstprivate_messages.cpp b/test/OpenMP/target_parallel_firstprivate_messages.cpp
new file mode 100644
index 0000000..dd6825a
--- /dev/null
+++ b/test/OpenMP/target_parallel_firstprivate_messages.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // Empty callee; a call to it is the statement under each directive in main().
+}
+
+bool foobool(int argc) { // Not called anywhere in this test file.
+  return argc; // int converted to bool
+}
+
+struct S1; // expected-note {{declared here}} expected-note{{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Class with public default and copy constructors; also supplies the static members used in firstprivate(S2::S2s) and firstprivate(S2::S2sc).
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(const S2 &s2):a(s2.a) { } // Public copy constructor.
+  static float S2s; // Declared here; no definition appears in this file.
+  static const float S2sc; // Defined right after the class.
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // Class with public default and copy constructors; type of the globals c, ca and the threadprivate h.
+  int a;
+public:
+  S3():a(0) { }
+  S3(const S3 &s3):a(s3.a) { } // Public copy constructor.
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Default and copy constructors are private and only declared; main() builds its object through S4(int).
+  int a;
+  S4(); // Private, declaration only.
+  S4(const S4 &s4); // expected-note {{implicitly declared private here}}
+public:
+  S4(int v):a(v) { } // The only public constructor.
+};
+class S5 { // Default and copy constructors are defined but private; main() builds its object through S5(int).
+  int a;
+  S5():a(0) {} // Private default constructor.
+  S5(const S5 &s5):a(s5.a) { } // expected-note {{implicitly declared private here}}
+public:
+  S5(int v):a(v) { } // The only public constructor.
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+namespace A { // Holds a namespace-scope variable that is made threadprivate.
+double x; // Named by the threadprivate directive on the next line.
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Re-exports A::x so main() can name it as B::x.
+using A::x; // B::x refers to the threadprivate A::x.
+}
+
+int main(int argc, char **argv) { // Malformed 'firstprivate' clauses on 'target parallel', then variables of various kinds as list items.
+  const int d = 5; // Listed in firstprivate (a, b, c, d, f) below.
+  const int da[5] = { 0 }; // const array; firstprivate(da) below carries no annotation.
+  S4 e(4); // Built through the public S4(int).
+  S5 g(5); // Built through the public S5(int).
+  int i;
+  int &j = i; // Reference to i; firstprivate(j) below carries no annotation.
+  static int m; // Function-local static; firstprivate(m) below carries no annotation.
+  #pragma omp target parallel firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  foo();
+  #pragma omp target parallel firstprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel firstprivate () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel firstprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel firstprivate (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel firstprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel firstprivate (argc)
+  foo(); // Well-formed clause: the directive above carries no annotation.
+  #pragma omp target parallel firstprivate (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  foo();
+  #pragma omp target parallel firstprivate (argv[1]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel firstprivate(ba)
+  foo();
+  #pragma omp target parallel firstprivate(ca)
+  foo();
+  #pragma omp target parallel firstprivate(da)
+  foo();
+  #pragma omp target parallel firstprivate(S2::S2s)
+  foo();
+  #pragma omp target parallel firstprivate(S2::S2sc)
+  foo();
+  #pragma omp target parallel firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  foo();
+  #pragma omp target parallel firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  foo();
+  #pragma omp target parallel private(i), firstprivate(i) // expected-error {{private variable cannot be firstprivate}} expected-note{{defined as private}}
+  foo();
+  #pragma omp target parallel shared(i)
+  foo();
+  #pragma omp target parallel firstprivate(i)
+  foo();
+  #pragma omp target parallel firstprivate(j)
+  foo();
+  #pragma omp target parallel firstprivate(m)
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_ast_print.cpp b/test/OpenMP/target_parallel_for_ast_print.cpp
new file mode 100644
index 0000000..6c551cb
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_ast_print.cpp
@@ -0,0 +1,252 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target parallel for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc, T *argv) {
+  T b = argc, c, d, e, f, h;
+  T arr[N][10], arr1[N];
+  T i, j;
+  T s;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for schedule(dynamic) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for schedule(dynamic) default(none) linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target parallel for private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) ordered(N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+            foo();
+  // CHECK-NEXT: #pragma omp target parallel for private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) ordered(N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: foo();
+#pragma omp target parallel for default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(N) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:N][0:10])
+  for (int i = 0; i < 2; ++i) {}
+// CHECK-NEXT: #pragma omp target parallel for default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(N) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:N][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (N) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:N][:argc]) reduction(&& : h)
+// CHECK-NEXT: #pragma omp target parallel for if(N) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:N][:argc]) reduction(&&: h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (target:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(target: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (parallel:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(parallel: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (N)
+// CHECK-NEXT: #pragma omp target parallel for if(N)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(arr1[0:10], i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: arr1[0:10],i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(to: i) map(from: j)
+// CHECK-NEXT: #pragma omp target parallel for map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(always,alloc: i)
+// CHECK-NEXT: #pragma omp target parallel for map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for nowait
+// CHECK-NEXT: #pragma omp target parallel for nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for depend(in : argc, arr[i:argc], arr1[:])
+// CHECK-NEXT: #pragma omp target parallel for depend(in : argc,arr[i:argc],arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target parallel for defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int arr[5][10], arr1[5];
+  int i, j;
+  int s;
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for schedule(guided, argc) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for schedule(guided, argc) default(none) linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target parallel for private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) ordered if (target: argc) num_threads(a) default(shared) shared(e) reduction(+ : h) linear(a:-5)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+  // CHECK-NEXT: #pragma omp target parallel for private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) ordered if(target: argc) num_threads(a) default(shared) shared(e) reduction(+: h) linear(a: -5)
+  // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: foo();
+#pragma omp target parallel for default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(5) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:5][0:10])
+  for (int i = 0; i < 2; ++i) {}
+// CHECK-NEXT: #pragma omp target parallel for default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (5) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:5][:argc]) reduction(&& : h)
+// CHECK-NEXT: #pragma omp target parallel for if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (target:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(target: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (parallel:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(parallel: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (5)
+// CHECK-NEXT: #pragma omp target parallel for if(5)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(arr1[0:10], i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: arr1[0:10],i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(to: i) map(from: j)
+// CHECK-NEXT: #pragma omp target parallel for map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(always,alloc: i)
+// CHECK-NEXT: #pragma omp target parallel for map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for nowait
+// CHECK-NEXT: #pragma omp target parallel for nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for depend(in : argc, arr[i:argc], arr1[:])
+// CHECK-NEXT: #pragma omp target parallel for depend(in : argc,arr[i:argc],arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target parallel for defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+  return (tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]));
+}
+
+#endif
diff --git a/test/OpenMP/target_parallel_for_collapse_messages.cpp b/test/OpenMP/target_parallel_for_collapse_messages.cpp
new file mode 100644
index 0000000..8cf502b
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_collapse_messages.cpp
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // expected-note 2 {{declared here}}
+  #pragma omp target parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target parallel for collapse (argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for', but found only 1}}
+  // expected-error@+3 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template 'collapse' clause checks; also instantiates tmain twice below.
+  #pragma omp target parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+  #pragma omp target parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+  #pragma omp target parallel for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  #pragma omp target parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_default_messages.cpp b/test/OpenMP/target_parallel_for_default_messages.cpp
new file mode 100644
index 0000000..c1f04f4
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_default_messages.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target parallel for default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target parallel for default(shared), default(shared) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target parallel for default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_defaultmap_messages.cpp b/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
new file mode 100644
index 0000000..24973ed
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_depend_messages.cpp b/test/OpenMP/target_parallel_for_depend_messages.cpp
new file mode 100644
index 0000000..cf17d7a
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_depend_messages.cpp
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) { // Checks 'depend' clause diagnostics on '#pragma omp target parallel for'.
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int i;
+
+  #pragma omp target parallel for depend // expected-error {{expected '(' after 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (out: ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : vec[1:2]) // expected-error {{subscripted value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[-1:0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : argv[ : argc][1 : argc - 1])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : arr[0])
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_device_messages.cpp b/test/OpenMP/target_parallel_for_device_messages.cpp
new file mode 100644
index 0000000..16a21ba
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_device_messages.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty helper; called as the body of every loop in main below.
+}
+
+bool foobool(int argc) { // Returns its int argument converted to bool.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Walks through malformed and well-formed 'device' clauses on 'target parallel for'.
+  int i; // Loop counter shared by all the loops below.
+  #pragma omp target parallel for device // expected-error {{expected '(' after 'device'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc + argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc), device (argc+1) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'device' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (-10u) // OK: -10u has unsigned type
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_firstprivate_messages.cpp b/test/OpenMP/target_parallel_for_firstprivate_messages.cpp
new file mode 100644
index 0000000..36bfe25
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_firstprivate_messages.cpp
@@ -0,0 +1,261 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty helper; called as the loop body in several cases below.
+}
+
+bool foobool(int argc) { // Returns its int argument converted to bool.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Has public default and copy constructors and two static data members.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s; // Named as S2::S2s in a firstprivate clause in main.
+  static const float S2sc; // Defined right after the class; named as S2::S2sc in main.
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // Copy assignment is private; default and copy constructors are public.
+  int a;
+  S3 &operator=(const S3 &s3); // Private, declaration only.
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Default and copy constructors are private; only S4(int) is public.
+  int a;
+  S4(); // Private, declaration only.
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Copy constructor is private; default and int constructors are public.
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Default constructor is private; copy and int constructors are public.
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C> // Instantiated from main as foomain<S4, S5>.
+int foomain(int argc, char **argv) {
+  I e(4); // e has type I and g has type C; both appear in firstprivate(e, g) below.
+  C g(5);
+  int i;
+  int &j = i; // Reference to i; appears in firstprivate(j) below.
+#pragma omp target parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc) // OK
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i; this is the variable named by firstprivate(i) just below.
+#pragma omp target parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  return 0;
+}
+
+namespace A { // Holds a threadprivate variable that namespace B re-exports.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x, which main uses in a firstprivate clause.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template firstprivate cases; finishes by instantiating foomain<S4, S5>.
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i; // Reference to i; appears in firstprivate(j) below.
+#pragma omp target parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target parallel for firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(j) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i; this is the variable named by firstprivate(i) just below.
+#pragma omp target parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  static int si;
+#pragma omp target parallel for firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_if_messages.cpp b/test/OpenMP/target_parallel_for_if_messages.cpp
new file mode 100644
index 0000000..01173c1
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_if_messages.cpp
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty helper; called as the body of every loop in tmain and main.
+}
+
+bool foobool(int argc) { // Returns its int argument converted to bool; used inside an 'if' clause below.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) { // Template variant of the 'if' clause cases; called from main as tmain(argc, argv).
+  int i; // Loop counter shared by all the loops below.
+  #pragma omp target parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if(parallel : argc) // OK: one clause per name modifier
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) { // Repeats the 'if' clause cases with int/char** arguments, then calls tmain.
+  int i; // Loop counter shared by all the loops below.
+  #pragma omp target parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_parallel_for_lastprivate_messages.cpp b/test/OpenMP/target_parallel_for_lastprivate_messages.cpp
new file mode 100644
index 0000000..c001b7f
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_lastprivate_messages.cpp
@@ -0,0 +1,238 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty helper; called as the loop body in main below.
+}
+
+bool foobool(int argc) { // Returns its int argument converted to bool.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Public copy constructor takes a non-const reference; has const and non-const copy assignment.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  S2 &operator=(const S2 &); // Declaration only.
+  const S2 &operator=(const S2 &) const; // Const-qualified overload, declaration only.
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 { // Copy assignment is private; copy constructor takes a non-const reference.
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 { // Default and copy constructors are private; only S4(int) is public.
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4); // Private, declaration only.
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Default constructor is private; copy and int constructors are public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Default constructor is private; copy and int constructors are public.
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C> // Instantiated from main as foomain<S4, S5>.
+int foomain(int argc, char **argv) {
+  I e(4); // Both e and g have type I; C is not used for any local here.
+  I g(5);
+  int i;
+  int &j = i; // Reference to i; appears in lastprivate(j) below.
+#pragma omp target parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc) // OK
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i; this is the variable named by lastprivate(i) just below.
+#pragma omp target parallel for lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A { // Holds a threadprivate variable that namespace B re-exports.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x, which main uses in a lastprivate clause.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template lastprivate cases; finishes by instantiating foomain<S4, S5>.
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i; // Reference to i; appears in lastprivate(j) below.
+#pragma omp target parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(i) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(xa)
+#pragma omp target parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel reduction(+ : xa)
+#pragma omp target parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(j) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target parallel for lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 2;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_linear_messages.cpp b/test/OpenMP/target_parallel_for_linear_messages.cpp
new file mode 100644
index 0000000..36e897d
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_linear_messages.cpp
@@ -0,0 +1,269 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X { // Supplies X::x, which test_linear_colons() names in linear clauses.
+int x;
+};
+
+struct B { // Shares its name with the local 'int B' declared in test_linear_colons().
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; } // Static member; distinct from the global bfoo() that returns 4.
+};
+
+int bfoo() { return 4; } // Global function; called as a linear step in test_linear_colons().
+
+int z; // Global named as '::z' and 'z' in the linear clauses below.
+const int C1 = 1; // C1 + C2 is used as a constant linear step.
+const int C2 = 2;
+void test_linear_colons() { // Checks how ':' and '::' are parsed inside linear clauses.
+  int B = 0; // Local variable sharing its name with struct B.
+#pragma omp target parallel for linear(B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'}}
+#pragma omp target parallel for linear(B::ib : B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+#pragma omp target parallel for linear(B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+#pragma omp target parallel for linear(z : B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(B : B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(B, ::z, X::x)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for linear(B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(B::ib, B : C1 + C2)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+
+template <int L, class T, class N> // main() instantiates this as test_template<-4, double, int>.
+T test_template(T *arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = -num * L; // expected-note {{'ind2' defined here}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type}}
+#pragma omp target parallel for linear(ind2 : L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T(); // Returns a value-initialized T; the accumulated 'sum' is not returned.
+}
+
+template <int LEN> // main() instantiates this with LEN == 0, giving a zero linear step.
+int test_warn() {
+  int ind2 = 0;
+// expected-warning@+1 {{zero linear step (ind2 should probably be const)}}
+#pragma omp target parallel for linear(ind2 : LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Default-constructible class with a mutable member; type of the const objects b and ba.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 { // Default-constructible class; type of ca and of the threadprivate global h.
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 { // Default constructor is private and only declared; S4(int) is the public one.
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Default constructor is private; S5(int) is the public one.
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C> // Template counterpart of the checks in main(); instantiated there as foomain<int, char>.
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc : 5)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for linear(a, b : B::ib)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-scope i; used as the linear step of v below.
+#pragma omp target parallel for linear(v : i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target parallel for linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  int v = 0;
+#pragma omp target parallel for linear(v : j)
+  for (int k = 0; k < argc; ++k) {
+    ++k;
+    v += j;
+  }
+#pragma omp target parallel for linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A { // Holds x, which the pragma below marks threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C { // Makes A::x reachable as C::x, the spelling main() uses in its linear clause.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template linear-clause checks; also instantiates test_template, test_warn and foomain.
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+#pragma omp target parallel for linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for linear(a, b)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{argument of a linear clause should be of integral or pointer type, not 'S4'}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+#pragma omp target parallel for linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i; // Block-scope i declared inside the enclosing parallel region.
+#pragma omp target parallel for linear(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+#pragma omp target parallel for linear(i : 4)
+    for (int k = 0; k < argc; ++k) {
+      ++k;
+      i += 4;
+    }
+  }
+#pragma omp target parallel for linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_for_loop_messages.cpp b/test/OpenMP/target_parallel_for_loop_messages.cpp
new file mode 100644
index 0000000..0e8eab1
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_loop_messages.cpp
@@ -0,0 +1,627 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S { // Copyable class with a private default constructor; used by test_loop_firstprivate_lastprivate().
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii; // Made threadprivate by the pragma below; rejected as a loop iteration variable.
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii; // Ordinary static global; used as a loop iteration variable without a diagnostic.
+
+int test_iteration_spaces() { // Walks through accepted and rejected init/condition/increment forms of the associated loop.
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target parallel for
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// Ok: redundant parentheses around the loop variable are accepted.
+#pragma omp target parallel for
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok but undefined behavior (in general, cannot check that incr
+// is really loop-invariant).
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+#pragma omp target parallel for firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as linear}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be linear, predetermined as private}}
+#pragma omp target parallel for linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be threadprivate or thread local, predetermined as private}}
+#pragma omp target parallel for
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp target parallel for
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target parallel for collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for' must be a for loop}}
+#pragma omp target parallel for
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target parallel for
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in OpenMP for-loops.
+namespace std {
+struct random_access_iterator_tag {}; // Minimal stand-in; this test file includes no headers.
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; } // NOTE(review): std::distance conventionally yields last - first; confirm this operand order is intentional.
+}
+class Iter0 { // Iterator-like type with ++, -- and < only; declares no difference_type or iterator_category.
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; } // Difference of two Iter0 values; always 0.
+class Iter1 { // Iterator-like type with ++, --, < and >=; no subtraction is declared for it in this file.
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {} // All parameters defaulted, so this also serves as the default constructor.
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter { // Iterator type declaring difference_type and iterator_category, with +=, -= and comparison members.
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; } // Allows assignment from an Iter0 (used by 'begin = begin0').
+  GoodIter &operator+=(int x) { return *this; }
+  GoodIter &operator-=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; } // Iterator difference; always 0.
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; } // Unary minus.
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); } // Iterator minus offset.
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); } // Iterator plus offset.
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); } // Offset minus iterator.
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); } // Offset plus iterator.
+
+int test_with_random_access_iterator() { // Repeats the loop-form checks with class-type iterators (GoodIter, Iter0, Iter1).
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target parallel for
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target parallel for
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target parallel for
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp target parallel for
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+// Initializer is constructor without params.
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST> // Class template whose loops step an IT iterator by the compile-time constant ST.
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) { // NOTE(review): declared int but contains no return statement.
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target parallel for
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() { // Returns ST converted to IT; used as a loop increment in dotest_gt().
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0> // Loops with a '>=' condition stepped by ST; instantiated twice from test_with_template().
+int dotest_gt(IT begin, IT end) { // NOTE(review): declared int but contains no return statement.
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp target parallel for
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() { // Instantiates TC::dotest_lt and dotest_gt with positive, negative and zero steps.
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() { // 'break' is rejected directly in the associated loop but allowed in nested loops and switches.
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() { // Checks throw/try/catch and 'return' inside the region (the RUN line enables -fcxx-exceptions).
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try {
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i];
+      }
+      throw a[i];
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i];
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i];
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+    struct S { // Local class whose member throws; no diagnostic is listed for it.
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() { // 's' is listed as both lastprivate and firstprivate; no diagnostic is listed.
+  S s(4);
+#pragma omp target parallel for lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_parallel_for_map_messages.cpp b/test/OpenMP/target_parallel_for_map_messages.cpp
new file mode 100644
index 0000000..f0019f9
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_map_messages.cpp
@@ -0,0 +1,281 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // empty helper; called as the loop body in every case below
+}
+
+bool foobool(int argc) { // returns argc converted to bool
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 { // class with static data members; map(b) and map(ba) below are annotated as not mappable
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // class without static members; map(c) and map(ca) below carry no S3 diagnostic
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { } // copy constructor takes a non-const reference
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // default and copy constructors are private and only declared; instances are built via S4(int)
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // default and copy constructors are private but defined; instances are built via S5(int)
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) { // template variant of the map-clause cases; counts of 2 presumably match the two instantiations requested from main - confirm
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always; // variables named like map-type keywords, used by the map(to), map(to: to) and map(always, ...) cases
+  const T (&l)[5] = da;
+
+
+#pragma omp target parallel for map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l[-1:]) // expected-error 2 {{array section must be a subset of the original array}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l[:-1]) // expected-error 2 {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom: t[:I])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(T) // expected-error {{'T' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target parallel for map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target parallel for map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l)
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) { // non-template version of the map-clause cases, then instantiates tmain twice
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always; // variables named like map-type keywords, used by the map(to), map(to: to) and map(always, ...) cases
+  const int (&l)[5] = da;
+
+#pragma omp target parallel for map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l[-1:]) // expected-error {{array section must be a subset of the original array}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l[:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(argv[1])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target parallel for map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note {{used here}}
+{
+#pragma omp target parallel for map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l)
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_messages.cpp b/test/OpenMP/target_parallel_for_messages.cpp
new file mode 100644
index 0000000..173025c
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_messages.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s
+
+void foo() { // empty helper; called from the loop bodies in main below
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target parallel for // expected-error {{unexpected OpenMP directive '#pragma omp target parallel for'}}
+
+int main(int argc, char **argv) { // directive-level cases: stray tokens, jumps across the region, default(none), attributes, copyin
+#pragma omp target parallel for { // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for ( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for[ // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for] // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for } // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i)
+    foo();
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for unknown()
+  for (int i = 0; i < argc; ++i)
+    foo();
+L1:
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i) {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch (argc) {
+    case (0):
+#pragma omp target parallel for
+      for (int i = 0; i < argc; ++i) {
+        foo();
+        break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+        continue; // 'continue' carries no diagnostic annotation, unlike the 'break' above
+      }
+    default:
+      break;
+    }
+  }
+#pragma omp target parallel for default(none)
+  for (int i = 0; i < 10; ++i)
+    ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i)
+  L2:
+  foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i) {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+#pragma omp target parallel for
+      for (int n = 0; n < 100; ++n) {
+  }
+
+#pragma omp target parallel for copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target parallel for'}}
+  for (int n = 0; n < 100; ++n) {}
+
+  return 0;
+}
+
+void test_ordered() { // a repeated 'ordered' clause on one directive is rejected
+#pragma omp target parallel for ordered ordered // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/target_parallel_for_misc_messages.c b/test/OpenMP/target_parallel_for_misc_messages.c
new file mode 100644
index 0000000..cfe83f1
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_misc_messages.c
@@ -0,0 +1,314 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for'}}
+#pragma omp target parallel for
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for'}}
+#pragma omp target parallel for foo
+
+void test_no_clause() { // bare directive: accepted before a for loop, rejected before any other statement
+  int i;
+#pragma omp target parallel for
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for' must be a for loop}}
+#pragma omp target parallel for
+  ++i;
+}
+
+void test_branch_protected_scope() { // jumps into, out of, and within the region
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target parallel for
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2; // jump and label are both inside the loop body; no diagnostic annotated
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1; // jump and label are both outside the loop body; no diagnostic annotated
+}
+
+void test_invalid_clause() { // unknown identifiers after the directive name produce an extra-tokens warning
+  int i;
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() { // stray ';' and ',' punctuation on the directive line produces an extra-tokens warning
+  int i, x;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for;
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+
+void test_collapse() { // 'collapse' clause: malformed syntax, bad argument values, and loop-nest depth checks
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for collapse
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+#pragma omp target parallel for collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo(); // four nested loops under collapse(4); no diagnostic annotated
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-note@+1 {{defined as firstprivate}}
+#pragma omp target parallel for collapse(2) firstprivate(i)
+  for (i = 0; i < 16; ++i)
+// expected-note@+1 {{variable with automatic storage duration is predetermined as private; perhaps you forget to enclose 'omp for' directive into a parallel or another task region?}}
+    for (int j = 0; j < 16; ++j)
+// expected-error@+2 2 {{reduction variable must be shared}}
+// expected-error@+1 {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+#pragma omp for reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_private() { // 'private' clause: malformed argument lists first, then valid one-, two- and three-variable lists
+  int i;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for private(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for private(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for private()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // the remaining cases use these variables and carry no diagnostic annotations
+#pragma omp target parallel for private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() { // 'lastprivate' clause: malformed argument lists first, then valid variable lists
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // the remaining cases use these variables and carry no diagnostic annotations
+#pragma omp target parallel for lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() { // 'firstprivate' clause: malformed argument lists first, then valid lists combined with lastprivate
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // the remaining cases name each variable in both clauses and carry no diagnostic annotations
+#pragma omp target parallel for lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() { // loop variables of type float and double are rejected
+  float a[100], b[100], c[100];
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
diff --git a/test/OpenMP/target_parallel_for_nowait_messages.cpp b/test/OpenMP/target_parallel_for_nowait_messages.cpp
new file mode 100644
index 0000000..06d2296
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_nowait_messages.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // empty helper; called as the loop body in main below
+}
+
+int main(int argc, char **argv) { // 'nowait' clause cases: anything parenthesised after 'nowait' is reported as extra tokens
+  int i;
+  #pragma omp target parallel for nowait( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for nowait device (-10u)
+  for (i = 0; i < argc; ++i) foo(); // 'nowait' followed by a device clause carries no diagnostic annotation
+  #pragma omp target parallel for nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_num_threads_messages.cpp b/test/OpenMP/target_parallel_for_num_threads_messages.cpp
new file mode 100644
index 0000000..915cfb1
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_num_threads_messages.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // empty callee; serves as the body of every loop in this test
+}
+
+bool foobool(int argc) { // passed as a num_threads argument further down
+  return argc; // implicit int -> bool conversion: true for any non-zero argc
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  T i;
+  #pragma omp target parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_ordered_messages.cpp b/test/OpenMP/target_parallel_for_ordered_messages.cpp
new file mode 100644
index 0000000..36eb837
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_ordered_messages.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // empty function; called where a for loop is required after the directive
+}
+
+bool foobool(int argc) { // not constexpr; passed as an 'ordered' argument further down
+  return argc; // implicit int -> bool conversion: true for any non-zero argc
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {                   //expected-note 2 {{declared here}}
+#pragma omp target parallel for ordered
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered() // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target parallel for ordered(argc
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for ordered(ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+3 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}}
+// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+1 2 {{expression is not an integral constant expression}}
+#pragma omp target parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+1 2 {{expression is not an integral constant expression}}
+#pragma omp target parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(1)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(N) // expected-error {{argument to 'ordered' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target parallel for ordered
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered() // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered(4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+#pragma omp target parallel for ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}  expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];            // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+#pragma omp target parallel for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}}
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp target parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target parallel for ordered(ordered(tmain < int, char, -1, -2 > (argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target parallel for ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_private_messages.cpp b/test/OpenMP/target_parallel_for_private_messages.cpp
new file mode 100644
index 0000000..1d6381a
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_private_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // empty helper function
+}
+
+bool foobool(int argc) {
+  return argc; // implicit int -> bool conversion: true for any non-zero argc
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 { // default constructor is private (declared before 'public:'); only S4(int) is accessible
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) { // privatizes its own member, spelled both as 'a' and as 'this->a'
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 { // default constructor is private (defined before 'public:'); only S5(int) is accessible
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) { // 's.a' belongs to another object, unlike 'a' and 'this->a'
+#pragma omp target parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target parallel for private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target parallel for private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_proc_bind_messages.cpp b/test/OpenMP/target_parallel_for_proc_bind_messages.cpp
new file mode 100644
index 0000000..eeb232a
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_proc_bind_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) { // malformed proc_bind clauses first, well-formed ones last
+  int i;
+#pragma omp target parallel for proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for proc_bind(master)
+  for (i = 0; i < argc; ++i) // well-formed proc_bind(master): no diagnostic listed
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target parallel for proc_bind(spread)
+  for (i = 0; i < argc; ++i) // nested under a parallel region with its own proc_bind: no diagnostic listed
+    foo();
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_reduction_messages.cpp b/test/OpenMP/target_parallel_for_reduction_messages.cpp
new file mode 100644
index 0000000..16697a9
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_reduction_messages.cpp
@@ -0,0 +1,313 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() { // empty callee; serves as the loop body under each directive in this test
+}
+
+bool foobool(int argc) {
+  return argc; // implicit int -> bool conversion: true for any non-zero argc
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) { // non-template reduction-clause cases; the final return instantiates tmain<int> and tmain<float>
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0}; // non-const array; the reference 'q' below binds to qa[i]
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl; // float operand for the '^' reduction case and the nested-parallel cases below
+#pragma omp target parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m; // function-local static; the reduction on it below carries no diagnostic directive
+#pragma omp target parallel for reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_schedule_messages.cpp b/test/OpenMP/target_parallel_for_schedule_messages.cpp
new file mode 100644
index 0000000..075e1df
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_schedule_messages.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // empty helper; main calls it as the non-loop statement placed after a pragma
+}
+
+bool foobool(int argc) { // returns its int argument converted to bool; called inside schedule chunk-size expressions
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // template form: N and ST feed loop bounds and chunk sizes; main instantiates <int, char, -1, -2> and <int, char, 1, 0>
+  #pragma omp target parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+  #pragma omp target parallel for schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc; // the value itself is not examined by any directive in this file
+}
+
+int main(int argc, char **argv) { // schedule-clause cases with literal loop bounds, followed by two tmain instantiations
+  #pragma omp target parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+1 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // non-loop statement following the pragma above
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_aligned_messages.cpp b/test/OpenMP/target_parallel_for_simd_aligned_messages.cpp
new file mode 100644
index 0000000..669cafe
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_aligned_messages.cpp
@@ -0,0 +1,203 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B { // test_aligned_colons also declares a local 'int *B', so the name is both a type and a variable there
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; } // member constexpr returning 8; a free bfoo() returning 4 is declared below
+};
+namespace X { // holds a B object that the tests pass as a list item and as an alignment value
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; } // free constexpr function used as an alignment expression
+
+int **z; // global pointer-to-pointer; referenced as z and ::z in aligned clauses
+const int C1 = 1;
+const int C2 = 2; // C1+C2 is used below as an alignment that is not a power of 2
+void test_aligned_colons(int *&rp) // exercises ':' versus '::' parsing inside aligned(list : alignment)
+{
+  int *B = 0; // local pointer with the same name as struct B
+  #pragma omp target parallel for simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  #pragma omp target parallel for simd aligned(B::ib:B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  #pragma omp target parallel for simd aligned(z:B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}}
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  #pragma omp target parallel for simd aligned(X::x : ::z)
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  #pragma omp target parallel for simd aligned(B,rp,::z: X::x)
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target parallel for simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{expected variable name}}
+  #pragma omp target parallel for simd aligned(B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  #pragma omp target parallel for simd aligned(B::ib,B:C1+C2)
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) { // L is used as the alignment; main instantiates it with L = -4
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+  // expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd aligned(arr:L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned(num:4)
+  for (i = 0; i < num; ++i);
+  return T(); // returns a value-initialized T; 'sum' is not returned
+}
+
+template<int LEN> int test_warn() { // LEN is the alignment value; main instantiates LEN = 4 and LEN = 0
+  int *ind2 = 0;
+  // expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd aligned(ind2:LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 { // class with a mutable int member and a public default constructor
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 { // class with a public default constructor; the global 'h' of this type is passed to aligned()
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 { // default constructor is declared private with no definition here; only S4(int) is public
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // default constructor is private but defined; S5(int) is public
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) { // template form of the aligned-clause cases; main instantiates foomain<int *, char>
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+  #pragma omp target parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned(e, g)
+  for (I k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  #pragma omp target parallel for simd aligned(h)
+  for (I k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned(i)
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp parallel
+  {
+    int *v = 0;
+    I i; // block-scope 'i' of type I hides the outer 'int i' inside this region
+    #pragma omp target parallel for simd aligned(v:16)
+    for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+  #pragma omp target parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+  int v = 0; // accumulates j in the loop below
+  // expected-note@+2 {{initializer of 'j' is not a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd aligned(f:j)
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+  #pragma omp target parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) { // instantiates the templates above, then repeats the aligned-clause cases with concrete types
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+  #pragma omp target parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+2 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}}
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  #pragma omp target parallel for simd aligned (a, b) 
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  #pragma omp target parallel for simd aligned(h)
+  for (int k = 0; k < argc; ++k) ++k;
+  int *pargc = &argc; // pointer argument so that I = int * in the foomain call below
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_ast_print.cpp b/test/OpenMP/target_parallel_for_simd_ast_print.cpp
new file mode 100644
index 0000000..e25f93f
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_ast_print.cpp
@@ -0,0 +1,308 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {} // empty callee used as the innermost loop body in the collapsed-loop cases
+
+struct S { // used as the template argument of S7; supplies member 'a' and the nested typedef 'type'
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type; // S7's converting constructor takes a 'typename T::type'
+};
+
+template <typename T>
+class S7 : public T { // derives from T and also declares its own member named 'a'
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) { // names the member three ways in private(): a, this->a, T::a
+#pragma omp target parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a)
+
+class S8 : public S7<S> { // non-template derived class repeating the private-clause member cases
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){ // names the inherited member as a, this->a and S7<S>::a
+#pragma omp target parallel for simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc, T *argv) { // template form: each pragma below is followed by the pretty-printed text the test looks for
+  T b = argc, c, d, e, f, h;
+  T arr[N][10], arr1[N];
+  T i, j;
+  T s;
+  static T a;
+// CHECK: static T a;
+  static T g;
+  const T clen = 5; // used in the safelen/simdlen expressions further down
+// CHECK: T clen = 5;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for simd schedule(dynamic) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for simd schedule(dynamic) default(none) linear(a)
+  for (T i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (T i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target parallel for simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) ordered(N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+            foo();
+  // CHECK-NEXT: #pragma omp target parallel for simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) ordered(N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: foo();
+
+#pragma omp target parallel for simd default(none), private(argc,b) firstprivate(argv) shared (d) if(parallel:argc > 0) num_threads(N) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:N][0:10])
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(N) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:N][0:10])
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(N) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:N][:argc]) reduction(&& : h)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(N) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:N][:argc]) reduction(&&: h)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(target:argc > 0)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(target: argc > 0)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(parallel:argc > 0)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(parallel: argc > 0)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(N)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(N)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(tofrom: i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(arr1[0:10], i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(tofrom: arr1[0:10],i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(to: i) map(from: j)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(to: i) map(from: j)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(always,alloc: i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(always,alloc: i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd nowait
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd nowait
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd depend(in : argc, arr[i:argc], arr1[:])
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd depend(in : argc,arr[i:argc],arr1[:])
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd safelen(clen-1)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd safelen(clen - 1)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd simdlen(clen-1)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd simdlen(clen - 1)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd aligned(arr1:N-1)
+  for (T i = 0; i < N; ++i) {}
+  // CHECK: #pragma omp target parallel for simd aligned(arr1: N - 1)
+  // CHECK-NEXT: for (T i = 0; i < N; ++i) {
+  // CHECK-NEXT: }
+
+  return T(); // returns a value-initialized T
+}
+
+int main(int argc, char **argv) { // Non-template cases: each directive on a plain int loop is followed by comment lines giving the printed form to match.
+  int b = argc, c, d, e, f, h;
+  int arr[5][10], arr1[5];
+  int i, j;
+  int s;
+  static int a;
+// CHECK: static int a;
+  const int clen = 5;
+// CHECK: int clen = 5;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for simd schedule(guided, argc) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for simd schedule(guided, argc) default(none) linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target parallel for simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) ordered if (target: argc) num_threads(a) default(shared) shared(e) reduction(+ : h) linear(a:-5)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+  // CHECK: #pragma omp target parallel for simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) ordered if(target: argc) num_threads(a) default(shared) shared(e) reduction(+: h) linear(a: -5)
+  // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: foo();
+
+#pragma omp target parallel for simd default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(5) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:5][0:10])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (5) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:5][:argc]) reduction(&& : h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (target:argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(target: argc > 0)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (parallel:argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(parallel: argc > 0)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (5)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(5)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd  map(tofrom: i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(arr1[0:10], i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(tofrom: arr1[0:10],i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(to: i) map(from: j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(always,alloc: i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd nowait
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd depend(in : argc, arr[i:argc], arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd depend(in : argc,arr[i:argc],arr1[:])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd safelen(clen-1)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd safelen(clen - 1)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd simdlen(clen-1)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd simdlen(clen - 1)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd aligned(arr1:4)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd aligned(arr1: 4)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+  return (tmain<int, 5>(argc, &argc)); // Instantiates the template variant as tmain<int, 5>.
+}
+
+#endif
diff --git a/test/OpenMP/target_parallel_for_simd_collapse_messages.cpp b/test/OpenMP/target_parallel_for_simd_collapse_messages.cpp
new file mode 100644
index 0000000..ecf2d6e
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_collapse_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty function; called below as a non-loop statement placed right after a directive.
+}
+
+#if __cplusplus >= 201103L
+ // expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr: the notes below flag calls to it inside collapse(...) as non-constant (C++11 and later only).
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  int j; // expected-note {{declared here}}
+  #pragma omp target parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // This loop is repeated after most directives below; the collapse clause is what varies.
+  #pragma omp target parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target parallel for simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+3 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd collapse (j=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (1) // Well-formed clause: no diagnostic is listed for it.
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template collapse cases; also instantiates tmain as <int, char, -1, -2> (inside a clause) and <int, char, 1, 0> (at the return).
+  int j; // expected-note {{declared here}}
+  #pragma omp target parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+  #pragma omp target parallel for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif 
+  #pragma omp target parallel for simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus >= 201103L
+  // expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd collapse (j=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  #pragma omp target parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_default_messages.cpp b/test/OpenMP/target_parallel_for_simd_default_messages.cpp
new file mode 100644
index 0000000..5d41cbc
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_default_messages.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) { // Covers missing, malformed, duplicated and well-formed default(...) clauses on the combined directive.
+  int i;
+#pragma omp target parallel for simd default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target parallel for simd default(shared), default(shared) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for simd default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none) // Outer default(none) with an inner default(shared): no diagnostic is listed for this nest.
+#pragma omp target parallel for simd default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp b/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
new file mode 100644
index 0000000..e922e0a
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // No-op callee used as the loop body in the cases below.
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) { // Template copy of the defaultmap cases in main(): same clauses, same listed diagnostics.
+  int i;
+  #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Malformed defaultmap(...) spellings on non-template code; ends by instantiating tmain<int, char, 1, 0>.
+  int i;
+  #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_depend_messages.cpp b/test/OpenMP/target_parallel_for_simd_depend_messages.cpp
new file mode 100644
index 0000000..a8b4de7
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_depend_messages.cpp
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // No-op callee used as the loop body in the cases below.
+}
+
+bool foobool(int argc) { // Plain function; its call result appears below as a rejected depend(...) list item.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector { // Class with an overloaded operator[]; subscripting an object of it in depend(...) is diagnosed below.
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) { // Walks through malformed and well-formed depend(...) clauses, including array-section forms.
+  vector vec;
+  typedef float V __attribute__((vector_size(16))); // Vector type: subscripts and sections on 'a' are rejected in depend(...) below.
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int i;
+
+  #pragma omp target parallel for simd depend // expected-error {{expected '(' after 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (out: ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[-1:0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : argv[ : argc][1 : argc - 1])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : arr[0])
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_device_messages.cpp b/test/OpenMP/target_parallel_for_simd_device_messages.cpp
new file mode 100644
index 0000000..2c9d43f
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_device_messages.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // No-op callee used as the loop body in the cases below.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; not referenced by any directive in this file.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) { // Covers device(...) argument parsing, type and sign checks, and the duplicate-clause case.
+  int i;
+  #pragma omp target parallel for simd device // expected-error {{expected '(' after 'device'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc + argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc), device (argc+1) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'device' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (-10u) // Unsigned literal: no diagnostic is listed for this form.
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_firstprivate_messages.cpp b/test/OpenMP/target_parallel_for_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000..9397314
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_firstprivate_messages.cpp
@@ -0,0 +1,261 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // No-op helper; called as the loop body in the tests below.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool (true when non-zero).
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Object of the incomplete type S1; firstprivate(a) is diagnosed below.
+class S2 { // Default and copy constructors are public; also declares two static float members.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b; // Const global S2 instance.
+const S2 ba[5]; // Const global array of S2.
+class S3 { // Copy-assignment operator is private; default and copy constructors are public.
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c; // Const global S3 instance.
+const S3 ca[5]; // Const global array of S3.
+extern const int f; // Declared only; no definition is visible in this file.
+class S4 { // Default and copy constructors are private; only S4(int) is public.
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Copy constructor is private; S5() and S5(int) are public.
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Default constructor is private; copy constructor and S6(int) are public.
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h; // Global S3 object; made threadprivate by the pragma on the next line.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) { // Template variant of the firstprivate checks; main() instantiates it as foomain<S4, S5>.
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i; // Reference alias of i, used as a firstprivate operand below.
+#pragma omp target parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc) // OK
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i that shadows the outer i.
+#pragma omp target parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+  return 0;
+}
+
+namespace A { // Holds a double x that the pragma below makes threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x reachable as B::x through a using-declaration.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template firstprivate checks; ends by instantiating foomain<S4, S5>.
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i; // Reference alias of i, used as a firstprivate operand below.
+#pragma omp target parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target parallel for simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i that shadows the outer i.
+#pragma omp target parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+  static int si;
+#pragma omp target parallel for simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_if_messages.cpp b/test/OpenMP/target_parallel_for_simd_if_messages.cpp
new file mode 100644
index 0000000..b9e2891
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_if_messages.cpp
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // No-op helper; called as the loop body in the tests below.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; called inside an 'if' clause below.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) { // Template variant of the 'if' clause checks; main() calls it as tmain(argc, argv).
+  int i;
+  #pragma omp target parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if(parallel : argc) // OK: one clause per name modifier
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) { // Non-template 'if' clause checks; ends by calling tmain(argc, argv).
+  int i;
+  #pragma omp target parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_parallel_for_simd_lastprivate_messages.cpp b/test/OpenMP/target_parallel_for_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000..51fc724
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_lastprivate_messages.cpp
@@ -0,0 +1,238 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // No-op helper; called as the loop body in the tests below.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool (true when non-zero).
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Object of the incomplete type S1; lastprivate(a) is diagnosed below.
+class S2 { // Copy constructor takes a non-const reference; const and non-const operator= are both declared.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  S2 &operator=(const S2 &);
+  const S2 &operator=(const S2 &) const;
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b; // Const global S2 instance.
+const S2 ba[5]; // Const global array of S2.
+class S3 { // Copy-assignment operator is private; copy constructor takes a non-const reference.
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 { // Default and copy constructors are private; only S4(int) is public.
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Default constructor is private; copy constructor and S5(int) are public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // Default constructor is private; copy constructor and S6(int) are public.
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h; // Global S3 object; made threadprivate by the pragma on the next line.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) { // Template variant of the lastprivate checks; main() instantiates it as foomain<S4, S5>.
+  I e(4);
+  I g(5); // Declared with I (not C), so e and g share one type in every instantiation.
+  int i;
+  int &j = i; // Reference alias of i, used as a lastprivate operand below.
+#pragma omp target parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc) // OK
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i that shadows the outer i.
+#pragma omp target parallel for simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A { // Holds a double x that the pragma below makes threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x reachable as B::x through a using-declaration.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template lastprivate checks; ends by instantiating foomain<S4, S5>.
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i; // Reference alias of i, used as a lastprivate operand below.
+#pragma omp target parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(i) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(xa)
+#pragma omp target parallel for simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel reduction(+ : xa)
+#pragma omp target parallel for simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(j) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target parallel for simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 2;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_linear_messages.cpp b/test/OpenMP/target_parallel_for_simd_linear_messages.cpp
new file mode 100644
index 0000000..e17f155
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_linear_messages.cpp
@@ -0,0 +1,269 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X { // Provides X::x, used as a qualified operand in the linear clauses below.
+int x;
+};
+
+struct B { // Type named B; test_linear_colons also declares a local int named B.
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; }
+};
+
+int bfoo() { return 4; } // Free function; distinct from B::bfoo, which returns 8.
+
+int z; // Global referenced as ::z and z in the linear clauses below.
+const int C1 = 1; // C1 and C2 are summed in a linear step expression below.
+const int C2 = 2;
+void test_linear_colons() { // Checks how ':' and '::' are parsed inside linear(list : step).
+  int B = 0; // Local variable that shares its name with struct B.
+#pragma omp target parallel for simd linear(B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'}}
+#pragma omp target parallel for simd linear(B::ib : B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+#pragma omp target parallel for simd linear(B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+#pragma omp target parallel for simd linear(z : B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(B : B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(B, ::z, X::x)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd linear(B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(B::ib, B : C1 + C2)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+
+template <int L, class T, class N>
+T test_template(T *arr, N num) { // Uses a T-typed variable as the linear operand; main() instantiates it with T = double.
+  N i;
+  T sum = (T)0;
+  T ind2 = -num * L; // expected-note {{'ind2' defined here}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type}}
+#pragma omp target parallel for simd linear(ind2 : L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T(); // Returns a value-initialized T; sum is not returned.
+}
+
+template <int LEN>
+int test_warn() { // Uses the template argument LEN as the linear step; main() instantiates it with LEN = 0.
+  int ind2 = 0;
+// expected-warning@+1 {{zero linear step (ind2 should probably be const)}}
+#pragma omp target parallel for simd linear(ind2 : LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a; // Object of the incomplete type S1; linear(a) is diagnosed below.
+class S2 { // Class with a public default constructor and one mutable int member.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5]; // Const global array of S2.
+class S3 { // Class with a public default constructor and one int member.
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5]; // Const global array of S3.
+class S4 { // Default constructor is private; S4(int) is public.
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // Default constructor is private; S5(int) is public.
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+};
+
+S3 h; // Global S3 object; made threadprivate by the pragma on the next line.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) { // Template variant of the linear clause checks; argc has the template type I.
+  I e(4);
+  I g(5); // Declared with I, the same type as e and argc.
+  int i;
+  int &j = i; // Reference alias of i, used as a linear operand and step below.
+#pragma omp target parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc : 5)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for simd linear(a, b : B::ib)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // Block-local i that shadows the outer i; used as the linear step below.
+#pragma omp target parallel for simd linear(v : i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  int v = 0;
+#pragma omp target parallel for simd linear(v : j)
+  for (int k = 0; k < argc; ++k) {
+    ++k;
+    v += j;
+  }
+#pragma omp target parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for simd linear(a, b)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{argument of a linear clause should be of integral or pointer type, not 'S4'}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+#pragma omp target parallel for simd linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target parallel for simd linear(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+#pragma omp target parallel for simd linear(i : 4)
+    for (int k = 0; k < argc; ++k) {
+      ++k;
+      i += 4;
+    }
+  }
+#pragma omp target parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_loop_messages.cpp b/test/OpenMP/target_parallel_for_simd_loop_messages.cpp
new file mode 100644
index 0000000..c0dceed
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_loop_messages.cpp
@@ -0,0 +1,627 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S {
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target parallel for simd
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// Ok to skip parentheses.
+#pragma omp target parallel for simd
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok but undefined behavior (in general, cannot check that incr
+// is really loop-invariant).
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+#pragma omp target parallel for simd firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as linear}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be linear, predetermined as private}}
+#pragma omp target parallel for simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for simd lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be threadprivate or thread local, predetermined as private}}
+#pragma omp target parallel for simd
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp target parallel for simd
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target parallel for simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+#pragma omp target parallel for simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target parallel for simd
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in OpenMP for-loops.
+namespace std {
+struct random_access_iterator_tag {};
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {}
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter {
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; }
+  GoodIter &operator+=(int x) { return *this; }
+  GoodIter &operator-=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target parallel for simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for simd
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target parallel for simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp target parallel for simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+// Initializer is constructor without params.
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) {
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target parallel for simd
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() {
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0>
+int dotest_gt(IT begin, IT end) {
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp target parallel for simd
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try {
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i];
+      }
+      throw a[i];
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i];
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i];
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() {
+  S s(4);
+#pragma omp target parallel for simd lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_map_messages.cpp b/test/OpenMP/target_parallel_for_simd_map_messages.cpp
new file mode 100644
index 0000000..195b39c
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_map_messages.cpp
@@ -0,0 +1,281 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty helper; called as the loop body under every directive in this test.
+}
+
+bool foobool(int argc) { // Converts its int argument to bool; not referenced anywhere else in this test file.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 { // Class with static data members; mapping b/ba (type S2) is diagnosed as "not mappable" in the tests below.
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // Class without static members; used for the globals c, ca and the threadprivate h.
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { } // copy constructor takes a non-const reference
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Default and copy constructors are private and only declared; S4(int) is the sole public constructor.
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // Like S4, but the private default and copy constructors are defined inline.
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) { // Template version of the map-clause checks; main() instantiates it twice, presumably why many counts here are 2.
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always; // variables deliberately named like map-type keywords / the 'always' modifier
+  const T (&l)[5] = da; // reference to a 5-element array, used by the array-section checks
+
+
+#pragma omp target parallel for simd map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l[-1:]) // expected-error 2 {{array section must be a subset of the original array}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l[:-1]) // expected-error 2 {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom: t[:I])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(T) // expected-error {{'T' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target parallel for simd map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target parallel for simd map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l)
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for simd map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always;
+  const int (&l)[5] = da;
+
+#pragma omp target parallel for simd map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel map(l[-1:]) // expected-error {{array section must be a subset of the original array}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel map(l[:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(argv[1])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target parallel for simd map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note {{used here}}
+{
+#pragma omp target parallel for simd map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l)
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for simd map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_messages.cpp b/test/OpenMP/target_parallel_for_simd_messages.cpp
new file mode 100644
index 0000000..0e1a0fe
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_messages.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty helper; called as the loop body in the tests below.
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target parallel for simd // expected-error {{unexpected OpenMP directive '#pragma omp target parallel for simd'}}
+
+int main(int argc, char **argv) { // Directive-level checks: stray tokens, jumps across the region, break/return, attributes, disallowed clause.
+#pragma omp target parallel for simd { // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd ( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd[ // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd] // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd } // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i)
+    foo();
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd unknown()
+  for (int i = 0; i < argc; ++i)
+    foo();
+L1:
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i) {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch (argc) {
+    case (0):
+#pragma omp target parallel for simd
+      for (int i = 0; i < argc; ++i) {
+        foo();
+        break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+        continue; // 'continue' carries no diagnostic here, unlike the 'break' above
+      }
+    default:
+      break;
+    }
+  }
+#pragma omp target parallel for simd default(none)
+  for (int i = 0; i < 10; ++i)
+    ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i)
+  L2:
+  foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i) {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+#pragma omp target parallel for simd
+      for (int n = 0; n < 100; ++n) {
+  }
+
+#pragma omp target parallel for simd copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target parallel for simd'}}
+  for (int n = 0; n < 100; ++n) {}
+
+  return 0;
+}
+
+void test_ordered() { // A repeated 'ordered' clause on one directive must be rejected.
+#pragma omp target parallel for simd ordered ordered // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_misc_messages.c b/test/OpenMP/target_parallel_for_simd_misc_messages.c
new file mode 100644
index 0000000..2adc6f8
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_misc_messages.c
@@ -0,0 +1,495 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for simd'}}
+#pragma omp target parallel for simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for simd'}}
+#pragma omp target parallel for simd foo
+
+void test_no_clause() { // Bare directive: accepted before a for loop, rejected before any other statement.
+  int i;
+#pragma omp target parallel for simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+#pragma omp target parallel for simd
+  ++i;
+}
+
+void test_branch_protected_scope() { // Jumps that cross the region boundary are diagnosed; jumps within one side are not.
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target parallel for simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2; // L2 is declared inside the same region; no diagnostic listed
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1; // L1 is declared outside the region, as is this goto; no diagnostic listed
+}
+
+void test_invalid_clause() { // Unknown identifiers after the directive name produce an "extra tokens" warning.
+  int i;
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() { // Punctuation (';' and ',') on the directive line produces an "extra tokens" warning.
+  int i, x;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd;
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+
+void test_collapse() { // 'collapse' clause: malformed argument lists, non-constant/non-positive values, and loop-nest depth.
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#pragma omp target parallel for simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo(); // four nested loops match collapse(4); no diagnostic listed
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-note@+1 {{defined as firstprivate}}
+#pragma omp target parallel for simd collapse(2) firstprivate(i)
+  for (i = 0; i < 16; ++i)
+// expected-note@+1 {{variable with automatic storage duration is predetermined as private; perhaps you forget to enclose 'omp for' directive into a parallel or another task region?}}
+    for (int j = 0; j < 16; ++j)
+// expected-error@+2 2 {{reduction variable must be shared}}
+// expected-error@+1 {{region cannot be closely nested inside 'target parallel for simd' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+#pragma omp for reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_private() { // 'private' clause: malformed lists and non-variable arguments, then valid one-, two- and three-variable lists.
+  int i;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // the remaining directives are the accepted forms; none lists a diagnostic
+#pragma omp target parallel for simd private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() { // 'lastprivate' clause: malformed lists and non-variable arguments, then valid variable lists.
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // the remaining directives are the accepted forms; none lists a diagnostic
+#pragma omp target parallel for simd lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() { // 'firstprivate' clause: malformed lists and non-variable arguments, then valid lists.
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z; // the accepted forms below pair firstprivate with lastprivate on the same variables
+#pragma omp target parallel for simd lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() { // Loop variables of floating-point type (float, double) are rejected.
+  float a[100], b[100], c[100];
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
+void test_safelen() { // 'safelen' clause: malformed argument lists and non-constant/non-positive values; safelen(4) is accepted.
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() { // Same sequence of cases as test_safelen, applied to the 'simdlen' clause.
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i) // the well-formed simdlen(4) above has no diagnostic attached
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i) // second well-formed case, again without a diagnostic
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() { // simdlen(6) combined with safelen(5) is rejected in either clause order.
+  int i;
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp target parallel for simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp target parallel for simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_nowait_messages.cpp b/test/OpenMP/target_parallel_for_simd_nowait_messages.cpp
new file mode 100644
index 0000000..3c4b512
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_nowait_messages.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty function used as the loop body in main below.
+}
+
+int main(int argc, char **argv) { // 'nowait' cases: any parenthesised text after the clause is reported as extra tokens.
+  int i;
+  #pragma omp target parallel for simd nowait( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd nowait device (-10u)
+  for (i = 0; i < argc; ++i) foo(); // 'nowait' followed by a device clause carries no diagnostic
+  #pragma omp target parallel for simd nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_num_threads_messages.cpp b/test/OpenMP/target_parallel_for_simd_num_threads_messages.cpp
new file mode 100644
index 0000000..1076b86
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_num_threads_messages.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty function used as the loop body in tmain and main below.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; used as a num_threads argument below.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // Template form of the num_threads checks; main instantiates it with N = -1 and N = 3.
+  T i;
+  #pragma omp target parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc)
+  for (i = 0; i < argc; ++i) foo(); // num_threads(argc) above is the accepted form: no diagnostic attached
+  #pragma omp target parallel for simd num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template num_threads checks; also the point where tmain gets instantiated.
+  int i;
+  #pragma omp target parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp b/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
new file mode 100644
index 0000000..70a3b4e
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
@@ -0,0 +1,122 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty function; used below as a non-loop statement after the directive.
+}
+
+#if __cplusplus >= 201103L
+ // expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr, so calls to it cannot appear in a constant expression (see the C++11 notes below).
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {                   //expected-note 2 {{declared here}}
+  int j; // expected-note {{declared here}}
+#pragma omp target parallel for simd ordered
+  for (int i = ST; i < N; i++) // 'ordered' with no argument (line above) carries no diagnostic
+    argv[0][i] = argv[0][i] - argv[0][i - ST]; // each iteration reads the element ST positions back
+#pragma omp target parallel for simd ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered() // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target parallel for simd ordered(argc
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd ordered(ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+3 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
+// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+1 2 {{expression is not an integral constant expression}}
+#pragma omp target parallel for simd ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp target parallel for simd ordered(j = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(1)
+  for (int i = ST; i < N; i++) // ordered(1) over a single loop carries no diagnostic
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(N) // expected-error {{argument to 'ordered' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template 'ordered' checks; instantiates tmain with <-1, -2> and <1, 0>.
+  int j; // expected-note {{declared here}}
+#pragma omp target parallel for simd ordered
+  for (int i = 4; i < 12; i++) // 'ordered' with no argument (line above) carries no diagnostic
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered() // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered(4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#pragma omp target parallel for simd ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}  expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];            // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target parallel for simd ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#if __cplusplus >= 201103L
+  // expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp target parallel for simd ordered(j = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target parallel for simd ordered(ordered(tmain < int, char, -1, -2 > (argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target parallel for simd ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_private_messages.cpp b/test/OpenMP/target_parallel_for_simd_private_messages.cpp
new file mode 100644
index 0000000..57262a5
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_private_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty helper; not called anywhere in the visible part of this test.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; not called in the visible part of this test.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Class with a mutable int member and a public default constructor.
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 { // Class with a private int member and a public default constructor.
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 { // Default constructor is private; privatizing an S4 object in main is diagnosed for that reason.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k) // member 'a' named directly and via this-> above: no diagnostic attached
+      ++this->a;
+  }
+};
+class S5 { // Default constructor is private; operator= checks that another object's member (s.a) is rejected.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 { // Class-template version of the member-privatization checks; main uses S6<float>.
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k) // member 'a' named directly and via this-> above: no diagnostic attached
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T { // Derives from its own template argument; main uses S7<S6<float> >, so T::a names the base member.
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k) // own member and base member T::a above: no diagnostic attached
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) { // Template form of the 'private' clause checks; main instantiates it as foomain<int, char>.
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k) // private(argc) above is a well-formed use: no diagnostic attached
+    ++k;
+#pragma omp target parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(e, g)
+  for (int k = 0; k < argc; ++k) // e and g have type I here (int in the visible instantiation), so no diagnostic
+    ++k;
+#pragma omp target parallel for simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i; // shadows the function-scope 'i' inside this parallel region
+#pragma omp target parallel for simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd private(j)
+  for (int k = 0; k < argc; ++k) // 'j' is a reference bound to 'i'; privatizing it here carries no diagnostic
+    ++k;
+#pragma omp target parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A { // Declares a double 'x' and marks it threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Re-exports A::x through a using-declaration, so B::x names the same threadprivate variable.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template 'private' clause checks; also triggers the template instantiations noted at the end.
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k) // private(argc) above is a well-formed use: no diagnostic attached
+    ++k;
+#pragma omp target parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i; // shadows the function-scope 'i' inside this parallel region
+#pragma omp target parallel for simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd private(j)
+  for (int k = 0; k < argc; ++k) // 'j' is a reference bound to 'i'; privatizing it here carries no diagnostic
+    ++k;
+#pragma omp target parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target parallel for simd private(m)
+  for (int k = 0; k < argc; ++k) // a function-local static in 'private' carries no diagnostic
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_proc_bind_messages.cpp b/test/OpenMP/target_parallel_for_simd_proc_bind_messages.cpp
new file mode 100644
index 0000000..5bb6d92
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_proc_bind_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) { // 'proc_bind' checks: missing or unknown kind, unbalanced parens, duplicate clause, then valid uses.
+  int i;
+#pragma omp target parallel for simd proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for simd proc_bind(master)
+  for (i = 0; i < argc; ++i) // proc_bind(master) above is a valid kind: no diagnostic attached
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target parallel for simd proc_bind(spread)
+  for (i = 0; i < argc; ++i) // nested under a parallel region with a different proc_bind kind: no diagnostic attached
+    foo();
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_reduction_messages.cpp b/test/OpenMP/target_parallel_for_simd_reduction_messages.cpp
new file mode 100644
index 0000000..3999d38
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_reduction_messages.cpp
@@ -0,0 +1,313 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target parallel for simd reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_safelen_messages.cpp b/test/OpenMP/target_parallel_for_simd_safelen_messages.cpp
new file mode 100644
index 0000000..f990101
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_safelen_messages.cpp
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  #pragma omp target parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target parallel for simd safelen (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd safelen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+6 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+  #pragma omp target parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (4)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target parallel for simd safelen (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+  #pragma omp target parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_schedule_messages.cpp b/test/OpenMP/target_parallel_for_simd_schedule_messages.cpp
new file mode 100644
index 0000000..f0d86e9
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_schedule_messages.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  #pragma omp target parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+  #pragma omp target parallel for simd schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for simd schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+1 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for simd schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for simd schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_simdlen_messages.cpp b/test/OpenMP/target_parallel_for_simd_simdlen_messages.cpp
new file mode 100644
index 0000000..e51e67b
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_simdlen_messages.cpp
@@ -0,0 +1,142 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty helper; main() calls it as the non-loop statement placed after a directive.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr, so calls to it are rejected inside simdlen's constant-expression argument.
+  return argc; // int is implicitly converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}} // Template half of the test: simdlen checks whose argument depends on N and ST; instantiated from main() as tmain<int, char, -1, -2> and tmain<int, char, 12, 4>.
+#pragma omp target parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target parallel for simd simdlen (argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  // expected-error@+6 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+ #pragma omp target parallel for simd simdlen (4)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  #pragma omp target parallel for simd simdlen (N) // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (2), safelen (4) // OK
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (4), safelen (4) // OK
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (8), safelen (4) // expected-error{{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template half of the test: the same simdlen checks with concrete int/char types, plus the two tmain instantiations.
+#pragma omp target parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target parallel for simd simdlen (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'simdlen' clause}}
+// expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+// expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target parallel for simd simdlen(simdlen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // deliberately not a for loop
+
+#pragma omp target parallel for simd simdlen (2), safelen (4) // OK
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (4), safelen (4) // OK
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (8), safelen (4) // expected-error{{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_if_messages.cpp b/test/OpenMP/target_parallel_if_messages.cpp
new file mode 100644
index 0000000..e22eb81
--- /dev/null
+++ b/test/OpenMP/target_parallel_if_messages.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty callee; it is the statement that follows every directive in this file.
+}
+
+bool foobool(int argc) { // Supplies a run-time boolean condition for the 'if' clause.
+  return argc; // int is implicitly converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) { // Template half of the test: 'if' clause parsing and directive-name-modifier checks with dependent argument types; called from main().
+  #pragma omp target parallel if // expected-error {{expected '(' after 'if'}}
+  foo();
+  #pragma omp target parallel if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel if (argc > 0 ? argv[1] : argv[2])
+  foo();
+  #pragma omp target parallel if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause}}
+  foo();
+  #pragma omp target parallel if (S) // expected-error {{'S' does not refer to a value}}
+  foo();
+  #pragma omp target parallel if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(argc)
+  foo();
+  #pragma omp target parallel if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(target : argc)
+  foo();
+  #pragma omp target parallel if(parallel : argc)
+  foo();
+  #pragma omp target parallel if(target : argc) if(parallel : argc)
+  foo();
+  #pragma omp target parallel if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel'}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
+  #pragma omp target parallel if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+  #pragma omp target parallel if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) { // Non-template half of the test: the same 'if' clause checks with concrete types; finishes by calling tmain(argc, argv).
+  #pragma omp target parallel if // expected-error {{expected '(' after 'if'}}
+  foo();
+  #pragma omp target parallel if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel if (argc > 0 ? argv[1] : argv[2])
+  foo();
+  #pragma omp target parallel if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause}}
+  foo();
+  #pragma omp target parallel if (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(parallel : argc)
+  foo();
+  #pragma omp target parallel if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel'}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
+  #pragma omp target parallel if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+  #pragma omp target parallel if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_parallel_map_messages.cpp b/test/OpenMP/target_parallel_map_messages.cpp
new file mode 100644
index 0000000..ca794ce
--- /dev/null
+++ b/test/OpenMP/target_parallel_map_messages.cpp
@@ -0,0 +1,280 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // Empty callee; it is the statement that follows each 'target parallel' directive in this file.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 { // Declares static data members (S2s, S2sc); the notes on those members report the type as not mappable.
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { } // copy constructor takes a non-const reference
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // No static members; public default constructor and a copy constructor taking a non-const reference.
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Default and copy constructors are private and only declared here; S4(int) is the sole public constructor.
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // Default and copy constructors are private but defined inline; S5(int) is the sole public constructor.
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) { // Template half of the test: 'map' clause checks on locals of dependent type T; main() instantiates it as tmain<int, 3> and tmain<from, 4> (from is a typedef for int).
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always; // variables named like map-type keywords, used below to exercise keyword/identifier disambiguation
+  const T (&l)[5] = da;
+
+
+#pragma omp target parallel map // expected-error {{expected '(' after 'map'}}
+  foo();
+#pragma omp target parallel map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map() // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  foo();
+#pragma omp target parallel map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(to:) // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(l[-1:]) // expected-error 2 {{array section must be a subset of the original array}}
+  foo();
+#pragma omp target parallel map(l[:-1]) // expected-error 2 {{section length is evaluated to a negative value -1}}
+  foo();
+#pragma omp target parallel map(x)
+  foo();
+#pragma omp target parallel map(tofrom: t[:I])
+  foo();
+#pragma omp target parallel map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  foo();
+#pragma omp target parallel map(T) // expected-error {{'T' does not refer to a value}}
+  foo();
+#pragma omp target parallel map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  foo();
+#pragma omp target parallel map(S2::S2s)
+  foo();
+#pragma omp target parallel map(S2::S2sc)
+  foo();
+#pragma omp target parallel map(x)
+  foo();
+#pragma omp target parallel map(to: x)
+  foo();
+#pragma omp target parallel map(to: to)
+  foo();
+#pragma omp target parallel map(to)
+  foo();
+#pragma omp target parallel map(to, x)
+  foo();
+#pragma omp target parallel map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+  foo();
+#pragma omp target parallel map(argc)
+  foo();
+#pragma omp target parallel map(S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+#pragma omp target parallel map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(ca)
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(S2::S2s)
+  foo();
+#pragma omp target parallel map(S2::S2sc)
+  foo();
+#pragma omp target parallel map(e, g)
+  foo();
+#pragma omp target parallel map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  foo();
+#pragma omp target parallel map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  foo();
+#pragma omp target parallel map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(da[:4])
+  foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target parallel map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target parallel map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l)
+  foo();
+}
+
+#pragma omp target parallel map(always, tofrom: x)
+  foo();
+#pragma omp target parallel map(always: x) // expected-error {{missing map type}}
+  foo();
+#pragma omp target parallel map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(always, tofrom: always, tofrom, x)
+  foo();
+#pragma omp target parallel map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) { // Non-template half of the test: the same 'map' clause checks on concrete int locals, then instantiates tmain twice.
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always; // variables named like map-type keywords, used below to exercise keyword/identifier disambiguation
+  const int (&l)[5] = da;
+#pragma omp target parallel map // expected-error {{expected '(' after 'map'}}
+  foo();
+#pragma omp target parallel map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map() // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  foo();
+#pragma omp target parallel map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(to:) // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(l[-1:]) // expected-error {{array section must be a subset of the original array}}
+  foo();
+#pragma omp target parallel map(l[:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+#pragma omp target parallel map(x)
+  foo();
+#pragma omp target parallel map(to: x)
+  foo();
+#pragma omp target parallel map(to: to)
+  foo();
+#pragma omp target parallel map(to)
+  foo();
+#pragma omp target parallel map(to, x)
+  foo();
+#pragma omp target parallel map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  foo();
+#pragma omp target parallel map(argc)
+  foo();
+#pragma omp target parallel map(S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+#pragma omp target parallel map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(argv[1])
+  foo();
+#pragma omp target parallel map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(ca)
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(S2::S2s)
+  foo();
+#pragma omp target parallel map(S2::S2sc)
+  foo();
+#pragma omp target parallel map(e, g)
+  foo();
+#pragma omp target parallel map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  foo();
+#pragma omp target parallel map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  foo();
+#pragma omp target parallel map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(da[:4])
+  foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target parallel map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 1 {{used here}}
+{
+#pragma omp target parallel map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l)
+  foo();
+}
+
+#pragma omp target parallel map(always, tofrom: x)
+  foo();
+#pragma omp target parallel map(always: x) // expected-error {{missing map type}}
+  foo();
+#pragma omp target parallel map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(always, tofrom: always, tofrom, x)
+  foo();
+#pragma omp target parallel map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_messages.cpp b/test/OpenMP/target_parallel_messages.cpp
new file mode 100644
index 0000000..b6763d8
--- /dev/null
+++ b/test/OpenMP/target_parallel_messages.cpp
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=aaa-bbb-ccc-ddd -o - %s 2>&1 | FileCheck %s
+// CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd'
+
+void foo() { // Empty callee used as the statement associated with the directives in main().
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target parallel // expected-error {{unexpected OpenMP directive '#pragma omp target parallel'}}
+
+int main(int argc, char **argv) { // Checks 'target parallel' itself: stray tokens after the directive, jumps into and out of the region, and a clause the directive does not accept.
+  #pragma omp target parallel { // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel ( // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel [ // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel ] // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel ) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel } // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel
+  foo();
+  // expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  #pragma omp target parallel unknown()
+  foo();
+  L1: // label declared outside any region; the goto inside the region below is diagnosed as not seeing it
+    foo();
+  #pragma omp target parallel
+  ;
+  #pragma omp target parallel
+  {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch(argc) {
+     case (0):
+      #pragma omp target parallel
+      {
+        foo();
+        break; // expected-error {{'break' statement not in loop or switch statement}}
+        continue; // expected-error {{'continue' statement not in loop statement}}
+      }
+      default:
+       break;
+    }
+  }
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+  #pragma omp target parallel
+  L2: // label declared inside the region; the goto above, outside the region, is diagnosed as not seeing it
+  foo();
+  #pragma omp target parallel
+  {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+  #pragma omp target parallel
+  for (int n = 0; n < 100; ++n) {}
+
+  #pragma omp target parallel copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target parallel'}}
+  foo();
+
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_nowait_messages.cpp b/test/OpenMP/target_parallel_nowait_messages.cpp
new file mode 100644
index 0000000..91e26f2
--- /dev/null
+++ b/test/OpenMP/target_parallel_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() { // Empty callee; it is the statement that follows every directive in main().
+}
+
+int main(int argc, char **argv) { // Checks that 'nowait' takes no argument: anything parenthesised after it is reported as extra tokens.
+  #pragma omp target parallel nowait( // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel nowait device (-10u)
+  foo();
+  #pragma omp target parallel nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_num_threads_messages.cpp b/test/OpenMP/target_parallel_num_threads_messages.cpp
new file mode 100644
index 0000000..95797ca
--- /dev/null
+++ b/test/OpenMP/target_parallel_num_threads_messages.cpp
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // Empty callee; it is the statement that follows every directive in this file.
+}
+
+bool foobool(int argc) { // Supplies a run-time bool value used as a num_threads argument.
+  return argc; // int is implicitly converted to bool
+}
+
+struct S1; // expected-note {{declared here}}
+
+#define redef_num_threads(a, b) num_threads(a)
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { // Template half of the test: num_threads checks with dependent types and the non-type parameter N; main() instantiates it with N = -1 and N = 3.
+  #pragma omp target parallel num_threads // expected-error {{expected '(' after 'num_threads'}}
+  foo();
+  #pragma omp target parallel num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  foo();
+  #pragma omp target parallel num_threads (S) // expected-error {{'S' does not refer to a value}}
+  foo();
+  #pragma omp target parallel num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel num_threads (argc)
+  foo();
+  #pragma omp target parallel num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  foo();
+  #pragma omp target parallel redef_num_threads (argc, argc) // the macro expands to num_threads(argc)
+  foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template half of the test: the same num_threads checks with concrete types, plus the tmain instantiations.
+  #pragma omp target parallel num_threads // expected-error {{expected '(' after 'num_threads'}}
+  foo();
+  #pragma omp target parallel num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  foo();
+  #pragma omp target parallel num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  foo();
+  #pragma omp target parallel num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  foo();
+  #pragma omp target parallel redef_num_threads (argc, argc) // the macro expands to num_threads(argc)
+  foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_private_messages.cpp b/test/OpenMP/target_parallel_private_messages.cpp
new file mode 100644
index 0000000..fabd37d
--- /dev/null
+++ b/test/OpenMP/target_parallel_private_messages.cpp
@@ -0,0 +1,222 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}} expected-note 1 {{forward declaration of 'S1'}} expected-note {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  static float S2s; // expected-note {{static data member is predetermined as shared}} expected-note 1 {{static data member is predetermined as shared}}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 c; // expected-note {{global variable is predetermined as shared}} expected-note 1 {{global variable is predetermined as shared}}
+const S3 ca[5]; // expected-note {{global variable is predetermined as shared}} expected-note 1 {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}} expected-note 1 {{global variable is predetermined as shared}} 
+
+int threadvar;
+#pragma omp threadprivate(threadvar) // expected-note {{defined as threadprivate or thread local}} expected-note 1 {{defined as threadprivate or thread local}}
+
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}} expected-note 1 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}} expected-note 1 {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+};
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}} expected-note 1 {{defined as threadprivate or thread local}} expected-note 2 {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C, class D, class E>
+int foomain(I argc, C **argv) {
+  const I d = 5; // expected-note {{constant variable is predetermined as shared}}
+  const I da[5] = { 0 }; // expected-note {{constant variable is predetermined as shared}}
+  D e(4);
+  E g[] = {5, 6};
+  I i;
+  I &j = i;
+#pragma omp target parallel private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target parallel private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private() // expected-error {{expected expression}}
+{}
+#pragma omp target parallel private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(argc argv) // expected-error {{expected ',' or ')' in 'private' clause}}
+{}
+#pragma omp target parallel private(argc)
+{}
+#pragma omp target parallel private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target parallel private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target parallel private (a, b, c, d, f) // expected-error {{a private variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(ba)
+{}
+#pragma omp target parallel private(ca) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(da) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(S2::S2s) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+{}
+  #pragma omp target parallel private(threadvar, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  {}
+  #pragma omp target parallel shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
+  foo();
+  #pragma omp target parallel firstprivate(i) private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
+  foo();
+  #pragma omp target parallel private(i)
+  {}
+  #pragma omp target parallel private(j)
+  foo();
+  #pragma omp parallel firstprivate(i)
+  for (int k = 0; k < 10; ++k) {
+    #pragma omp target parallel private(i)
+    foo();
+  }
+  static int m;
+  #pragma omp target parallel private(m) // OK
+  foo();
+#pragma omp target parallel private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target parallel private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+  }
+#pragma omp target parallel shared(i)
+{}
+#pragma omp target parallel private(i)
+{}
+#pragma omp target parallel private(j)
+{}
+#pragma omp target parallel private(i)
+{}
+  static int si;
+#pragma omp target parallel private(si) // OK
+  {}
+  return 0;
+}
+// Privatizes the array parameter 'a' on a 'target parallel' nested in a 'parallel' region; no diagnostics are annotated here.
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp target parallel private(a)
+  {}
+}
+
+int main(int argc, char **argv) {
+  const int d = 5; // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = { 0 }; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g[] = {5, 6};
+  int i;
+  int &j = i;
+#pragma omp target parallel private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target parallel private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private() // expected-error {{expected expression}}
+{}
+#pragma omp target parallel private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(argc argv) // expected-error {{expected ',' or ')' in 'private' clause}}
+{}
+#pragma omp target parallel private(argc)
+{}
+#pragma omp target parallel private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target parallel private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target parallel private (a, b, c, d, f) // expected-error {{a private variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(ba)
+{}
+#pragma omp target parallel private(ca) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(da) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(S2::S2s) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+{}
+  #pragma omp target parallel private(threadvar, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  {}
+  #pragma omp target parallel shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
+  foo();
+  #pragma omp target parallel firstprivate(i) private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
+  foo();
+  #pragma omp target parallel private(i)
+  {}
+  #pragma omp target parallel private(j)
+  foo();
+  #pragma omp parallel firstprivate(i)
+  for (int k = 0; k < 10; ++k) {
+    #pragma omp target parallel private(i)
+    foo();
+  }
+  static int m;
+  #pragma omp target parallel private(m) // OK
+  foo();
+#pragma omp target parallel private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target parallel private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel
+  {
+    int i;
+  }
+#pragma omp target parallel shared(i)
+{}
+#pragma omp target parallel private(i)
+{}
+#pragma omp target parallel private(j)
+{}
+#pragma omp target parallel private(i)
+{}
+  static int si;
+#pragma omp target parallel private(si) // OK
+  {}
+  return foomain<int, char, S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char, S4, S5>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_proc_bind_messages.cpp b/test/OpenMP/target_parallel_proc_bind_messages.cpp
new file mode 100644
index 0000000..56292ad
--- /dev/null
+++ b/test/OpenMP/target_parallel_proc_bind_messages.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+// Checks the diagnostics emitted for malformed 'proc_bind' clauses on '#pragma omp target parallel'.
+int main(int argc, char **argv) {
+  #pragma omp target parallel proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  foo();
+  #pragma omp target parallel proc_bind ( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel proc_bind () // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  foo();
+  #pragma omp target parallel proc_bind (master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel proc_bind (close), proc_bind(spread) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'proc_bind' clause}}
+  foo();
+  #pragma omp target parallel proc_bind (x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  foo();
+  // Well-formed uses of each kind ('master', 'close', 'spread'); these lines carry no diagnostic annotations.
+  #pragma omp target parallel proc_bind(master)
+  ++argc;
+
+  #pragma omp target parallel proc_bind(close)
+  foo();
+  #pragma omp target parallel proc_bind(spread)
+  ++argc;
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_reduction_messages.cpp b/test/OpenMP/target_parallel_reduction_messages.cpp
new file mode 100644
index 0000000..c9434e7
--- /dev/null
+++ b/test/OpenMP/target_parallel_reduction_messages.cpp
@@ -0,0 +1,263 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                    // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                   // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];     // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];           // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target parallel reduction // expected-error {{expected '(' after 'reduction'}}
+  foo();
+#pragma omp target parallel reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+#pragma omp target parallel reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  foo();
+#pragma omp target parallel reduction(&& : argc)
+  foo();
+#pragma omp target parallel reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  foo();
+#pragma omp target parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  foo();
+#pragma omp target parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  foo();
+#pragma omp target parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  foo();
+#pragma omp target parallel reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel shared(i)
+  foo();
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel
+#pragma omp for private(fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+#pragma omp target parallel
+#pragma omp for reduction(- : fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                  // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                   // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];        // expected-note {{'r' defined here}}
+  int &q = qa[i];              // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target parallel reduction // expected-error {{expected '(' after 'reduction'}}
+  foo();
+#pragma omp target parallel reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+#pragma omp target parallel reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  foo();
+#pragma omp target parallel reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  foo();
+#pragma omp target parallel reduction(&& : argc)
+  foo();
+#pragma omp target parallel reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+#pragma omp target parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  foo();
+#pragma omp target parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  foo();
+#pragma omp target parallel reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  foo();
+#pragma omp target parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  foo();
+#pragma omp target parallel reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel shared(i)
+  foo();
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel
+#pragma omp for private(fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+#pragma omp target parallel
+#pragma omp for reduction(- : fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+  static int m;
+#pragma omp target parallel reduction(+ : m) // OK
+  m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_shared_messages.cpp b/test/OpenMP/target_parallel_shared_messages.cpp
new file mode 100644
index 0000000..302a092
--- /dev/null
+++ b/test/OpenMP/target_parallel_shared_messages.cpp
@@ -0,0 +1,110 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+// Checks the diagnostics for the 'shared' clause on '#pragma omp target parallel'; pragmas without annotations must be accepted silently.
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  #pragma omp target parallel shared // expected-error {{expected '(' after 'shared'}}
+  foo();
+  #pragma omp target parallel shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel shared () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel shared (argc)
+  foo();
+  #pragma omp target parallel shared (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel shared (a, b, c, d, f)
+  foo();
+  #pragma omp target parallel shared (argv[1]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel shared(ba)
+  foo();
+  #pragma omp target parallel shared(ca)
+  foo();
+  #pragma omp target parallel shared(da)
+  foo();
+  #pragma omp target parallel shared(e, g)
+  foo();
+  #pragma omp target parallel shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  foo();
+  #pragma omp target parallel private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  foo();
+  #pragma omp target parallel firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  foo();
+  #pragma omp target parallel private(i)
+  foo();
+  #pragma omp target parallel shared(i)
+  foo();
+  #pragma omp target parallel shared(j)
+  foo();
+  #pragma omp target parallel firstprivate(i)
+  foo();
+  #pragma omp target parallel shared(i)
+  foo();
+  #pragma omp target parallel shared(j)
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_private_codegen.cpp b/test/OpenMP/target_private_codegen.cpp
new file mode 100644
index 0000000..5c738ee
--- /dev/null
+++ b/test/OpenMP/target_private_codegen.cpp
@@ -0,0 +1,264 @@
+// Only test codegen on target side, as private clause does not require any action on the host side
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+// TCHECK: [[TT:%.+]] = type { i64, i8 }
+// TCHECK: [[S1:%.+]] = type { double }
+
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+
+  #pragma omp target private(a)
+  {
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}()
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT: store {{.+}}, {{.+}} [[A]],
+  // TCHECK:  ret void  
+
+#pragma omp target private(a)
+  {
+    a = 1;
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}()
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+  // TCHECK:  ret void
+  
+  #pragma omp target private(a, aa)
+  {
+    a = 1;
+    aa = 1;
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}()
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[A2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A2]],
+  // TCHECK:  ret void
+
+  #pragma omp target private(a, b, bn, c, cn, d)
+  {
+    a = 1;
+    b[2] = 1.0;
+    bn[3] = 1.0;
+    c[1][2] = 1.0;
+    cn[1][3] = 1.0;
+    d.X = 1;
+    d.Y = 1;
+  }
+  // make sure that private variables are generated in all cases and that we use those instances for operations inside the
+  // target region
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[VLA:%.+]], i{{[0-9]+}} [[VLA1:%.+]], i{{[0-9]+}} [[VLA3:%.+]])
+  // TCHECK:  [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR4:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[B:%.+]] = alloca [10 x float],
+  // TCHECK:  [[SSTACK:%.+]] = alloca i8*,
+  // TCHECK:  [[C:%.+]] = alloca [5 x [10 x double]],
+  // TCHECK:  [[D:%.+]] = alloca [[TT]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA3]], i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  [[VLA_ADDR_REF4:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK:  store i8* [[RET_STACK]], i8** [[SSTACK]],
+  // TCHECK:  [[VLA5:%.+]] = alloca float, i{{[0-9]+}} [[VLA_ADDR_REF]],
+  // TCHECK:  [[VLA6_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF2]], [[VLA_ADDR_REF4]]
+  // TCHECK:  [[VLA6:%.+]] = alloca double, i{{[0-9]+}} [[VLA6_SIZE]],
+
+  // a = 1
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+
+  // b[2] = 1.0
+  // TCHECK:  [[B_GEP:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // TCHECK:  store float 1.0{{.*}}, float* [[B_GEP]],
+  
+  // bn[3] = 1.0
+  // TCHECK:  [[BN_GEP:%.+]] = getelementptr inbounds float, float* [[VLA5]], i{{[0-9]+}} 3
+  // TCHECK:  store float 1.0{{.*}}, float* [[BN_GEP]],
+
+  // c[1][2] = 1.0
+  // TCHECK:  [[C_GEP1:%.+]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // TCHECK:  [[C_GEP2:%.+]] = getelementptr inbounds [10 x double], [10 x double]* [[C_GEP1]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // TCHECK:  store double 1.0{{.*}}, double* [[C_GEP2]],
+
+  // cn[1][3] = 1.0
+  // TCHECK:  [[CN_IND:%.+]] = mul{{.+}} i{{[0-9]+}} 1, [[VLA_ADDR_REF4]]
+  // TCHECK:  [[CN_GEP_IND:%.+]] = getelementptr inbounds double, double* [[VLA6]], i{{[0-9]+}} [[CN_IND]]
+  // TCHECK:  [[CN_GEP_3:%.+]] = getelementptr inbounds double, double* [[CN_GEP_IND]], i{{[0-9]+}} 3
+  // TCHECK:  store double 1.0{{.*}}, double* [[CN_GEP_3]],
+
+  // d.X = 1
+  // [[X_FIELD:%.+]] = getelementptr inbounds [[TT]], [[TT]]* [[D]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // store i{{[0-9]+}} 1, i{{[0-9]+}}* [[X_FIELD]],
+  
+  // d.Y = 1
+  // [[Y_FIELD:%.+]] = getelementptr inbounds [[TT]], [[TT]]* [[D]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // store i{{[0-9]+}} 1, i{{[0-9]+}}* [[Y_FIELD]],
+
+  // finish
+  // [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
+  // call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
+  // ret void
+
+  return a;
+}
+
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+#pragma omp target private(a,aa,b)
+  {
+    a = 1;
+    aa = 1;
+    b[2] = 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  short aa = 0;
+  char aaa = 0;
+  int b[10];
+
+#pragma omp target private(a,aa,aaa,b)
+  {
+    a = 1;
+    aa = 1;
+    aaa = 1;
+    b[2] = 1;
+  }
+
+  return a;
+}
+
+// TCHECK: define void @__omp_offloading_{{.+}}()
+// TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A2:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A3:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A2]],
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A3]],
+// TCHECK:  [[B_GEP:%.+]] = getelementptr inbounds [10 x i{{[0-9]+}}], [10 x i{{[0-9]+}}]* [[B]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[B_GEP]],
+// TCHECK:  ret void
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n];
+
+#pragma omp target private(b,c)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a;
+    }
+
+    return c[1][1] + (int)b;
+  }
+
+  // TCHECK: define void @__omp_offloading_{{.+}}([[S1]]* [[TH:%.+]], i{{[0-9]+}} [[VLA:%.+]], i{{[0-9]+}} [[VLA1:%.+]])
+  // TCHECK: [[TH_ADDR:%.+]] = alloca [[S1]]*,
+  // TCHECK: [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK: [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK: [[B:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK: [[SSTACK:%.+]] = alloca i8*,
+  // TCHECK: store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]],
+  // TCHECK: store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK: store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK: [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]],
+  // TCHECK: [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK: [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK: [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK: store i8* [[RET_STACK]], i8** [[SSTACK]],
+
+  // this->a = (double)b + 1.5;
+  // TCHECK: [[VLA_IND:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]]
+  // TCHECK: [[VLA3:%.+]] = alloca i{{[0-9]+}}, i{{[0-9]+}} [[VLA_IND]],
+  // TCHECK: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B]],
+  // TCHECK: [[B_CONV:%.+]] = sitofp i{{[0-9]+}} [[B_VAL]] to double
+  // TCHECK: [[NEW_A_VAL:%.+]] = fadd double [[B_CONV]], 1.5{{.+}}+00
+  // TCHECK: [[A_FIELD:%.+]] = getelementptr inbounds [[S1]], [[S1]]* [[TH_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // TCHECK: store double [[NEW_A_VAL]], double* [[A_FIELD]],
+
+  // c[1][1] = ++a;
+  // TCHECK: [[A_FIELD4:%.+]] = getelementptr inbounds [[S1]], [[S1]]* [[TH_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // TCHECK: [[A_FIELD4_VAL:%.+]] = load double, double* [[A_FIELD4]],
+  // TCHECK: [[A_FIELD_INC:%.+]] = fadd double [[A_FIELD4_VAL]], 1.0{{.+}}+00
+  // TCHECK: store double [[A_FIELD_INC]], double* [[A_FIELD4]],
+  // TCHECK: [[A_FIELD_INC_CONV:%.+]] = fptosi double [[A_FIELD_INC]] to i{{[0-9]+}}
+  // TCHECK: [[C_IND:%.+]] = mul{{.+}} i{{[0-9]+}} 1, [[VLA_ADDR_REF2]]
+  // TCHECK: [[C_1_REF:%.+]] = getelementptr inbounds i{{[0-9]+}}, i{{[0-9]+}}* [[VLA3]], i{{[0-9]+}} [[C_IND]]
+  // TCHECK: [[C_1_1_REF:%.+]] = getelementptr inbounds i{{[0-9]+}}, i{{[0-9]+}}* [[C_1_REF]], i{{[0-9]+}} 1
+  // TCHECK: store i{{[0-9]+}} [[A_FIELD_INC_CONV]], i{{[0-9]+}}* [[C_1_1_REF]],
+
+  // finish
+  // TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
+  // TCHECK: call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
+  // TCHECK: ret void
+};
+
+
+int bar(int n){
+  int a = 0;
+  a += foo(n);
+  S1 S;
+  a += S.r1(n);
+  a += fstatic(n);
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+// template
+// TCHECK: define void @__omp_offloading_{{.+}}()
+// TCHECK: [[A:%.+]] = alloca i{{[0-9]+}},
+// TCHECK: [[A2:%.+]] = alloca i{{[0-9]+}},
+// TCHECK: [[B:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+// TCHECK: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A2]],
+// TCHECK: [[B_GEP:%.+]] = getelementptr inbounds [10 x i{{[0-9]+}}], [10 x i{{[0-9]+}}]* [[B]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// TCHECK: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[B_GEP]],
+// TCHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_private_messages.cpp b/test/OpenMP/target_private_messages.cpp
new file mode 100644
index 0000000..a093a87
--- /dev/null
+++ b/test/OpenMP/target_private_messages.cpp
@@ -0,0 +1,198 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private() // expected-error {{expected expression}}
+{}
+#pragma omp target private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(argc)
+{}
+#pragma omp target private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(e, g)
+{}
+#pragma omp target private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target private(j)
+{}
+#pragma omp target private(i)
+  {}
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp target private(a)
+  {}
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private() // expected-error {{expected expression}}
+{}
+#pragma omp target private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(argc)
+{}
+#pragma omp target private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+{}
+#pragma omp target private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target private(j)
+{}
+#pragma omp target private(i)
+  {}
+  static int si;
+#pragma omp target private(si) // OK
+  {}
+#pragma omp target map(i) private(i) // expected-error {{private variable cannot be in a map clause in '#pragma omp target' directive}}
+  {}
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_simd_aligned_messages.cpp b/test/OpenMP/target_simd_aligned_messages.cpp
new file mode 100644
index 0000000..547f3b4
--- /dev/null
+++ b/test/OpenMP/target_simd_aligned_messages.cpp
@@ -0,0 +1,203 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B {
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; }
+};
+namespace X {
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; }
+
+int **z;
+const int C1 = 1;
+const int C2 = 2;
+void test_aligned_colons(int *&rp)
+{
+  int *B = 0;
+  #pragma omp target simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  #pragma omp target simd aligned(B::ib:B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  #pragma omp target simd aligned(z:B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}}
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  #pragma omp target simd aligned(X::x : ::z)
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  #pragma omp target simd aligned(B,rp,::z: X::x)
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{expected variable name}}
+  #pragma omp target simd aligned(B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  #pragma omp target simd aligned(B::ib,B:C1+C2)
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+  // expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+  #pragma omp target simd aligned(arr:L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target simd aligned(num:4)
+  for (i = 0; i < num; ++i);
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int *ind2 = 0;
+  // expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+  #pragma omp target simd aligned(ind2:LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+  #pragma omp target simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned(e, g)
+  for (I k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  #pragma omp target simd aligned(h)
+  for (I k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target simd aligned(i)
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp parallel
+  {
+    int *v = 0;
+    I i;
+    #pragma omp target simd aligned(v:16)
+    for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+  #pragma omp target simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+  int v = 0;
+  // expected-note@+2 {{initializer of 'j' is not a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target simd aligned(f:j)
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+  #pragma omp target simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+  #pragma omp target simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target simd aligned (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+2 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}}
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  #pragma omp target simd aligned (a, b) 
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  #pragma omp target simd aligned(h)
+  for (int k = 0; k < argc; ++k) ++k;
+  int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/target_simd_ast_print.cpp b/test/OpenMP/target_simd_ast_print.cpp
new file mode 100644
index 0000000..8f03950
--- /dev/null
+++ b/test/OpenMP/target_simd_ast_print.cpp
@@ -0,0 +1,310 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target simd private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc, T *argv) {
+  T b = argc, c, d, e, f, h;
+  T arr[N][10], arr1[N];
+  T i, j;
+  T s;
+  static T a;
+// CHECK: static T a;
+  static T g;
+  const T clen = 5;
+// CHECK: T clen = 5;
+  T *p;
+#pragma omp threadprivate(g)
+#pragma omp target simd linear(a)
+  // CHECK: #pragma omp target simd linear(a)
+  for (T i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (T i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) if (target :argc) reduction(+ : h)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+            foo();
+  // CHECK-NEXT: #pragma omp target simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) if(target: argc) reduction(+: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: foo();
+
+#pragma omp target simd private(argc,b) firstprivate(argv) if(target:argc > 0) reduction(+:c, arr1[argc]) reduction(max:e, arr[:N][0:10])
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd private(argc,b) firstprivate(argv) if(target: argc > 0) reduction(+: c,arr1[argc]) reduction(max: e,arr[:N][0:10])
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd if(N) reduction(^:e, f, arr[0:N][:argc]) reduction(&& : h)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd if(N) reduction(^: e,f,arr[0:N][:argc]) reduction(&&: h)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd if(target:argc > 0)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd if(target: argc > 0)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd if(N)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd if(N)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(tofrom: i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(arr1[0:10], i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(tofrom: arr1[0:10],i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(to: i) map(from: j)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(to: i) map(from: j)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(always,alloc: i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(always,alloc: i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd nowait
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd nowait
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd depend(in : argc, arr[i:argc], arr1[:])
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd depend(in : argc,arr[i:argc],arr1[:])
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd defaultmap(tofrom: scalar)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd defaultmap(tofrom: scalar)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd safelen(clen-1)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd safelen(clen - 1)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd simdlen(clen-1)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd simdlen(clen - 1)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd aligned(arr1:N-1)
+  for (T i = 0; i < N; ++i) {}
+  // CHECK: #pragma omp target simd aligned(arr1: N - 1)
+  // CHECK-NEXT: for (T i = 0; i < N; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd is_device_ptr(p)
+  for (T i = 0; i < N; ++i) {}
+  // CHECK: #pragma omp target simd is_device_ptr(p)
+  // CHECK-NEXT: for (T i = 0; i < N; ++i) {
+  // CHECK-NEXT: }
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int arr[5][10], arr1[5];
+  int i, j;
+  int s;
+  static int a;
+// CHECK: static int a;
+  const int clen = 5;
+// CHECK: int clen = 5;
+  static float g;
+#pragma omp threadprivate(g)
+  int *p;
+#pragma omp target simd linear(a)
+  // CHECK: #pragma omp target simd linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) if (target: argc) reduction(+ : h) linear(a:-5)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+  // CHECK: #pragma omp target simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) if(target: argc) reduction(+: h) linear(a: -5)
+  // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: foo();
+
+#pragma omp target simd private(argc,b) firstprivate(argv) if (argc > 0) reduction(+:c, arr1[argc]) reduction(max:e, arr[:5][0:10])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd private(argc,b) firstprivate(argv) if(argc > 0) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd if (5) reduction(^:e, f, arr[0:5][:argc]) reduction(&& : h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd if(5) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd if (target:argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd if(target: argc > 0)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd if (5)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd if(5)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(tofrom: i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(arr1[0:10], i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(tofrom: arr1[0:10],i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(to: i) map(from: j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd map(always,alloc: i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd nowait
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd depend(in : argc, arr[i:argc], arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd depend(in : argc,arr[i:argc],arr1[:])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd defaultmap(tofrom: scalar)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd safelen(clen-1)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd safelen(clen - 1)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd simdlen(clen-1)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd simdlen(clen - 1)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd aligned(arr1:4)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd aligned(arr1: 4)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target simd is_device_ptr(p)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target simd is_device_ptr(p)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+  return (tmain<int, 5>(argc, &argc));
+}
+
+#endif
diff --git a/test/OpenMP/target_simd_collapse_messages.cpp b/test/OpenMP/target_simd_collapse_messages.cpp
new file mode 100644
index 0000000..ce42273
--- /dev/null
+++ b/test/OpenMP/target_simd_collapse_messages.cpp
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // not constexpr: calls to it inside collapse(...) below are diagnosed as non-constant
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  int j; // expected-note {{declared here}}
+  #pragma omp target simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+3 2 {{directive '#pragma omp target simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target simd collapse (j=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int j; // expected-note {{declared here}}
+  #pragma omp target simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+  #pragma omp target simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus >= 201103L
+  // expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+2 2 {{directive '#pragma omp target simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target simd collapse (j=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  #pragma omp target simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_simd_defaultmap_messages.cpp b/test/OpenMP/target_simd_defaultmap_messages.cpp
new file mode 100644
index 0000000..4f62213
--- /dev/null
+++ b/test/OpenMP/target_simd_defaultmap_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_simd_depend_messages.cpp b/test/OpenMP/target_simd_depend_messages.cpp
new file mode 100644
index 0000000..3fc46f4
--- /dev/null
+++ b/test/OpenMP/target_simd_depend_messages.cpp
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector { // class type whose only member is a user-defined operator[]
+  public:
+    int operator[](int index) { return 0; } // ignores index and returns 0 by value
+};
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int i;
+
+  #pragma omp target simd depend // expected-error {{expected '(' after 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (out: ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[-1:0]) // OK
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in : argv[ : argc][1 : argc - 1])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd depend(in : arr[0])
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_simd_device_messages.cpp b/test/OpenMP/target_simd_device_messages.cpp
new file mode 100644
index 0000000..1e9afc3
--- /dev/null
+++ b/test/OpenMP/target_simd_device_messages.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target simd device // expected-error {{expected '(' after 'device'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (argc + argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (argc), device (argc+1) // expected-error {{directive '#pragma omp target simd' cannot contain more than one 'device' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (-10u)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_simd_firstprivate_messages.cpp b/test/OpenMP/target_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000..5bfd951
--- /dev/null
+++ b/test/OpenMP/target_simd_firstprivate_messages.cpp
@@ -0,0 +1,261 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // public default and copy constructors, plus two static data members
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s; // named as S2::S2s in a firstprivate clause in main
+  static const float S2sc; // named as S2::S2sc in a firstprivate clause in main
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 { // copy assignment is private; default and copy constructors are public
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // default and copy constructors are private; only S4(int) is public
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 { // copy constructor is private; S5() and S5(int) are public
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 { // default constructor is private; copy constructor and S6(int) are public
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x; // made threadprivate by the pragma on the next line
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // B::x names A::x; main passes B::x to a firstprivate clause
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp target simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  static int si;
+#pragma omp target simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_simd_if_messages.cpp b/test/OpenMP/target_simd_if_messages.cpp
new file mode 100644
index 0000000..e0834e7
--- /dev/null
+++ b/test/OpenMP/target_simd_if_messages.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc) if (simd:argc) // expected-error {{directive name modifier 'simd' is not allowed for '#pragma omp target simd'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target simd' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc) if (argc) // expected-note {{previous clause with directive name modifier specified here}} expected-error {{no more 'if' clause is allowed}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc) if (simd:argc) // expected-error {{directive name modifier 'simd' is not allowed for '#pragma omp target simd'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target simd' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd if(target : argc) if (argc) // expected-note {{previous clause with directive name modifier specified here}} expected-error {{no more 'if' clause is allowed}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_simd_is_device_ptr_messages.cpp b/test/OpenMP/target_simd_is_device_ptr_messages.cpp
new file mode 100644
index 0000000..02e2b32
--- /dev/null
+++ b/test/OpenMP/target_simd_is_device_ptr_messages.cpp
@@ -0,0 +1,288 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fopenmp %s
+
+struct ST {
+  int *a;
+};
+typedef int arr[10];
+typedef ST STarr[10];
+struct SA { // Members of several kinds (scalar, reference, pointer, array, struct) used as is_device_ptr operands in func().
+  const int d = 5;
+  const int da[5] = { 0 };
+  ST e;
+  ST g[10];
+  STarr &rg = g;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  arr &raa = aa;
+  void func(int arg) { // Each pragma below checks one operand form; lines marked OK must produce no diagnostic.
+#pragma omp target simd is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr() // expected-error {{expected expression}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(k) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(z) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(aa) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(raa) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(g) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(rg) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(k,i,j) // expected-error 2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    for (int ii=0; ii<10; ii++)
+      ;
+#pragma omp target simd is_device_ptr(da) // OK
+    for (int ii=0; ii<10; ii++)
+      ;
+    return;
+  }
+};
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef struct {
+  int a;
+} S6;
+
+template <typename T, int I>
+T tmain(T argc) { // Template variant of the is_device_ptr checks; instantiated as tmain<int, 3> at the end of main().
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 h[10];
+  auto &rh = h;
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+  auto &raa = aa;
+#pragma omp target simd is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr() // expected-error {{expected expression}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(k) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(z) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(aa) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(raa) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(g) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(h) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(rh) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(k,i,j) // expected-error 2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(da) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+  return 0;
+}
+
+int main(int argc, char **argv) { // Non-template is_device_ptr checks; finishes by instantiating tmain<int, 3>.
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 h[10];
+  auto &rh = h;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  auto &raa = aa;
+#pragma omp target simd is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr() // expected-error {{expected expression}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(k) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(z) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(aa) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(raa) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(g) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(h) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(rh) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(k,i,j) // expected-error 2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  for (int kk=0; kk<20; kk++)
+    ;
+#pragma omp target simd is_device_ptr(da) // OK
+  for (int kk=0; kk<20; kk++)
+    ;
+  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_simd_lastprivate_messages.cpp b/test/OpenMP/target_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000..afb38d9
--- /dev/null
+++ b/test/OpenMP/target_simd_lastprivate_messages.cpp
@@ -0,0 +1,238 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  S2 &operator=(const S2 &);
+  const S2 &operator=(const S2 &) const;
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(i) // expected-note {{defined as lastprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target simd' directive may not be lastprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel private(xa)
+#pragma omp target simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel reduction(+ : xa)
+#pragma omp target simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 2;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_simd_linear_messages.cpp b/test/OpenMP/target_simd_linear_messages.cpp
new file mode 100644
index 0000000..6319e31
--- /dev/null
+++ b/test/OpenMP/target_simd_linear_messages.cpp
@@ -0,0 +1,269 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X {
+int x;
+};
+
+struct B {
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; }
+};
+
+int bfoo() { return 4; }
+
+int z;
+const int C1 = 1;
+const int C2 = 2;
+void test_linear_colons() { // Checks how ':' (step separator) and '::' (scope) are told apart inside linear(...).
+  int B = 0;
+#pragma omp target simd linear(B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+#pragma omp target simd linear(B::ib : B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{use of undeclared identifier 'ib'; did you mean 'B::ib'?}}
+#pragma omp target simd linear(B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+#pragma omp target simd linear(z : B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target simd linear(B : B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target simd linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target simd linear(B, ::z, X::x)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target simd linear(::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target simd linear(B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target simd linear(B::ib, B : C1 + C2)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+
+template <int L, class T, class N>
+T test_template(T *arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = -num * L; // expected-note {{'ind2' defined here}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type}}
+#pragma omp target simd linear(ind2 : L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T();
+}
+
+template <int LEN>
+int test_warn() {
+  int ind2 = 0;
+// expected-warning@+1 {{zero linear step (ind2 should probably be const)}}
+#pragma omp target simd linear(ind2 : LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc : 5)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target simd linear(a, b : B::ib)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target simd linear(v : i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target simd linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  int v = 0;
+#pragma omp target simd linear(v : j)
+  for (int k = 0; k < argc; ++k) {
+    ++k;
+    v += j;
+  }
+#pragma omp target simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+#pragma omp target simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target simd linear(a, b)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{argument of a linear clause should be of integral or pointer type, not 'S4'}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+#pragma omp target simd linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target simd linear(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+#pragma omp target simd linear(i : 4)
+    for (int k = 0; k < argc; ++k) {
+      ++k;
+      i += 4;
+    }
+  }
+#pragma omp target simd linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/target_simd_loop_messages.cpp b/test/OpenMP/target_simd_loop_messages.cpp
new file mode 100644
index 0000000..67201fb
--- /dev/null
+++ b/test/OpenMP/target_simd_loop_messages.cpp
@@ -0,0 +1,629 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S {
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target simd
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target simd
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target simd
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target simd
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// Ok: redundant parentheses around the loop variable are ignored.
+#pragma omp target simd
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp target simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target simd
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok, but undefined behavior: in general, the compiler cannot check that the
+// increment is really loop-invariant.
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target simd' directive may not be firstprivate, predetermined as linear}}
+#pragma omp target simd firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as private}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target simd' directive may not be private, predetermined as linear}}
+#pragma omp target simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as lastprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target simd' directive may not be lastprivate, predetermined as linear}}
+#pragma omp target simd lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target simd' directive may not be threadprivate or thread local, predetermined as linear}}
+#pragma omp target simd
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp target simd
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+// expected-error@+2 {{statement after '#pragma omp target simd' must be a for loop}}
+#pragma omp target simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target simd
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in OpenMP for-loops.
+namespace std {
+struct random_access_iterator_tag {};
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {}
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter {
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; }
+  GoodIter &operator+=(int x) { return *this; }
+  GoodIter &operator-=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target simd
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp target simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+// Initializer is constructor without params.
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) {
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target simd
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target simd
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() {
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0>
+int dotest_gt(IT begin, IT end) {
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target simd
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp target simd
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try { // expected-error {{'try' statement cannot be used in OpenMP simd region}}
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      }
+      throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp target simd
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() {
+  S s(4);
+#pragma omp target simd lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_simd_map_messages.cpp b/test/OpenMP/target_simd_map_messages.cpp
new file mode 100644
index 0000000..63bbdde
--- /dev/null
+++ b/test/OpenMP/target_simd_map_messages.cpp
@@ -0,0 +1,273 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always;
+  const T (&l)[5] = da;
+
+
+#pragma omp target simd map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom: t[:I])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(T) // expected-error {{'T' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target simd map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target simd map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(l) // OK
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target simd map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always;
+  const int (&l)[5] = da;
+
+#pragma omp target simd map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(argv[1])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target simd map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 1 {{used here}}
+{
+#pragma omp target simd map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(l) //
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target simd map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target simd map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_simd_messages.cpp b/test/OpenMP/target_simd_messages.cpp
new file mode 100644
index 0000000..d1b06e3
--- /dev/null
+++ b/test/OpenMP/target_simd_messages.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target simd // expected-error {{unexpected OpenMP directive '#pragma omp target simd'}}
+
+int main(int argc, char **argv) {
+#pragma omp target simd { // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd ( // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd[ // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd] // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd } // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd
+  for (int i = 0; i < argc; ++i)
+    foo();
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+#pragma omp target simd unknown()
+  for (int i = 0; i < argc; ++i)
+    foo();
+L1:
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target simd
+  for (int i = 0; i < argc; ++i) {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch (argc) {
+    case (0):
+#pragma omp target simd
+      for (int i = 0; i < argc; ++i) {
+        foo();
+        break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+        continue;
+      }
+    default:
+      break;
+    }
+  }
+#pragma omp target simd default(none) // expected-error {{unexpected OpenMP clause 'default' in directive '#pragma omp target simd'}}
+  for (int i = 0; i < 10; ++i)
+    ++argc;
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+#pragma omp target simd
+  for (int i = 0; i < argc; ++i)
+  L2:
+  foo();
+#pragma omp target simd
+  for (int i = 0; i < argc; ++i) {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+#pragma omp target simd
+      for (int n = 0; n < 100; ++n) {
+  }
+
+#pragma omp target simd copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target simd'}}
+  for (int n = 0; n < 100; ++n) {}
+
+  return 0;
+}
+
+void test_ordered() {
+#pragma omp target simd ordered // expected-error {{unexpected OpenMP clause 'ordered' in directive '#pragma omp target simd'}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/target_simd_misc_messages.c b/test/OpenMP/target_simd_misc_messages.c
new file mode 100644
index 0000000..debe387
--- /dev/null
+++ b/test/OpenMP/target_simd_misc_messages.c
@@ -0,0 +1,485 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target simd'}}
+#pragma omp target simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target simd'}}
+#pragma omp target simd foo
+
+void test_no_clause() {
+  int i;
+#pragma omp target simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp target simd' must be a for loop}}
+#pragma omp target simd
+  ++i;
+}
+
+void test_branch_protected_scope() {
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() {
+  int i;
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+#pragma omp target simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() {
+  int i, x;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+#pragma omp target simd;
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+#pragma omp target simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+#pragma omp target simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+
+void test_collapse() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+#pragma omp target simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target simd', but found only 1}}
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_private() {
+  int i;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target simd private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() {
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target simd lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() {
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target simd firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target simd firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target simd firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target simd lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() {
+  float a[100], b[100], c[100];
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
+void test_safelen() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() {
+  int i;
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp target simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp target simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_simd_nowait_messages.cpp b/test/OpenMP/target_simd_nowait_messages.cpp
new file mode 100644
index 0000000..3635ce2
--- /dev/null
+++ b/test/OpenMP/target_simd_nowait_messages.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target simd nowait( // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd nowait device (-10u)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target simd nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_simd_private_messages.cpp b/test/OpenMP/target_simd_private_messages.cpp
new file mode 100644
index 0000000..b20ff0a
--- /dev/null
+++ b/test/OpenMP/target_simd_private_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target simd private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_simd_reduction_messages.cpp b/test/OpenMP/target_simd_reduction_messages.cpp
new file mode 100644
index 0000000..61c7dde
--- /dev/null
+++ b/test/OpenMP/target_simd_reduction_messages.cpp
@@ -0,0 +1,313 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Intentionally empty; serves as the loop body in the directive tests below.
+}
+
+bool foobool(int argc) { // Returns its int argument converted to bool (nonzero -> true).
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a; // S1 is never completed in the visible code, so 'a' has incomplete type.
+class S2 { // Type whose operator+ is private (members before 'public:' default to private).
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // Copy constructor takes a non-const reference.
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 { // Type with public constructors and a public member operator+.
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; } // Returns a copy of its right-hand operand.
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; } // Free operator+ for S3 that returns an int constant.
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 { // Default ctor, copy ctor and operator+ are private; only S4(int) is public.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4); // Declared only; no definition is visible here.
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; } // Free compound-assignment overload for S4; returns its first argument.
+class S5 { // Private default/copy constructors and operator+; only S5(int) is public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg); // Declared only; no definition is visible here.
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; } // Converts to int, but there is no assignment from int back to S6.
+} o; // Global instance named in the reduction(+ : o) clauses below.
+
+S3 h, k; // 'h' is marked threadprivate on the next line; 'k' is left as an ordinary global.
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) { // Dependent-type variant of the reduction checks; main() instantiates it with T = int and T = float.
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(&& : argc) // OK: accepted for both instantiations.
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target simd reduction(+ : fl) // OK: reduction on a variable made private by the enclosing parallel region.
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target simd reduction(+ : fl) // OK: reduction on a variable that is a reduction item of the enclosing parallel region.
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x; // Marked threadprivate by the pragma on the next line.
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x; // Makes A::x nameable as B::x, which main() uses in a reduction clause.
+}
+
+int main(int argc, char **argv) { // Non-template variant of the reduction checks; also instantiates tmain with int and float at the end.
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(&& : argc) // OK: accepted without diagnostics.
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target simd reduction(+ : fl) // OK: reduction on a variable made private by the enclosing parallel region.
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target simd reduction(+ : fl) // OK: reduction on a variable that is a reduction item of the enclosing parallel region.
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target simd reduction(+ : m) // OK: a static local variable is accepted as a reduction list item.
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_simd_safelen_messages.cpp b/test/OpenMP/target_simd_safelen_messages.cpp
new file mode 100644
index 0000000..79bee76
--- /dev/null
+++ b/test/OpenMP/target_simd_safelen_messages.cpp
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Intentionally empty; main() uses a call to it as a statement that is not a for loop.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr, so calls to it are rejected inside the clause's constant expression.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  #pragma omp target simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // This loop is repeated under each directive; only the safelen clause under test changes.
+  #pragma omp target simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target simd safelen (argc // Closing ')' is deliberately missing.
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  #pragma omp target simd safelen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd safelen ((ST > 0) ? 1 + ST : 2) // OK: yields a positive constant for both instantiations (ST = 4 gives 5, ST = -2 gives 2).
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+6 2 {{directive '#pragma omp target simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target simd safelen (foobool(argc)), safelen (true), safelen (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+  #pragma omp target simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd safelen (4) // OK: positive integer constant.
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template safelen checks; also instantiates tmain with (N, ST) = (-1, -2) and (12, 4).
+  #pragma omp target simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target simd safelen (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+2 2 {{directive '#pragma omp target simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  #pragma omp target simd safelen (foobool(argc)), safelen (true), safelen (-5) // Three safelen clauses on one directive.
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+  #pragma omp target simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // Deliberately not a for loop, to provoke the "must be a for loop" diagnostic.
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_simd_simdlen_messages.cpp b/test/OpenMP/target_simd_simdlen_messages.cpp
new file mode 100644
index 0000000..8a9a3dc
--- /dev/null
+++ b/test/OpenMP/target_simd_simdlen_messages.cpp
@@ -0,0 +1,142 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Intentionally empty; main() uses a call to it as a statement that is not a for loop.
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) { // Not constexpr, so calls to it are rejected inside the clause's constant expression.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // This loop is repeated under each directive; only the clauses under test change.
+
+#pragma omp target simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target simd simdlen (argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target simd simdlen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen (1)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen ((ST > 0) ? 1 + ST : 2) // OK: yields a positive constant for both instantiations (ST = 4 gives 5, ST = -2 gives 2).
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  // expected-error@+6 2 {{directive '#pragma omp target simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+ #pragma omp target simd simdlen (4) // OK: positive integer constant.
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  #pragma omp target simd simdlen (N) // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen (2), safelen (4) // OK: simdlen is less than safelen.
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen (4), safelen (4) // OK: simdlen equals safelen.
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target simd simdlen (8), safelen (4) // expected-error{{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) { // Non-template simdlen checks; also instantiates tmain with (N, ST) = (-1, -2) and (12, 4).
+#pragma omp target simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target simd simdlen (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+2 2 {{directive '#pragma omp target simd' cannot contain more than one 'simdlen' clause}}
+// expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // Three simdlen clauses on one directive.
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+// expected-error@+3 {{statement after '#pragma omp target simd' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target simd simdlen(simdlen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // Deliberately not a for loop, to provoke the "must be a for loop" diagnostic.
+
+#pragma omp target simd simdlen (2), safelen (4) // OK: simdlen is less than safelen.
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen (4), safelen (4) // OK: simdlen equals safelen.
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target simd simdlen (8), safelen (4) // expected-error{{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_update_ast_print.cpp b/test/OpenMP/target_update_ast_print.cpp
new file mode 100644
index 0000000..3a98f54
--- /dev/null
+++ b/test/OpenMP/target_update_ast_print.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {} // empty non-template overload; the two-argument template that follows shares the name
+
+template <class T, class U>
+T foo(T targ, U uarg) { // uarg is never read in the body; U only fixes the type of 'b'
+  static T a; // list item of the 'to' update below
+  U b; // list item of the 'from' update below
+  int l; // feeds the if, device and depend clauses of both directives
+#pragma omp target update to(a) if(l>5) device(l) nowait depend(inout:l)
+
+#pragma omp target update from(b) if(l<5) device(l-1) nowait depend(inout:l)
+  return a + targ + (T)b;
+}
+// CHECK:      static int a;
+// CHECK-NEXT: float b;
+// CHECK-NEXT: int l;
+// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l)
+// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l)
+// CHECK:      static char a;
+// CHECK-NEXT: float b;
+// CHECK-NEXT: int l;
+// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l)
+// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l)
+// CHECK:      static T a;
+// CHECK-NEXT: U b;
+// CHECK-NEXT: int l;
+// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l)
+// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l)
+
+int main(int argc, char **argv) {
+  static int a; // list item of the 'to' update below
+  int n; // device number and dependence item
+  float f; // if-clause operand and list item of the 'from' update
+
+// CHECK:      static int a;
+// CHECK-NEXT: int n;
+// CHECK-NEXT: float f;
+#pragma omp target update to(a) if(f>0.0) device(n) nowait depend(in:n)
+// CHECK-NEXT: #pragma omp target update to(a) if(f > 0.) device(n) nowait depend(in : n)
+#pragma omp target update from(f) if(f<0.0) device(n+1) nowait depend(in:n)
+// CHECK-NEXT: #pragma omp target update from(f) if(f < 0.) device(n + 1) nowait depend(in : n)
+  return foo(argc, f) + foo(argv[0][0], f) + a; // instantiates foo<int, float> and foo<char, float>
+}
+
+#endif
diff --git a/test/OpenMP/target_update_codegen.cpp b/test/OpenMP/target_update_codegen.cpp
new file mode 100644
index 0000000..f74ed49
--- /dev/null
+++ b/test/OpenMP/target_update_codegen.cpp
@@ -0,0 +1,245 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST { // used as ST<int> for the global 'gb' declared right after this struct
+  T a;
+  double *b; // pointer member; Region 04 in foo() updates a section reached through it
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 33, i32 17]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00: the if-clause 1+3-5 is a nonzero constant; the patterns match the runtime call directly, with the device id loaded from a variable.
+  // CK1-DAG: call void @__tgt_target_data_update(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target update if(1+3-5) device(arg) from(gc)
+  {++arg;}
+
+  // Region 01: the if-clause 1+3-4 is constant zero; only the increment of 'arg' is matched, no runtime call.
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target update to(la) if(1+3-4)
+  {++arg;}
+
+  // Region 02: run-time if(arg); the call is matched inside the then-branch, with the constant device id 4.
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_update(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target update to(arg) if(arg) device(4)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 03: no if/device clause, so device id -1 is matched; 'lb' is a VLA, so its size is computed at run time (count * 4) and stored in a size array.
+  // CK1-DAG: call void @__tgt_target_data_update(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target update from(lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04: array section through the pointer member gb.b; two entries are matched (the member pointer, then the 3-element section of 24 bytes).
+  // CK1-DAG: call void @__tgt_target_data_update(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target update to(gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00: updates 3 elements of member pointer 'b' starting at index 1, guarded by a run-time if(a>123); device id comes from 'arg'.
+    #pragma omp target update from(b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 34, i32 18]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg); // instantiates ST<int>::foo, whose IR is matched by the Region 00 patterns that follow
+}
+
+// Region 00
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_update(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) { // every compile command of this section enables OpenMP but passes no -fopenmp-targets
+  // CK3-NOT: tgt_target_data_update
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3: ret
+  #pragma omp target update to(arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+#ifdef CK4
+
+// CK4-LABEL: device_side_scan
+void device_side_scan(int arg) { // compiled both for the host (CK4 prefix) and with -fopenmp-is-device (TCK4 prefix)
+  // CK4: tgt_target_data_update
+  // CK4: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK4: ret
+  // TCK4-NOT: tgt_target_data_update
+  #pragma omp target update from(arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_update_depend_messages.cpp b/test/OpenMP/target_update_depend_messages.cpp
new file mode 100644
index 0000000..64383a0
--- /dev/null
+++ b/test/OpenMP/target_update_depend_messages.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() { // empty helper; nothing in the visible part of this test calls it
+}
+
+bool foobool(int argc) { // true for any nonzero argument
+  return argc != 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+
+class vector { // user-defined type with an overloaded subscript; the depend checks on vec[1] and vec[1:2] rely on it not being an array or pointer
+  public:
+    int operator[](int index) { return 0; } // returns a prvalue; 'index' is ignored
+};
+template <class T, class S, class R>
+int tmain(T argc, S **argv, R *env[]) { // instantiated from main as tmain<int, char, char>
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a; // vector-extension value; subscripting it is rejected in the depend clauses below
+  char *arr;
+  int i, z; // 'z' is the list item of every 'to' clause below
+
+  #pragma omp depend target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp depend(out:argc) target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp target depend(in:argc) update to(z) // expected-error{{unexpected OpenMP clause 'update' in directive '#pragma omp target'}} expected-error{{unexpected OpenMP clause 'to' in directive '#pragma omp target'}}
+  {} // statement associated with the '#pragma omp target' directive on the previous line
+
+  #pragma omp target update to(z) depend // expected-error {{expected '(' after 'depend'}}
+  #pragma omp target update to(z) depend( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend() // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  #pragma omp target update to(z) depend(source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(z) depend(out: ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(out :S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target update to(z) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : argv[0])
+  #pragma omp target update to(z) depend(in : ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(in : tmain) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  #pragma omp target update to(z) depend(in : argv[-1:0])
+  #pragma omp target update to(z) depend(in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  #pragma omp target update to(z) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  #pragma omp target update to(z) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  #pragma omp target update to(z) depend(in : argv[ : argc][1 : argc - 1])
+  #pragma omp target update to(z) depend(in : arr[0])
+
+  return 0;
+}
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a; // vector-extension value; subscripting it is rejected in the depend clauses below
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int z; // list item of every 'to' clause below
+
+  #pragma omp depend target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp depend(out:argc) target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp target depend(in:argc) update to(z) // expected-error{{unexpected OpenMP clause 'update' in directive '#pragma omp target'}} expected-error{{unexpected OpenMP clause 'to' in directive '#pragma omp target'}}
+  {} // statement associated with the '#pragma omp target' directive on the previous line
+
+  #pragma omp target update to(z) depend // expected-error {{expected '(' after 'depend'}}
+  #pragma omp target update to(z) depend( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend() // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  #pragma omp target update to(z) depend(source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(z) depend(out: ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(out :S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target update to(z) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : argv[0])
+  #pragma omp target update to(z) depend(in : ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(in : main) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  #pragma omp target update to(z) depend(in : argv[-1:0])
+  #pragma omp target update to(z) depend(in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  #pragma omp target update to(z) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  #pragma omp target update to(z) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  #pragma omp target update to(z) depend(in : argv[ : argc][1 : argc - 1])
+  #pragma omp target update to(z) depend(in : arr[0])
+
+  return tmain(argc, argv, env); // expected-note {{in instantiation of function template specialization 'tmain<int, char, char>' requested here}}
+}
diff --git a/test/OpenMP/target_update_device_messages.cpp b/test/OpenMP/target_update_device_messages.cpp
new file mode 100644
index 0000000..3711275
--- /dev/null
+++ b/test/OpenMP/target_update_device_messages.cpp
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() { // empty helper; nothing in this test file calls it
+}
+
+bool foobool(int argc) { // reports whether the argument is nonzero
+  return static_cast<bool>(argc);
+}
+
+struct S1; // expected-note 2 {{declared here}}
+
+template <class T, class S>
+int tmain(T argc, S **argv) { // instantiated from main as tmain<int, char>
+  int i; // only serves as the to/from list item of the directives below
+#pragma omp target update to(i) device // expected-error {{expected '(' after 'device'}}
+#pragma omp target update to(i) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(i) device () // expected-error {{expected expression}}
+#pragma omp target update to(i) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(i) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target update from(i) device (argc + argc)
+#pragma omp target update from(i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'device' clause}}
+#pragma omp target update from(i) device (S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target update from(i) device (3.14) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target update from(i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+}
+
+int main(int argc, char **argv) {
+  int j; // only serves as the to/from list item of the directives below
+#pragma omp target update to(j) device // expected-error {{expected '(' after 'device'}}
+#pragma omp target update from(j) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(j) device () // expected-error {{expected expression}}
+#pragma omp target update from(j) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(j) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(j) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target update to(j) device (argc + argc)
+#pragma omp target update from(j) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'device' clause}}
+#pragma omp target update to(j) device (S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target update from(j) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+#pragma omp target update to(j) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+
+  return tmain(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
+}
diff --git a/test/OpenMP/target_update_from_messages.cpp b/test/OpenMP/target_update_from_messages.cpp
new file mode 100644
index 0000000..6aff083
--- /dev/null
+++ b/test/OpenMP/target_update_from_messages.cpp
@@ -0,0 +1,176 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+struct S6 {
+  int ii;
+  int aa[30];
+  float xx;
+  double *pp;
+};
+struct S7 {
+  int i;
+  int a[50];
+  float x;
+  S6 s6[5];
+  double *p;
+  unsigned bfa : 4;
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int to;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T from;
+  const T (&l)[5] = da;
+  T *m;
+  S7 s7;
+
+#pragma omp target update from // expected-error {{expected '(' after 'from'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(x)
+#pragma omp target update from(t[:I])
+#pragma omp target update from(T) // expected-error {{'T' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(S2::S2s)
+#pragma omp target update from(S2::S2sc)
+#pragma omp target update from(from)
+#pragma omp target update from(y x) // expected-error {{expected ',' or ')' in 'from' clause}}
+#pragma omp target update from(argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(S1) // expected-error {{'S1' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update from(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(h) // expected-error {{threadprivate variables are not allowed in 'from' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(k), to(k) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update from(t), from(t[:5]) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update from(da)
+#pragma omp target update from(da[:4])
+
+#pragma omp target update from(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, (m+1)[2]) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(s7.i, s7.a[:3])
+#pragma omp target update from(s7.s6[1].aa[0:5])
+#pragma omp target update from(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(s7.p[:10])
+#pragma omp target update from(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'from' clause}}
+#pragma omp target update from(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update from(s7.x)
+  }
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i, t[20];
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int from;
+  const int (&l)[5] = da;
+  int *m;
+  S7 s7;
+
+#pragma omp target update from // expected-error {{expected '(' after 'from'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(x)
+#pragma omp target update from(t[:i])
+#pragma omp target update from(S2::S2s)
+#pragma omp target update from(S2::S2sc)
+#pragma omp target update from(from)
+#pragma omp target update from(y x) // expected-error {{expected ',' or ')' in 'from' clause}}
+#pragma omp target update from(argc > 0 ? x : y) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(S1) // expected-error {{'S1' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update from(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(h) // expected-error {{threadprivate variables are not allowed in 'from' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(k), to(k) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update from(t), from(t[:5]) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update from(da)
+#pragma omp target update from(da[:4])
+
+#pragma omp target update from(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, (m+1)[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(s7.i, s7.a[:3])
+#pragma omp target update from(s7.s6[1].aa[0:5])
+#pragma omp target update from(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(s7.p[:10])
+#pragma omp target update from(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'from' clause}}
+#pragma omp target update from(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update from(s7.x)
+  }
+
+  return tmain<int, 3>(argc)+tmain<to, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_update_if_messages.cpp b/test/OpenMP/target_update_if_messages.cpp
new file mode 100644
index 0000000..97715e0
--- /dev/null
+++ b/test/OpenMP/target_update_if_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int n;
+#pragma omp target update to(n) if // expected-error {{expected '(' after 'if'}}
+#pragma omp target update from(n) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if () // expected-error {{expected expression}}
+#pragma omp target update from(n) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(n) if (argc > 0 ? argv[1] : argv[2])
+#pragma omp target update to(n) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause}}
+#pragma omp target update from(n) if (S) // expected-error {{'S' does not refer to a value}}
+#pragma omp target update to(n) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(n) if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if(argc)
+#pragma omp target update from(n) if(target update // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if(target update : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(n) if(target update : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if(target update : argc)
+#pragma omp target update from(n) if(target update : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target update'}}
+#pragma omp target update to(n) if(target update : argc) if (target update:argc) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause with 'target update' name modifier}}
+#pragma omp target update from(n) if(target update : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int m;
+#pragma omp target update to(m) if // expected-error {{expected '(' after 'if'}}
+#pragma omp target update from(m) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if () // expected-error {{expected expression}}
+#pragma omp target update from(m) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(m) if (argc > 0 ? argv[1] : argv[2])
+#pragma omp target update to(m) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause}}
+#pragma omp target update from(m) if (S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target update to(m) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if(target update // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if(target update : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if(target update : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if(target update : argc)
+#pragma omp target update to(m) if(target update : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target update'}}
+#pragma omp target update from(m) if(target update : argc) if (target update:argc)  // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause with 'target update' name modifier}}
+#pragma omp target update to(m) if(target update : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_update_messages.cpp b/test/OpenMP/target_update_messages.cpp
new file mode 100644
index 0000000..73f1eec
--- /dev/null
+++ b/test/OpenMP/target_update_messages.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // Aexpected-note {{declared here}}
+
+template <class T, class S> // Aexpected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int n;
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int m;
+  #pragma omp target update // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  #pragma omp target update to(m) { // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) ( // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) [ // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) ] // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) ) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+
+  #pragma omp target update from(m) // OK
+  {
+    foo();
+  }
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_update_nowait_messages.cpp b/test/OpenMP/target_update_nowait_messages.cpp
new file mode 100644
index 0000000..19bc58e
--- /dev/null
+++ b/test/OpenMP/target_update_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+int main(int argc, char **argv) {
+  int i;
+
+  #pragma omp nowait target update to(i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target nowait update to(i) // expected-error {{unexpected OpenMP clause 'update' in directive '#pragma omp target'}} expected-error {{unexpected OpenMP clause 'to' in directive '#pragma omp target'}}
+  {}
+  #pragma omp target update nowait() to(i) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  #pragma omp target update to(i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(i) nowait device (-10u)
+  #pragma omp target update to(i) nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(i) nowait nowait // expected-error {{directive '#pragma omp target update' cannot contain more than one 'nowait' clause}}
+  #pragma omp target update nowait to(i) nowait // expected-error {{directive '#pragma omp target update' cannot contain more than one 'nowait' clause}}
+  return 0;
+}
diff --git a/test/OpenMP/target_update_to_messages.cpp b/test/OpenMP/target_update_to_messages.cpp
new file mode 100644
index 0000000..641d0bd
--- /dev/null
+++ b/test/OpenMP/target_update_to_messages.cpp
@@ -0,0 +1,175 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+struct S6 {
+  int ii;
+  int aa[30];
+  float xx;
+  double *pp;
+};
+struct S7 {
+  int i;
+  int a[50];
+  float x;
+  S6 s6[5];
+  double *p;
+  unsigned bfa : 4;
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T *m;
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to;
+  const T (&l)[5] = da;
+  S7 s7;
+
+#pragma omp target update to // expected-error {{expected '(' after 'to'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(x)
+#pragma omp target update to(t[:I])
+#pragma omp target update to(T) // expected-error {{'T' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(S2::S2s)
+#pragma omp target update to(S2::S2sc)
+#pragma omp target update to(to)
+#pragma omp target update to(y x) // expected-error {{expected ',' or ')' in 'to' clause}}
+#pragma omp target update to(argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(S1) // expected-error {{'S1' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update to(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(h) // expected-error {{threadprivate variables are not allowed in 'to' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(k), from(k) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update to(t), to(t[:5]) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update to(da)
+#pragma omp target update to(da[:4])
+
+#pragma omp target update to(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, (m+1)[2]) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(s7.i, s7.a[:3])
+#pragma omp target update to(s7.s6[1].aa[0:5])
+#pragma omp target update to(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(s7.p[:10])
+#pragma omp target update to(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'to' clause}}
+#pragma omp target update to(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update to(s7.x)
+  }
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i, t[20];
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to;
+  const int (&l)[5] = da;
+  S7 s7;
+  int *m;
+
+#pragma omp target update to // expected-error {{expected '(' after 'to'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(x)
+#pragma omp target update to(t[:i])
+#pragma omp target update to(S2::S2s)
+#pragma omp target update to(S2::S2sc)
+#pragma omp target update to(to)
+#pragma omp target update to(y x) // expected-error {{expected ',' or ')' in 'to' clause}}
+#pragma omp target update to(argc > 0 ? x : y) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(S1) // expected-error {{'S1' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update to(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(h) // expected-error {{threadprivate variables are not allowed in 'to' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(k), from(k) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update to(t), to(t[:5]) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update to(da)
+#pragma omp target update to(da[:4])
+
+#pragma omp target update to(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, (m+1)[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(s7.i, s7.a[:3])
+#pragma omp target update to(s7.s6[1].aa[0:5])
+#pragma omp target update to(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(s7.p[:10])
+#pragma omp target update to(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'to' clause}}
+#pragma omp target update to(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update to(s7.x)
+  }
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/task_ast_print.cpp b/test/OpenMP/task_ast_print.cpp
index 723139b..37e5833 100644
--- a/test/OpenMP/task_ast_print.cpp
+++ b/test/OpenMP/task_ast_print.cpp
@@ -8,6 +8,57 @@
 
 void foo() {}
 
+struct S1 {
+  S1(): a(0) {}
+  S1(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp task private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp task private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp task private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp task private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp task private(this->a) private(this->a)
+
+class S8 : public S7<S1> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){
+#pragma omp task private(a) private(this->a) private(S7<S1>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp task private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp task private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp task private(this->a) private(this->a)
+
 template <class T>
 struct S {
   operator T() { return T(); }
@@ -98,4 +149,7 @@
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
+extern template int S<int>::TS;
+extern template long S<long>::TS;
+
 #endif
diff --git a/test/OpenMP/task_codegen.cpp b/test/OpenMP/task_codegen.cpp
index 23dc014..08c9ce3 100644
--- a/test/OpenMP/task_codegen.cpp
+++ b/test/OpenMP/task_codegen.cpp
@@ -9,7 +9,7 @@
 // CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [2 x [[STRUCT_S:%.+]]]* }
 // CHECK-DAG: [[STRUCT_SHAREDS1:%.+]] = type { [2 x [[STRUCT_S:%.+]]]* }
-// CHECK-DAG: [[KMP_TASK_T:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[KMP_TASK_T:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}} }
 // CHECK-DAG: [[KMP_DEPEND_INFO:%.+]] = type { i64, i64, i8 }
 struct S {
   int a;
@@ -30,15 +30,16 @@
 // CHECK: store i8* [[B]], i8** [[B_REF]]
 // CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS]], [[STRUCT_SHAREDS]]* [[CAPTURES]], i32 0, i32 1
 // CHECK: store [2 x [[STRUCT_S]]]* [[S]], [2 x [[STRUCT_S]]]** [[S_REF]]
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 32, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 33, i64 40, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
 // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS]]* [[CAPTURES]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 16, i32 8, i1 false)
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[PRIORITY_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 4
+// CHECK: [[PRIORITY:%.+]] = bitcast %union{{.+}}* [[PRIORITY_REF_PTR]] to i32*
+// CHECK: store i32 {{.+}}, i32* [[PRIORITY]]
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
-#pragma omp task shared(a, b, s)
+#pragma omp task shared(a, b, s) priority(b)
   {
     a = 15;
     b = a;
@@ -46,13 +47,11 @@
   }
 // CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS1]], [[STRUCT_SHAREDS1]]* [[CAPTURES:%.+]], i32 0, i32 0
 // CHECK: store [2 x [[STRUCT_S]]]* [[S]], [2 x [[STRUCT_S]]]** [[S_REF]]
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 8,
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 8,
 // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
 // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS1]]* [[CAPTURES]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 8, i32 8, i1 false)
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
 // CHECK: [[DEP:%.*]] = getelementptr inbounds [4 x [[KMP_DEPEND_INFO]]], [4 x [[KMP_DEPEND_INFO]]]* [[DEPENDENCIES:%.*]], i64 0, i64 0
 // CHECK: [[T0:%.*]] = getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* [[DEP]], i32 0, i32 0
 // CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64* [[T0]]
@@ -100,17 +99,13 @@
     a = 15;
     s[1].a = 10;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 #pragma omp task untied
   {
     a = 1;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 32, i64 1,
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1,
 // CHECK: getelementptr inbounds [2 x [[STRUCT_S]]], [2 x [[STRUCT_S]]]* [[S]], i64 0, i64 0
 // CHECK: getelementptr inbounds [2 x [[KMP_DEPEND_INFO]]], [2 x [[KMP_DEPEND_INFO]]]* %{{[^,]+}}, i64 0, i64 0
 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0
@@ -120,15 +115,15 @@
 // CHECK: store i64 4, i64*
 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 2
 // CHECK: store i8 3, i8*
+// CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
+// CHECK: [[IDX2:%.+]] = sext i8 [[B_VAL]] to i64
 // CHECK: [[IDX1:%.+]] = mul nsw i64 4, [[A_VAL]]
 // CHECK: [[START:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
+// CHECK: [[START1:%.+]] = getelementptr inbounds i32, i32* [[START]], i64 [[IDX2]]
 // CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
 // CHECK: [[IDX2:%.+]] = sext i8 [[B_VAL]] to i64
-// CHECK: [[START1:%.+]] = getelementptr inbounds i32, i32* [[START]], i64 [[IDX2]]
 // CHECK: [[IDX1:%.+]] = mul nsw i64 9, [[A_VAL]]
 // CHECK: [[END:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
-// CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
-// CHECK: [[IDX2:%.+]] = sext i8 [[B_VAL]] to i64
 // CHECK: [[END1:%.+]] = getelementptr inbounds i32, i32* [[END]], i64 [[IDX2]]
 // CHECK: [[END2:%.+]] = getelementptr i32, i32* [[END1]], i32 1
 // CHECK: [[START_INT:%.+]] = ptrtoint i32* [[START1]] to i64
@@ -149,9 +144,7 @@
   {
     a = 1;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 32, i64 1,
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 40, i64 1,
 // CHECK: getelementptr inbounds [3 x [[KMP_DEPEND_INFO]]], [3 x [[KMP_DEPEND_INFO]]]* %{{[^,]+}}, i64 0, i64 0
 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0
 // CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64*
@@ -173,12 +166,12 @@
 // CHECK: [[START1:%.+]] = getelementptr inbounds i32, i32* [[START]], i64 3
 // CHECK: [[NEW_A_VAL:%.+]] = load i32, i32* @{{.+}},
 // CHECK: [[NEW_A_VAL_I64:%.+]] = sext i32 [[NEW_A_VAL]] to i64
+// CHECK: [[IDX2:%.+]] = sub nsw i64 [[NEW_A_VAL_I64]], 1
+// CHECK: [[NEW_A_VAL:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[NEW_A_VAL_I64:%.+]] = sext i32 [[NEW_A_VAL]] to i64
 // CHECK: [[SUB:%.+]] = add nsw i64 -1, [[NEW_A_VAL_I64]]
 // CHECK: [[IDX1:%.+]] = mul nsw i64 [[SUB]], [[A_VAL]]
 // CHECK: [[END:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
-// CHECK: [[NEW_A_VAL:%.+]] = load i32, i32* @{{.+}},
-// CHECK: [[NEW_A_VAL_I64:%.+]] = sext i32 [[NEW_A_VAL]] to i64
-// CHECK: [[IDX2:%.+]] = sub nsw i64 [[NEW_A_VAL_I64]], 1
 // CHECK: [[END1:%.+]] = getelementptr inbounds i32, i32* [[END]], i64 [[IDX2]]
 // CHECK: [[END2:%.+]] = getelementptr i32, i32* [[END1]], i32 1
 // CHECK: [[START_INT:%.+]] = ptrtoint i32* [[START1]] to i64
@@ -199,17 +192,13 @@
   {
     a = 2;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 #pragma omp task final(true)
   {
     a = 2;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.*}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
   const bool flag = false;
 #pragma omp task final(flag)
@@ -220,9 +209,7 @@
 // CHECK: [[CMP:%.+]] = icmp ne i8 [[B_VAL]], 0
 // CHECK: [[FINAL:%.+]] = select i1 [[CMP]], i32 2, i32 0
 // CHECK: [[FLAGS:%.+]] = or i32 [[FINAL]], 1
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 [[FLAGS]], i64 32, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 [[FLAGS]], i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
   int c __attribute__((aligned(128)));
 #pragma omp task final(b) shared(c)
@@ -230,6 +217,17 @@
     a = 4;
     c = 5;
   }
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task untied
+  {
+    S s1;
+#pragma omp task
+    a = 4;
+#pragma omp taskyield
+    s1 = S();
+#pragma omp taskwait
+  }
   return a;
 }
 // CHECK: define internal i32 [[TASK_ENTRY1]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
@@ -240,16 +238,41 @@
 // CHECK: store i32 10, i32* %{{.+}}
 
 // CHECK: define internal i32 [[TASK_ENTRY2]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 1, i32* [[A_PTR:@.+]]
+// CHECK: store i32 1, i32* [[A_PTR]]
 
 // CHECK: define internal i32 [[TASK_ENTRY3]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 2, i32* [[A_PTR:@.+]]
+// CHECK: store i32 2, i32* [[A_PTR]]
 
 // CHECK: define internal i32 [[TASK_ENTRY4]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 3, i32* [[A_PTR:@.+]]
+// CHECK: store i32 3, i32* [[A_PTR]]
 
 // CHECK: define internal i32 [[TASK_ENTRY5]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 4, i32* [[A_PTR:@.+]]
+// CHECK: store i32 4, i32* [[A_PTR]]
 // CHECK: store i32 5, i32* [[C_PTR:%.+]], align 128
+
+// CHECK: define internal i32
+// CHECK: store i32 4, i32* [[A_PTR]]
+
+// CHECK: define internal i32 [[TASK_ENTRY6]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
+// CHECK: switch i32 %{{.+}}, label
+// CHECK: load i32*, i32** %
+// CHECK: store i32 1, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
+
+// CHECK: call i8* @__kmpc_omp_task_alloc(
+// CHECK: call i32 @__kmpc_omp_task(%
+// CHECK: load i32*, i32** %
+// CHECK: store i32 2, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
+
+// CHECK: call i32 @__kmpc_omp_taskyield(%
+// CHECK: load i32*, i32** %
+// CHECK: store i32 3, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
+
+// CHECK: call i32 @__kmpc_omp_taskwait(%
+// CHECK: load i32*, i32** %
+// CHECK: store i32 4, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
 #endif
 
diff --git a/test/OpenMP/task_depend_messages.cpp b/test/OpenMP/task_depend_messages.cpp
index 39bf484..576738c 100644
--- a/test/OpenMP/task_depend_messages.cpp
+++ b/test/OpenMP/task_depend_messages.cpp
@@ -43,7 +43,7 @@
   #pragma omp task depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
   #pragma omp task depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
   #pragma omp task depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
-  #pragma omp task depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  #pragma omp task depend (in : argv[-1:0])
   #pragma omp task depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
   #pragma omp task depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
   #pragma omp task depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
diff --git a/test/OpenMP/task_firstprivate_codegen.cpp b/test/OpenMP/task_firstprivate_codegen.cpp
index e224414..0d8e1c4 100644
--- a/test/OpenMP/task_firstprivate_codegen.cpp
+++ b/test/OpenMP/task_firstprivate_codegen.cpp
@@ -24,10 +24,10 @@
 
 volatile double g;
 
-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}} }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
-// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
 // CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
 // CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
@@ -58,18 +58,16 @@
   // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
 // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
 // LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
-// LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_ADDR_REF]]
-// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_REF]]
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
 // LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
 
 // LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
 // LAMBDA: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-// LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_ADDR_REF]]
-// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
+// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
 // LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
 
 // LAMBDA: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
@@ -104,18 +102,16 @@
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
   // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
   // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
   // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
-  // BLOCKS: [[G_REF:%.+]] = load double*, double** [[G_ADDR_REF]]
-  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_REF]]
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
   // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
 
   // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
   // BLOCKS: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-  // BLOCKS: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_ADDR_REF]]
-  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
+  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
   // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
   // BLOCKS: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
   // BLOCKS: ret
@@ -180,20 +176,22 @@
 // CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
 // CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
-// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
-// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[T_VAR:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK: store i32 [[T_VAR]], i32* [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
 // CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
-// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
 // CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
-// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
-// CHECK: store i{{[0-9]+}}* [[SIVAR]], i{{[0-9]+}}** [[SIVAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 5
+// CHECK: [[SIVAR_VAL:%.+]] = load i32, i32* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_REF]],
 
 // Allocate task.
 // Returns struct kmp_task_t {
 //         [[KMP_TASK_T]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 72, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 80, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
 
 // Fill kmp_task_t->shareds by copying from original capture argument.
@@ -211,7 +209,7 @@
 // Constructors for s_arr and var.
 // s_arr;
 // CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
 // CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
 // CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
 // CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
@@ -221,14 +219,13 @@
 
 // var;
 // CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
-// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
 // CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
 // CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
 
 // t_var;
 // CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
-// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
-// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
 // CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]],
 // CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
 
@@ -239,14 +236,14 @@
 
 // sivar;
 // CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4
-// CHECK: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
-// CHECK: [[SIVAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[SIVAR_ADDR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 5
 // CHECK: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* [[SIVAR_REF]],
 // CHECK: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]],
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -347,7 +344,7 @@
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_TMAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
 
 // Fill kmp_task_t->shareds by copying from original capture argument.
@@ -391,7 +388,8 @@
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -422,11 +420,11 @@
 // CHECK: ret void
 
 // CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
-
-// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
-// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
-// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
-// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
 // CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
 // CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
diff --git a/test/OpenMP/task_firstprivate_messages.cpp b/test/OpenMP/task_firstprivate_messages.cpp
index ef5f385..11d8c57 100644
--- a/test/OpenMP/task_firstprivate_messages.cpp
+++ b/test/OpenMP/task_firstprivate_messages.cpp
@@ -7,6 +7,17 @@
   return argc;
 }
 
+template <typename T>
+struct S {
+  T b;
+  S(T a, T c) {
+#pragma omp task default(none) firstprivate(a, b)
+    a = b = c; // expected-error {{variable 'c' must have explicitly specified data sharing attributes}}
+  }
+};
+
+S<int> s(3, 4); // expected-note {{in instantiation of member function 'S<int>::S' requested here}}
+
 struct S1; // expected-note {{declared here}} expected-note{{forward declaration of 'S1'}}
 extern S1 a;
 class S2 {
diff --git a/test/OpenMP/task_if_codegen.cpp b/test/OpenMP/task_if_codegen.cpp
index 5992be0..4226dce 100644
--- a/test/OpenMP/task_if_codegen.cpp
+++ b/test/OpenMP/task_if_codegen.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 // REQUIRES: x86-registered-target
 // expected-no-diagnostics
 #ifndef HEADER
@@ -63,11 +63,11 @@
 // CHECK-LABEL: @main
 int main() {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN7:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN7:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 #pragma omp task if (true)
   fn7();
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN8:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN8:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
 // CHECK: call i32 [[CAP_FN8]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
@@ -75,7 +75,7 @@
 #pragma omp task if (false)
   fn8();
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN9:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN9:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -89,7 +89,7 @@
 // CHECK: [[OMP_END]]
 #pragma omp task if (Arg)
   fn9();
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN10:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN10:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -126,7 +126,7 @@
 
 // CHECK-LABEL: define {{.+}} @{{.+}}tmain
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32,  %{{[^*]+}}*)* [[CAP_FN1:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32,  %{{[^*]+}}*)* [[CAP_FN1:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 
 // CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(
@@ -135,7 +135,7 @@
 // CHECK: call i32 [[CAP_FN2:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
 // CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN3:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN3:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -148,7 +148,7 @@
 // CHECK: br label %[[OMP_END]]
 // CHECK: [[OMP_END]]
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN4:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN4:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -162,7 +162,7 @@
 // CHECK: br label %[[OMP_END]]
 // CHECK: [[OMP_END]]
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN5:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN5:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -176,7 +176,7 @@
 // CHECK: br label %[[OMP_END]]
 // CHECK: [[OMP_END]]
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN6:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN6:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
diff --git a/test/OpenMP/task_messages.cpp b/test/OpenMP/task_messages.cpp
index 64bf8a4..f42a37a 100644
--- a/test/OpenMP/task_messages.cpp
+++ b/test/OpenMP/task_messages.cpp
@@ -60,11 +60,10 @@
 // expected-error@+1 2 {{calling a private constructor of class 'S'}}
 #pragma omp parallel shared(a, b)
   ++a, ++b;
-// expected-note@+1 3 {{defined as reduction}}
+// expected-note@+1 2 {{defined as reduction}}
 #pragma omp parallel reduction(+ : r)
-// expected-error@+1 {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
+// expected-error@+1 2 {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
 #pragma omp task firstprivate(r)
-  // expected-error@+1 2 {{reduction variables may not be accessed in an explicit task}}
   ++r;
 // expected-note@+1 2 {{defined as reduction}}
 #pragma omp parallel reduction(+ : r)
@@ -77,12 +76,11 @@
   // expected-error@+1 2 {{reduction variables may not be accessed in an explicit task}}
   ++r;
 #pragma omp parallel
-// expected-note@+1 3 {{defined as reduction}}
+// expected-note@+1 2 {{defined as reduction}}
 #pragma omp for reduction(+ : r)
   for (int i = 0; i < 10; ++i)
-// expected-error@+1 {{argument of a reduction clause of a for construct must not appear in a firstprivate clause on a task construct}}
+// expected-error@+1 2 {{argument of a reduction clause of a for construct must not appear in a firstprivate clause on a task construct}}
 #pragma omp task firstprivate(r)
-    // expected-error@+1 2 {{reduction variables may not be accessed in an explicit task}}
     ++r;
 #pragma omp parallel
 // expected-note@+1 2 {{defined as reduction}}
diff --git a/test/OpenMP/task_private_codegen.cpp b/test/OpenMP/task_private_codegen.cpp
index 1455fd1..97155a7 100644
--- a/test/OpenMP/task_private_codegen.cpp
+++ b/test/OpenMP/task_private_codegen.cpp
@@ -24,7 +24,7 @@
 
 volatile double g;
 
-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}} }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
@@ -56,10 +56,8 @@
   // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
-// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
 // LAMBDA: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
 // LAMBDA: ret
 #pragma omp task private(g, sivar)
@@ -94,10 +92,8 @@
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
   // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
-  // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
   // BLOCKS: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
   // BLOCKS: ret
 #pragma omp task private(g, sivar)
@@ -162,7 +158,7 @@
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 80, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
 
 // CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -186,7 +182,8 @@
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -275,7 +272,7 @@
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_TMAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
 
 // CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -299,7 +296,8 @@
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -331,10 +329,11 @@
 
 // CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
 
-// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
-// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
-// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
-// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
 // CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
 // CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
diff --git a/test/OpenMP/taskgroup_codegen.cpp b/test/OpenMP/taskgroup_codegen.cpp
index d1bc2aa..0f6e81b 100644
--- a/test/OpenMP/taskgroup_codegen.cpp
+++ b/test/OpenMP/taskgroup_codegen.cpp
@@ -32,6 +32,7 @@
   foo();
 // CHECK-NOT:   call {{.*}}void @__kmpc_taskgroup
 // CHECK-NOT:   call {{.*}}void @__kmpc_end_taskgroup
+// CHECK:       ret
   return a;
 }
 
diff --git a/test/OpenMP/taskloop_codegen.cpp b/test/OpenMP/taskloop_codegen.cpp
new file mode 100644
index 0000000..e585fce
--- /dev/null
+++ b/test/OpenMP/taskloop_codegen.cpp
@@ -0,0 +1,192 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 0, i64 0, i8* null)
+#pragma omp taskloop priority(argc)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[GRAINSIZE:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 1, i32 1, i64 [[GRAINSIZE]], i8* null)
+#pragma omp taskloop nogroup grainsize(argc)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0
+// CHECK: [[IF_INT:%.+]] = sext i1 [[IF]] to i32
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 [[IF_INT]], i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 4, i8* null)
+  int i;
+#pragma omp taskloop if(argc) shared(argc, argv) collapse(2) num_tasks(4)
+  for (i = 0; i < argc; ++i)
+  for (int j = argc; j < argv[argc][argc]; ++j)
+    ;
+}
+
+// CHECK: define internal i32 [[TASK1]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK: store i32 %
+// CHECK: load i32, i32* %
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label %
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK2]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK: store i32 %
+// CHECK: load i32, i32* %
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label %
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK3]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: store i64 [[LB_VAL]], i64* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: ret i32 0
+
+// CHECK-LABEL: @_ZN1SC2Ei
+struct S {
+  int a;
+  S(int c) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[NUM_TASKS:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 [[NUM_TASKS]], i8* null)
+#pragma omp taskloop shared(c) num_tasks(a)
+    for (a = 0; a < c; ++a)
+      ;
+  }
+} s(1);
+
+// CHECK: define internal i32 [[TASK4]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK: store i32 %
+// CHECK: load i32, i32* %
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label %
+// CHECK: ret i32 0
+
+#endif
diff --git a/test/OpenMP/taskloop_collapse_messages.cpp b/test/OpenMP/taskloop_collapse_messages.cpp
index f33da11..1a5620e 100644
--- a/test/OpenMP/taskloop_collapse_messages.cpp
+++ b/test/OpenMP/taskloop_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp taskloop', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp taskloop' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop', but found only 1}}
   #pragma omp taskloop collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop', but found only 1}}
-  #pragma omp taskloop collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp taskloop collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop' must be a for loop}}
diff --git a/test/OpenMP/taskloop_firstprivate_codegen.cpp b/test/OpenMP/taskloop_firstprivate_codegen.cpp
new file mode 100644
index 0000000..822a5c6
--- /dev/null
+++ b/test/OpenMP/taskloop_firstprivate_codegen.cpp
@@ -0,0 +1,511 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test(ttt);
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop firstprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+// LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+  // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+  // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+  // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test(ttt);
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop firstprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[T_VAR_VAL:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK: store i32 [[T_VAR_VAL]], i32* [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 5
+// CHECK: [[SIVAR_VAL:%.+]] = load i32, i32* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates from the original variables (plain copies for simple types, copy constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_MAIN_TY]]*
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_ADDR_REF]],
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// sivar;
+// CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 5
+// CHECK: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* [[SIVAR_ADDR_REF]],
+// CHECK: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates from the original variables (plain copies for simple types, copy constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_TMAIN_TY]]*
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]], align 128
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], align 128
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop firstprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_firstprivate_messages.cpp b/test/OpenMP/taskloop_firstprivate_messages.cpp
index e2e87e4..fe22311 100644
--- a/test/OpenMP/taskloop_firstprivate_messages.cpp
+++ b/test/OpenMP/taskloop_firstprivate_messages.cpp
@@ -295,9 +295,9 @@
 #pragma omp taskloop firstprivate(i) // expected-note {{defined as firstprivate}}
   for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop' directive may not be firstprivate, predetermined as private}}
     foo();
-#pragma omp parallel reduction(+ : i)
-#pragma omp taskloop firstprivate(i) // expected-note {{defined as firstprivate}}
-  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop' directive may not be firstprivate, predetermined as private}}
+#pragma omp parallel reduction(+ : i) // expected-note 4 {{defined as reduction}}
+#pragma omp taskloop firstprivate(i) //expected-error {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
+  for (i = 0; i < argc; ++i) // expected-error 3 {{reduction variables may not be accessed in an explicit task}}
     foo();
 #pragma omp parallel
 #pragma omp taskloop firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
diff --git a/test/OpenMP/taskloop_lastprivate_codegen.cpp b/test/OpenMP/taskloop_lastprivate_codegen.cpp
new file mode 100644
index 0000000..8414b6f
--- /dev/null
+++ b/test/OpenMP/taskloop_lastprivate_codegen.cpp
@@ -0,0 +1,519 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop lastprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// LAMBDA: ret
+#pragma omp taskloop lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: icmp ne i32 %{{.+}}, 0
+    // LAMBDA: br i1
+    // LAMBDA: load double, double* %
+    // LAMBDA: store volatile double %
+    // LAMBDA: load i32, i32* %
+    // LAMBDA: store i32 %
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+  // BLOCKS: ret
+#pragma omp taskloop lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: icmp ne i32 %{{.+}}, 0
+    // BLOCKS: br i1
+    // BLOCKS: load double, double* %
+    // BLOCKS: store volatile double %
+    // BLOCKS: load i32, i32* %
+    // BLOCKS: store i32 %
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop lastprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store i{{[0-9]+}}* [[SIVAR]], i{{[0-9]+}}** [[SIVAR_REF]],
+
+// Allocate task.
+// Returns a pointer to [[KMP_TASK_MAIN_TY]], laid out as struct {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[PRIVATES_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+
+// t_var;
+// vec;
+// sivar;
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
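+// No destructors must be called for private copies of s_arr and var. NOTE(review): those are fields 0 and 1 of [[PRIVATES_MAIN_TY]]; the indices 2 and 3 in the negative checks below match tmain's layout instead - confirm.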
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// The private copies are actually used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns a pointer to [[KMP_TASK_TMAIN_TY]], laid out as struct {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[PRIVATES_TMAIN_TY]] privates; (follows an [N x i8] member)
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// t_var;
+// vec;
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// The private copies are actually used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+// ARRAY: icmp ne i32 %{{.+}}, 0
+// ARRAY: store float* %{{.+}}, float** %{{.+}},
+// ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
+#pragma omp taskloop lastprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_loop_messages.cpp b/test/OpenMP/taskloop_loop_messages.cpp
index 02518e5..291cbdb 100644
--- a/test/OpenMP/taskloop_loop_messages.cpp
+++ b/test/OpenMP/taskloop_loop_messages.cpp
@@ -427,12 +427,12 @@
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -483,7 +483,7 @@
 #pragma omp taskloop
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel
 #pragma omp taskloop
diff --git a/test/OpenMP/taskloop_private_codegen.cpp b/test/OpenMP/taskloop_private_codegen.cpp
new file mode 100644
index 0000000..38b20c5
--- /dev/null
+++ b/test/OpenMP/taskloop_private_codegen.cpp
@@ -0,0 +1,420 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32. Investigating.
+// REQUIRES: shell
+
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S { // Test helper: holds one value of type T and has user-provided constructors and destructor.
+  T f;
+  S(T a) : f(a) {} // Converting constructor: stores a in f.
+  S() : f() {} // Default constructor: value-initializes f.
+  operator T() { return T(); } // Conversion to T yields a value-initialized T; it does not read f.
+  ~S() {} // User-provided destructor with an empty body.
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() { // Template variant of the test: a taskloop whose private clause lists scalar, array, and class-type locals.
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T(); // Local declared with 128-byte alignment.
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop private(t_var, vec, s_arr, s_arr, var, var) // s_arr and var are each listed twice in this clause.
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T(); // Result is a value-initialized T, independent of the loop.
+}
+
+int main() { // Test driver: preprocessor macros select the lambda variant, the blocks variant, or the plain variant below.
+  static int sivar; // Function-local static that is listed in the private clauses below.
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() { // Outer lambda; it is invoked immediately at its closing "}();".
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+    // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+    // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+    // LAMBDA: store i{{[0-9]+}} 3, i{{[0-9]+}}* [[SIVAR_REF]]
+
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 2;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() { // Inner lambda captures by reference and writes g and sivar from inside the loop body.
+      g = 2;
+      sivar = 3;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{ // Outer block literal; it is invoked immediately at its closing "}();".
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 4, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 3;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 3, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{ // Inner block literal writes g and sivar from inside the loop body.
+      g = 2;
+      sivar = 4;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop private(var, t_var, s_arr, vec, s_arr, var, sivar) // var and s_arr are each listed twice in this clause.
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 8;
+  }
+#pragma omp task
+  g+=1; // Separate task region following the taskloop; it updates the global g.
+  return tmain<int>(); // Also instantiates the template variant for int.
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_MAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8*
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used (each loaded private pointer must be referenced again before the function returns).
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_TMAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St { // Helper type for the array-parameter variant: two ints plus user-provided constructor, copy assignment, and destructor.
+  int a, b;
+  St() : a(0), b(0) {} // Default constructor zeroes both fields.
+  St &operator=(const St &) { return *this; }; // Copy assignment copies no fields; it only returns *this.
+  ~St() {} // User-provided destructor with an empty body.
+};
+
+void array_func(int n, float a[n], St s[2]) { // Both array-typed parameters, a and s, are listed in the private clause below.
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop private(a, s)
+  for (int i = 0; i < 10; ++i)
+    ; // Empty loop body: only the directive itself is under test.
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_private_messages.cpp b/test/OpenMP/taskloop_private_messages.cpp
index 3d00d3f..367d59d 100644
--- a/test/OpenMP/taskloop_private_messages.cpp
+++ b/test/OpenMP/taskloop_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) { // Constructor body runs a taskloop that lists member a twice: directly and through this->.
+#pragma omp taskloop private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) { // Besides a and this->a, the clause below names s.a, a member of another object; the pragma is annotated with an error for it.
+#pragma omp taskloop private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 { // Class template with a public data member a that is named in private clauses.
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) { // Member a is listed both directly and through this->; the pragma carries no diagnostic annotation.
+#pragma omp taskloop private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) { // The clause also names s.a, a member of another object; the pragma is annotated with an error for it.
+#pragma omp taskloop private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T { // Derives from T and additionally holds its own member a of type T.
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) { // Clause lists a, this->a, and the base-qualified T::a; the pragma carries no diagnostic annotation.
+#pragma omp taskloop private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // s.a and s.T::a are reached through another object; the pragma is annotated with two errors.
+#pragma omp taskloop private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -126,6 +174,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp taskloop private // expected-error {{expected '(' after 'private'}}
@@ -190,6 +240,8 @@
   for(int k = 0; k < argc; ++k)
     si = k + 1;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/taskloop_simd_aligned_messages.cpp b/test/OpenMP/taskloop_simd_aligned_messages.cpp
index b62af04..b45f44f 100644
--- a/test/OpenMP/taskloop_simd_aligned_messages.cpp
+++ b/test/OpenMP/taskloop_simd_aligned_messages.cpp
@@ -196,6 +196,7 @@
   #pragma omp taskloop simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/taskloop_simd_codegen.cpp b/test/OpenMP/taskloop_simd_codegen.cpp
new file mode 100644
index 0000000..dc60009
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_codegen.cpp
@@ -0,0 +1,203 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 0, i64 0, i8* null)
+#pragma omp taskloop simd priority(argc)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[GRAINSIZE:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 1, i32 1, i64 [[GRAINSIZE]], i8* null)
+#pragma omp taskloop simd nogroup grainsize(argc) simdlen(4)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0
+// CHECK: [[IF_INT:%.+]] = sext i1 [[IF]] to i32
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 [[IF_INT]], i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 4, i8* null)
+  int i;
+#pragma omp taskloop simd if(argc) shared(argc, argv) collapse(2) num_tasks(4) safelen(32)
+  for (i = 0; i < argc; ++i)
+  for (int j = argc; j < argv[argc][argc]; ++j)
+    ;
+}
+
+// CHECK: define internal i32 [[TASK1]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP1:!.+]]
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: store i32 %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: br label %{{.*}}!llvm.loop [[LOOP1]]
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK2]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP2:!.+]]
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: store i32 %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: br label %{{.*}}!llvm.loop [[LOOP2]]
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK3]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: store i64 [[LB_VAL]], i64* [[CNT:%.+]],
+// CHECK: br label
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: br label %{{.*}}!llvm.loop
+// CHECK: ret i32 0
+
+// CHECK-LABEL: @_ZN1SC2Ei
+struct S {
+  int a;
+  S(int c) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[NUM_TASKS:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 [[NUM_TASKS]], i8* null)
+#pragma omp taskloop simd shared(c) num_tasks(a) simdlen(64) safelen(8)
+    for (a = 0; a < c; ++a)
+      ;
+  }
+} s(1);
+
+// CHECK: define internal i32 [[TASK4]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: store i32 %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: load i32, i32* %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: br label %{{.*}}!llvm.loop
+// CHECK: ret i32 0
+
+// CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
+// CHECK: !{!"llvm.loop.vectorize.width", i32 4}
+// CHECK: !{!"llvm.loop.vectorize.width", i32 32}
+// CHECK: !{!"llvm.loop.vectorize.width", i32 64}
+
+#endif
diff --git a/test/OpenMP/taskloop_simd_collapse_messages.cpp b/test/OpenMP/taskloop_simd_collapse_messages.cpp
index d178c08..e4ce0c1 100644
--- a/test/OpenMP/taskloop_simd_collapse_messages.cpp
+++ b/test/OpenMP/taskloop_simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp taskloop simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd collapse (1)
@@ -59,16 +71,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop simd', but found only 1}}
   #pragma omp taskloop simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop simd', but found only 1}}
-  #pragma omp taskloop simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp taskloop simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop simd' must be a for loop}}
diff --git a/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp b/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
new file mode 100644
index 0000000..0b87ddd
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
@@ -0,0 +1,511 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test(ttt);
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop simd firstprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+// LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop simd firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+  // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+  // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+  // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop simd firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test(ttt);
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop simd firstprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[T_VAR_VAL:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK: store i32 [[T_VAR_VAL]], i32* [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 5
+// CHECK: [[SIVAR_VAL:%.+]] = load i32, i32* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct [[KMP_TASK_MAIN_TY]] {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[PRIVATES_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates by copying from the original variables (plain stores/memcpy for scalar types and arrays of them, copy constructors for class types).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_MAIN_TY]]*
+
+// Copy constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_ADDR_REF]],
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// sivar;
+// CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 5
+// CHECK: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* [[SIVAR_ADDR_REF]],
+// CHECK: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
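+// No destructors must be called for private copies of s_arr and var. NOTE(review): the negative patterns below name privates fields 2 and 3, but s_arr and var are fields 0 and 1 in the initialization above - confirm the intended indices.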
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates are actually used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates from the original variables (load/store for simple types, memcpy for arrays, copy constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_TMAIN_TY]]*
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]], align 128
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], align 128
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// Copy constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates are actually used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St { // Class type with user-provided constructors and destructor; element type of array_func's parameter 's'.
+  int a, b;
+  St() : a(0), b(0) {} // Default constructor zeroes both fields.
+  St(const St &) {} // Copy constructor with an empty body: it does not copy a or b.
+  ~St() {} // User-provided destructor with an empty body.
+};
+
+void array_func(int n, float a[n], St s[2]) { // Array-typed parameters a and s are named in the firstprivate clause below.
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop simd firstprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ; // Empty loop body; the loop only exists to carry the directive.
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_simd_firstprivate_messages.cpp b/test/OpenMP/taskloop_simd_firstprivate_messages.cpp
index 8394669..18cefc1 100644
--- a/test/OpenMP/taskloop_simd_firstprivate_messages.cpp
+++ b/test/OpenMP/taskloop_simd_firstprivate_messages.cpp
@@ -295,9 +295,9 @@
 #pragma omp taskloop simd firstprivate(i) // expected-note {{defined as firstprivate}}
   for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop simd' directive may not be firstprivate, predetermined as linear}}
     foo();
-#pragma omp parallel reduction(+ : i)
-#pragma omp taskloop simd firstprivate(i) // expected-note {{defined as firstprivate}}
-  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop simd' directive may not be firstprivate, predetermined as linear}}
+#pragma omp parallel reduction(+ : i) // expected-note 4 {{defined as reduction}}
+#pragma omp taskloop simd firstprivate(i) // expected-error {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
+  for (i = 0; i < argc; ++i) // expected-error 3 {{reduction variables may not be accessed in an explicit task}}
     foo();
 #pragma omp parallel
 #pragma omp taskloop simd firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
diff --git a/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp b/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
new file mode 100644
index 0000000..e3562a8
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
@@ -0,0 +1,519 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S { // Wrapper around a single T with user-provided constructors, conversion operator and destructor.
+  T f;
+  S(T a) : f(a) {} // Converting constructor: stores a in f.
+  S() : f() {} // Default constructor: value-initializes f.
+  S(const S &s, T t = T()) : f(s.f + t) {} // Copy constructor with an extra defaulted argument; stores s.f + t.
+  operator T() { return T(); } // Conversion returns a value-initialized T, not the stored f.
+  ~S() {} // User-provided destructor with an empty body.
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() { // Template counterpart of the default variant of main(), which calls it as tmain<int>().
+  S<T> ttt;
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T(); // Scalar declared with 128-byte alignment.
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2}; // Elements are built from 1 and 2 through S(T).
+  S<T> var(3);
+#pragma omp taskloop simd lastprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) { // The lastprivate clause above names s_arr and var twice.
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T(); // Always returns a value-initialized T.
+}
+
+int main() { // Body is selected by preprocessor macros: lambda variant, blocks variant, or the default variant that also calls tmain<int>().
+  static int sivar; // Function-local static named in the lastprivate clauses below.
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() { // Outer lambda that contains the taskloop simd region; invoked immediately.
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// LAMBDA: ret
+#pragma omp taskloop simd lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: icmp ne i32 %{{.+}}, 0
+    // LAMBDA: br i1
+    // LAMBDA: load double, double* %
+    // LAMBDA: store volatile double %
+    // LAMBDA: load i32, i32* %
+    // LAMBDA: store i32 %
+    // LAMBDA: ret
+    [&]() { // Inner lambda invoked inside the loop body; overwrites g and sivar.
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{ // Outer block that contains the taskloop simd region; invoked immediately.
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+  // BLOCKS: ret
+#pragma omp taskloop simd lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[ISVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: icmp ne i32 %{{.+}}, 0
+    // BLOCKS: br i1
+    // BLOCKS: load double, double* %
+    // BLOCKS: store volatile double %
+    // BLOCKS: load i32, i32* %
+    // BLOCKS: store i32 %
+    ^{ // Inner block invoked inside the loop body; overwrites g and sivar.
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2}; // Elements are built from 1 and 2 through S(T).
+  S<double> var(3);
+#pragma omp taskloop simd lastprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) { // The lastprivate clause above names var and s_arr twice.
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>(); // Also instantiates the template variant of the test.
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store i{{[0-9]+}}* [[SIVAR]], i{{[0-9]+}}** [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+
+// t_var;
+// vec;
+// sivar;
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[PRIVATES_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// t_var;
+// vec;
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St { // class type used for the array parameter `s` of array_func below
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {} // user-provided copy constructor; it copies nothing, so a and b stay uninitialized
+  ~St() {} // user-provided destructor
+};
+
+void array_func(int n, float a[n], St s[2]) { // a and s are array-typed parameters; the IR checks below see them as float* and %struct.St*
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+// ARRAY: icmp ne i32 %{{.+}}, 0
+// ARRAY: store float* %{{.+}}, float** %{{.+}},
+// ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
+#pragma omp taskloop simd lastprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ; // intentionally empty loop body
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_simd_loop_messages.cpp b/test/OpenMP/taskloop_simd_loop_messages.cpp
index 4731872..3326e6f 100644
--- a/test/OpenMP/taskloop_simd_loop_messages.cpp
+++ b/test/OpenMP/taskloop_simd_loop_messages.cpp
@@ -428,12 +428,12 @@
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -484,7 +484,7 @@
 #pragma omp taskloop simd
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel
 #pragma omp taskloop simd
diff --git a/test/OpenMP/taskloop_simd_private_codegen.cpp b/test/OpenMP/taskloop_simd_private_codegen.cpp
new file mode 100644
index 0000000..557601e
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_private_codegen.cpp
@@ -0,0 +1,420 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32. Investigating.
+// REQUIRES: shell
+
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S { // test helper wrapping one T; instantiated below as S<double> (main) and S<T> (tmain)
+  T f;
+  S(T a) : f(a) {} // converting constructor, used by `S<T> var(3)` and the `{1, 2}` array initializers
+  S() : f() {} // default constructor; value-initializes f
+  operator T() { return T(); } // conversion yields a value-initialized T, not the stored f
+  ~S() {} // user-provided destructor
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() { // template variant of the test body in main; main calls it as tmain<int>()
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T(); // over-aligned to 128 bytes
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop simd private(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) { // NOTE(review): the clause above lists s_arr and var twice - presumably deliberate, confirm
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar; // function-local static; named in the private clauses of the taskloop simd directives below
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() { // outer lambda, invoked immediately by the matching }(); further down
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop simd private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+    // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+    // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+    // LAMBDA: store i{{[0-9]+}} 3, i{{[0-9]+}}* [[SIVAR_REF]]
+
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 2;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() { // inner lambda, invoked immediately from inside the loop body
+      g = 2;
+      sivar = 3;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{ // outer block literal, invoked immediately by the matching }(); further down
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop simd private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 4, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 3;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 3, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{ // inner block literal, invoked immediately from inside the loop body
+      g = 2;
+      sivar = 4;
+    }();
+  }
+  }();
+  return 0;
+#else // plain variant, compiled when neither LAMBDA nor BLOCKS is defined
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop simd private(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 8;
+  }
+#pragma omp task
+  g+=1; // body of the omp task directive above; g is the file-scope volatile double
+  return tmain<int>(); // also runs the template variant with T = int
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_MAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[PRIVATES_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8*
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_TMAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[PRIVATES_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St { // Class type with user-provided ctor, copy-assignment and dtor; element type of parameter 's' of array_func.
+  int a, b;
+  St() : a(0), b(0) {} // Zero-initializes both members.
+  St &operator=(const St &) { return *this; }; // Leaves *this unchanged; the ';' after the body is a redundant empty declaration.
+  ~St() {} // User-provided, empty destructor.
+};
+
+void array_func(int n, float a[n], St s[2]) { // Array-typed parameters 'a' and 's' are both named in the private clause below.
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop simd private(a, s)
+  for (int i = 0; i < 10; ++i) // Fixed trip count of 10; neither 'a' nor 's' is touched in the loop.
+    ; // Intentionally empty loop body.
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_simd_private_messages.cpp b/test/OpenMP/taskloop_simd_private_messages.cpp
index 4a9b08a..ba9e8da 100644
--- a/test/OpenMP/taskloop_simd_private_messages.cpp
+++ b/test/OpenMP/taskloop_simd_private_messages.cpp
@@ -29,7 +29,11 @@
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp taskloop simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp taskloop simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 { // Class template whose member functions name the data member 'a' in private clauses.
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) { // Names the member as 'a' and as 'this->a'; the directive below carries no diagnostic annotation.
+#pragma omp taskloop simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) { // Additionally names 's.a', a member of another object; the directive below is annotated with one diagnostic.
+#pragma omp taskloop simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T { // Derives from T and also holds a T data member named 'a'.
+  T a;
+  S7() : a(0) {} // Private default constructor.
+
+public:
+  S7(T v) : a(v) { // Names 'a', 'this->a' and the base-qualified 'T::a'; the directive below carries no diagnostic annotation.
+#pragma omp taskloop simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) { // Names 's.a' and 's.T::a', members of another object; the directive below is annotated with two diagnostics.
+#pragma omp taskloop simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -126,6 +174,8 @@
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp taskloop simd private // expected-error {{expected '(' after 'private'}}
@@ -190,6 +240,8 @@
   for(int k = 0; k < argc; ++k)
     si = k + 1;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/taskloop_simd_safelen_messages.cpp b/test/OpenMP/taskloop_simd_safelen_messages.cpp
index 3182c8a..729f314 100644
--- a/test/OpenMP/taskloop_simd_safelen_messages.cpp
+++ b/test/OpenMP/taskloop_simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd safelen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp taskloop simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp taskloop simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop simd' must be a for loop}}
diff --git a/test/OpenMP/taskloop_simd_simdlen_messages.cpp b/test/OpenMP/taskloop_simd_simdlen_messages.cpp
index ba3f20e..79655ba 100644
--- a/test/OpenMP/taskloop_simd_simdlen_messages.cpp
+++ b/test/OpenMP/taskloop_simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd simdlen (4)
@@ -57,16 +69,27 @@
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp taskloop simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp taskloop simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop simd' must be a for loop}}
diff --git a/test/OpenMP/teams_ast_print.cpp b/test/OpenMP/teams_ast_print.cpp
index 292586a..f3d577c 100644
--- a/test/OpenMP/teams_ast_print.cpp
+++ b/test/OpenMP/teams_ast_print.cpp
@@ -109,4 +109,6 @@
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
+extern template int S<int>::TS;
+extern template long S<long>::TS;
 #endif
diff --git a/test/OpenMP/teams_codegen.cpp b/test/OpenMP/teams_codegen.cpp
new file mode 100644
index 0000000..8aaa206
--- /dev/null
+++ b/test/OpenMP/teams_codegen.cpp
@@ -0,0 +1,353 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+int Gbla;
+long long Gblb;
+int &Gblc = Gbla;
+
+// CK1-LABEL: teams_argument_global_local
+int teams_argument_global_local(int a){ // Six target/teams regions: two without clauses, then num_teams/thread_limit built from a local, the parameter and globals.
+  int comp = 1; // Modified inside every teams region and returned at the end.
+
+  int la = 23; // Local used as the num_teams argument and, separately, as the thread_limit argument below.
+  float lc = 25.0; // Local float; cast to long long inside a thread_limit expression below.
+
+  // CK1: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  #pragma omp teams
+  {
+    ++comp;
+  }
+
+  // CK1: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  {{{ // The teams directive is wrapped in extra nested compound statements here.
+    #pragma omp teams
+    {
+      ++comp;
+    }
+  }}}
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 0)
+  // CK1-DAG: [[NT]] = load i32, i32* [[NTA:%[^,]+]],
+
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams(la)
+  {
+    ++comp;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 [[NT:%[^,]+]])
+  // CK1-DAG: [[NT]] = load i32, i32* [[NTA:%[^,]+]],
+
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  #pragma omp teams thread_limit(la)
+  {
+    ++comp;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK1-DAG: [[NT]] = add nsw i32 [[NTA:%[^,]+]], [[NTB:%[^,]+]]
+  // CK1-DAG: [[NTA]] = load i32, i32* @Gbla,
+  // CK1-DAG: [[NTB]] = load i32, i32* %{{.+}},
+
+  // CK1-DAG: [[TL]] = trunc i64 [[TLA:%[^,]+]] to i32
+  // CK1-DAG: [[TLA]] = add nsw i64 [[TLB:%[^,]+]], [[TLC:%[^,]+]]
+  // CK1-DAG: [[TLC]] = fptosi float [[TLD:%[^,]+]] to i64
+  // CK1-DAG: [[TLD]] = load float, float* %{{.+}},
+  // CK1-DAG: [[TLB]] = load i64, i64* @Gblb,
+
+  // CK1: call void @{{.+}}(i{{.+}} {{.+}}, i{{.+}} {{.+}}, i{{.+}} {{.+}}, i{{.+}} {{.+}}, i{{.+}} {{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams(Gbla+a) thread_limit(Gblb+(long long)lc)
+  {
+    ++comp;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 {{.+}}, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK1-DAG: [[NT]] = add nsw i32 [[NTA:%[^,]+]], 1
+  // CK1-DAG: [[NTA]] = load i32, i32* @Gbla,
+
+  // CK1-DAG: [[TL]] = add nsw i32 [[TLA:%[^,]+]], 2
+  // CK1-DAG: [[TLA]] = load i32, i32* @Gbla,
+
+  // CK1: call void @{{.+}}(i{{.+}} {{.+}}
+  #pragma omp target
+  #pragma omp teams num_teams(Gblc+1) thread_limit(Gblc+2)
+  {
+    comp += Gblc; // Gblc is the file-scope reference bound to Gbla; this is the only region that reads it in its body.
+  }
+
+  return comp;
+}
+
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2-DAG: [[SSI:%.+]] = type { i32, float }
+// CK2-DAG: [[SSL:%.+]] = type { i64, float }
+template <typename T>
+struct SS{ // Aggregate with a T member followed by a float; used below with T = int and T = long long.
+  T a;
+  float b;
+};
+
+SS<int> Gbla;
+SS<long long> Gblb;
+
+// CK2-LABEL: teams_template_arg
+int teams_template_arg(void) { // Uses members of global and local SS objects as num_teams/thread_limit arguments.
+  int comp = 1; // Incremented inside both teams regions and returned.
+
+  SS<int> la; // Local; its float member b is cast to long long for thread_limit in the first region.
+  SS<long long> lb; // Local; its float member b is cast to long long for num_teams in the second region.
+
+  // CK2-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK2-DAG: [[NT]] = load i32, i32* getelementptr inbounds ([[SSI]], [[SSI]]* @Gbla, i32 0, i32 0)
+
+  // CK2-DAG: [[TL]] = trunc i64 [[TLA:%[^,]+]] to i32
+  // CK2-DAG: [[TLA]] = fptosi float [[TLB:%[^,]+]] to i64
+  // CK2-DAG: [[TLB]] = load float, float* [[TLC:%[^,]+]],
+  // CK2-DAG: [[TLC]] = getelementptr inbounds [[SSI]], [[SSI]]* %{{.+}}, i32 0, i32 1
+
+  // CK2: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}}, {{.+}} {{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams(Gbla.a) thread_limit((long long)la.b)
+  {
+    ++comp;
+  }
+
+  // CK2-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK2-DAG: [[TL]] = trunc i64 [[TLD:%[^,]+]] to i32
+  // CK2-DAG: [[TLD]] = load i64, i64* getelementptr inbounds ([[SSL]], [[SSL]]* @Gblb, i32 0, i32 0),
+
+  // CK2-DAG: [[NT]] = trunc i64 [[NTA:%[^,]+]] to i32
+  // CK2-DAG: [[NTA]] = fptosi float [[NTB:%[^,]+]] to i64
+  // CK2-DAG: [[NTB]] = load float, float* [[NTC:%[^,]+]],
+  // CK2-DAG: [[NTC]] = getelementptr inbounds [[SSL]], [[SSL]]* %{{.+}}, i32 0, i32 1
+
+  // CK2: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}}, {{.+}} {{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams((long long)lb.b) thread_limit(Gblb.a)
+  {
+    ++comp;
+  }
+  return comp;
+}
+#endif // CK2
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3: [[SSI:%.+]] = type { i32, float }
+// CK3-LABEL: teams_template_struct
+
+template <typename T, int X, long long Y>
+struct SS{ // Non-type template parameters X and Y are used directly in the clause expressions of foo().
+  T a;
+  float b;
+
+  int foo(void) { // Two target/teams regions whose clauses read members a and b and the template constants X and Y.
+    int comp = 1; // Incremented inside both teams regions and returned.
+
+    // CK3-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 123)
+
+    // CK3-DAG: [[NT]] = load i32, i32* [[NTA:%[^,]+]],
+    // CK3-DAG: [[NTA]] = getelementptr inbounds [[SSI]], [[SSI]]* [[NTB:%[^,]+]], i32 0, i32 0
+    // CK3-DAG: [[NTB]] = load [[SSI]]*, [[SSI]]** %{{.+}},
+
+    // CK3: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}})
+    #pragma omp target
+    #pragma omp teams num_teams(a) thread_limit(X)
+    {
+      ++comp;
+    }
+
+    // CK3-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 456, i32 [[TL:%[^,]+]])
+
+    // CK3-DAG: [[TL]] = add nsw i32 [[TLA:%[^,]+]], 123
+    // CK3-DAG: [[TLA]] = fptosi float [[TLB:%[^,]+]] to i32
+    // CK3-DAG: [[TLB]] = load float, float* [[TLC:%[^,]+]],
+    // CK3-DAG: [[TLC]] = getelementptr inbounds [[SSI]], [[SSI]]* [[THIS:%[^,]+]], i32 0, i32 1
+
+    // CK3: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}})
+    #pragma omp target
+    #pragma omp teams num_teams(Y) thread_limit((int)b+X)
+    {
+      ++comp;
+    }
+    return comp;
+  }
+};
+
+int teams_template_struct(void) { // Instantiates SS and calls foo() so its two teams regions are emitted.
+  SS<int, 123, 456> V; // X = 123 and Y = 456; both values appear literally in the calls checked inside foo().
+  return V.foo();
+
+}
+#endif // CK3
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-32
+
+#ifdef CK4
+
+// CK4-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CK4-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CK4-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK4-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00"
+// CK4-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
+
+template <typename T> // called from main() below as tmain(argv), i.e. with T = char **
+int tmain(T argc) {
+#pragma omp target
+#pragma omp teams // no clauses; the checks after main() only look for the __kmpc_fork_teams call
+  argc = 0; // argc is the one variable handed to the outlined teams function
+  return 0;
+}
+
+int main (int argc, char **argv) { // integer-argc variant of the region; also triggers the tmain instantiation
+#pragma omp target
+#pragma omp teams // bare teams directive, no num_teams/thread_limit clauses
+  argc = 0; // int argc is the single value passed on to __kmpc_fork_teams
+  return tmain(argv); // argv is char **, so tmain is instantiated with a pointer type
+}
+
+// CK4:  define {{.*}}void @{{[^,]+}}(i{{.+}} %[[ARGC:.+]])
+// CK4:  [[ARGCADDR:%.+]] = alloca i{{.+}}
+// CK4:  store i{{.+}} %[[ARGC]], i{{.+}}* [[ARGCADDR]]
+// CK4-64:  [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK4-64:  call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* {{.+}} to void (i32*, i32*, ...)*), i32* [[CONV]])
+// CK4-32:  call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* {{.+}} to void (i32*, i32*, ...)*), i32* [[ARGCADDR]])
+// CK4:  ret void
+// CK4-NEXT: }
+
+// CK4:  define {{.*}}void @{{[^,]+}}(i8** [[ARGC1:%.+]])
+// CK4:  [[ARGCADDR1:%.+]] = alloca i8**
+// CK4:  store i8** [[ARGC1]], i8*** [[ARGCADDR1]]
+// CK4:  call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i8***)* {{.+}} to void (i32*, i32*, ...)*), i8*** [[ARGCADDR1]])
+
+
+#endif // CK4
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-32
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-32
+
+// expected-no-diagnostics
+#ifdef CK5
+
+// CK5-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CK5-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CK5-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK5-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00"
+// CK5-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
+
+template <typename T> // called from main() below as tmain(argv), i.e. with T = char **
+int tmain(T argc) {
+  int a = 10; // value used by num_teams(a)
+  int b = 5; // value used by thread_limit(b)
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b) // checks below want a and b loaded and passed to __kmpc_push_num_teams
+  {
+  argc = 0; // argc is the variable forwarded to __kmpc_fork_teams
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) { // integer-argc variant of the region; also triggers the tmain instantiation
+  int a = 20; // value used by num_teams(a)
+  int b = 5; // value used by thread_limit(b)
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b) // checks below want __kmpc_push_num_teams before __kmpc_fork_teams
+  {
+  argc = 0; // int argc is the variable forwarded to __kmpc_fork_teams
+  }
+  return tmain(argv); // argv is char **, so tmain is instantiated with a pointer type
+}
+
+// CK5:  define {{.*}}void @{{[^,]+}}(i{{.+}} [[AP:%.+]], i{{.+}} [[BP:%.+]], i{{.+}} [[ARGC:.+]])
+// CK5:  [[AADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[BADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[ARGCADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[GBL_TH_NUM:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEF_LOC_0]])
+// CK5:  store i{{.+}} [[AP]], i{{.+}}* [[AADDR]]
+// CK5:  store i{{.+}} [[BP]], i{{.+}}* [[BADDR]]
+// CK5:  store i{{.+}} [[ARGC]], i{{.+}}* [[ARGCADDR]]
+// CK5-64:  [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK5-64:  [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK5-64:  [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK5-64:  [[ACONVVAL:%.+]] = load i32, i32* [[ACONV]]
+// CK5-64:  [[BCONVVAL:%.+]] = load i32, i32* [[BCONV]]
+// CK5-32:  [[ACONVVAL:%.+]] = load i32, i32* [[AADDR]]
+// CK5-32:  [[BCONVVAL:%.+]] = load i32, i32* [[BADDR]]
+// CK5:  {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[ACONVVAL]], i32 [[BCONVVAL]])
+// CK5-64:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[CONV]])
+// CK5-32:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[ARGCADDR]])
+
+// CK5:  define {{.*}}void @{{[^,]+}}(i{{.+}} [[AP:%.+]], i{{.+}} [[BP:%.+]], i{{.+}}** [[ARGC:%.+]])
+// CK5:  [[AADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[BADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[ARGCADDR:%.+]] = alloca i{{.+}}**
+// CK5:  [[GBL_TH_NUM:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEF_LOC_0]])
+// CK5:  store i{{.+}} [[AP]], i{{.+}}* [[AADDR]]
+// CK5:  store i{{.+}} [[BP]], i{{.+}}* [[BADDR]]
+// CK5:  store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
+// CK5-64:  [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK5-64:  [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK5-64:  [[ACONVVAL:%.+]] = load i32, i32* [[ACONV]]
+// CK5-64:  [[BCONVVAL:%.+]] = load i32, i32* [[BCONV]]
+// CK5-64:  {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[ACONVVAL]], i32 [[BCONVVAL]])
+// CK5-32:  [[A_VAL:%.+]] = load i32, i32* [[AADDR]]
+// CK5-32:  [[B_VAL:%.+]] = load i32, i32* [[BADDR]]
+// CK5-32:  {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[A_VAL]], i32 [[B_VAL]])
+// CK5:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i{{.+}})* @.omp_outlined.{{.+}} to void (i32*, i32*, ...)*), i{{.+}}*** [[ARGCADDR]])
+// CK5:  ret void
+// CK5-NEXT: }
+
+#endif // CK5
+#endif
diff --git a/test/OpenMP/teams_distribute_ast_print.cpp b/test/OpenMP/teams_distribute_ast_print.cpp
new file mode 100644
index 0000000..5dd71c9
--- /dev/null
+++ b/test/OpenMP/teams_distribute_ast_print.cpp
@@ -0,0 +1,179 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+struct S { // small value type; used below as the template argument of S7 (S8 derives from S7<S>)
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a; // the member that the loops in S7/S8 read and increment as a.a
+  typedef int type; // S7's converting constructor takes a 'typename T::type'
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams distribute private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams distribute private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+
+  void foo() {
+    int b, argv, d, c, e, f;
+#pragma omp target
+#pragma omp teams distribute default(none), private(b) firstprivate(argv) shared(d) reduction(+:c) reduction(max:e) num_teams(f) thread_limit(d)
+    for (int k = 0; k < a.a; ++k)
+      ++a.a;
+  }
+};
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(this->a) private(this->a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute default(none) private(b) firstprivate(argv) shared(d) reduction(+: c) reduction(max: e) num_teams(f) thread_limit(d)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams distribute private(a) private(this->a) private(S7<S>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams distribute private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+
+  void bar() {
+    int b, argv, d, c, e, f;
+#pragma omp target
+#pragma omp teams distribute default(none), private(b) firstprivate(argv) shared(d) reduction(+:c) reduction(max:e) num_teams(f) thread_limit(d)
+    for (int k = 0; k < a.a; ++k)
+      ++a.a;
+  }
+};
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(this->a) private(this->a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute default(none) private(b) firstprivate(argv) shared(d) reduction(+: c) reduction(max: e) num_teams(f) thread_limit(d)
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  static T a;
+// CHECK: static T a;
+#pragma omp target
+#pragma omp teams distribute
+  for (int i=0; i < 2; ++i)
+    a = 2;
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams distribute private(argc, b), firstprivate(c, d), collapse(2)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(argc,b) firstprivate(c,d) collapse(2)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: foo();
+  for (int i = 0; i < 10; ++i)
+    foo();
+// CHECK: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i)
+    foo();
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();  
+#pragma omp target
+#pragma omp teams distribute default(none), private(b) firstprivate(argc) shared(d) reduction(+:c) reduction(max:e) num_teams(f) thread_limit(d)
+    for (int k = 0; k < 10; ++k)
+      e += d + argc;
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute default(none) private(b) firstprivate(argc) shared(d) reduction(+: c) reduction(max: e) num_teams(f) thread_limit(d)
+// CHECK-NEXT: for (int k = 0; k < 10; ++k)
+// CHECK-NEXT: e += d + argc;
+  return T();
+}
+
+int main (int argc, char **argv) {
+  int b = argc, c, d, e, f, g;
+  static int a;
+// CHECK: static int a;
+#pragma omp target
+#pragma omp teams distribute
+  for (int i=0; i < 2; ++i)
+    a = 2;
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams distribute private(argc,b),firstprivate(argv, c), collapse(2)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute private(argc,b) firstprivate(argv,c) collapse(2)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: foo();
+  for (int i = 0; i < 10; ++i)
+    foo();
+// CHECK: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i)foo();
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();
+#pragma omp target
+#pragma omp teams distribute default(none), private(b) firstprivate(argc) shared(d) reduction(+:c) reduction(max:e) num_teams(f) thread_limit(d)
+  for (int k = 0; k < 10; ++k)
+    e += d + argc;
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams distribute default(none) private(b) firstprivate(argc) shared(d) reduction(+: c) reduction(max: e) num_teams(f) thread_limit(d)
+// CHECK-NEXT: for (int k = 0; k < 10; ++k)
+// CHECK-NEXT: e += d + argc;
+  return (0);
+}
+
+#endif
diff --git a/test/OpenMP/teams_distribute_collapse_messages.cpp b/test/OpenMP/teams_distribute_collapse_messages.cpp
new file mode 100644
index 0000000..e21eab2
--- /dev/null
+++ b/test/OpenMP/teams_distribute_collapse_messages.cpp
@@ -0,0 +1,150 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) { // non-constexpr helper: calls to it below are diagnosed as not being integral constant expressions
+  return argc; // implicit int -> bool conversion
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target
+#pragma omp teams distribute collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams distribute collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams distribute collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+4 {{expected ')'}} expected-note@+4 {{to match this '('}}
+// expected-error@+3 2 {{expression is not an integral constant expression}}
+// expected-note@+2 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target
+#pragma omp teams distribute collapse (argc 
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams distribute collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams distribute collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams distribute collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp teams distribute', but found only 1}}
+
+// expected-error@+4 2 {{directive '#pragma omp teams distribute' cannot contain more than one 'collapse' clause}}
+// expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+#pragma omp target
+#pragma omp teams distribute collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp distribute collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+#pragma omp target
+#pragma omp teams distribute collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams distribute collapse (1)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams distribute collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams distribute collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp teams distribute'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams distribute collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams distribute collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams distribute collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams distribute collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp teams distribute', but found only 1}}
+
+#pragma omp target
+#pragma omp teams distribute collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp teams distribute', but found only 1}}
+
+#pragma omp target
+#pragma omp teams distribute collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+// expected-error@+4 {{expression is not an integral constant expression}}
+// expected-error@+3 2 {{directive '#pragma omp teams distribute' cannot contain more than one 'collapse' clause}}
+// expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams distribute collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams distribute collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+// expected-error@+2 {{expression is not an integral constant expression}}
+#pragma omp target
+#pragma omp teams distribute collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+// expected-error@+4 {{statement after '#pragma omp teams distribute' must be a for loop}}
+// expected-note@+2 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams distribute collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+
+#pragma omp target
+#pragma omp teams distribute collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp teams distribute'}}
+
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/teams_distribute_default_messages.cpp b/test/OpenMP/teams_distribute_default_messages.cpp
new file mode 100644
index 0000000..bf62930
--- /dev/null
+++ b/test/OpenMP/teams_distribute_default_messages.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  #pragma omp target
+  #pragma omp teams distribute default // expected-error {{expected '(' after 'default'}}
+  for (int i=0; i<200; i++) foo();
+  #pragma omp target
+  #pragma omp teams distribute default ( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i=0; i<200; i++) foo();
+  #pragma omp target
+  #pragma omp teams distribute default () // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (int i=0; i<200; i++) foo();
+  #pragma omp target
+  #pragma omp teams distribute default (none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i=0; i<200; i++) foo();
+  #pragma omp target
+  #pragma omp teams distribute default (shared), default(shared) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'default' clause}}
+  for (int i=0; i<200; i++) foo();
+  #pragma omp target
+  #pragma omp teams distribute default (x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (int i=0; i<200; i++) foo();
+
+  #pragma omp target
+  #pragma omp teams distribute default(none)
+  for (int i=0; i<200; i++) ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  return 0;
+}
diff --git a/test/OpenMP/teams_distribute_dist_schedule_messages.cpp b/test/OpenMP/teams_distribute_dist_schedule_messages.cpp
new file mode 100644
index 0000000..d86722d
--- /dev/null
+++ b/test/OpenMP/teams_distribute_dist_schedule_messages.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams distribute dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/teams_distribute_firstprivate_messages.cpp b/test/OpenMP/teams_distribute_firstprivate_messages.cpp
new file mode 100644
index 0000000..a710807
--- /dev/null
+++ b/test/OpenMP/teams_distribute_firstprivate_messages.cpp
@@ -0,0 +1,152 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note{{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+  
+public:
+  S3() : a(0) {} // expected-note {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
+  S3(S3 &s3) : a(s3.a) {} // expected-note {{candidate constructor not viable: 1st argument ('const S3') would lose const qualifier}}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+class S6 {
+  int a;
+public:
+  S6() : a(0) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 p;
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams distribute firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (argc)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate (argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(ba)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(da)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute private(i), firstprivate(i) // expected-error {{private variable cannot be firstprivate}} expected-note{{defined as private}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(i)
+  for (j = 0; j < argc; ++j) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) foo(); // expected-error {{loop iteration variable in the associated loop of 'omp teams distribute' directive may not be firstprivate, predetermined as private}}
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(j)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc), firstprivate(argc) // OK
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/teams_distribute_lastprivate_messages.cpp b/test/OpenMP/teams_distribute_lastprivate_messages.cpp
new file mode 100644
index 0000000..b892141
--- /dev/null
+++ b/test/OpenMP/teams_distribute_lastprivate_messages.cpp
@@ -0,0 +1,272 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator =(const S2&) const;
+  S2 &operator =(const S2&);
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams distribute lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+#pragma omp target
+#pragma omp teams distribute lastprivate(i)
+  for (int k = 0; k < argc; ++k) {
+    i = k;
+    v += i;
+  }
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(j) private(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp target
+#pragma omp teams distribute lastprivate(a)
+  for (int i = 0; i < 2; ++i) foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams distribute lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argc)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(ba)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+  int xa;
+#pragma omp target
+#pragma omp teams distribute lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(xa)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(j)
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i) foo();
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i) foo();
+
+  static int si;
+#pragma omp target
+#pragma omp teams distribute  lastprivate(si) // OK
+  for (i = 0; i < argc; ++i) si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/teams_distribute_loop_messages.cpp b/test/OpenMP/teams_distribute_loop_messages.cpp
new file mode 100644
index 0000000..dbfd9ef
--- /dev/null
+++ b/test/OpenMP/teams_distribute_loop_messages.cpp
@@ -0,0 +1,716 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S {
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams distribute
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{variable must be of integer or random access iterator type}}
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{variable must be of integer or random access iterator type}}
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+2 {{expression result unused}}
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// Ok to skip parentheses.
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams distribute
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok, but undefined behavior (in general, we cannot check that the increment
+// is really loop-invariant).
+#pragma omp target
+#pragma omp teams distribute
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{expression must have integral or unscoped enumeration type, not 'float'}}
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target
+#pragma omp teams distribute
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+2 {{relational comparison result unused}}
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+2 {{expression result unused}}
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute firstprivate(ii) // expected-note  {{defined as firstprivate}}
+// expected-error@+1 {{loop iteration variable in the associated loop of 'omp teams distribute' directive may not be firstprivate, predetermined as private}}
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute private(ii)
+// OK
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute lastprivate(ii)
+// OK
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{loop iteration variable in the associated loop of 'omp teams distribute' directive may not be threadprivate or thread local, predetermined as private}}
+  for (sii = 0; sii < 10; sii++)
+    c[sii] = a[sii];
+
+  {
+#pragma omp target
+#pragma omp teams distribute collapse(2)
+  for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{statement after '#pragma omp teams distribute' must be a for loop}}
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target
+#pragma omp teams distribute
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in OpenMP for-loops.
+namespace std {
+struct random_access_iterator_tag {};
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {}
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter {
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; }
+  GoodIter &operator+=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target
+#pragma omp teams distribute
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+2 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+1 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams distribute
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+1 {{invalid operands to binary expression ('Iter0' and 'int')}}
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// Initializer is constructor without params.
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+2 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+1 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams distribute
+// expected-error@+4 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) {
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be positive due to this condition}}
+// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target
+#pragma omp teams distribute
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() {
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0>
+int dotest_gt(IT begin, IT end) {
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+#pragma omp target
+#pragma omp teams distribute
+// expected-note@+2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+1 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp target
+#pragma omp teams distribute
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try { // OK
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i]; // OK
+      }
+      throw a[i]; // OK
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i]; // OK
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i]; // OK
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp target
+#pragma omp teams distribute
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() {
+  S s(4);
+#pragma omp target
+#pragma omp teams distribute lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
+void test_ordered() {
+#pragma omp target
+#pragma omp teams distribute ordered // expected-error {{unexpected OpenMP clause 'ordered' in directive '#pragma omp teams distribute'}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
+void test_nowait() {
+#pragma omp target
+// expected-error@+1 2 {{unexpected OpenMP clause 'nowait' in directive '#pragma omp teams distribute'}}
+#pragma omp teams distribute nowait nowait // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'nowait' clause}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/teams_distribute_num_teams_messages.cpp b/test/OpenMP/teams_distribute_num_teams_messages.cpp
new file mode 100644
index 0000000..6086abd
--- /dev/null
+++ b/test/OpenMP/teams_distribute_num_teams_messages.cpp
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+
+void foo() { // Empty callee; used as the body of every test loop in this file.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; not referenced by any directive in this file.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+
+template <typename T, int C> // expected-note {{declared here}}
+T tmain(T argc) { // Template variant of the num_teams clause checks; instantiated from main as tmain<int, 10>.
+  char **a; // Pointer operand used below to provoke the non-integral ('char *') argument error.
+#pragma omp target
+#pragma omp teams distribute num_teams(C)
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(T) // expected-error {{'T' does not refer to a value}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams // expected-error {{expected '(' after 'num_teams'}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams() // expected-error {{expected expression}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(argc > 0 ? a[1] : a[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(argc + argc)
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(argc), num_teams (argc+1) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'num_teams' clause}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(-2) // expected-error {{argument to 'num_teams' clause must be a strictly positive integer value}}
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(-10u)
+  for (int i=0; i<100; i++) foo();
+#pragma omp target
+#pragma omp teams distribute num_teams(3.14) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (int i=0; i<100; i++) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) { // Non-template num_teams clause checks, then instantiates tmain<int, 10>.
+#pragma omp target
+#pragma omp teams distribute num_teams // expected-error {{expected '(' after 'num_teams'}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams () // expected-error {{expected expression}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (argc + argc)
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (argc), num_teams (argc+1) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'num_teams' clause}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (-2) // expected-error {{argument to 'num_teams' clause must be a strictly positive integer value}}
+  for (int i=0; i<100; i++) foo();
+
+#pragma omp target
+#pragma omp teams distribute num_teams (-10u)
+  for (int i=0; i<100; i++) foo();
+  // Floating-point argument: one error is annotated here, while the same clause inside tmain is annotated with two.
+#pragma omp target
+#pragma omp teams distribute num_teams (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (int i=0; i<100; i++) foo();
+
+  return tmain<int, 10>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 10>' requested here}}
+}
diff --git a/test/OpenMP/teams_distribute_private_messages.cpp b/test/OpenMP/teams_distribute_private_messages.cpp
new file mode 100644
index 0000000..aff1f53
--- /dev/null
+++ b/test/OpenMP/teams_distribute_private_messages.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty callee; called from the nested loop near the end of main.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; not referenced by any directive in this file.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // Class with a public default constructor and a static data member used as a private() list item.
+  mutable int a; // NOTE(review): const S2 objects (b, ba) carry no 'predetermined as shared' note, unlike const S3 (c, ca) - presumably due to this mutable member.
+public:
+  S2():a(0) { }
+  static float S2s; // expected-note {{predetermined as shared}}
+};
+const S2 b;
+const S2 ba[5];
+class S3 { // Plain class with a public default constructor; used for const globals c/ca and the threadprivate global h.
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 c; // expected-note {{predetermined as shared}}
+const S3 ca[5]; // expected-note {{predetermined as shared}}
+extern const int f;  // expected-note {{predetermined as shared}}
+class S4 { // Default constructor is private and only declared; private(e) in main reports a call to it.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // Default constructor is private but defined; private(g) in main reports a call to it.
+  int a;
+  S5():a(0) {} // expected-note {{implicitly declared private here}}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+
+int main(int argc, char **argv) { // Exercises parsing and semantic checks of the 'private' clause on 'teams distribute'.
+  const int d = 5;  // expected-note {{predetermined as shared}}
+  const int da[5] = { 0 }; // expected-note {{predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i; // Reference variable; private(j) further down carries no diagnostic annotation.
+
+  #pragma omp target
+  #pragma omp teams distribute private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (a, b, c, d, f) // expected-error {{private variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be private}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(ba)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(ca) // expected-error {{shared variable cannot be private}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(da) // expected-error {{shared variable cannot be private}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(S2::S2s) // expected-error {{shared variable cannot be private}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute shared(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute firstprivate(i), private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute private(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp target
+  #pragma omp teams distribute reduction(+:i)
+  for (int k = 0; k < argc; ++k) ++k;
+  // 'i' is private on an enclosing 'distribute' and again on the nested 'teams distribute'; no diagnostic is annotated.
+  #pragma omp distribute private(i)
+  for (int k = 0; k < 10; ++k) {
+    #pragma omp target
+    #pragma omp teams distribute private(i)
+    for (int x = 0; x < 10; ++x) foo();
+  }
+
+  #pragma omp target
+  #pragma omp teams distribute firstprivate(i)
+  for (int k = 0; k < 10; ++k) {
+  }
+
+  return 0;
+}
diff --git a/test/OpenMP/teams_distribute_reduction_messages.cpp b/test/OpenMP/teams_distribute_reduction_messages.cpp
new file mode 100644
index 0000000..63023c2
--- /dev/null
+++ b/test/OpenMP/teams_distribute_reduction_messages.cpp
@@ -0,0 +1,303 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty callee; used as the body of every test loop in this file.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; not referenced by any directive in this file.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 { // operator+ sits before 'public:' and is therefore private; reduction(+ : b) reports it as inaccessible.
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {} // Copy constructor takes a non-const reference.
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 { // Copyable class with a public member 'b' and a public member operator+; type of globals c, ca, h and k.
+  int a;
+
+public:
+  int b; // Referenced as 'h.b' in reduction clauses to provoke the "variable name" error.
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; } // Free operator+ on S3 in addition to the member one; always returns 5.
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 { // Default ctor, copy ctor and operator+ are all private; only S4(int) is public.
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; } // Free compound-assignment operator for S4; returns its first argument unchanged.
+class S5 { // Default ctor, copy ctor and operator+ are all private; only S5(int) is public.
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg); // Declared only; no definition appears in this file.
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; } // Converts to int but has no assignment from int; reduction(+ : o) fails on '='.
+} o; // Global S6 object used as a reduction list item in tmain and main.
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) { // Template variant of the reduction clause checks; main instantiates it as tmain<int> and tmain<float>.
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                    // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                   // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];     // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];           // expected-note 2 {{'q' defined here}}
+  T fl; // Has type T here, unlike the 'float fl' in main.
+#pragma omp target
+#pragma omp teams distribute reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(&& : argc)
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams distribute reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams distribute reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : fl)
+    for (int j=0; j<100; j++) foo();
+
+  return T();
+}
+
+namespace A { // Declares a namespace-scope variable and marks it threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x; main uses B::x as a reduction list item.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Non-template reduction clause checks; ends by instantiating tmain<int> and tmain<float>.
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                  // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                   // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];        // expected-note {{'r' defined here}}
+  int &q = qa[i];              // expected-note {{'q' defined here}}
+  float fl; // Passed to tmain at the end, which yields the tmain<float> instantiation.
+#pragma omp target
+#pragma omp teams distribute reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(&& : argc)
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams distribute reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams distribute reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute reduction(+ : fl)
+    for (int j=0; j<100; j++) foo();
+  static int m; // Static local used as the reduction item in the next directive, which is marked OK.
+#pragma omp target
+#pragma omp teams distribute reduction(+ : m) // OK
+  for (int j=0; j<100; j++) foo();
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/teams_distribute_shared_messages.cpp b/test/OpenMP/teams_distribute_shared_messages.cpp
new file mode 100644
index 0000000..b9e096c
--- /dev/null
+++ b/test/OpenMP/teams_distribute_shared_messages.cpp
@@ -0,0 +1,133 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() { // Empty callee; used as the body of every test loop in this file.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; not referenced by any directive in this file.
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+extern S1 a;
+class S2 { // Class with a mutable member and a copy constructor taking a non-const reference; type of const globals b and ba.
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+};
+const S2 b;
+const S2 ba[5];
+class S3 { // Class with a copy constructor taking a non-const reference; type of const globals c, ca and threadprivate h.
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 { // Default and copy constructors are private; shared(e, g) in main still carries no diagnostic annotation.
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 { // Default and copy constructors are private but defined; only S5(int) is public.
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+namespace A { // Declares a namespace-scope variable and marks it threadprivate.
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B { // Makes A::x nameable as B::x; main uses B::x as a shared() list item.
+using A::x;
+}
+
+int main(int argc, char **argv) { // Exercises parsing and semantic checks of the 'shared' clause on 'teams distribute'.
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i; // Reference variable; shared(j) below carries no diagnostic annotation.
+  #pragma omp target
+  #pragma omp teams distribute shared // expected-error {{expected '(' after 'shared'}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared () // expected-error {{expected expression}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (argc)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (a, b, c, d, f)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared (argv[1]) // expected-error {{expected variable name}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(ba)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(ca)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(da)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(e, g)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute private(i)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(i)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(j)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute firstprivate(i)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(i)
+  for (int j=0; j<100; j++) foo();
+  #pragma omp target
+  #pragma omp teams distribute shared(j)
+  for (int j=0; j<100; j++) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/teams_distribute_thread_limit_messages.cpp b/test/OpenMP/teams_distribute_thread_limit_messages.cpp
new file mode 100644
index 0000000..ec4ca7a
--- /dev/null
+++ b/test/OpenMP/teams_distribute_thread_limit_messages.cpp
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() { // Empty helper; every loop body in this test is a single call to it.
+}
+
+bool foobool(int argc) { // Returns argc converted to bool; nothing else in this file calls it.
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+
+template <typename T, int C> // expected-note {{declared here}}
+T tmain(T argc) { // Template form of the test: thread_limit clause checks where T and C are template parameters.
+  char **a; // Pointer operand for the non-integral conditional case below.
+#pragma omp target
+#pragma omp teams distribute thread_limit(C) // No diagnostic annotated: C is the int non-type template parameter.
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(T) // expected-error {{'T' does not refer to a value}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit // expected-error {{expected '(' after 'thread_limit'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit() // expected-error {{expected expression}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(argc > 0 ? a[1] : a[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(argc + argc) // No diagnostic annotated for this form.
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(argc), thread_limit (argc+1) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'thread_limit' clause}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(-2) // expected-error {{argument to 'thread_limit' clause must be a strictly positive integer value}}
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(-10u) // No diagnostic annotated: the 'u' suffix makes the operand unsigned, so it is not negative.
+  for (int j=0; j<100; j++) foo();
+#pragma omp target
+#pragma omp teams distribute thread_limit(3.14) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (int j=0; j<100; j++) foo();
+  // NOTE(review): the 'double' diagnostic is annotated twice here but once in main(); presumably template definition plus instantiation - confirm.
+  return 0;
+}
+
+int main(int argc, char **argv) { // Non-template form of the thread_limit clause checks; finishes by instantiating tmain<int, 10>.
+#pragma omp target
+#pragma omp teams distribute thread_limit // expected-error {{expected '(' after 'thread_limit'}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit () // expected-error {{expected expression}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams distribute' are ignored}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (argc + argc) // No diagnostic annotated for this integral expression.
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (argc), thread_limit (argc+1) // expected-error {{directive '#pragma omp teams distribute' cannot contain more than one 'thread_limit' clause}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (-2) // expected-error {{argument to 'thread_limit' clause must be a strictly positive integer value}}
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (-10u) // No diagnostic annotated: the 'u' suffix makes the operand unsigned, so it is not negative.
+  for (int j=0; j<100; j++) foo();
+
+#pragma omp target
+#pragma omp teams distribute thread_limit (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (int j=0; j<100; j++) foo();
+
+  return tmain<int, 10>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 10>' requested here}}
+}
diff --git a/test/OpenMP/teams_firstprivate_codegen.cpp b/test/OpenMP/teams_firstprivate_codegen.cpp
new file mode 100644
index 0000000..3248bfe
--- /dev/null
+++ b/test/OpenMP/teams_firstprivate_codegen.cpp
@@ -0,0 +1,283 @@
+// Test host codegen.
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -DARRAY  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-64
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-64
+// RUN: %clang_cc1 -DARRAY  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-32
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+#ifndef ARRAY
+struct St { // Helper type; a temporary St is the defaulted second argument of S's copy constructor.
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {} // Copy sets a to the source's a + b and resets b to 0.
+  ~St() {}
+};
+
+volatile int g __attribute__((aligned(128))) = 1212; // 128-byte-aligned volatile global; read by S's constructors and listed in the firstprivate clause of the lambda build of main().
+
+template <class T>
+struct S { // Class template with user-provided copy constructor and destructor; used as a firstprivate operand type.
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {} // Copy constructor taking a defaulted St temporary.
+  operator T() { return T(); } // Always yields a value-initialized T, regardless of f.
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+
+template <typename T>
+T tmain() { // Template form of the firstprivate regions; the clause operands are all declared with 128-byte alignment.
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] __attribute__((aligned(128))) = {1, 2};
+  S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
+  S<T> var __attribute__((aligned(128))) (3);
+  #pragma omp target
+  #pragma omp teams firstprivate(t_var, vec, s_arr, var) // Scalar, array, class-array and class-object operands in one clause.
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+#pragma omp target
+#pragma omp teams firstprivate(t_var) // Second region: a single scalar operand and an empty body.
+  {}
+  return T();
+}
+
+int main() { // Lambda build: firstprivate of g and sivar inside nested lambdas. Otherwise: two firstprivate regions, then tmain<int>().
+  static int sivar; // Static local that appears in the firstprivate clauses below.
+#ifdef LAMBDA
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  [&]() { // Outer lambda: contains the target/teams region.
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 2, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* {{.+}}, {{.+}})    
+  #pragma omp target
+  #pragma omp teams firstprivate(g, sivar)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) [[G_IN:%.+]], i{{64|32}} {{.*}}[[SIVAR_IN:%.+]])
+    // LAMBDA: store i{{[0-9]+}}* [[G_IN]], i{{[0-9]+}}** [[G_ADDR:%.+]],
+    // LAMBDA: store i{{[0-9]+}} [[SIVAR_IN]], i{{[0-9]+}}* [[SIVAR_ADDR:%.+]],
+    // LAMBDA: [[G_ADDR_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_ADDR]],
+    // LAMBDA-64: [[SIVAR_CONV:%.+]] = bitcast i64*  [[SIVAR_ADDR]] to i32*
+    // LAMBDA: [[G_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[G_ADDR_VAL]],
+    // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_LOCAL:%.+]],
+    g = 1;
+    sivar = 2;
+    // LAMBDA: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_LOCAL]],
+    // LAMBDA-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_CONV]],
+    // LAMBDA-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_LOCAL]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: [[SIVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+    // LAMBDA-64: store i{{[0-9]+}}* [[SIVAR_CONV]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA-32: store i{{[0-9]+}}* [[SIVAR_ADDR]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    [&]() { // Inner lambda: assigns g and sivar again from inside the teams region.
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      sivar = 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else // Non-lambda build: locals here carry no alignment attribute, unlike the ones in tmain().
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+  #pragma omp target
+  #pragma omp teams firstprivate(t_var, vec, s_arr, var, sivar) // Five operands, including the static local sivar.
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 2;
+  }
+  #pragma omp target
+  #pragma omp teams firstprivate(t_var) // Second region: a single scalar operand and an empty body.
+  {}
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING:@.+]](
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i{{32|64}}, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[OMP_OUTLINED:@.+]] to void
+// CHECK: ret
+//
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i{{32|64}} {{.*}}%{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, i{{32|64}} {{.*}}[[SIVAR:%.+]])
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
+// CHECK-64: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_PRIV]] to i32*
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** %
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// CHECK-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_CONV]],
+// CHECK-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
+
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING_1:@.+]](
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}})* [[OMP_OUTLINED_1:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED_1]](i{{[0-9]+}}* noalias {{%.+}}, i{{[0-9]+}}* noalias {{%.+}}, i{{32|64}} {{.*}}[[T_VAR:%.+]])
+// CHECK: [[T_VAR_LOC:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}} [[T_VAR]], i{{[0-9]+}}* [[T_VAR_LOC]],
+// CHECK: ret
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING_2:@.+]](i{{[0-9]+}}* {{.+}} {{%.+}}, [2 x i32]* {{.+}} {{%.+}}, [2 x [[S_INT_TY]]]* {{.+}} {{%.+}}, [[S_INT_TY]]* {{.+}} {{%.+}})
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[OMP_OUTLINED_2:@.+]] to void
+// CHECK: ret
+
+//
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED_2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], align 128
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
+
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]], align 128
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], align 128
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}} 128,
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING_3:@.+]](
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_3:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED_3]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) [[T_VAR:%.+]])
+// CHECK: [[T_VAR_LOC:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[T_VAR]], i{{[0-9]+}}** [[T_VAR_ADDR:%.+]],
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_LOC]],
+// CHECK: ret
+
+#else
+struct St { // St for the array-test build: St_func has a teams region with array parameters and a local VLA as firstprivate operands.
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) { } // Copy constructor with an empty body; it does not assign a or b.
+  ~St() {}
+  void St_func(St s[2], int n, long double vla1[n]) {
+    double vla2[n][n] __attribute__((aligned(128))); // 128-byte-aligned two-dimensional VLA sized by n.
+    a = b;
+    #pragma omp target
+    #pragma omp teams firstprivate(s, vla1, vla2)
+    vla1[b] = vla2[1][n - 1] = a = b;
+  }
+};
+
+void array_func(float a[3], St s[2], int n, long double vla1[n]) { // Free function whose teams region lists array parameters a, s, vla1 and local VLA vla2 as firstprivate.
+  double vla2[n][n] __attribute__((aligned(128))); // 128-byte-aligned two-dimensional VLA sized by n.
+// ARRAY: call {{.+}} @__kmpc_fork_teams(
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
+// ARRAY-64-DAG: [[PRIV_VLA1:%.+]] = alloca ppc_fp128*,
+// ARRAY-32-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_A:%.+]] = alloca float*,
+// ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
+// ARRAY-DAG: store float* %{{.+}}, float** [[PRIV_A]],
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-64-DAG: store ppc_fp128* %{{.+}}, ppc_fp128** [[PRIV_VLA1]],
+// ARRAY-32-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
+// ARRAY: call i8* @llvm.stacksave()
+// ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+  #pragma omp target
+  #pragma omp teams firstprivate(a, s, vla1, vla2)
+  s[0].St_func(s, n, vla1); // Region body: calls St_func on the first element of s.
+  ; // Stray empty statement; it has no effect.
+}
+
+// ARRAY: @__kmpc_fork_teams(
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
+// ARRAY-64-DAG: [[PRIV_VLA1:%.+]] = alloca ppc_fp128*,
+// ARRAY-32-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-64-DAG: store ppc_fp128* %{{.+}}, ppc_fp128** [[PRIV_VLA1]],
+// ARRAY-32-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
+// ARRAY: call i8* @llvm.stacksave()
+// ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+#endif
+#endif
diff --git a/test/OpenMP/teams_private_codegen.cpp b/test/OpenMP/teams_private_codegen.cpp
new file mode 100644
index 0000000..1ba010f
--- /dev/null
+++ b/test/OpenMP/teams_private_codegen.cpp
@@ -0,0 +1,298 @@
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+template <class T>
+struct S { // Class template with user-provided constructors and destructor; used as a private-clause operand type.
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); } // Always yields a value-initialized T, regardless of f.
+  ~S() {}
+};
+
+volatile int g __attribute__((aligned(128))) = 1212; // 128-byte-aligned volatile global; listed in the private clause of the lambda build of main().
+
+struct SS { // The constructor lists an int member, a bit-field member and a reference member in a teams private clause.
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp target
+#pragma omp teams private(a, b, c)
+#ifdef LAMBDA
+    [&]() { // Lambda build: the members are modified from inside a by-reference lambda.
+      ++this->a, --b, (this)->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template<typename T>
+struct SST { // Template counterpart of SS: the constructor lists member a in a teams private clause.
+  T a;
+  SST() : a(T()) {
+#pragma omp target
+#pragma omp teams private(a)
+#ifdef LAMBDA
+    [&]() { // Lambda build: two nested by-reference lambdas; the innermost increments a.
+      [&]() {
+        ++this->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[CAP_0_TY:%.+]] = type { [[SS_TY]]*, i{{[0-9]+}}*,
+// LAMBDA: [[CAP_1_TY:%.+]] = type { i{{[0-9]+}}*, i{{[0-9]+}}* }
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[SST_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() { // Template form of the private-clause region; the clause operands are all declared with 128-byte alignment.
+  S<T> test;
+  SST<T> sst; // Constructing sst runs the teams region in SST's constructor.
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] __attribute__((aligned(128))) = {1, 2};
+  S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
+  S<T> var __attribute__((aligned(128))) (3);
+#pragma omp target
+#pragma omp teams private(t_var, vec, s_arr, var) // Scalar, array, class-array and class-object operands in one clause.
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() { // Lambda build: private(g, sivar) inside nested lambdas. Otherwise: one private-clause region, then tmain<int>().
+  static int sivar; // Static local; bound to SS::c below and listed in the private clauses.
+  SS ss(sivar); // Constructing ss runs the teams region in SS's constructor.
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA: define {{.+}} @main()
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+
+  // LAMBDA: call{{.*}} [[ST_CONSTR_INIT:@.+]]([[SS_TY]]*
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
+
+  // lambda and target region in main
+  // LAMBDA: define {{.+}} [[OUTER_LAMBDA]]([[CAP_TY]]* {{.+}})
+  // LAMBDA: call void @[[OMP_OFFLOADING:.+]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}} {{.+}}
+
+  // target region in struct constructor
+  // LAMBDA: define{{.*}} void [[ST_CONSTR:@.+]]([[SS_TY]]* %this,
+  // LAMBDA: call void [[OMP_OFFLOADING_1:@.+]]([[SS_TY]]
+
+  // offloading function in struct constructor
+  // LAMBDA: define{{.*}} void [[OMP_OFFLOADING_1]]([[SS_TY]]
+  // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[OMP_OUTLINED:@.+]] to void
+
+  // outlined teams region in struct constructor
+  // LAMBDA: define{{.*}} void [[OMP_OUTLINED]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* {{.+}}, [[SS_TY]]*
+  // LAMBDA: [[THIS_ADDR:%.+]] = alloca [[SS_TY]]*,
+  // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+  // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+  // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+  // LAMBDA: [[THIS_REF:%.+]] = load [[SS_TY]]*, [[SS_TY]]** [[THIS_ADDR]],
+  // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[A_TMP_REF:%.+]],
+  // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[C_TMP_REF:%.+]],
+  // LAMBDA: [[CAP_THIS_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // LAMBDA: store [[SS_TY]]* [[THIS_REF]], [[SS_TY]]** [[CAP_THIS_REF]],
+  // LAMBDA: [[CAP_A_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // LAMBDA: [[A_TMP_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[A_TMP_REF]],
+  // LAMBDA: store i{{[0-9]+}}* [[A_TMP_VAL]], i{{[0-9]+}}** [[CAP_A_REF]],
+  // LAMBDA: [[CAP_B_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // LAMBDA: store i{{[0-9]+}}* [[B_PRIV]], i{{[0-9]+}}** [[CAP_B_REF]],
+  // LAMBDA: [[CAP_C_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // LAMBDA: [[C_TMP_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_TMP_REF]],
+  // LAMBDA: store i{{[0-9]+}}* [[C_TMP_VAL]], i{{[0-9]+}}** [[CAP_C_REF]],
+  // call void [[INNER_LAMBDA_CONSTR:@.+]]([[CAP_0_TY]]*
+  // NOTE(review): the 'call void' line above, the 'define' line below and the bare 'ret' line further down carry no check prefix, so FileCheck ignores them - confirm whether that is intentional.
+  // inner lambda in struct constructor
+  // define{{.*}} void [[INNER_LAMBDA_CONSTR]]([[CAP_0_TY]]*
+  // LAMBDA: [[CAP_A_REF_1:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // LAMBDA: [[A_REF_FROM_CAP:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[CAP_A_REF_1]],
+  // LAMBDA: [[A_VAL_FROM_CAP:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_REF_FROM_CAP]],
+  // LAMBDA: [[A_INC_VAL:%.+]] = add {{.+}} i{{[0-9]+}} [[A_VAL_FROM_CAP]], 1
+  // LAMBDA: store i{{[0-9]+}} [[A_INC_VAL]], i{{[0-9]+}}* [[A_REF_FROM_CAP]],
+
+  // LAMBDA: [[CAP_B_REF_1:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // LAMBDA: [[B_REF_FROM_CAP:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[CAP_B_REF_1]],
+  // LAMBDA: [[B_VAL_FROM_CAP:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_REF_FROM_CAP]],
+  // LAMBDA: [[B_DEC_VAL:%.+]] = add {{.+}} i{{[0-9]+}} [[B_VAL_FROM_CAP]], -1
+  // LAMBDA: store i{{[0-9]+}} [[B_DEC_VAL]], i{{[0-9]+}}* [[B_REF_FROM_CAP]],
+
+  // LAMBDA: [[CAP_C_REF_1:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // LAMBDA: [[C_REF_FROM_CAP:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[CAP_C_REF_1]],
+  // LAMBDA: [[C_VAL_FROM_CAP:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_REF_FROM_CAP]],
+  // LAMBDA: [[C_DEC_VAL:%.+]] = sdiv{{.*}} i{{[0-9]+}} [[C_VAL_FROM_CAP]], 1
+  // LAMBDA: store i{{[0-9]+}} [[C_DEC_VAL]], i{{[0-9]+}}* [[C_REF_FROM_CAP]],
+  // ret
+    
+  [&]() { // Outer lambda: contains the target/teams region.
+#pragma omp target
+#pragma omp teams private(g, sivar)
+  {
+    // LAMBDA: define{{.+}} @[[OMP_OFFLOADING]](i{{[0-9]+}}* {{.+}} [[G_IN:%.+]], i{{[0-9]+}} [[SIVAR_IN:%.+]]
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_1:@.+]] to void
+    
+    // LAMBDA: define {{.+}} [[OMP_OUTLINED_1]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* {{.+}}
+    // LAMBDA: [[G_LOC_OUTER:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_LOC_OUTER:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_LOC_OUTER]]
+    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_LOC_OUTER]]
+    // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@[^(]+]]([[CAP_1_TY]]*
+    // LAMBDA: ret
+    g = 1;
+    sivar = 2;
+    [&]() { // Inner lambda: assigns g and sivar again from inside the teams region.
+      // LAMBDA: define {{.+}} [[INNER_LAMBDA]]([[CAP_1_TY]]* {{.+}})
+      g = 2;
+      sivar = 4;
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}*
+    }();
+  }
+  }();
+  return 0;
+#else // Non-lambda build: locals here carry no alignment attribute, unlike the ones in tmain().
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp target
+#pragma omp teams private(t_var, vec, s_arr, var, sivar) // Five operands, including the static local sivar.
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 3;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: [[OFF_IN:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* {{%.+}},
+// CHECK: call void @[[OMP_OFFLOADING:.+]](i{{[0-9]+}} [[OFF_IN]]
+// CHECK: = call{{.*}} i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+
+// target region in main function
+// CHECK: define{{.+}} @[[OMP_OFFLOADING]](i{{[0-9]+}}
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED:@.+]] to void
+
+// CHECK: define internal void [[OMP_OUTLINED]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: ret void
+
+// template tmain
+// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, i{{[0-9]+}}{{.*}} 3)
+// CHECK: call void [[OMP_OFFLOADING_TMAIN:@.+]]()
+
+// target in SS constructor
+// CHECK: define{{.+}} [[OMP_OFFLOADING_SS:@.+]]([[SS_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[OMP_OUTLINED_SS:@.+]] to void
+
+// CHECK: define{{.*}} void [[OMP_OUTLINED_SS]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* {{.+}}, [[SS_TY]]*
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[A_REF:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[C_REF:%.+]],
+// CHECK: [[A_REF_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[A_REF]]
+// CHECK: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_REF_VAL]]
+// CHECK: [[A_INC:%.+]] = add{{.*}} i{{[0-9]+}} [[A_VAL]], 1
+// CHECK: store i{{[0-9]+}} [[A_INC]], i{{[0-9]+}}* [[A_REF_VAL]],
+// CHECK: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]]
+// CHECK: [[B_DEC:%.+]] = add{{.*}} i{{[0-9]+}} [[B_VAL]], -1
+// CHECK: store i{{[0-9]+}} [[B_DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK: [[C_REF_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_REF]]
+// CHECK: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_REF_VAL]]
+// CHECK: [[C_DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK: store i{{[0-9]+}} [[C_DIV]], i{{[0-9]+}}* [[C_REF_VAL]],
+// CHECK: ret
+
+// target in tmain template
+// CHECK: define{{.+}} [[OMP_OFFLOADING_TMAIN]]()
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_TMAIN:@.+]] to void
+
+// CHECK: define{{.*}} void [[OMP_OUTLINED_TMAIN]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], align 128
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret
+
+// SST constructor
+// CHECK: define{{.+}} [[SST_CONST:@.+]]([[SST_TY]]* {{.+}})
+// CHECK: call void [[OMP_OFFLOADING_SST:@.+]]([[SST_TY]]* {{.+}})
+
+// target in SST constructor
+// CHECK: define{{.+}} [[OMP_OFFLOADING_SST]]([[SST_TY]]* {{.+}})
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SST_TY]]*)* [[OMP_OUTLINED_SST:@.+]] to void
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_SST]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* noalias %{{.+}}, [[SST_TY]]* {{.+}})
+// CHECK: [[A_PRIV_1:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV_1]], i{{[0-9]+}}** [[A_REF_1:%.+]],
+// CHECK: [[A_REF_VAL_1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[A_REF_1]]
+// CHECK: [[A_VAL_1:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_REF_VAL_1]]
+// CHECK: [[A_INC_1:%.+]] = add{{.*}} i{{[0-9]+}} [[A_VAL_1]], 1
+// CHECK: store i{{[0-9]+}} [[A_INC_1]], i{{[0-9]+}}* [[A_REF_VAL_1]],
+// CHECK: ret
+
+#endif
+
diff --git a/test/OpenMP/teams_reduction_messages.cpp b/test/OpenMP/teams_reduction_messages.cpp
index 87d0348..0420b01 100644
--- a/test/OpenMP/teams_reduction_messages.cpp
+++ b/test/OpenMP/teams_reduction_messages.cpp
@@ -13,7 +13,7 @@
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -112,7 +112,7 @@
 #pragma omp teams reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp teams reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(&& : argc)
@@ -121,22 +121,22 @@
 #pragma omp teams reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp teams reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp teams reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp teams reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp teams reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -151,7 +151,7 @@
 #pragma omp teams reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp teams reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   foo();
 #pragma omp target
 #pragma omp teams private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -161,7 +161,7 @@
 #pragma omp teams reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp teams reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -175,6 +175,7 @@
 #pragma omp teams
 #pragma omp parallel for private(fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
@@ -182,6 +183,7 @@
 #pragma omp teams
 #pragma omp parallel for reduction(- : fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
@@ -258,13 +260,13 @@
 #pragma omp teams reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp teams reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp teams reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp teams reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -306,6 +308,7 @@
 #pragma omp teams
 #pragma omp parallel for private(fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
@@ -313,6 +316,7 @@
 #pragma omp teams
 #pragma omp parallel for reduction(- : fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
diff --git a/test/OpenMP/threadprivate_ast_print.cpp b/test/OpenMP/threadprivate_ast_print.cpp
index 2d876c1..f789c2f 100644
--- a/test/OpenMP/threadprivate_ast_print.cpp
+++ b/test/OpenMP/threadprivate_ast_print.cpp
@@ -69,4 +69,5 @@
   return (foo<int>());
 }
 
+extern template int ST<int>::m;
 #endif
diff --git a/test/OpenMP/threadprivate_codegen.cpp b/test/OpenMP/threadprivate_codegen.cpp
index 006d6a5..273f048 100644
--- a/test/OpenMP/threadprivate_codegen.cpp
+++ b/test/OpenMP/threadprivate_codegen.cpp
@@ -948,5 +948,5 @@
 // CHECK-TLS:      call void [[ST_S4_ST_CXX_INIT]]
 // CHECK-TLS:      [[DONE_LABEL]]
 
-// CHECK-TLS:      declare {{.*}} void [[GS3_TLS_INIT]]
-// CHECK-TLS:      declare {{.*}} void [[STATIC_S_TLS_INIT]]
+// CHECK-TLS-DAG:      declare {{.*}} void [[GS3_TLS_INIT]]
+// CHECK-TLS-DAG:      declare {{.*}} void [[STATIC_S_TLS_INIT]]
diff --git a/test/OpenMP/threadprivate_messages.cpp b/test/OpenMP/threadprivate_messages.cpp
index 8c442f4..9775bfa 100644
--- a/test/OpenMP/threadprivate_messages.cpp
+++ b/test/OpenMP/threadprivate_messages.cpp
@@ -70,7 +70,7 @@
 
 namespace ns {
   int m;
-#pragma omp threadprivate (m)
+#pragma omp threadprivate (m, m)
 }
 #pragma omp threadprivate (m) // expected-error {{use of undeclared identifier 'm'}}
 #pragma omp threadprivate (ns::m)
diff --git a/test/PCH/Inputs/__va_list_tag-typedef.h b/test/PCH/Inputs/__va_list_tag-typedef.h
new file mode 100644
index 0000000..33dc6ad
--- /dev/null
+++ b/test/PCH/Inputs/__va_list_tag-typedef.h
@@ -0,0 +1,4 @@
+// Header for PCH test __va_list_tag-typedef.c
+
+#include <stdarg.h>
+typedef va_list va_list_1;
diff --git a/test/PCH/Inputs/cxx11-statement-attributes.h b/test/PCH/Inputs/cxx11-statement-attributes.h
index f4d0619..3f85e1f 100644
--- a/test/PCH/Inputs/cxx11-statement-attributes.h
+++ b/test/PCH/Inputs/cxx11-statement-attributes.h
@@ -7,7 +7,8 @@
       [[clang::fallthrough]];  // This shouldn't generate a warning.
     case 1:
       n += 20;
-      [[clang::fallthrough]];  // This should generate a warning: "fallthrough annotation does not directly precede switch label".
+    case 2:  // This should generate a warning: "unannotated fall-through"
+      n += 35;
       break;
   }
   return n;
diff --git a/test/PCH/libroot/usr/include/reloc.h b/test/PCH/Inputs/libroot/usr/include/reloc.h
similarity index 100%
rename from test/PCH/libroot/usr/include/reloc.h
rename to test/PCH/Inputs/libroot/usr/include/reloc.h
diff --git a/test/PCH/libroot/usr/include/reloc2.h b/test/PCH/Inputs/libroot/usr/include/reloc2.h
similarity index 100%
rename from test/PCH/libroot/usr/include/reloc2.h
rename to test/PCH/Inputs/libroot/usr/include/reloc2.h
diff --git a/test/PCH/Inputs/pr27445.h b/test/PCH/Inputs/pr27445.h
new file mode 100644
index 0000000..f78a1bc
--- /dev/null
+++ b/test/PCH/Inputs/pr27445.h
@@ -0,0 +1,4 @@
+struct Info {
+  virtual ~Info();
+  void hash() {}
+};
diff --git a/test/PCH/Inputs/pragma-once.h b/test/PCH/Inputs/pragma-once.h
new file mode 100644
index 0000000..831cf55
--- /dev/null
+++ b/test/PCH/Inputs/pragma-once.h
@@ -0,0 +1,5 @@
+#pragma once
+
+/* For use with the pragma-once.c test */
+
+int x = 3;
diff --git a/test/PCH/Inputs/pragma-once2-pch.h b/test/PCH/Inputs/pragma-once2-pch.h
new file mode 100644
index 0000000..642b50f
--- /dev/null
+++ b/test/PCH/Inputs/pragma-once2-pch.h
@@ -0,0 +1 @@
+#include "pragma-once2.h"
diff --git a/test/PCH/Inputs/pragma-once2.h b/test/PCH/Inputs/pragma-once2.h
new file mode 100644
index 0000000..7ed4a95
--- /dev/null
+++ b/test/PCH/Inputs/pragma-once2.h
@@ -0,0 +1,3 @@
+#pragma once
+
+inline void f() {}
diff --git a/test/PCH/__va_list_tag-typedef.c b/test/PCH/__va_list_tag-typedef.c
new file mode 100644
index 0000000..c3745ca
--- /dev/null
+++ b/test/PCH/__va_list_tag-typedef.c
@@ -0,0 +1,14 @@
+// This test checks the patch for the compilation error / crash described in D18557.
+
+// Test as a C source
+// RUN: %clang_cc1 -emit-pch -x c-header -o %t %S/Inputs/__va_list_tag-typedef.h
+// RUN: %clang_cc1 -fsyntax-only -include-pch %t %s
+
+// Test as a C++ source
+// RUN: %clang_cc1 -emit-pch -x c++-header -o %t %S/Inputs/__va_list_tag-typedef.h
+// RUN: %clang_cc1 -x c++ -fsyntax-only -include-pch %t %s
+
+// expected-no-diagnostics
+
+typedef __builtin_va_list va_list_2;
+void test(const char* format, ...) { va_list args; va_start( args, format ); }
diff --git a/test/PCH/attrs.c b/test/PCH/attrs.c
index 6a4b8f6..3f34d4d 100644
--- a/test/PCH/attrs.c
+++ b/test/PCH/attrs.c
@@ -9,10 +9,12 @@
 #define HEADER
 
 int f(int) __attribute__((visibility("default"), overloadable));
+int g(int) __attribute__((abi_tag("foo", "bar", "baz"), no_sanitize("address", "memory")));
 
 #else
 
 double f(double); // expected-error{{overloadable}}
                   // expected-note@11{{previous overload}}
+void h() { g(0); }
 
 #endif
diff --git a/test/PCH/case-insensitive-include.c b/test/PCH/case-insensitive-include.c
index 707de70..1dcda27 100644
--- a/test/PCH/case-insensitive-include.c
+++ b/test/PCH/case-insensitive-include.c
@@ -2,7 +2,7 @@
 
 // Test this without pch.
 // RUN: cp %S/Inputs/case-insensitive-include.h %T
-// RUN: %clang_cc1 -fsyntax-only %s -include %s -I %T -verify
+// RUN: %clang_cc1 -Wno-nonportable-include-path -fsyntax-only %s -include %s -I %T -verify
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -o %t.pch %s -I %T
diff --git a/test/PCH/cxx-traits.cpp b/test/PCH/cxx-traits.cpp
index fc3e133..b0f1d9d 100644
--- a/test/PCH/cxx-traits.cpp
+++ b/test/PCH/cxx-traits.cpp
@@ -18,6 +18,7 @@
 bool _is_abstract_result = __is_abstract(int);
 bool _is_arithmetic_result = __is_arithmetic(int);
 bool _is_array_result = __is_array(int);
+bool _is_assignable_result = __is_assignable(int, int);
 bool _is_base_of_result = __is_base_of(int, int);
 bool _is_class_result = __is_class(int);
 bool _is_complete_type_result = __is_complete_type(int);
diff --git a/test/PCH/cxx-traits.h b/test/PCH/cxx-traits.h
index 2132476..1d7d404 100644
--- a/test/PCH/cxx-traits.h
+++ b/test/PCH/cxx-traits.h
@@ -20,6 +20,7 @@
 struct __is_abstract {};  // expected-warning {{made available}}
 struct __is_arithmetic {};  // expected-warning {{made available}}
 struct __is_array {};  // expected-warning {{made available}}
+struct __is_assignable {};  // expected-warning {{made available}}
 struct __is_base_of {};  // expected-warning {{made available}}
 struct __is_class {};  // expected-warning {{made available}}
 struct __is_complete_type {};  // expected-warning {{made available}}
diff --git a/test/PCH/cxx11-inheriting-ctors.cpp b/test/PCH/cxx11-inheriting-ctors.cpp
index 79f78ba..bf9a2b7 100644
--- a/test/PCH/cxx11-inheriting-ctors.cpp
+++ b/test/PCH/cxx11-inheriting-ctors.cpp
@@ -1,10 +1,19 @@
-// RUN: %clang_cc1 -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -std=c++11 -include-pch %t -verify %s
+// RUN: %clang_cc1 -std=c++11 -include %s -include %s -verify %s
+//
+// Emit with definitions in the declaration:
+// RxN: %clang_cc1 -std=c++11 -emit-pch -o %t.12 -include %s %s
+// RxN: %clang_cc1 -std=c++11 -include-pch %t.12 -verify %s
+//
+// Emit with definitions in update records:
+// RxN: %clang_cc1 -std=c++11 -emit-pch -o %t.1 %s
+// RxN: %clang_cc1 -std=c++11 -include-pch %t.1 -emit-pch -o %t.2 -verify %s
+// RxN: %clang_cc1 -std=c++11 -include-pch %t.1 -include-pch %t.2 -verify %s
+
 
 // expected-no-diagnostics
 
-#ifndef HEADER_INCLUDED
-#define HEADER_INCLUDED
+#ifndef HEADER1
+#define HEADER1
 
 struct Base {
   Base(int) {}
@@ -27,7 +36,8 @@
   using B::B;
 };
 
-#else
+#elif !defined(HEADER2)
+#define HEADER2
 
 Test test1a(42);
 Test test1b(nullptr);
@@ -36,4 +46,16 @@
 Test3<Base> test3a(42);
 Test3<Base> test3b(nullptr);
 
-#endif // HEADER_INCLUDED
+#pragma clang __debug dump Test
+#pragma clang __debug dump Test2
+
+#else
+
+Test retest1a(42);
+Test retest1b(nullptr);
+Test2<int> retest2a(42);
+Test2<int> retest2b(nullptr);
+Test3<Base> retest3a(42);
+Test3<Base> retest3b(nullptr);
+
+#endif
diff --git a/test/PCH/cxx11-statement-attributes.cpp b/test/PCH/cxx11-statement-attributes.cpp
index 722ca6e..b5dfc6c 100644
--- a/test/PCH/cxx11-statement-attributes.cpp
+++ b/test/PCH/cxx11-statement-attributes.cpp
@@ -1,10 +1,15 @@
 // Sanity check.
 // RUN: %clang_cc1 -include %S/Inputs/cxx11-statement-attributes.h -std=c++11 -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
+// RUN: %clang_cc1 -include %S/Inputs/cxx11-statement-attributes.h -std=c++1z -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
 // Run the same tests, this time with the attributes loaded from the PCH file.
 // RUN: %clang_cc1 -x c++-header -emit-pch -std=c++11 -o %t %S/Inputs/cxx11-statement-attributes.h
 // RUN: %clang_cc1 -include-pch %t -std=c++11 -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
+// RUN: %clang_cc1 -x c++-header -emit-pch -std=c++1z -o %t %S/Inputs/cxx11-statement-attributes.h
+// RUN: %clang_cc1 -include-pch %t -std=c++1z -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
 
-// expected-warning@Inputs/cxx11-statement-attributes.h:10 {{fallthrough annotation does not directly precede switch label}}
+// expected-warning@Inputs/cxx11-statement-attributes.h:10 {{unannotated fall-through}}
+// expected-note-re@Inputs/cxx11-statement-attributes.h:10 {{insert '[[{{(clang::)?}}fallthrough]];'}}
+// expected-note@Inputs/cxx11-statement-attributes.h:10 {{insert 'break;'}}
 
 void g(int n) {
   f<1>(n);  // expected-note {{in instantiation of function template specialization 'f<1>' requested here}}
diff --git a/test/PCH/cxx1y-default-initializer.cpp b/test/PCH/cxx1y-default-initializer.cpp
index 1f8d9a5..c9593a5 100644
--- a/test/PCH/cxx1y-default-initializer.cpp
+++ b/test/PCH/cxx1y-default-initializer.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -pedantic -std=c++1y %s -o %t
-// RUN: %clang_cc1 -pedantic -std=c++1y -emit-pch %s -o %t
-// RUN: %clang_cc1 -pedantic -std=c++1y -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1y -include %s -include %s -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1y -emit-pch -o %t.1 %s
+// RUN: %clang_cc1 -pedantic -std=c++1y -include-pch %t.1 -emit-pch -o %t.2 %s
+// RUN: %clang_cc1 -pedantic -std=c++1y -include-pch %t.2 -verify %s
 
-#ifndef HEADER_INCLUDED
-
-#define HEADER_INCLUDED
+#ifndef HEADER_1
+#define HEADER_1
 
 struct A {
   int x;
@@ -19,6 +19,20 @@
   constexpr B(int k) : z1(k) {}
 };
 
+template<typename T> struct C {
+  constexpr C() {}
+  T c = T();
+  struct U {};
+};
+// Instantiate C<int> but not the default initializer.
+C<int>::U ciu;
+
+#elif !defined(HEADER_2)
+#define HEADER_2
+
+// Instantiate the default initializer now, should create an update record.
+C<int> ci;
+
 #else
 
 static_assert(A{}.z == 3, "");
@@ -27,5 +41,6 @@
 static_assert(A{3, .y = 1}.z == 4, ""); // expected-warning {{C99}}
 static_assert(make<int>().z == 3, "");
 static_assert(make<int>(12).z == 15, "");
+static_assert(C<int>().c == 0, "");
 
 #endif
diff --git a/test/PCH/cxx1z-decomposition.cpp b/test/PCH/cxx1z-decomposition.cpp
new file mode 100644
index 0000000..e033577
--- /dev/null
+++ b/test/PCH/cxx1z-decomposition.cpp
@@ -0,0 +1,32 @@
+// No PCH:
+// RUN: %clang_cc1 -pedantic -std=c++1z -include %s -verify %s
+//
+// With PCH:
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
+
+#ifndef HEADER
+#define HEADER
+
+template<typename T> auto decomp(const T &t) { // Sums the two elements that t decomposes into.
+  auto &[a, b] = t; // structured binding; diagnosed on this line when T cannot be decomposed (e.g. int)
+  return a + b;
+}
+
+struct Q { int a, b; };
+constexpr int foo(Q &&q) { // Packs q as a * 10 + b, so {1, 2} yields 12.
+  auto &[a, b] = q; // binds to Q's two int members in declaration order
+  return a * 10 + b;
+}
+
+#else
+
+int arr[2];
+int k = decomp(arr);
+
+static_assert(foo({1, 2}) == 12);
+
+// expected-error@12 {{cannot decompose non-class, non-array type 'const int'}}
+int z = decomp(10); // expected-note {{instantiation of}}
+
+#endif
diff --git a/test/PCH/cxx1z-init-statement.cpp b/test/PCH/cxx1z-init-statement.cpp
new file mode 100644
index 0000000..d08fb7c
--- /dev/null
+++ b/test/PCH/cxx1z-init-statement.cpp
@@ -0,0 +1,17 @@
+// Test this without pch.
+// RUN: %clang_cc1 -std=c++1z -include %S/cxx1z-init-statement.h -fsyntax-only -emit-llvm -o - %s
+
+// Test with pch.
+// RUN: %clang_cc1 -x c++ -std=c++1z -emit-pch -o %t %S/cxx1z-init-statement.h
+// RUN: %clang_cc1 -std=c++1z -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+
+void g0(void) {
+  static_assert(test_if(-1) == -1, "");
+  static_assert(test_if(0) == 0, "");
+}
+
+void g1(void) {
+  static_assert(test_switch(-1) == -1, "");
+  static_assert(test_switch(0) == 0, "");
+  static_assert(test_switch(1) == 1, "");
+}
diff --git a/test/PCH/cxx1z-init-statement.h b/test/PCH/cxx1z-init-statement.h
new file mode 100644
index 0000000..16bd569
--- /dev/null
+++ b/test/PCH/cxx1z-init-statement.h
@@ -0,0 +1,22 @@
+// Header for PCH test cxx1z-init-statement.cpp
+
+constexpr int test_if(int x) { // Exercises 'if' statements that carry a C++1z init-statement.
+  if (int a = ++x; a == 0) { // declaration form: a holds the incremented x
+    return -1;
+  } else if (++a; a == 2) { // expression form: a from the outer 'if' is still in scope and is bumped again
+    return 0;
+  }
+  return 2;
+}
+
+constexpr int test_switch(int x) { // Exercises 'switch' with a C++1z init-statement.
+  switch (int a = ++x; a) { // a holds the incremented x and is also the switch condition
+    case 0:
+      return -1;
+    case 1:
+      return 0;
+    case 2:
+      return 1;
+  }
+  return 2; // reached when the incremented x is not 0, 1 or 2
+}
diff --git a/test/PCH/include-timestamp.cpp b/test/PCH/include-timestamp.cpp
new file mode 100644
index 0000000..d7d0fab
--- /dev/null
+++ b/test/PCH/include-timestamp.cpp
@@ -0,0 +1,32 @@
+// Test that the timestamp is not included in the produced pch file with
+// -fno-pch-timestamp.
+
+// Copying files allows read-only checkouts to run this test.
+// RUN: cp %S/Inputs/pragma-once2-pch.h %T
+// RUN: cp %S/Inputs/pragma-once2.h %T
+// RUN: cp %s %t1.cpp
+
+// Check timestamp is included by default.
+// RUN: %clang_cc1 -x c++-header -emit-pch -o %t %T/pragma-once2-pch.h
+// RUN: touch -m -a -t 201008011501 %T/pragma-once2.h
+// RUN: not %clang_cc1 -include-pch %t %t1.cpp 2>&1 | FileCheck -check-prefix=CHECK-TIMESTAMP %s
+
+// Check bitcode output as well.
+// RUN: llvm-bcanalyzer -dump %t | FileCheck -check-prefix=CHECK-BITCODE-TIMESTAMP-ON %s
+
+// Check timestamp inclusion is disabled by -fno-pch-timestamp.
+// RUN: %clang_cc1 -x c++-header -emit-pch -o %t %T/pragma-once2-pch.h -fno-pch-timestamp
+// RUN: touch -m -a -t 201008011502 %T/pragma-once2.h
+// RUN: %clang_cc1 -include-pch %t %t1.cpp 2>&1
+
+// Check bitcode output as well.
+// RUN: llvm-bcanalyzer -dump %t | FileCheck -check-prefix=CHECK-BITCODE-TIMESTAMP-OFF %s
+
+#include "pragma-once2.h"
+
+void g() { f(); }
+
+// CHECK-BITCODE-TIMESTAMP-ON: <INPUT_FILE abbrevid={{.*}} op0={{.*}} op1={{.*}} op2={{[^0]}}
+// CHECK-BITCODE-TIMESTAMP-OFF: <INPUT_FILE abbrevid={{.*}} op0={{.*}} op1={{.*}} op2={{[0]}}
+
+// CHECK-TIMESTAMP: fatal error: file {{.*}} has been modified since the precompiled header {{.*}} was built
diff --git a/test/PCH/missing-file.cpp b/test/PCH/missing-file.cpp
index 502a9db65..8bdb08d 100644
--- a/test/PCH/missing-file.cpp
+++ b/test/PCH/missing-file.cpp
@@ -4,16 +4,15 @@
 // RUN: echo 'struct S{char c; int i; }; void foo() {}' > %t.h
 // RUN: echo 'template <typename T> void tf() { T::foo(); }' >> %t.h
 // RUN: %clang_cc1 -x c++ -emit-pch -o %t.h.pch %t.h
-
-// %t.h might be touched by scanners as a hot file on Windows,
-// to fail to remove %.h with single run.
-// FIXME: Do we really want to work around bugs in virus checkers here?
-// RUN: rm %t.h || rm %t.h || rm %t.h
+// RUN: rm %t.h
 
 // Check diagnostic with location in original source:
 // RUN: not %clang_cc1 -include-pch %t.h.pch -emit-obj -o %t.o %s 2> %t.stderr
 // RUN: grep 'could not find file' %t.stderr
 
+// Oftentimes on Windows there are open handles, and deletion will fail.
+// REQUIRES: can-remove-opened-file
+
 void qq(S*) {}
 
 #ifdef REDECL
diff --git a/test/PCH/opencl-extensions.cl b/test/PCH/opencl-extensions.cl
index a22b007..d6d5416 100644
--- a/test/PCH/opencl-extensions.cl
+++ b/test/PCH/opencl-extensions.cl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-pch -o %t %s
-// RUN: %clang_cc1 -include-pch %t -fsyntax-only %s 
+// RUN: %clang_cc1 -emit-pch -o %t %s -triple spir-unknown-unknown
+// RUN: %clang_cc1 -include-pch %t -fsyntax-only %s  -triple spir-unknown-unknown
 
 #ifndef HEADER
 #define HEADER
diff --git a/test/PCH/pr27445.cpp b/test/PCH/pr27445.cpp
new file mode 100644
index 0000000..2a4af5e
--- /dev/null
+++ b/test/PCH/pr27445.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -x c++ %S/Inputs/pr27445.h -emit-pch -o %t.pch
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions %s -include-pch %t.pch -emit-llvm -o - | FileCheck %s
+
+class A;
+void fn1(A &) {}
+
+class __declspec(dllexport) A {
+  int operator=(A) { return field_; }
+  void (*on_arena_allocation_)(Info);
+  int field_;
+};
+
+// CHECK: %class.A = type { void (%struct.Info*)*, i32 }
+// CHECK: %struct.Info = type { i32 (...)** }
diff --git a/test/PCH/pragma-comment.c b/test/PCH/pragma-comment.c
new file mode 100644
index 0000000..07c3d40
--- /dev/null
+++ b/test/PCH/pragma-comment.c
@@ -0,0 +1,25 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+#ifndef HEADER
+#define HEADER
+#pragma comment(lib, "foo.lib")
+
+#else
+
+// CHECK: "/DEFAULTLIB:foo.lib"
+
+#endif
diff --git a/test/PCH/pragma-detect_mismatch.c b/test/PCH/pragma-detect_mismatch.c
new file mode 100644
index 0000000..ced4cf9
--- /dev/null
+++ b/test/PCH/pragma-detect_mismatch.c
@@ -0,0 +1,25 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+#ifndef HEADER
+#define HEADER
+#pragma detect_mismatch("FruitKind", "Jaboticaba")
+
+#else
+
+// CHECK: "/FAILIFMISMATCH:\22FruitKind=Jaboticaba\22"
+
+#endif
diff --git a/test/PCH/pragma-loop.cpp b/test/PCH/pragma-loop.cpp
index 2640020..5975816 100644
--- a/test/PCH/pragma-loop.cpp
+++ b/test/PCH/pragma-loop.cpp
@@ -7,9 +7,11 @@
 // CHECK: #pragma clang loop unroll_count(16)
 // CHECK: #pragma clang loop interleave_count(8)
 // CHECK: #pragma clang loop vectorize_width(4)
+// CHECK: #pragma clang loop distribute(enable)
 // CHECK: #pragma clang loop unroll(disable)
 // CHECK: #pragma clang loop interleave(disable)
 // CHECK: #pragma clang loop vectorize(enable)
+// CHECK: #pragma clang loop distribute(disable)
 // CHECK: #pragma clang loop unroll(full)
 // CHECK: #pragma clang loop interleave(enable)
 // CHECK: #pragma clang loop vectorize(disable)
@@ -40,6 +42,7 @@
 #pragma clang loop vectorize(enable)
 #pragma clang loop interleave(disable)
 #pragma clang loop unroll(disable)
+#pragma clang loop distribute(enable)
     while (i - 1 < Length) {
       List[i] = i;
       i++;
@@ -51,6 +54,7 @@
 #pragma clang loop vectorize(disable)
 #pragma clang loop interleave(enable)
 #pragma clang loop unroll(full)
+#pragma clang loop distribute(disable)
     while (i - 3 < Length) {
       List[i] = i;
       i++;
diff --git a/test/PCH/pragma-ms_struct.cpp b/test/PCH/pragma-ms_struct.cpp
new file mode 100644
index 0000000..ac2a1e8
--- /dev/null
+++ b/test/PCH/pragma-ms_struct.cpp
@@ -0,0 +1,41 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-apple-darwin9 -fsyntax-only -include %s -verify -std=c++11
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-apple-darwin9 -emit-pch -o %t -std=c++11
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-apple-darwin9 -fsyntax-only -include-pch %t -verify -std=c++11
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+struct SOffH {
+  short m : 9;
+  int q : 12;
+};
+
+#pragma ms_struct on
+
+struct SOnH {
+  short m : 9;
+  int q : 12;
+};
+
+#else
+
+struct SOnC {
+  short m : 9;
+  int q : 12;
+};
+
+static_assert(sizeof(SOffH) == 4, "");
+static_assert(sizeof(SOnH) == 8, "");
+static_assert(sizeof(SOnC) == 8, "");
+
+#endif
diff --git a/test/PCH/pragma-once.c b/test/PCH/pragma-once.c
new file mode 100644
index 0000000..15e8503
--- /dev/null
+++ b/test/PCH/pragma-once.c
@@ -0,0 +1,13 @@
+// Test this without pch.
+// RUN: %clang_cc1 -include %S/Inputs/pragma-once.h -fsyntax-only -verify %s
+
+// Test with pch.
+// RUN: %clang_cc1 -emit-pch -o %t %S/Inputs/pragma-once.h
+// RUN: %clang_cc1 -include-pch %t -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+// Including "pragma-once.h" twice, to verify the 'once' aspect is honored.
+#include "Inputs/pragma-once.h"
+#include "Inputs/pragma-once.h"
+int foo(void) { return 0; }
diff --git a/test/PCH/pragma-pointers_to_members.cpp b/test/PCH/pragma-pointers_to_members.cpp
new file mode 100644
index 0000000..53edd6b
--- /dev/null
+++ b/test/PCH/pragma-pointers_to_members.cpp
@@ -0,0 +1,34 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-pc-win32 -fms-extensions -fsyntax-only -include %s -verify -std=c++11
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-pc-win32  -fms-extensions -emit-pch -o %t -std=c++11
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-pc-win32  -fms-extensions -fsyntax-only -include-pch %t -verify -std=c++11
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S0;
+static_assert(sizeof(int S0::*) == 12, "");
+
+struct S1;
+struct S2;
+
+#pragma pointers_to_members(full_generality, single_inheritance)
+
+static_assert(sizeof(int S1::*) == 4, "");
+
+#else
+
+static_assert(sizeof(int S2::*) == 4, "");
+static_assert(sizeof(int S0::*) == 12, "");
+
+#endif
diff --git a/test/PCH/reloc.c b/test/PCH/reloc.c
index bf70ab6..14788f0 100644
--- a/test/PCH/reloc.c
+++ b/test/PCH/reloc.c
@@ -1,7 +1,7 @@
 // RUN: %clang -target x86_64-apple-darwin10 --relocatable-pch -o %t \
-// RUN:   -isysroot %S/libroot %S/libroot/usr/include/reloc.h
+// RUN:   -isysroot %S/Inputs/libroot %S/Inputs/libroot/usr/include/reloc.h
 // RUN: %clang -target x86_64-apple-darwin10 -fsyntax-only \
-// RUN:   -include-pch %t -isysroot %S/libroot %s -Xclang -verify
+// RUN:   -include-pch %t -isysroot %S/Inputs/libroot %s -Xclang -verify
 // RUN: not %clang -target x86_64-apple-darwin10 -include-pch %t %s
 // REQUIRES: x86-registered-target
 
@@ -11,5 +11,5 @@
 int y = 5; // expected-error{{redefinition}}
 
 
-// expected-note@libroot/usr/include/reloc.h:13{{previous definition}}
-// expected-note@libroot/usr/include/reloc2.h:14{{previous definition}}
+// expected-note@Inputs/libroot/usr/include/reloc.h:13{{previous definition}}
+// expected-note@Inputs/libroot/usr/include/reloc2.h:14{{previous definition}}
diff --git a/test/PCH/type_pack_element.cpp b/test/PCH/type_pack_element.cpp
new file mode 100644
index 0000000..c4ed6c8
--- /dev/null
+++ b/test/PCH/type_pack_element.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -std=c++14 -x c++-header %s -emit-pch -o %t.pch
+// RUN: %clang_cc1 -std=c++14 -x c++ /dev/null -include-pch %t.pch
+
+template <int i>
+struct X { };
+
+using SizeT = decltype(sizeof(int));
+
+template <SizeT i, typename ...T>
+using TypePackElement = __type_pack_element<i, T...>;
+
+void fn1() {
+  X<0> x0 = TypePackElement<0, X<0>, X<1>, X<2>>{};
+  X<1> x1 = TypePackElement<1, X<0>, X<1>, X<2>>{};
+  X<2> x2 = TypePackElement<2, X<0>, X<1>, X<2>>{};
+}
diff --git a/test/PCH/uuidof.cpp b/test/PCH/uuidof.cpp
new file mode 100644
index 0000000..207a8da
--- /dev/null
+++ b/test/PCH/uuidof.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fms-extensions -x c++-header -emit-pch -o %t %s
+// RUN: %clang_cc1 -fms-extensions -include-pch %t -fsyntax-only %s -emit-llvm -o - | FileCheck %s
+
+#ifndef HEADER
+#define HEADER
+struct _GUID {};
+const _GUID &x = __uuidof(0);
+// CHECK-DAG: @_GUID_00000000_0000_0000_0000_000000000000
+#endif
diff --git a/test/Parser/MicrosoftExtensions.c b/test/Parser/MicrosoftExtensions.c
index e58a745..2af6c42 100644
--- a/test/Parser/MicrosoftExtensions.c
+++ b/test/Parser/MicrosoftExtensions.c
@@ -35,6 +35,9 @@
 /* Charify extension. */
 #define FOO(x) #@x
 char x = FOO(a);
+#define HASHAT #@
+#define MISSING_ARG(x) #@
+/* expected-error@-1 {{'#@' is not followed by a macro parameter}} */
 
 typedef enum E { e1 };
 
@@ -103,3 +106,12 @@
 _Static_assert(__alignof(struct align_before_key1) == 16, "");
 _Static_assert(__alignof(struct align_before_key2) == 16, "");
 _Static_assert(__alignof(struct align_before_key3) == 16, "");
+
+void PR28782(int i) {
+foo:
+  int n;
+  switch (i) {
+  case 0:
+    int m;
+  }
+}
diff --git a/test/Parser/cxx-altivec.cpp b/test/Parser/cxx-altivec.cpp
index ac20de2..5b0da6c 100644
--- a/test/Parser/cxx-altivec.cpp
+++ b/test/Parser/cxx-altivec.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -triple=powerpc-apple-darwin8 -faltivec -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -triple=powerpc64-unknown-linux-gnu -faltivec -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -triple=powerpc64le-unknown-linux-gnu -faltivec -fsyntax-only -verify -std=c++11 %s
+#include <altivec.h>
 
 __vector char vv_c;
 __vector signed char vv_sc;
diff --git a/test/Parser/cxx-ambig-paren-expr.cpp b/test/Parser/cxx-ambig-paren-expr.cpp
index 3988205..cc509f7 100644
--- a/test/Parser/cxx-ambig-paren-expr.cpp
+++ b/test/Parser/cxx-ambig-paren-expr.cpp
@@ -21,8 +21,14 @@
   struct S{int operator()();};
   (S())();
 
-  // FIXME: Special case: "++" is postfix here, not prefix
-  // (S())++;
+  // Special case: "++" is postfix here, not prefix
+  (S())++; // expected-error {{cannot increment value of type 'S'}}
+
+  struct X { int &operator++(int); X operator[](int); int &operator++(); };
+  int &postfix_incr = (X()[3])++;
+  (X())++ ++; // ok, not a C-style cast
+  (X())++ ++X(); // expected-error {{C-style cast from 'int' to 'X ()'}}
+  int q = (int)++(x);
 }
 
 // Make sure we do tentative parsing correctly in conditions.
diff --git a/test/Parser/cxx-casting.cpp b/test/Parser/cxx-casting.cpp
index 43885bf..b1ae591 100644
--- a/test/Parser/cxx-casting.cpp
+++ b/test/Parser/cxx-casting.cpp
@@ -37,7 +37,7 @@
 // This was being incorrectly tentatively parsed.
 namespace test1 {
   template <class T> class A {}; // expected-note 2{{here}}
-  void foo() { A<int>(*(A<int>*)0); }
+  void foo() { A<int>(*(A<int>*)0); } // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
 }
 
 typedef char* c;
diff --git a/test/Parser/cxx-class.cpp b/test/Parser/cxx-class.cpp
index 9e907f1..3cc006a 100644
--- a/test/Parser/cxx-class.cpp
+++ b/test/Parser/cxx-class.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -fcxx-exceptions %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -fcxx-exceptions -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -fcxx-exceptions -std=c++11 %s
+
 class C;
 class C {
 public:
@@ -69,11 +72,30 @@
 };
 
 class F {
-    int F1 { return 1; } // expected-error{{function definition does not declare parameters}}
-    void F2 {} // expected-error{{function definition does not declare parameters}}
+    int F1 { return 1; }
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{function definition does not declare parameters}}
+#else
+    // expected-error@-4 {{expected expression}}
+    // expected-error@-5 {{expected}}
+    // expected-note@-6 {{to match this '{'}}
+    // expected-error@-7 {{expected ';' after class}}
+#endif
+
+    void F2 {}
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{function definition does not declare parameters}}
+#else
+    // expected-error@-4 {{variable has incomplete type 'void'}}
+    // expected-error@-5 {{expected ';' after top level declarator}}
+#endif
+
     typedef int F3() { return 0; } // expected-error{{function definition declared 'typedef'}}
     typedef void F4() {} // expected-error{{function definition declared 'typedef'}}
 };
+#if __cplusplus >= 201103L
+// expected-error@-2 {{extraneous closing brace}}
+#endif
 
 namespace ctor_error {
   class Foo {};
@@ -203,14 +225,38 @@
 }
 
 class PR20760_a {
-  int a = ); // expected-warning {{extension}} expected-error {{expected expression}}
-  int b = }; // expected-warning {{extension}} expected-error {{expected expression}}
-  int c = ]; // expected-warning {{extension}} expected-error {{expected expression}}
+  int a = ); // expected-error {{expected expression}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int b = }; // expected-error {{expected expression}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int c = ]; // expected-error {{expected expression}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
 };
 class PR20760_b {
-  int d = d); // expected-warning {{extension}} expected-error {{expected ';'}}
-  int e = d]; // expected-warning {{extension}} expected-error {{expected ';'}}
-  int f = d // expected-warning {{extension}} expected-error {{expected ';'}}
+  int d = d); // expected-error {{expected ';'}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int e = d]; // expected-error {{expected ';'}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int f = d // expected-error {{expected ';'}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
 };
 
 namespace PR20887 {
diff --git a/test/Parser/cxx-decl.cpp b/test/Parser/cxx-decl.cpp
index be79eb4..8a7a388 100644
--- a/test/Parser/cxx-decl.cpp
+++ b/test/Parser/cxx-decl.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -verify -fsyntax-only -triple i386-linux -pedantic-errors -fcxx-exceptions -fexceptions %s
+// RUN: %clang_cc1 -verify -fsyntax-only -triple i386-linux -pedantic-errors -fcxx-exceptions -fexceptions -std=c++98 %s
+// RUN: %clang_cc1 -verify -fsyntax-only -triple i386-linux -pedantic-errors -fcxx-exceptions -fexceptions -std=c++11 %s
 
 const char const *x10; // expected-error {{duplicate 'const' declaration specifier}}
 
@@ -46,7 +48,10 @@
   void foo() __asm__("baz");
 };
 
-enum { fooenum = 1, }; // expected-error {{commas at the end of enumerator lists are a C++11 extension}}
+enum { fooenum = 1, };
+#if __cplusplus <= 199711L
+// expected-error@-2 {{commas at the end of enumerator lists are a C++11 extension}}
+#endif
 
 struct a {
   int Type : fooenum;
@@ -81,7 +86,11 @@
   (global5),
   *global6,
   &global7 = global1,
-  &&global8 = static_cast<int&&>(global1), // expected-error 2{{rvalue reference}}
+  &&global8 = static_cast<int&&>(global1),
+#if __cplusplus <= 199711L
+  // expected-error@-2 2{{rvalue references are a C++11 extension}}
+#endif
+
   S::a,
   global9,
   global10 = 0,
@@ -185,7 +194,13 @@
 }
 
 // Ensure we produce at least some diagnostic for attributes in C++98.
-[[]] struct S; // expected-error 2{{}}
+[[]] struct S;
+#if __cplusplus <= 199711L
+// expected-error@-2 {{expected expression}}
+// expected-error@-3 {{expected unqualified-id}}
+#else
+// expected-error@-5 {{an attribute list cannot appear here}}
+#endif
 
 namespace test7 {
   struct Foo {
@@ -212,14 +227,20 @@
   template<typename T> struct X {};
   X<int N> x; // expected-error {{type-id cannot have a name}}
 
-  using T = int (*T)(); // expected-error {{type-id cannot have a name}} expected-error {{C++11}}
+  using T = int (*T)(); // expected-error {{type-id cannot have a name}}
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{alias declarations are a C++11 extension}}
+#endif
+
 }
 
 namespace PR17255 {
 void foo() {
-  typename A::template B<>; // expected-error {{use of undeclared identifier 'A'}} \
-                            // expected-error {{expected a qualified name after 'typename'}} \
-                            // expected-error {{'template' keyword outside of a template}}
+  typename A::template B<>; // expected-error {{use of undeclared identifier 'A'}}
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{'template' keyword outside of a template}}
+#endif
+  // expected-error@-4 {{expected a qualified name after 'typename'}}
 }
 }
 
@@ -236,12 +257,25 @@
   struct A {
     friend void friend f(); // expected-warning {{duplicate 'friend' declaration specifier}}
     friend struct B friend; // expected-warning {{duplicate 'friend' declaration specifier}}
+#if __cplusplus >= 201103L
+    // expected-error@-2 {{'friend' must appear first in a non-function declaration}}
+#endif
   };
 }
 
 // PR8380
 extern ""      // expected-error {{unknown linkage language}}
-test6a { ;// expected-error {{C++ requires a type specifier for all declarations}} \
-     // expected-error {{expected ';' after top level declarator}}
+test6a { ;// expected-error {{C++ requires a type specifier for all declarations}}
+#if __cplusplus <= 199711L
+// expected-error@-2 {{expected ';' after top level declarator}}
+#else
+// expected-error@-4 {{expected expression}}
+// expected-note@-5 {{to match this}}
+#endif
   
   int test6b;
+#if __cplusplus >= 201103L
+// expected-error@+3 {{expected}}
+// expected-error@-3 {{expected ';' after top level declarator}}
+#endif
+
diff --git a/test/Parser/cxx-friend.cpp b/test/Parser/cxx-friend.cpp
index ace0ff2..a4492ba 100644
--- a/test/Parser/cxx-friend.cpp
+++ b/test/Parser/cxx-friend.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 class C {
   friend class D;
@@ -21,9 +23,20 @@
   // 'A' here should refer to the declaration above.  
   friend class A;
 
-  friend C; // expected-warning {{specify 'class' to befriend}}
-  friend U; // expected-warning {{specify 'union' to befriend}}
-  friend int; // expected-warning {{non-class friend type 'int'}}
+  friend C;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{unelaborated friend declaration is a C++11 extension; specify 'class' to befriend 'C'}}
+#endif
+
+  friend U;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{unelaborated friend declaration is a C++11 extension; specify 'union' to befriend 'U'}}
+#endif
+
+  friend int;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{non-class friend type 'int' is a C++11 extension}}
+#endif
 
   friend void myfunc();
 
diff --git a/test/Parser/cxx-invalid-for-range.cpp b/test/Parser/cxx-invalid-for-range.cpp
new file mode 100644
index 0000000..557c1da
--- /dev/null
+++ b/test/Parser/cxx-invalid-for-range.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+// From PR23057 comment #18 (https://llvm.org/bugs/show_bug.cgi?id=23057#c18).
+
+namespace N {
+  int X[10]; // expected-note{{declared here}}
+}
+
+void f1() {
+  for (auto operator new : X); // expected-error{{'operator new' cannot be the name of a variable or data member}}
+                               // expected-error@-1{{use of undeclared identifier 'X'; did you mean 'N::X'?}}
+}
+
+void f2() {
+  for (a operator== :) // expected-error{{'operator==' cannot be the name of a variable or data member}}
+                       // expected-error@-1{{expected expression}}
+                       // expected-error@-2{{unknown type name 'a'}}
+} // expected-error{{expected statement}}
diff --git a/test/Parser/cxx0x-attributes.cpp b/test/Parser/cxx0x-attributes.cpp
index 7eec576..906d72b 100644
--- a/test/Parser/cxx0x-attributes.cpp
+++ b/test/Parser/cxx0x-attributes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++1z-extensions %s
 
 // Need std::initializer_list
 namespace std {
@@ -336,7 +336,6 @@
   // expected-warning@-1 {{use of the 'deprecated' attribute is a C++14 extension}}
   [[deprecated()]] void foo();
   // expected-error@-1 {{parentheses must be omitted if 'deprecated' attribute's argument list is empty}}
-  // expected-warning@-2 {{use of the 'deprecated' attribute is a C++14 extension}}
   [[gnu::deprecated()]] void quux();
 }
 
@@ -347,6 +346,18 @@
 ]] void bad();
 }
 
+int fallthru(int n) {
+  switch (n) {
+  case 0:
+    n += 5;
+    [[fallthrough]]; // expected-warning {{use of the 'fallthrough' attribute is a C++1z extension}}
+  case 1:
+    n *= 2;
+    break;
+  }
+  return n;
+}
+
 #define attr_name bitand
 #define attr_name_2(x) x
 #define attr_name_3(x, y) x##y
diff --git a/test/Parser/cxx0x-condition.cpp b/test/Parser/cxx0x-condition.cpp
index 8b64bcf..071e09e 100644
--- a/test/Parser/cxx0x-condition.cpp
+++ b/test/Parser/cxx0x-condition.cpp
@@ -23,9 +23,9 @@
 
   if (S b(a)) {} // expected-error {{variable declaration in condition cannot have a parenthesized initializer}}
 
-  if (S b(n)) {} // expected-error {{a function type is not allowed here}} expected-error {{must have an initializer}}
+  if (S b(n)) {} // expected-error {{a function type is not allowed here}}
   if (S b(n) = 0) {} // expected-error {{a function type is not allowed here}}
-  if (S b(n) == 0) {} // expected-error {{a function type is not allowed here}} expected-error {{did you mean '='?}}
+  if (S b(n) == 0) {} // expected-error {{a function type is not allowed here}}
 
   S s(a);
   if (S{s}) {} // ok
diff --git a/test/Parser/cxx0x-decl.cpp b/test/Parser/cxx0x-decl.cpp
index 23f46a1..c4f0356 100644
--- a/test/Parser/cxx0x-decl.cpp
+++ b/test/Parser/cxx0x-decl.cpp
@@ -17,6 +17,8 @@
   return E();
 }
 
+int decltype(f())::*ptr_mem_decltype;
+
 class ExtraSemiAfterMemFn {
   // Due to a peculiarity in the C++11 grammar, a deleted or defaulted function
   // is permitted to be followed by either one or two semicolons.
diff --git a/test/Parser/cxx1z-constexpr-lambdas.cpp b/test/Parser/cxx1z-constexpr-lambdas.cpp
new file mode 100644
index 0000000..ea000e3
--- /dev/null
+++ b/test/Parser/cxx1z-constexpr-lambdas.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -std=c++1z %s -verify 
+// RUN: %clang_cc1 -std=c++14 %s -verify 
+// RUN: %clang_cc1 -std=c++11 %s -verify 
+
+
+auto XL0 = [] constexpr { }; //expected-error{{requires '()'}} expected-error{{expected body}}
+auto XL1 = [] () mutable 
+                 mutable     //expected-error{{cannot appear multiple times}}
+                 mutable { }; //expected-error{{cannot appear multiple times}}
+
+#if __cplusplus > 201402L
+auto XL2 = [] () constexpr mutable constexpr { }; //expected-error{{cannot appear multiple times}}
+auto L = []() mutable constexpr { };
+auto L2 = []() constexpr { };
+auto L4 = []() constexpr mutable { }; 
+auto XL16 = [] () constexpr
+                  mutable
+                  constexpr   //expected-error{{cannot appear multiple times}}
+                  mutable     //expected-error{{cannot appear multiple times}}
+                  mutable     //expected-error{{cannot appear multiple times}}
+                  constexpr   //expected-error{{cannot appear multiple times}}
+                  constexpr   //expected-error{{cannot appear multiple times}}
+                  { };
+
+#else
+auto L = []() mutable constexpr {return 0; }; //expected-warning{{is a C++1z extension}}
+auto L2 = []() constexpr { return 0;};//expected-warning{{is a C++1z extension}}
+auto L4 = []() constexpr mutable { return 0; }; //expected-warning{{is a C++1z extension}}
+#endif
+
+
diff --git a/test/Parser/cxx1z-decomposition.cpp b/test/Parser/cxx1z-decomposition.cpp
new file mode 100644
index 0000000..c5651c5
--- /dev/null
+++ b/test/Parser/cxx1z-decomposition.cpp
@@ -0,0 +1,148 @@
+// RUN: %clang_cc1 -std=c++1z %s -verify -fcxx-exceptions
+
+struct S { int a, b, c; };
+
+// A simple-declaration can be a decomposition declaration.
+namespace SimpleDecl {
+  auto [a_x, b_x, c_x] = S();
+
+  void f(S s) {
+    auto [a, b, c] = S();
+    {
+      for (auto [a, b, c] = S();;) {}
+      if (auto [a, b, c] = S(); true) {}
+      switch (auto [a, b, c] = S(); 0) { case 0:; }
+    }
+  }
+}
+
+// A for-range-declaration can be a decomposition declaration.
+namespace ForRangeDecl {
+  extern S arr[10];
+  void h() {
+    for (auto [a, b, c] : arr) {
+    }
+  }
+}
+
+// Other kinds of declaration cannot.
+namespace OtherDecl {
+  // A parameter-declaration is not a simple-declaration.
+  // This parses as an array declaration.
+  void f(auto [a, b, c]); // expected-error {{'auto' not allowed in function prototype}} expected-error {{'a'}}
+
+  void g() {
+    // A condition is not a simple-declaration.
+    for (; auto [a, b, c] = S(); ) {} // expected-error {{not permitted in this context}}
+    if (auto [a, b, c] = S()) {} // expected-error {{not permitted in this context}}
+    if (int n; auto [a, b, c] = S()) {} // expected-error {{not permitted in this context}}
+    switch (auto [a, b, c] = S()) {} // expected-error {{not permitted in this context}}
+    switch (int n; auto [a, b, c] = S()) {} // expected-error {{not permitted in this context}}
+    while (auto [a, b, c] = S()) {} // expected-error {{not permitted in this context}}
+
+    // An exception-declaration is not a simple-declaration.
+    try {}
+    catch (auto [a, b, c]) {} // expected-error {{'auto' not allowed in exception declaration}} expected-error {{'a'}}
+  }
+
+  // A member-declaration is not a simple-declaration.
+  class A {
+    auto [a, b, c] = S(); // expected-error {{not permitted in this context}}
+    static auto [a, b, c] = S(); // expected-error {{not permitted in this context}}
+  };
+}
+
+namespace GoodSpecifiers {
+  void f() {
+    int n[1];
+    const volatile auto &[a] = n;
+  }
+}
+
+namespace BadSpecifiers {
+  typedef int I1[1];
+  I1 n;
+  struct S { int n; } s;
+  void f() {
+    // storage-class-specifiers
+    static auto &[a] = n; // expected-error {{cannot be declared 'static'}}
+    thread_local auto &[b] = n; // expected-error {{cannot be declared 'thread_local'}}
+    extern auto &[c] = n; // expected-error {{cannot be declared 'extern'}} expected-error {{cannot have an initializer}}
+    struct S {
+      mutable auto &[d] = n; // expected-error {{not permitted in this context}}
+
+      // function-specifiers
+      virtual auto &[e] = n; // expected-error {{not permitted in this context}}
+      explicit auto &[f] = n; // expected-error {{not permitted in this context}}
+
+      // misc decl-specifiers
+      friend auto &[g] = n; // expected-error {{'auto' not allowed}} expected-error {{friends can only be classes or functions}}
+    };
+    typedef auto &[h] = n; // expected-error {{cannot be declared 'typedef'}}
+    constexpr auto &[i] = n; // expected-error {{cannot be declared 'constexpr'}}
+
+    static constexpr thread_local auto &[j] = n; // expected-error {{cannot be declared with 'static thread_local constexpr' specifiers}}
+  }
+  inline auto &[k] = n; // expected-error {{cannot be declared 'inline'}}
+
+  const int K = 5;
+  void g() {
+    // defining-type-specifiers other than cv-qualifiers and 'auto'
+    S [a] = s; // expected-error {{cannot be declared with type 'BadSpecifiers::S'}}
+    decltype(auto) [b] = s; // expected-error {{cannot be declared with type 'decltype(auto)'}}
+    auto ([c]) = s; // expected-error {{cannot be declared with parentheses}}
+
+    // FIXME: This error is not very good.
+    auto [d]() = s; // expected-error {{expected ';'}} expected-error {{expected expression}}
+    auto [e][1] = s; // expected-error {{expected ';'}} expected-error {{requires an initializer}}
+
+    // FIXME: This should fire the 'misplaced array declarator' diagnostic.
+    int [K] arr = {0}; // expected-error {{expected ';'}} expected-error {{cannot be declared with type 'int'}} expected-error {{decomposition declaration '[K]' requires an initializer}}
+    int [5] arr = {0}; // expected-error {{place the brackets after the name}}
+
+    auto *[f] = s; // expected-error {{cannot be declared with type 'auto *'}} expected-error {{incompatible initializer}}
+    auto S::*[g] = s; // expected-error {{cannot be declared with type 'auto BadSpecifiers::S::*'}} expected-error {{incompatible initializer}}
+
+    // ref-qualifiers are OK.
+    auto &&[ok_1] = S();
+    auto &[ok_2] = s;
+
+    // attributes are OK.
+    [[]] auto [ok_3] = s;
+    alignas(S) auto [ok_4] = s;
+
+    // ... but not after the identifier or declarator.
+    // FIXME: These errors are not very good.
+    auto [bad_attr_1 [[]]] = s; // expected-error {{attribute list cannot appear here}} expected-error 2{{}}
+    auto [bad_attr_2] [[]] = s; // expected-error {{expected ';'}} expected-error {{}}
+  }
+}
+
+namespace MultiDeclarator {
+  struct S { int n; };
+  void f(S s) {
+    auto [a] = s, [b] = s; // expected-error {{must be the only declaration}}
+    auto [c] = s,  d = s; // expected-error {{must be the only declaration}}
+    auto  e  = s, [f] = s; // expected-error {{must be the only declaration}}
+    auto g = s, h = s, i = s, [j] = s; // expected-error {{must be the only declaration}}
+  }
+}
+
+namespace Template {
+  int n[3];
+  // FIXME: There's no actual rule against this...
+  template<typename T> auto [a, b, c] = n; // expected-error {{decomposition declaration template not supported}}
+}
+
+namespace Init {
+  void f() {
+    int arr[1];
+    struct S { int n; };
+    auto &[bad1]; // expected-error {{decomposition declaration '[bad1]' requires an initializer}}
+    const auto &[bad2](S{}); // expected-error {{decomposition declaration '[bad2]' cannot have a parenthesized initializer}}
+    auto &[good1] = arr;
+    auto &&[good2] = S{};
+    S [goodish3] = { 4 }; // expected-error {{cannot be declared with type 'S'}}
+    S [goodish4] { 4 }; // expected-error {{cannot be declared with type 'S'}}
+  }
+}
diff --git a/test/Parser/cxx1z-init-statement.cpp b/test/Parser/cxx1z-init-statement.cpp
new file mode 100644
index 0000000..3d119ef
--- /dev/null
+++ b/test/Parser/cxx1z-init-statement.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -Wno-vexing-parse
+
+int g, h;
+typedef int T;
+int f() {
+  // init-statement declarations
+  if (T n = 0; n != 0) {}
+  if (T f(); f()) {}
+  if (T(f()); f()) {}
+  if (T(f()), g, h; f()) {}
+  if (T f(); f()) {}
+  if (T f(), g, h; f()) {}
+  if (T(n) = 0; n) {}
+
+  // init-statement expressions
+  if (T{f()}; f()) {}
+  if (T{f()}, g, h; f()) {} // expected-warning 2{{unused}}
+  if (T(f()), g, h + 1; f()) {} // expected-warning 2{{unused}}
+
+  // condition declarations
+  if (T(n){g}) {}
+  if (T f()) {} // expected-error {{function type}}
+  if (T f(), g, h) {} // expected-error {{function type}}
+  if (T(n) = 0) {}
+
+  // condition expressions
+  if (T(f())) {}
+  if (T{f()}) {}
+  if (T(f()), g, h) {} // expected-warning 2{{unused}}
+  if (T{f()}, g, h) {} // expected-warning 2{{unused}}
+
+  // none of the above, disambiguated as expression (can't be a declaration)
+  if (T(n)(g)) {} // expected-error {{undeclared identifier 'n'}}
+  if (T(n)(int())) {} // expected-error {{undeclared identifier 'n'}}
+
+  // Likewise for 'switch'
+  switch (int n; n) {}
+  switch (g; int g = 5) {}
+
+  if (int a, b; int c = a) { // expected-note 6{{previous}}
+    int a; // expected-error {{redefinition}}
+    int b; // expected-error {{redefinition}}
+    int c; // expected-error {{redefinition}}
+  } else {
+    int a; // expected-error {{redefinition}}
+    int b; // expected-error {{redefinition}}
+    int c; // expected-error {{redefinition}}
+  }
+
+  return 0;
+}
diff --git a/test/Parser/extra-semi.cpp b/test/Parser/extra-semi.cpp
index 1a44dae..7287f85 100644
--- a/test/Parser/extra-semi.cpp
+++ b/test/Parser/extra-semi.cpp
@@ -5,7 +5,6 @@
 
 void test1(int a;) { // expected-error{{unexpected ';' before ')'}}
   while (a > 5;) {} // expected-error{{unexpected ';' before ')'}}
-  if (int b = 10;) {} // expected-error{{unexpected ';' before ')'}}
   for (int c  = 0; c < 21; ++c;) {} // expected-error{{unexpected ';' before ')'}}
   int d = int(3 + 4;); // expected-error{{unexpected ';' before ')'}}
   int e[5;]; // expected-error{{unexpected ';' before ']'}}
diff --git a/test/Parser/gcc-__final-compatibility.cpp b/test/Parser/gcc-__final-compatibility.cpp
new file mode 100644
index 0000000..ddd14ba
--- /dev/null
+++ b/test/Parser/gcc-__final-compatibility.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -std=c++98 -fgnu-keywords -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -fgnu-keywords -fsyntax-only -verify %s
+
+struct B {
+  virtual void g();
+};
+struct D __final : B { // expected-warning {{__final is a GNU extension, consider using C++11 final}}
+  virtual void g() __final; // expected-warning {{__final is a GNU extension, consider using C++11 final}}
+};
diff --git a/test/Parser/ms-anachronism.c b/test/Parser/ms-anachronism.c
new file mode 100644
index 0000000..3767639
--- /dev/null
+++ b/test/Parser/ms-anachronism.c
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -triple i686-windows-msvc -fms-extensions -fsyntax-only -verify %s
+
+struct {} __cdecl s; // expected-warning {{'__cdecl' only applies to function types; type here is 'struct}}
diff --git a/test/Parser/ms-inline-asm.c b/test/Parser/ms-inline-asm.c
index 5b0cabf..7db4720 100644
--- a/test/Parser/ms-inline-asm.c
+++ b/test/Parser/ms-inline-asm.c
@@ -53,6 +53,10 @@
 void t12() {
   __asm jmp label // expected-error {{use of undeclared label 'label'}}
 }
+void t13() {
+  __asm m{o}v eax, ebx // expected-error {{unknown token in expression}}
+}
+
 int t_fail() { // expected-note {{to match this}}
   __asm 
   __asm { // expected-error 3 {{expected}} expected-note {{to match this}}
diff --git a/test/Parser/objc-available.m b/test/Parser/objc-available.m
new file mode 100644
index 0000000..d18ac1f
--- /dev/null
+++ b/test/Parser/objc-available.m
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunguarded-availability -triple x86_64-apple-macosx10.10.0 -verify %s
+
+void f() {
+
+  if (@available(macos 10.12, *)) {}
+  else if (@available(macos 10.11, *)) {}
+  else {}
+
+  (void)__builtin_available(ios 8, macos 10.10, *);
+
+  (void)@available(macos 10.11); // expected-error{{must handle potential future platforms with '*'}}
+  (void)@available(macos 10.11, macos 10.11, *); // expected-error{{version for 'macos' already specified}}
+
+  (void)@available(erik_os 10.11, *); // expected-error{{unrecognized platform name erik_os}}
+
+  (void)@available(erik_os 10.10, hat_os 1.0, *); // expected-error 2 {{unrecognized platform name}}
+
+  (void)@available(); // expected-error{{expected a platform name here}}
+  (void)@available(macos 10.10,); // expected-error{{expected a platform name here}}
+  (void)@available(macos); // expected-error{{expected a version}}
+  (void)@available; // expected-error{{expected '('}}
+}
diff --git a/test/Parser/objc-default-ctor-init.mm b/test/Parser/objc-default-ctor-init.mm
index fda8bef..ea4c064 100644
--- a/test/Parser/objc-default-ctor-init.mm
+++ b/test/Parser/objc-default-ctor-init.mm
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 -std=c++11 -ast-dump %s | FileCheck %s
 // CHECK: CXXCtorInitializer Field {{.*}} 'ptr' 'void *'
+// CHECK: CXXCtorInitializer Field {{.*}} 'q' 'struct Q'
 
 @interface NSObject
 @end
@@ -7,9 +8,12 @@
 @interface I : NSObject
 @end
 
+struct Q { Q(); };
+
 struct S {
   S();
   void *ptr = nullptr;
+  Q q;
 };
 
 @implementation I
diff --git a/test/Parser/opencl-astype.cl b/test/Parser/opencl-astype.cl
index 72f98a4..903c42e 100644
--- a/test/Parser/opencl-astype.cl
+++ b/test/Parser/opencl-astype.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple spir-unknown-unknown
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 void test_astype() {
diff --git a/test/Parser/opencl-atomics-cl20.cl b/test/Parser/opencl-atomics-cl20.cl
index cb2f597..cd37757 100644
--- a/test/Parser/opencl-atomics-cl20.cl
+++ b/test/Parser/opencl-atomics-cl20.cl
@@ -1,11 +1,14 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
-// RUN: %clang_cc1 %s -verify  -fsyntax-only -cl-std=CL2.0 -DCL20
-// RUN: %clang_cc1 %s -verify  -fsyntax-only -cl-std=CL2.0 -DCL20 -DEXT
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -fsyntax-only -cl-std=CL2.0 -DCL20
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -fsyntax-only -cl-std=CL2.0 -DCL20 -DEXT -Wpedantic-core-features
 
 #ifdef EXT
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics:enable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics:enable
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
 #endif
 
 void atomic_types_test() {
@@ -44,15 +47,14 @@
 // expected-error@-28 {{use of type 'atomic_ulong' (aka '_Atomic(unsigned long)') requires cl_khr_int64_extended_atomics extension to be enabled}}
 // expected-error@-27 {{use of type 'atomic_double' (aka '_Atomic(double)') requires cl_khr_int64_base_atomics extension to be enabled}}
 // expected-error@-28 {{use of type 'atomic_double' (aka '_Atomic(double)') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error@-29 {{use of type 'atomic_double' (aka '_Atomic(double)') requires cl_khr_fp64 extension to be enabled}}
-// expected-error-re@-28 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-29 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error-re@-29 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-30 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error-re@-30 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-31 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error-re@-31 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-32 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-27 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-28 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-28 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-29 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-29 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-30 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-30 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-31 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
 #endif
 
 #ifdef CL20
diff --git a/test/Parser/opencl-cl20.cl b/test/Parser/opencl-cl20.cl
index b718699..b14ad10 100644
--- a/test/Parser/opencl-cl20.cl
+++ b/test/Parser/opencl-cl20.cl
@@ -10,9 +10,9 @@
   return var;  
 }
 #ifndef CL20
-// expected-error@-5 {{OpenCL does not support the '__generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the '__generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the '__generic' type qualifier}}
+// expected-error@-5 {{OpenCL version 1.0 does not support the '__generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the '__generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the '__generic' type qualifier}}
 #endif
 
 generic int * generic_test(generic int *arg) {
@@ -20,7 +20,7 @@
   return var;  
 }
 #ifndef CL20
-// expected-error@-5 {{OpenCL does not support the 'generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the 'generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the 'generic' type qualifier}}
+// expected-error@-5 {{OpenCL version 1.0 does not support the 'generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the 'generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the 'generic' type qualifier}}
 #endif
diff --git a/test/Parser/opencl-image-access.cl b/test/Parser/opencl-image-access.cl
index e08d129..99ced8e 100644
--- a/test/Parser/opencl-image-access.cl
+++ b/test/Parser/opencl-image-access.cl
@@ -1,14 +1,19 @@
-// RUN: %clang_cc1 %s -fsyntax-only
+// RUN: %clang_cc1 %s -fsyntax-only -verify
+// RUN: %clang_cc1 %s -fsyntax-only -verify -cl-std=CL2.0 -DCL20
+// expected-no-diagnostics
 
 __kernel void f__ro(__read_only image2d_t a) { }
 
 __kernel void f__wo(__write_only image2d_t a) { }
 
+#if CL20
 __kernel void f__rw(__read_write image2d_t a) { }
-
+#endif
 
 __kernel void fro(read_only image2d_t a) { }
 
 __kernel void fwo(write_only image2d_t a) { }
 
+#if CL20
 __kernel void frw(read_write image2d_t a) { }
+#endif
diff --git a/test/Parser/opencl-pragma.cl b/test/Parser/opencl-pragma.cl
index 4c48b2a..b002b08 100644
--- a/test/Parser/opencl-pragma.cl
+++ b/test/Parser/opencl-pragma.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -verify -pedantic -Wno-empty-translation-unit -fsyntax-only
+// RUN: %clang_cc1 %s -verify -pedantic -Wno-empty-translation-unit -fsyntax-only -triple spir-unknown-unknown
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/test/Parser/opencl-storage-class.cl b/test/Parser/opencl-storage-class.cl
index 3d9aef5..a8ebc1a 100644
--- a/test/Parser/opencl-storage-class.cl
+++ b/test/Parser/opencl-storage-class.cl
@@ -1,15 +1,15 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -triple spir-unknown-unknown
 
 void test_storage_class_specs()
 {
-  static int a;    // expected-error {{OpenCL does not support the 'static' storage class specifier}}
-  register int b;  // expected-error {{OpenCL does not support the 'register' storage class specifier}}
-  extern int c;    // expected-error {{OpenCL does not support the 'extern' storage class specifier}}
-  auto int d;      // expected-error {{OpenCL does not support the 'auto' storage class specifier}}
+  static int a;    // expected-error {{OpenCL version 1.0 does not support the 'static' storage class specifier}}
+  register int b;  // expected-error {{OpenCL version 1.0 does not support the 'register' storage class specifier}}
+  extern int c;    // expected-error {{OpenCL version 1.0 does not support the 'extern' storage class specifier}}
+  auto int d;      // expected-error {{OpenCL version 1.0 does not support the 'auto' storage class specifier}}
 
 #pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
-  static int e; // expected-error {{program scope variable must reside in constant address space}}
+  static int e; // expected-error {{static local variable must reside in constant address space}}
   register int f;
-  extern int g;
+  extern int g; // expected-error {{extern variable must reside in constant address space}}
   auto int h;
 }
diff --git a/test/Parser/opencl-unroll-hint.cl b/test/Parser/opencl-unroll-hint.cl
new file mode 100644
index 0000000..5742dcd
--- /dev/null
+++ b/test/Parser/opencl-unroll-hint.cl
@@ -0,0 +1,8 @@
+//RUN: %clang_cc1 -O0 -cl-std=CL2.0 -fsyntax-only -verify %s
+
+kernel void B (global int *x) {
+  __attribute__((opencl_unroll_hint(42)))
+  if (x[0])                             // expected-error {{OpenCL only supports 'opencl_unroll_hint' attribute on for, while, and do statements}}
+    x[0] = 15;
+}
+
diff --git a/test/Parser/pragma-loop-safety.cpp b/test/Parser/pragma-loop-safety.cpp
index 0776000..ab87dcd 100644
--- a/test/Parser/pragma-loop-safety.cpp
+++ b/test/Parser/pragma-loop-safety.cpp
@@ -16,6 +16,7 @@
 /* expected-error {{expected ')'}} */ #pragma clang loop interleave(assume_safety
 
 /* expected-error {{invalid argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll(assume_safety)
+/* expected-error {{invalid argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute(assume_safety)
 
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop vectorize(badidentifier)
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop interleave(badidentifier)
diff --git a/test/Parser/pragma-loop.cpp b/test/Parser/pragma-loop.cpp
index b9b5b41..f42d196 100644
--- a/test/Parser/pragma-loop.cpp
+++ b/test/Parser/pragma-loop.cpp
@@ -116,15 +116,27 @@
     VList[j] = List[j];
   }
 
+#pragma clang loop distribute(enable)
+  for (int j : VList) {
+    VList[j] = List[j];
+  }
+
+#pragma clang loop distribute(disable)
+  for (int j : VList) {
+    VList[j] = List[j];
+  }
+
   test_nontype_template_param<4, 8>(List, Length);
 
 /* expected-error {{expected '('}} */ #pragma clang loop vectorize
 /* expected-error {{expected '('}} */ #pragma clang loop interleave
 /* expected-error {{expected '('}} */ #pragma clang loop unroll
+/* expected-error {{expected '('}} */ #pragma clang loop distribute
 
 /* expected-error {{expected ')'}} */ #pragma clang loop vectorize(enable
 /* expected-error {{expected ')'}} */ #pragma clang loop interleave(enable
 /* expected-error {{expected ')'}} */ #pragma clang loop unroll(full
+/* expected-error {{expected ')'}} */ #pragma clang loop distribute(enable
 
 /* expected-error {{expected ')'}} */ #pragma clang loop vectorize_width(4
 /* expected-error {{expected ')'}} */ #pragma clang loop interleave_count(4
@@ -133,8 +145,9 @@
 /* expected-error {{missing argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop vectorize()
 /* expected-error {{missing argument; expected an integer value}} */ #pragma clang loop interleave_count()
 /* expected-error {{missing argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll()
+/* expected-error {{missing argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute()
 
-/* expected-error {{missing option; expected vectorize, vectorize_width, interleave, interleave_count, unroll, or unroll_count}} */ #pragma clang loop
+/* expected-error {{missing option; expected vectorize, vectorize_width, interleave, interleave_count, unroll, unroll_count, or distribute}} */ #pragma clang loop
 /* expected-error {{invalid option 'badkeyword'}} */ #pragma clang loop badkeyword
 /* expected-error {{invalid option 'badkeyword'}} */ #pragma clang loop badkeyword(enable)
 /* expected-error {{invalid option 'badkeyword'}} */ #pragma clang loop vectorize(enable) badkeyword(4)
@@ -187,6 +200,7 @@
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop vectorize(badidentifier)
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop interleave(badidentifier)
 /* expected-error {{invalid argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll(badidentifier)
+/* expected-error {{invalid argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute(badidentifier)
   while (i-7 < Length) {
     List[i] = i;
   }
@@ -196,6 +210,7 @@
 /* expected-error {{expected ')'}} */ #pragma clang loop vectorize(()
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop interleave(*)
 /* expected-error {{invalid argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll(=)
+/* expected-error {{invalid argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute(+)
 /* expected-error {{type name requires a specifier or qualifier}} expected-error {{expected expression}} */ #pragma clang loop vectorize_width(^)
 /* expected-error {{expected expression}} expected-error {{expected expression}} */ #pragma clang loop interleave_count(/)
 /* expected-error {{expected expression}} expected-error {{expected expression}} */ #pragma clang loop unroll_count(==)
@@ -232,6 +247,8 @@
 #pragma clang loop interleave(disable)
 /* expected-error {{duplicate directives 'unroll(disable)' and 'unroll(full)'}} */ #pragma clang loop unroll(full)
 #pragma clang loop unroll(disable)
+/* expected-error {{duplicate directives 'distribute(disable)' and 'distribute(enable)'}} */ #pragma clang loop distribute(enable)
+#pragma clang loop distribute(disable)
   while (i-9 < Length) {
     List[i] = i;
   }
diff --git a/test/Parser/pragma-pack.c b/test/Parser/pragma-pack.c
index 172a332..0859f41 100644
--- a/test/Parser/pragma-pack.c
+++ b/test/Parser/pragma-pack.c
@@ -44,3 +44,7 @@
 #pragma pack()
   int e;
 };
+
+_Pragma("pack(push, 1)") struct PR28094 {
+  int a;
+} _Pragma("pack(pop)");
diff --git a/test/Parser/skip-function-bodies.mm b/test/Parser/skip-function-bodies.mm
index 8462f69..e5b7b2a 100644
--- a/test/Parser/skip-function-bodies.mm
+++ b/test/Parser/skip-function-bodies.mm
@@ -30,7 +30,7 @@
 // CHECK: skip-function-bodies.mm:3:7: ClassDecl=A:3:7 (Definition) Extent=[3:1 - 14:2]
 // CHECK: skip-function-bodies.mm:4:9: ClassDecl=B:4:9 (Definition) Extent=[4:3 - 4:13]
 // CHECK: skip-function-bodies.mm:6:1: CXXAccessSpecifier=:6:1 (Definition) Extent=[6:1 - 6:8]
-// CHECK: skip-function-bodies.mm:7:3: CXXConstructor=A:7:3 Extent=[7:3 - 7:6]
+// CHECK: skip-function-bodies.mm:7:3: CXXConstructor=A:7:3 (default constructor) Extent=[7:3 - 7:6]
 // CHECK-NOT: skip-function-bodies.mm:8:12: StructDecl=C:8:12 (Definition) Extent=[8:5 - 10:6]
 // CHECK-NOT: skip-function-bodies.mm:9:12: CXXMethod=d:9:12 (Definition) Extent=[9:7 - 9:18]
 // CHECK: skip-function-bodies.mm:13:13: TypedefDecl=E:13:13 (Definition) Extent=[13:3 - 13:14]
diff --git a/test/Preprocessor/Weverything_pragma.c b/test/Preprocessor/Weverything_pragma.c
new file mode 100644
index 0000000..1425431
--- /dev/null
+++ b/test/Preprocessor/Weverything_pragma.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -Weverything   -fsyntax-only -verify %s
+
+// Test that the pragma overrides the command-line option -Weverything.
+
+// A diagnostic with DefaultIgnore. This is part of the group 'unused-macro',
+// but -Weverything forces it on.
+#define UNUSED_MACRO1 1 // expected-warning{{macro is not used}}
+
+void foo() // expected-warning {{no previous prototype for function}}
+{
+ // A diagnostic without DefaultIgnore, and not part of a group.
+ (void) L'ab'; // expected-warning {{extraneous characters in character constant ignored}}
+
+#pragma clang diagnostic warning "-Weverything" // Should not change anything.
+#define UNUSED_MACRO2 1 // expected-warning{{macro is not used}}
+ (void) L'cd'; // expected-warning {{extraneous characters in character constant ignored}}
+
+#pragma clang diagnostic ignored "-Weverything" // Ignore warnings now.
+#define UNUSED_MACRO2 1 // no warning
+ (void) L'ef'; // no warning here
+
+#pragma clang diagnostic warning "-Weverything" // Revert back to warnings.
+#define UNUSED_MACRO3 1 // expected-warning{{macro is not used}}
+ (void) L'gh'; // expected-warning {{extraneous characters in character constant ignored}}
+
+#pragma clang diagnostic error "-Weverything"  // Give errors now.
+#define UNUSED_MACRO4 1 // expected-error{{macro is not used}}
+ (void) L'ij'; // expected-error {{extraneous characters in character constant ignored}}
+}
diff --git a/test/Preprocessor/aarch64-target-features.c b/test/Preprocessor/aarch64-target-features.c
index fca9cf6..9a968e0 100644
--- a/test/Preprocessor/aarch64-target-features.c
+++ b/test/Preprocessor/aarch64-target-features.c
@@ -92,13 +92,20 @@
 // RUN: %clang -target aarch64 -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A53 %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A57 %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A72 %s
+// RUN: %clang -target aarch64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-CORTEX-A73 %s
 // RUN: %clang -target aarch64 -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-M1 %s
+// RUN: %clang -target aarch64 -mcpu=exynos-m2 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-M1 %s
+// RUN: %clang -target aarch64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-KRYO %s
+// RUN: %clang -target aarch64 -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-VULCAN %s
 // CHECK-MCPU-CYCLONE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
 // CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
+// CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-M1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
+// CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
+// CHECK-MCPU-VULCAN: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 
 // RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s
 // CHECK-ARCH-ARM64: "-target-cpu" "cyclone" "-target-feature" "+neon" "-target-feature" "+crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
diff --git a/test/Preprocessor/arm-acle-6.4.c b/test/Preprocessor/arm-acle-6.4.c
index 148ce6d..11be2c1 100644
--- a/test/Preprocessor/arm-acle-6.4.c
+++ b/test/Preprocessor/arm-acle-6.4.c
@@ -140,6 +140,7 @@
 
 // RUN: %clang -target arm-none-linux-eabi -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7R-IDIV
 // RUN: %clang -target arm-none-linux-eabi -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7R-IDIV
+// RUN: %clang -target arm-none-linux-eabi -mcpu=cortex-r8 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7R-IDIV
 
 // CHECK-V7R-IDIV: __ARM_FEATURE_IDIV 1
 
diff --git a/test/Preprocessor/arm-acle-6.5.c b/test/Preprocessor/arm-acle-6.5.c
index 95adad9..cc158c8 100644
--- a/test/Preprocessor/arm-acle-6.5.c
+++ b/test/Preprocessor/arm-acle-6.5.c
@@ -49,10 +49,13 @@
 
 // CHECK-NO-FMA-NOT: __ARM_FEATURE_FMA
 
-// RUN: %clang -target armv7a-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
-// RUN: %clang -target armv7r-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
+// RUN: %clang -target armv7a-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-NO-FMA
+// RUN: %clang -target armv7a-eabi -mfpu=vfpv4 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
+// RUN: %clang -target armv7r-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-NO-FMA
+// RUN: %clang -target armv7r-eabi -mfpu=vfpv4 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
 // RUN: %clang -target armv7em-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
-// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
+// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-NO-FMA
+// RUN: %clang -target armv8-eabi -mfpu=vfpv4 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
 
 // CHECK-FMA: __ARM_FEATURE_FMA 1
 
diff --git a/test/Preprocessor/arm-target-features.c b/test/Preprocessor/arm-target-features.c
index 42816bf..be23560 100644
--- a/test/Preprocessor/arm-target-features.c
+++ b/test/Preprocessor/arm-target-features.c
@@ -1,225 +1,219 @@
-// RUN: %clang -target armv8a-none-linux-gnu -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V8A %s
-// CHECK-V8A: __ARMEL__ 1
-// CHECK-V8A: __ARM_ARCH 8
-// CHECK-V8A: __ARM_ARCH_8A__ 1
-// CHECK-V8A: __ARM_FEATURE_CRC32 1
-// CHECK-V8A: __ARM_FEATURE_DIRECTED_ROUNDING 1
-// CHECK-V8A: __ARM_FEATURE_NUMERIC_MAXMIN 1
-// CHECK-V8A: __ARM_FP 0xE
-// CHECK-V8A: __ARM_FP16_ARGS 1
-// CHECK-V8A: __ARM_FP16_FORMAT_IEEE 1
+// RUN: %clang -target armv8a-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V8A %s
+// CHECK-V8A: #define __ARMEL__ 1
+// CHECK-V8A: #define __ARM_ARCH 8
+// CHECK-V8A: #define __ARM_ARCH_8A__ 1
+// CHECK-V8A: #define __ARM_FEATURE_CRC32 1
+// CHECK-V8A: #define __ARM_FEATURE_DIRECTED_ROUNDING 1
+// CHECK-V8A: #define __ARM_FEATURE_NUMERIC_MAXMIN 1
+// CHECK-V8A: #define __ARM_FP 0xE
+// CHECK-V8A: #define __ARM_FP16_ARGS 1
+// CHECK-V8A: #define __ARM_FP16_FORMAT_IEEE 1
 
-// RUN: %clang -target armv7a-none-linux-gnu -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V7 %s
-// CHECK-V7: __ARMEL__ 1
-// CHECK-V7: __ARM_ARCH 7
-// CHECK-V7: __ARM_ARCH_7A__ 1
+// RUN: %clang -target armv7a-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V7 %s
+// CHECK-V7: #define __ARMEL__ 1
+// CHECK-V7: #define __ARM_ARCH 7
+// CHECK-V7: #define __ARM_ARCH_7A__ 1
 // CHECK-V7-NOT: __ARM_FEATURE_CRC32
-// CHECK-V7-NOT: __ARM_FEATURE_NUMERIC_MAXMIN                                   
+// CHECK-V7-NOT: __ARM_FEATURE_NUMERIC_MAXMIN
 // CHECK-V7-NOT: __ARM_FEATURE_DIRECTED_ROUNDING
-// CHECK-V7: __ARM_FP 0xC
+// CHECK-V7: #define __ARM_FP 0xC
 
-// RUN: %clang -target x86_64-apple-macosx10.10 -arch armv7s -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V7S %s
-// CHECK-V7S: __ARMEL__ 1
-// CHECK-V7S: __ARM_ARCH 7
-// CHECK-V7S: __ARM_ARCH_7S__ 1
+// RUN: %clang -target x86_64-apple-macosx10.10 -arch armv7s -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V7S %s
+// CHECK-V7S: #define __ARMEL__ 1
+// CHECK-V7S: #define __ARM_ARCH 7
+// CHECK-V7S: #define __ARM_ARCH_7S__ 1
 // CHECK-V7S-NOT: __ARM_FEATURE_CRC32
 // CHECK-V7S-NOT: __ARM_FEATURE_NUMERIC_MAXMIN
 // CHECK-V7S-NOT: __ARM_FEATURE_DIRECTED_ROUNDING
-// CHECK-V7S: __ARM_FP 0xE
+// CHECK-V7S: #define __ARM_FP 0xE
 
-// RUN: %clang -target armv8a -mfloat-abi=hard -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF %s
-// CHECK-V8-BAREHF: __ARMEL__ 1
-// CHECK-V8-BAREHF: __ARM_ARCH 8
-// CHECK-V8-BAREHF: __ARM_ARCH_8A__ 1
-// CHECK-V8-BAREHF: __ARM_FEATURE_CRC32 1
-// CHECK-V8-BAREHF: __ARM_FEATURE_DIRECTED_ROUNDING 1
-// CHECK-V8-BAREHF: __ARM_FEATURE_NUMERIC_MAXMIN 1
-// CHECK-V8-BAREHP: __ARM_FP 0xE
-// CHECK-V8-BAREHF: __ARM_NEON__ 1
-// CHECK-V8-BAREHF: __ARM_PCS_VFP 1
-// CHECK-V8-BAREHF: __VFP_FP__ 1
+// RUN: %clang -target armv8a -mfloat-abi=hard -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF %s
+// CHECK-V8-BAREHF: #define __ARMEL__ 1
+// CHECK-V8-BAREHF: #define __ARM_ARCH 8
+// CHECK-V8-BAREHF: #define __ARM_ARCH_8A__ 1
+// CHECK-V8-BAREHF: #define __ARM_FEATURE_CRC32 1
+// CHECK-V8-BAREHF: #define __ARM_FEATURE_DIRECTED_ROUNDING 1
+// CHECK-V8-BAREHF: #define __ARM_FEATURE_NUMERIC_MAXMIN 1
+// CHECK-V8-BAREHF: #define __ARM_FP 0xE
+// CHECK-V8-BAREHF: #define __ARM_NEON__ 1
+// CHECK-V8-BAREHF: #define __ARM_PCS_VFP 1
+// CHECK-V8-BAREHF: #define __VFP_FP__ 1
 
-// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=fp-armv8 -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF-FP %s
+// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=fp-armv8 -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF-FP %s
 // CHECK-V8-BAREHF-FP-NOT: __ARM_NEON__ 1
-// CHECK-V8-BAREHP-FP: __ARM_FP 0xE
-// CHECK-V8-BAREHF-FP: __VFP_FP__ 1
+// CHECK-V8-BAREHP-FP: #define __ARM_FP 0xE
+// CHECK-V8-BAREHF-FP: #define __VFP_FP__ 1
 
-// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=neon-fp-armv8 -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
-// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8 -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
-// CHECK-V8-BAREHP-NEON-FP: __ARM_FP 0xE
-// CHECK-V8-BAREHF-NEON-FP: __ARM_NEON__ 1
-// CHECK-V8-BAREHF-NEON-FP: __VFP_FP__ 1
+// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=neon-fp-armv8 -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
+// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8 -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
+// CHECK-V8-BAREHF-NEON-FP: #define __ARM_FP 0xE
+// CHECK-V8-BAREHF-NEON-FP: #define __ARM_NEON__ 1
+// CHECK-V8-BAREHF-NEON-FP: #define __VFP_FP__ 1
 
-// RUN: %clang -target armv8a -mnocrc -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-NOCRC %s
+// RUN: %clang -target armv8a -mnocrc -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-NOCRC %s
 // CHECK-V8-NOCRC-NOT: __ARM_FEATURE_CRC32 1
 
 // Check that -mhwdiv works properly for armv8/thumbv8 (enabled by default).
 
-// RUN: %clang -target armv8 -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8 %s
-// ARMV8:#define __ARM_ARCH_EXT_IDIV__ 1
+// RUN: %clang -target armv8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// RUN: %clang -target armv8 -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// RUN: %clang -target armv8-eabi -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// V8:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target armv8 -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8 %s
-// THUMBV8:#define __ARM_ARCH_EXT_IDIV__ 1
+// RUN: %clang -target armv8 -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// RUN: %clang -target armv8 -mthumb -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// RUN: %clang -target armv8 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// RUN: %clang -target armv8 -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// NOHWDIV-V8-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8-EABI %s
-// ARMV8-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
+// RUN: %clang -target armv8a -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// RUN: %clang -target armv8a -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// RUN: %clang -target armv8a-eabi -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// V8A:#define __ARM_ARCH_EXT_IDIV__ 1
+// V8A:#define __ARM_FP 0xE
 
-// RUN: %clang -target armv8-eabi -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8-EABI %s
-// THUMBV8-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
+// RUN: %clang -target armv8m.base-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8M_BASELINE %s
+// V8M_BASELINE: #define __ARM_ARCH 8
+// V8M_BASELINE: #define __ARM_ARCH_8M_BASE__ 1
+// V8M_BASELINE: #define __ARM_ARCH_EXT_IDIV__ 1
+// V8M_BASELINE-NOT: __ARM_ARCH_ISA_ARM
+// V8M_BASELINE: #define __ARM_ARCH_ISA_THUMB 1
+// V8M_BASELINE: #define __ARM_ARCH_PROFILE 'M'
+// V8M_BASELINE-NOT: __ARM_FEATURE_CRC32
+// V8M_BASELINE-NOT: __ARM_FEATURE_DSP
+// V8M_BASELINE-NOT: __ARM_FP 0x{{.*}}
+// V8M_BASELINE-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
 
-// RUN: %clang -target armv8 -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=NONEHWDIV-ARMV8 %s
-// NONEHWDIV-ARMV8-NOT:#define __ARM_ARCH_EXT_IDIV__
+// RUN: %clang -target armv8m.main-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8M_MAINLINE %s
+// V8M_MAINLINE: #define __ARM_ARCH 8
+// V8M_MAINLINE: #define __ARM_ARCH_8M_MAIN__ 1
+// V8M_MAINLINE: #define __ARM_ARCH_EXT_IDIV__ 1
+// V8M_MAINLINE-NOT: __ARM_ARCH_ISA_ARM
+// V8M_MAINLINE: #define __ARM_ARCH_ISA_THUMB 2
+// V8M_MAINLINE: #define __ARM_ARCH_PROFILE 'M'
+// V8M_MAINLINE-NOT: __ARM_FEATURE_CRC32
+// V8M_MAINLINE-NOT: __ARM_FEATURE_DSP
+// V8M_MAINLINE: #define __ARM_FP 0xE
+// V8M_MAINLINE: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
 
-// RUN: %clang -target armv8 -mthumb -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=NONEHWDIV-THUMBV8 %s
-// NONEHWDIV-THUMBV8-NOT:#define __ARM_ARCH_EXT_IDIV__
+// RUN: %clang -target arm-none-linux-gnu -march=armv8-m.main+dsp -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8M_MAINLINE_DSP %s
+// V8M_MAINLINE_DSP: #define __ARM_ARCH 8
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_8M_MAIN__ 1
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_EXT_IDIV__ 1
+// V8M_MAINLINE_DSP-NOT: __ARM_ARCH_ISA_ARM
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_ISA_THUMB 2
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_PROFILE 'M'
+// V8M_MAINLINE_DSP-NOT: __ARM_FEATURE_CRC32
+// V8M_MAINLINE_DSP: #define __ARM_FEATURE_DSP 1
+// V8M_MAINLINE_DSP: #define __ARM_FP 0xE
+// V8M_MAINLINE_DSP: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
 
-// RUN: %clang -target armv8 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-ARMV8 %s
-// THUMBHWDIV-ARMV8-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target armv8 -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-THUMBV8 %s
-// ARMHWDIV-THUMBV8-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target armv8a -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8A %s
-// ARMV8A:#define __ARM_ARCH_EXT_IDIV__ 1
-// ARMV8A: #define __ARM_FP 0xE
-
-// RUN: %clang -target armv8a -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8A %s
-// THUMBV8A:#define __ARM_ARCH_EXT_IDIV__ 1
-// THUMBV8A: #define __ARM_FP 0xE
-
-// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8A-EABI %s
-// ARMV8A-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
-// ARMV8A-EABI: #define __ARM_FP 0xE
-
-// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8A-EABI %s
-// THUMBV8A-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
-// THUMBV8A-EABI: #define __ARM_FP 0xE
-
-// RUN: %clang -target arm-none-linux-gnu -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DEFS %s
+// RUN: %clang -target arm-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-DEFS %s
 // CHECK-DEFS:#define __ARM_PCS 1
 // CHECK-DEFS:#define __ARM_SIZEOF_MINIMAL_ENUM 4
 // CHECK-DEFS:#define __ARM_SIZEOF_WCHAR_T 4
 
 // RUN: %clang -target arm-none-linux-gnu -fno-math-errno -fno-signed-zeros\
 // RUN:        -fno-trapping-math -fassociative-math -freciprocal-math\
-// RUN:        -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-FASTMATH %s
+// RUN:        -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FASTMATH %s
 // RUN: %clang -target arm-none-linux-gnu -ffast-math -x c -E -dM %s -o -\
-// RUN:        | FileCheck --check-prefix=CHECK-FASTMATH %s
-// CHECK-FASTMATH: __ARM_FP_FAST 1
+// RUN:        | FileCheck -match-full-lines --check-prefix=CHECK-FASTMATH %s
+// CHECK-FASTMATH: #define __ARM_FP_FAST 1
 
-// RUN: %clang -target arm-none-linux-gnu -fshort-wchar -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SHORTWCHAR %s
+// RUN: %clang -target arm-none-linux-gnu -fshort-wchar -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-SHORTWCHAR %s
 // CHECK-SHORTWCHAR:#define __ARM_SIZEOF_WCHAR_T 2
 
-// RUN: %clang -target arm-none-linux-gnu -fshort-enums -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SHORTENUMS %s
+// RUN: %clang -target arm-none-linux-gnu -fshort-enums -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-SHORTENUMS %s
 // CHECK-SHORTENUMS:#define __ARM_SIZEOF_MINIMAL_ENUM 1
 
 // Test that -mhwdiv has the right effect for a target CPU which has hwdiv enabled by default.
-// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-ARM %s
-// DEFAULTHWDIV-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
+// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// RUN: %clang -target armv7 -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// HWDIV:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-THUMB %s
-// DEFAULTHWDIV-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv7 -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-ARM %s
-// ARMHWDIV-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-THUMB %s
-// THUMBHWDIV-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-THUMBHWDIV-ARM %s
-// DEFAULTHWDIV-THUMBHWDIV-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-ARMHWDIV-THUMB %s
-// DEFAULTHWDIV-ARMHWDIV-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-NONEHWDIV-ARM %s
-// DEFAULTHWDIV-NONEHWDIV-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-NONEHWDIV-THUMB %s
-// DEFAULTHWDIV-NONEHWDIV-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
+// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// NOHWDIV-NOT:#define __ARM_ARCH_EXT_IDIV__
 
 
 // Check that -mfpu works properly for Cortex-A7 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A7 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A7 %s
 // DEFAULTFPU-A7:#define __ARM_FP 0xE
 // DEFAULTFPU-A7:#define __ARM_NEON__ 1
 // DEFAULTFPU-A7:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A7 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A7 %s
 // FPUNONE-A7-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A7-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A7-NOT:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A7 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A7 %s
 // NONEON-A7:#define __ARM_FP 0xE
 // NONEON-A7-NOT:#define __ARM_NEON__ 1
 // NONEON-A7:#define __ARM_VFPV4__ 1
 
 // Check that -mfpu works properly for Cortex-A5 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A5 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A5 %s
 // DEFAULTFPU-A5:#define __ARM_FP 0xE
 // DEFAULTFPU-A5:#define __ARM_NEON__ 1
 // DEFAULTFPU-A5:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A5 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A5 %s
 // FPUNONE-A5-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A5-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A5-NOT:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A5 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A5 %s
 // NONEON-A5:#define __ARM_FP 0xE
 // NONEON-A5-NOT:#define __ARM_NEON__ 1
 // NONEON-A5:#define __ARM_VFPV4__ 1
 
 // FIXME: add check for further predefines
 // Test whether predefines are as expected when targeting ep9312.
-// RUN: %clang -target armv4t -mcpu=ep9312 -x c -E -dM %s -o - | FileCheck --check-prefix=A4T %s
+// RUN: %clang -target armv4t -mcpu=ep9312 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A4T %s
 // A4T-NOT:#define __ARM_FEATURE_DSP
 // A4T-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting arm10tdmi.
-// RUN: %clang -target armv5 -mcpu=arm10tdmi -x c -E -dM %s -o - | FileCheck --check-prefix=A5T %s
+// RUN: %clang -target armv5 -mcpu=arm10tdmi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A5T %s
 // A5T-NOT:#define __ARM_FEATURE_DSP
 // A5T-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-a5.
-// RUN: %clang -target armv7 -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5-ARM %s
-// A5-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A5-ARM:#define __ARM_FEATURE_DSP
-// A5-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5-THUMB %s
-// A5-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A5-THUMB:#define __ARM_FEATURE_DSP
-// A5-THUMB:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5 %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5 %s
+// RUN: %clang -target armv7 -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A5 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A5 %s
 // A5:#define __ARM_ARCH 7
 // A5:#define __ARM_ARCH_7A__ 1
+// A5-NOT:#define __ARM_ARCH_EXT_IDIV__
 // A5:#define __ARM_ARCH_PROFILE 'A'
-// A5-NOT: #define __ARM_FEATURE_NUMERIC_MAXMIN
 // A5-NOT: #define __ARM_FEATURE_DIRECTED_ROUNDING
-// A5:#define __ARM_FEATURE_DSP
+// A5:#define __ARM_FEATURE_DSP 1
+// A5-NOT: #define __ARM_FEATURE_NUMERIC_MAXMIN
 // A5:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-a7.
-// RUN: %clang -target armv7k -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=A7 %s
-// RUN: %clang -target armv7k -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=A7 %s
+// RUN: %clang -target armv7k -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A7 %s
+// RUN: %clang -target armv7k -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A7 %s
 // A7:#define __ARM_ARCH 7
 // A7:#define __ARM_ARCH_EXT_IDIV__ 1
 // A7:#define __ARM_ARCH_PROFILE 'A'
-// A7:#define __ARM_FEATURE_DSP
+// A7:#define __ARM_FEATURE_DSP 1
 // A7:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-a7.
-// RUN: %clang -target x86_64-apple-darwin -arch armv7k -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV7K %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7k -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV7K %s
 // ARMV7K:#define __ARM_ARCH 7
 // ARMV7K:#define __ARM_ARCH_EXT_IDIV__ 1
 // ARMV7K:#define __ARM_ARCH_PROFILE 'A'
@@ -230,193 +224,179 @@
 
 
 // Test whether predefines are as expected when targeting cortex-a8.
-// RUN: %clang -target armv7 -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck --check-prefix=A8-ARM %s
-// A8-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A8-ARM:#define __ARM_FEATURE_DSP
-// A8-ARM:#define __ARM_FP 0xC
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck --check-prefix=A8-THUMB %s
-// A8-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A8-THUMB:#define __ARM_FEATURE_DSP
-// A8-THUMB:#define __ARM_FP 0xC
+// RUN: %clang -target armv7 -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A8 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A8 %s
+// A8-NOT:#define __ARM_ARCH_EXT_IDIV__
+// A8:#define __ARM_FEATURE_DSP 1
+// A8:#define __ARM_FP 0xC
 
 // Test whether predefines are as expected when targeting cortex-a9.
-// RUN: %clang -target armv7 -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck --check-prefix=A9-ARM %s
-// A9-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A9-ARM:#define __ARM_FEATURE_DSP
-// A9-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck --check-prefix=A9-THUMB %s
-// A9-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A9-THUMB:#define __ARM_FEATURE_DSP
-// A9-THUMB:#define __ARM_FP 0xE
+// RUN: %clang -target armv7 -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A9 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A9 %s
+// A9-NOT:#define __ARM_ARCH_EXT_IDIV__
+// A9:#define __ARM_FEATURE_DSP 1
+// A9:#define __ARM_FP 0xE
 
 
 // Check that -mfpu works properly for Cortex-A12 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A12 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A12 %s
 // DEFAULTFPU-A12:#define __ARM_FP 0xE
 // DEFAULTFPU-A12:#define __ARM_NEON__ 1
 // DEFAULTFPU-A12:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A12 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A12 %s
 // FPUNONE-A12-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A12-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A12-NOT:#define __ARM_VFPV4__ 1
 
 // Test whether predefines are as expected when targeting cortex-a12.
-// RUN: %clang -target armv7 -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=A12 %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=A12 %s
+// RUN: %clang -target armv7 -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A12 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A12 %s
 // A12:#define __ARM_ARCH 7
 // A12:#define __ARM_ARCH_7A__ 1
 // A12:#define __ARM_ARCH_EXT_IDIV__ 1
 // A12:#define __ARM_ARCH_PROFILE 'A'
-// A12:#define __ARM_FEATURE_DSP
+// A12:#define __ARM_FEATURE_DSP 1
 // A12:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-a15.
-// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=A15-ARM %s
-// A15-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// A15-ARM:#define __ARM_FEATURE_DSP
-// A15-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=A15-THUMB %s
-// A15-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// A15-THUMB:#define __ARM_FEATURE_DSP
-// A15-THUMB:#define __ARM_FP 0xE
+// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A15 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A15 %s
+// A15:#define __ARM_ARCH_EXT_IDIV__ 1
+// A15:#define __ARM_FEATURE_DSP 1
+// A15:#define __ARM_FP 0xE
 
 // Check that -mfpu works properly for Cortex-A17 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A17 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A17 %s
 // DEFAULTFPU-A17:#define __ARM_FP 0xE
 // DEFAULTFPU-A17:#define __ARM_NEON__ 1
 // DEFAULTFPU-A17:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A17 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A17 %s
 // FPUNONE-A17-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A17-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A17-NOT:#define __ARM_VFPV4__ 1
 
 // Test whether predefines are as expected when targeting cortex-a17.
-// RUN: %clang -target armv7 -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=A17 %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=A17 %s
+// RUN: %clang -target armv7 -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A17 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A17 %s
 // A17:#define __ARM_ARCH 7
 // A17:#define __ARM_ARCH_7A__ 1
 // A17:#define __ARM_ARCH_EXT_IDIV__ 1
 // A17:#define __ARM_ARCH_PROFILE 'A'
-// A17:#define __ARM_FEATURE_DSP
+// A17:#define __ARM_FEATURE_DSP 1
 // A17:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting swift.
-// RUN: %clang -target armv7s -mcpu=swift -x c -E -dM %s -o - | FileCheck --check-prefix=SWIFT-ARM %s
-// SWIFT-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// SWIFT-ARM:#define __ARM_FEATURE_DSP
-// SWIFT-ARM:#define __ARM_FP 0xE
+// RUN: %clang -target armv7s -mcpu=swift -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=SWIFT %s
+// RUN: %clang -target armv7s -mthumb -mcpu=swift -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=SWIFT %s
+// SWIFT:#define __ARM_ARCH_EXT_IDIV__ 1
+// SWIFT:#define __ARM_FEATURE_DSP 1
+// SWIFT:#define __ARM_FP 0xE
 
-// RUN: %clang -target armv7s -mthumb -mcpu=swift -x c -E -dM %s -o - | FileCheck --check-prefix=SWIFT-THUMB %s
-// SWIFT-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// SWIFT-THUMB:#define __ARM_FEATURE_DSP
-// SWIFT-THUMB:#define __ARM_FP 0xE
-
-// Test whether predefines are as expected when targeting cortex-a53.
-// RUN: %clang -target armv8 -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck --check-prefix=A53-ARM %s
-// A53-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// A53-ARM:#define __ARM_FEATURE_DSP
-// A53-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck --check-prefix=A53-THUMB %s
-// A53-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// A53-THUMB:#define __ARM_FEATURE_DSP
-// A53-THUMB:#define __ARM_FP 0xE
+// Test whether predefines are as expected when targeting ARMv8-A Cortex implementations.
+// RUN: %clang -target armv8 -mcpu=cortex-a32 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a32 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a35 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a35 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a57 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a57 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a72 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a72 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a73 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a73 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// ARMV8:#define __ARM_ARCH_EXT_IDIV__ 1
+// ARMV8:#define __ARM_FEATURE_DSP 1
+// ARMV8:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-r4.
-// RUN: %clang -target armv7 -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck --check-prefix=R4-ARM %s
+// RUN: %clang -target armv7 -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4-ARM %s
 // R4-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// R4-ARM:#define __ARM_FEATURE_DSP
+// R4-ARM:#define __ARM_FEATURE_DSP 1
 // R4-ARM-NOT:#define __ARM_FP 0x{{.*}}
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck --check-prefix=R4-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4-THUMB %s
 // R4-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// R4-THUMB:#define __ARM_FEATURE_DSP
+// R4-THUMB:#define __ARM_FEATURE_DSP 1
 // R4-THUMB-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-r4f.
-// RUN: %clang -target armv7 -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck --check-prefix=R4F-ARM %s
+// RUN: %clang -target armv7 -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4F-ARM %s
 // R4F-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// R4F-ARM:#define __ARM_FEATURE_DSP
+// R4F-ARM:#define __ARM_FEATURE_DSP 1
 // R4F-ARM:#define __ARM_FP 0xC
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck --check-prefix=R4F-THUMB %s
-// R4F-THUMBT:#define __ARM_ARCH_EXT_IDIV__ 1
-// R4F-THUMB:#define __ARM_FEATURE_DSP
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4F-THUMB %s
+// R4F-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
+// R4F-THUMB:#define __ARM_FEATURE_DSP 1
 // R4F-THUMB:#define __ARM_FP 0xC
 
 // Test whether predefines are as expected when targeting cortex-r5.
-// RUN: %clang -target armv7 -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck --check-prefix=R5-ARM %s
-// R5-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// R5-ARM:#define __ARM_FEATURE_DSP
-// R5-ARM:#define __ARM_FP 0xC
+// RUN: %clang -target armv7 -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R5 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R5 %s
+// R5:#define __ARM_ARCH_EXT_IDIV__ 1
+// R5:#define __ARM_FEATURE_DSP 1
+// R5:#define __ARM_FP 0xC
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck --check-prefix=R5-THUMB %s
-// R5-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// R5-THUMB:#define __ARM_FEATURE_DSP
-// R5-THUMB:#define __ARM_FP 0xC
-
-// Test whether predefines are as expected when targeting cortex-r7.
-// RUN: %clang -target armv7 -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck --check-prefix=R7-ARM %s
-// R7-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// R7-ARM:#define __ARM_FEATURE_DSP
-// R7-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck --check-prefix=R7-THUMB %s
-// R7-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// R7-THUMB:#define __ARM_FEATURE_DSP
-// R7-THUMB:#define __ARM_FP 0xE
+// Test whether predefines are as expected when targeting cortex-r7 and cortex-r8.
+// RUN: %clang -target armv7 -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// RUN: %clang -target armv7 -mcpu=cortex-r8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// R7-R8:#define __ARM_ARCH_EXT_IDIV__ 1
+// R7-R8:#define __ARM_FEATURE_DSP 1
+// R7-R8:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-m0.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0 -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0plus -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m1 -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=sc000 -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0plus -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m1 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=sc000 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
 // M0-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
 // M0-THUMB-NOT:#define __ARM_FEATURE_DSP
 // M0-THUMB-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-m3.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m3 -x c -E -dM %s -o - | FileCheck --check-prefix=M3-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=sc300 -x c -E -dM %s -o - | FileCheck --check-prefix=M3-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m3 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M3-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=sc300 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M3-THUMB %s
 // M3-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
 // M3-THUMB-NOT:#define __ARM_FEATURE_DSP
 // M3-THUMB-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-m4.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m4 -x c -E -dM %s -o - | FileCheck --check-prefix=M4-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M4-THUMB %s
 // M4-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// M4-THUMB:#define __ARM_FEATURE_DSP
+// M4-THUMB:#define __ARM_FEATURE_DSP 1
 // M4-THUMB:#define __ARM_FP 0x6
 
 // Test whether predefines are as expected when targeting cortex-m7.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m7 -x c -E -dM %s -o - | FileCheck --check-prefix=M7-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M7-THUMB %s
 // M7-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// M7-THUMB:#define __ARM_FEATURE_DSP
+// M7-THUMB:#define __ARM_FEATURE_DSP 1
 // M7-THUMB:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting krait.
-// RUN: %clang -target armv7 -mcpu=krait -x c -E -dM %s -o - | FileCheck --check-prefix=KRAIT-ARM %s
-// KRAIT-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// KRAIT-ARM:#define __ARM_FEATURE_DSP
-// KRAIT-ARM:#define  __ARM_VFPV4__ 1
+// RUN: %clang -target armv7 -mcpu=krait -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=KRAIT %s
+// RUN: %clang -target armv7 -mthumb -mcpu=krait -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=KRAIT %s
+// KRAIT:#define __ARM_ARCH_EXT_IDIV__ 1
+// KRAIT:#define __ARM_FEATURE_DSP 1
+// KRAIT:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7 -mthumb -mcpu=krait -x c -E -dM %s -o - | FileCheck --check-prefix=KRAIT-THUMB %s
-// KRAIT-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// KRAIT-THUMB:#define __ARM_FEATURE_DSP
-// KRAIT-THUMB:#define  __ARM_VFPV4__ 1
-
-// RUN: %clang -target armv8.1a-none-none-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V81A %s
-// CHECK-V81A: __ARM_ARCH 8
-// CHECK-V81A: __ARM_ARCH_8_1A__ 1
+// RUN: %clang -target armv8.1a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V81A %s
+// CHECK-V81A: #define __ARM_ARCH 8
+// CHECK-V81A: #define __ARM_ARCH_8_1A__ 1
 // CHECK-V81A: #define __ARM_ARCH_PROFILE 'A'
-// CHECK-V81A: __ARM_FEATURE_QRDMX 1
+// CHECK-V81A: #define __ARM_FEATURE_QRDMX 1
 // CHECK-V81A: #define __ARM_FP 0xE
+
+// RUN: %clang -target armv8.2a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V82A %s
+// CHECK-V82A: #define __ARM_ARCH 8
+// CHECK-V82A: #define __ARM_ARCH_8_2A__ 1
+// CHECK-V82A: #define __ARM_ARCH_PROFILE 'A'
+// CHECK-V82A: #define __ARM_FP 0xE
diff --git a/test/Preprocessor/bigoutput.c b/test/Preprocessor/bigoutput.c
new file mode 100644
index 0000000..c5e02cb
--- /dev/null
+++ b/test/Preprocessor/bigoutput.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -E -x c %s > /dev/tty
+// The original bug requires UNIX line endings to trigger.
+// The original bug triggers only when outputting directly to console.
+// REQUIRES: console
+
+// Make sure clang does not crash during preprocessing
+
+#define M0 extern int x;
+#define M2  M0  M0  M0  M0
+#define M4  M2  M2  M2  M2
+#define M6  M4  M4  M4  M4
+#define M8  M6  M6  M6  M6
+#define M10 M8  M8  M8  M8
+#define M12 M10 M10 M10 M10
+#define M14 M12 M12 M12 M12
+
+M14
diff --git a/test/Preprocessor/comment_save_macro.c b/test/Preprocessor/comment_save_macro.c
index 6ad759f..f32ba56 100644
--- a/test/Preprocessor/comment_save_macro.c
+++ b/test/Preprocessor/comment_save_macro.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -E -CC %s | FileCheck -check-prefix=CHECK-CC -strict-whitespace %s
 // CHECK-CC: boo bork /* blah*/ bar // zot
 
-// RUN: %clang_cc1 -E %s | FileCheck -check-prefix=CHECK -strict-whitespace %s
+// RUN: %clang_cc1 -E %s | FileCheck -strict-whitespace %s
 // CHECK: boo bork bar
 
 
diff --git a/test/Preprocessor/cuda-approx-transcendentals.cu b/test/Preprocessor/cuda-approx-transcendentals.cu
new file mode 100644
index 0000000..409eabb
--- /dev/null
+++ b/test/Preprocessor/cuda-approx-transcendentals.cu
@@ -0,0 +1,8 @@
+// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix HOST %s
+// RUN: %clang --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-NOFAST %s
+// RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+// RUN: %clang -ffast-math --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+
+// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
diff --git a/test/Preprocessor/cuda-preprocess.cu b/test/Preprocessor/cuda-preprocess.cu
index 369dfa2..84a7bcf 100644
--- a/test/Preprocessor/cuda-preprocess.cu
+++ b/test/Preprocessor/cuda-preprocess.cu
@@ -13,20 +13,20 @@
 
 // CHECK-NOT: PREPROCESSED_AWAY
 
-// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 -nocudainc -nocudalib %s 2>&1 \
 // RUN:   | FileCheck -check-prefix NOARCH %s
-// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-host-only %s 2>&1 \
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-host-only -nocudainc %s 2>&1 \
 // RUN:   | FileCheck -check-prefix NOARCH %s
 // NOARCH: clang_unittest_no_arch
 
-// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-device-only %s 2>&1 \
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-device-only -nocudainc -nocudalib %s 2>&1 \
 // RUN:   | FileCheck -check-prefix SM20 %s
 // SM20: clang_unittest_cuda_arch 200
 
-// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_30 --cuda-device-only %s 2>&1 \
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_30 --cuda-device-only -nocudainc -nocudalib %s 2>&1 \
 // RUN:   | FileCheck -check-prefix SM30 %s
 // SM30: clang_unittest_cuda_arch 300
 
 // RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-gpu-arch=sm_30 \
-// RUN:   --cuda-device-only %s 2>&1 \
+// RUN:   --cuda-device-only -nocudainc -nocudalib %s 2>&1 \
 // RUN:   | FileCheck -check-prefix SM20 -check-prefix SM30 %s
diff --git a/test/Preprocessor/cuda-types.cu b/test/Preprocessor/cuda-types.cu
new file mode 100644
index 0000000..32aa928
--- /dev/null
+++ b/test/Preprocessor/cuda-types.cu
@@ -0,0 +1,21 @@
+// Check that types, widths, etc. match on the host and device sides of CUDA
+// compilations.  Note that we filter out long double, as this is intentionally
+// different on host and device.
+
+// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/i386-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/i386-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/i386-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/i386-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/i386-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/i386-device-defines-filtered
+// RUN: diff %T/i386-host-defines-filtered %T/i386-device-defines-filtered
+
+// RUN: %clang --cuda-host-only -nocudainc -target x86_64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/x86_64-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -nocudalib -target x86_64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/x86_64-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/x86_64-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/x86_64-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/x86_64-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/x86_64-device-defines-filtered
+// RUN: diff %T/x86_64-host-defines-filtered %T/x86_64-device-defines-filtered
+
+// RUN: %clang --cuda-host-only -nocudainc -target powerpc64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/powerpc64-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -nocudalib -target powerpc64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/powerpc64-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/powerpc64-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/powerpc64-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/powerpc64-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/powerpc64-device-defines-filtered
+// RUN: diff %T/powerpc64-host-defines-filtered %T/powerpc64-device-defines-filtered
diff --git a/test/Preprocessor/elfiamcu-predefines.c b/test/Preprocessor/elfiamcu-predefines.c
index 7140c61..ea6824b 100644
--- a/test/Preprocessor/elfiamcu-predefines.c
+++ b/test/Preprocessor/elfiamcu-predefines.c
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -E -dM -triple i586-intel-elfiamcu | FileCheck %s
 
 // CHECK: #define __USER_LABEL_PREFIX__ {{$}}
+// CHECK: #define __WINT_TYPE__ unsigned int
 // CHECK: #define __iamcu
 // CHECK: #define __iamcu__
 
diff --git a/test/Preprocessor/expr_define_expansion.c b/test/Preprocessor/expr_define_expansion.c
index 3e5a2c4..23cb435 100644
--- a/test/Preprocessor/expr_define_expansion.c
+++ b/test/Preprocessor/expr_define_expansion.c
@@ -1,6 +1,28 @@
-// RUN: %clang_cc1 %s -E -CC -pedantic -verify
-// expected-no-diagnostics
+// RUN: %clang_cc1 %s -E -CC -verify
+// RUN: %clang_cc1 %s -E -CC -DPEDANTIC -pedantic -verify
 
 #define FOO && 1
 #if defined FOO FOO
 #endif
+
+#define A
+#define B defined(A)
+#if B // expected-warning{{macro expansion producing 'defined' has undefined behavior}}
+#endif
+
+#define m_foo
+#define TEST(a) (defined(m_##a) && a)
+
+#if defined(PEDANTIC)
+// expected-warning@+4{{macro expansion producing 'defined' has undefined behavior}}
+#endif
+
+// This shouldn't warn by default, only with pedantic:
+#if TEST(foo)
+#endif
+
+
+// Only one diagnostic for this case:
+#define INVALID defined(
+#if INVALID // expected-error{{macro name missing}}
+#endif
diff --git a/test/Preprocessor/expr_invalid_tok.c b/test/Preprocessor/expr_invalid_tok.c
index 5defcc5..0b97b25 100644
--- a/test/Preprocessor/expr_invalid_tok.c
+++ b/test/Preprocessor/expr_invalid_tok.c
@@ -1,15 +1,28 @@
-// RUN: not %clang_cc1 -E %s 2>&1 | grep 'invalid token at start of a preprocessor expression'
-// RUN: not %clang_cc1 -E %s 2>&1 | grep 'token is not a valid binary operator in a preprocessor subexpression'
-// RUN: not %clang_cc1 -E %s 2>&1 | grep ':14: error: expected end of line in preprocessor expression'
+// RUN: not %clang_cc1 -E %s 2>&1 | FileCheck %s
 // PR2220
 
+// CHECK: invalid token at start of a preprocessor expression
 #if 1 * * 2
 #endif
 
+// CHECK: token is not a valid binary operator in a preprocessor subexpression
 #if 4 [ 2
 #endif
 
 
 // PR2284 - The constant-expr production does not including comma.
+// CHECK: [[@LINE+1]]:14: error: expected end of line in preprocessor expression
 #if 1 ? 2 : 0, 1
 #endif
+
+// CHECK: [[@LINE+1]]:5: error: function-like macro 'FOO' is not defined
+#if FOO(1, 2, 3)
+#endif
+
+// CHECK: [[@LINE+1]]:9: error: function-like macro 'BAR' is not defined
+#if 1 + BAR(1, 2, 3)
+#endif
+
+// CHECK: [[@LINE+1]]:10: error: token is not a valid binary operator
+#if (FOO)(1, 2, 3)
+#endif
diff --git a/test/Preprocessor/feature_tests.c b/test/Preprocessor/feature_tests.c
index fbde6a6..52a1f17 100644
--- a/test/Preprocessor/feature_tests.c
+++ b/test/Preprocessor/feature_tests.c
@@ -55,8 +55,50 @@
 #endif
 
 #ifdef VERIFY
-// expected-error@+2 {{builtin feature check macro requires a parenthesized identifier}}
-// expected-error@+1 {{expected value in expression}}
+// expected-error@+1 {{builtin feature check macro requires a parenthesized identifier}}
 #if __has_feature('x')
 #endif
+
+// The following are not identifiers:
+_Static_assert(!__is_identifier("string"), "oops");
+_Static_assert(!__is_identifier('c'), "oops");
+_Static_assert(!__is_identifier(123), "oops");
+_Static_assert(!__is_identifier(int), "oops");
+
+// The following are:
+_Static_assert(__is_identifier(abc /* comment */), "oops");
+_Static_assert(__is_identifier /* comment */ (xyz), "oops");
+
+// expected-error@+1 {{too few arguments}}
+#if __is_identifier()
+#endif
+
+// expected-error@+1 {{too many arguments}}
+#if __is_identifier(,())
+#endif
+
+// expected-error@+1 {{missing ')' after 'abc'}} 
+#if __is_identifier(abc xyz) // expected-note {{to match this '('}}
+#endif
+
+// expected-error@+1 {{missing ')' after 'abc'}} 
+#if __is_identifier(abc())   // expected-note {{to match this '('}}
+#endif
+
+// expected-error@+1 {{missing ')' after '.'}} 
+#if __is_identifier(.abc)    // expected-note {{to match this '('}}
+#endif
+
+// expected-error@+1 {{nested parentheses not permitted in '__is_identifier'}} 
+#if __is_identifier((abc))
+#endif
+
+// expected-error@+1 {{missing '(' after '__is_identifier'}} expected-error@+1 {{expected value}}
+#if __is_identifier
+#endif
+
+// expected-error@+1 {{unterminated}} expected-error@+1 {{expected value}}
+#if __is_identifier(
+#endif
+
 #endif
diff --git a/test/Preprocessor/has_attribute.c b/test/Preprocessor/has_attribute.c
index 1a3c2a0..4970dc5 100644
--- a/test/Preprocessor/has_attribute.c
+++ b/test/Preprocessor/has_attribute.c
@@ -54,5 +54,5 @@
   int does_not_have_uuid
 #endif
 
-#if __has_cpp_attribute(selectany) // expected-error {{token is not a valid binary operator in a preprocessor subexpression}}
+#if __has_cpp_attribute(selectany) // expected-error {{function-like macro '__has_cpp_attribute' is not defined}}
 #endif
diff --git a/test/Preprocessor/has_attribute.cpp b/test/Preprocessor/has_attribute.cpp
index 1ab4502..2cfa005 100644
--- a/test/Preprocessor/has_attribute.cpp
+++ b/test/Preprocessor/has_attribute.cpp
@@ -52,6 +52,16 @@
   int has_cxx14_deprecated_vers();
 #endif
 
+// CHECK: has_cxx1z_nodiscard
+#if __has_cpp_attribute(nodiscard) == 201603
+  int has_cxx1z_nodiscard();
+#endif
+
+// CHECK: has_cxx1z_fallthrough
+#if __has_cpp_attribute(fallthrough) == 201603
+  int has_cxx1z_fallthrough();
+#endif
+
 // CHECK: has_declspec_uuid
 #if __has_declspec_attribute(uuid)
   int has_declspec_uuid();
diff --git a/test/Preprocessor/hexagon-predefines.c b/test/Preprocessor/hexagon-predefines.c
new file mode 100644
index 0000000..065ecc0
--- /dev/null
+++ b/test/Preprocessor/hexagon-predefines.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv5 %s | FileCheck %s -check-prefix CHECK-V5
+
+// CHECK-V5: #define __HEXAGON_ARCH__ 5
+// CHECK-V5: #define __HEXAGON_V5__ 1
+// CHECK-V5: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv55 %s | FileCheck %s -check-prefix CHECK-V55
+
+// CHECK-V55: #define __HEXAGON_ARCH__ 55
+// CHECK-V55: #define __HEXAGON_V55__ 1
+// CHECK-V55: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv60 %s | FileCheck %s -check-prefix CHECK-V60
+
+// CHECK-V60: #define __HEXAGON_ARCH__ 60
+// CHECK-V60: #define __HEXAGON_V60__ 1
+// CHECK-V60: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv60 -target-feature +hvx %s | FileCheck %s -check-prefix CHECK-V60HVX
+
+// CHECK-V60HVX: #define __HEXAGON_ARCH__ 60
+// CHECK-V60HVX: #define __HEXAGON_V60__ 1
+// CHECK-V60HVX: #define __HVX__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv60 -target-feature +hvx-double  %s | FileCheck %s -check-prefix CHECK-V60HVXD
+
+// CHECK-V60HVXD: #define __HEXAGON_ARCH__ 60
+// CHECK-V60HVXD: #define __HEXAGON_V60__ 1
+// CHECK-V60HVXD: #define __HVXDBL__ 1
+// CHECK-V60HVXD: #define __HVX__ 1
+// CHECK-V60HVXD: #define __hexagon__ 1
+
diff --git a/test/Preprocessor/init.c b/test/Preprocessor/init.c
index 8b07d7c..f7c320b 100644
--- a/test/Preprocessor/init.c
+++ b/test/Preprocessor/init.c
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -E -dM -x assembler-with-cpp < /dev/null | FileCheck -check-prefix ASM %s
+// RUN: %clang_cc1 -E -dM -x assembler-with-cpp < /dev/null | FileCheck -match-full-lines -check-prefix ASM %s
 //
 // ASM:#define __ASSEMBLER__ 1
 //
 //
-// RUN: %clang_cc1 -fblocks -E -dM < /dev/null | FileCheck -check-prefix BLOCKS %s
+// RUN: %clang_cc1 -fblocks -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix BLOCKS %s
 //
 // BLOCKS:#define __BLOCKS__ 1
 // BLOCKS:#define __block __attribute__((__blocks__(byref)))
 //
 //
-// RUN: %clang_cc1 -x c++ -std=c++1z -E -dM < /dev/null | FileCheck -check-prefix CXX1Z %s
+// RUN: %clang_cc1 -x c++ -std=c++1z -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX1Z %s
 //
-// CXX1Z:#define __GNUG__
+// CXX1Z:#define __GNUG__ {{.*}}
 // CXX1Z:#define __GXX_EXPERIMENTAL_CXX0X__ 1
 // CXX1Z:#define __GXX_RTTI 1
 // CXX1Z:#define __GXX_WEAK__ 1
@@ -19,9 +19,9 @@
 // CXX1Z:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=c++1y -E -dM < /dev/null | FileCheck -check-prefix CXX1Y %s
+// RUN: %clang_cc1 -x c++ -std=c++1y -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX1Y %s
 //
-// CXX1Y:#define __GNUG__
+// CXX1Y:#define __GNUG__ {{.*}}
 // CXX1Y:#define __GXX_EXPERIMENTAL_CXX0X__ 1
 // CXX1Y:#define __GXX_RTTI 1
 // CXX1Y:#define __GXX_WEAK__ 1
@@ -29,9 +29,9 @@
 // CXX1Y:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=c++11 -E -dM < /dev/null | FileCheck -check-prefix CXX11 %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX11 %s
 //
-// CXX11:#define __GNUG__
+// CXX11:#define __GNUG__ {{.*}}
 // CXX11:#define __GXX_EXPERIMENTAL_CXX0X__ 1
 // CXX11:#define __GXX_RTTI 1
 // CXX11:#define __GXX_WEAK__ 1
@@ -39,100 +39,113 @@
 // CXX11:#define __private_extern__ extern
 //
 // 
-// RUN: %clang_cc1 -x c++ -std=c++98 -E -dM < /dev/null | FileCheck -check-prefix CXX98 %s
+// RUN: %clang_cc1 -x c++ -std=c++98 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX98 %s
 // 
-// CXX98:#define __GNUG__
+// CXX98:#define __GNUG__ {{.*}}
 // CXX98:#define __GXX_RTTI 1
 // CXX98:#define __GXX_WEAK__ 1
 // CXX98:#define __cplusplus 199711L
 // CXX98:#define __private_extern__ extern
 //
 // 
-// RUN: %clang_cc1 -fdeprecated-macro -E -dM < /dev/null | FileCheck -check-prefix DEPRECATED %s
+// RUN: %clang_cc1 -fdeprecated-macro -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix DEPRECATED %s
 //
 // DEPRECATED:#define __DEPRECATED 1
 //
 // 
-// RUN: %clang_cc1 -std=c99 -E -dM < /dev/null | FileCheck -check-prefix C99 %s
+// RUN: %clang_cc1 -std=c99 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix C99 %s
 //
 // C99:#define __STDC_VERSION__ 199901L
 // C99:#define __STRICT_ANSI__ 1
+// C99-NOT: __GXX_EXPERIMENTAL_CXX0X__
+// C99-NOT: __GXX_RTTI
+// C99-NOT: __GXX_WEAK__
+// C99-NOT: __cplusplus
 //
 // 
-// RUN: %clang_cc1 -std=c11 -E -dM < /dev/null | FileCheck -check-prefix C11 %s
+// RUN: %clang_cc1 -std=c11 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix C11 %s
 //
 // C11:#define __STDC_UTF_16__ 1
 // C11:#define __STDC_UTF_32__ 1
 // C11:#define __STDC_VERSION__ 201112L
 // C11:#define __STRICT_ANSI__ 1
+// C11-NOT: __GXX_EXPERIMENTAL_CXX0X__
+// C11-NOT: __GXX_RTTI
+// C11-NOT: __GXX_WEAK__
+// C11-NOT: __cplusplus
 //
 // 
-// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -check-prefix COMMON %s
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix COMMON %s
 //
 // COMMON:#define __CONSTANT_CFSTRINGS__ 1
 // COMMON:#define __FINITE_MATH_ONLY__ 0
-// COMMON:#define __GNUC_MINOR__
-// COMMON:#define __GNUC_PATCHLEVEL__
+// COMMON:#define __GNUC_MINOR__ {{.*}}
+// COMMON:#define __GNUC_PATCHLEVEL__ {{.*}}
 // COMMON:#define __GNUC_STDC_INLINE__ 1
-// COMMON:#define __GNUC__
-// COMMON:#define __GXX_ABI_VERSION
+// COMMON:#define __GNUC__ {{.*}}
+// COMMON:#define __GXX_ABI_VERSION {{.*}}
 // COMMON:#define __ORDER_BIG_ENDIAN__ 4321
 // COMMON:#define __ORDER_LITTLE_ENDIAN__ 1234
 // COMMON:#define __ORDER_PDP_ENDIAN__ 3412
 // COMMON:#define __STDC_HOSTED__ 1
-// COMMON:#define __STDC_VERSION__ 201112L
 // COMMON:#define __STDC__ 1
-// COMMON:#define __VERSION__
+// COMMON:#define __VERSION__ {{.*}}
 // COMMON:#define __clang__ 1
 // COMMON:#define __clang_major__ {{[0-9]+}}
 // COMMON:#define __clang_minor__ {{[0-9]+}}
 // COMMON:#define __clang_patchlevel__ {{[0-9]+}}
-// COMMON:#define __clang_version__
+// COMMON:#define __clang_version__ {{.*}}
 // COMMON:#define __llvm__ 1
 //
+// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-win32 < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -triple=x86_64-apple-darwin < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -triple=armv7a-apple-darwin < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
 // 
-// RUN: %clang_cc1 -ffreestanding -E -dM < /dev/null | FileCheck -check-prefix FREESTANDING %s
+// C-DEFAULT:#define __STDC_VERSION__ 201112L
+//
+// RUN: %clang_cc1 -ffreestanding -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix FREESTANDING %s
 // FREESTANDING:#define __STDC_HOSTED__ 0
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++1z -E -dM < /dev/null | FileCheck -check-prefix GXX1Z %s
+// RUN: %clang_cc1 -x c++ -std=gnu++1z -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX1Z %s
 //
-// GXX1Z:#define __GNUG__
+// GXX1Z:#define __GNUG__ {{.*}}
 // GXX1Z:#define __GXX_WEAK__ 1
 // GXX1Z:#define __cplusplus 201406L
 // GXX1Z:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++1y -E -dM < /dev/null | FileCheck -check-prefix GXX1Y %s
+// RUN: %clang_cc1 -x c++ -std=gnu++1y -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX1Y %s
 //
-// GXX1Y:#define __GNUG__
+// GXX1Y:#define __GNUG__ {{.*}}
 // GXX1Y:#define __GXX_WEAK__ 1
 // GXX1Y:#define __cplusplus 201402L
 // GXX1Y:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++11 -E -dM < /dev/null | FileCheck -check-prefix GXX11 %s
+// RUN: %clang_cc1 -x c++ -std=gnu++11 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX11 %s
 //
-// GXX11:#define __GNUG__
+// GXX11:#define __GNUG__ {{.*}}
 // GXX11:#define __GXX_WEAK__ 1
 // GXX11:#define __cplusplus 201103L
 // GXX11:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++98 -E -dM < /dev/null | FileCheck -check-prefix GXX98 %s
+// RUN: %clang_cc1 -x c++ -std=gnu++98 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX98 %s
 //
-// GXX98:#define __GNUG__
+// GXX98:#define __GNUG__ {{.*}}
 // GXX98:#define __GXX_WEAK__ 1
 // GXX98:#define __cplusplus 199711L
 // GXX98:#define __private_extern__ extern
 //
 // 
-// RUN: %clang_cc1 -std=iso9899:199409 -E -dM < /dev/null | FileCheck -check-prefix C94 %s
+// RUN: %clang_cc1 -std=iso9899:199409 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix C94 %s
 //
 // C94:#define __STDC_VERSION__ 199409L
 //
 // 
-// RUN: %clang_cc1 -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -check-prefix MSEXT %s
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MSEXT %s
 //
 // MSEXT-NOT:#define __STDC__
 // MSEXT:#define _INTEGRAL_MAX_BITS 64
@@ -140,100 +153,100 @@
 // MSEXT-NOT:#define _WCHAR_T_DEFINED 1
 //
 //
-// RUN: %clang_cc1 -x c++ -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -check-prefix MSEXT-CXX %s
+// RUN: %clang_cc1 -x c++ -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MSEXT-CXX %s
 //
 // MSEXT-CXX:#define _NATIVE_WCHAR_T_DEFINED 1
 // MSEXT-CXX:#define _WCHAR_T_DEFINED 1
 // MSEXT-CXX:#define __BOOL_DEFINED 1
 //
 //
-// RUN: %clang_cc1 -x c++ -fno-wchar -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -check-prefix MSEXT-CXX-NOWCHAR %s
+// RUN: %clang_cc1 -x c++ -fno-wchar -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MSEXT-CXX-NOWCHAR %s
 //
 // MSEXT-CXX-NOWCHAR-NOT:#define _NATIVE_WCHAR_T_DEFINED 1
 // MSEXT-CXX-NOWCHAR-NOT:#define _WCHAR_T_DEFINED 1
 // MSEXT-CXX-NOWCHAR:#define __BOOL_DEFINED 1
 //
 // 
-// RUN: %clang_cc1 -x objective-c -E -dM < /dev/null | FileCheck -check-prefix OBJC %s
+// RUN: %clang_cc1 -x objective-c -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix OBJC %s
 //
 // OBJC:#define OBJC_NEW_PROPERTIES 1
 // OBJC:#define __NEXT_RUNTIME__ 1
 // OBJC:#define __OBJC__ 1
 //
 //
-// RUN: %clang_cc1 -x objective-c -fobjc-gc -E -dM < /dev/null | FileCheck -check-prefix OBJCGC %s
+// RUN: %clang_cc1 -x objective-c -fobjc-gc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix OBJCGC %s
 //
 // OBJCGC:#define __OBJC_GC__ 1
 //
 // 
-// RUN: %clang_cc1 -x objective-c -fobjc-exceptions -E -dM < /dev/null | FileCheck -check-prefix NONFRAGILE %s
+// RUN: %clang_cc1 -x objective-c -fobjc-exceptions -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix NONFRAGILE %s
 //
 // NONFRAGILE:#define OBJC_ZEROCOST_EXCEPTIONS 1
 // NONFRAGILE:#define __OBJC2__ 1
 //
 //
-// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -check-prefix O0 %s
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix O0 %s
 //
 // O0:#define __NO_INLINE__ 1
 // O0-NOT:#define __OPTIMIZE_SIZE__
 // O0-NOT:#define __OPTIMIZE__
 //
 //
-// RUN: %clang_cc1 -fno-inline -O3 -E -dM < /dev/null | FileCheck -check-prefix NO_INLINE %s
+// RUN: %clang_cc1 -fno-inline -O3 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix NO_INLINE %s
 //
 // NO_INLINE:#define __NO_INLINE__ 1
 // NO_INLINE-NOT:#define __OPTIMIZE_SIZE__
-// NO_INLINE:#define __OPTIMIZE__
+// NO_INLINE:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -O1 -E -dM < /dev/null | FileCheck -check-prefix O1 %s
+// RUN: %clang_cc1 -O1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix O1 %s
 //
 // O1-NOT:#define __OPTIMIZE_SIZE__
 // O1:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -Os -E -dM < /dev/null | FileCheck -check-prefix Os %s
+// RUN: %clang_cc1 -Os -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix Os %s
 //
 // Os:#define __OPTIMIZE_SIZE__ 1
 // Os:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -Oz -E -dM < /dev/null | FileCheck -check-prefix Oz %s
+// RUN: %clang_cc1 -Oz -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix Oz %s
 //
 // Oz:#define __OPTIMIZE_SIZE__ 1
 // Oz:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -fpascal-strings -E -dM < /dev/null | FileCheck -check-prefix PASCAL %s
+// RUN: %clang_cc1 -fpascal-strings -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix PASCAL %s
 //
 // PASCAL:#define __PASCAL_STRINGS__ 1
 //
 // 
-// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -check-prefix SCHAR %s
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix SCHAR %s
 // 
 // SCHAR:#define __STDC__ 1
 // SCHAR-NOT:#define __UNSIGNED_CHAR__
 // SCHAR:#define __clang__ 1
 //
-// RUN: %clang_cc1 -E -dM -fshort-wchar < /dev/null | FileCheck -check-prefix SHORTWCHAR %s
+// RUN: %clang_cc1 -E -dM -fshort-wchar < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR %s
 // wchar_t is u16 for targeting Win32.
 // FIXME: Implement and check x86_64-cygwin.
-// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-w64-mingw32 < /dev/null | FileCheck -check-prefix SHORTWCHAR %s
+// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-w64-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR %s
 //
 // SHORTWCHAR: #define __SIZEOF_WCHAR_T__ 2
 // SHORTWCHAR: #define __WCHAR_MAX__ 65535
 // SHORTWCHAR: #define __WCHAR_TYPE__ unsigned short
 // SHORTWCHAR: #define __WCHAR_WIDTH__ 16
 //
-// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=i686-unknown-unknown < /dev/null | FileCheck -check-prefix SHORTWCHAR2 %s
-// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-unknown-unknown < /dev/null | FileCheck -check-prefix SHORTWCHAR2 %s
+// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=i686-unknown-unknown < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR2 %s
+// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-unknown-unknown < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR2 %s
 //
 // SHORTWCHAR2: #define __SIZEOF_WCHAR_T__ 4
 // SHORTWCHAR2: #define __WCHAR_WIDTH__ 32
 // Other definitions vary from platform to platform
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-none-none < /dev/null | FileCheck -check-prefix AARCH64 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64-none-none < /dev/null | FileCheck -check-prefix AARCH64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 %s
 //
 // AARCH64:#define _LP64 1
 // AARCH64-NOT:#define __AARCH64EB__ 1
@@ -277,12 +290,12 @@
 // AARCH64:#define __FLT_MIN_EXP__ (-125)
 // AARCH64:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64:#define __FLT_RADIX__ 2
-// AARCH64:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64:#define __INT16_C_SUFFIX__
 // AARCH64:#define __INT16_FMTd__ "hd"
 // AARCH64:#define __INT16_FMTi__ "hi"
 // AARCH64:#define __INT16_MAX__ 32767
 // AARCH64:#define __INT16_TYPE__ short
-// AARCH64:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64:#define __INT32_C_SUFFIX__
 // AARCH64:#define __INT32_FMTd__ "d"
 // AARCH64:#define __INT32_FMTi__ "i"
 // AARCH64:#define __INT32_MAX__ 2147483647
@@ -292,7 +305,7 @@
 // AARCH64:#define __INT64_FMTi__ "li"
 // AARCH64:#define __INT64_MAX__ 9223372036854775807L
 // AARCH64:#define __INT64_TYPE__ long int
-// AARCH64:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64:#define __INT8_C_SUFFIX__
 // AARCH64:#define __INT8_FMTd__ "hhd"
 // AARCH64:#define __INT8_FMTi__ "hhi"
 // AARCH64:#define __INT8_MAX__ 127
@@ -380,7 +393,7 @@
 // AARCH64:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64:#define __SIZE_TYPE__ long unsigned int
 // AARCH64:#define __SIZE_WIDTH__ 64
-// AARCH64:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64:#define __UINT16_C_SUFFIX__
 // AARCH64:#define __UINT16_MAX__ 65535
 // AARCH64:#define __UINT16_TYPE__ unsigned short
 // AARCH64:#define __UINT32_C_SUFFIX__ U
@@ -389,7 +402,7 @@
 // AARCH64:#define __UINT64_C_SUFFIX__ UL
 // AARCH64:#define __UINT64_MAX__ 18446744073709551615UL
 // AARCH64:#define __UINT64_TYPE__ long unsigned int
-// AARCH64:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64:#define __UINT8_C_SUFFIX__
 // AARCH64:#define __UINT8_MAX__ 255
 // AARCH64:#define __UINT8_TYPE__ unsigned char
 // AARCH64:#define __UINTMAX_C_SUFFIX__ UL
@@ -415,7 +428,7 @@
 // AARCH64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // AARCH64:#define __UINT_LEAST8_MAX__ 255
 // AARCH64:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64:#define __USER_LABEL_PREFIX__ _
+// AARCH64:#define __USER_LABEL_PREFIX__
 // AARCH64:#define __WCHAR_MAX__ 4294967295U
 // AARCH64:#define __WCHAR_TYPE__ unsigned int
 // AARCH64:#define __WCHAR_UNSIGNED__ 1
@@ -424,7 +437,7 @@
 // AARCH64:#define __WINT_WIDTH__ 32
 // AARCH64:#define __aarch64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64_be-none-none < /dev/null | FileCheck -check-prefix AARCH64-BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64_be-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-BE %s
 //
 // AARCH64-BE:#define _LP64 1
 // AARCH64-BE:#define __AARCH64EB__ 1
@@ -468,12 +481,12 @@
 // AARCH64-BE:#define __FLT_MIN_EXP__ (-125)
 // AARCH64-BE:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-BE:#define __FLT_RADIX__ 2
-// AARCH64-BE:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __INT16_C_SUFFIX__
 // AARCH64-BE:#define __INT16_FMTd__ "hd"
 // AARCH64-BE:#define __INT16_FMTi__ "hi"
 // AARCH64-BE:#define __INT16_MAX__ 32767
 // AARCH64-BE:#define __INT16_TYPE__ short
-// AARCH64-BE:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __INT32_C_SUFFIX__
 // AARCH64-BE:#define __INT32_FMTd__ "d"
 // AARCH64-BE:#define __INT32_FMTi__ "i"
 // AARCH64-BE:#define __INT32_MAX__ 2147483647
@@ -483,7 +496,7 @@
 // AARCH64-BE:#define __INT64_FMTi__ "li"
 // AARCH64-BE:#define __INT64_MAX__ 9223372036854775807L
 // AARCH64-BE:#define __INT64_TYPE__ long int
-// AARCH64-BE:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __INT8_C_SUFFIX__
 // AARCH64-BE:#define __INT8_FMTd__ "hhd"
 // AARCH64-BE:#define __INT8_FMTi__ "hhi"
 // AARCH64-BE:#define __INT8_MAX__ 127
@@ -571,7 +584,7 @@
 // AARCH64-BE:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-BE:#define __SIZE_TYPE__ long unsigned int
 // AARCH64-BE:#define __SIZE_WIDTH__ 64
-// AARCH64-BE:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __UINT16_C_SUFFIX__
 // AARCH64-BE:#define __UINT16_MAX__ 65535
 // AARCH64-BE:#define __UINT16_TYPE__ unsigned short
 // AARCH64-BE:#define __UINT32_C_SUFFIX__ U
@@ -580,7 +593,7 @@
 // AARCH64-BE:#define __UINT64_C_SUFFIX__ UL
 // AARCH64-BE:#define __UINT64_MAX__ 18446744073709551615UL
 // AARCH64-BE:#define __UINT64_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __UINT8_C_SUFFIX__
 // AARCH64-BE:#define __UINT8_MAX__ 255
 // AARCH64-BE:#define __UINT8_TYPE__ unsigned char
 // AARCH64-BE:#define __UINTMAX_C_SUFFIX__ UL
@@ -606,7 +619,7 @@
 // AARCH64-BE:#define __UINT_LEAST64_TYPE__ long unsigned int
 // AARCH64-BE:#define __UINT_LEAST8_MAX__ 255
 // AARCH64-BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64-BE:#define __USER_LABEL_PREFIX__ _
+// AARCH64-BE:#define __USER_LABEL_PREFIX__
 // AARCH64-BE:#define __WCHAR_MAX__ 4294967295U
 // AARCH64-BE:#define __WCHAR_TYPE__ unsigned int
 // AARCH64-BE:#define __WCHAR_UNSIGNED__ 1
@@ -615,7 +628,7 @@
 // AARCH64-BE:#define __WINT_WIDTH__ 32
 // AARCH64-BE:#define __aarch64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-netbsd < /dev/null | FileCheck -check-prefix AARCH64-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-NETBSD %s
 //
 // AARCH64-NETBSD:#define _LP64 1
 // AARCH64-NETBSD-NOT:#define __AARCH64EB__ 1
@@ -660,12 +673,12 @@
 // AARCH64-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // AARCH64-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-NETBSD:#define __FLT_RADIX__ 2
-// AARCH64-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __INT16_C_SUFFIX__
 // AARCH64-NETBSD:#define __INT16_FMTd__ "hd"
 // AARCH64-NETBSD:#define __INT16_FMTi__ "hi"
 // AARCH64-NETBSD:#define __INT16_MAX__ 32767
 // AARCH64-NETBSD:#define __INT16_TYPE__ short
-// AARCH64-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __INT32_C_SUFFIX__
 // AARCH64-NETBSD:#define __INT32_FMTd__ "d"
 // AARCH64-NETBSD:#define __INT32_FMTi__ "i"
 // AARCH64-NETBSD:#define __INT32_MAX__ 2147483647
@@ -673,9 +686,9 @@
 // AARCH64-NETBSD:#define __INT64_C_SUFFIX__ LL
 // AARCH64-NETBSD:#define __INT64_FMTd__ "lld"
 // AARCH64-NETBSD:#define __INT64_FMTi__ "lli"
-// AARCH64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
+// AARCH64-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // AARCH64-NETBSD:#define __INT64_TYPE__ long long int
-// AARCH64-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __INT8_C_SUFFIX__
 // AARCH64-NETBSD:#define __INT8_FMTd__ "hhd"
 // AARCH64-NETBSD:#define __INT8_FMTi__ "hhi"
 // AARCH64-NETBSD:#define __INT8_MAX__ 127
@@ -764,7 +777,7 @@
 // AARCH64-NETBSD:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-NETBSD:#define __SIZE_TYPE__ long unsigned int
 // AARCH64-NETBSD:#define __SIZE_WIDTH__ 64
-// AARCH64-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __UINT16_C_SUFFIX__
 // AARCH64-NETBSD:#define __UINT16_MAX__ 65535
 // AARCH64-NETBSD:#define __UINT16_TYPE__ unsigned short
 // AARCH64-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -773,7 +786,7 @@
 // AARCH64-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // AARCH64-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // AARCH64-NETBSD:#define __UINT64_TYPE__ long long unsigned int
-// AARCH64-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __UINT8_C_SUFFIX__
 // AARCH64-NETBSD:#define __UINT8_MAX__ 255
 // AARCH64-NETBSD:#define __UINT8_TYPE__ unsigned char
 // AARCH64-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
@@ -807,7 +820,7 @@
 // AARCH64-NETBSD:#define __WINT_WIDTH__ 32
 // AARCH64-NETBSD:#define __aarch64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-freebsd11 < /dev/null | FileCheck -check-prefix AARCH64-FREEBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-freebsd11 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-FREEBSD %s
 //
 // AARCH64-FREEBSD:#define _LP64 1
 // AARCH64-FREEBSD-NOT:#define __AARCH64EB__ 1
@@ -853,12 +866,12 @@
 // AARCH64-FREEBSD:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-FREEBSD:#define __FLT_RADIX__ 2
 // AARCH64-FREEBSD:#define __FreeBSD__ 11
-// AARCH64-FREEBSD:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __INT16_C_SUFFIX__
 // AARCH64-FREEBSD:#define __INT16_FMTd__ "hd"
 // AARCH64-FREEBSD:#define __INT16_FMTi__ "hi"
 // AARCH64-FREEBSD:#define __INT16_MAX__ 32767
 // AARCH64-FREEBSD:#define __INT16_TYPE__ short
-// AARCH64-FREEBSD:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __INT32_C_SUFFIX__
 // AARCH64-FREEBSD:#define __INT32_FMTd__ "d"
 // AARCH64-FREEBSD:#define __INT32_FMTi__ "i"
 // AARCH64-FREEBSD:#define __INT32_MAX__ 2147483647
@@ -868,7 +881,7 @@
 // AARCH64-FREEBSD:#define __INT64_FMTi__ "li"
 // AARCH64-FREEBSD:#define __INT64_MAX__ 9223372036854775807L
 // AARCH64-FREEBSD:#define __INT64_TYPE__ long int
-// AARCH64-FREEBSD:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __INT8_C_SUFFIX__
 // AARCH64-FREEBSD:#define __INT8_FMTd__ "hhd"
 // AARCH64-FREEBSD:#define __INT8_FMTi__ "hhi"
 // AARCH64-FREEBSD:#define __INT8_MAX__ 127
@@ -957,7 +970,7 @@
 // AARCH64-FREEBSD:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-FREEBSD:#define __SIZE_TYPE__ long unsigned int
 // AARCH64-FREEBSD:#define __SIZE_WIDTH__ 64
-// AARCH64-FREEBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __UINT16_C_SUFFIX__
 // AARCH64-FREEBSD:#define __UINT16_MAX__ 65535
 // AARCH64-FREEBSD:#define __UINT16_TYPE__ unsigned short
 // AARCH64-FREEBSD:#define __UINT32_C_SUFFIX__ U
@@ -966,7 +979,7 @@
 // AARCH64-FREEBSD:#define __UINT64_C_SUFFIX__ UL
 // AARCH64-FREEBSD:#define __UINT64_MAX__ 18446744073709551615UL
 // AARCH64-FREEBSD:#define __UINT64_TYPE__ long unsigned int
-// AARCH64-FREEBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __UINT8_C_SUFFIX__
 // AARCH64-FREEBSD:#define __UINT8_MAX__ 255
 // AARCH64-FREEBSD:#define __UINT8_TYPE__ unsigned char
 // AARCH64-FREEBSD:#define __UINTMAX_C_SUFFIX__ UL
@@ -1001,7 +1014,7 @@
 // AARCH64-FREEBSD:#define __WINT_WIDTH__ 32
 // AARCH64-FREEBSD:#define __aarch64__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-apple-ios7.0 < /dev/null | FileCheck -check-prefix AARCH64-DARWIN %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-apple-ios7.0 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-DARWIN %s
 //
 // AARCH64-DARWIN: #define _LP64 1
 // AARCH64-NOT: #define __AARCH64EB__ 1
@@ -1045,22 +1058,22 @@
 // AARCH64-DARWIN: #define __FLT_MIN_EXP__ (-125)
 // AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-DARWIN: #define __FLT_RADIX__ 2
-// AARCH64-DARWIN: #define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __INT16_C_SUFFIX__
 // AARCH64-DARWIN: #define __INT16_FMTd__ "hd"
 // AARCH64-DARWIN: #define __INT16_FMTi__ "hi"
 // AARCH64-DARWIN: #define __INT16_MAX__ 32767
 // AARCH64-DARWIN: #define __INT16_TYPE__ short
-// AARCH64-DARWIN: #define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __INT32_C_SUFFIX__
 // AARCH64-DARWIN: #define __INT32_FMTd__ "d"
 // AARCH64-DARWIN: #define __INT32_FMTi__ "i"
 // AARCH64-DARWIN: #define __INT32_MAX__ 2147483647
 // AARCH64-DARWIN: #define __INT32_TYPE__ int
-// AARCH64-DARWIN: #define __INT64_C_SUFFIX__ L
+// AARCH64-DARWIN: #define __INT64_C_SUFFIX__ LL
 // AARCH64-DARWIN: #define __INT64_FMTd__ "lld"
 // AARCH64-DARWIN: #define __INT64_FMTi__ "lli"
-// AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807L
+// AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807LL
 // AARCH64-DARWIN: #define __INT64_TYPE__ long long int
-// AARCH64-DARWIN: #define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __INT8_C_SUFFIX__
 // AARCH64-DARWIN: #define __INT8_FMTd__ "hhd"
 // AARCH64-DARWIN: #define __INT8_FMTi__ "hhi"
 // AARCH64-DARWIN: #define __INT8_MAX__ 127
@@ -1148,16 +1161,16 @@
 // AARCH64-DARWIN: #define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-DARWIN: #define __SIZE_TYPE__ long unsigned int
 // AARCH64-DARWIN: #define __SIZE_WIDTH__ 64
-// AARCH64-DARWIN: #define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __UINT16_C_SUFFIX__
 // AARCH64-DARWIN: #define __UINT16_MAX__ 65535
 // AARCH64-DARWIN: #define __UINT16_TYPE__ unsigned short
 // AARCH64-DARWIN: #define __UINT32_C_SUFFIX__ U
 // AARCH64-DARWIN: #define __UINT32_MAX__ 4294967295U
 // AARCH64-DARWIN: #define __UINT32_TYPE__ unsigned int
-// AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ UL
-// AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615UL
+// AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ ULL
+// AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615ULL
 // AARCH64-DARWIN: #define __UINT64_TYPE__ long long unsigned int
-// AARCH64-DARWIN: #define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __UINT8_C_SUFFIX__
 // AARCH64-DARWIN: #define __UINT8_MAX__ 255
 // AARCH64-DARWIN: #define __UINT8_TYPE__ unsigned char
 // AARCH64-DARWIN: #define __UINTMAX_C_SUFFIX__ UL
@@ -1192,7 +1205,7 @@
 // AARCH64-DARWIN: #define __WINT_WIDTH__ 32
 // AARCH64-DARWIN: #define __aarch64__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-none < /dev/null | FileCheck -check-prefix ARM %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARM %s
 //
 // ARM-NOT:#define _LP64
 // ARM:#define __APCS_32__ 1
@@ -1234,12 +1247,12 @@
 // ARM:#define __FLT_MIN_EXP__ (-125)
 // ARM:#define __FLT_MIN__ 1.17549435e-38F
 // ARM:#define __FLT_RADIX__ 2
-// ARM:#define __INT16_C_SUFFIX__ {{$}}
+// ARM:#define __INT16_C_SUFFIX__
 // ARM:#define __INT16_FMTd__ "hd"
 // ARM:#define __INT16_FMTi__ "hi"
 // ARM:#define __INT16_MAX__ 32767
 // ARM:#define __INT16_TYPE__ short
-// ARM:#define __INT32_C_SUFFIX__ {{$}}
+// ARM:#define __INT32_C_SUFFIX__
 // ARM:#define __INT32_FMTd__ "d"
 // ARM:#define __INT32_FMTi__ "i"
 // ARM:#define __INT32_MAX__ 2147483647
@@ -1249,7 +1262,7 @@
 // ARM:#define __INT64_FMTi__ "lli"
 // ARM:#define __INT64_MAX__ 9223372036854775807LL
 // ARM:#define __INT64_TYPE__ long long int
-// ARM:#define __INT8_C_SUFFIX__ {{$}}
+// ARM:#define __INT8_C_SUFFIX__
 // ARM:#define __INT8_FMTd__ "hhd"
 // ARM:#define __INT8_FMTi__ "hhi"
 // ARM:#define __INT8_MAX__ 127
@@ -1338,7 +1351,7 @@
 // ARM:#define __SIZE_MAX__ 4294967295U
 // ARM:#define __SIZE_TYPE__ unsigned int
 // ARM:#define __SIZE_WIDTH__ 32
-// ARM:#define __UINT16_C_SUFFIX__ {{$}}
+// ARM:#define __UINT16_C_SUFFIX__
 // ARM:#define __UINT16_MAX__ 65535
 // ARM:#define __UINT16_TYPE__ unsigned short
 // ARM:#define __UINT32_C_SUFFIX__ U
@@ -1347,14 +1360,14 @@
 // ARM:#define __UINT64_C_SUFFIX__ ULL
 // ARM:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARM:#define __UINT64_TYPE__ long long unsigned int
-// ARM:#define __UINT8_C_SUFFIX__ {{$}}
+// ARM:#define __UINT8_C_SUFFIX__
 // ARM:#define __UINT8_MAX__ 255
 // ARM:#define __UINT8_TYPE__ unsigned char
 // ARM:#define __UINTMAX_C_SUFFIX__ ULL
 // ARM:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARM:#define __UINTMAX_TYPE__ long long unsigned int
 // ARM:#define __UINTMAX_WIDTH__ 64
-// ARM:#define __UINTPTR_MAX__ 4294967295U
+// ARM:#define __UINTPTR_MAX__ 4294967295UL
 // ARM:#define __UINTPTR_TYPE__ long unsigned int
 // ARM:#define __UINTPTR_WIDTH__ 32
 // ARM:#define __UINT_FAST16_MAX__ 65535
@@ -1373,7 +1386,7 @@
 // ARM:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // ARM:#define __UINT_LEAST8_MAX__ 255
 // ARM:#define __UINT_LEAST8_TYPE__ unsigned char
-// ARM:#define __USER_LABEL_PREFIX__ _
+// ARM:#define __USER_LABEL_PREFIX__
 // ARM:#define __WCHAR_MAX__ 4294967295U
 // ARM:#define __WCHAR_TYPE__ unsigned int
 // ARM:#define __WCHAR_WIDTH__ 32
@@ -1382,7 +1395,7 @@
 // ARM:#define __arm 1
 // ARM:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armeb-none-none < /dev/null | FileCheck -check-prefix ARM-BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armeb-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARM-BE %s
 //
 // ARM-BE-NOT:#define _LP64
 // ARM-BE:#define __APCS_32__ 1
@@ -1425,12 +1438,12 @@
 // ARM-BE:#define __FLT_MIN_EXP__ (-125)
 // ARM-BE:#define __FLT_MIN__ 1.17549435e-38F
 // ARM-BE:#define __FLT_RADIX__ 2
-// ARM-BE:#define __INT16_C_SUFFIX__ {{$}}
+// ARM-BE:#define __INT16_C_SUFFIX__
 // ARM-BE:#define __INT16_FMTd__ "hd"
 // ARM-BE:#define __INT16_FMTi__ "hi"
 // ARM-BE:#define __INT16_MAX__ 32767
 // ARM-BE:#define __INT16_TYPE__ short
-// ARM-BE:#define __INT32_C_SUFFIX__ {{$}}
+// ARM-BE:#define __INT32_C_SUFFIX__
 // ARM-BE:#define __INT32_FMTd__ "d"
 // ARM-BE:#define __INT32_FMTi__ "i"
 // ARM-BE:#define __INT32_MAX__ 2147483647
@@ -1440,7 +1453,7 @@
 // ARM-BE:#define __INT64_FMTi__ "lli"
 // ARM-BE:#define __INT64_MAX__ 9223372036854775807LL
 // ARM-BE:#define __INT64_TYPE__ long long int
-// ARM-BE:#define __INT8_C_SUFFIX__ {{$}}
+// ARM-BE:#define __INT8_C_SUFFIX__
 // ARM-BE:#define __INT8_FMTd__ "hhd"
 // ARM-BE:#define __INT8_FMTi__ "hhi"
 // ARM-BE:#define __INT8_MAX__ 127
@@ -1528,7 +1541,7 @@
 // ARM-BE:#define __SIZE_MAX__ 4294967295U
 // ARM-BE:#define __SIZE_TYPE__ unsigned int
 // ARM-BE:#define __SIZE_WIDTH__ 32
-// ARM-BE:#define __UINT16_C_SUFFIX__ {{$}}
+// ARM-BE:#define __UINT16_C_SUFFIX__
 // ARM-BE:#define __UINT16_MAX__ 65535
 // ARM-BE:#define __UINT16_TYPE__ unsigned short
 // ARM-BE:#define __UINT32_C_SUFFIX__ U
@@ -1537,14 +1550,14 @@
 // ARM-BE:#define __UINT64_C_SUFFIX__ ULL
 // ARM-BE:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARM-BE:#define __UINT64_TYPE__ long long unsigned int
-// ARM-BE:#define __UINT8_C_SUFFIX__ {{$}}
+// ARM-BE:#define __UINT8_C_SUFFIX__
 // ARM-BE:#define __UINT8_MAX__ 255
 // ARM-BE:#define __UINT8_TYPE__ unsigned char
 // ARM-BE:#define __UINTMAX_C_SUFFIX__ ULL
 // ARM-BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARM-BE:#define __UINTMAX_TYPE__ long long unsigned int
 // ARM-BE:#define __UINTMAX_WIDTH__ 64
-// ARM-BE:#define __UINTPTR_MAX__ 4294967295U
+// ARM-BE:#define __UINTPTR_MAX__ 4294967295UL
 // ARM-BE:#define __UINTPTR_TYPE__ long unsigned int
 // ARM-BE:#define __UINTPTR_WIDTH__ 32
 // ARM-BE:#define __UINT_FAST16_MAX__ 65535
@@ -1563,7 +1576,7 @@
 // ARM-BE:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // ARM-BE:#define __UINT_LEAST8_MAX__ 255
 // ARM-BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// ARM-BE:#define __USER_LABEL_PREFIX__ _
+// ARM-BE:#define __USER_LABEL_PREFIX__
 // ARM-BE:#define __WCHAR_MAX__ 4294967295U
 // ARM-BE:#define __WCHAR_TYPE__ unsigned int
 // ARM-BE:#define __WCHAR_WIDTH__ 32
@@ -1572,7 +1585,7 @@
 // ARM-BE:#define __arm 1
 // ARM-BE:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi -target-feature +soft-float -target-feature +soft-float-abi < /dev/null | FileCheck -check-prefix ARMEABISOFTFP %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi -target-feature +soft-float -target-feature +soft-float-abi < /dev/null | FileCheck -match-full-lines -check-prefix ARMEABISOFTFP %s
 //
 // ARMEABISOFTFP-NOT:#define _LP64
 // ARMEABISOFTFP:#define __APCS_32__ 1
@@ -1618,12 +1631,12 @@
 // ARMEABISOFTFP:#define __FLT_MIN_EXP__ (-125)
 // ARMEABISOFTFP:#define __FLT_MIN__ 1.17549435e-38F
 // ARMEABISOFTFP:#define __FLT_RADIX__ 2
-// ARMEABISOFTFP:#define __INT16_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __INT16_C_SUFFIX__
 // ARMEABISOFTFP:#define __INT16_FMTd__ "hd"
 // ARMEABISOFTFP:#define __INT16_FMTi__ "hi"
 // ARMEABISOFTFP:#define __INT16_MAX__ 32767
 // ARMEABISOFTFP:#define __INT16_TYPE__ short
-// ARMEABISOFTFP:#define __INT32_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __INT32_C_SUFFIX__
 // ARMEABISOFTFP:#define __INT32_FMTd__ "d"
 // ARMEABISOFTFP:#define __INT32_FMTi__ "i"
 // ARMEABISOFTFP:#define __INT32_MAX__ 2147483647
@@ -1633,7 +1646,7 @@
 // ARMEABISOFTFP:#define __INT64_FMTi__ "lli"
 // ARMEABISOFTFP:#define __INT64_MAX__ 9223372036854775807LL
 // ARMEABISOFTFP:#define __INT64_TYPE__ long long int
-// ARMEABISOFTFP:#define __INT8_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __INT8_C_SUFFIX__
 // ARMEABISOFTFP:#define __INT8_FMTd__ "hhd"
 // ARMEABISOFTFP:#define __INT8_FMTi__ "hhi"
 // ARMEABISOFTFP:#define __INT8_MAX__ 127
@@ -1723,7 +1736,7 @@
 // ARMEABISOFTFP:#define __SIZE_TYPE__ unsigned int
 // ARMEABISOFTFP:#define __SIZE_WIDTH__ 32
 // ARMEABISOFTFP:#define __SOFTFP__ 1
-// ARMEABISOFTFP:#define __UINT16_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __UINT16_C_SUFFIX__
 // ARMEABISOFTFP:#define __UINT16_MAX__ 65535
 // ARMEABISOFTFP:#define __UINT16_TYPE__ unsigned short
 // ARMEABISOFTFP:#define __UINT32_C_SUFFIX__ U
@@ -1732,14 +1745,14 @@
 // ARMEABISOFTFP:#define __UINT64_C_SUFFIX__ ULL
 // ARMEABISOFTFP:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARMEABISOFTFP:#define __UINT64_TYPE__ long long unsigned int
-// ARMEABISOFTFP:#define __UINT8_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __UINT8_C_SUFFIX__
 // ARMEABISOFTFP:#define __UINT8_MAX__ 255
 // ARMEABISOFTFP:#define __UINT8_TYPE__ unsigned char
 // ARMEABISOFTFP:#define __UINTMAX_C_SUFFIX__ ULL
 // ARMEABISOFTFP:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARMEABISOFTFP:#define __UINTMAX_TYPE__ long long unsigned int
 // ARMEABISOFTFP:#define __UINTMAX_WIDTH__ 64
-// ARMEABISOFTFP:#define __UINTPTR_MAX__ 4294967295U
+// ARMEABISOFTFP:#define __UINTPTR_MAX__ 4294967295UL
 // ARMEABISOFTFP:#define __UINTPTR_TYPE__ long unsigned int
 // ARMEABISOFTFP:#define __UINTPTR_WIDTH__ 32
 // ARMEABISOFTFP:#define __UINT_FAST16_MAX__ 65535
@@ -1767,7 +1780,7 @@
 // ARMEABISOFTFP:#define __arm 1
 // ARMEABISOFTFP:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi < /dev/null | FileCheck -check-prefix ARMEABIHARDFP %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi < /dev/null | FileCheck -match-full-lines -check-prefix ARMEABIHARDFP %s
 //
 // ARMEABIHARDFP-NOT:#define _LP64
 // ARMEABIHARDFP:#define __APCS_32__ 1
@@ -1813,12 +1826,12 @@
 // ARMEABIHARDFP:#define __FLT_MIN_EXP__ (-125)
 // ARMEABIHARDFP:#define __FLT_MIN__ 1.17549435e-38F
 // ARMEABIHARDFP:#define __FLT_RADIX__ 2
-// ARMEABIHARDFP:#define __INT16_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __INT16_C_SUFFIX__
 // ARMEABIHARDFP:#define __INT16_FMTd__ "hd"
 // ARMEABIHARDFP:#define __INT16_FMTi__ "hi"
 // ARMEABIHARDFP:#define __INT16_MAX__ 32767
 // ARMEABIHARDFP:#define __INT16_TYPE__ short
-// ARMEABIHARDFP:#define __INT32_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __INT32_C_SUFFIX__
 // ARMEABIHARDFP:#define __INT32_FMTd__ "d"
 // ARMEABIHARDFP:#define __INT32_FMTi__ "i"
 // ARMEABIHARDFP:#define __INT32_MAX__ 2147483647
@@ -1828,7 +1841,7 @@
 // ARMEABIHARDFP:#define __INT64_FMTi__ "lli"
 // ARMEABIHARDFP:#define __INT64_MAX__ 9223372036854775807LL
 // ARMEABIHARDFP:#define __INT64_TYPE__ long long int
-// ARMEABIHARDFP:#define __INT8_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __INT8_C_SUFFIX__
 // ARMEABIHARDFP:#define __INT8_FMTd__ "hhd"
 // ARMEABIHARDFP:#define __INT8_FMTi__ "hhi"
 // ARMEABIHARDFP:#define __INT8_MAX__ 127
@@ -1918,7 +1931,7 @@
 // ARMEABIHARDFP:#define __SIZE_TYPE__ unsigned int
 // ARMEABIHARDFP:#define __SIZE_WIDTH__ 32
 // ARMEABIHARDFP-NOT:#define __SOFTFP__ 1
-// ARMEABIHARDFP:#define __UINT16_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __UINT16_C_SUFFIX__
 // ARMEABIHARDFP:#define __UINT16_MAX__ 65535
 // ARMEABIHARDFP:#define __UINT16_TYPE__ unsigned short
 // ARMEABIHARDFP:#define __UINT32_C_SUFFIX__ U
@@ -1927,14 +1940,14 @@
 // ARMEABIHARDFP:#define __UINT64_C_SUFFIX__ ULL
 // ARMEABIHARDFP:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARMEABIHARDFP:#define __UINT64_TYPE__ long long unsigned int
-// ARMEABIHARDFP:#define __UINT8_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __UINT8_C_SUFFIX__
 // ARMEABIHARDFP:#define __UINT8_MAX__ 255
 // ARMEABIHARDFP:#define __UINT8_TYPE__ unsigned char
 // ARMEABIHARDFP:#define __UINTMAX_C_SUFFIX__ ULL
 // ARMEABIHARDFP:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARMEABIHARDFP:#define __UINTMAX_TYPE__ long long unsigned int
 // ARMEABIHARDFP:#define __UINTMAX_WIDTH__ 64
-// ARMEABIHARDFP:#define __UINTPTR_MAX__ 4294967295U
+// ARMEABIHARDFP:#define __UINTPTR_MAX__ 4294967295UL
 // ARMEABIHARDFP:#define __UINTPTR_TYPE__ long unsigned int
 // ARMEABIHARDFP:#define __UINTPTR_WIDTH__ 32
 // ARMEABIHARDFP:#define __UINT_FAST16_MAX__ 65535
@@ -1962,7 +1975,7 @@
 // ARMEABIHARDFP:#define __arm 1
 // ARMEABIHARDFP:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-netbsd-eabi < /dev/null | FileCheck -check-prefix ARM-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-netbsd-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NETBSD %s
 //
 // ARM-NETBSD-NOT:#define _LP64
 // ARM-NETBSD:#define __APCS_32__ 1
@@ -2006,12 +2019,12 @@
 // ARM-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // ARM-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // ARM-NETBSD:#define __FLT_RADIX__ 2
-// ARM-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __INT16_C_SUFFIX__
 // ARM-NETBSD:#define __INT16_FMTd__ "hd"
 // ARM-NETBSD:#define __INT16_FMTi__ "hi"
 // ARM-NETBSD:#define __INT16_MAX__ 32767
 // ARM-NETBSD:#define __INT16_TYPE__ short
-// ARM-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __INT32_C_SUFFIX__
 // ARM-NETBSD:#define __INT32_FMTd__ "d"
 // ARM-NETBSD:#define __INT32_FMTi__ "i"
 // ARM-NETBSD:#define __INT32_MAX__ 2147483647
@@ -2021,7 +2034,7 @@
 // ARM-NETBSD:#define __INT64_FMTi__ "lli"
 // ARM-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // ARM-NETBSD:#define __INT64_TYPE__ long long int
-// ARM-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __INT8_C_SUFFIX__
 // ARM-NETBSD:#define __INT8_FMTd__ "hhd"
 // ARM-NETBSD:#define __INT8_FMTi__ "hhi"
 // ARM-NETBSD:#define __INT8_MAX__ 127
@@ -2107,10 +2120,10 @@
 // ARM-NETBSD:#define __SIZEOF_SIZE_T__ 4
 // ARM-NETBSD:#define __SIZEOF_WCHAR_T__ 4
 // ARM-NETBSD:#define __SIZEOF_WINT_T__ 4
-// ARM-NETBSD:#define __SIZE_MAX__ 4294967295U
+// ARM-NETBSD:#define __SIZE_MAX__ 4294967295UL
 // ARM-NETBSD:#define __SIZE_TYPE__ long unsigned int
 // ARM-NETBSD:#define __SIZE_WIDTH__ 32
-// ARM-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __UINT16_C_SUFFIX__
 // ARM-NETBSD:#define __UINT16_MAX__ 65535
 // ARM-NETBSD:#define __UINT16_TYPE__ unsigned short
 // ARM-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -2119,14 +2132,14 @@
 // ARM-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // ARM-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARM-NETBSD:#define __UINT64_TYPE__ long long unsigned int
-// ARM-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __UINT8_C_SUFFIX__
 // ARM-NETBSD:#define __UINT8_MAX__ 255
 // ARM-NETBSD:#define __UINT8_TYPE__ unsigned char
-// ARM-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
+// ARM-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
 // ARM-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARM-NETBSD:#define __UINTMAX_TYPE__ long long unsigned int
 // ARM-NETBSD:#define __UINTMAX_WIDTH__ 64
-// ARM-NETBSD:#define __UINTPTR_MAX__ 4294967295U
+// ARM-NETBSD:#define __UINTPTR_MAX__ 4294967295UL
 // ARM-NETBSD:#define __UINTPTR_TYPE__ long unsigned int
 // ARM-NETBSD:#define __UINTPTR_WIDTH__ 32
 // ARM-NETBSD:#define __UINT_FAST16_MAX__ 65535
@@ -2154,75 +2167,78 @@
 // ARM-NETBSD:#define __arm 1
 // ARM-NETBSD:#define __arm__ 1
 
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s
+// ARM-NONE-EABI: #define __ELF__ 1
+
 // No MachO targets use the full EABI, even if AAPCS is used.
-// RUN: %clang -target x86_64-apple-darwin -arch armv7s -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-MACHO-NO-EABI %s
-// RUN: %clang -target x86_64-apple-darwin -arch armv6m -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-MACHO-NO-EABI %s
-// RUN: %clang -target x86_64-apple-darwin -arch armv7m -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-MACHO-NO-EABI %s
-// RUN: %clang -target x86_64-apple-darwin -arch armv7em -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-MACHO-NO-EABI %s
-// RUN: %clang -target x86_64-apple-darwin -arch armv7 -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7s -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv6m -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7m -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7em -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
 // ARM-MACHO-NO-EABI-NOT: #define __ARM_EABI__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv7-bitrig-gnueabihf < /dev/null | FileCheck -check-prefix ARM-BITRIG %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv7-bitrig-gnueabihf < /dev/null | FileCheck -match-full-lines -check-prefix ARM-BITRIG %s
 // ARM-BITRIG:#define __ARM_DWARF_EH__ 1
 // ARM-BITRIG:#define __SIZEOF_SIZE_T__ 4
-// ARM-BITRIG:#define __SIZE_MAX__ 4294967295U
+// ARM-BITRIG:#define __SIZE_MAX__ 4294967295UL
 // ARM-BITRIG:#define __SIZE_TYPE__ long unsigned int
 // ARM-BITRIG:#define __SIZE_WIDTH__ 32
 
 // Check that -mhwdiv works properly for targets which don't have the hwdiv feature enabled by default.
 
-// RUN: %clang -target arm -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-ARM %s
+// RUN: %clang -target arm -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMHWDIV-ARM %s
 // ARMHWDIV-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target arm -mthumb -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-THUMB %s
+// RUN: %clang -target arm -mthumb -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=THUMBHWDIV-THUMB %s
 // THUMBHWDIV-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-FALSE %s
+// RUN: %clang -target arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-FALSE %s
 // ARM-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target arm -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMB-FALSE %s
+// RUN: %clang -target arm -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=THUMB-FALSE %s
 // THUMB-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target arm -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-ARM-FALSE %s
+// RUN: %clang -target arm -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=THUMBHWDIV-ARM-FALSE %s
 // THUMBHWDIV-ARM-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target arm -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-THUMB-FALSE %s
+// RUN: %clang -target arm -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMHWDIV-THUMB-FALSE %s
 // ARMHWDIV-THUMB-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv8-none-none < /dev/null | FileCheck -check-prefix ARMv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv8-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARMv8 %s
 // ARMv8: #define __THUMB_INTERWORK__ 1
 // ARMv8-NOT: #define __thumb2__
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armebv8-none-none < /dev/null | FileCheck -check-prefix ARMebv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armebv8-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARMebv8 %s
 // ARMebv8: #define __THUMB_INTERWORK__ 1
 // ARMebv8-NOT: #define __thumb2__
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv8 < /dev/null | FileCheck -check-prefix Thumbv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv8 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv8 %s
 // Thumbv8: #define __THUMB_INTERWORK__ 1
-// Thumbv8: #define __thumb2__
+// Thumbv8: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv8 < /dev/null | FileCheck -check-prefix Thumbebv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv8 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbebv8 %s
 // Thumbebv8: #define __THUMB_INTERWORK__ 1
-// Thumbebv8: #define __thumb2__
+// Thumbebv8: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv5 < /dev/null | FileCheck -check-prefix Thumbv5 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv5 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv5 %s
 // Thumbv5: #define __THUMB_INTERWORK__ 1
-// Thumbv5-NOT: #define __thumb2__
+// Thumbv5-NOT: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv6t2 < /dev/null | FileCheck -check-prefix Thumbv6t2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv6t2 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv6t2 %s
 // Thumbv6t2: #define __THUMB_INTERWORK__ 1
-// Thumbv6t2: #define __thumb2__
+// Thumbv6t2: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv7 < /dev/null | FileCheck -check-prefix Thumbv7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv7 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv7 %s
 // Thumbv7: #define __THUMB_INTERWORK__ 1
-// Thumbv7: #define __thumb2__
+// Thumbv7: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv7 < /dev/null | FileCheck -check-prefix Thumbebv7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv7 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbebv7 %s
 // Thumbebv7: #define __THUMB_INTERWORK__ 1
-// Thumbebv7: #define __thumb2__
+// Thumbebv7: #define __thumb2__ 1
 
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-none-none < /dev/null | FileCheck -check-prefix I386 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-none-none < /dev/null | FileCheck -match-full-lines -check-prefix I386 %s
 //
 // I386-NOT:#define _LP64
 // I386:#define __BIGGEST_ALIGNMENT__ 16
@@ -2259,12 +2275,12 @@
 // I386:#define __FLT_MIN_EXP__ (-125)
 // I386:#define __FLT_MIN__ 1.17549435e-38F
 // I386:#define __FLT_RADIX__ 2
-// I386:#define __INT16_C_SUFFIX__ {{$}}
+// I386:#define __INT16_C_SUFFIX__
 // I386:#define __INT16_FMTd__ "hd"
 // I386:#define __INT16_FMTi__ "hi"
 // I386:#define __INT16_MAX__ 32767
 // I386:#define __INT16_TYPE__ short
-// I386:#define __INT32_C_SUFFIX__ {{$}}
+// I386:#define __INT32_C_SUFFIX__
 // I386:#define __INT32_FMTd__ "d"
 // I386:#define __INT32_FMTi__ "i"
 // I386:#define __INT32_MAX__ 2147483647
@@ -2274,7 +2290,7 @@
 // I386:#define __INT64_FMTi__ "lli"
 // I386:#define __INT64_MAX__ 9223372036854775807LL
 // I386:#define __INT64_TYPE__ long long int
-// I386:#define __INT8_C_SUFFIX__ {{$}}
+// I386:#define __INT8_C_SUFFIX__
 // I386:#define __INT8_FMTd__ "hhd"
 // I386:#define __INT8_FMTi__ "hhi"
 // I386:#define __INT8_MAX__ 127
@@ -2344,7 +2360,7 @@
 // I386:#define __POINTER_WIDTH__ 32
 // I386:#define __PTRDIFF_TYPE__ int
 // I386:#define __PTRDIFF_WIDTH__ 32
-// I386:#define __REGISTER_PREFIX__ 
+// I386:#define __REGISTER_PREFIX__
 // I386:#define __SCHAR_MAX__ 127
 // I386:#define __SHRT_MAX__ 32767
 // I386:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2364,7 +2380,7 @@
 // I386:#define __SIZE_MAX__ 4294967295U
 // I386:#define __SIZE_TYPE__ unsigned int
 // I386:#define __SIZE_WIDTH__ 32
-// I386:#define __UINT16_C_SUFFIX__ {{$}}
+// I386:#define __UINT16_C_SUFFIX__
 // I386:#define __UINT16_MAX__ 65535
 // I386:#define __UINT16_TYPE__ unsigned short
 // I386:#define __UINT32_C_SUFFIX__ U
@@ -2373,7 +2389,7 @@
 // I386:#define __UINT64_C_SUFFIX__ ULL
 // I386:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386:#define __UINT64_TYPE__ long long unsigned int
-// I386:#define __UINT8_C_SUFFIX__ {{$}}
+// I386:#define __UINT8_C_SUFFIX__
 // I386:#define __UINT8_MAX__ 255
 // I386:#define __UINT8_TYPE__ unsigned char
 // I386:#define __UINTMAX_C_SUFFIX__ ULL
@@ -2399,7 +2415,7 @@
 // I386:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // I386:#define __UINT_LEAST8_MAX__ 255
 // I386:#define __UINT_LEAST8_TYPE__ unsigned char
-// I386:#define __USER_LABEL_PREFIX__ _
+// I386:#define __USER_LABEL_PREFIX__
 // I386:#define __WCHAR_MAX__ 2147483647
 // I386:#define __WCHAR_TYPE__ int
 // I386:#define __WCHAR_WIDTH__ 32
@@ -2409,7 +2425,7 @@
 // I386:#define __i386__ 1
 // I386:#define i386 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-pc-linux-gnu -target-cpu pentium4 < /dev/null | FileCheck -check-prefix I386-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-pc-linux-gnu -target-cpu pentium4 < /dev/null | FileCheck -match-full-lines -check-prefix I386-LINUX %s
 //
 // I386-LINUX-NOT:#define _LP64
 // I386-LINUX:#define __BIGGEST_ALIGNMENT__ 16
@@ -2446,12 +2462,12 @@
 // I386-LINUX:#define __FLT_MIN_EXP__ (-125)
 // I386-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // I386-LINUX:#define __FLT_RADIX__ 2
-// I386-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __INT16_C_SUFFIX__
 // I386-LINUX:#define __INT16_FMTd__ "hd"
 // I386-LINUX:#define __INT16_FMTi__ "hi"
 // I386-LINUX:#define __INT16_MAX__ 32767
 // I386-LINUX:#define __INT16_TYPE__ short
-// I386-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __INT32_C_SUFFIX__
 // I386-LINUX:#define __INT32_FMTd__ "d"
 // I386-LINUX:#define __INT32_FMTi__ "i"
 // I386-LINUX:#define __INT32_MAX__ 2147483647
@@ -2461,7 +2477,7 @@
 // I386-LINUX:#define __INT64_FMTi__ "lli"
 // I386-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // I386-LINUX:#define __INT64_TYPE__ long long int
-// I386-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __INT8_C_SUFFIX__
 // I386-LINUX:#define __INT8_FMTd__ "hhd"
 // I386-LINUX:#define __INT8_FMTi__ "hhi"
 // I386-LINUX:#define __INT8_MAX__ 127
@@ -2531,7 +2547,7 @@
 // I386-LINUX:#define __POINTER_WIDTH__ 32
 // I386-LINUX:#define __PTRDIFF_TYPE__ int
 // I386-LINUX:#define __PTRDIFF_WIDTH__ 32
-// I386-LINUX:#define __REGISTER_PREFIX__ 
+// I386-LINUX:#define __REGISTER_PREFIX__
 // I386-LINUX:#define __SCHAR_MAX__ 127
 // I386-LINUX:#define __SHRT_MAX__ 32767
 // I386-LINUX:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2551,7 +2567,7 @@
 // I386-LINUX:#define __SIZE_MAX__ 4294967295U
 // I386-LINUX:#define __SIZE_TYPE__ unsigned int
 // I386-LINUX:#define __SIZE_WIDTH__ 32
-// I386-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __UINT16_C_SUFFIX__
 // I386-LINUX:#define __UINT16_MAX__ 65535
 // I386-LINUX:#define __UINT16_TYPE__ unsigned short
 // I386-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -2560,7 +2576,7 @@
 // I386-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // I386-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-LINUX:#define __UINT64_TYPE__ long long unsigned int
-// I386-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __UINT8_C_SUFFIX__
 // I386-LINUX:#define __UINT8_MAX__ 255
 // I386-LINUX:#define __UINT8_TYPE__ unsigned char
 // I386-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
@@ -2596,7 +2612,7 @@
 // I386-LINUX:#define __i386__ 1
 // I386-LINUX:#define i386 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd < /dev/null | FileCheck -check-prefix I386-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD %s
 //
 // I386-NETBSD-NOT:#define _LP64
 // I386-NETBSD:#define __BIGGEST_ALIGNMENT__ 16
@@ -2633,12 +2649,12 @@
 // I386-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // I386-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // I386-NETBSD:#define __FLT_RADIX__ 2
-// I386-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __INT16_C_SUFFIX__
 // I386-NETBSD:#define __INT16_FMTd__ "hd"
 // I386-NETBSD:#define __INT16_FMTi__ "hi"
 // I386-NETBSD:#define __INT16_MAX__ 32767
 // I386-NETBSD:#define __INT16_TYPE__ short
-// I386-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __INT32_C_SUFFIX__
 // I386-NETBSD:#define __INT32_FMTd__ "d"
 // I386-NETBSD:#define __INT32_FMTi__ "i"
 // I386-NETBSD:#define __INT32_MAX__ 2147483647
@@ -2648,7 +2664,7 @@
 // I386-NETBSD:#define __INT64_FMTi__ "lli"
 // I386-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // I386-NETBSD:#define __INT64_TYPE__ long long int
-// I386-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __INT8_C_SUFFIX__
 // I386-NETBSD:#define __INT8_FMTd__ "hhd"
 // I386-NETBSD:#define __INT8_FMTi__ "hhi"
 // I386-NETBSD:#define __INT8_MAX__ 127
@@ -2718,7 +2734,7 @@
 // I386-NETBSD:#define __POINTER_WIDTH__ 32
 // I386-NETBSD:#define __PTRDIFF_TYPE__ int
 // I386-NETBSD:#define __PTRDIFF_WIDTH__ 32
-// I386-NETBSD:#define __REGISTER_PREFIX__ 
+// I386-NETBSD:#define __REGISTER_PREFIX__
 // I386-NETBSD:#define __SCHAR_MAX__ 127
 // I386-NETBSD:#define __SHRT_MAX__ 32767
 // I386-NETBSD:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2738,7 +2754,7 @@
 // I386-NETBSD:#define __SIZE_MAX__ 4294967295U
 // I386-NETBSD:#define __SIZE_TYPE__ unsigned int
 // I386-NETBSD:#define __SIZE_WIDTH__ 32
-// I386-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __UINT16_C_SUFFIX__
 // I386-NETBSD:#define __UINT16_MAX__ 65535
 // I386-NETBSD:#define __UINT16_TYPE__ unsigned short
 // I386-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -2747,7 +2763,7 @@
 // I386-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // I386-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-NETBSD:#define __UINT64_TYPE__ long long unsigned int
-// I386-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __UINT8_C_SUFFIX__
 // I386-NETBSD:#define __UINT8_MAX__ 255
 // I386-NETBSD:#define __UINT8_TYPE__ unsigned char
 // I386-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
@@ -2783,21 +2799,21 @@
 // I386-NETBSD:#define __i386__ 1
 // I386-NETBSD:#define i386 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd -target-feature +sse2 < /dev/null | FileCheck -check-prefix I386-NETBSD-SSE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD-SSE %s
 // I386-NETBSD-SSE:#define __FLT_EVAL_METHOD__ 0
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6  < /dev/null | FileCheck -check-prefix I386-NETBSD6 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6  < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6 %s
 // I386-NETBSD6:#define __FLT_EVAL_METHOD__ 1
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 -target-feature +sse2 < /dev/null | FileCheck -check-prefix I386-NETBSD6-SSE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6-SSE %s
 // I386-NETBSD6-SSE:#define __FLT_EVAL_METHOD__ 1
 
-// RUN: %clang_cc1 -E -dM -triple=i686-pc-mingw32 < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-pc-mingw32 < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -triple=i686-unknown-cygwin < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-unknown-cygwin < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// I386-DECLSPEC: #define __declspec
+// RUN: %clang_cc1 -E -dM -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -triple=i686-unknown-cygwin < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-unknown-cygwin < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// I386-DECLSPEC: #define __declspec{{.*}}
 
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none < /dev/null | FileCheck -check-prefix MIPS32BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS32BE %s
 //
 // MIPS32BE:#define MIPSEB 1
 // MIPS32BE:#define _ABIO32 1
@@ -2846,12 +2862,12 @@
 // MIPS32BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS32BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32BE:#define __FLT_RADIX__ 2
-// MIPS32BE:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __INT16_C_SUFFIX__
 // MIPS32BE:#define __INT16_FMTd__ "hd"
 // MIPS32BE:#define __INT16_FMTi__ "hi"
 // MIPS32BE:#define __INT16_MAX__ 32767
 // MIPS32BE:#define __INT16_TYPE__ short
-// MIPS32BE:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __INT32_C_SUFFIX__
 // MIPS32BE:#define __INT32_FMTd__ "d"
 // MIPS32BE:#define __INT32_FMTi__ "i"
 // MIPS32BE:#define __INT32_MAX__ 2147483647
@@ -2861,7 +2877,7 @@
 // MIPS32BE:#define __INT64_FMTi__ "lli"
 // MIPS32BE:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32BE:#define __INT64_TYPE__ long long int
-// MIPS32BE:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __INT8_C_SUFFIX__
 // MIPS32BE:#define __INT8_FMTd__ "hhd"
 // MIPS32BE:#define __INT8_FMTi__ "hhi"
 // MIPS32BE:#define __INT8_MAX__ 127
@@ -2932,7 +2948,7 @@
 // MIPS32BE:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS32BE:#define __PTRDIFF_TYPE__ int
 // MIPS32BE:#define __PTRDIFF_WIDTH__ 32
-// MIPS32BE:#define __REGISTER_PREFIX__ 
+// MIPS32BE:#define __REGISTER_PREFIX__
 // MIPS32BE:#define __SCHAR_MAX__ 127
 // MIPS32BE:#define __SHRT_MAX__ 32767
 // MIPS32BE:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2955,7 +2971,7 @@
 // MIPS32BE:#define __STDC_HOSTED__ 0
 // MIPS32BE:#define __STDC_VERSION__ 201112L
 // MIPS32BE:#define __STDC__ 1
-// MIPS32BE:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __UINT16_C_SUFFIX__
 // MIPS32BE:#define __UINT16_MAX__ 65535
 // MIPS32BE:#define __UINT16_TYPE__ unsigned short
 // MIPS32BE:#define __UINT32_C_SUFFIX__ U
@@ -2964,14 +2980,14 @@
 // MIPS32BE:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINT64_TYPE__ long long unsigned int
-// MIPS32BE:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __UINT8_C_SUFFIX__
 // MIPS32BE:#define __UINT8_MAX__ 255
 // MIPS32BE:#define __UINT8_TYPE__ unsigned char
 // MIPS32BE:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32BE:#define __UINTMAX_WIDTH__ 64
-// MIPS32BE:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32BE:#define __UINTPTR_MAX__ 4294967295UL
 // MIPS32BE:#define __UINTPTR_TYPE__ long unsigned int
 // MIPS32BE:#define __UINTPTR_WIDTH__ 32
 // MIPS32BE:#define __UINT_FAST16_MAX__ 65535
@@ -2990,7 +3006,7 @@
 // MIPS32BE:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // MIPS32BE:#define __UINT_LEAST8_MAX__ 255
 // MIPS32BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS32BE:#define __USER_LABEL_PREFIX__ _
+// MIPS32BE:#define __USER_LABEL_PREFIX__
 // MIPS32BE:#define __WCHAR_MAX__ 2147483647
 // MIPS32BE:#define __WCHAR_TYPE__ int
 // MIPS32BE:#define __WCHAR_WIDTH__ 32
@@ -3006,7 +3022,7 @@
 // MIPS32BE:#define _mips 1
 // MIPS32BE:#define mips 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mipsel-none-none < /dev/null | FileCheck -check-prefix MIPS32EL %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mipsel-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS32EL %s
 //
 // MIPS32EL:#define MIPSEL 1
 // MIPS32EL:#define _ABIO32 1
@@ -3054,12 +3070,12 @@
 // MIPS32EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS32EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32EL:#define __FLT_RADIX__ 2
-// MIPS32EL:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __INT16_C_SUFFIX__
 // MIPS32EL:#define __INT16_FMTd__ "hd"
 // MIPS32EL:#define __INT16_FMTi__ "hi"
 // MIPS32EL:#define __INT16_MAX__ 32767
 // MIPS32EL:#define __INT16_TYPE__ short
-// MIPS32EL:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __INT32_C_SUFFIX__
 // MIPS32EL:#define __INT32_FMTd__ "d"
 // MIPS32EL:#define __INT32_FMTi__ "i"
 // MIPS32EL:#define __INT32_MAX__ 2147483647
@@ -3069,7 +3085,7 @@
 // MIPS32EL:#define __INT64_FMTi__ "lli"
 // MIPS32EL:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32EL:#define __INT64_TYPE__ long long int
-// MIPS32EL:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __INT8_C_SUFFIX__
 // MIPS32EL:#define __INT8_FMTd__ "hhd"
 // MIPS32EL:#define __INT8_FMTi__ "hhi"
 // MIPS32EL:#define __INT8_MAX__ 127
@@ -3141,7 +3157,7 @@
 // MIPS32EL:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS32EL:#define __PTRDIFF_TYPE__ int
 // MIPS32EL:#define __PTRDIFF_WIDTH__ 32
-// MIPS32EL:#define __REGISTER_PREFIX__ 
+// MIPS32EL:#define __REGISTER_PREFIX__
 // MIPS32EL:#define __SCHAR_MAX__ 127
 // MIPS32EL:#define __SHRT_MAX__ 32767
 // MIPS32EL:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -3161,7 +3177,7 @@
 // MIPS32EL:#define __SIZE_MAX__ 4294967295U
 // MIPS32EL:#define __SIZE_TYPE__ unsigned int
 // MIPS32EL:#define __SIZE_WIDTH__ 32
-// MIPS32EL:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __UINT16_C_SUFFIX__
 // MIPS32EL:#define __UINT16_MAX__ 65535
 // MIPS32EL:#define __UINT16_TYPE__ unsigned short
 // MIPS32EL:#define __UINT32_C_SUFFIX__ U
@@ -3170,14 +3186,14 @@
 // MIPS32EL:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINT64_TYPE__ long long unsigned int
-// MIPS32EL:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __UINT8_C_SUFFIX__
 // MIPS32EL:#define __UINT8_MAX__ 255
 // MIPS32EL:#define __UINT8_TYPE__ unsigned char
 // MIPS32EL:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32EL:#define __UINTMAX_WIDTH__ 64
-// MIPS32EL:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32EL:#define __UINTPTR_MAX__ 4294967295UL
 // MIPS32EL:#define __UINTPTR_TYPE__ long unsigned int
 // MIPS32EL:#define __UINTPTR_WIDTH__ 32
 // MIPS32EL:#define __UINT_FAST16_MAX__ 65535
@@ -3196,7 +3212,7 @@
 // MIPS32EL:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // MIPS32EL:#define __UINT_LEAST8_MAX__ 255
 // MIPS32EL:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS32EL:#define __USER_LABEL_PREFIX__ _
+// MIPS32EL:#define __USER_LABEL_PREFIX__
 // MIPS32EL:#define __WCHAR_MAX__ 2147483647
 // MIPS32EL:#define __WCHAR_TYPE__ int
 // MIPS32EL:#define __WCHAR_WIDTH__ 32
@@ -3214,7 +3230,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding \
 // RUN:            -triple=mips64-none-none -target-abi n32 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPSN32BE %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPSN32BE %s
 //
 // MIPSN32BE: #define MIPSEB 1
 // MIPSN32BE: #define _ABIN32 2
@@ -3286,7 +3302,6 @@
 // MIPSN32BE: #define __GNUC_STDC_INLINE__ 1
 // MIPSN32BE: #define __GNUC__ 4
 // MIPSN32BE: #define __GXX_ABI_VERSION 1002
-// MIPSN32BE: #define __GXX_RTTI 1
 // MIPSN32BE: #define __ILP32__ 1
 // MIPSN32BE: #define __INT16_C_SUFFIX__
 // MIPSN32BE: #define __INT16_FMTd__ "hd"
@@ -3500,7 +3515,7 @@
 // MIPSN32BE: #define __UINT_LEAST8_FMTx__ "hhx"
 // MIPSN32BE: #define __UINT_LEAST8_MAX__ 255
 // MIPSN32BE: #define __UINT_LEAST8_TYPE__ unsigned char
-// MIPSN32BE: #define __USER_LABEL_PREFIX__ _
+// MIPSN32BE: #define __USER_LABEL_PREFIX__
 // MIPSN32BE: #define __WCHAR_MAX__ 2147483647
 // MIPSN32BE: #define __WCHAR_TYPE__ int
 // MIPSN32BE: #define __WCHAR_WIDTH__ 32
@@ -3521,7 +3536,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding \
 // RUN:            -triple=mips64el-none-none -target-abi n32 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPSN32EL %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPSN32EL %s
 //
 // MIPSN32EL: #define MIPSEL 1
 // MIPSN32EL: #define _ABIN32 2
@@ -3592,7 +3607,6 @@
 // MIPSN32EL: #define __GNUC_STDC_INLINE__ 1
 // MIPSN32EL: #define __GNUC__ 4
 // MIPSN32EL: #define __GXX_ABI_VERSION 1002
-// MIPSN32EL: #define __GXX_RTTI 1
 // MIPSN32EL: #define __ILP32__ 1
 // MIPSN32EL: #define __INT16_C_SUFFIX__
 // MIPSN32EL: #define __INT16_FMTd__ "hd"
@@ -3807,7 +3821,7 @@
 // MIPSN32EL: #define __UINT_LEAST8_FMTx__ "hhx"
 // MIPSN32EL: #define __UINT_LEAST8_MAX__ 255
 // MIPSN32EL: #define __UINT_LEAST8_TYPE__ unsigned char
-// MIPSN32EL: #define __USER_LABEL_PREFIX__ _
+// MIPSN32EL: #define __USER_LABEL_PREFIX__
 // MIPSN32EL: #define __WCHAR_MAX__ 2147483647
 // MIPSN32EL: #define __WCHAR_TYPE__ int
 // MIPSN32EL: #define __WCHAR_WIDTH__ 32
@@ -3826,7 +3840,7 @@
 // MIPSN32EL: #define _mips 1
 // MIPSN32EL: #define mips 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none < /dev/null | FileCheck -check-prefix MIPS64BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS64BE %s
 //
 // MIPS64BE:#define MIPSEB 1
 // MIPS64BE:#define _ABI64 3
@@ -3875,12 +3889,12 @@
 // MIPS64BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS64BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64BE:#define __FLT_RADIX__ 2
-// MIPS64BE:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __INT16_C_SUFFIX__
 // MIPS64BE:#define __INT16_FMTd__ "hd"
 // MIPS64BE:#define __INT16_FMTi__ "hi"
 // MIPS64BE:#define __INT16_MAX__ 32767
 // MIPS64BE:#define __INT16_TYPE__ short
-// MIPS64BE:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __INT32_C_SUFFIX__
 // MIPS64BE:#define __INT32_FMTd__ "d"
 // MIPS64BE:#define __INT32_FMTi__ "i"
 // MIPS64BE:#define __INT32_MAX__ 2147483647
@@ -3890,7 +3904,7 @@
 // MIPS64BE:#define __INT64_FMTi__ "li"
 // MIPS64BE:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64BE:#define __INT64_TYPE__ long int
-// MIPS64BE:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __INT8_C_SUFFIX__
 // MIPS64BE:#define __INT8_FMTd__ "hhd"
 // MIPS64BE:#define __INT8_FMTi__ "hhi"
 // MIPS64BE:#define __INT8_MAX__ 127
@@ -3961,7 +3975,7 @@
 // MIPS64BE:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS64BE:#define __PTRDIFF_TYPE__ long int
 // MIPS64BE:#define __PTRDIFF_WIDTH__ 64
-// MIPS64BE:#define __REGISTER_PREFIX__ 
+// MIPS64BE:#define __REGISTER_PREFIX__
 // MIPS64BE:#define __SCHAR_MAX__ 127
 // MIPS64BE:#define __SHRT_MAX__ 32767
 // MIPS64BE:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -3982,7 +3996,7 @@
 // MIPS64BE:#define __SIZE_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __SIZE_TYPE__ long unsigned int
 // MIPS64BE:#define __SIZE_WIDTH__ 64
-// MIPS64BE:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __UINT16_C_SUFFIX__
 // MIPS64BE:#define __UINT16_MAX__ 65535
 // MIPS64BE:#define __UINT16_TYPE__ unsigned short
 // MIPS64BE:#define __UINT32_C_SUFFIX__ U
@@ -3991,7 +4005,7 @@
 // MIPS64BE:#define __UINT64_C_SUFFIX__ UL
 // MIPS64BE:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __UINT64_TYPE__ long unsigned int
-// MIPS64BE:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __UINT8_C_SUFFIX__
 // MIPS64BE:#define __UINT8_MAX__ 255
 // MIPS64BE:#define __UINT8_TYPE__ unsigned char
 // MIPS64BE:#define __UINTMAX_C_SUFFIX__ UL
@@ -4017,7 +4031,7 @@
 // MIPS64BE:#define __UINT_LEAST64_TYPE__ long unsigned int
 // MIPS64BE:#define __UINT_LEAST8_MAX__ 255
 // MIPS64BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS64BE:#define __USER_LABEL_PREFIX__ _
+// MIPS64BE:#define __USER_LABEL_PREFIX__
 // MIPS64BE:#define __WCHAR_MAX__ 2147483647
 // MIPS64BE:#define __WCHAR_TYPE__ int
 // MIPS64BE:#define __WCHAR_WIDTH__ 32
@@ -4035,7 +4049,7 @@
 // MIPS64BE:#define _mips 1
 // MIPS64BE:#define mips 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-none-none < /dev/null | FileCheck -check-prefix MIPS64EL %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS64EL %s
 //
 // MIPS64EL:#define MIPSEL 1
 // MIPS64EL:#define _ABI64 3
@@ -4083,12 +4097,12 @@
 // MIPS64EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS64EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64EL:#define __FLT_RADIX__ 2
-// MIPS64EL:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __INT16_C_SUFFIX__
 // MIPS64EL:#define __INT16_FMTd__ "hd"
 // MIPS64EL:#define __INT16_FMTi__ "hi"
 // MIPS64EL:#define __INT16_MAX__ 32767
 // MIPS64EL:#define __INT16_TYPE__ short
-// MIPS64EL:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __INT32_C_SUFFIX__
 // MIPS64EL:#define __INT32_FMTd__ "d"
 // MIPS64EL:#define __INT32_FMTi__ "i"
 // MIPS64EL:#define __INT32_MAX__ 2147483647
@@ -4098,7 +4112,7 @@
 // MIPS64EL:#define __INT64_FMTi__ "li"
 // MIPS64EL:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64EL:#define __INT64_TYPE__ long int
-// MIPS64EL:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __INT8_C_SUFFIX__
 // MIPS64EL:#define __INT8_FMTd__ "hhd"
 // MIPS64EL:#define __INT8_FMTi__ "hhi"
 // MIPS64EL:#define __INT8_MAX__ 127
@@ -4170,7 +4184,7 @@
 // MIPS64EL:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS64EL:#define __PTRDIFF_TYPE__ long int
 // MIPS64EL:#define __PTRDIFF_WIDTH__ 64
-// MIPS64EL:#define __REGISTER_PREFIX__ 
+// MIPS64EL:#define __REGISTER_PREFIX__
 // MIPS64EL:#define __SCHAR_MAX__ 127
 // MIPS64EL:#define __SHRT_MAX__ 32767
 // MIPS64EL:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -4191,7 +4205,7 @@
 // MIPS64EL:#define __SIZE_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __SIZE_TYPE__ long unsigned int
 // MIPS64EL:#define __SIZE_WIDTH__ 64
-// MIPS64EL:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __UINT16_C_SUFFIX__
 // MIPS64EL:#define __UINT16_MAX__ 65535
 // MIPS64EL:#define __UINT16_TYPE__ unsigned short
 // MIPS64EL:#define __UINT32_C_SUFFIX__ U
@@ -4200,7 +4214,7 @@
 // MIPS64EL:#define __UINT64_C_SUFFIX__ UL
 // MIPS64EL:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __UINT64_TYPE__ long unsigned int
-// MIPS64EL:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __UINT8_C_SUFFIX__
 // MIPS64EL:#define __UINT8_MAX__ 255
 // MIPS64EL:#define __UINT8_TYPE__ unsigned char
 // MIPS64EL:#define __UINTMAX_C_SUFFIX__ UL
@@ -4226,7 +4240,7 @@
 // MIPS64EL:#define __UINT_LEAST64_TYPE__ long unsigned int
 // MIPS64EL:#define __UINT_LEAST8_MAX__ 255
 // MIPS64EL:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS64EL:#define __USER_LABEL_PREFIX__ _
+// MIPS64EL:#define __USER_LABEL_PREFIX__
 // MIPS64EL:#define __WCHAR_MAX__ 2147483647
 // MIPS64EL:#define __WCHAR_TYPE__ int
 // MIPS64EL:#define __WCHAR_WIDTH__ 32
@@ -4248,7 +4262,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-DEF32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-DEF32 %s
 //
 // MIPS-ARCH-DEF32:#define _MIPS_ARCH "mips32r2"
 // MIPS-ARCH-DEF32:#define _MIPS_ARCH_MIPS32R2 1
@@ -4257,7 +4271,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-nones \
 // RUN:            -target-cpu mips32 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32 %s
 //
 // MIPS-ARCH-32:#define _MIPS_ARCH "mips32"
 // MIPS-ARCH-32:#define _MIPS_ARCH_MIPS32 1
@@ -4266,7 +4280,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r2 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R2 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R2 %s
 //
 // MIPS-ARCH-32R2:#define _MIPS_ARCH "mips32r2"
 // MIPS-ARCH-32R2:#define _MIPS_ARCH_MIPS32R2 1
@@ -4275,7 +4289,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r3 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R3 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R3 %s
 //
 // MIPS-ARCH-32R3:#define _MIPS_ARCH "mips32r3"
 // MIPS-ARCH-32R3:#define _MIPS_ARCH_MIPS32R3 1
@@ -4284,7 +4298,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r5 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R5 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R5 %s
 //
 // MIPS-ARCH-32R5:#define _MIPS_ARCH "mips32r5"
 // MIPS-ARCH-32R5:#define _MIPS_ARCH_MIPS32R5 1
@@ -4293,7 +4307,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r6 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R6 %s
 //
 // MIPS-ARCH-32R6:#define _MIPS_ARCH "mips32r6"
 // MIPS-ARCH-32R6:#define _MIPS_ARCH_MIPS32R6 1
@@ -4302,7 +4316,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-DEF64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-DEF64 %s
 //
 // MIPS-ARCH-DEF64:#define _MIPS_ARCH "mips64r2"
 // MIPS-ARCH-DEF64:#define _MIPS_ARCH_MIPS64R2 1
@@ -4311,7 +4325,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64 %s
 //
 // MIPS-ARCH-64:#define _MIPS_ARCH "mips64"
 // MIPS-ARCH-64:#define _MIPS_ARCH_MIPS64 1
@@ -4320,7 +4334,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r2 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R2 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R2 %s
 //
 // MIPS-ARCH-64R2:#define _MIPS_ARCH "mips64r2"
 // MIPS-ARCH-64R2:#define _MIPS_ARCH_MIPS64R2 1
@@ -4329,7 +4343,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r3 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R3 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R3 %s
 //
 // MIPS-ARCH-64R3:#define _MIPS_ARCH "mips64r3"
 // MIPS-ARCH-64R3:#define _MIPS_ARCH_MIPS64R3 1
@@ -4338,7 +4352,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r5 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R5 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R5 %s
 //
 // MIPS-ARCH-64R5:#define _MIPS_ARCH "mips64r5"
 // MIPS-ARCH-64R5:#define _MIPS_ARCH_MIPS64R5 1
@@ -4347,7 +4361,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r6 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R6 %s
 //
 // MIPS-ARCH-64R6:#define _MIPS_ARCH "mips64r6"
 // MIPS-ARCH-64R6:#define _MIPS_ARCH_MIPS64R6 1
@@ -4358,23 +4372,23 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding \
 // RUN:   -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-HARD %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-HARD %s
 // MIPS-FABI-HARD:#define __mips_hard_float 1
 //
 // RUN: %clang_cc1 -target-feature +soft-float -E -dM -ffreestanding \
 // RUN:   -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-SOFT %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-SOFT %s
 // MIPS-FABI-SOFT:#define __mips_soft_float 1
 //
 // RUN: %clang_cc1 -target-feature +single-float -E -dM -ffreestanding \
 // RUN:   -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-SINGLE %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-SINGLE %s
 // MIPS-FABI-SINGLE:#define __mips_hard_float 1
 // MIPS-FABI-SINGLE:#define __mips_single_float 1
 //
 // RUN: %clang_cc1 -target-feature +soft-float -target-feature +single-float \
 // RUN:   -E -dM -ffreestanding -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-SINGLE-SOFT %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-SINGLE-SOFT %s
 // MIPS-FABI-SINGLE-SOFT:#define __mips_single_float 1
 // MIPS-FABI-SINGLE-SOFT:#define __mips_soft_float 1
 //
@@ -4382,94 +4396,94 @@
 //
 // RUN: %clang_cc1 -target-feature +mips16 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS16 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS16 %s
 // MIPS16:#define __mips16 1
 //
 // RUN: %clang_cc1 -target-feature -mips16 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix NOMIPS16 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix NOMIPS16 %s
 // NOMIPS16-NOT:#define __mips16 1
 //
 // RUN: %clang_cc1 -target-feature +micromips \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MICROMIPS %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MICROMIPS %s
 // MICROMIPS:#define __mips_micromips 1
 //
 // RUN: %clang_cc1 -target-feature -micromips \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix NOMICROMIPS %s
+// RUN:   | FileCheck -match-full-lines -check-prefix NOMICROMIPS %s
 // NOMICROMIPS-NOT:#define __mips_micromips 1
 //
 // RUN: %clang_cc1 -target-feature +dsp \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-DSP %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-DSP %s
 // MIPS-DSP:#define __mips_dsp 1
 // MIPS-DSP:#define __mips_dsp_rev 1
 // MIPS-DSP-NOT:#define __mips_dspr2 1
 //
 // RUN: %clang_cc1 -target-feature +dspr2 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-DSPR2 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-DSPR2 %s
 // MIPS-DSPR2:#define __mips_dsp 1
 // MIPS-DSPR2:#define __mips_dsp_rev 2
 // MIPS-DSPR2:#define __mips_dspr2 1
 //
 // RUN: %clang_cc1 -target-feature +msa \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-MSA %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-MSA %s
 // MIPS-MSA:#define __mips_msa 1
 //
 // RUN: %clang_cc1 -target-cpu mips32r3 -target-feature +nan2008 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-NAN2008 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-NAN2008 %s
 // MIPS-NAN2008:#define __mips_nan2008 1
 //
 // RUN: %clang_cc1 -target-cpu mips32r3 -target-feature -nan2008 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix NOMIPS-NAN2008 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix NOMIPS-NAN2008 %s
 // NOMIPS-NAN2008-NOT:#define __mips_nan2008 1
 //
 // RUN: %clang_cc1 -target-feature -fp64 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS32-MFP32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS32-MFP32 %s
 // MIPS32-MFP32:#define _MIPS_FPSET 16
 // MIPS32-MFP32:#define __mips_fpr 32
 //
 // RUN: %clang_cc1 -target-feature +fp64 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS32-MFP64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS32-MFP64 %s
 // MIPS32-MFP64:#define _MIPS_FPSET 32
 // MIPS32-MFP64:#define __mips_fpr 64
 //
 // RUN: %clang_cc1 -target-feature +single-float \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS32-MFP32SF %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS32-MFP32SF %s
 // MIPS32-MFP32SF:#define _MIPS_FPSET 32
 // MIPS32-MFP32SF:#define __mips_fpr 32
 //
 // RUN: %clang_cc1 -target-feature +fp64 \
 // RUN:   -E -dM -triple=mips64-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS64-MFP64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS64-MFP64 %s
 // MIPS64-MFP64:#define _MIPS_FPSET 32
 // MIPS64-MFP64:#define __mips_fpr 64
 //
 // RUN: %clang_cc1 -target-feature -fp64 -target-feature +single-float \
 // RUN:   -E -dM -triple=mips64-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS64-NOMFP64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS64-NOMFP64 %s
 // MIPS64-NOMFP64:#define _MIPS_FPSET 32
 // MIPS64-NOMFP64:#define __mips_fpr 32
 //
 // RUN: %clang_cc1 -target-cpu mips32r6 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-XXR6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-XXR6 %s
 // RUN: %clang_cc1 -target-cpu mips64r6 \
 // RUN:   -E -dM -triple=mips64-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-XXR6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-XXR6 %s
 // MIPS-XXR6:#define _MIPS_FPSET 32
 // MIPS-XXR6:#define __mips_fpr 64
 // MIPS-XXR6:#define __mips_nan2008 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=msp430-none-none < /dev/null | FileCheck -check-prefix MSP430 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=msp430-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MSP430 %s
 //
 // MSP430:#define MSP430 1
 // MSP430-NOT:#define _LP64
@@ -4507,7 +4521,7 @@
 // MSP430:#define __FLT_MIN_EXP__ (-125)
 // MSP430:#define __FLT_MIN__ 1.17549435e-38F
 // MSP430:#define __FLT_RADIX__ 2
-// MSP430:#define __INT16_C_SUFFIX__ {{$}}
+// MSP430:#define __INT16_C_SUFFIX__
 // MSP430:#define __INT16_FMTd__ "hd"
 // MSP430:#define __INT16_FMTi__ "hi"
 // MSP430:#define __INT16_MAX__ 32767
@@ -4522,7 +4536,7 @@
 // MSP430:#define __INT64_FMTi__ "lli"
 // MSP430:#define __INT64_MAX__ 9223372036854775807LL
 // MSP430:#define __INT64_TYPE__ long long int
-// MSP430:#define __INT8_C_SUFFIX__ {{$}}
+// MSP430:#define __INT8_C_SUFFIX__
 // MSP430:#define __INT8_FMTd__ "hhd"
 // MSP430:#define __INT8_FMTi__ "hhi"
 // MSP430:#define __INT8_MAX__ 127
@@ -4591,10 +4605,10 @@
 // MSP430:#define __MSP430__ 1
 // MSP430:#define __POINTER_WIDTH__ 16
 // MSP430:#define __PTRDIFF_TYPE__ int
-// MSP430:#define __PTRDIFF_WIDTH__ 16 
+// MSP430:#define __PTRDIFF_WIDTH__ 16
 // MSP430:#define __SCHAR_MAX__ 127
 // MSP430:#define __SHRT_MAX__ 32767
-// MSP430:#define __SIG_ATOMIC_MAX__ 2147483647
+// MSP430:#define __SIG_ATOMIC_MAX__ 2147483647L
 // MSP430:#define __SIG_ATOMIC_WIDTH__ 32
 // MSP430:#define __SIZEOF_DOUBLE__ 8
 // MSP430:#define __SIZEOF_FLOAT__ 4
@@ -4608,11 +4622,11 @@
 // MSP430:#define __SIZEOF_SIZE_T__ 2
 // MSP430:#define __SIZEOF_WCHAR_T__ 2
 // MSP430:#define __SIZEOF_WINT_T__ 2
-// MSP430:#define __SIZE_MAX__ 65535
+// MSP430:#define __SIZE_MAX__ 65535U
 // MSP430:#define __SIZE_TYPE__ unsigned int
 // MSP430:#define __SIZE_WIDTH__ 16
 // MSP430:#define __UINT16_C_SUFFIX__ U
-// MSP430:#define __UINT16_MAX__ 65535
+// MSP430:#define __UINT16_MAX__ 65535U
 // MSP430:#define __UINT16_TYPE__ unsigned short
 // MSP430:#define __UINT32_C_SUFFIX__ UL
 // MSP430:#define __UINT32_MAX__ 4294967295UL
@@ -4620,17 +4634,17 @@
 // MSP430:#define __UINT64_C_SUFFIX__ ULL
 // MSP430:#define __UINT64_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINT64_TYPE__ long long unsigned int
-// MSP430:#define __UINT8_C_SUFFIX__ {{$}}
+// MSP430:#define __UINT8_C_SUFFIX__
 // MSP430:#define __UINT8_MAX__ 255
 // MSP430:#define __UINT8_TYPE__ unsigned char
 // MSP430:#define __UINTMAX_C_SUFFIX__ ULL
 // MSP430:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINTMAX_TYPE__ long long unsigned int
 // MSP430:#define __UINTMAX_WIDTH__ 64
-// MSP430:#define __UINTPTR_MAX__ 65535
+// MSP430:#define __UINTPTR_MAX__ 65535U
 // MSP430:#define __UINTPTR_TYPE__ unsigned int
 // MSP430:#define __UINTPTR_WIDTH__ 16
-// MSP430:#define __UINT_FAST16_MAX__ 65535
+// MSP430:#define __UINT_FAST16_MAX__ 65535U
 // MSP430:#define __UINT_FAST16_TYPE__ unsigned short
 // MSP430:#define __UINT_FAST32_MAX__ 4294967295UL
 // MSP430:#define __UINT_FAST32_TYPE__ long unsigned int
@@ -4638,7 +4652,7 @@
 // MSP430:#define __UINT_FAST64_TYPE__ long long unsigned int
 // MSP430:#define __UINT_FAST8_MAX__ 255
 // MSP430:#define __UINT_FAST8_TYPE__ unsigned char
-// MSP430:#define __UINT_LEAST16_MAX__ 65535
+// MSP430:#define __UINT_LEAST16_MAX__ 65535U
 // MSP430:#define __UINT_LEAST16_TYPE__ unsigned short
 // MSP430:#define __UINT_LEAST32_MAX__ 4294967295UL
 // MSP430:#define __UINT_LEAST32_TYPE__ long unsigned int
@@ -4646,7 +4660,7 @@
 // MSP430:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // MSP430:#define __UINT_LEAST8_MAX__ 255
 // MSP430:#define __UINT_LEAST8_TYPE__ unsigned char
-// MSP430:#define __USER_LABEL_PREFIX__ _
+// MSP430:#define __USER_LABEL_PREFIX__
 // MSP430:#define __WCHAR_MAX__ 32767
 // MSP430:#define __WCHAR_TYPE__ int
 // MSP430:#define __WCHAR_WIDTH__ 16
@@ -4654,7 +4668,7 @@
 // MSP430:#define __WINT_WIDTH__ 16
 // MSP430:#define __clang__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx-none-none < /dev/null | FileCheck -check-prefix NVPTX32 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx-none-none < /dev/null | FileCheck -match-full-lines -check-prefix NVPTX32 %s
 //
 // NVPTX32-NOT:#define _LP64
 // NVPTX32:#define __BIGGEST_ALIGNMENT__ 8
@@ -4693,12 +4707,12 @@
 // NVPTX32:#define __FLT_MIN_EXP__ (-125)
 // NVPTX32:#define __FLT_MIN__ 1.17549435e-38F
 // NVPTX32:#define __FLT_RADIX__ 2
-// NVPTX32:#define __INT16_C_SUFFIX__ {{$}}
+// NVPTX32:#define __INT16_C_SUFFIX__
 // NVPTX32:#define __INT16_FMTd__ "hd"
 // NVPTX32:#define __INT16_FMTi__ "hi"
 // NVPTX32:#define __INT16_MAX__ 32767
 // NVPTX32:#define __INT16_TYPE__ short
-// NVPTX32:#define __INT32_C_SUFFIX__ {{$}}
+// NVPTX32:#define __INT32_C_SUFFIX__
 // NVPTX32:#define __INT32_FMTd__ "d"
 // NVPTX32:#define __INT32_FMTi__ "i"
 // NVPTX32:#define __INT32_MAX__ 2147483647
@@ -4706,9 +4720,9 @@
 // NVPTX32:#define __INT64_C_SUFFIX__ LL
 // NVPTX32:#define __INT64_FMTd__ "lld"
 // NVPTX32:#define __INT64_FMTi__ "lli"
-// NVPTX32:#define __INT64_MAX__ 9223372036854775807L
+// NVPTX32:#define __INT64_MAX__ 9223372036854775807LL
 // NVPTX32:#define __INT64_TYPE__ long long int
-// NVPTX32:#define __INT8_C_SUFFIX__ {{$}}
+// NVPTX32:#define __INT8_C_SUFFIX__
 // NVPTX32:#define __INT8_FMTd__ "hhd"
 // NVPTX32:#define __INT8_FMTi__ "hhi"
 // NVPTX32:#define __INT8_MAX__ 127
@@ -4734,7 +4748,7 @@
 // NVPTX32:#define __INT_FAST32_TYPE__ int
 // NVPTX32:#define __INT_FAST64_FMTd__ "lld"
 // NVPTX32:#define __INT_FAST64_FMTi__ "lli"
-// NVPTX32:#define __INT_FAST64_MAX__ 9223372036854775807L
+// NVPTX32:#define __INT_FAST64_MAX__ 9223372036854775807LL
 // NVPTX32:#define __INT_FAST64_TYPE__ long long int
 // NVPTX32:#define __INT_FAST8_FMTd__ "hhd"
 // NVPTX32:#define __INT_FAST8_FMTi__ "hhi"
@@ -4750,7 +4764,7 @@
 // NVPTX32:#define __INT_LEAST32_TYPE__ int
 // NVPTX32:#define __INT_LEAST64_FMTd__ "lld"
 // NVPTX32:#define __INT_LEAST64_FMTi__ "lli"
-// NVPTX32:#define __INT_LEAST64_MAX__ 9223372036854775807L
+// NVPTX32:#define __INT_LEAST64_MAX__ 9223372036854775807LL
 // NVPTX32:#define __INT_LEAST64_TYPE__ long long int
 // NVPTX32:#define __INT_LEAST8_FMTd__ "hhd"
 // NVPTX32:#define __INT_LEAST8_FMTi__ "hhi"
@@ -4799,7 +4813,7 @@
 // NVPTX32:#define __SIZE_MAX__ 4294967295U
 // NVPTX32:#define __SIZE_TYPE__ unsigned int
 // NVPTX32:#define __SIZE_WIDTH__ 32
-// NVPTX32:#define __UINT16_C_SUFFIX__ {{$}}
+// NVPTX32:#define __UINT16_C_SUFFIX__
 // NVPTX32:#define __UINT16_MAX__ 65535
 // NVPTX32:#define __UINT16_TYPE__ unsigned short
 // NVPTX32:#define __UINT32_C_SUFFIX__ U
@@ -4808,7 +4822,7 @@
 // NVPTX32:#define __UINT64_C_SUFFIX__ ULL
 // NVPTX32:#define __UINT64_MAX__ 18446744073709551615ULL
 // NVPTX32:#define __UINT64_TYPE__ long long unsigned int
-// NVPTX32:#define __UINT8_C_SUFFIX__ {{$}}
+// NVPTX32:#define __UINT8_C_SUFFIX__
 // NVPTX32:#define __UINT8_MAX__ 255
 // NVPTX32:#define __UINT8_TYPE__ unsigned char
 // NVPTX32:#define __UINTMAX_C_SUFFIX__ ULL
@@ -4822,7 +4836,7 @@
 // NVPTX32:#define __UINT_FAST16_TYPE__ unsigned short
 // NVPTX32:#define __UINT_FAST32_MAX__ 4294967295U
 // NVPTX32:#define __UINT_FAST32_TYPE__ unsigned int
-// NVPTX32:#define __UINT_FAST64_MAX__ 18446744073709551615UL
+// NVPTX32:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
 // NVPTX32:#define __UINT_FAST64_TYPE__ long long unsigned int
 // NVPTX32:#define __UINT_FAST8_MAX__ 255
 // NVPTX32:#define __UINT_FAST8_TYPE__ unsigned char
@@ -4830,18 +4844,18 @@
 // NVPTX32:#define __UINT_LEAST16_TYPE__ unsigned short
 // NVPTX32:#define __UINT_LEAST32_MAX__ 4294967295U
 // NVPTX32:#define __UINT_LEAST32_TYPE__ unsigned int
-// NVPTX32:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
+// NVPTX32:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
 // NVPTX32:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // NVPTX32:#define __UINT_LEAST8_MAX__ 255
 // NVPTX32:#define __UINT_LEAST8_TYPE__ unsigned char
-// NVPTX32:#define __USER_LABEL_PREFIX__ _
+// NVPTX32:#define __USER_LABEL_PREFIX__
 // NVPTX32:#define __WCHAR_MAX__ 2147483647
 // NVPTX32:#define __WCHAR_TYPE__ int
 // NVPTX32:#define __WCHAR_WIDTH__ 32
 // NVPTX32:#define __WINT_TYPE__ int
 // NVPTX32:#define __WINT_WIDTH__ 32
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx64-none-none < /dev/null | FileCheck -check-prefix NVPTX64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix NVPTX64 %s
 //
 // NVPTX64:#define _LP64 1
 // NVPTX64:#define __BIGGEST_ALIGNMENT__ 8
@@ -4880,12 +4894,12 @@
 // NVPTX64:#define __FLT_MIN_EXP__ (-125)
 // NVPTX64:#define __FLT_MIN__ 1.17549435e-38F
 // NVPTX64:#define __FLT_RADIX__ 2
-// NVPTX64:#define __INT16_C_SUFFIX__ {{$}}
+// NVPTX64:#define __INT16_C_SUFFIX__
 // NVPTX64:#define __INT16_FMTd__ "hd"
 // NVPTX64:#define __INT16_FMTi__ "hi"
 // NVPTX64:#define __INT16_MAX__ 32767
 // NVPTX64:#define __INT16_TYPE__ short
-// NVPTX64:#define __INT32_C_SUFFIX__ {{$}}
+// NVPTX64:#define __INT32_C_SUFFIX__
 // NVPTX64:#define __INT32_FMTd__ "d"
 // NVPTX64:#define __INT32_FMTi__ "i"
 // NVPTX64:#define __INT32_MAX__ 2147483647
@@ -4893,9 +4907,9 @@
 // NVPTX64:#define __INT64_C_SUFFIX__ LL
 // NVPTX64:#define __INT64_FMTd__ "lld"
 // NVPTX64:#define __INT64_FMTi__ "lli"
-// NVPTX64:#define __INT64_MAX__ 9223372036854775807L
+// NVPTX64:#define __INT64_MAX__ 9223372036854775807LL
 // NVPTX64:#define __INT64_TYPE__ long long int
-// NVPTX64:#define __INT8_C_SUFFIX__ {{$}}
+// NVPTX64:#define __INT8_C_SUFFIX__
 // NVPTX64:#define __INT8_FMTd__ "hhd"
 // NVPTX64:#define __INT8_FMTi__ "hhi"
 // NVPTX64:#define __INT8_MAX__ 127
@@ -4986,7 +5000,7 @@
 // NVPTX64:#define __SIZE_MAX__ 18446744073709551615UL
 // NVPTX64:#define __SIZE_TYPE__ long unsigned int
 // NVPTX64:#define __SIZE_WIDTH__ 64
-// NVPTX64:#define __UINT16_C_SUFFIX__ {{$}}
+// NVPTX64:#define __UINT16_C_SUFFIX__
 // NVPTX64:#define __UINT16_MAX__ 65535
 // NVPTX64:#define __UINT16_TYPE__ unsigned short
 // NVPTX64:#define __UINT32_C_SUFFIX__ U
@@ -4995,7 +5009,7 @@
 // NVPTX64:#define __UINT64_C_SUFFIX__ ULL
 // NVPTX64:#define __UINT64_MAX__ 18446744073709551615ULL
 // NVPTX64:#define __UINT64_TYPE__ long long unsigned int
-// NVPTX64:#define __UINT8_C_SUFFIX__ {{$}}
+// NVPTX64:#define __UINT8_C_SUFFIX__
 // NVPTX64:#define __UINT8_MAX__ 255
 // NVPTX64:#define __UINT8_TYPE__ unsigned char
 // NVPTX64:#define __UINTMAX_C_SUFFIX__ ULL
@@ -5021,14 +5035,14 @@
 // NVPTX64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // NVPTX64:#define __UINT_LEAST8_MAX__ 255
 // NVPTX64:#define __UINT_LEAST8_TYPE__ unsigned char
-// NVPTX64:#define __USER_LABEL_PREFIX__ _
+// NVPTX64:#define __USER_LABEL_PREFIX__
 // NVPTX64:#define __WCHAR_MAX__ 2147483647
 // NVPTX64:#define __WCHAR_TYPE__ int
 // NVPTX64:#define __WCHAR_WIDTH__ 32
 // NVPTX64:#define __WINT_TYPE__ int
 // NVPTX64:#define __WINT_WIDTH__ 32
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -target-cpu 603e < /dev/null | FileCheck -check-prefix PPC603E %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -target-cpu 603e < /dev/null | FileCheck -match-full-lines -check-prefix PPC603E %s
 //
 // PPC603E:#define _ARCH_603 1
 // PPC603E:#define _ARCH_603E 1
@@ -5071,12 +5085,12 @@
 // PPC603E:#define __FLT_MIN_EXP__ (-125)
 // PPC603E:#define __FLT_MIN__ 1.17549435e-38F
 // PPC603E:#define __FLT_RADIX__ 2
-// PPC603E:#define __INT16_C_SUFFIX__ {{$}}
+// PPC603E:#define __INT16_C_SUFFIX__
 // PPC603E:#define __INT16_FMTd__ "hd"
 // PPC603E:#define __INT16_FMTi__ "hi"
 // PPC603E:#define __INT16_MAX__ 32767
 // PPC603E:#define __INT16_TYPE__ short
-// PPC603E:#define __INT32_C_SUFFIX__ {{$}}
+// PPC603E:#define __INT32_C_SUFFIX__
 // PPC603E:#define __INT32_FMTd__ "d"
 // PPC603E:#define __INT32_FMTi__ "i"
 // PPC603E:#define __INT32_MAX__ 2147483647
@@ -5086,7 +5100,7 @@
 // PPC603E:#define __INT64_FMTi__ "lli"
 // PPC603E:#define __INT64_MAX__ 9223372036854775807LL
 // PPC603E:#define __INT64_TYPE__ long long int
-// PPC603E:#define __INT8_C_SUFFIX__ {{$}}
+// PPC603E:#define __INT8_C_SUFFIX__
 // PPC603E:#define __INT8_FMTd__ "hhd"
 // PPC603E:#define __INT8_FMTi__ "hhi"
 // PPC603E:#define __INT8_MAX__ 127
@@ -5175,10 +5189,10 @@
 // PPC603E:#define __SIZEOF_SIZE_T__ 4
 // PPC603E:#define __SIZEOF_WCHAR_T__ 4
 // PPC603E:#define __SIZEOF_WINT_T__ 4
-// PPC603E:#define __SIZE_MAX__ 4294967295U
+// PPC603E:#define __SIZE_MAX__ 4294967295UL
 // PPC603E:#define __SIZE_TYPE__ long unsigned int
 // PPC603E:#define __SIZE_WIDTH__ 32
-// PPC603E:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC603E:#define __UINT16_C_SUFFIX__
 // PPC603E:#define __UINT16_MAX__ 65535
 // PPC603E:#define __UINT16_TYPE__ unsigned short
 // PPC603E:#define __UINT32_C_SUFFIX__ U
@@ -5187,14 +5201,14 @@
 // PPC603E:#define __UINT64_C_SUFFIX__ ULL
 // PPC603E:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINT64_TYPE__ long long unsigned int
-// PPC603E:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC603E:#define __UINT8_C_SUFFIX__
 // PPC603E:#define __UINT8_MAX__ 255
 // PPC603E:#define __UINT8_TYPE__ unsigned char
 // PPC603E:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC603E:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINTMAX_TYPE__ long long unsigned int
 // PPC603E:#define __UINTMAX_WIDTH__ 64
-// PPC603E:#define __UINTPTR_MAX__ 4294967295U
+// PPC603E:#define __UINTPTR_MAX__ 4294967295UL
 // PPC603E:#define __UINTPTR_TYPE__ long unsigned int
 // PPC603E:#define __UINTPTR_WIDTH__ 32
 // PPC603E:#define __UINT_FAST16_MAX__ 65535
@@ -5213,7 +5227,7 @@
 // PPC603E:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // PPC603E:#define __UINT_LEAST8_MAX__ 255
 // PPC603E:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC603E:#define __USER_LABEL_PREFIX__ _
+// PPC603E:#define __USER_LABEL_PREFIX__
 // PPC603E:#define __WCHAR_MAX__ 2147483647
 // PPC603E:#define __WCHAR_TYPE__ int
 // PPC603E:#define __WCHAR_WIDTH__ 32
@@ -5222,7 +5236,7 @@
 // PPC603E:#define __powerpc__ 1
 // PPC603E:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC64 %s
 //
 // PPC64:#define _ARCH_PPC 1
 // PPC64:#define _ARCH_PPC64 1
@@ -5270,12 +5284,12 @@
 // PPC64:#define __FLT_MIN_EXP__ (-125)
 // PPC64:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64:#define __FLT_RADIX__ 2
-// PPC64:#define __INT16_C_SUFFIX__ {{$}}
+// PPC64:#define __INT16_C_SUFFIX__
 // PPC64:#define __INT16_FMTd__ "hd"
 // PPC64:#define __INT16_FMTi__ "hi"
 // PPC64:#define __INT16_MAX__ 32767
 // PPC64:#define __INT16_TYPE__ short
-// PPC64:#define __INT32_C_SUFFIX__ {{$}}
+// PPC64:#define __INT32_C_SUFFIX__
 // PPC64:#define __INT32_FMTd__ "d"
 // PPC64:#define __INT32_FMTi__ "i"
 // PPC64:#define __INT32_MAX__ 2147483647
@@ -5285,7 +5299,7 @@
 // PPC64:#define __INT64_FMTi__ "li"
 // PPC64:#define __INT64_MAX__ 9223372036854775807L
 // PPC64:#define __INT64_TYPE__ long int
-// PPC64:#define __INT8_C_SUFFIX__ {{$}}
+// PPC64:#define __INT8_C_SUFFIX__
 // PPC64:#define __INT8_FMTd__ "hhd"
 // PPC64:#define __INT8_FMTi__ "hhi"
 // PPC64:#define __INT8_MAX__ 127
@@ -5358,7 +5372,7 @@
 // PPC64:#define __PPC__ 1
 // PPC64:#define __PTRDIFF_TYPE__ long int
 // PPC64:#define __PTRDIFF_WIDTH__ 64
-// PPC64:#define __REGISTER_PREFIX__ 
+// PPC64:#define __REGISTER_PREFIX__
 // PPC64:#define __SCHAR_MAX__ 127
 // PPC64:#define __SHRT_MAX__ 32767
 // PPC64:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -5378,7 +5392,7 @@
 // PPC64:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64:#define __SIZE_TYPE__ long unsigned int
 // PPC64:#define __SIZE_WIDTH__ 64
-// PPC64:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC64:#define __UINT16_C_SUFFIX__
 // PPC64:#define __UINT16_MAX__ 65535
 // PPC64:#define __UINT16_TYPE__ unsigned short
 // PPC64:#define __UINT32_C_SUFFIX__ U
@@ -5387,7 +5401,7 @@
 // PPC64:#define __UINT64_C_SUFFIX__ UL
 // PPC64:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64:#define __UINT64_TYPE__ long unsigned int
-// PPC64:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC64:#define __UINT8_C_SUFFIX__
 // PPC64:#define __UINT8_MAX__ 255
 // PPC64:#define __UINT8_TYPE__ unsigned char
 // PPC64:#define __UINTMAX_C_SUFFIX__ UL
@@ -5413,7 +5427,7 @@
 // PPC64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // PPC64:#define __UINT_LEAST8_MAX__ 255
 // PPC64:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC64:#define __USER_LABEL_PREFIX__ _
+// PPC64:#define __USER_LABEL_PREFIX__
 // PPC64:#define __WCHAR_MAX__ 2147483647
 // PPC64:#define __WCHAR_TYPE__ int
 // PPC64:#define __WCHAR_WIDTH__ 32
@@ -5422,7 +5436,7 @@
 // PPC64:#define __ppc64__ 1
 // PPC64:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC64LE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC64LE %s
 //
 // PPC64LE:#define _ARCH_PPC 1
 // PPC64LE:#define _ARCH_PPC64 1
@@ -5472,12 +5486,12 @@
 // PPC64LE:#define __FLT_MIN_EXP__ (-125)
 // PPC64LE:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64LE:#define __FLT_RADIX__ 2
-// PPC64LE:#define __INT16_C_SUFFIX__ {{$}}
+// PPC64LE:#define __INT16_C_SUFFIX__
 // PPC64LE:#define __INT16_FMTd__ "hd"
 // PPC64LE:#define __INT16_FMTi__ "hi"
 // PPC64LE:#define __INT16_MAX__ 32767
 // PPC64LE:#define __INT16_TYPE__ short
-// PPC64LE:#define __INT32_C_SUFFIX__ {{$}}
+// PPC64LE:#define __INT32_C_SUFFIX__
 // PPC64LE:#define __INT32_FMTd__ "d"
 // PPC64LE:#define __INT32_FMTi__ "i"
 // PPC64LE:#define __INT32_MAX__ 2147483647
@@ -5487,7 +5501,7 @@
 // PPC64LE:#define __INT64_FMTi__ "li"
 // PPC64LE:#define __INT64_MAX__ 9223372036854775807L
 // PPC64LE:#define __INT64_TYPE__ long int
-// PPC64LE:#define __INT8_C_SUFFIX__ {{$}}
+// PPC64LE:#define __INT8_C_SUFFIX__
 // PPC64LE:#define __INT8_FMTd__ "hhd"
 // PPC64LE:#define __INT8_FMTi__ "hhi"
 // PPC64LE:#define __INT8_MAX__ 127
@@ -5561,7 +5575,7 @@
 // PPC64LE:#define __PPC__ 1
 // PPC64LE:#define __PTRDIFF_TYPE__ long int
 // PPC64LE:#define __PTRDIFF_WIDTH__ 64
-// PPC64LE:#define __REGISTER_PREFIX__ 
+// PPC64LE:#define __REGISTER_PREFIX__
 // PPC64LE:#define __SCHAR_MAX__ 127
 // PPC64LE:#define __SHRT_MAX__ 32767
 // PPC64LE:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -5581,7 +5595,7 @@
 // PPC64LE:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64LE:#define __SIZE_TYPE__ long unsigned int
 // PPC64LE:#define __SIZE_WIDTH__ 64
-// PPC64LE:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC64LE:#define __UINT16_C_SUFFIX__
 // PPC64LE:#define __UINT16_MAX__ 65535
 // PPC64LE:#define __UINT16_TYPE__ unsigned short
 // PPC64LE:#define __UINT32_C_SUFFIX__ U
@@ -5590,7 +5604,7 @@
 // PPC64LE:#define __UINT64_C_SUFFIX__ UL
 // PPC64LE:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64LE:#define __UINT64_TYPE__ long unsigned int
-// PPC64LE:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC64LE:#define __UINT8_C_SUFFIX__
 // PPC64LE:#define __UINT8_MAX__ 255
 // PPC64LE:#define __UINT8_TYPE__ unsigned char
 // PPC64LE:#define __UINTMAX_C_SUFFIX__ UL
@@ -5616,7 +5630,7 @@
 // PPC64LE:#define __UINT_LEAST64_TYPE__ long unsigned int
 // PPC64LE:#define __UINT_LEAST8_MAX__ 255
 // PPC64LE:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC64LE:#define __USER_LABEL_PREFIX__ _
+// PPC64LE:#define __USER_LABEL_PREFIX__
 // PPC64LE:#define __WCHAR_MAX__ 2147483647
 // PPC64LE:#define __WCHAR_TYPE__ int
 // PPC64LE:#define __WCHAR_WIDTH__ 32
@@ -5625,7 +5639,7 @@
 // PPC64LE:#define __ppc64__ 1
 // PPC64LE:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -check-prefix PPCA2Q %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCA2Q %s
 //
 // PPCA2Q:#define _ARCH_A2 1
 // PPCA2Q:#define _ARCH_A2Q 1
@@ -5633,33 +5647,33 @@
 // PPCA2Q:#define _ARCH_PPC64 1
 // PPCA2Q:#define _ARCH_QP 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -check-prefix PPCBGQ %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCBGQ %s
 //
 // PPCBGQ:#define __THW_BLUEGENE__ 1
 // PPCBGQ:#define __TOS_BGQ__ 1
 // PPCBGQ:#define __bg__ 1
 // PPCBGQ:#define __bgq__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC630 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC630 %s
 //
 // PPC630:#define _ARCH_630 1
 // PPC630:#define _ARCH_PPC 1
 // PPC630:#define _ARCH_PPC64 1
 // PPC630:#define _ARCH_PPCGR 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr3 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR3 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr3 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR3 %s
 //
 // PPCPWR3:#define _ARCH_PPC 1
 // PPCPWR3:#define _ARCH_PPC64 1
 // PPCPWR3:#define _ARCH_PPCGR 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power3 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER3 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power3 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER3 %s
 //
 // PPCPOWER3:#define _ARCH_PPC 1
 // PPCPOWER3:#define _ARCH_PPC64 1
 // PPCPOWER3:#define _ARCH_PPCGR 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr4 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR4 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr4 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR4 %s
 //
 // PPCPWR4:#define _ARCH_PPC 1
 // PPCPWR4:#define _ARCH_PPC64 1
@@ -5667,7 +5681,7 @@
 // PPCPWR4:#define _ARCH_PPCSQ 1
 // PPCPWR4:#define _ARCH_PWR4 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power4 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER4 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power4 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER4 %s
 //
 // PPCPOWER4:#define _ARCH_PPC 1
 // PPCPOWER4:#define _ARCH_PPC64 1
@@ -5675,7 +5689,7 @@
 // PPCPOWER4:#define _ARCH_PPCSQ 1
 // PPCPOWER4:#define _ARCH_PWR4 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR5 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR5 %s
 //
 // PPCPWR5:#define _ARCH_PPC 1
 // PPCPWR5:#define _ARCH_PPC64 1
@@ -5684,7 +5698,7 @@
 // PPCPWR5:#define _ARCH_PWR4 1
 // PPCPWR5:#define _ARCH_PWR5 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER5 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER5 %s
 //
 // PPCPOWER5:#define _ARCH_PPC 1
 // PPCPOWER5:#define _ARCH_PPC64 1
@@ -5693,7 +5707,7 @@
 // PPCPOWER5:#define _ARCH_PWR4 1
 // PPCPOWER5:#define _ARCH_PWR5 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR5X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR5X %s
 //
 // PPCPWR5X:#define _ARCH_PPC 1
 // PPCPWR5X:#define _ARCH_PPC64 1
@@ -5703,7 +5717,7 @@
 // PPCPWR5X:#define _ARCH_PWR5 1
 // PPCPWR5X:#define _ARCH_PWR5X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER5X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER5X %s
 //
 // PPCPOWER5X:#define _ARCH_PPC 1
 // PPCPOWER5X:#define _ARCH_PPC64 1
@@ -5713,7 +5727,7 @@
 // PPCPOWER5X:#define _ARCH_PWR5 1
 // PPCPOWER5X:#define _ARCH_PWR5X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR6 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR6 %s
 //
 // PPCPWR6:#define _ARCH_PPC 1
 // PPCPWR6:#define _ARCH_PPC64 1
@@ -5724,7 +5738,7 @@
 // PPCPWR6:#define _ARCH_PWR5X 1
 // PPCPWR6:#define _ARCH_PWR6 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER6 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER6 %s
 //
 // PPCPOWER6:#define _ARCH_PPC 1
 // PPCPOWER6:#define _ARCH_PPC64 1
@@ -5735,7 +5749,7 @@
 // PPCPOWER6:#define _ARCH_PWR5X 1
 // PPCPOWER6:#define _ARCH_PWR6 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR6X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR6X %s
 //
 // PPCPWR6X:#define _ARCH_PPC 1
 // PPCPWR6X:#define _ARCH_PPC64 1
@@ -5747,7 +5761,7 @@
 // PPCPWR6X:#define _ARCH_PWR6 1
 // PPCPWR6X:#define _ARCH_PWR6X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER6X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER6X %s
 //
 // PPCPOWER6X:#define _ARCH_PPC 1
 // PPCPOWER6X:#define _ARCH_PPC64 1
@@ -5759,7 +5773,7 @@
 // PPCPOWER6X:#define _ARCH_PWR6 1
 // PPCPOWER6X:#define _ARCH_PWR6X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR7 %s
 //
 // PPCPWR7:#define _ARCH_PPC 1
 // PPCPWR7:#define _ARCH_PPC64 1
@@ -5772,7 +5786,7 @@
 // PPCPWR7:#define _ARCH_PWR6X 1
 // PPCPWR7:#define _ARCH_PWR7 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER7 %s
 //
 // PPCPOWER7:#define _ARCH_PPC 1
 // PPCPOWER7:#define _ARCH_PPC64 1
@@ -5785,7 +5799,7 @@
 // PPCPOWER7:#define _ARCH_PWR6X 1
 // PPCPOWER7:#define _ARCH_PWR7 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr8 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr8 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR8 %s
 //
 // PPCPWR8:#define _ARCH_PPC 1
 // PPCPWR8:#define _ARCH_PPC64 1
@@ -5799,7 +5813,7 @@
 // PPCPWR8:#define _ARCH_PWR7 1
 // PPCPWR8:#define _ARCH_PWR8 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power8 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power8 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER8 %s
 //
 // PPCPOWER8:#define _ARCH_PPC 1
 // PPCPOWER8:#define _ARCH_PPC64 1
@@ -5813,7 +5827,38 @@
 // PPCPOWER8:#define _ARCH_PWR7 1
 // PPCPOWER8:#define _ARCH_PWR8 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -check-prefix PPC64-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr9 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR9 %s
+//
+// PPCPWR9:#define _ARCH_PPC 1
+// PPCPWR9:#define _ARCH_PPC64 1
+// PPCPWR9:#define _ARCH_PPCGR 1
+// PPCPWR9:#define _ARCH_PPCSQ 1
+// PPCPWR9:#define _ARCH_PWR4 1
+// PPCPWR9:#define _ARCH_PWR5 1
+// PPCPWR9:#define _ARCH_PWR5X 1
+// PPCPWR9:#define _ARCH_PWR6 1
+// PPCPWR9:#define _ARCH_PWR6X 1
+// PPCPWR9:#define _ARCH_PWR7 1
+// PPCPWR9:#define _ARCH_PWR9 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power9 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER9 %s
+//
+// PPCPOWER9:#define _ARCH_PPC 1
+// PPCPOWER9:#define _ARCH_PPC64 1
+// PPCPOWER9:#define _ARCH_PPCGR 1
+// PPCPOWER9:#define _ARCH_PPCSQ 1
+// PPCPOWER9:#define _ARCH_PWR4 1
+// PPCPOWER9:#define _ARCH_PWR5 1
+// PPCPOWER9:#define _ARCH_PWR5X 1
+// PPCPOWER9:#define _ARCH_PWR6 1
+// PPCPOWER9:#define _ARCH_PWR6X 1
+// PPCPOWER9:#define _ARCH_PWR7 1
+// PPCPOWER9:#define _ARCH_PWR9 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-feature +float128 -target-cpu power8 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC-FLOAT128 %s
+// PPC-FLOAT128:#define __FLOAT128__ 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-LINUX %s
 //
 // PPC64-LINUX:#define _ARCH_PPC 1
 // PPC64-LINUX:#define _ARCH_PPC64 1
@@ -5855,12 +5900,12 @@
 // PPC64-LINUX:#define __FLT_MIN_EXP__ (-125)
 // PPC64-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64-LINUX:#define __FLT_RADIX__ 2
-// PPC64-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __INT16_C_SUFFIX__
 // PPC64-LINUX:#define __INT16_FMTd__ "hd"
 // PPC64-LINUX:#define __INT16_FMTi__ "hi"
 // PPC64-LINUX:#define __INT16_MAX__ 32767
 // PPC64-LINUX:#define __INT16_TYPE__ short
-// PPC64-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __INT32_C_SUFFIX__
 // PPC64-LINUX:#define __INT32_FMTd__ "d"
 // PPC64-LINUX:#define __INT32_FMTi__ "i"
 // PPC64-LINUX:#define __INT32_MAX__ 2147483647
@@ -5870,7 +5915,7 @@
 // PPC64-LINUX:#define __INT64_FMTi__ "li"
 // PPC64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // PPC64-LINUX:#define __INT64_TYPE__ long int
-// PPC64-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __INT8_C_SUFFIX__
 // PPC64-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC64-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC64-LINUX:#define __INT8_MAX__ 127
@@ -5963,7 +6008,7 @@
 // PPC64-LINUX:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __SIZE_TYPE__ long unsigned int
 // PPC64-LINUX:#define __SIZE_WIDTH__ 64
-// PPC64-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __UINT16_C_SUFFIX__
 // PPC64-LINUX:#define __UINT16_MAX__ 65535
 // PPC64-LINUX:#define __UINT16_TYPE__ unsigned short
 // PPC64-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -5972,7 +6017,7 @@
 // PPC64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // PPC64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __UINT64_TYPE__ long unsigned int
-// PPC64-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __UINT8_C_SUFFIX__
 // PPC64-LINUX:#define __UINT8_MAX__ 255
 // PPC64-LINUX:#define __UINT8_TYPE__ unsigned char
 // PPC64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
@@ -6010,17 +6055,17 @@
 // PPC64-LINUX:#define __ppc64__ 1
 // PPC64-LINUX:#define __ppc__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -check-prefix PPC64-ELFv2 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -check-prefix PPC64-ELFv2 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -check-prefix PPC64-ELFv2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s
 // PPC64-ELFv1:#define _CALL_ELF 1
 // PPC64-ELFv2:#define _CALL_ELF 2
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -fno-signed-char < /dev/null | FileCheck -check-prefix PPC %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC %s
 //
 // PPC:#define _ARCH_PPC 1
 // PPC:#define _BIG_ENDIAN 1
@@ -6061,12 +6106,12 @@
 // PPC:#define __FLT_MIN_EXP__ (-125)
 // PPC:#define __FLT_MIN__ 1.17549435e-38F
 // PPC:#define __FLT_RADIX__ 2
-// PPC:#define __INT16_C_SUFFIX__ {{$}}
+// PPC:#define __INT16_C_SUFFIX__
 // PPC:#define __INT16_FMTd__ "hd"
 // PPC:#define __INT16_FMTi__ "hi"
 // PPC:#define __INT16_MAX__ 32767
 // PPC:#define __INT16_TYPE__ short
-// PPC:#define __INT32_C_SUFFIX__ {{$}}
+// PPC:#define __INT32_C_SUFFIX__
 // PPC:#define __INT32_FMTd__ "d"
 // PPC:#define __INT32_FMTi__ "i"
 // PPC:#define __INT32_MAX__ 2147483647
@@ -6076,7 +6121,7 @@
 // PPC:#define __INT64_FMTi__ "lli"
 // PPC:#define __INT64_MAX__ 9223372036854775807LL
 // PPC:#define __INT64_TYPE__ long long int
-// PPC:#define __INT8_C_SUFFIX__ {{$}}
+// PPC:#define __INT8_C_SUFFIX__
 // PPC:#define __INT8_FMTd__ "hhd"
 // PPC:#define __INT8_FMTi__ "hhi"
 // PPC:#define __INT8_MAX__ 127
@@ -6148,7 +6193,7 @@
 // PPC:#define __PPC__ 1
 // PPC:#define __PTRDIFF_TYPE__ long int
 // PPC:#define __PTRDIFF_WIDTH__ 32
-// PPC:#define __REGISTER_PREFIX__ 
+// PPC:#define __REGISTER_PREFIX__
 // PPC:#define __SCHAR_MAX__ 127
 // PPC:#define __SHRT_MAX__ 32767
 // PPC:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -6165,10 +6210,10 @@
 // PPC:#define __SIZEOF_SIZE_T__ 4
 // PPC:#define __SIZEOF_WCHAR_T__ 4
 // PPC:#define __SIZEOF_WINT_T__ 4
-// PPC:#define __SIZE_MAX__ 4294967295U
+// PPC:#define __SIZE_MAX__ 4294967295UL
 // PPC:#define __SIZE_TYPE__ long unsigned int
 // PPC:#define __SIZE_WIDTH__ 32
-// PPC:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC:#define __UINT16_C_SUFFIX__
 // PPC:#define __UINT16_MAX__ 65535
 // PPC:#define __UINT16_TYPE__ unsigned short
 // PPC:#define __UINT32_C_SUFFIX__ U
@@ -6177,14 +6222,14 @@
 // PPC:#define __UINT64_C_SUFFIX__ ULL
 // PPC:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC:#define __UINT64_TYPE__ long long unsigned int
-// PPC:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC:#define __UINT8_C_SUFFIX__
 // PPC:#define __UINT8_MAX__ 255
 // PPC:#define __UINT8_TYPE__ unsigned char
 // PPC:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC:#define __UINTMAX_TYPE__ long long unsigned int
 // PPC:#define __UINTMAX_WIDTH__ 64
-// PPC:#define __UINTPTR_MAX__ 4294967295U
+// PPC:#define __UINTPTR_MAX__ 4294967295UL
 // PPC:#define __UINTPTR_TYPE__ long unsigned int
 // PPC:#define __UINTPTR_WIDTH__ 32
 // PPC:#define __UINT_FAST16_MAX__ 65535
@@ -6203,7 +6248,7 @@
 // PPC:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // PPC:#define __UINT_LEAST8_MAX__ 255
 // PPC:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC:#define __USER_LABEL_PREFIX__ _
+// PPC:#define __USER_LABEL_PREFIX__
 // PPC:#define __WCHAR_MAX__ 2147483647
 // PPC:#define __WCHAR_TYPE__ int
 // PPC:#define __WCHAR_WIDTH__ 32
@@ -6211,7 +6256,7 @@
 // PPC:#define __WINT_WIDTH__ 32
 // PPC:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -check-prefix PPC-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC-LINUX %s
 //
 // PPC-LINUX:#define _ARCH_PPC 1
 // PPC-LINUX:#define _BIG_ENDIAN 1
@@ -6252,12 +6297,12 @@
 // PPC-LINUX:#define __FLT_MIN_EXP__ (-125)
 // PPC-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-LINUX:#define __FLT_RADIX__ 2
-// PPC-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __INT16_C_SUFFIX__
 // PPC-LINUX:#define __INT16_FMTd__ "hd"
 // PPC-LINUX:#define __INT16_FMTi__ "hi"
 // PPC-LINUX:#define __INT16_MAX__ 32767
 // PPC-LINUX:#define __INT16_TYPE__ short
-// PPC-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __INT32_C_SUFFIX__
 // PPC-LINUX:#define __INT32_FMTd__ "d"
 // PPC-LINUX:#define __INT32_FMTi__ "i"
 // PPC-LINUX:#define __INT32_MAX__ 2147483647
@@ -6267,7 +6312,7 @@
 // PPC-LINUX:#define __INT64_FMTi__ "lli"
 // PPC-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-LINUX:#define __INT64_TYPE__ long long int
-// PPC-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __INT8_C_SUFFIX__
 // PPC-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC-LINUX:#define __INT8_MAX__ 127
@@ -6359,7 +6404,7 @@
 // PPC-LINUX:#define __SIZE_MAX__ 4294967295U
 // PPC-LINUX:#define __SIZE_TYPE__ unsigned int
 // PPC-LINUX:#define __SIZE_WIDTH__ 32
-// PPC-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __UINT16_C_SUFFIX__
 // PPC-LINUX:#define __UINT16_MAX__ 65535
 // PPC-LINUX:#define __UINT16_TYPE__ unsigned short
 // PPC-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -6368,7 +6413,7 @@
 // PPC-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // PPC-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-LINUX:#define __UINT64_TYPE__ long long unsigned int
-// PPC-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __UINT8_C_SUFFIX__
 // PPC-LINUX:#define __UINT8_MAX__ 255
 // PPC-LINUX:#define __UINT8_TYPE__ unsigned char
 // PPC-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
@@ -6404,7 +6449,7 @@
 // PPC-LINUX:#define __powerpc__ 1
 // PPC-LINUX:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-apple-darwin8 < /dev/null | FileCheck -check-prefix PPC-DARWIN %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-apple-darwin8 < /dev/null | FileCheck -match-full-lines -check-prefix PPC-DARWIN %s
 //
 // PPC-DARWIN:#define _ARCH_PPC 1
 // PPC-DARWIN:#define _BIG_ENDIAN 1
@@ -6443,12 +6488,12 @@
 // PPC-DARWIN:#define __FLT_MIN_EXP__ (-125)
 // PPC-DARWIN:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-DARWIN:#define __FLT_RADIX__ 2
-// PPC-DARWIN:#define __INT16_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __INT16_C_SUFFIX__
 // PPC-DARWIN:#define __INT16_FMTd__ "hd"
 // PPC-DARWIN:#define __INT16_FMTi__ "hi"
 // PPC-DARWIN:#define __INT16_MAX__ 32767
 // PPC-DARWIN:#define __INT16_TYPE__ short
-// PPC-DARWIN:#define __INT32_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __INT32_C_SUFFIX__
 // PPC-DARWIN:#define __INT32_FMTd__ "d"
 // PPC-DARWIN:#define __INT32_FMTi__ "i"
 // PPC-DARWIN:#define __INT32_MAX__ 2147483647
@@ -6458,7 +6503,7 @@
 // PPC-DARWIN:#define __INT64_FMTi__ "lli"
 // PPC-DARWIN:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-DARWIN:#define __INT64_TYPE__ long long int
-// PPC-DARWIN:#define __INT8_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __INT8_C_SUFFIX__
 // PPC-DARWIN:#define __INT8_FMTd__ "hhd"
 // PPC-DARWIN:#define __INT8_FMTi__ "hhi"
 // PPC-DARWIN:#define __INT8_MAX__ 127
@@ -6533,7 +6578,7 @@
 // PPC-DARWIN:#define __PPC__ 1
 // PPC-DARWIN:#define __PTRDIFF_TYPE__ int
 // PPC-DARWIN:#define __PTRDIFF_WIDTH__ 32
-// PPC-DARWIN:#define __REGISTER_PREFIX__ 
+// PPC-DARWIN:#define __REGISTER_PREFIX__
 // PPC-DARWIN:#define __SCHAR_MAX__ 127
 // PPC-DARWIN:#define __SHRT_MAX__ 32767
 // PPC-DARWIN:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -6556,7 +6601,7 @@
 // PPC-DARWIN:#define __STDC_HOSTED__ 0
 // PPC-DARWIN:#define __STDC_VERSION__ 201112L
 // PPC-DARWIN:#define __STDC__ 1
-// PPC-DARWIN:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __UINT16_C_SUFFIX__
 // PPC-DARWIN:#define __UINT16_MAX__ 65535
 // PPC-DARWIN:#define __UINT16_TYPE__ unsigned short
 // PPC-DARWIN:#define __UINT32_C_SUFFIX__ U
@@ -6565,14 +6610,14 @@
 // PPC-DARWIN:#define __UINT64_C_SUFFIX__ ULL
 // PPC-DARWIN:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-DARWIN:#define __UINT64_TYPE__ long long unsigned int
-// PPC-DARWIN:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __UINT8_C_SUFFIX__
 // PPC-DARWIN:#define __UINT8_MAX__ 255
 // PPC-DARWIN:#define __UINT8_TYPE__ unsigned char
 // PPC-DARWIN:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC-DARWIN:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC-DARWIN:#define __UINTMAX_TYPE__ long long unsigned int
 // PPC-DARWIN:#define __UINTMAX_WIDTH__ 64
-// PPC-DARWIN:#define __UINTPTR_MAX__ 4294967295U
+// PPC-DARWIN:#define __UINTPTR_MAX__ 4294967295UL
 // PPC-DARWIN:#define __UINTPTR_TYPE__ long unsigned int
 // PPC-DARWIN:#define __UINTPTR_WIDTH__ 32
 // PPC-DARWIN:#define __UINT_FAST16_MAX__ 65535
@@ -6600,8 +6645,8 @@
 // PPC-DARWIN:#define __powerpc__ 1
 // PPC-DARWIN:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=amdgcn < /dev/null | FileCheck -check-prefix AMDGCN --check-prefix AMDGPU %s
-// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=r600 -target-cpu caicos < /dev/null | FileCheck --check-prefix AMDGPU %s
+// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=amdgcn < /dev/null | FileCheck -match-full-lines -check-prefix AMDGCN --check-prefix AMDGPU %s
+// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=r600 -target-cpu caicos < /dev/null | FileCheck -match-full-lines --check-prefix AMDGPU %s
 //
 // AMDGPU:#define cl_khr_byte_addressable_store 1
 // AMDGCN:#define cl_khr_fp64 1
@@ -6610,7 +6655,7 @@
 // AMDGPU:#define cl_khr_local_int32_base_atomics 1
 // AMDGPU:#define cl_khr_local_int32_extended_atomics 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=s390x-none-none -fno-signed-char < /dev/null | FileCheck -check-prefix S390X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=s390x-none-none -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix S390X %s
 //
 // S390X:#define __BIGGEST_ALIGNMENT__ 8
 // S390X:#define __CHAR16_TYPE__ unsigned short
@@ -6646,12 +6691,12 @@
 // S390X:#define __FLT_MIN_EXP__ (-125)
 // S390X:#define __FLT_MIN__ 1.17549435e-38F
 // S390X:#define __FLT_RADIX__ 2
-// S390X:#define __INT16_C_SUFFIX__ {{$}}
+// S390X:#define __INT16_C_SUFFIX__
 // S390X:#define __INT16_FMTd__ "hd"
 // S390X:#define __INT16_FMTi__ "hi"
 // S390X:#define __INT16_MAX__ 32767
 // S390X:#define __INT16_TYPE__ short
-// S390X:#define __INT32_C_SUFFIX__ {{$}}
+// S390X:#define __INT32_C_SUFFIX__
 // S390X:#define __INT32_FMTd__ "d"
 // S390X:#define __INT32_FMTi__ "i"
 // S390X:#define __INT32_MAX__ 2147483647
@@ -6661,7 +6706,7 @@
 // S390X:#define __INT64_FMTi__ "li"
 // S390X:#define __INT64_MAX__ 9223372036854775807L
 // S390X:#define __INT64_TYPE__ long int
-// S390X:#define __INT8_C_SUFFIX__ {{$}}
+// S390X:#define __INT8_C_SUFFIX__
 // S390X:#define __INT8_FMTd__ "hhd"
 // S390X:#define __INT8_FMTi__ "hhi"
 // S390X:#define __INT8_MAX__ 127
@@ -6747,7 +6792,7 @@
 // S390X:#define __SIZEOF_WINT_T__ 4
 // S390X:#define __SIZE_TYPE__ long unsigned int
 // S390X:#define __SIZE_WIDTH__ 64
-// S390X:#define __UINT16_C_SUFFIX__ {{$}}
+// S390X:#define __UINT16_C_SUFFIX__
 // S390X:#define __UINT16_MAX__ 65535
 // S390X:#define __UINT16_TYPE__ unsigned short
 // S390X:#define __UINT32_C_SUFFIX__ U
@@ -6756,7 +6801,7 @@
 // S390X:#define __UINT64_C_SUFFIX__ UL
 // S390X:#define __UINT64_MAX__ 18446744073709551615UL
 // S390X:#define __UINT64_TYPE__ long unsigned int
-// S390X:#define __UINT8_C_SUFFIX__ {{$}}
+// S390X:#define __UINT8_C_SUFFIX__
 // S390X:#define __UINT8_MAX__ 255
 // S390X:#define __UINT8_TYPE__ unsigned char
 // S390X:#define __UINTMAX_C_SUFFIX__ UL
@@ -6782,7 +6827,7 @@
 // S390X:#define __UINT_LEAST64_TYPE__ long unsigned int
 // S390X:#define __UINT_LEAST8_MAX__ 255
 // S390X:#define __UINT_LEAST8_TYPE__ unsigned char
-// S390X:#define __USER_LABEL_PREFIX__ _
+// S390X:#define __USER_LABEL_PREFIX__
 // S390X:#define __WCHAR_MAX__ 2147483647
 // S390X:#define __WCHAR_TYPE__ int
 // S390X:#define __WCHAR_WIDTH__ 32
@@ -6791,7 +6836,10 @@
 // S390X:#define __s390__ 1
 // S390X:#define __s390x__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-none < /dev/null | FileCheck -check-prefix SPARC %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-none < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-rtems-elf < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-NETOPENBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-NETOPENBSD %s
 //
 // SPARC-NOT:#define _LP64
 // SPARC:#define __BIGGEST_ALIGNMENT__ 8
@@ -6829,12 +6877,12 @@
 // SPARC:#define __FLT_MIN_EXP__ (-125)
 // SPARC:#define __FLT_MIN__ 1.17549435e-38F
 // SPARC:#define __FLT_RADIX__ 2
-// SPARC:#define __INT16_C_SUFFIX__ {{$}}
+// SPARC:#define __INT16_C_SUFFIX__
 // SPARC:#define __INT16_FMTd__ "hd"
 // SPARC:#define __INT16_FMTi__ "hi"
 // SPARC:#define __INT16_MAX__ 32767
 // SPARC:#define __INT16_TYPE__ short
-// SPARC:#define __INT32_C_SUFFIX__ {{$}}
+// SPARC:#define __INT32_C_SUFFIX__
 // SPARC:#define __INT32_FMTd__ "d"
 // SPARC:#define __INT32_FMTi__ "i"
 // SPARC:#define __INT32_MAX__ 2147483647
@@ -6844,7 +6892,7 @@
 // SPARC:#define __INT64_FMTi__ "lli"
 // SPARC:#define __INT64_MAX__ 9223372036854775807LL
 // SPARC:#define __INT64_TYPE__ long long int
-// SPARC:#define __INT8_C_SUFFIX__ {{$}}
+// SPARC:#define __INT8_C_SUFFIX__
 // SPARC:#define __INT8_FMTd__ "hhd"
 // SPARC:#define __INT8_FMTi__ "hhi"
 // SPARC:#define __INT8_MAX__ 127
@@ -6855,10 +6903,14 @@
 // SPARC:#define __INTMAX_MAX__ 9223372036854775807LL
 // SPARC:#define __INTMAX_TYPE__ long long int
 // SPARC:#define __INTMAX_WIDTH__ 64
-// SPARC:#define __INTPTR_FMTd__ "d"
-// SPARC:#define __INTPTR_FMTi__ "i"
-// SPARC:#define __INTPTR_MAX__ 2147483647
-// SPARC:#define __INTPTR_TYPE__ int
+// SPARC-DEFAULT:#define __INTPTR_FMTd__ "d"
+// SPARC-DEFAULT:#define __INTPTR_FMTi__ "i"
+// SPARC-DEFAULT:#define __INTPTR_MAX__ 2147483647
+// SPARC-DEFAULT:#define __INTPTR_TYPE__ int
+// SPARC-NETOPENBSD:#define __INTPTR_FMTd__ "ld"
+// SPARC-NETOPENBSD:#define __INTPTR_FMTi__ "li"
+// SPARC-NETOPENBSD:#define __INTPTR_MAX__ 2147483647L
+// SPARC-NETOPENBSD:#define __INTPTR_TYPE__ long int
 // SPARC:#define __INTPTR_WIDTH__ 32
 // SPARC:#define __INT_FAST16_FMTd__ "hd"
 // SPARC:#define __INT_FAST16_FMTi__ "hi"
@@ -6910,7 +6962,8 @@
 // SPARC:#define __LONG_MAX__ 2147483647L
 // SPARC-NOT:#define __LP64__
 // SPARC:#define __POINTER_WIDTH__ 32
-// SPARC:#define __PTRDIFF_TYPE__ int
+// SPARC-DEFAULT:#define __PTRDIFF_TYPE__ int
+// SPARC-NETOPENBSD:#define __PTRDIFF_TYPE__ long int
 // SPARC:#define __PTRDIFF_WIDTH__ 32
 // SPARC:#define __REGISTER_PREFIX__
 // SPARC:#define __SCHAR_MAX__ 127
@@ -6929,10 +6982,12 @@
 // SPARC:#define __SIZEOF_SIZE_T__ 4
 // SPARC:#define __SIZEOF_WCHAR_T__ 4
 // SPARC:#define __SIZEOF_WINT_T__ 4
-// SPARC:#define __SIZE_MAX__ 4294967295U
-// SPARC:#define __SIZE_TYPE__ unsigned int
+// SPARC-DEFAULT:#define __SIZE_MAX__ 4294967295U
+// SPARC-DEFAULT:#define __SIZE_TYPE__ unsigned int
+// SPARC-NETOPENBSD:#define __SIZE_MAX__ 4294967295UL
+// SPARC-NETOPENBSD:#define __SIZE_TYPE__ long unsigned int
 // SPARC:#define __SIZE_WIDTH__ 32
-// SPARC:#define __UINT16_C_SUFFIX__ {{$}}
+// SPARC:#define __UINT16_C_SUFFIX__
 // SPARC:#define __UINT16_MAX__ 65535
 // SPARC:#define __UINT16_TYPE__ unsigned short
 // SPARC:#define __UINT32_C_SUFFIX__ U
@@ -6941,15 +6996,17 @@
 // SPARC:#define __UINT64_C_SUFFIX__ ULL
 // SPARC:#define __UINT64_MAX__ 18446744073709551615ULL
 // SPARC:#define __UINT64_TYPE__ long long unsigned int
-// SPARC:#define __UINT8_C_SUFFIX__ {{$}}
+// SPARC:#define __UINT8_C_SUFFIX__
 // SPARC:#define __UINT8_MAX__ 255
 // SPARC:#define __UINT8_TYPE__ unsigned char
 // SPARC:#define __UINTMAX_C_SUFFIX__ ULL
 // SPARC:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // SPARC:#define __UINTMAX_TYPE__ long long unsigned int
 // SPARC:#define __UINTMAX_WIDTH__ 64
-// SPARC:#define __UINTPTR_MAX__ 4294967295U
-// SPARC:#define __UINTPTR_TYPE__ unsigned int
+// SPARC-DEFAULT:#define __UINTPTR_MAX__ 4294967295U
+// SPARC-DEFAULT:#define __UINTPTR_TYPE__ unsigned int
+// SPARC-NETOPENBSD:#define __UINTPTR_MAX__ 4294967295UL
+// SPARC-NETOPENBSD:#define __UINTPTR_TYPE__ long unsigned int
 // SPARC:#define __UINTPTR_WIDTH__ 32
 // SPARC:#define __UINT_FAST16_MAX__ 65535
 // SPARC:#define __UINT_FAST16_TYPE__ unsigned short
@@ -6967,8 +7024,8 @@
 // SPARC:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // SPARC:#define __UINT_LEAST8_MAX__ 255
 // SPARC:#define __UINT_LEAST8_TYPE__ unsigned char
-// SPARC:#define __USER_LABEL_PREFIX__ _
-// SPARC:#define __VERSION__ "4.2.1 Compatible
+// SPARC:#define __USER_LABEL_PREFIX__
+// SPARC:#define __VERSION__ "4.2.1 Compatible{{.*}}
 // SPARC:#define __WCHAR_MAX__ 2147483647
 // SPARC:#define __WCHAR_TYPE__ int
 // SPARC:#define __WCHAR_WIDTH__ 32
@@ -6978,18 +7035,8 @@
 // SPARC:#define __sparc__ 1
 // SPARC:#define __sparcv8 1
 // SPARC:#define sparc 1
-// 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-netbsd < /dev/null | FileCheck -check-prefix SPARC-NETOPENBSD %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-openbsd < /dev/null | FileCheck -check-prefix SPARC-NETOPENBSD %s
-// SPARC-NETOPENBSD:#define __INTPTR_FMTd__ "ld"
-// SPARC-NETOPENBSD:#define __INTPTR_FMTi__ "li"
-// SPARC-NETOPENBSD:#define __INTPTR_MAX__ 2147483647L
-// SPARC-NETOPENBSD:#define __INTPTR_TYPE__ long int
-// SPARC-NETOPENBSD:#define __PTRDIFF_TYPE__ long int
-// SPARC-NETOPENBSD:#define __SIZE_TYPE__ long unsigned int
-// SPARC-NETOPENBSD:#define __UINTPTR_TYPE__ long unsigned int
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=tce-none-none < /dev/null | FileCheck -check-prefix TCE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=tce-none-none < /dev/null | FileCheck -match-full-lines -check-prefix TCE %s
 //
 // TCE-NOT:#define _LP64
 // TCE:#define __BIGGEST_ALIGNMENT__ 4
@@ -7027,17 +7074,17 @@
 // TCE:#define __FLT_MIN_EXP__ (-125)
 // TCE:#define __FLT_MIN__ 1.17549435e-38F
 // TCE:#define __FLT_RADIX__ 2
-// TCE:#define __INT16_C_SUFFIX__ {{$}}
+// TCE:#define __INT16_C_SUFFIX__
 // TCE:#define __INT16_FMTd__ "hd"
 // TCE:#define __INT16_FMTi__ "hi"
 // TCE:#define __INT16_MAX__ 32767
 // TCE:#define __INT16_TYPE__ short
-// TCE:#define __INT32_C_SUFFIX__ {{$}}
+// TCE:#define __INT32_C_SUFFIX__
 // TCE:#define __INT32_FMTd__ "d"
 // TCE:#define __INT32_FMTi__ "i"
 // TCE:#define __INT32_MAX__ 2147483647
 // TCE:#define __INT32_TYPE__ int
-// TCE:#define __INT8_C_SUFFIX__ {{$}}
+// TCE:#define __INT8_C_SUFFIX__
 // TCE:#define __INT8_FMTd__ "hhd"
 // TCE:#define __INT8_FMTi__ "hhi"
 // TCE:#define __INT8_MAX__ 127
@@ -7118,13 +7165,13 @@
 // TCE:#define __SIZE_WIDTH__ 32
 // TCE:#define __TCE_V1__ 1
 // TCE:#define __TCE__ 1
-// TCE:#define __UINT16_C_SUFFIX__ {{$}}
+// TCE:#define __UINT16_C_SUFFIX__
 // TCE:#define __UINT16_MAX__ 65535
 // TCE:#define __UINT16_TYPE__ unsigned short
 // TCE:#define __UINT32_C_SUFFIX__ U
 // TCE:#define __UINT32_MAX__ 4294967295U
 // TCE:#define __UINT32_TYPE__ unsigned int
-// TCE:#define __UINT8_C_SUFFIX__ {{$}}
+// TCE:#define __UINT8_C_SUFFIX__
 // TCE:#define __UINT8_MAX__ 255
 // TCE:#define __UINT8_TYPE__ unsigned char
 // TCE:#define __UINTMAX_C_SUFFIX__ UL
@@ -7146,7 +7193,7 @@
 // TCE:#define __UINT_LEAST32_TYPE__ unsigned int
 // TCE:#define __UINT_LEAST8_MAX__ 255
 // TCE:#define __UINT_LEAST8_TYPE__ unsigned char
-// TCE:#define __USER_LABEL_PREFIX__ _
+// TCE:#define __USER_LABEL_PREFIX__
 // TCE:#define __WCHAR_MAX__ 2147483647
 // TCE:#define __WCHAR_TYPE__ int
 // TCE:#define __WCHAR_WIDTH__ 32
@@ -7156,7 +7203,7 @@
 // TCE:#define __tce__ 1
 // TCE:#define tce 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none < /dev/null | FileCheck -check-prefix X86_64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix X86_64 %s
 //
 // X86_64:#define _LP64 1
 // X86_64-NOT:#define _LP32 1
@@ -7194,12 +7241,12 @@
 // X86_64:#define __FLT_MIN_EXP__ (-125)
 // X86_64:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64:#define __FLT_RADIX__ 2
-// X86_64:#define __INT16_C_SUFFIX__ {{$}}
+// X86_64:#define __INT16_C_SUFFIX__
 // X86_64:#define __INT16_FMTd__ "hd"
 // X86_64:#define __INT16_FMTi__ "hi"
 // X86_64:#define __INT16_MAX__ 32767
 // X86_64:#define __INT16_TYPE__ short
-// X86_64:#define __INT32_C_SUFFIX__ {{$}}
+// X86_64:#define __INT32_C_SUFFIX__
 // X86_64:#define __INT32_FMTd__ "d"
 // X86_64:#define __INT32_FMTi__ "i"
 // X86_64:#define __INT32_MAX__ 2147483647
@@ -7209,7 +7256,7 @@
 // X86_64:#define __INT64_FMTi__ "li"
 // X86_64:#define __INT64_MAX__ 9223372036854775807L
 // X86_64:#define __INT64_TYPE__ long int
-// X86_64:#define __INT8_C_SUFFIX__ {{$}}
+// X86_64:#define __INT8_C_SUFFIX__
 // X86_64:#define __INT8_FMTd__ "hhd"
 // X86_64:#define __INT8_FMTi__ "hhi"
 // X86_64:#define __INT8_MAX__ 127
@@ -7281,7 +7328,7 @@
 // X86_64:#define __POINTER_WIDTH__ 64
 // X86_64:#define __PTRDIFF_TYPE__ long int
 // X86_64:#define __PTRDIFF_WIDTH__ 64
-// X86_64:#define __REGISTER_PREFIX__ 
+// X86_64:#define __REGISTER_PREFIX__
 // X86_64:#define __SCHAR_MAX__ 127
 // X86_64:#define __SHRT_MAX__ 32767
 // X86_64:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -7305,7 +7352,7 @@
 // X86_64:#define __SSE2__ 1
 // X86_64:#define __SSE_MATH__ 1
 // X86_64:#define __SSE__ 1
-// X86_64:#define __UINT16_C_SUFFIX__ {{$}}
+// X86_64:#define __UINT16_C_SUFFIX__
 // X86_64:#define __UINT16_MAX__ 65535
 // X86_64:#define __UINT16_TYPE__ unsigned short
 // X86_64:#define __UINT32_C_SUFFIX__ U
@@ -7314,7 +7361,7 @@
 // X86_64:#define __UINT64_C_SUFFIX__ UL
 // X86_64:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64:#define __UINT64_TYPE__ long unsigned int
-// X86_64:#define __UINT8_C_SUFFIX__ {{$}}
+// X86_64:#define __UINT8_C_SUFFIX__
 // X86_64:#define __UINT8_MAX__ 255
 // X86_64:#define __UINT8_TYPE__ unsigned char
 // X86_64:#define __UINTMAX_C_SUFFIX__ UL
@@ -7340,7 +7387,7 @@
 // X86_64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // X86_64:#define __UINT_LEAST8_MAX__ 255
 // X86_64:#define __UINT_LEAST8_TYPE__ unsigned char
-// X86_64:#define __USER_LABEL_PREFIX__ _
+// X86_64:#define __USER_LABEL_PREFIX__
 // X86_64:#define __WCHAR_MAX__ 2147483647
 // X86_64:#define __WCHAR_TYPE__ int
 // X86_64:#define __WCHAR_WIDTH__ 32
@@ -7351,14 +7398,14 @@
 // X86_64:#define __x86_64 1
 // X86_64:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64h-none-none < /dev/null | FileCheck -check-prefix X86_64H %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64h-none-none < /dev/null | FileCheck -match-full-lines -check-prefix X86_64H %s
 //
 // X86_64H:#define __x86_64 1
 // X86_64H:#define __x86_64__ 1
 // X86_64H:#define __x86_64h 1
 // X86_64H:#define __x86_64h__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none-gnux32 < /dev/null | FileCheck -check-prefix X32 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none-gnux32 < /dev/null | FileCheck -match-full-lines -check-prefix X32 %s
 //
 // X32:#define _ILP32 1
 // X32-NOT:#define _LP64 1
@@ -7398,22 +7445,22 @@
 // X32:#define __FLT_RADIX__ 2
 // X32:#define __ILP32__ 1
 // X32-NOT:#define __LP64__ 1
-// X32:#define __INT16_C_SUFFIX__ {{$}}
+// X32:#define __INT16_C_SUFFIX__
 // X32:#define __INT16_FMTd__ "hd"
 // X32:#define __INT16_FMTi__ "hi"
 // X32:#define __INT16_MAX__ 32767
 // X32:#define __INT16_TYPE__ short
-// X32:#define __INT32_C_SUFFIX__ {{$}}
+// X32:#define __INT32_C_SUFFIX__
 // X32:#define __INT32_FMTd__ "d"
 // X32:#define __INT32_FMTi__ "i"
 // X32:#define __INT32_MAX__ 2147483647
 // X32:#define __INT32_TYPE__ int
-// X32:#define __INT64_C_SUFFIX__ L
+// X32:#define __INT64_C_SUFFIX__ LL
 // X32:#define __INT64_FMTd__ "lld"
 // X32:#define __INT64_FMTi__ "lli"
-// X32:#define __INT64_MAX__ 9223372036854775807L
+// X32:#define __INT64_MAX__ 9223372036854775807LL
 // X32:#define __INT64_TYPE__ long long int
-// X32:#define __INT8_C_SUFFIX__ {{$}}
+// X32:#define __INT8_C_SUFFIX__
 // X32:#define __INT8_FMTd__ "hhd"
 // X32:#define __INT8_FMTi__ "hhi"
 // X32:#define __INT8_MAX__ 127
@@ -7421,7 +7468,7 @@
 // X32:#define __INTMAX_C_SUFFIX__ LL
 // X32:#define __INTMAX_FMTd__ "lld"
 // X32:#define __INTMAX_FMTi__ "lli"
-// X32:#define __INTMAX_MAX__ 9223372036854775807L
+// X32:#define __INTMAX_MAX__ 9223372036854775807LL
 // X32:#define __INTMAX_TYPE__ long long int
 // X32:#define __INTMAX_WIDTH__ 64
 // X32:#define __INTPTR_FMTd__ "d"
@@ -7439,7 +7486,7 @@
 // X32:#define __INT_FAST32_TYPE__ int
 // X32:#define __INT_FAST64_FMTd__ "lld"
 // X32:#define __INT_FAST64_FMTi__ "lli"
-// X32:#define __INT_FAST64_MAX__ 9223372036854775807L
+// X32:#define __INT_FAST64_MAX__ 9223372036854775807LL
 // X32:#define __INT_FAST64_TYPE__ long long int
 // X32:#define __INT_FAST8_FMTd__ "hhd"
 // X32:#define __INT_FAST8_FMTi__ "hhi"
@@ -7455,7 +7502,7 @@
 // X32:#define __INT_LEAST32_TYPE__ int
 // X32:#define __INT_LEAST64_FMTd__ "lld"
 // X32:#define __INT_LEAST64_FMTi__ "lli"
-// X32:#define __INT_LEAST64_MAX__ 9223372036854775807L
+// X32:#define __INT_LEAST64_MAX__ 9223372036854775807LL
 // X32:#define __INT_LEAST64_TYPE__ long long int
 // X32:#define __INT_LEAST8_FMTd__ "hhd"
 // X32:#define __INT_LEAST8_FMTi__ "hhi"
@@ -7483,7 +7530,7 @@
 // X32:#define __POINTER_WIDTH__ 32
 // X32:#define __PTRDIFF_TYPE__ int
 // X32:#define __PTRDIFF_WIDTH__ 32
-// X32:#define __REGISTER_PREFIX__ 
+// X32:#define __REGISTER_PREFIX__
 // X32:#define __SCHAR_MAX__ 127
 // X32:#define __SHRT_MAX__ 32767
 // X32:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -7507,16 +7554,16 @@
 // X32:#define __SSE2__ 1
 // X32:#define __SSE_MATH__ 1
 // X32:#define __SSE__ 1
-// X32:#define __UINT16_C_SUFFIX__ {{$}}
+// X32:#define __UINT16_C_SUFFIX__
 // X32:#define __UINT16_MAX__ 65535
 // X32:#define __UINT16_TYPE__ unsigned short
 // X32:#define __UINT32_C_SUFFIX__ U
 // X32:#define __UINT32_MAX__ 4294967295U
 // X32:#define __UINT32_TYPE__ unsigned int
-// X32:#define __UINT64_C_SUFFIX__ UL
+// X32:#define __UINT64_C_SUFFIX__ ULL
 // X32:#define __UINT64_MAX__ 18446744073709551615ULL
 // X32:#define __UINT64_TYPE__ long long unsigned int
-// X32:#define __UINT8_C_SUFFIX__ {{$}}
+// X32:#define __UINT8_C_SUFFIX__
 // X32:#define __UINT8_MAX__ 255
 // X32:#define __UINT8_TYPE__ unsigned char
 // X32:#define __UINTMAX_C_SUFFIX__ ULL
@@ -7542,7 +7589,7 @@
 // X32:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // X32:#define __UINT_LEAST8_MAX__ 255
 // X32:#define __UINT_LEAST8_TYPE__ unsigned char
-// X32:#define __USER_LABEL_PREFIX__ _
+// X32:#define __USER_LABEL_PREFIX__
 // X32:#define __WCHAR_MAX__ 2147483647
 // X32:#define __WCHAR_TYPE__ int
 // X32:#define __WCHAR_WIDTH__ 32
@@ -7553,7 +7600,7 @@
 // X32:#define __x86_64 1
 // X32:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-cloudabi < /dev/null | FileCheck -check-prefix X86_64-CLOUDABI %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-cloudabi < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-CLOUDABI %s
 //
 // X86_64-CLOUDABI:#define _LP64 1
 // X86_64-CLOUDABI:#define __ATOMIC_ACQUIRE 2
@@ -7618,13 +7665,12 @@
 // X86_64-CLOUDABI:#define __GNUC_STDC_INLINE__ 1
 // X86_64-CLOUDABI:#define __GNUC__ 4
 // X86_64-CLOUDABI:#define __GXX_ABI_VERSION 1002
-// X86_64-CLOUDABI:#define __GXX_RTTI 1
-// X86_64-CLOUDABI:#define __INT16_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __INT16_C_SUFFIX__
 // X86_64-CLOUDABI:#define __INT16_FMTd__ "hd"
 // X86_64-CLOUDABI:#define __INT16_FMTi__ "hi"
 // X86_64-CLOUDABI:#define __INT16_MAX__ 32767
 // X86_64-CLOUDABI:#define __INT16_TYPE__ short
-// X86_64-CLOUDABI:#define __INT32_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __INT32_C_SUFFIX__
 // X86_64-CLOUDABI:#define __INT32_FMTd__ "d"
 // X86_64-CLOUDABI:#define __INT32_FMTi__ "i"
 // X86_64-CLOUDABI:#define __INT32_MAX__ 2147483647
@@ -7634,7 +7680,7 @@
 // X86_64-CLOUDABI:#define __INT64_FMTi__ "li"
 // X86_64-CLOUDABI:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-CLOUDABI:#define __INT64_TYPE__ long int
-// X86_64-CLOUDABI:#define __INT8_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __INT8_C_SUFFIX__
 // X86_64-CLOUDABI:#define __INT8_FMTd__ "hhd"
 // X86_64-CLOUDABI:#define __INT8_FMTi__ "hhi"
 // X86_64-CLOUDABI:#define __INT8_MAX__ 127
@@ -7714,7 +7760,7 @@
 // X86_64-CLOUDABI:#define __PTRDIFF_MAX__ 9223372036854775807L
 // X86_64-CLOUDABI:#define __PTRDIFF_TYPE__ long int
 // X86_64-CLOUDABI:#define __PTRDIFF_WIDTH__ 64
-// X86_64-CLOUDABI:#define __REGISTER_PREFIX__ 
+// X86_64-CLOUDABI:#define __REGISTER_PREFIX__
 // X86_64-CLOUDABI:#define __SCHAR_MAX__ 127
 // X86_64-CLOUDABI:#define __SHRT_MAX__ 32767
 // X86_64-CLOUDABI:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -7749,7 +7795,7 @@
 // X86_64-CLOUDABI:#define __STDC_UTF_32__ 1
 // X86_64-CLOUDABI:#define __STDC_VERSION__ 201112L
 // X86_64-CLOUDABI:#define __STDC__ 1
-// X86_64-CLOUDABI:#define __UINT16_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __UINT16_C_SUFFIX__
 // X86_64-CLOUDABI:#define __UINT16_FMTX__ "hX"
 // X86_64-CLOUDABI:#define __UINT16_FMTo__ "ho"
 // X86_64-CLOUDABI:#define __UINT16_FMTu__ "hu"
@@ -7770,7 +7816,7 @@
 // X86_64-CLOUDABI:#define __UINT64_FMTx__ "lx"
 // X86_64-CLOUDABI:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-CLOUDABI:#define __UINT64_TYPE__ long unsigned int
-// X86_64-CLOUDABI:#define __UINT8_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __UINT8_C_SUFFIX__
 // X86_64-CLOUDABI:#define __UINT8_FMTX__ "hhX"
 // X86_64-CLOUDABI:#define __UINT8_FMTo__ "hho"
 // X86_64-CLOUDABI:#define __UINT8_FMTu__ "hhu"
@@ -7840,8 +7886,8 @@
 // X86_64-CLOUDABI:#define __UINT_LEAST8_FMTx__ "hhx"
 // X86_64-CLOUDABI:#define __UINT_LEAST8_MAX__ 255
 // X86_64-CLOUDABI:#define __UINT_LEAST8_TYPE__ unsigned char
-// X86_64-CLOUDABI:#define __USER_LABEL_PREFIX__ 
-// X86_64-CLOUDABI:#define __VERSION__ "4.2.1 Compatible
+// X86_64-CLOUDABI:#define __USER_LABEL_PREFIX__
+// X86_64-CLOUDABI:#define __VERSION__ "4.2.1 Compatible{{.*}}
 // X86_64-CLOUDABI:#define __WCHAR_MAX__ 2147483647
 // X86_64-CLOUDABI:#define __WCHAR_TYPE__ int
 // X86_64-CLOUDABI:#define __WCHAR_WIDTH__ 32
@@ -7850,15 +7896,15 @@
 // X86_64-CLOUDABI:#define __amd64 1
 // X86_64-CLOUDABI:#define __amd64__ 1
 // X86_64-CLOUDABI:#define __clang__ 1
-// X86_64-CLOUDABI:#define __clang_major__ 
-// X86_64-CLOUDABI:#define __clang_minor__ 
-// X86_64-CLOUDABI:#define __clang_patchlevel__ 
-// X86_64-CLOUDABI:#define __clang_version__ 
+// X86_64-CLOUDABI:#define __clang_major__ {{.*}}
+// X86_64-CLOUDABI:#define __clang_minor__ {{.*}}
+// X86_64-CLOUDABI:#define __clang_patchlevel__ {{.*}}
+// X86_64-CLOUDABI:#define __clang_version__ {{.*}}
 // X86_64-CLOUDABI:#define __llvm__ 1
 // X86_64-CLOUDABI:#define __x86_64 1
 // X86_64-CLOUDABI:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-linux-gnu < /dev/null | FileCheck -check-prefix X86_64-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-LINUX %s
 //
 // X86_64-LINUX:#define _LP64 1
 // X86_64-LINUX:#define __BIGGEST_ALIGNMENT__ 16
@@ -7895,12 +7941,12 @@
 // X86_64-LINUX:#define __FLT_MIN_EXP__ (-125)
 // X86_64-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64-LINUX:#define __FLT_RADIX__ 2
-// X86_64-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __INT16_C_SUFFIX__
 // X86_64-LINUX:#define __INT16_FMTd__ "hd"
 // X86_64-LINUX:#define __INT16_FMTi__ "hi"
 // X86_64-LINUX:#define __INT16_MAX__ 32767
 // X86_64-LINUX:#define __INT16_TYPE__ short
-// X86_64-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __INT32_C_SUFFIX__
 // X86_64-LINUX:#define __INT32_FMTd__ "d"
 // X86_64-LINUX:#define __INT32_FMTi__ "i"
 // X86_64-LINUX:#define __INT32_MAX__ 2147483647
@@ -7910,7 +7956,7 @@
 // X86_64-LINUX:#define __INT64_FMTi__ "li"
 // X86_64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-LINUX:#define __INT64_TYPE__ long int
-// X86_64-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __INT8_C_SUFFIX__
 // X86_64-LINUX:#define __INT8_FMTd__ "hhd"
 // X86_64-LINUX:#define __INT8_FMTi__ "hhi"
 // X86_64-LINUX:#define __INT8_MAX__ 127
@@ -7981,7 +8027,7 @@
 // X86_64-LINUX:#define __POINTER_WIDTH__ 64
 // X86_64-LINUX:#define __PTRDIFF_TYPE__ long int
 // X86_64-LINUX:#define __PTRDIFF_WIDTH__ 64
-// X86_64-LINUX:#define __REGISTER_PREFIX__ 
+// X86_64-LINUX:#define __REGISTER_PREFIX__
 // X86_64-LINUX:#define __SCHAR_MAX__ 127
 // X86_64-LINUX:#define __SHRT_MAX__ 32767
 // X86_64-LINUX:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -8005,7 +8051,7 @@
 // X86_64-LINUX:#define __SSE2__ 1
 // X86_64-LINUX:#define __SSE_MATH__ 1
 // X86_64-LINUX:#define __SSE__ 1
-// X86_64-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __UINT16_C_SUFFIX__
 // X86_64-LINUX:#define __UINT16_MAX__ 65535
 // X86_64-LINUX:#define __UINT16_TYPE__ unsigned short
 // X86_64-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -8014,7 +8060,7 @@
 // X86_64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // X86_64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-LINUX:#define __UINT64_TYPE__ long unsigned int
-// X86_64-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __UINT8_C_SUFFIX__
 // X86_64-LINUX:#define __UINT8_MAX__ 255
 // X86_64-LINUX:#define __UINT8_TYPE__ unsigned char
 // X86_64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
@@ -8051,7 +8097,7 @@
 // X86_64-LINUX:#define __x86_64 1
 // X86_64-LINUX:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-freebsd9.1 < /dev/null | FileCheck -check-prefix X86_64-FREEBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-freebsd9.1 < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-FREEBSD %s
 //
 // X86_64-FREEBSD:#define __DBL_DECIMAL_DIG__ 17
 // X86_64-FREEBSD:#define __FLT_DECIMAL_DIG__ 9
@@ -8060,7 +8106,7 @@
 // X86_64-FREEBSD:#define __LDBL_DECIMAL_DIG__ 21
 // X86_64-FREEBSD:#define __STDC_MB_MIGHT_NEQ_WC__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-netbsd < /dev/null | FileCheck -check-prefix X86_64-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-NETBSD %s
 //
 // X86_64-NETBSD:#define _LP64 1
 // X86_64-NETBSD:#define __BIGGEST_ALIGNMENT__ 16
@@ -8097,12 +8143,12 @@
 // X86_64-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // X86_64-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64-NETBSD:#define __FLT_RADIX__ 2
-// X86_64-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __INT16_C_SUFFIX__
 // X86_64-NETBSD:#define __INT16_FMTd__ "hd"
 // X86_64-NETBSD:#define __INT16_FMTi__ "hi"
 // X86_64-NETBSD:#define __INT16_MAX__ 32767
 // X86_64-NETBSD:#define __INT16_TYPE__ short
-// X86_64-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __INT32_C_SUFFIX__
 // X86_64-NETBSD:#define __INT32_FMTd__ "d"
 // X86_64-NETBSD:#define __INT32_FMTi__ "i"
 // X86_64-NETBSD:#define __INT32_MAX__ 2147483647
@@ -8112,7 +8158,7 @@
 // X86_64-NETBSD:#define __INT64_FMTi__ "li"
 // X86_64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-NETBSD:#define __INT64_TYPE__ long int
-// X86_64-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __INT8_C_SUFFIX__
 // X86_64-NETBSD:#define __INT8_FMTd__ "hhd"
 // X86_64-NETBSD:#define __INT8_FMTi__ "hhi"
 // X86_64-NETBSD:#define __INT8_MAX__ 127
@@ -8183,7 +8229,7 @@
 // X86_64-NETBSD:#define __POINTER_WIDTH__ 64
 // X86_64-NETBSD:#define __PTRDIFF_TYPE__ long int
 // X86_64-NETBSD:#define __PTRDIFF_WIDTH__ 64
-// X86_64-NETBSD:#define __REGISTER_PREFIX__ 
+// X86_64-NETBSD:#define __REGISTER_PREFIX__
 // X86_64-NETBSD:#define __SCHAR_MAX__ 127
 // X86_64-NETBSD:#define __SHRT_MAX__ 32767
 // X86_64-NETBSD:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -8207,7 +8253,7 @@
 // X86_64-NETBSD:#define __SSE2__ 1
 // X86_64-NETBSD:#define __SSE_MATH__ 1
 // X86_64-NETBSD:#define __SSE__ 1
-// X86_64-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __UINT16_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT16_MAX__ 65535
 // X86_64-NETBSD:#define __UINT16_TYPE__ unsigned short
 // X86_64-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -8216,7 +8262,7 @@
 // X86_64-NETBSD:#define __UINT64_C_SUFFIX__ UL
 // X86_64-NETBSD:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-NETBSD:#define __UINT64_TYPE__ long unsigned int
-// X86_64-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __UINT8_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT8_MAX__ 255
 // X86_64-NETBSD:#define __UINT8_TYPE__ unsigned char
 // X86_64-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
@@ -8253,7 +8299,7 @@
 // X86_64-NETBSD:#define __x86_64 1
 // X86_64-NETBSD:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-scei-ps4 < /dev/null | FileCheck -check-prefix PS4 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-scei-ps4 < /dev/null | FileCheck -match-full-lines -check-prefix PS4 %s
 //
 // PS4:#define _LP64 1
 // PS4:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
@@ -8323,12 +8369,12 @@
 // PS4:#define __LP64__ 1
 // PS4:#define __MMX__ 1
 // PS4:#define __NO_MATH_INLINES 1
+// PS4:#define __ORBIS__ 1
 // PS4:#define __POINTER_WIDTH__ 64
-// PS4:#define __PS4__ 1
 // PS4:#define __PTRDIFF_MAX__ 9223372036854775807L
 // PS4:#define __PTRDIFF_TYPE__ long int
 // PS4:#define __PTRDIFF_WIDTH__ 64
-// PS4:#define __REGISTER_PREFIX__ 
+// PS4:#define __REGISTER_PREFIX__
 // PS4:#define __SCHAR_MAX__ 127
 // PS4:#define __SHRT_MAX__ 32767
 // PS4:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -8351,6 +8397,7 @@
 // PS4:#define __SSE2__ 1
 // PS4:#define __SSE_MATH__ 1
 // PS4:#define __SSE__ 1
+// PS4:#define __STDC_VERSION__ 199901L
 // PS4:#define __UINTMAX_TYPE__ long unsigned int
 // PS4:#define __USER_LABEL_PREFIX__
 // PS4:#define __WCHAR_MAX__ 65535
@@ -8366,11 +8413,11 @@
 // PS4:#define __x86_64 1
 // PS4:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-mingw32 < /dev/null | FileCheck -check-prefix X86-64-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -fms-extensions -triple=x86_64-unknown-mingw32 < /dev/null | FileCheck -check-prefix X86-64-DECLSPEC %s
-// X86-64-DECLSPEC: #define __declspec
+// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix X86-64-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -fms-extensions -triple=x86_64-unknown-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix X86-64-DECLSPEC %s
+// X86-64-DECLSPEC: #define __declspec{{.*}}
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-none < /dev/null | FileCheck -check-prefix SPARCV9 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix SPARCV9 %s
 // SPARCV9:#define __INT64_TYPE__ long int
 // SPARCV9:#define __INTMAX_C_SUFFIX__ L
 // SPARCV9:#define __INTMAX_TYPE__ long int
@@ -8381,668 +8428,676 @@
 // SPARCV9:#define __SIZEOF_POINTER__ 8
 // SPARCV9:#define __UINTPTR_TYPE__ long unsigned int
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-openbsd < /dev/null | FileCheck -check-prefix SPARC64-OBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC64-OBSD %s
 // SPARC64-OBSD:#define __INT64_TYPE__ long long int
 // SPARC64-OBSD:#define __INTMAX_C_SUFFIX__ LL
 // SPARC64-OBSD:#define __INTMAX_TYPE__ long long int
 // SPARC64-OBSD:#define __UINTMAX_C_SUFFIX__ ULL
 // SPARC64-OBSD:#define __UINTMAX_TYPE__ long long unsigned int
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-kfreebsd-gnu < /dev/null | FileCheck -check-prefix KFREEBSD-DEFINE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-kfreebsd-gnu < /dev/null | FileCheck -match-full-lines -check-prefix KFREEBSD-DEFINE %s
 // KFREEBSD-DEFINE:#define __FreeBSD_kernel__ 1
 // KFREEBSD-DEFINE:#define __GLIBC__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i686-pc-kfreebsd-gnu < /dev/null | FileCheck -check-prefix KFREEBSDI686-DEFINE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i686-pc-kfreebsd-gnu < /dev/null | FileCheck -match-full-lines -check-prefix KFREEBSDI686-DEFINE %s
 // KFREEBSDI686-DEFINE:#define __FreeBSD_kernel__ 1
 // KFREEBSDI686-DEFINE:#define __GLIBC__ 1
 //
-// RUN: %clang_cc1 -x c++ -triple i686-pc-linux-gnu -fobjc-runtime=gcc -E -dM < /dev/null | FileCheck -check-prefix GNUSOURCE %s
+// RUN: %clang_cc1 -x c++ -triple i686-pc-linux-gnu -fobjc-runtime=gcc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GNUSOURCE %s
 // GNUSOURCE:#define _GNU_SOURCE 1
 //
-// RUN: %clang_cc1 -x c++ -std=c++98 -fno-rtti -E -dM < /dev/null | FileCheck -check-prefix NORTTI %s
-// NORTTI: __GXX_ABI_VERSION
+// RUN: %clang_cc1 -x c++ -std=c++98 -fno-rtti -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix NORTTI %s
+// NORTTI: #define __GXX_ABI_VERSION {{.*}}
 // NORTTI-NOT:#define __GXX_RTTI
-// NORTTI: __STDC__
+// NORTTI:#define __STDC__ 1
 //
-// RUN: %clang_cc1 -triple arm-linux-androideabi -E -dM < /dev/null | FileCheck -check-prefix ANDROID %s
-// ANDROID: __ANDROID__ 1
+// RUN: %clang_cc1 -triple arm-linux-androideabi -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix ANDROID %s
+// ANDROID:#define __ANDROID__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd < /dev/null | FileCheck -check-prefix PPC64-FREEBSD %s
+// RUN: %clang_cc1 -triple lanai-unknown-unknown -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix LANAI %s
+// LANAI: #define __lanai__ 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-FREEBSD %s
 // PPC64-FREEBSD-NOT: #define __LONG_DOUBLE_128__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -check-prefix XCORE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
 // XCORE:#define __XS1B__ 1
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=wasm32-unknown-unknown \
 // RUN:   < /dev/null \
-// RUN:   | FileCheck -check-prefix=WEBASSEMBLY32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix=WEBASSEMBLY32 %s
 //
-// WEBASSEMBLY32:#define _ILP32 1{{$}}
+// WEBASSEMBLY32:#define _ILP32 1
 // WEBASSEMBLY32-NOT:#define _LP64
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQUIRE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQ_REL 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_CONSUME 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELAXED 0{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELEASE 3{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_SEQ_CST 5{{$}}
-// WEBASSEMBLY32-NEXT:#define __BIGGEST_ALIGNMENT__ 16{{$}}
-// WEBASSEMBLY32-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__{{$}}
-// WEBASSEMBLY32-NEXT:#define __CHAR16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __CHAR32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __CHAR_BIT__ 8{{$}}
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQUIRE 2
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQ_REL 4
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_CONSUME 1
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELAXED 0
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELEASE 3
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_SEQ_CST 5
+// WEBASSEMBLY32-NEXT:#define __BIGGEST_ALIGNMENT__ 16
+// WEBASSEMBLY32-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// WEBASSEMBLY32-NEXT:#define __CHAR16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __CHAR32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __CHAR_BIT__ 8
 // WEBASSEMBLY32-NOT:#define __CHAR_UNSIGNED__
-// WEBASSEMBLY32-NEXT:#define __CONSTANT_CFSTRINGS__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_DECIMAL_DIG__ 17{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_DIG__ 15{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MANT_DIG__ 53{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MAX_10_EXP__ 308{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MAX_EXP__ 1024{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MIN_10_EXP__ (-307){{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MIN_EXP__ (-1021){{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308{{$}}
-// WEBASSEMBLY32-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__{{$}}
+// WEBASSEMBLY32-NEXT:#define __CONSTANT_CFSTRINGS__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_DECIMAL_DIG__ 17
+// WEBASSEMBLY32-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// WEBASSEMBLY32-NEXT:#define __DBL_DIG__ 15
+// WEBASSEMBLY32-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16
+// WEBASSEMBLY32-NEXT:#define __DBL_HAS_DENORM__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_HAS_INFINITY__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_MANT_DIG__ 53
+// WEBASSEMBLY32-NEXT:#define __DBL_MAX_10_EXP__ 308
+// WEBASSEMBLY32-NEXT:#define __DBL_MAX_EXP__ 1024
+// WEBASSEMBLY32-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308
+// WEBASSEMBLY32-NEXT:#define __DBL_MIN_10_EXP__ (-307)
+// WEBASSEMBLY32-NEXT:#define __DBL_MIN_EXP__ (-1021)
+// WEBASSEMBLY32-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308
+// WEBASSEMBLY32-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
 // WEBASSEMBLY32-NOT:#define __ELF__
-// WEBASSEMBLY32-NEXT:#define __FINITE_MATH_ONLY__ 0{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_DECIMAL_DIG__ 9{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_DIG__ 6{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_EVAL_METHOD__ 0{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MANT_DIG__ 24{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MAX_10_EXP__ 38{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MAX_EXP__ 128{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MAX__ 3.40282347e+38F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MIN_10_EXP__ (-37){{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MIN_EXP__ (-125){{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MIN__ 1.17549435e-38F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_RADIX__ 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GNUC_MINOR__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __GNUC_PATCHLEVEL__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __GNUC_STDC_INLINE__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __GNUC__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __GXX_ABI_VERSION 1002{{$}}
-// WEBASSEMBLY32-NEXT:#define __GXX_RTTI 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __ILP32__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_TYPE__ short{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_FMTd__ "d"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_FMTi__ "i"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_TYPE__ int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_TYPE__ signed char{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTd__ "ld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTi__ "li"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_MAX__ 2147483647L{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_TYPE__ long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_TYPE__ short{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_TYPE__ int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_TYPE__ short{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_TYPE__ int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_DECIMAL_DIG__ 36{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_DIG__ 33{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MANT_DIG__ 113{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_10_EXP__ 4932{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_EXP__ 16384{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_10_EXP__ (-4931){{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_EXP__ (-16381){{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LITTLE_ENDIAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __LONG_MAX__ 2147483647L{{$}}
+// WEBASSEMBLY32-NEXT:#define __FINITE_MATH_ONLY__ 0
+// WEBASSEMBLY32-NEXT:#define __FLT_DECIMAL_DIG__ 9
+// WEBASSEMBLY32-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F
+// WEBASSEMBLY32-NEXT:#define __FLT_DIG__ 6
+// WEBASSEMBLY32-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F
+// WEBASSEMBLY32-NEXT:#define __FLT_EVAL_METHOD__ 0
+// WEBASSEMBLY32-NEXT:#define __FLT_HAS_DENORM__ 1
+// WEBASSEMBLY32-NEXT:#define __FLT_HAS_INFINITY__ 1
+// WEBASSEMBLY32-NEXT:#define __FLT_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY32-NEXT:#define __FLT_MANT_DIG__ 24
+// WEBASSEMBLY32-NEXT:#define __FLT_MAX_10_EXP__ 38
+// WEBASSEMBLY32-NEXT:#define __FLT_MAX_EXP__ 128
+// WEBASSEMBLY32-NEXT:#define __FLT_MAX__ 3.40282347e+38F
+// WEBASSEMBLY32-NEXT:#define __FLT_MIN_10_EXP__ (-37)
+// WEBASSEMBLY32-NEXT:#define __FLT_MIN_EXP__ (-125)
+// WEBASSEMBLY32-NEXT:#define __FLT_MIN__ 1.17549435e-38F
+// WEBASSEMBLY32-NEXT:#define __FLT_RADIX__ 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GNUC_MINOR__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __GNUC_PATCHLEVEL__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __GNUC_STDC_INLINE__ 1
+// WEBASSEMBLY32-NEXT:#define __GNUC__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __GXX_ABI_VERSION 1002
+// WEBASSEMBLY32-NEXT:#define __ILP32__ 1
+// WEBASSEMBLY32-NEXT:#define __INT16_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __INT16_FMTd__ "hd"
+// WEBASSEMBLY32-NEXT:#define __INT16_FMTi__ "hi"
+// WEBASSEMBLY32-NEXT:#define __INT16_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __INT16_TYPE__ short
+// WEBASSEMBLY32-NEXT:#define __INT32_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __INT32_FMTd__ "d"
+// WEBASSEMBLY32-NEXT:#define __INT32_FMTi__ "i"
+// WEBASSEMBLY32-NEXT:#define __INT32_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __INT32_TYPE__ int
+// WEBASSEMBLY32-NEXT:#define __INT64_C_SUFFIX__ LL
+// WEBASSEMBLY32-NEXT:#define __INT64_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INT64_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INT64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INT64_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INT8_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __INT8_FMTd__ "hhd"
+// WEBASSEMBLY32-NEXT:#define __INT8_FMTi__ "hhi"
+// WEBASSEMBLY32-NEXT:#define __INT8_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __INT8_TYPE__ signed char
+// WEBASSEMBLY32-NEXT:#define __INTMAX_C_SUFFIX__ LL
+// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INTMAX_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INTMAX_WIDTH__ 64
+// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTd__ "ld"
+// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTi__ "li"
+// WEBASSEMBLY32-NEXT:#define __INTPTR_MAX__ 2147483647L
+// WEBASSEMBLY32-NEXT:#define __INTPTR_TYPE__ long int
+// WEBASSEMBLY32-NEXT:#define __INTPTR_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTd__ "hd"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTi__ "hi"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_TYPE__ short
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTd__ "d"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTi__ "i"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_TYPE__ int
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTd__ "hhd"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTi__ "hhi"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_TYPE__ signed char
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTd__ "hd"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTi__ "hi"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_TYPE__ short
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTd__ "d"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTi__ "i"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_TYPE__ int
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTd__ "hhd"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTi__ "hhi"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_TYPE__ signed char
+// WEBASSEMBLY32-NEXT:#define __INT_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __LDBL_DECIMAL_DIG__ 36
+// WEBASSEMBLY32-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
+// WEBASSEMBLY32-NEXT:#define __LDBL_DIG__ 33
+// WEBASSEMBLY32-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
+// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_DENORM__ 1
+// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_INFINITY__ 1
+// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY32-NEXT:#define __LDBL_MANT_DIG__ 113
+// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_10_EXP__ 4932
+// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_EXP__ 16384
+// WEBASSEMBLY32-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
+// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_10_EXP__ (-4931)
+// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_EXP__ (-16381)
+// WEBASSEMBLY32-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
+// WEBASSEMBLY32-NEXT:#define __LITTLE_ENDIAN__ 1
+// WEBASSEMBLY32-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __LONG_MAX__ 2147483647L
 // WEBASSEMBLY32-NOT:#define __LP64__
-// WEBASSEMBLY32-NEXT:#define __NO_INLINE__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __ORDER_BIG_ENDIAN__ 4321{{$}}
-// WEBASSEMBLY32-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234{{$}}
-// WEBASSEMBLY32-NEXT:#define __ORDER_PDP_ENDIAN__ 3412{{$}}
-// WEBASSEMBLY32-NEXT:#define __POINTER_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTd__ "ld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTi__ "li"{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_MAX__ 2147483647L{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_TYPE__ long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_WIDTH__ 32{{$}}
+// WEBASSEMBLY32-NEXT:#define __NO_INLINE__ 1
+// WEBASSEMBLY32-NEXT:#define __ORDER_BIG_ENDIAN__ 4321
+// WEBASSEMBLY32-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234
+// WEBASSEMBLY32-NEXT:#define __ORDER_PDP_ENDIAN__ 3412
+// WEBASSEMBLY32-NEXT:#define __POINTER_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTd__ "ld"
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTi__ "li"
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_MAX__ 2147483647L
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_TYPE__ long int
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_WIDTH__ 32
 // WEBASSEMBLY32-NOT:#define __REGISTER_PREFIX__
-// WEBASSEMBLY32-NEXT:#define __SCHAR_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __SHRT_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_MAX__ 2147483647L{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_DOUBLE__ 8{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_FLOAT__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT128__ 16{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_LONG__ 8{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_POINTER__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_PTRDIFF_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_SHORT__ 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_SIZE_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_WCHAR_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_WINT_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTX__ "lX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTo__ "lo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTu__ "lu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTx__ "lx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_MAX__ 4294967295UL{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC_HOSTED__ 0{{$}}
+// WEBASSEMBLY32-NEXT:#define __SCHAR_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __SHRT_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_MAX__ 2147483647L
+// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_DOUBLE__ 8
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_FLOAT__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT128__ 16
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_LONG__ 8
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_POINTER__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_PTRDIFF_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_SHORT__ 2
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_SIZE_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_WCHAR_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_WINT_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTX__ "lX"
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTo__ "lo"
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTu__ "lu"
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTx__ "lx"
+// WEBASSEMBLY32-NEXT:#define __SIZE_MAX__ 4294967295UL
+// WEBASSEMBLY32-NEXT:#define __SIZE_TYPE__ long unsigned int
+// WEBASSEMBLY32-NEXT:#define __SIZE_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __STDC_HOSTED__ 0
 // WEBASSEMBLY32-NOT:#define __STDC_MB_MIGHT_NEQ_WC__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_ATOMICS__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_COMPLEX__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_VLA__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_THREADS__
-// WEBASSEMBLY32-NEXT:#define __STDC_UTF_16__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC_UTF_32__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC_VERSION__ 201112L{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_MAX__ 65535{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_C_SUFFIX__ U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTX__ "X"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTo__ "o"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTu__ "u"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTx__ "x"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_MAX__ 255{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTX__ "lX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTo__ "lo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTu__ "lu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTx__ "lx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_MAX__ 4294967295UL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_MAX__ 255{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_MAX__ 255{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY32-NEXT:#define __USER_LABEL_PREFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __VERSION__ "{{.*}}"{{$}}
-// WEBASSEMBLY32-NEXT:#define __WCHAR_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __WCHAR_TYPE__ int{{$}}
+// WEBASSEMBLY32-NEXT:#define __STDC_UTF_16__ 1
+// WEBASSEMBLY32-NEXT:#define __STDC_UTF_32__ 1
+// WEBASSEMBLY32-NEXT:#define __STDC_VERSION__ 201112L
+// WEBASSEMBLY32-NEXT:#define __STDC__ 1
+// WEBASSEMBLY32-NEXT:#define __UINT16_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTX__ "hX"
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTo__ "ho"
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTu__ "hu"
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTx__ "hx"
+// WEBASSEMBLY32-NEXT:#define __UINT16_MAX__ 65535
+// WEBASSEMBLY32-NEXT:#define __UINT16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __UINT32_C_SUFFIX__ U
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTX__ "X"
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTo__ "o"
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTu__ "u"
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTx__ "x"
+// WEBASSEMBLY32-NEXT:#define __UINT32_MAX__ 4294967295U
+// WEBASSEMBLY32-NEXT:#define __UINT32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT64_C_SUFFIX__ ULL
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINT64_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT8_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTX__ "hhX"
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTo__ "hho"
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTu__ "hhu"
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTx__ "hhx"
+// WEBASSEMBLY32-NEXT:#define __UINT8_MAX__ 255
+// WEBASSEMBLY32-NEXT:#define __UINT8_TYPE__ unsigned char
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_C_SUFFIX__ ULL
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_WIDTH__ 64
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTX__ "lX"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTo__ "lo"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTu__ "lu"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTx__ "lx"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_MAX__ 4294967295UL
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_TYPE__ long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTX__ "hX"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTo__ "ho"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTu__ "hu"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTx__ "hx"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_MAX__ 65535
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTX__ "X"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTo__ "o"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTu__ "u"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTx__ "x"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_MAX__ 4294967295U
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTX__ "hhX"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTo__ "hho"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTu__ "hhu"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTx__ "hhx"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_MAX__ 255
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_TYPE__ unsigned char
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTX__ "hX"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTo__ "ho"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTu__ "hu"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTx__ "hx"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_MAX__ 65535
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTX__ "X"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTo__ "o"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTu__ "u"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTx__ "x"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTo__ "hho"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_MAX__ 255
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char
+// WEBASSEMBLY32-NEXT:#define __USER_LABEL_PREFIX__
+// WEBASSEMBLY32-NEXT:#define __VERSION__ "{{.*}}"
+// WEBASSEMBLY32-NEXT:#define __WCHAR_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __WCHAR_TYPE__ int
 // WEBASSEMBLY32-NOT:#define __WCHAR_UNSIGNED__
-// WEBASSEMBLY32-NEXT:#define __WCHAR_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __WINT_TYPE__ int{{$}}
+// WEBASSEMBLY32-NEXT:#define __WCHAR_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __WINT_TYPE__ int
 // WEBASSEMBLY32-NOT:#define __WINT_UNSIGNED__
-// WEBASSEMBLY32-NEXT:#define __WINT_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __clang__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __clang_major__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __clang_minor__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __clang_patchlevel__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __clang_version__ "{{.*}}"{{$}}
-// WEBASSEMBLY32-NEXT:#define __llvm__ 1{{$}}
+// WEBASSEMBLY32-NEXT:#define __WINT_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __clang__ 1
+// WEBASSEMBLY32-NEXT:#define __clang_major__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __clang_minor__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __clang_patchlevel__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __clang_version__ "{{.*}}"
+// WEBASSEMBLY32-NEXT:#define __llvm__ 1
 // WEBASSEMBLY32-NOT:#define __wasm_simd128__
 // WEBASSEMBLY32-NOT:#define __wasm_simd256__
 // WEBASSEMBLY32-NOT:#define __wasm_simd512__
 // WEBASSEMBLY32-NOT:#define __unix
 // WEBASSEMBLY32-NOT:#define __unix__
-// WEBASSEMBLY32-NEXT:#define __wasm 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __wasm32 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __wasm32__ 1{{$}}
+// WEBASSEMBLY32-NEXT:#define __wasm 1
+// WEBASSEMBLY32-NEXT:#define __wasm32 1
+// WEBASSEMBLY32-NEXT:#define __wasm32__ 1
 // WEBASSEMBLY32-NOT:#define __wasm64
 // WEBASSEMBLY32-NOT:#define __wasm64__
-// WEBASSEMBLY32-NEXT:#define __wasm__ 1{{$}}
+// WEBASSEMBLY32-NEXT:#define __wasm__ 1
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=wasm64-unknown-unknown \
 // RUN:   < /dev/null \
-// RUN:   | FileCheck -check-prefix=WEBASSEMBLY64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix=WEBASSEMBLY64 %s
 //
 // WEBASSEMBLY64-NOT:#define _ILP32
-// WEBASSEMBLY64:#define _LP64 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQUIRE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQ_REL 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_CONSUME 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELAXED 0{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELEASE 3{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_SEQ_CST 5{{$}}
-// WEBASSEMBLY64-NEXT:#define __BIGGEST_ALIGNMENT__ 16{{$}}
-// WEBASSEMBLY64-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__{{$}}
-// WEBASSEMBLY64-NEXT:#define __CHAR16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __CHAR32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __CHAR_BIT__ 8{{$}}
+// WEBASSEMBLY64:#define _LP64 1
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQUIRE 2
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQ_REL 4
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_CONSUME 1
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELAXED 0
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELEASE 3
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_SEQ_CST 5
+// WEBASSEMBLY64-NEXT:#define __BIGGEST_ALIGNMENT__ 16
+// WEBASSEMBLY64-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// WEBASSEMBLY64-NEXT:#define __CHAR16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __CHAR32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __CHAR_BIT__ 8
 // WEBASSEMBLY64-NOT:#define __CHAR_UNSIGNED__
-// WEBASSEMBLY64-NEXT:#define __CONSTANT_CFSTRINGS__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_DECIMAL_DIG__ 17{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_DIG__ 15{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MANT_DIG__ 53{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MAX_10_EXP__ 308{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MAX_EXP__ 1024{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MIN_10_EXP__ (-307){{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MIN_EXP__ (-1021){{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308{{$}}
-// WEBASSEMBLY64-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__{{$}}
+// WEBASSEMBLY64-NEXT:#define __CONSTANT_CFSTRINGS__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_DECIMAL_DIG__ 17
+// WEBASSEMBLY64-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// WEBASSEMBLY64-NEXT:#define __DBL_DIG__ 15
+// WEBASSEMBLY64-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16
+// WEBASSEMBLY64-NEXT:#define __DBL_HAS_DENORM__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_HAS_INFINITY__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_MANT_DIG__ 53
+// WEBASSEMBLY64-NEXT:#define __DBL_MAX_10_EXP__ 308
+// WEBASSEMBLY64-NEXT:#define __DBL_MAX_EXP__ 1024
+// WEBASSEMBLY64-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308
+// WEBASSEMBLY64-NEXT:#define __DBL_MIN_10_EXP__ (-307)
+// WEBASSEMBLY64-NEXT:#define __DBL_MIN_EXP__ (-1021)
+// WEBASSEMBLY64-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308
+// WEBASSEMBLY64-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
 // WEBASSEMBLY64-NOT:#define __ELF__
-// WEBASSEMBLY64-NEXT:#define __FINITE_MATH_ONLY__ 0{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_DECIMAL_DIG__ 9{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_DIG__ 6{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_EVAL_METHOD__ 0{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MANT_DIG__ 24{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MAX_10_EXP__ 38{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MAX_EXP__ 128{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MAX__ 3.40282347e+38F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MIN_10_EXP__ (-37){{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MIN_EXP__ (-125){{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MIN__ 1.17549435e-38F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_RADIX__ 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GNUC_MINOR__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __GNUC_PATCHLEVEL__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __GNUC_STDC_INLINE__ 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __FINITE_MATH_ONLY__ 0
+// WEBASSEMBLY64-NEXT:#define __FLT_DECIMAL_DIG__ 9
+// WEBASSEMBLY64-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F
+// WEBASSEMBLY64-NEXT:#define __FLT_DIG__ 6
+// WEBASSEMBLY64-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F
+// WEBASSEMBLY64-NEXT:#define __FLT_EVAL_METHOD__ 0
+// WEBASSEMBLY64-NEXT:#define __FLT_HAS_DENORM__ 1
+// WEBASSEMBLY64-NEXT:#define __FLT_HAS_INFINITY__ 1
+// WEBASSEMBLY64-NEXT:#define __FLT_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY64-NEXT:#define __FLT_MANT_DIG__ 24
+// WEBASSEMBLY64-NEXT:#define __FLT_MAX_10_EXP__ 38
+// WEBASSEMBLY64-NEXT:#define __FLT_MAX_EXP__ 128
+// WEBASSEMBLY64-NEXT:#define __FLT_MAX__ 3.40282347e+38F
+// WEBASSEMBLY64-NEXT:#define __FLT_MIN_10_EXP__ (-37)
+// WEBASSEMBLY64-NEXT:#define __FLT_MIN_EXP__ (-125)
+// WEBASSEMBLY64-NEXT:#define __FLT_MIN__ 1.17549435e-38F
+// WEBASSEMBLY64-NEXT:#define __FLT_RADIX__ 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GNUC_MINOR__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __GNUC_PATCHLEVEL__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __GNUC_STDC_INLINE__ 1
 // WEBASSEMBLY64-NEXT:#define __GNUC__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __GXX_ABI_VERSION 1002{{$}}
-// WEBASSEMBLY64-NEXT:#define __GXX_RTTI 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __GXX_ABI_VERSION 1002
 // WEBASSEMBLY64-NOT:#define __ILP32__
-// WEBASSEMBLY64-NEXT:#define __INT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_TYPE__ short{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_FMTd__ "d"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_FMTi__ "i"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_TYPE__ int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_TYPE__ signed char{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTd__ "ld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTi__ "li"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_TYPE__ long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_TYPE__ short{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_TYPE__ int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_TYPE__ short{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_TYPE__ int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_DECIMAL_DIG__ 36{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_DIG__ 33{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MANT_DIG__ 113{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_10_EXP__ 4932{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_EXP__ 16384{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_10_EXP__ (-4931){{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_EXP__ (-16381){{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LITTLE_ENDIAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LP64__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __NO_INLINE__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __ORDER_BIG_ENDIAN__ 4321{{$}}
-// WEBASSEMBLY64-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234{{$}}
-// WEBASSEMBLY64-NEXT:#define __ORDER_PDP_ENDIAN__ 3412{{$}}
-// WEBASSEMBLY64-NEXT:#define __POINTER_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTd__ "ld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTi__ "li"{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_TYPE__ long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_WIDTH__ 64{{$}}
+// WEBASSEMBLY64-NEXT:#define __INT16_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __INT16_FMTd__ "hd"
+// WEBASSEMBLY64-NEXT:#define __INT16_FMTi__ "hi"
+// WEBASSEMBLY64-NEXT:#define __INT16_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __INT16_TYPE__ short
+// WEBASSEMBLY64-NEXT:#define __INT32_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __INT32_FMTd__ "d"
+// WEBASSEMBLY64-NEXT:#define __INT32_FMTi__ "i"
+// WEBASSEMBLY64-NEXT:#define __INT32_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __INT32_TYPE__ int
+// WEBASSEMBLY64-NEXT:#define __INT64_C_SUFFIX__ LL
+// WEBASSEMBLY64-NEXT:#define __INT64_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INT64_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INT64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INT64_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INT8_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __INT8_FMTd__ "hhd"
+// WEBASSEMBLY64-NEXT:#define __INT8_FMTi__ "hhi"
+// WEBASSEMBLY64-NEXT:#define __INT8_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __INT8_TYPE__ signed char
+// WEBASSEMBLY64-NEXT:#define __INTMAX_C_SUFFIX__ LL
+// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INTMAX_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INTMAX_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTd__ "ld"
+// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTi__ "li"
+// WEBASSEMBLY64-NEXT:#define __INTPTR_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __INTPTR_TYPE__ long int
+// WEBASSEMBLY64-NEXT:#define __INTPTR_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTd__ "hd"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTi__ "hi"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_TYPE__ short
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTd__ "d"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTi__ "i"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_TYPE__ int
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTd__ "hhd"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTi__ "hhi"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_TYPE__ signed char
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTd__ "hd"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTi__ "hi"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_TYPE__ short
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTd__ "d"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTi__ "i"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_TYPE__ int
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTd__ "hhd"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTi__ "hhi"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_TYPE__ signed char
+// WEBASSEMBLY64-NEXT:#define __INT_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __LDBL_DECIMAL_DIG__ 36
+// WEBASSEMBLY64-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
+// WEBASSEMBLY64-NEXT:#define __LDBL_DIG__ 33
+// WEBASSEMBLY64-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
+// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_DENORM__ 1
+// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_INFINITY__ 1
+// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY64-NEXT:#define __LDBL_MANT_DIG__ 113
+// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_10_EXP__ 4932
+// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_EXP__ 16384
+// WEBASSEMBLY64-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
+// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_10_EXP__ (-4931)
+// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_EXP__ (-16381)
+// WEBASSEMBLY64-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
+// WEBASSEMBLY64-NEXT:#define __LITTLE_ENDIAN__ 1
+// WEBASSEMBLY64-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __LP64__ 1
+// WEBASSEMBLY64-NEXT:#define __NO_INLINE__ 1
+// WEBASSEMBLY64-NEXT:#define __ORDER_BIG_ENDIAN__ 4321
+// WEBASSEMBLY64-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234
+// WEBASSEMBLY64-NEXT:#define __ORDER_PDP_ENDIAN__ 3412
+// WEBASSEMBLY64-NEXT:#define __POINTER_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTd__ "ld"
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTi__ "li"
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_TYPE__ long int
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_WIDTH__ 64
 // WEBASSEMBLY64-NOT:#define __REGISTER_PREFIX__
-// WEBASSEMBLY64-NEXT:#define __SCHAR_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __SHRT_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_DOUBLE__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_FLOAT__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT128__ 16{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_LONG__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_POINTER__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_PTRDIFF_T__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_SHORT__ 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_SIZE_T__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_WCHAR_T__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_WINT_T__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTX__ "lX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTo__ "lo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTu__ "lu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTx__ "lx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_MAX__ 18446744073709551615UL{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC_HOSTED__ 0{{$}}
+// WEBASSEMBLY64-NEXT:#define __SCHAR_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __SHRT_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_DOUBLE__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_FLOAT__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT128__ 16
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_LONG__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_POINTER__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_PTRDIFF_T__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_SHORT__ 2
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_SIZE_T__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_WCHAR_T__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_WINT_T__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTX__ "lX"
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTo__ "lo"
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTu__ "lu"
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTx__ "lx"
+// WEBASSEMBLY64-NEXT:#define __SIZE_MAX__ 18446744073709551615UL
+// WEBASSEMBLY64-NEXT:#define __SIZE_TYPE__ long unsigned int
+// WEBASSEMBLY64-NEXT:#define __SIZE_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __STDC_HOSTED__ 0
 // WEBASSEMBLY64-NOT:#define __STDC_MB_MIGHT_NEQ_WC__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_ATOMICS__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_COMPLEX__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_VLA__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_THREADS__
-// WEBASSEMBLY64-NEXT:#define __STDC_UTF_16__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC_UTF_32__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC_VERSION__ 201112L{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_MAX__ 65535{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_C_SUFFIX__ U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTX__ "X"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTo__ "o"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTu__ "u"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTx__ "x"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_MAX__ 255{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTX__ "lX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTo__ "lo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTu__ "lu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTx__ "lx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_MAX__ 18446744073709551615UL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_MAX__ 255{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_MAX__ 255{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY64-NEXT:#define __USER_LABEL_PREFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __VERSION__ "{{.*}}"{{$}}
-// WEBASSEMBLY64-NEXT:#define __WCHAR_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __WCHAR_TYPE__ int{{$}}
+// WEBASSEMBLY64-NEXT:#define __STDC_UTF_16__ 1
+// WEBASSEMBLY64-NEXT:#define __STDC_UTF_32__ 1
+// WEBASSEMBLY64-NEXT:#define __STDC_VERSION__ 201112L
+// WEBASSEMBLY64-NEXT:#define __STDC__ 1
+// WEBASSEMBLY64-NEXT:#define __UINT16_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTX__ "hX"
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTo__ "ho"
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTu__ "hu"
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTx__ "hx"
+// WEBASSEMBLY64-NEXT:#define __UINT16_MAX__ 65535
+// WEBASSEMBLY64-NEXT:#define __UINT16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __UINT32_C_SUFFIX__ U
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTX__ "X"
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTo__ "o"
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTu__ "u"
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTx__ "x"
+// WEBASSEMBLY64-NEXT:#define __UINT32_MAX__ 4294967295U
+// WEBASSEMBLY64-NEXT:#define __UINT32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT64_C_SUFFIX__ ULL
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINT64_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT8_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTX__ "hhX"
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTo__ "hho"
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTu__ "hhu"
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTx__ "hhx"
+// WEBASSEMBLY64-NEXT:#define __UINT8_MAX__ 255
+// WEBASSEMBLY64-NEXT:#define __UINT8_TYPE__ unsigned char
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_C_SUFFIX__ ULL
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTX__ "lX"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTo__ "lo"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTu__ "lu"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTx__ "lx"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_MAX__ 18446744073709551615UL
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_TYPE__ long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTX__ "hX"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTo__ "ho"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTu__ "hu"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTx__ "hx"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_MAX__ 65535
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTX__ "X"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTo__ "o"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTu__ "u"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTx__ "x"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_MAX__ 4294967295U
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTX__ "hhX"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTo__ "hho"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTu__ "hhu"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTx__ "hhx"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_MAX__ 255
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_TYPE__ unsigned char
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTX__ "hX"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTo__ "ho"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTu__ "hu"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTx__ "hx"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_MAX__ 65535
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTX__ "X"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTo__ "o"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTu__ "u"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTx__ "x"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTo__ "hho"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_MAX__ 255
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char
+// WEBASSEMBLY64-NEXT:#define __USER_LABEL_PREFIX__
+// WEBASSEMBLY64-NEXT:#define __VERSION__ "{{.*}}"
+// WEBASSEMBLY64-NEXT:#define __WCHAR_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __WCHAR_TYPE__ int
 // WEBASSEMBLY64-NOT:#define __WCHAR_UNSIGNED__
-// WEBASSEMBLY64-NEXT:#define __WCHAR_WIDTH__ 32{{$}}
-// WEBASSEMBLY64-NEXT:#define __WINT_TYPE__ int{{$}}
+// WEBASSEMBLY64-NEXT:#define __WCHAR_WIDTH__ 32
+// WEBASSEMBLY64-NEXT:#define __WINT_TYPE__ int
 // WEBASSEMBLY64-NOT:#define __WINT_UNSIGNED__
-// WEBASSEMBLY64-NEXT:#define __WINT_WIDTH__ 32{{$}}
-// WEBASSEMBLY64-NEXT:#define __clang__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __clang_major__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __clang_minor__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __clang_patchlevel__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __clang_version__ "{{.*}}"{{$}}
-// WEBASSEMBLY64-NEXT:#define __llvm__ 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __WINT_WIDTH__ 32
+// WEBASSEMBLY64-NEXT:#define __clang__ 1
+// WEBASSEMBLY64-NEXT:#define __clang_major__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __clang_minor__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __clang_patchlevel__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __clang_version__ "{{.*}}"
+// WEBASSEMBLY64-NEXT:#define __llvm__ 1
 // WEBASSEMBLY64-NOT:#define __wasm_simd128__
 // WEBASSEMBLY64-NOT:#define __wasm_simd256__
 // WEBASSEMBLY64-NOT:#define __wasm_simd512__
 // WEBASSEMBLY64-NOT:#define __unix
 // WEBASSEMBLY64-NOT:#define __unix__
-// WEBASSEMBLY64-NEXT:#define __wasm 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __wasm 1
 // WEBASSEMBLY64-NOT:#define __wasm32
 // WEBASSEMBLY64-NOT:#define __wasm32__
-// WEBASSEMBLY64-NEXT:#define __wasm64 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __wasm64__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __wasm__ 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __wasm64 1
+// WEBASSEMBLY64-NEXT:#define __wasm64__ 1
+// WEBASSEMBLY64-NEXT:#define __wasm__ 1
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple i686-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix CYGWIN-X32 %s
+// CYGWIN-X32: #define __USER_LABEL_PREFIX__ _
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple x86_64-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix CYGWIN-X64 %s
+// CYGWIN-X64: #define __USER_LABEL_PREFIX__
+
diff --git a/test/Preprocessor/invalid-__has_warning1.c b/test/Preprocessor/invalid-__has_warning1.c
index b6a0b2e..5e4f12f 100644
--- a/test/Preprocessor/invalid-__has_warning1.c
+++ b/test/Preprocessor/invalid-__has_warning1.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -verify %s
 
 // These must be the last lines in this test.
-// expected-error@+1{{expected string literal}} expected-error@+1 2{{expected}}
+// expected-error@+1{{unterminated}} expected-error@+1 2{{expected}}
 int i = __has_warning(
diff --git a/test/Preprocessor/invalid-__has_warning2.c b/test/Preprocessor/invalid-__has_warning2.c
index 8aba530..f54ff47 100644
--- a/test/Preprocessor/invalid-__has_warning2.c
+++ b/test/Preprocessor/invalid-__has_warning2.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -verify %s
 
 // These must be the last lines in this test.
-// expected-error@+1{{expected string literal}} expected-error@+1{{expected}}
+// expected-error@+1{{too few arguments}}
 int i = __has_warning();
diff --git a/test/Preprocessor/macro_expand.c b/test/Preprocessor/macro_expand.c
index cf98a2c..430068b 100644
--- a/test/Preprocessor/macro_expand.c
+++ b/test/Preprocessor/macro_expand.c
@@ -19,3 +19,9 @@
 // rdar://6880648
 #define f(x,y...) y
 f()
+
+// CHECK: #pragma omp parallel for
+#define FOO parallel
+#define Streaming _Pragma("omp FOO for")
+Streaming
+
diff --git a/test/Preprocessor/microsoft-ext.c b/test/Preprocessor/microsoft-ext.c
index b03f677..cb3cf4f 100644
--- a/test/Preprocessor/microsoft-ext.c
+++ b/test/Preprocessor/microsoft-ext.c
@@ -34,3 +34,12 @@
 
 MAKE_FUNC(MAK, ER, int a, _COMMA, int b);
 // CHECK: void func(int a , int b) {}
+
+#define macro(a, b) (a - b)
+void function(int a);
+#define COMMA_ELIDER(...) \
+  macro(x, __VA_ARGS__); \
+  function(x, __VA_ARGS__);
+COMMA_ELIDER();
+// CHECK: (x - );
+// CHECK: function(x);
diff --git a/test/Preprocessor/pic.c b/test/Preprocessor/pic.c
index 3e649ee..ec8c954 100644
--- a/test/Preprocessor/pic.c
+++ b/test/Preprocessor/pic.c
@@ -19,16 +19,16 @@
 // CHECK-PIC2: #define __pic__ 2
 // CHECK-PIC2-NOT: #define __pie__
 //
-// RUN: %clang_cc1 -pie-level 1 -dM -E -o - %s \
+// RUN: %clang_cc1 -pic-level 1 -pic-is-pie -dM -E -o - %s \
 // RUN:   | FileCheck --check-prefix=CHECK-PIE1 %s
-// CHECK-PIE1-NOT: #define __PIC__
+// CHECK-PIE1: #define __PIC__ 1
 // CHECK-PIE1: #define __PIE__ 1
-// CHECK-PIE1-NOT: #define __pic__
+// CHECK-PIE1: #define __pic__ 1
 // CHECK-PIE1: #define __pie__ 1
 //
-// RUN: %clang_cc1 -pie-level 2 -dM -E -o - %s \
+// RUN: %clang_cc1 -pic-level 2 -pic-is-pie -dM -E -o - %s \
 // RUN:   | FileCheck --check-prefix=CHECK-PIE2 %s
-// CHECK-PIE2-NOT: #define __PIC__
+// CHECK-PIE2: #define __PIC__ 2
 // CHECK-PIE2: #define __PIE__ 2
-// CHECK-PIE2-NOT: #define __pic__
+// CHECK-PIE2: #define __pic__ 2
 // CHECK-PIE2: #define __pie__ 2
diff --git a/test/Preprocessor/pragma_diagnostic.c b/test/Preprocessor/pragma_diagnostic.c
index e8a67ab..3970dbb 100644
--- a/test/Preprocessor/pragma_diagnostic.c
+++ b/test/Preprocessor/pragma_diagnostic.c
@@ -30,3 +30,18 @@
 
 #pragma GCC diagnostic error "-Winvalid-name"  // expected-warning {{unknown warning group '-Winvalid-name', ignored}}
 
+
+// Testing pragma clang diagnostic with -Weverything
+void ppo(){} // First test that we do not diagnose on this.
+
+#pragma clang diagnostic warning "-Weverything"
+void ppp(){} // expected-warning {{no previous prototype for function 'ppp'}}
+
+#pragma clang diagnostic ignored "-Weverything" // Reset it.
+void ppq(){}
+
+#pragma clang diagnostic error "-Weverything" // Now set to error
+void ppr(){} // expected-error {{no previous prototype for function 'ppr'}}
+
+#pragma clang diagnostic warning "-Weverything" // This should not be effective
+void pps(){} // expected-error {{no previous prototype for function 'pps'}}
diff --git a/test/Preprocessor/predefined-arch-macros.c b/test/Preprocessor/predefined-arch-macros.c
index 66a96e4..1149678 100644
--- a/test/Preprocessor/predefined-arch-macros.c
+++ b/test/Preprocessor/predefined-arch-macros.c
@@ -2,19 +2,19 @@
 //
 // RUN: %clang -march=i386 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I386_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I386_M32
 // CHECK_I386_M32: #define __i386 1
 // CHECK_I386_M32: #define __i386__ 1
 // CHECK_I386_M32: #define __tune_i386__ 1
 // CHECK_I386_M32: #define i386 1
 // RUN: not %clang -march=i386 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I386_M64
-// CHECK_I386_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I386_M64
+// CHECK_I386_M64: error: {{.*}}
 //
 // RUN: %clang -march=i486 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I486_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I486_M32
 // CHECK_I486_M32: #define __i386 1
 // CHECK_I486_M32: #define __i386__ 1
 // CHECK_I486_M32: #define __i486 1
@@ -23,12 +23,12 @@
 // CHECK_I486_M32: #define i386 1
 // RUN: not %clang -march=i486 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I486_M64
-// CHECK_I486_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I486_M64
+// CHECK_I486_M64: error: {{.*}}
 //
 // RUN: %clang -march=i586 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I586_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I586_M32
 // CHECK_I586_M32: #define __i386 1
 // CHECK_I586_M32: #define __i386__ 1
 // CHECK_I586_M32: #define __i586 1
@@ -40,12 +40,12 @@
 // CHECK_I586_M32: #define i386 1
 // RUN: not %clang -march=i586 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I586_M64
-// CHECK_I586_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I586_M64
+// CHECK_I586_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M32
 // CHECK_PENTIUM_M32: #define __i386 1
 // CHECK_PENTIUM_M32: #define __i386__ 1
 // CHECK_PENTIUM_M32: #define __i586 1
@@ -57,12 +57,12 @@
 // CHECK_PENTIUM_M32: #define i386 1
 // RUN: not %clang -march=pentium -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M64
-// CHECK_PENTIUM_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M64
+// CHECK_PENTIUM_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium-mmx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_MMX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_MMX_M32
 // CHECK_PENTIUM_MMX_M32: #define __MMX__ 1
 // CHECK_PENTIUM_MMX_M32: #define __i386 1
 // CHECK_PENTIUM_MMX_M32: #define __i386__ 1
@@ -77,12 +77,12 @@
 // CHECK_PENTIUM_MMX_M32: #define i386 1
 // RUN: not %clang -march=pentium-mmx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_MMX_M64
-// CHECK_PENTIUM_MMX_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_MMX_M64
+// CHECK_PENTIUM_MMX_M64: error: {{.*}}
 //
 // RUN: %clang -march=winchip-c6 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP_C6_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP_C6_M32
 // CHECK_WINCHIP_C6_M32: #define __MMX__ 1
 // CHECK_WINCHIP_C6_M32: #define __i386 1
 // CHECK_WINCHIP_C6_M32: #define __i386__ 1
@@ -92,12 +92,12 @@
 // CHECK_WINCHIP_C6_M32: #define i386 1
 // RUN: not %clang -march=winchip-c6 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP_C6_M64
-// CHECK_WINCHIP_C6_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP_C6_M64
+// CHECK_WINCHIP_C6_M64: error: {{.*}}
 //
 // RUN: %clang -march=winchip2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP2_M32
 // CHECK_WINCHIP2_M32: #define __3dNOW__ 1
 // CHECK_WINCHIP2_M32: #define __MMX__ 1
 // CHECK_WINCHIP2_M32: #define __i386 1
@@ -108,12 +108,12 @@
 // CHECK_WINCHIP2_M32: #define i386 1
 // RUN: not %clang -march=winchip2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP2_M64
-// CHECK_WINCHIP2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP2_M64
+// CHECK_WINCHIP2_M64: error: {{.*}}
 //
 // RUN: %clang -march=c3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_M32
 // CHECK_C3_M32: #define __3dNOW__ 1
 // CHECK_C3_M32: #define __MMX__ 1
 // CHECK_C3_M32: #define __i386 1
@@ -124,12 +124,12 @@
 // CHECK_C3_M32: #define i386 1
 // RUN: not %clang -march=c3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_M64
-// CHECK_C3_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_M64
+// CHECK_C3_M64: error: {{.*}}
 //
 // RUN: %clang -march=c3-2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_2_M32
 // CHECK_C3_2_M32: #define __MMX__ 1
 // CHECK_C3_2_M32: #define __SSE__ 1
 // CHECK_C3_2_M32: #define __i386 1
@@ -144,12 +144,12 @@
 // CHECK_C3_2_M32: #define i386 1
 // RUN: not %clang -march=c3-2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_2_M64
-// CHECK_C3_2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_2_M64
+// CHECK_C3_2_M64: error: {{.*}}
 //
 // RUN: %clang -march=i686 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I686_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I686_M32
 // CHECK_I686_M32: #define __i386 1
 // CHECK_I686_M32: #define __i386__ 1
 // CHECK_I686_M32: #define __i686 1
@@ -159,12 +159,12 @@
 // CHECK_I686_M32: #define i386 1
 // RUN: not %clang -march=i686 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I686_M64
-// CHECK_I686_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I686_M64
+// CHECK_I686_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentiumpro -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUMPRO_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUMPRO_M32
 // CHECK_PENTIUMPRO_M32: #define __i386 1
 // CHECK_PENTIUMPRO_M32: #define __i386__ 1
 // CHECK_PENTIUMPRO_M32: #define __i686 1
@@ -176,12 +176,12 @@
 // CHECK_PENTIUMPRO_M32: #define i386 1
 // RUN: not %clang -march=pentiumpro -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUMPRO_M64
-// CHECK_PENTIUMPRO_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUMPRO_M64
+// CHECK_PENTIUMPRO_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM2_M32
 // CHECK_PENTIUM2_M32: #define __MMX__ 1
 // CHECK_PENTIUM2_M32: #define __i386 1
 // CHECK_PENTIUM2_M32: #define __i386__ 1
@@ -195,12 +195,12 @@
 // CHECK_PENTIUM2_M32: #define i386 1
 // RUN: not %clang -march=pentium2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM2_M64
-// CHECK_PENTIUM2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM2_M64
+// CHECK_PENTIUM2_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3_M32
 // CHECK_PENTIUM3_M32: #define __MMX__ 1
 // CHECK_PENTIUM3_M32: #define __SSE__ 1
 // CHECK_PENTIUM3_M32: #define __i386 1
@@ -216,12 +216,12 @@
 // CHECK_PENTIUM3_M32: #define i386 1
 // RUN: not %clang -march=pentium3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3_M64
-// CHECK_PENTIUM3_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3_M64
+// CHECK_PENTIUM3_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium3m -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3M_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3M_M32
 // CHECK_PENTIUM3M_M32: #define __MMX__ 1
 // CHECK_PENTIUM3M_M32: #define __SSE__ 1
 // CHECK_PENTIUM3M_M32: #define __i386 1
@@ -235,12 +235,12 @@
 // CHECK_PENTIUM3M_M32: #define i386 1
 // RUN: not %clang -march=pentium3m -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3M_M64
-// CHECK_PENTIUM3M_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3M_M64
+// CHECK_PENTIUM3M_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium-m -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M_M32
 // CHECK_PENTIUM_M_M32: #define __MMX__ 1
 // CHECK_PENTIUM_M_M32: #define __SSE2__ 1
 // CHECK_PENTIUM_M_M32: #define __SSE__ 1
@@ -255,12 +255,12 @@
 // CHECK_PENTIUM_M_M32: #define i386 1
 // RUN: not %clang -march=pentium-m -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M_M64
-// CHECK_PENTIUM_M_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M_M64
+// CHECK_PENTIUM_M_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4_M32
 // CHECK_PENTIUM4_M32: #define __MMX__ 1
 // CHECK_PENTIUM4_M32: #define __SSE2__ 1
 // CHECK_PENTIUM4_M32: #define __SSE__ 1
@@ -272,12 +272,12 @@
 // CHECK_PENTIUM4_M32: #define i386 1
 // RUN: not %clang -march=pentium4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4_M64
-// CHECK_PENTIUM4_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4_M64
+// CHECK_PENTIUM4_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium4m -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4M_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4M_M32
 // CHECK_PENTIUM4M_M32: #define __MMX__ 1
 // CHECK_PENTIUM4M_M32: #define __SSE2__ 1
 // CHECK_PENTIUM4M_M32: #define __SSE__ 1
@@ -289,12 +289,12 @@
 // CHECK_PENTIUM4M_M32: #define i386 1
 // RUN: not %clang -march=pentium4m -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4M_M64
-// CHECK_PENTIUM4M_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4M_M64
+// CHECK_PENTIUM4M_M64: error: {{.*}}
 //
 // RUN: %clang -march=prescott -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PRESCOTT_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PRESCOTT_M32
 // CHECK_PRESCOTT_M32: #define __MMX__ 1
 // CHECK_PRESCOTT_M32: #define __SSE2__ 1
 // CHECK_PRESCOTT_M32: #define __SSE3__ 1
@@ -307,12 +307,12 @@
 // CHECK_PRESCOTT_M32: #define i386 1
 // RUN: not %clang -march=prescott -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PRESCOTT_M64
-// CHECK_PRESCOTT_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PRESCOTT_M64
+// CHECK_PRESCOTT_M64: error: {{.*}}
 //
 // RUN: %clang -march=nocona -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_NOCONA_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_NOCONA_M32
 // CHECK_NOCONA_M32: #define __MMX__ 1
 // CHECK_NOCONA_M32: #define __SSE2__ 1
 // CHECK_NOCONA_M32: #define __SSE3__ 1
@@ -325,7 +325,7 @@
 // CHECK_NOCONA_M32: #define i386 1
 // RUN: %clang -march=nocona -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_NOCONA_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_NOCONA_M64
 // CHECK_NOCONA_M64: #define __MMX__ 1
 // CHECK_NOCONA_M64: #define __SSE2_MATH__ 1
 // CHECK_NOCONA_M64: #define __SSE2__ 1
@@ -342,7 +342,7 @@
 //
 // RUN: %clang -march=core2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE2_M32
 // CHECK_CORE2_M32: #define __MMX__ 1
 // CHECK_CORE2_M32: #define __SSE2__ 1
 // CHECK_CORE2_M32: #define __SSE3__ 1
@@ -356,7 +356,7 @@
 // CHECK_CORE2_M32: #define i386 1
 // RUN: %clang -march=core2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE2_M64
 // CHECK_CORE2_M64: #define __MMX__ 1
 // CHECK_CORE2_M64: #define __SSE2_MATH__ 1
 // CHECK_CORE2_M64: #define __SSE2__ 1
@@ -374,7 +374,7 @@
 //
 // RUN: %clang -march=corei7 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_M32
 // CHECK_COREI7_M32: #define __MMX__ 1
 // CHECK_COREI7_M32: #define __POPCNT__ 1
 // CHECK_COREI7_M32: #define __SSE2__ 1
@@ -391,7 +391,7 @@
 // CHECK_COREI7_M32: #define i386 1
 // RUN: %clang -march=corei7 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_M64
 // CHECK_COREI7_M64: #define __MMX__ 1
 // CHECK_COREI7_M64: #define __POPCNT__ 1
 // CHECK_COREI7_M64: #define __SSE2_MATH__ 1
@@ -412,7 +412,7 @@
 //
 // RUN: %clang -march=corei7-avx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_AVX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_AVX_M32
 // CHECK_COREI7_AVX_M32: #define __AES__ 1
 // CHECK_COREI7_AVX_M32: #define __AVX__ 1
 // CHECK_COREI7_AVX_M32: #define __MMX__ 1
@@ -435,7 +435,7 @@
 // CHECK_COREI7_AVX_M32: #define i386 1
 // RUN: %clang -march=corei7-avx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_AVX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_AVX_M64
 // CHECK_COREI7_AVX_M64: #define __AES__ 1
 // CHECK_COREI7_AVX_M64: #define __AVX__ 1
 // CHECK_COREI7_AVX_M64: #define __MMX__ 1
@@ -462,7 +462,7 @@
 //
 // RUN: %clang -march=core-avx-i -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX_I_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX_I_M32
 // CHECK_CORE_AVX_I_M32: #define __AES__ 1
 // CHECK_CORE_AVX_I_M32: #define __AVX__ 1
 // CHECK_CORE_AVX_I_M32: #define __F16C__ 1
@@ -485,7 +485,7 @@
 // CHECK_CORE_AVX_I_M32: #define i386 1
 // RUN: %clang -march=core-avx-i -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX_I_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX_I_M64
 // CHECK_CORE_AVX_I_M64: #define __AES__ 1
 // CHECK_CORE_AVX_I_M64: #define __AVX__ 1
 // CHECK_CORE_AVX_I_M64: #define __F16C__ 1
@@ -512,7 +512,7 @@
 //
 // RUN: %clang -march=core-avx2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX2_M32
 // CHECK_CORE_AVX2_M32: #define __AES__ 1
 // CHECK_CORE_AVX2_M32: #define __AVX2__ 1
 // CHECK_CORE_AVX2_M32: #define __AVX__ 1
@@ -542,7 +542,7 @@
 // CHECK_CORE_AVX2_M32: #define i386 1
 // RUN: %clang -march=core-avx2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX2_M64
 // CHECK_CORE_AVX2_M64: #define __AES__ 1
 // CHECK_CORE_AVX2_M64: #define __AVX2__ 1
 // CHECK_CORE_AVX2_M64: #define __AVX__ 1
@@ -576,7 +576,7 @@
 //
 // RUN: %clang -march=broadwell -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BROADWELL_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BROADWELL_M32
 // CHECK_BROADWELL_M32: #define __ADX__ 1
 // CHECK_BROADWELL_M32: #define __AES__ 1
 // CHECK_BROADWELL_M32: #define __AVX2__ 1
@@ -608,7 +608,7 @@
 // CHECK_BROADWELL_M32: #define i386 1
 // RUN: %clang -march=broadwell -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BROADWELL_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BROADWELL_M64
 // CHECK_BROADWELL_M64: #define __ADX__ 1
 // CHECK_BROADWELL_M64: #define __AES__ 1
 // CHECK_BROADWELL_M64: #define __AVX2__ 1
@@ -642,9 +642,74 @@
 // CHECK_BROADWELL_M64: #define __x86_64 1
 // CHECK_BROADWELL_M64: #define __x86_64__ 1
 //
+// RUN: %clang -march=skylake -m32 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKL_M32
+// CHECK_SKL_M32: #define __ADX__ 1
+// CHECK_SKL_M32: #define __AES__ 1
+// CHECK_SKL_M32: #define __AVX2__ 1
+// CHECK_SKL_M32: #define __AVX__ 1
+// CHECK_SKL_M32: #define __BMI2__ 1
+// CHECK_SKL_M32: #define __BMI__ 1
+// CHECK_SKL_M32: #define __F16C__ 1
+// CHECK_SKL_M32: #define __FMA__ 1
+// CHECK_SKL_M32: #define __LZCNT__ 1
+// CHECK_SKL_M32: #define __MMX__ 1
+// CHECK_SKL_M32: #define __PCLMUL__ 1
+// CHECK_SKL_M32: #define __POPCNT__ 1
+// CHECK_SKL_M32: #define __RDRND__ 1
+// CHECK_SKL_M32: #define __RDSEED__ 1
+// CHECK_SKL_M32: #define __RTM__ 1
+// CHECK_SKL_M32: #define __SSE2__ 1
+// CHECK_SKL_M32: #define __SSE3__ 1
+// CHECK_SKL_M32: #define __SSE4_1__ 1
+// CHECK_SKL_M32: #define __SSE4_2__ 1
+// CHECK_SKL_M32: #define __SSE__ 1
+// CHECK_SKL_M32: #define __SSSE3__ 1
+// CHECK_SKL_M32: #define __XSAVEC__ 1
+// CHECK_SKL_M32: #define __XSAVEOPT__ 1
+// CHECK_SKL_M32: #define __XSAVES__ 1
+// CHECK_SKL_M32: #define __XSAVE__ 1
+// CHECK_SKL_M32: #define i386 1
+
+// RUN: %clang -march=skylake -m64 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKL_M64
+// CHECK_SKL_M64: #define __ADX__ 1
+// CHECK_SKL_M64: #define __AES__ 1
+// CHECK_SKL_M64: #define __AVX2__ 1
+// CHECK_SKL_M64: #define __AVX__ 1
+// CHECK_SKL_M64: #define __BMI2__ 1
+// CHECK_SKL_M64: #define __BMI__ 1
+// CHECK_SKL_M64: #define __F16C__ 1
+// CHECK_SKL_M64: #define __FMA__ 1
+// CHECK_SKL_M64: #define __LZCNT__ 1
+// CHECK_SKL_M64: #define __MMX__ 1
+// CHECK_SKL_M64: #define __PCLMUL__ 1
+// CHECK_SKL_M64: #define __POPCNT__ 1
+// CHECK_SKL_M64: #define __RDRND__ 1
+// CHECK_SKL_M64: #define __RDSEED__ 1
+// CHECK_SKL_M64: #define __RTM__ 1
+// CHECK_SKL_M64: #define __SSE2_MATH__ 1
+// CHECK_SKL_M64: #define __SSE2__ 1
+// CHECK_SKL_M64: #define __SSE3__ 1
+// CHECK_SKL_M64: #define __SSE4_1__ 1
+// CHECK_SKL_M64: #define __SSE4_2__ 1
+// CHECK_SKL_M64: #define __SSE_MATH__ 1
+// CHECK_SKL_M64: #define __SSE__ 1
+// CHECK_SKL_M64: #define __SSSE3__ 1
+// CHECK_SKL_M64: #define __XSAVEC__ 1
+// CHECK_SKL_M64: #define __XSAVEOPT__ 1
+// CHECK_SKL_M64: #define __XSAVES__ 1
+// CHECK_SKL_M64: #define __XSAVE__ 1
+// CHECK_SKL_M64: #define __amd64 1
+// CHECK_SKL_M64: #define __amd64__ 1
+// CHECK_SKL_M64: #define __x86_64 1
+// CHECK_SKL_M64: #define __x86_64__ 1
+
 // RUN: %clang -march=knl -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_KNL_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_KNL_M32
 // CHECK_KNL_M32: #define __AES__ 1
 // CHECK_KNL_M32: #define __AVX2__ 1
 // CHECK_KNL_M32: #define __AVX512CD__ 1
@@ -679,7 +744,7 @@
 
 // RUN: %clang -march=knl -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_KNL_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_KNL_M64
 // CHECK_KNL_M64: #define __AES__ 1
 // CHECK_KNL_M64: #define __AVX2__ 1
 // CHECK_KNL_M64: #define __AVX512CD__ 1
@@ -715,9 +780,9 @@
 // CHECK_KNL_M64: #define __x86_64 1
 // CHECK_KNL_M64: #define __x86_64__ 1
 //
-// RUN: %clang -march=skx -m32 -E -dM %s -o - 2>&1 \
+// RUN: %clang -march=skylake-avx512 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SKX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKX_M32
 // CHECK_SKX_M32: #define __AES__ 1
 // CHECK_SKX_M32: #define __AVX2__ 1
 // CHECK_SKX_M32: #define __AVX512BW__ 1
@@ -753,9 +818,9 @@
 // CHECK_SKX_M32: #define __tune_skx__ 1
 // CHECK_SKX_M32: #define i386 1
 
-// RUN: %clang -march=skx -m64 -E -dM %s -o - 2>&1 \
+// RUN: %clang -march=skylake-avx512 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SKX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKX_M64
 // CHECK_SKX_M64: #define __AES__ 1
 // CHECK_SKX_M64: #define __AVX2__ 1
 // CHECK_SKX_M64: #define __AVX512BW__ 1
@@ -794,9 +859,86 @@
 // CHECK_SKX_M64: #define __x86_64 1
 // CHECK_SKX_M64: #define __x86_64__ 1
 //
+// RUN: %clang -march=cannonlake -m32 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CNL_M32
+// CHECK_CNL_M32: #define __AES__ 1
+// CHECK_CNL_M32: #define __AVX2__ 1
+// CHECK_CNL_M32: #define __AVX512BW__ 1
+// CHECK_CNL_M32: #define __AVX512CD__ 1
+// CHECK_CNL_M32: #define __AVX512DQ__ 1
+// CHECK_CNL_M32: #define __AVX512F__ 1
+// CHECK_CNL_M32: #define __AVX512IFMA__ 1
+// CHECK_CNL_M32: #define __AVX512VBMI__ 1
+// CHECK_CNL_M32: #define __AVX512VL__ 1
+// CHECK_CNL_M32: #define __AVX__ 1
+// CHECK_CNL_M32: #define __BMI2__ 1
+// CHECK_CNL_M32: #define __BMI__ 1
+// CHECK_CNL_M32: #define __F16C__ 1
+// CHECK_CNL_M32: #define __FMA__ 1
+// CHECK_CNL_M32: #define __LZCNT__ 1
+// CHECK_CNL_M32: #define __MMX__ 1
+// CHECK_CNL_M32: #define __PCLMUL__ 1
+// CHECK_CNL_M32: #define __POPCNT__ 1
+// CHECK_CNL_M32: #define __RDRND__ 1
+// CHECK_CNL_M32: #define __RTM__ 1
+// CHECK_CNL_M32: #define __SHA__ 1
+// CHECK_CNL_M32: #define __SSE2__ 1
+// CHECK_CNL_M32: #define __SSE3__ 1
+// CHECK_CNL_M32: #define __SSE4_1__ 1
+// CHECK_CNL_M32: #define __SSE4_2__ 1
+// CHECK_CNL_M32: #define __SSE__ 1
+// CHECK_CNL_M32: #define __SSSE3__ 1
+// CHECK_CNL_M32: #define __XSAVEC__ 1
+// CHECK_CNL_M32: #define __XSAVEOPT__ 1
+// CHECK_CNL_M32: #define __XSAVES__ 1
+// CHECK_CNL_M32: #define __XSAVE__ 1
+// CHECK_CNL_M32: #define __i386 1
+// CHECK_CNL_M32: #define __i386__ 1
+// CHECK_CNL_M32: #define i386 1
+//
+// RUN: %clang -march=cannonlake -m64 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CNL_M64
+// CHECK_CNL_M64: #define __AES__ 1
+// CHECK_CNL_M64: #define __AVX2__ 1
+// CHECK_CNL_M64: #define __AVX512BW__ 1
+// CHECK_CNL_M64: #define __AVX512CD__ 1
+// CHECK_CNL_M64: #define __AVX512DQ__ 1
+// CHECK_CNL_M64: #define __AVX512F__ 1
+// CHECK_CNL_M64: #define __AVX512IFMA__ 1
+// CHECK_CNL_M64: #define __AVX512VBMI__ 1
+// CHECK_CNL_M64: #define __AVX512VL__ 1
+// CHECK_CNL_M64: #define __AVX__ 1
+// CHECK_CNL_M64: #define __BMI2__ 1
+// CHECK_CNL_M64: #define __BMI__ 1
+// CHECK_CNL_M64: #define __F16C__ 1
+// CHECK_CNL_M64: #define __FMA__ 1
+// CHECK_CNL_M64: #define __LZCNT__ 1
+// CHECK_CNL_M64: #define __MMX__ 1
+// CHECK_CNL_M64: #define __PCLMUL__ 1
+// CHECK_CNL_M64: #define __POPCNT__ 1
+// CHECK_CNL_M64: #define __RDRND__ 1
+// CHECK_CNL_M64: #define __RTM__ 1
+// CHECK_CNL_M64: #define __SHA__ 1
+// CHECK_CNL_M64: #define __SSE2__ 1
+// CHECK_CNL_M64: #define __SSE3__ 1
+// CHECK_CNL_M64: #define __SSE4_1__ 1
+// CHECK_CNL_M64: #define __SSE4_2__ 1
+// CHECK_CNL_M64: #define __SSE__ 1
+// CHECK_CNL_M64: #define __SSSE3__ 1
+// CHECK_CNL_M64: #define __XSAVEC__ 1
+// CHECK_CNL_M64: #define __XSAVEOPT__ 1
+// CHECK_CNL_M64: #define __XSAVES__ 1
+// CHECK_CNL_M64: #define __XSAVE__ 1
+// CHECK_CNL_M64: #define __amd64 1
+// CHECK_CNL_M64: #define __amd64__ 1
+// CHECK_CNL_M64: #define __x86_64 1
+// CHECK_CNL_M64: #define __x86_64__ 1
+
 // RUN: %clang -march=atom -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATOM_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M32
 // CHECK_ATOM_M32: #define __MMX__ 1
 // CHECK_ATOM_M32: #define __SSE2__ 1
 // CHECK_ATOM_M32: #define __SSE3__ 1
@@ -810,7 +952,7 @@
 // CHECK_ATOM_M32: #define i386 1
 // RUN: %clang -march=atom -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATOM_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M64
 // CHECK_ATOM_M64: #define __MMX__ 1
 // CHECK_ATOM_M64: #define __SSE2_MATH__ 1
 // CHECK_ATOM_M64: #define __SSE2__ 1
@@ -828,7 +970,7 @@
 //
 // RUN: %clang -march=slm -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SLM_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SLM_M32
 // CHECK_SLM_M32: #define __MMX__ 1
 // CHECK_SLM_M32: #define __SSE2__ 1
 // CHECK_SLM_M32: #define __SSE3__ 1
@@ -844,7 +986,7 @@
 // CHECK_SLM_M32: #define i386 1
 // RUN: %clang -march=slm -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SLM_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SLM_M64
 // CHECK_SLM_M64: #define __MMX__ 1
 // CHECK_SLM_M64: #define __SSE2_MATH__ 1
 // CHECK_SLM_M64: #define __SSE2__ 1
@@ -862,9 +1004,21 @@
 // CHECK_SLM_M64: #define __x86_64 1
 // CHECK_SLM_M64: #define __x86_64__ 1
 //
+// RUN: %clang -march=lakemont -m32 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck %s -check-prefix=CHECK_LMT_M32
+// CHECK_LMT_M32: #define __i386 1
+// CHECK_LMT_M32: #define __i386__ 1
+// CHECK_LMT_M32: #define __tune_lakemont__ 1
+// CHECK_LMT_M32: #define i386 1
+// RUN: not %clang -march=lakemont -m64 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck %s -check-prefix=CHECK_LMT_M64
+// CHECK_LMT_M64: error:
+//
 // RUN: %clang -march=geode -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_GEODE_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_GEODE_M32
 // CHECK_GEODE_M32: #define __3dNOW_A__ 1
 // CHECK_GEODE_M32: #define __3dNOW__ 1
 // CHECK_GEODE_M32: #define __MMX__ 1
@@ -876,12 +1030,12 @@
 // CHECK_GEODE_M32: #define i386 1
 // RUN: not %clang -march=geode -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_GEODE_M64
-// CHECK_GEODE_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_GEODE_M64
+// CHECK_GEODE_M64: error: {{.*}}
 //
 // RUN: %clang -march=k6 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_M32
 // CHECK_K6_M32: #define __MMX__ 1
 // CHECK_K6_M32: #define __i386 1
 // CHECK_K6_M32: #define __i386__ 1
@@ -891,12 +1045,12 @@
 // CHECK_K6_M32: #define i386 1
 // RUN: not %clang -march=k6 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_M64
-// CHECK_K6_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_M64
+// CHECK_K6_M64: error: {{.*}}
 //
 // RUN: %clang -march=k6-2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_2_M32
 // CHECK_K6_2_M32: #define __3dNOW__ 1
 // CHECK_K6_2_M32: #define __MMX__ 1
 // CHECK_K6_2_M32: #define __i386 1
@@ -909,12 +1063,12 @@
 // CHECK_K6_2_M32: #define i386 1
 // RUN: not %clang -march=k6-2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_2_M64
-// CHECK_K6_2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_2_M64
+// CHECK_K6_2_M64: error: {{.*}}
 //
 // RUN: %clang -march=k6-3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_3_M32
 // CHECK_K6_3_M32: #define __3dNOW__ 1
 // CHECK_K6_3_M32: #define __MMX__ 1
 // CHECK_K6_3_M32: #define __i386 1
@@ -927,12 +1081,12 @@
 // CHECK_K6_3_M32: #define i386 1
 // RUN: not %clang -march=k6-3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_3_M64
-// CHECK_K6_3_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_3_M64
+// CHECK_K6_3_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_M32
 // CHECK_ATHLON_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_M32: #define __MMX__ 1
@@ -944,12 +1098,12 @@
 // CHECK_ATHLON_M32: #define i386 1
 // RUN: not %clang -march=athlon -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_M64
-// CHECK_ATHLON_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_M64
+// CHECK_ATHLON_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-tbird -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_TBIRD_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_TBIRD_M32
 // CHECK_ATHLON_TBIRD_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_TBIRD_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_TBIRD_M32: #define __MMX__ 1
@@ -961,12 +1115,12 @@
 // CHECK_ATHLON_TBIRD_M32: #define i386 1
 // RUN: not %clang -march=athlon-tbird -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_TBIRD_M64
-// CHECK_ATHLON_TBIRD_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_TBIRD_M64
+// CHECK_ATHLON_TBIRD_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_4_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_4_M32
 // CHECK_ATHLON_4_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_4_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_4_M32: #define __MMX__ 1
@@ -981,12 +1135,12 @@
 // CHECK_ATHLON_4_M32: #define i386 1
 // RUN: not %clang -march=athlon-4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_4_M64
-// CHECK_ATHLON_4_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_4_M64
+// CHECK_ATHLON_4_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-xp -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_XP_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_XP_M32
 // CHECK_ATHLON_XP_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_XP_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_XP_M32: #define __MMX__ 1
@@ -1001,12 +1155,12 @@
 // CHECK_ATHLON_XP_M32: #define i386 1
 // RUN: not %clang -march=athlon-xp -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_XP_M64
-// CHECK_ATHLON_XP_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_XP_M64
+// CHECK_ATHLON_XP_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-mp -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_MP_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_MP_M32
 // CHECK_ATHLON_MP_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_MP_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_MP_M32: #define __MMX__ 1
@@ -1021,12 +1175,12 @@
 // CHECK_ATHLON_MP_M32: #define i386 1
 // RUN: not %clang -march=athlon-mp -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_MP_M64
-// CHECK_ATHLON_MP_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_MP_M64
+// CHECK_ATHLON_MP_M64: error: {{.*}}
 //
 // RUN: %clang -march=x86-64 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_X86_64_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_X86_64_M32
 // CHECK_X86_64_M32: #define __MMX__ 1
 // CHECK_X86_64_M32: #define __SSE2__ 1
 // CHECK_X86_64_M32: #define __SSE__ 1
@@ -1037,7 +1191,7 @@
 // CHECK_X86_64_M32: #define i386 1
 // RUN: %clang -march=x86-64 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_X86_64_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_X86_64_M64
 // CHECK_X86_64_M64: #define __MMX__ 1
 // CHECK_X86_64_M64: #define __SSE2_MATH__ 1
 // CHECK_X86_64_M64: #define __SSE2__ 1
@@ -1052,7 +1206,7 @@
 //
 // RUN: %clang -march=k8 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M32
 // CHECK_K8_M32: #define __3dNOW_A__ 1
 // CHECK_K8_M32: #define __3dNOW__ 1
 // CHECK_K8_M32: #define __MMX__ 1
@@ -1066,7 +1220,7 @@
 // CHECK_K8_M32: #define i386 1
 // RUN: %clang -march=k8 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M64
 // CHECK_K8_M64: #define __3dNOW_A__ 1
 // CHECK_K8_M64: #define __3dNOW__ 1
 // CHECK_K8_M64: #define __MMX__ 1
@@ -1084,7 +1238,7 @@
 //
 // RUN: %clang -march=k8-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_SSE3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M32
 // CHECK_K8_SSE3_M32: #define __3dNOW_A__ 1
 // CHECK_K8_SSE3_M32: #define __3dNOW__ 1
 // CHECK_K8_SSE3_M32: #define __MMX__ 1
@@ -1099,7 +1253,7 @@
 // CHECK_K8_SSE3_M32: #define i386 1
 // RUN: %clang -march=k8-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_SSE3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M64
 // CHECK_K8_SSE3_M64: #define __3dNOW_A__ 1
 // CHECK_K8_SSE3_M64: #define __3dNOW__ 1
 // CHECK_K8_SSE3_M64: #define __MMX__ 1
@@ -1118,7 +1272,7 @@
 //
 // RUN: %clang -march=opteron -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M32
 // CHECK_OPTERON_M32: #define __3dNOW_A__ 1
 // CHECK_OPTERON_M32: #define __3dNOW__ 1
 // CHECK_OPTERON_M32: #define __MMX__ 1
@@ -1132,7 +1286,7 @@
 // CHECK_OPTERON_M32: #define i386 1
 // RUN: %clang -march=opteron -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M64
 // CHECK_OPTERON_M64: #define __3dNOW_A__ 1
 // CHECK_OPTERON_M64: #define __3dNOW__ 1
 // CHECK_OPTERON_M64: #define __MMX__ 1
@@ -1150,7 +1304,7 @@
 //
 // RUN: %clang -march=opteron-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_SSE3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M32
 // CHECK_OPTERON_SSE3_M32: #define __3dNOW_A__ 1
 // CHECK_OPTERON_SSE3_M32: #define __3dNOW__ 1
 // CHECK_OPTERON_SSE3_M32: #define __MMX__ 1
@@ -1165,7 +1319,7 @@
 // CHECK_OPTERON_SSE3_M32: #define i386 1
 // RUN: %clang -march=opteron-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_SSE3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M64
 // CHECK_OPTERON_SSE3_M64: #define __3dNOW_A__ 1
 // CHECK_OPTERON_SSE3_M64: #define __3dNOW__ 1
 // CHECK_OPTERON_SSE3_M64: #define __MMX__ 1
@@ -1184,7 +1338,7 @@
 //
 // RUN: %clang -march=athlon64 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M32
 // CHECK_ATHLON64_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_M32: #define __3dNOW__ 1
 // CHECK_ATHLON64_M32: #define __MMX__ 1
@@ -1198,7 +1352,7 @@
 // CHECK_ATHLON64_M32: #define i386 1
 // RUN: %clang -march=athlon64 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M64
 // CHECK_ATHLON64_M64: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_M64: #define __3dNOW__ 1
 // CHECK_ATHLON64_M64: #define __MMX__ 1
@@ -1216,7 +1370,7 @@
 //
 // RUN: %clang -march=athlon64-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_SSE3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M32
 // CHECK_ATHLON64_SSE3_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __3dNOW__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __MMX__ 1
@@ -1231,7 +1385,7 @@
 // CHECK_ATHLON64_SSE3_M32: #define i386 1
 // RUN: %clang -march=athlon64-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_SSE3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M64
 // CHECK_ATHLON64_SSE3_M64: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __3dNOW__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __MMX__ 1
@@ -1250,7 +1404,7 @@
 //
 // RUN: %clang -march=athlon-fx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_FX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M32
 // CHECK_ATHLON_FX_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_FX_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_FX_M32: #define __MMX__ 1
@@ -1264,7 +1418,7 @@
 // CHECK_ATHLON_FX_M32: #define i386 1
 // RUN: %clang -march=athlon-fx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_FX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M64
 // CHECK_ATHLON_FX_M64: #define __3dNOW_A__ 1
 // CHECK_ATHLON_FX_M64: #define __3dNOW__ 1
 // CHECK_ATHLON_FX_M64: #define __MMX__ 1
@@ -1281,7 +1435,7 @@
 // CHECK_ATHLON_FX_M64: #define __x86_64__ 1
 // RUN: %clang -march=amdfam10 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_AMDFAM10_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M32
 // CHECK_AMDFAM10_M32: #define __3dNOW_A__ 1
 // CHECK_AMDFAM10_M32: #define __3dNOW__ 1
 // CHECK_AMDFAM10_M32: #define __LZCNT__ 1
@@ -1300,7 +1454,7 @@
 // CHECK_AMDFAM10_M32: #define __tune_amdfam10__ 1
 // RUN: %clang -march=amdfam10 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_AMDFAM10_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64
 // CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1
 // CHECK_AMDFAM10_M64: #define __3dNOW__ 1
 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1
@@ -1321,7 +1475,7 @@
 // CHECK_AMDFAM10_M64: #define __x86_64__ 1
 // RUN: %clang -march=btver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER1_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M32
 // CHECK_BTVER1_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_BTVER1_M32: #define __LZCNT__ 1
@@ -1335,7 +1489,6 @@
 // CHECK_BTVER1_M32: #define __SSE_MATH__ 1
 // CHECK_BTVER1_M32: #define __SSE__ 1
 // CHECK_BTVER1_M32: #define __SSSE3__ 1
-// CHECK_BTVER1_M32: #define __XSAVE__ 1
 // CHECK_BTVER1_M32: #define __btver1 1
 // CHECK_BTVER1_M32: #define __btver1__ 1
 // CHECK_BTVER1_M32: #define __i386 1
@@ -1343,7 +1496,7 @@
 // CHECK_BTVER1_M32: #define __tune_btver1__ 1
 // RUN: %clang -march=btver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER1_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M64
 // CHECK_BTVER1_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_BTVER1_M64: #define __LZCNT__ 1
@@ -1357,7 +1510,6 @@
 // CHECK_BTVER1_M64: #define __SSE_MATH__ 1
 // CHECK_BTVER1_M64: #define __SSE__ 1
 // CHECK_BTVER1_M64: #define __SSSE3__ 1
-// CHECK_BTVER1_M64: #define __XSAVE__ 1
 // CHECK_BTVER1_M64: #define __amd64 1
 // CHECK_BTVER1_M64: #define __amd64__ 1
 // CHECK_BTVER1_M64: #define __btver1 1
@@ -1367,7 +1519,7 @@
 // CHECK_BTVER1_M64: #define __x86_64__ 1
 // RUN: %clang -march=btver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M32
 // CHECK_BTVER2_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_BTVER2_M32: #define __AES__ 1
@@ -1395,7 +1547,7 @@
 // CHECK_BTVER2_M32: #define __tune_btver2__ 1
 // RUN: %clang -march=btver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M64
 // CHECK_BTVER2_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_BTVER2_M64: #define __AES__ 1
@@ -1425,7 +1577,7 @@
 // CHECK_BTVER2_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER1_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M32
 // CHECK_BDVER1_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER1_M32: #define __AES__ 1
@@ -1454,7 +1606,7 @@
 // CHECK_BDVER1_M32: #define __tune_bdver1__ 1
 // RUN: %clang -march=bdver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER1_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M64
 // CHECK_BDVER1_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER1_M64: #define __AES__ 1
@@ -1485,7 +1637,7 @@
 // CHECK_BDVER1_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M32
 // CHECK_BDVER2_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER2_M32: #define __AES__ 1
@@ -1518,7 +1670,7 @@
 // CHECK_BDVER2_M32: #define __tune_bdver2__ 1
 // RUN: %clang -march=bdver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M64
 // CHECK_BDVER2_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER2_M64: #define __AES__ 1
@@ -1553,7 +1705,7 @@
 // CHECK_BDVER2_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M32
 // CHECK_BDVER3_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER3_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER3_M32: #define __AES__ 1
@@ -1588,7 +1740,7 @@
 // CHECK_BDVER3_M32: #define __tune_bdver3__ 1
 // RUN: %clang -march=bdver3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M64
 // CHECK_BDVER3_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER3_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER3_M64: #define __AES__ 1
@@ -1625,7 +1777,7 @@
 // CHECK_BDVER3_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER4_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M32
 // CHECK_BDVER4_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER4_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER4_M32: #define __AES__ 1
@@ -1661,7 +1813,7 @@
 // CHECK_BDVER4_M32: #define __tune_bdver4__ 1
 // RUN: %clang -march=bdver4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER4_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M64
 // CHECK_BDVER4_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER4_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER4_M64: #define __AES__ 1
@@ -1703,36 +1855,36 @@
 // Begin PPC/GCC/Linux tests ----------------
 // RUN: %clang -mvsx -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_VSX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_VSX_M64
 //
-// CHECK_PPC_VSX_M64: #define __VSX__
+// CHECK_PPC_VSX_M64: #define __VSX__ 1
 //
 // RUN: %clang -mpower8-vector -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_POWER8_VECTOR_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_POWER8_VECTOR_M64
 //
-// CHECK_PPC_POWER8_VECTOR_M64: #define __POWER8_VECTOR__
+// CHECK_PPC_POWER8_VECTOR_M64: #define __POWER8_VECTOR__ 1
 //
 // RUN: %clang -mcrypto -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_CRYPTO_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_CRYPTO_M64
 //
-// CHECK_PPC_CRYPTO_M64: #define __CRYPTO__
+// CHECK_PPC_CRYPTO_M64: #define __CRYPTO__ 1
 //
 // RUN: %clang -mcpu=ppc64 -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-unknown \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_GCC_ATOMICS
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_GCC_ATOMICS
 // RUN: %clang -mcpu=pwr8 -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-unknown \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_GCC_ATOMICS
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_GCC_ATOMICS
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64le-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_GCC_ATOMICS
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_GCC_ATOMICS
 //
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // End PPC/GCC/Linux tests ------------------
 
@@ -1740,10 +1892,10 @@
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparc-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARC
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC
 // RUN: %clang -mcpu=v9 -E -dM %s -o - 2>&1 \
 // RUN:     -target sparc-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARC-V9
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC-V9
 //
 // CHECK_SPARC: #define __BIG_ENDIAN__ 1
 // CHECK_SPARC: #define __sparc 1
@@ -1762,16 +1914,28 @@
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparcel-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARCEL
-//
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=myriad2 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-1 -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=myriad2.1 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-1 -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=myriad2.2 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-2 -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=ma2450 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-2 -check-prefix=CHECK_SPARCEL
 // CHECK_SPARCEL: #define __LITTLE_ENDIAN__ 1
+// CHECK_MYRIAD2-1: #define __myriad2 1
+// CHECK_MYRIAD2-1: #define __myriad2__ 1
+// CHECK_MYRIAD2-2: #define __myriad2 2
+// CHECK_MYRIAD2-2: #define __myriad2__ 2
 // CHECK_SPARCEL: #define __sparc 1
 // CHECK_SPARCEL: #define __sparc__ 1
+// CHECK_MYRIAD2-1: #define __sparc_v8__ 1
 // CHECK_SPARCEL: #define __sparcv8 1
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparcv9-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARCV9
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARCV9
 //
 // CHECK_SPARCV9: #define __BIG_ENDIAN__ 1
 // CHECK_SPARCV9: #define __sparc 1
@@ -1785,8 +1949,12 @@
 //
 // RUN: %clang -march=z10 -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_Z10
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_Z10
 //
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 // CHECK_SYSTEMZ_Z10: #define __LONG_DOUBLE_128__ 1
 // CHECK_SYSTEMZ_Z10: #define __s390__ 1
 // CHECK_SYSTEMZ_Z10: #define __s390x__ 1
@@ -1794,8 +1962,12 @@
 //
 // RUN: %clang -march=zEC12 -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_ZEC12
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZEC12
 //
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 // CHECK_SYSTEMZ_ZEC12: #define __HTM__ 1
 // CHECK_SYSTEMZ_ZEC12: #define __LONG_DOUBLE_128__ 1
 // CHECK_SYSTEMZ_ZEC12: #define __s390__ 1
@@ -1804,15 +1976,29 @@
 //
 // RUN: %clang -mhtm -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_HTM
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_HTM
 //
 // CHECK_SYSTEMZ_HTM: #define __HTM__ 1
 //
 // RUN: %clang -fzvector -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
 // RUN: %clang -mzvector -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
 //
 // CHECK_SYSTEMZ_ZVECTOR: #define __VEC__ 10301
+
+// Begin amdgcn tests ----------------
+//
+// RUN: %clang -march=amdgcn -E -dM %s -o - 2>&1 \
+// RUN:     -target amdgcn-unknown-unknown \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDGCN
+// CHECK_AMDGCN: #define __AMDGCN__ 1
+
+// Begin r600 tests ----------------
+//
+// RUN: %clang -march=amdgcn -E -dM %s -o - 2>&1 \
+// RUN:     -target r600-unknown-unknown \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_R600
+// CHECK_R600: #define __R600__ 1
diff --git a/test/Preprocessor/predefined-macros.c b/test/Preprocessor/predefined-macros.c
index 1f68a08..7385cd2 100644
--- a/test/Preprocessor/predefined-macros.c
+++ b/test/Preprocessor/predefined-macros.c
@@ -1,23 +1,25 @@
 // This test verifies that the correct macros are predefined.
 //
-// RUN: %clang_cc1 %s -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
-// RUN:     -fms-compatibility-version=13.00 -o - | FileCheck %s --check-prefix=CHECK-MS
+// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
+// RUN:     -fms-compatibility-version=19.00 -std=c++1z -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS
 // CHECK-MS: #define _INTEGRAL_MAX_BITS 64
 // CHECK-MS: #define _MSC_EXTENSIONS 1
-// CHECK-MS: #define _MSC_VER 1300
+// CHECK-MS: #define _MSC_VER 1900
+// CHECK-MS: #define _MSVC_LANG 201403L
 // CHECK-MS: #define _M_IX86 600
-// CHECK-MS: #define _M_IX86_FP
+// CHECK-MS: #define _M_IX86_FP 0
 // CHECK-MS: #define _WIN32 1
 // CHECK-MS-NOT: #define __STRICT_ANSI__
 // CHECK-MS-NOT: GCC
 // CHECK-MS-NOT: GNU
 // CHECK-MS-NOT: GXX
 //
-// RUN: %clang_cc1 %s -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
-// RUN:     -fms-compatibility-version=13.00 -o - | FileCheck %s --check-prefix=CHECK-MS64
+// RUN: %clang_cc1 %s -x c++ -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
+// RUN:     -fms-compatibility-version=19.00 -std=c++14 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS64
 // CHECK-MS64: #define _INTEGRAL_MAX_BITS 64
 // CHECK-MS64: #define _MSC_EXTENSIONS 1
-// CHECK-MS64: #define _MSC_VER 1300
+// CHECK-MS64: #define _MSC_VER 1900
+// CHECK-MS64: #define _MSVC_LANG 201402L
 // CHECK-MS64: #define _M_AMD64 100
 // CHECK-MS64: #define _M_X64 100
 // CHECK-MS64: #define _WIN64 1
@@ -27,7 +29,7 @@
 // CHECK-MS64-NOT: GXX
 //
 // RUN: %clang_cc1 %s -E -dM -triple i686-pc-win32 -fms-compatibility \
-// RUN:     -o - | FileCheck %s --check-prefix=CHECK-MS-STDINT
+// RUN:     -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-STDINT
 // CHECK-MS-STDINT:#define __INT16_MAX__ 32767
 // CHECK-MS-STDINT:#define __INT32_MAX__ 2147483647
 // CHECK-MS-STDINT:#define __INT64_MAX__ 9223372036854775807LL
@@ -83,66 +85,103 @@
 // CHECK-MS-STDINT:#define __UINT_LEAST8_TYPE__ unsigned char
 //
 // RUN: %clang_cc1 %s -E -dM -ffast-math -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-FAST-MATH
-// CHECK-FAST-MATH: #define __FAST_MATH__
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FAST-MATH
+// CHECK-FAST-MATH: #define __FAST_MATH__ 1
 // CHECK-FAST-MATH: #define __FINITE_MATH_ONLY__ 1
 //
 // RUN: %clang_cc1 %s -E -dM -ffinite-math-only -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-FINITE-MATH-ONLY
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-ONLY
 // CHECK-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 1
 //
 // RUN: %clang %s -E -dM -fno-finite-math-only -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-NO-FINITE-MATH-ONLY
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-FINITE-MATH-ONLY
 // CHECK-NO-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 0
 //
 // RUN: %clang_cc1 %s -E -dM -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-FINITE-MATH-FLAG-UNDEFINED
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-FLAG-UNDEFINED
 // CHECK-FINITE-MATH-FLAG-UNDEFINED: #define __FINITE_MATH_ONLY__ 0
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple i686 -target-cpu i386 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_I386
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_I386
 // CHECK-SYNC_CAS_I386-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple i686 -target-cpu i486 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_I486
-// CHECK-SYNC_CAS_I486: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_I486: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_I486: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_I486
+// CHECK-SYNC_CAS_I486: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_I486: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_I486: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
 // CHECK-SYNC_CAS_I486-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple i686 -target-cpu i586 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_I586
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_I586
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple armv6 -target-cpu arm1136j-s \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_ARM
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_ARM
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple armv7 -target-cpu cortex-a8 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_ARMv7
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_ARMv7
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple armv6 -target-cpu cortex-m0 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_ARMv6
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_ARMv6
 // CHECK-SYNC_CAS_ARMv6-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple mips -target-cpu mips2 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_MIPS \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_MIPS \
 // RUN:         --check-prefix=CHECK-SYNC_CAS_MIPS32
 // RUN: %clang_cc1 %s -E -dM -o - -triple mips64 -target-cpu mips3 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_MIPS \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_MIPS \
 // RUN:         --check-prefix=CHECK-SYNC_CAS_MIPS64
-// CHECK-SYNC_CAS_MIPS:       __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_MIPS:       __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_MIPS:       __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+// CHECK-SYNC_CAS_MIPS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_MIPS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_MIPS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
 // CHECK-SYNC_CAS_MIPS32-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
-// CHECK-SYNC_CAS_MIPS64:     __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// CHECK-SYNC_CAS_MIPS64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x cl \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL10
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-std=CL1.1 \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL11
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-std=CL1.2 \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL12
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-std=CL2.0 \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL20
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-fast-relaxed-math \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FRM
+// CHECK-CL10: #define CL_VERSION_1_0 100
+// CHECK-CL10: #define CL_VERSION_1_1 110
+// CHECK-CL10: #define CL_VERSION_1_2 120
+// CHECK-CL10: #define CL_VERSION_2_0 200
+// CHECK-CL10: #define __OPENCL_C_VERSION__ 100
+// CHECK-CL10-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-CL11: #define CL_VERSION_1_0 100
+// CHECK-CL11: #define CL_VERSION_1_1 110
+// CHECK-CL11: #define CL_VERSION_1_2 120
+// CHECK-CL11: #define CL_VERSION_2_0 200
+// CHECK-CL11: #define __OPENCL_C_VERSION__ 110
+// CHECK-CL11-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-CL12: #define CL_VERSION_1_0 100
+// CHECK-CL12: #define CL_VERSION_1_1 110
+// CHECK-CL12: #define CL_VERSION_1_2 120
+// CHECK-CL12: #define CL_VERSION_2_0 200
+// CHECK-CL12: #define __OPENCL_C_VERSION__ 120
+// CHECK-CL12-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-CL20: #define CL_VERSION_1_0 100
+// CHECK-CL20: #define CL_VERSION_1_1 110
+// CHECK-CL20: #define CL_VERSION_1_2 120
+// CHECK-CL20: #define CL_VERSION_2_0 200
+// CHECK-CL20: #define __OPENCL_C_VERSION__ 200
+// CHECK-CL20-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-FRM: #define __FAST_RELAXED_MATH__ 1
+
diff --git a/test/Preprocessor/pushable-diagnostics.c b/test/Preprocessor/pushable-diagnostics.c
index 877eaaa..6e05d8e 100644
--- a/test/Preprocessor/pushable-diagnostics.c
+++ b/test/Preprocessor/pushable-diagnostics.c
@@ -15,3 +15,27 @@
 int c = 'df';  // expected-warning{{multi-character character constant}}
 
 #pragma clang diagnostic pop // expected-warning{{pragma diagnostic pop could not pop, no matching push}}
+
+// Test -Weverything
+
+void ppo0(){} // first verify that no diagnostic is emitted for this
+#pragma clang diagnostic push // now push
+
+#pragma clang diagnostic warning "-Weverything" 
+void ppr1(){} // expected-warning {{no previous prototype for function 'ppr1'}}
+
+#pragma clang diagnostic push // push again
+#pragma clang diagnostic ignored "-Weverything"  // Set to ignore in this level.
+void pps2(){}
+#pragma clang diagnostic warning "-Weverything"  // Set to warning in this level.
+void ppt2(){} // expected-warning {{no previous prototype for function 'ppt2'}}
+#pragma clang diagnostic error "-Weverything"  // Set to error in this level.
+void ppt3(){} // expected-error {{no previous prototype for function 'ppt3'}}
+#pragma clang diagnostic pop // pop should go back to warning level
+
+void pps1(){} // expected-warning {{no previous prototype for function 'pps1'}}
+
+
+#pragma clang diagnostic pop // Another pop should disable it again
+void ppu(){}
+
diff --git a/test/Preprocessor/stringize_misc.c b/test/Preprocessor/stringize_misc.c
index 6c2c78d..fc7253e 100644
--- a/test/Preprocessor/stringize_misc.c
+++ b/test/Preprocessor/stringize_misc.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -E %s | FileCheck -strict-whitespace %s
+#ifdef TEST1
+// RUN: %clang_cc1 -E %s -DTEST1 | FileCheck -strict-whitespace %s
 
 #define M(x, y) #x #y
 
@@ -28,3 +29,13 @@
 M(a COMMA b, (a, b)) 
 // CHECK: "a COMMA b" "(a, b)"
 
+#endif
+
+#ifdef TEST2
+// RUN: %clang_cc1 -fsyntax-only -verify %s -DTEST2
+
+#define HASH #
+#define INVALID() #
+// expected-error@-1{{'#' is not followed by a macro parameter}}
+
+#endif
diff --git a/test/Preprocessor/sysroot-prefix.c b/test/Preprocessor/sysroot-prefix.c
new file mode 100644
index 0000000..08c72f5
--- /dev/null
+++ b/test/Preprocessor/sysroot-prefix.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -v -isysroot /var/empty -I /var/empty/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -I =/var/empty/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_SYSROOT_DEV_NULL %s
+// RUN: %clang_cc1 -v -I =/var/empty/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-NO_ISYSROOT_SYSROOT_DEV_NULL %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_SYSROOT_NULL %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -isysroot /var/empty/root -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL %s
+// RUN: %clang_cc1 -v -isysroot /var/empty/root -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL %s
+
+// CHECK-ISYSROOT_NO_SYSROOT: ignoring nonexistent directory "/var/empty/include"
+// CHECK-ISYSROOT_NO_SYSROOT-NOT: ignoring nonexistent directory "/var/empty/var/empty/include"
+
+// CHECK-ISYSROOT_SYSROOT_DEV_NULL: ignoring nonexistent directory "/var/empty/var/empty/include"
+// CHECK-ISYSROOT_SYSROOT_DEV_NULL-NOT: ignoring nonexistent directory "/var/empty"
+
+// CHECK-NO_ISYSROOT_SYSROOT_DEV_NULL: ignoring nonexistent directory "=/var/empty/include"
+// CHECK-NO_ISYSROOT_SYSROOT_DEV_NULL-NOT: ignoring nonexistent directory "/var/empty/include"
+
+// CHECK-ISYSROOT_SYSROOT_NULL: ignoring nonexistent directory "/var/empty{{.}}null"
+// CHECK-ISYSROOT_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
+
+// CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL: ignoring nonexistent directory "/var/empty/root{{.}}null"
+// CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
+
+// CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL: ignoring nonexistent directory "/var/empty{{.}}null"
+// CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
+
diff --git a/test/Preprocessor/warning_tests.c b/test/Preprocessor/warning_tests.c
index c0c22ef..1f2e884 100644
--- a/test/Preprocessor/warning_tests.c
+++ b/test/Preprocessor/warning_tests.c
@@ -12,7 +12,7 @@
 #endif
 
 // expected-error@+2 {{expected string literal in '__has_warning'}}
-// expected-error@+1 {{expected value in expression}}
+// expected-error@+1 {{missing ')'}} expected-note@+1 {{match}}
 #if __has_warning(-Wfoo)
 #endif
 
@@ -22,8 +22,7 @@
 #warning Not a valid warning flag
 #endif
 
-// expected-error@+2 {{builtin warning check macro requires a parenthesized string}}
-// expected-error@+1 {{invalid token}}
+// expected-error@+1 {{missing '(' after '__has_warning'}}
 #if __has_warning "not valid"
 #endif
 
@@ -33,7 +32,7 @@
 
 #define MY_ALIAS "-Wparentheses"
 
-// expected-error@+1 2{{expected}}
+// expected-error@+1 {{expected}}
 #if __has_warning(MY_ALIAS)
 #error Alias expansion not allowed
 #endif
diff --git a/test/Preprocessor/x86_target_features.c b/test/Preprocessor/x86_target_features.c
index 9c4192c..ff79a69 100644
--- a/test/Preprocessor/x86_target_features.c
+++ b/test/Preprocessor/x86_target_features.c
@@ -1,4 +1,4 @@
-// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE4 %s
+// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE4 %s
 
 // SSE4: #define __SSE2_MATH__ 1
 // SSE4: #define __SSE2__ 1
@@ -9,11 +9,11 @@
 // SSE4: #define __SSE__ 1
 // SSE4: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4.1 -mno-sse4 -x c -E -dM -o - %s | FileCheck --check-prefix=NOSSE4 %s
+// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4.1 -mno-sse4 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOSSE4 %s
 
 // NOSSE4-NOT: #define __SSE4_1__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE %s
+// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE %s
 
 // SSE-NOT: #define __SSE2_MATH__ 1
 // SSE-NOT: #define __SSE2__ 1
@@ -24,7 +24,7 @@
 // SSE: #define __SSE__ 1
 // SSE-NOT: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentium-m -x c -E -dM -o - %s | FileCheck --check-prefix=SSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentium-m -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE2 %s
 
 // SSE2: #define __SSE2_MATH__ 1
 // SSE2: #define __SSE2__ 1
@@ -35,7 +35,7 @@
 // SSE2: #define __SSE__ 1
 // SSE2-NOT: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mno-sse -mavx -x c -E -dM -o - %s | FileCheck --check-prefix=AVX %s
+// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mno-sse -mavx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX %s
 
 // AVX: #define __AVX__ 1
 // AVX: #define __SSE2_MATH__ 1
@@ -47,7 +47,7 @@
 // AVX: #define __SSE__ 1
 // AVX: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mxop -mno-avx -x c -E -dM -o - %s | FileCheck --check-prefix=SSE4A %s
+// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mxop -mno-avx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE4A %s
 
 // SSE4A: #define __SSE2_MATH__ 1
 // SSE4A: #define __SSE2__ 1
@@ -59,7 +59,7 @@
 // SSE4A: #define __SSE__ 1
 // SSE4A: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512F %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F %s
 
 // AVX512F: #define __AVX2__ 1
 // AVX512F: #define __AVX512F__ 1
@@ -73,7 +73,7 @@
 // AVX512F: #define __SSE__ 1
 // AVX512F: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512CD %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512CD %s
 
 // AVX512CD: #define __AVX2__ 1
 // AVX512CD: #define __AVX512CD__ 1
@@ -88,7 +88,7 @@
 // AVX512CD: #define __SSE__ 1
 // AVX512CD: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512ER %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512ER %s
 
 // AVX512ER: #define __AVX2__ 1
 // AVX512ER: #define __AVX512ER__ 1
@@ -103,7 +103,7 @@
 // AVX512ER: #define __SSE__ 1
 // AVX512ER: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512PF %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512PF %s
 
 // AVX512PF: #define __AVX2__ 1
 // AVX512PF: #define __AVX512F__ 1
@@ -118,7 +118,7 @@
 // AVX512PF: #define __SSE__ 1
 // AVX512PF: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512DQ %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512DQ %s
 
 // AVX512DQ: #define __AVX2__ 1
 // AVX512DQ: #define __AVX512DQ__ 1
@@ -133,7 +133,7 @@
 // AVX512DQ: #define __SSE__ 1
 // AVX512DQ: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bw -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512BW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BW %s
 
 // AVX512BW: #define __AVX2__ 1
 // AVX512BW: #define __AVX512BW__ 1
@@ -148,7 +148,7 @@
 // AVX512BW: #define __SSE__ 1
 // AVX512BW: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vl -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512VL %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512VL %s
 
 // AVX512VL: #define __AVX2__ 1
 // AVX512VL: #define __AVX512F__ 1
@@ -163,7 +163,7 @@
 // AVX512VL: #define __SSE__ 1
 // AVX512VL: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512F2 %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F2 %s
 
 // AVX512F2: #define __AVX2__ 1
 // AVX512F2-NOT: #define __AVX512F__ 1
@@ -178,141 +178,171 @@
 // AVX512F2: #define __SSE__ 1
 // AVX512F2: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse4.2 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE42POPCNT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512ifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512IFMA %s
+
+// AVX512IFMA: #define __AVX2__ 1
+// AVX512IFMA: #define __AVX512F__ 1
+// AVX512IFMA: #define __AVX512IFMA__ 1
+// AVX512IFMA: #define __AVX__ 1
+// AVX512IFMA: #define __SSE2_MATH__ 1
+// AVX512IFMA: #define __SSE2__ 1
+// AVX512IFMA: #define __SSE3__ 1
+// AVX512IFMA: #define __SSE4_1__ 1
+// AVX512IFMA: #define __SSE4_2__ 1
+// AVX512IFMA: #define __SSE_MATH__ 1
+// AVX512IFMA: #define __SSE__ 1
+// AVX512IFMA: #define __SSSE3__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vbmi -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512VBMI %s
+
+// AVX512VBMI: #define __AVX2__ 1
+// AVX512VBMI: #define __AVX512F__ 1
+// AVX512VBMI: #define __AVX512VBMI__ 1
+// AVX512VBMI: #define __AVX__ 1
+// AVX512VBMI: #define __SSE2_MATH__ 1
+// AVX512VBMI: #define __SSE2__ 1
+// AVX512VBMI: #define __SSE3__ 1
+// AVX512VBMI: #define __SSE4_1__ 1
+// AVX512VBMI: #define __SSE4_2__ 1
+// AVX512VBMI: #define __SSE_MATH__ 1
+// AVX512VBMI: #define __SSE__ 1
+// AVX512VBMI: #define __SSSE3__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE42POPCNT %s
 
 // SSE42POPCNT: #define __POPCNT__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mno-popcnt -msse4.2 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE42NOPOPCNT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-popcnt -msse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE42NOPOPCNT %s
 
 // SSE42NOPOPCNT-NOT: #define __POPCNT__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mpopcnt -mno-sse4.2 -x c -E -dM -o - %s | FileCheck --check-prefix=NOSSE42POPCNT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mpopcnt -mno-sse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOSSE42POPCNT %s
 
 // NOSSE42POPCNT: #define __POPCNT__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse -x c -E -dM -o - %s | FileCheck --check-prefix=SSEMMX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSEMMX %s
 
 // SSEMMX: #define __MMX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-sse -x c -E -dM -o - %s | FileCheck --check-prefix=SSENOSSEMMX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-sse -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSENOSSEMMX %s
 
 // SSENOSSEMMX-NOT: #define __MMX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-mmx -x c -E -dM -o - %s | FileCheck --check-prefix=SSENOMMX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-mmx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSENOMMX %s
 
 // SSENOMMX-NOT: #define __MMX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -x c -E -dM -o - %s | FileCheck --check-prefix=F16C %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=F16C %s
 
 // F16C: #define __AVX__ 1
 // F16C: #define __F16C__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -mno-avx -x c -E -dM -o - %s | FileCheck --check-prefix=F16CNOAVX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -mno-avx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=F16CNOAVX %s
 
 // F16CNOAVX-NOT: #define __AVX__ 1
 // F16CNOAVX-NOT: #define __F16C__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -x c -E -dM -o - %s | FileCheck --check-prefix=PCLMUL %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=PCLMUL %s
 
 // PCLMUL: #define __PCLMUL__ 1
 // PCLMUL: #define __SSE2__ 1
 // PCLMUL-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=PCLMULNOSSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=PCLMULNOSSE2 %s
 
 // PCLMULNOSSE2-NOT: #define __PCLMUL__ 1
 // PCLMULNOSSE2-NOT: #define __SSE2__ 1
 // PCLMULNOSSE2-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -x c -E -dM -o - %s | FileCheck --check-prefix=AES %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AES %s
 
 // AES: #define __AES__ 1
 // AES: #define __SSE2__ 1
 // AES-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=AESNOSSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AESNOSSE2 %s
 
 // AESNOSSE2-NOT: #define __AES__ 1
 // AESNOSSE2-NOT: #define __SSE2__ 1
 // AESNOSSE2-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -x c -E -dM -o - %s | FileCheck --check-prefix=SHA %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHA %s
 
 // SHA: #define __SHA__ 1
 // SHA: #define __SSE2__ 1
 // SHA-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sha -x c -E -dM -o - %s | FileCheck --check-prefix=SHANOSHA %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sha -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHANOSHA %s
 
 // SHANOSHA-NOT: #define __SHA__ 1
 // SHANOSHA-NOT: #define __SSE2__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=SHANOSSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHANOSSE2 %s
 
 // SHANOSSE2-NOT: #define __SHA__ 1
 // SHANOSSE2-NOT: #define __SSE2__ 1
 // SHANOSSE2-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mtbm -x c -E -dM -o - %s | FileCheck --check-prefix=TBM %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mtbm -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=TBM %s
 
 // TBM: #define __TBM__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=bdver2 -mno-tbm -x c -E -dM -o - %s | FileCheck --check-prefix=NOTBM %s
+// RUN: %clang -target i386-unknown-unknown -march=bdver2 -mno-tbm -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOTBM %s
 
 // NOTBM-NOT: #define __TBM__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mcx16 -x c -E -dM -o - %s | FileCheck --check-prefix=MCX16 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mcx16 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=MCX16 %s
 
 // MCX16: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -x c -E -dM -o - %s | FileCheck --check-prefix=PRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=PRFCHW %s
 
 // PRFCHW: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=btver2 -mno-prfchw -x c -E -dM -o - %s | FileCheck --check-prefix=NOPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=btver2 -mno-prfchw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOPRFCHW %s
 
 // NOPRFCHW-NOT: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -m3dnow -x c -E -dM -o - %s | FileCheck --check-prefix=3DNOWPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWPRFCHW %s
 
 // 3DNOWPRFCHW: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mno-prfchw -m3dnow -x c -E -dM -o - %s | FileCheck --check-prefix=3DNOWNOPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-prfchw -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWNOPRFCHW %s
 
 // 3DNOWNOPRFCHW-NOT: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -mno-3dnow -x c -E -dM -o - %s | FileCheck --check-prefix=NO3DNOWPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -mno-3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NO3DNOWPRFCHW %s
 
 // NO3DNOWPRFCHW: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -madx -x c -E -dM -o - %s | FileCheck --check-prefix=ADX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -madx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=ADX %s
 
 // ADX: #define __ADX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck --check-prefix=RDSEED %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s
 
 // RDSEED: #define __RDSEED__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsave -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVE %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVE %s
 
 // XSAVE: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVEOPT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVEOPT %s
 
 // XSAVEOPT: #define __XSAVEOPT__ 1
 // XSAVEOPT: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsavec -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVEC %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsavec -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVEC %s
 
 // XSAVEC: #define __XSAVEC__ 1
 // XSAVEC: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaves -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVES %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaves -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVES %s
 
 // XSAVES: #define __XSAVES__ 1
 // XSAVES: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck --check-prefix=NOXSAVE %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s
 
 // NOXSAVE-NOT: #define __XSAVEOPT__ 1
 // NOXSAVE-NOT: #define __XSAVE__ 1
diff --git a/test/Profile/Inputs/profile-summary.proftext b/test/Profile/Inputs/profile-summary.proftext
new file mode 100644
index 0000000..c744f7a
--- /dev/null
+++ b/test/Profile/Inputs/profile-summary.proftext
@@ -0,0 +1,26 @@
+begin
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+1
+0
+
+main
+# Func Hash:
+0
+# Num Counters:
+1
+# Counter Values:
+1
+
+end
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+2
+2
+
diff --git a/test/Profile/c-avoid-direct-call.c b/test/Profile/c-avoid-direct-call.c
index 30660bc..cd02e71 100644
--- a/test/Profile/c-avoid-direct-call.c
+++ b/test/Profile/c-avoid-direct-call.c
@@ -1,6 +1,6 @@
 // Check the value profiling instrinsics emitted by instrumentation.
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-avoid-direct-call.c %s -o - -emit-llvm -fprofile-instr-generate -mllvm -enable-value-profiling | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-avoid-direct-call.c %s -o - -emit-llvm -fprofile-instrument=clang -mllvm -enable-value-profiling | FileCheck %s
 
 void foo();
 
diff --git a/test/Profile/c-captured.c b/test/Profile/c-captured.c
index e859628..bae2dcb 100644
--- a/test/Profile/c-captured.c
+++ b/test/Profile/c-captured.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck -check-prefix=PGOGEN -check-prefix=PGOALL %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck -check-prefix=PGOGEN -check-prefix=PGOALL %s
 
 // RUN: llvm-profdata merge %S/Inputs/c-captured.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE -check-prefix=PGOALL %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE -check-prefix=PGOALL %s
 
 // PGOGEN: @[[DCC:__profc_debug_captured]] = private global [3 x i64] zeroinitializer
 // PGOGEN: @[[CSC:__profc_c_captured.c___captured_stmt]] = private global [2 x i64] zeroinitializer
diff --git a/test/Profile/c-counter-overflows.c b/test/Profile/c-counter-overflows.c
index 18a3d33..5cb32bb 100644
--- a/test/Profile/c-counter-overflows.c
+++ b/test/Profile/c-counter-overflows.c
@@ -2,7 +2,7 @@
 // truncated.
 
 // RUN: llvm-profdata merge %S/Inputs/c-counter-overflows.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-counter-overflows.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-counter-overflows.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck %s
 
 typedef unsigned long long uint64_t;
 
diff --git a/test/Profile/c-general.c b/test/Profile/c-general.c
index 03631d8..da3b7f2 100644
--- a/test/Profile/c-general.c
+++ b/test/Profile/c-general.c
@@ -1,12 +1,12 @@
 // Test instrumentation of general constructs in C.
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck -check-prefix=PGOGEN %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck -check-prefix=PGOGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/c-general.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE %s
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-use=%S/Inputs/c-general.profdata.v3 | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v3 | FileCheck -check-prefix=PGOUSE %s
 // Also check compatibility with older profiles.
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-use=%S/Inputs/c-general.profdata.v1 | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v1 | FileCheck -check-prefix=PGOUSE %s
 
 // PGOGEN: @[[SLC:__profc_simple_loops]] = private global [4 x i64] zeroinitializer
 // PGOGEN: @[[IFC:__profc_conditionals]] = private global [11 x i64] zeroinitializer
diff --git a/test/Profile/c-generate.c b/test/Profile/c-generate.c
index 8be4e28..5e5b22e 100644
--- a/test/Profile/c-generate.c
+++ b/test/Profile/c-generate.c
@@ -1,9 +1,12 @@
-// Check that the -fprofile-instr-generate= form works.
-// RUN: %clang_cc1 -main-file-name c-generate.c %s -o - -emit-llvm -fprofile-instr-generate=c-generate-test.profraw | FileCheck %s
-
-// CHECK: private constant [24 x i8] c"c-generate-test.profraw\00"
-// CHECK: call void @__llvm_profile_override_default_filename(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @0, i32 0, i32 0))
-// CHECK: declare void @__llvm_profile_override_default_filename(i8*)
+// Check that the -fprofile-instrument-path= form works.
+// RUN: %clang_cc1 -main-file-name c-generate.c %s -o - -emit-llvm -fprofile-instrument=clang -fprofile-instrument-path=c-generate-test.profraw | FileCheck %s --check-prefix=PROF-INSTR-PATH
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=none | FileCheck %s --check-prefix=PROF-INSTR-NONE
+// RUN: not %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=garbage 2>&1 | FileCheck %s --check-prefix=PROF-INSTR-GARBAGE
+//
+// PROF-INSTR-PATH: constant [24 x i8] c"c-generate-test.profraw\00"
+//
+// PROF-INSTR-NONE-NOT: @__profn_main
+// PROF-INSTR-GARBAGE: invalid PGO instrumentor in argument '-fprofile-instrument=garbage'
 
 int main(void) {
   return 0;
diff --git a/test/Profile/c-indirect-call.c b/test/Profile/c-indirect-call.c
index d73d09a..b0ace37 100644
--- a/test/Profile/c-indirect-call.c
+++ b/test/Profile/c-indirect-call.c
@@ -1,13 +1,14 @@
-// Check the data structures emitted by instrumentation.
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-indirect-call.c %s -o - -emit-llvm -fprofile-instr-generate -mllvm -enable-value-profiling | FileCheck %s
+// Check the value profiling intrinsics emitted by instrumentation.
+
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-indirect-call.c %s -o - -emit-llvm -fprofile-instrument=clang -mllvm -enable-value-profiling | FileCheck %s
 
 void (*foo)(void);
 
 int main(void) {
 // CHECK:  [[REG1:%[0-9]+]] = load void ()*, void ()** @foo, align 8
-// CHECK-NEXT:  call void [[REG1]]()
 // CHECK-NEXT:  [[REG2:%[0-9]+]] = ptrtoint void ()* [[REG1]] to i64
 // CHECK-NEXT:  call void @__llvm_profile_instrument_target(i64 [[REG2]], i8* bitcast ({{.*}}* @__profd_main to i8*), i32 0)
+// CHECK-NEXT:  call void [[REG1]]()
   foo();
   return 0;
 }
diff --git a/test/Profile/c-linkage-available_externally.c b/test/Profile/c-linkage-available_externally.c
index 61a2586..8907839 100644
--- a/test/Profile/c-linkage-available_externally.c
+++ b/test/Profile/c-linkage-available_externally.c
@@ -1,6 +1,6 @@
 // Make sure instrumentation data from available_externally functions doesn't
 // get thrown out and are emitted with the expected linkage.
-// RUN: %clang_cc1 -O2 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage-available_externally.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -O2 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage-available_externally.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 // CHECK: @__profc_foo = linkonce_odr hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
 // CHECK: @__profd_foo = linkonce_odr hidden global {{.*}} i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_foo, i32 0, i32 0){{.*}}, section "__DATA,__llvm_prf_data", align 8
diff --git a/test/Profile/c-linkage.c b/test/Profile/c-linkage.c
index c82dcab..50ac558 100644
--- a/test/Profile/c-linkage.c
+++ b/test/Profile/c-linkage.c
@@ -1,5 +1,5 @@
 // Check that the profiling counters and data we create have the linkage we expect
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 // CHECK: @__profc_foo = private global
 // CHECK: @__profd_foo = private global
diff --git a/test/Profile/c-outdated-data.c b/test/Profile/c-outdated-data.c
index d0503ac..e61ad02 100644
--- a/test/Profile/c-outdated-data.c
+++ b/test/Profile/c-outdated-data.c
@@ -4,7 +4,7 @@
 // doesn't play well with warnings that have no line number.
 
 // RUN: llvm-profdata merge %S/Inputs/c-outdated-data.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-outdated-data.c %s -o /dev/null -emit-llvm -fprofile-instr-use=%t.profdata -Wprofile-instr-dropped 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-outdated-data.c %s -o /dev/null -emit-llvm -fprofile-instrument-use-path=%t.profdata -Wprofile-instr-dropped 2>&1 | FileCheck %s
 // CHECK: warning: profile data may be out of date: of 3 functions, 1 has no data and 1 has mismatched data that will be ignored
 
 void no_usable_data() {
diff --git a/test/Profile/c-unprofiled-blocks.c b/test/Profile/c-unprofiled-blocks.c
index 58bef9e..a547400 100644
--- a/test/Profile/c-unprofiled-blocks.c
+++ b/test/Profile/c-unprofiled-blocks.c
@@ -2,7 +2,7 @@
 // runs) shouldn't have any branch weight metadata added.
 
 // RUN: llvm-profdata merge %S/Inputs/c-unprofiled-blocks.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled-blocks.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled-blocks.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE %s
 
 // PGOUSE-LABEL: @never_called(i32 %i)
 int never_called(int i) {
diff --git a/test/Profile/c-unprofiled.c b/test/Profile/c-unprofiled.c
index 275cd2d..3466079 100644
--- a/test/Profile/c-unprofiled.c
+++ b/test/Profile/c-unprofiled.c
@@ -7,7 +7,7 @@
 // doesn't play well with warnings that have no line number.
 
 // RUN: llvm-profdata merge %S/Inputs/c-unprofiled.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled.c -I %S/Inputs/ %s -o /dev/null -emit-llvm -fprofile-instr-use=%t.profdata -Wprofile-instr-unprofiled 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled.c -I %S/Inputs/ %s -o /dev/null -emit-llvm -fprofile-instrument-use-path=%t.profdata -Wprofile-instr-unprofiled 2>&1 | FileCheck %s
 
 // CHECK: warning: no profile data available for file "c-unprofiled.c"
 
diff --git a/test/Profile/c-unreachable-after-switch.c b/test/Profile/c-unreachable-after-switch.c
index 7d1855d..36a7544 100644
--- a/test/Profile/c-unreachable-after-switch.c
+++ b/test/Profile/c-unreachable-after-switch.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O3 -triple x86_64-apple-macosx10.10 -main-file-name c-unreachable-after-switch.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -O3 -triple x86_64-apple-macosx10.10 -main-file-name c-unreachable-after-switch.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 // CHECK: @[[C:__profc_foo]] = private global [3 x i64] zeroinitializer
 
diff --git a/test/Profile/cxx-class.cpp b/test/Profile/cxx-class.cpp
index a534140..dbc93377 100644
--- a/test/Profile/cxx-class.cpp
+++ b/test/Profile/cxx-class.cpp
@@ -1,13 +1,13 @@
 // Tests for instrumentation of C++ methods, constructors, and destructors.
 
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-generate -fno-exceptions -target %itanium_abi_triple > %tgen
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -triple %itanium_abi_triple > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=CTRGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=DTRGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=MTHGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=WRPGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-class.proftext -o %t.profdata
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-use=%t.profdata -fno-exceptions -target %itanium_abi_triple > %tuse
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata -triple %itanium_abi_triple > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=CTRUSE %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=DTRUSE %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=MTHUSE %s
diff --git a/test/Profile/cxx-implicit.cpp b/test/Profile/cxx-implicit.cpp
index b25486a..40598bf 100644
--- a/test/Profile/cxx-implicit.cpp
+++ b/test/Profile/cxx-implicit.cpp
@@ -1,17 +1,51 @@
 // Ensure that implicit methods aren't instrumented.
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-implicit.cpp -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple %itanium_abi_triple -main-file-name cxx-implicit.cpp -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
-// An implicit constructor is generated for Base. We should not emit counters
-// for it.
+// Implicit constructors are generated for Base. We should not emit counters
+// for them.
+// CHECK-DAG: define {{.*}}_ZN4BaseC2Ev
+// CHECK-DAG: define {{.*}}_ZN4BaseC2ERKS_
+// CHECK-DAG: define {{.*}}_ZN4BaseC2EOS_
+// CHECK-DAG: __profc__ZN7DerivedC2Ev,
+// CHECK-DAG: __profc__ZN7DerivedC2ERKS_
+// CHECK-DAG: __profc__ZN7DerivedC2EOS_
 // CHECK-NOT: @__profc__ZN4BaseC2Ev =
+// CHECK-NOT: @__profc__ZN4BaseC2ERKS_
+// CHECK-NOT: @__profc__ZN4BaseC2EOS_
+//
+// Implicit assignment operators are generated for Base. We should not emit counters
+// for them.
+// CHECK-NOT: @__profc__ZN4BaseaSEOS_
+// CHECK-NOT: @__profc__ZN4BaseaSERKS_
 
-struct Base {
+struct BaseBase {
+ BaseBase();
+ BaseBase(const BaseBase &);
+ BaseBase &operator=(const BaseBase &);
+ BaseBase &operator=(BaseBase &&);
+};
+
+struct Base : public BaseBase {
   virtual void foo();
 };
 
 struct Derived : public Base {
   Derived();
+  Derived(const Derived &);
+  Derived(Derived &&);
+  Derived &operator=(const Derived &);
+  Derived &operator=(Derived &&);
 };
 
 Derived::Derived() {}
+Derived::Derived(const Derived &d) : Base(d) {}
+Derived::Derived(Derived &&d) : Base(static_cast<Base&&>(d)) {}
+Derived& Derived::operator=(const Derived &d) {
+  Base::operator=(d);
+  return *this;
+}
+Derived& Derived::operator=(Derived &&d) {
+  Base::operator=(static_cast<Base &&>(d));
+  return *this;
+}
diff --git a/test/Profile/cxx-indirect-call.cpp b/test/Profile/cxx-indirect-call.cpp
new file mode 100644
index 0000000..f95d1af
--- /dev/null
+++ b/test/Profile/cxx-indirect-call.cpp
@@ -0,0 +1,21 @@
+// Check the value profiling intrinsics emitted by instrumentation.
+
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -mllvm -enable-value-profiling -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck %s
+
+void (*foo) (void);
+
+int main(int argc, const char *argv[]) {
+// CHECK:  [[REG1:%[0-9]+]] = load void ()*, void ()** @foo
+// CHECK-NEXT:  [[REG2:%[0-9]+]] = ptrtoint void ()* [[REG1]] to i64
+// CHECK-NEXT:  call void @__llvm_profile_instrument_target(i64 [[REG2]], i8* bitcast ({{.*}}* @__profd_main to i8*), i32 0)
+// CHECK-NEXT:  invoke void [[REG1]]()
+  try {
+    foo();
+  } catch (int) {}
+  return 0;
+}
+
+// CHECK: declare void @__llvm_profile_instrument_target(i64, i8*, i32)
+
+
+
diff --git a/test/Profile/cxx-lambda.cpp b/test/Profile/cxx-lambda.cpp
index 26314c8..2b42291 100644
--- a/test/Profile/cxx-lambda.cpp
+++ b/test/Profile/cxx-lambda.cpp
@@ -1,11 +1,11 @@
 // Tests for instrumentation of C++11 lambdas
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-generate > %tgen
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=PGOGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=LMBGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-lambda.proftext -o %t.profdata
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-use=%t.profdata > %tuse
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=PGOUSE %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=LMBUSE %s
 
diff --git a/test/Profile/cxx-linkage.cpp b/test/Profile/cxx-linkage.cpp
index 5593403..6f7b2b7 100644
--- a/test/Profile/cxx-linkage.cpp
+++ b/test/Profile/cxx-linkage.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9.0 -emit-llvm -main-file-name cxx-linkage.cpp %s -o - -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9.0 -emit-llvm -main-file-name cxx-linkage.cpp %s -o - -fprofile-instrument=clang | FileCheck %s
 
 // CHECK: @__profc__Z3foov = private global
 // CHECK: @__profd__Z3foov = private global
diff --git a/test/Profile/cxx-rangefor.cpp b/test/Profile/cxx-rangefor.cpp
index 1007a70..a61557a 100644
--- a/test/Profile/cxx-rangefor.cpp
+++ b/test/Profile/cxx-rangefor.cpp
@@ -1,10 +1,10 @@
 // Tests for instrumentation of C++11 range-for
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-generate > %tgen
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=CHECK -check-prefix=PGOGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-rangefor.proftext -o %t.profdata
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-use=%t.profdata > %tuse
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=CHECK -check-prefix=PGOUSE %s
 
 // PGOGEN: @[[RFC:__profc__Z9range_forv]] = private global [5 x i64] zeroinitializer
diff --git a/test/Profile/cxx-structors.cpp b/test/Profile/cxx-structors.cpp
index 183df92..73562d3 100644
--- a/test/Profile/cxx-structors.cpp
+++ b/test/Profile/cxx-structors.cpp
@@ -1,6 +1,6 @@
 // Tests for instrumentation of C++ constructors and destructors.
 //
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -x c++ %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -x c++ %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 struct Foo {
   Foo() {}
diff --git a/test/Profile/cxx-templates.cpp b/test/Profile/cxx-templates.cpp
index c24bae3..1cec605 100644
--- a/test/Profile/cxx-templates.cpp
+++ b/test/Profile/cxx-templates.cpp
@@ -1,12 +1,12 @@
 // Tests for instrumentation of templated code. Each instantiation of a template
 // should be instrumented separately.
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-generate > %tgen
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=T0GEN -check-prefix=ALL %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=T100GEN -check-prefix=ALL %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-templates.proftext -o %t.profdata
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-use=%t.profdata > %tuse
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=T0USE -check-prefix=ALL %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=T100USE -check-prefix=ALL %s
 
diff --git a/test/Profile/cxx-throws.cpp b/test/Profile/cxx-throws.cpp
index 6b33416..ef56c8b 100644
--- a/test/Profile/cxx-throws.cpp
+++ b/test/Profile/cxx-throws.cpp
@@ -3,12 +3,12 @@
 // FIXME: Don't seek bb labels, like "if.else"
 // REQUIRES: asserts
 
-// RUN: %clangxx %s -o - -emit-llvm -S -fprofile-instr-generate -fexceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOGEN %s
-// RUN: %clangxx %s -o - -emit-llvm -S -fprofile-instr-generate -fexceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOGEN-EXC %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOGEN %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOGEN-EXC %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-throws.proftext -o %t.profdata
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-use=%t.profdata -fcxx-exceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOUSE %s
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-use=%t.profdata -fcxx-exceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOUSE-EXC %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOUSE-EXC %s
 
 // PGOGEN: @[[THC:__profc__Z6throwsv]] = private global [9 x i64] zeroinitializer
 // PGOGEN-EXC: @[[THC:__profc__Z6throwsv]] = private global [9 x i64] zeroinitializer
diff --git a/test/Profile/cxx-virtual-destructor-calls.cpp b/test/Profile/cxx-virtual-destructor-calls.cpp
index 4affd26..cc3df68 100644
--- a/test/Profile/cxx-virtual-destructor-calls.cpp
+++ b/test/Profile/cxx-virtual-destructor-calls.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -main-file-name cxx-virtual-destructor-calls.cpp %s -o - -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -main-file-name cxx-virtual-destructor-calls.cpp %s -o - -fprofile-instrument=clang | FileCheck %s
 
 struct Member {
   ~Member();
diff --git a/test/Profile/def-assignop.cpp b/test/Profile/def-assignop.cpp
index 0a36ff8..2d45336 100644
--- a/test/Profile/def-assignop.cpp
+++ b/test/Profile/def-assignop.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-assignop.cpp -o - -emit-llvm -fprofile-instr-generate | FileCheck --check-prefix=PGOGEN %s
-// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-assignop.cpp -o - -emit-llvm -fprofile-instr-generate -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-assignop.cpp -o - -emit-llvm -fprofile-instrument=clang | FileCheck --check-prefix=PGOGEN %s
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-assignop.cpp -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
 
 struct B {
   B& operator=(const B &b);
@@ -24,9 +24,8 @@
   B b;
 };
 
-int main() {
-  A a1, a2;
+A a1, a2;
+void foo() {
   a1 = a2;
   a2 = static_cast<A &&>(a1);
-  return 0;
 }
diff --git a/test/Profile/def-ctors.cpp b/test/Profile/def-ctors.cpp
new file mode 100644
index 0000000..1b52d55
--- /dev/null
+++ b/test/Profile/def-ctors.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu  -main-file-name def-ctors.cpp -o - -emit-llvm -fprofile-instrument=clang |  FileCheck --check-prefix=PGOGEN %s
+
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-ctors.cpp -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
+
+struct Base {
+  int B;
+  Base() : B(2) {}
+  Base(const struct Base &b2) {}
+};
+
+struct Derived : public Base {
+  Derived(const Derived &) = default;
+  // PGOGEN-DAG: define {{.*}}@_ZN7DerivedC2ERKS_
+  // PGOGEN-DAG: %pgocount = load {{.*}} @__profc__ZN7DerivedC2ERKS_
+  // PGOGEN-DAG: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN-DAG: store{{.*}}@__profc__ZN7DerivedC2ERKS_
+  Derived() = default;
+  // PGOGEN-DAG: define {{.*}}@_ZN7DerivedC2Ev
+  // PGOGEN-DAG: %pgocount = load {{.*}} @__profc__ZN7DerivedC2Ev
+  // PGOGEN-DAG: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN-DAG: store{{.*}}@__profc__ZN7DerivedC2Ev
+
+  // Check that coverage mapping has 5 function records including
+  // the defaulted Derived::Derived(const Derived &), and Derived::Derived()
+  // methods.
+  // COVMAP: @__llvm_coverage_mapping = {{.*}} { { i32, i32, i32, i32 }, [5 x
+  // <{{.*}}>],
+};
+
+Derived dd;
+int g;
+int main() {
+  Derived dd2(dd);
+  g = dd2.B;
+  return 0;
+}
diff --git a/test/Profile/def-dtors.cpp b/test/Profile/def-dtors.cpp
new file mode 100644
index 0000000..bfa5356
--- /dev/null
+++ b/test/Profile/def-dtors.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-dtors.cpp -o - -emit-llvm -fprofile-instrument=clang  | FileCheck --check-prefix=PGOGEN %s
+
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-dtors.cpp -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
+
+struct Base {
+  int B;
+  Base(int B_) : B(B_) {}
+  ~Base() {}
+};
+
+struct Derived : public Base {
+  Derived(int K) : Base(K) {}
+  ~Derived() = default;
+  // PGOGEN-LABEL: define {{.*}}@_ZN7DerivedD2Ev
+  // PGOGEN: %pgocount = load {{.*}} @__profc__ZN7DerivedD2Ev
+  // PGOGEN: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN: store{{.*}}@__profc__ZN7DerivedD2Ev
+
+  // Check that coverage mapping has 5 function records including
+  // the default destructor in the derived class.
+  // COVMAP: @__llvm_coverage_mapping = {{.*}} { { i32, i32, i32, i32 }, [5 x
+  // <{{.*}}>],
+};
+
+int main() {
+  Derived dd2(10);
+  if (dd2.B != 10)
+    return 1;
+  return 0;
+}
diff --git a/test/Profile/func-entry.c b/test/Profile/func-entry.c
index 1ecae60..430ccb3 100644
--- a/test/Profile/func-entry.c
+++ b/test/Profile/func-entry.c
@@ -1,7 +1,7 @@
 // Test that function entry counts are set correctly.
 
 // RUN: llvm-profdata merge %S/Inputs/func-entry.proftext -o %t.profdata
-// RUN: %clang %s -o - -mllvm -disable-llvm-optzns -emit-llvm -S -fprofile-instr-use=%t.profdata | FileCheck %s
+// RUN: %clang_cc1 %s -o - -disable-llvm-optzns -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck %s
 
 void foo(void);
 
diff --git a/test/Profile/gcc-flag-compatibility.c b/test/Profile/gcc-flag-compatibility.c
index 679a722..0376b0b 100644
--- a/test/Profile/gcc-flag-compatibility.c
+++ b/test/Profile/gcc-flag-compatibility.c
@@ -7,16 +7,12 @@
 // -fprofile-use=<dir>        Uses the profile file <dir>/default.profdata
 // -fprofile-use=<dir>/file   Uses the profile file <dir>/file
 
-// Check that -fprofile-generate uses the runtime default profile file.
 // RUN: %clang %s -c -S -o - -emit-llvm -fprofile-generate | FileCheck -check-prefix=PROFILE-GEN %s
-// PROFILE-GEN-NOT: call void @__llvm_profile_override_default_filename
-// PROFILE-GEN-NOT: declare void @__llvm_profile_override_default_filename(i8*)
+// PROFILE-GEN: __llvm_profile_filename
 
 // Check that -fprofile-generate=/path/to generates /path/to/default.profraw
 // RUN: %clang %s -c -S -o - -emit-llvm -fprofile-generate=/path/to | FileCheck -check-prefix=PROFILE-GEN-EQ %s
-// PROFILE-GEN-EQ: private constant [25 x i8] c"/path/to{{/|\\5C}}default.profraw\00"
-// PROFILE-GEN-EQ: call void @__llvm_profile_override_default_filename(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @0, i32 0, i32 0))
-// PROFILE-GEN-EQ: declare void @__llvm_profile_override_default_filename(i8*)
+// PROFILE-GEN-EQ: constant [{{.*}} x i8] c"/path/to{{/|\\5C}}{{.*}}\00"
 
 // Check that -fprofile-use=some/path reads some/path/default.profdata
 // RUN: rm -rf %t.dir
diff --git a/test/Profile/max-function-count.c b/test/Profile/max-function-count.c
deleted file mode 100644
index 39490d7..0000000
--- a/test/Profile/max-function-count.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// Test that maximum function counts are set correctly.
-
-// RUN: llvm-profdata merge %S/Inputs/max-function-count.proftext -o %t.profdata
-// RUN: %clang %s -o - -mllvm -disable-llvm-optzns -emit-llvm -S -fprofile-instr-use=%t.profdata | FileCheck %s
-//
-int begin(int i) {
-  if (i)
-    return 0;
-  return 1;
-}
-
-int end(int i) {
-  if (i)
-    return 0;
-  return 1;
-}
-
-int main(int argc, const char *argv[]) {
-  begin(0);
-  end(1);
-  end(1);
-  return 0;
-}
-// CHECK: !{{[0-9]+}} = !{i32 1, !"MaxFunctionCount", i32 2}
diff --git a/test/Profile/objc-general.m b/test/Profile/objc-general.m
index b6435af..b679627 100644
--- a/test/Profile/objc-general.m
+++ b/test/Profile/objc-general.m
@@ -1,9 +1,9 @@
 // Test instrumentation of general constructs in objective C.
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instr-generate | FileCheck -check-prefix=PGOGEN %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instrument=clang | FileCheck -check-prefix=PGOGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/objc-general.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE %s
 
 #ifdef HAVE_FOUNDATION
 
diff --git a/test/Profile/profile-does-not-exist.c b/test/Profile/profile-does-not-exist.c
index d45981f..5725f76 100644
--- a/test/Profile/profile-does-not-exist.c
+++ b/test/Profile/profile-does-not-exist.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -emit-llvm %s -o - -fprofile-instr-use=%t.nonexistent.profdata 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -emit-llvm %s -o - -fprofile-instrument-use-path=%t.nonexistent.profdata 2>&1 | FileCheck %s
 
 // CHECK: error: Could not read profile {{.*}}.nonexistent.profdata:
 // CHECK-NOT: Assertion failed
diff --git a/test/Profile/profile-summary.c b/test/Profile/profile-summary.c
new file mode 100644
index 0000000..dc3112c
--- /dev/null
+++ b/test/Profile/profile-summary.c
@@ -0,0 +1,25 @@
+// Test that profile summary is set correctly.
+
+// RUN: llvm-profdata merge %S/Inputs/max-function-count.proftext -o %t.profdata
+// RUN: %clang_cc1 %s -o - -disable-llvm-optzns -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck %s
+//
+int begin(int i) {
+  if (i)
+    return 0;
+  return 1;
+}
+
+int end(int i) {
+  if (i)
+    return 0;
+  return 1;
+}
+
+int main(int argc, const char *argv[]) {
+  begin(0);
+  end(1);
+  end(1);
+  return 0;
+}
+// CHECK: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
+// CHECK: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
diff --git a/test/Sema/128bitfloat.cpp b/test/Sema/128bitfloat.cpp
index cb76dac..2449cb6 100644
--- a/test/Sema/128bitfloat.cpp
+++ b/test/Sema/128bitfloat.cpp
@@ -1,24 +1,35 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
-#if !defined(__STRICT_ANSI__)
-__float128 f;  // expected-error {{support for type '__float128' is not yet implemented}}
-// But this should work:
+#ifdef __FLOAT128__
+__float128 f;
 template<typename> struct __is_floating_point_helper {};
 template<> struct __is_floating_point_helper<__float128> {};
+int g(int x, __float128 *y) {
+  return x + *y;
+}
+
+// expected-no-diagnostics
+#else
+#if !defined(__STRICT_ANSI__)
+__float128 f;  // expected-error {{__float128 is not supported on this target}}
+// Later uses of the unsupported type are diagnosed as well:
+template<typename> struct __is_floating_point_helper {};
+template<> struct __is_floating_point_helper<__float128> {};  // expected-error {{__float128 is not supported on this target}}
 
 // FIXME: This could have a better diag.
-void g(int x, __float128 *y) {
-  x + *y;  // expected-error {{invalid operands to binary expression ('int' and '__float128')}}
+int g(int x, __float128 *y) {  // expected-error {{__float128 is not supported on this target}}
+  return x + *y;
 }
 
 #else
-__float128 f;  // expected-error {{unknown type name '__float128'}}
+__float128 f;  // expected-error {{__float128 is not supported on this target}}
 template<typename> struct __is_floating_point_helper {};
-template<> struct __is_floating_point_helper<__float128> {};  // expected-error {{use of undeclared identifier '__float128'}}
+template<> struct __is_floating_point_helper<__float128> {};  // expected-error {{__float128 is not supported on this target}}
 
-void g(int x, __float128 *y) {  // expected-error {{unknown type name '__float128'}}
-  x + *y;
+int g(int x, __float128 *y) {  // expected-error {{__float128 is not supported on this target}}
+  return x + *y;
 }
 
 #endif
+#endif
diff --git a/test/Sema/MicrosoftExtensions.c b/test/Sema/MicrosoftExtensions.c
index e703230..62e5285 100644
--- a/test/Sema/MicrosoftExtensions.c
+++ b/test/Sema/MicrosoftExtensions.c
@@ -6,6 +6,12 @@
    int a[];  /* expected-warning {{flexible array member 'a' in otherwise empty struct is a Microsoft extension}} */
 };
 
+struct PR28407
+{
+  int : 1;
+  int a[]; /* expected-warning {{flexible array member 'a' in otherwise empty struct is a Microsoft extension}} */
+};
+
 struct C {
    int l;
    union {
@@ -170,3 +176,13 @@
     __va_start(ap, f); // expected-warning {{incompatible pointer types passing 'my_va_list'}}
   }
 }
+
+// __unaligned handling
+void test_unaligned() {
+  __unaligned int *p1 = 0;
+  int *p2 = p1; // expected-warning {{initializing 'int *' with an expression of type '__unaligned int *' discards qualifiers}}
+  __unaligned int *p3 = p2;
+}
+
+void test_unaligned2(int x[__unaligned 4]) {}
+
diff --git a/test/Sema/address_spaces.c b/test/Sema/address_spaces.c
index 1922c8a..3fe9315 100644
--- a/test/Sema/address_spaces.c
+++ b/test/Sema/address_spaces.c
@@ -20,7 +20,7 @@
   _AS1 int arrarr[5][5]; // expected-error {{automatic variable qualified with an address space}}
 
   __attribute__((address_space(-1))) int *_boundsA; // expected-error {{address space is negative}}
-  __attribute__((address_space(0xFFFFFF))) int *_boundsB;
+  __attribute__((address_space(0x7FFFFF))) int *_boundsB;
   __attribute__((address_space(0x1000000))) int *_boundsC; // expected-error {{address space is larger than the maximum supported}}
   // chosen specifically to overflow 32 bits and come out reasonable
   __attribute__((address_space(4294967500))) int *_boundsD; // expected-error {{address space is larger than the maximum supported}}
@@ -71,4 +71,4 @@
 // Clang extension doesn't forbid operations on pointers to different address spaces.
 char* cmp(_AS1 char *x,  _AS2 char *y) {
   return x < y ? x : y; // expected-warning {{pointer type mismatch ('__attribute__((address_space(1))) char *' and '__attribute__((address_space(2))) char *')}}
-}
\ No newline at end of file
+}
diff --git a/test/Sema/arm-no-fp16.c b/test/Sema/arm-no-fp16.c
new file mode 100644
index 0000000..6443d83
--- /dev/null
+++ b/test/Sema/arm-no-fp16.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple thumbv7-none-eabi %s -target-feature +neon -target-feature -fp16 -fsyntax-only -verify
+
+#include <arm_neon.h>
+
+float16x4_t test_vcvt_f16_f32(float32x4_t a) {
+  return vcvt_f16_f32(a); // expected-warning{{implicit declaration of function 'vcvt_f16_f32'}}  expected-error{{returning 'int' from a function with incompatible result type 'float16x4_t'}}
+}
+
+float32x4_t test_vcvt_f32_f16(float16x4_t a) {
+  return vcvt_f32_f16(a); // expected-warning{{implicit declaration of function 'vcvt_f32_f16'}} expected-error{{returning 'int' from a function with incompatible result type 'float32x4_t'}}
+}
diff --git a/test/Sema/arm64-neon-header.c b/test/Sema/arm64-neon-header.c
new file mode 100644
index 0000000..0ae0821
--- /dev/null
+++ b/test/Sema/arm64-neon-header.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon -Wvector-conversion -fsyntax-only -ffreestanding -verify %s
+
+#include <arm_neon.h>
+
+int16x8_t foo(int8x8_t p0, int16x8_t p1) {
+  return vqmovun_high_s16(p0, p1); // expected-warning {{incompatible vector types returning 'uint8x16_t'}}
+}
diff --git a/test/Sema/arm_vfma.c b/test/Sema/arm_vfma.c
index c50a414..8c08b4d 100644
--- a/test/Sema/arm_vfma.c
+++ b/test/Sema/arm_vfma.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple thumbv7s-apple-ios7.0 -target-feature +neon -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple thumbv7-none-eabi -target-feature +neon -target-feature +vfp4 -fsyntax-only -verify %s
 #include <arm_neon.h>
 
 // expected-no-diagnostics
diff --git a/test/Sema/asm.c b/test/Sema/asm.c
index d29b136..69c33f7 100644
--- a/test/Sema/asm.c
+++ b/test/Sema/asm.c
@@ -25,7 +25,7 @@
   asm ("nop" : : : "0", "%0", "#0");
   asm ("nop" : : : "foo"); // expected-error {{unknown register name 'foo' in asm}}
   asm ("nop" : : : "52");
-  asm ("nop" : : : "104"); // expected-error {{unknown register name '104' in asm}}
+  asm ("nop" : : : "204"); // expected-error {{unknown register name '204' in asm}}
   asm ("nop" : : : "-1"); // expected-error {{unknown register name '-1' in asm}}
   asm ("nop" : : : "+1"); // expected-error {{unknown register name '+1' in asm}}
 }
diff --git a/test/Sema/ast-print.c b/test/Sema/ast-print.c
index b4d7684..4c0aef5 100644
--- a/test/Sema/ast-print.c
+++ b/test/Sema/ast-print.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 %s -ast-print | %clang_cc1 -fsyntax-only -
 
 typedef void func_typedef();
 func_typedef xxx;
@@ -39,6 +40,7 @@
   return a[2];
 }
 
+// CHECK: typedef struct {
 typedef struct {
   int f;
 } T __attribute__ ((__aligned__));
@@ -53,3 +55,13 @@
 
 // CHECK: struct pair_t p = {a: 3, .b = 4};
 struct pair_t p = {a: 3, .b = 4};
+
+void initializers() {
+  // CHECK: int *x = ((void *)0), *y = ((void *)0);
+  int *x = ((void *)0), *y = ((void *)0);
+  struct Z{};
+  struct {
+    struct Z z;
+  // CHECK: } z = {(struct Z){}};
+  } z = {(struct Z){}};
+}
diff --git a/test/Sema/atomic-ops.c b/test/Sema/atomic-ops.c
index 9a37ec2..0583621 100644
--- a/test/Sema/atomic-ops.c
+++ b/test/Sema/atomic-ops.c
@@ -121,7 +121,10 @@
   __atomic_load(I, *P, memory_order_relaxed, 42); // expected-error {{too many arguments}}
   (int)__atomic_load(I, I, memory_order_seq_cst); // expected-error {{operand of type 'void'}}
   __atomic_load(s1, s2, memory_order_acquire);
-  (void)__atomic_load(I, CI, memory_order_relaxed); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}}
+  __atomic_load(CI, I, memory_order_relaxed);
+  __atomic_load(I, CI, memory_order_relaxed); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}}
+  __atomic_load(CI, CI, memory_order_relaxed); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}}
+
   __c11_atomic_store(i, 1, memory_order_seq_cst);
   __c11_atomic_store(p, 1, memory_order_seq_cst); // expected-warning {{incompatible integer to pointer conversion}}
   (int)__c11_atomic_store(d, 1, memory_order_seq_cst); // expected-error {{operand of type 'void'}}
diff --git a/test/Sema/attr-alias-elf.c b/test/Sema/attr-alias-elf.c
index f14514d..e56f23e 100644
--- a/test/Sema/attr-alias-elf.c
+++ b/test/Sema/attr-alias-elf.c
@@ -55,7 +55,7 @@
 
 void test2_bar() {}
 void test2_foo() __attribute__((weak, alias("test2_bar")));
-void test2_zed() __attribute__((alias("test2_foo"))); // expected-warning {{alias will always resolve to test2_bar even if weak definition of alias test2_foo is overridden}}
+void test2_zed() __attribute__((alias("test2_foo"))); // expected-warning {{alias will always resolve to test2_bar even if weak definition of test2_foo is overridden}}
 
 void test3_bar() { }
 void test3_foo() __attribute__((section("test"))); // expected-warning {{alias will not be in section 'test' but in the same section as the aliasee}}
diff --git a/test/Sema/attr-availability-swift.c b/test/Sema/attr-availability-swift.c
new file mode 100644
index 0000000..42e7524
--- /dev/null
+++ b/test/Sema/attr-availability-swift.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fsyntax-only -fblocks -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fsyntax-only -ast-dump %s | FileCheck %s
+//
+
+#if !__has_feature(attribute_availability_with_message)
+# error "Missing __has_feature"
+#endif
+
+#if __has_feature(attribute_availability_swift)
+# warning "okay"
+// expected-warning@-1{{okay}}
+#else
+# error "Missing __has_feature"
+#endif
+
+extern int noSwiftGlobal1 __attribute__((availability(swift, unavailable)));
+// CHECK: AvailabilityAttr {{.*}}swift 0 0 0 Unavailable "" ""
+extern int noSwiftGlobal1 __attribute__((availability(macosx, introduced=10.1))); // okay
+// CHECK: AvailabilityAttr {{.*}}macos 10.1 0 0 "" ""
+// CHECK: AvailabilityAttr {{.*}}Inherited swift 0 0 0 Unavailable "" ""
+extern int noSwiftGlobal1 __attribute__((availability(swift, unavailable, message="and this one has a message"))); // okay
+// CHECK: AvailabilityAttr {{.*}}swift 0 0 0 Unavailable "and this one has a message" ""
+// CHECK: AvailabilityAttr {{.*}}Inherited macos 10.1 0 0 "" ""
+extern int noSwiftGlobal2 __attribute__((availability(swift, introduced=5))); // expected-warning{{only 'unavailable' and 'deprecated' are supported for Swift availability}}
+// CHECK: VarDecl
+// CHECK-NOT: AvailabilityAttr
+extern int noSwiftGlobal3 __attribute__((availability(swift, deprecated, message="t")));
+// CHECK: VarDecl
+// CHECK: AvailabilityAttr {{.*}}swift 0 1 0 "t" ""
diff --git a/test/Sema/attr-availability.c b/test/Sema/attr-availability.c
index 6b1b3a6..e7a800f 100644
--- a/test/Sema/attr-availability.c
+++ b/test/Sema/attr-availability.c
@@ -30,7 +30,7 @@
   ATSFontGetPostScriptName(100); // expected-error {{'ATSFontGetPostScriptName' is unavailable: obsoleted in macOS 9.0 - use ATSFontGetFullPostScriptName}}
 
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'PartiallyAvailable' to silence this warning}}
+  // expected-warning@+2 {{is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'PartiallyAvailable' in an @available check to silence this warning}}
 #endif
   PartiallyAvailable();
 }
@@ -80,22 +80,6 @@
 extern int x2 __attribute__((availability(macosx,introduced=10.2))); // expected-note {{previous attribute is here}}
 extern int x2 __attribute__((availability(macosx,introduced=10.5))); // expected-warning {{availability does not match previous declaration}}
 
-
-
-#if __has_feature(attribute_availability_swift)
-# warning "okay"
-// expected-warning@-1{{okay}}
-#else
-# error "Missing __has_feature"
-#endif
-
-
-extern int noSwiftGlobal1 __attribute__((availability(swift, unavailable)));
-extern int noSwiftGlobal1 __attribute__((availability(macosx, introduced=10.1))); // okay
-extern int noSwiftGlobal1 __attribute__((availability(swift, unavailable, message="and this one has a message"))); // okay
-
-extern int noSwiftGlobal2 __attribute__((availability(swift, introduced=5))); // expected-warning{{only 'unavailable' is supported for Swift availability}}
-
 enum Original {
   OriginalDeprecated __attribute__((availability(macosx, deprecated=10.2))), // expected-note + {{'OriginalDeprecated' has been explicitly marked deprecated here}}
   OriginalUnavailable __attribute__((availability(macosx, unavailable))) // expected-note + {{'OriginalUnavailable' has been explicitly marked unavailable here}}
diff --git a/test/Sema/attr-deprecated.c b/test/Sema/attr-deprecated.c
index 2e3e722..8566a0e 100644
--- a/test/Sema/attr-deprecated.c
+++ b/test/Sema/attr-deprecated.c
@@ -122,5 +122,10 @@
 };
 
 typedef int test23_ty __attribute((deprecated));
+// Redefining a typedef is a C11 feature.
+#if __STDC_VERSION__ <= 199901L
+// expected-note@-3 {{'test23_ty' has been explicitly marked deprecated here}}
+#else
 typedef int test23_ty; // expected-note {{'test23_ty' has been explicitly marked deprecated here}}
+#endif
 test23_ty test23_v; // expected-warning {{'test23_ty' is deprecated}}
diff --git a/test/Sema/attr-ifunc.c b/test/Sema/attr-ifunc.c
new file mode 100644
index 0000000..d177b71
--- /dev/null
+++ b/test/Sema/attr-ifunc.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple x86_64-windows -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only -DCHECK_ALIASES %s
+// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only %s
+
+#if defined(_WIN32)
+void foo() {}
+void bar() __attribute__((ifunc("foo")));
+//expected-warning@-1 {{'ifunc' attribute ignored}}
+
+#else
+#if defined(CHECK_ALIASES)
+void* f1_ifunc();
+void f1() __attribute__((ifunc("f1_ifunc")));
+//expected-error@-1 {{ifunc must point to a defined function}}
+
+void* f2_a() __attribute__((ifunc("f2_b")));
+//expected-error@-1 {{ifunc definition is part of a cycle}}
+void* f2_b() __attribute__((ifunc("f2_a")));
+//expected-error@-1 {{ifunc definition is part of a cycle}}
+
+void* f3_a() __attribute__((ifunc("f3_b")));
+//expected-warning@-1 {{ifunc will always resolve to f3_c even if weak definition of f3_b is overridden}}
+void* f3_b() __attribute__((weak, alias("f3_c")));
+void* f3_c() { return 0; }
+
+void f4_ifunc() {}
+void f4() __attribute__((ifunc("f4_ifunc")));
+//expected-error@-1 {{ifunc resolver function must return a pointer}}
+
+void* f5_ifunc(int i) { return 0; }
+void f5() __attribute__((ifunc("f5_ifunc")));
+//expected-error@-1 {{ifunc resolver function must have no parameters}}
+
+#else
+void f1a() __asm("f1");
+void f1a() {}
+//expected-note@-1 {{previous definition is here}}
+void f1() __attribute__((ifunc("f1_ifunc")));
+//expected-error@-1 {{definition with same mangled name as another definition}}
+void* f1_ifunc() { return 0; }
+
+#endif
+#endif
diff --git a/test/Sema/attr-mode-enums.c b/test/Sema/attr-mode-enums.c
new file mode 100644
index 0000000..4b98c3b
--- /dev/null
+++ b/test/Sema/attr-mode-enums.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// This test checks that the 'mode' attribute is handled correctly with enums, i.e. code
+//   1. "typedef enum { A } __attribute__((mode(HI))) T;" is accepted,
+//   2. "enum X __attribute__((mode(QI))) var;" forms a complete integer type,
+//   3. "enum { A } __attribute__((mode(V4SI))) var;" is not accepted (vector mode).
+
+typedef enum { E4 } EnumType;
+
+int main() {
+  // Vector modes are not allowed with enums.
+  typedef enum { E1 } __attribute__((mode(V4QI))) RejectedType1; // expected-error{{mode 'V4QI' is not supported for enumeration types}}
+  // expected-warning@-1{{specifying vector types with the 'mode' attribute is deprecated}}
+  typedef enum __attribute__((mode(V8HI))) { E2 } RejectedType2; // expected-error{{mode 'V8HI' is not supported for enumeration types}}
+                                                                 // expected-warning@-1{{deprecated}}
+  typedef enum E3 __attribute__((mode(V2SI))) RejectedType3; // expected-error{{mode 'V2SI' is not supported for enumeration types}}
+                                                             // expected-warning@-1{{deprecated}}
+  typedef EnumType __attribute__((mode(V4DI))) RejectedType4; // expected-error{{mode 'V4DI' is not supported for enumeration types}}
+                                                              // expected-warning@-1{{deprecated}}
+  EnumType v1 __attribute__((mode(V4QI))); // expected-error{{mode 'V4QI' is not supported for enumeration types}}
+                                           // expected-warning@-1{{deprecated}}
+  enum __attribute__((mode(V8HI))) { E5 } v2; // expected-error{{mode 'V8HI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Incomplete enums without mode attribute are not allowed.
+  typedef enum Y IncompleteYType; // expected-note{{forward declaration of 'enum Y'}}
+
+  enum X a1; // expected-error{{variable has incomplete type 'enum X'}}
+             // expected-note@-1{{forward declaration of 'enum X'}}
+  IncompleteYType a2; // expected-error{{variable has incomplete type 'IncompleteYType' (aka 'enum Y')}}
+
+  // OK with 'mode' attribute.
+  typedef enum Y __attribute__((mode(QI))) CompleteYType1;
+  typedef enum Y CompleteYType2 __attribute__((mode(HI)));
+  typedef enum { A1, B1 } __attribute__((mode(QI))) CompleteType3;
+  typedef enum { A2, B2 } CompleteType4 __attribute__((mode(QI)));
+  typedef enum __attribute__((mode(QI))) { A3, B3 } CompleteType5;
+
+  enum X __attribute__((mode(QI))) a3;
+  enum X a4 __attribute__((mode(HI)));
+  IncompleteYType __attribute__((mode(QI))) a5;
+  IncompleteYType a6 __attribute__((mode(HI)));
+  CompleteYType1 a7;
+  CompleteYType2 a8;
+  CompleteType3 a9;
+  CompleteType4 a10;
+  CompleteType5 a11;
+  enum __attribute__((mode(QI))) { A4, B4 } a12;
+
+  return 0;
+}
diff --git a/test/Sema/attr-mode.c b/test/Sema/attr-mode.c
index 179b181..e160d8d 100644
--- a/test/Sema/attr-mode.c
+++ b/test/Sema/attr-mode.c
@@ -4,6 +4,8 @@
 // RUN:   -verify %s
 // RUN: %clang_cc1 -triple powerpc64-pc-linux-gnu -DTEST_64BIT_PPC64 -fsyntax-only \
 // RUN:   -verify %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnux32 -DTEST_64BIT_X86 -fsyntax-only \
+// RUN:   -verify %s
 
 typedef int i16_1 __attribute((mode(HI)));
 int i16_1_test[sizeof(i16_1) == 2 ? 1 : -1];
@@ -24,8 +26,8 @@
 
 int **__attribute((mode(QI)))* i32;  // expected-error{{mode attribute}}
 
-__attribute__((mode(QI))) int invalid_func() { return 1; } // expected-error{{'mode' attribute only applies to variables, fields and typedefs}}
-enum invalid_enum { A1 __attribute__((mode(QI))) }; // expected-error{{'mode' attribute only applies to variables, fields and typedefs}}
+__attribute__((mode(QI))) int invalid_func() { return 1; } // expected-error{{'mode' attribute only applies to variables, enums, fields and typedefs}}
+enum invalid_enum { A1 __attribute__((mode(QI))) }; // expected-error{{'mode' attribute only applies to variables, enums, fields and typedefs}}
 
 typedef _Complex double c32 __attribute((mode(SC)));
 int c32_test[sizeof(c32) == 8 ? 1 : -1];
@@ -63,9 +65,18 @@
 void test_long_to_i64(long long* y) { f_i64_arg(y); }
 void test_long_to_ui64(unsigned long long* y) { f_ui64_arg(y); }
 #elif TEST_64BIT_X86
+#ifdef __ILP32__
+typedef unsigned int gcc_word __attribute__((mode(word)));
+int foo[sizeof(gcc_word) == 8 ? 1 : -1];
+typedef unsigned int gcc_unwind_word __attribute__((mode(unwind_word)));
+int foo[sizeof(gcc_unwind_word) == 8 ? 1 : -1];
+void test_long_to_i64(long long* y) { f_i64_arg(y); }
+void test_long_to_ui64(unsigned long long* y) { f_ui64_arg(y); }
+#else
 void test_long_to_i64(long* y) { f_i64_arg(y); }
 void test_long_to_ui64(unsigned long* y) { f_ui64_arg(y); }
-typedef          float f128ibm __attribute__ ((mode (TF)));     // expected-error{{unsupported machine mode 'TF'}}
+#endif
+typedef          float f128ibm __attribute__ ((mode (TF)));
 #elif TEST_64BIT_PPC64
 typedef          float f128ibm __attribute__ ((mode (TF)));
 typedef _Complex float c128ibm __attribute__ ((mode (TC)));
diff --git a/test/Sema/attr-nodebug.c b/test/Sema/attr-nodebug.c
index 03ec49b..e7ca58d 100644
--- a/test/Sema/attr-nodebug.c
+++ b/test/Sema/attr-nodebug.c
@@ -2,8 +2,8 @@
 
 int a __attribute__((nodebug));
 
-void b() {
-  int b __attribute__((nodebug)); // expected-warning {{'nodebug' only applies to variables with static storage duration and functions}}
+void b(int p __attribute__((nodebug))) { // expected-warning {{'nodebug' attribute only applies to variables and functions}}
+  int b __attribute__((nodebug));
 }
 
 void t1() __attribute__((nodebug));
diff --git a/test/Sema/attr-print.c b/test/Sema/attr-print.c
index b3bdfd7..7ffbbb8 100644
--- a/test/Sema/attr-print.c
+++ b/test/Sema/attr-print.c
@@ -32,3 +32,6 @@
 
 // CHECK: int * __sptr * __ptr32 ppsp32;
 int * __sptr * __ptr32 ppsp32;
+
+// CHECK: __attribute__((availability(macos, strict, introduced=10.6)));
+void f6(int) __attribute__((availability(macosx,strict,introduced=10.6)));
diff --git a/test/Sema/bitfield-layout.c b/test/Sema/bitfield-layout.c
index b96b386..079720c 100644
--- a/test/Sema/bitfield-layout.c
+++ b/test/Sema/bitfield-layout.c
@@ -1,6 +1,8 @@
 // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=i686-apple-darwin9
 // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=arm-linux-gnueabihf
 // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=aarch64-linux-gnu
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-pc-linux-gnu
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-scei-ps4
 // expected-no-diagnostics
 #include <stddef.h>
 
@@ -95,9 +97,15 @@
   char c;
 };
 
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g0, 16);
+CHECK_ALIGN(struct, g0, 16);
+CHECK_OFFSET(struct, g0, c, 2);
+#else
 CHECK_SIZE(struct, g0, 32);
 CHECK_ALIGN(struct, g0, 16);
 CHECK_OFFSET(struct, g0, c, 17);
+#endif
 
 // Bit-field with explicit align smaller than normal.
 struct g1 {
@@ -108,7 +116,11 @@
 
 CHECK_SIZE(struct, g1, 4);
 CHECK_ALIGN(struct, g1, 4);
+#if defined(__ORBIS__)
+CHECK_OFFSET(struct, g1, c, 2);
+#else
 CHECK_OFFSET(struct, g1, c, 3);
+#endif
 
 // Same as above but without explicit align.
 struct g2 {
@@ -129,9 +141,14 @@
   char c;
 };
 
-CHECK_SIZE(struct, g3, 32);
 CHECK_ALIGN(struct, g3, 16);
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g3, 16);
+CHECK_OFFSET(struct, g3, c, 2);
+#else
+CHECK_SIZE(struct, g3, 32);
 CHECK_OFFSET(struct, g3, c, 17);
+#endif
 
 struct __attribute__((packed)) g4 {
   char a;
@@ -141,7 +158,11 @@
 
 CHECK_SIZE(struct, g4, 4);
 CHECK_ALIGN(struct, g4, 2);
+#if defined(__ORBIS__)
+CHECK_OFFSET(struct, g4, c, 2);
+#else
 CHECK_OFFSET(struct, g4, c, 3);
+#endif
 
 struct g5 {
   char : 1;
@@ -161,28 +182,44 @@
   char : 1;
   __attribute__((aligned(1))) int n : 25;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g7, 4);
+#else
 CHECK_SIZE(struct, g7, 8);
+#endif
 CHECK_ALIGN(struct, g7, 4);
 
 struct __attribute__((packed)) g8 {
   char : 1;
   __attribute__((aligned(1))) int n : 25;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g8, 4);
+#else
 CHECK_SIZE(struct, g8, 5);
+#endif
 CHECK_ALIGN(struct, g8, 1);
 
 struct g9 {
   __attribute__((aligned(1))) char a : 2, b : 2, c : 2, d : 2, e : 2;
   int i;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g9, 8);
+#else
 CHECK_SIZE(struct, g9, 12);
+#endif
 CHECK_ALIGN(struct, g9, 4);
 
 struct __attribute__((packed)) g10 {
   __attribute__((aligned(1))) char a : 2, b : 2, c : 2, d : 2, e : 2;
   int i;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g10, 6);
+#else
 CHECK_SIZE(struct, g10, 9);
+#endif
 CHECK_ALIGN(struct, g10, 1);
 
 struct g11 {
@@ -190,7 +227,7 @@
   __attribute__((aligned(1))) long long b : 62;
   char c;
 };
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
 CHECK_SIZE(struct, g11, 24);
 CHECK_ALIGN(struct, g11, 8);
 CHECK_OFFSET(struct, g11, c, 16);
@@ -218,6 +255,10 @@
 CHECK_SIZE(struct, g13, 16);
 CHECK_ALIGN(struct, g13, 8);
 CHECK_OFFSET(struct, g13, c, 8);
+#elif defined(__x86_64__)
+CHECK_SIZE(struct, g13, 9);
+CHECK_ALIGN(struct, g13, 1);
+CHECK_OFFSET(struct, g13, c, 8);
 #else
 CHECK_SIZE(struct, g13, 5);
 CHECK_ALIGN(struct, g13, 1);
@@ -233,6 +274,10 @@
 CHECK_SIZE(struct, g14, 16);
 CHECK_ALIGN(struct, g14, 8);
 CHECK_OFFSET(struct, g14, c, 8);
+#elif defined(__x86_64__)
+CHECK_SIZE(struct, g14, 9);
+CHECK_ALIGN(struct, g14, 1);
+CHECK_OFFSET(struct, g14, c, 8);
 #else
 CHECK_SIZE(struct, g14, 5);
 CHECK_ALIGN(struct, g14, 1);
diff --git a/test/Sema/bitfield-layout_1.c b/test/Sema/bitfield-layout_1.c
new file mode 100644
index 0000000..24277c3
--- /dev/null
+++ b/test/Sema/bitfield-layout_1.c
@@ -0,0 +1,202 @@
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=i686-apple-darwin9
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=arm-linux-gnueabihf
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=aarch64-linux-gnu
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-pc-linux-gnu
+// expected-no-diagnostics
+
+#define CHECK_SIZE(name, size) \
+  extern int name##_1[sizeof(name) == size ? 1 : -1];
+
+
+struct  __attribute__((packed)) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s0;
+CHECK_SIZE(s0,9)
+
+#pragma pack (1)
+struct {
+  int a;
+  int b : 4;
+  int c : 32;
+} s1;
+CHECK_SIZE(s1,9)
+
+#pragma pack (2)
+struct {
+  int a;
+  int b : 4;
+  int c : 32;
+} s2;
+CHECK_SIZE(s2,10)
+
+#pragma pack (2)
+struct __attribute__((packed)) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s3;
+CHECK_SIZE(s3,10)
+
+#pragma pack (4)
+struct  __attribute__((packed)) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s4;
+CHECK_SIZE(s4,12)
+
+#pragma pack (16)
+struct {
+  int a;
+  int __attribute__((packed)) b : 4;
+  int __attribute__((packed)) c : 32;
+} s41;
+CHECK_SIZE(s41,12)
+
+#pragma pack (16)
+struct {
+  int a;
+  int b : 4;
+  int c : 32;
+} s5;
+CHECK_SIZE(s5,12)
+
+#pragma pack (1)
+struct  __attribute__((aligned(4))) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s6;
+CHECK_SIZE(s6,12)
+
+#pragma pack (2)
+struct {
+  char a;
+  int b : 4;
+  int c : 32;
+  char s;
+} s7;
+CHECK_SIZE(s7,8)
+
+#pragma pack (1)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s;
+} s8;
+CHECK_SIZE(s8,6)
+
+#pragma pack (8)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s;
+} s9;
+CHECK_SIZE(s9,8)
+
+#pragma pack (8)
+struct {
+  char a;
+  char s;
+} s10;
+CHECK_SIZE(s10,2)
+
+#pragma pack(4)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s1;
+  char s2;
+  char s3;
+} s11;
+CHECK_SIZE(s11,8)
+
+#pragma pack(4)
+struct {
+  short s1;
+  int a1 : 17;
+  int a2 : 17;
+  int a3 : 30;
+  short s2;
+} s12;
+CHECK_SIZE(s12,12)
+
+#pragma pack(4)
+struct {
+  char c1;
+  int i1 : 17;
+  int i2 : 17;
+  int i3 : 30;
+  char c2;
+} s13;
+CHECK_SIZE(s13,12)
+
+#pragma pack(2)
+struct {
+  char a;
+  int s;
+} s14;
+CHECK_SIZE(s14,6)
+
+#pragma pack(4)
+struct {
+  char a;
+  short s;
+} s15;
+CHECK_SIZE(s15,4)
+
+#pragma pack(2)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s1;
+  char s2;
+  char s3;
+} s16;
+CHECK_SIZE(s16,8)
+
+#pragma pack (16)
+struct {
+  int __attribute__((packed)) a;
+  int __attribute__((packed)) b : 4;
+  int __attribute__((packed)) c : 32;
+} s17;
+CHECK_SIZE(s17,12)
+
+#pragma pack (16)
+struct {
+  int __attribute__((aligned(8))) a;
+  int __attribute__((aligned(8))) b : 4;
+  int __attribute__((aligned(8))) c : 32;
+} s18;
+CHECK_SIZE(s18,24)
+
+#pragma pack (16)
+struct {
+  int __attribute__((aligned(1))) a;
+  int __attribute__((aligned(1))) b : 4;
+  int __attribute__((aligned(1))) c : 32;
+} s19;
+CHECK_SIZE(s19,12)
+
+#pragma pack (1)
+struct  __attribute__((aligned(8))) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s20;
+CHECK_SIZE(s20,16)
+
+#pragma pack (2)
+struct {
+  int __attribute__((aligned(8))) a;
+  int __attribute__((aligned(8))) b : 4;
+  int __attribute__((aligned(8))) c : 32;
+} s21;
+CHECK_SIZE(s21,10)
diff --git a/test/Sema/bitfield.c b/test/Sema/bitfield.c
index 810dc79..d625366 100644
--- a/test/Sema/bitfield.c
+++ b/test/Sema/bitfield.c
@@ -64,7 +64,7 @@
 
 struct Test5 { unsigned n : 2; } t5;
 // Bitfield is unsigned
-struct Test5 sometest5 = {-1}; // expected-warning {{implicit truncation from 'int' to bitfield changes value from -1 to 3}}
+struct Test5 sometest5 = {-1};
 typedef __typeof__(+t5.n) Signed;  // ... but promotes to signed.
 
 typedef __typeof__(t5.n + 0) Signed; // Arithmetic promotes.
diff --git a/test/Sema/builtin-classify-type.c b/test/Sema/builtin-classify-type.c
new file mode 100644
index 0000000..376e73d
--- /dev/null
+++ b/test/Sema/builtin-classify-type.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+enum gcc_type_class {
+  no_type_class = -1,
+  void_type_class, integer_type_class, char_type_class,
+  enumeral_type_class, boolean_type_class,
+  pointer_type_class, reference_type_class, offset_type_class,
+  real_type_class, complex_type_class,
+  function_type_class, method_type_class,
+  record_type_class, union_type_class,
+  array_type_class, string_type_class,
+  lang_type_class
+};
+
+void foo() {
+  int i;
+  char c;
+  enum { red, green, blue } enum_obj;
+  int *p;
+  double d;
+  _Complex double cc;
+  extern void f();
+  struct { int a; float b; } s_obj;
+  union { int a; float b; } u_obj;
+  int arr[10];
+
+  int a1[__builtin_classify_type(f()) == void_type_class ? 1 : -1];
+  int a2[__builtin_classify_type(i) == integer_type_class ? 1 : -1];
+  int a3[__builtin_classify_type(c) == integer_type_class ? 1 : -1];
+  int a4[__builtin_classify_type(enum_obj) == integer_type_class ? 1 : -1];
+  int a5[__builtin_classify_type(p) == pointer_type_class ? 1 : -1];
+  int a6[__builtin_classify_type(d) == real_type_class ? 1 : -1];
+  int a7[__builtin_classify_type(cc) == complex_type_class ? 1 : -1];
+  int a8[__builtin_classify_type(f) == pointer_type_class ? 1 : -1];
+  int a0[__builtin_classify_type(s_obj) == record_type_class ? 1 : -1];
+  int a10[__builtin_classify_type(u_obj) == union_type_class ? 1 : -1];
+  int a11[__builtin_classify_type(arr) == pointer_type_class ? 1 : -1];
+  int a12[__builtin_classify_type("abc") == pointer_type_class ? 1 : -1];
+}
+
diff --git a/test/Sema/builtin-longjmp.c b/test/Sema/builtin-longjmp.c
index fdfbcf8..d80208f 100644
--- a/test/Sema/builtin-longjmp.c
+++ b/test/Sema/builtin-longjmp.c
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple x86_64-windows -emit-llvm < %s| FileCheck %s
 // RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm < %s| FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64-unknown-unknown -emit-llvm < %s| FileCheck %s
+// RUN: %clang_cc1 -triple sparc-eabi-unknown -emit-llvm < %s | FileCheck %s
 
 // RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm-only -verify %s
 // RUN: %clang_cc1 -triple mips-unknown-unknown -emit-llvm-only -verify %s
diff --git a/test/Sema/builtins-arm.c b/test/Sema/builtins-arm.c
index 39cb2fa..668b828 100644
--- a/test/Sema/builtins-arm.c
+++ b/test/Sema/builtins-arm.c
@@ -48,6 +48,50 @@
 }
 
 void test6(int a, int b, int c) {
+  __builtin_arm_ldc(1, 2, &a);
+  __builtin_arm_ldc(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldc' must be a constant integer}}
+  __builtin_arm_ldc(1, a, &a); // expected-error {{argument to '__builtin_arm_ldc' must be a constant integer}}
+
+  __builtin_arm_ldcl(1, 2, &a);
+  __builtin_arm_ldcl(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldcl' must be a constant integer}}
+  __builtin_arm_ldcl(1, a, &a); // expected-error {{argument to '__builtin_arm_ldcl' must be a constant integer}}
+
+  __builtin_arm_ldc2(1, 2, &a);
+  __builtin_arm_ldc2(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldc2' must be a constant integer}}
+  __builtin_arm_ldc2(1, a, &a); // expected-error {{argument to '__builtin_arm_ldc2' must be a constant integer}}
+
+  __builtin_arm_ldc2l(1, 2, &a);
+  __builtin_arm_ldc2l(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldc2l' must be a constant integer}}
+  __builtin_arm_ldc2l(1, a, &a); // expected-error {{argument to '__builtin_arm_ldc2l' must be a constant integer}}
+
+  __builtin_arm_stc(1, 2, &a);
+  __builtin_arm_stc(a, 2, &a); // expected-error {{argument to '__builtin_arm_stc' must be a constant integer}}
+  __builtin_arm_stc(1, a, &a); // expected-error {{argument to '__builtin_arm_stc' must be a constant integer}}
+
+  __builtin_arm_stcl(1, 2, &a);
+  __builtin_arm_stcl(a, 2, &a); // expected-error {{argument to '__builtin_arm_stcl' must be a constant integer}}
+  __builtin_arm_stcl(1, a, &a); // expected-error {{argument to '__builtin_arm_stcl' must be a constant integer}}
+
+  __builtin_arm_stc2(1, 2, &a);
+  __builtin_arm_stc2(a, 2, &a); // expected-error {{argument to '__builtin_arm_stc2' must be a constant integer}}
+  __builtin_arm_stc2(1, a, &a); // expected-error {{argument to '__builtin_arm_stc2' must be a constant integer}}
+
+  __builtin_arm_stc2l(1, 2, &a);
+  __builtin_arm_stc2l(a, 2, &a); // expected-error {{argument to '__builtin_arm_stc2l' must be a constant integer}}
+  __builtin_arm_stc2l(1, a, &a); // expected-error {{argument to '__builtin_arm_stc2l' must be a constant integer}}
+
+  __builtin_arm_cdp(a, 2, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, a, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, 2, a, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, 2, 3, a, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, 2, 3, 4, 5, a); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+
+  __builtin_arm_cdp2(a, 2, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, a, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, 2, a, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, 2, 3, a, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, 2, 3, 4, 5, a); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+
   __builtin_arm_mrc( a, 0, 13, 0, 3); // expected-error {{argument to '__builtin_arm_mrc' must be a constant integer}}
   __builtin_arm_mrc(15, a, 13, 0, 3); // expected-error {{argument to '__builtin_arm_mrc' must be a constant integer}}
   __builtin_arm_mrc(15, 0,  a, 0, 3); // expected-error {{argument to '__builtin_arm_mrc' must be a constant integer}}
@@ -72,11 +116,23 @@
   __builtin_arm_mcr2(15, 0, b, 13, a, 3); // expected-error {{argument to '__builtin_arm_mcr2' must be a constant integer}}
   __builtin_arm_mcr2(15, 0, b, 13, 0, a); // expected-error {{argument to '__builtin_arm_mcr2' must be a constant integer}}
 
-  __builtin_arm_mcrr( a, 0, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
-  __builtin_arm_mcrr(15, a, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
-  __builtin_arm_mcrr(15, 0, b, c, a); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+  __builtin_arm_mcrr(15, 0, b, 0);
+  __builtin_arm_mcrr( a, 0, b, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+  __builtin_arm_mcrr(15, a, b, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+  __builtin_arm_mcrr(15, 0, b, a); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
 
-  __builtin_arm_mcrr2( a, 0, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
-  __builtin_arm_mcrr2(15, a, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
-  __builtin_arm_mcrr2(15, 0, b, c, a); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+  __builtin_arm_mcrr2(15, 0, b, 0);
+  __builtin_arm_mcrr2( a, 0, b, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+  __builtin_arm_mcrr2(15, a, b, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+  __builtin_arm_mcrr2(15, 0, b, a); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+
+  __builtin_arm_mrrc(15, 0, 0);
+  __builtin_arm_mrrc( a, 0, 0); // expected-error {{argument to '__builtin_arm_mrrc' must be a constant integer}}
+  __builtin_arm_mrrc(15, a, 0); // expected-error {{argument to '__builtin_arm_mrrc' must be a constant integer}}
+  __builtin_arm_mrrc(15, 0, a); // expected-error {{argument to '__builtin_arm_mrrc' must be a constant integer}}
+
+  __builtin_arm_mrrc2(15, 0, 0);
+  __builtin_arm_mrrc2( a, 0, 0); // expected-error {{argument to '__builtin_arm_mrrc2' must be a constant integer}}
+  __builtin_arm_mrrc2(15, a, 0); // expected-error {{argument to '__builtin_arm_mrrc2' must be a constant integer}}
+  __builtin_arm_mrrc2(15, 0, a); // expected-error {{argument to '__builtin_arm_mrrc2' must be a constant integer}}
 }
diff --git a/test/Sema/builtins.cl b/test/Sema/builtins.cl
index 8cde8f3..7cde5e1 100644
--- a/test/Sema/builtins.cl
+++ b/test/Sema/builtins.cl
@@ -1,8 +1,11 @@
 // RUN: %clang_cc1 %s -fsyntax-only -verify -pedantic
-// expected-no-diagnostics
 
 kernel void test(global float *out, global float *in, global int* in2) {
   out[0] = __builtin_nanf("");
   __builtin_memcpy(out, in, 32);
   out[0] = __builtin_frexpf(in[0], in2);
 }
+
+void pr28651() {
+  __builtin_alloca(value); // expected-error{{use of undeclared identifier}}
+}
diff --git a/test/Sema/callingconv-cast.c b/test/Sema/callingconv-cast.c
new file mode 100644
index 0000000..12c0dcb
--- /dev/null
+++ b/test/Sema/callingconv-cast.c
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-windows-msvc -Wcast-calling-convention -DMSVC -Wno-pointer-bool-conversion -verify -x c %s
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-windows-msvc -Wcast-calling-convention -DMSVC -Wno-pointer-bool-conversion -verify -x c++ %s
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-windows-msvc -Wcast-calling-convention -DMSVC -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s --check-prefix=MSFIXIT
+// RUN: %clang_cc1 -triple i686-pc-windows-gnu -Wcast-calling-convention -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s --check-prefix=GNUFIXIT
+
+// expected-note@+1 {{consider defining 'mismatched_before_winapi' with the 'stdcall' calling convention}}
+void mismatched_before_winapi(int x) {}
+
+#ifdef MSVC
+#define WINAPI __stdcall
+#else
+#define WINAPI __attribute__((stdcall))
+#endif
+
+// expected-note@+1 3 {{consider defining 'mismatched' with the 'stdcall' calling convention}}
+void mismatched(int x) {}
+
+typedef void (WINAPI *callback_t)(int);
+void take_callback(callback_t callback);
+
+void WINAPI mismatched_stdcall(int x) {}
+
+void take_opaque_fn(void (*callback)(int));
+
+int main() {
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  take_callback((callback_t)mismatched);
+
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  callback_t callback = (callback_t)mismatched; // warns
+  (void)callback;
+
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  callback = (callback_t)&mismatched; // warns
+
+  // No warning, just to show we don't drill through other kinds of unary operators.
+  callback = (callback_t)!mismatched;
+
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  callback = (callback_t)&mismatched_before_winapi; // warns
+
+  // Probably a bug, but we don't warn.
+  void (*callback2)(int) = mismatched;
+  take_callback((callback_t)callback2);
+
+  // Another way to suppress the warning.
+  take_callback((callback_t)(void*)mismatched);
+
+  // Don't warn, because we're casting from stdcall to cdecl. Usually that means
+  // the programmer is rinsing the function pointer through some kind of opaque
+  // API.
+  take_opaque_fn((void (*)(int))mismatched_stdcall);
+}
+
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{7:6-7:6}:"__stdcall "
+
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{7:6-7:6}:"__attribute__((stdcall)) "
diff --git a/test/Sema/constant-conversion.c b/test/Sema/constant-conversion.c
index 1376333..203e737 100644
--- a/test/Sema/constant-conversion.c
+++ b/test/Sema/constant-conversion.c
@@ -80,3 +80,48 @@
   struct { enum E x : 1; } f;
   f.x = C; // expected-warning {{implicit truncation from 'int' to bitfield changes value from 2 to 0}}
 }
+
+void test9() {
+  const char max_char = 0x7F;
+  const short max_short = 0x7FFF;
+  const int max_int = 0x7FFFFFFF;
+
+  const short max_char_plus_one = (short)max_char + 1;
+  const int max_short_plus_one = (int)max_short + 1;
+  const long max_int_plus_one = (long)max_int + 1;
+
+  char new_char = max_char_plus_one;  // expected-warning {{implicit conversion from 'const short' to 'char' changes value from 128 to -128}}
+  short new_short = max_short_plus_one;  // expected-warning {{implicit conversion from 'const int' to 'short' changes value from 32768 to -32768}}
+  int new_int = max_int_plus_one;  // expected-warning {{implicit conversion from 'const long' to 'int' changes value from 2147483648 to -2147483648}}
+
+  char hex_char = 0x80;
+  short hex_short = 0x8000;
+  int hex_int = 0x80000000;
+
+  char oct_char = 0200;
+  short oct_short = 0100000;
+  int oct_int = 020000000000;
+
+  char bin_char = 0b10000000;
+  short bin_short = 0b1000000000000000;
+  int bin_int = 0b10000000000000000000000000000000;
+
+#define CHAR_MACRO_HEX 0xff
+  char macro_char_hex = CHAR_MACRO_HEX;
+#define CHAR_MACRO_DEC 255
+  char macro_char_dec = CHAR_MACRO_DEC;  // expected-warning {{implicit conversion from 'int' to 'char' changes value from 255 to -1}}
+
+  char array_init[] = { 255, 127, 128, 129, 0 };
+}
+
+void test10() {
+  struct S {
+    unsigned a : 4;
+  } s;
+  s.a = -1;
+  s.a = 15;
+  s.a = -8;
+
+  s.a = -9;  // expected-warning{{implicit truncation from 'int' to bitfield changes value from -9 to 7}}
+  s.a = 16;  // expected-warning{{implicit truncation from 'int' to bitfield changes value from 16 to 0}}
+}
diff --git a/test/Sema/decl-in-prototype.c b/test/Sema/decl-in-prototype.c
index 4f581aa..3b8a3b8 100644
--- a/test/Sema/decl-in-prototype.c
+++ b/test/Sema/decl-in-prototype.c
@@ -35,3 +35,6 @@
 void pr19018_1 (enum e19018 { qq } x); // expected-warning{{declaration of 'enum e19018' will not be visible outside of this function}}
 enum e19018 qq; //expected-error{{tentative definition has type 'enum e19018' that is never completed}} \
                 //expected-note{{forward declaration of 'enum e19018'}}
+
+// Only warn once, even if we create two declarations.
+void f(struct q *, struct __attribute__((aligned(4))) q *); // expected-warning {{will not be visible outside}}
diff --git a/test/Sema/dllexport.c b/test/Sema/dllexport.c
index 56c9e74..7991a45 100644
--- a/test/Sema/dllexport.c
+++ b/test/Sema/dllexport.c
@@ -4,12 +4,18 @@
 // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 %s
 
 // Invalid usage.
-__declspec(dllexport) typedef int typedef1; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-typedef __declspec(dllexport) int typedef2; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-typedef int __declspec(dllexport) typedef3; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-typedef __declspec(dllexport) void (*FunTy)(); // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-enum __declspec(dllexport) Enum { EnumVal }; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-struct __declspec(dllexport) Record {}; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+enum __declspec(dllexport) Enum { EnumVal };
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+struct __declspec(dllexport) Record {};
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
 
 
 
diff --git a/test/Sema/dllimport.c b/test/Sema/dllimport.c
index f863499..0728cf1 100644
--- a/test/Sema/dllimport.c
+++ b/test/Sema/dllimport.c
@@ -4,12 +4,18 @@
 // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 -DGNU %s
 
 // Invalid usage.
-__declspec(dllimport) typedef int typedef1; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-typedef __declspec(dllimport) int typedef2; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-typedef int __declspec(dllimport) typedef3; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-typedef __declspec(dllimport) void (*FunTy)(); // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-enum __declspec(dllimport) Enum { EnumVal }; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-struct __declspec(dllimport) Record {}; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+enum __declspec(dllimport) Enum { EnumVal };
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+struct __declspec(dllimport) Record {};
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
 
 
 
@@ -34,17 +40,49 @@
 int __declspec(dllimport) GlobalInit2 = 1; // expected-error{{definition of dllimport data}}
 
 // Declare, then reject definition.
-__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int ExternGlobalDeclInit = 1; // expected-warning{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int ExternGlobalDeclInit = 1;
 
-__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclInit = 1; // expected-warning{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclInit = 1;
 
-int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int *GlobalDeclChunkAttrInit = 0; // expected-warning{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int *GlobalDeclChunkAttrInit = 0;
 
-int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclAttrInit = 1; // expected-warning{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclAttrInit = 1;
 
 // Redeclarations
 __declspec(dllimport) extern int GlobalRedecl1;
@@ -59,8 +97,7 @@
 int GlobalRedecl2c __attribute__((dllimport));
 int GlobalRedecl2c __attribute__((dllimport));
 
-// NB: MSVC issues a warning and makes GlobalRedecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
+// We follow GCC and drop the dllimport with a warning.
 __declspec(dllimport) extern int GlobalRedecl3; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       extern int GlobalRedecl3; // expected-warning{{'GlobalRedecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
@@ -133,13 +170,20 @@
 __declspec(dllimport) void redecl1();
 __declspec(dllimport) void redecl1();
 
-// NB: MSVC issues a warning and makes redecl2/redecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
 __declspec(dllimport) void redecl2(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       void redecl2(); // expected-warning{{'redecl2' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
-__declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-                      void redecl3() {} // expected-warning{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+                      // expected-note@+2{{previous attribute is here}}
+#endif
+                      __declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}}
+                      // NB: Both MSVC and Clang issue a warning and make redecl3 dllexport.
+#ifdef MS
+                      // expected-warning@+4{{'redecl3' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                      // expected-warning@+2{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+                      void redecl3() {}
 
                       void redecl4(); // expected-note{{previous declaration is here}}
 void useRedecl4() { redecl4(); }
diff --git a/test/Sema/enable_if-ext.c b/test/Sema/enable_if-ext.c
new file mode 100644
index 0000000..1e605d49
--- /dev/null
+++ b/test/Sema/enable_if-ext.c
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -fsyntax-only %s -include %s -verify
+// RUN: %clang_cc1 -Wpedantic -fsyntax-only %s -include %s -verify -DWARN_PEDANTIC
+
+#ifndef enable_if_ext_included
+#define enable_if_ext_included
+
+#if !defined(WARN_PEDANTIC)
+// expected-no-diagnostics
+#endif
+
+__attribute__ (( enable_if(1, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void f() { }
+
+__attribute__ (( __enable_if__(1, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void g() { }
+
+__attribute__ (( enable_if(0, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void h() { }
+
+__attribute__ (( __enable_if__(0, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void i() { }
+
+#pragma clang system_header
+
+__attribute__ (( enable_if(1, "") ))
+void j() { }
+
+__attribute__ (( __enable_if__(1, "") ))
+void k() { }
+
+__attribute__ (( enable_if(0, "") ))
+void l() { }
+
+__attribute__ (( __enable_if__(0, "") ))
+void m() { }
+
+#endif
+
diff --git a/test/Sema/enable_if.c b/test/Sema/enable_if.c
index 4034aa2..a11f53e 100644
--- a/test/Sema/enable_if.c
+++ b/test/Sema/enable_if.c
@@ -72,8 +72,8 @@
   __attribute__((unavailable("'c' must have the value of an unsigned char or EOF")));
 
 void test3(int c) {
-  isdigit(c);
-  isdigit(10);
+  isdigit(c); // expected-warning{{ignoring return value of function declared with pure attribute}}
+  isdigit(10); // expected-warning{{ignoring return value of function declared with pure attribute}}
 #ifndef CODEGEN
   isdigit(-10);  // expected-error{{call to unavailable function 'isdigit': 'c' must have the value of an unsigned char or EOF}}
 #endif
@@ -149,4 +149,25 @@
   regular_enable_if(1, 2); // expected-error{{too many arguments}}
   regular_enable_if(); // expected-error{{too few arguments}}
 }
+
+// We had a bug where we'd crash upon trying to evaluate varargs.
+void variadic_enable_if(int a, ...) __attribute__((enable_if(a, ""))); // expected-note 6 {{disabled}}
+void variadic_test() {
+  variadic_enable_if(1);
+  variadic_enable_if(1, 2);
+  variadic_enable_if(1, "c", 3);
+
+  variadic_enable_if(0); // expected-error{{no matching}}
+  variadic_enable_if(0, 2); // expected-error{{no matching}}
+  variadic_enable_if(0, "c", 3); // expected-error{{no matching}}
+
+  int m;
+  variadic_enable_if(1);
+  variadic_enable_if(1, m);
+  variadic_enable_if(1, m, "c");
+
+  variadic_enable_if(0); // expected-error{{no matching}}
+  variadic_enable_if(0, m); // expected-error{{no matching}}
+  variadic_enable_if(0, m, 3); // expected-error{{no matching}}
+}
 #endif
diff --git a/test/Sema/float128-ld-incompatibility.cpp b/test/Sema/float128-ld-incompatibility.cpp
new file mode 100644
index 0000000..d993ed7
--- /dev/null
+++ b/test/Sema/float128-ld-incompatibility.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 \
+// RUN: -triple powerpc64le-unknown-linux-gnu -target-cpu pwr8 \
+// RUN: -target-feature +float128 %s
+
+__float128 qf();
+long double ldf();
+
+// FIXME: once operations between long double and __float128 are implemented for
+//        targets where the types are different, these next two will change
+long double ld{qf()}; // expected-error {{cannot initialize a variable of type 'long double' with an rvalue of type '__float128'}}
+__float128 q{ldf()};  // expected-error {{cannot initialize a variable of type '__float128' with an rvalue of type 'long double'}}
+
+auto test1(__float128 q, long double ld) -> decltype(q + ld) { // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  return q + ld;      // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+}
+
+auto test2(long double a, __float128 b) -> decltype(a + b) { // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  return a + b;      // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+}
+
+void test3(bool b) {
+  long double ld;
+  __float128 q;
+
+  ld + q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q + ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld - q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q - ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld * q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q * ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld / q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q / ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld = q; // expected-error {{assigning to 'long double' from incompatible type '__float128'}}
+  q = ld; // expected-error {{assigning to '__float128' from incompatible type 'long double'}}
+  q + b ? q : ld; // expected-error {{incompatible operand types ('__float128' and 'long double')}}
+}
diff --git a/test/Sema/format-strings-freebsd.c b/test/Sema/format-strings-freebsd.c
index cdf273a..965d7c2 100644
--- a/test/Sema/format-strings-freebsd.c
+++ b/test/Sema/format-strings-freebsd.c
@@ -1,10 +1,11 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -triple i386-unknown-freebsd %s
 // RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-freebsd %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-scei-ps4 %s
 
 // Test FreeBSD kernel printf extensions.
 int freebsd_kernel_printf(const char *, ...) __attribute__((__format__(__freebsd_kprintf__, 1, 2)));
 
-void check_freebsd_kernel_extensions(int i, long l, char *s)
+void check_freebsd_kernel_extensions(int i, long l, char *s, short h)
 {
   // %b expects an int and a char *
   freebsd_kernel_printf("reg=%b\n", i, "\10\2BITTWO\1BITONE\n"); // no-warning
@@ -32,6 +33,12 @@
   freebsd_kernel_printf("%lr", i); // expected-warning{{format specifies type 'long' but the argument has type 'int'}}
   freebsd_kernel_printf("%lr", l); // no-warning
 
+  // h modifier expects a short
+  freebsd_kernel_printf("%hr", i); // expected-warning{{format specifies type 'short' but the argument has type 'int'}}
+  freebsd_kernel_printf("%hr", h); // no-warning
+  freebsd_kernel_printf("%hy", i); // expected-warning{{format specifies type 'short' but the argument has type 'int'}}
+  freebsd_kernel_printf("%hy", h); // no-warning
+
   // %y expects an int
   freebsd_kernel_printf("%y", i); // no-warning
   freebsd_kernel_printf("%y", l); // expected-warning{{format specifies type 'int' but the argument has type 'long'}}
diff --git a/test/Sema/format-strings-scanf.c b/test/Sema/format-strings-scanf.c
index d3a03ad..7a92842 100644
--- a/test/Sema/format-strings-scanf.c
+++ b/test/Sema/format-strings-scanf.c
@@ -18,7 +18,7 @@
 int vsscanf(const char * restrict, const char * restrict, va_list);
 
 void test(const char *s, int *i) {
-  scanf(s, i); // expected-warning{{ormat string is not a string literal}}
+  scanf(s, i); // expected-warning{{format string is not a string literal}}
   scanf("%0d", i); // expected-warning{{zero field width in scanf format string is unused}}
   scanf("%00d", i); // expected-warning{{zero field width in scanf format string is unused}}
   scanf("%d%[asdfasdfd", i, s); // expected-warning{{no closing ']' for '%[' in scanf format string}}
@@ -171,3 +171,15 @@
   scanf("%d", (ip_t)0); // No warning.
   scanf("%d", (cip_t)0); // expected-warning{{format specifies type 'int *' but the argument has type 'cip_t' (aka 'const int *')}}
 }
+
+void check_conditional_literal(char *s, int *i) {
+  scanf(0 ? "%s" : "%d", i); // no warning
+  scanf(1 ? "%s" : "%d", i); // expected-warning{{format specifies type 'char *'}}
+  scanf(0 ? "%d %d" : "%d", i); // no warning
+  scanf(1 ? "%d %d" : "%d", i); // expected-warning{{more '%' conversions than data arguments}}
+  scanf(0 ? "%d %d" : "%d", i, s); // expected-warning{{data argument not used}}
+  scanf(1 ? "%d %s" : "%d", i, s); // no warning
+  scanf(i ? "%d %s" : "%d", i, s); // no warning
+  scanf(i ? "%d" : "%d", i, s); // expected-warning{{data argument not used}}
+  scanf(i ? "%s" : "%d", s); // expected-warning{{format specifies type 'int *'}}
+}
diff --git a/test/Sema/format-strings.c b/test/Sema/format-strings.c
index a67fd46..28a2db8 100644
--- a/test/Sema/format-strings.c
+++ b/test/Sema/format-strings.c
@@ -53,6 +53,9 @@
 
   vscanf(s, ap); // expected-warning {{format string is not a string literal}}
 
+  const char *const fmt = "%d"; // FIXME -- defined here
+  printf(fmt, 1, 2); // expected-warning{{data argument not used}}
+
   // rdar://6079877
   printf("abc"
          "%*d", 1, 1); // no-warning
@@ -99,6 +102,20 @@
   printf(i == 0 ? (i == 1 ? s : "no") : "dont know"); // expected-warning{{format string is not a string literal}}
   // expected-note@-1{{treat the string as an argument to avoid this}}
   printf("yes" ?: "no %d", 1); // expected-warning{{data argument not used by format string}}
+  printf(0 ? "yes %s" : "no %d", 1); // no-warning
+  printf(0 ? "yes %d" : "no %s", 1); // expected-warning{{format specifies type 'char *'}}
+
+  printf(0 ? "yes" : "no %d", 1); // no-warning
+  printf(0 ? "yes %d" : "no", 1); // expected-warning{{data argument not used by format string}}
+  printf(1 ? "yes" : "no %d", 1); // expected-warning{{data argument not used by format string}}
+  printf(1 ? "yes %d" : "no", 1); // no-warning
+  printf(i ? "yes" : "no %d", 1); // no-warning
+  printf(i ? "yes %s" : "no %d", 1); // expected-warning{{format specifies type 'char *'}}
+  printf(i ? "yes" : "no %d", 1, 2); // expected-warning{{data argument not used by format string}}
+
+  printf(i ? "%*s" : "-", i, s); // no-warning
+  printf(i ? "yes" : 0 ? "no %*d" : "dont know %d", 1, 2); // expected-warning{{data argument not used by format string}}
+  printf(i ? "%i\n" : "%i %s %s\n", i, s); // expected-warning{{more '%' conversions than data arguments}}
 }
 
 void check_writeback_specifier()
@@ -536,7 +553,7 @@
 
   // Make sure that the "format string is defined here" note is not emitted
   // when the original string is within the argument expression.
-  printf(1 ? "yes %d" : "no %d"); // expected-warning 2{{more '%' conversions than data arguments}}
+  printf(1 ? "yes %d" : "no %d"); // expected-warning{{more '%' conversions than data arguments}}
 
   const char kFormat17[] = "%hu"; // expected-note{{format string is defined here}}}
   printf(kFormat17, (int[]){0}); // expected-warning{{format specifies type 'unsigned short' but the argument}}
diff --git a/test/Sema/incompatible-function-pointer-types.c b/test/Sema/incompatible-function-pointer-types.c
new file mode 100644
index 0000000..f0f594f
--- /dev/null
+++ b/test/Sema/incompatible-function-pointer-types.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Wincompatible-pointer-types -verify
+// RUN: %clang_cc1 -fsyntax-only %s -Wincompatible-function-pointer-types -verify
+
+// This test ensures that the subgroup of -Wincompatible-pointer-types warnings
+// that concern function pointers can be promoted (or not promoted) to an error
+// *separately* from the other -Wincompatible-pointer-types warnings.
+typedef int (*MyFnTyA)(int *, char *);
+
+int bar(char *a, int *b) { return 0; }
+int foo(MyFnTyA x) { return 0; } // expected-note {{passing argument to parameter 'x' here}}
+
+void baz() {
+  foo(&bar); // expected-warning {{incompatible function pointer types passing 'int (*)(char *, int *)' to parameter of type 'MyFnTyA' (aka 'int (*)(int *, char *)')}}
+}
diff --git a/test/Sema/initialize-noreturn.c b/test/Sema/initialize-noreturn.c
index 5557862..b90d46d 100644
--- a/test/Sema/initialize-noreturn.c
+++ b/test/Sema/initialize-noreturn.c
@@ -8,7 +8,7 @@
 void foo_noret(void)  __attribute__((noreturn));
 
 void test() {
-  Fn_noret fn2 = &foo; // expected-warning {{incompatible pointer types initializing 'Fn_noret'}}
+  Fn_noret fn2 = &foo; // expected-warning {{incompatible function pointer types initializing 'Fn_noret'}}
   Fn_noret fn3 = &foo_noret; 
   Fn_ret fn4 = &foo_noret; 
   Fn_ret fn5 = &foo;
diff --git a/test/Sema/integer-overflow.c b/test/Sema/integer-overflow.c
index 02d99b3..e74bc11 100644
--- a/test/Sema/integer-overflow.c
+++ b/test/Sema/integer-overflow.c
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -triple x86_64-pc-linux-gnu
 typedef unsigned long long uint64_t;
-typedef unsigned long long uint32_t;
+typedef unsigned int uint32_t;
+
+// Check integer sizes.
+int array64[sizeof(uint64_t) == 8 ? 1 : -1];
+int array32[sizeof(uint32_t) == 4 ? 1 : -1];
+int arrayint[sizeof(int) < sizeof(uint64_t) ? 1 : -1];
 
 uint64_t f0(uint64_t);
 uint64_t f1(uint64_t, uint32_t);
diff --git a/test/Sema/invalid-assignment-constant-address-space.c b/test/Sema/invalid-assignment-constant-address-space.c
index de2af64..77d6b33 100644
--- a/test/Sema/invalid-assignment-constant-address-space.c
+++ b/test/Sema/invalid-assignment-constant-address-space.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
 
-#define OPENCL_CONSTANT 16776962
+#define OPENCL_CONSTANT 8388354
 int __attribute__((address_space(OPENCL_CONSTANT))) c[3] = {0};
 
 void foo() {
diff --git a/test/Sema/libbuiltins-ctype-powerpc64.c b/test/Sema/libbuiltins-ctype-powerpc64.c
new file mode 100644
index 0000000..bfd79ac
--- /dev/null
+++ b/test/Sema/libbuiltins-ctype-powerpc64.c
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -emit-llvm < %s | FileCheck %s
+
+int isalnum(int);
+int isalpha(int);
+int isblank(int);
+int iscntrl(int);
+int isdigit(int);
+int isgraph(int);
+int islower(int);
+int isprint(int);
+int ispunct(int);
+int isspace(int);
+int isupper(int);
+int isxdigit(int);
+int tolower(int);
+int toupper(int);
+
+void test(int x) {
+  // CHECK: call signext i32 @isalnum(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalnum(x);
+  // CHECK: call signext i32 @isalpha(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalpha(x);
+  // CHECK: call signext i32 @isblank(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isblank(x);
+  // CHECK: call signext i32 @iscntrl(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)iscntrl(x);
+  // CHECK: call signext i32 @isdigit(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isdigit(x);
+  // CHECK: call signext i32 @isgraph(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isgraph(x);
+  // CHECK: call signext i32 @islower(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)islower(x);
+  // CHECK: call signext i32 @isprint(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isprint(x);
+  // CHECK: call signext i32 @ispunct(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)ispunct(x);
+  // CHECK: call signext i32 @isspace(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isspace(x);
+  // CHECK: call signext i32 @isupper(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isupper(x);
+  // CHECK: call signext i32 @isxdigit(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isxdigit(x);
+  // CHECK: call signext i32 @tolower(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)tolower(x);
+  // CHECK: call signext i32 @toupper(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)toupper(x);
+}
+
+// CHECK: declare signext i32 @isalnum(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isalpha(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isblank(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @iscntrl(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isdigit(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isgraph(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @islower(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isprint(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @ispunct(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isspace(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isupper(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isxdigit(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @tolower(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @toupper(i32 signext) [[NUW_RO:#[0-9]+]]
+
+// CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
diff --git a/test/Sema/libbuiltins-ctype-x86_64.c b/test/Sema/libbuiltins-ctype-x86_64.c
new file mode 100644
index 0000000..4934e6f
--- /dev/null
+++ b/test/Sema/libbuiltins-ctype-x86_64.c
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm < %s | FileCheck %s
+
+int isalnum(int);
+int isalpha(int);
+int isblank(int);
+int iscntrl(int);
+int isdigit(int);
+int isgraph(int);
+int islower(int);
+int isprint(int);
+int ispunct(int);
+int isspace(int);
+int isupper(int);
+int isxdigit(int);
+int tolower(int);
+int toupper(int);
+
+void test(int x) {
+  // CHECK: call i32 @isalnum(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalnum(x);
+  // CHECK: call i32 @isalpha(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalpha(x);
+  // CHECK: call i32 @isblank(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isblank(x);
+  // CHECK: call i32 @iscntrl(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)iscntrl(x);
+  // CHECK: call i32 @isdigit(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isdigit(x);
+  // CHECK: call i32 @isgraph(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isgraph(x);
+  // CHECK: call i32 @islower(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)islower(x);
+  // CHECK: call i32 @isprint(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isprint(x);
+  // CHECK: call i32 @ispunct(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)ispunct(x);
+  // CHECK: call i32 @isspace(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isspace(x);
+  // CHECK: call i32 @isupper(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isupper(x);
+  // CHECK: call i32 @isxdigit(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isxdigit(x);
+  // CHECK: call i32 @tolower(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)tolower(x);
+  // CHECK: call i32 @toupper(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)toupper(x);
+}
+
+// CHECK: declare i32 @isalnum(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isalpha(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isblank(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @iscntrl(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isdigit(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isgraph(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @islower(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isprint(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @ispunct(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isspace(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isupper(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isxdigit(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @tolower(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @toupper(i32) [[NUW_RO:#[0-9]+]]
+
+// CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
diff --git a/test/Sema/negative-shift-wrapv.c b/test/Sema/negative-shift-wrapv.c
new file mode 100644
index 0000000..b874820
--- /dev/null
+++ b/test/Sema/negative-shift-wrapv.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -Wall -ffreestanding -fsyntax-only -fwrapv -verify %s
+
+int test() {
+  int i;
+  i = -1 << 1; // no-warning: the RUN line passes -fwrapv, so this left shift of a negative value must not be diagnosed
+  return i;
+}
+
+// expected-no-diagnostics
diff --git a/test/Sema/nonnull.c b/test/Sema/nonnull.c
index 9503e7c..e98a819 100644
--- a/test/Sema/nonnull.c
+++ b/test/Sema/nonnull.c
@@ -86,7 +86,7 @@
 
 // rdar://18712242
 #define NULL (void*)0
-__attribute__((__nonnull__))
+__attribute__((__nonnull__))  // expected-note 2{{declared 'nonnull' here}}
 int evil_nonnull_func(int* pointer, void * pv)
 {
    if (pointer == NULL) {  // expected-warning {{comparison of nonnull parameter 'pointer' equal to a null pointer is 'false' on first encounter}}
@@ -105,7 +105,7 @@
 }
 
 void set_param_to_null(int**);
-int another_evil_nonnull_func(int* pointer, char ch, void * pv) __attribute__((nonnull(1, 3)));
+int another_evil_nonnull_func(int* pointer, char ch, void * pv) __attribute__((nonnull(1, 3)));  // expected-note 2{{declared 'nonnull' here}}
 int another_evil_nonnull_func(int* pointer, char ch, void * pv) {
    if (pointer == NULL) { // expected-warning {{comparison of nonnull parameter 'pointer' equal to a null pointer is 'false' on first encounter}}
      return 0;
@@ -127,7 +127,7 @@
 extern void FEE();
 
 extern void *pv;
-__attribute__((__nonnull__))
+__attribute__((__nonnull__))  // expected-note {{declared 'nonnull' here}}
 void yet_another_evil_nonnull_func(int* pointer)
 {
  while (pv) {
@@ -141,7 +141,7 @@
  }
 }
 
-void pr21668_1(__attribute__((nonnull)) const char *p, const char *s) {
+void pr21668_1(__attribute__((nonnull)) const char *p, const char *s) { // expected-note {{declared 'nonnull' here}}
   if (p) // expected-warning {{nonnull parameter 'p' will evaluate to 'true' on first encounter}}
     ;
   if (s) // No warning
@@ -154,7 +154,7 @@
     ;
 }
 
-__attribute__((returns_nonnull)) void *returns_nonnull_whee();
+__attribute__((returns_nonnull)) void *returns_nonnull_whee();  // expected-note 6{{declared 'returns_nonnull' here}}
 
 void returns_nonnull_warning_tests() {
   if (returns_nonnull_whee() == NULL) {} // expected-warning {{comparison of nonnull function call 'returns_nonnull_whee()' equal to a null pointer is 'false' on first encounter}}
diff --git a/test/Sema/nullability.c b/test/Sema/nullability.c
index bbe5cb41..9d3145d 100644
--- a/test/Sema/nullability.c
+++ b/test/Sema/nullability.c
@@ -8,7 +8,11 @@
 typedef int * int_ptr;
 
 // Parse nullability type specifiers.
-typedef int * _Nonnull nonnull_int_ptr; // expected-note{{'_Nonnull' specified here}}
+// This note requires C11.
+#if __STDC_VERSION__ > 199901L
+// expected-note@+2{{'_Nonnull' specified here}}
+#endif
+typedef int * _Nonnull nonnull_int_ptr;
 typedef int * _Nullable nullable_int_ptr;
 typedef int * _Null_unspecified null_unspecified_int_ptr;
 
@@ -23,9 +27,14 @@
 typedef nonnull_int_ptr _Nonnull redundant_okay_1;
 
 // Conflicting nullability specifiers via a typedef are not.
+// Some of these errors require C11.
+#if __STDC_VERSION__ > 199901L
 typedef nonnull_int_ptr _Nullable conflicting_2; // expected-error{{nullability specifier '_Nullable' conflicts with existing specifier '_Nonnull'}}
+#endif
 typedef nonnull_int_ptr nonnull_int_ptr_typedef;
+#if __STDC_VERSION__ > 199901L
 typedef nonnull_int_ptr_typedef _Nullable conflicting_2; // expected-error{{nullability specifier '_Nullable' conflicts with existing specifier '_Nonnull'}}
+#endif
 typedef nonnull_int_ptr_typedef nonnull_int_ptr_typedef_typedef;
 typedef nonnull_int_ptr_typedef_typedef _Null_unspecified conflicting_3; // expected-error{{nullability specifier '_Null_unspecified' conflicts with existing specifier '_Nonnull'}}
 
@@ -69,8 +78,11 @@
 
 // Nullability is not part of the canonical type.
 typedef int * _Nonnull ambiguous_int_ptr;
+// Redefining a typedef is a C11 feature.
+#if __STDC_VERSION__ > 199901L
 typedef int * ambiguous_int_ptr;
 typedef int * _Nullable ambiguous_int_ptr;
+#endif
 
 // Printing of nullability.
 float f;
@@ -116,3 +128,70 @@
 
   accepts_nonnull_1(ptr); // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
 }
+
+// Check nullability of conditional expressions.
+void conditional_expr(int c) {
+  int * _Nonnull p;
+  int * _Nonnull nonnullP;
+  int * _Nullable nullableP;
+  int * _Null_unspecified unspecifiedP;
+  int *noneP;
+
+  p = c ? nonnullP : nonnullP;
+  p = c ? nonnullP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? nonnullP : unspecifiedP;
+  p = c ? nonnullP : noneP;
+  p = c ? nullableP : nonnullP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? nullableP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? nullableP : unspecifiedP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? nullableP : noneP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? unspecifiedP : nonnullP;
+  p = c ? unspecifiedP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? unspecifiedP : unspecifiedP;
+  p = c ? unspecifiedP : noneP;
+  p = c ? noneP : nonnullP;
+  p = c ? noneP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? noneP : unspecifiedP;
+  p = c ? noneP : noneP;
+
+  // Check that we don't remove all sugar when creating a new QualType for the
+  // conditional expression.
+  typedef int *IntP;
+  typedef IntP _Nonnull NonnullIntP0;
+  typedef NonnullIntP0 _Nonnull NonnullIntP1;
+  typedef IntP _Nullable NullableIntP0;
+  typedef NullableIntP0 _Nullable NullableIntP1;
+  NonnullIntP1 nonnullP2;
+  NullableIntP1 nullableP2;
+
+  p = c ? nonnullP2 : nonnullP2;
+  p = c ? nonnullP2 : nullableP2; // expected-warning{{implicit conversion from nullable pointer 'IntP _Nullable' (aka 'int *') to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? nullableP2 : nonnullP2; // expected-warning{{implicit conversion from nullable pointer 'NullableIntP1' (aka 'int *') to non-nullable pointer type 'int * _Nonnull'}}
+  p = c ? nullableP2 : nullableP2; // expected-warning{{implicit conversion from nullable pointer 'NullableIntP1' (aka 'int *') to non-nullable pointer type 'int * _Nonnull'}}
+}
+
+// Check nullability of binary conditional expressions.
+void binary_conditional_expr() {
+  int * _Nonnull p;
+  int * _Nonnull nonnullP;
+  int * _Nullable nullableP;
+  int * _Null_unspecified unspecifiedP;
+  int *noneP;
+
+  p = nonnullP ?: nonnullP;
+  p = nonnullP ?: nullableP;
+  p = nonnullP ?: unspecifiedP;
+  p = nonnullP ?: noneP;
+  p = nullableP ?: nonnullP;
+  p = nullableP ?: nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = nullableP ?: unspecifiedP;
+  p = nullableP ?: noneP;
+  p = unspecifiedP ?: nonnullP;
+  p = unspecifiedP ?: nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = unspecifiedP ?: unspecifiedP;
+  p = unspecifiedP ?: noneP;
+  p = noneP ?: nonnullP;
+  p = noneP ?: nullableP; // expected-warning{{implicit conversion from nullable pointer 'int * _Nullable' to non-nullable pointer type 'int * _Nonnull'}}
+  p = noneP ?: unspecifiedP;
+  p = noneP ?: noneP;
+}
diff --git a/test/Sema/overloadable.c b/test/Sema/overloadable.c
index 3120649..b518aa9 100644
--- a/test/Sema/overloadable.c
+++ b/test/Sema/overloadable.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -Wincompatible-pointer-types
 
 int var __attribute__((overloadable)); // expected-error{{'overloadable' attribute only applies to functions}}
 void params(void) __attribute__((overloadable(12))); // expected-error {{'overloadable' attribute takes no arguments}}
@@ -99,3 +99,26 @@
   unsigned char *c;
   multi_type(c);
 }
+
+// Ensure that we allow C-specific type conversions in C
+void fn_type_conversions() {
+  void foo(void *c) __attribute__((overloadable));
+  void foo(char *c) __attribute__((overloadable));
+  void (*ptr1)(void *) = &foo;
+  void (*ptr2)(char *) = &foo;
+  void (*ambiguous)(int *) = &foo; // expected-error{{initializing 'void (*)(int *)' with an expression of incompatible type '<overloaded function type>'}} expected-note@105{{candidate function}} expected-note@106{{candidate function}}
+  void *vp_ambiguous = &foo; // expected-error{{initializing 'void *' with an expression of incompatible type '<overloaded function type>'}} expected-note@105{{candidate function}} expected-note@106{{candidate function}}
+
+  void (*specific1)(int *) = (void (*)(void *))&foo; // expected-warning{{incompatible function pointer types initializing 'void (*)(int *)' with an expression of type 'void (*)(void *)'}}
+  void *specific2 = (void (*)(void *))&foo;
+
+  void disabled(void *c) __attribute__((overloadable, enable_if(0, "")));
+  void disabled(int *c) __attribute__((overloadable, enable_if(c, "")));
+  void disabled(char *c) __attribute__((overloadable, enable_if(1, "The function name lies.")));
+  // To be clear, these should all point to the last overload of 'disabled'
+  void (*dptr1)(char *c) = &disabled;
+  void (*dptr2)(void *c) = &disabled; // expected-warning{{incompatible pointer types initializing 'void (*)(void *)' with an expression of type '<overloaded function type>'}} expected-note@115{{candidate function made ineligible by enable_if}} expected-note@116{{candidate function made ineligible by enable_if}} expected-note@117{{candidate function has type mismatch at 1st parameter (expected 'void *' but has 'char *')}}
+  void (*dptr3)(int *c) = &disabled; // expected-warning{{incompatible pointer types initializing 'void (*)(int *)' with an expression of type '<overloaded function type>'}} expected-note@115{{candidate function made ineligible by enable_if}} expected-note@116{{candidate function made ineligible by enable_if}} expected-note@117{{candidate function has type mismatch at 1st parameter (expected 'int *' but has 'char *')}}
+
+  void *specific_disabled = &disabled;
+}
diff --git a/test/Sema/pass-object-size.c b/test/Sema/pass-object-size.c
index 6f375c0..ddfbbd5 100644
--- a/test/Sema/pass-object-size.c
+++ b/test/Sema/pass-object-size.c
@@ -38,8 +38,8 @@
   void (*p)(void *) = NotOverloaded; //expected-error{{cannot take address of function 'NotOverloaded' because parameter 1 has pass_object_size attribute}}
   void (*p2)(void *) = &NotOverloaded; //expected-error{{cannot take address of function 'NotOverloaded' because parameter 1 has pass_object_size attribute}}
 
-  void (*p3)(void *) = IsOverloaded; //expected-error{{initializing 'void (*)(void *)' with an expression of incompatible type '<overloaded function type>'}} expected-note@-6{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-5{{type mismatch}}
-  void (*p4)(void *) = &IsOverloaded; //expected-error{{initializing 'void (*)(void *)' with an expression of incompatible type '<overloaded function type>'}} expected-note@-7{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-6{{type mismatch}}
+  void (*p3)(void *) = IsOverloaded; //expected-warning{{incompatible pointer types initializing 'void (*)(void *)' with an expression of type '<overloaded function type>'}} expected-note@-6{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-5{{type mismatch}}
+  void (*p4)(void *) = &IsOverloaded; //expected-warning{{incompatible pointer types initializing 'void (*)(void *)' with an expression of type '<overloaded function type>'}} expected-note@-7{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-6{{type mismatch}}
 
   void (*p5)(char *) = IsOverloaded;
   void (*p6)(char *) = &IsOverloaded;
diff --git a/test/Sema/pr25786.c b/test/Sema/pr25786.c
new file mode 100644
index 0000000..2ce6531
--- /dev/null
+++ b/test/Sema/pr25786.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -DTEST -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fsyntax-only -verify %s
+
+#if TEST
+void (__attribute__((regparm(3), stdcall)) *pf) (); //expected-warning {{calling convention 'stdcall' ignored for this target}}
+void (__attribute__((regparm(2), stdcall)) foo)(int a) { //expected-warning {{calling convention 'stdcall' ignored for this target}}
+}
+#else
+//expected-no-diagnostics
+void (__attribute__((regparm(3), stdcall)) *pf) ();
+void (__attribute__((regparm(2), stdcall)) foo)(int a) {}
+#endif
diff --git a/test/Sema/predefined-function.c b/test/Sema/predefined-function.c
index 1c40b6e..aa7b285 100644
--- a/test/Sema/predefined-function.c
+++ b/test/Sema/predefined-function.c
@@ -4,14 +4,13 @@
 enum Test {A=-1};
 char *funk(enum Test x);
 
-int eli(float b); // expected-note {{previous declaration is here}} \
-// expected-note{{passing argument to parameter 'b' here}}
+int eli(float b); // expected-note {{previous declaration is here}}
 int b(int c) {return 1;}
 
 int foo();
 int foo() {
   int eli(int (int)); // expected-error {{conflicting types for 'eli'}}
-  eli(b); // expected-error{{passing 'int (int)' to parameter of incompatible type 'float'}}
+  eli(b);
   return 0;
 }
 
diff --git a/test/Sema/renderscript.rs b/test/Sema/renderscript.rs
new file mode 100644
index 0000000..80be5ae
--- /dev/null
+++ b/test/Sema/renderscript.rs
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -x renderscript -D__RENDERSCRIPT__ %s
+// RUN: %clang_cc1 -fsyntax-only -verify -x c %s
+
+#ifndef __RENDERSCRIPT__
+// expected-warning@+2 {{'kernel' attribute ignored}}
+#endif
+void __attribute__((kernel)) kernel() {}
+
+#ifndef __RENDERSCRIPT__
+// expected-warning@+4 {{'kernel' attribute ignored}}
+#else
+// expected-warning@+2 {{'kernel' attribute only applies to functions}}
+#endif
+int __attribute__((kernel)) global;
+
+#ifndef __RENDERSCRIPT__
+// expected-error@+2 {{function return value cannot have __fp16 type; did you forget * ?}}
+#endif
+__fp16 fp16_return();
+
+#ifndef __RENDERSCRIPT__
+// expected-error@+2 {{parameters cannot have __fp16 type; did you forget * ?}}
+#endif
+void fp16_arg(__fp16 p);
diff --git a/test/Sema/shift.c b/test/Sema/shift.c
index 07c5fe5..47744fb 100644
--- a/test/Sema/shift.c
+++ b/test/Sema/shift.c
@@ -67,3 +67,14 @@
     (void) (x >> 80); // no-warning
   (void) (x >> 80); // expected-warning {{shift count >= width of type}}
 }
+
+typedef unsigned vec16 __attribute__((vector_size(16)));
+typedef unsigned vec8 __attribute__((vector_size(8)));
+
+void vect_shift_1(vec16 *x) { *x = *x << 4; }
+
+void vect_shift_2(vec16 *x, vec16 y) { *x = *x << y; }
+
+void vect_shift_3(vec16 *x, vec8 y) {
+  *x = *x << y; // expected-error {{vector operands do not have the same number of elements}}
+}
diff --git a/test/Sema/typo-correction.c b/test/Sema/typo-correction.c
index 4ef5057..59f022d 100644
--- a/test/Sema/typo-correction.c
+++ b/test/Sema/typo-correction.c
@@ -55,3 +55,28 @@
   f(THIS_IS_AN_ERROR, // expected-error {{use of undeclared identifier 'THIS_IS_AN_ERROR'}}
     afunction(afunction_));  // expected-error {{use of undeclared identifier 'afunction_'; did you mean 'afunction'?}}
 }
+
+int d = X ? d : L; // expected-error 2 {{use of undeclared identifier}}
+
+int fn_with_ids() { ID = ID == ID >= ID ; } // expected-error 4 {{use of undeclared identifier}}
+
+int fn_with_rs(int r) { r = TYPO + r * TYPO; } // expected-error 2 {{use of undeclared identifier}}
+
+void fn_with_unknown(int a, int b) {
+  fn_with_unknown(unknown, unknown | unknown); // expected-error 3 {{use of undeclared identifier}}
+}
+
+// Two typos in a parenthesized expression or argument list with a conditional
+// expression caused a crash in C mode.
+//
+// r272587 fixed a similar bug for binary operations. The same fix was needed for
+// conditional expressions.
+
+int g(int x, int y) {
+  return x + y;
+}
+
+int h() {
+  g(x, 5 ? z : 0); // expected-error 2 {{use of undeclared identifier}}
+  (x, 5 ? z : 0);  // expected-error 2 {{use of undeclared identifier}}
+}
diff --git a/test/Sema/unused-expr.c b/test/Sema/unused-expr.c
index 0935968..58ad827 100644
--- a/test/Sema/unused-expr.c
+++ b/test/Sema/unused-expr.c
@@ -76,7 +76,7 @@
 // rdar://7186119
 int t5f(void) __attribute__((warn_unused_result));
 void t5() {
-  t5f();   // expected-warning {{ignoring return value of function declared with warn_unused_result}}
+  t5f();   // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
 }
 
 
@@ -88,11 +88,11 @@
   if (fn1() < 0 || fn2(2,1) < 0 || fn3(2) < 0)  // no warnings
     return -1;
 
-  fn1();  // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  fn1();  // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
   fn2(92, 21);  // expected-warning {{ignoring return value of function declared with pure attribute}}
   fn3(42);  // expected-warning {{ignoring return value of function declared with const attribute}}
   __builtin_abs(0); // expected-warning {{ignoring return value of function declared with const attribute}}
-  (void)0, fn1();  // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  (void)0, fn1();  // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
   return 0;
 }
 
@@ -101,7 +101,7 @@
 // PR4010
 int (*fn4)(void) __attribute__ ((warn_unused_result));
 void t8() {
-  fn4(); // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  fn4(); // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
 }
 
 void t9() __attribute__((warn_unused_result)); // expected-warning {{attribute 'warn_unused_result' cannot be applied to functions without return value}}
diff --git a/test/Sema/vector-cast.c b/test/Sema/vector-cast.c
index 03db540..c038289 100644
--- a/test/Sema/vector-cast.c
+++ b/test/Sema/vector-cast.c
@@ -45,12 +45,25 @@
 }
 
 typedef float float2 __attribute__ ((vector_size (8)));
+typedef __attribute__((vector_size(8))) double float64x1_t;
+typedef __attribute__((vector_size(16))) double float64x2_t;
+float64x1_t vget_low_f64(float64x2_t __p0);
 
 void f4() {
   float2 f2;
-  double d;
+  double d, a, b, c;
+  float64x2_t v = {0.0, 1.0};
   f2 += d;
-  d += f2;
+  a = 3.0 + vget_low_f64(v);
+  b = vget_low_f64(v) + 3.0;
+  c = vget_low_f64(v);
+  // LAX conversions within compound assignments are not supported.
+  // FIXME: This diagnostic is inaccurate.
+  d += f2; // expected-error {{cannot convert between vector values of different size}}
+  c -= vget_low_f64(v); // expected-error {{cannot convert between vector values of different size}}
+  // LAX conversions between scalar and vector types require same size and one element sized vectors.
+  d = f2; // expected-error {{assigning to 'double' from incompatible type 'float2'}}
+  d = d + f2; // expected-error {{assigning to 'double' from incompatible type 'float2'}}
 }
 
 // rdar://15931426
diff --git a/test/Sema/warn-documentation.cpp b/test/Sema/warn-documentation.cpp
index 5d86635..34d8f5f 100644
--- a/test/Sema/warn-documentation.cpp
+++ b/test/Sema/warn-documentation.cpp
@@ -368,6 +368,101 @@
 /// \param aaa Meow.
 typedef foo::not_a_function_wrapper<1> test_not_function_like_typedef4;
 
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using1 = int (int aaa, int ccc);
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using2 = int (*)(int aaa, int ccc);
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using3 = int (* const)(int aaa, int ccc);
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using4 = int (C::*)(int aaa, int ccc);
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using5 = foo::function_wrapper<int (int aaa, int ccc)>;
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using6 = foo::function_wrapper<int (int aaa, int ccc)> *;
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using7 = foo::function_wrapper<int (int aaa, int ccc)> &;
+
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \returns aaa.
+using test_function_like_using8 = foo::function_wrapper<int (int aaa, int ccc)> &&;
+
+// expected-warning@+4 {{template parameter 'U' not found in the template declaration}} expected-note@+4 {{did you mean 'T'?}}
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \tparam U Uuu.
+template<typename T>
+using test_function_like_using9 = int(T aaa, int ccc);
+
+// expected-warning@+4 {{template parameter 'U' not found in the template declaration}} expected-note@+4 {{did you mean 'T'?}}
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \tparam U Uuu.
+template<typename T>
+using test_function_like_using10 = int (*)(T aaa, int ccc);
+
+// expected-warning@+4 {{template parameter 'U' not found in the template declaration}} expected-note@+4 {{did you mean 'T'?}}
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \tparam U Uuu.
+template<typename T>
+using test_function_like_using11 = foo::function_wrapper<int (T aaa, int ccc)>;
+
+// expected-warning@+4 {{template parameter 'U' not found in the template declaration}} expected-note@+4 {{did you mean 'T'?}}
+// expected-warning@+2 {{parameter 'bbb' not found in the function declaration}} expected-note@+2 {{did you mean 'ccc'?}}
+/// \param aaa Meow.
+/// \param bbb Bbb.
+/// \tparam U Uuu.
+template<typename T>
+using test_function_like_using12 = foo::function_wrapper<int (T aaa, int ccc)> *;
+
+using test_not_function_like_using1 = int (*)(int aaa);
+
+// expected-warning@+1 {{'\param' command used in a comment that is not attached to a function declaration}}
+/// \param aaa Meow.
+using test_not_function_like_using2 = test_not_function_like_using1;
+
+// Check that the diagnostic uses the same command marker as the comment.
+// expected-warning@+1 {{'@param' command used in a comment that is not attached to a function declaration}}
+/// @param aaa Meow.
+using test_not_function_like_using3 = unsigned int;
+
+// expected-warning@+1 {{'\param' command used in a comment that is not attached to a function declaration}}
+/// \param aaa Meow.
+using test_not_function_like_using4 = foo::not_a_function_wrapper<1>;
+
 /// \param aaa Aaa
 /// \param ... Vararg
 int test_vararg_param1(int aaa, ...);
diff --git a/test/Sema/warn-double-promotion.c b/test/Sema/warn-double-promotion.c
index b6fd0c5..0cf33e8 100644
--- a/test/Sema/warn-double-promotion.c
+++ b/test/Sema/warn-double-promotion.c
@@ -24,7 +24,7 @@
   return d;  //expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
 }
 
-void Convert(float f, double d, long double ld) {
+void Assignment(float f, double d, long double ld) {
   d = f;  //expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
   ld = f; //expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
   ld = d; //expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
@@ -32,3 +32,43 @@
   f = ld;
   d = ld;
 }
+
+extern void DoubleParameter(double);
+extern void LongDoubleParameter(long double);
+
+void ArgumentPassing(float f, double d) {
+  DoubleParameter(f); // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  LongDoubleParameter(f); // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  LongDoubleParameter(d); // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+}
+
+void BinaryOperator(float f, double d, long double ld) {
+  f = f * d; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  f = d * f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  f = f * ld; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  f = ld * f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  d = d * ld; // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+  d = ld * d; // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+}
+
+void MultiplicationAssignment(float f, double d, long double ld) {
+  d *= f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  ld *= f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  ld *= d; // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+
+  // FIXME: These cases should produce warnings as above.
+  f *= d;
+  f *= ld;
+  d *= ld;
+}
+
+// FIXME: As with a binary operator, the operands to the conditional operator are
+// converted to a common type and should produce a warning.
+void ConditionalOperator(float f, double d, long double ld, int i) {
+  f = i ? f : d;
+  f = i ? d : f;
+  f = i ? f : ld;
+  f = i ? ld : f;
+  d = i ? d : ld;
+  d = i ? ld : d;
+}
diff --git a/test/Sema/wchar.c b/test/Sema/wchar.c
index 9e41f53..74151ed 100644
--- a/test/Sema/wchar.c
+++ b/test/Sema/wchar.c
@@ -4,7 +4,7 @@
 typedef __WCHAR_TYPE__ wchar_t;
 
 #if defined(_WIN32) || defined(_M_IX86) || defined(__CYGWIN__) \
- || defined(_M_X64) || defined(__PS4__) || defined(SHORT_WCHAR)
+ || defined(_M_X64) || defined(__ORBIS__) || defined(SHORT_WCHAR)
   #define WCHAR_T_TYPE unsigned short
 #elif defined(__arm) || defined(__aarch64__)
   #define WCHAR_T_TYPE unsigned int
diff --git a/test/Sema/xray-always-instrument-attr.c b/test/Sema/xray-always-instrument-attr.c
new file mode 100644
index 0000000..3c063e2
--- /dev/null
+++ b/test/Sema/xray-always-instrument-attr.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c11
+void foo() __attribute__((xray_always_instrument));
+
+struct __attribute__((xray_always_instrument)) a { int x; }; // expected-warning {{'xray_always_instrument' attribute only applies to functions and methods}}
+
+void bar() __attribute__((xray_always_instrument("not-supported"))); // expected-error {{'xray_always_instrument' attribute takes no arguments}}
diff --git a/test/Sema/xray-always-instrument-attr.cpp b/test/Sema/xray-always-instrument-attr.cpp
new file mode 100644
index 0000000..8d42837
--- /dev/null
+++ b/test/Sema/xray-always-instrument-attr.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c++11 -x c++
+void foo [[clang::xray_always_instrument]] ();
+
+struct [[clang::xray_always_instrument]] a { int x; }; // expected-warning {{'xray_always_instrument' attribute only applies to functions and methods}}
+
+class b {
+ void c [[clang::xray_always_instrument]] ();
+};
+
+void baz [[clang::xray_always_instrument("not-supported")]] (); // expected-error {{'xray_always_instrument' attribute takes no arguments}}
diff --git a/test/SemaCUDA/Inputs/cuda-initializers.h b/test/SemaCUDA/Inputs/cuda-initializers.h
new file mode 100644
index 0000000..837b726
--- /dev/null
+++ b/test/SemaCUDA/Inputs/cuda-initializers.h
@@ -0,0 +1,145 @@
+// CUDA struct types with interesting initialization properties.
+// Keep in sync with ../CodeGenCUDA/Inputs/cuda-initializers.h.
+
+// Base classes with different initializer variants.
+
+// trivial constructor -- allowed
+struct T {
+  int t;
+};
+
+// empty constructor
+struct EC {
+  int ec;
+  __device__ EC() {}     // -- allowed
+  __device__ EC(int) {}  // -- not allowed
+};
+
+// empty destructor
+struct ED {
+  __device__ ~ED() {}     // -- allowed
+};
+
+struct ECD {
+  __device__ ECD() {}     // -- allowed
+  __device__ ~ECD() {}    // -- allowed
+};
+
+// empty templated constructor -- allowed with no arguments
+struct ETC {
+  template <typename... T> __device__ ETC(T...) {}
+};
+
+// undefined constructor -- not allowed
+struct UC {
+  int uc;
+  __device__ UC();
+};
+
+// undefined destructor -- not allowed
+struct UD {
+  int ud;
+  __device__ ~UD();
+};
+
+// empty constructor w/ initializer list -- not allowed
+struct ECI {
+  int eci;
+  __device__ ECI() : eci(1) {}
+};
+
+// non-empty constructor -- not allowed
+struct NEC {
+  int nec;
+  __device__ NEC() { nec = 1; }
+};
+
+// non-empty destructor -- not allowed
+struct NED {
+  int ned;
+  __device__ ~NED() { ned = 1; }
+};
+
+// no-constructor,  virtual method -- not allowed
+struct NCV {
+  int ncv;
+  __device__ virtual void vm() {}
+};
+
+// virtual destructor -- not allowed.
+struct VD {
+  __device__ virtual ~VD() {}
+};
+
+// dynamic in-class field initializer -- not allowed
+__device__ int f();
+struct NCF {
+  int ncf = f();
+};
+
+// static in-class field initializer.  NVCC does not allow it, but
+// clang generates static initializer for this, so we'll accept it.
+// We still can't use it on __shared__ vars as they don't allow *any*
+// initializers.
+struct NCFS {
+  int ncfs = 3;
+};
+
+// undefined templated constructor -- not allowed
+struct UTC {
+  template <typename... T> __device__ UTC(T...);
+};
+
+// non-empty templated constructor -- not allowed
+struct NETC {
+  int netc;
+  template <typename... T> __device__ NETC(T...) { netc = 1; }
+};
+
+// Regular base class -- allowed
+struct T_B_T : T {};
+
+// Encapsulated object of allowed class -- allowed
+struct T_F_T {
+  T t;
+};
+
+// array of allowed objects -- allowed
+struct T_FA_T {
+  T t[2];
+};
+
+
+// Calling empty base class initializer is OK
+struct EC_I_EC : EC {
+  __device__ EC_I_EC() : EC() {}
+};
+
+// .. though passing arguments is not allowed.
+struct EC_I_EC1 : EC {
+  __device__ EC_I_EC1() : EC(1) {}
+};
+
+// Virtual base class -- not allowed
+struct T_V_T : virtual T {};
+
+// Inherited from or encapsulated class with non-empty constructor --
+// not allowed
+struct T_B_NEC : NEC {};
+struct T_F_NEC {
+  NEC nec;
+};
+struct T_FA_NEC {
+  NEC nec[2];
+};
+
+
+// Inherited from or encapsulated class with non-empty destructor --
+// not allowed
+struct T_B_NED : NED {};
+struct T_F_NED {
+  NED ned;
+};
+struct T_FA_NED {
+  NED ned[2];
+};
diff --git a/test/SemaCUDA/Inputs/cuda.h b/test/SemaCUDA/Inputs/cuda.h
index 18cafdf..d054670 100644
--- a/test/SemaCUDA/Inputs/cuda.h
+++ b/test/SemaCUDA/Inputs/cuda.h
@@ -21,4 +21,9 @@
 
 int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                       cudaStream_t stream = 0);
+
+// Device-side placement new overloads.
+__device__ void *operator new(__SIZE_TYPE__, void *p) { return p; }
+__device__ void *operator new[](__SIZE_TYPE__, void *p) { return p; }
+
 #endif // !__NVCC__
diff --git a/test/SemaCUDA/Inputs/overload.h b/test/SemaCUDA/Inputs/overload.h
new file mode 100644
index 0000000..1c021f1
--- /dev/null
+++ b/test/SemaCUDA/Inputs/overload.h
@@ -0,0 +1,8 @@
+// This header is used by tests which are interested in __device__ functions
+// which appear in a system header.
+
+__device__ int OverloadMe();
+
+namespace ns {
+using ::OverloadMe;
+}
diff --git a/test/SemaCUDA/addr-of-overloaded-fn.cu b/test/SemaCUDA/addr-of-overloaded-fn.cu
new file mode 100644
index 0000000..03c7f7c
--- /dev/null
+++ b/test/SemaCUDA/addr-of-overloaded-fn.cu
@@ -0,0 +1,24 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+__host__ void overload() {}
+__device__ void overload() {}
+
+__host__ __device__ void test_hd() {
+  // This should not be ambiguous -- we choose the host or the device overload
+  // depending on whether or not we're compiling for host or device.
+  void (*x)() = overload;
+}
+
+// These also shouldn't be ambiguous, but they're an easier test than the HD
+// function above.
+__host__ void test_host() {
+  void (*x)() = overload;
+}
+__device__ void test_device() {
+  void (*x)() = overload;
+}
diff --git a/test/SemaCUDA/alias.cu b/test/SemaCUDA/alias.cu
new file mode 100644
index 0000000..39251ed
--- /dev/null
+++ b/test/SemaCUDA/alias.cu
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fsyntax-only -fcuda-is-device -verify -DEXPECT_ERR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+
+// The alias attribute is not allowed in CUDA device code.
+void bar();
+__attribute__((alias("bar"))) void foo();
+#ifdef EXPECT_ERR
+// expected-error@-2 {{CUDA does not support aliases}}
+#else
+// expected-no-diagnostics
+#endif
diff --git a/test/SemaCUDA/bad-attributes.cu b/test/SemaCUDA/bad-attributes.cu
index 7e01e14..4cb43e2 100644
--- a/test/SemaCUDA/bad-attributes.cu
+++ b/test/SemaCUDA/bad-attributes.cu
@@ -4,8 +4,8 @@
 //
 // You should be able to run this file through nvcc for compatibility testing.
 //
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wcuda-compat -verify -DEXPECT_INLINE_WARNING %s
+// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -Wcuda-compat -verify %s
 
 #include "Inputs/cuda.h"
 
@@ -47,3 +47,15 @@
 // expected-note@-1 {{conflicting attribute is here}}
 __global__ __host__ void z12();  // expected-error {{attributes are not compatible}}
 // expected-note@-1 {{conflicting attribute is here}}
+
+struct S {
+  __global__ void foo() {};  // expected-error {{must be a free function or static member function}}
+  __global__ static void bar(); // expected-warning {{kernel function 'bar' is a member function}}
+  // Although this is implicitly inline, we shouldn't warn.
+  __global__ static void baz() {}; // expected-warning {{kernel function 'baz' is a member function}}
+};
+
+__global__ static inline void foobar() {};
+#ifdef EXPECT_INLINE_WARNING
+// expected-warning@-2 {{ignored 'inline' attribute on kernel function 'foobar'}}
+#endif
diff --git a/test/SemaCUDA/builtins.cu b/test/SemaCUDA/builtins.cu
index 32b5758..814fda2 100644
--- a/test/SemaCUDA/builtins.cu
+++ b/test/SemaCUDA/builtins.cu
@@ -7,10 +7,10 @@
 // REQUIRES: nvptx-registered-target
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown \
 // RUN:     -aux-triple nvptx64-unknown-cuda \
-// RUN:     -fcuda-target-overloads -fsyntax-only -verify %s
+// RUN:     -fsyntax-only -verify %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \
 // RUN:     -aux-triple x86_64-unknown-unknown \
-// RUN:     -fcuda-target-overloads -fsyntax-only -verify %s
+// RUN:     -fsyntax-only -verify %s
 
 #if !(defined(__amd64__) && defined(__PTX__))
 #error "Expected to see preprocessor macros from both sides of compilation."
@@ -18,13 +18,13 @@
 
 void hf() {
   int x = __builtin_ia32_rdtsc();
-  int y = __builtin_ptx_read_tid_x(); // expected-note  {{'__builtin_ptx_read_tid_x' declared here}}
-  // expected-error@-1 {{reference to __device__ function '__builtin_ptx_read_tid_x' in __host__ function}}
+  int y = __nvvm_read_ptx_sreg_tid_x(); // expected-note  {{'__nvvm_read_ptx_sreg_tid_x' declared here}}
+  // expected-error@-1 {{reference to __device__ function '__nvvm_read_ptx_sreg_tid_x' in __host__ function}}
   x = __builtin_abs(1);
 }
 
 __attribute__((device)) void df() {
-  int x = __builtin_ptx_read_tid_x();
+  int x = __nvvm_read_ptx_sreg_tid_x();
   int y = __builtin_ia32_rdtsc(); // expected-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}}
                                   // expected-note@20 {{'__builtin_ia32_rdtsc' declared here}}
   x = __builtin_abs(1);
diff --git a/test/SemaCUDA/call-device-fn-from-host.cu b/test/SemaCUDA/call-device-fn-from-host.cu
new file mode 100644
index 0000000..ab88338
--- /dev/null
+++ b/test/SemaCUDA/call-device-fn-from-host.cu
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - -verify
+
+// Note: This test won't work with -fsyntax-only, because some of these errors
+// are emitted during codegen.
+
+#include "Inputs/cuda.h"
+
+__device__ void device_fn() {}
+// expected-note@-1 {{'device_fn' declared here}}
+// expected-note@-2 {{'device_fn' declared here}}
+// expected-note@-3 {{'device_fn' declared here}}
+// expected-note@-4 {{'device_fn' declared here}}
+// expected-note@-5 {{'device_fn' declared here}}
+
+struct S {
+  __device__ S() {}
+  // expected-note@-1 {{'S' declared here}}
+  // expected-note@-2 {{'S' declared here}}
+  __device__ ~S() { device_fn(); }
+  // expected-note@-1 {{'~S' declared here}}
+  int x;
+};
+
+struct T {
+  __host__ __device__ void hd() { device_fn(); }
+  // expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
+
+  // No error; this is (implicitly) inline and is never called, so isn't
+  // codegen'ed.
+  __host__ __device__ void hd2() { device_fn(); }
+
+  __host__ __device__ void hd3();
+
+  __device__ void d() {}
+  // expected-note@-1 {{'d' declared here}}
+};
+
+__host__ __device__ void T::hd3() {
+  device_fn();
+  // expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
+}
+
+template <typename T> __host__ __device__ void hd2() { device_fn(); }
+// expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
+void host_fn() { hd2<int>(); }
+
+__host__ __device__ void hd() { device_fn(); }
+// expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
+
+// No error because this is never instantiated.
+template <typename T> __host__ __device__ void hd3() { device_fn(); }
+
+__host__ __device__ void local_var() {
+  S s;
+  // expected-error@-1 {{reference to __device__ function 'S' in __host__ __device__ function}}
+}
+
+__host__ __device__ void placement_new(char *ptr) {
+  ::new(ptr) S();
+  // expected-error@-1 {{reference to __device__ function 'S' in __host__ __device__ function}}
+}
+
+__host__ __device__ void explicit_destructor(S *s) {
+  s->~S();
+  // expected-error@-1 {{reference to __device__ function '~S' in __host__ __device__ function}}
+}
+
+__host__ __device__ void hd_member_fn() {
+  T t;
+  // Necessary to trigger an error on T::hd.  It's (implicitly) inline, so
+  // isn't codegen'ed until we call it.
+  t.hd();
+}
+
+__host__ __device__ void h_member_fn() {
+  T t;
+  t.d();
+  // expected-error@-1 {{reference to __device__ function 'd' in __host__ __device__ function}}
+}
+
+__host__ __device__ void fn_ptr() {
+  auto* ptr = &device_fn;
+  // expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
+}
+
+template <typename T>
+__host__ __device__ void fn_ptr_template() {
+  auto* ptr = &device_fn;  // Not an error because the template isn't instantiated.
+}
diff --git a/test/SemaCUDA/call-host-fn-from-device.cu b/test/SemaCUDA/call-host-fn-from-device.cu
new file mode 100644
index 0000000..4451883
--- /dev/null
+++ b/test/SemaCUDA/call-host-fn-from-device.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - -verify
+
+// Note: This test won't work with -fsyntax-only, because some of these errors
+// are emitted during codegen.
+
+#include "Inputs/cuda.h"
+
+extern "C" void host_fn() {}
+// expected-note@-1 {{'host_fn' declared here}}
+// expected-note@-2 {{'host_fn' declared here}}
+// expected-note@-3 {{'host_fn' declared here}}
+// expected-note@-4 {{'host_fn' declared here}}
+// expected-note@-5 {{'host_fn' declared here}}
+// expected-note@-6 {{'host_fn' declared here}}
+
+struct S {
+  S() {}
+  // expected-note@-1 {{'S' declared here}}
+  // expected-note@-2 {{'S' declared here}}
+  ~S() { host_fn(); }
+  // expected-note@-1 {{'~S' declared here}}
+  int x;
+};
+
+struct T {
+  __host__ __device__ void hd() { host_fn(); }
+  // expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+
+  // No error; this is (implicitly) inline and is never called, so isn't
+  // codegen'ed.
+  __host__ __device__ void hd2() { host_fn(); }
+
+  __host__ __device__ void hd3();
+
+  void h() {}
+  // expected-note@-1 {{'h' declared here}}
+};
+
+__host__ __device__ void T::hd3() {
+  host_fn();
+  // expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+}
+
+template <typename T> __host__ __device__ void hd2() { host_fn(); }
+// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+__global__ void kernel() { hd2<int>(); }
+
+__host__ __device__ void hd() { host_fn(); }
+// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+
+template <typename T> __host__ __device__ void hd3() { host_fn(); }
+// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+__device__ void device_fn() { hd3<int>(); }
+
+// No error because this is never instantiated.
+template <typename T> __host__ __device__ void hd4() { host_fn(); }
+
+__host__ __device__ void local_var() {
+  S s;
+  // expected-error@-1 {{reference to __host__ function 'S' in __host__ __device__ function}}
+}
+
+__host__ __device__ void placement_new(char *ptr) {
+  ::new(ptr) S();
+  // expected-error@-1 {{reference to __host__ function 'S' in __host__ __device__ function}}
+}
+
+__host__ __device__ void explicit_destructor(S *s) {
+  s->~S();
+  // expected-error@-1 {{reference to __host__ function '~S' in __host__ __device__ function}}
+}
+
+__host__ __device__ void hd_member_fn() {
+  T t;
+  // Necessary to trigger an error on T::hd.  It's (implicitly) inline, so
+  // isn't codegen'ed until we call it.
+  t.hd();
+}
+
+__host__ __device__ void h_member_fn() {
+  T t;
+  t.h();
+  // expected-error@-1 {{reference to __host__ function 'h' in __host__ __device__ function}}
+}
+
+__host__ __device__ void fn_ptr() {
+  auto* ptr = &host_fn;
+  // expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+}
+
+template <typename T>
+__host__ __device__ void fn_ptr_template() {
+  auto* ptr = &host_fn;  // Not an error because the template isn't instantiated.
+}
diff --git a/test/SemaCUDA/call-overloaded-destructor.cu b/test/SemaCUDA/call-overloaded-destructor.cu
new file mode 100644
index 0000000..24b0e7d
--- /dev/null
+++ b/test/SemaCUDA/call-overloaded-destructor.cu
@@ -0,0 +1,17 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+struct S {
+  __host__ ~S() {}
+  __device__ ~S() {}
+};
+
+__host__ __device__ void test() {
+  S s;
+  // This should not crash clang.
+  s.~S();
+}
diff --git a/test/SemaCUDA/cuda-builtin-vars.cu b/test/SemaCUDA/cuda-builtin-vars.cu
index 97c5111..108e75c 100644
--- a/test/SemaCUDA/cuda-builtin-vars.cu
+++ b/test/SemaCUDA/cuda-builtin-vars.cu
@@ -34,20 +34,20 @@
 
   out[i++] = warpSize;
   warpSize = 0; // expected-error {{cannot assign to variable 'warpSize' with const-qualified type 'const int'}}
-  // expected-note@cuda_builtin_vars.h:104 {{variable 'warpSize' declared const here}}
+  // expected-note@cuda_builtin_vars.h:* {{variable 'warpSize' declared const here}}
 
   // Make sure we can't construct or assign to the special variables.
   __cuda_builtin_threadIdx_t x; // expected-error {{calling a private constructor of class '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   __cuda_builtin_threadIdx_t y = threadIdx; // expected-error {{calling a private constructor of class '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   threadIdx = threadIdx; // expected-error {{'operator=' is a private member of '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   void *ptr = &threadIdx; // expected-error {{'operator&' is a private member of '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   // Following line should've caused an error as one is not allowed to
   // take address of a built-in variable in CUDA. Alas there's no way
diff --git a/test/SemaCUDA/device-var-init.cu b/test/SemaCUDA/device-var-init.cu
new file mode 100644
index 0000000..d807a51
--- /dev/null
+++ b/test/SemaCUDA/device-var-init.cu
@@ -0,0 +1,215 @@
+// REQUIRES: nvptx-registered-target
+
+// Make sure we don't allow dynamic initialization for device
+// variables, but accept empty constructors allowed by CUDA.
+
+// RUN: %clang_cc1 -verify %s -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 %s
+
+#ifdef __clang__
+#include "Inputs/cuda.h"
+#endif
+
+// Use the types we share with CodeGen tests.
+#include "Inputs/cuda-initializers.h"
+
+__shared__ int s_v_i = 1;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+__device__ int d_v_f = f();
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ int s_v_f = f();
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ int c_v_f = f();
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__shared__ T s_t_i = {2};
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+__device__ EC d_ec_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ EC s_ec_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ EC c_ec_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ EC d_ec_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ EC s_ec_i2 = {3};
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ EC c_ec_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ ETC d_etc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ ETC s_etc_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ ETC c_etc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ ETC d_etc_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ ETC s_etc_i2 = {3};
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ ETC c_etc_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ UC d_uc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UC s_uc;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UC c_uc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ UD d_ud;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UD s_ud;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UD c_ud;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ ECI d_eci;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ ECI s_eci;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ ECI c_eci;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NEC d_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NEC s_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NEC c_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NED d_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NED s_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NED c_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NCV d_ncv;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NCV s_ncv;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NCV c_ncv;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ VD d_vd;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ VD s_vd;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ VD c_vd;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NCF d_ncf;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NCF s_ncf;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NCF c_ncf;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__shared__ NCFS s_ncfs;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+__device__ UTC d_utc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UTC s_utc;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UTC c_utc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ UTC d_utc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UTC s_utc_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UTC c_utc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NETC d_netc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NETC s_netc;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NETC c_netc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NETC d_netc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NETC s_netc_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NETC c_netc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ EC_I_EC1 d_ec_i_ec1;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ EC_I_EC1 s_ec_i_ec1;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ EC_I_EC1 c_ec_i_ec1;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_V_T d_t_v_t;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_V_T s_t_v_t;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_V_T c_t_v_t;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_B_NEC d_t_b_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_B_NEC s_t_b_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_B_NEC c_t_b_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_F_NEC d_t_f_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_F_NEC s_t_f_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_F_NEC c_t_f_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_FA_NEC d_t_fa_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_FA_NEC s_t_fa_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_FA_NEC c_t_fa_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_B_NED d_t_b_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_B_NED s_t_b_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_B_NED c_t_b_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_F_NED d_t_f_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_F_NED s_t_f_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_F_NED c_t_f_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_FA_NED d_t_fa_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_FA_NED s_t_fa_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_FA_NED c_t_fa_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+// Verify that only __shared__ local variables may be static on device
+// side and that they are not allowed to be initialized.
+__device__ void df_sema() {
+  static __shared__ NCFS s_ncfs;
+  // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+  static __shared__ UC s_uc;
+  // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+  static __shared__ NED s_ned;
+  // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+  static __device__ int ds;
+  // expected-error@-1 {{Within a __device__/__global__ function, only __shared__ variables may be marked "static"}}
+  static __constant__ int dc;
+  // expected-error@-1 {{Within a __device__/__global__ function, only __shared__ variables may be marked "static"}}
+  static int v;
+  // expected-error@-1 {{Within a __device__/__global__ function, only __shared__ variables may be marked "static"}}
+}
diff --git a/test/SemaCUDA/function-overload.cu b/test/SemaCUDA/function-overload.cu
index bd3fb50..3c78600 100644
--- a/test/SemaCUDA/function-overload.cu
+++ b/test/SemaCUDA/function-overload.cu
@@ -1,237 +1,206 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// Make sure we handle target overloads correctly.
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
-// RUN:    -fsyntax-only -fcuda-target-overloads -verify %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \
-// RUN:    -fsyntax-only -fcuda-target-overloads -fcuda-is-device -verify %s
-
-// Check target overloads handling with disabled call target checks.
-// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -fsyntax-only \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads -verify %s
-// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -fsyntax-only \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads \
-// RUN:    -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
 
 #include "Inputs/cuda.h"
 
-typedef int (*fp_t)(void);
-typedef void (*gp_t)(void);
+// Opaque return types used to check that we pick the right overloads.
+struct HostReturnTy {};
+struct HostReturnTy2 {};
+struct DeviceReturnTy {};
+struct DeviceReturnTy2 {};
+struct HostDeviceReturnTy {};
+struct TemplateReturnTy {};
 
-// Host and unattributed functions can't be overloaded
-__host__ int hh(void) { return 1; } // expected-note {{previous definition is here}}
-int hh(void) { return 1; } // expected-error {{redefinition of 'hh'}}
+typedef HostReturnTy (*HostFnPtr)();
+typedef DeviceReturnTy (*DeviceFnPtr)();
+typedef HostDeviceReturnTy (*HostDeviceFnPtr)();
+typedef void (*GlobalFnPtr)();  // __global__ functions must return void.
 
-// H/D overloading is OK
-__host__ int dh(void) { return 2; }
-__device__ int dh(void) { return 2; }
+// CurrentReturnTy is {HostReturnTy,DeviceReturnTy} during {host,device}
+// compilation.
+#ifdef __CUDA_ARCH__
+typedef DeviceReturnTy CurrentReturnTy;
+#else
+typedef HostReturnTy CurrentReturnTy;
+#endif
 
-// H/HD and D/HD are not allowed
-__host__ __device__ int hdh(void) { return 5; } // expected-note {{previous definition is here}}
-__host__ int hdh(void) { return 4; } // expected-error {{redefinition of 'hdh'}}
+// CurrentFnPtr is a function pointer to a {host,device} function during
+// {host,device} compilation.
+typedef CurrentReturnTy (*CurrentFnPtr)();
 
-__host__ int hhd(void) { return 4; } // expected-note {{previous definition is here}}
-__host__ __device__ int hhd(void) { return 5; } // expected-error {{redefinition of 'hhd'}}
+// Host and unattributed functions can't be overloaded.
+__host__ void hh() {} // expected-note {{previous definition is here}}
+void hh() {} // expected-error {{redefinition of 'hh'}}
+
+// H/D overloading is OK.
+__host__ HostReturnTy dh() { return HostReturnTy(); }
+__device__ DeviceReturnTy dh() { return DeviceReturnTy(); }
+
+// H/HD and D/HD are not allowed.
+__host__ __device__ int hdh() { return 0; } // expected-note {{previous definition is here}}
+__host__ int hdh() { return 0; }            // expected-error {{redefinition of 'hdh'}}
+
+__host__ int hhd() { return 0; }            // expected-note {{previous definition is here}}
+__host__ __device__ int hhd() { return 0; } // expected-error {{redefinition of 'hhd'}}
 // expected-warning@-1 {{attribute declaration must precede definition}}
 // expected-note@-3 {{previous definition is here}}
 
-__host__ __device__ int hdd(void) { return 7; } // expected-note {{previous definition is here}}
-__device__ int hdd(void) { return 6; } // expected-error {{redefinition of 'hdd'}}
+__host__ __device__ int hdd() { return 0; } // expected-note {{previous definition is here}}
+__device__ int hdd() { return 0; }          // expected-error {{redefinition of 'hdd'}}
 
-__device__ int dhd(void) { return 6; } // expected-note {{previous definition is here}}
-__host__ __device__ int dhd(void) { return 7; } // expected-error {{redefinition of 'dhd'}}
+__device__ int dhd() { return 0; }          // expected-note {{previous definition is here}}
+__host__ __device__ int dhd() { return 0; } // expected-error {{redefinition of 'dhd'}}
 // expected-warning@-1 {{attribute declaration must precede definition}}
 // expected-note@-3 {{previous definition is here}}
 
-// Same tests for extern "C" functions
-extern "C" __host__ int chh(void) {return 11;} // expected-note {{previous definition is here}}
-extern "C" int chh(void) {return 11;} // expected-error {{redefinition of 'chh'}}
+// Same tests for extern "C" functions.
+extern "C" __host__ int chh() { return 0; } // expected-note {{previous definition is here}}
+extern "C" int chh() { return 0; }          // expected-error {{redefinition of 'chh'}}
 
-// H/D overloading is OK
-extern "C" __device__ int cdh(void) {return 10;}
-extern "C" __host__ int cdh(void) {return 11;}
+// H/D overloading is OK.
+extern "C" __device__ DeviceReturnTy cdh() { return DeviceReturnTy(); }
+extern "C" __host__ HostReturnTy cdh() { return HostReturnTy(); }
 
 // H/HD and D/HD overloading is not allowed.
-extern "C" __host__ __device__ int chhd1(void) {return 12;} // expected-note {{previous definition is here}}
-extern "C" __host__ int chhd1(void) {return 13;} // expected-error {{redefinition of 'chhd1'}}
+extern "C" __host__ __device__ int chhd1() { return 0; } // expected-note {{previous definition is here}}
+extern "C" __host__ int chhd1() { return 0; }            // expected-error {{redefinition of 'chhd1'}}
 
-extern "C" __host__ int chhd2(void) {return 13;} // expected-note {{previous definition is here}}
-extern "C" __host__ __device__ int chhd2(void) {return 12;} // expected-error {{redefinition of 'chhd2'}}
+extern "C" __host__ int chhd2() { return 0; }            // expected-note {{previous definition is here}}
+extern "C" __host__ __device__ int chhd2() { return 0; } // expected-error {{redefinition of 'chhd2'}}
 // expected-warning@-1 {{attribute declaration must precede definition}}
 // expected-note@-3 {{previous definition is here}}
 
 // Helper functions to verify calling restrictions.
-__device__ int d(void) { return 8; }
-__host__ int h(void) { return 9; }
-__global__ void g(void) {}
-extern "C" __device__ int cd(void) {return 10;}
-extern "C" __host__ int ch(void) {return 11;}
+__device__ DeviceReturnTy d() { return DeviceReturnTy(); }
+// expected-note@-1 1+ {{'d' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
 
-__host__ void hostf(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __device__ function 'd' in __host__ function}}
-  // expected-note@65 {{'d' declared here}}
-  // expected-error@-4 {{reference to __device__ function 'cd' in __host__ function}}
-  // expected-note@68 {{'cd' declared here}}
-#endif
-  fp_t hp = h;
-  fp_t chp = ch;
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g;
+__host__ HostReturnTy h() { return HostReturnTy(); }
+// expected-note@-1 1+ {{'h' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
+// expected-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
-  d();
-  cd();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'd'}}
-  // expected-note@65 {{candidate function not viable: call to __device__ function from __host__ function}}
-  // expected-error@-4 {{no matching function for call to 'cd'}}
-  // expected-note@68 {{candidate function not viable: call to __device__ function from __host__ function}}
-#endif
-  h();
-  ch();
-  dh();
-  cdh();
+__global__ void g() {}
+// expected-note@-1 1+ {{'g' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
+// expected-note@-4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
+
+extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
+// expected-note@-1 1+ {{'cd' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
+
+extern "C" __host__ HostReturnTy ch() { return HostReturnTy(); }
+// expected-note@-1 1+ {{'ch' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
+// expected-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+
+__host__ void hostf() {
+  DeviceFnPtr fp_d = d;         // expected-error {{reference to __device__ function 'd' in __host__ function}}
+  DeviceReturnTy ret_d = d();   // expected-error {{no matching function for call to 'd'}}
+  DeviceFnPtr fp_cd = cd;       // expected-error {{reference to __device__ function 'cd' in __host__ function}}
+  DeviceReturnTy ret_cd = cd(); // expected-error {{no matching function for call to 'cd'}}
+
+  HostFnPtr fp_h = h;
+  HostReturnTy ret_h = h();
+  HostFnPtr fp_ch = ch;
+  HostReturnTy ret_ch = ch();
+
+  HostFnPtr fp_dh = dh;
+  HostReturnTy ret_dh = dh();
+  HostFnPtr fp_cdh = cdh;
+  HostReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g;
   g(); // expected-error {{call to global function g not configured}}
-  g<<<0,0>>>();
+  g<<<0, 0>>>();
 }
 
+__device__ void devicef() {
+  DeviceFnPtr fp_d = d;
+  DeviceReturnTy ret_d = d();
+  DeviceFnPtr fp_cd = cd;
+  DeviceReturnTy ret_cd = cd();
 
-__device__ void devicef(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-  fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __device__ function}}
-  // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __device__ function}}
-  // expected-note@69 {{'ch' declared here}}
-#endif
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __device__ function}}
-               // expected-note@67 {{'g' declared here}}
+  HostFnPtr fp_h = h;         // expected-error {{reference to __host__ function 'h' in __device__ function}}
+  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostFnPtr fp_ch = ch;       // expected-error {{reference to __host__ function 'ch' in __device__ function}}
+  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
 
-  d();
-  cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
-  // expected-note@66 {{candidate function not viable: call to __host__ function from __device__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
-  // expected-note@69 {{candidate function not viable: call to __host__ function from __device__ function}}
-#endif
-  dh();
-  cdh();
+  DeviceFnPtr fp_dh = dh;
+  DeviceReturnTy ret_dh = dh();
+  DeviceFnPtr fp_cdh = cdh;
+  DeviceReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g; // expected-error {{reference to __global__ function 'g' in __device__ function}}
   g(); // expected-error {{no matching function for call to 'g'}}
-  // expected-note@67 {{candidate function not viable: call to __global__ function from __device__ function}}
   g<<<0,0>>>(); // expected-error {{reference to __global__ function 'g' in __device__ function}}
-  // expected-note@67 {{'g' declared here}}
 }
 
-__global__ void globalf(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-  fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __global__ function}}
-  // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __global__ function}}
-  // expected-note@69 {{'ch' declared here}}
-#endif
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __global__ function}}
-               // expected-note@67 {{'g' declared here}}
+__global__ void globalf() {
+  DeviceFnPtr fp_d = d;
+  DeviceReturnTy ret_d = d();
+  DeviceFnPtr fp_cd = cd;
+  DeviceReturnTy ret_cd = cd();
 
-  d();
-  cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
-  // expected-note@66 {{candidate function not viable: call to __host__ function from __global__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
-  // expected-note@69 {{candidate function not viable: call to __host__ function from __global__ function}}
-#endif
-  dh();
-  cdh();
+  HostFnPtr fp_h = h;         // expected-error {{reference to __host__ function 'h' in __global__ function}}
+  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostFnPtr fp_ch = ch;       // expected-error {{reference to __host__ function 'ch' in __global__ function}}
+  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+
+  DeviceFnPtr fp_dh = dh;
+  DeviceReturnTy ret_dh = dh();
+  DeviceFnPtr fp_cdh = cdh;
+  DeviceReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g; // expected-error {{reference to __global__ function 'g' in __global__ function}}
   g(); // expected-error {{no matching function for call to 'g'}}
-  // expected-note@67 {{candidate function not viable: call to __global__ function from __global__ function}}
   g<<<0,0>>>(); // expected-error {{reference to __global__ function 'g' in __global__ function}}
-  // expected-note@67 {{'g' declared here}}
 }
 
-__host__ __device__ void hostdevicef(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-  fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-#if !defined(__CUDA_ARCH__)
-  // expected-error@-6 {{reference to __device__ function 'd' in __host__ __device__ function}}
-  // expected-note@65 {{'d' declared here}}
-  // expected-error@-7 {{reference to __device__ function 'cd' in __host__ __device__ function}}
-  // expected-note@68 {{'cd' declared here}}
-#else
-  // expected-error@-9 {{reference to __host__ function 'h' in __host__ __device__ function}}
-  // expected-note@66 {{'h' declared here}}
-  // expected-error@-10 {{reference to __host__ function 'ch' in __host__ __device__ function}}
-  // expected-note@69 {{'ch' declared here}}
-#endif
-#endif
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g;
+__host__ __device__ void hostdevicef() {
+  DeviceFnPtr fp_d = d;
+  DeviceReturnTy ret_d = d();
+  DeviceFnPtr fp_cd = cd;
+  DeviceReturnTy ret_cd = cd();
+
+  HostFnPtr fp_h = h;
+  HostReturnTy ret_h = h();
+  HostFnPtr fp_ch = ch;
+  HostReturnTy ret_ch = ch();
+
+  CurrentFnPtr fp_dh = dh;
+  CurrentReturnTy ret_dh = dh();
+  CurrentFnPtr fp_cdh = cdh;
+  CurrentReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g;
 #if defined(__CUDA_ARCH__)
   // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-  // expected-note@67 {{'g' declared here}}
 #endif
-
-  d();
-  cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-#if !defined(__CUDA_ARCH__)
-  // expected-error@-6 {{no matching function for call to 'd'}}
-  // expected-note@65 {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-  // expected-error@-7 {{no matching function for call to 'cd'}}
-  // expected-note@68 {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-#else
-  // expected-error@-9 {{no matching function for call to 'h'}}
-  // expected-note@66 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-  // expected-error@-10 {{no matching function for call to 'ch'}}
-  // expected-note@69 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-#endif
-#endif
-
-  dh();
-  cdh();
   g();
   g<<<0,0>>>();
 #if !defined(__CUDA_ARCH__)
   // expected-error@-3 {{call to global function g not configured}}
 #else
   // expected-error@-5 {{no matching function for call to 'g'}}
-  // expected-note@67 {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-  // expected-error@-6 {{reference to __global__ function 'g' in __host__ __device__ function}}
-  // expected-note@67 {{'g' declared here}}
+  // expected-error@-5 {{reference to __global__ function 'g' in __host__ __device__ function}}
 #endif  // __CUDA_ARCH__
 }
 
 // Test for address of overloaded function resolution in the global context.
-fp_t hp = h;
-fp_t chp = ch;
-fp_t dhp = dh;
-fp_t cdhp = cdh;
-gp_t gp = g;
+HostFnPtr fp_h = h;
+HostFnPtr fp_ch = ch;
+CurrentFnPtr fp_dh = dh;
+CurrentFnPtr fp_cdh = cdh;
+GlobalFnPtr fp_g = g;
 
 
 // Test overloading of destructors
@@ -315,3 +284,98 @@
   __host__ __device__ void operator delete(void *ptr) {} // expected-note {{previous declaration is here}}
   __device__ void operator delete(void *ptr) {} // expected-error {{class member cannot be redeclared}}
 };
+
+// __global__ functions can't be overloaded based on attribute
+// difference.
+struct G {
+  friend void friend_of_g(G &arg);
+private:
+  int x;
+};
+__global__ void friend_of_g(G &arg) { int x = arg.x; } // expected-note {{previous definition is here}}
+void friend_of_g(G &arg) { int x = arg.x; } // expected-error {{redefinition of 'friend_of_g'}}
+
+// HD functions are sometimes allowed to call H or D functions -- this
+// is an artifact of the source-to-source splitting performed by nvcc
+// that we need to mimic. During device mode compilation in nvcc, host
+// functions aren't present at all, so they don't participate in
+// overloading. But in clang, H and D functions are present in both
+// compilation modes. Clang normally uses the target attribute as a
+// tiebreaker between overloads with otherwise identical priority, but
+// in order to match nvcc's behavior, we sometimes need to wholly
+// discard overloads that would not be present during compilation
+// under nvcc.
+
+template <typename T> TemplateReturnTy template_vs_function(T arg) {
+  return TemplateReturnTy();
+}
+__device__ DeviceReturnTy template_vs_function(float arg) {
+  return DeviceReturnTy();
+}
+
+// Here we expect to call the templated function during host compilation, even
+// though C++ overload rules prefer the non-templated function: the __device__
+// overload would not be present on the host under nvcc, so it is discarded.
+__host__ __device__ void test_host_device_calls_template(void) {
+#ifdef __CUDA_ARCH__
+  typedef DeviceReturnTy ExpectedReturnTy;
+#else
+  typedef TemplateReturnTy ExpectedReturnTy;
+#endif
+
+  ExpectedReturnTy ret1 = template_vs_function(1.0f);
+  ExpectedReturnTy ret2 = template_vs_function(2.0);
+}
+
+// Calls from __host__ and __device__ functions should always call the
+// overloaded function that matches their mode.
+__host__ void test_host_calls_template_fn() {
+  TemplateReturnTy ret1 = template_vs_function(1.0f);
+  TemplateReturnTy ret2 = template_vs_function(2.0);
+}
+
+__device__ void test_device_calls_template_fn() {
+  DeviceReturnTy ret1 = template_vs_function(1.0f);
+  DeviceReturnTy ret2 = template_vs_function(2.0);
+}
+
+// If we have a mix of HD and H-only or D-only candidates in the overload set,
+// normal C++ overload resolution rules apply first.
+template <typename T> TemplateReturnTy template_vs_hd_function(T arg) {
+  return TemplateReturnTy();
+}
+__host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) {
+  return HostDeviceReturnTy();
+}
+
+__host__ __device__ void test_host_device_calls_hd_template() {
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  TemplateReturnTy ret2 = template_vs_hd_function(1);
+}
+
+__host__ void test_host_calls_hd_template() {
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  TemplateReturnTy ret2 = template_vs_hd_function(1);
+}
+
+__device__ void test_device_calls_hd_template() {
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  // The host-only function template is not callable with strict call checks,
+  // so on the device side the HD function is the only viable choice.
+  HostDeviceReturnTy ret2 = template_vs_hd_function(1);
+}
+
+// Check that overloads still work the same way on both host and
+// device side when the overload set contains only functions from one
+// side of compilation.
+__device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
+__device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
+__host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
+__host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
+
+__host__ __device__ void test_host_device_single_side_overloading() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+}
diff --git a/test/SemaCUDA/function-target-disabled-check.cu b/test/SemaCUDA/function-target-disabled-check.cu
deleted file mode 100644
index 979d4ed..0000000
--- a/test/SemaCUDA/function-target-disabled-check.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-// Test that we can disable cross-target call checks in Sema with the
-// -fcuda-disable-target-call-checks flag. Without this flag we'd get a bunch
-// of errors here, since there are invalid cross-target calls present.
-
-// RUN: %clang_cc1 -fsyntax-only -verify %s -fcuda-disable-target-call-checks
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s -fcuda-disable-target-call-checks
-
-// expected-no-diagnostics
-
-#define __device__ __attribute__((device))
-#define __global__ __attribute__((global))
-#define __host__ __attribute__((host))
-
-__attribute__((host)) void h1();
-
-__attribute__((device)) void d1() {
-  h1();
-}
-
-__attribute__((host)) void h2() {
-  d1();
-}
-
-__attribute__((global)) void g1() {
-  h2();
-}
diff --git a/test/SemaCUDA/function-target-hd.cu b/test/SemaCUDA/function-target-hd.cu
deleted file mode 100644
index 685f4f9..0000000
--- a/test/SemaCUDA/function-target-hd.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-// Test the Sema analysis of caller-callee relationships of host device
-// functions when compiling CUDA code. There are 4 permutations of this test as
-// host and device compilation are separate compilation passes, and clang has
-// an option to allow host calls from host device functions. __CUDA_ARCH__ is
-// defined when compiling for the device and TEST_WARN_HD when host calls are
-// allowed from host device functions. So for example, if __CUDA_ARCH__ is
-// defined and TEST_WARN_HD is not then device compilation is happening but
-// host device functions are not allowed to call device functions.
-
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -verify %s
-// RUN: %clang_cc1 -fsyntax-only -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
-
-#include "Inputs/cuda.h"
-
-__host__ void hd1h(void);
-#if defined(__CUDA_ARCH__) && !defined(TEST_WARN_HD)
-// expected-note@-2 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-#endif
-__device__ void hd1d(void);
-#ifndef __CUDA_ARCH__
-// expected-note@-2 {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-#endif
-__host__ void hd1hg(void);
-__device__ void hd1dg(void);
-#ifdef __CUDA_ARCH__
-__host__ void hd1hig(void);
-#if !defined(TEST_WARN_HD)
-// expected-note@-2 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-#endif
-#else
-__device__ void hd1dig(void); // expected-note {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-#endif
-__host__ __device__ void hd1hd(void);
-__global__ void hd1g(void); // expected-note {{'hd1g' declared here}}
-
-__host__ __device__ void hd1(void) {
-#if defined(TEST_WARN_HD) && defined(__CUDA_ARCH__)
-// expected-warning@-2 {{calling __host__ function hd1h from __host__ __device__ function hd1}}
-// expected-warning@-3 {{calling __host__ function hd1hig from __host__ __device__ function hd1}}
-#endif
-  hd1d();
-#ifndef __CUDA_ARCH__
-// expected-error@-2 {{no matching function}}
-#endif
-  hd1h();
-#if defined(__CUDA_ARCH__) && !defined(TEST_WARN_HD)
-// expected-error@-2 {{no matching function}}
-#endif
-
-  // No errors as guarded
-#ifdef __CUDA_ARCH__
-  hd1d();
-#else
-  hd1h();
-#endif
-
-  // Errors as incorrectly guarded
-#ifndef __CUDA_ARCH__
-  hd1dig(); // expected-error {{no matching function}}
-#else
-  hd1hig();
-#ifndef TEST_WARN_HD
-// expected-error@-2 {{no matching function}}
-#endif
-#endif
-
-  hd1hd();
-  hd1g<<<1, 1>>>(); // expected-error {{reference to __global__ function 'hd1g' in __host__ __device__ function}}
-}
diff --git a/test/SemaCUDA/global-initializers-host.cu b/test/SemaCUDA/global-initializers-host.cu
new file mode 100644
index 0000000..810c6b9
--- /dev/null
+++ b/test/SemaCUDA/global-initializers-host.cu
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s --std=c++11 -triple x86_64-linux-unknown -fsyntax-only -o - -verify
+
+#include "Inputs/cuda.h"
+
+// Check that we get an error if we try to call a __device__ function from a
+// module initializer.
+
+struct S {
+  __device__ S() {}
+  // expected-note@-1 {{'S' declared here}}
+};
+
+S s;
+// expected-error@-1 {{reference to __device__ function 'S' in global initializer}}
+
+struct T {
+  __host__ __device__ T() {}
+};
+T t;  // No error, this is OK.
+
+struct U {
+  __host__ U() {}
+  __device__ U(int) {}
+  // expected-note@-1 {{'U' declared here}}
+};
+U u(42);
+// expected-error@-1 {{reference to __device__ function 'U' in global initializer}}
+
+__device__ int device_fn() { return 42; }
+// expected-note@-1 {{'device_fn' declared here}}
+int n = device_fn();
+// expected-error@-1 {{reference to __device__ function 'device_fn' in global initializer}}
diff --git a/test/SemaCUDA/host-device-constexpr.cu b/test/SemaCUDA/host-device-constexpr.cu
new file mode 100644
index 0000000..6625d72
--- /dev/null
+++ b/test/SemaCUDA/host-device-constexpr.cu
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -isystem %S/Inputs %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -isystem %S/Inputs %s -fcuda-is-device
+
+#include "Inputs/cuda.h"
+
+// Declares one function and pulls it into namespace ns:
+//
+//   __device__ int OverloadMe();
+//   namespace ns { using ::OverloadMe; }
+//
+// Clang cares that this is done in a system header.
+#include <overload.h>
+
+// Opaque type used to determine which overload we're invoking.
+struct HostReturnTy {};
+
+// These shouldn't become host+device because they already have attributes.
+__host__ constexpr int HostOnly() { return 0; }
+// expected-note@-1 0+ {{not viable}}
+__device__ constexpr int DeviceOnly() { return 0; }
+// expected-note@-1 0+ {{not viable}}
+
+constexpr int HostDevice() { return 0; }
+
+// This should be a host-only function, because there's a previous __device__
+// overload in <overload.h>.
+constexpr HostReturnTy OverloadMe() { return HostReturnTy(); }
+
+namespace ns {
+// The "using" statement in overload.h should prevent OverloadMe from being
+// implicitly host+device.
+constexpr HostReturnTy OverloadMe() { return HostReturnTy(); }
+}  // namespace ns
+
+// This is an error, because NonSysHdrOverload was not defined in a system
+// header.
+__device__ int NonSysHdrOverload() { return 0; }
+// expected-note@-1 {{conflicting __device__ function declared here}}
+constexpr int NonSysHdrOverload() { return 0; }
+// expected-error@-1 {{constexpr function 'NonSysHdrOverload' without __host__ or __device__ attributes}}
+
+// Variadic device functions are not allowed, so this is just treated as
+// host-only.
+constexpr void Variadic(const char*, ...);
+// expected-note@-1 {{call to __host__ function from __device__ function}}
+
+__host__ void HostFn() {
+  HostOnly();
+  DeviceOnly(); // expected-error {{no matching function}}
+  HostReturnTy x = OverloadMe();
+  HostReturnTy y = ns::OverloadMe();
+  Variadic("abc", 42);
+}
+
+__device__ void DeviceFn() {
+  HostOnly(); // expected-error {{no matching function}}
+  DeviceOnly();
+  int x = OverloadMe();
+  int y = ns::OverloadMe();
+  Variadic("abc", 42); // expected-error {{no matching function}}
+}
+
+__host__ __device__ void HostDeviceFn() {
+#ifdef __CUDA_ARCH__
+  int y = OverloadMe();
+#else
+  constexpr HostReturnTy y = OverloadMe();
+#endif
+}
diff --git a/test/SemaCUDA/implicit-intrinsic.cu b/test/SemaCUDA/implicit-intrinsic.cu
index 0793d64..dba26c5 100644
--- a/test/SemaCUDA/implicit-intrinsic.cu
+++ b/test/SemaCUDA/implicit-intrinsic.cu
@@ -1,7 +1,5 @@
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device \
 // RUN:     -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device \
-// RUN:     -fcuda-target-overloads -fsyntax-only -verify %s
 
 #include "Inputs/cuda.h"
 
diff --git a/test/SemaCUDA/implicit-member-target-collision-cxx11.cu b/test/SemaCUDA/implicit-member-target-collision-cxx11.cu
index f038c37..7aa1dd3 100644
--- a/test/SemaCUDA/implicit-member-target-collision-cxx11.cu
+++ b/test/SemaCUDA/implicit-member-target-collision-cxx11.cu
@@ -74,13 +74,11 @@
 struct C4_with_collision : A4_with_host_copy_ctor, B4_with_device_copy_ctor {
 };
 
-// expected-note@-3 {{candidate constructor (the implicit default constructor}} not viable
-// expected-note@-4 {{implicit copy constructor inferred target collision}}
-// expected-note@-5 {{candidate constructor (the implicit copy constructor}} not viable
+// expected-note@-3 {{copy constructor of 'C4_with_collision' is implicitly deleted because base class 'B4_with_device_copy_ctor' has no copy constructor}}
 
 void hostfoo4() {
   C4_with_collision c;
-  C4_with_collision c2 = c; // expected-error {{no matching constructor}}
+  C4_with_collision c2 = c; // expected-error {{call to implicitly-deleted copy constructor of 'C4_with_collision'}}
 }
 
 //------------------------------------------------------------------------------
diff --git a/test/SemaCUDA/implicit-member-target.cu b/test/SemaCUDA/implicit-member-target.cu
index 6064560..242d345 100644
--- a/test/SemaCUDA/implicit-member-target.cu
+++ b/test/SemaCUDA/implicit-member-target.cu
@@ -60,13 +60,14 @@
 
 struct B3_with_implicit_ctors : A3_with_device_ctors {
 };
+// expected-note@-2 2{{call to __device__ function from __host__ function}}
+// expected-note@-3 {{default constructor}}
 
-// expected-note@-3 {{copy constructor of 'B3_with_implicit_ctors' is implicitly deleted}}
 
 void hostfoo3() {
   B3_with_implicit_ctors b;  // this is OK because the inferred default ctor
                              // here is __host__
-  B3_with_implicit_ctors b2 = b; // expected-error {{call to implicitly-deleted copy constructor}}
+  B3_with_implicit_ctors b2 = b; // expected-error {{no matching constructor}}
 
 }
 
diff --git a/test/SemaCUDA/method-target.cu b/test/SemaCUDA/method-target.cu
index 4fa2907..5056645 100644
--- a/test/SemaCUDA/method-target.cu
+++ b/test/SemaCUDA/method-target.cu
@@ -6,7 +6,7 @@
 // Test 1: host method called from device function
 
 struct S1 {
-  void method() {}
+  void method() {} // expected-note {{'method' declared here}}
 };
 
 __device__ void foo1(S1& s) {
@@ -29,7 +29,7 @@
 // Test 3: device method called from host function
 
 struct S3 {
-  __device__ void method() {}
+  __device__ void method() {} // expected-note {{'method' declared here}}
 };
 
 void foo3(S3& s) {
@@ -44,7 +44,7 @@
 };
 
 __host__ __device__ void foo4(S4& s) {
-  s.method(); // expected-error {{reference to __device__ function 'method' in __host__ __device__ function}}
+  s.method();
 }
 
 //------------------------------------------------------------------------------
@@ -63,7 +63,7 @@
 // Test 6: call method through pointer
 
 struct S6 {
-  void method() {}
+  void method() {} // expected-note {{'method' declared here}}
 };
 
 __device__ void foo6(S6* s) {
diff --git a/test/SemaCUDA/no-host-device-constexpr.cu b/test/SemaCUDA/no-host-device-constexpr.cu
new file mode 100644
index 0000000..c70d97d
--- /dev/null
+++ b/test/SemaCUDA/no-host-device-constexpr.cu
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -fno-cuda-host-device-constexpr -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -fno-cuda-host-device-constexpr -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+// Check that, with -fno-cuda-host-device-constexpr, constexpr functions are
+// host-only, and __device__ constexpr functions are still device-only.
+
+constexpr int f() { return 0; } // expected-note {{not viable}}
+__device__ constexpr int g() { return 0; } // expected-note {{not viable}}
+
+void __device__ foo() {
+  f(); // expected-error {{no matching function}}
+  g();
+}
+
+void __host__ foo() {
+  f();
+  g(); // expected-error {{no matching function}}
+}
diff --git a/test/SemaCUDA/overloaded-delete.cu b/test/SemaCUDA/overloaded-delete.cu
new file mode 100644
index 0000000..e582fed
--- /dev/null
+++ b/test/SemaCUDA/overloaded-delete.cu
@@ -0,0 +1,25 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+struct S {
+  __host__ static void operator delete(void*, size_t) {}
+  __device__ static void operator delete(void*, size_t) {}
+};
+
+__host__ __device__ void test(S* s) {
+  // This shouldn't be ambiguous -- we call the host overload in host mode and
+  // the device overload in device mode.
+  delete s;
+}
+
+__host__ void operator delete(void *ptr) {}
+__device__ void operator delete(void *ptr) {}
+
+__host__ __device__ void test_global_delete(int *ptr) {
+  // Again, there should be no ambiguity between which operator delete we call.
+  ::delete ptr;
+}
diff --git a/test/SemaCUDA/pr27778.cu b/test/SemaCUDA/pr27778.cu
new file mode 100644
index 0000000..101965b
--- /dev/null
+++ b/test/SemaCUDA/pr27778.cu
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 512;
+__launch_bounds__(constint) void TestConstInt(void) {}
diff --git a/test/SemaCUDA/vararg.cu b/test/SemaCUDA/vararg.cu
new file mode 100644
index 0000000..34ef367
--- /dev/null
+++ b/test/SemaCUDA/vararg.cu
@@ -0,0 +1,57 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fsyntax-only \
+// RUN:   -verify -DEXPECT_VA_ARG_ERR -DEXPECT_VARARG_ERR %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fsyntax-only \
+// RUN:   -fcuda-allow-variadic-functions -verify -DEXPECT_VA_ARG_ERR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify \
+// RUN:   -DEXPECT_VARARG_ERR %s
+
+#include <stdarg.h>
+#include "Inputs/cuda.h"
+
+__device__ void foo() {
+  va_list list;
+  va_arg(list, int);
+#ifdef EXPECT_VA_ARG_ERR
+  // expected-error@-2 {{CUDA device code does not support va_arg}}
+#endif
+}
+
+void bar() {
+  va_list list;
+  va_arg(list, int);  // OK: host-only
+}
+
+__device__ void baz() {
+#if !defined(__CUDA_ARCH__)
+  va_list list;
+  va_arg(list, int);  // OK: only seen when compiling for host
+#endif
+}
+
+__device__ void vararg(const char* x, ...) {}
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+
+template <typename T>
+__device__ void vararg(T t, ...) {}
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+
+extern "C" __device__ int printf(const char* fmt, ...);  // OK, special case.
+
+// Definition of printf not allowed.
+extern "C" __device__ int printf(const char* fmt, ...) { return 0; }
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+
+namespace ns {
+__device__ int printf(const char* fmt, ...);
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+}
diff --git a/test/SemaCXX/MicrosoftExtensions.cpp b/test/SemaCXX/MicrosoftExtensions.cpp
index 22cf2be..e10dead 100644
--- a/test/SemaCXX/MicrosoftExtensions.cpp
+++ b/test/SemaCXX/MicrosoftExtensions.cpp
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 %s -triple i686-pc-win32 -fsyntax-only -Wmicrosoft -Wc++11-extensions -Wno-long-long -verify -fms-extensions -fexceptions -fcxx-exceptions
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fsyntax-only -Wmicrosoft -Wc++11-extensions -Wno-long-long -verify -fms-extensions -fexceptions -fcxx-exceptions -DTEST1
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fsyntax-only -Wmicrosoft -Wc++11-extensions -Wno-long-long -verify -fexceptions -fcxx-exceptions -DTEST2
 
+#if TEST1
 
 // Microsoft doesn't validate exception specification.
 namespace microsoft_exception_spec {
@@ -80,7 +82,73 @@
 // __unaligned handling
 typedef char __unaligned *aligned_type;
 typedef struct UnalignedTag { int f; } __unaligned *aligned_type2;
+typedef char __unaligned aligned_type3;
 
+struct aligned_type4 {
+  int i;
+};
+
+__unaligned int aligned_type4::*p1_aligned_type4 = &aligned_type4::i;
+int aligned_type4::* __unaligned p2_aligned_type4 = &aligned_type4::i;
+__unaligned int aligned_type4::* __unaligned p3_aligned_type4 = &aligned_type4::i;
+void (aligned_type4::*__unaligned p4_aligned_type4)();
+
+// Check that __unaligned qualifier can be used for overloading
+void foo_unaligned(int *arg) {}
+void foo_unaligned(__unaligned int *arg) {}
+void foo_unaligned(int arg) {} // expected-note {{previous definition is here}}
+void foo_unaligned(__unaligned int arg) {} // expected-error {{redefinition of 'foo_unaligned'}}
+class A_unaligned {};
+class B_unaligned : public A_unaligned {};
+int foo_unaligned(__unaligned A_unaligned *arg) { return 0; }
+void *foo_unaligned(B_unaligned *arg) { return 0; }
+
+void test_unaligned() {
+  int *p1 = 0;
+  foo_unaligned(p1);
+
+  __unaligned int *p2 = 0;
+  foo_unaligned(p2);
+
+  __unaligned B_unaligned *p3 = 0;
+  int p4 = foo_unaligned(p3);
+
+  B_unaligned *p5 = p3; // expected-error {{cannot initialize a variable of type 'B_unaligned *' with an lvalue of type '__unaligned B_unaligned *'}}
+
+  __unaligned B_unaligned *p6 = p3;
+
+  p1_aligned_type4 = p2_aligned_type4;
+  p2_aligned_type4 = p1_aligned_type4; // expected-error {{assigning to 'int aligned_type4::*' from incompatible type '__unaligned int aligned_type4::*'}}
+  p3_aligned_type4 = p1_aligned_type4;
+
+  __unaligned int a[10];
+  int *b = a; // expected-error {{cannot initialize a variable of type 'int *' with an lvalue of type '__unaligned int [10]'}}
+}
+
+// Test from PR27367
+// We should accept assignment of an __unaligned pointer to a non-__unaligned
+// pointer to void
+typedef struct _ITEMIDLIST { int i; } ITEMIDLIST;
+typedef ITEMIDLIST __unaligned *LPITEMIDLIST;
+extern "C" __declspec(dllimport) void __stdcall CoTaskMemFree(void* pv);
+__inline void FreeIDListArray(LPITEMIDLIST *ppidls) {
+  CoTaskMemFree(*ppidls);
+  __unaligned int *x = 0;
+  void *y = x;
+}
+
+// Test from PR27666
+// We should accept type conversion of __unaligned to non-__unaligned references
+typedef struct in_addr {
+public:
+  in_addr(in_addr &a) {} // expected-note {{candidate constructor not viable: no known conversion from '__unaligned IN_ADDR *' (aka '__unaligned in_addr *') to 'in_addr &' for 1st argument; dereference the argument with *}}
+  in_addr(in_addr *a) {} // expected-note {{candidate constructor not viable: 1st argument ('__unaligned IN_ADDR *' (aka '__unaligned in_addr *')) would lose __unaligned qualifier}}
+} IN_ADDR;
+
+void f(IN_ADDR __unaligned *a) {
+  IN_ADDR local_addr = *a;
+  IN_ADDR local_addr2 = a; // expected-error {{no viable conversion from '__unaligned IN_ADDR *' (aka '__unaligned in_addr *') to 'IN_ADDR' (aka 'in_addr')}}
+}
 
 template<typename T> void h1(T (__stdcall M::* const )()) { }
 
@@ -420,3 +488,15 @@
 
 int S::fn() { return 0; } // expected-warning {{is missing exception specification}}
 }
+
+#elif TEST2
+
+// Check that __unaligned is not recognized if MS extensions are not enabled
+typedef char __unaligned *aligned_type; // expected-error {{expected ';' after top level declarator}}
+
+#else
+
+#error Unknown test mode
+
+#endif
+
diff --git a/test/SemaCXX/PR10177.cpp b/test/SemaCXX/PR10177.cpp
index e361ff3..9286e29 100644
--- a/test/SemaCXX/PR10177.cpp
+++ b/test/SemaCXX/PR10177.cpp
@@ -54,6 +54,7 @@
 
 namespace { template<typename> extern int n; }
 template<typename T> int g() { return n<int>; }
+namespace { extern template int n<int>; }
 
 #endif
 
diff --git a/test/SemaCXX/access.cpp b/test/SemaCXX/access.cpp
index cd65f90..29a58a1 100644
--- a/test/SemaCXX/access.cpp
+++ b/test/SemaCXX/access.cpp
@@ -167,5 +167,5 @@
   template <class T> void foo() {
     []() { A::foo(); }(); // expected-error {{private}}
   }
-  void bar() { foo<void>(); } // expected-note {{instantiation}}
+  void bar() { foo<void>(); }
 }
diff --git a/test/SemaCXX/aggregate-initialization.cpp b/test/SemaCXX/aggregate-initialization.cpp
index 4e41774..ddaf33f 100644
--- a/test/SemaCXX/aggregate-initialization.cpp
+++ b/test/SemaCXX/aggregate-initialization.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s 
 
 // Verify that using an initializer list for a non-aggregate looks for
 // constructors..
@@ -11,7 +13,7 @@
 };
 
 struct Base { };
-struct NonAggr2 : public Base { // expected-note 3 {{candidate constructor}}
+struct NonAggr2 : public Base { // expected-note 0-3 {{candidate constructor}}
   int m;
 };
 
@@ -25,9 +27,15 @@
 };
 
 NonAggr1 na1 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr1'}}
-NonAggr2 na2 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr2'}}
+NonAggr2 na2 = { 17 };
 NonAggr3 na3 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr3'}}
 NonAggr4 na4 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr4'}}
+#if __cplusplus <= 201402L
+// expected-error@-4{{no matching constructor for initialization of 'NonAggr2'}}
+#else
+// expected-error@-6{{requires explicit braces}}
+NonAggr2 na2b = { {}, 17 }; // ok
+#endif
 
 // PR5817
 typedef int type[][2];
@@ -82,3 +90,59 @@
 };
 
 AggAgg aggagg = { 1, 2, 3, 4 };
+
+namespace diff_cpp14_dcl_init_aggr_example {
+  struct derived;
+  struct base {
+    friend struct derived;
+  private:
+    base();
+  };
+  struct derived : base {};
+
+  derived d1{};
+#if __cplusplus > 201402L
+  // expected-error@-2 {{private}}
+  // expected-note@-7 {{here}}
+#endif
+  derived d2;
+}
+
+namespace ProtectedBaseCtor {
+  // FIXME: It's unclear whether f() and g() should be valid in C++1z. What is
+  // the object expression in a constructor call -- the base class subobject or
+  // the complete object?
+  struct A {
+  protected:
+    A();
+  };
+
+  struct B : public A {
+    friend B f();
+    friend B g();
+    friend B h();
+  };
+
+  B f() { return {}; }
+#if __cplusplus > 201402L
+  // expected-error@-2 {{protected default constructor}}
+  // expected-note@-12 {{here}}
+#endif
+
+  B g() { return {{}}; }
+#if __cplusplus <= 201402L
+  // expected-error@-2 {{no matching constructor}}
+  // expected-note@-15 3{{candidate}}
+#else
+  // expected-error@-5 {{protected default constructor}}
+  // expected-note@-21 {{here}}
+#endif
+
+  B h() { return {A{}}; }
+#if __cplusplus <= 201402L
+  // expected-error@-2 {{no matching constructor}}
+  // expected-note@-24 3{{candidate}}
+#endif
+  // expected-error@-5 {{protected constructor}}
+  // expected-note@-30 {{here}}
+}
diff --git a/test/SemaCXX/alias-template.cpp b/test/SemaCXX/alias-template.cpp
index bcfe428..b625610 100644
--- a/test/SemaCXX/alias-template.cpp
+++ b/test/SemaCXX/alias-template.cpp
@@ -35,8 +35,8 @@
   template<typename Z> using T = int[n]; // expected-error {{variable length array declaration not allowed at file scope}}
 
   const int m = 42;
-  template<typename Z> using U = int[m]; // expected-note {{previous definition}}
-  template<typename Z> using U = int[42]; // ok
+  template<typename Z> using U = int[m];
+  template<typename Z> using U = int[42]; // expected-note {{previous definition}} 
   template<typename Z> using U = int; // expected-error {{type alias template redefinition with different types ('int' vs 'int [42]')}}
 }
 
diff --git a/test/SemaCXX/anonymous-struct.cpp b/test/SemaCXX/anonymous-struct.cpp
index 1b5dc13..b584f89 100644
--- a/test/SemaCXX/anonymous-struct.cpp
+++ b/test/SemaCXX/anonymous-struct.cpp
@@ -1,7 +1,12 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 struct S {
-  S();  // expected-note {{because type 'S' has a user-provided default constructor}}
+  S();
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{because type 'S' has a user-provided default constructor}}
+#endif
 };
 
 struct { // expected-error {{anonymous structs and classes must be class members}}
@@ -9,15 +14,25 @@
 
 struct E {
   struct {
-    S x;  // expected-error {{anonymous struct member 'x' has a non-trivial constructor}}
+    S x;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{anonymous struct member 'x' has a non-trivial constructor}}
+#endif
   };
   static struct {
   };
 };
 
 template <class T> void foo(T);
-typedef struct { // expected-note {{use a tag name here to establish linkage prior to definition}} expected-note {{declared here}}
+typedef struct { // expected-note {{use a tag name here to establish linkage prior to definition}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{declared here}}
+#endif
+
   void test() {
-    foo(this); // expected-warning {{template argument uses unnamed type}}
+    foo(this);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
   }
 } A; // expected-error {{unsupported: typedef changes linkage of anonymous type, but linkage was already computed}}
diff --git a/test/SemaCXX/ast-print.cpp b/test/SemaCXX/ast-print.cpp
index 39a52ab..408af35 100644
--- a/test/SemaCXX/ast-print.cpp
+++ b/test/SemaCXX/ast-print.cpp
@@ -227,3 +227,14 @@
   using T::operator-;
 };
 }
+
+namespace dont_crash_on_auto_vars {
+struct T { enum E {X = 12ll }; };
+struct S {
+  struct  { int I; } ADecl;
+  static const auto Y = T::X;
+};
+//CHECK: static const auto Y = T::X;
+constexpr auto var = T::X;
+//CHECK: constexpr auto var = T::X;
+}
diff --git a/test/SemaCXX/atomic-ops.cpp b/test/SemaCXX/atomic-ops.cpp
new file mode 100644
index 0000000..2131613
--- /dev/null
+++ b/test/SemaCXX/atomic-ops.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -triple=i686-linux-gnu -std=c++11
+
+// We crashed when we couldn't properly convert the first arg of __atomic_* to
+// an lvalue.
+void PR28623() {
+  void helper(int); // expected-note{{target}}
+  void helper(char); // expected-note{{target}}
+  __atomic_store_n(helper, 0, 0); // expected-error{{reference to overloaded function could not be resolved}}
+}
diff --git a/test/SemaCXX/attr-abi-tag-syntax.cpp b/test/SemaCXX/attr-abi-tag-syntax.cpp
new file mode 100644
index 0000000..4f14a3c
--- /dev/null
+++ b/test/SemaCXX/attr-abi-tag-syntax.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+namespace N1 {
+
+namespace __attribute__((__abi_tag__)) {}
+// expected-warning@-1 {{'abi_tag' attribute on non-inline namespace ignored}}
+
+namespace N __attribute__((__abi_tag__)) {}
+// expected-warning@-1 {{'abi_tag' attribute on non-inline namespace ignored}}
+
+} // namespace N1
+
+namespace N2 {
+
+inline namespace __attribute__((__abi_tag__)) {}
+// expected-warning@-1 {{'abi_tag' attribute on anonymous namespace ignored}}
+
+inline namespace N __attribute__((__abi_tag__)) {}
+
+} // namespace N2
+
+__attribute__((abi_tag("B", "A"))) extern int a1;
+
+__attribute__((abi_tag("A", "B"))) extern int a1;
+// expected-note@-1 {{previous declaration is here}}
+
+__attribute__((abi_tag("A", "C"))) extern int a1;
+// expected-error@-1 {{'abi_tag' C missing in original declaration}}
+
+extern int a2;
+// expected-note@-1 {{previous declaration is here}}
+__attribute__((abi_tag("A")))extern int a2;
+// expected-error@-1 {{cannot add 'abi_tag' attribute in a redeclaration}}
diff --git a/test/SemaCXX/attr-deprecated-replacement-error.cpp b/test/SemaCXX/attr-deprecated-replacement-error.cpp
index 2bbefe1..54d0f9e 100644
--- a/test/SemaCXX/attr-deprecated-replacement-error.cpp
+++ b/test/SemaCXX/attr-deprecated-replacement-error.cpp
@@ -4,8 +4,8 @@
 #error "Missing __has_feature"
 #endif
 
-int a1 [[deprecated("warning", "fixit")]]; // expected-warning{{use of the 'deprecated' attribute is a C++14 extension}} expected-error{{'deprecated' attribute takes no more than 1 argument}}
-int a2 [[deprecated("warning", 1)]]; // expected-warning{{use of the 'deprecated' attribute is a C++14 extension}} expected-error{{'deprecated' attribute takes no more than 1 argument}}
+int a1 [[deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+int a2 [[deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
 
 int b1 [[gnu::deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
 int b2 [[gnu::deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
diff --git a/test/SemaCXX/attr-lto-visibility-public.cpp b/test/SemaCXX/attr-lto-visibility-public.cpp
new file mode 100644
index 0000000..2f9ed87
--- /dev/null
+++ b/test/SemaCXX/attr-lto-visibility-public.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+int i [[clang::lto_visibility_public]]; // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+typedef int t [[clang::lto_visibility_public]]; // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+[[clang::lto_visibility_public]] void f(); // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+void f() [[clang::lto_visibility_public]]; // expected-error {{'lto_visibility_public' attribute cannot be applied to types}}
+
+struct [[clang::lto_visibility_public]] s1 {
+  int i [[clang::lto_visibility_public]]; // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+  [[clang::lto_visibility_public]] void f(); // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+};
+
+struct [[clang::lto_visibility_public(1)]] s2 { // expected-error {{'lto_visibility_public' attribute takes no arguments}}
+};
diff --git a/test/SemaCXX/attr-mode-tmpl.cpp b/test/SemaCXX/attr-mode-tmpl.cpp
new file mode 100644
index 0000000..4e1489a
--- /dev/null
+++ b/test/SemaCXX/attr-mode-tmpl.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+typedef enum { XX } EnumType;
+struct S { int x; };
+
+// Check enumerations. Vector modes on enum types must cause an error.
+template <class T>
+void CheckEnumerations() {
+  // Check that non-vector 'mode' attribute is OK with enumeration types.
+  typedef T __attribute__((mode(QI))) T1;
+  typedef T T2 __attribute__((mode(HI)));
+  typedef T __attribute__((mode(V8SI))) T3; // expected-error{{mode 'V8SI' is not supported for enumeration types}}
+  // expected-warning@-1{{specifying vector types with the 'mode' attribute is deprecated}}
+
+  typedef enum __attribute__((mode(HI))) { A4, B4 } T4;
+  typedef enum { A5, B5 } __attribute__((mode(SI))) T5;
+  typedef enum __attribute__((mode(V2SI))) { A6, B6 } T6; // expected-error{{mode 'V2SI' is not supported for enumeration types}}
+                                                          // expected-warning@-1{{deprecated}}
+  typedef enum { A7, B7 } __attribute__((mode(V2QI))) T7; // expected-error{{mode 'V2QI' is not supported for enumeration types}}
+                                                          // expected-warning@-1{{deprecated}}
+}
+
+// Check that attribute applies only for integer and floating-point types.
+// OK when instantiated with 'int', error with structure types, for example.
+template <class T>
+void CheckPrimitiveTypes() {
+  typedef T __attribute__((mode(QI))) T1;    // expected-error{{mode attribute only supported for integer and floating-point types}}
+  typedef T __attribute__((mode(V2SI))) VT1; // expected-error{{mode attribute only supported for integer and floating-point types}}
+  // expected-warning@-1{{specifying vector types with the 'mode' attribute is deprecated}}
+}
+
+// Check that attribute supports certain modes. Check that wrong machine modes
+// are NOT diagnosed twice during instantiation.
+template <class T>
+void CheckMachineMode() {
+  typedef T __attribute__((mode(QI))) T1; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(HI))) T2; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(SI))) T3; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(DI))) T4; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(SF))) T5; // expected-error2{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(DF))) T6; // expected-error2{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(II))) T7; // expected-error{{unknown machine mode}}
+  typedef T __attribute__((mode(12))) T8; // expected-error{{'mode' attribute requires an identifier}}
+}
+
+// Check attributes on function parameters.
+template <class T1, class T2>
+void CheckParameters(T1 __attribute__((mode(SI)))   paramSI,     // expected-note2{{ignored: substitution failure}}
+                     T1 __attribute__((mode(V4DI))) paramV4DI,   // expected-warning{{deprecated}}
+                     T2 __attribute__((mode(SF)))   paramSF,
+                     T2 __attribute__((mode(V4DF))) paramV4DF) { // expected-warning{{deprecated}}
+}
+
+
+// Check dependent structure.
+template <class T>
+struct TemplatedStruct {
+  // Check fields.
+  T __attribute__((mode(HI)))     x1;
+  T __attribute__((mode(V4HI)))   x2;         // expected-error{{mode 'V4HI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Check typedefs.
+  typedef T __attribute__((mode(DI)))   T1;
+  typedef T __attribute__((mode(V8DI))) T2;   // expected-error{{mode 'V8DI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Check parameters.
+  void f1(T __attribute__((mode(QI))) x) {}
+  void f2(T __attribute__((mode(SF))) x) {}   // expected-error2{{type of machine mode does not match type of base type}}
+  void f3(T __attribute__((mode(V4QI))) x) {} // expected-error{{mode 'V4QI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Check attribute on methods - it is invalid.
+  __attribute__((mode(QI))) T g1() { return 0; } // expected-error{{'mode' attribute only applies to variables, enums, fields and typedefs}}
+};
+
+
+
+int main() {
+  CheckEnumerations<int>();
+  CheckEnumerations<EnumType>(); // expected-note{{in instantiation of}}
+
+  CheckPrimitiveTypes<int>();
+  CheckPrimitiveTypes<S>();      // expected-note{{in instantiation of}}
+
+  // 'II' mode is unknown, no matter what we instantiate with.
+  CheckMachineMode<int>();       // expected-note{{in instantiation of}}
+  CheckMachineMode<EnumType>();  // expected-note{{in instantiation of}}
+  CheckMachineMode<float>();     // expected-note{{in instantiation of}}
+
+  int   __attribute__((mode(V4DI))) valV4DI; // expected-warning{{deprecated}}
+  float __attribute__((mode(V4DF))) valV4DF; // expected-warning{{deprecated}}
+  // OK.
+  CheckParameters<int, float>(0, valV4DI, 1.0, valV4DF);
+  // Enumeral type with vector mode is invalid.
+  CheckParameters<EnumType, float>(0, valV4DI, 1.0, valV4DF); // expected-error{{no matching function for call}}
+  // 'V4DF' mode with 'int' type is invalid.
+  CheckParameters<int, int>(0, valV4DI, 1, valV4DF); // expected-error{{no matching function for call}}
+
+  TemplatedStruct<int>      s1; // expected-note{{in instantiation of}}
+  TemplatedStruct<EnumType> s2; // expected-note{{in instantiation of}}
+  return 0;
+}
diff --git a/test/SemaCXX/attr-selectany.cpp b/test/SemaCXX/attr-selectany.cpp
index 058f2fc..9dc14b3 100644
--- a/test/SemaCXX/attr-selectany.cpp
+++ b/test/SemaCXX/attr-selectany.cpp
@@ -39,7 +39,9 @@
 // The D3D11 headers do something like this.  MSVC doesn't error on this at
 // all, even without the __declspec(selectany), in violation of the standard.
 // We fall back to a warning for selectany to accept headers.
-struct SomeStruct {};
+struct SomeStruct {
+  int foo;
+};
 extern const __declspec(selectany) SomeStruct some_struct; // expected-warning {{default initialization of an object of const type 'const SomeStruct' without a user-provided default constructor is a Microsoft extension}}
 
 // It should be possible to redeclare variables that were defined
diff --git a/test/SemaCXX/builtin-classify-type.cpp b/test/SemaCXX/builtin-classify-type.cpp
new file mode 100644
index 0000000..f700c1d
--- /dev/null
+++ b/test/SemaCXX/builtin-classify-type.cpp
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+enum gcc_type_class {
+  no_type_class = -1,
+  void_type_class, integer_type_class, char_type_class,
+  enumeral_type_class, boolean_type_class,
+  pointer_type_class, reference_type_class, offset_type_class,
+  real_type_class, complex_type_class,
+  function_type_class, method_type_class,
+  record_type_class, union_type_class,
+  array_type_class, string_type_class,
+  lang_type_class
+};
+
+class cl {
+public:
+    void bar() {}
+    int baz;
+};
+
+int builtin_result;
+
+void foo() {
+  int i;
+  char c;
+  enum { red, green, blue} enum_obj;
+  bool b;
+  int *p;
+  int &r = i;
+  double d;
+  extern void f();
+  cl cl_obj;
+  union { int a; float b; } u_obj;
+  int arr[10];
+
+  int a1[__builtin_classify_type(f()) == void_type_class ? 1 : -1];
+  int a2[__builtin_classify_type(i) == integer_type_class ? 1 : -1];
+  int a3[__builtin_classify_type(c) == integer_type_class ? 1 : -1];
+  int a4[__builtin_classify_type(enum_obj) == enumeral_type_class ? 1 : -1];
+  int a5[__builtin_classify_type(b) == boolean_type_class ? 1 : -1];
+  int a6[__builtin_classify_type(p) == pointer_type_class ? 1 : -1];
+  int a7[__builtin_classify_type(r) == integer_type_class ? 1 : -1];
+  int a8[__builtin_classify_type(&cl::baz) == offset_type_class ? 1 : -1];
+  int a9[__builtin_classify_type(d) == real_type_class ? 1 : -1];
+  int a10[__builtin_classify_type(f) == function_type_class ? 1 : -1];
+  int a11[__builtin_classify_type(&cl::bar) == method_type_class ? 1 : -1];
+  int a12[__builtin_classify_type(cl_obj) == record_type_class ? 1 : -1];
+  int a13[__builtin_classify_type(u_obj) == union_type_class ? 1 : -1];
+  int a14[__builtin_classify_type(arr) == array_type_class ? 1 : -1];
+  int a15[__builtin_classify_type("abc") == array_type_class ? 1 : -1];
+}
+
diff --git a/test/SemaCXX/builtin-object-size-cxx14.cpp b/test/SemaCXX/builtin-object-size-cxx14.cpp
new file mode 100644
index 0000000..32d752d
--- /dev/null
+++ b/test/SemaCXX/builtin-object-size-cxx14.cpp
@@ -0,0 +1,99 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+
+namespace basic {
+// Ensuring that __bos can be used in constexpr functions without anything
+// sketchy going on...
+constexpr int bos0() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 0);
+}
+
+constexpr int bos1() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 1);
+}
+
+constexpr int bos2() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 2);
+}
+
+constexpr int bos3() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 3);
+}
+
+static_assert(bos0() == sizeof(char) * 5, "");
+static_assert(bos1() == sizeof(char) * 5, "");
+static_assert(bos2() == sizeof(char) * 5, "");
+static_assert(bos3() == sizeof(char) * 5, "");
+}
+
+namespace in_enable_if {
+// The code that prompted these changes was __bos in enable_if
+
+void copy5CharsInto(char *buf) // expected-note{{candidate}}
+    __attribute__((enable_if(__builtin_object_size(buf, 0) != -1 &&
+                                 __builtin_object_size(buf, 0) > 5,
+                             "")));
+
+// We use different EvalModes for __bos with type 0 versus 1. Ensure 1 works,
+// too...
+void copy5CharsIntoStrict(char *buf) // expected-note{{candidate}}
+    __attribute__((enable_if(__builtin_object_size(buf, 1) != -1 &&
+                                 __builtin_object_size(buf, 1) > 5,
+                             "")));
+
+struct LargeStruct {
+  int pad;
+  char buf[6];
+  int pad2;
+};
+
+struct SmallStruct {
+  int pad;
+  char buf[5];
+  int pad2;
+};
+
+void noWriteToBuf() {
+  char buf[6];
+  copy5CharsInto(buf);
+
+  LargeStruct large;
+  copy5CharsIntoStrict(large.buf);
+}
+
+void initTheBuf() {
+  char buf[6] = {};
+  copy5CharsInto(buf);
+
+  LargeStruct large = {0, {}, 0};
+  copy5CharsIntoStrict(large.buf);
+}
+
+int getI();
+void initTheBufWithALoop() {
+  char buf[6] = {};
+  for (unsigned I = getI(); I != sizeof(buf); ++I)
+    buf[I] = I;
+  copy5CharsInto(buf);
+
+  LargeStruct large;
+  for (unsigned I = getI(); I != sizeof(buf); ++I)
+    large.buf[I] = I;
+  copy5CharsIntoStrict(large.buf);
+}
+
+void tooSmallBuf() {
+  char buf[5];
+  copy5CharsInto(buf); // expected-error{{no matching function for call}}
+
+  SmallStruct small;
+  copy5CharsIntoStrict(small.buf); // expected-error{{no matching function for call}}
+}
+}
diff --git a/test/SemaCXX/c99-variable-length-array-cxx11.cpp b/test/SemaCXX/c99-variable-length-array-cxx11.cpp
index 03cf283..6885841 100644
--- a/test/SemaCXX/c99-variable-length-array-cxx11.cpp
+++ b/test/SemaCXX/c99-variable-length-array-cxx11.cpp
@@ -22,5 +22,9 @@
   POD array2[N]; // expected-warning{{variable length arrays are a C99 feature}}
   StillPOD array3[N]; // expected-warning{{variable length arrays are a C99 feature}}
   StillPOD2 array4[N][3]; // expected-warning{{variable length arrays are a C99 feature}}
-  NonPOD array5[N]; // expected-error{{variable length array of non-POD element type 'NonPOD'}}
+  NonPOD array5[N]; // expected-error{{no matching constructor for initialization of 'NonPOD [N]'}}
+  // expected-warning@-1{{variable length arrays are a C99 feature}}
+  // expected-note@-16{{candidate constructor not viable}}
+  // expected-note@-18{{candidate constructor (the implicit copy constructor) not viable}}
+  // expected-note@-19{{candidate constructor (the implicit move constructor) not viable}}
 }
diff --git a/test/SemaCXX/c99-variable-length-array.cpp b/test/SemaCXX/c99-variable-length-array.cpp
index 237f564..5fd7e37 100644
--- a/test/SemaCXX/c99-variable-length-array.cpp
+++ b/test/SemaCXX/c99-variable-length-array.cpp
@@ -16,8 +16,8 @@
 void vla(int N) {
   int array1[N]; // expected-warning{{variable length arrays are a C99 feature}}
   POD array2[N]; // expected-warning{{variable length arrays are a C99 feature}}
-  NonPOD array3[N]; // expected-error{{variable length array of non-POD element type 'NonPOD'}}
-  NonPOD2 array4[N][3]; // expected-error{{variable length array of non-POD element type 'NonPOD2'}}
+  NonPOD array3[N]; // expected-warning{{variable length arrays are a C99 feature}}
+  NonPOD2 array4[N][3]; // expected-warning{{variable length arrays are a C99 feature}}
 }
 
 /// Warn about VLAs in templates.
diff --git a/test/SemaCXX/class.cpp b/test/SemaCXX/class.cpp
index a669440..a359368 100644
--- a/test/SemaCXX/class.cpp
+++ b/test/SemaCXX/class.cpp
@@ -1,7 +1,12 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s 
 class C {
 public:
-  auto int errx; // expected-error {{storage class specified for a member declaration}} expected-warning {{'auto' storage class specifier is redundant}}
+  auto int errx; // expected-error {{storage class specified for a member declaration}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{'auto' storage class specifier is redundant}}
+#else
+  // expected-warning@-4 {{'auto' storage class specifier is not permitted in C++11, and will not be supported in future releases}}
+#endif
   register int erry; // expected-error {{storage class specified for a member declaration}}
   extern int errz; // expected-error {{storage class specified for a member declaration}}
 
@@ -36,12 +41,18 @@
 
   enum E1 { en1, en2 };
 
-  int i = 0; // expected-warning {{in-class initialization of non-static data member is a C++11 extension}}
+  int i = 0;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
   static int si = 0; // expected-error {{non-const static data member must be initialized out of line}}
   static const NestedC ci = 0; // expected-error {{static data member of type 'const C::NestedC' must be initialized out of line}}
   static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}}
   static const int vi = 0;
   static const volatile int cvi = 0; // ok, illegal in C++11
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{static const volatile data member must be initialized out of line}}
+#endif
   static const E evi = 0;
 
   void m() {
@@ -169,10 +180,18 @@
 
 namespace rdar8367341 {
   float foo();
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{declared here}}
+#endif
 
   struct A {
+#if __cplusplus <= 199711L
     static const float x = 5.0f; // expected-warning {{in-class initializer for static data member of type 'const float' is a GNU extension}}
     static const float y = foo(); // expected-warning {{in-class initializer for static data member of type 'const float' is a GNU extension}} expected-error {{in-class initializer for static data member is not a constant expression}}
+#else
+    static constexpr float x = 5.0f;
+    static constexpr float y = foo(); // expected-error {{constexpr variable 'y' must be initialized by a constant expression}} expected-note {{non-constexpr function 'foo' cannot be used in a constant expression}}
+#endif
   };
 }
 
diff --git a/test/SemaCXX/condition.cpp b/test/SemaCXX/condition.cpp
index b757fcb..5596564 100644
--- a/test/SemaCXX/condition.cpp
+++ b/test/SemaCXX/condition.cpp
@@ -65,3 +65,7 @@
 void test5_inst() {
    test5<int>();
 }
+
+void PR28373() {
+  if (!x) {} // expected-error {{undeclared}}
+}
diff --git a/test/SemaCXX/conditional-expr.cpp b/test/SemaCXX/conditional-expr.cpp
index 538de58..c12efc0 100644
--- a/test/SemaCXX/conditional-expr.cpp
+++ b/test/SemaCXX/conditional-expr.cpp
@@ -384,3 +384,12 @@
     int &test() { return b_ ? i_ : throw 1; }
   };
 }
+
+namespace PR26448 {
+struct Base {};
+struct Derived : Base {};
+Base b;
+Derived d;
+typedef decltype(true ? static_cast<Base&&>(b) : static_cast<Derived&&>(d)) x;
+typedef Base &&x;
+}
diff --git a/test/SemaCXX/constant-expression-cxx11.cpp b/test/SemaCXX/constant-expression-cxx11.cpp
index 7b9d015..e2b3f09 100644
--- a/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/test/SemaCXX/constant-expression-cxx11.cpp
@@ -1181,6 +1181,20 @@
     constexpr int j = 0;
     constexpr int k; // expected-error {{default initialization of an object of const type}}
   }
+
+  extern const int q;
+  constexpr int g() { return q; }
+  constexpr int q = g();
+  static_assert(q == 0, "zero-initialization should precede static initialization");
+
+  extern int r; // expected-note {{here}}
+  constexpr int h() { return r; } // expected-error {{never produces a constant}} expected-note {{read of non-const}}
+
+  struct S { int n; };
+  extern const S s;
+  constexpr int x() { return s.n; }
+  constexpr S s = {x()};
+  static_assert(s.n == 0, "zero-initialization should precede static initialization");
 }
 
 namespace ComplexConstexpr {
@@ -2005,3 +2019,50 @@
   constexpr int a = *f().p;
   constexpr int b = *g().p;
 }
+
+namespace IncompleteClass {
+  struct XX {
+    static constexpr int f(XX*) { return 1; } // expected-note {{here}}
+    friend constexpr int g(XX*) { return 2; } // expected-note {{here}}
+
+    static constexpr int i = f(static_cast<XX*>(nullptr)); // expected-error {{constexpr variable 'i' must be initialized by a constant expression}}  expected-note {{undefined function 'f' cannot be used in a constant expression}}
+    static constexpr int j = g(static_cast<XX*>(nullptr)); // expected-error {{constexpr variable 'j' must be initialized by a constant expression}}  expected-note {{undefined function 'g' cannot be used in a constant expression}}
+  };
+}
+
+namespace InheritedCtor {
+  struct A { constexpr A(int) {} };
+
+  struct B : A { int n; using A::A; }; // expected-note {{here}}
+  constexpr B b(0); // expected-error {{constant expression}} expected-note {{derived class}}
+
+  struct C : A { using A::A; struct { union { int n, m = 0; }; union { int a = 0; }; int k = 0; }; struct {}; union {}; }; // expected-warning 4{{extension}}
+  constexpr C c(0);
+
+  struct D : A {
+    using A::A; // expected-note {{here}}
+    struct { // expected-warning {{extension}}
+      union { // expected-warning {{extension}}
+        int n;
+      };
+    };
+  };
+  constexpr D d(0); // expected-error {{constant expression}} expected-note {{derived class}}
+
+  struct E : virtual A { using A::A; }; // expected-note {{here}}
+  // We wrap a function around this to avoid implicit zero-initialization
+  // happening first; the zero-initialization step would produce the same
+  // error and defeat the point of this test.
+  void f() {
+    constexpr E e(0); // expected-error {{constant expression}} expected-note {{derived class}}
+  }
+  // FIXME: This produces a note with no source location.
+  //constexpr E e(0);
+
+  struct W { constexpr W(int n) : w(n) {} int w; };
+  struct X : W { using W::W; int x = 2; };
+  struct Y : X { using X::X; int y = 3; };
+  struct Z : Y { using Y::Y; int z = 4; };
+  constexpr Z z(1);
+  static_assert(z.w == 1 && z.x == 2 && z.y == 3 && z.z == 4, "");
+}
diff --git a/test/SemaCXX/constant-expression-cxx1y.cpp b/test/SemaCXX/constant-expression-cxx1y.cpp
index e9ecbe8..f810322 100644
--- a/test/SemaCXX/constant-expression-cxx1y.cpp
+++ b/test/SemaCXX/constant-expression-cxx1y.cpp
@@ -179,12 +179,10 @@
   static_assert(!test1(100), "");
   static_assert(!test1(101), ""); // expected-error {{constant expression}} expected-note {{in call to 'test1(101)'}}
 
-  // FIXME: We should be able to reject this before it's called
-  constexpr void f() {
+  constexpr void f() { // expected-error{{constexpr function never produces a constant expression}} expected-note@+2{{assignment to dereferenced one-past-the-end pointer is not allowed in a constant expression}}
     char foo[10] = { "z" }; // expected-note {{here}}
-    foo[10] = 'x'; // expected-warning {{past the end}} expected-note {{assignment to dereferenced one-past-the-end pointer}}
+    foo[10] = 'x'; // expected-warning {{past the end}}
   }
-  constexpr int k = (f(), 0); // expected-error {{constant expression}} expected-note {{in call}}
 }
 
 namespace array_resize {
@@ -938,3 +936,24 @@
   constexpr int testb = f(e2, 3); // expected-error {{constant expression}} expected-note {{in call}}
   constexpr int testc = f(e3, 3);
 }
+
+namespace SpeculativeEvalWrites {
+  // Ensure that we don't try to speculatively evaluate writes.
+  constexpr int f() {
+    int i = 0;
+    int a = 0;
+    // __builtin_object_size speculatively evaluates its first argument.
+    __builtin_object_size((i = 1, &a), 0);
+    return i;
+  }
+
+  static_assert(!f(), "");
+}
+
+namespace PR27989 {
+  constexpr int f(int n) {
+    int a = (n = 1, 0);
+    return n;
+  }
+  static_assert(f(0) == 1, "");
+}
diff --git a/test/SemaCXX/constant-expression-cxx1z.cpp b/test/SemaCXX/constant-expression-cxx1z.cpp
new file mode 100644
index 0000000..e84de44
--- /dev/null
+++ b/test/SemaCXX/constant-expression-cxx1z.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu
+
+namespace BaseClassAggregateInit {
+  struct A {
+    int a, b, c;
+    constexpr A(int n) : a(n), b(3 * n), c(b - 1) {} // expected-note {{outside the range of representable}}
+    constexpr A() : A(10) {};
+  };
+  struct B : A {};
+  struct C { int q; };
+  struct D : B, C { int k; };
+
+  constexpr D d1 = { 1, 2, 3 };
+  static_assert(d1.a == 1 && d1.b == 3 && d1.c == 2 && d1.q == 2 && d1.k == 3);
+
+  constexpr D d2 = { 14 };
+  static_assert(d2.a == 14 && d2.b == 42 && d2.c == 41 && d2.q == 0 && d2.k == 0);
+
+  constexpr D d3 = { A(5), C{2}, 1 };
+  static_assert(d3.a == 5 && d3.b == 15 && d3.c == 14 && d3.q == 2 && d3.k == 1);
+
+  constexpr D d4 = {};
+  static_assert(d4.a == 10 && d4.b == 30 && d4.c == 29 && d4.q == 0 && d4.k == 0);
+
+  constexpr D d5 = { __INT_MAX__ }; // expected-error {{must be initialized by a constant expression}}
+  // expected-note-re@-1 {{in call to 'A({{.*}})'}}
+}
diff --git a/test/SemaCXX/constexpr-nqueens.cpp b/test/SemaCXX/constexpr-nqueens.cpp
index b158d6e..47133a2 100644
--- a/test/SemaCXX/constexpr-nqueens.cpp
+++ b/test/SemaCXX/constexpr-nqueens.cpp
@@ -10,26 +10,26 @@
   constexpr Board(const Board &O) : State(O.State), Failed(O.Failed) {}
   constexpr Board(uint64_t State, bool Failed = false) :
     State(State), Failed(Failed) {}
-  constexpr Board addQueen(int Row, int Col) {
+  constexpr Board addQueen(int Row, int Col) const {
     return Board(State | ((uint64_t)Row << (Col * 4)));
   }
-  constexpr int getQueenRow(int Col) {
+  constexpr int getQueenRow(int Col) const {
     return (State >> (Col * 4)) & 0xf;
   }
-  constexpr bool ok(int Row, int Col) {
+  constexpr bool ok(int Row, int Col) const {
     return okRecurse(Row, Col, 0);
   }
-  constexpr bool okRecurse(int Row, int Col, int CheckCol) {
+  constexpr bool okRecurse(int Row, int Col, int CheckCol) const {
     return Col == CheckCol ? true :
            getQueenRow(CheckCol) == Row ? false :
            getQueenRow(CheckCol) == Row + (Col - CheckCol) ? false :
            getQueenRow(CheckCol) == Row + (CheckCol - Col) ? false :
            okRecurse(Row, Col, CheckCol + 1);
   }
-  constexpr bool at(int Row, int Col) {
+  constexpr bool at(int Row, int Col) const {
     return getQueenRow(Col) == Row;
   }
-  constexpr bool check(const char *, int=0, int=0);
+  constexpr bool check(const char *, int=0, int=0) const;
 };
 
 constexpr Board buildBoardRecurse(int N, int Col, const Board &B);
@@ -54,7 +54,7 @@
 
 constexpr Board q8 = buildBoard(8);
 
-constexpr bool Board::check(const char *p, int Row, int Col) {
+constexpr bool Board::check(const char *p, int Row, int Col) const {
   return
     *p == '\n' ? check(p+1, Row+1, 0) :
     *p == 'o' ? at(Row, Col) && check(p+1, Row, Col+1) :
diff --git a/test/SemaCXX/constexpr-value-init.cpp b/test/SemaCXX/constexpr-value-init.cpp
index 0651111..3528fdc 100644
--- a/test/SemaCXX/constexpr-value-init.cpp
+++ b/test/SemaCXX/constexpr-value-init.cpp
@@ -14,7 +14,7 @@
   constexpr A a; // expected-error {{constant expression}} expected-note {{in call to 'A()'}}
 }
 
-constexpr B b1; // expected-error {{without a user-provided default constructor}}
+constexpr B b1; // ok
 constexpr B b2 = B(); // ok
 static_assert(b2.a.a == 1, "");
 static_assert(b2.a.b == 2, "");
diff --git a/test/SemaCXX/constructor-recovery.cpp b/test/SemaCXX/constructor-recovery.cpp
index c1bb436..a0d8441 100644
--- a/test/SemaCXX/constructor-recovery.cpp
+++ b/test/SemaCXX/constructor-recovery.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 
-struct C {
+struct C { // expected-note 1+{{candidate}}
   virtual C() = 0; // expected-error{{constructor cannot be declared 'virtual'}}
 };
 
 void f() {
- C c;
+ C c; // expected-error {{no matching constructor}}
 }
diff --git a/test/SemaCXX/conversion-function.cpp b/test/SemaCXX/conversion-function.cpp
index 649f6b4..3f494cc 100644
--- a/test/SemaCXX/conversion-function.cpp
+++ b/test/SemaCXX/conversion-function.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -fsyntax-only -Wbind-to-temporary-copy -verify %s 
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fsyntax-only -Wbind-to-temporary-copy -verify -std=c++98 %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fsyntax-only -Wbind-to-temporary-copy -verify -std=c++11 %s
+
 class X { 
 public:
   operator bool();
@@ -133,7 +136,12 @@
 
 A1 f() {
   // FIXME: redundant diagnostics!
-  return "Hello"; // expected-error {{calling a private constructor}} expected-warning {{an accessible copy constructor}}
+  return "Hello"; // expected-error {{calling a private constructor}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{an accessible copy constructor}}
+#else
+  // expected-warning@-4 {{copying parameter of type 'A1' when binding a reference to a temporary would invoke an inaccessible constructor in C++98}}
+#endif
 }
 
 namespace source_locations {
@@ -175,7 +183,13 @@
     (&operator bool())(); // expected-error {{use a typedef to declare a conversion to 'bool (&)()'}}
     *operator int();  // expected-error {{put the complete type after 'operator'}}
     // No suggestion of using a typedef here; that's not possible.
-    template<typename T> (&operator T())(); // expected-error-re {{cannot specify any part of a return type in the declaration of a conversion function{{$}}}}
+    template<typename T> (&operator T())();
+#if __cplusplus <= 199711L
+    // expected-error-re@-2 {{cannot specify any part of a return type in the declaration of a conversion function{{$}}}}
+#else
+    // expected-error-re@-4 {{cannot specify any part of a return type in the declaration of a conversion function; use an alias template to declare a conversion to 'T (&)()'{{$}}}}
+#endif
+
   };
 }
 
@@ -193,6 +207,10 @@
   };
 
   struct X { // expected-note{{candidate constructor (the implicit copy constructor) not}}
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{candidate constructor (the implicit move constructor) not}}
+#endif
+
     explicit X(Y);
   };
 
@@ -215,7 +233,12 @@
 };
 
 void test_any() {
-  Any any = Other(); // expected-error{{cannot pass object of non-POD type 'Other' through variadic constructor; call will abort at runtime}}
+  Any any = Other();
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{cannot pass object of non-POD type 'Other' through variadic constructor; call will abort at runtime}}
+#else
+  // expected-error@-4 {{cannot pass object of non-trivial type 'Other' through variadic constructor; call will abort at runtime}}
+#endif
 }
 
 namespace PR7055 {
diff --git a/test/SemaCXX/conversion.cpp b/test/SemaCXX/conversion.cpp
index eea8ac2..7b86cec 100644
--- a/test/SemaCXX/conversion.cpp
+++ b/test/SemaCXX/conversion.cpp
@@ -256,3 +256,45 @@
 }
 
 }
+
+// More tests with macros.  Specifically, test function-like macros that either
+// have a pointer return type or take pointer arguments.  Basically, if the
+// macro was changed into a function and Clang doesn't warn, then it shouldn't
+// warn for the macro either.
+namespace test13 {
+#define check_str_nullptr_13(str) ((str) ? str : nullptr)
+#define check_str_null_13(str) ((str) ? str : NULL)
+#define test13(condition) if (condition) return;
+#define identity13(arg) arg
+#define CHECK13(condition) test13(identity13(!(condition)))
+
+void function1(const char* str) {
+  CHECK13(check_str_nullptr_13(str));
+  CHECK13(check_str_null_13(str));
+}
+
+bool some_bool_function(bool);
+void function2() {
+  CHECK13(some_bool_function(nullptr));  // expected-warning{{implicit conversion of nullptr constant to 'bool'}}
+  CHECK13(some_bool_function(NULL));  // expected-warning{{implicit conversion of NULL constant to 'bool'}}
+}
+
+#define run_check_nullptr_13(str) \
+    if (check_str_nullptr_13(str)) return;
+#define run_check_null_13(str) \
+    if (check_str_null_13(str)) return;
+void function3(const char* str) {
+  run_check_nullptr_13(str)
+  run_check_null_13(str)
+  if (check_str_nullptr_13(str)) return;
+  if (check_str_null_13(str)) return;
+}
+
+void run(int* ptr);
+#define conditional_run_13(ptr) \
+    if (ptr) run(ptr);
+void function4() {
+  conditional_run_13(nullptr);
+  conditional_run_13(NULL);
+}
+}
diff --git a/test/SemaCXX/crashes.cpp b/test/SemaCXX/crashes.cpp
index 926d13a..a80587d 100644
--- a/test/SemaCXX/crashes.cpp
+++ b/test/SemaCXX/crashes.cpp
@@ -105,8 +105,7 @@
 namespace PR10270 {
   template<typename T> class C;
   template<typename T> void f() {
-    if (C<T> == 1) // expected-error{{expected unqualified-id}} \
-                   // expected-error{{invalid '==' at end of declaration}}
+    if (C<T> == 1) // expected-error{{expected unqualified-id}}
       return;
   }
 }
diff --git a/test/SemaCXX/cstyle-cast.cpp b/test/SemaCXX/cstyle-cast.cpp
index afac6a1..2327d7b 100644
--- a/test/SemaCXX/cstyle-cast.cpp
+++ b/test/SemaCXX/cstyle-cast.cpp
@@ -84,11 +84,11 @@
   (void)(void*)((int*)0);
   (void)(volatile const void*)((const int*)0);
   (void)(A*)((B*)0);
-  (void)(A&)(*((B*)0));
+  (void)(A&)(*((B*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)(const B*)((C1*)0);
-  (void)(B&)(*((C1*)0));
+  (void)(B&)(*((C1*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)(A*)((D*)0);
-  (void)(const A&)(*((D*)0));
+  (void)(const A&)(*((D*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)(int B::*)((int A::*)0);
   (void)(void (B::*)())((void (A::*)())0);
   (void)(A*)((E*)0); // C-style cast ignores access control
diff --git a/test/SemaCXX/cxx0x-cursory-default-delete.cpp b/test/SemaCXX/cxx0x-cursory-default-delete.cpp
index dfca17a..17215fe 100644
--- a/test/SemaCXX/cxx0x-cursory-default-delete.cpp
+++ b/test/SemaCXX/cxx0x-cursory-default-delete.cpp
@@ -11,6 +11,7 @@
   non_const_copy& operator = (non_const_copy&) &;
   non_const_copy& operator = (non_const_copy&) &&;
   non_const_copy() = default; // expected-note {{not viable}}
+  int uninit_field;
 };
 non_const_copy::non_const_copy(non_const_copy&) = default; // expected-note {{not viable}}
 non_const_copy& non_const_copy::operator = (non_const_copy&) & = default; // expected-note {{not viable}}
@@ -30,6 +31,98 @@
   ncc = cncc; // expected-error {{no viable overloaded}}
 };
 
+struct no_fields { };
+struct all_init {
+  int a = 0;
+  int b = 0;
+};
+struct some_init {
+  int a = 0;
+  int b;
+  int c = 0;
+};
+struct some_init_mutable {
+  int a = 0;
+  mutable int b;
+  int c = 0;
+};
+struct some_init_def {
+  some_init_def() = default;
+  int a = 0;
+  int b;
+  int c = 0;
+};
+struct some_init_ctor {
+  some_init_ctor();
+  int a = 0;
+  int b;
+  int c = 0;
+};
+struct sub_some_init : public some_init_def { };
+struct sub_some_init_ctor : public some_init_def {
+  sub_some_init_ctor();
+};
+struct sub_some_init_ctor2 : public some_init_ctor {
+};
+struct some_init_container {
+  some_init_def sid;
+};
+struct some_init_container_ctor {
+  some_init_container_ctor();
+  some_init_def sid;
+};
+struct no_fields_container {
+  no_fields nf;
+};
+struct param_pack_ctor {
+  template <typename... T>
+  param_pack_ctor(T...);
+  int n;
+};
+struct param_pack_ctor_field {
+  param_pack_ctor ndc;
+};
+struct multi_param_pack_ctor {
+  template <typename... T, typename... U>
+  multi_param_pack_ctor(T..., U..., int f = 0);
+  int n;
+};
+struct ignored_template_ctor_and_def {
+  template <class T> ignored_template_ctor_and_def(T* f = nullptr);
+  ignored_template_ctor_and_def() = default;
+  int field;
+};
+template<bool, typename = void> struct enable_if {};
+template<typename T> struct enable_if<true, T> { typedef T type; };
+struct multi_param_pack_and_defaulted {
+  template <typename... T,
+            typename enable_if<sizeof...(T) != 0>::type* = nullptr>
+  multi_param_pack_and_defaulted(T...);
+  multi_param_pack_and_defaulted() = default;
+  int n;
+};
+
+void constobjs() {
+  const no_fields nf; // ok
+  const all_init ai; // ok
+  const some_init si; // expected-error {{default initialization of an object of const type 'const some_init' without a user-provided default constructor}}
+  const some_init_mutable sim; // ok
+  const some_init_def sid; // expected-error {{default initialization of an object of const type 'const some_init_def' without a user-provided default constructor}}
+  const some_init_ctor sic; // ok
+  const sub_some_init ssi; // expected-error {{default initialization of an object of const type 'const sub_some_init' without a user-provided default constructor}}
+  const sub_some_init_ctor ssic; // ok
+  const sub_some_init_ctor2 ssic2; // ok
+  const some_init_container sicon; // expected-error {{default initialization of an object of const type 'const some_init_container' without a user-provided default constructor}}
+  const some_init_container_ctor siconc; // ok
+  const no_fields_container nfc; // ok
+  const param_pack_ctor ppc; // ok
+  const param_pack_ctor_field ppcf; // ok
+  const multi_param_pack_ctor mppc; // ok
+  const multi_param_pack_and_defaulted mppad; // expected-error {{default initialization of an object of const type 'const multi_param_pack_and_defaulted' without a user-provided default constructor}}
+  const ignored_template_ctor_and_def itcad; // expected-error {{default initialization of an object of const type 'const ignored_template_ctor_and_def' without a user-provided default constructor}}
+
+}
+
 struct non_const_derived : non_const_copy {
   non_const_derived(const non_const_derived&) = default; // expected-error {{requires it to be non-const}}
   non_const_derived& operator =(non_const_derived&) = default;
diff --git a/test/SemaCXX/cxx0x-defaulted-functions.cpp b/test/SemaCXX/cxx0x-defaulted-functions.cpp
index 617a257..16e20ff 100644
--- a/test/SemaCXX/cxx0x-defaulted-functions.cpp
+++ b/test/SemaCXX/cxx0x-defaulted-functions.cpp
@@ -150,6 +150,14 @@
   Y::~Y() = default; // expected-error {{definition of explicitly defaulted}}
 }
 
+namespace PR27699 {
+  struct X {
+    X();
+  };
+  X::X() = default; // expected-note {{here}}
+  X::X() = default; // expected-error {{redefinition of 'X'}}
+}
+
 namespace PR14577 {
   template<typename T>
   struct Outer {
@@ -188,3 +196,15 @@
   A<int> a;
   B<int> b; // expected-note {{here}}
 }
+
+namespace PR27941 {
+struct ExplicitBool {
+  ExplicitBool &operator=(bool) = default; // expected-error{{only special member functions may be defaulted}}
+  int member;
+};
+
+int fn() {
+  ExplicitBool t;
+  t = true;
+}
+}
diff --git a/test/SemaCXX/cxx11-inheriting-ctors.cpp b/test/SemaCXX/cxx11-inheriting-ctors.cpp
index 04aa117..9c33ac0 100644
--- a/test/SemaCXX/cxx11-inheriting-ctors.cpp
+++ b/test/SemaCXX/cxx11-inheriting-ctors.cpp
@@ -34,3 +34,25 @@
     using B::A;
   };
 }
+
+namespace DefaultCtorConflict {
+  struct A { A(int = 0); };
+  struct B : A {
+    using A::A;
+  } b; // ok, not ambiguous, inherited constructor suppresses implicit default constructor
+  struct C {
+    B b;
+  } c;
+}
+
+namespace InvalidConstruction {
+  struct A { A(int); };
+  struct B { B() = delete; };
+  struct C : A, B { using A::A; };
+  // Initialization here is performed as if by a defaulted default constructor,
+  // which would be ill-formed (in the immediate context) in this case because
+  // it would be defined as deleted.
+  template<typename T> void f(decltype(T(0))*);
+  template<typename T> int &f(...);
+  int &r = f<C>(0);
+}
diff --git a/test/SemaCXX/cxx1y-deduced-return-type.cpp b/test/SemaCXX/cxx1y-deduced-return-type.cpp
index e3f6f96..593ec48 100644
--- a/test/SemaCXX/cxx1y-deduced-return-type.cpp
+++ b/test/SemaCXX/cxx1y-deduced-return-type.cpp
@@ -502,3 +502,7 @@
   using T = decltype(x);
   void (T::*p)(int) const = &T::operator();
 }
+
+void forinit_decltypeauto() {
+  for (decltype(auto) forinit_decltypeauto_inner();;) {} // expected-warning {{interpreted as a function}} expected-note {{replace}}
+}
diff --git a/test/SemaCXX/cxx1y-init-captures.cpp b/test/SemaCXX/cxx1y-init-captures.cpp
index d36882d..d681954 100644
--- a/test/SemaCXX/cxx1y-init-captures.cpp
+++ b/test/SemaCXX/cxx1y-init-captures.cpp
@@ -196,3 +196,13 @@
   auto a = [x{X()}] { return x.n; }; // ok
   auto b = [x = {X()}] {}; // expected-error{{<initializer_list>}}
 }
+
+namespace init_capture_non_mutable {
+void test(double weight) {
+  double init;
+  auto find = [max = init](auto current) {
+    max = current; // expected-error{{cannot assign to a variable captured by copy in a non-mutable lambda}}
+  };
+  find(weight); // expected-note {{in instantiation of function template specialization}}
+}
+}
diff --git a/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
index 1c59585..e2fbdfd 100644
--- a/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
+++ b/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fsyntax-only %s -Wno-c++11-extensions -Wno-c++1y-extensions -DPRECXX11
+// RUN: %clang_cc1 -std=c++98 -verify -fsyntax-only %s -Wno-c++11-extensions -Wno-c++1y-extensions -DPRECXX11
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -Wno-c++1y-extensions %s
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only %s -DCPP1Y
 
diff --git a/test/SemaCXX/cxx1y-variable-templates_top_level.cpp b/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
index 787868f..496ae88 100644
--- a/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
+++ b/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fsyntax-only -Wno-c++11-extensions -Wno-c++1y-extensions %s -DPRECXX11
+// RUN: %clang_cc1 -std=c++98 -verify -fsyntax-only -Wno-c++11-extensions -Wno-c++1y-extensions %s -DPRECXX11
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -Wno-c++1y-extensions %s
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only %s
 
@@ -458,3 +458,9 @@
   template<> int g<double>; // expected-error {{no variable template matches specialization; did you mean to use 'g' as function template instead?}}
 }
 
+#ifndef PRECXX11
+template <typename... Args> struct Variadic_t { };
+template <typename... Args> Variadic_t<Args...> Variadic;
+auto variadic1 = Variadic<>;
+auto variadic2 = Variadic<int, int>;
+#endif
diff --git a/test/SemaCXX/cxx1z-constexpr-lambdas.cpp b/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
new file mode 100644
index 0000000..526dd27
--- /dev/null
+++ b/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks %s
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s 
+// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -fblocks %s -DCPP14_AND_EARLIER
+
+
+namespace test_lambda_is_literal {
+#ifdef CPP14_AND_EARLIER
+//expected-error@+4{{not a literal type}}
+//expected-note@+2{{not an aggregate and has no constexpr constructors}}
+#endif
+auto L = [] { };
+constexpr int foo(decltype(L) l) { return 0; }
+
+}
+
+#ifndef CPP14_AND_EARLIER
+namespace test_constexpr_checking {
+
+namespace ns1 {
+  struct NonLit { ~NonLit(); };  //expected-note{{not literal}}
+  auto L = [](NonLit NL) constexpr { }; //expected-error{{not a literal type}}
+} // end ns1
+
+namespace ns2 {
+  auto L = [](int I) constexpr { asm("non-constexpr");  }; //expected-error{{not allowed in constexpr function}}
+} // end ns2
+
+} // end ns test_constexpr_checking
+
+namespace test_constexpr_call {
+
+namespace ns1 {
+  auto L = [](int I) { return I; };
+  static_assert(L(3) == 3);
+} // end ns1
+namespace ns2 {
+  auto L = [](auto a) { return a; };
+  static_assert(L(3) == 3);
+  static_assert(L(3.14) == 3.14);
+}
+namespace ns3 {
+  auto L = [](auto a) { asm("non-constexpr"); return a; }; //expected-note{{declared here}}
+  constexpr int I =  //expected-error{{must be initialized by a constant expression}}
+      L(3); //expected-note{{non-constexpr function}}
+} 
+
+} // end ns test_constexpr_call
+
+#endif // ndef CPP14_AND_EARLIER
+
diff --git a/test/SemaCXX/cxx1z-decomposition.cpp b/test/SemaCXX/cxx1z-decomposition.cpp
new file mode 100644
index 0000000..12c863c
--- /dev/null
+++ b/test/SemaCXX/cxx1z-decomposition.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void use_from_own_init() {
+  auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}
+}
+
+// As a Clang extension, _Complex can be decomposed.
+float decompose_complex(_Complex float cf) {
+  static _Complex float scf;
+  auto &[sre, sim] = scf;
+  // ok, these are references initialized by constant expressions all the way down
+  static_assert(&sre == &__real scf);
+  static_assert(&sim == &__imag scf);
+
+  auto [re, im] = cf;
+  return re*re + im*im;
+}
+
+// As a Clang extension, vector types can be decomposed.
+typedef float vf3 __attribute__((ext_vector_type(3)));
+float decompose_vector(vf3 v) {
+  auto [x, y, z] = v;
+  auto *p = &x; // expected-error {{address of vector element requested}}
+  return x + y + z;
+}
+
+struct S { int a, b; };
+constexpr int f(S s) {
+  auto &[a, b] = s;
+  return a * 10 + b;
+}
+static_assert(f({1, 2}) == 12);
+
+constexpr bool g(S &&s) { 
+  auto &[a, b] = s;
+  return &a == &s.a && &b == &s.b && &a != &b;
+}
+static_assert(g({1, 2}));
+
+void enclosing() {
+  struct S { int a; };
+  auto [n] = S(); // expected-note 2{{'n' declared here}}
+
+  struct Q { int f() { return n; } }; // expected-error {{reference to local binding 'n' declared in enclosing function}}
+  // FIXME: This is probably supposed to be valid, but we do not have clear rules on how it's supposed to work.
+  (void) [&] { return n; }; // expected-error {{reference to local binding 'n' declared in enclosing function}}
+  (void) [n] {}; // expected-error {{'n' in capture list does not name a variable}}
+}
+
+// FIXME: by-value array copies
diff --git a/test/SemaCXX/cxx1z-init-statement-warn-unused.cpp b/test/SemaCXX/cxx1z-init-statement-warn-unused.cpp
new file mode 100644
index 0000000..5390da4
--- /dev/null
+++ b/test/SemaCXX/cxx1z-init-statement-warn-unused.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -std=c++1z -verify -Wuninitialized %s
+
+void testIf() {
+  if (bool b; b) // expected-warning {{uninitialized}} expected-note {{to silence}}
+    ;
+  if (int a, b = 2; a) // expected-warning {{uninitialized}} expected-note {{to silence}}
+    ;
+  int a;
+  if (a = 0; a) {} // OK
+}
+
+void testSwitch() {
+  switch (bool b; b) { // expected-warning {{uninitialized}} expected-warning {{boolean value}} expected-note {{to silence}}
+    case 0:
+      break;
+  }
+  switch (int a, b = 7; a) { // expected-warning {{uninitialized}} expected-note {{to silence}}
+    case 0:
+      break;
+  }
+  int c;
+  switch (c = 0; c) { // OK
+    case 0:
+      break;
+  }
+}
diff --git a/test/SemaCXX/cxx1z-init-statement.cpp b/test/SemaCXX/cxx1z-init-statement.cpp
new file mode 100644
index 0000000..4afe040
--- /dev/null
+++ b/test/SemaCXX/cxx1z-init-statement.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void testIf() {
+  int x = 0;
+  if (x; x) ++x;
+  if (int t = 0; t) ++t; else --t;
+
+  if (int x, y = 0; y) // expected-note 2 {{previous definition is here}}
+    int x = 0; // expected-error {{redefinition of 'x'}}
+  else
+    int x = 0; // expected-error {{redefinition of 'x'}}
+
+  if (x; int a = 0) ++a;
+  if (x, +x; int a = 0) // expected-note 2 {{previous definition is here}} expected-warning {{unused}}
+    int a = 0; // expected-error {{redefinition of 'a'}}
+  else
+    int a = 0; // expected-error {{redefinition of 'a'}}
+
+  if (int b = 0; b)
+    ;
+  b = 2; // expected-error {{use of undeclared identifier}}
+}
+
+void testSwitch() {
+  int x = 0;
+  switch (x; x) {
+    case 1:
+      ++x;
+  }
+
+  switch (int x, y = 0; y) {
+    case 1:
+      ++x;
+    default:
+      ++y;
+  }
+
+  switch (int x, y = 0; y) { // expected-note 2 {{previous definition is here}}
+    case 0:
+      int x = 0; // expected-error {{redefinition of 'x'}}
+    case 1:
+      int y = 0; // expected-error {{redefinition of 'y'}}
+  };
+
+  switch (x; int a = 0) {
+    case 0:
+      ++a;
+  }
+
+  switch (x, +x; int a = 0) { // expected-note {{previous definition is here}} expected-warning {{unused}}
+    case 0:
+      int a = 0; // expected-error {{redefinition of 'a'}} // expected-note {{previous definition is here}}
+    case 1:
+      int a = 0; // expected-error {{redefinition of 'a'}}
+  }
+
+  switch (int b = 0; b) {
+    case 0:
+      break;
+  }
+  b = 2; // expected-error {{use of undeclared identifier}}
+}
+
+constexpr bool constexpr_if_init(int n) {
+  if (int a = n; ++a > 0)
+    return true;
+  else
+    return false;
+}
+
+constexpr int constexpr_switch_init(int n) {
+  switch (int p = n + 2; p) {
+    case 0:
+      return 0;
+    case 1:
+      return 1;
+    default:
+      return -1;
+  }
+}
+
+void test_constexpr_init_stmt() {
+  constexpr bool a = constexpr_if_init(-2);
+  static_assert(!a, "");
+  static_assert(constexpr_if_init(1), "");
+
+  constexpr int b = constexpr_switch_init(-1);
+  static_assert(b == 1, "");
+  static_assert(constexpr_switch_init(-2) == 0, "");
+  static_assert(constexpr_switch_init(-5) == -1, "");
+}
diff --git a/test/SemaCXX/cxx1z-lambda-star-this.cpp b/test/SemaCXX/cxx1z-lambda-star-this.cpp
new file mode 100644
index 0000000..a84e653
--- /dev/null
+++ b/test/SemaCXX/cxx1z-lambda-star-this.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING
+
+template<class, class> constexpr bool is_same = false;
+template<class T> constexpr bool is_same<T, T> = true;
+
+namespace test_star_this {
+namespace ns1 {
+class A {
+  int x = 345;
+  auto foo() {
+    (void) [*this, this] { };  //expected-error{{'this' can appear only once}}
+    (void) [this] { ++x; };
+    (void) [*this] { ++x; };  //expected-error{{read-only variable}}
+    (void) [*this] () mutable { ++x; };
+    (void) [=] { return x; };
+    (void) [&, this] { return x; };
+    (void) [=, *this] { return x; };
+    (void) [&, *this] { return x; };
+  }
+};
+} // end ns1
+
+namespace ns2 {
+  class B {
+    B(const B&) = delete; //expected-note{{deleted here}}
+    int *x = (int *) 456;
+    void foo() {
+      (void)[this] { return x; };
+      (void)[*this] { return x; }; //expected-error{{call to deleted}}
+    }
+  };
+} // end ns2
+namespace ns3 {
+  class B {
+    B(const B&) = delete; //expected-note2{{deleted here}}
+    
+    int *x = (int *) 456;
+    public: 
+    template<class T = int>
+    void foo() {
+      (void)[this] { return x; };
+      (void)[*this] { return x; }; //expected-error2{{call to deleted}}
+    }
+    
+    B() = default;
+  } b;
+  B *c = (b.foo(), nullptr); //expected-note{{in instantiation}}
+} // end ns3
+
+namespace ns4 {
+template<class U>
+class B {
+  B(const B&) = delete; //expected-note{{deleted here}}
+  double d = 3.14;
+  public: 
+  template<class T = int>
+  auto foo() {
+    const auto &L = [*this] (auto a) mutable { //expected-error{{call to deleted}}
+      d += a; 
+      return [this] (auto b) { return d +=b; }; 
+    }; 
+  }
+  
+  B() = default;
+};
+void main() {
+  B<int*> b;
+  b.foo(); //expected-note{{in instantiation}}
+} // end main  
+} // end ns4
+namespace ns5 {
+
+struct X {
+  double d = 3.14;
+  X(const volatile X&);
+  void foo() {
+      
+  }
+  
+  void foo() const { //expected-note{{const}}
+    
+    auto L = [*this] () mutable { 
+      static_assert(is_same<decltype(this), X*>);
+      ++d;
+      auto M = [this] { 
+        static_assert(is_same<decltype(this), X*>);  
+        ++d;
+        auto N = [] {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+      };
+    };
+    
+    auto L1 = [*this] { 
+      static_assert(is_same<decltype(this), const X*>);
+      auto M = [this] () mutable { 
+        static_assert(is_same<decltype(this), const X*>);  
+        auto N = [] {
+          static_assert(is_same<decltype(this), const X*>); 
+        };
+      };
+      auto M2 = [*this] () mutable { 
+        static_assert(is_same<decltype(this), X*>);  
+        auto N = [] {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+      };
+    };
+    
+    auto GL1 = [*this] (auto a) { 
+      static_assert(is_same<decltype(this), const X*>);
+      auto M = [this] (auto b) mutable { 
+        static_assert(is_same<decltype(this), const X*>);  
+        auto N = [] (auto c) {
+          static_assert(is_same<decltype(this), const X*>); 
+        };
+        return N;
+      };
+      
+      auto M2 = [*this] (auto a) mutable { 
+        static_assert(is_same<decltype(this), X*>);  
+        auto N = [] (auto b) {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+        return N;
+      };
+      return [=](auto a) mutable { M(a)(a); M2(a)(a); };
+    };
+    
+    GL1("abc")("abc");
+    
+    
+    auto L2 = [this] () mutable {
+      static_assert(is_same<decltype(this), const X*>);  
+      ++d; //expected-error{{cannot assign}}
+    };
+    auto GL = [*this] (auto a) mutable {
+      static_assert(is_same<decltype(this), X*>);
+      ++d;
+      auto M = [this] (auto b) { 
+        static_assert(is_same<decltype(this), X*>);  
+        ++d;
+        auto N = [] (auto c) {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+        N(3.14);
+      };
+      M("abc");
+    };
+    GL(3.14);
+ 
+  }
+  void foo() volatile const {
+    auto L = [this] () {
+      static_assert(is_same<decltype(this), const volatile X*>);
+      auto M = [*this] () mutable { 
+        static_assert(is_same<decltype(this), X*>);
+        auto N = [this] {
+          static_assert(is_same<decltype(this), X*>);
+          auto M = [] {
+            static_assert(is_same<decltype(this), X*>);
+          };
+        };
+        auto N2 = [*this] {
+          static_assert(is_same<decltype(this), const X*>);
+        };
+      };
+      auto M2 = [*this] () {
+        static_assert(is_same<decltype(this), const X*>); 
+        auto N = [this] {
+          static_assert(is_same<decltype(this), const X*>);
+        };
+      };
+    };
+  }
+  
+};
+
+} //end ns5
+namespace ns6 {
+struct X {
+  double d;
+  auto foo() const {
+    auto L = [*this] () mutable {
+      auto M = [=] (auto a) {
+        auto N = [this] {
+          ++d;
+          static_assert(is_same<decltype(this), X*>);
+          auto O = [*this] {
+            static_assert(is_same<decltype(this), const X*>);
+          };
+        };
+        N();
+        static_assert(is_same<decltype(this), X*>);
+      };
+      return M;
+    };
+    return L;
+  }
+}; 
+
+int main() {
+  auto L = X{}.foo();
+  auto M = L();
+  M(3.14);
+}
+} // end ns6
+namespace ns7 {
+
+struct X {
+  double d;
+  X();
+  X(const X&); 
+  X(X&) = delete;
+  auto foo() const {
+    //OK - the object used to initialize our capture is a const object and so prefers the non-deleted ctor.
+    const auto &&L = [*this] { };
+  }
+  
+}; 
+int main() {
+  X x;
+  x.foo();
+}
+} // end ns7
+
+} //end ns test_star_this
+
diff --git a/test/SemaCXX/dcl_init_aggr.cpp b/test/SemaCXX/dcl_init_aggr.cpp
index 432c116..2b5149c 100644
--- a/test/SemaCXX/dcl_init_aggr.cpp
+++ b/test/SemaCXX/dcl_init_aggr.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify -std=c++11 %s
 // C++ [dcl.init.aggr]p2
 struct A { 
   int x;
@@ -9,14 +11,29 @@
 } a1 = { 1, { 2, 3 } };
 
 struct NonAggregate {
+#if __cplusplus >= 201103L
+// expected-note@-2 3 {{candidate constructor (the implicit copy constructor) not viable}}
+// expected-note@-3 3 {{candidate constructor (the implicit move constructor) not viable}}
+#endif
   NonAggregate();
-
+#if __cplusplus >= 201103L
+// expected-note@-2 3 {{candidate constructor not viable: requires 0 arguments, but 2 were provided}}
+#endif
   int a, b;
 };
-NonAggregate non_aggregate_test = { 1, 2 }; // expected-error{{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
+NonAggregate non_aggregate_test = { 1, 2 };
+#if __cplusplus <= 199711L
+// expected-error@-2 {{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
+#else
+// expected-error@-4 {{no matching constructor for initialization of 'NonAggregate'}}
+#endif
 
-NonAggregate non_aggregate_test2[2] = { { 1, 2 }, { 3, 4 } }; // expected-error 2 {{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
-
+NonAggregate non_aggregate_test2[2] = { { 1, 2 }, { 3, 4 } };
+#if __cplusplus <= 199711L
+// expected-error@-2 2 {{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
+#else
+// expected-error@-4 2 {{no matching constructor for initialization of 'NonAggregate'}}
+#endif
 
 // C++ [dcl.init.aggr]p3
 A a_init = A(); 
@@ -38,20 +55,55 @@
 
 // C++ [dcl.init.aggr]p7
 struct TooFew { int a; char* b; int c; }; 
-TooFew too_few = { 1, "asdf" }; // expected-warning{{conversion from string literal to 'char *' is deprecated}}
+TooFew too_few = { 1, "asdf" };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{conversion from string literal to 'char *' is deprecated}}
+#else
+// expected-warning@-4 {{ISO C++11 does not allow conversion from string literal to 'char *'}}
+#endif
 
-struct NoDefaultConstructor { // expected-note 3 {{candidate constructor (the implicit copy constructor)}} \
-                              // expected-note{{declared here}}
-  NoDefaultConstructor(int); // expected-note 3 {{candidate constructor}}
+struct NoDefaultConstructor {
+#if __cplusplus <= 199711L
+// expected-note@-2 3 {{candidate constructor (the implicit copy constructor)}}
+// expected-note@-3 {{declared here}}
+#else
+// expected-note@-5 4 {{candidate constructor (the implicit copy constructor)}}
+// expected-note@-6 4 {{candidate constructor (the implicit move constructor)}}
+#endif
+
+  NoDefaultConstructor(int);
+#if __cplusplus <= 199711L
+  // expected-note@-2 3 {{candidate constructor not viable: requires 1 argument, but 0 were provided}}
+#else
+  // expected-note@-4 4 {{candidate constructor not viable: requires 1 argument, but 0 were provided}}
+#endif
+
 };
-struct TooFewError { // expected-error{{implicit default constructor for}}
+struct TooFewError {
+#if __cplusplus <= 199711L
+// expected-error@-2 {{implicit default constructor for}}
+#endif
+
   int a;
-  NoDefaultConstructor nodef; // expected-note{{member is declared here}} expected-note 2{{in implicit initialization of field 'nodef'}}
+  NoDefaultConstructor nodef;
+#if __cplusplus <= 199711L
+// expected-note@-2 {{member is declared here}}
+// expected-note@-3 2{{in implicit initialization of field 'nodef' with omitted initializer}}
+#else
+// expected-note@-5 3{{in implicit initialization of field 'nodef' with omitted initializer}}
+#endif
 };
 TooFewError too_few_okay = { 1, 1 };
 TooFewError too_few_error = { 1 }; // expected-error{{no matching constructor}}
 
-TooFewError too_few_okay2[2] = { 1, 1 }; // expected-note{{implicit default constructor for 'TooFewError' first required here}}
+TooFewError too_few_okay2[2] = { 1, 1 };
+#if __cplusplus <= 199711L
+// expected-note@-2 {{implicit default constructor for 'TooFewError' first required here}}
+#else
+// expected-error@-4 {{no matching constructor for initialization of 'NoDefaultConstructor'}}
+// expected-note@-5 {{in implicit initialization of array element 1 with omitted initializer}}
+#endif
+
 TooFewError too_few_error2[2] = { 1 }; // expected-error{{no matching constructor}}
 
 NoDefaultConstructor too_few_error3[3] = { }; // expected-error {{no matching constructor}} expected-note {{implicit initialization of array element 0}}
@@ -116,6 +168,10 @@
 
 // C++ [dcl.init.aggr]p15:
 union u { int a; char* b; }; // expected-note{{candidate constructor (the implicit copy constructor)}}
+#if __cplusplus >= 201103L
+// expected-note@-2 {{candidate constructor (the implicit move constructor)}}
+#endif
+
 u u1 = { 1 }; 
 u u2 = u1; 
 u u3 = 1; // expected-error{{no viable conversion}}
diff --git a/test/SemaCXX/default2.cpp b/test/SemaCXX/default2.cpp
index c4d40b4..8f77f30 100644
--- a/test/SemaCXX/default2.cpp
+++ b/test/SemaCXX/default2.cpp
@@ -128,3 +128,7 @@
 
 template <int I1 = I2, int I2 = 1> struct T {};  // expected-error-re {{use of undeclared identifier 'I2'{{$}}}}
 T<0, 1> t;
+
+struct PR28105 {
+  PR28105 (int = 0, int = 0, PR28105 = 0);  // expected-error{{recursive evaluation of default argument}}
+};
diff --git a/test/SemaCXX/delete-and-function-templates.cpp b/test/SemaCXX/delete-and-function-templates.cpp
new file mode 100644
index 0000000..22e95cb
--- /dev/null
+++ b/test/SemaCXX/delete-and-function-templates.cpp
@@ -0,0 +1,133 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fdelayed-template-parsing %s 
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fms-extensions %s 
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fdelayed-template-parsing -fms-extensions %s 
+
+template<class T, class U> struct is_same { enum { value = false }; };
+template<class T> struct is_same<T, T> { enum { value = true }; };
+
+namespace test_sfinae_and_delete {
+
+namespace ns1 {
+template<class T> double f(T) = delete; //expected-note{{candidate}}
+char f(...); //expected-note{{candidate}}
+
+static_assert(is_same<decltype(f(3)),char>::value, ""); //expected-error{{call to deleted function}} expected-error{{static_assert failed}}
+
+template<class T> decltype(f(T{})) g(T); // this one sfinae's out.
+template<class T> int *g(T);
+void foo() {
+  int *ip = g(3);
+}
+} //end ns1
+
+namespace ns2 {
+template<class T> double* f(T);
+template<> double* f(double) = delete;
+
+template<class T> decltype(f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+void foo() {
+  double *dp = g(3); //expected-error{{ambiguous}}
+  int *ip = g(3.14); // this is OK - because the explicit specialization is deleted and sfinae's out one of the template candidates
+}
+
+} // end ns2
+
+namespace ns3 {
+template<class T> double* f(T) = delete;
+template<> double* f(double);
+
+template<class T> decltype(f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+
+void foo() {
+  int *dp = g(3); // this is OK - because the non-double specializations are deleted and sfinae's out one of the template candidates
+  double *ip = g(3.14); //expected-error{{ambiguous}}
+}
+
+} // end ns3
+} // end ns test_sfinae_and_delete
+
+namespace test_explicit_specialization_of_member {
+namespace ns1 {
+template<class T> struct X {
+  int* f(T) = delete;
+}; 
+template<> int* X<int>::f(int) { }
+
+template<class T> decltype(X<T>{}.f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+
+void foo() {
+  int *ip2 = g(3.14); // this is OK - because the non-int specializations are deleted and sfinae's out one of the template candidates
+  int *ip = g(3); //expected-error{{ambiguous}}
+}
+
+} // end ns1
+
+namespace ns2 {
+struct X {
+template<class T> double* f(T) = delete;
+}; 
+template<> double* X::f(int);
+
+template<class T> decltype(X{}.f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+
+void foo() {
+  int *ip2 = g(3.14); // this is OK - because the non-int specializations are deleted and sfinae's out one of the template candidates
+  int *ip = g(3); //expected-error{{ambiguous}}
+}
+
+} // end ns2
+
+namespace ns3 {
+template<class T> struct X {
+  template<class U> double *f1(U, T) = delete;
+  template<class U> double *f2(U, T) = delete;
+};
+template<> template<> double* X<int>::f1(int, int);
+template<> template<class U> double* X<int>::f2(U, int);
+
+template<class T, class U> decltype(X<T>{}.f1(U{}, T{})) g1(U, T); // expected-note{{candidate}}
+template<class T, class U> int *g1(U, T); //expected-note{{candidate}}
+
+template<class T, class U> decltype(X<T>{}.f2(U{}, T{})) g2(U, T); // expected-note2{{candidate}}
+template<class T, class U> int *g2(U, T); //expected-note2{{candidate}}
+
+
+void foo() {
+  int *ip2 = g1(3.14, 3); // this is OK - because the non-int specializations are deleted and sfinae's out one of the template candidates
+  int *ip = g1(3, 3); //expected-error{{ambiguous}}
+  {
+   int *ip3 = g2(3.14, 3); //expected-error{{ambiguous}}
+   int *ip4 = g2(3, 3); //expected-error{{ambiguous}}
+  }
+  {
+   int *ip3 = g2(3.14, 3.14); 
+   int *ip4 = g2(3, 3.14); 
+  }
+}
+
+
+} // end ns3
+
+namespace ns4 {
+template < typename T> T* foo (T);
+template <> int* foo(int) = delete;
+template <> int* foo(int); //expected-note{{candidate}}
+
+int *IP = foo(2); //expected-error{{deleted}}
+double *DP = foo(3.14);
+} //end ns4
+
+namespace ns5 {
+template < typename T> T* foo (T);
+template <> int* foo(int); //expected-note{{previous}}
+template <> int* foo(int) = delete; //expected-error{{deleted definition must be first declaration}}
+
+} //end ns5
+
+
+} // end test_explicit_specialization_of_member
diff --git a/test/SemaCXX/deleted-operator.cpp b/test/SemaCXX/deleted-operator.cpp
index df67978..f71e83a 100644
--- a/test/SemaCXX/deleted-operator.cpp
+++ b/test/SemaCXX/deleted-operator.cpp
@@ -9,7 +9,7 @@
   PR10757 a1;
   // FIXME: We get a ridiculous number of "built-in candidate" notes here...
   if(~a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 8 {{built-in candidate}}
-  if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 121 {{built-in candidate}}
+  if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 144 {{built-in candidate}}
 }
 
 struct DelOpDel {
diff --git a/test/SemaCXX/dllexport.cpp b/test/SemaCXX/dllexport.cpp
index 0bbf9b3..b4850fc 100644
--- a/test/SemaCXX/dllexport.cpp
+++ b/test/SemaCXX/dllexport.cpp
@@ -16,13 +16,19 @@
 
 
 // Invalid usage.
-__declspec(dllexport) typedef int typedef1; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllexport) int typedef2; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-typedef int __declspec(dllexport) typedef3; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllexport) void (*FunTy)(); // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-enum __declspec(dllexport) Enum {}; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+enum __declspec(dllexport) Enum {};
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
 #if __has_feature(cxx_strong_enums)
-  enum class __declspec(dllexport) EnumClass {}; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
+enum class __declspec(dllexport) EnumClass {};
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
 #endif
 
 
diff --git a/test/SemaCXX/dllimport.cpp b/test/SemaCXX/dllimport.cpp
index 5d8ce78..36a8ac6 100644
--- a/test/SemaCXX/dllimport.cpp
+++ b/test/SemaCXX/dllimport.cpp
@@ -15,13 +15,19 @@
 
 
 // Invalid usage.
-__declspec(dllimport) typedef int typedef1; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllimport) int typedef2; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-typedef int __declspec(dllimport) typedef3; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllimport) void (*FunTy)(); // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-enum __declspec(dllimport) Enum {}; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+enum __declspec(dllimport) Enum {};
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
 #if __has_feature(cxx_strong_enums)
-  enum class __declspec(dllimport) EnumClass {}; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
+enum class __declspec(dllimport) EnumClass {};
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
 #endif
 
 
@@ -44,17 +50,49 @@
 int __declspec(dllimport) GlobalInit2 = 1; // expected-error{{definition of dllimport data}}
 
 // Declare, then reject definition.
-__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int ExternGlobalDeclInit = 1; // expected-warning{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int ExternGlobalDeclInit = 1;
 
-__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclInit = 1; // expected-warning{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclInit = 1;
 
-int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int *GlobalDeclChunkAttrInit = 0; // expected-warning{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int *GlobalDeclChunkAttrInit = 0;
 
-int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclAttrInit = 1; // expected-warning{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclAttrInit = 1;
 
 // Redeclarations
 __declspec(dllimport) extern int GlobalRedecl1;
@@ -69,8 +107,6 @@
 int GlobalRedecl2c __attribute__((dllimport));
 int GlobalRedecl2c __attribute__((dllimport));
 
-// NB: MSVC issues a warning and makes GlobalRedecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
 __declspec(dllimport) extern int GlobalRedecl3; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       extern int GlobalRedecl3; // expected-warning{{'GlobalRedecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
@@ -135,11 +171,31 @@
 template<typename T> int __declspec(dllimport) VarTmplInit2 = 1; // expected-error{{definition of dllimport data}}
 
 // Declare, then reject definition.
-template<typename T> __declspec(dllimport) extern int ExternVarTmplDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-template<typename T>                              int ExternVarTmplDeclInit = 1; // expected-warning{{'ExternVarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+3{{previous attribute is here}}
+#endif
+template <typename T>
+__declspec(dllimport) extern int ExternVarTmplDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+5{{'ExternVarTmplDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ExternVarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+int ExternVarTmplDeclInit = 1;
 
-template<typename T> __declspec(dllimport) int VarTmplDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-template<typename T>                       int VarTmplDeclInit = 1; // expected-warning{{'VarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+3{{previous attribute is here}}
+#endif
+template <typename T>
+__declspec(dllimport) int VarTmplDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+5{{'VarTmplDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'VarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+int VarTmplDeclInit = 1;
 
 // Redeclarations
 template<typename T> __declspec(dllimport) extern int VarTmplRedecl1;
@@ -238,13 +294,20 @@
 __declspec(dllimport) void redecl1();
 __declspec(dllimport) void redecl1();
 
-// NB: MSVC issues a warning and makes redecl2/redecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
 __declspec(dllimport) void redecl2(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       void redecl2(); // expected-warning{{'redecl2' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
-__declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-                      void redecl3() {} // expected-warning{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+                      // expected-note@+2{{previous attribute is here}}
+#endif
+                      __declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}}
+                      // NB: In MS mode Clang follows MSVC: it warns and makes redecl3 dllexport; otherwise dllimport is dropped with a warning.
+#ifdef MS
+                      // expected-warning@+4{{'redecl3' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                      // expected-warning@+2{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+                      void redecl3() {}
 
                       void redecl4(); // expected-note{{previous declaration is here}}
 __declspec(dllimport) void redecl4(); // expected-warning{{redeclaration of 'redecl4' should not add 'dllimport' attribute}}
@@ -266,7 +329,10 @@
 struct FuncFriend {
   friend __declspec(dllimport) void friend1();
   friend __declspec(dllimport) void friend2(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-  friend __declspec(dllimport) void friend3(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  friend __declspec(dllimport) void friend3(); // expected-note{{previous declaration is here}}
   friend                       void friend4(); // expected-note{{previous declaration is here}}
 #ifdef MS
 // expected-note@+2{{previous declaration is here}}
@@ -275,7 +341,12 @@
 };
 __declspec(dllimport) void friend1();
                       void friend2(); // expected-warning{{'friend2' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
-                      void friend3() {} // expected-warning{{'friend3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+                      // expected-warning@+4{{'friend3' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                      // expected-warning@+2{{'friend3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+                      void friend3() {}
 __declspec(dllimport) void friend4(); // expected-warning{{redeclaration of 'friend4' should not add 'dllimport' attribute}}
 #ifdef MS
 __declspec(dllimport) inline void friend5() {} // expected-warning{{redeclaration of 'friend5' should not add 'dllimport' attribute}}
@@ -447,33 +518,39 @@
 struct ImportMembers {
   struct Nested {
     __declspec(dllimport) void normalDecl();
-    __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+    __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}}
   };
 
 #ifdef GNU
+// expected-note@+5{{previous attribute is here}}
 // expected-warning@+5{{'dllimport' attribute ignored on inline function}}
 // expected-warning@+6{{'dllimport' attribute ignored on inline function}}
 #endif
   __declspec(dllimport)                void normalDecl();
-  __declspec(dllimport)                void normalDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+  __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport)                void normalInclass() {}
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport)         inline void normalInlineDecl();
 #ifdef GNU
+// expected-note@+5{{previous attribute is here}}
 // expected-warning@+5{{'dllimport' attribute ignored on inline function}}
 // expected-warning@+6{{'dllimport' attribute ignored on inline function}}
 #endif
   __declspec(dllimport) virtual        void virtualDecl();
-  __declspec(dllimport) virtual        void virtualDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+  __declspec(dllimport) virtual void virtualDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) virtual        void virtualInclass() {}
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) virtual inline void virtualInlineDecl();
 #ifdef GNU
+// expected-note@+5{{previous attribute is here}}
 // expected-warning@+5{{'dllimport' attribute ignored on inline function}}
 // expected-warning@+6{{'dllimport' attribute ignored on inline function}}
 #endif
   __declspec(dllimport) static         void staticDecl();
-  __declspec(dllimport) static         void staticDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+  __declspec(dllimport) static void staticDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) static         void staticInclass() {}
   __declspec(dllimport) static         void staticInlineDef();
   __declspec(dllimport) static  inline void staticInlineDecl();
@@ -495,20 +572,40 @@
   __declspec(dllimport) constexpr static int ConstexprFieldDef = 1; // expected-note{{attribute is here}}
 };
 
-       void ImportMembers::Nested::normalDef() {} // expected-warning{{'ImportMembers::Nested::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
-       void ImportMembers::normalDef() {} // expected-warning{{'ImportMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+4{{'ImportMembers::Nested::normalDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::Nested::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+void ImportMembers::Nested::normalDef() {}
+#ifdef MS
+// expected-warning@+4{{'ImportMembers::normalDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+void ImportMembers::normalDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportMembers::normalInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 inline void ImportMembers::normalInlineDef() {}
        void ImportMembers::normalInlineDecl() {}
-       void ImportMembers::virtualDef() {} // expected-warning{{'ImportMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+       // expected-warning@+4{{'ImportMembers::virtualDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+       void ImportMembers::virtualDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportMembers::virtualInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 inline void ImportMembers::virtualInlineDef() {}
        void ImportMembers::virtualInlineDecl() {}
-       void ImportMembers::staticDef() {} // expected-warning{{'ImportMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+       // expected-warning@+4{{'ImportMembers::staticDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+       void ImportMembers::staticDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportMembers::staticInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
@@ -620,7 +717,10 @@
 // Import defaulted member function definitions.
 struct ImportDefaultedDefs {
   __declspec(dllimport) ImportDefaultedDefs();
-  __declspec(dllimport) ~ImportDefaultedDefs(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) ~ImportDefaultedDefs(); // expected-note{{previous declaration is here}}
 
 #ifdef GNU
 // expected-warning@+3{{'dllimport' attribute ignored on inline function}}
@@ -630,14 +730,22 @@
   __declspec(dllimport) ImportDefaultedDefs& operator=(const ImportDefaultedDefs&);
 
   __declspec(dllimport) ImportDefaultedDefs(ImportDefaultedDefs&&);
-  __declspec(dllimport) ImportDefaultedDefs& operator=(ImportDefaultedDefs&&); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) ImportDefaultedDefs &operator=(ImportDefaultedDefs &&); // expected-note{{previous declaration is here}}
 };
 
 // Not allowed on definitions.
 __declspec(dllimport) ImportDefaultedDefs::ImportDefaultedDefs() = default; // expected-error{{dllimport cannot be applied to non-inline function definition}}
 
+#ifdef MS
+// expected-warning@+5{{'ImportDefaultedDefs::~ImportDefaultedDefs' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportDefaultedDefs::~ImportDefaultedDefs' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
 // dllimport cannot be dropped.
-ImportDefaultedDefs::~ImportDefaultedDefs() = default; // expected-warning{{'ImportDefaultedDefs::~ImportDefaultedDefs' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+ImportDefaultedDefs::~ImportDefaultedDefs() = default;
 
 // Import inline declaration and definition.
 #ifdef GNU
@@ -648,8 +756,12 @@
 inline ImportDefaultedDefs& ImportDefaultedDefs::operator=(const ImportDefaultedDefs&) = default;
 
 __declspec(dllimport) ImportDefaultedDefs::ImportDefaultedDefs(ImportDefaultedDefs&&) = default; // expected-error{{dllimport cannot be applied to non-inline function definition}}
-ImportDefaultedDefs& ImportDefaultedDefs::operator=(ImportDefaultedDefs&&) = default; // expected-warning{{'ImportDefaultedDefs::operator=' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
-
+#ifdef MS
+// expected-warning@+4{{'ImportDefaultedDefs::operator=' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'ImportDefaultedDefs::operator=' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+ImportDefaultedDefs &ImportDefaultedDefs::operator=(ImportDefaultedDefs &&) = default;
 
 // Redeclarations cannot add dllimport.
 struct MemberRedecl {
@@ -970,13 +1082,22 @@
 template<typename T>
 struct ImportClassTmplMembers {
   __declspec(dllimport)                void normalDecl();
-  __declspec(dllimport)                void normalDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport) virtual        void virtualDecl();
-  __declspec(dllimport) virtual        void virtualDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) virtual void virtualDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) static         void staticDecl();
-  __declspec(dllimport) static         void staticDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) static void staticDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) static         void staticInlineDef();
 
 #ifdef GNU
@@ -1013,19 +1134,37 @@
 
 // NB: MSVC is inconsistent here and disallows *InlineDef on class templates,
 // but allows it on classes. We allow both.
-template<typename T>        void ImportClassTmplMembers<T>::normalDef() {} // expected-warning{{'ImportClassTmplMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+5{{'ImportClassTmplMembers::normalDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportClassTmplMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+void ImportClassTmplMembers<T>::normalDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportClassTmplMembers::normalInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 template<typename T> inline void ImportClassTmplMembers<T>::normalInlineDef() {}
 template<typename T>        void ImportClassTmplMembers<T>::normalInlineDecl() {}
-template<typename T>        void ImportClassTmplMembers<T>::virtualDef() {} // expected-warning{{'ImportClassTmplMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+5{{'ImportClassTmplMembers::virtualDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportClassTmplMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+void ImportClassTmplMembers<T>::virtualDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportClassTmplMembers::virtualInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 template<typename T> inline void ImportClassTmplMembers<T>::virtualInlineDef() {}
 template<typename T>        void ImportClassTmplMembers<T>::virtualInlineDecl() {}
-template<typename T>        void ImportClassTmplMembers<T>::staticDef() {} // expected-warning{{'ImportClassTmplMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+5{{'ImportClassTmplMembers::staticDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportClassTmplMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+void ImportClassTmplMembers<T>::staticDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportClassTmplMembers::staticInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
diff --git a/test/SemaCXX/enable_if.cpp b/test/SemaCXX/enable_if.cpp
index a2795c4..8130813 100644
--- a/test/SemaCXX/enable_if.cpp
+++ b/test/SemaCXX/enable_if.cpp
@@ -116,9 +116,9 @@
   void g() { f(); }
 };
 
-int fn3(bool b) __attribute__((enable_if(b, "")));
+int fn3(bool b) __attribute__((enable_if(b, ""))); // FIXME: This test should net 0 error messages.
 template <class T> void test3() {
-  fn3(sizeof(T) == 1);
+  fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} expected-note@-2{{candidate disabled}}
 }
 
 template <typename T>
@@ -138,7 +138,7 @@
 void h(int);
 template <typename T> void outer() {
   void local_function() __attribute__((enable_if(::h(T()), "")));
-  local_function();
+  local_function(); // expected-error{{no matching function for call to 'local_function'}} expected-note@-1{{candidate disabled}}
 };
 
 namespace PR20988 {
@@ -158,9 +158,9 @@
     fn2(expr);  // expected-error{{no matching function for call to 'fn2'}}
   }
 
-  int fn3(bool b) __attribute__((enable_if(b, "")));
+  int fn3(bool b) __attribute__((enable_if(b, ""))); // FIXME: This test should net 0 error messages.
   template <class T> void test3() {
-    fn3(sizeof(T) == 1);
+    fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} expected-note@-2{{candidate disabled}}
   }
 }
 
@@ -386,3 +386,57 @@
   f.bar(1, 2); // expected-error{{too many arguments}}
 }
 }
+
+// Ideally, we should be able to handle value-dependent expressions sanely.
+// Sadly, that isn't the case at the moment.
+namespace dependent { // enable_if conditions that are value-dependent at the call site inside a template
+int error(int N) __attribute__((enable_if(N, ""))); // expected-note{{candidate disabled}}
+int error(int N) __attribute__((enable_if(!N, ""))); // expected-note{{candidate disabled}}
+template <int N> int callUnavailable() {
+  return error(N); // expected-error{{no matching function for call to 'error'}}
+}
+
+constexpr int noError(int N) __attribute__((enable_if(N, ""))) { return -1; }
+constexpr int noError(int N) __attribute__((enable_if(!N, ""))) { return -1; }
+constexpr int noError(int N) { return 0; } // no enable_if: the only overload that returns 0
+
+template <int N>
+constexpr int callNoError() { return noError(N); }
+static_assert(callNoError<0>() == 0, ""); // 0 means the overload without enable_if was chosen
+static_assert(callNoError<1>() == 0, ""); // same result even though N is nonzero
+
+template <int N> constexpr int templated() __attribute__((enable_if(N, ""))) {
+  return 1;
+}
+
+constexpr int A = templated<0>(); // expected-error{{no matching function for call to 'templated'}} expected-note@-4{{candidate disabled}}
+static_assert(templated<1>() == 1, ""); // nonzero N satisfies the condition
+
+template <int N> constexpr int callTemplated() { return templated<N>(); }
+
+constexpr int B = callTemplated<0>(); // expected-error{{initialized by a constant expression}} expected-error@-2{{no matching function for call to 'templated'}} expected-note{{in instantiation of function template}} expected-note@-9{{candidate disabled}}
+static_assert(callTemplated<1>() == 1, ""); // the condition is checked with the concrete N on instantiation
+} // namespace dependent
+
+namespace variadic { // enable_if on a C-style variadic function; the condition reads only a and b
+void foo(int a, int b = 0, ...) __attribute__((enable_if(a && b, ""))); // expected-note 6{{disabled}}
+
+void testFoo() {
+  foo(1, 1); // a && b holds: accepted with zero, one or two variadic arguments
+  foo(1, 1, 2);
+  foo(1, 1, 2, 3);
+
+  foo(1, 0); // expected-error{{no matching}}
+  foo(1, 0, 2); // expected-error{{no matching}}
+  foo(1, 0, 2, 3); // expected-error{{no matching}}
+
+  int m; // non-constant value, used below only in the variadic positions
+  foo(1, 1);
+  foo(1, 1, m);
+  foo(1, 1, m, 3);
+
+  foo(1, 0); // expected-error{{no matching}}
+  foo(1, 0, m); // expected-error{{no matching}}
+  foo(1, 0, m, 3); // expected-error{{no matching}}
+}
+} // namespace variadic
diff --git a/test/SemaCXX/enum-scoped.cpp b/test/SemaCXX/enum-scoped.cpp
index 9098023..142edd3 100644
--- a/test/SemaCXX/enum-scoped.cpp
+++ b/test/SemaCXX/enum-scoped.cpp
@@ -298,8 +298,8 @@
   int E::*p; // expected-error {{does not point into a class}}
   using E::f; // expected-error {{no member named 'f'}}
 
-  using E::a; // ok!
-  E b = a;
+  using E::a; // expected-error {{using declaration cannot refer to a scoped enumerator}}
+  E b = a; // expected-error {{undeclared}}
 }
 
 namespace test11 {
diff --git a/test/SemaCXX/eval-sizeof-dependent-type.cpp b/test/SemaCXX/eval-sizeof-dependent-type.cpp
new file mode 100644
index 0000000..1a5564a
--- /dev/null
+++ b/test/SemaCXX/eval-sizeof-dependent-type.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -std=c++11 -x c++ %s
+
+typedef __SIZE_TYPE__ size_t; // avoids including any header
+template <typename _Tp, size_t _Nm> struct array { _Tp _M_elems[_Nm]; }; // minimal aggregate wrapping a fixed-size array
+template <typename T> struct s {
+  array<int, 1> v{static_cast<int>(sizeof (T) / sizeof(T))}; // member initializer built from sizeof of the dependent type T
+}; // s is never instantiated here; the RUN line has no -verify, so the file must simply compile cleanly
+
diff --git a/test/SemaCXX/exceptions.cpp b/test/SemaCXX/exceptions.cpp
index 9802a1a..9e76783 100644
--- a/test/SemaCXX/exceptions.cpp
+++ b/test/SemaCXX/exceptions.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 %s
 
 struct A; // expected-note 4 {{forward declaration of 'A'}}
 
@@ -135,16 +137,29 @@
   void f() throw (int*, int());
 
   template<typename T> struct C {
-    void f() throw (T); // expected-error {{pointer to incomplete type 'Decay::E' is not allowed in exception specification}}
+    void f() throw (T);
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{pointer to incomplete type 'Decay::E' is not allowed in exception specification}}
+#endif
   };
   struct D {
     C<D[10]> c;
   };
-  struct E; // expected-note {{forward declaration}}
-  C<E[10]> e; // expected-note {{in instantiation of}}
+  struct E;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{forward declaration of 'Decay::E'}}
+#endif
+
+  C<E[10]> e;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{in instantiation of template class 'Decay::C<Decay::E [10]>' requested here}}
+#endif
 }
 
-void rval_ref() throw (int &&); // expected-error {{rvalue reference type 'int &&' is not allowed in exception specification}} expected-warning {{C++11}}
+void rval_ref() throw (int &&); // expected-error {{rvalue reference type 'int &&' is not allowed in exception specification}}
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{rvalue references are a C++11 extension}}
+#endif
 
 namespace HandlerInversion {
 struct B {};
@@ -253,3 +268,17 @@
   }
 }
 }
+
+namespace PR28047 {
+void test1(int i) {
+  try {
+  } catch (int(*)[i]) { // expected-error{{cannot catch variably modified type}}
+  }
+}
+void test2() {
+  int i;
+  try {
+  } catch (int(*)[i]) { // expected-error{{cannot catch variably modified type}}
+  }
+}
+}
diff --git a/test/SemaCXX/extern-c.cpp b/test/SemaCXX/extern-c.cpp
index 295d1f3..fa6c2b1 100644
--- a/test/SemaCXX/extern-c.cpp
+++ b/test/SemaCXX/extern-c.cpp
@@ -204,3 +204,41 @@
   struct pr5065_n6 : public virtual pr5065_3 {};
 }
 struct pr5065_n7 {};
+
+namespace tag_hiding {
+  namespace namespace_with_injected_name {
+    class Boo {
+      friend struct ExternCStruct1;
+    };
+    void ExternCStruct4(); // expected-note 2{{candidate}}
+  }
+
+  class Baz {
+    friend struct ExternCStruct2;
+    friend void ExternCStruct3();
+  };
+
+  using namespace namespace_with_injected_name;
+
+  extern "C" {
+    struct ExternCStruct1;
+    struct ExternCStruct2;
+    struct ExternCStruct3;
+    struct ExternCStruct4; // expected-note {{candidate}}
+  }
+  ExternCStruct1 *p1;
+  ExternCStruct2 *p2;
+  ExternCStruct3 *p3;
+  ExternCStruct4 *p4; // expected-error {{ambiguous}}
+
+  extern "C" {
+    struct ExternCStruct1;
+    struct ExternCStruct2;
+    struct ExternCStruct3;
+    struct ExternCStruct4; // expected-note {{candidate}}
+  }
+  ExternCStruct1 *q1 = p1;
+  ExternCStruct2 *q2 = p2;
+  ExternCStruct3 *q3 = p3;
+  ExternCStruct4 *q4 = p4; // expected-error {{ambiguous}}
+}
diff --git a/test/SemaCXX/for-range-examples.cpp b/test/SemaCXX/for-range-examples.cpp
index 9359ae6..08a9982 100644
--- a/test/SemaCXX/for-range-examples.cpp
+++ b/test/SemaCXX/for-range-examples.cpp
@@ -176,9 +176,9 @@
 
     // Make sure these don't crash. Better diagnostics would be nice.
     for (: {1, 2, 3}) {} // expected-error {{expected expression}} expected-error {{expected ';'}}
-    for (1 : {1, 2, 3}) {} // expected-error {{must declare a variable}} expected-warning {{result unused}}
+    for (1 : {1, 2, 3}) {} // expected-error {{must declare a variable}}
     for (+x : {1, 2, 3}) {} // expected-error {{undeclared identifier}} expected-error {{expected ';'}}
-    for (+y : {1, 2, 3}) {} // expected-error {{must declare a variable}} expected-warning {{result unused}}
+    for (+y : {1, 2, 3}) {} // expected-error {{must declare a variable}}
   }
 }
 
@@ -226,7 +226,7 @@
     // we check the alignment attribute before we perform the auto
     // deduction.
     for (d alignas(1) : arr) {} // expected-error {{requires type for loop variable}}
-    for (e [[deprecated]] : arr) { e = 0; } // expected-warning{{use of the 'deprecated' attribute is a C++14 extension}} expected-warning {{deprecated}} expected-note {{here}} expected-error {{requires type for loop variable}}
+    for (e [[deprecated]] : arr) { e = 0; } // expected-warning {{deprecated}} expected-note {{here}} expected-error {{requires type for loop variable}}
   }
 }
 
diff --git a/test/SemaCXX/friend.cpp b/test/SemaCXX/friend.cpp
index a8e2043..4f27f4d 100644
--- a/test/SemaCXX/friend.cpp
+++ b/test/SemaCXX/friend.cpp
@@ -147,11 +147,13 @@
     }
     using ns2::f; // expected-note {{using declaration}}
   }
-  struct A { void f(); }; // expected-note {{target of using declaration}}
+  struct A { void f(); }; // expected-note 2{{target of using declaration}}
   struct B : public A { using A::f; }; // expected-note {{using declaration}}
+  template<typename T> struct C : A { using A::f; }; // expected-note {{using declaration}}
   struct X {
     template<class T> friend void ns1::f(T t); // expected-error {{cannot befriend target of using declaration}}
     friend void B::f(); // expected-error {{cannot befriend target of using declaration}}
+    friend void C<int>::f(); // expected-error {{cannot befriend target of using declaration}}
   };
 }
 
@@ -363,3 +365,17 @@
   f_pr6954(5); // expected-error{{undeclared identifier 'f_pr6954'}}
 }
 
+namespace tag_redecl {
+  namespace N {
+    struct X *p;
+    namespace {
+      class K {
+        friend struct X;
+      };
+    }
+  }
+  namespace N {
+    struct X;
+    X *q = p;
+  }
+}
diff --git a/test/SemaCXX/function-redecl.cpp b/test/SemaCXX/function-redecl.cpp
index 2bc0d90..f91e670 100644
--- a/test/SemaCXX/function-redecl.cpp
+++ b/test/SemaCXX/function-redecl.cpp
@@ -7,7 +7,7 @@
     void bar(int); // expected-note 2{{previous declaration is here}}
   }
 
-  void foo(int); // expected-note 2{{previous declaration is here}}
+  void foo(int); // expected-note 3{{previous declaration is here}}
 
   void f2() {
     int foo(int); // expected-error {{functions that differ only in their return type cannot be overloaded}}
@@ -25,6 +25,13 @@
       }
     }
   }
+
+  void f3() {
+    int foo(float);
+    {
+      float foo(int); // expected-error {{functions that differ only in their return type cannot be overloaded}}
+    }
+  }
 }
 
 class A {
diff --git a/test/SemaCXX/functional-cast.cpp b/test/SemaCXX/functional-cast.cpp
index 9db95e8..216ee24 100644
--- a/test/SemaCXX/functional-cast.cpp
+++ b/test/SemaCXX/functional-cast.cpp
@@ -126,14 +126,14 @@
   typedef A *Ap;
   (void)Ap((B*)0);
   typedef A &Ar;
-  (void)Ar(*((B*)0));
+  (void)Ar(*((B*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   typedef const B *cBp;
   (void)cBp((C1*)0);
   typedef B &Br;
-  (void)Br(*((C1*)0));
+  (void)Br(*((C1*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)Ap((D*)0);
   typedef const A &cAr;
-  (void)cAr(*((D*)0));
+  (void)cAr(*((D*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   typedef int B::*Bmp;
   (void)Bmp((int A::*)0);
   typedef void (B::*Bmfp)();
diff --git a/test/SemaCXX/generalized-deprecated.cpp b/test/SemaCXX/generalized-deprecated.cpp
index 8fa20d0..43efea1 100644
--- a/test/SemaCXX/generalized-deprecated.cpp
+++ b/test/SemaCXX/generalized-deprecated.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -fms-extensions -Wno-deprecated %s
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -fms-extensions -Wno-deprecated -Wc++14-extensions %s
 
 // NOTE: use -Wno-deprecated to avoid cluttering the output with deprecated
 // warnings
diff --git a/test/SemaCXX/illegal-member-initialization.cpp b/test/SemaCXX/illegal-member-initialization.cpp
index 87069ef..17faed7 100644
--- a/test/SemaCXX/illegal-member-initialization.cpp
+++ b/test/SemaCXX/illegal-member-initialization.cpp
@@ -7,6 +7,7 @@
 };
 
 struct B {
+  int field;
 };
 
 struct X {
diff --git a/test/SemaCXX/inline.cpp b/test/SemaCXX/inline.cpp
index e569300..b20bc18 100644
--- a/test/SemaCXX/inline.cpp
+++ b/test/SemaCXX/inline.cpp
@@ -1,5 +1,18 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s -Wc++98-c++11-c++14-compat
 
 // Check that we don't allow illegal uses of inline
 // (checking C++-only constructs here)
 struct c {inline int a;}; // expected-error{{'inline' can only appear on functions}}
+
+void localVar() {
+  inline int a; // expected-error{{inline declaration of 'a' not allowed in block scope}}
+}
+
+// Check that we warn appropriately.
+#if __cplusplus <= 201402L
+inline int a; // expected-warning{{inline variables are a C++1z extension}}
+#else
+inline int a; // expected-warning{{inline variables are incompatible with C++ standards before C++1z}}
+#endif
diff --git a/test/SemaCXX/integer-overflow.cpp b/test/SemaCXX/integer-overflow.cpp
index 566bb05..a119f0e 100644
--- a/test/SemaCXX/integer-overflow.cpp
+++ b/test/SemaCXX/integer-overflow.cpp
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++98
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++98 -triple x86_64-pc-linux-gnu
 typedef unsigned long long uint64_t;
-typedef unsigned long long uint32_t;
+typedef unsigned int uint32_t;
+
+// Check integer sizes.
+int array64[sizeof(uint64_t) == 8 ? 1 : -1];
+int array32[sizeof(uint32_t) == 4 ? 1 : -1];
+int arrayint[sizeof(int) < sizeof(uint64_t) ? 1 : -1];
 
 uint64_t f0(uint64_t);
 uint64_t f1(uint64_t, uint32_t);
diff --git a/test/SemaCXX/lambda-expressions.cpp b/test/SemaCXX/lambda-expressions.cpp
index 17808ce..5fffe41 100644
--- a/test/SemaCXX/lambda-expressions.cpp
+++ b/test/SemaCXX/lambda-expressions.cpp
@@ -1,5 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -Wno-unused-value -fsyntax-only -verify -fblocks %s
-// RUN: %clang_cc1 -std=c++1y -Wno-unused-value -fsyntax-only -verify -fblocks %s
+// RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -fblocks %s
 
 namespace std { class type_info; };
 
@@ -499,3 +498,30 @@
   };
 }
 }
+
+namespace PR27994 {
+struct A { template <class T> A(T); };
+
+template <class T>
+struct B {
+  int x;
+  A a = [&] { int y = x; };
+  A b = [&] { [&] { [&] { int y = x; }; }; };
+  A d = [&](auto param) { int y = x; };
+  A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; };
+};
+
+B<int> b;
+
+template <class T> struct C {
+  struct D {
+    int x;
+    A f = [&] { int y = x; };
+  };
+};
+
+int func() {
+  C<int> a;
+  decltype(a)::D b;
+}
+}
diff --git a/test/SemaCXX/literal-operators.cpp b/test/SemaCXX/literal-operators.cpp
index ba57178..304aa7c 100644
--- a/test/SemaCXX/literal-operators.cpp
+++ b/test/SemaCXX/literal-operators.cpp
@@ -35,13 +35,14 @@
 void operator "" _good (c*);
 
 // Check extra cv-qualifiers
-void operator "" _cv_good (volatile const char *, const size_t); // expected-error {{parameter declaration for literal operator 'operator""_cv_good' is not valid}}
+void operator "" _cv_good (volatile const char *, const size_t); // expected-error {{invalid literal operator parameter type 'const volatile char *', did you mean 'const char *'?}}
 
 // Template declaration
 template <char...> void operator "" _good ();
 
-// FIXME: Test some invalid decls that might crop up.
-template <typename...> void operator "" _invalid(); // expected-error {{parameter declaration for literal operator 'operator""_invalid' is not valid}}
+template <typename...> void operator "" _invalid(); // expected-error {{template parameter list for literal operator must be either 'char...' or 'typename T, T...'}}
+template <wchar_t...> void operator "" _invalid();  // expected-error {{template parameter list for literal operator must be either 'char...' or 'typename T, T...'}}
+template <unsigned long long...> void operator "" _invalid();  // expected-error {{template parameter list for literal operator must be either 'char...' or 'typename T, T...'}}
 
 _Complex float operator""if(long double); // expected-warning {{reserved}}
 _Complex float test_if_1() { return 2.0f + 1.5if; };
diff --git a/test/SemaCXX/make_integer_seq.cpp b/test/SemaCXX/make_integer_seq.cpp
index 4e15414..a9b8d2b 100644
--- a/test/SemaCXX/make_integer_seq.cpp
+++ b/test/SemaCXX/make_integer_seq.cpp
@@ -47,3 +47,7 @@
 
 template <typename T, T N> void f() {}
 __make_integer_seq<f, int, 0> x; // expected-error{{template template parameter must be a class template or type alias template}}
+
+__make_integer_seq<__make_integer_seq, int, 10> PR28494; // expected-error{{different template parameters}}
+// expected-note@make_integer_seq.cpp:* {{template parameter has a different kind}}
+// expected-note@make_integer_seq.cpp:* {{previous template template parameter is here}}
diff --git a/test/SemaCXX/member-init.cpp b/test/SemaCXX/member-init.cpp
index b3ee30b..65c8873 100644
--- a/test/SemaCXX/member-init.cpp
+++ b/test/SemaCXX/member-init.cpp
@@ -192,3 +192,13 @@
   int x[3] = {[N] = 3};
 };
 }
+
+namespace PR28060 {
+template <class T>
+void foo(T v) {
+  struct s {
+    T *s = 0;
+  };
+}
+template void foo(int);
+}
diff --git a/test/SemaCXX/ms-const-member-expr.cpp b/test/SemaCXX/ms-const-member-expr.cpp
new file mode 100644
index 0000000..72cfe76
--- /dev/null
+++ b/test/SemaCXX/ms-const-member-expr.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 %s -std=c++11 -fms-compatibility -fsyntax-only -verify
+
+struct S {
+  enum { E = 1 };
+  static const int sdm = 1;
+};
+
+void f(S *s) {
+  char array[s->E] = { 0 };
+}
+
+extern S *s;
+constexpr int e1 = s->E;
+
+S *side_effect();  // expected-note{{declared here}}
+constexpr int e2 = // expected-error{{must be initialized by a constant expression}}
+    side_effect()->E; // expected-note{{cannot be used in a constant expression}}
+
+constexpr int e4 = s->sdm;
diff --git a/test/SemaCXX/ms-empty_bases.cpp b/test/SemaCXX/ms-empty_bases.cpp
new file mode 100644
index 0000000..69d9e27
--- /dev/null
+++ b/test/SemaCXX/ms-empty_bases.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -triple i386-pc-win32 %s -fsyntax-only -verify -fms-extensions -Wno-microsoft -std=c++11
+
+struct __declspec(empty_bases) S {};
+enum __declspec(empty_bases) E {}; // expected-warning{{'empty_bases' attribute only applies to classes}}
+int __declspec(empty_bases) I; // expected-warning{{'empty_bases' attribute only applies to classes}}
+typedef struct T __declspec(empty_bases) U; // expected-warning{{'empty_bases' attribute only applies to classes}}
+auto z = []() __declspec(empty_bases) { return nullptr; }; // expected-warning{{'empty_bases' attribute only applies to classes}}
+
+struct __declspec(empty_bases(1)) X {}; // expected-error{{'empty_bases' attribute takes no arguments}}
diff --git a/test/SemaCXX/ms-exception-spec.cpp b/test/SemaCXX/ms-exception-spec.cpp
index 1be8ec2..0763379 100644
--- a/test/SemaCXX/ms-exception-spec.cpp
+++ b/test/SemaCXX/ms-exception-spec.cpp
@@ -1,4 +1,9 @@
-// RUN: %clang_cc1 %s -fsyntax-only -verify -fms-extensions
-// expected-no-diagnostics
+// RUN: %clang_cc1 %s -fsyntax-only -verify -fms-extensions -fexceptions -fcxx-exceptions
 
 void f() throw(...) { }
+
+namespace PR28080 {
+struct S;           // expected-note {{forward declaration}}
+void fn() throw(S); // expected-warning {{incomplete type}} expected-note{{previous declaration}}
+void fn() throw();  // expected-warning {{does not match previous declaration}}
+}
diff --git a/test/SemaCXX/ms-layout_version.cpp b/test/SemaCXX/ms-layout_version.cpp
new file mode 100644
index 0000000..7f83b2d
--- /dev/null
+++ b/test/SemaCXX/ms-layout_version.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple i386-pc-win32 %s -fsyntax-only -verify -fms-extensions -Wno-microsoft -std=c++11
+
+struct __declspec(layout_version(19)) S {};
+enum __declspec(layout_version(19)) E {}; // expected-warning{{'layout_version' attribute only applies to classes}}
+int __declspec(layout_version(19)) I; // expected-warning{{'layout_version' attribute only applies to classes}}
+typedef struct T __declspec(layout_version(19)) U; // expected-warning{{'layout_version' attribute only applies to classes}}
+auto z = []() __declspec(layout_version(19)) { return nullptr; }; // expected-warning{{'layout_version' attribute only applies to classes}}
+
+struct __declspec(layout_version(18)) X {}; // expected-error{{'layout_version' attribute parameter 18 is out of bounds}}
+struct __declspec(layout_version(20)) Y {}; // expected-error{{'layout_version' attribute parameter 20 is out of bounds}}
+struct __declspec(layout_version) Z {}; // expected-error{{attribute takes one argument}}
diff --git a/test/SemaCXX/new-delete.cpp b/test/SemaCXX/new-delete.cpp
index 7bc724b..e96603d 100644
--- a/test/SemaCXX/new-delete.cpp
+++ b/test/SemaCXX/new-delete.cpp
@@ -444,11 +444,11 @@
 
   template<typename X>
   void tfn() {
-    new (*(PlacementArg*)0) T[1];
+    new (*(PlacementArg*)0) T[1]; // expected-warning 2 {{binding dereferenced null pointer to reference has undefined behavior}}
   }
 
   void fn() {
-    tfn<int>();
+    tfn<int>();  // expected-note {{in instantiation of function template specialization 'r150682::tfn<int>' requested here}}
   }
 
 }
diff --git a/test/SemaCXX/no-wchar.cpp b/test/SemaCXX/no-wchar.cpp
index 291b657..b6dcddf 100644
--- a/test/SemaCXX/no-wchar.cpp
+++ b/test/SemaCXX/no-wchar.cpp
@@ -7,3 +7,24 @@
 void bar() {
   foo(L"wide string literal");
 }
+
+void foo1(wchar_t * t = L"");
+// expected-warning@-1 {{conversion from string literal to 'wchar_t *' (aka 'unsigned short *') is deprecated}}
+
+short *a = L"";
+// expected-error@-1 {{cannot initialize a variable of type 'short *' with an lvalue of type 'const unsigned short [1]'}}
+char *b = L"";
+// expected-error@-1 {{cannot initialize a variable of type 'char *' with an lvalue of type 'const unsigned short [1]'}}
+
+// NOTE: MSVC allows the deprecated conversion in a conditional expression if at
+// least one of the operands is a string literal, but Clang doesn't allow it.
+wchar_t *c = true ? L"a" : L"";
+// expected-error@-1 {{cannot initialize a variable of type 'wchar_t *' (aka 'unsigned short *') with}}
+
+const wchar_t *d1 = 0;
+const wchar_t *d2 = 0;
+wchar_t *d = true ? d1 : d2;
+// expected-error@-1 {{cannot initialize a variable of type 'wchar_t *' (aka 'unsigned short *') with}}
+
+wchar_t* e = (const wchar_t*)L"";
+// expected-error@-1 {{cannot initialize a variable of type 'wchar_t *' (aka 'unsigned short *') with an rvalue of type 'const wchar_t *' (aka 'const unsigned short *')}}
diff --git a/test/SemaCXX/nullability.cpp b/test/SemaCXX/nullability.cpp
index c73c01a..2af2573 100644
--- a/test/SemaCXX/nullability.cpp
+++ b/test/SemaCXX/nullability.cpp
@@ -97,3 +97,23 @@
 
   TakeNonnull(ReturnNullable()); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}}
 }
+
+void ConditionalExpr(bool c) {
+  struct Base {};
+  struct Derived : Base {};
+
+  Base * _Nonnull p;
+  Base * _Nonnull nonnullB;
+  Base * _Nullable nullableB;
+  Derived * _Nonnull nonnullD;
+  Derived * _Nullable nullableD;
+
+  p = c ? nonnullB : nonnullD;
+  p = c ? nonnullB : nullableD; // expected-warning{{implicit conversion from nullable pointer 'Base * _Nullable' to non-nullable pointer type 'Base * _Nonnull}}
+  p = c ? nullableB : nonnullD; // expected-warning{{implicit conversion from nullable pointer 'Base * _Nullable' to non-nullable pointer type 'Base * _Nonnull}}
+  p = c ? nullableB : nullableD; // expected-warning{{implicit conversion from nullable pointer 'Base * _Nullable' to non-nullable pointer type 'Base * _Nonnull}}
+  p = c ? nonnullD : nonnullB;
+  p = c ? nonnullD : nullableB; // expected-warning{{implicit conversion from nullable pointer 'Base * _Nullable' to non-nullable pointer type 'Base * _Nonnull}}
+  p = c ? nullableD : nonnullB; // expected-warning{{implicit conversion from nullable pointer 'Base * _Nullable' to non-nullable pointer type 'Base * _Nonnull}}
+  p = c ? nullableD : nullableB; // expected-warning{{implicit conversion from nullable pointer 'Base * _Nullable' to non-nullable pointer type 'Base * _Nonnull}}
+}
diff --git a/test/SemaCXX/overload-call.cpp b/test/SemaCXX/overload-call.cpp
index 3d286a9..7eaf98b 100644
--- a/test/SemaCXX/overload-call.cpp
+++ b/test/SemaCXX/overload-call.cpp
@@ -375,16 +375,24 @@
 }
 
 // PR 6117
-namespace test3 {
-  struct Base {};
+namespace IncompleteConversion {
+  struct Complete {};
   struct Incomplete;
 
-  void foo(Base *); // expected-note 2 {{cannot convert argument of incomplete type}}
-  void foo(Base &); // expected-note 2 {{cannot convert argument of incomplete type}}
-
-  void test(Incomplete *P) {
-    foo(P); // expected-error {{no matching function for call to 'foo'}}
-    foo(*P); // expected-error {{no matching function for call to 'foo'}}
+  void completeFunction(Complete *); // expected-note 2 {{cannot convert argument of incomplete type}}
+  void completeFunction(Complete &); // expected-note 2 {{cannot convert argument of incomplete type}}
+  
+  void testTypeConversion(Incomplete *P) {
+    completeFunction(P); // expected-error {{no matching function for call to 'completeFunction'}}
+    completeFunction(*P); // expected-error {{no matching function for call to 'completeFunction'}}
+  }
+  
+  void incompletePointerFunction(Incomplete *); // expected-note {{candidate function not viable: cannot convert argument of incomplete type 'IncompleteConversion::Incomplete' to 'IncompleteConversion::Incomplete *' for 1st argument; take the address of the argument with &}}
+  void incompleteReferenceFunction(Incomplete &); // expected-note {{candidate function not viable: cannot convert argument of incomplete type 'IncompleteConversion::Incomplete *' to 'IncompleteConversion::Incomplete &' for 1st argument; dereference the argument with *}}
+  
+  void testPointerReferenceConversion(Incomplete &reference, Incomplete *pointer) {
+    incompletePointerFunction(reference); // expected-error {{no matching function for call to 'incompletePointerFunction'}}
+    incompleteReferenceFunction(pointer); // expected-error {{no matching function for call to 'incompleteReferenceFunction'}}
   }
 }
 
diff --git a/test/SemaCXX/overloaded-builtin-operators.cpp b/test/SemaCXX/overloaded-builtin-operators.cpp
index 4c2953b..7a99a89 100644
--- a/test/SemaCXX/overloaded-builtin-operators.cpp
+++ b/test/SemaCXX/overloaded-builtin-operators.cpp
@@ -183,7 +183,7 @@
   // FIXME: lots of candidates here!
   (void)(1.0f * a); // expected-error{{ambiguous}} \
                     // expected-note 4{{candidate}} \
-                    // expected-note {{remaining 117 candidates omitted; pass -fshow-overloads=all to show them}}
+                    // expected-note {{remaining 140 candidates omitted; pass -fshow-overloads=all to show them}}
 }
 
 // pr5432
diff --git a/test/SemaCXX/pr25181-crash-on-invalid.cpp b/test/SemaCXX/pr25181-crash-on-invalid.cpp
new file mode 100644
index 0000000..41178c9
--- /dev/null
+++ b/test/SemaCXX/pr25181-crash-on-invalid.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// Don't crash (PR25181).
+
+template <typename T> class Foo { // expected-note {{template parameter is declared here}}
+  template <typename T> // expected-error {{declaration of 'T' shadows template parameter}}
+  void Foo<T>::method(T *) const throw() {} // expected-error {{nested name specifier 'Foo<T>::' for declaration does not refer into a class, class template or class template partial specialization}}
+};
diff --git a/test/SemaCXX/pr27047-default-init-expr-name-conflict.cpp b/test/SemaCXX/pr27047-default-init-expr-name-conflict.cpp
new file mode 100644
index 0000000..772db99
--- /dev/null
+++ b/test/SemaCXX/pr27047-default-init-expr-name-conflict.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s
+
+template <typename T>
+struct A {
+  // Used to crash when field was named after class.
+  int A = 0;
+};
+A<int> a;
diff --git a/test/SemaCXX/pr28050.cpp b/test/SemaCXX/pr28050.cpp
new file mode 100644
index 0000000..57e90eb
--- /dev/null
+++ b/test/SemaCXX/pr28050.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -std=c++11 %s -fsyntax-only
+//
+// expected-no-diagnostics
+
+class A {
+public:
+  A(char *s) {}
+  A(A &&) = delete;
+};
+
+int main() { A a("OK"); }
diff --git a/test/SemaCXX/pragma-optimize.cpp b/test/SemaCXX/pragma-optimize.cpp
index 48a1546..cda46c5 100644
--- a/test/SemaCXX/pragma-optimize.cpp
+++ b/test/SemaCXX/pragma-optimize.cpp
@@ -151,14 +151,14 @@
 // CHECK-DAG: attributes [[ATTRYETANOTHEROPTNONE]] = { {{.*}}noinline{{.*}}optnone{{.*}} }
 
 // Check that the other functions do NOT have optnone.
-// CHECK-DAG-NOT: attributes [[ATTRFOO]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRBAZ]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRBAX]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRWOMBAT]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRCONTAINER]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRTWICE]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRCONTAINER2]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRCONTAINER3]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRTHRICEINT]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRYETANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRFOO]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRBAZ]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRBAX]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRWOMBAT]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRCONTAINER]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRTWICE]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRCONTAINER2]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRCONTAINER3]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRTHRICEINT]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRYETANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
diff --git a/test/SemaCXX/pragma-vtordisp.cpp b/test/SemaCXX/pragma-vtordisp.cpp
index 649c0ee..1421c33 100644
--- a/test/SemaCXX/pragma-vtordisp.cpp
+++ b/test/SemaCXX/pragma-vtordisp.cpp
@@ -22,7 +22,8 @@
 
 // Test a reset.
 #pragma vtordisp()
-#pragma vtordisp(pop) // expected-warning {{#pragma vtordisp(pop, ...) failed: stack empty}}
+#pragma vtordisp(pop) // stack should NOT be affected by reset.
+                      // Now stack contains '1'.
 
 #pragma vtordisp(      // expected-warning {{unknown action for '#pragma vtordisp' - ignored}}
 #pragma vtordisp(asdf) // expected-warning {{unknown action for '#pragma vtordisp' - ignored}}
@@ -42,6 +43,7 @@
   virtual void f();
 };
 
+#pragma vtordisp(pop) // After this pop, the stack should be empty.
 #pragma vtordisp(pop) // expected-warning {{#pragma vtordisp(pop, ...) failed: stack empty}}
 
 void g() {
diff --git a/test/SemaCXX/qual-id-test.cpp b/test/SemaCXX/qual-id-test.cpp
index 9994d75..61e60ae 100644
--- a/test/SemaCXX/qual-id-test.cpp
+++ b/test/SemaCXX/qual-id-test.cpp
@@ -1,9 +1,15 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 namespace A
 {
     namespace B
     {
-        struct base // expected-note{{object type}}
+        struct base
+#if __cplusplus <= 199711L
+        // expected-note@-2 {{lookup in the object type 'A::sub' refers here}}
+#endif
         {
             void x() {}
             void y() {}
@@ -85,8 +91,14 @@
     void fun4a() {
       A::sub *a;
       
-      typedef A::member base; // expected-note{{current scope}}
-      a->base::x(); // expected-error{{ambiguous}}      
+      typedef A::member base;
+#if __cplusplus <= 199711L
+      // expected-note@-2 {{lookup from the current scope refers here}}
+#endif
+      a->base::x();
+#if __cplusplus <= 199711L
+      // expected-error@-2 {{lookup of 'base' in member access expression is ambiguous}}
+#endif
     }
 
     void fun4b() {
diff --git a/test/SemaCXX/return-stack-addr-2.cpp b/test/SemaCXX/return-stack-addr-2.cpp
new file mode 100644
index 0000000..47b4595
--- /dev/null
+++ b/test/SemaCXX/return-stack-addr-2.cpp
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
+namespace PR26599 {
+template <typename>
+struct S;
+
+struct I {};
+
+template <typename T>
+void *&non_pointer() {
+  void *&r = S<T>()[I{}];
+  return r;
+}
+
+template <typename T>
+void *&pointer() {
+  void *&r = S<T>()[nullptr];
+  return r;
+}
+}
+
+namespace LocalTemporary {
+
+template <class T>
+class QMap {
+public:
+  T value(const T &t = T()) const {
+    return t;
+  }
+};
+
+struct A {};
+
+void test() {
+  QMap<A *> map;
+  map.value();
+}
+
+typedef int* ptr;
+ptr int1(const ptr &p = ptr()) {
+  return (p);
+}
+
+ptr int2(const ptr &p = nullptr) {
+  return p;
+}
+
+ptr int3() {
+  const ptr &p = ptr();
+  return p;
+}
+
+const int *int4(const int &x = 5) {
+  return &x;
+}
+
+const int *int5(const int &x) {
+  return &x;
+}
+
+const int *int6() {
+  const int &x = 11;  //expected-note{{binding reference variable 'x' here}}
+  return &x;  //expected-warning{{returning address of local temporary object}}
+}
+
+const int *int7(int x) {
+  const int &x2 = x;  // expected-note{{binding reference variable 'x2' here}}
+  return &x2;  //  expected-warning{{address of stack memory associated with local variable 'x' returned}}
+}
+
+const int *int8(const int &x = 5) {
+  const int &x2 = x;
+  return &x2;
+}
+
+const int *int9() {
+  const int &x = 5;  // expected-note{{binding reference variable 'x' here}}
+  const int &x2 = x;  // expected-note{{binding reference variable 'x2' here}}
+  return &x2;  // expected-warning{{returning address of local temporary object}}
+}
+}
diff --git a/test/SemaCXX/return.cpp b/test/SemaCXX/return.cpp
index 8c16645..db28924 100644
--- a/test/SemaCXX/return.cpp
+++ b/test/SemaCXX/return.cpp
@@ -118,5 +118,5 @@
   // CXXUnresolvedConstructExpr, and the missing ')' gives it an invalid source
   // location for its rparen.  Check that emitting a diag on the range of the
   // expr doesn't assert.
-  return int(undeclared, 4; // expected-error {{expected ')'}} expected-note{{to match this '('}} expected-error {{void function 'cxx_unresolved_expr' should not return a value}} expected-error {{use of undeclared identifier 'undeclared'}}
+  return int(undeclared, 4; // expected-error {{expected ')'}} expected-note{{to match this '('}} expected-error {{use of undeclared identifier 'undeclared'}}
 }
diff --git a/test/SemaCXX/static-cast.cpp b/test/SemaCXX/static-cast.cpp
index b3fe49a..ff47c0b 100644
--- a/test/SemaCXX/static-cast.cpp
+++ b/test/SemaCXX/static-cast.cpp
@@ -43,11 +43,11 @@
   (void)static_cast<void*>((int*)0);
   (void)static_cast<volatile const void*>((const int*)0);
   (void)static_cast<A*>((B*)0);
-  (void)static_cast<A&>(*((B*)0));
+  (void)static_cast<A&>(*((B*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)static_cast<const B*>((C1*)0);
-  (void)static_cast<B&>(*((C1*)0));
+  (void)static_cast<B&>(*((C1*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)static_cast<A*>((D*)0);
-  (void)static_cast<const A&>(*((D*)0));
+  (void)static_cast<const A&>(*((D*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)static_cast<int B::*>((int A::*)0);
   (void)static_cast<void (B::*)()>((void (A::*)())0);
 
diff --git a/test/SemaCXX/switch-implicit-fallthrough-macro.cpp b/test/SemaCXX/switch-implicit-fallthrough-macro.cpp
index add212f..11df2cb 100644
--- a/test/SemaCXX/switch-implicit-fallthrough-macro.cpp
+++ b/test/SemaCXX/switch-implicit-fallthrough-macro.cpp
@@ -1,4 +1,8 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -DCLANG_PREFIX -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] -DUNCHOSEN=[[fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -Wimplicit-fallthrough -DCLANG_PREFIX -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[fallthrough]] -DUNCHOSEN=[[clang::fallthrough]] %s
 
 int fallthrough_compatibility_macro_from_command_line(int n) {
   switch (n) {
@@ -10,15 +14,12 @@
   return n;
 }
 
-#ifdef __clang__
-#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#ifdef CLANG_PREFIX
 #define COMPATIBILITY_FALLTHROUGH   [ [ /* test */  clang /* test */ \
     ::  fallthrough  ]  ]    // testing whitespace and comments in macro definition
-#endif
-#endif
-
-#ifndef COMPATIBILITY_FALLTHROUGH
-#define COMPATIBILITY_FALLTHROUGH do { } while (0)
+#else
+#define COMPATIBILITY_FALLTHROUGH   [ [ /* test */  /* test */ \
+    fallthrough  ]  ]    // testing whitespace and comments in macro definition
 #endif
 
 int fallthrough_compatibility_macro_from_source(int n) {
@@ -32,7 +33,11 @@
 }
 
 // Deeper macro substitution
+#ifdef CLANG_PREFIX
 #define M1 [[clang::fallthrough]]
+#else
+#define M1 [[fallthrough]]
+#endif
 #ifdef __clang__
 #define M2 M1
 #else
@@ -59,12 +64,17 @@
 #undef M2
 #undef COMPATIBILITY_FALLTHROUGH
 #undef COMMAND_LINE_FALLTHROUGH
+#undef UNCHOSEN
 
 int fallthrough_compatibility_macro_undefined(int n) {
   switch (n) {
     case 0:
       n = n * 20;
+#if __cplusplus <= 201402L
     case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[clang::fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#else
+    case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#endif
       ;
   }
 #define TOO_LATE [[clang::fallthrough]]
@@ -83,7 +93,11 @@
     case 0:
       n = n * 20;
 #undef MACRO_WITH_HISTORY
+#if __cplusplus <= 201402L
     case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[clang::fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#else
+    case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#endif
       ;
 #define MACRO_WITH_HISTORY [[clang::fallthrough]]
   }
diff --git a/test/SemaCXX/switch-implicit-fallthrough-off-by-default.cpp b/test/SemaCXX/switch-implicit-fallthrough-off-by-default.cpp
new file mode 100644
index 0000000..6ab6370
--- /dev/null
+++ b/test/SemaCXX/switch-implicit-fallthrough-off-by-default.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -DUNREACHABLE=1 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -DUNREACHABLE=0 -Wimplicit-fallthrough %s
+
+void fallthrough(int n) {
+  switch (n) {
+  case 1:
+    if (UNREACHABLE)
+      return;
+    [[fallthrough]]; // expected-no-diagnostics, only checked when UNREACHABLE=0
+  case 2:
+    break;
+  }
+}
diff --git a/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp b/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp
index 009c818..6880bdd 100644
--- a/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp
+++ b/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp
@@ -41,9 +41,8 @@
 void unscoped(int n) {
   switch (n % 2) {
     case 0:
-      // FIXME: This should be typo-corrected, probably.
-      [[fallthrough]]; // expected-warning{{unknown attribute 'fallthrough' ignored}}
-    case 2: // expected-warning{{unannotated fall-through}} expected-note{{clang::fallthrough}} expected-note{{break;}}
+      [[fallthrough]];
+    case 2:
       [[clang::fallthrough]];
     case 1:
       break;
diff --git a/test/SemaCXX/switch-implicit-fallthrough.cpp b/test/SemaCXX/switch-implicit-fallthrough.cpp
index 0bc43cd..9540b1f 100644
--- a/test/SemaCXX/switch-implicit-fallthrough.cpp
+++ b/test/SemaCXX/switch-implicit-fallthrough.cpp
@@ -179,18 +179,15 @@
 
 int fallthrough_position(int n) {
   switch (n) {
-      [[clang::fallthrough]];  // expected-warning{{fallthrough annotation does not directly precede switch label}}
       n += 300;
       [[clang::fallthrough]];  // expected-warning{{fallthrough annotation in unreachable code}}
     case 221:
-      [[clang::fallthrough]];  // expected-warning{{fallthrough annotation does not directly precede switch label}}
       return 1;
       [[clang::fallthrough]];  // expected-warning{{fallthrough annotation in unreachable code}}
     case 222:
-      [[clang::fallthrough]];  // expected-warning{{fallthrough annotation does not directly precede switch label}}
       n += 400;
     case 223:          // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[clang::fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
-      [[clang::fallthrough]]; // expected-warning{{fallthrough annotation does not directly precede switch label}}
+      ;
   }
 
   long p = static_cast<long>(n) * n;
@@ -282,6 +279,23 @@
   }
 }
 
+int fallthrough_placement_error(int n) {
+  switch (n) {
+      [[clang::fallthrough]]; // expected-warning{{fallthrough annotation in unreachable code}}
+      n += 300;
+    case 221:
+      [[clang::fallthrough]]; // expected-error{{fallthrough annotation does not directly precede switch label}}
+      return 1;
+    case 222:
+      [[clang::fallthrough]]; // expected-error{{fallthrough annotation does not directly precede switch label}}
+      n += 400;
+      [[clang::fallthrough]];
+    case 223:
+      [[clang::fallthrough]]; // expected-error{{fallthrough annotation does not directly precede switch label}}
+  }
+  return n;
+}
+
 int fallthrough_targets(int n) {
   [[clang::fallthrough]]; // expected-error{{fallthrough annotation is outside switch statement}}
 
diff --git a/test/SemaCXX/type-convert-construct.cpp b/test/SemaCXX/type-convert-construct.cpp
index 2dec50a..7ae8363 100644
--- a/test/SemaCXX/type-convert-construct.cpp
+++ b/test/SemaCXX/type-convert-construct.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++98 %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 %s 
 
 void f() {
   float v1 = float(1);
@@ -12,8 +14,21 @@
   typedef int T;
   int *p;
   bool v6 = T(0) == p;
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{comparison between pointer and integer ('T' (aka 'int') and 'int *')}}
+#endif
   char *str;
-  str = "a string"; // expected-warning{{conversion from string literal to 'char *' is deprecated}}
+  str = "a string";
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{conversion from string literal to 'char *' is deprecated}}
+#else
+  // expected-warning@-4 {{ISO C++11 does not allow conversion from string literal to 'char *'}}
+#endif
   wchar_t *wstr;
-  wstr = L"a wide string"; // expected-warning{{conversion from string literal to 'wchar_t *' is deprecated}}
+  wstr = L"a wide string";
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{conversion from string literal to 'wchar_t *' is deprecated}}
+#else
+  // expected-warning@-4 {{ISO C++11 does not allow conversion from string literal to 'wchar_t *'}}
+#endif
 }
diff --git a/test/SemaCXX/type-traits.cpp b/test/SemaCXX/type-traits.cpp
index 69760fd..c53b027 100644
--- a/test/SemaCXX/type-traits.cpp
+++ b/test/SemaCXX/type-traits.cpp
@@ -1514,6 +1514,9 @@
 
   { int arr[T(__is_nothrow_assignable(HasNoThrowMoveAssign, HasNoThrowMoveAssign))]; }
   { int arr[F(__is_nothrow_assignable(HasThrowMoveAssign, HasThrowMoveAssign))]; }
+
+  { int arr[T(__is_assignable(HasNoThrowMoveAssign, HasNoThrowMoveAssign))]; }
+  { int arr[T(__is_assignable(HasThrowMoveAssign, HasThrowMoveAssign))]; }
 }
 
 void has_trivial_move_assign() {
@@ -1974,6 +1977,46 @@
                                          TrivialMoveButNotCopy)))]; }
   { int arr[T((__is_trivially_assignable(TrivialMoveButNotCopy&,
                                          TrivialMoveButNotCopy&&)))]; }
+  { int arr[T((__is_trivially_assignable(int&, int)))]; }
+  { int arr[T((__is_trivially_assignable(int&, int&)))]; }
+  { int arr[T((__is_trivially_assignable(int&, int&&)))]; }
+  { int arr[T((__is_trivially_assignable(int&, const int&)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, POD)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, POD&)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, POD&&)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, const POD&)))]; }
+  { int arr[T((__is_trivially_assignable(int*&, int*)))]; }
+  { int arr[T((__is_trivially_assignable(AllDefaulted,
+                                         const AllDefaulted &)))]; }
+  { int arr[T((__is_trivially_assignable(AllDefaulted,
+                                         AllDefaulted &&)))]; }
+
+  { int arr[F((__is_assignable(int *&, float *)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, HasCopyAssign)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, HasCopyAssign &)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, const HasCopyAssign &)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, HasCopyAssign &&)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               TrivialMoveButNotCopy &)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               const TrivialMoveButNotCopy &)))]; }
+  { int arr[F((__is_assignable(AllDeleted,
+                               const AllDeleted &)))]; }
+  { int arr[F((__is_assignable(AllDeleted,
+                               AllDeleted &&)))]; }
+  { int arr[T((__is_assignable(ExtDefaulted,
+                               const ExtDefaulted &)))]; }
+  { int arr[T((__is_assignable(ExtDefaulted,
+                               ExtDefaulted &&)))]; }
+
+  { int arr[T((__is_assignable(HasDefaultTrivialCopyAssign &,
+                               HasDefaultTrivialCopyAssign &)))]; }
+  { int arr[T((__is_assignable(HasDefaultTrivialCopyAssign &,
+                               const HasDefaultTrivialCopyAssign &)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               TrivialMoveButNotCopy)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               TrivialMoveButNotCopy &&)))]; }
 }
 
 void constructible_checks() {
diff --git a/test/SemaCXX/type_pack_element.cpp b/test/SemaCXX/type_pack_element.cpp
new file mode 100644
index 0000000..d22d5fa
--- /dev/null
+++ b/test/SemaCXX/type_pack_element.cpp
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
+static_assert(__has_builtin(__type_pack_element), "");
+
+using SizeT = decltype(sizeof(int));
+
+template <SizeT i, typename ...T>
+using TypePackElement = __type_pack_element<i, T...>;
+
+template <int i>
+struct X;
+
+static_assert(__is_same(TypePackElement<0, X<0>>, X<0>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>>, X<1>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>>, X<2>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>, X<3>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>, X<3>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>, X<3>>, X<2>), "");
+static_assert(__is_same(TypePackElement<3, X<0>, X<1>, X<2>, X<3>>, X<3>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>, X<3>, X<4>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>, X<3>, X<4>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>, X<3>, X<4>>, X<2>), "");
+static_assert(__is_same(TypePackElement<3, X<0>, X<1>, X<2>, X<3>, X<4>>, X<3>), "");
+static_assert(__is_same(TypePackElement<4, X<0>, X<1>, X<2>, X<3>, X<4>>, X<4>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<2>), "");
+static_assert(__is_same(TypePackElement<3, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<3>), "");
+static_assert(__is_same(TypePackElement<4, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<4>), "");
+static_assert(__is_same(TypePackElement<5, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<5>), "");
+
+// Test __type_pack_element with more than 2 top-level template arguments.
+static_assert(__is_same(__type_pack_element<5, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<5>), "");
+
+template <SizeT Index, typename ...T>
+using ErrorTypePackElement1 = __type_pack_element<Index, T...>; // expected-error{{may not be accessed at an out of bounds index}}
+using illformed1 = ErrorTypePackElement1<3, X<0>, X<1>>;  // expected-note{{in instantiation}}
diff --git a/test/SemaCXX/typo-correction-crash.cpp b/test/SemaCXX/typo-correction-crash.cpp
index f01facd..0b8383d 100644
--- a/test/SemaCXX/typo-correction-crash.cpp
+++ b/test/SemaCXX/typo-correction-crash.cpp
@@ -9,3 +9,13 @@
   return "s";
   return tes; // expected-error {{use of undeclared identifier 'tes'; did you mean 'test'?}}
 }
+
+namespace BarNamespace {
+namespace NestedNamespace { // expected-note {{'BarNamespace::NestedNamespace' declared here}}
+typedef int type;
+}
+}
+struct FooRecord { };
+FooRecord::NestedNamespace::type x; // expected-error {{no member named 'NestedNamespace' in 'FooRecord'; did you mean 'BarNamespace::NestedNamespace'?}}
+
+void cast_expr(int g) { +int(n)(g); } // expected-error {{undeclared identifier 'n'}}
diff --git a/test/SemaCXX/typo-correction.cpp b/test/SemaCXX/typo-correction.cpp
index 07c1634..48597de 100644
--- a/test/SemaCXX/typo-correction.cpp
+++ b/test/SemaCXX/typo-correction.cpp
@@ -663,3 +663,19 @@
 
 using C::D::Foofoo;  // expected-error {{no member named 'Foofoo' in namespace 'PR24781_using_crash::C::D'; did you mean 'A::B::Foofoo'?}}
 }
+
+int d = ? L : d; // expected-error {{expected expression}} expected-error {{undeclared identifier}}
+
+struct B0 {
+  int : 0 |         // expected-error {{invalid operands to binary expression}}
+      (struct B0)e; // expected-error {{use of undeclared identifier}}
+};
+
+namespace {
+struct a0is0 {};
+struct b0is0 {};
+int g() {
+  0 [                 // expected-error {{subscripted value is not an array}}
+      sizeof(c0is0)]; // expected-error {{use of undeclared identifier}}
+};
+}
diff --git a/test/SemaCXX/unaddressable-functions.cpp b/test/SemaCXX/unaddressable-functions.cpp
index a382ccf..286cbee 100644
--- a/test/SemaCXX/unaddressable-functions.cpp
+++ b/test/SemaCXX/unaddressable-functions.cpp
@@ -34,3 +34,114 @@
 
 void *Ptr = reinterpret_cast<void*>(foo); // expected-error{{'foo' is unavailable: don't call this}} expected-note@-3{{explicitly marked unavailable here}}
 }
+
+namespace template_deduction {
+void foo() __attribute__((enable_if(false, "")));
+
+void bar() __attribute__((enable_if(true, "")));
+void bar() __attribute__((enable_if(false, "")));
+
+void baz(int a) __attribute__((enable_if(true, "")));
+void baz(int a) __attribute__((enable_if(a, "")));
+void baz(int a) __attribute__((enable_if(false, "")));
+
+void qux(int a) __attribute__((enable_if(1, "")));
+void qux(int a) __attribute__((enable_if(true, "")));
+void qux(int a) __attribute__((enable_if(a, "")));
+void qux(int a) __attribute__((enable_if(false, "")));
+
+template <typename Fn, typename... Args> void call(Fn F, Args... As) {
+  F(As...);
+}
+
+void test() {
+  call(foo); // expected-error{{cannot take address of function 'foo'}}
+  call(bar); // OK: must be accepted silently.
+  call(baz, 0); // OK: must be accepted silently.
+  call(qux, 0); // expected-error{{no matching function for call to 'call'}} expected-note@-8{{candidate template ignored: couldn't infer template argument 'Fn'}}
+
+  auto Ptr1 = foo; // expected-error{{cannot take address of function 'foo'}}
+  auto Ptr2 = bar;
+  auto Ptr3 = baz;
+  auto Ptr4 = qux; // expected-error{{variable 'Ptr4' with type 'auto' has incompatible initializer of type '<overloaded function type>'}}
+}
+
+template <typename Fn, typename T, typename... Args>
+void callMem(Fn F, T t, Args... As) {
+  (t.*F)(As...);
+}
+
+class Foo {
+  void bar() __attribute__((enable_if(true, "")));
+  void bar() __attribute__((enable_if(false, "")));
+
+  static void staticBar() __attribute__((enable_if(true, "")));
+  static void staticBar() __attribute__((enable_if(false, "")));
+};
+
+void testAccess() {
+  callMem(&Foo::bar, Foo()); // expected-error{{'bar' is a private member of 'template_deduction::Foo'}} expected-note@-8{{implicitly declared private here}}
+  call(&Foo::staticBar); // expected-error{{'staticBar' is a private member of 'template_deduction::Foo'}} expected-note@-6{{implicitly declared private here}}
+}
+}
+
+namespace template_template_deduction {
+void foo() __attribute__((enable_if(false, "")));
+template <typename T>
+T foo() __attribute__((enable_if(true, "")));
+
+template <typename Fn, typename... Args> auto call(Fn F, Args... As) {
+  return F(As...);
+}
+
+auto Ok = call(&foo<int>);
+auto Fail = call(&foo); // expected-error{{no matching function for call to 'call'}} expected-note@-5{{candidate template ignored: couldn't infer template argument 'Fn'}}
+
+auto PtrOk = &foo<int>;
+auto PtrFail = &foo; // expected-error{{variable 'PtrFail' with type 'auto' has incompatible initializer of type '<overloaded function type>'}}
+}
+
+namespace pointer_equality {
+  using FnTy = void (*)();
+
+  void bothEnableIf() __attribute__((enable_if(false, "")));
+  void bothEnableIf() __attribute__((enable_if(true, "")));
+
+  void oneEnableIf() __attribute__((enable_if(false, "")));
+  void oneEnableIf();
+
+  void test() {
+    FnTy Fn;
+    (void)(Fn == bothEnableIf);
+    (void)(Fn == &bothEnableIf);
+    (void)(Fn == oneEnableIf);
+    (void)(Fn == &oneEnableIf);
+  }
+
+  void unavailableEnableIf() __attribute__((enable_if(false, "")));
+  void unavailableEnableIf() __attribute__((unavailable("noooo"))); // expected-note 2{{marked unavailable here}}
+
+  void testUnavailable() {
+    FnTy Fn;
+    (void)(Fn == unavailableEnableIf); // expected-error{{is unavailable}}
+    (void)(Fn == &unavailableEnableIf); // expected-error{{is unavailable}}
+  }
+
+  class Foo {
+    static void staticAccessEnableIf(); // expected-note 2{{declared private here}}
+    void accessEnableIf(); // expected-note{{declared private here}}
+
+  public:
+    static void staticAccessEnableIf() __attribute__((enable_if(false, "")));
+    void accessEnableIf() __attribute__((enable_if(false, "")));
+  };
+
+  void testAccess() {
+    FnTy Fn;
+    (void)(Fn == Foo::staticAccessEnableIf); // expected-error{{is a private member}}
+    (void)(Fn == &Foo::staticAccessEnableIf); // expected-error{{is a private member}}
+
+    void (Foo::*MemFn)();
+    (void)(MemFn == &Foo::accessEnableIf); // expected-error{{is a private member}}
+  }
+}
diff --git a/test/SemaCXX/undefined-internal.cpp b/test/SemaCXX/undefined-internal.cpp
index 29ca5de..59e6fdf 100644
--- a/test/SemaCXX/undefined-internal.cpp
+++ b/test/SemaCXX/undefined-internal.cpp
@@ -82,6 +82,7 @@
     static int var; // expected-warning {{variable 'test5::B<test5::(anonymous namespace)::A>::var' has internal linkage but is not defined}}
     static void foo(); // expected-warning {{function 'test5::B<test5::(anonymous namespace)::A>::foo' has internal linkage but is not defined}}
   };
+  extern template int B<A>::var;
 
   void test() {
     B<A>::var = 0; // expected-note {{used here}}
diff --git a/test/SemaCXX/underlying_type.cpp b/test/SemaCXX/underlying_type.cpp
index 61208c7..dd019ae 100644
--- a/test/SemaCXX/underlying_type.cpp
+++ b/test/SemaCXX/underlying_type.cpp
@@ -55,3 +55,10 @@
     // expected-error@-2 {{constant expression}}
   };
 }
+
+template<typename T> void f(__underlying_type(T));
+template<typename T> void f(__underlying_type(T));
+enum E {};
+void PR26014() { f<E>(0); } // should not yield an ambiguity error.
+
+template<typename ...T> void f(__underlying_type(T) v); // expected-error {{declaration type contains unexpanded parameter pack 'T'}}
diff --git a/test/SemaCXX/unused.cpp b/test/SemaCXX/unused.cpp
index fbaf8c8..09a179e 100644
--- a/test/SemaCXX/unused.cpp
+++ b/test/SemaCXX/unused.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // PR4103 : Make sure we don't get a bogus unused expression warning
 namespace PR4103 {
@@ -28,8 +30,14 @@
 
 namespace derefvolatile {
   void f(volatile char* x) {
-    *x; // expected-warning {{expression result unused; assign into a variable to force a volatile load}}
-    (void)*x; // expected-warning {{expression result unused; assign into a variable to force a volatile load}}
+    *x;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{expression result unused; assign into a variable to force a volatile load}}
+#endif
+    (void)*x;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{expression result unused; assign into a variable to force a volatile load}}
+#endif
     volatile char y = 10;
     (void)y; // don't warn here, because it's a common pattern.
   }
diff --git a/test/SemaCXX/using-decl-1.cpp b/test/SemaCXX/using-decl-1.cpp
index e17612d..93f38f2 100644
--- a/test/SemaCXX/using-decl-1.cpp
+++ b/test/SemaCXX/using-decl-1.cpp
@@ -243,6 +243,41 @@
   struct F : E {
     using E::EE; // expected-error-re {{no member named 'EE' in 'PR19171::E'{{$}}}}
   };
+
+  struct TypoDuplicate { // expected-note 0-4{{here}}
+    TypoDuplicate(int);
+    void foobar(); // expected-note 2{{here}}
+  };
+  struct TypoDuplicateDerived1 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    using TypoDuplicate::TypoFuplicate; // expected-error {{did you mean 'TypoDuplicate'}} expected-note {{previous}}
+    using TypoDuplicate::TypoDuplicate; // expected-error {{redeclaration}}
+#endif
+    using TypoDuplicate::goobar; // expected-error {{did you mean 'foobar'}} expected-note {{previous}}
+    using TypoDuplicate::foobar; // expected-error {{redeclaration}}
+  };
+  struct TypoDuplicateDerived2 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    using TypoFuplicate::TypoDuplicate; // expected-error {{did you mean 'TypoDuplicate'}} expected-note {{previous}}
+    using TypoDuplicate::TypoDuplicate; // expected-error {{redeclaration}}
+#endif
+  };
+  struct TypoDuplicateDerived3 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    // FIXME: Don't suggest a correction that would lead to a redeclaration
+    // error here... or at least diagnose the error.
+    using TypoDuplicate::TypoDuplicate;
+    using TypoDuplicate::TypoFuplicate; // expected-error {{did you mean 'TypoDuplicate'}}
+#endif
+    using TypoDuplicate::foobar;
+    using TypoDuplicate::goobar; // expected-error {{did you mean 'foobar'}}
+  };
+  struct TypoDuplicateDerived4 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    using TypoDuplicate::TypoDuplicate; // expected-note {{previous}}
+    using TypoFuplicate::TypoDuplicate; // expected-error {{did you mean 'TypoDuplicate'}} expected-error {{redeclaration}}
+#endif
+  };
 }
 
 namespace TypoCorrectTemplateMember {
diff --git a/test/SemaCXX/vararg-non-pod.cpp b/test/SemaCXX/vararg-non-pod.cpp
index 39d4ccc..1b7f3b6 100644
--- a/test/SemaCXX/vararg-non-pod.cpp
+++ b/test/SemaCXX/vararg-non-pod.cpp
@@ -1,7 +1,11 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -fblocks %s -Wno-error=non-pod-varargs
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++98 %s -Wno-error=non-pod-varargs
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++11 %s -Wno-error=non-pod-varargs
 
 // Check that the warning is still there under -fms-compatibility.
 // RUN: %clang_cc1 -fsyntax-only -verify -fblocks %s -Wno-error=non-pod-varargs -fms-compatibility
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++98 %s -Wno-error=non-pod-varargs -fms-compatibility
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++11 %s -Wno-error=non-pod-varargs -fms-compatibility
 
 extern char version[];
 
@@ -18,11 +22,19 @@
 {
   C c(10);
   
-  g(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  g(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   g(10, version);
 
   void (*ptr)(int, ...) = g;
-  ptr(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  ptr(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   ptr(10, version);
 }
 
@@ -30,18 +42,34 @@
 {
   C c(10);
 
-  c.g(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+  c.g(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
   c.g(10, version);
 
   void (C::*ptr)(int, ...) = &C::g;
-  (c.*ptr)(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+  (c.*ptr)(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
   (c.*ptr)(10, version);
  
-  C::h(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  C::h(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   C::h(10, version);
 
   void (*static_ptr)(int, ...) = &C::h; 
-  static_ptr(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  static_ptr(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   static_ptr(10, version);
 }
 
@@ -51,7 +79,11 @@
 {
   C c(10);
   
-  block(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+  block(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+#endif
+
   block(10, version);
 }
 
@@ -66,7 +98,11 @@
 
   D d;
   
-  d(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+  d(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
   d(10, version);
 }
 
@@ -78,10 +114,16 @@
 {
   C c(10);
   
-  E e(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}} \
-    // expected-error{{calling a private constructor of class 'E'}}
-  (void)E(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}} \
-    // expected-error{{calling a private constructor of class 'E'}}
+  E e(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}}
+#endif
+  // expected-error@-4 {{calling a private constructor of class 'E'}}
+  (void)E(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}}
+#endif
+  // expected-error@-4 {{calling a private constructor of class 'E'}}
 
 }
 
@@ -103,7 +145,13 @@
 int eat_base(...);
 
 void test_typeid(Base &base) {
-  (void)typeid(get_base(base)); // expected-warning{{cannot pass object of non-POD type 'Base' through variadic function; call will abort at runtime}} expected-warning{{expression with side effects will be evaluated despite being used as an operand to 'typeid'}}
+  (void)typeid(get_base(base));
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'Base' through variadic function; call will abort at runtime}}
+#else
+  // expected-warning@-4 {{cannot pass object of non-trivial type 'Base' through variadic function; call will abort at runtime}}
+#endif
+  // expected-warning@-6 {{expression with side effects will be evaluated despite being used as an operand to 'typeid'}}
   (void)typeid(eat_base(base)); // okay
 }
 
@@ -136,7 +184,10 @@
 
 int t9(int n) {
   // Make sure the error works in potentially-evaluated sizeof
-  return (int)sizeof(*(Helper(Foo()), (int (*)[n])0)); // expected-warning{{cannot pass object of non-POD type}}
+  return (int)sizeof(*(Helper(Foo()), (int (*)[n])0));
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'Foo' through variadic function; call will abort at runtime}}
+#endif
 }
 
 // PR14057
@@ -173,22 +224,43 @@
   void test() {
     C c(10);
 
-    (get_f_ptr())(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+    (get_f_ptr())(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
     (get_f_ptr())(10, version);
 
-    (c.*get_m_ptr())(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+    (c.*get_m_ptr())(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
     (c.*get_m_ptr())(10, version);
 
-    (get_b_ptr())(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+    (get_b_ptr())(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+#endif
+
     (get_b_ptr())(10, version);
 
-    (arr_f_ptr[3])(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+    (arr_f_ptr[3])(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
     (arr_f_ptr[3])(10, version);
 
-    (c.*arr_m_ptr[3])(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+    (c.*arr_m_ptr[3])(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
     (c.*arr_m_ptr[3])(10, version);
 
-    (arr_b_ptr[3])(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+    (arr_b_ptr[3])(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+#endif
     (arr_b_ptr[3])(10, version);
   }
 }
diff --git a/test/SemaCXX/virtual-override.cpp b/test/SemaCXX/virtual-override.cpp
index 4249117..ec884f3 100644
--- a/test/SemaCXX/virtual-override.cpp
+++ b/test/SemaCXX/virtual-override.cpp
@@ -289,15 +289,3 @@
     static void foo() {} // expected-error{{'static' member function 'foo' overrides a virtual function}}
   };
 }
-
-namespace PR26297 {
-struct Incomplete;
-
-struct Base {
-  virtual const Incomplete *meow() = 0;
-};
-
-struct Derived : Base {
-  virtual Incomplete *meow() override { return nullptr; }
-};
-}
diff --git a/test/SemaCXX/vla-consruct.cpp b/test/SemaCXX/vla-consruct.cpp
new file mode 100644
index 0000000..09b7370
--- /dev/null
+++ b/test/SemaCXX/vla-consruct.cpp
@@ -0,0 +1,48 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -O0 -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -pedantic-errors -DPE -O0 -verify %s
+
+# ifndef PE
+// expected-no-diagnostics
+# endif
+
+extern "C" int printf(const char*, ...);
+
+static int N;
+struct S {
+  S() __attribute__ ((nothrow))  { printf("%d: S()\n", ++N); }
+  ~S()  __attribute__ ((nothrow))  { printf("%d: ~S()\n", N--); }
+  int n[17];
+};
+
+void print(int n, int a, int b, int c, int d) {
+  printf("n=%d\n,sizeof(S)=%d\nsizeof(array_t[0][0])=%d\nsizeof(array_t[0])=%d\nsizeof(array_t)=%d\n",
+         n, a, b, c, d);
+  if (n == 2) throw(n);
+}
+
+void test(int n) {
+  S array_t[n][n+1];
+# ifdef PE
+   // expected-error@-2 {{variable length arrays are a C99 feature}}
+   // expected-error@-3 {{variable length arrays are a C99 feature}}
+# endif
+  int sizeof_S = sizeof(S);
+  int sizeof_array_t_0_0 = sizeof(array_t[0][0]);
+  int sizeof_array_t_0 = sizeof(array_t[0]);
+  int sizeof_array_t = sizeof(array_t);
+  print(n, sizeof_S, sizeof_array_t_0_0, sizeof_array_t_0, sizeof_array_t);
+}
+
+int main()
+{
+  try {
+    test(2);
+  } catch(int e) {
+    printf("expeption %d\n", e);
+  }
+  try {
+    test(3);
+  } catch(int e) {
+    printf("expeption %d", e);
+  }
+}
diff --git a/test/SemaCXX/warn-comma-operator.cpp b/test/SemaCXX/warn-comma-operator.cpp
new file mode 100644
index 0000000..3192f68
--- /dev/null
+++ b/test/SemaCXX/warn-comma-operator.cpp
@@ -0,0 +1,278 @@
+// RUN: %clang_cc1 -fsyntax-only -Wcomma -std=c++11 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wcomma -std=c++11 -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+
+// Test builtin operators
+void test1() {
+  int x = 0, y = 0;
+  for (; y < 10; x++, y++) {}
+  for (; y < 10; ++x, y++) {}
+  for (; y < 10; x++, ++y) {}
+  for (; y < 10; ++x, ++y) {}
+  for (; y < 10; x--, ++y) {}
+  for (; y < 10; --x, ++y) {}
+  for (; y < 10; x = 5, ++y) {}
+  for (; y < 10; x *= 5, ++y) {}
+  for (; y < 10; x /= 5, ++y) {}
+  for (; y < 10; x %= 5, ++y) {}
+  for (; y < 10; x += 5, ++y) {}
+  for (; y < 10; x -= 5, ++y) {}
+  for (; y < 10; x <<= 5, ++y) {}
+  for (; y < 10; x >>= 5, ++y) {}
+  for (; y < 10; x &= 5, ++y) {}
+  for (; y < 10; x |= 5, ++y) {}
+  for (; y < 10; x ^= 5, ++y) {}
+}
+
+class S2 {
+public:
+  void advance();
+
+  S2 operator++();
+  S2 operator++(int);
+  S2 operator--();
+  S2 operator--(int);
+  S2 operator=(int);
+  S2 operator*=(int);
+  S2 operator/=(int);
+  S2 operator%=(int);
+  S2 operator+=(int);
+  S2 operator-=(int);
+  S2 operator<<=(int);
+  S2 operator>>=(int);
+  S2 operator&=(int);
+  S2 operator|=(int);
+  S2 operator^=(int);
+};
+
+// Test overloaded operators
+void test2() {
+  S2 x;
+  int y;
+  for (; y < 10; x++, y++) {}
+  for (; y < 10; ++x, y++) {}
+  for (; y < 10; x++, ++y) {}
+  for (; y < 10; ++x, ++y) {}
+  for (; y < 10; x--, ++y) {}
+  for (; y < 10; --x, ++y) {}
+  for (; y < 10; x = 5, ++y) {}
+  for (; y < 10; x *= 5, ++y) {}
+  for (; y < 10; x /= 5, ++y) {}
+  for (; y < 10; x %= 5, ++y) {}
+  for (; y < 10; x += 5, ++y) {}
+  for (; y < 10; x -= 5, ++y) {}
+  for (; y < 10; x <<= 5, ++y) {}
+  for (; y < 10; x >>= 5, ++y) {}
+  for (; y < 10; x &= 5, ++y) {}
+  for (; y < 10; x |= 5, ++y) {}
+  for (; y < 10; x ^= 5, ++y) {}
+}
+
+// Test nested comma operators
+void test3() {
+  int x1, x2, x3;
+  int y1, *y2 = 0, y3 = 5;
+  for (int z1 = 5, z2 = 4, z3 = 3; x1 <4; ++x1) {}
+}
+
+class Stream {
+ public:
+  Stream& operator<<(int);
+} cout;
+
+int return_four() { return 5; }
+
+// Confusing "," for "<<"
+void test4() {
+  cout << 5 << return_four();
+  cout << 5, return_four();
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:3-[[@LINE-3]]:3}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:12-[[@LINE-4]]:12}:")"
+}
+
+// Confusing "," for "=="
+void test5() {
+  if (return_four(), 5) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:7-[[@LINE-3]]:7}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:20-[[@LINE-4]]:20}:")"
+
+  if (return_four() == 5) {}
+}
+
+// Confusing "," for "+"
+int test6() {
+  return return_four(), return_four();
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:10-[[@LINE-3]]:10}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:23-[[@LINE-4]]:23}:")"
+
+  return return_four() + return_four();
+}
+
+void Concat(int);
+void Concat(int, int);
+
+// Testing extra parentheses in function call
+void test7() {
+  Concat((return_four() , 5));
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:24-[[@LINE-4]]:24}:")"
+
+  Concat(return_four() , 5);
+}
+
+// Be sure to look through parentheses
+void test8() {
+  int x, y;
+  for (x = 0; return_four(), x;) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:15-[[@LINE-3]]:15}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:28-[[@LINE-4]]:28}:")"
+
+  for (x = 0; (return_four()), (x) ;) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:15-[[@LINE-3]]:15}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:30-[[@LINE-4]]:30}:")"
+}
+
+bool DoStuff();
+class S9 {
+public:
+ bool Advance();
+ bool More();
+};
+
+// Ignore comma operator in for-loop initializations and increments.
+void test9() {
+  int x, y;
+  for (x = 0, y = 5; x < y; ++x) {}
+  for (x = 0; x < 10; DoStuff(), ++x) {}
+  for (S9 s; s.More(); s.Advance(), ++x) {}
+}
+
+void test10() {
+  int x, y;
+  ++x, ++y;
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:3-[[@LINE-3]]:3}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:6-[[@LINE-4]]:6}:")"
+}
+
+// Ignore comma operator in templates.
+namespace test11 {
+template <bool T>
+struct B { static const bool value = T; };
+
+typedef B<true> true_type;
+typedef B<false> false_type;
+
+template <bool...>
+struct bool_seq;
+
+template <typename... xs>
+class Foo {
+  typedef bool_seq<(xs::value, true)...> all_true;
+  typedef bool_seq<(xs::value, false)...> all_false;
+  typedef bool_seq<xs::value...> seq;
+};
+
+const auto X = Foo<true_type>();
+}
+
+namespace test12 {
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+};
+class MutexLock {
+public:
+  MutexLock(Mutex &);
+  MutexLock();
+  ~MutexLock();
+};
+class BuiltinMutex {
+  Mutex M;
+};
+Mutex StatusMutex;
+bool Status;
+
+bool get_status() {
+  return (MutexLock(StatusMutex), Status);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:33-[[@LINE-4]]:33}:")"
+  return (MutexLock(), Status);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:22-[[@LINE-4]]:22}:")"
+  return (BuiltinMutex(), Status);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:25-[[@LINE-4]]:25}:")"
+}
+}
+
+// Check for comma operator in conditions.
+void test13(int x) {
+  x = (return_four(), x);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:8-[[@LINE-3]]:8}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:21-[[@LINE-4]]:21}:")"
+
+  int y = (return_four(), x);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:12-[[@LINE-3]]:12}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:25-[[@LINE-4]]:25}:")"
+
+  for (; return_four(), x;) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:10-[[@LINE-3]]:10}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:23-[[@LINE-4]]:23}:")"
+
+  while (return_four(), x) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:10-[[@LINE-3]]:10}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:23-[[@LINE-4]]:23}:")"
+
+  if (return_four(), x) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:7-[[@LINE-3]]:7}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:20-[[@LINE-4]]:20}:")"
+
+  do { } while (return_four(), x);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:17-[[@LINE-3]]:17}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:30-[[@LINE-4]]:30}:")"
+}
+
+// Nested comma operator with fix-its.
+void test14() {
+  return_four(), return_four(), return_four(), return_four();
+  // expected-warning@-1 3{{comma operator}}
+  // expected-note@-2 3{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:3-[[@LINE-3]]:3}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:16-[[@LINE-4]]:16}:")"
+  // CHECK: fix-it:{{.*}}:{[[@LINE-5]]:18-[[@LINE-5]]:18}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-6]]:31-[[@LINE-6]]:31}:")"
+  // CHECK: fix-it:{{.*}}:{[[@LINE-7]]:33-[[@LINE-7]]:33}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-8]]:46-[[@LINE-8]]:46}:")"
+}
diff --git a/test/SemaCXX/warn-float-conversion.cpp b/test/SemaCXX/warn-float-conversion.cpp
index 22c3304..fc22189 100644
--- a/test/SemaCXX/warn-float-conversion.cpp
+++ b/test/SemaCXX/warn-float-conversion.cpp
@@ -1,5 +1,10 @@
-// RUN: %clang_cc1 -verify -fsyntax-only %s -Wfloat-conversion
+// RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-pc-linux-gnu %s -Wno-literal-conversion -Wfloat-conversion -DFLOAT_CONVERSION -DZERO -DBOOL -DCONSTANT_BOOL -DOVERFLOW
+// RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-pc-linux-gnu %s -Wno-conversion -Wfloat-overflow-conversion -DOVERFLOW
+// RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-pc-linux-gnu %s -Wno-conversion -Wfloat-zero-conversion -DZERO
 
+float ReturnFloat();
+
+#ifdef FLOAT_CONVERSION
 bool ReturnBool(float f) {
   return f;  //expected-warning{{conversion}}
 }
@@ -36,3 +41,49 @@
   l = ld;  //expected-warning{{conversion}}
 }
 
+void Test() {
+  int a1 = 10.0/2.0;  //expected-warning{{conversion}}
+  int a2 = 1.0/2.0;  //expected-warning{{conversion}}
+  bool a3 = ReturnFloat();  //expected-warning{{conversion}}
+  int a4 = 1e30 + 1;  //expected-warning{{conversion}}
+}
+
+void TestConstantFloat() {
+  // Don't warn on exact floating literals.
+  int a1 = 5.0;
+  int a2 = 1e3;
+
+  int a3 = 5.5;  // caught by -Wliteral-conversion
+  int a4 = 500.44;  // caught by -Wliteral-conversion
+
+  int b1 = 5.0 / 1.0;  //expected-warning{{conversion}}
+  int b2 = 5.0 / 2.0;  //expected-warning{{conversion}}
+
+  const float five = 5.0;
+
+  int b3 = five / 1.0;  //expected-warning{{conversion}}
+  int b4 = five / 2.0;  //expected-warning{{conversion}}
+}
+#endif  // FLOAT_CONVERSION
+
+#ifdef ZERO
+void TestZero() {
+  const float half = .5;
+  int a1 = half;  // expected-warning{{implicit conversion from 'const float' to 'int' changes non-zero value from 0.5 to 0}}
+  int a2 = 1.0 / 2.0;  // expected-warning{{implicit conversion from 'double' to 'int' changes non-zero value from 0.5 to 0}}
+  int a3 = 5;
+}
+#endif  // ZERO
+
+#ifdef OVERFLOW
+void TestOverflow() {
+  char a = 500.0;  // caught by -Wliteral-conversion
+  char b = -500.0;  // caught by -Wliteral-conversion
+
+  const float LargeNumber = 1024;
+  char c = LargeNumber;  // expected-warning{{implicit conversion of out of range value from 'const float' to 'char' changes value from 1024 to 127}}
+  char d = 400.0 + 400.0;  // expected-warning{{implicit conversion of out of range value from 'double' to 'char' changes value from 800 to 127}}
+
+  char e = 1.0 / 0.0;  // expected-warning{{implicit conversion of out of range value from 'double' to 'char' changes value from +Inf to 127}}
+}
+#endif  // OVERFLOW
diff --git a/test/SemaCXX/warn-literal-conversion.cpp b/test/SemaCXX/warn-literal-conversion.cpp
index 5d4b6f7..875aa1d 100644
--- a/test/SemaCXX/warn-literal-conversion.cpp
+++ b/test/SemaCXX/warn-literal-conversion.cpp
@@ -25,7 +25,7 @@
   // Test passing a literal floating-point value to a function that takes an integer.
   foo(1.2F); // expected-warning {{implicit conversion from 'float' to 'int' changes value from 1.2 to 1}}
 
-  int y10 = -1.2F;  // expected-warning {{implicit conversion from 'float' to 'int' changes value from 1.2 to 1}}
+  int y10 = -1.2F;  // expected-warning {{implicit conversion from 'float' to 'int' changes value from -1.2 to -1}}
 
   // -Wliteral-conversion does NOT catch const values.
   // (-Wconversion DOES catch them.)
diff --git a/test/SemaCXX/warn-loop-analysis.cpp b/test/SemaCXX/warn-loop-analysis.cpp
index c666c48..25ec7a7 100644
--- a/test/SemaCXX/warn-loop-analysis.cpp
+++ b/test/SemaCXX/warn-loop-analysis.cpp
@@ -260,3 +260,9 @@
     i--;
   }
 }
+
+int f(int);
+void test9() {
+  // Don't warn when variable is defined by the loop condition.
+  for (int i = 0; int x = f(i); ++i) {}
+}
diff --git a/test/SemaCXX/warn-memset-bad-sizeof.cpp b/test/SemaCXX/warn-memset-bad-sizeof.cpp
index cca15fc..0a78caa 100644
--- a/test/SemaCXX/warn-memset-bad-sizeof.cpp
+++ b/test/SemaCXX/warn-memset-bad-sizeof.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wno-sizeof-array-argument %s
 //
+extern "C" void *bzero(void *, unsigned);
 extern "C" void *memset(void *, int, unsigned);
 extern "C" void *memmove(void *s1, const void *s2, unsigned n);
 extern "C" void *memcpy(void *s1, const void *s2, unsigned n);
@@ -47,6 +48,19 @@
   memset(heap_buffer, 0, sizeof(heap_buffer));  // \
       // expected-warning {{'memset' call operates on objects of type 'char' while the size is based on a different type 'char *'}} expected-note{{did you mean to provide an explicit length?}}
 
+  bzero(&s, sizeof(&s));  // \
+      // expected-warning {{'bzero' call operates on objects of type 'S' while the size is based on a different type 'S *'}} expected-note{{did you mean to remove the addressof in the argument to 'sizeof' (and multiply it by the number of elements)?}}
+  bzero(ps, sizeof(ps));  // \
+      // expected-warning {{'bzero' call operates on objects of type 'S' while the size is based on a different type 'S *'}} expected-note{{did you mean to dereference the argument to 'sizeof' (and multiply it by the number of elements)?}}
+  bzero(ps2, sizeof(ps2));  // \
+      // expected-warning {{'bzero' call operates on objects of type 'S' while the size is based on a different type 'PS' (aka 'S *')}} expected-note{{did you mean to dereference the argument to 'sizeof' (and multiply it by the number of elements)?}}
+  bzero(ps2, sizeof(typeof(ps2)));  // \
+      // expected-warning {{argument to 'sizeof' in 'bzero' call is the same pointer type}}
+  bzero(ps2, sizeof(PS));  // \
+      // expected-warning {{argument to 'sizeof' in 'bzero' call is the same pointer type}}
+  bzero(heap_buffer, sizeof(heap_buffer));  // \
+      // expected-warning {{'bzero' call operates on objects of type 'char' while the size is based on a different type 'char *'}} expected-note{{did you mean to provide an explicit length?}}
+
   memcpy(&s, 0, sizeof(&s));  // \
       // expected-warning {{'memcpy' call operates on objects of type 'S' while the size is based on a different type 'S *'}} expected-note{{did you mean to remove the addressof in the argument to 'sizeof' (and multiply it by the number of elements)?}}
   memcpy(0, &s, sizeof(&s));  // \
@@ -73,6 +87,21 @@
   memset(arr, 0, sizeof(arr));
   memset(parr, 0, sizeof(parr));
 
+  bzero((void*)&s, sizeof(&s));
+  bzero(&s, sizeof(s));
+  bzero(&s, sizeof(S));
+  bzero(&s, sizeof(const S));
+  bzero(&s, sizeof(volatile S));
+  bzero(&s, sizeof(volatile const S));
+  bzero(&foo, sizeof(CFoo));
+  bzero(&foo, sizeof(VFoo));
+  bzero(&foo, sizeof(CVFoo));
+  bzero(ps, sizeof(*ps));
+  bzero(ps2, sizeof(*ps2));
+  bzero(ps2, sizeof(typeof(*ps2)));
+  bzero(arr, sizeof(arr));
+  bzero(parr, sizeof(parr));
+
   memcpy(&foo, &const_foo, sizeof(Foo));
   memcpy((void*)&s, 0, sizeof(&s));
   memcpy(0, (void*)&s, sizeof(&s));
@@ -96,12 +125,17 @@
   int iarr[14];
   memset(&iarr[0], 0, sizeof iarr);
   memset(iarr, 0, sizeof iarr);
+  bzero(&iarr[0], sizeof iarr);
+  bzero(iarr, sizeof iarr);
 
   int* iparr[14];
   memset(&iparr[0], 0, sizeof iparr);
   memset(iparr, 0, sizeof iparr);
+  bzero(&iparr[0], sizeof iparr);
+  bzero(iparr, sizeof iparr);
 
   memset(m, 0, sizeof(Mat));
+  bzero(m, sizeof(Mat));
 
   // Copy to raw buffer shouldn't warn either
   memcpy(&foo, &arr, sizeof(Foo));
@@ -114,12 +148,21 @@
     for (;;) {}
     &s;
   }), 0, sizeof(s));
+
+  bzero(({
+    if (0) {}
+    while (0) {}
+    for (;;) {}
+    &s;
+  }), sizeof(s));
 }
 
 namespace ns {
 void memset(void* s, char c, int n);
+void bzero(void* s, int n);
 void f(int* i) {
   memset(i, 0, sizeof(i));
+  bzero(i, sizeof(i));
 }
 }
 
diff --git a/test/SemaCXX/warn-shadow.cpp b/test/SemaCXX/warn-shadow.cpp
index 5ad2233..9d68fe7 100644
--- a/test/SemaCXX/warn-shadow.cpp
+++ b/test/SemaCXX/warn-shadow.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fsyntax-only -Wshadow %s
+// RUN: %clang_cc1 -verify -fsyntax-only -Wshadow-all %s
 
 namespace {
   int i; // expected-note {{previous declaration is here}}
@@ -29,7 +29,23 @@
 
 class A {
   static int data; // expected-note {{previous declaration}}
-  int field; // expected-note {{previous declaration}}
+  // expected-note@+1 {{previous declaration}}
+  int field;
+  int f1, f2, f3, f4; // expected-note 8 {{previous declaration is here}}
+
+  // The initialization is safe, but the modifications are not.
+  A(int f1, int f2, int f3, int f4) // expected-note-re 4 {{variable 'f{{[0-4]}}' is declared here}}
+	  : f1(f1) {
+    f1 = 3; // expected-warning {{modifying constructor parameter 'f1' that shadows a field of 'A'}}
+    f1 = 4; // one warning per shadow
+    f2++; // expected-warning {{modifying constructor parameter 'f2' that shadows a field of 'A'}}
+    --f3; // expected-warning {{modifying constructor parameter 'f3' that shadows a field of 'A'}}
+    f4 += 2; // expected-warning {{modifying constructor parameter 'f4' that shadows a field of 'A'}}
+  }
+
+  // The initialization is safe, but the modifications are not.
+  // expected-warning-re@+1 4 {{constructor parameter 'f{{[0-4]}}' shadows the field 'f{{[0-9]}}' of 'A'}}
+  A(int f1, int f2, int f3, int f4, double overload_dummy) {}
 
   void test() {
     char *field; // expected-warning {{declaration shadows a field of 'A'}}
diff --git a/test/SemaCXX/warn-thread-safety-analysis.cpp b/test/SemaCXX/warn-thread-safety-analysis.cpp
index b5d2f8e..bbb4f9b 100644
--- a/test/SemaCXX/warn-thread-safety-analysis.cpp
+++ b/test/SemaCXX/warn-thread-safety-analysis.cpp
@@ -5160,6 +5160,21 @@
 }  // end namespace  GlobalAcquiredBeforeAfterTest
 
 
+namespace LifetimeExtensionText {
+
+struct Holder {
+  virtual ~Holder() throw() {}
+  int i = 0;
+};
+
+void test() {
+  // Should not crash.
+  const auto &value = Holder().i;
+}
+
+} // end namespace LifetimeExtensionText
+
+
 namespace LockableUnions {
 
 union LOCKABLE MutexUnion {
diff --git a/test/SemaCXX/warn-unsequenced.cpp b/test/SemaCXX/warn-unsequenced.cpp
index 54e16a5..9e8a5b4 100644
--- a/test/SemaCXX/warn-unsequenced.cpp
+++ b/test/SemaCXX/warn-unsequenced.cpp
@@ -113,3 +113,58 @@
   (__builtin_object_size(&(++a, a), 0) ? 1 : 0) + ++a; // ok
   (__builtin_expect(++a, 0) ? 1 : 0) + ++a; // expected-warning {{multiple unsequenced modifications}}
 }
+
+namespace templates {
+
+template <typename T>
+struct Bar {
+  T get() { return 0; }
+};
+
+template <typename X>
+struct Foo {
+  int Run();
+  Bar<int> bar;
+};
+
+enum E {e1, e2};
+bool operator&&(E, E);
+
+void foo(int, int);
+
+template <typename X>
+int Foo<X>::Run() {
+  char num = 0;
+
+  // Before instantiation, Clang may consider the builtin operator here as
+  // unresolved function calls, and treat the arguments as unordered when
+  // the builtin operator evaluation is well-ordered.  Waiting until
+  // instantiation to check these expressions will prevent false positives.
+  if ((num = bar.get()) < 5 && num < 10) { }
+  if ((num = bar.get()) < 5 || num < 10) { }
+  if (static_cast<E>((num = bar.get()) < 5) || static_cast<E>(num < 10)) { }
+
+  if (static_cast<E>((num = bar.get()) < 5) && static_cast<E>(num < 10)) { }
+  // expected-warning@-1 {{unsequenced modification and access to 'num'}}
+
+  foo(num++, num++);
+  // expected-warning@-1 2{{multiple unsequenced modifications to 'num'}}
+  return 1;
+}
+
+int x = Foo<int>().Run();
+// expected-note@-1 {{in instantiation of member function 'templates::Foo<int>::Run'}}
+
+
+template <typename T>
+int Run2() {
+  T t = static_cast<T>(0);
+  return (t = static_cast<T>(1)) && t;
+  // expected-warning@-1 {{unsequenced modification and access to 't'}}
+}
+
+int y = Run2<bool>();
+int z = Run2<E>();
+// expected-note@-1{{in instantiation of function template specialization 'templates::Run2<templates::E>' requested here}}
+
+}
diff --git a/test/SemaCXX/warn-unused-private-field.cpp b/test/SemaCXX/warn-unused-private-field.cpp
index 932a7dc..fb34fa9 100644
--- a/test/SemaCXX/warn-unused-private-field.cpp
+++ b/test/SemaCXX/warn-unused-private-field.cpp
@@ -128,6 +128,7 @@
     int *use = &by_reference_;
     int test[2];
     test[as_array_index_] = 42;
+    int EverythingUsed::*ptr = &EverythingUsed::by_pointer_to_member_;
   }
 
   template<class T>
@@ -142,6 +143,7 @@
   int by_template_function_;
   int as_array_index_;
   int by_initializer_;
+  int by_pointer_to_member_;
 };
 
 class HasFeatureTest {
diff --git a/test/SemaCXX/warn-unused-value.cpp b/test/SemaCXX/warn-unused-value.cpp
index efabd50..d6ec0fb 100644
--- a/test/SemaCXX/warn-unused-value.cpp
+++ b/test/SemaCXX/warn-unused-value.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wunused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wunused-value -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wunused-value -std=c++11 %s
 
 // PR4806
 namespace test0 {
@@ -12,7 +14,10 @@
     // pointer to volatile has side effect (thus no warning)
     Box* box = new Box;
     box->i; // expected-warning {{expression result unused}}
-    box->j; // expected-warning {{expression result unused}}
+    box->j;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{expression result unused}}
+#endif
   }
 }
 
diff --git a/test/SemaObjC/attr-availability.m b/test/SemaObjC/attr-availability.m
index 6cbb3cc..02b7c5c 100644
--- a/test/SemaObjC/attr-availability.m
+++ b/test/SemaObjC/attr-availability.m
@@ -46,16 +46,16 @@
   [b proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in macOS 10.2}}
 
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partialMethod' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'partialMethod' to silence this warning}}
+  // expected-warning@+2 {{'partialMethod' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'partialMethod' in an @available check to silence this warning}}
 #endif
   [a partialMethod];
   [b partialMethod];  // no warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partial_proto_method' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'partial_proto_method' to silence this warning}}
+  // expected-warning@+2 {{'partial_proto_method' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'partial_proto_method' in an @available check to silence this warning}}
 #endif
   [a partial_proto_method];
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partial_proto_method' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'partial_proto_method' to silence this warning}}
+  // expected-warning@+2 {{'partial_proto_method' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'partial_proto_method' in an @available check to silence this warning}}
 #endif
   [b partial_proto_method];
 }
@@ -163,14 +163,14 @@
   [a partialMethod]; // no warning
   [a ipartialMethod1]; // no warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'ipartialMethod2' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'ipartialMethod2' to silence this warning}}
+  // expected-warning@+2 {{'ipartialMethod2' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'ipartialMethod2' in an @available check to silence this warning}}
 #endif
   [a ipartialMethod2];
   [a ppartialMethod]; // no warning
   [PartialI partialMethod]; // no warning
   [PartialI ipartialMethod1]; // no warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'ipartialMethod2' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'ipartialMethod2' to silence this warning}}
+  // expected-warning@+2 {{'ipartialMethod2' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'ipartialMethod2' in an @available check to silence this warning}}
 #endif
   [PartialI ipartialMethod2];
   [PartialI ppartialMethod]; // no warning
@@ -294,3 +294,39 @@
   [obj method]; // expected-error{{'method' is unavailable}}
 }
 @end
+
+#if defined(WARN_PARTIAL)
+
+int fn_10_5() __attribute__((availability(macosx, introduced=10.5)));
+int fn_10_7() __attribute__((availability(macosx, introduced=10.7))); // expected-note{{marked partial here}}
+int fn_10_8() __attribute__((availability(macosx, introduced=10.8))) { // expected-note{{marked partial here}}
+  return fn_10_7();
+}
+
+__attribute__((objc_root_class))
+@interface LookupAvailabilityBase
+-(void) method1;
+@end
+
+@implementation LookupAvailabilityBase
+-(void)method1 { fn_10_7(); } // expected-warning{{only available on macOS 10.7}} expected-note{{@available}}
+@end
+
+__attribute__((availability(macosx, introduced=10.7)))
+@interface LookupAvailability : LookupAvailabilityBase
+- (void)method2;
+- (void)method3;
+- (void)method4 __attribute__((availability(macosx, introduced=10.8)));
+@end
+
+@implementation LookupAvailability
+-(void)method2 { fn_10_7(); }
+-(void)method3 { fn_10_8(); } // expected-warning{{only available on macOS 10.8}} expected-note{{@available}}
+-(void)method4 { fn_10_8(); }
+@end
+
+int old_func() __attribute__((availability(macos, introduced=10.4))) {
+  fn_10_5();
+}
+
+#endif
diff --git a/test/SemaObjC/attr-nodebug.m b/test/SemaObjC/attr-nodebug.m
new file mode 100644
index 0000000..7cf8e6c
--- /dev/null
+++ b/test/SemaObjC/attr-nodebug.m
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+@interface NSObject
+- (void)doSomething __attribute__((nodebug));
+@end
diff --git a/test/SemaObjC/call-super-2.m b/test/SemaObjC/call-super-2.m
index 8927f3b..01acff7 100644
--- a/test/SemaObjC/call-super-2.m
+++ b/test/SemaObjC/call-super-2.m
@@ -106,3 +106,18 @@
 }
 @end
 
+@class C;
+@interface A // expected-note {{receiver is instance of class declared here}}
+- (instancetype)initWithCoder:(A *)coder;
+@end
+
+@interface B : A
+@end
+
+@implementation B
+- (instancetype)initWithCoder:(C *)coder {
+  if (0 != (self = [super initWithCode:code])) // expected-error {{use of undeclared identifier 'code'}} expected-warning {{instance method '-initWithCode:' not found}}
+    return (void *)0;
+  return (void *)0;
+}
+@end
diff --git a/test/SemaObjC/dllexport.m b/test/SemaObjC/dllexport.m
new file mode 100644
index 0000000..e90b982
--- /dev/null
+++ b/test/SemaObjC/dllexport.m
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+enum __declspec(dllexport) E { Val };
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+struct __declspec(dllexport) Record {};
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+
+__declspec(dllexport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllexport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
diff --git a/test/SemaObjC/dllimport.m b/test/SemaObjC/dllimport.m
new file mode 100644
index 0000000..b836077
--- /dev/null
+++ b/test/SemaObjC/dllimport.m
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+enum __declspec(dllimport) E { Val };
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+struct __declspec(dllimport) Record {};
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+
+__declspec(dllimport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllimport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
diff --git a/test/SemaObjC/method-warn-unused-attribute.m b/test/SemaObjC/method-warn-unused-attribute.m
index 042f442..b83dabf 100644
--- a/test/SemaObjC/method-warn-unused-attribute.m
+++ b/test/SemaObjC/method-warn-unused-attribute.m
@@ -9,8 +9,8 @@
 
 void foo(INTF *a) {
   [a garf];
-  [a fee]; // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
-  [INTF c]; // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  [a fee]; // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
+  [INTF c]; // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
 }
 
 
diff --git a/test/SemaObjC/nsobject-attribute.m b/test/SemaObjC/nsobject-attribute.m
index 6bd2d5d..7c8d75d 100644
--- a/test/SemaObjC/nsobject-attribute.m
+++ b/test/SemaObjC/nsobject-attribute.m
@@ -21,6 +21,8 @@
 @property (nonatomic, retain) CGColorRefNoNSObject color;
 // rdar://problem/12197822
 @property (strong) __attribute__((NSObject)) CFTypeRef myObj; // no-warning
+//rdar://problem/27747154
+@property (strong, nullable) CGColorRefNoNSObject color2; // no-warning
 @end
 
 void setProperty(id self, id value)  {
diff --git a/test/SemaObjC/nullability.m b/test/SemaObjC/nullability.m
index 36ac6b9..fbf014c 100644
--- a/test/SemaObjC/nullability.m
+++ b/test/SemaObjC/nullability.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fblocks -Woverriding-method-mismatch -Wno-nullability-declspec %s -verify
+// RUN: %clang_cc1 -fsyntax-only -fblocks -Woverriding-method-mismatch -Wno-nullability-declspec -Wnullable-to-nonnull-conversion %s -verify
 
 __attribute__((objc_root_class))
 @interface NSFoo
@@ -230,3 +230,29 @@
 
   int *x = (^ _Nullable id(void) { return 0; })(); // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'id _Nullable'}}
 }
+
+// Check nullability of conditional expressions.
+void conditional_expr(int c) {
+  NSFoo * _Nonnull p;
+  NSFoo * _Nonnull nonnullP;
+  NSFoo * _Nullable nullableP;
+  NSFoo * _Null_unspecified unspecifiedP;
+  NSFoo *noneP;
+
+  p = c ? nonnullP : nonnullP;
+  p = c ? nonnullP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? nonnullP : unspecifiedP;
+  p = c ? nonnullP : noneP;
+  p = c ? nullableP : nonnullP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? nullableP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? nullableP : unspecifiedP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? nullableP : noneP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? unspecifiedP : nonnullP;
+  p = c ? unspecifiedP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? unspecifiedP : unspecifiedP;
+  p = c ? unspecifiedP : noneP;
+  p = c ? noneP : nonnullP;
+  p = c ? noneP : nullableP; // expected-warning{{implicit conversion from nullable pointer 'NSFoo * _Nullable' to non-nullable pointer type 'NSFoo * _Nonnull'}}
+  p = c ? noneP : unspecifiedP;
+  p = c ? noneP : noneP;
+}
diff --git a/test/SemaObjC/objc-array-literal.m b/test/SemaObjC/objc-array-literal.m
index 281994a..c0a0c67 100644
--- a/test/SemaObjC/objc-array-literal.m
+++ b/test/SemaObjC/objc-array-literal.m
@@ -67,3 +67,11 @@
   x = @[ @"stuff", @"hello" "world"]; // expected-warning {{concatenated NSString literal for an NSArray expression}}
   return x;
 }
+
+enum XXXYYYZZZType { XXXYYYZZZTypeAny }; // expected-note {{'XXXYYYZZZTypeAny' declared here}}
+void foo() {
+  NSArray *array = @[
+    @(XXXYYYZZZTypeA),                 // expected-error {{use of undeclared identifier 'XXXYYYZZZTypeA'; did you mean 'XXXYYYZZZTypeAny'}}
+    @(XXXYYYZZZTypeSomethingSomething) // expected-error {{use of undeclared identifier 'XXXYYYZZZTypeSomethingSomething'}}
+  ];
+}
diff --git a/test/SemaObjC/objc-class-property.m b/test/SemaObjC/objc-class-property.m
index 5e1b866..5628597 100644
--- a/test/SemaObjC/objc-class-property.m
+++ b/test/SemaObjC/objc-class-property.m
@@ -33,6 +33,7 @@
 
 int test() {
   A *a = [[A alloc] init];
+  a.c; // expected-error {{property 'c' is a class property; did you mean to access it with class 'A'}}
   return a.x + A.c;
 }
 
@@ -43,3 +44,16 @@
 void message_class(Class me) {
   [me c2];
 }
+
+@interface NSObject
+@end
+
+@interface MyClass : NSObject
+@property(class, readonly) int classProp; // expected-note {{property declared here}}
+@end
+
+@implementation MyClass // expected-warning {{class property 'classProp' requires method 'classProp' to be defined}}
+- (int)classProp { // Oops, mistakenly made this an instance method.
+  return 8;
+}
+@end
diff --git a/test/SemaObjC/objc-dictionary-literal.m b/test/SemaObjC/objc-dictionary-literal.m
index 1a2c29f..ce301a0 100644
--- a/test/SemaObjC/objc-dictionary-literal.m
+++ b/test/SemaObjC/objc-dictionary-literal.m
@@ -63,3 +63,10 @@
 	return 0;
 }
 
+enum XXXYYYZZZType { XXXYYYZZZTypeAny }; // expected-note {{'XXXYYYZZZTypeAny' declared here}}
+void foo() {
+  NSDictionary *d = @{
+    @"A" : @(XXXYYYZZZTypeA), // expected-error {{use of undeclared identifier 'XXXYYYZZZTypeA'; did you mean 'XXXYYYZZZTypeAny'}}
+    @"F" : @(XXXYYYZZZTypeSomethingSomething), // expected-error {{use of undeclared identifier 'XXXYYYZZZTypeSomethingSomething'}}
+  };
+}
diff --git a/test/SemaObjC/objcbridge-attribute-arc.m b/test/SemaObjC/objcbridge-attribute-arc.m
index 3bcfdf4..26dbce0 100644
--- a/test/SemaObjC/objcbridge-attribute-arc.m
+++ b/test/SemaObjC/objcbridge-attribute-arc.m
@@ -23,7 +23,10 @@
 
 typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) * CFUColorRef; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
 
-typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) * CFUColorRef; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
+// This error requires C11.
+#if __STDC_VERSION__ > 199901L
+typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) * CFUColorRef; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
+#endif
 
 typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) *CFUColor1Ref; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
 
diff --git a/test/SemaObjC/property-deprecated-warning.m b/test/SemaObjC/property-deprecated-warning.m
index cec3768..8cf4f97 100644
--- a/test/SemaObjC/property-deprecated-warning.m
+++ b/test/SemaObjC/property-deprecated-warning.m
@@ -9,7 +9,7 @@
 @property(nonatomic,assign) id ptarget __attribute__((availability(ios,introduced=2.0,deprecated=3.0))); // expected-note {{property 'ptarget' is declared deprecated here}} expected-note {{'ptarget' has been explicitly marked deprecated here}}
 
 #if defined(WARN_PARTIAL)
-// expected-note@+2 {{property 'partialPtarget' is declared partial here}} expected-note@+2 {{'partialPtarget' has been explicitly marked partial here}}
+// expected-note@+2 {{'partialPtarget' has been explicitly marked partial here}}
 #endif
 @property(nonatomic,assign) id partialPtarget __attribute__((availability(ios,introduced=5.0)));
 @end
@@ -24,7 +24,7 @@
 @property(nonatomic,assign) id target __attribute__((availability(ios,introduced=2.0,deprecated=3.0))); // expected-note {{property 'target' is declared deprecated here}} expected-note {{'setTarget:' has been explicitly marked deprecated here}}
 
 #if defined(WARN_PARTIAL)
-// expected-note@+2 {{property 'partialTarget' is declared partial here}} expected-note@+2 {{'setPartialTarget:' has been explicitly marked partial here}}
+// expected-note@+2 {{'setPartialTarget:' has been explicitly marked partial here}}
 #endif
 @property(nonatomic,assign) id partialTarget __attribute__((availability(ios,introduced=5.0)));
 @end
@@ -40,7 +40,7 @@
                                                                                     // expected-note 2 {{'setDep_target:' has been explicitly marked deprecated here}}
 
 #if defined(WARN_PARTIAL)
-// expected-note@+2 4 {{property 'partial_dep_target' is declared partial here}} expected-note@+2 2 {{'partial_dep_target' has been explicitly marked partial here}} expected-note@+2 2 {{'setPartial_dep_target:' has been explicitly marked partial here}}
+// expected-note@+2 2 {{'partial_dep_target' has been explicitly marked partial here}} expected-note@+2 2 {{'setPartial_dep_target:' has been explicitly marked partial here}}
 #endif
 @property(nonatomic,assign) id partial_dep_target  __attribute__((availability(ios,introduced=5.0)));
 @end
@@ -57,7 +57,7 @@
   [self setPtarget: (id)0]; // no-warning
   [self setPartialTarget: (id)0]; // no-warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partial_dep_target' is partial: introduced in iOS 5.0}} expected-warning@+2 {{'setPartial_dep_target:' is partial: introduced in iOS 5.0}} expected-note@+2 {{explicitly redeclare 'partial_dep_target' to silence this warning}} expected-note@+2 {{explicitly redeclare 'setPartial_dep_target:' to silence this warning}}
+  // expected-warning@+2 {{'partial_dep_target' is only available on iOS 5.0 or newer}} expected-warning@+2 {{'setPartial_dep_target:' is only available on iOS 5.0 or newer}} expected-note@+2 {{enclose 'partial_dep_target' in an @available check to silence this warning}} expected-note@+2 {{enclose 'setPartial_dep_target:' in an @available check to silence this warning}}
 #endif
   [self setPartial_dep_target: [self partial_dep_target]];
 
@@ -82,11 +82,11 @@
   [self setPtarget: (id)0]; // no-warning
 
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'setPartialTarget:' is partial: introduced in iOS 5.0}} expected-note@+2 {{explicitly redeclare 'setPartialTarget:' to silence this warning}}
+  // expected-warning@+2 {{'setPartialTarget:' is only available on iOS 5.0 or newer}} expected-note@+2 {{enclose 'setPartialTarget:' in an @available check to silence this warning}}
 #endif
   [self setPartialTarget: (id)0];
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partial_dep_target' is partial: introduced in iOS 5.0}} expected-warning@+2 {{'setPartial_dep_target:' is partial: introduced in iOS 5.0}} expected-note@+2 {{explicitly redeclare 'partial_dep_target' to silence this warning}} expected-note@+2 {{explicitly redeclare 'setPartial_dep_target:' to silence this warning}}
+  // expected-warning@+2 {{'partial_dep_target' is only available on iOS 5.0 or newer}} expected-warning@+2 {{'setPartial_dep_target:' is only available on iOS 5.0 or newer}} expected-note@+2 {{enclose 'partial_dep_target' in an @available check to silence this warning}} expected-note@+2 {{enclose 'setPartial_dep_target:' in an @available check to silence this warning}}
 #endif
   [self setPartial_dep_target: [self partial_dep_target]];
   [self setPartialPtarget: (id)0]; // no-warning
@@ -100,12 +100,12 @@
 @property(setter=setNewDelegate:,assign) id delegate __attribute__((availability(ios,introduced=2.0,deprecated=3.0))); // expected-note {{'setNewDelegate:' has been explicitly marked deprecated here}} expected-note {{property 'delegate' is declared deprecated here}}
 
 #if defined(WARN_PARTIAL)
-// expected-note@+2 {{property 'partialEnabled' is declared partial here}} expected-note@+2 {{'partialIsEnabled' has been explicitly marked partial here}}
+// expected-note@+2 {{'partialIsEnabled' has been explicitly marked partial here}}
 #endif
 @property(getter=partialIsEnabled,assign) BOOL partialEnabled __attribute__((availability(ios,introduced=5.0)));
 
 #if defined(WARN_PARTIAL)
-// expected-note@+2 {{property 'partialDelegate' is declared partial here}} expected-note@+2 {{'partialSetNewDelegate:' has been explicitly marked partial here}}
+// expected-note@+2 {{'partialSetNewDelegate:' has been explicitly marked partial here}}
 #endif
 @property(setter=partialSetNewDelegate:,assign) id partialDelegate __attribute__((availability(ios,introduced=5.0)));
 @end
@@ -115,7 +115,7 @@
     [obj setNewDelegate:0]; // expected-warning {{'setNewDelegate:' is deprecated: first deprecated in iOS 3.0}}
 
 #if defined(WARN_PARTIAL)
-// expected-warning@+2 {{'partialIsEnabled' is partial: introduced in iOS 5.0}} expected-warning@+3 {{'partialSetNewDelegate:' is partial: introduced in iOS 5.0}} expected-note@+2 {{explicitly redeclare 'partialIsEnabled' to silence this warning}} expected-note@+3 {{explicitly redeclare 'partialSetNewDelegate:' to silence this warning}}
+  // expected-warning@+2 {{'partialIsEnabled' is only available on iOS 5.0 or newer}} expected-warning@+3 {{'partialSetNewDelegate:' is only available on iOS 5.0 or newer}} expected-note@+2 {{enclose 'partialIsEnabled' in an @available check to silence this warning}} expected-note@+3 {{enclose 'partialSetNewDelegate:' in an @available check to silence this warning}}
 #endif
   if ([obj partialIsEnabled])
     [obj partialSetNewDelegate:0];
@@ -138,7 +138,7 @@
   if (flag)
     return [obj partialPtarget];  // no-warning
 #if defined(WARN_PARTIAL)
-// expected-warning@+2 {{'partialPtarget' is partial: introduced in iOS 5.0}} expected-note@+2 {{explicitly redeclare 'partialPtarget' to silence this warning}}
+// expected-warning@+2 {{'partialPtarget' is only available on iOS 5.0 or newer}} expected-note@+2 {{enclose 'partialPtarget' in an @available check to silence this warning}}
 #endif
   return [obj2 partialPtarget];
 }
diff --git a/test/SemaObjC/typo-correction-arc.m b/test/SemaObjC/typo-correction-arc.m
new file mode 100644
index 0000000..206d545
--- /dev/null
+++ b/test/SemaObjC/typo-correction-arc.m
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple i386-apple-macosx10.10 -fobjc-arc -fsyntax-only -Wno-objc-root-class %s -verify
+
+typedef unsigned long NSUInteger;
+
+id nameless;                                  // expected-note{{'nameless' declared here}}
+
+@interface NSArray
+- (instancetype)initWithObjects:(const id[])objects count:(NSUInteger)count;
+@end
+
+@interface I
+@property NSArray *array;
+- (id)getArrayById:(id)name;
+- (void)setArrayValue:(id)array;
+@end
+
+@interface J
+- (void)setArray:(id)array;
+- (void)setIvarArray;
+@end
+
+@implementation J {
+  I *i;
+}
+- (void)setArray:(id)array {  // expected-note{{'array' declared here}}
+  i.array = aray;             // expected-error{{use of undeclared identifier 'aray'; did you mean 'array'}}
+}
+- (void)setIvarArray {
+  [i setArrayValue:[i getArrayById:nameles]]; // expected-error{{use of undeclared identifier 'nameles'; did you mean 'nameless'}}
+}
+@end
+
diff --git a/test/SemaObjC/unguarded-availability.m b/test/SemaObjC/unguarded-availability.m
new file mode 100644
index 0000000..fe25c8b
--- /dev/null
+++ b/test/SemaObjC/unguarded-availability.m
@@ -0,0 +1,180 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macosx-10.9 -Wunguarded-availability -fblocks -fsyntax-only -verify %s
+// RUN: %clang_cc1 -xobjective-c++ -DOBJCPP -triple x86_64-apple-macosx-10.9 -Wunguarded-availability -fblocks -fsyntax-only -verify %s
+
+#define AVAILABLE_10_0  __attribute__((availability(macos, introduced = 10.0)))
+#define AVAILABLE_10_11 __attribute__((availability(macos, introduced = 10.11)))
+#define AVAILABLE_10_12 __attribute__((availability(macos, introduced = 10.12)))
+
+int func_10_11() AVAILABLE_10_11; // expected-note 4 {{'func_10_11' has been explicitly marked partial here}}
+
+#ifdef OBJCPP
+// expected-note@+2 {{marked partial here}}
+#endif
+int func_10_12() AVAILABLE_10_12; // expected-note 5 {{'func_10_12' has been explicitly marked partial here}}
+
+int func_10_0() AVAILABLE_10_0;
+
+void use_func() {
+  func_10_11(); // expected-warning{{'func_10_11' is only available on macOS 10.11 or newer}} expected-note{{enclose 'func_10_11' in an @available check to silence this warning}}
+
+  if (@available(macos 10.11, *))
+    func_10_11();
+  else
+    func_10_11(); // expected-warning{{'func_10_11' is only available on macOS 10.11 or newer}} expected-note{{enclose 'func_10_11' in an @available check to silence this warning}}
+}
+
+void defn_10_11() AVAILABLE_10_11;
+
+void defn_10_11() {
+  func_10_11();
+}
+
+void nested_ifs() {
+  if (@available(macos 10.12, *)) {
+    if (@available(macos 10.10, *)) {
+      func_10_12();
+    } else {
+      func_10_12();
+    }
+  } else {
+    func_10_12(); // expected-warning{{'func_10_12' is only available on macOS 10.12 or newer}} expected-note{{enclose 'func_10_12' in an @available check to silence this warning}}
+  }
+}
+
+void star_case() {
+  if (@available(ios 9, *)) {
+    func_10_11(); // expected-warning{{'func_10_11' is only available on macOS 10.11 or newer}} expected-note{{enclose 'func_10_11' in an @available check to silence this warning}}
+    func_10_0();
+  } else
+    func_10_11(); // expected-warning{{'func_10_11' is only available on macOS 10.11 or newer}} expected-note{{enclose 'func_10_11' in an @available check to silence this warning}}
+
+  if (@available(macos 10.11, *)) {
+    if (@available(ios 8, *)) {
+      func_10_11();
+      func_10_12(); // expected-warning{{'func_10_12' is only available on macOS 10.12 or newer}} expected-note{{enclose}}
+    } else {
+      func_10_11();
+      func_10_12(); // expected-warning{{'func_10_12' is only available on macOS 10.12 or newer}} expected-note{{enclose}}
+    }
+  }
+}
+
+typedef int int_10_11 AVAILABLE_10_11; // expected-note {{'int_10_11' has been explicitly marked partial here}}
+#ifdef OBJCPP
+// expected-note@+2 {{marked partial here}}
+#endif
+typedef int int_10_12 AVAILABLE_10_12; // expected-note 3 {{'int_10_12' has been explicitly marked partial here}}
+
+void use_typedef() {
+  int_10_11 x; // expected-warning{{'int_10_11' is only available on macOS 10.11 or newer}} expected-note{{enclose 'int_10_11' in an @available check to silence this warning}}
+}
+
+__attribute__((objc_root_class))
+AVAILABLE_10_11 @interface Class_10_11 {
+  int_10_11 foo;
+  int_10_12 bar; // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} expected-note{{redeclare}}
+}
+- (void)method1;
+- (void)method2;
+@end
+
+@implementation Class_10_11
+- (void) method1 {
+  func_10_11();
+  func_10_12(); // expected-warning{{'func_10_12' is only available on macOS 10.12 or newer}} expected-note{{enclose 'func_10_12' in an @available check to silence this warning}}
+}
+
+- (void)method2 AVAILABLE_10_12 {
+  func_10_12();
+}
+
+@end
+
+int protected_scope() {
+  if (@available(macos 10.20, *)) { // expected-note 2 {{jump enters controlled statement of if available}}
+  label1:
+    return 0;
+  } else {
+  label2:
+    goto label1; // expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+  goto label2; // expected-error{{cannot jump from this goto statement to its label}}
+}
+
+struct S {
+  int m1;
+  int m2 __attribute__((availability(macos, introduced = 10.12))); // expected-note{{marked partial here}}
+
+  struct Nested {
+    int nested_member __attribute__((availability(macos, introduced = 10.12))); // expected-note{{marked partial here}}
+  } n;
+};
+
+int test_members() {
+  struct S s;
+  (void)s.m1;
+  (void)s.m2; // expected-warning{{'m2' is only available on macOS 10.12 or newer}} expected-note{{@available}}
+
+  (void)s.n.nested_member; // expected-warning{{'nested_member' is only available on macOS 10.12 or newer}} expected-note{{@available}}
+}
+
+void test_blocks() {
+  (void) ^{
+    func_10_12(); // expected-warning{{'func_10_12' is only available on macOS 10.12 or newer}} expected-note{{@available}}
+  };
+}
+
+void test_params(int_10_12 x); // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} expected-note{{redeclare}}
+
+// FIXME: This should be fine!
+void test_params2(int_10_12 x) AVAILABLE_10_12; // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} expected-note{{redeclare}}
+
+#ifdef OBJCPP
+
+int f(char) AVAILABLE_10_12;
+int f(int);
+
+template <class T> int use_f() {
+  // FIXME: We should warn here!
+  return f(T());
+}
+
+int a = use_f<int>();
+int b = use_f<char>();
+
+template <class> int use_at_available() {
+  if (@available(macos 10.12, *))
+    return func_10_12();
+  else
+    return func_10_12(); // expected-warning {{'func_10_12' is only available on macOS 10.12 or newer}} expected-note{{enclose}}
+}
+
+int instantiate_template() {
+  if (@available(macos 10.12, *)) {
+    use_at_available<char>();
+  } else {
+    use_at_available<float>();
+  }
+}
+
+template <class>
+int with_availability_attr() AVAILABLE_10_11 { // expected-note 2 {{marked partial here}}
+  return 0;
+}
+
+int instantiate_with_availability_attr() {
+  if (@available(macos 10.12, *))
+    with_availability_attr<char>();
+  else
+    with_availability_attr<int>(); // expected-warning {{'with_availability_attr<int>' is only available on macOS 10.11 or newer}} expected-note {{enclose}}
+}
+
+int instantiate_availability() {
+  if (@available(macos 10.12, *))
+    with_availability_attr<int_10_12>();
+  else
+    with_availability_attr<int_10_12>(); // expected-warning{{'with_availability_attr<int>' is only available on macOS 10.11 or newer}} expected-warning{{'int_10_12' is only available on macOS 10.12 or newer}} expected-note 2 {{enclose}}
+}
+
+#endif
diff --git a/test/SemaObjCXX/dllexport.mm b/test/SemaObjCXX/dllexport.mm
new file mode 100644
index 0000000..739749f
--- /dev/null
+++ b/test/SemaObjCXX/dllexport.mm
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+enum __declspec(dllexport) E { };
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#if __has_feature(cxx_strong_enums)
+enum class __declspec(dllexport) F { };
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#endif
+
+__declspec(dllexport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllexport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
+
diff --git a/test/SemaObjCXX/dllimport.mm b/test/SemaObjCXX/dllimport.mm
new file mode 100644
index 0000000..4c348c4
--- /dev/null
+++ b/test/SemaObjCXX/dllimport.mm
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+enum __declspec(dllimport) E { };
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#if __has_feature(cxx_strong_enums)
+enum class __declspec(dllimport) F { };
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#endif
+
+__declspec(dllimport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllimport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
diff --git a/test/SemaObjCXX/foreach.mm b/test/SemaObjCXX/foreach.mm
index d1302c1..99f5d0c 100644
--- a/test/SemaObjCXX/foreach.mm
+++ b/test/SemaObjCXX/foreach.mm
@@ -6,10 +6,8 @@
 void f(NSArray *a) {
     id keys;
     for (int i : a); // expected-error{{selector element type 'int' is not a valid object}} 
-    for ((id)2 : a);  // expected-error {{for range declaration must declare a variable}} \
-                      // expected-warning {{expression result unused}}
-    for (2 : a); // expected-error {{for range declaration must declare a variable}} \
-                 // expected-warning {{expression result unused}}
+    for ((id)2 : a);  // expected-error {{for range declaration must declare a variable}}
+    for (2 : a); // expected-error {{for range declaration must declare a variable}}
   
   for (id thisKey : keys);
 
@@ -65,8 +63,7 @@
 @end
 void test2(NSObject<NSFastEnumeration> *collection) {
   Test2 *obj;
-  for (obj.prop : collection) { // expected-error {{for range declaration must declare a variable}} \
-                                // expected-warning {{property access result unused - getters should not be used for side effects}}
+  for (obj.prop : collection) { // expected-error {{for range declaration must declare a variable}}
   }
 }
 
diff --git a/test/SemaObjCXX/instancetype.mm b/test/SemaObjCXX/instancetype.mm
index 89ff2b4..f61d6bf 100644
--- a/test/SemaObjCXX/instancetype.mm
+++ b/test/SemaObjCXX/instancetype.mm
@@ -214,3 +214,10 @@
   return 0;
 }
 @end
+
+// PR27822
+@class NSString;
+namespace pr27822 { }
+@interface AXPlatformNodeCocoa
++ (NSString*)nativeRoleFromAXRole:(pr27822::UndeclaredIdentifier)role; // expected-error {{expected a type}}
+@end
diff --git a/test/SemaOpenCL/access-qualifier.cl b/test/SemaOpenCL/access-qualifier.cl
new file mode 100644
index 0000000..7e5c70f
--- /dev/null
+++ b/test/SemaOpenCL/access-qualifier.cl
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -pedantic -fsyntax-only -cl-std=CL1.2 %s
+// RUN: %clang_cc1 -verify -pedantic -fsyntax-only -cl-std=CL2.0 %s
+
+typedef image1d_t img1d_ro_default; // expected-note {{previously declared 'read_only' here}}
+
+typedef write_only image1d_t img1d_wo; // expected-note {{previously declared 'write_only' here}}
+typedef read_only image1d_t img1d_ro;
+
+#if __OPENCL_C_VERSION__ >= 200
+typedef read_write image1d_t img1d_rw;
+#endif
+
+typedef int Int;
+typedef read_only int IntRO; // expected-error {{access qualifier can only be used for pipe and image type}}
+
+
+void myWrite(write_only image1d_t); // expected-note {{passing argument to parameter here}} expected-note {{passing argument to parameter here}}
+void myRead(read_only image1d_t); // expected-note {{passing argument to parameter here}}
+
+#if __OPENCL_C_VERSION__ >= 200
+void myReadWrite(read_write image1d_t);
+#else
+void myReadWrite(read_write image1d_t); // expected-error {{access qualifier 'read_write' can not be used for '__read_write image1d_t' prior to OpenCL version 2.0}}
+#endif
+
+
+kernel void k1(img1d_wo img) {
+  myRead(img); // expected-error {{passing 'img1d_wo' (aka '__write_only image1d_t') to parameter of incompatible type '__read_only image1d_t'}}
+}
+
+kernel void k2(img1d_ro img) {
+  myWrite(img); // expected-error {{passing 'img1d_ro' (aka '__read_only image1d_t') to parameter of incompatible type '__write_only image1d_t'}}
+}
+
+kernel void k3(img1d_wo img) {
+  myWrite(img);
+}
+
+#if __OPENCL_C_VERSION__ >= 200
+kernel void k4(img1d_rw img) {
+  myReadWrite(img);
+}
+#endif
+
+kernel void k5(img1d_ro_default img) {
+  myWrite(img); // expected-error {{passing 'img1d_ro_default' (aka '__read_only image1d_t') to parameter of incompatible type '__write_only image1d_t'}}
+}
+
+kernel void k6(img1d_ro img) {
+  myRead(img);
+}
+
+kernel void k7(read_only img1d_wo img){} // expected-error {{multiple access qualifiers}}
+
+kernel void k8(write_only img1d_ro_default img){} // expected-error {{multiple access qualifiers}}
+
+kernel void k9(read_only int i){} // expected-error{{access qualifier can only be used for pipe and image type}}
+
+kernel void k10(read_only Int img){} // expected-error {{access qualifier can only be used for pipe and image type}}
+
+kernel void k11(read_only write_only image1d_t i){} // expected-error{{multiple access qualifiers}}
+
+kernel void k12(read_only read_only image1d_t i){} // expected-error{{multiple access qualifiers}}
+
+#if __OPENCL_C_VERSION__ >= 200
+kernel void k13(read_write pipe int i){} // expected-error{{access qualifier 'read_write' can not be used for 'pipe int'}}
+#else
+kernel void k13(__read_write image1d_t i){} // expected-error{{access qualifier '__read_write' can not be used for '__read_write image1d_t' prior to OpenCL version 2.0}}
+#endif
diff --git a/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl b/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
index 50363f2..97fd07a 100644
--- a/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
+++ b/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
@@ -225,3 +225,69 @@
 // expected-error@-2{{passing '__constant int *' to parameter of type '__generic int *' changes address space of pointer}}
 #endif
 }
+
+void test_ternary() {
+  AS int *var_cond;
+  generic int *var_gen;
+  global int *var_glob;
+  var_gen = 0 ? var_cond : var_glob;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__global int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  local int *var_loc;
+  var_gen = 0 ? var_cond : var_loc;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and '__local int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  constant int *var_const;
+  var_cond = 0 ? var_cond : var_const;
+#ifndef CONSTANT
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|generic}} int *' and '__constant int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  private int *var_priv;
+  var_gen = 0 ? var_cond : var_priv;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and 'int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  var_gen = 0 ? var_cond : var_gen;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__generic int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  void *var_void_gen;
+  global char *var_glob_ch;
+  var_void_gen = 0 ? var_cond : var_glob_ch;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__global char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  local char *var_loc_ch;
+  var_void_gen = 0 ? var_cond : var_loc_ch;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and '__local char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  constant void *var_void_const;
+  constant char *var_const_ch;
+  var_void_const = 0 ? var_cond : var_const_ch;
+#ifndef CONSTANT
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|generic}} int *' and '__constant char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  private char *var_priv_ch;
+  var_void_gen = 0 ? var_cond : var_priv_ch;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and 'char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  generic char *var_gen_ch;
+  var_void_gen = 0 ? var_cond : var_gen_ch;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__generic char *') which are pointers to non-overlapping address spaces}}
+#endif
+}
+
diff --git a/test/SemaOpenCL/as_type.cl b/test/SemaOpenCL/as_type.cl
new file mode 100644
index 0000000..f0bf4d7
--- /dev/null
+++ b/test/SemaOpenCL/as_type.cl
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple spir-unknown-unknown -o - -verify -fsyntax-only
+
+typedef __attribute__(( ext_vector_type(3) )) char char3;
+typedef __attribute__(( ext_vector_type(16) )) char char16;
+
+char3 f1(char16 x) {
+  return  __builtin_astype(x, char3); // expected-error{{invalid reinterpretation: sizes of 'char3' (vector of 3 'char' values) and 'char16' (vector of 16 'char' values) must match}}
+}
+
+char16 f3(int x) {
+  return __builtin_astype(x, char16); // expected-error{{invalid reinterpretation: sizes of 'char16' (vector of 16 'char' values) and 'int' must match}}
+}
+
diff --git a/test/SemaOpenCL/builtin.cl b/test/SemaOpenCL/builtin.cl
new file mode 100644
index 0000000..d48a0c4
--- /dev/null
+++ b/test/SemaOpenCL/builtin.cl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
+
+// expected-no-diagnostics
+
+float __attribute__((overloadable)) acos(float);
+
+typedef float float4 __attribute__((ext_vector_type(4)));
+int printf(__constant const char* st, ...);
+
+void test(void)
+{
+  float4 a;
+  printf("%8.4v4hlf\n", a);
+}
diff --git a/test/SemaOpenCL/cl20-device-side-enqueue.cl b/test/SemaOpenCL/cl20-device-side-enqueue.cl
new file mode 100644
index 0000000..298b810
--- /dev/null
+++ b/test/SemaOpenCL/cl20-device-side-enqueue.cl
@@ -0,0 +1,172 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -verify -pedantic -fsyntax-only -Wconversion -DWCONV
+
+// Diagnostic tests for different overloads of enqueue_kernel from Table 6.13.17.1 of OpenCL 2.0 Spec.
+kernel void enqueue_kernel_tests() {
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  clk_event_t evt;
+  clk_event_t event_wait_list;
+  clk_event_t event_wait_list2[] = {evt, evt};
+  void *vptr;
+
+  // Testing the first overload type
+  enqueue_kernel(default_queue, flags, ndrange, ^(void) {
+    return 0;
+  });
+
+  enqueue_kernel(vptr, flags, ndrange, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'queue_t' argument type}}
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, vptr, ndrange, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'kernel_enqueue_flags_t' (i.e. uint) argument type}}
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, flags, vptr, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'ndrange_t' argument type}}
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, flags, ndrange, vptr); // expected-error{{illegal call to enqueue_kernel, expected block argument}}
+
+  enqueue_kernel(default_queue, flags, ndrange, ^(int i) { // expected-error{{blocks in this form of device side enqueue call are expected to have have no parameters}}
+    return 0;
+  });
+
+  // Testing the second overload type
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, ^(void) {
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, vptr, &evt, ^(void) // expected-error{{illegal call to enqueue_kernel, expected 'clk_event_t *' argument type}}
+                                                               {
+                                                                 return 0;
+                                                               });
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, vptr, ^(void) // expected-error{{illegal call to enqueue_kernel, expected 'clk_event_t *' argument type}}
+                                                                           {
+                                                                             return 0;
+                                                                           });
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, vptr); // expected-error{{illegal call to enqueue_kernel, expected block argument}}
+
+  // Testing the third overload type
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024);
+
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024L); // expected-error{{local memory sizes need to be specified as uint}}
+
+  char c;
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 c, 1024);
+#ifdef WCONV
+// expected-warning@-2{{implicit conversion changes signedness: 'char' to 'unsigned int'}}
+#endif
+
+  typedef void (^bl_A_t)(local void *);
+
+  const bl_A_t block_A = (bl_A_t) ^ (local void *a) {};
+
+  enqueue_kernel(default_queue, flags, ndrange, block_A, 1024);
+
+  typedef void (^bl_B_t)(local void *, local int *);
+
+  const bl_B_t block_B = (bl_B_t) ^ (local void *a, local int *b) {};
+
+  enqueue_kernel(default_queue, flags, ndrange, block_B, 1024, 1024); // expected-error{{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+
+  enqueue_kernel(default_queue, flags, ndrange, // expected-error{{mismatch in number of block parameters and local size arguments passed}}
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024);
+
+  float illegal_mem_size = (float)0.5f;
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 illegal_mem_size, illegal_mem_size); // expected-error{{local memory sizes need to be specified as uint}} expected-error{{local memory sizes need to be specified as uint}}
+#ifdef WCONV
+// expected-warning@-2{{implicit conversion turns floating-point number into integer: 'float' to 'unsigned int'}} expected-warning@-2{{implicit conversion turns floating-point number into integer: 'float' to 'unsigned int'}}
+#endif
+
+  // Testing the fourth overload type
+  enqueue_kernel(default_queue, flags, ndrange, 1, event_wait_list2, &evt,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024);
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, // expected-error{{mismatch in number of block parameters and local size arguments passed}}
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024, 1024);
+
+  // More random misc cases that can't be deduced
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt); // expected-error{{illegal call to enqueue_kernel, incorrect argument types}}
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, 1); // expected-error{{illegal call to enqueue_kernel, incorrect argument types}}
+}
+
+// Diagnostic tests for get_kernel_work_group_size and allowed block parameter types in dynamic parallelism.
+kernel void work_group_size_tests() {
+  void (^const block_A)(void) = ^{
+    return;
+  };
+  void (^const block_B)(int) = ^(int a) {
+    return;
+  };
+  void (^const block_C)(local void *) = ^(local void *a) {
+    return;
+  };
+  void (^const block_D)(local int *) = ^(local int *a) {
+    return;
+  };
+
+  unsigned size = get_kernel_work_group_size(block_A);
+  size = get_kernel_work_group_size(block_C);
+  size = get_kernel_work_group_size(^(local void *a) {
+    return;
+  });
+  size = get_kernel_work_group_size(^(local int *a) { // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_work_group_size(block_B);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_work_group_size(block_D);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_work_group_size(^(int a) {  // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_work_group_size();          // expected-error {{too few arguments to function call, expected 1, have 0}}
+  size = get_kernel_work_group_size(1);         // expected-error{{expected block argument}}
+  size = get_kernel_work_group_size(block_A, 1); // expected-error{{too many arguments to function call, expected 1, have 2}}
+
+  size = get_kernel_preferred_work_group_size_multiple(block_A);
+  size = get_kernel_preferred_work_group_size_multiple(block_C);
+  size = get_kernel_preferred_work_group_size_multiple(^(local void *a) {
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(^(local int *a) { // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(^(int a) {  // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(block_B);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_preferred_work_group_size_multiple(block_D);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_preferred_work_group_size_multiple();          // expected-error {{too few arguments to function call, expected 1, have 0}}
+  size = get_kernel_preferred_work_group_size_multiple(1);         // expected-error{{expected block argument}}
+  size = get_kernel_preferred_work_group_size_multiple(block_A, 1); // expected-error{{too many arguments to function call, expected 1, have 2}}
+}
diff --git a/test/SemaOpenCL/clang-builtin-version.cl b/test/SemaOpenCL/clang-builtin-version.cl
new file mode 100644
index 0000000..8574682
--- /dev/null
+++ b/test/SemaOpenCL/clang-builtin-version.cl
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 %s -fblocks -verify -pedantic -fsyntax-only -ferror-limit 100
+
+// Confirm CL2.0 Clang builtins are not available in earlier versions
+
+kernel void dse_builtins() {
+  int tmp;
+  enqueue_kernel(tmp, tmp, tmp, ^(void) { // expected-warning{{implicit declaration of function 'enqueue_kernel' is invalid in C99}}
+    return;
+  });
+  unsigned size = get_kernel_work_group_size(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_work_group_size' is invalid in C99}}
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_preferred_work_group_size_multiple' is invalid in C99}}
+    return;
+  });
+}
+
+void pipe_builtins() {
+  int tmp;
+
+  read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'read_pipe' is invalid in C99}}
+  write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'write_pipe' is invalid in C99}}
+
+  reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'reserve_read_pipe' is invalid in C99}}
+  reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'reserve_write_pipe' is invalid in C99}}
+
+  work_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_reserve_read_pipe' is invalid in C99}}
+  work_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_reserve_write_pipe' is invalid in C99}}
+
+  sub_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_reserve_write_pipe' is invalid in C99}}
+  sub_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_reserve_read_pipe' is invalid in C99}}
+
+  commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'commit_read_pipe' is invalid in C99}}
+  commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'commit_write_pipe' is invalid in C99}}
+
+  work_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_commit_read_pipe' is invalid in C99}}
+  work_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_commit_write_pipe' is invalid in C99}}
+
+  sub_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_commit_write_pipe' is invalid in C99}}
+  sub_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_commit_read_pipe' is invalid in C99}}
+
+  get_pipe_num_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_num_packets' is invalid in C99}}
+  get_pipe_max_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_max_packets' is invalid in C99}}
+}
diff --git a/test/SemaOpenCL/event_t.cl b/test/SemaOpenCL/event_t.cl
index e098839..990c063 100644
--- a/test/SemaOpenCL/event_t.cl
+++ b/test/SemaOpenCL/event_t.cl
@@ -3,7 +3,7 @@
 event_t glb_evt; // expected-error {{the event_t type cannot be used to declare a program scope variable}}
 
 constant struct evt_s {
-  event_t evt;  // expected-error {{the event_t type cannot be used to declare a structure or union field}}
+  event_t evt; // expected-error {{the 'event_t' type cannot be used to declare a structure or union field}}
 } evt_str = {0};
 
 void foo(event_t evt); // expected-note {{passing argument to parameter 'evt' here}}
@@ -14,5 +14,6 @@
   foo(e);
   foo(0);
   foo(5); // expected-error {{passing 'int' to parameter of incompatible type 'event_t'}}
+  foo((event_t)1); // expected-error {{cannot cast non-zero value '1' to 'event_t'}}
 }
 
diff --git a/test/SemaOpenCL/extension-fp64-cl1.1.cl b/test/SemaOpenCL/extension-fp64-cl1.1.cl
deleted file mode 100644
index 7e852ae..0000000
--- a/test/SemaOpenCL/extension-fp64-cl1.1.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL1.1
-
-void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  (void) 1.0; // expected-warning {{double precision constant requires cl_khr_fp64}}
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-}
diff --git a/test/SemaOpenCL/extension-fp64.cl b/test/SemaOpenCL/extension-fp64.cl
deleted file mode 100644
index e0c2b1e..0000000
--- a/test/SemaOpenCL/extension-fp64.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
-
-void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  (void) 1.0; // expected-warning {{double precision constant requires cl_khr_fp64}}
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-}
diff --git a/test/SemaOpenCL/extension-version.cl b/test/SemaOpenCL/extension-version.cl
new file mode 100644
index 0000000..6a3cfde
--- /dev/null
+++ b/test/SemaOpenCL/extension-version.cl
@@ -0,0 +1,272 @@
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+
+#if __OPENCL_C_VERSION__ >= 200 && ! defined TEST_CORE_FEATURES
+// expected-no-diagnostics
+#endif
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifndef cl_khr_fp16
+#error "Missing cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+
+#ifndef cl_khr_int64_base_atomics
+#error "Missing cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+#ifndef cl_khr_int64_extended_atomics
+#error "Missing cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+
+#ifndef cl_khr_gl_sharing
+#error "Missing cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#if (__OPENCL_C_VERSION__ < 110)
+// Deprecated above 1.0
+#ifndef cl_khr_select_fprounding_mode
+#error "Missing cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+#endif
+
+
+// Core feature in CL 1.2
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+
+// Core feature in CL 2.0
+#ifndef cl_khr_3d_image_writes
+#error "Missing cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+#if (__OPENCL_C_VERSION__ >= 200) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_3d_image_writes' is core feature or supported optional core feature - ignoring}}
+#endif
+
+
+
+#if (__OPENCL_C_VERSION__ >= 110)
+#ifndef cl_khr_gl_event
+#error "Missing cl_khr_gl_event define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+
+#if (__OPENCL_C_VERSION__ >= 110)
+#ifndef cl_khr_d3d10_sharing
+#error "Missing cl_khr_d3d10_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_context_abort
+#error "Missing cl_khr_context_abort define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_d3d11_sharing
+#error "Missing cl_khr_d3d11_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_dx9_media_sharing
+#error "Missing cl_khr_dx9_media_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_image2d_from_buffer
+#error "Missing cl_khr_image2d_from_buffer define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_initialize_memory
+#error "Missing cl_khr_initialize_memory define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_gl_depth_images
+#error "Missing cl_khr_gl_depth_images define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_gl_msaa_sharing
+#error "Missing cl_khr_gl_msaa_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_spir
+#error "Missing cl_khr_spir define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_egl_event
+#error "Missing cl_khr_egl_event define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_egl_image
+#error "Missing cl_khr_egl_image define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_mipmap_image
+#error "Missing cl_khr_mipmap_image define"
+#endif
+#else
+#ifdef cl_khr_mipmap_image
+#error "Incorrect cl_khr_mipmap_image define"
+#endif
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_mipmap_image' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_srgb_image_writes
+#error "Missing cl_khr_srgb_image_writes define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_subgroups
+#error "Missing cl_khr_subgroups define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_terminate_context
+#error "Missing cl_khr_terminate_context define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+
+#ifndef cl_amd_media_ops
+#error "Missing cl_amd_media_ops define"
+#endif
+#pragma OPENCL EXTENSION cl_amd_media_ops: enable
+
+#ifndef cl_amd_media_ops2
+#error "Missing cl_amd_media_ops2 define"
+#endif
+#pragma OPENCL EXTENSION cl_amd_media_ops2: enable
+
diff --git a/test/SemaOpenCL/extensions.cl b/test/SemaOpenCL/extensions.cl
new file mode 100644
index 0000000..31224e0
--- /dev/null
+++ b/test/SemaOpenCL/extensions.cl
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only -cl-std=CL1.1
+
+// Test with a target not supporting fp64.
+// RUN: %clang_cc1 %s -triple r600-unknown-unknown -target-cpu r600 -verify -pedantic -fsyntax-only -DNOFP64
+
+void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 extension}}
+  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
+  (void) 1.0; // expected-warning {{double precision constant requires cl_khr_fp64}}
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#ifdef NOFP64
+// expected-warning@-2{{unsupported OpenCL extension 'cl_khr_fp64' - ignoring}}
+#endif
+
+void f2(void) {
+  double d;
+#ifdef NOFP64
+// expected-error@-2{{use of type 'double' requires cl_khr_fp64 extension to be enabled}}
+#endif
+
+  (void) 1.0;
+#ifdef NOFP64
+// expected-warning@-2{{double precision constant requires cl_khr_fp64, casting to single precision}}
+#endif
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+#ifdef NOFP64
+// expected-warning@-2{{unsupported OpenCL extension 'cl_khr_fp64' - ignoring}}
+#endif
+
+void f3(void) {
+  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
+}
diff --git a/test/SemaOpenCL/extern.cl b/test/SemaOpenCL/extern.cl
index b2e4857..5f1f9f8 100644
--- a/test/SemaOpenCL/extern.cl
+++ b/test/SemaOpenCL/extern.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -emit-llvm -ffake-address-space-map %s -o - -verify | FileCheck %s
+// RUN: %clang_cc1 -x cl -cl-opt-disable -cl-std=CL1.2 -emit-llvm -ffake-address-space-map %s -o - -verify | FileCheck %s
 // expected-no-diagnostics
 
 // CHECK: @foo = external addrspace(3) constant float
diff --git a/test/SemaOpenCL/half.cl b/test/SemaOpenCL/half.cl
index 11abf64..dd7bb9a 100644
--- a/test/SemaOpenCL/half.cl
+++ b/test/SemaOpenCL/half.cl
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -Wno-unused-value
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -Wno-unused-value -triple spir-unknown-unknown
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
+constant float f = 1.0h; // expected-error{{half precision constant requires cl_khr_fp16}}
 
 half half_disabled(half *p, // expected-error{{declaring function return value of type 'half' is not allowed}}
                    half h)  // expected-error{{declaring function parameter of type 'half' is not allowed}}
@@ -12,6 +13,8 @@
 
   float c = 1.0f;
   b = (half) c;  // expected-error{{casting to type 'half' is not allowed}}
+  c = (float) 1.0h;  // expected-error{{half precision constant requires cl_khr_fp16}}
+  b = 1.0h; // expected-error{{half precision constant requires cl_khr_fp16}}
 
   half *allowed = &p[1];
   half *allowed2 = &*p;
@@ -22,6 +25,7 @@
 
 // Exactly the same as above but with the cl_khr_fp16 extension enabled.
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+constant half a = 1.0h;
 half half_enabled(half *p, half h)
 {
   half a[2];
@@ -31,6 +35,8 @@
 
   float c = 1.0f;
   b = (half) c;
+  c = (float) 1.0h;
+  b = 1.0h;
 
   half *allowed = &p[1];
   half *allowed2 = &*p;
diff --git a/test/SemaOpenCL/images.cl b/test/SemaOpenCL/images.cl
new file mode 100644
index 0000000..f963de4
--- /dev/null
+++ b/test/SemaOpenCL/images.cl
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
+
+void img2d_ro(__read_only image2d_t img) {} // expected-note{{passing argument to parameter 'img' here}} expected-note{{passing argument to parameter 'img' here}}
+
+void imgage_access_test(image2d_t img2dro, write_only image2d_t img2dwo, image3d_t img3dro) {
+  img2d_ro(img2dro);
+  img2d_ro(img2dwo); // expected-error{{passing '__write_only image2d_t' to parameter of incompatible type '__read_only image2d_t'}}
+  img2d_ro(img3dro); // expected-error{{passing '__read_only image3d_t' to parameter of incompatible type '__read_only image2d_t'}}
+}
diff --git a/test/SemaOpenCL/invalid-block.cl b/test/SemaOpenCL/invalid-block.cl
new file mode 100644
index 0000000..6721d0e
--- /dev/null
+++ b/test/SemaOpenCL/invalid-block.cl
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -verify -fblocks -cl-std=CL2.0 %s
+
+// OpenCL v2.0 s6.12.5
+void f0(int (^const bl)());
+// All block declarations must be const qualified and initialized.
+void f1() {
+  int (^bl1)() = ^() {return 1;};
+  int (^const bl2)() = ^(){return 1;};
+  f0(bl1);
+  f0(bl2);
+  bl1 = bl2; // expected-error{{invalid operands to binary expression ('int (^const)()' and 'int (^const)()')}}
+  int (^const bl3)(); // expected-error{{invalid block variable declaration - must be initialized}}
+}
+
+// A block with extern storage class is not allowed.
+extern int (^bl)() = ^(){return 1;}; // expected-error{{invalid block variable declaration - using 'extern' storage class is disallowed}}
+void f2() {
+  extern int (^bl)() = ^(){return 1;}; // expected-error{{invalid block variable declaration - using 'extern' storage class is disallowed}}
+}
+
+// A block cannot be the return value of a function.
+typedef int (^bl_t)(void);
+bl_t f3(bl_t bl); // expected-error{{declaring function return value of type 'bl_t' (aka 'int (^const)(void)') is not allowed}}
+
+struct bl_s {
+  int (^bl)(void); // expected-error {{the 'int (^const)(void)' type cannot be used to declare a structure or union field}}
+};
+
+void f4() {
+  __block int a = 10; // expected-error {{the __block storage type is not permitted}}
+}
+
+// A block with variadic arguments is not allowed.
+int (^bl)(int, ...) = ^int(int I, ...) { // expected-error {{invalid block prototype, variadic arguments are not allowed in OpenCL}}
+  return 0;
+};
+
+// A block can't be used to declare an array
+typedef int (^bl1_t)(int);
+void f5(int i) {
+  bl1_t bl1 = ^(int i) {return 1;};
+  bl1_t bl2 = ^(int i) {return 2;};
+  bl1_t arr[] = {bl1, bl2}; // expected-error {{array of 'bl1_t' (aka 'int (^const)(int)') type is invalid in OpenCL}}
+  int tmp = i ? bl1(i)      // expected-error {{block type cannot be used as expression in ternary expression in OpenCL}}
+              : bl2(i);     // expected-error {{block type cannot be used as expression in ternary expression in OpenCL}}
+}
+// A block pointer type and all pointer operations are disallowed
+void f6(bl1_t * bl_ptr) { // expected-error{{pointer to type '__generic bl1_t' (aka 'int (^const __generic)(int)') is invalid in OpenCL}}
+  bl1_t bl = ^(int i) {return 1;};
+  bl1_t *p; // expected-error {{pointer to type '__generic bl1_t' (aka 'int (^const __generic)(int)') is invalid in OpenCL}}
+  *bl;  // expected-error {{invalid argument type 'bl1_t' (aka 'int (^const)(int)') to unary expression}}
+  &bl; // expected-error {{invalid argument type 'bl1_t' (aka 'int (^const)(int)') to unary expression}}
+}
diff --git a/test/SemaOpenCL/invalid-image.cl b/test/SemaOpenCL/invalid-image.cl
new file mode 100644
index 0000000..d15746f
--- /dev/null
+++ b/test/SemaOpenCL/invalid-image.cl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -verify %s
+
+void test1(image1d_t *i) {} // expected-error{{pointer to type '__read_only image1d_t' is invalid in OpenCL}}
+
+void test2(image1d_t i) {
+  image1d_t ti;            // expected-error{{type '__read_only image1d_t' can only be used as a function parameter}}
+  image1d_t ai[] = {i, i}; // expected-error{{array of '__read_only image1d_t' type is invalid in OpenCL}}
+  i=i; // expected-error{{invalid operands to binary expression ('__read_only image1d_t' and '__read_only image1d_t')}}
+  i+1; // expected-error{{invalid operands to binary expression ('__read_only image1d_t' and 'int')}}
+  &i; // expected-error{{invalid argument type '__read_only image1d_t' to unary expression}}
+  *i; // expected-error{{invalid argument type '__read_only image1d_t' to unary expression}}
+}
+
+image1d_t test3() {} // expected-error{{declaring function return value of type '__read_only image1d_t' is not allowed}}
diff --git a/test/SemaOpenCL/invalid-kernel-attrs.cl b/test/SemaOpenCL/invalid-kernel-attrs.cl
index 4b4fdf7..cedbb06 100644
--- a/test/SemaOpenCL/invalid-kernel-attrs.cl
+++ b/test/SemaOpenCL/invalid-kernel-attrs.cl
@@ -28,8 +28,6 @@
 
 void f_kernel_image2d_t( kernel image2d_t image ) { // expected-error {{'kernel' attribute only applies to functions}}
   int __kernel x; // expected-error {{'__kernel' attribute only applies to functions}}
-  read_only int i; // expected-error {{'read_only' attribute only applies to parameters}}
-  __write_only int j; // expected-error {{'__write_only' attribute only applies to parameters}}
 }
 
 kernel __attribute__((reqd_work_group_size(1,2,0))) void kernel11(){} // expected-error {{'reqd_work_group_size' attribute must be greater than 0}}
diff --git a/test/SemaOpenCL/invalid-kernel-parameters.cl b/test/SemaOpenCL/invalid-kernel-parameters.cl
index de32eae..e2e48e8 100644
--- a/test/SemaOpenCL/invalid-kernel-parameters.cl
+++ b/test/SemaOpenCL/invalid-kernel-parameters.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple spir-unknown-unknown
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
@@ -24,7 +24,10 @@
 
 typedef struct FooImage2D // expected-note{{within field of type 'FooImage2D' declared here}}
 {
-  image2d_t imageField; // expected-note{{field of illegal type 'image2d_t' declared here}}
+  // TODO: Clean-up needed - checking for image, event, etc. as a note here is
+  // no longer really necessary, because they are diagnosed as an error for
+  // all struct fields (OpenCL v1.2 s6.9b,r).
+  image2d_t imageField; // expected-note{{field of illegal type '__read_only image2d_t' declared here}} expected-error{{the '__read_only image2d_t' type cannot be used to declare a structure or union field}}
 } FooImage2D;
 
 kernel void image_in_struct_arg(FooImage2D arg) { } // expected-error{{struct kernel parameters may not contain pointers}}
diff --git a/test/SemaOpenCL/invalid-logical-ops-1.2.cl b/test/SemaOpenCL/invalid-logical-ops-1.2.cl
index 7ba1adb..bee5239 100644
--- a/test/SemaOpenCL/invalid-logical-ops-1.2.cl
+++ b/test/SemaOpenCL/invalid-logical-ops-1.2.cl
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 %s -verify -cl-std=CL1.2 -triple x86_64-unknown-linux-gnu
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
 typedef __attribute__((ext_vector_type(4))) float float4;
 typedef __attribute__((ext_vector_type(4))) double double4;
 typedef __attribute__((ext_vector_type(4))) int int4;
diff --git a/test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl b/test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl
new file mode 100644
index 0000000..386c6b6
--- /dev/null
+++ b/test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
+
+void test1(read_only pipe int p, global int* ptr){
+  int tmp;
+  reserve_id_t rid;
+
+  // read/write_pipe
+  read_pipe(p, &tmp);
+  read_pipe(p, ptr);
+  read_pipe(tmp, p);    // expected-error {{first argument to 'read_pipe' must be a pipe type}}
+  read_pipe(p);   // expected-error {{invalid number of arguments to function: 'read_pipe'}}
+  read_pipe(p, rid, tmp, ptr);
+  read_pipe(p, tmp, tmp, ptr);   // expected-error {{invalid argument type to function 'read_pipe' (expecting 'reserve_id_t' having 'int')}}
+  read_pipe(p, rid, rid, ptr);   // expected-error {{invalid argument type to function 'read_pipe' (expecting 'unsigned int' having 'reserve_id_t')}}
+  read_pipe(p, tmp);   // expected-error {{invalid argument type to function 'read_pipe' (expecting 'int *' having 'int')}}
+  write_pipe(p, ptr);    // expected-error {{invalid pipe access modifier (expecting write_only)}}
+  write_pipe(p, rid, tmp, ptr);    // expected-error {{invalid pipe access modifier (expecting write_only)}}
+
+  // reserve_read/write_pipe
+  reserve_read_pipe(p, tmp);
+  reserve_read_pipe(p, ptr);    // expected-error{{invalid argument type to function 'reserve_read_pipe' (expecting 'unsigned int' having '__global int *')}}
+  work_group_reserve_read_pipe(tmp, tmp);    // expected-error{{first argument to 'work_group_reserve_read_pipe' must be a pipe type}}
+  sub_group_reserve_write_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting write_only)}}
+
+  // commit_read/write_pipe
+  commit_read_pipe(p, rid);
+  commit_read_pipe(tmp, rid);    // expected-error{{first argument to 'commit_read_pipe' must be a pipe type}}
+  work_group_commit_read_pipe(p, tmp);    // expected-error{{invalid argument type to function 'work_group_commit_read_pipe' (expecting 'reserve_id_t' having 'int')}}
+  sub_group_commit_write_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting write_only)}}
+}
+
+void test2(write_only pipe int p, global int* ptr){
+  int tmp;
+  reserve_id_t rid;
+
+  // read/write_pipe
+  write_pipe(p, &tmp);
+  write_pipe(p, ptr);
+  write_pipe(tmp, p);    // expected-error {{first argument to 'write_pipe' must be a pipe type}}
+  write_pipe(p);   // expected-error {{invalid number of arguments to function: 'write_pipe'}}
+  write_pipe(p, rid, tmp, ptr);
+  write_pipe(p, tmp, tmp, ptr);   // expected-error {{invalid argument type to function 'write_pipe' (expecting 'reserve_id_t' having 'int')}}
+  write_pipe(p, rid, rid, ptr);   // expected-error {{invalid argument type to function 'write_pipe' (expecting 'unsigned int' having 'reserve_id_t')}}
+  write_pipe(p, tmp);   // expected-error {{invalid argument type to function 'write_pipe' (expecting 'int *' having 'int')}}
+  read_pipe(p, ptr);    // expected-error {{invalid pipe access modifier (expecting read_only)}}
+  read_pipe(p, rid, tmp, ptr);    // expected-error {{invalid pipe access modifier (expecting read_only)}}
+
+  // reserve_read/write_pipe
+  reserve_write_pipe(p, tmp);
+  reserve_write_pipe(p, ptr);    // expected-error{{invalid argument type to function 'reserve_write_pipe' (expecting 'unsigned int' having '__global int *')}}
+  work_group_reserve_write_pipe(tmp, tmp);    // expected-error{{first argument to 'work_group_reserve_write_pipe' must be a pipe type}}
+  sub_group_reserve_read_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting read_only)}}
+
+  // commit_read/write_pipe
+  commit_write_pipe(p, rid);
+  commit_write_pipe(tmp, rid);    // expected-error{{first argument to 'commit_write_pipe' must be a pipe type}}
+  work_group_commit_write_pipe(p, tmp);    // expected-error{{invalid argument type to function 'work_group_commit_write_pipe' (expecting 'reserve_id_t' having 'int')}}
+  sub_group_commit_read_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting read_only)}}
+}
+
+void test3(){
+  int tmp;
+  get_pipe_num_packets(tmp);    // expected-error {{first argument to 'get_pipe_num_packets' must be a pipe type}}
+  get_pipe_max_packets(tmp);    // expected-error {{first argument to 'get_pipe_max_packets' must be a pipe type}}
+}
diff --git a/test/SemaOpenCL/invalid-pipes-cl2.0.cl b/test/SemaOpenCL/invalid-pipes-cl2.0.cl
index ee36892..1993df5 100644
--- a/test/SemaOpenCL/invalid-pipes-cl2.0.cl
+++ b/test/SemaOpenCL/invalid-pipes-cl2.0.cl
@@ -1,8 +1,22 @@
 // RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
 
-void test1(pipe int *p){// expected-error {{pipes packet types cannot be of reference type}}
+void test1(pipe int *p) {// expected-error {{pipes packet types cannot be of reference type}}
 }
-void test2(pipe p){// expected-error {{missing actual type specifier for pipe}}
+void test2(pipe p) {// expected-error {{missing actual type specifier for pipe}}
 }
-void test3(int pipe p){// expected-error {{cannot combine with previous 'int' declaration specifier}}
+void test3(int pipe p) {// expected-error {{cannot combine with previous 'int' declaration specifier}}
 }
+void test4() {
+  pipe int p; // expected-error {{type 'pipe int' can only be used as a function parameter}}
+  // TODO: fix parsing of the declaration 'pipe int (*p);'.
+}
+
+void test5(pipe int p) {
+  p+p; // expected-error{{invalid operands to binary expression ('pipe int' and 'pipe int')}}
+  p=p; // expected-error{{invalid operands to binary expression ('pipe int' and 'pipe int')}}
+  &p; // expected-error{{invalid argument type 'pipe int' to unary expression}}
+  *p; // expected-error{{invalid argument type 'pipe int' to unary expression}}
+}
+
+typedef pipe int pipe_int_t;
+pipe_int_t test6() {} // expected-error{{declaring function return value of type 'pipe_int_t' (aka 'pipe int') is not allowed}}
diff --git a/test/SemaOpenCL/nosvm.cl b/test/SemaOpenCL/nosvm.cl
new file mode 100644
index 0000000..658cb3a
--- /dev/null
+++ b/test/SemaOpenCL/nosvm.cl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify %s
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -D CL20 %s
+// RUN: %clang_cc1 -verify -x c -D NOCL %s
+
+#ifndef NOCL
+kernel void f(__attribute__((nosvm)) global int* a);
+#ifndef CL20
+// expected-error@-2 {{'nosvm' attribute requires OpenCL version 2.0}}
+#else
+// expected-warning@-4 {{'nosvm' attribute is deprecated and ignored in OpenCL version 2.0}}
+#endif
+
+__attribute__((nosvm)) void g(); // expected-warning {{'nosvm' attribute only applies to variables}}
+
+#else
+void f(__attribute__((nosvm)) int* a); // expected-warning {{'nosvm' attribute ignored}}
+#endif
diff --git a/test/SemaOpenCL/optional-core-fp64-cl1.2.cl b/test/SemaOpenCL/optional-core-fp64-cl1.2.cl
deleted file mode 100644
index e0f7f1d..0000000
--- a/test/SemaOpenCL/optional-core-fp64-cl1.2.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL1.2
-// expected-no-diagnostics
-
-void f1(double da) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d;
-}
diff --git a/test/SemaOpenCL/optional-core-fp64-cl2.0.cl b/test/SemaOpenCL/optional-core-fp64-cl2.0.cl
deleted file mode 100644
index 832529d..0000000
--- a/test/SemaOpenCL/optional-core-fp64-cl2.0.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
-// expected-no-diagnostics
-
-void f1(double da) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d;
-}
diff --git a/test/SemaOpenCL/sampler_t.cl b/test/SemaOpenCL/sampler_t.cl
index 96f6dbf..c87b6da 100644
--- a/test/SemaOpenCL/sampler_t.cl
+++ b/test/SemaOpenCL/sampler_t.cl
@@ -1,13 +1,85 @@
 // RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -DCHECK_SAMPLER_VALUE -Wspir-compat -triple amdgcn--amdhsa
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -DCHECK_SAMPLER_VALUE -triple spir-unknown-unknown
 
-constant sampler_t glb_smp = 5;
+#define CLK_ADDRESS_CLAMP_TO_EDGE       2
+#define CLK_NORMALIZED_COORDS_TRUE      1
+#define CLK_FILTER_NEAREST              0x10
+#define CLK_FILTER_LINEAR               0x20
 
-void foo(sampler_t); 
+constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
+constant sampler_t glb_smp2; // expected-error{{variable in constant address space must be initialized}}
+global sampler_t glb_smp3 = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_NEAREST; // expected-error{{sampler type cannot be used with the __local and __global address space qualifiers}}
+
+constant sampler_t glb_smp4 = 0;
+#ifdef CHECK_SAMPLER_VALUE
+// expected-warning@-2{{sampler initializer has invalid Filter Mode bits}}
+#endif
+
+constant sampler_t glb_smp5 = 0x1f;
+#ifdef CHECK_SAMPLER_VALUE
+// expected-warning@-2{{sampler initializer has invalid Addressing Mode bits}}
+#endif
+
+constant sampler_t glb_smp6 = glb_smp; // expected-error{{initializer element is not a compile-time constant}}
+
+int f(void);
+constant sampler_t glb_smp7 = f(); // expected-error{{initializer element is not a compile-time constant}}
+
+constant sampler_t glb_smp8 = 1.0f; // expected-error{{initializing '__constant sampler_t' with an expression of incompatible type 'float'}}
+
+constant sampler_t glb_smp9 = 0x100000000LL; // expected-error{{sampler_t initialization requires 32-bit integer, not 'long long'}}
+
+void foo(sampler_t);
+
+constant struct sampler_s {
+  sampler_t smp; // expected-error{{the 'sampler_t' type cannot be used to declare a structure or union field}}
+} sampler_str = {0};
+
+sampler_t bad(void); //expected-error{{declaring function return value of type 'sampler_t' is not allowed}}
 
 void kernel ker(sampler_t argsmp) {
-  local sampler_t smp; // expected-error {{sampler type cannot be used with the __local and __global address space qualifiers}}
-  const sampler_t const_smp = 7;
+  local sampler_t smp; // expected-error{{sampler type cannot be used with the __local and __global address space qualifiers}}
+  const sampler_t const_smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
+  const sampler_t const_smp2;
+  const sampler_t const_smp3 = const_smp;
+  const sampler_t const_smp4 = f();
+  const sampler_t const_smp5 = 1.0f; // expected-error{{initializing 'const sampler_t' with an expression of incompatible type 'float'}}
+  const sampler_t const_smp6 = 0x100000000LL; // expected-error{{sampler_t initialization requires 32-bit integer, not 'long long'}}
+
   foo(glb_smp);
+  foo(glb_smp2);
+  foo(glb_smp3);
+  foo(glb_smp4);
+  foo(glb_smp5);
+  foo(glb_smp6);
+  foo(glb_smp7);
+  foo(glb_smp8);
+  foo(glb_smp9);
+  foo(smp);
+  foo(sampler_str.smp);
   foo(const_smp);
-  foo(5); // expected-error {{sampler_t variable required - got 'int'}}
+  foo(const_smp2);
+  foo(const_smp3);
+  foo(const_smp4);
+  foo(const_smp5);
+  foo(const_smp6);
+  foo(argsmp);
+  foo(5); // expected-error{{sampler_t variable required - got 'int'}}
+  sampler_t sa[] = {argsmp, const_smp}; // expected-error {{array of 'sampler_t' type is invalid in OpenCL}}
+  foo(sa[0]);
+  foo(bad());
 }
+
+void bad(sampler_t*); // expected-error{{pointer to type 'sampler_t' is invalid in OpenCL}}
+
+void bar() {
+  sampler_t smp1 = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
+  sampler_t smp2 = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_NEAREST;
+  smp1=smp2; //expected-error{{invalid operands to binary expression ('sampler_t' and 'sampler_t')}}
+  smp1+1; //expected-error{{invalid operands to binary expression ('sampler_t' and 'int')}}
+  &smp1; //expected-error{{invalid argument type 'sampler_t' to unary expression}}
+  *smp2; //expected-error{{invalid argument type 'sampler_t' to unary expression}}
+  foo(smp1+1); //expected-error{{invalid operands to binary expression ('sampler_t' and 'int')}}
+}
+
diff --git a/test/SemaOpenCL/storageclass-cl20.cl b/test/SemaOpenCL/storageclass-cl20.cl
index c8e7faa..1eba64b 100644
--- a/test/SemaOpenCL/storageclass-cl20.cl
+++ b/test/SemaOpenCL/storageclass-cl20.cl
@@ -1,15 +1,19 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -DCL20 -cl-std=CL2.0
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
 
 static constant int G1 = 0;
 int G2 = 0;
 global int G3 = 0;
-local int G4 = 0;// expected-error{{program scope variable must reside in global or constant address space}}
+local int G4 = 0;              // expected-error{{program scope variable must reside in global or constant address space}}
 
 void kernel foo() {
   static int S1 = 5;
   static global int S2 = 5;
-  static private int S3 = 5;// expected-error{{program scope variable must reside in global or constant address space}}
+  static private int S3 = 5;   // expected-error{{static local variable must reside in global or constant address space}}
 
   constant int L1 = 0;
   local int L2;
+  global int L3; // expected-error{{function scope variable cannot be declared in global address space}}
+
+  extern global int G5;
+  extern int G6; // expected-error{{extern variable must reside in global or constant address space}}
 }
diff --git a/test/SemaOpenCL/storageclass.cl b/test/SemaOpenCL/storageclass.cl
index c7d8ab9..a93f824 100644
--- a/test/SemaOpenCL/storageclass.cl
+++ b/test/SemaOpenCL/storageclass.cl
@@ -13,7 +13,8 @@
   constant int L1 = 0;
   local int L2;
 
-  auto int L3 = 7; // expected-error{{OpenCL does not support the 'auto' storage class specifier}}
+  auto int L3 = 7; // expected-error{{OpenCL version 1.2 does not support the 'auto' storage class specifier}}
+  global int L4;   // expected-error{{function scope variable cannot be declared in global address space}}
 }
 
 static void kernel bar() { // expected-error{{kernel functions cannot be declared static}}
@@ -26,4 +27,6 @@
     constant int L1 = 0; // expected-error{{non-kernel function variable cannot be declared in constant address space}}
     local int L2;        // expected-error{{non-kernel function variable cannot be declared in local address space}}
   }
+  global int L3; // expected-error{{function scope variable cannot be declared in global address space}}
+  extern constant float L4;
 }
diff --git a/test/SemaOpenCL/to_addr_builtin.cl b/test/SemaOpenCL/to_addr_builtin.cl
new file mode 100644
index 0000000..a145626
--- /dev/null
+++ b/test/SemaOpenCL/to_addr_builtin.cl
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -verify -fsyntax-only %s
+// RUN: %clang_cc1 -verify -fsyntax-only -cl-std=CL2.0 %s
+
+void test(void) {
+  global int *glob;
+  local int *loc;
+  constant int *con;
+  typedef constant int const_int_ty;
+  const_int_ty *con_typedef;
+
+  glob = to_global(glob, loc);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{implicit declaration of function 'to_global' is invalid in C99}}
+  // expected-warning@-3{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-5{{invalid number of arguments to function: 'to_global'}}
+#endif
+
+  int x;
+  glob = to_global(x);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-4{{invalid argument x to function: 'to_global', expecting a generic pointer argument}}
+#endif
+
+  glob = to_global(con);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-4{{invalid argument con to function: 'to_global', expecting a generic pointer argument}}
+#endif
+
+  glob = to_global(con_typedef);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-4{{invalid argument con_typedef to function: 'to_global', expecting a generic pointer argument}}
+#endif
+
+  loc = to_global(glob);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__local int *' from 'int'}}
+#else
+  // expected-error@-4{{assigning '__global int *' to '__local int *' changes address space of pointer}}
+#endif
+
+  global char *glob_c = to_global(loc);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion initializing '__global char *' with an expression of type 'int'}}
+#else
+  // expected-warning@-4{{incompatible pointer types initializing '__global char *' with an expression of type '__global int *'}}
+#endif
+
+}
diff --git a/test/SemaOpenCL/unroll-hint.cl b/test/SemaOpenCL/unroll-hint.cl
new file mode 100644
index 0000000..9969866
--- /dev/null
+++ b/test/SemaOpenCL/unroll-hint.cl
@@ -0,0 +1,30 @@
+//RUN: %clang_cc1 -O0 -fsyntax-only -verify %s
+//RUN: %clang_cc1 -O0 -cl-std=CL2.0 -fsyntax-only -verify -DCL20 %s
+
+kernel void D (global int *x) {
+  int i = 10;
+#ifndef CL20
+  // expected-error@+2 {{'opencl_unroll_hint' attribute requires OpenCL version 2.0 or above}}
+#endif
+  __attribute__((opencl_unroll_hint))
+  do {
+  } while(i--);
+}
+
+#ifdef CL20
+kernel void C (global int *x) {
+  int I = 3;
+  __attribute__((opencl_unroll_hint(I))) // expected-error {{'opencl_unroll_hint' attribute requires an integer constant}}
+  while (I--);
+}
+
+kernel void E() {
+  __attribute__((opencl_unroll_hint(2,4))) // expected-error {{'opencl_unroll_hint' attribute takes no more than 1 argument}}
+  for(int i=0; i<100; i++);
+}
+
+kernel void F() {
+  __attribute__((opencl_unroll_hint(-1))) // expected-error {{'opencl_unroll_hint' attribute requires a positive integral compile time constant expression}}
+  for(int i=0; i<100; i++);
+}
+#endif
diff --git a/test/SemaOpenCL/unsupported.cl b/test/SemaOpenCL/unsupported.cl
index bb9da4b..a39a61b 100644
--- a/test/SemaOpenCL/unsupported.cl
+++ b/test/SemaOpenCL/unsupported.cl
@@ -7,3 +7,7 @@
 void no_vla(int n) {
   int a[n]; // expected-error {{variable length arrays are not supported in OpenCL}}
 }
+
+void no_logxor(int n) {
+  int logxor = n ^^ n; // expected-error {{^^ is a reserved operator in OpenCL}}
+}
diff --git a/test/SemaTemplate/alias-templates.cpp b/test/SemaTemplate/alias-templates.cpp
index 1849ff6..b707835 100644
--- a/test/SemaTemplate/alias-templates.cpp
+++ b/test/SemaTemplate/alias-templates.cpp
@@ -221,3 +221,9 @@
   template<typename ...T, typename ...U> void h(X<T...> &) {}
   template<typename ...T, typename ...U> void h(X<U...> &) {} // ok, different
 }
+
+namespace redecl {
+  template<typename> using A = int;
+  template<typename = void> using A = int;
+  A<> a; // ok
+}
diff --git a/test/SemaTemplate/array-redeclaration.cpp b/test/SemaTemplate/array-redeclaration.cpp
new file mode 100644
index 0000000..4edee70
--- /dev/null
+++ b/test/SemaTemplate/array-redeclaration.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+extern int array[1];
+
+template <typename>
+class C {
+  enum { D };
+public:
+  template <typename A> void foo1() {
+    extern int array[((int)C<A>::k > (int)D) ? 1 : -1];
+  }
+};
+
+template<>
+class C<int> {
+public:
+  const static int k = 2;
+};
+
+void foo2() {
+  C<char> c;
+  c.foo1<int>();
+}
+
+template<int n>
+void foo3() {
+  extern int array[n ? 1 : -1];
+}
+
+void foo4() {
+  foo3<5>();
+}
diff --git a/test/SemaTemplate/class-template-spec.cpp b/test/SemaTemplate/class-template-spec.cpp
index 0292c1b..86cace1 100644
--- a/test/SemaTemplate/class-template-spec.cpp
+++ b/test/SemaTemplate/class-template-spec.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 template<typename T, typename U = int> struct A; // expected-note {{template is declared here}} \
                                                  // expected-note{{explicitly specialized}}
 
@@ -75,7 +77,10 @@
 template<> struct ::A<double>;
 
 namespace N {
-  template<typename T> struct B; // expected-note 2{{explicitly specialized}}
+  template<typename T> struct B; // expected-note {{explicitly specialized}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized}}
+#endif
 
   template<> struct ::N::B<char>; // okay
   template<> struct ::N::B<short>; // okay
@@ -86,7 +91,11 @@
 
 template<> struct N::B<int> { }; // okay
 
-template<> struct N::B<float> { }; // expected-warning{{C++11 extension}}
+template<> struct N::B<float> { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'B' outside namespace 'N' is a C++11 extension}}
+#endif
+
 
 namespace M {
   template<> struct ::N::B<short> { }; // expected-error{{class template specialization of 'B' not in a namespace enclosing 'N'}}
@@ -142,13 +151,26 @@
 }
 
 namespace PR16519 {
-  template<typename T, T...N> struct integer_sequence { typedef T value_type; }; // expected-warning {{extension}}
+  template<typename T, T...N> struct integer_sequence { typedef T value_type; };
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{variadic templates are a C++11 extension}}
+#endif
 
   template<typename T> struct __make_integer_sequence;
-  template<typename T, T N> using make_integer_sequence = typename __make_integer_sequence<T>::template make<N, N % 2>::type; // expected-warning {{extension}}
+  template<typename T, T N> using make_integer_sequence = typename __make_integer_sequence<T>::template make<N, N % 2>::type;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
 
-  template<typename T, typename T::value_type ...Extra> struct __make_integer_sequence_impl; // expected-warning {{extension}}
-  template<typename T, T ...N, T ...Extra> struct __make_integer_sequence_impl<integer_sequence<T, N...>, Extra...> { // expected-warning 2{{extension}}
+  template<typename T, typename T::value_type ...Extra> struct __make_integer_sequence_impl;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{variadic templates are a C++11 extension}}
+#endif
+
+  template<typename T, T ...N, T ...Extra> struct __make_integer_sequence_impl<integer_sequence<T, N...>, Extra...> {
+#if __cplusplus <= 199711L
+  // expected-warning@-2 2 {{variadic templates are a C++11 extension}}
+#endif
     typedef integer_sequence<T, N..., sizeof...(N) + N..., Extra...> type;
   };
 
@@ -160,8 +182,15 @@
     template<T N, typename Dummy> struct make<N, 1, Dummy> : __make_integer_sequence_impl<make_integer_sequence<T, N/2>, N - 1> {};
   };
 
-  using X = make_integer_sequence<int, 5>; // expected-warning {{extension}}
-  using X = integer_sequence<int, 0, 1, 2, 3, 4>; // expected-warning {{extension}}
+  using X = make_integer_sequence<int, 5>;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
+
+  using X = integer_sequence<int, 0, 1, 2, 3, 4>;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
 }
 
 namespace DefaultArgVsPartialSpec {
diff --git a/test/SemaTemplate/cxx1z-decomposition.cpp b/test/SemaTemplate/cxx1z-decomposition.cpp
new file mode 100644
index 0000000..779c4cf
--- /dev/null
+++ b/test/SemaTemplate/cxx1z-decomposition.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+struct A { int x, y; };
+typedef int B[2];
+struct C { template<int> int get(); };
+struct D { int x, y, z; };
+struct E { int *p, n; };
+
+namespace std {
+  using size_t = decltype(sizeof(0));
+  template<typename> struct tuple_size;
+  template<size_t, typename> struct tuple_element { using type = int; };
+}
+
+template<> struct std::tuple_size<C> { enum { value = 2 }; };
+
+template<typename T> int decomp(T &t) { 
+  auto &[a, b] = t; // expected-error {{type 'D' decomposes into 3 elements, but only 2 names were provided}}
+  return a + b; // expected-error {{cannot initialize return object of type 'int' with an rvalue of type 'int *'}}
+}
+
+void test() {
+  A a;
+  B b;
+  C c;
+  D d;
+  E e;
+  decomp(a);
+  decomp(b);
+  decomp(c);
+  decomp(d); // expected-note {{in instantiation of}}
+  decomp(e); // expected-note {{in instantiation of}}
+}
diff --git a/test/SemaTemplate/cxx1z-fold-expressions.cpp b/test/SemaTemplate/cxx1z-fold-expressions.cpp
index 8bb7911..aefee92 100644
--- a/test/SemaTemplate/cxx1z-fold-expressions.cpp
+++ b/test/SemaTemplate/cxx1z-fold-expressions.cpp
@@ -25,10 +25,6 @@
 static_assert(check());
 
 template<int ...N> void empty() {
-  static_assert((N + ...) == 0);
-  static_assert((N * ...) == 1);
-  static_assert((N | ...) == 0);
-  static_assert((N & ...) == -1);
   static_assert((N || ...) == false);
   static_assert((N && ...) == true);
   (N, ...);
@@ -36,14 +32,19 @@
 template void empty<>();
 
 // An empty fold-expression isn't a null pointer just because it's an integer
-// with value 0.
+// with value 0. (This is no longer an issue since empty pack expansions don't
+// produce integers any more.)
 template<int ...N> void null_ptr() {
-  void *p = (N + ...); // expected-error {{rvalue of type 'int'}}
-  void *q = (N | ...); // expected-error {{rvalue of type 'int'}}
+  void *p = (N || ...); // expected-error {{rvalue of type 'bool'}}
+  void *q = (N , ...); // expected-error {{rvalue of type 'void'}}
 }
 template void null_ptr<>(); // expected-note {{in instantiation of}}
 
 template<int ...N> void bad_empty() {
+  (N + ...); // expected-error {{empty expansion for operator '+' with no fallback}}
+  (N * ...); // expected-error {{empty expansion for operator '*' with no fallback}}
+  (N | ...); // expected-error {{empty expansion for operator '|' with no fallback}}
+  (N & ...); // expected-error {{empty expansion for operator '&' with no fallback}}
   (N - ...); // expected-error {{empty expansion for operator '-' with no fallback}}
   (N / ...); // expected-error {{empty expansion for operator '/' with no fallback}}
   (N % ...); // expected-error {{empty expansion for operator '%' with no fallback}}
diff --git a/test/SemaTemplate/deduction.cpp b/test/SemaTemplate/deduction.cpp
index 6826774..d024c31 100644
--- a/test/SemaTemplate/deduction.cpp
+++ b/test/SemaTemplate/deduction.cpp
@@ -218,3 +218,50 @@
   template<typename T> int f(A<T>, typename A<T>::template B<T>);
   int k = f(A<int>(), 0);
 }
+
+namespace PR27601_RecursivelyInheritedBaseSpecializationsDeductionAmbiguity {
+namespace ns1 {
+
+template<class...> struct B { };
+template<class H, class ... Ts> struct B<H, Ts...> : B<> { };
+template<class ... Ts> struct D : B<Ts...> { };
+
+template<class T, class ... Ts> void f(B<T, Ts...> &) { }
+
+int main() {
+  D<int, char> d;
+  f<int>(d);
+}
+} // end ns1
+
+namespace ns2 {
+
+template <int i, typename... Es> struct tup_impl;
+
+template <int i> struct tup_impl<i> {}; // empty tail
+
+template <int i, typename Head, typename... Tail>
+struct tup_impl<i, Head, Tail...> : tup_impl<i + 1, Tail...> {
+  using value_type = Head;
+  Head head;
+};
+
+template <typename... Es> struct tup : tup_impl<0, Es...> {};
+
+template <typename Head, int i, typename... Tail>
+Head &get_helper(tup_impl<i, Head, Tail...> &t) {
+  return t.head;
+}
+
+template <typename Head, int i, typename... Tail>
+Head const &get_helper(tup_impl<i, Head, Tail...> const &t) {
+  return t.head;
+}
+
+int main() {
+  tup<int, double, char> t;
+  get_helper<double>(t);
+  return 0;
+}
+} // end ns2
+} // end PR27601_RecursivelyInheritedBaseSpecializationsDeductionAmbiguity
diff --git a/test/SemaTemplate/extern-templates.cpp b/test/SemaTemplate/extern-templates.cpp
index eca64ed..5eb9c9d 100644
--- a/test/SemaTemplate/extern-templates.cpp
+++ b/test/SemaTemplate/extern-templates.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fsyntax-only -verify %s -DMS
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu-pc-win32 -fsyntax-only -verify %s
 
 template<typename T>
 class X0 {
@@ -21,12 +22,20 @@
 
 template<typename T>
 void X0<T>::Inner::g(T t) {
-  t = 17; // expected-error{{incompatible}}
+#ifdef MS
+  t = 17; // expected-error{{assigning to 'long *' from incompatible}} expected-error{{assigning to 'int *' from incompatible}}
+#else
+  t = 17; // expected-error{{assigning to 'long *' from incompatible}}
+#endif
 }
 
 void test_intptr(X0<int*> xi, X0<int*>::Inner xii) {
   xi.f(0);
+#ifdef MS
+  xii.g(0); // expected-note {{instantiation}}
+#else
   xii.g(0);
+#endif
 }
 
 extern template class X0<long*>; 
diff --git a/test/SemaTemplate/instantiate-cast.cpp b/test/SemaTemplate/instantiate-cast.cpp
index b3babf1..32a1cfd 100644
--- a/test/SemaTemplate/instantiate-cast.cpp
+++ b/test/SemaTemplate/instantiate-cast.cpp
@@ -1,6 +1,13 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
-struct A { int x; }; // expected-note 2 {{candidate constructor}}
+struct A { int x; };
+// expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const A' for 1st argument}}
+#if __cplusplus >= 201103L
+// expected-note@-3 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'A' for 1st argument}}
+#endif
+// expected-note@-5 {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}}
 
 class Base { 
 public:
diff --git a/test/SemaTemplate/instantiate-expr-4.cpp b/test/SemaTemplate/instantiate-expr-4.cpp
index d95ccfe..9a1a1d2 100644
--- a/test/SemaTemplate/instantiate-expr-4.cpp
+++ b/test/SemaTemplate/instantiate-expr-4.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 %s
 
 // ---------------------------------------------------------------------
 // C++ Functional Casts
@@ -22,6 +24,9 @@
 template struct FunctionalCast0<5>;
 
 struct X { // expected-note 3 {{candidate constructor (the implicit copy constructor)}}
+#if __cplusplus >= 201103L
+// expected-note@-2 3 {{candidate constructor (the implicit move constructor) not viable}}
+#endif
   X(int, int); // expected-note 3 {{candidate constructor}}
 };
 
@@ -213,6 +218,10 @@
 struct InitList1 {
   void f(Val1 val1) { 
     T x = { val1 };
+#if __cplusplus >= 201103L
+    // expected-error@-2 {{type 'float' cannot be narrowed to 'int' in initializer list}}
+    // expected-note@-3 {{insert an explicit cast to silence this issue}}
+#endif
   }
 };
 
@@ -222,6 +231,9 @@
 };
 
 template struct InitList1<int[1], float>;
+#if __cplusplus >= 201103L
+// expected-note@-2 {{instantiation of member function}}
+#endif
 template struct InitList1<APair, int*>;
 
 template<typename T, typename Val1, typename Val2>
diff --git a/test/SemaTemplate/instantiate-member-class.cpp b/test/SemaTemplate/instantiate-member-class.cpp
index 3f49606..159bccb 100644
--- a/test/SemaTemplate/instantiate-member-class.cpp
+++ b/test/SemaTemplate/instantiate-member-class.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 namespace PR8965 {
   template<typename T>
@@ -106,7 +108,10 @@
 namespace AliasTagDef {
   template<typename T>
   struct F {
-    using S = struct U { // expected-warning {{C++11}}
+    using S = struct U {
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
       T g() {
         return T();
       }
@@ -122,8 +127,13 @@
   {
     struct B
     {
-      struct C { C() { int *ptr = I; } }; // expected-error{{cannot initialize a variable of type 'int *' with an rvalue of type 'int'}} \
-                                             expected-warning{{expression which evaluates to zero treated as a null pointer constant of type 'int *'}}
+      struct C { C() { int *ptr = I; } };
+#if __cplusplus >= 201103L
+      // expected-error@-2 {{cannot initialize a variable of type 'int *' with an rvalue of type 'int'}}
+#else
+      // expected-warning@-4 {{expression which evaluates to zero treated as a null pointer constant of type 'int *'}}
+#endif
+      // expected-error@-6 {{cannot initialize a variable of type 'int *' with an rvalue of type 'int'}}
     };
   };
 
diff --git a/test/SemaTemplate/member-access-expr.cpp b/test/SemaTemplate/member-access-expr.cpp
index f1aa30e..8dba2e6 100644
--- a/test/SemaTemplate/member-access-expr.cpp
+++ b/test/SemaTemplate/member-access-expr.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 template<typename T>
 void call_f0(T x) {
   x.Base::f0();
@@ -28,15 +31,25 @@
 
 template<typename TheBase, typename T>
 void call_f0_through_typedef2(T x) {
-  typedef TheBase CrazyBase; // expected-note{{current scope}}
-  x.CrazyBase::f0(); // expected-error{{ambiguous}} \
-                     // expected-error 2{{no member named}}
+  typedef TheBase CrazyBase;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{lookup from the current scope refers here}}
+#endif
+
+  x.CrazyBase::f0(); // expected-error 2{{no member named}}
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{lookup of 'CrazyBase' in member access expression is ambiguous}}
+#endif
+
 }
 
 struct OtherBase { };
 
 struct X1 : Base, OtherBase { 
-  typedef OtherBase CrazyBase; // expected-note{{object type}}
+  typedef OtherBase CrazyBase;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{lookup in the object type 'X1' refers here}}
+#endif
 };
 
 void test_f0_through_typedef2(X0 x0, X1 x1) {
diff --git a/test/SemaTemplate/ms-delayed-default-template-args.cpp b/test/SemaTemplate/ms-delayed-default-template-args.cpp
index ca9ddb0..0c05469 100644
--- a/test/SemaTemplate/ms-delayed-default-template-args.cpp
+++ b/test/SemaTemplate/ms-delayed-default-template-args.cpp
@@ -55,6 +55,15 @@
 typedef int Weber;
 }
 
+// MSVC accepts this, but Clang doesn't.
+namespace test_scope_spec {
+template <typename T = ns::Bar>  // expected-error {{use of undeclared identifier 'ns'}}
+struct Foo {
+  static_assert(sizeof(T) == 4, "Bar should have gotten int");
+};
+namespace ns { typedef int Bar; }
+}
+
 #ifdef __clang__
 // These are negative test cases that MSVC doesn't compile either.  Try to use
 // unique undeclared identifiers so typo correction doesn't find types declared
diff --git a/test/SemaTemplate/ms-function-specialization-class-scope.cpp b/test/SemaTemplate/ms-function-specialization-class-scope.cpp
index 5da0083..3c7111d 100644
--- a/test/SemaTemplate/ms-function-specialization-class-scope.cpp
+++ b/test/SemaTemplate/ms-function-specialization-class-scope.cpp
@@ -75,3 +75,12 @@
   // here.
   template struct A<int>;
 }
+
+namespace PR28082 {
+struct S {
+  template <int>
+  int f(int = 0);
+  template <>
+  int f<0>(int); // expected-warning {{Microsoft extension}}
+};
+}
diff --git a/test/SemaTemplate/ms-lookup-template-base-classes.cpp b/test/SemaTemplate/ms-lookup-template-base-classes.cpp
index 4f3df27..6afc709 100644
--- a/test/SemaTemplate/ms-lookup-template-base-classes.cpp
+++ b/test/SemaTemplate/ms-lookup-template-base-classes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1y -fms-compatibility -fno-spell-checking -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -std=c++1y -fms-compatibility -fno-spell-checking -fsyntax-only -verify %s
 
 
 template <class T>
@@ -573,3 +573,62 @@
 template <typename T> decltype(h(T())) check2(); // expected-note{{candidate template ignored: substitution failure [with T = int]: no matching function for call to 'h'}}
 decltype(check2<int>()) y; // expected-error{{no matching function for call to 'check2'}}
 }
+
+// We also allow unqualified lookup into bases in contexts where we know the
+// undeclared identifier *must* be a type, such as a new expression or catch
+// parameter type.
+template <typename T>
+struct UseUnqualifiedTypeNames : T {
+  void foo() {
+    void *P = new TheType; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    size_t x = __builtin_offsetof(TheType, f2); // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    try {
+    } catch (TheType) { // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    }
+    enum E : IntegerType { E0 = 42 }; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    _Atomic(TheType) a; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+  }
+  void out_of_line();
+};
+template <typename T>
+void UseUnqualifiedTypeNames<T>::out_of_line() {
+  void *p = new TheType; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+}
+struct Base {
+  typedef int IntegerType;
+  struct TheType {
+    int f1, f2;
+  };
+};
+template struct UseUnqualifiedTypeNames<Base>;
+struct BadBase { };
+template struct UseUnqualifiedTypeNames<BadBase>; // expected-note-re 2 {{in instantiation {{.*}} requested here}}
+
+namespace partial_template_lookup {
+
+class Bar;
+class Spare;
+
+template <class T, class X = Bar>
+class FooTemplated;
+
+class FooBase {
+public:
+  typedef int BaseTypedef;
+};
+
+// Partial template spec (unused)
+template <class T>
+class FooTemplated<T, Spare> {};
+
+// Partial template spec (used)
+template <class T>
+class FooTemplated<T, Bar> : public FooBase {};
+
+// Primary template definition
+template <class T, class X>
+class FooTemplated : public FooTemplated<T, Bar> {
+public:
+  BaseTypedef Member; // expected-warning {{unqualified lookup}}
+};
+}
diff --git a/test/SemaTemplate/recovery-crash.cpp b/test/SemaTemplate/recovery-crash.cpp
index 02f8049..c8e783f 100644
--- a/test/SemaTemplate/recovery-crash.cpp
+++ b/test/SemaTemplate/recovery-crash.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // Clang used to crash trying to recover while adding 'this->' before Work(x);
 
@@ -25,14 +27,20 @@
 
 namespace PR16225 {
   template <typename T> void f();
-  template<typename C> void g(C*) {
+  template <typename C> void g(C*) {
     struct LocalStruct : UnknownBase<Mumble, C> { };  // expected-error {{unknown template name 'UnknownBase'}} \
                                                       // expected-error {{use of undeclared identifier 'Mumble'}}
-    f<LocalStruct>();  // expected-warning {{template argument uses local type 'LocalStruct'}}
+    f<LocalStruct>();
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses local type 'LocalStruct'}}
+#endif
   }
   struct S;
   void h() {
-    g<S>(0);  // expected-note {{in instantiation of function template specialization}}
+    g<S>(0);
+#if __cplusplus <= 199711L
+    // expected-note@-2 {{in instantiation of function template specialization}}
+#endif
   }
 }
 
diff --git a/test/SemaTemplate/temp_arg_type.cpp b/test/SemaTemplate/temp_arg_type.cpp
index 637b563..daad61c 100644
--- a/test/SemaTemplate/temp_arg_type.cpp
+++ b/test/SemaTemplate/temp_arg_type.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 template<typename T> class A; // expected-note 2 {{template parameter is declared here}} expected-note{{template is declared here}}
 
 // [temp.arg.type]p1
@@ -24,11 +27,21 @@
 // [temp.arg.type]p2
 void f() {
   class X { };
-  A<X> * a = 0; // expected-warning{{template argument uses local type 'X'}}
+  A<X> * a = 0;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{template argument uses local type 'X'}}
+#endif
 }
 
-struct { int x; } Unnamed; // expected-note{{unnamed type used in template argument was declared here}}
-A<__typeof__(Unnamed)> *a9; // expected-warning{{template argument uses unnamed type}}
+struct { int x; } Unnamed;
+#if __cplusplus <= 199711L
+// expected-note@-2 {{unnamed type used in template argument was declared here}}
+#endif
+
+A<__typeof__(Unnamed)> *a9;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{template argument uses unnamed type}}
+#endif
 
 template<typename T, unsigned N>
 struct Array {
diff --git a/test/SemaTemplate/undefined-template.cpp b/test/SemaTemplate/undefined-template.cpp
new file mode 100644
index 0000000..a03d0b7
--- /dev/null
+++ b/test/SemaTemplate/undefined-template.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 -Wundefined-func-template %s
+
+template <class T> struct C1 {
+  static char s_var_1;       // expected-note{{forward declaration of template entity is here}}
+  static char s_var_2;       // expected-note{{forward declaration of template entity is here}}
+  static void s_func_1();    // expected-note{{forward declaration of template entity is here}}
+  static void s_func_2();    // expected-note{{forward declaration of template entity is here}}
+  void meth_1();             // expected-note2{{forward declaration of template entity is here}}
+  void meth_2();
+  template <class T1> static char s_tvar_2;      // expected-note{{forward declaration of template entity is here}}
+  template <class T1> static void s_tfunc_2();   // expected-note{{forward declaration of template entity is here}}
+  template<typename T1> struct C2 {
+    static char s_var_2;     // expected-note{{forward declaration of template entity is here}}
+    static void s_func_2();  // expected-note{{forward declaration of template entity is here}}
+    void meth_2();           // expected-note{{forward declaration of template entity is here}}
+    template <class T2> static char s_tvar_2;    // expected-note{{forward declaration of template entity is here}}
+    template <class T2> void tmeth_2();          // expected-note{{forward declaration of template entity is here}}
+  };
+};
+
+extern template char C1<int>::s_var_2;
+extern template void C1<int>::s_func_2();
+extern template void C1<int>::meth_2();
+extern template char C1<int>::s_tvar_2<char>;
+extern template void C1<int>::s_tfunc_2<char>();
+extern template void C1<int>::C2<long>::s_var_2;
+extern template void C1<int>::C2<long>::s_func_2();
+extern template void C1<int>::C2<long>::meth_2();
+extern template char C1<int>::C2<long>::s_tvar_2<char>;
+extern template void C1<int>::C2<long>::tmeth_2<char>();
+
+char func_01() {
+  return C1<int>::s_var_2;
+}
+
+char func_02() {
+  return C1<int>::s_var_1; // expected-warning{{instantiation of variable 'C1<int>::s_var_1' required here, but no definition is available}}
+                           // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_var_1' is explicitly instantiated in another translation unit}}
+}
+
+char func_03() {
+  return C1<char>::s_var_2; // expected-warning{{instantiation of variable 'C1<char>::s_var_2' required here, but no definition is available}}
+                            // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<char>::s_var_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_04() {
+  C1<int>::s_func_1(); // expected-warning{{instantiation of function 'C1<int>::s_func_1' required here, but no definition is available}}
+                       // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_func_1' is explicitly instantiated in another translation unit}}
+}
+
+void func_05() {
+  C1<int>::s_func_2();
+}
+
+void func_06() {
+  C1<char>::s_func_2(); // expected-warning{{instantiation of function 'C1<char>::s_func_2' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<char>::s_func_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_07(C1<int> *x) {
+  x->meth_1();  // expected-warning{{instantiation of function 'C1<int>::meth_1' required here, but no definition is available}}
+                // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::meth_1' is explicitly instantiated in another translation unit}}
+}
+
+void func_08(C1<int> *x) {
+  x->meth_2();
+}
+
+void func_09(C1<char> *x) {
+  x->meth_1();  // expected-warning{{instantiation of function 'C1<char>::meth_1' required here, but no definition is available}}
+                // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<char>::meth_1' is explicitly instantiated in another translation unit}}
+}
+
+char func_10() {
+  return C1<int>::s_tvar_2<char>;
+}
+
+char func_11() {
+  return C1<int>::s_tvar_2<long>; // expected-warning{{instantiation of variable 'C1<int>::s_tvar_2<long>' required here, but no definition is available}}
+                                  // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_tvar_2<long>' is explicitly instantiated in another translation unit}}
+}
+
+void func_12() {
+  C1<int>::s_tfunc_2<char>();
+}
+
+void func_13() {
+  C1<int>::s_tfunc_2<long>(); // expected-warning{{instantiation of function 'C1<int>::s_tfunc_2<long>' required here, but no definition is available}}
+                              // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_tfunc_2<long>' is explicitly instantiated in another translation unit}}
+}
+
+char func_14() {
+  return C1<int>::C2<long>::s_var_2;
+}
+
+char func_15() {
+  return C1<int>::C2<char>::s_var_2;  //expected-warning {{instantiation of variable 'C1<int>::C2<char>::s_var_2' required here, but no definition is available}}
+                                      // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<char>::s_var_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_16() {
+  C1<int>::C2<long>::s_func_2();
+}
+
+void func_17() {
+  C1<int>::C2<char>::s_func_2(); // expected-warning{{instantiation of function 'C1<int>::C2<char>::s_func_2' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<char>::s_func_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_18(C1<int>::C2<long> *x) {
+  x->meth_2();
+}
+
+void func_19(C1<int>::C2<char> *x) {
+  x->meth_2();   // expected-warning{{instantiation of function 'C1<int>::C2<char>::meth_2' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<char>::meth_2' is explicitly instantiated in another translation unit}}
+}
+
+char func_20() {
+  return C1<int>::C2<long>::s_tvar_2<char>;
+}
+
+char func_21() {
+  return C1<int>::C2<long>::s_tvar_2<long>; // expected-warning{{instantiation of variable 'C1<int>::C2<long>::s_tvar_2<long>' required here, but no definition is available}}
+                                  // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<long>::s_tvar_2<long>' is explicitly instantiated in another translation unit}}
+}
+
+void func_22(C1<int>::C2<long> *x) {
+  x->tmeth_2<char>();
+}
+
+void func_23(C1<int>::C2<long> *x) {
+  x->tmeth_2<int>();    // expected-warning{{instantiation of function 'C1<int>::C2<long>::tmeth_2<int>' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<long>::tmeth_2<int>' is explicitly instantiated in another translation unit}}
+}
+
+int main() {
+  return 0;
+}
diff --git a/test/Unit/lit.site.cfg.in b/test/Unit/lit.site.cfg.in
index 37e8cb0..c2f8146 100644
--- a/test/Unit/lit.site.cfg.in
+++ b/test/Unit/lit.site.cfg.in
@@ -1,7 +1,7 @@
+@LIT_SITE_CFG_IN_HEADER@
+
 import sys
 
-## Autogenerated by LLVM/Clang configuration.
-# Do not edit!
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
diff --git a/test/VFS/Inputs/Nonmodular/A.h b/test/VFS/Inputs/Nonmodular/A.h
new file mode 100644
index 0000000..975f1f0
--- /dev/null
+++ b/test/VFS/Inputs/Nonmodular/A.h
@@ -0,0 +1 @@
+// A.h
diff --git a/test/VFS/Inputs/Nonmodular/Nonmodular.modulemap b/test/VFS/Inputs/Nonmodular/Nonmodular.modulemap
new file mode 100644
index 0000000..91f1690
--- /dev/null
+++ b/test/VFS/Inputs/Nonmodular/Nonmodular.modulemap
@@ -0,0 +1,5 @@
+framework module Nonmodular [extern_c] {
+  umbrella header "umbrella.h"
+  export *
+  module * { export * }
+}
diff --git a/test/VFS/Inputs/Nonmodular/nonmodular-headers.yaml b/test/VFS/Inputs/Nonmodular/nonmodular-headers.yaml
new file mode 100644
index 0000000..a041728
--- /dev/null
+++ b/test/VFS/Inputs/Nonmodular/nonmodular-headers.yaml
@@ -0,0 +1,34 @@
+{
+  'version': 0,
+  'case-sensitive': 'false',
+  'ignore-non-existent-contents': 'true',
+  'roots': [
+    {
+      'type': 'directory',
+      'name': "VDIR/Nonmodular.framework/Headers",
+      'contents': [
+        {
+          'type': 'file',
+          'name': "umbrella.h",
+          'external-contents': "IN_DIR/Inputs/Nonmodular/umbrella.h"
+        },
+        {
+          'type': 'file',
+          'name': "A.h",
+          'external-contents': "IN_DIR/Inputs/Nonmodular/A.h"
+        }
+      ]
+    },
+    {
+      'type': 'directory',
+      'name': "VDIR/Nonmodular.framework/Modules",
+      'contents': [
+        {
+          'type': 'file',
+          'name': "module.modulemap",
+          'external-contents': "OUT_DIR/module.modulemap"
+        }
+      ]
+    }
+  ]
+}
diff --git a/test/VFS/Inputs/Nonmodular/test.c b/test/VFS/Inputs/Nonmodular/test.c
new file mode 100644
index 0000000..62807d0
--- /dev/null
+++ b/test/VFS/Inputs/Nonmodular/test.c
@@ -0,0 +1,3 @@
+// expected-no-diagnostics
+
+#include "umbrella.h"
diff --git a/test/VFS/Inputs/Nonmodular/umbrella.h b/test/VFS/Inputs/Nonmodular/umbrella.h
new file mode 100644
index 0000000..bb79a62
--- /dev/null
+++ b/test/VFS/Inputs/Nonmodular/umbrella.h
@@ -0,0 +1,5 @@
+#ifndef __umbrella_h__
+#define __umbrella_h__
+
+#include <Nonmodular/A.h>
+#endif
diff --git a/test/VFS/Inputs/bar-headers.yaml b/test/VFS/Inputs/bar-headers.yaml
index 710e6cb..846d55c 100644
--- a/test/VFS/Inputs/bar-headers.yaml
+++ b/test/VFS/Inputs/bar-headers.yaml
@@ -1,6 +1,7 @@
 {
   'version': 0,
   'case-sensitive': 'false',
+  'ignore-non-existent-contents': 'true',
   'roots': [
     {
       'type': 'directory',
diff --git a/test/VFS/Inputs/vfsoverlay2.yaml b/test/VFS/Inputs/vfsoverlay2.yaml
index ae2a0ce..688ae64 100644
--- a/test/VFS/Inputs/vfsoverlay2.yaml
+++ b/test/VFS/Inputs/vfsoverlay2.yaml
@@ -1,5 +1,6 @@
 {
   'version': 0,
+  'ignore-non-existent-contents': false,
   'roots': [
     { 'name': 'OUT_DIR', 'type': 'directory',
       'contents': [
diff --git a/test/VFS/test_nonmodular.c b/test/VFS/test_nonmodular.c
new file mode 100644
index 0000000..cff4de7
--- /dev/null
+++ b/test/VFS/test_nonmodular.c
@@ -0,0 +1,11 @@
+// REQUIRES: shell
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/vdir %t/cache %t/outdir
+// We can't have module.map inside Inputs/Nonmodular.
+// RUN: cp %S/Inputs/Nonmodular/Nonmodular.modulemap %t/outdir/module.modulemap
+//
+// RUN: sed -e "s:VDIR:%t/vdir:g" -e "s:IN_DIR:%S:g" -e "s:OUT_DIR:%t/outdir:g" %S/Inputs/Nonmodular/nonmodular-headers.yaml > %t/vdir/nonmodular-headers.yaml
+// RUN: %clang_cc1 -fmodule-name=Nonmodular -fmodules -Wnon-modular-include-in-framework-module -verify -fimplicit-module-maps -fmodules-cache-path=%t/cache -ivfsoverlay %t/vdir/nonmodular-headers.yaml -I %S/Inputs -F %t/vdir -fsyntax-only %S/Inputs/Nonmodular/test.c
+
+// expected-no-diagnostics
diff --git a/test/VFS/umbrella-framework-import-skipnonexist.m b/test/VFS/umbrella-framework-import-skipnonexist.m
index 39af831..5c7cd6d 100644
--- a/test/VFS/umbrella-framework-import-skipnonexist.m
+++ b/test/VFS/umbrella-framework-import-skipnonexist.m
@@ -5,7 +5,7 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/vdir %t/outdir %t/cache
-// RUN: cp -a %S/Inputs/Bar.Framework %t/outdir
+// RUN: cp -a %S/Inputs/Bar.framework %t/outdir/
 //
 // RUN: sed -e "s:VDIR:%t/vdir:g" -e "s:OUT_DIR:%t/outdir:g" %S/Inputs/bar-headers.yaml > %t/vdir/bar-headers.yaml
 // RUN: rm -f %t/outdir/Bar.framework/Headers/B.h
diff --git a/test/lit.cfg b/test/lit.cfg
index c602650..e7ce8fa 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -44,7 +44,7 @@
 config.test_format = lit.formats.ShTest(execute_external)
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap']
+config.suffixes = ['.c', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs']
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
 # subdirectories contain auxiliary inputs for various tests in their parent
@@ -397,10 +397,6 @@
 if os.path.exists("/dev/fd/0") and sys.platform not in ['cygwin']:
     config.available_features.add('dev-fd-fs')
 
-# DW2 Target
-if not re.match(r'.*-win32$', config.target_triple):
-    config.available_features.add('dw2')
-
 # Not set on native MS environment.
 if not re.match(r'.*-win32$', config.target_triple):
     config.available_features.add('non-ms-sdk')
@@ -467,6 +463,11 @@
 if config.enable_backtrace == "1":
     config.available_features.add("backtrace")
 
+if config.have_zlib == "1":
+    config.available_features.add("zlib")
+else:
+    config.available_features.add("nozlib")
+
 # Check if we should run long running tests.
 if lit_config.params.get("run_long_tests", None) == "true":
     config.available_features.add("long_tests")
@@ -491,4 +492,9 @@
 if use_gmalloc:
      config.environment.update({'DYLD_INSERT_LIBRARIES' : gmalloc_path_str})
 
+# Check if we should allow outputs to console.
+run_console_tests = int(lit_config.params.get('enable_console', '0'))
+if run_console_tests != 0:
+  config.available_features.add('console')
+
 lit.util.usePlatformSdkOnDarwin(config, lit_config)
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 332bcec..f368c99 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -1,7 +1,7 @@
+@LIT_SITE_CFG_IN_HEADER@
+
 import sys
 
-## Autogenerated by LLVM/Clang configuration.
-# Do not edit!
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
@@ -14,6 +14,7 @@
 config.host_triple = "@LLVM_HOST_TRIPLE@"
 config.target_triple = "@TARGET_TRIPLE@"
 config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
+config.have_zlib = "@HAVE_LIBZ@"
 config.clang_arcmt = @ENABLE_CLANG_ARCMT@
 config.clang_staticanalyzer = @ENABLE_CLANG_STATIC_ANALYZER@
 config.clang_examples = @ENABLE_CLANG_EXAMPLES@
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 510bc44..d734493 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -7,7 +7,6 @@
 add_clang_subdirectory(clang-fuzzer)
 
 add_clang_subdirectory(c-index-test)
-add_clang_subdirectory(libclang)
 
 if(CLANG_ENABLE_ARCMT)
   add_clang_subdirectory(arcmt-test)
@@ -26,3 +25,6 @@
 # to keep the primary Clang repository small and focused.
 # It also may be included by LLVM_EXTERNAL_CLANG_TOOLS_EXTRA_SOURCE_DIR.
 add_llvm_external_project(clang-tools-extra extra)
+
+# libclang may require clang-tidy in clang-tools-extra.
+add_clang_subdirectory(libclang)
diff --git a/tools/Makefile b/tools/Makefile
deleted file mode 100644
index 5c362bf..0000000
--- a/tools/Makefile
+++ /dev/null
@@ -1,35 +0,0 @@
-##===- tools/Makefile --------------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ..
-
-include $(CLANG_LEVEL)/../../Makefile.config
-
-DIRS := 
-PARALLEL_DIRS := clang-format driver diagtool
-
-ifeq ($(ENABLE_CLANG_STATIC_ANALYZER), 1)
-  PARALLEL_DIRS += clang-check scan-build scan-view
-endif
-
-ifeq ($(ENABLE_CLANG_ARCMT), 1)
-  DIRS += libclang c-index-test c-arcmt-test
-  PARALLEL_DIRS += arcmt-test
-endif
-
-# Recurse into the extra repository of tools if present.
-OPTIONAL_PARALLEL_DIRS := extra
-
-ifeq ($(BUILD_CLANG_ONLY),YES)
-  DIRS := libclang c-index-test
-  PARALLEL_DIRS := driver
-  OPTIONAL_PARALLEL_DIRS :=
-endif
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/tools/arcmt-test/Makefile b/tools/arcmt-test/Makefile
deleted file mode 100644
index ec7683b..0000000
--- a/tools/arcmt-test/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-##===- tools/arcmt-test/Makefile ---------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-CLANG_LEVEL := ../..
-
-TOOLNAME = arcmt-test
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-# Don't install this. It is used for tests.
-NO_INSTALL = 1
-
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangARCMigrate.a clangRewrite.a \
-		 clangFrontend.a clangDriver.a clangSerialization.a clangParse.a \
-		 clangSema.a clangEdit.a clangAnalysis.a clangAST.a clangLex.a \
-		 clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/tools/arcmt-test/arcmt-test.cpp b/tools/arcmt-test/arcmt-test.cpp
index 7c8e46a..e57d69f 100644
--- a/tools/arcmt-test/arcmt-test.cpp
+++ b/tools/arcmt-test/arcmt-test.cpp
@@ -7,15 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/ARCMigrate/ARCMT.h"
-#include "clang/Frontend/ASTUnit.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/Frontend/PCHContainerOperations.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Frontend/VerifyDiagnosticConsumer.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Signals.h"
 #include <system_error>
 
@@ -341,7 +343,7 @@
 
 int main(int argc, const char **argv) {
   void *MainAddr = (void*) (intptr_t) GetExecutablePath;
-  llvm::sys::PrintStackTraceOnErrorSignal();
+  llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
   std::string
     resourcesPath = CompilerInvocation::GetResourcesPath(argv[0], MainAddr);
diff --git a/tools/c-arcmt-test/Makefile b/tools/c-arcmt-test/Makefile
deleted file mode 100644
index ec5e122..0000000
--- a/tools/c-arcmt-test/Makefile
+++ /dev/null
@@ -1,51 +0,0 @@
-##===- tools/c-arcmt-test/Makefile -------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-CLANG_LEVEL := ../..
-
-TOOLNAME = c-arcmt-test
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-# Don't install this. It is used for tests.
-NO_INSTALL = 1
-
-# Include this here so we can get the configuration of the targets that have
-# been configured for construction. We have to do this early so we can set up
-# LINK_COMPONENTS before including Makefile.rules
-include $(CLANG_LEVEL)/../../Makefile.config
-
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) \
-	AsmParser \
-	BitReader \
-	BitWriter \
-	IPO \
-	MC \
-	ObjCARCOpts \
-	Option \
-	Support
-
-# Note that 'USEDLIBS' must include all of the core clang libraries
-# when -static is given to linker on cygming.
-USEDLIBS = clang.a \
-	   clangCodeGen.a \
-	   clangARCMigrate.a \
-	   clangIndex.a \
-	   clangFormat.a \
-	   clangTooling.a \
-	   clangToolingCore.a \
-	   clangRewriteFrontend.a \
-	   clangRewrite.a \
-	   clangFrontend.a clangDriver.a \
-	   clangStaticAnalyzerCheckers.a clangStaticAnalyzerCore.a \
-	   clangSerialization.a clangParse.a clangSema.a \
-	   clangAnalysis.a clangEdit.a clangAST.a clangLex.a clangAPINotes.a \
-           clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/tools/c-index-test/Makefile b/tools/c-index-test/Makefile
deleted file mode 100644
index 7e13377..0000000
--- a/tools/c-index-test/Makefile
+++ /dev/null
@@ -1,58 +0,0 @@
-##===- tools/index-test/Makefile ---------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-CLANG_LEVEL := ../..
-
-TOOLNAME = c-index-test
-
-# If a separate install prefix was specified for internal tools, use it
-# when installing c-index-test.
-INTERNAL_TOOL = 1
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-# Include this here so we can get the configuration of the targets that have
-# been configured for construction. We have to do this early so we can set up
-# LINK_COMPONENTS before including Makefile.rules
-include $(CLANG_LEVEL)/../../Makefile.config
-
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) \
-	AsmParser \
-	BitReader \
-	BitWriter \
-	IPO \
-	MC \
-	ObjCARCOpts \
-	Option \
-	Support
-
-# Note that 'USEDLIBS' must include all of the core clang libraries
-# when -static is given to linker on cygming.
-USEDLIBS = clang.a \
-	   clangCodeGen.a \
-	   clangIndex.a clangFormat.a clangRewrite.a \
-	   clangFrontend.a clangDriver.a \
-	   clangTooling.a \
-	   clangToolingCore.a \
-	   clangSerialization.a clangParse.a clangSema.a \
-	   clangAnalysis.a clangEdit.a clangAST.a clangLex.a \
-	   clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
-
-LIBS += $(LIBXML2_LIBS)
-
-# Headers in $(LIBXML2_INC) should not be checked with clang's -Wdocumentation.
-# Use -isystem instead of -I then.
-# FIXME: Could autoconf detect clang or availability of -isystem?
-ifneq ($(findstring -Wdocumentation,$(OPTIMIZE_OPTION)),)
-CPPFLAGS += $(subst -I,-isystem ,$(LIBXML2_INC))
-else
-CPPFLAGS += $(LIBXML2_INC)
-endif
diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c
index d46a6e0..2878ba2 100644
--- a/tools/c-index-test/c-index-test.c
+++ b/tools/c-index-test/c-index-test.c
@@ -772,9 +772,20 @@
     
     clang_disposeString(DeprecatedMessage);
     clang_disposeString(UnavailableMessage);
-    
+
+    if (clang_CXXConstructor_isDefaultConstructor(Cursor))
+      printf(" (default constructor)");
+
+    if (clang_CXXConstructor_isMoveConstructor(Cursor))
+      printf(" (move constructor)");
+    if (clang_CXXConstructor_isCopyConstructor(Cursor))
+      printf(" (copy constructor)");
+    if (clang_CXXConstructor_isConvertingConstructor(Cursor))
+      printf(" (converting constructor)");
     if (clang_CXXField_isMutable(Cursor))
       printf(" (mutable)");
+    if (clang_CXXMethod_isDefaulted(Cursor))
+      printf(" (defaulted)");
     if (clang_CXXMethod_isStatic(Cursor))
       printf(" (static)");
     if (clang_CXXMethod_isVirtual(Cursor))
@@ -827,8 +838,11 @@
 
       if (Cursor.kind == CXCursor_FunctionDecl) {
         /* Collect the template parameter kinds from the base template. */
-        unsigned NumTemplateArgs = clang_Cursor_getNumTemplateArguments(Cursor);
-        unsigned I;
+        int NumTemplateArgs = clang_Cursor_getNumTemplateArguments(Cursor);
+        int I;
+        if (NumTemplateArgs < 0) {
+          printf(" [no template arg info]");
+        }
         for (I = 0; I < NumTemplateArgs; I++) {
           enum CXTemplateArgumentKind TAK =
               clang_Cursor_getTemplateArgumentKind(Cursor, I);
@@ -1422,10 +1436,10 @@
     CXString FieldSpelling = clang_getCursorSpelling(cursor);
     const char *FieldName = clang_getCString(FieldSpelling);
     /* recurse to get the first parent record that is not anonymous. */
-    CXCursor Parent, Record;
     unsigned RecordIsAnonymous = 0;
     if (clang_getCursorKind(cursor) == CXCursor_FieldDecl) {
-      Record = Parent = p;
+      CXCursor Record;
+      CXCursor Parent = p;
       do {
         Record = Parent;
         Parent = clang_getCursorSemanticParent(Record);
@@ -1998,6 +2012,7 @@
   enum CXCursorKind ParentKind;
   CXString ParentName;
   CXString BriefComment;
+  CXString Annotation;
   const char *BriefCommentCString;
   
   fprintf(file, "%s:", clang_getCString(ks));
@@ -2031,9 +2046,10 @@
     for (i = 0; i < annotationCount; ++i) {
       if (i != 0)
         fprintf(file, ", ");
-      fprintf(file, "\"%s\"",
-              clang_getCString(clang_getCompletionAnnotation(
-                                 completion_result->CompletionString, i)));
+      Annotation =
+          clang_getCompletionAnnotation(completion_result->CompletionString, i);
+      fprintf(file, "\"%s\"", clang_getCString(Annotation));
+      clang_disposeString(Annotation);
     }
     fprintf(file, ")");
   }
@@ -2135,25 +2151,6 @@
   }
 }
 
-int my_stricmp(const char *s1, const char *s2) {
-  while (*s1 && *s2) {
-    int c1 = tolower((unsigned char)*s1), c2 = tolower((unsigned char)*s2);
-    if (c1 < c2)
-      return -1;
-    else if (c1 > c2)
-      return 1;
-    
-    ++s1;
-    ++s2;
-  }
-  
-  if (*s1)
-    return 1;
-  else if (*s2)
-    return -1;
-  return 0;
-}
-
 int perform_code_completion(int argc, const char **argv, int timing_only) {
   const char *input = argv[1];
   char *filename = 0;
@@ -4446,11 +4443,8 @@
   client_data.argc = argc;
   client_data.argv = argv;
 
-  if (argc > 1 && strcmp(argv[1], "core") == 0) {
+  if (argc > 1 && strcmp(argv[1], "core") == 0)
     client_data.main_func = indextest_core_main;
-    --client_data.argc;
-    ++client_data.argv;
-  }
 
   if (getenv("CINDEXTEST_NOTHREADS"))
     return client_data.main_func(client_data.argc, client_data.argv);
diff --git a/tools/c-index-test/core_main.cpp b/tools/c-index-test/core_main.cpp
index d11b490..e64dae7 100644
--- a/tools/c-index-test/core_main.cpp
+++ b/tools/c-index-test/core_main.cpp
@@ -196,9 +196,13 @@
 //===----------------------------------------------------------------------===//
 
 int indextest_core_main(int argc, const char **argv) {
-  sys::PrintStackTraceOnErrorSignal();
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
   PrettyStackTraceProgram X(argc, argv);
 
+  assert(argv[1] == StringRef("core"));
+  ++argv;
+  --argc;
+
   std::vector<const char *> CompArgs;
   const char **DoubleDash = std::find(argv, argv + argc, StringRef("--"));
   if (DoubleDash != argv + argc) {
diff --git a/tools/clang-check/ClangCheck.cpp b/tools/clang-check/ClangCheck.cpp
index a9934c9..b4177d4 100644
--- a/tools/clang-check/ClangCheck.cpp
+++ b/tools/clang-check/ClangCheck.cpp
@@ -142,7 +142,7 @@
       return clang::CreateASTDumper(ASTDumpFilter, /*DumpDecls=*/true,
                                     /*DumpLookups=*/false);
     if (ASTPrint)
-      return clang::CreateASTPrinter(&llvm::outs(), ASTDumpFilter);
+      return clang::CreateASTPrinter(nullptr, ASTDumpFilter);
     return llvm::make_unique<clang::ASTConsumer>();
   }
 };
@@ -150,7 +150,7 @@
 } // namespace
 
 int main(int argc, const char **argv) {
-  llvm::sys::PrintStackTraceOnErrorSignal();
+  llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
   // Initialize targets for clang module support.
   llvm::InitializeAllTargets();
diff --git a/tools/clang-check/Makefile b/tools/clang-check/Makefile
deleted file mode 100644
index f2e280d..0000000
--- a/tools/clang-check/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-##===- tools/clang-check/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-
-TOOLNAME = clang-check
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader ipo objcarcopts \
-                   instrumentation bitwriter support mc option
-USEDLIBS = clangFrontend.a clangCodeGen.a clangIndex.a \
-           clangSerialization.a clangDriver.a \
-           clangTooling.a clangParse.a clangSema.a \
-           clangStaticAnalyzerFrontend.a clangStaticAnalyzerCheckers.a \
-           clangStaticAnalyzerCore.a clangAnalysis.a clangRewriteFrontend.a \
-           clangRewrite.a clangEdit.a clangAST.a clangLex.a \
-           clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs b/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs
index df872b2..6af2fd1 100644
--- a/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs
+++ b/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs
@@ -202,9 +202,10 @@
             if (start >= text.Length && text.Length > 0)
                 start = text.Length - 1;
             string path = GetDocumentParent(view);
+            string filePath = GetDocumentPath(view);
             try
             {
-                var root = XElement.Parse(RunClangFormat(text, start, length, path));
+                var root = XElement.Parse(RunClangFormat(text, start, length, path, filePath));
                 var edit = view.TextBuffer.CreateEdit();
                 foreach (XElement replacement in root.Descendants("replacement"))
                 {
@@ -237,7 +238,7 @@
         /// 
         /// Formats the text range starting at offset of the given length.
         /// </summary>
-        private string RunClangFormat(string text, int offset, int length, string path)
+        private string RunClangFormat(string text, int offset, int length, string path, string filePath)
         {
             string vsixPath = Path.GetDirectoryName(
                 typeof(ClangFormatPackage).Assembly.Location);
@@ -257,6 +258,8 @@
             if (GetSortIncludes())
               process.StartInfo.Arguments += " -sort-includes ";
             string assumeFilename = GetAssumeFilename();
+            if (string.IsNullOrEmpty(assumeFilename))
+                assumeFilename = filePath;
             if (!string.IsNullOrEmpty(assumeFilename))
               process.StartInfo.Arguments += " -assume-filename \"" + assumeFilename + "\"";
             process.StartInfo.CreateNoWindow = true;
@@ -355,5 +358,15 @@
             }
             return null;
         }
+
+        private string GetDocumentPath(IWpfTextView view)
+        {
+            ITextDocument document;
+            if (view.TextBuffer.Properties.TryGetProperty(typeof(ITextDocument), out document))
+            {
+                return document.FilePath;
+            }
+            return null;
+        }
     }
 }
diff --git a/tools/clang-format/ClangFormat.cpp b/tools/clang-format/ClangFormat.cpp
index 36f237f..c097239 100644
--- a/tools/clang-format/ClangFormat.cpp
+++ b/tools/clang-format/ClangFormat.cpp
@@ -20,9 +20,7 @@
 #include "clang/Basic/Version.h"
 #include "clang/Format/Format.h"
 #include "clang/Rewrite/Core/Rewriter.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Signals.h"
 
@@ -257,25 +255,27 @@
   unsigned CursorPosition = Cursor;
   Replacements Replaces = sortIncludes(FormatStyle, Code->getBuffer(), Ranges,
                                        AssumedFileName, &CursorPosition);
-  std::string ChangedCode =
-      tooling::applyAllReplacements(Code->getBuffer(), Replaces);
-  for (const auto &R : Replaces)
-    Ranges.push_back({R.getOffset(), R.getLength()});
-
+  auto ChangedCode = tooling::applyAllReplacements(Code->getBuffer(), Replaces);
+  if (!ChangedCode) {
+    llvm::errs() << llvm::toString(ChangedCode.takeError()) << "\n";
+    return true;
+  }
+  // Get new affected ranges after sorting `#includes`.
+  Ranges = tooling::calculateRangesAfterReplacements(Replaces, Ranges);
   bool IncompleteFormat = false;
-  Replacements FormatChanges = reformat(FormatStyle, ChangedCode, Ranges,
+  Replacements FormatChanges = reformat(FormatStyle, *ChangedCode, Ranges,
                                         AssumedFileName, &IncompleteFormat);
-  Replaces = tooling::mergeReplacements(Replaces, FormatChanges);
+  Replaces = Replaces.merge(FormatChanges);
   if (OutputXML) {
     outs() << "<?xml version='1.0'?>\n<replacements "
               "xml:space='preserve' incomplete_format='"
            << (IncompleteFormat ? "true" : "false") << "'>\n";
     if (Cursor.getNumOccurrences() != 0)
       outs() << "<cursor>"
-             << tooling::shiftedCodePosition(FormatChanges, CursorPosition)
+             << FormatChanges.getShiftedCodePosition(CursorPosition)
              << "</cursor>\n";
 
-    outputReplacementsXML(Replaces); 
+    outputReplacementsXML(Replaces);
     outs() << "</replacements>\n";
   } else {
     IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
@@ -297,7 +297,7 @@
     } else {
       if (Cursor.getNumOccurrences() != 0)
         outs() << "{ \"Cursor\": "
-               << tooling::shiftedCodePosition(FormatChanges, CursorPosition)
+               << FormatChanges.getShiftedCodePosition(CursorPosition)
                << ", \"IncompleteFormat\": "
                << (IncompleteFormat ? "true" : "false") << " }\n";
       Rewrite.getEditBuffer(ID).write(outs());
@@ -315,7 +315,7 @@
 }
 
 int main(int argc, const char **argv) {
-  llvm::sys::PrintStackTraceOnErrorSignal();
+  llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
   cl::HideUnrelatedOptions(ClangFormatCategory);
 
diff --git a/tools/clang-format/Makefile b/tools/clang-format/Makefile
deleted file mode 100644
index 58642f1..0000000
--- a/tools/clang-format/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-##===- clang-format/Makefile -------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-
-TOOLNAME = clang-format
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangFormat.a clangToolingCore.a clangDriver.a clangRewrite.a \
-           clangLex.a clangAPINotes.a clangBasic.a 
-
-include $(CLANG_LEVEL)/Makefile
diff --git a/tools/clang-format/clang-format-diff.py b/tools/clang-format/clang-format-diff.py
index 9e02bb0..5e728f5 100755
--- a/tools/clang-format/clang-format-diff.py
+++ b/tools/clang-format/clang-format-diff.py
@@ -31,10 +31,6 @@
 import sys
 
 
-# Change this to the full path if clang-format is not on the path.
-binary = 'clang-format'
-
-
 def main():
   parser = argparse.ArgumentParser(description=
                                    'Reformat changed lines in diff. Without -i '
@@ -56,10 +52,11 @@
                       help='let clang-format sort include blocks')
   parser.add_argument('-v', '--verbose', action='store_true',
                       help='be more verbose, ineffective without -i')
-  parser.add_argument(
-      '-style',
-      help=
-      'formatting style to apply (LLVM, Google, Chromium, Mozilla, WebKit)')
+  parser.add_argument('-style',
+                      help='formatting style to apply (LLVM, Google, Chromium, '
+                      'Mozilla, WebKit)')
+  parser.add_argument('-binary', default='clang-format',
+                      help='location of binary to use for clang-format')
   args = parser.parse_args()
 
   # Extract changed lines for each file.
@@ -95,7 +92,7 @@
   for filename, lines in lines_by_file.iteritems():
     if args.i and args.verbose:
       print 'Formatting', filename
-    command = [binary, filename]
+    command = [args.binary, filename]
     if args.i:
       command.append('-i')
     if args.sort_includes:
diff --git a/tools/clang-fuzzer/ClangFuzzer.cpp b/tools/clang-fuzzer/ClangFuzzer.cpp
index d07cf50..afe57d4 100644
--- a/tools/clang-fuzzer/ClangFuzzer.cpp
+++ b/tools/clang-fuzzer/ClangFuzzer.cpp
@@ -16,6 +16,7 @@
 #include "clang/Tooling/Tooling.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Frontend/CompilerInstance.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/Option/Option.h"
 
 using namespace clang;
diff --git a/tools/diagtool/DiagTool.cpp b/tools/diagtool/DiagTool.cpp
index 0e4d808..7582d51 100644
--- a/tools/diagtool/DiagTool.cpp
+++ b/tools/diagtool/DiagTool.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "DiagTool.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include <vector>
 
diff --git a/tools/diagtool/Makefile b/tools/diagtool/Makefile
deleted file mode 100644
index b502621..0000000
--- a/tools/diagtool/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-##===- tools/diagtool/Makefile -----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-CLANG_LEVEL := ../..
-
-TOOLNAME = diagtool 
-
-# No plugins, optimize startup time.
-TOOL_NO_EXPORTS := 1
-
-# Don't install this.
-NO_INSTALL = 1
-
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangFrontend.a clangDriver.a clangSerialization.a clangParse.a \
-           clangSema.a clangAnalysis.a clangEdit.a clangAST.a clangLex.a \
-           clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
-
diff --git a/tools/diagtool/TreeView.cpp b/tools/diagtool/TreeView.cpp
index 3647e39..07af944 100644
--- a/tools/diagtool/TreeView.cpp
+++ b/tools/diagtool/TreeView.cpp
@@ -14,7 +14,6 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Process.h"
 
diff --git a/tools/driver/CMakeLists.txt b/tools/driver/CMakeLists.txt
index dc6cd4d..4a2c8d2 100644
--- a/tools/driver/CMakeLists.txt
+++ b/tools/driver/CMakeLists.txt
@@ -89,7 +89,8 @@
   set(TOOL_INFO_BUILD_VERSION)
 endif()
 
-if(CLANG_ORDER_FILE)
+# the linker -order_file flag is only supported by ld64
+if(LD64_EXECUTABLE AND CLANG_ORDER_FILE)
   include(CMakePushCheckState)
 
   function(check_linker_flag flag out_var)
@@ -102,8 +103,15 @@
   # This is a test to ensure the actual order file works with the linker.
   check_linker_flag("-Wl,-order_file,${CLANG_ORDER_FILE}"
     LINKER_ORDER_FILE_WORKS)
-  
-  if(LINKER_ORDER_FILE_WORKS)
+
+  # Passing an empty order file disables some linker layout optimizations.
+  # To work around this and enable workflows for re-linking when the order file
+  # changes we check during configuration if the file is empty, and make it a
+  # configuration dependency.
+  file(READ ${CLANG_ORDER_FILE} ORDER_FILE LIMIT 20)
+  if("${ORDER_FILE}" STREQUAL "\n")
+    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CLANG_ORDER_FILE})
+  elseif(LINKER_ORDER_FILE_WORKS)
     target_link_libraries(clang "-Wl,-order_file,${CLANG_ORDER_FILE}")
     set_target_properties(clang PROPERTIES LINK_DEPENDS ${CLANG_ORDER_FILE})
   endif()
diff --git a/tools/driver/Info.plist.in b/tools/driver/Info.plist.in
index c938fb0..c2b1570 100644
--- a/tools/driver/Info.plist.in
+++ b/tools/driver/Info.plist.in
@@ -7,7 +7,7 @@
         <key>CFBundleInfoDictionaryVersion</key>
         <string>6.0</string>
         <key>CFBundleName</key>
-        <string>@TOOL_INFO_NAME</string>
+        <string>@TOOL_INFO_NAME@</string>
 	<key>CFBundleShortVersionString</key>
 	<string>@TOOL_INFO_VERSION@</string>
         <key>CFBundleVersion</key>
diff --git a/tools/driver/Makefile b/tools/driver/Makefile
deleted file mode 100644
index d2626a1..0000000
--- a/tools/driver/Makefile
+++ /dev/null
@@ -1,76 +0,0 @@
-##===- tools/driver/Makefile -------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-CLANG_LEVEL := ../..
-
-TOOLNAME = clang
-TOOLALIAS = clang++
-
-ifdef CLANG_ORDER_FILE
-TOOL_ORDER_FILE := $(CLANG_ORDER_FILE)
-endif
-
-# Include tool version information on OS X.
-TOOL_INFO_PLIST := Info.plist
-
-# Include this here so we can get the configuration of the targets that have
-# been configured for construction. We have to do this early so we can set up
-# LINK_COMPONENTS before including Makefile.rules
-include $(CLANG_LEVEL)/../../Makefile.config
-
-# Have the option of not supporting plugins. This is important for startup
-# performance.
-ifeq ($(CLANG_PLUGIN_SUPPORT), 1)
-NO_DEAD_STRIP := 1
-else
-TOOL_NO_EXPORTS := 1
-endif
-
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader bitwriter codegen \
-                   instrumentation ipo irreader linker objcarcopts option \
-                   profiledata selectiondag
-USEDLIBS = clangFrontendTool.a clangFrontend.a clangDriver.a \
-           clangSerialization.a clangCodeGen.a clangParse.a clangSema.a \
-           clangRewriteFrontend.a clangRewrite.a
-
-ifeq ($(ENABLE_CLANG_STATIC_ANALYZER),1)
-USEDLIBS += clangStaticAnalyzerFrontend.a clangStaticAnalyzerCheckers.a \
-            clangStaticAnalyzerCore.a
-endif
-
-ifeq ($(ENABLE_CLANG_ARCMT),1)
-USEDLIBS += clangARCMigrate.a
-endif
-
-USEDLIBS += clangAnalysis.a clangEdit.a clangAST.a clangLex.a \
-            clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
-
-# Set the tool version information values.
-ifeq ($(HOST_OS),Darwin)
-ifdef CLANG_VENDOR
-TOOL_INFO_NAME := $(CLANG_VENDOR) clang
-else
-TOOL_INFO_NAME := clang
-endif
-
-ifdef CLANG_VENDOR_UTI
-TOOL_INFO_UTI := $(CLANG_VENDOR_UTI)
-else
-TOOL_INFO_UTI := org.llvm.clang
-endif
-
-TOOL_INFO_VERSION := $(word 3,$(shell grep "CLANG_VERSION " \
-	$(PROJ_OBJ_DIR)/$(CLANG_LEVEL)/include/clang/Basic/Version.inc))
-ifdef LLVM_SUBMIT_VERSION
-TOOL_INFO_BUILD_VERSION := $(LLVM_SUBMIT_VERSION).$(LLVM_SUBMIT_SUBVERSION)
-else
-TOOL_INFO_BUILD_VERSION := 
-endif
-endif
diff --git a/tools/driver/cc1_main.cpp b/tools/driver/cc1_main.cpp
index 8240561..d78a31e 100644
--- a/tools/driver/cc1_main.cpp
+++ b/tools/driver/cc1_main.cpp
@@ -126,15 +126,9 @@
 
   // When running with -disable-free, don't do any destruction or shutdown.
   if (Clang->getFrontendOpts().DisableFree) {
-    if (llvm::AreStatisticsEnabled() || Clang->getFrontendOpts().ShowStats)
-      llvm::PrintStatistics();
     BuryPointer(std::move(Clang));
     return !Success;
   }
 
-  // Managed static deconstruction. Useful for making things like
-  // -time-passes usable.
-  llvm::llvm_shutdown();
-
   return !Success;
 }
diff --git a/tools/driver/cc1as_main.cpp b/tools/driver/cc1as_main.cpp
index bcc2cde..6b818a9 100644
--- a/tools/driver/cc1as_main.cpp
+++ b/tools/driver/cc1as_main.cpp
@@ -43,10 +43,8 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Host.h"
-#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -88,6 +86,7 @@
   unsigned SaveTemporaryLabels : 1;
   unsigned GenDwarfForAssembly : 1;
   unsigned CompressDebugSections : 1;
+  unsigned RelaxELFRelocations : 1;
   unsigned DwarfVersion;
   std::string DwarfDebugFlags;
   std::string DwarfDebugProducer;
@@ -200,6 +199,7 @@
   // Any DebugInfoKind implies GenDwarfForAssembly.
   Opts.GenDwarfForAssembly = Args.hasArg(OPT_debug_info_kind_EQ);
   Opts.CompressDebugSections = Args.hasArg(OPT_compress_debug_sections);
+  Opts.RelaxELFRelocations = Args.hasArg(OPT_mrelax_relocations);
   Opts.DwarfVersion = getLastArgIntValue(Args, OPT_dwarf_version_EQ, 2, Diags);
   Opts.DwarfDebugFlags = Args.getLastArgValue(OPT_dwarf_debug_flags);
   Opts.DwarfDebugProducer = Args.getLastArgValue(OPT_dwarf_debug_producer);
@@ -313,7 +313,9 @@
   // Ensure MCAsmInfo initialization occurs before any use, otherwise sections
   // may be created with a combination of default and explicit settings.
   if (Opts.CompressDebugSections)
-    MAI->setCompressDebugSections(true);
+    MAI->setCompressDebugSections(DebugCompressionType::DCT_ZlibGnu);
+
+  MAI->setRelaxELFRelocations(Opts.RelaxELFRelocations);
 
   bool IsBinary = Opts.OutputType == AssemblerInvocation::FT_Obj;
   std::unique_ptr<raw_fd_ostream> FDOS = getOutputStream(Opts, Diags, IsBinary);
@@ -326,19 +328,18 @@
 
   MCContext Ctx(MAI.get(), MRI.get(), MOFI.get(), &SrcMgr);
 
-  llvm::Reloc::Model RM = llvm::Reloc::Default;
+  bool PIC = false;
   if (Opts.RelocationModel == "static") {
-    RM = llvm::Reloc::Static;
+    PIC = false;
   } else if (Opts.RelocationModel == "pic") {
-    RM = llvm::Reloc::PIC_;
+    PIC = true;
   } else {
     assert(Opts.RelocationModel == "dynamic-no-pic" &&
            "Invalid PIC model!");
-    RM = llvm::Reloc::DynamicNoPIC;
+    PIC = false;
   }
 
-  MOFI->InitMCObjectFileInfo(Triple(Opts.Triple), RM,
-                             CodeModel::Default, Ctx);
+  MOFI->InitMCObjectFileInfo(Triple(Opts.Triple), PIC, CodeModel::Default, Ctx);
   if (Opts.SaveTemporaryLabels)
     Ctx.setAllowTemporaryLabels(false);
   if (Opts.GenDwarfForAssembly)
@@ -378,7 +379,8 @@
     MCAsmBackend *MAB = nullptr;
     if (Opts.ShowEncoding) {
       CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
-      MAB = TheTarget->createMCAsmBackend(*MRI, Opts.Triple, Opts.CPU);
+      MCTargetOptions Options;
+      MAB = TheTarget->createMCAsmBackend(*MRI, Opts.Triple, Opts.CPU, Options);
     }
     auto FOut = llvm::make_unique<formatted_raw_ostream>(*Out);
     Str.reset(TheTarget->createAsmStreamer(
@@ -395,8 +397,9 @@
     }
 
     MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
+    MCTargetOptions Options;
     MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, Opts.Triple,
-                                                      Opts.CPU);
+                                                      Opts.CPU, Options);
     Triple T(Opts.Triple);
     Str.reset(TheTarget->createMCObjectStreamer(
         T, Ctx, *MAB, *Out, CE, *STI, Opts.RelaxAll,
@@ -447,11 +450,6 @@
 }
 
 int cc1as_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
-  // Print a stack trace if we signal out.
-  sys::PrintStackTraceOnErrorSignal();
-  PrettyStackTraceProgram X(Argv.size(), Argv.data());
-  llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
-
   // Initialize targets and assembly printers/parsers.
   InitializeAllTargetInfos();
   InitializeAllTargetMCs();
diff --git a/tools/driver/driver.cpp b/tools/driver/driver.cpp
index 1a677ea..01512f1 100644
--- a/tools/driver/driver.cpp
+++ b/tools/driver/driver.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Basic/CharInfo.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
@@ -25,7 +24,6 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h"
@@ -37,7 +35,6 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Process.h"
@@ -49,6 +46,7 @@
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
+#include <set>
 #include <system_error>
 using namespace clang;
 using namespace clang::driver;
@@ -130,7 +128,7 @@
       }
     }
   } else if (Edit[0] == 'x' || Edit[0] == 'X') {
-    std::string Option = Edit.substr(1, std::string::npos);
+    auto Option = Edit.substr(1);
     for (unsigned i = 1; i < Args.size();) {
       if (Option == Args[i]) {
         OS << "### Deleting argument " << Args[i] << '\n';
@@ -312,8 +310,9 @@
 }
 
 int main(int argc_, const char **argv_) {
-  llvm::sys::PrintStackTraceOnErrorSignal();
+  llvm::sys::PrintStackTraceOnErrorSignal(argv_[0]);
   llvm::PrettyStackTraceProgram X(argc_, argv_);
+  llvm::llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
 
   if (llvm::sys::Process::FixupStandardFileDescriptors())
     return 1;
@@ -342,18 +341,33 @@
   // have to manually search for a --driver-mode=cl argument the hard way.
   // Finally, our -cc1 tools don't care which tokenization mode we use because
   // response files written by clang will tokenize the same way in either mode.
-  llvm::cl::TokenizerCallback Tokenizer = &llvm::cl::TokenizeGNUCommandLine;
+  bool ClangCLMode = false;
   if (TargetAndMode.second == "--driver-mode=cl" ||
       std::find_if(argv.begin(), argv.end(), [](const char *F) {
         return F && strcmp(F, "--driver-mode=cl") == 0;
       }) != argv.end()) {
-    Tokenizer = &llvm::cl::TokenizeWindowsCommandLine;
+    ClangCLMode = true;
+  }
+  enum { Default, POSIX, Windows } RSPQuoting = Default;
+  for (const char *F : argv) {
+    if (strcmp(F, "--rsp-quoting=posix") == 0)
+      RSPQuoting = POSIX;
+    else if (strcmp(F, "--rsp-quoting=windows") == 0)
+      RSPQuoting = Windows;
   }
 
   // Determines whether we want nullptr markers in argv to indicate response
-  // files end-of-lines. We only use this for the /LINK driver argument.
-  bool MarkEOLs = true;
-  if (argv.size() > 1 && StringRef(argv[1]).startswith("-cc1"))
+  // files end-of-lines. We only use this for the /LINK driver argument with
+  // clang-cl.exe on Windows.
+  bool MarkEOLs = ClangCLMode;
+
+  llvm::cl::TokenizerCallback Tokenizer;
+  if (RSPQuoting == Windows || (RSPQuoting == Default && ClangCLMode))
+    Tokenizer = &llvm::cl::TokenizeWindowsCommandLine;
+  else
+    Tokenizer = &llvm::cl::TokenizeGNUCommandLine;
+
+  if (MarkEOLs && argv.size() > 1 && StringRef(argv[1]).startswith("-cc1"))
     MarkEOLs = false;
   llvm::cl::ExpandResponseFiles(Saver, Tokenizer, argv, MarkEOLs);
 
@@ -486,8 +500,6 @@
   // results now.  This happens in -disable-free mode.
   llvm::TimerGroup::printAll(llvm::errs());
 
-  llvm::llvm_shutdown();
-
 #ifdef LLVM_ON_WIN32
   // Exit status should not be negative on Win32, unless abnormal termination.
   // Once abnormal termiation was caught, negative status should not be
diff --git a/tools/libclang/CIndex.cpp b/tools/libclang/CIndex.cpp
index 33f6e20..4a35929 100644
--- a/tools/libclang/CIndex.cpp
+++ b/tools/libclang/CIndex.cpp
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CIndexer.h"
 #include "CIndexDiagnostic.h"
+#include "CIndexer.h"
 #include "CLog.h"
 #include "CXCursor.h"
 #include "CXSourceLocation.h"
@@ -523,8 +523,10 @@
           for (ASTUnit::top_level_iterator TL = CXXUnit->top_level_begin(),
                                         TLEnd = CXXUnit->top_level_end();
                TL != TLEnd; ++TL) {
-            if (Visit(MakeCXCursor(*TL, TU, RegionOfInterest), true))
-              return true;
+            const Optional<bool> V = handleDeclForVisitation(*TL);
+            if (!V.hasValue())
+              continue;
+            return V.getValue();
           }
         } else if (VisitDeclContext(
                                 CXXUnit->getASTContext().getTranslationUnitDecl()))
@@ -621,42 +623,50 @@
     Decl *D = *I;
     if (D->getLexicalDeclContext() != DC)
       continue;
-    CXCursor Cursor = MakeCXCursor(D, TU, RegionOfInterest);
-
-    // Ignore synthesized ivars here, otherwise if we have something like:
-    //   @synthesize prop = _prop;
-    // and '_prop' is not declared, we will encounter a '_prop' ivar before
-    // encountering the 'prop' synthesize declaration and we will think that
-    // we passed the region-of-interest.
-    if (ObjCIvarDecl *ivarD = dyn_cast<ObjCIvarDecl>(D)) {
-      if (ivarD->getSynthesize())
-        continue;
-    }
-
-    // FIXME: ObjCClassRef/ObjCProtocolRef for forward class/protocol
-    // declarations is a mismatch with the compiler semantics.
-    if (Cursor.kind == CXCursor_ObjCInterfaceDecl) {
-      ObjCInterfaceDecl *ID = cast<ObjCInterfaceDecl>(D);
-      if (!ID->isThisDeclarationADefinition())
-        Cursor = MakeCursorObjCClassRef(ID, ID->getLocation(), TU);
-
-    } else if (Cursor.kind == CXCursor_ObjCProtocolDecl) {
-      ObjCProtocolDecl *PD = cast<ObjCProtocolDecl>(D);
-      if (!PD->isThisDeclarationADefinition())
-        Cursor = MakeCursorObjCProtocolRef(PD, PD->getLocation(), TU);
-    }
-
-    const Optional<bool> &V = shouldVisitCursor(Cursor);
+    const Optional<bool> V = handleDeclForVisitation(D);
     if (!V.hasValue())
       continue;
-    if (!V.getValue())
-      return false;
-    if (Visit(Cursor, true))
-      return true;
+    return V.getValue();
   }
   return false;
 }
 
+Optional<bool> CursorVisitor::handleDeclForVisitation(const Decl *D) {
+  CXCursor Cursor = MakeCXCursor(D, TU, RegionOfInterest);
+
+  // Ignore synthesized ivars here, otherwise if we have something like:
+  //   @synthesize prop = _prop;
+  // and '_prop' is not declared, we will encounter a '_prop' ivar before
+  // encountering the 'prop' synthesize declaration and we will think that
+  // we passed the region-of-interest.
+  if (auto *ivarD = dyn_cast<ObjCIvarDecl>(D)) {
+    if (ivarD->getSynthesize())
+      return None; // Skipped: the calling loops continue on None.
+  }
+
+  // FIXME: ObjCClassRef/ObjCProtocolRef for forward class/protocol
+  // declarations is a mismatch with the compiler semantics.
+  if (Cursor.kind == CXCursor_ObjCInterfaceDecl) {
+    auto *ID = cast<ObjCInterfaceDecl>(D);
+    if (!ID->isThisDeclarationADefinition())
+      Cursor = MakeCursorObjCClassRef(ID, ID->getLocation(), TU);
+
+  } else if (Cursor.kind == CXCursor_ObjCProtocolDecl) {
+    auto *PD = cast<ObjCProtocolDecl>(D);
+    if (!PD->isThisDeclarationADefinition())
+      Cursor = MakeCursorObjCProtocolRef(PD, PD->getLocation(), TU);
+  }
+  // Result protocol: None = caller moves on, a bool = caller returns it.
+  const Optional<bool> V = shouldVisitCursor(Cursor);
+  if (!V.hasValue())
+    return None;
+  if (!V.getValue())
+    return false; // Propagated as the calling loop's return value.
+  if (Visit(Cursor, true))
+    return true;
+  return None; // Visit() returned false: the caller tries the next decl.
+}
+
 bool CursorVisitor::VisitTranslationUnitDecl(TranslationUnitDecl *D) {
   llvm_unreachable("Translation units are visited directly by Visit()");
 }
@@ -935,7 +945,7 @@
     if (Visit(TSInfo->getTypeLoc()))
       return true;
 
-  for (const auto *P : ND->params()) {
+  for (const auto *P : ND->parameters()) {
     if (Visit(MakeCXCursor(P, TU, RegionOfInterest)))
       return true;
   }
@@ -1230,6 +1240,14 @@
   return false;
 }
 
+bool CursorVisitor::VisitStaticAssertDecl(StaticAssertDecl *D) {
+  if (Visit(MakeCXCursor(D->getAssertExpr(), StmtParent, TU, RegionOfInterest)))
+    return true; // The asserted expression is visited first.
+  if (Visit(MakeCXCursor(D->getMessage(), StmtParent, TU, RegionOfInterest)))
+    return true; // Then the message expression.
+  return false; // Neither sub-cursor visit returned true.
+}
+
 bool CursorVisitor::VisitDeclarationNameInfo(DeclarationNameInfo Name) {
   switch (Name.getName().getNameKind()) {
   case clang::DeclarationName::Identifier:
@@ -1454,18 +1472,9 @@
   case BuiltinType::Void:
   case BuiltinType::NullPtr:
   case BuiltinType::Dependent:
-  case BuiltinType::OCLImage1d:
-  case BuiltinType::OCLImage1dArray:
-  case BuiltinType::OCLImage1dBuffer:
-  case BuiltinType::OCLImage2d:
-  case BuiltinType::OCLImage2dArray:
-  case BuiltinType::OCLImage2dDepth:
-  case BuiltinType::OCLImage2dArrayDepth:
-  case BuiltinType::OCLImage2dMSAA:
-  case BuiltinType::OCLImage2dArrayMSAA:
-  case BuiltinType::OCLImage2dMSAADepth:
-  case BuiltinType::OCLImage2dArrayMSAADepth:
-  case BuiltinType::OCLImage3d:
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
+  case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
   case BuiltinType::OCLEvent:
   case BuiltinType::OCLClkEvent:
@@ -1951,10 +1960,24 @@
   void VisitOMPAtomicDirective(const OMPAtomicDirective *D);
   void VisitOMPTargetDirective(const OMPTargetDirective *D);
   void VisitOMPTargetDataDirective(const OMPTargetDataDirective *D);
+  void VisitOMPTargetEnterDataDirective(const OMPTargetEnterDataDirective *D);
+  void VisitOMPTargetExitDataDirective(const OMPTargetExitDataDirective *D);
+  void VisitOMPTargetParallelDirective(const OMPTargetParallelDirective *D);
+  void
+  VisitOMPTargetParallelForDirective(const OMPTargetParallelForDirective *D);
   void VisitOMPTeamsDirective(const OMPTeamsDirective *D);
   void VisitOMPTaskLoopDirective(const OMPTaskLoopDirective *D);
   void VisitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective *D);
   void VisitOMPDistributeDirective(const OMPDistributeDirective *D);
+  void VisitOMPDistributeParallelForDirective(
+      const OMPDistributeParallelForDirective *D);
+  void VisitOMPDistributeParallelForSimdDirective(
+      const OMPDistributeParallelForSimdDirective *D);
+  void VisitOMPDistributeSimdDirective(const OMPDistributeSimdDirective *D);
+  void VisitOMPTargetParallelForSimdDirective(
+      const OMPTargetParallelForSimdDirective *D);
+  void VisitOMPTargetSimdDirective(const OMPTargetSimdDirective *D);
+  void VisitOMPTeamsDistributeDirective(const OMPTeamsDistributeDirective *D);
 
 private:
   void AddDeclarationNameInfo(const Stmt *S);
@@ -2025,8 +2048,21 @@
 #define OPENMP_CLAUSE(Name, Class)                                             \
   void Visit##Class(const Class *C);
 #include "clang/Basic/OpenMPKinds.def"
+  void VisitOMPClauseWithPreInit(const OMPClauseWithPreInit *C);
+  void VisitOMPClauseWithPostUpdate(const OMPClauseWithPostUpdate *C);
 };
 
+void OMPClauseEnqueue::VisitOMPClauseWithPreInit(
+    const OMPClauseWithPreInit *C) {
+  Visitor->AddStmt(C->getPreInitStmt());
+}
+
+void OMPClauseEnqueue::VisitOMPClauseWithPostUpdate(
+    const OMPClauseWithPostUpdate *C) {
+  VisitOMPClauseWithPreInit(C);
+  Visitor->AddStmt(C->getPostUpdateExpr());
+}
+
 void OMPClauseEnqueue::VisitOMPIfClause(const OMPIfClause *C) {
   Visitor->AddStmt(C->getCondition());
 }
@@ -2056,8 +2092,8 @@
 void OMPClauseEnqueue::VisitOMPProcBindClause(const OMPProcBindClause *C) { }
 
 void OMPClauseEnqueue::VisitOMPScheduleClause(const OMPScheduleClause *C) {
+  VisitOMPClauseWithPreInit(C);
   Visitor->AddStmt(C->getChunkSize());
-  Visitor->AddStmt(C->getHelperChunkSize());
 }
 
 void OMPClauseEnqueue::VisitOMPOrderedClause(const OMPOrderedClause *C) {
@@ -2130,10 +2166,18 @@
 void OMPClauseEnqueue::VisitOMPFirstprivateClause(
                                         const OMPFirstprivateClause *C) {
   VisitOMPClauseList(C);
+  VisitOMPClauseWithPreInit(C);
+  for (const auto *E : C->private_copies()) {
+    Visitor->AddStmt(E);
+  }
+  for (const auto *E : C->inits()) {
+    Visitor->AddStmt(E);
+  }
 }
 void OMPClauseEnqueue::VisitOMPLastprivateClause(
                                         const OMPLastprivateClause *C) {
   VisitOMPClauseList(C);
+  VisitOMPClauseWithPostUpdate(C);
   for (auto *E : C->private_copies()) {
     Visitor->AddStmt(E);
   }
@@ -2152,6 +2196,7 @@
 }
 void OMPClauseEnqueue::VisitOMPReductionClause(const OMPReductionClause *C) {
   VisitOMPClauseList(C);
+  VisitOMPClauseWithPostUpdate(C);
   for (auto *E : C->privates()) {
     Visitor->AddStmt(E);
   }
@@ -2167,6 +2212,7 @@
 }
 void OMPClauseEnqueue::VisitOMPLinearClause(const OMPLinearClause *C) {
   VisitOMPClauseList(C);
+  VisitOMPClauseWithPostUpdate(C);
   for (const auto *E : C->privates()) {
     Visitor->AddStmt(E);
   }
@@ -2222,8 +2268,22 @@
 }
 void OMPClauseEnqueue::VisitOMPDistScheduleClause(
     const OMPDistScheduleClause *C) {
+  VisitOMPClauseWithPreInit(C);
   Visitor->AddStmt(C->getChunkSize());
-  Visitor->AddStmt(C->getHelperChunkSize());
+}
+void OMPClauseEnqueue::VisitOMPDefaultmapClause(
+    const OMPDefaultmapClause * /*C*/) {}
+void OMPClauseEnqueue::VisitOMPToClause(const OMPToClause *C) {
+  VisitOMPClauseList(C);
+}
+void OMPClauseEnqueue::VisitOMPFromClause(const OMPFromClause *C) {
+  VisitOMPClauseList(C);
+}
+void OMPClauseEnqueue::VisitOMPUseDevicePtrClause(const OMPUseDevicePtrClause *C) {
+  VisitOMPClauseList(C);
+}
+void OMPClauseEnqueue::VisitOMPIsDevicePtrClause(const OMPIsDevicePtrClause *C) {
+  VisitOMPClauseList(C);
 }
 }
 
@@ -2365,21 +2425,20 @@
 }
 void EnqueueVisitor::VisitDesignatedInitExpr(const DesignatedInitExpr *E) {
   AddStmt(E->getInit());
-  for (DesignatedInitExpr::const_reverse_designators_iterator
-         D = E->designators_rbegin(), DEnd = E->designators_rend();
-         D != DEnd; ++D) {
-    if (D->isFieldDesignator()) {
-      if (FieldDecl *Field = D->getField())
-        AddMemberRef(Field, D->getFieldLoc());
+  for (const DesignatedInitExpr::Designator &D :
+       llvm::reverse(E->designators())) {
+    if (D.isFieldDesignator()) {
+      if (FieldDecl *Field = D.getField())
+        AddMemberRef(Field, D.getFieldLoc());
       continue;
     }
-    if (D->isArrayDesignator()) {
-      AddStmt(E->getArrayIndex(*D));
+    if (D.isArrayDesignator()) {
+      AddStmt(E->getArrayIndex(D));
       continue;
     }
-    assert(D->isArrayRangeDesignator() && "Unknown designator kind");
-    AddStmt(E->getArrayRangeEnd(*D));
-    AddStmt(E->getArrayRangeStart(*D));
+    assert(D.isArrayRangeDesignator() && "Unknown designator kind");
+    AddStmt(E->getArrayRangeEnd(D));
+    AddStmt(E->getArrayRangeStart(D));
   }
 }
 void EnqueueVisitor::VisitExplicitCastExpr(const ExplicitCastExpr *E) {
@@ -2631,6 +2690,26 @@
   VisitOMPExecutableDirective(D);
 }
 
+void EnqueueVisitor::VisitOMPTargetEnterDataDirective(
+    const OMPTargetEnterDataDirective *D) {
+  VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPTargetExitDataDirective(
+    const OMPTargetExitDataDirective *D) {
+  VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPTargetParallelDirective(
+    const OMPTargetParallelDirective *D) {
+  VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPTargetParallelForDirective(
+    const OMPTargetParallelForDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
 void EnqueueVisitor::VisitOMPTeamsDirective(const OMPTeamsDirective *D) {
   VisitOMPExecutableDirective(D);
 }
@@ -2658,6 +2737,36 @@
   VisitOMPLoopDirective(D);
 }
 
+void EnqueueVisitor::VisitOMPDistributeParallelForDirective(
+    const OMPDistributeParallelForDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPDistributeParallelForSimdDirective(
+    const OMPDistributeParallelForSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPDistributeSimdDirective(
+    const OMPDistributeSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPTargetParallelForSimdDirective(
+    const OMPTargetParallelForSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPTargetSimdDirective(
+    const OMPTargetSimdDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPTeamsDistributeDirective(
+    const OMPTeamsDistributeDirective *D) {
+  VisitOMPLoopDirective(D);
+}
+
 void CursorVisitor::EnqueueWorkList(VisitorWorkList &WL, const Stmt *S) {
   EnqueueVisitor(WL, MakeCXCursor(S, StmtParent, TU,RegionOfInterest)).Visit(S);
 }
@@ -3368,26 +3477,23 @@
   return S;
 }
 
-typedef struct {
+struct ExprEvalResult {
   CXEvalResultKind EvalType;
   union {
     int intVal;
     double floatVal;
     char *stringVal;
   } EvalData;
-} ExprEvalResult;
+  // String kinds own a new char[] buffer (see evaluateExpr): use delete[].
+  ~ExprEvalResult() {
+    if (EvalType != CXEval_UnExposed && EvalType != CXEval_Float &&
+        EvalType != CXEval_Int)
+      delete[] EvalData.stringVal;
+  }
+};
 
 void clang_EvalResult_dispose(CXEvalResult E) {
-  ExprEvalResult *ER = (ExprEvalResult *)E;
-  if (ER) {
-    CXEvalResultKind evalType = ER->EvalType;
-
-    if (evalType != CXEval_UnExposed &&  evalType != CXEval_Float &&
-            evalType != CXEval_Int && ER->EvalData.stringVal) {
-            free((void *) ER->EvalData.stringVal);
-    }
-    free((void *)ER);
-  }
+  delete static_cast<ExprEvalResult *>(E); // No-op when E is null.
 }
 
 CXEvalResultKind clang_EvalResult_getKind(CXEvalResult E) {
@@ -3421,156 +3527,140 @@
 static const ExprEvalResult* evaluateExpr(Expr *expr, CXCursor C) {
   Expr::EvalResult ER;
   ASTContext &ctx = getCursorContext(C);
-  if (!expr) {
+  if (!expr)
     return nullptr;
-  }
+
   expr = expr->IgnoreParens();
-  bool res = expr->EvaluateAsRValue(ER, ctx);
+  if (!expr->EvaluateAsRValue(ER, ctx))
+    return nullptr;
+
   QualType rettype;
   CallExpr *callExpr;
-  ExprEvalResult *result = (ExprEvalResult *) malloc(sizeof(ExprEvalResult));
-  if (!result) {
-    return nullptr;
-  }
+  auto result = llvm::make_unique<ExprEvalResult>();
   result->EvalType = CXEval_UnExposed;
 
-  if (res) {
+  if (ER.Val.isInt()) {
+    result->EvalType = CXEval_Int;
+    result->EvalData.intVal = ER.Val.getInt().getExtValue();
+    return result.release();
+  }
 
-    if (ER.Val.isInt()) {
-      result->EvalType = CXEval_Int;
-      result->EvalData.intVal = ER.Val.getInt().getExtValue();
-      return result;
-    } else if (ER.Val.isFloat()) {
+  if (ER.Val.isFloat()) {
+    llvm::SmallVector<char, 100> Buffer;
+    ER.Val.getFloat().toString(Buffer);
+    std::string floatStr(Buffer.data(), Buffer.size());
+    result->EvalType = CXEval_Float;
+    bool ignored;
+    llvm::APFloat apFloat = ER.Val.getFloat();
+    apFloat.convert(llvm::APFloat::IEEEdouble,
+                    llvm::APFloat::rmNearestTiesToEven, &ignored);
+    result->EvalData.floatVal = apFloat.convertToDouble();
+    return result.release();
+  }
 
-      llvm::SmallVector<char, 100> Buffer;
-      ER.Val.getFloat().toString(Buffer);
-      std::string floatStr(Buffer.data(), Buffer.size());
-      result->EvalType = CXEval_Float;
-      bool ignored;
-      llvm::APFloat apFloat = ER.Val.getFloat();
-      apFloat.convert(llvm::APFloat::IEEEdouble,
-                      llvm::APFloat::rmNearestTiesToEven, &ignored);
-      result->EvalData.floatVal = apFloat.convertToDouble();
-      return result;
-
-    } else if (expr->getStmtClass() == Stmt::ImplicitCastExprClass) {
-
-      const ImplicitCastExpr *I = dyn_cast<ImplicitCastExpr>(expr);
-      auto *subExpr = I->getSubExprAsWritten();
-      if (subExpr->getStmtClass() == Stmt::StringLiteralClass ||
-          subExpr->getStmtClass() == Stmt::ObjCStringLiteralClass) {
-
-        const StringLiteral *StrE = nullptr;
-        const ObjCStringLiteral *ObjCExpr;
-        ObjCExpr = dyn_cast<ObjCStringLiteral>(subExpr);
-
-        if (ObjCExpr) {
-          StrE = ObjCExpr->getString();
-          result->EvalType = CXEval_ObjCStrLiteral;
-        } else {
-          StrE = cast<StringLiteral>(I->getSubExprAsWritten());
-          result->EvalType = CXEval_StrLiteral;
-        }
-
-        std::string strRef(StrE->getString().str());
-        result->EvalData.stringVal = (char *)malloc(strRef.size()+1);
-        strncpy((char*)result->EvalData.stringVal, strRef.c_str(),
-                   strRef.size());
-        result->EvalData.stringVal[strRef.size()] = '\0';
-        return result;
-      }
-
-    } else if (expr->getStmtClass() == Stmt::ObjCStringLiteralClass ||
-             expr->getStmtClass() == Stmt::StringLiteralClass) {
-
+  if (expr->getStmtClass() == Stmt::ImplicitCastExprClass) {
+    const ImplicitCastExpr *I = dyn_cast<ImplicitCastExpr>(expr);
+    auto *subExpr = I->getSubExprAsWritten();
+    if (subExpr->getStmtClass() == Stmt::StringLiteralClass ||
+        subExpr->getStmtClass() == Stmt::ObjCStringLiteralClass) {
       const StringLiteral *StrE = nullptr;
       const ObjCStringLiteral *ObjCExpr;
-      ObjCExpr = dyn_cast<ObjCStringLiteral>(expr);
+      ObjCExpr = dyn_cast<ObjCStringLiteral>(subExpr);
 
       if (ObjCExpr) {
         StrE = ObjCExpr->getString();
         result->EvalType = CXEval_ObjCStrLiteral;
       } else {
-        StrE = cast<StringLiteral>(expr);
+        StrE = cast<StringLiteral>(I->getSubExprAsWritten());
         result->EvalType = CXEval_StrLiteral;
       }
 
       std::string strRef(StrE->getString().str());
-      result->EvalData.stringVal = (char *)malloc(strRef.size()+1);
-      strncpy((char*)result->EvalData.stringVal, strRef.c_str(),
-                  strRef.size());
+      result->EvalData.stringVal = new char[strRef.size() + 1];
+      strncpy((char *)result->EvalData.stringVal, strRef.c_str(),
+              strRef.size());
       result->EvalData.stringVal[strRef.size()] = '\0';
-      return result;
+      return result.release();
+    }
+  } else if (expr->getStmtClass() == Stmt::ObjCStringLiteralClass ||
+             expr->getStmtClass() == Stmt::StringLiteralClass) {
+    const StringLiteral *StrE = nullptr;
+    const ObjCStringLiteral *ObjCExpr;
+    ObjCExpr = dyn_cast<ObjCStringLiteral>(expr);
 
-    } else if (expr->getStmtClass() == Stmt::CStyleCastExprClass) {
+    if (ObjCExpr) {
+      StrE = ObjCExpr->getString();
+      result->EvalType = CXEval_ObjCStrLiteral;
+    } else {
+      StrE = cast<StringLiteral>(expr);
+      result->EvalType = CXEval_StrLiteral;
+    }
 
-      CStyleCastExpr *CC = static_cast<CStyleCastExpr *>(expr);
+    std::string strRef(StrE->getString().str());
+    result->EvalData.stringVal = new char[strRef.size() + 1];
+    strncpy((char *)result->EvalData.stringVal, strRef.c_str(), strRef.size());
+    result->EvalData.stringVal[strRef.size()] = '\0';
+    return result.release();
+  }
 
-      rettype = CC->getType();
-      if (rettype.getAsString() == "CFStringRef" &&
-            CC->getSubExpr()->getStmtClass() == Stmt::CallExprClass) {
+  if (expr->getStmtClass() == Stmt::CStyleCastExprClass) {
+    CStyleCastExpr *CC = static_cast<CStyleCastExpr *>(expr);
 
-        callExpr = static_cast<CallExpr *>(CC->getSubExpr());
-        StringLiteral* S = getCFSTR_value(callExpr);
-        if (S) {
-          std::string strLiteral(S->getString().str());
-          result->EvalType = CXEval_CFStr;
+    rettype = CC->getType();
+    if (rettype.getAsString() == "CFStringRef" &&
+        CC->getSubExpr()->getStmtClass() == Stmt::CallExprClass) {
 
-          result->EvalData.stringVal = (char *)malloc(strLiteral.size()+1);
-          strncpy((char*)result->EvalData.stringVal, strLiteral.c_str(),
-                     strLiteral.size());
-          result->EvalData.stringVal[strLiteral.size()] = '\0';
-          return result;
-        }
-      }
+      callExpr = static_cast<CallExpr *>(CC->getSubExpr());
+      StringLiteral *S = getCFSTR_value(callExpr);
+      if (S) {
+        std::string strLiteral(S->getString().str());
+        result->EvalType = CXEval_CFStr;
 
-    } else if (expr->getStmtClass() == Stmt::CallExprClass) {
-
-      callExpr = static_cast<CallExpr *>(expr);
-      rettype = callExpr->getCallReturnType(ctx);
-
-      if (rettype->isVectorType() || callExpr->getNumArgs() > 1) {
-        clang_EvalResult_dispose((CXEvalResult *)result);
-        return nullptr;
-      }
-      if (rettype->isIntegralType(ctx) || rettype->isRealFloatingType()) {
-        if(callExpr->getNumArgs() == 1 &&
-              !callExpr->getArg(0)->getType()->isIntegralType(ctx)) {
-          clang_EvalResult_dispose((CXEvalResult *)result);
-          return nullptr;
-        }
-      } else if(rettype.getAsString() == "CFStringRef") {
-
-        StringLiteral* S = getCFSTR_value(callExpr);
-        if (S) {
-          std::string strLiteral(S->getString().str());
-          result->EvalType = CXEval_CFStr;
-          result->EvalData.stringVal = (char *)malloc(strLiteral.size()+1);
-          strncpy((char*)result->EvalData.stringVal, strLiteral.c_str(),
-                     strLiteral.size());
-          result->EvalData.stringVal[strLiteral.size()] = '\0';
-          return result;
-        }
-      }
-
-    } else if (expr->getStmtClass() == Stmt::DeclRefExprClass) {
-
-      DeclRefExpr *D = static_cast<DeclRefExpr *>(expr);
-      ValueDecl *V = D->getDecl();
-      if (V->getKind() == Decl::Function) {
-        std::string strName(V->getNameAsString());
-        result->EvalType = CXEval_Other;
-        result->EvalData.stringVal = (char *)malloc(strName.size()+1);
-        strncpy((char*)result->EvalData.stringVal, strName.c_str(),
-                   strName.size());
-        result->EvalData.stringVal[strName.size()] = '\0';
-        return result;
+        result->EvalData.stringVal = new char[strLiteral.size() + 1];
+        strncpy((char *)result->EvalData.stringVal, strLiteral.c_str(),
+                strLiteral.size());
+        result->EvalData.stringVal[strLiteral.size()] = '\0';
+        return result.release();
       }
     }
 
+  } else if (expr->getStmtClass() == Stmt::CallExprClass) {
+    callExpr = static_cast<CallExpr *>(expr);
+    rettype = callExpr->getCallReturnType(ctx);
+
+    if (rettype->isVectorType() || callExpr->getNumArgs() > 1)
+      return nullptr;
+
+    if (rettype->isIntegralType(ctx) || rettype->isRealFloatingType()) {
+      if (callExpr->getNumArgs() == 1 &&
+          !callExpr->getArg(0)->getType()->isIntegralType(ctx))
+        return nullptr;
+    } else if (rettype.getAsString() == "CFStringRef") {
+
+      StringLiteral *S = getCFSTR_value(callExpr);
+      if (S) {
+        std::string strLiteral(S->getString().str());
+        result->EvalType = CXEval_CFStr;
+        result->EvalData.stringVal = new char[strLiteral.size() + 1];
+        strncpy((char *)result->EvalData.stringVal, strLiteral.c_str(),
+                strLiteral.size());
+        result->EvalData.stringVal[strLiteral.size()] = '\0';
+        return result.release();
+      }
+    }
+  } else if (expr->getStmtClass() == Stmt::DeclRefExprClass) {
+    DeclRefExpr *D = static_cast<DeclRefExpr *>(expr);
+    ValueDecl *V = D->getDecl();
+    if (V->getKind() == Decl::Function) {
+      std::string strName = V->getNameAsString();
+      result->EvalType = CXEval_Other;
+      result->EvalData.stringVal = new char[strName.size() + 1];
+      strncpy(result->EvalData.stringVal, strName.c_str(), strName.size());
+      result->EvalData.stringVal[strName.size()] = '\0';
+      return result.release();
+    }
   }
 
-  clang_EvalResult_dispose((CXEvalResult *)result);
   return nullptr;
 }
 
@@ -3913,6 +4003,9 @@
   if (const CXXConstructExpr *CE = dyn_cast<CXXConstructExpr>(E))
     if (!CE->isElidable())
     return CE->getConstructor();
+  if (const CXXInheritedCtorInitExpr *CE =
+          dyn_cast<CXXInheritedCtorInitExpr>(E))
+    return CE->getConstructor();
   if (const ObjCMessageExpr *OME = dyn_cast<ObjCMessageExpr>(E))
     return OME->getMethodDecl();
 
@@ -4390,10 +4483,8 @@
     SmallString<128> Str;
     llvm::raw_svector_ostream OS(Str);
     OS << *ClassSpec;
-    TemplateSpecializationType::PrintTemplateArgumentList(OS,
-                                      ClassSpec->getTemplateArgs().data(),
-                                      ClassSpec->getTemplateArgs().size(),
-                                                                Policy);
+    TemplateSpecializationType::PrintTemplateArgumentList(
+        OS, ClassSpec->getTemplateArgs().asArray(), Policy);
     return cxstring::createDup(OS.str());
   }
   
@@ -4532,6 +4623,8 @@
       return cxstring::createRef("ObjCStringLiteral");
   case CXCursor_ObjCBoolLiteralExpr:
       return cxstring::createRef("ObjCBoolLiteralExpr");
+  case CXCursor_ObjCAvailabilityCheckExpr:
+      return cxstring::createRef("ObjCAvailabilityCheckExpr");
   case CXCursor_ObjCSelfExpr:
       return cxstring::createRef("ObjCSelfExpr");
   case CXCursor_ObjCEncodeExpr:
@@ -4768,6 +4861,16 @@
     return cxstring::createRef("OMPTargetDirective");
   case CXCursor_OMPTargetDataDirective:
     return cxstring::createRef("OMPTargetDataDirective");
+  case CXCursor_OMPTargetEnterDataDirective:
+    return cxstring::createRef("OMPTargetEnterDataDirective");
+  case CXCursor_OMPTargetExitDataDirective:
+    return cxstring::createRef("OMPTargetExitDataDirective");
+  case CXCursor_OMPTargetParallelDirective:
+    return cxstring::createRef("OMPTargetParallelDirective");
+  case CXCursor_OMPTargetParallelForDirective:
+    return cxstring::createRef("OMPTargetParallelForDirective");
+  case CXCursor_OMPTargetUpdateDirective:
+    return cxstring::createRef("OMPTargetUpdateDirective");
   case CXCursor_OMPTeamsDirective:
     return cxstring::createRef("OMPTeamsDirective");
   case CXCursor_OMPCancellationPointDirective:
@@ -4780,10 +4883,24 @@
     return cxstring::createRef("OMPTaskLoopSimdDirective");
   case CXCursor_OMPDistributeDirective:
     return cxstring::createRef("OMPDistributeDirective");
+  case CXCursor_OMPDistributeParallelForDirective:
+    return cxstring::createRef("OMPDistributeParallelForDirective");
+  case CXCursor_OMPDistributeParallelForSimdDirective:
+    return cxstring::createRef("OMPDistributeParallelForSimdDirective");
+  case CXCursor_OMPDistributeSimdDirective:
+    return cxstring::createRef("OMPDistributeSimdDirective");
+  case CXCursor_OMPTargetParallelForSimdDirective:
+    return cxstring::createRef("OMPTargetParallelForSimdDirective");
+  case CXCursor_OMPTargetSimdDirective:
+    return cxstring::createRef("OMPTargetSimdDirective");
+  case CXCursor_OMPTeamsDistributeDirective:
+    return cxstring::createRef("OMPTeamsDistributeDirective");
   case CXCursor_OverloadCandidate:
       return cxstring::createRef("OverloadCandidate");
   case CXCursor_TypeAliasTemplateDecl:
       return cxstring::createRef("TypeAliasTemplateDecl");
+  case CXCursor_StaticAssert:
+      return cxstring::createRef("StaticAssert");
   }
 
   llvm_unreachable("Unhandled CXCursorKind");
@@ -5505,6 +5622,7 @@
   case Decl::TemplateTypeParm:
   case Decl::EnumConstant:
   case Decl::Field:
+  case Decl::Binding:
   case Decl::MSProperty:
   case Decl::IndirectField:
   case Decl::ObjCIvar:
@@ -5522,12 +5640,16 @@
   case Decl::StaticAssert:
   case Decl::Block:
   case Decl::Captured:
+  case Decl::OMPCapturedExpr:
   case Decl::Label:  // FIXME: Is this right??
   case Decl::ClassScopeFunctionSpecialization:
   case Decl::Import:
   case Decl::OMPThreadPrivate:
+  case Decl::OMPDeclareReduction:
   case Decl::ObjCTypeParam:
   case Decl::BuiltinTemplate:
+  case Decl::PragmaComment:
+  case Decl::PragmaDetectMismatch:
     return C;
 
   // Declaration kinds that don't make any sense here, but are
@@ -5571,7 +5693,8 @@
 
   case Decl::Var:
   case Decl::VarTemplateSpecialization:
-  case Decl::VarTemplatePartialSpecialization: {
+  case Decl::VarTemplatePartialSpecialization:
+  case Decl::Decomposition: {
     // Ask the variable if it has a definition.
     if (const VarDecl *Def = cast<VarDecl>(D)->getDefinition())
       return MakeCXCursor(Def, TU);
@@ -5605,6 +5728,7 @@
                                        D->getLocation(), TU);
 
   case Decl::UsingShadow:
+  case Decl::ConstructorUsingShadow:
     return clang_getCursorDefinition(
                        MakeCXCursor(cast<UsingShadowDecl>(D)->getTargetDecl(),
                                     TU));
@@ -5828,7 +5952,8 @@
 }
 
 void clang_enableStackTraces(void) {
-  llvm::sys::PrintStackTraceOnErrorSignal();
+  // FIXME: Provide an argv0 here so we can find llvm-symbolizer.
+  llvm::sys::PrintStackTraceOnErrorSignal(StringRef());
 }
 
 void clang_executeOnThread(void (*fn)(void*), void *user_data,
@@ -6226,7 +6351,7 @@
         if (Method->getObjCDeclQualifier())
           HasContextSensitiveKeywords = true;
         else {
-          for (const auto *P : Method->params()) {
+          for (const auto *P : Method->parameters()) {
             if (P->getObjCDeclQualifier()) {
               HasContextSensitiveKeywords = true;
               break;
@@ -7328,6 +7453,48 @@
 //===----------------------------------------------------------------------===//
 
 extern "C" {
+
+unsigned clang_CXXConstructor_isDefaultConstructor(CXCursor C) {
+  if (!clang_isDeclaration(C.kind))
+    return 0; // Non-declaration cursors never match.
+
+  const Decl *D = cxcursor::getCursorDecl(C); // Null-checked below.
+  const CXXConstructorDecl *Constructor =
+      D ? dyn_cast_or_null<CXXConstructorDecl>(D->getAsFunction()) : nullptr;
+  return (Constructor && Constructor->isDefaultConstructor()) ? 1 : 0;
+}
+
+unsigned clang_CXXConstructor_isCopyConstructor(CXCursor C) {
+  if (!clang_isDeclaration(C.kind))
+    return 0; // Non-declaration cursors never match.
+
+  const Decl *D = cxcursor::getCursorDecl(C); // Null-checked below.
+  const CXXConstructorDecl *Constructor =
+      D ? dyn_cast_or_null<CXXConstructorDecl>(D->getAsFunction()) : nullptr;
+  return (Constructor && Constructor->isCopyConstructor()) ? 1 : 0;
+}
+
+unsigned clang_CXXConstructor_isMoveConstructor(CXCursor C) {
+  if (!clang_isDeclaration(C.kind))
+    return 0; // Non-declaration cursors never match.
+
+  const Decl *D = cxcursor::getCursorDecl(C); // Null-checked below.
+  const CXXConstructorDecl *Constructor =
+      D ? dyn_cast_or_null<CXXConstructorDecl>(D->getAsFunction()) : nullptr;
+  return (Constructor && Constructor->isMoveConstructor()) ? 1 : 0;
+}
+
+unsigned clang_CXXConstructor_isConvertingConstructor(CXCursor C) {
+  if (!clang_isDeclaration(C.kind))
+    return 0; // Non-declaration cursors never match.
+
+  const Decl *D = cxcursor::getCursorDecl(C); // Null-checked below.
+  const CXXConstructorDecl *Constructor =
+      D ? dyn_cast_or_null<CXXConstructorDecl>(D->getAsFunction()) : nullptr;
+  // Passing 'false' excludes constructors marked 'explicit'.
+  return (Constructor && Constructor->isConvertingConstructor(false)) ? 1 : 0;
+}
+
 unsigned clang_CXXField_isMutable(CXCursor C) {
   if (!clang_isDeclaration(C.kind))
     return 0;
@@ -7358,6 +7525,16 @@
   return (Method && (Method->getTypeQualifiers() & Qualifiers::Const)) ? 1 : 0;
 }
 
+unsigned clang_CXXMethod_isDefaulted(CXCursor C) {
+  if (!clang_isDeclaration(C.kind))
+    return 0; // Non-declaration cursors never match.
+
+  const Decl *D = cxcursor::getCursorDecl(C); // Null-checked below.
+  const CXXMethodDecl *Method =
+      D ? dyn_cast_or_null<CXXMethodDecl>(D->getAsFunction()) : nullptr;
+  return (Method && Method->isDefaulted()) ? 1 : 0;
+}
+
 unsigned clang_CXXMethod_isStatic(CXCursor C) {
   if (!clang_isDeclaration(C.kind))
     return 0;
@@ -7874,3 +8051,10 @@
     OS << "--------------------------------------------------\n";
   }
 }
+
+#ifdef CLANG_TOOL_EXTRA_BUILD
+// This anchor is used to force the linker to link the clang-tidy plugin.
+extern volatile int ClangTidyPluginAnchorSource;
+static int LLVM_ATTRIBUTE_UNUSED ClangTidyPluginAnchorDestination =
+    ClangTidyPluginAnchorSource;
+#endif
diff --git a/tools/libclang/CIndexDiagnostic.cpp b/tools/libclang/CIndexDiagnostic.cpp
index 9ba36a6..18031e5 100644
--- a/tools/libclang/CIndexDiagnostic.cpp
+++ b/tools/libclang/CIndexDiagnostic.cpp
@@ -16,13 +16,11 @@
 #include "CXSourceLocation.h"
 #include "CXString.h"
 
-#include "clang/Frontend/ASTUnit.h"
-#include "clang/Frontend/FrontendDiagnostic.h"
-#include "clang/Frontend/DiagnosticRenderer.h"
 #include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Frontend/ASTUnit.h"
+#include "clang/Frontend/DiagnosticRenderer.h"
+#include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
diff --git a/tools/libclang/CIndexInclusionStack.cpp b/tools/libclang/CIndexInclusionStack.cpp
index 0959374..5ebb5ec 100644
--- a/tools/libclang/CIndexInclusionStack.cpp
+++ b/tools/libclang/CIndexInclusionStack.cpp
@@ -17,8 +17,6 @@
 #include "CXTranslationUnit.h"
 #include "clang/AST/DeclVisitor.h"
 #include "clang/Frontend/ASTUnit.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
 static void getInclusions(const SrcMgr::SLocEntry &(SourceManager::*Getter)(unsigned, bool*) const, unsigned n,
diff --git a/tools/libclang/CIndexer.cpp b/tools/libclang/CIndexer.cpp
index 91154cc..694ed60 100644
--- a/tools/libclang/CIndexer.cpp
+++ b/tools/libclang/CIndexer.cpp
@@ -12,22 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "CIndexer.h"
-#include "clang/AST/Decl.h"
-#include "clang/AST/DeclVisitor.h"
-#include "clang/AST/StmtVisitor.h"
-#include "clang/Basic/FileManager.h"
-#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/LLVM.h"
 #include "clang/Basic/Version.h"
-#include "clang/Sema/CodeCompleteConsumer.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
-#include "llvm/Support/raw_ostream.h"
 #include <cstdio>
-#include <sstream>
-#include <vector>
 
 #ifdef __CYGWIN__
 #include <cygwin/version.h>
diff --git a/tools/libclang/CIndexer.h b/tools/libclang/CIndexer.h
index 94c27a0..b227f94 100644
--- a/tools/libclang/CIndexer.h
+++ b/tools/libclang/CIndexer.h
@@ -17,10 +17,8 @@
 
 #include "clang-c/Index.h"
 #include "clang/Frontend/PCHContainerOperations.h"
-#include "clang/Lex/ModuleLoader.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Path.h"
-#include <vector>
+#include "llvm/ADT/STLExtras.h"
+#include <utility>
 
 namespace llvm {
   class CrashRecoveryContext;
@@ -46,7 +44,8 @@
   CIndexer(std::shared_ptr<PCHContainerOperations> PCHContainerOps =
                std::make_shared<PCHContainerOperations>())
       : OnlyLocalDecls(false), DisplayDiagnostics(false),
-        Options(CXGlobalOpt_None), PCHContainerOps(PCHContainerOps) {}
+        Options(CXGlobalOpt_None), PCHContainerOps(std::move(PCHContainerOps)) {
+  }
 
   /// \brief Whether we only want to see "local" declarations (that did not
   /// come from a previous precompiled header). If false, we want to see all
diff --git a/tools/libclang/CMakeLists.txt b/tools/libclang/CMakeLists.txt
index 20a2db3..cf1deed 100644
--- a/tools/libclang/CMakeLists.txt
+++ b/tools/libclang/CMakeLists.txt
@@ -48,6 +48,11 @@
   list(APPEND LIBS clangARCMigrate)
 endif ()
 
+if (TARGET clangTidyPlugin)
+  add_definitions(-DCLANG_TOOL_EXTRA_BUILD)
+  list(APPEND LIBS clangTidyPlugin)
+endif ()
+
 find_library(DL_LIBRARY_PATH dl)
 if (DL_LIBRARY_PATH)
   list(APPEND LIBS dl)
diff --git a/tools/libclang/CXComment.cpp b/tools/libclang/CXComment.cpp
index 9cc05ed..c02eea9 100644
--- a/tools/libclang/CXComment.cpp
+++ b/tools/libclang/CXComment.cpp
@@ -11,15 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang-c/Index.h"
 #include "CXComment.h"
 #include "CXCursor.h"
 #include "CXString.h"
 #include "clang-c/Documentation.h"
+#include "clang-c/Index.h"
 #include "clang/AST/Decl.h"
 #include "clang/Index/CommentToXML.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <climits>
 
diff --git a/tools/libclang/CXCursor.cpp b/tools/libclang/CXCursor.cpp
index c766d2d..047f822 100644
--- a/tools/libclang/CXCursor.cpp
+++ b/tools/libclang/CXCursor.cpp
@@ -256,7 +256,6 @@
   case Stmt::PredefinedExprClass:
   case Stmt::ShuffleVectorExprClass:
   case Stmt::ConvertVectorExprClass:
-  case Stmt::UnaryExprOrTypeTraitExprClass:
   case Stmt::VAArgExprClass:
   case Stmt::ObjCArrayLiteralClass:
   case Stmt::ObjCDictionaryLiteralClass:
@@ -327,6 +326,7 @@
     K = CXCursor_UnaryOperator;
     break;
 
+  case Stmt::UnaryExprOrTypeTraitExprClass:
   case Stmt::CXXNoexceptExprClass:
     K = CXCursor_UnaryExpr;
     break;
@@ -447,7 +447,11 @@
   case Stmt::ObjCBoolLiteralExprClass:
     K = CXCursor_ObjCBoolLiteralExpr;
     break;
-      
+
+  case Stmt::ObjCAvailabilityCheckExprClass:
+    K = CXCursor_ObjCAvailabilityCheckExpr;
+    break;
+
   case Stmt::ObjCBridgedCastExprClass:
     K = CXCursor_ObjCBridgedCastExpr;
     break;
@@ -504,6 +508,7 @@
   case Stmt::CXXMemberCallExprClass:
   case Stmt::CUDAKernelCallExprClass:
   case Stmt::CXXConstructExprClass:  
+  case Stmt::CXXInheritedCtorInitExprClass:  
   case Stmt::CXXTemporaryObjectExprClass:
   case Stmt::CXXUnresolvedConstructExprClass:
   case Stmt::UserDefinedLiteralClass:
@@ -600,6 +605,21 @@
   case Stmt::OMPTargetDataDirectiveClass:
     K = CXCursor_OMPTargetDataDirective;
     break;
+  case Stmt::OMPTargetEnterDataDirectiveClass:
+    K = CXCursor_OMPTargetEnterDataDirective;
+    break;
+  case Stmt::OMPTargetExitDataDirectiveClass:
+    K = CXCursor_OMPTargetExitDataDirective;
+    break;
+  case Stmt::OMPTargetParallelDirectiveClass:
+    K = CXCursor_OMPTargetParallelDirective;
+    break;
+  case Stmt::OMPTargetParallelForDirectiveClass:
+    K = CXCursor_OMPTargetParallelForDirective;
+    break;
+  case Stmt::OMPTargetUpdateDirectiveClass:
+    K = CXCursor_OMPTargetUpdateDirective;
+    break;
   case Stmt::OMPTeamsDirectiveClass:
     K = CXCursor_OMPTeamsDirective;
     break;
@@ -618,6 +638,24 @@
   case Stmt::OMPDistributeDirectiveClass:
     K = CXCursor_OMPDistributeDirective;
     break;
+  case Stmt::OMPDistributeParallelForDirectiveClass:
+    K = CXCursor_OMPDistributeParallelForDirective;
+    break;
+  case Stmt::OMPDistributeParallelForSimdDirectiveClass:
+    K = CXCursor_OMPDistributeParallelForSimdDirective;
+    break;
+  case Stmt::OMPDistributeSimdDirectiveClass:
+    K = CXCursor_OMPDistributeSimdDirective;
+    break;
+  case Stmt::OMPTargetParallelForSimdDirectiveClass:
+    K = CXCursor_OMPTargetParallelForSimdDirective;
+    break;
+  case Stmt::OMPTargetSimdDirectiveClass:
+    K = CXCursor_OMPTargetSimdDirective;
+    break;
+  case Stmt::OMPTeamsDistributeDirectiveClass:
+    K = CXCursor_OMPTeamsDistributeDirective;
+    break;
   }
 
   CXCursor C = { K, 0, { Parent, S, TU } };
diff --git a/tools/libclang/CXIndexDataConsumer.h b/tools/libclang/CXIndexDataConsumer.h
index 308fa79..406831f 100644
--- a/tools/libclang/CXIndexDataConsumer.h
+++ b/tools/libclang/CXIndexDataConsumer.h
@@ -16,7 +16,6 @@
 #include "clang/AST/DeclGroup.h"
 #include "clang/AST/DeclObjC.h"
 #include "llvm/ADT/DenseSet.h"
-#include <deque>
 
 namespace clang {
   class FileEntry;
diff --git a/tools/libclang/CXLoadedDiagnostic.cpp b/tools/libclang/CXLoadedDiagnostic.cpp
index 2c10d34..c866717 100644
--- a/tools/libclang/CXLoadedDiagnostic.cpp
+++ b/tools/libclang/CXLoadedDiagnostic.cpp
@@ -18,13 +18,11 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Frontend/SerializedDiagnosticReader.h"
 #include "clang/Frontend/SerializedDiagnostics.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Bitcode/BitstreamReader.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
 
 using namespace clang;
 
diff --git a/tools/libclang/CXLoadedDiagnostic.h b/tools/libclang/CXLoadedDiagnostic.h
index d5006a4..1209d76 100644
--- a/tools/libclang/CXLoadedDiagnostic.h
+++ b/tools/libclang/CXLoadedDiagnostic.h
@@ -17,7 +17,6 @@
 #include "CIndexDiagnostic.h"
 #include "llvm/ADT/StringRef.h"
 #include "clang/Basic/LLVM.h"
-#include <string>
 #include <vector>
 
 namespace clang {
diff --git a/tools/libclang/CXSourceLocation.cpp b/tools/libclang/CXSourceLocation.cpp
index 64a441e..1b7464b 100644
--- a/tools/libclang/CXSourceLocation.cpp
+++ b/tools/libclang/CXSourceLocation.cpp
@@ -190,7 +190,6 @@
     *column = 0;
   if (offset)
     *offset = 0;
-  return;
 }
 
 static void createNullLocation(CXString *filename, unsigned *line,
@@ -203,7 +202,6 @@
     *column = 0;
   if (offset)
     *offset = 0;
-  return;
 }
 
 extern "C" {
@@ -235,7 +233,6 @@
                                 unsigned *line,
                                 unsigned *column,
                                 unsigned *offset) {
-  
   if (!isASTUnitSourceLocation(location)) {
     CXLoadedDiagnostic::decodeLocation(location, file, line, column, offset);
     return;
@@ -276,7 +273,6 @@
                                CXString *filename,
                                unsigned *line,
                                unsigned *column) {
-
   if (!isASTUnitSourceLocation(location)) {
     // Other SourceLocation implementations do not support presumed locations
     // at this time.
@@ -318,7 +314,6 @@
                                unsigned *line,
                                unsigned *column,
                                unsigned *offset) {
-  
   if (!isASTUnitSourceLocation(location)) {
     CXLoadedDiagnostic::decodeLocation(location, file, line,
                                            column, offset);
@@ -356,7 +351,6 @@
                            unsigned *line,
                            unsigned *column,
                            unsigned *offset) {
-
   if (!isASTUnitSourceLocation(location)) {
     CXLoadedDiagnostic::decodeLocation(location, file, line,
                                            column, offset);
diff --git a/tools/libclang/CXStoredDiagnostic.cpp b/tools/libclang/CXStoredDiagnostic.cpp
index faaf746..f2e9c1d 100644
--- a/tools/libclang/CXStoredDiagnostic.cpp
+++ b/tools/libclang/CXStoredDiagnostic.cpp
@@ -19,10 +19,7 @@
 
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
 using namespace clang::cxloc;
diff --git a/tools/libclang/CXString.cpp b/tools/libclang/CXString.cpp
index 1ccbed3..2148181 100644
--- a/tools/libclang/CXString.cpp
+++ b/tools/libclang/CXString.cpp
@@ -17,7 +17,6 @@
 #include "CXTranslationUnit.h"
 #include "clang-c/Index.h"
 #include "clang/Frontend/ASTUnit.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
diff --git a/tools/libclang/CXType.cpp b/tools/libclang/CXType.cpp
index 5cde236..4fcd886 100644
--- a/tools/libclang/CXType.cpp
+++ b/tools/libclang/CXType.cpp
@@ -51,6 +51,7 @@
     BTCASE(Float);
     BTCASE(Double);
     BTCASE(LongDouble);
+    BTCASE(Float128);
     BTCASE(NullPtr);
     BTCASE(Overload);
     BTCASE(Dependent);
@@ -91,6 +92,7 @@
     TKCASE(Vector);
     TKCASE(MemberPointer);
     TKCASE(Auto);
+    TKCASE(Elaborated);
     default:
       return CXType_Unexposed;
   }
@@ -466,6 +468,7 @@
     TKIND(Float);
     TKIND(Double);
     TKIND(LongDouble);
+    TKIND(Float128);
     TKIND(NullPtr);
     TKIND(Overload);
     TKIND(Dependent);
@@ -491,6 +494,7 @@
     TKIND(Vector);
     TKIND(MemberPointer);
     TKIND(Auto);
+    TKIND(Elaborated);
   }
 #undef TKIND
   return cxstring::createRef(s);
@@ -537,7 +541,7 @@
       TCALLINGCONV(PreserveMost);
       TCALLINGCONV(PreserveAll);
     case CC_SpirFunction: return CXCallingConv_Unexposed;
-    case CC_SpirKernel: return CXCallingConv_Unexposed;
+    case CC_OpenCLKernel: return CXCallingConv_Unexposed;
       break;
     }
 #undef TCALLINGCONV
@@ -987,4 +991,14 @@
   return 0;
 }
 
+CXType clang_Type_getNamedType(CXType CT){
+  QualType T = GetQualType(CT);
+  const Type *TP = T.getTypePtrOrNull();
+
+  if (TP && TP->getTypeClass() == Type::Elaborated)
+    return MakeCXType(cast<ElaboratedType>(TP)->getNamedType(), GetTU(CT));
+
+  return MakeCXType(QualType(), GetTU(CT));
+}
+
 } // end: extern "C"
diff --git a/tools/libclang/CursorVisitor.h b/tools/libclang/CursorVisitor.h
index 3e5b0c9..a2dfaee 100644
--- a/tools/libclang/CursorVisitor.h
+++ b/tools/libclang/CursorVisitor.h
@@ -238,6 +238,7 @@
   bool VisitUsingDecl(UsingDecl *D);
   bool VisitUnresolvedUsingValueDecl(UnresolvedUsingValueDecl *D);
   bool VisitUnresolvedUsingTypenameDecl(UnresolvedUsingTypenameDecl *D);
+  bool VisitStaticAssertDecl(StaticAssertDecl *D);
   
   // Name visitor
   bool VisitDeclarationNameInfo(DeclarationNameInfo Name);
@@ -264,6 +265,9 @@
   bool RunVisitorWorkList(VisitorWorkList &WL);
   void EnqueueWorkList(VisitorWorkList &WL, const Stmt *S);
   LLVM_ATTRIBUTE_NOINLINE bool Visit(const Stmt *S);
+
+private:
+  Optional<bool> handleDeclForVisitation(const Decl *D);
 };
 
 }
diff --git a/tools/libclang/Indexing.cpp b/tools/libclang/Indexing.cpp
index 7b2bcc7..878d75e 100644
--- a/tools/libclang/Indexing.cpp
+++ b/tools/libclang/Indexing.cpp
@@ -7,11 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CXIndexDataConsumer.h"
 #include "CIndexDiagnostic.h"
 #include "CIndexer.h"
 #include "CLog.h"
 #include "CXCursor.h"
+#include "CXIndexDataConsumer.h"
 #include "CXSourceLocation.h"
 #include "CXString.h"
 #include "CXTranslationUnit.h"
@@ -26,12 +26,13 @@
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/PPConditionalDirectiveRecord.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Sema/SemaConsumer.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
 #include <cstdio>
+#include <utility>
 
 using namespace clang;
 using namespace clang::index;
@@ -44,26 +45,6 @@
 // Skip Parsed Bodies
 //===----------------------------------------------------------------------===//
 
-#ifdef LLVM_ON_WIN32
-
-// FIXME: On windows it is disabled since current implementation depends on
-// file inodes.
-
-class SessionSkipBodyData { };
-
-class TUSkipBodyControl {
-public:
-  TUSkipBodyControl(SessionSkipBodyData &sessionData,
-                    PPConditionalDirectiveRecord &ppRec,
-                    Preprocessor &pp) { }
-  bool isParsed(SourceLocation Loc, FileID FID, const FileEntry *FE) {
-    return false;
-  }
-  void finished() { }
-};
-
-#else
-
 /// \brief A "region" in source code identified by the file/offset of the
 /// preprocessor conditional directive that it belongs to.
 /// Multiple, non-consecutive ranges can be parts of the same region.
@@ -237,8 +218,6 @@
   }
 };
 
-#endif
-
 //===----------------------------------------------------------------------===//
 // IndexPPCallbacks
 //===----------------------------------------------------------------------===//
@@ -378,7 +357,7 @@
 public:
   IndexingFrontendAction(std::shared_ptr<CXIndexDataConsumer> dataConsumer,
                          SessionSkipBodyData *skData)
-    : DataConsumer(dataConsumer), SKData(skData) { }
+      : DataConsumer(std::move(dataConsumer)), SKData(skData) {}
 
   std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
                                                  StringRef InFile) override {
diff --git a/tools/libclang/Makefile b/tools/libclang/Makefile
deleted file mode 100644
index 1b8f5f9..0000000
--- a/tools/libclang/Makefile
+++ /dev/null
@@ -1,64 +0,0 @@
-##===- tools/libclang/Makefile -----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-LIBRARYNAME = clang
-
-EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/libclang.exports
-
-LINK_LIBS_IN_SHARED = 1
-SHARED_LIBRARY = 1
-
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader bitwriter core \
-                   instrumentation ipo mc mcparser objcarcopts option support \
-                   object
-USEDLIBS = clangIndex.a clangARCMigrate.a \
-	   clangRewriteFrontend.a \
-	   clangFormat.a \
-	   clangTooling.a clangToolingCore.a \
-	   clangFrontend.a clangCodeGen.a clangDriver.a \
-	   clangSerialization.a \
-	   clangParse.a clangSema.a \
-	   clangStaticAnalyzerCheckers.a clangStaticAnalyzerCore.a \
-	   clangRewrite.a \
-	   clangAnalysis.a clangEdit.a \
-	   clangASTMatchers.a \
-	   clangAST.a clangLex.a clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/Makefile
-
-# Add soname to the library.
-ifeq ($(HOST_OS), $(filter $(HOST_OS), Linux FreeBSD GNU GNU/kFreeBSD))
-        LLVMLibsOptions += -Wl,-soname,lib$(LIBRARYNAME)$(SHLIBEXT)
-endif
-
-ifeq ($(ENABLE_CLANG_ARCMT),1)
-  CXX.Flags += -DCLANG_ENABLE_ARCMT
-endif
-
-##===----------------------------------------------------------------------===##
-# FIXME: This is copied from the 'lto' makefile.  Should we share this?
-##===----------------------------------------------------------------------===##
-
-ifeq ($(HOST_OS),Darwin)
-    LLVMLibsOptions += -Wl,-compatibility_version,1
-
-    # Set dylib internal version number to submission number.
-    ifdef LLVM_SUBMIT_VERSION
-        LLVMLibsOptions += -Wl,-current_version \
-                           -Wl,$(LLVM_SUBMIT_VERSION).$(LLVM_SUBMIT_SUBVERSION)
-    endif
-
-    # If we're doing an Apple-style build, add the LTO object path.
-    ifeq ($(RC_XBS),YES)
-       TempFile        := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/clang-lto.XXXXXX)
-       LLVMLibsOptions += -Wl,-object_path_lto -Wl,$(TempFile)
-    endif
-endif
diff --git a/tools/libclang/libclang.exports b/tools/libclang/libclang.exports
index e2dffe8..c8fe0a2 100644
--- a/tools/libclang/libclang.exports
+++ b/tools/libclang/libclang.exports
@@ -2,7 +2,12 @@
 clang_CXCursorSet_insert
 clang_CXIndex_getGlobalOptions
 clang_CXIndex_setGlobalOptions
+clang_CXXConstructor_isConvertingConstructor
+clang_CXXConstructor_isCopyConstructor
+clang_CXXConstructor_isDefaultConstructor
+clang_CXXConstructor_isMoveConstructor
 clang_CXXField_isMutable
+clang_CXXMethod_isDefaulted
 clang_CXXMethod_isConst
 clang_CXXMethod_isPureVirtual
 clang_CXXMethod_isStatic
@@ -82,6 +87,7 @@
 clang_Type_getTemplateArgumentAsType
 clang_Type_getCXXRefQualifier
 clang_Type_visitFields
+clang_Type_getNamedType
 clang_VerbatimBlockLineComment_getText
 clang_VerbatimLineComment_getText
 clang_HTMLTagComment_getAsString
diff --git a/tools/scan-build-py/bin/analyze-build b/tools/scan-build-py/bin/analyze-build
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/bin/analyze-c++ b/tools/scan-build-py/bin/analyze-c++
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/bin/analyze-cc b/tools/scan-build-py/bin/analyze-cc
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/bin/intercept-build b/tools/scan-build-py/bin/intercept-build
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/bin/intercept-c++ b/tools/scan-build-py/bin/intercept-c++
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/bin/intercept-cc b/tools/scan-build-py/bin/intercept-cc
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/bin/scan-build b/tools/scan-build-py/bin/scan-build
old mode 100644
new mode 100755
diff --git a/tools/scan-build-py/libscanbuild/analyze.py b/tools/scan-build-py/libscanbuild/analyze.py
index 9b00d04..0ed0aef 100644
--- a/tools/scan-build-py/libscanbuild/analyze.py
+++ b/tools/scan-build-py/libscanbuild/analyze.py
@@ -25,8 +25,7 @@
 from libscanbuild.intercept import capture
 from libscanbuild.report import report_directory, document
 from libscanbuild.clang import get_checkers
-from libscanbuild.runner import action_check
-from libscanbuild.command import classify_parameters, classify_source
+from libscanbuild.compilation import split_command
 
 __all__ = ['analyze_build_main', 'analyze_build_wrapper']
 
@@ -107,7 +106,7 @@
         'output_format': args.output_format,
         'output_failures': args.output_failures,
         'direct_args': analyzer_params(args),
-        'force_analyze_debug_code' : args.force_analyze_debug_code
+        'force_debug': args.force_debug
     }
 
     logging.debug('run analyzer against compilation database')
@@ -140,8 +139,7 @@
         'ANALYZE_BUILD_REPORT_FORMAT': args.output_format,
         'ANALYZE_BUILD_REPORT_FAILURES': 'yes' if args.output_failures else '',
         'ANALYZE_BUILD_PARAMETERS': ' '.join(analyzer_params(args)),
-        'ANALYZE_BUILD_FORCE_ANALYZE_DEBUG_CODE'
-            : 'yes' if args.force_analyze_debug_code else ''
+        'ANALYZE_BUILD_FORCE_DEBUG': 'yes' if args.force_debug else ''
     })
     return environment
 
@@ -163,32 +161,34 @@
         return result
     # ... and run the analyzer if all went well.
     try:
+        # check whether it is a compilation
+        compilation = split_command(sys.argv)
+        if compilation is None:
+            return result
         # collect the needed parameters from environment, crash when missing
-        consts = {
+        parameters = {
             'clang': os.getenv('ANALYZE_BUILD_CLANG'),
             'output_dir': os.getenv('ANALYZE_BUILD_REPORT_DIR'),
             'output_format': os.getenv('ANALYZE_BUILD_REPORT_FORMAT'),
             'output_failures': os.getenv('ANALYZE_BUILD_REPORT_FAILURES'),
             'direct_args': os.getenv('ANALYZE_BUILD_PARAMETERS',
                                      '').split(' '),
-            'force_analyze_debug_code':
-                os.getenv('ANALYZE_BUILD_FORCE_ANALYZE_DEBUG_CODE'),
+            'force_debug': os.getenv('ANALYZE_BUILD_FORCE_DEBUG'),
             'directory': os.getcwd(),
+            'command': [sys.argv[0], '-c'] + compilation.flags
         }
-        # get relevant parameters from command line arguments
-        args = classify_parameters(sys.argv)
-        filenames = args.pop('files', [])
-        for filename in (name for name in filenames if classify_source(name)):
-            parameters = dict(args, file=filename, **consts)
+        # call static analyzer against the compilation
+        for source in compilation.files:
+            parameters.update({'file': source})
             logging.debug('analyzer parameters %s', parameters)
-            current = action_check(parameters)
+            current = run(parameters)
             # display error message from the static analyzer
             if current is not None:
                 for line in current['error_output']:
                     logging.info(line.rstrip())
     except Exception:
         logging.exception("run analyzer inside compiler wrapper failed.")
-    return 0
+    return result
 
 
 def analyzer_params(args):
@@ -208,8 +208,8 @@
     if args.store_model:
         result.append('-analyzer-store={0}'.format(args.store_model))
     if args.constraints_model:
-        result.append(
-            '-analyzer-constraints={0}'.format(args.constraints_model))
+        result.append('-analyzer-constraints={0}'.format(
+            args.constraints_model))
     if args.internal_stats:
         result.append('-analyzer-stats')
     if args.analyze_headers:
@@ -457,11 +457,10 @@
                 the compilation database.""")
     advanced.add_argument(
         '--force-analyze-debug-code',
-        dest='force_analyze_debug_code',
+        dest='force_debug',
         action='store_true',
         help="""Tells analyzer to enable assertions in code even if they were
-                disabled during compilation, enabling more precise
-                results.""")
+                disabled during compilation, enabling more precise results.""")
 
     plugins = parser.add_argument_group('checker options')
     plugins.add_argument(
diff --git a/tools/scan-build-py/libscanbuild/command.py b/tools/scan-build-py/libscanbuild/command.py
deleted file mode 100644
index 69ca339..0000000
--- a/tools/scan-build-py/libscanbuild/command.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# -*- coding: utf-8 -*-
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-""" This module is responsible for to parse a compiler invocation. """
-
-import re
-import os
-
-__all__ = ['Action', 'classify_parameters', 'classify_source']
-
-
-class Action(object):
-    """ Enumeration class for compiler action. """
-
-    Link, Compile, Ignored = range(3)
-
-
-def classify_parameters(command):
-    """ Parses the command line arguments of the given invocation. """
-
-    # result value of this method.
-    # some value are preset, some will be set only when found.
-    result = {
-        'action': Action.Link,
-        'files': [],
-        'output': None,
-        'compile_options': [],
-        'c++': is_cplusplus_compiler(command[0])
-        # archs_seen
-        # language
-    }
-
-    # data structure to ignore compiler parameters.
-    # key: parameter name, value: number of parameters to ignore afterwards.
-    ignored = {
-        '-g': 0,
-        '-fsyntax-only': 0,
-        '-save-temps': 0,
-        '-install_name': 1,
-        '-exported_symbols_list': 1,
-        '-current_version': 1,
-        '-compatibility_version': 1,
-        '-init': 1,
-        '-e': 1,
-        '-seg1addr': 1,
-        '-bundle_loader': 1,
-        '-multiply_defined': 1,
-        '-sectorder': 3,
-        '--param': 1,
-        '--serialize-diagnostics': 1
-    }
-
-    args = iter(command[1:])
-    for arg in args:
-        # compiler action parameters are the most important ones...
-        if arg in {'-E', '-S', '-cc1', '-M', '-MM', '-###'}:
-            result.update({'action': Action.Ignored})
-        elif arg == '-c':
-            result.update({'action': max(result['action'], Action.Compile)})
-        # arch flags are taken...
-        elif arg == '-arch':
-            archs = result.get('archs_seen', [])
-            result.update({'archs_seen': archs + [next(args)]})
-        # explicit language option taken...
-        elif arg == '-x':
-            result.update({'language': next(args)})
-        # output flag taken...
-        elif arg == '-o':
-            result.update({'output': next(args)})
-        # warning disable options are taken...
-        elif re.match(r'^-Wno-', arg):
-            result['compile_options'].append(arg)
-        # warning options are ignored...
-        elif re.match(r'^-[mW].+', arg):
-            pass
-        # some preprocessor parameters are ignored...
-        elif arg in {'-MD', '-MMD', '-MG', '-MP'}:
-            pass
-        elif arg in {'-MF', '-MT', '-MQ'}:
-            next(args)
-        # linker options are ignored...
-        elif arg in {'-static', '-shared', '-s', '-rdynamic'} or \
-                re.match(r'^-[lL].+', arg):
-            pass
-        elif arg in {'-l', '-L', '-u', '-z', '-T', '-Xlinker'}:
-            next(args)
-        # some other options are ignored...
-        elif arg in ignored.keys():
-            for _ in range(ignored[arg]):
-                next(args)
-        # parameters which looks source file are taken...
-        elif re.match(r'^[^-].+', arg) and classify_source(arg):
-            result['files'].append(arg)
-        # and consider everything else as compile option.
-        else:
-            result['compile_options'].append(arg)
-
-    return result
-
-
-def classify_source(filename, cplusplus=False):
-    """ Return the language from file name extension. """
-
-    mapping = {
-        '.c': 'c++' if cplusplus else 'c',
-        '.i': 'c++-cpp-output' if cplusplus else 'c-cpp-output',
-        '.ii': 'c++-cpp-output',
-        '.m': 'objective-c',
-        '.mi': 'objective-c-cpp-output',
-        '.mm': 'objective-c++',
-        '.mii': 'objective-c++-cpp-output',
-        '.C': 'c++',
-        '.cc': 'c++',
-        '.CC': 'c++',
-        '.cp': 'c++',
-        '.cpp': 'c++',
-        '.cxx': 'c++',
-        '.c++': 'c++',
-        '.C++': 'c++',
-        '.txx': 'c++'
-    }
-
-    __, extension = os.path.splitext(os.path.basename(filename))
-    return mapping.get(extension)
-
-
-def is_cplusplus_compiler(name):
-    """ Returns true when the compiler name refer to a C++ compiler. """
-
-    match = re.match(r'^([^/]*/)*(\w*-)*(\w+\+\+)(-(\d+(\.\d+){0,3}))?$', name)
-    return False if match is None else True
diff --git a/tools/scan-build-py/libscanbuild/compilation.py b/tools/scan-build-py/libscanbuild/compilation.py
new file mode 100644
index 0000000..ef906fa
--- /dev/null
+++ b/tools/scan-build-py/libscanbuild/compilation.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+""" This module is responsible for parsing a compiler invocation. """
+
+import re
+import os
+import collections
+
+__all__ = ['split_command', 'classify_source', 'compiler_language']
+
+# Map of compiler options ignored during compilation database creation.
+# The map is used by the `split_command` function, which filters and
+# classifies the parameters. Please note that these are not the only
+# parameters which might be ignored.
+#
+# Keys are option names; values are how many following arguments to skip.
+IGNORED_FLAGS = {
+    # compiling only flag, ignored because the creator of compilation
+    # database will explicitly set it.
+    '-c': 0,
+    # dependency file generation flags, ignored because they would cause
+    # duplicate entries in the output (the only difference would be these
+    # flags). this is an actual finding from users, who suffered longer
+    # execution time caused by the duplicates.
+    '-MD': 0,
+    '-MMD': 0,
+    '-MG': 0,
+    '-MP': 0,
+    '-MF': 1,
+    '-MT': 1,
+    '-MQ': 1,
+    # linker options, ignored because the compilation database will contain
+    # compilation commands only. so, the compiler would ignore these flags
+    # anyway. the benefit of getting rid of them is to make the output more
+    # readable.
+    '-static': 0,
+    '-shared': 0,
+    '-s': 0,
+    '-rdynamic': 0,
+    '-l': 1,
+    '-L': 1,
+    '-u': 1,
+    '-z': 1,
+    '-T': 1,
+    '-Xlinker': 1
+}
+
+# Known C/C++ compiler executable name patterns
+COMPILER_PATTERNS = frozenset([
+    re.compile(r'^(intercept-|analyze-|)c(c|\+\+)$'),
+    re.compile(r'^([^-]*-)*[mg](cc|\+\+)(-\d+(\.\d+){0,2})?$'),
+    re.compile(r'^([^-]*-)*clang(\+\+)?(-\d+(\.\d+){0,2})?$'),
+    re.compile(r'^llvm-g(cc|\+\+)$'),
+])
+
+
+def split_command(command):
+    """ Returns a value when the command is a compilation, None otherwise.
+
+    The value on success is a named tuple instance with these attributes:
+
+        compiler: string value of 'c' or 'c++'
+        flags:    list of compile options
+        files:    list of source files (never empty) """
+
+    # build a real tuple instance at the end; never mutate the tuple class
+    Compilation = collections.namedtuple('Compilation',
+                                         ['compiler', 'flags', 'files'])
+    compiler = compiler_language(command)
+    # quit right now, if the program was not a C/C++ compiler
+    if not compiler:
+        return None
+    flags, files = [], []
+    # iterate on the compile options
+    args = iter(command[1:])
+    for arg in args:
+        # quit when compilation pass is not involved
+        if arg in {'-E', '-S', '-cc1', '-M', '-MM', '-###'}:
+            return None
+        # ignore some flags and their operands; the default value keeps a
+        # truncated command line from leaking StopIteration to the caller
+        elif arg in IGNORED_FLAGS:
+            for _ in range(IGNORED_FLAGS[arg]):
+                next(args, None)
+        elif re.match(r'^-(l|L|Wl,).+', arg):
+            pass
+        # some parameters could look like filename, take as compile option
+        elif arg in {'-D', '-I'}:
+            value = next(args, None)
+            flags.extend([arg] if value is None else [arg, value])
+        # parameter which looks source file is taken...
+        elif re.match(r'^[^-].+', arg) and classify_source(arg):
+            files.append(arg)
+        # and consider everything else as compile option.
+        else:
+            flags.append(arg)
+    # do extra check on number of source files
+    return Compilation(compiler, flags, files) if files else None
+
+
+def classify_source(filename, c_compiler=True):
+    """ Map a file name to the language implied by its extension.
+
+    Only '.c' and '.i' depend on `c_compiler`: they count as C input for a
+    C compiler and as C++ input otherwise. Unknown extensions give None. """
+
+    base = 'c' if c_compiler else 'c++'
+    by_extension = {
+        '.c': base,
+        '.i': base + '-cpp-output',
+        '.ii': 'c++-cpp-output',
+        '.m': 'objective-c',
+        '.mi': 'objective-c-cpp-output',
+        '.mm': 'objective-c++',
+        '.mii': 'objective-c++-cpp-output',
+    }
+    # every remaining known extension is plain C++ source
+    for cxx_suffix in ('.C', '.cc', '.CC', '.cp', '.cpp', '.cxx', '.c++',
+                       '.C++', '.txx'):
+        by_extension[cxx_suffix] = 'c++'
+
+    name = os.path.basename(filename)
+    suffix = os.path.splitext(name)[1]
+    return by_extension.get(suffix)
+
+
+def compiler_language(command):
+    """ Tell whether the command invokes a known C/C++ compiler.
+
+    Returns 'c++' or 'c' for a recognised executable name, None otherwise. """
+
+    if not command:
+        return None
+    program = os.path.basename(command[0])
+    if not any(known.match(program) for known in COMPILER_PATTERNS):
+        return None
+    # a '++' ending the name, or right before a '-suffix', means C++
+    return 'c++' if re.match(r'^(.+)(\+\+)(-.+|)$', program) else 'c'
diff --git a/tools/scan-build-py/libscanbuild/intercept.py b/tools/scan-build-py/libscanbuild/intercept.py
index 6062e2e..6a9f753 100644
--- a/tools/scan-build-py/libscanbuild/intercept.py
+++ b/tools/scan-build-py/libscanbuild/intercept.py
@@ -31,9 +31,9 @@
 import logging
 import subprocess
 from libear import build_libear, TemporaryDirectory
-from libscanbuild import duplicate_check, tempdir, initialize_logging
 from libscanbuild import command_entry_point
-from libscanbuild.command import Action, classify_parameters
+from libscanbuild import duplicate_check, tempdir, initialize_logging
+from libscanbuild.compilation import split_command
 from libscanbuild.shell import encode, decode
 
 __all__ = ['capture', 'intercept_build_main', 'intercept_build_wrapper']
@@ -72,23 +72,23 @@
         from the arguments. And do shell escaping on the command.
 
         To support incremental builds, it is desired to read elements from
-        an existing compilation database from a previous run. These elemets
+        an existing compilation database from a previous run. These elements
         shall be merged with the new elements. """
 
         # create entries from the current run
         current = itertools.chain.from_iterable(
             # creates a sequence of entry generators from an exec,
-            # but filter out non compiler calls before.
-            (format_entry(x) for x in commands if is_compiler_call(x)))
+            format_entry(command) for command in commands)
         # read entries from previous run
-        if 'append' in args and args.append and os.path.exists(args.cdb):
+        if 'append' in args and args.append and os.path.isfile(args.cdb):
             with open(args.cdb) as handle:
                 previous = iter(json.load(handle))
         else:
             previous = iter([])
         # filter out duplicate entries from both
         duplicate = duplicate_check(entry_hash)
-        return (entry for entry in itertools.chain(previous, current)
+        return (entry
+                for entry in itertools.chain(previous, current)
                 if os.path.exists(entry['file']) and not duplicate(entry))
 
     with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
@@ -98,14 +98,14 @@
         exit_code = subprocess.call(args.build, env=environment)
         logging.info('build finished with exit code: %d', exit_code)
         # read the intercepted exec calls
-        commands = itertools.chain.from_iterable(
+        exec_traces = itertools.chain.from_iterable(
             parse_exec_trace(os.path.join(tmp_dir, filename))
             for filename in sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd'))))
         # do post processing only if that was requested
         if 'raw_entries' not in args or not args.raw_entries:
-            entries = post_processing(commands)
+            entries = post_processing(exec_traces)
         else:
-            entries = commands
+            entries = exec_traces
         # dump the compilation database
         with open(args.cdb, 'w+') as handle:
             json.dump(list(entries), handle, sort_keys=True, indent=4)
@@ -209,7 +209,7 @@
             }
 
 
-def format_entry(entry):
+def format_entry(exec_trace):
     """ Generate the desired fields for compilation database entries. """
 
     def abspath(cwd, name):
@@ -217,40 +217,20 @@
         fullname = name if os.path.isabs(name) else os.path.join(cwd, name)
         return os.path.normpath(fullname)
 
-    logging.debug('format this command: %s', entry['command'])
-    atoms = classify_parameters(entry['command'])
-    if atoms['action'] <= Action.Compile:
-        for source in atoms['files']:
-            compiler = 'c++' if atoms['c++'] else 'cc'
-            flags = atoms['compile_options']
-            flags += ['-o', atoms['output']] if atoms['output'] else []
-            flags += ['-x', atoms['language']] if 'language' in atoms else []
-            flags += [elem
-                      for arch in atoms.get('archs_seen', [])
-                      for elem in ['-arch', arch]]
-            command = [compiler, '-c'] + flags + [source]
+    logging.debug('format this command: %s', exec_trace['command'])
+    compilation = split_command(exec_trace['command'])
+    if compilation:
+        for source in compilation.files:
+            compiler = 'c++' if compilation.compiler == 'c++' else 'cc'
+            command = [compiler, '-c'] + compilation.flags + [source]
             logging.debug('formated as: %s', command)
             yield {
-                'directory': entry['directory'],
+                'directory': exec_trace['directory'],
                 'command': encode(command),
-                'file': abspath(entry['directory'], source)
+                'file': abspath(exec_trace['directory'], source)
             }
 
 
-def is_compiler_call(entry):
-    """ A predicate to decide the entry is a compiler call or not. """
-
-    patterns = [
-        re.compile(r'^([^/]*/)*intercept-c(c|\+\+)$'),
-        re.compile(r'^([^/]*/)*c(c|\+\+)$'),
-        re.compile(r'^([^/]*/)*([^-]*-)*[mg](cc|\+\+)(-\d+(\.\d+){0,2})?$'),
-        re.compile(r'^([^/]*/)*([^-]*-)*clang(\+\+)?(-\d+(\.\d+){0,2})?$'),
-        re.compile(r'^([^/]*/)*llvm-g(cc|\+\+)$'),
-    ]
-    executable = entry['command'][0]
-    return any((pattern.match(executable) for pattern in patterns))
-
-
 def is_preload_disabled(platform):
     """ Library-based interposition will fail silently if SIP is enabled,
     so this should be detected. You can detect whether SIP is enabled on
diff --git a/tools/scan-build-py/libscanbuild/runner.py b/tools/scan-build-py/libscanbuild/runner.py
index 63b9f74..628ad90 100644
--- a/tools/scan-build-py/libscanbuild/runner.py
+++ b/tools/scan-build-py/libscanbuild/runner.py
@@ -5,18 +5,44 @@
 # License. See LICENSE.TXT for details.
 """ This module is responsible to run the analyzer commands. """
 
+import re
 import os
 import os.path
 import tempfile
 import functools
 import subprocess
 import logging
-from libscanbuild.command import classify_parameters, Action, classify_source
-from libscanbuild.clang import get_arguments, get_version
+from libscanbuild.compilation import classify_source, compiler_language
+from libscanbuild.clang import get_version, get_arguments
 from libscanbuild.shell import decode
 
 __all__ = ['run']
 
+# To have good results from static analyzer certain compiler options shall be
+# omitted. The compiler flag filtering only affects the static analyzer run.
+#
+# Keys are the option name, value number of options to skip
+IGNORED_FLAGS = {
+    '-c': 0,  # compile option will be overwritten
+    '-fsyntax-only': 0,  # static analyzer option will be overwritten
+    '-o': 1,  # will set up own output file
+    # flags below are inherited from the perl implementation.
+    '-g': 0,
+    '-save-temps': 0,
+    '-install_name': 1,
+    '-exported_symbols_list': 1,
+    '-current_version': 1,
+    '-compatibility_version': 1,
+    '-init': 1,
+    '-e': 1,
+    '-seg1addr': 1,
+    '-bundle_loader': 1,
+    '-multiply_defined': 1,
+    '-sectorder': 3,
+    '--param': 1,
+    '--serialize-diagnostics': 1
+}
+
 
 def require(required):
     """ Decorator for checking the required values in state.
@@ -29,8 +55,8 @@
         def wrapper(*args, **kwargs):
             for key in required:
                 if key not in args[0]:
-                    raise KeyError(
-                        '{0} not passed to {1}'.format(key, function.__name__))
+                    raise KeyError('{0} not passed to {1}'.format(
+                        key, function.__name__))
 
             return function(*args, **kwargs)
 
@@ -39,10 +65,15 @@
     return decorator
 
 
-@require(['command', 'directory', 'file',  # an entry from compilation database
-          'clang', 'direct_args',  # compiler name, and arguments from command
-          'force_analyze_debug_code',  # preprocessing options
-          'output_dir', 'output_format', 'output_failures'])
+@require(['command',  # entry from compilation database
+          'directory',  # entry from compilation database
+          'file',  # entry from compilation database
+          'clang',  # clang executable name (and path)
+          'direct_args',  # arguments from command line
+          'force_debug',  # kill non debug macros
+          'output_dir',  # where generated report files shall go
+          'output_format',  # it's 'plist' or 'html' or both
+          'output_failures'])  # generate crash reports or not
 def run(opts):
     """ Entry point to run (or not) static analyzer against a single entry
     of the compilation database.
@@ -58,16 +89,17 @@
 
     try:
         command = opts.pop('command')
+        command = command if isinstance(command, list) else decode(command)
         logging.debug("Run analyzer against '%s'", command)
-        opts.update(classify_parameters(decode(command)))
+        opts.update(classify_parameters(command))
 
-        return action_check(opts)
+        return arch_check(opts)
     except Exception:
         logging.error("Problem occured during analyzis.", exc_info=1)
         return None
 
 
-@require(['report', 'directory', 'clang', 'output_dir', 'language', 'file',
+@require(['clang', 'directory', 'flags', 'file', 'output_dir', 'language',
           'error_type', 'error_output', 'exit_code'])
 def report_failure(opts):
     """ Create report when analyzer failed.
@@ -96,36 +128,49 @@
                                       dir=destination(opts))
     os.close(handle)
     cwd = opts['directory']
-    cmd = get_arguments([opts['clang']] + opts['report'] + ['-o', name], cwd)
+    cmd = get_arguments([opts['clang'], '-fsyntax-only', '-E'] +
+                        opts['flags'] + [opts['file'], '-o', name], cwd)
     logging.debug('exec command in %s: %s', cwd, ' '.join(cmd))
     subprocess.call(cmd, cwd=cwd)
-
+    # write general information about the crash
     with open(name + '.info.txt', 'w') as handle:
         handle.write(opts['file'] + os.linesep)
         handle.write(error.title().replace('_', ' ') + os.linesep)
         handle.write(' '.join(cmd) + os.linesep)
         handle.write(' '.join(os.uname()) + os.linesep)
-        handle.write(get_version(cmd[0]))
+        handle.write(get_version(opts['clang']))
         handle.close()
-
+    # write the captured output too
     with open(name + '.stderr.txt', 'w') as handle:
         handle.writelines(opts['error_output'])
         handle.close()
-
+    # return with the previous step exit code and output
     return {
         'error_output': opts['error_output'],
         'exit_code': opts['exit_code']
     }
 
 
-@require(['clang', 'analyze', 'directory', 'output'])
+@require(['clang', 'directory', 'flags', 'direct_args', 'file', 'output_dir',
+          'output_format'])
 def run_analyzer(opts, continuation=report_failure):
     """ It assembles the analysis command line and executes it. Capture the
     output of the analysis and returns with it. If failure reports are
     requested, it calls the continuation to generate it. """
 
+    def output():
+        """ Creates output file name for reports. """
+        if opts['output_format'] in {'plist', 'plist-html'}:
+            (handle, name) = tempfile.mkstemp(prefix='report-',
+                                              suffix='.plist',
+                                              dir=opts['output_dir'])
+            os.close(handle)
+            return name
+        return opts['output_dir']
+
     cwd = opts['directory']
-    cmd = get_arguments([opts['clang']] + opts['analyze'] + opts['output'],
+    cmd = get_arguments([opts['clang'], '--analyze'] + opts['direct_args'] +
+                        opts['flags'] + [opts['file'], '-o', output()],
                         cwd)
     logging.debug('exec command in %s: %s', cwd, ' '.join(cmd))
     child = subprocess.Popen(cmd,
@@ -145,119 +190,124 @@
             'exit_code': child.returncode
         })
         return continuation(opts)
+    # return the output for logging and exit code for testing
     return {'error_output': output, 'exit_code': child.returncode}
 
 
-@require(['output_dir'])
-def set_analyzer_output(opts, continuation=run_analyzer):
-    """ Create output file if was requested.
+@require(['flags', 'force_debug'])
+def filter_debug_flags(opts, continuation=run_analyzer):
+    """ Filter out nondebug macros when requested. """
 
-    This plays a role only if .plist files are requested. """
-
-    if opts.get('output_format') in {'plist', 'plist-html'}:
-        with tempfile.NamedTemporaryFile(prefix='report-',
-                                         suffix='.plist',
-                                         delete=False,
-                                         dir=opts['output_dir']) as output:
-            opts.update({'output': ['-o', output.name]})
-            return continuation(opts)
-    else:
-        opts.update({'output': ['-o', opts['output_dir']]})
-        return continuation(opts)
-
-def force_analyze_debug_code(cmd):
-    """ Enable assert()'s by undefining NDEBUG. """
-    cmd.append('-UNDEBUG')
-
-@require(['file', 'directory', 'clang', 'direct_args',
-          'force_analyze_debug_code', 'language', 'output_dir',
-          'output_format', 'output_failures'])
-def create_commands(opts, continuation=set_analyzer_output):
-    """ Create command to run analyzer or failure report generation.
-
-    It generates commands (from compilation database entries) which contains
-    enough information to run the analyzer (and the crash report generation
-    if that was requested). """
-
-    common = []
-    if 'arch' in opts:
-        common.extend(['-arch', opts.pop('arch')])
-    common.extend(opts.pop('compile_options', []))
-    if opts['force_analyze_debug_code']:
-        force_analyze_debug_code(common)
-    common.extend(['-x', opts['language']])
-    common.append(os.path.relpath(opts['file'], opts['directory']))
-
-    opts.update({
-        'analyze': ['--analyze'] + opts['direct_args'] + common,
-        'report': ['-fsyntax-only', '-E'] + common
-    })
+    if opts.pop('force_debug'):
+        # lazy implementation just append an undefine macro at the end
+        opts.update({'flags': opts['flags'] + ['-UNDEBUG']})
 
     return continuation(opts)
 
 
-@require(['file', 'c++'])
-def language_check(opts, continuation=create_commands):
+@require(['file', 'directory'])
+def set_file_path_relative(opts, continuation=filter_debug_flags):
+    """ Set source file path to relative to the working directory.
+
+    The only purpose of this function is to pass the SATestBuild.py tests. """
+
+    opts.update({'file': os.path.relpath(opts['file'], opts['directory'])})
+
+    return continuation(opts)
+
+
+@require(['language', 'compiler', 'file', 'flags'])
+def language_check(opts, continuation=set_file_path_relative):
     """ Find out the language from command line parameters or file name
     extension. The decision also influenced by the compiler invocation. """
 
-    accepteds = {
+    accepted = frozenset({
         'c', 'c++', 'objective-c', 'objective-c++', 'c-cpp-output',
         'c++-cpp-output', 'objective-c-cpp-output'
-    }
+    })
 
-    key = 'language'
-    language = opts[key] if key in opts else \
-        classify_source(opts['file'], opts['c++'])
+    # language can be given as a parameter...
+    language = opts.pop('language')
+    compiler = opts.pop('compiler')
+    # ... or find out from source file extension
+    if language is None and compiler is not None:
+        language = classify_source(opts['file'], compiler == 'c')
 
     if language is None:
         logging.debug('skip analysis, language not known')
         return None
-    elif language not in accepteds:
+    elif language not in accepted:
         logging.debug('skip analysis, language not supported')
         return None
     else:
         logging.debug('analysis, language: %s', language)
-        opts.update({key: language})
+        opts.update({'language': language,
+                     'flags': ['-x', language] + opts['flags']})
         return continuation(opts)
 
 
-@require([])
+@require(['arch_list', 'flags'])
 def arch_check(opts, continuation=language_check):
     """ Do run analyzer through one of the given architectures. """
 
-    disableds = {'ppc', 'ppc64'}
+    disabled = frozenset({'ppc', 'ppc64'})
 
-    key = 'archs_seen'
-    if key in opts:
+    received_list = opts.pop('arch_list')
+    if received_list:
         # filter out disabled architectures and -arch switches
-        archs = [a for a in opts[key] if a not in disableds]
-
-        if not archs:
-            logging.debug('skip analysis, found not supported arch')
-            return None
-        else:
+        filtered_list = [a for a in received_list if a not in disabled]
+        if filtered_list:
             # There should be only one arch given (or the same multiple
             # times). If there are multiple arch are given and are not
             # the same, those should not change the pre-processing step.
             # But that's the only pass we have before run the analyzer.
-            arch = archs.pop()
-            logging.debug('analysis, on arch: %s', arch)
+            current = filtered_list.pop()
+            logging.debug('analysis, on arch: %s', current)
 
-            opts.update({'arch': arch})
-            del opts[key]
+            opts.update({'flags': ['-arch', current] + opts['flags']})
             return continuation(opts)
+        else:
+            logging.debug('skip analysis, found not supported arch')
+            return None
     else:
         logging.debug('analysis, on default arch')
         return continuation(opts)
 
 
-@require(['action'])
-def action_check(opts, continuation=arch_check):
-    """ Continue analysis only if it compilation or link. """
+def classify_parameters(command):
+    """ Prepare compiler flags (filters some and add others) and take out
+    language (-x) and architecture (-arch) flags for future processing. """
 
-    if opts.pop('action') <= Action.Compile:
-        return continuation(opts)
-    else:
-        logging.debug('skip analysis, not compilation nor link')
-        return None
+    result = {
+        'flags': [],  # the filtered compiler flags
+        'arch_list': [],  # list of architecture flags
+        'language': None,  # compilation language, None, if not specified
+        'compiler': compiler_language(command)  # 'c' or 'c++'
+    }
+
+    # iterate on the compile options
+    args = iter(command[1:])
+    for arg in args:
+        # take arch flags into a separate basket
+        if arg == '-arch':
+            result['arch_list'].append(next(args))
+        # take language
+        elif arg == '-x':
+            result['language'] = next(args)
+        # parameters which looks source file are not flags
+        elif re.match(r'^[^-].+', arg) and classify_source(arg):
+            pass
+        # ignore some flags
+        elif arg in IGNORED_FLAGS:
+            count = IGNORED_FLAGS[arg]
+            for _ in range(count):
+                next(args)
+        # we don't care about extra warnings, but we should suppress ones
+        # that we don't want to see.
+        elif re.match(r'^-W.+', arg) and not re.match(r'^-Wno-.+', arg):
+            pass
+        # and consider everything else as compilation flag.
+        else:
+            result['flags'].append(arg)
+
+    return result
diff --git a/tools/scan-build-py/tests/functional/cases/test_create_cdb.py b/tools/scan-build-py/tests/functional/cases/test_create_cdb.py
index 6d449ba..c26fce0 100644
--- a/tools/scan-build-py/tests/functional/cases/test_create_cdb.py
+++ b/tools/scan-build-py/tests/functional/cases/test_create_cdb.py
@@ -4,7 +4,7 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-from ...unit import fixtures
+import libear
 from . import make_args, silent_check_call, silent_call, create_empty_file
 import unittest
 
@@ -28,13 +28,13 @@
             return len(content)
 
     def test_successful_build(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = self.run_intercept(tmpdir, ['build_regular'])
             self.assertTrue(os.path.isfile(result))
             self.assertEqual(5, self.count_entries(result))
 
     def test_successful_build_with_wrapper(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = os.path.join(tmpdir, 'cdb.json')
             make = make_args(tmpdir) + ['build_regular']
             silent_check_call(['intercept-build', '--cdb', result,
@@ -44,14 +44,14 @@
 
     @unittest.skipIf(os.getenv('TRAVIS'), 'ubuntu make return -11')
     def test_successful_build_parallel(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = self.run_intercept(tmpdir, ['-j', '4', 'build_regular'])
             self.assertTrue(os.path.isfile(result))
             self.assertEqual(5, self.count_entries(result))
 
     @unittest.skipIf(os.getenv('TRAVIS'), 'ubuntu env remove clang from path')
     def test_successful_build_on_empty_env(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = os.path.join(tmpdir, 'cdb.json')
             make = make_args(tmpdir) + ['CC=clang', 'build_regular']
             silent_check_call(['intercept-build', '--cdb', result,
@@ -60,13 +60,13 @@
             self.assertEqual(5, self.count_entries(result))
 
     def test_successful_build_all_in_one(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = self.run_intercept(tmpdir, ['build_all_in_one'])
             self.assertTrue(os.path.isfile(result))
             self.assertEqual(5, self.count_entries(result))
 
     def test_not_successful_build(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = os.path.join(tmpdir, 'cdb.json')
             make = make_args(tmpdir) + ['build_broken']
             silent_call(
@@ -84,12 +84,12 @@
             ['intercept-build', '--cdb', result] + make)
 
     def test_successful_build(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             exitcode = self.run_intercept(tmpdir, 'build_clean')
             self.assertFalse(exitcode)
 
     def test_not_successful_build(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             exitcode = self.run_intercept(tmpdir, 'build_broken')
             self.assertTrue(exitcode)
 
@@ -110,7 +110,7 @@
             return len(content)
 
     def test_overwrite_existing_cdb(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = self.run_intercept(tmpdir, 'build_clean', [])
             self.assertTrue(os.path.isfile(result))
             result = self.run_intercept(tmpdir, 'build_regular', [])
@@ -118,7 +118,7 @@
             self.assertEqual(2, self.count_entries(result))
 
     def test_append_to_existing_cdb(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             result = self.run_intercept(tmpdir, 'build_clean', [])
             self.assertTrue(os.path.isfile(result))
             result = self.run_intercept(tmpdir, 'build_regular', ['--append'])
@@ -138,7 +138,7 @@
             return content
 
     def assert_creates_number_of_entries(self, command, count):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             filename = os.path.join(tmpdir, 'test.c')
             create_empty_file(filename)
             command.append(filename)
@@ -153,7 +153,7 @@
         self.assert_creates_number_of_entries(['cc', '-c', '-MM'], 0)
 
     def assert_command_creates_entry(self, command, expected):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             filename = os.path.join(tmpdir, command[-1])
             create_empty_file(filename)
             cmd = ['sh', '-c', ' '.join(command)]
diff --git a/tools/scan-build-py/tests/functional/cases/test_exec_anatomy.py b/tools/scan-build-py/tests/functional/cases/test_exec_anatomy.py
index 329a477..d58a612 100644
--- a/tools/scan-build-py/tests/functional/cases/test_exec_anatomy.py
+++ b/tools/scan-build-py/tests/functional/cases/test_exec_anatomy.py
@@ -4,7 +4,7 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-from ...unit import fixtures
+import libear
 import unittest
 
 import os.path
@@ -45,6 +45,6 @@
     def test_all_exec_calls(self):
         this_dir, _ = os.path.split(__file__)
         source_dir = os.path.normpath(os.path.join(this_dir, '..', 'exec'))
-        with fixtures.TempDir() as tmp_dir:
+        with libear.TemporaryDirectory() as tmp_dir:
             expected, result = run(source_dir, tmp_dir)
             self.assertEqualJson(expected, result)
diff --git a/tools/scan-build-py/tests/functional/cases/test_from_cdb.py b/tools/scan-build-py/tests/functional/cases/test_from_cdb.py
index c579020..5026400 100644
--- a/tools/scan-build-py/tests/functional/cases/test_from_cdb.py
+++ b/tools/scan-build-py/tests/functional/cases/test_from_cdb.py
@@ -4,13 +4,12 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-from ...unit import fixtures
+import libear
 from . import call_and_report
 import unittest
 
 import os.path
 import string
-import subprocess
 import glob
 
 
@@ -37,19 +36,19 @@
 
 class OutputDirectoryTest(unittest.TestCase):
     def test_regular_keeps_report_dir(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, [])
             self.assertTrue(os.path.isdir(reportdir))
 
     def test_clear_deletes_report_dir(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('clean', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, [])
             self.assertFalse(os.path.isdir(reportdir))
 
     def test_clear_keeps_report_dir_when_asked(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('clean', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, ['--keep-empty'])
             self.assertTrue(os.path.isdir(reportdir))
@@ -57,38 +56,38 @@
 
 class ExitCodeTest(unittest.TestCase):
     def test_regular_does_not_set_exit_code(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, __ = run_analyzer(tmpdir, cdb, [])
             self.assertFalse(exit_code)
 
     def test_clear_does_not_set_exit_code(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('clean', tmpdir)
             exit_code, __ = run_analyzer(tmpdir, cdb, [])
             self.assertFalse(exit_code)
 
     def test_regular_sets_exit_code_if_asked(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, __ = run_analyzer(tmpdir, cdb, ['--status-bugs'])
             self.assertTrue(exit_code)
 
     def test_clear_does_not_set_exit_code_if_asked(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('clean', tmpdir)
             exit_code, __ = run_analyzer(tmpdir, cdb, ['--status-bugs'])
             self.assertFalse(exit_code)
 
     def test_regular_sets_exit_code_if_asked_from_plist(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, __ = run_analyzer(
                 tmpdir, cdb, ['--status-bugs', '--plist'])
             self.assertTrue(exit_code)
 
     def test_clear_does_not_set_exit_code_if_asked_from_plist(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('clean', tmpdir)
             exit_code, __ = run_analyzer(
                 tmpdir, cdb, ['--status-bugs', '--plist'])
@@ -105,7 +104,7 @@
         return len(glob.glob(os.path.join(directory, 'report-*.plist')))
 
     def test_default_creates_html_report(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, [])
             self.assertTrue(
@@ -114,7 +113,7 @@
             self.assertEqual(self.get_plist_count(reportdir), 0)
 
     def test_plist_and_html_creates_html_report(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, ['--plist-html'])
             self.assertTrue(
@@ -123,7 +122,7 @@
             self.assertEqual(self.get_plist_count(reportdir), 5)
 
     def test_plist_does_not_creates_html_report(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('regular', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, ['--plist'])
             self.assertFalse(
@@ -134,14 +133,14 @@
 
 class FailureReportTest(unittest.TestCase):
     def test_broken_creates_failure_reports(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('broken', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, [])
             self.assertTrue(
                 os.path.isdir(os.path.join(reportdir, 'failures')))
 
     def test_broken_does_not_creates_failure_reports(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('broken', tmpdir)
             exit_code, reportdir = run_analyzer(
                 tmpdir, cdb, ['--no-failure-reports'])
@@ -170,13 +169,13 @@
         self.assertEqual(result['page'], expected)
 
     def test_default_title_in_report(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('broken', tmpdir)
             exit_code, reportdir = run_analyzer(tmpdir, cdb, [])
             self.assertTitleEqual(reportdir, 'src - analyzer results')
 
     def test_given_title_in_report(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             cdb = prepare_cdb('broken', tmpdir)
             exit_code, reportdir = run_analyzer(
                 tmpdir, cdb, ['--html-title', 'this is the title'])
diff --git a/tools/scan-build-py/tests/functional/cases/test_from_cmd.py b/tools/scan-build-py/tests/functional/cases/test_from_cmd.py
index fe7ecf6..0eee4bb 100644
--- a/tools/scan-build-py/tests/functional/cases/test_from_cmd.py
+++ b/tools/scan-build-py/tests/functional/cases/test_from_cmd.py
@@ -4,7 +4,7 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-from ...unit import fixtures
+import libear
 from . import make_args, check_call_and_report, create_empty_file
 import unittest
 
@@ -22,19 +22,19 @@
             cmd)
 
     def test_regular_keeps_report_dir(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             make = make_args(tmpdir) + ['build_regular']
             outdir = self.run_analyzer(tmpdir, [], make)
             self.assertTrue(os.path.isdir(outdir))
 
     def test_clear_deletes_report_dir(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             make = make_args(tmpdir) + ['build_clean']
             outdir = self.run_analyzer(tmpdir, [], make)
             self.assertFalse(os.path.isdir(outdir))
 
     def test_clear_keeps_report_dir_when_asked(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             make = make_args(tmpdir) + ['build_clean']
             outdir = self.run_analyzer(tmpdir, ['--keep-empty'], make)
             self.assertTrue(os.path.isdir(outdir))
@@ -47,7 +47,7 @@
         return len(glob.glob(os.path.join(directory, 'report-*.plist')))
 
     def test_interposition_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             make = make_args(tmpdir) + ['build_regular']
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--override-compiler'],
@@ -57,7 +57,7 @@
             self.assertEqual(self.get_plist_count(outdir), 5)
 
     def test_intercept_wrapper_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             make = make_args(tmpdir) + ['build_regular']
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--intercept-first',
@@ -68,7 +68,7 @@
             self.assertEqual(self.get_plist_count(outdir), 5)
 
     def test_intercept_library_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             make = make_args(tmpdir) + ['build_regular']
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--intercept-first'],
@@ -88,21 +88,21 @@
         return ['sh', '-c', command]
 
     def test_interposition_cc_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--override-compiler'],
                 self.compile_empty_source_file(tmpdir, False))
             self.assertEqual(self.get_plist_count(outdir), 1)
 
     def test_interposition_cxx_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--override-compiler'],
                 self.compile_empty_source_file(tmpdir, True))
             self.assertEqual(self.get_plist_count(outdir), 1)
 
     def test_intercept_cc_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--override-compiler',
                  '--intercept-first'],
@@ -110,7 +110,7 @@
             self.assertEqual(self.get_plist_count(outdir), 1)
 
     def test_intercept_cxx_works(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             outdir = check_call_and_report(
                 ['scan-build', '--plist', '-o', tmpdir, '--override-compiler',
                  '--intercept-first'],
diff --git a/tools/scan-build-py/tests/functional/exec/CMakeLists.txt b/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
index 6e5d2e9..42ee1d1 100644
--- a/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
+++ b/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
@@ -1,6 +1,8 @@
-project(exec C)
+# cmake_minimum_required() has to run before project(): it establishes the
+# policy defaults that project() and the compiler checks below depend on.
+cmake_minimum_required(VERSION 3.4.3)
 
-cmake_minimum_required(VERSION 2.8)
+project(exec C)
 
 include(CheckCCompilerFlag)
 check_c_compiler_flag("-std=c99" C99_SUPPORTED)
diff --git a/tools/scan-build-py/tests/unit/__init__.py b/tools/scan-build-py/tests/unit/__init__.py
index 4fa9edc..dc8bf12 100644
--- a/tools/scan-build-py/tests/unit/__init__.py
+++ b/tools/scan-build-py/tests/unit/__init__.py
@@ -4,7 +4,8 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-from . import test_command
+from . import test_libear
+from . import test_compilation
 from . import test_clang
 from . import test_runner
 from . import test_report
@@ -13,8 +14,9 @@
 from . import test_shell
 
 
-def load_tests(loader, suite, pattern):
-    suite.addTests(loader.loadTestsFromModule(test_command))
+def load_tests(loader, suite, _):
+    suite.addTests(loader.loadTestsFromModule(test_libear))
+    suite.addTests(loader.loadTestsFromModule(test_compilation))
     suite.addTests(loader.loadTestsFromModule(test_clang))
     suite.addTests(loader.loadTestsFromModule(test_runner))
     suite.addTests(loader.loadTestsFromModule(test_report))
diff --git a/tools/scan-build-py/tests/unit/fixtures.py b/tools/scan-build-py/tests/unit/fixtures.py
deleted file mode 100644
index d80f5e6..0000000
--- a/tools/scan-build-py/tests/unit/fixtures.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# -*- coding: utf-8 -*-
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-
-import contextlib
-import tempfile
-import shutil
-import unittest
-
-
-class Spy(object):
-    def __init__(self):
-        self.arg = None
-        self.success = 0
-
-    def call(self, params):
-        self.arg = params
-        return self.success
-
-
-@contextlib.contextmanager
-def TempDir():
-    name = tempfile.mkdtemp(prefix='scan-build-test-')
-    try:
-        yield name
-    finally:
-        shutil.rmtree(name)
-
-
-class TestCase(unittest.TestCase):
-    def assertIn(self, element, collection):
-        found = False
-        for it in collection:
-            if element == it:
-                found = True
-
-        self.assertTrue(found, '{0} does not have {1}'.format(collection,
-                                                              element))
diff --git a/tools/scan-build-py/tests/unit/test_analyze.py b/tools/scan-build-py/tests/unit/test_analyze.py
index b77db48..481cc0c 100644
--- a/tools/scan-build-py/tests/unit/test_analyze.py
+++ b/tools/scan-build-py/tests/unit/test_analyze.py
@@ -5,4 +5,3 @@
 # License. See LICENSE.TXT for details.
 
 import libscanbuild.analyze as sut
-from . import fixtures
diff --git a/tools/scan-build-py/tests/unit/test_clang.py b/tools/scan-build-py/tests/unit/test_clang.py
index 2f1fd79..04414a8 100644
--- a/tools/scan-build-py/tests/unit/test_clang.py
+++ b/tools/scan-build-py/tests/unit/test_clang.py
@@ -4,14 +4,15 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
+import libear
 import libscanbuild.clang as sut
-from . import fixtures
+import unittest
 import os.path
 
 
-class GetClangArgumentsTest(fixtures.TestCase):
+class GetClangArgumentsTest(unittest.TestCase):
     def test_get_clang_arguments(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             filename = os.path.join(tmpdir, 'test.c')
             with open(filename, 'w') as handle:
                 handle.write('')
@@ -20,8 +21,8 @@
                 ['clang', '-c', filename, '-DNDEBUG', '-Dvar="this is it"'],
                 tmpdir)
 
-            self.assertIn('NDEBUG', result)
-            self.assertIn('var="this is it"', result)
+            self.assertTrue('NDEBUG' in result)
+            self.assertTrue('var="this is it"' in result)
 
     def test_get_clang_arguments_fails(self):
         self.assertRaises(
@@ -29,7 +30,7 @@
             ['clang', '-###', '-fsyntax-only', '-x', 'c', 'notexist.c'], '.')
 
 
-class GetCheckersTest(fixtures.TestCase):
+class GetCheckersTest(unittest.TestCase):
     def test_get_checkers(self):
         # this test is only to see is not crashing
         result = sut.get_checkers('clang', [])
diff --git a/tools/scan-build-py/tests/unit/test_command.py b/tools/scan-build-py/tests/unit/test_command.py
deleted file mode 100644
index 9a6aae6..0000000
--- a/tools/scan-build-py/tests/unit/test_command.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# -*- coding: utf-8 -*-
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-
-import libscanbuild.command as sut
-from . import fixtures
-import unittest
-
-
-class ParseTest(unittest.TestCase):
-
-    def test_action(self):
-        def test(expected, cmd):
-            opts = sut.classify_parameters(cmd)
-            self.assertEqual(expected, opts['action'])
-
-        Link = sut.Action.Link
-        test(Link, ['clang', 'source.c'])
-
-        Compile = sut.Action.Compile
-        test(Compile, ['clang', '-c', 'source.c'])
-        test(Compile, ['clang', '-c', 'source.c', '-MF', 'source.d'])
-
-        Preprocess = sut.Action.Ignored
-        test(Preprocess, ['clang', '-E', 'source.c'])
-        test(Preprocess, ['clang', '-c', '-E', 'source.c'])
-        test(Preprocess, ['clang', '-c', '-M', 'source.c'])
-        test(Preprocess, ['clang', '-c', '-MM', 'source.c'])
-
-    def test_optimalizations(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('compile_options', [])
-
-        self.assertEqual(['-O'],  test(['clang', '-c', 'source.c', '-O']))
-        self.assertEqual(['-O1'], test(['clang', '-c', 'source.c', '-O1']))
-        self.assertEqual(['-Os'], test(['clang', '-c', 'source.c', '-Os']))
-        self.assertEqual(['-O2'], test(['clang', '-c', 'source.c', '-O2']))
-        self.assertEqual(['-O3'], test(['clang', '-c', 'source.c', '-O3']))
-
-    def test_language(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('language')
-
-        self.assertEqual(None, test(['clang', '-c', 'source.c']))
-        self.assertEqual('c', test(['clang', '-c', 'source.c', '-x', 'c']))
-        self.assertEqual('cpp', test(['clang', '-c', 'source.c', '-x', 'cpp']))
-
-    def test_output(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('output')
-
-        self.assertEqual(None, test(['clang', '-c', 'source.c']))
-        self.assertEqual('source.o',
-                         test(['clang', '-c', '-o', 'source.o', 'source.c']))
-
-    def test_arch(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('archs_seen', [])
-
-        eq = self.assertEqual
-
-        eq([], test(['clang', '-c', 'source.c']))
-        eq(['mips'],
-           test(['clang', '-c', 'source.c', '-arch', 'mips']))
-        eq(['mips', 'i386'],
-           test(['clang', '-c', 'source.c', '-arch', 'mips', '-arch', 'i386']))
-
-    def test_input_file(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('files', [])
-
-        eq = self.assertEqual
-
-        eq(['src.c'], test(['clang', 'src.c']))
-        eq(['src.c'], test(['clang', '-c', 'src.c']))
-        eq(['s1.c', 's2.c'], test(['clang', '-c', 's1.c', 's2.c']))
-
-    def test_include(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('compile_options', [])
-
-        eq = self.assertEqual
-
-        eq([], test(['clang', '-c', 'src.c']))
-        eq(['-include', '/usr/local/include'],
-           test(['clang', '-c', 'src.c', '-include', '/usr/local/include']))
-        eq(['-I.'],
-           test(['clang', '-c', 'src.c', '-I.']))
-        eq(['-I', '.'],
-           test(['clang', '-c', 'src.c', '-I', '.']))
-        eq(['-I/usr/local/include'],
-           test(['clang', '-c', 'src.c', '-I/usr/local/include']))
-        eq(['-I', '/usr/local/include'],
-           test(['clang', '-c', 'src.c', '-I', '/usr/local/include']))
-        eq(['-I/opt', '-I', '/opt/otp/include'],
-           test(['clang', '-c', 'src.c', '-I/opt', '-I', '/opt/otp/include']))
-        eq(['-isystem', '/path'],
-           test(['clang', '-c', 'src.c', '-isystem', '/path']))
-        eq(['-isystem=/path'],
-           test(['clang', '-c', 'src.c', '-isystem=/path']))
-
-    def test_define(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('compile_options', [])
-
-        eq = self.assertEqual
-
-        eq([], test(['clang', '-c', 'src.c']))
-        eq(['-DNDEBUG'],
-           test(['clang', '-c', 'src.c', '-DNDEBUG']))
-        eq(['-UNDEBUG'],
-           test(['clang', '-c', 'src.c', '-UNDEBUG']))
-        eq(['-Dvar1=val1', '-Dvar2=val2'],
-           test(['clang', '-c', 'src.c', '-Dvar1=val1', '-Dvar2=val2']))
-        eq(['-Dvar="val ues"'],
-           test(['clang', '-c', 'src.c', '-Dvar="val ues"']))
-
-    def test_ignored_flags(self):
-        def test(flags):
-            cmd = ['clang', 'src.o']
-            opts = sut.classify_parameters(cmd + flags)
-            self.assertEqual(['src.o'], opts.get('compile_options'))
-
-        test([])
-        test(['-lrt', '-L/opt/company/lib'])
-        test(['-static'])
-        test(['-Wnoexcept', '-Wall'])
-        test(['-mtune=i386', '-mcpu=i386'])
-
-    def test_compile_only_flags(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('compile_options', [])
-
-        eq = self.assertEqual
-
-        eq(['-std=C99'],
-           test(['clang', '-c', 'src.c', '-std=C99']))
-        eq(['-nostdinc'],
-           test(['clang', '-c', 'src.c', '-nostdinc']))
-        eq(['-isystem', '/image/debian'],
-           test(['clang', '-c', 'src.c', '-isystem', '/image/debian']))
-        eq(['-iprefix', '/usr/local'],
-           test(['clang', '-c', 'src.c', '-iprefix', '/usr/local']))
-        eq(['-iquote=me'],
-           test(['clang', '-c', 'src.c', '-iquote=me']))
-        eq(['-iquote', 'me'],
-           test(['clang', '-c', 'src.c', '-iquote', 'me']))
-
-    def test_compile_and_link_flags(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('compile_options', [])
-
-        eq = self.assertEqual
-
-        eq(['-fsinged-char'],
-           test(['clang', '-c', 'src.c', '-fsinged-char']))
-        eq(['-fPIC'],
-           test(['clang', '-c', 'src.c', '-fPIC']))
-        eq(['-stdlib=libc++'],
-           test(['clang', '-c', 'src.c', '-stdlib=libc++']))
-        eq(['--sysroot', '/'],
-           test(['clang', '-c', 'src.c', '--sysroot', '/']))
-        eq(['-isysroot', '/'],
-           test(['clang', '-c', 'src.c', '-isysroot', '/']))
-        eq([],
-           test(['clang', '-c', 'src.c', '-fsyntax-only']))
-        eq([],
-           test(['clang', '-c', 'src.c', '-sectorder', 'a', 'b', 'c']))
-
-    def test_detect_cxx_from_compiler_name(self):
-        def test(cmd):
-            opts = sut.classify_parameters(cmd)
-            return opts.get('c++')
-
-        eq = self.assertEqual
-
-        eq(False, test(['cc', '-c', 'src.c']))
-        eq(True, test(['c++', '-c', 'src.c']))
-        eq(False, test(['clang', '-c', 'src.c']))
-        eq(True, test(['clang++', '-c', 'src.c']))
-        eq(False, test(['gcc', '-c', 'src.c']))
-        eq(True, test(['g++', '-c', 'src.c']))
diff --git a/tools/scan-build-py/tests/unit/test_compilation.py b/tools/scan-build-py/tests/unit/test_compilation.py
new file mode 100644
index 0000000..124feba
--- /dev/null
+++ b/tools/scan-build-py/tests/unit/test_compilation.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+
+import libscanbuild.compilation as sut
+import unittest
+
+
+class CompilerTest(unittest.TestCase):
+    # compiler_language(): non-None for compiler commands, None otherwise.
+    def test_is_compiler_call(self):
+        self.assertIsNotNone(sut.compiler_language(['clang']))
+        self.assertIsNotNone(sut.compiler_language(['clang-3.6']))
+        self.assertIsNotNone(sut.compiler_language(['clang++']))
+        self.assertIsNotNone(sut.compiler_language(['clang++-3.5.1']))
+        self.assertIsNotNone(sut.compiler_language(['cc']))
+        self.assertIsNotNone(sut.compiler_language(['c++']))
+        self.assertIsNotNone(sut.compiler_language(['gcc']))
+        self.assertIsNotNone(sut.compiler_language(['g++']))
+        self.assertIsNotNone(sut.compiler_language(['/usr/local/bin/gcc']))
+        self.assertIsNotNone(sut.compiler_language(['/usr/local/bin/g++']))
+        self.assertIsNotNone(sut.compiler_language(['/usr/local/bin/clang']))
+        self.assertIsNotNone(
+            sut.compiler_language(['armv7_neno-linux-gnueabi-g++']))
+        # An empty command, an empty name and non-compiler tools give None.
+        self.assertIsNone(sut.compiler_language([]))
+        self.assertIsNone(sut.compiler_language(['']))
+        self.assertIsNone(sut.compiler_language(['ld']))
+        self.assertIsNone(sut.compiler_language(['as']))
+        self.assertIsNone(sut.compiler_language(['/usr/local/bin/compiler']))
+
+
+class SplitTest(unittest.TestCase):
+    # Exercises split_command(): compiler kind, accepted actions, files, flags.
+    def test_detect_cxx_from_compiler_name(self):
+        def test(cmd):
+            result = sut.split_command([cmd, '-c', 'src.c'])
+            self.assertIsNotNone(result, "wrong input for test")
+            return result.compiler == 'c++'
+        # C driver names: result.compiler is expected not to be 'c++'.
+        self.assertFalse(test('cc'))
+        self.assertFalse(test('gcc'))
+        self.assertFalse(test('clang'))
+        # C++ driver names, including versioned and cross-prefixed ones.
+        self.assertTrue(test('c++'))
+        self.assertTrue(test('g++'))
+        self.assertTrue(test('g++-5.3.1'))
+        self.assertTrue(test('clang++'))
+        self.assertTrue(test('clang++-3.7.1'))
+        self.assertTrue(test('armv7_neno-linux-gnueabi-g++'))
+
+    def test_action(self):
+        self.assertIsNotNone(sut.split_command(['clang', 'source.c']))
+        self.assertIsNotNone(sut.split_command(['clang', '-c', 'source.c']))
+        self.assertIsNotNone(sut.split_command(['clang', '-c', 'source.c',
+                                                '-MF', 'a.d']))
+        # Commands carrying -E, -M or -MM are expected to give None.
+        self.assertIsNone(sut.split_command(['clang', '-E', 'source.c']))
+        self.assertIsNone(sut.split_command(['clang', '-c', '-E', 'source.c']))
+        self.assertIsNone(sut.split_command(['clang', '-c', '-M', 'source.c']))
+        self.assertIsNone(
+            sut.split_command(['clang', '-c', '-MM', 'source.c']))
+
+    def test_source_file(self):
+        def test(expected, cmd):
+            self.assertEqual(expected, sut.split_command(cmd).files)
+        # .files holds only the source arguments, not option values.
+        test(['src.c'], ['clang', 'src.c'])
+        test(['src.c'], ['clang', '-c', 'src.c'])
+        test(['src.C'], ['clang', '-x', 'c', 'src.C'])
+        test(['src.cpp'], ['clang++', '-c', 'src.cpp'])
+        test(['s1.c', 's2.c'], ['clang', '-c', 's1.c', 's2.c'])
+        test(['s1.c', 's2.c'], ['cc', 's1.c', 's2.c', '-ldep', '-o', 'a.out'])
+        test(['src.c'], ['clang', '-c', '-I', './include', 'src.c'])
+        test(['src.c'], ['clang', '-c', '-I', '/opt/me/include', 'src.c'])
+        test(['src.c'], ['clang', '-c', '-D', 'config=file.c', 'src.c'])
+        # Commands with only objects/libraries and no source give None.
+        self.assertIsNone(
+            sut.split_command(['cc', 'this.o', 'that.o', '-o', 'a.out']))
+        self.assertIsNone(
+            sut.split_command(['cc', 'this.o', '-lthat', '-o', 'a.out']))
+
+    def test_filter_flags(self):
+        def test(expected, flags):
+            command = ['clang', '-c', 'src.c'] + flags
+            self.assertEqual(expected, sut.split_command(command).flags)
+
+        def same(expected):
+            test(expected, expected)
+
+        def filtered(flags):
+            test([], flags)
+        # Flags expected to come back unchanged in .flags.
+        same([])
+        same(['-I', '/opt/me/include', '-DNDEBUG', '-ULIMITS'])
+        same(['-O', '-O2'])
+        same(['-m32', '-mmms'])
+        same(['-Wall', '-Wno-unused', '-g', '-funroll-loops'])
+        # Linker, -static and dependency-file flags are expected to be dropped.
+        filtered([])
+        filtered(['-lclien', '-L/opt/me/lib', '-L', '/opt/you/lib'])
+        filtered(['-static'])
+        filtered(['-MD', '-MT', 'something'])
+        filtered(['-MMD', '-MF', 'something'])
+
+
+class SourceClassifierTest(unittest.TestCase):
+    # Pins classify_source() results for object, binary and C source names.
+    def test_sources(self):
+        self.assertIsNone(sut.classify_source('file.o'))
+        self.assertIsNone(sut.classify_source('file.exe'))
+        self.assertIsNone(sut.classify_source('/path/file.o'))
+        self.assertIsNone(sut.classify_source('clang'))
+        # '.c' is 'c' by default and 'c++' when the 2nd argument is False.
+        self.assertEqual('c', sut.classify_source('file.c'))
+        self.assertEqual('c', sut.classify_source('./file.c'))
+        self.assertEqual('c', sut.classify_source('/path/file.c'))
+        self.assertEqual('c++', sut.classify_source('file.c', False))
+        self.assertEqual('c++', sut.classify_source('./file.c', False))
+        self.assertEqual('c++', sut.classify_source('/path/file.c', False))
diff --git a/tools/scan-build-py/tests/unit/test_intercept.py b/tools/scan-build-py/tests/unit/test_intercept.py
index b6f01f3..5b6ed2c 100644
--- a/tools/scan-build-py/tests/unit/test_intercept.py
+++ b/tools/scan-build-py/tests/unit/test_intercept.py
@@ -4,62 +4,37 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
+import libear
 import libscanbuild.intercept as sut
-from . import fixtures
+import unittest
 import os.path
 
 
-class InterceptUtilTest(fixtures.TestCase):
-
-    def test_is_compiler_call_filter(self):
-        def test(command):
-            return sut.is_compiler_call({'command': [command]})
-
-        self.assertTrue(test('clang'))
-        self.assertTrue(test('clang-3.6'))
-        self.assertTrue(test('clang++'))
-        self.assertTrue(test('clang++-3.5.1'))
-        self.assertTrue(test('cc'))
-        self.assertTrue(test('c++'))
-        self.assertTrue(test('gcc'))
-        self.assertTrue(test('g++'))
-        self.assertTrue(test('/usr/local/bin/gcc'))
-        self.assertTrue(test('/usr/local/bin/g++'))
-        self.assertTrue(test('/usr/local/bin/clang'))
-        self.assertTrue(test('armv7_neno-linux-gnueabi-g++'))
-
-        self.assertFalse(test(''))
-        self.assertFalse(test('ld'))
-        self.assertFalse(test('as'))
-        self.assertFalse(test('/usr/local/bin/compiler'))
+class InterceptUtilTest(unittest.TestCase):
 
     def test_format_entry_filters_action(self):
         def test(command):
-            return list(sut.format_entry(
-                {'command': command, 'directory': '/opt/src/project'}))
+            trace = {'command': command, 'directory': '/opt/src/project'}
+            return list(sut.format_entry(trace))
 
         self.assertTrue(test(['cc', '-c', 'file.c', '-o', 'file.o']))
         self.assertFalse(test(['cc', '-E', 'file.c']))
         self.assertFalse(test(['cc', '-MM', 'file.c']))
         self.assertFalse(test(['cc', 'this.o', 'that.o', '-o', 'a.out']))
-        self.assertFalse(test(['cc', '-print-prog-name']))
 
     def test_format_entry_normalize_filename(self):
-        directory = os.path.join(os.sep, 'home', 'me', 'project')
+        parent = os.path.join(os.sep, 'home', 'me')
+        current = os.path.join(parent, 'project')
 
-        def test(command):
-            result = list(sut.format_entry(
-                {'command': command, 'directory': directory}))
-            return result[0]['file']
+        def test(filename):
+            trace = {'directory': current, 'command': ['cc', '-c', filename]}
+            return list(sut.format_entry(trace))[0]['file']
 
-        self.assertEqual(test(['cc', '-c', 'file.c']),
-                         os.path.join(directory, 'file.c'))
-        self.assertEqual(test(['cc', '-c', './file.c']),
-                         os.path.join(directory, 'file.c'))
-        self.assertEqual(test(['cc', '-c', '../file.c']),
-                         os.path.join(os.path.dirname(directory), 'file.c'))
-        self.assertEqual(test(['cc', '-c', '/opt/file.c']),
-                         '/opt/file.c')
+        self.assertEqual(os.path.join(current, 'file.c'), test('file.c'))
+        self.assertEqual(os.path.join(current, 'file.c'), test('./file.c'))
+        self.assertEqual(os.path.join(parent, 'file.c'), test('../file.c'))
+        self.assertEqual(os.path.join(current, 'file.c'),
+                         test(os.path.join(current, 'file.c')))
 
     def test_sip(self):
         def create_status_report(filename, message):
@@ -92,7 +67,7 @@
         OSX = 'darwin'
         LINUX = 'linux'
 
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             try:
                 saved = os.environ['PATH']
                 os.environ['PATH'] = tmpdir + ':' + saved
diff --git a/tools/scan-build-py/tests/unit/test_libear.py b/tools/scan-build-py/tests/unit/test_libear.py
new file mode 100644
index 0000000..f5b9280
--- /dev/null
+++ b/tools/scan-build-py/tests/unit/test_libear.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+
+import libear as sut
+import unittest
+import os.path
+
+
+class TemporaryDirectoryTest(unittest.TestCase):
+    def test_creates_directory(self):
+        dirname = None  # remembers the path so it can be checked after exit
+        with sut.TemporaryDirectory() as tmpdir:
+            self.assertTrue(os.path.isdir(tmpdir))
+            dirname = tmpdir
+        self.assertIsNotNone(dirname)
+        self.assertFalse(os.path.exists(dirname))  # removed on normal exit
+
+    def test_removes_directory_when_exception(self):
+        dirname = None
+        # assertRaises lets only the injected error through and fails if none.
+        with self.assertRaises(RuntimeError):
+            with sut.TemporaryDirectory() as tmpdir:
+                self.assertTrue(os.path.isdir(tmpdir))
+                dirname = tmpdir
+                raise RuntimeError('message')
+        self.assertIsNotNone(dirname)
+        self.assertFalse(os.path.exists(dirname))
diff --git a/tools/scan-build-py/tests/unit/test_report.py b/tools/scan-build-py/tests/unit/test_report.py
index d505afc..3f249ce 100644
--- a/tools/scan-build-py/tests/unit/test_report.py
+++ b/tools/scan-build-py/tests/unit/test_report.py
@@ -4,15 +4,15 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
+import libear
 import libscanbuild.report as sut
-from . import fixtures
 import unittest
 import os
 import os.path
 
 
 def run_bug_parse(content):
-    with fixtures.TempDir() as tmpdir:
+    with libear.TemporaryDirectory() as tmpdir:
         file_name = os.path.join(tmpdir, 'test.html')
         with open(file_name, 'w') as handle:
             handle.writelines(content)
@@ -21,7 +21,7 @@
 
 
 def run_crash_parse(content, preproc):
-    with fixtures.TempDir() as tmpdir:
+    with libear.TemporaryDirectory() as tmpdir:
         file_name = os.path.join(tmpdir, preproc + '.info.txt')
         with open(file_name, 'w') as handle:
             handle.writelines(content)
@@ -77,20 +77,22 @@
     def test_parse_real_crash(self):
         import libscanbuild.runner as sut2
         import re
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             filename = os.path.join(tmpdir, 'test.c')
             with open(filename, 'w') as handle:
                 handle.write('int main() { return 0')
             # produce failure report
-            opts = {'directory': os.getcwd(),
-                    'clang': 'clang',
-                    'file': filename,
-                    'report': ['-fsyntax-only', '-E', filename],
-                    'language': 'c',
-                    'output_dir': tmpdir,
-                    'error_type': 'other_error',
-                    'error_output': 'some output',
-                    'exit_code': 13}
+            opts = {
+                'clang': 'clang',
+                'directory': os.getcwd(),
+                'flags': [],
+                'file': filename,
+                'output_dir': tmpdir,
+                'language': 'c',
+                'error_type': 'other_error',
+                'error_output': 'some output',
+                'exit_code': 13
+            }
             sut2.report_failure(opts)
             # find the info file
             pp_file = None
@@ -123,7 +125,7 @@
                                                  '/prefix/src/file'))
 
 
-class GetPrefixFromCompilationDatabaseTest(fixtures.TestCase):
+class GetPrefixFromCompilationDatabaseTest(unittest.TestCase):
 
     def test_with_different_filenames(self):
         self.assertEqual(
diff --git a/tools/scan-build-py/tests/unit/test_runner.py b/tools/scan-build-py/tests/unit/test_runner.py
index de15d23..b4730a1 100644
--- a/tools/scan-build-py/tests/unit/test_runner.py
+++ b/tools/scan-build-py/tests/unit/test_runner.py
@@ -4,96 +4,164 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
+import libear
 import libscanbuild.runner as sut
-from . import fixtures
 import unittest
 import re
 import os
 import os.path
 
 
-def run_analyzer(content, opts):
-    with fixtures.TempDir() as tmpdir:
-        filename = os.path.join(tmpdir, 'test.cpp')
-        with open(filename, 'w') as handle:
-            handle.write(content)
+class FilteringFlagsTest(unittest.TestCase):
 
-        opts.update({
-            'directory': os.getcwd(),
-            'clang': 'clang',
-            'file': filename,
-            'language': 'c++',
-            'analyze': ['--analyze', '-x', 'c++', filename],
-            'output': ['-o', tmpdir]})
-        spy = fixtures.Spy()
-        result = sut.run_analyzer(opts, spy.call)
-        return (result, spy.arg)
+    def test_language_captured(self):
+        def test(flags):
+            cmd = ['clang', '-c', 'source.c'] + flags
+            opts = sut.classify_parameters(cmd)
+            return opts['language']
+
+        self.assertEqual(None, test([]))
+        self.assertEqual('c', test(['-x', 'c']))
+        self.assertEqual('cpp', test(['-x', 'cpp']))
+
+    def test_arch(self):
+        def test(flags):
+            cmd = ['clang', '-c', 'source.c'] + flags
+            opts = sut.classify_parameters(cmd)
+            return opts['arch_list']
+
+        self.assertEqual([], test([]))
+        self.assertEqual(['mips'], test(['-arch', 'mips']))
+        self.assertEqual(['mips', 'i386'],
+                         test(['-arch', 'mips', '-arch', 'i386']))
+
+    def assertFlagsChanged(self, expected, flags):
+        cmd = ['clang', '-c', 'source.c'] + flags
+        opts = sut.classify_parameters(cmd)
+        self.assertEqual(expected, opts['flags'])
+
+    def assertFlagsUnchanged(self, flags):
+        self.assertFlagsChanged(flags, flags)
+
+    def assertFlagsFiltered(self, flags):
+        self.assertFlagsChanged([], flags)
+
+    def test_optimizations_pass(self):
+        self.assertFlagsUnchanged(['-O'])
+        self.assertFlagsUnchanged(['-O1'])
+        self.assertFlagsUnchanged(['-Os'])
+        self.assertFlagsUnchanged(['-O2'])
+        self.assertFlagsUnchanged(['-O3'])
+
+    def test_include_pass(self):
+        self.assertFlagsUnchanged([])
+        self.assertFlagsUnchanged(['-include', '/usr/local/include'])
+        self.assertFlagsUnchanged(['-I.'])
+        self.assertFlagsUnchanged(['-I', '.'])
+        self.assertFlagsUnchanged(['-I/usr/local/include'])
+        self.assertFlagsUnchanged(['-I', '/usr/local/include'])
+        self.assertFlagsUnchanged(['-I/opt', '-I', '/opt/otp/include'])
+        self.assertFlagsUnchanged(['-isystem', '/path'])
+        self.assertFlagsUnchanged(['-isystem=/path'])
+
+    def test_define_pass(self):
+        self.assertFlagsUnchanged(['-DNDEBUG'])
+        self.assertFlagsUnchanged(['-UNDEBUG'])
+        self.assertFlagsUnchanged(['-Dvar1=val1', '-Dvar2=val2'])
+        self.assertFlagsUnchanged(['-Dvar="val ues"'])
+
+    def test_output_filtered(self):
+        self.assertFlagsFiltered(['-o', 'source.o'])
+
+    def test_some_warning_filtered(self):
+        self.assertFlagsFiltered(['-Wall'])
+        self.assertFlagsFiltered(['-Wnoexcept'])
+        self.assertFlagsFiltered(['-Wreorder', '-Wunused', '-Wundef'])
+        self.assertFlagsUnchanged(['-Wno-reorder', '-Wno-unused'])
+
+    def test_compile_only_flags_pass(self):
+        self.assertFlagsUnchanged(['-std=C99'])
+        self.assertFlagsUnchanged(['-nostdinc'])
+        self.assertFlagsUnchanged(['-isystem', '/image/debian'])
+        self.assertFlagsUnchanged(['-iprefix', '/usr/local'])
+        self.assertFlagsUnchanged(['-iquote=me'])
+        self.assertFlagsUnchanged(['-iquote', 'me'])
+
+    def test_compile_and_link_flags_pass(self):
+        self.assertFlagsUnchanged(['-fsigned-char'])
+        self.assertFlagsUnchanged(['-fPIC'])
+        self.assertFlagsUnchanged(['-stdlib=libc++'])
+        self.assertFlagsUnchanged(['--sysroot', '/'])
+        self.assertFlagsUnchanged(['-isysroot', '/'])
+
+    def test_some_flags_filtered(self):
+        self.assertFlagsFiltered(['-g'])
+        self.assertFlagsFiltered(['-fsyntax-only'])
+        self.assertFlagsFiltered(['-save-temps'])
+        self.assertFlagsFiltered(['-init', 'my_init'])
+        self.assertFlagsFiltered(['-sectorder', 'a', 'b', 'c'])
+
+
+class Spy(object):
+    def __init__(self):
+        self.arg = None
+        self.success = 0
+
+    def call(self, params):
+        self.arg = params
+        return self.success
 
 
 class RunAnalyzerTest(unittest.TestCase):
 
+    @staticmethod
+    def run_analyzer(content, failures_report):
+        with libear.TemporaryDirectory() as tmpdir:
+            filename = os.path.join(tmpdir, 'test.cpp')
+            with open(filename, 'w') as handle:
+                handle.write(content)
+
+            opts = {
+                'clang': 'clang',
+                'directory': os.getcwd(),
+                'flags': [],
+                'direct_args': [],
+                'file': filename,
+                'output_dir': tmpdir,
+                'output_format': 'plist',
+                'output_failures': failures_report
+            }
+            spy = Spy()
+            result = sut.run_analyzer(opts, spy.call)
+            return (result, spy.arg)
+
     def test_run_analyzer(self):
         content = "int div(int n, int d) { return n / d; }"
-        (result, fwds) = run_analyzer(content, dict())
+        (result, fwds) = RunAnalyzerTest.run_analyzer(content, False)
         self.assertEqual(None, fwds)
         self.assertEqual(0, result['exit_code'])
 
     def test_run_analyzer_crash(self):
         content = "int div(int n, int d) { return n / d }"
-        (result, fwds) = run_analyzer(content, dict())
+        (result, fwds) = RunAnalyzerTest.run_analyzer(content, False)
         self.assertEqual(None, fwds)
         self.assertEqual(1, result['exit_code'])
 
     def test_run_analyzer_crash_and_forwarded(self):
         content = "int div(int n, int d) { return n / d }"
-        (_, fwds) = run_analyzer(content, {'output_failures': True})
+        (_, fwds) = RunAnalyzerTest.run_analyzer(content, True)
         self.assertEqual('crash', fwds['error_type'])
         self.assertEqual(1, fwds['exit_code'])
         self.assertTrue(len(fwds['error_output']) > 0)
 
 
-class SetAnalyzerOutputTest(fixtures.TestCase):
-
-    def test_not_defined(self):
-        with fixtures.TempDir() as tmpdir:
-            opts = {'output_dir': tmpdir}
-            spy = fixtures.Spy()
-            sut.set_analyzer_output(opts, spy.call)
-            self.assertTrue(os.path.exists(spy.arg['output'][1]))
-            self.assertTrue(os.path.isdir(spy.arg['output'][1]))
-
-    def test_html(self):
-        with fixtures.TempDir() as tmpdir:
-            opts = {'output_dir': tmpdir, 'output_format': 'html'}
-            spy = fixtures.Spy()
-            sut.set_analyzer_output(opts, spy.call)
-            self.assertTrue(os.path.exists(spy.arg['output'][1]))
-            self.assertTrue(os.path.isdir(spy.arg['output'][1]))
-
-    def test_plist_html(self):
-        with fixtures.TempDir() as tmpdir:
-            opts = {'output_dir': tmpdir, 'output_format': 'plist-html'}
-            spy = fixtures.Spy()
-            sut.set_analyzer_output(opts, spy.call)
-            self.assertTrue(os.path.exists(spy.arg['output'][1]))
-            self.assertTrue(os.path.isfile(spy.arg['output'][1]))
-
-    def test_plist(self):
-        with fixtures.TempDir() as tmpdir:
-            opts = {'output_dir': tmpdir, 'output_format': 'plist'}
-            spy = fixtures.Spy()
-            sut.set_analyzer_output(opts, spy.call)
-            self.assertTrue(os.path.exists(spy.arg['output'][1]))
-            self.assertTrue(os.path.isfile(spy.arg['output'][1]))
-
-
-class ReportFailureTest(fixtures.TestCase):
+class ReportFailureTest(unittest.TestCase):
 
     def assertUnderFailures(self, path):
         self.assertEqual('failures', os.path.basename(os.path.dirname(path)))
 
     def test_report_failure_create_files(self):
-        with fixtures.TempDir() as tmpdir:
+        with libear.TemporaryDirectory() as tmpdir:
             # create input file
             filename = os.path.join(tmpdir, 'test.c')
             with open(filename, 'w') as handle:
@@ -101,15 +169,17 @@
             uname_msg = ' '.join(os.uname()) + os.linesep
             error_msg = 'this is my error output'
             # execute test
-            opts = {'directory': os.getcwd(),
-                    'clang': 'clang',
-                    'file': filename,
-                    'report': ['-fsyntax-only', '-E', filename],
-                    'language': 'c',
-                    'output_dir': tmpdir,
-                    'error_type': 'other_error',
-                    'error_output': error_msg,
-                    'exit_code': 13}
+            opts = {
+                'clang': 'clang',
+                'directory': os.getcwd(),
+                'flags': [],
+                'file': filename,
+                'output_dir': tmpdir,
+                'language': 'c',
+                'error_type': 'other_error',
+                'error_output': error_msg,
+                'exit_code': 13
+            }
             sut.report_failure(opts)
             # verify the result
             result = dict()
@@ -126,57 +196,110 @@
             self.assertUnderFailures(pp_file)
             # info file generated and content dumped
             info_file = pp_file + '.info.txt'
-            self.assertIn(info_file, result)
+            self.assertTrue(info_file in result)
             self.assertEqual('Other Error\n', result[info_file][1])
             self.assertEqual(uname_msg, result[info_file][3])
             # error file generated and content dumped
             error_file = pp_file + '.stderr.txt'
-            self.assertIn(error_file, result)
+            self.assertTrue(error_file in result)
             self.assertEqual([error_msg], result[error_file])
 
 
 class AnalyzerTest(unittest.TestCase):
 
-    def test_set_language(self):
+    def test_nodebug_macros_appended(self):
+        def test(flags):
+            spy = Spy()
+            opts = {'flags': flags, 'force_debug': True}
+            self.assertEqual(spy.success,
+                             sut.filter_debug_flags(opts, spy.call))
+            return spy.arg['flags']
+
+        self.assertEqual(['-UNDEBUG'], test([]))
+        self.assertEqual(['-DNDEBUG', '-UNDEBUG'], test(['-DNDEBUG']))
+        self.assertEqual(['-DSomething', '-UNDEBUG'], test(['-DSomething']))
+
+    def test_set_file_relative_path(self):
         def test(expected, input):
-            spy = fixtures.Spy()
+            spy = Spy()
+            self.assertEqual(spy.success,
+                             sut.set_file_path_relative(input, spy.call))
+            self.assertEqual(expected, spy.arg['file'])
+
+        test('source.c',
+             {'file': '/home/me/source.c', 'directory': '/home/me'})
+        test('me/source.c',
+             {'file': '/home/me/source.c', 'directory': '/home'})
+        test('../home/me/source.c',
+             {'file': '/home/me/source.c', 'directory': '/tmp'})
+
+    def test_set_language_fall_through(self):
+        def language(expected, input):
+            spy = Spy()
+            input.update({'compiler': 'c', 'file': 'test.c'})
             self.assertEqual(spy.success, sut.language_check(input, spy.call))
             self.assertEqual(expected, spy.arg['language'])
 
-        l = 'language'
-        f = 'file'
-        i = 'c++'
-        test('c',   {f: 'file.c', l: 'c', i: False})
-        test('c++', {f: 'file.c', l: 'c++', i: False})
-        test('c++', {f: 'file.c', i: True})
-        test('c',   {f: 'file.c', i: False})
-        test('c++', {f: 'file.cxx', i: False})
-        test('c-cpp-output',   {f: 'file.i', i: False})
-        test('c++-cpp-output', {f: 'file.i', i: True})
-        test('c-cpp-output',   {f: 'f.i', l: 'c-cpp-output', i: True})
+        language('c',   {'language': 'c', 'flags': []})
+        language('c++', {'language': 'c++', 'flags': []})
 
-    def test_arch_loop(self):
-        def test(input):
-            spy = fixtures.Spy()
+    def test_set_language_stops_on_not_supported(self):
+        spy = Spy()
+        input = {
+            'compiler': 'c',
+            'flags': [],
+            'file': 'test.java',
+            'language': 'java'
+        }
+        self.assertIsNone(sut.language_check(input, spy.call))
+        self.assertIsNone(spy.arg)
+
+    def test_set_language_sets_flags(self):
+        def flags(expected, input):
+            spy = Spy()
+            input.update({'compiler': 'c', 'file': 'test.c'})
+            self.assertEqual(spy.success, sut.language_check(input, spy.call))
+            self.assertEqual(expected, spy.arg['flags'])
+
+        flags(['-x', 'c'],   {'language': 'c', 'flags': []})
+        flags(['-x', 'c++'], {'language': 'c++', 'flags': []})
+
+    def test_set_language_from_filename(self):
+        def language(expected, input):
+            spy = Spy()
+            input.update({'language': None, 'flags': []})
+            self.assertEqual(spy.success, sut.language_check(input, spy.call))
+            self.assertEqual(expected, spy.arg['language'])
+
+        language('c',   {'file': 'file.c',   'compiler': 'c'})
+        language('c++', {'file': 'file.c',   'compiler': 'c++'})
+        language('c++', {'file': 'file.cxx', 'compiler': 'c'})
+        language('c++', {'file': 'file.cxx', 'compiler': 'c++'})
+        language('c++', {'file': 'file.cpp', 'compiler': 'c++'})
+        language('c-cpp-output',   {'file': 'file.i', 'compiler': 'c'})
+        language('c++-cpp-output', {'file': 'file.i', 'compiler': 'c++'})
+
+    def test_arch_loop_sets_flags(self):
+        def flags(archs):
+            spy = Spy()
+            input = {'flags': [], 'arch_list': archs}
             sut.arch_check(input, spy.call)
-            return spy.arg
+            return spy.arg['flags']
 
-        input = {'key': 'value'}
-        self.assertEqual(input, test(input))
+        self.assertEqual([], flags([]))
+        self.assertEqual(['-arch', 'i386'], flags(['i386']))
+        self.assertEqual(['-arch', 'i386'], flags(['i386', 'ppc']))
+        self.assertEqual(['-arch', 'sparc'], flags(['i386', 'sparc']))
 
-        input = {'archs_seen': ['i386']}
-        self.assertEqual({'arch': 'i386'}, test(input))
+    def test_arch_loop_stops_on_not_supported(self):
+        def stop(archs):
+            spy = Spy()
+            input = {'flags': [], 'arch_list': archs}
+            self.assertIsNone(sut.arch_check(input, spy.call))
+            self.assertIsNone(spy.arg)
 
-        input = {'archs_seen': ['ppc']}
-        self.assertEqual(None, test(input))
-
-        input = {'archs_seen': ['i386', 'ppc']}
-        self.assertEqual({'arch': 'i386'}, test(input))
-
-        input = {'archs_seen': ['i386', 'sparc']}
-        result = test(input)
-        self.assertTrue(result == {'arch': 'i386'} or
-                        result == {'arch': 'sparc'})
+        stop(['ppc'])
+        stop(['ppc64'])
 
 
 @sut.require([])
@@ -211,14 +334,3 @@
 
     def test_method_exception_not_caught(self):
         self.assertRaises(Exception, method_exception_from_inside, dict())
-
-class ForceAnalyzeDebugTest(unittest.TestCase):
-
-    def test_force_analyze_debug_code(self):
-        for a, b in [
-                ([], ['-UNDEBUG']),
-                (['-O2'], ['-O2', '-UNDEBUG']),
-                (['-Dkey=val'], ['-Dkey=val', '-UNDEBUG']),
-                (['-D', 'NDEBUG'], ['-D', 'NDEBUG', '-UNDEBUG']) ]:
-            sut.force_analyze_debug_code(a)
-            self.assertEqual(a, b)
diff --git a/tools/scan-build/Makefile b/tools/scan-build/Makefile
deleted file mode 100644
index 23aa198..0000000
--- a/tools/scan-build/Makefile
+++ /dev/null
@@ -1,53 +0,0 @@
-##===- tools/scan-build/Makefile ---------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-
-include $(CLANG_LEVEL)/../../Makefile.config
-include $(CLANG_LEVEL)/Makefile
-
-ifeq ($(HOST_OS),MingW)
-  Suffix := .bat
-endif
-
-CLANG_INSTALL_SCANBUILD ?= 1
-
-ifeq ($(CLANG_INSTALL_SCANBUILD), 1)
-  InstallTargets := $(ToolDir)/scan-build$(Suffix) \
-                    $(LibexecDir)/c++-analyzer$(Suffix) \
-                    $(LibexecDir)/ccc-analyzer$(Suffix) \
-                    $(ShareDir)/scan-build/scanview.css \
-                    $(ShareDir)/scan-build/sorttable.js \
-                    $(ShareDir)/man/man1/scan-build.1
-
-  ifeq ($(HOST_OS),Darwin)
-    InstallTargets := $(InstallTargets) $(ToolDir)/set-xcode-analyzer
-  endif
-endif
-
-all:: $(InstallTargets)
-
-$(ToolDir)/%: bin/% Makefile $(ToolDir)/.dir
-	$(Echo) "Copying $(notdir $<) to the 'bin' directory..."
-	$(Verb)cp $< $@
-	$(Verb)chmod +x $@
-
-$(LibexecDir)/%: libexec/% Makefile $(LibexecDir)/.dir
-	$(Echo) "Copying $(notdir $<) to the 'libexec' directory..."
-	$(Verb)cp $< $@
-	$(Verb)chmod +x $@
-
-$(ShareDir)/man/man1/%: man/% Makefile $(ShareDir)/man/man1/.dir
-	$(Echo) "Copying $(notdir $<) to the 'man' directory..."
-	$(Verb)cp $< $@
-
-$(ShareDir)/scan-build/%: share/scan-build/% Makefile $(ShareDir)/scan-build/.dir
-	$(Echo) "Copying $(notdir $<) to the 'share' directory..."
-	$(Verb)cp $< $@
-
diff --git a/tools/scan-build/bin/scan-build b/tools/scan-build/bin/scan-build
index 3182a29..cbf3bf3 100755
--- a/tools/scan-build/bin/scan-build
+++ b/tools/scan-build/bin/scan-build
@@ -53,6 +53,7 @@
   IgnoreErrors => 0,         # Ignore build errors.
   ViewResults => 0,          # View results when the build terminates.
   ExitStatusFoundBugs => 0,  # Exit status reflects whether bugs were found
+  ShowDescription => 0,      # Display the description of the defect in the list
   KeepEmpty => 0,            # Don't remove output directory even with 0 results.
   EnableCheckers => {},
   DisableCheckers => {},
@@ -453,6 +454,10 @@
 
   push @$Index,[ $FName, $BugCategory, $BugType, $BugFile, $BugFunction, $BugLine,
                  $BugPathLength ];
+
+  if ($Options{ShowDescription}) {
+      push @{ $Index->[-1] }, $BugDescription
+  }
 }
 
 ##----------------------------------------------------------------------------##
@@ -746,6 +751,15 @@
   <td>Function/Method</td>
   <td class="Q">Line</td>
   <td class="Q">Path Length</td>
+ENDTEXT
+
+if ($Options{ShowDescription}) {
+print OUT <<ENDTEXT;
+    <td class="Q">Description</td>
+ENDTEXT
+}
+
+print OUT <<ENDTEXT;
   <td class="sorttable_nosort"></td>
   <!-- REPORTBUGCOL -->
 </tr></thead>
@@ -771,10 +785,10 @@
 
       print OUT "<tr class=\"bt_$x\">";
       print OUT "<td class=\"DESC\">";
-      print OUT $row->[1];
+      print OUT $row->[1]; # $BugCategory
       print OUT "</td>";
       print OUT "<td class=\"DESC\">";
-      print OUT $row->[2];
+      print OUT $row->[2]; # $BugType
       print OUT "</td>";
 
       # Update the file prefix.
@@ -802,11 +816,11 @@
       print OUT "</td>";
 
       print OUT "<td class=\"DESC\">";
-      print OUT $row->[4];
+      print OUT $row->[4]; # Function
       print OUT "</td>";
 
       # Print out the quantities.
-      for my $j ( 5 .. 6 ) {
+      for my $j ( 5 .. 6 ) { # Line & Path length
         print OUT "<td class=\"Q\">$row->[$j]</td>";
       }
 
@@ -1150,6 +1164,10 @@
    Specify the title used on generated HTML pages. If not specified, a default
    title will be used.
 
+ --show-description
+
+   Display the description of defects in the list.
+
  -plist
 
    By default the output of scan-build is a set of HTML files. This option
@@ -1586,6 +1604,12 @@
       next;
     }
 
+    if ($arg eq "--show-description") {
+      shift @$Args;
+      $Options{ShowDescription} = 1;
+      next;
+    }
+
     if ($arg eq "-store") {
       shift @$Args;
       $Options{StoreModel} = shift @$Args;
diff --git a/tools/scan-view/Makefile b/tools/scan-view/Makefile
deleted file mode 100644
index 37e4404..0000000
--- a/tools/scan-view/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-##===- tools/scan-view/Makefile ----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL := ../..
-
-include $(CLANG_LEVEL)/../../Makefile.config
-include $(CLANG_LEVEL)/Makefile
-
-CLANG_INSTALL_SCANVIEW ?= 1
-
-ifeq ($(CLANG_INSTALL_SCANVIEW), 1)
-  InstallTargets := $(ToolDir)/scan-view \
-                    $(ShareDir)/scan-view/Reporter.py \
-                    $(ShareDir)/scan-view/ScanView.py \
-                    $(ShareDir)/scan-view/startfile.py \
-                    $(ShareDir)/scan-view/FileRadar.scpt \
-                    $(ShareDir)/scan-view/GetRadarVersion.scpt \
-                    $(ShareDir)/scan-view/bugcatcher.ico
-endif
-
-all:: $(InstallTargets)
-
-$(ToolDir)/%: bin/% Makefile $(ToolDir)/.dir
-	$(Echo) "Copying $(notdir $<) to the 'bin' directory..."
-	$(Verb)cp $< $@
-	$(Verb)chmod +x $@
-
-$(ShareDir)/scan-view/%: share/% Makefile $(ShareDir)/scan-view/.dir
-	$(Echo) "Copying $(notdir $<) to the 'share' directory..."
-	$(Verb)cp $< $@
-
diff --git a/unittests/AST/ASTContextParentMapTest.cpp b/unittests/AST/ASTContextParentMapTest.cpp
index b1d7db4..a391896 100644
--- a/unittests/AST/ASTContextParentMapTest.cpp
+++ b/unittests/AST/ASTContextParentMapTest.cpp
@@ -21,10 +21,6 @@
 namespace clang {
 namespace ast_matchers {
 
-using clang::tooling::newFrontendActionFactory;
-using clang::tooling::runToolOnCodeWithArgs;
-using clang::tooling::FrontendActionFactory;
-
 TEST(GetParents, ReturnsParentForDecl) {
   MatchVerifier<Decl> Verifier;
   EXPECT_TRUE(
diff --git a/unittests/AST/ASTImporterTest.cpp b/unittests/AST/ASTImporterTest.cpp
new file mode 100644
index 0000000..3cc38fb
--- /dev/null
+++ b/unittests/AST/ASTImporterTest.cpp
@@ -0,0 +1,460 @@
+//===- unittest/AST/ASTImporterTest.cpp - AST node import test ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for the correct import of AST nodes from one AST context to another.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTImporter.h"
+#include "MatchVerifier.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace ast_matchers {
+
+typedef std::vector<std::string> StringVector;
+
+void getLangArgs(Language Lang, StringVector &Args) {
+  switch (Lang) {
+  case Lang_C:
+    Args.insert(Args.end(), { "-x", "c", "-std=c99" });
+    break;
+  case Lang_C89:
+    Args.insert(Args.end(), { "-x", "c", "-std=c89" });
+    break;
+  case Lang_CXX:
+    Args.push_back("-std=c++98");
+    break;
+  case Lang_CXX11:
+    Args.push_back("-std=c++11");
+    break;
+  case Lang_OpenCL:
+  case Lang_OBJCXX:
+    break;
+  }
+}
+
+template<typename NodeType, typename MatcherType>
+testing::AssertionResult
+testImport(const std::string &FromCode, Language FromLang,
+           const std::string &ToCode, Language ToLang,
+           MatchVerifier<NodeType> &Verifier,
+           const MatcherType &AMatcher) {
+  StringVector FromArgs, ToArgs;
+  getLangArgs(FromLang, FromArgs);
+  getLangArgs(ToLang, ToArgs);
+
+  const char *const InputFileName = "input.cc";
+  const char *const OutputFileName = "output.cc";
+
+  std::unique_ptr<ASTUnit>
+      FromAST = tooling::buildASTFromCodeWithArgs(
+        FromCode, FromArgs, InputFileName),
+      ToAST = tooling::buildASTFromCodeWithArgs(ToCode, ToArgs, OutputFileName);
+
+  ASTContext &FromCtx = FromAST->getASTContext(),
+      &ToCtx = ToAST->getASTContext();
+
+  // Add input.cc to virtual file system so importer can 'find' it
+  // while importing SourceLocations.
+  vfs::OverlayFileSystem *OFS = static_cast<vfs::OverlayFileSystem *>(
+        ToCtx.getSourceManager().getFileManager().getVirtualFileSystem().get());
+  vfs::InMemoryFileSystem *MFS = static_cast<vfs::InMemoryFileSystem *>(
+        OFS->overlays_begin()->get());
+  MFS->addFile(InputFileName, 0,
+               llvm::MemoryBuffer::getMemBuffer(FromCode.c_str()));
+
+  ASTImporter Importer(ToCtx, ToAST->getFileManager(),
+                       FromCtx, FromAST->getFileManager(), false);
+
+  IdentifierInfo *ImportedII = &FromCtx.Idents.get("declToImport");
+  assert(ImportedII && "Declaration with 'declToImport' name "
+                       "should be specified in test!");
+  DeclarationName ImportDeclName(ImportedII);
+  SmallVector<NamedDecl *, 4> FoundDecls;
+  FromCtx.getTranslationUnitDecl()->localUncachedLookup(
+        ImportDeclName, FoundDecls);
+
+  if (FoundDecls.size() != 1)
+    return testing::AssertionFailure() << "Expected exactly one declaration!";
+
+  auto Imported = Importer.Import(*FoundDecls.begin());
+  if (!Imported)
+    return testing::AssertionFailure() << "Import failed, nullptr returned!";
+
+  // This should dump source locations and assert if some source locations
+  // were not imported
+  SmallString<1024> ImportChecker;
+  llvm::raw_svector_ostream ToNothing(ImportChecker);
+  ToCtx.getTranslationUnitDecl()->print(ToNothing);
+
+  return Verifier.match(Imported, AMatcher);
+}
+
+TEST(ImportExpr, ImportStringLiteral) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(testImport("void declToImport() { \"foo\"; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 stringLiteral(
+                                   hasType(
+                                     asString("const char [4]")))))))));
+  EXPECT_TRUE(testImport("void declToImport() { L\"foo\"; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 stringLiteral(
+                                   hasType(
+                                     asString("const wchar_t [4]")))))))));
+  EXPECT_TRUE(testImport("void declToImport() { \"foo\" \"bar\"; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 stringLiteral(
+                                   hasType(
+                                     asString("const char [7]")))))))));
+}
+
+TEST(ImportExpr, ImportGNUNullExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(testImport("void declToImport() { __null; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 gnuNullExpr(
+                                   hasType(isInteger()))))))));
+}
+
+TEST(ImportExpr, ImportCXXNullPtrLiteralExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(testImport("void declToImport() { nullptr; }",
+                         Lang_CXX11, "", Lang_CXX11, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 cxxNullPtrLiteralExpr()))))));
+}
+
+
+TEST(ImportExpr, ImportFloatingLiteralExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(testImport("void declToImport() { 1.0; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 floatLiteral(
+                                   equals(1.0),
+                                   hasType(asString("double")))))))));
+  EXPECT_TRUE(testImport("void declToImport() { 1.0e-5f; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 floatLiteral(
+                                   equals(1.0e-5f),
+                                   hasType(asString("float")))))))));
+}
+
+TEST(ImportExpr, ImportCompoundLiteralExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport(
+          "void declToImport() {"
+          "  struct s { int x; long y; unsigned z; }; "
+          "  (struct s){ 42, 0L, 1U }; }",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          functionDecl(
+            hasBody(
+              compoundStmt(
+                has(
+                  compoundLiteralExpr(
+                    hasType(asString("struct s")),
+                    has(initListExpr(
+                      hasType(asString("struct s")),
+                      has(integerLiteral(
+                            equals(42), hasType(asString("int")))),
+                      has(integerLiteral(
+                            equals(0), hasType(asString("long")))),
+                      has(integerLiteral(
+                            equals(1),
+                            hasType(asString("unsigned int"))))
+                      )))))))));
+}
+
+TEST(ImportExpr, ImportCXXThisExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport("class declToImport { void f() { this; } };",
+                   Lang_CXX, "", Lang_CXX, Verifier,
+                   cxxRecordDecl(
+                     hasMethod(
+                       hasBody(
+                         compoundStmt(
+                           has(
+                             cxxThisExpr(
+                               hasType(
+                                 asString("class declToImport *"))))))))));
+}
+
+TEST(ImportExpr, ImportAtomicExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(testImport(
+      "void declToImport() { int *ptr; __atomic_load_n(ptr, 1); }", Lang_CXX,
+      "", Lang_CXX, Verifier,
+      functionDecl(hasBody(compoundStmt(has(atomicExpr(
+          has(ignoringParenImpCasts(
+              declRefExpr(hasDeclaration(varDecl(hasName("ptr"))),
+                          hasType(asString("int *"))))),
+          has(integerLiteral(equals(1), hasType(asString("int")))))))))));
+}
+
+TEST(ImportExpr, ImportLabelDeclAndAddrLabelExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport(
+          "void declToImport() { loop: goto loop; &&loop; }",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          functionDecl(
+            hasBody(
+              compoundStmt(
+                has(labelStmt(hasDeclaration(labelDecl(hasName("loop"))))),
+                has(addrLabelExpr(hasDeclaration(labelDecl(hasName("loop")))))
+                )))));
+}
+
+AST_MATCHER_P(TemplateDecl, hasTemplateDecl,
+              internal::Matcher<NamedDecl>, InnerMatcher) {
+  const NamedDecl *Template = Node.getTemplatedDecl();
+  return Template && InnerMatcher.matches(*Template, Finder, Builder);
+}
+
+TEST(ImportExpr, ImportParenListExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport(
+          "template<typename T> class dummy { void f() { dummy X(*this); } };"
+          "typedef dummy<int> declToImport;"
+          "template class dummy<int>;",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          typedefDecl(
+            hasType(
+              templateSpecializationType(
+                hasDeclaration(
+                  classTemplateDecl(
+                    hasTemplateDecl(
+                      cxxRecordDecl(
+                        hasMethod(
+                        allOf(
+                          hasName("f"),
+                          hasBody(
+                            compoundStmt(
+                              has(
+                                declStmt(
+                                  hasSingleDecl(
+                                    varDecl(
+                                      hasInitializer(
+                                        parenListExpr(
+                                          has(
+                                            unaryOperator(
+                                              hasOperatorName("*"),
+                                              hasUnaryOperand(cxxThisExpr())
+                                              )))))))))))))))))))));
+}
+
+TEST(ImportExpr, ImportStmtExpr) {
+  MatchVerifier<Decl> Verifier;
+  // NOTE: has() ignores implicit casts, using hasDescendant() to match it
+  EXPECT_TRUE(
+        testImport(
+          "void declToImport() { int b; int a = b ?: 1; int C = ({int X=4; X;}); }",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          functionDecl(
+            hasBody(
+              compoundStmt(
+                has(
+                  declStmt(
+                    hasSingleDecl(
+                      varDecl(
+                        hasName("C"),
+                        hasType(asString("int")),
+                        hasInitializer(
+                          stmtExpr(
+                            hasAnySubstatement(
+                              declStmt(
+                                hasSingleDecl(
+                                  varDecl(
+                                    hasName("X"),
+                                    hasType(asString("int")),
+                                    hasInitializer(
+                                      integerLiteral(equals(4))))))),
+                            hasDescendant(
+                              implicitCastExpr()
+                              ))))))))))));
+}
+
+TEST(ImportExpr, ImportConditionalOperator) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport(
+          "void declToImport() { true ? 1 : -5; }",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          functionDecl(
+            hasBody(
+              compoundStmt(
+                has(
+                  conditionalOperator(
+                    hasCondition(cxxBoolLiteral(equals(true))),
+                    hasTrueExpression(integerLiteral(equals(1))),
+                    hasFalseExpression(
+                      unaryOperator(hasUnaryOperand(integerLiteral(equals(5))))
+                      ))))))));
+}
+
+TEST(ImportExpr, ImportBinaryConditionalOperator) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport(
+          "void declToImport() { 1 ?: -5; }",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          functionDecl(
+            hasBody(
+              compoundStmt(
+                has(
+                  binaryConditionalOperator(
+                    hasCondition(
+                      implicitCastExpr(
+                        hasSourceExpression(
+                          opaqueValueExpr(
+                            hasSourceExpression(integerLiteral(equals(1))))),
+                        hasType(booleanType()))),
+                    hasTrueExpression(
+                      opaqueValueExpr(hasSourceExpression(
+                                        integerLiteral(equals(1))))),
+                    hasFalseExpression(
+                      unaryOperator(hasOperatorName("-"),
+                                    hasUnaryOperand(integerLiteral(equals(5)))))
+                      )))))));
+}
+
+TEST(ImportExpr, ImportDesignatedInitExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(testImport("void declToImport() {"
+                         "  struct point { double x; double y; };"
+                         "  struct point ptarray[10] = "
+                                "{ [2].y = 1.0, [2].x = 2.0, [0].x = 1.0 }; }",
+                         Lang_C, "", Lang_C, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 declStmt(
+                                   hasSingleDecl(
+                                     varDecl(
+                                       hasInitializer(
+                                         initListExpr(
+                                           hasSyntacticForm(
+                                             initListExpr(
+                                               has(
+                                                 designatedInitExpr(
+                                                   designatorCountIs(2),
+                                                   has(floatLiteral(
+                                                         equals(1.0))),
+                                                   has(integerLiteral(
+                                                         equals(2))))),
+                                               has(
+                                                 designatedInitExpr(
+                                                   designatorCountIs(2),
+                                                   has(floatLiteral(
+                                                         equals(2.0))),
+                                                   has(integerLiteral(
+                                                         equals(2))))),
+                                               has(
+                                                 designatedInitExpr(
+                                                   designatorCountIs(2),
+                                                   has(floatLiteral(
+                                                         equals(1.0))),
+                                                   has(integerLiteral(
+                                                         equals(0)))))
+                                               )))))))))))));
+}
+
+
+TEST(ImportExpr, ImportPredefinedExpr) {
+  MatchVerifier<Decl> Verifier;
+  // __func__ expands as StringLiteral("declToImport")
+  EXPECT_TRUE(testImport("void declToImport() { __func__; }",
+                         Lang_CXX, "", Lang_CXX, Verifier,
+                         functionDecl(
+                           hasBody(
+                             compoundStmt(
+                               has(
+                                 predefinedExpr(
+                                   hasType(
+                                     asString("const char [13]")),
+                                   has(
+                                     stringLiteral(
+                                       hasType(
+                                         asString("const char [13]")))))))))));
+}
+
+TEST(ImportExpr, ImportInitListExpr) {
+  MatchVerifier<Decl> Verifier;
+  EXPECT_TRUE(
+        testImport(
+          "void declToImport() {"
+          "  struct point { double x; double y; };"
+          "  point ptarray[10] = { [2].y = 1.0, [2].x = 2.0,"
+          "                        [0].x = 1.0 }; }",
+          Lang_CXX, "", Lang_CXX, Verifier,
+          functionDecl(
+            hasBody(
+              compoundStmt(
+                has(
+                  declStmt(
+                    hasSingleDecl(
+                      varDecl(
+                        hasInitializer(
+                          initListExpr(
+                            has(
+                              cxxConstructExpr(
+                                requiresZeroInitialization())),
+                            has(
+                              initListExpr(
+                                hasType(asString("struct point")),
+                                has(floatLiteral(equals(1.0))),
+                                has(implicitValueInitExpr(
+                                      hasType(asString("double")))))),
+                            has(
+                              initListExpr(
+                                hasType(asString("struct point")),
+                                has(floatLiteral(equals(2.0))),
+                                has(floatLiteral(equals(1.0)))))
+                              )))))))))));
+}
+
+
+} // end namespace ast_matchers
+} // end namespace clang
diff --git a/unittests/AST/ASTVectorTest.cpp b/unittests/AST/ASTVectorTest.cpp
index 55c06d0..359d2f4 100644
--- a/unittests/AST/ASTVectorTest.cpp
+++ b/unittests/AST/ASTVectorTest.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/Compiler.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTVector.h"
 #include "clang/Basic/Builtins.h"
diff --git a/unittests/AST/CMakeLists.txt b/unittests/AST/CMakeLists.txt
index 2fa1078..a7008f3 100644
--- a/unittests/AST/CMakeLists.txt
+++ b/unittests/AST/CMakeLists.txt
@@ -4,6 +4,7 @@
 
 add_clang_unittest(ASTTests
   ASTContextParentMapTest.cpp
+  ASTImporterTest.cpp
   ASTTypeTraitsTest.cpp
   ASTVectorTest.cpp
   CommentLexer.cpp
@@ -13,6 +14,7 @@
   EvaluateAsRValueTest.cpp
   ExternalASTSourceTest.cpp
   NamedDeclPrinterTest.cpp
+  PostOrderASTVisitor.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
   )
diff --git a/unittests/AST/CommentParser.cpp b/unittests/AST/CommentParser.cpp
index f6ef9b9..a185f73 100644
--- a/unittests/AST/CommentParser.cpp
+++ b/unittests/AST/CommentParser.cpp
@@ -20,7 +20,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Allocator.h"
 #include "gtest/gtest.h"
-#include <vector>
 
 using namespace llvm;
 using namespace clang;
diff --git a/unittests/AST/ExternalASTSourceTest.cpp b/unittests/AST/ExternalASTSourceTest.cpp
index 4f42dcf..4b3bb3e 100644
--- a/unittests/AST/ExternalASTSourceTest.cpp
+++ b/unittests/AST/ExternalASTSourceTest.cpp
@@ -17,6 +17,7 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "gtest/gtest.h"
 
 using namespace clang;
diff --git a/unittests/AST/Makefile b/unittests/AST/Makefile
deleted file mode 100644
index a306ac9..0000000
--- a/unittests/AST/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- unittests/AST/Makefile ------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = AST
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangTooling.a clangFrontend.a clangSerialization.a clangDriver.a \
-           clangRewrite.a clangRewriteFrontend.a \
-           clangParse.a clangSema.a clangAnalysis.a \
-           clangEdit.a clangAST.a clangASTMatchers.a clangLex.a \
-           clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/AST/MatchVerifier.h b/unittests/AST/MatchVerifier.h
index 3193247..74b9bdf 100644
--- a/unittests/AST/MatchVerifier.h
+++ b/unittests/AST/MatchVerifier.h
@@ -62,6 +62,9 @@
                                  std::vector<std::string>& Args,
                                  Language L);
 
+  template <typename MatcherType>
+  testing::AssertionResult match(const Decl *D, const MatcherType &AMatcher);
+
 protected:
   void run(const MatchFinder::MatchResult &Result) override;
   virtual void verify(const MatchFinder::MatchResult &Result,
@@ -127,6 +130,22 @@
   return testing::AssertionSuccess();
 }
 
+/// \brief Runs a matcher over a single declaration, and returns the result of
+/// the verifier for the matched node.
+template <typename NodeType> template <typename MatcherType>
+testing::AssertionResult MatchVerifier<NodeType>::match(
+    const Decl *D, const MatcherType &AMatcher) {
+  MatchFinder Finder;
+  Finder.addMatcher(AMatcher.bind(""), this);
+
+  setFailure("Could not find match");
+  Finder.match(*D, D->getASTContext());
+
+  if (!Verified)
+    return testing::AssertionFailure() << VerifyResult;
+  return testing::AssertionSuccess();
+}
+
 template <typename NodeType>
 void MatchVerifier<NodeType>::run(const MatchFinder::MatchResult &Result) {
   const NodeType *Node = Result.Nodes.getNodeAs<NodeType>("");
diff --git a/unittests/AST/PostOrderASTVisitor.cpp b/unittests/AST/PostOrderASTVisitor.cpp
new file mode 100644
index 0000000..012f63a
--- /dev/null
+++ b/unittests/AST/PostOrderASTVisitor.cpp
@@ -0,0 +1,123 @@
+//===- unittests/AST/PostOrderASTVisitor.cpp - Post-order traversal tests -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains tests for the post-order traversing functionality
+// of RecursiveASTVisitor.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+
+using namespace clang;
+
+namespace {
+  // Test visitor that appends one string per visited node to VisitedNodes.
+  class RecordingVisitor
+    : public RecursiveASTVisitor<RecordingVisitor> {
+
+    bool VisitPostOrder; // Value returned by shouldTraversePostOrder().
+  public:
+    explicit RecordingVisitor(bool VisitPostOrder)
+      : VisitPostOrder(VisitPostOrder) {
+    }
+
+    // List of visited nodes during traversal.
+    std::vector<std::string> VisitedNodes;
+
+    bool shouldTraversePostOrder() const { return VisitPostOrder; }
+
+    bool VisitBinaryOperator(BinaryOperator *Op) {
+      VisitedNodes.push_back(Op->getOpcodeStr());
+      return true;
+    }
+
+    bool VisitIntegerLiteral(IntegerLiteral *Lit) {
+      VisitedNodes.push_back(Lit->getValue().toString(10, false));
+      return true;
+    }
+
+    bool VisitVarDecl(VarDecl* D) {
+      VisitedNodes.push_back(D->getNameAsString());
+      return true;
+    }
+
+    bool VisitCXXMethodDecl(CXXMethodDecl *D) {
+      VisitedNodes.push_back(D->getQualifiedNameAsString());
+      return true;
+    }
+
+    bool VisitReturnStmt(ReturnStmt *S) {
+      VisitedNodes.push_back("return");
+      return true;
+    }
+
+    bool VisitCXXRecordDecl(CXXRecordDecl *Declaration) {
+      VisitedNodes.push_back(Declaration->getQualifiedNameAsString());
+      return true;
+    }
+
+    bool VisitTemplateTypeParmType(TemplateTypeParmType *T) {
+      VisitedNodes.push_back(T->getDecl()->getQualifiedNameAsString());
+      return true;
+    }
+  };
+
+} // end anonymous namespace
+
+// Checks that with post-order traversal enabled the recorded nodes appear
+// children-first.
+TEST(RecursiveASTVisitor, PostOrderTraversal) {
+  auto ASTUnit = tooling::buildASTFromCode(
+    "class A {"
+    "  class B {"
+    "    int foo() { while(4) { int i = 9; } return (1 + 3) + 2; }"
+    "  };"
+    "};"
+  );
+  // buildASTFromCode() may return null on failure; fail instead of crashing.
+  ASSERT_TRUE(ASTUnit.get());
+  auto TU = ASTUnit->getASTContext().getTranslationUnitDecl();
+  // Traverse the translation unit and record every visited node.
+  RecordingVisitor Visitor(true);
+  Visitor.TraverseTranslationUnitDecl(TU);
+
+  // Children come before their parents, e.g. "1" and "3" precede their "+".
+  std::vector<std::string> expected = {
+    "4", "9", "i", "1", "3", "+", "2", "+", "return", "A::B::foo", "A::B", "A"
+  };
+  // Compare the whole sequences at once so that a failure reports both lists.
+  ASSERT_EQ(expected, Visitor.VisitedNodes);
+}
+
+// Checks that with post-order traversal disabled the recorded nodes appear
+// parents-first.
+TEST(RecursiveASTVisitor, NoPostOrderTraversal) {
+  auto ASTUnit = tooling::buildASTFromCode(
+    "class A {"
+    "  class B {"
+    "    int foo() { return 1 + 2; }"
+    "  };"
+    "};"
+  );
+  // buildASTFromCode() may return null on failure; fail instead of crashing.
+  ASSERT_TRUE(ASTUnit.get());
+  auto TU = ASTUnit->getASTContext().getTranslationUnitDecl();
+  // Traverse the translation unit and record every visited node.
+  RecordingVisitor Visitor(false);
+  Visitor.TraverseTranslationUnitDecl(TU);
+
+  // Parents come before their children, e.g. "+" precedes "1" and "2".
+  std::vector<std::string> expected = {
+    "A", "A::B", "A::B::foo", "return", "+", "1", "2"
+  };
+  // Compare the whole sequences at once so that a failure reports both lists.
+  ASSERT_EQ(expected, Visitor.VisitedNodes);
+}
diff --git a/unittests/ASTMatchers/ASTMatchersInternalTest.cpp b/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
new file mode 100644
index 0000000..c12056f
--- /dev/null
+++ b/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
@@ -0,0 +1,240 @@
+// unittests/ASTMatchers/ASTMatchersInternalTest.cpp - AST matcher unit tests //
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ASTMatchersTest.h"
+#include "clang/AST/PrettyPrinter.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Host.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace ast_matchers {
+
+#if GTEST_HAS_DEATH_TEST
+TEST(HasNameDeathTest, DiesOnEmptyName) {
+  ASSERT_DEBUG_DEATH({
+    DeclarationMatcher HasEmptyName = recordDecl(hasName(""));
+    EXPECT_TRUE(notMatches("class X {};", HasEmptyName));
+  }, "");
+}
+
+TEST(HasNameDeathTest, DiesOnEmptyPattern) {
+  ASSERT_DEBUG_DEATH({
+      DeclarationMatcher HasEmptyName = recordDecl(matchesName(""));
+      EXPECT_TRUE(notMatches("class X {};", HasEmptyName));
+    }, "");
+}
+
+TEST(IsDerivedFromDeathTest, DiesOnEmptyBaseName) {
+  ASSERT_DEBUG_DEATH({
+    DeclarationMatcher IsDerivedFromEmpty = cxxRecordDecl(isDerivedFrom(""));
+    EXPECT_TRUE(notMatches("class X {};", IsDerivedFromEmpty));
+  }, "");
+}
+#endif
+
+TEST(ConstructVariadic, MismatchedTypes_Regression) {
+  EXPECT_TRUE(
+      matches("const int a = 0;",
+              internal::DynTypedMatcher::constructVariadic(
+                  internal::DynTypedMatcher::VO_AnyOf,
+                  ast_type_traits::ASTNodeKind::getFromNodeKind<QualType>(),
+                  {isConstQualified(), arrayType()})
+                  .convertTo<QualType>()));
+}
+
+// For testing AST_MATCHER_P().
+AST_MATCHER_P(Decl, just, internal::Matcher<Decl>, AMatcher) {
+  // Make sure all special variables are used: Node, Finder, Builder, and the
+  // parameter named 'AMatcher'.
+  return AMatcher.matches(Node, Finder, Builder);
+}
+
+TEST(AstMatcherPMacro, Works) {
+  DeclarationMatcher HasClassB = just(has(recordDecl(hasName("B")).bind("b")));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue("class A { class B {}; };",
+      HasClassB, llvm::make_unique<VerifyIdIsBoundTo<Decl>>("b")));
+
+  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class B {}; };",
+      HasClassB, llvm::make_unique<VerifyIdIsBoundTo<Decl>>("a")));
+
+  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class C {}; };",
+      HasClassB, llvm::make_unique<VerifyIdIsBoundTo<Decl>>("b")));
+}
+
+AST_POLYMORPHIC_MATCHER_P(polymorphicHas,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(Decl, Stmt),
+                          internal::Matcher<Decl>, AMatcher) {
+  return Finder->matchesChildOf(
+      Node, AMatcher, Builder,
+      ASTMatchFinder::TK_IgnoreImplicitCastsAndParentheses,
+      ASTMatchFinder::BK_First);
+}
+
+TEST(AstPolymorphicMatcherPMacro, Works) {
+  DeclarationMatcher HasClassB =
+      polymorphicHas(recordDecl(hasName("B")).bind("b"));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue("class A { class B {}; };",
+      HasClassB, llvm::make_unique<VerifyIdIsBoundTo<Decl>>("b")));
+
+  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class B {}; };",
+      HasClassB, llvm::make_unique<VerifyIdIsBoundTo<Decl>>("a")));
+
+  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class C {}; };",
+      HasClassB, llvm::make_unique<VerifyIdIsBoundTo<Decl>>("b")));
+
+  StatementMatcher StatementHasClassB =
+      polymorphicHas(recordDecl(hasName("B")));
+
+  EXPECT_TRUE(matches("void x() { class B {}; }", StatementHasClassB));
+}
+
+TEST(MatchFinder, CheckProfiling) {
+  MatchFinder::MatchFinderOptions Options;
+  llvm::StringMap<llvm::TimeRecord> Records;
+  Options.CheckProfiling.emplace(Records);
+  MatchFinder Finder(std::move(Options));
+
+  struct NamedCallback : public MatchFinder::MatchCallback {
+    void run(const MatchFinder::MatchResult &Result) override {}
+    StringRef getID() const override { return "MyID"; }
+  } Callback;
+  Finder.addMatcher(decl(), &Callback);
+  std::unique_ptr<FrontendActionFactory> Factory(
+      newFrontendActionFactory(&Finder));
+  ASSERT_TRUE(tooling::runToolOnCode(Factory->create(), "int x;"));
+
+  EXPECT_EQ(1u, Records.size());
+  EXPECT_EQ("MyID", Records.begin()->getKey());
+}
+
+class VerifyStartOfTranslationUnit : public MatchFinder::MatchCallback {
+public:
+  VerifyStartOfTranslationUnit() : Called(false) {}
+  void run(const MatchFinder::MatchResult &Result) override {
+    EXPECT_TRUE(Called);
+  }
+  void onStartOfTranslationUnit() override { Called = true; }
+  bool Called;
+};
+
+TEST(MatchFinder, InterceptsStartOfTranslationUnit) {
+  MatchFinder Finder;
+  VerifyStartOfTranslationUnit VerifyCallback;
+  Finder.addMatcher(decl(), &VerifyCallback);
+  std::unique_ptr<FrontendActionFactory> Factory(
+      newFrontendActionFactory(&Finder));
+  ASSERT_TRUE(tooling::runToolOnCode(Factory->create(), "int x;"));
+  EXPECT_TRUE(VerifyCallback.Called);
+
+  VerifyCallback.Called = false;
+  std::unique_ptr<ASTUnit> AST(tooling::buildASTFromCode("int x;"));
+  ASSERT_TRUE(AST.get());
+  Finder.matchAST(AST->getASTContext());
+  EXPECT_TRUE(VerifyCallback.Called);
+}
+
+class VerifyEndOfTranslationUnit : public MatchFinder::MatchCallback {
+public:
+  VerifyEndOfTranslationUnit() : Called(false) {}
+  void run(const MatchFinder::MatchResult &Result) override {
+    EXPECT_FALSE(Called);
+  }
+  void onEndOfTranslationUnit() override { Called = true; }
+  bool Called;
+};
+
+TEST(MatchFinder, InterceptsEndOfTranslationUnit) {
+  MatchFinder Finder;
+  VerifyEndOfTranslationUnit VerifyCallback;
+  Finder.addMatcher(decl(), &VerifyCallback);
+  std::unique_ptr<FrontendActionFactory> Factory(
+      newFrontendActionFactory(&Finder));
+  ASSERT_TRUE(tooling::runToolOnCode(Factory->create(), "int x;"));
+  EXPECT_TRUE(VerifyCallback.Called);
+
+  VerifyCallback.Called = false;
+  std::unique_ptr<ASTUnit> AST(tooling::buildASTFromCode("int x;"));
+  ASSERT_TRUE(AST.get());
+  Finder.matchAST(AST->getASTContext());
+  EXPECT_TRUE(VerifyCallback.Called);
+}
+
+TEST(Matcher, matchOverEntireASTContext) {
+  std::unique_ptr<ASTUnit> AST =
+      clang::tooling::buildASTFromCode("struct { int *foo; };");
+  ASSERT_TRUE(AST.get());
+  auto PT = selectFirst<PointerType>(
+      "x", match(pointerType().bind("x"), AST->getASTContext()));
+  EXPECT_NE(nullptr, PT);
+}
+
+TEST(IsInlineMatcher, IsInline) {
+  EXPECT_TRUE(matches("void g(); inline void f();",
+                      functionDecl(isInline(), hasName("f"))));
+  EXPECT_TRUE(matches("namespace n { inline namespace m {} }",
+                      namespaceDecl(isInline(), hasName("m"))));
+}
+
+// FIXME: Figure out how to specify paths so the following tests pass on
+// Windows.
+#ifndef LLVM_ON_WIN32
+
+TEST(Matcher, IsExpansionInMainFileMatcher) {
+  EXPECT_TRUE(matches("class X {};",
+                      recordDecl(hasName("X"), isExpansionInMainFile())));
+  EXPECT_TRUE(notMatches("", recordDecl(isExpansionInMainFile())));
+  FileContentMappings M;
+  M.push_back(std::make_pair("/other", "class X {};"));
+  EXPECT_TRUE(matchesConditionally("#include <other>\n",
+                                   recordDecl(isExpansionInMainFile()), false,
+                                   "-isystem/", M));
+}
+
+TEST(Matcher, IsExpansionInSystemHeader) {
+  FileContentMappings M;
+  M.push_back(std::make_pair("/other", "class X {};"));
+  EXPECT_TRUE(matchesConditionally(
+      "#include \"other\"\n", recordDecl(isExpansionInSystemHeader()), true,
+      "-isystem/", M));
+  EXPECT_TRUE(matchesConditionally("#include \"other\"\n",
+                                   recordDecl(isExpansionInSystemHeader()),
+                                   false, "-I/", M));
+  EXPECT_TRUE(notMatches("class X {};",
+                         recordDecl(isExpansionInSystemHeader())));
+  EXPECT_TRUE(notMatches("", recordDecl(isExpansionInSystemHeader())));
+}
+
+TEST(Matcher, IsExpansionInFileMatching) {
+  FileContentMappings M;
+  M.push_back(std::make_pair("/foo", "class A {};"));
+  M.push_back(std::make_pair("/bar", "class B {};"));
+  EXPECT_TRUE(matchesConditionally(
+      "#include <foo>\n"
+      "#include <bar>\n"
+      "class X {};",
+      recordDecl(isExpansionInFileMatching("b.*"), hasName("B")), true,
+      "-isystem/", M));
+  EXPECT_TRUE(matchesConditionally(
+      "#include <foo>\n"
+      "#include <bar>\n"
+      "class X {};",
+      recordDecl(isExpansionInFileMatching("f.*"), hasName("X")), false,
+      "-isystem/", M));
+}
+
+#endif // LLVM_ON_WIN32
+
+} // end namespace ast_matchers
+} // end namespace clang
diff --git a/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
new file mode 100644
index 0000000..108fd43
--- /dev/null
+++ b/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -0,0 +1,1941 @@
+// unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp - AST matcher unit tests//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ASTMatchersTest.h"
+#include "clang/AST/PrettyPrinter.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Host.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace ast_matchers {
+
+
+TEST(AllOf, AllOverloadsWork) {
+  const char Program[] =
+      "struct T { };"
+      "int f(int, T*, int, int);"
+      "void g(int x) { T t; f(x, &t, 3, 4); }";
+  EXPECT_TRUE(matches(Program,
+      callExpr(allOf(callee(functionDecl(hasName("f"))),
+                     hasArgument(0, declRefExpr(to(varDecl())))))));
+  EXPECT_TRUE(matches(Program,
+      callExpr(allOf(callee(functionDecl(hasName("f"))),
+                     hasArgument(0, declRefExpr(to(varDecl()))),
+                     hasArgument(1, hasType(pointsTo(
+                                        recordDecl(hasName("T")))))))));
+  EXPECT_TRUE(matches(Program,
+      callExpr(allOf(callee(functionDecl(hasName("f"))),
+                     hasArgument(0, declRefExpr(to(varDecl()))),
+                     hasArgument(1, hasType(pointsTo(
+                                        recordDecl(hasName("T"))))),
+                     hasArgument(2, integerLiteral(equals(3)))))));
+  EXPECT_TRUE(matches(Program,
+      callExpr(allOf(callee(functionDecl(hasName("f"))),
+                     hasArgument(0, declRefExpr(to(varDecl()))),
+                     hasArgument(1, hasType(pointsTo(
+                                        recordDecl(hasName("T"))))),
+                     hasArgument(2, integerLiteral(equals(3))),
+                     hasArgument(3, integerLiteral(equals(4)))))));
+}
+
+TEST(DeclarationMatcher, MatchHas) {
+  DeclarationMatcher HasClassX = recordDecl(has(recordDecl(hasName("X"))));
+  EXPECT_TRUE(matches("class Y { class X {}; };", HasClassX));
+  EXPECT_TRUE(matches("class X {};", HasClassX));
+
+  DeclarationMatcher YHasClassX =
+    recordDecl(hasName("Y"), has(recordDecl(hasName("X"))));
+  EXPECT_TRUE(matches("class Y { class X {}; };", YHasClassX));
+  EXPECT_TRUE(notMatches("class X {};", YHasClassX));
+  EXPECT_TRUE(
+    notMatches("class Y { class Z { class X {}; }; };", YHasClassX));
+}
+
+TEST(DeclarationMatcher, MatchHasRecursiveAllOf) {
+  DeclarationMatcher Recursive =
+    recordDecl(
+      has(recordDecl(
+        has(recordDecl(hasName("X"))),
+        has(recordDecl(hasName("Y"))),
+        hasName("Z"))),
+      has(recordDecl(
+        has(recordDecl(hasName("A"))),
+        has(recordDecl(hasName("B"))),
+        hasName("C"))),
+      hasName("F"));
+
+  EXPECT_TRUE(matches(
+    "class F {"
+      "  class Z {"
+      "    class X {};"
+      "    class Y {};"
+      "  };"
+      "  class C {"
+      "    class A {};"
+      "    class B {};"
+      "  };"
+      "};", Recursive));
+
+  EXPECT_TRUE(matches(
+    "class F {"
+      "  class Z {"
+      "    class A {};"
+      "    class X {};"
+      "    class Y {};"
+      "  };"
+      "  class C {"
+      "    class X {};"
+      "    class A {};"
+      "    class B {};"
+      "  };"
+      "};", Recursive));
+
+  EXPECT_TRUE(matches(
+    "class O1 {"
+      "  class O2 {"
+      "    class F {"
+      "      class Z {"
+      "        class A {};"
+      "        class X {};"
+      "        class Y {};"
+      "      };"
+      "      class C {"
+      "        class X {};"
+      "        class A {};"
+      "        class B {};"
+      "      };"
+      "    };"
+      "  };"
+      "};", Recursive));
+}
+
+TEST(DeclarationMatcher, MatchHasRecursiveAnyOf) {
+  DeclarationMatcher Recursive =
+    recordDecl(
+      anyOf(
+        has(recordDecl(
+          anyOf(
+            has(recordDecl(
+              hasName("X"))),
+            has(recordDecl(
+              hasName("Y"))),
+            hasName("Z")))),
+        has(recordDecl(
+          anyOf(
+            hasName("C"),
+            has(recordDecl(
+              hasName("A"))),
+            has(recordDecl(
+              hasName("B")))))),
+        hasName("F")));
+
+  EXPECT_TRUE(matches("class F {};", Recursive));
+  EXPECT_TRUE(matches("class Z {};", Recursive));
+  EXPECT_TRUE(matches("class C {};", Recursive));
+  EXPECT_TRUE(matches("class M { class N { class X {}; }; };", Recursive));
+  EXPECT_TRUE(matches("class M { class N { class B {}; }; };", Recursive));
+  EXPECT_TRUE(
+    matches("class O1 { class O2 {"
+              "  class M { class N { class B {}; }; }; "
+              "}; };", Recursive));
+}
+
+TEST(DeclarationMatcher, MatchNot) {
+  DeclarationMatcher NotClassX =
+    cxxRecordDecl(
+      isDerivedFrom("Y"),
+      unless(hasName("X")));
+  EXPECT_TRUE(notMatches("", NotClassX));
+  EXPECT_TRUE(notMatches("class Y {};", NotClassX));
+  EXPECT_TRUE(matches("class Y {}; class Z : public Y {};", NotClassX));
+  EXPECT_TRUE(notMatches("class Y {}; class X : public Y {};", NotClassX));
+  EXPECT_TRUE(
+    notMatches("class Y {}; class Z {}; class X : public Y {};",
+               NotClassX));
+
+  DeclarationMatcher ClassXHasNotClassY =
+    recordDecl(
+      hasName("X"),
+      has(recordDecl(hasName("Z"))),
+      unless(
+        has(recordDecl(hasName("Y")))));
+  EXPECT_TRUE(matches("class X { class Z {}; };", ClassXHasNotClassY));
+  EXPECT_TRUE(notMatches("class X { class Y {}; class Z {}; };",
+                         ClassXHasNotClassY));
+
+  DeclarationMatcher NamedNotRecord =
+    namedDecl(hasName("Foo"), unless(recordDecl()));
+  EXPECT_TRUE(matches("void Foo(){}", NamedNotRecord));
+  EXPECT_TRUE(notMatches("struct Foo {};", NamedNotRecord));
+}
+
+TEST(CastExpression, HasCastKind) {
+  EXPECT_TRUE(matches("char *p = 0;",
+              castExpr(hasCastKind(CK_NullToPointer))));
+  EXPECT_TRUE(notMatches("char *p = 0;",
+              castExpr(hasCastKind(CK_DerivedToBase))));
+  EXPECT_TRUE(matches("char *p = 0;",
+              implicitCastExpr(hasCastKind(CK_NullToPointer))));
+}
+
+TEST(DeclarationMatcher, HasDescendant) {
+  DeclarationMatcher ZDescendantClassX =
+    recordDecl(
+      hasDescendant(recordDecl(hasName("X"))),
+      hasName("Z"));
+  EXPECT_TRUE(matches("class Z { class X {}; };", ZDescendantClassX));
+  EXPECT_TRUE(
+    matches("class Z { class Y { class X {}; }; };", ZDescendantClassX));
+  EXPECT_TRUE(
+    matches("class Z { class A { class Y { class X {}; }; }; };",
+            ZDescendantClassX));
+  EXPECT_TRUE(
+    matches("class Z { class A { class B { class Y { class X {}; }; }; }; };",
+            ZDescendantClassX));
+  EXPECT_TRUE(notMatches("class Z {};", ZDescendantClassX));
+
+  DeclarationMatcher ZDescendantClassXHasClassY =
+    recordDecl(
+      hasDescendant(recordDecl(has(recordDecl(hasName("Y"))),
+                               hasName("X"))),
+      hasName("Z"));
+  EXPECT_TRUE(matches("class Z { class X { class Y {}; }; };",
+                      ZDescendantClassXHasClassY));
+  EXPECT_TRUE(
+    matches("class Z { class A { class B { class X { class Y {}; }; }; }; };",
+            ZDescendantClassXHasClassY));
+  EXPECT_TRUE(notMatches(
+    "class Z {"
+      "  class A {"
+      "    class B {"
+      "      class X {"
+      "        class C {"
+      "          class Y {};"
+      "        };"
+      "      };"
+      "    }; "
+      "  };"
+      "};", ZDescendantClassXHasClassY));
+
+  DeclarationMatcher ZDescendantClassXDescendantClassY =
+    recordDecl(
+      hasDescendant(recordDecl(hasDescendant(recordDecl(hasName("Y"))),
+                               hasName("X"))),
+      hasName("Z"));
+  EXPECT_TRUE(
+    matches("class Z { class A { class X { class B { class Y {}; }; }; }; };",
+            ZDescendantClassXDescendantClassY));
+  EXPECT_TRUE(matches(
+    "class Z {"
+      "  class A {"
+      "    class X {"
+      "      class B {"
+      "        class Y {};"
+      "      };"
+      "      class Y {};"
+      "    };"
+      "  };"
+      "};", ZDescendantClassXDescendantClassY));
+}
+
+TEST(DeclarationMatcher, HasDescendantMemoization) {
+  DeclarationMatcher CannotMemoize =
+    decl(hasDescendant(typeLoc().bind("x")), has(decl()));
+  EXPECT_TRUE(matches("void f() { int i; }", CannotMemoize));
+}
+
+TEST(DeclarationMatcher, HasDescendantMemoizationUsesRestrictKind) {
+  auto Name = hasName("i");
+  auto VD = internal::Matcher<VarDecl>(Name).dynCastTo<Decl>();
+  auto RD = internal::Matcher<RecordDecl>(Name).dynCastTo<Decl>();
+  // Matching VD first should not make a cache hit for RD.
+  EXPECT_TRUE(notMatches("void f() { int i; }",
+                         decl(hasDescendant(VD), hasDescendant(RD))));
+  EXPECT_TRUE(notMatches("void f() { int i; }",
+                         decl(hasDescendant(RD), hasDescendant(VD))));
+  // Not matching RD first should not make a cache hit for VD either.
+  EXPECT_TRUE(matches("void f() { int i; }",
+                      decl(anyOf(hasDescendant(RD), hasDescendant(VD)))));
+}
+
+TEST(DeclarationMatcher, HasAncestorMemoization) {
+  // This triggers a hasAncestor with a TemplateArgument in the bound nodes.
+  // That node can't be memoized, so we have to check for it before trying to
+  // put it in the cache.
+  DeclarationMatcher CannotMemoize = classTemplateSpecializationDecl(
+    hasAnyTemplateArgument(templateArgument().bind("targ")),
+    forEach(fieldDecl(hasAncestor(forStmt()))));
+
+  EXPECT_TRUE(notMatches("template <typename T> struct S;"
+                           "template <> struct S<int>{ int i; int j; };",
+                         CannotMemoize));
+}
+
+TEST(DeclarationMatcher, HasAttr) { // hasAttr(): present vs. absent attribute
+  EXPECT_TRUE(matches("struct __attribute__((warn_unused)) X {};",
+                      decl(hasAttr(clang::attr::WarnUnused))));
+  EXPECT_TRUE(notMatches("struct X {};", // notMatches, as in sibling tests
+                         decl(hasAttr(clang::attr::WarnUnused))));
+}
+
+
+TEST(DeclarationMatcher, MatchAnyOf) {
+  DeclarationMatcher YOrZDerivedFromX = cxxRecordDecl(
+    anyOf(hasName("Y"), allOf(isDerivedFrom("X"), hasName("Z"))));
+  EXPECT_TRUE(matches("class X {}; class Z : public X {};", YOrZDerivedFromX));
+  EXPECT_TRUE(matches("class Y {};", YOrZDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("class X {}; class W : public X {};", YOrZDerivedFromX));
+  EXPECT_TRUE(notMatches("class Z {};", YOrZDerivedFromX));
+
+  DeclarationMatcher XOrYOrZOrU =
+    recordDecl(anyOf(hasName("X"), hasName("Y"), hasName("Z"), hasName("U")));
+  EXPECT_TRUE(matches("class X {};", XOrYOrZOrU));
+  EXPECT_TRUE(notMatches("class V {};", XOrYOrZOrU));
+
+  DeclarationMatcher XOrYOrZOrUOrV =
+    recordDecl(anyOf(hasName("X"), hasName("Y"), hasName("Z"), hasName("U"),
+                     hasName("V")));
+  EXPECT_TRUE(matches("class X {};", XOrYOrZOrUOrV));
+  EXPECT_TRUE(matches("class Y {};", XOrYOrZOrUOrV));
+  EXPECT_TRUE(matches("class Z {};", XOrYOrZOrUOrV));
+  EXPECT_TRUE(matches("class U {};", XOrYOrZOrUOrV));
+  EXPECT_TRUE(matches("class V {};", XOrYOrZOrUOrV));
+  EXPECT_TRUE(notMatches("class A {};", XOrYOrZOrUOrV));
+
+  StatementMatcher MixedTypes = stmt(anyOf(ifStmt(), binaryOperator()));
+  EXPECT_TRUE(matches("int F() { return 1 + 2; }", MixedTypes));
+  EXPECT_TRUE(matches("int F() { if (true) return 1; }", MixedTypes));
+  EXPECT_TRUE(notMatches("int F() { return 1; }", MixedTypes));
+
+  EXPECT_TRUE(
+    matches("void f() try { } catch (int) { } catch (...) { }",
+            cxxCatchStmt(anyOf(hasDescendant(varDecl()), isCatchAll()))));
+}
+
+TEST(DeclarationMatcher, ClassIsDerived) {
+  DeclarationMatcher IsDerivedFromX = cxxRecordDecl(isDerivedFrom("X"));
+
+  EXPECT_TRUE(matches("class X {}; class Y : public X {};", IsDerivedFromX));
+  EXPECT_TRUE(notMatches("class X {};", IsDerivedFromX));
+  EXPECT_TRUE(notMatches("class X;", IsDerivedFromX));
+  EXPECT_TRUE(notMatches("class Y;", IsDerivedFromX));
+  EXPECT_TRUE(notMatches("", IsDerivedFromX));
+
+  DeclarationMatcher IsAX = cxxRecordDecl(isSameOrDerivedFrom("X"));
+
+  EXPECT_TRUE(matches("class X {}; class Y : public X {};", IsAX));
+  EXPECT_TRUE(matches("class X {};", IsAX));
+  EXPECT_TRUE(matches("class X;", IsAX));
+  EXPECT_TRUE(notMatches("class Y;", IsAX));
+  EXPECT_TRUE(notMatches("", IsAX));
+
+  DeclarationMatcher ZIsDerivedFromX =
+    cxxRecordDecl(hasName("Z"), isDerivedFrom("X"));
+  EXPECT_TRUE(
+    matches("class X {}; class Y : public X {}; class Z : public Y {};",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class X {};"
+              "template<class T> class Y : public X {};"
+              "class Z : public Y<int> {};", ZIsDerivedFromX));
+  EXPECT_TRUE(matches("class X {}; template<class T> class Z : public X {};",
+                      ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class T> class X {}; "
+              "template<class T> class Z : public X<T> {};",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class T, class U=T> class X {}; "
+              "template<class T> class Z : public X<T> {};",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<class X> class A { class Z : public X {}; };",
+               ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class X> class A { public: class Z : public X {}; }; "
+              "class X{}; void y() { A<X>::Z z; }", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template <class T> class X {}; "
+              "template<class Y> class A { class Z : public X<Y> {}; };",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<template<class T> class X> class A { "
+                 "  class Z : public X<int> {}; };", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<template<class T> class X> class A { "
+              "  public: class Z : public X<int> {}; }; "
+              "template<class T> class X {}; void y() { A<X>::Z z; }",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<class X> class A { class Z : public X::D {}; };",
+               ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class X> class A { public: "
+              "  class Z : public X::D {}; }; "
+              "class Y { public: class X {}; typedef X D; }; "
+              "void y() { A<Y>::Z z; }", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class X {}; typedef X Y; class Z : public Y {};",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class T> class Y { typedef typename T::U X; "
+              "  class Z : public X {}; };", ZIsDerivedFromX));
+  EXPECT_TRUE(matches("class X {}; class Z : public ::X {};",
+                      ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<class T> class X {}; "
+                 "template<class T> class A { class Z : public X<T>::D {}; };",
+               ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class T> class X { public: typedef X<T> D; }; "
+              "template<class T> class A { public: "
+              "  class Z : public X<T>::D {}; }; void y() { A<int>::Z z; }",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<class X> class A { class Z : public X::D::E {}; };",
+               ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class X {}; typedef X V; typedef V W; class Z : public W {};",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class X {}; class Y : public X {}; "
+              "typedef Y V; typedef V W; class Z : public W {};",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template<class T, class U> class X {}; "
+              "template<class T> class A { class Z : public X<T, int> {}; };",
+            ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<class X> class D { typedef X A; typedef A B; "
+                 "  typedef B C; class Z : public C {}; };",
+               ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class X {}; typedef X A; typedef A B; "
+              "class Z : public B {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class X {}; typedef X A; typedef A B; typedef B C; "
+              "class Z : public C {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class U {}; typedef U X; typedef X V; "
+              "class Z : public V {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class Base {}; typedef Base X; "
+              "class Z : public Base {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class Base {}; typedef Base Base2; typedef Base2 X; "
+              "class Z : public Base {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("class Base {}; class Base2 {}; typedef Base2 X; "
+                 "class Z : public Base {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    matches("class A {}; typedef A X; typedef A Y; "
+              "class Z : public Y {};", ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template <typename T> class Z;"
+                 "template <> class Z<void> {};"
+                 "template <typename T> class Z : public Z<void> {};",
+               IsDerivedFromX));
+  EXPECT_TRUE(
+    matches("template <typename T> class X;"
+              "template <> class X<void> {};"
+              "template <typename T> class X : public X<void> {};",
+            IsDerivedFromX));
+  EXPECT_TRUE(matches(
+    "class X {};"
+      "template <typename T> class Z;"
+      "template <> class Z<void> {};"
+      "template <typename T> class Z : public Z<void>, public X {};",
+    ZIsDerivedFromX));
+  EXPECT_TRUE(
+    notMatches("template<int> struct X;"
+                 "template<int i> struct X : public X<i-1> {};",
+               cxxRecordDecl(isDerivedFrom(recordDecl(hasName("Some"))))));
+  EXPECT_TRUE(matches(
+    "struct A {};"
+      "template<int> struct X;"
+      "template<int i> struct X : public X<i-1> {};"
+      "template<> struct X<0> : public A {};"
+      "struct B : public X<42> {};",
+    cxxRecordDecl(hasName("B"), isDerivedFrom(recordDecl(hasName("A"))))));
+
+  // FIXME: Once we have better matchers for template type matching,
+  // get rid of the varDecl(...) matching and match the right template
+  // declarations directly.
+  const char *RecursiveTemplateOneParameter =
+    "class Base1 {}; class Base2 {};"
+      "template <typename T> class Z;"
+      "template <> class Z<void> : public Base1 {};"
+      "template <> class Z<int> : public Base2 {};"
+      "template <> class Z<float> : public Z<void> {};"
+      "template <> class Z<double> : public Z<int> {};"
+      "template <typename T> class Z : public Z<float>, public Z<double> {};"
+      "void f() { Z<float> z_float; Z<double> z_double; Z<char> z_char; }";
+  EXPECT_TRUE(matches(
+    RecursiveTemplateOneParameter,
+    varDecl(hasName("z_float"),
+            hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1")))))));
+  EXPECT_TRUE(notMatches(
+    RecursiveTemplateOneParameter,
+    varDecl(hasName("z_float"),
+            hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base2")))))));
+  EXPECT_TRUE(matches(
+    RecursiveTemplateOneParameter,
+    varDecl(hasName("z_char"),
+            hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1"),
+                                                 isDerivedFrom("Base2")))))));
+
+  const char *RecursiveTemplateTwoParameters =
+    "class Base1 {}; class Base2 {};"
+      "template <typename T1, typename T2> class Z;"
+      "template <typename T> class Z<void, T> : public Base1 {};"
+      "template <typename T> class Z<int, T> : public Base2 {};"
+      "template <typename T> class Z<float, T> : public Z<void, T> {};"
+      "template <typename T> class Z<double, T> : public Z<int, T> {};"
+      "template <typename T1, typename T2> class Z : "
+      "    public Z<float, T2>, public Z<double, T2> {};"
+      "void f() { Z<float, void> z_float; Z<double, void> z_double; "
+      "           Z<char, void> z_char; }";
+  EXPECT_TRUE(matches(
+    RecursiveTemplateTwoParameters,
+    varDecl(hasName("z_float"),
+            hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1")))))));
+  EXPECT_TRUE(notMatches(
+    RecursiveTemplateTwoParameters,
+    varDecl(hasName("z_float"),
+            hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base2")))))));
+  EXPECT_TRUE(matches(
+    RecursiveTemplateTwoParameters,
+    varDecl(hasName("z_char"),
+            hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1"),
+                                                 isDerivedFrom("Base2")))))));
+  EXPECT_TRUE(matches(
+    "namespace ns { class X {}; class Y : public X {}; }",
+    cxxRecordDecl(isDerivedFrom("::ns::X"))));
+  EXPECT_TRUE(notMatches(
+    "class X {}; class Y : public X {};",
+    cxxRecordDecl(isDerivedFrom("::ns::X"))));
+
+  EXPECT_TRUE(matches(
+    "class X {}; class Y : public X {};",
+    cxxRecordDecl(isDerivedFrom(recordDecl(hasName("X")).bind("test")))));
+
+  EXPECT_TRUE(matches(
+    "template<typename T> class X {};"
+      "template<typename T> using Z = X<T>;"
+      "template <typename T> class Y : Z<T> {};",
+    cxxRecordDecl(isDerivedFrom(namedDecl(hasName("X"))))));
+}
+
+TEST(DeclarationMatcher, IsLambda) {
+  const auto IsLambda = cxxMethodDecl(ofClass(cxxRecordDecl(isLambda())));
+  EXPECT_TRUE(matches("auto x = []{};", IsLambda));
+  EXPECT_TRUE(notMatches("struct S { void operator()() const; };", IsLambda));
+}
+
+TEST(Matcher, BindMatchedNodes) {
+  // The record is bound as "x": the first check looks that id up, the second
+  // asks for an id that was never bound and expects verification to fail.
+  DeclarationMatcher ClassX = has(recordDecl(hasName("::X")).bind("x"));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class X {};", ClassX,
+      llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("x")));
+  EXPECT_TRUE(matchAndVerifyResultFalse(
+      "class X {};", ClassX,
+      llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("other-id")));
+  // Binding a nested record reached through hasDeclaration() on a type.
+  TypeMatcher TypeAHasClassB = hasDeclaration(
+    recordDecl(hasName("A"), has(recordDecl(hasName("B")).bind("b"))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class A { public: A *a; class B {}; };", TypeAHasClassB,
+      llvm::make_unique<VerifyIdIsBoundTo<Decl>>("b")));
+  // Binding a statement: the call to method x() inside its own body.
+  StatementMatcher MethodX =
+    callExpr(callee(cxxMethodDecl(hasName("x")))).bind("x");
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class A { void x() { x(); } };", MethodX,
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMemberCallExpr>>("x")));
+}
+
+TEST(Matcher, BindTheSameNameInAlternatives) {
+  StatementMatcher matcher = anyOf(
+    binaryOperator(hasOperatorName("+"),
+                   hasLHS(expr().bind("x")),
+                   hasRHS(integerLiteral(equals(0)))),
+    binaryOperator(hasOperatorName("+"),
+                   hasLHS(integerLiteral(equals(0))),
+                   hasRHS(expr().bind("x"))));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    // The first branch of the matcher binds x to 0 but then fails.
+    // The second branch binds x to f() and succeeds.
+    "int f() { return 0 + f(); }",
+    matcher,
+    llvm::make_unique<VerifyIdIsBoundTo<CallExpr>>("x")));
+}
+
+TEST(Matcher, BindsIDForMemoizedResults) {
+  // Using the same matcher in two match expressions will make memoization
+  // kick in.
+  DeclarationMatcher ClassX = recordDecl(hasName("X")).bind("x");
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { class B { class X {}; }; };",
+    DeclarationMatcher(anyOf(
+      recordDecl(hasName("A"), hasDescendant(ClassX)),
+      recordDecl(hasName("B"), hasDescendant(ClassX)))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 2)));
+}
+
+TEST(HasType, MatchesAsString) {
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z() {Y* y; y->x(); }",
+            cxxMemberCallExpr(on(hasType(asString("class Y *"))))));
+  EXPECT_TRUE(
+    matches("class X { void x(int x) {} };",
+            cxxMethodDecl(hasParameter(0, hasType(asString("int"))))));
+  EXPECT_TRUE(matches("namespace ns { struct A {}; }  struct B { ns::A a; };",
+                      fieldDecl(hasType(asString("ns::A")))));
+  EXPECT_TRUE(matches("namespace { struct A {}; }  struct B { A a; };",
+                      fieldDecl(hasType(asString("struct (anonymous namespace)::A")))));
+}
+
+TEST(Matcher, HasOperatorNameForOverloadedOperatorCall) {
+  // One program, probed for the operator it overloads and for one it does not.
+  const char AndAndProgram[] = "class Y { }; "
+                               "bool operator&&(Y x, Y y) { return true; }; "
+                               "Y a; Y b; bool c = a && b;";
+  StatementMatcher OpCallAndAnd =
+    cxxOperatorCallExpr(hasOverloadedOperatorName("&&"));
+  EXPECT_TRUE(matches(AndAndProgram, OpCallAndAnd));
+  StatementMatcher OpCallLessLess =
+    cxxOperatorCallExpr(hasOverloadedOperatorName("<<"));
+  EXPECT_TRUE(notMatches(AndAndProgram, OpCallLessLess));
+
+  // operator* seen as a call expression, then on method/function declarations.
+  StatementMatcher OpStarCall =
+    cxxOperatorCallExpr(hasOverloadedOperatorName("*"));
+  EXPECT_TRUE(matches("class Y; int operator*(Y &); void f(Y &y) { *y; }",
+                      OpStarCall));
+
+  DeclarationMatcher ClassWithOpStar =
+    cxxRecordDecl(hasMethod(hasOverloadedOperatorName("*")));
+  EXPECT_TRUE(matches("class Y { int operator*(); };", ClassWithOpStar));
+  EXPECT_TRUE(notMatches("class Y { void myOperator(); };", ClassWithOpStar));
+  DeclarationMatcher AnyOpStar = functionDecl(hasOverloadedOperatorName("*"));
+  EXPECT_TRUE(matches("class Y; int operator*(Y &);", AnyOpStar));
+  EXPECT_TRUE(matches("class Y { int operator*(); };", AnyOpStar));
+}
+
+
+TEST(Matcher, NestedOverloadedOperatorCalls) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class Y { }; "
+      "Y& operator&&(Y& x, Y& y) { return x; }; "
+      "Y a; Y b; Y c; Y d = a && b && c;",
+    cxxOperatorCallExpr(hasOverloadedOperatorName("&&")).bind("x"),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXOperatorCallExpr>>("x", 2)));
+  EXPECT_TRUE(matches("class Y { }; "
+                        "Y& operator&&(Y& x, Y& y) { return x; }; "
+                        "Y a; Y b; Y c; Y d = a && b && c;",
+                      cxxOperatorCallExpr(hasParent(cxxOperatorCallExpr()))));
+  EXPECT_TRUE(
+    matches("class Y { }; "
+              "Y& operator&&(Y& x, Y& y) { return x; }; "
+              "Y a; Y b; Y c; Y d = a && b && c;",
+            cxxOperatorCallExpr(hasDescendant(cxxOperatorCallExpr()))));
+}
+
+TEST(Matcher, VarDecl_Storage) {
+  const auto LocalX = varDecl(hasName("X"), hasLocalStorage());
+  EXPECT_TRUE(matches("void f() { int X; }", LocalX));
+  EXPECT_TRUE(notMatches("int X;", LocalX));
+  EXPECT_TRUE(notMatches("void f() { static int X; }", LocalX));
+  // hasGlobalStorage() gives the opposite verdict on the same three inputs.
+  const auto GlobalX = varDecl(hasName("X"), hasGlobalStorage());
+  EXPECT_TRUE(notMatches("void f() { int X; }", GlobalX));
+  EXPECT_TRUE(matches("int X;", GlobalX));
+  EXPECT_TRUE(matches("void f() { static int X; }", GlobalX));
+}
+
+TEST(Matcher, VarDecl_StorageDuration) {
+  std::string T =
+    "void f() { int x; static int y; } int a;";
+
+  EXPECT_TRUE(matches(T, varDecl(hasName("x"), hasAutomaticStorageDuration())));
+  EXPECT_TRUE(
+    notMatches(T, varDecl(hasName("y"), hasAutomaticStorageDuration())));
+  EXPECT_TRUE(
+    notMatches(T, varDecl(hasName("a"), hasAutomaticStorageDuration())));
+
+  EXPECT_TRUE(matches(T, varDecl(hasName("y"), hasStaticStorageDuration())));
+  EXPECT_TRUE(matches(T, varDecl(hasName("a"), hasStaticStorageDuration())));
+  EXPECT_TRUE(notMatches(T, varDecl(hasName("x"), hasStaticStorageDuration())));
+
+  // FIXME: It is really hard to test with thread_local itself because not all
+  // targets support TLS, which causes this to be an error depending on what
+  // platform the test is being run on. We do not have access to the TargetInfo
+  // object to be able to test whether the platform supports TLS or not.
+  EXPECT_TRUE(notMatches(T, varDecl(hasName("x"), hasThreadStorageDuration())));
+  EXPECT_TRUE(notMatches(T, varDecl(hasName("y"), hasThreadStorageDuration())));
+  EXPECT_TRUE(notMatches(T, varDecl(hasName("a"), hasThreadStorageDuration())));
+}
+
+TEST(Matcher, FindsVarDeclInFunctionParameter) {
+  EXPECT_TRUE(matches(
+    "void f(int i) {}",
+    varDecl(hasName("i"))));
+}
+
+TEST(UnaryExpressionOrTypeTraitExpression, MatchesCorrectType) {
+  EXPECT_TRUE(matches("void x() { int a = sizeof(a); }", sizeOfExpr(
+    hasArgumentOfType(asString("int")))));
+  EXPECT_TRUE(notMatches("void x() { int a = sizeof(a); }", sizeOfExpr(
+    hasArgumentOfType(asString("float")))));
+  EXPECT_TRUE(matches(
+    "struct A {}; void x() { A a; int b = sizeof(a); }",
+    sizeOfExpr(hasArgumentOfType(hasDeclaration(recordDecl(hasName("A")))))));
+  EXPECT_TRUE(notMatches("void x() { int a = sizeof(a); }", sizeOfExpr(
+    hasArgumentOfType(hasDeclaration(recordDecl(hasName("string")))))));
+}
+
+TEST(IsInteger, MatchesIntegers) {
+  EXPECT_TRUE(matches("int i = 0;", varDecl(hasType(isInteger()))));
+  EXPECT_TRUE(matches(
+    "long long i = 0; void f(long long) { }; void g() {f(i);}",
+    callExpr(hasArgument(0, declRefExpr(
+      to(varDecl(hasType(isInteger()))))))));
+}
+
+TEST(IsInteger, ReportsNoFalsePositives) {
+  EXPECT_TRUE(notMatches("int *i;", varDecl(hasType(isInteger()))));
+  EXPECT_TRUE(notMatches("struct T {}; T t; void f(T *) { }; void g() {f(&t);}",
+                         callExpr(hasArgument(0, declRefExpr(
+                           to(varDecl(hasType(isInteger()))))))));
+}
+
+TEST(IsSignedInteger, MatchesSignedIntegers) {
+  EXPECT_TRUE(matches("int i = 0;", varDecl(hasType(isSignedInteger()))));
+  EXPECT_TRUE(notMatches("unsigned i = 0;",
+                         varDecl(hasType(isSignedInteger()))));
+}
+
+TEST(IsUnsignedInteger, MatchesUnsignedIntegers) {
+  EXPECT_TRUE(notMatches("int i = 0;", varDecl(hasType(isUnsignedInteger()))));
+  EXPECT_TRUE(matches("unsigned i = 0;",
+                      varDecl(hasType(isUnsignedInteger()))));
+}
+
+TEST(IsAnyPointer, MatchesPointers) {
+  EXPECT_TRUE(matches("int* i = nullptr;", varDecl(hasType(isAnyPointer()))));
+}
+
+TEST(IsAnyPointer, MatchesObjcPointer) {
+  EXPECT_TRUE(matchesObjC("@interface Foo @end Foo *f;",
+                          varDecl(hasType(isAnyPointer()))));
+}
+
+TEST(IsAnyPointer, ReportsNoFalsePositives) {
+  EXPECT_TRUE(notMatches("int i = 0;", varDecl(hasType(isAnyPointer()))));
+}
+
+TEST(IsAnyCharacter, MatchesCharacters) {
+  EXPECT_TRUE(matches("char i = 0;", varDecl(hasType(isAnyCharacter()))));
+}
+
+TEST(IsAnyCharacter, ReportsNoFalsePositives) {
+  EXPECT_TRUE(notMatches("int i;", varDecl(hasType(isAnyCharacter()))));
+}
+
+TEST(IsArrow, MatchesMemberVariablesViaArrow) {
+  EXPECT_TRUE(matches("class Y { void x() { this->y; } int y; };",
+                      memberExpr(isArrow())));
+  EXPECT_TRUE(matches("class Y { void x() { y; } int y; };",
+                      memberExpr(isArrow())));
+  EXPECT_TRUE(notMatches("class Y { void x() { (*this).y; } int y; };",
+                         memberExpr(isArrow())));
+}
+
+TEST(IsArrow, MatchesStaticMemberVariablesViaArrow) {
+  EXPECT_TRUE(matches("class Y { void x() { this->y; } static int y; };",
+                      memberExpr(isArrow())));
+  EXPECT_TRUE(notMatches("class Y { void x() { y; } static int y; };",
+                         memberExpr(isArrow())));
+  EXPECT_TRUE(notMatches("class Y { void x() { (*this).y; } static int y; };",
+                         memberExpr(isArrow())));
+}
+
+TEST(IsArrow, MatchesMemberCallsViaArrow) {
+  // this->x() and implicit x() are expected to match; y.x() is not.
+  StatementMatcher ArrowCall = memberExpr(isArrow());
+  EXPECT_TRUE(matches("class Y { void x() { this->x(); } };", ArrowCall));
+  EXPECT_TRUE(matches("class Y { void x() { x(); } };", ArrowCall));
+  EXPECT_TRUE(
+      notMatches("class Y { void x() { Y y; y.x(); } };", ArrowCall));
+}
+
+TEST(ConversionDeclaration, IsExplicit) {
+  // Only the operator declared with `explicit` is expected to match.
+  DeclarationMatcher Explicit = cxxConversionDecl(isExplicit());
+  EXPECT_TRUE(matches("struct S { explicit operator int(); };", Explicit));
+  EXPECT_TRUE(notMatches("struct S { operator int(); };", Explicit));
+}
+
+TEST(Matcher, ArgumentCount) {
+  // Calls passing exactly one argument, to free functions and methods alike.
+  StatementMatcher SingleArgCall = callExpr(argumentCountIs(1));
+  EXPECT_TRUE(matches("void x(int) { x(0); }", SingleArgCall));
+  EXPECT_TRUE(matches("class X { void x(int) { x(0); } };", SingleArgCall));
+  EXPECT_TRUE(notMatches("void x(int, int) { x(0, 0); }", SingleArgCall));
+}
+
+TEST(Matcher, ParameterCount) { // a trailing `...` is not counted
+  DeclarationMatcher OneParam = functionDecl(parameterCountIs(1));
+  EXPECT_TRUE(matches("void f(int i) {}", OneParam));
+  EXPECT_TRUE(matches("class X { void f(int i) {} };", OneParam));
+  EXPECT_TRUE(matches("void f(int i, ...) {};", OneParam));
+  EXPECT_TRUE(notMatches("void f() {}", OneParam));
+  EXPECT_TRUE(notMatches("void f(int i, int j, int k) {}", OneParam));
+}
+
+TEST(Matcher, References) {
+  // Variables whose type is a reference to the record named X.
+  DeclarationMatcher RefToX =
+      varDecl(hasType(references(recordDecl(hasName("X")))));
+
+  EXPECT_TRUE(matches("class X {}; void y(X y) { X &x = y; }", RefToX));
+  EXPECT_TRUE(matches("class X {}; void y(X y) { const X &x = y; }", RefToX));
+  // Here the match comes from the implicit copy constructor code for
+  // class X, not from the statement 'X x = y'.
+  EXPECT_TRUE(matches("class X {}; void y(X y) { X x = y; }", RefToX));
+
+  // Neither a plain X nor a reference to a pointer to X may match.
+  EXPECT_TRUE(notMatches("class X {}; extern X x;", RefToX));
+  EXPECT_TRUE(
+      notMatches("class X {}; void y(X *y) { X *&x = y; }", RefToX));
+}
+
+TEST(QualType, hasLocalQualifiers) {
+  // const/volatile written directly on the variable's type must match; a
+  // qualifier reached only through a typedef, or none at all, must not.
+  DeclarationMatcher LocallyQualified = varDecl(hasType(hasLocalQualifiers()));
+  EXPECT_TRUE(notMatches("typedef const int const_int; const_int i = 1;",
+                         LocallyQualified));
+  EXPECT_TRUE(matches("int *const j = nullptr;", LocallyQualified));
+  EXPECT_TRUE(matches("int *volatile k;", LocallyQualified));
+  EXPECT_TRUE(notMatches("int m;", LocallyQualified));
+}
+
+TEST(IsExternC, MatchesExternCFunctionDeclarations) { // both extern "C" forms
+  DeclarationMatcher ExternCFunc = functionDecl(isExternC());
+  EXPECT_TRUE(matches("extern \"C\" void f() {}", ExternCFunc));
+  EXPECT_TRUE(matches("extern \"C\" { void f() {} }", ExternCFunc));
+  EXPECT_TRUE(notMatches("void f() {}", ExternCFunc));
+}
+
+TEST(IsExternC, MatchesExternCVariableDeclarations) { // both extern "C" forms
+  EXPECT_TRUE(matches("extern \"C\" int i;", varDecl(isExternC())));
+  EXPECT_TRUE(matches("extern \"C\" { int i; }", varDecl(isExternC())));
+  EXPECT_TRUE(notMatches("int i;", varDecl(isExternC())));
+}
+
+TEST(IsDefaulted, MatchesDefaultedFunctionDeclarations) { // needs `= default`
+  DeclarationMatcher DefaultedA = functionDecl(hasName("~A"), isDefaulted());
+  EXPECT_TRUE(notMatches("class A { ~A(); };", DefaultedA));
+  DeclarationMatcher DefaultedB = functionDecl(hasName("~B"), isDefaulted());
+  EXPECT_TRUE(matches("class B { ~B() = default; };", DefaultedB));
+}
+
+TEST(IsDeleted, MatchesDeletedFunctionDeclarations) {
+  // Only the declaration defined as `= delete` is expected to match.
+  DeclarationMatcher DeletedFunc = functionDecl(hasName("Func"), isDeleted());
+  EXPECT_TRUE(notMatches("void Func();", DeletedFunc));
+  EXPECT_TRUE(matches("void Func() = delete;", DeletedFunc));
+}
+
+TEST(IsNoThrow, MatchesNoThrowFunctionDeclarations) {
+  // Same five snippets against the declaration, then the prototype type.
+  DeclarationMatcher NoThrowDecl = functionDecl(isNoThrow());
+  EXPECT_TRUE(notMatches("void f();", NoThrowDecl));
+  EXPECT_TRUE(notMatches("void f() throw(int);", NoThrowDecl));
+  EXPECT_TRUE(notMatches("void f() noexcept(false);", NoThrowDecl));
+  EXPECT_TRUE(matches("void f() throw();", NoThrowDecl));
+  EXPECT_TRUE(matches("void f() noexcept;", NoThrowDecl));
+  auto NoThrowType = functionProtoType(isNoThrow());
+  EXPECT_TRUE(notMatches("void f();", NoThrowType));
+  EXPECT_TRUE(notMatches("void f() throw(int);", NoThrowType));
+  EXPECT_TRUE(notMatches("void f() noexcept(false);", NoThrowType));
+  EXPECT_TRUE(matches("void f() throw();", NoThrowType));
+  EXPECT_TRUE(matches("void f() noexcept;", NoThrowType));
+}
+
+TEST(isConstexpr, MatchesConstexprDeclarations) { // a variable and a function
+  DeclarationMatcher ConstexprVar = varDecl(hasName("foo"), isConstexpr());
+  EXPECT_TRUE(matches("constexpr int foo = 42;", ConstexprVar));
+  DeclarationMatcher ConstexprFunc = functionDecl(hasName("bar"), isConstexpr());
+  EXPECT_TRUE(matches("constexpr int bar();", ConstexprFunc));
+}
+
+TEST(TemplateArgumentCountIs, Matches) {
+  // C<int> has exactly one template argument, whether it is looked at as a
+  // specialization declaration or as a specialization type.
+  const std::string Code = "template<typename T> struct C {}; C<int> c;";
+  EXPECT_TRUE(matches(
+      Code, classTemplateSpecializationDecl(templateArgumentCountIs(1))));
+  EXPECT_TRUE(notMatches(
+      Code, classTemplateSpecializationDecl(templateArgumentCountIs(2))));
+
+  EXPECT_TRUE(
+      matches(Code, templateSpecializationType(templateArgumentCountIs(1))));
+  EXPECT_TRUE(
+      notMatches(Code, templateSpecializationType(templateArgumentCountIs(2))));
+}
+
+TEST(IsIntegral, Matches) { // C<42> has an integral argument, C<int> does not
+  DeclarationMatcher Plain =
+      classTemplateSpecializationDecl(hasAnyTemplateArgument(isIntegral()));
+  DeclarationMatcher Nested = classTemplateSpecializationDecl(
+      hasAnyTemplateArgument(templateArgument(isIntegral())));
+  EXPECT_TRUE(matches("template<int T> struct C {}; C<42> c;", Plain));
+  EXPECT_TRUE(notMatches("template<typename T> struct C {}; C<int> c;", Nested));
+}
+
+TEST(EqualsIntegralValue, Matches) {
+  auto HasValue = [](const std::string &Value) {
+    return classTemplateSpecializationDecl(
+        hasAnyTemplateArgument(equalsIntegralValue(Value)));
+  };
+  EXPECT_TRUE(matches("template<int T> struct C {}; C<42> c;", HasValue("42")));
+  EXPECT_TRUE(
+      matches("template<int T> struct C {}; C<-42> c;", HasValue("-42")));
+  // -0042 is an octal literal, i.e. decimal -34.
+  EXPECT_TRUE(
+      matches("template<int T> struct C {}; C<-0042> c;", HasValue("-34")));
+  EXPECT_TRUE(
+      notMatches("template<int T> struct C {}; C<42> c;", HasValue("0042")));
+}
+
+TEST(Matcher, MatchesAccessSpecDecls) {
+  // One explicit `public:` label: matched generically and as public only.
+  const std::string WithLabel = "class C { public: int i; };";
+  EXPECT_TRUE(matches(WithLabel, accessSpecDecl()));
+  EXPECT_TRUE(matches(WithLabel, accessSpecDecl(isPublic())));
+  EXPECT_TRUE(notMatches(WithLabel, accessSpecDecl(isProtected())));
+  EXPECT_TRUE(notMatches(WithLabel, accessSpecDecl(isPrivate())));
+
+  // Without a written label no access specifier declaration is expected.
+  EXPECT_TRUE(notMatches("class C { int i; };", accessSpecDecl()));
+}
+
+TEST(Matcher, MatchesFinal) {
+  // `final` is checked first on a class, then on a virtual method.
+  EXPECT_TRUE(matches("class X final {};", cxxRecordDecl(isFinal())));
+  EXPECT_TRUE(notMatches("class X {};", cxxRecordDecl(isFinal())));
+  DeclarationMatcher FinalMethod = cxxMethodDecl(isFinal());
+  EXPECT_TRUE(matches("class X { virtual void f() final; };", FinalMethod));
+  EXPECT_TRUE(notMatches("class X { virtual void f(); };", FinalMethod));
+}
+
+TEST(Matcher, MatchesVirtualMethod) { // the non-virtual f must not match
+  DeclarationMatcher VirtualF = cxxMethodDecl(isVirtual(), hasName("::X::f"));
+  EXPECT_TRUE(matches("class X { virtual int f(); };", VirtualF));
+  EXPECT_TRUE(notMatches("class X { int f(); };", cxxMethodDecl(isVirtual())));
+}
+
+TEST(Matcher, MatchesVirtualAsWrittenMethod) {
+  // Only A::f is written with the `virtual` keyword; B::f is not.
+  const std::string Code = "class A { virtual int f(); };"
+                           "class B : public A { int f(); };";
+  EXPECT_TRUE(
+      matches(Code, cxxMethodDecl(isVirtualAsWritten(), hasName("::A::f"))));
+  EXPECT_TRUE(
+      notMatches(Code, cxxMethodDecl(isVirtualAsWritten(), hasName("::B::f"))));
+}
+
+TEST(Matcher, MatchesPureMethod) { // only the `= 0` method must match
+  DeclarationMatcher PureF = cxxMethodDecl(isPure(), hasName("::X::f"));
+  EXPECT_TRUE(matches("class X { virtual int f() = 0; };", PureF));
+  EXPECT_TRUE(notMatches("class X { int f(); };", cxxMethodDecl(isPure())));
+}
+
+TEST(Matcher, MatchesCopyAssignmentOperator) {
+  // By-value and lvalue-reference parameters of any cv-qualification are
+  // expected to match; an rvalue-reference parameter is not.
+  DeclarationMatcher CopyAssign = cxxMethodDecl(isCopyAssignmentOperator());
+
+  EXPECT_TRUE(matches("class X { X &operator=(X); };", CopyAssign));
+  EXPECT_TRUE(matches("class X { X &operator=(X &); };", CopyAssign));
+  EXPECT_TRUE(matches("class X { X &operator=(const X &); };", CopyAssign));
+  EXPECT_TRUE(matches("class X { X &operator=(volatile X &); };", CopyAssign));
+  EXPECT_TRUE(
+      matches("class X { X &operator=(const volatile X &); };", CopyAssign));
+
+  EXPECT_TRUE(notMatches("class X { X &operator=(X &&); };", CopyAssign));
+}
+
+TEST(Matcher, MatchesMoveAssignmentOperator) {
+  // Rvalue-reference parameters of any cv-qualification are expected to
+  // match; by-value and lvalue-reference parameters are not.
+  DeclarationMatcher MoveAssign = cxxMethodDecl(isMoveAssignmentOperator());
+
+  EXPECT_TRUE(matches("class X { X &operator=(X &&); };", MoveAssign));
+  EXPECT_TRUE(matches("class X { X &operator=(const X &&); };", MoveAssign));
+  EXPECT_TRUE(matches("class X { X &operator=(volatile X &&); };", MoveAssign));
+  EXPECT_TRUE(
+      matches("class X { X &operator=(const volatile X &&); };", MoveAssign));
+
+  EXPECT_TRUE(notMatches("class X { X &operator=(X); };", MoveAssign));
+  EXPECT_TRUE(notMatches("class X { X &operator=(X &); };", MoveAssign));
+}
+
+TEST(Matcher, MatchesConstMethod) {
+  // Only the method declared `const` is expected to match.
+  DeclarationMatcher ConstMethod = cxxMethodDecl(isConst());
+  EXPECT_TRUE(matches("struct A { void foo() const; };", ConstMethod));
+  EXPECT_TRUE(notMatches("struct A { void foo(); };", ConstMethod));
+}
+
+TEST(Matcher, MatchesOverridingMethod) {
+  // Y::f overrides the virtual X::f; X::f itself is not an override.
+  const std::string Virtual = "class X { virtual int f(); }; "
+                              "class Y : public X { int f(); };";
+  EXPECT_TRUE(matches(Virtual, cxxMethodDecl(isOverride(), hasName("::Y::f"))));
+  EXPECT_TRUE(
+      notMatches(Virtual, cxxMethodDecl(isOverride(), hasName("::X::f"))));
+  // Redeclaring a non-virtual method, or overloading, is not overriding.
+  DeclarationMatcher AnyOverride = cxxMethodDecl(isOverride());
+  EXPECT_TRUE(notMatches("class X { int f(); }; "
+                         "class Y : public X { int f(); };", AnyOverride));
+  EXPECT_TRUE(notMatches("class X { int f(); int f(int); }; ", AnyOverride));
+  EXPECT_TRUE(
+      matches("template <typename Base> struct Y : Base { void f() override;};",
+              cxxMethodDecl(isOverride(), hasName("::Y::f"))));
+}
+
+TEST(Matcher, ConstructorArgument) {
+  // Constructor calls whose first argument refers to the variable `y`.
+  StatementMatcher FirstArgIsY =
+      cxxConstructExpr(hasArgument(0, declRefExpr(to(varDecl(hasName("y"))))));
+
+  // Direct, functional-cast and copy initialization all construct X from y.
+  EXPECT_TRUE(matches(
+      "class X { public: X(int); }; void x() { int y; X x(y); }", FirstArgIsY));
+  EXPECT_TRUE(
+      matches("class X { public: X(int); }; void x() { int y; X x = X(y); }",
+              FirstArgIsY));
+  EXPECT_TRUE(matches(
+      "class X { public: X(int); }; void x() { int y; X x = y; }", FirstArgIsY));
+  EXPECT_TRUE(
+      notMatches("class X { public: X(int); }; void x() { int z; X x(z); }",
+                 FirstArgIsY));
+
+  // Argument index 42 does not exist in the call, so no match is expected.
+  StatementMatcher OutOfRangeArg =
+      cxxConstructExpr(hasArgument(42, declRefExpr(to(varDecl(hasName("y"))))));
+  EXPECT_TRUE(notMatches(
+      "class X { public: X(int); }; void x() { int y; X x(y); }", OutOfRangeArg));
+}
+
+TEST(Matcher, ConstructorArgumentCount) {
+  // Constructor calls that pass exactly one argument.
+  StatementMatcher OneArgCtor = cxxConstructExpr(argumentCountIs(1));
+
+  // Direct, functional-cast and copy initialization from a single value.
+  EXPECT_TRUE(
+      matches("class X { public: X(int); }; void x() { X x(0); }", OneArgCtor));
+  EXPECT_TRUE(matches("class X { public: X(int); }; void x() { X x = X(0); }",
+                      OneArgCtor));
+  EXPECT_TRUE(
+      matches("class X { public: X(int); }; void x() { X x = 0; }", OneArgCtor));
+  // A two-argument construction must be rejected.
+  EXPECT_TRUE(
+      notMatches("class X { public: X(int, int); }; void x() { X x(0, 0); }",
+                 OneArgCtor));
+}
+
+TEST(Matcher, ConstructorListInitialization) {
+  // Braced initialization must match; parenthesized must not. The negative
+  // case uses notMatches() like the rest of the file; EXPECT_FALSE(matches())
+  // would presumably also pass on a parse failure (confirm in test helper).
+  StatementMatcher ConstructorListInit =
+      cxxConstructExpr(isListInitialization());
+  EXPECT_TRUE(matches("class X { public: X(int); }; void x() { X x{0}; }",
+                      ConstructorListInit));
+  EXPECT_TRUE(notMatches("class X { public: X(int); }; void x() { X x(0); }",
+                         ConstructorListInit));
+}
+
+TEST(ConstructorDeclaration, IsImplicit) {
+  DeclarationMatcher ImplicitCtor = cxxConstructorDecl(isImplicit());
+  // No match here: the constructor is not needed, so the compiler does not
+  // add one.
+  EXPECT_TRUE(notMatches("class Foo { };", ImplicitCtor));
+  // Here the compiler adds the implicit default constructor.
+  EXPECT_TRUE(matches("class Foo { }; Foo* f = new Foo();", ImplicitCtor));
+  // A constructor written in the class is matched by unless(isImplicit()).
+  EXPECT_TRUE(matches("class Foo { Foo(){} };",
+                      cxxConstructorDecl(unless(isImplicit()))));
+  // Here the compiler adds an implicit assignment operator.
+  EXPECT_TRUE(matches("struct A { int x; } a = {0}, b = a; void f() { a = b; }",
+                      cxxMethodDecl(isImplicit(), hasName("operator="))));
+}
+
+TEST(ConstructorDeclaration, IsExplicit) {
+  // Only the constructor declared with `explicit` is expected to match.
+  DeclarationMatcher Explicit = cxxConstructorDecl(isExplicit());
+  EXPECT_TRUE(matches("struct S { explicit S(int); };", Explicit));
+  EXPECT_TRUE(notMatches("struct S { S(int); };", Explicit));
+}
+
+TEST(ConstructorDeclaration, Kinds) {
+  // Each snippet declares one constructor, which must be classified as
+  // exactly one of default, copy or move.
+  DeclarationMatcher Default = cxxConstructorDecl(isDefaultConstructor());
+  DeclarationMatcher Copy = cxxConstructorDecl(isCopyConstructor());
+  DeclarationMatcher Move = cxxConstructorDecl(isMoveConstructor());
+
+  const std::string DefaultCode = "struct S { S(); };";
+  EXPECT_TRUE(matches(DefaultCode, Default));
+  EXPECT_TRUE(notMatches(DefaultCode, Copy));
+  EXPECT_TRUE(notMatches(DefaultCode, Move));
+
+  const std::string CopyCode = "struct S { S(const S&); };";
+  EXPECT_TRUE(notMatches(CopyCode, Default));
+  EXPECT_TRUE(matches(CopyCode, Copy));
+  EXPECT_TRUE(notMatches(CopyCode, Move));
+
+  const std::string MoveCode = "struct S { S(S&&); };";
+  EXPECT_TRUE(notMatches(MoveCode, Default));
+  EXPECT_TRUE(notMatches(MoveCode, Copy));
+  EXPECT_TRUE(matches(MoveCode, Move));
+}
+
+TEST(ConstructorDeclaration, IsUserProvided) {
+  DeclarationMatcher UserProvided = cxxConstructorDecl(isUserProvided());
+
+  // No constructor written, or one that is defaulted or deleted in-class.
+  EXPECT_TRUE(notMatches("struct S { int X = 0; };", UserProvided));
+  EXPECT_TRUE(notMatches("struct S { S() = default; };", UserProvided));
+  EXPECT_TRUE(notMatches("struct S { S() = delete; };", UserProvided));
+
+  // A declared constructor, with or without an out-of-line definition.
+  EXPECT_TRUE(matches("struct S { S(); };", UserProvided));
+  EXPECT_TRUE(matches("struct S { S(); }; S::S(){}", UserProvided));
+}
+
+TEST(ConstructorDeclaration, IsDelegatingConstructor) { // ctor calling a ctor
+  DeclarationMatcher Delegating = cxxConstructorDecl(isDelegatingConstructor());
+  EXPECT_TRUE(notMatches("struct S { S(); S(int); int X; };", Delegating));
+  EXPECT_TRUE(
+      notMatches("struct S { S(){} S(int X) : X(X) {} int X; };", Delegating));
+  EXPECT_TRUE(matches(
+      "struct S { S() : S(0) {} S(int X) : X(X) {} int X; };",
+      cxxConstructorDecl(isDelegatingConstructor(), parameterCountIs(0))));
+  EXPECT_TRUE(matches(
+      "struct S { S(); S(int X); int X; }; S::S(int X) : S() {}",
+      cxxConstructorDecl(isDelegatingConstructor(), parameterCountIs(1))));
+}
+
+TEST(StringLiteral, HasSize) {
+  StatementMatcher FourChars = stringLiteral(hasSize(4));
+  // Narrow and wide literals of four characters.
+  EXPECT_TRUE(matches("const char *s = \"abcd\";", FourChars));
+  EXPECT_TRUE(matches("const wchar_t *s = L\"abcd\";", FourChars));
+  // Four characters written with \x escapes.
+  EXPECT_TRUE(matches("const char *s = \"\x05\x06\x07\x08\";", FourChars));
+  // Only two characters: too short to match.
+  EXPECT_TRUE(notMatches("const char *s = \"ab\";", FourChars));
+}
+
+TEST(Matcher, HasNameSupportsNamespaces) {
+  // C is declared in namespace b, itself nested in namespace a.
+  const std::string Code = "namespace a { namespace b { class C; } }";
+
+  // Without a leading "::" any trailing part of the qualified name is
+  // accepted; the fully qualified name may also be anchored with "::".
+  EXPECT_TRUE(matches(Code, recordDecl(hasName("a::b::C"))));
+  EXPECT_TRUE(matches(Code, recordDecl(hasName("::a::b::C"))));
+  EXPECT_TRUE(matches(Code, recordDecl(hasName("b::C"))));
+  EXPECT_TRUE(matches(Code, recordDecl(hasName("C"))));
+
+  // Wrong components, anchored partial names, an extra outer scope and a
+  // malformed separator are all expected to be rejected.
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("c::b::C"))));
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("a::c::C"))));
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("a::b::A"))));
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("::C"))));
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("::b::C"))));
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("z::a::b::C"))));
+  EXPECT_TRUE(notMatches(Code, recordDecl(hasName("a+b::C"))));
+
+  // "C" must not match a class named AC, i.e. a name that merely ends in
+  // the requested identifier.
+  EXPECT_TRUE(notMatches("namespace a { namespace b { class AC; } }",
+                         recordDecl(hasName("C"))));
+}
+
+TEST(Matcher, HasNameSupportsOuterClasses) { // C is nested in A::B
+  EXPECT_TRUE( // accepted: qualified name, unanchored ...
+    matches("class A { class B { class C; }; };",
+            recordDecl(hasName("A::B::C"))));
+  EXPECT_TRUE( // ... anchored at the global scope ...
+    matches("class A { class B { class C; }; };",
+            recordDecl(hasName("::A::B::C"))));
+  EXPECT_TRUE( // ... or any trailing part of it
+    matches("class A { class B { class C; }; };",
+            recordDecl(hasName("B::C"))));
+  EXPECT_TRUE(
+    matches("class A { class B { class C; }; };",
+            recordDecl(hasName("C"))));
+  EXPECT_TRUE( // rejected: a wrong component in any position
+    notMatches("class A { class B { class C; }; };",
+               recordDecl(hasName("c::B::C"))));
+  EXPECT_TRUE(
+    notMatches("class A { class B { class C; }; };",
+               recordDecl(hasName("A::c::C"))));
+  EXPECT_TRUE(
+    notMatches("class A { class B { class C; }; };",
+               recordDecl(hasName("A::B::A"))));
+  EXPECT_TRUE( // rejected: anchored names that skip outer classes
+    notMatches("class A { class B { class C; }; };",
+               recordDecl(hasName("::C"))));
+  EXPECT_TRUE(
+    notMatches("class A { class B { class C; }; };",
+               recordDecl(hasName("::B::C"))));
+  EXPECT_TRUE(notMatches("class A { class B { class C; }; };", // extra scope
+                         recordDecl(hasName("z::A::B::C"))));
+  EXPECT_TRUE( // rejected: '+' is not a scope separator
+    notMatches("class A { class B { class C; }; };",
+               recordDecl(hasName("A+B::C"))));
+}
+
+TEST(Matcher, HasNameSupportsInlinedNamespaces) { // inline `b` may be omitted
+  const std::string Snippet = "namespace a { inline namespace b { class C; } }";
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasName("a::C"))));
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasName("::a::C"))));
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasName("a::b::C"))));
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasName("::a::b::C"))));
+}
+
+TEST(Matcher, HasNameSupportsAnonymousNamespaces) {
+  // The "(anonymous namespace)" component may be spelled out or omitted.
+  const std::string Src = "namespace a { namespace { class C; } }";
+  EXPECT_TRUE(matches(Src, recordDecl(hasName("a::C"))));
+  EXPECT_TRUE(matches(Src, recordDecl(hasName("::a::C"))));
+  EXPECT_TRUE(matches(Src, recordDecl(hasName("a::(anonymous namespace)::C"))));
+  EXPECT_TRUE(
+      matches(Src, recordDecl(hasName("::a::(anonymous namespace)::C"))));
+}
+
+TEST(Matcher, HasNameSupportsAnonymousOuterClasses) {
+  // The "(anonymous class)" / "(anonymous struct)" scope must be spelled
+  // out: "::A::C" is expected not to match. The negative checks use
+  // notMatches(), as the rest of the file does, not EXPECT_FALSE(matches()).
+  const std::string Cls = "class A { class { class C; } x; };";
+  EXPECT_TRUE(matches(Cls, recordDecl(hasName("A::(anonymous class)::C"))));
+  EXPECT_TRUE(matches(Cls, recordDecl(hasName("::A::(anonymous class)::C"))));
+  EXPECT_TRUE(notMatches(Cls, recordDecl(hasName("::A::C"))));
+
+  const std::string Str = "class A { struct { class C; } x; };";
+  EXPECT_TRUE(matches(Str, recordDecl(hasName("A::(anonymous struct)::C"))));
+  EXPECT_TRUE(matches(Str, recordDecl(hasName("::A::(anonymous struct)::C"))));
+  EXPECT_TRUE(notMatches(Str, recordDecl(hasName("::A::C"))));
+}
+
+TEST(Matcher, HasNameSupportsFunctionScope) { // names declared inside F
+  const std::string Code =
+      "namespace a { void F(int a) { struct S { int m; }; int i; } }";
+  // notMatches() replaces EXPECT_FALSE(matches()) for file-wide consistency.
+  EXPECT_TRUE(matches(Code, varDecl(hasName("i"))));
+  EXPECT_TRUE(notMatches(Code, varDecl(hasName("F()::i"))));
+  EXPECT_TRUE(matches(Code, fieldDecl(hasName("m"))));
+  EXPECT_TRUE(matches(Code, fieldDecl(hasName("S::m"))));
+  EXPECT_TRUE(matches(Code, fieldDecl(hasName("F(int)::S::m"))));
+  EXPECT_TRUE(matches(Code, fieldDecl(hasName("a::F(int)::S::m"))));
+  EXPECT_TRUE(matches(Code, fieldDecl(hasName("::a::F(int)::S::m"))));
+}
+
+TEST(Matcher, HasAnyName) {
+  const std::string Snippet = "namespace a { namespace b { class C; } }";
+  // One matching name anywhere in the list is enough.
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasAnyName("XX", "a::b::C"))));
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasAnyName("a::b::C", "XX"))));
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasAnyName("XX::C", "a::b::C"))));
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasAnyName("XX", "C"))));
+  // Of the globally anchored names only the fully qualified one matches.
+  EXPECT_TRUE(notMatches(Snippet, recordDecl(hasAnyName("::C", "::b::C"))));
+  EXPECT_TRUE(matches(
+      Snippet, recordDecl(hasAnyName("::C", "::b::C", "::a::b::C"))));
+  // The names may also be passed as a vector.
+  std::vector<StringRef> NameList = {"::C", "::b::C", "::a::b::C"};
+  EXPECT_TRUE(matches(Snippet, recordDecl(hasAnyName(NameList))));
+}
+
+TEST(Matcher, IsDefinition) {
+  // A class with a body is a definition; a forward declaration is not.
+  DeclarationMatcher ClassDef = recordDecl(hasName("A"), isDefinition());
+  EXPECT_TRUE(matches("class A {};", ClassDef));
+  EXPECT_TRUE(notMatches("class A;", ClassDef));
+
+  // A plain variable is a definition; an `extern` declaration is not.
+  DeclarationMatcher VarDef = varDecl(hasName("a"), isDefinition());
+  EXPECT_TRUE(matches("int a;", VarDef));
+  EXPECT_TRUE(notMatches("extern int a;", VarDef));
+
+  // A method with a body is a definition; a bodiless declaration is not.
+  DeclarationMatcher MethodDef = cxxMethodDecl(hasName("a"), isDefinition());
+  EXPECT_TRUE(matches("class A { void a() {} };", MethodDef));
+  EXPECT_TRUE(notMatches("class A { void a(); };", MethodDef));
+}
+
+TEST(Matcher, HandlesNullQualTypes) { // QualType matchers vs. a null QualType
+  // FIXME: Once a dedicated Type matcher exists, replace the uses of this
+  // variable with Type(True()).
+  const TypeMatcher AnyType = anything();
+
+  // Whether the matcher succeeds is not the point of this test; it checks
+  // that matching the snippet below completes without crashing.
+  EXPECT_TRUE(matches(
+    "struct A { };"
+      "template <typename T>"
+      "void f(T t) {"
+      "  T local_t(t /* this becomes a null QualType in the AST */);"
+      "}"
+      "void g() {"
+      "  f(0);"
+      "}",
+    expr(hasType(TypeMatcher(
+      anyOf(
+        TypeMatcher(hasDeclaration(anything())),
+        pointsTo(AnyType),
+        references(AnyType)
+        // Further QualType matchers should be added to this list.
+      ))))));
+}
+
+
+TEST(StatementCountIs, FindsNoStatementsInAnEmptyCompoundStatement) {
+  // An empty function body is expected to have a statement count of 0 and
+  // not of 1.
+  EXPECT_TRUE(matches("void f() { }", compoundStmt(statementCountIs(0))));
+  EXPECT_TRUE(notMatches("void f() {}", compoundStmt(statementCountIs(1))));
+}
+
+TEST(StatementCountIs, AppearsToMatchOnlyOneCount) {
+  // A body with one statement matches a count of 1 and neither of the
+  // neighbouring counts.
+  const std::string OneStmt = "void f() { 1; }";
+  EXPECT_TRUE(matches(OneStmt, compoundStmt(statementCountIs(1))));
+  EXPECT_TRUE(notMatches(OneStmt, compoundStmt(statementCountIs(0))));
+  EXPECT_TRUE(notMatches(OneStmt, compoundStmt(statementCountIs(2))));
+}
+
+TEST(StatementCountIs, WorksWithMultipleStatements) { // body has 3 statements
+  StatementMatcher ThreeStmts = compoundStmt(statementCountIs(3));
+  EXPECT_TRUE(matches("void f() { 1; 2; 3; }", ThreeStmts));
+}
+
+TEST(StatementCountIs, WorksWithNestedCompoundStatements) {
+  // The outer body holds 2 statements (the two blocks); the inner blocks
+  // hold 1 and 4. No compound statement in the snippet holds 3.
+  const std::string Nested = "void f() { { 1; } { 1; 2; 3; 4; } }";
+
+  EXPECT_TRUE(matches(Nested, compoundStmt(statementCountIs(1))));
+  EXPECT_TRUE(matches(Nested, compoundStmt(statementCountIs(2))));
+  EXPECT_TRUE(notMatches(Nested, compoundStmt(statementCountIs(3))));
+  EXPECT_TRUE(matches(Nested, compoundStmt(statementCountIs(4))));
+}
+
+TEST(Member, WorksInSimplestCase) { // s.first accesses the member `first`
+  StatementMatcher FirstMember = memberExpr(member(hasName("first")));
+  EXPECT_TRUE(matches("struct { int first; } s; int i(s.first);", FirstMember));
+}
+
+TEST(Member, DoesNotMatchTheBaseExpression) {
+  // Here `first` is the object being accessed, not the member; member()
+  // must check the member name (`i`) only, so no match is expected.
+  StatementMatcher First = memberExpr(member(hasName("first")));
+  EXPECT_TRUE(notMatches("struct { int i; } first; int i(first.i);", First));
+}
+
+TEST(Member, MatchesInMemberFunctionCall) { // the member here is a method
+  const std::string Code = "void f() {"
+                           "  struct { void first() {}; } s;"
+                           "  s.first();"
+                           "};";
+  EXPECT_TRUE(matches(Code, memberExpr(member(hasName("first")))));
+}
+
+TEST(Member, MatchesMember) { // int field matches, float field does not
+  StatementMatcher IntFieldAccess =
+      memberExpr(hasDeclaration(fieldDecl(hasType(isInteger()))));
+  EXPECT_TRUE(
+      matches("struct A { int i; }; void f() { A a; a.i = 2; }", IntFieldAccess));
+  EXPECT_TRUE(notMatches(
+      "struct A { float f; }; void f() { A a; a.f = 2.0f; }", IntFieldAccess));
+}
+
+TEST(Member, BitFields) {
+  // `a` is a 2-bit field; `b` is a bit-field only in the second snippet.
+  const std::string OneBitField = "class C { int a : 2; int b; };";
+  EXPECT_TRUE(matches(OneBitField, fieldDecl(isBitField(), hasName("a"))));
+  EXPECT_TRUE(notMatches(OneBitField, fieldDecl(isBitField(), hasName("b"))));
+  EXPECT_TRUE(matches("class C { int a : 2; int b : 4; };",
+                      fieldDecl(isBitField(), hasBitWidth(2), hasName("a"))));
+}
+
+TEST(Member, UnderstandsAccess) {
+  DeclarationMatcher PublicI = fieldDecl(isPublic(), hasName("i"));
+  DeclarationMatcher ProtectedI = fieldDecl(isProtected(), hasName("i"));
+  DeclarationMatcher PrivateI = fieldDecl(isPrivate(), hasName("i"));
+  // A struct member without an access label is public.
+  const std::string InStruct = "struct A { int i; };";
+  EXPECT_TRUE(matches(InStruct, PublicI));
+  EXPECT_TRUE(notMatches(InStruct, ProtectedI));
+  EXPECT_TRUE(notMatches(InStruct, PrivateI));
+
+  // A class member without an access label is private.
+  const std::string InClass = "class A { int i; };";
+  EXPECT_TRUE(notMatches(InClass, PublicI));
+  EXPECT_TRUE(notMatches(InClass, ProtectedI));
+  EXPECT_TRUE(matches(InClass, PrivateI));
+
+  // An explicit `protected:` label decides the access.
+  const std::string Labelled = "class A { protected: int i; };";
+  EXPECT_TRUE(notMatches(Labelled, PublicI));
+  EXPECT_TRUE(matches(Labelled, ProtectedI));
+  EXPECT_TRUE(notMatches(Labelled, PrivateI));
+
+  // Non-member decls have the AccessSpecifier AS_none and thus aren't matched.
+  EXPECT_TRUE(notMatches("int i;", varDecl(isPublic(), hasName("i"))));
+  EXPECT_TRUE(notMatches("int i;", varDecl(isProtected(), hasName("i"))));
+  EXPECT_TRUE(notMatches("int i;", varDecl(isPrivate(), hasName("i"))));
+}
+
+TEST(hasDynamicExceptionSpec, MatchesDynamicExceptionSpecifications) {
+  // Any throw(...) form is expected to match; an absent specification and
+  // every noexcept form are expected not to.
+  DeclarationMatcher DynSpecDecl = functionDecl(hasDynamicExceptionSpec());
+
+  // No specification, or a noexcept form: no match.
+  EXPECT_TRUE(notMatches("void f();", DynSpecDecl));
+  EXPECT_TRUE(notMatches("void g() noexcept;", DynSpecDecl));
+  EXPECT_TRUE(notMatches("void h() noexcept(true);", DynSpecDecl));
+  EXPECT_TRUE(notMatches("void i() noexcept(false);", DynSpecDecl));
+
+  // Any throw(...) form: match.
+  EXPECT_TRUE(matches("void j() throw();", DynSpecDecl));
+  EXPECT_TRUE(matches("void k() throw(int);", DynSpecDecl));
+  EXPECT_TRUE(matches("void l() throw(...);", DynSpecDecl));
+
+  // The same snippets against the function prototype type.
+  auto DynSpecType = functionProtoType(hasDynamicExceptionSpec());
+
+  EXPECT_TRUE(notMatches("void f();", DynSpecType));
+  EXPECT_TRUE(notMatches("void g() noexcept;", DynSpecType));
+  EXPECT_TRUE(notMatches("void h() noexcept(true);", DynSpecType));
+  EXPECT_TRUE(notMatches("void i() noexcept(false);", DynSpecType));
+
+  // Again, any throw(...) form: match.
+  EXPECT_TRUE(matches("void j() throw();", DynSpecType));
+  EXPECT_TRUE(matches("void k() throw(int);", DynSpecType));
+  EXPECT_TRUE(matches("void l() throw(...);", DynSpecType));
+}
+
+TEST(HasObjectExpression, DoesNotMatchMember) { // object z is a Z, not an X
+  const std::string Code = "class X {}; struct Z { X m; }; void f(Z z) { z.m; }";
+  EXPECT_TRUE(notMatches(
+      Code, memberExpr(hasObjectExpression(hasType(recordDecl(hasName("X")))))));
+}
+
+TEST(HasObjectExpression, MatchesBaseOfVariable) {
+  // The object may be accessed by value (x.m) or through a pointer (x->m).
+  EXPECT_TRUE(matches("struct X { int m; }; void f(X x) { x.m; }",
+                      memberExpr(hasObjectExpression(
+                          hasType(recordDecl(hasName("X")))))));
+  EXPECT_TRUE(matches("struct X { int m; }; void f(X* x) { x->m; }",
+                      memberExpr(hasObjectExpression(
+                          hasType(pointsTo(recordDecl(hasName("X"))))))));
+}
+
+TEST(HasObjectExpression,
+     MatchesObjectExpressionOfImplicitlyFormedMemberExpression) {
+  // Explicit this->m and implicit m are both expected to have an object
+  // expression whose type points to S.
+  StatementMatcher ObjectIsPtrToS = memberExpr(
+      hasObjectExpression(hasType(pointsTo(recordDecl(hasName("S"))))));
+  EXPECT_TRUE(matches("class X {}; struct S { X m; void f() { this->m; } };",
+                      ObjectIsPtrToS));
+  EXPECT_TRUE(
+      matches("class X {}; struct S { X m; void f() { m; } };", ObjectIsPtrToS));
+}
+
+TEST(Field, DoesNotMatchNonFieldMembers) { // method, class, enumerator, enum
+  EXPECT_TRUE(notMatches("class X { void m(); };", fieldDecl(hasName("m"))));
+  EXPECT_TRUE(notMatches("class X { class m {}; };", fieldDecl(hasName("m"))));
+  EXPECT_TRUE(notMatches("class X { enum { m }; };", fieldDecl(hasName("m"))));
+  EXPECT_TRUE(notMatches("class X { enum m {}; };", fieldDecl(hasName("m"))));
+}
+
+TEST(Field, MatchesField) { // a data member named m must match
+  EXPECT_TRUE(matches("class X { int m; };", fieldDecl(hasName("m"))));
+}
+
+TEST(IsVolatileQualified, QualifiersMatch) {
+  // The variable's own type must be volatile, directly or via a typedef; a
+  // pointer to volatile int is expected not to match.
+  DeclarationMatcher Volatile = varDecl(hasType(isVolatileQualified()));
+  EXPECT_TRUE(matches("volatile int i = 42;", Volatile));
+  EXPECT_TRUE(matches("typedef volatile int v_int; v_int i = 42;", Volatile));
+  EXPECT_TRUE(notMatches("volatile int *i;", Volatile));
+}
+
+TEST(IsConstQualified, MatchesConstInt) {
+  EXPECT_TRUE(matches("const int i = 42;",
+                      varDecl(hasType(isConstQualified()))));
+}
+
+TEST(IsConstQualified, MatchesConstPointer) {
+  EXPECT_TRUE(matches("int i = 42; int* const p(&i);",
+                      varDecl(hasType(isConstQualified()))));
+}
+
+TEST(IsConstQualified, MatchesThroughTypedef) {
+  EXPECT_TRUE(matches("typedef const int const_int; const_int i = 42;",
+                      varDecl(hasType(isConstQualified()))));
+  EXPECT_TRUE(matches("typedef int* int_ptr; const int_ptr p(0);",
+                      varDecl(hasType(isConstQualified()))));
+}
+
+TEST(IsConstQualified, DoesNotMatchInappropriately) {
+  EXPECT_TRUE(notMatches("typedef int nonconst_int; nonconst_int i = 42;",
+                         varDecl(hasType(isConstQualified()))));
+  EXPECT_TRUE(notMatches("int const* p;",
+                         varDecl(hasType(isConstQualified()))));
+}
+
+TEST(DeclCount, DeclCountIsCorrect) {
+  EXPECT_TRUE(matches("void f() {int i,j;}",
+                      declStmt(declCountIs(2))));
+  EXPECT_TRUE(notMatches("void f() {int i,j; int k;}",
+                         declStmt(declCountIs(3))));
+  EXPECT_TRUE(notMatches("void f() {int i,j, k, l;}",
+                         declStmt(declCountIs(3))));
+}
+
+
+TEST(EachOf, TriggersForEachMatch) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { int a; int b; };",
+    recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
+                      has(fieldDecl(hasName("b")).bind("v")))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("v", 2)));
+}
+
+TEST(EachOf, BehavesLikeAnyOfUnlessBothMatch) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { int a; int c; };",
+    recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
+                      has(fieldDecl(hasName("b")).bind("v")))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("v", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { int c; int b; };",
+    recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
+                      has(fieldDecl(hasName("b")).bind("v")))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("v", 1)));
+  EXPECT_TRUE(notMatches(
+    "class A { int c; int d; };",
+    recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
+                      has(fieldDecl(hasName("b")).bind("v"))))));
+}
+
+TEST(IsTemplateInstantiation, MatchesImplicitClassTemplateInstantiation) {
+  // Make sure that we can both match the class by name (::X) and by the type
+  // the template was instantiated with (via a field).
+
+  EXPECT_TRUE(matches(
+    "template <typename T> class X {}; class A {}; X<A> x;",
+    cxxRecordDecl(hasName("::X"), isTemplateInstantiation())));
+
+  EXPECT_TRUE(matches(
+    "template <typename T> class X { T t; }; class A {}; X<A> x;",
+    cxxRecordDecl(isTemplateInstantiation(), hasDescendant(
+      fieldDecl(hasType(recordDecl(hasName("A"))))))));
+}
+
+TEST(IsTemplateInstantiation, MatchesImplicitFunctionTemplateInstantiation) {
+  EXPECT_TRUE(matches(
+    "template <typename T> void f(T t) {} class A {}; void g() { f(A()); }",
+    functionDecl(hasParameter(0, hasType(recordDecl(hasName("A")))),
+                 isTemplateInstantiation())));
+}
+
+TEST(IsTemplateInstantiation, MatchesExplicitClassTemplateInstantiation) {
+  EXPECT_TRUE(matches(
+    "template <typename T> class X { T t; }; class A {};"
+      "template class X<A>;",
+    cxxRecordDecl(isTemplateInstantiation(), hasDescendant(
+      fieldDecl(hasType(recordDecl(hasName("A"))))))));
+}
+
+TEST(IsTemplateInstantiation,
+     MatchesInstantiationOfPartiallySpecializedClassTemplate) {
+  EXPECT_TRUE(matches(
+    "template <typename T> class X {};"
+      "template <typename T> class X<T*> {}; class A {}; X<A*> x;",
+    cxxRecordDecl(hasName("::X"), isTemplateInstantiation())));
+}
+
+TEST(IsTemplateInstantiation,
+     MatchesInstantiationOfClassTemplateNestedInNonTemplate) {
+  EXPECT_TRUE(matches(
+    "class A {};"
+      "class X {"
+      "  template <typename U> class Y { U u; };"
+      "  Y<A> y;"
+      "};",
+    cxxRecordDecl(hasName("::X::Y"), isTemplateInstantiation())));
+}
+
+TEST(IsTemplateInstantiation, DoesNotMatchInstantiationsInsideOfInstantiation) {
+  // FIXME: Figure out whether this makes sense: a record named ::X<A>::Y
+  // matches unless(isTemplateInstantiation()). Normal use is unaffected while
+  // the uppermost instantiation is marked, but as a predicate it may confuse.
+  EXPECT_TRUE(matches(
+    "class A {};"
+      "template <typename T> class X {"
+      "  template <typename U> class Y { U u; };"
+      "  Y<T> y;"
+      "}; X<A> x;",
+    cxxRecordDecl(hasName("::X<A>::Y"), unless(isTemplateInstantiation()))));
+}
+
+TEST(IsTemplateInstantiation, DoesNotMatchExplicitClassTemplateSpecialization) {
+  EXPECT_TRUE(notMatches(
+    "template <typename T> class X {}; class A {};"
+      "template <> class X<A> {}; X<A> x;",
+    cxxRecordDecl(hasName("::X"), isTemplateInstantiation())));
+}
+
+TEST(IsTemplateInstantiation, DoesNotMatchNonTemplate) {
+  EXPECT_TRUE(notMatches(
+    "class A {}; class Y { A a; };",
+    cxxRecordDecl(isTemplateInstantiation())));
+}
+
+TEST(IsInstantiated, MatchesInstantiation) {
+  EXPECT_TRUE(
+    matches("template<typename T> class A { T i; }; class Y { A<int> a; };",
+            cxxRecordDecl(isInstantiated())));
+}
+
+TEST(IsInstantiated, NotMatchesDefinition) {
+  EXPECT_TRUE(notMatches("template<typename T> class A { T i; };",
+                         cxxRecordDecl(isInstantiated())));
+}
+
+TEST(IsInTemplateInstantiation, MatchesInstantiationStmt) {
+  EXPECT_TRUE(matches("template<typename T> struct A { A() { T i; } };"
+                        "class Y { A<int> a; }; Y y;",
+                      declStmt(isInTemplateInstantiation())));
+}
+
+TEST(IsInTemplateInstantiation, NotMatchesDefinitionStmt) {
+  EXPECT_TRUE(notMatches("template<typename T> struct A { void x() { T i; } };",
+                         declStmt(isInTemplateInstantiation())));
+}
+
+TEST(IsInstantiated, MatchesFunctionInstantiation) {
+  EXPECT_TRUE(
+    matches("template<typename T> void A(T t) { T i; } void x() { A(0); }",
+            functionDecl(isInstantiated())));
+}
+
+TEST(IsInstantiated, NotMatchesFunctionDefinition) {
+  EXPECT_TRUE(notMatches("template<typename T> void A(T t) { T i; }",
+                         decl(isInstantiated()))); // function, param and local
+}
+
+TEST(IsInTemplateInstantiation, MatchesFunctionInstantiationStmt) {
+  EXPECT_TRUE(
+    matches("template<typename T> void A(T t) { T i; } void x() { A(0); }",
+            declStmt(isInTemplateInstantiation())));
+}
+
+TEST(IsInTemplateInstantiation, NotMatchesFunctionDefinitionStmt) {
+  EXPECT_TRUE(notMatches("template<typename T> void A(T t) { T i; }",
+                         declStmt(isInTemplateInstantiation())));
+}
+
+TEST(IsInTemplateInstantiation, Sharing) {
+  auto UninstantiatedOp = binaryOperator(unless(isInTemplateInstantiation()));
+  // FIXME: Node sharing is an implementation detail; exposing it is ugly and
+  // makes the behavior of the matcher non-obvious (compare the two cases).
+  EXPECT_TRUE(notMatches(
+    "int j; template<typename T> void A(T t) { j += 42; } void x() { A(0); }",
+    UninstantiatedOp));
+  EXPECT_TRUE(matches(
+    "int j; template<typename T> void A(T t) { j += t; } void x() { A(0); }",
+    UninstantiatedOp));
+}
+
+TEST(IsExplicitTemplateSpecialization,
+     DoesNotMatchPrimaryTemplate) {
+  EXPECT_TRUE(notMatches(
+    "template <typename T> class X {};",
+    cxxRecordDecl(isExplicitTemplateSpecialization())));
+  EXPECT_TRUE(notMatches(
+    "template <typename T> void f(T t);",
+    functionDecl(isExplicitTemplateSpecialization())));
+}
+
+TEST(IsExplicitTemplateSpecialization,
+     DoesNotMatchExplicitTemplateInstantiations) {
+  EXPECT_TRUE(notMatches(
+    "template <typename T> class X {};"
+      "template class X<int>; extern template class X<long>;",
+    cxxRecordDecl(isExplicitTemplateSpecialization())));
+  EXPECT_TRUE(notMatches(
+    "template <typename T> void f(T t) {}"
+      "template void f(int t); extern template void f(long t);",
+    functionDecl(isExplicitTemplateSpecialization())));
+}
+
+TEST(IsExplicitTemplateSpecialization,
+     DoesNotMatchImplicitTemplateInstantiations) {
+  EXPECT_TRUE(notMatches(
+    "template <typename T> class X {}; X<int> x;",
+    cxxRecordDecl(isExplicitTemplateSpecialization())));
+  EXPECT_TRUE(notMatches(
+    "template <typename T> void f(T t); void g() { f(10); }",
+    functionDecl(isExplicitTemplateSpecialization())));
+}
+
+TEST(IsExplicitTemplateSpecialization,
+     MatchesExplicitTemplateSpecializations) {
+  EXPECT_TRUE(matches(
+    "template <typename T> class X {};"
+      "template<> class X<int> {};",
+    cxxRecordDecl(isExplicitTemplateSpecialization())));
+  EXPECT_TRUE(matches(
+    "template <typename T> void f(T t) {}"
+      "template<> void f(int t) {}",
+    functionDecl(isExplicitTemplateSpecialization())));
+}
+
+TEST(TypeMatching, MatchesBool) {
+  EXPECT_TRUE(matches("struct S { bool func(); };",
+                      cxxMethodDecl(returns(booleanType()))));
+  EXPECT_TRUE(notMatches("struct S { void func(); };",
+                         cxxMethodDecl(returns(booleanType()))));
+}
+
+TEST(TypeMatching, MatchesVoid) {
+  EXPECT_TRUE(matches("struct S { void func(); };",
+                      cxxMethodDecl(returns(voidType()))));
+}
+
+TEST(TypeMatching, MatchesRealFloats) {
+  EXPECT_TRUE(matches("struct S { float func(); };",
+                      cxxMethodDecl(returns(realFloatingPointType()))));
+  EXPECT_TRUE(notMatches("struct S { int func(); };",
+                         cxxMethodDecl(returns(realFloatingPointType()))));
+  EXPECT_TRUE(matches("struct S { long double func(); };",
+                      cxxMethodDecl(returns(realFloatingPointType()))));
+}
+
+TEST(TypeMatching, MatchesArrayTypes) {
+  EXPECT_TRUE(matches("int a[] = {2,3};", arrayType()));
+  EXPECT_TRUE(matches("int a[42];", arrayType()));
+  EXPECT_TRUE(matches("void f(int b) { int a[b]; }", arrayType()));
+
+  EXPECT_TRUE(notMatches("struct A {}; A a[7];",
+                         arrayType(hasElementType(builtinType()))));
+
+  EXPECT_TRUE(matches(
+    "int const a[] = { 2, 3 };",
+    qualType(arrayType(hasElementType(builtinType())))));
+  EXPECT_TRUE(matches(
+    "int const a[] = { 2, 3 };",
+    qualType(isConstQualified(), arrayType(hasElementType(builtinType())))));
+  EXPECT_TRUE(matches(
+    "typedef const int T; T x[] = { 1, 2 };",
+    qualType(isConstQualified(), arrayType())));
+
+  EXPECT_TRUE(notMatches(
+    "int a[] = { 2, 3 };",
+    qualType(isConstQualified(), arrayType(hasElementType(builtinType())))));
+  EXPECT_TRUE(notMatches(
+    "int a[] = { 2, 3 };",
+    qualType(arrayType(hasElementType(isConstQualified(), builtinType())))));
+  EXPECT_TRUE(notMatches(
+    "int const a[] = { 2, 3 };",
+    qualType(arrayType(hasElementType(builtinType())),
+             unless(isConstQualified()))));
+
+  EXPECT_TRUE(matches("int a[2];",
+                      constantArrayType(hasElementType(builtinType()))));
+  EXPECT_TRUE(matches("const int a = 0;", qualType(isInteger())));
+}
+
+TEST(TypeMatching, DecayedType) {
+  EXPECT_TRUE(matches("void f(int i[]);", valueDecl(hasType(decayedType(hasDecayedType(pointerType()))))));
+  EXPECT_TRUE(notMatches("int i[7];", decayedType()));
+}
+
+TEST(TypeMatching, MatchesComplexTypes) {
+  EXPECT_TRUE(matches("_Complex float f;", complexType()));
+  EXPECT_TRUE(matches(
+    "_Complex float f;",
+    complexType(hasElementType(builtinType()))));
+  EXPECT_TRUE(notMatches(
+    "_Complex float f;",
+    complexType(hasElementType(isInteger()))));
+}
+
+TEST(NS, Anonymous) {
+  EXPECT_TRUE(notMatches("namespace N {}", namespaceDecl(isAnonymous())));
+  EXPECT_TRUE(matches("namespace {}", namespaceDecl(isAnonymous())));
+}
+
+TEST(EqualsBoundNodeMatcher, QualType) {
+  EXPECT_TRUE(matches(
+    "int i = 1;", varDecl(hasType(qualType().bind("type")),
+                          hasInitializer(ignoringParenImpCasts(
+                            hasType(qualType(equalsBoundNode("type"))))))));
+  EXPECT_TRUE(notMatches("int i = 1.f;",
+                         varDecl(hasType(qualType().bind("type")),
+                                 hasInitializer(ignoringParenImpCasts(hasType(
+                                   qualType(equalsBoundNode("type"))))))));
+}
+
+TEST(EqualsBoundNodeMatcher, NonMatchingTypes) {
+  EXPECT_TRUE(notMatches(
+    "int i = 1;", varDecl(namedDecl(hasName("i")).bind("name"),
+                          hasInitializer(ignoringParenImpCasts(
+                            hasType(qualType(equalsBoundNode("type"))))))));
+}
+
+TEST(EqualsBoundNodeMatcher, Stmt) {
+  EXPECT_TRUE(
+    matches("void f() { if(true) {} }",
+            stmt(allOf(ifStmt().bind("if"),
+                       hasParent(stmt(has(stmt(equalsBoundNode("if")))))))));
+
+  EXPECT_TRUE(notMatches(
+    "void f() { if(true) { if (true) {} } }",
+    stmt(allOf(ifStmt().bind("if"), has(stmt(equalsBoundNode("if")))))));
+}
+
+TEST(EqualsBoundNodeMatcher, Decl) {
+  EXPECT_TRUE(matches(
+    "class X { class Y {}; };",
+    decl(allOf(recordDecl(hasName("::X::Y")).bind("record"),
+               hasParent(decl(has(decl(equalsBoundNode("record")))))))));
+
+  EXPECT_TRUE(notMatches("class X { class Y {}; };",
+                         decl(allOf(recordDecl(hasName("::X")).bind("record"),
+                                    has(decl(equalsBoundNode("record")))))));
+}
+
+TEST(EqualsBoundNodeMatcher, Type) {
+  EXPECT_TRUE(matches(
+    "class X { int a; int b; };",
+    recordDecl(
+      has(fieldDecl(hasName("a"), hasType(type().bind("t")))),
+      has(fieldDecl(hasName("b"), hasType(type(equalsBoundNode("t"))))))));
+
+  EXPECT_TRUE(notMatches(
+    "class X { int a; double b; };",
+    recordDecl(
+      has(fieldDecl(hasName("a"), hasType(type().bind("t")))),
+      has(fieldDecl(hasName("b"), hasType(type(equalsBoundNode("t"))))))));
+}
+
+TEST(EqualsBoundNodeMatcher, UsingForEachDescendant) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "int f() {"
+      "  if (1) {"
+      "    int i = 9;"
+      "  }"
+      "  int j = 10;"
+      "  {"
+      "    float k = 9.0;"
+      "  }"
+      "  return 0;"
+      "}",
+    // Look for variable declarations within functions whose type is the same
+    // as the function return type.
+    functionDecl(returns(qualType().bind("type")),
+                 forEachDescendant(varDecl(hasType(
+                   qualType(equalsBoundNode("type")))).bind("decl"))),
+    // Only i and j should match, not k.
+    llvm::make_unique<VerifyIdIsBoundTo<VarDecl>>("decl", 2)));
+}
+
+TEST(EqualsBoundNodeMatcher, FiltersMatchedCombinations) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f() {"
+      "  int x;"
+      "  double d;"
+      "  x = d + x - d + x;"
+      "}",
+    functionDecl(
+      hasName("f"), forEachDescendant(varDecl().bind("d")),
+      forEachDescendant(declRefExpr(to(decl(equalsBoundNode("d")))))),
+    llvm::make_unique<VerifyIdIsBoundTo<VarDecl>>("d", 5)));
+}
+
+TEST(EqualsBoundNodeMatcher, UnlessDescendantsOfAncestorsMatch) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "struct StringRef { int size() const; const char* data() const; };"
+      "void f(StringRef v) {"
+      "  v.data();"
+      "}",
+    cxxMemberCallExpr(
+      callee(cxxMethodDecl(hasName("data"))),
+      on(declRefExpr(to(
+        varDecl(hasType(recordDecl(hasName("StringRef")))).bind("var")))),
+      unless(hasAncestor(stmt(hasDescendant(cxxMemberCallExpr(
+        callee(cxxMethodDecl(anyOf(hasName("size"), hasName("length")))),
+        on(declRefExpr(to(varDecl(equalsBoundNode("var")))))))))))
+      .bind("data"),
+    llvm::make_unique<VerifyIdIsBoundTo<Expr>>("data", 1)));
+
+  EXPECT_FALSE(matches(
+    "struct StringRef { int size() const; const char* data() const; };"
+      "void f(StringRef v) {"
+      "  v.data();"
+      "  v.size();"
+      "}",
+    cxxMemberCallExpr(
+      callee(cxxMethodDecl(hasName("data"))),
+      on(declRefExpr(to(
+        varDecl(hasType(recordDecl(hasName("StringRef")))).bind("var")))),
+      unless(hasAncestor(stmt(hasDescendant(cxxMemberCallExpr(
+        callee(cxxMethodDecl(anyOf(hasName("size"), hasName("length")))),
+        on(declRefExpr(to(varDecl(equalsBoundNode("var")))))))))))
+      .bind("data")));
+}
+
+TEST(NullPointerConstants, Basic) {
+  EXPECT_TRUE(matches("#define NULL ((void *)0)\n"
+                        "void *v1 = NULL;", expr(nullPointerConstant())));
+  EXPECT_TRUE(matches("void *v2 = nullptr;", expr(nullPointerConstant())));
+  EXPECT_TRUE(matches("void *v3 = __null;", expr(nullPointerConstant())));
+  EXPECT_TRUE(matches("char *cp = (char *)0;", expr(nullPointerConstant())));
+  EXPECT_TRUE(matches("int *ip = 0;", expr(nullPointerConstant())));
+  EXPECT_TRUE(notMatches("int i = 0;", expr(nullPointerConstant())));
+}
+
+} // namespace ast_matchers
+} // namespace clang
diff --git a/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
new file mode 100644
index 0000000..6c8a5e0
--- /dev/null
+++ b/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -0,0 +1,1539 @@
+//== unittests/ASTMatchers/ASTMatchersNodeTest.cpp - AST matcher unit tests ==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ASTMatchersTest.h"
+#include "clang/AST/PrettyPrinter.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Host.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace ast_matchers {
+
+TEST(Finder, DynamicOnlyAcceptsSomeMatchers) {
+  MatchFinder Finder;
+  EXPECT_TRUE(Finder.addDynamicMatcher(decl(), nullptr));
+  EXPECT_TRUE(Finder.addDynamicMatcher(callExpr(), nullptr));
+  EXPECT_TRUE(Finder.addDynamicMatcher(constantArrayType(hasSize(42)),
+                                       nullptr));
+
+  // Do not accept non-toplevel matchers.
+  EXPECT_FALSE(Finder.addDynamicMatcher(isArrow(), nullptr));
+  EXPECT_FALSE(Finder.addDynamicMatcher(hasName("x"), nullptr));
+}
+
+TEST(Decl, MatchesDeclarations) {
+  EXPECT_TRUE(notMatches("", decl(usingDecl())));
+  EXPECT_TRUE(matches("namespace x { class X {}; } using x::X;",
+                      decl(usingDecl())));
+}
+
+TEST(NameableDeclaration, MatchesVariousDecls) {
+  DeclarationMatcher AnyDeclNamedX(namedDecl(hasName("X")));
+  EXPECT_TRUE(matches("typedef int X;", AnyDeclNamedX));
+  EXPECT_TRUE(matches("int X;", AnyDeclNamedX));
+  EXPECT_TRUE(matches("class foo { virtual void X(); };", AnyDeclNamedX));
+  EXPECT_TRUE(matches("void foo() try { } catch(int X) { }", AnyDeclNamedX));
+  EXPECT_TRUE(matches("void foo() { int X; }", AnyDeclNamedX));
+  EXPECT_TRUE(matches("namespace X { }", AnyDeclNamedX));
+  EXPECT_TRUE(matches("enum X { A, B, C };", AnyDeclNamedX));
+  // A macro definition is expected to yield no named declaration to match.
+  EXPECT_TRUE(notMatches("#define X 1", AnyDeclNamedX));
+}
+
+TEST(NameableDeclaration, REMatchesVariousDecls) {
+  DeclarationMatcher NamedX = namedDecl(matchesName("::X"));
+  EXPECT_TRUE(matches("typedef int Xa;", NamedX));
+  EXPECT_TRUE(matches("int Xb;", NamedX));
+  EXPECT_TRUE(matches("class foo { virtual void Xc(); };", NamedX));
+  EXPECT_TRUE(matches("void foo() try { } catch(int Xdef) { }", NamedX));
+  EXPECT_TRUE(matches("void foo() { int Xgh; }", NamedX));
+  EXPECT_TRUE(matches("namespace Xij { }", NamedX));
+  EXPECT_TRUE(matches("enum X { A, B, C };", NamedX));
+
+  EXPECT_TRUE(notMatches("#define Xkl 1", NamedX));
+
+  DeclarationMatcher StartsWithNo = namedDecl(matchesName("::no"));
+  EXPECT_TRUE(matches("int no_foo;", StartsWithNo));
+  EXPECT_TRUE(matches("class foo { virtual void nobody(); };", StartsWithNo));
+
+  DeclarationMatcher Abc = namedDecl(matchesName("a.*b.*c"));
+  EXPECT_TRUE(matches("int abc;", Abc));
+  EXPECT_TRUE(matches("int aFOObBARc;", Abc));
+  EXPECT_TRUE(notMatches("int cab;", Abc));
+  EXPECT_TRUE(matches("int cabc;", Abc));
+
+  DeclarationMatcher StartsWithK = namedDecl(matchesName(":k[^:]*$"));
+  EXPECT_TRUE(matches("int k;", StartsWithK));
+  EXPECT_TRUE(matches("int kAbc;", StartsWithK));
+  EXPECT_TRUE(matches("namespace x { int kTest; }", StartsWithK));
+  EXPECT_TRUE(matches("class C { int k; };", StartsWithK));
+  EXPECT_TRUE(notMatches("class C { int ckc; };", StartsWithK));
+}
+
+TEST(DeclarationMatcher, MatchClass) {
+  DeclarationMatcher ClassMatcher(recordDecl());
+
+  // This passes on Windows only because we explicitly pass -target
+  // i386-unknown-unknown.  If we were to compile with the default target
+  // triple, we'd want to EXPECT_TRUE if it's Win32 or MSVC.
+  EXPECT_FALSE(matches("", ClassMatcher));
+
+  DeclarationMatcher ClassX = recordDecl(recordDecl(hasName("X")));
+  EXPECT_TRUE(matches("class X;", ClassX));
+  EXPECT_TRUE(matches("class X {};", ClassX));
+  EXPECT_TRUE(matches("template<class T> class X {};", ClassX));
+  EXPECT_TRUE(notMatches("", ClassX));
+}
+
+TEST(DeclarationMatcher, translationUnitDecl) {
+  const std::string Code = "int MyVar1;\n"
+    "namespace NameSpace {\n"
+    "int MyVar2;\n"
+    "}  // namespace NameSpace\n";
+  EXPECT_TRUE(matches(
+    Code, varDecl(hasName("MyVar1"), hasDeclContext(translationUnitDecl()))));
+  EXPECT_FALSE(matches(
+    Code, varDecl(hasName("MyVar2"), hasDeclContext(translationUnitDecl()))));
+  EXPECT_TRUE(matches(
+    Code,
+    varDecl(hasName("MyVar2"),
+            hasDeclContext(decl(hasDeclContext(translationUnitDecl()))))));
+}
+
+TEST(DeclarationMatcher, LinkageSpecification) {
+  EXPECT_TRUE(matches("extern \"C\" { void foo() {}; }", linkageSpecDecl()));
+  EXPECT_TRUE(notMatches("void foo() {};", linkageSpecDecl()));
+}
+
+TEST(ClassTemplate, DoesNotMatchClass) {
+  DeclarationMatcher TemplateX(classTemplateDecl(hasName("X")));
+  EXPECT_TRUE(notMatches("class X;", TemplateX));    // forward declaration
+  EXPECT_TRUE(notMatches("class X {};", TemplateX)); // plain definition
+}
+
+TEST(ClassTemplate, MatchesClassTemplate) {
+  DeclarationMatcher TemplateX(classTemplateDecl(hasName("X")));
+  EXPECT_TRUE(matches("template<typename T> class X {};", TemplateX));
+  EXPECT_TRUE(matches("class Z { template<class T> class X {}; };", TemplateX));
+}
+
+TEST(ClassTemplate, DoesNotMatchClassTemplateExplicitSpecialization) {
+  EXPECT_TRUE(notMatches("template<typename T> class X { };"
+                           "template<> class X<int> { int a; };",
+                         classTemplateDecl(hasName("X"),
+                                           hasDescendant(fieldDecl(hasName("a"))))));
+}
+
+TEST(ClassTemplate, DoesNotMatchClassTemplatePartialSpecialization) {
+  EXPECT_TRUE(notMatches("template<typename T, typename U> class X { };"
+                           "template<typename T> class X<T, int> { int a; };",
+                         classTemplateDecl(hasName("X"),
+                                           hasDescendant(fieldDecl(hasName("a"))))));
+}
+
+TEST(DeclarationMatcher, MatchCudaDecl) {
+  EXPECT_TRUE(matchesWithCuda("__global__ void f() { }"
+                                "void g() { f<<<1, 2>>>(); }",
+                              cudaKernelCallExpr()));
+  EXPECT_TRUE(matchesWithCuda("__attribute__((device)) void f() {}",
+                              hasAttr(clang::attr::CUDADevice)));
+  EXPECT_TRUE(notMatchesWithCuda("void f() {}",
+                                 cudaKernelCallExpr()));
+  EXPECT_FALSE(notMatchesWithCuda("__attribute__((global)) void f() {}",
+                                  hasAttr(clang::attr::CUDAGlobal)));
+}
+
+TEST(ValueDecl, Matches) {
+  EXPECT_TRUE(matches("enum EnumType { EnumValue };",
+                      valueDecl(hasType(asString("enum EnumType")))));
+  EXPECT_TRUE(matches("void FunctionDecl();",
+                      valueDecl(hasType(asString("void (void)")))));
+}
+
+TEST(Enum, DoesNotMatchClasses) {
+  EXPECT_TRUE(notMatches("class X {};", enumDecl(hasName("X"))));
+}
+
+TEST(Enum, MatchesEnums) {
+  EXPECT_TRUE(matches("enum X {};", enumDecl(hasName("X"))));
+}
+
+TEST(EnumConstant, Matches) {
+  DeclarationMatcher EnumeratorA(enumConstantDecl(hasName("A")));
+  EXPECT_TRUE(matches("enum X{ A };", EnumeratorA));    // enumerator A present
+  EXPECT_TRUE(notMatches("enum X{ B };", EnumeratorA)); // other enumerator
+  EXPECT_TRUE(notMatches("enum X {};", EnumeratorA));   // no enumerators
+}
+
+TEST(Matcher, UnresolvedLookupExpr) {
+  // FIXME: The test is known to be broken on Windows with delayed template
+  // parsing.
+  EXPECT_TRUE(matchesConditionally("template<typename T>"
+                                   "T foo() { T a; return a; }"
+                                   "template<typename T>"
+                                   "void bar() {"
+                                   "  foo<T>();"
+                                   "}",
+                                   unresolvedLookupExpr(),
+                                   /*ExpectMatch=*/true,
+                                   "-fno-delayed-template-parsing"));
+}
+
+TEST(Matcher, Call) {
+  // FIXME: Do we want to overload callExpr() to directly take
+  // Matcher<Decl>, too?
+  StatementMatcher CallToX(
+    callExpr(hasDeclaration(cxxMethodDecl(hasName("x")))));
+
+  EXPECT_TRUE(matches("class Y { void x() { x(); } };", CallToX));
+  EXPECT_TRUE(notMatches("class Y { void x() {} };", CallToX));
+
+  StatementMatcher CallOnY(
+    cxxMemberCallExpr(on(hasType(recordDecl(hasName("Y"))))));
+  // Expected to match when the object is a Y (value or reference), not a Y*.
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z() { Y y; y.x(); }",
+            CallOnY));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z(Y &y) { y.x(); }",
+            CallOnY));
+  EXPECT_TRUE(
+    notMatches("class Y { public: void x(); }; void z(Y *&y) { y->x(); }",
+               CallOnY));
+  EXPECT_TRUE(
+    notMatches("class Y { public: void x(); }; void z(Y y[]) { y->x(); }",
+               CallOnY));
+  EXPECT_TRUE(
+    notMatches("class Y { public: void x(); }; void z() { Y *y; y->x(); }",
+               CallOnY));
+
+  StatementMatcher CallOnYPtr(
+    cxxMemberCallExpr(on(hasType(pointsTo(recordDecl(hasName("Y")))))));
+  // Expected to match only when the object expression is a pointer to Y.
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z() { Y *y; y->x(); }",
+            CallOnYPtr));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z(Y *&y) { y->x(); }",
+            CallOnYPtr));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z(Y y[]) { y->x(); }",
+            CallOnYPtr));
+  EXPECT_TRUE(
+    notMatches("class Y { public: void x(); }; void z() { Y y; y.x(); }",
+               CallOnYPtr));
+  EXPECT_TRUE(
+    notMatches("class Y { public: void x(); }; void z(Y &y) { y.x(); }",
+               CallOnYPtr));
+}
+TEST(Matcher, Lambda) {
+  EXPECT_TRUE(matches("auto f = [] (int i) { return i; };",
+                      lambdaExpr()));
+}
+
+TEST(Matcher, ForRange) {
+  EXPECT_TRUE(matches("int as[] = { 1, 2, 3 };"
+                        "void f() { for (auto &a : as); }",
+                      cxxForRangeStmt()));
+  EXPECT_TRUE(notMatches("void f() { for (int i; i<5; ++i); }",
+                         cxxForRangeStmt()));
+}
+
+TEST(Matcher, SubstNonTypeTemplateParm) {
+  EXPECT_FALSE(matches("template<int N>\n"
+                         "struct A {  static const int n = 0; };\n"
+                         "struct B : public A<42> {};",
+                       substNonTypeTemplateParmExpr()));
+  EXPECT_TRUE(matches("template<int N>\n"
+                        "struct A {  static const int n = N; };\n"
+                        "struct B : public A<42> {};",
+                      substNonTypeTemplateParmExpr()));
+}
+
+TEST(Matcher, NonTypeTemplateParmDecl) {
+  EXPECT_TRUE(matches("template <int N> void f();",
+                      nonTypeTemplateParmDecl(hasName("N"))));
+  EXPECT_TRUE(
+    notMatches("template <typename T> void f();", nonTypeTemplateParmDecl()));
+}
+
+TEST(Matcher, templateTypeParmDecl) {
+  EXPECT_TRUE(matches("template <typename T> void f();",
+                      templateTypeParmDecl(hasName("T"))));
+  EXPECT_TRUE(
+    notMatches("template <int N> void f();", templateTypeParmDecl()));
+}
+
+TEST(Matcher, UserDefinedLiteral) {
+  EXPECT_TRUE(matches("constexpr char operator \"\" _inc (const char i) {"
+                        "  return i + 1;"
+                        "}"
+                        "char c = 'a'_inc;",
+                      userDefinedLiteral()));
+}
+
+TEST(Matcher, FlowControl) {
+  EXPECT_TRUE(matches("void f() { while(true) { break; } }", breakStmt()));
+  EXPECT_TRUE(matches("void f() { while(true) { continue; } }",
+                      continueStmt()));
+  EXPECT_TRUE(matches("void f() { goto FOO; FOO: ;}", gotoStmt()));
+  EXPECT_TRUE(matches("void f() { goto FOO; FOO: ;}",
+                      labelStmt(
+                        hasDeclaration(
+                          labelDecl(hasName("FOO"))))));
+  EXPECT_TRUE(matches("void f() { FOO: ; void *ptr = &&FOO; goto *ptr; }",
+                      addrLabelExpr()));
+  EXPECT_TRUE(matches("void f() { return; }", returnStmt()));
+}
+
+TEST(Matcher, OverloadedOperatorCall) {
+  StatementMatcher OpCall = cxxOperatorCallExpr();
+  // Unary operator: a use of the user-defined operator! is expected to match.
+  EXPECT_TRUE(matches("class Y { }; "
+                        "bool operator!(Y x) { return false; }; "
+                        "Y y; bool c = !y;", OpCall));
+  // No match -- special operators like "new", "delete"
+  // FIXME: operator new takes size_t, for which we need stddef.h, for which
+  // we need to figure out include paths in the test.
+  // EXPECT_TRUE(notMatches("#include <stddef.h>\n"
+  //             "class Y { }; "
+  //             "void *operator new(size_t size) { return 0; } "
+  //             "Y *y = new Y;", OpCall));
+  EXPECT_TRUE(notMatches("class Y { }; "
+                           "void operator delete(void *p) { } "
+                           "void a() {Y *y = new Y; delete y;}", OpCall));
+  // Binary operator: a use of the user-defined operator&& is expected to match.
+  EXPECT_TRUE(matches("class Y { }; "
+                        "bool operator&&(Y x, Y y) { return true; }; "
+                        "Y a; Y b; bool c = a && b;",
+                      OpCall));
+  // No match -- built-in operators on fundamental types, not overloaded ones.
+  EXPECT_TRUE(notMatches("bool x = true, y = true; bool t = x && y;", OpCall));
+  EXPECT_TRUE(notMatches("int t = 5 << 2;", OpCall));
+}
+
+TEST(Matcher, ThisPointerType) {
+  StatementMatcher MethodOnY =
+    cxxMemberCallExpr(thisPointerType(recordDecl(hasName("Y"))));
+
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z() { Y y; y.x(); }",
+            MethodOnY));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z(Y &y) { y.x(); }",
+            MethodOnY));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z(Y *&y) { y->x(); }",
+            MethodOnY));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z(Y y[]) { y->x(); }",
+            MethodOnY));
+  EXPECT_TRUE(
+    matches("class Y { public: void x(); }; void z() { Y *y; y->x(); }",
+            MethodOnY));
+
+  EXPECT_TRUE(matches(
+    "class Y {"
+      "  public: virtual void x();"
+      "};"
+      "class X : public Y {"
+      "  public: virtual void x();"
+      "};"
+      "void z() { X *x; x->Y::x(); }", MethodOnY));
+}
+
+TEST(Matcher, VariableUsage) {
+  StatementMatcher Reference =
+    declRefExpr(to(
+      varDecl(hasInitializer(
+        cxxMemberCallExpr(thisPointerType(recordDecl(hasName("Y"))))))));
+
+  EXPECT_TRUE(matches(
+    "class Y {"
+      " public:"
+      "  bool x() const;"
+      "};"
+      "void z(const Y &y) {"
+      "  bool b = y.x();"
+      "  if (b) {}"
+      "}", Reference));
+
+  EXPECT_TRUE(notMatches(
+    "class Y {"
+      " public:"
+      "  bool x() const;"
+      "};"
+      "void z(const Y &y) {"
+      "  bool b = y.x();"
+      "}", Reference));
+}
+
+TEST(Matcher, CalledVariable) {
+  StatementMatcher CallOnVariableY =
+    cxxMemberCallExpr(on(declRefExpr(to(varDecl(hasName("y"))))));
+
+  EXPECT_TRUE(matches(
+    "class Y { public: void x() { Y y; y.x(); } };", CallOnVariableY));
+  EXPECT_TRUE(matches(
+    "class Y { public: void x() const { Y y; y.x(); } };", CallOnVariableY));
+  EXPECT_TRUE(matches(
+    "class Y { public: void x(); };"
+      "class X : public Y { void z() { X y; y.x(); } };", CallOnVariableY));
+  EXPECT_TRUE(matches(
+    "class Y { public: void x(); };"
+      "class X : public Y { void z() { X *y; y->x(); } };", CallOnVariableY));
+  EXPECT_TRUE(notMatches(
+    "class Y { public: void x(); };"
+      "class X : public Y { void z() { unsigned long y; ((X*)y)->x(); } };",
+    CallOnVariableY));
+}
+
+TEST(UnaryExprOrTypeTraitExpr, MatchesSizeOfAndAlignOf) {
+  EXPECT_TRUE(matches("void x() { int a = sizeof(a); }",
+                      unaryExprOrTypeTraitExpr()));
+  EXPECT_TRUE(notMatches("void x() { int a = sizeof(a); }",
+                         alignOfExpr(anything())));
+  // FIXME: Uncomment once alignof is enabled.
+  // EXPECT_TRUE(matches("void x() { int a = alignof(a); }",
+  //                     unaryExprOrTypeTraitExpr()));
+  // EXPECT_TRUE(notMatches("void x() { int a = alignof(a); }",
+  //                        sizeOfExpr()));
+}
+
+TEST(MemberExpression, DoesNotMatchClasses) {
+  EXPECT_TRUE(notMatches("class Y { void x() {} };", memberExpr()));
+}
+
+TEST(MemberExpression, MatchesMemberFunctionCall) {
+  EXPECT_TRUE(matches("class Y { void x() { x(); } };", memberExpr()));
+}
+
+TEST(MemberExpression, MatchesVariable) {
+  EXPECT_TRUE(
+    matches("class Y { void x() { this->y; } int y; };", memberExpr()));
+  EXPECT_TRUE(
+    matches("class Y { void x() { y; } int y; };", memberExpr()));
+  EXPECT_TRUE(
+    matches("class Y { void x() { Y y; y.y; } int y; };", memberExpr()));
+}
+
+TEST(MemberExpression, MatchesStaticVariable) {
+  EXPECT_TRUE(matches("class Y { void x() { this->y; } static int y; };",
+                      memberExpr()));
+  EXPECT_TRUE(notMatches("class Y { void x() { y; } static int y; };",
+                         memberExpr()));
+  EXPECT_TRUE(notMatches("class Y { void x() { Y::y; } static int y; };",
+                         memberExpr()));
+}
+
+TEST(Function, MatchesFunctionDeclarations) {
+  StatementMatcher CallFunctionF = callExpr(callee(functionDecl(hasName("f"))));
+
+  EXPECT_TRUE(matches("void f() { f(); }", CallFunctionF));
+  EXPECT_TRUE(notMatches("void f() { }", CallFunctionF));
+
+  if (llvm::Triple(llvm::sys::getDefaultTargetTriple()).getOS() !=
+    llvm::Triple::Win32) {
+    // FIXME: Make this work for MSVC.
+    // Dependent contexts, but a non-dependent call.
+    EXPECT_TRUE(matches("void f(); template <int N> void g() { f(); }",
+                        CallFunctionF));
+    EXPECT_TRUE(
+      matches("void f(); template <int N> struct S { void g() { f(); } };",
+              CallFunctionF));
+  }
+
+  // Dependent calls don't match.
+  EXPECT_TRUE(
+    notMatches("void f(int); template <typename T> void g(T t) { f(t); }",
+               CallFunctionF));
+  EXPECT_TRUE(
+    notMatches("void f(int);"
+                 "template <typename T> struct S { void g(T t) { f(t); } };",
+               CallFunctionF));
+
+  EXPECT_TRUE(matches("void f(...);", functionDecl(isVariadic())));
+  EXPECT_TRUE(notMatches("void f(int);", functionDecl(isVariadic())));
+  EXPECT_TRUE(notMatches("template <typename... Ts> void f(Ts...);",
+                         functionDecl(isVariadic())));
+  EXPECT_TRUE(notMatches("void f();", functionDecl(isVariadic())));
+  EXPECT_TRUE(notMatchesC("void f();", functionDecl(isVariadic())));
+  EXPECT_TRUE(matches("void f(...);", functionDecl(parameterCountIs(0))));
+  EXPECT_TRUE(matchesC("void f();", functionDecl(parameterCountIs(0))));
+  EXPECT_TRUE(matches("void f(int, ...);", functionDecl(parameterCountIs(1))));
+}
+
+TEST(FunctionTemplate, MatchesFunctionTemplateDeclarations) {
+  EXPECT_TRUE(
+    matches("template <typename T> void f(T t) {}",
+            functionTemplateDecl(hasName("f"))));
+}
+
+TEST(FunctionTemplate, DoesNotMatchFunctionDeclarations) {
+  EXPECT_TRUE(
+    notMatches("void f(double d); void f(int t) {}",
+               functionTemplateDecl(hasName("f"))));
+}
+
+TEST(FunctionTemplate, DoesNotMatchFunctionTemplateSpecializations) {
+  EXPECT_TRUE(
+    notMatches("void g(); template <typename T> void f(T t) {}"
+                 "template <> void f(int t) { g(); }",
+               functionTemplateDecl(hasName("f"),
+                                    hasDescendant(declRefExpr(to(
+                                      functionDecl(hasName("g"))))))));
+}
+
+TEST(Matcher, MatchesClassTemplateSpecialization) {
+  EXPECT_TRUE(matches("template<typename T> struct A {};"
+                        "template<> struct A<int> {};",
+                      classTemplateSpecializationDecl()));
+  EXPECT_TRUE(matches("template<typename T> struct A {}; A<int> a;",
+                      classTemplateSpecializationDecl()));
+  EXPECT_TRUE(notMatches("template<typename T> struct A {};",
+                         classTemplateSpecializationDecl()));
+}
+
+TEST(DeclaratorDecl, MatchesDeclaratorDecls) {
+  EXPECT_TRUE(matches("int x;", declaratorDecl()));
+  EXPECT_TRUE(notMatches("class A {};", declaratorDecl()));
+}
+
+TEST(ParmVarDecl, MatchesParmVars) {
+  EXPECT_TRUE(matches("void f(int x);", parmVarDecl()));
+  EXPECT_TRUE(notMatches("void f();", parmVarDecl()));
+}
+
+TEST(Matcher, ConstructorCall) {
+  StatementMatcher Constructor = cxxConstructExpr();
+
+  EXPECT_TRUE(
+    matches("class X { public: X(); }; void x() { X x; }", Constructor));
+  EXPECT_TRUE(
+    matches("class X { public: X(); }; void x() { X x = X(); }",
+            Constructor));
+  EXPECT_TRUE(
+    matches("class X { public: X(int); }; void x() { X x = 0; }",
+            Constructor));
+  EXPECT_TRUE(matches("class X {}; void x(int) { X x; }", Constructor));
+}
+
+TEST(Matcher, ThisExpr) {
+  EXPECT_TRUE(
+    matches("struct X { int a; int f () { return a; } };", cxxThisExpr()));
+  EXPECT_TRUE(
+    notMatches("struct X { int f () { int a; return a; } };", cxxThisExpr()));
+}
+
+TEST(Matcher, BindTemporaryExpression) {
+  StatementMatcher TempExpression = cxxBindTemporaryExpr();
+
+  std::string ClassString = "class string { public: string(); ~string(); }; ";
+
+  EXPECT_TRUE(
+    matches(ClassString +
+              "string GetStringByValue();"
+                "void FunctionTakesString(string s);"
+                "void run() { FunctionTakesString(GetStringByValue()); }",
+            TempExpression));
+
+  EXPECT_TRUE(
+    notMatches(ClassString +
+                 "string* GetStringPointer(); "
+                   "void FunctionTakesStringPtr(string* s);"
+                   "void run() {"
+                   "  string* s = GetStringPointer();"
+                   "  FunctionTakesStringPtr(GetStringPointer());"
+                   "  FunctionTakesStringPtr(s);"
+                   "}",
+               TempExpression));
+
+  EXPECT_TRUE(
+    notMatches("class no_dtor {};"
+                 "no_dtor GetObjByValue();"
+                 "void ConsumeObj(no_dtor param);"
+                 "void run() { ConsumeObj(GetObjByValue()); }",
+               TempExpression));
+}
+
+TEST(MaterializeTemporaryExpr, MatchesTemporary) {
+  std::string ClassString =
+    "class string { public: string(); int length(); }; ";
+
+  EXPECT_TRUE(
+    matches(ClassString +
+              "string GetStringByValue();"
+                "void FunctionTakesString(string s);"
+                "void run() { FunctionTakesString(GetStringByValue()); }",
+            materializeTemporaryExpr()));
+
+  EXPECT_TRUE(
+    notMatches(ClassString +
+                 "string* GetStringPointer(); "
+                   "void FunctionTakesStringPtr(string* s);"
+                   "void run() {"
+                   "  string* s = GetStringPointer();"
+                   "  FunctionTakesStringPtr(GetStringPointer());"
+                   "  FunctionTakesStringPtr(s);"
+                   "}",
+               materializeTemporaryExpr()));
+
+  EXPECT_TRUE(
+    notMatches(ClassString +
+                 "string GetStringByValue();"
+                   "void run() { int k = GetStringByValue().length(); }",
+               materializeTemporaryExpr()));
+
+  EXPECT_TRUE(
+    notMatches(ClassString +
+                 "string GetStringByValue();"
+                   "void run() { GetStringByValue(); }",
+               materializeTemporaryExpr()));
+}
+
+TEST(Matcher, NewExpression) {
+  StatementMatcher New = cxxNewExpr();
+
+  EXPECT_TRUE(matches("class X { public: X(); }; void x() { new X; }", New));
+  EXPECT_TRUE(
+    matches("class X { public: X(); }; void x() { new X(); }", New));
+  EXPECT_TRUE(
+    matches("class X { public: X(int); }; void x() { new X(0); }", New));
+  EXPECT_TRUE(matches("class X {}; void x(int) { new X; }", New));
+}
+
+TEST(Matcher, DeleteExpression) {
+  EXPECT_TRUE(matches("struct A {}; void f(A* a) { delete a; }",
+                      cxxDeleteExpr()));
+}
+
+TEST(Matcher, DefaultArgument) {
+  StatementMatcher Arg = cxxDefaultArgExpr();
+
+  EXPECT_TRUE(matches("void x(int, int = 0) { int y; x(y); }", Arg));
+  EXPECT_TRUE(
+    matches("class X { void x(int, int = 0) { int y; x(y); } };", Arg));
+  EXPECT_TRUE(notMatches("void x(int, int = 0) { int y; x(y, 0); }", Arg));
+}
+
+TEST(Matcher, StringLiterals) {
+  StatementMatcher Literal = stringLiteral();
+  EXPECT_TRUE(matches("const char *s = \"string\";", Literal));
+  // wide string
+  EXPECT_TRUE(matches("const wchar_t *s = L\"string\";", Literal));
+  // with an escape in the parsed source (octal, so "five" is not swallowed)
+  EXPECT_TRUE(matches("const char *s = \"\\005five\";", Literal));
+  // no matching -- though the data type is the same, there is no string literal
+  EXPECT_TRUE(notMatches("const char s[1] = {'a'};", Literal));
+}
+
+TEST(Matcher, CharacterLiterals) {
+  StatementMatcher CharLiteral = characterLiteral();
+  EXPECT_TRUE(matches("const char c = 'c';", CharLiteral));
+  // wide character, stored in a wide variable
+  EXPECT_TRUE(matches("const wchar_t c = L'c';", CharLiteral));
+  // wide character, Hex encoded, NOT MATCHED!
+  EXPECT_TRUE(notMatches("const wchar_t c = 0x2126;", CharLiteral));
+  EXPECT_TRUE(notMatches("const char c = 0x1;", CharLiteral));
+}
+
+TEST(Matcher, IntegerLiterals) {
+  StatementMatcher HasIntLiteral = integerLiteral();
+  EXPECT_TRUE(matches("int i = 10;", HasIntLiteral));
+  EXPECT_TRUE(matches("int i = 0x1AB;", HasIntLiteral));
+  EXPECT_TRUE(matches("int i = 10L;", HasIntLiteral));
+  EXPECT_TRUE(matches("int i = 10U;", HasIntLiteral));
+
+  // Non-matching cases (character literals, float and double)
+  EXPECT_TRUE(notMatches("int i = L'a';",
+                         HasIntLiteral));  // this is actually a character
+  // literal cast to int
+  EXPECT_TRUE(notMatches("int i = 'a';", HasIntLiteral));
+  EXPECT_TRUE(notMatches("int i = 1e10;", HasIntLiteral));
+  EXPECT_TRUE(notMatches("int i = 10.0;", HasIntLiteral));
+}
+
+TEST(Matcher, FloatLiterals) {
+  StatementMatcher HasFloatLiteral = floatLiteral();
+  EXPECT_TRUE(matches("float i = 10.0;", HasFloatLiteral));
+  EXPECT_TRUE(matches("float i = 10.0f;", HasFloatLiteral));
+  EXPECT_TRUE(matches("double i = 10.0;", HasFloatLiteral));
+  EXPECT_TRUE(matches("double i = 10.0L;", HasFloatLiteral));
+  EXPECT_TRUE(matches("double i = 1e10;", HasFloatLiteral));
+  EXPECT_TRUE(matches("double i = 5.0;", floatLiteral(equals(5.0))));
+  EXPECT_TRUE(matches("double i = 5.0;", floatLiteral(equals(5.0f))));
+  EXPECT_TRUE(
+    matches("double i = 5.0;", floatLiteral(equals(llvm::APFloat(5.0)))));
+
+  EXPECT_TRUE(notMatches("float i = 10;", HasFloatLiteral));
+  EXPECT_TRUE(notMatches("double i = 5.0;", floatLiteral(equals(6.0))));
+  EXPECT_TRUE(notMatches("double i = 5.0;", floatLiteral(equals(6.0f))));
+  EXPECT_TRUE(
+    notMatches("double i = 5.0;", floatLiteral(equals(llvm::APFloat(6.0)))));
+}
+
+TEST(Matcher, NullPtrLiteral) {
+  EXPECT_TRUE(matches("int* i = nullptr;", cxxNullPtrLiteralExpr()));
+}
+
+TEST(Matcher, GNUNullExpr) {
+  EXPECT_TRUE(matches("int* i = __null;", gnuNullExpr()));
+}
+
+TEST(Matcher, AtomicExpr) {
+  EXPECT_TRUE(matches("void foo() { int *ptr; __atomic_load_n(ptr, 1); }",
+                      atomicExpr()));
+}
+
+TEST(Matcher, Initializers) {
+  const char *ToMatch = "void foo() { struct point { double x; double y; };"
+    "  struct point ptarray[10] = "
+    "      { [2].y = 1.0, [2].x = 2.0, [0].x = 1.0 }; }";
+  EXPECT_TRUE(matchesConditionally(
+    ToMatch,
+    initListExpr(
+      has(
+        cxxConstructExpr(
+          requiresZeroInitialization())),
+      has(
+        initListExpr(
+          hasType(asString("struct point")),
+          has(floatLiteral(equals(1.0))),
+          has(implicitValueInitExpr(
+            hasType(asString("double")))))),
+      has(
+        initListExpr(
+          hasType(asString("struct point")),
+          has(floatLiteral(equals(2.0))),
+          has(floatLiteral(equals(1.0)))))
+    ), true, "-std=gnu++98"));
+
+  EXPECT_TRUE(matchesC99(ToMatch,
+                         initListExpr(
+                           hasSyntacticForm(
+                             initListExpr(
+                               has(
+                                 designatedInitExpr(
+                                   designatorCountIs(2),
+                                   has(floatLiteral(
+                                     equals(1.0))),
+                                   has(integerLiteral(
+                                     equals(2))))),
+                               has(
+                                 designatedInitExpr(
+                                   designatorCountIs(2),
+                                   has(floatLiteral(
+                                     equals(2.0))),
+                                   has(integerLiteral(
+                                     equals(2))))),
+                               has(
+                                 designatedInitExpr(
+                                   designatorCountIs(2),
+                                   has(floatLiteral(
+                                     equals(1.0))),
+                                   has(integerLiteral(
+                                     equals(0)))))
+                             )))));
+}
+
+TEST(Matcher, ParenListExpr) {
+  EXPECT_TRUE(
+    matches("template<typename T> class foo { void bar() { foo X(*this); } };"
+              "template class foo<int>;",
+            varDecl(hasInitializer(parenListExpr(has(unaryOperator()))))));
+}
+
+TEST(Matcher, StmtExpr) {
+  EXPECT_TRUE(matches("void declToImport() { int C = ({int X=4; X;}); }",
+                      varDecl(hasInitializer(stmtExpr()))));
+}
+
+TEST(Matcher, ImportPredefinedExpr) {
+  // __func__ expands as StringLiteral("foo")
+  EXPECT_TRUE(matches("void foo() { __func__; }",
+                      predefinedExpr(
+                        hasType(asString("const char [4]")),
+                        has(stringLiteral()))));
+}
+
+TEST(Matcher, AsmStatement) {
+  EXPECT_TRUE(matches("void foo() { __asm(\"mov al, 2\"); }", asmStmt()));
+}
+
+TEST(Matcher, Conditions) {
+  StatementMatcher Condition =
+    ifStmt(hasCondition(cxxBoolLiteral(equals(true))));
+
+  EXPECT_TRUE(matches("void x() { if (true) {} }", Condition));
+  EXPECT_TRUE(notMatches("void x() { if (false) {} }", Condition));
+  EXPECT_TRUE(notMatches("void x() { bool a = true; if (a) {} }", Condition));
+  EXPECT_TRUE(notMatches("void x() { if (true || false) {} }", Condition));
+  EXPECT_TRUE(notMatches("void x() { if (1) {} }", Condition));
+}
+
+TEST(Matcher, ConditionalOperator) {
+  StatementMatcher Conditional = conditionalOperator(
+    hasCondition(cxxBoolLiteral(equals(true))),
+    hasTrueExpression(cxxBoolLiteral(equals(false))));
+
+  EXPECT_TRUE(matches("void x() { true ? false : true; }", Conditional));
+  EXPECT_TRUE(notMatches("void x() { false ? false : true; }", Conditional));
+  EXPECT_TRUE(notMatches("void x() { true ? true : false; }", Conditional));
+
+  StatementMatcher ConditionalFalse = conditionalOperator(
+    hasFalseExpression(cxxBoolLiteral(equals(false))));
+
+  EXPECT_TRUE(matches("void x() { true ? true : false; }", ConditionalFalse));
+  EXPECT_TRUE(
+    notMatches("void x() { true ? false : true; }", ConditionalFalse));
+  // Only the false branch is inspected; the condition's value is irrelevant.
+  EXPECT_TRUE(matches("void x() { false ? true : false; }", ConditionalFalse));
+  EXPECT_TRUE(
+    notMatches("void x() { false ? false : true; }", ConditionalFalse));
+}
+
+TEST(Matcher, BinaryConditionalOperator) {
+  StatementMatcher AlwaysOne = binaryConditionalOperator(
+    hasCondition(implicitCastExpr(
+      has(
+        opaqueValueExpr(
+          hasSourceExpression((integerLiteral(equals(1)))))))),
+    hasFalseExpression(integerLiteral(equals(0))));
+
+  EXPECT_TRUE(matches("void x() { 1 ?: 0; }", AlwaysOne));
+
+  StatementMatcher FourNotFive = binaryConditionalOperator(
+    hasTrueExpression(opaqueValueExpr(
+      hasSourceExpression((integerLiteral(equals(4)))))),
+    hasFalseExpression(integerLiteral(equals(5))));
+
+  EXPECT_TRUE(matches("void x() { 4 ?: 5; }", FourNotFive));
+}
+
+TEST(ArraySubscriptMatchers, ArraySubscripts) {
+  EXPECT_TRUE(matches("int i[2]; void f() { i[1] = 1; }",
+                      arraySubscriptExpr()));
+  EXPECT_TRUE(notMatches("int i; void f() { i = 1; }",
+                         arraySubscriptExpr()));
+}
+
+TEST(For, FindsForLoops) {
+  EXPECT_TRUE(matches("void f() { for(;;); }", forStmt()));
+  EXPECT_TRUE(matches("void f() { if(true) for(;;); }", forStmt()));
+  EXPECT_TRUE(notMatches("int as[] = { 1, 2, 3 };"
+                           "void f() { for (auto &a : as); }",
+                         forStmt()));
+}
+
+TEST(For, ReportsNoFalsePositives) {
+  EXPECT_TRUE(notMatches("void f() { ; }", forStmt()));
+  EXPECT_TRUE(notMatches("void f() { if(true); }", forStmt()));
+}
+
+TEST(CompoundStatement, HandlesSimpleCases) {
+  EXPECT_TRUE(notMatches("void f();", compoundStmt()));
+  EXPECT_TRUE(matches("void f() {}", compoundStmt()));
+  EXPECT_TRUE(matches("void f() {{}}", compoundStmt()));
+}
+
+TEST(CompoundStatement, DoesNotMatchEmptyStruct) {
+  // It's not a compound statement just because there's "{}" in the source
+  // text. This is an AST search, not grep.
+  EXPECT_TRUE(notMatches("namespace n { struct S {}; }",
+                         compoundStmt()));
+  EXPECT_TRUE(matches("namespace n { struct S { void f() {{}} }; }",
+                      compoundStmt()));
+}
+
+TEST(CastExpression, MatchesExplicitCasts) {
+  EXPECT_TRUE(matches("char *p = reinterpret_cast<char *>(&p);",castExpr()));
+  EXPECT_TRUE(matches("void *p = (void *)(&p);", castExpr()));
+  EXPECT_TRUE(matches("char q, *p = const_cast<char *>(&q);", castExpr()));
+  EXPECT_TRUE(matches("char c = char(0);", castExpr()));
+}
+TEST(CastExpression, MatchesImplicitCasts) {
+  // This test creates an implicit cast from int to char.
+  EXPECT_TRUE(matches("char c = 0;", castExpr()));
+  // This test creates an implicit cast from lvalue to rvalue.
+  EXPECT_TRUE(matches("char c = 0, d = c;", castExpr()));
+}
+
+TEST(CastExpression, DoesNotMatchNonCasts) {
+  EXPECT_TRUE(notMatches("char c = '0';", castExpr()));
+  EXPECT_TRUE(notMatches("char c, &q = c;", castExpr()));
+  EXPECT_TRUE(notMatches("int i = (0);", castExpr()));
+  EXPECT_TRUE(notMatches("int i = 0;", castExpr()));
+}
+
+TEST(ReinterpretCast, MatchesSimpleCase) {
+  EXPECT_TRUE(matches("char* p = reinterpret_cast<char*>(&p);",
+                      cxxReinterpretCastExpr()));
+}
+
+TEST(ReinterpretCast, DoesNotMatchOtherCasts) {
+  EXPECT_TRUE(notMatches("char* p = (char*)(&p);", cxxReinterpretCastExpr()));
+  EXPECT_TRUE(notMatches("char q, *p = const_cast<char*>(&q);",
+                         cxxReinterpretCastExpr()));
+  EXPECT_TRUE(notMatches("void* p = static_cast<void*>(&p);",
+                         cxxReinterpretCastExpr()));
+  EXPECT_TRUE(notMatches("struct B { virtual ~B() {} }; struct D : B {};"
+                           "B b;"
+                           "D* p = dynamic_cast<D*>(&b);",
+                         cxxReinterpretCastExpr()));
+}
+
+TEST(FunctionalCast, MatchesSimpleCase) {
+  std::string FooClass = "class Foo { public: Foo(const char*); };";
+  EXPECT_TRUE(matches(FooClass + "void r() { Foo f = Foo(\"hello world\"); }",
+                      cxxFunctionalCastExpr()));
+}
+
+TEST(FunctionalCast, DoesNotMatchOtherCasts) {
+  std::string FooClass = "class Foo { public: Foo(const char*); };";
+  EXPECT_TRUE(
+    notMatches(FooClass + "void r() { Foo f = (Foo) \"hello world\"; }",
+               cxxFunctionalCastExpr()));
+  EXPECT_TRUE(
+    notMatches(FooClass + "void r() { Foo f = \"hello world\"; }",
+               cxxFunctionalCastExpr()));
+}
+
+TEST(DynamicCast, MatchesSimpleCase) {
+  EXPECT_TRUE(matches("struct B { virtual ~B() {} }; struct D : B {};"
+                        "B b;"
+                        "D* p = dynamic_cast<D*>(&b);",
+                      cxxDynamicCastExpr()));
+}
+
+TEST(StaticCast, MatchesSimpleCase) {
+  EXPECT_TRUE(matches("void* p(static_cast<void*>(&p));",
+                      cxxStaticCastExpr()));
+}
+
+TEST(StaticCast, DoesNotMatchOtherCasts) {
+  EXPECT_TRUE(notMatches("char* p = (char*)(&p);", cxxStaticCastExpr()));
+  EXPECT_TRUE(notMatches("char q, *p = const_cast<char*>(&q);",
+                         cxxStaticCastExpr()));
+  EXPECT_TRUE(notMatches("void* p = reinterpret_cast<char*>(&p);",
+                         cxxStaticCastExpr()));
+  EXPECT_TRUE(notMatches("struct B { virtual ~B() {} }; struct D : B {};"
+                           "B b;"
+                           "D* p = dynamic_cast<D*>(&b);",
+                         cxxStaticCastExpr()));
+}
+
+TEST(CStyleCast, MatchesSimpleCase) {
+  EXPECT_TRUE(matches("int i = (int) 2.2f;", cStyleCastExpr()));
+}
+
+TEST(CStyleCast, DoesNotMatchOtherCasts) {
+  EXPECT_TRUE(notMatches("char* p = static_cast<char*>(0);"
+                           "char q, *r = const_cast<char*>(&q);"
+                           "void* s = reinterpret_cast<char*>(&s);"
+                           "struct B { virtual ~B() {} }; struct D : B {};"
+                           "B b;"
+                           "D* t = dynamic_cast<D*>(&b);",
+                         cStyleCastExpr()));
+}
+
+TEST(ImplicitCast, MatchesSimpleCase) {
+  // This test creates an implicit const cast.
+  EXPECT_TRUE(matches("int x = 0; const int y = x;",
+                      varDecl(hasInitializer(implicitCastExpr()))));
+  // This test creates an implicit cast from int to char.
+  EXPECT_TRUE(matches("char c = 0;",
+                      varDecl(hasInitializer(implicitCastExpr()))));
+  // This test creates an implicit array-to-pointer cast.
+  EXPECT_TRUE(matches("int arr[6]; int *p = arr;",
+                      varDecl(hasInitializer(implicitCastExpr()))));
+}
+
+TEST(ImplicitCast, DoesNotMatchIncorrectly) {
+  // This test verifies that implicitCastExpr() matches exactly when implicit
+  // casts are present, and that it ignores explicit and paren casts.
+
+  // These two test cases have no casts.
+  EXPECT_TRUE(notMatches("int x = 0;",
+                         varDecl(hasInitializer(implicitCastExpr()))));
+  EXPECT_TRUE(notMatches("int x = 0, &y = x;",
+                         varDecl(hasInitializer(implicitCastExpr()))));
+
+  EXPECT_TRUE(notMatches("int x = 0; double d = (double) x;",
+                         varDecl(hasInitializer(implicitCastExpr()))));
+  EXPECT_TRUE(notMatches("const int *p; int *q = const_cast<int *>(p);",
+                         varDecl(hasInitializer(implicitCastExpr()))));
+
+  EXPECT_TRUE(notMatches("int x = (0);",
+                         varDecl(hasInitializer(implicitCastExpr()))));
+}
+
+TEST(Statement, DoesNotMatchDeclarations) {
+  EXPECT_TRUE(notMatches("class X {};", stmt()));
+}
+
+TEST(Statement, MatchesCompoundStatments) {
+  EXPECT_TRUE(matches("void x() {}", stmt()));
+}
+
+TEST(DeclarationStatement, DoesNotMatchCompoundStatements) {
+  EXPECT_TRUE(notMatches("void x() {}", declStmt()));
+}
+
+TEST(DeclarationStatement, MatchesVariableDeclarationStatements) {
+  EXPECT_TRUE(matches("void x() { int a; }", declStmt()));
+}
+
+TEST(ExprWithCleanups, MatchesExprWithCleanups) {
+  EXPECT_TRUE(matches("struct Foo { ~Foo(); };"
+                        "const Foo f = Foo();",
+                      varDecl(hasInitializer(exprWithCleanups()))));
+  EXPECT_TRUE(notMatches("struct Foo { }; Foo a;"
+                           "const Foo f = a;",
+                         varDecl(hasInitializer(exprWithCleanups()))));
+}
+
+TEST(InitListExpression, MatchesInitListExpression) {
+  EXPECT_TRUE(matches("int a[] = { 1, 2 };",
+                      initListExpr(hasType(asString("int [2]")))));
+  EXPECT_TRUE(matches("struct B { int x, y; }; B b = { 5, 6 };",
+                      initListExpr(hasType(recordDecl(hasName("B"))))));
+  EXPECT_TRUE(matches("struct S { S(void (*a)()); };"
+                        "void f();"
+                        "S s[1] = { &f };",
+                      declRefExpr(to(functionDecl(hasName("f"))))));
+  EXPECT_TRUE(
+    matches("int i[1] = {42, [0] = 43};", integerLiteral(equals(42))));
+}
+
+TEST(UsingDeclaration, MatchesUsingDeclarations) {
+  EXPECT_TRUE(matches("namespace X { int x; } using X::x;",
+                      usingDecl()));
+}
+
+TEST(UsingDeclaration, MatchesShadowUsingDelcarations) {
+  EXPECT_TRUE(matches("namespace f { int a; } using f::a;",
+                      usingDecl(hasAnyUsingShadowDecl(hasName("a")))));
+}
+
+TEST(UsingDirectiveDeclaration, MatchesUsingNamespace) {
+  EXPECT_TRUE(matches("namespace X { int x; } using namespace X;",
+                      usingDirectiveDecl()));
+  EXPECT_TRUE(
+    notMatches("namespace X { int x; } using X::x;", usingDirectiveDecl()));
+}
+
+
+TEST(While, MatchesWhileLoops) {
+  EXPECT_TRUE(notMatches("void x() {}", whileStmt()));
+  EXPECT_TRUE(matches("void x() { while(true); }", whileStmt()));
+  EXPECT_TRUE(notMatches("void x() { do {} while(true); }", whileStmt()));
+}
+
+TEST(Do, MatchesDoLoops) {
+  EXPECT_TRUE(matches("void x() { do {} while(true); }", doStmt()));
+  EXPECT_TRUE(matches("void x() { do ; while(false); }", doStmt()));
+}
+
+TEST(Do, DoesNotMatchWhileLoops) {
+  EXPECT_TRUE(notMatches("void x() { while(true) {} }", doStmt()));
+}
+
+TEST(SwitchCase, MatchesCase) {
+  EXPECT_TRUE(matches("void x() { switch(42) { case 42:; } }", switchCase()));
+  EXPECT_TRUE(matches("void x() { switch(42) { default:; } }", switchCase()));
+  EXPECT_TRUE(matches("void x() { switch(42) default:; }", switchCase()));
+  EXPECT_TRUE(notMatches("void x() { switch(42) {} }", switchCase()));
+}
+
+TEST(SwitchCase, MatchesSwitch) {
+  EXPECT_TRUE(matches("void x() { switch(42) { case 42:; } }", switchStmt()));
+  EXPECT_TRUE(matches("void x() { switch(42) { default:; } }", switchStmt()));
+  EXPECT_TRUE(matches("void x() { switch(42) default:; }", switchStmt()));
+  EXPECT_TRUE(notMatches("void x() {}", switchStmt()));
+}
+
+TEST(ExceptionHandling, SimpleCases) {
+  EXPECT_TRUE(matches("void foo() try { } catch(int X) { }", cxxCatchStmt()));
+  EXPECT_TRUE(matches("void foo() try { } catch(int X) { }", cxxTryStmt()));
+  EXPECT_TRUE(
+    notMatches("void foo() try { } catch(int X) { }", cxxThrowExpr()));
+  EXPECT_TRUE(matches("void foo() try { throw; } catch(int X) { }",
+                      cxxThrowExpr()));
+  EXPECT_TRUE(matches("void foo() try { throw 5;} catch(int X) { }",
+                      cxxThrowExpr()));
+  EXPECT_TRUE(matches("void foo() try { throw; } catch(...) { }",
+                      cxxCatchStmt(isCatchAll())));
+  EXPECT_TRUE(notMatches("void foo() try { throw; } catch(int) { }",
+                         cxxCatchStmt(isCatchAll())));
+  EXPECT_TRUE(matches("void foo() try {} catch(int X) { }",
+                      varDecl(isExceptionVariable())));
+  EXPECT_TRUE(notMatches("void foo() try { int X; } catch (...) { }",
+                         varDecl(isExceptionVariable())));
+}
+
+TEST(ParenExpression, SimpleCases) {
+  EXPECT_TRUE(matches("int i = (3);", parenExpr()));
+  EXPECT_TRUE(matches("int i = (3 + 7);", parenExpr()));
+  EXPECT_TRUE(notMatches("int i = 3;", parenExpr()));
+  EXPECT_TRUE(notMatches("int foo() { return 1; }; int a = foo();",
+                         parenExpr()));
+}
+
+TEST(TypeMatching, MatchesTypes) {
+  EXPECT_TRUE(matches("struct S {};", qualType().bind("loc")));
+}
+
+TEST(TypeMatching, MatchesConstantArrayTypes) {
+  EXPECT_TRUE(matches("int a[2];", constantArrayType()));
+  EXPECT_TRUE(notMatches(
+    "void f() { int a[] = { 2, 3 }; int b[a[0]]; }",
+    constantArrayType(hasElementType(builtinType()))));
+
+  EXPECT_TRUE(matches("int a[42];", constantArrayType(hasSize(42))));
+  EXPECT_TRUE(matches("int b[2*21];", constantArrayType(hasSize(42))));
+  EXPECT_TRUE(notMatches("int c[41], d[43];", constantArrayType(hasSize(42))));
+}
+
+TEST(TypeMatching, MatchesDependentSizedArrayTypes) {
+  EXPECT_TRUE(matches(
+    "template <typename T, int Size> class array { T data[Size]; };",
+    dependentSizedArrayType()));
+  EXPECT_TRUE(notMatches(
+    "int a[42]; int b[] = { 2, 3 }; void f() { int c[b[0]]; }",
+    dependentSizedArrayType()));
+}
+
+TEST(TypeMatching, MatchesIncompleteArrayType) {
+  EXPECT_TRUE(matches("int a[] = { 2, 3 };", incompleteArrayType()));
+  EXPECT_TRUE(matches("void f(int a[]) {}", incompleteArrayType()));
+
+  EXPECT_TRUE(notMatches("int a[42]; void f() { int b[a[0]]; }",
+                         incompleteArrayType()));
+}
+
+TEST(TypeMatching, MatchesVariableArrayType) {
+  EXPECT_TRUE(matches("void f(int b) { int a[b]; }", variableArrayType()));
+  EXPECT_TRUE(notMatches("int a[] = {2, 3}; int b[42];", variableArrayType()));
+
+  EXPECT_TRUE(matches(
+    "void f(int b) { int a[b]; }",
+    variableArrayType(hasSizeExpr(ignoringImpCasts(declRefExpr(to(
+      varDecl(hasName("b")))))))));
+}
+
+
+TEST(TypeMatching, MatchesAtomicTypes) {
+  if (llvm::Triple(llvm::sys::getDefaultTargetTriple()).getOS() !=
+    llvm::Triple::Win32) {
+    // FIXME: Make this work for MSVC.
+    EXPECT_TRUE(matches("_Atomic(int) i;", atomicType()));
+
+    EXPECT_TRUE(matches("_Atomic(int) i;",
+                        atomicType(hasValueType(isInteger()))));
+    EXPECT_TRUE(notMatches("_Atomic(float) f;",
+                           atomicType(hasValueType(isInteger()))));
+  }
+}
+
+TEST(TypeMatching, MatchesAutoTypes) {
+  EXPECT_TRUE(matches("auto i = 2;", autoType()));
+  EXPECT_TRUE(matches("int v[] = { 2, 3 }; void f() { for (int i : v) {} }",
+                      autoType()));
+
+  // FIXME: Matching against the type-as-written can't work here, because the
+  //        type as written was not deduced.
+  //EXPECT_TRUE(matches("auto a = 1;",
+  //                    autoType(hasDeducedType(isInteger()))));
+  //EXPECT_TRUE(notMatches("auto b = 2.0;",
+  //                       autoType(hasDeducedType(isInteger()))));
+}
+
+TEST(TypeMatching, MatchesFunctionTypes) {
+  EXPECT_TRUE(matches("int (*f)(int);", functionType()));
+  EXPECT_TRUE(matches("void f(int i) {}", functionType()));
+}
+
+TEST(TypeMatching, IgnoringParens) {
+  EXPECT_TRUE(
+      notMatches("void (*fp)(void);", pointerType(pointee(functionType()))));
+  EXPECT_TRUE(matches("void (*fp)(void);",
+                      pointerType(pointee(ignoringParens(functionType())))));
+}
+
+TEST(TypeMatching, MatchesFunctionProtoTypes) {
+  EXPECT_TRUE(matches("int (*f)(int);", functionProtoType()));
+  EXPECT_TRUE(matches("void f(int i);", functionProtoType()));
+  EXPECT_TRUE(matches("void f();", functionProtoType(parameterCountIs(0))));
+  EXPECT_TRUE(notMatchesC("void f();", functionProtoType()));
+  EXPECT_TRUE(
+    matchesC("void f(void);", functionProtoType(parameterCountIs(0))));
+}
+
+TEST(TypeMatching, MatchesParenType) {
+  EXPECT_TRUE(
+    matches("int (*array)[4];", varDecl(hasType(pointsTo(parenType())))));
+  EXPECT_TRUE(notMatches("int *array[4];", varDecl(hasType(parenType()))));
+
+  EXPECT_TRUE(matches(
+    "int (*ptr_to_func)(int);",
+    varDecl(hasType(pointsTo(parenType(innerType(functionType())))))));
+  EXPECT_TRUE(notMatches(
+    "int (*ptr_to_array)[4];",
+    varDecl(hasType(pointsTo(parenType(innerType(functionType())))))));
+}
+
+TEST(TypeMatching, PointerTypes) {
+  // FIXME: Reactivate when these tests can be more specific (not matching
+  // implicit code on certain platforms), likely when we have hasDescendant for
+  // Types/TypeLocs.
+  //EXPECT_TRUE(matchAndVerifyResultTrue(
+  //    "int* a;",
+  //    pointerTypeLoc(pointeeLoc(typeLoc().bind("loc"))),
+  //    llvm::make_unique<VerifyIdIsBoundTo<TypeLoc>>("loc", 1)));
+  //EXPECT_TRUE(matchAndVerifyResultTrue(
+  //    "int* a;",
+  //    pointerTypeLoc().bind("loc"),
+  //    llvm::make_unique<VerifyIdIsBoundTo<TypeLoc>>("loc", 1)));
+  EXPECT_TRUE(matches(
+    "int** a;",
+    loc(pointerType(pointee(qualType())))));
+  EXPECT_TRUE(matches(
+    "int** a;",
+    loc(pointerType(pointee(pointerType())))));
+  EXPECT_TRUE(matches(
+    "int* b; int* * const a = &b;",
+    loc(qualType(isConstQualified(), pointerType()))));
+
+  std::string Fragment = "struct A { int i; }; int A::* ptr = &A::i;";
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(blockPointerType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ptr"),
+                                        hasType(memberPointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(pointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(referenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(lValueReferenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(rValueReferenceType()))));
+
+  Fragment = "int *ptr;";
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(blockPointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(memberPointerType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ptr"),
+                                        hasType(pointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
+                                           hasType(referenceType()))));
+
+  Fragment = "int a; int &ref = a;";
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(blockPointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(memberPointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(pointerType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
+                                        hasType(referenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
+                                        hasType(lValueReferenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(rValueReferenceType()))));
+
+  Fragment = "int &&ref = 2;";
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(blockPointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(memberPointerType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(pointerType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
+                                        hasType(referenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
+                                           hasType(lValueReferenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
+                                        hasType(rValueReferenceType()))));
+}
+
+TEST(TypeMatching, AutoRefTypes) {
+  std::string Fragment = "auto a = 1;"
+    "auto b = a;"
+    "auto &c = a;"
+    "auto &&d = c;"
+    "auto &&e = 2;";
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("a"),
+                                           hasType(referenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("b"),
+                                           hasType(referenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("c"),
+                                        hasType(referenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("c"),
+                                        hasType(lValueReferenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("c"),
+                                           hasType(rValueReferenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("d"),
+                                        hasType(referenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("d"),
+                                        hasType(lValueReferenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("d"),
+                                           hasType(rValueReferenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("e"),
+                                        hasType(referenceType()))));
+  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("e"),
+                                           hasType(lValueReferenceType()))));
+  EXPECT_TRUE(matches(Fragment, varDecl(hasName("e"),
+                                        hasType(rValueReferenceType()))));
+}
+
+TEST(TypeMatching, MatchesEnumTypes) {
+  EXPECT_TRUE(matches("enum Color { Green }; Color color;",
+                      loc(enumType())));
+  EXPECT_TRUE(matches("enum class Color { Green }; Color color;",
+                      loc(enumType())));
+}
+
+TEST(TypeMatching, MatchesPointersToConstTypes) {
+  // Each snippet is checked via the TypeLoc matcher (loc(...)) and via the
+  // plain Type matcher.
+  EXPECT_TRUE(matches("int b; int * const a = &b;", loc(pointerType())));
+  EXPECT_TRUE(matches("int b; int * const a = &b;", pointerType()));
+  EXPECT_TRUE(matches(
+    "int b; const int * a = &b;",
+    loc(pointerType(pointee(builtinType())))));
+  EXPECT_TRUE(matches(
+    "int b; const int * a = &b;",
+    pointerType(pointee(builtinType()))));
+}
+
+TEST(TypeMatching, MatchesTypedefTypes) {
+  EXPECT_TRUE(matches("typedef int X; X a;", varDecl(hasName("a"),
+                                                     hasType(typedefType()))));
+}
+
+TEST(TypeMatching, MatchesTemplateSpecializationType) {
+  EXPECT_TRUE(matches("template <typename T> class A{}; A<int> a;",
+                      templateSpecializationType()));
+}
+
+TEST(TypeMatching, MatchesRecordType) {
+  EXPECT_TRUE(matches("class C{}; C c;", recordType()));
+  EXPECT_TRUE(matches("struct S{}; S s;",
+                      recordType(hasDeclaration(recordDecl(hasName("S"))))));
+  EXPECT_TRUE(notMatches("int i;",
+                         recordType(hasDeclaration(recordDecl(hasName("S"))))));
+}
+
+TEST(TypeMatching, MatchesElaboratedType) {
+  EXPECT_TRUE(matches(
+    "namespace N {"
+      "  namespace M {"
+      "    class D {};"
+      "  }"
+      "}"
+      "N::M::D d;", elaboratedType()));
+  EXPECT_TRUE(matches("class C {} c;", elaboratedType()));
+  EXPECT_TRUE(notMatches("class C {}; C c;", elaboratedType()));
+}
+
+TEST(TypeMatching, MatchesSubstTemplateTypeParmType) {
+  const std::string code = "template <typename T>"
+    "int F() {"
+    "  return 1 + T();"
+    "}"
+    "int i = F<int>();";
+  EXPECT_FALSE(matches(code, binaryOperator(hasLHS(
+    expr(hasType(substTemplateTypeParmType()))))));
+  EXPECT_TRUE(matches(code, binaryOperator(hasRHS(
+    expr(hasType(substTemplateTypeParmType()))))));
+}
+
+TEST(NNS, MatchesNestedNameSpecifiers) {
+  EXPECT_TRUE(matches("namespace ns { struct A {}; } ns::A a;",
+                      nestedNameSpecifier()));
+  EXPECT_TRUE(matches("template <typename T> class A { typename T::B b; };",
+                      nestedNameSpecifier()));
+  EXPECT_TRUE(matches("struct A { void f(); }; void A::f() {}",
+                      nestedNameSpecifier()));
+  EXPECT_TRUE(matches("namespace a { namespace b {} } namespace ab = a::b;",
+                      nestedNameSpecifier()));
+
+  EXPECT_TRUE(matches(
+    "struct A { static void f() {} }; void g() { A::f(); }",
+    nestedNameSpecifier()));
+  EXPECT_TRUE(notMatches(
+    "struct A { static void f() {} }; void g(A* a) { a->f(); }",
+    nestedNameSpecifier()));
+}
+
+TEST(NullStatement, SimpleCases) {
+  EXPECT_TRUE(matches("void f() {int i;;}", nullStmt()));
+  EXPECT_TRUE(notMatches("void f() {int i;}", nullStmt()));
+}
+
+TEST(NS, Alias) {
+  EXPECT_TRUE(matches("namespace test {} namespace alias = ::test;",
+                      namespaceAliasDecl(hasName("alias"))));
+}
+
+TEST(NNS, MatchesTypes) {
+  NestedNameSpecifierMatcher Matcher = nestedNameSpecifier(
+    specifiesType(hasDeclaration(recordDecl(hasName("A")))));
+  EXPECT_TRUE(matches("struct A { struct B {}; }; A::B b;", Matcher));
+  EXPECT_TRUE(matches("struct A { struct B { struct C {}; }; }; A::B::C c;",
+                      Matcher));
+  EXPECT_TRUE(notMatches("namespace A { struct B {}; } A::B b;", Matcher));
+}
+
+TEST(NNS, MatchesNamespaceDecls) {
+  NestedNameSpecifierMatcher Matcher = nestedNameSpecifier(
+    specifiesNamespace(hasName("ns")));
+  EXPECT_TRUE(matches("namespace ns { struct A {}; } ns::A a;", Matcher));
+  EXPECT_TRUE(notMatches("namespace xx { struct A {}; } xx::A a;", Matcher));
+  EXPECT_TRUE(notMatches("struct ns { struct A {}; }; ns::A a;", Matcher));
+}
+
+TEST(NNS, MatchesNestedNameSpecifierPrefixes) {
+  EXPECT_TRUE(matches(
+    "struct A { struct B { struct C {}; }; }; A::B::C c;",
+    nestedNameSpecifier(hasPrefix(specifiesType(asString("struct A"))))));
+  EXPECT_TRUE(matches(
+    "struct A { struct B { struct C {}; }; }; A::B::C c;",
+    nestedNameSpecifierLoc(hasPrefix(
+      specifiesTypeLoc(loc(qualType(asString("struct A"))))))));
+}
+
+
+template <typename T>
+class VerifyAncestorHasChildIsEqual : public BoundNodesCallback {
+public:
+  bool run(const BoundNodes *Nodes) override { return false; }
+
+  bool run(const BoundNodes *Nodes, ASTContext *Context) override {
+    const T *Node = Nodes->getNodeAs<T>("");
+    return verify(*Nodes, *Context, Node);
+  }
+
+  bool verify(const BoundNodes &Nodes, ASTContext &Context, const Stmt *Node) {
+    // Use the original typed pointer to verify we can pass pointers to subtypes
+    // to equalsNode.
+    const T *TypedNode = cast<T>(Node);
+    return selectFirst<T>(
+      "", match(stmt(hasParent(
+        stmt(has(stmt(equalsNode(TypedNode)))).bind(""))),
+                *Node, Context)) != nullptr;
+  }
+  bool verify(const BoundNodes &Nodes, ASTContext &Context, const Decl *Node) {
+    // Use the original typed pointer to verify we can pass pointers to subtypes
+    // to equalsNode.
+    const T *TypedNode = cast<T>(Node);
+    return selectFirst<T>(
+      "", match(decl(hasParent(
+        decl(has(decl(equalsNode(TypedNode)))).bind(""))),
+                *Node, Context)) != nullptr;
+  }
+  bool verify(const BoundNodes &Nodes, ASTContext &Context, const Type *Node) {
+    // Use the original typed pointer to verify we can pass pointers to subtypes
+    // to equalsNode.
+    const T *TypedNode = cast<T>(Node);
+    const auto *Dec = Nodes.getNodeAs<FieldDecl>("decl");
+    return selectFirst<T>(
+      "", match(fieldDecl(hasParent(decl(has(fieldDecl(
+        hasType(type(equalsNode(TypedNode)).bind(""))))))),
+                *Dec, Context)) != nullptr;
+  }
+};
+
+TEST(IsEqualTo, MatchesNodesByIdentity) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { class Y {}; };", recordDecl(hasName("::X::Y")).bind(""),
+    llvm::make_unique<VerifyAncestorHasChildIsEqual<CXXRecordDecl>>()));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f() { if (true) if(true) {} }", ifStmt().bind(""),
+    llvm::make_unique<VerifyAncestorHasChildIsEqual<IfStmt>>()));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { class Y {} y; };",
+    fieldDecl(hasName("y"), hasType(type().bind(""))).bind("decl"),
+    llvm::make_unique<VerifyAncestorHasChildIsEqual<Type>>()));
+}
+
+TEST(TypedefDeclMatcher, Match) {
+  EXPECT_TRUE(matches("typedef int typedefDeclTest;",
+                      typedefDecl(hasName("typedefDeclTest"))));
+  EXPECT_TRUE(notMatches("using typedefDeclTest2 = int;",
+                         typedefDecl(hasName("typedefDeclTest2"))));
+}
+
+TEST(TypeAliasDeclMatcher, Match) {
+  EXPECT_TRUE(matches("using typeAliasTest2 = int;",
+                      typeAliasDecl(hasName("typeAliasTest2"))));
+  EXPECT_TRUE(notMatches("typedef int typeAliasTest;",
+                         typeAliasDecl(hasName("typeAliasTest"))));
+}
+
+TEST(TypedefNameDeclMatcher, Match) {
+  EXPECT_TRUE(matches("typedef int typedefNameDeclTest1;",
+                      typedefNameDecl(hasName("typedefNameDeclTest1"))));
+  EXPECT_TRUE(matches("using typedefNameDeclTest2 = int;",
+                      typedefNameDecl(hasName("typedefNameDeclTest2"))));
+}
+
+TEST(ObjCMessageExprMatcher, SimpleExprs) {
+  // Don't find an ObjCMessageExpr where none is present.
+  EXPECT_TRUE(notMatchesObjC("", objcMessageExpr(anything())));
+
+  std::string Objc1String =
+    "@interface Str "
+      " - (Str *)uppercaseString:(Str *)str;"
+      "@end "
+      "@interface foo "
+      "- (void)meth:(Str *)text;"
+      "@end "
+      " "
+      "@implementation foo "
+      "- (void) meth:(Str *)text { "
+      "  [self contents];"
+      "  Str *up = [text uppercaseString];"
+      "} "
+      "@end ";
+  EXPECT_TRUE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(anything())));
+  EXPECT_TRUE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(hasSelector("contents"))));
+  EXPECT_TRUE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(matchesSelector("cont*"))));
+  EXPECT_FALSE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(matchesSelector("?cont*"))));
+  EXPECT_TRUE(notMatchesObjC(
+    Objc1String,
+    objcMessageExpr(hasSelector("contents"), hasNullSelector())));
+  EXPECT_TRUE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(hasSelector("contents"), hasUnarySelector())));
+  EXPECT_TRUE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(hasSelector("contents"), numSelectorArgs(0))));
+  EXPECT_TRUE(matchesObjC(
+    Objc1String,
+    objcMessageExpr(matchesSelector("uppercase*"),
+                    argumentCountIs(0)
+    )));
+}
+
+} // namespace ast_matchers
+} // namespace clang
diff --git a/unittests/ASTMatchers/ASTMatchersTest.cpp b/unittests/ASTMatchers/ASTMatchersTest.cpp
deleted file mode 100644
index 1e5401d..0000000
--- a/unittests/ASTMatchers/ASTMatchersTest.cpp
+++ /dev/null
@@ -1,5181 +0,0 @@
-//===- unittest/Tooling/ASTMatchersTest.cpp - AST matcher unit tests ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ASTMatchersTest.h"
-#include "clang/AST/PrettyPrinter.h"
-#include "clang/ASTMatchers/ASTMatchFinder.h"
-#include "clang/ASTMatchers/ASTMatchers.h"
-#include "clang/Tooling/Tooling.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/Host.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace ast_matchers {
-
-#if GTEST_HAS_DEATH_TEST
-TEST(HasNameDeathTest, DiesOnEmptyName) {
-  ASSERT_DEBUG_DEATH({
-    DeclarationMatcher HasEmptyName = recordDecl(hasName(""));
-    EXPECT_TRUE(notMatches("class X {};", HasEmptyName));
-  }, "");
-}
-
-TEST(HasNameDeathTest, DiesOnEmptyPattern) {
-  ASSERT_DEBUG_DEATH({
-      DeclarationMatcher HasEmptyName = recordDecl(matchesName(""));
-      EXPECT_TRUE(notMatches("class X {};", HasEmptyName));
-    }, "");
-}
-
-TEST(IsDerivedFromDeathTest, DiesOnEmptyBaseName) {
-  ASSERT_DEBUG_DEATH({
-    DeclarationMatcher IsDerivedFromEmpty = cxxRecordDecl(isDerivedFrom(""));
-    EXPECT_TRUE(notMatches("class X {};", IsDerivedFromEmpty));
-  }, "");
-}
-#endif
-
-TEST(Finder, DynamicOnlyAcceptsSomeMatchers) {
-  MatchFinder Finder;
-  EXPECT_TRUE(Finder.addDynamicMatcher(decl(), nullptr));
-  EXPECT_TRUE(Finder.addDynamicMatcher(callExpr(), nullptr));
-  EXPECT_TRUE(Finder.addDynamicMatcher(constantArrayType(hasSize(42)),
-                                       nullptr));
-
-  // Do not accept non-toplevel matchers.
-  EXPECT_FALSE(Finder.addDynamicMatcher(isArrow(), nullptr));
-  EXPECT_FALSE(Finder.addDynamicMatcher(hasSize(2), nullptr));
-  EXPECT_FALSE(Finder.addDynamicMatcher(hasName("x"), nullptr));
-}
-
-TEST(Decl, MatchesDeclarations) {
-  EXPECT_TRUE(notMatches("", decl(usingDecl())));
-  EXPECT_TRUE(matches("namespace x { class X {}; } using x::X;",
-                      decl(usingDecl())));
-}
-
-TEST(NameableDeclaration, MatchesVariousDecls) {
-  DeclarationMatcher NamedX = namedDecl(hasName("X"));
-  EXPECT_TRUE(matches("typedef int X;", NamedX));
-  EXPECT_TRUE(matches("int X;", NamedX));
-  EXPECT_TRUE(matches("class foo { virtual void X(); };", NamedX));
-  EXPECT_TRUE(matches("void foo() try { } catch(int X) { }", NamedX));
-  EXPECT_TRUE(matches("void foo() { int X; }", NamedX));
-  EXPECT_TRUE(matches("namespace X { }", NamedX));
-  EXPECT_TRUE(matches("enum X { A, B, C };", NamedX));
-
-  EXPECT_TRUE(notMatches("#define X 1", NamedX));
-}
-
-TEST(NameableDeclaration, REMatchesVariousDecls) {
-  DeclarationMatcher NamedX = namedDecl(matchesName("::X"));
-  EXPECT_TRUE(matches("typedef int Xa;", NamedX));
-  EXPECT_TRUE(matches("int Xb;", NamedX));
-  EXPECT_TRUE(matches("class foo { virtual void Xc(); };", NamedX));
-  EXPECT_TRUE(matches("void foo() try { } catch(int Xdef) { }", NamedX));
-  EXPECT_TRUE(matches("void foo() { int Xgh; }", NamedX));
-  EXPECT_TRUE(matches("namespace Xij { }", NamedX));
-  EXPECT_TRUE(matches("enum X { A, B, C };", NamedX));
-
-  EXPECT_TRUE(notMatches("#define Xkl 1", NamedX));
-
-  DeclarationMatcher StartsWithNo = namedDecl(matchesName("::no"));
-  EXPECT_TRUE(matches("int no_foo;", StartsWithNo));
-  EXPECT_TRUE(matches("class foo { virtual void nobody(); };", StartsWithNo));
-
-  DeclarationMatcher Abc = namedDecl(matchesName("a.*b.*c"));
-  EXPECT_TRUE(matches("int abc;", Abc));
-  EXPECT_TRUE(matches("int aFOObBARc;", Abc));
-  EXPECT_TRUE(notMatches("int cab;", Abc));
-  EXPECT_TRUE(matches("int cabc;", Abc));
-
-  DeclarationMatcher StartsWithK = namedDecl(matchesName(":k[^:]*$"));
-  EXPECT_TRUE(matches("int k;", StartsWithK));
-  EXPECT_TRUE(matches("int kAbc;", StartsWithK));
-  EXPECT_TRUE(matches("namespace x { int kTest; }", StartsWithK));
-  EXPECT_TRUE(matches("class C { int k; };", StartsWithK));
-  EXPECT_TRUE(notMatches("class C { int ckc; };", StartsWithK));
-}
-
-TEST(DeclarationMatcher, MatchClass) {
-  DeclarationMatcher ClassMatcher(recordDecl());
-  llvm::Triple Triple(llvm::sys::getDefaultTargetTriple());
-  if (Triple.getOS() != llvm::Triple::Win32 ||
-      Triple.getEnvironment() != llvm::Triple::MSVC)
-    EXPECT_FALSE(matches("", ClassMatcher));
-  else
-    // Matches class type_info.
-    EXPECT_TRUE(matches("", ClassMatcher));
-
-  DeclarationMatcher ClassX = recordDecl(recordDecl(hasName("X")));
-  EXPECT_TRUE(matches("class X;", ClassX));
-  EXPECT_TRUE(matches("class X {};", ClassX));
-  EXPECT_TRUE(matches("template<class T> class X {};", ClassX));
-  EXPECT_TRUE(notMatches("", ClassX));
-}
-
-TEST(DeclarationMatcher, ClassIsDerived) {
-  DeclarationMatcher IsDerivedFromX = cxxRecordDecl(isDerivedFrom("X"));
-
-  EXPECT_TRUE(matches("class X {}; class Y : public X {};", IsDerivedFromX));
-  EXPECT_TRUE(notMatches("class X {};", IsDerivedFromX));
-  EXPECT_TRUE(notMatches("class X;", IsDerivedFromX));
-  EXPECT_TRUE(notMatches("class Y;", IsDerivedFromX));
-  EXPECT_TRUE(notMatches("", IsDerivedFromX));
-
-  DeclarationMatcher IsAX = cxxRecordDecl(isSameOrDerivedFrom("X"));
-
-  EXPECT_TRUE(matches("class X {}; class Y : public X {};", IsAX));
-  EXPECT_TRUE(matches("class X {};", IsAX));
-  EXPECT_TRUE(matches("class X;", IsAX));
-  EXPECT_TRUE(notMatches("class Y;", IsAX));
-  EXPECT_TRUE(notMatches("", IsAX));
-
-  DeclarationMatcher ZIsDerivedFromX =
-      cxxRecordDecl(hasName("Z"), isDerivedFrom("X"));
-  EXPECT_TRUE(
-      matches("class X {}; class Y : public X {}; class Z : public Y {};",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class X {};"
-              "template<class T> class Y : public X {};"
-              "class Z : public Y<int> {};", ZIsDerivedFromX));
-  EXPECT_TRUE(matches("class X {}; template<class T> class Z : public X {};",
-                      ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class T> class X {}; "
-              "template<class T> class Z : public X<T> {};",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class T, class U=T> class X {}; "
-              "template<class T> class Z : public X<T> {};",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<class X> class A { class Z : public X {}; };",
-                 ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class X> class A { public: class Z : public X {}; }; "
-              "class X{}; void y() { A<X>::Z z; }", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template <class T> class X {}; "
-              "template<class Y> class A { class Z : public X<Y> {}; };",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<template<class T> class X> class A { "
-                 "  class Z : public X<int> {}; };", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<template<class T> class X> class A { "
-              "  public: class Z : public X<int> {}; }; "
-              "template<class T> class X {}; void y() { A<X>::Z z; }",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<class X> class A { class Z : public X::D {}; };",
-                 ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class X> class A { public: "
-              "  class Z : public X::D {}; }; "
-              "class Y { public: class X {}; typedef X D; }; "
-              "void y() { A<Y>::Z z; }", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class X {}; typedef X Y; class Z : public Y {};",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class T> class Y { typedef typename T::U X; "
-              "  class Z : public X {}; };", ZIsDerivedFromX));
-  EXPECT_TRUE(matches("class X {}; class Z : public ::X {};",
-                      ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<class T> class X {}; "
-                "template<class T> class A { class Z : public X<T>::D {}; };",
-                ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class T> class X { public: typedef X<T> D; }; "
-              "template<class T> class A { public: "
-              "  class Z : public X<T>::D {}; }; void y() { A<int>::Z z; }",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<class X> class A { class Z : public X::D::E {}; };",
-                 ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class X {}; typedef X V; typedef V W; class Z : public W {};",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class X {}; class Y : public X {}; "
-              "typedef Y V; typedef V W; class Z : public W {};",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template<class T, class U> class X {}; "
-              "template<class T> class A { class Z : public X<T, int> {}; };",
-              ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<class X> class D { typedef X A; typedef A B; "
-                 "  typedef B C; class Z : public C {}; };",
-                 ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class X {}; typedef X A; typedef A B; "
-              "class Z : public B {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class X {}; typedef X A; typedef A B; typedef B C; "
-              "class Z : public C {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class U {}; typedef U X; typedef X V; "
-              "class Z : public V {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class Base {}; typedef Base X; "
-              "class Z : public Base {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class Base {}; typedef Base Base2; typedef Base2 X; "
-              "class Z : public Base {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("class Base {}; class Base2 {}; typedef Base2 X; "
-                 "class Z : public Base {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      matches("class A {}; typedef A X; typedef A Y; "
-              "class Z : public Y {};", ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template <typename T> class Z;"
-                 "template <> class Z<void> {};"
-                 "template <typename T> class Z : public Z<void> {};",
-                 IsDerivedFromX));
-  EXPECT_TRUE(
-      matches("template <typename T> class X;"
-              "template <> class X<void> {};"
-              "template <typename T> class X : public X<void> {};",
-              IsDerivedFromX));
-  EXPECT_TRUE(matches(
-      "class X {};"
-      "template <typename T> class Z;"
-      "template <> class Z<void> {};"
-      "template <typename T> class Z : public Z<void>, public X {};",
-      ZIsDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("template<int> struct X;"
-                 "template<int i> struct X : public X<i-1> {};",
-                 cxxRecordDecl(isDerivedFrom(recordDecl(hasName("Some"))))));
-  EXPECT_TRUE(matches(
-      "struct A {};"
-      "template<int> struct X;"
-      "template<int i> struct X : public X<i-1> {};"
-      "template<> struct X<0> : public A {};"
-      "struct B : public X<42> {};",
-      cxxRecordDecl(hasName("B"), isDerivedFrom(recordDecl(hasName("A"))))));
-
-  // FIXME: Once we have better matchers for template type matching,
-  // get rid of the Variable(...) matching and match the right template
-  // declarations directly.
-  const char *RecursiveTemplateOneParameter =
-      "class Base1 {}; class Base2 {};"
-      "template <typename T> class Z;"
-      "template <> class Z<void> : public Base1 {};"
-      "template <> class Z<int> : public Base2 {};"
-      "template <> class Z<float> : public Z<void> {};"
-      "template <> class Z<double> : public Z<int> {};"
-      "template <typename T> class Z : public Z<float>, public Z<double> {};"
-      "void f() { Z<float> z_float; Z<double> z_double; Z<char> z_char; }";
-  EXPECT_TRUE(matches(
-      RecursiveTemplateOneParameter,
-      varDecl(hasName("z_float"),
-              hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1")))))));
-  EXPECT_TRUE(notMatches(
-      RecursiveTemplateOneParameter,
-      varDecl(hasName("z_float"),
-              hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base2")))))));
-  EXPECT_TRUE(matches(
-      RecursiveTemplateOneParameter,
-      varDecl(hasName("z_char"),
-              hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1"),
-                                                isDerivedFrom("Base2")))))));
-
-  const char *RecursiveTemplateTwoParameters =
-      "class Base1 {}; class Base2 {};"
-      "template <typename T1, typename T2> class Z;"
-      "template <typename T> class Z<void, T> : public Base1 {};"
-      "template <typename T> class Z<int, T> : public Base2 {};"
-      "template <typename T> class Z<float, T> : public Z<void, T> {};"
-      "template <typename T> class Z<double, T> : public Z<int, T> {};"
-      "template <typename T1, typename T2> class Z : "
-      "    public Z<float, T2>, public Z<double, T2> {};"
-      "void f() { Z<float, void> z_float; Z<double, void> z_double; "
-      "           Z<char, void> z_char; }";
-  EXPECT_TRUE(matches(
-      RecursiveTemplateTwoParameters,
-      varDecl(hasName("z_float"),
-              hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1")))))));
-  EXPECT_TRUE(notMatches(
-      RecursiveTemplateTwoParameters,
-      varDecl(hasName("z_float"),
-              hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base2")))))));
-  EXPECT_TRUE(matches(
-      RecursiveTemplateTwoParameters,
-      varDecl(hasName("z_char"),
-              hasInitializer(hasType(cxxRecordDecl(isDerivedFrom("Base1"),
-                                                isDerivedFrom("Base2")))))));
-  EXPECT_TRUE(matches(
-      "namespace ns { class X {}; class Y : public X {}; }",
-      cxxRecordDecl(isDerivedFrom("::ns::X"))));
-  EXPECT_TRUE(notMatches(
-      "class X {}; class Y : public X {};",
-      cxxRecordDecl(isDerivedFrom("::ns::X"))));
-
-  EXPECT_TRUE(matches(
-      "class X {}; class Y : public X {};",
-    cxxRecordDecl(isDerivedFrom(recordDecl(hasName("X")).bind("test")))));
-
-  EXPECT_TRUE(matches(
-      "template<typename T> class X {};"
-      "template<typename T> using Z = X<T>;"
-      "template <typename T> class Y : Z<T> {};",
-    cxxRecordDecl(isDerivedFrom(namedDecl(hasName("X"))))));
-}
-
-TEST(DeclarationMatcher, hasMethod) {
-  EXPECT_TRUE(matches("class A { void func(); };",
-                      cxxRecordDecl(hasMethod(hasName("func")))));
-  EXPECT_TRUE(notMatches("class A { void func(); };",
-                         cxxRecordDecl(hasMethod(isPublic()))));
-}
-
-TEST(DeclarationMatcher, ClassDerivedFromDependentTemplateSpecialization) {
-  EXPECT_TRUE(matches(
-     "template <typename T> struct A {"
-     "  template <typename T2> struct F {};"
-     "};"
-     "template <typename T> struct B : A<T>::template F<T> {};"
-     "B<int> b;",
-    cxxRecordDecl(hasName("B"), isDerivedFrom(recordDecl()))));
-}
-
-TEST(DeclarationMatcher, hasDeclContext) {
-  EXPECT_TRUE(matches(
-      "namespace N {"
-      "  namespace M {"
-      "    class D {};"
-      "  }"
-      "}",
-      recordDecl(hasDeclContext(namespaceDecl(hasName("M"))))));
-  EXPECT_TRUE(notMatches(
-      "namespace N {"
-      "  namespace M {"
-      "    class D {};"
-      "  }"
-      "}",
-      recordDecl(hasDeclContext(namespaceDecl(hasName("N"))))));
-
-  EXPECT_TRUE(matches("namespace {"
-                      "  namespace M {"
-                      "    class D {};"
-                      "  }"
-                      "}",
-                      recordDecl(hasDeclContext(namespaceDecl(
-                          hasName("M"), hasDeclContext(namespaceDecl()))))));
-
-  EXPECT_TRUE(matches("class D{};", decl(hasDeclContext(decl()))));
-}
-
-TEST(DeclarationMatcher, translationUnitDecl) {
-  const std::string Code = "int MyVar1;\n"
-                           "namespace NameSpace {\n"
-                           "int MyVar2;\n"
-                           "}  // namespace NameSpace\n";
-  EXPECT_TRUE(matches(
-      Code, varDecl(hasName("MyVar1"), hasDeclContext(translationUnitDecl()))));
-  EXPECT_FALSE(matches(
-      Code, varDecl(hasName("MyVar2"), hasDeclContext(translationUnitDecl()))));
-  EXPECT_TRUE(matches(
-      Code,
-      varDecl(hasName("MyVar2"),
-              hasDeclContext(decl(hasDeclContext(translationUnitDecl()))))));
-}
-
-TEST(DeclarationMatcher, LinkageSpecification) {
-  EXPECT_TRUE(matches("extern \"C\" { void foo() {}; }", linkageSpecDecl()));
-  EXPECT_TRUE(notMatches("void foo() {};", linkageSpecDecl()));
-}
-
-TEST(ClassTemplate, DoesNotMatchClass) {
-  DeclarationMatcher ClassX = classTemplateDecl(hasName("X"));
-  EXPECT_TRUE(notMatches("class X;", ClassX));
-  EXPECT_TRUE(notMatches("class X {};", ClassX));
-}
-
-TEST(ClassTemplate, MatchesClassTemplate) {
-  DeclarationMatcher ClassX = classTemplateDecl(hasName("X"));
-  EXPECT_TRUE(matches("template<typename T> class X {};", ClassX));
-  EXPECT_TRUE(matches("class Z { template<class T> class X {}; };", ClassX));
-}
-
-TEST(ClassTemplate, DoesNotMatchClassTemplateExplicitSpecialization) {
-  EXPECT_TRUE(notMatches("template<typename T> class X { };"
-                         "template<> class X<int> { int a; };",
-              classTemplateDecl(hasName("X"),
-                                hasDescendant(fieldDecl(hasName("a"))))));
-}
-
-TEST(ClassTemplate, DoesNotMatchClassTemplatePartialSpecialization) {
-  EXPECT_TRUE(notMatches("template<typename T, typename U> class X { };"
-                         "template<typename T> class X<T, int> { int a; };",
-              classTemplateDecl(hasName("X"),
-                                hasDescendant(fieldDecl(hasName("a"))))));
-}
-
-TEST(AllOf, AllOverloadsWork) {
-  const char Program[] =
-      "struct T { };"
-      "int f(int, T*, int, int);"
-      "void g(int x) { T t; f(x, &t, 3, 4); }";
-  EXPECT_TRUE(matches(Program,
-      callExpr(allOf(callee(functionDecl(hasName("f"))),
-                     hasArgument(0, declRefExpr(to(varDecl())))))));
-  EXPECT_TRUE(matches(Program,
-      callExpr(allOf(callee(functionDecl(hasName("f"))),
-                     hasArgument(0, declRefExpr(to(varDecl()))),
-                     hasArgument(1, hasType(pointsTo(
-                                        recordDecl(hasName("T")))))))));
-  EXPECT_TRUE(matches(Program,
-      callExpr(allOf(callee(functionDecl(hasName("f"))),
-                     hasArgument(0, declRefExpr(to(varDecl()))),
-                     hasArgument(1, hasType(pointsTo(
-                                        recordDecl(hasName("T"))))),
-                     hasArgument(2, integerLiteral(equals(3)))))));
-  EXPECT_TRUE(matches(Program,
-      callExpr(allOf(callee(functionDecl(hasName("f"))),
-                     hasArgument(0, declRefExpr(to(varDecl()))),
-                     hasArgument(1, hasType(pointsTo(
-                                        recordDecl(hasName("T"))))),
-                     hasArgument(2, integerLiteral(equals(3))),
-                     hasArgument(3, integerLiteral(equals(4)))))));
-}
-
-TEST(ConstructVariadic, MismatchedTypes_Regression) {
-  EXPECT_TRUE(
-      matches("const int a = 0;",
-              internal::DynTypedMatcher::constructVariadic(
-                  internal::DynTypedMatcher::VO_AnyOf,
-                  ast_type_traits::ASTNodeKind::getFromNodeKind<QualType>(),
-                  {isConstQualified(), arrayType()})
-                  .convertTo<QualType>()));
-}
-
-TEST(DeclarationMatcher, MatchAnyOf) {
-  DeclarationMatcher YOrZDerivedFromX = cxxRecordDecl(
-      anyOf(hasName("Y"), allOf(isDerivedFrom("X"), hasName("Z"))));
-  EXPECT_TRUE(matches("class X {}; class Z : public X {};", YOrZDerivedFromX));
-  EXPECT_TRUE(matches("class Y {};", YOrZDerivedFromX));
-  EXPECT_TRUE(
-      notMatches("class X {}; class W : public X {};", YOrZDerivedFromX));
-  EXPECT_TRUE(notMatches("class Z {};", YOrZDerivedFromX));
-
-  DeclarationMatcher XOrYOrZOrU =
-      recordDecl(anyOf(hasName("X"), hasName("Y"), hasName("Z"), hasName("U")));
-  EXPECT_TRUE(matches("class X {};", XOrYOrZOrU));
-  EXPECT_TRUE(notMatches("class V {};", XOrYOrZOrU));
-
-  DeclarationMatcher XOrYOrZOrUOrV =
-      recordDecl(anyOf(hasName("X"), hasName("Y"), hasName("Z"), hasName("U"),
-                       hasName("V")));
-  EXPECT_TRUE(matches("class X {};", XOrYOrZOrUOrV));
-  EXPECT_TRUE(matches("class Y {};", XOrYOrZOrUOrV));
-  EXPECT_TRUE(matches("class Z {};", XOrYOrZOrUOrV));
-  EXPECT_TRUE(matches("class U {};", XOrYOrZOrUOrV));
-  EXPECT_TRUE(matches("class V {};", XOrYOrZOrUOrV));
-  EXPECT_TRUE(notMatches("class A {};", XOrYOrZOrUOrV));
-
-  StatementMatcher MixedTypes = stmt(anyOf(ifStmt(), binaryOperator()));
-  EXPECT_TRUE(matches("int F() { return 1 + 2; }", MixedTypes));
-  EXPECT_TRUE(matches("int F() { if (true) return 1; }", MixedTypes));
-  EXPECT_TRUE(notMatches("int F() { return 1; }", MixedTypes));
-
-  EXPECT_TRUE(
-      matches("void f() try { } catch (int) { } catch (...) { }",
-              cxxCatchStmt(anyOf(hasDescendant(varDecl()), isCatchAll()))));
-}
-
-TEST(DeclarationMatcher, MatchHas) {
-  DeclarationMatcher HasClassX = recordDecl(has(recordDecl(hasName("X"))));
-  EXPECT_TRUE(matches("class Y { class X {}; };", HasClassX));
-  EXPECT_TRUE(matches("class X {};", HasClassX));
-
-  DeclarationMatcher YHasClassX =
-      recordDecl(hasName("Y"), has(recordDecl(hasName("X"))));
-  EXPECT_TRUE(matches("class Y { class X {}; };", YHasClassX));
-  EXPECT_TRUE(notMatches("class X {};", YHasClassX));
-  EXPECT_TRUE(
-      notMatches("class Y { class Z { class X {}; }; };", YHasClassX));
-}
-
-TEST(DeclarationMatcher, MatchHasRecursiveAllOf) {
-  DeclarationMatcher Recursive =
-    recordDecl(
-      has(recordDecl(
-        has(recordDecl(hasName("X"))),
-        has(recordDecl(hasName("Y"))),
-        hasName("Z"))),
-      has(recordDecl(
-        has(recordDecl(hasName("A"))),
-        has(recordDecl(hasName("B"))),
-        hasName("C"))),
-      hasName("F"));
-
-  EXPECT_TRUE(matches(
-      "class F {"
-      "  class Z {"
-      "    class X {};"
-      "    class Y {};"
-      "  };"
-      "  class C {"
-      "    class A {};"
-      "    class B {};"
-      "  };"
-      "};", Recursive));
-
-  EXPECT_TRUE(matches(
-      "class F {"
-      "  class Z {"
-      "    class A {};"
-      "    class X {};"
-      "    class Y {};"
-      "  };"
-      "  class C {"
-      "    class X {};"
-      "    class A {};"
-      "    class B {};"
-      "  };"
-      "};", Recursive));
-
-  EXPECT_TRUE(matches(
-      "class O1 {"
-      "  class O2 {"
-      "    class F {"
-      "      class Z {"
-      "        class A {};"
-      "        class X {};"
-      "        class Y {};"
-      "      };"
-      "      class C {"
-      "        class X {};"
-      "        class A {};"
-      "        class B {};"
-      "      };"
-      "    };"
-      "  };"
-      "};", Recursive));
-}
-
-TEST(DeclarationMatcher, MatchHasRecursiveAnyOf) {
-  DeclarationMatcher Recursive =
-      recordDecl(
-          anyOf(
-              has(recordDecl(
-                  anyOf(
-                      has(recordDecl(
-                          hasName("X"))),
-                      has(recordDecl(
-                          hasName("Y"))),
-                      hasName("Z")))),
-              has(recordDecl(
-                  anyOf(
-                      hasName("C"),
-                      has(recordDecl(
-                          hasName("A"))),
-                      has(recordDecl(
-                          hasName("B")))))),
-              hasName("F")));
-
-  EXPECT_TRUE(matches("class F {};", Recursive));
-  EXPECT_TRUE(matches("class Z {};", Recursive));
-  EXPECT_TRUE(matches("class C {};", Recursive));
-  EXPECT_TRUE(matches("class M { class N { class X {}; }; };", Recursive));
-  EXPECT_TRUE(matches("class M { class N { class B {}; }; };", Recursive));
-  EXPECT_TRUE(
-      matches("class O1 { class O2 {"
-              "  class M { class N { class B {}; }; }; "
-              "}; };", Recursive));
-}
-
-TEST(DeclarationMatcher, MatchNot) {
-  DeclarationMatcher NotClassX =
-    cxxRecordDecl(
-          isDerivedFrom("Y"),
-          unless(hasName("X")));
-  EXPECT_TRUE(notMatches("", NotClassX));
-  EXPECT_TRUE(notMatches("class Y {};", NotClassX));
-  EXPECT_TRUE(matches("class Y {}; class Z : public Y {};", NotClassX));
-  EXPECT_TRUE(notMatches("class Y {}; class X : public Y {};", NotClassX));
-  EXPECT_TRUE(
-      notMatches("class Y {}; class Z {}; class X : public Y {};",
-                 NotClassX));
-
-  DeclarationMatcher ClassXHasNotClassY =
-      recordDecl(
-          hasName("X"),
-          has(recordDecl(hasName("Z"))),
-          unless(
-              has(recordDecl(hasName("Y")))));
-  EXPECT_TRUE(matches("class X { class Z {}; };", ClassXHasNotClassY));
-  EXPECT_TRUE(notMatches("class X { class Y {}; class Z {}; };",
-                         ClassXHasNotClassY));
-
-  DeclarationMatcher NamedNotRecord =
-      namedDecl(hasName("Foo"), unless(recordDecl()));
-  EXPECT_TRUE(matches("void Foo(){}", NamedNotRecord));
-  EXPECT_TRUE(notMatches("struct Foo {};", NamedNotRecord));
-}
-
-TEST(DeclarationMatcher, HasDescendant) {
-  DeclarationMatcher ZDescendantClassX =
-      recordDecl(
-          hasDescendant(recordDecl(hasName("X"))),
-          hasName("Z"));
-  EXPECT_TRUE(matches("class Z { class X {}; };", ZDescendantClassX));
-  EXPECT_TRUE(
-      matches("class Z { class Y { class X {}; }; };", ZDescendantClassX));
-  EXPECT_TRUE(
-      matches("class Z { class A { class Y { class X {}; }; }; };",
-              ZDescendantClassX));
-  EXPECT_TRUE(
-      matches("class Z { class A { class B { class Y { class X {}; }; }; }; };",
-              ZDescendantClassX));
-  EXPECT_TRUE(notMatches("class Z {};", ZDescendantClassX));
-
-  DeclarationMatcher ZDescendantClassXHasClassY =
-      recordDecl(
-          hasDescendant(recordDecl(has(recordDecl(hasName("Y"))),
-                              hasName("X"))),
-          hasName("Z"));
-  EXPECT_TRUE(matches("class Z { class X { class Y {}; }; };",
-              ZDescendantClassXHasClassY));
-  EXPECT_TRUE(
-      matches("class Z { class A { class B { class X { class Y {}; }; }; }; };",
-              ZDescendantClassXHasClassY));
-  EXPECT_TRUE(notMatches(
-      "class Z {"
-      "  class A {"
-      "    class B {"
-      "      class X {"
-      "        class C {"
-      "          class Y {};"
-      "        };"
-      "      };"
-      "    }; "
-      "  };"
-      "};", ZDescendantClassXHasClassY));
-
-  DeclarationMatcher ZDescendantClassXDescendantClassY =
-      recordDecl(
-          hasDescendant(recordDecl(hasDescendant(recordDecl(hasName("Y"))),
-                                   hasName("X"))),
-          hasName("Z"));
-  EXPECT_TRUE(
-      matches("class Z { class A { class X { class B { class Y {}; }; }; }; };",
-              ZDescendantClassXDescendantClassY));
-  EXPECT_TRUE(matches(
-      "class Z {"
-      "  class A {"
-      "    class X {"
-      "      class B {"
-      "        class Y {};"
-      "      };"
-      "      class Y {};"
-      "    };"
-      "  };"
-      "};", ZDescendantClassXDescendantClassY));
-}
-
-TEST(DeclarationMatcher, HasDescendantMemoization) {
-  DeclarationMatcher CannotMemoize =
-      decl(hasDescendant(typeLoc().bind("x")), has(decl()));
-  EXPECT_TRUE(matches("void f() { int i; }", CannotMemoize));
-}
-
-TEST(DeclarationMatcher, HasDescendantMemoizationUsesRestrictKind) {
-  auto Name = hasName("i");
-  auto VD = internal::Matcher<VarDecl>(Name).dynCastTo<Decl>();
-  auto RD = internal::Matcher<RecordDecl>(Name).dynCastTo<Decl>();
-  // Matching VD first should not make a cache hit for RD.
-  EXPECT_TRUE(notMatches("void f() { int i; }",
-                         decl(hasDescendant(VD), hasDescendant(RD))));
-  EXPECT_TRUE(notMatches("void f() { int i; }",
-                         decl(hasDescendant(RD), hasDescendant(VD))));
-  // Not matching RD first should not make a cache hit for VD either.
-  EXPECT_TRUE(matches("void f() { int i; }",
-                      decl(anyOf(hasDescendant(RD), hasDescendant(VD)))));
-}
-
-TEST(DeclarationMatcher, HasAttr) {
-  EXPECT_TRUE(matches("struct __attribute__((warn_unused)) X {};",
-                      decl(hasAttr(clang::attr::WarnUnused))));
-  EXPECT_FALSE(matches("struct X {};",
-                       decl(hasAttr(clang::attr::WarnUnused))));
-}
-
-TEST(DeclarationMatcher, MatchCudaDecl) {
-  EXPECT_TRUE(matchesWithCuda("__global__ void f() { }"
-                              "void g() { f<<<1, 2>>>(); }",
-                              cudaKernelCallExpr()));
-  EXPECT_TRUE(matchesWithCuda("__attribute__((device)) void f() {}",
-                              hasAttr(clang::attr::CUDADevice)));
-  EXPECT_TRUE(notMatchesWithCuda("void f() {}",
-                                 cudaKernelCallExpr()));
-  EXPECT_FALSE(notMatchesWithCuda("__attribute__((global)) void f() {}",
-                                  hasAttr(clang::attr::CUDAGlobal)));
-}
-
-// Implements a run method that returns whether BoundNodes contains a
-// Decl bound to Id that can be dynamically cast to T.
-// Optionally checks that the check succeeded a specific number of times.
-template <typename T>
-class VerifyIdIsBoundTo : public BoundNodesCallback {
-public:
-  // Create an object that checks that a node of type \c T was bound to \c Id.
-  // Does not check for a certain number of matches.
-  explicit VerifyIdIsBoundTo(llvm::StringRef Id)
-    : Id(Id), ExpectedCount(-1), Count(0) {}
-
-  // Create an object that checks that a node of type \c T was bound to \c Id.
-  // Checks that there were exactly \c ExpectedCount matches.
-  VerifyIdIsBoundTo(llvm::StringRef Id, int ExpectedCount)
-    : Id(Id), ExpectedCount(ExpectedCount), Count(0) {}
-
-  // Create an object that checks that a node of type \c T was bound to \c Id.
-  // Checks that there was exactly one match with the name \c ExpectedName.
-  // Note that \c T must be a NamedDecl for this to work.
-  VerifyIdIsBoundTo(llvm::StringRef Id, llvm::StringRef ExpectedName,
-                    int ExpectedCount = 1)
-      : Id(Id), ExpectedCount(ExpectedCount), Count(0),
-        ExpectedName(ExpectedName) {}
-
-  void onEndOfTranslationUnit() override {
-    if (ExpectedCount != -1)
-      EXPECT_EQ(ExpectedCount, Count);
-    if (!ExpectedName.empty())
-      EXPECT_EQ(ExpectedName, Name);
-    Count = 0;
-    Name.clear();
-  }
-
-  ~VerifyIdIsBoundTo() override {
-    EXPECT_EQ(0, Count);
-    EXPECT_EQ("", Name);
-  }
-
-  bool run(const BoundNodes *Nodes) override {
-    const BoundNodes::IDToNodeMap &M = Nodes->getMap();
-    if (Nodes->getNodeAs<T>(Id)) {
-      ++Count;
-      if (const NamedDecl *Named = Nodes->getNodeAs<NamedDecl>(Id)) {
-        Name = Named->getNameAsString();
-      } else if (const NestedNameSpecifier *NNS =
-                 Nodes->getNodeAs<NestedNameSpecifier>(Id)) {
-        llvm::raw_string_ostream OS(Name);
-        NNS->print(OS, PrintingPolicy(LangOptions()));
-      }
-      BoundNodes::IDToNodeMap::const_iterator I = M.find(Id);
-      EXPECT_NE(M.end(), I);
-      if (I != M.end())
-        EXPECT_EQ(Nodes->getNodeAs<T>(Id), I->second.get<T>());
-      return true;
-    }
-    EXPECT_TRUE(M.count(Id) == 0 ||
-                M.find(Id)->second.template get<T>() == nullptr);
-    return false;
-  }
-
-  bool run(const BoundNodes *Nodes, ASTContext *Context) override {
-    return run(Nodes);
-  }
-
-private:
-  const std::string Id;
-  const int ExpectedCount;
-  int Count;
-  const std::string ExpectedName;
-  std::string Name;
-};
-
-TEST(HasDescendant, MatchesDescendantTypes) {
-  EXPECT_TRUE(matches("void f() { int i = 3; }",
-                      decl(hasDescendant(loc(builtinType())))));
-  EXPECT_TRUE(matches("void f() { int i = 3; }",
-                      stmt(hasDescendant(builtinType()))));
-
-  EXPECT_TRUE(matches("void f() { int i = 3; }",
-                      stmt(hasDescendant(loc(builtinType())))));
-  EXPECT_TRUE(matches("void f() { int i = 3; }",
-                      stmt(hasDescendant(qualType(builtinType())))));
-
-  EXPECT_TRUE(notMatches("void f() { float f = 2.0f; }",
-                         stmt(hasDescendant(isInteger()))));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f() { int a; float c; int d; int e; }",
-      functionDecl(forEachDescendant(
-          varDecl(hasDescendant(isInteger())).bind("x"))),
-      new VerifyIdIsBoundTo<Decl>("x", 3)));
-}
-
-TEST(HasDescendant, MatchesDescendantsOfTypes) {
-  EXPECT_TRUE(matches("void f() { int*** i; }",
-                      qualType(hasDescendant(builtinType()))));
-  EXPECT_TRUE(matches("void f() { int*** i; }",
-                      qualType(hasDescendant(
-                          pointerType(pointee(builtinType()))))));
-  EXPECT_TRUE(matches("void f() { int*** i; }",
-                      typeLoc(hasDescendant(loc(builtinType())))));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f() { int*** i; }",
-      qualType(asString("int ***"), forEachDescendant(pointerType().bind("x"))),
-      new VerifyIdIsBoundTo<Type>("x", 2)));
-}
-
-TEST(Has, MatchesChildrenOfTypes) {
-  EXPECT_TRUE(matches("int i;",
-                      varDecl(hasName("i"), has(isInteger()))));
-  EXPECT_TRUE(notMatches("int** i;",
-                         varDecl(hasName("i"), has(isInteger()))));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "int (*f)(float, int);",
-      qualType(functionType(), forEach(qualType(isInteger()).bind("x"))),
-      new VerifyIdIsBoundTo<QualType>("x", 2)));
-}
-
-TEST(Has, MatchesChildTypes) {
-  EXPECT_TRUE(matches(
-      "int* i;",
-      varDecl(hasName("i"), hasType(qualType(has(builtinType()))))));
-  EXPECT_TRUE(notMatches(
-      "int* i;",
-      varDecl(hasName("i"), hasType(qualType(has(pointerType()))))));
-}
-
-TEST(ValueDecl, Matches) {
-  EXPECT_TRUE(matches("enum EnumType { EnumValue };",
-                      valueDecl(hasType(asString("enum EnumType")))));
-  EXPECT_TRUE(matches("void FunctionDecl();",
-                      valueDecl(hasType(asString("void (void)")))));
-}
-
-TEST(Enum, DoesNotMatchClasses) {
-  EXPECT_TRUE(notMatches("class X {};", enumDecl(hasName("X"))));
-}
-
-TEST(Enum, MatchesEnums) {
-  EXPECT_TRUE(matches("enum X {};", enumDecl(hasName("X"))));
-}
-
-TEST(EnumConstant, Matches) {
-  DeclarationMatcher Matcher = enumConstantDecl(hasName("A"));
-  EXPECT_TRUE(matches("enum X{ A };", Matcher));
-  EXPECT_TRUE(notMatches("enum X{ B };", Matcher));
-  EXPECT_TRUE(notMatches("enum X {};", Matcher));
-}
-
-TEST(StatementMatcher, Has) {
-  StatementMatcher HasVariableI =
-      expr(hasType(pointsTo(recordDecl(hasName("X")))),
-           has(declRefExpr(to(varDecl(hasName("i"))))));
-
-  EXPECT_TRUE(matches(
-      "class X; X *x(int); void c() { int i; x(i); }", HasVariableI));
-  EXPECT_TRUE(notMatches(
-      "class X; X *x(int); void c() { int i; x(42); }", HasVariableI));
-}
-
-TEST(StatementMatcher, HasDescendant) {
-  StatementMatcher HasDescendantVariableI =
-      expr(hasType(pointsTo(recordDecl(hasName("X")))),
-           hasDescendant(declRefExpr(to(varDecl(hasName("i"))))));
-
-  EXPECT_TRUE(matches(
-      "class X; X *x(bool); bool b(int); void c() { int i; x(b(i)); }",
-      HasDescendantVariableI));
-  EXPECT_TRUE(notMatches(
-      "class X; X *x(bool); bool b(int); void c() { int i; x(b(42)); }",
-      HasDescendantVariableI));
-}
-
-TEST(TypeMatcher, MatchesClassType) {
-  TypeMatcher TypeA = hasDeclaration(recordDecl(hasName("A")));
-
-  EXPECT_TRUE(matches("class A { public: A *a; };", TypeA));
-  EXPECT_TRUE(notMatches("class A {};", TypeA));
-
-  TypeMatcher TypeDerivedFromA =
-      hasDeclaration(cxxRecordDecl(isDerivedFrom("A")));
-
-  EXPECT_TRUE(matches("class A {}; class B : public A { public: B *b; };",
-              TypeDerivedFromA));
-  EXPECT_TRUE(notMatches("class A {};", TypeA));
-
-  TypeMatcher TypeAHasClassB = hasDeclaration(
-      recordDecl(hasName("A"), has(recordDecl(hasName("B")))));
-
-  EXPECT_TRUE(
-      matches("class A { public: A *a; class B {}; };", TypeAHasClassB));
-
-  EXPECT_TRUE(matchesC("struct S {}; void f(void) { struct S s; }",
-                       varDecl(hasType(namedDecl(hasName("S"))))));
-}
-
-TEST(TypeMatcher, MatchesDeclTypes) {
-  // TypedefType -> TypedefNameDecl
-  EXPECT_TRUE(matches("typedef int I; void f(I i);",
-                      parmVarDecl(hasType(namedDecl(hasName("I"))))));
-  // ObjCObjectPointerType
-  EXPECT_TRUE(matchesObjC("@interface Foo @end void f(Foo *f);",
-                          parmVarDecl(hasType(objcObjectPointerType()))));
-  // ObjCObjectPointerType -> ObjCInterfaceType -> ObjCInterfaceDecl
-  EXPECT_TRUE(matchesObjC(
-      "@interface Foo @end void f(Foo *f);",
-      parmVarDecl(hasType(pointsTo(objcInterfaceDecl(hasName("Foo")))))));
-  // TemplateTypeParmType
-  EXPECT_TRUE(matches("template <typename T> void f(T t);",
-                      parmVarDecl(hasType(templateTypeParmType()))));
-  // TemplateTypeParmType -> TemplateTypeParmDecl
-  EXPECT_TRUE(matches("template <typename T> void f(T t);",
-                      parmVarDecl(hasType(namedDecl(hasName("T"))))));
-  // InjectedClassNameType
-  EXPECT_TRUE(matches("template <typename T> struct S {"
-                      "  void f(S s);"
-                      "};",
-                      parmVarDecl(hasType(injectedClassNameType()))));
-  EXPECT_TRUE(notMatches("template <typename T> struct S {"
-                         "  void g(S<T> s);"
-                         "};",
-                         parmVarDecl(hasType(injectedClassNameType()))));
-  // InjectedClassNameType -> CXXRecordDecl
-  EXPECT_TRUE(matches("template <typename T> struct S {"
-                      "  void f(S s);"
-                      "};",
-                      parmVarDecl(hasType(namedDecl(hasName("S"))))));
-
-  static const char Using[] = "template <typename T>"
-                              "struct Base {"
-                              "  typedef T Foo;"
-                              "};"
-                              ""
-                              "template <typename T>"
-                              "struct S : private Base<T> {"
-                              "  using typename Base<T>::Foo;"
-                              "  void f(Foo);"
-                              "};";
-  // UnresolvedUsingTypenameDecl
-  EXPECT_TRUE(matches(Using, unresolvedUsingTypenameDecl(hasName("Foo"))));
-  // UnresolvedUsingTypenameType -> UnresolvedUsingTypenameDecl
-  EXPECT_TRUE(matches(Using, parmVarDecl(hasType(namedDecl(hasName("Foo"))))));
-}
-
-TEST(Matcher, BindMatchedNodes) {
-  DeclarationMatcher ClassX = has(recordDecl(hasName("::X")).bind("x"));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue("class X {};",
-      ClassX, new VerifyIdIsBoundTo<CXXRecordDecl>("x")));
-
-  EXPECT_TRUE(matchAndVerifyResultFalse("class X {};",
-      ClassX, new VerifyIdIsBoundTo<CXXRecordDecl>("other-id")));
-
-  TypeMatcher TypeAHasClassB = hasDeclaration(
-      recordDecl(hasName("A"), has(recordDecl(hasName("B")).bind("b"))));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue("class A { public: A *a; class B {}; };",
-      TypeAHasClassB,
-      new VerifyIdIsBoundTo<Decl>("b")));
-
-  StatementMatcher MethodX =
-      callExpr(callee(cxxMethodDecl(hasName("x")))).bind("x");
-
-  EXPECT_TRUE(matchAndVerifyResultTrue("class A { void x() { x(); } };",
-      MethodX,
-      new VerifyIdIsBoundTo<CXXMemberCallExpr>("x")));
-}
-
-TEST(Matcher, BindTheSameNameInAlternatives) {
-  StatementMatcher matcher = anyOf(
-      binaryOperator(hasOperatorName("+"),
-                     hasLHS(expr().bind("x")),
-                     hasRHS(integerLiteral(equals(0)))),
-      binaryOperator(hasOperatorName("+"),
-                     hasLHS(integerLiteral(equals(0))),
-                     hasRHS(expr().bind("x"))));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      // The first branch of the matcher binds x to 0 but then fails.
-      // The second branch binds x to f() and succeeds.
-      "int f() { return 0 + f(); }",
-      matcher,
-      new VerifyIdIsBoundTo<CallExpr>("x")));
-}
-
-TEST(Matcher, BindsIDForMemoizedResults) {
-  // Using the same matcher in two match expressions will make memoization
-  // kick in.
-  DeclarationMatcher ClassX = recordDecl(hasName("X")).bind("x");
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { class B { class X {}; }; };",
-      DeclarationMatcher(anyOf(
-          recordDecl(hasName("A"), hasDescendant(ClassX)),
-          recordDecl(hasName("B"), hasDescendant(ClassX)))),
-      new VerifyIdIsBoundTo<Decl>("x", 2)));
-}
-
-TEST(HasDeclaration, HasDeclarationOfEnumType) {
-  EXPECT_TRUE(matches("enum X {}; void y(X *x) { x; }",
-                      expr(hasType(pointsTo(
-                          qualType(hasDeclaration(enumDecl(hasName("X")))))))));
-}
-
-TEST(HasDeclaration, HasGetDeclTraitTest) {
-  EXPECT_TRUE(internal::has_getDecl<TypedefType>::value);
-  EXPECT_TRUE(internal::has_getDecl<RecordType>::value);
-  EXPECT_FALSE(internal::has_getDecl<TemplateSpecializationType>::value);
-}
-
-TEST(HasDeclaration, HasDeclarationOfTypeWithDecl) {
-  EXPECT_TRUE(matches("typedef int X; X a;",
-                      varDecl(hasName("a"),
-                              hasType(typedefType(hasDeclaration(decl()))))));
-
-  // FIXME: Add tests for other types with getDecl() (e.g. RecordType)
-}
-
-TEST(HasDeclaration, HasDeclarationOfTemplateSpecializationType) {
-  EXPECT_TRUE(matches("template <typename T> class A {}; A<int> a;",
-                      varDecl(hasType(templateSpecializationType(
-                          hasDeclaration(namedDecl(hasName("A"))))))));
-}
-
-TEST(HasType, TakesQualTypeMatcherAndMatchesExpr) {
-  TypeMatcher ClassX = hasDeclaration(recordDecl(hasName("X")));
-  EXPECT_TRUE(
-      matches("class X {}; void y(X &x) { x; }", expr(hasType(ClassX))));
-  EXPECT_TRUE(
-      notMatches("class X {}; void y(X *x) { x; }",
-                 expr(hasType(ClassX))));
-  EXPECT_TRUE(
-      matches("class X {}; void y(X *x) { x; }",
-              expr(hasType(pointsTo(ClassX)))));
-}
-
-TEST(HasType, TakesQualTypeMatcherAndMatchesValueDecl) {
-  TypeMatcher ClassX = hasDeclaration(recordDecl(hasName("X")));
-  EXPECT_TRUE(
-      matches("class X {}; void y() { X x; }", varDecl(hasType(ClassX))));
-  EXPECT_TRUE(
-      notMatches("class X {}; void y() { X *x; }", varDecl(hasType(ClassX))));
-  EXPECT_TRUE(
-      matches("class X {}; void y() { X *x; }",
-              varDecl(hasType(pointsTo(ClassX)))));
-}
-
-TEST(HasType, TakesDeclMatcherAndMatchesExpr) {
-  DeclarationMatcher ClassX = recordDecl(hasName("X"));
-  EXPECT_TRUE(
-      matches("class X {}; void y(X &x) { x; }", expr(hasType(ClassX))));
-  EXPECT_TRUE(
-      notMatches("class X {}; void y(X *x) { x; }",
-                 expr(hasType(ClassX))));
-}
-
-TEST(HasType, TakesDeclMatcherAndMatchesValueDecl) {
-  DeclarationMatcher ClassX = recordDecl(hasName("X"));
-  EXPECT_TRUE(
-      matches("class X {}; void y() { X x; }", varDecl(hasType(ClassX))));
-  EXPECT_TRUE(
-      notMatches("class X {}; void y() { X *x; }", varDecl(hasType(ClassX))));
-}
-
-TEST(HasTypeLoc, MatchesDeclaratorDecls) {
-  EXPECT_TRUE(matches("int x;",
-                      varDecl(hasName("x"), hasTypeLoc(loc(asString("int"))))));
-
-  // Make sure we don't crash on implicit constructors.
-  EXPECT_TRUE(notMatches("class X {}; X x;",
-                         declaratorDecl(hasTypeLoc(loc(asString("int"))))));
-}
-
-TEST(Matcher, Call) {
-  // FIXME: Do we want to overload Call() to directly take
-  // Matcher<Decl>, too?
-  StatementMatcher MethodX =
-      callExpr(hasDeclaration(cxxMethodDecl(hasName("x"))));
-
-  EXPECT_TRUE(matches("class Y { void x() { x(); } };", MethodX));
-  EXPECT_TRUE(notMatches("class Y { void x() {} };", MethodX));
-
-  StatementMatcher MethodOnY =
-      cxxMemberCallExpr(on(hasType(recordDecl(hasName("Y")))));
-
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z() { Y y; y.x(); }",
-              MethodOnY));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z(Y &y) { y.x(); }",
-              MethodOnY));
-  EXPECT_TRUE(
-      notMatches("class Y { public: void x(); }; void z(Y *&y) { y->x(); }",
-                 MethodOnY));
-  EXPECT_TRUE(
-      notMatches("class Y { public: void x(); }; void z(Y y[]) { y->x(); }",
-                 MethodOnY));
-  EXPECT_TRUE(
-      notMatches("class Y { public: void x(); }; void z() { Y *y; y->x(); }",
-                 MethodOnY));
-
-  StatementMatcher MethodOnYPointer =
-      cxxMemberCallExpr(on(hasType(pointsTo(recordDecl(hasName("Y"))))));
-
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z() { Y *y; y->x(); }",
-              MethodOnYPointer));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z(Y *&y) { y->x(); }",
-              MethodOnYPointer));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z(Y y[]) { y->x(); }",
-              MethodOnYPointer));
-  EXPECT_TRUE(
-      notMatches("class Y { public: void x(); }; void z() { Y y; y.x(); }",
-                 MethodOnYPointer));
-  EXPECT_TRUE(
-      notMatches("class Y { public: void x(); }; void z(Y &y) { y.x(); }",
-                 MethodOnYPointer));
-}
-
-TEST(Matcher, Lambda) {
-  EXPECT_TRUE(matches("auto f = [] (int i) { return i; };",
-                      lambdaExpr()));
-}
-
-TEST(Matcher, ForRange) {
-  EXPECT_TRUE(matches("int as[] = { 1, 2, 3 };"
-                      "void f() { for (auto &a : as); }",
-                      cxxForRangeStmt()));
-  EXPECT_TRUE(notMatches("void f() { for (int i; i<5; ++i); }",
-                         cxxForRangeStmt()));
-}
-
-TEST(Matcher, SubstNonTypeTemplateParm) {
-  EXPECT_FALSE(matches("template<int N>\n"
-                       "struct A {  static const int n = 0; };\n"
-                       "struct B : public A<42> {};",
-                       substNonTypeTemplateParmExpr()));
-  EXPECT_TRUE(matches("template<int N>\n"
-                      "struct A {  static const int n = N; };\n"
-                      "struct B : public A<42> {};",
-                      substNonTypeTemplateParmExpr()));
-}
-
-TEST(Matcher, NonTypeTemplateParmDecl) {
-  EXPECT_TRUE(matches("template <int N> void f();",
-                      nonTypeTemplateParmDecl(hasName("N"))));
-  EXPECT_TRUE(
-      notMatches("template <typename T> void f();", nonTypeTemplateParmDecl()));
-}
-
-TEST(Matcher, templateTypeParmDecl) {
-  EXPECT_TRUE(matches("template <typename T> void f();",
-                      templateTypeParmDecl(hasName("T"))));
-  EXPECT_TRUE(
-      notMatches("template <int N> void f();", templateTypeParmDecl()));
-}
-
-TEST(Matcher, UserDefinedLiteral) {
-  EXPECT_TRUE(matches("constexpr char operator \"\" _inc (const char i) {"
-                      "  return i + 1;"
-                      "}"
-                      "char c = 'a'_inc;",
-                      userDefinedLiteral()));
-}
-
-TEST(Matcher, FlowControl) {
-  EXPECT_TRUE(matches("void f() { while(true) { break; } }", breakStmt()));
-  EXPECT_TRUE(matches("void f() { while(true) { continue; } }",
-                      continueStmt()));
-  EXPECT_TRUE(matches("void f() { goto FOO; FOO: ;}", gotoStmt()));
-  EXPECT_TRUE(matches("void f() { goto FOO; FOO: ;}", labelStmt()));
-  EXPECT_TRUE(matches("void f() { return; }", returnStmt()));
-}
-
-TEST(HasType, MatchesAsString) {
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z() {Y* y; y->x(); }",
-              cxxMemberCallExpr(on(hasType(asString("class Y *"))))));
-  EXPECT_TRUE(
-      matches("class X { void x(int x) {} };",
-              cxxMethodDecl(hasParameter(0, hasType(asString("int"))))));
-  EXPECT_TRUE(matches("namespace ns { struct A {}; }  struct B { ns::A a; };",
-      fieldDecl(hasType(asString("ns::A")))));
-  EXPECT_TRUE(matches("namespace { struct A {}; }  struct B { A a; };",
-      fieldDecl(hasType(asString("struct (anonymous namespace)::A")))));
-}
-
-TEST(Matcher, OverloadedOperatorCall) {
-  StatementMatcher OpCall = cxxOperatorCallExpr();
-  // Unary operator
-  EXPECT_TRUE(matches("class Y { }; "
-              "bool operator!(Y x) { return false; }; "
-              "Y y; bool c = !y;", OpCall));
-  // No match -- special operators like "new", "delete"
-  // FIXME: operator new takes size_t, for which we need stddef.h, for which
-  // we need to figure out include paths in the test.
-  // EXPECT_TRUE(NotMatches("#include <stddef.h>\n"
-  //             "class Y { }; "
-  //             "void *operator new(size_t size) { return 0; } "
-  //             "Y *y = new Y;", OpCall));
-  EXPECT_TRUE(notMatches("class Y { }; "
-              "void operator delete(void *p) { } "
-              "void a() {Y *y = new Y; delete y;}", OpCall));
-  // Binary operator
-  EXPECT_TRUE(matches("class Y { }; "
-              "bool operator&&(Y x, Y y) { return true; }; "
-              "Y a; Y b; bool c = a && b;",
-              OpCall));
-  // No match -- normal operator, not an overloaded one.
-  EXPECT_TRUE(notMatches("bool x = true, y = true; bool t = x && y;", OpCall));
-  EXPECT_TRUE(notMatches("int t = 5 << 2;", OpCall));
-}
-
-TEST(Matcher, HasOperatorNameForOverloadedOperatorCall) {
-  StatementMatcher OpCallAndAnd =
-      cxxOperatorCallExpr(hasOverloadedOperatorName("&&"));
-  EXPECT_TRUE(matches("class Y { }; "
-              "bool operator&&(Y x, Y y) { return true; }; "
-              "Y a; Y b; bool c = a && b;", OpCallAndAnd));
-  StatementMatcher OpCallLessLess =
-      cxxOperatorCallExpr(hasOverloadedOperatorName("<<"));
-  EXPECT_TRUE(notMatches("class Y { }; "
-              "bool operator&&(Y x, Y y) { return true; }; "
-              "Y a; Y b; bool c = a && b;",
-              OpCallLessLess));
-  StatementMatcher OpStarCall =
-      cxxOperatorCallExpr(hasOverloadedOperatorName("*"));
-  EXPECT_TRUE(matches("class Y; int operator*(Y &); void f(Y &y) { *y; }",
-              OpStarCall));
-  DeclarationMatcher ClassWithOpStar =
-    cxxRecordDecl(hasMethod(hasOverloadedOperatorName("*")));
-  EXPECT_TRUE(matches("class Y { int operator*(); };",
-                      ClassWithOpStar));
-  EXPECT_TRUE(notMatches("class Y { void myOperator(); };",
-              ClassWithOpStar)) ;
-  DeclarationMatcher AnyOpStar = functionDecl(hasOverloadedOperatorName("*"));
-  EXPECT_TRUE(matches("class Y; int operator*(Y &);", AnyOpStar));
-  EXPECT_TRUE(matches("class Y { int operator*(); };", AnyOpStar));
-}
-
-TEST(Matcher, NestedOverloadedOperatorCalls) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class Y { }; "
-      "Y& operator&&(Y& x, Y& y) { return x; }; "
-      "Y a; Y b; Y c; Y d = a && b && c;",
-      cxxOperatorCallExpr(hasOverloadedOperatorName("&&")).bind("x"),
-      new VerifyIdIsBoundTo<CXXOperatorCallExpr>("x", 2)));
-  EXPECT_TRUE(matches("class Y { }; "
-                      "Y& operator&&(Y& x, Y& y) { return x; }; "
-                      "Y a; Y b; Y c; Y d = a && b && c;",
-                      cxxOperatorCallExpr(hasParent(cxxOperatorCallExpr()))));
-  EXPECT_TRUE(
-      matches("class Y { }; "
-              "Y& operator&&(Y& x, Y& y) { return x; }; "
-              "Y a; Y b; Y c; Y d = a && b && c;",
-              cxxOperatorCallExpr(hasDescendant(cxxOperatorCallExpr()))));
-}
-
-TEST(Matcher, ThisPointerType) {
-  StatementMatcher MethodOnY =
-    cxxMemberCallExpr(thisPointerType(recordDecl(hasName("Y"))));
-
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z() { Y y; y.x(); }",
-              MethodOnY));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z(Y &y) { y.x(); }",
-              MethodOnY));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z(Y *&y) { y->x(); }",
-              MethodOnY));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z(Y y[]) { y->x(); }",
-              MethodOnY));
-  EXPECT_TRUE(
-      matches("class Y { public: void x(); }; void z() { Y *y; y->x(); }",
-              MethodOnY));
-
-  EXPECT_TRUE(matches(
-      "class Y {"
-      "  public: virtual void x();"
-      "};"
-      "class X : public Y {"
-      "  public: virtual void x();"
-      "};"
-      "void z() { X *x; x->Y::x(); }", MethodOnY));
-}
-
-TEST(Matcher, VariableUsage) {
-  StatementMatcher Reference =
-      declRefExpr(to(
-          varDecl(hasInitializer(
-              cxxMemberCallExpr(thisPointerType(recordDecl(hasName("Y"))))))));
-
-  EXPECT_TRUE(matches(
-      "class Y {"
-      " public:"
-      "  bool x() const;"
-      "};"
-      "void z(const Y &y) {"
-      "  bool b = y.x();"
-      "  if (b) {}"
-      "}", Reference));
-
-  EXPECT_TRUE(notMatches(
-      "class Y {"
-      " public:"
-      "  bool x() const;"
-      "};"
-      "void z(const Y &y) {"
-      "  bool b = y.x();"
-      "}", Reference));
-}
-
-TEST(Matcher, VarDecl_Storage) {
-  auto M = varDecl(hasName("X"), hasLocalStorage());
-  EXPECT_TRUE(matches("void f() { int X; }", M));
-  EXPECT_TRUE(notMatches("int X;", M));
-  EXPECT_TRUE(notMatches("void f() { static int X; }", M));
-
-  M = varDecl(hasName("X"), hasGlobalStorage());
-  EXPECT_TRUE(notMatches("void f() { int X; }", M));
-  EXPECT_TRUE(matches("int X;", M));
-  EXPECT_TRUE(matches("void f() { static int X; }", M));
-}
-
-TEST(Matcher, VarDecl_StorageDuration) {
-  std::string T =
-      "void f() { int x; static int y; } int a;";
-
-  EXPECT_TRUE(matches(T, varDecl(hasName("x"), hasAutomaticStorageDuration())));
-  EXPECT_TRUE(
-      notMatches(T, varDecl(hasName("y"), hasAutomaticStorageDuration())));
-  EXPECT_TRUE(
-      notMatches(T, varDecl(hasName("a"), hasAutomaticStorageDuration())));
-
-  EXPECT_TRUE(matches(T, varDecl(hasName("y"), hasStaticStorageDuration())));
-  EXPECT_TRUE(matches(T, varDecl(hasName("a"), hasStaticStorageDuration())));
-  EXPECT_TRUE(notMatches(T, varDecl(hasName("x"), hasStaticStorageDuration())));
-
-  // FIXME: It is really hard to test with thread_local itself because not all
-  // targets support TLS, which causes this to be an error depending on what
-  // platform the test is being run on. We do not have access to the TargetInfo
-  // object to be able to test whether the platform supports TLS or not.
-  EXPECT_TRUE(notMatches(T, varDecl(hasName("x"), hasThreadStorageDuration())));
-  EXPECT_TRUE(notMatches(T, varDecl(hasName("y"), hasThreadStorageDuration())));
-  EXPECT_TRUE(notMatches(T, varDecl(hasName("a"), hasThreadStorageDuration())));
-}
-
-TEST(Matcher, FindsVarDeclInFunctionParameter) {
-  EXPECT_TRUE(matches(
-      "void f(int i) {}",
-      varDecl(hasName("i"))));
-}
-
-TEST(Matcher, CalledVariable) {
-  StatementMatcher CallOnVariableY =
-      cxxMemberCallExpr(on(declRefExpr(to(varDecl(hasName("y"))))));
-
-  EXPECT_TRUE(matches(
-      "class Y { public: void x() { Y y; y.x(); } };", CallOnVariableY));
-  EXPECT_TRUE(matches(
-      "class Y { public: void x() const { Y y; y.x(); } };", CallOnVariableY));
-  EXPECT_TRUE(matches(
-      "class Y { public: void x(); };"
-      "class X : public Y { void z() { X y; y.x(); } };", CallOnVariableY));
-  EXPECT_TRUE(matches(
-      "class Y { public: void x(); };"
-      "class X : public Y { void z() { X *y; y->x(); } };", CallOnVariableY));
-  EXPECT_TRUE(notMatches(
-      "class Y { public: void x(); };"
-      "class X : public Y { void z() { unsigned long y; ((X*)y)->x(); } };",
-      CallOnVariableY));
-}
-
-TEST(UnaryExprOrTypeTraitExpr, MatchesSizeOfAndAlignOf) {
-  EXPECT_TRUE(matches("void x() { int a = sizeof(a); }",
-                      unaryExprOrTypeTraitExpr()));
-  EXPECT_TRUE(notMatches("void x() { int a = sizeof(a); }",
-                         alignOfExpr(anything())));
-  // FIXME: Uncomment once alignof is enabled.
-  // EXPECT_TRUE(matches("void x() { int a = alignof(a); }",
-  //                     unaryExprOrTypeTraitExpr()));
-  // EXPECT_TRUE(notMatches("void x() { int a = alignof(a); }",
-  //                        sizeOfExpr()));
-}
-
-TEST(UnaryExpressionOrTypeTraitExpression, MatchesCorrectType) {
-  EXPECT_TRUE(matches("void x() { int a = sizeof(a); }", sizeOfExpr(
-      hasArgumentOfType(asString("int")))));
-  EXPECT_TRUE(notMatches("void x() { int a = sizeof(a); }", sizeOfExpr(
-      hasArgumentOfType(asString("float")))));
-  EXPECT_TRUE(matches(
-      "struct A {}; void x() { A a; int b = sizeof(a); }",
-      sizeOfExpr(hasArgumentOfType(hasDeclaration(recordDecl(hasName("A")))))));
-  EXPECT_TRUE(notMatches("void x() { int a = sizeof(a); }", sizeOfExpr(
-      hasArgumentOfType(hasDeclaration(recordDecl(hasName("string")))))));
-}
-
-TEST(MemberExpression, DoesNotMatchClasses) {
-  EXPECT_TRUE(notMatches("class Y { void x() {} };", memberExpr()));
-}
-
-TEST(MemberExpression, MatchesMemberFunctionCall) {
-  EXPECT_TRUE(matches("class Y { void x() { x(); } };", memberExpr()));
-}
-
-TEST(MemberExpression, MatchesVariable) {
-  EXPECT_TRUE(
-      matches("class Y { void x() { this->y; } int y; };", memberExpr()));
-  EXPECT_TRUE(
-      matches("class Y { void x() { y; } int y; };", memberExpr()));
-  EXPECT_TRUE(
-      matches("class Y { void x() { Y y; y.y; } int y; };", memberExpr()));
-}
-
-TEST(MemberExpression, MatchesStaticVariable) {
-  EXPECT_TRUE(matches("class Y { void x() { this->y; } static int y; };",
-              memberExpr()));
-  EXPECT_TRUE(notMatches("class Y { void x() { y; } static int y; };",
-              memberExpr()));
-  EXPECT_TRUE(notMatches("class Y { void x() { Y::y; } static int y; };",
-              memberExpr()));
-}
-
-TEST(IsInteger, MatchesIntegers) {
-  EXPECT_TRUE(matches("int i = 0;", varDecl(hasType(isInteger()))));
-  EXPECT_TRUE(matches(
-      "long long i = 0; void f(long long) { }; void g() {f(i);}",
-      callExpr(hasArgument(0, declRefExpr(
-                                  to(varDecl(hasType(isInteger()))))))));
-}
-
-TEST(IsInteger, ReportsNoFalsePositives) {
-  EXPECT_TRUE(notMatches("int *i;", varDecl(hasType(isInteger()))));
-  EXPECT_TRUE(notMatches("struct T {}; T t; void f(T *) { }; void g() {f(&t);}",
-                      callExpr(hasArgument(0, declRefExpr(
-                          to(varDecl(hasType(isInteger()))))))));
-}
-
-TEST(IsAnyCharacter, MatchesCharacters) {
-  EXPECT_TRUE(matches("char i = 0;", varDecl(hasType(isAnyCharacter()))));
-}
-
-TEST(IsAnyCharacter, ReportsNoFalsePositives) {
-  EXPECT_TRUE(notMatches("int i;", varDecl(hasType(isAnyCharacter()))));
-}
-
-TEST(IsArrow, MatchesMemberVariablesViaArrow) {
-  EXPECT_TRUE(matches("class Y { void x() { this->y; } int y; };",
-              memberExpr(isArrow())));
-  EXPECT_TRUE(matches("class Y { void x() { y; } int y; };",
-              memberExpr(isArrow())));
-  EXPECT_TRUE(notMatches("class Y { void x() { (*this).y; } int y; };",
-              memberExpr(isArrow())));
-}
-
-TEST(IsArrow, MatchesStaticMemberVariablesViaArrow) {
-  EXPECT_TRUE(matches("class Y { void x() { this->y; } static int y; };",
-              memberExpr(isArrow())));
-  EXPECT_TRUE(notMatches("class Y { void x() { y; } static int y; };",
-              memberExpr(isArrow())));
-  EXPECT_TRUE(notMatches("class Y { void x() { (*this).y; } static int y; };",
-              memberExpr(isArrow())));
-}
-
-TEST(IsArrow, MatchesMemberCallsViaArrow) {
-  EXPECT_TRUE(matches("class Y { void x() { this->x(); } };",
-              memberExpr(isArrow())));
-  EXPECT_TRUE(matches("class Y { void x() { x(); } };",
-              memberExpr(isArrow())));
-  EXPECT_TRUE(notMatches("class Y { void x() { Y y; y.x(); } };",
-              memberExpr(isArrow())));
-}
-
-TEST(Callee, MatchesDeclarations) {
-  StatementMatcher CallMethodX = callExpr(callee(cxxMethodDecl(hasName("x"))));
-
-  EXPECT_TRUE(matches("class Y { void x() { x(); } };", CallMethodX));
-  EXPECT_TRUE(notMatches("class Y { void x() {} };", CallMethodX));
-
-  CallMethodX = callExpr(callee(cxxConversionDecl()));
-  EXPECT_TRUE(
-      matches("struct Y { operator int() const; }; int i = Y();", CallMethodX));
-  EXPECT_TRUE(notMatches("struct Y { operator int() const; }; Y y = Y();",
-                         CallMethodX));
-}
-
-TEST(ConversionDeclaration, IsExplicit) {
-  EXPECT_TRUE(matches("struct S { explicit operator int(); };",
-                      cxxConversionDecl(isExplicit())));
-  EXPECT_TRUE(notMatches("struct S { operator int(); };",
-                         cxxConversionDecl(isExplicit())));
-}
-
-TEST(Callee, MatchesMemberExpressions) {
-  EXPECT_TRUE(matches("class Y { void x() { this->x(); } };",
-              callExpr(callee(memberExpr()))));
-  EXPECT_TRUE(
-      notMatches("class Y { void x() { this->x(); } };", callExpr(callee(callExpr()))));
-}
-
-TEST(Function, MatchesFunctionDeclarations) {
-  StatementMatcher CallFunctionF = callExpr(callee(functionDecl(hasName("f"))));
-
-  EXPECT_TRUE(matches("void f() { f(); }", CallFunctionF));
-  EXPECT_TRUE(notMatches("void f() { }", CallFunctionF));
-
-  if (llvm::Triple(llvm::sys::getDefaultTargetTriple()).getOS() !=
-      llvm::Triple::Win32) {
-    // FIXME: Make this work for MSVC.
-    // Dependent contexts, but a non-dependent call.
-    EXPECT_TRUE(matches("void f(); template <int N> void g() { f(); }",
-                        CallFunctionF));
-    EXPECT_TRUE(
-        matches("void f(); template <int N> struct S { void g() { f(); } };",
-                CallFunctionF));
-  }
-
-  // Depedent calls don't match.
-  EXPECT_TRUE(
-      notMatches("void f(int); template <typename T> void g(T t) { f(t); }",
-                 CallFunctionF));
-  EXPECT_TRUE(
-      notMatches("void f(int);"
-                 "template <typename T> struct S { void g(T t) { f(t); } };",
-                 CallFunctionF));
-
-  EXPECT_TRUE(matches("void f(...);", functionDecl(isVariadic())));
-  EXPECT_TRUE(notMatches("void f(int);", functionDecl(isVariadic())));
-  EXPECT_TRUE(notMatches("template <typename... Ts> void f(Ts...);",
-                         functionDecl(isVariadic())));
-  EXPECT_TRUE(notMatches("void f();", functionDecl(isVariadic())));
-  EXPECT_TRUE(notMatchesC("void f();", functionDecl(isVariadic())));
-}
-
-TEST(FunctionTemplate, MatchesFunctionTemplateDeclarations) {
-  EXPECT_TRUE(
-      matches("template <typename T> void f(T t) {}",
-      functionTemplateDecl(hasName("f"))));
-}
-
-TEST(FunctionTemplate, DoesNotMatchFunctionDeclarations) {
-  EXPECT_TRUE(
-      notMatches("void f(double d); void f(int t) {}",
-      functionTemplateDecl(hasName("f"))));
-}
-
-TEST(FunctionTemplate, DoesNotMatchFunctionTemplateSpecializations) {
-  EXPECT_TRUE(
-      notMatches("void g(); template <typename T> void f(T t) {}"
-                 "template <> void f(int t) { g(); }",
-      functionTemplateDecl(hasName("f"),
-                           hasDescendant(declRefExpr(to(
-                               functionDecl(hasName("g"))))))));
-}
-
-TEST(Matcher, Argument) {
-  StatementMatcher CallArgumentY = callExpr(
-      hasArgument(0, declRefExpr(to(varDecl(hasName("y"))))));
-
-  EXPECT_TRUE(matches("void x(int) { int y; x(y); }", CallArgumentY));
-  EXPECT_TRUE(
-      matches("class X { void x(int) { int y; x(y); } };", CallArgumentY));
-  EXPECT_TRUE(notMatches("void x(int) { int z; x(z); }", CallArgumentY));
-
-  StatementMatcher WrongIndex = callExpr(
-      hasArgument(42, declRefExpr(to(varDecl(hasName("y"))))));
-  EXPECT_TRUE(notMatches("void x(int) { int y; x(y); }", WrongIndex));
-}
-
-TEST(Matcher, AnyArgument) {
-  StatementMatcher CallArgumentY = callExpr(
-      hasAnyArgument(declRefExpr(to(varDecl(hasName("y"))))));
-  EXPECT_TRUE(matches("void x(int, int) { int y; x(1, y); }", CallArgumentY));
-  EXPECT_TRUE(matches("void x(int, int) { int y; x(y, 42); }", CallArgumentY));
-  EXPECT_TRUE(notMatches("void x(int, int) { x(1, 2); }", CallArgumentY));
-}
-
-TEST(ForEachArgumentWithParam, ReportsNoFalsePositives) {
-  StatementMatcher ArgumentY =
-      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
-  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
-  StatementMatcher CallExpr =
-      callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
-
-  // IntParam does not match.
-  EXPECT_TRUE(notMatches("void f(int* i) { int* y; f(y); }", CallExpr));
-  // ArgumentY does not match.
-  EXPECT_TRUE(notMatches("void f(int i) { int x; f(x); }", CallExpr));
-}
-
-TEST(ForEachArgumentWithParam, MatchesCXXMemberCallExpr) {
-  StatementMatcher ArgumentY =
-      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
-  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
-  StatementMatcher CallExpr =
-      callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "struct S {"
-      "  const S& operator[](int i) { return *this; }"
-      "};"
-      "void f(S S1) {"
-      "  int y = 1;"
-      "  S1[y];"
-      "}",
-      CallExpr, new VerifyIdIsBoundTo<ParmVarDecl>("param", 1)));
-
-  StatementMatcher CallExpr2 =
-      callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "struct S {"
-      "  static void g(int i);"
-      "};"
-      "void f() {"
-      "  int y = 1;"
-      "  S::g(y);"
-      "}",
-      CallExpr2, new VerifyIdIsBoundTo<ParmVarDecl>("param", 1)));
-}
-
-TEST(ForEachArgumentWithParam, MatchesCallExpr) {
-  StatementMatcher ArgumentY =
-      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
-  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
-  StatementMatcher CallExpr =
-      callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
-
-  EXPECT_TRUE(
-      matchAndVerifyResultTrue("void f(int i) { int y; f(y); }", CallExpr,
-                               new VerifyIdIsBoundTo<ParmVarDecl>("param")));
-  EXPECT_TRUE(
-      matchAndVerifyResultTrue("void f(int i) { int y; f(y); }", CallExpr,
-                               new VerifyIdIsBoundTo<DeclRefExpr>("arg")));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f(int i, int j) { int y; f(y, y); }", CallExpr,
-      new VerifyIdIsBoundTo<ParmVarDecl>("param", 2)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f(int i, int j) { int y; f(y, y); }", CallExpr,
-      new VerifyIdIsBoundTo<DeclRefExpr>("arg", 2)));
-}
-
-TEST(ForEachArgumentWithParam, MatchesConstructExpr) {
-  StatementMatcher ArgumentY =
-      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
-  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
-  StatementMatcher ConstructExpr =
-      cxxConstructExpr(forEachArgumentWithParam(ArgumentY, IntParam));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "struct C {"
-      "  C(int i) {}"
-      "};"
-      "int y = 0;"
-      "C Obj(y);",
-      ConstructExpr, new VerifyIdIsBoundTo<ParmVarDecl>("param")));
-}
-
-TEST(ForEachArgumentWithParam, HandlesBoundNodesForNonMatches) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void g(int i, int j) {"
-      "  int a;"
-      "  int b;"
-      "  int c;"
-      "  g(a, 0);"
-      "  g(a, b);"
-      "  g(0, b);"
-      "}",
-      functionDecl(
-          forEachDescendant(varDecl().bind("v")),
-          forEachDescendant(callExpr(forEachArgumentWithParam(
-              declRefExpr(to(decl(equalsBoundNode("v")))), parmVarDecl())))),
-      new VerifyIdIsBoundTo<VarDecl>("v", 4)));
-}
-
-TEST(Matcher, ArgumentCount) {
-  StatementMatcher Call1Arg = callExpr(argumentCountIs(1));
-
-  EXPECT_TRUE(matches("void x(int) { x(0); }", Call1Arg));
-  EXPECT_TRUE(matches("class X { void x(int) { x(0); } };", Call1Arg));
-  EXPECT_TRUE(notMatches("void x(int, int) { x(0, 0); }", Call1Arg));
-}
-
-TEST(Matcher, ParameterCount) {
-  DeclarationMatcher Function1Arg = functionDecl(parameterCountIs(1));
-  EXPECT_TRUE(matches("void f(int i) {}", Function1Arg));
-  EXPECT_TRUE(matches("class X { void f(int i) {} };", Function1Arg));
-  EXPECT_TRUE(notMatches("void f() {}", Function1Arg));
-  EXPECT_TRUE(notMatches("void f(int i, int j, int k) {}", Function1Arg));
-}
-
-TEST(Matcher, References) {
-  DeclarationMatcher ReferenceClassX = varDecl(
-      hasType(references(recordDecl(hasName("X")))));
-  EXPECT_TRUE(matches("class X {}; void y(X y) { X &x = y; }",
-                      ReferenceClassX));
-  EXPECT_TRUE(
-      matches("class X {}; void y(X y) { const X &x = y; }", ReferenceClassX));
-  // The match here is on the implicit copy constructor code for
-  // class X, not on code 'X x = y'.
-  EXPECT_TRUE(
-      matches("class X {}; void y(X y) { X x = y; }", ReferenceClassX));
-  EXPECT_TRUE(
-      notMatches("class X {}; extern X x;", ReferenceClassX));
-  EXPECT_TRUE(
-      notMatches("class X {}; void y(X *y) { X *&x = y; }", ReferenceClassX));
-}
-
-TEST(QualType, hasCanonicalType) {
-  EXPECT_TRUE(notMatches("typedef int &int_ref;"
-                         "int a;"
-                         "int_ref b = a;",
-                         varDecl(hasType(qualType(referenceType())))));
-  EXPECT_TRUE(
-      matches("typedef int &int_ref;"
-              "int a;"
-              "int_ref b = a;",
-              varDecl(hasType(qualType(hasCanonicalType(referenceType()))))));
-}
-
-TEST(QualType, hasLocalQualifiers) {
-  EXPECT_TRUE(notMatches("typedef const int const_int; const_int i = 1;",
-                         varDecl(hasType(hasLocalQualifiers()))));
-  EXPECT_TRUE(matches("int *const j = nullptr;",
-                      varDecl(hasType(hasLocalQualifiers()))));
-  EXPECT_TRUE(matches("int *volatile k;",
-                      varDecl(hasType(hasLocalQualifiers()))));
-  EXPECT_TRUE(notMatches("int m;",
-                         varDecl(hasType(hasLocalQualifiers()))));
-}
-
-TEST(HasParameter, CallsInnerMatcher) {
-  EXPECT_TRUE(matches("class X { void x(int) {} };",
-                      cxxMethodDecl(hasParameter(0, varDecl()))));
-  EXPECT_TRUE(notMatches("class X { void x(int) {} };",
-                         cxxMethodDecl(hasParameter(0, hasName("x")))));
-}
-
-TEST(HasParameter, DoesNotMatchIfIndexOutOfBounds) {
-  EXPECT_TRUE(notMatches("class X { void x(int) {} };",
-                         cxxMethodDecl(hasParameter(42, varDecl()))));
-}
-
-TEST(HasType, MatchesParameterVariableTypesStrictly) {
-  EXPECT_TRUE(matches(
-      "class X { void x(X x) {} };",
-      cxxMethodDecl(hasParameter(0, hasType(recordDecl(hasName("X")))))));
-  EXPECT_TRUE(notMatches(
-      "class X { void x(const X &x) {} };",
-      cxxMethodDecl(hasParameter(0, hasType(recordDecl(hasName("X")))))));
-  EXPECT_TRUE(matches("class X { void x(const X *x) {} };",
-                      cxxMethodDecl(hasParameter(
-                          0, hasType(pointsTo(recordDecl(hasName("X"))))))));
-  EXPECT_TRUE(matches("class X { void x(const X &x) {} };",
-                      cxxMethodDecl(hasParameter(
-                          0, hasType(references(recordDecl(hasName("X"))))))));
-}
-
-TEST(HasAnyParameter, MatchesIndependentlyOfPosition) {
-  EXPECT_TRUE(matches(
-      "class Y {}; class X { void x(X x, Y y) {} };",
-      cxxMethodDecl(hasAnyParameter(hasType(recordDecl(hasName("X")))))));
-  EXPECT_TRUE(matches(
-      "class Y {}; class X { void x(Y y, X x) {} };",
-      cxxMethodDecl(hasAnyParameter(hasType(recordDecl(hasName("X")))))));
-}
-
-TEST(Returns, MatchesReturnTypes) {
-  EXPECT_TRUE(matches("class Y { int f() { return 1; } };",
-                      functionDecl(returns(asString("int")))));
-  EXPECT_TRUE(notMatches("class Y { int f() { return 1; } };",
-                         functionDecl(returns(asString("float")))));
-  EXPECT_TRUE(matches("class Y { Y getMe() { return *this; } };",
-                      functionDecl(returns(hasDeclaration(
-                          recordDecl(hasName("Y")))))));
-}
-
-TEST(IsExternC, MatchesExternCFunctionDeclarations) {
-  EXPECT_TRUE(matches("extern \"C\" void f() {}", functionDecl(isExternC())));
-  EXPECT_TRUE(matches("extern \"C\" { void f() {} }",
-              functionDecl(isExternC())));
-  EXPECT_TRUE(notMatches("void f() {}", functionDecl(isExternC())));
-}
-
-TEST(IsDefaulted, MatchesDefaultedFunctionDeclarations) {
-  EXPECT_TRUE(notMatches("class A { ~A(); };",
-                         functionDecl(hasName("~A"), isDefaulted())));
-  EXPECT_TRUE(matches("class B { ~B() = default; };",
-                      functionDecl(hasName("~B"), isDefaulted())));
-}
-
-TEST(IsDeleted, MatchesDeletedFunctionDeclarations) {
-  EXPECT_TRUE(
-      notMatches("void Func();", functionDecl(hasName("Func"), isDeleted())));
-  EXPECT_TRUE(matches("void Func() = delete;",
-                      functionDecl(hasName("Func"), isDeleted())));
-}
-
-TEST(IsNoThrow, MatchesNoThrowFunctionDeclarations) {
-  EXPECT_TRUE(notMatches("void f();", functionDecl(isNoThrow())));
-  EXPECT_TRUE(notMatches("void f() throw(int);", functionDecl(isNoThrow())));
-  EXPECT_TRUE(
-      notMatches("void f() noexcept(false);", functionDecl(isNoThrow())));
-  EXPECT_TRUE(matches("void f() throw();", functionDecl(isNoThrow())));
-  EXPECT_TRUE(matches("void f() noexcept;", functionDecl(isNoThrow())));
-}
-
-TEST(isConstexpr, MatchesConstexprDeclarations) {
-  EXPECT_TRUE(matches("constexpr int foo = 42;",
-                      varDecl(hasName("foo"), isConstexpr())));
-  EXPECT_TRUE(matches("constexpr int bar();",
-                      functionDecl(hasName("bar"), isConstexpr())));
-}
-
-TEST(HasAnyParameter, DoesntMatchIfInnerMatcherDoesntMatch) {
-  EXPECT_TRUE(notMatches(
-      "class Y {}; class X { void x(int) {} };",
-      cxxMethodDecl(hasAnyParameter(hasType(recordDecl(hasName("X")))))));
-}
-
-TEST(HasAnyParameter, DoesNotMatchThisPointer) {
-  EXPECT_TRUE(notMatches("class Y {}; class X { void x() {} };",
-                         cxxMethodDecl(hasAnyParameter(
-                             hasType(pointsTo(recordDecl(hasName("X"))))))));
-}
-
-TEST(HasName, MatchesParameterVariableDeclarations) {
-  EXPECT_TRUE(matches("class Y {}; class X { void x(int x) {} };",
-                      cxxMethodDecl(hasAnyParameter(hasName("x")))));
-  EXPECT_TRUE(notMatches("class Y {}; class X { void x(int) {} };",
-                         cxxMethodDecl(hasAnyParameter(hasName("x")))));
-}
-
-TEST(Matcher, MatchesClassTemplateSpecialization) {
-  EXPECT_TRUE(matches("template<typename T> struct A {};"
-                      "template<> struct A<int> {};",
-                      classTemplateSpecializationDecl()));
-  EXPECT_TRUE(matches("template<typename T> struct A {}; A<int> a;",
-                      classTemplateSpecializationDecl()));
-  EXPECT_TRUE(notMatches("template<typename T> struct A {};",
-                         classTemplateSpecializationDecl()));
-}
-
-TEST(DeclaratorDecl, MatchesDeclaratorDecls) {
-  EXPECT_TRUE(matches("int x;", declaratorDecl()));
-  EXPECT_TRUE(notMatches("class A {};", declaratorDecl()));
-}
-
-TEST(ParmVarDecl, MatchesParmVars) {
-  EXPECT_TRUE(matches("void f(int x);", parmVarDecl()));
-  EXPECT_TRUE(notMatches("void f();", parmVarDecl()));
-}
-
-TEST(Matcher, MatchesTypeTemplateArgument) {
-  EXPECT_TRUE(matches(
-      "template<typename T> struct B {};"
-      "B<int> b;",
-      classTemplateSpecializationDecl(hasAnyTemplateArgument(refersToType(
-          asString("int"))))));
-}
-
-TEST(Matcher, MatchesDeclarationReferenceTemplateArgument) {
-  EXPECT_TRUE(matches(
-      "struct B { int next; };"
-      "template<int(B::*next_ptr)> struct A {};"
-      "A<&B::next> a;",
-      classTemplateSpecializationDecl(hasAnyTemplateArgument(
-          refersToDeclaration(fieldDecl(hasName("next")))))));
-
-  EXPECT_TRUE(notMatches(
-      "template <typename T> struct A {};"
-      "A<int> a;",
-      classTemplateSpecializationDecl(hasAnyTemplateArgument(
-          refersToDeclaration(decl())))));
-
-  EXPECT_TRUE(matches(
-      "struct B { int next; };"
-      "template<int(B::*next_ptr)> struct A {};"
-      "A<&B::next> a;",
-      templateSpecializationType(hasAnyTemplateArgument(isExpr(
-          hasDescendant(declRefExpr(to(fieldDecl(hasName("next"))))))))));
-
-  EXPECT_TRUE(notMatches(
-      "template <typename T> struct A {};"
-      "A<int> a;",
-      templateSpecializationType(hasAnyTemplateArgument(
-          refersToDeclaration(decl())))));
-}
-
-TEST(Matcher, MatchesSpecificArgument) {
-  EXPECT_TRUE(matches(
-      "template<typename T, typename U> class A {};"
-      "A<bool, int> a;",
-      classTemplateSpecializationDecl(hasTemplateArgument(
-          1, refersToType(asString("int"))))));
-  EXPECT_TRUE(notMatches(
-      "template<typename T, typename U> class A {};"
-      "A<int, bool> a;",
-      classTemplateSpecializationDecl(hasTemplateArgument(
-          1, refersToType(asString("int"))))));
-
-  EXPECT_TRUE(matches(
-      "template<typename T, typename U> class A {};"
-      "A<bool, int> a;",
-      templateSpecializationType(hasTemplateArgument(
-          1, refersToType(asString("int"))))));
-  EXPECT_TRUE(notMatches(
-      "template<typename T, typename U> class A {};"
-      "A<int, bool> a;",
-      templateSpecializationType(hasTemplateArgument(
-          1, refersToType(asString("int"))))));
-}
-
-TEST(TemplateArgument, Matches) {
-  EXPECT_TRUE(matches("template<typename T> struct C {}; C<int> c;",
-                      classTemplateSpecializationDecl(
-                          hasAnyTemplateArgument(templateArgument()))));
-  EXPECT_TRUE(matches(
-      "template<typename T> struct C {}; C<int> c;",
-      templateSpecializationType(hasAnyTemplateArgument(templateArgument()))));
-}
-
-TEST(TemplateArgumentCountIs, Matches) {
-  EXPECT_TRUE(
-      matches("template<typename T> struct C {}; C<int> c;",
-              classTemplateSpecializationDecl(templateArgumentCountIs(1))));
-  EXPECT_TRUE(
-      notMatches("template<typename T> struct C {}; C<int> c;",
-                 classTemplateSpecializationDecl(templateArgumentCountIs(2))));
-
-  EXPECT_TRUE(matches("template<typename T> struct C {}; C<int> c;",
-                      templateSpecializationType(templateArgumentCountIs(1))));
-  EXPECT_TRUE(
-      notMatches("template<typename T> struct C {}; C<int> c;",
-                 templateSpecializationType(templateArgumentCountIs(2))));
-}
-
-TEST(IsIntegral, Matches) {
-  EXPECT_TRUE(matches("template<int T> struct C {}; C<42> c;",
-                      classTemplateSpecializationDecl(
-                          hasAnyTemplateArgument(isIntegral()))));
-  EXPECT_TRUE(notMatches("template<typename T> struct C {}; C<int> c;",
-                         classTemplateSpecializationDecl(hasAnyTemplateArgument(
-                             templateArgument(isIntegral())))));
-}
-
-TEST(RefersToIntegralType, Matches) {
-  EXPECT_TRUE(matches("template<int T> struct C {}; C<42> c;",
-                      classTemplateSpecializationDecl(
-                          hasAnyTemplateArgument(refersToIntegralType(
-                              asString("int"))))));
-  EXPECT_TRUE(notMatches("template<unsigned T> struct C {}; C<42> c;",
-                         classTemplateSpecializationDecl(hasAnyTemplateArgument(
-                             refersToIntegralType(asString("int"))))));
-}
-
-TEST(EqualsIntegralValue, Matches) {
-  EXPECT_TRUE(matches("template<int T> struct C {}; C<42> c;",
-                      classTemplateSpecializationDecl(
-                          hasAnyTemplateArgument(equalsIntegralValue("42")))));
-  EXPECT_TRUE(matches("template<int T> struct C {}; C<-42> c;",
-                      classTemplateSpecializationDecl(
-                          hasAnyTemplateArgument(equalsIntegralValue("-42")))));
-  EXPECT_TRUE(matches("template<int T> struct C {}; C<-0042> c;",
-                      classTemplateSpecializationDecl(
-                          hasAnyTemplateArgument(equalsIntegralValue("-34")))));
-  EXPECT_TRUE(notMatches("template<int T> struct C {}; C<42> c;",
-                         classTemplateSpecializationDecl(hasAnyTemplateArgument(
-                             equalsIntegralValue("0042")))));
-}
-
-TEST(Matcher, MatchesAccessSpecDecls) {
-  EXPECT_TRUE(matches("class C { public: int i; };", accessSpecDecl()));
-  EXPECT_TRUE(
-      matches("class C { public: int i; };", accessSpecDecl(isPublic())));
-  EXPECT_TRUE(
-      notMatches("class C { public: int i; };", accessSpecDecl(isProtected())));
-  EXPECT_TRUE(
-      notMatches("class C { public: int i; };", accessSpecDecl(isPrivate())));
-
-  EXPECT_TRUE(notMatches("class C { int i; };", accessSpecDecl()));
-}
-
-TEST(Matcher, MatchesFinal) {
-  EXPECT_TRUE(matches("class X final {};", cxxRecordDecl(isFinal())));
-  EXPECT_TRUE(matches("class X { virtual void f() final; };",
-                      cxxMethodDecl(isFinal())));
-  EXPECT_TRUE(notMatches("class X {};", cxxRecordDecl(isFinal())));
-  EXPECT_TRUE(
-      notMatches("class X { virtual void f(); };", cxxMethodDecl(isFinal())));
-}
-
-TEST(Matcher, MatchesVirtualMethod) {
-  EXPECT_TRUE(matches("class X { virtual int f(); };",
-                      cxxMethodDecl(isVirtual(), hasName("::X::f"))));
-  EXPECT_TRUE(notMatches("class X { int f(); };", cxxMethodDecl(isVirtual())));
-}
-
-TEST(Matcher, MatchesPureMethod) {
-  EXPECT_TRUE(matches("class X { virtual int f() = 0; };",
-                      cxxMethodDecl(isPure(), hasName("::X::f"))));
-  EXPECT_TRUE(notMatches("class X { int f(); };", cxxMethodDecl(isPure())));
-}
-
-TEST(Matcher, MatchesCopyAssignmentOperator) {
-  EXPECT_TRUE(matches("class X { X &operator=(X); };",
-                      cxxMethodDecl(isCopyAssignmentOperator())));
-  EXPECT_TRUE(matches("class X { X &operator=(X &); };",
-                      cxxMethodDecl(isCopyAssignmentOperator())));
-  EXPECT_TRUE(matches("class X { X &operator=(const X &); };",
-                      cxxMethodDecl(isCopyAssignmentOperator())));
-  EXPECT_TRUE(matches("class X { X &operator=(volatile X &); };",
-                      cxxMethodDecl(isCopyAssignmentOperator())));
-  EXPECT_TRUE(matches("class X { X &operator=(const volatile X &); };",
-                      cxxMethodDecl(isCopyAssignmentOperator())));
-  EXPECT_TRUE(notMatches("class X { X &operator=(X &&); };",
-                      cxxMethodDecl(isCopyAssignmentOperator())));
-}
-
-TEST(Matcher, MatchesConstMethod) {
-  EXPECT_TRUE(
-      matches("struct A { void foo() const; };", cxxMethodDecl(isConst())));
-  EXPECT_TRUE(
-      notMatches("struct A { void foo(); };", cxxMethodDecl(isConst())));
-}
-
-TEST(Matcher, MatchesOverridingMethod) {
-  EXPECT_TRUE(matches("class X { virtual int f(); }; "
-                      "class Y : public X { int f(); };",
-                      cxxMethodDecl(isOverride(), hasName("::Y::f"))));
-  EXPECT_TRUE(notMatches("class X { virtual int f(); }; "
-                         "class Y : public X { int f(); };",
-                         cxxMethodDecl(isOverride(), hasName("::X::f"))));
-  EXPECT_TRUE(notMatches("class X { int f(); }; "
-                         "class Y : public X { int f(); };",
-                         cxxMethodDecl(isOverride())));
-  EXPECT_TRUE(notMatches("class X { int f(); int f(int); }; ",
-                         cxxMethodDecl(isOverride())));
-  EXPECT_TRUE(
-      matches("template <typename Base> struct Y : Base { void f() override;};",
-              cxxMethodDecl(isOverride(), hasName("::Y::f"))));
-}
-
-TEST(Matcher, ConstructorCall) {
-  StatementMatcher Constructor = cxxConstructExpr();
-
-  EXPECT_TRUE(
-      matches("class X { public: X(); }; void x() { X x; }", Constructor));
-  EXPECT_TRUE(
-      matches("class X { public: X(); }; void x() { X x = X(); }",
-              Constructor));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { X x = 0; }",
-              Constructor));
-  EXPECT_TRUE(matches("class X {}; void x(int) { X x; }", Constructor));
-}
-
-TEST(Matcher, ConstructorArgument) {
-  StatementMatcher Constructor = cxxConstructExpr(
-      hasArgument(0, declRefExpr(to(varDecl(hasName("y"))))));
-
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { int y; X x(y); }",
-              Constructor));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { int y; X x = X(y); }",
-              Constructor));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { int y; X x = y; }",
-              Constructor));
-  EXPECT_TRUE(
-      notMatches("class X { public: X(int); }; void x() { int z; X x(z); }",
-                 Constructor));
-
-  StatementMatcher WrongIndex = cxxConstructExpr(
-      hasArgument(42, declRefExpr(to(varDecl(hasName("y"))))));
-  EXPECT_TRUE(
-      notMatches("class X { public: X(int); }; void x() { int y; X x(y); }",
-                 WrongIndex));
-}
-
-TEST(Matcher, ConstructorArgumentCount) {
-  StatementMatcher Constructor1Arg = cxxConstructExpr(argumentCountIs(1));
-
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { X x(0); }",
-              Constructor1Arg));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { X x = X(0); }",
-              Constructor1Arg));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { X x = 0; }",
-              Constructor1Arg));
-  EXPECT_TRUE(
-      notMatches("class X { public: X(int, int); }; void x() { X x(0, 0); }",
-                 Constructor1Arg));
-}
-
-TEST(Matcher, ConstructorListInitialization) {
-  StatementMatcher ConstructorListInit =
-      cxxConstructExpr(isListInitialization());
-
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { X x{0}; }",
-              ConstructorListInit));
-  EXPECT_FALSE(
-      matches("class X { public: X(int); }; void x() { X x(0); }",
-              ConstructorListInit));
-}
-
-TEST(Matcher,ThisExpr) {
-  EXPECT_TRUE(
-      matches("struct X { int a; int f () { return a; } };", cxxThisExpr()));
-  EXPECT_TRUE(
-      notMatches("struct X { int f () { int a; return a; } };", cxxThisExpr()));
-}
-
-TEST(Matcher, BindTemporaryExpression) {
-  StatementMatcher TempExpression = cxxBindTemporaryExpr();
-
-  std::string ClassString = "class string { public: string(); ~string(); }; ";
-
-  EXPECT_TRUE(
-      matches(ClassString +
-              "string GetStringByValue();"
-              "void FunctionTakesString(string s);"
-              "void run() { FunctionTakesString(GetStringByValue()); }",
-              TempExpression));
-
-  EXPECT_TRUE(
-      notMatches(ClassString +
-                 "string* GetStringPointer(); "
-                 "void FunctionTakesStringPtr(string* s);"
-                 "void run() {"
-                 "  string* s = GetStringPointer();"
-                 "  FunctionTakesStringPtr(GetStringPointer());"
-                 "  FunctionTakesStringPtr(s);"
-                 "}",
-                 TempExpression));
-
-  EXPECT_TRUE(
-      notMatches("class no_dtor {};"
-                 "no_dtor GetObjByValue();"
-                 "void ConsumeObj(no_dtor param);"
-                 "void run() { ConsumeObj(GetObjByValue()); }",
-                 TempExpression));
-}
-
-TEST(MaterializeTemporaryExpr, MatchesTemporary) {
-  std::string ClassString =
-      "class string { public: string(); int length(); }; ";
-
-  EXPECT_TRUE(
-      matches(ClassString +
-              "string GetStringByValue();"
-              "void FunctionTakesString(string s);"
-              "void run() { FunctionTakesString(GetStringByValue()); }",
-              materializeTemporaryExpr()));
-
-  EXPECT_TRUE(
-      notMatches(ClassString +
-                 "string* GetStringPointer(); "
-                 "void FunctionTakesStringPtr(string* s);"
-                 "void run() {"
-                 "  string* s = GetStringPointer();"
-                 "  FunctionTakesStringPtr(GetStringPointer());"
-                 "  FunctionTakesStringPtr(s);"
-                 "}",
-                 materializeTemporaryExpr()));
-
-  EXPECT_TRUE(
-      notMatches(ClassString +
-                 "string GetStringByValue();"
-                 "void run() { int k = GetStringByValue().length(); }",
-                 materializeTemporaryExpr()));
-
-  EXPECT_TRUE(
-      notMatches(ClassString +
-                 "string GetStringByValue();"
-                 "void run() { GetStringByValue(); }",
-                 materializeTemporaryExpr()));
-}
-
-TEST(ConstructorDeclaration, SimpleCase) {
-  EXPECT_TRUE(matches("class Foo { Foo(int i); };",
-                      cxxConstructorDecl(ofClass(hasName("Foo")))));
-  EXPECT_TRUE(notMatches("class Foo { Foo(int i); };",
-                         cxxConstructorDecl(ofClass(hasName("Bar")))));
-}
-
-TEST(ConstructorDeclaration, IsImplicit) {
-  // This one doesn't match because the constructor is not added by the
-  // compiler (it is not needed).
-  EXPECT_TRUE(notMatches("class Foo { };",
-                         cxxConstructorDecl(isImplicit())));
-  // The compiler added the implicit default constructor.
-  EXPECT_TRUE(matches("class Foo { }; Foo* f = new Foo();",
-                      cxxConstructorDecl(isImplicit())));
-  EXPECT_TRUE(matches("class Foo { Foo(){} };",
-                      cxxConstructorDecl(unless(isImplicit()))));
-  // The compiler added an implicit assignment operator.
-  EXPECT_TRUE(matches("struct A { int x; } a = {0}, b = a; void f() { a = b; }",
-                      cxxMethodDecl(isImplicit(), hasName("operator="))));
-}
-
-TEST(ConstructorDeclaration, IsExplicit) {
-  EXPECT_TRUE(matches("struct S { explicit S(int); };",
-                      cxxConstructorDecl(isExplicit())));
-  EXPECT_TRUE(notMatches("struct S { S(int); };",
-                         cxxConstructorDecl(isExplicit())));
-}
-
-TEST(ConstructorDeclaration, Kinds) {
-  EXPECT_TRUE(matches("struct S { S(); };",
-                      cxxConstructorDecl(isDefaultConstructor())));
-  EXPECT_TRUE(notMatches("struct S { S(); };",
-                         cxxConstructorDecl(isCopyConstructor())));
-  EXPECT_TRUE(notMatches("struct S { S(); };",
-                         cxxConstructorDecl(isMoveConstructor())));
-
-  EXPECT_TRUE(notMatches("struct S { S(const S&); };",
-                         cxxConstructorDecl(isDefaultConstructor())));
-  EXPECT_TRUE(matches("struct S { S(const S&); };",
-                      cxxConstructorDecl(isCopyConstructor())));
-  EXPECT_TRUE(notMatches("struct S { S(const S&); };",
-                         cxxConstructorDecl(isMoveConstructor())));
-
-  EXPECT_TRUE(notMatches("struct S { S(S&&); };",
-                         cxxConstructorDecl(isDefaultConstructor())));
-  EXPECT_TRUE(notMatches("struct S { S(S&&); };",
-                         cxxConstructorDecl(isCopyConstructor())));
-  EXPECT_TRUE(matches("struct S { S(S&&); };",
-                      cxxConstructorDecl(isMoveConstructor())));
-}
-
-TEST(DestructorDeclaration, MatchesVirtualDestructor) {
-  EXPECT_TRUE(matches("class Foo { virtual ~Foo(); };",
-                      cxxDestructorDecl(ofClass(hasName("Foo")))));
-}
-
-TEST(DestructorDeclaration, DoesNotMatchImplicitDestructor) {
-  EXPECT_TRUE(notMatches("class Foo {};",
-                         cxxDestructorDecl(ofClass(hasName("Foo")))));
-}
-
-TEST(HasAnyConstructorInitializer, SimpleCase) {
-  EXPECT_TRUE(
-      notMatches("class Foo { Foo() { } };",
-                 cxxConstructorDecl(hasAnyConstructorInitializer(anything()))));
-  EXPECT_TRUE(
-      matches("class Foo {"
-              "  Foo() : foo_() { }"
-              "  int foo_;"
-              "};",
-              cxxConstructorDecl(hasAnyConstructorInitializer(anything()))));
-}
-
-TEST(HasAnyConstructorInitializer, ForField) {
-  static const char Code[] =
-      "class Baz { };"
-      "class Foo {"
-      "  Foo() : foo_() { }"
-      "  Baz foo_;"
-      "  Baz bar_;"
-      "};";
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      forField(hasType(recordDecl(hasName("Baz"))))))));
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      forField(hasName("foo_"))))));
-  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      forField(hasType(recordDecl(hasName("Bar"))))))));
-}
-
-TEST(HasAnyConstructorInitializer, WithInitializer) {
-  static const char Code[] =
-      "class Foo {"
-      "  Foo() : foo_(0) { }"
-      "  int foo_;"
-      "};";
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      withInitializer(integerLiteral(equals(0)))))));
-  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      withInitializer(integerLiteral(equals(1)))))));
-}
-
-TEST(HasAnyConstructorInitializer, IsWritten) {
-  static const char Code[] =
-      "struct Bar { Bar(){} };"
-      "class Foo {"
-      "  Foo() : foo_() { }"
-      "  Bar foo_;"
-      "  Bar bar_;"
-      "};";
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      allOf(forField(hasName("foo_")), isWritten())))));
-  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      allOf(forField(hasName("bar_")), isWritten())))));
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
-      allOf(forField(hasName("bar_")), unless(isWritten()))))));
-}
-
-TEST(HasAnyConstructorInitializer, IsBaseInitializer) {
-  static const char Code[] =
-      "struct B {};"
-      "struct D : B {"
-      "  int I;"
-      "  D(int i) : I(i) {}"
-      "};"
-      "struct E : B {"
-      "  E() : B() {}"
-      "};";
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(allOf(
-    hasAnyConstructorInitializer(allOf(isBaseInitializer(), isWritten())),
-    hasName("E")))));
-  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(allOf(
-    hasAnyConstructorInitializer(allOf(isBaseInitializer(), isWritten())),
-    hasName("D")))));
-  EXPECT_TRUE(matches(Code, cxxConstructorDecl(allOf(
-    hasAnyConstructorInitializer(allOf(isMemberInitializer(), isWritten())),
-    hasName("D")))));
-  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(allOf(
-    hasAnyConstructorInitializer(allOf(isMemberInitializer(), isWritten())),
-    hasName("E")))));
-}
-
-TEST(Matcher, NewExpression) {
-  StatementMatcher New = cxxNewExpr();
-
-  EXPECT_TRUE(matches("class X { public: X(); }; void x() { new X; }", New));
-  EXPECT_TRUE(
-      matches("class X { public: X(); }; void x() { new X(); }", New));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { new X(0); }", New));
-  EXPECT_TRUE(matches("class X {}; void x(int) { new X; }", New));
-}
-
-TEST(Matcher, NewExpressionArgument) {
-  StatementMatcher New = cxxConstructExpr(
-      hasArgument(0, declRefExpr(to(varDecl(hasName("y"))))));
-
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { int y; new X(y); }",
-              New));
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { int y; new X(y); }",
-              New));
-  EXPECT_TRUE(
-      notMatches("class X { public: X(int); }; void x() { int z; new X(z); }",
-                 New));
-
-  StatementMatcher WrongIndex = cxxConstructExpr(
-      hasArgument(42, declRefExpr(to(varDecl(hasName("y"))))));
-  EXPECT_TRUE(
-      notMatches("class X { public: X(int); }; void x() { int y; new X(y); }",
-                 WrongIndex));
-}
-
-TEST(Matcher, NewExpressionArgumentCount) {
-  StatementMatcher New = cxxConstructExpr(argumentCountIs(1));
-
-  EXPECT_TRUE(
-      matches("class X { public: X(int); }; void x() { new X(0); }", New));
-  EXPECT_TRUE(
-      notMatches("class X { public: X(int, int); }; void x() { new X(0, 0); }",
-                 New));
-}
-
-TEST(Matcher, DeleteExpression) {
-  EXPECT_TRUE(matches("struct A {}; void f(A* a) { delete a; }",
-                      cxxDeleteExpr()));
-}
-
-TEST(Matcher, DefaultArgument) {
-  StatementMatcher Arg = cxxDefaultArgExpr();
-
-  EXPECT_TRUE(matches("void x(int, int = 0) { int y; x(y); }", Arg));
-  EXPECT_TRUE(
-      matches("class X { void x(int, int = 0) { int y; x(y); } };", Arg));
-  EXPECT_TRUE(notMatches("void x(int, int = 0) { int y; x(y, 0); }", Arg));
-}
-
-TEST(Matcher, StringLiterals) {
-  StatementMatcher Literal = stringLiteral();
-  EXPECT_TRUE(matches("const char *s = \"string\";", Literal));
-  // wide string
-  EXPECT_TRUE(matches("const wchar_t *s = L\"string\";", Literal));
-  // with escaped characters
-  EXPECT_TRUE(matches("const char *s = \"\x05five\";", Literal));
-  // no matching -- though the data type is the same, there is no string literal
-  EXPECT_TRUE(notMatches("const char s[1] = {'a'};", Literal));
-}
-
-TEST(Matcher, CharacterLiterals) {
-  StatementMatcher CharLiteral = characterLiteral();
-  EXPECT_TRUE(matches("const char c = 'c';", CharLiteral));
-  // wide character
-  EXPECT_TRUE(matches("const char c = L'c';", CharLiteral));
-  // wide character, Hex encoded, NOT MATCHED!
-  EXPECT_TRUE(notMatches("const wchar_t c = 0x2126;", CharLiteral));
-  EXPECT_TRUE(notMatches("const char c = 0x1;", CharLiteral));
-}
-
-TEST(Matcher, IntegerLiterals) {
-  StatementMatcher HasIntLiteral = integerLiteral();
-  EXPECT_TRUE(matches("int i = 10;", HasIntLiteral));
-  EXPECT_TRUE(matches("int i = 0x1AB;", HasIntLiteral));
-  EXPECT_TRUE(matches("int i = 10L;", HasIntLiteral));
-  EXPECT_TRUE(matches("int i = 10U;", HasIntLiteral));
-
-  // Non-matching cases (character literals, float and double)
-  EXPECT_TRUE(notMatches("int i = L'a';",
-                HasIntLiteral));  // this is actually a character
-                                  // literal cast to int
-  EXPECT_TRUE(notMatches("int i = 'a';", HasIntLiteral));
-  EXPECT_TRUE(notMatches("int i = 1e10;", HasIntLiteral));
-  EXPECT_TRUE(notMatches("int i = 10.0;", HasIntLiteral));
-}
-
-TEST(Matcher, FloatLiterals) {
-  StatementMatcher HasFloatLiteral = floatLiteral();
-  EXPECT_TRUE(matches("float i = 10.0;", HasFloatLiteral));
-  EXPECT_TRUE(matches("float i = 10.0f;", HasFloatLiteral));
-  EXPECT_TRUE(matches("double i = 10.0;", HasFloatLiteral));
-  EXPECT_TRUE(matches("double i = 10.0L;", HasFloatLiteral));
-  EXPECT_TRUE(matches("double i = 1e10;", HasFloatLiteral));
-  EXPECT_TRUE(matches("double i = 5.0;", floatLiteral(equals(5.0))));
-  EXPECT_TRUE(matches("double i = 5.0;", floatLiteral(equals(5.0f))));
-  EXPECT_TRUE(
-      matches("double i = 5.0;", floatLiteral(equals(llvm::APFloat(5.0)))));
-
-  EXPECT_TRUE(notMatches("float i = 10;", HasFloatLiteral));
-  EXPECT_TRUE(notMatches("double i = 5.0;", floatLiteral(equals(6.0))));
-  EXPECT_TRUE(notMatches("double i = 5.0;", floatLiteral(equals(6.0f))));
-  EXPECT_TRUE(
-      notMatches("double i = 5.0;", floatLiteral(equals(llvm::APFloat(6.0)))));
-}
-
-TEST(Matcher, NullPtrLiteral) {
-  EXPECT_TRUE(matches("int* i = nullptr;", cxxNullPtrLiteralExpr()));
-}
-
-TEST(Matcher, GNUNullExpr) {
-  EXPECT_TRUE(matches("int* i = __null;", gnuNullExpr()));
-}
-
-TEST(Matcher, AsmStatement) {
-  EXPECT_TRUE(matches("void foo() { __asm(\"mov al, 2\"); }", asmStmt()));
-}
-
-TEST(Matcher, Conditions) {
-  StatementMatcher Condition =
-      ifStmt(hasCondition(cxxBoolLiteral(equals(true))));
-
-  EXPECT_TRUE(matches("void x() { if (true) {} }", Condition));
-  EXPECT_TRUE(notMatches("void x() { if (false) {} }", Condition));
-  EXPECT_TRUE(notMatches("void x() { bool a = true; if (a) {} }", Condition));
-  EXPECT_TRUE(notMatches("void x() { if (true || false) {} }", Condition));
-  EXPECT_TRUE(notMatches("void x() { if (1) {} }", Condition));
-}
-
-TEST(IfStmt, ChildTraversalMatchers) {
-  EXPECT_TRUE(matches("void f() { if (false) true; else false; }",
-                      ifStmt(hasThen(cxxBoolLiteral(equals(true))))));
-  EXPECT_TRUE(notMatches("void f() { if (false) false; else true; }",
-                         ifStmt(hasThen(cxxBoolLiteral(equals(true))))));
-  EXPECT_TRUE(matches("void f() { if (false) false; else true; }",
-                      ifStmt(hasElse(cxxBoolLiteral(equals(true))))));
-  EXPECT_TRUE(notMatches("void f() { if (false) true; else false; }",
-                         ifStmt(hasElse(cxxBoolLiteral(equals(true))))));
-}
-
-TEST(MatchBinaryOperator, HasOperatorName) {
-  StatementMatcher OperatorOr = binaryOperator(hasOperatorName("||"));
-
-  EXPECT_TRUE(matches("void x() { true || false; }", OperatorOr));
-  EXPECT_TRUE(notMatches("void x() { true && false; }", OperatorOr));
-}
-
-TEST(MatchBinaryOperator, HasLHSAndHasRHS) {
-  StatementMatcher OperatorTrueFalse =
-      binaryOperator(hasLHS(cxxBoolLiteral(equals(true))),
-                     hasRHS(cxxBoolLiteral(equals(false))));
-
-  EXPECT_TRUE(matches("void x() { true || false; }", OperatorTrueFalse));
-  EXPECT_TRUE(matches("void x() { true && false; }", OperatorTrueFalse));
-  EXPECT_TRUE(notMatches("void x() { false || true; }", OperatorTrueFalse));
-
-  StatementMatcher OperatorIntPointer = arraySubscriptExpr(
-      hasLHS(hasType(isInteger())), hasRHS(hasType(pointsTo(qualType()))));
-  EXPECT_TRUE(matches("void x() { 1[\"abc\"]; }", OperatorIntPointer));
-  EXPECT_TRUE(notMatches("void x() { \"abc\"[1]; }", OperatorIntPointer));
-}
-
-TEST(MatchBinaryOperator, HasEitherOperand) {
-  StatementMatcher HasOperand =
-      binaryOperator(hasEitherOperand(cxxBoolLiteral(equals(false))));
-
-  EXPECT_TRUE(matches("void x() { true || false; }", HasOperand));
-  EXPECT_TRUE(matches("void x() { false && true; }", HasOperand));
-  EXPECT_TRUE(notMatches("void x() { true || true; }", HasOperand));
-}
-
-TEST(Matcher, BinaryOperatorTypes) {
-  // Integration test that verifies the AST provides all binary operators in
-  // a way we expect.
-  // FIXME: Operator ','
-  EXPECT_TRUE(
-      matches("void x() { 3, 4; }", binaryOperator(hasOperatorName(","))));
-  EXPECT_TRUE(
-      matches("bool b; bool c = (b = true);",
-              binaryOperator(hasOperatorName("="))));
-  EXPECT_TRUE(
-      matches("bool b = 1 != 2;", binaryOperator(hasOperatorName("!="))));
-  EXPECT_TRUE(
-      matches("bool b = 1 == 2;", binaryOperator(hasOperatorName("=="))));
-  EXPECT_TRUE(matches("bool b = 1 < 2;", binaryOperator(hasOperatorName("<"))));
-  EXPECT_TRUE(
-      matches("bool b = 1 <= 2;", binaryOperator(hasOperatorName("<="))));
-  EXPECT_TRUE(
-      matches("int i = 1 << 2;", binaryOperator(hasOperatorName("<<"))));
-  EXPECT_TRUE(
-      matches("int i = 1; int j = (i <<= 2);",
-              binaryOperator(hasOperatorName("<<="))));
-  EXPECT_TRUE(matches("bool b = 1 > 2;", binaryOperator(hasOperatorName(">"))));
-  EXPECT_TRUE(
-      matches("bool b = 1 >= 2;", binaryOperator(hasOperatorName(">="))));
-  EXPECT_TRUE(
-      matches("int i = 1 >> 2;", binaryOperator(hasOperatorName(">>"))));
-  EXPECT_TRUE(
-      matches("int i = 1; int j = (i >>= 2);",
-              binaryOperator(hasOperatorName(">>="))));
-  EXPECT_TRUE(
-      matches("int i = 42 ^ 23;", binaryOperator(hasOperatorName("^"))));
-  EXPECT_TRUE(
-      matches("int i = 42; int j = (i ^= 42);",
-              binaryOperator(hasOperatorName("^="))));
-  EXPECT_TRUE(
-      matches("int i = 42 % 23;", binaryOperator(hasOperatorName("%"))));
-  EXPECT_TRUE(
-      matches("int i = 42; int j = (i %= 42);",
-              binaryOperator(hasOperatorName("%="))));
-  EXPECT_TRUE(
-      matches("bool b = 42  &23;", binaryOperator(hasOperatorName("&"))));
-  EXPECT_TRUE(
-      matches("bool b = true && false;",
-              binaryOperator(hasOperatorName("&&"))));
-  EXPECT_TRUE(
-      matches("bool b = true; bool c = (b &= false);",
-              binaryOperator(hasOperatorName("&="))));
-  EXPECT_TRUE(
-      matches("bool b = 42 | 23;", binaryOperator(hasOperatorName("|"))));
-  EXPECT_TRUE(
-      matches("bool b = true || false;",
-              binaryOperator(hasOperatorName("||"))));
-  EXPECT_TRUE(
-      matches("bool b = true; bool c = (b |= false);",
-              binaryOperator(hasOperatorName("|="))));
-  EXPECT_TRUE(
-      matches("int i = 42  *23;", binaryOperator(hasOperatorName("*"))));
-  EXPECT_TRUE(
-      matches("int i = 42; int j = (i *= 23);",
-              binaryOperator(hasOperatorName("*="))));
-  EXPECT_TRUE(
-      matches("int i = 42 / 23;", binaryOperator(hasOperatorName("/"))));
-  EXPECT_TRUE(
-      matches("int i = 42; int j = (i /= 23);",
-              binaryOperator(hasOperatorName("/="))));
-  EXPECT_TRUE(
-      matches("int i = 42 + 23;", binaryOperator(hasOperatorName("+"))));
-  EXPECT_TRUE(
-      matches("int i = 42; int j = (i += 23);",
-              binaryOperator(hasOperatorName("+="))));
-  EXPECT_TRUE(
-      matches("int i = 42 - 23;", binaryOperator(hasOperatorName("-"))));
-  EXPECT_TRUE(
-      matches("int i = 42; int j = (i -= 23);",
-              binaryOperator(hasOperatorName("-="))));
-  EXPECT_TRUE(
-      matches("struct A { void x() { void (A::*a)(); (this->*a)(); } };",
-              binaryOperator(hasOperatorName("->*"))));
-  EXPECT_TRUE(
-      matches("struct A { void x() { void (A::*a)(); ((*this).*a)(); } };",
-              binaryOperator(hasOperatorName(".*"))));
-
-  // Member expressions as operators are not supported in matches.
-  EXPECT_TRUE(
-      notMatches("struct A { void x(A *a) { a->x(this); } };",
-                 binaryOperator(hasOperatorName("->"))));
-
-  // Initializer assignments are not represented as operator equals.
-  EXPECT_TRUE(
-      notMatches("bool b = true;", binaryOperator(hasOperatorName("="))));
-
-  // Array indexing is not represented as operator.
-  EXPECT_TRUE(notMatches("int a[42]; void x() { a[23]; }", unaryOperator()));
-
-  // Overloaded operators do not match at all.
-  EXPECT_TRUE(notMatches(
-      "struct A { bool operator&&(const A &a) const { return false; } };"
-      "void x() { A a, b; a && b; }",
-      binaryOperator()));
-}
-
-TEST(MatchUnaryOperator, HasOperatorName) {
-  StatementMatcher OperatorNot = unaryOperator(hasOperatorName("!"));
-
-  EXPECT_TRUE(matches("void x() { !true; } ", OperatorNot));
-  EXPECT_TRUE(notMatches("void x() { true; } ", OperatorNot));
-}
-
-TEST(MatchUnaryOperator, HasUnaryOperand) {
-  StatementMatcher OperatorOnFalse =
-      unaryOperator(hasUnaryOperand(cxxBoolLiteral(equals(false))));
-
-  EXPECT_TRUE(matches("void x() { !false; }", OperatorOnFalse));
-  EXPECT_TRUE(notMatches("void x() { !true; }", OperatorOnFalse));
-}
-
-TEST(Matcher, UnaryOperatorTypes) {
-  // Integration test that verifies the AST provides all unary operators in
-  // a way we expect.
-  EXPECT_TRUE(matches("bool b = !true;", unaryOperator(hasOperatorName("!"))));
-  EXPECT_TRUE(
-      matches("bool b; bool *p = &b;", unaryOperator(hasOperatorName("&"))));
-  EXPECT_TRUE(matches("int i = ~ 1;", unaryOperator(hasOperatorName("~"))));
-  EXPECT_TRUE(
-      matches("bool *p; bool b = *p;", unaryOperator(hasOperatorName("*"))));
-  EXPECT_TRUE(
-      matches("int i; int j = +i;", unaryOperator(hasOperatorName("+"))));
-  EXPECT_TRUE(
-      matches("int i; int j = -i;", unaryOperator(hasOperatorName("-"))));
-  EXPECT_TRUE(
-      matches("int i; int j = ++i;", unaryOperator(hasOperatorName("++"))));
-  EXPECT_TRUE(
-      matches("int i; int j = i++;", unaryOperator(hasOperatorName("++"))));
-  EXPECT_TRUE(
-      matches("int i; int j = --i;", unaryOperator(hasOperatorName("--"))));
-  EXPECT_TRUE(
-      matches("int i; int j = i--;", unaryOperator(hasOperatorName("--"))));
-
-  // We don't match conversion operators.
-  EXPECT_TRUE(notMatches("int i; double d = (double)i;", unaryOperator()));
-
-  // Function calls are not represented as operator.
-  EXPECT_TRUE(notMatches("void f(); void x() { f(); }", unaryOperator()));
-
-  // Overloaded operators do not match at all.
-  // FIXME: We probably want to add that.
-  EXPECT_TRUE(notMatches(
-      "struct A { bool operator!() const { return false; } };"
-      "void x() { A a; !a; }", unaryOperator(hasOperatorName("!"))));
-}
-
-TEST(Matcher, ConditionalOperator) {
-  StatementMatcher Conditional = conditionalOperator(
-      hasCondition(cxxBoolLiteral(equals(true))),
-      hasTrueExpression(cxxBoolLiteral(equals(false))));
-
-  EXPECT_TRUE(matches("void x() { true ? false : true; }", Conditional));
-  EXPECT_TRUE(notMatches("void x() { false ? false : true; }", Conditional));
-  EXPECT_TRUE(notMatches("void x() { true ? true : false; }", Conditional));
-
-  StatementMatcher ConditionalFalse = conditionalOperator(
-      hasFalseExpression(cxxBoolLiteral(equals(false))));
-
-  EXPECT_TRUE(matches("void x() { true ? true : false; }", ConditionalFalse));
-  EXPECT_TRUE(
-      notMatches("void x() { true ? false : true; }", ConditionalFalse));
-}
-
-TEST(ArraySubscriptMatchers, ArraySubscripts) {
-  EXPECT_TRUE(matches("int i[2]; void f() { i[1] = 1; }",
-                      arraySubscriptExpr()));
-  EXPECT_TRUE(notMatches("int i; void f() { i = 1; }",
-                         arraySubscriptExpr()));
-}
-
-TEST(ArraySubscriptMatchers, ArrayIndex) {
-  EXPECT_TRUE(matches(
-      "int i[2]; void f() { i[1] = 1; }",
-      arraySubscriptExpr(hasIndex(integerLiteral(equals(1))))));
-  EXPECT_TRUE(matches(
-      "int i[2]; void f() { 1[i] = 1; }",
-      arraySubscriptExpr(hasIndex(integerLiteral(equals(1))))));
-  EXPECT_TRUE(notMatches(
-      "int i[2]; void f() { i[1] = 1; }",
-      arraySubscriptExpr(hasIndex(integerLiteral(equals(0))))));
-}
-
-TEST(ArraySubscriptMatchers, MatchesArrayBase) {
-  EXPECT_TRUE(matches(
-      "int i[2]; void f() { i[1] = 2; }",
-      arraySubscriptExpr(hasBase(implicitCastExpr(
-          hasSourceExpression(declRefExpr()))))));
-}
-
-TEST(Matcher, HasNameSupportsNamespaces) {
-  EXPECT_TRUE(matches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("a::b::C"))));
-  EXPECT_TRUE(matches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("::a::b::C"))));
-  EXPECT_TRUE(matches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("b::C"))));
-  EXPECT_TRUE(matches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("c::b::C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("a::c::C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("a::b::A"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("::C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("::b::C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("z::a::b::C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class C; } }",
-              recordDecl(hasName("a+b::C"))));
-  EXPECT_TRUE(notMatches("namespace a { namespace b { class AC; } }",
-              recordDecl(hasName("C"))));
-}
-
-TEST(Matcher, HasNameSupportsOuterClasses) {
-  EXPECT_TRUE(
-      matches("class A { class B { class C; }; };",
-              recordDecl(hasName("A::B::C"))));
-  EXPECT_TRUE(
-      matches("class A { class B { class C; }; };",
-              recordDecl(hasName("::A::B::C"))));
-  EXPECT_TRUE(
-      matches("class A { class B { class C; }; };",
-              recordDecl(hasName("B::C"))));
-  EXPECT_TRUE(
-      matches("class A { class B { class C; }; };",
-              recordDecl(hasName("C"))));
-  EXPECT_TRUE(
-      notMatches("class A { class B { class C; }; };",
-                 recordDecl(hasName("c::B::C"))));
-  EXPECT_TRUE(
-      notMatches("class A { class B { class C; }; };",
-                 recordDecl(hasName("A::c::C"))));
-  EXPECT_TRUE(
-      notMatches("class A { class B { class C; }; };",
-                 recordDecl(hasName("A::B::A"))));
-  EXPECT_TRUE(
-      notMatches("class A { class B { class C; }; };",
-                 recordDecl(hasName("::C"))));
-  EXPECT_TRUE(
-      notMatches("class A { class B { class C; }; };",
-                 recordDecl(hasName("::B::C"))));
-  EXPECT_TRUE(notMatches("class A { class B { class C; }; };",
-              recordDecl(hasName("z::A::B::C"))));
-  EXPECT_TRUE(
-      notMatches("class A { class B { class C; }; };",
-                 recordDecl(hasName("A+B::C"))));
-}
-
-TEST(Matcher, IsDefinition) {
-  DeclarationMatcher DefinitionOfClassA =
-      recordDecl(hasName("A"), isDefinition());
-  EXPECT_TRUE(matches("class A {};", DefinitionOfClassA));
-  EXPECT_TRUE(notMatches("class A;", DefinitionOfClassA));
-
-  DeclarationMatcher DefinitionOfVariableA =
-      varDecl(hasName("a"), isDefinition());
-  EXPECT_TRUE(matches("int a;", DefinitionOfVariableA));
-  EXPECT_TRUE(notMatches("extern int a;", DefinitionOfVariableA));
-
-  DeclarationMatcher DefinitionOfMethodA =
-      cxxMethodDecl(hasName("a"), isDefinition());
-  EXPECT_TRUE(matches("class A { void a() {} };", DefinitionOfMethodA));
-  EXPECT_TRUE(notMatches("class A { void a(); };", DefinitionOfMethodA));
-}
-
-TEST(Matcher, OfClass) {
-  StatementMatcher Constructor = cxxConstructExpr(hasDeclaration(cxxMethodDecl(
-      ofClass(hasName("X")))));
-
-  EXPECT_TRUE(
-      matches("class X { public: X(); }; void x(int) { X x; }", Constructor));
-  EXPECT_TRUE(
-      matches("class X { public: X(); }; void x(int) { X x = X(); }",
-              Constructor));
-  EXPECT_TRUE(
-      notMatches("class Y { public: Y(); }; void x(int) { Y y; }",
-                 Constructor));
-}
-
-TEST(Matcher, VisitsTemplateInstantiations) {
-  EXPECT_TRUE(matches(
-      "class A { public: void x(); };"
-      "template <typename T> class B { public: void y() { T t; t.x(); } };"
-      "void f() { B<A> b; b.y(); }",
-      callExpr(callee(cxxMethodDecl(hasName("x"))))));
-
-  EXPECT_TRUE(matches(
-      "class A { public: void x(); };"
-      "class C {"
-      " public:"
-      "  template <typename T> class B { public: void y() { T t; t.x(); } };"
-      "};"
-      "void f() {"
-      "  C::B<A> b; b.y();"
-      "}",
-      recordDecl(hasName("C"), hasDescendant(callExpr(
-                                   callee(cxxMethodDecl(hasName("x"))))))));
-}
-
-TEST(Matcher, HandlesNullQualTypes) {
-  // FIXME: Add a Type matcher so we can replace uses of this
-  // variable with Type(True())
-  const TypeMatcher AnyType = anything();
-
-  // We don't really care whether this matcher succeeds; we're testing that
-  // it completes without crashing.
-  EXPECT_TRUE(matches(
-      "struct A { };"
-      "template <typename T>"
-      "void f(T t) {"
-      "  T local_t(t /* this becomes a null QualType in the AST */);"
-      "}"
-      "void g() {"
-      "  f(0);"
-      "}",
-      expr(hasType(TypeMatcher(
-          anyOf(
-              TypeMatcher(hasDeclaration(anything())),
-              pointsTo(AnyType),
-              references(AnyType)
-              // Other QualType matchers should go here.
-                ))))));
-}
-
-// For testing AST_MATCHER_P().
-AST_MATCHER_P(Decl, just, internal::Matcher<Decl>, AMatcher) {
-  // Make sure all special variables are used: node, match_finder,
-  // bound_nodes_builder, and the parameter named 'AMatcher'.
-  return AMatcher.matches(Node, Finder, Builder);
-}
-
-TEST(AstMatcherPMacro, Works) {
-  DeclarationMatcher HasClassB = just(has(recordDecl(hasName("B")).bind("b")));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue("class A { class B {}; };",
-      HasClassB, new VerifyIdIsBoundTo<Decl>("b")));
-
-  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class B {}; };",
-      HasClassB, new VerifyIdIsBoundTo<Decl>("a")));
-
-  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class C {}; };",
-      HasClassB, new VerifyIdIsBoundTo<Decl>("b")));
-}
-
-AST_POLYMORPHIC_MATCHER_P(polymorphicHas,
-                          AST_POLYMORPHIC_SUPPORTED_TYPES(Decl, Stmt),
-                          internal::Matcher<Decl>, AMatcher) {
-  return Finder->matchesChildOf(
-      Node, AMatcher, Builder,
-      ASTMatchFinder::TK_IgnoreImplicitCastsAndParentheses,
-      ASTMatchFinder::BK_First);
-}
-
-TEST(AstPolymorphicMatcherPMacro, Works) {
-  DeclarationMatcher HasClassB =
-      polymorphicHas(recordDecl(hasName("B")).bind("b"));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue("class A { class B {}; };",
-      HasClassB, new VerifyIdIsBoundTo<Decl>("b")));
-
-  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class B {}; };",
-      HasClassB, new VerifyIdIsBoundTo<Decl>("a")));
-
-  EXPECT_TRUE(matchAndVerifyResultFalse("class A { class C {}; };",
-      HasClassB, new VerifyIdIsBoundTo<Decl>("b")));
-
-  StatementMatcher StatementHasClassB =
-      polymorphicHas(recordDecl(hasName("B")));
-
-  EXPECT_TRUE(matches("void x() { class B {}; }", StatementHasClassB));
-}
-
-TEST(For, FindsForLoops) {
-  EXPECT_TRUE(matches("void f() { for(;;); }", forStmt()));
-  EXPECT_TRUE(matches("void f() { if(true) for(;;); }", forStmt()));
-  EXPECT_TRUE(notMatches("int as[] = { 1, 2, 3 };"
-                         "void f() { for (auto &a : as); }",
-                         forStmt()));
-}
-
-TEST(For, ForLoopInternals) {
-  EXPECT_TRUE(matches("void f(){ int i; for (; i < 3 ; ); }",
-                      forStmt(hasCondition(anything()))));
-  EXPECT_TRUE(matches("void f() { for (int i = 0; ;); }",
-                      forStmt(hasLoopInit(anything()))));
-}
-
-TEST(For, ForRangeLoopInternals) {
-  EXPECT_TRUE(matches("void f(){ int a[] {1, 2}; for (int i : a); }",
-                      cxxForRangeStmt(hasLoopVariable(anything()))));
-  EXPECT_TRUE(matches(
-      "void f(){ int a[] {1, 2}; for (int i : a); }",
-      cxxForRangeStmt(hasRangeInit(declRefExpr(to(varDecl(hasName("a"))))))));
-}
-
-TEST(For, NegativeForLoopInternals) {
-  EXPECT_TRUE(notMatches("void f(){ for (int i = 0; ; ++i); }",
-                         forStmt(hasCondition(expr()))));
-  EXPECT_TRUE(notMatches("void f() {int i; for (; i < 4; ++i) {} }",
-                         forStmt(hasLoopInit(anything()))));
-}
-
-TEST(For, ReportsNoFalsePositives) {
-  EXPECT_TRUE(notMatches("void f() { ; }", forStmt()));
-  EXPECT_TRUE(notMatches("void f() { if(true); }", forStmt()));
-}
-
-TEST(CompoundStatement, HandlesSimpleCases) {
-  EXPECT_TRUE(notMatches("void f();", compoundStmt()));
-  EXPECT_TRUE(matches("void f() {}", compoundStmt()));
-  EXPECT_TRUE(matches("void f() {{}}", compoundStmt()));
-}
-
-TEST(CompoundStatement, DoesNotMatchEmptyStruct) {
-  // It's not a compound statement just because there's "{}" in the source
-  // text. This is an AST search, not grep.
-  EXPECT_TRUE(notMatches("namespace n { struct S {}; }",
-              compoundStmt()));
-  EXPECT_TRUE(matches("namespace n { struct S { void f() {{}} }; }",
-              compoundStmt()));
-}
-
-TEST(HasBody, FindsBodyOfForWhileDoLoops) {
-  EXPECT_TRUE(matches("void f() { for(;;) {} }",
-              forStmt(hasBody(compoundStmt()))));
-  EXPECT_TRUE(notMatches("void f() { for(;;); }",
-              forStmt(hasBody(compoundStmt()))));
-  EXPECT_TRUE(matches("void f() { while(true) {} }",
-              whileStmt(hasBody(compoundStmt()))));
-  EXPECT_TRUE(matches("void f() { do {} while(true); }",
-              doStmt(hasBody(compoundStmt()))));
-  EXPECT_TRUE(matches("void f() { int p[2]; for (auto x : p) {} }",
-              cxxForRangeStmt(hasBody(compoundStmt()))));
-}
-
-TEST(HasAnySubstatement, MatchesForTopLevelCompoundStatement) {
-  // The simplest case: every compound statement is in a function
-  // definition, and the function body itself must be a compound
-  // statement.
-  EXPECT_TRUE(matches("void f() { for (;;); }",
-              compoundStmt(hasAnySubstatement(forStmt()))));
-}
-
-TEST(HasAnySubstatement, IsNotRecursive) {
-  // It's really "has any immediate substatement".
-  EXPECT_TRUE(notMatches("void f() { if (true) for (;;); }",
-              compoundStmt(hasAnySubstatement(forStmt()))));
-}
-
-TEST(HasAnySubstatement, MatchesInNestedCompoundStatements) {
-  EXPECT_TRUE(matches("void f() { if (true) { for (;;); } }",
-              compoundStmt(hasAnySubstatement(forStmt()))));
-}
-
-TEST(HasAnySubstatement, FindsSubstatementBetweenOthers) {
-  EXPECT_TRUE(matches("void f() { 1; 2; 3; for (;;); 4; 5; 6; }",
-              compoundStmt(hasAnySubstatement(forStmt()))));
-}
-
-TEST(StatementCountIs, FindsNoStatementsInAnEmptyCompoundStatement) {
-  EXPECT_TRUE(matches("void f() { }",
-              compoundStmt(statementCountIs(0))));
-  EXPECT_TRUE(notMatches("void f() {}",
-              compoundStmt(statementCountIs(1))));
-}
-
-TEST(StatementCountIs, AppearsToMatchOnlyOneCount) {
-  EXPECT_TRUE(matches("void f() { 1; }",
-              compoundStmt(statementCountIs(1))));
-  EXPECT_TRUE(notMatches("void f() { 1; }",
-              compoundStmt(statementCountIs(0))));
-  EXPECT_TRUE(notMatches("void f() { 1; }",
-              compoundStmt(statementCountIs(2))));
-}
-
-TEST(StatementCountIs, WorksWithMultipleStatements) {
-  EXPECT_TRUE(matches("void f() { 1; 2; 3; }",
-              compoundStmt(statementCountIs(3))));
-}
-
-TEST(StatementCountIs, WorksWithNestedCompoundStatements) {
-  EXPECT_TRUE(matches("void f() { { 1; } { 1; 2; 3; 4; } }",
-              compoundStmt(statementCountIs(1))));
-  EXPECT_TRUE(matches("void f() { { 1; } { 1; 2; 3; 4; } }",
-              compoundStmt(statementCountIs(2))));
-  EXPECT_TRUE(notMatches("void f() { { 1; } { 1; 2; 3; 4; } }",
-              compoundStmt(statementCountIs(3))));
-  EXPECT_TRUE(matches("void f() { { 1; } { 1; 2; 3; 4; } }",
-              compoundStmt(statementCountIs(4))));
-}
-
-TEST(Member, WorksInSimplestCase) {
-  EXPECT_TRUE(matches("struct { int first; } s; int i(s.first);",
-                      memberExpr(member(hasName("first")))));
-}
-
-TEST(Member, DoesNotMatchTheBaseExpression) {
-  // Don't pick out the wrong part of the member expression, this should
-  // be checking the member (name) only.
-  EXPECT_TRUE(notMatches("struct { int i; } first; int i(first.i);",
-                         memberExpr(member(hasName("first")))));
-}
-
-TEST(Member, MatchesInMemberFunctionCall) {
-  EXPECT_TRUE(matches("void f() {"
-                      "  struct { void first() {}; } s;"
-                      "  s.first();"
-                      "};",
-                      memberExpr(member(hasName("first")))));
-}
-
-TEST(Member, MatchesMember) {
-  EXPECT_TRUE(matches(
-      "struct A { int i; }; void f() { A a; a.i = 2; }",
-      memberExpr(hasDeclaration(fieldDecl(hasType(isInteger()))))));
-  EXPECT_TRUE(notMatches(
-      "struct A { float f; }; void f() { A a; a.f = 2.0f; }",
-      memberExpr(hasDeclaration(fieldDecl(hasType(isInteger()))))));
-}
-
-TEST(Member, UnderstandsAccess) {
-  EXPECT_TRUE(matches(
-      "struct A { int i; };", fieldDecl(isPublic(), hasName("i"))));
-  EXPECT_TRUE(notMatches(
-      "struct A { int i; };", fieldDecl(isProtected(), hasName("i"))));
-  EXPECT_TRUE(notMatches(
-      "struct A { int i; };", fieldDecl(isPrivate(), hasName("i"))));
-
-  EXPECT_TRUE(notMatches(
-      "class A { int i; };", fieldDecl(isPublic(), hasName("i"))));
-  EXPECT_TRUE(notMatches(
-      "class A { int i; };", fieldDecl(isProtected(), hasName("i"))));
-  EXPECT_TRUE(matches(
-      "class A { int i; };", fieldDecl(isPrivate(), hasName("i"))));
-
-  EXPECT_TRUE(notMatches(
-      "class A { protected: int i; };", fieldDecl(isPublic(), hasName("i"))));
-  EXPECT_TRUE(matches("class A { protected: int i; };",
-                      fieldDecl(isProtected(), hasName("i"))));
-  EXPECT_TRUE(notMatches(
-      "class A { protected: int i; };", fieldDecl(isPrivate(), hasName("i"))));
-  
-  // Non-member decls have the AccessSpecifier AS_none and thus aren't matched.
-  EXPECT_TRUE(notMatches("int i;", varDecl(isPublic(), hasName("i"))));
-  EXPECT_TRUE(notMatches("int i;", varDecl(isProtected(), hasName("i"))));
-  EXPECT_TRUE(notMatches("int i;", varDecl(isPrivate(), hasName("i"))));
-}
-
-TEST(Member, MatchesMemberAllocationFunction) {
-  // Fails in C++11 mode
-  EXPECT_TRUE(matchesConditionally(
-      "namespace std { typedef typeof(sizeof(int)) size_t; }"
-      "class X { void *operator new(std::size_t); };",
-      cxxMethodDecl(ofClass(hasName("X"))), true, "-std=gnu++98"));
-
-  EXPECT_TRUE(matches("class X { void operator delete(void*); };",
-                      cxxMethodDecl(ofClass(hasName("X")))));
-
-  // Fails in C++11 mode
-  EXPECT_TRUE(matchesConditionally(
-      "namespace std { typedef typeof(sizeof(int)) size_t; }"
-      "class X { void operator delete[](void*, std::size_t); };",
-      cxxMethodDecl(ofClass(hasName("X"))), true, "-std=gnu++98"));
-}
-
-TEST(HasObjectExpression, DoesNotMatchMember) {
-  EXPECT_TRUE(notMatches(
-      "class X {}; struct Z { X m; }; void f(Z z) { z.m; }",
-      memberExpr(hasObjectExpression(hasType(recordDecl(hasName("X")))))));
-}
-
-TEST(HasObjectExpression, MatchesBaseOfVariable) {
-  EXPECT_TRUE(matches(
-      "struct X { int m; }; void f(X x) { x.m; }",
-      memberExpr(hasObjectExpression(hasType(recordDecl(hasName("X")))))));
-  EXPECT_TRUE(matches(
-      "struct X { int m; }; void f(X* x) { x->m; }",
-      memberExpr(hasObjectExpression(
-          hasType(pointsTo(recordDecl(hasName("X"))))))));
-}
-
-TEST(HasObjectExpression,
-     MatchesObjectExpressionOfImplicitlyFormedMemberExpression) {
-  EXPECT_TRUE(matches(
-      "class X {}; struct S { X m; void f() { this->m; } };",
-      memberExpr(hasObjectExpression(
-          hasType(pointsTo(recordDecl(hasName("S"))))))));
-  EXPECT_TRUE(matches(
-      "class X {}; struct S { X m; void f() { m; } };",
-      memberExpr(hasObjectExpression(
-          hasType(pointsTo(recordDecl(hasName("S"))))))));
-}
-
-TEST(Field, DoesNotMatchNonFieldMembers) {
-  EXPECT_TRUE(notMatches("class X { void m(); };", fieldDecl(hasName("m"))));
-  EXPECT_TRUE(notMatches("class X { class m {}; };", fieldDecl(hasName("m"))));
-  EXPECT_TRUE(notMatches("class X { enum { m }; };", fieldDecl(hasName("m"))));
-  EXPECT_TRUE(notMatches("class X { enum m {}; };", fieldDecl(hasName("m"))));
-}
-
-TEST(Field, MatchesField) {
-  EXPECT_TRUE(matches("class X { int m; };", fieldDecl(hasName("m"))));
-}
-
-TEST(IsVolatileQualified, QualifiersMatch) {
-  EXPECT_TRUE(matches("volatile int i = 42;",
-                      varDecl(hasType(isVolatileQualified()))));
-  EXPECT_TRUE(notMatches("volatile int *i;",
-                         varDecl(hasType(isVolatileQualified()))));
-  EXPECT_TRUE(matches("typedef volatile int v_int; v_int i = 42;",
-                      varDecl(hasType(isVolatileQualified()))));
-}
-
-TEST(IsConstQualified, MatchesConstInt) {
-  EXPECT_TRUE(matches("const int i = 42;",
-                      varDecl(hasType(isConstQualified()))));
-}
-
-TEST(IsConstQualified, MatchesConstPointer) {
-  EXPECT_TRUE(matches("int i = 42; int* const p(&i);",
-                      varDecl(hasType(isConstQualified()))));
-}
-
-TEST(IsConstQualified, MatchesThroughTypedef) {
-  EXPECT_TRUE(matches("typedef const int const_int; const_int i = 42;",
-                      varDecl(hasType(isConstQualified()))));
-  EXPECT_TRUE(matches("typedef int* int_ptr; const int_ptr p(0);",
-                      varDecl(hasType(isConstQualified()))));
-}
-
-TEST(IsConstQualified, DoesNotMatchInappropriately) {
-  EXPECT_TRUE(notMatches("typedef int nonconst_int; nonconst_int i = 42;",
-                         varDecl(hasType(isConstQualified()))));
-  EXPECT_TRUE(notMatches("int const* p;",
-                         varDecl(hasType(isConstQualified()))));
-}
-
-TEST(CastExpression, MatchesExplicitCasts) {
-  EXPECT_TRUE(matches("char *p = reinterpret_cast<char *>(&p);",castExpr()));
-  EXPECT_TRUE(matches("void *p = (void *)(&p);", castExpr()));
-  EXPECT_TRUE(matches("char q, *p = const_cast<char *>(&q);", castExpr()));
-  EXPECT_TRUE(matches("char c = char(0);", castExpr()));
-}
-TEST(CastExpression, MatchesImplicitCasts) {
-  // This test creates an implicit cast from int to char.
-  EXPECT_TRUE(matches("char c = 0;", castExpr()));
-  // This test creates an implicit cast from lvalue to rvalue.
-  EXPECT_TRUE(matches("char c = 0, d = c;", castExpr()));
-}
-
-TEST(CastExpression, DoesNotMatchNonCasts) {
-  EXPECT_TRUE(notMatches("char c = '0';", castExpr()));
-  EXPECT_TRUE(notMatches("char c, &q = c;", castExpr()));
-  EXPECT_TRUE(notMatches("int i = (0);", castExpr()));
-  EXPECT_TRUE(notMatches("int i = 0;", castExpr()));
-}
-
-TEST(ReinterpretCast, MatchesSimpleCase) {
-  EXPECT_TRUE(matches("char* p = reinterpret_cast<char*>(&p);",
-                      cxxReinterpretCastExpr()));
-}
-
-TEST(ReinterpretCast, DoesNotMatchOtherCasts) {
-  EXPECT_TRUE(notMatches("char* p = (char*)(&p);", cxxReinterpretCastExpr()));
-  EXPECT_TRUE(notMatches("char q, *p = const_cast<char*>(&q);",
-                         cxxReinterpretCastExpr()));
-  EXPECT_TRUE(notMatches("void* p = static_cast<void*>(&p);",
-                         cxxReinterpretCastExpr()));
-  EXPECT_TRUE(notMatches("struct B { virtual ~B() {} }; struct D : B {};"
-                         "B b;"
-                         "D* p = dynamic_cast<D*>(&b);",
-                         cxxReinterpretCastExpr()));
-}
-
-TEST(FunctionalCast, MatchesSimpleCase) {
-  std::string foo_class = "class Foo { public: Foo(const char*); };";
-  EXPECT_TRUE(matches(foo_class + "void r() { Foo f = Foo(\"hello world\"); }",
-                      cxxFunctionalCastExpr()));
-}
-
-TEST(FunctionalCast, DoesNotMatchOtherCasts) {
-  std::string FooClass = "class Foo { public: Foo(const char*); };";
-  EXPECT_TRUE(
-      notMatches(FooClass + "void r() { Foo f = (Foo) \"hello world\"; }",
-                 cxxFunctionalCastExpr()));
-  EXPECT_TRUE(
-      notMatches(FooClass + "void r() { Foo f = \"hello world\"; }",
-                 cxxFunctionalCastExpr()));
-}
-
-TEST(DynamicCast, MatchesSimpleCase) {
-  EXPECT_TRUE(matches("struct B { virtual ~B() {} }; struct D : B {};"
-                      "B b;"
-                      "D* p = dynamic_cast<D*>(&b);",
-                      cxxDynamicCastExpr()));
-}
-
-TEST(StaticCast, MatchesSimpleCase) {
-  EXPECT_TRUE(matches("void* p(static_cast<void*>(&p));",
-                      cxxStaticCastExpr()));
-}
-
-TEST(StaticCast, DoesNotMatchOtherCasts) {
-  EXPECT_TRUE(notMatches("char* p = (char*)(&p);", cxxStaticCastExpr()));
-  EXPECT_TRUE(notMatches("char q, *p = const_cast<char*>(&q);",
-                         cxxStaticCastExpr()));
-  EXPECT_TRUE(notMatches("void* p = reinterpret_cast<char*>(&p);",
-                         cxxStaticCastExpr()));
-  EXPECT_TRUE(notMatches("struct B { virtual ~B() {} }; struct D : B {};"
-                         "B b;"
-                         "D* p = dynamic_cast<D*>(&b);",
-                         cxxStaticCastExpr()));
-}
-
-TEST(CStyleCast, MatchesSimpleCase) {
-  EXPECT_TRUE(matches("int i = (int) 2.2f;", cStyleCastExpr()));
-}
-
-TEST(CStyleCast, DoesNotMatchOtherCasts) {
-  EXPECT_TRUE(notMatches("char* p = static_cast<char*>(0);"
-                         "char q, *r = const_cast<char*>(&q);"
-                         "void* s = reinterpret_cast<char*>(&s);"
-                         "struct B { virtual ~B() {} }; struct D : B {};"
-                         "B b;"
-                         "D* t = dynamic_cast<D*>(&b);",
-                         cStyleCastExpr()));
-}
-
-TEST(HasDestinationType, MatchesSimpleCase) {
-  EXPECT_TRUE(matches("char* p = static_cast<char*>(0);",
-                      cxxStaticCastExpr(hasDestinationType(
-                          pointsTo(TypeMatcher(anything()))))));
-}
-
-TEST(HasImplicitDestinationType, MatchesSimpleCase) {
-  // This test creates an implicit const cast.
-  EXPECT_TRUE(matches("int x; const int i = x;",
-                      implicitCastExpr(
-                          hasImplicitDestinationType(isInteger()))));
-  // This test creates an implicit array-to-pointer cast.
-  EXPECT_TRUE(matches("int arr[3]; int *p = arr;",
-                      implicitCastExpr(hasImplicitDestinationType(
-                          pointsTo(TypeMatcher(anything()))))));
-}
-
-TEST(HasImplicitDestinationType, DoesNotMatchIncorrectly) {
-  // This test creates an implicit cast from int to char.
-  EXPECT_TRUE(notMatches("char c = 0;",
-                      implicitCastExpr(hasImplicitDestinationType(
-                          unless(anything())))));
-  // This test creates an implicit array-to-pointer cast.
-  EXPECT_TRUE(notMatches("int arr[3]; int *p = arr;",
-                      implicitCastExpr(hasImplicitDestinationType(
-                          unless(anything())))));
-}
-
-TEST(ImplicitCast, MatchesSimpleCase) {
-  // This test creates an implicit const cast.
-  EXPECT_TRUE(matches("int x = 0; const int y = x;",
-                      varDecl(hasInitializer(implicitCastExpr()))));
-  // This test creates an implicit cast from int to char.
-  EXPECT_TRUE(matches("char c = 0;",
-                      varDecl(hasInitializer(implicitCastExpr()))));
-  // This test creates an implicit array-to-pointer cast.
-  EXPECT_TRUE(matches("int arr[6]; int *p = arr;",
-                      varDecl(hasInitializer(implicitCastExpr()))));
-}
-
-TEST(ImplicitCast, DoesNotMatchIncorrectly) {
-  // This test verifies that implicitCastExpr() matches exactly when implicit casts
-  // are present, and that it ignores explicit and paren casts.
-
-  // These two test cases have no casts.
-  EXPECT_TRUE(notMatches("int x = 0;",
-                         varDecl(hasInitializer(implicitCastExpr()))));
-  EXPECT_TRUE(notMatches("int x = 0, &y = x;",
-                         varDecl(hasInitializer(implicitCastExpr()))));
-
-  EXPECT_TRUE(notMatches("int x = 0; double d = (double) x;",
-                         varDecl(hasInitializer(implicitCastExpr()))));
-  EXPECT_TRUE(notMatches("const int *p; int *q = const_cast<int *>(p);",
-                         varDecl(hasInitializer(implicitCastExpr()))));
-
-  EXPECT_TRUE(notMatches("int x = (0);",
-                         varDecl(hasInitializer(implicitCastExpr()))));
-}
-
-TEST(IgnoringImpCasts, MatchesImpCasts) {
-  // This test checks that ignoringImpCasts matches when implicit casts are
-  // present and its inner matcher alone does not match.
-  // Note that this test creates an implicit const cast.
-  EXPECT_TRUE(matches("int x = 0; const int y = x;",
-                      varDecl(hasInitializer(ignoringImpCasts(
-                          declRefExpr(to(varDecl(hasName("x")))))))));
-  // This test creates an implict cast from int to char.
-  EXPECT_TRUE(matches("char x = 0;",
-                      varDecl(hasInitializer(ignoringImpCasts(
-                          integerLiteral(equals(0)))))));
-}
-
-TEST(IgnoringImpCasts, DoesNotMatchIncorrectly) {
-  // These tests verify that ignoringImpCasts does not match if the inner
-  // matcher does not match.
-  // Note that the first test creates an implicit const cast.
-  EXPECT_TRUE(notMatches("int x; const int y = x;",
-                         varDecl(hasInitializer(ignoringImpCasts(
-                             unless(anything()))))));
-  EXPECT_TRUE(notMatches("int x; int y = x;",
-                         varDecl(hasInitializer(ignoringImpCasts(
-                             unless(anything()))))));
-
-  // These tests verify that ignoringImplictCasts does not look through explicit
-  // casts or parentheses.
-  EXPECT_TRUE(notMatches("char* p = static_cast<char*>(0);",
-                         varDecl(hasInitializer(ignoringImpCasts(
-                             integerLiteral())))));
-  EXPECT_TRUE(notMatches("int i = (0);",
-                         varDecl(hasInitializer(ignoringImpCasts(
-                             integerLiteral())))));
-  EXPECT_TRUE(notMatches("float i = (float)0;",
-                         varDecl(hasInitializer(ignoringImpCasts(
-                             integerLiteral())))));
-  EXPECT_TRUE(notMatches("float i = float(0);",
-                         varDecl(hasInitializer(ignoringImpCasts(
-                             integerLiteral())))));
-}
-
-TEST(IgnoringImpCasts, MatchesWithoutImpCasts) {
-  // This test verifies that expressions that do not have implicit casts
-  // still match the inner matcher.
-  EXPECT_TRUE(matches("int x = 0; int &y = x;",
-                      varDecl(hasInitializer(ignoringImpCasts(
-                          declRefExpr(to(varDecl(hasName("x")))))))));
-}
-
-TEST(IgnoringParenCasts, MatchesParenCasts) {
-  // This test checks that ignoringParenCasts matches when parentheses and/or
-  // casts are present and its inner matcher alone does not match.
-  EXPECT_TRUE(matches("int x = (0);",
-                      varDecl(hasInitializer(ignoringParenCasts(
-                          integerLiteral(equals(0)))))));
-  EXPECT_TRUE(matches("int x = (((((0)))));",
-                      varDecl(hasInitializer(ignoringParenCasts(
-                          integerLiteral(equals(0)))))));
-
-  // This test creates an implict cast from int to char in addition to the
-  // parentheses.
-  EXPECT_TRUE(matches("char x = (0);",
-                      varDecl(hasInitializer(ignoringParenCasts(
-                          integerLiteral(equals(0)))))));
-
-  EXPECT_TRUE(matches("char x = (char)0;",
-                      varDecl(hasInitializer(ignoringParenCasts(
-                          integerLiteral(equals(0)))))));
-  EXPECT_TRUE(matches("char* p = static_cast<char*>(0);",
-                      varDecl(hasInitializer(ignoringParenCasts(
-                          integerLiteral(equals(0)))))));
-}
-
-TEST(IgnoringParenCasts, MatchesWithoutParenCasts) {
-  // This test verifies that expressions that do not have any casts still match.
-  EXPECT_TRUE(matches("int x = 0;",
-                      varDecl(hasInitializer(ignoringParenCasts(
-                          integerLiteral(equals(0)))))));
-}
-
-TEST(IgnoringParenCasts, DoesNotMatchIncorrectly) {
-  // These tests verify that ignoringImpCasts does not match if the inner
-  // matcher does not match.
-  EXPECT_TRUE(notMatches("int x = ((0));",
-                         varDecl(hasInitializer(ignoringParenCasts(
-                             unless(anything()))))));
-
-  // This test creates an implicit cast from int to char in addition to the
-  // parentheses.
-  EXPECT_TRUE(notMatches("char x = ((0));",
-                         varDecl(hasInitializer(ignoringParenCasts(
-                             unless(anything()))))));
-
-  EXPECT_TRUE(notMatches("char *x = static_cast<char *>((0));",
-                         varDecl(hasInitializer(ignoringParenCasts(
-                             unless(anything()))))));
-}
-
-TEST(IgnoringParenAndImpCasts, MatchesParenImpCasts) {
-  // This test checks that ignoringParenAndImpCasts matches when
-  // parentheses and/or implicit casts are present and its inner matcher alone
-  // does not match.
-  // Note that this test creates an implicit const cast.
-  EXPECT_TRUE(matches("int x = 0; const int y = x;",
-                      varDecl(hasInitializer(ignoringParenImpCasts(
-                          declRefExpr(to(varDecl(hasName("x")))))))));
-  // This test creates an implicit cast from int to char.
-  EXPECT_TRUE(matches("const char x = (0);",
-                      varDecl(hasInitializer(ignoringParenImpCasts(
-                          integerLiteral(equals(0)))))));
-}
-
-TEST(IgnoringParenAndImpCasts, MatchesWithoutParenImpCasts) {
-  // This test verifies that expressions that do not have parentheses or
-  // implicit casts still match.
-  EXPECT_TRUE(matches("int x = 0; int &y = x;",
-                      varDecl(hasInitializer(ignoringParenImpCasts(
-                          declRefExpr(to(varDecl(hasName("x")))))))));
-  EXPECT_TRUE(matches("int x = 0;",
-                      varDecl(hasInitializer(ignoringParenImpCasts(
-                          integerLiteral(equals(0)))))));
-}
-
-TEST(IgnoringParenAndImpCasts, DoesNotMatchIncorrectly) {
-  // These tests verify that ignoringParenImpCasts does not match if
-  // the inner matcher does not match.
-  // This test creates an implicit cast.
-  EXPECT_TRUE(notMatches("char c = ((3));",
-                         varDecl(hasInitializer(ignoringParenImpCasts(
-                             unless(anything()))))));
-  // These tests verify that ignoringParenAndImplictCasts does not look
-  // through explicit casts.
-  EXPECT_TRUE(notMatches("float y = (float(0));",
-                         varDecl(hasInitializer(ignoringParenImpCasts(
-                             integerLiteral())))));
-  EXPECT_TRUE(notMatches("float y = (float)0;",
-                         varDecl(hasInitializer(ignoringParenImpCasts(
-                             integerLiteral())))));
-  EXPECT_TRUE(notMatches("char* p = static_cast<char*>(0);",
-                         varDecl(hasInitializer(ignoringParenImpCasts(
-                             integerLiteral())))));
-}
-
-TEST(HasSourceExpression, MatchesImplicitCasts) {
-  EXPECT_TRUE(matches("class string {}; class URL { public: URL(string s); };"
-                      "void r() {string a_string; URL url = a_string; }",
-                      implicitCastExpr(
-                          hasSourceExpression(cxxConstructExpr()))));
-}
-
-TEST(HasSourceExpression, MatchesExplicitCasts) {
-  EXPECT_TRUE(matches("float x = static_cast<float>(42);",
-                      explicitCastExpr(
-                          hasSourceExpression(hasDescendant(
-                              expr(integerLiteral()))))));
-}
-
-TEST(Statement, DoesNotMatchDeclarations) {
-  EXPECT_TRUE(notMatches("class X {};", stmt()));
-}
-
-TEST(Statement, MatchesCompoundStatments) {
-  EXPECT_TRUE(matches("void x() {}", stmt()));
-}
-
-TEST(DeclarationStatement, DoesNotMatchCompoundStatements) {
-  EXPECT_TRUE(notMatches("void x() {}", declStmt()));
-}
-
-TEST(DeclarationStatement, MatchesVariableDeclarationStatements) {
-  EXPECT_TRUE(matches("void x() { int a; }", declStmt()));
-}
-
-TEST(ExprWithCleanups, MatchesExprWithCleanups) {
-  EXPECT_TRUE(matches("struct Foo { ~Foo(); };"
-                      "const Foo f = Foo();",
-                      varDecl(hasInitializer(exprWithCleanups()))));
-  EXPECT_FALSE(matches("struct Foo { };"
-                      "const Foo f = Foo();",
-                      varDecl(hasInitializer(exprWithCleanups()))));
-}
-
-TEST(InitListExpression, MatchesInitListExpression) {
-  EXPECT_TRUE(matches("int a[] = { 1, 2 };",
-                      initListExpr(hasType(asString("int [2]")))));
-  EXPECT_TRUE(matches("struct B { int x, y; }; B b = { 5, 6 };",
-                      initListExpr(hasType(recordDecl(hasName("B"))))));
-  EXPECT_TRUE(matches("struct S { S(void (*a)()); };"
-                      "void f();"
-                      "S s[1] = { &f };",
-                      declRefExpr(to(functionDecl(hasName("f"))))));
-  EXPECT_TRUE(
-      matches("int i[1] = {42, [0] = 43};", integerLiteral(equals(42))));
-}
-
-TEST(UsingDeclaration, MatchesUsingDeclarations) {
-  EXPECT_TRUE(matches("namespace X { int x; } using X::x;",
-                      usingDecl()));
-}
-
-TEST(UsingDeclaration, MatchesShadowUsingDelcarations) {
-  EXPECT_TRUE(matches("namespace f { int a; } using f::a;",
-                      usingDecl(hasAnyUsingShadowDecl(hasName("a")))));
-}
-
-TEST(UsingDeclaration, MatchesSpecificTarget) {
-  EXPECT_TRUE(matches("namespace f { int a; void b(); } using f::b;",
-                      usingDecl(hasAnyUsingShadowDecl(
-                          hasTargetDecl(functionDecl())))));
-  EXPECT_TRUE(notMatches("namespace f { int a; void b(); } using f::a;",
-                         usingDecl(hasAnyUsingShadowDecl(
-                             hasTargetDecl(functionDecl())))));
-}
-
-TEST(UsingDeclaration, ThroughUsingDeclaration) {
-  EXPECT_TRUE(matches(
-      "namespace a { void f(); } using a::f; void g() { f(); }",
-      declRefExpr(throughUsingDecl(anything()))));
-  EXPECT_TRUE(notMatches(
-      "namespace a { void f(); } using a::f; void g() { a::f(); }",
-      declRefExpr(throughUsingDecl(anything()))));
-}
-
-TEST(UsingDirectiveDeclaration, MatchesUsingNamespace) {
-  EXPECT_TRUE(matches("namespace X { int x; } using namespace X;",
-                      usingDirectiveDecl()));
-  EXPECT_FALSE(
-      matches("namespace X { int x; } using X::x;", usingDirectiveDecl()));
-}
-
-TEST(SingleDecl, IsSingleDecl) {
-  StatementMatcher SingleDeclStmt =
-      declStmt(hasSingleDecl(varDecl(hasInitializer(anything()))));
-  EXPECT_TRUE(matches("void f() {int a = 4;}", SingleDeclStmt));
-  EXPECT_TRUE(notMatches("void f() {int a;}", SingleDeclStmt));
-  EXPECT_TRUE(notMatches("void f() {int a = 4, b = 3;}",
-                          SingleDeclStmt));
-}
-
-TEST(DeclStmt, ContainsDeclaration) {
-  DeclarationMatcher MatchesInit = varDecl(hasInitializer(anything()));
-
-  EXPECT_TRUE(matches("void f() {int a = 4;}",
-                      declStmt(containsDeclaration(0, MatchesInit))));
-  EXPECT_TRUE(matches("void f() {int a = 4, b = 3;}",
-                      declStmt(containsDeclaration(0, MatchesInit),
-                               containsDeclaration(1, MatchesInit))));
-  unsigned WrongIndex = 42;
-  EXPECT_TRUE(notMatches("void f() {int a = 4, b = 3;}",
-                         declStmt(containsDeclaration(WrongIndex,
-                                                      MatchesInit))));
-}
-
-TEST(DeclCount, DeclCountIsCorrect) {
-  EXPECT_TRUE(matches("void f() {int i,j;}",
-                      declStmt(declCountIs(2))));
-  EXPECT_TRUE(notMatches("void f() {int i,j; int k;}",
-                         declStmt(declCountIs(3))));
-  EXPECT_TRUE(notMatches("void f() {int i,j, k, l;}",
-                         declStmt(declCountIs(3))));
-}
-
-TEST(While, MatchesWhileLoops) {
-  EXPECT_TRUE(notMatches("void x() {}", whileStmt()));
-  EXPECT_TRUE(matches("void x() { while(true); }", whileStmt()));
-  EXPECT_TRUE(notMatches("void x() { do {} while(true); }", whileStmt()));
-}
-
-TEST(Do, MatchesDoLoops) {
-  EXPECT_TRUE(matches("void x() { do {} while(true); }", doStmt()));
-  EXPECT_TRUE(matches("void x() { do ; while(false); }", doStmt()));
-}
-
-TEST(Do, DoesNotMatchWhileLoops) {
-  EXPECT_TRUE(notMatches("void x() { while(true) {} }", doStmt()));
-}
-
-TEST(SwitchCase, MatchesCase) {
-  EXPECT_TRUE(matches("void x() { switch(42) { case 42:; } }", switchCase()));
-  EXPECT_TRUE(matches("void x() { switch(42) { default:; } }", switchCase()));
-  EXPECT_TRUE(matches("void x() { switch(42) default:; }", switchCase()));
-  EXPECT_TRUE(notMatches("void x() { switch(42) {} }", switchCase()));
-}
-
-TEST(SwitchCase, MatchesSwitch) {
-  EXPECT_TRUE(matches("void x() { switch(42) { case 42:; } }", switchStmt()));
-  EXPECT_TRUE(matches("void x() { switch(42) { default:; } }", switchStmt()));
-  EXPECT_TRUE(matches("void x() { switch(42) default:; }", switchStmt()));
-  EXPECT_TRUE(notMatches("void x() {}", switchStmt()));
-}
-
-TEST(SwitchCase, MatchesEachCase) {
-  EXPECT_TRUE(notMatches("void x() { switch(42); }",
-                         switchStmt(forEachSwitchCase(caseStmt()))));
-  EXPECT_TRUE(matches("void x() { switch(42) case 42:; }",
-                      switchStmt(forEachSwitchCase(caseStmt()))));
-  EXPECT_TRUE(matches("void x() { switch(42) { case 42:; } }",
-                      switchStmt(forEachSwitchCase(caseStmt()))));
-  EXPECT_TRUE(notMatches(
-      "void x() { if (1) switch(42) { case 42: switch (42) { default:; } } }",
-      ifStmt(has(switchStmt(forEachSwitchCase(defaultStmt()))))));
-  EXPECT_TRUE(matches("void x() { switch(42) { case 1+1: case 4:; } }",
-                      switchStmt(forEachSwitchCase(
-                          caseStmt(hasCaseConstant(integerLiteral()))))));
-  EXPECT_TRUE(notMatches("void x() { switch(42) { case 1+1: case 2+2:; } }",
-                         switchStmt(forEachSwitchCase(
-                             caseStmt(hasCaseConstant(integerLiteral()))))));
-  EXPECT_TRUE(notMatches("void x() { switch(42) { case 1 ... 2:; } }",
-                         switchStmt(forEachSwitchCase(
-                             caseStmt(hasCaseConstant(integerLiteral()))))));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void x() { switch (42) { case 1: case 2: case 3: default:; } }",
-      switchStmt(forEachSwitchCase(caseStmt().bind("x"))),
-      new VerifyIdIsBoundTo<CaseStmt>("x", 3)));
-}
-
-TEST(ForEachConstructorInitializer, MatchesInitializers) {
-  EXPECT_TRUE(matches(
-      "struct X { X() : i(42), j(42) {} int i, j; };",
-      cxxConstructorDecl(forEachConstructorInitializer(cxxCtorInitializer()))));
-}
-
-TEST(ExceptionHandling, SimpleCases) {
-  EXPECT_TRUE(matches("void foo() try { } catch(int X) { }", cxxCatchStmt()));
-  EXPECT_TRUE(matches("void foo() try { } catch(int X) { }", cxxTryStmt()));
-  EXPECT_TRUE(
-      notMatches("void foo() try { } catch(int X) { }", cxxThrowExpr()));
-  EXPECT_TRUE(matches("void foo() try { throw; } catch(int X) { }",
-                      cxxThrowExpr()));
-  EXPECT_TRUE(matches("void foo() try { throw 5;} catch(int X) { }",
-                      cxxThrowExpr()));
-  EXPECT_TRUE(matches("void foo() try { throw; } catch(...) { }",
-                      cxxCatchStmt(isCatchAll())));
-  EXPECT_TRUE(notMatches("void foo() try { throw; } catch(int) { }",
-                         cxxCatchStmt(isCatchAll())));
-  EXPECT_TRUE(matches("void foo() try {} catch(int X) { }",
-                      varDecl(isExceptionVariable())));
-  EXPECT_TRUE(notMatches("void foo() try { int X; } catch (...) { }",
-                         varDecl(isExceptionVariable())));
-}
-
-TEST(HasConditionVariableStatement, DoesNotMatchCondition) {
-  EXPECT_TRUE(notMatches(
-      "void x() { if(true) {} }",
-      ifStmt(hasConditionVariableStatement(declStmt()))));
-  EXPECT_TRUE(notMatches(
-      "void x() { int x; if((x = 42)) {} }",
-      ifStmt(hasConditionVariableStatement(declStmt()))));
-}
-
-TEST(HasConditionVariableStatement, MatchesConditionVariables) {
-  EXPECT_TRUE(matches(
-      "void x() { if(int* a = 0) {} }",
-      ifStmt(hasConditionVariableStatement(declStmt()))));
-}
-
-TEST(ForEach, BindsOneNode) {
-  EXPECT_TRUE(matchAndVerifyResultTrue("class C { int x; };",
-      recordDecl(hasName("C"), forEach(fieldDecl(hasName("x")).bind("x"))),
-      new VerifyIdIsBoundTo<FieldDecl>("x", 1)));
-}
-
-TEST(ForEach, BindsMultipleNodes) {
-  EXPECT_TRUE(matchAndVerifyResultTrue("class C { int x; int y; int z; };",
-      recordDecl(hasName("C"), forEach(fieldDecl().bind("f"))),
-      new VerifyIdIsBoundTo<FieldDecl>("f", 3)));
-}
-
-TEST(ForEach, BindsRecursiveCombinations) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { class D { int x; int y; }; class E { int y; int z; }; };",
-      recordDecl(hasName("C"),
-                 forEach(recordDecl(forEach(fieldDecl().bind("f"))))),
-      new VerifyIdIsBoundTo<FieldDecl>("f", 4)));
-}
-
-TEST(ForEachDescendant, BindsOneNode) {
-  EXPECT_TRUE(matchAndVerifyResultTrue("class C { class D { int x; }; };",
-      recordDecl(hasName("C"),
-                 forEachDescendant(fieldDecl(hasName("x")).bind("x"))),
-      new VerifyIdIsBoundTo<FieldDecl>("x", 1)));
-}
-
-TEST(ForEachDescendant, NestedForEachDescendant) {
-  DeclarationMatcher m = recordDecl(
-      isDefinition(), decl().bind("x"), hasName("C"));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-    "class A { class B { class C {}; }; };",
-    recordDecl(hasName("A"), anyOf(m, forEachDescendant(m))),
-    new VerifyIdIsBoundTo<Decl>("x", "C")));
-
-  // Check that a partial match of 'm' that binds 'x' in the
-  // first part of anyOf(m, anything()) will not overwrite the
-  // binding created by the earlier binding in the hasDescendant.
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { class B { class C {}; }; };",
-      recordDecl(hasName("A"), allOf(hasDescendant(m), anyOf(m, anything()))),
-      new VerifyIdIsBoundTo<Decl>("x", "C")));
-}
-
-TEST(ForEachDescendant, BindsMultipleNodes) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { class D { int x; int y; }; "
-      "          class E { class F { int y; int z; }; }; };",
-      recordDecl(hasName("C"), forEachDescendant(fieldDecl().bind("f"))),
-      new VerifyIdIsBoundTo<FieldDecl>("f", 4)));
-}
-
-TEST(ForEachDescendant, BindsRecursiveCombinations) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { class D { "
-      "          class E { class F { class G { int y; int z; }; }; }; }; };",
-      recordDecl(hasName("C"), forEachDescendant(recordDecl(
-          forEachDescendant(fieldDecl().bind("f"))))),
-      new VerifyIdIsBoundTo<FieldDecl>("f", 8)));
-}
-
-TEST(ForEachDescendant, BindsCombinations) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f() { if(true) {} if (true) {} while (true) {} if (true) {} while "
-      "(true) {} }",
-      compoundStmt(forEachDescendant(ifStmt().bind("if")),
-                   forEachDescendant(whileStmt().bind("while"))),
-      new VerifyIdIsBoundTo<IfStmt>("if", 6)));
-}
-
-TEST(Has, DoesNotDeleteBindings) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { int a; };", recordDecl(decl().bind("x"), has(fieldDecl())),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-}
-
-TEST(LoopingMatchers, DoNotOverwritePreviousMatchResultOnFailure) {
-  // Those matchers cover all the cases where an inner matcher is called
-  // and there is not a 1:1 relationship between the match of the outer
-  // matcher and the match of the inner matcher.
-  // The pattern to look for is:
-  //   ... return InnerMatcher.matches(...); ...
-  // In which case no special handling is needed.
-  //
-  // On the other hand, if there are multiple alternative matches
-  // (for example forEach*) or matches might be discarded (for example has*)
-  // the implementation must make sure that the discarded matches do not
-  // affect the bindings.
-  // When new such matchers are added, add a test here that:
-  // - matches a simple node, and binds it as the first thing in the matcher:
-  //     recordDecl(decl().bind("x"), hasName("X")))
-  // - uses the matcher under test afterwards in a way that not the first
-  //   alternative is matched; for anyOf, that means the first branch
-  //   would need to return false; for hasAncestor, it means that not
-  //   the direct parent matches the inner matcher.
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { int y; };",
-      recordDecl(
-          recordDecl().bind("x"), hasName("::X"),
-          anyOf(forEachDescendant(recordDecl(hasName("Y"))), anything())),
-      new VerifyIdIsBoundTo<CXXRecordDecl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X {};", recordDecl(recordDecl().bind("x"), hasName("::X"),
-                                anyOf(unless(anything()), anything())),
-      new VerifyIdIsBoundTo<CXXRecordDecl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "template<typename T1, typename T2> class X {}; X<float, int> x;",
-      classTemplateSpecializationDecl(
-          decl().bind("x"),
-          hasAnyTemplateArgument(refersToType(asString("int")))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { void f(); void g(); };",
-      cxxRecordDecl(decl().bind("x"), hasMethod(hasName("g"))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { X() : a(1), b(2) {} double a; int b; };",
-      recordDecl(decl().bind("x"),
-                 has(cxxConstructorDecl(
-                     hasAnyConstructorInitializer(forField(hasName("b")))))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void x(int, int) { x(0, 42); }",
-      callExpr(expr().bind("x"), hasAnyArgument(integerLiteral(equals(42)))),
-      new VerifyIdIsBoundTo<Expr>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void x(int, int y) {}",
-      functionDecl(decl().bind("x"), hasAnyParameter(hasName("y"))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void x() { return; if (true) {} }",
-      functionDecl(decl().bind("x"),
-                   has(compoundStmt(hasAnySubstatement(ifStmt())))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "namespace X { void b(int); void b(); }"
-      "using X::b;",
-      usingDecl(decl().bind("x"), hasAnyUsingShadowDecl(hasTargetDecl(
-                                      functionDecl(parameterCountIs(1))))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A{}; class B{}; class C : B, A {};",
-      cxxRecordDecl(decl().bind("x"), isDerivedFrom("::A")),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A{}; typedef A B; typedef A C; typedef A D;"
-      "class E : A {};",
-      cxxRecordDecl(decl().bind("x"), isDerivedFrom("C")),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { class B { void f() {} }; };",
-      functionDecl(decl().bind("x"), hasAncestor(recordDecl(hasName("::A")))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "template <typename T> struct A { struct B {"
-      "  void f() { if(true) {} }"
-      "}; };"
-      "void t() { A<int>::B b; b.f(); }",
-      ifStmt(stmt().bind("x"), hasAncestor(recordDecl(hasName("::A")))),
-      new VerifyIdIsBoundTo<Stmt>("x", 2)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A {};",
-      recordDecl(hasName("::A"), decl().bind("x"), unless(hasName("fooble"))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { A() : s(), i(42) {} const char *s; int i; };",
-      cxxConstructorDecl(hasName("::A::A"), decl().bind("x"),
-                         forEachConstructorInitializer(forField(hasName("i")))),
-      new VerifyIdIsBoundTo<Decl>("x", 1)));
-}
-
-TEST(ForEachDescendant, BindsCorrectNodes) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { void f(); int i; };",
-      recordDecl(hasName("C"), forEachDescendant(decl().bind("decl"))),
-      new VerifyIdIsBoundTo<FieldDecl>("decl", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { void f() {} int i; };",
-      recordDecl(hasName("C"), forEachDescendant(decl().bind("decl"))),
-      new VerifyIdIsBoundTo<FunctionDecl>("decl", 1)));
-}
-
-TEST(FindAll, BindsNodeOnMatch) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A {};",
-      recordDecl(hasName("::A"), findAll(recordDecl(hasName("::A")).bind("v"))),
-      new VerifyIdIsBoundTo<CXXRecordDecl>("v", 1)));
-}
-
-TEST(FindAll, BindsDescendantNodeOnMatch) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { int a; int b; };",
-      recordDecl(hasName("::A"), findAll(fieldDecl().bind("v"))),
-      new VerifyIdIsBoundTo<FieldDecl>("v", 2)));
-}
-
-TEST(FindAll, BindsNodeAndDescendantNodesOnOneMatch) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { int a; int b; };",
-      recordDecl(hasName("::A"),
-                 findAll(decl(anyOf(recordDecl(hasName("::A")).bind("v"),
-                                    fieldDecl().bind("v"))))),
-      new VerifyIdIsBoundTo<Decl>("v", 3)));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { class B {}; class C {}; };",
-      recordDecl(hasName("::A"), findAll(recordDecl(isDefinition()).bind("v"))),
-      new VerifyIdIsBoundTo<CXXRecordDecl>("v", 3)));
-}
-
-TEST(EachOf, TriggersForEachMatch) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { int a; int b; };",
-      recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
-                        has(fieldDecl(hasName("b")).bind("v")))),
-      new VerifyIdIsBoundTo<FieldDecl>("v", 2)));
-}
-
-TEST(EachOf, BehavesLikeAnyOfUnlessBothMatch) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { int a; int c; };",
-      recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
-                        has(fieldDecl(hasName("b")).bind("v")))),
-      new VerifyIdIsBoundTo<FieldDecl>("v", 1)));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class A { int c; int b; };",
-      recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
-                        has(fieldDecl(hasName("b")).bind("v")))),
-      new VerifyIdIsBoundTo<FieldDecl>("v", 1)));
-  EXPECT_TRUE(notMatches(
-      "class A { int c; int d; };",
-      recordDecl(eachOf(has(fieldDecl(hasName("a")).bind("v")),
-                        has(fieldDecl(hasName("b")).bind("v"))))));
-}
-
-TEST(IsTemplateInstantiation, MatchesImplicitClassTemplateInstantiation) {
-  // Make sure that we can both match the class by name (::X) and by the type
-  // the template was instantiated with (via a field).
-
-  EXPECT_TRUE(matches(
-      "template <typename T> class X {}; class A {}; X<A> x;",
-      cxxRecordDecl(hasName("::X"), isTemplateInstantiation())));
-
-  EXPECT_TRUE(matches(
-      "template <typename T> class X { T t; }; class A {}; X<A> x;",
-      cxxRecordDecl(isTemplateInstantiation(), hasDescendant(
-          fieldDecl(hasType(recordDecl(hasName("A"))))))));
-}
-
-TEST(IsTemplateInstantiation, MatchesImplicitFunctionTemplateInstantiation) {
-  EXPECT_TRUE(matches(
-      "template <typename T> void f(T t) {} class A {}; void g() { f(A()); }",
-      functionDecl(hasParameter(0, hasType(recordDecl(hasName("A")))),
-               isTemplateInstantiation())));
-}
-
-TEST(IsTemplateInstantiation, MatchesExplicitClassTemplateInstantiation) {
-  EXPECT_TRUE(matches(
-      "template <typename T> class X { T t; }; class A {};"
-      "template class X<A>;",
-      cxxRecordDecl(isTemplateInstantiation(), hasDescendant(
-          fieldDecl(hasType(recordDecl(hasName("A"))))))));
-}
-
-TEST(IsTemplateInstantiation,
-     MatchesInstantiationOfPartiallySpecializedClassTemplate) {
-  EXPECT_TRUE(matches(
-      "template <typename T> class X {};"
-      "template <typename T> class X<T*> {}; class A {}; X<A*> x;",
-      cxxRecordDecl(hasName("::X"), isTemplateInstantiation())));
-}
-
-TEST(IsTemplateInstantiation,
-     MatchesInstantiationOfClassTemplateNestedInNonTemplate) {
-  EXPECT_TRUE(matches(
-      "class A {};"
-      "class X {"
-      "  template <typename U> class Y { U u; };"
-      "  Y<A> y;"
-      "};",
-      cxxRecordDecl(hasName("::X::Y"), isTemplateInstantiation())));
-}
-
-TEST(IsTemplateInstantiation, DoesNotMatchInstantiationsInsideOfInstantiation) {
-  // FIXME: Figure out whether this makes sense. It doesn't affect the
-  // normal use case as long as the uppermost instantiation always is marked
-  // as template instantiation, but it might be confusing as a predicate.
-  EXPECT_TRUE(matches(
-      "class A {};"
-      "template <typename T> class X {"
-      "  template <typename U> class Y { U u; };"
-      "  Y<T> y;"
-      "}; X<A> x;",
-      cxxRecordDecl(hasName("::X<A>::Y"), unless(isTemplateInstantiation()))));
-}
-
-TEST(IsTemplateInstantiation, DoesNotMatchExplicitClassTemplateSpecialization) {
-  EXPECT_TRUE(notMatches(
-      "template <typename T> class X {}; class A {};"
-      "template <> class X<A> {}; X<A> x;",
-      cxxRecordDecl(hasName("::X"), isTemplateInstantiation())));
-}
-
-TEST(IsTemplateInstantiation, DoesNotMatchNonTemplate) {
-  EXPECT_TRUE(notMatches(
-      "class A {}; class Y { A a; };",
-      cxxRecordDecl(isTemplateInstantiation())));
-}
-
-TEST(IsInstantiated, MatchesInstantiation) {
-  EXPECT_TRUE(
-      matches("template<typename T> class A { T i; }; class Y { A<int> a; };",
-              cxxRecordDecl(isInstantiated())));
-}
-
-TEST(IsInstantiated, NotMatchesDefinition) {
-  EXPECT_TRUE(notMatches("template<typename T> class A { T i; };",
-                         cxxRecordDecl(isInstantiated())));
-}
-
-TEST(IsInTemplateInstantiation, MatchesInstantiationStmt) {
-  EXPECT_TRUE(matches("template<typename T> struct A { A() { T i; } };"
-                      "class Y { A<int> a; }; Y y;",
-                      declStmt(isInTemplateInstantiation())));
-}
-
-TEST(IsInTemplateInstantiation, NotMatchesDefinitionStmt) {
-  EXPECT_TRUE(notMatches("template<typename T> struct A { void x() { T i; } };",
-                         declStmt(isInTemplateInstantiation())));
-}
-
-TEST(IsInstantiated, MatchesFunctionInstantiation) {
-  EXPECT_TRUE(
-      matches("template<typename T> void A(T t) { T i; } void x() { A(0); }",
-              functionDecl(isInstantiated())));
-}
-
-TEST(IsInstantiated, NotMatchesFunctionDefinition) {
-  EXPECT_TRUE(notMatches("template<typename T> void A(T t) { T i; }",
-                         varDecl(isInstantiated())));
-}
-
-TEST(IsInTemplateInstantiation, MatchesFunctionInstantiationStmt) {
-  EXPECT_TRUE(
-      matches("template<typename T> void A(T t) { T i; } void x() { A(0); }",
-              declStmt(isInTemplateInstantiation())));
-}
-
-TEST(IsInTemplateInstantiation, NotMatchesFunctionDefinitionStmt) {
-  EXPECT_TRUE(notMatches("template<typename T> void A(T t) { T i; }",
-                         declStmt(isInTemplateInstantiation())));
-}
-
-TEST(IsInTemplateInstantiation, Sharing) {
-  auto Matcher = binaryOperator(unless(isInTemplateInstantiation()));
-  // FIXME: Node sharing is an implementation detail, exposing it is ugly
-  // and makes the matcher behave in non-obvious ways.
-  EXPECT_TRUE(notMatches(
-      "int j; template<typename T> void A(T t) { j += 42; } void x() { A(0); }",
-      Matcher));
-  EXPECT_TRUE(matches(
-      "int j; template<typename T> void A(T t) { j += t; } void x() { A(0); }",
-      Matcher));
-}
-
-TEST(IsExplicitTemplateSpecialization,
-     DoesNotMatchPrimaryTemplate) {
-  EXPECT_TRUE(notMatches(
-      "template <typename T> class X {};",
-      cxxRecordDecl(isExplicitTemplateSpecialization())));
-  EXPECT_TRUE(notMatches(
-      "template <typename T> void f(T t);",
-      functionDecl(isExplicitTemplateSpecialization())));
-}
-
-TEST(IsExplicitTemplateSpecialization,
-     DoesNotMatchExplicitTemplateInstantiations) {
-  EXPECT_TRUE(notMatches(
-      "template <typename T> class X {};"
-      "template class X<int>; extern template class X<long>;",
-      cxxRecordDecl(isExplicitTemplateSpecialization())));
-  EXPECT_TRUE(notMatches(
-      "template <typename T> void f(T t) {}"
-      "template void f(int t); extern template void f(long t);",
-      functionDecl(isExplicitTemplateSpecialization())));
-}
-
-TEST(IsExplicitTemplateSpecialization,
-     DoesNotMatchImplicitTemplateInstantiations) {
-  EXPECT_TRUE(notMatches(
-      "template <typename T> class X {}; X<int> x;",
-      cxxRecordDecl(isExplicitTemplateSpecialization())));
-  EXPECT_TRUE(notMatches(
-      "template <typename T> void f(T t); void g() { f(10); }",
-      functionDecl(isExplicitTemplateSpecialization())));
-}
-
-TEST(IsExplicitTemplateSpecialization,
-     MatchesExplicitTemplateSpecializations) {
-  EXPECT_TRUE(matches(
-      "template <typename T> class X {};"
-      "template<> class X<int> {};",
-      cxxRecordDecl(isExplicitTemplateSpecialization())));
-  EXPECT_TRUE(matches(
-      "template <typename T> void f(T t) {}"
-      "template<> void f(int t) {}",
-      functionDecl(isExplicitTemplateSpecialization())));
-}
-
-TEST(HasAncenstor, MatchesDeclarationAncestors) {
-  EXPECT_TRUE(matches(
-      "class A { class B { class C {}; }; };",
-      recordDecl(hasName("C"), hasAncestor(recordDecl(hasName("A"))))));
-}
-
-TEST(HasAncenstor, FailsIfNoAncestorMatches) {
-  EXPECT_TRUE(notMatches(
-      "class A { class B { class C {}; }; };",
-      recordDecl(hasName("C"), hasAncestor(recordDecl(hasName("X"))))));
-}
-
-TEST(HasAncestor, MatchesDeclarationsThatGetVisitedLater) {
-  EXPECT_TRUE(matches(
-      "class A { class B { void f() { C c; } class C {}; }; };",
-      varDecl(hasName("c"), hasType(recordDecl(hasName("C"), 
-          hasAncestor(recordDecl(hasName("A"))))))));
-}
-
-TEST(HasAncenstor, MatchesStatementAncestors) {
-  EXPECT_TRUE(matches(
-      "void f() { if (true) { while (false) { 42; } } }",
-      integerLiteral(equals(42), hasAncestor(ifStmt()))));
-}
-
-TEST(HasAncestor, DrillsThroughDifferentHierarchies) {
-  EXPECT_TRUE(matches(
-      "void f() { if (true) { int x = 42; } }",
-      integerLiteral(equals(42), hasAncestor(functionDecl(hasName("f"))))));
-}
-
-TEST(HasAncestor, BindsRecursiveCombinations) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { class D { class E { class F { int y; }; }; }; };",
-      fieldDecl(hasAncestor(recordDecl(hasAncestor(recordDecl().bind("r"))))),
-      new VerifyIdIsBoundTo<CXXRecordDecl>("r", 1)));
-}
-
-TEST(HasAncestor, BindsCombinationsWithHasDescendant) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class C { class D { class E { class F { int y; }; }; }; };",
-      fieldDecl(hasAncestor(
-          decl(
-            hasDescendant(recordDecl(isDefinition(),
-                                     hasAncestor(recordDecl())))
-          ).bind("d")
-      )),
-      new VerifyIdIsBoundTo<CXXRecordDecl>("d", "E")));
-}
-
-TEST(HasAncestor, MatchesClosestAncestor) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "template <typename T> struct C {"
-      "  void f(int) {"
-      "    struct I { void g(T) { int x; } } i; i.g(42);"
-      "  }"
-      "};"
-      "template struct C<int>;",
-      varDecl(hasName("x"),
-              hasAncestor(functionDecl(hasParameter(
-                  0, varDecl(hasType(asString("int"))))).bind("f"))).bind("v"),
-      new VerifyIdIsBoundTo<FunctionDecl>("f", "g", 2)));
-}
-
-TEST(HasAncestor, MatchesInTemplateInstantiations) {
-  EXPECT_TRUE(matches(
-      "template <typename T> struct A { struct B { struct C { T t; }; }; }; "
-      "A<int>::B::C a;",
-      fieldDecl(hasType(asString("int")),
-                hasAncestor(recordDecl(hasName("A"))))));
-}
-
-TEST(HasAncestor, MatchesInImplicitCode) {
-  EXPECT_TRUE(matches(
-      "struct X {}; struct A { A() {} X x; };",
-      cxxConstructorDecl(
-          hasAnyConstructorInitializer(withInitializer(expr(
-              hasAncestor(recordDecl(hasName("A")))))))));
-}
-
-TEST(HasParent, MatchesOnlyParent) {
-  EXPECT_TRUE(matches(
-      "void f() { if (true) { int x = 42; } }",
-      compoundStmt(hasParent(ifStmt()))));
-  EXPECT_TRUE(notMatches(
-      "void f() { for (;;) { int x = 42; } }",
-      compoundStmt(hasParent(ifStmt()))));
-  EXPECT_TRUE(notMatches(
-      "void f() { if (true) for (;;) { int x = 42; } }",
-      compoundStmt(hasParent(ifStmt()))));
-}
-
-TEST(HasAncestor, MatchesAllAncestors) {
-  EXPECT_TRUE(matches(
-      "template <typename T> struct C { static void f() { 42; } };"
-      "void t() { C<int>::f(); }",
-      integerLiteral(
-          equals(42),
-          allOf(
-              hasAncestor(cxxRecordDecl(isTemplateInstantiation())),
-              hasAncestor(cxxRecordDecl(unless(isTemplateInstantiation())))))));
-}
-
-TEST(HasParent, MatchesAllParents) {
-  EXPECT_TRUE(matches(
-      "template <typename T> struct C { static void f() { 42; } };"
-      "void t() { C<int>::f(); }",
-      integerLiteral(
-          equals(42),
-          hasParent(compoundStmt(hasParent(functionDecl(
-              hasParent(cxxRecordDecl(isTemplateInstantiation())))))))));
-  EXPECT_TRUE(
-      matches("template <typename T> struct C { static void f() { 42; } };"
-              "void t() { C<int>::f(); }",
-              integerLiteral(
-                  equals(42),
-                  hasParent(compoundStmt(hasParent(functionDecl(hasParent(
-                      cxxRecordDecl(unless(isTemplateInstantiation()))))))))));
-  EXPECT_TRUE(matches(
-      "template <typename T> struct C { static void f() { 42; } };"
-      "void t() { C<int>::f(); }",
-      integerLiteral(equals(42),
-                     hasParent(compoundStmt(
-                         allOf(hasParent(functionDecl(hasParent(
-                                   cxxRecordDecl(isTemplateInstantiation())))),
-                               hasParent(functionDecl(hasParent(cxxRecordDecl(
-                                   unless(isTemplateInstantiation())))))))))));
-  EXPECT_TRUE(
-      notMatches("template <typename T> struct C { static void f() {} };"
-                 "void t() { C<int>::f(); }",
-                 compoundStmt(hasParent(recordDecl()))));
-}
-
-TEST(HasParent, NoDuplicateParents) {
-  class HasDuplicateParents : public BoundNodesCallback {
-  public:
-    bool run(const BoundNodes *Nodes) override { return false; }
-    bool run(const BoundNodes *Nodes, ASTContext *Context) override {
-      const Stmt *Node = Nodes->getNodeAs<Stmt>("node");
-      std::set<const void *> Parents;
-      for (const auto &Parent : Context->getParents(*Node)) {
-        if (!Parents.insert(Parent.getMemoizationData()).second) {
-          return true;
-        }
-      }
-      return false;
-    }
-  };
-  EXPECT_FALSE(matchAndVerifyResultTrue(
-      "template <typename T> int Foo() { return 1 + 2; }\n"
-      "int x = Foo<int>() + Foo<unsigned>();",
-      stmt().bind("node"), new HasDuplicateParents()));
-}
-
-TEST(TypeMatching, MatchesTypes) {
-  EXPECT_TRUE(matches("struct S {};", qualType().bind("loc")));
-}
-
-TEST(TypeMatching, MatchesBool) {
-  EXPECT_TRUE(matches("struct S { bool func(); };",
-                      cxxMethodDecl(returns(booleanType()))));
-  EXPECT_TRUE(notMatches("struct S { void func(); };",
-                         cxxMethodDecl(returns(booleanType()))));
-}
-
-TEST(TypeMatching, MatchesVoid) {
-  EXPECT_TRUE(matches("struct S { void func(); };",
-                      cxxMethodDecl(returns(voidType()))));
-}
-
-TEST(TypeMatching, MatchesArrayTypes) {
-  EXPECT_TRUE(matches("int a[] = {2,3};", arrayType()));
-  EXPECT_TRUE(matches("int a[42];", arrayType()));
-  EXPECT_TRUE(matches("void f(int b) { int a[b]; }", arrayType()));
-
-  EXPECT_TRUE(notMatches("struct A {}; A a[7];",
-                         arrayType(hasElementType(builtinType()))));
-
-  EXPECT_TRUE(matches(
-      "int const a[] = { 2, 3 };",
-      qualType(arrayType(hasElementType(builtinType())))));
-  EXPECT_TRUE(matches(
-      "int const a[] = { 2, 3 };",
-      qualType(isConstQualified(), arrayType(hasElementType(builtinType())))));
-  EXPECT_TRUE(matches(
-      "typedef const int T; T x[] = { 1, 2 };",
-      qualType(isConstQualified(), arrayType())));
-
-  EXPECT_TRUE(notMatches(
-      "int a[] = { 2, 3 };",
-      qualType(isConstQualified(), arrayType(hasElementType(builtinType())))));
-  EXPECT_TRUE(notMatches(
-      "int a[] = { 2, 3 };",
-      qualType(arrayType(hasElementType(isConstQualified(), builtinType())))));
-  EXPECT_TRUE(notMatches(
-      "int const a[] = { 2, 3 };",
-      qualType(arrayType(hasElementType(builtinType())),
-               unless(isConstQualified()))));
-
-  EXPECT_TRUE(matches("int a[2];",
-                      constantArrayType(hasElementType(builtinType()))));
-  EXPECT_TRUE(matches("const int a = 0;", qualType(isInteger())));
-}
-
-TEST(TypeMatching, DecayedType) {
-  EXPECT_TRUE(matches("void f(int i[]);", valueDecl(hasType(decayedType(hasDecayedType(pointerType()))))));
-  EXPECT_TRUE(notMatches("int i[7];", decayedType()));
-}
-
-TEST(TypeMatching, MatchesComplexTypes) {
-  EXPECT_TRUE(matches("_Complex float f;", complexType()));
-  EXPECT_TRUE(matches(
-    "_Complex float f;",
-    complexType(hasElementType(builtinType()))));
-  EXPECT_TRUE(notMatches(
-    "_Complex float f;",
-    complexType(hasElementType(isInteger()))));
-}
-
-TEST(TypeMatching, MatchesConstantArrayTypes) {
-  EXPECT_TRUE(matches("int a[2];", constantArrayType()));
-  EXPECT_TRUE(notMatches(
-    "void f() { int a[] = { 2, 3 }; int b[a[0]]; }",
-    constantArrayType(hasElementType(builtinType()))));
-
-  EXPECT_TRUE(matches("int a[42];", constantArrayType(hasSize(42))));
-  EXPECT_TRUE(matches("int b[2*21];", constantArrayType(hasSize(42))));
-  EXPECT_TRUE(notMatches("int c[41], d[43];", constantArrayType(hasSize(42))));
-}
-
-TEST(TypeMatching, MatchesDependentSizedArrayTypes) {
-  EXPECT_TRUE(matches(
-    "template <typename T, int Size> class array { T data[Size]; };",
-    dependentSizedArrayType()));
-  EXPECT_TRUE(notMatches(
-    "int a[42]; int b[] = { 2, 3 }; void f() { int c[b[0]]; }",
-    dependentSizedArrayType()));
-}
-
-TEST(TypeMatching, MatchesIncompleteArrayType) {
-  EXPECT_TRUE(matches("int a[] = { 2, 3 };", incompleteArrayType()));
-  EXPECT_TRUE(matches("void f(int a[]) {}", incompleteArrayType()));
-
-  EXPECT_TRUE(notMatches("int a[42]; void f() { int b[a[0]]; }",
-                         incompleteArrayType()));
-}
-
-TEST(TypeMatching, MatchesVariableArrayType) {
-  EXPECT_TRUE(matches("void f(int b) { int a[b]; }", variableArrayType()));
-  EXPECT_TRUE(notMatches("int a[] = {2, 3}; int b[42];", variableArrayType()));
-  
-  EXPECT_TRUE(matches(
-    "void f(int b) { int a[b]; }",
-    variableArrayType(hasSizeExpr(ignoringImpCasts(declRefExpr(to(
-      varDecl(hasName("b")))))))));
-}
-
-TEST(TypeMatching, MatchesAtomicTypes) {
-  if (llvm::Triple(llvm::sys::getDefaultTargetTriple()).getOS() !=
-      llvm::Triple::Win32) {
-    // FIXME: Make this work for MSVC.
-    EXPECT_TRUE(matches("_Atomic(int) i;", atomicType()));
-
-    EXPECT_TRUE(matches("_Atomic(int) i;",
-                        atomicType(hasValueType(isInteger()))));
-    EXPECT_TRUE(notMatches("_Atomic(float) f;",
-                           atomicType(hasValueType(isInteger()))));
-  }
-}
-
-TEST(TypeMatching, MatchesAutoTypes) {
-  EXPECT_TRUE(matches("auto i = 2;", autoType()));
-  EXPECT_TRUE(matches("int v[] = { 2, 3 }; void f() { for (int i : v) {} }",
-                      autoType()));
-
-  // FIXME: Matching against the type-as-written can't work here, because the
-  //        type as written was not deduced.
-  //EXPECT_TRUE(matches("auto a = 1;",
-  //                    autoType(hasDeducedType(isInteger()))));
-  //EXPECT_TRUE(notMatches("auto b = 2.0;",
-  //                       autoType(hasDeducedType(isInteger()))));
-}
-
-TEST(TypeMatching, MatchesFunctionTypes) {
-  EXPECT_TRUE(matches("int (*f)(int);", functionType()));
-  EXPECT_TRUE(matches("void f(int i) {}", functionType()));
-}
-
-TEST(TypeMatching, MatchesParenType) {
-  EXPECT_TRUE(
-      matches("int (*array)[4];", varDecl(hasType(pointsTo(parenType())))));
-  EXPECT_TRUE(notMatches("int *array[4];", varDecl(hasType(parenType()))));
-
-  EXPECT_TRUE(matches(
-      "int (*ptr_to_func)(int);",
-      varDecl(hasType(pointsTo(parenType(innerType(functionType())))))));
-  EXPECT_TRUE(notMatches(
-      "int (*ptr_to_array)[4];",
-      varDecl(hasType(pointsTo(parenType(innerType(functionType())))))));
-}
-
-TEST(TypeMatching, PointerTypes) {
-  // FIXME: Reactive when these tests can be more specific (not matching
-  // implicit code on certain platforms), likely when we have hasDescendant for
-  // Types/TypeLocs.
-  //EXPECT_TRUE(matchAndVerifyResultTrue(
-  //    "int* a;",
-  //    pointerTypeLoc(pointeeLoc(typeLoc().bind("loc"))),
-  //    new VerifyIdIsBoundTo<TypeLoc>("loc", 1)));
-  //EXPECT_TRUE(matchAndVerifyResultTrue(
-  //    "int* a;",
-  //    pointerTypeLoc().bind("loc"),
-  //    new VerifyIdIsBoundTo<TypeLoc>("loc", 1)));
-  EXPECT_TRUE(matches(
-      "int** a;",
-      loc(pointerType(pointee(qualType())))));
-  EXPECT_TRUE(matches(
-      "int** a;",
-      loc(pointerType(pointee(pointerType())))));
-  EXPECT_TRUE(matches(
-      "int* b; int* * const a = &b;",
-      loc(qualType(isConstQualified(), pointerType()))));
-
-  std::string Fragment = "struct A { int i; }; int A::* ptr = &A::i;";
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(blockPointerType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ptr"),
-                                        hasType(memberPointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(pointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(referenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(lValueReferenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(rValueReferenceType()))));
-
-  Fragment = "int *ptr;";
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(blockPointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(memberPointerType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ptr"),
-                                        hasType(pointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ptr"),
-                                           hasType(referenceType()))));
-
-  Fragment = "int a; int &ref = a;";
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(blockPointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(memberPointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(pointerType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
-                                        hasType(referenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
-                                        hasType(lValueReferenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(rValueReferenceType()))));
-
-  Fragment = "int &&ref = 2;";
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(blockPointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(memberPointerType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(pointerType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
-                                        hasType(referenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("ref"),
-                                           hasType(lValueReferenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("ref"),
-                                        hasType(rValueReferenceType()))));
-}
-
-TEST(TypeMatching, AutoRefTypes) {
-  std::string Fragment = "auto a = 1;"
-                         "auto b = a;"
-                         "auto &c = a;"
-                         "auto &&d = c;"
-                         "auto &&e = 2;";
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("a"),
-                                           hasType(referenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("b"),
-                                           hasType(referenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("c"),
-                                        hasType(referenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("c"),
-                                        hasType(lValueReferenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("c"),
-                                           hasType(rValueReferenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("d"),
-                                        hasType(referenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("d"),
-                                        hasType(lValueReferenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("d"),
-                                           hasType(rValueReferenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("e"),
-                                        hasType(referenceType()))));
-  EXPECT_TRUE(notMatches(Fragment, varDecl(hasName("e"),
-                                           hasType(lValueReferenceType()))));
-  EXPECT_TRUE(matches(Fragment, varDecl(hasName("e"),
-                                        hasType(rValueReferenceType()))));
-}
-
-TEST(TypeMatching, PointeeTypes) {
-  EXPECT_TRUE(matches("int b; int &a = b;",
-                      referenceType(pointee(builtinType()))));
-  EXPECT_TRUE(matches("int *a;", pointerType(pointee(builtinType()))));
-
-  EXPECT_TRUE(matches("int *a;",
-                      loc(pointerType(pointee(builtinType())))));
-
-  EXPECT_TRUE(matches(
-      "int const *A;",
-      pointerType(pointee(isConstQualified(), builtinType()))));
-  EXPECT_TRUE(notMatches(
-      "int *A;",
-      pointerType(pointee(isConstQualified(), builtinType()))));
-}
-
-TEST(TypeMatching, MatchesPointersToConstTypes) {
-  EXPECT_TRUE(matches("int b; int * const a = &b;",
-                      loc(pointerType())));
-  EXPECT_TRUE(matches("int b; int * const a = &b;",
-                      loc(pointerType())));
-  EXPECT_TRUE(matches(
-      "int b; const int * a = &b;",
-      loc(pointerType(pointee(builtinType())))));
-  EXPECT_TRUE(matches(
-      "int b; const int * a = &b;",
-      pointerType(pointee(builtinType()))));
-}
-
-TEST(TypeMatching, MatchesTypedefTypes) {
-  EXPECT_TRUE(matches("typedef int X; X a;", varDecl(hasName("a"),
-                                                     hasType(typedefType()))));
-}
-
-TEST(TypeMatching, MatchesTemplateSpecializationType) {
-  EXPECT_TRUE(matches("template <typename T> class A{}; A<int> a;",
-                      templateSpecializationType()));
-}
-
-TEST(TypeMatching, MatchesRecordType) {
-  EXPECT_TRUE(matches("class C{}; C c;", recordType()));
-  EXPECT_TRUE(matches("struct S{}; S s;",
-                      recordType(hasDeclaration(recordDecl(hasName("S"))))));
-  EXPECT_TRUE(notMatches("int i;",
-                         recordType(hasDeclaration(recordDecl(hasName("S"))))));
-}
-
-TEST(TypeMatching, MatchesElaboratedType) {
-  EXPECT_TRUE(matches(
-    "namespace N {"
-    "  namespace M {"
-    "    class D {};"
-    "  }"
-    "}"
-    "N::M::D d;", elaboratedType()));
-  EXPECT_TRUE(matches("class C {} c;", elaboratedType()));
-  EXPECT_TRUE(notMatches("class C {}; C c;", elaboratedType()));
-}
-
-TEST(ElaboratedTypeNarrowing, hasQualifier) {
-  EXPECT_TRUE(matches(
-    "namespace N {"
-    "  namespace M {"
-    "    class D {};"
-    "  }"
-    "}"
-    "N::M::D d;",
-    elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N")))))));
-  EXPECT_TRUE(notMatches(
-    "namespace M {"
-    "  class D {};"
-    "}"
-    "M::D d;",
-    elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N")))))));
-  EXPECT_TRUE(notMatches(
-    "struct D {"
-    "} d;",
-    elaboratedType(hasQualifier(nestedNameSpecifier()))));
-}
-
-TEST(ElaboratedTypeNarrowing, namesType) {
-  EXPECT_TRUE(matches(
-    "namespace N {"
-    "  namespace M {"
-    "    class D {};"
-    "  }"
-    "}"
-    "N::M::D d;",
-    elaboratedType(elaboratedType(namesType(recordType(
-        hasDeclaration(namedDecl(hasName("D")))))))));
-  EXPECT_TRUE(notMatches(
-    "namespace M {"
-    "  class D {};"
-    "}"
-    "M::D d;",
-    elaboratedType(elaboratedType(namesType(typedefType())))));
-}
-
-TEST(TypeMatching, MatchesSubstTemplateTypeParmType) {
-  const std::string code = "template <typename T>"
-                           "int F() {"
-                           "  return 1 + T();"
-                           "}"
-                           "int i = F<int>();";
-  EXPECT_FALSE(matches(code, binaryOperator(hasLHS(
-                                 expr(hasType(substTemplateTypeParmType()))))));
-  EXPECT_TRUE(matches(code, binaryOperator(hasRHS(
-                                expr(hasType(substTemplateTypeParmType()))))));
-}
-
-TEST(NNS, MatchesNestedNameSpecifiers) {
-  EXPECT_TRUE(matches("namespace ns { struct A {}; } ns::A a;",
-                      nestedNameSpecifier()));
-  EXPECT_TRUE(matches("template <typename T> class A { typename T::B b; };",
-                      nestedNameSpecifier()));
-  EXPECT_TRUE(matches("struct A { void f(); }; void A::f() {}",
-                      nestedNameSpecifier()));
-  EXPECT_TRUE(matches("namespace a { namespace b {} } namespace ab = a::b;",
-                      nestedNameSpecifier()));
-
-  EXPECT_TRUE(matches(
-    "struct A { static void f() {} }; void g() { A::f(); }",
-    nestedNameSpecifier()));
-  EXPECT_TRUE(notMatches(
-    "struct A { static void f() {} }; void g(A* a) { a->f(); }",
-    nestedNameSpecifier()));
-}
-
-TEST(NullStatement, SimpleCases) {
-  EXPECT_TRUE(matches("void f() {int i;;}", nullStmt()));
-  EXPECT_TRUE(notMatches("void f() {int i;}", nullStmt()));
-}
-
-TEST(NS, Anonymous) {
-  EXPECT_TRUE(notMatches("namespace N {}", namespaceDecl(isAnonymous())));
-  EXPECT_TRUE(matches("namespace {}", namespaceDecl(isAnonymous())));
-}
-
-TEST(NS, Alias) {
-  EXPECT_TRUE(matches("namespace test {} namespace alias = ::test;",
-                      namespaceAliasDecl(hasName("alias"))));
-}
-
-TEST(NNS, MatchesTypes) {
-  NestedNameSpecifierMatcher Matcher = nestedNameSpecifier(
-    specifiesType(hasDeclaration(recordDecl(hasName("A")))));
-  EXPECT_TRUE(matches("struct A { struct B {}; }; A::B b;", Matcher));
-  EXPECT_TRUE(matches("struct A { struct B { struct C {}; }; }; A::B::C c;",
-                      Matcher));
-  EXPECT_TRUE(notMatches("namespace A { struct B {}; } A::B b;", Matcher));
-}
-
-TEST(NNS, MatchesNamespaceDecls) {
-  NestedNameSpecifierMatcher Matcher = nestedNameSpecifier(
-    specifiesNamespace(hasName("ns")));
-  EXPECT_TRUE(matches("namespace ns { struct A {}; } ns::A a;", Matcher));
-  EXPECT_TRUE(notMatches("namespace xx { struct A {}; } xx::A a;", Matcher));
-  EXPECT_TRUE(notMatches("struct ns { struct A {}; }; ns::A a;", Matcher));
-}
-
-TEST(NNS, BindsNestedNameSpecifiers) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "namespace ns { struct E { struct B {}; }; } ns::E::B b;",
-      nestedNameSpecifier(specifiesType(asString("struct ns::E"))).bind("nns"),
-      new VerifyIdIsBoundTo<NestedNameSpecifier>("nns", "ns::struct E::")));
-}
-
-TEST(NNS, BindsNestedNameSpecifierLocs) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "namespace ns { struct B {}; } ns::B b;",
-      loc(nestedNameSpecifier()).bind("loc"),
-      new VerifyIdIsBoundTo<NestedNameSpecifierLoc>("loc", 1)));
-}
-
-TEST(NNS, MatchesNestedNameSpecifierPrefixes) {
-  EXPECT_TRUE(matches(
-      "struct A { struct B { struct C {}; }; }; A::B::C c;",
-      nestedNameSpecifier(hasPrefix(specifiesType(asString("struct A"))))));
-  EXPECT_TRUE(matches(
-      "struct A { struct B { struct C {}; }; }; A::B::C c;",
-      nestedNameSpecifierLoc(hasPrefix(
-          specifiesTypeLoc(loc(qualType(asString("struct A"))))))));
-}
-
-TEST(NNS, DescendantsOfNestedNameSpecifiers) {
-  std::string Fragment =
-      "namespace a { struct A { struct B { struct C {}; }; }; };"
-      "void f() { a::A::B::C c; }";
-  EXPECT_TRUE(matches(
-      Fragment,
-      nestedNameSpecifier(specifiesType(asString("struct a::A::B")),
-                          hasDescendant(nestedNameSpecifier(
-                              specifiesNamespace(hasName("a")))))));
-  EXPECT_TRUE(notMatches(
-      Fragment,
-      nestedNameSpecifier(specifiesType(asString("struct a::A::B")),
-                          has(nestedNameSpecifier(
-                              specifiesNamespace(hasName("a")))))));
-  EXPECT_TRUE(matches(
-      Fragment,
-      nestedNameSpecifier(specifiesType(asString("struct a::A")),
-                          has(nestedNameSpecifier(
-                              specifiesNamespace(hasName("a")))))));
-
-  // Not really useful because a NestedNameSpecifier can af at most one child,
-  // but to complete the interface.
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      Fragment,
-      nestedNameSpecifier(specifiesType(asString("struct a::A::B")),
-                          forEach(nestedNameSpecifier().bind("x"))),
-      new VerifyIdIsBoundTo<NestedNameSpecifier>("x", 1)));
-}
-
-TEST(NNS, NestedNameSpecifiersAsDescendants) {
-  std::string Fragment =
-      "namespace a { struct A { struct B { struct C {}; }; }; };"
-      "void f() { a::A::B::C c; }";
-  EXPECT_TRUE(matches(
-      Fragment,
-      decl(hasDescendant(nestedNameSpecifier(specifiesType(
-          asString("struct a::A")))))));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      Fragment,
-      functionDecl(hasName("f"),
-                   forEachDescendant(nestedNameSpecifier().bind("x"))),
-      // Nested names: a, a::A and a::A::B.
-      new VerifyIdIsBoundTo<NestedNameSpecifier>("x", 3)));
-}
-
-TEST(NNSLoc, DescendantsOfNestedNameSpecifierLocs) {
-  std::string Fragment =
-      "namespace a { struct A { struct B { struct C {}; }; }; };"
-      "void f() { a::A::B::C c; }";
-  EXPECT_TRUE(matches(
-      Fragment,
-      nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A::B"))),
-                             hasDescendant(loc(nestedNameSpecifier(
-                                 specifiesNamespace(hasName("a"))))))));
-  EXPECT_TRUE(notMatches(
-      Fragment,
-      nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A::B"))),
-                             has(loc(nestedNameSpecifier(
-                                 specifiesNamespace(hasName("a"))))))));
-  EXPECT_TRUE(matches(
-      Fragment,
-      nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A"))),
-                             has(loc(nestedNameSpecifier(
-                                 specifiesNamespace(hasName("a"))))))));
-
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      Fragment,
-      nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A::B"))),
-                             forEach(nestedNameSpecifierLoc().bind("x"))),
-      new VerifyIdIsBoundTo<NestedNameSpecifierLoc>("x", 1)));
-}
-
-TEST(NNSLoc, NestedNameSpecifierLocsAsDescendants) {
-  std::string Fragment =
-      "namespace a { struct A { struct B { struct C {}; }; }; };"
-      "void f() { a::A::B::C c; }";
-  EXPECT_TRUE(matches(
-      Fragment,
-      decl(hasDescendant(loc(nestedNameSpecifier(specifiesType(
-          asString("struct a::A"))))))));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      Fragment,
-      functionDecl(hasName("f"),
-                   forEachDescendant(nestedNameSpecifierLoc().bind("x"))),
-      // Nested names: a, a::A and a::A::B.
-      new VerifyIdIsBoundTo<NestedNameSpecifierLoc>("x", 3)));
-}
-
-template <typename T> class VerifyMatchOnNode : public BoundNodesCallback {
-public:
-  VerifyMatchOnNode(StringRef Id, const internal::Matcher<T> &InnerMatcher,
-                    StringRef InnerId)
-      : Id(Id), InnerMatcher(InnerMatcher), InnerId(InnerId) {
-  }
-
-  bool run(const BoundNodes *Nodes) override { return false; }
-
-  bool run(const BoundNodes *Nodes, ASTContext *Context) override {
-    const T *Node = Nodes->getNodeAs<T>(Id);
-    return selectFirst<T>(InnerId, match(InnerMatcher, *Node, *Context)) !=
-           nullptr;
-  }
-private:
-  std::string Id;
-  internal::Matcher<T> InnerMatcher;
-  std::string InnerId;
-};
-
-TEST(MatchFinder, CanMatchDeclarationsRecursively) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
-      new VerifyMatchOnNode<clang::Decl>(
-          "X", decl(hasDescendant(recordDecl(hasName("X::Y")).bind("Y"))),
-          "Y")));
-  EXPECT_TRUE(matchAndVerifyResultFalse(
-      "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
-      new VerifyMatchOnNode<clang::Decl>(
-          "X", decl(hasDescendant(recordDecl(hasName("X::Z")).bind("Z"))),
-          "Z")));
-}
-
-TEST(MatchFinder, CanMatchStatementsRecursively) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f() { if (1) { for (;;) { } } }", ifStmt().bind("if"),
-      new VerifyMatchOnNode<clang::Stmt>(
-          "if", stmt(hasDescendant(forStmt().bind("for"))), "for")));
-  EXPECT_TRUE(matchAndVerifyResultFalse(
-      "void f() { if (1) { for (;;) { } } }", ifStmt().bind("if"),
-      new VerifyMatchOnNode<clang::Stmt>(
-          "if", stmt(hasDescendant(declStmt().bind("decl"))), "decl")));
-}
-
-TEST(MatchFinder, CanMatchSingleNodesRecursively) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
-      new VerifyMatchOnNode<clang::Decl>(
-          "X", recordDecl(has(recordDecl(hasName("X::Y")).bind("Y"))), "Y")));
-  EXPECT_TRUE(matchAndVerifyResultFalse(
-      "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
-      new VerifyMatchOnNode<clang::Decl>(
-          "X", recordDecl(has(recordDecl(hasName("X::Z")).bind("Z"))), "Z")));
-}
-
-template <typename T>
-class VerifyAncestorHasChildIsEqual : public BoundNodesCallback {
-public:
-  bool run(const BoundNodes *Nodes) override { return false; }
-
-  bool run(const BoundNodes *Nodes, ASTContext *Context) override {
-    const T *Node = Nodes->getNodeAs<T>("");
-    return verify(*Nodes, *Context, Node);
-  }
-
-  bool verify(const BoundNodes &Nodes, ASTContext &Context, const Stmt *Node) {
-    // Use the original typed pointer to verify we can pass pointers to subtypes
-    // to equalsNode.
-    const T *TypedNode = cast<T>(Node);
-    return selectFirst<T>(
-               "", match(stmt(hasParent(
-                             stmt(has(stmt(equalsNode(TypedNode)))).bind(""))),
-                         *Node, Context)) != nullptr;
-  }
-  bool verify(const BoundNodes &Nodes, ASTContext &Context, const Decl *Node) {
-    // Use the original typed pointer to verify we can pass pointers to subtypes
-    // to equalsNode.
-    const T *TypedNode = cast<T>(Node);
-    return selectFirst<T>(
-               "", match(decl(hasParent(
-                             decl(has(decl(equalsNode(TypedNode)))).bind(""))),
-                         *Node, Context)) != nullptr;
-  }
-  bool verify(const BoundNodes &Nodes, ASTContext &Context, const Type *Node) {
-    // Use the original typed pointer to verify we can pass pointers to subtypes
-    // to equalsNode.
-    const T *TypedNode = cast<T>(Node);
-    const auto *Dec = Nodes.getNodeAs<FieldDecl>("decl");
-    return selectFirst<T>(
-               "", match(fieldDecl(hasParent(decl(has(fieldDecl(
-                             hasType(type(equalsNode(TypedNode)).bind(""))))))),
-                         *Dec, Context)) != nullptr;
-  }
-};
-
-TEST(IsEqualTo, MatchesNodesByIdentity) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { class Y {}; };", recordDecl(hasName("::X::Y")).bind(""),
-      new VerifyAncestorHasChildIsEqual<CXXRecordDecl>()));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f() { if (true) if(true) {} }", ifStmt().bind(""),
-      new VerifyAncestorHasChildIsEqual<IfStmt>()));
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "class X { class Y {} y; };",
-      fieldDecl(hasName("y"), hasType(type().bind(""))).bind("decl"),
-      new VerifyAncestorHasChildIsEqual<Type>()));
-}
-
-TEST(MatchFinder, CheckProfiling) {
-  MatchFinder::MatchFinderOptions Options;
-  llvm::StringMap<llvm::TimeRecord> Records;
-  Options.CheckProfiling.emplace(Records);
-  MatchFinder Finder(std::move(Options));
-
-  struct NamedCallback : public MatchFinder::MatchCallback {
-    void run(const MatchFinder::MatchResult &Result) override {}
-    StringRef getID() const override { return "MyID"; }
-  } Callback;
-  Finder.addMatcher(decl(), &Callback);
-  std::unique_ptr<FrontendActionFactory> Factory(
-      newFrontendActionFactory(&Finder));
-  ASSERT_TRUE(tooling::runToolOnCode(Factory->create(), "int x;"));
-
-  EXPECT_EQ(1u, Records.size());
-  EXPECT_EQ("MyID", Records.begin()->getKey());
-}
-
-class VerifyStartOfTranslationUnit : public MatchFinder::MatchCallback {
-public:
-  VerifyStartOfTranslationUnit() : Called(false) {}
-  void run(const MatchFinder::MatchResult &Result) override {
-    EXPECT_TRUE(Called);
-  }
-  void onStartOfTranslationUnit() override { Called = true; }
-  bool Called;
-};
-
-TEST(MatchFinder, InterceptsStartOfTranslationUnit) {
-  MatchFinder Finder;
-  VerifyStartOfTranslationUnit VerifyCallback;
-  Finder.addMatcher(decl(), &VerifyCallback);
-  std::unique_ptr<FrontendActionFactory> Factory(
-      newFrontendActionFactory(&Finder));
-  ASSERT_TRUE(tooling::runToolOnCode(Factory->create(), "int x;"));
-  EXPECT_TRUE(VerifyCallback.Called);
-
-  VerifyCallback.Called = false;
-  std::unique_ptr<ASTUnit> AST(tooling::buildASTFromCode("int x;"));
-  ASSERT_TRUE(AST.get());
-  Finder.matchAST(AST->getASTContext());
-  EXPECT_TRUE(VerifyCallback.Called);
-}
-
-class VerifyEndOfTranslationUnit : public MatchFinder::MatchCallback {
-public:
-  VerifyEndOfTranslationUnit() : Called(false) {}
-  void run(const MatchFinder::MatchResult &Result) override {
-    EXPECT_FALSE(Called);
-  }
-  void onEndOfTranslationUnit() override { Called = true; }
-  bool Called;
-};
-
-TEST(MatchFinder, InterceptsEndOfTranslationUnit) {
-  MatchFinder Finder;
-  VerifyEndOfTranslationUnit VerifyCallback;
-  Finder.addMatcher(decl(), &VerifyCallback);
-  std::unique_ptr<FrontendActionFactory> Factory(
-      newFrontendActionFactory(&Finder));
-  ASSERT_TRUE(tooling::runToolOnCode(Factory->create(), "int x;"));
-  EXPECT_TRUE(VerifyCallback.Called);
-
-  VerifyCallback.Called = false;
-  std::unique_ptr<ASTUnit> AST(tooling::buildASTFromCode("int x;"));
-  ASSERT_TRUE(AST.get());
-  Finder.matchAST(AST->getASTContext());
-  EXPECT_TRUE(VerifyCallback.Called);
-}
-
-TEST(EqualsBoundNodeMatcher, QualType) {
-  EXPECT_TRUE(matches(
-      "int i = 1;", varDecl(hasType(qualType().bind("type")),
-                            hasInitializer(ignoringParenImpCasts(
-                                hasType(qualType(equalsBoundNode("type"))))))));
-  EXPECT_TRUE(notMatches("int i = 1.f;",
-                         varDecl(hasType(qualType().bind("type")),
-                                 hasInitializer(ignoringParenImpCasts(hasType(
-                                     qualType(equalsBoundNode("type"))))))));
-}
-
-TEST(EqualsBoundNodeMatcher, NonMatchingTypes) {
-  EXPECT_TRUE(notMatches(
-      "int i = 1;", varDecl(namedDecl(hasName("i")).bind("name"),
-                            hasInitializer(ignoringParenImpCasts(
-                                hasType(qualType(equalsBoundNode("type"))))))));
-}
-
-TEST(EqualsBoundNodeMatcher, Stmt) {
-  EXPECT_TRUE(
-      matches("void f() { if(true) {} }",
-              stmt(allOf(ifStmt().bind("if"),
-                         hasParent(stmt(has(stmt(equalsBoundNode("if")))))))));
-
-  EXPECT_TRUE(notMatches(
-      "void f() { if(true) { if (true) {} } }",
-      stmt(allOf(ifStmt().bind("if"), has(stmt(equalsBoundNode("if")))))));
-}
-
-TEST(EqualsBoundNodeMatcher, Decl) {
-  EXPECT_TRUE(matches(
-      "class X { class Y {}; };",
-      decl(allOf(recordDecl(hasName("::X::Y")).bind("record"),
-                 hasParent(decl(has(decl(equalsBoundNode("record")))))))));
-
-  EXPECT_TRUE(notMatches("class X { class Y {}; };",
-                         decl(allOf(recordDecl(hasName("::X")).bind("record"),
-                                    has(decl(equalsBoundNode("record")))))));
-}
-
-TEST(EqualsBoundNodeMatcher, Type) {
-  EXPECT_TRUE(matches(
-      "class X { int a; int b; };",
-      recordDecl(
-          has(fieldDecl(hasName("a"), hasType(type().bind("t")))),
-          has(fieldDecl(hasName("b"), hasType(type(equalsBoundNode("t"))))))));
-
-  EXPECT_TRUE(notMatches(
-      "class X { int a; double b; };",
-      recordDecl(
-          has(fieldDecl(hasName("a"), hasType(type().bind("t")))),
-          has(fieldDecl(hasName("b"), hasType(type(equalsBoundNode("t"))))))));
-}
-
-TEST(EqualsBoundNodeMatcher, UsingForEachDescendant) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "int f() {"
-      "  if (1) {"
-      "    int i = 9;"
-      "  }"
-      "  int j = 10;"
-      "  {"
-      "    float k = 9.0;"
-      "  }"
-      "  return 0;"
-      "}",
-      // Look for variable declarations within functions whose type is the same
-      // as the function return type.
-      functionDecl(returns(qualType().bind("type")),
-                   forEachDescendant(varDecl(hasType(
-                       qualType(equalsBoundNode("type")))).bind("decl"))),
-      // Only i and j should match, not k.
-      new VerifyIdIsBoundTo<VarDecl>("decl", 2)));
-}
-
-TEST(EqualsBoundNodeMatcher, FiltersMatchedCombinations) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "void f() {"
-      "  int x;"
-      "  double d;"
-      "  x = d + x - d + x;"
-      "}",
-      functionDecl(
-          hasName("f"), forEachDescendant(varDecl().bind("d")),
-          forEachDescendant(declRefExpr(to(decl(equalsBoundNode("d")))))),
-      new VerifyIdIsBoundTo<VarDecl>("d", 5)));
-}
-
-TEST(EqualsBoundNodeMatcher, UnlessDescendantsOfAncestorsMatch) {
-  EXPECT_TRUE(matchAndVerifyResultTrue(
-      "struct StringRef { int size() const; const char* data() const; };"
-      "void f(StringRef v) {"
-      "  v.data();"
-      "}",
-      cxxMemberCallExpr(
-          callee(cxxMethodDecl(hasName("data"))),
-          on(declRefExpr(to(
-              varDecl(hasType(recordDecl(hasName("StringRef")))).bind("var")))),
-          unless(hasAncestor(stmt(hasDescendant(cxxMemberCallExpr(
-              callee(cxxMethodDecl(anyOf(hasName("size"), hasName("length")))),
-              on(declRefExpr(to(varDecl(equalsBoundNode("var")))))))))))
-          .bind("data"),
-      new VerifyIdIsBoundTo<Expr>("data", 1)));
-
-  EXPECT_FALSE(matches(
-      "struct StringRef { int size() const; const char* data() const; };"
-      "void f(StringRef v) {"
-      "  v.data();"
-      "  v.size();"
-      "}",
-      cxxMemberCallExpr(
-          callee(cxxMethodDecl(hasName("data"))),
-          on(declRefExpr(to(
-              varDecl(hasType(recordDecl(hasName("StringRef")))).bind("var")))),
-          unless(hasAncestor(stmt(hasDescendant(cxxMemberCallExpr(
-              callee(cxxMethodDecl(anyOf(hasName("size"), hasName("length")))),
-              on(declRefExpr(to(varDecl(equalsBoundNode("var")))))))))))
-          .bind("data")));
-}
-
-TEST(TypeDefDeclMatcher, Match) {
-  EXPECT_TRUE(matches("typedef int typedefDeclTest;",
-                      typedefDecl(hasName("typedefDeclTest"))));
-}
-
-TEST(IsInlineMatcher, IsInline) {
-  EXPECT_TRUE(matches("void g(); inline void f();",
-                      functionDecl(isInline(), hasName("f"))));
-  EXPECT_TRUE(matches("namespace n { inline namespace m {} }",
-                      namespaceDecl(isInline(), hasName("m"))));
-}
-
-// FIXME: Figure out how to specify paths so the following tests pass on Windows.
-#ifndef LLVM_ON_WIN32
-
-TEST(Matcher, IsExpansionInMainFileMatcher) {
-  EXPECT_TRUE(matches("class X {};",
-                      recordDecl(hasName("X"), isExpansionInMainFile())));
-  EXPECT_TRUE(notMatches("", recordDecl(isExpansionInMainFile())));
-  FileContentMappings M;
-  M.push_back(std::make_pair("/other", "class X {};"));
-  EXPECT_TRUE(matchesConditionally("#include <other>\n",
-                                   recordDecl(isExpansionInMainFile()), false,
-                                   "-isystem/", M));
-}
-
-TEST(Matcher, IsExpansionInSystemHeader) {
-  FileContentMappings M;
-  M.push_back(std::make_pair("/other", "class X {};"));
-  EXPECT_TRUE(matchesConditionally(
-      "#include \"other\"\n", recordDecl(isExpansionInSystemHeader()), true,
-      "-isystem/", M));
-  EXPECT_TRUE(matchesConditionally("#include \"other\"\n",
-                                   recordDecl(isExpansionInSystemHeader()),
-                                   false, "-I/", M));
-  EXPECT_TRUE(notMatches("class X {};",
-                         recordDecl(isExpansionInSystemHeader())));
-  EXPECT_TRUE(notMatches("", recordDecl(isExpansionInSystemHeader())));
-}
-
-TEST(Matcher, IsExpansionInFileMatching) {
-  FileContentMappings M;
-  M.push_back(std::make_pair("/foo", "class A {};"));
-  M.push_back(std::make_pair("/bar", "class B {};"));
-  EXPECT_TRUE(matchesConditionally(
-      "#include <foo>\n"
-      "#include <bar>\n"
-      "class X {};",
-      recordDecl(isExpansionInFileMatching("b.*"), hasName("B")), true,
-      "-isystem/", M));
-  EXPECT_TRUE(matchesConditionally(
-      "#include <foo>\n"
-      "#include <bar>\n"
-      "class X {};",
-      recordDecl(isExpansionInFileMatching("f.*"), hasName("X")), false,
-      "-isystem/", M));
-}
-
-#endif // LLVM_ON_WIN32
-
-  
-TEST(ObjCMessageExprMatcher, SimpleExprs) {
-  // don't find ObjCMessageExpr where none are present
-  EXPECT_TRUE(notMatchesObjC("", objcMessageExpr(anything())));
- 
-  std::string Objc1String =
-  "@interface Str "
-  " - (Str *)uppercaseString:(Str *)str;"
-  "@end "
-  "@interface foo "
-  "- (void)meth:(Str *)text;"
-  "@end "
-  " "
-  "@implementation foo "
-  "- (void) meth:(Str *)text { "
-  "  [self contents];"
-  "  Str *up = [text uppercaseString];"
-  "} "
-  "@end ";
-  EXPECT_TRUE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(anything())));
-  EXPECT_TRUE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(hasSelector("contents"))));
-  EXPECT_TRUE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(matchesSelector("cont*"))));
-  EXPECT_FALSE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(matchesSelector("?cont*"))));
-  EXPECT_TRUE(notMatchesObjC(
-      Objc1String,
-      objcMessageExpr(hasSelector("contents"), hasNullSelector())));
-  EXPECT_TRUE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(hasSelector("contents"), hasUnarySelector())));
-  EXPECT_TRUE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(hasSelector("contents"), numSelectorArgs(0))));
-  EXPECT_TRUE(matchesObjC(
-      Objc1String,
-      objcMessageExpr(matchesSelector("uppercase*"),
-                      argumentCountIs(0)
-                      )));
-  
-}
-
-} // end namespace ast_matchers
-} // end namespace clang
diff --git a/unittests/ASTMatchers/ASTMatchersTest.h b/unittests/ASTMatchers/ASTMatchersTest.h
index 68824e6..fdb273c 100644
--- a/unittests/ASTMatchers/ASTMatchersTest.h
+++ b/unittests/ASTMatchers/ASTMatchersTest.h
@@ -37,8 +37,8 @@
 // If 'FindResultVerifier' is NULL, sets *Verified to true when Run is called.
 class VerifyMatch : public MatchFinder::MatchCallback {
 public:
-  VerifyMatch(BoundNodesCallback *FindResultVerifier, bool *Verified)
-      : Verified(Verified), FindResultReviewer(FindResultVerifier) {}
+  VerifyMatch(std::unique_ptr<BoundNodesCallback> FindResultVerifier, bool *Verified)
+      : Verified(Verified), FindResultReviewer(std::move(FindResultVerifier)) {}
 
   void run(const MatchFinder::MatchResult &Result) override {
     if (FindResultReviewer != nullptr) {
@@ -55,7 +55,7 @@
 
 private:
   bool *const Verified;
-  BoundNodesCallback *const FindResultReviewer;
+  const std::unique_ptr<BoundNodesCallback> FindResultReviewer;
 };
 
 template <typename T>
@@ -73,15 +73,19 @@
     return testing::AssertionFailure() << "Could not add dynamic matcher";
   std::unique_ptr<FrontendActionFactory> Factory(
       newFrontendActionFactory(&Finder));
-  // Some tests use typeof, which is a gnu extension.
-  std::vector<std::string> Args;
-  Args.push_back(CompileArg);
-  // Some tests need rtti/exceptions on
-  Args.push_back("-frtti");
-  Args.push_back("-fexceptions");
-  if (!runToolOnCodeWithArgs(Factory->create(), Code, Args, Filename,
-                             std::make_shared<PCHContainerOperations>(),
-                             VirtualMappedFiles)) {
+  // Some tests need rtti/exceptions on.  Use an unknown-unknown triple so we
+  // don't instantiate the full system toolchain.  On Linux, instantiating the
+  // toolchain involves stat'ing large portions of /usr/lib, and this slows down
+  // not only this test, but all other tests, via contention in the kernel.
+  //
+  // FIXME: This is a hack to work around the fact that there's no way to do the
+  // equivalent of runToolOnCodeWithArgs without instantiating a full Driver.
+  // We should consider having a function, at least for tests, that invokes cc1.
+  std::vector<std::string> Args = {CompileArg, "-frtti", "-fexceptions",
+                                   "-target", "i386-unknown-unknown"};
+  if (!runToolOnCodeWithArgs(
+          Factory->create(), Code, Args, Filename, "clang-tool",
+          std::make_shared<PCHContainerOperations>(), VirtualMappedFiles)) {
     return testing::AssertionFailure() << "Parsing error in \"" << Code << "\"";
   }
   if (Found != DynamicFound) {
@@ -126,6 +130,13 @@
 }
 
 template <typename T>
+testing::AssertionResult matchesC99(const std::string &Code,
+                                    const T &AMatcher) {
+  return matchesConditionally(Code, AMatcher, true, "-std=c99",
+                              FileContentMappings(), "input.c");
+}
+
+template <typename T>
 testing::AssertionResult notMatchesC(const std::string &Code,
                                      const T &AMatcher) {
   return matchesConditionally(Code, AMatcher, false, "", FileContentMappings(),
@@ -173,13 +184,12 @@
     return testing::AssertionFailure() << "Could not add dynamic matcher";
   std::unique_ptr<FrontendActionFactory> Factory(
       newFrontendActionFactory(&Finder));
-  // Some tests use typeof, which is a gnu extension.
-  std::vector<std::string> Args;
-  Args.push_back("-xcuda");
-  Args.push_back("-fno-ms-extensions");
-  Args.push_back("--cuda-host-only");
-  Args.push_back("-nocudainc");
-  Args.push_back(CompileArg);
+  // Some tests use typeof, which is a gnu extension.  Using an explicit
+  // unknown-unknown triple is good for a large speedup, because it lets us
+  // avoid constructing a full system triple.
+  std::vector<std::string> Args = {
+      "-xcuda",  "-fno-ms-extensions",      "--cuda-host-only", "-nocudainc",
+      "-target", "x86_64-unknown-unknown", CompileArg};
   if (!runToolOnCodeWithArgs(Factory->create(),
                              CudaHeader + Code, Args)) {
     return testing::AssertionFailure() << "Parsing error in \"" << Code << "\"";
@@ -215,17 +225,19 @@
 template <typename T>
 testing::AssertionResult
 matchAndVerifyResultConditionally(const std::string &Code, const T &AMatcher,
-                                  BoundNodesCallback *FindResultVerifier,
+                                  std::unique_ptr<BoundNodesCallback> FindResultVerifier,
                                   bool ExpectResult) {
-  std::unique_ptr<BoundNodesCallback> ScopedVerifier(FindResultVerifier);
   bool VerifiedResult = false;
   MatchFinder Finder;
-  VerifyMatch VerifyVerifiedResult(FindResultVerifier, &VerifiedResult);
+  VerifyMatch VerifyVerifiedResult(std::move(FindResultVerifier), &VerifiedResult);
   Finder.addMatcher(AMatcher, &VerifyVerifiedResult);
   std::unique_ptr<FrontendActionFactory> Factory(
       newFrontendActionFactory(&Finder));
-  // Some tests use typeof, which is a gnu extension.
-  std::vector<std::string> Args(1, "-std=gnu++98");
+  // Some tests use typeof, which is a gnu extension.  Using an explicit
+  // unknown-unknown triple is good for a large speedup, because it lets us
+  // avoid constructing a full system triple.
+  std::vector<std::string> Args = {"-std=gnu++98", "-target",
+                                   "i386-unknown-unknown"};
   if (!runToolOnCodeWithArgs(Factory->create(), Code, Args)) {
     return testing::AssertionFailure() << "Parsing error in \"" << Code << "\"";
   }
@@ -259,20 +271,92 @@
 template <typename T>
 testing::AssertionResult
 matchAndVerifyResultTrue(const std::string &Code, const T &AMatcher,
-                         BoundNodesCallback *FindResultVerifier) {
+                         std::unique_ptr<BoundNodesCallback> FindResultVerifier) {
   return matchAndVerifyResultConditionally(
-      Code, AMatcher, FindResultVerifier, true);
+      Code, AMatcher, std::move(FindResultVerifier), true);
 }
 
 template <typename T>
 testing::AssertionResult
 matchAndVerifyResultFalse(const std::string &Code, const T &AMatcher,
-                          BoundNodesCallback *FindResultVerifier) {
+                          std::unique_ptr<BoundNodesCallback> FindResultVerifier) {
   return matchAndVerifyResultConditionally(
-      Code, AMatcher, FindResultVerifier, false);
+      Code, AMatcher, std::move(FindResultVerifier), false);
 }
 
-} // end namespace ast_matchers
-} // end namespace clang
+// Implements a run method that returns whether BoundNodes contains a
+// node bound to Id that can be retrieved as a T (via getNodeAs<T>).
+// Optionally checks that the check succeeded a specific number of times.
+template <typename T>
+class VerifyIdIsBoundTo : public BoundNodesCallback {
+public:
+  // Create an object that checks that a node of type \c T was bound to \c Id.
+  // Does not check for a certain number of matches.
+  explicit VerifyIdIsBoundTo(llvm::StringRef Id)
+    : Id(Id), ExpectedCount(-1), Count(0) {}
+
+  // Create an object that checks that a node of type \c T was bound to \c Id.
+  // Checks that there were exactly \c ExpectedCount matches.
+  VerifyIdIsBoundTo(llvm::StringRef Id, int ExpectedCount)
+    : Id(Id), ExpectedCount(ExpectedCount), Count(0) {}
+
+  // Create an object that checks that a node of type \c T was bound to \c Id.
+  // Checks that there was exactly one match with the name \c ExpectedName.
+  // Note that \c T must be a NamedDecl for this to work.
+  VerifyIdIsBoundTo(llvm::StringRef Id, llvm::StringRef ExpectedName,
+                    int ExpectedCount = 1)
+    : Id(Id), ExpectedCount(ExpectedCount), Count(0),
+      ExpectedName(ExpectedName) {}
+
+  void onEndOfTranslationUnit() override {
+    if (ExpectedCount != -1)
+      EXPECT_EQ(ExpectedCount, Count);
+    if (!ExpectedName.empty())
+      EXPECT_EQ(ExpectedName, Name);
+    Count = 0;
+    Name.clear();
+  }
+
+  ~VerifyIdIsBoundTo() override {
+    EXPECT_EQ(0, Count);
+    EXPECT_EQ("", Name);
+  }
+
+  bool run(const BoundNodes *Nodes) override {
+    const BoundNodes::IDToNodeMap &M = Nodes->getMap();
+    if (Nodes->getNodeAs<T>(Id)) {
+      ++Count;
+      if (const NamedDecl *Named = Nodes->getNodeAs<NamedDecl>(Id)) {
+        Name = Named->getNameAsString();
+      } else if (const NestedNameSpecifier *NNS =
+        Nodes->getNodeAs<NestedNameSpecifier>(Id)) {
+        llvm::raw_string_ostream OS(Name);
+        NNS->print(OS, PrintingPolicy(LangOptions()));
+      }
+      BoundNodes::IDToNodeMap::const_iterator I = M.find(Id);
+      EXPECT_NE(M.end(), I);
+      if (I != M.end())
+        EXPECT_EQ(Nodes->getNodeAs<T>(Id), I->second.get<T>());
+      return true;
+    }
+    EXPECT_TRUE(M.count(Id) == 0 ||
+      M.find(Id)->second.template get<T>() == nullptr);
+    return false;
+  }
+
+  bool run(const BoundNodes *Nodes, ASTContext *Context) override {
+    return run(Nodes);
+  }
+
+private:
+  const std::string Id;
+  const int ExpectedCount;
+  int Count;
+  const std::string ExpectedName;
+  std::string Name;
+};
+
+} // namespace ast_matchers
+} // namespace clang
 
 #endif  // LLVM_CLANG_UNITTESTS_AST_MATCHERS_AST_MATCHERS_TEST_H
diff --git a/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
new file mode 100644
index 0000000..fcd3dcb
--- /dev/null
+++ b/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -0,0 +1,2110 @@
+//= unittests/ASTMatchers/ASTMatchersTraversalTest.cpp - matchers unit tests =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ASTMatchersTest.h"
+#include "clang/AST/PrettyPrinter.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Host.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace ast_matchers {
+
+TEST(DeclarationMatcher, hasMethod) {
+  EXPECT_TRUE(matches("class A { void func(); };",
+                      cxxRecordDecl(hasMethod(hasName("func")))));
+  EXPECT_TRUE(notMatches("class A { void func(); };",
+                         cxxRecordDecl(hasMethod(isPublic()))));
+}
+
+TEST(DeclarationMatcher, ClassDerivedFromDependentTemplateSpecialization) {
+  EXPECT_TRUE(matches(
+    "template <typename T> struct A {"
+      "  template <typename T2> struct F {};"
+      "};"
+      "template <typename T> struct B : A<T>::template F<T> {};"
+      "B<int> b;",
+    cxxRecordDecl(hasName("B"), isDerivedFrom(recordDecl()))));
+}
+
+TEST(DeclarationMatcher, hasDeclContext) {
+  EXPECT_TRUE(matches(
+    "namespace N {"
+      "  namespace M {"
+      "    class D {};"
+      "  }"
+      "}",
+    recordDecl(hasDeclContext(namespaceDecl(hasName("M"))))));
+  EXPECT_TRUE(notMatches(
+    "namespace N {"
+      "  namespace M {"
+      "    class D {};"
+      "  }"
+      "}",
+    recordDecl(hasDeclContext(namespaceDecl(hasName("N"))))));
+
+  EXPECT_TRUE(matches("namespace {"
+                        "  namespace M {"
+                        "    class D {};"
+                        "  }"
+                        "}",
+                      recordDecl(hasDeclContext(namespaceDecl(
+                        hasName("M"), hasDeclContext(namespaceDecl()))))));
+
+  EXPECT_TRUE(matches("class D{};", decl(hasDeclContext(decl()))));
+}
+
+TEST(HasDescendant, MatchesDescendantTypes) {
+  EXPECT_TRUE(matches("void f() { int i = 3; }",
+                      decl(hasDescendant(loc(builtinType())))));
+  EXPECT_TRUE(matches("void f() { int i = 3; }",
+                      stmt(hasDescendant(builtinType()))));
+
+  EXPECT_TRUE(matches("void f() { int i = 3; }",
+                      stmt(hasDescendant(loc(builtinType())))));
+  EXPECT_TRUE(matches("void f() { int i = 3; }",
+                      stmt(hasDescendant(qualType(builtinType())))));
+
+  EXPECT_TRUE(notMatches("void f() { float f = 2.0f; }",
+                         stmt(hasDescendant(isInteger()))));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f() { int a; float c; int d; int e; }",
+    functionDecl(forEachDescendant(
+      varDecl(hasDescendant(isInteger())).bind("x"))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 3)));
+}
+
+TEST(HasDescendant, MatchesDescendantsOfTypes) {
+  EXPECT_TRUE(matches("void f() { int*** i; }",
+                      qualType(hasDescendant(builtinType()))));
+  EXPECT_TRUE(matches("void f() { int*** i; }",
+                      qualType(hasDescendant(
+                        pointerType(pointee(builtinType()))))));
+  EXPECT_TRUE(matches("void f() { int*** i; }",
+                      typeLoc(hasDescendant(loc(builtinType())))));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f() { int*** i; }",
+    qualType(asString("int ***"), forEachDescendant(pointerType().bind("x"))),
+    llvm::make_unique<VerifyIdIsBoundTo<Type>>("x", 2)));
+}
+
+
+TEST(Has, MatchesChildrenOfTypes) {
+  EXPECT_TRUE(matches("int i;",
+                      varDecl(hasName("i"), has(isInteger()))));
+  EXPECT_TRUE(notMatches("int** i;",
+                         varDecl(hasName("i"), has(isInteger()))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "int (*f)(float, int);",
+    qualType(functionType(), forEach(qualType(isInteger()).bind("x"))),
+    llvm::make_unique<VerifyIdIsBoundTo<QualType>>("x", 2)));
+}
+
+TEST(Has, MatchesChildTypes) {
+  EXPECT_TRUE(matches(
+    "int* i;",
+    varDecl(hasName("i"), hasType(qualType(has(builtinType()))))));
+  EXPECT_TRUE(notMatches(
+    "int* i;",
+    varDecl(hasName("i"), hasType(qualType(has(pointerType()))))));
+}
+
+TEST(StatementMatcher, Has) {
+  StatementMatcher HasVariableI =
+      expr(hasType(pointsTo(recordDecl(hasName("X")))),
+           has(ignoringParenImpCasts(declRefExpr(to(varDecl(hasName("i")))))));
+
+  EXPECT_TRUE(matches(
+    "class X; X *x(int); void c() { int i; x(i); }", HasVariableI));
+  EXPECT_TRUE(notMatches(
+    "class X; X *x(int); void c() { int i; x(42); }", HasVariableI));
+}
+
+TEST(StatementMatcher, HasDescendant) {
+  StatementMatcher HasDescendantVariableI =
+    expr(hasType(pointsTo(recordDecl(hasName("X")))),
+         hasDescendant(declRefExpr(to(varDecl(hasName("i"))))));
+
+  EXPECT_TRUE(matches(
+    "class X; X *x(bool); bool b(int); void c() { int i; x(b(i)); }",
+    HasDescendantVariableI));
+  EXPECT_TRUE(notMatches(
+    "class X; X *x(bool); bool b(int); void c() { int i; x(b(42)); }",
+    HasDescendantVariableI));
+}
+
+TEST(TypeMatcher, MatchesClassType) {
+  TypeMatcher TypeA = hasDeclaration(recordDecl(hasName("A")));
+  // Expect a match only when the code uses type A (here the member "A *a").
+  EXPECT_TRUE(matches("class A { public: A *a; };", TypeA));
+  EXPECT_TRUE(notMatches("class A {};", TypeA));
+
+  TypeMatcher TypeDerivedFromA =
+    hasDeclaration(cxxRecordDecl(isDerivedFrom("A")));
+  // B derives from A and is used as a type ("B *b"); A alone must not match.
+  EXPECT_TRUE(matches("class A {}; class B : public A { public: B *b; };",
+                      TypeDerivedFromA));
+  EXPECT_TRUE(notMatches("class A {};", TypeDerivedFromA));
+
+  TypeMatcher TypeAHasClassB = hasDeclaration(
+    recordDecl(hasName("A"), has(recordDecl(hasName("B")))));
+
+  EXPECT_TRUE(
+    matches("class A { public: A *a; class B {}; };", TypeAHasClassB));
+  // Record types declared and used in C code are matched as well.
+  EXPECT_TRUE(matchesC("struct S {}; void f(void) { struct S s; }",
+                       varDecl(hasType(namedDecl(hasName("S"))))));
+}
+
+TEST(TypeMatcher, MatchesDeclTypes) {
+  // TypedefType -> TypedefNameDecl
+  EXPECT_TRUE(matches("typedef int I; void f(I i);",
+                      parmVarDecl(hasType(namedDecl(hasName("I"))))));
+  // ObjCObjectPointerType
+  EXPECT_TRUE(matchesObjC("@interface Foo @end void f(Foo *f);",
+                          parmVarDecl(hasType(objcObjectPointerType()))));
+  // ObjCObjectPointerType -> ObjCInterfaceType -> ObjCInterfaceDecl
+  EXPECT_TRUE(matchesObjC(
+    "@interface Foo @end void f(Foo *f);",
+    parmVarDecl(hasType(pointsTo(objcInterfaceDecl(hasName("Foo")))))));
+  // TemplateTypeParmType
+  EXPECT_TRUE(matches("template <typename T> void f(T t);",
+                      parmVarDecl(hasType(templateTypeParmType()))));
+  // TemplateTypeParmType -> TemplateTypeParmDecl
+  EXPECT_TRUE(matches("template <typename T> void f(T t);",
+                      parmVarDecl(hasType(namedDecl(hasName("T"))))));
+  // InjectedClassNameType
+  EXPECT_TRUE(matches("template <typename T> struct S {"
+                        "  void f(S s);"
+                        "};",
+                      parmVarDecl(hasType(injectedClassNameType()))));
+  EXPECT_TRUE(notMatches("template <typename T> struct S {"
+                           "  void g(S<T> s);"
+                           "};",
+                         parmVarDecl(hasType(injectedClassNameType()))));
+  // InjectedClassNameType -> CXXRecordDecl
+  EXPECT_TRUE(matches("template <typename T> struct S {"
+                        "  void f(S s);"
+                        "};",
+                      parmVarDecl(hasType(namedDecl(hasName("S"))))));
+
+  static const char Using[] = "template <typename T>"
+    "struct Base {"
+    "  typedef T Foo;"
+    "};"
+    ""
+    "template <typename T>"
+    "struct S : private Base<T> {"
+    "  using typename Base<T>::Foo;"
+    "  void f(Foo);"
+    "};";
+  // UnresolvedUsingTypenameDecl
+  EXPECT_TRUE(matches(Using, unresolvedUsingTypenameDecl(hasName("Foo"))));
+  // UnresolvedUsingTypenameType -> UnresolvedUsingTypenameDecl
+  EXPECT_TRUE(matches(Using, parmVarDecl(hasType(namedDecl(hasName("Foo"))))));
+}
+
+TEST(HasDeclaration, HasDeclarationOfEnumType) {
+  EXPECT_TRUE(matches("enum X {}; void y(X *x) { x; }",
+                      expr(hasType(pointsTo(
+                        qualType(hasDeclaration(enumDecl(hasName("X")))))))));
+}
+
+TEST(HasDeclaration, HasGetDeclTraitTest) {
+  EXPECT_TRUE(internal::has_getDecl<TypedefType>::value);
+  EXPECT_TRUE(internal::has_getDecl<RecordType>::value);
+  EXPECT_FALSE(internal::has_getDecl<TemplateSpecializationType>::value);
+}
+
+TEST(HasDeclaration, HasDeclarationOfTypeWithDecl) {
+  EXPECT_TRUE(matches("typedef int X; X a;",
+                      varDecl(hasName("a"),
+                              hasType(typedefType(hasDeclaration(decl()))))));
+
+  // FIXME: Add tests for other types with getDecl() (e.g. RecordType)
+}
+
+TEST(HasDeclaration, HasDeclarationOfTemplateSpecializationType) {
+  EXPECT_TRUE(matches("template <typename T> class A {}; A<int> a;",
+                      varDecl(hasType(templateSpecializationType(
+                        hasDeclaration(namedDecl(hasName("A"))))))));
+}
+
+TEST(HasUnderlyingDecl, Matches) {
+  EXPECT_TRUE(matches("namespace N { template <class T> void f(T t); }"
+                      "template <class T> void g() { using N::f; f(T()); }",
+                      unresolvedLookupExpr(hasAnyDeclaration(
+                          namedDecl(hasUnderlyingDecl(hasName("::N::f")))))));
+  EXPECT_TRUE(matches(
+      "namespace N { template <class T> void f(T t); }"
+      "template <class T> void g() { N::f(T()); }",
+      unresolvedLookupExpr(hasAnyDeclaration(namedDecl(hasName("::N::f"))))));
+  EXPECT_TRUE(notMatches(
+      "namespace N { template <class T> void f(T t); }"
+      "template <class T> void g() { using N::f; f(T()); }",
+      unresolvedLookupExpr(hasAnyDeclaration(namedDecl(hasName("::N::f"))))));
+}
+
+TEST(HasType, TakesQualTypeMatcherAndMatchesExpr) {
+  TypeMatcher ClassX = hasDeclaration(recordDecl(hasName("X")));
+  EXPECT_TRUE(
+    matches("class X {}; void y(X &x) { x; }", expr(hasType(ClassX))));
+  EXPECT_TRUE(
+    notMatches("class X {}; void y(X *x) { x; }",
+               expr(hasType(ClassX))));
+  EXPECT_TRUE(
+    matches("class X {}; void y(X *x) { x; }",
+            expr(hasType(pointsTo(ClassX)))));
+}
+
+TEST(HasType, TakesQualTypeMatcherAndMatchesValueDecl) {
+  TypeMatcher ClassX = hasDeclaration(recordDecl(hasName("X")));
+  EXPECT_TRUE(
+    matches("class X {}; void y() { X x; }", varDecl(hasType(ClassX))));
+  EXPECT_TRUE(
+    notMatches("class X {}; void y() { X *x; }", varDecl(hasType(ClassX))));
+  EXPECT_TRUE(
+    matches("class X {}; void y() { X *x; }",
+            varDecl(hasType(pointsTo(ClassX)))));
+}
+
+TEST(HasType, TakesDeclMatcherAndMatchesExpr) {
+  DeclarationMatcher ClassX = recordDecl(hasName("X"));
+  EXPECT_TRUE(
+    matches("class X {}; void y(X &x) { x; }", expr(hasType(ClassX))));
+  EXPECT_TRUE(
+    notMatches("class X {}; void y(X *x) { x; }",
+               expr(hasType(ClassX))));
+}
+
+TEST(HasType, TakesDeclMatcherAndMatchesValueDecl) {
+  DeclarationMatcher ClassX = recordDecl(hasName("X"));
+  EXPECT_TRUE(
+    matches("class X {}; void y() { X x; }", varDecl(hasType(ClassX))));
+  EXPECT_TRUE(
+    notMatches("class X {}; void y() { X *x; }", varDecl(hasType(ClassX))));
+}
+
+TEST(HasType, MatchesTypedefDecl) {
+  EXPECT_TRUE(matches("typedef int X;", typedefDecl(hasType(asString("int")))));
+  EXPECT_TRUE(matches("typedef const int T;",
+                      typedefDecl(hasType(asString("const int")))));
+  EXPECT_TRUE(notMatches("typedef const int T;",
+                         typedefDecl(hasType(asString("int")))));
+  EXPECT_TRUE(matches("typedef int foo; typedef foo bar;",
+                      typedefDecl(hasType(asString("foo")), hasName("bar"))));
+}
+
+TEST(HasType, MatchesTypedefNameDecl) {
+  EXPECT_TRUE(matches("using X = int;", typedefNameDecl(hasType(asString("int")))));
+  EXPECT_TRUE(matches("using T = const int;",
+                      typedefNameDecl(hasType(asString("const int")))));
+  EXPECT_TRUE(notMatches("using T = const int;",
+                         typedefNameDecl(hasType(asString("int")))));
+  EXPECT_TRUE(matches("using foo = int; using bar = foo;",
+                      typedefNameDecl(hasType(asString("foo")), hasName("bar"))));
+}
+
+TEST(HasTypeLoc, MatchesDeclaratorDecls) {
+  EXPECT_TRUE(matches("int x;",
+                      varDecl(hasName("x"), hasTypeLoc(loc(asString("int"))))));
+
+  // Make sure we don't crash on implicit constructors.
+  EXPECT_TRUE(notMatches("class X {}; X x;",
+                         declaratorDecl(hasTypeLoc(loc(asString("int"))))));
+}
+
+
+TEST(Callee, MatchesDeclarations) {
+  StatementMatcher CallMethodX = callExpr(callee(cxxMethodDecl(hasName("x"))));
+
+  EXPECT_TRUE(matches("class Y { void x() { x(); } };", CallMethodX));
+  EXPECT_TRUE(notMatches("class Y { void x() {} };", CallMethodX));
+
+  CallMethodX = callExpr(callee(cxxConversionDecl()));
+  EXPECT_TRUE(
+    matches("struct Y { operator int() const; }; int i = Y();", CallMethodX));
+  EXPECT_TRUE(notMatches("struct Y { operator int() const; }; Y y = Y();",
+                         CallMethodX));
+}
+
+TEST(Callee, MatchesMemberExpressions) {
+  EXPECT_TRUE(matches("class Y { void x() { this->x(); } };",
+                      callExpr(callee(memberExpr()))));
+  EXPECT_TRUE(
+    notMatches("class Y { void x() { this->x(); } };", callExpr(callee(callExpr()))));
+}
+
+TEST(Matcher, Argument) {
+  StatementMatcher CallArgumentY = callExpr(
+    hasArgument(0, declRefExpr(to(varDecl(hasName("y"))))));
+
+  EXPECT_TRUE(matches("void x(int) { int y; x(y); }", CallArgumentY));
+  EXPECT_TRUE(
+    matches("class X { void x(int) { int y; x(y); } };", CallArgumentY));
+  EXPECT_TRUE(notMatches("void x(int) { int z; x(z); }", CallArgumentY));
+
+  StatementMatcher WrongIndex = callExpr(
+    hasArgument(42, declRefExpr(to(varDecl(hasName("y"))))));
+  EXPECT_TRUE(notMatches("void x(int) { int y; x(y); }", WrongIndex));
+}
+
+TEST(Matcher, AnyArgument) {
+  StatementMatcher CallArgumentY = callExpr(
+    hasAnyArgument(
+      ignoringParenImpCasts(declRefExpr(to(varDecl(hasName("y")))))));
+  EXPECT_TRUE(matches("void x(int, int) { int y; x(1, y); }", CallArgumentY));
+  EXPECT_TRUE(matches("void x(int, int) { int y; x(y, 42); }", CallArgumentY));
+  EXPECT_TRUE(notMatches("void x(int, int) { x(1, 2); }", CallArgumentY));
+
+  StatementMatcher ImplicitCastedArgument = callExpr(
+    hasAnyArgument(implicitCastExpr()));
+  EXPECT_TRUE(matches("void x(long) { int y; x(y); }", ImplicitCastedArgument));
+}
+
+TEST(ForEachArgumentWithParam, ReportsNoFalsePositives) {
+  StatementMatcher ArgumentY =
+    declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
+  StatementMatcher CallExpr =
+    callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
+
+  // IntParam does not match.
+  EXPECT_TRUE(notMatches("void f(int* i) { int* y; f(y); }", CallExpr));
+  // ArgumentY does not match.
+  EXPECT_TRUE(notMatches("void f(int i) { int x; f(x); }", CallExpr));
+}
+
+TEST(ForEachArgumentWithParam, MatchesCXXMemberCallExpr) {
+  StatementMatcher ArgumentY =
+    declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
+  StatementMatcher CallExpr =
+    callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "struct S {"
+      "  const S& operator[](int i) { return *this; }"
+      "};"
+      "void f(S S1) {"
+      "  int y = 1;"
+      "  S1[y];"
+      "}",
+    CallExpr, llvm::make_unique<VerifyIdIsBoundTo<ParmVarDecl>>("param", 1)));
+
+  StatementMatcher CallExpr2 =
+    callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "struct S {"
+      "  static void g(int i);"
+      "};"
+      "void f() {"
+      "  int y = 1;"
+      "  S::g(y);"
+      "}",
+    CallExpr2, llvm::make_unique<VerifyIdIsBoundTo<ParmVarDecl>>("param", 1)));
+}
+
+TEST(ForEachArgumentWithParam, MatchesCallExpr) {
+  StatementMatcher ArgumentY =
+    declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
+  StatementMatcher CallExpr =
+    callExpr(forEachArgumentWithParam(ArgumentY, IntParam));
+
+  EXPECT_TRUE(
+    matchAndVerifyResultTrue("void f(int i) { int y; f(y); }", CallExpr,
+                             llvm::make_unique<VerifyIdIsBoundTo<ParmVarDecl>>(
+                               "param")));
+  EXPECT_TRUE(
+    matchAndVerifyResultTrue("void f(int i) { int y; f(y); }", CallExpr,
+                             llvm::make_unique<VerifyIdIsBoundTo<DeclRefExpr>>(
+                               "arg")));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f(int i, int j) { int y; f(y, y); }", CallExpr,
+    llvm::make_unique<VerifyIdIsBoundTo<ParmVarDecl>>("param", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f(int i, int j) { int y; f(y, y); }", CallExpr,
+    llvm::make_unique<VerifyIdIsBoundTo<DeclRefExpr>>("arg", 2)));
+}
+
+TEST(ForEachArgumentWithParam, MatchesConstructExpr) {
+  StatementMatcher ArgumentY =
+    declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  DeclarationMatcher IntParam = parmVarDecl(hasType(isInteger())).bind("param");
+  StatementMatcher ConstructExpr =
+    cxxConstructExpr(forEachArgumentWithParam(ArgumentY, IntParam));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "struct C {"
+      "  C(int i) {}"
+      "};"
+      "int y = 0;"
+      "C Obj(y);",
+    ConstructExpr,
+    llvm::make_unique<VerifyIdIsBoundTo<ParmVarDecl>>("param")));
+}
+
+TEST(ForEachArgumentWithParam, HandlesBoundNodesForNonMatches) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void g(int i, int j) {"
+      "  int a;"
+      "  int b;"
+      "  int c;"
+      "  g(a, 0);"
+      "  g(a, b);"
+      "  g(0, b);"
+      "}",
+    functionDecl(
+      forEachDescendant(varDecl().bind("v")),
+      forEachDescendant(callExpr(forEachArgumentWithParam(
+        declRefExpr(to(decl(equalsBoundNode("v")))), parmVarDecl())))),
+    llvm::make_unique<VerifyIdIsBoundTo<VarDecl>>("v", 4)));
+}
+
+TEST(QualType, hasCanonicalType) {
+  EXPECT_TRUE(notMatches("typedef int &int_ref;"
+                           "int a;"
+                           "int_ref b = a;",
+                         varDecl(hasType(qualType(referenceType())))));
+  EXPECT_TRUE(
+    matches("typedef int &int_ref;"
+              "int a;"
+              "int_ref b = a;",
+            varDecl(hasType(qualType(hasCanonicalType(referenceType()))))));
+}
+
+TEST(HasParameter, CallsInnerMatcher) {
+  EXPECT_TRUE(matches("class X { void x(int) {} };",
+                      cxxMethodDecl(hasParameter(0, varDecl()))));
+  EXPECT_TRUE(notMatches("class X { void x(int) {} };",
+                         cxxMethodDecl(hasParameter(0, hasName("x")))));
+}
+
+TEST(HasParameter, DoesNotMatchIfIndexOutOfBounds) {
+  EXPECT_TRUE(notMatches("class X { void x(int) {} };",
+                         cxxMethodDecl(hasParameter(42, varDecl()))));
+}
+
+TEST(HasType, MatchesParameterVariableTypesStrictly) {
+  EXPECT_TRUE(matches(
+    "class X { void x(X x) {} };",
+    cxxMethodDecl(hasParameter(0, hasType(recordDecl(hasName("X")))))));
+  EXPECT_TRUE(notMatches(
+    "class X { void x(const X &x) {} };",
+    cxxMethodDecl(hasParameter(0, hasType(recordDecl(hasName("X")))))));
+  EXPECT_TRUE(matches("class X { void x(const X *x) {} };",
+                      cxxMethodDecl(hasParameter(
+                        0, hasType(pointsTo(recordDecl(hasName("X"))))))));
+  EXPECT_TRUE(matches("class X { void x(const X &x) {} };",
+                      cxxMethodDecl(hasParameter(
+                        0, hasType(references(recordDecl(hasName("X"))))))));
+}
+
+TEST(HasAnyParameter, MatchesIndependentlyOfPosition) {
+  EXPECT_TRUE(matches(
+    "class Y {}; class X { void x(X x, Y y) {} };",
+    cxxMethodDecl(hasAnyParameter(hasType(recordDecl(hasName("X")))))));
+  EXPECT_TRUE(matches(
+    "class Y {}; class X { void x(Y y, X x) {} };",
+    cxxMethodDecl(hasAnyParameter(hasType(recordDecl(hasName("X")))))));
+}
+
+TEST(Returns, MatchesReturnTypes) {
+  EXPECT_TRUE(matches("class Y { int f() { return 1; } };",
+                      functionDecl(returns(asString("int")))));
+  EXPECT_TRUE(notMatches("class Y { int f() { return 1; } };",
+                         functionDecl(returns(asString("float")))));
+  EXPECT_TRUE(matches("class Y { Y getMe() { return *this; } };",
+                      functionDecl(returns(hasDeclaration(
+                        recordDecl(hasName("Y")))))));
+}
+
+TEST(HasAnyParameter, DoesntMatchIfInnerMatcherDoesntMatch) {
+  EXPECT_TRUE(notMatches(
+    "class Y {}; class X { void x(int) {} };",
+    cxxMethodDecl(hasAnyParameter(hasType(recordDecl(hasName("X")))))));
+}
+
+TEST(HasAnyParameter, DoesNotMatchThisPointer) {
+  EXPECT_TRUE(notMatches("class Y {}; class X { void x() {} };",
+                         cxxMethodDecl(hasAnyParameter(
+                           hasType(pointsTo(recordDecl(hasName("X"))))))));
+}
+
+TEST(HasName, MatchesParameterVariableDeclarations) {
+  EXPECT_TRUE(matches("class Y {}; class X { void x(int x) {} };",
+                      cxxMethodDecl(hasAnyParameter(hasName("x")))));
+  EXPECT_TRUE(notMatches("class Y {}; class X { void x(int) {} };",
+                         cxxMethodDecl(hasAnyParameter(hasName("x")))));
+}
+
+TEST(Matcher, MatchesTypeTemplateArgument) {
+  EXPECT_TRUE(matches(
+    "template<typename T> struct B {};"
+      "B<int> b;",
+    classTemplateSpecializationDecl(hasAnyTemplateArgument(refersToType(
+      asString("int"))))));
+}
+
+TEST(Matcher, MatchesTemplateTemplateArgument) {
+  EXPECT_TRUE(matches("template<template <typename> class S> class X {};"
+                      "template<typename T> class Y {};"
+                      "X<Y> xi;",
+                      classTemplateSpecializationDecl(hasAnyTemplateArgument(
+                          refersToTemplate(templateName())))));
+}
+
+TEST(Matcher, MatchesDeclarationReferenceTemplateArgument) {
+  EXPECT_TRUE(matches(
+    "struct B { int next; };"
+      "template<int(B::*next_ptr)> struct A {};"
+      "A<&B::next> a;",
+    classTemplateSpecializationDecl(hasAnyTemplateArgument(
+      refersToDeclaration(fieldDecl(hasName("next")))))));
+
+  EXPECT_TRUE(notMatches(
+    "template <typename T> struct A {};"
+      "A<int> a;",
+    classTemplateSpecializationDecl(hasAnyTemplateArgument(
+      refersToDeclaration(decl())))));
+
+  EXPECT_TRUE(matches(
+    "struct B { int next; };"
+      "template<int(B::*next_ptr)> struct A {};"
+      "A<&B::next> a;",
+    templateSpecializationType(hasAnyTemplateArgument(isExpr(
+      hasDescendant(declRefExpr(to(fieldDecl(hasName("next"))))))))));
+
+  EXPECT_TRUE(notMatches(
+    "template <typename T> struct A {};"
+      "A<int> a;",
+    templateSpecializationType(hasAnyTemplateArgument(
+      refersToDeclaration(decl())))));
+}
+
+
+TEST(Matcher, MatchesSpecificArgument) {
+  EXPECT_TRUE(matches(
+    "template<typename T, typename U> class A {};"
+      "A<bool, int> a;",
+    classTemplateSpecializationDecl(hasTemplateArgument(
+      1, refersToType(asString("int"))))));
+  EXPECT_TRUE(notMatches(
+    "template<typename T, typename U> class A {};"
+      "A<int, bool> a;",
+    classTemplateSpecializationDecl(hasTemplateArgument(
+      1, refersToType(asString("int"))))));
+
+  EXPECT_TRUE(matches(
+    "template<typename T, typename U> class A {};"
+      "A<bool, int> a;",
+    templateSpecializationType(hasTemplateArgument(
+      1, refersToType(asString("int"))))));
+  EXPECT_TRUE(notMatches(
+    "template<typename T, typename U> class A {};"
+      "A<int, bool> a;",
+    templateSpecializationType(hasTemplateArgument(
+      1, refersToType(asString("int"))))));
+
+  EXPECT_TRUE(matches(
+    "template<typename T> void f() {};"
+      "void func() { f<int>(); }",
+    functionDecl(hasTemplateArgument(0, refersToType(asString("int"))))));
+  EXPECT_TRUE(notMatches(
+    "template<typename T> void f() {};",
+    functionDecl(hasTemplateArgument(0, refersToType(asString("int"))))));
+}
+
+TEST(TemplateArgument, Matches) {
+  EXPECT_TRUE(matches("template<typename T> struct C {}; C<int> c;",
+                      classTemplateSpecializationDecl(
+                        hasAnyTemplateArgument(templateArgument()))));
+  EXPECT_TRUE(matches(
+    "template<typename T> struct C {}; C<int> c;",
+    templateSpecializationType(hasAnyTemplateArgument(templateArgument()))));
+
+  EXPECT_TRUE(matches(
+    "template<typename T> void f() {};"
+      "void func() { f<int>(); }",
+    functionDecl(hasAnyTemplateArgument(templateArgument()))));
+}
+
+TEST(RefersToIntegralType, Matches) {
+  EXPECT_TRUE(matches("template<int T> struct C {}; C<42> c;",
+                      classTemplateSpecializationDecl(
+                        hasAnyTemplateArgument(refersToIntegralType(
+                          asString("int"))))));
+  EXPECT_TRUE(notMatches("template<unsigned T> struct C {}; C<42> c;",
+                         classTemplateSpecializationDecl(hasAnyTemplateArgument(
+                           refersToIntegralType(asString("int"))))));
+}
+
+TEST(ConstructorDeclaration, SimpleCase) {
+  EXPECT_TRUE(matches("class Foo { Foo(int i); };",
+                      cxxConstructorDecl(ofClass(hasName("Foo")))));
+  EXPECT_TRUE(notMatches("class Foo { Foo(int i); };",
+                         cxxConstructorDecl(ofClass(hasName("Bar")))));
+}
+
+TEST(DestructorDeclaration, MatchesVirtualDestructor) {
+  EXPECT_TRUE(matches("class Foo { virtual ~Foo(); };",
+                      cxxDestructorDecl(ofClass(hasName("Foo")))));
+}
+
+TEST(DestructorDeclaration, DoesNotMatchImplicitDestructor) {
+  EXPECT_TRUE(notMatches("class Foo {};",
+                         cxxDestructorDecl(ofClass(hasName("Foo")))));
+}
+
+TEST(HasAnyConstructorInitializer, SimpleCase) {
+  EXPECT_TRUE(
+    notMatches("class Foo { Foo() { } };",
+               cxxConstructorDecl(hasAnyConstructorInitializer(anything()))));
+  EXPECT_TRUE(
+    matches("class Foo {"
+              "  Foo() : foo_() { }"
+              "  int foo_;"
+              "};",
+            cxxConstructorDecl(hasAnyConstructorInitializer(anything()))));
+}
+
+TEST(HasAnyConstructorInitializer, ForField) {
+  static const char Code[] =
+    "class Baz { };"
+      "class Foo {"
+      "  Foo() : foo_() { }"
+      "  Baz foo_;"
+      "  Baz bar_;"
+      "};";
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    forField(hasType(recordDecl(hasName("Baz"))))))));
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    forField(hasName("foo_"))))));
+  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    forField(hasType(recordDecl(hasName("Bar"))))))));
+}
+
+TEST(HasAnyConstructorInitializer, WithInitializer) {
+  static const char Code[] =
+    "class Foo {"
+      "  Foo() : foo_(0) { }"
+      "  int foo_;"
+      "};";
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    withInitializer(integerLiteral(equals(0)))))));
+  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    withInitializer(integerLiteral(equals(1)))))));
+}
+
+TEST(HasAnyConstructorInitializer, IsWritten) {
+  static const char Code[] =
+    "struct Bar { Bar(){} };"
+      "class Foo {"
+      "  Foo() : foo_() { }"
+      "  Bar foo_;"
+      "  Bar bar_;"
+      "};";
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    allOf(forField(hasName("foo_")), isWritten())))));
+  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    allOf(forField(hasName("bar_")), isWritten())))));
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(hasAnyConstructorInitializer(
+    allOf(forField(hasName("bar_")), unless(isWritten()))))));
+}
+
+TEST(HasAnyConstructorInitializer, IsBaseInitializer) {
+  static const char Code[] =
+    "struct B {};"
+      "struct D : B {"
+      "  int I;"
+      "  D(int i) : I(i) {}"
+      "};"
+      "struct E : B {"
+      "  E() : B() {}"
+      "};";
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(allOf(
+    hasAnyConstructorInitializer(allOf(isBaseInitializer(), isWritten())),
+    hasName("E")))));
+  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(allOf(
+    hasAnyConstructorInitializer(allOf(isBaseInitializer(), isWritten())),
+    hasName("D")))));
+  EXPECT_TRUE(matches(Code, cxxConstructorDecl(allOf(
+    hasAnyConstructorInitializer(allOf(isMemberInitializer(), isWritten())),
+    hasName("D")))));
+  EXPECT_TRUE(notMatches(Code, cxxConstructorDecl(allOf(
+    hasAnyConstructorInitializer(allOf(isMemberInitializer(), isWritten())),
+    hasName("E")))));
+}
+
+TEST(IfStmt, ChildTraversalMatchers) {
+  EXPECT_TRUE(matches("void f() { if (false) true; else false; }",
+                      ifStmt(hasThen(cxxBoolLiteral(equals(true))))));
+  EXPECT_TRUE(notMatches("void f() { if (false) false; else true; }",
+                         ifStmt(hasThen(cxxBoolLiteral(equals(true))))));
+  EXPECT_TRUE(matches("void f() { if (false) false; else true; }",
+                      ifStmt(hasElse(cxxBoolLiteral(equals(true))))));
+  EXPECT_TRUE(notMatches("void f() { if (false) true; else false; }",
+                         ifStmt(hasElse(cxxBoolLiteral(equals(true))))));
+}
+
+TEST(MatchBinaryOperator, HasOperatorName) {
+  StatementMatcher OperatorOr = binaryOperator(hasOperatorName("||"));
+
+  EXPECT_TRUE(matches("void x() { true || false; }", OperatorOr));
+  EXPECT_TRUE(notMatches("void x() { true && false; }", OperatorOr));
+}
+
+TEST(MatchBinaryOperator, HasLHSAndHasRHS) {
+  StatementMatcher OperatorTrueFalse =
+    binaryOperator(hasLHS(cxxBoolLiteral(equals(true))),
+                   hasRHS(cxxBoolLiteral(equals(false))));
+
+  EXPECT_TRUE(matches("void x() { true || false; }", OperatorTrueFalse));
+  EXPECT_TRUE(matches("void x() { true && false; }", OperatorTrueFalse));
+  EXPECT_TRUE(notMatches("void x() { false || true; }", OperatorTrueFalse));
+
+  StatementMatcher OperatorIntPointer = arraySubscriptExpr(
+    hasLHS(hasType(isInteger())), hasRHS(hasType(pointsTo(qualType()))));
+  EXPECT_TRUE(matches("void x() { 1[\"abc\"]; }", OperatorIntPointer));
+  EXPECT_TRUE(notMatches("void x() { \"abc\"[1]; }", OperatorIntPointer));
+}
+
+TEST(MatchBinaryOperator, HasEitherOperand) {
+  StatementMatcher HasOperand =
+    binaryOperator(hasEitherOperand(cxxBoolLiteral(equals(false))));
+
+  EXPECT_TRUE(matches("void x() { true || false; }", HasOperand));
+  EXPECT_TRUE(matches("void x() { false && true; }", HasOperand));
+  EXPECT_TRUE(notMatches("void x() { true || true; }", HasOperand));
+}
+
+TEST(Matcher, BinaryOperatorTypes) {
+  // Integration test that verifies the AST provides all binary operators in
+  // a way we expect.
+  // FIXME: Operator ','
+  EXPECT_TRUE(
+    matches("void x() { 3, 4; }", binaryOperator(hasOperatorName(","))));
+  EXPECT_TRUE(
+    matches("bool b; bool c = (b = true);",
+            binaryOperator(hasOperatorName("="))));
+  EXPECT_TRUE(
+    matches("bool b = 1 != 2;", binaryOperator(hasOperatorName("!="))));
+  EXPECT_TRUE(
+    matches("bool b = 1 == 2;", binaryOperator(hasOperatorName("=="))));
+  EXPECT_TRUE(matches("bool b = 1 < 2;", binaryOperator(hasOperatorName("<"))));
+  EXPECT_TRUE(
+    matches("bool b = 1 <= 2;", binaryOperator(hasOperatorName("<="))));
+  EXPECT_TRUE(
+    matches("int i = 1 << 2;", binaryOperator(hasOperatorName("<<"))));
+  EXPECT_TRUE(
+    matches("int i = 1; int j = (i <<= 2);",
+            binaryOperator(hasOperatorName("<<="))));
+  EXPECT_TRUE(matches("bool b = 1 > 2;", binaryOperator(hasOperatorName(">"))));
+  EXPECT_TRUE(
+    matches("bool b = 1 >= 2;", binaryOperator(hasOperatorName(">="))));
+  EXPECT_TRUE(
+    matches("int i = 1 >> 2;", binaryOperator(hasOperatorName(">>"))));
+  EXPECT_TRUE(
+    matches("int i = 1; int j = (i >>= 2);",
+            binaryOperator(hasOperatorName(">>="))));
+  EXPECT_TRUE(
+    matches("int i = 42 ^ 23;", binaryOperator(hasOperatorName("^"))));
+  EXPECT_TRUE(
+    matches("int i = 42; int j = (i ^= 42);",
+            binaryOperator(hasOperatorName("^="))));
+  EXPECT_TRUE(
+    matches("int i = 42 % 23;", binaryOperator(hasOperatorName("%"))));
+  EXPECT_TRUE(
+    matches("int i = 42; int j = (i %= 42);",
+            binaryOperator(hasOperatorName("%="))));
+  EXPECT_TRUE(
+    matches("bool b = 42  &23;", binaryOperator(hasOperatorName("&"))));
+  EXPECT_TRUE(
+    matches("bool b = true && false;",
+            binaryOperator(hasOperatorName("&&"))));
+  EXPECT_TRUE(
+    matches("bool b = true; bool c = (b &= false);",
+            binaryOperator(hasOperatorName("&="))));
+  EXPECT_TRUE(
+    matches("bool b = 42 | 23;", binaryOperator(hasOperatorName("|"))));
+  EXPECT_TRUE(
+    matches("bool b = true || false;",
+            binaryOperator(hasOperatorName("||"))));
+  EXPECT_TRUE(
+    matches("bool b = true; bool c = (b |= false);",
+            binaryOperator(hasOperatorName("|="))));
+  EXPECT_TRUE(
+    matches("int i = 42  *23;", binaryOperator(hasOperatorName("*"))));
+  EXPECT_TRUE(
+    matches("int i = 42; int j = (i *= 23);",
+            binaryOperator(hasOperatorName("*="))));
+  EXPECT_TRUE(
+    matches("int i = 42 / 23;", binaryOperator(hasOperatorName("/"))));
+  EXPECT_TRUE(
+    matches("int i = 42; int j = (i /= 23);",
+            binaryOperator(hasOperatorName("/="))));
+  EXPECT_TRUE(
+    matches("int i = 42 + 23;", binaryOperator(hasOperatorName("+"))));
+  EXPECT_TRUE(
+    matches("int i = 42; int j = (i += 23);",
+            binaryOperator(hasOperatorName("+="))));
+  EXPECT_TRUE(
+    matches("int i = 42 - 23;", binaryOperator(hasOperatorName("-"))));
+  EXPECT_TRUE(
+    matches("int i = 42; int j = (i -= 23);",
+            binaryOperator(hasOperatorName("-="))));
+  EXPECT_TRUE(
+    matches("struct A { void x() { void (A::*a)(); (this->*a)(); } };",
+            binaryOperator(hasOperatorName("->*"))));
+  EXPECT_TRUE(
+    matches("struct A { void x() { void (A::*a)(); ((*this).*a)(); } };",
+            binaryOperator(hasOperatorName(".*"))));
+
+  // Member expressions as operators are not supported in matches.
+  EXPECT_TRUE(
+    notMatches("struct A { void x(A *a) { a->x(this); } };",
+               binaryOperator(hasOperatorName("->"))));
+
+  // Initializer assignments are not represented as operator equals.
+  EXPECT_TRUE(
+    notMatches("bool b = true;", binaryOperator(hasOperatorName("="))));
+
+  // Array indexing is not represented as a binary operator.
+  EXPECT_TRUE(notMatches("int a[42]; void x() { a[23]; }", binaryOperator()));
+
+  // Overloaded operators do not match at all.
+  EXPECT_TRUE(notMatches(
+    "struct A { bool operator&&(const A &a) const { return false; } };"
+      "void x() { A a, b; a && b; }",
+    binaryOperator()));
+}
+
+TEST(MatchUnaryOperator, HasOperatorName) {
+  StatementMatcher OperatorNot = unaryOperator(hasOperatorName("!"));
+
+  EXPECT_TRUE(matches("void x() { !true; } ", OperatorNot));
+  EXPECT_TRUE(notMatches("void x() { true; } ", OperatorNot));
+}
+
+TEST(MatchUnaryOperator, HasUnaryOperand) {
+  StatementMatcher OperatorOnFalse =
+    unaryOperator(hasUnaryOperand(cxxBoolLiteral(equals(false))));
+
+  EXPECT_TRUE(matches("void x() { !false; }", OperatorOnFalse));
+  EXPECT_TRUE(notMatches("void x() { !true; }", OperatorOnFalse));
+}
+
+TEST(Matcher, UnaryOperatorTypes) {
+  // Integration test that verifies the AST provides all unary operators in
+  // a way we expect.
+  EXPECT_TRUE(matches("bool b = !true;", unaryOperator(hasOperatorName("!"))));
+  EXPECT_TRUE(
+    matches("bool b; bool *p = &b;", unaryOperator(hasOperatorName("&"))));
+  EXPECT_TRUE(matches("int i = ~ 1;", unaryOperator(hasOperatorName("~"))));
+  EXPECT_TRUE(
+    matches("bool *p; bool b = *p;", unaryOperator(hasOperatorName("*"))));
+  EXPECT_TRUE(
+    matches("int i; int j = +i;", unaryOperator(hasOperatorName("+"))));
+  EXPECT_TRUE(
+    matches("int i; int j = -i;", unaryOperator(hasOperatorName("-"))));
+  EXPECT_TRUE(
+    matches("int i; int j = ++i;", unaryOperator(hasOperatorName("++"))));
+  EXPECT_TRUE(
+    matches("int i; int j = i++;", unaryOperator(hasOperatorName("++"))));
+  EXPECT_TRUE(
+    matches("int i; int j = --i;", unaryOperator(hasOperatorName("--"))));
+  EXPECT_TRUE(
+    matches("int i; int j = i--;", unaryOperator(hasOperatorName("--"))));
+
+  // We don't match conversion operators.
+  EXPECT_TRUE(notMatches("int i; double d = (double)i;", unaryOperator()));
+
+  // Function calls are not represented as operator.
+  EXPECT_TRUE(notMatches("void f(); void x() { f(); }", unaryOperator()));
+
+  // Overloaded operators do not match at all.
+  // FIXME: We probably want to add that.
+  EXPECT_TRUE(notMatches(
+    "struct A { bool operator!() const { return false; } };"
+      "void x() { A a; !a; }", unaryOperator(hasOperatorName("!"))));
+}
+
+TEST(ArraySubscriptMatchers, ArrayIndex) {
+  EXPECT_TRUE(matches(
+    "int i[2]; void f() { i[1] = 1; }",
+    arraySubscriptExpr(hasIndex(integerLiteral(equals(1))))));
+  EXPECT_TRUE(matches(
+    "int i[2]; void f() { 1[i] = 1; }",
+    arraySubscriptExpr(hasIndex(integerLiteral(equals(1))))));
+  EXPECT_TRUE(notMatches(
+    "int i[2]; void f() { i[1] = 1; }",
+    arraySubscriptExpr(hasIndex(integerLiteral(equals(0))))));
+}
+
+TEST(ArraySubscriptMatchers, MatchesArrayBase) {
+  EXPECT_TRUE(matches(
+    "int i[2]; void f() { i[1] = 2; }",
+    arraySubscriptExpr(hasBase(implicitCastExpr(
+      hasSourceExpression(declRefExpr()))))));
+}
+
+TEST(Matcher, OfClass) {
+  StatementMatcher Constructor = cxxConstructExpr(hasDeclaration(cxxMethodDecl(
+    ofClass(hasName("X")))));
+
+  EXPECT_TRUE(
+    matches("class X { public: X(); }; void x(int) { X x; }", Constructor));
+  EXPECT_TRUE(
+    matches("class X { public: X(); }; void x(int) { X x = X(); }",
+            Constructor));
+  EXPECT_TRUE(
+    notMatches("class Y { public: Y(); }; void x(int) { Y y; }",
+               Constructor));
+}
+
+TEST(Matcher, VisitsTemplateInstantiations) {
+  EXPECT_TRUE(matches(
+    "class A { public: void x(); };"
+      "template <typename T> class B { public: void y() { T t; t.x(); } };"
+      "void f() { B<A> b; b.y(); }",
+    callExpr(callee(cxxMethodDecl(hasName("x"))))));
+
+  EXPECT_TRUE(matches(
+    "class A { public: void x(); };"
+      "class C {"
+      " public:"
+      "  template <typename T> class B { public: void y() { T t; t.x(); } };"
+      "};"
+      "void f() {"
+      "  C::B<A> b; b.y();"
+      "}",
+    recordDecl(hasName("C"), hasDescendant(callExpr(
+      callee(cxxMethodDecl(hasName("x"))))))));
+}
+
+TEST(Matcher, HasCondition) {
+  StatementMatcher IfStmt =
+    ifStmt(hasCondition(cxxBoolLiteral(equals(true))));
+  EXPECT_TRUE(matches("void x() { if (true) {} }", IfStmt));
+  EXPECT_TRUE(notMatches("void x() { if (false) {} }", IfStmt));
+
+  StatementMatcher ForStmt =
+    forStmt(hasCondition(cxxBoolLiteral(equals(true))));
+  EXPECT_TRUE(matches("void x() { for (;true;) {} }", ForStmt));
+  EXPECT_TRUE(notMatches("void x() { for (;false;) {} }", ForStmt));
+
+  StatementMatcher WhileStmt =
+    whileStmt(hasCondition(cxxBoolLiteral(equals(true))));
+  EXPECT_TRUE(matches("void x() { while (true) {} }", WhileStmt));
+  EXPECT_TRUE(notMatches("void x() { while (false) {} }", WhileStmt));
+
+  StatementMatcher SwitchStmt =
+    switchStmt(hasCondition(integerLiteral(equals(42))));
+  EXPECT_TRUE(matches("void x() { switch (42) {case 42:;} }", SwitchStmt));
+  EXPECT_TRUE(notMatches("void x() { switch (43) {case 43:;} }", SwitchStmt));
+}
+
+TEST(For, ForLoopInternals) {
+  EXPECT_TRUE(matches("void f(){ int i; for (; i < 3 ; ); }",
+                      forStmt(hasCondition(anything()))));
+  EXPECT_TRUE(matches("void f() { for (int i = 0; ;); }",
+                      forStmt(hasLoopInit(anything()))));
+}
+
+TEST(For, ForRangeLoopInternals) {
+  EXPECT_TRUE(matches("void f(){ int a[] {1, 2}; for (int i : a); }",
+                      cxxForRangeStmt(hasLoopVariable(anything()))));
+  EXPECT_TRUE(matches(
+    "void f(){ int a[] {1, 2}; for (int i : a); }",
+    cxxForRangeStmt(hasRangeInit(declRefExpr(to(varDecl(hasName("a"))))))));
+}
+
+TEST(For, NegativeForLoopInternals) {
+  EXPECT_TRUE(notMatches("void f(){ for (int i = 0; ; ++i); }",
+                         forStmt(hasCondition(expr()))));
+  EXPECT_TRUE(notMatches("void f() {int i; for (; i < 4; ++i) {} }",
+                         forStmt(hasLoopInit(anything()))));
+}
+
+TEST(HasBody, FindsBodyOfForWhileDoLoops) {
+  EXPECT_TRUE(matches("void f() { for(;;) {} }",
+                      forStmt(hasBody(compoundStmt()))));
+  EXPECT_TRUE(notMatches("void f() { for(;;); }",
+                         forStmt(hasBody(compoundStmt()))));
+  EXPECT_TRUE(matches("void f() { while(true) {} }",
+                      whileStmt(hasBody(compoundStmt()))));
+  EXPECT_TRUE(matches("void f() { do {} while(true); }",
+                      doStmt(hasBody(compoundStmt()))));
+  EXPECT_TRUE(matches("void f() { int p[2]; for (auto x : p) {} }",
+                      cxxForRangeStmt(hasBody(compoundStmt()))));
+  EXPECT_TRUE(matches("void f() {}", functionDecl(hasBody(compoundStmt()))));
+  EXPECT_TRUE(notMatches("void f();", functionDecl(hasBody(compoundStmt()))));
+  EXPECT_TRUE(matches("void f(); void f() {}",
+                      functionDecl(hasBody(compoundStmt()))));
+}
+
+TEST(HasAnySubstatement, MatchesForTopLevelCompoundStatement) {
+  // The simplest case: every compound statement is in a function
+  // definition, and the function body itself must be a compound
+  // statement.
+  EXPECT_TRUE(matches("void f() { for (;;); }",
+                      compoundStmt(hasAnySubstatement(forStmt()))));
+}
+
+TEST(HasAnySubstatement, IsNotRecursive) {
+  // It's really "has any immediate substatement".
+  EXPECT_TRUE(notMatches("void f() { if (true) for (;;); }",
+                         compoundStmt(hasAnySubstatement(forStmt()))));
+}
+
+TEST(HasAnySubstatement, MatchesInNestedCompoundStatements) {
+  EXPECT_TRUE(matches("void f() { if (true) { for (;;); } }",
+                      compoundStmt(hasAnySubstatement(forStmt()))));
+}
+
+TEST(HasAnySubstatement, FindsSubstatementBetweenOthers) {
+  EXPECT_TRUE(matches("void f() { 1; 2; 3; for (;;); 4; 5; 6; }",
+                      compoundStmt(hasAnySubstatement(forStmt()))));
+}
+
+TEST(Member, MatchesMemberAllocationFunction) {
+  // Fails in C++11 mode
+  EXPECT_TRUE(matchesConditionally(
+    "namespace std { typedef typeof(sizeof(int)) size_t; }"
+      "class X { void *operator new(std::size_t); };",
+    cxxMethodDecl(ofClass(hasName("X"))), true, "-std=gnu++98"));
+
+  EXPECT_TRUE(matches("class X { void operator delete(void*); };",
+                      cxxMethodDecl(ofClass(hasName("X")))));
+
+  // Fails in C++11 mode
+  EXPECT_TRUE(matchesConditionally(
+    "namespace std { typedef typeof(sizeof(int)) size_t; }"
+      "class X { void operator delete[](void*, std::size_t); };",
+    cxxMethodDecl(ofClass(hasName("X"))), true, "-std=gnu++98"));
+}
+
+TEST(HasDestinationType, MatchesSimpleCase) {
+  EXPECT_TRUE(matches("char* p = static_cast<char*>(0);",
+                      cxxStaticCastExpr(hasDestinationType(
+                        pointsTo(TypeMatcher(anything()))))));
+}
+
+TEST(HasImplicitDestinationType, MatchesSimpleCase) {
+  // This test creates an implicit const cast.
+  EXPECT_TRUE(matches("int x; const int i = x;",
+                      implicitCastExpr(
+                        hasImplicitDestinationType(isInteger()))));
+  // This test creates an implicit array-to-pointer cast.
+  EXPECT_TRUE(matches("int arr[3]; int *p = arr;",
+                      implicitCastExpr(hasImplicitDestinationType(
+                        pointsTo(TypeMatcher(anything()))))));
+}
+
+TEST(HasImplicitDestinationType, DoesNotMatchIncorrectly) {
+  // This test creates an implicit cast from int to char.
+  EXPECT_TRUE(notMatches("char c = 0;",
+                         implicitCastExpr(hasImplicitDestinationType(
+                           unless(anything())))));
+  // This test creates an implicit array-to-pointer cast.
+  EXPECT_TRUE(notMatches("int arr[3]; int *p = arr;",
+                         implicitCastExpr(hasImplicitDestinationType(
+                           unless(anything())))));
+}
+
+TEST(IgnoringImplicit, MatchesImplicit) {
+  EXPECT_TRUE(matches("class C {}; C a = C();",
+                      varDecl(has(ignoringImplicit(cxxConstructExpr())))));
+}
+
+TEST(IgnoringImplicit, DoesNotMatchIncorrectly) {
+  EXPECT_TRUE(
+      notMatches("class C {}; C a = C();", varDecl(has(cxxConstructExpr()))));
+}
+
+TEST(IgnoringImpCasts, MatchesImpCasts) {
+  // This test checks that ignoringImpCasts matches when implicit casts are
+  // present and its inner matcher alone does not match.
+  // Note that this test creates an implicit const cast.
+  EXPECT_TRUE(matches("int x = 0; const int y = x;",
+                      varDecl(hasInitializer(ignoringImpCasts(
+                        declRefExpr(to(varDecl(hasName("x")))))))));
+  // This test creates an implicit cast from int to char.
+  EXPECT_TRUE(matches("char x = 0;",
+                      varDecl(hasInitializer(ignoringImpCasts(
+                        integerLiteral(equals(0)))))));
+}
+
+TEST(IgnoringImpCasts, DoesNotMatchIncorrectly) {
+  // These tests verify that ignoringImpCasts does not match if the inner
+  // matcher does not match.
+  // Note that the first test creates an implicit const cast.
+  EXPECT_TRUE(notMatches("int x; const int y = x;",
+                         varDecl(hasInitializer(ignoringImpCasts(
+                           unless(anything()))))));
+  EXPECT_TRUE(notMatches("int x; int y = x;",
+                         varDecl(hasInitializer(ignoringImpCasts(
+                           unless(anything()))))));
+
+  // These tests verify that ignoringImpCasts does not look through explicit
+  // casts or parentheses.
+  EXPECT_TRUE(notMatches("char* p = static_cast<char*>(0);",
+                         varDecl(hasInitializer(ignoringImpCasts(
+                           integerLiteral())))));
+  EXPECT_TRUE(notMatches("int i = (0);",
+                         varDecl(hasInitializer(ignoringImpCasts(
+                           integerLiteral())))));
+  EXPECT_TRUE(notMatches("float i = (float)0;",
+                         varDecl(hasInitializer(ignoringImpCasts(
+                           integerLiteral())))));
+  EXPECT_TRUE(notMatches("float i = float(0);",
+                         varDecl(hasInitializer(ignoringImpCasts(
+                           integerLiteral())))));
+}
+
+TEST(IgnoringImpCasts, MatchesWithoutImpCasts) {
+  // This test verifies that expressions that do not have implicit casts
+  // still match the inner matcher.
+  EXPECT_TRUE(matches("int x = 0; int &y = x;",
+                      varDecl(hasInitializer(ignoringImpCasts(
+                        declRefExpr(to(varDecl(hasName("x")))))))));
+}
+
+TEST(IgnoringParenCasts, MatchesParenCasts) {
+  // This test checks that ignoringParenCasts matches when parentheses and/or
+  // casts are present and its inner matcher alone does not match.
+  EXPECT_TRUE(matches("int x = (0);",
+                      varDecl(hasInitializer(ignoringParenCasts(
+                        integerLiteral(equals(0)))))));
+  EXPECT_TRUE(matches("int x = (((((0)))));",
+                      varDecl(hasInitializer(ignoringParenCasts(
+                        integerLiteral(equals(0)))))));
+
+  // This test creates an implicit cast from int to char in addition to the
+  // parentheses.
+  EXPECT_TRUE(matches("char x = (0);",
+                      varDecl(hasInitializer(ignoringParenCasts(
+                        integerLiteral(equals(0)))))));
+
+  EXPECT_TRUE(matches("char x = (char)0;",
+                      varDecl(hasInitializer(ignoringParenCasts(
+                        integerLiteral(equals(0)))))));
+  EXPECT_TRUE(matches("char* p = static_cast<char*>(0);",
+                      varDecl(hasInitializer(ignoringParenCasts(
+                        integerLiteral(equals(0)))))));
+}
+
+TEST(IgnoringParenCasts, MatchesWithoutParenCasts) {
+  // This test verifies that expressions that do not have any casts still match.
+  EXPECT_TRUE(matches("int x = 0;",
+                      varDecl(hasInitializer(ignoringParenCasts(
+                        integerLiteral(equals(0)))))));
+}
+
+TEST(IgnoringParenCasts, DoesNotMatchIncorrectly) {
+  // These tests verify that ignoringParenCasts does not match if the inner
+  // matcher does not match.
+  EXPECT_TRUE(notMatches("int x = ((0));",
+                         varDecl(hasInitializer(ignoringParenCasts(
+                           unless(anything()))))));
+
+  // This test creates an implicit cast from int to char in addition to the
+  // parentheses.
+  EXPECT_TRUE(notMatches("char x = ((0));",
+                         varDecl(hasInitializer(ignoringParenCasts(
+                           unless(anything()))))));
+
+  EXPECT_TRUE(notMatches("char *x = static_cast<char *>((0));",
+                         varDecl(hasInitializer(ignoringParenCasts(
+                           unless(anything()))))));
+}
+
+TEST(IgnoringParenAndImpCasts, MatchesParenImpCasts) {
+  // This test checks that ignoringParenAndImpCasts matches when
+  // parentheses and/or implicit casts are present and its inner matcher alone
+  // does not match.
+  // Note that this test creates an implicit const cast.
+  EXPECT_TRUE(matches("int x = 0; const int y = x;",
+                      varDecl(hasInitializer(ignoringParenImpCasts(
+                        declRefExpr(to(varDecl(hasName("x")))))))));
+  // This test creates an implicit cast from int to char.
+  EXPECT_TRUE(matches("const char x = (0);",
+                      varDecl(hasInitializer(ignoringParenImpCasts(
+                        integerLiteral(equals(0)))))));
+}
+
+TEST(IgnoringParenAndImpCasts, MatchesWithoutParenImpCasts) {
+  // This test verifies that expressions that do not have parentheses or
+  // implicit casts still match.
+  EXPECT_TRUE(matches("int x = 0; int &y = x;",
+                      varDecl(hasInitializer(ignoringParenImpCasts(
+                        declRefExpr(to(varDecl(hasName("x")))))))));
+  EXPECT_TRUE(matches("int x = 0;",
+                      varDecl(hasInitializer(ignoringParenImpCasts(
+                        integerLiteral(equals(0)))))));
+}
+
+TEST(IgnoringParenAndImpCasts, DoesNotMatchIncorrectly) {
+  // These tests verify that ignoringParenImpCasts does not match if
+  // the inner matcher does not match.
+  // This test creates an implicit cast.
+  EXPECT_TRUE(notMatches("char c = ((3));",
+                         varDecl(hasInitializer(ignoringParenImpCasts(
+                           unless(anything()))))));
+  // These tests verify that ignoringParenImpCasts does not look
+  // through explicit casts.
+  EXPECT_TRUE(notMatches("float y = (float(0));",
+                         varDecl(hasInitializer(ignoringParenImpCasts(
+                           integerLiteral())))));
+  EXPECT_TRUE(notMatches("float y = (float)0;",
+                         varDecl(hasInitializer(ignoringParenImpCasts(
+                           integerLiteral())))));
+  EXPECT_TRUE(notMatches("char* p = static_cast<char*>(0);",
+                         varDecl(hasInitializer(ignoringParenImpCasts(
+                           integerLiteral())))));
+}
+
+TEST(HasSourceExpression, MatchesImplicitCasts) {
+  EXPECT_TRUE(matches("class string {}; class URL { public: URL(string s); };"
+                        "void r() {string a_string; URL url = a_string; }",
+                      implicitCastExpr(
+                        hasSourceExpression(cxxConstructExpr()))));
+}
+
+TEST(HasSourceExpression, MatchesExplicitCasts) {
+  EXPECT_TRUE(matches("float x = static_cast<float>(42);",
+                      explicitCastExpr(
+                        hasSourceExpression(hasDescendant(
+                          expr(integerLiteral()))))));
+}
+
+TEST(UsingDeclaration, MatchesSpecificTarget) {
+  EXPECT_TRUE(matches("namespace f { int a; void b(); } using f::b;",
+                      usingDecl(hasAnyUsingShadowDecl(
+                        hasTargetDecl(functionDecl())))));
+  EXPECT_TRUE(notMatches("namespace f { int a; void b(); } using f::a;",
+                         usingDecl(hasAnyUsingShadowDecl(
+                           hasTargetDecl(functionDecl())))));
+}
+
+TEST(UsingDeclaration, ThroughUsingDeclaration) {
+  EXPECT_TRUE(matches(
+    "namespace a { void f(); } using a::f; void g() { f(); }",
+    declRefExpr(throughUsingDecl(anything()))));
+  EXPECT_TRUE(notMatches(
+    "namespace a { void f(); } using a::f; void g() { a::f(); }",
+    declRefExpr(throughUsingDecl(anything()))));
+}
+
+TEST(SingleDecl, IsSingleDecl) {
+  StatementMatcher SingleDeclStmt =
+    declStmt(hasSingleDecl(varDecl(hasInitializer(anything()))));
+  EXPECT_TRUE(matches("void f() {int a = 4;}", SingleDeclStmt));
+  EXPECT_TRUE(notMatches("void f() {int a;}", SingleDeclStmt));
+  EXPECT_TRUE(notMatches("void f() {int a = 4, b = 3;}",
+                         SingleDeclStmt));
+}
+
+TEST(DeclStmt, ContainsDeclaration) {
+  DeclarationMatcher MatchesInit = varDecl(hasInitializer(anything()));
+
+  EXPECT_TRUE(matches("void f() {int a = 4;}",
+                      declStmt(containsDeclaration(0, MatchesInit))));
+  EXPECT_TRUE(matches("void f() {int a = 4, b = 3;}",
+                      declStmt(containsDeclaration(0, MatchesInit),
+                               containsDeclaration(1, MatchesInit))));
+  unsigned WrongIndex = 42;
+  EXPECT_TRUE(notMatches("void f() {int a = 4, b = 3;}",
+                         declStmt(containsDeclaration(WrongIndex,
+                                                      MatchesInit))));
+}
+
+TEST(SwitchCase, MatchesEachCase) {
+  EXPECT_TRUE(notMatches("void x() { switch(42); }",
+                         switchStmt(forEachSwitchCase(caseStmt()))));
+  EXPECT_TRUE(matches("void x() { switch(42) case 42:; }",
+                      switchStmt(forEachSwitchCase(caseStmt()))));
+  EXPECT_TRUE(matches("void x() { switch(42) { case 42:; } }",
+                      switchStmt(forEachSwitchCase(caseStmt()))));
+  EXPECT_TRUE(notMatches(
+    "void x() { if (1) switch(42) { case 42: switch (42) { default:; } } }",
+    ifStmt(has(switchStmt(forEachSwitchCase(defaultStmt()))))));
+  EXPECT_TRUE(matches("void x() { switch(42) { case 1+1: case 4:; } }",
+                      switchStmt(forEachSwitchCase(
+                        caseStmt(hasCaseConstant(integerLiteral()))))));
+  EXPECT_TRUE(notMatches("void x() { switch(42) { case 1+1: case 2+2:; } }",
+                         switchStmt(forEachSwitchCase(
+                           caseStmt(hasCaseConstant(integerLiteral()))))));
+  EXPECT_TRUE(notMatches("void x() { switch(42) { case 1 ... 2:; } }",
+                         switchStmt(forEachSwitchCase(
+                           caseStmt(hasCaseConstant(integerLiteral()))))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void x() { switch (42) { case 1: case 2: case 3: default:; } }",
+    switchStmt(forEachSwitchCase(caseStmt().bind("x"))),
+    llvm::make_unique<VerifyIdIsBoundTo<CaseStmt>>("x", 3)));
+}
+
+TEST(ForEachConstructorInitializer, MatchesInitializers) {
+  EXPECT_TRUE(matches(
+    "struct X { X() : i(42), j(42) {} int i, j; };",
+    cxxConstructorDecl(forEachConstructorInitializer(cxxCtorInitializer()))));
+}
+
+TEST(HasConditionVariableStatement, DoesNotMatchCondition) {
+  EXPECT_TRUE(notMatches(
+    "void x() { if(true) {} }",
+    ifStmt(hasConditionVariableStatement(declStmt()))));
+  EXPECT_TRUE(notMatches(
+    "void x() { int x; if((x = 42)) {} }",
+    ifStmt(hasConditionVariableStatement(declStmt()))));
+}
+
+TEST(HasConditionVariableStatement, MatchesConditionVariables) {
+  EXPECT_TRUE(matches(
+    "void x() { if(int* a = 0) {} }",
+    ifStmt(hasConditionVariableStatement(declStmt()))));
+}
+
+TEST(ForEach, BindsOneNode) {
+  EXPECT_TRUE(matchAndVerifyResultTrue("class C { int x; };",
+                                       recordDecl(hasName("C"), forEach(fieldDecl(hasName("x")).bind("x"))),
+                                       llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("x", 1)));
+}
+
+TEST(ForEach, BindsMultipleNodes) {
+  EXPECT_TRUE(matchAndVerifyResultTrue("class C { int x; int y; int z; };",
+                                       recordDecl(hasName("C"), forEach(fieldDecl().bind("f"))),
+                                       llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("f", 3)));
+}
+
+TEST(ForEach, BindsRecursiveCombinations) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { class D { int x; int y; }; class E { int y; int z; }; };",
+    recordDecl(hasName("C"),
+               forEach(recordDecl(forEach(fieldDecl().bind("f"))))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("f", 4)));
+}
+
+TEST(ForEachDescendant, BindsOneNode) {
+  EXPECT_TRUE(matchAndVerifyResultTrue("class C { class D { int x; }; };",
+                                       recordDecl(hasName("C"),
+                                                  forEachDescendant(fieldDecl(hasName("x")).bind("x"))),
+                                       llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("x", 1)));
+}
+
+TEST(ForEachDescendant, NestedForEachDescendant) {
+  DeclarationMatcher m = recordDecl(
+    isDefinition(), decl().bind("x"), hasName("C"));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { class B { class C {}; }; };",
+    recordDecl(hasName("A"), anyOf(m, forEachDescendant(m))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", "C")));
+
+  // Check that a partial match of 'm' that binds 'x' in the
+  // first part of anyOf(m, anything()) will not overwrite the
+  // binding created earlier by hasDescendant(m).
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { class B { class C {}; }; };",
+    recordDecl(hasName("A"), allOf(hasDescendant(m), anyOf(m, anything()))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", "C")));
+}
+
+TEST(ForEachDescendant, BindsMultipleNodes) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { class D { int x; int y; }; "
+      "          class E { class F { int y; int z; }; }; };",
+    recordDecl(hasName("C"), forEachDescendant(fieldDecl().bind("f"))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("f", 4)));
+}
+
+TEST(ForEachDescendant, BindsRecursiveCombinations) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { class D { "
+      "          class E { class F { class G { int y; int z; }; }; }; }; };",
+    recordDecl(hasName("C"), forEachDescendant(recordDecl(
+      forEachDescendant(fieldDecl().bind("f"))))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("f", 8)));
+}
+
+TEST(ForEachDescendant, BindsCombinations) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f() { if(true) {} if (true) {} while (true) {} if (true) {} while "
+      "(true) {} }",
+    compoundStmt(forEachDescendant(ifStmt().bind("if")),
+                 forEachDescendant(whileStmt().bind("while"))),
+    llvm::make_unique<VerifyIdIsBoundTo<IfStmt>>("if", 6)));
+}
+
+TEST(Has, DoesNotDeleteBindings) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { int a; };", recordDecl(decl().bind("x"), has(fieldDecl())),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+}
+
+TEST(LoopingMatchers, DoNotOverwritePreviousMatchResultOnFailure) {
+  // These tests cover all the matchers where an inner matcher is called
+  // and there is not a 1:1 relationship between the match of the outer
+  // matcher and the match of the inner matcher.
+  // The simple 1:1 pattern, by contrast, looks like:
+  //   ... return InnerMatcher.matches(...); ...
+  // In which case no special handling is needed.
+  //
+  // On the other hand, if there are multiple alternative matches
+  // (for example forEach*) or matches might be discarded (for example has*)
+  // the implementation must make sure that the discarded matches do not
+  // affect the bindings.
+  // When new such matchers are added, add a test here that:
+  // - matches a simple node, and binds it as the first thing in the matcher:
+  //     recordDecl(decl().bind("x"), hasName("X")))
+  // - uses the matcher under test afterwards in a way that not the first
+  //   alternative is matched; for anyOf, that means the first branch
+  //   would need to return false; for hasAncestor, it means that not
+  //   the direct parent matches the inner matcher.
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { int y; };",
+    recordDecl(
+      recordDecl().bind("x"), hasName("::X"),
+      anyOf(forEachDescendant(recordDecl(hasName("Y"))), anything())),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X {};", recordDecl(recordDecl().bind("x"), hasName("::X"),
+                              anyOf(unless(anything()), anything())),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "template<typename T1, typename T2> class X {}; X<float, int> x;",
+    classTemplateSpecializationDecl(
+      decl().bind("x"),
+      hasAnyTemplateArgument(refersToType(asString("int")))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { void f(); void g(); };",
+    cxxRecordDecl(decl().bind("x"), hasMethod(hasName("g"))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { X() : a(1), b(2) {} double a; int b; };",
+    recordDecl(decl().bind("x"),
+               has(cxxConstructorDecl(
+                 hasAnyConstructorInitializer(forField(hasName("b")))))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void x(int, int) { x(0, 42); }",
+    callExpr(expr().bind("x"), hasAnyArgument(integerLiteral(equals(42)))),
+    llvm::make_unique<VerifyIdIsBoundTo<Expr>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void x(int, int y) {}",
+    functionDecl(decl().bind("x"), hasAnyParameter(hasName("y"))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void x() { return; if (true) {} }",
+    functionDecl(decl().bind("x"),
+                 has(compoundStmt(hasAnySubstatement(ifStmt())))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "namespace X { void b(int); void b(); }"
+      "using X::b;",
+    usingDecl(decl().bind("x"), hasAnyUsingShadowDecl(hasTargetDecl(
+      functionDecl(parameterCountIs(1))))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A{}; class B{}; class C : B, A {};",
+    cxxRecordDecl(decl().bind("x"), isDerivedFrom("::A")),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A{}; typedef A B; typedef A C; typedef A D;"
+      "class E : A {};",
+    cxxRecordDecl(decl().bind("x"), isDerivedFrom("C")),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { class B { void f() {} }; };",
+    functionDecl(decl().bind("x"), hasAncestor(recordDecl(hasName("::A")))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "template <typename T> struct A { struct B {"
+      "  void f() { if(true) {} }"
+      "}; };"
+      "void t() { A<int>::B b; b.f(); }",
+    ifStmt(stmt().bind("x"), hasAncestor(recordDecl(hasName("::A")))),
+    llvm::make_unique<VerifyIdIsBoundTo<Stmt>>("x", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A {};",
+    recordDecl(hasName("::A"), decl().bind("x"), unless(hasName("fooble"))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { A() : s(), i(42) {} const char *s; int i; };",
+    cxxConstructorDecl(hasName("::A::A"), decl().bind("x"),
+                       forEachConstructorInitializer(forField(hasName("i")))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("x", 1)));
+}
+
+TEST(ForEachDescendant, BindsCorrectNodes) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { void f(); int i; };",
+    recordDecl(hasName("C"), forEachDescendant(decl().bind("decl"))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("decl", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { void f() {} int i; };",
+    recordDecl(hasName("C"), forEachDescendant(decl().bind("decl"))),
+    llvm::make_unique<VerifyIdIsBoundTo<FunctionDecl>>("decl", 1)));
+}
+
+TEST(FindAll, BindsNodeOnMatch) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A {};",
+    recordDecl(hasName("::A"), findAll(recordDecl(hasName("::A")).bind("v"))),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("v", 1)));
+}
+
+TEST(FindAll, BindsDescendantNodeOnMatch) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { int a; int b; };",
+    recordDecl(hasName("::A"), findAll(fieldDecl().bind("v"))),
+    llvm::make_unique<VerifyIdIsBoundTo<FieldDecl>>("v", 2)));
+}
+
+TEST(FindAll, BindsNodeAndDescendantNodesOnOneMatch) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { int a; int b; };",
+    recordDecl(hasName("::A"),
+               findAll(decl(anyOf(recordDecl(hasName("::A")).bind("v"),
+                                  fieldDecl().bind("v"))))),
+    llvm::make_unique<VerifyIdIsBoundTo<Decl>>("v", 3)));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class A { class B {}; class C {}; };",
+    recordDecl(hasName("::A"), findAll(recordDecl(isDefinition()).bind("v"))),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("v", 3)));
+}
+
+TEST(HasAncenstor, MatchesDeclarationAncestors) {
+  EXPECT_TRUE(matches(
+    "class A { class B { class C {}; }; };",
+    recordDecl(hasName("C"), hasAncestor(recordDecl(hasName("A"))))));
+}
+
+TEST(HasAncenstor, FailsIfNoAncestorMatches) {
+  EXPECT_TRUE(notMatches(
+    "class A { class B { class C {}; }; };",
+    recordDecl(hasName("C"), hasAncestor(recordDecl(hasName("X"))))));
+}
+
+TEST(HasAncestor, MatchesDeclarationsThatGetVisitedLater) {
+  EXPECT_TRUE(matches(
+    "class A { class B { void f() { C c; } class C {}; }; };",
+    varDecl(hasName("c"), hasType(recordDecl(hasName("C"),
+                                             hasAncestor(recordDecl(hasName("A"))))))));
+}
+
+TEST(HasAncenstor, MatchesStatementAncestors) {
+  EXPECT_TRUE(matches(
+    "void f() { if (true) { while (false) { 42; } } }",
+    integerLiteral(equals(42), hasAncestor(ifStmt()))));
+}
+
+TEST(HasAncestor, DrillsThroughDifferentHierarchies) {
+  EXPECT_TRUE(matches(
+    "void f() { if (true) { int x = 42; } }",
+    integerLiteral(equals(42), hasAncestor(functionDecl(hasName("f"))))));
+}
+
+TEST(HasAncestor, BindsRecursiveCombinations) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { class D { class E { class F { int y; }; }; }; };",
+    fieldDecl(hasAncestor(recordDecl(hasAncestor(recordDecl().bind("r"))))),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("r", 1)));
+}
+
+TEST(HasAncestor, BindsCombinationsWithHasDescendant) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class C { class D { class E { class F { int y; }; }; }; };",
+    fieldDecl(hasAncestor(
+      decl(
+        hasDescendant(recordDecl(isDefinition(),
+                                 hasAncestor(recordDecl())))
+      ).bind("d")
+    )),
+    llvm::make_unique<VerifyIdIsBoundTo<CXXRecordDecl>>("d", "E")));
+}
+
+TEST(HasAncestor, MatchesClosestAncestor) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "template <typename T> struct C {"
+      "  void f(int) {"
+      "    struct I { void g(T) { int x; } } i; i.g(42);"
+      "  }"
+      "};"
+      "template struct C<int>;",
+    varDecl(hasName("x"),
+            hasAncestor(functionDecl(hasParameter(
+              0, varDecl(hasType(asString("int"))))).bind("f"))).bind("v"),
+    llvm::make_unique<VerifyIdIsBoundTo<FunctionDecl>>("f", "g", 2)));
+}
+
+TEST(HasAncestor, MatchesInTemplateInstantiations) {
+  EXPECT_TRUE(matches(
+    "template <typename T> struct A { struct B { struct C { T t; }; }; }; "
+      "A<int>::B::C a;",
+    fieldDecl(hasType(asString("int")),
+              hasAncestor(recordDecl(hasName("A"))))));
+}
+
+TEST(HasAncestor, MatchesInImplicitCode) {
+  EXPECT_TRUE(matches(
+    "struct X {}; struct A { A() {} X x; };",
+    cxxConstructorDecl(
+      hasAnyConstructorInitializer(withInitializer(expr(
+        hasAncestor(recordDecl(hasName("A")))))))));
+}
+
+TEST(HasParent, MatchesOnlyParent) {
+  EXPECT_TRUE(matches(
+    "void f() { if (true) { int x = 42; } }",
+    compoundStmt(hasParent(ifStmt()))));
+  EXPECT_TRUE(notMatches(
+    "void f() { for (;;) { int x = 42; } }",
+    compoundStmt(hasParent(ifStmt()))));
+  EXPECT_TRUE(notMatches(
+    "void f() { if (true) for (;;) { int x = 42; } }",
+    compoundStmt(hasParent(ifStmt()))));
+}
+
+TEST(HasAncestor, MatchesAllAncestors) {
+  EXPECT_TRUE(matches(
+    "template <typename T> struct C { static void f() { 42; } };"
+      "void t() { C<int>::f(); }",
+    integerLiteral(
+      equals(42),
+      allOf(
+        hasAncestor(cxxRecordDecl(isTemplateInstantiation())),
+        hasAncestor(cxxRecordDecl(unless(isTemplateInstantiation())))))));
+}
+
+TEST(HasAncestor, ImplicitArrayCopyCtorDeclRefExpr) {
+  EXPECT_TRUE(matches("struct MyClass {\n"
+                        "  int c[1];\n"
+                        "  static MyClass Create() { return MyClass(); }\n"
+                        "};",
+                      declRefExpr(to(decl(hasAncestor(decl()))))));
+}
+
+TEST(HasAncestor, AnonymousUnionMemberExpr) {
+  EXPECT_TRUE(matches("int F() {\n"
+                        "  union { int i; };\n"
+                        "  return i;\n"
+                        "}\n",
+                      memberExpr(member(hasAncestor(decl())))));
+  EXPECT_TRUE(matches("void f() {\n"
+                        "  struct {\n"
+                        "    struct { int a; int b; };\n"
+                        "  } s;\n"
+                        "  s.a = 4;\n"
+                        "}\n",
+                      memberExpr(member(hasAncestor(decl())))));
+  EXPECT_TRUE(matches("void f() {\n"
+                        "  struct {\n"
+                        "    struct { int a; int b; };\n"
+                        "  } s;\n"
+                        "  s.a = 4;\n"
+                        "}\n",
+                      declRefExpr(to(decl(hasAncestor(decl()))))));
+}
+TEST(HasAncestor, NonParmDependentTemplateParmVarDeclRefExpr) {
+  EXPECT_TRUE(matches("struct PartitionAllocator {\n"
+                        "  template<typename T>\n"
+                        "  static int quantizedSize(int count) {\n"
+                        "    return count;\n"
+                        "  }\n"
+                        "  void f() { quantizedSize<int>(10); }\n"
+                        "};",
+                      declRefExpr(to(decl(hasAncestor(decl()))))));
+}
+
+TEST(HasAncestor, AddressOfExplicitSpecializationFunction) {
+  EXPECT_TRUE(matches("template <class T> void f();\n"
+                        "template <> void f<int>();\n"
+                        "void (*get_f())() { return f<int>; }\n",
+                      declRefExpr(to(decl(hasAncestor(decl()))))));
+}
+
+TEST(HasParent, MatchesAllParents) {
+  EXPECT_TRUE(matches(
+    "template <typename T> struct C { static void f() { 42; } };"
+      "void t() { C<int>::f(); }",
+    integerLiteral(
+      equals(42),
+      hasParent(compoundStmt(hasParent(functionDecl(
+        hasParent(cxxRecordDecl(isTemplateInstantiation())))))))));
+  EXPECT_TRUE(
+    matches("template <typename T> struct C { static void f() { 42; } };"
+              "void t() { C<int>::f(); }",
+            integerLiteral(
+              equals(42),
+              hasParent(compoundStmt(hasParent(functionDecl(hasParent(
+                cxxRecordDecl(unless(isTemplateInstantiation()))))))))));
+  EXPECT_TRUE(matches(
+    "template <typename T> struct C { static void f() { 42; } };"
+      "void t() { C<int>::f(); }",
+    integerLiteral(equals(42),
+                   hasParent(compoundStmt(
+                     allOf(hasParent(functionDecl(hasParent(
+                       cxxRecordDecl(isTemplateInstantiation())))),
+                           hasParent(functionDecl(hasParent(cxxRecordDecl(
+                             unless(isTemplateInstantiation())))))))))));
+  EXPECT_TRUE(
+    notMatches("template <typename T> struct C { static void f() {} };"
+                 "void t() { C<int>::f(); }",
+               compoundStmt(hasParent(recordDecl()))));
+}
+
+TEST(HasParent, NoDuplicateParents) {
+  class HasDuplicateParents : public BoundNodesCallback {
+  public:
+    bool run(const BoundNodes *Nodes) override { return false; }
+    bool run(const BoundNodes *Nodes, ASTContext *Context) override {
+      const Stmt *Node = Nodes->getNodeAs<Stmt>("node");
+      std::set<const void *> Parents;
+      for (const auto &Parent : Context->getParents(*Node)) {
+        if (!Parents.insert(Parent.getMemoizationData()).second) {
+          return true;
+        }
+      }
+      return false;
+    }
+  };
+  EXPECT_FALSE(matchAndVerifyResultTrue(
+    "template <typename T> int Foo() { return 1 + 2; }\n"
+      "int x = Foo<int>() + Foo<unsigned>();",
+    stmt().bind("node"), llvm::make_unique<HasDuplicateParents>()));
+}
+
+TEST(TypeMatching, PointeeTypes) {
+  EXPECT_TRUE(matches("int b; int &a = b;",
+                      referenceType(pointee(builtinType()))));
+  EXPECT_TRUE(matches("int *a;", pointerType(pointee(builtinType()))));
+
+  EXPECT_TRUE(matches("int *a;",
+                      loc(pointerType(pointee(builtinType())))));
+
+  EXPECT_TRUE(matches(
+    "int const *A;",
+    pointerType(pointee(isConstQualified(), builtinType()))));
+  EXPECT_TRUE(notMatches(
+    "int *A;",
+    pointerType(pointee(isConstQualified(), builtinType()))));
+}
+
+TEST(ElaboratedTypeNarrowing, hasQualifier) {
+  EXPECT_TRUE(matches(
+    "namespace N {"
+      "  namespace M {"
+      "    class D {};"
+      "  }"
+      "}"
+      "N::M::D d;",
+    elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N")))))));
+  EXPECT_TRUE(notMatches(
+    "namespace M {"
+      "  class D {};"
+      "}"
+      "M::D d;",
+    elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N")))))));
+  EXPECT_TRUE(notMatches(
+    "struct D {"
+      "} d;",
+    elaboratedType(hasQualifier(nestedNameSpecifier()))));
+}
+
+TEST(ElaboratedTypeNarrowing, namesType) {
+  EXPECT_TRUE(matches(
+    "namespace N {"
+      "  namespace M {"
+      "    class D {};"
+      "  }"
+      "}"
+      "N::M::D d;",
+    elaboratedType(elaboratedType(namesType(recordType(
+      hasDeclaration(namedDecl(hasName("D")))))))));
+  EXPECT_TRUE(notMatches(
+    "namespace M {"
+      "  class D {};"
+      "}"
+      "M::D d;",
+    elaboratedType(elaboratedType(namesType(typedefType())))));
+}
+
+TEST(NNS, BindsNestedNameSpecifiers) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "namespace ns { struct E { struct B {}; }; } ns::E::B b;",
+    nestedNameSpecifier(specifiesType(asString("struct ns::E"))).bind("nns"),
+    llvm::make_unique<VerifyIdIsBoundTo<NestedNameSpecifier>>(
+      "nns", "ns::struct E::")));
+}
+
+TEST(NNS, BindsNestedNameSpecifierLocs) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "namespace ns { struct B {}; } ns::B b;",
+    loc(nestedNameSpecifier()).bind("loc"),
+    llvm::make_unique<VerifyIdIsBoundTo<NestedNameSpecifierLoc>>("loc", 1)));
+}
+
+TEST(NNS, DescendantsOfNestedNameSpecifiers) {
+  std::string Fragment =
+    "namespace a { struct A { struct B { struct C {}; }; }; };"
+      "void f() { a::A::B::C c; }";
+  EXPECT_TRUE(matches(
+    Fragment,
+    nestedNameSpecifier(specifiesType(asString("struct a::A::B")),
+                        hasDescendant(nestedNameSpecifier(
+                          specifiesNamespace(hasName("a")))))));
+  EXPECT_TRUE(notMatches(
+    Fragment,
+    nestedNameSpecifier(specifiesType(asString("struct a::A::B")),
+                        has(nestedNameSpecifier(
+                          specifiesNamespace(hasName("a")))))));
+  EXPECT_TRUE(matches(
+    Fragment,
+    nestedNameSpecifier(specifiesType(asString("struct a::A")),
+                        has(nestedNameSpecifier(
+                          specifiesNamespace(hasName("a")))))));
+
+  // Not really useful because a NestedNameSpecifier can have at most one
+  // child, but included to complete the interface.
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    Fragment,
+    nestedNameSpecifier(specifiesType(asString("struct a::A::B")),
+                        forEach(nestedNameSpecifier().bind("x"))),
+    llvm::make_unique<VerifyIdIsBoundTo<NestedNameSpecifier>>("x", 1)));
+}
+
+TEST(NNS, NestedNameSpecifiersAsDescendants) {
+  std::string Fragment =
+    "namespace a { struct A { struct B { struct C {}; }; }; };"
+      "void f() { a::A::B::C c; }";
+  EXPECT_TRUE(matches(
+    Fragment,
+    decl(hasDescendant(nestedNameSpecifier(specifiesType(
+      asString("struct a::A")))))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    Fragment,
+    functionDecl(hasName("f"),
+                 forEachDescendant(nestedNameSpecifier().bind("x"))),
+    // Nested names: a, a::A and a::A::B.
+    llvm::make_unique<VerifyIdIsBoundTo<NestedNameSpecifier>>("x", 3)));
+}
+
+TEST(NNSLoc, DescendantsOfNestedNameSpecifierLocs) {
+  std::string Fragment =
+    "namespace a { struct A { struct B { struct C {}; }; }; };"
+      "void f() { a::A::B::C c; }";
+  EXPECT_TRUE(matches(
+    Fragment,
+    nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A::B"))),
+                           hasDescendant(loc(nestedNameSpecifier(
+                             specifiesNamespace(hasName("a"))))))));
+  EXPECT_TRUE(notMatches(
+    Fragment,
+    nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A::B"))),
+                           has(loc(nestedNameSpecifier(
+                             specifiesNamespace(hasName("a"))))))));
+  EXPECT_TRUE(matches(
+    Fragment,
+    nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A"))),
+                           has(loc(nestedNameSpecifier(
+                             specifiesNamespace(hasName("a"))))))));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    Fragment,
+    nestedNameSpecifierLoc(loc(specifiesType(asString("struct a::A::B"))),
+                           forEach(nestedNameSpecifierLoc().bind("x"))),
+    llvm::make_unique<VerifyIdIsBoundTo<NestedNameSpecifierLoc>>("x", 1)));
+}
+
+TEST(NNSLoc, NestedNameSpecifierLocsAsDescendants) {
+  std::string Fragment =
+    "namespace a { struct A { struct B { struct C {}; }; }; };"
+      "void f() { a::A::B::C c; }";
+  EXPECT_TRUE(matches(
+    Fragment,
+    decl(hasDescendant(loc(nestedNameSpecifier(specifiesType(
+      asString("struct a::A"))))))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    Fragment,
+    functionDecl(hasName("f"),
+                 forEachDescendant(nestedNameSpecifierLoc().bind("x"))),
+    // Nested names: a, a::A and a::A::B.
+    llvm::make_unique<VerifyIdIsBoundTo<NestedNameSpecifierLoc>>("x", 3)));
+}
+template <typename T> class VerifyMatchOnNode : public BoundNodesCallback {
+public:
+  VerifyMatchOnNode(StringRef Id, const internal::Matcher<T> &InnerMatcher,
+                    StringRef InnerId)
+    : Id(Id), InnerMatcher(InnerMatcher), InnerId(InnerId) {
+  }
+  // Without an ASTContext the inner matcher cannot be run; always fails.
+  bool run(const BoundNodes *Nodes) override { return false; }
+  // Runs InnerMatcher on the node bound to Id; fails if Id is not bound.
+  bool run(const BoundNodes *Nodes, ASTContext *Context) override {
+    const T *Node = Nodes->getNodeAs<T>(Id);
+    return Node && selectFirst<T>(InnerId, match(InnerMatcher, *Node,
+                                                 *Context)) != nullptr;
+  }
+private:
+  std::string Id;
+  internal::Matcher<T> InnerMatcher;
+  std::string InnerId;
+};
+
+TEST(MatchFinder, CanMatchDeclarationsRecursively) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
+    llvm::make_unique<VerifyMatchOnNode<Decl>>(
+      "X", decl(hasDescendant(recordDecl(hasName("X::Y")).bind("Y"))),
+      "Y")));
+  EXPECT_TRUE(matchAndVerifyResultFalse(
+    "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
+    llvm::make_unique<VerifyMatchOnNode<Decl>>(
+      "X", decl(hasDescendant(recordDecl(hasName("X::Z")).bind("Z"))),
+      "Z")));
+}
+
+TEST(MatchFinder, CanMatchStatementsRecursively) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "void f() { if (1) { for (;;) { } } }", ifStmt().bind("if"),
+    llvm::make_unique<VerifyMatchOnNode<Stmt>>(
+      "if", stmt(hasDescendant(forStmt().bind("for"))), "for")));
+  EXPECT_TRUE(matchAndVerifyResultFalse(
+    "void f() { if (1) { for (;;) { } } }", ifStmt().bind("if"),
+    llvm::make_unique<VerifyMatchOnNode<Stmt>>(
+      "if", stmt(hasDescendant(declStmt().bind("decl"))), "decl")));
+}
+
+TEST(MatchFinder, CanMatchSingleNodesRecursively) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+    "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
+    llvm::make_unique<VerifyMatchOnNode<Decl>>(
+      "X", recordDecl(has(recordDecl(hasName("X::Y")).bind("Y"))), "Y")));
+  EXPECT_TRUE(matchAndVerifyResultFalse(
+    "class X { class Y {}; };", recordDecl(hasName("::X")).bind("X"),
+    llvm::make_unique<VerifyMatchOnNode<Decl>>(
+      "X", recordDecl(has(recordDecl(hasName("X::Z")).bind("Z"))), "Z")));
+}
+
+TEST(StatementMatcher, HasReturnValue) {
+  StatementMatcher RetVal = returnStmt(hasReturnValue(binaryOperator()));
+  EXPECT_TRUE(matches("int F() { int a, b; return a + b; }", RetVal));
+  EXPECT_FALSE(matches("int F() { int a; return a; }", RetVal));
+  EXPECT_FALSE(matches("void F() { return; }", RetVal));
+}
+
+TEST(StatementMatcher, ForFunction) {
+  const auto CppString1 =
+    "struct PosVec {"
+      "  PosVec& operator=(const PosVec&) {"
+      "    auto x = [] { return 1; };"
+      "    return *this;"
+      "  }"
+      "};";
+  const auto CppString2 =
+    "void F() {"
+      "  struct S {"
+      "    void F2() {"
+      "       return;"
+      "    }"
+      "  };"
+      "}";
+  EXPECT_TRUE(
+    matches(
+      CppString1,
+      returnStmt(forFunction(hasName("operator=")),
+                 has(unaryOperator(hasOperatorName("*"))))));
+  EXPECT_TRUE(
+    notMatches(
+      CppString1,
+      returnStmt(forFunction(hasName("operator=")),
+                 has(integerLiteral()))));
+  EXPECT_TRUE(
+    matches(
+      CppString1,
+      returnStmt(forFunction(hasName("operator()")),
+                 has(integerLiteral()))));
+  EXPECT_TRUE(matches(CppString2, returnStmt(forFunction(hasName("F2")))));
+  EXPECT_TRUE(notMatches(CppString2, returnStmt(forFunction(hasName("F")))));
+}
+
+TEST(Matcher, ForEachOverriden) {
+  const auto ForEachOverriddenInClass = [](const char *ClassName) {
+    return cxxMethodDecl(ofClass(hasName(ClassName)), isVirtual(),
+                         forEachOverridden(cxxMethodDecl().bind("overridden")))
+        .bind("override");
+  };
+  static const char Code1[] = "class A { virtual void f(); };"
+                              "class B : public A { void f(); };"
+                              "class C : public B { void f(); };";
+  // C::f overrides A::f.
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      Code1, ForEachOverriddenInClass("C"),
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMethodDecl>>("override", "f", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      Code1, ForEachOverriddenInClass("C"),
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMethodDecl>>("overridden", "f",
+                                                          1)));
+  // B::f overrides A::f.
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      Code1, ForEachOverriddenInClass("B"),
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMethodDecl>>("override", "f", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      Code1, ForEachOverriddenInClass("B"),
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMethodDecl>>("overridden", "f",
+                                                          1)));
+  // A::f overrides nothing.
+  EXPECT_TRUE(notMatches(Code1, ForEachOverriddenInClass("A")));
+
+  static const char Code2[] =
+      "class A1 { virtual void f(); };"
+      "class A2 { virtual void f(); };"
+      "class B : public A1, public A2 { void f(); };";
+  // B::f overrides A1::f and A2::f. This produces two matches.
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      Code2, ForEachOverriddenInClass("B"),
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMethodDecl>>("override", "f", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      Code2, ForEachOverriddenInClass("B"),
+      llvm::make_unique<VerifyIdIsBoundTo<CXXMethodDecl>>("overridden", "f",
+                                                          2)));
+  // A1::f overrides nothing.
+  EXPECT_TRUE(notMatches(Code2, ForEachOverriddenInClass("A1")));
+}
+
+TEST(Matcher, HasAnyDeclaration) {
+  std::string Fragment = "void foo(int p1);"
+                         "void foo(int *p2);"
+                         "void bar(int p3);"
+                         "template <typename T> void baz(T t) { foo(t); }";
+
+  EXPECT_TRUE(
+      matches(Fragment, unresolvedLookupExpr(hasAnyDeclaration(functionDecl(
+                            hasParameter(0, parmVarDecl(hasName("p1"))))))));
+  EXPECT_TRUE(
+      matches(Fragment, unresolvedLookupExpr(hasAnyDeclaration(functionDecl(
+                            hasParameter(0, parmVarDecl(hasName("p2"))))))));
+  EXPECT_TRUE(
+      notMatches(Fragment, unresolvedLookupExpr(hasAnyDeclaration(functionDecl(
+                               hasParameter(0, parmVarDecl(hasName("p3"))))))));
+  EXPECT_TRUE(notMatches(Fragment, unresolvedLookupExpr(hasAnyDeclaration(
+                                       functionDecl(hasName("bar"))))));
+}
+
+} // namespace ast_matchers
+} // namespace clang
diff --git a/unittests/ASTMatchers/CMakeLists.txt b/unittests/ASTMatchers/CMakeLists.txt
index 3ace9fe..5633031 100644
--- a/unittests/ASTMatchers/CMakeLists.txt
+++ b/unittests/ASTMatchers/CMakeLists.txt
@@ -2,8 +2,20 @@
   Support
   )
 
+# By default MSVC has a 2^16 limit on the number of sections in an object file,
+# and each ASTMatchers test source below needs more than that (hence /bigobj).
+if (MSVC)
+  set_source_files_properties(ASTMatchersInternalTest.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+  set_source_files_properties(ASTMatchersNodeTest.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+  set_source_files_properties(ASTMatchersNarrowingTest.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+  set_source_files_properties(ASTMatchersTraversalTest.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+endif()
+
 add_clang_unittest(ASTMatchersTests
-  ASTMatchersTest.cpp)
+  ASTMatchersInternalTest.cpp
+  ASTMatchersNodeTest.cpp
+  ASTMatchersNarrowingTest.cpp
+  ASTMatchersTraversalTest.cpp)
 
 target_link_libraries(ASTMatchersTests
   clangAST
diff --git a/unittests/ASTMatchers/Dynamic/Makefile b/unittests/ASTMatchers/Dynamic/Makefile
deleted file mode 100644
index d0c4852..0000000
--- a/unittests/ASTMatchers/Dynamic/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- unittests/ASTMatchers/Dynamic/Makefile --------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../../..
-
-TESTNAME = DynamicASTMatchers
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangTooling.a clangFrontend.a clangSerialization.a clangDriver.a \
-           clangRewrite.a clangRewriteFrontend.a clangParse.a clangSema.a \
-           clangDynamicASTMatchers.a \
-           clangAnalysis.a clangEdit.a clangAST.a clangASTMatchers.a \
-           clangLex.a \
-           clangBasic.a \
-           clangAPINotes.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/ASTMatchers/Dynamic/ParserTest.cpp b/unittests/ASTMatchers/Dynamic/ParserTest.cpp
index 9c5e9e6..d241f5b 100644
--- a/unittests/ASTMatchers/Dynamic/ParserTest.cpp
+++ b/unittests/ASTMatchers/Dynamic/ParserTest.cpp
@@ -11,7 +11,6 @@
 #include "clang/ASTMatchers/Dynamic/Parser.h"
 #include "clang/ASTMatchers/Dynamic/Registry.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringMap.h"
 #include "gtest/gtest.h"
 #include <string>
 #include <vector>
@@ -263,7 +262,7 @@
             "1:1: Matcher does not support binding.",
             ParseWithError("isArrow().bind(\"foo\")"));
   EXPECT_EQ("Input value has unresolved overloaded type: "
-            "Matcher<DoStmt|ForStmt|WhileStmt|CXXForRangeStmt>",
+            "Matcher<DoStmt|ForStmt|WhileStmt|CXXForRangeStmt|FunctionDecl>",
             ParseMatcherWithError("hasBody(stmt())"));
 }
 
diff --git a/unittests/ASTMatchers/Dynamic/RegistryTest.cpp b/unittests/ASTMatchers/Dynamic/RegistryTest.cpp
index a597fbd..6bbbc2b 100644
--- a/unittests/ASTMatchers/Dynamic/RegistryTest.cpp
+++ b/unittests/ASTMatchers/Dynamic/RegistryTest.cpp
@@ -421,7 +421,7 @@
                        constructMatcher("parameterCountIs", 3), Error.get())
           .isNull());
   EXPECT_EQ("Incorrect type for arg 2. (Expected = Matcher<CXXRecordDecl>) != "
-            "(Actual = Matcher<FunctionDecl>)",
+            "(Actual = Matcher<FunctionDecl|FunctionProtoType>)",
             Error->toString());
 
   // Bad argument type with variadic.
@@ -505,6 +505,12 @@
   EXPECT_FALSE(matches("struct X {};", Value));
 }
 
+TEST_F(RegistryTest, ParenExpr) {
+  Matcher<Stmt> Value = constructMatcher("parenExpr").getTypedMatcher<Stmt>();
+  EXPECT_TRUE(matches("int i = (1);", Value));
+  EXPECT_FALSE(matches("int i = 1;", Value));
+}
+
 } // end anonymous namespace
 } // end namespace dynamic
 } // end namespace ast_matchers
diff --git a/unittests/ASTMatchers/Dynamic/VariantValueTest.cpp b/unittests/ASTMatchers/Dynamic/VariantValueTest.cpp
index 268463d..9df7b78 100644
--- a/unittests/ASTMatchers/Dynamic/VariantValueTest.cpp
+++ b/unittests/ASTMatchers/Dynamic/VariantValueTest.cpp
@@ -16,9 +16,6 @@
 namespace dynamic {
 namespace {
 
-using ast_matchers::internal::DynTypedMatcher;
-using ast_matchers::internal::Matcher;
-
 TEST(VariantValueTest, Unsigned) {
   const unsigned kUnsigned = 17;
   VariantValue Value = kUnsigned;
diff --git a/unittests/ASTMatchers/Makefile b/unittests/ASTMatchers/Makefile
deleted file mode 100644
index accbca0..0000000
--- a/unittests/ASTMatchers/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- unittests/ASTMatchers/Makefile ----------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-
-PARALLEL_DIRS = Dynamic
-
-TESTNAME = ASTMatchers
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangTooling.a clangFrontend.a clangSerialization.a clangDriver.a \
-           clangRewrite.a clangRewriteFrontend.a \
-           clangParse.a clangSema.a clangAnalysis.a \
-           clangEdit.a clangAST.a clangASTMatchers.a clangLex.a \
-           clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Analysis/CFGTest.cpp b/unittests/Analysis/CFGTest.cpp
new file mode 100644
index 0000000..a8d397e
--- /dev/null
+++ b/unittests/Analysis/CFGTest.cpp
@@ -0,0 +1,58 @@
+//===- unittests/Analysis/CFGTest.cpp - CFG tests -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Analysis/CFG.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace analysis {
+namespace {
+
+// Constructing a CFG for a range-based for over a dependent type fails (but
+// should not crash).
+TEST(CFG, RangeBasedForOverDependentType) {
+  const char *Code = "class Foo;\n"
+                     "template <typename T>\n"
+                     "void f(const T &Range) {\n"
+                     "  for (const Foo *TheFoo : Range) {\n"
+                     "  }\n"
+                     "}\n";
+
+  class CFGCallback : public ast_matchers::MatchFinder::MatchCallback {
+  public:
+    bool SawFunctionBody = false;
+
+    void run(const ast_matchers::MatchFinder::MatchResult &Result) override {
+      const auto *Func = Result.Nodes.getNodeAs<FunctionDecl>("func");
+      Stmt *Body = Func->getBody();
+      if (!Body)
+        return;
+      SawFunctionBody = true;
+      std::unique_ptr<CFG> cfg =
+          CFG::buildCFG(nullptr, Body, Result.Context, CFG::BuildOptions());
+      EXPECT_EQ(nullptr, cfg);
+    }
+  } Callback;
+
+  ast_matchers::MatchFinder Finder;
+  Finder.addMatcher(ast_matchers::functionDecl().bind("func"), &Callback);
+  std::unique_ptr<tooling::FrontendActionFactory> Factory(
+      tooling::newFrontendActionFactory(&Finder));
+  std::vector<std::string> Args = {"-std=c++11", "-fno-delayed-template-parsing"};
+  ASSERT_TRUE(tooling::runToolOnCodeWithArgs(Factory->create(), Code, Args));
+  EXPECT_TRUE(Callback.SawFunctionBody);
+}
+
+} // namespace
+} // namespace analysis
+} // namespace clang
diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
new file mode 100644
index 0000000..926f586
--- /dev/null
+++ b/unittests/Analysis/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_clang_unittest(CFGTests
+  CFGTest.cpp
+  )
+
+target_link_libraries(CFGTests
+  clangAnalysis
+  clangAST
+  clangASTMatchers
+  clangBasic
+  clangFrontend
+  clangTooling
+  )
diff --git a/unittests/Basic/Makefile b/unittests/Basic/Makefile
deleted file mode 100644
index 82de790..0000000
--- a/unittests/Basic/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- unittests/Basic/Makefile ----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Basic
-LINK_COMPONENTS := support mc
-USEDLIBS = clangLex.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Basic/SourceManagerTest.cpp b/unittests/Basic/SourceManagerTest.cpp
index 5a1a393..f418761 100644
--- a/unittests/Basic/SourceManagerTest.cpp
+++ b/unittests/Basic/SourceManagerTest.cpp
@@ -23,7 +23,6 @@
 #include "llvm/Config/llvm-config.h"
 #include "gtest/gtest.h"
 
-using namespace llvm;
 using namespace clang;
 
 namespace {
@@ -73,7 +72,8 @@
   const char *source =
     "#define M(x) [x]\n"
     "M(foo)";
-  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(source);
+  std::unique_ptr<llvm::MemoryBuffer> Buf =
+      llvm::MemoryBuffer::getMemBuffer(source);
   FileID mainFileID = SourceMgr.createFileID(std::move(Buf));
   SourceMgr.setMainFileID(mainFileID);
 
@@ -126,7 +126,8 @@
     "int x;\n"
     "int y;";
 
-  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Source);
+  std::unique_ptr<llvm::MemoryBuffer> Buf =
+      llvm::MemoryBuffer::getMemBuffer(Source);
   FileID MainFileID = SourceMgr.createFileID(std::move(Buf));
   SourceMgr.setMainFileID(MainFileID);
 
@@ -185,8 +186,10 @@
     "#define CONCAT(X, Y) X##Y\n"
     "CONCAT(1,1)\n";
 
-  std::unique_ptr<MemoryBuffer> HeaderBuf = MemoryBuffer::getMemBuffer(header);
-  std::unique_ptr<MemoryBuffer> MainBuf = MemoryBuffer::getMemBuffer(main);
+  std::unique_ptr<llvm::MemoryBuffer> HeaderBuf =
+      llvm::MemoryBuffer::getMemBuffer(header);
+  std::unique_ptr<llvm::MemoryBuffer> MainBuf =
+      llvm::MemoryBuffer::getMemBuffer(main);
   FileID mainFileID = SourceMgr.createFileID(std::move(MainBuf));
   SourceMgr.setMainFileID(mainFileID);
 
@@ -284,8 +287,10 @@
     "#define INC2 </test-header.h>\n"
     "#include M(INC2)\n";
 
-  std::unique_ptr<MemoryBuffer> HeaderBuf = MemoryBuffer::getMemBuffer(header);
-  std::unique_ptr<MemoryBuffer> MainBuf = MemoryBuffer::getMemBuffer(main);
+  std::unique_ptr<llvm::MemoryBuffer> HeaderBuf =
+      llvm::MemoryBuffer::getMemBuffer(header);
+  std::unique_ptr<llvm::MemoryBuffer> MainBuf =
+      llvm::MemoryBuffer::getMemBuffer(main);
   SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(MainBuf)));
 
   const FileEntry *headerFile = FileMgr.getVirtualFile("/test-header.h",
diff --git a/unittests/Basic/VirtualFileSystemTest.cpp b/unittests/Basic/VirtualFileSystemTest.cpp
index c0f2d0d..547cba1 100644
--- a/unittests/Basic/VirtualFileSystemTest.cpp
+++ b/unittests/Basic/VirtualFileSystemTest.cpp
@@ -12,7 +12,6 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 #include <map>
@@ -31,7 +30,7 @@
             bool IsVolatile) override {
     llvm_unreachable("unimplemented");
   }
-  virtual std::error_code close() override { return std::error_code(); }
+  std::error_code close() override { return std::error_code(); }
 };
 
 class DummyFileSystem : public vfs::FileSystem {
@@ -350,7 +349,6 @@
   ASSERT_FALSE(EC);
   ASSERT_NE(vfs::recursive_directory_iterator(), I);
 
-
   std::vector<std::string> Contents;
   for (auto E = vfs::recursive_directory_iterator(); !EC && I != E;
        I.increment(EC)) {
@@ -373,16 +371,23 @@
 }
 
 template <typename DirIter>
-static void checkContents(DirIter I, ArrayRef<StringRef> Expected) {
+static void checkContents(DirIter I, ArrayRef<StringRef> ExpectedOut) {
   std::error_code EC;
-  auto ExpectedIter = Expected.begin(), ExpectedEnd = Expected.end();
-  for (DirIter E;
-       !EC && I != E && ExpectedIter != ExpectedEnd;
-       I.increment(EC), ++ExpectedIter)
-    EXPECT_EQ(*ExpectedIter, I->getName());
+  SmallVector<StringRef, 4> Expected(ExpectedOut.begin(), ExpectedOut.end());
+  SmallVector<std::string, 4> InputToCheck;
 
-  EXPECT_EQ(ExpectedEnd, ExpectedIter);
-  EXPECT_EQ(DirIter(), I);
+  // Do not rely on iteration order to check for contents, sort both
+  // content vectors before comparison.
+  for (DirIter E; !EC && I != E; I.increment(EC))
+    InputToCheck.push_back(I->getName());
+
+  std::sort(InputToCheck.begin(), InputToCheck.end());
+  std::sort(Expected.begin(), Expected.end());
+  EXPECT_EQ(InputToCheck.size(), Expected.size());
+
+  unsigned LastElt = std::min(InputToCheck.size(), Expected.size());
+  for (unsigned Idx = 0; Idx != LastElt; ++Idx)
+    EXPECT_EQ(StringRef(InputToCheck[Idx]), Expected[Idx]);
 }
 
 TEST(VirtualFileSystemTest, OverlayIteration) {
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index b85ec7e..7d407ce 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -13,6 +13,7 @@
 add_subdirectory(Lex)
 add_subdirectory(Driver)
 if(CLANG_ENABLE_STATIC_ANALYZER)
+  add_subdirectory(Analysis)
   add_subdirectory(StaticAnalyzer)
   add_subdirectory(Frontend)
 endif()
diff --git a/unittests/CodeGen/BufferSourceTest.cpp b/unittests/CodeGen/BufferSourceTest.cpp
index 85df768..1934e66 100644
--- a/unittests/CodeGen/BufferSourceTest.cpp
+++ b/unittests/CodeGen/BufferSourceTest.cpp
@@ -39,6 +39,7 @@
     "EmitCXXGlobalInitFunc test;    ";
 
 TEST(BufferSourceTest, EmitCXXGlobalInitFunc) {
+    LLVMContext Context;
     CompilerInstance compiler;
 
     compiler.createDiagnostics();
@@ -65,7 +66,7 @@
             compiler.getHeaderSearchOpts(),
             compiler.getPreprocessorOpts(),
             compiler.getCodeGenOpts(),
-            llvm::getGlobalContext())));
+            Context)));
 
     compiler.createSema(clang::TU_Prefix, nullptr);
 
diff --git a/unittests/CodeGen/Makefile b/unittests/CodeGen/Makefile
deleted file mode 100644
index 01bdf76..0000000
--- a/unittests/CodeGen/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- unittests/CodeGen/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = CodeGen
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader mc option \
-                   profiledata support
-USEDLIBS = clangCodeGen.a clangFrontend.a clangSerialization.a \
-           clangDriver.a \
-           clangParse.a clangSema.a clangAnalysis.a \
-           clangEdit.a clangAST.a clangLex.a clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Driver/Makefile b/unittests/Driver/Makefile
deleted file mode 100644
index 21d19f3..0000000
--- a/unittests/Driver/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- unittests/Driver/Makefile ---------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Multilib
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) support option
-USEDLIBS = clangDriver.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Driver/ToolChainTest.cpp b/unittests/Driver/ToolChainTest.cpp
index ef21e2d..f7ba3ee 100644
--- a/unittests/Driver/ToolChainTest.cpp
+++ b/unittests/Driver/ToolChainTest.cpp
@@ -117,4 +117,29 @@
             S);
 }
 
+TEST(ToolChainTest, DefaultDriverMode) {
+  IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
+
+  IntrusiveRefCntPtr<DiagnosticIDs> DiagID(new DiagnosticIDs());
+  struct TestDiagnosticConsumer : public DiagnosticConsumer {};
+  DiagnosticsEngine Diags(DiagID, &*DiagOpts, new TestDiagnosticConsumer);
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
+      new vfs::InMemoryFileSystem);
+
+  Driver CCDriver("/home/test/bin/clang", "arm-linux-gnueabi", Diags,
+                  InMemoryFileSystem);
+  Driver CXXDriver("/home/test/bin/clang++", "arm-linux-gnueabi", Diags,
+                   InMemoryFileSystem);
+  Driver CLDriver("/home/test/bin/clang-cl", "arm-linux-gnueabi", Diags,
+                  InMemoryFileSystem);
+
+  std::unique_ptr<Compilation> CC(CCDriver.BuildCompilation({"foo.cpp"}));
+  std::unique_ptr<Compilation> CXX(CXXDriver.BuildCompilation({"foo.cpp"}));
+  std::unique_ptr<Compilation> CL(CLDriver.BuildCompilation({"foo.cpp"}));
+
+  EXPECT_TRUE(CCDriver.CCCIsCC());
+  EXPECT_TRUE(CXXDriver.CCCIsCXX());
+  EXPECT_TRUE(CLDriver.IsCLMode());
+}
+
 } // end anonymous namespace
diff --git a/unittests/Format/CMakeLists.txt b/unittests/Format/CMakeLists.txt
index 01af435..240be6e 100644
--- a/unittests/Format/CMakeLists.txt
+++ b/unittests/Format/CMakeLists.txt
@@ -3,15 +3,20 @@
   )
 
 add_clang_unittest(FormatTests
+  CleanupTest.cpp
   FormatTest.cpp
   FormatTestJava.cpp
   FormatTestJS.cpp
   FormatTestProto.cpp
   FormatTestSelective.cpp
+  SortImportsTestJS.cpp
   SortIncludesTest.cpp
   )
 
 target_link_libraries(FormatTests
+  clangBasic
   clangFormat
+  clangFrontend
+  clangRewrite
   clangToolingCore
   )
diff --git a/unittests/Format/CleanupTest.cpp b/unittests/Format/CleanupTest.cpp
new file mode 100644
index 0000000..cb1c742
--- /dev/null
+++ b/unittests/Format/CleanupTest.cpp
@@ -0,0 +1,750 @@
+//===- unittest/Format/CleanupTest.cpp - Code cleanup unit tests ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Format/Format.h"
+
+#include "../Tooling/ReplacementTest.h"
+#include "../Tooling/RewriterTestContext.h"
+#include "clang/Tooling/Core/Replacement.h"
+
+#include "gtest/gtest.h"
+
+using clang::tooling::ReplacementTest;
+using clang::tooling::toReplacements;
+
+namespace clang {
+namespace format {
+namespace {
+
+class CleanupTest : public ::testing::Test {
+protected:
+  // Returns Code with the replacements from format::cleanup(Ranges) applied.
+  std::string cleanup(llvm::StringRef Code,
+                      const std::vector<tooling::Range> &Ranges,
+                      const FormatStyle &Style = getLLVMStyle()) {
+    auto Result =
+        applyAllReplacements(Code, format::cleanup(Style, Code, Ranges));
+    EXPECT_TRUE(static_cast<bool>(Result));
+    return *Result;
+  }
+};
+
+TEST_F(CleanupTest, DeleteEmptyNamespaces) {
+  std::string Code = "namespace A {\n"
+                     "namespace B {\n"
+                     "} // namespace B\n"
+                     "} // namespace A\n\n"
+                     "namespace C {\n"
+                     "namespace D { int i; }\n"
+                     "inline namespace E { namespace { } }\n"
+                     "}";
+  std::string Expected = "\n\n\n\n\nnamespace C {\n"
+                         "namespace D { int i; }\n   \n"
+                         "}";
+  std::vector<tooling::Range> Ranges;
+  Ranges.push_back(tooling::Range(28, 0));
+  Ranges.push_back(tooling::Range(91, 6));
+  Ranges.push_back(tooling::Range(132, 0));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, NamespaceWithSyntaxError) {
+  std::string Code = "namespace A {\n"
+                     "namespace B {\n" // missing r_brace
+                     "} // namespace A\n\n"
+                     "namespace C {\n"
+                     "namespace D int i; }\n"
+                     "inline namespace E { namespace { } }\n"
+                     "}";
+  std::string Expected = "namespace A {\n"
+                         "\n\n\nnamespace C {\n"
+                         "namespace D int i; }\n   \n"
+                         "}";
+  std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, EmptyNamespaceNotAffected) {
+  std::string Code = "namespace A {\n\n"
+                     "namespace {\n\n}}";
+  // Even though the namespaces are empty, the innermost empty namespace
+  // block is not affected by the changed ranges, so nothing is removed.
+  std::string Expected = "namespace A {\n\n"
+                         "namespace {\n\n}}";
+  // Set the changed range to be the second "\n".
+  std::vector<tooling::Range> Ranges(1, tooling::Range(14, 0));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, EmptyNamespaceWithCommentsNoBreakBeforeBrace) {
+  std::string Code = "namespace A {\n"
+                     "namespace B {\n"
+                     "// Yo\n"
+                     "} // namespace B\n"
+                     "} // namespace A\n"
+                     "namespace C { // Yo\n"
+                     "}";
+  std::string Expected = "\n\n\n\n\n\n";
+  std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, EmptyNamespaceWithCommentsBreakBeforeBrace) {
+  std::string Code = "namespace A\n"
+                     "/* Yo */ {\n"
+                     "namespace B\n"
+                     "{\n"
+                     "// Yo\n"
+                     "} // namespace B\n"
+                     "} // namespace A\n"
+                     "namespace C\n"
+                     "{ // Yo\n"
+                     "}\n";
+  std::string Expected = "\n\n\n\n\n\n\n\n\n\n";
+  std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
+  FormatStyle Style = getLLVMStyle();
+  Style.BraceWrapping.AfterNamespace = true;
+  std::string Result = cleanup(Code, Ranges, Style);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, CtorInitializationSimpleRedundantComma) {
+  std::string Code = "class A {\nA() : , {} };";
+  std::string Expected = "class A {\nA()  {} };";
+  std::vector<tooling::Range> Ranges;
+  Ranges.push_back(tooling::Range(17, 0));
+  Ranges.push_back(tooling::Range(19, 0));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  Code = "class A {\nA() : x(1), {} };";
+  Expected = "class A {\nA() : x(1) {} };";
+  Ranges.clear();
+  Ranges.push_back(tooling::Range(23, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  Code = "class A {\nA() :,,,,{} };";
+  Expected = "class A {\nA() {} };";
+  Ranges.clear();
+  Ranges.push_back(tooling::Range(15, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, ListSimpleRedundantComma) {
+  std::string Code = "void f() { std::vector<int> v = {1,2,,,3,{4,5}}; }";
+  std::string Expected = "void f() { std::vector<int> v = {1,2,3,{4,5}}; }";
+  std::vector<tooling::Range> Ranges;
+  Ranges.push_back(tooling::Range(40, 0));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  Code = "int main() { f(1,,2,3,,4);}";
+  Expected = "int main() { f(1,2,3,4);}";
+  Ranges.clear();
+  Ranges.push_back(tooling::Range(17, 0));
+  Ranges.push_back(tooling::Range(22, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, CtorInitializationBracesInParens) {
+  std::string Code = "class A {\nA() : x({1}),, {} };";
+  std::string Expected = "class A {\nA() : x({1}) {} };";
+  std::vector<tooling::Range> Ranges;
+  Ranges.push_back(tooling::Range(24, 0));
+  Ranges.push_back(tooling::Range(26, 0));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, RedundantCommaNotInAffectedRanges) {
+  std::string Code =
+      "class A {\nA() : x({1}), /* comment */, { int x = 0; } };";
+  std::string Expected =
+      "class A {\nA() : x({1}), /* comment */, { int x = 0; } };";
+  // Set the affected range to be "int x = 0", which does not intersect the
+  // constructor initialization list, so the redundant comma is left in place.
+  std::vector<tooling::Range> Ranges(1, tooling::Range(42, 9));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  Code = "class A {\nA() : x(1), {} };";
+  Expected = "class A {\nA() : x(1), {} };";
+  // No range. Fixer should do nothing.
+  Ranges.clear();
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+// FIXME: delete comments too.
+TEST_F(CleanupTest, CtorInitializationCommentAroundCommas) {
+  // Remove redundant commas around comment.
+  std::string Code = "class A {\nA() : x({1}), /* comment */, {} };";
+  std::string Expected = "class A {\nA() : x({1}) /* comment */ {} };";
+  std::vector<tooling::Range> Ranges;
+  Ranges.push_back(tooling::Range(25, 0));
+  Ranges.push_back(tooling::Range(40, 0));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  // Remove trailing comma and ignore comment.
+  Code = "class A {\nA() : x({1}), // comment\n{} };";
+  Expected = "class A {\nA() : x({1}) // comment\n{} };";
+  Ranges = std::vector<tooling::Range>(1, tooling::Range(25, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  // Remove trailing comma and ignore comment.
+  Code = "class A {\nA() : x({1}), // comment\n , y(1),{} };";
+  Expected = "class A {\nA() : x({1}), // comment\n  y(1){} };";
+  Ranges = std::vector<tooling::Range>(1, tooling::Range(38, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  // Remove trailing comma and ignore comment.
+  Code = "class A {\nA() : x({1}), \n/* comment */, y(1),{} };";
+  Expected = "class A {\nA() : x({1}), \n/* comment */ y(1){} };";
+  Ranges = std::vector<tooling::Range>(1, tooling::Range(40, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+
+  // Remove trailing comma and ignore comment.
+  Code = "class A {\nA() : , // comment\n y(1),{} };";
+  Expected = "class A {\nA() :  // comment\n y(1){} };";
+  Ranges = std::vector<tooling::Range>(1, tooling::Range(17, 0));
+  Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST_F(CleanupTest, CtorInitializerInNamespace) {
+  std::string Code = "namespace A {\n"
+                     "namespace B {\n" // missing r_brace
+                     "} // namespace A\n\n"
+                     "namespace C {\n"
+                     "class A { A() : x(0),, {} };\n"
+                     "inline namespace E { namespace { } }\n"
+                     "}";
+  std::string Expected = "namespace A {\n"
+                         "\n\n\nnamespace C {\n"
+                         "class A { A() : x(0) {} };\n   \n"
+                         "}";
+  std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
+  std::string Result = cleanup(Code, Ranges);
+  EXPECT_EQ(Expected, Result);
+}
+
+class CleanUpReplacementsTest : public ReplacementTest {
+protected:
+  tooling::Replacement createReplacement(unsigned Offset, unsigned Length,
+                                         StringRef Text) {
+    return tooling::Replacement(FileName, Offset, Length, Text);
+  }
+
+  tooling::Replacement createInsertion(StringRef HeaderName) {
+    return createReplacement(UINT_MAX, 0, HeaderName);
+  }
+
+  // Cleans up around Replaces, applies them to Code and returns the result.
+  std::string apply(StringRef Code, const tooling::Replacements &Replaces) {
+    auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style);
+    EXPECT_TRUE(static_cast<bool>(CleanReplaces))
+        << llvm::toString(CleanReplaces.takeError()) << "\n";
+    auto Result = applyAllReplacements(Code, *CleanReplaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    return *Result;
+  }
+
+  // Like apply(), but also runs formatReplacements before applying.
+  std::string formatAndApply(StringRef Code,
+                             const tooling::Replacements &Replaces) {
+    auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style);
+    EXPECT_TRUE(static_cast<bool>(CleanReplaces))
+        << llvm::toString(CleanReplaces.takeError()) << "\n";
+    auto FormattedReplaces = formatReplacements(Code, *CleanReplaces, Style);
+    EXPECT_TRUE(static_cast<bool>(FormattedReplaces))
+        << llvm::toString(FormattedReplaces.takeError()) << "\n";
+    auto Result = applyAllReplacements(Code, *FormattedReplaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    return *Result;
+  }
+
+  int getOffset(StringRef Code, int Line, int Column) {
+    RewriterTestContext Context;
+    FileID ID = Context.createInMemoryFile(FileName, Code);
+    auto DecomposedLocation =
+        Context.Sources.getDecomposedLoc(Context.getLocation(ID, Line, Column));
+    return DecomposedLocation.second;
+  }
+
+  const std::string FileName = "fix.cpp";
+  FormatStyle Style = getLLVMStyle();
+};
+
+TEST_F(CleanUpReplacementsTest, FixOnlyAffectedCodeAfterReplacements) {
+  std::string Code = "namespace A {\n"
+                     "namespace B {\n"
+                     "  int x;\n"
+                     "} // namespace B\n"
+                     "} // namespace A\n"
+                     "\n"
+                     "namespace C {\n"
+                     "namespace D { int i; }\n"
+                     "inline namespace E { namespace { int y; } }\n"
+                     "int x=     0;"
+                     "}";
+  std::string Expected = "\n\nnamespace C {\n"
+                         "namespace D { int i; }\n\n"
+                         "int x=     0;"
+                         "}";
+  tooling::Replacements Replaces =
+      toReplacements({createReplacement(getOffset(Code, 3, 3), 6, ""),
+                      createReplacement(getOffset(Code, 9, 34), 6, "")});
+
+  EXPECT_EQ(Expected, formatAndApply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, NoExistingIncludeWithoutDefine) {
+  std::string Code = "int main() {}";
+  std::string Expected = "#include \"a.h\"\n"
+                         "int main() {}";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"a.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, NoExistingIncludeWithDefine) {
+  std::string Code = "#ifndef A_H\n"
+                     "#define A_H\n"
+                     "class A {};\n"
+                     "#define MMM 123\n"
+                     "#endif";
+  std::string Expected = "#ifndef A_H\n"
+                         "#define A_H\n"
+                         "#include \"b.h\"\n"
+                         "class A {};\n"
+                         "#define MMM 123\n"
+                         "#endif";
+
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"b.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertBeforeCategoryWithLowerPriority) {
+  std::string Code = "#ifndef A_H\n"
+                     "#define A_H\n"
+                     "\n"
+                     "\n"
+                     "\n"
+                     "#include <vector>\n"
+                     "class A {};\n"
+                     "#define MMM 123\n"
+                     "#endif";
+  std::string Expected = "#ifndef A_H\n"
+                         "#define A_H\n"
+                         "\n"
+                         "\n"
+                         "\n"
+                         "#include \"a.h\"\n"
+                         "#include <vector>\n"
+                         "class A {};\n"
+                         "#define MMM 123\n"
+                         "#endif";
+
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"a.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertAfterMainHeader) {
+  std::string Code = "#include \"fix.h\"\n"
+                     "\n"
+                     "int main() {}";
+  std::string Expected = "#include \"fix.h\"\n"
+                         "#include <a>\n"
+                         "\n"
+                         "int main() {}";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <a>")});
+  Style = format::getGoogleStyle(format::FormatStyle::LanguageKind::LK_Cpp);
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertBeforeSystemHeaderLLVM) {
+  std::string Code = "#include <memory>\n"
+                     "\n"
+                     "int main() {}";
+  std::string Expected = "#include \"z.h\"\n"
+                         "#include <memory>\n"
+                         "\n"
+                         "int main() {}";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"z.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertAfterSystemHeaderGoogle) {
+  std::string Code = "#include <memory>\n"
+                     "\n"
+                     "int main() {}";
+  std::string Expected = "#include <memory>\n"
+                         "#include \"z.h\"\n"
+                         "\n"
+                         "int main() {}";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"z.h\"")});
+  Style = format::getGoogleStyle(format::FormatStyle::LanguageKind::LK_Cpp);
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertOneIncludeLLVMStyle) {
+  std::string Code = "#include \"x/fix.h\"\n"
+                     "#include \"a.h\"\n"
+                     "#include \"b.h\"\n"
+                     "#include \"clang/Format/Format.h\"\n"
+                     "#include <memory>\n";
+  std::string Expected = "#include \"x/fix.h\"\n"
+                         "#include \"a.h\"\n"
+                         "#include \"b.h\"\n"
+                         "#include \"d.h\"\n"
+                         "#include \"clang/Format/Format.h\"\n"
+                         "#include \"llvm/x/y.h\"\n"
+                         "#include <memory>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"d.h\""),
+                      createInsertion("#include \"llvm/x/y.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertMultipleIncludesLLVMStyle) {
+  std::string Code = "#include \"x/fix.h\"\n"
+                     "#include \"a.h\"\n"
+                     "#include \"b.h\"\n"
+                     "#include \"clang/Format/Format.h\"\n"
+                     "#include <memory>\n";
+  std::string Expected = "#include \"x/fix.h\"\n"
+                         "#include \"a.h\"\n"
+                         "#include \"b.h\"\n"
+                         "#include \"new/new.h\"\n"
+                         "#include \"clang/Format/Format.h\"\n"
+                         "#include <memory>\n"
+                         "#include <list>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <list>"),
+                      createInsertion("#include \"new/new.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertNewSystemIncludeGoogleStyle) {
+  std::string Code = "#include \"x/fix.h\"\n"
+                     "\n"
+                     "#include \"y/a.h\"\n"
+                     "#include \"z/b.h\"\n";
+  // FIXME: inserting after the empty line following the main header might be
+  // preferred; for now <vector> is placed directly below the main header.
+  std::string Expected = "#include \"x/fix.h\"\n"
+                         "#include <vector>\n"
+                         "\n"
+                         "#include \"y/a.h\"\n"
+                         "#include \"z/b.h\"\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  Style = format::getGoogleStyle(format::FormatStyle::LanguageKind::LK_Cpp);
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertMultipleIncludesGoogleStyle) {
+  std::string Code = "#include \"x/fix.h\"\n"
+                     "\n"
+                     "#include <vector>\n"
+                     "\n"
+                     "#include \"y/a.h\"\n"
+                     "#include \"z/b.h\"\n";
+  std::string Expected = "#include \"x/fix.h\"\n"
+                         "\n"
+                         "#include <vector>\n"
+                         "#include <list>\n"
+                         "\n"
+                         "#include \"y/a.h\"\n"
+                         "#include \"z/b.h\"\n"
+                         "#include \"x/x.h\"\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <list>"),
+                      createInsertion("#include \"x/x.h\"")});
+  Style = format::getGoogleStyle(format::FormatStyle::LanguageKind::LK_Cpp);
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertMultipleNewHeadersAndSortLLVM) {
+  std::string Code = "\nint x;";
+  std::string Expected = "\n#include \"fix.h\"\n"
+                         "#include \"a.h\"\n"
+                         "#include \"b.h\"\n"
+                         "#include \"c.h\"\n"
+                         "#include <list>\n"
+                         "#include <vector>\n"
+                         "int x;";
+  tooling::Replacements Replaces = toReplacements(
+      {createInsertion("#include \"a.h\""), createInsertion("#include \"c.h\""),
+       createInsertion("#include \"b.h\""),
+       createInsertion("#include <vector>"), createInsertion("#include <list>"),
+       createInsertion("#include \"fix.h\"")});
+  EXPECT_EQ(Expected, formatAndApply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, InsertMultipleNewHeadersAndSortGoogle) {
+  std::string Code = "\nint x;";
+  std::string Expected = "\n#include \"fix.h\"\n"
+                         "#include <list>\n"
+                         "#include <vector>\n"
+                         "#include \"a.h\"\n"
+                         "#include \"b.h\"\n"
+                         "#include \"c.h\"\n"
+                         "int x;";
+  tooling::Replacements Replaces = toReplacements(
+      {createInsertion("#include \"a.h\""), createInsertion("#include \"c.h\""),
+       createInsertion("#include \"b.h\""),
+       createInsertion("#include <vector>"), createInsertion("#include <list>"),
+       createInsertion("#include \"fix.h\"")});
+  Style = format::getGoogleStyle(format::FormatStyle::LanguageKind::LK_Cpp);
+  EXPECT_EQ(Expected, formatAndApply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, FormatCorrectLineWhenHeadersAreInserted) {
+  std::string Code = "\n"
+                     "int x;\n"
+                     "int    a;\n"
+                     "int    a;\n"
+                     "int    a;";
+
+  std::string Expected = "\n#include \"x.h\"\n"
+                         "#include \"y.h\"\n"
+                         "#include \"clang/x/x.h\"\n"
+                         "#include <list>\n"
+                         "#include <vector>\n"
+                         "int x;\n"
+                         "int    a;\n"
+                         "int b;\n"
+                         "int    a;";
+  tooling::Replacements Replaces = toReplacements(
+      {createReplacement(getOffset(Code, 4, 8), 1, "b"),
+       createInsertion("#include <vector>"), createInsertion("#include <list>"),
+       createInsertion("#include \"clang/x/x.h\""),
+       createInsertion("#include \"y.h\""),
+       createInsertion("#include \"x.h\"")});
+  EXPECT_EQ(Expected, formatAndApply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, NotConfusedByDefine) {
+  std::string Code = "void f() {}\n"
+                     "#define A \\\n"
+                     "  int i;";
+  std::string Expected = "#include <vector>\n"
+                         "void f() {}\n"
+                         "#define A \\\n"
+                         "  int i;";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, formatAndApply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, SkippedTopComment) {
+  std::string Code = "// comment\n"
+                     "\n"
+                     "   // comment\n";
+  std::string Expected = "// comment\n"
+                         "\n"
+                         "   // comment\n"
+                         "#include <vector>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, SkippedMixedComments) {
+  std::string Code = "// comment\n"
+                     "// comment \\\n"
+                     " comment continued\n"
+                     "/*\n"
+                     "* comment\n"
+                     "*/\n";
+  std::string Expected = "// comment\n"
+                         "// comment \\\n"
+                         " comment continued\n"
+                         "/*\n"
+                         "* comment\n"
+                         "*/\n"
+                         "#include <vector>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, MultipleBlockCommentsInOneLine) {
+  std::string Code = "/*\n"
+                     "* comment\n"
+                     "*/ /* comment\n"
+                     "*/\n"
+                     "\n\n"
+                     "/* c1 */ /*c2 */\n";
+  std::string Expected = "/*\n"
+                         "* comment\n"
+                         "*/ /* comment\n"
+                         "*/\n"
+                         "\n\n"
+                         "/* c1 */ /*c2 */\n"
+                         "#include <vector>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, CodeAfterComments) {
+  std::string Code = "/*\n"
+                     "* comment\n"
+                     "*/ /* comment\n"
+                     "*/\n"
+                     "\n\n"
+                     "/* c1 */ /*c2 */\n"
+                     "\n"
+                     "int x;\n";
+  std::string Expected = "/*\n"
+                         "* comment\n"
+                         "*/ /* comment\n"
+                         "*/\n"
+                         "\n\n"
+                         "/* c1 */ /*c2 */\n"
+                         "\n"
+                         "#include <vector>\n"
+                         "int x;\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, FakeHeaderGuardIfDef) {
+  std::string Code = "// comment \n"
+                     "#ifdef X\n"
+                     "#define X\n";
+  std::string Expected = "// comment \n"
+                         "#include <vector>\n"
+                         "#ifdef X\n"
+                         "#define X\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, RealHeaderGuardAfterComments) {
+  std::string Code = "// comment \n"
+                     "#ifndef X\n"
+                     "#define X\n"
+                     "int x;\n"
+                     "#define Y 1\n";
+  std::string Expected = "// comment \n"
+                         "#ifndef X\n"
+                         "#define X\n"
+                         "#include <vector>\n"
+                         "int x;\n"
+                         "#define Y 1\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, IfNDefWithNoDefine) {
+  std::string Code = "// comment \n"
+                     "#ifndef X\n"
+                     "int x;\n"
+                     "#define Y 1\n";
+  std::string Expected = "// comment \n"
+                         "#include <vector>\n"
+                         "#ifndef X\n"
+                         "int x;\n"
+                         "#define Y 1\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, HeaderGuardWithComment) {
+  std::string Code = "// comment \n"
+                     "#ifndef X // comment\n"
+                     "// comment\n"
+                     "/* comment\n"
+                     "*/\n"
+                     "/* comment */ #define X\n"
+                     "int x;\n"
+                     "#define Y 1\n";
+  std::string Expected = "// comment \n"
+                         "#ifndef X // comment\n"
+                         "// comment\n"
+                         "/* comment\n"
+                         "*/\n"
+                         "/* comment */ #define X\n"
+                         "#include <vector>\n"
+                         "int x;\n"
+                         "#define Y 1\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, EmptyCode) {
+  std::string Code = "";
+  std::string Expected = "#include <vector>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+// FIXME: although this case does not crash, the insertion is wrong. A '\n'
+// should be inserted between the two #includes.
+TEST_F(CleanUpReplacementsTest, NoNewLineAtTheEndOfCode) {
+  std::string Code = "#include <map>";
+  std::string Expected = "#include <map>#include <vector>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, SkipExistingHeaders) {
+  std::string Code = "#include \"a.h\"\n"
+                     "#include <vector>\n";
+  std::string Expected = "#include \"a.h\"\n"
+                         "#include <vector>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include <vector>"),
+                      createInsertion("#include \"a.h\"")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+TEST_F(CleanUpReplacementsTest, AddIncludesWithDifferentForms) {
+  std::string Code = "#include \"a.h\"\n"
+                     "#include <vector>\n";
+  // FIXME: may not be ideal: "x" and <x> are treated as different headers.
+  std::string Expected = "#include \"a.h\"\n"
+                         "#include \"vector\"\n"
+                         "#include <vector>\n"
+                         "#include <a.h>\n";
+  tooling::Replacements Replaces =
+      toReplacements({createInsertion("#include \"vector\""),
+                      createInsertion("#include <a.h>")});
+  EXPECT_EQ(Expected, apply(Code, Replaces));
+}
+
+} // end namespace
+} // end namespace format
+} // end namespace clang
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index 48269ca..6cffb3b 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -7,13 +7,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "FormatTestUtils.h"
 #include "clang/Format/Format.h"
+
+#include "../Tooling/ReplacementTest.h"
+#include "FormatTestUtils.h"
+
+#include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "gtest/gtest.h"
 
 #define DEBUG_TYPE "format-test"
 
+using clang::tooling::ReplacementTest;
+using clang::tooling::toReplacements;
+
 namespace clang {
 namespace format {
 namespace {
@@ -42,10 +50,10 @@
       EXPECT_EQ(ExpectedIncompleteFormat, IncompleteFormat) << Code << "\n\n";
     }
     ReplacementCount = Replaces.size();
-    std::string Result = applyAllReplacements(Code, Replaces);
-    EXPECT_NE("", Result);
-    DEBUG(llvm::errs() << "\n" << Result << "\n\n");
-    return Result;
+    auto Result = applyAllReplacements(Code, Replaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    DEBUG(llvm::errs() << "\n" << *Result << "\n\n");
+    return *Result;
   }
 
   FormatStyle getLLVMStyleWithColumns(unsigned ColumnLimit) {
@@ -291,11 +299,23 @@
   verifyFormat("if (a)\n  if (b) {\n    f();\n  }\ng();");
 
   FormatStyle AllowsMergedIf = getLLVMStyle();
+  AllowsMergedIf.AlignEscapedNewlinesLeft = true;
   AllowsMergedIf.AllowShortIfStatementsOnASingleLine = true;
   verifyFormat("if (a)\n"
                "  // comment\n"
                "  f();",
                AllowsMergedIf);
+  verifyFormat("{\n"
+               "  if (a)\n"
+               "  label:\n"
+               "    f();\n"
+               "}",
+               AllowsMergedIf);
+  verifyFormat("#define A \\\n"
+               "  if (a)  \\\n"
+               "  label:  \\\n"
+               "    f()",
+               AllowsMergedIf);
   verifyFormat("if (a)\n"
                "  ;",
                AllowsMergedIf);
@@ -738,6 +758,7 @@
   verifyFormat("switch (x) {\n"
                "case 'A' ... 'Z':\n"
                "case 1 ... 5:\n"
+               "case a ... b:\n"
                "  break;\n"
                "}");
 }
@@ -1105,7 +1126,7 @@
 
 TEST_F(FormatTest, UnderstandsBlockComments) {
   verifyFormat("f(/*noSpaceAfterParameterNamingComment=*/true);");
-  verifyFormat("void f() { g(/*aaa=*/x, /*bbb=*/!y); }");
+  verifyFormat("void f() { g(/*aaa=*/x, /*bbb=*/!y, /*c=*/::c); }");
   EXPECT_EQ("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n"
             "  bbbbbbbbbbbbbbbbbbbbbbbbb);",
             format("f(aaaaaaaaaaaaaaaaaaaaaaaaa ,   \\\n"
@@ -1125,6 +1146,8 @@
              "                      aaaaaaaaaaaaaaaaaa  ,\n"
              "    aaaaaaaaaaaaaaaaaa) {   /*aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa*/\n"
              "}"));
+  verifyFormat("f(/* aaaaaaaaaaaaaaaaaa = */\n"
+               "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);");
 
   FormatStyle NoBinPacking = getLLVMStyle();
   NoBinPacking.BinPackParameters = false;
@@ -1914,6 +1937,10 @@
   verifyFormat("{\n"
                "  signals.set(); // This needs indentation.\n"
                "}");
+  verifyFormat("void f() {\n"
+               "label:\n"
+               "  signals.baz();\n"
+               "}");
 }
 
 TEST_F(FormatTest, SeparatesLogicalBlocks) {
@@ -2749,6 +2776,12 @@
                "  case 1:          \\\n"
                "  case 2\n",
                getLLVMStyleWithColumns(20));
+  verifyFormat("#define MACRO(a) \\\n"
+               "  if (a)         \\\n"
+               "    f();         \\\n"
+               "  else           \\\n"
+               "    g()",
+               getLLVMStyleWithColumns(18));
   verifyFormat("#define A template <typename T>");
   verifyIncompleteFormat("#define STR(x) #x\n"
                          "f(STR(this_is_a_string_literal{));");
@@ -3992,6 +4025,12 @@
                "                << bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");
   verifyFormat("TEST_F(ThisIsATestFixtureeeeeeeeeeeee,\n"
                "       ThisIsATestWithAReallyReallyReallyReallyLongName) {}");
+  verifyFormat("MACRO(abc).function() // wrap\n"
+               "    << abc;");
+  verifyFormat("MACRO(abc)->function() // wrap\n"
+               "    << abc;");
+  verifyFormat("MACRO(abc)::function() // wrap\n"
+               "    << abc;");
 }
 
 TEST_F(FormatTest, BreaksDesireably) {
@@ -4457,12 +4496,31 @@
                "    aaaaaaaaaaa aaaaaaaaa,\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);",
                Style);
-  verifyFormat("SomeLongVariableName->someFunction(\n"
-               "    foooooooo(\n"
-               "        aaaaaaaaaaaaaaa,\n"
-               "        aaaaaaaaaaaaaaaaaaaaa,\n"
-               "        aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa));",
+  verifyFormat("SomeLongVariableName->someFunction(foooooooo(\n"
+               "    aaaaaaaaaaaaaaa,\n"
+               "    aaaaaaaaaaaaaaaaaaaaa,\n"
+               "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa));",
                Style);
+  verifyFormat(
+      "aaaaaaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaaaaaa(\n"
+      "    aaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaa)));",
+      Style);
+  verifyFormat(
+      "aaaaaaaaaaaaaaaaaaaaaaaa(aaaaaaaaaa.aaaaaaaaaa(\n"
+      "    aaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaa)));",
+      Style);
+  verifyFormat(
+      "aaaaaaaaaaaaaaaaaaaaaaaa(\n"
+      "    aaaaaaaaaaaaaaaaaaaaa(\n"
+      "        aaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaa)),\n"
+      "    aaaaaaaaaaaaaaaa);",
+      Style);
+  verifyFormat(
+      "aaaaaaaaaaaaaaaaaaaaaaaa(\n"
+      "    aaaaaaaaaaaaaaaaaaaaa(\n"
+      "        aaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaa)) &&\n"
+      "    aaaaaaaaaaaaaaaa);",
+      Style);
 }
 
 TEST_F(FormatTest, ParenthesesAndOperandAlignment) {
@@ -4708,6 +4766,10 @@
                "            /*bbbbbbbbbbbbbbb=*/bbbbbbbbbbbbbbbbbbbbbbbbb :\n"
                "            ccccccccccccccccccccccccccc;",
                Style);
+  verifyFormat("return aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ?\n"
+               "           aaaaa :\n"
+               "           bbbbbbbbbbbbbbb + cccccccccccccccc;",
+               Style);
 }
 
 TEST_F(FormatTest, DeclarationsOfMultipleVariables) {
@@ -5335,6 +5397,10 @@
   verifyFormat("template <typename T> // T can be A, B or C.\n"
                "struct C {};",
                AlwaysBreak);
+  verifyFormat("template <enum E> class A {\n"
+               "public:\n"
+               "  E *f();\n"
+               "};");
 }
 
 TEST_F(FormatTest, WrapsAtNestedNameSpecifiers) {
@@ -5435,6 +5501,7 @@
 
 TEST_F(FormatTest, UnderstandsBinaryOperators) {
   verifyFormat("COMPARE(a, ==, b);");
+  verifyFormat("auto s = sizeof...(Ts) - 1;");
 }
 
 TEST_F(FormatTest, UnderstandsPointersToMembers) {
@@ -5567,14 +5634,21 @@
   verifyFormat("SomeType MemberFunction(const Deleted &) && {}");
   verifyFormat("SomeType MemberFunction(const Deleted &) && final {}");
   verifyFormat("SomeType MemberFunction(const Deleted &) && override {}");
+  verifyFormat("SomeType MemberFunction(const Deleted &) const &;");
 
   FormatStyle AlignLeft = getLLVMStyle();
   AlignLeft.PointerAlignment = FormatStyle::PAS_Left;
+  verifyFormat("void A::b() && {}", AlignLeft);
   verifyFormat("Deleted& operator=(const Deleted&) & = default;", AlignLeft);
   verifyFormat("SomeType MemberFunction(const Deleted&) & = delete;",
                AlignLeft);
   verifyFormat("Deleted& operator=(const Deleted&) &;", AlignLeft);
   verifyFormat("SomeType MemberFunction(const Deleted&) &;", AlignLeft);
+  verifyFormat("auto Function(T t) & -> void {}", AlignLeft);
+  verifyFormat("auto Function(T... t) & -> void {}", AlignLeft);
+  verifyFormat("auto Function(T) & -> void {}", AlignLeft);
+  verifyFormat("auto Function(T) & -> void;", AlignLeft);
+  verifyFormat("SomeType MemberFunction(const Deleted&) const &;", AlignLeft);
 
   FormatStyle Spaces = getLLVMStyle();
   Spaces.SpacesInCStyleCastParentheses = true;
@@ -5688,6 +5762,7 @@
       "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
       "    aaaaaaaaaaaaaaaaaaaaaaaaaaaa, *aaaaaaaaaaaaaaaaaaaaaaaaaaaaa);");
 
+  verifyGoogleFormat("int const* a = &b;");
   verifyGoogleFormat("**outparam = 1;");
   verifyGoogleFormat("*outparam = a * b;");
   verifyGoogleFormat("int main(int argc, char** argv) {}");
@@ -5956,6 +6031,7 @@
   verifyFormat("my_int a = (my_int)(my_int)-1;");
   verifyFormat("my_int a = (ns::my_int)-2;");
   verifyFormat("case (my_int)ONE:");
+  verifyFormat("auto x = (X)this;");
 
   // FIXME: single value wrapped with paren will be treated as cast.
   verifyFormat("void f(int i = (kValue)*kMask) {}");
@@ -6029,6 +6105,7 @@
   verifyFormat("some_var = function(*some_pointer_var)[0];");
   verifyFormat("void f() { function(*some_pointer_var)[0] = 10; }");
   verifyFormat("int x = f(&h)();");
+  verifyFormat("returnsFunction(&param1, &param2)(param);");
 }
 
 TEST_F(FormatTest, FormatsPointersToArrayTypes) {
@@ -6072,6 +6149,10 @@
                "LooooooooooooooooooooooooooooooooooongFunctionDefinition() {}");
   verifyFormat("decltype(LoooooooooooooooooooooooooooooooooooooooongName)\n"
                "LooooooooooooooooooooooooooooooooooongFunctionDefinition() {}");
+  verifyFormat("LoooooooooooooooooooooooooooooooooooooooongReturnType\n"
+               "LooooooooooooooooooooooooooongFunctionDeclaration(T... t);");
+  verifyFormat("LoooooooooooooooooooooooooooooooooooooooongReturnType\n"
+               "LooooooooooooooooooooooooooongFunctionDeclaration(T /*t*/) {}");
   FormatStyle Indented = getLLVMStyle();
   Indented.IndentWrappedFunctionNames = true;
   verifyFormat("LoooooooooooooooooooooooooooooooooooooooongReturnType\n"
@@ -6160,6 +6241,8 @@
       "llvm::outs() << \"aaaaaaaaaaaa: \"\n"
       "             << (*aaaaaaaiaaaaaaa)[aaaaaaaaaaaaaaaaaaaaaaaaa]\n"
       "                                  [aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa];");
+  verifyFormat("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa[aaaaaaaaaaaaaaaaa][a]\n"
+               "    .aaaaaaaaaaaaaaaaaaaaaa();");
 
   verifyGoogleFormat("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa<int>\n"
                      "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa[aaaaaaaaaaaa];");
@@ -6642,7 +6725,7 @@
                "                   bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb};");
 
   verifyNoCrash("a<,");
-  
+
   // No braced initializer here.
   verifyFormat("void f() {\n"
                "  struct Dummy {};\n"
@@ -6953,6 +7036,14 @@
                    "* aaaaaa aaaaaa\n"
                    "*/",
                    getLLVMStyleWithColumns(10)));
+  EXPECT_EQ("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
+            "    /* line 1\n"
+            "       bbbbbbbbbbbb */\n"
+            "    bbbbbbbbbbbbbbbbbbbbbbbbbbbb;",
+            format("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
+                   "    /* line 1\n"
+                   "       bbbbbbbbbbbb */ bbbbbbbbbbbbbbbbbbbbbbbbbbbb;",
+            getLLVMStyleWithColumns(50)));
 
   FormatStyle NoBinPacking = getLLVMStyle();
   NoBinPacking.BinPackParameters = false;
@@ -7061,10 +7152,9 @@
 }
 
 TEST_F(FormatTest, IndentLineCommentsInStartOfBlockAtEndOfFile) {
-  // FIXME: This is not what we want...
   verifyFormat("{\n"
-               "// a"
-               "// b");
+               "  // a\n"
+               "  // b");
 }
 
 TEST_F(FormatTest, FormatStarDependingOnContext) {
@@ -8523,6 +8613,230 @@
                    "\t */\n"
                    "\t int i;\n"
                    "}"));
+
+  Tab.UseTab = FormatStyle::UT_ForContinuationAndIndentation;
+  Tab.TabWidth = 8;
+  Tab.IndentWidth = 8;
+  EXPECT_EQ("if (aaaaaaaa && // q\n"
+            "    bb)         // w\n"
+            "\t;",
+            format("if (aaaaaaaa &&// q\n"
+                   "bb)// w\n"
+                   ";",
+                   Tab));
+  EXPECT_EQ("if (aaa && bbb) // w\n"
+            "\t;",
+            format("if(aaa&&bbb)// w\n"
+                   ";",
+                   Tab));
+  verifyFormat("class X {\n"
+               "\tvoid f() {\n"
+               "\t\tsomeFunction(parameter1,\n"
+               "\t\t\t     parameter2);\n"
+               "\t}\n"
+               "};",
+               Tab);
+  verifyFormat("#define A                        \\\n"
+               "\tvoid f() {               \\\n"
+               "\t\tsomeFunction(    \\\n"
+               "\t\t    parameter1,  \\\n"
+               "\t\t    parameter2); \\\n"
+               "\t}",
+               Tab);
+  Tab.TabWidth = 4;
+  Tab.IndentWidth = 8;
+  verifyFormat("class TabWidth4Indent8 {\n"
+               "\t\tvoid f() {\n"
+               "\t\t\t\tsomeFunction(parameter1,\n"
+               "\t\t\t\t\t\t\t parameter2);\n"
+               "\t\t}\n"
+               "};",
+               Tab);
+  Tab.TabWidth = 4;
+  Tab.IndentWidth = 4;
+  verifyFormat("class TabWidth4Indent4 {\n"
+               "\tvoid f() {\n"
+               "\t\tsomeFunction(parameter1,\n"
+               "\t\t\t\t\t parameter2);\n"
+               "\t}\n"
+               "};",
+               Tab);
+  Tab.TabWidth = 8;
+  Tab.IndentWidth = 4;
+  verifyFormat("class TabWidth8Indent4 {\n"
+               "    void f() {\n"
+               "\tsomeFunction(parameter1,\n"
+               "\t\t     parameter2);\n"
+               "    }\n"
+               "};",
+               Tab);
+  Tab.TabWidth = 8;
+  Tab.IndentWidth = 8;
+  EXPECT_EQ("/*\n"
+            "\t      a\t\tcomment\n"
+            "\t      in multiple lines\n"
+            "       */",
+            format("   /*\t \t \n"
+                   " \t \t a\t\tcomment\t \t\n"
+                   " \t \t in multiple lines\t\n"
+                   " \t  */",
+                   Tab));
+  verifyFormat("{\n"
+               "\taaaaaaaaaaaaaaaaaaaaaaaaaaaa();\n"
+               "\taaaaaaaaaaaaaaaaaaaaaaaaaaaa();\n"
+               "\taaaaaaaaaaaaaaaaaaaaaaaaaaaa();\n"
+               "\taaaaaaaaaaaaaaaaaaaaaaaaaaaa();\n"
+               "\taaaaaaaaaaaaaaaaaaaaaaaaaaaa();\n"
+               "\taaaaaaaaaaaaaaaaaaaaaaaaaaaa();\n"
+               "};",
+               Tab);
+  verifyFormat("enum AA {\n"
+               "\ta1, // Force multiple lines\n"
+               "\ta2,\n"
+               "\ta3\n"
+               "};",
+               Tab);
+  EXPECT_EQ("if (aaaaaaaa && // q\n"
+            "    bb)         // w\n"
+            "\t;",
+            format("if (aaaaaaaa &&// q\n"
+                   "bb)// w\n"
+                   ";",
+                   Tab));
+  verifyFormat("class X {\n"
+               "\tvoid f() {\n"
+               "\t\tsomeFunction(parameter1,\n"
+               "\t\t\t     parameter2);\n"
+               "\t}\n"
+               "};",
+               Tab);
+  verifyFormat("{\n"
+               "\tQ(\n"
+               "\t    {\n"
+               "\t\t    int a;\n"
+               "\t\t    someFunction(aaaaaaaa,\n"
+               "\t\t\t\t bbbbbbb);\n"
+               "\t    },\n"
+               "\t    p);\n"
+               "}",
+               Tab);
+  EXPECT_EQ("{\n"
+            "\t/* aaaa\n"
+            "\t   bbbb */\n"
+            "}",
+            format("{\n"
+                   "/* aaaa\n"
+                   "   bbbb */\n"
+                   "}",
+                   Tab));
+  EXPECT_EQ("{\n"
+            "\t/*\n"
+            "\t  aaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+            "\t  bbbbbbbbbbbbb\n"
+            "\t*/\n"
+            "}",
+            format("{\n"
+                   "/*\n"
+                   "  aaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbb\n"
+                   "*/\n"
+                   "}",
+                   Tab));
+  EXPECT_EQ("{\n"
+            "\t// aaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+            "\t// bbbbbbbbbbbbb\n"
+            "}",
+            format("{\n"
+                   "\t// aaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbb\n"
+                   "}",
+                   Tab));
+  EXPECT_EQ("{\n"
+            "\t/*\n"
+            "\t  aaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+            "\t  bbbbbbbbbbbbb\n"
+            "\t*/\n"
+            "}",
+            format("{\n"
+                   "\t/*\n"
+                   "\t  aaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbb\n"
+                   "\t*/\n"
+                   "}",
+                   Tab));
+  EXPECT_EQ("{\n"
+            "\t/*\n"
+            "\n"
+            "\t*/\n"
+            "}",
+            format("{\n"
+                   "\t/*\n"
+                   "\n"
+                   "\t*/\n"
+                   "}",
+                   Tab));
+  EXPECT_EQ("{\n"
+            "\t/*\n"
+            " asdf\n"
+            "\t*/\n"
+            "}",
+            format("{\n"
+                   "\t/*\n"
+                   " asdf\n"
+                   "\t*/\n"
+                   "}",
+                   Tab));
+  EXPECT_EQ("/*\n"
+            "\t      a\t\tcomment\n"
+            "\t      in multiple lines\n"
+            "       */",
+            format("   /*\t \t \n"
+                   " \t \t a\t\tcomment\t \t\n"
+                   " \t \t in multiple lines\t\n"
+                   " \t  */",
+                   Tab));
+  EXPECT_EQ("/* some\n"
+            "   comment */",
+            format(" \t \t /* some\n"
+                   " \t \t    comment */",
+                   Tab));
+  EXPECT_EQ("int a; /* some\n"
+            "   comment */",
+            format(" \t \t int a; /* some\n"
+                   " \t \t    comment */",
+                   Tab));
+  EXPECT_EQ("int a; /* some\n"
+            "comment */",
+            format(" \t \t int\ta; /* some\n"
+                   " \t \t    comment */",
+                   Tab));
+  EXPECT_EQ("f(\"\t\t\"); /* some\n"
+            "    comment */",
+            format(" \t \t f(\"\t\t\"); /* some\n"
+                   " \t \t    comment */",
+                   Tab));
+  EXPECT_EQ("{\n"
+            "  /*\n"
+            "   * Comment\n"
+            "   */\n"
+            "  int i;\n"
+            "}",
+            format("{\n"
+                   "\t/*\n"
+                   "\t * Comment\n"
+                   "\t */\n"
+                   "\t int i;\n"
+                   "}"));
+  Tab.AlignConsecutiveAssignments = true;
+  Tab.AlignConsecutiveDeclarations = true;
+  Tab.TabWidth = 4;
+  Tab.IndentWidth = 4;
+  verifyFormat("class Assign {\n"
+               "\tvoid f() {\n"
+               "\t\tint         x      = 123;\n"
+               "\t\tint         random = 4;\n"
+               "\t\tstd::string alphabet =\n"
+               "\t\t\t\"abcdefghijklmnopqrstuvwxyz\";\n"
+               "\t}\n"
+               "};",
+               Tab);
 }
 
 TEST_F(FormatTest, CalculatesOriginalColumn) {
@@ -8683,7 +8997,8 @@
   verifyFormat("#define x (( int )-1)", Spaces);
 
   // Run the first set of tests again with:
-  Spaces.SpacesInParentheses = false, Spaces.SpaceInEmptyParentheses = true;
+  Spaces.SpacesInParentheses = false;
+  Spaces.SpaceInEmptyParentheses = true;
   Spaces.SpacesInCStyleCastParentheses = true;
   verifyFormat("call(x, y, z);", Spaces);
   verifyFormat("call( );", Spaces);
@@ -9876,6 +10191,7 @@
   CHECK_PARSE_BOOL(SpacesInContainerLiterals);
   CHECK_PARSE_BOOL(SpacesInCStyleCastParentheses);
   CHECK_PARSE_BOOL(SpaceAfterCStyleCast);
+  CHECK_PARSE_BOOL(SpaceAfterTemplateKeyword);
   CHECK_PARSE_BOOL(SpaceBeforeAssignmentOperators);
 
   CHECK_PARSE_NESTED_BOOL(BraceWrapping, AfterClass);
@@ -9911,6 +10227,7 @@
               SpacesBeforeTrailingComments, 1234u);
   CHECK_PARSE("IndentWidth: 32", IndentWidth, 32u);
   CHECK_PARSE("ContinuationIndentWidth: 11", ContinuationIndentWidth, 11u);
+  CHECK_PARSE("CommentPragmas: '// abc$'", CommentPragmas, "// abc$");
 
   Style.PointerAlignment = FormatStyle::PAS_Middle;
   CHECK_PARSE("PointerAlignment: Left", PointerAlignment,
@@ -9964,6 +10281,8 @@
   CHECK_PARSE("UseTab: Never", UseTab, FormatStyle::UT_Never);
   CHECK_PARSE("UseTab: ForIndentation", UseTab, FormatStyle::UT_ForIndentation);
   CHECK_PARSE("UseTab: Always", UseTab, FormatStyle::UT_Always);
+  CHECK_PARSE("UseTab: ForContinuationAndIndentation", UseTab,
+              FormatStyle::UT_ForContinuationAndIndentation);
   // For backward compatibility:
   CHECK_PARSE("UseTab: false", UseTab, FormatStyle::UT_Never);
   CHECK_PARSE("UseTab: true", UseTab, FormatStyle::UT_Always);
@@ -10069,6 +10388,7 @@
               "  - Regex: .*\n"
               "    Priority: 1",
               IncludeCategories, ExpectedCategories);
+  CHECK_PARSE("IncludeIsMainRegex: 'abc$'", IncludeIsMainRegex, "abc$");
 }
 
 TEST_F(FormatTest, ParsesConfigurationWithLanguages) {
@@ -10378,6 +10698,9 @@
   verifyFormat("SomeClass::Constructor()\n"
                "    : a(a) {}",
                Style);
+  verifyFormat("SomeClass::Constructor() noexcept\n"
+               "    : a(a) {}",
+               Style);
   verifyFormat("SomeClass::Constructor()\n"
                "    : a(a)\n"
                "    , b(b)\n"
@@ -10687,6 +11010,10 @@
       "      return aaaaaaaaaaaaaaaaa;\n"
       "    });",
       getLLVMStyleWithColumns(70));
+  verifyFormat("[]() //\n"
+               "    -> int {\n"
+               "  return 1; //\n"
+               "};");
 
   // Multiple lambdas in the same parentheses change indentation rules.
   verifyFormat("SomeFunction(\n"
@@ -11003,6 +11330,12 @@
   verifyFormat("A<A<int>>();", Spaces);
 }
 
+TEST_F(FormatTest, SpaceAfterTemplateKeyword) {
+  FormatStyle Style = getLLVMStyle();
+  Style.SpaceAfterTemplateKeyword = false; // Expect "template<", no space.
+  verifyFormat("template<int> void foo();", Style);
+}
+
 TEST_F(FormatTest, TripleAngleBrackets) {
   verifyFormat("f<<<1, 1>>>();");
   verifyFormat("f<<<1, 1, 1, s>>>();");
@@ -11164,6 +11497,102 @@
   verifyFormat("include \"a.td\"\ninclude \"b.td\"", Style);
 }
 
+// This test case uses UNIX-style file paths, so it is disabled for MSVC and
+// MinGW builds.
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+
+TEST(FormatStyle, GetStyleOfFile) {
+  vfs::InMemoryFileSystem FS;
+  // Test 1: format file in the same directory; the fallback is not used.
+  ASSERT_TRUE(
+      FS.addFile("/a/.clang-format", 0,
+                 llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: LLVM")));
+  ASSERT_TRUE(
+      FS.addFile("/a/test.cpp", 0, llvm::MemoryBuffer::getMemBuffer("int i;")));
+  auto Style1 = getStyle("file", "/a/.clang-format", "Google", &FS);
+  ASSERT_EQ(Style1, getLLVMStyle());
+
+  // Test 2: no .clang-format was added under /b; expect the fallback style.
+  ASSERT_TRUE(
+      FS.addFile("/b/test.cpp", 0, llvm::MemoryBuffer::getMemBuffer("int i;")));
+  auto Style2 = getStyle("file", "/b/test.cpp", "Mozilla", &FS);
+  ASSERT_EQ(Style2, getMozillaStyle());
+
+  // Test 3: format file in an ancestor directory (/c, three levels up).
+  ASSERT_TRUE(
+      FS.addFile("/c/.clang-format", 0,
+                 llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: Google")));
+  ASSERT_TRUE(FS.addFile("/c/sub/sub/sub/test.cpp", 0,
+                         llvm::MemoryBuffer::getMemBuffer("int i;")));
+  auto Style3 = getStyle("file", "/c/sub/sub/sub/test.cpp", "LLVM", &FS);
+  ASSERT_EQ(Style3, getGoogleStyle());
+}
+
+#endif // !defined(_MSC_VER) && !defined(__MINGW32__)
+
+TEST_F(ReplacementTest, FormatCodeAfterReplacements) {
+  // Code and Expected are laid out for the column limit of 20 set below.
+  std::string Code = "Type *a =\n"
+                     "    new Type();\n"
+                     "g(iiiii, 0, jjjjj,\n"
+                     "  0, kkkkk, 0, mm);\n"
+                     "int  bad     = format   ;";
+  std::string Expected = "auto a = new Type();\n"
+                         "g(iiiii, nullptr,\n"
+                         "  jjjjj, nullptr,\n"
+                         "  kkkkk, nullptr,\n"
+                         "  mm);\n"
+                         "int  bad     = format   ;";
+  FileID ID = Context.createInMemoryFile("format.cpp", Code);
+  tooling::Replacements Replaces = toReplacements(
+      {tooling::Replacement(Context.Sources, Context.getLocation(ID, 1, 1), 6,
+                            "auto "),
+       tooling::Replacement(Context.Sources, Context.getLocation(ID, 3, 10), 1,
+                            "nullptr"),
+       tooling::Replacement(Context.Sources, Context.getLocation(ID, 4, 3), 1,
+                            "nullptr"),
+       tooling::Replacement(Context.Sources, Context.getLocation(ID, 4, 13), 1,
+                            "nullptr")});
+
+  format::FormatStyle Style = format::getLLVMStyle();
+  Style.ColumnLimit = 20; // Set column limit to 20 to increase readability.
+  auto FormattedReplaces = formatReplacements(Code, Replaces, Style);
+  EXPECT_TRUE(static_cast<bool>(FormattedReplaces))
+      << llvm::toString(FormattedReplaces.takeError()) << "\n";
+  auto Result = applyAllReplacements(Code, *FormattedReplaces);
+  EXPECT_TRUE(static_cast<bool>(Result));
+  EXPECT_EQ(Expected, *Result);
+}
+
+TEST_F(ReplacementTest, SortIncludesAfterReplacement) {
+  std::string Code = "#include \"a.h\"\n"
+                     "#include \"c.h\"\n"
+                     "\n"
+                     "int main() {\n"
+                     "  return 0;\n"
+                     "}";
+  std::string Expected = "#include \"a.h\"\n"
+                         "#include \"b.h\"\n"
+                         "#include \"c.h\"\n"
+                         "\n"
+                         "int main() {\n"
+                         "  return 0;\n"
+                         "}";
+  FileID ID = Context.createInMemoryFile("fix.cpp", Code);
+  tooling::Replacements Replaces = toReplacements(
+      {tooling::Replacement(Context.Sources, Context.getLocation(ID, 1, 1), 0,
+                            "#include \"b.h\"\n")});
+
+  format::FormatStyle Style = format::getLLVMStyle();
+  Style.SortIncludes = true; // b.h is inserted first; expect a.h, b.h, c.h.
+  auto FormattedReplaces = formatReplacements(Code, Replaces, Style);
+  EXPECT_TRUE(static_cast<bool>(FormattedReplaces))
+      << llvm::toString(FormattedReplaces.takeError()) << "\n";
+  auto Result = applyAllReplacements(Code, *FormattedReplaces);
+  EXPECT_TRUE(static_cast<bool>(Result));
+  EXPECT_EQ(Expected, *Result);
+}
+
 } // end namespace
 } // end namespace format
 } // end namespace clang
diff --git a/unittests/Format/FormatTestJS.cpp b/unittests/Format/FormatTestJS.cpp
index 38fa5cd..2819383 100644
--- a/unittests/Format/FormatTestJS.cpp
+++ b/unittests/Format/FormatTestJS.cpp
@@ -28,10 +28,10 @@
     tooling::Replacements Replaces =
         reformat(Style, Code, Ranges, "<stdin>", &IncompleteFormat);
     EXPECT_FALSE(IncompleteFormat);
-    std::string Result = applyAllReplacements(Code, Replaces);
-    EXPECT_NE("", Result);
-    DEBUG(llvm::errs() << "\n" << Result << "\n\n");
-    return Result;
+    auto Result = applyAllReplacements(Code, Replaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    DEBUG(llvm::errs() << "\n" << *Result << "\n\n");
+    return *Result;
   }
 
   static std::string format(
@@ -49,11 +49,24 @@
   static void verifyFormat(
       llvm::StringRef Code,
       const FormatStyle &Style = getGoogleStyle(FormatStyle::LK_JavaScript)) {
-    std::string result = format(test::messUp(Code), Style);
-    EXPECT_EQ(Code.str(), result) << "Formatted:\n" << result;
+    std::string Result = format(test::messUp(Code), Style);
+    EXPECT_EQ(Code.str(), Result) << "Formatted:\n" << Result;
+  }
+
+  static void verifyFormat(
+      llvm::StringRef Expected,
+      llvm::StringRef Code,
+      const FormatStyle &Style = getGoogleStyle(FormatStyle::LK_JavaScript)) {
+    std::string Result = format(Code, Style);
+    EXPECT_EQ(Expected.str(), Result) << "Formatted:\n" << Result;
   }
 };
 
+TEST_F(FormatTestJS, BlockComments) {
+  verifyFormat("/* aaaaaaaaaaaaa */ aaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
+               "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);");
+}
+
 TEST_F(FormatTestJS, UnderstandsJavaScriptOperators) {
   verifyFormat("a == = b;");
   verifyFormat("a != = b;");
@@ -86,6 +99,17 @@
 
   verifyFormat("var b = a.map((x) => x + 1);");
   verifyFormat("return ('aaa') in bbbb;");
+  verifyFormat("var x = aaaaaaaaaaaaaaaaaaaaaaaaa() in\n"
+               "    aaaa.aaaaaa.aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;");
+  FormatStyle Style = getGoogleJSStyleWithColumns(80);
+  Style.AlignOperands = true;
+  verifyFormat("var x = aaaaaaaaaaaaaaaaaaaaaaaaa() in\n"
+               "        aaaa.aaaaaa.aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;",
+               Style);
+  Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
+  verifyFormat("var x = aaaaaaaaaaaaaaaaaaaaaaaaa()\n"
+               "            in aaaa.aaaaaa.aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;",
+               Style);
 
   // ES6 spread operator.
   verifyFormat("someFunction(...a);");
@@ -106,6 +130,11 @@
   verifyFormat("x.class.struct = 1;");
   verifyFormat("x.case = 1;");
   verifyFormat("x.interface = 1;");
+  verifyFormat("x.for = 1;");
+  verifyFormat("x.of() = 1;");
+  verifyFormat("x.in() = 1;");
+  verifyFormat("x.let() = 1;");
+  verifyFormat("x.var() = 1;");
   verifyFormat("x = {\n"
                "  a: 12,\n"
                "  interface: 1,\n"
@@ -113,6 +142,9 @@
                "};");
   verifyFormat("var struct = 2;");
   verifyFormat("var union = 2;");
+  verifyFormat("var interface = 2;");
+  verifyFormat("interface = 2;");
+  verifyFormat("x = interface instanceof y;");
 }
 
 TEST_F(FormatTestJS, CppKeywords) {
@@ -122,6 +154,7 @@
 
 TEST_F(FormatTestJS, ES6DestructuringAssignment) {
   verifyFormat("var [a, b, c] = [1, 2, 3];");
+  verifyFormat("const [a, b, c] = [1, 2, 3];");
   verifyFormat("let [a, b, c] = [1, 2, 3];");
   verifyFormat("var {a, b} = {a: 1, b: 2};");
   verifyFormat("let {a, b} = {a: 1, b: 2};");
@@ -189,6 +222,18 @@
                "  b: 2,\n"
                "  [c]: 3,\n"
                "};");
+
+  // Object literals can leave out labels.
+  verifyFormat("f({a}, () => {\n"
+               "  g();  //\n"
+               "});");
+
+  // Keys can be quoted.
+  verifyFormat("var x = {\n"
+               "  a: a,\n"
+               "  b: b,\n"
+               "  'c': c,\n"
+               "};");
 }
 
 TEST_F(FormatTestJS, MethodsInObjectLiterals) {
@@ -234,7 +279,7 @@
   verifyFormat("f({'a': [{}]});");
 }
 
-TEST_F(FormatTestJS, SingleQuoteStrings) {
+TEST_F(FormatTestJS, SingleQuotedStrings) {
   verifyFormat("this.function('', true);");
 }
 
@@ -261,6 +306,8 @@
                getGoogleJSStyleWithColumns(40));
   verifyFormat("goog.setTestOnly('this.is.really.absurdly.long');",
                getGoogleJSStyleWithColumns(40));
+  verifyFormat("goog.forwardDeclare('this.is.really.absurdly.long');",
+               getGoogleJSStyleWithColumns(40));
 
   // These should be wrapped normally.
   verifyFormat(
@@ -268,6 +315,15 @@
       "    goog.module.get('my.long.module.name.followedBy.MyLongClassName');");
 }
 
+TEST_F(FormatTestJS, FormatsNamespaces) {
+  verifyFormat("namespace Foo {\n"
+               "  export let x = 1;\n"
+               "}\n");
+  verifyFormat("declare namespace Foo {\n"
+               "  export let x: number;\n"
+               "}\n");
+}
+
 TEST_F(FormatTestJS, FormatsFreestandingFunctions) {
   verifyFormat("function outer1(a, b) {\n"
                "  function inner1(a, b) { return a; }\n"
@@ -280,6 +336,44 @@
   verifyFormat("function f() {}");
 }
 
+TEST_F(FormatTestJS, GeneratorFunctions) {
+  verifyFormat("function* f() {\n"
+               "  let x = 1;\n"
+               "  yield x;\n"
+               "  yield* something();\n"
+               "}");
+  verifyFormat("function*\n"
+               "    f() {\n"
+               "}",
+               getGoogleJSStyleWithColumns(8));
+  verifyFormat("export function* f() {\n"
+               "  yield 1;\n"
+               "}\n");
+  verifyFormat("class X {\n"
+               "  * generatorMethod() { yield x; }\n"
+               "}");
+}
+
+TEST_F(FormatTestJS, AsyncFunctions) {
+  verifyFormat("async function f() {\n"
+               "  let x = 1;\n"
+               "  return fetch(x);\n"
+               "}");
+  verifyFormat("async function* f() {\n"
+               "  yield fetch(x);\n"
+               "}");
+  verifyFormat("export async function f() {\n"
+               "  return fetch(x);\n"
+               "}");
+  verifyFormat("class X {\n"
+               "  async asyncMethod() { return fetch(1); }\n"
+               "}");
+  verifyFormat("function initialize() {\n"
+               "  // Comment.\n"
+               "  return async.then();\n"
+               "}\n");
+}
+
 TEST_F(FormatTestJS, ArrayLiterals) {
   verifyFormat("var aaaaa: List<SomeThing> =\n"
                "    [new SomeThingAAAAAAAAAAAA(), new SomeThingBBBBBBBBB()];");
@@ -579,9 +673,15 @@
 TEST_F(FormatTestJS, ForLoops) {
   verifyFormat("for (var i in [2, 3]) {\n"
                "}");
+  verifyFormat("for (var i of [2, 3]) {\n"
+               "}");
+  verifyFormat("for (let {a, b} of x) {\n"
+               "}");
+  verifyFormat("for (let {a, b} in x) {\n"
+               "}");
 }
 
-TEST_F(FormatTestJS, AutomaticSemicolonInsertion) {
+TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) {
   // The following statements must not wrap, as otherwise the program meaning
   // would change due to automatic semicolon insertion.
   // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.9.1.
@@ -597,6 +697,56 @@
                getGoogleJSStyleWithColumns(12));
 }
 
+TEST_F(FormatTestJS, AutomaticSemicolonInsertionHeuristic) {
+  verifyFormat("a\n"
+               "b;",
+               " a \n"
+               " b ;");
+  verifyFormat("a()\n"
+               "b;",
+               " a ()\n"
+               " b ;");
+  verifyFormat("a[b]\n"
+               "c;",
+               "a [b]\n"
+               "c ;");
+  verifyFormat("1\n"
+               "a;",
+               "1 \n"
+               "a ;");
+  verifyFormat("a\n"
+               "1;",
+               "a \n"
+               "1 ;");
+  verifyFormat("a\n"
+               "'x';",
+               "a \n"
+               " 'x';");
+  verifyFormat("a++\n"
+               "b;",
+               "a ++\n"
+               "b ;");
+  verifyFormat("a\n"
+               "!b && c;",
+               "a \n"
+               " ! b && c;");
+  verifyFormat("a\n"
+               "if (1) f();",
+               " a\n"
+               " if (1) f();");
+  verifyFormat("a\n"
+               "class X {}",
+               " a\n"
+               " class X {}");
+  verifyFormat("var a", "var\n"
+                        "a");
+  verifyFormat("x instanceof String", "x\n"
+                                      "instanceof\n"
+                                      "String");
+  verifyFormat("function f(@Foo bar) {}", "function f(@Foo\n"
+                                          "  bar) {}");
+}
+
 TEST_F(FormatTestJS, ClosureStyleCasts) {
   verifyFormat("var x = /** @type {foo} */ (bar);");
 }
@@ -695,13 +845,13 @@
   verifyFormat("var regex = /\a\\//g;");
   verifyFormat("var regex = /a\\//;\n"
                "var x = 0;");
-  EXPECT_EQ("var regex = /'/g;", format("var regex = /'/g ;"));
-  EXPECT_EQ("var regex = /'/g;  //'", format("var regex = /'/g ; //'"));
-  EXPECT_EQ("var regex = /\\/*/;\n"
-            "var x = 0;",
-            format("var regex = /\\/*/;\n"
-                   "var x=0;"));
-  EXPECT_EQ("var x = /a\\//;", format("var x = /a\\//  \n;"));
+  verifyFormat("var regex = /'/g;", "var regex = /'/g ;");
+  verifyFormat("var regex = /'/g;  //'", "var regex = /'/g ; //'");
+  verifyFormat("var regex = /\\/*/;\n"
+               "var x = 0;",
+               "var regex = /\\/*/;\n"
+               "var x=0;");
+  verifyFormat("var x = /a\\//;", "var x = /a\\//  \n;");
   verifyFormat("var regex = /\"/;", getGoogleJSStyleWithColumns(16));
   verifyFormat("var regex =\n"
                "    /\"/;",
@@ -742,6 +892,7 @@
   verifyFormat("function x(): {x: string} {\n  return {x: 'x'};\n}");
   verifyFormat("function x(y: string): string {\n  return 'x';\n}");
   verifyFormat("for (var y: string in x) {\n  x();\n}");
+  verifyFormat("for (var y: string of x) {\n  x();\n}");
   verifyFormat("function x(y: {a?: number;} = {}): number {\n"
                "  return 12;\n"
                "}");
@@ -755,6 +906,22 @@
                getGoogleJSStyleWithColumns(60));
 }
 
+TEST_F(FormatTestJS, UnionIntersectionTypes) {
+  verifyFormat("let x: A|B = A | B;");
+  verifyFormat("let x: A&B|C = A & B;");
+  verifyFormat("let x: Foo<A|B> = new Foo<A|B>();");
+  verifyFormat("function(x: A|B): C&D {}");
+  verifyFormat("function(x: A|B = A | B): C&D {}");
+  verifyFormat("function x(path: number|string) {}");
+  verifyFormat("function x(): string|number {}");
+  verifyFormat("type Foo = Bar|Baz;");
+  verifyFormat("type Foo = Bar<X>|Baz;");
+  verifyFormat("type Foo = (Bar<X>|Baz);");
+  verifyFormat("let x: Bar|Baz;");
+  verifyFormat("let x: Bar<X>|Baz;");
+  verifyFormat("let x: (Foo|Bar)[];");
+}
+
 TEST_F(FormatTestJS, ClassDeclarations) {
   verifyFormat("class C {\n  x: string = 12;\n}");
   verifyFormat("class C {\n  x(): string => 12;\n}");
@@ -787,12 +954,18 @@
                "    },\n"
                "  };\n"
                "}");
+  verifyFormat("@Component({\n"
+               "  moduleId: module.id,\n"
+               "})\n"
+               "class SessionListComponent implements OnDestroy, OnInit {\n"
+               "}");
 }
 
 TEST_F(FormatTestJS, InterfaceDeclarations) {
   verifyFormat("interface I {\n"
                "  x: string;\n"
                "  enum: string[];\n"
+               "  enum?: string[];\n"
                "}\n"
                "var y;");
   // Ensure that state is reset after parsing the interface.
@@ -833,30 +1006,32 @@
                "    return 'y';\n"
                "  }\n"
                "}");
+  verifyFormat("class C {\n"
+               "  private x(@A x: string) {}\n"
+               "}");
   verifyFormat("class X {}\n"
                "class Y {}");
 }
 
+TEST_F(FormatTestJS, TypeAliases) {
+  verifyFormat("type X = number;\n"
+               "class C {}");
+  verifyFormat("type X<Y> = Z<Y>;");
+  verifyFormat("type X = {\n"
+               "  y: number\n"
+               "};\n"
+               "class C {}");
+}
+
 TEST_F(FormatTestJS, Modules) {
   verifyFormat("import SomeThing from 'some/module.js';");
   verifyFormat("import {X, Y} from 'some/module.js';");
   verifyFormat("import a, {X, Y} from 'some/module.js';");
-  verifyFormat("import {\n"
-               "  VeryLongImportsAreAnnoying,\n"
-               "  VeryLongImportsAreAnnoying,\n"
-               "  VeryLongImportsAreAnnoying,\n"
-               "  VeryLongImportsAreAnnoying\n"
-               "} from 'some/module.js';");
-  verifyFormat("import {\n"
-               "  X,\n"
-               "  Y,\n"
-               "} from 'some/module.js';");
-  verifyFormat("import {\n"
-               "  X,\n"
-               "  Y,\n"
-               "} from 'some/long/module.js';",
-               getGoogleJSStyleWithColumns(20));
+  verifyFormat("import {X, Y,} from 'some/module.js';");
   verifyFormat("import {X as myLocalX, Y as myLocalY} from 'some/module.js';");
+  // Ensure Automatic Semicolon Insertion does not break on "as\n".
+  verifyFormat("import {X as myX} from 'm';", "import {X as\n"
+                                              " myX} from 'm';");
   verifyFormat("import * as lib from 'some/module.js';");
   verifyFormat("var x = {import: 1};\nx.import = 2;");
 
@@ -866,13 +1041,26 @@
   verifyFormat("export function A() {}\n"
                "export default function B() {}\n"
                "export function C() {}");
+  verifyFormat("export default () => {\n"
+               "  let x = 1;\n"
+               "  return x;\n"
+               "}");
   verifyFormat("export const x = 12;");
   verifyFormat("export default class X {}");
   verifyFormat("export {X, Y} from 'some/module.js';");
+  verifyFormat("export {X, Y,} from 'some/module.js';");
+  verifyFormat("export {SomeVeryLongExport as X, "
+               "SomeOtherVeryLongExport as Y} from 'some/module.js';");
+  // export without 'from' is wrapped.
+  verifyFormat("export let someRatherLongVariableName =\n"
+               "    someSurprisinglyLongVariable + someOtherRatherLongVar;");
+  // ... but not if from is just an identifier.
   verifyFormat("export {\n"
-               "  X,\n"
-               "  Y,\n"
-               "} from 'some/module.js';");
+               "  from as from,\n"
+               "  someSurprisinglyLongVariable as\n"
+               "      from\n"
+               "};",
+               getGoogleJSStyleWithColumns(20));
   verifyFormat("export class C {\n"
                "  x: number;\n"
                "  y: string;\n"
@@ -907,42 +1095,74 @@
                "}");
 }
 
+TEST_F(FormatTestJS, ImportWrapping) {
+  verifyFormat("import {VeryLongImportsAreAnnoying, VeryLongImportsAreAnnoying,"
+               " VeryLongImportsAreAnnoying, VeryLongImportsAreAnnoying"
+               "} from 'some/module.js';");
+  FormatStyle Style = getGoogleJSStyleWithColumns(80);
+  Style.JavaScriptWrapImports = true;
+  verifyFormat("import {\n"
+               "  VeryLongImportsAreAnnoying,\n"
+               "  VeryLongImportsAreAnnoying,\n"
+               "  VeryLongImportsAreAnnoying,\n"
+               "} from 'some/module.js';",
+               Style);
+  verifyFormat("import {\n"
+               "  A,\n"
+               "  A,\n"
+               "} from 'some/module.js';",
+               Style);
+  verifyFormat("export {\n"
+               "  A,\n"
+               "  A,\n"
+               "} from 'some/module.js';",
+               Style);
+}
+
 TEST_F(FormatTestJS, TemplateStrings) {
   // Keeps any whitespace/indentation within the template string.
-  EXPECT_EQ("var x = `hello\n"
+  verifyFormat("var x = `hello\n"
             "     ${  name    }\n"
             "  !`;",
-            format("var x    =    `hello\n"
+            "var x    =    `hello\n"
                    "     ${  name    }\n"
-                   "  !`;"));
+                   "  !`;");
 
   verifyFormat("var x =\n"
                "    `hello ${world}` >= some();",
                getGoogleJSStyleWithColumns(34)); // Barely doesn't fit.
   verifyFormat("var x = `hello ${world}` >= some();",
                getGoogleJSStyleWithColumns(35)); // Barely fits.
-  EXPECT_EQ("var x = `hello\n"
+  verifyFormat("var x = `hellö ${wörld}` >= söme();",
+               getGoogleJSStyleWithColumns(35)); // Fits due to UTF-8.
+  verifyFormat("var x = `hello\n"
             "  ${world}` >=\n"
             "    some();",
-            format("var x =\n"
+            "var x =\n"
                    "    `hello\n"
                    "  ${world}` >= some();",
-                   getGoogleJSStyleWithColumns(21))); // Barely doesn't fit.
-  EXPECT_EQ("var x = `hello\n"
+                   getGoogleJSStyleWithColumns(21)); // Barely doesn't fit.
+  verifyFormat("var x = `hello\n"
             "  ${world}` >= some();",
-            format("var x =\n"
+            "var x =\n"
                    "    `hello\n"
                    "  ${world}` >= some();",
-                   getGoogleJSStyleWithColumns(22))); // Barely fits.
+                   getGoogleJSStyleWithColumns(22)); // Barely fits.
 
   verifyFormat("var x =\n"
                "    `h`;",
                getGoogleJSStyleWithColumns(11));
-  EXPECT_EQ(
-      "var x =\n    `multi\n  line`;",
-      format("var x = `multi\n  line`;", getGoogleJSStyleWithColumns(13)));
+  verifyFormat("var x =\n    `multi\n  line`;", "var x = `multi\n  line`;",
+               getGoogleJSStyleWithColumns(13));
   verifyFormat("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                "    `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`);");
+  // Repro for an obscure width-miscounting issue with template strings.
+  verifyFormat(
+      "someLongVariable =\n"
+      "    "
+      "`${logPrefix[11]}/${logPrefix[12]}/${logPrefix[13]}${logPrefix[14]}`;",
+      "someLongVariable = "
+      "`${logPrefix[11]}/${logPrefix[12]}/${logPrefix[13]}${logPrefix[14]}`;");
 
   // Make sure template strings get a proper ColumnWidth assigned, even if they
   // are first token in line.
@@ -954,42 +1174,51 @@
   verifyFormat("var x = `hello` == `hello`;");
 
   // Comments in template strings.
-  EXPECT_EQ("var x = `//a`;\n"
+  verifyFormat("var x = `//a`;\n"
             "var y;",
-            format("var x =\n `//a`;\n"
-                   "var y  ;"));
-  EXPECT_EQ("var x = `/*a`;\n"
-            "var y;",
-            format("var x =\n `/*a`;\n"
-                   "var y;"));
+            "var x =\n `//a`;\n"
+                   "var y  ;");
+  verifyFormat("var x = `/*a`;\n"
+               "var y;",
+               "var x =\n `/*a`;\n"
+               "var y;");
   // Unterminated string literals in a template string.
   verifyFormat("var x = `'`;  // comment with matching quote '\n"
                "var y;");
   verifyFormat("var x = `\"`;  // comment with matching quote \"\n"
                "var y;");
-  EXPECT_EQ("it(`'aaaaaaaaaaaaaaa   `, aaaaaaaaa);",
-            format("it(`'aaaaaaaaaaaaaaa   `,   aaaaaaaaa) ;",
-                   getGoogleJSStyleWithColumns(40)));
+  verifyFormat("it(`'aaaaaaaaaaaaaaa   `, aaaaaaaaa);",
+               "it(`'aaaaaaaaaaaaaaa   `,   aaaaaaaaa) ;",
+               getGoogleJSStyleWithColumns(40));
   // Backticks in a comment - not a template string.
-  EXPECT_EQ("var x = 1  // `/*a`;\n"
-            "    ;",
-            format("var x =\n 1  // `/*a`;\n"
-                   "    ;"));
-  EXPECT_EQ("/* ` */ var x = 1; /* ` */",
-            format("/* ` */ var x\n= 1; /* ` */"));
+  verifyFormat("var x = 1  // `/*a`;\n"
+               "    ;",
+               "var x =\n 1  // `/*a`;\n"
+               "    ;");
+  verifyFormat("/* ` */ var x = 1; /* ` */", "/* ` */ var x\n= 1; /* ` */");
   // Comment spans multiple template strings.
-  EXPECT_EQ("var x = `/*a`;\n"
-            "var y = ` */ `;",
-            format("var x =\n `/*a`;\n"
-                   "var y =\n ` */ `;"));
+  verifyFormat("var x = `/*a`;\n"
+               "var y = ` */ `;",
+               "var x =\n `/*a`;\n"
+               "var y =\n ` */ `;");
   // Escaped backtick.
-  EXPECT_EQ("var x = ` \\` a`;\n"
-            "var y;",
-            format("var x = ` \\` a`;\n"
-                   "var y;"));
+  verifyFormat("var x = ` \\` a`;\n"
+               "var y;",
+               "var x = ` \\` a`;\n"
+               "var y;");
 }
 
-TEST_F(FormatTestJS, CastSyntax) { verifyFormat("var x = <type>foo;"); }
+TEST_F(FormatTestJS, CastSyntax) {
+  verifyFormat("var x = <type>foo;");
+  verifyFormat("var x = foo as type;");
+  verifyFormat("let x = (a + b) as\n"
+               "    LongTypeIsLong;",
+               getGoogleJSStyleWithColumns(20));
+  verifyFormat("foo = <Bar[]>[\n"
+               "  1,  //\n"
+               "  2\n"
+               "];");
+}
 
 TEST_F(FormatTestJS, TypeArguments) {
   verifyFormat("class X<Y> {}");
@@ -1024,7 +1253,6 @@
   verifyFormat("interface X {\n"
                "  y?(): z;\n"
                "}");
-  verifyFormat("x ? 1 : 2;");
   verifyFormat("constructor({aa}: {\n"
                "  aa?: string,\n"
                "  aaaaaaaa?: string,\n"
@@ -1056,13 +1284,78 @@
 }
 
 TEST_F(FormatTestJS, JSDocAnnotations) {
-  EXPECT_EQ("/**\n"
-            " * @export {this.is.a.long.path.to.a.Type}\n"
-            " */",
-            format("/**\n"
-                   " * @export {this.is.a.long.path.to.a.Type}\n"
-                   " */",
-                   getGoogleJSStyleWithColumns(20)));
+  verifyFormat("/**\n"
+               " * @export {this.is.a.long.path.to.a.Type}\n"
+               " */",
+               "/**\n"
+               " * @export {this.is.a.long.path.to.a.Type}\n"
+               " */",
+               getGoogleJSStyleWithColumns(20));
+}
+
+TEST_F(FormatTestJS, RequoteStringsSingle) {
+  verifyFormat("var x = 'foo';", "var x = \"foo\";");
+  verifyFormat("var x = 'fo\\'o\\'';", "var x = \"fo'o'\";");
+  verifyFormat("var x = 'fo\\'o\\'';", "var x = \"fo\\'o'\";");
+  verifyFormat(
+      "var x =\n"
+      "    'foo\\'';",
+      // Code below is 15 chars wide, doesn't fit into the line with the
+      // \ escape added.
+      "var x = \"foo'\";", getGoogleJSStyleWithColumns(15));
+  // Removes no-longer needed \ escape from ".
+  verifyFormat("var x = 'fo\"o';", "var x = \"fo\\\"o\";");
+  // Code below fits into 15 chars *after* removing the \ escape.
+  verifyFormat("var x = 'fo\"o';", "var x = \"fo\\\"o\";",
+               getGoogleJSStyleWithColumns(15));
+  verifyFormat("// clang-format off\n"
+               "let x = \"double\";\n"
+               "// clang-format on\n"
+               "let x = 'single';\n",
+               "// clang-format off\n"
+               "let x = \"double\";\n"
+               "// clang-format on\n"
+               "let x = \"single\";\n");
+}
+
+TEST_F(FormatTestJS, RequoteStringsDouble) {
+  FormatStyle DoubleQuotes = getGoogleStyle(FormatStyle::LK_JavaScript);
+  DoubleQuotes.JavaScriptQuotes = FormatStyle::JSQS_Double;
+  verifyFormat("var x = \"foo\";", DoubleQuotes);
+  verifyFormat("var x = \"foo\";", "var x = 'foo';", DoubleQuotes);
+  verifyFormat("var x = \"fo'o\";", "var x = 'fo\\'o';", DoubleQuotes);
+}
+
+TEST_F(FormatTestJS, RequoteStringsLeave) {
+  FormatStyle LeaveQuotes = getGoogleStyle(FormatStyle::LK_JavaScript);
+  LeaveQuotes.JavaScriptQuotes = FormatStyle::JSQS_Leave;
+  verifyFormat("var x = \"foo\";", LeaveQuotes);
+  verifyFormat("var x = 'foo';", LeaveQuotes);
+}
+
+TEST_F(FormatTestJS, SupportShebangLines) {
+  verifyFormat("#!/usr/bin/env node\n"
+               "var x = hello();",
+               "#!/usr/bin/env node\n"
+               "var x   =  hello();");
+}
+
+TEST_F(FormatTestJS, NonNullAssertionOperator) {
+  verifyFormat("let x = foo!.bar();\n");
+  verifyFormat("let x = foo ? bar! : baz;\n");
+  verifyFormat("let x = !foo;\n");
+  verifyFormat("let x = foo[0]!;\n");
+  verifyFormat("let x = (foo)!;\n");
+  verifyFormat("let x = {foo: 1}!;\n");
+}
+
+TEST_F(FormatTestJS, Conditional) {
+  verifyFormat("y = x ? 1 : 2;");
+  verifyFormat("x ? 1 : 2;");
+  verifyFormat("class Foo {\n"
+               "  field = true ? 1 : 2;\n"
+               "  method(a = true ? 1 : 2) {}\n"
+               "}");
 }
 
 } // end namespace tooling
diff --git a/unittests/Format/FormatTestJava.cpp b/unittests/Format/FormatTestJava.cpp
index 8fadfc0..dfc3deb 100644
--- a/unittests/Format/FormatTestJava.cpp
+++ b/unittests/Format/FormatTestJava.cpp
@@ -25,10 +25,10 @@
     DEBUG(llvm::errs() << Code << "\n\n");
     std::vector<tooling::Range> Ranges(1, tooling::Range(Offset, Length));
     tooling::Replacements Replaces = reformat(Style, Code, Ranges);
-    std::string Result = applyAllReplacements(Code, Replaces);
-    EXPECT_NE("", Result);
-    DEBUG(llvm::errs() << "\n" << Result << "\n\n");
-    return Result;
+    auto Result = applyAllReplacements(Code, Replaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    DEBUG(llvm::errs() << "\n" << *Result << "\n\n");
+    return *Result;
   }
 
   static std::string
@@ -312,6 +312,9 @@
                "      String bbbbbbbbbbbbbbb) {}\n"
                "}",
                getStyleWithColumns(60));
+  verifyFormat("@Annotation(\"Some\"\n"
+               "    + \" text\")\n"
+               "List<Integer> list;");
 }
 
 TEST_F(FormatTestJava, Generics) {
diff --git a/unittests/Format/FormatTestProto.cpp b/unittests/Format/FormatTestProto.cpp
index d3d3d42..6881af4 100644
--- a/unittests/Format/FormatTestProto.cpp
+++ b/unittests/Format/FormatTestProto.cpp
@@ -25,10 +25,10 @@
     DEBUG(llvm::errs() << Code << "\n\n");
     std::vector<tooling::Range> Ranges(1, tooling::Range(Offset, Length));
     tooling::Replacements Replaces = reformat(Style, Code, Ranges);
-    std::string Result = applyAllReplacements(Code, Replaces);
-    EXPECT_NE("", Result);
-    DEBUG(llvm::errs() << "\n" << Result << "\n\n");
-    return Result;
+    auto Result = applyAllReplacements(Code, Replaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    DEBUG(llvm::errs() << "\n" << *Result << "\n\n");
+    return *Result;
   }
 
   static std::string format(llvm::StringRef Code) {
@@ -74,8 +74,11 @@
                "  TYPE_B = 2;\n"
                "};");
   verifyFormat("enum Type {\n"
+               "  UNKNOWN = 0 [(some_options) = {a: aa, b: bb}];\n"
+               "};");
+  verifyFormat("enum Type {\n"
                "  UNKNOWN = 0 [(some_options) = {\n"
-               "    a: aa,\n"
+               "    a: aa,  // wrap\n"
                "    b: bb\n"
                "  }];\n"
                "};");
@@ -153,10 +156,7 @@
                "  field_a: OK\n"
                "  field_b: \"OK\"\n"
                "  field_c: \"OK\"\n"
-               "  msg_field: {\n"
-               "    field_d: 123\n"
-               "    field_e: OK\n"
-               "  }\n"
+               "  msg_field: {field_d: 123 field_e: OK}\n"
                "};");
   verifyFormat("option (MyProto.options) = {\n"
                "  field_a: OK  // Comment\n"
@@ -189,5 +189,26 @@
                "}");
 }
 
+TEST_F(FormatTestProto, FormatsImports) {
+  verifyFormat("import \"a.proto\";\n"
+               "import \"b.proto\";\n"
+               "// comment\n"
+               "message A {\n"
+               "}");
+
+  verifyFormat("import public \"a.proto\";\n"
+               "import \"b.proto\";\n"
+               "// comment\n"
+               "message A {\n"
+               "}");
+
+  // Missing semicolons should not confuse clang-format.
+  verifyFormat("import \"a.proto\"\n"
+               "import \"b.proto\"\n"
+               "// comment\n"
+               "message A {\n"
+               "}");
+}
+
 } // end namespace tooling
 } // end namespace clang
diff --git a/unittests/Format/FormatTestSelective.cpp b/unittests/Format/FormatTestSelective.cpp
index 699600c..2bc60fd 100644
--- a/unittests/Format/FormatTestSelective.cpp
+++ b/unittests/Format/FormatTestSelective.cpp
@@ -28,10 +28,10 @@
     tooling::Replacements Replaces =
         reformat(Style, Code, Ranges, "<stdin>", &IncompleteFormat);
     EXPECT_FALSE(IncompleteFormat) << Code << "\n\n";
-    std::string Result = applyAllReplacements(Code, Replaces);
-    EXPECT_NE("", Result);
-    DEBUG(llvm::errs() << "\n" << Result << "\n\n");
-    return Result;
+    auto Result = applyAllReplacements(Code, Replaces);
+    EXPECT_TRUE(static_cast<bool>(Result));
+    DEBUG(llvm::errs() << "\n" << *Result << "\n\n");
+    return *Result;
   }
 
   FormatStyle Style = getLLVMStyle();
@@ -278,6 +278,23 @@
                    "  };\n"
                    "});",
                    0, 0));
+  EXPECT_EQ("SomeFunction(\n"
+            "    [] {\n"
+            "      int i;\n"
+            "      return i;\n" // Format this line.
+            "    },\n"
+            "    [] {\n"
+            "       return 2;\n" // Don't fix this.
+            "    });",
+            format("SomeFunction(\n"
+                   "    [] {\n"
+                   "      int i;\n"
+                   "       return i;\n" // Format this line.
+                   "    },\n"
+                   "    [] {\n"
+                   "       return 2;\n" // Don't fix this.
+                   "    });",
+                   40, 0));
 }
 
 TEST_F(FormatTestSelective, WrongIndent) {
@@ -495,6 +512,18 @@
              15, 0));
 }
 
+TEST_F(FormatTestSelective, SelectivelyRequoteJavaScript) {
+  Style = getGoogleStyle(FormatStyle::LK_JavaScript);
+  EXPECT_EQ(
+      "var x = \"a\";\n"
+      "var x = 'a';\n"
+      "var x = \"a\";",
+      format("var x = \"a\";\n"
+             "var x = \"a\";\n"
+             "var x = \"a\";",
+             20, 0));
+}
+
 } // end namespace
 } // end namespace format
 } // end namespace clang
diff --git a/unittests/Format/Makefile b/unittests/Format/Makefile
deleted file mode 100644
index 7029ea7..0000000
--- a/unittests/Format/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-##===- unittests/Format/Makefile ---------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Format
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangFormat.a clangTooling.a clangToolingCore.a clangFrontend.a \
-	   clangSerialization.a clangDriver.a clangParse.a clangRewrite.a \
-           clangRewriteFrontend.a clangSema.a clangAnalysis.a clangEdit.a \
-           clangAST.a clangASTMatchers.a clangLex.a clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Format/SortImportsTestJS.cpp b/unittests/Format/SortImportsTestJS.cpp
new file mode 100644
index 0000000..77c37e3
--- /dev/null
+++ b/unittests/Format/SortImportsTestJS.cpp
@@ -0,0 +1,269 @@
+//===- unittest/Format/SortImportsTestJS.cpp - JS import sort unit tests --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FormatTestUtils.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Debug.h"
+#include "gtest/gtest.h"
+
+#define DEBUG_TYPE "format-test"
+
+namespace clang {
+namespace format {
+namespace {
+
+class SortImportsTestJS : public ::testing::Test {
+protected:
+  std::string sort(StringRef Code, unsigned Offset = 0, unsigned Length = 0) {
+    StringRef FileName = "input.js";
+    if (Length == 0U)
+      Length = Code.size() - Offset;
+    std::vector<tooling::Range> Ranges(1, tooling::Range(Offset, Length));
+    auto Sorted =
+        applyAllReplacements(Code, sortIncludes(Style, Code, Ranges, FileName));
+    EXPECT_TRUE(static_cast<bool>(Sorted));
+    auto Formatted = applyAllReplacements(
+        *Sorted, reformat(Style, *Sorted, Ranges, FileName));
+    EXPECT_TRUE(static_cast<bool>(Formatted));
+    return *Formatted;
+  }
+
+  void verifySort(llvm::StringRef Expected, llvm::StringRef Code,
+                  unsigned Offset = 0, unsigned Length = 0) {
+    std::string Result = sort(Code, Offset, Length);
+    EXPECT_EQ(Expected.str(), Result) << "Expected:\n"
+                                      << Expected << "\nActual:\n"
+                                      << Result;
+  }
+
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_JavaScript);
+};
+
+TEST_F(SortImportsTestJS, AlreadySorted) {
+  verifySort("import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "\n"
+             "let x = 1;",
+             "import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "\n"
+             "let x = 1;");
+}
+
+TEST_F(SortImportsTestJS, BasicSorting) {
+  verifySort("import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "\n"
+             "let x = 1;",
+             "import {sym} from 'a';\n"
+             "import {sym} from 'c';\n"
+             "import {sym} from 'b';\n"
+             "let x = 1;");
+}
+
+TEST_F(SortImportsTestJS, WrappedImportStatements) {
+  verifySort("import {sym1, sym2} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "\n"
+             "1;",
+             "import\n"
+             "  {sym}\n"
+             "  from 'b';\n"
+             "import {\n"
+             "  sym1,\n"
+             "  sym2\n"
+             "} from 'a';\n"
+             "1;");
+}
+
+TEST_F(SortImportsTestJS, SeparateMainCodeBody) {
+  verifySort("import {sym} from 'a';"
+             "\n"
+             "let x = 1;\n",
+             "import {sym} from 'a'; let x = 1;\n");
+}
+
+TEST_F(SortImportsTestJS, Comments) {
+  verifySort("/** @fileoverview This is a great file. */\n"
+             "// A very important import follows.\n"
+             "import {sym} from 'a';  /* more comments */\n"
+             "import {sym} from 'b';  // from //foo:bar\n",
+             "/** @fileoverview This is a great file. */\n"
+             "import {sym} from 'b';  // from //foo:bar\n"
+             "// A very important import follows.\n"
+             "import {sym} from 'a';  /* more comments */\n");
+}
+
+TEST_F(SortImportsTestJS, SortStar) {
+  verifySort("import * as foo from 'a';\n"
+             "import {sym} from 'a';\n"
+             "import * as bar from 'b';\n",
+             "import {sym} from 'a';\n"
+             "import * as foo from 'a';\n"
+             "import * as bar from 'b';\n");
+}
+
+TEST_F(SortImportsTestJS, AliasesSymbols) {
+  verifySort("import {sym1 as alias1} from 'b';\n"
+             "import {sym2 as alias2, sym3 as alias3} from 'c';\n",
+             "import {sym2 as alias2, sym3 as alias3} from 'c';\n"
+             "import {sym1 as alias1} from 'b';\n");
+}
+
+TEST_F(SortImportsTestJS, SortSymbols) {
+  verifySort("import {sym1, sym2 as a, sym3} from 'b';\n",
+             "import {sym2 as a, sym1, sym3} from 'b';\n");
+  verifySort("import {sym1 /* important! */, /*!*/ sym2 as a} from 'b';\n",
+             "import {/*!*/ sym2 as a, sym1 /* important! */} from 'b';\n");
+  verifySort("import {sym1, sym2} from 'b';\n", "import {\n"
+                                                "  sym2 \n"
+                                                ",\n"
+                                                " sym1 \n"
+                                                "} from 'b';\n");
+}
+
+TEST_F(SortImportsTestJS, GroupImports) {
+  verifySort("import {a} from 'absolute';\n"
+             "\n"
+             "import {b} from '../parent';\n"
+             "import {b} from '../parent/nested';\n"
+             "\n"
+             "import {b} from './relative/path';\n"
+             "import {b} from './relative/path/nested';\n"
+             "\n"
+             "let x = 1;\n",
+             "import {b} from './relative/path/nested';\n"
+             "import {b} from './relative/path';\n"
+             "import {b} from '../parent/nested';\n"
+             "import {b} from '../parent';\n"
+             "import {a} from 'absolute';\n"
+             "let x = 1;\n");
+}
+
+TEST_F(SortImportsTestJS, Exports) {
+  verifySort("import {S} from 'bpath';\n"
+             "\n"
+             "import {T} from './cpath';\n"
+             "\n"
+             "export {A, B} from 'apath';\n"
+             "export {P} from '../parent';\n"
+             "export {R} from './relative';\n"
+             "export {S};\n"
+             "\n"
+             "let x = 1;\n"
+             "export y = 1;\n",
+             "export {R} from './relative';\n"
+             "import {T} from './cpath';\n"
+             "export {S};\n"
+             "export {A, B} from 'apath';\n"
+             "import {S} from 'bpath';\n"
+             "export {P} from '../parent';\n"
+             "let x = 1;\n"
+             "export y = 1;\n");
+  verifySort("import {S} from 'bpath';\n"
+             "\n"
+             "export {T} from 'epath';\n",
+             "export {T} from 'epath';\n"
+             "import {S} from 'bpath';\n");
+}
+
+TEST_F(SortImportsTestJS, SideEffectImports) {
+  verifySort("import 'ZZside-effect';\n"
+             "import 'AAside-effect';\n"
+             "\n"
+             "import {A} from 'absolute';\n"
+             "\n"
+             "import {R} from './relative';\n",
+             "import {R} from './relative';\n"
+             "import 'ZZside-effect';\n"
+             "import {A} from 'absolute';\n"
+             "import 'AAside-effect';\n");
+}
+
+TEST_F(SortImportsTestJS, AffectedRange) {
+  // Sort excluding a suffix.
+  verifySort("import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "import {sym} from 'a';\n"
+             "let x = 1;",
+             "import {sym} from 'c';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'a';\n"
+             "let x = 1;",
+             0, 30);
+  // Sort excluding a prefix.
+  verifySort("import {sym} from 'c';\n"
+             "import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "\n"
+             "let x = 1;",
+             "import {sym} from 'c';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'a';\n"
+             "\n"
+             "let x = 1;",
+             30, 0);
+  // Sort a range within imports.
+  verifySort("import {sym} from 'c';\n"
+             "import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "let x = 1;",
+             "import {sym} from 'c';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'a';\n"
+             "import {sym} from 'c';\n"
+             "let x = 1;",
+             24, 30);
+}
+
+TEST_F(SortImportsTestJS, SortingCanShrink) {
+  // Sorting removes the blank line between imports, shrinking the code.
+  verifySort("import {B} from 'a';\n"
+             "import {A} from 'b';\n"
+             "\n"
+             "1;",
+             "import {A} from 'b';\n"
+             "\n"
+             "import {B} from 'a';\n"
+             "\n"
+             "1;");
+}
+
+TEST_F(SortImportsTestJS, TrailingComma) {
+  verifySort("import {A, B,} from 'aa';\n", "import {B, A,} from 'aa';\n");
+}
+
+TEST_F(SortImportsTestJS, SortCaseInsensitive) {
+  verifySort("import {A} from 'aa';\n"
+             "import {A} from 'Ab';\n"
+             "import {A} from 'b';\n"
+             "import {A} from 'Bc';\n"
+             "\n"
+             "1;",
+             "import {A} from 'b';\n"
+             "import {A} from 'Bc';\n"
+             "import {A} from 'Ab';\n"
+             "import {A} from 'aa';\n"
+             "\n"
+             "1;");
+  verifySort("import {aa, Ab, b, Bc} from 'x';\n"
+             "\n"
+             "1;",
+             "import {b, Bc, Ab, aa} from 'x';\n"
+             "\n"
+             "1;");
+}
+
+} // end namespace
+} // end namespace format
+} // end namespace clang
diff --git a/unittests/Format/SortIncludesTest.cpp b/unittests/Format/SortIncludesTest.cpp
index dbe1174..b6ee2dd 100644
--- a/unittests/Format/SortIncludesTest.cpp
+++ b/unittests/Format/SortIncludesTest.cpp
@@ -20,17 +20,24 @@
 
 class SortIncludesTest : public ::testing::Test {
 protected:
-  std::string sort(llvm::StringRef Code, StringRef FileName = "input.cpp") {
-    std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
-    std::string Sorted =
-        applyAllReplacements(Code, sortIncludes(Style, Code, Ranges, FileName));
-    return applyAllReplacements(Sorted,
-                                reformat(Style, Sorted, Ranges, FileName));
+  std::vector<tooling::Range> GetCodeRange(StringRef Code) {
+    return std::vector<tooling::Range>(1, tooling::Range(0, Code.size()));
+  }
+
+  std::string sort(StringRef Code, StringRef FileName = "input.cpp") {
+    auto Ranges = GetCodeRange(Code);
+    auto Replaces = sortIncludes(Style, Code, Ranges, FileName);
+    Ranges = tooling::calculateRangesAfterReplacements(Replaces, Ranges);
+    auto Sorted = applyAllReplacements(Code, Replaces);
+    EXPECT_TRUE(static_cast<bool>(Sorted));
+    auto Result = applyAllReplacements(
+        *Sorted, reformat(Style, *Sorted, Ranges, FileName));
+    EXPECT_TRUE(static_cast<bool>(Result));
+    return *Result;
   }
 
   unsigned newCursor(llvm::StringRef Code, unsigned Cursor) {
-    std::vector<tooling::Range> Ranges(1, tooling::Range(0, Code.size()));
-    sortIncludes(Style, Code, Ranges, "input.cpp", &Cursor);
+    sortIncludes(Style, Code, GetCodeRange(Code), "input.cpp", &Cursor);
     return Cursor;
   }
 
@@ -47,6 +54,17 @@
                  "#include \"b.h\"\n"));
 }
 
+TEST_F(SortIncludesTest, NoReplacementsForValidIncludes) {
+  // Identical #includes have led to a failure with an unstable sort.
+  std::string Code = "#include <a>\n"
+                     "#include <b>\n"
+                     "#include <c>\n"
+                     "#include <d>\n"
+                     "#include <e>\n"
+                     "#include <f>\n";
+  EXPECT_TRUE(sortIncludes(Style, Code, GetCodeRange(Code), "a.cc").empty());
+}
+
 TEST_F(SortIncludesTest, SupportClangFormatOff) {
   EXPECT_EQ("#include <a>\n"
             "#include <b>\n"
@@ -161,6 +179,7 @@
 }
 
 TEST_F(SortIncludesTest, LeavesMainHeaderFirst) {
+  Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
   EXPECT_EQ("#include \"llvm/a.h\"\n"
             "#include \"b.h\"\n"
             "#include \"c.h\"\n",
@@ -174,7 +193,7 @@
             sort("#include \"llvm/a.h\"\n"
                  "#include \"c.h\"\n"
                  "#include \"b.h\"\n",
-                 "a_main.cc"));
+                 "a_test.cc"));
   EXPECT_EQ("#include \"llvm/input.h\"\n"
             "#include \"b.h\"\n"
             "#include \"c.h\"\n",
@@ -183,6 +202,24 @@
                  "#include \"b.h\"\n",
                  "input.mm"));
 
+  // Don't allow prefixes.
+  EXPECT_EQ("#include \"b.h\"\n"
+            "#include \"c.h\"\n"
+            "#include \"llvm/not_a.h\"\n",
+            sort("#include \"llvm/not_a.h\"\n"
+                 "#include \"c.h\"\n"
+                 "#include \"b.h\"\n",
+                 "a.cc"));
+
+  // Don't do this for _main and other suffixes.
+  EXPECT_EQ("#include \"b.h\"\n"
+            "#include \"c.h\"\n"
+            "#include \"llvm/a.h\"\n",
+            sort("#include \"llvm/a.h\"\n"
+                 "#include \"c.h\"\n"
+                 "#include \"b.h\"\n",
+                 "a_main.cc"));
+
   // Don't do this in headers.
   EXPECT_EQ("#include \"b.h\"\n"
             "#include \"c.h\"\n"
@@ -250,6 +287,82 @@
   EXPECT_EQ(10u, newCursor(Code, 43));
 }
 
+TEST_F(SortIncludesTest, DeduplicateIncludes) {
+  EXPECT_EQ("#include <a>\n"
+            "#include <b>\n"
+            "#include <c>\n",
+            sort("#include <a>\n"
+                 "#include <b>\n"
+                 "#include <b>\n"
+                 "#include <b>\n"
+                 "#include <b>\n"
+                 "#include <c>\n"));
+}
+
+TEST_F(SortIncludesTest, SortAndDeduplicateIncludes) {
+  EXPECT_EQ("#include <a>\n"
+            "#include <b>\n"
+            "#include <c>\n",
+            sort("#include <b>\n"
+                 "#include <a>\n"
+                 "#include <b>\n"
+                 "#include <b>\n"
+                 "#include <c>\n"
+                 "#include <b>\n"));
+}
+
+TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionAfterDeduplicate) {
+  std::string Code = "#include <b>\n"      // Start of line: 0
+                     "#include <a>\n"      // Start of line: 13
+                     "#include <b>\n"      // Start of line: 26
+                     "#include <b>\n"      // Start of line: 39
+                     "#include <c>\n"      // Start of line: 52
+                     "#include <b>\n";     // Start of line: 65
+  std::string Expected = "#include <a>\n"  // Start of line: 0
+                         "#include <b>\n"  // Start of line: 13
+                         "#include <c>\n"; // Start of line: 26
+  EXPECT_EQ(Expected, sort(Code));
+  // Cursor on 'i' in "#include <a>".
+  EXPECT_EQ(1u, newCursor(Code, 14));
+  // Cursor on 'b' in "#include <b>".
+  EXPECT_EQ(23u, newCursor(Code, 10));
+  EXPECT_EQ(23u, newCursor(Code, 36));
+  EXPECT_EQ(23u, newCursor(Code, 49));
+  EXPECT_EQ(23u, newCursor(Code, 36));
+  EXPECT_EQ(23u, newCursor(Code, 75));
+  // Cursor on '#' in "#include <c>".
+  EXPECT_EQ(26u, newCursor(Code, 52));
+}
+
+TEST_F(SortIncludesTest, DeduplicateLocallyInEachBlock) {
+  EXPECT_EQ("#include <a>\n"
+            "#include <b>\n"
+            "\n"
+            "#include <b>\n"
+            "#include <c>\n",
+            sort("#include <a>\n"
+                 "#include <b>\n"
+                 "\n"
+                 "#include <c>\n"
+                 "#include <b>\n"
+                 "#include <b>\n"));
+}
+
+TEST_F(SortIncludesTest, ValidAffactedRangesAfterDeduplicatingIncludes) {
+  std::string Code = "#include <a>\n"
+                     "#include <b>\n"
+                     "#include <a>\n"
+                     "#include <a>\n"
+                     "\n"
+                     "   int     x ;";
+  std::vector<tooling::Range> Ranges = {tooling::Range(0, 52)};
+  auto Replaces = sortIncludes(Style, Code, Ranges, "input.cpp");
+  Ranges = tooling::calculateRangesAfterReplacements(Replaces, Ranges);
+  EXPECT_EQ(1u, Ranges.size());
+  EXPECT_EQ(0u, Ranges[0].getOffset());
+  EXPECT_EQ(26u, Ranges[0].getLength());
+}
+
 } // end namespace
 } // end namespace format
 } // end namespace clang
diff --git a/unittests/Frontend/CMakeLists.txt b/unittests/Frontend/CMakeLists.txt
index 5b5fdc9..674f77b 100644
--- a/unittests/Frontend/CMakeLists.txt
+++ b/unittests/Frontend/CMakeLists.txt
@@ -4,6 +4,7 @@
 
 add_clang_unittest(FrontendTests
   FrontendActionTest.cpp
+  CodeGenActionTest.cpp
   )
 target_link_libraries(FrontendTests
   clangAST
@@ -11,4 +12,5 @@
   clangFrontend
   clangLex
   clangSema
+  clangCodeGen
   )
diff --git a/unittests/Frontend/CodeGenActionTest.cpp b/unittests/Frontend/CodeGenActionTest.cpp
new file mode 100644
index 0000000..356b513
--- /dev/null
+++ b/unittests/Frontend/CodeGenActionTest.cpp
@@ -0,0 +1,62 @@
+//===- unittests/Frontend/CodeGenActionTest.cpp ---- CodeGenAction tests --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Unit tests for CodeGenAction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/CodeGen/BackendUtil.h"
+#include "clang/CodeGen/CodeGenAction.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace clang;
+using namespace clang::frontend;
+
+namespace {
+
+
+class NullCodeGenAction : public CodeGenAction {
+public:
+  NullCodeGenAction(llvm::LLVMContext *_VMContext = nullptr)
+    : CodeGenAction(Backend_EmitMCNull, _VMContext) {}
+
+  // The action does not call methods of ASTContext.
+  void ExecuteAction() override {
+    CompilerInstance &CI = getCompilerInstance();
+    if (!CI.hasPreprocessor())
+      return;
+    if (!CI.hasSema())
+      CI.createSema(getTranslationUnitKind(), nullptr);
+  }
+};
+
+
+TEST(CodeGenTest, TestNullCodeGen) {
+  CompilerInvocation *Invocation = new CompilerInvocation;
+  Invocation->getPreprocessorOpts().addRemappedFile(
+      "test.cc",
+      MemoryBuffer::getMemBuffer("").release());
+  Invocation->getFrontendOpts().Inputs.push_back(
+      FrontendInputFile("test.cc", IK_CXX));
+  Invocation->getFrontendOpts().ProgramAction = EmitLLVM;
+  Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu";
+  CompilerInstance Compiler;
+  Compiler.setInvocation(Invocation);
+  Compiler.createDiagnostics();
+  EXPECT_TRUE(Compiler.hasDiagnostics());
+
+  std::unique_ptr<FrontendAction> Act(new NullCodeGenAction);
+  bool Success = Compiler.ExecuteAction(*Act);
+  EXPECT_TRUE(Success);
+}
+
+}
diff --git a/unittests/Frontend/FrontendActionTest.cpp b/unittests/Frontend/FrontendActionTest.cpp
index 90afd77..39a131f 100644
--- a/unittests/Frontend/FrontendActionTest.cpp
+++ b/unittests/Frontend/FrontendActionTest.cpp
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/FrontendAction.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/FrontendAction.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Sema/Sema.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/MemoryBuffer.h"
diff --git a/unittests/Frontend/Makefile b/unittests/Frontend/Makefile
deleted file mode 100644
index a6b6091..0000000
--- a/unittests/Frontend/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-##===- unittests/Frontend/Makefile -------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Frontend
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangFrontendTool.a clangFrontend.a clangDriver.a \
-           clangSerialization.a clangCodeGen.a clangParse.a clangSema.a \
-           clangStaticAnalyzerCheckers.a clangStaticAnalyzerCore.a \
-           clangARCMigrate.a clangRewrite.a \
-		   clangRewriteFrontend.a clangEdit.a \
-           clangAnalysis.a clangAST.a clangLex.a clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Lex/HeaderMapTest.cpp b/unittests/Lex/HeaderMapTest.cpp
index 742c9aa..d16efe8 100644
--- a/unittests/Lex/HeaderMapTest.cpp
+++ b/unittests/Lex/HeaderMapTest.cpp
@@ -11,7 +11,6 @@
 #include "clang/Lex/HeaderMap.h"
 #include "clang/Lex/HeaderMapTypes.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include "gtest/gtest.h"
 #include <cassert>
diff --git a/unittests/Lex/LexerTest.cpp b/unittests/Lex/LexerTest.cpp
index 0a8c8b5..2046018 100644
--- a/unittests/Lex/LexerTest.cpp
+++ b/unittests/Lex/LexerTest.cpp
@@ -22,7 +22,6 @@
 #include "clang/Lex/PreprocessorOptions.h"
 #include "gtest/gtest.h"
 
-using namespace llvm;
 using namespace clang;
 
 namespace {
@@ -60,7 +59,8 @@
   }
 
   std::vector<Token> Lex(StringRef Source) {
-    std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Source);
+    std::unique_ptr<llvm::MemoryBuffer> Buf =
+        llvm::MemoryBuffer::getMemBuffer(Source);
     SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
 
     VoidModuleLoader ModLoader;
diff --git a/unittests/Lex/Makefile b/unittests/Lex/Makefile
deleted file mode 100644
index c34ef76..0000000
--- a/unittests/Lex/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- unittests/Lex/Makefile ------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Lex
-LINK_COMPONENTS := mcparser support mc bitreader
-USEDLIBS = clangParse.a clangSema.a clangAnalysis.a clangEdit.a \
-	clangSerialization.a clangAST.a clangLex.a clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Lex/PPConditionalDirectiveRecordTest.cpp b/unittests/Lex/PPConditionalDirectiveRecordTest.cpp
index 9345fc2..bceeac5 100644
--- a/unittests/Lex/PPConditionalDirectiveRecordTest.cpp
+++ b/unittests/Lex/PPConditionalDirectiveRecordTest.cpp
@@ -22,7 +22,6 @@
 #include "clang/Lex/PreprocessorOptions.h"
 #include "gtest/gtest.h"
 
-using namespace llvm;
 using namespace clang;
 
 namespace {
@@ -89,7 +88,8 @@
       "#endif\n"
       "9\n";
 
-  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(source);
+  std::unique_ptr<llvm::MemoryBuffer> Buf =
+      llvm::MemoryBuffer::getMemBuffer(source);
   SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
 
   VoidModuleLoader ModLoader;
diff --git a/unittests/Makefile b/unittests/Makefile
deleted file mode 100644
index 2a0b5bc..0000000
--- a/unittests/Makefile
+++ /dev/null
@@ -1,35 +0,0 @@
-##===- unittests/Makefile ----------------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-# If CLANG_LEVEL is not set, then we are the top-level Makefile. Otherwise, we
-# are being included from a subdirectory makefile.
-
-ifndef CLANG_LEVEL
-
-IS_UNITTEST_LEVEL := 1
-CLANG_LEVEL := ..
-PARALLEL_DIRS = CodeGen Basic Lex Driver Format ASTMatchers AST Tooling \
-	        Rewrite Sema
-
-include $(CLANG_LEVEL)/../..//Makefile.config
-
-ifeq ($(ENABLE_CLANG_ARCMT),1)
-PARALLEL_DIRS += Frontend libclang StaticAnalyzer
-endif
-
-endif  # CLANG_LEVEL
-
-include $(CLANG_LEVEL)/Makefile
-
-ifndef IS_UNITTEST_LEVEL
-
-MAKEFILE_UNITTEST_NO_INCLUDE_COMMON := 1
-include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
-
-endif  # IS_UNITTEST_LEVEL
diff --git a/unittests/Rewrite/Makefile b/unittests/Rewrite/Makefile
deleted file mode 100644
index 43538d5..0000000
--- a/unittests/Rewrite/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- unittests/Rewrite/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Rewrite
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) support
-USEDLIBS = clangRewrite.a clangLex.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Sema/ExternalSemaSourceTest.cpp b/unittests/Sema/ExternalSemaSourceTest.cpp
index 703e97b..d2cdd63 100644
--- a/unittests/Sema/ExternalSemaSourceTest.cpp
+++ b/unittests/Sema/ExternalSemaSourceTest.cpp
@@ -39,19 +39,18 @@
   bool Result;
 };
 
-// \brief Counts the number of err_using_directive_member_suggest diagnostics
-// correcting from one namespace to another while still passing all diagnostics
-// along a chain of consumers.
-class NamespaceDiagnosticWatcher : public clang::DiagnosticConsumer {
+/// Counts the number of typo-correcting diagnostics correcting from one name to
+/// another while still passing all diagnostics along a chain of consumers.
+class DiagnosticWatcher : public clang::DiagnosticConsumer {
   DiagnosticConsumer *Chained;
-  std::string FromNS;
-  std::string ToNS;
+  std::string FromName;
+  std::string ToName;
 
 public:
-  NamespaceDiagnosticWatcher(StringRef From, StringRef To)
-      : Chained(nullptr), FromNS(From), ToNS("'"), SeenCount(0) {
-    ToNS.append(To);
-    ToNS.append("'");
+  DiagnosticWatcher(StringRef From, StringRef To)
+      : Chained(nullptr), FromName(From), ToName("'"), SeenCount(0) {
+    ToName.append(To);
+    ToName.append("'");
   }
 
   void HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
@@ -61,7 +60,12 @@
     if (Info.getID() - 1 == diag::err_using_directive_member_suggest) {
       const IdentifierInfo *Ident = Info.getArgIdentifier(0);
       const std::string &CorrectedQuotedStr = Info.getArgStdStr(1);
-      if (Ident->getName() == FromNS && CorrectedQuotedStr == ToNS)
+      if (Ident->getName() == FromName && CorrectedQuotedStr == ToName)
+        ++SeenCount;
+    } else if (Info.getID() == diag::err_no_member_suggest) {
+      auto Ident = DeclarationName::getFromOpaqueInteger(Info.getRawArg(0));
+      const std::string &CorrectedQuotedStr = Info.getArgStdStr(3);
+      if (Ident.getAsString() == FromName && CorrectedQuotedStr == ToName)
         ++SeenCount;
     }
   }
@@ -78,7 +82,7 @@
     return false;
   }
 
-  NamespaceDiagnosticWatcher *Chain(DiagnosticConsumer *ToChain) {
+  DiagnosticWatcher *Chain(DiagnosticConsumer *ToChain) {
     Chained = ToChain;
     return this;
   }
@@ -130,11 +134,53 @@
   int CallCount;
 };
 
-// \brief Chains together a vector of NamespaceDiagnosticWatchers and
+class FunctionTypoProvider : public clang::ExternalSemaSource {
+  std::string CorrectFrom; // Misspelled name this provider reacts to.
+  std::string CorrectTo;   // Name of the function offered as the correction.
+  Sema *CurrentSema;       // Set by InitializeSema, cleared by ForgetSema.
+
+public:
+  FunctionTypoProvider(StringRef From, StringRef To)
+      : CorrectFrom(From), CorrectTo(To), CurrentSema(nullptr), CallCount(0) {}
+
+  void InitializeSema(Sema &S) override { CurrentSema = &S; }
+
+  void ForgetSema() override { CurrentSema = nullptr; }
+
+  TypoCorrection CorrectTypo(const DeclarationNameInfo &Typo, int LookupKind,
+                             Scope *S, CXXScopeSpec *SS,
+                             CorrectionCandidateCallback &CCC,
+                             DeclContext *MemberContext, bool EnteringContext,
+                             const ObjCObjectPointerType *OPT) override {
+    ++CallCount; // Counted for every query, whether or not it matches.
+    if (CurrentSema && Typo.getName().getAsString() == CorrectFrom) {
+      DeclContext *DestContext = nullptr;
+      ASTContext &Context = CurrentSema->getASTContext();
+      if (SS) // Prefer the context named by the scope specifier, if any.
+        DestContext = CurrentSema->computeDeclContext(*SS, EnteringContext);
+      if (!DestContext) // Otherwise fall back to the translation unit.
+        DestContext = Context.getTranslationUnitDecl();
+      IdentifierInfo *ToIdent =
+          CurrentSema->getPreprocessor().getIdentifierInfo(CorrectTo);
+      auto *NewFunction = FunctionDecl::Create(
+          Context, DestContext, SourceLocation(), SourceLocation(), ToIdent,
+          Context.getFunctionType(Context.VoidTy, {}, {}), nullptr, SC_Static);
+      DestContext->addDecl(NewFunction);
+      TypoCorrection Correction(ToIdent);
+      Correction.addCorrectionDecl(NewFunction);
+      return Correction;
+    }
+    return TypoCorrection(); // No Sema or a different name: nothing to offer.
+  }
+
+  int CallCount; // Number of CorrectTypo invocations; read by the tests.
+};
+
+// \brief Chains together a vector of DiagnosticWatchers and
 // adds a vector of ExternalSemaSources to the CompilerInstance before
 // performing semantic analysis.
 class ExternalSemaSourceInstaller : public clang::ASTFrontendAction {
-  std::vector<NamespaceDiagnosticWatcher *> Watchers;
+  std::vector<DiagnosticWatcher *> Watchers;
   std::vector<clang::ExternalSemaSource *> Sources;
   std::unique_ptr<DiagnosticConsumer> OwnedClient;
 
@@ -170,16 +216,14 @@
     Sources.push_back(Source);
   }
 
-  void PushWatcher(NamespaceDiagnosticWatcher *Watcher) {
-    Watchers.push_back(Watcher);
-  }
+  void PushWatcher(DiagnosticWatcher *Watcher) { Watchers.push_back(Watcher); }
 };
 
-// Make sure that the NamespaceDiagnosticWatcher is not miscounting.
+// Make sure that the DiagnosticWatcher is not miscounting.
 TEST(ExternalSemaSource, SanityCheck) {
   std::unique_ptr<ExternalSemaSourceInstaller> Installer(
       new ExternalSemaSourceInstaller);
-  NamespaceDiagnosticWatcher Watcher("AAB", "BBB");
+  DiagnosticWatcher Watcher("AAB", "BBB");
   Installer->PushWatcher(&Watcher);
   std::vector<std::string> Args(1, "-std=c++11");
   ASSERT_TRUE(clang::tooling::runToolOnCodeWithArgs(
@@ -193,7 +237,7 @@
   std::unique_ptr<ExternalSemaSourceInstaller> Installer(
       new ExternalSemaSourceInstaller);
   NamespaceTypoProvider Provider("AAB", "BBB");
-  NamespaceDiagnosticWatcher Watcher("AAB", "BBB");
+  DiagnosticWatcher Watcher("AAB", "BBB");
   Installer->PushSource(&Provider);
   Installer->PushWatcher(&Watcher);
   std::vector<std::string> Args(1, "-std=c++11");
@@ -211,7 +255,7 @@
   NamespaceTypoProvider First("XXX", "BBB");
   NamespaceTypoProvider Second("AAB", "CCC");
   NamespaceTypoProvider Third("AAB", "DDD");
-  NamespaceDiagnosticWatcher Watcher("AAB", "CCC");
+  DiagnosticWatcher Watcher("AAB", "CCC");
   Installer->PushSource(&First);
   Installer->PushSource(&Second);
   Installer->PushSource(&Third);
@@ -225,6 +269,21 @@
   ASSERT_EQ(1, Watcher.SeenCount);
 }
 
+TEST(ExternalSemaSource, ExternalDelayedTypoCorrection) {
+  std::unique_ptr<ExternalSemaSourceInstaller> Installer(
+      new ExternalSemaSourceInstaller);
+  FunctionTypoProvider Provider("aaa", "bbb");
+  DiagnosticWatcher Watcher("aaa", "bbb");
+  Installer->PushSource(&Provider);
+  Installer->PushWatcher(&Watcher);
+  std::vector<std::string> Args(1, "-std=c++11");
+  ASSERT_TRUE(clang::tooling::runToolOnCodeWithArgs(
+      Installer.release(), "namespace AAA { } void foo() { AAA::aaa(); }",
+      Args));
+  ASSERT_LE(1, Provider.CallCount); // Provider must have been consulted.
+  ASSERT_EQ(1, Watcher.SeenCount);
+}
+
 // We should only try MaybeDiagnoseMissingCompleteType if we can't otherwise
 // solve the problem.
 TEST(ExternalSemaSource, TryOtherTacticsBeforeDiagnosing) {
diff --git a/unittests/Sema/Makefile b/unittests/Sema/Makefile
deleted file mode 100644
index ef8852d..0000000
--- a/unittests/Sema/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- unittests/Sema/Makefile -----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Sema
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangTooling.a clangFrontend.a clangSerialization.a clangDriver.a \
-           clangRewrite.a clangRewriteFrontend.a \
-           clangParse.a clangSema.a clangAnalysis.a \
-           clangEdit.a clangAST.a clangASTMatchers.a clangLex.a \
-           clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/StaticAnalyzer/Makefile b/unittests/StaticAnalyzer/Makefile
deleted file mode 100644
index af85b71..0000000
--- a/unittests/StaticAnalyzer/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- unittests/Basic/Makefile ----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = StaticAnalysis
-LINK_COMPONENTS := support mc
-USEDLIBS = clangBasic.a clangAnalysis.a clangStaticAnalyzerCore.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Tooling/CMakeLists.txt b/unittests/Tooling/CMakeLists.txt
index 33b2046..b4b3f40 100644
--- a/unittests/Tooling/CMakeLists.txt
+++ b/unittests/Tooling/CMakeLists.txt
@@ -3,26 +3,35 @@
   Support
   )
 
+# By default MSVC has a 2^16 limit on the number of sections in an object file,
+# and this needs more than that.
+if (MSVC)
+  set_source_files_properties(RecursiveASTVisitorTestExprVisitor.cpp PROPERTIES COMPILE_FLAGS /bigobj)
+endif()
+
 add_clang_unittest(ToolingTests
   CommentHandlerTest.cpp
   CompilationDatabaseTest.cpp
+  FixItTest.cpp
   LookupTest.cpp
-  ToolingTest.cpp
+  QualTypeNamesTest.cpp
   RecursiveASTVisitorTest.cpp
   RecursiveASTVisitorTestCallVisitor.cpp
   RecursiveASTVisitorTestDeclVisitor.cpp
   RecursiveASTVisitorTestExprVisitor.cpp
   RecursiveASTVisitorTestTypeLocVisitor.cpp
-  RefactoringTest.cpp
-  RewriterTest.cpp
   RefactoringCallbacksTest.cpp
+  RefactoringTest.cpp
   ReplacementsYamlTest.cpp
+  RewriterTest.cpp
+  ToolingTest.cpp
   )
 
 target_link_libraries(ToolingTests
   clangAST
   clangASTMatchers
   clangBasic
+  clangFormat
   clangFrontend
   clangLex
   clangRewrite
diff --git a/unittests/Tooling/CompilationDatabaseTest.cpp b/unittests/Tooling/CompilationDatabaseTest.cpp
index 380d86f..13d6023 100644
--- a/unittests/Tooling/CompilationDatabaseTest.cpp
+++ b/unittests/Tooling/CompilationDatabaseTest.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/AST/ASTConsumer.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclGroup.h"
 #include "clang/Frontend/FrontendAction.h"
diff --git a/unittests/Tooling/FixItTest.cpp b/unittests/Tooling/FixItTest.cpp
new file mode 100644
index 0000000..365180e
--- /dev/null
+++ b/unittests/Tooling/FixItTest.cpp
@@ -0,0 +1,232 @@
+//===- unittests/Tooling/FixItTest.cpp -----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestVisitor.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Tooling/FixIt.h"
+
+using namespace clang;
+
+using tooling::fixit::getText;
+using tooling::fixit::createRemoval;
+using tooling::fixit::createReplacement;
+
+namespace {
+
+struct CallsVisitor : TestVisitor<CallsVisitor> {
+  bool VisitCallExpr(CallExpr *Expr) {
+    OnCall(Expr, Context);
+    return true;
+  }
+
+  std::function<void(CallExpr *, ASTContext *Context)> OnCall;
+};
+
+std::string LocationToString(SourceLocation Loc, ASTContext *Context) {
+  return Loc.printToString(Context->getSourceManager());
+}
+
+TEST(FixItTest, getText) {
+  CallsVisitor Visitor;
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    EXPECT_EQ("foo(x, y)", getText(*CE, *Context));
+    EXPECT_EQ("foo(x, y)", getText(CE->getSourceRange(), *Context));
+
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    EXPECT_EQ("x", getText(*P0, *Context));
+    EXPECT_EQ("y", getText(*P1, *Context));
+  };
+  Visitor.runOver("void foo(int x, int y) { foo(x, y); }");
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    EXPECT_EQ("APPLY(foo, x, y)", getText(*CE, *Context));
+  };
+  Visitor.runOver("#define APPLY(f, x, y) f(x, y)\n"
+                  "void foo(int x, int y) { APPLY(foo, x, y); }");
+}
+
+TEST(FixItTest, getTextWithMacro) {
+  CallsVisitor Visitor;
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    EXPECT_EQ("F OO", getText(*CE, *Context));
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    EXPECT_EQ("", getText(*P0, *Context));
+    EXPECT_EQ("", getText(*P1, *Context));
+  };
+  Visitor.runOver("#define F foo(\n"
+                  "#define OO x, y)\n"
+                  "void foo(int x, int y) { F OO ; }");
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    EXPECT_EQ("", getText(*CE, *Context));
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    EXPECT_EQ("x", getText(*P0, *Context));
+    EXPECT_EQ("y", getText(*P1, *Context));
+  };
+  Visitor.runOver("#define FOO(x, y) (void)x; (void)y; foo(x, y);\n"
+                  "void foo(int x, int y) { FOO(x,y) }");
+}
+
+TEST(FixItTest, createRemoval) {
+  CallsVisitor Visitor;
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    FixItHint Hint = createRemoval(*CE);
+    EXPECT_EQ("foo(x, y)", getText(Hint.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint.CodeToInsert.empty());
+
+    Expr *P0 = CE->getArg(0);
+    FixItHint Hint0 = createRemoval(*P0);
+    EXPECT_EQ("x", getText(Hint0.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint0.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint0.CodeToInsert.empty());
+
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint1 = createRemoval(*P1);
+    EXPECT_EQ("y", getText(Hint1.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint1.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint1.CodeToInsert.empty());
+  };
+  Visitor.runOver("void foo(int x, int y) { foo(x, y); }");
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    Expr *P0 = CE->getArg(0);
+    FixItHint Hint0 = createRemoval(*P0);
+    EXPECT_EQ("x + y", getText(Hint0.RemoveRange.getAsRange(), *Context));
+
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint1 = createRemoval(*P1);
+    EXPECT_EQ("y + x", getText(Hint1.RemoveRange.getAsRange(), *Context));
+  };
+  Visitor.runOver("void foo(int x, int y) { foo(x + y, y + x); }");
+}
+
+TEST(FixItTest, createRemovalWithMacro) {
+  CallsVisitor Visitor;
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    FixItHint Hint = createRemoval(*CE);
+    EXPECT_EQ("FOO", getText(Hint.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint.CodeToInsert.empty());
+
+    Expr *P0 = CE->getArg(0);
+    FixItHint Hint0 = createRemoval(*P0);
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:17>",
+              LocationToString(Hint0.RemoveRange.getBegin(), Context));
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:17>",
+              LocationToString(Hint0.RemoveRange.getEnd(), Context));
+    EXPECT_TRUE(Hint0.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint0.CodeToInsert.empty());
+
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint1 = createRemoval(*P1);
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:20>",
+              LocationToString(Hint1.RemoveRange.getBegin(), Context));
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:20>",
+              LocationToString(Hint1.RemoveRange.getEnd(), Context));
+    EXPECT_TRUE(Hint1.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint1.CodeToInsert.empty());
+  };
+  Visitor.runOver("#define FOO foo(1, 1)\n"
+                  "void foo(int x, int y) { FOO; }");
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    FixItHint Hint = createRemoval(*CE);
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:37>",
+              LocationToString(Hint.RemoveRange.getBegin(), Context));
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:45>",
+              LocationToString(Hint.RemoveRange.getEnd(), Context));
+    EXPECT_TRUE(Hint.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint.CodeToInsert.empty());
+  };
+  Visitor.runOver("#define FOO(x, y) (void)x; (void)y; foo(x, y);\n"
+                  "void foo(int x, int y) { FOO(x,y) }");
+}
+
+TEST(FixItTest, createReplacement) {
+  CallsVisitor Visitor;
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint0 = createReplacement(*P0, *P1, *Context);
+    FixItHint Hint1 = createReplacement(*P1, *P0, *Context);
+
+    // Hint0 removes the first argument "x" and inserts the text "y".
+    EXPECT_EQ("x", getText(Hint0.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint0.InsertFromRange.isInvalid());
+    EXPECT_EQ("y", Hint0.CodeToInsert);
+
+    // Hint1 removes the second argument "y" and inserts the text "x".
+    EXPECT_EQ("y", getText(Hint1.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint1.InsertFromRange.isInvalid());
+    EXPECT_EQ("x", Hint1.CodeToInsert);
+  };
+  // Run the same checks over a plain call and two macro-expanded calls.
+  Visitor.runOver("void foo(int x, int y) { foo(x, y); }");
+
+  Visitor.runOver("#define APPLY(f, x, y) f(x, y)\n"
+                  "void foo(int x, int y) { APPLY(foo, x, y); }");
+
+  Visitor.runOver("#define APPLY(f, P) f(P)\n"
+                  "#define PAIR(x, y) x, y\n"
+                  "void foo(int x, int y) { APPLY(foo, PAIR(x, y)); }\n");
+}
+
+TEST(FixItTest, createReplacementWithMacro) {
+  CallsVisitor Visitor;
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint = createReplacement(*P0, *P1, *Context);
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:17>",
+              LocationToString(Hint.RemoveRange.getBegin(), Context));
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:1:17>",
+              LocationToString(Hint.RemoveRange.getEnd(), Context));
+    EXPECT_TRUE(Hint.InsertFromRange.isInvalid());
+    EXPECT_TRUE(Hint.CodeToInsert.empty());
+  };
+
+  Visitor.runOver("#define FOO foo(1, 1)\n"
+                  "void foo(int x, int y) { FOO; }");
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint = createReplacement(*P0, *P1, *Context);
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:2:30>",
+              LocationToString(Hint.RemoveRange.getBegin(), Context));
+    EXPECT_EQ("input.cc:2:26 <Spelling=input.cc:2:30>",
+              LocationToString(Hint.RemoveRange.getEnd(), Context));
+    EXPECT_TRUE(Hint.InsertFromRange.isInvalid());
+    EXPECT_EQ("y", Hint.CodeToInsert);
+  };
+  Visitor.runOver("#define FOO(x, y) (void)x; (void)y; foo(x, y);\n"
+                  "void foo(int x, int y) { FOO(x,y) }");
+
+  Visitor.OnCall = [](CallExpr *CE, ASTContext *Context) {
+    Expr *P0 = CE->getArg(0);
+    Expr *P1 = CE->getArg(1);
+    FixItHint Hint = createReplacement(*P0, *P1, *Context);
+    EXPECT_EQ("x + y", getText(Hint.RemoveRange.getAsRange(), *Context));
+    EXPECT_TRUE(Hint.InsertFromRange.isInvalid());
+    EXPECT_EQ("y + x", Hint.CodeToInsert);
+  };
+  Visitor.runOver("void foo(int x, int y) { foo(x + y, y + x); }");
+}
+
+} // end anonymous namespace
diff --git a/unittests/Tooling/LookupTest.cpp b/unittests/Tooling/LookupTest.cpp
index d847a29..842d587 100644
--- a/unittests/Tooling/LookupTest.cpp
+++ b/unittests/Tooling/LookupTest.cpp
@@ -103,6 +103,14 @@
   };
   Visitor.runOver(
       "namespace a { int foo(); }\nusing a::foo;\nauto f = foo();\n");
+
+  Visitor.OnCall = [&](CallExpr *Expr) {
+    EXPECT_EQ("c::bar", replaceCallExpr(Expr, "::a::c::bar"));
+  };
+  Visitor.runOver("namespace a { namespace b { void foo(); } }\n"
+                  "namespace a { namespace b { namespace {"
+                  "void f() { foo(); }"
+                  "} } }\n");
 }
 
 } // end anonymous namespace
diff --git a/unittests/Tooling/Makefile b/unittests/Tooling/Makefile
deleted file mode 100644
index 93483d9..0000000
--- a/unittests/Tooling/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- unittests/Tooling/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = Tooling
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser bitreader support mc option
-USEDLIBS = clangTooling.a clangToolingCore.a clangFrontend.a \
-	   clangSerialization.a clangDriver.a \
-           clangParse.a clangRewrite.a clangRewriteFrontend.a \
-	   clangSema.a clangAnalysis.a clangEdit.a \
-           clangAST.a clangASTMatchers.a clangLex.a clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/unittests/Tooling/QualTypeNamesTest.cpp b/unittests/Tooling/QualTypeNamesTest.cpp
new file mode 100644
index 0000000..edd5060
--- /dev/null
+++ b/unittests/Tooling/QualTypeNamesTest.cpp
@@ -0,0 +1,222 @@
+//===- unittests/Tooling/QualTypeNamesTest.cpp ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Core/QualTypeNames.h"
+#include "TestVisitor.h"
+using namespace clang;
+
+namespace {
+struct TypeNameVisitor : TestVisitor<TypeNameVisitor> {
+  // Declaration name -> fully qualified type name expected for its type.
+  llvm::StringMap<std::string> ExpectedQualTypeNames;
+  // Passed through to TypeName::getFullyQualifiedName.
+  bool WithGlobalNsPrefix = false;
+
+  // NOTE(review): lower-case 't'; presumably never called as a hook.
+  bool traverseDecl(Decl *D) { return true; } // Always continue
+
+  // ValueDecls are the least-derived decl with both a qualtype and a
+  // name, so this is where registered expectations are checked.
+  bool VisitValueDecl(const ValueDecl *VD) {
+    std::string ExpectedName =
+        ExpectedQualTypeNames.lookup(VD->getNameAsString());
+    if (ExpectedName.empty())
+      return true; // No expectation registered for this declaration.
+    std::string ActualName = TypeName::getFullyQualifiedName(
+        VD->getType(), *Context, WithGlobalNsPrefix);
+    if (ExpectedName != ActualName) {
+      // A custom message makes it much easier to see what declaration
+      // failed compared to EXPECT_EQ.
+      EXPECT_TRUE(false) << "TypeName::getFullyQualifiedName failed for "
+                         << VD->getQualifiedNameAsString() << std::endl
+                         << "   Actual: " << ActualName << std::endl
+                         << " Expected: " << ExpectedName;
+    }
+    return true;
+  }
+};
+
+// named namespaces inside anonymous namespaces
+
+TEST(QualTypeNameTest, getFullyQualifiedName) {
+  TypeNameVisitor Visitor;
+  // Simple case to test the test framework itself.
+  Visitor.ExpectedQualTypeNames["CheckInt"] = "int";
+
+  // Keeping the names of the variables whose types we check unique
+  // within the entire test--regardless of their own scope--makes it
+  // easier to diagnose test failures.
+
+  // Simple namespace qualifier
+  Visitor.ExpectedQualTypeNames["CheckA"] = "A::B::Class0";
+  // Lookup up the enclosing scopes, then down another one. (These
+  // appear as elaborated type in the AST. In that case--even if
+  // policy.SuppressScope = 0--qual_type.getAsString(policy) only
+  // gives the name as it appears in the source, not the full name.
+  Visitor.ExpectedQualTypeNames["CheckB"] = "A::B::C::Class1";
+  // Template parameter expansion.
+  Visitor.ExpectedQualTypeNames["CheckC"] =
+      "A::B::Template0<A::B::C::MyInt, A::B::AnotherClass>";
+  // Recursive template parameter expansion.
+  Visitor.ExpectedQualTypeNames["CheckD"] =
+      "A::B::Template0<A::B::Template1<A::B::C::MyInt, A::B::AnotherClass>, "
+      "A::B::Template0<int, long> >";
+  // Variadic Template expansion.
+  Visitor.ExpectedQualTypeNames["CheckE"] =
+      "A::Variadic<int, A::B::Template0<int, char>, "
+      "A::B::Template1<int, long>, A::B::C::MyInt>";
+  // Using declarations should be fully expanded.
+  Visitor.ExpectedQualTypeNames["CheckF"] = "A::B::Class0";
+  // Elements found within "using namespace foo;" should be fully
+  // expanded.
+  Visitor.ExpectedQualTypeNames["CheckG"] = "A::B::C::MyInt";
+  // Type inside function
+  Visitor.ExpectedQualTypeNames["CheckH"] = "struct X";
+  // Anonymous Namespaces
+  Visitor.ExpectedQualTypeNames["CheckI"] = "aClass";
+  // Keyword inclusion with namespaces
+  Visitor.ExpectedQualTypeNames["CheckJ"] = "struct A::aStruct";
+  // Anonymous Namespaces nested in named namespaces and vice-versa.
+  Visitor.ExpectedQualTypeNames["CheckK"] = "D::aStruct";
+  // Namespace alias
+  Visitor.ExpectedQualTypeNames["CheckL"] = "A::B::C::MyInt";
+  Visitor.ExpectedQualTypeNames["non_dependent_type_var"] =
+      "Foo<X>::non_dependent_type";
+  Visitor.ExpectedQualTypeNames["AnEnumVar"] = "EnumScopeClass::AnEnum";
+  Visitor.ExpectedQualTypeNames["AliasTypeVal"] = "A::B::C::InnerAlias<int>";
+  Visitor.ExpectedQualTypeNames["CheckM"] = "const A::B::Class0 *";
+  Visitor.ExpectedQualTypeNames["CheckN"] = "const X *";
+  Visitor.runOver(
+      "int CheckInt;\n"
+      "template <typename T>\n"
+      "class OuterTemplateClass { };\n"
+      "namespace A {\n"
+      " namespace B {\n"
+      "   class Class0 { };\n"
+      "   namespace C {\n"
+      "     typedef int MyInt;"
+      "     template <typename T>\n"
+      "     using InnerAlias = OuterTemplateClass<T>;\n"
+      "     InnerAlias<int> AliasTypeVal;\n"
+      "   }\n"
+      "   template<class X, class Y> class Template0;"
+      "   template<class X, class Y> class Template1;"
+      "   typedef B::Class0 AnotherClass;\n"
+      "   void Function1(Template0<C::MyInt,\n"
+      "                  AnotherClass> CheckC);\n"
+      "   void Function2(Template0<Template1<C::MyInt, AnotherClass>,\n"
+      "                            Template0<int, long> > CheckD);\n"
+      "   void Function3(const B::Class0* CheckM);\n"
+      "  }\n"
+      "template<typename... Values> class Variadic {};\n"
+      "Variadic<int, B::Template0<int, char>, "
+      "         B::Template1<int, long>, "
+      "         B::C::MyInt > CheckE;\n"
+      " namespace BC = B::C;\n"
+      " BC::MyInt CheckL;\n"
+      "}\n"
+      "using A::B::Class0;\n"
+      "void Function(Class0 CheckF);\n"
+      "using namespace A::B::C;\n"
+      "void Function(MyInt CheckG);\n"
+      "void f() {\n"
+      "  struct X {} CheckH;\n"
+      "}\n"
+      "struct X;\n"
+      "void f(const ::X* CheckN) {}\n"
+      "namespace {\n"
+      "  class aClass {};\n"
+      "   aClass CheckI;\n"
+      "}\n"
+      "namespace A {\n"
+      "  struct aStruct {} CheckJ;\n"
+      "}\n"
+      "namespace {\n"
+      "  namespace D {\n"
+      "    namespace {\n"
+      "      class aStruct {};\n"
+      "      aStruct CheckK;\n"
+      "    }\n"
+      "  }\n"
+      "}\n"
+      "template<class T> struct Foo {\n"
+      "  typedef typename T::A dependent_type;\n"
+      "  typedef int non_dependent_type;\n"
+      "  dependent_type dependent_type_var;\n"
+      "  non_dependent_type non_dependent_type_var;\n"
+      "};\n"
+      "struct X { typedef int A; };"
+      "Foo<X> var;"
+      "void F() {\n"
+      "  var.dependent_type_var = 0;\n"
+      "var.non_dependent_type_var = 0;\n"
+      "}\n"
+      "class EnumScopeClass {\n"
+      "public:\n"
+      "  enum AnEnum { ZERO, ONE };\n"
+      "};\n"
+      "EnumScopeClass::AnEnum AnEnumVar;\n",
+      TypeNameVisitor::Lang_CXX11
+);
+
+  TypeNameVisitor Complex;
+  Complex.ExpectedQualTypeNames["CheckTX"] = "B::TX";
+  Complex.runOver(
+      "namespace A {"
+      "  struct X {};"
+      "}"
+      "using A::X;"
+      "namespace fake_std {"
+      "  template<class... Types > class tuple {};"
+      "}"
+      "namespace B {"
+      "  using fake_std::tuple;"
+      "  typedef tuple<X> TX;"
+      "  TX CheckTX;"
+      "  struct A { typedef int X; };"
+      "}");
+
+  TypeNameVisitor GlobalNsPrefix;
+  GlobalNsPrefix.WithGlobalNsPrefix = true;
+  GlobalNsPrefix.ExpectedQualTypeNames["IntVal"] = "int";
+  GlobalNsPrefix.ExpectedQualTypeNames["BoolVal"] = "bool";
+  GlobalNsPrefix.ExpectedQualTypeNames["XVal"] = "::A::B::X";
+  GlobalNsPrefix.ExpectedQualTypeNames["IntAliasVal"] = "::A::B::Alias<int>";
+  GlobalNsPrefix.ExpectedQualTypeNames["ZVal"] = "::A::B::Y::Z";
+  GlobalNsPrefix.ExpectedQualTypeNames["GlobalZVal"] = "::Z";
+  GlobalNsPrefix.ExpectedQualTypeNames["CheckK"] = "D::aStruct";
+  GlobalNsPrefix.runOver(
+      "namespace A {\n"
+      "  namespace B {\n"
+      "    int IntVal;\n"
+      "    bool BoolVal;\n"
+      "    struct X {};\n"
+      "    X XVal;\n"
+      "    template <typename T> class CCC { };\n"
+      "    template <typename T>\n"
+      "    using Alias = CCC<T>;\n"
+      "    Alias<int> IntAliasVal;\n"
+      "    struct Y { struct Z {}; };\n"
+      "    Y::Z ZVal;\n"
+      "  }\n"
+      "}\n"
+      "struct Z {};\n"
+      "Z GlobalZVal;\n"
+      "namespace {\n"
+      "  namespace D {\n"
+      "    namespace {\n"
+      "      class aStruct {};\n"
+      "      aStruct CheckK;\n"
+      "    }\n"
+      "  }\n"
+      "}\n"
+  );
+}
+
+}  // end anonymous namespace
diff --git a/unittests/Tooling/RecursiveASTVisitorTest.cpp b/unittests/Tooling/RecursiveASTVisitorTest.cpp
index c287045..991ae8b 100644
--- a/unittests/Tooling/RecursiveASTVisitorTest.cpp
+++ b/unittests/Tooling/RecursiveASTVisitorTest.cpp
@@ -42,13 +42,13 @@
   LambdaExprVisitor Visitor;
   Visitor.ExpectMatch("", 1, 12);
   EXPECT_TRUE(Visitor.runOver("void f() { []{ return; }(); }",
-			      LambdaExprVisitor::Lang_CXX11));
+                              LambdaExprVisitor::Lang_CXX11));
 }
 
 TEST(RecursiveASTVisitor, TraverseLambdaBodyCanBeOverridden) {
   LambdaExprVisitor Visitor;
   EXPECT_TRUE(Visitor.runOver("void f() { []{ return; }(); }",
-			      LambdaExprVisitor::Lang_CXX11));
+                              LambdaExprVisitor::Lang_CXX11));
   EXPECT_TRUE(Visitor.allBodiesHaveBeenTraversed());
 }
 
@@ -92,8 +92,7 @@
 
 TEST(RecursiveASTVisitor, LambdaClosureTypesAreImplicit) {
   ClassVisitor Visitor;
-  EXPECT_TRUE(Visitor.runOver("auto lambda = []{};",
-			      ClassVisitor::Lang_CXX11));
+  EXPECT_TRUE(Visitor.runOver("auto lambda = []{};", ClassVisitor::Lang_CXX11));
   EXPECT_TRUE(Visitor.sawOnlyImplicitLambdaClasses());
 }
 
@@ -134,4 +133,23 @@
     "};\n"));
 }
 
+// Check to ensure that VarDecls are visited.
+class VarDeclVisitor : public ExpectedLocationVisitor<VarDeclVisitor> {
+public:
+  bool VisitVarDecl(VarDecl *VD) {
+    Match(VD->getNameAsString(), VD->getLocStart());
+    return true;
+  }
+};
+
+TEST(RecursiveASTVisitor, ArrayInitializersAreVisited) {
+  VarDeclVisitor Visitor;
+  Visitor.ExpectMatch("__i0", 1, 8);
+  EXPECT_TRUE(
+      Visitor.runOver("struct MyClass {\n"
+                      "  int c[1];\n"
+                      "  static MyClass Create() { return MyClass(); }\n"
+                      "};\n"));
+}
+
 } // end anonymous namespace
diff --git a/unittests/Tooling/RecursiveASTVisitorTestCallVisitor.cpp b/unittests/Tooling/RecursiveASTVisitorTestCallVisitor.cpp
index f8ff5bd..b981585 100644
--- a/unittests/Tooling/RecursiveASTVisitorTestCallVisitor.cpp
+++ b/unittests/Tooling/RecursiveASTVisitorTestCallVisitor.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "TestVisitor.h"
-#include <stack>
 
 using namespace clang;
 
diff --git a/unittests/Tooling/RecursiveASTVisitorTestDeclVisitor.cpp b/unittests/Tooling/RecursiveASTVisitorTestDeclVisitor.cpp
index 02676a7..63bfb8b 100644
--- a/unittests/Tooling/RecursiveASTVisitorTestDeclVisitor.cpp
+++ b/unittests/Tooling/RecursiveASTVisitorTestDeclVisitor.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "TestVisitor.h"
-#include <stack>
 
 using namespace clang;
 
diff --git a/unittests/Tooling/RecursiveASTVisitorTestExprVisitor.cpp b/unittests/Tooling/RecursiveASTVisitorTestExprVisitor.cpp
index 6af5906..d39ca4b 100644
--- a/unittests/Tooling/RecursiveASTVisitorTestExprVisitor.cpp
+++ b/unittests/Tooling/RecursiveASTVisitorTestExprVisitor.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "TestVisitor.h"
-#include <stack>
 
 using namespace clang;
 
@@ -192,6 +191,14 @@
     "void x(); void y() { x(); }"));
 }
 
+TEST(RecursiveASTVisitor, VisitsLambdaCaptureInit) {
+  DeclRefExprVisitor Visitor;
+  Visitor.ExpectMatch("i", 1, 20);
+  EXPECT_TRUE(Visitor.runOver(
+    "void f() { int i; [i]{}; };",
+    DeclRefExprVisitor::Lang_CXX11));
+}
+
 /* FIXME: According to Richard Smith this is a bug in the AST.
 TEST(RecursiveASTVisitor, VisitsBaseClassTemplateArgumentsInInstantiation) {
   DeclRefExprVisitor Visitor;
diff --git a/unittests/Tooling/RecursiveASTVisitorTestTypeLocVisitor.cpp b/unittests/Tooling/RecursiveASTVisitorTestTypeLocVisitor.cpp
index 63e2e8b..dc2adaf 100644
--- a/unittests/Tooling/RecursiveASTVisitorTestTypeLocVisitor.cpp
+++ b/unittests/Tooling/RecursiveASTVisitorTestTypeLocVisitor.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "TestVisitor.h"
-#include <stack>
 
 using namespace clang;
 
diff --git a/unittests/Tooling/RefactoringTest.cpp b/unittests/Tooling/RefactoringTest.cpp
index ff11aea..d5877ac 100644
--- a/unittests/Tooling/RefactoringTest.cpp
+++ b/unittests/Tooling/RefactoringTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ReplacementTest.h"
 #include "RewriterTestContext.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
@@ -18,6 +19,7 @@
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
@@ -25,22 +27,11 @@
 #include "clang/Tooling/Refactoring.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Support/Path.h"
 #include "gtest/gtest.h"
 
 namespace clang {
 namespace tooling {
 
-class ReplacementTest : public ::testing::Test {
- protected:
-  Replacement createReplacement(SourceLocation Start, unsigned Length,
-                                llvm::StringRef ReplacementText) {
-    return Replacement(Context.Sources, Start, Length, ReplacementText);
-  }
-
-  RewriterTestContext Context;
-};
-
 TEST_F(ReplacementTest, CanDeleteAllText) {
   FileID ID = Context.createInMemoryFile("input.cpp", "text");
   SourceLocation Location = Context.getLocation(ID, 1, 1);
@@ -108,29 +99,92 @@
   EXPECT_TRUE(Replace2.getFilePath().empty());
 }
 
+TEST_F(ReplacementTest, FailAddReplacements) {
+  Replacements Replaces;
+  auto Err = Replaces.add(Replacement("x.cc", 0, 10, "3"));
+  EXPECT_TRUE(!Err);
+  llvm::consumeError(std::move(Err));
+  Err = Replaces.add(Replacement("x.cc", 0, 2, ""));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+  Err = Replaces.add(Replacement("x.cc", 2, 2, ""));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+  Err = Replaces.add(Replacement("y.cc", 20, 2, ""));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+}
+
+TEST_F(ReplacementTest, FailAddOverlappingInsertions) {
+  Replacements Replaces;
+  // Test adding an insertion at the offset of an existing replacement.
+  auto Err = Replaces.add(Replacement("x.cc", 10, 3, "replace"));
+  EXPECT_TRUE(!Err);
+  llvm::consumeError(std::move(Err));
+  Err = Replaces.add(Replacement("x.cc", 10, 0, "insert"));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+
+  Replaces.clear();
+  // Test overlap with an existing insertion.
+  Err = Replaces.add(Replacement("x.cc", 10, 0, "insert"));
+  EXPECT_TRUE(!Err);
+  llvm::consumeError(std::move(Err));
+  Err = Replaces.add(Replacement("x.cc", 10, 3, "replace"));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+}
+
+TEST_F(ReplacementTest, FailAddRegression) {
+  Replacements Replaces;
+  // Create two replacements, where the second one is an insertion of the empty
+  // string exactly at the end of the first one.
+  auto Err = Replaces.add(Replacement("x.cc", 0, 10, "1"));
+  EXPECT_TRUE(!Err);
+  llvm::consumeError(std::move(Err));
+  Err = Replaces.add(Replacement("x.cc", 10, 0, ""));
+  EXPECT_TRUE(!Err);
+  llvm::consumeError(std::move(Err));
+
+  // Make sure we find the overlap with the first entry when inserting a
+  // replacement that ends exactly at the seam of the existing replacements.
+  Err = Replaces.add(Replacement("x.cc", 5, 5, "fail"));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+
+  Err = Replaces.add(Replacement("x.cc", 10, 0, ""));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+}
+
+TEST_F(ReplacementTest, FailAddInsertAtOffsetOfReplacement) {
+  Replacements Replaces;
+  llvm::Error Deletion = Replaces.add(Replacement("x.cc", 10, 2, ""));
+  EXPECT_FALSE(static_cast<bool>(Deletion)); // First add must succeed.
+  llvm::consumeError(std::move(Deletion));
+  llvm::Error Insertion = Replaces.add(Replacement("x.cc", 10, 0, ""));
+  EXPECT_TRUE(static_cast<bool>(Insertion)); // Same offset: must be rejected.
+  llvm::consumeError(std::move(Insertion));
+}
+
+TEST_F(ReplacementTest, FailAddInsertAtOtherInsert) {
+  Replacements Replaces;
+  llvm::Error FirstInsert = Replaces.add(Replacement("x.cc", 10, 0, "a"));
+  EXPECT_FALSE(static_cast<bool>(FirstInsert)); // First add must succeed.
+  llvm::consumeError(std::move(FirstInsert));
+  llvm::Error SecondInsert = Replaces.add(Replacement("x.cc", 10, 0, "b"));
+  EXPECT_TRUE(static_cast<bool>(SecondInsert)); // Same offset: rejected.
+  llvm::consumeError(std::move(SecondInsert));
+}
+
 TEST_F(ReplacementTest, CanApplyReplacements) {
   FileID ID = Context.createInMemoryFile("input.cpp",
                                          "line1\nline2\nline3\nline4");
-  Replacements Replaces;
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
-                              5, "replaced"));
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(ID, 3, 1),
-                              5, "other"));
-  EXPECT_TRUE(applyAllReplacements(Replaces, Context.Rewrite));
-  EXPECT_EQ("line1\nreplaced\nother\nline4", Context.getRewrittenText(ID));
-}
-
-// FIXME: Remove this test case when Replacements is implemented as std::vector
-// instead of std::set. The other ReplacementTest tests will need to be updated
-// at that point as well.
-TEST_F(ReplacementTest, VectorCanApplyReplacements) {
-  FileID ID = Context.createInMemoryFile("input.cpp",
-                                         "line1\nline2\nline3\nline4");
-  std::vector<Replacement> Replaces;
-  Replaces.push_back(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
-                                 5, "replaced"));
-  Replaces.push_back(
-      Replacement(Context.Sources, Context.getLocation(ID, 3, 1), 5, "other"));
+  Replacements Replaces =
+      toReplacements({Replacement(Context.Sources,
+                                  Context.getLocation(ID, 2, 1), 5, "replaced"),
+                      Replacement(Context.Sources,
+                                  Context.getLocation(ID, 3, 1), 5, "other")});
   EXPECT_TRUE(applyAllReplacements(Replaces, Context.Rewrite));
   EXPECT_EQ("line1\nreplaced\nother\nline4", Context.getRewrittenText(ID));
 }
@@ -138,84 +192,103 @@
 TEST_F(ReplacementTest, SkipsDuplicateReplacements) {
   FileID ID = Context.createInMemoryFile("input.cpp",
                                          "line1\nline2\nline3\nline4");
-  Replacements Replaces;
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
-                              5, "replaced"));
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
-                              5, "replaced"));
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
-                              5, "replaced"));
+  auto Replaces = toReplacements({Replacement(
+      Context.Sources, Context.getLocation(ID, 2, 1), 5, "replaced")});
+
+  auto Err = Replaces.add(Replacement(
+      Context.Sources, Context.getLocation(ID, 2, 1), 5, "replaced"));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+
+  Err = Replaces.add(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
+                                 5, "replaced"));
+  EXPECT_TRUE((bool)Err);
+  llvm::consumeError(std::move(Err));
+
   EXPECT_TRUE(applyAllReplacements(Replaces, Context.Rewrite));
   EXPECT_EQ("line1\nreplaced\nline3\nline4", Context.getRewrittenText(ID));
 }
 
-TEST_F(ReplacementTest, ApplyAllFailsIfOneApplyFails) {
-  // This test depends on the value of the file name of an invalid source
-  // location being in the range ]a, z[.
-  FileID IDa = Context.createInMemoryFile("a.cpp", "text");
-  FileID IDz = Context.createInMemoryFile("z.cpp", "text");
-  Replacements Replaces;
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(IDa, 1, 1),
-                              4, "a"));
-  Replaces.insert(Replacement(Context.Sources, SourceLocation(),
-                              5, "2"));
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(IDz, 1, 1),
-                              4, "z"));
+TEST_F(ReplacementTest, InvalidSourceLocationFailsApplyAll) {
+  Replacements Replaces =
+      toReplacements({Replacement(Context.Sources, SourceLocation(), 5, "2")});
+
   EXPECT_FALSE(applyAllReplacements(Replaces, Context.Rewrite));
-  EXPECT_EQ("a", Context.getRewrittenText(IDa));
-  EXPECT_EQ("z", Context.getRewrittenText(IDz));
+}
+
+TEST_F(ReplacementTest, MultipleFilesReplaceAndFormat) {
+  // Column limit is 20.
+  std::string Code1 = "Long *a =\n"
+                      "    new Long();\n"
+                      "long x = 1;";
+  std::string Expected1 = "auto a = new Long();\n"
+                          "long x =\n"
+                          "    12345678901;";
+  std::string Code2 = "int x = 123;\n"
+                      "int y = 0;";
+  std::string Expected2 = "int x =\n"
+                          "    1234567890123;\n"
+                          "int y = 10;";
+  StringRef File1 = "format_1.cpp";
+  StringRef File2 = "format_2.cpp";
+  FileID ID1 = Context.createInMemoryFile(File1, Code1);
+  FileID ID2 = Context.createInMemoryFile(File2, Code2);
+
+  // Scrambled the order of replacements.
+  std::map<std::string, Replacements> FileToReplaces;
+  FileToReplaces[File1] = toReplacements(
+      {tooling::Replacement(Context.Sources, Context.getLocation(ID1, 1, 1), 6,
+                            "auto "),
+       tooling::Replacement(Context.Sources, Context.getLocation(ID1, 3, 10), 1,
+                            "12345678901")});
+  FileToReplaces[File2] = toReplacements(
+      {tooling::Replacement(Context.Sources, Context.getLocation(ID2, 1, 12), 0,
+                            "4567890123"),
+       tooling::Replacement(Context.Sources, Context.getLocation(ID2, 2, 9), 1,
+                            "10")});
+  EXPECT_TRUE(
+      formatAndApplyAllReplacements(FileToReplaces, Context.Rewrite,
+                                    "{BasedOnStyle: LLVM, ColumnLimit: 20}"));
+  EXPECT_EQ(Expected1, Context.getRewrittenText(ID1));
+  EXPECT_EQ(Expected2, Context.getRewrittenText(ID2));
 }
 
 TEST(ShiftedCodePositionTest, FindsNewCodePosition) {
-  Replacements Replaces;
-  Replaces.insert(Replacement("", 0, 1, ""));
-  Replaces.insert(Replacement("", 4, 3, " "));
+  Replacements Replaces =
+      toReplacements({Replacement("", 0, 1, ""), Replacement("", 4, 3, " ")});
   // Assume ' int   i;' is turned into 'int i;' and cursor is located at '|'.
-  EXPECT_EQ(0u, shiftedCodePosition(Replaces, 0)); // |int   i;
-  EXPECT_EQ(0u, shiftedCodePosition(Replaces, 1)); //  |nt   i;
-  EXPECT_EQ(1u, shiftedCodePosition(Replaces, 2)); //  i|t   i;
-  EXPECT_EQ(2u, shiftedCodePosition(Replaces, 3)); //  in|   i;
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 4)); //  int|  i;
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 5)); //  int | i;
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 6)); //  int  |i;
-  EXPECT_EQ(4u, shiftedCodePosition(Replaces, 7)); //  int   |;
-  EXPECT_EQ(5u, shiftedCodePosition(Replaces, 8)); //  int   i|
-}
-
-// FIXME: Remove this test case when Replacements is implemented as std::vector
-// instead of std::set. The other ReplacementTest tests will need to be updated
-// at that point as well.
-TEST(ShiftedCodePositionTest, VectorFindsNewCodePositionWithInserts) {
-  std::vector<Replacement> Replaces;
-  Replaces.push_back(Replacement("", 0, 1, ""));
-  Replaces.push_back(Replacement("", 4, 3, " "));
-  // Assume ' int   i;' is turned into 'int i;' and cursor is located at '|'.
-  EXPECT_EQ(0u, shiftedCodePosition(Replaces, 0)); // |int   i;
-  EXPECT_EQ(0u, shiftedCodePosition(Replaces, 1)); //  |nt   i;
-  EXPECT_EQ(1u, shiftedCodePosition(Replaces, 2)); //  i|t   i;
-  EXPECT_EQ(2u, shiftedCodePosition(Replaces, 3)); //  in|   i;
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 4)); //  int|  i;
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 5)); //  int | i;
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 6)); //  int  |i;
-  EXPECT_EQ(4u, shiftedCodePosition(Replaces, 7)); //  int   |;
-  EXPECT_EQ(5u, shiftedCodePosition(Replaces, 8)); //  int   i|
+  EXPECT_EQ(0u, Replaces.getShiftedCodePosition(0)); // |int   i;
+  EXPECT_EQ(0u, Replaces.getShiftedCodePosition(1)); //  |nt   i;
+  EXPECT_EQ(1u, Replaces.getShiftedCodePosition(2)); //  i|t   i;
+  EXPECT_EQ(2u, Replaces.getShiftedCodePosition(3)); //  in|   i;
+  EXPECT_EQ(3u, Replaces.getShiftedCodePosition(4)); //  int|  i;
+  EXPECT_EQ(3u, Replaces.getShiftedCodePosition(5)); //  int | i;
+  EXPECT_EQ(3u, Replaces.getShiftedCodePosition(6)); //  int  |i;
+  EXPECT_EQ(4u, Replaces.getShiftedCodePosition(7)); //  int   |;
+  EXPECT_EQ(5u, Replaces.getShiftedCodePosition(8)); //  int   i|
 }
 
 TEST(ShiftedCodePositionTest, FindsNewCodePositionWithInserts) {
-  Replacements Replaces;
-  Replaces.insert(Replacement("", 4, 0, "\"\n\""));
+  Replacements Replaces = toReplacements({Replacement("", 4, 0, "\"\n\"")});
   // Assume '"12345678"' is turned into '"1234"\n"5678"'.
-  EXPECT_EQ(3u, shiftedCodePosition(Replaces, 3)); // "123|5678"
-  EXPECT_EQ(7u, shiftedCodePosition(Replaces, 4)); // "1234|678"
-  EXPECT_EQ(8u, shiftedCodePosition(Replaces, 5)); // "12345|78"
+  EXPECT_EQ(3u, Replaces.getShiftedCodePosition(3)); // "123|5678"
+  EXPECT_EQ(7u, Replaces.getShiftedCodePosition(4)); // "1234|678"
+  EXPECT_EQ(8u, Replaces.getShiftedCodePosition(5)); // "12345|78"
 }
 
 TEST(ShiftedCodePositionTest, FindsNewCodePositionInReplacedText) {
-  Replacements Replaces;
   // Replace the first four characters with "abcd".
-  Replaces.insert(Replacement("", 0, 4, "abcd"));
+  auto Replaces = toReplacements({Replacement("", 0, 4, "abcd")});
   for (unsigned i = 0; i < 3; ++i)
-    EXPECT_EQ(i, shiftedCodePosition(Replaces, i));
+    EXPECT_EQ(i, Replaces.getShiftedCodePosition(i));
+}
+
+TEST(ShiftedCodePositionTest, NoReplacementText) {
+  Replacements Replaces = toReplacements({Replacement("", 0, 42, "")});
+  EXPECT_EQ(0u, Replaces.getShiftedCodePosition(0));
+  EXPECT_EQ(0u, Replaces.getShiftedCodePosition(39));
+  EXPECT_EQ(3u, Replaces.getShiftedCodePosition(45));
+  EXPECT_EQ(0u, Replaces.getShiftedCodePosition(42));
 }
 
 class FlushRewrittenFilesTest : public ::testing::Test {
@@ -271,9 +344,8 @@
 
 TEST_F(FlushRewrittenFilesTest, StoresChangesOnDisk) {
   FileID ID = createFile("input.cpp", "line1\nline2\nline3\nline4");
-  Replacements Replaces;
-  Replaces.insert(Replacement(Context.Sources, Context.getLocation(ID, 2, 1),
-                              5, "replaced"));
+  Replacements Replaces = toReplacements({Replacement(
+      Context.Sources, Context.getLocation(ID, 2, 1), 5, "replaced")});
   EXPECT_TRUE(applyAllReplacements(Replaces, Context.Rewrite));
   EXPECT_FALSE(Context.Rewrite.overwriteChangedFiles());
   EXPECT_EQ("line1\nreplaced\nline3\nline4",
@@ -418,84 +490,122 @@
   EXPECT_FALSE(Range(0, 10).contains(Range(0, 11)));
 }
 
-TEST(DeduplicateTest, removesDuplicates) {
-  std::vector<Replacement> Input;
-  Input.push_back(Replacement("fileA", 50, 0, " foo "));
-  Input.push_back(Replacement("fileA", 10, 3, " bar "));
-  Input.push_back(Replacement("fileA", 10, 2, " bar ")); // Length differs
-  Input.push_back(Replacement("fileA", 9,  3, " bar ")); // Offset differs
-  Input.push_back(Replacement("fileA", 50, 0, " foo ")); // Duplicate
-  Input.push_back(Replacement("fileA", 51, 3, " bar "));
-  Input.push_back(Replacement("fileB", 51, 3, " bar ")); // Filename differs!
-  Input.push_back(Replacement("fileB", 60, 1, " bar "));
-  Input.push_back(Replacement("fileA", 60, 2, " bar "));
-  Input.push_back(Replacement("fileA", 51, 3, " moo ")); // Replacement text
-                                                         // differs!
+TEST(Range, CalculateRangesOfReplacements) {
+  // Before: aaaabbbbbbz
+  // After : bbbbbbzzzzzzoooooooooooooooo
+  Replacements Replaces = toReplacements(
+      {Replacement("foo", 0, 4, ""), Replacement("foo", 10, 1, "zzzzzz"),
+       Replacement("foo", 11, 0, "oooooooooooooooo")});
 
-  std::vector<Replacement> Expected;
-  Expected.push_back(Replacement("fileA", 9,  3, " bar "));
-  Expected.push_back(Replacement("fileA", 10, 2, " bar "));
-  Expected.push_back(Replacement("fileA", 10, 3, " bar "));
-  Expected.push_back(Replacement("fileA", 50, 0, " foo "));
-  Expected.push_back(Replacement("fileA", 51, 3, " bar "));
-  Expected.push_back(Replacement("fileA", 51, 3, " moo "));
-  Expected.push_back(Replacement("fileB", 60, 1, " bar "));
-  Expected.push_back(Replacement("fileA", 60, 2, " bar "));
+  std::vector<Range> Ranges = Replaces.getAffectedRanges();
 
-  std::vector<Range> Conflicts; // Ignored for this test
-  deduplicate(Input, Conflicts);
-
-  EXPECT_EQ(3U, Conflicts.size());
-  EXPECT_EQ(Expected, Input);
+  ASSERT_EQ(2ul, Ranges.size()); // Fatal: the indexing below needs both.
+  EXPECT_EQ(0u, Ranges[0].getOffset());
+  EXPECT_EQ(0u, Ranges[0].getLength());
+  EXPECT_EQ(6u, Ranges[1].getOffset());
+  EXPECT_EQ(22u, Ranges[1].getLength());
 }
 
-TEST(DeduplicateTest, detectsConflicts) {
-  {
-    std::vector<Replacement> Input;
-    Input.push_back(Replacement("fileA", 0, 5, " foo "));
-    Input.push_back(Replacement("fileA", 0, 5, " foo ")); // Duplicate not a
-                                                          // conflict.
-    Input.push_back(Replacement("fileA", 2, 6, " bar "));
-    Input.push_back(Replacement("fileA", 7, 3, " moo "));
+TEST(Range, RangesAfterEmptyReplacements) {
+  std::vector<Range> Ranges = {Range(5, 6), Range(10, 5)};
+  Replacements Replaces;
+  std::vector<Range> Expected = {Range(5, 10)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
 
-    std::vector<Range> Conflicts;
-    deduplicate(Input, Conflicts);
+TEST(Range, RangesAfterReplacements) {
+  std::vector<Range> Ranges = {Range(5, 2), Range(10, 5)};
+  Replacements Replaces = toReplacements({Replacement("foo", 0, 2, "1234")});
+  std::vector<Range> Expected = {Range(0, 4), Range(7, 2), Range(12, 5)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
 
-    // One duplicate is removed and the remaining three items form one
-    // conflicted range.
-    ASSERT_EQ(3u, Input.size());
-    ASSERT_EQ(1u, Conflicts.size());
-    ASSERT_EQ(0u, Conflicts.front().getOffset());
-    ASSERT_EQ(3u, Conflicts.front().getLength());
-  }
-  {
-    std::vector<Replacement> Input;
+TEST(Range, RangesBeforeReplacements) {
+  std::vector<Range> Ranges = {Range(5, 2), Range(10, 5)};
+  Replacements Replaces = toReplacements({Replacement("foo", 20, 2, "1234")});
+  std::vector<Range> Expected = {Range(5, 2), Range(10, 5), Range(20, 4)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
 
-    // Expected sorted order is shown. It is the sorted order to which the
-    // returned conflict info refers to.
-    Input.push_back(Replacement("fileA", 0,  5, " foo "));  // 0
-    Input.push_back(Replacement("fileA", 5,  5, " bar "));  // 1
-    Input.push_back(Replacement("fileA", 6,  0, " bar "));  // 3
-    Input.push_back(Replacement("fileA", 5,  5, " moo "));  // 2
-    Input.push_back(Replacement("fileA", 7,  2, " bar "));  // 4
-    Input.push_back(Replacement("fileA", 15, 5, " golf ")); // 5
-    Input.push_back(Replacement("fileA", 16, 5, " bag "));  // 6
-    Input.push_back(Replacement("fileA", 10, 3, " club ")); // 7
+TEST(Range, NotAffectedByReplacements) {
+  std::vector<Range> Ranges = {Range(0, 2), Range(5, 2), Range(10, 5)};
+  Replacements Replaces = toReplacements({Replacement("foo", 3, 2, "12"),
+                                          Replacement("foo", 12, 2, "12"),
+                                          Replacement("foo", 20, 5, "")});
+  std::vector<Range> Expected = {Range(0, 2), Range(3, 4), Range(10, 5),
+                                 Range(20, 0)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
 
-    // #3 is special in that it is completely contained by another conflicting
-    // Replacement. #4 ensures #3 hasn't messed up the conflicting range size.
+TEST(Range, RangesWithNonOverlappingReplacements) {
+  std::vector<Range> Ranges = {Range(0, 2), Range(5, 2), Range(10, 5)};
+  Replacements Replaces = toReplacements({Replacement("foo", 3, 1, ""),
+                                          Replacement("foo", 6, 1, "123"),
+                                          Replacement("foo", 20, 2, "12345")});
+  std::vector<Range> Expected = {Range(0, 2), Range(3, 0), Range(4, 4),
+                                 Range(11, 5), Range(21, 5)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
 
-    std::vector<Range> Conflicts;
-    deduplicate(Input, Conflicts);
+TEST(Range, RangesWithOverlappingReplacements) {
+  std::vector<Range> Ranges = {Range(0, 2), Range(5, 2), Range(15, 5),
+                               Range(30, 5)};
+  Replacements Replaces = toReplacements(
+      {Replacement("foo", 1, 3, ""), Replacement("foo", 6, 1, "123"),
+       Replacement("foo", 13, 3, "1"), Replacement("foo", 25, 15, "")});
+  std::vector<Range> Expected = {Range(0, 1), Range(2, 4), Range(12, 5),
+                                 Range(22, 0)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
 
-    // No duplicates
-    ASSERT_EQ(8u, Input.size());
-    ASSERT_EQ(2u, Conflicts.size());
-    ASSERT_EQ(1u, Conflicts[0].getOffset());
-    ASSERT_EQ(4u, Conflicts[0].getLength());
-    ASSERT_EQ(6u, Conflicts[1].getOffset());
-    ASSERT_EQ(2u, Conflicts[1].getLength());
-  }
+TEST(Range, MergeIntoOneRange) {
+  std::vector<Range> Ranges = {Range(0, 2), Range(5, 2), Range(15, 5)};
+  Replacements Replaces =
+      toReplacements({Replacement("foo", 1, 15, "1234567890")});
+  std::vector<Range> Expected = {Range(0, 15)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
+
+TEST(Range, ReplacementsStartingAtRangeOffsets) {
+  std::vector<Range> Ranges = {Range(0, 2), Range(5, 5), Range(15, 5)};
+  Replacements Replaces = toReplacements(
+      {Replacement("foo", 0, 2, "12"), Replacement("foo", 5, 1, "123"),
+       Replacement("foo", 7, 4, "12345"), Replacement("foo", 15, 10, "12")});
+  std::vector<Range> Expected = {Range(0, 2), Range(5, 9), Range(18, 2)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
+
+TEST(Range, ReplacementsEndingAtRangeEnds) {
+  std::vector<Range> Ranges = {Range(0, 2), Range(5, 2), Range(15, 5)};
+  Replacements Replaces = toReplacements(
+      {Replacement("foo", 6, 1, "123"), Replacement("foo", 17, 3, "12")});
+  std::vector<Range> Expected = {Range(0, 2), Range(5, 4), Range(17, 4)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
+
+TEST(Range, AjacentReplacements) {
+  std::vector<Range> Ranges = {Range(0, 0), Range(15, 5)};
+  Replacements Replaces = toReplacements(
+      {Replacement("foo", 1, 2, "123"), Replacement("foo", 12, 3, "1234")});
+  std::vector<Range> Expected = {Range(0, 0), Range(1, 3), Range(13, 9)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
+
+TEST(Range, MergeRangesAfterReplacements) {
+  std::vector<Range> Ranges = {Range(8, 0), Range(5, 2), Range(9, 0), Range(0, 1)};
+  Replacements Replaces = toReplacements({Replacement("foo", 1, 3, ""),
+                                          Replacement("foo", 7, 0, "12"),
+                                          Replacement("foo", 9, 2, "")});
+  std::vector<Range> Expected = {Range(0, 1), Range(2, 4), Range(7, 0),
+                                 Range(8, 0)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
+}
+
+TEST(Range, ConflictingRangesBeforeReplacements) {
+  std::vector<Range> Ranges = {Range(8, 3), Range(5, 4), Range(9, 1)};
+  Replacements Replaces = toReplacements({Replacement("foo", 1, 3, "")});
+  std::vector<Range> Expected = {Range(1, 0), Range(2, 6)};
+  EXPECT_EQ(Expected, calculateRangesAfterReplacements(Replaces, Ranges));
 }
 
 class MergeReplacementsTest : public ::testing::Test {
@@ -504,27 +614,32 @@
                            StringRef Result, const Replacements &First,
                            const Replacements &Second) {
     // These are mainly to verify the test itself and make it easier to read.
-    std::string AfterFirst = applyAllReplacements(Code, First);
-    std::string InSequenceRewrite = applyAllReplacements(AfterFirst, Second);
-    EXPECT_EQ(Intermediate, AfterFirst);
-    EXPECT_EQ(Result, InSequenceRewrite);
+    auto AfterFirst = applyAllReplacements(Code, First);
+    EXPECT_TRUE(static_cast<bool>(AfterFirst));
+    auto InSequenceRewrite = applyAllReplacements(*AfterFirst, Second);
+    EXPECT_TRUE(static_cast<bool>(InSequenceRewrite));
+    EXPECT_EQ(Intermediate, *AfterFirst);
+    EXPECT_EQ(Result, *InSequenceRewrite);
 
-    tooling::Replacements Merged = mergeReplacements(First, Second);
-    std::string MergedRewrite = applyAllReplacements(Code, Merged);
-    EXPECT_EQ(InSequenceRewrite, MergedRewrite);
-    if (InSequenceRewrite != MergedRewrite)
+    tooling::Replacements Merged = First.merge(Second);
+    auto MergedRewrite = applyAllReplacements(Code, Merged);
+    EXPECT_TRUE(static_cast<bool>(MergedRewrite));
+    EXPECT_EQ(*InSequenceRewrite, *MergedRewrite);
+    if (*InSequenceRewrite != *MergedRewrite)
       for (tooling::Replacement M : Merged)
         llvm::errs() << M.getOffset() << " " << M.getLength() << " "
                      << M.getReplacementText() << "\n";
   }
   void mergeAndTestRewrite(StringRef Code, const Replacements &First,
                            const Replacements &Second) {
-    std::string InSequenceRewrite =
-        applyAllReplacements(applyAllReplacements(Code, First), Second);
-    tooling::Replacements Merged = mergeReplacements(First, Second);
-    std::string MergedRewrite = applyAllReplacements(Code, Merged);
-    EXPECT_EQ(InSequenceRewrite, MergedRewrite);
-    if (InSequenceRewrite != MergedRewrite)
+    auto AfterFirst = applyAllReplacements(Code, First);
+    EXPECT_TRUE(static_cast<bool>(AfterFirst));
+    auto InSequenceRewrite = applyAllReplacements(*AfterFirst, Second);
+    tooling::Replacements Merged = First.merge(Second);
+    auto MergedRewrite = applyAllReplacements(Code, Merged);
+    EXPECT_TRUE(static_cast<bool>(MergedRewrite));
+    EXPECT_EQ(*InSequenceRewrite, *MergedRewrite);
+    if (*InSequenceRewrite != *MergedRewrite)
       for (tooling::Replacement M : Merged)
         llvm::errs() << M.getOffset() << " " << M.getLength() << " "
                      << M.getReplacementText() << "\n";
@@ -533,62 +648,82 @@
 
 TEST_F(MergeReplacementsTest, Offsets) {
   mergeAndTestRewrite("aaa", "aabab", "cacabab",
-                      {{"", 2, 0, "b"}, {"", 3, 0, "b"}},
-                      {{"", 0, 0, "c"}, {"", 1, 0, "c"}});
+                      toReplacements({{"", 2, 0, "b"}, {"", 3, 0, "b"}}),
+                      toReplacements({{"", 0, 0, "c"}, {"", 1, 0, "c"}}));
   mergeAndTestRewrite("aaa", "babaa", "babacac",
-                      {{"", 0, 0, "b"}, {"", 1, 0, "b"}},
-                      {{"", 4, 0, "c"}, {"", 5, 0, "c"}});
-  mergeAndTestRewrite("aaaa", "aaa", "aac", {{"", 1, 1, ""}},
-                      {{"", 2, 1, "c"}});
+                      toReplacements({{"", 0, 0, "b"}, {"", 1, 0, "b"}}),
+                      toReplacements({{"", 4, 0, "c"}, {"", 5, 0, "c"}}));
+  mergeAndTestRewrite("aaaa", "aaa", "aac", toReplacements({{"", 1, 1, ""}}),
+                      toReplacements({{"", 2, 1, "c"}}));
 
   mergeAndTestRewrite("aa", "bbabba", "bbabcba",
-                      {{"", 0, 0, "bb"}, {"", 1, 0, "bb"}}, {{"", 4, 0, "c"}});
+                      toReplacements({{"", 0, 0, "bb"}, {"", 1, 0, "bb"}}),
+                      toReplacements({{"", 4, 0, "c"}}));
 }
 
 TEST_F(MergeReplacementsTest, Concatenations) {
   // Basic concatenations. It is important to merge these into a single
   // replacement to ensure the correct order.
-  EXPECT_EQ((Replacements{{"", 0, 0, "ab"}}),
-            mergeReplacements({{"", 0, 0, "a"}}, {{"", 1, 0, "b"}}));
-  EXPECT_EQ((Replacements{{"", 0, 0, "ba"}}),
-            mergeReplacements({{"", 0, 0, "a"}}, {{"", 0, 0, "b"}}));
-  mergeAndTestRewrite("", "a", "ab", {{"", 0, 0, "a"}}, {{"", 1, 0, "b"}});
-  mergeAndTestRewrite("", "a", "ba", {{"", 0, 0, "a"}}, {{"", 0, 0, "b"}});
+  {
+    auto First = toReplacements({{"", 0, 0, "a"}});
+    auto Second = toReplacements({{"", 1, 0, "b"}});
+    EXPECT_EQ(toReplacements({{"", 0, 0, "ab"}}), First.merge(Second));
+  }
+  {
+    auto First = toReplacements({{"", 0, 0, "a"}});
+    auto Second = toReplacements({{"", 0, 0, "b"}});
+    EXPECT_EQ(toReplacements({{"", 0, 0, "ba"}}), First.merge(Second));
+  }
+  mergeAndTestRewrite("", "a", "ab", toReplacements({{"", 0, 0, "a"}}),
+                      toReplacements({{"", 1, 0, "b"}}));
+  mergeAndTestRewrite("", "a", "ba", toReplacements({{"", 0, 0, "a"}}),
+                      toReplacements({{"", 0, 0, "b"}}));
 }
 
 TEST_F(MergeReplacementsTest, NotChangingLengths) {
-  mergeAndTestRewrite("aaaa", "abba", "acca", {{"", 1, 2, "bb"}},
-                      {{"", 1, 2, "cc"}});
-  mergeAndTestRewrite("aaaa", "abba", "abcc", {{"", 1, 2, "bb"}},
-                      {{"", 2, 2, "cc"}});
-  mergeAndTestRewrite("aaaa", "abba", "ccba", {{"", 1, 2, "bb"}},
-                      {{"", 0, 2, "cc"}});
+  mergeAndTestRewrite("aaaa", "abba", "acca",
+                      toReplacements({{"", 1, 2, "bb"}}),
+                      toReplacements({{"", 1, 2, "cc"}}));
+  mergeAndTestRewrite("aaaa", "abba", "abcc",
+                      toReplacements({{"", 1, 2, "bb"}}),
+                      toReplacements({{"", 2, 2, "cc"}}));
+  mergeAndTestRewrite("aaaa", "abba", "ccba",
+                      toReplacements({{"", 1, 2, "bb"}}),
+                      toReplacements({{"", 0, 2, "cc"}}));
   mergeAndTestRewrite("aaaaaa", "abbdda", "abccda",
-                      {{"", 1, 2, "bb"}, {"", 3, 2, "dd"}}, {{"", 2, 2, "cc"}});
+                      toReplacements({{"", 1, 2, "bb"}, {"", 3, 2, "dd"}}),
+                      toReplacements({{"", 2, 2, "cc"}}));
 }
 
 TEST_F(MergeReplacementsTest, OverlappingRanges) {
   mergeAndTestRewrite("aaa", "bbd", "bcbcd",
-                      {{"", 0, 1, "bb"}, {"", 1, 2, "d"}},
-                      {{"", 1, 0, "c"}, {"", 2, 0, "c"}});
+                      toReplacements({{"", 0, 1, "bb"}, {"", 1, 2, "d"}}),
+                      toReplacements({{"", 1, 0, "c"}, {"", 2, 0, "c"}}));
 
-  mergeAndTestRewrite("aaaa", "aabbaa", "acccca", {{"", 2, 0, "bb"}},
-                      {{"", 1, 4, "cccc"}});
+  mergeAndTestRewrite("aaaa", "aabbaa", "acccca",
+                      toReplacements({{"", 2, 0, "bb"}}),
+                      toReplacements({{"", 1, 4, "cccc"}}));
   mergeAndTestRewrite("aaaa", "aababa", "acccca",
-                      {{"", 2, 0, "b"}, {"", 3, 0, "b"}}, {{"", 1, 4, "cccc"}});
-  mergeAndTestRewrite("aaaaaa", "abbbba", "abba", {{"", 1, 4, "bbbb"}},
-                      {{"", 2, 2, ""}});
-  mergeAndTestRewrite("aaaa", "aa", "cc", {{"", 1, 1, ""}, {"", 2, 1, ""}},
-                      {{"", 0, 2, "cc"}});
-  mergeAndTestRewrite("aa", "abbba", "abcbcba", {{"", 1, 0, "bbb"}},
-                      {{"", 2, 0, "c"}, {"", 3, 0, "c"}});
+                      toReplacements({{"", 2, 0, "b"}, {"", 3, 0, "b"}}),
+                      toReplacements({{"", 1, 4, "cccc"}}));
+  mergeAndTestRewrite("aaaaaa", "abbbba", "abba",
+                      toReplacements({{"", 1, 4, "bbbb"}}),
+                      toReplacements({{"", 2, 2, ""}}));
+  mergeAndTestRewrite("aaaa", "aa", "cc",
+                      toReplacements({{"", 1, 1, ""}, {"", 2, 1, ""}}),
+                      toReplacements({{"", 0, 2, "cc"}}));
+  mergeAndTestRewrite("aa", "abbba", "abcbcba",
+                      toReplacements({{"", 1, 0, "bbb"}}),
+                      toReplacements({{"", 2, 0, "c"}, {"", 3, 0, "c"}}));
 
-  mergeAndTestRewrite("aaa", "abbab", "ccdd",
-                      {{"", 0, 1, ""}, {"", 2, 0, "bb"}, {"", 3, 0, "b"}},
-                      {{"", 0, 2, "cc"}, {"", 2, 3, "dd"}});
-  mergeAndTestRewrite("aa", "babbab", "ccdd",
-                      {{"", 0, 0, "b"}, {"", 1, 0, "bb"}, {"", 2, 0, "b"}},
-                      {{"", 0, 3, "cc"}, {"", 3, 3, "dd"}});
+  mergeAndTestRewrite(
+      "aaa", "abbab", "ccdd",
+      toReplacements({{"", 0, 1, ""}, {"", 2, 0, "bb"}, {"", 3, 0, "b"}}),
+      toReplacements({{"", 0, 2, "cc"}, {"", 2, 3, "dd"}}));
+  mergeAndTestRewrite(
+      "aa", "babbab", "ccdd",
+      toReplacements({{"", 0, 0, "b"}, {"", 1, 0, "bb"}, {"", 2, 0, "b"}}),
+      toReplacements({{"", 0, 3, "cc"}, {"", 3, 3, "dd"}}));
 }
 
 } // end namespace tooling
diff --git a/unittests/Tooling/ReplacementTest.h b/unittests/Tooling/ReplacementTest.h
new file mode 100644
index 0000000..91530f0
--- /dev/null
+++ b/unittests/Tooling/ReplacementTest.h
@@ -0,0 +1,56 @@
+//===- unittest/Tooling/ReplacementTest.h - Replacements related test------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines a utility class and function for Replacement tests.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_UNITTESTS_TOOLING_REPLACEMENTTESTBASE_H
+#define LLVM_CLANG_UNITTESTS_TOOLING_REPLACEMENTTESTBASE_H
+
+#include "RewriterTestContext.h"
+#include "clang/Tooling/Core/Replacement.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace tooling {
+
+/// \brief Converts a set of replacements to a Replacements object.
+/// \return A Replacements holding all of \p Replaces; if any add() fails, the
+/// error is printed and an empty one is returned. Inline: defined in a header.
+inline tooling::Replacements
+toReplacements(const std::set<tooling::Replacement> &Replaces) {
+  tooling::Replacements Result;
+  for (const auto &R : Replaces) {
+    auto Err = Result.add(R);
+    EXPECT_TRUE(!Err); // Record a (non-fatal) gtest failure on conflict.
+    if (Err) {
+      llvm::errs() << llvm::toString(std::move(Err)) << "\n";
+      return tooling::Replacements();
+    }
+  }
+  return Result;
+}
+
+/// \brief A utility class for replacement related tests.
+class ReplacementTest : public ::testing::Test {
+protected:
+  tooling::Replacement createReplacement(SourceLocation Start, unsigned Length,
+                                         llvm::StringRef ReplacementText) {
+    return tooling::Replacement(Context.Sources, Start, Length,
+                                ReplacementText);
+  }
+
+  RewriterTestContext Context;
+};
+
+} // namespace tooling
+} // namespace clang
+
+#endif // LLVM_CLANG_UNITTESTS_TOOLING_REPLACEMENTTESTBASE_H
diff --git a/unittests/Tooling/RewriterTest.cpp b/unittests/Tooling/RewriterTest.cpp
index 93f69eb..4305d42 100644
--- a/unittests/Tooling/RewriterTest.cpp
+++ b/unittests/Tooling/RewriterTest.cpp
@@ -39,10 +39,14 @@
 
 TEST(Rewriter, AdjacentInsertAndDelete) {
   Replacements Replaces;
-  Replaces.insert(Replacement("<file>", 6, 6, ""));
-  Replaces.insert(Replacement("<file>", 6, 0, "replaced\n"));
-  EXPECT_EQ("line1\nreplaced\nline3\nline4",
-            applyAllReplacements("line1\nline2\nline3\nline4", Replaces));
+  auto Err = Replaces.add(Replacement("<file>", 6, 6, ""));
+  EXPECT_TRUE(!Err);
+  Replaces =
+      Replaces.merge(Replacements(Replacement("<file>", 6, 0, "replaced\n")));
+
+  auto Rewritten = applyAllReplacements("line1\nline2\nline3\nline4", Replaces);
+  ASSERT_TRUE(static_cast<bool>(Rewritten)); // Fatal: *Rewritten needs a value.
+  EXPECT_EQ("line1\nreplaced\nline3\nline4", *Rewritten);
 }
 
 } // end namespace
diff --git a/unittests/Tooling/ToolingTest.cpp b/unittests/Tooling/ToolingTest.cpp
index c4b174f..82ee602 100644
--- a/unittests/Tooling/ToolingTest.cpp
+++ b/unittests/Tooling/ToolingTest.cpp
@@ -18,8 +18,9 @@
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <string>
@@ -241,7 +242,7 @@
 struct SkipBodyConsumer : public clang::ASTConsumer {
   /// Skip the 'skipMe' function.
   bool shouldSkipFunctionBody(Decl *D) override {
-    FunctionDecl *F = dyn_cast<FunctionDecl>(D);
+    NamedDecl *F = dyn_cast<NamedDecl>(D);
     return F && F->getNameAsString() == "skipMe";
   }
 };
@@ -255,10 +256,65 @@
 };
 
 TEST(runToolOnCode, TestSkipFunctionBody) {
+  std::vector<std::string> Args = {"-std=c++11"};
+  std::vector<std::string> Args2 = {"-fno-delayed-template-parsing"};
+
   EXPECT_TRUE(runToolOnCode(new SkipBodyAction,
                             "int skipMe() { an_error_here }"));
   EXPECT_FALSE(runToolOnCode(new SkipBodyAction,
                              "int skipMeNot() { an_error_here }"));
+
+  // Test constructors with initializers
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      new SkipBodyAction,
+      "struct skipMe { skipMe() : an_error() { more error } };", Args));
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      new SkipBodyAction, "struct skipMe { skipMe(); };"
+                          "skipMe::skipMe() : an_error([](){;}) { more error }",
+      Args));
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      new SkipBodyAction, "struct skipMe { skipMe(); };"
+                          "skipMe::skipMe() : an_error{[](){;}} { more error }",
+      Args));
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      new SkipBodyAction,
+      "struct skipMe { skipMe(); };"
+      "skipMe::skipMe() : a<b<c>(e)>>(), f{}, g() { error }",
+      Args));
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      new SkipBodyAction, "struct skipMe { skipMe() : bases()... { error } };",
+      Args));
+
+  EXPECT_FALSE(runToolOnCodeWithArgs(
+      new SkipBodyAction, "struct skipMeNot { skipMeNot() : an_error() { } };",
+      Args));
+  EXPECT_FALSE(runToolOnCodeWithArgs(new SkipBodyAction,
+                                     "struct skipMeNot { skipMeNot(); };"
+                                     "skipMeNot::skipMeNot() : an_error() { }",
+                                     Args));
+
+  // Try/catch
+  EXPECT_TRUE(runToolOnCode(
+      new SkipBodyAction,
+      "void skipMe() try { an_error() } catch(error) { error };"));
+  EXPECT_TRUE(runToolOnCode(
+      new SkipBodyAction,
+      "struct S { void skipMe() try { an_error() } catch(error) { error } };"));
+  EXPECT_TRUE(
+      runToolOnCode(new SkipBodyAction,
+                    "void skipMe() try { an_error() } catch(error) { error; }"
+                    "catch(error) { error } catch (error) { }"));
+  EXPECT_FALSE(runToolOnCode(
+      new SkipBodyAction,
+      "void skipMe() try something;")); // don't crash while parsing
+
+  // Template
+  EXPECT_TRUE(runToolOnCode(
+      new SkipBodyAction, "template<typename T> int skipMe() { an_error_here }"
+                          "int x = skipMe<int>();"));
+  EXPECT_FALSE(runToolOnCodeWithArgs(
+      new SkipBodyAction,
+      "template<typename T> int skipMeNot() { an_error_here }", Args2));
 }
 
 TEST(runToolOnCodeWithArgs, TestNoDepFile) {
diff --git a/unittests/libclang/LibclangTest.cpp b/unittests/libclang/LibclangTest.cpp
index e190dec..d5c7827 100644
--- a/unittests/libclang/LibclangTest.cpp
+++ b/unittests/libclang/LibclangTest.cpp
@@ -485,7 +485,7 @@
   WriteFile(Clang, "");
 
   const char *Argv[] = {Clang.c_str(), "-target", "arm-linux-gnueabi",
-                        "--gcc-toolchain="};
+                        "-stdlib=libstdc++", "--gcc-toolchain="};
 
   EXPECT_EQ(CXError_Success,
             clang_parseTranslationUnit2FullArgv(Index, Filename.c_str(), Argv,
diff --git a/unittests/libclang/Makefile b/unittests/libclang/Makefile
deleted file mode 100644
index 4dbafb7..0000000
--- a/unittests/libclang/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-##===- unittests/libclang/Makefile -------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-CLANG_LEVEL = ../..
-TESTNAME = libclang
-LINK_LIBS_IN_SHARED := 1
-
-include $(CLANG_LEVEL)/../../Makefile.config
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) \
-	AsmParser \
-	BitReader \
-	BitWriter \
-	IPO \
-	MC \
-	ObjCArcOpts \
-	Option \
-	Support
-
-# Note that 'USEDLIBS' must include all of the core clang libraries
-# when -static is given to linker on cygming.
-USEDLIBS = clang.a \
-	   clangCodeGen.a \
-	   clangIndex.a clangFormat.a clangRewrite.a \
-	   clangFrontend.a clangDriver.a \
-	   clangTooling.a \
-	   clangToolingCore.a \
-	   clangSerialization.a clangParse.a clangSema.a \
-	   clangAnalysis.a clangEdit.a clangAST.a clangLex.a \
-	   clangAPINotes.a clangBasic.a
-
-include $(CLANG_LEVEL)/unittests/Makefile
diff --git a/utils/ClangVisualizers/CMakeLists.txt b/utils/ClangVisualizers/CMakeLists.txt
new file mode 100644
index 0000000..16d118a
--- /dev/null
+++ b/utils/ClangVisualizers/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Do this by hand instead of using add_llvm_utility(), because that
+# would also create a corresponding executable, which we don't want.
+if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
+  set(CLANG_VISUALIZERS clang.natvis)
+  add_custom_target(ClangVisualizers SOURCES ${CLANG_VISUALIZERS})
+  set_target_properties(ClangVisualizers PROPERTIES FOLDER "Utils")
+endif()
diff --git a/utils/ClangVisualizers/clang.natvis b/utils/ClangVisualizers/clang.natvis
new file mode 100644
index 0000000..6e3ca96
--- /dev/null
+++ b/utils/ClangVisualizers/clang.natvis
@@ -0,0 +1,586 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+Visual Studio Native Debugging Visualizers for Clang
+
+For Visual Studio 2013 only, put this file into 
+"%USERPROFILE%\Documents\Visual Studio 2013\Visualizers" or create a symbolic link so it updates automatically.
+
+For later versions of Visual Studio, no setup is required-->
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+  <Type Name="clang::Type">
+    <!-- To visualize clang::Types, we need to look at TypeBits.TC to determine the actual
+         type subclass and manually dispatch accordingly (Visual Studio can't identify the real type
+         because clang::Type has no virtual members hence no RTTI). 
+         
+         Views:
+           "cmn": Visualization that is common to all clang::Type subclasses
+           "poly": Visualization that is specific to the actual clang::Type subclass. The subtype-specific
+                   <DisplayString> is typically as C++-like as possible (like in dump()) with <Expand>
+                   containing all the gory details.
+           "cpp": Only occasionally used when we need to distinguish between an ordinary view and a C++-like view.
+    -->
+    <DisplayString IncludeView="cmn" Condition="TypeBits.TC==clang::LocInfoType::LocInfo">LocInfoType</DisplayString>
+    <DisplayString IncludeView="cmn">{(clang::Type::TypeClass)TypeBits.TC, en}Type</DisplayString>
+    <!-- Dispatch to visualizers for the actual Type subclass -->
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Builtin" IncludeView="poly">{*(clang::BuiltinType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Pointer" IncludeView="poly">{*(clang::PointerType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::LValueReference" IncludeView="poly">{*(clang::LValueReferenceType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::RValueReference" IncludeView="poly">{*(clang::RValueReferenceType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Attributed" IncludeView="poly">{*(clang::AttributedType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::TemplateTypeParm" IncludeView="poly">{*(clang::TemplateTypeParmType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::SubstTemplateTypeParm" IncludeView="poly">{*(clang::SubstTemplateTypeParmType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Record" IncludeView="poly">{*(clang::RecordType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Record" IncludeView="cpp">{*(clang::RecordType *)this,view(cpp)}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::FunctionProto" IncludeView="poly">{*(clang::FunctionProtoType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::TemplateSpecialization" IncludeView="poly">{*(clang::TemplateSpecializationType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::InjectedClassName" IncludeView="poly">{*(clang::InjectedClassNameType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::PackExpansion" IncludeView="poly">{*(clang::PackExpansionType *)this}</DisplayString>
+    <DisplayString Condition="TypeBits.TC==clang::LocInfoType::LocInfo" IncludeView="poly">{*(clang::LocInfoType *)this}</DisplayString>
+    <DisplayString IncludeView="cpp">{*this,view(poly)}</DisplayString>
+    <DisplayString IncludeView="poly">{*this,view(cmn)}</DisplayString> <!-- Not yet implemented Type subclass -->
+    <DisplayString>{*this,view(cmn)}  {{{*this,view(poly)}}}</DisplayString>
+    <Expand>
+      <Item Name="TypeClass" IncludeView="cmn">(clang::Type::TypeClass)TypeBits.TC</Item>
+      <Item Name="Flags" IncludeView="cmn">TypeBits</Item>
+      <Item Name="Canonical" IncludeView="cmn">CanonicalType</Item>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Builtin">*(clang::BuiltinType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Pointer">*(clang::PointerType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::LValueReference">*(clang::LValueReferenceType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::RValueReference">*(clang::RValueReferenceType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Attributed">*(clang::AttributedType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::TemplateTypeParm">(clang::TemplateTypeParmType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::SubstTemplateTypeParm">(clang::SubstTemplateTypeParmType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Record">(clang::RecordType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::FunctionProto">(clang::FunctionProtoType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::TemplateSpecialization">(clang::TemplateSpecializationType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::InjectedClassName">(clang::InjectedClassNameType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::PackExpansion">(clang::PackExpansionType *)this</ExpandedItem>
+      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::LocInfoType::LocInfo">(clang::LocInfoType *)this</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::PointerType">
+    <DisplayString>{PointeeType, view(poly)} *</DisplayString>
+    <Expand>
+      <Item Name="PointeeType">PointeeType</Item>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+    </Expand>
+  </Type>
+  <!-- We visualize all inner types for clang reference types. So an rvalue reference to an lvalue reference
+       to an int would be visualized as int &amp; &amp;&amp;. This is a little different from GetPointeeType(),
+       but more clearly displays the data structure and seems natural. -->
+  <Type Name="clang::LValueReferenceType">
+    <DisplayString>{((clang::ReferenceType *)this)-&gt;PointeeType,view(cpp)} &amp;</DisplayString>
+    <Expand>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+      <Item Name="PointeeType">PointeeType</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::RValueReferenceType">
+    <DisplayString>{((clang::ReferenceType *)this)-&gt;PointeeType,view(cpp)} &amp;&amp;</DisplayString>
+    <Expand>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+      <Item Name="PointeeType">PointeeType</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::AttributedType">
+    <DisplayString>{ModifiedType} Attribute={(clang::AttributedType::Kind)AttributedTypeBits.AttrKind}</DisplayString>
+  </Type>
+  
+  <!-- Unfortunately, Visual Studio has trouble seeing the PointerBitMask member PointerIntUnion, so I hardwire it to 2 bits-->
+  <Type Name="clang::DeclContext">
+    <DisplayString>{(clang::Decl::Kind)DeclKind,en}Decl</DisplayString>
+    <Expand>
+      <Item Name="DeclKind">(clang::Decl::Kind)DeclKind,en</Item>
+      <Synthetic Name="Members">
+        <DisplayString></DisplayString>
+        <Expand>
+          <LinkedListItems>
+            <HeadPointer>FirstDecl</HeadPointer>
+            <NextPointer>(clang::Decl *)(NextInContextAndBits.Value &amp; ~3)</NextPointer>
+            <ValueNode>*this</ValueNode>
+          </LinkedListItems>
+        </Expand>
+      </Synthetic>
+    </Expand>
+  </Type>
+  <Type Name="clang::FieldDecl">
+    <DisplayString>Field {{{*(clang::DeclaratorDecl *)this,view(cpp)nd}}}</DisplayString>
+  </Type>
+  <Type Name="clang::CXXMethodDecl">
+    <DisplayString IncludeView="cpp">{*(clang::FunctionDecl *)this,nd}</DisplayString>
+    <DisplayString>Method {{{*this,view(cpp)}}}</DisplayString>
+  </Type>
+  <Type Name="clang::CXXConstructorDecl">
+    <DisplayString>Constructor {{{Name,view(cpp)}({*(clang::FunctionDecl *)this,view(parm0)nd})}}</DisplayString>
+  </Type>
+  <Type Name="clang::CXXDestructorDecl">
+    <DisplayString>Destructor {{~{Name,view(cpp)}()}}</DisplayString>
+  </Type>
+  <Type Name="clang::TemplateTypeParmDecl">
+    <DisplayString IncludeView="TorC" Condition="Typename">typename</DisplayString>
+    <DisplayString IncludeView="TorC" Condition="!Typename">class</DisplayString>
+    <DisplayString IncludeView="MaybeEllipses" Condition="((TemplateTypeParmType *)TypeForDecl)->CanTTPTInfo.ParameterPack">...</DisplayString>
+    <DisplayString IncludeView="MaybeEllipses" Condition="!((TemplateTypeParmType *)TypeForDecl)->CanTTPTInfo.ParameterPack"></DisplayString>
+    <DisplayString>{*this,view(TorC)} {*this,view(MaybeEllipses)}{Name,view(cpp)}</DisplayString> 
+  </Type>
+  <Type Name="clang::TemplateDecl">
+    <DisplayString>template{*TemplateParams} {*TemplatedDecl};</DisplayString>
+  </Type>
+  <Type Name="clang::NamedDecl" >
+    <DisplayString IncludeView="cpp">{Name,view(cpp)}</DisplayString>
+    <DisplayString>{Name}</DisplayString>
+  </Type>
+  <Type Name="clang::TagDecl">
+    <DisplayString IncludeView="implicit" Condition="Implicit">implicit{" ",sb}</DisplayString>
+    <DisplayString IncludeView="implicit"></DisplayString>
+    <DisplayString IncludeView="modifiers">{*this,view(implicit)}</DisplayString>
+    <DisplayString IncludeView="cpp">{*this,view(modifiers)}{Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclKind==clang::TagTypeKind::TTK_Struct">{*this,view(modifiers)}struct {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclKind==clang::TagTypeKind::TTK_Interface">{*this,view(modifiers)}interface {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclKind==clang::TagTypeKind::TTK_Union">{*this,view(modifiers)}union {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclKind==clang::TagTypeKind::TTK_Class">{*this,view(modifiers)}class {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclKind==clang::TagTypeKind::TTK_Enum">{*this,view(modifiers)}enum {Name,view(cpp)}</DisplayString>
+    <Expand>
+      <ExpandedItem>(clang::DeclContext *)this</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::TagType">
+    <DisplayString IncludeView="cpp">{*decl,view(cpp)}</DisplayString>
+    <DisplayString>{*decl}</DisplayString>
+    <Expand>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+      <Item Name="decl">decl</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::RecordType">
+    <DisplayString IncludeView="cpp">{*(clang::TagType *)this,view(cpp)}</DisplayString>
+    <DisplayString>{*(clang::TagType *)this}</DisplayString>
+    <Expand>
+      <Item Name="TagType">*(clang::TagType *)this</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::SubstTemplateTypeParmType">
+    <DisplayString>{*Replaced,view(cpp)} &lt;= {CanonicalType,view(cpp)}</DisplayString>
+    <Expand>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+      <Item Name="Replaced">*Replaced</Item>
+    </Expand>
+  </Type>
+  <!-- We only show the first 5 parameter types in the display string (can't figure out how to loop in DisplayString)
+       but the expansion has all parameters -->
+  <Type Name="clang::FunctionProtoType">
+    <DisplayString IncludeView="retType">{ResultType,view(cpp)}</DisplayString>
+    <DisplayString IncludeView="parm0" Condition="NumParams==0"></DisplayString>
+    <DisplayString IncludeView="parm0">{*(clang::QualType *)(this+1),view(cpp)}{*this,view(parm1)}</DisplayString>
+    <DisplayString IncludeView="parm1" Condition="NumParams==1"></DisplayString>
+    <DisplayString IncludeView="parm1">, {*((clang::QualType *)(this+1)+1),view(cpp)}{*this,view(parm2)}</DisplayString>
+    <DisplayString IncludeView="parm2" Condition="NumParams==2"></DisplayString>
+    <DisplayString IncludeView="parm2">, {*((clang::QualType *)(this+1)+2),view(cpp)}{*this,view(parm3)}</DisplayString>
+    <DisplayString IncludeView="parm3" Condition="NumParams==3"></DisplayString>
+    <DisplayString IncludeView="parm3">, {*((clang::QualType *)(this+1)+3),view(cpp)}{*this,view(parm4)}</DisplayString>
+    <DisplayString IncludeView="parm4" Condition="NumParams==4"></DisplayString>
+    <DisplayString IncludeView="parm4">, {*((clang::QualType *)(this+1)+4),view(cpp)}{*this,view(parm5)}</DisplayString>
+    <DisplayString IncludeView="parm5" Condition="NumParams==5"></DisplayString>
+    <DisplayString IncludeView="parm5">, /* expand for more params */</DisplayString>
+    <DisplayString>{*this,view(retType)}({*this,view(parm0)})</DisplayString>
+    <Expand>
+      <Item Name="ReturnType">ResultType</Item>
+      <Synthetic Name="Parameter Types">
+        <DisplayString>{*this,view(parm0)}</DisplayString>
+        <Expand>
+          <ArrayItems>
+            <Size>NumParams</Size>
+            <ValuePointer>(clang::QualType *)(this+1)</ValuePointer>
+          </ArrayItems>
+        </Expand>
+      </Synthetic>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::TemplateTypeParmType">
+    <DisplayString IncludeView="cpp">{*TTPDecl,view(cpp)}</DisplayString>
+    <DisplayString>{*TTPDecl}</DisplayString>
+  </Type>
+  <Type Name="clang::InjectedClassNameType">
+    <DisplayString>{*Decl,view(cpp)}</DisplayString>
+    <Expand>
+      <Item Name="Decl">Decl</Item>
+      <Item Name="InjectedType">InjectedType</Item>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::PackExpansionType">
+    <DisplayString>{Pattern}</DisplayString>
+    <Expand>
+      <Item Name="Pattern">Pattern</Item>
+      <Item Name="NumExpansions">NumExpansions</Item>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::QualType">
+    <!-- When VS2013 support is deprecated, change 4 to clang::TypeAlignmentInBits (not properly recognized by VS2013) -->
+    <DisplayString IncludeView="poly">{*((clang::ExtQualsTypeCommonBase *)(((uintptr_t)Value.Value) &amp; ~(uintptr_t)((1 &lt;&lt; 4) - 1)))-&gt;BaseType,view(poly)}{*this,view(fastQuals)}</DisplayString>
+    <DisplayString IncludeView="cpp">{*((clang::ExtQualsTypeCommonBase *)(((uintptr_t)Value.Value) &amp; ~(uintptr_t)((1 &lt;&lt; 4) - 1)))-&gt;BaseType,view(cpp)}{*this,view(fastQuals)}</DisplayString>
+    <!-- For the Fast Qualifiers, it is simpler (and probably more efficient) just to list all 8 cases than create
+          views for each qualifier. TODO: Non-fast qualifiers -->
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==0"></DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==1">{" ",sb}const</DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==2">{" ",sb}restrict</DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==3">{" ",sb}const restrict</DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==4">{" ",sb}volatile</DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==5">{" ",sb}const volatile</DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==6">{" ",sb}volatile restrict</DisplayString>
+    <DisplayString IncludeView="fastQuals" Condition="(Value.Value &amp; 15)==7">{" ",sb}const volatile restrict</DisplayString>
+    <DisplayString IncludeView="fastQuals">Cannot visualize non-fast qualifiers</DisplayString>
+    <DisplayString>{*((clang::ExtQualsTypeCommonBase *)(((uintptr_t)Value.Value) &amp; ~(uintptr_t)((1 &lt;&lt; 4) - 1)))-&gt;BaseType}{*this,view(fastQuals)}</DisplayString>
+    <Expand>
+      <Item Name="Fast Quals">*this,view(fastQuals)</Item>
+      <Item Name="BaseType">*((clang::ExtQualsTypeCommonBase *)(((uintptr_t)Value.Value) &amp; ~(uintptr_t)((1 &lt;&lt; 4) - 1)))-&gt;BaseType</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::LocInfoType">
+    <DisplayString>{*DeclInfo}</DisplayString>
+    <Expand>
+      <Item Name="DeclInfo">DeclInfo</Item>
+      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::TypeSourceInfo">
+    <DisplayString>{Ty}</DisplayString>
+  </Type>
+  <Type Name="clang::TemplateArgumentLoc">
+    <DisplayString>{Argument}</DisplayString>
+    <Expand>
+      <ExpandedItem>Argument</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::TemplateArgument">
+    <DisplayString IncludeView="cpp" Condition="Integer.Kind == clang::TemplateArgument::ArgKind::Type">{*(clang::QualType *)&amp;TypeOrValue.V,view(cpp)}</DisplayString>
+    <DisplayString Condition="Integer.Kind == clang::TemplateArgument::ArgKind::Type">{(clang::TemplateArgument::ArgKind)TypeOrValue.Kind,en} template argument: {*(clang::QualType *)&amp;TypeOrValue.V}</DisplayString>
+    <DisplayString IncludeView="arg0" Condition="Args.NumArgs==0"></DisplayString>
+    <DisplayString IncludeView="arg0">{Args.Args[0]}{*this,view(arg1)}</DisplayString>
+    <DisplayString IncludeView="arg1" Condition="Args.NumArgs==1"></DisplayString>
+    <DisplayString IncludeView="arg1">, {Args.Args[1]}{*this,view(arg2)}</DisplayString>
+    <DisplayString IncludeView="arg2" Condition="Args.NumArgs==2"></DisplayString>
+    <DisplayString IncludeView="arg2">, {Args.Args[2]}, ...</DisplayString>
+    <DisplayString IncludeView="arg0cpp" Condition="Args.NumArgs==0"></DisplayString>
+    <DisplayString IncludeView="arg0cpp">{Args.Args[0],view(cpp)}{*this,view(arg1cpp)}</DisplayString>
+    <DisplayString IncludeView="arg1cpp" Condition="Args.NumArgs==1"></DisplayString>
+    <DisplayString IncludeView="arg1cpp">, {Args.Args[1],view(cpp)}{*this,view(arg2cpp)}</DisplayString>
+    <DisplayString IncludeView="arg2cpp" Condition="Args.NumArgs==2"></DisplayString>
+    <DisplayString IncludeView="arg2cpp">, {Args.Args[2],view(cpp)}, ...</DisplayString>
+    <DisplayString IncludeView="cpp" Condition="Integer.Kind == clang::TemplateArgument::ArgKind::Pack">{*this,view(arg0cpp)}</DisplayString>
+    <DisplayString Condition="Integer.Kind == clang::TemplateArgument::ArgKind::Pack">{*this,view(arg0)}</DisplayString>
+    <DisplayString>{(clang::TemplateArgument::ArgKind)TypeOrValue.Kind,en}</DisplayString>
+    <Expand>
+      <Item Name="QualType" Condition="Integer.Kind == clang::TemplateArgument::ArgKind::Type">*(clang::QualType *)&amp;TypeOrValue.V</Item>
+      <ArrayItems Condition="Integer.Kind == clang::TemplateArgument::ArgKind::Pack">
+        <Size>Args.NumArgs</Size>
+        <ValuePointer>Args.Args</ValuePointer>
+      </ArrayItems>
+      <!-- TODO: Other kinds-->
+    </Expand>
+  </Type>
+  <Type Name="clang::TemplateArgumentList"> <!-- Shows up to three arguments inline via the chained arg0/arg1/arg2 views -->
+    <DisplayString IncludeView="arg0" Condition="NumArguments==0"></DisplayString>
+    <DisplayString IncludeView="arg0">{Arguments[0],view(cpp)}{*this,view(arg1)}</DisplayString>
+    <DisplayString IncludeView="arg1" Condition="NumArguments==1"></DisplayString>
+    <DisplayString IncludeView="arg1">, {Arguments[1],view(cpp)}{*this,view(arg2)}</DisplayString>
+    <DisplayString IncludeView="arg2" Condition="NumArguments==2"></DisplayString>
+    <DisplayString IncludeView="arg2">, {Arguments[2],view(cpp)}, ...</DisplayString> <!-- Third argument; any further ones are elided -->
+    <DisplayString>&lt;{*this,view(arg0)}&gt;</DisplayString>
+    <Expand>
+      <Item Name="NumArguments">NumArguments</Item>
+      <ArrayItems>
+        <Size>NumArguments</Size>
+        <ValuePointer>Arguments</ValuePointer>
+      </ArrayItems>
+    </Expand>
+  </Type>
+  <Type Name="llvm::ArrayRef&lt;clang::TemplateArgument&gt;">
+    <DisplayString IncludeView="arg0" Condition="Length==0"></DisplayString>
+    <DisplayString IncludeView="arg0">{Data[0],view(cpp)}{*this,view(arg1)}</DisplayString>
+    <DisplayString IncludeView="arg1" Condition="Length==1"></DisplayString>
+    <DisplayString IncludeView="arg1">, {Data[1],view(cpp)}{*this,view(arg2)}</DisplayString>
+    <DisplayString IncludeView="arg2" Condition="Length==2"></DisplayString>
+    <DisplayString IncludeView="arg2">, {Data[2],view(cpp)}, ...</DisplayString>
+    <DisplayString>&lt;{*this,view(arg0)}&gt;</DisplayString>
+  </Type>
+  <Type Name="clang::MultiLevelTemplateArgumentList">
+    <DisplayString IncludeView="level0" Condition="(llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.EndX - (llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.BeginX==0"></DisplayString>
+    <DisplayString IncludeView="level0">{((llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.BeginX)[0],view(cpp)}{*this,view(level1)}</DisplayString>
+    <DisplayString IncludeView="level1" Condition="(llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.EndX - (llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.BeginX==1"></DisplayString>
+    <DisplayString IncludeView="level1">::{((llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.BeginX)[1],view(cpp)}{*this,view(level2)}</DisplayString>
+    <DisplayString IncludeView="level2" Condition="(llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.EndX - (llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.BeginX==2"></DisplayString>
+    <DisplayString IncludeView="level2">::{((llvm::ArrayRef&lt;clang::TemplateArgument&gt; *)TemplateArgumentLists.BeginX)[2],view(cpp)}, ...</DisplayString>
+    <DisplayString>{*this,view(level0)}</DisplayString>
+    <Expand>
+      <Item Name="TemplateList">TemplateArgumentLists</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::ParsedTemplateArgument"> <!-- Arg is cast according to Kind before display -->
+    <DisplayString Condition="Kind==clang::ParsedTemplateArgument::Type">Type template argument: {*(clang::QualType *)Arg}</DisplayString>
+    <DisplayString Condition="Kind==clang::ParsedTemplateArgument::NonType">Non-type template argument: {*(clang::Expr *)Arg}</DisplayString>
+    <DisplayString Condition="Kind==clang::ParsedTemplateArgument::Template">Template template argument: {*(clang::TemplateName *)Arg}</DisplayString>
+    <Expand>
+      <Item Name="Kind">Kind,en</Item>
+      <Item Name="Arg" Condition="Kind==clang::ParsedTemplateArgument::Type">(clang::QualType *)Arg</Item>
+      <Item Name="Arg" Condition="Kind==clang::ParsedTemplateArgument::NonType">(clang::Expr *)Arg</Item>
+      <Item Name="Arg" Condition="Kind==clang::ParsedTemplateArgument::Template">(clang::TemplateName *)Arg</Item>
+    </Expand>
+  </Type>
+  <!-- Builtin types that have C++ keywords are manually displayed as that keyword. Otherwise, just use the enum name -->
+  <Type Name="clang::BuiltinType">
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Void">void</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Bool">bool</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Char_U">char</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::UChar">unsigned char</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::WChar_U">wchar_t</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Char16">char16_t</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Char32">char32_t</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::UShort">unsigned short</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::UInt">unsigned int</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::ULong">unsigned long</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::ULongLong">unsigned long long</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::UInt128">__uint128_t</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Char_S">char</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::SChar">signed char</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::WChar_S">wchar_t</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Short">short</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Int">int</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Long">long</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::LongLong">long long</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Int128">__int128_t</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Half">__fp16</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Float">float</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::Double">double</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::LongDouble">long double</DisplayString>
+    <DisplayString Condition="BuiltinTypeBits.Kind==clang::BuiltinType::NullPtr">nullptr_t</DisplayString>
+    <DisplayString>{(clang::BuiltinType::Kind)BuiltinTypeBits.Kind, en}</DisplayString>
+    <Expand>
+      <Item Name="Kind">(clang::BuiltinType::Kind)BuiltinTypeBits.Kind</Item>
+    </Expand>
+  </Type>
+
+  <Type Name="clang::TemplateSpecializationType">
+    <DisplayString IncludeView="arg0" Condition="NumArgs==0"></DisplayString>
+    <DisplayString IncludeView="arg0">{((clang::TemplateArgument *)(this+1))[0],view(cpp)}{*this,view(arg1)}</DisplayString>
+    <DisplayString IncludeView="arg1" Condition="NumArgs==1"></DisplayString>
+    <DisplayString IncludeView="arg1">, {((clang::TemplateArgument *)(this+1))[1],view(cpp)}{*this,view(arg2)}</DisplayString>
+    <DisplayString IncludeView="arg2" Condition="NumArgs==2"></DisplayString>
+    <DisplayString IncludeView="arg2">, {((clang::TemplateArgument *)(this+1))[2],view(cpp)}{*this,view(arg3)}</DisplayString>
+    <DisplayString Condition="(Template.Storage.Val.Val.Value &amp; 3) == 0">
+      {*((clang::TemplateDecl *)((Template.Storage.Val.Val.Value &gt;&gt; 2) &lt;&lt; 2))->TemplatedDecl,view(cpp)}&lt;{*this,view(arg0)}&gt;
+    </DisplayString>
+    <Expand>
+      <Item Name="Template">Template.Storage</Item>
+      <ArrayItems>
+        <Size>NumArgs</Size>
+        <ValuePointer>(clang::TemplateArgument *)(this+1)</ValuePointer>
+      </ArrayItems>
+    </Expand>
+  </Type>
+  <Type Name="clang::IdentifierInfo">
+    <DisplayString Condition="Entry != 0">{((llvm::StringMapEntry&lt;clang::IdentifierInfo *&gt;*)Entry)+1,sb}</DisplayString>
+    <Expand>
+      <Item Condition="Entry != 0" Name="[Identifier]">((llvm::StringMapEntry&lt;clang::IdentifierInfo *&gt;*)Entry)+1,s</Item>
+      <Item Name="Token Kind">(clang::tok::TokenKind)TokenID</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::DeclarationName"> <!-- Low bits of Ptr (PtrMask) select the stored kind; the remaining bits are the payload pointer -->
+    <DisplayString Condition="Ptr == 0" IncludeView="cpp"></DisplayString>
+    <DisplayString Condition="Ptr == 0">Empty</DisplayString>
+    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredIdentifier" IncludeView="cpp">{*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)}</DisplayString>
+    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredIdentifier">{{Identifier ({*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
+    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredObjCZeroArgSelector">{{ObjC Zero Arg Selector ({*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
+    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredObjCOneArgSelector">{{ObjC One Arg Selector ({*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
+    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredDeclarationNameExtra"
+                   IncludeView="cpp">{*(clang::DeclarationNameExtra *)(Ptr &amp; ~PtrMask),view(cpp)}</DisplayString>
+    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredDeclarationNameExtra">{{Extra ({*(clang::DeclarationNameExtra *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
+    <Expand>
+      <Item Condition="(Ptr &amp; PtrMask) == StoredIdentifier" Name="[Identifier]">*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)</Item>
+      <Item Condition="(Ptr &amp; PtrMask) == StoredObjCZeroArgSelector" Name="[ObjC Zero Arg Selector]">*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)</Item>
+      <Item Condition="(Ptr &amp; PtrMask) == StoredObjCOneArgSelector" Name="[ObjC One Arg Selector]">*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)</Item>
+      <Item Condition="(Ptr &amp; PtrMask) == StoredDeclarationNameExtra" Name="[Extra]">(clang::DeclarationNameExtra *)(Ptr &amp; ~PtrMask)</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::DeclarationNameExtra">
+    <DisplayString IncludeView="cpp"
+                   Condition="ExtraKindOrNumArgs &gt;= clang::DeclarationNameExtra::CXXConstructor 
+                   &amp;&amp; ExtraKindOrNumArgs &lt;= clang::DeclarationNameExtra::CXXConversionFunction"
+                   >{((clang::CXXSpecialName *)this)-&gt;Type,view(cpp)}</DisplayString>
+    <DisplayString>{(clang::DeclarationNameExtra::ExtraKind)ExtraKindOrNumArgs,en}{"  ",sb}{*this,view(cpp)}</DisplayString>
+  </Type>
+  <Type Name="clang::Token">
+    <DisplayString Condition="Kind != clang::tok::identifier">{(clang::tok::TokenKind)Kind,en}</DisplayString>
+    <DisplayString Condition="Kind == clang::tok::identifier">{{Identifier ({*(clang::IdentifierInfo *)(PtrData)})}}</DisplayString>
+  </Type>
+  <Type Name="clang::DeclSpec">
+    <DisplayString>[{(clang::DeclSpec::SCS)StorageClassSpec}], [{(clang::TypeSpecifierType)TypeSpecType}]</DisplayString>
+  </Type>
+  <Type Name="clang::PragmaHandler">
+    <DisplayString>{Name,s}</DisplayString>
+  </Type>
+  <Type Name="clang::FileEntry">
+    <DisplayString>{Name,s}</DisplayString>
+  </Type>
+  <Type Name="clang::DirectoryEntry">
+    <DisplayString>{Name,s}</DisplayString>
+  </Type>
+  <Type Name="clang::VarDecl::VarDeclBitfields">
+    <Expand>
+      <Item Name="StorageClass">(clang::StorageClass)SClass</Item>
+      <Item Name="ThreadStorageClass">(clang::ThreadStorageClassSpecifier)TSCSpec</Item>
+      <Item Name="InitStyle">(clang::VarDecl::InitializationStyle)InitStyle</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::DeclaratorDecl">
+    <DisplayString>{DeclType,view(cpp)} {Name,view(cpp)}</DisplayString>
+    <Expand>
+      <Item Name="Name">Name</Item>
+      <Item Name="DeclType">DeclType</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::VarDecl">
+    <DisplayString>{*(DeclaratorDecl*)this,nd}</DisplayString>
+    <Expand>
+      <ExpandedItem>*(DeclaratorDecl*)this,nd</ExpandedItem>
+      <Item Name="VarDeclBits">VarDeclBits</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::ParmVarDecl">
+    <DisplayString>{*(VarDecl*)this,nd}</DisplayString>
+    <Expand>
+      <Item Name="ParmVarDeclBits">ParmVarDeclBits</Item>
+      <ExpandedItem>*(VarDecl*)this,nd</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::FunctionDecl"> <!-- Views retType and parm0..parm5 are chained to render "ret name(params)"; NumParams is read from the FunctionProtoType behind DeclType -->
+    <DisplayString IncludeView="retType">{*(clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType,view(retType)}</DisplayString>
+    <DisplayString IncludeView="parm0" Condition="0 == ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams"></DisplayString>
+    <DisplayString IncludeView="parm0">{*ParamInfo[0]}{*this,view(parm1)nd}</DisplayString>
+    <DisplayString IncludeView="parm1" Condition="1 == ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams"></DisplayString>
+    <DisplayString IncludeView="parm1">, {*ParamInfo[1]}{*this,view(parm2)nd}</DisplayString>
+    <DisplayString IncludeView="parm2" Condition="2 == ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams"></DisplayString>
+    <DisplayString IncludeView="parm2">, {*ParamInfo[2]}{*this,view(parm3)nd}</DisplayString>
+    <DisplayString IncludeView="parm3" Condition="3 == ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams"></DisplayString>
+    <DisplayString IncludeView="parm3">, {*ParamInfo[3]}{*this,view(parm4)nd}</DisplayString>
+    <DisplayString IncludeView="parm4" Condition="4 == ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams"></DisplayString>
+    <DisplayString IncludeView="parm4">, {*ParamInfo[4]}{*this,view(parm5)nd}</DisplayString>
+    <DisplayString IncludeView="parm5" Condition="5 == ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams"></DisplayString>
+    <DisplayString IncludeView="parm5">, /* expand for more params */</DisplayString>
+    <DisplayString>{*this,view(retType)nd} {Name,view(cpp)nd}({*this,view(parm0)nd})</DisplayString>
+    <Expand>
+      <Item Name="ReturnType">*this,view(retType)nd</Item>
+      <Synthetic Name="Parameter Types">
+        <DisplayString>{*this,view(parm0)nd}</DisplayString>
+        <Expand>
+          <ArrayItems>
+            <Size>((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) &amp; ~15))-&gt;BaseType)-&gt;NumParams</Size>
+            <ValuePointer>ParamInfo</ValuePointer>
+          </ArrayItems>
+        </Expand>
+      </Synthetic>
+      <ExpandedItem>*(clang::DeclaratorDecl *)this,nd</ExpandedItem> <!-- expand the declarator base (Name, DeclType); was a cast to clang::Type, which this object is presumably not -->
+    </Expand>
+  </Type>
+  <Type Name="clang::OpaquePtr&lt;clang::QualType&gt;">
+    <DisplayString>{*(clang::QualType *)this}</DisplayString>
+    <Expand>
+      <Item Name="Ptr">*(clang::QualType *)this</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::UnionOpaquePtr&lt;clang::QualType&gt;">
+    <DisplayString>{*(clang::QualType *)this}</DisplayString>
+    <Expand>
+      <Item Name="Ptr">*(clang::QualType *)this</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::OpaquePtr&lt;*&gt;">
+    <DisplayString>{($T1 *)Ptr}</DisplayString>
+    <Expand>
+      <ExpandedItem>($T1 *)Ptr</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::UnionOpaquePtr&lt;*&gt;">
+    <DisplayString>{($T1 *)Ptr}</DisplayString>
+    <Expand>
+      <ExpandedItem>($T1 *)Ptr</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::TemplateParameterList">
+    <DisplayString IncludeView="parm0" Condition="NumParams==0"></DisplayString>
+    <DisplayString IncludeView="parm0">{*((NamedDecl **)(this+1))[0],view(cpp)}{*this,view(parm1)}</DisplayString>
+    <DisplayString IncludeView="parm1" Condition="NumParams==1"></DisplayString>
+    <DisplayString IncludeView="parm1">, {*((NamedDecl **)(this+1))[1],view(cpp)}{*this,view(parm2)}</DisplayString>
+    <DisplayString IncludeView="parm2" Condition="NumParams==2"></DisplayString>
+    <DisplayString IncludeView="parm2">, {*((NamedDecl **)(this+1))[2],view(cpp)}{*this,view(parm3)}</DisplayString>
+    <DisplayString IncludeView="parm3" Condition="NumParams==3"></DisplayString>
+    <DisplayString IncludeView="parm3">, {*((NamedDecl **)(this+1))[3],view(cpp)}{*this,view(parm4)}</DisplayString>
+    <DisplayString IncludeView="parm4" Condition="NumParams==4"></DisplayString>
+    <DisplayString IncludeView="parm4">, {*((NamedDecl **)(this+1))[4],view(cpp)}{*this,view(parm5)}</DisplayString>
+    <DisplayString IncludeView="parm5" Condition="NumParams==5"></DisplayString>
+    <DisplayString IncludeView="parm5">, /* Expand for more params */</DisplayString>
+    <DisplayString>&lt;{*this,view(parm0)}&gt;</DisplayString>
+    <Expand>
+      <ArrayItems>
+        <Size>NumParams</Size>
+      <ValuePointer>(NamedDecl **)(this+1)</ValuePointer>
+      </ArrayItems>
+    </Expand>
+  </Type>
+  <Type Name="clang::Stmt">
+    <DisplayString>{(clang::Stmt::StmtClass)StmtBits.sClass,en}</DisplayString>
+    <Expand>
+      <Item Name="Class">(clang::Stmt::StmtClass)StmtBits.sClass,en</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::Expr">
+    <DisplayString>Expression of class {(clang::Stmt::StmtClass)StmtBits.sClass,en} and type {TR,view(cpp)}</DisplayString>
+  </Type>
+  <Type Name="clang::DeclAccessPair">
+    <DisplayString IncludeView="access" Condition="(Ptr&amp;Mask) == clang::AS_public">public</DisplayString>
+    <DisplayString IncludeView="access" Condition="(Ptr&amp;Mask) == clang::AS_protected">protected</DisplayString>
+    <DisplayString IncludeView="access" Condition="(Ptr&amp;Mask) == clang::AS_private">private</DisplayString>
+    <DisplayString IncludeView="access" Condition="(Ptr&amp;Mask) == clang::AS_none">b</DisplayString>
+    <DisplayString IncludeView="decl">{*(clang::NamedDecl *)(Ptr&amp;~Mask)}</DisplayString>
+    <DisplayString>{*this,view(access)} {*this,view(decl)}</DisplayString>
+  </Type>
+  <Type Name="clang::UnresolvedSet&lt;*&gt;">
+    <DisplayString>{Decls}</DisplayString>
+    <Expand>
+      <ExpandedItem>Decls</ExpandedItem>
+    </Expand>
+  </Type>
+  <Type Name="clang::LookupResult">
+    <DisplayString Condition="ResultKind == clang::LookupResult::Ambiguous">{Ambiguity,en}: {Decls}</DisplayString>
+    <DisplayString>{ResultKind,en}: {Decls}</DisplayString>
+  </Type>
+  <Type Name="clang::ActionResult&lt;*&gt;" IncludeView="packedValidity">
+    <DisplayString Condition="PtrWithInvalid&amp;1">Invalid</DisplayString>
+    <DisplayString Condition="!(PtrWithInvalid&amp;1)">Valid</DisplayString>
+   </Type>
+  <Type Name="clang::ActionResult&lt;*&gt;" IncludeView="unpackedValidity">
+    <DisplayString Condition="Invalid">Invalid</DisplayString>
+    <DisplayString Condition="!Invalid">Valid</DisplayString>
+  </Type>
+  <Type Name="clang::ActionResult&lt;*&gt;" IncludeView="packed">
+    <DisplayString>{*this,view(packedValidity)}: {($T1 *)(PtrWithInvalid&amp;~1)}</DisplayString>
+    <Expand>
+      <Item Name="Invalid">(bool)(PtrWithInvalid&amp;1)</Item>
+      <Item Name="Val">($T1 *)(PtrWithInvalid&amp;~1)</Item>
+    </Expand>
+  </Type>
+  <Type Name="clang::ActionResult&lt;*&gt;" IncludeView="unpacked">
+    <DisplayString>{*this,view(unpackedValidity)}: {Val}</DisplayString>
+  </Type>
+  <Type Name="clang::ActionResult&lt;*&gt;">
+    <DisplayString Condition="$T2">{*this,view(packed)}</DisplayString>
+    <DisplayString Condition="!$T2">{*this,view(unpacked)}</DisplayString>
+    <Expand>
+      <ExpandedItem Condition="$T2">*this,view(packed)</ExpandedItem>
+      <ExpandedItem Condition="!$T2">*this,view(unpacked)</ExpandedItem>
+    </Expand>
+  </Type>
+</AutoVisualizer>
diff --git a/utils/TableGen/ClangAttrEmitter.cpp b/utils/TableGen/ClangAttrEmitter.cpp
index b4ef378..50102af 100644
--- a/utils/TableGen/ClangAttrEmitter.cpp
+++ b/utils/TableGen/ClangAttrEmitter.cpp
@@ -11,24 +11,36 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/StringMatcher.h"
 #include "llvm/TableGen/TableGenBackend.h"
 #include <algorithm>
+#include <cassert>
 #include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <map>
 #include <memory>
 #include <set>
 #include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
 namespace {
+
 class FlattenedSpelling {
   std::string V, N, NS;
   bool K;
@@ -54,6 +66,7 @@
   const std::string &nameSpace() const { return NS; }
   bool knownToGCC() const { return K; }
 };
+
 } // end anonymous namespace
 
 static std::vector<FlattenedSpelling>
@@ -81,22 +94,26 @@
     .Case("TypeSourceInfo *", "GetTypeSourceInfo(F, Record, Idx)")
     .Case("Expr *", "ReadExpr(F)")
     .Case("IdentifierInfo *", "GetIdentifierInfo(F, Record, Idx)")
-    .Case("std::string", "ReadString(Record, Idx)")
+    .Case("StringRef", "ReadString(Record, Idx)")
     .Default("Record[Idx++]");
 }
 
+// Get a type that is suitable for storing an object of the specified type.
+static StringRef getStorageType(StringRef type) {
+  return StringSwitch<StringRef>(type)
+    .Case("StringRef", "std::string")
+    .Default(type);
+}
+
 // Assumes that the way to get the value is SA->getname()
 static std::string WritePCHRecord(StringRef type, StringRef name) {
-  return StringSwitch<std::string>(type)
-    .EndsWith("Decl *", "AddDeclRef(" + std::string(name) +
-                        ", Record);\n")
-    .Case("TypeSourceInfo *",
-          "AddTypeSourceInfo(" + std::string(name) + ", Record);\n")
+  return "Record." + StringSwitch<std::string>(type)
+    .EndsWith("Decl *", "AddDeclRef(" + std::string(name) + ");\n")
+    .Case("TypeSourceInfo *", "AddTypeSourceInfo(" + std::string(name) + ");\n")
     .Case("Expr *", "AddStmt(" + std::string(name) + ");\n")
-    .Case("IdentifierInfo *", 
-          "AddIdentifierRef(" + std::string(name) + ", Record);\n")
-    .Case("std::string", "AddString(" + std::string(name) + ", Record);\n")
-    .Default("Record.push_back(" + std::string(name) + ");\n");
+    .Case("IdentifierInfo *", "AddIdentifierRef(" + std::string(name) + ");\n")
+    .Case("StringRef", "AddString(" + std::string(name) + ");\n")
+    .Default("push_back(" + std::string(name) + ");\n");
 }
 
 // Normalize attribute name by removing leading and trailing
@@ -162,6 +179,7 @@
 }
 
 namespace {
+
   class Argument {
     std::string lowerName, upperName;
     StringRef attrName;
@@ -176,6 +194,11 @@
         lowerName[0] = std::tolower(lowerName[0]);
         upperName[0] = std::toupper(upperName[0]);
       }
+      // Work around MinGW's macro definition of 'interface' to 'struct'. We
+      // have an attribute argument called 'Interface', so only the lower case
+      // name conflicts with the macro definition.
+      if (lowerName == "interface")
+        lowerName = "interface_";
     }
     virtual ~Argument() = default;
 
@@ -223,8 +246,7 @@
 
   public:
     SimpleArgument(const Record &Arg, StringRef Attr, std::string T)
-      : Argument(Arg, Attr), type(T)
-    {}
+        : Argument(Arg, Attr), type(std::move(T)) {}
 
     std::string getType() const { return type; }
 
@@ -233,35 +255,45 @@
       OS << "    return " << getLowerName() << ";\n";
       OS << "  }";
     }
+
     void writeCloneArgs(raw_ostream &OS) const override {
       OS << getLowerName();
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       OS << "A->get" << getUpperName() << "()";
     }
+
     void writeCtorInitializers(raw_ostream &OS) const override {
       OS << getLowerName() << "(" << getUpperName() << ")";
     }
+
     void writeCtorDefaultInitializers(raw_ostream &OS) const override {
       OS << getLowerName() << "()";
     }
+
     void writeCtorParameters(raw_ostream &OS) const override {
       OS << type << " " << getUpperName();
     }
+
     void writeDeclarations(raw_ostream &OS) const override {
       OS << type << " " << getLowerName() << ";";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
       std::string read = ReadPCHRecord(type);
       OS << "    " << type << " " << getLowerName() << " = " << read << ";\n";
     }
+
     void writePCHReadArgs(raw_ostream &OS) const override {
       OS << getLowerName();
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
       OS << "    " << WritePCHRecord(type, "SA->get" +
                                            std::string(getUpperName()) + "()");
     }
+
     void writeValue(raw_ostream &OS) const override {
       if (type == "FunctionDecl *") {
         OS << "\" << get" << getUpperName()
@@ -274,6 +306,7 @@
         OS << "\" << get" << getUpperName() << "() << \"";
       }
     }
+
     void writeDump(raw_ostream &OS) const override {
       if (type == "FunctionDecl *") {
         OS << "    OS << \" \";\n";
@@ -309,7 +342,12 @@
       SimpleArgument::writeAccessors(OS);
 
       OS << "\n\n  static const " << getType() << " Default" << getUpperName()
-         << " = " << Default << ";";
+         << " = ";
+      if (getType() == "bool")
+        OS << (Default != 0 ? "true" : "false");
+      else
+        OS << Default;
+      OS << ";";
     }
   };
 
@@ -337,45 +375,57 @@
          << getLowerName() << "Length);\n";
       OS << "  }";
     }
+
     void writeCloneArgs(raw_ostream &OS) const override {
       OS << "get" << getUpperName() << "()";
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       OS << "A->get" << getUpperName() << "()";
     }
+
     void writeCtorBody(raw_ostream &OS) const override {
       OS << "      if (!" << getUpperName() << ".empty())\n";
       OS << "        std::memcpy(" << getLowerName() << ", " << getUpperName()
-         << ".data(), " << getLowerName() << "Length);";
+         << ".data(), " << getLowerName() << "Length);\n";
     }
+
     void writeCtorInitializers(raw_ostream &OS) const override {
       OS << getLowerName() << "Length(" << getUpperName() << ".size()),"
          << getLowerName() << "(new (Ctx, 1) char[" << getLowerName()
          << "Length])";
     }
+
     void writeCtorDefaultInitializers(raw_ostream &OS) const override {
       OS << getLowerName() << "Length(0)," << getLowerName() << "(nullptr)";
     }
+
     void writeCtorParameters(raw_ostream &OS) const override {
       OS << "llvm::StringRef " << getUpperName();
     }
+
     void writeDeclarations(raw_ostream &OS) const override {
       OS << "unsigned " << getLowerName() << "Length;\n";
       OS << "char *" << getLowerName() << ";";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
       OS << "    std::string " << getLowerName()
          << "= ReadString(Record, Idx);\n";
     }
+
     void writePCHReadArgs(raw_ostream &OS) const override {
       OS << getLowerName();
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
-      OS << "    AddString(SA->get" << getUpperName() << "(), Record);\n";
+      OS << "    Record.AddString(SA->get" << getUpperName() << "());\n";
     }
+
     void writeValue(raw_ostream &OS) const override {
       OS << "\\\"\" << get" << getUpperName() << "() << \"\\\"";
     }
+
     void writeDump(raw_ostream &OS) const override {
       OS << "    OS << \" \\\"\" << SA->get" << getUpperName()
          << "() << \"\\\"\";\n";
@@ -407,6 +457,7 @@
       OS << "    return " << getLowerName() << "Type;\n";
       OS << "  }";
     }
+
     void writeAccessorDefinitions(raw_ostream &OS) const override {
       OS << "bool " << getAttrName() << "Attr::is" << getUpperName()
          << "Dependent() const {\n";
@@ -434,16 +485,19 @@
       OS << "    return 0; // FIXME\n";
       OS << "}\n";
     }
+
     void writeCloneArgs(raw_ostream &OS) const override {
       OS << "is" << getLowerName() << "Expr, is" << getLowerName()
          << "Expr ? static_cast<void*>(" << getLowerName()
          << "Expr) : " << getLowerName()
          << "Type";
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       // FIXME: move the definition in Sema::InstantiateAttrs to here.
       // In the meantime, aligned attributes are cloned.
     }
+
     void writeCtorBody(raw_ostream &OS) const override {
       OS << "    if (is" << getLowerName() << "Expr)\n";
       OS << "       " << getLowerName() << "Expr = reinterpret_cast<Expr *>("
@@ -451,20 +505,25 @@
       OS << "    else\n";
       OS << "       " << getLowerName()
          << "Type = reinterpret_cast<TypeSourceInfo *>(" << getUpperName()
-         << ");";
+         << ");\n";
     }
+
     void writeCtorInitializers(raw_ostream &OS) const override {
       OS << "is" << getLowerName() << "Expr(Is" << getUpperName() << "Expr)";
     }
+
     void writeCtorDefaultInitializers(raw_ostream &OS) const override {
       OS << "is" << getLowerName() << "Expr(false)";
     }
+
     void writeCtorParameters(raw_ostream &OS) const override {
       OS << "bool Is" << getUpperName() << "Expr, void *" << getUpperName();
     }
+
     void writeImplicitCtorArgs(raw_ostream &OS) const override {
       OS << "Is" << getUpperName() << "Expr, " << getUpperName();
     }
+
     void writeDeclarations(raw_ostream &OS) const override {
       OS << "bool is" << getLowerName() << "Expr;\n";
       OS << "union {\n";
@@ -472,9 +531,11 @@
       OS << "TypeSourceInfo *" << getLowerName() << "Type;\n";
       OS << "};";
     }
+
     void writePCHReadArgs(raw_ostream &OS) const override {
       OS << "is" << getLowerName() << "Expr, " << getLowerName() << "Ptr";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
       OS << "    bool is" << getLowerName() << "Expr = Record[Idx++];\n";
       OS << "    void *" << getLowerName() << "Ptr;\n";
@@ -484,14 +545,16 @@
       OS << "      " << getLowerName()
          << "Ptr = GetTypeSourceInfo(F, Record, Idx);\n";
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
       OS << "    Record.push_back(SA->is" << getUpperName() << "Expr());\n";
       OS << "    if (SA->is" << getUpperName() << "Expr())\n";
-      OS << "      AddStmt(SA->get" << getUpperName() << "Expr());\n";
+      OS << "      Record.AddStmt(SA->get" << getUpperName() << "Expr());\n";
       OS << "    else\n";
-      OS << "      AddTypeSourceInfo(SA->get" << getUpperName()
-         << "Type(), Record);\n";
+      OS << "      Record.AddTypeSourceInfo(SA->get" << getUpperName()
+         << "Type());\n";
     }
+
     void writeValue(raw_ostream &OS) const override {
       OS << "\";\n";
       // The aligned attribute argument expression is optional.
@@ -500,8 +563,9 @@
       OS << "      " << getLowerName() << "Expr->printPretty(OS, nullptr, Policy);\n";
       OS << "    OS << \"";
     }
-    void writeDump(raw_ostream &OS) const override {
-    }
+
+    void writeDump(raw_ostream &OS) const override {}
+
     void writeDumpChildren(raw_ostream &OS) const override {
       OS << "    if (SA->is" << getUpperName() << "Expr())\n";
       OS << "      dumpStmt(SA->get" << getUpperName() << "Expr());\n";
@@ -509,6 +573,7 @@
       OS << "      dumpType(SA->get" << getUpperName()
          << "Type()->getType());\n";
     }
+
     void writeHasChildren(raw_ostream &OS) const override {
       OS << "SA->is" << getUpperName() << "Expr()";
     }
@@ -525,10 +590,13 @@
 
   public:
     VariadicArgument(const Record &Arg, StringRef Attr, std::string T)
-        : Argument(Arg, Attr), Type(T), ArgName(getLowerName().str() + "_"),
-          ArgSizeName(ArgName + "Size"), RangeName(getLowerName()) {}
+        : Argument(Arg, Attr), Type(std::move(T)),
+          ArgName(getLowerName().str() + "_"), ArgSizeName(ArgName + "Size"),
+          RangeName(getLowerName()) {}
 
-    std::string getType() const { return Type; }
+    const std::string &getType() const { return Type; }
+    const std::string &getArgName() const { return ArgName; }
+    const std::string &getArgSizeName() const { return ArgSizeName; }
     bool isVariadic() const override { return true; }
 
     void writeAccessors(raw_ostream &OS) const override {
@@ -547,56 +615,87 @@
          << "() const { return llvm::make_range(" << BeginFn << ", " << EndFn
          << "); }\n";
     }
+
     void writeCloneArgs(raw_ostream &OS) const override {
       OS << ArgName << ", " << ArgSizeName;
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       // This isn't elegant, but we have to go through public methods...
       OS << "A->" << getLowerName() << "_begin(), "
          << "A->" << getLowerName() << "_size()";
     }
+
     void writeCtorBody(raw_ostream &OS) const override {
       OS << "    std::copy(" << getUpperName() << ", " << getUpperName()
-         << " + " << ArgSizeName << ", " << ArgName << ");";
+         << " + " << ArgSizeName << ", " << ArgName << ");\n";
     }
+
     void writeCtorInitializers(raw_ostream &OS) const override {
       OS << ArgSizeName << "(" << getUpperName() << "Size), "
          << ArgName << "(new (Ctx, 16) " << getType() << "["
          << ArgSizeName << "])";
     }
+
     void writeCtorDefaultInitializers(raw_ostream &OS) const override {
       OS << ArgSizeName << "(0), " << ArgName << "(nullptr)";
     }
+
     void writeCtorParameters(raw_ostream &OS) const override {
       OS << getType() << " *" << getUpperName() << ", unsigned "
          << getUpperName() << "Size";
     }
+
     void writeImplicitCtorArgs(raw_ostream &OS) const override {
       OS << getUpperName() << ", " << getUpperName() << "Size";
     }
+
     void writeDeclarations(raw_ostream &OS) const override {
       OS << "  unsigned " << ArgSizeName << ";\n";
       OS << "  " << getType() << " *" << ArgName << ";";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
-      OS << "  unsigned " << getLowerName() << "Size = Record[Idx++];\n";
-      OS << "  SmallVector<" << Type << ", 4> " << getLowerName()
-         << ";\n";
-      OS << "  " << getLowerName() << ".reserve(" << getLowerName()
+      OS << "    unsigned " << getLowerName() << "Size = Record[Idx++];\n";
+      OS << "    SmallVector<" << getType() << ", 4> "
+         << getLowerName() << ";\n";
+      OS << "    " << getLowerName() << ".reserve(" << getLowerName()
          << "Size);\n";
-      OS << "    for (unsigned i = " << getLowerName() << "Size; i; --i)\n";
-      
+
+      // If we can't store the values in the current type (if it's something
+      // like StringRef), store them in a different type and convert the
+      // container afterwards.
+      std::string StorageType = getStorageType(getType());
+      std::string StorageName = getLowerName();
+      if (StorageType != getType()) {
+        StorageName += "Storage";
+        OS << "    SmallVector<" << StorageType << ", 4> "
+           << StorageName << ";\n";
+        OS << "    " << StorageName << ".reserve(" << getLowerName()
+           << "Size);\n";
+      }
+
+      OS << "    for (unsigned i = 0; i != " << getLowerName() << "Size; ++i)\n";
       std::string read = ReadPCHRecord(Type);
-      OS << "    " << getLowerName() << ".push_back(" << read << ");\n";
+      OS << "      " << StorageName << ".push_back(" << read << ");\n";
+
+      if (StorageType != getType()) {
+        OS << "    for (unsigned i = 0; i != " << getLowerName() << "Size; ++i)\n";
+        OS << "      " << getLowerName() << ".push_back("
+           << StorageName << "[i]);\n";
+      }
     }
+
     void writePCHReadArgs(raw_ostream &OS) const override {
       OS << getLowerName() << ".data(), " << getLowerName() << "Size";
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
       OS << "    Record.push_back(SA->" << getLowerName() << "_size());\n";
       OS << "    for (auto &Val : SA->" << RangeName << "())\n";
       OS << "      " << WritePCHRecord(Type, "Val");
     }
+
     void writeValue(raw_ostream &OS) const override {
       OS << "\";\n";
       OS << "  bool isFirst = true;\n"
@@ -607,6 +706,7 @@
       OS << "  }\n";
       OS << "  OS << \"";
     }
+
     void writeDump(raw_ostream &OS) const override {
       OS << "    for (const auto &Val : SA->" << RangeName << "())\n";
       OS << "      OS << \" \" << Val;\n";
@@ -649,9 +749,11 @@
       OS << "    return " << getLowerName() << ";\n";
       OS << "  }";
     }
+
     void writeCloneArgs(raw_ostream &OS) const override {
       OS << getLowerName();
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       OS << "A->get" << getUpperName() << "()";
     }
@@ -678,17 +780,21 @@
       OS << "private:\n";
       OS << "  " << type << " " << getLowerName() << ";";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
       OS << "    " << getAttrName() << "Attr::" << type << " " << getLowerName()
          << "(static_cast<" << getAttrName() << "Attr::" << type
          << ">(Record[Idx++]));\n";
     }
+
     void writePCHReadArgs(raw_ostream &OS) const override {
       OS << getLowerName();
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
       OS << "Record.push_back(SA->get" << getUpperName() << "());\n";
     }
+
     void writeValue(raw_ostream &OS) const override {
       // FIXME: this isn't 100% correct -- some enum arguments require printing
       // as a string literal, while others require printing as an identifier.
@@ -696,6 +802,7 @@
       OS << "\\\"\" << " << getAttrName() << "Attr::Convert" << type << "ToStr(get"
          << getUpperName() << "()) << \"\\\"";
     }
+
     void writeDump(raw_ostream &OS) const override {
       OS << "    switch(SA->get" << getUpperName() << "()) {\n";
       for (const auto &I : uniques) {
@@ -784,6 +891,7 @@
       
       VariadicArgument::writeDeclarations(OS);
     }
+
     void writeDump(raw_ostream &OS) const override {
       OS << "    for (" << getAttrName() << "Attr::" << getLowerName()
          << "_iterator I = SA->" << getLowerName() << "_begin(), E = SA->"
@@ -797,6 +905,7 @@
       OS << "      }\n";
       OS << "    }\n";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
       OS << "    unsigned " << getLowerName() << "Size = Record[Idx++];\n";
       OS << "    SmallVector<" << QualifiedTypeName << ", 4> " << getLowerName()
@@ -807,6 +916,7 @@
       OS << "      " << getLowerName() << ".push_back(" << "static_cast<"
          << QualifiedTypeName << ">(Record[Idx++]));\n";
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
       OS << "    Record.push_back(SA->" << getLowerName() << "_size());\n";
       OS << "    for (" << getAttrName() << "Attr::" << getLowerName()
@@ -814,6 +924,7 @@
          << getLowerName() << "_end(); i != e; ++i)\n";
       OS << "      " << WritePCHRecord(QualifiedTypeName, "(*i)");
     }
+
     void writeConversion(raw_ostream &OS) const {
       OS << "  static bool ConvertStrTo" << type << "(StringRef Val, ";
       OS << type << " &Out) {\n";
@@ -859,37 +970,48 @@
       OS << "    " << getLowerName() << " = V;\n";
       OS << "  }";
     }
+
     void writeCloneArgs(raw_ostream &OS) const override {
       OS << "get" << getUpperName() << "()";
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       OS << "A->get" << getUpperName() << "()";
     }
+
     void writeCtorInitializers(raw_ostream &OS) const override {
       OS << getLowerName() << "(" << getUpperName() << ")";
     }
+
     void writeCtorDefaultInitializers(raw_ostream &OS) const override {
       OS << getLowerName() << "()";
     }
+
     void writeCtorParameters(raw_ostream &OS) const override {
       OS << "VersionTuple " << getUpperName();
     }
+
     void writeDeclarations(raw_ostream &OS) const override {
       OS << "VersionTuple " << getLowerName() << ";\n";
     }
+
     void writePCHReadDecls(raw_ostream &OS) const override {
       OS << "    VersionTuple " << getLowerName()
          << "= ReadVersionTuple(Record, Idx);\n";
     }
+
     void writePCHReadArgs(raw_ostream &OS) const override {
       OS << getLowerName();
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
-      OS << "    AddVersionTuple(SA->get" << getUpperName() << "(), Record);\n";
+      OS << "    Record.AddVersionTuple(SA->get" << getUpperName() << "());\n";
     }
+
     void writeValue(raw_ostream &OS) const override {
       OS << getLowerName() << "=\" << get" << getUpperName() << "() << \"";
     }
+
     void writeDump(raw_ostream &OS) const override {
       OS << "    OS << \" \" << SA->get" << getUpperName() << "();\n";
     }
@@ -928,6 +1050,7 @@
     void writeDumpChildren(raw_ostream &OS) const override {
       OS << "    dumpStmt(SA->get" << getUpperName() << "());\n";
     }
+
     void writeHasChildren(raw_ostream &OS) const override { OS << "true"; }
   };
 
@@ -993,8 +1116,21 @@
   class VariadicStringArgument : public VariadicArgument {
   public:
     VariadicStringArgument(const Record &Arg, StringRef Attr)
-      : VariadicArgument(Arg, Attr, "std::string")
+      : VariadicArgument(Arg, Attr, "StringRef")
     {}
+
+    void writeCtorBody(raw_ostream &OS) const override {
+      OS << "    for (size_t I = 0, E = " << getArgSizeName() << "; I != E;\n"
+            "         ++I) {\n"
+            "      StringRef Ref = " << getUpperName() << "[I];\n"
+            "      if (!Ref.empty()) {\n"
+            "        char *Mem = new (Ctx, 1) char[Ref.size()];\n"
+            "        std::memcpy(Mem, Ref.data(), Ref.size());\n"
+            "        " << getArgName() << "[I] = StringRef(Mem, Ref.size());\n"
+            "      }\n"
+            "    }\n";
+    }
+
     void writeValueImpl(raw_ostream &OS) const override {
       OS << "    OS << \"\\\"\" << Val << \"\\\"\";\n";
     }
@@ -1014,14 +1150,17 @@
       OS << "    return " << getLowerName() << ";\n";
       OS << "  }";
     }
+
     void writeTemplateInstantiationArgs(raw_ostream &OS) const override {
       OS << "A->get" << getUpperName() << "Loc()";
     }
+
     void writePCHWrite(raw_ostream &OS) const override {
       OS << "    " << WritePCHRecord(
           getType(), "SA->get" + std::string(getUpperName()) + "Loc()");
     }
   };
+
 } // end anonymous namespace
 
 static std::unique_ptr<Argument>
@@ -1073,7 +1212,7 @@
   if (!Ptr) {
     // Search in reverse order so that the most-derived type is handled first.
     ArrayRef<std::pair<Record*, SMRange>> Bases = Search->getSuperClasses();
-    for (const auto &Base : llvm::make_range(Bases.rbegin(), Bases.rend())) {
+    for (const auto &Base : llvm::reverse(Bases)) {
       if ((Ptr = createArgument(Arg, Attr, Base.first)))
         break;
     }
@@ -1090,6 +1229,7 @@
 
 static void writeAvailabilityValue(raw_ostream &OS) {
   OS << "\" << getPlatform()->getName();\n"
+     << "  if (getStrict()) OS << \", strict\";\n"
      << "  if (!getIntroduced().empty()) OS << \", introduced=\" << getIntroduced();\n"
      << "  if (!getDeprecated().empty()) OS << \", deprecated=\" << getDeprecated();\n"
      << "  if (!getObsoleted().empty()) OS << \", obsoleted=\" << getObsoleted();\n"
@@ -1314,10 +1454,10 @@
   unsigned Idx = 0;
   for (auto I = Spellings.begin(), E = Spellings.end(); I != E; ++I, ++Idx) {
     const FlattenedSpelling &S = *I;
-    std::string Variety = S.variety();
-    std::string Spelling = S.name();
-    std::string Namespace = S.nameSpace();
-    std::string EnumName = "";
+    const std::string &Variety = S.variety();
+    const std::string &Spelling = S.name();
+    const std::string &Namespace = S.nameSpace();
+    std::string EnumName;
 
     EnumName += (Variety + "_");
     if (!Namespace.empty())
@@ -1490,7 +1630,7 @@
     ArrayRef<std::pair<Record *, SMRange>> Supers = R.getSuperClasses();
     assert(!Supers.empty() && "Forgot to specify a superclass for the attr");
     std::string SuperName;
-    for (const auto &Super : llvm::make_range(Supers.rbegin(), Supers.rend())) {
+    for (const auto &Super : llvm::reverse(Supers)) {
       const Record *R = Super.first;
       if (R->getName() != "TargetSpecificAttr" && SuperName.empty())
         SuperName = R->getName();
@@ -1517,7 +1657,7 @@
       }
     }
 
-    OS << "\npublic:\n";
+    OS << "public:\n";
 
     std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(R);
 
@@ -1587,8 +1727,8 @@
 
       OS << "             )\n";
       OS << "    : " << SuperName << "(attr::" << R.getName() << ", R, SI, "
-         << R.getValueAsBit("LateParsed") << ", "
-         << R.getValueAsBit("DuplicatesAllowedWhileMerging") << ")\n";
+         << ( R.getValueAsBit("LateParsed") ? "true" : "false" ) << ", "
+         << ( R.getValueAsBit("DuplicatesAllowedWhileMerging") ? "true" : "false" ) << ")\n";
 
       for (auto const &ai : Args) {
         OS << "              , ";
@@ -1605,10 +1745,8 @@
       for (auto const &ai : Args) {
         if (!shouldEmitArg(ai)) continue;
         ai->writeCtorBody(OS);
-        OS << "\n";
       }
       OS << "  }\n\n";
-
     };
 
     // Emit a constructor that includes all the arguments.
@@ -1751,14 +1889,17 @@
 }
 
 namespace {
+
   struct AttrClassDescriptor {
     const char * const MacroName;
     const char * const TableGenName;
   };
-}
+
+} // end anonymous namespace
 
 static const AttrClassDescriptor AttrClassDescriptors[] = {
   { "ATTR", "Attr" },
+  { "STMT_ATTR", "StmtAttr" },
   { "INHERITABLE_ATTR", "InheritableAttr" },
   { "INHERITABLE_PARAM_ATTR", "InheritableParamAttr" },
   { "PARAMETER_ABI_ATTR", "ParameterABIAttr" }
@@ -1773,6 +1914,7 @@
 }
 
 namespace {
+
   /// A class of attributes.
   struct AttrClass {
     const AttrClassDescriptor &Descriptor;
@@ -1848,6 +1990,7 @@
   /// The entire hierarchy of attribute classes.
   class AttrClassHierarchy {
     std::vector<std::unique_ptr<AttrClass>> Classes;
+
   public:
     AttrClassHierarchy(RecordKeeper &Records) {
       // Find records for all the classes.
@@ -1920,9 +2063,11 @@
       return nullptr;
     }
   };
-}
+
+} // end anonymous namespace
 
 namespace clang {
+
 // Emits the enumeration list for attributes.
 void EmitClangAttrList(RecordKeeper &Records, raw_ostream &OS) {
   emitSourceFileHeader("List of all attributes that Clang recognizes", OS);
@@ -2158,7 +2303,7 @@
   for (auto *R : Attrs) {
     std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(*R);
     for (const auto &SI : Spellings) {
-      std::string Variety = SI.variety();
+      const std::string &Variety = SI.variety();
       if (Variety == "GNU")
         GNU.push_back(R);
       else if (Variety == "Declspec")
@@ -2508,6 +2653,15 @@
       return "(S.getLangOpts().CPlusPlus ? ExpectedFunctionVariableOrClass : "
                                            "ExpectedVariableOrFunction)";
 
+    case Func | Var | Class | ObjCInterface:
+      return "(S.getLangOpts().CPlusPlus"
+             "     ? ((S.getLangOpts().ObjC1 || S.getLangOpts().ObjC2)"
+             "            ? ExpectedFunctionVariableClassOrObjCInterface"
+             "            : ExpectedFunctionVariableOrClass)"
+             "     : ((S.getLangOpts().ObjC1 || S.getLangOpts().ObjC2)"
+             "            ? ExpectedFunctionVariableOrObjCInterface"
+             "            : ExpectedVariableOrFunction))";
+
     case ObjCMethod | ObjCProp: return "ExpectedMethodOrProperty";
     case ObjCProtocol | ObjCInterface:
       return "ExpectedObjectiveCInterfaceOrProtocol";
@@ -2804,6 +2958,7 @@
     SS << ", " << I->second->getValueAsBit("HasCustomParsing");
     SS << ", " << I->second->isSubClassOf("TargetSpecificAttr");
     SS << ", " << I->second->isSubClassOf("TypeAttr");
+    SS << ", " << I->second->isSubClassOf("StmtAttr");
     SS << ", " << IsKnownToGCC(*I->second);
     SS << ", " << GenerateAppertainsTo(*I->second, OS);
     SS << ", " << GenerateLangOptRequirements(*I->second, OS);
@@ -2857,9 +3012,10 @@
 
       std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(Attr);
       for (const auto &S : Spellings) {
-        std::string RawSpelling = S.name();
+        const std::string &RawSpelling = S.name();
         std::vector<StringMatcher::StringPair> *Matches = nullptr;
-        std::string Spelling, Variety = S.variety();
+        std::string Spelling;
+        const std::string &Variety = S.variety();
         if (Variety == "CXX11") {
           Matches = &CXX11;
           Spelling += S.nameSpace();
diff --git a/utils/TableGen/ClangDiagnosticsEmitter.cpp b/utils/TableGen/ClangDiagnosticsEmitter.cpp
index bbc2bdb..dfb715e 100644
--- a/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -14,14 +14,11 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PointerUnion.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/StringToOffsetTable.h"
@@ -30,7 +27,6 @@
 #include <cctype>
 #include <functional>
 #include <map>
-#include <set>
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
diff --git a/utils/TableGen/Makefile b/utils/TableGen/Makefile
deleted file mode 100644
index 1fde852..0000000
--- a/utils/TableGen/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-##===- utils/TableGen/Makefile -----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-TOOLNAME = clang-tblgen
-USEDLIBS = LLVMTableGen.a LLVMSupport.a
-
-# This tool has no plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-include $(LEVEL)/Makefile.common
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index a298cb1..75dc0ed 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -26,10 +26,8 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
@@ -40,6 +38,7 @@
 #include <map>
 #include <sstream>
 #include <string>
+#include <utility>
 #include <vector>
 using namespace llvm;
 
@@ -146,9 +145,10 @@
         NoManglingQ(false), Bitwidth(0), ElementBitwidth(0), NumVectors(0) {}
 
   Type(TypeSpec TS, char CharMod)
-      : TS(TS), Float(false), Signed(false), Immediate(false), Void(false),
-        Poly(false), Constant(false), Pointer(false), ScalarForMangling(false),
-        NoManglingQ(false), Bitwidth(0), ElementBitwidth(0), NumVectors(0) {
+      : TS(std::move(TS)), Float(false), Signed(false), Immediate(false),
+        Void(false), Poly(false), Constant(false), Pointer(false),
+        ScalarForMangling(false), NoManglingQ(false), Bitwidth(0),
+        ElementBitwidth(0), NumVectors(0) {
     applyModifier(CharMod);
   }
 
@@ -257,7 +257,7 @@
 
 public:
   Variable() : T(Type::getVoid()), N("") {}
-  Variable(Type T, std::string N) : T(T), N(N) {}
+  Variable(Type T, std::string N) : T(std::move(T)), N(std::move(N)) {}
 
   Type getType() const { return T; }
   std::string getName() const { return "__" + N; }
@@ -1195,12 +1195,12 @@
     emitNewLine();
 
     for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) {
-      OS << "  " << Dest.getName() << ".val[" << utostr(K) << "] = "
+      OS << "  " << Dest.getName() << ".val[" << K << "] = "
          << "__builtin_shufflevector("
-         << Src.getName() << ".val[" << utostr(K) << "], "
-         << Src.getName() << ".val[" << utostr(K) << "]";
+         << Src.getName() << ".val[" << K << "], "
+         << Src.getName() << ".val[" << K << "]";
       for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
-        OS << ", " << utostr(J);
+        OS << ", " << J;
       OS << ");";
       emitNewLine();
     }
@@ -1208,7 +1208,7 @@
     OS << "  " << Dest.getName()
        << " = __builtin_shufflevector(" << Src.getName() << ", " << Src.getName();
     for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
-      OS << ", " << utostr(J);
+      OS << ", " << J;
     OS << ");";
     emitNewLine();
   }
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 724b0e1..ef6ad3a 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -13,7 +13,6 @@
 
 #include "TableGenBackends.h" // Declares all backends.
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/TableGen/Error.h"
@@ -241,7 +240,7 @@
 }
 
 int main(int argc, char **argv) {
-  sys::PrintStackTraceOnErrorSignal();
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
   PrettyStackTraceProgram X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv);
 
diff --git a/utils/VtableTest/Makefile b/utils/VtableTest/Makefile
deleted file mode 100644
index dd615ae..0000000
--- a/utils/VtableTest/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-GXX := llvm-g++-4.2
-CLANGXX := clang++
-
-all: one
-
-test.cc: gen.cc
-	g++ gen.cc -o gen
-	./gen >test.cc
-
-test-gcc.sum: test.cc
-	time $(GXX) test.cc -o test-gcc.s -S
-	$(GXX) test-gcc.s -o test-gcc
-	./test-gcc >test-gcc.sum
-
-test-clang.sum: test.cc
-	time $(CLANGXX) test.cc -o test-clang.s -S
-	$(CLANGXX) test-clang.s -o test-clang
-	./test-clang >test-clang.sum
-
-one: test-gcc.sum test-clang.sum
-	cmp test-gcc.sum test-clang.sum
-
-clean:
-	rm -f gen test-gcc test-clang test.cc test-gcc.sum test-clang.sum test-gcc.s test-clang.s
diff --git a/utils/clang.natvis b/utils/clang.natvis
deleted file mode 100644
index a0004e9..0000000
--- a/utils/clang.natvis
+++ /dev/null
@@ -1,68 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!--
-Visual Studio 2012 Native Debugging Visualizers for LLVM
-
-Put this file into "%USERPROFILE%\Documents\Visual Studio 2012\Visualizers"
-or create a symbolic link so it updates automatically.
--->
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-  <Type Name="clang::Type">
-    <DisplayString Condition="(clang::Type::TypeClass)TypeBits.TC == clang::Type::Builtin">Builtin Type={(clang::BuiltinType::Kind)BuiltinTypeBits.Kind}</DisplayString>
-    <DisplayString Condition="(clang::Type::TypeClass)TypeBits.TC == clang::Type::Attributed">Modified Type={((clang::AttributedType*)this)->ModifiedType} Attribute={(clang::AttributedType::Kind)AttributedTypeBits.AttrKind}</DisplayString>
-    <DisplayString>Type Class={(clang::Type::TypeClass)TypeBits.TC}</DisplayString>
-  </Type>
-  <Type Name="clang::QualType">
-    <DisplayString>{((clang::ExtQualsTypeCommonBase *)(((uintptr_t)Value.Value) &amp; ~(uintptr_t)((1 &lt;&lt; clang::TypeAlignmentInBits) - 1)))-&gt;BaseType}</DisplayString>
-  </Type>
-  <Type Name="clang::IdentifierInfo">
-    <DisplayString Condition="Entry != 0">({((llvm::StringMapEntry&lt;clang::IdentifierInfo *&gt;*)Entry)+1,s})</DisplayString>
-    <Expand>
-      <Item Condition="Entry != 0" Name="[Identifier]">((llvm::StringMapEntry&lt;clang::IdentifierInfo *&gt;*)Entry)+1,s</Item>
-      <Item Name="Token Kind">(clang::tok::TokenKind)TokenID</Item>
-    </Expand>
-  </Type>
-  <Type Name="clang::DeclarationName">
-    <DisplayString Condition="Ptr == 0">Empty</DisplayString>
-    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredIdentifier">{{Identifier ({*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
-    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredObjCZeroArgSelector">{{ObjC Zero Arg Selector (*{(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
-    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredObjCOneArgSelector">{{ObjC One Arg Selector (*{(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)})}}</DisplayString>
-    <DisplayString Condition="(Ptr &amp; PtrMask) == StoredDeclarationNameExtra">{{Extra ({(clang::DeclarationNameExtra::ExtraKind)((clang::DeclarationNameExtra *)(Ptr &amp; ~PtrMask))-&gt;ExtraKindOrNumArgs})}}</DisplayString>
-    <Expand>
-      <Item Condition="(Ptr &amp; PtrMask) == StoredIdentifier" Name="[Identifier]">*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)</Item>
-      <Item Condition="(Ptr &amp; PtrMask) == StoredObjCZeroArgSelector" Name="[ObjC Zero Arg Selector]">*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)</Item>
-      <Item Condition="(Ptr &amp; PtrMask) == StoredObjCOneArgSelector" Name="[ObjC One Arg Selector]">*(clang::IdentifierInfo *)(Ptr &amp; ~PtrMask)</Item>
-      <Item Condition="(Ptr &amp; PtrMask) == StoredDeclarationNameExtra" Name="[Extra]">(clang::DeclarationNameExtra::ExtraKind)((clang::DeclarationNameExtra *)(Ptr &amp; ~PtrMask))-&gt;ExtraKindOrNumArgs</Item>
-    </Expand>
-  </Type>
-  <Type Name="clang::Token">
-    <DisplayString Condition="Kind != clang::tok::identifier">{(clang::tok::TokenKind)Kind}</DisplayString>
-    <DisplayString Condition="Kind == clang::tok::identifier">{{Identifier ({*(clang::IdentifierInfo *)(PtrData)})}}</DisplayString>
-  </Type>
-  <Type Name="clang::DeclSpec">
-    <DisplayString>[{(clang::DeclSpec::SCS)StorageClassSpec}], [{(clang::TypeSpecifierType)TypeSpecType}]</DisplayString>
-  </Type>
-  <Type Name="clang::PragmaHandler">
-    <DisplayString>{Name,s}</DisplayString>
-  </Type>
-  <Type Name="clang::FileEntry">
-    <DisplayString>{Name,s}</DisplayString>
-  </Type>
-  <Type Name="clang::DirectoryEntry">
-    <DisplayString>{Name,s}</DisplayString>
-  </Type>
-  <Type Name="clang::VarDecl::VarDeclBitfields">
-    <Expand>
-      <Item Name="StorageClass">(clang::StorageClass)SClass</Item>
-      <Item Name="ThreadStorageClass">(clang::ThreadStorageClassSpecifier)TSCSpec</Item>
-      <Item Name="InitStyle">(clang::VarDecl::InitializationStyle)InitStyle</Item>
-    </Expand>
-  </Type>
-  <Type Name="clang::VarDecl">
-    <DisplayString>{Name}</DisplayString>
-    <Expand>
-      <ExpandedItem>*(DeclaratorDecl*)this,nd</ExpandedItem>
-      <Item Name="VarDeclBits">VarDeclBits</Item>
-      <Item Name="ParmVarDeclBits">ParmVarDeclBits</Item>
-    </Expand>
-  </Type>
-</AutoVisualizer>
diff --git a/utils/modfuzz.py b/utils/modfuzz.py
new file mode 100644
index 0000000..a6aa1f1
--- /dev/null
+++ b/utils/modfuzz.py
@@ -0,0 +1,166 @@
+#! /usr/bin/env python
+
+# To use:
+#  1) Update the 'decls' list below with your fuzzing configuration.
+#  2) Run with the clang binary as the command-line argument.
+
+import random
+import subprocess
+import sys
+import os
+
+clang = sys.argv[1]
+none_opts = 0.3
+
+class Decl:
+  def __init__(self, text, depends=[], provides=[], conflicts=[]):
+    self.text = text
+    self.depends = depends
+    self.provides = provides
+    self.conflicts = conflicts
+
+  def valid(self, model):
+    for i in self.depends:
+      if i not in model.decls:
+        return False
+    for i in self.conflicts:
+      if i in model.decls:
+        return False
+    return True
+
+  def apply(self, model, name):
+    for i in self.provides:
+      model.decls[i] = True
+    model.source += self.text % {'name': name}
+
+decls = [
+  Decl('struct X { int n; };\n', provides=['X'], conflicts=['X']),
+  Decl('static_assert(X{.n=1}.n == 1, "");\n', depends=['X']),
+  Decl('X %(name)s;\n', depends=['X']),
+]
+
+class FS:
+  def __init__(self):
+    self.fs = {}
+    self.prevfs = {}
+
+  def write(self, path, contents):
+    self.fs[path] = contents
+
+  def done(self):
+    for f, s in self.fs.items():
+      if self.prevfs.get(f) != s:
+        f = file(f, 'w')
+        f.write(s)
+        f.close()
+
+    for f in self.prevfs:
+      if f not in self.fs:
+        os.remove(f)
+
+    self.prevfs, self.fs = self.fs, {}
+
+fs = FS()
+
+class CodeModel:
+  def __init__(self):
+    self.source = ''
+    self.modules = {}
+    self.decls = {}
+    self.i = 0
+
+  def make_name(self):
+    self.i += 1
+    return 'n' + str(self.i)
+
+  def fails(self):
+    fs.write('module.modulemap',
+          ''.join('module %s { header "%s.h" export * }\n' % (m, m)
+                  for m in self.modules.keys()))
+
+    for m, (s, _) in self.modules.items():
+      fs.write('%s.h' % m, s)
+
+    fs.write('main.cc', self.source)
+    fs.done()
+
+    return subprocess.call([clang, '-std=c++11', '-c', '-fmodules', 'main.cc', '-o', '/dev/null']) != 0
+
+def generate():
+  model = CodeModel()
+  m = []
+
+  try:
+    for d in mutations(model):
+      d(model)
+      m.append(d)
+    if not model.fails():
+      return
+  except KeyboardInterrupt:
+    print
+    return True
+
+  sys.stdout.write('\nReducing:\n')
+  sys.stdout.flush()
+
+  try:
+    while True:
+      assert m, 'got a failure with no steps; broken clang binary?'
+      i = random.choice(range(len(m)))
+      x = m[0:i] + m[i+1:]
+      m2 = CodeModel()
+      for d in x:
+        d(m2)
+      if m2.fails():
+        m = x
+        model = m2
+      else:
+        sys.stdout.write('.')
+        sys.stdout.flush()
+  except KeyboardInterrupt:
+    # FIXME: Clean out output directory first.
+    model.fails()
+    return model
+
+def choose(options):
+  while True:
+    i = int(random.uniform(0, len(options) + none_opts))
+    if i >= len(options):
+      break
+    yield options[i]
+
+def mutations(model):
+  options = [create_module, add_top_level_decl]
+  for opt in choose(options):
+    yield opt(model, options)
+
+def create_module(model, options):
+  n = model.make_name()
+  def go(model):
+    model.modules[n] = (model.source, model.decls)
+    (model.source, model.decls) = ('', {})
+  options += [lambda model, options: add_import(model, options, n)]
+  return go
+
+def add_top_level_decl(model, options):
+  n = model.make_name()
+  d = random.choice([decl for decl in decls if decl.valid(model)])
+  def go(model):
+    if not d.valid(model):
+      return
+    d.apply(model, n)
+  return go
+
+def add_import(model, options, module_name):
+  def go(model):
+    if module_name in model.modules:
+      model.source += '#include "%s.h"\n' % module_name
+      model.decls.update(model.modules[module_name][1])
+  return go
+
+sys.stdout.write('Finding bug: ')
+while True:
+  if generate():
+    break
+  sys.stdout.write('.')
+  sys.stdout.flush()
diff --git a/utils/perf-training/lit.site.cfg.in b/utils/perf-training/lit.site.cfg.in
index 52c5465..66683bc 100644
--- a/utils/perf-training/lit.site.cfg.in
+++ b/utils/perf-training/lit.site.cfg.in
@@ -1,7 +1,7 @@
+@LIT_SITE_CFG_IN_HEADER@
+
 import sys
 
-## Autogenerated by LLVM/Clang configuration.
-# Do not edit!
 config.clang_tools_dir = "@CLANG_TOOLS_DIR@"
 config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
 config.test_source_root = "@CMAKE_CURRENT_SOURCE_DIR@"
diff --git a/utils/perf-training/order-files.lit.site.cfg.in b/utils/perf-training/order-files.lit.site.cfg.in
index b5d4bbb..0490a21 100644
--- a/utils/perf-training/order-files.lit.site.cfg.in
+++ b/utils/perf-training/order-files.lit.site.cfg.in
@@ -1,7 +1,7 @@
+@LIT_SITE_CFG_IN_HEADER@
+
 import sys
 
-## Autogenerated by LLVM/Clang configuration.
-# Do not edit!
 config.clang_tools_dir = "@CLANG_TOOLS_DIR@"
 config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
 config.test_source_root = "@CMAKE_CURRENT_SOURCE_DIR@"
diff --git a/utils/perf-training/perf-helper.py b/utils/perf-training/perf-helper.py
index ee6161b..30b9cae 100644
--- a/utils/perf-training/perf-helper.py
+++ b/utils/perf-training/perf-helper.py
@@ -123,6 +123,7 @@
           ln.startswith('Target:') or
           ln.startswith('Thread model:') or
           ln.startswith('InstalledDir:') or
+          ln.startswith('LLVM Profile Note') or
           ' version ' in ln):
           continue
       cc_commands.append(ln)
@@ -158,7 +159,7 @@
   cc1_cmd = get_cc1_command_for_args(cmd, cc1_env)
 
   subprocess.check_call(cc1_cmd)
-  return 0;
+  return 0
 
 def parse_dtrace_symbol_file(path, all_symbols, all_symbols_set,
                              missing_symbols, opts):
diff --git a/www/analyzer/checker_dev_manual.html b/www/analyzer/checker_dev_manual.html
index c674a74..93c9891 100644
--- a/www/analyzer/checker_dev_manual.html
+++ b/www/analyzer/checker_dev_manual.html
@@ -45,7 +45,13 @@
       <li><a href="#bugs">Bug Reports</a></li>
       <li><a href="#ast">AST Visitors</a></li>
       <li><a href="#testing">Testing</a></li>
-      <li><a href="#commands">Useful Commands/Debugging Hints</a></li>
+      <li><a href="#commands">Useful Commands/Debugging Hints</a>
+      <ul>
+        <li><a href="#attaching">Attaching the Debugger</a></li>
+        <li><a href="#narrowing">Narrowing Down the Problem</a></li>
+        <li><a href="#visualizing">Visualizing the Analysis</a></li>
+        <li><a href="#debugprints">Debug Prints and Tricks</a></li>
+      </ul></li>
       <li><a href="#additioninformation">Additional Sources of Information</a></li>
       <li><a href="#links">Useful Links</a></li>
     </ul>
@@ -115,6 +121,8 @@
   </ul>
   
   <h3 id=interaction>Interaction with Checkers</h3>
+
+  <p>
   Checkers are not merely passive receivers of the analyzer core changes - they 
   actively participate in the <tt>ProgramState</tt> construction through the
   <tt>GenericDataMap</tt> which can be used to store the checker-defined part 
@@ -123,9 +131,12 @@
   opportunity to either report a bug or modify the state. (As a rule of thumb, 
   the checker itself should be stateless.) The checkers are called one after another 
   in the predefined order; thus, calling all the checkers adds a chain to the 
-  <tt>ExplodedGraph</tt>. 
+  <tt>ExplodedGraph</tt>.
+  </p>
   
   <h3 id=values>Representing Values</h3>
+
+  <p>
   During symbolic execution, <a href="http://clang.llvm.org/doxygen/classclang_1_1ento_1_1SVal.html">SVal</a> 
   objects are used to represent the semantic evaluation of expressions. 
   They can represent things like concrete 
@@ -142,7 +153,9 @@
   This represents a case that is outside the realm of the analyzer's reasoning 
   capabilities. <tt>SVals</tt> are value objects and their values can be viewed 
   using the <tt>.dump()</tt> method. Often they wrap persistent objects such as 
-  symbols or regions. 
+  symbols or regions.
+  </p>
+
   <p>
   <a href="http://clang.llvm.org/doxygen/classclang_1_1ento_1_1SymExpr.html">SymExpr</a> (symbol) 
   is meant to represent abstract, but named, symbolic value. Symbols represent 
@@ -150,7 +163,7 @@
   we can associate constraints with that value as we analyze a path. For 
   example, we might record that the value of a symbol is greater than 
   <tt>0</tt>, etc.
-  <p>
+  </p>
 
   <p>
   <a href="http://clang.llvm.org/doxygen/classclang_1_1ento_1_1MemRegion.html">MemRegion</a> is similar to a symbol.  
@@ -163,9 +176,12 @@
   <a href="http://clang.llvm.org/doxygen/classclang_1_1ento_1_1SymbolicRegion.html">SymbolicRegion</a> 
   is for. It is a <tt>MemRegion</tt> that has an associated symbol. Since the 
   symbol is unique and has a unique name; that symbol names the region.
+  </p>
   
-  <P>
+  <p>
   Let's see how the analyzer processes the expressions in the following example:
+  </p>
+
   <p>
   <pre class="code_example">
   int foo(int x) {
@@ -174,6 +190,8 @@
      ...
   }
   </pre>
+  </p>
+
   <p>
 Let's look at how <tt>x*2</tt> gets evaluated. When <tt>x</tt> is evaluated, 
 we first construct an <tt>SVal</tt> that represents the lvalue of <tt>x</tt>, in 
@@ -193,6 +211,7 @@
 The second line is similar. When we evaluate <tt>x</tt> again, we do the same 
 dance, and create an <tt>SVal</tt> that references the symbol <tt>$0</tt>. Note, two <tt>SVals</tt> 
 might reference the same underlying values.
+  </p>
 
 <p>
 To summarize, MemRegions are unique names for blocks of memory. Symbols are 
@@ -200,6 +219,7 @@
 symbolic chunks of memory, and thus are also based on symbols. SVals are just 
 references to values, and can reference either MemRegions, Symbols, or concrete 
 values (e.g., the number 1).
+</p>
 
   <!-- 
   TODO: Add a picture.
@@ -511,75 +531,139 @@
   live in <tt>clang/test/Analysis</tt> folder. To run all of the analyzer tests, 
   execute the following from the <tt>clang</tt> build directory:
     <pre class="code">
-    $ <b>TESTDIRS=Analysis make test</b>
+    $ <b>bin/llvm-lit -sv ../llvm/tools/clang/test/Analysis</b>
     </pre>
 
 <h2 id=commands>Useful Commands/Debugging Hints</h2>
-<ul>
-<li>
-While investigating a checker-related issue, instruct the analyzer to only 
+
+<h3 id=attaching>Attaching the Debugger</h3>
+
+<p>When your command contains the <tt><b>-cc1</b></tt> flag, you can attach the
+debugger to it directly:</p>
+
+<pre class="code">
+    $ <b>gdb --args clang -cc1 -analyze -analyzer-checker=core test.c</b>
+    $ <b>lldb -- clang -cc1 -analyze -analyzer-checker=core test.c</b>
+</pre>
+
+<p>
+Otherwise, if your command line contains <tt><b>--analyze</b></tt>,
+the actual clang instance would be run in a separate process. In
+order to debug it, use the <tt><b>-###</b></tt> flag for obtaining
+the command line of the child process:
+</p>
+
+<pre class="code">
+    $ <b>clang --analyze test.c -\#\#\#</b>
+</pre>
+
+<p>
+Below we describe a few useful command line arguments, all of which assume that
+you are running <tt><b>clang -cc1</b></tt>.
+</p>
+
+<h3 id=narrowing>Narrowing Down the Problem</h3>
+
+<p>While investigating a checker-related issue, instruct the analyzer to only
 execute a single checker:
-<br><tt>
-$ <b>clang -cc1 -analyze -analyzer-checker=osx.KeychainAPI test.c</b>
-</tt>
-</li>
-<li>
-To dump AST:
-<br><tt>
-$ <b>clang -cc1 -ast-dump test.c</b>
-</tt>
-</li>
-<li>
-To view/dump CFG use <tt>debug.ViewCFG</tt> or <tt>debug.DumpCFG</tt> checkers:
-<br><tt>
-$ <b>clang -cc1 -analyze -analyzer-checker=debug.ViewCFG test.c</b>
-</tt> 
-</li>
-<li>
-To see all available debug checkers:
-<br><tt>
-$ <b>clang -cc1 -analyzer-checker-help | grep "debug"</b>
-</tt>
-</li>
-<li>
-To see which function is failing while processing a large file use 
-<tt>-analyzer-display-progress</tt> option.
-</li>
-<li>
-While debugging execute <tt>clang -cc1 -analyze -analyzer-checker=core</tt> 
-instead of <tt>clang --analyze</tt>, as the later would call the compiler 
-in a separate process.
-</li>
-<li>
-To view <tt>ExplodedGraph</tt> (the state graph explored by the analyzer) while 
-debugging, goto a frame that has <tt>clang::ento::ExprEngine</tt> object and 
-execute:
-<br><tt> 
-(gdb) <b>p ViewGraph(0)</b>
-</tt>
-</li>
-<li>
-To see the <tt>ProgramState</tt> while debugging use the following command. 
-<br><tt>
-(gdb) <b>p State->dump()</b>
-</tt> 
-</li>
-<li>
-To see <tt>clang::Expr</tt> while debugging use the following command. If you 
-pass in a SourceManager object, it will also dump the corresponding line in the 
-source code.
-<br><tt>
-(gdb) <b>p E->dump()</b>
-</tt> 
-</li>
-<li>
-To dump AST of a method that the current <tt>ExplodedNode</tt> belongs to:
-<br><tt>
-(gdb) <b>p C.getPredecessor()->getCodeDecl().getBody()->dump()</b>
-(gdb) <b>p C.getPredecessor()->getCodeDecl().getBody()->dump(getContext().getSourceManager())</b>
-</tt>
-</li>
-</ul>
+</p>
+<pre class="code">
+    $ <b>clang -cc1 -analyze -analyzer-checker=osx.KeychainAPI test.c</b>
+</pre>
+
+<p>If you are experiencing a crash, use the
+<tt><b>-analyzer-display-progress</b></tt> option to see which function is
+failing while processing a large file.</p>
+
+<p>You can analyze a particular function within the file, which is often useful
+because the problem is usually confined to a single function:</p>
+<pre class="code">
+    $ <b>clang -cc1 -analyze -analyzer-checker=core test.c -analyzer-display-progress</b>
+    ANALYZE (Syntax): test.c foo
+    ANALYZE (Syntax): test.c bar
+    ANALYZE (Path,  Inline_Regular): test.c bar
+    ANALYZE (Path,  Inline_Regular): test.c foo
+    $ <b>clang -cc1 -analyze -analyzer-checker=core test.c -analyzer-display-progress -analyze-function=foo</b>
+    ANALYZE (Syntax): test.c foo
+    ANALYZE (Path,  Inline_Regular): test.c foo
+</pre>
+
+<p>The bug reporter mechanism removes path diagnostics inside intermediate
+function calls that have returned by the time the bug was found and contain
+no interesting pieces. Usually it is up to the checkers to produce more
+interesting pieces by adding custom <tt>BugReporterVisitor</tt> objects.
+However, you can disable path pruning while debugging with the
+<tt><b>-analyzer-config prune-paths=false</b></tt> option.</p>
+
+<h3 id=visualizing>Visualizing the Analysis</h3>
+
+<p>To dump the AST, which often helps in understanding how the program should
+behave:</p>
+<pre class="code">
+    $ <b>clang -cc1 -ast-dump test.c</b>
+</pre>
+
+<p>To view/dump CFG use <tt>debug.ViewCFG</tt> or <tt>debug.DumpCFG</tt>
+checkers:</p>
+<pre class="code">
+    $ <b>clang -cc1 -analyze -analyzer-checker=debug.ViewCFG test.c</b>
+</pre>
+
+<p><tt>ExplodedGraph</tt> (the state graph explored by the analyzer) can be
+visualized with another debug checker:</p>
+<pre class="code">
+    $ <b>clang -cc1 -analyze -analyzer-checker=debug.ViewExplodedGraph test.c</b>
+</pre>
+<p>Or, equivalently, with the <tt><b>-analyzer-viz-egraph-graphviz</b></tt>
+option, which does the same thing - dumps the exploded graph in graphviz
+<tt><b>.dot</b></tt> format.</p>
+
+<p>You can convert <tt><b>.dot</b></tt> files into other formats - in
+particular, converting to <tt><b>.svg</b></tt> and viewing in your web
+browser might be more comfortable than using a <tt><b>.dot</b></tt> viewer:</p>
+<pre class="code">
+    $ <b>dot -Tsvg ExprEngine-501e2e.dot -o ExprEngine-501e2e.svg</b>
+</pre>
+
+<p>The <tt><b>-trim-egraph</b></tt> option removes all paths except those
+leading to bug reports from the exploded graph dump. This is useful
+because exploded graphs are often huge and hard to navigate.</p>
+
+<p>Viewing <tt>ExplodedGraph</tt> is your most powerful tool for understanding
+the analyzer's false positives, because it gives comprehensive information
+on every decision made by the analyzer across all analysis paths.</p>
+
+<p>There are more debug checkers available. To see all available debug checkers:
+</p>
+<pre class="code">
+    $ <b>clang -cc1 -analyzer-checker-help | grep "debug"</b>
+</pre>
+
+<h3 id=debugprints>Debug Prints and Tricks</h3>
+
+<p>To view a "half-baked" <tt>ExplodedGraph</tt> while debugging, jump to a frame
+that has a <tt>clang::ento::ExprEngine</tt> object and execute:</p>
+<pre class="code">
+    (gdb) <b>p ViewGraph(0)</b>
+</pre>
+
+<p>To see the <tt>ProgramState</tt> while debugging use the following command:</p>
+<pre class="code">
+    (gdb) <b>p State->dump()</b>
+</pre>
+
+<p>To see <tt>clang::Expr</tt> while debugging use the following command. If you
+pass in a <tt>SourceManager</tt> object, it will also dump the corresponding line in the
+source code.</p>
+<pre class="code">
+    (gdb) <b>p E->dump()</b>
+</pre>
+
+<p>To dump the AST of the method that the current <tt>ExplodedNode</tt> belongs
+to:</p>
+<pre class="code">
+    (gdb) <b>p C.getPredecessor()->getCodeDecl().getBody()->dump()</b>
+</pre>
 
 <h2 id=additioninformation>Additional Sources of Information</h2>
 
diff --git a/www/analyzer/faq.html b/www/analyzer/faq.html
index 9d2962b..cf3dc70 100644
--- a/www/analyzer/faq.html
+++ b/www/analyzer/faq.html
@@ -28,6 +28,7 @@
 null?</a></li>
   <li><a href="#dead_store">How do I tell the static analyzer that I don't care about a specific dead store?</a></li>
   <li><a href="#unused_ivar">How do I tell the static analyzer that I don't care about a specific unused instance variable in Objective C?</a></li>
+  <li><a href="#unlocalized_string">How do I tell the static analyzer that I don't care about a specific unlocalized string?</a></li>
   <li><a href="#use_assert">The analyzer assumes that a loop body is never entered.  How can I tell it that the loop body will be entered at least once?</a></li>
   <li><a href="#suppress_issue">How can I suppress a specific analyzer warning?</a></li>
   <li><a href="#exclude_code">How can I selectively exclude code the analyzer examines?</a></li>
@@ -78,6 +79,32 @@
 <pre class="code_example">Instance variable 'commonName' in class 'HappyBird' is never used by the methods in its @implementation</pre>
 You can add <tt>__attribute__((unused))</tt> to the instance variable declaration to suppress the warning.</p>
 
+<h4 id="unlocalized_string" class="faq">Q: How do I tell the static analyzer that I don't care about a specific unlocalized string?</h4>
+
+<p>When the analyzer sees that an unlocalized string is passed to a method that will present that string to the user, it is going to produce a message similar to this one:
+<pre class="code_example">User-facing text should use localized string macro</pre>
+
+If your project deliberately uses unlocalized user-facing strings (for example, in a debugging UI that is never shown to users), you can suppress the analyzer warnings (and document your intent) with a function that just returns its input but is annotated to return a localized string:
+<pre class="code_example">
+__attribute__((annotate("returns_localized_nsstring")))
+static inline NSString *LocalizationNotNeeded(NSString *s) {
+  return s;
+}
+</pre>
+
+You can then call this function when creating your debugging UI:
+<pre class="code_example">
+[field setStringValue:LocalizationNotNeeded(@"Debug")];
+</pre>
+
+Some projects may also find it useful to use NSLocalizedString but add "DNL" or "Do Not Localize" to the string contents as a convention:
+<pre class="code_example">
+UILabel *testLabel = [[UILabel alloc] init];
+NSString *s = NSLocalizedString(@"Hello &lt;Do Not Localize&gt;", @"For debug purposes");
+[testLabel setText:s];
+</pre>
+</p>
+
 <h4 id="use_assert" class="faq">Q: The analyzer assumes that a loop body is never entered.  How can I tell it that the loop body will be entered at least once?</h4>
 
 <img src="images/example_use_assert.png" alt="example use assert">
diff --git a/www/compatibility.html b/www/compatibility.html
index 293be6f..512beaa 100644
--- a/www/compatibility.html
+++ b/www/compatibility.html
@@ -415,19 +415,11 @@
 
 <p>GCC and C99 allow an array's size to be determined at run
 time. This extension is not permitted in standard C++. However, Clang
-supports such variable length arrays in very limited circumstances for
-compatibility with GNU C and C99 programs:</p>
+supports such variable length arrays for compatibility with GNU C and
+C99 programs.</p>
 
-<ul>  
-  <li>The element type of a variable length array must be a POD
-  ("plain old data") type, which means that it cannot have any
-  user-declared constructors or destructors, any base classes, or any
-  members of non-POD type. All C types are POD types.</li>
-
-  <li>Variable length arrays cannot be used as the type of a non-type
-template parameter.</li> </ul>
-
-<p>If your code uses variable length arrays in a manner that Clang doesn't support, there are several ways to fix your code:
+<p>If you would prefer not to use this extension, you can disable it with
+<tt>-Werror=vla</tt>. There are several ways to fix your code:
 
 <ol>
 <li>replace the variable length array with a fixed-size array if you can
@@ -566,7 +558,7 @@
 <h3 id="dep_lookup_bases">Unqualified lookup into dependent bases of class templates</h3>
 <!-- ======================================================================= -->
 
-Some versions of GCC accept the following invalid code:
+<p>Some versions of GCC accept the following invalid code:
 
 <pre>
 template &lt;typename T&gt; struct Base {
@@ -636,7 +628,7 @@
 <h3 id="undep_incomplete">Incomplete types in templates</h3>
 <!-- ======================================================================= -->
 
-The following code is invalid, but compilers are allowed to accept it:
+<p>The following code is invalid, but compilers are allowed to accept it:
 
 <pre>
   class IOOptions;
@@ -667,7 +659,7 @@
 <h3 id="bad_templates">Templates with no valid instantiations</h3>
 <!-- ======================================================================= -->
 
-The following code contains a typo: the programmer
+<p>The following code contains a typo: the programmer
 meant <tt>init()</tt> but wrote <tt>innit()</tt> instead.
 
 <pre>
@@ -714,7 +706,7 @@
 <h3 id="default_init_const">Default initialization of const variable of a class type requires user-defined default constructor</h3>
 <!-- ======================================================================= -->
 
-If a <tt>class</tt> or <tt>struct</tt> has no user-defined default
+<p>If a <tt>class</tt> or <tt>struct</tt> has no user-defined default
 constructor, C++ doesn't allow you to default construct a <tt>const</tt>
 instance of it like this ([dcl.init], p9):
 
@@ -747,11 +739,15 @@
 }
 </pre>
 
+<p>An upcoming change to the C++ standard is expected to weaken this rule to only
+apply when the compiler-supplied default constructor would leave a member
+uninitialized. Clang implements the more relaxed rule in version 3.8 onwards.</p>
+
 <!-- ======================================================================= -->
 <h3 id="param_name_lookup">Parameter name lookup</h3>
 <!-- ======================================================================= -->
 
-<p>Due to a bug in its implementation, GCC allows the redeclaration of function parameter names within a function prototype in C++ code, e.g.</p>
+<p>Some versions of GCC allow the redeclaration of function parameter names within a function prototype in C++ code, e.g.</p>
 <blockquote>
 <pre>
 void f(int a, int a);
diff --git a/www/cxx_dr_status.html b/www/cxx_dr_status.html
index 9bf8f91..0e9af71 100644
--- a/www/cxx_dr_status.html
+++ b/www/cxx_dr_status.html
@@ -1308,11 +1308,11 @@
     <td>Constructors should not be allowed to return normally after an exception</td>
     <td class="full" align="center">Yes</td>
   </tr>
-  <tr class="open" id="212">
+  <tr id="212">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#212">212</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Implicit instantiation is not described clearly enough</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr id="213">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#213">213</a></td>
@@ -3023,7 +3023,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#497">497</a></td>
     <td>CD1</td>
     <td>Missing required initialization in example</td>
-    <td class="full" align="center">Yes</td>
+    <td class="none" align="center">Superseded by <a href="#253">253</a></td>
   </tr>
   <tr class="open" id="498">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#498">498</a></td>
@@ -3523,7 +3523,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#580">580</a></td>
     <td>C++11</td>
     <td>Access in <I>template-parameter</I>s of member and friend definitions</td>
-    <td class="none" align="center">No</td>
+    <td class="partial" align="center">Partial</td>
   </tr>
   <tr class="open" id="581">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#581">581</a></td>
@@ -7315,7 +7315,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1250">1250</a></td>
     <td>CD3</td>
     <td>Cv-qualification of incomplete virtual function return types</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr id="1251">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1251">1251</a></td>
@@ -7969,7 +7969,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1359">1359</a></td>
     <td>CD3</td>
     <td><TT>constexpr</TT> union constructors</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.5</td>
   </tr>
   <tr class="open" id="1360">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1360">1360</a></td>
@@ -9137,7 +9137,7 @@
   </tr>
   <tr class="open" id="1554">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1554">1554</a></td>
-    <td>drafting</td>
+    <td>open</td>
     <td>Access and alias templates</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -9253,7 +9253,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1573">1573</a></td>
     <td>DRWP</td>
     <td>Inherited constructor characteristics</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr id="1574">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#1574">1574</a></td>
@@ -9289,7 +9289,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1579">1579</a></td>
     <td>C++14</td>
     <td>Return by converting move constructor</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr class="open" id="1580">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1580">1580</a></td>
@@ -9601,7 +9601,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1631">1631</a></td>
     <td>DRWP</td>
     <td>Incorrect overload resolution for single-element <I>initializer-list</I></td>
-    <td class="full" align="center">Clang 3.7 (C++11 onwards)</td>
+    <td class="full" align="center">Clang 3.7</td>
   </tr>
   <tr class="open" id="1632">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1632">1632</a></td>
@@ -9685,7 +9685,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1645">1645</a></td>
     <td>DR</td>
     <td>Identical inheriting constructors via default arguments</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr class="open" id="1646">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1646">1646</a></td>
@@ -10105,7 +10105,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1715">1715</a></td>
     <td>DR</td>
     <td>Access and inherited constructor templates</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr id="1716">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1716">1716</a></td>
@@ -10231,7 +10231,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1736">1736</a></td>
     <td>DR</td>
     <td>Inheriting constructor templates in a local class</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr id="1737">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1737">1737</a></td>
@@ -10351,7 +10351,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1756">1756</a></td>
     <td>DRWP</td>
     <td>Direct-list-initialization of a non-class object</td>
-    <td class="full" align="center">Clang 3.7 (C++11 onwards)</td>
+    <td class="full" align="center">Clang 3.7</td>
   </tr>
   <tr id="1757">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1757">1757</a></td>
@@ -10363,7 +10363,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1758">1758</a></td>
     <td>DRWP</td>
     <td>Explicit conversion in copy/move list initialization</td>
-    <td class="full" align="center">Clang 3.7 (C++11 onwards)</td>
+    <td class="full" align="center">Clang 3.7</td>
   </tr>
   <tr id="1759">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1759">1759</a></td>
@@ -11043,11 +11043,11 @@
     <td>Non-identifier characters in <I>ud-suffix</I></td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="1872">
+  <tr id="1872">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1872">1872</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Instantiations of <TT>constexpr</TT> templates that cannot appear in constant expressions</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr id="1873">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1873">1873</a></td>
@@ -11461,7 +11461,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1941">1941</a></td>
     <td>DR</td>
     <td>SFINAE and inherited constructor default arguments</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr id="1942">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1942">1942</a></td>
@@ -11569,7 +11569,7 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1959">1959</a></td>
     <td>DR</td>
     <td>Inadvertently inherited copy constructor</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
   <tr id="1960">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#1960">1960</a></td>
@@ -11761,13 +11761,13 @@
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1991">1991</a></td>
     <td>DR</td>
     <td>Inheriting constructors vs default arguments</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.9</td>
   </tr>
-  <tr class="open" id="1992">
+  <tr id="1992">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1992">1992</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td><TT>new (std::nothrow) int[N]</TT> can throw</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="1993">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1993">1993</a></td>
@@ -11787,11 +11787,11 @@
     <td><I>exception-specification</I>s and non-type template parameters</td>
     <td class="none" align="center">Unknown</td>
   </tr>
-  <tr id="1996">
-    <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/">1996</a></td>
-    <td></td>
+  <tr class="open" id="1996">
+    <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1996">1996</a></td>
+    <td>drafting</td>
     <td>Reference list-initialization ignores conversion functions</td>
-    <td class="none" align="center">Unknown</td>
+    <td align="center">Not resolved</td>
   </tr>
   <tr class="open" id="1997">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1997">1997</a></td>
@@ -12095,7 +12095,7 @@
   </tr>
   <tr id="2047">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2047">2047</a></td>
-    <td>ready</td>
+    <td>tentatively ready</td>
     <td>Coordinating &#8220;throws anything&#8221; specifications</td>
     <td class="none" align="center">Unknown</td>
   </tr>
@@ -12195,11 +12195,11 @@
     <td>Type/nontype hiding in class scope</td>
     <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2064">
+  <tr id="2064">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2064">2064</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Conflicting specifications for dependent <I>decltype-specifier</I>s</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2065">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2065">2065</a></td>
@@ -12435,11 +12435,11 @@
     <td>Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2104">
+  <tr id="2104">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2104">2104</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Internal-linkage <TT>constexpr</TT> references and ODR requirements</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2105">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2105">2105</a></td>
@@ -12543,11 +12543,11 @@
     <td>More flexible lambda syntax</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2122">
+  <tr id="2122">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2122">2122</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Glvalues of <TT>void</TT> type</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2123">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2123">2123</a></td>
@@ -12585,11 +12585,11 @@
     <td>Imprecise rule for reference member initializer</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2129">
+  <tr id="2129">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2129">2129</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Non-object prvalues and constant expressions</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr id="2130">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2130">2130</a></td>
@@ -12651,17 +12651,17 @@
     <td>Floating-point requirements for integer representation</td>
     <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2140">
+  <tr id="2140">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2140">2140</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Lvalue-to-rvalue conversion of <TT>std::nullptr_t</TT></td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2141">
+  <tr id="2141">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2141">2141</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Ambiguity in <I>new-expression</I> with <I>elaborated-type-specifier</I></td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr id="2142">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#2142">2142</a></td>
@@ -12669,11 +12669,11 @@
     <td>Missing definition of associated classes and namespaces</td>
     <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2143">
+  <tr id="2143">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2143">2143</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Value-dependency via injected-class-name</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2144">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2144">2144</a></td>
@@ -12687,11 +12687,11 @@
     <td>Parenthesized declarator in function definition</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2146">
+  <tr id="2146">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2146">2146</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Scalar object vs memory location in definition of &#8220;unsequenced&#8221;</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr id="2147">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2147">2147</a></td>
@@ -12729,17 +12729,17 @@
     <td>Can an alternative token be used as a <I>ud-suffix</I>?</td>
     <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2153">
+  <tr id="2153">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2153">2153</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td><I>pure-specifier</I> in friend declaration</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2154">
+  <tr id="2154">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2154">2154</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Ambiguity of <I>pure-specifier</I></td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2155">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2155">2155</a></td>
@@ -12747,11 +12747,11 @@
     <td>Defining classes and enumerations via <I>using-declaration</I>s</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2156">
+  <tr id="2156">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2156">2156</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Definition of enumeration declared by <I>using-declaration</I></td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr id="2157">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2157">2157</a></td>
@@ -12783,17 +12783,17 @@
     <td>Explicit instantiation declaration and &#8220;preceding initialization&#8221;</td>
     <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2162">
+  <tr id="2162">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2162">2162</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Capturing <TT>this</TT> by reference</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2163">
+  <tr id="2163">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2163">2163</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Labels in <TT>constexpr</TT> functions</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2164">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2164">2164</a></td>
@@ -12813,11 +12813,11 @@
     <td>Unclear meaning of &#8220;undefined <TT>constexpr</TT> function&#8221;</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2167">
+  <tr id="2167">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2167">2167</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Non-member references with lifetimes within the current evaluation</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2168">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2168">2168</a></td>
@@ -12827,7 +12827,7 @@
   </tr>
   <tr class="open" id="2169">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2169">2169</a></td>
-    <td>drafting</td>
+    <td>open</td>
     <td>Narrowing conversions and overload resolution</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -12861,17 +12861,17 @@
     <td>Unclear rules for friend definitions in templates</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2175">
+  <tr id="2175">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2175">2175</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Ambiguity with attribute in conversion operator declaration</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2176">
+  <tr id="2176">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2176">2176</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Destroying the returned object when a destructor throws</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2177">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2177">2177</a></td>
@@ -12891,11 +12891,11 @@
     <td>Required diagnostic for partial specialization after first use</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2180">
+  <tr id="2180">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2180">2180</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Virtual bases in destructors and defaulted assignment operators</td>
-    <td align="center">Not resolved</td>
+    <td class="none" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2181">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2181">2181</a></td>
diff --git a/www/cxx_status.html b/www/cxx_status.html
index fb458ca..b22ec45 100644
--- a/www/cxx_status.html
+++ b/www/cxx_status.html
@@ -14,6 +14,7 @@
     .na { background-color: #DDDDDD }
     span:target { background-color: #FFFFBB; outline: #DDDD55 solid thin; }
     th { background-color: #FFDDAA }
+    td { vertical-align: middle }
   </style>
 </head>
 <body>
@@ -67,8 +68,7 @@
 work with Clang in C++11 mode. Patches are also needed to make
 <a href="libstdc++4.6-clang11.patch">libstdc++-4.6</a>
 and <a href="libstdc++4.7-clang11.patch">libstdc++-4.7</a> work with Clang
-releases prior to version 3.2 in C++11 mode. <tt>thread_local</tt> support
-currently requires the C++ runtime library from g++-4.8 or later.</p>
+releases prior to version 3.2 in C++11 mode.</p>
 
 <table width="689" border="1" cellspacing="0">
  <tr>
@@ -373,7 +373,7 @@
     <tr>
       <td>Thread-local storage</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2659.htm">N2659</a></td>
-      <td class="full" align="center">Clang 3.3</td>
+      <td class="full" align="center">Clang 3.3 <a href="#n2659">(5)</a></td>
     </tr>
     <tr>
       <td>Dynamic initialization and destruction with concurrency</td>
@@ -402,7 +402,7 @@
     <tr>
       <td>Extended integral types</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2006/n1988.pdf">N1988</a></td>
-      <td class="na" align="center">N/A <a href="#n1988">(5)</a></td>
+      <td class="na" align="center">N/A <a href="#n1988">(6)</a></td>
     </tr>
 </table>
 
@@ -415,7 +415,11 @@
 strong compare-exchanges.</span><br>
 <span id="n2664">(4): <code>memory_order_consume</code> is lowered to
 <code>memory_order_acquire</code>.</span><br>
-<span id="n1988">(5): No compiler changes are required for an implementation
+<span id="n2659">(5): <code>thread_local</code> support
+requires a C++ runtime library providing <code>__cxa_thread_atexit</code>, such
+as <a href="http://libcxxabi.llvm.org">libc++abi</a> 3.6 or later,
+or libsupc++ 4.8 or later.</span><br>
+<span id="n1988">(6): No compiler changes are required for an implementation
 such as Clang that does not provide any extended integer types.
 <code>__int128</code> is not treated as an extended integer type,
 because changing <code>intmax_t</code> would be an ABI-incompatible
@@ -501,12 +505,12 @@
     <tr>
       <td>C++ Sized Deallocation</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3778.html">N3778</a></td>
-      <td class="full" align="center">Clang 3.4 <a href="#n3778">(6)</a></td>
+      <td class="full" align="center">Clang 3.4 <a href="#n3778">(7)</a></td>
     </tr>
 </table>
 
 <p>
-<span id="n3778">(6): In Clang 3.7 and later, sized deallocation is only enabled
+<span id="n3778">(7): In Clang 3.7 and later, sized deallocation is only enabled
 if the user passes the <code>-fsized-deallocation</code> flag. The user must
 supply definitions of the sized deallocation functions, either by providing them
 explicitly or by using a C++ standard library that does. <code>libstdc++</code>
@@ -517,9 +521,9 @@
 
 <h2 id="cxx17">C++1z implementation status</h2>
 
-<p>Clang has <b>highly experimental</b> support for some proposed features of
-the C++ standard following C++14,
-provisionally named C++1z.  The following table describes which C++1z features
+<p>Clang has <b>experimental</b> support for some proposed features of
+the C++ standard following C++14, provisionally named C++1z.
+The following table describes which C++1z features
 have been implemented in Clang and in which Clang version they became
 available.</p>
 
@@ -564,14 +568,18 @@
     <tr>
       <td>New <tt>auto</tt> rules for direct-list-initialization
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3922.html">N3922</a></td>
-      <td class="svn" align="center">Clang 3.8 <a href="#n3922">(7)</a></td>
+      <td class="full" align="center">Clang 3.8 <a href="#n3922">(8)</a></td>
     </tr>
     <!-- Urbana papers -->
     <tr>
-      <td>Fold expressions</td>
+      <td rowspan="2">Fold expressions</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4295.html">N4295</a></td>
       <td class="full" align="center">Clang 3.6</td>
     </tr>
+      <tr> <!-- from Jacksonville -->
+        <td><a href="http://wg21.link/p0036r0">P0036R0</a></td>
+        <td class="full" align="center">Clang 3.9</td>
+      </tr>
     <tr>
       <td><tt>u8</tt> character literals</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4267.html">N4267</a></td>
@@ -596,12 +604,12 @@
     <tr>
       <td>Remove deprecated <tt>register</tt> storage class</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0001r1.html">P0001R1</a></td>
-      <td class="svn" align="center">Clang 3.8</td>
+      <td class="full" align="center">Clang 3.8</td>
     </tr>
     <tr>
       <td>Remove deprecated <tt>bool</tt> increment</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0002r1.html">P0002R1</a></td>
-      <td class="svn" align="center">Clang 3.8</td>
+      <td class="full" align="center">Clang 3.8</td>
     </tr>
     <tr>
       <td>Make exception specifications part of the type system</td>
@@ -613,13 +621,126 @@
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0061.html">P0061R1</a></td>
       <td class="full" align="center">Yes</td>
     </tr>
+    <tr>
+      <td>New specification for inheriting constructors (<a href="cxx_dr_status.html#1941">DR1941</a> et al.)</td>
+      <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0136r1.html">P0136R1</a></td>
+      <td class="full" align="center">Clang 3.9 <a href="#p0136">(9)</a></td>
+    </tr>
+    <!-- Jacksonville papers -->
+    <tr>
+      <td><tt>[[fallthrough]]</tt> attribute</td>
+      <td><a href="http://wg21.link/p0188r1">P0188R1</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td><tt>[[nodiscard]]</tt> attribute</td>
+      <td><a href="http://wg21.link/p0189r1">P0189R1</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td><tt>[[maybe_unused]]</tt> attribute</td>
+      <td><a href="http://wg21.link/p0212r1">P0212R1</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Aggregate initialization of classes with base classes</td>
+      <td><a href="http://wg21.link/p0017r1">P0017R1</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td><tt>constexpr</tt> lambda expressions</td>
+      <td><a href="http://wg21.link/p0170r1">P0170R1</a></td>
+      <td class="none" align="center">No</td>
+    </tr>
+    <tr>
+      <td>Differing <tt>begin</tt> and <tt>end</tt> types in range-based <tt>for</tt></td>
+      <td><a href="http://wg21.link/p0184r0">P0184R0</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Lambda capture of <tt>*this</tt></td>
+      <td><a href="http://wg21.link/p0018r3">P0018R3</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Direct-list-initialization of <tt>enum</tt>s</td>
+      <td><a href="http://wg21.link/p0138r2">P0138R2</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Hexadecimal floating-point literals</td>
+      <td><a href="http://wg21.link/p0245r1">P0245R1</a></td>
+      <td class="full" align="center">Yes</td>
+    </tr>
+    <!-- Oulu papers -->
+    <tr>
+      <td>Using attribute namespaces without repetition</td>
+      <td><a href="http://wg21.link/p0028r4">P0028R4</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Dynamic memory allocation for over-aligned data</td>
+      <td><a href="http://wg21.link/p0035r4">P0035R4</a></td>
+      <td class="none" align="center">No</td>
+    </tr>
+    <tr>
+      <td>Template argument deduction for class templates</td>
+      <td><a href="http://wg21.link/p0091r3">P0091R3</a></td>
+      <td class="none" align="center">No</td>
+    </tr>
+    <tr>
+      <td>Non-type template parameters with <tt>auto</tt> type</td>
+      <td><a href="http://wg21.link/p0127r2">P0127R2</a></td>
+      <td class="none" align="center">No</td>
+    </tr>
+    <tr>
+      <td>Guaranteed copy elision</td>
+      <td><a href="http://wg21.link/p0135r1">P0135R1</a></td>
+      <td class="none" align="center">No</td>
+    </tr>
+    <tr>
+      <td rowspan="2">Stricter expression evaluation order</td>
+      <td><a href="http://wg21.link/p0145r3">P0145R3</a></td>
+      <td class="none" align="center" rowspan="2">No</td>
+    </tr>
+    <tr>
+      <td><a href="http://wg21.link/p0400r0">P0400R0</a></td>
+    </tr>
+    <tr>
+      <td>Requirement to ignore unknown attributes</td>
+      <td><a href="http://wg21.link/p0283r2">P0283R2</a></td>
+      <td class="full" align="center">Yes</td>
+    </tr>
+    <tr>
+      <td><tt>constexpr</tt> <em>if-statement</em>s</td>
+      <td><a href="http://wg21.link/p0292r2">P0292R2</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Inline variables</td>
+      <td><a href="http://wg21.link/p0386r2">P0386R2</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
+    <tr>
+      <td>Structured bindings</td>
+      <td><a href="http://wg21.link/p0217r3">P0217R3</a></td>
+      <td class="partial" align="center">Partial</td>
+    </tr>
+    <tr>
+      <td>Separate variable and condition for <tt>if</tt> and <tt>switch</tt></td>
+      <td><a href="http://wg21.link/p0305r1">P0305R1</a></td>
+      <td class="full" align="center">Clang 3.9</td>
+    </tr>
 </table>
 
 <p>
-<span id="n3922">(7): This is a backwards-incompatible change that is applied to
+<span id="n3922">(8): This is a backwards-incompatible change that is applied to
 all language versions that allow type deduction from <tt>auto</tt>
 (per the request of the C++ committee).
 In Clang 3.7, a warning is emitted for all cases that would change meaning.
+</span><br>
+<span id="p0136">(9): This is the resolution to a Defect Report, so is applied
+to all language versions supporting inheriting constructors.
 </span>
 </p>
 
@@ -678,6 +799,11 @@
       <td class="none" align="center">No</td>
     </tr>
     <tr>
+      <td>[TS] Modules</td>
+      <td><a href="http://wg21.link/p0143r2">P0143R2</a></td>
+      <td class="none" align="center">No</td>
+    </tr>
+    <tr>
       <td>[TS] Transactional Memory</td>
       <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4514.pdf">N4514</a></td>
       <td class="none" align="center">No</td>
diff --git a/www/get_started.html b/www/get_started.html
index 541c45a..0039b56 100644
--- a/www/get_started.html
+++ b/www/get_started.html
@@ -47,20 +47,20 @@
       http://www.cmake.org/download</a></li>
   </ul>
 
-  <li>Checkout LLVM:
+  <li>Check out LLVM:
   <ul>
     <li>Change directory to where you want the llvm directory placed.</li>
     <li><tt>svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm</tt></li>
   </ul>
   </li>
-  <li>Checkout Clang:
+  <li>Check out Clang:
   <ul>
     <li><tt>cd llvm/tools</tt></li>
     <li><tt>svn co http://llvm.org/svn/llvm-project/cfe/trunk clang</tt></li>
     <li><tt>cd ../..</tt></li>
   </ul>
   </li>
-  <li>Checkout extra Clang Tools: (optional)
+  <li>Check out extra Clang tools (optional):
   <ul>
     <li><tt>cd llvm/tools/clang/tools</tt></li>
     <li><tt>svn co http://llvm.org/svn/llvm-project/clang-tools-extra/trunk
@@ -68,7 +68,7 @@
     <li><tt>cd ../../../..</tt></li>
   </ul>
   </li>
-  <li>Checkout Compiler-RT:
+  <li>Check out Compiler-RT (optional):
   <ul>
     <li><tt>cd llvm/projects</tt></li>
     <li><tt>svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk
@@ -76,7 +76,7 @@
     <li><tt>cd ../..</tt></li>
   </ul>
   </li>
-  <li>Checkout libcxx: (only required to build and run Compiler-RT tests on OS X, optional otherwise)
+  <li>Check out libcxx: (only required to build and run Compiler-RT tests on OS X, optional otherwise)
   <ul>
     <li><tt>cd llvm/projects</tt></li>
     <li><tt>svn co http://llvm.org/svn/llvm-project/libcxx/trunk
@@ -98,9 +98,6 @@
         KDevelop3. For more details see
         <a href="http://llvm.org/docs/CMake.html">Building LLVM with CMake</a>
         page.</li>
-      <li>You can also build Clang with
-        <a href="http://llvm.org/docs/BuildingLLVMWithAutotools.html">
-        autotools</a>, but some features may be unavailable there.</li>
   </ul>
   </li>
 
@@ -112,7 +109,7 @@
       scenarios, you can use the <tt>-DGCC_INSTALL_PREFIX</tt> cmake option
       to tell Clang where the gcc containing the desired libstdc++ is installed.
   </li>
-  <li>Try it out (assuming you add llvm/Debug+Asserts/bin to your path):
+  <li>Try it out (assuming you add llvm/build/bin to your path):
   <ul>
     <li><tt>clang --help</tt></li>
     <li><tt>clang file.c -fsyntax-only</tt> (check for correctness)</li>
@@ -123,17 +120,17 @@
   </li>
 </ol>
 
-<p>Note that the C front-end uses LLVM, but does not depend on llvm-gcc. If you
-encounter problems with building Clang, make sure you have the latest SVN
-version of LLVM. LLVM contains support libraries for Clang that will be updated
-as well as development on Clang progresses.</p>
-  
+<p>If you encounter problems while building Clang, make sure that your LLVM
+checkout is at the same revision as your Clang checkout. LLVM's interfaces
+change over time, and mismatched revisions are not expected to work
+together.</p>
+
 <h3>Simultaneously Building Clang and LLVM:</h3>
 
 <p>Once you have checked out Clang into the llvm source tree it will build along
 with the rest of <tt>llvm</tt>. To build all of LLVM and Clang together all at
 once simply run <tt>make</tt> from the root LLVM directory.</p>
-    
+
 <p><em>Note:</em> Observe that Clang is technically part of a separate
 Subversion repository. As mentioned above, the latest Clang sources are tied to
 the latest sources in the LLVM tree. You can update your toplevel LLVM project
@@ -172,16 +169,19 @@
   </ul>
   </li>
 
-  <li>Checkout LLVM:
+  <li>Check out LLVM:
   <ul>
     <li><tt>svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm</tt></li>
   </ul>
   </li>
-  <li>Checkout Clang:
+  <li>Check out Clang:
   <ul>
      <li><tt>cd llvm\tools</tt>
      <li><tt>svn co http://llvm.org/svn/llvm-project/cfe/trunk clang</tt></li>
   </ul>
+  <p><em>Note</em>:  Some Clang tests are sensitive to the line endings.  Ensure
+     that checking out the files does not convert LF line endings to CR+LF.
+     If you use git-svn, make sure your <tt>core.autocrlf</tt> setting is false.</p>
   </li>
   <li>Run CMake to generate the Visual Studio solution and project files:
   <ul>
diff --git a/www/hacking.html b/www/hacking.html
index 4535ef4..65d182d 100644
--- a/www/hacking.html
+++ b/www/hacking.html
@@ -98,14 +98,16 @@
   <!--=====================================================================-->
 
   <p>The files 
-    <a href="http://llvm.org/svn/llvm-project/llvm/trunk/utils/llvm.natvis">
-      <tt>utils/llvm.natvis</tt></a> and 
-    <a href="http://llvm.org/svn/llvm-project/cfe/trunk/utils/clang.natvis">
-      <tt>utils/clang.natvis</tt></a> provide debugger visualizers 
+    <a href="http://llvm.org/svn/llvm-project/llvm/trunk/utils/LLVMVisualizers/llvm.natvis">
+      <tt>utils/LLVMVisualizers/llvm.natvis</tt></a> and 
+    <a href="http://llvm.org/svn/llvm-project/cfe/trunk/utils/ClangVisualizers/clang.natvis">
+      <tt>utils/ClangVisualizers/clang.natvis</tt></a> provide debugger visualizers 
       that make debugging of more complex data types much easier.</p>
-  <p>Put the files into 
-    <tt>%USERPROFILE%\Documents\Visual Studio 2012\Visualizers</tt> or 
+  <p>For Visual Studio 2013 only, put the files into 
+    <tt>%USERPROFILE%\Documents\Visual Studio 2013\Visualizers</tt> or 
     create a symbolic link so they update automatically.</p>
+  <p>For later versions of Visual Studio, no installation is required.
+    Note that later versions of Visual Studio also display better visualizations.</p>
 
   <!--=====================================================================-->
   <h2 id="testing">Testing</h2>
diff --git a/www/index.html b/www/index.html
index 2d3ca8f..62354fe 100644
--- a/www/index.html
+++ b/www/index.html
@@ -93,7 +93,7 @@
    be a production quality C, Objective-C, C++ and Objective-C++ compiler when 
    targeting X86-32, X86-64, and ARM (other targets may have caveats, but are 
    usually easy to fix).  If you are looking for source analysis or
-   source-to-source transformation tools, clang is probably a great
+   source-to-source transformation tools, Clang is probably a great
    solution for you.  Clang supports C++11, please see the <a
     href="cxx_status.html">C++ status</a> page for more
    information.</p>
@@ -104,14 +104,14 @@
   
   <p>Start by <a href="get_started.html">getting the code, building it, and
      playing with it</a>.  This will show you the sorts of things we can do
-     today and will let you have the "clang experience" first hand: hopefully
+     today and will let you have the "Clang experience" first hand: hopefully
      it will "resonate" with you. :)</p>
   
   <p>Once you've done that, please consider <a href="get_involved.html">getting
-     involved in the clang community</a>.  The clang developers include numerous
+     involved in the clang community</a>.  The Clang developers include numerous
      volunteer contributors with a variety of backgrounds.  If you're 
      interested in
-     following the development of clang, signing up for a mailing list is a good
+     following the development of Clang, signing up for a mailing list is a good
      way to learn about how the project works.</p>
 </div>
 </body>
diff --git a/www/make_cxx_dr_status b/www/make_cxx_dr_status
index fe48428..9a679ed 100755
--- a/www/make_cxx_dr_status
+++ b/www/make_cxx_dr_status
@@ -102,10 +102,10 @@
   if status == 'unknown':
     avail = 'Unknown'
     avail_style = ' class="none"'
-  elif status == '3.9':
+  elif status == '4.0':
     avail = 'SVN'
     avail_style = ' class="svn"'
-  elif status.startswith('3.'):
+  elif re.match('^[0-9]+\.', status):
     avail = 'Clang %s' % status
     avail_style = ' class="full"'
   elif status == 'yes':
